Diffstat (limited to 'llvm/lib')
-rw-r--r--llvm/lib/Analysis/AliasAnalysis.cpp14
-rw-r--r--llvm/lib/Analysis/AliasAnalysisEvaluator.cpp2
-rw-r--r--llvm/lib/Analysis/AliasAnalysisSummary.cpp1
-rw-r--r--llvm/lib/Analysis/AliasAnalysisSummary.h5
-rw-r--r--llvm/lib/Analysis/AliasSetTracker.cpp2
-rw-r--r--llvm/lib/Analysis/AssumeBundleQueries.cpp213
-rw-r--r--llvm/lib/Analysis/AssumptionCache.cpp64
-rw-r--r--llvm/lib/Analysis/BasicAliasAnalysis.cpp82
-rw-r--r--llvm/lib/Analysis/BlockFrequencyInfo.cpp7
-rw-r--r--llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp17
-rw-r--r--llvm/lib/Analysis/BranchProbabilityInfo.cpp265
-rw-r--r--llvm/lib/Analysis/CFG.cpp8
-rw-r--r--llvm/lib/Analysis/CFGPrinter.cpp309
-rw-r--r--llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp2
-rw-r--r--llvm/lib/Analysis/CGSCCPassManager.cpp169
-rw-r--r--llvm/lib/Analysis/CallGraph.cpp72
-rw-r--r--llvm/lib/Analysis/CallGraphSCCPass.cpp74
-rw-r--r--llvm/lib/Analysis/CallPrinter.cpp272
-rw-r--r--llvm/lib/Analysis/CaptureTracking.cpp48
-rw-r--r--llvm/lib/Analysis/CodeMetrics.cpp5
-rw-r--r--llvm/lib/Analysis/ConstantFolding.cpp527
-rw-r--r--llvm/lib/Analysis/DDG.cpp46
-rw-r--r--llvm/lib/Analysis/DependenceAnalysis.cpp160
-rw-r--r--llvm/lib/Analysis/DependenceGraphBuilder.cpp104
-rw-r--r--llvm/lib/Analysis/DivergenceAnalysis.cpp40
-rw-r--r--llvm/lib/Analysis/DomPrinter.cpp8
-rw-r--r--llvm/lib/Analysis/DomTreeUpdater.cpp6
-rw-r--r--llvm/lib/Analysis/GlobalsModRef.cpp12
-rw-r--r--llvm/lib/Analysis/GuardUtils.cpp8
-rw-r--r--llvm/lib/Analysis/HeatUtils.cpp78
-rw-r--r--llvm/lib/Analysis/IVDescriptors.cpp1
-rw-r--r--llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp1
-rw-r--r--llvm/lib/Analysis/InlineAdvisor.cpp408
-rw-r--r--llvm/lib/Analysis/InlineCost.cpp479
-rw-r--r--llvm/lib/Analysis/InlineFeaturesAnalysis.cpp41
-rw-r--r--llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp299
-rw-r--r--llvm/lib/Analysis/InstructionPrecedenceTracking.cpp27
-rw-r--r--llvm/lib/Analysis/InstructionSimplify.cpp401
-rw-r--r--llvm/lib/Analysis/LazyCallGraph.cpp57
-rw-r--r--llvm/lib/Analysis/LazyValueInfo.cpp1021
-rw-r--r--llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp9
-rw-r--r--llvm/lib/Analysis/Lint.cpp135
-rw-r--r--llvm/lib/Analysis/Loads.cpp129
-rw-r--r--llvm/lib/Analysis/LoopAccessAnalysis.cpp254
-rw-r--r--llvm/lib/Analysis/LoopAnalysisManager.cpp1
-rw-r--r--llvm/lib/Analysis/LoopCacheAnalysis.cpp46
-rw-r--r--llvm/lib/Analysis/LoopInfo.cpp11
-rw-r--r--llvm/lib/Analysis/LoopNestAnalysis.cpp296
-rw-r--r--llvm/lib/Analysis/LoopPass.cpp37
-rw-r--r--llvm/lib/Analysis/LoopUnrollAnalyzer.cpp1
-rw-r--r--llvm/lib/Analysis/MLInlineAdvisor.cpp301
-rw-r--r--llvm/lib/Analysis/MemDepPrinter.cpp2
-rw-r--r--llvm/lib/Analysis/MemDerefPrinter.cpp3
-rw-r--r--llvm/lib/Analysis/MemoryBuiltins.cpp70
-rw-r--r--llvm/lib/Analysis/MemoryDependenceAnalysis.cpp220
-rw-r--r--llvm/lib/Analysis/MemoryLocation.cpp17
-rw-r--r--llvm/lib/Analysis/MemorySSA.cpp16
-rw-r--r--llvm/lib/Analysis/MemorySSAUpdater.cpp14
-rw-r--r--llvm/lib/Analysis/ModuleSummaryAnalysis.cpp97
-rw-r--r--llvm/lib/Analysis/MustExecute.cpp175
-rw-r--r--llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp4
-rw-r--r--llvm/lib/Analysis/ObjCARCInstKind.cpp8
-rw-r--r--llvm/lib/Analysis/OptimizationRemarkEmitter.cpp7
-rw-r--r--llvm/lib/Analysis/OrderedBasicBlock.cpp111
-rw-r--r--llvm/lib/Analysis/OrderedInstructions.cpp50
-rw-r--r--llvm/lib/Analysis/ProfileSummaryInfo.cpp279
-rw-r--r--llvm/lib/Analysis/RegionPrinter.cpp8
-rw-r--r--llvm/lib/Analysis/ReleaseModeModelRunner.cpp87
-rw-r--r--llvm/lib/Analysis/ScalarEvolution.cpp517
-rw-r--r--llvm/lib/Analysis/ScalarEvolutionDivision.cpp259
-rw-r--r--llvm/lib/Analysis/StackLifetime.cpp (renamed from llvm/lib/CodeGen/SafeStackColoring.cpp)255
-rw-r--r--llvm/lib/Analysis/StackSafetyAnalysis.cpp915
-rw-r--r--llvm/lib/Analysis/SyncDependenceAnalysis.cpp10
-rw-r--r--llvm/lib/Analysis/SyntheticCountsUtils.cpp1
-rw-r--r--llvm/lib/Analysis/TFUtils.cpp289
-rw-r--r--llvm/lib/Analysis/TargetLibraryInfo.cpp45
-rw-r--r--llvm/lib/Analysis/TargetTransformInfo.cpp717
-rw-r--r--llvm/lib/Analysis/TypeMetadataUtils.cpp9
-rw-r--r--llvm/lib/Analysis/VFABIDemangling.cpp108
-rw-r--r--llvm/lib/Analysis/ValueLattice.cpp12
-rw-r--r--llvm/lib/Analysis/ValueLatticeUtils.cpp18
-rw-r--r--llvm/lib/Analysis/ValueTracking.cpp1413
-rw-r--r--llvm/lib/Analysis/VectorUtils.cpp269
-rw-r--r--llvm/lib/Analysis/models/inliner/saved_model.pbtxt32634
-rw-r--r--llvm/lib/Analysis/models/inliner/variables/variables.data-00000-of-00001 (binary, 0 -> 39110 bytes)
-rw-r--r--llvm/lib/Analysis/models/inliner/variables/variables.index (binary, 0 -> 377 bytes)
-rw-r--r--llvm/lib/AsmParser/LLLexer.cpp20
-rw-r--r--llvm/lib/AsmParser/LLLexer.h4
-rw-r--r--llvm/lib/AsmParser/LLParser.cpp534
-rw-r--r--llvm/lib/AsmParser/LLParser.h36
-rw-r--r--llvm/lib/AsmParser/LLToken.h8
-rw-r--r--llvm/lib/AsmParser/Parser.cpp101
-rw-r--r--llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp20
-rw-r--r--llvm/lib/BinaryFormat/Dwarf.cpp33
-rw-r--r--llvm/lib/BinaryFormat/MachO.cpp109
-rw-r--r--llvm/lib/BinaryFormat/Magic.cpp3
-rw-r--r--llvm/lib/BinaryFormat/MsgPackDocument.cpp122
-rw-r--r--llvm/lib/BinaryFormat/Wasm.cpp4
-rw-r--r--llvm/lib/BinaryFormat/XCOFF.cpp80
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp15
-rw-r--r--llvm/lib/Bitcode/Reader/BitcodeReader.cpp404
-rw-r--r--llvm/lib/Bitcode/Reader/MetadataLoader.cpp90
-rw-r--r--llvm/lib/Bitcode/Reader/MetadataLoader.h2
-rw-r--r--llvm/lib/Bitcode/Reader/ValueList.cpp2
-rw-r--r--llvm/lib/Bitcode/Writer/BitcodeWriter.cpp170
-rw-r--r--llvm/lib/Bitcode/Writer/ValueEnumerator.cpp49
-rw-r--r--llvm/lib/Bitcode/Writer/ValueEnumerator.h2
-rw-r--r--llvm/lib/Bitstream/Reader/BitstreamReader.cpp18
-rw-r--r--llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp11
-rw-r--r--llvm/lib/CodeGen/AggressiveAntiDepBreaker.h2
-rw-r--r--llvm/lib/CodeGen/AllocationOrder.h3
-rw-r--r--llvm/lib/CodeGen/Analysis.cpp84
-rw-r--r--llvm/lib/CodeGen/AntiDepBreaker.h87
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/ARMException.cpp12
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp42
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp10
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp919
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp77
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp13
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h24
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp264
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h13
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIE.cpp90
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp9
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DIEHash.h1
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp39
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp23
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp203
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h34
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp890
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h24
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfException.h3
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp134
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h74
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp2
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp8
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp69
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h1
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp50
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp12
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WasmException.cpp4
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WinException.cpp129
-rw-r--r--llvm/lib/CodeGen/AsmPrinter/WinException.h2
-rw-r--r--llvm/lib/CodeGen/AtomicExpandPass.cpp345
-rw-r--r--llvm/lib/CodeGen/BBSectionsPrepare.cpp457
-rw-r--r--llvm/lib/CodeGen/BranchFolding.cpp172
-rw-r--r--llvm/lib/CodeGen/BranchFolding.h31
-rw-r--r--llvm/lib/CodeGen/BranchRelaxation.cpp16
-rw-r--r--llvm/lib/CodeGen/BreakFalseDeps.cpp9
-rw-r--r--llvm/lib/CodeGen/CFIInstrInserter.cpp191
-rw-r--r--llvm/lib/CodeGen/CalcSpillWeights.cpp9
-rw-r--r--llvm/lib/CodeGen/CallingConvLower.cpp34
-rw-r--r--llvm/lib/CodeGen/CodeGen.cpp5
-rw-r--r--llvm/lib/CodeGen/CodeGenPrepare.cpp915
-rw-r--r--llvm/lib/CodeGen/CommandFlags.cpp634
-rw-r--r--llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp9
-rw-r--r--llvm/lib/CodeGen/CriticalAntiDepBreaker.h2
-rw-r--r--llvm/lib/CodeGen/DwarfEHPrepare.cpp19
-rw-r--r--llvm/lib/CodeGen/EarlyIfConversion.cpp19
-rw-r--r--llvm/lib/CodeGen/EdgeBundles.cpp1
-rw-r--r--llvm/lib/CodeGen/ExpandMemCmp.cpp185
-rw-r--r--llvm/lib/CodeGen/ExpandReductions.cpp6
-rw-r--r--llvm/lib/CodeGen/FEntryInserter.cpp4
-rw-r--r--llvm/lib/CodeGen/FaultMaps.cpp22
-rw-r--r--llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp311
-rw-r--r--llvm/lib/CodeGen/GCMetadata.cpp2
-rw-r--r--llvm/lib/CodeGen/GCRootLowering.cpp14
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp60
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CallLowering.cpp109
-rw-r--r--llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp576
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp8
-rw-r--r--llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp263
-rw-r--r--llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp517
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp667
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp15
-rw-r--r--llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp2
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp38
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Legalizer.cpp72
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp1991
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp22
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Localizer.cpp65
-rw-r--r--llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp113
-rw-r--r--llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp177
-rw-r--r--llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp9
-rw-r--r--llvm/lib/CodeGen/GlobalISel/Utils.cpp266
-rw-r--r--llvm/lib/CodeGen/GlobalMerge.cpp7
-rw-r--r--llvm/lib/CodeGen/HardwareLoops.cpp23
-rw-r--r--llvm/lib/CodeGen/IfConversion.cpp38
-rw-r--r--llvm/lib/CodeGen/ImplicitNullChecks.cpp8
-rw-r--r--llvm/lib/CodeGen/InlineSpiller.cpp158
-rw-r--r--llvm/lib/CodeGen/InterferenceCache.h2
-rw-r--r--llvm/lib/CodeGen/InterleavedAccessPass.cpp10
-rw-r--r--llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp26
-rw-r--r--llvm/lib/CodeGen/IntrinsicLowering.cpp19
-rw-r--r--llvm/lib/CodeGen/LLVMTargetMachine.cpp3
-rw-r--r--llvm/lib/CodeGen/LexicalScopes.cpp55
-rw-r--r--llvm/lib/CodeGen/LiveDebugValues.cpp799
-rw-r--r--llvm/lib/CodeGen/LiveDebugVariables.cpp334
-rw-r--r--llvm/lib/CodeGen/LiveDebugVariables.h2
-rw-r--r--llvm/lib/CodeGen/LiveIntervalCalc.cpp205
-rw-r--r--llvm/lib/CodeGen/LiveIntervals.cpp102
-rw-r--r--llvm/lib/CodeGen/LivePhysRegs.cpp13
-rw-r--r--llvm/lib/CodeGen/LiveRangeCalc.cpp154
-rw-r--r--llvm/lib/CodeGen/LiveRangeEdit.cpp27
-rw-r--r--llvm/lib/CodeGen/LiveRangeShrink.cpp3
-rw-r--r--llvm/lib/CodeGen/LiveVariables.cpp28
-rw-r--r--llvm/lib/CodeGen/LocalStackSlotAllocation.cpp30
-rw-r--r--llvm/lib/CodeGen/LowLevelType.cpp2
-rw-r--r--llvm/lib/CodeGen/LowerEmuTLS.cpp19
-rw-r--r--llvm/lib/CodeGen/MBFIWrapper.cpp49
-rw-r--r--llvm/lib/CodeGen/MIRCanonicalizerPass.cpp4
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.cpp23
-rw-r--r--llvm/lib/CodeGen/MIRParser/MILexer.h3
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIParser.cpp104
-rw-r--r--llvm/lib/CodeGen/MIRParser/MIRParser.cpp63
-rw-r--r--llvm/lib/CodeGen/MIRPrinter.cpp72
-rw-r--r--llvm/lib/CodeGen/MIRVRegNamerUtils.cpp8
-rw-r--r--llvm/lib/CodeGen/MIRVRegNamerUtils.h18
-rw-r--r--llvm/lib/CodeGen/MachineBasicBlock.cpp247
-rw-r--r--llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp6
-rw-r--r--llvm/lib/CodeGen/MachineBlockPlacement.cpp298
-rw-r--r--llvm/lib/CodeGen/MachineCSE.cpp15
-rw-r--r--llvm/lib/CodeGen/MachineCombiner.cpp8
-rw-r--r--llvm/lib/CodeGen/MachineCopyPropagation.cpp4
-rw-r--r--llvm/lib/CodeGen/MachineDebugify.cpp172
-rw-r--r--llvm/lib/CodeGen/MachineFrameInfo.cpp28
-rw-r--r--llvm/lib/CodeGen/MachineFunction.cpp174
-rw-r--r--llvm/lib/CodeGen/MachineInstr.cpp142
-rw-r--r--llvm/lib/CodeGen/MachineInstrBundle.cpp29
-rw-r--r--llvm/lib/CodeGen/MachineLICM.cpp38
-rw-r--r--llvm/lib/CodeGen/MachineLoopUtils.cpp3
-rw-r--r--llvm/lib/CodeGen/MachineModuleInfo.cpp61
-rw-r--r--llvm/lib/CodeGen/MachineOperand.cpp32
-rw-r--r--llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp2
-rw-r--r--llvm/lib/CodeGen/MachineOutliner.cpp658
-rw-r--r--llvm/lib/CodeGen/MachinePipeliner.cpp150
-rw-r--r--llvm/lib/CodeGen/MachineRegisterInfo.cpp72
-rw-r--r--llvm/lib/CodeGen/MachineSSAUpdater.cpp54
-rw-r--r--llvm/lib/CodeGen/MachineScheduler.cpp167
-rw-r--r--llvm/lib/CodeGen/MachineSink.cpp72
-rw-r--r--llvm/lib/CodeGen/MachineSizeOpts.cpp90
-rw-r--r--llvm/lib/CodeGen/MachineStripDebug.cpp111
-rw-r--r--llvm/lib/CodeGen/MachineVerifier.cpp629
-rw-r--r--llvm/lib/CodeGen/ModuloSchedule.cpp74
-rw-r--r--llvm/lib/CodeGen/PHIElimination.cpp45
-rw-r--r--llvm/lib/CodeGen/PHIEliminationUtils.cpp5
-rw-r--r--llvm/lib/CodeGen/ParallelCG.cpp2
-rw-r--r--llvm/lib/CodeGen/PatchableFunction.cpp4
-rw-r--r--llvm/lib/CodeGen/PeepholeOptimizer.cpp23
-rw-r--r--llvm/lib/CodeGen/PostRASchedulerList.cpp14
-rw-r--r--llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp4
-rw-r--r--llvm/lib/CodeGen/PrologEpilogInserter.cpp73
-rw-r--r--llvm/lib/CodeGen/RDFGraph.cpp (renamed from llvm/lib/Target/Hexagon/RDFGraph.cpp)10
-rw-r--r--llvm/lib/CodeGen/RDFLiveness.cpp (renamed from llvm/lib/Target/Hexagon/RDFLiveness.cpp)6
-rw-r--r--llvm/lib/CodeGen/RDFRegisters.cpp (renamed from llvm/lib/Target/Hexagon/RDFRegisters.cpp)2
-rw-r--r--llvm/lib/CodeGen/ReachingDefAnalysis.cpp527
-rw-r--r--llvm/lib/CodeGen/RegAllocBase.cpp4
-rw-r--r--llvm/lib/CodeGen/RegAllocBase.h4
-rw-r--r--llvm/lib/CodeGen/RegAllocBasic.cpp26
-rw-r--r--llvm/lib/CodeGen/RegAllocFast.cpp223
-rw-r--r--llvm/lib/CodeGen/RegAllocGreedy.cpp169
-rw-r--r--llvm/lib/CodeGen/RegAllocPBQP.cpp12
-rw-r--r--llvm/lib/CodeGen/RegUsageInfoPropagate.cpp11
-rw-r--r--llvm/lib/CodeGen/RegisterCoalescer.cpp62
-rw-r--r--llvm/lib/CodeGen/RegisterPressure.cpp4
-rw-r--r--llvm/lib/CodeGen/RegisterScavenging.cpp6
-rw-r--r--llvm/lib/CodeGen/SafeStack.cpp62
-rw-r--r--llvm/lib/CodeGen/SafeStackColoring.h165
-rw-r--r--llvm/lib/CodeGen/SafeStackLayout.cpp10
-rw-r--r--llvm/lib/CodeGen/SafeStackLayout.h10
-rw-r--r--llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp106
-rw-r--r--llvm/lib/CodeGen/ScheduleDAG.cpp8
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGInstrs.cpp19
-rw-r--r--llvm/lib/CodeGen/ScheduleDAGPrinter.cpp2
-rw-r--r--llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp13
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp3368
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FastISel.cpp332
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp75
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp113
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h34
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp415
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp479
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp311
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp62
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h89
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp58
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp202
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp687
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp4
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp38
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp1298
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp1572
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h19
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp18
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp90
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp16
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp429
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h2
-rw-r--r--llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp1486
-rw-r--r--llvm/lib/CodeGen/ShrinkWrap.cpp16
-rw-r--r--llvm/lib/CodeGen/SjLjEHPrepare.cpp36
-rw-r--r--llvm/lib/CodeGen/SlotIndexes.cpp9
-rw-r--r--llvm/lib/CodeGen/Spiller.h43
-rw-r--r--llvm/lib/CodeGen/SplitKit.cpp86
-rw-r--r--llvm/lib/CodeGen/SplitKit.h27
-rw-r--r--llvm/lib/CodeGen/StackColoring.cpp30
-rw-r--r--llvm/lib/CodeGen/StackMaps.cpp72
-rw-r--r--llvm/lib/CodeGen/StackProtector.cpp37
-rw-r--r--llvm/lib/CodeGen/StackSlotColoring.cpp10
-rw-r--r--llvm/lib/CodeGen/SwiftErrorValueTracking.cpp5
-rw-r--r--llvm/lib/CodeGen/SwitchLoweringUtils.cpp1
-rw-r--r--llvm/lib/CodeGen/TailDuplication.cpp6
-rw-r--r--llvm/lib/CodeGen/TailDuplicator.cpp109
-rw-r--r--llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp21
-rw-r--r--llvm/lib/CodeGen/TargetInstrInfo.cpp125
-rw-r--r--llvm/lib/CodeGen/TargetLoweringBase.cpp347
-rw-r--r--llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp454
-rw-r--r--llvm/lib/CodeGen/TargetOptionsImpl.cpp6
-rw-r--r--llvm/lib/CodeGen/TargetPassConfig.cpp161
-rw-r--r--llvm/lib/CodeGen/TargetRegisterInfo.cpp74
-rw-r--r--llvm/lib/CodeGen/TwoAddressInstructionPass.cpp33
-rw-r--r--llvm/lib/CodeGen/TypePromotion.cpp13
-rw-r--r--llvm/lib/CodeGen/UnreachableBlockElim.cpp6
-rw-r--r--llvm/lib/CodeGen/ValueTypes.cpp473
-rw-r--r--llvm/lib/CodeGen/VirtRegMap.cpp4
-rw-r--r--llvm/lib/CodeGen/WasmEHPrepare.cpp169
-rw-r--r--llvm/lib/CodeGen/WinEHPrepare.cpp42
-rw-r--r--llvm/lib/CodeGen/XRayInstrumentation.cpp135
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinker.cpp2575
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp8
-rw-r--r--llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp8
-rw-r--r--llvm/lib/DWARFLinker/DWARFStreamer.cpp774
-rw-r--r--llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp23
-rw-r--r--llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp40
-rw-r--r--llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp22
-rw-r--r--llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp34
-rw-r--r--llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp5
-rw-r--r--llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp31
-rw-r--r--llvm/lib/DebugInfo/CodeView/RecordName.cpp2
-rw-r--r--llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp11
-rw-r--r--llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp39
-rw-r--r--llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp23
-rw-r--r--llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp5
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp36
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp92
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp14
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFContext.cpp406
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp37
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp258
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp193
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp19
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp433
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp783
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp17
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp185
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp93
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFDie.cpp33
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp231
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp70
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp50
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp23
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp164
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp134
-rw-r--r--llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp82
-rw-r--r--llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp572
-rw-r--r--llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp11
-rw-r--r--llvm/lib/DebugInfo/GSYM/GsymCreator.cpp61
-rw-r--r--llvm/lib/DebugInfo/GSYM/GsymReader.cpp153
-rw-r--r--llvm/lib/DebugInfo/GSYM/InlineInfo.cpp18
-rw-r--r--llvm/lib/DebugInfo/GSYM/LookupResult.cpp31
-rw-r--r--llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp116
-rw-r--r--llvm/lib/DebugInfo/GSYM/Range.cpp10
-rw-r--r--llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp8
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp18
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp17
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp446
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp4
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp6
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp42
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp57
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp50
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp52
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp197
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp47
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp4
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp4
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp2
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp3
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp7
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp377
-rw-r--r--llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp10
-rw-r--r--llvm/lib/DebugInfo/PDB/PDB.cpp19
-rw-r--r--llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp4
-rw-r--r--llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp62
-rw-r--r--llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h15
-rw-r--r--llvm/lib/DebugInfo/Symbolize/Symbolize.cpp51
-rw-r--r--llvm/lib/Demangle/Demangle.cpp4
-rw-r--r--llvm/lib/Demangle/ItaniumDemangle.cpp6
-rw-r--r--llvm/lib/Demangle/MicrosoftDemangle.cpp7
-rw-r--r--llvm/lib/ExecutionEngine/ExecutionEngine.cpp89
-rw-r--r--llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp12
-rw-r--r--llvm/lib/ExecutionEngine/Interpreter/Execution.cpp172
-rw-r--r--llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Interpreter/Interpreter.h14
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h30
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF.cpp51
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp463
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLink.cpp17
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp119
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h109
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO.cpp26
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp82
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h28
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp34
-rw-r--r--llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp131
-rw-r--r--llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp10
-rw-r--r--llvm/lib/ExecutionEngine/MCJIT/MCJIT.h5
-rw-r--r--llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp139
-rw-r--r--llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp33
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Core.cpp752
-rw-r--r--llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp281
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp95
-rw-r--r--llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp11
-rw-r--r--llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp8
-rw-r--r--llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp3
-rw-r--r--llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp76
-rw-r--r--llvm/lib/ExecutionEngine/Orc/LLJIT.cpp1048
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Layer.cpp142
-rw-r--r--llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp110
-rw-r--r--llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp506
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Mangling.cpp160
-rw-r--r--llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp229
-rw-r--r--llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp641
-rw-r--r--llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h4
-rw-r--r--llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h9
-rw-r--r--llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp254
-rw-r--r--llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp145
-rw-r--r--llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/Orc/Speculation.cpp2
-rw-r--r--llvm/lib/ExecutionEngine/OrcError/OrcError.cpp4
-rw-r--r--llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp10
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp46
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp76
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp38
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h17
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp21
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp9
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h5
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h53
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFAArch64.h31
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h35
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h47
-rw-r--r--llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h31
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPConstants.cpp87
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPContext.cpp527
-rw-r--r--llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp780
-rw-r--r--llvm/lib/FuzzMutate/FuzzerCLI.cpp7
-rw-r--r--llvm/lib/FuzzMutate/Operations.cpp24
-rw-r--r--llvm/lib/IR/AbstractCallSite.cpp41
-rw-r--r--llvm/lib/IR/AsmWriter.cpp244
-rw-r--r--llvm/lib/IR/AttributeImpl.h108
-rw-r--r--llvm/lib/IR/Attributes.cpp425
-rw-r--r--llvm/lib/IR/AttributesCompatFunc.td1
-rw-r--r--llvm/lib/IR/AutoUpgrade.cpp419
-rw-r--r--llvm/lib/IR/BasicBlock.cpp136
-rw-r--r--llvm/lib/IR/ConstantFold.cpp274
-rw-r--r--llvm/lib/IR/ConstantFold.h2
-rw-r--r--llvm/lib/IR/ConstantRange.cpp22
-rw-r--r--llvm/lib/IR/Constants.cpp541
-rw-r--r--llvm/lib/IR/ConstantsContext.h164
-rw-r--r--llvm/lib/IR/Core.cpp43
-rw-r--r--llvm/lib/IR/DIBuilder.cpp74
-rw-r--r--llvm/lib/IR/DataLayout.cpp79
-rw-r--r--llvm/lib/IR/DebugInfo.cpp119
-rw-r--r--llvm/lib/IR/DebugInfoMetadata.cpp223
-rw-r--r--llvm/lib/IR/DebugLoc.cpp2
-rw-r--r--llvm/lib/IR/DiagnosticInfo.cpp40
-rw-r--r--llvm/lib/IR/Dominators.cpp39
-rw-r--r--llvm/lib/IR/FPEnv.cpp37
-rw-r--r--llvm/lib/IR/Function.cpp203
-rw-r--r--llvm/lib/IR/Globals.cpp33
-rw-r--r--llvm/lib/IR/IRBuilder.cpp580
-rw-r--r--llvm/lib/IR/InlineAsm.cpp10
-rw-r--r--llvm/lib/IR/Instruction.cpp39
-rw-r--r--llvm/lib/IR/Instructions.cpp611
-rw-r--r--llvm/lib/IR/IntrinsicInst.cpp239
-rw-r--r--llvm/lib/IR/LLVMContext.cpp45
-rw-r--r--llvm/lib/IR/LLVMContextImpl.cpp38
-rw-r--r--llvm/lib/IR/LLVMContextImpl.h145
-rw-r--r--llvm/lib/IR/LLVMRemarkStreamer.cpp (renamed from llvm/lib/IR/RemarkStreamer.cpp)129
-rw-r--r--llvm/lib/IR/LegacyPassManager.cpp321
-rw-r--r--llvm/lib/IR/MDBuilder.cpp2
-rw-r--r--llvm/lib/IR/Mangler.cpp13
-rw-r--r--llvm/lib/IR/Metadata.cpp9
-rw-r--r--llvm/lib/IR/Module.cpp109
-rw-r--r--llvm/lib/IR/ModuleSummaryIndex.cpp65
-rw-r--r--llvm/lib/IR/Operator.cpp108
-rw-r--r--llvm/lib/IR/Pass.cpp3
-rw-r--r--llvm/lib/IR/PassManager.cpp3
-rw-r--r--llvm/lib/IR/PassRegistry.cpp2
-rw-r--r--llvm/lib/IR/PassTimingInfo.cpp37
-rw-r--r--llvm/lib/IR/ProfileSummary.cpp140
-rw-r--r--llvm/lib/IR/SafepointIRVerifier.cpp5
-rw-r--r--llvm/lib/IR/Statepoint.cpp34
-rw-r--r--llvm/lib/IR/SymbolTableListTraitsImpl.h15
-rw-r--r--llvm/lib/IR/Type.cpp176
-rw-r--r--llvm/lib/IR/Use.cpp77
-rw-r--r--llvm/lib/IR/User.cpp32
-rw-r--r--llvm/lib/IR/Value.cpp162
-rw-r--r--llvm/lib/IR/ValueSymbolTable.cpp13
-rw-r--r--llvm/lib/IR/Verifier.cpp678
-rw-r--r--llvm/lib/IRReader/IRReader.cpp19
-rw-r--r--llvm/lib/LTO/Caching.cpp2
-rw-r--r--llvm/lib/LTO/LTO.cpp185
-rw-r--r--llvm/lib/LTO/LTOBackend.cpp106
-rw-r--r--llvm/lib/LTO/LTOCodeGenerator.cpp20
-rw-r--r--llvm/lib/LTO/LTOModule.cpp14
-rw-r--r--llvm/lib/LTO/ThinLTOCodeGenerator.cpp63
-rw-r--r--llvm/lib/LTO/UpdateCompilerUsed.cpp1
-rw-r--r--llvm/lib/LineEditor/LineEditor.cpp4
-rw-r--r--llvm/lib/Linker/IRMover.cpp86
-rw-r--r--llvm/lib/MC/ConstantPools.cpp10
-rw-r--r--llvm/lib/MC/ELFObjectWriter.cpp20
-rw-r--r--llvm/lib/MC/MCAsmInfo.cpp4
-rw-r--r--llvm/lib/MC/MCAsmInfoCOFF.cpp4
-rw-r--r--llvm/lib/MC/MCAsmInfoDarwin.cpp7
-rw-r--r--llvm/lib/MC/MCAsmInfoXCOFF.cpp28
-rw-r--r--llvm/lib/MC/MCAsmStreamer.cpp535
-rw-r--r--llvm/lib/MC/MCAssembler.cpp211
-rw-r--r--llvm/lib/MC/MCCodeView.cpp52
-rw-r--r--llvm/lib/MC/MCContext.cpp205
-rw-r--r--llvm/lib/MC/MCDisassembler/MCDisassembler.cpp60
-rw-r--r--llvm/lib/MC/MCDwarf.cpp706
-rw-r--r--llvm/lib/MC/MCELFStreamer.cpp73
-rw-r--r--llvm/lib/MC/MCExpr.cpp84
-rw-r--r--llvm/lib/MC/MCFragment.cpp29
-rw-r--r--llvm/lib/MC/MCInstPrinter.cpp26
-rw-r--r--llvm/lib/MC/MCInstrAnalysis.cpp13
-rw-r--r--llvm/lib/MC/MCInstrDesc.cpp11
-rw-r--r--llvm/lib/MC/MCInstrInfo.cpp27
-rw-r--r--llvm/lib/MC/MCMachOStreamer.cpp131
-rw-r--r--llvm/lib/MC/MCNullStreamer.cpp10
-rw-r--r--llvm/lib/MC/MCObjectFileInfo.cpp70
-rw-r--r--llvm/lib/MC/MCObjectStreamer.cpp228
-rw-r--r--llvm/lib/MC/MCParser/AsmLexer.cpp25
-rw-r--r--llvm/lib/MC/MCParser/AsmParser.cpp253
-rw-r--r--llvm/lib/MC/MCParser/COFFAsmParser.cpp14
-rw-r--r--llvm/lib/MC/MCParser/COFFMasmParser.cpp386
-rw-r--r--llvm/lib/MC/MCParser/DarwinAsmParser.cpp38
-rw-r--r--llvm/lib/MC/MCParser/ELFAsmParser.cpp108
-rw-r--r--llvm/lib/MC/MCParser/MCAsmParser.cpp5
-rw-r--r--llvm/lib/MC/MCParser/MCAsmParserExtension.cpp43
-rw-r--r--llvm/lib/MC/MCParser/MasmParser.cpp6876
-rw-r--r--llvm/lib/MC/MCParser/WasmAsmParser.cpp4
-rw-r--r--llvm/lib/MC/MCSection.cpp10
-rw-r--r--llvm/lib/MC/MCSectionCOFF.cpp12
-rw-r--r--llvm/lib/MC/MCSectionELF.cpp14
-rw-r--r--llvm/lib/MC/MCSectionMachO.cpp9
-rw-r--r--llvm/lib/MC/MCSectionWasm.cpp7
-rw-r--r--llvm/lib/MC/MCSectionXCOFF.cpp10
-rw-r--r--llvm/lib/MC/MCStreamer.cpp423
-rw-r--r--llvm/lib/MC/MCSubtargetInfo.cpp39
-rw-r--r--llvm/lib/MC/MCSymbolXCOFF.cpp39
-rw-r--r--llvm/lib/MC/MCTargetOptions.cpp6
-rw-r--r--llvm/lib/MC/MCTargetOptionsCommandFlags.cpp114
-rw-r--r--llvm/lib/MC/MCWasmStreamer.cpp44
-rw-r--r--llvm/lib/MC/MCWin64EH.cpp130
-rw-r--r--llvm/lib/MC/MCWinCOFFStreamer.cpp69
-rw-r--r--llvm/lib/MC/MCXCOFFStreamer.cpp62
-rw-r--r--llvm/lib/MC/MachObjectWriter.cpp6
-rw-r--r--llvm/lib/MC/SubtargetFeature.cpp4
-rw-r--r--llvm/lib/MC/WasmObjectWriter.cpp360
-rw-r--r--llvm/lib/MC/WinCOFFObjectWriter.cpp82
-rw-r--r--llvm/lib/MC/XCOFFObjectWriter.cpp281
-rw-r--r--llvm/lib/MCA/CodeEmitter.cpp2
-rw-r--r--llvm/lib/MCA/HardwareUnits/LSUnit.cpp84
-rw-r--r--llvm/lib/MCA/InstrBuilder.cpp25
-rw-r--r--llvm/lib/Object/Archive.cpp14
-rw-r--r--llvm/lib/Object/ArchiveWriter.cpp14
-rw-r--r--llvm/lib/Object/COFFImportFile.cpp2
-rw-r--r--llvm/lib/Object/COFFModuleDefinition.cpp8
-rw-r--r--llvm/lib/Object/COFFObjectFile.cpp573
-rw-r--r--llvm/lib/Object/ELF.cpp30
-rw-r--r--llvm/lib/Object/ELFObjectFile.cpp101
-rw-r--r--llvm/lib/Object/Error.cpp6
-rw-r--r--llvm/lib/Object/IRObjectFile.cpp3
-rw-r--r--llvm/lib/Object/IRSymtab.cpp8
-rw-r--r--llvm/lib/Object/MachOObjectFile.cpp41
-rw-r--r--llvm/lib/Object/ModuleSymbolTable.cpp8
-rw-r--r--llvm/lib/Object/ObjectFile.cpp30
-rw-r--r--llvm/lib/Object/RecordStreamer.cpp22
-rw-r--r--llvm/lib/Object/RecordStreamer.h15
-rw-r--r--llvm/lib/Object/RelocationResolver.cpp50
-rw-r--r--llvm/lib/Object/SymbolSize.cpp13
-rw-r--r--llvm/lib/Object/TapiFile.cpp22
-rw-r--r--llvm/lib/Object/TapiUniversal.cpp17
-rw-r--r--llvm/lib/Object/WasmObjectFile.cpp229
-rw-r--r--llvm/lib/Object/WindowsResource.cpp10
-rw-r--r--llvm/lib/Object/XCOFFObjectFile.cpp110
-rw-r--r--llvm/lib/ObjectYAML/COFFEmitter.cpp4
-rw-r--r--llvm/lib/ObjectYAML/DWARFEmitter.cpp274
-rw-r--r--llvm/lib/ObjectYAML/DWARFVisitor.cpp29
-rw-r--r--llvm/lib/ObjectYAML/DWARFVisitor.h3
-rw-r--r--llvm/lib/ObjectYAML/DWARFYAML.cpp110
-rw-r--r--llvm/lib/ObjectYAML/ELFEmitter.cpp1035
-rw-r--r--llvm/lib/ObjectYAML/ELFYAML.cpp157
-rw-r--r--llvm/lib/ObjectYAML/MachOEmitter.cpp159
-rw-r--r--llvm/lib/ObjectYAML/MachOYAML.cpp15
-rw-r--r--llvm/lib/ObjectYAML/WasmEmitter.cpp39
-rw-r--r--llvm/lib/ObjectYAML/WasmYAML.cpp27
-rw-r--r--llvm/lib/ObjectYAML/yaml2obj.cpp4
-rw-r--r--llvm/lib/Option/Arg.cpp2
-rw-r--r--llvm/lib/Option/ArgList.cpp2
-rw-r--r--llvm/lib/Option/OptTable.cpp6
-rw-r--r--llvm/lib/Passes/PassBuilder.cpp639
-rw-r--r--llvm/lib/Passes/PassRegistry.def46
-rw-r--r--llvm/lib/Passes/StandardInstrumentations.cpp20
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMapping.cpp5
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp377
-rw-r--r--llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp34
-rw-r--r--llvm/lib/ProfileData/GCOV.cpp592
-rw-r--r--llvm/lib/ProfileData/InstrProf.cpp8
-rw-r--r--llvm/lib/ProfileData/InstrProfReader.cpp7
-rw-r--r--llvm/lib/ProfileData/ProfileSummaryBuilder.cpp13
-rw-r--r--llvm/lib/ProfileData/SampleProf.cpp1
-rw-r--r--llvm/lib/ProfileData/SampleProfReader.cpp117
-rw-r--r--llvm/lib/ProfileData/SampleProfWriter.cpp32
-rw-r--r--llvm/lib/Remarks/BitstreamRemarkParser.cpp2
-rw-r--r--llvm/lib/Remarks/Remark.cpp3
-rw-r--r--llvm/lib/Remarks/RemarkLinker.cpp2
-rw-r--r--llvm/lib/Remarks/RemarkStreamer.cpp72
-rw-r--r--llvm/lib/Remarks/RemarkStringTable.cpp4
-rw-r--r--llvm/lib/Remarks/YAMLRemarkParser.cpp1
-rw-r--r--llvm/lib/Remarks/YAMLRemarkParser.h2
-rw-r--r--llvm/lib/Support/AArch64TargetParser.cpp10
-rw-r--r--llvm/lib/Support/AMDGPUMetadata.cpp6
-rw-r--r--llvm/lib/Support/APFloat.cpp446
-rw-r--r--llvm/lib/Support/APInt.cpp25
-rw-r--r--llvm/lib/Support/APSInt.cpp9
-rw-r--r--llvm/lib/Support/ARMAttributeParser.cpp848
-rw-r--r--llvm/lib/Support/ARMBuildAttrs.cpp140
-rw-r--r--llvm/lib/Support/ARMTargetParser.cpp42
-rw-r--r--llvm/lib/Support/BranchProbability.cpp2
-rw-r--r--llvm/lib/Support/CRC.cpp12
-rw-r--r--llvm/lib/Support/CachePruning.cpp2
-rw-r--r--llvm/lib/Support/CodeGenCoverage.cpp17
-rw-r--r--llvm/lib/Support/CommandLine.cpp170
-rw-r--r--llvm/lib/Support/Compression.cpp8
-rw-r--r--llvm/lib/Support/ConvertUTFWrapper.cpp2
-rw-r--r--llvm/lib/Support/CrashRecoveryContext.cpp91
-rw-r--r--llvm/lib/Support/DataExtractor.cpp164
-rw-r--r--llvm/lib/Support/Debug.cpp2
-rw-r--r--llvm/lib/Support/DebugCounter.cpp8
-rw-r--r--llvm/lib/Support/ELFAttributeParser.cpp233
-rw-r--r--llvm/lib/Support/ELFAttributes.cpp34
-rw-r--r--llvm/lib/Support/ErrorHandling.cpp3
-rw-r--r--llvm/lib/Support/ExtensibleRTTI.cpp13
-rw-r--r--llvm/lib/Support/FileCheck.cpp928
-rw-r--r--llvm/lib/Support/FileCheckImpl.h349
-rw-r--r--llvm/lib/Support/FileCollector.cpp67
-rw-r--r--llvm/lib/Support/FileOutputBuffer.cpp6
-rw-r--r--llvm/lib/Support/FileUtilities.cpp10
-rw-r--r--llvm/lib/Support/FoldingSet.cpp51
-rw-r--r--llvm/lib/Support/FormatVariadic.cpp5
-rw-r--r--llvm/lib/Support/FormattedStream.cpp66
-rw-r--r--llvm/lib/Support/GraphWriter.cpp31
-rw-r--r--llvm/lib/Support/Host.cpp552
-rw-r--r--llvm/lib/Support/InitLLVM.cpp2
-rw-r--r--llvm/lib/Support/IntEqClasses.cpp1
-rw-r--r--llvm/lib/Support/IntervalMap.cpp1
-rw-r--r--llvm/lib/Support/ItaniumManglingCanonicalizer.cpp11
-rw-r--r--llvm/lib/Support/KnownBits.cpp26
-rw-r--r--llvm/lib/Support/LockFileManager.cpp80
-rw-r--r--llvm/lib/Support/MD5.cpp1
-rw-r--r--llvm/lib/Support/MemAlloc.cpp34
-rw-r--r--llvm/lib/Support/MemoryBuffer.cpp18
-rw-r--r--llvm/lib/Support/NativeFormatting.cpp5
-rw-r--r--llvm/lib/Support/OptimizedStructLayout.cpp449
-rw-r--r--llvm/lib/Support/Parallel.cpp27
-rw-r--r--llvm/lib/Support/Path.cpp154
-rw-r--r--llvm/lib/Support/PrettyStackTrace.cpp15
-rw-r--r--llvm/lib/Support/Process.cpp12
-rw-r--r--llvm/lib/Support/Program.cpp27
-rw-r--r--llvm/lib/Support/RISCVAttributeParser.cpp67
-rw-r--r--llvm/lib/Support/RISCVAttributes.cpp25
-rw-r--r--llvm/lib/Support/RandomNumberGenerator.cpp2
-rw-r--r--llvm/lib/Support/Regex.cpp8
-rw-r--r--llvm/lib/Support/SHA1.cpp11
-rw-r--r--llvm/lib/Support/Signals.cpp2
-rw-r--r--llvm/lib/Support/SmallVector.cpp47
-rw-r--r--llvm/lib/Support/SourceMgr.cpp254
-rw-r--r--llvm/lib/Support/SpecialCaseList.cpp6
-rw-r--r--llvm/lib/Support/Statistic.cpp2
-rw-r--r--llvm/lib/Support/StringExtras.cpp45
-rw-r--r--llvm/lib/Support/StringMap.cpp52
-rw-r--r--llvm/lib/Support/StringPool.cpp34
-rw-r--r--llvm/lib/Support/StringRef.cpp16
-rw-r--r--llvm/lib/Support/SuffixTree.cpp210
-rw-r--r--llvm/lib/Support/SystemUtils.cpp13
-rw-r--r--llvm/lib/Support/TarWriter.cpp15
-rw-r--r--llvm/lib/Support/TargetParser.cpp10
-rw-r--r--llvm/lib/Support/ThreadPool.cpp46
-rw-r--r--llvm/lib/Support/Threading.cpp67
-rw-r--r--llvm/lib/Support/TimeProfiler.cpp214
-rw-r--r--llvm/lib/Support/Timer.cpp7
-rw-r--r--llvm/lib/Support/ToolOutputFile.cpp28
-rw-r--r--llvm/lib/Support/TrigramIndex.cpp1
-rw-r--r--llvm/lib/Support/Triple.cpp92
-rw-r--r--llvm/lib/Support/Unix/Host.inc2
-rw-r--r--llvm/lib/Support/Unix/Memory.inc1
-rw-r--r--llvm/lib/Support/Unix/Path.inc111
-rw-r--r--llvm/lib/Support/Unix/Process.inc26
-rw-r--r--llvm/lib/Support/Unix/Program.inc87
-rw-r--r--llvm/lib/Support/Unix/Threading.inc35
-rw-r--r--llvm/lib/Support/Unix/Unix.h4
-rw-r--r--llvm/lib/Support/VersionTuple.cpp3
-rw-r--r--llvm/lib/Support/VirtualFileSystem.cpp146
-rw-r--r--llvm/lib/Support/Windows/DynamicLibrary.inc2
-rw-r--r--llvm/lib/Support/Windows/Host.inc2
-rw-r--r--llvm/lib/Support/Windows/Memory.inc2
-rw-r--r--llvm/lib/Support/Windows/Path.inc117
-rw-r--r--llvm/lib/Support/Windows/Process.inc48
-rw-r--r--llvm/lib/Support/Windows/Program.inc30
-rw-r--r--llvm/lib/Support/Windows/Signals.inc28
-rw-r--r--llvm/lib/Support/Windows/ThreadLocal.inc2
-rw-r--r--llvm/lib/Support/Windows/Threading.inc176
-rw-r--r--llvm/lib/Support/Windows/WindowsSupport.h243
-rw-r--r--llvm/lib/Support/WithColor.cpp48
-rw-r--r--llvm/lib/Support/X86TargetParser.cpp595
-rw-r--r--llvm/lib/Support/YAMLParser.cpp24
-rw-r--r--llvm/lib/Support/YAMLTraits.cpp8
-rw-r--r--llvm/lib/Support/Z3Solver.cpp40
-rw-r--r--llvm/lib/Support/raw_ostream.cpp194
-rw-r--r--llvm/lib/TableGen/Main.cpp2
-rw-r--r--llvm/lib/TableGen/Record.cpp32
-rw-r--r--llvm/lib/TableGen/SetTheory.cpp2
-rw-r--r--llvm/lib/TableGen/TGLexer.cpp1
-rw-r--r--llvm/lib/TableGen/TGLexer.h4
-rw-r--r--llvm/lib/TableGen/TGParser.cpp294
-rw-r--r--llvm/lib/TableGen/TGParser.h12
-rw-r--r--llvm/lib/TableGen/TableGenBackend.cpp1
-rw-r--r--llvm/lib/Target/AArch64/AArch64.h9
-rw-r--r--llvm/lib/Target/AArch64/AArch64.td174
-rw-r--r--llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp182
-rw-r--r--llvm/lib/Target/AArch64/AArch64BranchTargets.cpp12
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.cpp15
-rw-r--r--llvm/lib/Target/AArch64/AArch64CallingConvention.td145
-rw-r--r--llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64CollectLOH.cpp21
-rw-r--r--llvm/lib/Target/AArch64/AArch64Combine.td68
-rw-r--r--llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp2
-rw-r--r--llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp16
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp37
-rw-r--r--llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp14
-rw-r--r--llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp327
-rw-r--r--llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp3
-rw-r--r--llvm/lib/Target/AArch64/AArch64FastISel.cpp15
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.cpp702
-rw-r--r--llvm/lib/Target/AArch64/AArch64FrameLowering.h30
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp743
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.cpp3005
-rw-r--r--llvm/lib/Target/AArch64/AArch64ISelLowering.h236
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrFormats.td560
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrGISel.td124
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.cpp557
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.h99
-rw-r--r--llvm/lib/Target/AArch64/AArch64InstrInfo.td632
-rw-r--r--llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp122
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp32
-rw-r--r--llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h32
-rw-r--r--llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp23
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp140
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.h19
-rw-r--r--llvm/lib/Target/AArch64/AArch64RegisterInfo.td34
-rw-r--r--llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SLSHardening.cpp443
-rw-r--r--llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td2140
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA53.td3
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedA57.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedCyclone.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM3.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM4.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedExynosM5.td5
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedFalkor.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedKryo.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX.td4
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td6
-rw-r--r--llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td1997
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp29
-rw-r--r--llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h3
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackOffset.h1
-rw-r--r--llvm/lib/Target/AArch64/AArch64StackTagging.cpp49
-rw-r--r--llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp4
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.cpp67
-rw-r--r--llvm/lib/Target/AArch64/AArch64Subtarget.h61
-rw-r--r--llvm/lib/Target/AArch64/AArch64SystemOperands.td60
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.cpp67
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetMachine.h8
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp5
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetObjectFile.h5
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp202
-rw-r--r--llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h85
-rw-r--r--llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp95
-rw-r--r--llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp36
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp (renamed from llvm/lib/Target/AArch64/AArch64CallLowering.cpp)96
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h (renamed from llvm/lib/Target/AArch64/AArch64CallLowering.h)2
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp (renamed from llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp)1396
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp (renamed from llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp)116
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h (renamed from llvm/lib/Target/AArch64/AArch64LegalizerInfo.h)13
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp507
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp (renamed from llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp)49
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp (renamed from llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp)202
-rw-r--r--llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h (renamed from llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h)0
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h7
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp84
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp17
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp28
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp52
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h21
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp4
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp31
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp16
-rw-r--r--llvm/lib/Target/AArch64/SVEInstrFormats.td1918
-rw-r--r--llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp265
-rw-r--r--llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.h29
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPU.td278
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp69
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp119
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h35
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp201
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h16
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp247
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h10
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td26
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp714
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUCombine.td69
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp150
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUFeatures.td23
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGISel.td114
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def80
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp9
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h33
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp118
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h15
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp583
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp493
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h33
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInline.cpp44
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h3
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td90
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp2341
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h122
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUInstructions.td42
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp2747
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h93
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp56
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULibFunc.h5
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp27
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp18
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h14
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp1
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp359
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp153
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp25
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp255
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp137
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp154
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp2136
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h27
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td6
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp142
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h38
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td21
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td11
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp181
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h172
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp113
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h4
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h2
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp507
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h94
-rw-r--r--llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp112
-rw-r--r--llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp280
-rw-r--r--llvm/lib/Target/AMDGPU/BUFInstructions.td183
-rw-r--r--llvm/lib/Target/AMDGPU/CaymanInstructions.td9
-rw-r--r--llvm/lib/Target/AMDGPU/DSInstructions.td59
-rw-r--r--llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp46
-rw-r--r--llvm/lib/Target/AMDGPU/EvergreenInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/FLATInstructions.td75
-rw-r--r--llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp37
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp15
-rw-r--r--llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h1
-rw-r--r--llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h8
-rw-r--r--llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/GCNProcessors.td4
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp109
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.cpp27
-rw-r--r--llvm/lib/Target/AMDGPU/GCNRegPressure.h10
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp123
-rw-r--r--llvm/lib/Target/AMDGPU/GCNSchedStrategy.h12
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp12
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp73
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h13
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h6
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp100
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h7
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp41
-rw-r--r--llvm/lib/Target/AMDGPU/MIMGInstructions.td239
-rw-r--r--llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp14
-rw-r--r--llvm/lib/Target/AMDGPU/R600AsmPrinter.h2
-rw-r--r--llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp7
-rw-r--r--llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/R600FrameLowering.cpp11
-rw-r--r--llvm/lib/Target/AMDGPU/R600FrameLowering.h4
-rw-r--r--llvm/lib/Target/AMDGPU/R600ISelLowering.cpp22
-rw-r--r--llvm/lib/Target/AMDGPU/R600InstrInfo.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/R600Instructions.td9
-rw-r--r--llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp32
-rw-r--r--llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp22
-rw-r--r--llvm/lib/Target/AMDGPU/R600RegisterInfo.h11
-rw-r--r--llvm/lib/Target/AMDGPU/R600RegisterInfo.td6
-rw-r--r--llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp8
-rw-r--r--llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp2
-rw-r--r--llvm/lib/Target/AMDGPU/SIDefines.h26
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp33
-rw-r--r--llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SIFoldOperands.cpp51
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.cpp997
-rw-r--r--llvm/lib/Target/AMDGPU/SIFrameLowering.h37
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.cpp2276
-rw-r--r--llvm/lib/Target/AMDGPU/SIISelLowering.h63
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp203
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertSkips.cpp374
-rw-r--r--llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp496
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrFormats.td5
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.cpp1090
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.h67
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstrInfo.td98
-rw-r--r--llvm/lib/Target/AMDGPU/SIInstructions.td677
-rw-r--r--llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp627
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp293
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp5
-rw-r--r--llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp60
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp190
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h142
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/SIMachineScheduler.h6
-rw-r--r--llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp28
-rw-r--r--llvm/lib/Target/AMDGPU/SIModeRegister.cpp59
-rw-r--r--llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp175
-rw-r--r--llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp37
-rw-r--r--llvm/lib/Target/AMDGPU/SIPostRABundler.cpp139
-rw-r--r--llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp326
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp1387
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.h165
-rw-r--r--llvm/lib/Target/AMDGPU/SIRegisterInfo.td453
-rw-r--r--llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp10
-rw-r--r--llvm/lib/Target/AMDGPU/SISchedule.td39
-rw-r--r--llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp147
-rw-r--r--llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp59
-rw-r--r--llvm/lib/Target/AMDGPU/SMInstructions.td104
-rw-r--r--llvm/lib/Target/AMDGPU/SOPInstructions.td153
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp6
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp204
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h153
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp114
-rw-r--r--llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h6
-rw-r--r--llvm/lib/Target/AMDGPU/VIInstructions.td13
-rw-r--r--llvm/lib/Target/AMDGPU/VOP1Instructions.td139
-rw-r--r--llvm/lib/Target/AMDGPU/VOP2Instructions.td76
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3Instructions.td242
-rw-r--r--llvm/lib/Target/AMDGPU/VOP3PInstructions.td92
-rw-r--r--llvm/lib/Target/AMDGPU/VOPCInstructions.td10
-rw-r--r--llvm/lib/Target/AMDGPU/VOPInstructions.td43
-rw-r--r--llvm/lib/Target/ARC/ARCAsmPrinter.cpp12
-rw-r--r--llvm/lib/Target/ARC/ARCFrameLowering.cpp14
-rw-r--r--llvm/lib/Target/ARC/ARCFrameLowering.h4
-rw-r--r--llvm/lib/Target/ARC/ARCISelLowering.cpp14
-rw-r--r--llvm/lib/Target/ARC/ARCInstrFormats.td48
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.cpp16
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.h4
-rw-r--r--llvm/lib/Target/ARC/ARCInstrInfo.td30
-rw-r--r--llvm/lib/Target/ARC/ARCMachineFunctionInfo.h5
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.h2
-rw-r--r--llvm/lib/Target/ARC/ARCRegisterInfo.td16
-rw-r--r--llvm/lib/Target/ARC/ARCTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h4
-rw-r--r--llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARM.h2
-rw-r--r--llvm/lib/Target/ARM/ARM.td73
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.cpp267
-rw-r--r--llvm/lib/Target/ARM/ARMAsmPrinter.h26
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp662
-rw-r--r--llvm/lib/Target/ARM/ARMBaseInstrInfo.h190
-rw-r--r--llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp95
-rw-r--r--llvm/lib/Target/ARM/ARMBaseRegisterInfo.h29
-rw-r--r--llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMBasicBlockInfo.h6
-rw-r--r--llvm/lib/Target/ARM/ARMCallLowering.cpp57
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.cpp65
-rw-r--r--llvm/lib/Target/ARM/ARMCallingConv.td48
-rw-r--r--llvm/lib/Target/ARM/ARMConstantIslandPass.cpp74
-rw-r--r--llvm/lib/Target/ARM/ARMConstantPoolValue.cpp10
-rw-r--r--llvm/lib/Target/ARM/ARMConstantPoolValue.h14
-rw-r--r--llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp840
-rw-r--r--llvm/lib/Target/ARM/ARMFastISel.cpp94
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.cpp239
-rw-r--r--llvm/lib/Target/ARM/ARMFrameLowering.h41
-rw-r--r--llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp483
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.cpp2403
-rw-r--r--llvm/lib/Target/ARM/ARMISelLowering.h84
-rw-r--r--llvm/lib/Target/ARM/ARMInstrCDE.td666
-rw-r--r--llvm/lib/Target/ARM/ARMInstrFormats.td10
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMInstrInfo.td129
-rw-r--r--llvm/lib/Target/ARM/ARMInstrMVE.td2090
-rw-r--r--llvm/lib/Target/ARM/ARMInstrNEON.td486
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb.td14
-rw-r--r--llvm/lib/Target/ARM/ARMInstrThumb2.td83
-rw-r--r--llvm/lib/Target/ARM/ARMInstrVFP.td276
-rw-r--r--llvm/lib/Target/ARM/ARMInstructionSelector.cpp32
-rw-r--r--llvm/lib/Target/ARM/ARMLegalizerInfo.cpp14
-rw-r--r--llvm/lib/Target/ARM/ARMLegalizerInfo.h4
-rw-r--r--llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp311
-rw-r--r--llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp1116
-rw-r--r--llvm/lib/Target/ARM/ARMMCInstLower.cpp13
-rw-r--r--llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp4
-rw-r--r--llvm/lib/Target/ARM/ARMMachineFunctionInfo.h18
-rw-r--r--llvm/lib/Target/ARM/ARMParallelDSP.cpp32
-rw-r--r--llvm/lib/Target/ARM/ARMPredicates.td130
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp60
-rw-r--r--llvm/lib/Target/ARM/ARMRegisterInfo.td24
-rw-r--r--llvm/lib/Target/ARM/ARMScheduleA57.td2
-rw-r--r--llvm/lib/Target/ARM/ARMScheduleSwift.td6
-rw-r--r--llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp24
-rw-r--r--llvm/lib/Target/ARM/ARMSelectionDAGInfo.h8
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.cpp5
-rw-r--r--llvm/lib/Target/ARM/ARMSubtarget.h19
-rw-r--r--llvm/lib/Target/ARM/ARMTargetMachine.cpp20
-rw-r--r--llvm/lib/Target/ARM/ARMTargetObjectFile.cpp2
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp532
-rw-r--r--llvm/lib/Target/ARM/ARMTargetTransformInfo.h88
-rw-r--r--llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp442
-rw-r--r--llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp59
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp127
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h12
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h14
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp50
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp147
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp13
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h25
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp5
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp8
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp42
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h3
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp2
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h4
-rw-r--r--llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp12
-rw-r--r--llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp911
-rw-r--r--llvm/lib/Target/ARM/MVETailPredication.cpp592
-rw-r--r--llvm/lib/Target/ARM/MVEVPTBlockPass.cpp286
-rw-r--r--llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp464
-rw-r--r--llvm/lib/Target/ARM/Thumb1FrameLowering.cpp54
-rw-r--r--llvm/lib/Target/ARM/Thumb1FrameLowering.h11
-rw-r--r--llvm/lib/Target/ARM/Thumb1InstrInfo.cpp8
-rw-r--r--llvm/lib/Target/ARM/Thumb1InstrInfo.h4
-rw-r--r--llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp6
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.cpp58
-rw-r--r--llvm/lib/Target/ARM/Thumb2InstrInfo.h19
-rw-r--r--llvm/lib/Target/ARM/Thumb2SizeReduction.cpp26
-rw-r--r--llvm/lib/Target/ARM/ThumbRegisterInfo.cpp31
-rw-r--r--llvm/lib/Target/ARM/ThumbRegisterInfo.h8
-rw-r--r--llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp31
-rw-r--r--llvm/lib/Target/ARM/Utils/ARMBaseInfo.h66
-rw-r--r--llvm/lib/Target/AVR/AVRAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRCallingConv.td18
-rw-r--r--llvm/lib/Target/AVR/AVRDevices.td18
-rw-r--r--llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp276
-rw-r--r--llvm/lib/Target/AVR/AVRFrameLowering.cpp119
-rw-r--r--llvm/lib/Target/AVR/AVRFrameLowering.h4
-rw-r--r--llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp4
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.cpp411
-rw-r--r--llvm/lib/Target/AVR/AVRISelLowering.h6
-rw-r--r--llvm/lib/Target/AVR/AVRInstrFormats.td20
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.cpp12
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.h4
-rw-r--r--llvm/lib/Target/AVR/AVRInstrInfo.td70
-rw-r--r--llvm/lib/Target/AVR/AVRMachineFunctionInfo.h20
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.cpp27
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.h6
-rw-r--r--llvm/lib/Target/AVR/AVRRegisterInfo.td27
-rw-r--r--llvm/lib/Target/AVR/AVRSubtarget.cpp11
-rw-r--r--llvm/lib/Target/AVR/AVRSubtarget.h19
-rw-r--r--llvm/lib/Target/AVR/AVRTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/AVR/AVRTargetObjectFile.cpp2
-rw-r--r--llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp50
-rw-r--r--llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp159
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp96
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h6
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp33
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h11
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp4
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h2
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h3
-rw-r--r--llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp4
-rw-r--r--llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp16
-rw-r--r--llvm/lib/Target/BPF/BPF.h2
-rw-r--r--llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp63
-rw-r--r--llvm/lib/Target/BPF/BPFAsmPrinter.cpp4
-rw-r--r--llvm/lib/Target/BPF/BPFCORE.h18
-rw-r--r--llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/BPF/BPFISelLowering.cpp40
-rw-r--r--llvm/lib/Target/BPF/BPFISelLowering.h28
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.cpp4
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.h4
-rw-r--r--llvm/lib/Target/BPF/BPFInstrInfo.td5
-rw-r--r--llvm/lib/Target/BPF/BPFMCInstLower.h2
-rw-r--r--llvm/lib/Target/BPF/BPFMIPeephole.cpp69
-rw-r--r--llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp107
-rw-r--r--llvm/lib/Target/BPF/BPFPreserveDIType.cpp131
-rw-r--r--llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp6
-rw-r--r--llvm/lib/Target/BPF/BPFSelectionDAGInfo.h4
-rw-r--r--llvm/lib/Target/BPF/BPFTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.cpp282
-rw-r--r--llvm/lib/Target/BPF/BTFDebug.h68
-rw-r--r--llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp3
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp3
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h3
-rw-r--r--llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp148
-rw-r--r--llvm/lib/Target/Hexagon/BitTracker.cpp3
-rw-r--r--llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp29
-rw-r--r--llvm/lib/Target/Hexagon/Hexagon.td99
-rw-r--r--llvm/lib/Target/Hexagon/HexagonArch.h37
-rw-r--r--llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp20
-rwxr-xr-xllvm/lib/Target/Hexagon/HexagonAsmPrinter.h2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBitTracker.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCallingConv.td32
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp15
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp38
-rw-r--r--llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp17
-rw-r--r--llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp14
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepArch.h39
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepArch.td26
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepDecoders.inc40
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepIICHVX.td493
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepIICScalar.td7665
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepITypes.h87
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepITypes.td87
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td5827
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td6032
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td5948
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepMappings.td11
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepMask.h2821
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepOperands.td182
-rw-r--r--llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h200
-rw-r--r--llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp275
-rw-r--r--llvm/lib/Target/Hexagon/HexagonFrameLowering.h19
-rw-r--r--llvm/lib/Target/Hexagon/HexagonGenExtract.cpp5
-rw-r--r--llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonIICScalar.td2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp81
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp26
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.cpp330
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLowering.h37
-rw-r--r--llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp224
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrFormats.td103
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrFormatsV5.td86
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp260
-rw-r--r--llvm/lib/Target/Hexagon/HexagonInstrInfo.h43
-rw-r--r--llvm/lib/Target/Hexagon/HexagonIntrinsics.td76
-rw-r--r--llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td64
-rw-r--r--llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h12
-rw-r--r--llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp17
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPatterns.td70
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPeephole.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/HexagonPseudo.td18
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.h4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonRegisterInfo.td82
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSchedule.td26
-rw-r--r--llvm/lib/Target/Hexagon/HexagonScheduleV67.td39
-rw-r--r--llvm/lib/Target/Hexagon/HexagonScheduleV67T.td61
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h4
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp9
-rw-r--r--llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.cpp51
-rw-r--r--llvm/lib/Target/Hexagon/HexagonSubtarget.h55
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h1
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetStreamer.h6
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp117
-rw-r--r--llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h52
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVExtract.cpp18
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp74
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h8
-rw-r--r--llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp7
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp23
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h8
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp31
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h5
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp24
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp10
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h2
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h1
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp167
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h43
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp165
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h10
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp730
-rw-r--r--llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h83
-rw-r--r--llvm/lib/Target/Hexagon/RDFCopy.cpp6
-rw-r--r--llvm/lib/Target/Hexagon/RDFCopy.h6
-rw-r--r--llvm/lib/Target/Hexagon/RDFDeadCode.cpp4
-rw-r--r--llvm/lib/Target/Hexagon/RDFDeadCode.h4
-rw-r--r--llvm/lib/Target/Hexagon/RDFGraph.h968
-rw-r--r--llvm/lib/Target/Hexagon/RDFLiveness.h151
-rw-r--r--llvm/lib/Target/Hexagon/RDFRegisters.h240
-rw-r--r--llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp36
-rw-r--r--llvm/lib/Target/Lanai/Lanai.h3
-rw-r--r--llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp16
-rw-r--r--llvm/lib/Target/Lanai/LanaiFrameLowering.cpp4
-rw-r--r--llvm/lib/Target/Lanai/LanaiISelLowering.cpp14
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.cpp32
-rw-r--r--llvm/lib/Target/Lanai/LanaiInstrInfo.h21
-rw-r--r--llvm/lib/Target/Lanai/LanaiMCInstLower.h2
-rw-r--r--llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp9
-rw-r--r--llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h14
-rw-r--r--llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/Lanai/LanaiRegisterInfo.h2
-rw-r--r--llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h4
-rw-r--r--llvm/lib/Target/Lanai/LanaiSubtarget.cpp2
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetMachine.h1
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp11
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetObjectFile.h3
-rw-r--r--llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h18
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp4
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h7
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h5
-rw-r--r--llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp51
-rw-r--r--llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp4
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp3
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp24
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h7
-rw-r--r--llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp3
-rw-r--r--llvm/lib/Target/MSP430/MSP430.h1
-rw-r--r--llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp8
-rw-r--r--llvm/lib/Target/MSP430/MSP430FrameLowering.cpp30
-rw-r--r--llvm/lib/Target/MSP430/MSP430FrameLowering.h11
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp12
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.cpp16
-rw-r--r--llvm/lib/Target/MSP430/MSP430ISelLowering.h4
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.cpp20
-rw-r--r--llvm/lib/Target/MSP430/MSP430InstrInfo.h5
-rw-r--r--llvm/lib/Target/MSP430/MSP430MCInstLower.h1
-rw-r--r--llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h6
-rw-r--r--llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp12
-rw-r--r--llvm/lib/Target/MSP430/MSP430RegisterInfo.td10
-rw-r--r--llvm/lib/Target/MSP430/MSP430Subtarget.cpp2
-rw-r--r--llvm/lib/Target/MSP430/MSP430TargetMachine.cpp2
-rw-r--r--llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp307
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp22
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h1
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h13
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp61
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp28
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h14
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h11
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp25
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h2
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp28
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp34
-rw-r--r--llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp72
-rw-r--r--llvm/lib/Target/Mips/MicroMipsInstrFormats.td2
-rw-r--r--llvm/lib/Target/Mips/MicroMipsInstrInfo.td2
-rw-r--r--llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp12
-rw-r--r--llvm/lib/Target/Mips/Mips.td35
-rw-r--r--llvm/lib/Target/Mips/Mips16FrameLowering.cpp19
-rw-r--r--llvm/lib/Target/Mips/Mips16FrameLowering.h11
-rw-r--r--llvm/lib/Target/Mips/Mips16HardFloat.cpp10
-rw-r--r--llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/Mips/Mips16ISelLowering.cpp2
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.cpp6
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.h4
-rw-r--r--llvm/lib/Target/Mips/Mips16InstrInfo.td2
-rw-r--r--llvm/lib/Target/Mips/Mips16RegisterInfo.cpp12
-rw-r--r--llvm/lib/Target/Mips/Mips16RegisterInfo.h9
-rw-r--r--llvm/lib/Target/Mips/Mips64InstrInfo.td14
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.cpp76
-rw-r--r--llvm/lib/Target/Mips/MipsAsmPrinter.h20
-rw-r--r--llvm/lib/Target/Mips/MipsBranchExpansion.cpp21
-rw-r--r--llvm/lib/Target/Mips/MipsCCState.cpp6
-rw-r--r--llvm/lib/Target/Mips/MipsCallLowering.cpp114
-rw-r--r--llvm/lib/Target/Mips/MipsConstantIslandPass.cpp17
-rw-r--r--llvm/lib/Target/Mips/MipsDSPInstrFormats.td6
-rw-r--r--llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsFastISel.cpp12
-rw-r--r--llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp4
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.cpp362
-rw-r--r--llvm/lib/Target/Mips/MipsISelLowering.h17
-rw-r--r--llvm/lib/Target/Mips/MipsInstrFPU.td34
-rw-r--r--llvm/lib/Target/Mips/MipsInstrFormats.td47
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.cpp58
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.h18
-rw-r--r--llvm/lib/Target/Mips/MipsInstrInfo.td145
-rw-r--r--llvm/lib/Target/Mips/MipsInstructionSelector.cpp125
-rw-r--r--llvm/lib/Target/Mips/MipsLegalizerInfo.cpp221
-rw-r--r--llvm/lib/Target/Mips/MipsLegalizerInfo.h8
-rw-r--r--llvm/lib/Target/Mips/MipsMSAInstrInfo.td20
-rw-r--r--llvm/lib/Target/Mips/MipsMachineFunction.cpp36
-rw-r--r--llvm/lib/Target/Mips/MipsMachineFunction.h26
-rw-r--r--llvm/lib/Target/Mips/MipsOptimizePICCall.cpp3
-rw-r--r--llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp15
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp142
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterBankInfo.h60
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterInfo.cpp7
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterInfo.h2
-rw-r--r--llvm/lib/Target/Mips/MipsRegisterInfo.td16
-rw-r--r--llvm/lib/Target/Mips/MipsSEFrameLowering.cpp34
-rw-r--r--llvm/lib/Target/Mips/MipsSEFrameLowering.h4
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp79
-rw-r--r--llvm/lib/Target/Mips/MipsSEISelLowering.cpp5
-rw-r--r--llvm/lib/Target/Mips/MipsSEInstrInfo.cpp18
-rw-r--r--llvm/lib/Target/Mips/MipsSEInstrInfo.h6
-rw-r--r--llvm/lib/Target/Mips/MipsSERegisterInfo.h1
-rw-r--r--llvm/lib/Target/Mips/MipsSchedule.td2
-rw-r--r--llvm/lib/Target/Mips/MipsScheduleGeneric.td8
-rw-r--r--llvm/lib/Target/Mips/MipsScheduleP5600.td5
-rw-r--r--llvm/lib/Target/Mips/MipsSubtarget.cpp2
-rw-r--r--llvm/lib/Target/Mips/MipsSubtarget.h4
-rw-r--r--llvm/lib/Target/Mips/MipsTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/Mips/MipsTargetObjectFile.cpp6
-rw-r--r--llvm/lib/Target/Mips/MipsTargetObjectFile.h2
-rw-r--r--llvm/lib/Target/Mips/MipsTargetStreamer.h9
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h1
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h5
-rw-r--r--llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.h1
-rw-r--r--llvm/lib/Target/NVPTX/NVPTX.td7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp96
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h13
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXFrameLowering.h5
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp164
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXISelLowering.h7
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp4
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp6
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp35
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp10
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp9
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h2
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp14
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h11
-rw-r--r--llvm/lib/Target/NVPTX/NVPTXUtilities.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp83
-rw-r--r--llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp76
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp79
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp23
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp112
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h54
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h7
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp75
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h13
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp11
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h2
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp124
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h14
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp63
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h26
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp84
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h3
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp380
-rw-r--r--llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp49
-rw-r--r--llvm/lib/Target/PowerPC/P9InstrResources.td1
-rw-r--r--llvm/lib/Target/PowerPC/PPC.h37
-rw-r--r--llvm/lib/Target/PowerPC/PPC.td95
-rw-r--r--llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp593
-rw-r--r--llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp5
-rw-r--r--llvm/lib/Target/PowerPC/PPCBranchSelector.cpp35
-rw-r--r--llvm/lib/Target/PowerPC/PPCCTRLoops.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCCallingConv.td44
-rw-r--r--llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp6
-rw-r--r--llvm/lib/Target/PowerPC/PPCExpandISEL.cpp57
-rw-r--r--llvm/lib/Target/PowerPC/PPCFastISel.cpp91
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.cpp942
-rw-r--r--llvm/lib/Target/PowerPC/PPCFrameLowering.h21
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp510
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.cpp2855
-rw-r--r--llvm/lib/Target/PowerPC/PPCISelLowering.h181
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstr64Bit.td132
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrAltivec.td59
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrFormats.td6
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrHTM.td7
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.cpp1397
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.h185
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrInfo.td495
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrPrefix.td1035
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrQPX.td31
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrSPE.td16
-rw-r--r--llvm/lib/Target/PowerPC/PPCInstrVSX.td5786
-rw-r--r--llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp12
-rw-r--r--llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp34
-rw-r--r--llvm/lib/Target/PowerPC/PPCMCInstLower.cpp78
-rw-r--r--llvm/lib/Target/PowerPC/PPCMIPeephole.cpp47
-rw-r--r--llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp23
-rw-r--r--llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h41
-rw-r--r--llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp52
-rw-r--r--llvm/lib/Target/PowerPC/PPCMachineScheduler.h3
-rw-r--r--llvm/lib/Target/PowerPC/PPCMacroFusion.cpp203
-rw-r--r--llvm/lib/Target/PowerPC/PPCMacroFusion.def45
-rw-r--r--llvm/lib/Target/PowerPC/PPCMacroFusion.h22
-rw-r--r--llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp10
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp288
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.h22
-rw-r--r--llvm/lib/Target/PowerPC/PPCRegisterInfo.td20
-rw-r--r--llvm/lib/Target/PowerPC/PPCScheduleP9.td10
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.cpp38
-rw-r--r--llvm/lib/Target/PowerPC/PPCSubtarget.h95
-rw-r--r--llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp4
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetMachine.cpp21
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp2
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp167
-rw-r--r--llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h44
-rw-r--r--llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp584
-rw-r--r--llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp49
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp172
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h51
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp2
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp103
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h90
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp35
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h15
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp51
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp69
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h9
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp52
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp70
-rw-r--r--llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h33
-rw-r--r--llvm/lib/Target/RISCV/RISCV.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCV.td142
-rw-r--r--llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp61
-rw-r--r--llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp618
-rw-r--r--llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp523
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.cpp364
-rw-r--r--llvm/lib/Target/RISCV/RISCVFrameLowering.h14
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp107
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h56
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.cpp152
-rw-r--r--llvm/lib/Target/RISCV/RISCVISelLowering.h61
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormats.td21
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrFormatsV.td300
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.cpp44
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.h25
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfo.td171
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoA.td72
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoB.td634
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoC.td156
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoD.td104
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoF.td106
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoM.td39
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstrInfoV.td873
-rw-r--r--llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp1
-rw-r--r--llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h21
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp49
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.h11
-rw-r--r--llvm/lib/Target/RISCV/RISCVRegisterInfo.td99
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedRocket32.td227
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedRocket64.td228
-rw-r--r--llvm/lib/Target/RISCV/RISCVSchedule.td147
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.cpp2
-rw-r--r--llvm/lib/Target/RISCV/RISCVSubtarget.h28
-rw-r--r--llvm/lib/Target/RISCV/RISCVSystemOperands.td27
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetMachine.cpp20
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp6
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetObjectFile.h3
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp10
-rw-r--r--llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h9
-rw-r--r--llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp24
-rw-r--r--llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h5
-rw-r--r--llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp19
-rwxr-xr-xllvm/lib/Target/Sparc/LeonFeatures.td16
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp5
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp3
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h8
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp4
-rw-r--r--llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h4
-rw-r--r--llvm/lib/Target/Sparc/Sparc.h1
-rw-r--r--llvm/lib/Target/Sparc/Sparc.td6
-rw-r--r--llvm/lib/Target/Sparc/SparcAsmPrinter.cpp21
-rw-r--r--llvm/lib/Target/Sparc/SparcCallingConv.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcFrameLowering.cpp17
-rw-r--r--llvm/lib/Target/Sparc/SparcFrameLowering.h2
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.cpp102
-rw-r--r--llvm/lib/Target/Sparc/SparcISelLowering.h4
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrAliases.td4
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrFormats.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.cpp15
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.h6
-rw-r--r--llvm/lib/Target/Sparc/SparcInstrInfo.td46
-rw-r--r--llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h12
-rw-r--r--llvm/lib/Target/Sparc/SparcRegisterInfo.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcRegisterInfo.td4
-rwxr-xr-xllvm/lib/Target/Sparc/SparcSchedule.td2
-rw-r--r--llvm/lib/Target/Sparc/SparcSubtarget.cpp2
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetMachine.cpp3
-rw-r--r--llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp386
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h11
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp4
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZ.h1
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp48
-rw-r--r--llvm/lib/Target/SystemZ/SystemZAsmPrinter.h6
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCallingConv.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp120
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFeatures.td87
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp323
-rw-r--r--llvm/lib/Target/SystemZ/SystemZFrameLowering.h25
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.cpp655
-rw-r--r--llvm/lib/Target/SystemZ/SystemZISelLowering.h35
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrBuilder.h3
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFP.td24
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrFormats.td198
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp318
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.h41
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrInfo.td21
-rw-r--r--llvm/lib/Target/SystemZ/SystemZInstrVector.td418
-rw-r--r--llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h16
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperands.td60
-rw-r--r--llvm/lib/Target/SystemZ/SystemZOperators.td8
-rw-r--r--llvm/lib/Target/SystemZ/SystemZPatterns.td6
-rw-r--r--llvm/lib/Target/SystemZ/SystemZProcessors.td2
-rw-r--r--llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp21
-rw-r--r--llvm/lib/Target/SystemZ/SystemZRegisterInfo.h9
-rw-r--r--llvm/lib/Target/SystemZ/SystemZRegisterInfo.td6
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp16
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h7
-rw-r--r--llvm/lib/Target/SystemZ/SystemZShortenInst.cpp45
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSubtarget.cpp19
-rw-r--r--llvm/lib/Target/SystemZ/SystemZSubtarget.h4
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTDC.cpp14
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp52
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetMachine.h13
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp421
-rw-r--r--llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h51
-rw-r--r--llvm/lib/Target/Target.cpp8
-rw-r--r--llvm/lib/Target/TargetLoweringObjectFile.cpp38
-rw-r--r--llvm/lib/Target/TargetMachine.cpp32
-rw-r--r--llvm/lib/Target/TargetMachineC.cpp4
-rw-r--r--llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp1454
-rw-r--r--llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp560
-rw-r--r--llvm/lib/Target/VE/InstPrinter/VEInstPrinter.cpp118
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp224
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp135
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h61
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp227
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h (renamed from llvm/lib/Target/VE/InstPrinter/VEInstPrinter.h)29
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp1
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp165
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp225
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h95
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp11
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h10
-rw-r--r--llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp2
-rw-r--r--llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp3
-rw-r--r--llvm/lib/Target/VE/TargetInfo/VETargetInfo.h20
-rw-r--r--llvm/lib/Target/VE/VE.h293
-rw-r--r--llvm/lib/Target/VE/VE.td8
-rw-r--r--llvm/lib/Target/VE/VEAsmPrinter.cpp283
-rw-r--r--llvm/lib/Target/VE/VECallingConv.td70
-rw-r--r--llvm/lib/Target/VE/VEFrameLowering.cpp185
-rw-r--r--llvm/lib/Target/VE/VEFrameLowering.h24
-rw-r--r--llvm/lib/Target/VE/VEISelDAGToDAG.cpp269
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.cpp863
-rw-r--r--llvm/lib/Target/VE/VEISelLowering.h49
-rw-r--r--llvm/lib/Target/VE/VEInstrFormats.td176
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.cpp494
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.h43
-rw-r--r--llvm/lib/Target/VE/VEInstrInfo.td1991
-rw-r--r--llvm/lib/Target/VE/VEMCInstLower.cpp17
-rw-r--r--llvm/lib/Target/VE/VEMachineFunctionInfo.cpp13
-rw-r--r--llvm/lib/Target/VE/VEMachineFunctionInfo.h48
-rw-r--r--llvm/lib/Target/VE/VERegisterInfo.cpp62
-rw-r--r--llvm/lib/Target/VE/VERegisterInfo.h2
-rw-r--r--llvm/lib/Target/VE/VERegisterInfo.td131
-rw-r--r--llvm/lib/Target/VE/VESubtarget.cpp2
-rw-r--r--llvm/lib/Target/VE/VESubtarget.h2
-rw-r--r--llvm/lib/Target/VE/VETargetMachine.cpp8
-rw-r--r--llvm/lib/Target/VE/VETargetMachine.h2
-rw-r--r--llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp45
-rw-r--r--llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp20
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h1
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp36
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h3
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp4
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h434
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h1
-rw-r--r--llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp6
-rw-r--r--llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.h17
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssembly.td7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp67
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h19
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp32
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp155
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp150
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp138
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp8
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp23
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h19
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp89
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp72
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp155
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp13
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp58
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp128
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h11
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISD.def3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp76
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp564
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h6
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td837
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td30
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td167
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td33
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td16
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td49
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td480
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td544
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp34
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp178
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp30
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp29
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp34
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h36
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp7
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp3
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp179
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp21
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp12
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp28
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h15
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp17
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h10
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp55
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h2
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp14
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h1
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp19
-rw-r--r--llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h4
-rw-r--r--llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt1
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp616
-rw-r--r--llvm/lib/Target/X86/AsmParser/X86Operand.h36
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp33
-rw-r--r--llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h15
-rw-r--r--llvm/lib/Target/X86/ImmutableGraph.h445
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp5
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h14
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp971
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h107
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp4
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp195
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp26
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h4
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp3
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h15
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp18
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h8
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp696
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp25
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h13
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp (renamed from llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp)37
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h (renamed from llvm/lib/Target/X86/Utils/X86ShuffleDecode.h)18
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp8
-rw-r--r--llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp24
-rw-r--r--llvm/lib/Target/X86/X86.h22
-rw-r--r--llvm/lib/Target/X86/X86.td73
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.cpp99
-rw-r--r--llvm/lib/Target/X86/X86AsmPrinter.h21
-rw-r--r--llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp47
-rw-r--r--llvm/lib/Target/X86/X86AvoidTrailingCall.cpp97
-rw-r--r--llvm/lib/Target/X86/X86CallFrameOptimization.cpp15
-rw-r--r--llvm/lib/Target/X86/X86CallLowering.cpp34
-rw-r--r--llvm/lib/Target/X86/X86CallLowering.h2
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.cpp10
-rw-r--r--llvm/lib/Target/X86/X86CallingConv.td5
-rw-r--r--llvm/lib/Target/X86/X86CmovConversion.cpp13
-rw-r--r--llvm/lib/Target/X86/X86DiscriminateMemOps.cpp2
-rw-r--r--llvm/lib/Target/X86/X86DomainReassignment.cpp30
-rwxr-xr-xllvm/lib/Target/X86/X86EvexToVex.cpp6
-rw-r--r--llvm/lib/Target/X86/X86ExpandPseudo.cpp89
-rw-r--r--llvm/lib/Target/X86/X86FastISel.cpp273
-rw-r--r--llvm/lib/Target/X86/X86FixupBWInsts.cpp2
-rw-r--r--llvm/lib/Target/X86/X86FixupLEAs.cpp19
-rw-r--r--llvm/lib/Target/X86/X86FixupSetCC.cpp8
-rw-r--r--llvm/lib/Target/X86/X86FlagsCopyLowering.cpp155
-rw-r--r--llvm/lib/Target/X86/X86FloatingPoint.cpp3
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.cpp448
-rw-r--r--llvm/lib/Target/X86/X86FrameLowering.h68
-rw-r--r--llvm/lib/Target/X86/X86ISelDAGToDAG.cpp1219
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.cpp10153
-rw-r--r--llvm/lib/Target/X86/X86ISelLowering.h1728
-rw-r--r--llvm/lib/Target/X86/X86IndirectBranchTracking.cpp59
-rw-r--r--llvm/lib/Target/X86/X86IndirectThunks.cpp270
-rw-r--r--llvm/lib/Target/X86/X86InsertPrefetch.cpp2
-rw-r--r--llvm/lib/Target/X86/X86InsertWait.cpp151
-rw-r--r--llvm/lib/Target/X86/X86InstrAMX.td119
-rw-r--r--llvm/lib/Target/X86/X86InstrAVX512.td2614
-rw-r--r--llvm/lib/Target/X86/X86InstrArithmetic.td179
-rw-r--r--llvm/lib/Target/X86/X86InstrBuilder.h2
-rw-r--r--llvm/lib/Target/X86/X86InstrCompiler.td173
-rw-r--r--llvm/lib/Target/X86/X86InstrControl.td53
-rw-r--r--llvm/lib/Target/X86/X86InstrFMA.td70
-rw-r--r--llvm/lib/Target/X86/X86InstrFMA3Info.cpp7
-rw-r--r--llvm/lib/Target/X86/X86InstrFMA3Info.h4
-rw-r--r--llvm/lib/Target/X86/X86InstrFPStack.td97
-rw-r--r--llvm/lib/Target/X86/X86InstrFoldTables.cpp206
-rw-r--r--llvm/lib/Target/X86/X86InstrFoldTables.h2
-rw-r--r--llvm/lib/Target/X86/X86InstrFormats.td47
-rw-r--r--llvm/lib/Target/X86/X86InstrFragmentsSIMD.td199
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.cpp1104
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.h72
-rw-r--r--llvm/lib/Target/X86/X86InstrInfo.td198
-rw-r--r--llvm/lib/Target/X86/X86InstrMMX.td76
-rw-r--r--llvm/lib/Target/X86/X86InstrSGX.td6
-rw-r--r--llvm/lib/Target/X86/X86InstrSSE.td444
-rw-r--r--llvm/lib/Target/X86/X86InstrShiftRotate.td104
-rw-r--r--llvm/lib/Target/X86/X86InstrSystem.td73
-rw-r--r--llvm/lib/Target/X86/X86InstrTSX.td8
-rw-r--r--llvm/lib/Target/X86/X86InstrVMX.td2
-rw-r--r--llvm/lib/Target/X86/X86InstrXOP.td22
-rw-r--r--llvm/lib/Target/X86/X86InstructionSelector.cpp24
-rw-r--r--llvm/lib/Target/X86/X86InterleavedAccess.cpp159
-rw-r--r--llvm/lib/Target/X86/X86IntrinsicsInfo.h26
-rw-r--r--llvm/lib/Target/X86/X86LegalizerInfo.cpp8
-rw-r--r--llvm/lib/Target/X86/X86LegalizerInfo.h4
-rw-r--r--llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp824
-rw-r--r--llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp143
-rw-r--r--llvm/lib/Target/X86/X86MCInstLower.cpp969
-rw-r--r--llvm/lib/Target/X86/X86MachineFunctionInfo.h52
-rw-r--r--llvm/lib/Target/X86/X86MacroFusion.cpp2
-rw-r--r--llvm/lib/Target/X86/X86MacroFusion.h4
-rw-r--r--llvm/lib/Target/X86/X86OptimizeLEAs.cpp2
-rw-r--r--llvm/lib/Target/X86/X86PadShortFunction.cpp1
-rw-r--r--llvm/lib/Target/X86/X86PartialReduction.cpp490
-rw-r--r--llvm/lib/Target/X86/X86PfmCounters.td10
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.cpp38
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.h6
-rw-r--r--llvm/lib/Target/X86/X86RegisterInfo.td17
-rw-r--r--llvm/lib/Target/X86/X86RetpolineThunks.cpp286
-rwxr-xr-xllvm/lib/Target/X86/X86SchedBroadwell.td57
-rw-r--r--llvm/lib/Target/X86/X86SchedHaswell.td87
-rw-r--r--llvm/lib/Target/X86/X86SchedSandyBridge.td21
-rw-r--r--llvm/lib/Target/X86/X86SchedSkylakeClient.td65
-rwxr-xr-xllvm/lib/Target/X86/X86SchedSkylakeServer.td336
-rw-r--r--llvm/lib/Target/X86/X86Schedule.td17
-rw-r--r--llvm/lib/Target/X86/X86ScheduleAtom.td7
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBdVer2.td43
-rw-r--r--llvm/lib/Target/X86/X86ScheduleBtVer2.td7
-rw-r--r--llvm/lib/Target/X86/X86ScheduleSLM.td82
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver1.td7
-rw-r--r--llvm/lib/Target/X86/X86ScheduleZnver2.td96
-rw-r--r--llvm/lib/Target/X86/X86SelectionDAGInfo.cpp51
-rw-r--r--llvm/lib/Target/X86/X86SelectionDAGInfo.h12
-rw-r--r--llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp19
-rw-r--r--llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h9
-rw-r--r--llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp181
-rw-r--r--llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp443
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.cpp17
-rw-r--r--llvm/lib/Target/X86/X86Subtarget.h76
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.cpp60
-rw-r--r--llvm/lib/Target/X86/X86TargetMachine.h6
-rw-r--r--llvm/lib/Target/X86/X86TargetObjectFile.cpp28
-rw-r--r--llvm/lib/Target/X86/X86TargetObjectFile.h24
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.cpp1766
-rw-r--r--llvm/lib/Target/X86/X86TargetTransformInfo.h119
-rw-r--r--llvm/lib/Target/X86/X86VZeroUpper.cpp8
-rw-r--r--llvm/lib/Target/X86/X86WinAllocaExpander.cpp1
-rw-r--r--llvm/lib/Target/X86/X86WinEHState.cpp99
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp2
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp2
-rw-r--r--llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h6
-rw-r--r--llvm/lib/Target/XCore/XCore.h1
-rw-r--r--llvm/lib/Target/XCore/XCoreAsmPrinter.cpp49
-rw-r--r--llvm/lib/Target/XCore/XCoreFrameLowering.cpp45
-rw-r--r--llvm/lib/Target/XCore/XCoreFrameLowering.h18
-rw-r--r--llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreISelLowering.cpp40
-rw-r--r--llvm/lib/Target/XCore/XCoreISelLowering.h8
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.cpp12
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.h4
-rw-r--r--llvm/lib/Target/XCore/XCoreInstrInfo.td13
-rw-r--r--llvm/lib/Target/XCore/XCoreMCInstLower.h3
-rw-r--r--llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp12
-rw-r--r--llvm/lib/Target/XCore/XCoreRegisterInfo.cpp5
-rw-r--r--llvm/lib/Target/XCore/XCoreRegisterInfo.h4
-rw-r--r--llvm/lib/Target/XCore/XCoreRegisterInfo.td10
-rw-r--r--llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp4
-rw-r--r--llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h4
-rw-r--r--llvm/lib/Target/XCore/XCoreTargetMachine.cpp2
-rw-r--r--llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp7
-rw-r--r--llvm/lib/Target/XCore/XCoreTargetObjectFile.h2
-rw-r--r--llvm/lib/TextAPI/MachO/Architecture.cpp25
-rw-r--r--llvm/lib/TextAPI/MachO/ArchitectureSet.cpp3
-rw-r--r--llvm/lib/TextAPI/MachO/InterfaceFile.cpp28
-rw-r--r--llvm/lib/TextAPI/MachO/TextAPIContext.h1
-rw-r--r--llvm/lib/TextAPI/MachO/TextStub.cpp31
-rw-r--r--llvm/lib/TextAPI/MachO/TextStubCommon.cpp14
-rw-r--r--llvm/lib/TextAPI/MachO/TextStubCommon.h1
-rw-r--r--llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp2
-rw-r--r--llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp32
-rw-r--r--llvm/lib/ToolDrivers/llvm-lib/Options.td3
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp10
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h15
-rw-r--r--llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp40
-rw-r--r--llvm/lib/Transforms/CFGuard/CFGuard.cpp13
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroCleanup.cpp28
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroEarly.cpp67
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroElide.cpp211
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroFrame.cpp515
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInstr.h17
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroInternal.h42
-rw-r--r--llvm/lib/Transforms/Coroutines/CoroSplit.cpp318
-rw-r--r--llvm/lib/Transforms/Coroutines/Coroutines.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/AlwaysInliner.cpp33
-rw-r--r--llvm/lib/Transforms/IPO/ArgumentPromotion.cpp136
-rw-r--r--llvm/lib/Transforms/IPO/Attributor.cpp6832
-rw-r--r--llvm/lib/Transforms/IPO/AttributorAttributes.cpp7225
-rw-r--r--llvm/lib/Transforms/IPO/BlockExtractor.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/CalledValuePropagation.cpp31
-rw-r--r--llvm/lib/Transforms/IPO/ConstantMerge.cpp12
-rw-r--r--llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp262
-rw-r--r--llvm/lib/Transforms/IPO/ExtractGV.cpp13
-rw-r--r--llvm/lib/Transforms/IPO/FunctionAttrs.cpp87
-rw-r--r--llvm/lib/Transforms/IPO/FunctionImport.cpp133
-rw-r--r--llvm/lib/Transforms/IPO/GlobalDCE.cpp9
-rw-r--r--llvm/lib/Transforms/IPO/GlobalOpt.cpp271
-rw-r--r--llvm/lib/Transforms/IPO/GlobalSplit.cpp3
-rw-r--r--llvm/lib/Transforms/IPO/HotColdSplitting.cpp12
-rw-r--r--llvm/lib/Transforms/IPO/IPConstantPropagation.cpp37
-rw-r--r--llvm/lib/Transforms/IPO/IPO.cpp2
-rw-r--r--llvm/lib/Transforms/IPO/InlineSimple.cpp15
-rw-r--r--llvm/lib/Transforms/IPO/Inliner.cpp617
-rw-r--r--llvm/lib/Transforms/IPO/LoopExtractor.cpp192
-rw-r--r--llvm/lib/Transforms/IPO/LowerTypeTests.cpp107
-rw-r--r--llvm/lib/Transforms/IPO/MergeFunctions.cpp7
-rw-r--r--llvm/lib/Transforms/IPO/OpenMPOpt.cpp1501
-rw-r--r--llvm/lib/Transforms/IPO/PartialInlining.cpp165
-rw-r--r--llvm/lib/Transforms/IPO/PassManagerBuilder.cpp166
-rw-r--r--llvm/lib/Transforms/IPO/PruneEH.cpp4
-rw-r--r--llvm/lib/Transforms/IPO/SampleProfile.cpp186
-rw-r--r--llvm/lib/Transforms/IPO/StripSymbols.cpp8
-rw-r--r--llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp8
-rw-r--r--llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp422
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp332
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp231
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp14
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp983
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp484
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp498
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineInternal.h137
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp268
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp155
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp474
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp36
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp414
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp43
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp312
-rw-r--r--llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp559
-rw-r--r--llvm/lib/Transforms/InstCombine/InstructionCombining.cpp708
-rw-r--r--llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp646
-rw-r--r--llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp23
-rw-r--r--llvm/lib/Transforms/Instrumentation/CFGMST.h16
-rw-r--r--llvm/lib/Transforms/Instrumentation/CGProfile.cpp120
-rw-r--r--llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp143
-rw-r--r--llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp307
-rw-r--r--llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp466
-rw-r--r--llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp207
-rw-r--r--llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp64
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp1
-rw-r--r--llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp102
-rw-r--r--llvm/lib/Transforms/Instrumentation/Instrumentation.cpp7
-rw-r--r--llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp865
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp111
-rw-r--r--llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp192
-rw-r--r--llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp73
-rw-r--r--llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp124
-rw-r--r--llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp82
-rw-r--r--llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp10
-rw-r--r--llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h3
-rw-r--r--llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc30
-rw-r--r--llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h6
-rw-r--r--llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp5
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARC.h12
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp15
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp40
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp2
-rw-r--r--llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp73
-rw-r--r--llvm/lib/Transforms/Scalar/ADCE.cpp22
-rw-r--r--llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp247
-rw-r--r--llvm/lib/Transforms/Scalar/BDCE.cpp26
-rw-r--r--llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp156
-rw-r--r--llvm/lib/Transforms/Scalar/ConstantHoisting.cpp20
-rw-r--r--llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp47
-rw-r--r--llvm/lib/Transforms/Scalar/DCE.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp1230
-rw-r--r--llvm/lib/Transforms/Scalar/DivRemPairs.cpp27
-rw-r--r--llvm/lib/Transforms/Scalar/EarlyCSE.cpp244
-rw-r--r--llvm/lib/Transforms/Scalar/Float2Int.cpp11
-rw-r--r--llvm/lib/Transforms/Scalar/GVN.cpp101
-rw-r--r--llvm/lib/Transforms/Scalar/GVNHoist.cpp14
-rw-r--r--llvm/lib/Transforms/Scalar/GVNSink.cpp15
-rw-r--r--llvm/lib/Transforms/Scalar/IndVarSimplify.cpp595
-rw-r--r--llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp120
-rw-r--r--llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp206
-rw-r--r--llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/JumpThreading.cpp451
-rw-r--r--llvm/lib/Transforms/Scalar/LICM.cpp273
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp209
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDeletion.cpp44
-rw-r--r--llvm/lib/Transforms/Scalar/LoopDistribute.cpp21
-rw-r--r--llvm/lib/Transforms/Scalar/LoopFuse.cpp109
-rw-r--r--llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp139
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/LoopInterchange.cpp65
-rw-r--r--llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp29
-rw-r--r--llvm/lib/Transforms/Scalar/LoopPassManager.cpp13
-rw-r--r--llvm/lib/Transforms/Scalar/LoopPredication.cpp57
-rw-r--r--llvm/lib/Transforms/Scalar/LoopRerollPass.cpp10
-rw-r--r--llvm/lib/Transforms/Scalar/LoopRotation.cpp13
-rw-r--r--llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp10
-rw-r--r--llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp206
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp96
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp181
-rw-r--r--llvm/lib/Transforms/Scalar/LoopUnswitch.cpp328
-rw-r--r--llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp1
-rw-r--r--llvm/lib/Transforms/Scalar/LowerAtomic.cpp13
-rw-r--r--llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp16
-rw-r--r--llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp77
-rw-r--r--llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp1531
-rw-r--r--llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp136
-rw-r--r--llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp10
-rw-r--r--llvm/lib/Transforms/Scalar/NaryReassociate.cpp2
-rw-r--r--llvm/lib/Transforms/Scalar/NewGVN.cpp18
-rw-r--r--llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp5
-rw-r--r--llvm/lib/Transforms/Scalar/Reassociate.cpp40
-rw-r--r--llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp193
-rw-r--r--llvm/lib/Transforms/Scalar/SCCP.cpp1367
-rw-r--r--llvm/lib/Transforms/Scalar/SROA.cpp562
-rw-r--r--llvm/lib/Transforms/Scalar/Scalarizer.cpp250
-rw-r--r--llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp62
-rw-r--r--llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp58
-rw-r--r--llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp23
-rw-r--r--llvm/lib/Transforms/Scalar/Sink.cpp4
-rw-r--r--llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp18
-rw-r--r--llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp40
-rw-r--r--llvm/lib/Transforms/Scalar/StructurizeCFG.cpp246
-rw-r--r--llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp585
-rw-r--r--llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp246
-rw-r--r--llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp618
-rw-r--r--llvm/lib/Transforms/Utils/BasicBlockUtils.cpp295
-rw-r--r--llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp90
-rw-r--r--llvm/lib/Transforms/Utils/BuildLibCalls.cpp201
-rw-r--r--llvm/lib/Transforms/Utils/BypassSlowDivision.cpp12
-rw-r--r--llvm/lib/Transforms/Utils/CallGraphUpdater.cpp167
-rw-r--r--llvm/lib/Transforms/Utils/CallPromotionUtils.cpp231
-rw-r--r--llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp250
-rw-r--r--llvm/lib/Transforms/Utils/CloneFunction.cpp29
-rw-r--r--llvm/lib/Transforms/Utils/CodeExtractor.cpp227
-rw-r--r--llvm/lib/Transforms/Utils/CodeMoverUtils.cpp281
-rw-r--r--llvm/lib/Transforms/Utils/Debugify.cpp135
-rw-r--r--llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp1
-rw-r--r--llvm/lib/Transforms/Utils/EscapeEnumerator.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/Evaluator.cpp49
-rw-r--r--llvm/lib/Transforms/Utils/FixIrreducible.cpp337
-rw-r--r--llvm/lib/Transforms/Utils/FlattenCFG.cpp128
-rw-r--r--llvm/lib/Transforms/Utils/FunctionComparator.cpp107
-rw-r--r--llvm/lib/Transforms/Utils/FunctionImportUtils.cpp25
-rw-r--r--llvm/lib/Transforms/Utils/GlobalStatus.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/InjectTLIMappings.cpp52
-rw-r--r--llvm/lib/Transforms/Utils/InlineFunction.cpp432
-rw-r--r--llvm/lib/Transforms/Utils/InstructionNamer.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LCSSA.cpp14
-rw-r--r--llvm/lib/Transforms/Utils/Local.cpp307
-rw-r--r--llvm/lib/Transforms/Utils/LoopRotationUtils.cpp649
-rw-r--r--llvm/lib/Transforms/Utils/LoopSimplify.cpp28
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnroll.cpp312
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp533
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp47
-rw-r--r--llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp21
-rw-r--r--llvm/lib/Transforms/Utils/LoopUtils.cpp773
-rw-r--r--llvm/lib/Transforms/Utils/LoopVersioning.cpp15
-rw-r--r--llvm/lib/Transforms/Utils/LowerInvoke.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp166
-rw-r--r--llvm/lib/Transforms/Utils/LowerSwitch.cpp7
-rw-r--r--llvm/lib/Transforms/Utils/ModuleUtils.cpp25
-rw-r--r--llvm/lib/Transforms/Utils/NameAnonGlobals.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/PredicateInfo.cpp171
-rw-r--r--llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp5
-rw-r--r--llvm/lib/Transforms/Utils/SSAUpdater.cpp7
-rw-r--r--llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp (renamed from llvm/lib/Analysis/ScalarEvolutionExpander.cpp)241
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyCFG.cpp361
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyIndVar.cpp34
-rw-r--r--llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp422
-rw-r--r--llvm/lib/Transforms/Utils/SizeOpts.cpp33
-rw-r--r--llvm/lib/Transforms/Utils/StripGCRelocates.cpp2
-rw-r--r--llvm/lib/Transforms/Utils/SymbolRewriter.cpp30
-rw-r--r--llvm/lib/Transforms/Utils/UnifyLoopExits.cpp220
-rw-r--r--llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp97
-rw-r--r--llvm/lib/Transforms/Utils/Utils.cpp6
-rw-r--r--llvm/lib/Transforms/Utils/VNCoercion.cpp205
-rw-r--r--llvm/lib/Transforms/Utils/ValueMapper.cpp16
-rw-r--r--llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp144
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp74
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h23
-rw-r--r--llvm/lib/Transforms/Vectorize/LoopVectorize.cpp1305
-rw-r--r--llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp730
-rw-r--r--llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h95
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.cpp301
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlan.h372
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h3
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp31
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanTransforms.h6
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanValue.h55
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp1
-rw-r--r--llvm/lib/Transforms/Vectorize/VPlanVerifier.h8
-rw-r--r--llvm/lib/Transforms/Vectorize/VectorCombine.cpp699
-rw-r--r--llvm/lib/Transforms/Vectorize/Vectorize.cpp4
-rw-r--r--llvm/lib/XRay/FDRTraceExpander.cpp6
-rw-r--r--llvm/lib/XRay/FDRTraceWriter.cpp14
-rw-r--r--llvm/lib/XRay/InstrumentationMap.cpp76
-rw-r--r--llvm/lib/XRay/Trace.cpp1
2125 files changed, 259580 insertions, 99904 deletions
diff --git a/llvm/lib/Analysis/AliasAnalysis.cpp b/llvm/lib/Analysis/AliasAnalysis.cpp
index 1c7678a602d8..fec2415a0e45 100644
--- a/llvm/lib/Analysis/AliasAnalysis.cpp
+++ b/llvm/lib/Analysis/AliasAnalysis.cpp
@@ -58,7 +58,7 @@ using namespace llvm;
/// Allow disabling BasicAA from the AA results. This is particularly useful
/// when testing to isolate a single AA implementation.
-static cl::opt<bool> DisableBasicAA("disable-basicaa", cl::Hidden,
+static cl::opt<bool> DisableBasicAA("disable-basic-aa", cl::Hidden,
cl::init(false));
AAResults::AAResults(AAResults &&Arg)
@@ -196,8 +196,7 @@ ModRefInfo AAResults::getModRefInfo(const CallBase *Call,
// Try to refine the mod-ref info further using other API entry points to the
// aggregate set of AA results.
auto MRB = getModRefBehavior(Call);
- if (MRB == FMRB_DoesNotAccessMemory ||
- MRB == FMRB_OnlyAccessesInaccessibleMem)
+ if (onlyAccessesInaccessibleMem(MRB))
return ModRefInfo::NoModRef;
if (onlyReadsMemory(MRB))
@@ -631,16 +630,14 @@ ModRefInfo AAResults::getModRefInfo(const AtomicRMWInst *RMW,
/// Return information about whether a particular call site modifies
/// or reads the specified memory location \p MemLoc before instruction \p I
-/// in a BasicBlock. An ordered basic block \p OBB can be used to speed up
-/// instruction-ordering queries inside the BasicBlock containing \p I.
+/// in a BasicBlock.
/// FIXME: this is really just shoring-up a deficiency in alias analysis.
/// BasicAA isn't willing to spend linear time determining whether an alloca
/// was captured before or after this particular call, while we are. However,
/// with a smarter AA in place, this test is just wasting compile time.
ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
const MemoryLocation &MemLoc,
- DominatorTree *DT,
- OrderedBasicBlock *OBB) {
+ DominatorTree *DT) {
if (!DT)
return ModRefInfo::ModRef;
@@ -656,8 +653,7 @@ ModRefInfo AAResults::callCapturesBefore(const Instruction *I,
if (PointerMayBeCapturedBefore(Object, /* ReturnCaptures */ true,
/* StoreCaptures */ true, I, DT,
- /* include Object */ true,
- /* OrderedBasicBlock */ OBB))
+ /* include Object */ true))
return ModRefInfo::ModRef;
unsigned ArgNo = 0;
diff --git a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
index 2e44bbd3a8ca..b1433c579af8 100644
--- a/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
+++ b/llvm/lib/Analysis/AliasAnalysisEvaluator.cpp
@@ -114,7 +114,7 @@ void AAEvaluator::runInternal(Function &F, AAResults &AA) {
Stores.insert(&*I);
Instruction &Inst = *I;
if (auto *Call = dyn_cast<CallBase>(&Inst)) {
- Value *Callee = Call->getCalledValue();
+ Value *Callee = Call->getCalledOperand();
// Skip actual functions for direct function calls.
if (!isa<Function>(Callee) && isInterestingPointer(Callee))
Pointers.insert(Callee);
diff --git a/llvm/lib/Analysis/AliasAnalysisSummary.cpp b/llvm/lib/Analysis/AliasAnalysisSummary.cpp
index 2f3396a44117..d9c5732da1f3 100644
--- a/llvm/lib/Analysis/AliasAnalysisSummary.cpp
+++ b/llvm/lib/Analysis/AliasAnalysisSummary.cpp
@@ -1,5 +1,6 @@
#include "AliasAnalysisSummary.h"
#include "llvm/IR/Argument.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Compiler.h"
diff --git a/llvm/lib/Analysis/AliasAnalysisSummary.h b/llvm/lib/Analysis/AliasAnalysisSummary.h
index fe75b03cedef..10d49f9c0113 100644
--- a/llvm/lib/Analysis/AliasAnalysisSummary.h
+++ b/llvm/lib/Analysis/AliasAnalysisSummary.h
@@ -37,10 +37,13 @@
#include "llvm/ADT/DenseMapInfo.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/InstrTypes.h"
#include <bitset>
namespace llvm {
+
+class CallBase;
+class Value;
+
namespace cflaa {
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/AliasSetTracker.cpp b/llvm/lib/Analysis/AliasSetTracker.cpp
index 5cc5ab597ef9..5cc68f05dc0e 100644
--- a/llvm/lib/Analysis/AliasSetTracker.cpp
+++ b/llvm/lib/Analysis/AliasSetTracker.cpp
@@ -677,7 +677,7 @@ void AliasSet::print(raw_ostream &OS) const {
I.getPointer()->printAsOperand(OS << "(");
if (I.getSize() == LocationSize::unknown())
OS << ", unknown)";
- else
+ else
OS << ", " << I.getSize() << ")";
}
}
diff --git a/llvm/lib/Analysis/AssumeBundleQueries.cpp b/llvm/lib/Analysis/AssumeBundleQueries.cpp
new file mode 100644
index 000000000000..05fe05a0bd85
--- /dev/null
+++ b/llvm/lib/Analysis/AssumeBundleQueries.cpp
@@ -0,0 +1,213 @@
+//===- AssumeBundleQueries.cpp - tool to query assume bundles ---*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "assume-queries"
+
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/DebugCounter.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+STATISTIC(NumAssumeQueries, "Number of queries into assume bundles");
+STATISTIC(
+ NumUsefullAssumeQueries,
+ "Number of queries into assume bundles that were satisfied");
+
+DEBUG_COUNTER(AssumeQueryCounter, "assume-queries-counter",
+ "Controls which assumes gets created");
+
+static bool bundleHasArgument(const CallBase::BundleOpInfo &BOI, unsigned Idx) {
+ return BOI.End - BOI.Begin > Idx;
+}
+
+static Value *getValueFromBundleOpInfo(CallInst &Assume,
+ const CallBase::BundleOpInfo &BOI,
+ unsigned Idx) {
+ assert(bundleHasArgument(BOI, Idx) && "index out of range");
+ return (Assume.op_begin() + BOI.Begin + Idx)->get();
+}
+
+bool llvm::hasAttributeInAssume(CallInst &AssumeCI, Value *IsOn,
+ StringRef AttrName, uint64_t *ArgVal) {
+ assert(isa<IntrinsicInst>(AssumeCI) &&
+ "this function is intended to be used on llvm.assume");
+ IntrinsicInst &Assume = cast<IntrinsicInst>(AssumeCI);
+ assert(Assume.getIntrinsicID() == Intrinsic::assume &&
+ "this function is intended to be used on llvm.assume");
+ assert(Attribute::isExistingAttribute(AttrName) &&
+ "this attribute doesn't exist");
+ assert((ArgVal == nullptr || Attribute::doesAttrKindHaveArgument(
+ Attribute::getAttrKindFromName(AttrName))) &&
+ "requested value for an attribute that has no argument");
+ if (Assume.bundle_op_infos().empty())
+ return false;
+
+ for (auto &BOI : Assume.bundle_op_infos()) {
+ if (BOI.Tag->getKey() != AttrName)
+ continue;
+ if (IsOn && (BOI.End - BOI.Begin <= ABA_WasOn ||
+ IsOn != getValueFromBundleOpInfo(Assume, BOI, ABA_WasOn)))
+ continue;
+ if (ArgVal) {
+ assert(BOI.End - BOI.Begin > ABA_Argument);
+ *ArgVal =
+ cast<ConstantInt>(getValueFromBundleOpInfo(Assume, BOI, ABA_Argument))
+ ->getZExtValue();
+ }
+ return true;
+ }
+ return false;
+}
+
+void llvm::fillMapFromAssume(CallInst &AssumeCI, RetainedKnowledgeMap &Result) {
+ IntrinsicInst &Assume = cast<IntrinsicInst>(AssumeCI);
+ assert(Assume.getIntrinsicID() == Intrinsic::assume &&
+ "this function is intended to be used on llvm.assume");
+ for (auto &Bundles : Assume.bundle_op_infos()) {
+ std::pair<Value *, Attribute::AttrKind> Key{
+ nullptr, Attribute::getAttrKindFromName(Bundles.Tag->getKey())};
+ if (bundleHasArgument(Bundles, ABA_WasOn))
+ Key.first = getValueFromBundleOpInfo(Assume, Bundles, ABA_WasOn);
+
+ if (Key.first == nullptr && Key.second == Attribute::None)
+ continue;
+ if (!bundleHasArgument(Bundles, ABA_Argument)) {
+ Result[Key][&Assume] = {0, 0};
+ continue;
+ }
+ unsigned Val = cast<ConstantInt>(
+ getValueFromBundleOpInfo(Assume, Bundles, ABA_Argument))
+ ->getZExtValue();
+ auto Lookup = Result.find(Key);
+ if (Lookup == Result.end() || !Lookup->second.count(&Assume)) {
+ Result[Key][&Assume] = {Val, Val};
+ continue;
+ }
+ Lookup->second[&Assume].Min = std::min(Val, Lookup->second[&Assume].Min);
+ Lookup->second[&Assume].Max = std::max(Val, Lookup->second[&Assume].Max);
+ }
+}
+
+RetainedKnowledge
+llvm::getKnowledgeFromBundle(CallInst &Assume,
+ const CallBase::BundleOpInfo &BOI) {
+ RetainedKnowledge Result;
+ Result.AttrKind = Attribute::getAttrKindFromName(BOI.Tag->getKey());
+ if (bundleHasArgument(BOI, ABA_WasOn))
+ Result.WasOn = getValueFromBundleOpInfo(Assume, BOI, ABA_WasOn);
+ auto GetArgOr1 = [&](unsigned Idx) -> unsigned {
+ if (auto *ConstInt = dyn_cast<ConstantInt>(
+ getValueFromBundleOpInfo(Assume, BOI, ABA_Argument + Idx)))
+ return ConstInt->getZExtValue();
+ return 1;
+ };
+ if (BOI.End - BOI.Begin > ABA_Argument)
+ Result.ArgValue = GetArgOr1(0);
+ if (Result.AttrKind == Attribute::Alignment)
+ if (BOI.End - BOI.Begin > ABA_Argument + 1)
+ Result.ArgValue = MinAlign(Result.ArgValue, GetArgOr1(1));
+ return Result;
+}
+
+RetainedKnowledge llvm::getKnowledgeFromOperandInAssume(CallInst &AssumeCI,
+ unsigned Idx) {
+ IntrinsicInst &Assume = cast<IntrinsicInst>(AssumeCI);
+ assert(Assume.getIntrinsicID() == Intrinsic::assume &&
+ "this function is intended to be used on llvm.assume");
+ CallBase::BundleOpInfo BOI = Assume.getBundleOpInfoForOperand(Idx);
+ return getKnowledgeFromBundle(AssumeCI, BOI);
+}
+
+bool llvm::isAssumeWithEmptyBundle(CallInst &CI) {
+ IntrinsicInst &Assume = cast<IntrinsicInst>(CI);
+ assert(Assume.getIntrinsicID() == Intrinsic::assume &&
+ "this function is intended to be used on llvm.assume");
+ return none_of(Assume.bundle_op_infos(),
+ [](const CallBase::BundleOpInfo &BOI) {
+ return BOI.Tag->getKey() != IgnoreBundleTag;
+ });
+}
+
+static CallInst::BundleOpInfo *getBundleFromUse(const Use *U) {
+ auto *Intr = dyn_cast<IntrinsicInst>(U->getUser());
+ if (!match(U->getUser(),
+ m_Intrinsic<Intrinsic::assume>(m_Unless(m_Specific(U->get())))))
+ return nullptr;
+ return &Intr->getBundleOpInfoForOperand(U->getOperandNo());
+}
+
+RetainedKnowledge
+llvm::getKnowledgeFromUse(const Use *U,
+ ArrayRef<Attribute::AttrKind> AttrKinds) {
+ CallInst::BundleOpInfo* Bundle = getBundleFromUse(U);
+ if (!Bundle)
+ return RetainedKnowledge::none();
+ RetainedKnowledge RK =
+ getKnowledgeFromBundle(*cast<CallInst>(U->getUser()), *Bundle);
+ for (auto Attr : AttrKinds)
+ if (Attr == RK.AttrKind)
+ return RK;
+ return RetainedKnowledge::none();
+}
+
+RetainedKnowledge
+llvm::getKnowledgeForValue(const Value *V,
+ ArrayRef<Attribute::AttrKind> AttrKinds,
+ AssumptionCache *AC,
+ function_ref<bool(RetainedKnowledge, Instruction *,
+ const CallBase::BundleOpInfo *)>
+ Filter) {
+ NumAssumeQueries++;
+ if (!DebugCounter::shouldExecute(AssumeQueryCounter))
+ return RetainedKnowledge::none();
+ if (AC) {
+ for (AssumptionCache::ResultElem &Elem : AC->assumptionsFor(V)) {
+ IntrinsicInst *II = cast_or_null<IntrinsicInst>(Elem.Assume);
+ if (!II || Elem.Index == AssumptionCache::ExprResultIdx)
+ continue;
+ if (RetainedKnowledge RK = getKnowledgeFromBundle(
+ *II, II->bundle_op_info_begin()[Elem.Index]))
+ if (is_contained(AttrKinds, RK.AttrKind) &&
+ Filter(RK, II, &II->bundle_op_info_begin()[Elem.Index])) {
+ NumUsefullAssumeQueries++;
+ return RK;
+ }
+ }
+ return RetainedKnowledge::none();
+ }
+ for (const auto &U : V->uses()) {
+ CallInst::BundleOpInfo* Bundle = getBundleFromUse(&U);
+ if (!Bundle)
+ continue;
+ if (RetainedKnowledge RK =
+ getKnowledgeFromBundle(*cast<CallInst>(U.getUser()), *Bundle))
+ if (is_contained(AttrKinds, RK.AttrKind) &&
+ Filter(RK, cast<Instruction>(U.getUser()), Bundle)) {
+ NumUsefullAssumeQueries++;
+ return RK;
+ }
+ }
+ return RetainedKnowledge::none();
+}
+
+RetainedKnowledge llvm::getKnowledgeValidInContext(
+ const Value *V, ArrayRef<Attribute::AttrKind> AttrKinds,
+ const Instruction *CtxI, const DominatorTree *DT, AssumptionCache *AC) {
+ return getKnowledgeForValue(V, AttrKinds, AC,
+ [&](auto, Instruction *I, auto) {
+ return isValidAssumeForContext(I, CtxI, DT);
+ });
+}
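An illustrative sketch of how the retained-knowledge API introduced above might be consumed (not part of the patch; the wrapper getAssumedAlignment and its parameters are hypothetical, but the calls mirror getKnowledgeValidInContext and the RetainedKnowledge fields defined in this file):

#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical helper: return the alignment an llvm.assume operand bundle
// retains for V at the context instruction CtxI, or 0 if none is known.
static uint64_t getAssumedAlignment(const Value *V, const Instruction *CtxI,
                                    const DominatorTree *DT,
                                    AssumptionCache *AC) {
  RetainedKnowledge RK =
      getKnowledgeValidInContext(V, {Attribute::Alignment}, CtxI, DT, AC);
  if (!RK) // RetainedKnowledge tests false when no matching bundle is found.
    return 0;
  // RK.WasOn is the value the bundle was attached to; RK.ArgValue carries the
  // bundle argument, here the alignment in bytes.
  return RK.ArgValue;
}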
diff --git a/llvm/lib/Analysis/AssumptionCache.cpp b/llvm/lib/Analysis/AssumptionCache.cpp
index f4d4a5ac8f88..16bfd5c75902 100644
--- a/llvm/lib/Analysis/AssumptionCache.cpp
+++ b/llvm/lib/Analysis/AssumptionCache.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -41,7 +42,7 @@ static cl::opt<bool>
cl::desc("Enable verification of assumption cache"),
cl::init(false));
-SmallVector<WeakTrackingVH, 1> &
+SmallVector<AssumptionCache::ResultElem, 1> &
AssumptionCache::getOrInsertAffectedValues(Value *V) {
// Try using find_as first to avoid creating extra value handles just for the
// purpose of doing the lookup.
@@ -50,32 +51,39 @@ AssumptionCache::getOrInsertAffectedValues(Value *V) {
return AVI->second;
auto AVIP = AffectedValues.insert(
- {AffectedValueCallbackVH(V, this), SmallVector<WeakTrackingVH, 1>()});
+ {AffectedValueCallbackVH(V, this), SmallVector<ResultElem, 1>()});
return AVIP.first->second;
}
-static void findAffectedValues(CallInst *CI,
- SmallVectorImpl<Value *> &Affected) {
+static void
+findAffectedValues(CallInst *CI,
+ SmallVectorImpl<AssumptionCache::ResultElem> &Affected) {
// Note: This code must be kept in-sync with the code in
// computeKnownBitsFromAssume in ValueTracking.
- auto AddAffected = [&Affected](Value *V) {
+ auto AddAffected = [&Affected](Value *V, unsigned Idx =
+ AssumptionCache::ExprResultIdx) {
if (isa<Argument>(V)) {
- Affected.push_back(V);
+ Affected.push_back({V, Idx});
} else if (auto *I = dyn_cast<Instruction>(V)) {
- Affected.push_back(I);
+ Affected.push_back({I, Idx});
// Peek through unary operators to find the source of the condition.
Value *Op;
if (match(I, m_BitCast(m_Value(Op))) ||
- match(I, m_PtrToInt(m_Value(Op))) ||
- match(I, m_Not(m_Value(Op)))) {
+ match(I, m_PtrToInt(m_Value(Op))) || match(I, m_Not(m_Value(Op)))) {
if (isa<Instruction>(Op) || isa<Argument>(Op))
- Affected.push_back(Op);
+ Affected.push_back({Op, Idx});
}
}
};
+ for (unsigned Idx = 0; Idx != CI->getNumOperandBundles(); Idx++) {
+ if (CI->getOperandBundleAt(Idx).Inputs.size() > ABA_WasOn &&
+ CI->getOperandBundleAt(Idx).getTagName() != IgnoreBundleTag)
+ AddAffected(CI->getOperandBundleAt(Idx).Inputs[ABA_WasOn], Idx);
+ }
+
Value *Cond = CI->getArgOperand(0), *A, *B;
AddAffected(Cond);
@@ -112,28 +120,44 @@ static void findAffectedValues(CallInst *CI,
}
void AssumptionCache::updateAffectedValues(CallInst *CI) {
- SmallVector<Value *, 16> Affected;
+ SmallVector<AssumptionCache::ResultElem, 16> Affected;
findAffectedValues(CI, Affected);
for (auto &AV : Affected) {
- auto &AVV = getOrInsertAffectedValues(AV);
- if (std::find(AVV.begin(), AVV.end(), CI) == AVV.end())
- AVV.push_back(CI);
+ auto &AVV = getOrInsertAffectedValues(AV.Assume);
+ if (std::find_if(AVV.begin(), AVV.end(), [&](ResultElem &Elem) {
+ return Elem.Assume == CI && Elem.Index == AV.Index;
+ }) == AVV.end())
+ AVV.push_back({CI, AV.Index});
}
}
void AssumptionCache::unregisterAssumption(CallInst *CI) {
- SmallVector<Value *, 16> Affected;
+ SmallVector<AssumptionCache::ResultElem, 16> Affected;
findAffectedValues(CI, Affected);
for (auto &AV : Affected) {
- auto AVI = AffectedValues.find_as(AV);
- if (AVI != AffectedValues.end())
+ auto AVI = AffectedValues.find_as(AV.Assume);
+ if (AVI == AffectedValues.end())
+ continue;
+ bool Found = false;
+ bool HasNonnull = false;
+ for (ResultElem &Elem : AVI->second) {
+ if (Elem.Assume == CI) {
+ Found = true;
+ Elem.Assume = nullptr;
+ }
+ HasNonnull |= !!Elem.Assume;
+ if (HasNonnull && Found)
+ break;
+ }
+ assert(Found && "already unregistered or incorrect cache state");
+ if (!HasNonnull)
AffectedValues.erase(AVI);
}
AssumeHandles.erase(
- remove_if(AssumeHandles, [CI](WeakTrackingVH &VH) { return CI == VH; }),
+ remove_if(AssumeHandles, [CI](ResultElem &RE) { return CI == RE; }),
AssumeHandles.end());
}
@@ -177,7 +201,7 @@ void AssumptionCache::scanFunction() {
for (BasicBlock &B : F)
for (Instruction &II : B)
if (match(&II, m_Intrinsic<Intrinsic::assume>()))
- AssumeHandles.push_back(&II);
+ AssumeHandles.push_back({&II, ExprResultIdx});
// Mark the scan as complete.
Scanned = true;
@@ -196,7 +220,7 @@ void AssumptionCache::registerAssumption(CallInst *CI) {
if (!Scanned)
return;
- AssumeHandles.push_back(CI);
+ AssumeHandles.push_back({CI, ExprResultIdx});
#ifndef NDEBUG
assert(CI->getParent() &&
diff --git a/llvm/lib/Analysis/BasicAliasAnalysis.cpp b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
index e852d663c6b4..74664098ce1d 100644
--- a/llvm/lib/Analysis/BasicAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/BasicAliasAnalysis.cpp
@@ -65,16 +65,16 @@
using namespace llvm;
/// Enable analysis of recursive PHI nodes.
-static cl::opt<bool> EnableRecPhiAnalysis("basicaa-recphi", cl::Hidden,
+static cl::opt<bool> EnableRecPhiAnalysis("basic-aa-recphi", cl::Hidden,
cl::init(false));
/// By default, even on 32-bit architectures we use 64-bit integers for
/// calculations. This will allow us to more-aggressively decompose indexing
/// expressions calculated using i64 values (e.g., long long in C) which is
/// common enough to worry about.
-static cl::opt<bool> ForceAtLeast64Bits("basicaa-force-at-least-64b",
+static cl::opt<bool> ForceAtLeast64Bits("basic-aa-force-at-least-64b",
cl::Hidden, cl::init(true));
-static cl::opt<bool> DoubleCalcBits("basicaa-double-calc-bits",
+static cl::opt<bool> DoubleCalcBits("basic-aa-double-calc-bits",
cl::Hidden, cl::init(false));
/// SearchLimitReached / SearchTimes shows how often the limit of
@@ -433,7 +433,7 @@ static bool isObjectSize(const Value *V, uint64_t Size, const DataLayout &DL,
/// an issue, for example, in particular for 32b pointers with negative indices
/// that rely on two's complement wrap-arounds for precise alias information
/// where the maximum pointer size is 64b.
-static APInt adjustToPointerSize(APInt Offset, unsigned PointerSize) {
+static APInt adjustToPointerSize(const APInt &Offset, unsigned PointerSize) {
assert(PointerSize <= Offset.getBitWidth() && "Invalid PointerSize!");
unsigned ShiftBits = Offset.getBitWidth() - PointerSize;
return (Offset << ShiftBits).ashr(ShiftBits);
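A worked example of the wrap-around behaviour described in the comment above (standalone sketch, not part of the patch; plain integers stand in for APInt). adjustToPointerSize shifts the low PointerSize bits to the top and arithmetic-shifts them back, sign-extending the offset:

#include <cstdint>
#include <cstdio>

int main() {
  // A 32-bit "negative" index (-1) stored in a 64-bit offset.
  uint64_t Offset = 0x00000000FFFFFFFFULL;
  unsigned BitWidth = 64, PointerSize = 32;
  unsigned ShiftBits = BitWidth - PointerSize;
  // Same arithmetic as (Offset << ShiftBits).ashr(ShiftBits).
  int64_t Adjusted = (int64_t)(Offset << ShiftBits) >> ShiftBits;
  std::printf("%lld\n", (long long)Adjusted); // prints -1
  return 0;
}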
@@ -492,7 +492,13 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
const GEPOperator *GEPOp = dyn_cast<GEPOperator>(Op);
if (!GEPOp) {
- if (const auto *Call = dyn_cast<CallBase>(V)) {
+ if (const auto *PHI = dyn_cast<PHINode>(V)) {
+ // Look through single-arg phi nodes created by LCSSA.
+ if (PHI->getNumIncomingValues() == 1) {
+ V = PHI->getIncomingValue(0);
+ continue;
+ }
+ } else if (const auto *Call = dyn_cast<CallBase>(V)) {
// CaptureTracking can know about special capturing properties of some
// intrinsics like launder.invariant.group, that can't be expressed with
// the attributes, but have properties like returning aliasing pointer.
@@ -508,19 +514,6 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
}
}
- // If it's not a GEP, hand it off to SimplifyInstruction to see if it
- // can come up with something. This matches what GetUnderlyingObject does.
- if (const Instruction *I = dyn_cast<Instruction>(V))
- // TODO: Get a DominatorTree and AssumptionCache and use them here
- // (these are both now available in this function, but this should be
- // updated when GetUnderlyingObject is updated). TLI should be
- // provided also.
- if (const Value *Simplified =
- SimplifyInstruction(const_cast<Instruction *>(I), DL)) {
- V = Simplified;
- continue;
- }
-
Decomposed.Base = V;
return false;
}
@@ -531,6 +524,14 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
return false;
}
+ // Don't attempt to analyze GEPs if the index scale is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(GEPOp->getSourceElementType())) {
+ Decomposed.Base = V;
+ Decomposed.HasCompileTimeConstantScale = false;
+ return false;
+ }
+
unsigned AS = GEPOp->getPointerAddressSpace();
// Walk the indices of the GEP, accumulating them into BaseOff/VarIndices.
gep_type_iterator GTI = gep_type_begin(GEPOp);
@@ -557,15 +558,16 @@ bool BasicAAResult::DecomposeGEPExpression(const Value *V,
if (CIdx->isZero())
continue;
Decomposed.OtherOffset +=
- (DL.getTypeAllocSize(GTI.getIndexedType()) *
- CIdx->getValue().sextOrSelf(MaxPointerSize))
- .sextOrTrunc(MaxPointerSize);
+ (DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize() *
+ CIdx->getValue().sextOrSelf(MaxPointerSize))
+ .sextOrTrunc(MaxPointerSize);
continue;
}
GepHasConstantOffset = false;
- APInt Scale(MaxPointerSize, DL.getTypeAllocSize(GTI.getIndexedType()));
+ APInt Scale(MaxPointerSize,
+ DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
unsigned ZExtBits = 0, SExtBits = 0;
// If the integer type is smaller than the pointer size, it is implicitly
@@ -723,7 +725,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const CallBase *Call) {
if (Call->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
else if (Call->doesNotReadMemory())
- Min = FMRB_DoesNotReadMemory;
+ Min = FMRB_OnlyWritesMemory;
if (Call->onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
@@ -756,7 +758,7 @@ FunctionModRefBehavior BasicAAResult::getModRefBehavior(const Function *F) {
if (F->onlyReadsMemory())
Min = FMRB_OnlyReadsMemory;
else if (F->doesNotReadMemory())
- Min = FMRB_DoesNotReadMemory;
+ Min = FMRB_OnlyWritesMemory;
if (F->onlyAccessesArgMemory())
Min = FunctionModRefBehavior(Min & FMRB_OnlyAccessesArgumentPointees);
@@ -960,7 +962,7 @@ ModRefInfo BasicAAResult::getModRefInfo(const CallBase *Call,
}
}
- // If the call is to malloc or calloc, we can assume that it doesn't
+ // If the call is malloc/calloc like, we can assume that it doesn't
// modify any IR visible value. This is only valid because we assume these
// routines do not read values visible in the IR. TODO: Consider special
// casing realloc and strdup routines which access only their arguments as
@@ -1145,11 +1147,11 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
GEP1->getSourceElementType(), IntermediateIndices);
StructType *LastIndexedStruct = dyn_cast<StructType>(Ty);
- if (isa<SequentialType>(Ty)) {
+ if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
// We know that:
// - both GEPs begin indexing from the exact same pointer;
// - the last indices in both GEPs are constants, indexing into a sequential
- // type (array or pointer);
+ // type (array or vector);
// - both GEPs only index through arrays prior to that.
//
// Because array indices greater than the number of elements are valid in
@@ -1157,8 +1159,9 @@ static AliasResult aliasSameBasePointerGEPs(const GEPOperator *GEP1,
// GEP1 and GEP2 we cannot guarantee that the last indexed arrays don't
// partially overlap. We also need to check that the loaded size matches
// the element size, otherwise we could still have overlap.
+ Type *LastElementTy = GetElementPtrInst::getTypeAtIndex(Ty, (uint64_t)0);
const uint64_t ElementSize =
- DL.getTypeStoreSize(cast<SequentialType>(Ty)->getElementType());
+ DL.getTypeStoreSize(LastElementTy).getFixedSize();
if (V1Size != ElementSize || V2Size != ElementSize)
return MayAlias;
@@ -1316,12 +1319,20 @@ AliasResult BasicAAResult::aliasGEP(
unsigned MaxPointerSize = getMaxPointerSize(DL);
DecompGEP1.StructOffset = DecompGEP1.OtherOffset = APInt(MaxPointerSize, 0);
DecompGEP2.StructOffset = DecompGEP2.OtherOffset = APInt(MaxPointerSize, 0);
+ DecompGEP1.HasCompileTimeConstantScale =
+ DecompGEP2.HasCompileTimeConstantScale = true;
bool GEP1MaxLookupReached =
DecomposeGEPExpression(GEP1, DecompGEP1, DL, &AC, DT);
bool GEP2MaxLookupReached =
DecomposeGEPExpression(V2, DecompGEP2, DL, &AC, DT);
+ // Don't attempt to analyze the decomposed GEP if the index scale is not a
+ // compile-time constant.
+ if (!DecompGEP1.HasCompileTimeConstantScale ||
+ !DecompGEP2.HasCompileTimeConstantScale)
+ return MayAlias;
+
APInt GEP1BaseOffset = DecompGEP1.StructOffset + DecompGEP1.OtherOffset;
APInt GEP2BaseOffset = DecompGEP2.StructOffset + DecompGEP2.OtherOffset;
@@ -1713,6 +1724,10 @@ AliasResult BasicAAResult::aliasPHI(const PHINode *PN, LocationSize PNSize,
// Other results are not possible.
if (Alias == MayAlias)
return MayAlias;
+ // With recursive phis we cannot guarantee that MustAlias/PartialAlias will
+ // remain valid for all elements, so we conservatively return MayAlias.
+ if (isRecursive && Alias != NoAlias)
+ return MayAlias;
// If all sources of the PHI node NoAlias or MustAlias V2, then returns
// NoAlias / MustAlias. Otherwise, returns MayAlias.
@@ -1978,7 +1993,7 @@ void BasicAAResult::GetIndexDifference(
bool BasicAAResult::constantOffsetHeuristic(
const SmallVectorImpl<VariableGEPIndex> &VarIndices,
- LocationSize MaybeV1Size, LocationSize MaybeV2Size, APInt BaseOffset,
+ LocationSize MaybeV1Size, LocationSize MaybeV2Size, const APInt &BaseOffset,
AssumptionCache *AC, DominatorTree *DT) {
if (VarIndices.size() != 2 || MaybeV1Size == LocationSize::unknown() ||
MaybeV2Size == LocationSize::unknown())
@@ -2058,13 +2073,14 @@ char BasicAAWrapperPass::ID = 0;
void BasicAAWrapperPass::anchor() {}
-INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basicaa",
- "Basic Alias Analysis (stateless AA impl)", false, true)
+INITIALIZE_PASS_BEGIN(BasicAAWrapperPass, "basic-aa",
+ "Basic Alias Analysis (stateless AA impl)", true, true)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
-INITIALIZE_PASS_END(BasicAAWrapperPass, "basicaa",
- "Basic Alias Analysis (stateless AA impl)", false, true)
+INITIALIZE_PASS_DEPENDENCY(PhiValuesWrapperPass)
+INITIALIZE_PASS_END(BasicAAWrapperPass, "basic-aa",
+ "Basic Alias Analysis (stateless AA impl)", true, true)
FunctionPass *llvm::createBasicAAWrapperPass() {
return new BasicAAWrapperPass();
diff --git a/llvm/lib/Analysis/BlockFrequencyInfo.cpp b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
index 544bd7757ae4..b9b1fded9de3 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfo.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfo.cpp
@@ -98,7 +98,7 @@ static GVDAGType getGVDT() {
template <>
struct GraphTraits<BlockFrequencyInfo *> {
using NodeRef = const BasicBlock *;
- using ChildIteratorType = succ_const_iterator;
+ using ChildIteratorType = const_succ_iterator;
using nodes_iterator = pointer_iterator<Function::const_iterator>;
static NodeRef getEntryNode(const BlockFrequencyInfo *G) {
@@ -287,6 +287,11 @@ void BlockFrequencyInfo::print(raw_ostream &OS) const {
BFI->print(OS);
}
+void BlockFrequencyInfo::verifyMatch(BlockFrequencyInfo &Other) const {
+ if (BFI)
+ BFI->verifyMatch(*Other.BFI);
+}
+
INITIALIZE_PASS_BEGIN(BlockFrequencyInfoWrapperPass, "block-freq",
"Block Frequency Analysis", true, true)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
diff --git a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
index 0db6dd04a7e8..e4fda2472b3a 100644
--- a/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
+++ b/llvm/lib/Analysis/BlockFrequencyInfoImpl.cpp
@@ -40,6 +40,12 @@ using namespace llvm::bfi_detail;
#define DEBUG_TYPE "block-freq"
+cl::opt<bool> CheckBFIUnknownBlockQueries(
+ "check-bfi-unknown-block-queries",
+ cl::init(false), cl::Hidden,
+ cl::desc("Check if block frequency is queried for an unknown block "
+ "for debugging missed BFI updates"));
+
ScaledNumber<uint64_t> BlockMass::toScaled() const {
if (isFull())
return ScaledNumber<uint64_t>(1, 0);
@@ -550,8 +556,17 @@ void BlockFrequencyInfoImplBase::finalizeMetrics() {
BlockFrequency
BlockFrequencyInfoImplBase::getBlockFreq(const BlockNode &Node) const {
- if (!Node.isValid())
+ if (!Node.isValid()) {
+#ifndef NDEBUG
+ if (CheckBFIUnknownBlockQueries) {
+ SmallString<256> Msg;
+ raw_svector_ostream OS(Msg);
+ OS << "*** Detected BFI query for unknown block " << getBlockName(Node);
+ report_fatal_error(OS.str());
+ }
+#endif
return 0;
+ }
return Freqs[Node.Index].Integer;
}
diff --git a/llvm/lib/Analysis/BranchProbabilityInfo.cpp b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
index ffba65b5ed5e..a396b5ad21c6 100644
--- a/llvm/lib/Analysis/BranchProbabilityInfo.cpp
+++ b/llvm/lib/Analysis/BranchProbabilityInfo.cpp
@@ -61,6 +61,7 @@ INITIALIZE_PASS_BEGIN(BranchProbabilityInfoWrapperPass, "branch-prob",
"Branch Probability Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_END(BranchProbabilityInfoWrapperPass, "branch-prob",
"Branch Probability Analysis", false, true)
@@ -101,7 +102,7 @@ static const uint32_t LBH_UNLIKELY_WEIGHT = 62;
///
/// This is the probability for a branch being taken to a block that terminates
/// (eventually) in unreachable. These are predicted as unlikely as possible.
-/// All reachable probability will equally share the remaining part.
+/// All reachable probabilities will proportionally share the remaining part.
static const BranchProbability UR_TAKEN_PROB = BranchProbability::getRaw(1);
/// Weight for a branch taken going into a cold block.
@@ -240,7 +241,7 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
SmallVector<unsigned, 4> UnreachableEdges;
SmallVector<unsigned, 4> ReachableEdges;
- for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+ for (const_succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
if (PostDominatedByUnreachable.count(*I))
UnreachableEdges.push_back(I.getSuccessorIndex());
else
@@ -250,10 +251,13 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
if (UnreachableEdges.empty())
return false;
+ SmallVector<BranchProbability, 4> EdgeProbabilities(
+ BB->getTerminator()->getNumSuccessors(), BranchProbability::getUnknown());
if (ReachableEdges.empty()) {
BranchProbability Prob(1, UnreachableEdges.size());
for (unsigned SuccIdx : UnreachableEdges)
- setEdgeProbability(BB, SuccIdx, Prob);
+ EdgeProbabilities[SuccIdx] = Prob;
+ setEdgeProbability(BB, EdgeProbabilities);
return true;
}
@@ -263,10 +267,11 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
ReachableEdges.size();
for (unsigned SuccIdx : UnreachableEdges)
- setEdgeProbability(BB, SuccIdx, UnreachableProb);
+ EdgeProbabilities[SuccIdx] = UnreachableProb;
for (unsigned SuccIdx : ReachableEdges)
- setEdgeProbability(BB, SuccIdx, ReachableProb);
+ EdgeProbabilities[SuccIdx] = ReachableProb;
+ setEdgeProbability(BB, EdgeProbabilities);
return true;
}
@@ -277,7 +282,8 @@ bool BranchProbabilityInfo::calcUnreachableHeuristics(const BasicBlock *BB) {
bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
const Instruction *TI = BB->getTerminator();
assert(TI->getNumSuccessors() > 1 && "expected more than one successor!");
- if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) || isa<IndirectBrInst>(TI)))
+ if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) || isa<IndirectBrInst>(TI) ||
+ isa<InvokeInst>(TI)))
return false;
MDNode *WeightsNode = TI->getMetadata(LLVMContext::MD_prof);
@@ -300,19 +306,19 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
SmallVector<unsigned, 2> UnreachableIdxs;
SmallVector<unsigned, 2> ReachableIdxs;
Weights.reserve(TI->getNumSuccessors());
- for (unsigned i = 1, e = WeightsNode->getNumOperands(); i != e; ++i) {
+ for (unsigned I = 1, E = WeightsNode->getNumOperands(); I != E; ++I) {
ConstantInt *Weight =
- mdconst::dyn_extract<ConstantInt>(WeightsNode->getOperand(i));
+ mdconst::dyn_extract<ConstantInt>(WeightsNode->getOperand(I));
if (!Weight)
return false;
assert(Weight->getValue().getActiveBits() <= 32 &&
"Too many bits for uint32_t");
Weights.push_back(Weight->getZExtValue());
WeightSum += Weights.back();
- if (PostDominatedByUnreachable.count(TI->getSuccessor(i - 1)))
- UnreachableIdxs.push_back(i - 1);
+ if (PostDominatedByUnreachable.count(TI->getSuccessor(I - 1)))
+ UnreachableIdxs.push_back(I - 1);
else
- ReachableIdxs.push_back(i - 1);
+ ReachableIdxs.push_back(I - 1);
}
assert(Weights.size() == TI->getNumSuccessors() && "Checked above");
@@ -323,47 +329,93 @@ bool BranchProbabilityInfo::calcMetadataWeights(const BasicBlock *BB) {
if (ScalingFactor > 1) {
WeightSum = 0;
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i) {
- Weights[i] /= ScalingFactor;
- WeightSum += Weights[i];
+ for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I) {
+ Weights[I] /= ScalingFactor;
+ WeightSum += Weights[I];
}
}
assert(WeightSum <= UINT32_MAX &&
"Expected weights to scale down to 32 bits");
if (WeightSum == 0 || ReachableIdxs.size() == 0) {
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- Weights[i] = 1;
+ for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I)
+ Weights[I] = 1;
WeightSum = TI->getNumSuccessors();
}
// Set the probability.
SmallVector<BranchProbability, 2> BP;
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- BP.push_back({ Weights[i], static_cast<uint32_t>(WeightSum) });
+ for (unsigned I = 0, E = TI->getNumSuccessors(); I != E; ++I)
+ BP.push_back({ Weights[I], static_cast<uint32_t>(WeightSum) });
// Examine the metadata against the unreachable heuristic.
// If the unreachable heuristic is stronger, we use it for this edge.
- if (UnreachableIdxs.size() > 0 && ReachableIdxs.size() > 0) {
- auto ToDistribute = BranchProbability::getZero();
- auto UnreachableProb = UR_TAKEN_PROB;
- for (auto i : UnreachableIdxs)
- if (UnreachableProb < BP[i]) {
- ToDistribute += BP[i] - UnreachableProb;
- BP[i] = UnreachableProb;
- }
+ if (UnreachableIdxs.size() == 0 || ReachableIdxs.size() == 0) {
+ setEdgeProbability(BB, BP);
+ return true;
+ }
+
+ auto UnreachableProb = UR_TAKEN_PROB;
+ for (auto I : UnreachableIdxs)
+ if (UnreachableProb < BP[I]) {
+ BP[I] = UnreachableProb;
+ }
- // If we modified the probability of some edges then we must distribute
- // the difference between reachable blocks.
- if (ToDistribute > BranchProbability::getZero()) {
- BranchProbability PerEdge = ToDistribute / ReachableIdxs.size();
- for (auto i : ReachableIdxs)
- BP[i] += PerEdge;
+ // Sum of all edge probabilities must be 1.0. If we modified the probability
+ // of some edges then we must distribute the introduced difference over the
+ // reachable blocks.
+ //
+ // Proportional distribution: the relation between probabilities of the
+ // reachable edges is kept unchanged. That is for any reachable edges i and j:
+ // newBP[i] / newBP[j] == oldBP[i] / oldBP[j] =>
+ // newBP[i] / oldBP[i] == newBP[j] / oldBP[j] == K
+ // Where K is independent of i,j.
+ // newBP[i] == oldBP[i] * K
+ // We need to find K.
+ // Summing the reachable edges on both sides gives:
+ // sum_of_reachable(newBP) == K * sum_of_reachable(oldBP)
+ // Sum of newBP must be equal to 1.0:
+ // sum_of_reachable(newBP) + sum_of_unreachable(newBP) == 1.0 =>
+ // sum_of_reachable(newBP) = 1.0 - sum_of_unreachable(newBP)
+ // Where sum_of_unreachable(newBP) is what has just been changed.
+ // Finally:
+ // K == sum_of_reachable(newBP) / sum_of_reachable(oldBP) =>
+ // K == (1.0 - sum_of_unreachable(newBP)) / sum_of_reachable(oldBP)
+ BranchProbability NewUnreachableSum = BranchProbability::getZero();
+ for (auto I : UnreachableIdxs)
+ NewUnreachableSum += BP[I];
+
+ BranchProbability NewReachableSum =
+ BranchProbability::getOne() - NewUnreachableSum;
+
+ BranchProbability OldReachableSum = BranchProbability::getZero();
+ for (auto I : ReachableIdxs)
+ OldReachableSum += BP[I];
+
+ if (OldReachableSum != NewReachableSum) { // Anything to distribute?
+ if (OldReachableSum.isZero()) {
+ // If all oldBP[i] are zeroes then the proportional distribution results
+ // in all zero probabilities and the error stays big. In this case we
+ // evenly spread NewReachableSum over the reachable edges.
+ BranchProbability PerEdge = NewReachableSum / ReachableIdxs.size();
+ for (auto I : ReachableIdxs)
+ BP[I] = PerEdge;
+ } else {
+ for (auto I : ReachableIdxs) {
+ // We use uint64_t to avoid double rounding error of the following
+ // calculation: BP[i] = BP[i] * NewReachableSum / OldReachableSum
+ // The formula is taken from the private constructor
+ // BranchProbability(uint32_t Numerator, uint32_t Denominator)
+ uint64_t Mul = static_cast<uint64_t>(NewReachableSum.getNumerator()) *
+ BP[I].getNumerator();
+ uint32_t Div = static_cast<uint32_t>(
+ divideNearest(Mul, OldReachableSum.getNumerator()));
+ BP[I] = BranchProbability::getRaw(Div);
+ }
}
}
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- setEdgeProbability(BB, i, BP[i]);
+ setEdgeProbability(BB, BP);
return true;
}
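A worked numeric check of the proportional redistribution derived in the comment block above (standalone sketch, not part of the patch; the real code performs the same scaling in fixed point on BranchProbability numerators, rounding with divideNearest). Take a three-way branch with profile probabilities {0.50, 0.30, 0.20} whose first successor is post-dominated by unreachable:

#include <cstdio>

int main() {
  // Successor 0 is clamped towards zero (UR_TAKEN_PROB, a near-zero value,
  // is treated as 0.0 here).
  double OldBP[3] = {0.50, 0.30, 0.20};
  double NewUnreachableSum = 0.0;
  double OldReachableSum = OldBP[1] + OldBP[2];     // 0.50
  double NewReachableSum = 1.0 - NewUnreachableSum; // 1.00
  double K = NewReachableSum / OldReachableSum;     // 2.00
  double NewBP1 = OldBP[1] * K;                     // 0.60
  double NewBP2 = OldBP[2] * K;                     // 0.40
  // The ratio 0.30 : 0.20 is preserved as 0.60 : 0.40 and the edge
  // probabilities sum to 1.0 again.
  std::printf("%.2f %.2f\n", NewBP1, NewBP2);
  return 0;
}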
@@ -386,7 +438,7 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
// Determine which successors are post-dominated by a cold block.
SmallVector<unsigned, 4> ColdEdges;
SmallVector<unsigned, 4> NormalEdges;
- for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
+ for (const_succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I)
if (PostDominatedByColdCall.count(*I))
ColdEdges.push_back(I.getSuccessorIndex());
else
@@ -396,10 +448,13 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
if (ColdEdges.empty())
return false;
+ SmallVector<BranchProbability, 4> EdgeProbabilities(
+ BB->getTerminator()->getNumSuccessors(), BranchProbability::getUnknown());
if (NormalEdges.empty()) {
BranchProbability Prob(1, ColdEdges.size());
for (unsigned SuccIdx : ColdEdges)
- setEdgeProbability(BB, SuccIdx, Prob);
+ EdgeProbabilities[SuccIdx] = Prob;
+ setEdgeProbability(BB, EdgeProbabilities);
return true;
}
@@ -411,10 +466,11 @@ bool BranchProbabilityInfo::calcColdCallHeuristics(const BasicBlock *BB) {
(CC_TAKEN_WEIGHT + CC_NONTAKEN_WEIGHT) * uint64_t(NormalEdges.size()));
for (unsigned SuccIdx : ColdEdges)
- setEdgeProbability(BB, SuccIdx, ColdProb);
+ EdgeProbabilities[SuccIdx] = ColdProb;
for (unsigned SuccIdx : NormalEdges)
- setEdgeProbability(BB, SuccIdx, NormalProb);
+ EdgeProbabilities[SuccIdx] = NormalProb;
+ setEdgeProbability(BB, EdgeProbabilities);
return true;
}
@@ -437,19 +493,21 @@ bool BranchProbabilityInfo::calcPointerHeuristics(const BasicBlock *BB) {
assert(CI->getOperand(1)->getType()->isPointerTy());
+ BranchProbability TakenProb(PH_TAKEN_WEIGHT,
+ PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT);
+ BranchProbability UntakenProb(PH_NONTAKEN_WEIGHT,
+ PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT);
+
// p != 0 -> isProb = true
// p == 0 -> isProb = false
// p != q -> isProb = true
// p == q -> isProb = false;
- unsigned TakenIdx = 0, NonTakenIdx = 1;
bool isProb = CI->getPredicate() == ICmpInst::ICMP_NE;
if (!isProb)
- std::swap(TakenIdx, NonTakenIdx);
+ std::swap(TakenProb, UntakenProb);
- BranchProbability TakenProb(PH_TAKEN_WEIGHT,
- PH_TAKEN_WEIGHT + PH_NONTAKEN_WEIGHT);
- setEdgeProbability(BB, TakenIdx, TakenProb);
- setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl());
+ setEdgeProbability(
+ BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb}));
return true;
}
@@ -614,7 +672,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
SmallVector<unsigned, 8> InEdges; // Edges from header to the loop.
SmallVector<unsigned, 8> UnlikelyEdges;
- for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ for (const_succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
// Use LoopInfo if we have it, otherwise fall-back to SCC info to catch
// irreducible loops.
if (L) {
@@ -646,18 +704,20 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
(UnlikelyEdges.empty() ? 0 : LBH_UNLIKELY_WEIGHT) +
(ExitingEdges.empty() ? 0 : LBH_NONTAKEN_WEIGHT);
+ SmallVector<BranchProbability, 4> EdgeProbabilities(
+ BB->getTerminator()->getNumSuccessors(), BranchProbability::getUnknown());
if (uint32_t numBackEdges = BackEdges.size()) {
BranchProbability TakenProb = BranchProbability(LBH_TAKEN_WEIGHT, Denom);
auto Prob = TakenProb / numBackEdges;
for (unsigned SuccIdx : BackEdges)
- setEdgeProbability(BB, SuccIdx, Prob);
+ EdgeProbabilities[SuccIdx] = Prob;
}
if (uint32_t numInEdges = InEdges.size()) {
BranchProbability TakenProb = BranchProbability(LBH_TAKEN_WEIGHT, Denom);
auto Prob = TakenProb / numInEdges;
for (unsigned SuccIdx : InEdges)
- setEdgeProbability(BB, SuccIdx, Prob);
+ EdgeProbabilities[SuccIdx] = Prob;
}
if (uint32_t numExitingEdges = ExitingEdges.size()) {
@@ -665,7 +725,7 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
Denom);
auto Prob = NotTakenProb / numExitingEdges;
for (unsigned SuccIdx : ExitingEdges)
- setEdgeProbability(BB, SuccIdx, Prob);
+ EdgeProbabilities[SuccIdx] = Prob;
}
if (uint32_t numUnlikelyEdges = UnlikelyEdges.size()) {
@@ -673,9 +733,10 @@ bool BranchProbabilityInfo::calcLoopBranchHeuristics(const BasicBlock *BB,
Denom);
auto Prob = UnlikelyProb / numUnlikelyEdges;
for (unsigned SuccIdx : UnlikelyEdges)
- setEdgeProbability(BB, SuccIdx, Prob);
+ EdgeProbabilities[SuccIdx] = Prob;
}
+ setEdgeProbability(BB, EdgeProbabilities);
return true;
}
@@ -786,15 +847,15 @@ bool BranchProbabilityInfo::calcZeroHeuristics(const BasicBlock *BB,
return false;
}
- unsigned TakenIdx = 0, NonTakenIdx = 1;
-
- if (!isProb)
- std::swap(TakenIdx, NonTakenIdx);
-
BranchProbability TakenProb(ZH_TAKEN_WEIGHT,
ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT);
- setEdgeProbability(BB, TakenIdx, TakenProb);
- setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl());
+ BranchProbability UntakenProb(ZH_NONTAKEN_WEIGHT,
+ ZH_TAKEN_WEIGHT + ZH_NONTAKEN_WEIGHT);
+ if (!isProb)
+ std::swap(TakenProb, UntakenProb);
+
+ setEdgeProbability(
+ BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb}));
return true;
}
@@ -829,14 +890,13 @@ bool BranchProbabilityInfo::calcFloatingPointHeuristics(const BasicBlock *BB) {
return false;
}
- unsigned TakenIdx = 0, NonTakenIdx = 1;
-
+ BranchProbability TakenProb(TakenWeight, TakenWeight + NontakenWeight);
+ BranchProbability UntakenProb(NontakenWeight, TakenWeight + NontakenWeight);
if (!isProb)
- std::swap(TakenIdx, NonTakenIdx);
+ std::swap(TakenProb, UntakenProb);
- BranchProbability TakenProb(TakenWeight, TakenWeight + NontakenWeight);
- setEdgeProbability(BB, TakenIdx, TakenProb);
- setEdgeProbability(BB, NonTakenIdx, TakenProb.getCompl());
+ setEdgeProbability(
+ BB, SmallVector<BranchProbability, 2>({TakenProb, UntakenProb}));
return true;
}
@@ -847,13 +907,23 @@ bool BranchProbabilityInfo::calcInvokeHeuristics(const BasicBlock *BB) {
BranchProbability TakenProb(IH_TAKEN_WEIGHT,
IH_TAKEN_WEIGHT + IH_NONTAKEN_WEIGHT);
- setEdgeProbability(BB, 0 /*Index for Normal*/, TakenProb);
- setEdgeProbability(BB, 1 /*Index for Unwind*/, TakenProb.getCompl());
+ setEdgeProbability(
+ BB, SmallVector<BranchProbability, 2>({TakenProb, TakenProb.getCompl()}));
return true;
}
void BranchProbabilityInfo::releaseMemory() {
Probs.clear();
+ Handles.clear();
+}
+
+bool BranchProbabilityInfo::invalidate(Function &, const PreservedAnalyses &PA,
+ FunctionAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<BranchProbabilityAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>() ||
+ PAC.preservedSet<CFGAnalyses>());
}
void BranchProbabilityInfo::print(raw_ostream &OS) const {
@@ -862,7 +932,7 @@ void BranchProbabilityInfo::print(raw_ostream &OS) const {
// or the function it is currently running over.
assert(LastF && "Cannot print prior to running over a function");
for (const auto &BI : *LastF) {
- for (succ_const_iterator SI = succ_begin(&BI), SE = succ_end(&BI); SI != SE;
+ for (const_succ_iterator SI = succ_begin(&BI), SE = succ_end(&BI); SI != SE;
++SI) {
printEdgeProbability(OS << " ", &BI, *SI);
}
@@ -881,7 +951,7 @@ BranchProbabilityInfo::getHotSucc(const BasicBlock *BB) const {
auto MaxProb = BranchProbability::getZero();
const BasicBlock *MaxSucc = nullptr;
- for (succ_const_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ for (const_succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
const BasicBlock *Succ = *I;
auto Prob = getEdgeProbability(BB, Succ);
if (Prob > MaxProb) {
@@ -914,7 +984,7 @@ BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
BranchProbability
BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
- succ_const_iterator Dst) const {
+ const_succ_iterator Dst) const {
return getEdgeProbability(Src, Dst.getSuccessorIndex());
}
@@ -925,8 +995,10 @@ BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
const BasicBlock *Dst) const {
auto Prob = BranchProbability::getZero();
bool FoundProb = false;
- for (succ_const_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I)
+ uint32_t EdgeCount = 0;
+ for (const_succ_iterator I = succ_begin(Src), E = succ_end(Src); I != E; ++I)
if (*I == Dst) {
+ ++EdgeCount;
auto MapI = Probs.find(std::make_pair(Src, I.getSuccessorIndex()));
if (MapI != Probs.end()) {
FoundProb = true;
@@ -934,7 +1006,7 @@ BranchProbabilityInfo::getEdgeProbability(const BasicBlock *Src,
}
}
uint32_t succ_num = std::distance(succ_begin(Src), succ_end(Src));
- return FoundProb ? Prob : BranchProbability(1, succ_num);
+ return FoundProb ? Prob : BranchProbability(EdgeCount, succ_num);
}
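With the change above, the fallback probability for an edge that has no recorded entry now accounts for duplicate edges to the same successor: for example, if Src ends in a switch with four successor slots, two of which branch to Dst, the default returned is 2/4 = 1/2 rather than the previous 1/4.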
/// Set the edge probability for a given edge specified by PredBlock and an
@@ -949,6 +1021,28 @@ void BranchProbabilityInfo::setEdgeProbability(const BasicBlock *Src,
<< "\n");
}
+/// Set the edge probability for all edges at once.
+void BranchProbabilityInfo::setEdgeProbability(
+ const BasicBlock *Src, const SmallVectorImpl<BranchProbability> &Probs) {
+ assert(Src->getTerminator()->getNumSuccessors() == Probs.size());
+ if (Probs.size() == 0)
+ return; // Nothing to set.
+
+ uint64_t TotalNumerator = 0;
+ for (unsigned SuccIdx = 0; SuccIdx < Probs.size(); ++SuccIdx) {
+ setEdgeProbability(Src, SuccIdx, Probs[SuccIdx]);
+ TotalNumerator += Probs[SuccIdx].getNumerator();
+ }
+
+ // Because of rounding errors the total probability cannot be checked to be
+ // 1.0 exactly. That is, TotalNumerator == BranchProbability::getDenominator.
+ // Instead, every single probability in Probs must be as accurate as possible.
+ // This results in an error of at most 1/denominator, so the total absolute error
+ // should be within Probs.size / BranchProbability::getDenominator.
+ assert(TotalNumerator <= BranchProbability::getDenominator() + Probs.size());
+ assert(TotalNumerator >= BranchProbability::getDenominator() - Probs.size());
+}
+
raw_ostream &
BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
const BasicBlock *Src,
@@ -962,15 +1056,16 @@ BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS,
}
void BranchProbabilityInfo::eraseBlock(const BasicBlock *BB) {
- for (auto I = Probs.begin(), E = Probs.end(); I != E; ++I) {
- auto Key = I->first;
- if (Key.first == BB)
- Probs.erase(Key);
+ for (const_succ_iterator I = succ_begin(BB), E = succ_end(BB); I != E; ++I) {
+ auto MapI = Probs.find(std::make_pair(BB, I.getSuccessorIndex()));
+ if (MapI != Probs.end())
+ Probs.erase(MapI);
}
}
void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ PostDominatorTree *PDT) {
LLVM_DEBUG(dbgs() << "---- Branch Probability Info : " << F.getName()
<< " ----\n\n");
LastF = &F; // Store the last function we ran on for printing.
@@ -998,10 +1093,15 @@ void BranchProbabilityInfo::calculate(const Function &F, const LoopInfo &LI,
LLVM_DEBUG(dbgs() << "\n");
}
- std::unique_ptr<PostDominatorTree> PDT =
- std::make_unique<PostDominatorTree>(const_cast<Function &>(F));
- computePostDominatedByUnreachable(F, PDT.get());
- computePostDominatedByColdCall(F, PDT.get());
+ std::unique_ptr<PostDominatorTree> PDTPtr;
+
+ if (!PDT) {
+ PDTPtr = std::make_unique<PostDominatorTree>(const_cast<Function &>(F));
+ PDT = PDTPtr.get();
+ }
+
+ computePostDominatedByUnreachable(F, PDT);
+ computePostDominatedByColdCall(F, PDT);
// Walk the basic blocks in post-order so that we can build up state about
// the successors of a block iteratively.
@@ -1047,6 +1147,7 @@ void BranchProbabilityInfoWrapperPass::getAnalysisUsage(
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<PostDominatorTreeWrapperPass>();
AU.setPreservesAll();
}
@@ -1054,7 +1155,9 @@ bool BranchProbabilityInfoWrapperPass::runOnFunction(Function &F) {
const LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
const TargetLibraryInfo &TLI =
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- BPI.calculate(F, LI, &TLI);
+ PostDominatorTree &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+ BPI.calculate(F, LI, &TLI, &PDT);
return false;
}
@@ -1069,7 +1172,9 @@ AnalysisKey BranchProbabilityAnalysis::Key;
BranchProbabilityInfo
BranchProbabilityAnalysis::run(Function &F, FunctionAnalysisManager &AM) {
BranchProbabilityInfo BPI;
- BPI.calculate(F, AM.getResult<LoopAnalysis>(F), &AM.getResult<TargetLibraryAnalysis>(F));
+ BPI.calculate(F, AM.getResult<LoopAnalysis>(F),
+ &AM.getResult<TargetLibraryAnalysis>(F),
+ &AM.getResult<PostDominatorTreeAnalysis>(F));
return BPI;
}
diff --git a/llvm/lib/Analysis/CFG.cpp b/llvm/lib/Analysis/CFG.cpp
index 8215b4ecbb03..b46a6951dd25 100644
--- a/llvm/lib/Analysis/CFG.cpp
+++ b/llvm/lib/Analysis/CFG.cpp
@@ -12,8 +12,6 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CFG.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
@@ -31,16 +29,16 @@ void llvm::FindFunctionBackedges(const Function &F,
return;
SmallPtrSet<const BasicBlock*, 8> Visited;
- SmallVector<std::pair<const BasicBlock*, succ_const_iterator>, 8> VisitStack;
+ SmallVector<std::pair<const BasicBlock *, const_succ_iterator>, 8> VisitStack;
SmallPtrSet<const BasicBlock*, 8> InStack;
Visited.insert(BB);
VisitStack.push_back(std::make_pair(BB, succ_begin(BB)));
InStack.insert(BB);
do {
- std::pair<const BasicBlock*, succ_const_iterator> &Top = VisitStack.back();
+ std::pair<const BasicBlock *, const_succ_iterator> &Top = VisitStack.back();
const BasicBlock *ParentBB = Top.first;
- succ_const_iterator &I = Top.second;
+ const_succ_iterator &I = Top.second;
bool FoundNew = false;
while (I != succ_end(ParentBB)) {
diff --git a/llvm/lib/Analysis/CFGPrinter.cpp b/llvm/lib/Analysis/CFGPrinter.cpp
index 88e7d3bdede1..cf4afc8cfd9c 100644
--- a/llvm/lib/Analysis/CFGPrinter.cpp
+++ b/llvm/lib/Analysis/CFGPrinter.cpp
@@ -18,69 +18,135 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CFGPrinter.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/FileSystem.h"
+#include <algorithm>
+
using namespace llvm;
-static cl::opt<std::string> CFGFuncName(
- "cfg-func-name", cl::Hidden,
- cl::desc("The name of a function (or its substring)"
- " whose CFG is viewed/printed."));
+static cl::opt<std::string>
+ CFGFuncName("cfg-func-name", cl::Hidden,
+ cl::desc("The name of a function (or its substring)"
+ " whose CFG is viewed/printed."));
static cl::opt<std::string> CFGDotFilenamePrefix(
"cfg-dot-filename-prefix", cl::Hidden,
cl::desc("The prefix used for the CFG dot file names."));
-namespace {
- struct CFGViewerLegacyPass : public FunctionPass {
- static char ID; // Pass identifcation, replacement for typeid
- CFGViewerLegacyPass() : FunctionPass(ID) {
- initializeCFGViewerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
+static cl::opt<bool> HideUnreachablePaths("cfg-hide-unreachable-paths",
+ cl::init(false));
- bool runOnFunction(Function &F) override {
- F.viewCFG();
- return false;
- }
+static cl::opt<bool> HideDeoptimizePaths("cfg-hide-deoptimize-paths",
+ cl::init(false));
- void print(raw_ostream &OS, const Module* = nullptr) const override {}
+static cl::opt<bool> ShowHeatColors("cfg-heat-colors", cl::init(true),
+ cl::Hidden,
+ cl::desc("Show heat colors in CFG"));
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
-}
+static cl::opt<bool> UseRawEdgeWeight("cfg-raw-weights", cl::init(false),
+ cl::Hidden,
+ cl::desc("Use raw weights for labels. "
+ "Use percentages as default."));
-char CFGViewerLegacyPass::ID = 0;
-INITIALIZE_PASS(CFGViewerLegacyPass, "view-cfg", "View CFG of function", false, true)
+static cl::opt<bool>
+ ShowEdgeWeight("cfg-weights", cl::init(false), cl::Hidden,
+ cl::desc("Show edges labeled with weights"));
-PreservedAnalyses CFGViewerPass::run(Function &F,
- FunctionAnalysisManager &AM) {
- F.viewCFG();
- return PreservedAnalyses::all();
+static void writeCFGToDotFile(Function &F, BlockFrequencyInfo *BFI,
+ BranchProbabilityInfo *BPI, uint64_t MaxFreq,
+ bool CFGOnly = false) {
+ std::string Filename =
+ (CFGDotFilenamePrefix + "." + F.getName() + ".dot").str();
+ errs() << "Writing '" << Filename << "'...";
+
+ std::error_code EC;
+ raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
+
+ DOTFuncInfo CFGInfo(&F, BFI, BPI, MaxFreq);
+ CFGInfo.setHeatColors(ShowHeatColors);
+ CFGInfo.setEdgeWeights(ShowEdgeWeight);
+ CFGInfo.setRawEdgeWeights(UseRawEdgeWeight);
+
+ if (!EC)
+ WriteGraph(File, &CFGInfo, CFGOnly);
+ else
+ errs() << " error opening file for writing!";
+ errs() << "\n";
}
+static void viewCFG(Function &F, const BlockFrequencyInfo *BFI,
+ const BranchProbabilityInfo *BPI, uint64_t MaxFreq,
+ bool CFGOnly = false) {
+ DOTFuncInfo CFGInfo(&F, BFI, BPI, MaxFreq);
+ CFGInfo.setHeatColors(ShowHeatColors);
+ CFGInfo.setEdgeWeights(ShowEdgeWeight);
+ CFGInfo.setRawEdgeWeights(UseRawEdgeWeight);
+
+ ViewGraph(&CFGInfo, "cfg." + F.getName(), CFGOnly);
+}
namespace {
- struct CFGOnlyViewerLegacyPass : public FunctionPass {
- static char ID; // Pass identifcation, replacement for typeid
- CFGOnlyViewerLegacyPass() : FunctionPass(ID) {
- initializeCFGOnlyViewerLegacyPassPass(*PassRegistry::getPassRegistry());
- }
+struct CFGViewerLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGViewerLegacyPass() : FunctionPass(ID) {
+ initializeCFGViewerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ viewCFG(F, BFI, BPI, getMaxFreq(F, BFI));
+ return false;
+ }
+
+ void print(raw_ostream &OS, const Module * = nullptr) const override {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+};
+}
- bool runOnFunction(Function &F) override {
- F.viewCFGOnly();
- return false;
- }
+char CFGViewerLegacyPass::ID = 0;
+INITIALIZE_PASS(CFGViewerLegacyPass, "view-cfg", "View CFG of function", false,
+ true)
- void print(raw_ostream &OS, const Module* = nullptr) const override {}
+PreservedAnalyses CFGViewerPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto *BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
+ auto *BPI = &AM.getResult<BranchProbabilityAnalysis>(F);
+ viewCFG(F, BFI, BPI, getMaxFreq(F, BFI));
+ return PreservedAnalyses::all();
+}
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
+namespace {
+struct CFGOnlyViewerLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGOnlyViewerLegacyPass() : FunctionPass(ID) {
+ initializeCFGOnlyViewerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ viewCFG(F, BFI, BPI, getMaxFreq(F, BFI), /*CFGOnly=*/true);
+ return false;
+ }
+
+ void print(raw_ostream &OS, const Module * = nullptr) const override {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+};
}
char CFGOnlyViewerLegacyPass::ID = 0;
@@ -89,84 +155,83 @@ INITIALIZE_PASS(CFGOnlyViewerLegacyPass, "view-cfg-only",
PreservedAnalyses CFGOnlyViewerPass::run(Function &F,
FunctionAnalysisManager &AM) {
- F.viewCFGOnly();
+ auto *BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
+ auto *BPI = &AM.getResult<BranchProbabilityAnalysis>(F);
+ viewCFG(F, BFI, BPI, getMaxFreq(F, BFI), /*CFGOnly=*/true);
return PreservedAnalyses::all();
}
-static void writeCFGToDotFile(Function &F, bool CFGOnly = false) {
- if (!CFGFuncName.empty() && !F.getName().contains(CFGFuncName))
- return;
- std::string Filename =
- (CFGDotFilenamePrefix + "." + F.getName() + ".dot").str();
- errs() << "Writing '" << Filename << "'...";
-
- std::error_code EC;
- raw_fd_ostream File(Filename, EC, sys::fs::OF_Text);
-
- if (!EC)
- WriteGraph(File, (const Function*)&F, CFGOnly);
- else
- errs() << " error opening file for writing!";
- errs() << "\n";
-}
-
namespace {
- struct CFGPrinterLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- CFGPrinterLegacyPass() : FunctionPass(ID) {
- initializeCFGPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- writeCFGToDotFile(F);
- return false;
- }
-
- void print(raw_ostream &OS, const Module* = nullptr) const override {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
+struct CFGPrinterLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGPrinterLegacyPass() : FunctionPass(ID) {
+ initializeCFGPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ writeCFGToDotFile(F, BFI, BPI, getMaxFreq(F, BFI));
+ return false;
+ }
+
+ void print(raw_ostream &OS, const Module * = nullptr) const override {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+};
}
char CFGPrinterLegacyPass::ID = 0;
-INITIALIZE_PASS(CFGPrinterLegacyPass, "dot-cfg", "Print CFG of function to 'dot' file",
- false, true)
+INITIALIZE_PASS(CFGPrinterLegacyPass, "dot-cfg",
+ "Print CFG of function to 'dot' file", false, true)
PreservedAnalyses CFGPrinterPass::run(Function &F,
FunctionAnalysisManager &AM) {
- writeCFGToDotFile(F);
+ auto *BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
+ auto *BPI = &AM.getResult<BranchProbabilityAnalysis>(F);
+ writeCFGToDotFile(F, BFI, BPI, getMaxFreq(F, BFI));
return PreservedAnalyses::all();
}
namespace {
- struct CFGOnlyPrinterLegacyPass : public FunctionPass {
- static char ID; // Pass identification, replacement for typeid
- CFGOnlyPrinterLegacyPass() : FunctionPass(ID) {
- initializeCFGOnlyPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
- }
-
- bool runOnFunction(Function &F) override {
- writeCFGToDotFile(F, /*CFGOnly=*/true);
- return false;
- }
- void print(raw_ostream &OS, const Module* = nullptr) const override {}
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesAll();
- }
- };
+struct CFGOnlyPrinterLegacyPass : public FunctionPass {
+ static char ID; // Pass identification, replacement for typeid
+ CFGOnlyPrinterLegacyPass() : FunctionPass(ID) {
+ initializeCFGOnlyPrinterLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnFunction(Function &F) override {
+ auto *BPI = &getAnalysis<BranchProbabilityInfoWrapperPass>().getBPI();
+ auto *BFI = &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI();
+ writeCFGToDotFile(F, BFI, BPI, getMaxFreq(F, BFI), /*CFGOnly=*/true);
+ return false;
+ }
+ void print(raw_ostream &OS, const Module * = nullptr) const override {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ FunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<BranchProbabilityInfoWrapperPass>();
+ AU.setPreservesAll();
+ }
+};
}
char CFGOnlyPrinterLegacyPass::ID = 0;
INITIALIZE_PASS(CFGOnlyPrinterLegacyPass, "dot-cfg-only",
- "Print CFG of function to 'dot' file (with no function bodies)",
- false, true)
+ "Print CFG of function to 'dot' file (with no function bodies)",
+ false, true)
PreservedAnalyses CFGOnlyPrinterPass::run(Function &F,
FunctionAnalysisManager &AM) {
- writeCFGToDotFile(F, /*CFGOnly=*/true);
+ auto *BFI = &AM.getResult<BlockFrequencyAnalysis>(F);
+ auto *BPI = &AM.getResult<BranchProbabilityAnalysis>(F);
+ writeCFGToDotFile(F, BFI, BPI, getMaxFreq(F, BFI), /*CFGOnly=*/true);
return PreservedAnalyses::all();
}
@@ -175,10 +240,14 @@ PreservedAnalyses CFGOnlyPrinterPass::run(Function &F,
/// program, displaying the CFG of the current function. This depends on there
/// being a 'dot' and 'gv' program in your path.
///
-void Function::viewCFG() const {
+void Function::viewCFG() const { viewCFG(false, nullptr, nullptr); }
+
+void Function::viewCFG(bool ViewCFGOnly, const BlockFrequencyInfo *BFI,
+ const BranchProbabilityInfo *BPI) const {
if (!CFGFuncName.empty() && !getName().contains(CFGFuncName))
- return;
- ViewGraph(this, "cfg" + getName());
+ return;
+ DOTFuncInfo CFGInfo(this, BFI, BPI, BFI ? getMaxFreq(*this, BFI) : 0);
+ ViewGraph(&CFGInfo, "cfg" + getName(), ViewCFGOnly);
}
/// viewCFGOnly - This function is meant for use from the debugger. It works
@@ -186,17 +255,45 @@ void Function::viewCFG() const {
/// into the nodes, just the label. If you are only interested in the CFG
/// this can make the graph smaller.
///
-void Function::viewCFGOnly() const {
- if (!CFGFuncName.empty() && !getName().contains(CFGFuncName))
- return;
- ViewGraph(this, "cfg" + getName(), true);
+void Function::viewCFGOnly() const { viewCFGOnly(nullptr, nullptr); }
+
+void Function::viewCFGOnly(const BlockFrequencyInfo *BFI,
+ const BranchProbabilityInfo *BPI) const {
+ viewCFG(true, BFI, BPI);
}
-FunctionPass *llvm::createCFGPrinterLegacyPassPass () {
+FunctionPass *llvm::createCFGPrinterLegacyPassPass() {
return new CFGPrinterLegacyPass();
}
-FunctionPass *llvm::createCFGOnlyPrinterLegacyPassPass () {
+FunctionPass *llvm::createCFGOnlyPrinterLegacyPassPass() {
return new CFGOnlyPrinterLegacyPass();
}
+void DOTGraphTraits<DOTFuncInfo *>::computeHiddenNodes(const Function *F) {
+ auto evaluateBB = [&](const BasicBlock *Node) {
+ if (succ_begin(Node) == succ_end(Node)) {
+ const Instruction *TI = Node->getTerminator();
+ isHiddenBasicBlock[Node] =
+ (HideUnreachablePaths && isa<UnreachableInst>(TI)) ||
+ (HideDeoptimizePaths && Node->getTerminatingDeoptimizeCall());
+ return;
+ }
+ isHiddenBasicBlock[Node] = std::all_of(
+ succ_begin(Node), succ_end(Node),
+ [this](const BasicBlock *BB) { return isHiddenBasicBlock[BB]; });
+ };
+ /// The post-order traversal ensures that isHiddenBasicBlock has already
+ /// been computed for all successors of the current BB before it is visited.
+ for_each(po_begin(&F->getEntryBlock()), po_end(&F->getEntryBlock()),
+ evaluateBB);
+}
+
+bool DOTGraphTraits<DOTFuncInfo *>::isNodeHidden(const BasicBlock *Node) {
+ // If both restricting flags are false, all nodes are displayed.
+ if (!HideUnreachablePaths && !HideDeoptimizePaths)
+ return false;
+ if (isHiddenBasicBlock.find(Node) == isHiddenBasicBlock.end())
+ computeHiddenNodes(Node->getParent());
+ return isHiddenBasicBlock[Node];
+}
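Function::viewCFG() and viewCFGOnly() now have overloads that take BlockFrequencyInfo and BranchProbabilityInfo so the rendered graph can carry heat colors and edge weights. A small sketch under the assumption that BFI/BPI are already available in the calling context (the helper name is hypothetical):

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/BranchProbabilityInfo.h"
  #include "llvm/IR/Function.h"
  using namespace llvm;

  // Illustrative helper: pop up the annotated CFG viewer.
  static void viewAnnotatedCFG(const Function &F, const BlockFrequencyInfo *BFI,
                               const BranchProbabilityInfo *BPI,
                               bool HideBodies) {
    if (HideBodies)
      F.viewCFGOnly(BFI, BPI);                     // block labels only
    else
      F.viewCFG(/*ViewCFGOnly=*/false, BFI, BPI);  // full instruction bodies
  }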
diff --git a/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp b/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
index eb5c96e6eeca..179f0633df06 100644
--- a/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/CFLAndersAliasAnalysis.cpp
@@ -552,7 +552,7 @@ bool CFLAndersAAResult::FunctionInfo::mayAlias(
return std::less<const Value *>()(LHS.Val, RHS.Val);
};
#ifdef EXPENSIVE_CHECKS
- assert(std::is_sorted(Itr->second.begin(), Itr->second.end(), Comparator));
+ assert(llvm::is_sorted(Itr->second, Comparator));
#endif
auto RangePair = std::equal_range(Itr->second.begin(), Itr->second.end(),
OffsetValue{RHS, 0}, Comparator);
diff --git a/llvm/lib/Analysis/CGSCCPassManager.cpp b/llvm/lib/Analysis/CGSCCPassManager.cpp
index a0b3f83cca6a..fd3166f8cd0c 100644
--- a/llvm/lib/Analysis/CGSCCPassManager.cpp
+++ b/llvm/lib/Analysis/CGSCCPassManager.cpp
@@ -15,14 +15,15 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/LazyCallGraph.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PassManagerImpl.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/TimeProfiler.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -67,16 +68,24 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
// a pointer that we can update.
LazyCallGraph::SCC *C = &InitialC;
- for (auto &Pass : Passes) {
- if (DebugLogging)
- dbgs() << "Running pass: " << Pass->name() << " on " << *C << "\n";
+ // Get Function analysis manager from its proxy.
+ FunctionAnalysisManager &FAM =
+ AM.getCachedResult<FunctionAnalysisManagerCGSCCProxy>(*C)->getManager();
+ for (auto &Pass : Passes) {
// Check the PassInstrumentation's BeforePass callbacks before running the
// pass, skip its execution completely if asked to (callback returns false).
if (!PI.runBeforePass(*Pass, *C))
continue;
- PreservedAnalyses PassPA = Pass->run(*C, AM, G, UR);
+ if (DebugLogging)
+ dbgs() << "Running pass: " << Pass->name() << " on " << *C << "\n";
+
+ PreservedAnalyses PassPA;
+ {
+ TimeTraceScope TimeScope(Pass->name());
+ PassPA = Pass->run(*C, AM, G, UR);
+ }
if (UR.InvalidatedSCCs.count(C))
PI.runAfterPassInvalidated<LazyCallGraph::SCC>(*Pass);
@@ -85,6 +94,12 @@ PassManager<LazyCallGraph::SCC, CGSCCAnalysisManager, LazyCallGraph &,
// Update the SCC if necessary.
C = UR.UpdatedC ? UR.UpdatedC : C;
+ if (UR.UpdatedC) {
+ // If C is updated, also create a proxy and update FAM inside the result.
+ auto *ResultFAMCP =
+ &AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, G);
+ ResultFAMCP->updateFAM(FAM);
+ }
// If the CGSCC pass wasn't able to provide a valid updated SCC, the
// current SCC may simply need to be skipped if invalid.
@@ -218,23 +233,22 @@ FunctionAnalysisManagerCGSCCProxy::Result
FunctionAnalysisManagerCGSCCProxy::run(LazyCallGraph::SCC &C,
CGSCCAnalysisManager &AM,
LazyCallGraph &CG) {
- // Collect the FunctionAnalysisManager from the Module layer and use that to
- // build the proxy result.
- //
- // This allows us to rely on the FunctionAnalysisMangaerModuleProxy to
- // invalidate the function analyses.
- auto &MAM = AM.getResult<ModuleAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ // Note: unconditionally getting the proxy (instead of just checking that it
+ // exists) may construct it at this point. There are cases where this runs
+ // unnecessarily, but it is cheap and having the assertion in place is more
+ // valuable.
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerCGSCCProxy>(C, CG);
Module &M = *C.begin()->getFunction().getParent();
- auto *FAMProxy = MAM.getCachedResult<FunctionAnalysisManagerModuleProxy>(M);
- assert(FAMProxy && "The CGSCC pass manager requires that the FAM module "
- "proxy is run on the module prior to entering the CGSCC "
- "walk.");
-
- // Note that we special-case invalidation handling of this proxy in the CGSCC
- // analysis manager's Module proxy. This avoids the need to do anything
- // special here to recompute all of this if ever the FAM's module proxy goes
- // away.
- return Result(FAMProxy->getManager());
+ bool ProxyExists =
+ MAMProxy.cachedResultExists<FunctionAnalysisManagerModuleProxy>(M);
+ assert(ProxyExists &&
+ "The CGSCC pass manager requires that the FAM module proxy is run "
+ "on the module prior to entering the CGSCC walk");
+ (void)ProxyExists;
+
+ // We just return an empty result. The caller will use the updateFAM interface
+ // to correctly register the relevant FunctionAnalysisManager based on the
+ // context in which this proxy is run.
+ return Result();
}
bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
@@ -244,8 +258,8 @@ bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
if (PA.areAllPreserved())
return false; // This is still a valid proxy.
- // If this proxy isn't marked as preserved, then even if the result remains
- // valid, the key itself may no longer be valid, so we clear everything.
+ // All updates to preserve valid results are done below, so we don't need to
+ // invalidate this proxy.
//
// Note that in order to preserve this proxy, a module pass must ensure that
// the FAM has been completely updated to handle the deletion of functions.
@@ -257,7 +271,7 @@ bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
for (LazyCallGraph::Node &N : C)
FAM->clear(N.getFunction(), N.getFunction().getName());
- return true;
+ return false;
}
// Directly check if the relevant set is preserved.
@@ -306,9 +320,10 @@ bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
} // end namespace llvm
-/// When a new SCC is created for the graph and there might be function
-/// analysis results cached for the functions now in that SCC two forms of
-/// updates are required.
+/// When a new SCC is created for the graph we first update the
+/// FunctionAnalysisManager in the Proxy's result.
+/// As there might be function analysis results cached for the functions now in
+/// that SCC, two forms of updates are required.
///
/// First, a proxy from the SCC to the FunctionAnalysisManager needs to be
/// created so that any subsequent invalidation events to the SCC are
@@ -320,10 +335,9 @@ bool FunctionAnalysisManagerCGSCCProxy::Result::invalidate(
/// function analyses so that they don't retain stale handles.
static void updateNewSCCFunctionAnalyses(LazyCallGraph::SCC &C,
LazyCallGraph &G,
- CGSCCAnalysisManager &AM) {
- // Get the relevant function analysis manager.
- auto &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, G).getManager();
+ CGSCCAnalysisManager &AM,
+ FunctionAnalysisManager &FAM) {
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, G).updateFAM(FAM);
// Now walk the functions in this SCC and invalidate any function analysis
// results that might have outer dependencies on an SCC analysis.
@@ -387,8 +401,10 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
// If we had a cached FAM proxy originally, we will want to create more of
// them for each SCC that was split off.
- bool NeedFAMProxy =
- AM.getCachedResult<FunctionAnalysisManagerCGSCCProxy>(*OldC) != nullptr;
+ FunctionAnalysisManager *FAM = nullptr;
+ if (auto *FAMProxy =
+ AM.getCachedResult<FunctionAnalysisManagerCGSCCProxy>(*OldC))
+ FAM = &FAMProxy->getManager();
// We need to propagate an invalidation call to all but the newly current SCC
// because the outer pass manager won't do that for us after splitting them.
@@ -402,8 +418,8 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
AM.invalidate(*OldC, PA);
// Ensure the now-current SCC's function analyses are updated.
- if (NeedFAMProxy)
- updateNewSCCFunctionAnalyses(*C, G, AM);
+ if (FAM)
+ updateNewSCCFunctionAnalyses(*C, G, AM, *FAM);
for (SCC &NewC : llvm::reverse(make_range(std::next(NewSCCRange.begin()),
NewSCCRange.end()))) {
@@ -413,8 +429,8 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
LLVM_DEBUG(dbgs() << "Enqueuing a newly formed SCC:" << NewC << "\n");
// Ensure new SCCs' function analyses are updated.
- if (NeedFAMProxy)
- updateNewSCCFunctionAnalyses(NewC, G, AM);
+ if (FAM)
+ updateNewSCCFunctionAnalyses(NewC, G, AM, *FAM);
// Also propagate a normal invalidation to the new SCC as only the current
// will get one from the pass manager infrastructure.
@@ -423,9 +439,10 @@ incorporateNewSCCRange(const SCCRangeT &NewSCCRange, LazyCallGraph &G,
return C;
}
-LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
+static LazyCallGraph::SCC &updateCGAndAnalysisManagerForPass(
LazyCallGraph &G, LazyCallGraph::SCC &InitialC, LazyCallGraph::Node &N,
- CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR) {
+ CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR,
+ FunctionAnalysisManager &FAM, bool FunctionPass) {
using Node = LazyCallGraph::Node;
using Edge = LazyCallGraph::Edge;
using SCC = LazyCallGraph::SCC;
@@ -443,28 +460,28 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
SmallPtrSet<Node *, 16> RetainedEdges;
SmallSetVector<Node *, 4> PromotedRefTargets;
SmallSetVector<Node *, 4> DemotedCallTargets;
+ SmallSetVector<Node *, 4> NewCallEdges;
+ SmallSetVector<Node *, 4> NewRefEdges;
// First walk the function and handle all called functions. We do this first
// because if there is a single call edge, whether there are ref edges is
// irrelevant.
for (Instruction &I : instructions(F))
- if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction())
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction())
if (Visited.insert(Callee).second && !Callee->isDeclaration()) {
Node &CalleeN = *G.lookup(*Callee);
Edge *E = N->lookup(CalleeN);
- // FIXME: We should really handle adding new calls. While it will
- // make downstream usage more complex, there is no fundamental
- // limitation and it will allow passes within the CGSCC to be a bit
- // more flexible in what transforms they can do. Until then, we
- // verify that new calls haven't been introduced.
- assert(E && "No function transformations should introduce *new* "
- "call edges! Any new calls should be modeled as "
- "promoted existing ref edges!");
+ assert((E || !FunctionPass) &&
+ "No function transformations should introduce *new* "
+ "call edges! Any new calls should be modeled as "
+ "promoted existing ref edges!");
bool Inserted = RetainedEdges.insert(&CalleeN).second;
(void)Inserted;
assert(Inserted && "We should never visit a function twice.");
- if (!E->isCall())
+ if (!E)
+ NewCallEdges.insert(&CalleeN);
+ else if (!E->isCall())
PromotedRefTargets.insert(&CalleeN);
}
@@ -478,19 +495,42 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
auto VisitRef = [&](Function &Referee) {
Node &RefereeN = *G.lookup(Referee);
Edge *E = N->lookup(RefereeN);
- // FIXME: Similarly to new calls, we also currently preclude
- // introducing new references. See above for details.
- assert(E && "No function transformations should introduce *new* ref "
- "edges! Any new ref edges would require IPO which "
- "function passes aren't allowed to do!");
+ assert((E || !FunctionPass) &&
+ "No function transformations should introduce *new* ref "
+ "edges! Any new ref edges would require IPO which "
+ "function passes aren't allowed to do!");
bool Inserted = RetainedEdges.insert(&RefereeN).second;
(void)Inserted;
assert(Inserted && "We should never visit a function twice.");
- if (E->isCall())
+ if (!E)
+ NewRefEdges.insert(&RefereeN);
+ else if (E->isCall())
DemotedCallTargets.insert(&RefereeN);
};
LazyCallGraph::visitReferences(Worklist, Visited, VisitRef);
+ // Handle new ref edges.
+ for (Node *RefTarget : NewRefEdges) {
+ SCC &TargetC = *G.lookupSCC(*RefTarget);
+ RefSCC &TargetRC = TargetC.getOuterRefSCC();
+ (void)TargetRC;
+ // TODO: This only allows trivial edges to be added for now.
+ assert((RC == &TargetRC ||
+ RC->isAncestorOf(TargetRC)) && "New ref edge is not trivial!");
+ RC->insertTrivialRefEdge(N, *RefTarget);
+ }
+
+ // Handle new call edges.
+ for (Node *CallTarget : NewCallEdges) {
+ SCC &TargetC = *G.lookupSCC(*CallTarget);
+ RefSCC &TargetRC = TargetC.getOuterRefSCC();
+ (void)TargetRC;
+ // TODO: This only allows trivial edges to be added for now.
+ assert((RC == &TargetRC ||
+ RC->isAncestorOf(TargetRC)) && "New call edge is not trivial!");
+ RC->insertTrivialCallEdge(N, *CallTarget);
+ }
+
// Include synthetic reference edges to known, defined lib functions.
for (auto *F : G.getLibFunctions())
// While the list of lib functions doesn't have repeats, don't re-visit
@@ -658,7 +698,7 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
// analysis manager, we need to create a proxy in the new current SCC as
// the invalidated SCCs had their functions moved.
if (HasFunctionAnalysisProxy)
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, G);
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, G).updateFAM(FAM);
// Any analyses cached for this SCC are no longer precise as the shape
// has changed by introducing this cycle. However, we have taken care to
@@ -707,3 +747,18 @@ LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
return *C;
}
+
+LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForFunctionPass(
+ LazyCallGraph &G, LazyCallGraph::SCC &InitialC, LazyCallGraph::Node &N,
+ CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR,
+ FunctionAnalysisManager &FAM) {
+ return updateCGAndAnalysisManagerForPass(G, InitialC, N, AM, UR, FAM,
+ /* FunctionPass */ true);
+}
+LazyCallGraph::SCC &llvm::updateCGAndAnalysisManagerForCGSCCPass(
+ LazyCallGraph &G, LazyCallGraph::SCC &InitialC, LazyCallGraph::Node &N,
+ CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR,
+ FunctionAnalysisManager &FAM) {
+ return updateCGAndAnalysisManagerForPass(G, InitialC, N, AM, UR, FAM,
+ /* FunctionPass */ false);
+}
diff --git a/llvm/lib/Analysis/CallGraph.cpp b/llvm/lib/Analysis/CallGraph.cpp
index 8e8a50178518..55adb454b733 100644
--- a/llvm/lib/Analysis/CallGraph.cpp
+++ b/llvm/lib/Analysis/CallGraph.cpp
@@ -10,7 +10,9 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/AbstractCallSite.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
@@ -30,10 +32,11 @@ using namespace llvm;
CallGraph::CallGraph(Module &M)
: M(M), ExternalCallingNode(getOrInsertFunction(nullptr)),
- CallsExternalNode(std::make_unique<CallGraphNode>(nullptr)) {
- // Add every function to the call graph.
+ CallsExternalNode(std::make_unique<CallGraphNode>(this, nullptr)) {
+ // Add every interesting function to the call graph.
for (Function &F : M)
- addToCallGraph(&F);
+ if (!isDbgInfoIntrinsic(F.getIntrinsicID()))
+ addToCallGraph(&F);
}
CallGraph::CallGraph(CallGraph &&Arg)
@@ -42,6 +45,11 @@ CallGraph::CallGraph(CallGraph &&Arg)
CallsExternalNode(std::move(Arg.CallsExternalNode)) {
Arg.FunctionMap.clear();
Arg.ExternalCallingNode = nullptr;
+
+ // Update parent CG for all call graph's nodes.
+ CallsExternalNode->CG = this;
+ for (auto &P : FunctionMap)
+ P.second->CG = this;
}
CallGraph::~CallGraph() {
@@ -57,14 +65,30 @@ CallGraph::~CallGraph() {
#endif
}
+bool CallGraph::invalidate(Module &, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<CallGraphAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Module>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
+
void CallGraph::addToCallGraph(Function *F) {
CallGraphNode *Node = getOrInsertFunction(F);
- // If this function has external linkage or has its address taken, anything
- // could call it.
- if (!F->hasLocalLinkage() || F->hasAddressTaken())
+ // If this function has external linkage or has its address taken and
+ // it is not a callback, then anything could call it.
+ if (!F->hasLocalLinkage() ||
+ F->hasAddressTaken(nullptr, /*IgnoreCallbackUses=*/true))
ExternalCallingNode->addCalledFunction(nullptr, Node);
+ populateCallGraphNode(Node);
+}
+
+void CallGraph::populateCallGraphNode(CallGraphNode *Node) {
+ Function *F = Node->getFunction();
+
// If this function is not defined in this translation unit, it could call
// anything.
if (F->isDeclaration() && !F->isIntrinsic())
@@ -82,6 +106,11 @@ void CallGraph::addToCallGraph(Function *F) {
Node->addCalledFunction(Call, CallsExternalNode.get());
else if (!Callee->isIntrinsic())
Node->addCalledFunction(Call, getOrInsertFunction(Callee));
+
+ // Add reference to callback functions.
+ forEachCallbackFunction(*Call, [=](Function *CB) {
+ Node->addCalledFunction(nullptr, getOrInsertFunction(CB));
+ });
}
}
}
@@ -112,6 +141,16 @@ void CallGraph::print(raw_ostream &OS) const {
LLVM_DUMP_METHOD void CallGraph::dump() const { print(dbgs()); }
#endif
+void CallGraph::ReplaceExternalCallEdge(CallGraphNode *Old,
+ CallGraphNode *New) {
+ for (auto &CR : ExternalCallingNode->CalledFunctions)
+ if (CR.second == Old) {
+ CR.second->DropRef();
+ CR.second = New;
+ CR.second->AddRef();
+ }
+}
+
// removeFunctionFromModule - Unlink the function from this module, returning
// it. Because this removes the function from the module, the call graph node
// is destroyed. This is only valid if the function does not call any other
@@ -151,7 +190,7 @@ CallGraphNode *CallGraph::getOrInsertFunction(const Function *F) {
return CGN.get();
assert((!F || F->getParent() == &M) && "Function not in current module!");
- CGN = std::make_unique<CallGraphNode>(const_cast<Function *>(F));
+ CGN = std::make_unique<CallGraphNode>(this, const_cast<Function *>(F));
return CGN.get();
}
@@ -187,10 +226,15 @@ LLVM_DUMP_METHOD void CallGraphNode::dump() const { print(dbgs()); }
void CallGraphNode::removeCallEdgeFor(CallBase &Call) {
for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
assert(I != CalledFunctions.end() && "Cannot find callsite to remove!");
- if (I->first == &Call) {
+ if (I->first && *I->first == &Call) {
I->second->DropRef();
*I = CalledFunctions.back();
CalledFunctions.pop_back();
+
+ // Remove all references to callback functions if there are any.
+ forEachCallbackFunction(Call, [=](Function *CB) {
+ removeOneAbstractEdgeTo(CG->getOrInsertFunction(CB));
+ });
return;
}
}
@@ -215,7 +259,7 @@ void CallGraphNode::removeOneAbstractEdgeTo(CallGraphNode *Callee) {
for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
assert(I != CalledFunctions.end() && "Cannot find callee to remove!");
CallRecord &CR = *I;
- if (CR.second == Callee && CR.first == nullptr) {
+ if (CR.second == Callee && !CR.first) {
Callee->DropRef();
*I = CalledFunctions.back();
CalledFunctions.pop_back();
@@ -231,11 +275,19 @@ void CallGraphNode::replaceCallEdge(CallBase &Call, CallBase &NewCall,
CallGraphNode *NewNode) {
for (CalledFunctionsVector::iterator I = CalledFunctions.begin(); ; ++I) {
assert(I != CalledFunctions.end() && "Cannot find callsite to remove!");
- if (I->first == &Call) {
+ if (I->first && *I->first == &Call) {
I->second->DropRef();
I->first = &NewCall;
I->second = NewNode;
NewNode->AddRef();
+
+ // Refresh callback references.
+ forEachCallbackFunction(Call, [=](Function *CB) {
+ removeOneAbstractEdgeTo(CG->getOrInsertFunction(CB));
+ });
+ forEachCallbackFunction(NewCall, [=](Function *CB) {
+ addCalledFunction(nullptr, CG->getOrInsertFunction(CB));
+ });
return;
}
}
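With the new CallGraph::invalidate() hook, a new-PM module pass that preserves CFGAnalyses (or CallGraphAnalysis itself) keeps the cached call graph alive across runs. A hedged sketch with a hypothetical pass class:

  #include "llvm/Analysis/CallGraph.h"
  #include "llvm/IR/Module.h"
  #include "llvm/IR/PassManager.h"
  using namespace llvm;

  // Hypothetical pass: reads the call graph and leaves the CFG untouched, so
  // the cached CallGraphAnalysis result survives per the invalidate() logic
  // shown above.
  struct ExampleCGConsumerPass : PassInfoMixin<ExampleCGConsumerPass> {
    PreservedAnalyses run(Module &M, ModuleAnalysisManager &AM) {
      CallGraph &CG = AM.getResult<CallGraphAnalysis>(M);
      (void)CG; // inspect the graph here
      PreservedAnalyses PA;
      PA.preserveSet<CFGAnalyses>();
      return PA;
    }
  };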
diff --git a/llvm/lib/Analysis/CallGraphSCCPass.cpp b/llvm/lib/Analysis/CallGraphSCCPass.cpp
index 196ef400bc4e..91f8029cc326 100644
--- a/llvm/lib/Analysis/CallGraphSCCPass.cpp
+++ b/llvm/lib/Analysis/CallGraphSCCPass.cpp
@@ -19,6 +19,7 @@
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/CallGraph.h"
+#include "llvm/IR/AbstractCallSite.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/Intrinsics.h"
@@ -225,22 +226,51 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
// invalidated and removed.
unsigned NumDirectRemoved = 0, NumIndirectRemoved = 0;
+ CallGraphNode::iterator CGNEnd = CGN->end();
+
+ auto RemoveAndCheckForDone = [&](CallGraphNode::iterator I) {
+ // Just remove the edge from the set of callees, keep track of whether
+ // I points to the last element of the vector.
+ bool WasLast = I + 1 == CGNEnd;
+ CGN->removeCallEdge(I);
+
+ // If I pointed to the last element of the vector, we have to bail out:
+ // iterator checking rejects comparisons of the resultant pointer with
+ // end.
+ if (WasLast)
+ return true;
+
+ CGNEnd = CGN->end();
+ return false;
+ };
+
// Get the set of call sites currently in the function.
- for (CallGraphNode::iterator I = CGN->begin(), E = CGN->end(); I != E; ) {
+ for (CallGraphNode::iterator I = CGN->begin(); I != CGNEnd;) {
+ // Delete "reference" call records that do not have a call instruction. We
+ // reinsert them as needed later. However, keep them in checking mode.
+ if (!I->first) {
+ if (CheckingMode) {
+ ++I;
+ continue;
+ }
+ if (RemoveAndCheckForDone(I))
+ break;
+ continue;
+ }
+
// If this call site is null, then the function pass deleted the call
// entirely and the WeakTrackingVH nulled it out.
- auto *Call = dyn_cast_or_null<CallBase>(I->first);
- if (!I->first ||
+ auto *Call = dyn_cast_or_null<CallBase>(*I->first);
+ if (!Call ||
// If we've already seen this call site, then the FunctionPass RAUW'd
// one call with another, which resulted in two "uses" in the edge
// list of the same call.
- Calls.count(I->first) ||
+ Calls.count(Call) ||
// If the call edge is not from a call or invoke, or it is an
// intrinsic call, then the function pass RAUW'd a call with
// another value. This can happen when constant folding happens
// of well known functions etc.
- !Call ||
(Call->getCalledFunction() &&
Call->getCalledFunction()->isIntrinsic() &&
Intrinsic::isLeaf(Call->getCalledFunction()->getIntrinsicID()))) {
@@ -253,28 +283,18 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
else
++NumDirectRemoved;
- // Just remove the edge from the set of callees, keep track of whether
- // I points to the last element of the vector.
- bool WasLast = I + 1 == E;
- CGN->removeCallEdge(I);
-
- // If I pointed to the last element of the vector, we have to bail out:
- // iterator checking rejects comparisons of the resultant pointer with
- // end.
- if (WasLast)
+ if (RemoveAndCheckForDone(I))
break;
- E = CGN->end();
continue;
}
- assert(!Calls.count(I->first) &&
- "Call site occurs in node multiple times");
+ assert(!Calls.count(Call) && "Call site occurs in node multiple times");
if (Call) {
Function *Callee = Call->getCalledFunction();
// Ignore intrinsics because they're not really function calls.
if (!Callee || !(Callee->isIntrinsic()))
- Calls.insert(std::make_pair(I->first, I->second));
+ Calls.insert(std::make_pair(Call, I->second));
}
++I;
}
@@ -292,6 +312,15 @@ bool CGPassManager::RefreshCallGraph(const CallGraphSCC &CurSCC, CallGraph &CG,
if (Callee && Callee->isIntrinsic())
continue;
+ // If we are not in checking mode, insert potential callback calls as
+ // references. This is not a requirement but helps to iterate over the
+ // functions in the right order.
+ if (!CheckingMode) {
+ forEachCallbackFunction(*Call, [&](Function *CB) {
+ CGN->addCalledFunction(nullptr, CG.getOrInsertFunction(CB));
+ });
+ }
+
// If this call site already existed in the callgraph, just verify it
// matches up to expectations and remove it from Calls.
DenseMap<Value *, CallGraphNode *>::iterator ExistingIt =
@@ -549,7 +578,10 @@ void CallGraphSCC::ReplaceNode(CallGraphNode *Old, CallGraphNode *New) {
for (unsigned i = 0; ; ++i) {
assert(i != Nodes.size() && "Node not in SCC");
if (Nodes[i] != Old) continue;
- Nodes[i] = New;
+ if (New)
+ Nodes[i] = New;
+ else
+ Nodes.erase(Nodes.begin() + i);
break;
}
@@ -559,6 +591,10 @@ void CallGraphSCC::ReplaceNode(CallGraphNode *Old, CallGraphNode *New) {
CGI->ReplaceNode(Old, New);
}
+void CallGraphSCC::DeleteNode(CallGraphNode *Old) {
+ ReplaceNode(Old, /*New=*/nullptr);
+}
+
//===----------------------------------------------------------------------===//
// CallGraphSCCPass Implementation
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/CallPrinter.cpp b/llvm/lib/Analysis/CallPrinter.cpp
index 7246b73bfd4b..bb447411ec47 100644
--- a/llvm/lib/Analysis/CallPrinter.cpp
+++ b/llvm/lib/Analysis/CallPrinter.cpp
@@ -14,63 +14,279 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CallPrinter.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/DOTGraphTraitsPass.h"
+#include "llvm/Analysis/HeatUtils.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/InitializePasses.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
using namespace llvm;
+// This option shows static (relative) call counts.
+// FIXME:
+// Need to show real counts when profile data is available
+static cl::opt<bool> ShowHeatColors("callgraph-heat-colors", cl::init(false),
+ cl::Hidden,
+ cl::desc("Show heat colors in call-graph"));
+
+static cl::opt<bool>
+ ShowEdgeWeight("callgraph-show-weights", cl::init(false), cl::Hidden,
+ cl::desc("Show edges labeled with weights"));
+
+static cl::opt<bool>
+ CallMultiGraph("callgraph-multigraph", cl::init(false), cl::Hidden,
+ cl::desc("Show call-multigraph (do not remove parallel edges)"));
+
+static cl::opt<std::string> CallGraphDotFilenamePrefix(
+ "callgraph-dot-filename-prefix", cl::Hidden,
+ cl::desc("The prefix used for the CallGraph dot file names."));
+
namespace llvm {
-template <> struct DOTGraphTraits<CallGraph *> : public DefaultDOTGraphTraits {
+class CallGraphDOTInfo {
+private:
+ Module *M;
+ CallGraph *CG;
+ DenseMap<const Function *, uint64_t> Freq;
+ uint64_t MaxFreq;
+
+public:
+ std::function<BlockFrequencyInfo *(Function &)> LookupBFI;
+
+ CallGraphDOTInfo(Module *M, CallGraph *CG,
+ function_ref<BlockFrequencyInfo *(Function &)> LookupBFI)
+ : M(M), CG(CG), LookupBFI(LookupBFI) {
+ MaxFreq = 0;
+
+ for (auto F = M->getFunctionList().begin(); F != M->getFunctionList().end(); ++F) {
+ uint64_t localSumFreq = 0;
+ SmallSet<Function *, 16> Callers;
+ for (User *U : (*F).users())
+ if (isa<CallInst>(U))
+ Callers.insert(cast<Instruction>(U)->getFunction());
+ for (auto iter = Callers.begin() ; iter != Callers.end() ; ++iter)
+ localSumFreq += getNumOfCalls((**iter), *F);
+ if (localSumFreq >= MaxFreq)
+ MaxFreq = localSumFreq;
+ Freq[&*F] = localSumFreq;
+ }
+ if (!CallMultiGraph)
+ removeParallelEdges();
+ }
+
+ Module *getModule() const { return M; }
+
+ CallGraph *getCallGraph() const { return CG; }
+
+ uint64_t getFreq(const Function *F) { return Freq[F]; }
+
+ uint64_t getMaxFreq() { return MaxFreq; }
+
+private:
+ void removeParallelEdges() {
+ for (auto &I : (*CG)) {
+ CallGraphNode *Node = I.second.get();
+
+ bool FoundParallelEdge = true;
+ while (FoundParallelEdge) {
+ SmallSet<Function *, 16> Visited;
+ FoundParallelEdge = false;
+ for (auto CI = Node->begin(), CE = Node->end(); CI != CE; CI++) {
+ if (!(Visited.insert(CI->second->getFunction())).second) {
+ FoundParallelEdge = true;
+ Node->removeCallEdge(CI);
+ break;
+ }
+ }
+ }
+ }
+ }
+};
+
+template <>
+struct GraphTraits<CallGraphDOTInfo *>
+ : public GraphTraits<const CallGraphNode *> {
+ static NodeRef getEntryNode(CallGraphDOTInfo *CGInfo) {
+ // Start at the external node!
+ return CGInfo->getCallGraph()->getExternalCallingNode();
+ }
+
+ typedef std::pair<const Function *const, std::unique_ptr<CallGraphNode>>
+ PairTy;
+ static const CallGraphNode *CGGetValuePtr(const PairTy &P) {
+ return P.second.get();
+ }
+
+ // nodes_iterator/begin/end - Allow iteration over all nodes in the graph
+ typedef mapped_iterator<CallGraph::const_iterator, decltype(&CGGetValuePtr)>
+ nodes_iterator;
+
+ static nodes_iterator nodes_begin(CallGraphDOTInfo *CGInfo) {
+ return nodes_iterator(CGInfo->getCallGraph()->begin(), &CGGetValuePtr);
+ }
+ static nodes_iterator nodes_end(CallGraphDOTInfo *CGInfo) {
+ return nodes_iterator(CGInfo->getCallGraph()->end(), &CGGetValuePtr);
+ }
+};
+
+template <>
+struct DOTGraphTraits<CallGraphDOTInfo *> : public DefaultDOTGraphTraits {
+
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
- static std::string getGraphName(CallGraph *Graph) { return "Call graph"; }
+ static std::string getGraphName(CallGraphDOTInfo *CGInfo) {
+ return "Call graph: " +
+ std::string(CGInfo->getModule()->getModuleIdentifier());
+ }
- std::string getNodeLabel(CallGraphNode *Node, CallGraph *Graph) {
- if (Function *Func = Node->getFunction())
- return Func->getName();
+ static bool isNodeHidden(const CallGraphNode *Node) {
+ if (CallMultiGraph || Node->getFunction())
+ return false;
+ return true;
+ }
+ std::string getNodeLabel(const CallGraphNode *Node,
+ CallGraphDOTInfo *CGInfo) {
+ if (Node == CGInfo->getCallGraph()->getExternalCallingNode())
+ return "external caller";
+ if (Node == CGInfo->getCallGraph()->getCallsExternalNode())
+ return "external callee";
+
+ if (Function *Func = Node->getFunction())
+ return std::string(Func->getName());
return "external node";
}
-};
+ static const CallGraphNode *CGGetValuePtr(CallGraphNode::CallRecord P) {
+ return P.second;
+ }
+
+ // nodes_iterator/begin/end - Allow iteration over all nodes in the graph
+ typedef mapped_iterator<CallGraphNode::const_iterator,
+ decltype(&CGGetValuePtr)>
+ nodes_iterator;
+
+ std::string getEdgeAttributes(const CallGraphNode *Node, nodes_iterator I,
+ CallGraphDOTInfo *CGInfo) {
+ if (!ShowEdgeWeight)
+ return "";
+
+ Function *Caller = Node->getFunction();
+ if (Caller == nullptr || Caller->isDeclaration())
+ return "";
+
+ Function *Callee = (*I)->getFunction();
+ if (Callee == nullptr)
+ return "";
-struct AnalysisCallGraphWrapperPassTraits {
- static CallGraph *getGraph(CallGraphWrapperPass *P) {
- return &P->getCallGraph();
+ uint64_t Counter = getNumOfCalls(*Caller, *Callee);
+ double Width =
+ 1 + 2 * (double(Counter) / CGInfo->getMaxFreq());
+ std::string Attrs = "label=\"" + std::to_string(Counter) +
+ "\" penwidth=" + std::to_string(Width);
+ return Attrs;
+ }
+
+ std::string getNodeAttributes(const CallGraphNode *Node,
+ CallGraphDOTInfo *CGInfo) {
+ Function *F = Node->getFunction();
+ if (F == nullptr)
+ return "";
+ std::string attrs = "";
+ if (ShowHeatColors) {
+ uint64_t freq = CGInfo->getFreq(F);
+ std::string color = getHeatColor(freq, CGInfo->getMaxFreq());
+ std::string edgeColor = (freq <= (CGInfo->getMaxFreq() / 2))
+ ? getHeatColor(0)
+ : getHeatColor(1);
+ attrs = "color=\"" + edgeColor + "ff\", style=filled, fillcolor=\"" +
+ color + "80\"";
+ }
+ return attrs;
}
};
} // end llvm namespace
namespace {
-
-struct CallGraphViewer
- : public DOTGraphTraitsModuleViewer<CallGraphWrapperPass, true, CallGraph *,
- AnalysisCallGraphWrapperPassTraits> {
+// Viewer
+class CallGraphViewer : public ModulePass {
+public:
static char ID;
+ CallGraphViewer() : ModulePass(ID) {}
- CallGraphViewer()
- : DOTGraphTraitsModuleViewer<CallGraphWrapperPass, true, CallGraph *,
- AnalysisCallGraphWrapperPassTraits>(
- "callgraph", ID) {
- initializeCallGraphViewerPass(*PassRegistry::getPassRegistry());
- }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnModule(Module &M) override;
};
-struct CallGraphDOTPrinter : public DOTGraphTraitsModulePrinter<
- CallGraphWrapperPass, true, CallGraph *,
- AnalysisCallGraphWrapperPassTraits> {
+void CallGraphViewer::getAnalysisUsage(AnalysisUsage &AU) const {
+ ModulePass::getAnalysisUsage(AU);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.setPreservesAll();
+}
+
+bool CallGraphViewer::runOnModule(Module &M) {
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ CallGraph CG(M);
+ CallGraphDOTInfo CFGInfo(&M, &CG, LookupBFI);
+
+ std::string Title =
+ DOTGraphTraits<CallGraphDOTInfo *>::getGraphName(&CFGInfo);
+ ViewGraph(&CFGInfo, "callgraph", true, Title);
+
+ return false;
+}
+
+// DOT Printer
+
+class CallGraphDOTPrinter : public ModulePass {
+public:
static char ID;
+ CallGraphDOTPrinter() : ModulePass(ID) {}
- CallGraphDOTPrinter()
- : DOTGraphTraitsModulePrinter<CallGraphWrapperPass, true, CallGraph *,
- AnalysisCallGraphWrapperPassTraits>(
- "callgraph", ID) {
- initializeCallGraphDOTPrinterPass(*PassRegistry::getPassRegistry());
- }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnModule(Module &M) override;
};
+void CallGraphDOTPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
+ ModulePass::getAnalysisUsage(AU);
+ AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.setPreservesAll();
+}
+
+bool CallGraphDOTPrinter::runOnModule(Module &M) {
+ auto LookupBFI = [this](Function &F) {
+ return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
+ };
+
+ std::string Filename;
+ if (!CallGraphDotFilenamePrefix.empty())
+ Filename = (CallGraphDotFilenamePrefix + ".callgraph.dot");
+ else
+ Filename = (std::string(M.getModuleIdentifier()) + ".callgraph.dot");
+ errs() << "Writing '" << Filename << "'...";
+
+ std::error_code EC;
+ raw_fd_ostream File(Filename, EC, sys::fs::F_Text);
+
+ CallGraph CG(M);
+ CallGraphDOTInfo CFGInfo(&M, &CG, LookupBFI);
+
+ if (!EC)
+ WriteGraph(File, &CFGInfo);
+ else
+ errs() << " error opening file for writing!";
+ errs() << "\n";
+
+ return false;
+}
+
} // end anonymous namespace
char CallGraphViewer::ID = 0;
diff --git a/llvm/lib/Analysis/CaptureTracking.cpp b/llvm/lib/Analysis/CaptureTracking.cpp
index 20e2f06540a3..8b101e3b2cc4 100644
--- a/llvm/lib/Analysis/CaptureTracking.cpp
+++ b/llvm/lib/Analysis/CaptureTracking.cpp
@@ -20,15 +20,30 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+/// The default value for the MaxUsesToExplore argument. It's relatively small to
+/// keep the cost of analysis reasonable for clients like BasicAliasAnalysis,
+/// where the results can't be cached.
+/// TODO: we should probably introduce a caching CaptureTracking analysis and
+/// use it where possible. The caching version can use a much higher limit or
+/// have no cap at all.
+static cl::opt<unsigned>
+DefaultMaxUsesToExplore("capture-tracking-max-uses-to-explore", cl::Hidden,
+ cl::desc("Maximal number of uses to explore."),
+ cl::init(20));
+
+unsigned llvm::getDefaultMaxUsesToExploreForCaptureTracking() {
+ return DefaultMaxUsesToExplore;
+}
+
CaptureTracker::~CaptureTracker() {}
bool CaptureTracker::shouldExplore(const Use *U) { return true; }
@@ -76,8 +91,8 @@ namespace {
struct CapturesBefore : public CaptureTracker {
CapturesBefore(bool ReturnCaptures, const Instruction *I, const DominatorTree *DT,
- bool IncludeI, OrderedBasicBlock *IC)
- : OrderedBB(IC), BeforeHere(I), DT(DT),
+ bool IncludeI)
+ : BeforeHere(I), DT(DT),
ReturnCaptures(ReturnCaptures), IncludeI(IncludeI), Captured(false) {}
void tooManyUses() override { Captured = true; }
@@ -90,9 +105,7 @@ namespace {
return true;
// Compute the case where both instructions are inside the same basic
- // block. Since instructions in the same BB as BeforeHere are numbered in
- // 'OrderedBB', avoid using 'dominates' and 'isPotentiallyReachable'
- // which are very expensive for large basic blocks.
+ // block.
if (BB == BeforeHere->getParent()) {
// 'I' dominates 'BeforeHere' => not safe to prune.
//
@@ -102,7 +115,7 @@ namespace {
// UseBB == BB, avoid pruning.
if (isa<InvokeInst>(BeforeHere) || isa<PHINode>(I) || I == BeforeHere)
return false;
- if (!OrderedBB->dominates(BeforeHere, I))
+ if (!BeforeHere->comesBefore(I))
return false;
// 'BeforeHere' comes before 'I', it's safe to prune if we also
@@ -153,7 +166,6 @@ namespace {
return true;
}
- OrderedBasicBlock *OrderedBB;
const Instruction *BeforeHere;
const DominatorTree *DT;
@@ -196,39 +208,35 @@ bool llvm::PointerMayBeCaptured(const Value *V,
/// returning the value (or part of it) from the function counts as capturing
/// it or not. The boolean StoreCaptures specifies whether storing the value
/// (or part of it) into memory anywhere automatically counts as capturing it
-/// or not. A ordered basic block \p OBB can be used in order to speed up
-/// queries about relative order among instructions in the same basic block.
+/// or not.
bool llvm::PointerMayBeCapturedBefore(const Value *V, bool ReturnCaptures,
bool StoreCaptures, const Instruction *I,
const DominatorTree *DT, bool IncludeI,
- OrderedBasicBlock *OBB,
unsigned MaxUsesToExplore) {
assert(!isa<GlobalValue>(V) &&
"It doesn't make sense to ask whether a global is captured.");
- bool UseNewOBB = OBB == nullptr;
if (!DT)
return PointerMayBeCaptured(V, ReturnCaptures, StoreCaptures,
MaxUsesToExplore);
- if (UseNewOBB)
- OBB = new OrderedBasicBlock(I->getParent());
// TODO: See comment in PointerMayBeCaptured regarding what could be done
// with StoreCaptures.
- CapturesBefore CB(ReturnCaptures, I, DT, IncludeI, OBB);
+ CapturesBefore CB(ReturnCaptures, I, DT, IncludeI);
PointerMayBeCaptured(V, &CB, MaxUsesToExplore);
-
- if (UseNewOBB)
- delete OBB;
return CB.Captured;
}
void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker,
unsigned MaxUsesToExplore) {
assert(V->getType()->isPointerTy() && "Capture is for pointers only!");
- SmallVector<const Use *, DefaultMaxUsesToExplore> Worklist;
- SmallSet<const Use *, DefaultMaxUsesToExplore> Visited;
+ if (MaxUsesToExplore == 0)
+ MaxUsesToExplore = DefaultMaxUsesToExplore;
+
+ SmallVector<const Use *, 20> Worklist;
+ Worklist.reserve(getDefaultMaxUsesToExploreForCaptureTracking());
+ SmallSet<const Use *, 20> Visited;
auto AddUses = [&](const Value *V) {
unsigned Count = 0;
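With OrderedBasicBlock gone, PointerMayBeCapturedBefore() relies on Instruction::comesBefore() for same-block ordering, and passing 0 for MaxUsesToExplore selects the command-line default added above. A sketch of the updated call (the wrapper name is illustrative):

  #include "llvm/Analysis/CaptureTracking.h"
  #include "llvm/IR/Dominators.h"
  using namespace llvm;

  // Illustrative wrapper: may Ptr be captured before Before executes?
  static bool mayBeCapturedBefore(const Value *Ptr, const Instruction *Before,
                                  const DominatorTree &DT) {
    return PointerMayBeCapturedBefore(Ptr, /*ReturnCaptures=*/true,
                                      /*StoreCaptures=*/true, Before, &DT,
                                      /*IncludeI=*/false,
                                      /*MaxUsesToExplore=*/0);
  }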
diff --git a/llvm/lib/Analysis/CodeMetrics.cpp b/llvm/lib/Analysis/CodeMetrics.cpp
index 627d955c865f..0b2b6f9bfa46 100644
--- a/llvm/lib/Analysis/CodeMetrics.cpp
+++ b/llvm/lib/Analysis/CodeMetrics.cpp
@@ -11,14 +11,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/CodeMetrics.h"
+#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "code-metrics"
@@ -171,7 +170,7 @@ void CodeMetrics::analyzeBasicBlock(const BasicBlock *BB,
if (InvI->cannotDuplicate())
notDuplicatable = true;
- NumInsts += TTI.getUserCost(&I);
+ NumInsts += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
}
if (isa<ReturnInst>(BB->getTerminator()))
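CodeMetrics now asks getUserCost() for an explicit TCK_CodeSize cost. The same interface can be queried for other cost kinds; below is a sketch of a small helper that takes the TTI by reference (the helper name is illustrative):

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Illustrative helper: accumulate a latency-oriented cost over a block.
  static unsigned estimateLatency(const BasicBlock &BB,
                                  const TargetTransformInfo &TTI) {
    unsigned Cost = 0;
    for (const Instruction &I : BB)
      Cost += TTI.getUserCost(&I, TargetTransformInfo::TCK_Latency);
    return Cost;
  }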
diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp
index b32924e6497a..8c66decaaf58 100644
--- a/llvm/lib/Analysis/ConstantFolding.cpp
+++ b/llvm/lib/Analysis/ConstantFolding.cpp
@@ -23,6 +23,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -37,7 +38,9 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IntrinsicsAMDGPU.h"
#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
@@ -115,8 +118,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
// to simplify things.
if (SrcEltTy->isFloatingPointTy()) {
unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
- Type *SrcIVTy =
- VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
+ auto *SrcIVTy = FixedVectorType::get(
+ IntegerType::get(C->getContext(), FPWidth), NumSrcElts);
// Ask IR to do the conversion now that #elts line up.
C = ConstantExpr::getBitCast(C, SrcIVTy);
}
@@ -152,11 +155,11 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
// If the element types match, IR can fold it.
unsigned NumDstElt = DestVTy->getNumElements();
- unsigned NumSrcElt = C->getType()->getVectorNumElements();
+ unsigned NumSrcElt = cast<VectorType>(C->getType())->getNumElements();
if (NumDstElt == NumSrcElt)
return ConstantExpr::getBitCast(C, DestTy);
- Type *SrcEltTy = C->getType()->getVectorElementType();
+ Type *SrcEltTy = cast<VectorType>(C->getType())->getElementType();
Type *DstEltTy = DestVTy->getElementType();
// Otherwise, we're changing the number of elements in a vector, which
@@ -172,8 +175,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
if (DstEltTy->isFloatingPointTy()) {
// Fold to a vector of integers with the same size as our FP type.
unsigned FPWidth = DstEltTy->getPrimitiveSizeInBits();
- Type *DestIVTy =
- VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumDstElt);
+ auto *DestIVTy = FixedVectorType::get(
+ IntegerType::get(C->getContext(), FPWidth), NumDstElt);
// Recursively handle this integer conversion, if possible.
C = FoldBitCast(C, DestIVTy, DL);
@@ -185,8 +188,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
// it to integer first.
if (SrcEltTy->isFloatingPointTy()) {
unsigned FPWidth = SrcEltTy->getPrimitiveSizeInBits();
- Type *SrcIVTy =
- VectorType::get(IntegerType::get(C->getContext(), FPWidth), NumSrcElt);
+ auto *SrcIVTy = FixedVectorType::get(
+ IntegerType::get(C->getContext(), FPWidth), NumSrcElt);
// Ask IR to do the conversion now that #elts line up.
C = ConstantExpr::getBitCast(C, SrcIVTy);
// If IR wasn't able to fold it, bail out.
@@ -215,7 +218,8 @@ Constant *FoldBitCast(Constant *C, Type *DestTy, const DataLayout &DL) {
for (unsigned j = 0; j != Ratio; ++j) {
Constant *Src = C->getAggregateElement(SrcElt++);
if (Src && isa<UndefValue>(Src))
- Src = Constant::getNullValue(C->getType()->getVectorElementType());
+ Src = Constant::getNullValue(
+ cast<VectorType>(C->getType())->getElementType());
else
Src = dyn_cast_or_null<ConstantInt>(Src);
if (!Src) // Reject constantexpr elements.
@@ -329,10 +333,25 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
const DataLayout &DL) {
do {
Type *SrcTy = C->getType();
+ uint64_t DestSize = DL.getTypeSizeInBits(DestTy);
+ uint64_t SrcSize = DL.getTypeSizeInBits(SrcTy);
+ if (SrcSize < DestSize)
+ return nullptr;
+
+ // Catch the obvious splat cases (since all-zeros can coerce non-integral
+ // pointers legally).
+ if (C->isNullValue() && !DestTy->isX86_MMXTy())
+ return Constant::getNullValue(DestTy);
+ if (C->isAllOnesValue() && !DestTy->isX86_MMXTy() &&
+ !DestTy->isPtrOrPtrVectorTy()) // Don't get ones for ptr types!
+ return Constant::getAllOnesValue(DestTy);
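// Illustration of the splat shortcuts above: coercing a zeroinitializer of any
// sufficiently large source type to i64 folds straight to i64 0, and an
// all-ones source folds to i64 -1 (all-ones is deliberately not produced for
// pointer destinations), without walking individual elements.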
// If the type sizes are the same and a cast is legal, just directly
// cast the constant.
- if (DL.getTypeSizeInBits(DestTy) == DL.getTypeSizeInBits(SrcTy)) {
+ // But be careful not to coerce non-integral pointers illegally.
+ if (SrcSize == DestSize &&
+ DL.isNonIntegralPointerType(SrcTy->getScalarType()) ==
+ DL.isNonIntegralPointerType(DestTy->getScalarType())) {
Instruction::CastOps Cast = Instruction::BitCast;
// If we are going from a pointer to int or vice versa, we spell the cast
// differently.
@@ -361,7 +380,7 @@ Constant *llvm::ConstantFoldLoadThroughBitcast(Constant *C, Type *DestTy,
Constant *ElemC;
do {
ElemC = C->getAggregateElement(Elem++);
- } while (ElemC && DL.getTypeSizeInBits(ElemC->getType()) == 0);
+ } while (ElemC && DL.getTypeSizeInBits(ElemC->getType()).isZero());
C = ElemC;
} else {
C = C->getAggregateElement(0u);
@@ -460,15 +479,18 @@ bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr,
if (isa<ConstantArray>(C) || isa<ConstantVector>(C) ||
isa<ConstantDataSequential>(C)) {
- Type *EltTy = C->getType()->getSequentialElementType();
+ uint64_t NumElts;
+ Type *EltTy;
+ if (auto *AT = dyn_cast<ArrayType>(C->getType())) {
+ NumElts = AT->getNumElements();
+ EltTy = AT->getElementType();
+ } else {
+ NumElts = cast<VectorType>(C->getType())->getNumElements();
+ EltTy = cast<VectorType>(C->getType())->getElementType();
+ }
uint64_t EltSize = DL.getTypeAllocSize(EltTy);
uint64_t Index = ByteOffset / EltSize;
uint64_t Offset = ByteOffset - Index * EltSize;
- uint64_t NumElts;
- if (auto *AT = dyn_cast<ArrayType>(C->getType()))
- NumElts = AT->getNumElements();
- else
- NumElts = C->getType()->getVectorNumElements();
for (; Index != NumElts; ++Index) {
if (!ReadDataFromGlobal(C->getAggregateElement(Index), Offset, CurPtr,
@@ -501,6 +523,10 @@ bool ReadDataFromGlobal(Constant *C, uint64_t ByteOffset, unsigned char *CurPtr,
Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
const DataLayout &DL) {
+  // Bail out early. We do not expect to load from a scalable global variable.
+ if (isa<ScalableVectorType>(LoadTy))
+ return nullptr;
+
auto *PTy = cast<PointerType>(C->getType());
auto *IntType = dyn_cast<IntegerType>(LoadTy);
@@ -520,8 +546,8 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
else if (LoadTy->isDoubleTy())
MapTy = Type::getInt64Ty(C->getContext());
else if (LoadTy->isVectorTy()) {
- MapTy = PointerType::getIntNTy(C->getContext(),
- DL.getTypeSizeInBits(LoadTy));
+ MapTy = PointerType::getIntNTy(
+ C->getContext(), DL.getTypeSizeInBits(LoadTy).getFixedSize());
} else
return nullptr;
@@ -561,7 +587,8 @@ Constant *FoldReinterpretLoadFromConstPtr(Constant *C, Type *LoadTy,
return nullptr;
int64_t Offset = OffsetAI.getSExtValue();
- int64_t InitializerSize = DL.getTypeAllocSize(GV->getInitializer()->getType());
+ int64_t InitializerSize =
+ DL.getTypeAllocSize(GV->getInitializer()->getType()).getFixedSize();
// If we're not accessing anything in this constant, the result is undefined.
if (Offset <= -1 * static_cast<int64_t>(BytesLoaded))
@@ -734,8 +761,7 @@ Constant *SymbolicallyEvaluateBinop(unsigned Opc, Constant *Op0, Constant *Op1,
return Op1;
}
- Known0.Zero |= Known1.Zero;
- Known0.One &= Known1.One;
+ Known0 &= Known1;
if (Known0.isConstant())
return ConstantInt::get(Op0->getType(), Known0.getConstant());
}
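// A small worked example of the known-bits fold above (illustrative 4-bit
// values): if Known0 = {Zero=0011, One=0100} and Known1 = {Zero=1000, One=0110},
// then Known0 &= Known1 yields {Zero=1011, One=0100}; every bit is now known,
// so the 'and' folds to the constant 0b0100.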
@@ -794,10 +820,7 @@ Constant *CastGEPIndices(Type *SrcElemTy, ArrayRef<Constant *> Ops,
Constant *C = ConstantExpr::getGetElementPtr(
SrcElemTy, Ops[0], NewIdxs, /*InBounds=*/false, InRangeIndex);
- if (Constant *Folded = ConstantFoldConstant(C, DL, TLI))
- C = Folded;
-
- return C;
+ return ConstantFoldConstant(C, DL, TLI);
}
/// Strip the pointer casts, but preserve the address space information.
@@ -828,7 +851,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
Type *SrcElemTy = GEP->getSourceElementType();
Type *ResElemTy = GEP->getResultElementType();
Type *ResTy = GEP->getType();
- if (!SrcElemTy->isSized())
+ if (!SrcElemTy->isSized() || isa<ScalableVectorType>(SrcElemTy))
return nullptr;
if (Constant *C = CastGEPIndices(SrcElemTy, Ops, ResTy,
@@ -857,9 +880,7 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
Constant *Res = ConstantExpr::getPtrToInt(Ptr, CE->getType());
Res = ConstantExpr::getSub(Res, CE->getOperand(1));
Res = ConstantExpr::getIntToPtr(Res, ResTy);
- if (auto *FoldedRes = ConstantFoldConstant(Res, DL, TLI))
- Res = FoldedRes;
- return Res;
+ return ConstantFoldConstant(Res, DL, TLI);
}
}
return nullptr;
@@ -932,11 +953,11 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP,
// Only handle pointers to sized types, not pointers to functions.
if (!Ty->isSized())
return nullptr;
- } else if (auto *ATy = dyn_cast<SequentialType>(Ty)) {
- Ty = ATy->getElementType();
} else {
- // We've reached some non-indexable type.
- break;
+ Type *NextTy = GetElementPtrInst::getTypeAtIndex(Ty, (uint64_t)0);
+ if (!NextTy)
+ break;
+ Ty = NextTy;
}
// Determine which element of the array the offset points into.
@@ -1062,7 +1083,8 @@ Constant *ConstantFoldInstOperandsImpl(const Value *InstOrCE, unsigned Opcode,
case Instruction::InsertElement:
return ConstantExpr::getInsertElement(Ops[0], Ops[1], Ops[2]);
case Instruction::ShuffleVector:
- return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2]);
+ return ConstantExpr::getShuffleVector(
+ Ops[0], Ops[1], cast<ShuffleVectorInst>(InstOrCE)->getShuffleMask());
}
}
@@ -1079,23 +1101,19 @@ ConstantFoldConstantImpl(const Constant *C, const DataLayout &DL,
const TargetLibraryInfo *TLI,
SmallDenseMap<Constant *, Constant *> &FoldedOps) {
if (!isa<ConstantVector>(C) && !isa<ConstantExpr>(C))
- return nullptr;
+ return const_cast<Constant *>(C);
SmallVector<Constant *, 8> Ops;
- for (const Use &NewU : C->operands()) {
- auto *NewC = cast<Constant>(&NewU);
+ for (const Use &OldU : C->operands()) {
+ Constant *OldC = cast<Constant>(&OldU);
+ Constant *NewC = OldC;
// Recursively fold the ConstantExpr's operands. If we have already folded
// a ConstantExpr, we don't have to process it again.
- if (isa<ConstantVector>(NewC) || isa<ConstantExpr>(NewC)) {
- auto It = FoldedOps.find(NewC);
+ if (isa<ConstantVector>(OldC) || isa<ConstantExpr>(OldC)) {
+ auto It = FoldedOps.find(OldC);
if (It == FoldedOps.end()) {
- if (auto *FoldedC =
- ConstantFoldConstantImpl(NewC, DL, TLI, FoldedOps)) {
- FoldedOps.insert({NewC, FoldedC});
- NewC = FoldedC;
- } else {
- FoldedOps.insert({NewC, NewC});
- }
+ NewC = ConstantFoldConstantImpl(OldC, DL, TLI, FoldedOps);
+ FoldedOps.insert({OldC, NewC});
} else {
NewC = It->second;
}
@@ -1136,8 +1154,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL,
if (!C)
return nullptr;
// Fold the PHI's operands.
- if (auto *FoldedC = ConstantFoldConstantImpl(C, DL, TLI, FoldedOps))
- C = FoldedC;
+ C = ConstantFoldConstantImpl(C, DL, TLI, FoldedOps);
// If the incoming value is a different constant to
// the one we saw previously, then give up.
if (CommonValue && C != CommonValue)
@@ -1159,9 +1176,7 @@ Constant *llvm::ConstantFoldInstruction(Instruction *I, const DataLayout &DL,
for (const Use &OpU : I->operands()) {
auto *Op = cast<Constant>(&OpU);
// Fold the Instruction's operands.
- if (auto *FoldedOp = ConstantFoldConstantImpl(Op, DL, TLI, FoldedOps))
- Op = FoldedOp;
-
+ Op = ConstantFoldConstantImpl(Op, DL, TLI, FoldedOps);
Ops.push_back(Op);
}
@@ -1400,41 +1415,19 @@ llvm::ConstantFoldLoadThroughGEPIndices(Constant *C,
//
bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
- if (Call->isNoBuiltin() || Call->isStrictFP())
+ if (Call->isNoBuiltin())
return false;
switch (F->getIntrinsicID()) {
- case Intrinsic::fabs:
- case Intrinsic::minnum:
- case Intrinsic::maxnum:
- case Intrinsic::minimum:
- case Intrinsic::maximum:
- case Intrinsic::log:
- case Intrinsic::log2:
- case Intrinsic::log10:
- case Intrinsic::exp:
- case Intrinsic::exp2:
- case Intrinsic::floor:
- case Intrinsic::ceil:
- case Intrinsic::sqrt:
- case Intrinsic::sin:
- case Intrinsic::cos:
- case Intrinsic::trunc:
- case Intrinsic::rint:
- case Intrinsic::nearbyint:
- case Intrinsic::pow:
- case Intrinsic::powi:
+  // Operations that do not operate on floating-point numbers and do not depend
+  // on the FP environment can be folded even in strictfp functions.
case Intrinsic::bswap:
case Intrinsic::ctpop:
case Intrinsic::ctlz:
case Intrinsic::cttz:
case Intrinsic::fshl:
case Intrinsic::fshr:
- case Intrinsic::fma:
- case Intrinsic::fmuladd:
- case Intrinsic::copysign:
case Intrinsic::launder_invariant_group:
case Intrinsic::strip_invariant_group:
- case Intrinsic::round:
case Intrinsic::masked_load:
case Intrinsic::sadd_with_overflow:
case Intrinsic::uadd_with_overflow:
@@ -1448,9 +1441,49 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::usub_sat:
case Intrinsic::smul_fix:
case Intrinsic::smul_fix_sat:
+ case Intrinsic::bitreverse:
+ case Intrinsic::is_constant:
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ return true;
+
+  // Floating-point operations cannot be folded in strictfp functions in the
+  // general case. They can be folded if the FP environment is known to the
+  // compiler.
+ case Intrinsic::minnum:
+ case Intrinsic::maxnum:
+ case Intrinsic::minimum:
+ case Intrinsic::maximum:
+ case Intrinsic::log:
+ case Intrinsic::log2:
+ case Intrinsic::log10:
+ case Intrinsic::exp:
+ case Intrinsic::exp2:
+ case Intrinsic::sqrt:
+ case Intrinsic::sin:
+ case Intrinsic::cos:
+ case Intrinsic::pow:
+ case Intrinsic::powi:
+ case Intrinsic::fma:
+ case Intrinsic::fmuladd:
case Intrinsic::convert_from_fp16:
case Intrinsic::convert_to_fp16:
- case Intrinsic::bitreverse:
+ case Intrinsic::amdgcn_cos:
+ case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_cubema:
+ case Intrinsic::amdgcn_cubesc:
+ case Intrinsic::amdgcn_cubetc:
+ case Intrinsic::amdgcn_fmul_legacy:
+ case Intrinsic::amdgcn_fract:
+ case Intrinsic::amdgcn_ldexp:
+ case Intrinsic::amdgcn_sin:
+  // The intrinsics below depend on the rounding mode in MXCSR.
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvttss2si:
@@ -1475,14 +1508,37 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case Intrinsic::x86_avx512_vcvtsd2usi64:
case Intrinsic::x86_avx512_cvttsd2usi:
case Intrinsic::x86_avx512_cvttsd2usi64:
- case Intrinsic::is_constant:
+ return !Call->isStrictFP();
+
+  // Sign operations are actually bitwise operations; they do not raise
+  // exceptions even for SNaNs.
+ case Intrinsic::fabs:
+ case Intrinsic::copysign:
+  // Non-constrained variants of rounding operations imply the default FP
+  // environment, so they can be folded in any case.
+ case Intrinsic::ceil:
+ case Intrinsic::floor:
+ case Intrinsic::round:
+ case Intrinsic::roundeven:
+ case Intrinsic::trunc:
+ case Intrinsic::nearbyint:
+ case Intrinsic::rint:
+  // Constrained intrinsics can be folded if the FP environment is known to the
+  // compiler.
+ case Intrinsic::experimental_constrained_ceil:
+ case Intrinsic::experimental_constrained_floor:
+ case Intrinsic::experimental_constrained_round:
+ case Intrinsic::experimental_constrained_roundeven:
+ case Intrinsic::experimental_constrained_trunc:
+ case Intrinsic::experimental_constrained_nearbyint:
+ case Intrinsic::experimental_constrained_rint:
return true;
default:
return false;
case Intrinsic::not_intrinsic: break;
}
- if (!F->hasName())
+ if (!F->hasName() || Call->isStrictFP())
return false;
// In these cases, the check of the length is required. We don't want to
@@ -1517,7 +1573,8 @@ bool llvm::canConstantFoldCallTo(const CallBase *Call, const Function *F) {
case 'p':
return Name == "pow" || Name == "powf";
case 'r':
- return Name == "rint" || Name == "rintf" ||
+ return Name == "remainder" || Name == "remainderf" ||
+ Name == "rint" || Name == "rintf" ||
Name == "round" || Name == "roundf";
case 's':
return Name == "sin" || Name == "sinf" ||
@@ -1616,6 +1673,53 @@ Constant *ConstantFoldBinaryFP(double (*NativeFP)(double, double), double V,
return GetConstantFoldFPValue(V, Ty);
}
+Constant *ConstantFoldVectorReduce(Intrinsic::ID IID, Constant *Op) {
+ FixedVectorType *VT = dyn_cast<FixedVectorType>(Op->getType());
+ if (!VT)
+ return nullptr;
+ ConstantInt *CI = dyn_cast<ConstantInt>(Op->getAggregateElement(0U));
+ if (!CI)
+ return nullptr;
+ APInt Acc = CI->getValue();
+
+ for (unsigned I = 1; I < VT->getNumElements(); I++) {
+ if (!(CI = dyn_cast<ConstantInt>(Op->getAggregateElement(I))))
+ return nullptr;
+ const APInt &X = CI->getValue();
+ switch (IID) {
+ case Intrinsic::experimental_vector_reduce_add:
+ Acc = Acc + X;
+ break;
+ case Intrinsic::experimental_vector_reduce_mul:
+ Acc = Acc * X;
+ break;
+ case Intrinsic::experimental_vector_reduce_and:
+ Acc = Acc & X;
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ Acc = Acc | X;
+ break;
+ case Intrinsic::experimental_vector_reduce_xor:
+ Acc = Acc ^ X;
+ break;
+ case Intrinsic::experimental_vector_reduce_smin:
+ Acc = APIntOps::smin(Acc, X);
+ break;
+ case Intrinsic::experimental_vector_reduce_smax:
+ Acc = APIntOps::smax(Acc, X);
+ break;
+ case Intrinsic::experimental_vector_reduce_umin:
+ Acc = APIntOps::umin(Acc, X);
+ break;
+ case Intrinsic::experimental_vector_reduce_umax:
+ Acc = APIntOps::umax(Acc, X);
+ break;
+ }
+ }
+
+ return ConstantInt::get(Op->getContext(), Acc);
+}
+
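// As a quick worked example of the loop above: for the constant vector
// <4 x i32> <i32 1, i32 2, i32 3, i32 4>, reduce_add folds to 10, reduce_mul
// to 24, reduce_and to 0, reduce_or to 7, reduce_umax to 4 and reduce_smin
// to 1.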
/// Attempt to fold an SSE floating point to integer conversion of a constant
/// floating point. If roundTowardZero is false, the default IEEE rounding is
/// used (toward nearest, ties to even). This matches the behavior of the
@@ -1756,6 +1860,11 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return ConstantFP::get(Ty->getContext(), U);
}
+ if (IntrinsicID == Intrinsic::roundeven) {
+ U.roundToIntegral(APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(Ty->getContext(), U);
+ }
+
if (IntrinsicID == Intrinsic::ceil) {
U.roundToIntegral(APFloat::rmTowardPositive);
return ConstantFP::get(Ty->getContext(), U);
@@ -1776,10 +1885,70 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return ConstantFP::get(Ty->getContext(), U);
}
+ if (IntrinsicID == Intrinsic::amdgcn_fract) {
+ // The v_fract instruction behaves like the OpenCL spec, which defines
+ // fract(x) as fmin(x - floor(x), 0x1.fffffep-1f): "The min() operator is
+ // there to prevent fract(-small) from returning 1.0. It returns the
+ // largest positive floating-point number less than 1.0."
+ APFloat FloorU(U);
+ FloorU.roundToIntegral(APFloat::rmTowardNegative);
+ APFloat FractU(U - FloorU);
+ APFloat AlmostOne(U.getSemantics(), 1);
+ AlmostOne.next(/*nextDown*/ true);
+ return ConstantFP::get(Ty->getContext(), minimum(FractU, AlmostOne));
+ }
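// Worked examples of the computation above: fract(1.25) = 0.25 and
// fract(-0.25) = 0.75; for a tiny negative input such as -1.0e-8f the value
// x - floor(x) rounds up to 1.0, so the min() clamps the result to
// 0x1.fffffep-1f, exactly as the OpenCL wording requires.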
+
+ // Rounding operations (floor, trunc, ceil, round and nearbyint) do not
+  // raise FP exceptions, unless the argument is a signaling NaN.
+
+ Optional<APFloat::roundingMode> RM;
+ switch (IntrinsicID) {
+ default:
+ break;
+ case Intrinsic::experimental_constrained_nearbyint:
+ case Intrinsic::experimental_constrained_rint: {
+ auto CI = cast<ConstrainedFPIntrinsic>(Call);
+ RM = CI->getRoundingMode();
+ if (!RM || RM.getValue() == RoundingMode::Dynamic)
+ return nullptr;
+ break;
+ }
+ case Intrinsic::experimental_constrained_round:
+ RM = APFloat::rmNearestTiesToAway;
+ break;
+ case Intrinsic::experimental_constrained_ceil:
+ RM = APFloat::rmTowardPositive;
+ break;
+ case Intrinsic::experimental_constrained_floor:
+ RM = APFloat::rmTowardNegative;
+ break;
+ case Intrinsic::experimental_constrained_trunc:
+ RM = APFloat::rmTowardZero;
+ break;
+ }
+ if (RM) {
+ auto CI = cast<ConstrainedFPIntrinsic>(Call);
+ if (U.isFinite()) {
+ APFloat::opStatus St = U.roundToIntegral(*RM);
+ if (IntrinsicID == Intrinsic::experimental_constrained_rint &&
+ St == APFloat::opInexact) {
+ Optional<fp::ExceptionBehavior> EB = CI->getExceptionBehavior();
+ if (EB && *EB == fp::ebStrict)
+ return nullptr;
+ }
+ } else if (U.isSignaling()) {
+ Optional<fp::ExceptionBehavior> EB = CI->getExceptionBehavior();
+ if (EB && *EB != fp::ebIgnore)
+ return nullptr;
+ U = APFloat::getQNaN(U.getSemantics());
+ }
+ return ConstantFP::get(Ty->getContext(), U);
+ }
+
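// For example, under this logic llvm.experimental.constrained.trunc(2.7,
// "fpexcept.strict") still folds to 2.0, because trunc never raises an
// inexact exception, while llvm.experimental.constrained.rint(2.5,
// "round.tonearest", "fpexcept.strict") is left unfolded: the nearest-even
// result 2.0 is inexact and strict exception semantics were requested.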
/// We only fold functions with finite arguments. Folding NaN and inf is
/// likely to be aborted with an exception anyway, and some host libms
/// have known errors raising exceptions.
- if (Op->getValueAPF().isNaN() || Op->getValueAPF().isInfinity())
+ if (!U.isFinite())
return nullptr;
/// Currently APFloat versions of these functions do not exist, so we use
@@ -1809,6 +1978,26 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
return ConstantFoldFP(cos, V, Ty);
case Intrinsic::sqrt:
return ConstantFoldFP(sqrt, V, Ty);
+ case Intrinsic::amdgcn_cos:
+ case Intrinsic::amdgcn_sin:
+ if (V < -256.0 || V > 256.0)
+ // The gfx8 and gfx9 architectures handle arguments outside the range
+ // [-256, 256] differently. This should be a rare case so bail out
+ // rather than trying to handle the difference.
+ return nullptr;
+ bool IsCos = IntrinsicID == Intrinsic::amdgcn_cos;
+ double V4 = V * 4.0;
+ if (V4 == floor(V4)) {
+ // Force exact results for quarter-integer inputs.
+ const double SinVals[4] = { 0.0, 1.0, 0.0, -1.0 };
+ V = SinVals[((int)V4 + (IsCos ? 1 : 0)) & 3];
+ } else {
+ if (IsCos)
+ V = cos(V * 2.0 * numbers::pi);
+ else
+ V = sin(V * 2.0 * numbers::pi);
+ }
+ return GetConstantFoldFPValue(V, Ty);
}
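// The AMDGPU trig intrinsics take their argument in units of full revolutions
// (sin(2*pi*x)), so amdgcn.sin(0.25) folds to exactly 1.0 via the
// quarter-integer table above and amdgcn.cos(0.25) to 0.0, while a
// non-quarter input such as 0.1 falls back to the host sin(0.1 * 2 * pi).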
if (!TLI)
@@ -1990,12 +2179,40 @@ static Constant *ConstantFoldScalarCall1(StringRef Name,
}
}
+ if (isa<ConstantAggregateZero>(Operands[0])) {
+ switch (IntrinsicID) {
+ default: break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ return ConstantInt::get(Ty, 0);
+ }
+ }
+
// Support ConstantVector in case we have an Undef in the top.
if (isa<ConstantVector>(Operands[0]) ||
isa<ConstantDataVector>(Operands[0])) {
auto *Op = cast<Constant>(Operands[0]);
switch (IntrinsicID) {
default: break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_mul:
+ case Intrinsic::experimental_vector_reduce_and:
+ case Intrinsic::experimental_vector_reduce_or:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_smin:
+ case Intrinsic::experimental_vector_reduce_smax:
+ case Intrinsic::experimental_vector_reduce_umin:
+ case Intrinsic::experimental_vector_reduce_umax:
+ if (Constant *C = ConstantFoldVectorReduce(IntrinsicID, Op))
+ return C;
+ break;
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse2_cvtsd2si:
@@ -2074,6 +2291,16 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
return ConstantFP::get(Ty->getContext(), maximum(C1, C2));
}
+ if (IntrinsicID == Intrinsic::amdgcn_fmul_legacy) {
+ const APFloat &C1 = Op1->getValueAPF();
+ const APFloat &C2 = Op2->getValueAPF();
+ // The legacy behaviour is that multiplying zero by anything, even NaN
+ // or infinity, gives +0.0.
+ if (C1.isZero() || C2.isZero())
+ return ConstantFP::getNullValue(Ty);
+ return ConstantFP::get(Ty->getContext(), C1 * C2);
+ }
+
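// E.g. amdgcn.fmul.legacy(0.0, NaN) and amdgcn.fmul.legacy(0.0, inf) both
// fold to +0.0 under these semantics, while ordinary operands such as
// (2.0, 3.0) fold to the usual IEEE product 6.0.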
if (!TLI)
return nullptr;
@@ -2097,6 +2324,14 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
return ConstantFP::get(Ty->getContext(), V);
}
break;
+ case LibFunc_remainder:
+ case LibFunc_remainderf:
+ if (TLI->has(Func)) {
+ APFloat V = Op1->getValueAPF();
+ if (APFloat::opStatus::opOK == V.remainder(Op2->getValueAPF()))
+ return ConstantFP::get(Ty->getContext(), V);
+ }
+ break;
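// Note that IEEE remainder rounds the quotient to nearest, so
// remainder(5.0, 3.0) folds to -1.0 here (quotient 5/3 rounds to 2),
// unlike fmod(5.0, 3.0), which is 2.0.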
case LibFunc_atan2:
case LibFunc_atan2f:
case LibFunc_atan2_finite:
@@ -2118,6 +2353,16 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
return ConstantFP::get(Ty->getContext(),
APFloat((double)std::pow((double)Op1V,
(int)Op2C->getZExtValue())));
+
+ if (IntrinsicID == Intrinsic::amdgcn_ldexp) {
+ // FIXME: Should flush denorms depending on FP mode, but that's ignored
+ // everywhere else.
+
+ // scalbn is equivalent to ldexp with float radix 2
+ APFloat Result = scalbn(Op1->getValueAPF(), Op2C->getSExtValue(),
+ APFloat::rmNearestTiesToEven);
+ return ConstantFP::get(Ty->getContext(), Result);
+ }
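// A quick sanity check of the scalbn mapping: amdgcn.ldexp(1.5, 3) folds to
// 1.5 * 2^3 = 12.0, and negative exponents scale down, e.g.
// amdgcn.ldexp(12.0, -3) folds back to 1.5.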
}
return nullptr;
}
@@ -2275,6 +2520,61 @@ static Constant *ConstantFoldScalarCall2(StringRef Name,
return nullptr;
}
+static APFloat ConstantFoldAMDGCNCubeIntrinsic(Intrinsic::ID IntrinsicID,
+ const APFloat &S0,
+ const APFloat &S1,
+ const APFloat &S2) {
+ unsigned ID;
+ const fltSemantics &Sem = S0.getSemantics();
+ APFloat MA(Sem), SC(Sem), TC(Sem);
+ if (abs(S2) >= abs(S0) && abs(S2) >= abs(S1)) {
+ if (S2.isNegative() && S2.isNonZero() && !S2.isNaN()) {
+ // S2 < 0
+ ID = 5;
+ SC = -S0;
+ } else {
+ ID = 4;
+ SC = S0;
+ }
+ MA = S2;
+ TC = -S1;
+ } else if (abs(S1) >= abs(S0)) {
+ if (S1.isNegative() && S1.isNonZero() && !S1.isNaN()) {
+ // S1 < 0
+ ID = 3;
+ TC = -S2;
+ } else {
+ ID = 2;
+ TC = S2;
+ }
+ MA = S1;
+ SC = S0;
+ } else {
+ if (S0.isNegative() && S0.isNonZero() && !S0.isNaN()) {
+ // S0 < 0
+ ID = 1;
+ SC = S2;
+ } else {
+ ID = 0;
+ SC = -S2;
+ }
+ MA = S0;
+ TC = -S1;
+ }
+ switch (IntrinsicID) {
+ default:
+ llvm_unreachable("unhandled amdgcn cube intrinsic");
+ case Intrinsic::amdgcn_cubeid:
+ return APFloat(Sem, ID);
+ case Intrinsic::amdgcn_cubema:
+ return MA + MA;
+ case Intrinsic::amdgcn_cubesc:
+ return SC;
+ case Intrinsic::amdgcn_cubetc:
+ return TC;
+ }
+}
+
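// Rough worked example of the face selection above: for (S0, S1, S2) =
// (1.0, 2.0, 3.0), |S2| is the major axis and S2 is non-negative, which
// corresponds to the +Z face in the usual cube-map numbering, so cubeid
// folds to 4.0, cubema to 2 * 3.0 = 6.0, cubesc to 1.0 and cubetc to -2.0.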
static Constant *ConstantFoldScalarCall3(StringRef Name,
Intrinsic::ID IntrinsicID,
Type *Ty,
@@ -2295,6 +2595,15 @@ static Constant *ConstantFoldScalarCall3(StringRef Name,
APFloat::rmNearestTiesToEven);
return ConstantFP::get(Ty->getContext(), V);
}
+ case Intrinsic::amdgcn_cubeid:
+ case Intrinsic::amdgcn_cubema:
+ case Intrinsic::amdgcn_cubesc:
+ case Intrinsic::amdgcn_cubetc: {
+ APFloat V = ConstantFoldAMDGCNCubeIntrinsic(
+ IntrinsicID, Op1->getValueAPF(), Op2->getValueAPF(),
+ Op3->getValueAPF());
+ return ConstantFP::get(Ty->getContext(), V);
+ }
}
}
}
@@ -2313,8 +2622,8 @@ static Constant *ConstantFoldScalarCall3(StringRef Name,
// how rounding should be done, and provide their own folding to be
// consistent with rounding. This is the same approach as used by
// DAGTypeLegalizer::ExpandIntRes_MULFIX.
- APInt Lhs = Op1->getValue();
- APInt Rhs = Op2->getValue();
+ const APInt &Lhs = Op1->getValue();
+ const APInt &Rhs = Op2->getValue();
unsigned Scale = Op3->getValue().getZExtValue();
unsigned Width = Lhs.getBitWidth();
assert(Scale < Width && "Illegal scale.");
@@ -2395,19 +2704,26 @@ static Constant *ConstantFoldVectorCall(StringRef Name,
const DataLayout &DL,
const TargetLibraryInfo *TLI,
const CallBase *Call) {
- SmallVector<Constant *, 4> Result(VTy->getNumElements());
+  // Do not iterate on scalable vectors. The number of elements is unknown at
+  // compile time.
+ if (isa<ScalableVectorType>(VTy))
+ return nullptr;
+
+ auto *FVTy = cast<FixedVectorType>(VTy);
+
+ SmallVector<Constant *, 4> Result(FVTy->getNumElements());
SmallVector<Constant *, 4> Lane(Operands.size());
- Type *Ty = VTy->getElementType();
+ Type *Ty = FVTy->getElementType();
if (IntrinsicID == Intrinsic::masked_load) {
auto *SrcPtr = Operands[0];
auto *Mask = Operands[2];
auto *Passthru = Operands[3];
- Constant *VecData = ConstantFoldLoadFromConstPtr(SrcPtr, VTy, DL);
+ Constant *VecData = ConstantFoldLoadFromConstPtr(SrcPtr, FVTy, DL);
SmallVector<Constant *, 32> NewElements;
- for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
+ for (unsigned I = 0, E = FVTy->getNumElements(); I != E; ++I) {
auto *MaskElt = Mask->getAggregateElement(I);
if (!MaskElt)
break;
@@ -2433,12 +2749,12 @@ static Constant *ConstantFoldVectorCall(StringRef Name,
return nullptr;
}
}
- if (NewElements.size() != VTy->getNumElements())
+ if (NewElements.size() != FVTy->getNumElements())
return nullptr;
return ConstantVector::get(NewElements);
}
- for (unsigned I = 0, E = VTy->getNumElements(); I != E; ++I) {
+ for (unsigned I = 0, E = FVTy->getNumElements(); I != E; ++I) {
// Gather a column of constants.
for (unsigned J = 0, JE = Operands.size(); J != JE; ++J) {
// Some intrinsics use a scalar type for certain arguments.
@@ -2470,7 +2786,7 @@ static Constant *ConstantFoldVectorCall(StringRef Name,
Constant *llvm::ConstantFoldCall(const CallBase *Call, Function *F,
ArrayRef<Constant *> Operands,
const TargetLibraryInfo *TLI) {
- if (Call->isNoBuiltin() || Call->isStrictFP())
+ if (Call->isNoBuiltin())
return nullptr;
if (!F->hasName())
return nullptr;
@@ -2520,11 +2836,9 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
case LibFunc_expf:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
- return Op.compare(APFloat(-745.0)) != APFloat::cmpLessThan &&
- Op.compare(APFloat(709.0)) != APFloat::cmpGreaterThan;
+ return !(Op < APFloat(-745.0) || Op > APFloat(709.0));
if (OpC->getType()->isFloatTy())
- return Op.compare(APFloat(-103.0f)) != APFloat::cmpLessThan &&
- Op.compare(APFloat(88.0f)) != APFloat::cmpGreaterThan;
+ return !(Op < APFloat(-103.0f) || Op > APFloat(88.0f));
break;
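// These bounds track where exp() overflows or underflows: roughly
// log(DBL_MAX) ~= 709.78 and log of the smallest double denormal ~= -744.44
// (about 88.72 and -103.28 for float), so arguments inside them keep the
// result finite and nonzero.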
case LibFunc_exp2l:
@@ -2532,11 +2846,9 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
case LibFunc_exp2f:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
- return Op.compare(APFloat(-1074.0)) != APFloat::cmpLessThan &&
- Op.compare(APFloat(1023.0)) != APFloat::cmpGreaterThan;
+ return !(Op < APFloat(-1074.0) || Op > APFloat(1023.0));
if (OpC->getType()->isFloatTy())
- return Op.compare(APFloat(-149.0f)) != APFloat::cmpLessThan &&
- Op.compare(APFloat(127.0f)) != APFloat::cmpGreaterThan;
+ return !(Op < APFloat(-149.0f) || Op > APFloat(127.0f));
break;
case LibFunc_sinl:
@@ -2566,10 +2878,8 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
case LibFunc_acosl:
case LibFunc_acos:
case LibFunc_acosf:
- return Op.compare(APFloat(Op.getSemantics(), "-1")) !=
- APFloat::cmpLessThan &&
- Op.compare(APFloat(Op.getSemantics(), "1")) !=
- APFloat::cmpGreaterThan;
+ return !(Op < APFloat(Op.getSemantics(), "-1") ||
+ Op > APFloat(Op.getSemantics(), "1"));
case LibFunc_sinh:
case LibFunc_cosh:
@@ -2579,11 +2889,9 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
case LibFunc_coshl:
// FIXME: These boundaries are slightly conservative.
if (OpC->getType()->isDoubleTy())
- return Op.compare(APFloat(-710.0)) != APFloat::cmpLessThan &&
- Op.compare(APFloat(710.0)) != APFloat::cmpGreaterThan;
+ return !(Op < APFloat(-710.0) || Op > APFloat(710.0));
if (OpC->getType()->isFloatTy())
- return Op.compare(APFloat(-89.0f)) != APFloat::cmpLessThan &&
- Op.compare(APFloat(89.0f)) != APFloat::cmpGreaterThan;
+ return !(Op < APFloat(-89.0f) || Op > APFloat(89.0f));
break;
case LibFunc_sqrtl:
@@ -2626,6 +2934,9 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
case LibFunc_fmodl:
case LibFunc_fmod:
case LibFunc_fmodf:
+ case LibFunc_remainderl:
+ case LibFunc_remainder:
+ case LibFunc_remainderf:
return Op0.isNaN() || Op1.isNaN() ||
(!Op0.isInfinity() && !Op1.isZero());
@@ -2637,3 +2948,5 @@ bool llvm::isMathLibCallNoop(const CallBase *Call,
return false;
}
+
+void TargetFolder::anchor() {}
diff --git a/llvm/lib/Analysis/DDG.cpp b/llvm/lib/Analysis/DDG.cpp
index 90ce13e6f650..280d9ef79efa 100644
--- a/llvm/lib/Analysis/DDG.cpp
+++ b/llvm/lib/Analysis/DDG.cpp
@@ -16,6 +16,11 @@
using namespace llvm;
+static cl::opt<bool> SimplifyDDG(
+ "ddg-simplify", cl::init(true), cl::Hidden, cl::ZeroOrMore,
+ cl::desc(
+ "Simplify DDG by merging nodes that have less interesting edges."));
+
static cl::opt<bool>
CreatePiBlocks("ddg-pi-blocks", cl::init(true), cl::Hidden, cl::ZeroOrMore,
cl::desc("Create pi-block nodes."));
@@ -257,10 +262,47 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const DataDependenceGraph &G) {
return OS;
}
-bool DDGBuilder::shouldCreatePiBlocks() const {
- return CreatePiBlocks;
+//===--------------------------------------------------------------------===//
+// DDGBuilder implementation
+//===--------------------------------------------------------------------===//
+
+bool DDGBuilder::areNodesMergeable(const DDGNode &Src,
+ const DDGNode &Tgt) const {
+ // Only merge two nodes if they are both simple nodes and the consecutive
+ // instructions after merging belong to the same BB.
+ const auto *SimpleSrc = dyn_cast<const SimpleDDGNode>(&Src);
+ const auto *SimpleTgt = dyn_cast<const SimpleDDGNode>(&Tgt);
+ if (!SimpleSrc || !SimpleTgt)
+ return false;
+
+ return SimpleSrc->getLastInstruction()->getParent() ==
+ SimpleTgt->getFirstInstruction()->getParent();
}
+void DDGBuilder::mergeNodes(DDGNode &A, DDGNode &B) {
+ DDGEdge &EdgeToFold = A.back();
+ assert(A.getEdges().size() == 1 && EdgeToFold.getTargetNode() == B &&
+ "Expected A to have a single edge to B.");
+ assert(isa<SimpleDDGNode>(&A) && isa<SimpleDDGNode>(&B) &&
+ "Expected simple nodes");
+
+ // Copy instructions from B to the end of A.
+ cast<SimpleDDGNode>(&A)->appendInstructions(*cast<SimpleDDGNode>(&B));
+
+ // Move to A any outgoing edges from B.
+ for (DDGEdge *BE : B)
+ Graph.connect(A, BE->getTargetNode(), *BE);
+
+ A.removeEdge(EdgeToFold);
+ destroyEdge(EdgeToFold);
+ Graph.removeNode(B);
+ destroyNode(B);
+}
+
+bool DDGBuilder::shouldSimplify() const { return SimplifyDDG; }
+
+bool DDGBuilder::shouldCreatePiBlocks() const { return CreatePiBlocks; }
+
//===--------------------------------------------------------------------===//
// DDG Analysis Passes
//===--------------------------------------------------------------------===//
diff --git a/llvm/lib/Analysis/DependenceAnalysis.cpp b/llvm/lib/Analysis/DependenceAnalysis.cpp
index 9b38053c196b..bcfeef7fb8ab 100644
--- a/llvm/lib/Analysis/DependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DependenceAnalysis.cpp
@@ -647,7 +647,7 @@ void Dependence::dump(raw_ostream &OS) const {
// tbaa, non-overlapping regions etc), then it is known there is no dependency.
// Otherwise the underlying objects are checked to see if they point to
// different identifiable objects.
-static AliasResult underlyingObjectsAlias(AliasAnalysis *AA,
+static AliasResult underlyingObjectsAlias(AAResults *AA,
const DataLayout &DL,
const MemoryLocation &LocA,
const MemoryLocation &LocB) {
@@ -3264,23 +3264,134 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
assert(isLoadOrStore(Dst) && "instruction is not load or store");
Value *SrcPtr = getLoadStorePointerOperand(Src);
Value *DstPtr = getLoadStorePointerOperand(Dst);
-
Loop *SrcLoop = LI->getLoopFor(Src->getParent());
Loop *DstLoop = LI->getLoopFor(Dst->getParent());
+ const SCEV *SrcAccessFn = SE->getSCEVAtScope(SrcPtr, SrcLoop);
+ const SCEV *DstAccessFn = SE->getSCEVAtScope(DstPtr, DstLoop);
+ const SCEVUnknown *SrcBase =
+ dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn));
+ const SCEVUnknown *DstBase =
+ dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn));
+
+ if (!SrcBase || !DstBase || SrcBase != DstBase)
+ return false;
- // Below code mimics the code in Delinearization.cpp
- const SCEV *SrcAccessFn =
- SE->getSCEVAtScope(SrcPtr, SrcLoop);
- const SCEV *DstAccessFn =
- SE->getSCEVAtScope(DstPtr, DstLoop);
+ SmallVector<const SCEV *, 4> SrcSubscripts, DstSubscripts;
+
+ if (!tryDelinearizeFixedSize(Src, Dst, SrcAccessFn, DstAccessFn,
+ SrcSubscripts, DstSubscripts) &&
+ !tryDelinearizeParametricSize(Src, Dst, SrcAccessFn, DstAccessFn,
+ SrcSubscripts, DstSubscripts))
+ return false;
+
+ int Size = SrcSubscripts.size();
+ LLVM_DEBUG({
+ dbgs() << "\nSrcSubscripts: ";
+ for (int I = 0; I < Size; I++)
+ dbgs() << *SrcSubscripts[I];
+ dbgs() << "\nDstSubscripts: ";
+ for (int I = 0; I < Size; I++)
+ dbgs() << *DstSubscripts[I];
+ });
+ // The delinearization transforms a single-subscript MIV dependence test into
+ // a multi-subscript SIV dependence test that is easier to compute. So we
+ // resize Pair to contain as many pairs of subscripts as the delinearization
+ // has found, and then initialize the pairs following the delinearization.
+ Pair.resize(Size);
+ for (int I = 0; I < Size; ++I) {
+ Pair[I].Src = SrcSubscripts[I];
+ Pair[I].Dst = DstSubscripts[I];
+ unifySubscriptType(&Pair[I]);
+ }
+
+ return true;
+}
+
+bool DependenceInfo::tryDelinearizeFixedSize(
+ Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn,
+ const SCEV *DstAccessFn, SmallVectorImpl<const SCEV *> &SrcSubscripts,
+ SmallVectorImpl<const SCEV *> &DstSubscripts) {
+
+ // In general we cannot safely assume that the subscripts recovered from GEPs
+ // are in the range of values defined for their corresponding array
+  // dimensions. For example, some C language usages/interpretations make it
+ // impossible to verify this at compile-time. As such we give up here unless
+ // we can assume that the subscripts do not overlap into neighboring
+ // dimensions and that the number of dimensions matches the number of
+ // subscripts being recovered.
+ if (!DisableDelinearizationChecks)
+ return false;
+
+ Value *SrcPtr = getLoadStorePointerOperand(Src);
+ Value *DstPtr = getLoadStorePointerOperand(Dst);
const SCEVUnknown *SrcBase =
dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn));
const SCEVUnknown *DstBase =
dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn));
+ assert(SrcBase && DstBase && SrcBase == DstBase &&
+ "expected src and dst scev unknowns to be equal");
- if (!SrcBase || !DstBase || SrcBase != DstBase)
+ // Check the simple case where the array dimensions are fixed size.
+ auto *SrcGEP = dyn_cast<GetElementPtrInst>(SrcPtr);
+ auto *DstGEP = dyn_cast<GetElementPtrInst>(DstPtr);
+ if (!SrcGEP || !DstGEP)
+ return false;
+
+ SmallVector<int, 4> SrcSizes, DstSizes;
+ SE->getIndexExpressionsFromGEP(SrcGEP, SrcSubscripts, SrcSizes);
+ SE->getIndexExpressionsFromGEP(DstGEP, DstSubscripts, DstSizes);
+
+ // Check that the two size arrays are non-empty and equal in length and
+ // value.
+ if (SrcSizes.empty() || SrcSubscripts.size() <= 1 ||
+ SrcSizes.size() != DstSizes.size() ||
+ !std::equal(SrcSizes.begin(), SrcSizes.end(), DstSizes.begin())) {
+ SrcSubscripts.clear();
+ DstSubscripts.clear();
return false;
+ }
+
+ Value *SrcBasePtr = SrcGEP->getOperand(0);
+ Value *DstBasePtr = DstGEP->getOperand(0);
+ while (auto *PCast = dyn_cast<BitCastInst>(SrcBasePtr))
+ SrcBasePtr = PCast->getOperand(0);
+ while (auto *PCast = dyn_cast<BitCastInst>(DstBasePtr))
+ DstBasePtr = PCast->getOperand(0);
+
+ // Check that for identical base pointers we do not miss index offsets
+ // that have been added before this GEP is applied.
+ if (SrcBasePtr == SrcBase->getValue() && DstBasePtr == DstBase->getValue()) {
+ assert(SrcSubscripts.size() == DstSubscripts.size() &&
+ SrcSubscripts.size() == SrcSizes.size() + 1 &&
+ "Expected equal number of entries in the list of sizes and "
+ "subscripts.");
+ LLVM_DEBUG({
+ dbgs() << "Delinearized subscripts of fixed-size array\n"
+ << "SrcGEP:" << *SrcGEP << "\n"
+ << "DstGEP:" << *DstGEP << "\n";
+ });
+ return true;
+ }
+
+ SrcSubscripts.clear();
+ DstSubscripts.clear();
+ return false;
+}
+
+bool DependenceInfo::tryDelinearizeParametricSize(
+ Instruction *Src, Instruction *Dst, const SCEV *SrcAccessFn,
+ const SCEV *DstAccessFn, SmallVectorImpl<const SCEV *> &SrcSubscripts,
+ SmallVectorImpl<const SCEV *> &DstSubscripts) {
+
+ Value *SrcPtr = getLoadStorePointerOperand(Src);
+ Value *DstPtr = getLoadStorePointerOperand(Dst);
+ const SCEVUnknown *SrcBase =
+ dyn_cast<SCEVUnknown>(SE->getPointerBase(SrcAccessFn));
+ const SCEVUnknown *DstBase =
+ dyn_cast<SCEVUnknown>(SE->getPointerBase(DstAccessFn));
+ assert(SrcBase && DstBase && SrcBase == DstBase &&
+ "expected src and dst scev unknowns to be equal");
const SCEV *ElementSize = SE->getElementSize(Src);
if (ElementSize != SE->getElementSize(Dst))
@@ -3304,7 +3415,6 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
SE->findArrayDimensions(Terms, Sizes, ElementSize);
// Third step: compute the access functions for each subscript.
- SmallVector<const SCEV *, 4> SrcSubscripts, DstSubscripts;
SE->computeAccessFunctions(SrcAR, SrcSubscripts, Sizes);
SE->computeAccessFunctions(DstAR, DstSubscripts, Sizes);
@@ -3313,7 +3423,7 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
SrcSubscripts.size() != DstSubscripts.size())
return false;
- int size = SrcSubscripts.size();
+ size_t Size = SrcSubscripts.size();
// Statically check that the array bounds are in-range. We don't have a size
// for the first subscript, and it cannot overflow into another subscript, so
// it is always safe.
@@ -3322,40 +3432,20 @@ bool DependenceInfo::tryDelinearize(Instruction *Src, Instruction *Dst,
// FIXME: It may be better to record these sizes and add them as constraints
// to the dependency checks.
if (!DisableDelinearizationChecks)
- for (int i = 1; i < size; ++i) {
- if (!isKnownNonNegative(SrcSubscripts[i], SrcPtr))
+ for (size_t I = 1; I < Size; ++I) {
+ if (!isKnownNonNegative(SrcSubscripts[I], SrcPtr))
return false;
- if (!isKnownLessThan(SrcSubscripts[i], Sizes[i - 1]))
+ if (!isKnownLessThan(SrcSubscripts[I], Sizes[I - 1]))
return false;
- if (!isKnownNonNegative(DstSubscripts[i], DstPtr))
+ if (!isKnownNonNegative(DstSubscripts[I], DstPtr))
return false;
- if (!isKnownLessThan(DstSubscripts[i], Sizes[i - 1]))
+ if (!isKnownLessThan(DstSubscripts[I], Sizes[I - 1]))
return false;
}
- LLVM_DEBUG({
- dbgs() << "\nSrcSubscripts: ";
- for (int i = 0; i < size; i++)
- dbgs() << *SrcSubscripts[i];
- dbgs() << "\nDstSubscripts: ";
- for (int i = 0; i < size; i++)
- dbgs() << *DstSubscripts[i];
- });
-
- // The delinearization transforms a single-subscript MIV dependence test into
- // a multi-subscript SIV dependence test that is easier to compute. So we
- // resize Pair to contain as many pairs of subscripts as the delinearization
- // has found, and then initialize the pairs following the delinearization.
- Pair.resize(size);
- for (int i = 0; i < size; ++i) {
- Pair[i].Src = SrcSubscripts[i];
- Pair[i].Dst = DstSubscripts[i];
- unifySubscriptType(&Pair[i]);
- }
-
return true;
}
diff --git a/llvm/lib/Analysis/DependenceGraphBuilder.cpp b/llvm/lib/Analysis/DependenceGraphBuilder.cpp
index e8a1a2fff919..7a98d844e4cb 100644
--- a/llvm/lib/Analysis/DependenceGraphBuilder.cpp
+++ b/llvm/lib/Analysis/DependenceGraphBuilder.cpp
@@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/DependenceGraphBuilder.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/EnumeratedArray.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/Statistic.h"
@@ -374,6 +375,109 @@ void AbstractDependenceGraphBuilder<G>::createMemoryDependencyEdges() {
}
}
+template <class G> void AbstractDependenceGraphBuilder<G>::simplify() {
+ if (!shouldSimplify())
+ return;
+  LLVM_DEBUG(dbgs() << "=== Start of Graph Simplification ===\n");
+
+ // This algorithm works by first collecting a set of candidate nodes that have
+ // an out-degree of one (in terms of def-use edges), and then ignoring those
+  // whose targets have an in-degree of more than one. Each node in the resulting
+ // set can then be merged with its corresponding target and put back into the
+ // worklist until no further merge candidates are available.
+ SmallPtrSet<NodeType *, 32> CandidateSourceNodes;
+
+ // A mapping between nodes and their in-degree. To save space, this map
+ // only contains nodes that are targets of nodes in the CandidateSourceNodes.
+ DenseMap<NodeType *, unsigned> TargetInDegreeMap;
+
+ for (NodeType *N : Graph) {
+ if (N->getEdges().size() != 1)
+ continue;
+ EdgeType &Edge = N->back();
+ if (!Edge.isDefUse())
+ continue;
+ CandidateSourceNodes.insert(N);
+
+ // Insert an element into the in-degree map and initialize to zero. The
+ // count will get updated in the next step.
+ TargetInDegreeMap.insert({&Edge.getTargetNode(), 0});
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Size of candidate src node list:" << CandidateSourceNodes.size()
+ << "\nNode with single outgoing def-use edge:\n";
+ for (NodeType *N : CandidateSourceNodes) {
+ dbgs() << N << "\n";
+ }
+ });
+
+ for (NodeType *N : Graph) {
+ for (EdgeType *E : *N) {
+ NodeType *Tgt = &E->getTargetNode();
+ auto TgtIT = TargetInDegreeMap.find(Tgt);
+ if (TgtIT != TargetInDegreeMap.end())
+ ++(TgtIT->second);
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "Size of target in-degree map:" << TargetInDegreeMap.size()
+ << "\nContent of in-degree map:\n";
+ for (auto &I : TargetInDegreeMap) {
+ dbgs() << I.first << " --> " << I.second << "\n";
+ }
+ });
+
+ SmallVector<NodeType *, 32> Worklist(CandidateSourceNodes.begin(),
+ CandidateSourceNodes.end());
+ while (!Worklist.empty()) {
+ NodeType &Src = *Worklist.pop_back_val();
+ // As nodes get merged, we need to skip any node that has been removed from
+ // the candidate set (see below).
+ if (!CandidateSourceNodes.erase(&Src))
+ continue;
+
+ assert(Src.getEdges().size() == 1 &&
+ "Expected a single edge from the candidate src node.");
+ NodeType &Tgt = Src.back().getTargetNode();
+ assert(TargetInDegreeMap.find(&Tgt) != TargetInDegreeMap.end() &&
+ "Expected target to be in the in-degree map.");
+
+ if (TargetInDegreeMap[&Tgt] != 1)
+ continue;
+
+ if (!areNodesMergeable(Src, Tgt))
+ continue;
+
+ // Do not merge if there is also an edge from target to src (immediate
+ // cycle).
+ if (Tgt.hasEdgeTo(Src))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Merging:" << Src << "\nWith:" << Tgt << "\n");
+
+ mergeNodes(Src, Tgt);
+
+ // If the target node is in the candidate set itself, we need to put the
+ // src node back into the worklist again so it gives the target a chance
+ // to get merged into it. For example if we have:
+ // {(a)->(b), (b)->(c), (c)->(d), ...} and the worklist is initially {b, a},
+ // then after merging (a) and (b) together, we need to put (a,b) back in
+ // the worklist so that (c) can get merged in as well resulting in
+ // {(a,b,c) -> d}
+ // We also need to remove the old target (b), from the worklist. We first
+ // remove it from the candidate set here, and skip any item from the
+ // worklist that is not in the set.
+ if (CandidateSourceNodes.erase(&Tgt)) {
+ Worklist.push_back(&Src);
+ CandidateSourceNodes.insert(&Src);
+ LLVM_DEBUG(dbgs() << "Putting " << &Src << " back in the worklist.\n");
+ }
+ }
+ LLVM_DEBUG(dbgs() << "=== End of Graph Simplification ===\n");
+}
+
template <class G>
void AbstractDependenceGraphBuilder<G>::sortNodesTopologically() {
diff --git a/llvm/lib/Analysis/DivergenceAnalysis.cpp b/llvm/lib/Analysis/DivergenceAnalysis.cpp
index 3d1be1e1cce0..343406c9bba1 100644
--- a/llvm/lib/Analysis/DivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/DivergenceAnalysis.cpp
@@ -184,6 +184,17 @@ bool DivergenceAnalysis::inRegion(const BasicBlock &BB) const {
return (!RegionLoop && BB.getParent() == &F) || RegionLoop->contains(&BB);
}
+static bool usesLiveOut(const Instruction &I, const Loop *DivLoop) {
+ for (auto &Op : I.operands()) {
+ auto *OpInst = dyn_cast<Instruction>(&Op);
+ if (!OpInst)
+ continue;
+ if (DivLoop->contains(OpInst->getParent()))
+ return true;
+ }
+ return false;
+}
+
// marks all users of loop-carried values of the loop headed by LoopHeader as
// divergent
void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
@@ -227,16 +238,14 @@ void DivergenceAnalysis::taintLoopLiveOuts(const BasicBlock &LoopHeader) {
continue;
if (isDivergent(I))
continue;
+ if (!usesLiveOut(I, DivLoop))
+ continue;
- for (auto &Op : I.operands()) {
- auto *OpInst = dyn_cast<Instruction>(&Op);
- if (!OpInst)
- continue;
- if (DivLoop->contains(OpInst->getParent())) {
- markDivergent(I);
- pushUsers(I);
- break;
- }
+ markDivergent(I);
+ if (I.isTerminator()) {
+ propagateBranchDivergence(I);
+ } else {
+ pushUsers(I);
}
}
@@ -286,14 +295,11 @@ bool DivergenceAnalysis::propagateJoinDivergence(const BasicBlock &JoinBlock,
// push non-divergent phi nodes in JoinBlock to the worklist
pushPHINodes(JoinBlock);
- // JoinBlock is a divergent loop exit
- if (BranchLoop && !BranchLoop->contains(&JoinBlock)) {
- return true;
- }
-
// disjoint-paths divergent at JoinBlock
markBlockJoinDivergent(JoinBlock);
- return false;
+
+ // JoinBlock is a divergent loop exit
+ return BranchLoop && !BranchLoop->contains(&JoinBlock);
}
void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
@@ -301,6 +307,10 @@ void DivergenceAnalysis::propagateBranchDivergence(const Instruction &Term) {
markDivergent(Term);
+ // Don't propagate divergence from unreachable blocks.
+ if (!DT.isReachableFromEntry(Term.getParent()))
+ return;
+
const auto *BranchLoop = LI.getLoopFor(Term.getParent());
// whether there is a divergent loop exit from BranchLoop (if any)
diff --git a/llvm/lib/Analysis/DomPrinter.cpp b/llvm/lib/Analysis/DomPrinter.cpp
index 024a0fb49950..ebbe0d3e2c5f 100644
--- a/llvm/lib/Analysis/DomPrinter.cpp
+++ b/llvm/lib/Analysis/DomPrinter.cpp
@@ -40,11 +40,11 @@ struct DOTGraphTraits<DomTreeNode*> : public DefaultDOTGraphTraits {
if (isSimple())
- return DOTGraphTraits<const Function*>
- ::getSimpleNodeLabel(BB, BB->getParent());
+ return DOTGraphTraits<DOTFuncInfo *>
+ ::getSimpleNodeLabel(BB, nullptr);
else
- return DOTGraphTraits<const Function*>
- ::getCompleteNodeLabel(BB, BB->getParent());
+ return DOTGraphTraits<DOTFuncInfo *>
+ ::getCompleteNodeLabel(BB, nullptr);
}
};
diff --git a/llvm/lib/Analysis/DomTreeUpdater.cpp b/llvm/lib/Analysis/DomTreeUpdater.cpp
index b374334ea371..9594da0a4f91 100644
--- a/llvm/lib/Analysis/DomTreeUpdater.cpp
+++ b/llvm/lib/Analysis/DomTreeUpdater.cpp
@@ -14,7 +14,7 @@
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/PostDominators.h"
-#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Support/GenericDomTree.h"
#include <algorithm>
#include <functional>
@@ -507,7 +507,7 @@ LLVM_DUMP_METHOD void DomTreeUpdater::dump() const {
OS << "Pending DeletedBBs:\n";
Index = 0;
- for (auto BB : DeletedBBs) {
+ for (const auto *BB : DeletedBBs) {
OS << " " << Index << " : ";
++Index;
if (BB->hasName())
@@ -519,7 +519,7 @@ LLVM_DUMP_METHOD void DomTreeUpdater::dump() const {
OS << "Pending Callbacks:\n";
Index = 0;
- for (auto BB : Callbacks) {
+ for (const auto &BB : Callbacks) {
OS << " " << Index << " : ";
++Index;
if (BB->hasName())
diff --git a/llvm/lib/Analysis/GlobalsModRef.cpp b/llvm/lib/Analysis/GlobalsModRef.cpp
index 4361e0dc9bbd..8c8ccf04ebba 100644
--- a/llvm/lib/Analysis/GlobalsModRef.cpp
+++ b/llvm/lib/Analysis/GlobalsModRef.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -28,6 +29,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
+
using namespace llvm;
#define DEBUG_TYPE "globalsmodref-aa"
@@ -77,7 +79,7 @@ class GlobalsAAResult::FunctionInfo {
static inline AlignedMap *getFromVoidPointer(void *P) {
return (AlignedMap *)P;
}
- enum { NumLowBitsAvailable = 3 };
+ static constexpr int NumLowBitsAvailable = 3;
static_assert(alignof(AlignedMap) >= (1 << NumLowBitsAvailable),
"AlignedMap insufficiently aligned to have enough low bits.");
};
@@ -810,6 +812,14 @@ bool GlobalsAAResult::isNonEscapingGlobalNoAlias(const GlobalValue *GV,
return true;
}
+bool GlobalsAAResult::invalidate(Module &, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &) {
+ // Check whether the analysis has been explicitly invalidated. Otherwise, it's
+ // stateless and remains preserved.
+ auto PAC = PA.getChecker<GlobalsAA>();
+ return !PAC.preservedWhenStateless();
+}
+
/// alias - If one of the pointers is to a global that we are tracking, and the
/// other is some random pointer, we know there cannot be an alias, because the
/// address of the global isn't taken.
diff --git a/llvm/lib/Analysis/GuardUtils.cpp b/llvm/lib/Analysis/GuardUtils.cpp
index d48283279858..cd132c56991f 100644
--- a/llvm/lib/Analysis/GuardUtils.cpp
+++ b/llvm/lib/Analysis/GuardUtils.cpp
@@ -47,7 +47,7 @@ bool llvm::parseWidenableBranch(const User *U, Value *&Condition,
Use *C, *WC;
if (parseWidenableBranch(const_cast<User*>(U), C, WC, IfTrueBB, IfFalseBB)) {
- if (C)
+ if (C)
Condition = C->get();
else
Condition = ConstantInt::getTrue(IfTrueBB->getContext());
@@ -66,10 +66,10 @@ bool llvm::parseWidenableBranch(User *U, Use *&C,Use *&WC,
auto *Cond = BI->getCondition();
if (!Cond->hasOneUse())
return false;
-
+
IfTrueBB = BI->getSuccessor(0);
IfFalseBB = BI->getSuccessor(1);
-
+
if (match(Cond, m_Intrinsic<Intrinsic::experimental_widenable_condition>())) {
WC = &BI->getOperandUse(0);
C = nullptr;
@@ -88,7 +88,7 @@ bool llvm::parseWidenableBranch(User *U, Use *&C,Use *&WC,
if (!And)
// Could be a constexpr
return false;
-
+
if (match(A, m_Intrinsic<Intrinsic::experimental_widenable_condition>()) &&
A->hasOneUse()) {
WC = &And->getOperandUse(0);
diff --git a/llvm/lib/Analysis/HeatUtils.cpp b/llvm/lib/Analysis/HeatUtils.cpp
new file mode 100644
index 000000000000..a1a11be5fee3
--- /dev/null
+++ b/llvm/lib/Analysis/HeatUtils.cpp
@@ -0,0 +1,78 @@
+//===-- HeatUtils.cpp - Utility for printing heat colors --------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility for printing heat colors based on heuristics or profiling
+// information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/HeatUtils.h"
+#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/IR/Instructions.h"
+
+namespace llvm {
+
+static const unsigned heatSize = 100;
+static const char heatPalette[heatSize][8] = {
+ "#3d50c3", "#4055c8", "#4358cb", "#465ecf", "#4961d2", "#4c66d6", "#4f69d9",
+ "#536edd", "#5572df", "#5977e3", "#5b7ae5", "#5f7fe8", "#6282ea", "#6687ed",
+ "#6a8bef", "#6c8ff1", "#7093f3", "#7396f5", "#779af7", "#7a9df8", "#7ea1fa",
+ "#81a4fb", "#85a8fc", "#88abfd", "#8caffe", "#8fb1fe", "#93b5fe", "#96b7ff",
+ "#9abbff", "#9ebeff", "#a1c0ff", "#a5c3fe", "#a7c5fe", "#abc8fd", "#aec9fc",
+ "#b2ccfb", "#b5cdfa", "#b9d0f9", "#bbd1f8", "#bfd3f6", "#c1d4f4", "#c5d6f2",
+ "#c7d7f0", "#cbd8ee", "#cedaeb", "#d1dae9", "#d4dbe6", "#d6dce4", "#d9dce1",
+ "#dbdcde", "#dedcdb", "#e0dbd8", "#e3d9d3", "#e5d8d1", "#e8d6cc", "#ead5c9",
+ "#ecd3c5", "#eed0c0", "#efcebd", "#f1ccb8", "#f2cab5", "#f3c7b1", "#f4c5ad",
+ "#f5c1a9", "#f6bfa6", "#f7bca1", "#f7b99e", "#f7b599", "#f7b396", "#f7af91",
+ "#f7ac8e", "#f7a889", "#f6a385", "#f5a081", "#f59c7d", "#f4987a", "#f39475",
+ "#f29072", "#f08b6e", "#ef886b", "#ed8366", "#ec7f63", "#e97a5f", "#e8765c",
+ "#e57058", "#e36c55", "#e16751", "#de614d", "#dc5d4a", "#d85646", "#d65244",
+ "#d24b40", "#d0473d", "#cc403a", "#ca3b37", "#c53334", "#c32e31", "#be242e",
+ "#bb1b2c", "#b70d28"};
+
+uint64_t
+getNumOfCalls(Function &callerFunction, Function &calledFunction) {
+ uint64_t counter = 0;
+ for (User *U : calledFunction.users()) {
+ if (auto CI = dyn_cast<CallInst>(U)) {
+ if (CI->getCaller() == (&callerFunction)) {
+ counter += 1;
+ }
+ }
+ }
+ return counter;
+}
+
+uint64_t getMaxFreq(const Function &F, const BlockFrequencyInfo *BFI) {
+ uint64_t maxFreq = 0;
+ for (const BasicBlock &BB : F) {
+ uint64_t freqVal = BFI->getBlockFreq(&BB).getFrequency();
+ if (freqVal >= maxFreq)
+ maxFreq = freqVal;
+ }
+ return maxFreq;
+}
+
+std::string getHeatColor(uint64_t freq, uint64_t maxFreq) {
+ if (freq > maxFreq)
+ freq = maxFreq;
+ double percent = (freq > 0) ? log2(double(freq)) / log2(maxFreq) : 0;
+ return getHeatColor(percent);
+}
+
+std::string getHeatColor(double percent) {
+ if (percent > 1.0)
+ percent = 1.0;
+ if (percent < 0.0)
+ percent = 0.0;
+ unsigned colorId = unsigned(round(percent * (heatSize - 1.0)));
+ return heatPalette[colorId];
+}
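// Example mapping: getHeatColor(0, 1000) selects heatPalette[0] ("#3d50c3",
// the coldest blue), while getHeatColor(1000, 1000) gives percent == 1.0 and
// selects heatPalette[99] ("#b70d28", the hottest red); intermediate
// frequencies are spread on a log2 scale between the two.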
+
+} // namespace llvm
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index ac81cba836f8..6686848d75c9 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/ScopeExit.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
diff --git a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
index dc4cbc371ef4..c32aa0340ceb 100644
--- a/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
+++ b/llvm/lib/Analysis/IndirectCallPromotionAnalysis.cpp
@@ -15,7 +15,6 @@
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/IndirectCallVisitor.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/Instructions.h"
diff --git a/llvm/lib/Analysis/InlineAdvisor.cpp b/llvm/lib/Analysis/InlineAdvisor.cpp
new file mode 100644
index 000000000000..74a536d1ce2f
--- /dev/null
+++ b/llvm/lib/Analysis/InlineAdvisor.cpp
@@ -0,0 +1,408 @@
+//===- InlineAdvisor.cpp - analysis pass implementation -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements InlineAdvisorAnalysis and DefaultInlineAdvisor, and
+// related types.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <sstream>
+
+using namespace llvm;
+#define DEBUG_TYPE "inline"
+
+// This weirdly named statistic tracks the number of times that, when attempting
+// to inline a function A into B, we analyze the callers of B in order to see
+// if inlining into those callers would be more profitable and would block this
+// inline step.
+STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
+
+/// Flag to add inline messages as callsite attributes 'inline-remark'.
+static cl::opt<bool>
+ InlineRemarkAttribute("inline-remark-attribute", cl::init(false),
+ cl::Hidden,
+ cl::desc("Enable adding inline-remark attribute to"
+ " callsites processed by inliner but decided"
+ " to be not inlined"));
+
+// An integer used to limit the cost of inline deferral. The default negative
+// number tells shouldBeDeferred to only take the secondary cost into account.
+static cl::opt<int>
+ InlineDeferralScale("inline-deferral-scale",
+ cl::desc("Scale to limit the cost of inline deferral"),
+ cl::init(2), cl::Hidden);
+
+namespace {
+class DefaultInlineAdvice : public InlineAdvice {
+public:
+ DefaultInlineAdvice(DefaultInlineAdvisor *Advisor, CallBase &CB,
+ Optional<InlineCost> OIC, OptimizationRemarkEmitter &ORE)
+ : InlineAdvice(Advisor, CB, ORE, OIC.hasValue()), OriginalCB(&CB),
+ OIC(OIC) {}
+
+private:
+ void recordUnsuccessfulInliningImpl(const InlineResult &Result) override {
+ using namespace ore;
+ llvm::setInlineRemark(*OriginalCB, std::string(Result.getFailureReason()) +
+ "; " + inlineCostStr(*OIC));
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
+ << NV("Callee", Callee) << " will not be inlined into "
+ << NV("Caller", Caller) << ": "
+ << NV("Reason", Result.getFailureReason());
+ });
+ }
+
+ void recordInliningWithCalleeDeletedImpl() override {
+ emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+ }
+
+ void recordInliningImpl() override {
+ emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+ }
+
+private:
+ CallBase *const OriginalCB;
+ Optional<InlineCost> OIC;
+};
+
+} // namespace
+
+llvm::Optional<llvm::InlineCost>
+getDefaultInlineAdvice(CallBase &CB, FunctionAnalysisManager &FAM,
+ const InlineParams &Params) {
+ Function &Caller = *CB.getCaller();
+ ProfileSummaryInfo *PSI =
+ FAM.getResult<ModuleAnalysisManagerFunctionProxy>(Caller)
+ .getCachedResult<ProfileSummaryAnalysis>(
+ *CB.getParent()->getParent()->getParent());
+
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(Caller);
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+ auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ auto GetInlineCost = [&](CallBase &CB) {
+ Function &Callee = *CB.getCalledFunction();
+ auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
+ bool RemarksEnabled =
+ Callee.getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
+ DEBUG_TYPE);
+ return getInlineCost(CB, Params, CalleeTTI, GetAssumptionCache, GetTLI,
+ GetBFI, PSI, RemarksEnabled ? &ORE : nullptr);
+ };
+ return llvm::shouldInline(CB, GetInlineCost, ORE,
+ Params.EnableDeferral.hasValue() &&
+ Params.EnableDeferral.getValue());
+}
+
+std::unique_ptr<InlineAdvice> DefaultInlineAdvisor::getAdvice(CallBase &CB) {
+ auto OIC = getDefaultInlineAdvice(CB, FAM, Params);
+ return std::make_unique<DefaultInlineAdvice>(
+ this, CB, OIC,
+ FAM.getResult<OptimizationRemarkEmitterAnalysis>(*CB.getCaller()));
+}
+
+InlineAdvice::InlineAdvice(InlineAdvisor *Advisor, CallBase &CB,
+ OptimizationRemarkEmitter &ORE,
+ bool IsInliningRecommended)
+ : Advisor(Advisor), Caller(CB.getCaller()), Callee(CB.getCalledFunction()),
+ DLoc(CB.getDebugLoc()), Block(CB.getParent()), ORE(ORE),
+ IsInliningRecommended(IsInliningRecommended) {}
+
+void InlineAdvisor::markFunctionAsDeleted(Function *F) {
+  assert((!DeletedFunctions.count(F)) &&
+         "Cannot cause a function to become dead twice!");
+ DeletedFunctions.insert(F);
+}
+
+void InlineAdvisor::freeDeletedFunctions() {
+ for (auto *F : DeletedFunctions)
+ delete F;
+ DeletedFunctions.clear();
+}
+
+void InlineAdvice::recordInliningWithCalleeDeleted() {
+ markRecorded();
+ Advisor->markFunctionAsDeleted(Callee);
+ recordInliningWithCalleeDeletedImpl();
+}
+
+AnalysisKey InlineAdvisorAnalysis::Key;
+
+bool InlineAdvisorAnalysis::Result::tryCreate(InlineParams Params,
+ InliningAdvisorMode Mode) {
+ auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ switch (Mode) {
+ case InliningAdvisorMode::Default:
+ Advisor.reset(new DefaultInlineAdvisor(FAM, Params));
+ break;
+ case InliningAdvisorMode::Development:
+ // To be added subsequently under conditional compilation.
+ break;
+ case InliningAdvisorMode::Release:
+#ifdef LLVM_HAVE_TF_AOT
+ Advisor = llvm::getReleaseModeAdvisor(M, MAM);
+#endif
+ break;
+ }
+ return !!Advisor;
+}
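// Illustrative sketch, not part of this patch: how a module-level pass might
// drive the advice lifecycle defined above. The helper name is made up, and
// the accessors assumed here (getAdvisor() on the analysis result;
// isInliningRecommended(), recordInlining(), recordUnsuccessfulInlining(),
// recordUnattemptedInlining() on InlineAdvice) are declared in
// InlineAdvisor.h, which is not shown in this diff. InlineFunction comes from
// llvm/Transforms/Utils/Cloning.h.
static void adviseAndInline(Module &M, ModuleAnalysisManager &MAM,
                            SmallVectorImpl<CallBase *> &Calls) {
  auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
  if (!IAA.tryCreate(getInlineParams(), InliningAdvisorMode::Default))
    return;
  InlineAdvisor &Advisor = *IAA.getAdvisor();
  for (CallBase *CB : Calls) {
    std::unique_ptr<InlineAdvice> Advice = Advisor.getAdvice(*CB);
    if (!Advice->isInliningRecommended()) {
      Advice->recordUnattemptedInlining();
      continue;
    }
    InlineFunctionInfo IFI;
    InlineResult IR = InlineFunction(*CB, IFI);
    if (IR.isSuccess())
      Advice->recordInlining();
    else
      Advice->recordUnsuccessfulInlining(IR);
  }
}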
+
+/// Return true if inlining of CB can block the caller from being inlined,
+/// because inlining the caller instead is proven to be more beneficial. \p IC
+/// is the estimated inline cost associated with callsite \p CB.
+/// \p TotalSecondaryCost will be set to the estimated cost of inlining the
+/// caller if \p CB is suppressed for inlining.
+static bool
+shouldBeDeferred(Function *Caller, InlineCost IC, int &TotalSecondaryCost,
+ function_ref<InlineCost(CallBase &CB)> GetInlineCost) {
+ // For now we only handle local or inline functions.
+ if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage())
+ return false;
+ // If the cost of inlining CB is non-positive, it is not going to prevent the
+ // caller from being inlined into its callers and hence we don't need to
+ // defer.
+ if (IC.getCost() <= 0)
+ return false;
+ // Try to detect the case where the current inlining candidate caller (call
+ // it B) is a static or linkonce-ODR function and is an inlining candidate
+ // elsewhere, and the current candidate callee (call it C) is large enough
+ // that inlining it into B would make B too big to inline later. In these
+ // circumstances it may be best not to inline C into B, but to inline B into
+ // its callers.
+ //
+ // This only applies to static and linkonce-ODR functions because those are
+ // expected to be available for inlining in the translation units where they
+ // are used. Thus we will always have the opportunity to make local inlining
+ // decisions. Importantly the linkonce-ODR linkage covers inline functions
+ // and templates in C++.
+ //
+ // FIXME: All of this logic should be sunk into getInlineCost. It relies on
+ // the internal implementation of the inline cost metrics rather than
+ // treating them as truly abstract units etc.
+ TotalSecondaryCost = 0;
+ // The candidate cost to be imposed upon the current function.
+ int CandidateCost = IC.getCost() - 1;
+ // If the caller has local linkage and can be inlined to all its callers, we
+ // can apply a huge negative bonus to TotalSecondaryCost.
+ bool ApplyLastCallBonus = Caller->hasLocalLinkage() && !Caller->hasOneUse();
+ // This bool tracks what happens if we DO inline C into B.
+ bool InliningPreventsSomeOuterInline = false;
+ unsigned NumCallerUsers = 0;
+ for (User *U : Caller->users()) {
+ CallBase *CS2 = dyn_cast<CallBase>(U);
+
+ // If this isn't a call to Caller (it could be some other sort
+ // of reference) skip it. Such references will prevent the caller
+ // from being removed.
+ if (!CS2 || CS2->getCalledFunction() != Caller) {
+ ApplyLastCallBonus = false;
+ continue;
+ }
+
+ InlineCost IC2 = GetInlineCost(*CS2);
+ ++NumCallerCallersAnalyzed;
+ if (!IC2) {
+ ApplyLastCallBonus = false;
+ continue;
+ }
+ if (IC2.isAlways())
+ continue;
+
+ // See if inlining of the original callsite would erase the cost delta of
+ // this callsite. We subtract off the penalty for the call instruction,
+ // which we would be deleting.
+ if (IC2.getCostDelta() <= CandidateCost) {
+ InliningPreventsSomeOuterInline = true;
+ TotalSecondaryCost += IC2.getCost();
+ NumCallerUsers++;
+ }
+ }
+
+ if (!InliningPreventsSomeOuterInline)
+ return false;
+
+ // If all outer calls to Caller would get inlined, the cost for the last
+ // one is set very low by getInlineCost, in anticipation that Caller will
+ // be removed entirely. We did not account for this above unless there
+ // is only one caller of Caller.
+ if (ApplyLastCallBonus)
+ TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
+
+ // If InlineDeferralScale is negative, then ignore the cost of primary
+ // inlining -- IC.getCost() multiplied by the number of callers to Caller.
+ if (InlineDeferralScale < 0)
+ return TotalSecondaryCost < IC.getCost();
+
+ int TotalCost = TotalSecondaryCost + IC.getCost() * NumCallerUsers;
+ int Allowance = IC.getCost() * InlineDeferralScale;
+ return TotalCost < Allowance;
+}
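// Worked example of the scaling above (illustrative numbers, assuming the
// default InlineDeferralScale of 2): for IC.getCost() == 100 and a single
// qualifying outer call site whose secondary cost is 50,
//   TotalCost = 50 + 100 * 1 = 150,  Allowance = 100 * 2 = 200,
// so 150 < 200 and the primary inlining is deferred. If that secondary cost
// were 150 instead, TotalCost = 250 would exceed the allowance and the
// deferral heuristic would let the primary inlining proceed.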
+
+namespace llvm {
+static std::basic_ostream<char> &operator<<(std::basic_ostream<char> &R,
+ const ore::NV &Arg) {
+ return R << Arg.Val;
+}
+
+template <class RemarkT>
+RemarkT &operator<<(RemarkT &&R, const InlineCost &IC) {
+ using namespace ore;
+ if (IC.isAlways()) {
+ R << "(cost=always)";
+ } else if (IC.isNever()) {
+ R << "(cost=never)";
+ } else {
+ R << "(cost=" << ore::NV("Cost", IC.getCost())
+ << ", threshold=" << ore::NV("Threshold", IC.getThreshold()) << ")";
+ }
+ if (const char *Reason = IC.getReason())
+ R << ": " << ore::NV("Reason", Reason);
+ return R;
+}
+} // namespace llvm
+
+std::string llvm::inlineCostStr(const InlineCost &IC) {
+ std::stringstream Remark;
+ Remark << IC;
+ return Remark.str();
+}
+
+void llvm::setInlineRemark(CallBase &CB, StringRef Message) {
+ if (!InlineRemarkAttribute)
+ return;
+
+ Attribute Attr = Attribute::get(CB.getContext(), "inline-remark", Message);
+ CB.addAttribute(AttributeList::FunctionIndex, Attr);
+}
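// Illustrative effect, not part of this patch: with -inline-remark-attribute,
// a call site the inliner rejected ends up carrying a string attribute in the
// textual IR, along the lines of (cost and threshold values are made up):
//   %r = call i32 @callee() #1
//   attributes #1 = { "inline-remark"="(cost=310, threshold=225)" }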
+
+/// Return the cost only if the inliner should attempt to inline at the given
+/// call site. If we return the cost, we will emit an optimization remark later
+/// using that cost, so we won't do so from this function. Return None if
+/// inlining should not be attempted.
+Optional<InlineCost>
+llvm::shouldInline(CallBase &CB,
+ function_ref<InlineCost(CallBase &CB)> GetInlineCost,
+ OptimizationRemarkEmitter &ORE, bool EnableDeferral) {
+ using namespace ore;
+
+ InlineCost IC = GetInlineCost(CB);
+ Instruction *Call = &CB;
+ Function *Callee = CB.getCalledFunction();
+ Function *Caller = CB.getCaller();
+
+ if (IC.isAlways()) {
+ LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC)
+ << ", Call: " << CB << "\n");
+ return IC;
+ }
+
+ if (!IC) {
+ LLVM_DEBUG(dbgs() << " NOT Inlining " << inlineCostStr(IC)
+ << ", Call: " << CB << "\n");
+ if (IC.isNever()) {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller) << " because it should never be inlined "
+ << IC;
+ });
+ } else {
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
+ << NV("Callee", Callee) << " not inlined into "
+ << NV("Caller", Caller) << " because too costly to inline "
+ << IC;
+ });
+ }
+ setInlineRemark(CB, inlineCostStr(IC));
+ return None;
+ }
+
+ int TotalSecondaryCost = 0;
+ if (EnableDeferral &&
+ shouldBeDeferred(Caller, IC, TotalSecondaryCost, GetInlineCost)) {
+ LLVM_DEBUG(dbgs() << " NOT Inlining: " << CB
+ << " Cost = " << IC.getCost()
+ << ", outer Cost = " << TotalSecondaryCost << '\n');
+ ORE.emit([&]() {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
+ Call)
+ << "Not inlining. Cost of inlining " << NV("Callee", Callee)
+ << " increases the cost of inlining " << NV("Caller", Caller)
+ << " in other contexts";
+ });
+ setInlineRemark(CB, "deferred");
+    // Deferral is not a failure of the cost analysis itself, so return None
+    // rather than a "never" InlineCost; the caller simply skips this site.
+ return None;
+ }
+
+ LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC) << ", Call: " << CB
+ << '\n');
+ return IC;
+}
+
+void llvm::addLocationToRemarks(OptimizationRemark &Remark, DebugLoc DLoc) {
+ if (!DLoc.get())
+ return;
+
+ bool First = true;
+ Remark << " at callsite ";
+ for (DILocation *DIL = DLoc.get(); DIL; DIL = DIL->getInlinedAt()) {
+ if (!First)
+ Remark << " @ ";
+ unsigned int Offset = DIL->getLine();
+ Offset -= DIL->getScope()->getSubprogram()->getLine();
+ unsigned int Discriminator = DIL->getBaseDiscriminator();
+ StringRef Name = DIL->getScope()->getSubprogram()->getLinkageName();
+ if (Name.empty())
+ Name = DIL->getScope()->getSubprogram()->getName();
+ Remark << Name << ":" << ore::NV("Line", Offset);
+ if (Discriminator)
+ Remark << "." << ore::NV("Disc", Discriminator);
+ First = false;
+ }
+}
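// Illustrative output, not part of this patch: for a remark location that was
// itself produced by inlining foo into bar, with the call 3 lines below foo's
// definition (base discriminator 1) and the inlined-at site 7 lines below
// bar's definition, the text appended above reads roughly
//   " at callsite foo:3.1 @ bar:7"
// where the numbers are line offsets from the enclosing subprogram, not
// absolute line numbers.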
+
+void llvm::emitInlinedInto(OptimizationRemarkEmitter &ORE, DebugLoc DLoc,
+ const BasicBlock *Block, const Function &Callee,
+ const Function &Caller, const InlineCost &IC,
+ bool ForProfileContext, const char *PassName) {
+ ORE.emit([&]() {
+ bool AlwaysInline = IC.isAlways();
+ StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined";
+ OptimizationRemark Remark(PassName ? PassName : DEBUG_TYPE, RemarkName,
+ DLoc, Block);
+ Remark << ore::NV("Callee", &Callee) << " inlined into ";
+ Remark << ore::NV("Caller", &Caller);
+ if (ForProfileContext)
+ Remark << " to match profiling context";
+ Remark << " with " << IC;
+ addLocationToRemarks(Remark, DLoc);
+ return Remark;
+ });
+}
diff --git a/llvm/lib/Analysis/InlineCost.cpp b/llvm/lib/Analysis/InlineCost.cpp
index de83a48aad16..33d714406d7f 100644
--- a/llvm/lib/Analysis/InlineCost.cpp
+++ b/llvm/lib/Analysis/InlineCost.cpp
@@ -24,9 +24,11 @@
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
@@ -38,6 +40,7 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -46,6 +49,15 @@ using namespace llvm;
STATISTIC(NumCallsAnalyzed, "Number of call sites analyzed");
+static cl::opt<int>
+ DefaultThreshold("inlinedefault-threshold", cl::Hidden, cl::init(225),
+ cl::ZeroOrMore,
+ cl::desc("Default amount of inlining to perform"));
+
+static cl::opt<bool> PrintInstructionComments(
+ "print-instruction-comments", cl::Hidden, cl::init(false),
+ cl::desc("Prints comments for instruction based on inline cost analysis"));
+
static cl::opt<int> InlineThreshold(
"inline-threshold", cl::Hidden, cl::init(225), cl::ZeroOrMore,
cl::desc("Control the amount of inlining to perform (default = 225)"));
@@ -92,8 +104,52 @@ static cl::opt<bool> OptComputeFullInlineCost(
cl::desc("Compute the full inline cost of a call site even when the cost "
"exceeds the threshold."));
+static cl::opt<bool> InlineCallerSupersetNoBuiltin(
+ "inline-caller-superset-nobuiltin", cl::Hidden, cl::init(true),
+ cl::ZeroOrMore,
+ cl::desc("Allow inlining when caller has a superset of callee's nobuiltin "
+ "attributes."));
+
+static cl::opt<bool> DisableGEPConstOperand(
+ "disable-gep-const-evaluation", cl::Hidden, cl::init(false),
+ cl::desc("Disables evaluation of GetElementPtr with constant operands"));
+
namespace {
class InlineCostCallAnalyzer;
+
+// This struct is used to store information about the inline cost of a
+// particular instruction.
+struct InstructionCostDetail {
+ int CostBefore = 0;
+ int CostAfter = 0;
+ int ThresholdBefore = 0;
+ int ThresholdAfter = 0;
+
+ int getThresholdDelta() const { return ThresholdAfter - ThresholdBefore; }
+
+ int getCostDelta() const { return CostAfter - CostBefore; }
+
+ bool hasThresholdChanged() const { return ThresholdAfter != ThresholdBefore; }
+};
+
+class InlineCostAnnotationWriter : public AssemblyAnnotationWriter {
+private:
+ InlineCostCallAnalyzer *const ICCA;
+
+public:
+ InlineCostAnnotationWriter(InlineCostCallAnalyzer *ICCA) : ICCA(ICCA) {}
+ virtual void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) override;
+};
+
+/// Carry out call site analysis, in order to evaluate inlinability.
+/// NOTE: the type is currently used as an implementation detail of functions
+/// such as llvm::getInlineCost. Note the function_ref constructor parameters -
+/// the expectation is that they come from the outer scope, from the wrapper
+/// functions. If we want to support constructing CallAnalyzer objects where
+/// lambdas are provided inline at construction, or where the object needs to
+/// otherwise survive past the scope of the provided functions, we need to
+/// revisit the argument types.
class CallAnalyzer : public InstVisitor<CallAnalyzer, bool> {
typedef InstVisitor<CallAnalyzer, bool> Base;
friend class InstVisitor<CallAnalyzer, bool>;
@@ -104,10 +160,10 @@ protected:
const TargetTransformInfo &TTI;
/// Getter for the cache of @llvm.assume intrinsics.
- std::function<AssumptionCache &(Function &)> &GetAssumptionCache;
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
/// Getter for BlockFrequencyInfo
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI;
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
/// Profile summary information.
ProfileSummaryInfo *PSI;
@@ -130,11 +186,16 @@ protected:
/// Called after a basic block was analyzed.
virtual void onBlockAnalyzed(const BasicBlock *BB) {}
+ /// Called before an instruction was analyzed
+ virtual void onInstructionAnalysisStart(const Instruction *I) {}
+
+ /// Called after an instruction was analyzed
+ virtual void onInstructionAnalysisFinish(const Instruction *I) {}
+
/// Called at the end of the analysis of the callsite. Return the outcome of
/// the analysis, i.e. 'InlineResult(true)' if the inlining may happen, or
/// the reason it can't.
- virtual InlineResult finalizeAnalysis() { return true; }
-
+ virtual InlineResult finalizeAnalysis() { return InlineResult::success(); }
/// Called when we're about to start processing a basic block, and every time
/// we are done processing an instruction. Return true if there is no point in
/// continuing the analysis (e.g. we've determined already the call site is
@@ -145,8 +206,7 @@ protected:
/// contexts propagated). It checks callsite-specific information. Return a
/// reason analysis can't continue if that's the case, or 'true' if it may
/// continue.
- virtual InlineResult onAnalysisStart() { return true; }
-
+ virtual InlineResult onAnalysisStart() { return InlineResult::success(); }
/// Called if the analysis engine decides SROA cannot be done for the given
/// alloca.
virtual void onDisableSROA(AllocaInst *Arg) {}
@@ -187,7 +247,7 @@ protected:
/// Called to account for any other instruction not specifically accounted
/// for.
- virtual void onCommonInstructionSimplification() {}
+ virtual void onMissedSimplification() {}
/// Start accounting potential benefits due to SROA for the given alloca.
virtual void onInitializeSROAArg(AllocaInst *Arg) {}
@@ -236,9 +296,7 @@ protected:
DenseMap<Value *, AllocaInst *> SROAArgValues;
/// Keep track of Allocas for which we believe we may get SROA optimization.
- /// We don't delete entries in SROAArgValue because we still want
- /// isAllocaDerivedArg to function correctly.
- DenseSet<AllocaInst *> EnabledSROAArgValues;
+ DenseSet<AllocaInst *> EnabledSROAAllocas;
/// Keep track of values which map to a pointer base and constant offset.
DenseMap<Value *, std::pair<Value *, APInt>> ConstantOffsetPtrs;
@@ -258,8 +316,7 @@ protected:
AllocaInst *getSROAArgForValueOrNull(Value *V) const {
auto It = SROAArgValues.find(V);
- if (It == SROAArgValues.end() ||
- EnabledSROAArgValues.count(It->second) == 0)
+ if (It == SROAArgValues.end() || EnabledSROAAllocas.count(It->second) == 0)
return nullptr;
return It->second;
}
@@ -337,17 +394,24 @@ protected:
bool visitUnreachableInst(UnreachableInst &I);
public:
- CallAnalyzer(const TargetTransformInfo &TTI,
- std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
- ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE,
- Function &Callee, CallBase &Call)
+ CallAnalyzer(
+ Function &Callee, CallBase &Call, const TargetTransformInfo &TTI,
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+ ProfileSummaryInfo *PSI = nullptr,
+ OptimizationRemarkEmitter *ORE = nullptr)
: TTI(TTI), GetAssumptionCache(GetAssumptionCache), GetBFI(GetBFI),
PSI(PSI), F(Callee), DL(F.getParent()->getDataLayout()), ORE(ORE),
CandidateCall(Call), EnableLoadElimination(true) {}
InlineResult analyze();
+ Optional<Constant*> getSimplifiedValue(Instruction *I) {
+ if (SimplifiedValues.find(I) != SimplifiedValues.end())
+ return SimplifiedValues[I];
+ return None;
+ }
+
// Keep a bunch of stats about the cost savings found so we can print them
// out when debugging.
unsigned NumConstantArgs = 0;
@@ -375,6 +439,11 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
/// Tunable parameters that control the analysis.
const InlineParams &Params;
+ // This DenseMap stores the delta change in cost and threshold after
+ // accounting for the given instruction. The map is filled only with the
+ // flag PrintInstructionComments on.
+ DenseMap<const Instruction *, InstructionCostDetail> InstructionCostDetailMap;
+
/// Upper bound for the inlining cost. Bonuses are being applied to account
/// for speculative "expected profit" of the inlining decision.
int Threshold = 0;
@@ -382,6 +451,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
/// Attempt to evaluate indirect calls to boost its inline cost.
const bool BoostIndirectCalls;
+ /// Ignore the threshold when finalizing analysis.
+ const bool IgnoreThreshold;
+
/// Inlining cost measured in abstract units, accounts for all the
/// instructions expected to be executed for a given function invocation.
/// Instructions that are statically proven to be dead based on call-site
@@ -456,9 +528,9 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
InlineConstants::IndirectCallThreshold;
/// FIXME: if InlineCostCallAnalyzer is derived from, this may need
/// to instantiate the derived class.
- InlineCostCallAnalyzer CA(TTI, GetAssumptionCache, GetBFI, PSI, ORE, *F,
- Call, IndirectCallParams, false);
- if (CA.analyze()) {
+ InlineCostCallAnalyzer CA(*F, Call, IndirectCallParams, TTI,
+ GetAssumptionCache, GetBFI, PSI, ORE, false);
+ if (CA.analyze().isSuccess()) {
// We were able to inline the indirect call! Subtract the cost from the
// threshold to get the bonus we want to apply, but don't go below zero.
Cost -= std::max(0, CA.getThreshold() - CA.getCost());
@@ -507,7 +579,7 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
addCost(SwitchCost, (int64_t)CostUpperBound);
}
- void onCommonInstructionSimplification() override {
+ void onMissedSimplification() override {
addCost(InlineConstants::InstrCost);
}
@@ -515,7 +587,6 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
assert(Arg != nullptr &&
"Should not initialize SROA costs for null value.");
SROAArgCosts[Arg] = 0;
- EnabledSROAArgValues.insert(Arg);
}
void onAggregateSROAUse(AllocaInst *SROAArg) override {
@@ -538,6 +609,25 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
SingleBB = false;
}
}
+
+ void onInstructionAnalysisStart(const Instruction *I) override {
+ // This function is called to store the initial cost of inlining before
+ // the given instruction was assessed.
+ if (!PrintInstructionComments)
+ return;
+ InstructionCostDetailMap[I].CostBefore = Cost;
+ InstructionCostDetailMap[I].ThresholdBefore = Threshold;
+ }
+
+ void onInstructionAnalysisFinish(const Instruction *I) override {
+ // This function is called to find new values of cost and threshold after
+ // the instruction has been assessed.
+ if (!PrintInstructionComments)
+ return;
+ InstructionCostDetailMap[I].CostAfter = Cost;
+ InstructionCostDetailMap[I].ThresholdAfter = Threshold;
+ }
+
InlineResult finalizeAnalysis() override {
// Loops generally act a lot like calls in that they act like barriers to
// movement, require a certain amount of setup, etc. So when optimising for
@@ -566,12 +656,14 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
else if (NumVectorInstructions <= NumInstructions / 2)
Threshold -= VectorBonus / 2;
- return Cost < std::max(1, Threshold);
+ if (IgnoreThreshold || Cost < std::max(1, Threshold))
+ return InlineResult::success();
+ return InlineResult::failure("Cost over threshold.");
}
bool shouldStop() override {
// Bail out the moment we cross the threshold. This means we'll under-count
// the cost, but only when undercounting doesn't matter.
- return Cost >= Threshold && !ComputeFullInlineCost;
+ return !IgnoreThreshold && Cost >= Threshold && !ComputeFullInlineCost;
}
void onLoadEliminationOpportunity() override {
@@ -618,25 +710,42 @@ class InlineCostCallAnalyzer final : public CallAnalyzer {
// Check if we're done. This can happen due to bonuses and penalties.
if (Cost >= Threshold && !ComputeFullInlineCost)
- return "high cost";
+ return InlineResult::failure("high cost");
- return true;
+ return InlineResult::success();
}
public:
InlineCostCallAnalyzer(
+ Function &Callee, CallBase &Call, const InlineParams &Params,
const TargetTransformInfo &TTI,
- std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> &GetBFI,
- ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE, Function &Callee,
- CallBase &Call, const InlineParams &Params, bool BoostIndirect = true)
- : CallAnalyzer(TTI, GetAssumptionCache, GetBFI, PSI, ORE, Callee, Call),
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI = nullptr,
+ ProfileSummaryInfo *PSI = nullptr,
+ OptimizationRemarkEmitter *ORE = nullptr, bool BoostIndirect = true,
+ bool IgnoreThreshold = false)
+ : CallAnalyzer(Callee, Call, TTI, GetAssumptionCache, GetBFI, PSI, ORE),
ComputeFullInlineCost(OptComputeFullInlineCost ||
Params.ComputeFullInlineCost || ORE),
Params(Params), Threshold(Params.DefaultThreshold),
- BoostIndirectCalls(BoostIndirect) {}
+ BoostIndirectCalls(BoostIndirect), IgnoreThreshold(IgnoreThreshold),
+ Writer(this) {}
+
+ /// Annotation Writer for instruction details
+ InlineCostAnnotationWriter Writer;
+
void dump();
+  // Prints the same analysis as dump(), but is available in all build
+  // configurations (dump() is only defined with asserts or LLVM_ENABLE_DUMP).
+ void print();
+
+ Optional<InstructionCostDetail> getCostDetails(const Instruction *I) {
+ if (InstructionCostDetailMap.find(I) != InstructionCostDetailMap.end())
+ return InstructionCostDetailMap[I];
+ return None;
+ }
+
virtual ~InlineCostCallAnalyzer() {}
int getThreshold() { return Threshold; }
int getCost() { return Cost; }
@@ -650,9 +759,35 @@ bool CallAnalyzer::isAllocaDerivedArg(Value *V) {
void CallAnalyzer::disableSROAForArg(AllocaInst *SROAArg) {
onDisableSROA(SROAArg);
- EnabledSROAArgValues.erase(SROAArg);
+ EnabledSROAAllocas.erase(SROAArg);
disableLoadElimination();
}
+
+void InlineCostAnnotationWriter::emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) {
+  // The cost of inlining the given instruction is always printed.
+  // The threshold delta is printed only when it is non-zero, which happens
+  // when we decided to give a bonus at a particular instruction.
+ Optional<InstructionCostDetail> Record = ICCA->getCostDetails(I);
+ if (!Record)
+ OS << "; No analysis for the instruction";
+ else {
+ OS << "; cost before = " << Record->CostBefore
+ << ", cost after = " << Record->CostAfter
+ << ", threshold before = " << Record->ThresholdBefore
+ << ", threshold after = " << Record->ThresholdAfter << ", ";
+ OS << "cost delta = " << Record->getCostDelta();
+ if (Record->hasThresholdChanged())
+ OS << ", threshold delta = " << Record->getThresholdDelta();
+ }
+ auto C = ICCA->getSimplifiedValue(const_cast<Instruction *>(I));
+ if (C) {
+ OS << ", simplified to ";
+ C.getValue()->print(OS, true);
+ }
+ OS << "\n";
+}
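// Illustrative annotation, not part of this patch: with
// -print-instruction-comments enabled, each instruction in the printed callee
// is preceded by a comment of roughly this shape (values are made up):
//   ; cost before = 10, cost after = 15, threshold before = 225, threshold after = 225, cost delta = 5
// with ", threshold delta = N" appearing only when a bonus changed the
// threshold at that instruction, and ", simplified to <constant>" only when
// the analyzer folded the instruction to a constant.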
+
/// If 'V' maps to a SROA candidate, disable SROA for it.
void CallAnalyzer::disableSROA(Value *V) {
if (auto *SROAArg = getSROAArgForValueOrNull(V)) {
@@ -711,7 +846,9 @@ bool CallAnalyzer::isGEPFree(GetElementPtrInst &GEP) {
Operands.push_back(SimpleOp);
else
Operands.push_back(*I);
- return TargetTransformInfo::TCC_Free == TTI.getUserCost(&GEP, Operands);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getUserCost(&GEP, Operands,
+ TargetTransformInfo::TCK_SizeAndLatency);
}
bool CallAnalyzer::visitAlloca(AllocaInst &I) {
@@ -720,10 +857,22 @@ bool CallAnalyzer::visitAlloca(AllocaInst &I) {
if (I.isArrayAllocation()) {
Constant *Size = SimplifiedValues.lookup(I.getArraySize());
if (auto *AllocSize = dyn_cast_or_null<ConstantInt>(Size)) {
+ // Sometimes a dynamic alloca could be converted into a static alloca
+ // after this constant prop, and become a huge static alloca on an
+      // unconditional CFG path. Avoid inlining when the resulting static
+      // alloca would exceed the size threshold below.
+ // FIXME: If the threshold is removed or lowered too much, we could end up
+ // being too pessimistic and prevent inlining non-problematic code. This
+ // could result in unintended perf regressions. A better overall strategy
+ // is needed to track stack usage during inlining.
Type *Ty = I.getAllocatedType();
AllocatedSize = SaturatingMultiplyAdd(
AllocSize->getLimitedValue(), DL.getTypeAllocSize(Ty).getFixedSize(),
AllocatedSize);
+ if (AllocatedSize > InlineConstants::MaxSimplifiedDynamicAllocaToInline) {
+ HasDynamicAlloca = true;
+ return false;
+ }
return Base::visitAlloca(I);
}
}
@@ -874,6 +1023,16 @@ bool CallAnalyzer::visitGetElementPtr(GetElementPtrInst &I) {
return true;
};
+ if (!DisableGEPConstOperand)
+ if (simplifyInstruction(I, [&](SmallVectorImpl<Constant *> &COps) {
+ SmallVector<Constant *, 2> Indices;
+          for (unsigned int Index = 1; Index < COps.size(); ++Index)
+            Indices.push_back(COps[Index]);
+ return ConstantExpr::getGetElementPtr(I.getSourceElementType(), COps[0],
+ Indices, I.isInBounds());
+ }))
+ return true;
+
if ((I.isInBounds() && canFoldInboundsGEP(I)) || IsGEPOffsetConstant(I)) {
if (SROAArg)
SROAArgValues[&I] = SROAArg;
@@ -959,7 +1118,8 @@ bool CallAnalyzer::visitPtrToInt(PtrToIntInst &I) {
if (auto *SROAArg = getSROAArgForValueOrNull(I.getOperand(0)))
SROAArgValues[&I] = SROAArg;
- return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
}
bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
@@ -983,7 +1143,8 @@ bool CallAnalyzer::visitIntToPtr(IntToPtrInst &I) {
if (auto *SROAArg = getSROAArgForValueOrNull(Op))
SROAArgValues[&I] = SROAArg;
- return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
}
bool CallAnalyzer::visitCastInst(CastInst &I) {
@@ -993,7 +1154,8 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
}))
return true;
- // Disable SROA in the face of arbitrary casts we don't whitelist elsewhere.
+ // Disable SROA in the face of arbitrary casts we don't explicitly list
+ // elsewhere.
disableSROA(I.getOperand(0));
// If this is a floating-point cast, and the target says this operation
@@ -1013,7 +1175,8 @@ bool CallAnalyzer::visitCastInst(CastInst &I) {
break;
}
- return TargetTransformInfo::TCC_Free == TTI.getUserCost(&I);
+ return TargetTransformInfo::TCC_Free ==
+ TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
}
bool CallAnalyzer::visitUnaryInstruction(UnaryInstruction &I) {
@@ -1085,7 +1248,7 @@ bool InlineCostCallAnalyzer::isColdCallSite(CallBase &Call,
// If global profile summary is available, then callsite's coldness is
// determined based on that.
if (PSI && PSI->hasProfileSummary())
- return PSI->isColdCallSite(CallSite(&Call), CallerBFI);
+ return PSI->isColdCallSite(Call, CallerBFI);
// Otherwise we need BFI to be available.
if (!CallerBFI)
@@ -1109,8 +1272,7 @@ InlineCostCallAnalyzer::getHotCallSiteThreshold(CallBase &Call,
// If global profile summary is available, then callsite's hotness is
// determined based on that.
- if (PSI && PSI->hasProfileSummary() &&
- PSI->isHotCallSite(CallSite(&Call), CallerBFI))
+ if (PSI && PSI->hasProfileSummary() && PSI->isHotCallSite(Call, CallerBFI))
return Params.HotCallSiteThreshold;
// Otherwise we need BFI to be available and to have a locally hot callsite
@@ -1200,7 +1362,7 @@ void InlineCostCallAnalyzer::updateThreshold(CallBase &Call, Function &Callee) {
// Callsite hotness and coldness can be determined if sample profile is
// used (which adds hotness metadata to calls) or if caller's
// BlockFrequencyInfo is available.
- BlockFrequencyInfo *CallerBFI = GetBFI ? &((*GetBFI)(*Caller)) : nullptr;
+ BlockFrequencyInfo *CallerBFI = GetBFI ? &(GetBFI(*Caller)) : nullptr;
auto HotCallSiteThreshold = getHotCallSiteThreshold(Call, CallerBFI);
if (!Caller->hasOptSize() && HotCallSiteThreshold) {
LLVM_DEBUG(dbgs() << "Hot callsite.\n");
@@ -1667,7 +1829,7 @@ bool CallAnalyzer::visitSwitchInst(SwitchInst &SI) {
// does not (yet) fire.
unsigned JumpTableSize = 0;
- BlockFrequencyInfo *BFI = GetBFI ? &((*GetBFI)(F)) : nullptr;
+ BlockFrequencyInfo *BFI = GetBFI ? &(GetBFI(F)) : nullptr;
unsigned NumCaseCluster =
TTI.getEstimatedNumberOfCaseClusters(SI, JumpTableSize, PSI, BFI);
@@ -1716,7 +1878,8 @@ bool CallAnalyzer::visitUnreachableInst(UnreachableInst &I) {
bool CallAnalyzer::visitInstruction(Instruction &I) {
// Some instructions are free. All of the free intrinsics can also be
// handled by SROA, etc.
- if (TargetTransformInfo::TCC_Free == TTI.getUserCost(&I))
+ if (TargetTransformInfo::TCC_Free ==
+ TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency))
return true;
// We found something we don't understand or can't handle. Mark any SROA-able
@@ -1761,33 +1924,36 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
// all of the per-instruction logic. The visit tree returns true if we
// consumed the instruction in any way, and false if the instruction's base
// cost should count against inlining.
+ onInstructionAnalysisStart(&*I);
+
if (Base::visit(&*I))
++NumInstructionsSimplified;
else
- onCommonInstructionSimplification();
+ onMissedSimplification();
+ onInstructionAnalysisFinish(&*I);
using namespace ore;
// If the visit this instruction detected an uninlinable pattern, abort.
- InlineResult IR;
+ InlineResult IR = InlineResult::success();
if (IsRecursiveCall)
- IR = "recursive";
+ IR = InlineResult::failure("recursive");
else if (ExposesReturnsTwice)
- IR = "exposes returns twice";
+ IR = InlineResult::failure("exposes returns twice");
else if (HasDynamicAlloca)
- IR = "dynamic alloca";
+ IR = InlineResult::failure("dynamic alloca");
else if (HasIndirectBr)
- IR = "indirect branch";
+ IR = InlineResult::failure("indirect branch");
else if (HasUninlineableIntrinsic)
- IR = "uninlinable intrinsic";
+ IR = InlineResult::failure("uninlinable intrinsic");
else if (InitsVargArgs)
- IR = "varargs";
- if (!IR) {
+ IR = InlineResult::failure("varargs");
+ if (!IR.isSuccess()) {
if (ORE)
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
&CandidateCall)
<< NV("Callee", &F) << " has uninlinable pattern ("
- << NV("InlineResult", IR.message)
+ << NV("InlineResult", IR.getFailureReason())
<< ") and cost is not fully computed";
});
return IR;
@@ -1798,22 +1964,25 @@ CallAnalyzer::analyzeBlock(BasicBlock *BB,
// the caller stack usage dramatically.
if (IsCallerRecursive &&
AllocatedSize > InlineConstants::TotalAllocaSizeRecursiveCaller) {
- InlineResult IR = "recursive and allocates too much stack space";
+ auto IR =
+ InlineResult::failure("recursive and allocates too much stack space");
if (ORE)
ORE->emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline",
&CandidateCall)
- << NV("Callee", &F) << " is " << NV("InlineResult", IR.message)
+ << NV("Callee", &F) << " is "
+ << NV("InlineResult", IR.getFailureReason())
<< ". Cost is not fully computed";
});
return IR;
}
if (shouldStop())
- return false;
+ return InlineResult::failure(
+ "Call site analysis is not favorable to inlining.");
}
- return true;
+ return InlineResult::success();
}
/// Compute the base pointer and cumulative constant offsets for V.
@@ -1904,11 +2073,11 @@ InlineResult CallAnalyzer::analyze() {
++NumCallsAnalyzed;
auto Result = onAnalysisStart();
- if (!Result)
+ if (!Result.isSuccess())
return Result;
if (F.empty())
- return true;
+ return InlineResult::success();
Function *Caller = CandidateCall.getFunction();
// Check if the caller function is recursive itself.
@@ -1937,6 +2106,7 @@ InlineResult CallAnalyzer::analyze() {
if (auto *SROAArg = dyn_cast<AllocaInst>(PtrArg)) {
SROAArgValues[&*FAI] = SROAArg;
onInitializeSROAArg(SROAArg);
+ EnabledSROAAllocas.insert(SROAArg);
}
}
}
@@ -1983,12 +2153,12 @@ InlineResult CallAnalyzer::analyze() {
if (BB->hasAddressTaken())
for (User *U : BlockAddress::get(&*BB)->users())
if (!isa<CallBrInst>(*U))
- return "blockaddress used outside of callbr";
+ return InlineResult::failure("blockaddress used outside of callbr");
// Analyze the cost of this block. If we blow through the threshold, this
// returns false, and we can bail on out.
InlineResult IR = analyzeBlock(BB, EphValues);
- if (!IR)
+ if (!IR.isSuccess())
return IR;
Instruction *TI = BB->getTerminator();
@@ -2034,15 +2204,15 @@ InlineResult CallAnalyzer::analyze() {
// inlining this would cause the removal of the caller (so the instruction
// is not actually duplicated, just moved).
if (!OnlyOneCallAndLocalLinkage && ContainsNoDuplicateCall)
- return "noduplicate";
+ return InlineResult::failure("noduplicate");
return finalizeAnalysis();
}
-#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-/// Dump stats about this call's analysis.
-LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
+void InlineCostCallAnalyzer::print() {
#define DEBUG_PRINT_STAT(x) dbgs() << " " #x ": " << x << "\n"
+ if (PrintInstructionComments)
+ F.print(dbgs(), &Writer);
DEBUG_PRINT_STAT(NumConstantArgs);
DEBUG_PRINT_STAT(NumConstantOffsetPtrArgs);
DEBUG_PRINT_STAT(NumAllocaArgs);
@@ -2058,14 +2228,27 @@ LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
DEBUG_PRINT_STAT(Threshold);
#undef DEBUG_PRINT_STAT
}
+
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+/// Dump stats about this call's analysis.
+LLVM_DUMP_METHOD void InlineCostCallAnalyzer::dump() {
+ print();
+}
#endif
/// Test that there are no attribute conflicts between Caller and Callee
/// that prevent inlining.
-static bool functionsHaveCompatibleAttributes(Function *Caller,
- Function *Callee,
- TargetTransformInfo &TTI) {
+static bool functionsHaveCompatibleAttributes(
+ Function *Caller, Function *Callee, TargetTransformInfo &TTI,
+ function_ref<const TargetLibraryInfo &(Function &)> &GetTLI) {
+ // Note that CalleeTLI must be a copy not a reference. The legacy pass manager
+ // caches the most recently created TLI in the TargetLibraryInfoWrapperPass
+ // object, and always returns the same object (which is overwritten on each
+ // GetTLI call). Therefore we copy the first result.
+ auto CalleeTLI = GetTLI(*Callee);
return TTI.areInlineCompatible(Caller, Callee) &&
+ GetTLI(*Caller).areInlineCompatible(CalleeTLI,
+ InlineCallerSupersetNoBuiltin) &&
AttributeFuncs::areInlineCompatible(*Caller, *Callee);
}
@@ -2104,23 +2287,46 @@ int llvm::getCallsiteCost(CallBase &Call, const DataLayout &DL) {
InlineCost llvm::getInlineCost(
CallBase &Call, const InlineParams &Params, TargetTransformInfo &CalleeTTI,
- std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
return getInlineCost(Call, Call.getCalledFunction(), Params, CalleeTTI,
- GetAssumptionCache, GetBFI, PSI, ORE);
+ GetAssumptionCache, GetTLI, GetBFI, PSI, ORE);
}
-InlineCost llvm::getInlineCost(
- CallBase &Call, Function *Callee, const InlineParams &Params,
- TargetTransformInfo &CalleeTTI,
- std::function<AssumptionCache &(Function &)> &GetAssumptionCache,
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI,
+Optional<int> llvm::getInliningCostEstimate(
+ CallBase &Call, TargetTransformInfo &CalleeTTI,
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
+ const InlineParams Params = {/* DefaultThreshold*/ 0,
+ /*HintThreshold*/ {},
+ /*ColdThreshold*/ {},
+ /*OptSizeThreshold*/ {},
+ /*OptMinSizeThreshold*/ {},
+ /*HotCallSiteThreshold*/ {},
+ /*LocallyHotCallSiteThreshold*/ {},
+ /*ColdCallSiteThreshold*/ {},
+ /*ComputeFullInlineCost*/ true,
+ /*EnableDeferral*/ true};
+
+ InlineCostCallAnalyzer CA(*Call.getCalledFunction(), Call, Params, CalleeTTI,
+ GetAssumptionCache, GetBFI, PSI, ORE, true,
+ /*IgnoreThreshold*/ true);
+ auto R = CA.analyze();
+ if (!R.isSuccess())
+ return None;
+ return CA.getCost();
+}
+
+Optional<InlineResult> llvm::getAttributeBasedInliningDecision(
+ CallBase &Call, Function *Callee, TargetTransformInfo &CalleeTTI,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI) {
// Cannot inline indirect calls.
if (!Callee)
- return llvm::InlineCost::getNever("indirect call");
+ return InlineResult::failure("indirect call");
// Never inline calls with byval arguments that does not have the alloca
// address space. Since byval arguments can be replaced with a copy to an
@@ -2132,59 +2338,80 @@ InlineCost llvm::getInlineCost(
if (Call.isByValArgument(I)) {
PointerType *PTy = cast<PointerType>(Call.getArgOperand(I)->getType());
if (PTy->getAddressSpace() != AllocaAS)
- return llvm::InlineCost::getNever("byval arguments without alloca"
- " address space");
+ return InlineResult::failure("byval arguments without alloca"
+ " address space");
}
// Calls to functions with always-inline attributes should be inlined
// whenever possible.
if (Call.hasFnAttr(Attribute::AlwaysInline)) {
auto IsViable = isInlineViable(*Callee);
- if (IsViable)
- return llvm::InlineCost::getAlways("always inline attribute");
- return llvm::InlineCost::getNever(IsViable.message);
+ if (IsViable.isSuccess())
+ return InlineResult::success();
+ return InlineResult::failure(IsViable.getFailureReason());
}
// Never inline functions with conflicting attributes (unless callee has
// always-inline attribute).
Function *Caller = Call.getCaller();
- if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI))
- return llvm::InlineCost::getNever("conflicting attributes");
+ if (!functionsHaveCompatibleAttributes(Caller, Callee, CalleeTTI, GetTLI))
+ return InlineResult::failure("conflicting attributes");
// Don't inline this call if the caller has the optnone attribute.
if (Caller->hasOptNone())
- return llvm::InlineCost::getNever("optnone attribute");
+ return InlineResult::failure("optnone attribute");
// Don't inline a function that treats null pointer as valid into a caller
// that does not have this attribute.
if (!Caller->nullPointerIsDefined() && Callee->nullPointerIsDefined())
- return llvm::InlineCost::getNever("nullptr definitions incompatible");
+ return InlineResult::failure("nullptr definitions incompatible");
// Don't inline functions which can be interposed at link-time.
if (Callee->isInterposable())
- return llvm::InlineCost::getNever("interposable");
+ return InlineResult::failure("interposable");
// Don't inline functions marked noinline.
if (Callee->hasFnAttribute(Attribute::NoInline))
- return llvm::InlineCost::getNever("noinline function attribute");
+ return InlineResult::failure("noinline function attribute");
// Don't inline call sites marked noinline.
if (Call.isNoInline())
- return llvm::InlineCost::getNever("noinline call site attribute");
+ return InlineResult::failure("noinline call site attribute");
+
+ return None;
+}
+
+InlineCost llvm::getInlineCost(
+ CallBase &Call, Function *Callee, const InlineParams &Params,
+ TargetTransformInfo &CalleeTTI,
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache,
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ ProfileSummaryInfo *PSI, OptimizationRemarkEmitter *ORE) {
+
+ auto UserDecision =
+ llvm::getAttributeBasedInliningDecision(Call, Callee, CalleeTTI, GetTLI);
+
+ if (UserDecision.hasValue()) {
+ if (UserDecision->isSuccess())
+ return llvm::InlineCost::getAlways("always inline attribute");
+ return llvm::InlineCost::getNever(UserDecision->getFailureReason());
+ }
LLVM_DEBUG(llvm::dbgs() << " Analyzing call of " << Callee->getName()
- << "... (caller:" << Caller->getName() << ")\n");
+ << "... (caller:" << Call.getCaller()->getName()
+ << ")\n");
- InlineCostCallAnalyzer CA(CalleeTTI, GetAssumptionCache, GetBFI, PSI, ORE,
- *Callee, Call, Params);
+ InlineCostCallAnalyzer CA(*Callee, Call, Params, CalleeTTI,
+ GetAssumptionCache, GetBFI, PSI, ORE);
InlineResult ShouldInline = CA.analyze();
LLVM_DEBUG(CA.dump());
// Check if there was a reason to force inlining or no inlining.
- if (!ShouldInline && CA.getCost() < CA.getThreshold())
- return InlineCost::getNever(ShouldInline.message);
- if (ShouldInline && CA.getCost() >= CA.getThreshold())
+ if (!ShouldInline.isSuccess() && CA.getCost() < CA.getThreshold())
+ return InlineCost::getNever(ShouldInline.getFailureReason());
+ if (ShouldInline.isSuccess() && CA.getCost() >= CA.getThreshold())
return InlineCost::getAlways("empty function");
return llvm::InlineCost::get(CA.getCost(), CA.getThreshold());
@@ -2195,14 +2422,14 @@ InlineResult llvm::isInlineViable(Function &F) {
for (Function::iterator BI = F.begin(), BE = F.end(); BI != BE; ++BI) {
// Disallow inlining of functions which contain indirect branches.
if (isa<IndirectBrInst>(BI->getTerminator()))
- return "contains indirect branches";
+ return InlineResult::failure("contains indirect branches");
// Disallow inlining of blockaddresses which are used by non-callbr
// instructions.
if (BI->hasAddressTaken())
for (User *U : BlockAddress::get(&*BI)->users())
if (!isa<CallBrInst>(*U))
- return "blockaddress used outside of callbr";
+ return InlineResult::failure("blockaddress used outside of callbr");
for (auto &II : *BI) {
CallBase *Call = dyn_cast<CallBase>(&II);
@@ -2211,13 +2438,13 @@ InlineResult llvm::isInlineViable(Function &F) {
// Disallow recursive calls.
if (&F == Call->getCalledFunction())
- return "recursive call";
+ return InlineResult::failure("recursive call");
// Disallow calls which expose returns-twice to a function not previously
// attributed as such.
if (!ReturnsTwice && isa<CallInst>(Call) &&
cast<CallInst>(Call)->canReturnTwice())
- return "exposes returns-twice attribute";
+ return InlineResult::failure("exposes returns-twice attribute");
if (Call->getCalledFunction())
switch (Call->getCalledFunction()->getIntrinsicID()) {
@@ -2226,20 +2453,23 @@ InlineResult llvm::isInlineViable(Function &F) {
case llvm::Intrinsic::icall_branch_funnel:
// Disallow inlining of @llvm.icall.branch.funnel because current
// backend can't separate call targets from call arguments.
- return "disallowed inlining of @llvm.icall.branch.funnel";
+ return InlineResult::failure(
+ "disallowed inlining of @llvm.icall.branch.funnel");
case llvm::Intrinsic::localescape:
// Disallow inlining functions that call @llvm.localescape. Doing this
// correctly would require major changes to the inliner.
- return "disallowed inlining of @llvm.localescape";
+ return InlineResult::failure(
+ "disallowed inlining of @llvm.localescape");
case llvm::Intrinsic::vastart:
// Disallow inlining of functions that initialize VarArgs with
// va_start.
- return "contains VarArgs initialized with va_start";
+ return InlineResult::failure(
+ "contains VarArgs initialized with va_start");
}
}
}
- return true;
+ return InlineResult::success();
}
// APIs to create InlineParams based on command line flags and/or other
@@ -2299,7 +2529,7 @@ InlineParams llvm::getInlineParams(int Threshold) {
}
InlineParams llvm::getInlineParams() {
- return getInlineParams(InlineThreshold);
+ return getInlineParams(DefaultThreshold);
}
// Compute the default threshold for inlining based on the opt level and the
@@ -2312,7 +2542,7 @@ static int computeThresholdFromOptLevels(unsigned OptLevel,
return InlineConstants::OptSizeThreshold;
if (SizeOptLevel == 2) // -Oz
return InlineConstants::OptMinSizeThreshold;
- return InlineThreshold;
+ return DefaultThreshold;
}
InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
@@ -2325,3 +2555,40 @@ InlineParams llvm::getInlineParams(unsigned OptLevel, unsigned SizeOptLevel) {
Params.LocallyHotCallSiteThreshold = LocallyHotCallSiteThreshold;
return Params;
}
+
+PreservedAnalyses
+InlineCostAnnotationPrinterPass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ PrintInstructionComments = true;
+  std::function<AssumptionCache &(Function &)> GetAssumptionCache =
+      [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ Module *M = F.getParent();
+ ProfileSummaryInfo PSI(*M);
+ DataLayout DL(M);
+ TargetTransformInfo TTI(DL);
+ // FIXME: Redesign the usage of InlineParams to expand the scope of this pass.
+  // In the current implementation, the exact InlineParams values don't matter
+  // much, as the pass serves only for verification of the inliner's decisions.
+ // We can add a flag which determines InlineParams for this run. Right now,
+ // the default InlineParams are used.
+ const InlineParams Params = llvm::getInlineParams();
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&I)) {
+ Function *CalledFunction = CI->getCalledFunction();
+ if (!CalledFunction || CalledFunction->isDeclaration())
+ continue;
+ OptimizationRemarkEmitter ORE(CalledFunction);
+ InlineCostCallAnalyzer ICCA(*CalledFunction, *CI, Params, TTI,
+ GetAssumptionCache, nullptr, &PSI, &ORE);
+ ICCA.analyze();
+ OS << " Analyzing call of " << CalledFunction->getName()
+ << "... (caller:" << CI->getCaller()->getName() << ")\n";
+ ICCA.print();
+ }
+ }
+ }
+ return PreservedAnalyses::all();
+}
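// Usage sketch, not part of this patch: assuming the pass is registered in
// PassRegistry.def under the name "print<inline-cost>" (registration lives
// outside llvm/lib/Analysis and is not shown here), the annotated dumps can be
// requested from opt with the new pass manager:
//   opt -passes='print<inline-cost>' -disable-output input.ll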
diff --git a/llvm/lib/Analysis/InlineFeaturesAnalysis.cpp b/llvm/lib/Analysis/InlineFeaturesAnalysis.cpp
new file mode 100644
index 000000000000..90f521bbaab4
--- /dev/null
+++ b/llvm/lib/Analysis/InlineFeaturesAnalysis.cpp
@@ -0,0 +1,41 @@
+//===- InlineFeaturesAnalysis.cpp - Feature extraction for ML Policies ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements an analysis extracting function features, which may be
+// used by ML-driven policies, for example.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/InlineFeaturesAnalysis.h"
+#include "llvm/IR/Instructions.h"
+
+using namespace llvm;
+
+AnalysisKey InlineFeaturesAnalysis::Key;
+
+InlineFeaturesAnalysis::Result
+InlineFeaturesAnalysis::run(const Function &F, FunctionAnalysisManager &FAM) {
+ Result Ret;
+ Ret.Uses = ((!F.hasLocalLinkage()) ? 1 : 0) + F.getNumUses();
+ for (const auto &BB : F) {
+ ++Ret.BasicBlockCount;
+ if (const auto *BI = dyn_cast<BranchInst>(BB.getTerminator())) {
+ if (BI->isConditional())
+ Ret.BlocksReachedFromConditionalInstruction += BI->getNumSuccessors();
+ } else if (const auto *SI = dyn_cast<SwitchInst>(BB.getTerminator()))
+ Ret.BlocksReachedFromConditionalInstruction +=
+ (SI->getNumCases() + (nullptr != SI->getDefaultDest()));
+ for (const auto &I : BB)
+ if (auto *CS = dyn_cast<CallBase>(&I)) {
+ const auto *Callee = CS->getCalledFunction();
+ if (Callee && !Callee->isIntrinsic() && !Callee->isDeclaration())
+ ++Ret.DirectCallsToDefinedFunctions;
+ }
+ }
+ return Ret;
+} \ No newline at end of file
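// Illustrative consumer, not part of this patch: any function-level pass with
// access to the FunctionAnalysisManager can read the extracted features. The
// helper name is made up; the fields are the Result members populated above,
// and raw_ostream requires llvm/Support/raw_ostream.h.
static void printInlineFeatures(Function &F, FunctionAnalysisManager &FAM,
                                raw_ostream &OS) {
  const auto &Feats = FAM.getResult<InlineFeaturesAnalysis>(F);
  OS << F.getName() << ": " << Feats.BasicBlockCount << " blocks, "
     << Feats.DirectCallsToDefinedFunctions
     << " direct calls to defined functions, " << Feats.Uses << " uses, "
     << Feats.BlocksReachedFromConditionalInstruction
     << " conditionally reached blocks\n";
}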
diff --git a/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp
new file mode 100644
index 000000000000..ebc59879d357
--- /dev/null
+++ b/llvm/lib/Analysis/InlineSizeEstimatorAnalysis.cpp
@@ -0,0 +1,299 @@
+//===- InlineSizeEstimatorAnalysis.cpp - IR to native size from ML model --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This implements feature and label extraction for offline supervised learning
+// of an IR to native size model.
+//
+//===----------------------------------------------------------------------===//
+#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h"
+
+#ifdef LLVM_HAVE_TF_API
+#include "llvm/Analysis/Utils/TFUtils.h"
+#endif
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/BasicBlock.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/MC/MCAsmLayout.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include <algorithm>
+#include <deque>
+
+using namespace llvm;
+
+AnalysisKey InlineSizeEstimatorAnalysis::Key;
+
+#define DEBUG_TYPE "inline-size-estimator"
+
+#ifdef LLVM_HAVE_TF_API
+cl::opt<std::string> TFIR2NativeModelPath(
+ "ml-inliner-ir2native-model", cl::Hidden,
+ cl::desc("Path to saved model evaluating native size from IR."));
+
+namespace {
+unsigned getMaxInstructionID() {
+#define LAST_OTHER_INST(NR) return NR;
+#include "llvm/IR/Instruction.def"
+}
+
+class IRToNativeSizeLearning {
+public:
+ enum class NamedFeatureIndex : size_t {
+ InitialSize,
+ Blocks,
+ Calls,
+ IsLocal,
+ IsLinkOnceODR,
+ IsLinkOnce,
+ Loops,
+ MaxLoopDepth,
+ MaxDomTreeLevel,
+
+ NumNamedFeatures
+ };
+ static const size_t NumNamedFeatures =
+ static_cast<size_t>(NamedFeatureIndex::NumNamedFeatures);
+ struct FunctionFeatures {
+ static std::vector<std::pair<size_t, size_t>>
+ ImportantInstructionSuccessions;
+ static const size_t FeatureCount;
+
+ std::array<int32_t, NumNamedFeatures> NamedFeatures = {0};
+ std::vector<int32_t> InstructionHistogram;
+ std::vector<int32_t> InstructionPairHistogram;
+
+ void fillTensor(int32_t *Ptr) const;
+ int32_t &operator[](NamedFeatureIndex Pos) {
+ return NamedFeatures[static_cast<size_t>(Pos)];
+ }
+ };
+ IRToNativeSizeLearning() = default;
+
+ static FunctionFeatures getFunctionFeatures(Function &F,
+ FunctionAnalysisManager &FAM);
+
+private:
+ /// Sort once the feature tuples.
+ struct SortFeatureTuples {
+ bool IsSorted = false;
+ SortFeatureTuples() {
+ std::sort(FunctionFeatures::ImportantInstructionSuccessions.begin(),
+ FunctionFeatures::ImportantInstructionSuccessions.end());
+ IsSorted = true;
+ }
+ };
+
+ static llvm::ManagedStatic<SortFeatureTuples> TupleSorter;
+
+ static bool ensureSortedTuples() { return TupleSorter->IsSorted; }
+};
+llvm::ManagedStatic<IRToNativeSizeLearning::SortFeatureTuples>
+ IRToNativeSizeLearning::TupleSorter;
+
+// This is a point-in-time choice: we determined that including these pairs of
+// consecutive instructions (in the IR layout available at inline time) as
+// features improves the model performance. We want to move away from manual
+// feature selection.
+// The vector is given in opcode pairs rather than labels because 1) labels
+// weren't readily available, and 2) the successions were hand-extracted.
+std::vector<std::pair<size_t, size_t>>
+ IRToNativeSizeLearning::FunctionFeatures::ImportantInstructionSuccessions =
+ {{1, 34}, {15, 27}, {53, 53}, {53, 34}, {1, 11}, {32, 2}, {2, 48},
+ {28, 48}, {1, 45}, {49, 32}, {57, 56}, {55, 53}, {1, 28}, {57, 34},
+ {1, 1}, {32, 28}, {32, 15}, {49, 28}, {53, 1}, {2, 53}, {48, 34},
+ {28, 53}, {2, 32}, {1, 40}, {32, 48}, {29, 56}, {56, 32}, {55, 56},
+ {48, 56}, {1, 31}, {33, 34}, {2, 28}, {1, 12}, {55, 1}, {31, 31},
+ {65, 1}, {33, 56}, {32, 32}, {13, 13}, {1, 26}, {13, 26}, {2, 1},
+ {1, 33}, {47, 49}, {64, 1}, {2, 38}, {34, 53}, {48, 2}, {55, 34},
+ {34, 32}, {1, 5}, {56, 13}, {2, 2}, {2, 49}, {33, 2}, {49, 39},
+ {56, 49}, {33, 49}, {32, 39}, {39, 57}, {29, 33}, {31, 34}, {32, 29},
+ {47, 15}, {13, 34}, {2, 33}, {32, 49}, {49, 34}, {56, 33}, {1, 30},
+ {33, 33}, {31, 33}, {2, 29}, {56, 7}, {32, 13}, {2, 55}, {56, 56},
+ {2, 34}, {1, 42}, {34, 49}, {1, 20}, {32, 33}, {1, 25}, {53, 28},
+ {1, 14}, {31, 49}, {28, 2}, {2, 13}, {2, 56}, {1, 32}, {56, 53},
+ {65, 65}, {33, 53}, {64, 64}, {13, 2}, {34, 33}, {1, 4}, {49, 2},
+ {1, 9}, {56, 1}, {33, 1}, {53, 57}, {32, 53}, {13, 56}, {32, 56},
+ {55, 55}, {1, 18}, {49, 56}, {34, 34}, {1, 7}, {56, 64}, {32, 1},
+ {13, 33}, {55, 28}, {49, 33}, {57, 57}, {56, 34}, {34, 56}, {33, 32},
+ {32, 40}, {1, 29}, {53, 2}, {34, 1}, {32, 34}, {49, 49}, {1, 24},
+ {40, 34}, {1, 13}, {38, 34}, {29, 2}, {34, 2}, {1, 39}, {1, 22},
+ {1, 27}, {49, 1}, {1, 8}, {56, 2}};
+
+// We have: 9 calculated features (the features here); 1 feature for each
+// instruction opcode; and 1 feature for each manually-identified sequence.
+// For the latter 2, we build a histogram: we count the number of
+// occurrences of each instruction opcode or succession of instructions,
+// respectively.
+// Note that instruction opcodes start from 1. For convenience, we also have an
+// always 0 feature for the '0' opcode, hence the extra 1.
+const size_t IRToNativeSizeLearning::FunctionFeatures::FeatureCount =
+ IRToNativeSizeLearning::FunctionFeatures::ImportantInstructionSuccessions
+ .size() +
+ getMaxInstructionID() + 1 + IRToNativeSizeLearning::NumNamedFeatures;
+
+size_t getSize(Function &F, TargetTransformInfo &TTI) {
+ size_t Ret = 0;
+ for (auto &BB : F)
+ for (auto &I : BB)
+ Ret += TTI.getInstructionCost(
+ &I, TargetTransformInfo::TargetCostKind::TCK_CodeSize);
+ return Ret;
+}
+
+size_t getSize(Function &F, FunctionAnalysisManager &FAM) {
+ auto &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ return getSize(F, TTI);
+}
+
+unsigned getMaxDominatorTreeDepth(const Function &F,
+ const DominatorTree &Tree) {
+ unsigned Ret = 0;
+ for (auto &BB : F)
+ if (auto *TN = Tree.getNode(&BB))
+ Ret = std::max(Ret, TN->getLevel());
+ return Ret;
+}
+} // namespace
+
+IRToNativeSizeLearning::FunctionFeatures
+IRToNativeSizeLearning::getFunctionFeatures(Function &F,
+ FunctionAnalysisManager &FAM) {
+ assert(ensureSortedTuples() && "expected lazy initialization");
+
+ auto &DomTree = FAM.getResult<DominatorTreeAnalysis>(F);
+ FunctionFeatures FF;
+ size_t InstrCount = getMaxInstructionID() + 1;
+ FF.InstructionHistogram.resize(InstrCount);
+
+ FF.InstructionPairHistogram.resize(
+ FunctionFeatures::ImportantInstructionSuccessions.size());
+
+ auto StartID = 0;
+ auto LastID = StartID;
+ auto getPairIndex = [](size_t a, size_t b) {
+ auto I =
+ std::find(FunctionFeatures::ImportantInstructionSuccessions.begin(),
+ FunctionFeatures::ImportantInstructionSuccessions.end(),
+ std::make_pair(a, b));
+ if (I == FunctionFeatures::ImportantInstructionSuccessions.end())
+ return -1;
+ return static_cast<int>(std::distance(
+ FunctionFeatures::ImportantInstructionSuccessions.begin(), I));
+ };
+
+ // We don't want debug calls, because they'd just add noise.
+ for (auto &BB : F) {
+ for (auto I = BB.instructionsWithoutDebug().begin(),
+ E = BB.instructionsWithoutDebug().end();
+ I != E; ++I) {
+ auto ID = I->getOpcode();
+
+ ++FF.InstructionHistogram[ID];
+ int PairIndex = getPairIndex(LastID, ID);
+ if (PairIndex >= 0)
+ ++FF.InstructionPairHistogram[PairIndex];
+ LastID = ID;
+ if (isa<CallBase>(*I))
+ ++FF[NamedFeatureIndex::Calls];
+ }
+ }
+
+ FF[NamedFeatureIndex::InitialSize] = getSize(F, FAM);
+ FF[NamedFeatureIndex::IsLocal] = F.hasLocalLinkage();
+ FF[NamedFeatureIndex::IsLinkOnceODR] = F.hasLinkOnceODRLinkage();
+ FF[NamedFeatureIndex::IsLinkOnce] = F.hasLinkOnceLinkage();
+ FF[NamedFeatureIndex::Blocks] =
+ std::distance(F.getBasicBlockList().begin(), F.getBasicBlockList().end());
+ auto &LI = FAM.getResult<LoopAnalysis>(F);
+ FF[NamedFeatureIndex::Loops] = std::distance(LI.begin(), LI.end());
+ for (auto &L : LI)
+ FF[NamedFeatureIndex::MaxLoopDepth] =
+ std::max(FF[NamedFeatureIndex::MaxLoopDepth],
+ static_cast<int32_t>(L->getLoopDepth()));
+ FF[NamedFeatureIndex::MaxDomTreeLevel] = getMaxDominatorTreeDepth(F, DomTree);
+ return FF;
+}
+
+void IRToNativeSizeLearning::FunctionFeatures::fillTensor(int32_t *Ptr) const {
+ std::copy(NamedFeatures.begin(), NamedFeatures.end(), Ptr);
+ Ptr += NamedFeatures.size();
+ std::copy(InstructionHistogram.begin(), InstructionHistogram.end(), Ptr);
+ Ptr += InstructionHistogram.size();
+ std::copy(InstructionPairHistogram.begin(), InstructionPairHistogram.end(),
+ Ptr);
+}
+
+bool InlineSizeEstimatorAnalysis::isEvaluatorRequested() {
+ return !TFIR2NativeModelPath.empty();
+}
+
+InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() {
+ if (!isEvaluatorRequested()) {
+ return;
+ }
+ std::vector<std::string> InputNames{"serving_default_input_1"};
+ std::vector<std::string> OutputName{"StatefulPartitionedCall"};
+ Evaluator = std::make_unique<TFModelEvaluator>(
+ TFIR2NativeModelPath.getValue().c_str(), InputNames, OutputName);
+ if (!Evaluator || !Evaluator->isValid()) {
+ Evaluator.reset();
+ return;
+ }
+ static const std::vector<int64_t> Dim{
+ 1, static_cast<int64_t>(
+ IRToNativeSizeLearning::FunctionFeatures::FeatureCount)};
+
+ Evaluator->initInput<int32_t>(0, Dim);
+}
+
+InlineSizeEstimatorAnalysis::Result
+InlineSizeEstimatorAnalysis::run(const Function &F,
+ FunctionAnalysisManager &FAM) {
+ if (!Evaluator)
+ return None;
+ auto Features = IRToNativeSizeLearning::getFunctionFeatures(
+ const_cast<Function &>(F), FAM);
+ int32_t *V = Evaluator->getInput<int32_t>(0);
+ Features.fillTensor(V);
+ auto ER = Evaluator->evaluate();
+ if (!ER)
+ return None;
+ float Ret = *ER->getTensorValue<float>(0);
+ if (Ret < 0.0)
+ Ret = 0.0;
+ return static_cast<size_t>(Ret);
+}
+
+InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() {}
+InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis(
+ InlineSizeEstimatorAnalysis &&Other)
+ : Evaluator(std::move(Other.Evaluator)) {}
+
+#else
+namespace llvm {
+class TFModelEvaluator {};
+} // namespace llvm
+InlineSizeEstimatorAnalysis::InlineSizeEstimatorAnalysis() {}
+InlineSizeEstimatorAnalysis ::InlineSizeEstimatorAnalysis(
+ InlineSizeEstimatorAnalysis &&) {}
+InlineSizeEstimatorAnalysis::~InlineSizeEstimatorAnalysis() {}
+InlineSizeEstimatorAnalysis::Result
+InlineSizeEstimatorAnalysis::run(const Function &F,
+ FunctionAnalysisManager &FAM) {
+ return None;
+}
+bool InlineSizeEstimatorAnalysis::isEvaluatorRequested() { return false; }
+#endif \ No newline at end of file
diff --git a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
index 415797d6a378..c26cdf2266da 100644
--- a/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
+++ b/llvm/lib/Analysis/InstructionPrecedenceTracking.cpp
@@ -59,7 +59,7 @@ bool InstructionPrecedenceTracking::isPreceededBySpecialInstruction(
const Instruction *Insn) {
const Instruction *MaybeFirstSpecial =
getFirstSpecialInstruction(Insn->getParent());
- return MaybeFirstSpecial && OI.dominates(MaybeFirstSpecial, Insn);
+ return MaybeFirstSpecial && MaybeFirstSpecial->comesBefore(Insn);
}
void InstructionPrecedenceTracking::fill(const BasicBlock *BB) {
@@ -104,18 +104,14 @@ void InstructionPrecedenceTracking::insertInstructionTo(const Instruction *Inst,
const BasicBlock *BB) {
if (isSpecialInstruction(Inst))
FirstSpecialInsts.erase(BB);
- OI.invalidateBlock(BB);
}
void InstructionPrecedenceTracking::removeInstruction(const Instruction *Inst) {
if (isSpecialInstruction(Inst))
FirstSpecialInsts.erase(Inst->getParent());
- OI.invalidateBlock(Inst->getParent());
}
void InstructionPrecedenceTracking::clear() {
- for (auto It : FirstSpecialInsts)
- OI.invalidateBlock(It.first);
FirstSpecialInsts.clear();
#ifndef NDEBUG
// The map should be valid after clearing (at least empty).
@@ -130,26 +126,7 @@ bool ImplicitControlFlowTracking::isSpecialInstruction(
// to avoid wrong assumptions of sort "if A is executed and B post-dominates
  // A, then B is also executed". This is not true if there is an implicit
// control flow instruction (e.g. a guard) between them.
- //
- // TODO: Currently, isGuaranteedToTransferExecutionToSuccessor returns false
- // for volatile stores and loads because they can trap. The discussion on
- // whether or not it is correct is still ongoing. We might want to get rid
- // of this logic in the future. Anyways, trapping instructions shouldn't
- // introduce implicit control flow, so we explicitly allow them here. This
- // must be removed once isGuaranteedToTransferExecutionToSuccessor is fixed.
- if (isGuaranteedToTransferExecutionToSuccessor(Insn))
- return false;
- if (isa<LoadInst>(Insn)) {
- assert(cast<LoadInst>(Insn)->isVolatile() &&
- "Non-volatile load should transfer execution to successor!");
- return false;
- }
- if (isa<StoreInst>(Insn)) {
- assert(cast<StoreInst>(Insn)->isVolatile() &&
- "Non-volatile store should transfer execution to successor!");
- return false;
- }
- return true;
+ return !isGuaranteedToTransferExecutionToSuccessor(Insn);
}
bool MemoryWriteTracking::isSpecialInstruction(
diff --git a/llvm/lib/Analysis/InstructionSimplify.cpp b/llvm/lib/Analysis/InstructionSimplify.cpp
index d7510c899101..0975a65d183e 100644
--- a/llvm/lib/Analysis/InstructionSimplify.cpp
+++ b/llvm/lib/Analysis/InstructionSimplify.cpp
@@ -222,7 +222,7 @@ static bool valueDominatesPHI(Value *V, PHINode *P, const DominatorTree *DT) {
// Otherwise, if the instruction is in the entry block and is not an invoke,
// then it obviously dominates all phi nodes.
if (I->getParent() == &I->getFunction()->getEntryBlock() &&
- !isa<InvokeInst>(I))
+ !isa<InvokeInst>(I) && !isa<CallBrInst>(I))
return true;
return false;
@@ -707,9 +707,8 @@ static Constant *stripAndComputeConstantOffsets(const DataLayout &DL, Value *&V,
Offset = Offset.sextOrTrunc(IntIdxTy->getIntegerBitWidth());
Constant *OffsetIntPtr = ConstantInt::get(IntIdxTy, Offset);
- if (V->getType()->isVectorTy())
- return ConstantVector::getSplat(V->getType()->getVectorNumElements(),
- OffsetIntPtr);
+ if (VectorType *VecTy = dyn_cast<VectorType>(V->getType()))
+ return ConstantVector::getSplat(VecTy->getElementCount(), OffsetIntPtr);
return OffsetIntPtr;
}
@@ -943,11 +942,12 @@ static Value *simplifyDivRem(Value *Op0, Value *Op1, bool IsDiv) {
if (match(Op1, m_Zero()))
return UndefValue::get(Ty);
- // If any element of a constant divisor vector is zero or undef, the whole op
- // is undef.
+  // If any element of a constant fixed-width vector divisor is zero or undef,
+  // the whole op is undef.
auto *Op1C = dyn_cast<Constant>(Op1);
- if (Op1C && Ty->isVectorTy()) {
- unsigned NumElts = Ty->getVectorNumElements();
+ auto *VTy = dyn_cast<FixedVectorType>(Ty);
+ if (Op1C && VTy) {
+ unsigned NumElts = VTy->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = Op1C->getAggregateElement(i);
if (Elt && (Elt->isNullValue() || isa<UndefValue>(Elt)))
@@ -1222,7 +1222,8 @@ static bool isUndefShift(Value *Amount) {
  // If all lanes of a vector shift are undefined, the whole shift is.
if (isa<ConstantVector>(C) || isa<ConstantDataVector>(C)) {
- for (unsigned I = 0, E = C->getType()->getVectorNumElements(); I != E; ++I)
+ for (unsigned I = 0, E = cast<VectorType>(C->getType())->getNumElements();
+ I != E; ++I)
if (!isUndefShift(C->getAggregateElement(I)))
return false;
return true;
@@ -1429,9 +1430,6 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
if (match(UnsignedICmp,
m_c_ICmp(UnsignedPred, m_Specific(A), m_Specific(B))) &&
ICmpInst::isUnsigned(UnsignedPred)) {
- if (UnsignedICmp->getOperand(0) != A)
- UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
-
// A >=/<= B || (A - B) != 0 <--> true
if ((UnsignedPred == ICmpInst::ICMP_UGE ||
UnsignedPred == ICmpInst::ICMP_ULE) &&
@@ -1461,9 +1459,6 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
// Y < A || Y == 0 --> Y < A iff B != 0
if (match(UnsignedICmp,
m_c_ICmp(UnsignedPred, m_Specific(Y), m_Specific(A)))) {
- if (UnsignedICmp->getOperand(0) != Y)
- UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
-
if (UnsignedPred == ICmpInst::ICMP_UGE && IsAnd &&
EqPred == ICmpInst::ICMP_NE &&
isKnownNonZero(B, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
@@ -1485,10 +1480,11 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
else
return nullptr;
- // X < Y && Y != 0 --> X < Y
- // X < Y || Y != 0 --> Y != 0
- if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE)
- return IsAnd ? UnsignedICmp : ZeroICmp;
+ // X > Y && Y == 0 --> Y == 0 iff X != 0
+ // X > Y || Y == 0 --> X > Y iff X != 0
+ if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
+ isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
+ return IsAnd ? ZeroICmp : UnsignedICmp;
// X <= Y && Y != 0 --> X <= Y iff X != 0
// X <= Y || Y != 0 --> Y != 0 iff X != 0
@@ -1496,17 +1492,21 @@ static Value *simplifyUnsignedRangeCheck(ICmpInst *ZeroICmp,
isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
return IsAnd ? UnsignedICmp : ZeroICmp;
+ // The transforms below here are expected to be handled more generally with
+ // simplifyAndOrOfICmpsWithLimitConst() or in InstCombine's
+ // foldAndOrOfICmpsWithConstEq(). If we are looking to trim optimizer overlap,
+ // these are candidates for removal.
+
+ // X < Y && Y != 0 --> X < Y
+ // X < Y || Y != 0 --> Y != 0
+ if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_NE)
+ return IsAnd ? UnsignedICmp : ZeroICmp;
+
// X >= Y && Y == 0 --> Y == 0
// X >= Y || Y == 0 --> X >= Y
if (UnsignedPred == ICmpInst::ICMP_UGE && EqPred == ICmpInst::ICMP_EQ)
return IsAnd ? ZeroICmp : UnsignedICmp;
- // X > Y && Y == 0 --> Y == 0 iff X != 0
- // X > Y || Y == 0 --> X > Y iff X != 0
- if (UnsignedPred == ICmpInst::ICMP_UGT && EqPred == ICmpInst::ICMP_EQ &&
- isKnownNonZero(X, Q.DL, /*Depth=*/0, Q.AC, Q.CxtI, Q.DT))
- return IsAnd ? ZeroICmp : UnsignedICmp;
-
// X < Y && Y == 0 --> false
if (UnsignedPred == ICmpInst::ICMP_ULT && EqPred == ICmpInst::ICMP_EQ &&
IsAnd)
@@ -1695,6 +1695,64 @@ static Value *simplifyAndOfICmpsWithAdd(ICmpInst *Op0, ICmpInst *Op1,
return nullptr;
}
+/// Try to eliminate compares with signed or unsigned min/max constants.
+static Value *simplifyAndOrOfICmpsWithLimitConst(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ bool IsAnd) {
+ // Canonicalize an equality compare as Cmp0.
+ if (Cmp1->isEquality())
+ std::swap(Cmp0, Cmp1);
+ if (!Cmp0->isEquality())
+ return nullptr;
+
+ // The equality compare must be against a constant. Convert the 'null' pointer
+ // constant to an integer zero value.
+ APInt MinMaxC;
+ const APInt *C;
+ if (match(Cmp0->getOperand(1), m_APInt(C)))
+ MinMaxC = *C;
+ else if (isa<ConstantPointerNull>(Cmp0->getOperand(1)))
+ MinMaxC = APInt::getNullValue(8);
+ else
+ return nullptr;
+
+ // The non-equality compare must include a common operand (X). Canonicalize
+ // the common operand as operand 0 (the predicate is swapped if the common
+ // operand was operand 1).
+ ICmpInst::Predicate Pred0 = Cmp0->getPredicate();
+ Value *X = Cmp0->getOperand(0);
+ ICmpInst::Predicate Pred1;
+ if (!match(Cmp1, m_c_ICmp(Pred1, m_Specific(X), m_Value())) ||
+ ICmpInst::isEquality(Pred1))
+ return nullptr;
+
+ // DeMorganize if this is 'or': P0 || P1 --> !P0 && !P1.
+ if (!IsAnd) {
+ Pred0 = ICmpInst::getInversePredicate(Pred0);
+ Pred1 = ICmpInst::getInversePredicate(Pred1);
+ }
+
+ // Normalize to unsigned compare and unsigned min/max value.
+ // Example for 8-bit: -128 + 128 -> 0; 127 + 128 -> 255
+ if (ICmpInst::isSigned(Pred1)) {
+ Pred1 = ICmpInst::getUnsignedPredicate(Pred1);
+ MinMaxC += APInt::getSignedMinValue(MinMaxC.getBitWidth());
+ }
+
+ // (X != MAX) && (X < Y) --> X < Y
+ // (X == MAX) || (X >= Y) --> X >= Y
+ if (MinMaxC.isMaxValue())
+ if (Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_ULT)
+ return Cmp1;
+
+ // (X != MIN) && (X > Y) --> X > Y
+ // (X == MIN) || (X <= Y) --> X <= Y
+ if (MinMaxC.isMinValue())
+ if (Pred0 == ICmpInst::ICMP_NE && Pred1 == ICmpInst::ICMP_UGT)
+ return Cmp1;
+
+ return nullptr;
+}
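The first pair of folds can be sanity-checked exhaustively over 8-bit unsigned values; the program below is a standalone check and not part of the patch.

#include <cassert>

int main() {
  // (X != MAX) && (X < Y)  is equivalent to  X < Y
  // (X == MAX) || (X >= Y) is equivalent to  X >= Y
  for (unsigned X = 0; X <= 0xff; ++X)
    for (unsigned Y = 0; Y <= 0xff; ++Y) {
      assert(((X != 0xffu) && (X < Y)) == (X < Y));
      assert(((X == 0xffu) || (X >= Y)) == (X >= Y));
    }
  return 0;
}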
+
static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1,
const SimplifyQuery &Q) {
if (Value *X = simplifyUnsignedRangeCheck(Op0, Op1, /*IsAnd=*/true, Q))
@@ -1710,6 +1768,9 @@ static Value *simplifyAndOfICmps(ICmpInst *Op0, ICmpInst *Op1,
if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, true))
return X;
+ if (Value *X = simplifyAndOrOfICmpsWithLimitConst(Op0, Op1, true))
+ return X;
+
if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, true))
return X;
@@ -1783,6 +1844,9 @@ static Value *simplifyOrOfICmps(ICmpInst *Op0, ICmpInst *Op1,
if (Value *X = simplifyAndOrOfICmpsWithConstants(Op0, Op1, false))
return X;
+ if (Value *X = simplifyAndOrOfICmpsWithLimitConst(Op0, Op1, false))
+ return X;
+
if (Value *X = simplifyAndOrOfICmpsWithZero(Op0, Op1, false))
return X;
@@ -2131,7 +2195,7 @@ static Value *SimplifyOrInst(Value *Op0, Value *Op1, const SimplifyQuery &Q,
return Constant::getAllOnesValue(Op1->getType());
// A | ~(A & ?) = -1
- if (match(Op1, m_Not(m_c_And(m_Specific(Op1), m_Value()))))
+ if (match(Op1, m_Not(m_c_And(m_Specific(Op0), m_Value()))))
return Constant::getAllOnesValue(Op0->getType());
Value *A, *B;
@@ -2347,10 +2411,9 @@ computePointerICmp(const DataLayout &DL, const TargetLibraryInfo *TLI,
RHS = RHS->stripPointerCasts();
// A non-null pointer is not equal to a null pointer.
- if (llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr,
- IIQ.UseInstrInfo) &&
- isa<ConstantPointerNull>(RHS) &&
- (Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE))
+ if (isa<ConstantPointerNull>(RHS) && ICmpInst::isEquality(Pred) &&
+ llvm::isKnownNonZero(LHS, DL, 0, nullptr, nullptr, nullptr,
+ IIQ.UseInstrInfo))
return ConstantInt::get(GetCompareTy(LHS),
!CmpInst::isTrueWhenEqual(Pred));
@@ -3218,6 +3281,30 @@ static Value *simplifyICmpWithMinMax(CmpInst::Predicate Pred, Value *LHS,
return nullptr;
}
+static Value *simplifyICmpWithDominatingAssume(CmpInst::Predicate Predicate,
+ Value *LHS, Value *RHS,
+ const SimplifyQuery &Q) {
+ // Gracefully handle instructions that have not been inserted yet.
+ if (!Q.AC || !Q.CxtI || !Q.CxtI->getParent())
+ return nullptr;
+
+ for (Value *AssumeBaseOp : {LHS, RHS}) {
+ for (auto &AssumeVH : Q.AC->assumptionsFor(AssumeBaseOp)) {
+ if (!AssumeVH)
+ continue;
+
+ CallInst *Assume = cast<CallInst>(AssumeVH);
+ if (Optional<bool> Imp =
+ isImpliedCondition(Assume->getArgOperand(0), Predicate, LHS, RHS,
+ Q.DL))
+ if (isValidAssumeForContext(Assume, Q.CxtI, Q.DT))
+ return ConstantInt::get(GetCompareTy(LHS), *Imp);
+ }
+ }
+
+ return nullptr;
+}
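From the source level, the effect of this hook is that an @llvm.assume which dominates the compare (for example, one emitted by clang for __builtin_assume) can now fold the compare during instsimplify. The function below is an illustrative example only; whether it folds in practice depends on isImpliedCondition recognizing the particular predicate/constant combination.

// Built with clang -O2, the assume lowers to @llvm.assume(i1 ...); since it
// dominates the return, the 'x > 5' compare is a candidate to fold to true.
bool alwaysTrue(int x) {
  __builtin_assume(x > 10);
  return x > 5;
}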
+
/// Given operands for an ICmpInst, see if we can fold the result.
/// If not, this returns null.
static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
@@ -3318,6 +3405,15 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
MaxRecurse-1))
return V;
}
+ // Fold (zext X) ule (sext X), (zext X) sge (sext X) to true.
+ else if (SExtInst *RI = dyn_cast<SExtInst>(RHS)) {
+ if (SrcOp == RI->getOperand(0)) {
+ if (Pred == ICmpInst::ICMP_ULE || Pred == ICmpInst::ICMP_SGE)
+ return ConstantInt::getTrue(ITy);
+ if (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_SLT)
+ return ConstantInt::getFalse(ITy);
+ }
+ }
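This fold (and its sext-on-the-left mirror added a few hunks below) can be verified exhaustively for i8 widened to i32; the program below is a standalone check, not part of the patch, and assumes the usual two's-complement conversion behavior.

#include <cassert>
#include <cstdint>

int main() {
  for (int V = 0; V <= 0xff; ++V) {
    uint8_t X = static_cast<uint8_t>(V);
    uint32_t Z = X;                                              // zext i8 -> i32
    uint32_t S = static_cast<uint32_t>(static_cast<int8_t>(X));  // sext i8 -> i32
    assert(Z <= S);                                              // (zext X) ule (sext X)
    assert(static_cast<int32_t>(Z) >= static_cast<int32_t>(S));  // (zext X) sge (sext X)
  }
  return 0;
}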
// Turn icmp (zext X), Cst into a compare of X and Cst if Cst is extended
// too. If not, then try to deduce the result of the comparison.
else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
@@ -3377,6 +3473,15 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
Q, MaxRecurse-1))
return V;
}
+ // Fold (sext X) uge (zext X), (sext X) sle (zext X) to true.
+ else if (ZExtInst *RI = dyn_cast<ZExtInst>(RHS)) {
+ if (SrcOp == RI->getOperand(0)) {
+ if (Pred == ICmpInst::ICMP_UGE || Pred == ICmpInst::ICMP_SLE)
+ return ConstantInt::getTrue(ITy);
+ if (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_SGT)
+ return ConstantInt::getFalse(ITy);
+ }
+ }
// Turn icmp (sext X), Cst into a compare of X and Cst if Cst is extended
// too. If not, then try to deduce the result of the comparison.
else if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS)) {
@@ -3452,6 +3557,9 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
if (Value *V = simplifyICmpWithMinMax(Pred, LHS, RHS, Q, MaxRecurse))
return V;
+ if (Value *V = simplifyICmpWithDominatingAssume(Pred, LHS, RHS, Q))
+ return V;
+
// Simplify comparisons of related pointers using a powerful, recursive
  // GEP-walk when we have target data available.
if (LHS->getType()->isPointerTy())
@@ -3487,7 +3595,8 @@ static Value *SimplifyICmpInst(unsigned Predicate, Value *LHS, Value *RHS,
SmallVector<Value *, 4> IndicesRHS(GRHS->idx_begin(), GRHS->idx_end());
Constant *NewRHS = ConstantExpr::getGetElementPtr(
GLHS->getSourceElementType(), Null, IndicesRHS);
- return ConstantExpr::getICmp(Pred, NewLHS, NewRHS);
+ Constant *NewICmp = ConstantExpr::getICmp(Pred, NewLHS, NewRHS);
+ return ConstantFoldConstant(NewICmp, Q.DL);
}
}
}
@@ -3622,9 +3731,9 @@ static Value *SimplifyFCmpInst(unsigned Predicate, Value *LHS, Value *RHS,
// Check comparison of [minnum/maxnum with constant] with other constant.
const APFloat *C2;
if ((match(LHS, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_APFloat(C2))) &&
- C2->compare(*C) == APFloat::cmpLessThan) ||
+ *C2 < *C) ||
(match(LHS, m_Intrinsic<Intrinsic::maxnum>(m_Value(), m_APFloat(C2))) &&
- C2->compare(*C) == APFloat::cmpGreaterThan)) {
+ *C2 > *C)) {
bool IsMaxNum =
cast<IntrinsicInst>(LHS)->getIntrinsicID() == Intrinsic::maxnum;
// The ordered relationship and minnum/maxnum guarantee that we do not
@@ -4009,11 +4118,47 @@ static Value *SimplifySelectInst(Value *Cond, Value *TrueVal, Value *FalseVal,
if (TrueVal == FalseVal)
return TrueVal;
- if (isa<UndefValue>(TrueVal)) // select ?, undef, X -> X
+ // If the true or false value is undef, we can fold to the other value as
+ // long as the other value isn't poison.
+ // select ?, undef, X -> X
+ if (isa<UndefValue>(TrueVal) &&
+ isGuaranteedNotToBeUndefOrPoison(FalseVal, Q.CxtI, Q.DT))
return FalseVal;
- if (isa<UndefValue>(FalseVal)) // select ?, X, undef -> X
+ // select ?, X, undef -> X
+ if (isa<UndefValue>(FalseVal) &&
+ isGuaranteedNotToBeUndefOrPoison(TrueVal, Q.CxtI, Q.DT))
return TrueVal;
+ // Deal with partial undef vector constants: select ?, VecC, VecC' --> VecC''
+ Constant *TrueC, *FalseC;
+ if (TrueVal->getType()->isVectorTy() && match(TrueVal, m_Constant(TrueC)) &&
+ match(FalseVal, m_Constant(FalseC))) {
+ unsigned NumElts = cast<VectorType>(TrueC->getType())->getNumElements();
+ SmallVector<Constant *, 16> NewC;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // Bail out on incomplete vector constants.
+ Constant *TEltC = TrueC->getAggregateElement(i);
+ Constant *FEltC = FalseC->getAggregateElement(i);
+ if (!TEltC || !FEltC)
+ break;
+
+ // If the elements match (undef or not), that value is the result. If only
+ // one element is undef, choose the defined element as the safe result.
+ if (TEltC == FEltC)
+ NewC.push_back(TEltC);
+ else if (isa<UndefValue>(TEltC) &&
+ isGuaranteedNotToBeUndefOrPoison(FEltC))
+ NewC.push_back(FEltC);
+ else if (isa<UndefValue>(FEltC) &&
+ isGuaranteedNotToBeUndefOrPoison(TEltC))
+ NewC.push_back(TEltC);
+ else
+ break;
+ }
+ if (NewC.size() == NumElts)
+ return ConstantVector::get(NewC);
+ }
+
if (Value *V =
simplifySelectWithICmpCond(Cond, TrueVal, FalseVal, Q, MaxRecurse))
return V;
@@ -4052,20 +4197,22 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
Type *LastType = GetElementPtrInst::getIndexedType(SrcTy, Ops.slice(1));
Type *GEPTy = PointerType::get(LastType, AS);
if (VectorType *VT = dyn_cast<VectorType>(Ops[0]->getType()))
- GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+ GEPTy = VectorType::get(GEPTy, VT->getElementCount());
else if (VectorType *VT = dyn_cast<VectorType>(Ops[1]->getType()))
- GEPTy = VectorType::get(GEPTy, VT->getNumElements());
+ GEPTy = VectorType::get(GEPTy, VT->getElementCount());
if (isa<UndefValue>(Ops[0]))
return UndefValue::get(GEPTy);
+ bool IsScalableVec = isa<ScalableVectorType>(SrcTy);
+
if (Ops.size() == 2) {
// getelementptr P, 0 -> P.
if (match(Ops[1], m_Zero()) && Ops[0]->getType() == GEPTy)
return Ops[0];
Type *Ty = SrcTy;
- if (Ty->isSized()) {
+ if (!IsScalableVec && Ty->isSized()) {
Value *P;
uint64_t C;
uint64_t TyAllocSize = Q.DL.getTypeAllocSize(Ty);
@@ -4113,7 +4260,7 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
}
}
- if (Q.DL.getTypeAllocSize(LastType) == 1 &&
+ if (!IsScalableVec && Q.DL.getTypeAllocSize(LastType) == 1 &&
all_of(Ops.slice(1).drop_back(1),
[](Value *Idx) { return match(Idx, m_Zero()); })) {
unsigned IdxWidth =
@@ -4145,9 +4292,7 @@ static Value *SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
auto *CE = ConstantExpr::getGetElementPtr(SrcTy, cast<Constant>(Ops[0]),
Ops.slice(1));
- if (auto *CEFolded = ConstantFoldConstant(CE, Q.DL))
- return CEFolded;
- return CE;
+ return ConstantFoldConstant(CE, Q.DL);
}
Value *llvm::SimplifyGEPInst(Type *SrcTy, ArrayRef<Value *> Ops,
@@ -4199,10 +4344,10 @@ Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx,
if (VecC && ValC && IdxC)
return ConstantFoldInsertElementInstruction(VecC, ValC, IdxC);
- // Fold into undef if index is out of bounds.
+  // For a fixed-length vector, fold into undef if the index is out of bounds.
if (auto *CI = dyn_cast<ConstantInt>(Idx)) {
- uint64_t NumElements = cast<VectorType>(Vec->getType())->getNumElements();
- if (CI->uge(NumElements))
+ if (isa<FixedVectorType>(Vec->getType()) &&
+ CI->uge(cast<FixedVectorType>(Vec->getType())->getNumElements()))
return UndefValue::get(Vec->getType());
}
@@ -4210,15 +4355,15 @@ Value *llvm::SimplifyInsertElementInst(Value *Vec, Value *Val, Value *Idx,
if (isa<UndefValue>(Idx))
return UndefValue::get(Vec->getType());
- // Inserting an undef scalar? Assume it is the same value as the existing
- // vector element.
- if (isa<UndefValue>(Val))
+ // If the scalar is undef, and there is no risk of propagating poison from the
+ // vector value, simplify to the vector value.
+ if (isa<UndefValue>(Val) && isGuaranteedNotToBeUndefOrPoison(Vec))
return Vec;
// If we are extracting a value from a vector, then inserting it into the same
// place, that's the input vector:
// insertelt Vec, (extractelt Vec, Idx), Idx --> Vec
- if (match(Val, m_ExtractElement(m_Specific(Vec), m_Specific(Idx))))
+ if (match(Val, m_ExtractElt(m_Specific(Vec), m_Specific(Idx))))
return Vec;
return nullptr;
@@ -4258,6 +4403,7 @@ Value *llvm::SimplifyExtractValueInst(Value *Agg, ArrayRef<unsigned> Idxs,
/// If not, this returns null.
static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQuery &,
unsigned) {
+ auto *VecVTy = cast<VectorType>(Vec->getType());
if (auto *CVec = dyn_cast<Constant>(Vec)) {
if (auto *CIdx = dyn_cast<Constant>(Idx))
return ConstantFoldExtractElementInstruction(CVec, CIdx);
@@ -4267,15 +4413,16 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQ
return Splat;
if (isa<UndefValue>(Vec))
- return UndefValue::get(Vec->getType()->getVectorElementType());
+ return UndefValue::get(VecVTy->getElementType());
}
// If extracting a specified index from the vector, see if we can recursively
// find a previously computed scalar that was inserted into the vector.
if (auto *IdxC = dyn_cast<ConstantInt>(Idx)) {
- if (IdxC->getValue().uge(Vec->getType()->getVectorNumElements()))
- // definitely out of bounds, thus undefined result
- return UndefValue::get(Vec->getType()->getVectorElementType());
+    // For a fixed-length vector, fold into undef if the index is out of bounds.
+ if (isa<FixedVectorType>(VecVTy) &&
+ IdxC->getValue().uge(VecVTy->getNumElements()))
+ return UndefValue::get(VecVTy->getElementType());
if (Value *Elt = findScalarElement(Vec, IdxC->getZExtValue()))
return Elt;
}
@@ -4283,7 +4430,7 @@ static Value *SimplifyExtractElementInst(Value *Vec, Value *Idx, const SimplifyQ
// An undef extract index can be arbitrarily chosen to be an out-of-range
// index value, which would result in the instruction being undef.
if (isa<UndefValue>(Idx))
- return UndefValue::get(Vec->getType()->getVectorElementType());
+ return UndefValue::get(VecVTy->getElementType());
return nullptr;
}
@@ -4380,7 +4527,7 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
return nullptr;
// The mask value chooses which source operand we need to look at next.
- int InVecNumElts = Op0->getType()->getVectorNumElements();
+ int InVecNumElts = cast<VectorType>(Op0->getType())->getNumElements();
int RootElt = MaskVal;
Value *SourceOp = Op0;
if (MaskVal >= InVecNumElts) {
@@ -4416,59 +4563,68 @@ static Value *foldIdentityShuffles(int DestElt, Value *Op0, Value *Op1,
return RootVec;
}
-static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
- Type *RetTy, const SimplifyQuery &Q,
+static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1,
+ ArrayRef<int> Mask, Type *RetTy,
+ const SimplifyQuery &Q,
unsigned MaxRecurse) {
- if (isa<UndefValue>(Mask))
+ if (all_of(Mask, [](int Elem) { return Elem == UndefMaskElem; }))
return UndefValue::get(RetTy);
- Type *InVecTy = Op0->getType();
- unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
- unsigned InVecNumElts = InVecTy->getVectorNumElements();
+ auto *InVecTy = cast<VectorType>(Op0->getType());
+ unsigned MaskNumElts = Mask.size();
+ ElementCount InVecEltCount = InVecTy->getElementCount();
+
+ bool Scalable = InVecEltCount.Scalable;
SmallVector<int, 32> Indices;
- ShuffleVectorInst::getShuffleMask(Mask, Indices);
- assert(MaskNumElts == Indices.size() &&
- "Size of Indices not same as number of mask elements?");
+ Indices.assign(Mask.begin(), Mask.end());
// Canonicalization: If mask does not select elements from an input vector,
// replace that input vector with undef.
- bool MaskSelects0 = false, MaskSelects1 = false;
- for (unsigned i = 0; i != MaskNumElts; ++i) {
- if (Indices[i] == -1)
- continue;
- if ((unsigned)Indices[i] < InVecNumElts)
- MaskSelects0 = true;
- else
- MaskSelects1 = true;
+ if (!Scalable) {
+ bool MaskSelects0 = false, MaskSelects1 = false;
+ unsigned InVecNumElts = InVecEltCount.Min;
+ for (unsigned i = 0; i != MaskNumElts; ++i) {
+ if (Indices[i] == -1)
+ continue;
+ if ((unsigned)Indices[i] < InVecNumElts)
+ MaskSelects0 = true;
+ else
+ MaskSelects1 = true;
+ }
+ if (!MaskSelects0)
+ Op0 = UndefValue::get(InVecTy);
+ if (!MaskSelects1)
+ Op1 = UndefValue::get(InVecTy);
}
- if (!MaskSelects0)
- Op0 = UndefValue::get(InVecTy);
- if (!MaskSelects1)
- Op1 = UndefValue::get(InVecTy);
auto *Op0Const = dyn_cast<Constant>(Op0);
auto *Op1Const = dyn_cast<Constant>(Op1);
- // If all operands are constant, constant fold the shuffle.
- if (Op0Const && Op1Const)
+  // If all operands are constant, constant fold the shuffle. This
+  // transformation depends on the value of the mask, which is not known at
+  // compile time for scalable vectors.
+ if (!Scalable && Op0Const && Op1Const)
return ConstantFoldShuffleVectorInstruction(Op0Const, Op1Const, Mask);
// Canonicalization: if only one input vector is constant, it shall be the
- // second one.
- if (Op0Const && !Op1Const) {
+  // second one. This transformation depends on the value of the mask, which
+  // is not known at compile time for scalable vectors.
+ if (!Scalable && Op0Const && !Op1Const) {
std::swap(Op0, Op1);
- ShuffleVectorInst::commuteShuffleMask(Indices, InVecNumElts);
+ ShuffleVectorInst::commuteShuffleMask(Indices, InVecEltCount.Min);
}
// A splat of an inserted scalar constant becomes a vector constant:
// shuf (inselt ?, C, IndexC), undef, <IndexC, IndexC...> --> <C, C...>
// NOTE: We may have commuted above, so analyze the updated Indices, not the
// original mask constant.
+  // NOTE: This transformation depends on the value of the mask, which is not
+  // known at compile time for scalable vectors.
Constant *C;
ConstantInt *IndexC;
- if (match(Op0, m_InsertElement(m_Value(), m_Constant(C),
- m_ConstantInt(IndexC)))) {
+ if (!Scalable && match(Op0, m_InsertElt(m_Value(), m_Constant(C),
+ m_ConstantInt(IndexC)))) {
// Match a splat shuffle mask of the insert index allowing undef elements.
int InsertIndex = IndexC->getZExtValue();
if (all_of(Indices, [InsertIndex](int MaskElt) {
@@ -4489,9 +4645,14 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
// value type is same as the input vectors' type.
if (auto *OpShuf = dyn_cast<ShuffleVectorInst>(Op0))
if (isa<UndefValue>(Op1) && RetTy == InVecTy &&
- OpShuf->getMask()->getSplatValue())
+ is_splat(OpShuf->getShuffleMask()))
return Op0;
+  // All remaining transformations depend on the value of the mask, which is
+  // not known at compile time for scalable vectors.
+ if (Scalable)
+ return nullptr;
+
// Don't fold a shuffle with undef mask elements. This may get folded in a
// better way using demanded bits or other analysis.
// TODO: Should we allow this?
@@ -4517,8 +4678,9 @@ static Value *SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
}
/// Given operands for a ShuffleVectorInst, fold the result or return null.
-Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1, Constant *Mask,
- Type *RetTy, const SimplifyQuery &Q) {
+Value *llvm::SimplifyShuffleVectorInst(Value *Op0, Value *Op1,
+ ArrayRef<int> Mask, Type *RetTy,
+ const SimplifyQuery &Q) {
return ::SimplifyShuffleVectorInst(Op0, Op1, Mask, RetTy, Q, RecursionLimit);
}
@@ -4562,14 +4724,24 @@ static Constant *propagateNaN(Constant *In) {
/// Perform folds that are common to any floating-point operation. This implies
/// transforms based on undef/NaN because the operation itself makes no
/// difference to the result.
-static Constant *simplifyFPOp(ArrayRef<Value *> Ops) {
- if (any_of(Ops, [](Value *V) { return isa<UndefValue>(V); }))
- return ConstantFP::getNaN(Ops[0]->getType());
-
- for (Value *V : Ops)
- if (match(V, m_NaN()))
+static Constant *simplifyFPOp(ArrayRef<Value *> Ops,
+ FastMathFlags FMF = FastMathFlags()) {
+ for (Value *V : Ops) {
+ bool IsNan = match(V, m_NaN());
+ bool IsInf = match(V, m_Inf());
+ bool IsUndef = match(V, m_Undef());
+
+    // If this operation has 'nnan' or 'ninf' and at least one disallowed
+    // operand (an undef operand can be chosen to be NaN/Inf), then the result
+    // of this operation is poison. That result can be relaxed to undef.
+ if (FMF.noNaNs() && (IsNan || IsUndef))
+ return UndefValue::get(V->getType());
+ if (FMF.noInfs() && (IsInf || IsUndef))
+ return UndefValue::get(V->getType());
+
+ if (IsUndef || IsNan)
return propagateNaN(cast<Constant>(V));
-
+ }
return nullptr;
}
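For callers, the change is visible through the FastMathFlags-taking entry points: with 'nnan' set, an fadd with a NaN (or undef) operand now simplifies to undef instead of propagating the NaN. A minimal sketch follows; Q stands for an already-constructed SimplifyQuery, and the exact header providing FastMathFlags may differ between LLVM versions, so treat the includes as approximate.

#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Operator.h"

// Illustrative: fadd nnan NaN, %x is expected to fold to undef here.
llvm::Value *foldNaNAdd(llvm::Value *X, const llvm::SimplifyQuery &Q) {
  llvm::FastMathFlags FMF;
  FMF.setNoNaNs();
  llvm::Constant *NaN = llvm::ConstantFP::getNaN(X->getType());
  return llvm::SimplifyFAddInst(NaN, X, FMF, Q);
}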
@@ -4580,7 +4752,7 @@ static Value *SimplifyFAddInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FAdd, Op0, Op1, Q))
return C;
- if (Constant *C = simplifyFPOp({Op0, Op1}))
+ if (Constant *C = simplifyFPOp({Op0, Op1}, FMF))
return C;
// fadd X, -0 ==> X
@@ -4627,7 +4799,7 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FSub, Op0, Op1, Q))
return C;
- if (Constant *C = simplifyFPOp({Op0, Op1}))
+ if (Constant *C = simplifyFPOp({Op0, Op1}, FMF))
return C;
// fsub X, +0 ==> X
@@ -4669,7 +4841,7 @@ static Value *SimplifyFSubInst(Value *Op0, Value *Op1, FastMathFlags FMF,
static Value *SimplifyFMAFMul(Value *Op0, Value *Op1, FastMathFlags FMF,
const SimplifyQuery &Q, unsigned MaxRecurse) {
- if (Constant *C = simplifyFPOp({Op0, Op1}))
+ if (Constant *C = simplifyFPOp({Op0, Op1}, FMF))
return C;
// fmul X, 1.0 ==> X
@@ -4736,7 +4908,7 @@ static Value *SimplifyFDivInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FDiv, Op0, Op1, Q))
return C;
- if (Constant *C = simplifyFPOp({Op0, Op1}))
+ if (Constant *C = simplifyFPOp({Op0, Op1}, FMF))
return C;
// X / 1.0 -> X
@@ -4781,7 +4953,7 @@ static Value *SimplifyFRemInst(Value *Op0, Value *Op1, FastMathFlags FMF,
if (Constant *C = foldOrCommuteConstant(Instruction::FRem, Op0, Op1, Q))
return C;
- if (Constant *C = simplifyFPOp({Op0, Op1}))
+ if (Constant *C = simplifyFPOp({Op0, Op1}, FMF))
return C;
// Unlike fdiv, the result of frem always matches the sign of the dividend.
@@ -4942,6 +5114,7 @@ static bool IsIdempotent(Intrinsic::ID ID) {
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
+ case Intrinsic::roundeven:
case Intrinsic::canonicalize:
return true;
}
@@ -5057,6 +5230,7 @@ static Value *simplifyUnaryIntrinsic(Function *F, Value *Op0,
case Intrinsic::trunc:
case Intrinsic::ceil:
case Intrinsic::round:
+ case Intrinsic::roundeven:
case Intrinsic::nearbyint:
case Intrinsic::rint: {
// floor (sitofp x) -> sitofp x
@@ -5288,7 +5462,12 @@ static Value *simplifyIntrinsic(CallBase *Call, const SimplifyQuery &Q) {
}
Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) {
- Value *Callee = Call->getCalledValue();
+ Value *Callee = Call->getCalledOperand();
+
+ // musttail calls can only be simplified if they are also DCEd.
+ // As we can't guarantee this here, don't simplify them.
+ if (Call->isMustTailCall())
+ return nullptr;
// call undef -> undef
// call null -> undef
@@ -5311,8 +5490,11 @@ Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) {
ConstantArgs.reserve(NumArgs);
for (auto &Arg : Call->args()) {
Constant *C = dyn_cast<Constant>(&Arg);
- if (!C)
+ if (!C) {
+ if (isa<MetadataAsValue>(Arg.get()))
+ continue;
return nullptr;
+ }
ConstantArgs.push_back(C);
}
@@ -5320,16 +5502,16 @@ Value *llvm::SimplifyCall(CallBase *Call, const SimplifyQuery &Q) {
}
/// Given operands for a Freeze, see if we can fold the result.
-static Value *SimplifyFreezeInst(Value *Op0) {
+static Value *SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) {
// Use a utility function defined in ValueTracking.
- if (llvm::isGuaranteedNotToBeUndefOrPoison(Op0))
+ if (llvm::isGuaranteedNotToBeUndefOrPoison(Op0, Q.CxtI, Q.DT))
return Op0;
// We have room for improvement.
return nullptr;
}
Value *llvm::SimplifyFreezeInst(Value *Op0, const SimplifyQuery &Q) {
- return ::SimplifyFreezeInst(Op0);
+ return ::SimplifyFreezeInst(Op0, Q);
}
/// See if we can compute a simplified version of this instruction.
@@ -5463,8 +5645,9 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
}
case Instruction::ShuffleVector: {
auto *SVI = cast<ShuffleVectorInst>(I);
- Result = SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
- SVI->getMask(), SVI->getType(), Q);
+ Result =
+ SimplifyShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
+ SVI->getShuffleMask(), SVI->getType(), Q);
break;
}
case Instruction::PHI:
@@ -5489,14 +5672,6 @@ Value *llvm::SimplifyInstruction(Instruction *I, const SimplifyQuery &SQ,
break;
}
- // In general, it is possible for computeKnownBits to determine all bits in a
- // value even when the operands are not all constants.
- if (!Result && I->getType()->isIntOrIntVectorTy()) {
- KnownBits Known = computeKnownBits(I, Q.DL, /*Depth*/ 0, Q.AC, I, Q.DT, ORE);
- if (Known.isConstant())
- Result = ConstantInt::get(I->getType(), Known.getConstant());
- }
-
/// If called on unreachable code, the above logic may report that the
/// instruction simplified to itself. Make life easier for users by
/// detecting that case here, returning a safe value instead.
diff --git a/llvm/lib/Analysis/LazyCallGraph.cpp b/llvm/lib/Analysis/LazyCallGraph.cpp
index ef31c1e0ba8c..efded17cef4e 100644
--- a/llvm/lib/Analysis/LazyCallGraph.cpp
+++ b/llvm/lib/Analysis/LazyCallGraph.cpp
@@ -15,8 +15,8 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Instruction.h"
@@ -99,8 +99,8 @@ LazyCallGraph::EdgeSequence &LazyCallGraph::Node::populateSlow() {
// safety of optimizing a direct call edge.
for (BasicBlock &BB : *F)
for (Instruction &I : BB) {
- if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction())
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction())
if (!Callee->isDeclaration())
if (Callees.insert(Callee).second) {
Visited.insert(Callee);
@@ -146,8 +146,11 @@ LLVM_DUMP_METHOD void LazyCallGraph::Node::dump() const {
static bool isKnownLibFunction(Function &F, TargetLibraryInfo &TLI) {
LibFunc LF;
- // Either this is a normal library function or a "vectorizable" function.
- return TLI.getLibFunc(F, LF) || TLI.isFunctionVectorizable(F.getName());
+ // Either this is a normal library function or a "vectorizable"
+ // function. Not using the VFDatabase here because this query
+ // is related only to libraries handled via the TLI.
+ return TLI.getLibFunc(F, LF) ||
+ TLI.isKnownVectorFunctionInLibrary(F.getName());
}
LazyCallGraph::LazyCallGraph(
@@ -211,6 +214,15 @@ LazyCallGraph::LazyCallGraph(LazyCallGraph &&G)
updateGraphPtrs();
}
+bool LazyCallGraph::invalidate(Module &, const PreservedAnalyses &PA,
+ ModuleAnalysisManager::Invalidator &) {
+ // Check whether the analysis, all analyses on functions, or the function's
+ // CFG have been preserved.
+ auto PAC = PA.getChecker<llvm::LazyCallGraphAnalysis>();
+ return !(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Module>>() ||
+ PAC.preservedSet<CFGAnalyses>());
+}
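The flip side of this invalidation check is a module pass that wants to keep the graph cached: marking the analysis (or the whole-module or CFG analysis sets) as preserved is what makes invalidate() return false. A minimal new-pass-manager sketch, with PreserveLCGPass as a made-up name and the transform body elided:

#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/IR/PassManager.h"

struct PreserveLCGPass : llvm::PassInfoMixin<PreserveLCGPass> {
  llvm::PreservedAnalyses run(llvm::Module &M, llvm::ModuleAnalysisManager &AM) {
    // ... a transform that does not add or remove functions or call edges ...
    llvm::PreservedAnalyses PA;
    PA.preserve<llvm::LazyCallGraphAnalysis>(); // keep LazyCallGraph across this pass
    return PA;
  }
};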
+
LazyCallGraph &LazyCallGraph::operator=(LazyCallGraph &&G) {
BPA = std::move(G.BPA);
NodeMap = std::move(G.NodeMap);
@@ -1553,6 +1565,21 @@ void LazyCallGraph::removeDeadFunction(Function &F) {
// allocators.
}
+void LazyCallGraph::addNewFunctionIntoSCC(Function &NewF, SCC &C) {
+ addNodeToSCC(C, createNode(NewF));
+}
+
+void LazyCallGraph::addNewFunctionIntoRefSCC(Function &NewF, RefSCC &RC) {
+ Node &N = createNode(NewF);
+
+ auto *C = createSCC(RC, SmallVector<Node *, 1>());
+ addNodeToSCC(*C, N);
+
+ auto Index = RC.SCCIndices.size();
+ RC.SCCIndices[C] = Index;
+ RC.SCCs.push_back(C);
+}
+
LazyCallGraph::Node &LazyCallGraph::insertInto(Function &F, Node *&MappedN) {
return *new (MappedN = BPA.Allocate()) Node(*this, F);
}
@@ -1567,6 +1594,21 @@ void LazyCallGraph::updateGraphPtrs() {
RC->G = this;
}
+LazyCallGraph::Node &LazyCallGraph::createNode(Function &F) {
+ assert(!lookup(F) && "node already exists");
+
+ Node &N = get(F);
+ NodeMap[&F] = &N;
+ N.DFSNumber = N.LowLink = -1;
+ N.populate();
+ return N;
+}
+
+void LazyCallGraph::addNodeToSCC(LazyCallGraph::SCC &C, Node &N) {
+ C.Nodes.push_back(&N);
+ SCCMap[&N] = &C;
+}
+
template <typename RootsT, typename GetBeginT, typename GetEndT,
typename GetNodeT, typename FormSCCCallbackT>
void LazyCallGraph::buildGenericSCCs(RootsT &&Roots, GetBeginT &&GetBegin,
@@ -1788,11 +1830,12 @@ LazyCallGraphDOTPrinterPass::LazyCallGraphDOTPrinterPass(raw_ostream &OS)
: OS(OS) {}
static void printNodeDOT(raw_ostream &OS, LazyCallGraph::Node &N) {
- std::string Name = "\"" + DOT::EscapeString(N.getFunction().getName()) + "\"";
+ std::string Name =
+ "\"" + DOT::EscapeString(std::string(N.getFunction().getName())) + "\"";
for (LazyCallGraph::Edge &E : N.populate()) {
OS << " " << Name << " -> \""
- << DOT::EscapeString(E.getFunction().getName()) << "\"";
+ << DOT::EscapeString(std::string(E.getFunction().getName())) << "\"";
if (!E.isCall()) // It is a ref edge.
OS << " [style=dashed,label=\"ref\"]";
OS << ";\n";
diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp
index bad2de9e5f5e..f5ffa7286b3b 100644
--- a/llvm/lib/Analysis/LazyValueInfo.cpp
+++ b/llvm/lib/Analysis/LazyValueInfo.cpp
@@ -96,9 +96,9 @@ static ValueLatticeElement intersect(const ValueLatticeElement &A,
const ValueLatticeElement &B) {
// Undefined is the strongest state. It means the value is known to be along
// an unreachable path.
- if (A.isUndefined())
+ if (A.isUnknown())
return A;
- if (B.isUndefined())
+ if (B.isUnknown())
return B;
// If we gave up for one, but got a useable fact from the other, use it.
@@ -121,11 +121,12 @@ static ValueLatticeElement intersect(const ValueLatticeElement &A,
// Intersect two constant ranges
ConstantRange Range =
- A.getConstantRange().intersectWith(B.getConstantRange());
- // Note: An empty range is implicitly converted to overdefined internally.
- // TODO: We could instead use Undefined here since we've proven a conflict
- // and thus know this path must be unreachable.
- return ValueLatticeElement::getRange(std::move(Range));
+ A.getConstantRange().intersectWith(B.getConstantRange());
+ // Note: An empty range is implicitly converted to unknown or undef depending
+ // on MayIncludeUndef internally.
+ return ValueLatticeElement::getRange(
+ std::move(Range), /*MayIncludeUndef=*/A.isConstantRangeIncludingUndef() |
+ B.isConstantRangeIncludingUndef());
}
//===----------------------------------------------------------------------===//
@@ -136,12 +137,9 @@ namespace {
/// A callback value handle updates the cache when values are erased.
class LazyValueInfoCache;
struct LVIValueHandle final : public CallbackVH {
- // Needs to access getValPtr(), which is protected.
- friend struct DenseMapInfo<LVIValueHandle>;
-
LazyValueInfoCache *Parent;
- LVIValueHandle(Value *V, LazyValueInfoCache *P)
+ LVIValueHandle(Value *V, LazyValueInfoCache *P = nullptr)
: CallbackVH(V), Parent(P) { }
void deleted() override;
@@ -155,89 +153,77 @@ namespace {
/// This is the cache kept by LazyValueInfo which
/// maintains information about queries across the clients' queries.
class LazyValueInfoCache {
- /// This is all of the cached block information for exactly one Value*.
- /// The entries are sorted by the BasicBlock* of the
- /// entries, allowing us to do a lookup with a binary search.
- /// Over-defined lattice values are recorded in OverDefinedCache to reduce
- /// memory overhead.
- struct ValueCacheEntryTy {
- ValueCacheEntryTy(Value *V, LazyValueInfoCache *P) : Handle(V, P) {}
- LVIValueHandle Handle;
- SmallDenseMap<PoisoningVH<BasicBlock>, ValueLatticeElement, 4> BlockVals;
+ /// This is all of the cached information for one basic block. It contains
+ /// the per-value lattice elements, as well as a separate set for
+ /// overdefined values to reduce memory usage.
+ struct BlockCacheEntry {
+ SmallDenseMap<AssertingVH<Value>, ValueLatticeElement, 4> LatticeElements;
+ SmallDenseSet<AssertingVH<Value>, 4> OverDefined;
};
- /// This tracks, on a per-block basis, the set of values that are
- /// over-defined at the end of that block.
- typedef DenseMap<PoisoningVH<BasicBlock>, SmallPtrSet<Value *, 4>>
- OverDefinedCacheTy;
- /// Keep track of all blocks that we have ever seen, so we
- /// don't spend time removing unused blocks from our caches.
- DenseSet<PoisoningVH<BasicBlock> > SeenBlocks;
+ /// Cached information per basic block.
+ DenseMap<PoisoningVH<BasicBlock>, std::unique_ptr<BlockCacheEntry>>
+ BlockCache;
+ /// Set of value handles used to erase values from the cache on deletion.
+ DenseSet<LVIValueHandle, DenseMapInfo<Value *>> ValueHandles;
+
+ const BlockCacheEntry *getBlockEntry(BasicBlock *BB) const {
+ auto It = BlockCache.find_as(BB);
+ if (It == BlockCache.end())
+ return nullptr;
+ return It->second.get();
+ }
+
+ BlockCacheEntry *getOrCreateBlockEntry(BasicBlock *BB) {
+ auto It = BlockCache.find_as(BB);
+ if (It == BlockCache.end())
+ It = BlockCache.insert({ BB, std::make_unique<BlockCacheEntry>() })
+ .first;
- /// This is all of the cached information for all values,
- /// mapped from Value* to key information.
- DenseMap<Value *, std::unique_ptr<ValueCacheEntryTy>> ValueCache;
- OverDefinedCacheTy OverDefinedCache;
+ return It->second.get();
+ }
+ void addValueHandle(Value *Val) {
+ auto HandleIt = ValueHandles.find_as(Val);
+ if (HandleIt == ValueHandles.end())
+ ValueHandles.insert({ Val, this });
+ }
public:
void insertResult(Value *Val, BasicBlock *BB,
const ValueLatticeElement &Result) {
- SeenBlocks.insert(BB);
+ BlockCacheEntry *Entry = getOrCreateBlockEntry(BB);
// Insert over-defined values into their own cache to reduce memory
// overhead.
if (Result.isOverdefined())
- OverDefinedCache[BB].insert(Val);
- else {
- auto It = ValueCache.find_as(Val);
- if (It == ValueCache.end()) {
- ValueCache[Val] = std::make_unique<ValueCacheEntryTy>(Val, this);
- It = ValueCache.find_as(Val);
- assert(It != ValueCache.end() && "Val was just added to the map!");
- }
- It->second->BlockVals[BB] = Result;
- }
- }
-
- bool isOverdefined(Value *V, BasicBlock *BB) const {
- auto ODI = OverDefinedCache.find(BB);
-
- if (ODI == OverDefinedCache.end())
- return false;
+ Entry->OverDefined.insert(Val);
+ else
+ Entry->LatticeElements.insert({ Val, Result });
- return ODI->second.count(V);
+ addValueHandle(Val);
}
- bool hasCachedValueInfo(Value *V, BasicBlock *BB) const {
- if (isOverdefined(V, BB))
- return true;
-
- auto I = ValueCache.find_as(V);
- if (I == ValueCache.end())
- return false;
-
- return I->second->BlockVals.count(BB);
- }
+ Optional<ValueLatticeElement> getCachedValueInfo(Value *V,
+ BasicBlock *BB) const {
+ const BlockCacheEntry *Entry = getBlockEntry(BB);
+ if (!Entry)
+ return None;
- ValueLatticeElement getCachedValueInfo(Value *V, BasicBlock *BB) const {
- if (isOverdefined(V, BB))
+ if (Entry->OverDefined.count(V))
return ValueLatticeElement::getOverdefined();
- auto I = ValueCache.find_as(V);
- if (I == ValueCache.end())
- return ValueLatticeElement();
- auto BBI = I->second->BlockVals.find(BB);
- if (BBI == I->second->BlockVals.end())
- return ValueLatticeElement();
- return BBI->second;
+ auto LatticeIt = Entry->LatticeElements.find_as(V);
+ if (LatticeIt == Entry->LatticeElements.end())
+ return None;
+
+ return LatticeIt->second;
}
/// clear - Empty the cache.
void clear() {
- SeenBlocks.clear();
- ValueCache.clear();
- OverDefinedCache.clear();
+ BlockCache.clear();
+ ValueHandles.clear();
}
/// Inform the cache that a given value has been deleted.
@@ -251,23 +237,18 @@ namespace {
/// OldSucc might have (unless also overdefined in NewSucc). This just
/// flushes elements from the cache and does not add any.
void threadEdgeImpl(BasicBlock *OldSucc,BasicBlock *NewSucc);
-
- friend struct LVIValueHandle;
};
}
void LazyValueInfoCache::eraseValue(Value *V) {
- for (auto I = OverDefinedCache.begin(), E = OverDefinedCache.end(); I != E;) {
- // Copy and increment the iterator immediately so we can erase behind
- // ourselves.
- auto Iter = I++;
- SmallPtrSetImpl<Value *> &ValueSet = Iter->second;
- ValueSet.erase(V);
- if (ValueSet.empty())
- OverDefinedCache.erase(Iter);
+ for (auto &Pair : BlockCache) {
+ Pair.second->LatticeElements.erase(V);
+ Pair.second->OverDefined.erase(V);
}
- ValueCache.erase(V);
+ auto HandleIt = ValueHandles.find_as(V);
+ if (HandleIt != ValueHandles.end())
+ ValueHandles.erase(HandleIt);
}
void LVIValueHandle::deleted() {
@@ -277,18 +258,7 @@ void LVIValueHandle::deleted() {
}
void LazyValueInfoCache::eraseBlock(BasicBlock *BB) {
- // Shortcut if we have never seen this block.
- DenseSet<PoisoningVH<BasicBlock> >::iterator I = SeenBlocks.find(BB);
- if (I == SeenBlocks.end())
- return;
- SeenBlocks.erase(I);
-
- auto ODI = OverDefinedCache.find(BB);
- if (ODI != OverDefinedCache.end())
- OverDefinedCache.erase(ODI);
-
- for (auto &I : ValueCache)
- I.second->BlockVals.erase(BB);
+ BlockCache.erase(BB);
}
void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc,
@@ -306,10 +276,11 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc,
std::vector<BasicBlock*> worklist;
worklist.push_back(OldSucc);
- auto I = OverDefinedCache.find(OldSucc);
- if (I == OverDefinedCache.end())
+ const BlockCacheEntry *Entry = getBlockEntry(OldSucc);
+ if (!Entry || Entry->OverDefined.empty())
return; // Nothing to process here.
- SmallVector<Value *, 4> ValsToClear(I->second.begin(), I->second.end());
+ SmallVector<Value *, 4> ValsToClear(Entry->OverDefined.begin(),
+ Entry->OverDefined.end());
// Use a worklist to perform a depth-first search of OldSucc's successors.
// NOTE: We do not need a visited list since any blocks we have already
@@ -323,10 +294,10 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc,
if (ToUpdate == NewSucc) continue;
// If a value was marked overdefined in OldSucc, and is here too...
- auto OI = OverDefinedCache.find(ToUpdate);
- if (OI == OverDefinedCache.end())
+ auto OI = BlockCache.find_as(ToUpdate);
+ if (OI == BlockCache.end() || OI->second->OverDefined.empty())
continue;
- SmallPtrSetImpl<Value *> &ValueSet = OI->second;
+ auto &ValueSet = OI->second->OverDefined;
bool changed = false;
for (Value *V : ValsToClear) {
@@ -336,11 +307,6 @@ void LazyValueInfoCache::threadEdgeImpl(BasicBlock *OldSucc,
// If we removed anything, then we potentially need to update
// blocks successors too.
changed = true;
-
- if (ValueSet.empty()) {
- OverDefinedCache.erase(OI);
- break;
- }
}
if (!changed) continue;
@@ -357,156 +323,137 @@ class LazyValueInfoImpl;
class LazyValueInfoAnnotatedWriter : public AssemblyAnnotationWriter {
LazyValueInfoImpl *LVIImpl;
// While analyzing which blocks we can solve values for, we need the dominator
- // information. Since this is an optional parameter in LVI, we require this
- // DomTreeAnalysis pass in the printer pass, and pass the dominator
- // tree to the LazyValueInfoAnnotatedWriter.
+ // information.
DominatorTree &DT;
public:
LazyValueInfoAnnotatedWriter(LazyValueInfoImpl *L, DominatorTree &DTree)
: LVIImpl(L), DT(DTree) {}
- virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
- formatted_raw_ostream &OS);
+ void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) override;
- virtual void emitInstructionAnnot(const Instruction *I,
- formatted_raw_ostream &OS);
+ void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) override;
};
}
namespace {
- // The actual implementation of the lazy analysis and update. Note that the
- // inheritance from LazyValueInfoCache is intended to be temporary while
- // splitting the code and then transitioning to a has-a relationship.
- class LazyValueInfoImpl {
-
- /// Cached results from previous queries
- LazyValueInfoCache TheCache;
-
- /// This stack holds the state of the value solver during a query.
- /// It basically emulates the callstack of the naive
- /// recursive value lookup process.
- SmallVector<std::pair<BasicBlock*, Value*>, 8> BlockValueStack;
-
- /// Keeps track of which block-value pairs are in BlockValueStack.
- DenseSet<std::pair<BasicBlock*, Value*> > BlockValueSet;
-
- /// Push BV onto BlockValueStack unless it's already in there.
- /// Returns true on success.
- bool pushBlockValue(const std::pair<BasicBlock *, Value *> &BV) {
- if (!BlockValueSet.insert(BV).second)
- return false; // It's already in the stack.
-
- LLVM_DEBUG(dbgs() << "PUSH: " << *BV.second << " in "
- << BV.first->getName() << "\n");
- BlockValueStack.push_back(BV);
- return true;
- }
+// The actual implementation of the lazy analysis and update. Note that the
+// inheritance from LazyValueInfoCache is intended to be temporary while
+// splitting the code and then transitioning to a has-a relationship.
+class LazyValueInfoImpl {
+
+ /// Cached results from previous queries
+ LazyValueInfoCache TheCache;
+
+ /// This stack holds the state of the value solver during a query.
+ /// It basically emulates the callstack of the naive
+ /// recursive value lookup process.
+ SmallVector<std::pair<BasicBlock*, Value*>, 8> BlockValueStack;
+
+ /// Keeps track of which block-value pairs are in BlockValueStack.
+ DenseSet<std::pair<BasicBlock*, Value*> > BlockValueSet;
+
+ /// Push BV onto BlockValueStack unless it's already in there.
+ /// Returns true on success.
+ bool pushBlockValue(const std::pair<BasicBlock *, Value *> &BV) {
+ if (!BlockValueSet.insert(BV).second)
+ return false; // It's already in the stack.
+
+ LLVM_DEBUG(dbgs() << "PUSH: " << *BV.second << " in "
+ << BV.first->getName() << "\n");
+ BlockValueStack.push_back(BV);
+ return true;
+ }
+
+ AssumptionCache *AC; ///< A pointer to the cache of @llvm.assume calls.
+ const DataLayout &DL; ///< A mandatory DataLayout
- AssumptionCache *AC; ///< A pointer to the cache of @llvm.assume calls.
- const DataLayout &DL; ///< A mandatory DataLayout
- DominatorTree *DT; ///< An optional DT pointer.
- DominatorTree *DisabledDT; ///< Stores DT if it's disabled.
+ /// Declaration of the llvm.experimental.guard() intrinsic,
+ /// if it exists in the module.
+ Function *GuardDecl;
- ValueLatticeElement getBlockValue(Value *Val, BasicBlock *BB);
- bool getEdgeValue(Value *V, BasicBlock *F, BasicBlock *T,
- ValueLatticeElement &Result, Instruction *CxtI = nullptr);
- bool hasBlockValue(Value *Val, BasicBlock *BB);
+ Optional<ValueLatticeElement> getBlockValue(Value *Val, BasicBlock *BB);
+ Optional<ValueLatticeElement> getEdgeValue(Value *V, BasicBlock *F,
+ BasicBlock *T, Instruction *CxtI = nullptr);
// These methods process one work item and may add more. A false value
// returned means that the work item was not completely processed and must
// be revisited after going through the new items.
bool solveBlockValue(Value *Val, BasicBlock *BB);
- bool solveBlockValueImpl(ValueLatticeElement &Res, Value *Val,
- BasicBlock *BB);
- bool solveBlockValueNonLocal(ValueLatticeElement &BBLV, Value *Val,
- BasicBlock *BB);
- bool solveBlockValuePHINode(ValueLatticeElement &BBLV, PHINode *PN,
- BasicBlock *BB);
- bool solveBlockValueSelect(ValueLatticeElement &BBLV, SelectInst *S,
- BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueImpl(Value *Val, BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueNonLocal(Value *Val,
+ BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValuePHINode(PHINode *PN,
+ BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueSelect(SelectInst *S,
+ BasicBlock *BB);
Optional<ConstantRange> getRangeForOperand(unsigned Op, Instruction *I,
BasicBlock *BB);
- bool solveBlockValueBinaryOpImpl(
- ValueLatticeElement &BBLV, Instruction *I, BasicBlock *BB,
+ Optional<ValueLatticeElement> solveBlockValueBinaryOpImpl(
+ Instruction *I, BasicBlock *BB,
std::function<ConstantRange(const ConstantRange &,
const ConstantRange &)> OpFn);
- bool solveBlockValueBinaryOp(ValueLatticeElement &BBLV, BinaryOperator *BBI,
- BasicBlock *BB);
- bool solveBlockValueCast(ValueLatticeElement &BBLV, CastInst *CI,
- BasicBlock *BB);
- bool solveBlockValueOverflowIntrinsic(
- ValueLatticeElement &BBLV, WithOverflowInst *WO, BasicBlock *BB);
- bool solveBlockValueSaturatingIntrinsic(ValueLatticeElement &BBLV,
- SaturatingInst *SI, BasicBlock *BB);
- bool solveBlockValueIntrinsic(ValueLatticeElement &BBLV, IntrinsicInst *II,
- BasicBlock *BB);
- bool solveBlockValueExtractValue(ValueLatticeElement &BBLV,
- ExtractValueInst *EVI, BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueBinaryOp(BinaryOperator *BBI,
+ BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueCast(CastInst *CI,
+ BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueOverflowIntrinsic(
+ WithOverflowInst *WO, BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueSaturatingIntrinsic(
+ SaturatingInst *SI, BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueIntrinsic(IntrinsicInst *II,
+ BasicBlock *BB);
+ Optional<ValueLatticeElement> solveBlockValueExtractValue(
+ ExtractValueInst *EVI, BasicBlock *BB);
void intersectAssumeOrGuardBlockValueConstantRange(Value *Val,
ValueLatticeElement &BBLV,
Instruction *BBI);
void solve();
- public:
- /// This is the query interface to determine the lattice
- /// value for the specified Value* at the end of the specified block.
- ValueLatticeElement getValueInBlock(Value *V, BasicBlock *BB,
- Instruction *CxtI = nullptr);
-
- /// This is the query interface to determine the lattice
- /// value for the specified Value* at the specified instruction (generally
- /// from an assume intrinsic).
- ValueLatticeElement getValueAt(Value *V, Instruction *CxtI);
-
- /// This is the query interface to determine the lattice
- /// value for the specified Value* that is true on the specified edge.
- ValueLatticeElement getValueOnEdge(Value *V, BasicBlock *FromBB,
- BasicBlock *ToBB,
- Instruction *CxtI = nullptr);
-
- /// Complete flush all previously computed values
- void clear() {
- TheCache.clear();
- }
-
- /// Printing the LazyValueInfo Analysis.
- void printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) {
- LazyValueInfoAnnotatedWriter Writer(this, DTree);
- F.print(OS, &Writer);
- }
-
- /// This is part of the update interface to inform the cache
- /// that a block has been deleted.
- void eraseBlock(BasicBlock *BB) {
- TheCache.eraseBlock(BB);
- }
+public:
+ /// This is the query interface to determine the lattice
+ /// value for the specified Value* at the end of the specified block.
+ ValueLatticeElement getValueInBlock(Value *V, BasicBlock *BB,
+ Instruction *CxtI = nullptr);
+
+ /// This is the query interface to determine the lattice
+ /// value for the specified Value* at the specified instruction (generally
+ /// from an assume intrinsic).
+ ValueLatticeElement getValueAt(Value *V, Instruction *CxtI);
+
+ /// This is the query interface to determine the lattice
+ /// value for the specified Value* that is true on the specified edge.
+ ValueLatticeElement getValueOnEdge(Value *V, BasicBlock *FromBB,
+ BasicBlock *ToBB,
+ Instruction *CxtI = nullptr);
+
+  /// Completely flush all previously computed values.
+ void clear() {
+ TheCache.clear();
+ }
- /// Disables use of the DominatorTree within LVI.
- void disableDT() {
- if (DT) {
- assert(!DisabledDT && "Both DT and DisabledDT are not nullptr!");
- std::swap(DT, DisabledDT);
- }
- }
+  /// Print the LazyValueInfo analysis for the function.
+ void printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) {
+ LazyValueInfoAnnotatedWriter Writer(this, DTree);
+ F.print(OS, &Writer);
+ }
- /// Enables use of the DominatorTree within LVI. Does nothing if the class
- /// instance was initialized without a DT pointer.
- void enableDT() {
- if (DisabledDT) {
- assert(!DT && "Both DT and DisabledDT are not nullptr!");
- std::swap(DT, DisabledDT);
- }
- }
+ /// This is part of the update interface to inform the cache
+ /// that a block has been deleted.
+ void eraseBlock(BasicBlock *BB) {
+ TheCache.eraseBlock(BB);
+ }
- /// This is the update interface to inform the cache that an edge from
- /// PredBB to OldSucc has been threaded to be from PredBB to NewSucc.
- void threadEdge(BasicBlock *PredBB,BasicBlock *OldSucc,BasicBlock *NewSucc);
+ /// This is the update interface to inform the cache that an edge from
+ /// PredBB to OldSucc has been threaded to be from PredBB to NewSucc.
+  void threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc, BasicBlock *NewSucc);
- LazyValueInfoImpl(AssumptionCache *AC, const DataLayout &DL,
- DominatorTree *DT = nullptr)
- : AC(AC), DL(DL), DT(DT), DisabledDT(nullptr) {}
- };
+ LazyValueInfoImpl(AssumptionCache *AC, const DataLayout &DL,
+ Function *GuardDecl)
+ : AC(AC), DL(DL), GuardDecl(GuardDecl) {}
+};
} // end anonymous namespace
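The interface above replaces the old bool-plus-out-parameter transfer functions with ones returning Optional<ValueLatticeElement>: None means "dependencies were pushed onto BlockValueStack, revisit after solve()", while a present value is the final lattice element. The following is a minimal standalone sketch of that control flow, using plain std::optional and invented names rather than the LLVM classes themselves.

    #include <cstdio>
    #include <optional>
    #include <set>
    #include <vector>

    // Miniature model of the lazy solver: a query either yields a value or
    // returns std::nullopt after pushing the work it depends on.
    struct MiniSolver {
      using Key = int;                  // stands in for a (BasicBlock*, Value*) pair
      std::vector<Key> Stack;           // BlockValueStack analogue
      std::set<Key> InStack;            // BlockValueSet analogue
      std::set<Key> Cache;              // previously solved keys

      std::optional<int> getValue(Key K) {
        if (Cache.count(K))
          return K * 2;                 // pretend this is the cached lattice value
        if (!InStack.insert(K).second)
          return -1;                    // cycle: bail out with an "overdefined" sentinel
        Stack.push_back(K);             // push work for solve() to process
        return std::nullopt;            // yet to be resolved
      }

      bool solveOne(Key K) {
        // A real transfer function would call getValue on its operands here and
        // return false if any of them pushed more work.
        Cache.insert(K);
        return true;
      }

      void solve() {
        while (!Stack.empty()) {
          Key K = Stack.back();
          if (solveOne(K)) {            // fully processed; pop it
            Stack.pop_back();
            InStack.erase(K);
          }
        }
      }
    };

    int main() {
      MiniSolver S;
      if (!S.getValue(7)) {             // first query only pushes work
        S.solve();                      // drain the stack
        std::printf("value = %d\n", *S.getValue(7)); // now served from the cache
      }
    }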
@@ -545,12 +492,14 @@ void LazyValueInfoImpl::solve() {
if (solveBlockValue(e.second, e.first)) {
// The work item was completely processed.
assert(BlockValueStack.back() == e && "Nothing should have been pushed!");
- assert(TheCache.hasCachedValueInfo(e.second, e.first) &&
- "Result should be in cache!");
-
+#ifndef NDEBUG
+ Optional<ValueLatticeElement> BBLV =
+ TheCache.getCachedValueInfo(e.second, e.first);
+ assert(BBLV && "Result should be in cache!");
LLVM_DEBUG(
dbgs() << "POP " << *e.second << " in " << e.first->getName() << " = "
- << TheCache.getCachedValueInfo(e.second, e.first) << "\n");
+ << *BBLV << "\n");
+#endif
BlockValueStack.pop_back();
BlockValueSet.erase(e);
@@ -561,21 +510,22 @@ void LazyValueInfoImpl::solve() {
}
}
-bool LazyValueInfoImpl::hasBlockValue(Value *Val, BasicBlock *BB) {
- // If already a constant, there is nothing to compute.
- if (isa<Constant>(Val))
- return true;
-
- return TheCache.hasCachedValueInfo(Val, BB);
-}
-
-ValueLatticeElement LazyValueInfoImpl::getBlockValue(Value *Val,
- BasicBlock *BB) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::getBlockValue(Value *Val,
+ BasicBlock *BB) {
// If already a constant, there is nothing to compute.
if (Constant *VC = dyn_cast<Constant>(Val))
return ValueLatticeElement::get(VC);
- return TheCache.getCachedValueInfo(Val, BB);
+ if (Optional<ValueLatticeElement> OptLatticeVal =
+ TheCache.getCachedValueInfo(Val, BB))
+ return OptLatticeVal;
+
+  // We have hit a cycle; assume overdefined.
+ if (!pushBlockValue({ BB, Val }))
+ return ValueLatticeElement::getOverdefined();
+
+ // Yet to be resolved.
+ return None;
}
static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) {
@@ -596,43 +546,32 @@ static ValueLatticeElement getFromRangeMetadata(Instruction *BBI) {
}
bool LazyValueInfoImpl::solveBlockValue(Value *Val, BasicBlock *BB) {
- if (isa<Constant>(Val))
- return true;
-
- if (TheCache.hasCachedValueInfo(Val, BB)) {
- // If we have a cached value, use that.
- LLVM_DEBUG(dbgs() << " reuse BB '" << BB->getName() << "' val="
- << TheCache.getCachedValueInfo(Val, BB) << '\n');
-
- // Since we're reusing a cached value, we don't need to update the
- // OverDefinedCache. The cache will have been properly updated whenever the
- // cached value was inserted.
- return true;
- }
+ assert(!isa<Constant>(Val) && "Value should not be constant");
+ assert(!TheCache.getCachedValueInfo(Val, BB) &&
+ "Value should not be in cache");
// Hold off inserting this value into the Cache in case we have to return
// false and come back later.
- ValueLatticeElement Res;
- if (!solveBlockValueImpl(Res, Val, BB))
+ Optional<ValueLatticeElement> Res = solveBlockValueImpl(Val, BB);
+ if (!Res)
// Work pushed, will revisit
return false;
- TheCache.insertResult(Val, BB, Res);
+ TheCache.insertResult(Val, BB, *Res);
return true;
}
-bool LazyValueInfoImpl::solveBlockValueImpl(ValueLatticeElement &Res,
- Value *Val, BasicBlock *BB) {
-
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueImpl(
+ Value *Val, BasicBlock *BB) {
Instruction *BBI = dyn_cast<Instruction>(Val);
if (!BBI || BBI->getParent() != BB)
- return solveBlockValueNonLocal(Res, Val, BB);
+ return solveBlockValueNonLocal(Val, BB);
if (PHINode *PN = dyn_cast<PHINode>(BBI))
- return solveBlockValuePHINode(Res, PN, BB);
+ return solveBlockValuePHINode(PN, BB);
if (auto *SI = dyn_cast<SelectInst>(BBI))
- return solveBlockValueSelect(Res, SI, BB);
+ return solveBlockValueSelect(SI, BB);
   // If this value is a nonnull pointer, record its range and bail out. Note
// that for all other pointer typed values, we terminate the search at the
@@ -644,28 +583,26 @@ bool LazyValueInfoImpl::solveBlockValueImpl(ValueLatticeElement &Res,
// instruction is placed, even if it could legally be hoisted much higher.
// That is unfortunate.
PointerType *PT = dyn_cast<PointerType>(BBI->getType());
- if (PT && isKnownNonZero(BBI, DL)) {
- Res = ValueLatticeElement::getNot(ConstantPointerNull::get(PT));
- return true;
- }
+ if (PT && isKnownNonZero(BBI, DL))
+ return ValueLatticeElement::getNot(ConstantPointerNull::get(PT));
+
if (BBI->getType()->isIntegerTy()) {
if (auto *CI = dyn_cast<CastInst>(BBI))
- return solveBlockValueCast(Res, CI, BB);
+ return solveBlockValueCast(CI, BB);
if (BinaryOperator *BO = dyn_cast<BinaryOperator>(BBI))
- return solveBlockValueBinaryOp(Res, BO, BB);
+ return solveBlockValueBinaryOp(BO, BB);
if (auto *EVI = dyn_cast<ExtractValueInst>(BBI))
- return solveBlockValueExtractValue(Res, EVI, BB);
+ return solveBlockValueExtractValue(EVI, BB);
if (auto *II = dyn_cast<IntrinsicInst>(BBI))
- return solveBlockValueIntrinsic(Res, II, BB);
+ return solveBlockValueIntrinsic(II, BB);
}
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - unknown inst def found.\n");
- Res = getFromRangeMetadata(BBI);
- return true;
+ return getFromRangeMetadata(BBI);
}
static bool InstructionDereferencesPointer(Instruction *I, Value *Ptr) {
@@ -717,8 +654,8 @@ static bool isObjectDereferencedInBlock(Value *Val, BasicBlock *BB) {
return false;
}
-bool LazyValueInfoImpl::solveBlockValueNonLocal(ValueLatticeElement &BBLV,
- Value *Val, BasicBlock *BB) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueNonLocal(
+ Value *Val, BasicBlock *BB) {
ValueLatticeElement Result; // Start Undefined.
// If this is the entry block, we must be asking about an argument. The
@@ -731,13 +668,10 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(ValueLatticeElement &BBLV,
if (PTy &&
(isKnownNonZero(Val, DL) ||
(isObjectDereferencedInBlock(Val, BB) &&
- !NullPointerIsDefined(BB->getParent(), PTy->getAddressSpace())))) {
- Result = ValueLatticeElement::getNot(ConstantPointerNull::get(PTy));
- } else {
- Result = ValueLatticeElement::getOverdefined();
- }
- BBLV = Result;
- return true;
+ !NullPointerIsDefined(BB->getParent(), PTy->getAddressSpace()))))
+ return ValueLatticeElement::getNot(ConstantPointerNull::get(PTy));
+ else
+ return ValueLatticeElement::getOverdefined();
}
// Loop over all of our predecessors, merging what we know from them into
@@ -750,12 +684,12 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(ValueLatticeElement &BBLV,
// canonicalizing to make this true rather than relying on this happy
// accident.
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
- ValueLatticeElement EdgeResult;
- if (!getEdgeValue(Val, *PI, BB, EdgeResult))
+ Optional<ValueLatticeElement> EdgeResult = getEdgeValue(Val, *PI, BB);
+ if (!EdgeResult)
// Explore that input, then return here
- return false;
+ return None;
- Result.mergeIn(EdgeResult, DL);
+ Result.mergeIn(*EdgeResult);
// If we hit overdefined, exit early. The BlockVals entry is already set
// to overdefined.
@@ -770,19 +704,17 @@ bool LazyValueInfoImpl::solveBlockValueNonLocal(ValueLatticeElement &BBLV,
Result = ValueLatticeElement::getNot(ConstantPointerNull::get(PTy));
}
- BBLV = Result;
- return true;
+ return Result;
}
}
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined());
- BBLV = Result;
- return true;
+ return Result;
}
-bool LazyValueInfoImpl::solveBlockValuePHINode(ValueLatticeElement &BBLV,
- PHINode *PN, BasicBlock *BB) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValuePHINode(
+ PHINode *PN, BasicBlock *BB) {
ValueLatticeElement Result; // Start Undefined.
// Loop over all of our predecessors, merging what we know from them into
@@ -791,15 +723,16 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(ValueLatticeElement &BBLV,
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PhiBB = PN->getIncomingBlock(i);
Value *PhiVal = PN->getIncomingValue(i);
- ValueLatticeElement EdgeResult;
// Note that we can provide PN as the context value to getEdgeValue, even
// though the results will be cached, because PN is the value being used as
// the cache key in the caller.
- if (!getEdgeValue(PhiVal, PhiBB, BB, EdgeResult, PN))
+ Optional<ValueLatticeElement> EdgeResult =
+ getEdgeValue(PhiVal, PhiBB, BB, PN);
+ if (!EdgeResult)
// Explore that input, then return here
- return false;
+ return None;
- Result.mergeIn(EdgeResult, DL);
+ Result.mergeIn(*EdgeResult);
// If we hit overdefined, exit early. The BlockVals entry is already set
// to overdefined.
@@ -807,15 +740,13 @@ bool LazyValueInfoImpl::solveBlockValuePHINode(ValueLatticeElement &BBLV,
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - overdefined because of pred (local).\n");
- BBLV = Result;
- return true;
+ return Result;
}
}
// Return the merged value, which is more precise than 'overdefined'.
assert(!Result.isOverdefined() && "Possible PHI in entry block?");
- BBLV = Result;
- return true;
+ return Result;
}
static ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond,
@@ -829,63 +760,59 @@ void LazyValueInfoImpl::intersectAssumeOrGuardBlockValueConstantRange(
if (!BBI)
return;
+ BasicBlock *BB = BBI->getParent();
for (auto &AssumeVH : AC->assumptionsFor(Val)) {
if (!AssumeVH)
continue;
+
+ // Only check assumes in the block of the context instruction. Other
+ // assumes will have already been taken into account when the value was
+ // propagated from predecessor blocks.
auto *I = cast<CallInst>(AssumeVH);
- if (!isValidAssumeForContext(I, BBI, DT))
+ if (I->getParent() != BB || !isValidAssumeForContext(I, BBI))
continue;
BBLV = intersect(BBLV, getValueFromCondition(Val, I->getArgOperand(0)));
}
// If guards are not used in the module, don't spend time looking for them
- auto *GuardDecl = BBI->getModule()->getFunction(
- Intrinsic::getName(Intrinsic::experimental_guard));
if (!GuardDecl || GuardDecl->use_empty())
return;
- if (BBI->getIterator() == BBI->getParent()->begin())
+ if (BBI->getIterator() == BB->begin())
return;
for (Instruction &I : make_range(std::next(BBI->getIterator().getReverse()),
- BBI->getParent()->rend())) {
+ BB->rend())) {
Value *Cond = nullptr;
if (match(&I, m_Intrinsic<Intrinsic::experimental_guard>(m_Value(Cond))))
BBLV = intersect(BBLV, getValueFromCondition(Val, Cond));
}
}
-bool LazyValueInfoImpl::solveBlockValueSelect(ValueLatticeElement &BBLV,
- SelectInst *SI, BasicBlock *BB) {
-
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueSelect(
+ SelectInst *SI, BasicBlock *BB) {
// Recurse on our inputs if needed
- if (!hasBlockValue(SI->getTrueValue(), BB)) {
- if (pushBlockValue(std::make_pair(BB, SI->getTrueValue())))
- return false;
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
- }
- ValueLatticeElement TrueVal = getBlockValue(SI->getTrueValue(), BB);
+ Optional<ValueLatticeElement> OptTrueVal =
+ getBlockValue(SI->getTrueValue(), BB);
+ if (!OptTrueVal)
+ return None;
+ ValueLatticeElement &TrueVal = *OptTrueVal;
+
// If we hit overdefined, don't ask more queries. We want to avoid poisoning
// extra slots in the table if we can.
- if (TrueVal.isOverdefined()) {
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
- }
+ if (TrueVal.isOverdefined())
+ return ValueLatticeElement::getOverdefined();
+
+ Optional<ValueLatticeElement> OptFalseVal =
+ getBlockValue(SI->getFalseValue(), BB);
+ if (!OptFalseVal)
+ return None;
+ ValueLatticeElement &FalseVal = *OptFalseVal;
- if (!hasBlockValue(SI->getFalseValue(), BB)) {
- if (pushBlockValue(std::make_pair(BB, SI->getFalseValue())))
- return false;
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
- }
- ValueLatticeElement FalseVal = getBlockValue(SI->getFalseValue(), BB);
// If we hit overdefined, don't ask more queries. We want to avoid poisoning
// extra slots in the table if we can.
- if (FalseVal.isOverdefined()) {
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
- }
+ if (FalseVal.isOverdefined())
+ return ValueLatticeElement::getOverdefined();
if (TrueVal.isConstantRange() && FalseVal.isConstantRange()) {
const ConstantRange &TrueCR = TrueVal.getConstantRange();
@@ -911,31 +838,28 @@ bool LazyValueInfoImpl::solveBlockValueSelect(ValueLatticeElement &BBLV,
return TrueCR.umax(FalseCR);
};
}();
- BBLV = ValueLatticeElement::getRange(ResultCR);
- return true;
+ return ValueLatticeElement::getRange(
+ ResultCR, TrueVal.isConstantRangeIncludingUndef() |
+ FalseVal.isConstantRangeIncludingUndef());
}
if (SPR.Flavor == SPF_ABS) {
- if (LHS == SI->getTrueValue()) {
- BBLV = ValueLatticeElement::getRange(TrueCR.abs());
- return true;
- }
- if (LHS == SI->getFalseValue()) {
- BBLV = ValueLatticeElement::getRange(FalseCR.abs());
- return true;
- }
+ if (LHS == SI->getTrueValue())
+ return ValueLatticeElement::getRange(
+ TrueCR.abs(), TrueVal.isConstantRangeIncludingUndef());
+ if (LHS == SI->getFalseValue())
+ return ValueLatticeElement::getRange(
+ FalseCR.abs(), FalseVal.isConstantRangeIncludingUndef());
}
if (SPR.Flavor == SPF_NABS) {
ConstantRange Zero(APInt::getNullValue(TrueCR.getBitWidth()));
- if (LHS == SI->getTrueValue()) {
- BBLV = ValueLatticeElement::getRange(Zero.sub(TrueCR.abs()));
- return true;
- }
- if (LHS == SI->getFalseValue()) {
- BBLV = ValueLatticeElement::getRange(Zero.sub(FalseCR.abs()));
- return true;
- }
+ if (LHS == SI->getTrueValue())
+ return ValueLatticeElement::getRange(
+            Zero.sub(TrueCR.abs()), TrueVal.isConstantRangeIncludingUndef());
+ if (LHS == SI->getFalseValue())
+ return ValueLatticeElement::getRange(
+ Zero.sub(FalseCR.abs()), FalseVal.isConstantRangeIncludingUndef());
}
}
@@ -990,41 +914,34 @@ bool LazyValueInfoImpl::solveBlockValueSelect(ValueLatticeElement &BBLV,
}
}
- ValueLatticeElement Result; // Start Undefined.
- Result.mergeIn(TrueVal, DL);
- Result.mergeIn(FalseVal, DL);
- BBLV = Result;
- return true;
+ ValueLatticeElement Result = TrueVal;
+ Result.mergeIn(FalseVal);
+ return Result;
}
Optional<ConstantRange> LazyValueInfoImpl::getRangeForOperand(unsigned Op,
Instruction *I,
BasicBlock *BB) {
- if (!hasBlockValue(I->getOperand(Op), BB))
- if (pushBlockValue(std::make_pair(BB, I->getOperand(Op))))
- return None;
+ Optional<ValueLatticeElement> OptVal = getBlockValue(I->getOperand(Op), BB);
+ if (!OptVal)
+ return None;
+
+ ValueLatticeElement &Val = *OptVal;
+ intersectAssumeOrGuardBlockValueConstantRange(I->getOperand(Op), Val, I);
+ if (Val.isConstantRange())
+ return Val.getConstantRange();
const unsigned OperandBitWidth =
DL.getTypeSizeInBits(I->getOperand(Op)->getType());
- ConstantRange Range = ConstantRange::getFull(OperandBitWidth);
- if (hasBlockValue(I->getOperand(Op), BB)) {
- ValueLatticeElement Val = getBlockValue(I->getOperand(Op), BB);
- intersectAssumeOrGuardBlockValueConstantRange(I->getOperand(Op), Val, I);
- if (Val.isConstantRange())
- Range = Val.getConstantRange();
- }
- return Range;
+ return ConstantRange::getFull(OperandBitWidth);
}
-bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV,
- CastInst *CI,
- BasicBlock *BB) {
- if (!CI->getOperand(0)->getType()->isSized()) {
- // Without knowing how wide the input is, we can't analyze it in any useful
- // way.
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
- }
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueCast(
+ CastInst *CI, BasicBlock *BB) {
+ // Without knowing how wide the input is, we can't analyze it in any useful
+ // way.
+ if (!CI->getOperand(0)->getType()->isSized())
+ return ValueLatticeElement::getOverdefined();
// Filter out casts we don't know how to reason about before attempting to
// recurse on our operand. This can cut a long search short if we know we're
@@ -1039,8 +956,7 @@ bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV,
// Unhandled instructions are overdefined.
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - overdefined (unknown cast).\n");
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
+ return ValueLatticeElement::getOverdefined();
}
// Figure out the range of the LHS. If that fails, we still apply the
@@ -1049,21 +965,20 @@ bool LazyValueInfoImpl::solveBlockValueCast(ValueLatticeElement &BBLV,
Optional<ConstantRange> LHSRes = getRangeForOperand(0, CI, BB);
if (!LHSRes.hasValue())
// More work to do before applying this transfer rule.
- return false;
- ConstantRange LHSRange = LHSRes.getValue();
+ return None;
+ const ConstantRange &LHSRange = LHSRes.getValue();
const unsigned ResultBitWidth = CI->getType()->getIntegerBitWidth();
// NOTE: We're currently limited by the set of operations that ConstantRange
   // can evaluate symbolically. Enhancing that set will allow us to analyze
// more definitions.
- BBLV = ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(),
+ return ValueLatticeElement::getRange(LHSRange.castOp(CI->getOpcode(),
ResultBitWidth));
- return true;
}
-bool LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
- ValueLatticeElement &BBLV, Instruction *I, BasicBlock *BB,
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
+ Instruction *I, BasicBlock *BB,
std::function<ConstantRange(const ConstantRange &,
const ConstantRange &)> OpFn) {
// Figure out the ranges of the operands. If that fails, use a
@@ -1074,26 +989,22 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOpImpl(
Optional<ConstantRange> RHSRes = getRangeForOperand(1, I, BB);
if (!LHSRes.hasValue() || !RHSRes.hasValue())
// More work to do before applying this transfer rule.
- return false;
+ return None;
- ConstantRange LHSRange = LHSRes.getValue();
- ConstantRange RHSRange = RHSRes.getValue();
- BBLV = ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
- return true;
+ const ConstantRange &LHSRange = LHSRes.getValue();
+ const ConstantRange &RHSRange = RHSRes.getValue();
+ return ValueLatticeElement::getRange(OpFn(LHSRange, RHSRange));
}
-bool LazyValueInfoImpl::solveBlockValueBinaryOp(ValueLatticeElement &BBLV,
- BinaryOperator *BO,
- BasicBlock *BB) {
-
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueBinaryOp(
+ BinaryOperator *BO, BasicBlock *BB) {
assert(BO->getOperand(0)->getType()->isSized() &&
"all operands to binary operators are sized");
if (BO->getOpcode() == Instruction::Xor) {
// Xor is the only operation not supported by ConstantRange::binaryOp().
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - overdefined (unknown binary operator).\n");
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
+ return ValueLatticeElement::getOverdefined();
}
if (auto *OBO = dyn_cast<OverflowingBinaryOperator>(BO)) {
@@ -1104,47 +1015,49 @@ bool LazyValueInfoImpl::solveBlockValueBinaryOp(ValueLatticeElement &BBLV,
NoWrapKind |= OverflowingBinaryOperator::NoSignedWrap;
return solveBlockValueBinaryOpImpl(
- BBLV, BO, BB,
+ BO, BB,
[BO, NoWrapKind](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.overflowingBinaryOp(BO->getOpcode(), CR2, NoWrapKind);
});
}
return solveBlockValueBinaryOpImpl(
- BBLV, BO, BB, [BO](const ConstantRange &CR1, const ConstantRange &CR2) {
+ BO, BB, [BO](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.binaryOp(BO->getOpcode(), CR2);
});
}
-bool LazyValueInfoImpl::solveBlockValueOverflowIntrinsic(
- ValueLatticeElement &BBLV, WithOverflowInst *WO, BasicBlock *BB) {
- return solveBlockValueBinaryOpImpl(BBLV, WO, BB,
- [WO](const ConstantRange &CR1, const ConstantRange &CR2) {
+Optional<ValueLatticeElement>
+LazyValueInfoImpl::solveBlockValueOverflowIntrinsic(WithOverflowInst *WO,
+ BasicBlock *BB) {
+ return solveBlockValueBinaryOpImpl(
+ WO, BB, [WO](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.binaryOp(WO->getBinaryOp(), CR2);
});
}
-bool LazyValueInfoImpl::solveBlockValueSaturatingIntrinsic(
- ValueLatticeElement &BBLV, SaturatingInst *SI, BasicBlock *BB) {
+Optional<ValueLatticeElement>
+LazyValueInfoImpl::solveBlockValueSaturatingIntrinsic(SaturatingInst *SI,
+ BasicBlock *BB) {
switch (SI->getIntrinsicID()) {
case Intrinsic::uadd_sat:
return solveBlockValueBinaryOpImpl(
- BBLV, SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.uadd_sat(CR2);
});
case Intrinsic::usub_sat:
return solveBlockValueBinaryOpImpl(
- BBLV, SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.usub_sat(CR2);
});
case Intrinsic::sadd_sat:
return solveBlockValueBinaryOpImpl(
- BBLV, SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.sadd_sat(CR2);
});
case Intrinsic::ssub_sat:
return solveBlockValueBinaryOpImpl(
- BBLV, SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
+ SI, BB, [](const ConstantRange &CR1, const ConstantRange &CR2) {
return CR1.ssub_sat(CR2);
});
default:
@@ -1152,58 +1065,71 @@ bool LazyValueInfoImpl::solveBlockValueSaturatingIntrinsic(
}
}
-bool LazyValueInfoImpl::solveBlockValueIntrinsic(ValueLatticeElement &BBLV,
- IntrinsicInst *II,
- BasicBlock *BB) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueIntrinsic(
+ IntrinsicInst *II, BasicBlock *BB) {
if (auto *SI = dyn_cast<SaturatingInst>(II))
- return solveBlockValueSaturatingIntrinsic(BBLV, SI, BB);
+ return solveBlockValueSaturatingIntrinsic(SI, BB);
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - overdefined (unknown intrinsic).\n");
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
+ return ValueLatticeElement::getOverdefined();
}
-bool LazyValueInfoImpl::solveBlockValueExtractValue(
- ValueLatticeElement &BBLV, ExtractValueInst *EVI, BasicBlock *BB) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::solveBlockValueExtractValue(
+ ExtractValueInst *EVI, BasicBlock *BB) {
if (auto *WO = dyn_cast<WithOverflowInst>(EVI->getAggregateOperand()))
if (EVI->getNumIndices() == 1 && *EVI->idx_begin() == 0)
- return solveBlockValueOverflowIntrinsic(BBLV, WO, BB);
+ return solveBlockValueOverflowIntrinsic(WO, BB);
   // Handle extractvalue of insertvalue to allow further simplification
   // once with.overflow intrinsics have been replaced by insertvalue chains.
if (Value *V = SimplifyExtractValueInst(
EVI->getAggregateOperand(), EVI->getIndices(),
- EVI->getModule()->getDataLayout())) {
- if (!hasBlockValue(V, BB)) {
- if (pushBlockValue({ BB, V }))
- return false;
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
- }
- BBLV = getBlockValue(V, BB);
- return true;
- }
+ EVI->getModule()->getDataLayout()))
+ return getBlockValue(V, BB);
LLVM_DEBUG(dbgs() << " compute BB '" << BB->getName()
<< "' - overdefined (unknown extractvalue).\n");
- BBLV = ValueLatticeElement::getOverdefined();
- return true;
+ return ValueLatticeElement::getOverdefined();
+}
+
+static bool matchICmpOperand(const APInt *&Offset, Value *LHS, Value *Val,
+ ICmpInst::Predicate Pred) {
+ if (LHS == Val)
+ return true;
+
+ // Handle range checking idiom produced by InstCombine. We will subtract the
+ // offset from the allowed range for RHS in this case.
+ if (match(LHS, m_Add(m_Specific(Val), m_APInt(Offset))))
+ return true;
+
+ // If (x | y) < C, then (x < C) && (y < C).
+ if (match(LHS, m_c_Or(m_Specific(Val), m_Value())) &&
+ (Pred == ICmpInst::ICMP_ULT || Pred == ICmpInst::ICMP_ULE))
+ return true;
+
+ // If (x & y) > C, then (x > C) && (y > C).
+ if (match(LHS, m_c_And(m_Specific(Val), m_Value())) &&
+ (Pred == ICmpInst::ICMP_UGT || Pred == ICmpInst::ICMP_UGE))
+ return true;
+
+ return false;
}
static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI,
bool isTrueDest) {
Value *LHS = ICI->getOperand(0);
Value *RHS = ICI->getOperand(1);
- CmpInst::Predicate Predicate = ICI->getPredicate();
+
+ // Get the predicate that must hold along the considered edge.
+ CmpInst::Predicate EdgePred =
+ isTrueDest ? ICI->getPredicate() : ICI->getInversePredicate();
if (isa<Constant>(RHS)) {
if (ICI->isEquality() && LHS == Val) {
- // We know that V has the RHS constant if this is a true SETEQ or
- // false SETNE.
- if (isTrueDest == (Predicate == ICmpInst::ICMP_EQ))
+ if (EdgePred == ICmpInst::ICMP_EQ)
return ValueLatticeElement::get(cast<Constant>(RHS));
- else
+ else if (!isa<UndefValue>(RHS))
return ValueLatticeElement::getNot(cast<Constant>(RHS));
}
}
@@ -1211,47 +1137,31 @@ static ValueLatticeElement getValueFromICmpCondition(Value *Val, ICmpInst *ICI,
if (!Val->getType()->isIntegerTy())
return ValueLatticeElement::getOverdefined();
- // Use ConstantRange::makeAllowedICmpRegion in order to determine the possible
- // range of Val guaranteed by the condition. Recognize comparisons in the from
- // of:
- // icmp <pred> Val, ...
- // icmp <pred> (add Val, Offset), ...
- // The latter is the range checking idiom that InstCombine produces. Subtract
- // the offset from the allowed range for RHS in this case.
-
- // Val or (add Val, Offset) can be on either hand of the comparison
- if (LHS != Val && !match(LHS, m_Add(m_Specific(Val), m_ConstantInt()))) {
+ const APInt *Offset = nullptr;
+ if (!matchICmpOperand(Offset, LHS, Val, EdgePred)) {
std::swap(LHS, RHS);
- Predicate = CmpInst::getSwappedPredicate(Predicate);
+ EdgePred = CmpInst::getSwappedPredicate(EdgePred);
+ if (!matchICmpOperand(Offset, LHS, Val, EdgePred))
+ return ValueLatticeElement::getOverdefined();
}
- ConstantInt *Offset = nullptr;
- if (LHS != Val)
- match(LHS, m_Add(m_Specific(Val), m_ConstantInt(Offset)));
-
- if (LHS == Val || Offset) {
- // Calculate the range of values that are allowed by the comparison
- ConstantRange RHSRange(RHS->getType()->getIntegerBitWidth(),
- /*isFullSet=*/true);
- if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS))
- RHSRange = ConstantRange(CI->getValue());
- else if (Instruction *I = dyn_cast<Instruction>(RHS))
- if (auto *Ranges = I->getMetadata(LLVMContext::MD_range))
- RHSRange = getConstantRangeFromMetadata(*Ranges);
-
- // If we're interested in the false dest, invert the condition
- CmpInst::Predicate Pred =
- isTrueDest ? Predicate : CmpInst::getInversePredicate(Predicate);
- ConstantRange TrueValues =
- ConstantRange::makeAllowedICmpRegion(Pred, RHSRange);
-
- if (Offset) // Apply the offset from above.
- TrueValues = TrueValues.subtract(Offset->getValue());
-
- return ValueLatticeElement::getRange(std::move(TrueValues));
- }
+ // Calculate the range of values that are allowed by the comparison.
+ ConstantRange RHSRange(RHS->getType()->getIntegerBitWidth(),
+ /*isFullSet=*/true);
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(RHS))
+ RHSRange = ConstantRange(CI->getValue());
+ else if (Instruction *I = dyn_cast<Instruction>(RHS))
+ if (auto *Ranges = I->getMetadata(LLVMContext::MD_range))
+ RHSRange = getConstantRangeFromMetadata(*Ranges);
- return ValueLatticeElement::getOverdefined();
+  // EdgePred already accounts for whether this is the true or false successor.
+ ConstantRange TrueValues =
+ ConstantRange::makeAllowedICmpRegion(EdgePred, RHSRange);
+
+ if (Offset) // Apply the offset from above.
+ TrueValues = TrueValues.subtract(*Offset);
+
+ return ValueLatticeElement::getRange(std::move(TrueValues));
}
// Handle conditions of the form
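The rewritten getValueFromICmpCondition above builds the region allowed by the comparison with ConstantRange::makeAllowedICmpRegion and, for the (add Val, Offset) idiom, shifts it back by the offset. A small worked example using those same calls; the function name and the concrete numbers are only illustrative.

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    // Edge condition: (Val + 5) u< 10, true successor taken.
    ConstantRange rangeForValOnTrueEdge() {
      APInt Offset(/*numBits=*/32, 5);
      ConstantRange RHSRange(APInt(32, 10));               // the constant 10
      ConstantRange AddRange = ConstantRange::makeAllowedICmpRegion(
          CmpInst::ICMP_ULT, RHSRange);                    // [0, 10) holds for Val + 5
      return AddRange.subtract(Offset);                    // wrapped range [-5, 5) for Val itself
    }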
@@ -1278,11 +1188,11 @@ static ValueLatticeElement getValueFromOverflowCondition(
static ValueLatticeElement
getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest,
- DenseMap<Value*, ValueLatticeElement> &Visited);
+ SmallDenseMap<Value*, ValueLatticeElement> &Visited);
static ValueLatticeElement
getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest,
- DenseMap<Value*, ValueLatticeElement> &Visited) {
+ SmallDenseMap<Value*, ValueLatticeElement> &Visited) {
if (ICmpInst *ICI = dyn_cast<ICmpInst>(Cond))
return getValueFromICmpCondition(Val, ICI, isTrueDest);
@@ -1315,7 +1225,7 @@ getValueFromConditionImpl(Value *Val, Value *Cond, bool isTrueDest,
static ValueLatticeElement
getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest,
- DenseMap<Value*, ValueLatticeElement> &Visited) {
+ SmallDenseMap<Value*, ValueLatticeElement> &Visited) {
auto I = Visited.find(Cond);
if (I != Visited.end())
return I->second;
@@ -1328,7 +1238,7 @@ getValueFromCondition(Value *Val, Value *Cond, bool isTrueDest,
ValueLatticeElement getValueFromCondition(Value *Val, Value *Cond,
bool isTrueDest) {
assert(Cond && "precondition");
- DenseMap<Value*, ValueLatticeElement> Visited;
+ SmallDenseMap<Value*, ValueLatticeElement> Visited;
return getValueFromCondition(Val, Cond, isTrueDest, Visited);
}
@@ -1380,8 +1290,9 @@ static ValueLatticeElement constantFoldUser(User *Usr, Value *Op,
-/// Compute the value of Val on the edge BBFrom -> BBTo. Returns false if
-/// Val is not constrained on the edge. Result is unspecified if return value
-/// is false.
+/// Compute the value of Val on the edge BBFrom -> BBTo. Returns None if
+/// Val is not constrained on the edge.
-static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
- BasicBlock *BBTo, ValueLatticeElement &Result) {
+static Optional<ValueLatticeElement> getEdgeValueLocal(Value *Val,
+ BasicBlock *BBFrom,
+ BasicBlock *BBTo) {
// TODO: Handle more complex conditionals. If (v == 0 || v2 < 1) is false, we
// know that v != 0.
if (BranchInst *BI = dyn_cast<BranchInst>(BBFrom->getTerminator())) {
@@ -1396,17 +1307,16 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
// If V is the condition of the branch itself, then we know exactly what
// it is.
- if (Condition == Val) {
- Result = ValueLatticeElement::get(ConstantInt::get(
+ if (Condition == Val)
+ return ValueLatticeElement::get(ConstantInt::get(
Type::getInt1Ty(Val->getContext()), isTrueDest));
- return true;
- }
// If the condition of the branch is an equality comparison, we may be
// able to infer the value.
- Result = getValueFromCondition(Val, Condition, isTrueDest);
+ ValueLatticeElement Result = getValueFromCondition(Val, Condition,
+ isTrueDest);
if (!Result.isOverdefined())
- return true;
+ return Result;
if (User *Usr = dyn_cast<User>(Val)) {
assert(Result.isOverdefined() && "Result isn't overdefined");
@@ -1446,7 +1356,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
}
}
if (!Result.isOverdefined())
- return true;
+ return Result;
}
}
@@ -1455,7 +1365,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
if (SwitchInst *SI = dyn_cast<SwitchInst>(BBFrom->getTerminator())) {
Value *Condition = SI->getCondition();
if (!isa<IntegerType>(Val->getType()))
- return false;
+ return None;
bool ValUsesConditionAndMayBeFoldable = false;
if (Condition != Val) {
// Check if Val has Condition as an operand.
@@ -1463,7 +1373,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
ValUsesConditionAndMayBeFoldable = isOperationFoldable(Usr) &&
usesOperand(Usr, Condition);
if (!ValUsesConditionAndMayBeFoldable)
- return false;
+ return None;
}
assert((Condition == Val || ValUsesConditionAndMayBeFoldable) &&
"Condition != Val nor Val doesn't use Condition");
@@ -1481,7 +1391,7 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
ValueLatticeElement EdgeLatticeVal =
constantFoldUser(Usr, Condition, CaseValue, DL);
if (EdgeLatticeVal.isOverdefined())
- return false;
+ return None;
EdgeVal = EdgeLatticeVal.getConstantRange();
}
if (DefaultCase) {
@@ -1496,46 +1406,31 @@ static bool getEdgeValueLocal(Value *Val, BasicBlock *BBFrom,
} else if (Case.getCaseSuccessor() == BBTo)
EdgesVals = EdgesVals.unionWith(EdgeVal);
}
- Result = ValueLatticeElement::getRange(std::move(EdgesVals));
- return true;
+ return ValueLatticeElement::getRange(std::move(EdgesVals));
}
- return false;
+ return None;
}
/// Compute the value of Val on the edge BBFrom -> BBTo or the value at
/// the basic block if the edge does not constrain Val.
-bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
- BasicBlock *BBTo,
- ValueLatticeElement &Result,
- Instruction *CxtI) {
+Optional<ValueLatticeElement> LazyValueInfoImpl::getEdgeValue(
+ Value *Val, BasicBlock *BBFrom, BasicBlock *BBTo, Instruction *CxtI) {
// If already a constant, there is nothing to compute.
- if (Constant *VC = dyn_cast<Constant>(Val)) {
- Result = ValueLatticeElement::get(VC);
- return true;
- }
-
- ValueLatticeElement LocalResult;
- if (!getEdgeValueLocal(Val, BBFrom, BBTo, LocalResult))
- // If we couldn't constrain the value on the edge, LocalResult doesn't
- // provide any information.
- LocalResult = ValueLatticeElement::getOverdefined();
+ if (Constant *VC = dyn_cast<Constant>(Val))
+ return ValueLatticeElement::get(VC);
- if (hasSingleValue(LocalResult)) {
+ ValueLatticeElement LocalResult = getEdgeValueLocal(Val, BBFrom, BBTo)
+ .getValueOr(ValueLatticeElement::getOverdefined());
+ if (hasSingleValue(LocalResult))
// Can't get any more precise here
- Result = LocalResult;
- return true;
- }
+ return LocalResult;
- if (!hasBlockValue(Val, BBFrom)) {
- if (pushBlockValue(std::make_pair(BBFrom, Val)))
- return false;
- // No new information.
- Result = LocalResult;
- return true;
- }
+ Optional<ValueLatticeElement> OptInBlock = getBlockValue(Val, BBFrom);
+ if (!OptInBlock)
+ return None;
+ ValueLatticeElement &InBlock = *OptInBlock;
// Try to intersect ranges of the BB and the constraint on the edge.
- ValueLatticeElement InBlock = getBlockValue(Val, BBFrom);
intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock,
BBFrom->getTerminator());
// We can use the context instruction (generically the ultimate instruction
@@ -1548,8 +1443,7 @@ bool LazyValueInfoImpl::getEdgeValue(Value *Val, BasicBlock *BBFrom,
// but then the result is not cached.
intersectAssumeOrGuardBlockValueConstantRange(Val, InBlock, CxtI);
- Result = intersect(LocalResult, InBlock);
- return true;
+ return intersect(LocalResult, InBlock);
}
ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
@@ -1558,11 +1452,13 @@ ValueLatticeElement LazyValueInfoImpl::getValueInBlock(Value *V, BasicBlock *BB,
<< BB->getName() << "'\n");
assert(BlockValueStack.empty() && BlockValueSet.empty());
- if (!hasBlockValue(V, BB)) {
- pushBlockValue(std::make_pair(BB, V));
+ Optional<ValueLatticeElement> OptResult = getBlockValue(V, BB);
+ if (!OptResult) {
solve();
+ OptResult = getBlockValue(V, BB);
+ assert(OptResult && "Value not available after solving");
}
- ValueLatticeElement Result = getBlockValue(V, BB);
+ ValueLatticeElement Result = *OptResult;
intersectAssumeOrGuardBlockValueConstantRange(V, Result, CxtI);
LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
@@ -1592,16 +1488,15 @@ getValueOnEdge(Value *V, BasicBlock *FromBB, BasicBlock *ToBB,
<< FromBB->getName() << "' to '" << ToBB->getName()
<< "'\n");
- ValueLatticeElement Result;
- if (!getEdgeValue(V, FromBB, ToBB, Result, CxtI)) {
+ Optional<ValueLatticeElement> Result = getEdgeValue(V, FromBB, ToBB, CxtI);
+ if (!Result) {
solve();
- bool WasFastQuery = getEdgeValue(V, FromBB, ToBB, Result, CxtI);
- (void)WasFastQuery;
- assert(WasFastQuery && "More work to do after problem solved?");
+ Result = getEdgeValue(V, FromBB, ToBB, CxtI);
+ assert(Result && "More work to do after problem solved?");
}
- LLVM_DEBUG(dbgs() << " Result = " << Result << "\n");
- return Result;
+ LLVM_DEBUG(dbgs() << " Result = " << *Result << "\n");
+ return *Result;
}
void LazyValueInfoImpl::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
@@ -1615,26 +1510,23 @@ void LazyValueInfoImpl::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
/// This lazily constructs the LazyValueInfoImpl.
static LazyValueInfoImpl &getImpl(void *&PImpl, AssumptionCache *AC,
- const DataLayout *DL,
- DominatorTree *DT = nullptr) {
+ const Module *M) {
if (!PImpl) {
- assert(DL && "getCache() called with a null DataLayout");
- PImpl = new LazyValueInfoImpl(AC, *DL, DT);
+ assert(M && "getCache() called with a null Module");
+ const DataLayout &DL = M->getDataLayout();
+ Function *GuardDecl = M->getFunction(
+ Intrinsic::getName(Intrinsic::experimental_guard));
+ PImpl = new LazyValueInfoImpl(AC, DL, GuardDecl);
}
return *static_cast<LazyValueInfoImpl*>(PImpl);
}
bool LazyValueInfoWrapperPass::runOnFunction(Function &F) {
Info.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
- const DataLayout &DL = F.getParent()->getDataLayout();
-
- DominatorTreeWrapperPass *DTWP =
- getAnalysisIfAvailable<DominatorTreeWrapperPass>();
- Info.DT = DTWP ? &DTWP->getDomTree() : nullptr;
Info.TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
if (Info.PImpl)
- getImpl(Info.PImpl, Info.AC, &DL, Info.DT).clear();
+ getImpl(Info.PImpl, Info.AC, F.getParent()).clear();
// Fully lazy.
return false;
@@ -1663,8 +1555,7 @@ bool LazyValueInfo::invalidate(Function &F, const PreservedAnalyses &PA,
   // We need to invalidate if we have either failed to preserve this analysis's
   // result directly or if any of its dependencies have been invalidated.
auto PAC = PA.getChecker<LazyValueAnalysis>();
- if (!(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()) ||
- (DT && Inv.invalidate<DominatorTreeAnalysis>(F, PA)))
+ if (!(PAC.preserved() || PAC.preservedSet<AllAnalysesOn<Function>>()))
return true;
return false;
@@ -1676,9 +1567,8 @@ LazyValueInfo LazyValueAnalysis::run(Function &F,
FunctionAnalysisManager &FAM) {
auto &AC = FAM.getResult<AssumptionAnalysis>(F);
auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
- auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- return LazyValueInfo(&AC, &F.getParent()->getDataLayout(), &TLI, DT);
+ return LazyValueInfo(&AC, &F.getParent()->getDataLayout(), &TLI);
}
/// Returns true if we can statically tell that this value will never be a
@@ -1701,9 +1591,8 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
if (isKnownNonConstant(V))
return nullptr;
- const DataLayout &DL = BB->getModule()->getDataLayout();
ValueLatticeElement Result =
- getImpl(PImpl, AC, &DL, DT).getValueInBlock(V, BB, CxtI);
+ getImpl(PImpl, AC, BB->getModule()).getValueInBlock(V, BB, CxtI);
if (Result.isConstant())
return Result.getConstant();
@@ -1716,16 +1605,16 @@ Constant *LazyValueInfo::getConstant(Value *V, BasicBlock *BB,
}
ConstantRange LazyValueInfo::getConstantRange(Value *V, BasicBlock *BB,
- Instruction *CxtI) {
+ Instruction *CxtI,
+ bool UndefAllowed) {
assert(V->getType()->isIntegerTy());
unsigned Width = V->getType()->getIntegerBitWidth();
- const DataLayout &DL = BB->getModule()->getDataLayout();
ValueLatticeElement Result =
- getImpl(PImpl, AC, &DL, DT).getValueInBlock(V, BB, CxtI);
- if (Result.isUndefined())
+ getImpl(PImpl, AC, BB->getModule()).getValueInBlock(V, BB, CxtI);
+ if (Result.isUnknown())
return ConstantRange::getEmpty(Width);
- if (Result.isConstantRange())
- return Result.getConstantRange();
+ if (Result.isConstantRange(UndefAllowed))
+ return Result.getConstantRange(UndefAllowed);
// We represent ConstantInt constants as constant ranges but other kinds
// of integer constants, i.e. ConstantExpr will be tagged as constants
assert(!(Result.isConstant() && isa<ConstantInt>(Result.getConstant())) &&
@@ -1738,9 +1627,9 @@ ConstantRange LazyValueInfo::getConstantRange(Value *V, BasicBlock *BB,
Constant *LazyValueInfo::getConstantOnEdge(Value *V, BasicBlock *FromBB,
BasicBlock *ToBB,
Instruction *CxtI) {
- const DataLayout &DL = FromBB->getModule()->getDataLayout();
+ Module *M = FromBB->getModule();
ValueLatticeElement Result =
- getImpl(PImpl, AC, &DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+ getImpl(PImpl, AC, M).getValueOnEdge(V, FromBB, ToBB, CxtI);
if (Result.isConstant())
return Result.getConstant();
@@ -1757,11 +1646,11 @@ ConstantRange LazyValueInfo::getConstantRangeOnEdge(Value *V,
BasicBlock *ToBB,
Instruction *CxtI) {
unsigned Width = V->getType()->getIntegerBitWidth();
- const DataLayout &DL = FromBB->getModule()->getDataLayout();
+ Module *M = FromBB->getModule();
ValueLatticeElement Result =
- getImpl(PImpl, AC, &DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+ getImpl(PImpl, AC, M).getValueOnEdge(V, FromBB, ToBB, CxtI);
- if (Result.isUndefined())
+ if (Result.isUnknown())
return ConstantRange::getEmpty(Width);
if (Result.isConstantRange())
return Result.getConstantRange();
@@ -1843,11 +1732,11 @@ LazyValueInfo::Tristate
LazyValueInfo::getPredicateOnEdge(unsigned Pred, Value *V, Constant *C,
BasicBlock *FromBB, BasicBlock *ToBB,
Instruction *CxtI) {
- const DataLayout &DL = FromBB->getModule()->getDataLayout();
+ Module *M = FromBB->getModule();
ValueLatticeElement Result =
- getImpl(PImpl, AC, &DL, DT).getValueOnEdge(V, FromBB, ToBB, CxtI);
+ getImpl(PImpl, AC, M).getValueOnEdge(V, FromBB, ToBB, CxtI);
- return getPredicateResult(Pred, C, Result, DL, TLI);
+ return getPredicateResult(Pred, C, Result, M->getDataLayout(), TLI);
}
LazyValueInfo::Tristate
@@ -1857,7 +1746,8 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
// isKnownNonZero can tell us the result of the predicate, we can
// return it quickly. But this is only a fastpath, and falling
// through would still be correct.
- const DataLayout &DL = CxtI->getModule()->getDataLayout();
+ Module *M = CxtI->getModule();
+ const DataLayout &DL = M->getDataLayout();
if (V->getType()->isPointerTy() && C->isNullValue() &&
isKnownNonZero(V->stripPointerCastsSameRepresentation(), DL)) {
if (Pred == ICmpInst::ICMP_EQ)
@@ -1865,7 +1755,7 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
else if (Pred == ICmpInst::ICMP_NE)
return LazyValueInfo::True;
}
- ValueLatticeElement Result = getImpl(PImpl, AC, &DL, DT).getValueAt(V, CxtI);
+ ValueLatticeElement Result = getImpl(PImpl, AC, M).getValueAt(V, CxtI);
Tristate Ret = getPredicateResult(Pred, C, Result, DL, TLI);
if (Ret != Unknown)
return Ret;
@@ -1954,35 +1844,24 @@ LazyValueInfo::getPredicateAt(unsigned Pred, Value *V, Constant *C,
void LazyValueInfo::threadEdge(BasicBlock *PredBB, BasicBlock *OldSucc,
BasicBlock *NewSucc) {
if (PImpl) {
- const DataLayout &DL = PredBB->getModule()->getDataLayout();
- getImpl(PImpl, AC, &DL, DT).threadEdge(PredBB, OldSucc, NewSucc);
+ getImpl(PImpl, AC, PredBB->getModule())
+ .threadEdge(PredBB, OldSucc, NewSucc);
}
}
void LazyValueInfo::eraseBlock(BasicBlock *BB) {
if (PImpl) {
- const DataLayout &DL = BB->getModule()->getDataLayout();
- getImpl(PImpl, AC, &DL, DT).eraseBlock(BB);
+ getImpl(PImpl, AC, BB->getModule()).eraseBlock(BB);
}
}
void LazyValueInfo::printLVI(Function &F, DominatorTree &DTree, raw_ostream &OS) {
if (PImpl) {
- getImpl(PImpl, AC, DL, DT).printLVI(F, DTree, OS);
+ getImpl(PImpl, AC, F.getParent()).printLVI(F, DTree, OS);
}
}
-void LazyValueInfo::disableDT() {
- if (PImpl)
- getImpl(PImpl, AC, DL, DT).disableDT();
-}
-
-void LazyValueInfo::enableDT() {
- if (PImpl)
- getImpl(PImpl, AC, DL, DT).enableDT();
-}
-
// Print the LVI for the function arguments at the start of each basic block.
void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot(
const BasicBlock *BB, formatted_raw_ostream &OS) {
@@ -1991,7 +1870,7 @@ void LazyValueInfoAnnotatedWriter::emitBasicBlockStartAnnot(
for (auto &Arg : F->args()) {
ValueLatticeElement Result = LVIImpl->getValueInBlock(
const_cast<Argument *>(&Arg), const_cast<BasicBlock *>(BB));
- if (Result.isUndefined())
+ if (Result.isUnknown())
continue;
OS << "; LatticeVal for: '" << Arg << "' is: " << Result << "\n";
}
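With the DominatorTree plumbing removed, clients query LazyValueInfo without a DT, and getConstantRange can be asked for a range that does not come from the "including undef" lattice state (cf. isConstantRangeIncludingUndef above). A hypothetical caller under those assumptions; the header signature is inferred from the definition in the hunk and the helper name is made up.

    #include "llvm/Analysis/LazyValueInfo.h"
    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/ConstantRange.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    // Sketch: decide whether V is provably non-negative at the end of BB.
    static bool isAlwaysNonNegative(LazyValueInfo &LVI, Value *V, BasicBlock *BB,
                                    Instruction *CxtI) {
      if (!V->getType()->isIntegerTy())
        return false;
      // UndefAllowed=false: only accept a range that does not rely on picking a
      // convenient value for a possible undef.
      ConstantRange CR = LVI.getConstantRange(V, BB, CxtI, /*UndefAllowed=*/false);
      return CR.isAllNonNegative();
    }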
diff --git a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
index 0f274429f11f..10ead1019206 100644
--- a/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
+++ b/llvm/lib/Analysis/LegacyDivergenceAnalysis.cpp
@@ -301,14 +301,13 @@ FunctionPass *llvm::createLegacyDivergenceAnalysisPass() {
void LegacyDivergenceAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<PostDominatorTreeWrapperPass>();
- if (UseGPUDA)
- AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
AU.setPreservesAll();
}
bool LegacyDivergenceAnalysis::shouldUseGPUDivergenceAnalysis(
- const Function &F) const {
- if (!UseGPUDA)
+ const Function &F, const TargetTransformInfo &TTI) const {
+ if (!(UseGPUDA || TTI.useGPUDivergenceAnalysis()))
return false;
// GPUDivergenceAnalysis requires a reducible CFG.
@@ -337,7 +336,7 @@ bool LegacyDivergenceAnalysis::runOnFunction(Function &F) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- if (shouldUseGPUDivergenceAnalysis(F)) {
+ if (shouldUseGPUDivergenceAnalysis(F, TTI)) {
// run the new GPU divergence analysis
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
gpuDA = std::make_unique<GPUDivergenceAnalysis>(F, DT, PDT, LI, TTI);
diff --git a/llvm/lib/Analysis/Lint.cpp b/llvm/lib/Analysis/Lint.cpp
index ba945eb4318f..564c00dbad98 100644
--- a/llvm/lib/Analysis/Lint.cpp
+++ b/llvm/lib/Analysis/Lint.cpp
@@ -49,7 +49,6 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -93,15 +92,12 @@ namespace {
void visitFunction(Function &F);
- void visitCallSite(CallSite CS);
- void visitMemoryReference(Instruction &I, Value *Ptr,
- uint64_t Size, unsigned Align,
- Type *Ty, unsigned Flags);
+ void visitCallBase(CallBase &CB);
+ void visitMemoryReference(Instruction &I, Value *Ptr, uint64_t Size,
+ MaybeAlign Alignment, Type *Ty, unsigned Flags);
void visitEHBeginCatch(IntrinsicInst *II);
void visitEHEndCatch(IntrinsicInst *II);
- void visitCallInst(CallInst &I);
- void visitInvokeInst(InvokeInst &I);
void visitReturnInst(ReturnInst &I);
void visitLoadInst(LoadInst &I);
void visitStoreInst(StoreInst &I);
@@ -222,21 +218,20 @@ void Lint::visitFunction(Function &F) {
// TODO: Check for irreducible control flow.
}
-void Lint::visitCallSite(CallSite CS) {
- Instruction &I = *CS.getInstruction();
- Value *Callee = CS.getCalledValue();
+void Lint::visitCallBase(CallBase &I) {
+ Value *Callee = I.getCalledOperand();
- visitMemoryReference(I, Callee, MemoryLocation::UnknownSize, 0, nullptr,
+ visitMemoryReference(I, Callee, MemoryLocation::UnknownSize, None, nullptr,
MemRef::Callee);
if (Function *F = dyn_cast<Function>(findValue(Callee,
/*OffsetOk=*/false))) {
- Assert(CS.getCallingConv() == F->getCallingConv(),
+ Assert(I.getCallingConv() == F->getCallingConv(),
"Undefined behavior: Caller and callee calling convention differ",
&I);
FunctionType *FT = F->getFunctionType();
- unsigned NumActualArgs = CS.arg_size();
+ unsigned NumActualArgs = I.arg_size();
Assert(FT->isVarArg() ? FT->getNumParams() <= NumActualArgs
: FT->getNumParams() == NumActualArgs,
@@ -252,7 +247,7 @@ void Lint::visitCallSite(CallSite CS) {
// Check argument types (in case the callee was casted) and attributes.
// TODO: Verify that caller and callee attributes are compatible.
Function::arg_iterator PI = F->arg_begin(), PE = F->arg_end();
- CallSite::arg_iterator AI = CS.arg_begin(), AE = CS.arg_end();
+ auto AI = I.arg_begin(), AE = I.arg_end();
for (; AI != AE; ++AI) {
Value *Actual = *AI;
if (PI != PE) {
@@ -266,16 +261,15 @@ void Lint::visitCallSite(CallSite CS) {
// not fully precise because we don't know the sizes of the dereferenced
// memory regions.
if (Formal->hasNoAliasAttr() && Actual->getType()->isPointerTy()) {
- AttributeList PAL = CS.getAttributes();
+ AttributeList PAL = I.getAttributes();
unsigned ArgNo = 0;
- for (CallSite::arg_iterator BI = CS.arg_begin(); BI != AE;
- ++BI, ++ArgNo) {
+ for (auto BI = I.arg_begin(); BI != AE; ++BI, ++ArgNo) {
// Skip ByVal arguments since they will be memcpy'd to the callee's
// stack so we're not really passing the pointer anyway.
if (PAL.hasParamAttribute(ArgNo, Attribute::ByVal))
continue;
// If both arguments are readonly, they have no dependence.
- if (Formal->onlyReadsMemory() && CS.onlyReadsMemory(ArgNo))
+ if (Formal->onlyReadsMemory() && I.onlyReadsMemory(ArgNo))
continue;
if (AI != BI && (*BI)->getType()->isPointerTy()) {
AliasResult Result = AA->alias(*AI, *BI);
@@ -290,19 +284,18 @@ void Lint::visitCallSite(CallSite CS) {
Type *Ty =
cast<PointerType>(Formal->getType())->getElementType();
visitMemoryReference(I, Actual, DL->getTypeStoreSize(Ty),
- DL->getABITypeAlignment(Ty), Ty,
+ DL->getABITypeAlign(Ty), Ty,
MemRef::Read | MemRef::Write);
}
}
}
}
- if (CS.isCall()) {
- const CallInst *CI = cast<CallInst>(CS.getInstruction());
+ if (const auto *CI = dyn_cast<CallInst>(&I)) {
if (CI->isTailCall()) {
const AttributeList &PAL = CI->getAttributes();
unsigned ArgNo = 0;
- for (Value *Arg : CS.args()) {
+ for (Value *Arg : I.args()) {
// Skip ByVal arguments since they will be memcpy'd to the callee's
// stack anyway.
if (PAL.hasParamAttribute(ArgNo++, Attribute::ByVal))
@@ -327,9 +320,9 @@ void Lint::visitCallSite(CallSite CS) {
MemCpyInst *MCI = cast<MemCpyInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MCI->getDest(), MemoryLocation::UnknownSize,
- MCI->getDestAlignment(), nullptr, MemRef::Write);
+ MCI->getDestAlign(), nullptr, MemRef::Write);
visitMemoryReference(I, MCI->getSource(), MemoryLocation::UnknownSize,
- MCI->getSourceAlignment(), nullptr, MemRef::Read);
+ MCI->getSourceAlign(), nullptr, MemRef::Read);
// Check that the memcpy arguments don't overlap. The AliasAnalysis API
// isn't expressive enough for what we really want to do. Known partial
@@ -345,20 +338,36 @@ void Lint::visitCallSite(CallSite CS) {
"Undefined behavior: memcpy source and destination overlap", &I);
break;
}
+ case Intrinsic::memcpy_inline: {
+ MemCpyInlineInst *MCII = cast<MemCpyInlineInst>(&I);
+ const uint64_t Size = MCII->getLength()->getValue().getLimitedValue();
+ visitMemoryReference(I, MCII->getDest(), Size, MCII->getDestAlign(),
+ nullptr, MemRef::Write);
+ visitMemoryReference(I, MCII->getSource(), Size, MCII->getSourceAlign(),
+ nullptr, MemRef::Read);
+
+ // Check that the memcpy arguments don't overlap. The AliasAnalysis API
+ // isn't expressive enough for what we really want to do. Known partial
+ // overlap is not distinguished from the case where nothing is known.
+ const LocationSize LS = LocationSize::precise(Size);
+ Assert(AA->alias(MCII->getSource(), LS, MCII->getDest(), LS) != MustAlias,
+ "Undefined behavior: memcpy source and destination overlap", &I);
+ break;
+ }
case Intrinsic::memmove: {
MemMoveInst *MMI = cast<MemMoveInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MMI->getDest(), MemoryLocation::UnknownSize,
- MMI->getDestAlignment(), nullptr, MemRef::Write);
+ MMI->getDestAlign(), nullptr, MemRef::Write);
visitMemoryReference(I, MMI->getSource(), MemoryLocation::UnknownSize,
- MMI->getSourceAlignment(), nullptr, MemRef::Read);
+ MMI->getSourceAlign(), nullptr, MemRef::Read);
break;
}
case Intrinsic::memset: {
MemSetInst *MSI = cast<MemSetInst>(&I);
// TODO: If the size is known, use it.
visitMemoryReference(I, MSI->getDest(), MemoryLocation::UnknownSize,
- MSI->getDestAlignment(), nullptr, MemRef::Write);
+ MSI->getDestAlign(), nullptr, MemRef::Write);
break;
}
@@ -367,38 +376,30 @@ void Lint::visitCallSite(CallSite CS) {
"Undefined behavior: va_start called in a non-varargs function",
&I);
- visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0,
- nullptr, MemRef::Read | MemRef::Write);
+ visitMemoryReference(I, I.getArgOperand(0), MemoryLocation::UnknownSize,
+ None, nullptr, MemRef::Read | MemRef::Write);
break;
case Intrinsic::vacopy:
- visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0,
- nullptr, MemRef::Write);
- visitMemoryReference(I, CS.getArgument(1), MemoryLocation::UnknownSize, 0,
- nullptr, MemRef::Read);
+ visitMemoryReference(I, I.getArgOperand(0), MemoryLocation::UnknownSize,
+ None, nullptr, MemRef::Write);
+ visitMemoryReference(I, I.getArgOperand(1), MemoryLocation::UnknownSize,
+ None, nullptr, MemRef::Read);
break;
case Intrinsic::vaend:
- visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0,
- nullptr, MemRef::Read | MemRef::Write);
+ visitMemoryReference(I, I.getArgOperand(0), MemoryLocation::UnknownSize,
+ None, nullptr, MemRef::Read | MemRef::Write);
break;
case Intrinsic::stackrestore:
// Stackrestore doesn't read or write memory, but it sets the
// stack pointer, which the compiler may read from or write to
// at any time, so check it for both readability and writeability.
- visitMemoryReference(I, CS.getArgument(0), MemoryLocation::UnknownSize, 0,
- nullptr, MemRef::Read | MemRef::Write);
+ visitMemoryReference(I, I.getArgOperand(0), MemoryLocation::UnknownSize,
+ None, nullptr, MemRef::Read | MemRef::Write);
break;
}
}
-void Lint::visitCallInst(CallInst &I) {
- return visitCallSite(&I);
-}
-
-void Lint::visitInvokeInst(InvokeInst &I) {
- return visitCallSite(&I);
-}
-
void Lint::visitReturnInst(ReturnInst &I) {
Function *F = I.getParent()->getParent();
Assert(!F->doesNotReturn(),
@@ -412,9 +413,8 @@ void Lint::visitReturnInst(ReturnInst &I) {
// TODO: Check that the reference is in bounds.
// TODO: Check readnone/readonly function attributes.
-void Lint::visitMemoryReference(Instruction &I,
- Value *Ptr, uint64_t Size, unsigned Align,
- Type *Ty, unsigned Flags) {
+void Lint::visitMemoryReference(Instruction &I, Value *Ptr, uint64_t Size,
+ MaybeAlign Align, Type *Ty, unsigned Flags) {
// If no memory is being referenced, it doesn't matter if the pointer
// is valid.
if (Size == 0)
@@ -465,15 +465,13 @@ void Lint::visitMemoryReference(Instruction &I,
// something we can handle and if so extract the size of this base object
// along with its alignment.
uint64_t BaseSize = MemoryLocation::UnknownSize;
- unsigned BaseAlign = 0;
+ MaybeAlign BaseAlign;
if (AllocaInst *AI = dyn_cast<AllocaInst>(Base)) {
Type *ATy = AI->getAllocatedType();
if (!AI->isArrayAllocation() && ATy->isSized())
BaseSize = DL->getTypeAllocSize(ATy);
- BaseAlign = AI->getAlignment();
- if (BaseAlign == 0 && ATy->isSized())
- BaseAlign = DL->getABITypeAlignment(ATy);
+ BaseAlign = AI->getAlign();
} else if (GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
// If the global may be defined differently in another compilation unit
// then don't warn about funky memory accesses.
@@ -481,9 +479,9 @@ void Lint::visitMemoryReference(Instruction &I,
Type *GTy = GV->getValueType();
if (GTy->isSized())
BaseSize = DL->getTypeAllocSize(GTy);
- BaseAlign = GV->getAlignment();
- if (BaseAlign == 0 && GTy->isSized())
- BaseAlign = DL->getABITypeAlignment(GTy);
+ BaseAlign = GV->getAlign();
+ if (!BaseAlign && GTy->isSized())
+ BaseAlign = DL->getABITypeAlign(GTy);
}
}
@@ -496,24 +494,24 @@ void Lint::visitMemoryReference(Instruction &I,
// Accesses that say that the memory is more aligned than it is are not
// defined.
- if (Align == 0 && Ty && Ty->isSized())
- Align = DL->getABITypeAlignment(Ty);
- Assert(!BaseAlign || Align <= MinAlign(BaseAlign, Offset),
- "Undefined behavior: Memory reference address is misaligned", &I);
+ if (!Align && Ty && Ty->isSized())
+ Align = DL->getABITypeAlign(Ty);
+ if (BaseAlign && Align)
+ Assert(*Align <= commonAlignment(*BaseAlign, Offset),
+ "Undefined behavior: Memory reference address is misaligned", &I);
}
}
void Lint::visitLoadInst(LoadInst &I) {
visitMemoryReference(I, I.getPointerOperand(),
- DL->getTypeStoreSize(I.getType()), I.getAlignment(),
+ DL->getTypeStoreSize(I.getType()), I.getAlign(),
I.getType(), MemRef::Read);
}
void Lint::visitStoreInst(StoreInst &I) {
visitMemoryReference(I, I.getPointerOperand(),
DL->getTypeStoreSize(I.getOperand(0)->getType()),
- I.getAlignment(),
- I.getOperand(0)->getType(), MemRef::Write);
+ I.getAlign(), I.getOperand(0)->getType(), MemRef::Write);
}
void Lint::visitXor(BinaryOperator &I) {
@@ -612,12 +610,12 @@ void Lint::visitAllocaInst(AllocaInst &I) {
}
void Lint::visitVAArgInst(VAArgInst &I) {
- visitMemoryReference(I, I.getOperand(0), MemoryLocation::UnknownSize, 0,
+ visitMemoryReference(I, I.getOperand(0), MemoryLocation::UnknownSize, None,
nullptr, MemRef::Read | MemRef::Write);
}
void Lint::visitIndirectBrInst(IndirectBrInst &I) {
- visitMemoryReference(I, I.getAddress(), MemoryLocation::UnknownSize, 0,
+ visitMemoryReference(I, I.getAddress(), MemoryLocation::UnknownSize, None,
nullptr, MemRef::Branchee);
Assert(I.getNumDestinations() != 0,
@@ -689,8 +687,7 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk,
}
} else if (PHINode *PN = dyn_cast<PHINode>(V)) {
if (Value *W = PN->hasConstantValue())
- if (W != V)
- return findValueImpl(W, OffsetOk, Visited);
+ return findValueImpl(W, OffsetOk, Visited);
} else if (CastInst *CI = dyn_cast<CastInst>(V)) {
if (CI->isNoopCast(*DL))
return findValueImpl(CI->getOperand(0), OffsetOk, Visited);
@@ -719,9 +716,9 @@ Value *Lint::findValueImpl(Value *V, bool OffsetOk,
if (Value *W = SimplifyInstruction(Inst, {*DL, TLI, DT, AC}))
return findValueImpl(W, OffsetOk, Visited);
} else if (auto *C = dyn_cast<Constant>(V)) {
- if (Value *W = ConstantFoldConstant(C, *DL, TLI))
- if (W && W != V)
- return findValueImpl(W, OffsetOk, Visited);
+ Value *W = ConstantFoldConstant(C, *DL, TLI);
+ if (W != V)
+ return findValueImpl(W, OffsetOk, Visited);
}
return V;
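A minimal standalone sketch of the alignment rule the reworked Lint check above enforces (plain C++, not the LLVM API; commonAlignment is re-implemented locally for illustration, and note the check now only fires when both the base alignment and the access alignment are known):

// Editorial sketch, not part of the diff: models the Lint misalignment rule.
#include <cstdint>
#include <iostream>

// Largest power-of-two alignment guaranteed at (Base + Offset) when the base
// itself is BaseAlign-aligned; mirrors the behavior of llvm::commonAlignment.
static uint64_t commonAlignment(uint64_t BaseAlign, uint64_t Offset) {
  if (Offset == 0)
    return BaseAlign;
  uint64_t OffsetAlign = Offset & (~Offset + 1); // largest 2^k dividing Offset
  return BaseAlign < OffsetAlign ? BaseAlign : OffsetAlign;
}

int main() {
  uint64_t BaseAlign = 16;    // e.g. an alloca known to be 16-byte aligned
  uint64_t Offset = 4;        // constant byte offset of the access
  uint64_t DeclaredAlign = 8; // alignment the load/store claims

  bool Misaligned = DeclaredAlign > commonAlignment(BaseAlign, Offset);
  std::cout << (Misaligned ? "undefined behavior: misaligned access\n"
                           : "alignment claim is satisfiable\n");
}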
diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp
index a7d07c0b6183..e5245225d905 100644
--- a/llvm/lib/Analysis/Loads.cpp
+++ b/llvm/lib/Analysis/Loads.cpp
@@ -27,24 +27,12 @@
using namespace llvm;
-static MaybeAlign getBaseAlign(const Value *Base, const DataLayout &DL) {
- if (const MaybeAlign PA = Base->getPointerAlignment(DL))
- return *PA;
- Type *const Ty = Base->getType()->getPointerElementType();
- if (!Ty->isSized())
- return None;
- return Align(DL.getABITypeAlignment(Ty));
-}
-
static bool isAligned(const Value *Base, const APInt &Offset, Align Alignment,
const DataLayout &DL) {
- if (MaybeAlign BA = getBaseAlign(Base, DL)) {
- const APInt APBaseAlign(Offset.getBitWidth(), BA->value());
- const APInt APAlign(Offset.getBitWidth(), Alignment.value());
- assert(APAlign.isPowerOf2() && "must be a power of 2!");
- return APBaseAlign.uge(APAlign) && !(Offset & (APAlign - 1));
- }
- return false;
+ Align BA = Base->getPointerAlignment(DL);
+ const APInt APAlign(Offset.getBitWidth(), Alignment.value());
+ assert(APAlign.isPowerOf2() && "must be a power of 2!");
+ return BA >= Alignment && !(Offset & (APAlign - 1));
}
/// Test if V is always a pointer to allocated and suitably aligned memory for
@@ -52,7 +40,13 @@ static bool isAligned(const Value *Base, const APInt &Offset, Align Alignment,
static bool isDereferenceableAndAlignedPointer(
const Value *V, Align Alignment, const APInt &Size, const DataLayout &DL,
const Instruction *CtxI, const DominatorTree *DT,
- SmallPtrSetImpl<const Value *> &Visited) {
+ SmallPtrSetImpl<const Value *> &Visited, unsigned MaxDepth) {
+ assert(V->getType()->isPointerTy() && "Base must be pointer");
+
+ // Recursion limit.
+ if (MaxDepth-- == 0)
+ return false;
+
// Already visited? Bail out, we've likely hit unreachable code.
if (!Visited.insert(V).second)
return false;
@@ -61,9 +55,11 @@ static bool isDereferenceableAndAlignedPointer(
// malloc may return null.
// bitcast instructions are no-ops as far as dereferenceability is concerned.
- if (const BitCastOperator *BC = dyn_cast<BitCastOperator>(V))
- return isDereferenceableAndAlignedPointer(BC->getOperand(0), Alignment,
- Size, DL, CtxI, DT, Visited);
+ if (const BitCastOperator *BC = dyn_cast<BitCastOperator>(V)) {
+ if (BC->getSrcTy()->isPointerTy())
+ return isDereferenceableAndAlignedPointer(
+ BC->getOperand(0), Alignment, Size, DL, CtxI, DT, Visited, MaxDepth);
+ }
bool CheckForNonNull = false;
APInt KnownDerefBytes(Size.getBitWidth(),
@@ -72,7 +68,7 @@ static bool isDereferenceableAndAlignedPointer(
if (!CheckForNonNull || isKnownNonZero(V, DL, 0, nullptr, CtxI, DT)) {
// As we recursed through GEPs to get here, we've incrementally checked
// that each step advanced by a multiple of the alignment. If our base is
- // properly aligned, then the original offset accessed must also be.
+ // properly aligned, then the original offset accessed must also be.
Type *Ty = V->getType();
assert(Ty->isSized() && "must be sized");
APInt Offset(DL.getTypeStoreSizeInBits(Ty), 0);
@@ -99,22 +95,22 @@ static bool isDereferenceableAndAlignedPointer(
// addrspacecast, so we can't do arithmetic directly on the APInt values.
return isDereferenceableAndAlignedPointer(
Base, Alignment, Offset + Size.sextOrTrunc(Offset.getBitWidth()), DL,
- CtxI, DT, Visited);
+ CtxI, DT, Visited, MaxDepth);
}
// For gc.relocate, look through relocations
if (const GCRelocateInst *RelocateInst = dyn_cast<GCRelocateInst>(V))
return isDereferenceableAndAlignedPointer(
- RelocateInst->getDerivedPtr(), Alignment, Size, DL, CtxI, DT, Visited);
+ RelocateInst->getDerivedPtr(), Alignment, Size, DL, CtxI, DT, Visited, MaxDepth);
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(V))
return isDereferenceableAndAlignedPointer(ASC->getOperand(0), Alignment,
- Size, DL, CtxI, DT, Visited);
+ Size, DL, CtxI, DT, Visited, MaxDepth);
if (const auto *Call = dyn_cast<CallBase>(V))
if (auto *RP = getArgumentAliasingToReturnedPointer(Call, true))
return isDereferenceableAndAlignedPointer(RP, Alignment, Size, DL, CtxI,
- DT, Visited);
+ DT, Visited, MaxDepth);
// If we don't know, assume the worst.
return false;
@@ -128,11 +124,11 @@ bool llvm::isDereferenceableAndAlignedPointer(const Value *V, Align Alignment,
// Note: At the moment, Size can be zero. This ends up being interpreted as
// a query of whether [Base, V] is dereferenceable and V is aligned (since
// that's what the implementation happened to do). It's unclear if this is
- // the desired semantic, but at least SelectionDAG does exercise this case.
-
+ // the desired semantic, but at least SelectionDAG does exercise this case.
+
SmallPtrSet<const Value *, 32> Visited;
return ::isDereferenceableAndAlignedPointer(V, Alignment, Size, DL, CtxI, DT,
- Visited);
+ Visited, 16);
}
bool llvm::isDereferenceableAndAlignedPointer(const Value *V, Type *Ty,
@@ -140,9 +136,11 @@ bool llvm::isDereferenceableAndAlignedPointer(const Value *V, Type *Ty,
const DataLayout &DL,
const Instruction *CtxI,
const DominatorTree *DT) {
- if (!Ty->isSized())
+ // For unsized types or scalable vectors we don't know exactly how many bytes
+ // are dereferenced, so bail out.
+ if (!Ty->isSized() || isa<ScalableVectorType>(Ty))
return false;
-
+
// When dereferenceability information is provided by a dereferenceable
// attribute, we know exactly how many bytes are dereferenceable. If we can
// determine the exact offset to the attributed variable, we can use that
@@ -160,7 +158,7 @@ bool llvm::isDereferenceablePointer(const Value *V, Type *Ty,
const DataLayout &DL,
const Instruction *CtxI,
const DominatorTree *DT) {
- return isDereferenceableAndAlignedPointer(V, Ty, Align::None(), DL, CtxI, DT);
+ return isDereferenceableAndAlignedPointer(V, Ty, Align(1), DL, CtxI, DT);
}
/// Test if A and B will obviously have the same value.
@@ -202,8 +200,7 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
APInt EltSize(DL.getIndexTypeSizeInBits(Ptr->getType()),
DL.getTypeStoreSize(LI->getType()));
- const Align Alignment = DL.getValueOrABITypeAlignment(
- MaybeAlign(LI->getAlignment()), LI->getType());
+ const Align Alignment = LI->getAlign();
Instruction *HeaderFirstNonPHI = L->getHeader()->getFirstNonPHI();
@@ -259,14 +256,10 @@ bool llvm::isDereferenceableAndAlignedInLoop(LoadInst *LI, Loop *L,
///
/// This uses the pointee type to determine how many bytes need to be safe to
/// load from the pointer.
-bool llvm::isSafeToLoadUnconditionally(Value *V, MaybeAlign MA, APInt &Size,
+bool llvm::isSafeToLoadUnconditionally(Value *V, Align Alignment, APInt &Size,
const DataLayout &DL,
Instruction *ScanFrom,
const DominatorTree *DT) {
- // Zero alignment means that the load has the ABI alignment for the target
- const Align Alignment =
- DL.getValueOrABITypeAlignment(MA, V->getType()->getPointerElementType());
-
// If DT is not specified we can't make context-sensitive query
const Instruction* CtxI = DT ? ScanFrom : nullptr;
if (isDereferenceableAndAlignedPointer(V, Alignment, Size, DL, CtxI, DT))
@@ -301,7 +294,8 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, MaybeAlign MA, APInt &Size,
return false;
Value *AccessedPtr;
- MaybeAlign MaybeAccessedAlign;
+ Type *AccessedTy;
+ Align AccessedAlign;
if (LoadInst *LI = dyn_cast<LoadInst>(BBI)) {
// Ignore volatile loads. The execution of a volatile load cannot
// be used to prove an address is backed by regular memory; it can,
@@ -309,20 +303,18 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, MaybeAlign MA, APInt &Size,
if (LI->isVolatile())
continue;
AccessedPtr = LI->getPointerOperand();
- MaybeAccessedAlign = MaybeAlign(LI->getAlignment());
+ AccessedTy = LI->getType();
+ AccessedAlign = LI->getAlign();
} else if (StoreInst *SI = dyn_cast<StoreInst>(BBI)) {
// Ignore volatile stores (see comment for loads).
if (SI->isVolatile())
continue;
AccessedPtr = SI->getPointerOperand();
- MaybeAccessedAlign = MaybeAlign(SI->getAlignment());
+ AccessedTy = SI->getValueOperand()->getType();
+ AccessedAlign = SI->getAlign();
} else
continue;
- Type *AccessedTy = AccessedPtr->getType()->getPointerElementType();
-
- const Align AccessedAlign =
- DL.getValueOrABITypeAlignment(MaybeAccessedAlign, AccessedTy);
if (AccessedAlign < Alignment)
continue;
@@ -338,7 +330,7 @@ bool llvm::isSafeToLoadUnconditionally(Value *V, MaybeAlign MA, APInt &Size,
return false;
}
-bool llvm::isSafeToLoadUnconditionally(Value *V, Type *Ty, MaybeAlign Alignment,
+bool llvm::isSafeToLoadUnconditionally(Value *V, Type *Ty, Align Alignment,
const DataLayout &DL,
Instruction *ScanFrom,
const DominatorTree *DT) {
@@ -362,7 +354,7 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
BasicBlock *ScanBB,
BasicBlock::iterator &ScanFrom,
unsigned MaxInstsToScan,
- AliasAnalysis *AA, bool *IsLoad,
+ AAResults *AA, bool *IsLoad,
unsigned *NumScanedInst) {
// Don't CSE load that is volatile or anything stronger than unordered.
if (!Load->isUnordered())
@@ -373,11 +365,33 @@ Value *llvm::FindAvailableLoadedValue(LoadInst *Load,
ScanFrom, MaxInstsToScan, AA, IsLoad, NumScanedInst);
}
+// Check if the load and the store have the same base, constant offsets and
+// non-overlapping access ranges.
+static bool AreNonOverlapSameBaseLoadAndStore(
+ Value *LoadPtr, Type *LoadTy, Value *StorePtr, Type *StoreTy,
+ const DataLayout &DL) {
+ APInt LoadOffset(DL.getTypeSizeInBits(LoadPtr->getType()), 0);
+ APInt StoreOffset(DL.getTypeSizeInBits(StorePtr->getType()), 0);
+ Value *LoadBase = LoadPtr->stripAndAccumulateConstantOffsets(
+ DL, LoadOffset, /* AllowNonInbounds */ false);
+ Value *StoreBase = StorePtr->stripAndAccumulateConstantOffsets(
+ DL, StoreOffset, /* AllowNonInbounds */ false);
+ if (LoadBase != StoreBase)
+ return false;
+ auto LoadAccessSize = LocationSize::precise(DL.getTypeStoreSize(LoadTy));
+ auto StoreAccessSize = LocationSize::precise(DL.getTypeStoreSize(StoreTy));
+ ConstantRange LoadRange(LoadOffset,
+ LoadOffset + LoadAccessSize.toRaw());
+ ConstantRange StoreRange(StoreOffset,
+ StoreOffset + StoreAccessSize.toRaw());
+ return LoadRange.intersectWith(StoreRange).isEmptySet();
+}
+
Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy,
bool AtLeastAtomic, BasicBlock *ScanBB,
BasicBlock::iterator &ScanFrom,
unsigned MaxInstsToScan,
- AliasAnalysis *AA, bool *IsLoadCSE,
+ AAResults *AA, bool *IsLoadCSE,
unsigned *NumScanedInst) {
if (MaxInstsToScan == 0)
MaxInstsToScan = ~0U;
@@ -451,10 +465,21 @@ Value *llvm::FindAvailablePtrLoadStore(Value *Ptr, Type *AccessTy,
StrippedPtr != StorePtr)
continue;
- // If we have alias analysis and it says the store won't modify the loaded
- // value, ignore the store.
- if (AA && !isModSet(AA->getModRefInfo(SI, StrippedPtr, AccessSize)))
- continue;
+ if (!AA) {
+ // When AA isn't available, but if the load and the store have the same
+ // base, constant offsets and non-overlapping access ranges, ignore the
+ // store. This is a simple form of alias analysis that is used by the
+ // inliner. FIXME: use BasicAA if possible.
+ if (AreNonOverlapSameBaseLoadAndStore(
+ Ptr, AccessTy, SI->getPointerOperand(),
+ SI->getValueOperand()->getType(), DL))
+ continue;
+ } else {
+ // If we have alias analysis and it says the store won't modify the
+ // loaded value, ignore the store.
+ if (!isModSet(AA->getModRefInfo(SI, StrippedPtr, AccessSize)))
+ continue;
+ }
// Otherwise the store that may or may not alias the pointer, bail out.
++ScanFrom;
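The no-AA fallback added to FindAvailablePtrLoadStore only ignores a store when the load and the store share the same base after stripping constant offsets and their accessed byte ranges are provably disjoint. A standalone sketch of that disjointness test, assuming the constant offsets and access sizes are already known (plain C++, not LLVM code):

// Editorial sketch, not part of the diff: disjointness of two same-base accesses.
#include <cstdint>
#include <iostream>

struct Access {
  const void *Base; // base object after stripping constant GEP offsets
  int64_t Offset;   // accumulated constant byte offset
  uint64_t Size;    // number of bytes accessed
};

static bool nonOverlappingSameBase(const Access &Load, const Access &Store) {
  if (Load.Base != Store.Base)
    return false; // different (or unknown) bases: nothing can be concluded
  int64_t LoadEnd = Load.Offset + static_cast<int64_t>(Load.Size);
  int64_t StoreEnd = Store.Offset + static_cast<int64_t>(Store.Size);
  // Half-open ranges [Offset, End) are disjoint iff one ends before the other starts.
  return LoadEnd <= Store.Offset || StoreEnd <= Load.Offset;
}

int main() {
  char Buf[32] = {};
  Access Load{Buf, 0, 8};   // load of Buf[0..8)
  Access Store{Buf, 16, 8}; // store to Buf[16..24)
  std::cout << (nonOverlappingSameBase(Load, Store)
                    ? "store cannot clobber the load\n"
                    : "store may clobber the load\n");
}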
diff --git a/llvm/lib/Analysis/LoopAccessAnalysis.cpp b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
index 26fa5112c29a..ae282a7a1095 100644
--- a/llvm/lib/Analysis/LoopAccessAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopAccessAnalysis.cpp
@@ -30,7 +30,6 @@
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -43,7 +42,6 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -174,6 +172,13 @@ const SCEV *llvm::replaceSymbolicStrideSCEV(PredicatedScalarEvolution &PSE,
return OrigSCEV;
}
+RuntimeCheckingPtrGroup::RuntimeCheckingPtrGroup(
+ unsigned Index, RuntimePointerChecking &RtCheck)
+ : RtCheck(RtCheck), High(RtCheck.Pointers[Index].End),
+ Low(RtCheck.Pointers[Index].Start) {
+ Members.push_back(Index);
+}
+
/// Calculate Start and End points of memory access.
/// Let's assume A is the first access and B is a memory access on N-th loop
/// iteration. Then B is calculated as:
@@ -231,14 +236,14 @@ void RuntimePointerChecking::insert(Loop *Lp, Value *Ptr, bool WritePtr,
Pointers.emplace_back(Ptr, ScStart, ScEnd, WritePtr, DepSetId, ASId, Sc);
}
-SmallVector<RuntimePointerChecking::PointerCheck, 4>
+SmallVector<RuntimePointerCheck, 4>
RuntimePointerChecking::generateChecks() const {
- SmallVector<PointerCheck, 4> Checks;
+ SmallVector<RuntimePointerCheck, 4> Checks;
for (unsigned I = 0; I < CheckingGroups.size(); ++I) {
for (unsigned J = I + 1; J < CheckingGroups.size(); ++J) {
- const RuntimePointerChecking::CheckingPtrGroup &CGI = CheckingGroups[I];
- const RuntimePointerChecking::CheckingPtrGroup &CGJ = CheckingGroups[J];
+ const RuntimeCheckingPtrGroup &CGI = CheckingGroups[I];
+ const RuntimeCheckingPtrGroup &CGJ = CheckingGroups[J];
if (needsChecking(CGI, CGJ))
Checks.push_back(std::make_pair(&CGI, &CGJ));
@@ -254,8 +259,8 @@ void RuntimePointerChecking::generateChecks(
Checks = generateChecks();
}
-bool RuntimePointerChecking::needsChecking(const CheckingPtrGroup &M,
- const CheckingPtrGroup &N) const {
+bool RuntimePointerChecking::needsChecking(
+ const RuntimeCheckingPtrGroup &M, const RuntimeCheckingPtrGroup &N) const {
for (unsigned I = 0, EI = M.Members.size(); EI != I; ++I)
for (unsigned J = 0, EJ = N.Members.size(); EJ != J; ++J)
if (needsChecking(M.Members[I], N.Members[J]))
@@ -277,7 +282,7 @@ static const SCEV *getMinFromExprs(const SCEV *I, const SCEV *J,
return I;
}
-bool RuntimePointerChecking::CheckingPtrGroup::addPointer(unsigned Index) {
+bool RuntimeCheckingPtrGroup::addPointer(unsigned Index) {
const SCEV *Start = RtCheck.Pointers[Index].Start;
const SCEV *End = RtCheck.Pointers[Index].End;
@@ -352,7 +357,7 @@ void RuntimePointerChecking::groupChecks(
// pointers to the same underlying object.
if (!UseDependencies) {
for (unsigned I = 0; I < Pointers.size(); ++I)
- CheckingGroups.push_back(CheckingPtrGroup(I, *this));
+ CheckingGroups.push_back(RuntimeCheckingPtrGroup(I, *this));
return;
}
@@ -378,7 +383,7 @@ void RuntimePointerChecking::groupChecks(
MemoryDepChecker::MemAccessInfo Access(Pointers[I].PointerValue,
Pointers[I].IsWritePtr);
- SmallVector<CheckingPtrGroup, 2> Groups;
+ SmallVector<RuntimeCheckingPtrGroup, 2> Groups;
auto LeaderI = DepCands.findValue(DepCands.getLeaderValue(Access));
// Because DepCands is constructed by visiting accesses in the order in
@@ -395,7 +400,7 @@ void RuntimePointerChecking::groupChecks(
// Go through all the existing sets and see if we can find one
// which can include this pointer.
- for (CheckingPtrGroup &Group : Groups) {
+ for (RuntimeCheckingPtrGroup &Group : Groups) {
// Don't perform more than a certain amount of comparisons.
// This should limit the cost of grouping the pointers to something
// reasonable. If we do end up hitting this threshold, the algorithm
@@ -415,7 +420,7 @@ void RuntimePointerChecking::groupChecks(
// We couldn't add this pointer to any existing set or the threshold
// for the number of comparisons has been reached. Create a new group
// to hold the current pointer.
- Groups.push_back(CheckingPtrGroup(Pointer, *this));
+ Groups.push_back(RuntimeCheckingPtrGroup(Pointer, *this));
}
// We've computed the grouped checks for this partition.
@@ -451,7 +456,7 @@ bool RuntimePointerChecking::needsChecking(unsigned I, unsigned J) const {
}
void RuntimePointerChecking::printChecks(
- raw_ostream &OS, const SmallVectorImpl<PointerCheck> &Checks,
+ raw_ostream &OS, const SmallVectorImpl<RuntimePointerCheck> &Checks,
unsigned Depth) const {
unsigned N = 0;
for (const auto &Check : Checks) {
@@ -500,7 +505,7 @@ public:
typedef PointerIntPair<Value *, 1, bool> MemAccessInfo;
typedef SmallVector<MemAccessInfo, 8> MemAccessInfoList;
- AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AliasAnalysis *AA,
+ AccessAnalysis(const DataLayout &Dl, Loop *TheLoop, AAResults *AA,
LoopInfo *LI, MemoryDepChecker::DepCandidates &DA,
PredicatedScalarEvolution &PSE)
: DL(Dl), TheLoop(TheLoop), AST(*AA), LI(LI), DepCands(DA),
@@ -700,18 +705,19 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// to place a runtime bound check.
bool CanDoRT = true;
- bool NeedRTCheck = false;
+ bool MayNeedRTCheck = false;
if (!IsRTCheckAnalysisNeeded) return true;
bool IsDepCheckNeeded = isDependencyCheckNeeded();
// We assign a consecutive id to access from different alias sets.
  // Accesses between different groups don't need to be checked.

- unsigned ASId = 1;
+ unsigned ASId = 0;
for (auto &AS : AST) {
int NumReadPtrChecks = 0;
int NumWritePtrChecks = 0;
bool CanDoAliasSetRT = true;
+ ++ASId;
// We assign consecutive id to access from different dependence sets.
// Accesses within the same set don't need a runtime check.
@@ -742,14 +748,30 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
// check them. But there is no need to checks if there is only one
// dependence set for this alias set.
//
- // Note that this function computes CanDoRT and NeedRTCheck independently.
- // For example CanDoRT=false, NeedRTCheck=false means that we have a pointer
- // for which we couldn't find the bounds but we don't actually need to emit
- // any checks so it does not matter.
+ // Note that this function computes CanDoRT and MayNeedRTCheck
+ // independently. For example CanDoRT=false, MayNeedRTCheck=false means that
+ // we have a pointer for which we couldn't find the bounds but we don't
+ // actually need to emit any checks so it does not matter.
bool NeedsAliasSetRTCheck = false;
- if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2))
+ if (!(IsDepCheckNeeded && CanDoAliasSetRT && RunningDepId == 2)) {
NeedsAliasSetRTCheck = (NumWritePtrChecks >= 2 ||
(NumReadPtrChecks >= 1 && NumWritePtrChecks >= 1));
+ // For alias sets without at least 2 writes or 1 write and 1 read, there
+ // is no need to generate RT checks and CanDoAliasSetRT for this alias set
+ // does not impact whether runtime checks can be generated.
+ if (!NeedsAliasSetRTCheck) {
+ assert((AS.size() <= 1 ||
+ all_of(AS,
+ [this](auto AC) {
+ MemAccessInfo AccessWrite(AC.getValue(), true);
+ return DepCands.findValue(AccessWrite) ==
+ DepCands.end();
+ })) &&
+ "Can only skip updating CanDoRT below, if all entries in AS "
+ "are reads or there is at most 1 entry");
+ continue;
+ }
+ }
// We need to perform run-time alias checks, but some pointers had bounds
// that couldn't be checked.
@@ -768,7 +790,7 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
}
CanDoRT &= CanDoAliasSetRT;
- NeedRTCheck |= NeedsAliasSetRTCheck;
+ MayNeedRTCheck |= NeedsAliasSetRTCheck;
++ASId;
}
@@ -802,15 +824,18 @@ bool AccessAnalysis::canCheckPtrAtRT(RuntimePointerChecking &RtCheck,
}
}
- if (NeedRTCheck && CanDoRT)
+ if (MayNeedRTCheck && CanDoRT)
RtCheck.generateChecks(DepCands, IsDepCheckNeeded);
LLVM_DEBUG(dbgs() << "LAA: We need to do " << RtCheck.getNumberOfChecks()
<< " pointer comparisons.\n");
- RtCheck.Need = NeedRTCheck;
+ // If we can do run-time checks, but there are no checks, no runtime checks
+ // are needed. This can happen when all pointers point to the same underlying
+ // object for example.
+ RtCheck.Need = CanDoRT ? RtCheck.getNumberOfChecks() != 0 : MayNeedRTCheck;
- bool CanDoRTIfNeeded = !NeedRTCheck || CanDoRT;
+ bool CanDoRTIfNeeded = !RtCheck.Need || CanDoRT;
if (!CanDoRTIfNeeded)
RtCheck.reset();
return CanDoRTIfNeeded;
@@ -1787,7 +1812,7 @@ bool LoopAccessInfo::canAnalyzeLoop() {
return true;
}
-void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
+void LoopAccessInfo::analyzeLoop(AAResults *AA, LoopInfo *LI,
const TargetLibraryInfo *TLI,
DominatorTree *DT) {
typedef SmallPtrSet<Value*, 16> ValueSet;
@@ -1810,6 +1835,10 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
const bool IsAnnotatedParallel = TheLoop->isAnnotatedParallel();
+ const bool EnableMemAccessVersioningOfLoop =
+ EnableMemAccessVersioning &&
+ !TheLoop->getHeader()->getParent()->hasOptSize();
+
// For each block.
for (BasicBlock *BB : TheLoop->blocks()) {
// Scan the BB and collect legal loads and stores. Also detect any
@@ -1845,7 +1874,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
// If the function has an explicit vectorized counterpart, we can safely
// assume that it can be vectorized.
if (Call && !Call->isNoBuiltin() && Call->getCalledFunction() &&
- TLI->isFunctionVectorizable(Call->getCalledFunction()->getName()))
+ !VFDatabase::getMappings(*Call).empty())
continue;
auto *Ld = dyn_cast<LoadInst>(&I);
@@ -1865,7 +1894,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
NumLoads++;
Loads.push_back(Ld);
DepChecker->addAccess(Ld);
- if (EnableMemAccessVersioning)
+ if (EnableMemAccessVersioningOfLoop)
collectStridedAccess(Ld);
continue;
}
@@ -1889,7 +1918,7 @@ void LoopAccessInfo::analyzeLoop(AliasAnalysis *AA, LoopInfo *LI,
NumStores++;
Stores.push_back(St);
DepChecker->addAccess(St);
- if (EnableMemAccessVersioning)
+ if (EnableMemAccessVersioningOfLoop)
collectStridedAccess(St);
}
} // Next instr.
@@ -2116,169 +2145,6 @@ bool LoopAccessInfo::isUniform(Value *V) const {
return (SE->isLoopInvariant(SE->getSCEV(V), TheLoop));
}
-// FIXME: this function is currently a duplicate of the one in
-// LoopVectorize.cpp.
-static Instruction *getFirstInst(Instruction *FirstInst, Value *V,
- Instruction *Loc) {
- if (FirstInst)
- return FirstInst;
- if (Instruction *I = dyn_cast<Instruction>(V))
- return I->getParent() == Loc->getParent() ? I : nullptr;
- return nullptr;
-}
-
-namespace {
-
-/// IR Values for the lower and upper bounds of a pointer evolution. We
-/// need to use value-handles because SCEV expansion can invalidate previously
-/// expanded values. Thus expansion of a pointer can invalidate the bounds for
-/// a previous one.
-struct PointerBounds {
- TrackingVH<Value> Start;
- TrackingVH<Value> End;
-};
-
-} // end anonymous namespace
-
-/// Expand code for the lower and upper bound of the pointer group \p CG
-/// in \p TheLoop. \return the values for the bounds.
-static PointerBounds
-expandBounds(const RuntimePointerChecking::CheckingPtrGroup *CG, Loop *TheLoop,
- Instruction *Loc, SCEVExpander &Exp, ScalarEvolution *SE,
- const RuntimePointerChecking &PtrRtChecking) {
- Value *Ptr = PtrRtChecking.Pointers[CG->Members[0]].PointerValue;
- const SCEV *Sc = SE->getSCEV(Ptr);
-
- unsigned AS = Ptr->getType()->getPointerAddressSpace();
- LLVMContext &Ctx = Loc->getContext();
-
- // Use this type for pointer arithmetic.
- Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
-
- if (SE->isLoopInvariant(Sc, TheLoop)) {
- LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
- << *Ptr << "\n");
- // Ptr could be in the loop body. If so, expand a new one at the correct
- // location.
- Instruction *Inst = dyn_cast<Instruction>(Ptr);
- Value *NewPtr = (Inst && TheLoop->contains(Inst))
- ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
- : Ptr;
- // We must return a half-open range, which means incrementing Sc.
- const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
- Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
- return {NewPtr, NewPtrPlusOne};
- } else {
- Value *Start = nullptr, *End = nullptr;
- LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
- Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
- End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
- LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
- << "\n");
- return {Start, End};
- }
-}
-
-/// Turns a collection of checks into a collection of expanded upper and
-/// lower bounds for both pointers in the check.
-static SmallVector<std::pair<PointerBounds, PointerBounds>, 4> expandBounds(
- const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks,
- Loop *L, Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp,
- const RuntimePointerChecking &PtrRtChecking) {
- SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
-
- // Here we're relying on the SCEV Expander's cache to only emit code for the
- // same bounds once.
- transform(
- PointerChecks, std::back_inserter(ChecksWithBounds),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
- PointerBounds
- First = expandBounds(Check.first, L, Loc, Exp, SE, PtrRtChecking),
- Second = expandBounds(Check.second, L, Loc, Exp, SE, PtrRtChecking);
- return std::make_pair(First, Second);
- });
-
- return ChecksWithBounds;
-}
-
-std::pair<Instruction *, Instruction *> LoopAccessInfo::addRuntimeChecks(
- Instruction *Loc,
- const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &PointerChecks)
- const {
- const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
- auto *SE = PSE->getSE();
- SCEVExpander Exp(*SE, DL, "induction");
- auto ExpandedChecks =
- expandBounds(PointerChecks, TheLoop, Loc, SE, Exp, *PtrRtChecking);
-
- LLVMContext &Ctx = Loc->getContext();
- Instruction *FirstInst = nullptr;
- IRBuilder<> ChkBuilder(Loc);
- // Our instructions might fold to a constant.
- Value *MemoryRuntimeCheck = nullptr;
-
- for (const auto &Check : ExpandedChecks) {
- const PointerBounds &A = Check.first, &B = Check.second;
- // Check if two pointers (A and B) conflict where conflict is computed as:
- // start(A) <= end(B) && start(B) <= end(A)
- unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
- unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
-
- assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
- (AS1 == A.End->getType()->getPointerAddressSpace()) &&
- "Trying to bounds check pointers with different address spaces");
-
- Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
- Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
-
- Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
- Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
- Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
- Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
-
- // [A|B].Start points to the first accessed byte under base [A|B].
- // [A|B].End points to the last accessed byte, plus one.
- // There is no conflict when the intervals are disjoint:
- // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
- //
- // bound0 = (B.Start < A.End)
- // bound1 = (A.Start < B.End)
- // IsConflict = bound0 & bound1
- Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
- FirstInst = getFirstInst(FirstInst, Cmp0, Loc);
- Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
- FirstInst = getFirstInst(FirstInst, Cmp1, Loc);
- Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
- FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
- if (MemoryRuntimeCheck) {
- IsConflict =
- ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
- FirstInst = getFirstInst(FirstInst, IsConflict, Loc);
- }
- MemoryRuntimeCheck = IsConflict;
- }
-
- if (!MemoryRuntimeCheck)
- return std::make_pair(nullptr, nullptr);
-
- // We have to do this trickery because the IRBuilder might fold the check to a
- // constant expression in which case there is no Instruction anchored in a
- // the block.
- Instruction *Check = BinaryOperator::CreateAnd(MemoryRuntimeCheck,
- ConstantInt::getTrue(Ctx));
- ChkBuilder.Insert(Check, "memcheck.conflict");
- FirstInst = getFirstInst(FirstInst, Check, Loc);
- return std::make_pair(FirstInst, Check);
-}
-
-std::pair<Instruction *, Instruction *>
-LoopAccessInfo::addRuntimeChecks(Instruction *Loc) const {
- if (!PtrRtChecking->Need)
- return std::make_pair(nullptr, nullptr);
-
- return addRuntimeChecks(Loc, PtrRtChecking->getChecks());
-}
-
void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
Value *Ptr = nullptr;
if (LoadInst *LI = dyn_cast<LoadInst>(MemAccess))
@@ -2343,7 +2209,7 @@ void LoopAccessInfo::collectStridedAccess(Value *MemAccess) {
}
LoopAccessInfo::LoopAccessInfo(Loop *L, ScalarEvolution *SE,
- const TargetLibraryInfo *TLI, AliasAnalysis *AA,
+ const TargetLibraryInfo *TLI, AAResults *AA,
DominatorTree *DT, LoopInfo *LI)
: PSE(std::make_unique<PredicatedScalarEvolution>(*SE, *L)),
PtrRtChecking(std::make_unique<RuntimePointerChecking>(SE)),
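For reference, the addRuntimeChecks code removed above spells out what each emitted runtime memcheck computes: for every pair of pointer groups, a conflict test start(A) < end(B) && start(B) < end(A), with all pair results OR-reduced into a single guard. A standalone sketch of that predicate and reduction, assuming each group has already been summarized as a half-open byte range (plain C++, not LLVM code):

// Editorial sketch, not part of the diff: pairwise runtime overlap checks.
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

struct PtrGroup {
  uintptr_t Start; // first accessed byte
  uintptr_t End;   // one past the last accessed byte
};

// bound0 = (B.Start < A.End), bound1 = (A.Start < B.End), conflict = bound0 & bound1.
static bool conflict(const PtrGroup &A, const PtrGroup &B) {
  return B.Start < A.End && A.Start < B.End;
}

// OR-reduce all pairwise checks, like the "conflict.rdx" chain in the removed code.
static bool anyConflict(const std::vector<std::pair<PtrGroup, PtrGroup>> &Checks) {
  bool Found = false;
  for (const auto &C : Checks)
    Found |= conflict(C.first, C.second);
  return Found;
}

int main() {
  PtrGroup A{0x1000, 0x1100}, B{0x1080, 0x1200}, C{0x2000, 0x2040};
  std::cout << conflict(A, B) << "\n";                // 1: ranges overlap
  std::cout << conflict(A, C) << "\n";                // 0: disjoint
  std::cout << anyConflict({{A, C}, {A, B}}) << "\n"; // 1: at least one pair conflicts
}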
diff --git a/llvm/lib/Analysis/LoopAnalysisManager.cpp b/llvm/lib/Analysis/LoopAnalysisManager.cpp
index 02d40fb8d72a..21017c04da99 100644
--- a/llvm/lib/Analysis/LoopAnalysisManager.cpp
+++ b/llvm/lib/Analysis/LoopAnalysisManager.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/PassManagerImpl.h"
using namespace llvm;
diff --git a/llvm/lib/Analysis/LoopCacheAnalysis.cpp b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
index 25325ec1be02..6ba247a87c22 100644
--- a/llvm/lib/Analysis/LoopCacheAnalysis.cpp
+++ b/llvm/lib/Analysis/LoopCacheAnalysis.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/BreadthFirstIterator.h"
#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -64,10 +65,10 @@ static Loop *getInnerMostLoop(const LoopVectorTy &Loops) {
return LastLoop;
}
- return (std::is_sorted(Loops.begin(), Loops.end(),
- [](const Loop *L1, const Loop *L2) {
- return L1->getLoopDepth() < L2->getLoopDepth();
- }))
+ return (llvm::is_sorted(Loops,
+ [](const Loop *L1, const Loop *L2) {
+ return L1->getLoopDepth() < L2->getLoopDepth();
+ }))
? LastLoop
: nullptr;
}
@@ -90,7 +91,11 @@ static bool isOneDimensionalArray(const SCEV &AccessFn, const SCEV &ElemSize,
if (!SE.isLoopInvariant(Start, &L) || !SE.isLoopInvariant(Step, &L))
return false;
- return AR->getStepRecurrence(SE) == &ElemSize;
+ const SCEV *StepRec = AR->getStepRecurrence(SE);
+ if (StepRec && SE.isKnownNegative(StepRec))
+ StepRec = SE.getNegativeSCEV(StepRec);
+
+ return StepRec == &ElemSize;
}
/// Compute the trip count for the given loop \p L. Return the SCEV expression
@@ -285,10 +290,13 @@ CacheCostTy IndexedReference::computeRefCost(const Loop &L,
const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
Type *WiderType = SE.getWiderType(Stride->getType(), TripCount->getType());
- Stride = SE.getNoopOrSignExtend(Stride, WiderType);
+ if (SE.isKnownNegative(Stride))
+ Stride = SE.getNegativeSCEV(Stride);
+ Stride = SE.getNoopOrAnyExtend(Stride, WiderType);
TripCount = SE.getNoopOrAnyExtend(TripCount, WiderType);
const SCEV *Numerator = SE.getMulExpr(Stride, TripCount);
RefCost = SE.getUDivExpr(Numerator, CacheLineSize);
+
LLVM_DEBUG(dbgs().indent(4)
<< "Access is consecutive: RefCost=(TripCount*Stride)/CLS="
<< *RefCost << "\n");
@@ -349,6 +357,19 @@ bool IndexedReference::delinearize(const LoopInfo &LI) {
return false;
}
+ // The array may be accessed in reverse, for example:
+ // for (i = N; i > 0; i--)
+ // A[i] = 0;
+ // In this case, reconstruct the access function using the absolute value
+ // of the step recurrence.
+ const SCEVAddRecExpr *AccessFnAR = dyn_cast<SCEVAddRecExpr>(AccessFn);
+ const SCEV *StepRec = AccessFnAR ? AccessFnAR->getStepRecurrence(SE) : nullptr;
+
+ if (StepRec && SE.isKnownNegative(StepRec))
+ AccessFn = SE.getAddRecExpr(AccessFnAR->getStart(),
+ SE.getNegativeSCEV(StepRec),
+ AccessFnAR->getLoop(),
+ AccessFnAR->getNoWrapFlags());
const SCEV *Div = SE.getUDivExactExpr(AccessFn, ElemSize);
Subscripts.push_back(Div);
Sizes.push_back(ElemSize);
@@ -396,6 +417,7 @@ bool IndexedReference::isConsecutive(const Loop &L, unsigned CLS) const {
const SCEV *Stride = SE.getMulExpr(Coeff, ElemSize);
const SCEV *CacheLineSize = SE.getConstant(Stride->getType(), CLS);
+ Stride = SE.isKnownNegative(Stride) ? SE.getNegativeSCEV(Stride) : Stride;
return SE.isKnownPredicate(ICmpInst::ICMP_ULT, Stride, CacheLineSize);
}
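With the negative-stride handling added in this file, the consecutive-access cost effectively becomes (TripCount * |Stride|) / CacheLineSize. A standalone sketch of that formula (plain C++, not LLVM code; the byte stride and cache line size below are illustrative inputs):

// Editorial sketch, not part of the diff: consecutive-access reference cost.
#include <cstdint>
#include <cstdlib>
#include <iostream>

static uint64_t consecutiveRefCost(uint64_t TripCount, int64_t StrideBytes,
                                   uint64_t CacheLineSize) {
  // Reverse (negative-stride) accesses touch the same number of cache lines,
  // so use the absolute value of the stride, as the patch now does.
  uint64_t AbsStride = static_cast<uint64_t>(std::llabs(StrideBytes));
  return (TripCount * AbsStride) / CacheLineSize;
}

int main() {
  // for (i = N; i > 0; i--) A[i] = 0;  with 4-byte elements and 64-byte lines.
  uint64_t N = 1024;
  std::cout << consecutiveRefCost(N, -4, 64) << " cache lines touched\n"; // 64
}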
@@ -537,6 +559,18 @@ bool CacheCost::populateReferenceGroups(ReferenceGroupsTy &RefGroups) const {
dbgs().indent(2) << Representative << "\n";
});
+
+ // FIXME: Both positive and negative access functions will be placed
+ // into the same reference group, resulting in a bi-directional array
+ // access such as:
+ // for (i = N; i > 0; i--)
+ // A[i] = A[N - i];
+  //       having the same cost calculation as a single dimension access pattern
+ // for (i = 0; i < N; i++)
+ // A[i] = A[i];
+ // when in actuality, depending on the array size, the first example
+  //       should have a cost closer to 2x the second due to the two cache
+  //       accesses per iteration from opposite ends of the array.
Optional<bool> HasTemporalReuse =
R->hasTemporalReuse(Representative, *TRT, *InnerMostLoop, DI, AA);
Optional<bool> HasSpacialReuse =
diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 3dc29b40834c..b5af210f1b92 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -420,7 +420,7 @@ bool Loop::isCanonical(ScalarEvolution &SE) const {
// Check that 'BB' doesn't have any uses outside of the 'L'
static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB,
- DominatorTree &DT) {
+ const DominatorTree &DT) {
for (const Instruction &I : BB) {
// Tokens can't be used in PHI nodes and live-out tokens prevent loop
// optimizations, so for the purposes of considered LCSSA form, we
@@ -446,14 +446,15 @@ static bool isBlockInLCSSAForm(const Loop &L, const BasicBlock &BB,
return true;
}
-bool Loop::isLCSSAForm(DominatorTree &DT) const {
+bool Loop::isLCSSAForm(const DominatorTree &DT) const {
// For each block we check that it doesn't have any uses outside of this loop.
return all_of(this->blocks(), [&](const BasicBlock *BB) {
return isBlockInLCSSAForm(*this, *BB, DT);
});
}
-bool Loop::isRecursivelyLCSSAForm(DominatorTree &DT, const LoopInfo &LI) const {
+bool Loop::isRecursivelyLCSSAForm(const DominatorTree &DT,
+ const LoopInfo &LI) const {
// For each block we check that it doesn't have any uses outside of its
// innermost loop. This process will transitively guarantee that the current
// loop and all of the nested loops are in LCSSA form.
@@ -480,8 +481,8 @@ bool Loop::isSafeToClone() const {
return false;
for (Instruction &I : *BB)
- if (auto CS = CallSite(&I))
- if (CS.cannotDuplicate())
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->cannotDuplicate())
return false;
}
return true;
diff --git a/llvm/lib/Analysis/LoopNestAnalysis.cpp b/llvm/lib/Analysis/LoopNestAnalysis.cpp
new file mode 100644
index 000000000000..61e53de93151
--- /dev/null
+++ b/llvm/lib/Analysis/LoopNestAnalysis.cpp
@@ -0,0 +1,296 @@
+//===- LoopNestAnalysis.cpp - Loop Nest Analysis --------------------------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The implementation for the loop nest analysis.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopNestAnalysis.h"
+#include "llvm/ADT/BreadthFirstIterator.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/PostDominators.h"
+#include "llvm/Analysis/ValueTracking.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "loopnest"
+#ifndef NDEBUG
+static const char *VerboseDebug = DEBUG_TYPE "-verbose";
+#endif
+
+/// Determine whether the loops structure violates basic requirements for
+/// perfect nesting:
+/// - the inner loop should be the outer loop's only child
+/// - the outer loop header should 'flow' into the inner loop preheader
+/// or jump around the inner loop to the outer loop latch
+/// - if the inner loop latch exits the inner loop, it should 'flow' into
+/// the outer loop latch.
+/// Returns true if the loop structure satisfies the basic requirements and
+/// false otherwise.
+static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop,
+ ScalarEvolution &SE);
+
+//===----------------------------------------------------------------------===//
+// LoopNest implementation
+//
+
+LoopNest::LoopNest(Loop &Root, ScalarEvolution &SE)
+ : MaxPerfectDepth(getMaxPerfectDepth(Root, SE)) {
+ for (Loop *L : breadth_first(&Root))
+ Loops.push_back(L);
+}
+
+std::unique_ptr<LoopNest> LoopNest::getLoopNest(Loop &Root,
+ ScalarEvolution &SE) {
+ return std::make_unique<LoopNest>(Root, SE);
+}
+
+bool LoopNest::arePerfectlyNested(const Loop &OuterLoop, const Loop &InnerLoop,
+ ScalarEvolution &SE) {
+ assert(!OuterLoop.getSubLoops().empty() && "Outer loop should have subloops");
+ assert(InnerLoop.getParentLoop() && "Inner loop should have a parent");
+ LLVM_DEBUG(dbgs() << "Checking whether loop '" << OuterLoop.getName()
+ << "' and '" << InnerLoop.getName()
+ << "' are perfectly nested.\n");
+
+ // Determine whether the loops structure satisfies the following requirements:
+ // - the inner loop should be the outer loop's only child
+ // - the outer loop header should 'flow' into the inner loop preheader
+ // or jump around the inner loop to the outer loop latch
+ // - if the inner loop latch exits the inner loop, it should 'flow' into
+ // the outer loop latch.
+ if (!checkLoopsStructure(OuterLoop, InnerLoop, SE)) {
+ LLVM_DEBUG(dbgs() << "Not perfectly nested: invalid loop structure.\n");
+ return false;
+ }
+
+ // Bail out if we cannot retrieve the outer loop bounds.
+ auto OuterLoopLB = OuterLoop.getBounds(SE);
+ if (OuterLoopLB == None) {
+ LLVM_DEBUG(dbgs() << "Cannot compute loop bounds of OuterLoop: "
+ << OuterLoop << "\n";);
+ return false;
+ }
+
+ // Identify the outer loop latch comparison instruction.
+ const BasicBlock *Latch = OuterLoop.getLoopLatch();
+ assert(Latch && "Expecting a valid loop latch");
+ const BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
+ assert(BI && BI->isConditional() &&
+ "Expecting loop latch terminator to be a branch instruction");
+
+ const CmpInst *OuterLoopLatchCmp = dyn_cast<CmpInst>(BI->getCondition());
+ DEBUG_WITH_TYPE(
+ VerboseDebug, if (OuterLoopLatchCmp) {
+ dbgs() << "Outer loop latch compare instruction: " << *OuterLoopLatchCmp
+ << "\n";
+ });
+
+ // Identify the inner loop guard instruction.
+ BranchInst *InnerGuard = InnerLoop.getLoopGuardBranch();
+ const CmpInst *InnerLoopGuardCmp =
+ (InnerGuard) ? dyn_cast<CmpInst>(InnerGuard->getCondition()) : nullptr;
+
+ DEBUG_WITH_TYPE(
+ VerboseDebug, if (InnerLoopGuardCmp) {
+ dbgs() << "Inner loop guard compare instruction: " << *InnerLoopGuardCmp
+ << "\n";
+ });
+
+ // Determine whether instructions in a basic block are one of:
+ // - the inner loop guard comparison
+ // - the outer loop latch comparison
+ // - the outer loop induction variable increment
+ // - a phi node, a cast or a branch
+ auto containsOnlySafeInstructions = [&](const BasicBlock &BB) {
+ return llvm::all_of(BB, [&](const Instruction &I) {
+ bool isAllowed = isSafeToSpeculativelyExecute(&I) || isa<PHINode>(I) ||
+ isa<BranchInst>(I);
+ if (!isAllowed) {
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "Instruction: " << I << "\nin basic block: " << BB
+ << " is considered unsafe.\n";
+ });
+ return false;
+ }
+
+ // The only binary instruction allowed is the outer loop step instruction,
+ // the only comparison instructions allowed are the inner loop guard
+ // compare instruction and the outer loop latch compare instruction.
+ if ((isa<BinaryOperator>(I) && &I != &OuterLoopLB->getStepInst()) ||
+ (isa<CmpInst>(I) && &I != OuterLoopLatchCmp &&
+ &I != InnerLoopGuardCmp)) {
+ DEBUG_WITH_TYPE(VerboseDebug, {
+          dbgs() << "Instruction: " << I << "\nin basic block: " << BB
+                 << " is unsafe.\n";
+ });
+ return false;
+ }
+ return true;
+ });
+ };
+
+ // Check the code surrounding the inner loop for instructions that are deemed
+ // unsafe.
+ const BasicBlock *OuterLoopHeader = OuterLoop.getHeader();
+ const BasicBlock *OuterLoopLatch = OuterLoop.getLoopLatch();
+ const BasicBlock *InnerLoopPreHeader = InnerLoop.getLoopPreheader();
+
+ if (!containsOnlySafeInstructions(*OuterLoopHeader) ||
+ !containsOnlySafeInstructions(*OuterLoopLatch) ||
+ (InnerLoopPreHeader != OuterLoopHeader &&
+ !containsOnlySafeInstructions(*InnerLoopPreHeader)) ||
+ !containsOnlySafeInstructions(*InnerLoop.getExitBlock())) {
+ LLVM_DEBUG(dbgs() << "Not perfectly nested: code surrounding inner loop is "
+ "unsafe\n";);
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Loop '" << OuterLoop.getName() << "' and '"
+ << InnerLoop.getName() << "' are perfectly nested.\n");
+
+ return true;
+}
+
+SmallVector<LoopVectorTy, 4>
+LoopNest::getPerfectLoops(ScalarEvolution &SE) const {
+ SmallVector<LoopVectorTy, 4> LV;
+ LoopVectorTy PerfectNest;
+
+ for (Loop *L : depth_first(const_cast<Loop *>(Loops.front()))) {
+ if (PerfectNest.empty())
+ PerfectNest.push_back(L);
+
+ auto &SubLoops = L->getSubLoops();
+ if (SubLoops.size() == 1 && arePerfectlyNested(*L, *SubLoops.front(), SE)) {
+ PerfectNest.push_back(SubLoops.front());
+ } else {
+ LV.push_back(PerfectNest);
+ PerfectNest.clear();
+ }
+ }
+
+ return LV;
+}
+
+unsigned LoopNest::getMaxPerfectDepth(const Loop &Root, ScalarEvolution &SE) {
+ LLVM_DEBUG(dbgs() << "Get maximum perfect depth of loop nest rooted by loop '"
+ << Root.getName() << "'\n");
+
+ const Loop *CurrentLoop = &Root;
+ const auto *SubLoops = &CurrentLoop->getSubLoops();
+ unsigned CurrentDepth = 1;
+
+ while (SubLoops->size() == 1) {
+ const Loop *InnerLoop = SubLoops->front();
+ if (!arePerfectlyNested(*CurrentLoop, *InnerLoop, SE)) {
+ LLVM_DEBUG({
+ dbgs() << "Not a perfect nest: loop '" << CurrentLoop->getName()
+ << "' is not perfectly nested with loop '"
+ << InnerLoop->getName() << "'\n";
+ });
+ break;
+ }
+
+ CurrentLoop = InnerLoop;
+ SubLoops = &CurrentLoop->getSubLoops();
+ ++CurrentDepth;
+ }
+
+ return CurrentDepth;
+}
+
+static bool checkLoopsStructure(const Loop &OuterLoop, const Loop &InnerLoop,
+ ScalarEvolution &SE) {
+  // The inner loop must be the outer loop's only child.
+ if ((OuterLoop.getSubLoops().size() != 1) ||
+ (InnerLoop.getParentLoop() != &OuterLoop))
+ return false;
+
+ // We expect loops in normal form which have a preheader, header, latch...
+ if (!OuterLoop.isLoopSimplifyForm() || !InnerLoop.isLoopSimplifyForm())
+ return false;
+
+ const BasicBlock *OuterLoopHeader = OuterLoop.getHeader();
+ const BasicBlock *OuterLoopLatch = OuterLoop.getLoopLatch();
+ const BasicBlock *InnerLoopPreHeader = InnerLoop.getLoopPreheader();
+ const BasicBlock *InnerLoopLatch = InnerLoop.getLoopLatch();
+ const BasicBlock *InnerLoopExit = InnerLoop.getExitBlock();
+
+ // We expect rotated loops. The inner loop should have a single exit block.
+ if (OuterLoop.getExitingBlock() != OuterLoopLatch ||
+ InnerLoop.getExitingBlock() != InnerLoopLatch || !InnerLoopExit)
+ return false;
+
+ // Ensure the only branch that may exist between the loops is the inner loop
+ // guard.
+ if (OuterLoopHeader != InnerLoopPreHeader) {
+ const BranchInst *BI =
+ dyn_cast<BranchInst>(OuterLoopHeader->getTerminator());
+
+ if (!BI || BI != InnerLoop.getLoopGuardBranch())
+ return false;
+
+ // The successors of the inner loop guard should be the inner loop
+ // preheader and the outer loop latch.
+ for (const BasicBlock *Succ : BI->successors()) {
+ if (Succ == InnerLoopPreHeader)
+ continue;
+ if (Succ == OuterLoopLatch)
+ continue;
+
+ DEBUG_WITH_TYPE(VerboseDebug, {
+ dbgs() << "Inner loop guard successor " << Succ->getName()
+ << " doesn't lead to inner loop preheader or "
+ "outer loop latch.\n";
+ });
+ return false;
+ }
+ }
+
+ // Ensure the inner loop exit block leads to the outer loop latch.
+ if (InnerLoopExit->getSingleSuccessor() != OuterLoopLatch) {
+ DEBUG_WITH_TYPE(
+ VerboseDebug,
+ dbgs() << "Inner loop exit block " << *InnerLoopExit
+ << " does not directly lead to the outer loop latch.\n";);
+ return false;
+ }
+
+ return true;
+}
+
+raw_ostream &llvm::operator<<(raw_ostream &OS, const LoopNest &LN) {
+ OS << "IsPerfect=";
+ if (LN.getMaxPerfectDepth() == LN.getNestDepth())
+ OS << "true";
+ else
+ OS << "false";
+ OS << ", Depth=" << LN.getNestDepth();
+ OS << ", OutermostLoop: " << LN.getOutermostLoop().getName();
+ OS << ", Loops: ( ";
+ for (const Loop *L : LN.getLoops())
+ OS << L->getName() << " ";
+ OS << ")";
+
+ return OS;
+}
+
+//===----------------------------------------------------------------------===//
+// LoopNestPrinterPass implementation
+//
+
+PreservedAnalyses LoopNestPrinterPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ if (auto LN = LoopNest::getLoopNest(L, AR.SE))
+ OS << *LN << "\n";
+
+ return PreservedAnalyses::all();
+}
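LoopNest::getMaxPerfectDepth above walks down the nest as long as each loop has exactly one subloop that is perfectly nested within it. A standalone sketch of that walk, using a toy loop type and a caller-supplied predicate standing in for arePerfectlyNested (plain C++, not LLVM code):

// Editorial sketch, not part of the diff: the maximum-perfect-depth walk.
#include <functional>
#include <iostream>
#include <vector>

struct ToyLoop {
  std::vector<ToyLoop *> SubLoops;
};

static unsigned maxPerfectDepth(
    const ToyLoop &Root,
    const std::function<bool(const ToyLoop &, const ToyLoop &)> &PerfectlyNested) {
  const ToyLoop *Current = &Root;
  unsigned Depth = 1;
  // Descend only while the nest remains a single chain of perfectly nested loops.
  while (Current->SubLoops.size() == 1 &&
         PerfectlyNested(*Current, *Current->SubLoops.front())) {
    Current = Current->SubLoops.front();
    ++Depth;
  }
  return Depth;
}

int main() {
  ToyLoop Inner;
  ToyLoop Middle{{&Inner}};
  ToyLoop Outer{{&Middle}};
  auto AlwaysPerfect = [](const ToyLoop &, const ToyLoop &) { return true; };
  std::cout << maxPerfectDepth(Outer, AlwaysPerfect) << "\n"; // prints 3
}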
diff --git a/llvm/lib/Analysis/LoopPass.cpp b/llvm/lib/Analysis/LoopPass.cpp
index 507f5f442865..520f06003dd2 100644
--- a/llvm/lib/Analysis/LoopPass.cpp
+++ b/llvm/lib/Analysis/LoopPass.cpp
@@ -93,38 +93,6 @@ void LPPassManager::addLoop(Loop &L) {
}
}
-/// cloneBasicBlockSimpleAnalysis - Invoke cloneBasicBlockAnalysis hook for
-/// all loop passes.
-void LPPassManager::cloneBasicBlockSimpleAnalysis(BasicBlock *From,
- BasicBlock *To, Loop *L) {
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- LoopPass *LP = getContainedPass(Index);
- LP->cloneBasicBlockAnalysis(From, To, L);
- }
-}
-
-/// deleteSimpleAnalysisValue - Invoke deleteAnalysisValue hook for all passes.
-void LPPassManager::deleteSimpleAnalysisValue(Value *V, Loop *L) {
- if (BasicBlock *BB = dyn_cast<BasicBlock>(V)) {
- for (Instruction &I : *BB) {
- deleteSimpleAnalysisValue(&I, L);
- }
- }
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- LoopPass *LP = getContainedPass(Index);
- LP->deleteAnalysisValue(V, L);
- }
-}
-
-/// Invoke deleteAnalysisLoop hook for all passes.
-void LPPassManager::deleteSimpleAnalysisLoop(Loop *L) {
- for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
- LoopPass *LP = getContainedPass(Index);
- LP->deleteAnalysisLoop(L);
- }
-}
-
-
// Recurse through all subloops and all loops into LQ.
static void addLoopIntoQueue(Loop *L, std::deque<Loop *> &LQ) {
LQ.push_back(L);
@@ -246,10 +214,7 @@ bool LPPassManager::runOnFunction(Function &F) {
: CurrentLoop->getName());
dumpPreservedSet(P);
- if (CurrentLoopDeleted) {
- // Notify passes that the loop is being deleted.
- deleteSimpleAnalysisLoop(CurrentLoop);
- } else {
+ if (!CurrentLoopDeleted) {
// Manually check that this loop is still healthy. This is done
// instead of relying on LoopInfo::verifyLoop since LoopInfo
// is a function pass and it's really expensive to verify every
diff --git a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
index 762623de41e9..b04cc46bd272 100644
--- a/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
+++ b/llvm/lib/Analysis/LoopUnrollAnalyzer.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/LoopUnrollAnalyzer.h"
+#include "llvm/Analysis/LoopInfo.h"
using namespace llvm;
diff --git a/llvm/lib/Analysis/MLInlineAdvisor.cpp b/llvm/lib/Analysis/MLInlineAdvisor.cpp
new file mode 100644
index 000000000000..45873f260f23
--- /dev/null
+++ b/llvm/lib/Analysis/MLInlineAdvisor.cpp
@@ -0,0 +1,301 @@
+//===- MLInlineAdvisor.cpp - machine learned InlineAdvisor ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the interface between the inliner and a learned model.
+// It delegates model evaluation to either the AOT compiled model (the
+// 'release' mode) or a runtime-loaded model (the 'development' case).
+//
+//===----------------------------------------------------------------------===//
+#include <limits>
+#include <unordered_map>
+#include <unordered_set>
+
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/InlineCost.h"
+#include "llvm/Analysis/InlineFeaturesAnalysis.h"
+#include "llvm/Analysis/MLInlineAdvisor.h"
+#include "llvm/Analysis/MLModelRunner.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PassManager.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Path.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "inline-ml"
+
+static cl::opt<float> SizeIncreaseThreshold(
+ "ml-advisor-size-increase-threshold", cl::Hidden,
+ cl::desc("Maximum factor by which expected native size may increase before "
+ "blocking any further inlining."),
+ cl::init(2.0));
+
+const std::array<std::string, NumberOfFeatures> llvm::FeatureNameMap{
+#define POPULATE_NAMES(INDEX_NAME, NAME, COMMENT) NAME,
+ INLINE_FEATURE_ITERATOR(POPULATE_NAMES)
+#undef POPULATE_NAMES
+};
+
+const char *const llvm::DecisionName = "inlining_decision";
+const char *const llvm::DefaultDecisionName = "inlining_default";
+const char *const llvm::RewardName = "delta_size";
+
+CallBase *getInlinableCS(Instruction &I) {
+ if (auto *CS = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CS->getCalledFunction()) {
+ if (!Callee->isDeclaration()) {
+ return CS;
+ }
+ }
+ return nullptr;
+}
+
+MLInlineAdvisor::MLInlineAdvisor(Module &M, ModuleAnalysisManager &MAM,
+ std::unique_ptr<MLModelRunner> Runner)
+ : InlineAdvisor(
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager()),
+ M(M), ModelRunner(std::move(Runner)), CG(new CallGraph(M)),
+ InitialIRSize(getModuleIRSize()), CurrentIRSize(InitialIRSize) {
+ assert(ModelRunner);
+
+ // Extract the 'call site height' feature - the position of a call site
+ // relative to the farthest statically reachable SCC node. We don't mutate
+ // this value while inlining happens. Empirically, this feature proved
+ // critical in behavioral cloning - i.e. training a model to mimic the manual
+ // heuristic's decisions - and, thus, equally important for training for
+ // improvement.
+ for (auto I = scc_begin(CG.get()); !I.isAtEnd(); ++I) {
+ const std::vector<CallGraphNode *> &CGNodes = *I;
+ unsigned Level = 0;
+ for (auto *CGNode : CGNodes) {
+ Function *F = CGNode->getFunction();
+ if (!F || F->isDeclaration())
+ continue;
+ for (auto &I : instructions(F)) {
+ if (auto *CS = getInlinableCS(I)) {
+ auto *Called = CS->getCalledFunction();
+ auto Pos = FunctionLevels.find(Called);
+          // In bottom-up traversal, an inlinable callee is either in the
+          // same SCC or in an already visited SCC. So not finding its
+ // level means we haven't visited it yet, meaning it's in this SCC.
+ if (Pos == FunctionLevels.end())
+ continue;
+ Level = std::max(Level, Pos->second + 1);
+ }
+ }
+ }
+ for (auto *CGNode : CGNodes) {
+ Function *F = CGNode->getFunction();
+ if (F && !F->isDeclaration())
+ FunctionLevels[F] = Level;
+ }
+ }
+}
+
+void MLInlineAdvisor::onPassEntry() {
+ // Function passes executed between InlinerPass runs may have changed the
+ // module-wide features.
+ NodeCount = 0;
+ EdgeCount = 0;
+ for (auto &F : M)
+ if (!F.isDeclaration()) {
+ ++NodeCount;
+ EdgeCount += getLocalCalls(F);
+ }
+}
+
+int64_t MLInlineAdvisor::getLocalCalls(Function &F) {
+ return FAM.getResult<InlineFeaturesAnalysis>(F).DirectCallsToDefinedFunctions;
+}
+
+// Update the internal state of the advisor, and force-invalidate the feature
+// analysis. Currently, we maintain minimal (and very simple) global state: the
+// number of functions and the number of static calls. We also keep track of the
+// total IR size in this module, to stop misbehaving policies at a certain bloat
+// factor (SizeIncreaseThreshold).
+void MLInlineAdvisor::onSuccessfulInlining(const MLInlineAdvice &Advice,
+ bool CalleeWasDeleted) {
+ assert(!ForceStop);
+ Function *Caller = Advice.getCaller();
+ Function *Callee = Advice.getCallee();
+
+ // The caller features aren't valid anymore.
+ FAM.invalidate<InlineFeaturesAnalysis>(*Caller);
+ int64_t IRSizeAfter =
+ getIRSize(*Caller) + (CalleeWasDeleted ? 0 : Advice.CalleeIRSize);
+ CurrentIRSize += IRSizeAfter - (Advice.CallerIRSize + Advice.CalleeIRSize);
+ if (CurrentIRSize > SizeIncreaseThreshold * InitialIRSize)
+ ForceStop = true;
+
+ // We can delta-update module-wide features. We know the inlining only changed
+ // the caller, and maybe the callee (by deleting the latter).
+ // Nodes are simple to update.
+ // For edges, we 'forget' the edges that the caller and callee used to have
+ // before inlining, and add back what they currently have together.
+ int64_t NewCallerAndCalleeEdges =
+ FAM.getResult<InlineFeaturesAnalysis>(*Caller)
+ .DirectCallsToDefinedFunctions;
+
+ if (CalleeWasDeleted)
+ --NodeCount;
+ else
+ NewCallerAndCalleeEdges += FAM.getResult<InlineFeaturesAnalysis>(*Callee)
+ .DirectCallsToDefinedFunctions;
+ EdgeCount += (NewCallerAndCalleeEdges - Advice.CallerAndCalleeEdges);
+ assert(CurrentIRSize >= 0 && EdgeCount >= 0 && NodeCount >= 0);
+}
+
+int64_t MLInlineAdvisor::getModuleIRSize() const {
+ int64_t Ret = 0;
+ for (auto &F : CG->getModule())
+ if (!F.isDeclaration())
+ Ret += getIRSize(F);
+ return Ret;
+}
+
+std::unique_ptr<InlineAdvice> MLInlineAdvisor::getAdvice(CallBase &CB) {
+ auto &Caller = *CB.getCaller();
+ auto &Callee = *CB.getCalledFunction();
+
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
+ return FAM.getResult<AssumptionAnalysis>(F);
+ };
+ auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ auto &TIR = FAM.getResult<TargetIRAnalysis>(Callee);
+ auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(Caller);
+
+ auto TrivialDecision =
+ llvm::getAttributeBasedInliningDecision(CB, &Callee, TIR, GetTLI);
+
+ // If this is a "never inline" case, there won't be any changes to internal
+ // state we need to track, so we can just return the base InlineAdvice, which
+ // will do nothing interesting.
+ // Same thing if this is a recursive case.
+ if ((TrivialDecision.hasValue() && !TrivialDecision->isSuccess()) ||
+ &Caller == &Callee)
+ return std::make_unique<InlineAdvice>(this, CB, ORE, false);
+
+ bool Mandatory = TrivialDecision.hasValue() && TrivialDecision->isSuccess();
+
+ // If we need to stop, we no longer want to track any state changes, so
+ // we just return the base InlineAdvice, which acts as a no-op.
+ if (ForceStop) {
+ ORE.emit([&] {
+ return OptimizationRemarkMissed(DEBUG_TYPE, "ForceStop", &CB)
+ << "Won't attempt inlining because module size grew too much.";
+ });
+ return std::make_unique<InlineAdvice>(this, CB, ORE, Mandatory);
+ }
+
+ int CostEstimate = 0;
+ if (!Mandatory) {
+ auto IsCallSiteInlinable =
+ llvm::getInliningCostEstimate(CB, TIR, GetAssumptionCache);
+ if (!IsCallSiteInlinable) {
+ // We can't inline this for correctness reasons, so return the base
+ // InlineAdvice, as we don't care about tracking any state changes (which
+ // won't happen).
+ return std::make_unique<InlineAdvice>(this, CB, ORE, false);
+ }
+ CostEstimate = *IsCallSiteInlinable;
+ }
+
+ if (Mandatory)
+ return getMandatoryAdvice(CB, ORE);
+
+ auto NrCtantParams = 0;
+ for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
+ NrCtantParams += (isa<Constant>(*I));
+ }
+
+ auto &CallerBefore = FAM.getResult<InlineFeaturesAnalysis>(Caller);
+ auto &CalleeBefore = FAM.getResult<InlineFeaturesAnalysis>(Callee);
+
+ ModelRunner->setFeature(FeatureIndex::CalleeBasicBlockCount,
+ CalleeBefore.BasicBlockCount);
+ ModelRunner->setFeature(FeatureIndex::CallSiteHeight,
+ FunctionLevels[&Caller]);
+ ModelRunner->setFeature(FeatureIndex::NodeCount, NodeCount);
+ ModelRunner->setFeature(FeatureIndex::NrCtantParams, NrCtantParams);
+ ModelRunner->setFeature(FeatureIndex::CostEstimate, CostEstimate);
+ ModelRunner->setFeature(FeatureIndex::EdgeCount, EdgeCount);
+ ModelRunner->setFeature(FeatureIndex::CallerUsers, CallerBefore.Uses);
+ ModelRunner->setFeature(FeatureIndex::CallerConditionallyExecutedBlocks,
+ CallerBefore.BlocksReachedFromConditionalInstruction);
+ ModelRunner->setFeature(FeatureIndex::CallerBasicBlockCount,
+ CallerBefore.BasicBlockCount);
+ ModelRunner->setFeature(FeatureIndex::CalleeConditionallyExecutedBlocks,
+ CalleeBefore.BlocksReachedFromConditionalInstruction);
+ ModelRunner->setFeature(FeatureIndex::CalleeUsers, CalleeBefore.Uses);
+ return getAdviceFromModel(CB, ORE);
+}
+
+std::unique_ptr<MLInlineAdvice>
+MLInlineAdvisor::getAdviceFromModel(CallBase &CB,
+ OptimizationRemarkEmitter &ORE) {
+ return std::make_unique<MLInlineAdvice>(this, CB, ORE, ModelRunner->run());
+}
+
+std::unique_ptr<MLInlineAdvice>
+MLInlineAdvisor::getMandatoryAdvice(CallBase &CB,
+ OptimizationRemarkEmitter &ORE) {
+ return std::make_unique<MLInlineAdvice>(this, CB, ORE, true);
+}
+
+void MLInlineAdvice::reportContextForRemark(
+ DiagnosticInfoOptimizationBase &OR) {
+ using namespace ore;
+ OR << NV("Callee", Callee->getName());
+ for (size_t I = 0; I < NumberOfFeatures; ++I)
+ OR << NV(FeatureNameMap[I], getAdvisor()->getModelRunner().getFeature(I));
+ OR << NV("ShouldInline", isInliningRecommended());
+}
+
+void MLInlineAdvice::recordInliningImpl() {
+ ORE.emit([&]() {
+ OptimizationRemark R(DEBUG_TYPE, "InliningSuccess", DLoc, Block);
+ reportContextForRemark(R);
+ return R;
+ });
+ getAdvisor()->onSuccessfulInlining(*this, /*CalleeWasDeleted*/ false);
+}
+
+void MLInlineAdvice::recordInliningWithCalleeDeletedImpl() {
+ ORE.emit([&]() {
+ OptimizationRemark R(DEBUG_TYPE, "InliningSuccessWithCalleeDeleted", DLoc,
+ Block);
+ reportContextForRemark(R);
+ return R;
+ });
+ getAdvisor()->onSuccessfulInlining(*this, /*CalleeWasDeleted*/ true);
+}
+
+void MLInlineAdvice::recordUnsuccessfulInliningImpl(
+ const InlineResult &Result) {
+ ORE.emit([&]() {
+ OptimizationRemarkMissed R(DEBUG_TYPE, "InliningAttemptedAndUnsuccessful",
+ DLoc, Block);
+ reportContextForRemark(R);
+ return R;
+ });
+}
+void MLInlineAdvice::recordUnattemptedInliningImpl() {
+ ORE.emit([&]() {
+ OptimizationRemarkMissed R(DEBUG_TYPE, "IniningNotAttempted", DLoc, Block);
+ reportContextForRemark(R);
+ return R;
+ });
+}
\ No newline at end of file
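
The constructor above assigns each function a "call site height" by walking SCCs bottom-up and taking one plus the maximum level of any callee whose level is already assigned. A minimal standalone sketch of that computation, with toy names instead of LLVM's CallGraph/SCC machinery:

// Sketch (toy types, not LLVM code): bottom-up SCC level assignment.
#include <algorithm>
#include <iostream>
#include <map>
#include <string>
#include <vector>

int main() {
  // Hypothetical call graph: function -> defined callees.
  std::map<std::string, std::vector<std::string>> Callees = {
      {"leaf", {}}, {"mid", {"leaf"}}, {"top", {"mid", "leaf"}}};
  // Bottom-up SCC order; here every SCC is a single function.
  std::vector<std::vector<std::string>> BottomUpSCCs = {
      {"leaf"}, {"mid"}, {"top"}};

  std::map<std::string, unsigned> FunctionLevels;
  for (const auto &SCC : BottomUpSCCs) {
    unsigned Level = 0;
    for (const auto &F : SCC)
      for (const auto &Callee : Callees[F]) {
        auto Pos = FunctionLevels.find(Callee);
        // A callee without a level is in this same SCC; skip it.
        if (Pos != FunctionLevels.end())
          Level = std::max(Level, Pos->second + 1);
      }
    for (const auto &F : SCC)
      FunctionLevels[F] = Level;
  }

  for (const auto &KV : FunctionLevels)
    std::cout << KV.first << " -> " << KV.second << "\n"; // leaf 0, mid 1, top 2
  return 0;
}
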
diff --git a/llvm/lib/Analysis/MemDepPrinter.cpp b/llvm/lib/Analysis/MemDepPrinter.cpp
index 2c57e63251c6..9524ec96bb61 100644
--- a/llvm/lib/Analysis/MemDepPrinter.cpp
+++ b/llvm/lib/Analysis/MemDepPrinter.cpp
@@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SetVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/Passes.h"
#include "llvm/IR/InstIterator.h"
@@ -17,6 +18,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
namespace {
diff --git a/llvm/lib/Analysis/MemDerefPrinter.cpp b/llvm/lib/Analysis/MemDerefPrinter.cpp
index 5d824067df53..564410b8af08 100644
--- a/llvm/lib/Analysis/MemDerefPrinter.cpp
+++ b/llvm/lib/Analysis/MemDerefPrinter.cpp
@@ -8,12 +8,13 @@
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/Passes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp
index 427e6fd3ace2..0b61b1c0eabd 100644
--- a/llvm/lib/Analysis/MemoryBuiltins.cpp
+++ b/llvm/lib/Analysis/MemoryBuiltins.cpp
@@ -52,11 +52,12 @@ using namespace llvm;
enum AllocType : uint8_t {
OpNewLike = 1<<0, // allocates; never returns null
MallocLike = 1<<1 | OpNewLike, // allocates; may return null
- CallocLike = 1<<2, // allocates + bzero
- ReallocLike = 1<<3, // reallocates
- StrDupLike = 1<<4,
- MallocOrCallocLike = MallocLike | CallocLike,
- AllocLike = MallocLike | CallocLike | StrDupLike,
+ AlignedAllocLike = 1<<2, // allocates with alignment; may return null
+ CallocLike = 1<<3, // allocates + bzero
+ ReallocLike = 1<<4, // reallocates
+ StrDupLike = 1<<5,
+ MallocOrCallocLike = MallocLike | CallocLike | AlignedAllocLike,
+ AllocLike = MallocOrCallocLike | StrDupLike,
AnyAlloc = AllocLike | ReallocLike
};
@@ -100,6 +101,7 @@ static const std::pair<LibFunc, AllocFnsTy> AllocationFnData[] = {
{LibFunc_msvc_new_array_int_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned int, nothrow)
{LibFunc_msvc_new_array_longlong, {OpNewLike, 1, 0, -1}}, // new[](unsigned long long)
{LibFunc_msvc_new_array_longlong_nothrow, {MallocLike, 2, 0, -1}}, // new[](unsigned long long, nothrow)
+ {LibFunc_aligned_alloc, {AlignedAllocLike, 2, 1, -1}},
{LibFunc_calloc, {CallocLike, 2, 0, 1}},
{LibFunc_realloc, {ReallocLike, 2, 1, -1}},
{LibFunc_reallocf, {ReallocLike, 2, 1, -1}},
@@ -117,13 +119,13 @@ static const Function *getCalledFunction(const Value *V, bool LookThroughBitCast
if (LookThroughBitCast)
V = V->stripPointerCasts();
- ImmutableCallSite CS(V);
- if (!CS.getInstruction())
+ const auto *CB = dyn_cast<CallBase>(V);
+ if (!CB)
return nullptr;
- IsNoBuiltin = CS.isNoBuiltin();
+ IsNoBuiltin = CB->isNoBuiltin();
- if (const Function *Callee = CS.getCalledFunction())
+ if (const Function *Callee = CB->getCalledFunction())
return Callee;
return nullptr;
}
@@ -225,8 +227,9 @@ static Optional<AllocFnsTy> getAllocationSize(const Value *V,
}
static bool hasNoAliasAttr(const Value *V, bool LookThroughBitCast) {
- ImmutableCallSite CS(LookThroughBitCast ? V->stripPointerCasts() : V);
- return CS && CS.hasRetAttr(Attribute::NoAlias);
+ const auto *CB =
+ dyn_cast<CallBase>(LookThroughBitCast ? V->stripPointerCasts() : V);
+ return CB && CB->hasRetAttr(Attribute::NoAlias);
}
/// Tests if a value is a call or invoke to a library function that
@@ -266,6 +269,20 @@ bool llvm::isMallocLikeFn(
}
/// Tests if a value is a call or invoke to a library function that
+/// allocates uninitialized memory with alignment (such as aligned_alloc).
+bool llvm::isAlignedAllocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, AlignedAllocLike, TLI, LookThroughBitCast)
+ .hasValue();
+}
+bool llvm::isAlignedAllocLikeFn(
+ const Value *V, function_ref<const TargetLibraryInfo &(Function &)> GetTLI,
+ bool LookThroughBitCast) {
+ return getAllocationData(V, AlignedAllocLike, GetTLI, LookThroughBitCast)
+ .hasValue();
+}
+
+/// Tests if a value is a call or invoke to a library function that
/// allocates zero-filled memory (such as calloc).
bool llvm::isCallocLikeFn(const Value *V, const TargetLibraryInfo *TLI,
bool LookThroughBitCast) {
@@ -439,7 +456,11 @@ bool llvm::isLibFreeFunction(const Function *F, const LibFunc TLIFn) {
TLIFn == LibFunc_msvc_delete_array_ptr64_nothrow) // delete[](void*, nothrow)
ExpectedNumParams = 2;
else if (TLIFn == LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t || // delete[](void*, align_val_t, nothrow)
- TLIFn == LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t) // delete[](void*, align_val_t, nothrow)
+ TLIFn == LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t || // delete(void*, align_val_t, nothrow)
+ TLIFn == LibFunc_ZdlPvjSt11align_val_t || // delete(void*, unsigned int, align_val_t)
+ TLIFn == LibFunc_ZdlPvmSt11align_val_t || // delete(void*, unsigned long, align_val_t)
+ TLIFn == LibFunc_ZdaPvjSt11align_val_t || // delete[](void*, unsigned int, align_val_t)
+ TLIFn == LibFunc_ZdaPvmSt11align_val_t) // delete[](void*, unsigned long, align_val_t)
ExpectedNumParams = 3;
else
return false;
@@ -633,6 +654,9 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
if (!I.getAllocatedType()->isSized())
return unknown();
+ if (isa<ScalableVectorType>(I.getAllocatedType()))
+ return unknown();
+
APInt Size(IntTyBits, DL.getTypeAllocSize(I.getAllocatedType()));
if (!I.isArrayAllocation())
return std::make_pair(align(Size, I.getAlignment()), Zero);
@@ -653,7 +677,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) {
SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
// No interprocedural analysis is done at the moment.
- if (!A.hasByValOrInAllocaAttr()) {
+ if (!A.hasPassPointeeByValueAttr()) {
++ObjectVisitorArgument;
return unknown();
}
@@ -662,21 +686,21 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitArgument(Argument &A) {
return std::make_pair(align(Size, A.getParamAlignment()), Zero);
}
-SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
- Optional<AllocFnsTy> FnData = getAllocationSize(CS.getInstruction(), TLI);
+SizeOffsetType ObjectSizeOffsetVisitor::visitCallBase(CallBase &CB) {
+ Optional<AllocFnsTy> FnData = getAllocationSize(&CB, TLI);
if (!FnData)
return unknown();
// Handle strdup-like functions separately.
if (FnData->AllocTy == StrDupLike) {
- APInt Size(IntTyBits, GetStringLength(CS.getArgument(0)));
+ APInt Size(IntTyBits, GetStringLength(CB.getArgOperand(0)));
if (!Size)
return unknown();
// Strndup limits strlen.
if (FnData->FstParam > 0) {
ConstantInt *Arg =
- dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+ dyn_cast<ConstantInt>(CB.getArgOperand(FnData->FstParam));
if (!Arg)
return unknown();
@@ -687,7 +711,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
return std::make_pair(Size, Zero);
}
- ConstantInt *Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->FstParam));
+ ConstantInt *Arg = dyn_cast<ConstantInt>(CB.getArgOperand(FnData->FstParam));
if (!Arg)
return unknown();
@@ -699,7 +723,7 @@ SizeOffsetType ObjectSizeOffsetVisitor::visitCallSite(CallSite CS) {
if (FnData->SndParam < 0)
return std::make_pair(Size, Zero);
- Arg = dyn_cast<ConstantInt>(CS.getArgument(FnData->SndParam));
+ Arg = dyn_cast<ConstantInt>(CB.getArgOperand(FnData->SndParam));
if (!Arg)
return unknown();
@@ -927,8 +951,8 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitAllocaInst(AllocaInst &I) {
return std::make_pair(Size, Zero);
}
-SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
- Optional<AllocFnsTy> FnData = getAllocationSize(CS.getInstruction(), TLI);
+SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallBase(CallBase &CB) {
+ Optional<AllocFnsTy> FnData = getAllocationSize(&CB, TLI);
if (!FnData)
return unknown();
@@ -938,12 +962,12 @@ SizeOffsetEvalType ObjectSizeOffsetEvaluator::visitCallSite(CallSite CS) {
return unknown();
}
- Value *FirstArg = CS.getArgument(FnData->FstParam);
+ Value *FirstArg = CB.getArgOperand(FnData->FstParam);
FirstArg = Builder.CreateZExtOrTrunc(FirstArg, IntTy);
if (FnData->SndParam < 0)
return std::make_pair(FirstArg, Zero);
- Value *SecondArg = CS.getArgument(FnData->SndParam);
+ Value *SecondArg = CB.getArgOperand(FnData->SndParam);
SecondArg = Builder.CreateZExtOrTrunc(SecondArg, IntTy);
Value *Size = Builder.CreateMul(FirstArg, SecondArg);
return std::make_pair(Size, Zero);
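
The renumbered AllocType bits above keep the composite masks meaningful: a single bitwise membership test now classifies aligned_alloc as both malloc-or-calloc-like and alloc-like. An illustrative standalone sketch of that test (mirroring, not reusing, the enum in the patch):

// Sketch: how the bitmask layout lets one test cover several allocator kinds.
#include <cstdint>
#include <cstdio>

enum AllocType : uint8_t {
  OpNewLike = 1 << 0,
  MallocLike = 1 << 1 | OpNewLike,
  AlignedAllocLike = 1 << 2,
  CallocLike = 1 << 3,
  ReallocLike = 1 << 4,
  StrDupLike = 1 << 5,
  MallocOrCallocLike = MallocLike | CallocLike | AlignedAllocLike,
  AllocLike = MallocOrCallocLike | StrDupLike,
  AnyAlloc = AllocLike | ReallocLike
};

// A function of kind Ty matches a query mask if all of its bits are in the mask.
static bool matches(AllocType Ty, AllocType Mask) { return (Ty & Mask) == Ty; }

int main() {
  std::printf("%d %d %d\n", matches(AlignedAllocLike, MallocOrCallocLike),
              matches(AlignedAllocLike, AllocLike),
              matches(ReallocLike, AllocLike)); // 1 1 0
  return 0;
}
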
diff --git a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
index a97a56e25805..566eba5c54af 100644
--- a/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/MemoryDependenceAnalysis.cpp
@@ -23,7 +23,6 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/PHITransAddr.h"
#include "llvm/Analysis/PhiValues.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -238,83 +237,6 @@ MemDepResult MemoryDependenceResults::getCallDependencyFrom(
return MemDepResult::getNonFuncLocal();
}
-unsigned MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
- const Value *MemLocBase, int64_t MemLocOffs, unsigned MemLocSize,
- const LoadInst *LI) {
- // We can only extend simple integer loads.
- if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
- return 0;
-
- // Load widening is hostile to ThreadSanitizer: it may cause false positives
- // or make the reports more cryptic (access sizes are wrong).
- if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
- return 0;
-
- const DataLayout &DL = LI->getModule()->getDataLayout();
-
- // Get the base of this load.
- int64_t LIOffs = 0;
- const Value *LIBase =
- GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL);
-
- // If the two pointers are not based on the same pointer, we can't tell that
- // they are related.
- if (LIBase != MemLocBase)
- return 0;
-
- // Okay, the two values are based on the same pointer, but returned as
- // no-alias. This happens when we have things like two byte loads at "P+1"
- // and "P+3". Check to see if increasing the size of the "LI" load up to its
- // alignment (or the largest native integer type) will allow us to load all
- // the bits required by MemLoc.
-
- // If MemLoc is before LI, then no widening of LI will help us out.
- if (MemLocOffs < LIOffs)
- return 0;
-
- // Get the alignment of the load in bytes. We assume that it is safe to load
- // any legal integer up to this size without a problem. For example, if we're
- // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
- // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it
- // to i16.
- unsigned LoadAlign = LI->getAlignment();
-
- int64_t MemLocEnd = MemLocOffs + MemLocSize;
-
- // If no amount of rounding up will let MemLoc fit into LI, then bail out.
- if (LIOffs + LoadAlign < MemLocEnd)
- return 0;
-
- // This is the size of the load to try. Start with the next larger power of
- // two.
- unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
- NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
-
- while (true) {
- // If this load size is bigger than our known alignment or would not fit
- // into a native integer register, then we fail.
- if (NewLoadByteSize > LoadAlign ||
- !DL.fitsInLegalInteger(NewLoadByteSize * 8))
- return 0;
-
- if (LIOffs + NewLoadByteSize > MemLocEnd &&
- (LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeAddress) ||
- LI->getParent()->getParent()->hasFnAttribute(
- Attribute::SanitizeHWAddress)))
- // We will be reading past the location accessed by the original program.
- // While this is safe in a regular build, Address Safety analysis tools
- // may start reporting false warnings. So, don't do widening.
- return 0;
-
- // If a load of this width would include all of MemLoc, then we succeed.
- if (LIOffs + NewLoadByteSize >= MemLocEnd)
- return NewLoadByteSize;
-
- NewLoadByteSize <<= 1;
- }
-}
-
static bool isVolatile(Instruction *Inst) {
if (auto *LI = dyn_cast<LoadInst>(Inst))
return LI->isVolatile();
@@ -327,8 +249,7 @@ static bool isVolatile(Instruction *Inst) {
MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
- BasicBlock *BB, Instruction *QueryInst, unsigned *Limit,
- OrderedBasicBlock *OBB) {
+ BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) {
MemDepResult InvariantGroupDependency = MemDepResult::getUnknown();
if (QueryInst != nullptr) {
if (auto *LI = dyn_cast<LoadInst>(QueryInst)) {
@@ -339,7 +260,7 @@ MemDepResult MemoryDependenceResults::getPointerDependencyFrom(
}
}
MemDepResult SimpleDep = getSimplePointerDependencyFrom(
- MemLoc, isLoad, ScanIt, BB, QueryInst, Limit, OBB);
+ MemLoc, isLoad, ScanIt, BB, QueryInst, Limit);
if (SimpleDep.isDef())
return SimpleDep;
// Non-local invariant group dependency indicates there is non local Def
@@ -440,8 +361,7 @@ MemoryDependenceResults::getInvariantGroupPointerDependency(LoadInst *LI,
MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
const MemoryLocation &MemLoc, bool isLoad, BasicBlock::iterator ScanIt,
- BasicBlock *BB, Instruction *QueryInst, unsigned *Limit,
- OrderedBasicBlock *OBB) {
+ BasicBlock *BB, Instruction *QueryInst, unsigned *Limit) {
bool isInvariantLoad = false;
unsigned DefaultLimit = getDefaultBlockScanLimit();
@@ -488,15 +408,6 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
const DataLayout &DL = BB->getModule()->getDataLayout();
- // If the caller did not provide an ordered basic block,
- // create one to lazily compute and cache instruction
- // positions inside a BB. This is used to provide fast queries for relative
- // position between two instructions in a BB and can be used by
- // AliasAnalysis::callCapturesBefore.
- OrderedBasicBlock OBBTmp(BB);
- if (!OBB)
- OBB = &OBBTmp;
-
// Return "true" if and only if the instruction I is either a non-simple
// load or a non-simple store.
auto isNonSimpleLoadOrStore = [](Instruction *I) -> bool {
@@ -686,7 +597,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
ModRefInfo MR = AA.getModRefInfo(Inst, MemLoc);
// If necessary, perform additional analysis.
if (isModAndRefSet(MR))
- MR = AA.callCapturesBefore(Inst, MemLoc, &DT, OBB);
+ MR = AA.callCapturesBefore(Inst, MemLoc, &DT);
switch (clearMust(MR)) {
case ModRefInfo::NoModRef:
// If the call has no effect on the queried pointer, just ignore it.
@@ -712,8 +623,7 @@ MemDepResult MemoryDependenceResults::getSimplePointerDependencyFrom(
return MemDepResult::getNonFuncLocal();
}
-MemDepResult MemoryDependenceResults::getDependency(Instruction *QueryInst,
- OrderedBasicBlock *OBB) {
+MemDepResult MemoryDependenceResults::getDependency(Instruction *QueryInst) {
Instruction *ScanPos = QueryInst;
// Check for a cached result
@@ -753,7 +663,7 @@ MemDepResult MemoryDependenceResults::getDependency(Instruction *QueryInst,
LocalCache =
getPointerDependencyFrom(MemLoc, isLoad, ScanPos->getIterator(),
- QueryParent, QueryInst, nullptr, OBB);
+ QueryParent, QueryInst, nullptr);
} else if (auto *QueryCall = dyn_cast<CallBase>(QueryInst)) {
bool isReadOnly = AA.onlyReadsMemory(QueryCall);
LocalCache = getCallDependencyFrom(QueryCall, isReadOnly,
@@ -979,6 +889,11 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
Instruction *QueryInst, const MemoryLocation &Loc, bool isLoad,
BasicBlock *BB, NonLocalDepInfo *Cache, unsigned NumSortedEntries) {
+ bool isInvariantLoad = false;
+
+ if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
+ isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
+
// Do a binary search to see if we already have an entry for this block in
// the cache set. If so, find it.
NonLocalDepInfo::iterator Entry = std::upper_bound(
@@ -990,6 +905,13 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
if (Entry != Cache->begin() + NumSortedEntries && Entry->getBB() == BB)
ExistingResult = &*Entry;
+ // Use the cached result for an invariant load only if it records no
+ // dependency for the non-invariant load; in that case the invariant load
+ // cannot have a dependency either.
+ if (ExistingResult && isInvariantLoad &&
+ !ExistingResult->getResult().isNonFuncLocal())
+ ExistingResult = nullptr;
+
// If we have a cached entry, and it is non-dirty, use it as the value for
// this dependency.
if (ExistingResult && !ExistingResult->getResult().isDirty()) {
@@ -1018,6 +940,10 @@ MemDepResult MemoryDependenceResults::GetNonLocalInfoForBlock(
MemDepResult Dep =
getPointerDependencyFrom(Loc, isLoad, ScanPos, BB, QueryInst);
+ // Don't cache results for invariant load.
+ if (isInvariantLoad)
+ return Dep;
+
// If we had a dirty entry for the block, update it. Otherwise, just add
// a new entry.
if (ExistingResult)
@@ -1094,7 +1020,8 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
Instruction *QueryInst, const PHITransAddr &Pointer,
const MemoryLocation &Loc, bool isLoad, BasicBlock *StartBB,
SmallVectorImpl<NonLocalDepResult> &Result,
- DenseMap<BasicBlock *, Value *> &Visited, bool SkipFirstBlock) {
+ DenseMap<BasicBlock *, Value *> &Visited, bool SkipFirstBlock,
+ bool IsIncomplete) {
// Look up the cached info for Pointer.
ValueIsLoadPair CacheKey(Pointer.getAddr(), isLoad);
@@ -1106,6 +1033,10 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
InitialNLPI.Size = Loc.Size;
InitialNLPI.AATags = Loc.AATags;
+ bool isInvariantLoad = false;
+ if (LoadInst *LI = dyn_cast_or_null<LoadInst>(QueryInst))
+ isInvariantLoad = LI->getMetadata(LLVMContext::MD_invariant_load);
+
// Get the NLPI for CacheKey, inserting one into the map if it doesn't
// already have one.
std::pair<CachedNonLocalPointerInfo::iterator, bool> Pair =
@@ -1114,7 +1045,8 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// If we already have a cache entry for this CacheKey, we may need to do some
// work to reconcile the cache entry and the current query.
- if (!Pair.second) {
+ // Invariant loads don't participate in caching. Thus no need to reconcile.
+ if (!isInvariantLoad && !Pair.second) {
if (CacheInfo->Size != Loc.Size) {
bool ThrowOutEverything;
if (CacheInfo->Size.hasValue() && Loc.Size.hasValue()) {
@@ -1138,12 +1070,16 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
+ // The cache is cleared (in the above line) so we will have lost
+ // information about blocks we have already visited. We therefore must
+ // assume that the cache information is incomplete.
+ IsIncomplete = true;
} else {
// This query's Size is less than the cached one. Conservatively restart
// the query using the greater size.
return getNonLocalPointerDepFromBB(
QueryInst, Pointer, Loc.getWithNewSize(CacheInfo->Size), isLoad,
- StartBB, Result, Visited, SkipFirstBlock);
+ StartBB, Result, Visited, SkipFirstBlock, IsIncomplete);
}
}
@@ -1158,11 +1094,15 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
if (Instruction *Inst = Entry.getResult().getInst())
RemoveFromReverseMap(ReverseNonLocalPtrDeps, Inst, CacheKey);
CacheInfo->NonLocalDeps.clear();
+ // The cache is cleared (in the above line) so we will have lost
+ // information about blocks we have already visited. We therefore must
+ // assume that the cache information is incomplete.
+ IsIncomplete = true;
}
if (Loc.AATags)
return getNonLocalPointerDepFromBB(
QueryInst, Pointer, Loc.getWithoutAATags(), isLoad, StartBB, Result,
- Visited, SkipFirstBlock);
+ Visited, SkipFirstBlock, IsIncomplete);
}
}
@@ -1170,7 +1110,13 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
// If we have valid cached information for exactly the block we are
// investigating, just return it with no recomputation.
- if (CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
+ // Don't use cached information for invariant loads since it is valid for
+ // non-invariant loads only.
+ if (!IsIncomplete && !isInvariantLoad &&
+ CacheInfo->Pair == BBSkipFirstBlockPair(StartBB, SkipFirstBlock)) {
// We have a fully cached result for this query then we can just return the
// cached results and populate the visited set. However, we have to verify
// that we don't already have conflicting results for these blocks. Check
@@ -1207,13 +1153,18 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
}
// Otherwise, either this is a new block, a block with an invalid cache
- // pointer or one that we're about to invalidate by putting more info into it
- // than its valid cache info. If empty, the result will be valid cache info,
- // otherwise it isn't.
- if (Cache->empty())
- CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
- else
- CacheInfo->Pair = BBSkipFirstBlockPair();
+ // pointer or one that we're about to invalidate by putting more info into
+ // it than its valid cache info. If empty and not explicitly indicated as
+ // incomplete, the result will be valid cache info, otherwise it isn't.
+ //
+ // Invariant loads don't affect cache in any way thus no need to update
+ // CacheInfo as well.
+ if (!isInvariantLoad) {
+ if (!IsIncomplete && Cache->empty())
+ CacheInfo->Pair = BBSkipFirstBlockPair(StartBB, SkipFirstBlock);
+ else
+ CacheInfo->Pair = BBSkipFirstBlockPair();
+ }
SmallVector<BasicBlock *, 32> Worklist;
Worklist.push_back(StartBB);
@@ -1454,22 +1405,27 @@ bool MemoryDependenceResults::getNonLocalPointerDepFromBB(
if (SkipFirstBlock)
return false;
- bool foundBlock = false;
- for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
- if (I.getBB() != BB)
- continue;
+ // Results of invariant loads are not cached, so there is no need to update
+ // the cached information.
+ if (!isInvariantLoad) {
+ for (NonLocalDepEntry &I : llvm::reverse(*Cache)) {
+ if (I.getBB() != BB)
+ continue;
- assert((GotWorklistLimit || I.getResult().isNonLocal() ||
- !DT.isReachableFromEntry(BB)) &&
- "Should only be here with transparent block");
- foundBlock = true;
- I.setResult(MemDepResult::getUnknown());
- Result.push_back(
- NonLocalDepResult(I.getBB(), I.getResult(), Pointer.getAddr()));
- break;
+ assert((GotWorklistLimit || I.getResult().isNonLocal() ||
+ !DT.isReachableFromEntry(BB)) &&
+ "Should only be here with transparent block");
+
+ I.setResult(MemDepResult::getUnknown());
+
+
+ break;
+ }
}
- (void)foundBlock; (void)GotWorklistLimit;
- assert((foundBlock || GotWorklistLimit) && "Current block not in cache?");
+ (void)GotWorklistLimit;
+ // Go ahead and report unknown dependence.
+ Result.push_back(
+ NonLocalDepResult(BB, MemDepResult::getUnknown(), Pointer.getAddr()));
}
// Okay, we're done now. If we added new values to the cache, re-sort it.
@@ -1562,15 +1518,25 @@ void MemoryDependenceResults::removeInstruction(Instruction *RemInst) {
LocalDeps.erase(LocalDepEntry);
}
- // If we have any cached pointer dependencies on this instruction, remove
- // them. If the instruction has non-pointer type, then it can't be a pointer
- // base.
+ // If we have any cached dependencies on this instruction, remove
+ // them.
- // Remove it from both the load info and the store info. The instruction
- // can't be in either of these maps if it is non-pointer.
+ // If the instruction is a pointer, remove it from both the load info and the
+ // store info.
if (RemInst->getType()->isPointerTy()) {
RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, false));
RemoveCachedNonLocalPointerDependencies(ValueIsLoadPair(RemInst, true));
+ } else {
+ // Otherwise, if the instruction is in the map directly, it must be a load.
+ // Remove it.
+ auto toRemoveIt = NonLocalDefsCache.find(RemInst);
+ if (toRemoveIt != NonLocalDefsCache.end()) {
+ assert(isa<LoadInst>(RemInst) &&
+ "only load instructions should be added directly");
+ const Instruction *DepV = toRemoveIt->second.getResult().getInst();
+ ReverseNonLocalDefsCache.find(DepV)->second.erase(RemInst);
+ NonLocalDefsCache.erase(toRemoveIt);
+ }
}
// Loop over all of the things that depend on the instruction we're removing.
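
The invariant-load handling above amounts to a caching policy: an !invariant.load query may reuse a cached "no dependency" answer, but it never writes its own result into the cache and never disturbs the entries kept for ordinary loads. A toy sketch of that policy with hypothetical types (not MemDep's actual interfaces):

// Sketch: cache bypass for invariant queries.
#include <iostream>
#include <map>
#include <string>

enum class Dep { None, Local };

struct DepCache {
  std::map<std::string, Dep> Cache;

  Dep query(const std::string &Block, bool IsInvariantLoad,
            Dep (*Compute)(const std::string &)) {
    auto It = Cache.find(Block);
    if (It != Cache.end()) {
      // An invariant load may only reuse a cached "no dependency" result.
      if (!IsInvariantLoad || It->second == Dep::None)
        return It->second;
    }
    Dep Result = Compute(Block);
    if (!IsInvariantLoad) // Results for invariant loads are not cached.
      Cache[Block] = Result;
    return Result;
  }
};

static Dep computeNoDep(const std::string &) { return Dep::None; }
static Dep computeLocal(const std::string &) { return Dep::Local; }

int main() {
  DepCache C;
  C.query("bb1", /*IsInvariantLoad=*/false, computeLocal); // cached as Local
  // The invariant query recomputes instead of trusting the "Local" entry.
  std::cout << (C.query("bb1", true, computeNoDep) == Dep::None) << "\n"; // 1
  // The entry kept for regular queries is untouched.
  std::cout << (C.query("bb1", false, computeLocal) == Dep::Local) << "\n"; // 1
  return 0;
}
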
diff --git a/llvm/lib/Analysis/MemoryLocation.cpp b/llvm/lib/Analysis/MemoryLocation.cpp
index 103cdea148e5..4c31d6786ed8 100644
--- a/llvm/lib/Analysis/MemoryLocation.cpp
+++ b/llvm/lib/Analysis/MemoryLocation.cpp
@@ -83,6 +83,23 @@ MemoryLocation MemoryLocation::get(const AtomicRMWInst *RMWI) {
AATags);
}
+Optional<MemoryLocation> MemoryLocation::getOrNone(const Instruction *Inst) {
+ switch (Inst->getOpcode()) {
+ case Instruction::Load:
+ return get(cast<LoadInst>(Inst));
+ case Instruction::Store:
+ return get(cast<StoreInst>(Inst));
+ case Instruction::VAArg:
+ return get(cast<VAArgInst>(Inst));
+ case Instruction::AtomicCmpXchg:
+ return get(cast<AtomicCmpXchgInst>(Inst));
+ case Instruction::AtomicRMW:
+ return get(cast<AtomicRMWInst>(Inst));
+ default:
+ return None;
+ }
+}
+
MemoryLocation MemoryLocation::getForSource(const MemTransferInst *MTI) {
return getForSource(cast<AnyMemTransferInst>(MTI));
}
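
getOrNone above gives callers a single entry point that yields a location for memory-touching instructions and nothing for everything else. A standalone sketch of the same pattern using std::optional and toy types rather than LLVM's Optional/MemoryLocation:

// Sketch: opcode switch returning an optional location.
#include <iostream>
#include <optional>
#include <string>

enum class Opcode { Load, Store, Call };
struct Inst { Opcode Op; std::string Ptr; };
struct Loc { std::string Ptr; };

// Return a location for memory-touching instructions, std::nullopt otherwise,
// so callers don't need their own opcode switch.
std::optional<Loc> getOrNone(const Inst &I) {
  switch (I.Op) {
  case Opcode::Load:
  case Opcode::Store:
    return Loc{I.Ptr};
  default:
    return std::nullopt;
  }
}

int main() {
  Inst Load{Opcode::Load, "p"}, Call{Opcode::Call, ""};
  if (auto L = getOrNone(Load))
    std::cout << "load touches " << L->Ptr << "\n";
  std::cout << "call has location? " << getOrNone(Call).has_value() << "\n"; // 0
  return 0;
}
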
diff --git a/llvm/lib/Analysis/MemorySSA.cpp b/llvm/lib/Analysis/MemorySSA.cpp
index bf8dc94bfbf9..f2f5fd70f471 100644
--- a/llvm/lib/Analysis/MemorySSA.cpp
+++ b/llvm/lib/Analysis/MemorySSA.cpp
@@ -167,7 +167,7 @@ public:
if (!IsCall)
return Loc == Other.Loc;
- if (Call->getCalledValue() != Other.Call->getCalledValue())
+ if (Call->getCalledOperand() != Other.Call->getCalledOperand())
return false;
return Call->arg_size() == Other.Call->arg_size() &&
@@ -203,7 +203,7 @@ template <> struct DenseMapInfo<MemoryLocOrCall> {
hash_code hash =
hash_combine(MLOC.IsCall, DenseMapInfo<const Value *>::getHashValue(
- MLOC.getCall()->getCalledValue()));
+ MLOC.getCall()->getCalledOperand()));
for (const Value *Arg : MLOC.getCall()->args())
hash = hash_combine(hash, DenseMapInfo<const Value *>::getHashValue(Arg));
@@ -466,7 +466,8 @@ checkClobberSanity(const MemoryAccess *Start, MemoryAccess *ClobberAt,
assert(isa<MemoryPhi>(MA));
Worklist.append(
- upward_defs_begin({const_cast<MemoryAccess *>(MA), MAP.second}),
+ upward_defs_begin({const_cast<MemoryAccess *>(MA), MAP.second},
+ MSSA.getDomTree()),
upward_defs_end());
}
}
@@ -595,8 +596,8 @@ template <class AliasAnalysisType> class ClobberWalker {
void addSearches(MemoryPhi *Phi, SmallVectorImpl<ListIndex> &PausedSearches,
ListIndex PriorNode) {
- auto UpwardDefs = make_range(upward_defs_begin({Phi, Paths[PriorNode].Loc}),
- upward_defs_end());
+ auto UpwardDefs = make_range(
+ upward_defs_begin({Phi, Paths[PriorNode].Loc}, DT), upward_defs_end());
for (const MemoryAccessPair &P : UpwardDefs) {
PausedSearches.push_back(Paths.size());
Paths.emplace_back(P.second, P.first, PriorNode);
@@ -2298,7 +2299,10 @@ bool MemorySSAWrapperPass::runOnFunction(Function &F) {
return false;
}
-void MemorySSAWrapperPass::verifyAnalysis() const { MSSA->verifyMemorySSA(); }
+void MemorySSAWrapperPass::verifyAnalysis() const {
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+}
void MemorySSAWrapperPass::print(raw_ostream &OS, const Module *M) const {
MSSA->print(OS);
diff --git a/llvm/lib/Analysis/MemorySSAUpdater.cpp b/llvm/lib/Analysis/MemorySSAUpdater.cpp
index 473268982f2d..85af091772e7 100644
--- a/llvm/lib/Analysis/MemorySSAUpdater.cpp
+++ b/llvm/lib/Analysis/MemorySSAUpdater.cpp
@@ -10,11 +10,13 @@
//
//===----------------------------------------------------------------===//
#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/Analysis/IteratedDominanceFrontier.h"
#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/GlobalVariable.h"
@@ -781,24 +783,24 @@ void MemorySSAUpdater::updateExitBlocksForClonedLoop(
void MemorySSAUpdater::applyUpdates(ArrayRef<CFGUpdate> Updates,
DominatorTree &DT) {
- SmallVector<CFGUpdate, 4> RevDeleteUpdates;
+ SmallVector<CFGUpdate, 4> DeleteUpdates;
SmallVector<CFGUpdate, 4> InsertUpdates;
for (auto &Update : Updates) {
if (Update.getKind() == DT.Insert)
InsertUpdates.push_back({DT.Insert, Update.getFrom(), Update.getTo()});
else
- RevDeleteUpdates.push_back({DT.Insert, Update.getFrom(), Update.getTo()});
+ DeleteUpdates.push_back({DT.Delete, Update.getFrom(), Update.getTo()});
}
- if (!RevDeleteUpdates.empty()) {
+ if (!DeleteUpdates.empty()) {
// Update for inserted edges: use newDT and snapshot CFG as if deletes had
// not occurred.
// FIXME: This creates a new DT, so it's more expensive to do mix
// delete/inserts vs just inserts. We can do an incremental update on the DT
// to revert deletes, than re-delete the edges. Teaching DT to do this, is
// part of a pending cleanup.
- DominatorTree NewDT(DT, RevDeleteUpdates);
- GraphDiff<BasicBlock *> GD(RevDeleteUpdates);
+ DominatorTree NewDT(DT, DeleteUpdates);
+ GraphDiff<BasicBlock *> GD(DeleteUpdates, /*ReverseApplyUpdates=*/true);
applyInsertUpdates(InsertUpdates, NewDT, &GD);
} else {
GraphDiff<BasicBlock *> GD;
@@ -806,7 +808,7 @@ void MemorySSAUpdater::applyUpdates(ArrayRef<CFGUpdate> Updates,
}
// Update for deleted edges
- for (auto &Update : RevDeleteUpdates)
+ for (auto &Update : DeleteUpdates)
removeEdge(Update.getFrom(), Update.getTo());
}
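
applyUpdates above separates the incoming CFG updates into insertions and deletions, handles the insertions first (against a snapshot that, per the comment in the code, treats the deletions as not yet applied), and only then removes the deleted edges. A toy sketch of that two-phase split with plain types, no MemorySSA involved:

// Sketch: partition CFG updates, apply inserts first, then deletes.
#include <iostream>
#include <string>
#include <vector>

enum class Kind { Insert, Delete };
struct Update { Kind K; std::string From, To; };

int main() {
  std::vector<Update> Updates = {{Kind::Insert, "a", "b"},
                                 {Kind::Delete, "b", "c"},
                                 {Kind::Insert, "c", "d"}};
  std::vector<Update> InsertUpdates, DeleteUpdates;
  for (const Update &U : Updates)
    (U.K == Kind::Insert ? InsertUpdates : DeleteUpdates).push_back(U);

  // Phase 1: handle inserted edges against the pre-deletion snapshot.
  for (const Update &U : InsertUpdates)
    std::cout << "insert " << U.From << "->" << U.To << "\n";
  // Phase 2: remove the deleted edges.
  for (const Update &U : DeleteUpdates)
    std::cout << "remove " << U.From << "->" << U.To << "\n";
  return 0;
}
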
diff --git a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
index 8a1206f49c21..e7d529d0b51e 100644
--- a/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
+++ b/llvm/lib/Analysis/ModuleSummaryAnalysis.cpp
@@ -25,10 +25,10 @@
#include "llvm/Analysis/IndirectCallPromotionAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
@@ -83,7 +83,7 @@ cl::opt<std::string> ModuleSummaryDotFile(
// to know when computing summary for global var, because if global variable
// references basic block address we can't import it separately from function
// containing that basic block. For simplicity we currently don't import such
-// global vars at all. When importing function we aren't interested if any
+// global vars at all. When importing function we aren't interested if any
// instruction in it takes an address of any basic block, because instruction
// can only take an address of basic block located in the same function.
static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
@@ -99,7 +99,7 @@ static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
if (!Visited.insert(U).second)
continue;
- ImmutableCallSite CS(U);
+ const auto *CB = dyn_cast<CallBase>(U);
for (const auto &OI : U->operands()) {
const User *Operand = dyn_cast<User>(OI);
@@ -113,7 +113,7 @@ static bool findRefEdges(ModuleSummaryIndex &Index, const User *CurUser,
// We have a reference to a global value. This should be added to
// the reference set unless it is a callee. Callees are handled
// specially by WriteFunction and are added to a separate list.
- if (!(CS && CS.isCallee(&OI)))
+ if (!(CB && CB->isCallee(&OI)))
RefEdges.insert(Index.getOrInsertValueInfo(GV));
continue;
}
@@ -145,7 +145,7 @@ static void addVCallToSet(DevirtCallSite Call, GlobalValue::GUID Guid,
SetVector<FunctionSummary::ConstVCall> &ConstVCalls) {
std::vector<uint64_t> Args;
// Start from the second argument to skip the "this" pointer.
- for (auto &Arg : make_range(Call.CS.arg_begin() + 1, Call.CS.arg_end())) {
+ for (auto &Arg : make_range(Call.CB.arg_begin() + 1, Call.CB.arg_end())) {
auto *CI = dyn_cast<ConstantInt>(Arg);
if (!CI || CI->getBitWidth() > 64) {
VCalls.insert({Guid, Call.Offset});
@@ -239,12 +239,12 @@ static bool isNonVolatileStore(const Instruction *I) {
return false;
}
-static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
- const Function &F, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, DominatorTree &DT,
- bool HasLocalsInUsedOrAsm,
- DenseSet<GlobalValue::GUID> &CantBePromoted,
- bool IsThinLTO) {
+static void computeFunctionSummary(
+ ModuleSummaryIndex &Index, const Module &M, const Function &F,
+ BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, DominatorTree &DT,
+ bool HasLocalsInUsedOrAsm, DenseSet<GlobalValue::GUID> &CantBePromoted,
+ bool IsThinLTO,
+ std::function<const StackSafetyInfo *(const Function &F)> GetSSICallback) {
// Summary not currently supported for anonymous functions, they should
// have been named.
assert(F.hasName());
@@ -304,8 +304,8 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
}
}
findRefEdges(Index, &I, RefEdges, Visited);
- auto CS = ImmutableCallSite(&I);
- if (!CS)
+ const auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
continue;
const auto *CI = dyn_cast<CallInst>(&I);
@@ -317,8 +317,8 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
if (HasLocalsInUsedOrAsm && CI && CI->isInlineAsm())
HasInlineAsmMaybeReferencingInternal = true;
- auto *CalledValue = CS.getCalledValue();
- auto *CalledFunction = CS.getCalledFunction();
+ auto *CalledValue = CB->getCalledOperand();
+ auto *CalledFunction = CB->getCalledFunction();
if (CalledValue && !CalledFunction) {
CalledValue = CalledValue->stripPointerCasts();
// Stripping pointer casts can reveal a called function.
@@ -341,7 +341,7 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
}
// We should have named any anonymous globals
assert(CalledFunction->hasName());
- auto ScaledCount = PSI->getProfileCount(&I, BFI);
+ auto ScaledCount = PSI->getProfileCount(*CB, BFI);
auto Hotness = ScaledCount ? getHotness(ScaledCount.getValue(), PSI)
: CalleeInfo::HotnessType::Unknown;
if (ForceSummaryEdgesCold != FunctionSummary::FSHT_None)
@@ -391,6 +391,7 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
.updateHotness(getHotness(Candidate.Count, PSI));
}
}
+ Index.addBlockCount(F.size());
std::vector<ValueInfo> Refs;
if (IsThinLTO) {
@@ -469,12 +470,15 @@ static void computeFunctionSummary(ModuleSummaryIndex &Index, const Module &M,
// Don't try to import functions with noinline attribute.
F.getAttributes().hasFnAttribute(Attribute::NoInline),
F.hasFnAttribute(Attribute::AlwaysInline)};
+ std::vector<FunctionSummary::ParamAccess> ParamAccesses;
+ if (auto *SSI = GetSSICallback(F))
+ ParamAccesses = SSI->getParamAccesses();
auto FuncSummary = std::make_unique<FunctionSummary>(
Flags, NumInsts, FunFlags, /*EntryCount=*/0, std::move(Refs),
CallGraphEdges.takeVector(), TypeTests.takeVector(),
TypeTestAssumeVCalls.takeVector(), TypeCheckedLoadVCalls.takeVector(),
TypeTestAssumeConstVCalls.takeVector(),
- TypeCheckedLoadConstVCalls.takeVector());
+ TypeCheckedLoadConstVCalls.takeVector(), std::move(ParamAccesses));
if (NonRenamableLocal)
CantBePromoted.insert(F.getGUID());
Index.addGlobalValueSummary(F, std::move(FuncSummary));
@@ -599,7 +603,10 @@ static void computeVariableSummary(ModuleSummaryIndex &Index,
bool CanBeInternalized =
!V.hasComdat() && !V.hasAppendingLinkage() && !V.isInterposable() &&
!V.hasAvailableExternallyLinkage() && !V.hasDLLExportStorageClass();
- GlobalVarSummary::GVarFlags VarFlags(CanBeInternalized, CanBeInternalized);
+ bool Constant = V.isConstant();
+ GlobalVarSummary::GVarFlags VarFlags(CanBeInternalized,
+ Constant ? false : CanBeInternalized,
+ Constant, V.getVCallVisibility());
auto GVarSummary = std::make_unique<GlobalVarSummary>(Flags, VarFlags,
RefEdges.takeVector());
if (NonRenamableLocal)
@@ -640,7 +647,8 @@ static void setLiveRoot(ModuleSummaryIndex &Index, StringRef Name) {
ModuleSummaryIndex llvm::buildModuleSummaryIndex(
const Module &M,
std::function<BlockFrequencyInfo *(const Function &F)> GetBFICallback,
- ProfileSummaryInfo *PSI) {
+ ProfileSummaryInfo *PSI,
+ std::function<const StackSafetyInfo *(const Function &F)> GetSSICallback) {
assert(PSI);
bool EnableSplitLTOUnit = false;
if (auto *MD = mdconst::extract_or_null<ConstantInt>(
@@ -713,12 +721,16 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
ArrayRef<FunctionSummary::VFuncId>{},
ArrayRef<FunctionSummary::VFuncId>{},
ArrayRef<FunctionSummary::ConstVCall>{},
- ArrayRef<FunctionSummary::ConstVCall>{});
+ ArrayRef<FunctionSummary::ConstVCall>{},
+ ArrayRef<FunctionSummary::ParamAccess>{});
Index.addGlobalValueSummary(*GV, std::move(Summary));
} else {
std::unique_ptr<GlobalVarSummary> Summary =
std::make_unique<GlobalVarSummary>(
- GVFlags, GlobalVarSummary::GVarFlags(false, false),
+ GVFlags,
+ GlobalVarSummary::GVarFlags(
+ false, false, cast<GlobalVariable>(GV)->isConstant(),
+ GlobalObject::VCallVisibilityPublic),
ArrayRef<ValueInfo>{});
Index.addGlobalValueSummary(*GV, std::move(Summary));
}
@@ -750,7 +762,7 @@ ModuleSummaryIndex llvm::buildModuleSummaryIndex(
computeFunctionSummary(Index, M, F, BFI, PSI, DT,
!LocalsUsed.empty() || HasLocalInlineAsmSymbol,
- CantBePromoted, IsThinLTO);
+ CantBePromoted, IsThinLTO, GetSSICallback);
}
// Compute summaries for all variables defined in module, and save in the
@@ -832,13 +844,19 @@ ModuleSummaryIndex
ModuleSummaryIndexAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ bool NeedSSI = needsParamAccessSummary(M);
return buildModuleSummaryIndex(
M,
[&FAM](const Function &F) {
return &FAM.getResult<BlockFrequencyAnalysis>(
*const_cast<Function *>(&F));
},
- &PSI);
+ &PSI,
+ [&FAM, NeedSSI](const Function &F) -> const StackSafetyInfo * {
+ return NeedSSI ? &FAM.getResult<StackSafetyAnalysis>(
+ const_cast<Function &>(F))
+ : nullptr;
+ });
}
char ModuleSummaryIndexWrapperPass::ID = 0;
@@ -847,6 +865,7 @@ INITIALIZE_PASS_BEGIN(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
"Module Summary Analysis", false, true)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(StackSafetyInfoWrapperPass)
INITIALIZE_PASS_END(ModuleSummaryIndexWrapperPass, "module-summary-analysis",
"Module Summary Analysis", false, true)
@@ -861,6 +880,7 @@ ModuleSummaryIndexWrapperPass::ModuleSummaryIndexWrapperPass()
bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ bool NeedSSI = needsParamAccessSummary(M);
Index.emplace(buildModuleSummaryIndex(
M,
[this](const Function &F) {
@@ -868,7 +888,13 @@ bool ModuleSummaryIndexWrapperPass::runOnModule(Module &M) {
*const_cast<Function *>(&F))
.getBFI());
},
- PSI));
+ PSI,
+ [&](const Function &F) -> const StackSafetyInfo * {
+ return NeedSSI ? &getAnalysis<StackSafetyInfoWrapperPass>(
+ const_cast<Function &>(F))
+ .getResult()
+ : nullptr;
+ }));
return false;
}
@@ -881,4 +907,27 @@ void ModuleSummaryIndexWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
AU.addRequired<BlockFrequencyInfoWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<StackSafetyInfoWrapperPass>();
+}
+
+char ImmutableModuleSummaryIndexWrapperPass::ID = 0;
+
+ImmutableModuleSummaryIndexWrapperPass::ImmutableModuleSummaryIndexWrapperPass(
+ const ModuleSummaryIndex *Index)
+ : ImmutablePass(ID), Index(Index) {
+ initializeImmutableModuleSummaryIndexWrapperPassPass(
+ *PassRegistry::getPassRegistry());
}
+
+void ImmutableModuleSummaryIndexWrapperPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+}
+
+ImmutablePass *llvm::createImmutableModuleSummaryIndexWrapperPass(
+ const ModuleSummaryIndex *Index) {
+ return new ImmutableModuleSummaryIndexWrapperPass(Index);
+}
+
+INITIALIZE_PASS(ImmutableModuleSummaryIndexWrapperPass, "module-summary-info",
+ "Module summary info", false, true)
diff --git a/llvm/lib/Analysis/MustExecute.cpp b/llvm/lib/Analysis/MustExecute.cpp
index 952c2cbfec4e..6e3ff67bdddb 100644
--- a/llvm/lib/Analysis/MustExecute.cpp
+++ b/llvm/lib/Analysis/MustExecute.cpp
@@ -357,23 +357,29 @@ ModulePass *llvm::createMustBeExecutedContextPrinter() {
bool MustBeExecutedContextPrinter::runOnModule(Module &M) {
// We provide non-PM analysis here because the old PM doesn't like to query
// function passes from a module pass.
- SmallVector<PostDominatorTree *, 8> PDTs;
- SmallVector<DominatorTree *, 8> DTs;
- SmallVector<LoopInfo *, 8> LIs;
+ SmallVector<std::unique_ptr<PostDominatorTree>, 8> PDTs;
+ SmallVector<std::unique_ptr<DominatorTree>, 8> DTs;
+ SmallVector<std::unique_ptr<LoopInfo>, 8> LIs;
GetterTy<LoopInfo> LIGetter = [&](const Function &F) {
- DominatorTree *DT = new DominatorTree(const_cast<Function &>(F));
- LoopInfo *LI = new LoopInfo(*DT);
- DTs.push_back(DT);
- LIs.push_back(LI);
- return LI;
+ DTs.push_back(std::make_unique<DominatorTree>(const_cast<Function &>(F)));
+ LIs.push_back(std::make_unique<LoopInfo>(*DTs.back()));
+ return LIs.back().get();
+ };
+ GetterTy<DominatorTree> DTGetter = [&](const Function &F) {
+ DTs.push_back(std::make_unique<DominatorTree>(const_cast<Function&>(F)));
+ return DTs.back().get();
};
GetterTy<PostDominatorTree> PDTGetter = [&](const Function &F) {
- PostDominatorTree *PDT = new PostDominatorTree(const_cast<Function &>(F));
- PDTs.push_back(PDT);
- return PDT;
+ PDTs.push_back(
+ std::make_unique<PostDominatorTree>(const_cast<Function &>(F)));
+ return PDTs.back().get();
};
- MustBeExecutedContextExplorer Explorer(true, LIGetter, PDTGetter);
+ MustBeExecutedContextExplorer Explorer(
+ /* ExploreInterBlock */ true,
+ /* ExploreCFGForward */ true,
+ /* ExploreCFGBackward */ true, LIGetter, DTGetter, PDTGetter);
+
for (Function &F : M) {
for (Instruction &I : instructions(F)) {
dbgs() << "-- Explore context of: " << I << "\n";
@@ -383,9 +389,6 @@ bool MustBeExecutedContextPrinter::runOnModule(Module &M) {
}
}
- DeleteContainerPointers(PDTs);
- DeleteContainerPointers(LIs);
- DeleteContainerPointers(DTs);
return false;
}
@@ -475,13 +478,13 @@ static bool maybeEndlessLoop(const Loop &L) {
return true;
}
-static bool mayContainIrreducibleControl(const Function &F, const LoopInfo *LI) {
+bool llvm::mayContainIrreducibleControl(const Function &F, const LoopInfo *LI) {
if (!LI)
return false;
using RPOTraversal = ReversePostOrderTraversal<const Function *>;
RPOTraversal FuncRPOT(&F);
- return !containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
- const LoopInfo>(FuncRPOT, *LI);
+ return containsIrreducibleCFG<const BasicBlock *, const RPOTraversal,
+ const LoopInfo>(FuncRPOT, *LI);
}
/// Lookup \p Key in \p Map and return the result, potentially after
@@ -632,6 +635,72 @@ MustBeExecutedContextExplorer::findForwardJoinPoint(const BasicBlock *InitBB) {
LLVM_DEBUG(dbgs() << "\tJoin block: " << JoinBB->getName() << "\n");
return JoinBB;
}
+const BasicBlock *
+MustBeExecutedContextExplorer::findBackwardJoinPoint(const BasicBlock *InitBB) {
+ const LoopInfo *LI = LIGetter(*InitBB->getParent());
+ const DominatorTree *DT = DTGetter(*InitBB->getParent());
+ LLVM_DEBUG(dbgs() << "\tFind backward join point for " << InitBB->getName()
+ << (LI ? " [LI]" : "") << (DT ? " [DT]" : ""));
+
+ // Try to determine a join block with the help of the dominator tree. If no
+ // tree was provided, we perform simple pattern matching for single-block
+ // conditionals only.
+ if (DT)
+ if (const auto *InitNode = DT->getNode(InitBB))
+ if (const auto *IDomNode = InitNode->getIDom())
+ return IDomNode->getBlock();
+
+ const Loop *L = LI ? LI->getLoopFor(InitBB) : nullptr;
+ const BasicBlock *HeaderBB = L ? L->getHeader() : nullptr;
+
+ // Determine the predecessor blocks but ignore backedges.
+ SmallVector<const BasicBlock *, 8> Worklist;
+ for (const BasicBlock *PredBB : predecessors(InitBB)) {
+ bool IsBackedge =
+ (PredBB == InitBB) || (HeaderBB == InitBB && L->contains(PredBB));
+ // Loop backedges are ignored in backwards propagation: control has to come
+ // from somewhere.
+ if (!IsBackedge)
+ Worklist.push_back(PredBB);
+ }
+
+ // If there are no other predecessor blocks, there is no join point.
+ if (Worklist.empty())
+ return nullptr;
+
+ // If there is one predecessor block, it is the join point.
+ if (Worklist.size() == 1)
+ return Worklist[0];
+
+ const BasicBlock *JoinBB = nullptr;
+ if (Worklist.size() == 2) {
+ const BasicBlock *Pred0 = Worklist[0];
+ const BasicBlock *Pred1 = Worklist[1];
+ const BasicBlock *Pred0UniquePred = Pred0->getUniquePredecessor();
+ const BasicBlock *Pred1UniquePred = Pred1->getUniquePredecessor();
+ if (Pred0 == Pred1UniquePred) {
+ // InitBB <- Pred0 = JoinBB
+ // InitBB <- Pred1 <- Pred0 = JoinBB
+ JoinBB = Pred0;
+ } else if (Pred1 == Pred0UniquePred) {
+ // InitBB <- Pred0 <- Pred1 = JoinBB
+ // InitBB <- Pred1 = JoinBB
+ JoinBB = Pred1;
+ } else if (Pred0UniquePred == Pred1UniquePred) {
+ // InitBB <- Pred0 <- JoinBB
+ // InitBB <- Pred1 <- JoinBB
+ JoinBB = Pred0UniquePred;
+ }
+ }
+
+ if (!JoinBB && L)
+ JoinBB = L->getHeader();
+
+ // In the backward direction there is no need to show termination of previous
+ // instructions. If they do not terminate, the code afterward is dead, making
+ // any information/transformation correct anyway.
+ return JoinBB;
+}
const Instruction *
MustBeExecutedContextExplorer::getMustBeExecutedNextInstruction(
@@ -690,6 +759,47 @@ MustBeExecutedContextExplorer::getMustBeExecutedNextInstruction(
return nullptr;
}
+const Instruction *
+MustBeExecutedContextExplorer::getMustBeExecutedPrevInstruction(
+ MustBeExecutedIterator &It, const Instruction *PP) {
+ if (!PP)
+ return PP;
+
+ bool IsFirst = !(PP->getPrevNode());
+ LLVM_DEBUG(dbgs() << "Find next instruction for " << *PP
+ << (IsFirst ? " [IsFirst]" : "") << "\n");
+
+ // If we explore only inside a given basic block we stop at the first
+ // instruction.
+ if (!ExploreInterBlock && IsFirst) {
+ LLVM_DEBUG(dbgs() << "\tReached block front in intra-block mode, done\n");
+ return nullptr;
+ }
+
+ // The block and function that contains the current position.
+ const BasicBlock *PPBlock = PP->getParent();
+
+ // If we are inside a block we know what instruction was executed before, the
+ // previous one.
+ if (!IsFirst) {
+ const Instruction *PrevPP = PP->getPrevNode();
+ LLVM_DEBUG(
+ dbgs() << "\tIntermediate instruction, continue with previous\n");
+ // We did not enter a callee so we simply return the previous instruction.
+ return PrevPP;
+ }
+
+ // Finally, we have to handle the case where the program point is the first in
+ // a block but not in the function. We use the findBackwardJoinPoint helper
+ // function with information about the function and helper analyses, if
+ // available.
+ if (const BasicBlock *JoinBB = findBackwardJoinPoint(PPBlock))
+ return &JoinBB->back();
+
+ LLVM_DEBUG(dbgs() << "\tNo join point found\n");
+ return nullptr;
+}
+
MustBeExecutedIterator::MustBeExecutedIterator(
MustBeExecutedContextExplorer &Explorer, const Instruction *I)
: Explorer(Explorer), CurInst(I) {
@@ -697,16 +807,31 @@ MustBeExecutedIterator::MustBeExecutedIterator(
}
void MustBeExecutedIterator::reset(const Instruction *I) {
- CurInst = I;
Visited.clear();
- Visited.insert(I);
+ resetInstruction(I);
+}
+
+void MustBeExecutedIterator::resetInstruction(const Instruction *I) {
+ CurInst = I;
+ Head = Tail = nullptr;
+ Visited.insert({I, ExplorationDirection::FORWARD});
+ Visited.insert({I, ExplorationDirection::BACKWARD});
+ if (Explorer.ExploreCFGForward)
+ Head = I;
+ if (Explorer.ExploreCFGBackward)
+ Tail = I;
}
const Instruction *MustBeExecutedIterator::advance() {
assert(CurInst && "Cannot advance an end iterator!");
- const Instruction *Next =
- Explorer.getMustBeExecutedNextInstruction(*this, CurInst);
- if (Next && !Visited.insert(Next).second)
- Next = nullptr;
- return Next;
+ Head = Explorer.getMustBeExecutedNextInstruction(*this, Head);
+ if (Head && Visited.insert({Head, ExplorationDirection ::FORWARD}).second)
+ return Head;
+ Head = nullptr;
+
+ Tail = Explorer.getMustBeExecutedPrevInstruction(*this, Tail);
+ if (Tail && Visited.insert({Tail, ExplorationDirection ::BACKWARD}).second)
+ return Tail;
+ Tail = nullptr;
+ return nullptr;
}
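
advance() above interleaves two walks from the same start point: a forward cursor (Head) and a backward cursor (Tail), with a visited set keyed by (position, direction) so each walk stops once it runs out of new positions. A toy straight-line version of that iteration:

// Sketch: two-headed iteration with a (position, direction) visited set.
#include <iostream>
#include <optional>
#include <set>
#include <utility>

enum class Dir { Forward, Backward };

struct BiIterator {
  int N;                          // toy straight-line block: positions 0..N-1
  std::optional<int> Head, Tail;  // forward and backward cursors
  std::set<std::pair<int, Dir>> Visited;

  BiIterator(int Start, int N) : N(N) {
    Head = Tail = Start;
    Visited.insert({Start, Dir::Forward});
    Visited.insert({Start, Dir::Backward});
  }

  std::optional<int> advance() {
    // Prefer the forward walk; fall back to the backward walk once it ends.
    if (Head && *Head + 1 < N &&
        Visited.insert({*Head + 1, Dir::Forward}).second)
      return Head = *Head + 1;
    Head.reset();
    if (Tail && *Tail - 1 >= 0 &&
        Visited.insert({*Tail - 1, Dir::Backward}).second)
      return Tail = *Tail - 1;
    Tail.reset();
    return std::nullopt;
  }
};

int main() {
  BiIterator It(/*Start=*/2, /*N=*/5);
  while (auto Pos = It.advance())
    std::cout << *Pos << " "; // 3 4 1 0
  std::cout << "\n";
  return 0;
}
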
diff --git a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp
index 811033e73147..80e019f5fc92 100644
--- a/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp
+++ b/llvm/lib/Analysis/ObjCARCAliasAnalysis.cpp
@@ -24,12 +24,12 @@
#include "llvm/Analysis/ObjCARCAliasAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
+#include "llvm/Analysis/Passes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
-#include "llvm/PassAnalysisSupport.h"
-#include "llvm/PassSupport.h"
+#include "llvm/Pass.h"
#define DEBUG_TYPE "objc-arc-aa"
diff --git a/llvm/lib/Analysis/ObjCARCInstKind.cpp b/llvm/lib/Analysis/ObjCARCInstKind.cpp
index 0e96c6e975c9..fb416a79ac26 100644
--- a/llvm/lib/Analysis/ObjCARCInstKind.cpp
+++ b/llvm/lib/Analysis/ObjCARCInstKind.cpp
@@ -153,7 +153,7 @@ ARCInstKind llvm::objcarc::GetFunctionClass(const Function *F) {
}
}
-// A whitelist of intrinsics that we know do not use objc pointers or decrement
+// A list of intrinsics that we know do not use objc pointers or decrement
// ref counts.
static bool isInertIntrinsic(unsigned ID) {
// TODO: Make this into a covered switch.
@@ -192,7 +192,7 @@ static bool isInertIntrinsic(unsigned ID) {
}
}
-// A whitelist of intrinsics that we know do not use objc pointers or decrement
+// A list of intrinsics that we know do not use objc pointers or decrement
// ref counts.
static bool isUseOnlyIntrinsic(unsigned ID) {
// We are conservative and even though intrinsics are unlikely to touch
@@ -234,11 +234,11 @@ ARCInstKind llvm::objcarc::GetARCInstKind(const Value *V) {
}
// Otherwise, be conservative.
- return GetCallSiteClass(CI);
+ return GetCallSiteClass(*CI);
}
case Instruction::Invoke:
// Otherwise, be conservative.
- return GetCallSiteClass(cast<InvokeInst>(I));
+ return GetCallSiteClass(cast<InvokeInst>(*I));
case Instruction::BitCast:
case Instruction::GetElementPtr:
case Instruction::Select:
diff --git a/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp b/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp
index 44e6637f6337..2cdf7a177216 100644
--- a/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp
+++ b/llvm/lib/Analysis/OptimizationRemarkEmitter.cpp
@@ -36,8 +36,7 @@ OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F)
LI.analyze(DT);
// Then compute BranchProbabilityInfo.
- BranchProbabilityInfo BPI;
- BPI.calculate(*F, LI);
+ BranchProbabilityInfo BPI(*F, LI);
// Finally compute BFI.
OwnedBFI = std::make_unique<BlockFrequencyInfo>(*F, BPI, LI);
@@ -47,6 +46,10 @@ OptimizationRemarkEmitter::OptimizationRemarkEmitter(const Function *F)
bool OptimizationRemarkEmitter::invalidate(
Function &F, const PreservedAnalyses &PA,
FunctionAnalysisManager::Invalidator &Inv) {
+ if (OwnedBFI.get()) {
+ OwnedBFI.reset();
+ BFI = nullptr;
+ }
// This analysis has no state and so can be trivially preserved but it needs
// a fresh view of BFI if it was constructed with one.
if (BFI && Inv.invalidate<BlockFrequencyAnalysis>(F, PA))
diff --git a/llvm/lib/Analysis/OrderedBasicBlock.cpp b/llvm/lib/Analysis/OrderedBasicBlock.cpp
deleted file mode 100644
index 48f2a4020c66..000000000000
--- a/llvm/lib/Analysis/OrderedBasicBlock.cpp
+++ /dev/null
@@ -1,111 +0,0 @@
-//===- OrderedBasicBlock.cpp --------------------------------- -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the OrderedBasicBlock class. OrderedBasicBlock
-// maintains an interface where clients can query if one instruction comes
-// before another in a BasicBlock. Since BasicBlock currently lacks a reliable
-// way to query relative position between instructions one can use
-// OrderedBasicBlock to do such queries. OrderedBasicBlock is lazily built on a
-// source BasicBlock and maintains an internal Instruction -> Position map. A
-// OrderedBasicBlock instance should be discarded whenever the source
-// BasicBlock changes.
-//
-// It's currently used by the CaptureTracker in order to find relative
-// positions of a pair of instructions inside a BasicBlock.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/OrderedBasicBlock.h"
-#include "llvm/IR/Instruction.h"
-using namespace llvm;
-
-OrderedBasicBlock::OrderedBasicBlock(const BasicBlock *BasicB)
- : NextInstPos(0), BB(BasicB) {
- LastInstFound = BB->end();
-}
-
-/// Given no cached results, find if \p A comes before \p B in \p BB.
-/// Cache and number out instruction while walking \p BB.
-bool OrderedBasicBlock::comesBefore(const Instruction *A,
- const Instruction *B) {
- const Instruction *Inst = nullptr;
- assert(!(LastInstFound == BB->end() && NextInstPos != 0) &&
- "Instruction supposed to be in NumberedInsts");
- assert(A->getParent() == BB && "Instruction supposed to be in the block!");
- assert(B->getParent() == BB && "Instruction supposed to be in the block!");
-
- // Start the search with the instruction found in the last lookup round.
- auto II = BB->begin();
- auto IE = BB->end();
- if (LastInstFound != IE)
- II = std::next(LastInstFound);
-
- // Number all instructions up to the point where we find 'A' or 'B'.
- for (; II != IE; ++II) {
- Inst = cast<Instruction>(II);
- NumberedInsts[Inst] = NextInstPos++;
- if (Inst == A || Inst == B)
- break;
- }
-
- assert(II != IE && "Instruction not found?");
- assert((Inst == A || Inst == B) && "Should find A or B");
- LastInstFound = II;
- return Inst != B;
-}
-
-/// Find out whether \p A dominates \p B, meaning whether \p A
-/// comes before \p B in \p BB. This is a simplification that considers
-/// cached instruction positions and ignores other basic blocks, being
-/// only relevant to compare relative instructions positions inside \p BB.
-bool OrderedBasicBlock::dominates(const Instruction *A, const Instruction *B) {
- assert(A->getParent() == B->getParent() &&
- "Instructions must be in the same basic block!");
- assert(A->getParent() == BB && "Instructions must be in the tracked block!");
-
- // First we lookup the instructions. If they don't exist, lookup will give us
- // back ::end(). If they both exist, we compare the numbers. Otherwise, if NA
- // exists and NB doesn't, it means NA must come before NB because we would
- // have numbered NB as well if it didn't. The same is true for NB. If it
- // exists, but NA does not, NA must come after it. If neither exist, we need
- // to number the block and cache the results (by calling comesBefore).
- auto NAI = NumberedInsts.find(A);
- auto NBI = NumberedInsts.find(B);
- if (NAI != NumberedInsts.end() && NBI != NumberedInsts.end())
- return NAI->second < NBI->second;
- if (NAI != NumberedInsts.end())
- return true;
- if (NBI != NumberedInsts.end())
- return false;
-
- return comesBefore(A, B);
-}
-
-void OrderedBasicBlock::eraseInstruction(const Instruction *I) {
- if (LastInstFound != BB->end() && I == &*LastInstFound) {
- if (LastInstFound == BB->begin()) {
- LastInstFound = BB->end();
- NextInstPos = 0;
- } else
- LastInstFound--;
- }
-
- NumberedInsts.erase(I);
-}
-
-void OrderedBasicBlock::replaceInstruction(const Instruction *Old,
- const Instruction *New) {
- auto OI = NumberedInsts.find(Old);
- if (OI == NumberedInsts.end())
- return;
-
- NumberedInsts.insert({New, OI->second});
- if (LastInstFound != BB->end() && Old == &*LastInstFound)
- LastInstFound = New->getIterator();
- NumberedInsts.erase(Old);
-}
diff --git a/llvm/lib/Analysis/OrderedInstructions.cpp b/llvm/lib/Analysis/OrderedInstructions.cpp
deleted file mode 100644
index e947e5e388a8..000000000000
--- a/llvm/lib/Analysis/OrderedInstructions.cpp
+++ /dev/null
@@ -1,50 +0,0 @@
-//===-- OrderedInstructions.cpp - Instruction dominance function ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines utility to check dominance relation of 2 instructions.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Analysis/OrderedInstructions.h"
-using namespace llvm;
-
-bool OrderedInstructions::localDominates(const Instruction *InstA,
- const Instruction *InstB) const {
- assert(InstA->getParent() == InstB->getParent() &&
- "Instructions must be in the same basic block");
-
- const BasicBlock *IBB = InstA->getParent();
- auto OBB = OBBMap.find(IBB);
- if (OBB == OBBMap.end())
- OBB = OBBMap.insert({IBB, std::make_unique<OrderedBasicBlock>(IBB)}).first;
- return OBB->second->dominates(InstA, InstB);
-}
-
-/// Given 2 instructions, use OrderedBasicBlock to check for dominance relation
-/// if the instructions are in the same basic block, Otherwise, use dominator
-/// tree.
-bool OrderedInstructions::dominates(const Instruction *InstA,
- const Instruction *InstB) const {
- // Use ordered basic block to do dominance check in case the 2 instructions
- // are in the same basic block.
- if (InstA->getParent() == InstB->getParent())
- return localDominates(InstA, InstB);
- return DT->dominates(InstA->getParent(), InstB->getParent());
-}
-
-bool OrderedInstructions::dfsBefore(const Instruction *InstA,
- const Instruction *InstB) const {
- // Use ordered basic block in case the 2 instructions are in the same basic
- // block.
- if (InstA->getParent() == InstB->getParent())
- return localDominates(InstA, InstB);
-
- DomTreeNode *DA = DT->getNode(InstA->getParent());
- DomTreeNode *DB = DT->getNode(InstB->getParent());
- return DA->getDFSNumIn() < DB->getDFSNumIn();
-}
diff --git a/llvm/lib/Analysis/ProfileSummaryInfo.cpp b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
index 911d39d9a263..c9671d4f5c2e 100644
--- a/llvm/lib/Analysis/ProfileSummaryInfo.cpp
+++ b/llvm/lib/Analysis/ProfileSummaryInfo.cpp
@@ -14,11 +14,12 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ProfileSummary.h"
#include "llvm/InitializePasses.h"
+#include "llvm/ProfileData/ProfileCommon.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -66,48 +67,52 @@ static cl::opt<int> ProfileSummaryColdCount(
cl::desc("A fixed cold count that overrides the count derived from"
" profile-summary-cutoff-cold"));
-// Find the summary entry for a desired percentile of counts.
-static const ProfileSummaryEntry &getEntryForPercentile(SummaryEntryVector &DS,
- uint64_t Percentile) {
- auto It = partition_point(DS, [=](const ProfileSummaryEntry &Entry) {
- return Entry.Cutoff < Percentile;
- });
- // The required percentile has to be <= one of the percentiles in the
- // detailed summary.
- if (It == DS.end())
- report_fatal_error("Desired percentile exceeds the maximum cutoff");
- return *It;
-}
+static cl::opt<bool> PartialProfile(
+ "partial-profile", cl::Hidden, cl::init(false),
+ cl::desc("Specify the current profile is used as a partial profile."));
+
+cl::opt<bool> ScalePartialSampleProfileWorkingSetSize(
+ "scale-partial-sample-profile-working-set-size", cl::Hidden, cl::init(true),
+ cl::desc(
+ "If true, scale the working set size of the partial sample profile "
+ "by the partial profile ratio to reflect the size of the program "
+ "being compiled."));
+
+static cl::opt<double> PartialSampleProfileWorkingSetSizeScaleFactor(
+ "partial-sample-profile-working-set-size-scale-factor", cl::Hidden,
+ cl::init(0.008),
+ cl::desc("The scale factor used to scale the working set size of the "
+ "partial sample profile along with the partial profile ratio. "
+ "This includes the factor of the profile counter per block "
+ "and the factor to scale the working set size to use the same "
+ "shared thresholds as PGO."));
// The profile summary metadata may be attached either by the frontend or by
// any backend passes (IR level instrumentation, for example). This method
// checks if the Summary is null and if so checks if the summary metadata is now
-// available in the module and parses it to get the Summary object. Returns true
-// if a valid Summary is available.
-bool ProfileSummaryInfo::computeSummary() {
- if (Summary)
- return true;
+// available in the module and parses it to get the Summary object.
+void ProfileSummaryInfo::refresh() {
+ if (hasProfileSummary())
+ return;
// First try to get context sensitive ProfileSummary.
auto *SummaryMD = M.getProfileSummary(/* IsCS */ true);
- if (SummaryMD) {
+ if (SummaryMD)
Summary.reset(ProfileSummary::getFromMD(SummaryMD));
- return true;
+
+ if (!hasProfileSummary()) {
+ // This will actually return PSK_Instr or PSK_Sample summary.
+ SummaryMD = M.getProfileSummary(/* IsCS */ false);
+ if (SummaryMD)
+ Summary.reset(ProfileSummary::getFromMD(SummaryMD));
}
- // This will actually return PSK_Instr or PSK_Sample summary.
- SummaryMD = M.getProfileSummary(/* IsCS */ false);
- if (!SummaryMD)
- return false;
- Summary.reset(ProfileSummary::getFromMD(SummaryMD));
- return true;
+ if (!hasProfileSummary())
+ return;
+ computeThresholds();
}
-Optional<uint64_t>
-ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
- BlockFrequencyInfo *BFI,
- bool AllowSynthetic) {
- if (!Inst)
- return None;
- assert((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
+Optional<uint64_t> ProfileSummaryInfo::getProfileCount(
+ const CallBase &Call, BlockFrequencyInfo *BFI, bool AllowSynthetic) const {
+ assert((isa<CallInst>(Call) || isa<InvokeInst>(Call)) &&
"We can only get profile count for call/invoke instruction.");
if (hasSampleProfile()) {
// In sample PGO mode, check if there is a profile metadata on the
@@ -115,20 +120,20 @@ ProfileSummaryInfo::getProfileCount(const Instruction *Inst,
// since the sampled entry count may not be accurate. If there is no
// annotated on the instruction, return None.
uint64_t TotalCount;
- if (Inst->extractProfTotalWeight(TotalCount))
+ if (Call.extractProfTotalWeight(TotalCount))
return TotalCount;
return None;
}
if (BFI)
- return BFI->getBlockProfileCount(Inst->getParent(), AllowSynthetic);
+ return BFI->getBlockProfileCount(Call.getParent(), AllowSynthetic);
return None;
}
/// Returns true if the function's entry is hot. If it returns false, it
/// either means it is not hot or it is unknown whether it is hot or not (for
/// example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
- if (!F || !computeSummary())
+bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) const {
+ if (!F || !hasProfileSummary())
return false;
auto FunctionCount = F->getEntryCount();
// FIXME: The heuristic used below for determining hotness is based on
@@ -142,9 +147,9 @@ bool ProfileSummaryInfo::isFunctionEntryHot(const Function *F) {
/// hot total call edge count.
/// If it returns false, it either means it is not hot or it is unknown
/// (for example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
- BlockFrequencyInfo &BFI) {
- if (!F || !computeSummary())
+bool ProfileSummaryInfo::isFunctionHotInCallGraph(
+ const Function *F, BlockFrequencyInfo &BFI) const {
+ if (!F || !hasProfileSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
if (isHotCount(FunctionCount.getCount()))
@@ -155,7 +160,7 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
for (const auto &BB : *F)
for (const auto &I : BB)
if (isa<CallInst>(I) || isa<InvokeInst>(I))
- if (auto CallCount = getProfileCount(&I, nullptr))
+ if (auto CallCount = getProfileCount(cast<CallBase>(I), nullptr))
TotalCallCount += CallCount.getValue();
if (isHotCount(TotalCallCount))
return true;
@@ -171,9 +176,9 @@ bool ProfileSummaryInfo::isFunctionHotInCallGraph(const Function *F,
/// the total call edge count is cold.
/// If it returns false, it either means it is not cold or it is unknown
/// (for example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
- BlockFrequencyInfo &BFI) {
- if (!F || !computeSummary())
+bool ProfileSummaryInfo::isFunctionColdInCallGraph(
+ const Function *F, BlockFrequencyInfo &BFI) const {
+ if (!F || !hasProfileSummary())
return false;
if (auto FunctionCount = F->getEntryCount())
if (!isColdCount(FunctionCount.getCount()))
@@ -184,7 +189,7 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
for (const auto &BB : *F)
for (const auto &I : BB)
if (isa<CallInst>(I) || isa<InvokeInst>(I))
- if (auto CallCount = getProfileCount(&I, nullptr))
+ if (auto CallCount = getProfileCount(cast<CallBase>(I), nullptr))
TotalCallCount += CallCount.getValue();
if (!isColdCount(TotalCallCount))
return false;
@@ -195,40 +200,67 @@ bool ProfileSummaryInfo::isFunctionColdInCallGraph(const Function *F,
return true;
}
-// Like isFunctionHotInCallGraph but for a given cutoff.
-bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile(
- int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) {
- if (!F || !computeSummary())
+bool ProfileSummaryInfo::isFunctionHotnessUnknown(const Function &F) const {
+ assert(hasPartialSampleProfile() && "Expect partial sample profile");
+ return !F.getEntryCount().hasValue();
+}
+
+template <bool isHot>
+bool ProfileSummaryInfo::isFunctionHotOrColdInCallGraphNthPercentile(
+ int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const {
+ if (!F || !hasProfileSummary())
return false;
- if (auto FunctionCount = F->getEntryCount())
- if (isHotCountNthPercentile(PercentileCutoff, FunctionCount.getCount()))
+ if (auto FunctionCount = F->getEntryCount()) {
+ if (isHot &&
+ isHotCountNthPercentile(PercentileCutoff, FunctionCount.getCount()))
return true;
-
+ if (!isHot &&
+ !isColdCountNthPercentile(PercentileCutoff, FunctionCount.getCount()))
+ return false;
+ }
if (hasSampleProfile()) {
uint64_t TotalCallCount = 0;
for (const auto &BB : *F)
for (const auto &I : BB)
if (isa<CallInst>(I) || isa<InvokeInst>(I))
- if (auto CallCount = getProfileCount(&I, nullptr))
+ if (auto CallCount = getProfileCount(cast<CallBase>(I), nullptr))
TotalCallCount += CallCount.getValue();
- if (isHotCountNthPercentile(PercentileCutoff, TotalCallCount))
+ if (isHot && isHotCountNthPercentile(PercentileCutoff, TotalCallCount))
return true;
+ if (!isHot && !isColdCountNthPercentile(PercentileCutoff, TotalCallCount))
+ return false;
}
- for (const auto &BB : *F)
- if (isHotBlockNthPercentile(PercentileCutoff, &BB, &BFI))
+ for (const auto &BB : *F) {
+ if (isHot && isHotBlockNthPercentile(PercentileCutoff, &BB, &BFI))
return true;
- return false;
+ if (!isHot && !isColdBlockNthPercentile(PercentileCutoff, &BB, &BFI))
+ return false;
+ }
+ return !isHot;
+}
+
+// Like isFunctionHotInCallGraph but for a given cutoff.
+bool ProfileSummaryInfo::isFunctionHotInCallGraphNthPercentile(
+ int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const {
+ return isFunctionHotOrColdInCallGraphNthPercentile<true>(
+ PercentileCutoff, F, BFI);
+}
+
+bool ProfileSummaryInfo::isFunctionColdInCallGraphNthPercentile(
+ int PercentileCutoff, const Function *F, BlockFrequencyInfo &BFI) const {
+ return isFunctionHotOrColdInCallGraphNthPercentile<false>(
+ PercentileCutoff, F, BFI);
}
/// Returns true if the function's entry is a cold. If it returns false, it
/// either means it is not cold or it is unknown whether it is cold or not (for
/// example, no profile data is available).
-bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
+bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) const {
if (!F)
return false;
if (F->hasFnAttribute(Attribute::Cold))
return true;
- if (!computeSummary())
+ if (!hasProfileSummary())
return false;
auto FunctionCount = F->getEntryCount();
// FIXME: The heuristic used below for determining coldness is based on
@@ -239,116 +271,151 @@ bool ProfileSummaryInfo::isFunctionEntryCold(const Function *F) {
/// Compute the hot and cold thresholds.
void ProfileSummaryInfo::computeThresholds() {
- if (!computeSummary())
- return;
auto &DetailedSummary = Summary->getDetailedSummary();
- auto &HotEntry =
- getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffHot);
+ auto &HotEntry = ProfileSummaryBuilder::getEntryForPercentile(
+ DetailedSummary, ProfileSummaryCutoffHot);
HotCountThreshold = HotEntry.MinCount;
if (ProfileSummaryHotCount.getNumOccurrences() > 0)
HotCountThreshold = ProfileSummaryHotCount;
- auto &ColdEntry =
- getEntryForPercentile(DetailedSummary, ProfileSummaryCutoffCold);
+ auto &ColdEntry = ProfileSummaryBuilder::getEntryForPercentile(
+ DetailedSummary, ProfileSummaryCutoffCold);
ColdCountThreshold = ColdEntry.MinCount;
if (ProfileSummaryColdCount.getNumOccurrences() > 0)
ColdCountThreshold = ProfileSummaryColdCount;
assert(ColdCountThreshold <= HotCountThreshold &&
"Cold count threshold cannot exceed hot count threshold!");
- HasHugeWorkingSetSize =
- HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
- HasLargeWorkingSetSize =
- HotEntry.NumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
+ if (!hasPartialSampleProfile() || !ScalePartialSampleProfileWorkingSetSize) {
+ HasHugeWorkingSetSize =
+ HotEntry.NumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
+ HasLargeWorkingSetSize =
+ HotEntry.NumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
+ } else {
+ // Scale the working set size of the partial sample profile to reflect the
+ // size of the program being compiled.
+ double PartialProfileRatio = Summary->getPartialProfileRatio();
+ uint64_t ScaledHotEntryNumCounts =
+ static_cast<uint64_t>(HotEntry.NumCounts * PartialProfileRatio *
+ PartialSampleProfileWorkingSetSizeScaleFactor);
+ HasHugeWorkingSetSize =
+ ScaledHotEntryNumCounts > ProfileSummaryHugeWorkingSetSizeThreshold;
+ HasLargeWorkingSetSize =
+ ScaledHotEntryNumCounts > ProfileSummaryLargeWorkingSetSizeThreshold;
+ }
}
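
A small standalone illustration, not part of the patch, of the scaling done in
the else-branch above, with made-up numbers: a partial profile covering roughly
a tenth of the program and a hot summary entry spanning 5,000,000 counted
blocks. The scaled value, not the raw count, is what gets compared against the
shared huge/large working-set thresholds.

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Hypothetical inputs; the real values come from the profile summary.
    uint64_t HotEntryNumCounts = 5000000; // HotEntry.NumCounts
    double PartialProfileRatio = 0.1;     // Summary->getPartialProfileRatio()
    double ScaleFactor = 0.008;           // default of the new cl::opt above

    uint64_t Scaled = static_cast<uint64_t>(HotEntryNumCounts *
                                            PartialProfileRatio * ScaleFactor);
    // 5,000,000 * 0.1 * 0.008 = 4,000.
    std::printf("scaled working set size: %llu\n", (unsigned long long)Scaled);
    return 0;
  }
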
-Optional<uint64_t> ProfileSummaryInfo::computeThreshold(int PercentileCutoff) {
- if (!computeSummary())
+Optional<uint64_t>
+ProfileSummaryInfo::computeThreshold(int PercentileCutoff) const {
+ if (!hasProfileSummary())
return None;
auto iter = ThresholdCache.find(PercentileCutoff);
if (iter != ThresholdCache.end()) {
return iter->second;
}
auto &DetailedSummary = Summary->getDetailedSummary();
- auto &Entry =
- getEntryForPercentile(DetailedSummary, PercentileCutoff);
+ auto &Entry = ProfileSummaryBuilder::getEntryForPercentile(DetailedSummary,
+ PercentileCutoff);
uint64_t CountThreshold = Entry.MinCount;
ThresholdCache[PercentileCutoff] = CountThreshold;
return CountThreshold;
}
-bool ProfileSummaryInfo::hasHugeWorkingSetSize() {
- if (!HasHugeWorkingSetSize)
- computeThresholds();
+bool ProfileSummaryInfo::hasHugeWorkingSetSize() const {
return HasHugeWorkingSetSize && HasHugeWorkingSetSize.getValue();
}
-bool ProfileSummaryInfo::hasLargeWorkingSetSize() {
- if (!HasLargeWorkingSetSize)
- computeThresholds();
+bool ProfileSummaryInfo::hasLargeWorkingSetSize() const {
return HasLargeWorkingSetSize && HasLargeWorkingSetSize.getValue();
}
-bool ProfileSummaryInfo::isHotCount(uint64_t C) {
- if (!HotCountThreshold)
- computeThresholds();
+bool ProfileSummaryInfo::isHotCount(uint64_t C) const {
return HotCountThreshold && C >= HotCountThreshold.getValue();
}
-bool ProfileSummaryInfo::isColdCount(uint64_t C) {
- if (!ColdCountThreshold)
- computeThresholds();
+bool ProfileSummaryInfo::isColdCount(uint64_t C) const {
return ColdCountThreshold && C <= ColdCountThreshold.getValue();
}
-bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff, uint64_t C) {
+template <bool isHot>
+bool ProfileSummaryInfo::isHotOrColdCountNthPercentile(int PercentileCutoff,
+ uint64_t C) const {
auto CountThreshold = computeThreshold(PercentileCutoff);
- return CountThreshold && C >= CountThreshold.getValue();
+ if (isHot)
+ return CountThreshold && C >= CountThreshold.getValue();
+ else
+ return CountThreshold && C <= CountThreshold.getValue();
}
-uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() {
- if (!HotCountThreshold)
- computeThresholds();
+bool ProfileSummaryInfo::isHotCountNthPercentile(int PercentileCutoff,
+ uint64_t C) const {
+ return isHotOrColdCountNthPercentile<true>(PercentileCutoff, C);
+}
+
+bool ProfileSummaryInfo::isColdCountNthPercentile(int PercentileCutoff,
+ uint64_t C) const {
+ return isHotOrColdCountNthPercentile<false>(PercentileCutoff, C);
+}
+
+uint64_t ProfileSummaryInfo::getOrCompHotCountThreshold() const {
return HotCountThreshold ? HotCountThreshold.getValue() : UINT64_MAX;
}
-uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() {
- if (!ColdCountThreshold)
- computeThresholds();
+uint64_t ProfileSummaryInfo::getOrCompColdCountThreshold() const {
return ColdCountThreshold ? ColdCountThreshold.getValue() : 0;
}
-bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB, BlockFrequencyInfo *BFI) {
+bool ProfileSummaryInfo::isHotBlock(const BasicBlock *BB,
+ BlockFrequencyInfo *BFI) const {
auto Count = BFI->getBlockProfileCount(BB);
return Count && isHotCount(*Count);
}
bool ProfileSummaryInfo::isColdBlock(const BasicBlock *BB,
- BlockFrequencyInfo *BFI) {
+ BlockFrequencyInfo *BFI) const {
auto Count = BFI->getBlockProfileCount(BB);
return Count && isColdCount(*Count);
}
-bool ProfileSummaryInfo::isHotBlockNthPercentile(int PercentileCutoff,
- const BasicBlock *BB,
- BlockFrequencyInfo *BFI) {
+template <bool isHot>
+bool ProfileSummaryInfo::isHotOrColdBlockNthPercentile(
+ int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) const {
auto Count = BFI->getBlockProfileCount(BB);
- return Count && isHotCountNthPercentile(PercentileCutoff, *Count);
+ if (isHot)
+ return Count && isHotCountNthPercentile(PercentileCutoff, *Count);
+ else
+ return Count && isColdCountNthPercentile(PercentileCutoff, *Count);
+}
+
+bool ProfileSummaryInfo::isHotBlockNthPercentile(
+ int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) const {
+ return isHotOrColdBlockNthPercentile<true>(PercentileCutoff, BB, BFI);
+}
+
+bool ProfileSummaryInfo::isColdBlockNthPercentile(
+ int PercentileCutoff, const BasicBlock *BB, BlockFrequencyInfo *BFI) const {
+ return isHotOrColdBlockNthPercentile<false>(PercentileCutoff, BB, BFI);
}
-bool ProfileSummaryInfo::isHotCallSite(const CallSite &CS,
- BlockFrequencyInfo *BFI) {
- auto C = getProfileCount(CS.getInstruction(), BFI);
+bool ProfileSummaryInfo::isHotCallSite(const CallBase &CB,
+ BlockFrequencyInfo *BFI) const {
+ auto C = getProfileCount(CB, BFI);
return C && isHotCount(*C);
}
-bool ProfileSummaryInfo::isColdCallSite(const CallSite &CS,
- BlockFrequencyInfo *BFI) {
- auto C = getProfileCount(CS.getInstruction(), BFI);
+bool ProfileSummaryInfo::isColdCallSite(const CallBase &CB,
+ BlockFrequencyInfo *BFI) const {
+ auto C = getProfileCount(CB, BFI);
if (C)
return isColdCount(*C);
// In SamplePGO, if the caller has been sampled, and there is no profile
// annotated on the callsite, we consider the callsite as cold.
- return hasSampleProfile() && CS.getCaller()->hasProfileData();
+ return hasSampleProfile() && CB.getCaller()->hasProfileData();
+}
+
+bool ProfileSummaryInfo::hasPartialSampleProfile() const {
+ return hasProfileSummary() &&
+ Summary->getKind() == ProfileSummary::PSK_Sample &&
+ (PartialProfile || Summary->isPartialProfile());
}
INITIALIZE_PASS(ProfileSummaryInfoWrapperPass, "profile-summary-info",
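
A short usage sketch, not part of the patch, of the reworked interface: the
thresholds are now computed once in refresh(), so clients guard on
hasProfileSummary() and then call the const queries directly. The helper name
is illustrative, not an existing LLVM function.

  #include "llvm/Analysis/BlockFrequencyInfo.h"
  #include "llvm/Analysis/ProfileSummaryInfo.h"
  #include "llvm/IR/InstrTypes.h"

  using namespace llvm;

  static bool shouldTreatCallAsHot(const ProfileSummaryInfo &PSI,
                                   const CallBase &CB,
                                   BlockFrequencyInfo *BFI) {
    if (!PSI.hasProfileSummary())
      return false; // No summary attached: every hotness query is "unknown".
    // isHotCallSite now takes a CallBase directly (CallSite has been removed).
    return PSI.isHotCallSite(CB, BFI);
  }
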
diff --git a/llvm/lib/Analysis/RegionPrinter.cpp b/llvm/lib/Analysis/RegionPrinter.cpp
index 020ff85d1b98..1fb5faaa6a71 100644
--- a/llvm/lib/Analysis/RegionPrinter.cpp
+++ b/llvm/lib/Analysis/RegionPrinter.cpp
@@ -47,11 +47,11 @@ struct DOTGraphTraits<RegionNode*> : public DefaultDOTGraphTraits {
BasicBlock *BB = Node->getNodeAs<BasicBlock>();
if (isSimple())
- return DOTGraphTraits<const Function*>
- ::getSimpleNodeLabel(BB, BB->getParent());
+ return DOTGraphTraits<DOTFuncInfo *>
+ ::getSimpleNodeLabel(BB, nullptr);
else
- return DOTGraphTraits<const Function*>
- ::getCompleteNodeLabel(BB, BB->getParent());
+ return DOTGraphTraits<DOTFuncInfo *>
+ ::getCompleteNodeLabel(BB, nullptr);
}
return "Not implemented";
diff --git a/llvm/lib/Analysis/ReleaseModeModelRunner.cpp b/llvm/lib/Analysis/ReleaseModeModelRunner.cpp
new file mode 100644
index 000000000000..4c0ffbc17ff7
--- /dev/null
+++ b/llvm/lib/Analysis/ReleaseModeModelRunner.cpp
@@ -0,0 +1,87 @@
+//===- ReleaseModeModelRunner.cpp - Fast, precompiled model runner -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a model runner wrapping an AOT compiled ML model.
+// Only inference is supported.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/InlineModelFeatureMaps.h"
+#include "llvm/Analysis/MLInlineAdvisor.h"
+
+// codegen-ed file
+#include "InlinerSizeModel.h" // NOLINT
+
+#include <memory>
+#include <vector>
+
+using namespace llvm;
+namespace {
+
+static const char *const FeedPrefix = "feed_";
+static const char *const FetchPrefix = "fetch_";
+
+/// MLModelRunner - production mode implementation. It uses an AOT-compiled
+/// SavedModel for efficient execution.
+class ReleaseModeModelRunner final : public MLModelRunner {
+public:
+ ReleaseModeModelRunner(LLVMContext &Ctx);
+ virtual ~ReleaseModeModelRunner() = default;
+
+ bool run() override;
+
+ void setFeature(FeatureIndex Index, int64_t Value) override;
+ int64_t getFeature(int Index) const override;
+
+private:
+ std::vector<int32_t> FeatureIndices;
+ int32_t ResultIndex = -1;
+ std::unique_ptr<llvm::InlinerSizeModel> CompiledModel;
+};
+} // namespace
+
+ReleaseModeModelRunner::ReleaseModeModelRunner(LLVMContext &Ctx)
+ : MLModelRunner(Ctx),
+ CompiledModel(std::make_unique<llvm::InlinerSizeModel>()) {
+ assert(CompiledModel && "The CompiledModel should be valid");
+
+ // Use resize (not reserve): the loop below writes through operator[].
+ FeatureIndices.resize(NumberOfFeatures);
+
+ for (size_t I = 0; I < NumberOfFeatures; ++I) {
+ const int Index =
+ CompiledModel->LookupArgIndex(FeedPrefix + FeatureNameMap[I]);
+ assert(Index >= 0 && "Cannot find Feature in inlining model");
+ FeatureIndices[I] = Index;
+ }
+
+ ResultIndex =
+ CompiledModel->LookupResultIndex(std::string(FetchPrefix) + DecisionName);
+ assert(ResultIndex >= 0 && "Cannot find DecisionName in inlining model");
+}
+
+int64_t ReleaseModeModelRunner::getFeature(int Index) const {
+ return *static_cast<int64_t *>(
+ CompiledModel->arg_data(FeatureIndices[Index]));
+}
+
+void ReleaseModeModelRunner::setFeature(FeatureIndex Index, int64_t Value) {
+ *static_cast<int64_t *>(CompiledModel->arg_data(
+ FeatureIndices[static_cast<size_t>(Index)])) = Value;
+}
+
+bool ReleaseModeModelRunner::run() {
+ CompiledModel->Run();
+ return static_cast<bool>(
+ *static_cast<int64_t *>(CompiledModel->result_data(ResultIndex)));
+}
+
+std::unique_ptr<InlineAdvisor>
+llvm::getReleaseModeAdvisor(Module &M, ModuleAnalysisManager &MAM) {
+ auto AOTRunner = std::make_unique<ReleaseModeModelRunner>(M.getContext());
+ return std::make_unique<MLInlineAdvisor>(M, MAM, std::move(AOTRunner));
+}
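
A hedged sketch, not part of the patch, of the protocol MLInlineAdvisor uses
with this runner: populate every feature, then call run() and read the boolean
decision. NumberOfFeatures and FeatureIndex come from InlineModelFeatureMaps.h;
the helper and its Values parameter are illustrative.

  #include "llvm/Analysis/InlineModelFeatureMaps.h"
  #include "llvm/Analysis/MLInlineAdvisor.h"
  #include <cstdint>

  using namespace llvm;

  static bool adviseInlining(MLModelRunner &Runner,
                             const int64_t (&Values)[NumberOfFeatures]) {
    for (size_t I = 0; I < NumberOfFeatures; ++I)
      Runner.setFeature(static_cast<FeatureIndex>(I), Values[I]);
    return Runner.run(); // true => the model recommends inlining.
  }
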
diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp
index 26a9a5ddf1ea..48c686b73260 100644
--- a/llvm/lib/Analysis/ScalarEvolution.cpp
+++ b/llvm/lib/Analysis/ScalarEvolution.cpp
@@ -79,6 +79,7 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/ScalarEvolutionDivision.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -86,7 +87,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -848,273 +848,14 @@ static void GroupByComplexity(SmallVectorImpl<const SCEV *> &Ops,
}
}
-// Returns the size of the SCEV S.
-static inline int sizeOfSCEV(const SCEV *S) {
- struct FindSCEVSize {
- int Size = 0;
-
- FindSCEVSize() = default;
-
- bool follow(const SCEV *S) {
- ++Size;
- // Keep looking at all operands of S.
- return true;
- }
-
- bool isDone() const {
- return false;
- }
- };
-
- FindSCEVSize F;
- SCEVTraversal<FindSCEVSize> ST(F);
- ST.visitAll(S);
- return F.Size;
-}
-
-/// Returns true if the subtree of \p S contains at least HugeExprThreshold
-/// nodes.
-static bool isHugeExpression(const SCEV *S) {
- return S->getExpressionSize() >= HugeExprThreshold;
-}
-
-/// Returns true of \p Ops contains a huge SCEV (see definition above).
+/// Returns true if \p Ops contains a huge SCEV (i.e. one whose expression
+/// subtree has at least HugeExprThreshold nodes).
static bool hasHugeExpression(ArrayRef<const SCEV *> Ops) {
- return any_of(Ops, isHugeExpression);
+ return any_of(Ops, [](const SCEV *S) {
+ return S->getExpressionSize() >= HugeExprThreshold;
+ });
}
-namespace {
-
-struct SCEVDivision : public SCEVVisitor<SCEVDivision, void> {
-public:
- // Computes the Quotient and Remainder of the division of Numerator by
- // Denominator.
- static void divide(ScalarEvolution &SE, const SCEV *Numerator,
- const SCEV *Denominator, const SCEV **Quotient,
- const SCEV **Remainder) {
- assert(Numerator && Denominator && "Uninitialized SCEV");
-
- SCEVDivision D(SE, Numerator, Denominator);
-
- // Check for the trivial case here to avoid having to check for it in the
- // rest of the code.
- if (Numerator == Denominator) {
- *Quotient = D.One;
- *Remainder = D.Zero;
- return;
- }
-
- if (Numerator->isZero()) {
- *Quotient = D.Zero;
- *Remainder = D.Zero;
- return;
- }
-
- // A simple case when N/1. The quotient is N.
- if (Denominator->isOne()) {
- *Quotient = Numerator;
- *Remainder = D.Zero;
- return;
- }
-
- // Split the Denominator when it is a product.
- if (const SCEVMulExpr *T = dyn_cast<SCEVMulExpr>(Denominator)) {
- const SCEV *Q, *R;
- *Quotient = Numerator;
- for (const SCEV *Op : T->operands()) {
- divide(SE, *Quotient, Op, &Q, &R);
- *Quotient = Q;
-
- // Bail out when the Numerator is not divisible by one of the terms of
- // the Denominator.
- if (!R->isZero()) {
- *Quotient = D.Zero;
- *Remainder = Numerator;
- return;
- }
- }
- *Remainder = D.Zero;
- return;
- }
-
- D.visit(Numerator);
- *Quotient = D.Quotient;
- *Remainder = D.Remainder;
- }
-
- // Except in the trivial case described above, we do not know how to divide
- // Expr by Denominator for the following functions with empty implementation.
- void visitTruncateExpr(const SCEVTruncateExpr *Numerator) {}
- void visitZeroExtendExpr(const SCEVZeroExtendExpr *Numerator) {}
- void visitSignExtendExpr(const SCEVSignExtendExpr *Numerator) {}
- void visitUDivExpr(const SCEVUDivExpr *Numerator) {}
- void visitSMaxExpr(const SCEVSMaxExpr *Numerator) {}
- void visitUMaxExpr(const SCEVUMaxExpr *Numerator) {}
- void visitSMinExpr(const SCEVSMinExpr *Numerator) {}
- void visitUMinExpr(const SCEVUMinExpr *Numerator) {}
- void visitUnknown(const SCEVUnknown *Numerator) {}
- void visitCouldNotCompute(const SCEVCouldNotCompute *Numerator) {}
-
- void visitConstant(const SCEVConstant *Numerator) {
- if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
- APInt NumeratorVal = Numerator->getAPInt();
- APInt DenominatorVal = D->getAPInt();
- uint32_t NumeratorBW = NumeratorVal.getBitWidth();
- uint32_t DenominatorBW = DenominatorVal.getBitWidth();
-
- if (NumeratorBW > DenominatorBW)
- DenominatorVal = DenominatorVal.sext(NumeratorBW);
- else if (NumeratorBW < DenominatorBW)
- NumeratorVal = NumeratorVal.sext(DenominatorBW);
-
- APInt QuotientVal(NumeratorVal.getBitWidth(), 0);
- APInt RemainderVal(NumeratorVal.getBitWidth(), 0);
- APInt::sdivrem(NumeratorVal, DenominatorVal, QuotientVal, RemainderVal);
- Quotient = SE.getConstant(QuotientVal);
- Remainder = SE.getConstant(RemainderVal);
- return;
- }
- }
-
- void visitAddRecExpr(const SCEVAddRecExpr *Numerator) {
- const SCEV *StartQ, *StartR, *StepQ, *StepR;
- if (!Numerator->isAffine())
- return cannotDivide(Numerator);
- divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR);
- divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR);
- // Bail out if the types do not match.
- Type *Ty = Denominator->getType();
- if (Ty != StartQ->getType() || Ty != StartR->getType() ||
- Ty != StepQ->getType() || Ty != StepR->getType())
- return cannotDivide(Numerator);
- Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(),
- Numerator->getNoWrapFlags());
- Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(),
- Numerator->getNoWrapFlags());
- }
-
- void visitAddExpr(const SCEVAddExpr *Numerator) {
- SmallVector<const SCEV *, 2> Qs, Rs;
- Type *Ty = Denominator->getType();
-
- for (const SCEV *Op : Numerator->operands()) {
- const SCEV *Q, *R;
- divide(SE, Op, Denominator, &Q, &R);
-
- // Bail out if types do not match.
- if (Ty != Q->getType() || Ty != R->getType())
- return cannotDivide(Numerator);
-
- Qs.push_back(Q);
- Rs.push_back(R);
- }
-
- if (Qs.size() == 1) {
- Quotient = Qs[0];
- Remainder = Rs[0];
- return;
- }
-
- Quotient = SE.getAddExpr(Qs);
- Remainder = SE.getAddExpr(Rs);
- }
-
- void visitMulExpr(const SCEVMulExpr *Numerator) {
- SmallVector<const SCEV *, 2> Qs;
- Type *Ty = Denominator->getType();
-
- bool FoundDenominatorTerm = false;
- for (const SCEV *Op : Numerator->operands()) {
- // Bail out if types do not match.
- if (Ty != Op->getType())
- return cannotDivide(Numerator);
-
- if (FoundDenominatorTerm) {
- Qs.push_back(Op);
- continue;
- }
-
- // Check whether Denominator divides one of the product operands.
- const SCEV *Q, *R;
- divide(SE, Op, Denominator, &Q, &R);
- if (!R->isZero()) {
- Qs.push_back(Op);
- continue;
- }
-
- // Bail out if types do not match.
- if (Ty != Q->getType())
- return cannotDivide(Numerator);
-
- FoundDenominatorTerm = true;
- Qs.push_back(Q);
- }
-
- if (FoundDenominatorTerm) {
- Remainder = Zero;
- if (Qs.size() == 1)
- Quotient = Qs[0];
- else
- Quotient = SE.getMulExpr(Qs);
- return;
- }
-
- if (!isa<SCEVUnknown>(Denominator))
- return cannotDivide(Numerator);
-
- // The Remainder is obtained by replacing Denominator by 0 in Numerator.
- ValueToValueMap RewriteMap;
- RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
- cast<SCEVConstant>(Zero)->getValue();
- Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
-
- if (Remainder->isZero()) {
- // The Quotient is obtained by replacing Denominator by 1 in Numerator.
- RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
- cast<SCEVConstant>(One)->getValue();
- Quotient =
- SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
- return;
- }
-
- // Quotient is (Numerator - Remainder) divided by Denominator.
- const SCEV *Q, *R;
- const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder);
- // This SCEV does not seem to simplify: fail the division here.
- if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator))
- return cannotDivide(Numerator);
- divide(SE, Diff, Denominator, &Q, &R);
- if (R != Zero)
- return cannotDivide(Numerator);
- Quotient = Q;
- }
-
-private:
- SCEVDivision(ScalarEvolution &S, const SCEV *Numerator,
- const SCEV *Denominator)
- : SE(S), Denominator(Denominator) {
- Zero = SE.getZero(Denominator->getType());
- One = SE.getOne(Denominator->getType());
-
- // We generally do not know how to divide Expr by Denominator. We
- // initialize the division to a "cannot divide" state to simplify the rest
- // of the code.
- cannotDivide(Numerator);
- }
-
- // Convenience function for giving up on the division. We set the quotient to
- // be equal to zero and the remainder to be equal to the numerator.
- void cannotDivide(const SCEV *Numerator) {
- Quotient = Zero;
- Remainder = Numerator;
- }
-
- ScalarEvolution &SE;
- const SCEV *Denominator, *Quotient, *Remainder, *Zero, *One;
-};
-
-} // end anonymous namespace
-
//===----------------------------------------------------------------------===//
// Simple SCEV method implementations
//===----------------------------------------------------------------------===//
@@ -1612,7 +1353,7 @@ bool ScalarEvolution::proveNoWrapByVaryingStart(const SCEV *Start,
static APInt extractConstantWithoutWrapping(ScalarEvolution &SE,
const SCEVConstant *ConstantTerm,
const SCEVAddExpr *WholeAddExpr) {
- const APInt C = ConstantTerm->getAPInt();
+ const APInt &C = ConstantTerm->getAPInt();
const unsigned BitWidth = C.getBitWidth();
// Find number of trailing zeros of (x + y + ...) w/o the C first:
uint32_t TZ = BitWidth;
@@ -2455,6 +2196,11 @@ const SCEV *ScalarEvolution::getAddExpr(SmallVectorImpl<const SCEV *> &Ops,
if (Depth > MaxArithDepth || hasHugeExpression(Ops))
return getOrCreateAddExpr(Ops, Flags);
+ if (SCEV *S = std::get<0>(findExistingSCEVInCache(scAddExpr, Ops))) {
+ static_cast<SCEVAddExpr *>(S)->setNoWrapFlags(Flags);
+ return S;
+ }
+
// Okay, check to see if the same value occurs in the operand list more than
// once. If so, merge them together into an multiply expression. Since we
// sorted the list, these values are required to be adjacent.
@@ -2930,10 +2676,17 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
Flags = StrengthenNoWrapFlags(this, scMulExpr, Ops, Flags);
- // Limit recursion calls depth.
- if (Depth > MaxArithDepth || hasHugeExpression(Ops))
+ // Limit recursion depth, but still fold all-constant expressions.
+ // `Ops` is sorted, so it's enough to check just the last one.
+ if ((Depth > MaxArithDepth || hasHugeExpression(Ops)) &&
+ !isa<SCEVConstant>(Ops.back()))
return getOrCreateMulExpr(Ops, Flags);
+ if (SCEV *S = std::get<0>(findExistingSCEVInCache(scMulExpr, Ops))) {
+ static_cast<SCEVMulExpr *>(S)->setNoWrapFlags(Flags);
+ return S;
+ }
+
// If there are any constants, fold them together.
unsigned Idx = 0;
if (const SCEVConstant *LHSC = dyn_cast<SCEVConstant>(Ops[0])) {
@@ -3104,8 +2857,7 @@ const SCEV *ScalarEvolution::getMulExpr(SmallVectorImpl<const SCEV *> &Ops,
// Limit max number of arguments to avoid creation of unreasonably big
// SCEVAddRecs with very complex operands.
if (AddRec->getNumOperands() + OtherAddRec->getNumOperands() - 1 >
- MaxAddRecSize || isHugeExpression(AddRec) ||
- isHugeExpression(OtherAddRec))
+ MaxAddRecSize || hasHugeExpression({AddRec, OtherAddRec}))
continue;
bool Overflow = false;
@@ -3197,6 +2949,14 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
getEffectiveSCEVType(RHS->getType()) &&
"SCEVUDivExpr operand types don't match!");
+ FoldingSetNodeID ID;
+ ID.AddInteger(scUDivExpr);
+ ID.AddPointer(LHS);
+ ID.AddPointer(RHS);
+ void *IP = nullptr;
+ if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP))
+ return S;
+
if (const SCEVConstant *RHSC = dyn_cast<SCEVConstant>(RHS)) {
if (RHSC->getValue()->isOne())
return LHS; // X udiv 1 --> x
@@ -3243,9 +3003,24 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
AR->getLoop(), SCEV::FlagAnyWrap)) {
const APInt &StartInt = StartC->getAPInt();
const APInt &StartRem = StartInt.urem(StepInt);
- if (StartRem != 0)
- LHS = getAddRecExpr(getConstant(StartInt - StartRem), Step,
- AR->getLoop(), SCEV::FlagNW);
+ if (StartRem != 0) {
+ const SCEV *NewLHS =
+ getAddRecExpr(getConstant(StartInt - StartRem), Step,
+ AR->getLoop(), SCEV::FlagNW);
+ if (LHS != NewLHS) {
+ LHS = NewLHS;
+
+ // Reset the ID to include the new LHS, and check if it is
+ // already cached.
+ ID.clear();
+ ID.AddInteger(scUDivExpr);
+ ID.AddPointer(LHS);
+ ID.AddPointer(RHS);
+ IP = nullptr;
+ if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP))
+ return S;
+ }
+ }
}
}
// (A*B)/C --> A*(B/C) if safe and B/C can be folded.
@@ -3310,11 +3085,9 @@ const SCEV *ScalarEvolution::getUDivExpr(const SCEV *LHS,
}
}
- FoldingSetNodeID ID;
- ID.AddInteger(scUDivExpr);
- ID.AddPointer(LHS);
- ID.AddPointer(RHS);
- void *IP = nullptr;
+ // The Insertion Point (IP) might be invalid by now (due to UniqueSCEVs
+ // changes). Make sure we get a new one.
+ IP = nullptr;
if (const SCEV *S = UniqueSCEVs.FindNodeOrInsertPos(ID, IP)) return S;
SCEV *S = new (SCEVAllocator) SCEVUDivExpr(ID.Intern(SCEVAllocator),
LHS, RHS);
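
The lookups above follow LLVM's standard FoldingSet uniquing protocol: build a
FoldingSetNodeID key, probe with FindNodeOrInsertPos, and re-probe with a fresh
insertion point whenever the key or the set may have changed in between. A toy
version of that protocol, not part of the patch and with ownership elided
(ScalarEvolution allocates from SCEVAllocator instead), looks like this:

  #include "llvm/ADT/FoldingSet.h"

  using namespace llvm;

  namespace {
  // Node keyed by two pointers, mirroring how scUDivExpr is keyed by (LHS, RHS).
  struct KeyNode : FoldingSetNode {
    const void *A, *B;
    KeyNode(const void *A, const void *B) : A(A), B(B) {}
    void Profile(FoldingSetNodeID &ID) const {
      ID.AddPointer(A);
      ID.AddPointer(B);
    }
  };
  } // namespace

  static KeyNode *getOrCreate(FoldingSet<KeyNode> &Set, const void *A,
                              const void *B) {
    FoldingSetNodeID ID;
    ID.AddPointer(A);
    ID.AddPointer(B);
    void *IP = nullptr;
    if (KeyNode *N = Set.FindNodeOrInsertPos(ID, IP))
      return N; // Cache hit: reuse the uniqued node.
    // Any insertion performed since the probe would invalidate IP, which is
    // why getUDivExpr resets IP to nullptr and probes again before inserting.
    KeyNode *N = new KeyNode(A, B);
    Set.InsertNode(N, IP);
    return N;
  }
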
@@ -3505,9 +3278,8 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
: SCEV::FlagAnyWrap;
const SCEV *TotalOffset = getZero(IntIdxTy);
- // The array size is unimportant. The first thing we do on CurTy is getting
- // its element type.
- Type *CurTy = ArrayType::get(GEP->getSourceElementType(), 0);
+ Type *CurTy = GEP->getType();
+ bool FirstIter = true;
for (const SCEV *IndexExpr : IndexExprs) {
// Compute the (potentially symbolic) offset in bytes for this index.
if (StructType *STy = dyn_cast<StructType>(CurTy)) {
@@ -3523,7 +3295,14 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
CurTy = STy->getTypeAtIndex(Index);
} else {
// Update CurTy to its element type.
- CurTy = cast<SequentialType>(CurTy)->getElementType();
+ if (FirstIter) {
+ assert(isa<PointerType>(CurTy) &&
+ "The first index of a GEP indexes a pointer");
+ CurTy = GEP->getSourceElementType();
+ FirstIter = false;
+ } else {
+ CurTy = GetElementPtrInst::getTypeAtIndex(CurTy, (uint64_t)0);
+ }
// For an array, add the element offset, explicitly scaled.
const SCEV *ElementSize = getSizeOfExpr(IntIdxTy, CurTy);
// Getelementptr indices are signed.
@@ -3538,10 +3317,13 @@ ScalarEvolution::getGEPExpr(GEPOperator *GEP,
}
// Add the total offset from all the GEP indices to the base.
- return getAddExpr(BaseExpr, TotalOffset, Wrap);
+ auto *GEPExpr = getAddExpr(BaseExpr, TotalOffset, Wrap);
+ assert(BaseExpr->getType() == GEPExpr->getType() &&
+ "GEP should not change type mid-flight.");
+ return GEPExpr;
}
-std::tuple<const SCEV *, FoldingSetNodeID, void *>
+std::tuple<SCEV *, FoldingSetNodeID, void *>
ScalarEvolution::findExistingSCEVInCache(int SCEVType,
ArrayRef<const SCEV *> Ops) {
FoldingSetNodeID ID;
@@ -3549,7 +3331,7 @@ ScalarEvolution::findExistingSCEVInCache(int SCEVType,
ID.AddInteger(SCEVType);
for (unsigned i = 0, e = Ops.size(); i != e; ++i)
ID.AddPointer(Ops[i]);
- return std::tuple<const SCEV *, FoldingSetNodeID, void *>(
+ return std::tuple<SCEV *, FoldingSetNodeID, void *>(
UniqueSCEVs.FindNodeOrInsertPos(ID, IP), std::move(ID), IP);
}
@@ -3727,6 +3509,12 @@ const SCEV *ScalarEvolution::getSizeOfExpr(Type *IntTy, Type *AllocTy) {
// We can bypass creating a target-independent
// constant expression and then folding it back into a ConstantInt.
// This is just a compile-time optimization.
+ if (isa<ScalableVectorType>(AllocTy)) {
+ Constant *NullPtr = Constant::getNullValue(AllocTy->getPointerTo());
+ Constant *One = ConstantInt::get(IntTy, 1);
+ Constant *GEP = ConstantExpr::getGetElementPtr(AllocTy, NullPtr, One);
+ return getSCEV(ConstantExpr::getPtrToInt(GEP, IntTy));
+ }
return getConstant(IntTy, getDataLayout().getTypeAllocSize(AllocTy));
}
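
The new branch above handles scalable vectors, whose allocation size is not a
fixed integer at compile time: it encodes sizeof(AllocTy) as a
ptrtoint-of-GEP-from-null constant expression and lets getSCEV fold it later.
The standalone snippet below, not part of the patch, shows the fixed-size
analogue of that pointer-difference idiom.

  #include <cstddef>
  #include <cstdio>

  struct Elt { double X[4]; };

  int main() {
    Elt Arr[2];
    // The distance from element 0 to element 1 is exactly the element size --
    // the same quantity the GEP-of-null constant expression denotes in IR.
    std::printf("sizeof(Elt) = %zu, via pointer difference = %zu\n",
                sizeof(Elt),
                (size_t)((char *)&Arr[1] - (char *)&Arr[0]));
    return 0;
  }
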
@@ -3820,7 +3608,8 @@ bool ScalarEvolution::containsAddRecurrence(const SCEV *S) {
if (I != HasRecMap.end())
return I->second;
- bool FoundAddRec = SCEVExprContains(S, isa<SCEVAddRecExpr, const SCEV *>);
+ bool FoundAddRec =
+ SCEVExprContains(S, [](const SCEV *S) { return isa<SCEVAddRecExpr>(S); });
HasRecMap.insert({S, FoundAddRec});
return FoundAddRec;
}
@@ -4167,23 +3956,25 @@ const SCEV *ScalarEvolution::getPointerBase(const SCEV *V) {
if (!V->getType()->isPointerTy())
return V;
- if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) {
- return getPointerBase(Cast->getOperand());
- } else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) {
- const SCEV *PtrOp = nullptr;
- for (const SCEV *NAryOp : NAry->operands()) {
- if (NAryOp->getType()->isPointerTy()) {
- // Cannot find the base of an expression with multiple pointer operands.
- if (PtrOp)
- return V;
- PtrOp = NAryOp;
+ while (true) {
+ if (const SCEVCastExpr *Cast = dyn_cast<SCEVCastExpr>(V)) {
+ V = Cast->getOperand();
+ } else if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(V)) {
+ const SCEV *PtrOp = nullptr;
+ for (const SCEV *NAryOp : NAry->operands()) {
+ if (NAryOp->getType()->isPointerTy()) {
+ // Cannot find the base of an expression with multiple pointer ops.
+ if (PtrOp)
+ return V;
+ PtrOp = NAryOp;
+ }
}
- }
- if (!PtrOp)
+ if (!PtrOp) // All operands were non-pointer.
+ return V;
+ V = PtrOp;
+ } else // Not something we can look further into.
return V;
- return getPointerBase(PtrOp);
}
- return V;
}
/// Push users of the given Instruction onto the given Worklist.
@@ -5740,7 +5531,7 @@ ScalarEvolution::getRangeRef(const SCEV *S,
// For a SCEVUnknown, ask ValueTracking.
KnownBits Known = computeKnownBits(U->getValue(), DL, 0, &AC, nullptr, &DT);
if (Known.getBitWidth() != BitWidth)
- Known = Known.zextOrTrunc(BitWidth, true);
+ Known = Known.zextOrTrunc(BitWidth);
// If Known does not result in full-set, intersect with it.
if (Known.getMinValue() != Known.getMaxValue() + 1)
ConservativeResult = ConservativeResult.intersectWith(
@@ -6032,7 +5823,7 @@ bool ScalarEvolution::isSCEVExprNeverPoison(const Instruction *I) {
return false;
// Only proceed if we can prove that I does not yield poison.
- if (!programUndefinedIfFullPoison(I))
+ if (!programUndefinedIfPoison(I))
return false;
// At this point we know that if I is executed, then it does not wrap
@@ -6112,7 +5903,7 @@ bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
SmallVector<const Instruction *, 8> PoisonStack;
// We start by assuming \c I, the post-inc add recurrence, is poison. Only
- // things that are known to be fully poison under that assumption go on the
+ // things that are known to be poison under that assumption go on the
// PoisonStack.
Pushed.insert(I);
PoisonStack.push_back(I);
@@ -6122,7 +5913,7 @@ bool ScalarEvolution::isAddRecNeverPoison(const Instruction *I, const Loop *L) {
const Instruction *Poison = PoisonStack.pop_back_val();
for (auto *PoisonUser : Poison->users()) {
- if (propagatesFullPoison(cast<Instruction>(PoisonUser))) {
+ if (propagatesPoison(cast<Instruction>(PoisonUser))) {
if (Pushed.insert(cast<Instruction>(PoisonUser)).second)
PoisonStack.push_back(cast<Instruction>(PoisonUser));
} else if (auto *BI = dyn_cast<BranchInst>(PoisonUser)) {
@@ -6349,15 +6140,8 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
if (GetMinTrailingZeros(LHS) >=
(CIVal.getBitWidth() - CIVal.countLeadingZeros())) {
// Build a plain add SCEV.
- const SCEV *S = getAddExpr(LHS, getSCEV(CI));
- // If the LHS of the add was an addrec and it has no-wrap flags,
- // transfer the no-wrap flags, since an or won't introduce a wrap.
- if (const SCEVAddRecExpr *NewAR = dyn_cast<SCEVAddRecExpr>(S)) {
- const SCEVAddRecExpr *OldAR = cast<SCEVAddRecExpr>(LHS);
- const_cast<SCEVAddRecExpr *>(NewAR)->setNoWrapFlags(
- OldAR->getNoWrapFlags());
- }
- return S;
+ return getAddExpr(LHS, getSCEV(CI),
+ (SCEV::NoWrapFlags)(SCEV::FlagNUW | SCEV::FlagNSW));
}
}
break;
@@ -6413,15 +6197,19 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
if (SA->getValue().uge(BitWidth))
break;
- // It is currently not resolved how to interpret NSW for left
- // shift by BitWidth - 1, so we avoid applying flags in that
- // case. Remove this check (or this comment) once the situation
- // is resolved. See
- // http://lists.llvm.org/pipermail/llvm-dev/2015-April/084195.html
- // and http://reviews.llvm.org/D8890 .
+ // We can safely preserve the nuw flag in all cases. It's also safe to
+ // turn a nuw nsw shl into a nuw nsw mul. However, nsw in isolation
+ // requires special handling. It can be preserved as long as we're not
+ // left shifting by bitwidth - 1.
auto Flags = SCEV::FlagAnyWrap;
- if (BO->Op && SA->getValue().ult(BitWidth - 1))
- Flags = getNoWrapFlagsFromUB(BO->Op);
+ if (BO->Op) {
+ auto MulFlags = getNoWrapFlagsFromUB(BO->Op);
+ if ((MulFlags & SCEV::FlagNSW) &&
+ ((MulFlags & SCEV::FlagNUW) || SA->getValue().ult(BitWidth - 1)))
+ Flags = (SCEV::NoWrapFlags)(Flags | SCEV::FlagNSW);
+ if (MulFlags & SCEV::FlagNUW)
+ Flags = (SCEV::NoWrapFlags)(Flags | SCEV::FlagNUW);
+ }
Constant *X = ConstantInt::get(
getContext(), APInt::getOneBitSet(BitWidth, SA->getZExtValue()));
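
The rewritten comment above states when nsw survives the shl-to-mul rewrite;
the standalone program below, not part of the patch, shows why a shift by
BitWidth - 1 is the exception, using i8-sized arithmetic: the multiplier
2^7 = 128 is stored as -128 in 8 bits, so the equivalent multiply can overflow
the signed range even though the original shift does not.

  #include <cstdint>
  #include <cstdio>

  int main() {
    int8_t X = -1;
    // shl i8 -1, 7: all shifted-out bits agree with the result's sign bit,
    // so an nsw shl of -1 by 7 is well defined and yields -128.
    int8_t Shifted = (int8_t)((uint8_t)X << 7);
    // The matching multiply uses the 8-bit constant 2^7, i.e. -128, and
    // (-1) * (-128) = +128 is outside the signed 8-bit range, so "mul nsw"
    // would claim a no-signed-wrap guarantee that does not hold.
    int WouldBeProduct = (int)X * -128;
    std::printf("shl result: %d, would-be mul result: %d\n", (int)Shifted,
                WouldBeProduct);
    return 0;
  }
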
@@ -6515,6 +6303,20 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
return getSCEV(U->getOperand(0));
break;
+ case Instruction::SDiv:
+ // If both operands are non-negative, this is just an udiv.
+ if (isKnownNonNegative(getSCEV(U->getOperand(0))) &&
+ isKnownNonNegative(getSCEV(U->getOperand(1))))
+ return getUDivExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1)));
+ break;
+
+ case Instruction::SRem:
+ // If both operands are non-negative, this is just an urem.
+ if (isKnownNonNegative(getSCEV(U->getOperand(0))) &&
+ isKnownNonNegative(getSCEV(U->getOperand(1))))
+ return getURemExpr(getSCEV(U->getOperand(0)), getSCEV(U->getOperand(1)));
+ break;
+
// It's tempting to handle inttoptr and ptrtoint as no-ops, however this can
// lead to pointer expressions which cannot safely be expanded to GEPs,
// because ScalarEvolution doesn't respect the GEP aliasing rules when
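
A tiny standalone check, not part of the patch, of the fact the new SDiv/SRem
cases above rely on: when both operands are known non-negative, signed and
unsigned division and remainder agree, so getUDivExpr/getURemExpr model them
exactly.

  #include <cassert>
  #include <cstdint>

  int main() {
    int32_t A = 100, B = 7; // both non-negative
    assert(A / B == (int32_t)((uint32_t)A / (uint32_t)B));
    assert(A % B == (int32_t)((uint32_t)A % (uint32_t)B));
    // With a negative operand they differ: -7 / 2 == -3 (signed), while
    // 0xFFFFFFF9u / 2u == 0x7FFFFFFC -- hence the isKnownNonNegative guards.
    return 0;
  }
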
@@ -6538,7 +6340,7 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) {
case Instruction::Call:
case Instruction::Invoke:
- if (Value *RV = CallSite(U).getReturnedArgOperand())
+ if (Value *RV = cast<CallBase>(U)->getReturnedArgOperand())
return getSCEV(RV);
break;
}
@@ -6644,7 +6446,7 @@ const SCEV *ScalarEvolution::getExitCount(const Loop *L,
BasicBlock *ExitingBlock,
ExitCountKind Kind) {
switch (Kind) {
- case Exact:
+ case Exact:
return getBackedgeTakenInfo(L).getExact(ExitingBlock, this);
case ConstantMaximum:
return getBackedgeTakenInfo(L).getMax(ExitingBlock, this);
@@ -6661,7 +6463,7 @@ ScalarEvolution::getPredicatedBackedgeTakenCount(const Loop *L,
const SCEV *ScalarEvolution::getBackedgeTakenCount(const Loop *L,
ExitCountKind Kind) {
switch (Kind) {
- case Exact:
+ case Exact:
return getBackedgeTakenInfo(L).getExact(L, this);
case ConstantMaximum:
return getBackedgeTakenInfo(L).getMax(this);
@@ -6924,6 +6726,10 @@ void ScalarEvolution::forgetValue(Value *V) {
}
}
+void ScalarEvolution::forgetLoopDispositions(const Loop *L) {
+ LoopDispositions.clear();
+}
+
/// Get the exact loop backedge taken count considering all loop exits. A
/// computable result can only be returned for loops with all exiting blocks
/// dominating the latch. howFarToZero assumes that the limit of each loop test
@@ -8244,10 +8050,11 @@ const SCEV *ScalarEvolution::computeSCEVAtScope(const SCEV *V, const Loop *L) {
if (!isa<SCEVCouldNotCompute>(BackedgeTakenCount) &&
isKnownPositive(BackedgeTakenCount) &&
PN->getNumIncomingValues() == 2) {
+
unsigned InLoopPred = LI->contains(PN->getIncomingBlock(0)) ? 0 : 1;
- const SCEV *OnBackedge = getSCEV(PN->getIncomingValue(InLoopPred));
- if (IsAvailableOnEntry(LI, DT, OnBackedge, PN->getParent()))
- return OnBackedge;
+ Value *BackedgeVal = PN->getIncomingValue(InLoopPred);
+ if (LI->isLoopInvariant(BackedgeVal))
+ return getSCEV(BackedgeVal);
}
if (auto *BTCC = dyn_cast<SCEVConstant>(BackedgeTakenCount)) {
// Okay, we know how many times the containing loop executes. If
@@ -9226,9 +9033,11 @@ bool ScalarEvolution::isKnownViaInduction(ICmpInst::Predicate Pred,
!isAvailableAtLoopEntry(SplitRHS.first, MDL))
return false;
- return isLoopEntryGuardedByCond(MDL, Pred, SplitLHS.first, SplitRHS.first) &&
- isLoopBackedgeGuardedByCond(MDL, Pred, SplitLHS.second,
- SplitRHS.second);
+ // The backedge guard check tends to be cheaper than the entry guard check,
+ // so checking it first can short-circuit the whole query in some cases.
+ return isLoopBackedgeGuardedByCond(MDL, Pred, SplitLHS.second,
+ SplitRHS.second) &&
+ isLoopEntryGuardedByCond(MDL, Pred, SplitLHS.first, SplitRHS.first);
}
bool ScalarEvolution::isKnownPredicate(ICmpInst::Predicate Pred,
@@ -11161,8 +10970,9 @@ static bool findArrayDimensionsRec(ScalarEvolution &SE,
// Returns true when one of the SCEVs of Terms contains a SCEVUnknown parameter.
static inline bool containsParameters(SmallVectorImpl<const SCEV *> &Terms) {
for (const SCEV *T : Terms)
- if (SCEVExprContains(T, isa<SCEVUnknown, const SCEV *>))
+ if (SCEVExprContains(T, [](const SCEV *S) { return isa<SCEVUnknown>(S); }))
return true;
+
return false;
}
@@ -11411,6 +11221,51 @@ void ScalarEvolution::delinearize(const SCEV *Expr,
});
}
+bool ScalarEvolution::getIndexExpressionsFromGEP(
+ const GetElementPtrInst *GEP, SmallVectorImpl<const SCEV *> &Subscripts,
+ SmallVectorImpl<int> &Sizes) {
+ assert(Subscripts.empty() && Sizes.empty() &&
+ "Expected output lists to be empty on entry to this function.");
+ assert(GEP && "getIndexExpressionsFromGEP called with a null GEP");
+ Type *Ty = GEP->getPointerOperandType();
+ bool DroppedFirstDim = false;
+ for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
+ const SCEV *Expr = getSCEV(GEP->getOperand(i));
+ if (i == 1) {
+ if (auto *PtrTy = dyn_cast<PointerType>(Ty)) {
+ Ty = PtrTy->getElementType();
+ } else if (auto *ArrayTy = dyn_cast<ArrayType>(Ty)) {
+ Ty = ArrayTy->getElementType();
+ } else {
+ Subscripts.clear();
+ Sizes.clear();
+ return false;
+ }
+ if (auto *Const = dyn_cast<SCEVConstant>(Expr))
+ if (Const->getValue()->isZero()) {
+ DroppedFirstDim = true;
+ continue;
+ }
+ Subscripts.push_back(Expr);
+ continue;
+ }
+
+ auto *ArrayTy = dyn_cast<ArrayType>(Ty);
+ if (!ArrayTy) {
+ Subscripts.clear();
+ Sizes.clear();
+ return false;
+ }
+
+ Subscripts.push_back(Expr);
+ if (!(DroppedFirstDim && i == 2))
+ Sizes.push_back(ArrayTy->getNumElements());
+
+ Ty = ArrayTy->getElementType();
+ }
+ return !Subscripts.empty();
+}
+
//===----------------------------------------------------------------------===//
// SCEVCallbackVH Class Implementation
//===----------------------------------------------------------------------===//
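For reference, a minimal usage sketch of the getIndexExpressionsFromGEP() helper added above (not part of the diff; the wrapper function and variable names are hypothetical, and only the signature shown in the hunk is assumed):

// Sketch: recover array subscripts and dimension sizes from a GEP.
static void printGEPSubscripts(ScalarEvolution &SE,
                               const GetElementPtrInst *GEP) {
  SmallVector<const SCEV *, 4> Subscripts; // one SCEV per kept GEP index
  SmallVector<int, 4> Sizes;               // constant extents of inner dims
  if (!SE.getIndexExpressionsFromGEP(GEP, Subscripts, Sizes))
    return; // pointer operand is not array-shaped; nothing recovered
  for (const SCEV *S : Subscripts)
    errs() << *S << "\n";
}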
diff --git a/llvm/lib/Analysis/ScalarEvolutionDivision.cpp b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
new file mode 100644
index 000000000000..19bf5766f448
--- /dev/null
+++ b/llvm/lib/Analysis/ScalarEvolutionDivision.cpp
@@ -0,0 +1,259 @@
+//===- ScalarEvolutionDivision.cpp - See below -------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the class that knows how to divide SCEV's.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/ScalarEvolutionDivision.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/ErrorHandling.h"
+#include <cassert>
+#include <cstdint>
+
+namespace llvm {
+class Type;
+}
+
+using namespace llvm;
+
+namespace {
+
+static inline int sizeOfSCEV(const SCEV *S) {
+ struct FindSCEVSize {
+ int Size = 0;
+
+ FindSCEVSize() = default;
+
+ bool follow(const SCEV *S) {
+ ++Size;
+ // Keep looking at all operands of S.
+ return true;
+ }
+
+ bool isDone() const { return false; }
+ };
+
+ FindSCEVSize F;
+ SCEVTraversal<FindSCEVSize> ST(F);
+ ST.visitAll(S);
+ return F.Size;
+}
+
+} // namespace
+
+// Computes the Quotient and Remainder of the division of Numerator by
+// Denominator.
+void SCEVDivision::divide(ScalarEvolution &SE, const SCEV *Numerator,
+ const SCEV *Denominator, const SCEV **Quotient,
+ const SCEV **Remainder) {
+ assert(Numerator && Denominator && "Uninitialized SCEV");
+
+ SCEVDivision D(SE, Numerator, Denominator);
+
+ // Check for the trivial case here to avoid having to check for it in the
+ // rest of the code.
+ if (Numerator == Denominator) {
+ *Quotient = D.One;
+ *Remainder = D.Zero;
+ return;
+ }
+
+ if (Numerator->isZero()) {
+ *Quotient = D.Zero;
+ *Remainder = D.Zero;
+ return;
+ }
+
+ // A simple case when N/1. The quotient is N.
+ if (Denominator->isOne()) {
+ *Quotient = Numerator;
+ *Remainder = D.Zero;
+ return;
+ }
+
+ // Split the Denominator when it is a product.
+ if (const SCEVMulExpr *T = dyn_cast<SCEVMulExpr>(Denominator)) {
+ const SCEV *Q, *R;
+ *Quotient = Numerator;
+ for (const SCEV *Op : T->operands()) {
+ divide(SE, *Quotient, Op, &Q, &R);
+ *Quotient = Q;
+
+ // Bail out when the Numerator is not divisible by one of the terms of
+ // the Denominator.
+ if (!R->isZero()) {
+ *Quotient = D.Zero;
+ *Remainder = Numerator;
+ return;
+ }
+ }
+ *Remainder = D.Zero;
+ return;
+ }
+
+ D.visit(Numerator);
+ *Quotient = D.Quotient;
+ *Remainder = D.Remainder;
+}
+
+void SCEVDivision::visitConstant(const SCEVConstant *Numerator) {
+ if (const SCEVConstant *D = dyn_cast<SCEVConstant>(Denominator)) {
+ APInt NumeratorVal = Numerator->getAPInt();
+ APInt DenominatorVal = D->getAPInt();
+ uint32_t NumeratorBW = NumeratorVal.getBitWidth();
+ uint32_t DenominatorBW = DenominatorVal.getBitWidth();
+
+ if (NumeratorBW > DenominatorBW)
+ DenominatorVal = DenominatorVal.sext(NumeratorBW);
+ else if (NumeratorBW < DenominatorBW)
+ NumeratorVal = NumeratorVal.sext(DenominatorBW);
+
+ APInt QuotientVal(NumeratorVal.getBitWidth(), 0);
+ APInt RemainderVal(NumeratorVal.getBitWidth(), 0);
+ APInt::sdivrem(NumeratorVal, DenominatorVal, QuotientVal, RemainderVal);
+ Quotient = SE.getConstant(QuotientVal);
+ Remainder = SE.getConstant(RemainderVal);
+ return;
+ }
+}
+
+void SCEVDivision::visitAddRecExpr(const SCEVAddRecExpr *Numerator) {
+ const SCEV *StartQ, *StartR, *StepQ, *StepR;
+ if (!Numerator->isAffine())
+ return cannotDivide(Numerator);
+ divide(SE, Numerator->getStart(), Denominator, &StartQ, &StartR);
+ divide(SE, Numerator->getStepRecurrence(SE), Denominator, &StepQ, &StepR);
+ // Bail out if the types do not match.
+ Type *Ty = Denominator->getType();
+ if (Ty != StartQ->getType() || Ty != StartR->getType() ||
+ Ty != StepQ->getType() || Ty != StepR->getType())
+ return cannotDivide(Numerator);
+ Quotient = SE.getAddRecExpr(StartQ, StepQ, Numerator->getLoop(),
+ Numerator->getNoWrapFlags());
+ Remainder = SE.getAddRecExpr(StartR, StepR, Numerator->getLoop(),
+ Numerator->getNoWrapFlags());
+}
+
+void SCEVDivision::visitAddExpr(const SCEVAddExpr *Numerator) {
+ SmallVector<const SCEV *, 2> Qs, Rs;
+ Type *Ty = Denominator->getType();
+
+ for (const SCEV *Op : Numerator->operands()) {
+ const SCEV *Q, *R;
+ divide(SE, Op, Denominator, &Q, &R);
+
+ // Bail out if types do not match.
+ if (Ty != Q->getType() || Ty != R->getType())
+ return cannotDivide(Numerator);
+
+ Qs.push_back(Q);
+ Rs.push_back(R);
+ }
+
+ if (Qs.size() == 1) {
+ Quotient = Qs[0];
+ Remainder = Rs[0];
+ return;
+ }
+
+ Quotient = SE.getAddExpr(Qs);
+ Remainder = SE.getAddExpr(Rs);
+}
+
+void SCEVDivision::visitMulExpr(const SCEVMulExpr *Numerator) {
+ SmallVector<const SCEV *, 2> Qs;
+ Type *Ty = Denominator->getType();
+
+ bool FoundDenominatorTerm = false;
+ for (const SCEV *Op : Numerator->operands()) {
+ // Bail out if types do not match.
+ if (Ty != Op->getType())
+ return cannotDivide(Numerator);
+
+ if (FoundDenominatorTerm) {
+ Qs.push_back(Op);
+ continue;
+ }
+
+ // Check whether Denominator divides one of the product operands.
+ const SCEV *Q, *R;
+ divide(SE, Op, Denominator, &Q, &R);
+ if (!R->isZero()) {
+ Qs.push_back(Op);
+ continue;
+ }
+
+ // Bail out if types do not match.
+ if (Ty != Q->getType())
+ return cannotDivide(Numerator);
+
+ FoundDenominatorTerm = true;
+ Qs.push_back(Q);
+ }
+
+ if (FoundDenominatorTerm) {
+ Remainder = Zero;
+ if (Qs.size() == 1)
+ Quotient = Qs[0];
+ else
+ Quotient = SE.getMulExpr(Qs);
+ return;
+ }
+
+ if (!isa<SCEVUnknown>(Denominator))
+ return cannotDivide(Numerator);
+
+ // The Remainder is obtained by replacing Denominator by 0 in Numerator.
+ ValueToValueMap RewriteMap;
+ RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
+ cast<SCEVConstant>(Zero)->getValue();
+ Remainder = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
+
+ if (Remainder->isZero()) {
+ // The Quotient is obtained by replacing Denominator by 1 in Numerator.
+ RewriteMap[cast<SCEVUnknown>(Denominator)->getValue()] =
+ cast<SCEVConstant>(One)->getValue();
+ Quotient = SCEVParameterRewriter::rewrite(Numerator, SE, RewriteMap, true);
+ return;
+ }
+
+ // Quotient is (Numerator - Remainder) divided by Denominator.
+ const SCEV *Q, *R;
+ const SCEV *Diff = SE.getMinusSCEV(Numerator, Remainder);
+ // This SCEV does not seem to simplify: fail the division here.
+ if (sizeOfSCEV(Diff) > sizeOfSCEV(Numerator))
+ return cannotDivide(Numerator);
+ divide(SE, Diff, Denominator, &Q, &R);
+ if (R != Zero)
+ return cannotDivide(Numerator);
+ Quotient = Q;
+}
+
+SCEVDivision::SCEVDivision(ScalarEvolution &S, const SCEV *Numerator,
+ const SCEV *Denominator)
+ : SE(S), Denominator(Denominator) {
+ Zero = SE.getZero(Denominator->getType());
+ One = SE.getOne(Denominator->getType());
+
+ // We generally do not know how to divide Expr by Denominator. We initialize
+ // the division to a "cannot divide" state to simplify the rest of the code.
+ cannotDivide(Numerator);
+}
+
+// Convenience function for giving up on the division. We set the quotient to
+// be equal to zero and the remainder to be equal to the numerator.
+void SCEVDivision::cannotDivide(const SCEV *Numerator) {
+ Quotient = Zero;
+ Remainder = Numerator;
+}
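For reference, a minimal sketch of calling the divide() entry point defined above (not part of the diff; the wrapper name is hypothetical, and it assumes divide() is exposed as the static member declared in ScalarEvolutionDivision.h):

// Sketch: return the exact quotient Num/Den, or null if the division fails.
static const SCEV *exactQuotientOrNull(ScalarEvolution &SE, const SCEV *Num,
                                       const SCEV *Den) {
  const SCEV *Q, *R;
  SCEVDivision::divide(SE, Num, Den, &Q, &R);
  // On failure, cannotDivide() reports Q == 0 and R == Num, so a zero
  // remainder is the signal that the division succeeded exactly.
  return R->isZero() ? Q : nullptr;
}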
diff --git a/llvm/lib/CodeGen/SafeStackColoring.cpp b/llvm/lib/Analysis/StackLifetime.cpp
index 04a5c4b6d892..9727b7a33d1f 100644
--- a/llvm/lib/CodeGen/SafeStackColoring.cpp
+++ b/llvm/lib/Analysis/StackLifetime.cpp
@@ -1,4 +1,4 @@
-//===- SafeStackColoring.cpp - SafeStack frame coloring -------------------===//
+//===- StackLifetime.cpp - Alloca Lifetime Analysis -----------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,45 +6,64 @@
//
//===----------------------------------------------------------------------===//
-#include "SafeStackColoring.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
+#include "llvm/Analysis/StackLifetime.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/IR/AssemblyAnnotationWriter.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/Instruction.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
+#include "llvm/Support/FormattedStream.h"
+#include <algorithm>
+#include <memory>
#include <tuple>
-#include <utility>
using namespace llvm;
-using namespace llvm::safestack;
-#define DEBUG_TYPE "safestackcoloring"
+#define DEBUG_TYPE "stack-lifetime"
-// Disabled by default due to PR32143.
-static cl::opt<bool> ClColoring("safe-stack-coloring",
- cl::desc("enable safe stack coloring"),
- cl::Hidden, cl::init(false));
-
-const StackColoring::LiveRange &StackColoring::getLiveRange(AllocaInst *AI) {
+const StackLifetime::LiveRange &
+StackLifetime::getLiveRange(const AllocaInst *AI) const {
const auto IT = AllocaNumbering.find(AI);
assert(IT != AllocaNumbering.end());
return LiveRanges[IT->second];
}
-bool StackColoring::readMarker(Instruction *I, bool *IsStart) {
+bool StackLifetime::isReachable(const Instruction *I) const {
+ return BlockInstRange.find(I->getParent()) != BlockInstRange.end();
+}
+
+bool StackLifetime::isAliveAfter(const AllocaInst *AI,
+ const Instruction *I) const {
+ const BasicBlock *BB = I->getParent();
+ auto ItBB = BlockInstRange.find(BB);
+ assert(ItBB != BlockInstRange.end() && "Unreachable is not expected");
+
+ // Search the block for the first instruction following 'I'.
+ auto It = std::upper_bound(Instructions.begin() + ItBB->getSecond().first + 1,
+ Instructions.begin() + ItBB->getSecond().second, I,
+ [](const Instruction *L, const Instruction *R) {
+ return L->comesBefore(R);
+ });
+ --It;
+ unsigned InstNum = It - Instructions.begin();
+ return getLiveRange(AI).test(InstNum);
+}
+
+static bool readMarker(const Instruction *I, bool *IsStart) {
if (!I->isLifetimeStartOrEnd())
return false;
@@ -53,33 +72,24 @@ bool StackColoring::readMarker(Instruction *I, bool *IsStart) {
return true;
}
-void StackColoring::removeAllMarkers() {
- for (auto *I : Markers) {
- auto *Op = dyn_cast<Instruction>(I->getOperand(1));
- I->eraseFromParent();
- // Remove the operand bitcast, too, if it has no more uses left.
- if (Op && Op->use_empty())
- Op->eraseFromParent();
- }
-}
-
-void StackColoring::collectMarkers() {
+void StackLifetime::collectMarkers() {
InterestingAllocas.resize(NumAllocas);
- DenseMap<BasicBlock *, SmallDenseMap<Instruction *, Marker>> BBMarkerSet;
+ DenseMap<const BasicBlock *, SmallDenseMap<const IntrinsicInst *, Marker>>
+ BBMarkerSet;
// Compute the set of start/end markers per basic block.
for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
- AllocaInst *AI = Allocas[AllocaNo];
- SmallVector<Instruction *, 8> WorkList;
+ const AllocaInst *AI = Allocas[AllocaNo];
+ SmallVector<const Instruction *, 8> WorkList;
WorkList.push_back(AI);
while (!WorkList.empty()) {
- Instruction *I = WorkList.pop_back_val();
- for (User *U : I->users()) {
+ const Instruction *I = WorkList.pop_back_val();
+ for (const User *U : I->users()) {
if (auto *BI = dyn_cast<BitCastInst>(U)) {
WorkList.push_back(BI);
continue;
}
- auto *UI = dyn_cast<Instruction>(U);
+ auto *UI = dyn_cast<IntrinsicInst>(U);
if (!UI)
continue;
bool IsStart;
@@ -88,7 +98,6 @@ void StackColoring::collectMarkers() {
if (IsStart)
InterestingAllocas.set(AllocaNo);
BBMarkerSet[UI->getParent()][UI] = {AllocaNo, IsStart};
- Markers.push_back(UI);
}
}
}
@@ -101,40 +110,34 @@ void StackColoring::collectMarkers() {
// * the list of markers in the instruction order
// * the sets of allocas whose lifetime starts or ends in this BB
LLVM_DEBUG(dbgs() << "Instructions:\n");
- unsigned InstNo = 0;
- for (BasicBlock *BB : depth_first(&F)) {
- LLVM_DEBUG(dbgs() << " " << InstNo << ": BB " << BB->getName() << "\n");
- unsigned BBStart = InstNo++;
+ for (const BasicBlock *BB : depth_first(&F)) {
+ LLVM_DEBUG(dbgs() << " " << Instructions.size() << ": BB " << BB->getName()
+ << "\n");
+ auto BBStart = Instructions.size();
+ Instructions.push_back(nullptr);
- BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
- BlockInfo.Begin.resize(NumAllocas);
- BlockInfo.End.resize(NumAllocas);
- BlockInfo.LiveIn.resize(NumAllocas);
- BlockInfo.LiveOut.resize(NumAllocas);
+ BlockLifetimeInfo &BlockInfo =
+ BlockLiveness.try_emplace(BB, NumAllocas).first->getSecond();
auto &BlockMarkerSet = BBMarkerSet[BB];
if (BlockMarkerSet.empty()) {
- unsigned BBEnd = InstNo;
- BlockInstRange[BB] = std::make_pair(BBStart, BBEnd);
+ BlockInstRange[BB] = std::make_pair(BBStart, Instructions.size());
continue;
}
- auto ProcessMarker = [&](Instruction *I, const Marker &M) {
- LLVM_DEBUG(dbgs() << " " << InstNo << ": "
+ auto ProcessMarker = [&](const IntrinsicInst *I, const Marker &M) {
+ LLVM_DEBUG(dbgs() << " " << Instructions.size() << ": "
<< (M.IsStart ? "start " : "end ") << M.AllocaNo
<< ", " << *I << "\n");
- BBMarkers[BB].push_back({InstNo, M});
-
- InstructionNumbering[I] = InstNo++;
+ BBMarkers[BB].push_back({Instructions.size(), M});
+ Instructions.push_back(I);
if (M.IsStart) {
- if (BlockInfo.End.test(M.AllocaNo))
- BlockInfo.End.reset(M.AllocaNo);
+ BlockInfo.End.reset(M.AllocaNo);
BlockInfo.Begin.set(M.AllocaNo);
} else {
- if (BlockInfo.Begin.test(M.AllocaNo))
- BlockInfo.Begin.reset(M.AllocaNo);
+ BlockInfo.Begin.reset(M.AllocaNo);
BlockInfo.End.set(M.AllocaNo);
}
};
@@ -144,27 +147,28 @@ void StackColoring::collectMarkers() {
BlockMarkerSet.begin()->getSecond());
} else {
// Scan the BB to determine the marker order.
- for (Instruction &I : *BB) {
- auto It = BlockMarkerSet.find(&I);
+ for (const Instruction &I : *BB) {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II)
+ continue;
+ auto It = BlockMarkerSet.find(II);
if (It == BlockMarkerSet.end())
continue;
- ProcessMarker(&I, It->getSecond());
+ ProcessMarker(II, It->getSecond());
}
}
- unsigned BBEnd = InstNo;
- BlockInstRange[BB] = std::make_pair(BBStart, BBEnd);
+ BlockInstRange[BB] = std::make_pair(BBStart, Instructions.size());
}
- NumInst = InstNo;
}
-void StackColoring::calculateLocalLiveness() {
- bool changed = true;
- while (changed) {
- changed = false;
+void StackLifetime::calculateLocalLiveness() {
+ bool Changed = true;
+ while (Changed) {
+ Changed = false;
- for (BasicBlock *BB : depth_first(&F)) {
- BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
+ for (const BasicBlock *BB : depth_first(&F)) {
+ BlockLifetimeInfo &BlockInfo = BlockLiveness.find(BB)->getSecond();
// Compute LiveIn by unioning together the LiveOut sets of all preds.
BitVector LocalLiveIn;
@@ -173,7 +177,17 @@ void StackColoring::calculateLocalLiveness() {
// If a predecessor is unreachable, ignore it.
if (I == BlockLiveness.end())
continue;
- LocalLiveIn |= I->second.LiveOut;
+ switch (Type) {
+ case LivenessType::May:
+ LocalLiveIn |= I->second.LiveOut;
+ break;
+ case LivenessType::Must:
+ if (LocalLiveIn.empty())
+ LocalLiveIn = I->second.LiveOut;
+ else
+ LocalLiveIn &= I->second.LiveOut;
+ break;
+ }
}
// Compute LiveOut by subtracting out lifetimes that end in this
@@ -189,22 +203,21 @@ void StackColoring::calculateLocalLiveness() {
// Update block LiveIn set, noting whether it has changed.
if (LocalLiveIn.test(BlockInfo.LiveIn)) {
- changed = true;
BlockInfo.LiveIn |= LocalLiveIn;
}
// Update block LiveOut set, noting whether it has changed.
if (LocalLiveOut.test(BlockInfo.LiveOut)) {
- changed = true;
+ Changed = true;
BlockInfo.LiveOut |= LocalLiveOut;
}
}
} // while changed.
}
-void StackColoring::calculateLiveIntervals() {
+void StackLifetime::calculateLiveIntervals() {
for (auto IT : BlockLiveness) {
- BasicBlock *BB = IT.getFirst();
+ const BasicBlock *BB = IT.getFirst();
BlockLifetimeInfo &BlockInfo = IT.getSecond();
unsigned BBStart, BBEnd;
std::tie(BBStart, BBEnd) = BlockInstRange[BB];
@@ -238,7 +251,7 @@ void StackColoring::calculateLiveIntervals() {
} else {
assert(!Ended.test(AllocaNo));
if (Started.test(AllocaNo)) {
- LiveRanges[AllocaNo].AddRange(Start[AllocaNo], InstNo);
+ LiveRanges[AllocaNo].addRange(Start[AllocaNo], InstNo);
Started.reset(AllocaNo);
}
Ended.set(AllocaNo);
@@ -247,23 +260,23 @@ void StackColoring::calculateLiveIntervals() {
for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
if (Started.test(AllocaNo))
- LiveRanges[AllocaNo].AddRange(Start[AllocaNo], BBEnd);
+ LiveRanges[AllocaNo].addRange(Start[AllocaNo], BBEnd);
}
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void StackColoring::dumpAllocas() {
+LLVM_DUMP_METHOD void StackLifetime::dumpAllocas() const {
dbgs() << "Allocas:\n";
for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
dbgs() << " " << AllocaNo << ": " << *Allocas[AllocaNo] << "\n";
}
-LLVM_DUMP_METHOD void StackColoring::dumpBlockLiveness() {
+LLVM_DUMP_METHOD void StackLifetime::dumpBlockLiveness() const {
dbgs() << "Block liveness:\n";
for (auto IT : BlockLiveness) {
- BasicBlock *BB = IT.getFirst();
- BlockLifetimeInfo &BlockInfo = BlockLiveness[BB];
- auto BlockRange = BlockInstRange[BB];
+ const BasicBlock *BB = IT.getFirst();
+ const BlockLifetimeInfo &BlockInfo = BlockLiveness.find(BB)->getSecond();
+ auto BlockRange = BlockInstRange.find(BB)->getSecond();
dbgs() << " BB [" << BlockRange.first << ", " << BlockRange.second
<< "): begin " << BlockInfo.Begin << ", end " << BlockInfo.End
<< ", livein " << BlockInfo.LiveIn << ", liveout "
@@ -271,34 +284,27 @@ LLVM_DUMP_METHOD void StackColoring::dumpBlockLiveness() {
}
}
-LLVM_DUMP_METHOD void StackColoring::dumpLiveRanges() {
+LLVM_DUMP_METHOD void StackLifetime::dumpLiveRanges() const {
dbgs() << "Alloca liveness:\n";
- for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo) {
- LiveRange &Range = LiveRanges[AllocaNo];
- dbgs() << " " << AllocaNo << ": " << Range << "\n";
- }
+ for (unsigned AllocaNo = 0; AllocaNo < NumAllocas; ++AllocaNo)
+ dbgs() << " " << AllocaNo << ": " << LiveRanges[AllocaNo] << "\n";
}
#endif
-void StackColoring::run() {
+StackLifetime::StackLifetime(const Function &F,
+ ArrayRef<const AllocaInst *> Allocas,
+ LivenessType Type)
+ : F(F), Type(Type), Allocas(Allocas), NumAllocas(Allocas.size()) {
LLVM_DEBUG(dumpAllocas());
for (unsigned I = 0; I < NumAllocas; ++I)
AllocaNumbering[Allocas[I]] = I;
- LiveRanges.resize(NumAllocas);
collectMarkers();
+}
- if (!ClColoring) {
- for (auto &R : LiveRanges) {
- R.SetMaximum(1);
- R.AddRange(0, 1);
- }
- return;
- }
-
- for (auto &R : LiveRanges)
- R.SetMaximum(NumInst);
+void StackLifetime::run() {
+ LiveRanges.resize(NumAllocas, LiveRange(Instructions.size()));
for (unsigned I = 0; I < NumAllocas; ++I)
if (!InterestingAllocas.test(I))
LiveRanges[I] = getFullLiveRange();
@@ -308,3 +314,60 @@ void StackColoring::run() {
calculateLiveIntervals();
LLVM_DEBUG(dumpLiveRanges());
}
+
+class StackLifetime::LifetimeAnnotationWriter
+ : public AssemblyAnnotationWriter {
+ const StackLifetime &SL;
+
+ void printInstrAlive(unsigned InstrNo, formatted_raw_ostream &OS) {
+ SmallVector<StringRef, 16> Names;
+ for (const auto &KV : SL.AllocaNumbering) {
+ if (SL.LiveRanges[KV.getSecond()].test(InstrNo))
+ Names.push_back(KV.getFirst()->getName());
+ }
+ llvm::sort(Names);
+ OS << " ; Alive: <" << llvm::join(Names, " ") << ">\n";
+ }
+
+ void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) override {
+ auto ItBB = SL.BlockInstRange.find(BB);
+ if (ItBB == SL.BlockInstRange.end())
+ return; // Unreachable.
+ printInstrAlive(ItBB->getSecond().first, OS);
+ }
+
+ void printInfoComment(const Value &V, formatted_raw_ostream &OS) override {
+ const Instruction *Instr = dyn_cast<Instruction>(&V);
+ if (!Instr || !SL.isReachable(Instr))
+ return;
+
+ SmallVector<StringRef, 16> Names;
+ for (const auto &KV : SL.AllocaNumbering) {
+ if (SL.isAliveAfter(KV.getFirst(), Instr))
+ Names.push_back(KV.getFirst()->getName());
+ }
+ llvm::sort(Names);
+ OS << "\n ; Alive: <" << llvm::join(Names, " ") << ">\n";
+ }
+
+public:
+ LifetimeAnnotationWriter(const StackLifetime &SL) : SL(SL) {}
+};
+
+void StackLifetime::print(raw_ostream &OS) {
+ LifetimeAnnotationWriter AAW(*this);
+ F.print(OS, &AAW);
+}
+
+PreservedAnalyses StackLifetimePrinterPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ SmallVector<const AllocaInst *, 8> Allocas;
+ for (auto &I : instructions(F))
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ StackLifetime SL(F, Allocas, Type);
+ SL.run();
+ SL.print(OS);
+ return PreservedAnalyses::all();
+}
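For reference, a minimal sketch of driving the renamed StackLifetime analysis directly, mirroring what StackLifetimePrinterPass::run() does above (not part of the diff; the helper name is hypothetical):

// Sketch: report which allocas are alive after each reachable instruction.
static void reportAllocaLiveness(const Function &F) {
  SmallVector<const AllocaInst *, 8> Allocas;
  for (const Instruction &I : instructions(F))
    if (const auto *AI = dyn_cast<AllocaInst>(&I))
      Allocas.push_back(AI);
  // May-liveness, as the printer pass uses; the stack-safety analysis in the
  // next file requests Must-liveness instead.
  StackLifetime SL(F, Allocas, StackLifetime::LivenessType::May);
  SL.run();
  for (const Instruction &I : instructions(F)) {
    if (!SL.isReachable(&I))
      continue;
    for (const AllocaInst *AI : Allocas)
      if (SL.isAliveAfter(AI, &I))
        dbgs() << AI->getName() << " alive after: " << I << "\n";
  }
}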
diff --git a/llvm/lib/Analysis/StackSafetyAnalysis.cpp b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
index 7f5bedabbd80..bbfc303aefac 100644
--- a/llvm/lib/Analysis/StackSafetyAnalysis.cpp
+++ b/llvm/lib/Analysis/StackSafetyAnalysis.cpp
@@ -9,56 +9,49 @@
//===----------------------------------------------------------------------===//
#include "llvm/Analysis/StackSafetyAnalysis.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/Analysis/StackLifetime.h"
+#include "llvm/IR/ConstantRange.h"
+#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
using namespace llvm;
#define DEBUG_TYPE "stack-safety"
+STATISTIC(NumAllocaStackSafe, "Number of safe allocas");
+STATISTIC(NumAllocaTotal, "Number of total allocas");
+
static cl::opt<int> StackSafetyMaxIterations("stack-safety-max-iterations",
cl::init(20), cl::Hidden);
-namespace {
-
-/// Rewrite an SCEV expression for a memory access address to an expression that
-/// represents offset from the given alloca.
-class AllocaOffsetRewriter : public SCEVRewriteVisitor<AllocaOffsetRewriter> {
- const Value *AllocaPtr;
+static cl::opt<bool> StackSafetyPrint("stack-safety-print", cl::init(false),
+ cl::Hidden);
-public:
- AllocaOffsetRewriter(ScalarEvolution &SE, const Value *AllocaPtr)
- : SCEVRewriteVisitor(SE), AllocaPtr(AllocaPtr) {}
-
- const SCEV *visit(const SCEV *Expr) {
- // Only re-write the expression if the alloca is used in an addition
- // expression (it can be used in other types of expressions if it's cast to
- // an int and passed as an argument.)
- if (!isa<SCEVAddRecExpr>(Expr) && !isa<SCEVAddExpr>(Expr) &&
- !isa<SCEVUnknown>(Expr))
- return Expr;
- return SCEVRewriteVisitor<AllocaOffsetRewriter>::visit(Expr);
- }
+static cl::opt<bool> StackSafetyRun("stack-safety-run", cl::init(false),
+ cl::Hidden);
- const SCEV *visitUnknown(const SCEVUnknown *Expr) {
- // FIXME: look through one or several levels of definitions?
- // This can be inttoptr(AllocaPtr) and SCEV would not unwrap
- // it for us.
- if (Expr->getValue() == AllocaPtr)
- return SE.getZero(Expr->getType());
- return Expr;
- }
-};
+namespace {
/// Describes use of address in as a function call argument.
-struct PassAsArgInfo {
+template <typename CalleeTy> struct CallInfo {
/// Function being called.
- const GlobalValue *Callee = nullptr;
+ const CalleeTy *Callee = nullptr;
  /// Index of the argument which passes the address.
size_t ParamNo = 0;
// Offset range of address from base address (alloca or calling function
@@ -66,234 +59,262 @@ struct PassAsArgInfo {
  // Range should never be set to the empty set; that is an invalid access range
  // that can cause the empty set to be propagated with ConstantRange::add
ConstantRange Offset;
- PassAsArgInfo(const GlobalValue *Callee, size_t ParamNo, ConstantRange Offset)
+ CallInfo(const CalleeTy *Callee, size_t ParamNo, const ConstantRange &Offset)
: Callee(Callee), ParamNo(ParamNo), Offset(Offset) {}
-
- StringRef getName() const { return Callee->getName(); }
};
-raw_ostream &operator<<(raw_ostream &OS, const PassAsArgInfo &P) {
- return OS << "@" << P.getName() << "(arg" << P.ParamNo << ", " << P.Offset
- << ")";
+template <typename CalleeTy>
+raw_ostream &operator<<(raw_ostream &OS, const CallInfo<CalleeTy> &P) {
+ return OS << "@" << P.Callee->getName() << "(arg" << P.ParamNo << ", "
+ << P.Offset << ")";
}
/// Describe uses of address (alloca or parameter) inside of the function.
-struct UseInfo {
+template <typename CalleeTy> struct UseInfo {
  // Access range of the address (alloca or parameter).
// It is allowed to be empty-set when there are no known accesses.
ConstantRange Range;
// List of calls which pass address as an argument.
- SmallVector<PassAsArgInfo, 4> Calls;
+ SmallVector<CallInfo<CalleeTy>, 4> Calls;
- explicit UseInfo(unsigned PointerSize) : Range{PointerSize, false} {}
+ UseInfo(unsigned PointerSize) : Range{PointerSize, false} {}
- void updateRange(ConstantRange R) { Range = Range.unionWith(R); }
+ void updateRange(const ConstantRange &R) {
+ assert(!R.isUpperSignWrapped());
+ Range = Range.unionWith(R);
+ assert(!Range.isUpperSignWrapped());
+ }
};
-raw_ostream &operator<<(raw_ostream &OS, const UseInfo &U) {
+template <typename CalleeTy>
+raw_ostream &operator<<(raw_ostream &OS, const UseInfo<CalleeTy> &U) {
OS << U.Range;
for (auto &Call : U.Calls)
OS << ", " << Call;
return OS;
}
-struct AllocaInfo {
- const AllocaInst *AI = nullptr;
- uint64_t Size = 0;
- UseInfo Use;
-
- AllocaInfo(unsigned PointerSize, const AllocaInst *AI, uint64_t Size)
- : AI(AI), Size(Size), Use(PointerSize) {}
-
- StringRef getName() const { return AI->getName(); }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const AllocaInfo &A) {
- return OS << A.getName() << "[" << A.Size << "]: " << A.Use;
+// Check if we should bail out for such ranges.
+bool isUnsafe(const ConstantRange &R) {
+ return R.isEmptySet() || R.isFullSet() || R.isUpperSignWrapped();
}
-struct ParamInfo {
- const Argument *Arg = nullptr;
- UseInfo Use;
-
- explicit ParamInfo(unsigned PointerSize, const Argument *Arg)
- : Arg(Arg), Use(PointerSize) {}
-
- StringRef getName() const { return Arg ? Arg->getName() : "<N/A>"; }
-};
-
-raw_ostream &operator<<(raw_ostream &OS, const ParamInfo &P) {
- return OS << P.getName() << "[]: " << P.Use;
+ConstantRange addOverflowNever(const ConstantRange &L, const ConstantRange &R) {
+ if (L.signedAddMayOverflow(R) !=
+ ConstantRange::OverflowResult::NeverOverflows)
+ return ConstantRange(L.getBitWidth(), true);
+ return L.add(R);
}
-/// Calculate the allocation size of a given alloca. Returns 0 if the
-/// size can not be statically determined.
-uint64_t getStaticAllocaAllocationSize(const AllocaInst *AI) {
- const DataLayout &DL = AI->getModule()->getDataLayout();
- uint64_t Size = DL.getTypeAllocSize(AI->getAllocatedType());
- if (AI->isArrayAllocation()) {
- auto C = dyn_cast<ConstantInt>(AI->getArraySize());
+/// Calculate the allocation size of a given alloca. Returns an empty range
+/// in case of confusion.
+ConstantRange getStaticAllocaSizeRange(const AllocaInst &AI) {
+ const DataLayout &DL = AI.getModule()->getDataLayout();
+ TypeSize TS = DL.getTypeAllocSize(AI.getAllocatedType());
+ unsigned PointerSize = DL.getMaxPointerSizeInBits();
+  // Fall back to an empty range for the alloca size.
+ ConstantRange R = ConstantRange::getEmpty(PointerSize);
+ if (TS.isScalable())
+ return R;
+ APInt APSize(PointerSize, TS.getFixedSize(), true);
+ if (APSize.isNonPositive())
+ return R;
+ if (AI.isArrayAllocation()) {
+ const auto *C = dyn_cast<ConstantInt>(AI.getArraySize());
if (!C)
- return 0;
- Size *= C->getZExtValue();
+ return R;
+ bool Overflow = false;
+ APInt Mul = C->getValue();
+ if (Mul.isNonPositive())
+ return R;
+ Mul = Mul.sextOrTrunc(PointerSize);
+ APSize = APSize.smul_ov(Mul, Overflow);
+ if (Overflow)
+ return R;
}
- return Size;
+ R = ConstantRange(APInt::getNullValue(PointerSize), APSize);
+ assert(!isUnsafe(R));
+ return R;
}
-} // end anonymous namespace
-
-/// Describes uses of allocas and parameters inside of a single function.
-struct StackSafetyInfo::FunctionInfo {
- // May be a Function or a GlobalAlias
- const GlobalValue *GV = nullptr;
- // Informations about allocas uses.
- SmallVector<AllocaInfo, 4> Allocas;
- // Informations about parameters uses.
- SmallVector<ParamInfo, 4> Params;
+template <typename CalleeTy> struct FunctionInfo {
+ std::map<const AllocaInst *, UseInfo<CalleeTy>> Allocas;
+ std::map<uint32_t, UseInfo<CalleeTy>> Params;
// TODO: describe return value as depending on one or more of its arguments.
// StackSafetyDataFlowAnalysis counter stored here for faster access.
int UpdateCount = 0;
- FunctionInfo(const StackSafetyInfo &SSI) : FunctionInfo(*SSI.Info) {}
-
- explicit FunctionInfo(const Function *F) : GV(F){};
- // Creates FunctionInfo that forwards all the parameters to the aliasee.
- explicit FunctionInfo(const GlobalAlias *A);
-
- FunctionInfo(FunctionInfo &&) = default;
-
- bool IsDSOLocal() const { return GV->isDSOLocal(); };
-
- bool IsInterposable() const { return GV->isInterposable(); };
-
- StringRef getName() const { return GV->getName(); }
-
- void print(raw_ostream &O) const {
+ void print(raw_ostream &O, StringRef Name, const Function *F) const {
// TODO: Consider different printout format after
// StackSafetyDataFlowAnalysis. Calls and parameters are irrelevant then.
- O << " @" << getName() << (IsDSOLocal() ? "" : " dso_preemptable")
- << (IsInterposable() ? " interposable" : "") << "\n";
+ O << " @" << Name << ((F && F->isDSOLocal()) ? "" : " dso_preemptable")
+ << ((F && F->isInterposable()) ? " interposable" : "") << "\n";
+
O << " args uses:\n";
- for (auto &P : Params)
- O << " " << P << "\n";
+ for (auto &KV : Params) {
+ O << " ";
+ if (F)
+ O << F->getArg(KV.first)->getName();
+ else
+ O << formatv("arg{0}", KV.first);
+ O << "[]: " << KV.second << "\n";
+ }
+
O << " allocas uses:\n";
- for (auto &AS : Allocas)
- O << " " << AS << "\n";
+ if (F) {
+ for (auto &I : instructions(F)) {
+ if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
+ auto &AS = Allocas.find(AI)->second;
+ O << " " << AI->getName() << "["
+ << getStaticAllocaSizeRange(*AI).getUpper() << "]: " << AS << "\n";
+ }
+ }
+ } else {
+ assert(Allocas.empty());
+ }
}
+};
+
+using GVToSSI = std::map<const GlobalValue *, FunctionInfo<GlobalValue>>;
-private:
- FunctionInfo(const FunctionInfo &) = default;
+} // namespace
+
+struct StackSafetyInfo::InfoTy {
+ FunctionInfo<GlobalValue> Info;
};
-StackSafetyInfo::FunctionInfo::FunctionInfo(const GlobalAlias *A) : GV(A) {
- unsigned PointerSize = A->getParent()->getDataLayout().getPointerSizeInBits();
- const GlobalObject *Aliasee = A->getBaseObject();
- const FunctionType *Type = cast<FunctionType>(Aliasee->getValueType());
- // 'Forward' all parameters to this alias to the aliasee
- for (unsigned ArgNo = 0; ArgNo < Type->getNumParams(); ArgNo++) {
- Params.emplace_back(PointerSize, nullptr);
- UseInfo &US = Params.back().Use;
- US.Calls.emplace_back(Aliasee, ArgNo, ConstantRange(APInt(PointerSize, 0)));
- }
-}
+struct StackSafetyGlobalInfo::InfoTy {
+ GVToSSI Info;
+ SmallPtrSet<const AllocaInst *, 8> SafeAllocas;
+};
namespace {
class StackSafetyLocalAnalysis {
- const Function &F;
+ Function &F;
const DataLayout &DL;
ScalarEvolution &SE;
unsigned PointerSize = 0;
const ConstantRange UnknownRange;
- ConstantRange offsetFromAlloca(Value *Addr, const Value *AllocaPtr);
- ConstantRange getAccessRange(Value *Addr, const Value *AllocaPtr,
- uint64_t AccessSize);
+ ConstantRange offsetFrom(Value *Addr, Value *Base);
+ ConstantRange getAccessRange(Value *Addr, Value *Base,
+ const ConstantRange &SizeRange);
+ ConstantRange getAccessRange(Value *Addr, Value *Base, TypeSize Size);
ConstantRange getMemIntrinsicAccessRange(const MemIntrinsic *MI, const Use &U,
- const Value *AllocaPtr);
+ Value *Base);
- bool analyzeAllUses(const Value *Ptr, UseInfo &AS);
-
- ConstantRange getRange(uint64_t Lower, uint64_t Upper) const {
- return ConstantRange(APInt(PointerSize, Lower), APInt(PointerSize, Upper));
- }
+ bool analyzeAllUses(Value *Ptr, UseInfo<GlobalValue> &AS,
+ const StackLifetime &SL);
public:
- StackSafetyLocalAnalysis(const Function &F, ScalarEvolution &SE)
+ StackSafetyLocalAnalysis(Function &F, ScalarEvolution &SE)
: F(F), DL(F.getParent()->getDataLayout()), SE(SE),
PointerSize(DL.getPointerSizeInBits()),
UnknownRange(PointerSize, true) {}
// Run the transformation on the associated function.
- StackSafetyInfo run();
+ FunctionInfo<GlobalValue> run();
};
-ConstantRange
-StackSafetyLocalAnalysis::offsetFromAlloca(Value *Addr,
- const Value *AllocaPtr) {
- if (!SE.isSCEVable(Addr->getType()))
+ConstantRange StackSafetyLocalAnalysis::offsetFrom(Value *Addr, Value *Base) {
+ if (!SE.isSCEVable(Addr->getType()) || !SE.isSCEVable(Base->getType()))
return UnknownRange;
- AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
- const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
- ConstantRange Offset = SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
- assert(!Offset.isEmptySet());
- return Offset;
+ auto *PtrTy = IntegerType::getInt8PtrTy(SE.getContext());
+ const SCEV *AddrExp = SE.getTruncateOrZeroExtend(SE.getSCEV(Addr), PtrTy);
+ const SCEV *BaseExp = SE.getTruncateOrZeroExtend(SE.getSCEV(Base), PtrTy);
+ const SCEV *Diff = SE.getMinusSCEV(AddrExp, BaseExp);
+
+ ConstantRange Offset = SE.getSignedRange(Diff);
+ if (isUnsafe(Offset))
+ return UnknownRange;
+ return Offset.sextOrTrunc(PointerSize);
}
-ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr,
- const Value *AllocaPtr,
- uint64_t AccessSize) {
- if (!SE.isSCEVable(Addr->getType()))
+ConstantRange
+StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
+ const ConstantRange &SizeRange) {
+ // Zero-size loads and stores do not access memory.
+ if (SizeRange.isEmptySet())
+ return ConstantRange::getEmpty(PointerSize);
+ assert(!isUnsafe(SizeRange));
+
+ ConstantRange Offsets = offsetFrom(Addr, Base);
+ if (isUnsafe(Offsets))
return UnknownRange;
- AllocaOffsetRewriter Rewriter(SE, AllocaPtr);
- const SCEV *Expr = Rewriter.visit(SE.getSCEV(Addr));
+ Offsets = addOverflowNever(Offsets, SizeRange);
+ if (isUnsafe(Offsets))
+ return UnknownRange;
+ return Offsets;
+}
- ConstantRange AccessStartRange =
- SE.getUnsignedRange(Expr).zextOrTrunc(PointerSize);
- ConstantRange SizeRange = getRange(0, AccessSize);
- ConstantRange AccessRange = AccessStartRange.add(SizeRange);
- assert(!AccessRange.isEmptySet());
- return AccessRange;
+ConstantRange StackSafetyLocalAnalysis::getAccessRange(Value *Addr, Value *Base,
+ TypeSize Size) {
+ if (Size.isScalable())
+ return UnknownRange;
+ APInt APSize(PointerSize, Size.getFixedSize(), true);
+ if (APSize.isNegative())
+ return UnknownRange;
+ return getAccessRange(
+ Addr, Base, ConstantRange(APInt::getNullValue(PointerSize), APSize));
}
ConstantRange StackSafetyLocalAnalysis::getMemIntrinsicAccessRange(
- const MemIntrinsic *MI, const Use &U, const Value *AllocaPtr) {
- if (auto MTI = dyn_cast<MemTransferInst>(MI)) {
+ const MemIntrinsic *MI, const Use &U, Value *Base) {
+ if (const auto *MTI = dyn_cast<MemTransferInst>(MI)) {
if (MTI->getRawSource() != U && MTI->getRawDest() != U)
- return getRange(0, 1);
+ return ConstantRange::getEmpty(PointerSize);
} else {
if (MI->getRawDest() != U)
- return getRange(0, 1);
+ return ConstantRange::getEmpty(PointerSize);
}
- const auto *Len = dyn_cast<ConstantInt>(MI->getLength());
- // Non-constant size => unsafe. FIXME: try SCEV getRange.
- if (!Len)
+
+ auto *CalculationTy = IntegerType::getIntNTy(SE.getContext(), PointerSize);
+ if (!SE.isSCEVable(MI->getLength()->getType()))
return UnknownRange;
- ConstantRange AccessRange = getAccessRange(U, AllocaPtr, Len->getZExtValue());
- return AccessRange;
+
+ const SCEV *Expr =
+ SE.getTruncateOrZeroExtend(SE.getSCEV(MI->getLength()), CalculationTy);
+ ConstantRange Sizes = SE.getSignedRange(Expr);
+ if (Sizes.getUpper().isNegative() || isUnsafe(Sizes))
+ return UnknownRange;
+ Sizes = Sizes.sextOrTrunc(PointerSize);
+ ConstantRange SizeRange(APInt::getNullValue(PointerSize),
+ Sizes.getUpper() - 1);
+ return getAccessRange(U, Base, SizeRange);
}
/// The function analyzes all local uses of Ptr (alloca or argument) and
/// calculates local access range and all function calls where it was used.
-bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
+bool StackSafetyLocalAnalysis::analyzeAllUses(Value *Ptr,
+ UseInfo<GlobalValue> &US,
+ const StackLifetime &SL) {
SmallPtrSet<const Value *, 16> Visited;
SmallVector<const Value *, 8> WorkList;
WorkList.push_back(Ptr);
+ const AllocaInst *AI = dyn_cast<AllocaInst>(Ptr);
// A DFS search through all uses of the alloca in bitcasts/PHI/GEPs/etc.
while (!WorkList.empty()) {
const Value *V = WorkList.pop_back_val();
for (const Use &UI : V->uses()) {
- auto I = cast<const Instruction>(UI.getUser());
+ const auto *I = cast<Instruction>(UI.getUser());
+ if (!SL.isReachable(I))
+ continue;
+
assert(V == UI.get());
switch (I->getOpcode()) {
case Instruction::Load: {
+ if (AI && !SL.isAliveAfter(AI, I)) {
+ US.updateRange(UnknownRange);
+ return false;
+ }
US.updateRange(
getAccessRange(UI, Ptr, DL.getTypeStoreSize(I->getType())));
break;
@@ -308,6 +329,10 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
US.updateRange(UnknownRange);
return false;
}
+ if (AI && !SL.isAliveAfter(AI, I)) {
+ US.updateRange(UnknownRange);
+ return false;
+ }
US.updateRange(getAccessRange(
UI, Ptr, DL.getTypeStoreSize(I->getOperand(0)->getType())));
break;
@@ -322,36 +347,44 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
case Instruction::Call:
case Instruction::Invoke: {
- ImmutableCallSite CS(I);
-
if (I->isLifetimeStartOrEnd())
break;
+ if (AI && !SL.isAliveAfter(AI, I)) {
+ US.updateRange(UnknownRange);
+ return false;
+ }
+
if (const MemIntrinsic *MI = dyn_cast<MemIntrinsic>(I)) {
US.updateRange(getMemIntrinsicAccessRange(MI, UI, Ptr));
break;
}
+ const auto &CB = cast<CallBase>(*I);
+ if (!CB.isArgOperand(&UI)) {
+ US.updateRange(UnknownRange);
+ return false;
+ }
+
+ unsigned ArgNo = CB.getArgOperandNo(&UI);
+ if (CB.isByValArgument(ArgNo)) {
+ US.updateRange(getAccessRange(
+ UI, Ptr, DL.getTypeStoreSize(CB.getParamByValType(ArgNo))));
+ break;
+ }
+
// FIXME: consult devirt?
// Do not follow aliases, otherwise we could inadvertently follow
// dso_preemptable aliases or aliases with interposable linkage.
const GlobalValue *Callee =
- dyn_cast<GlobalValue>(CS.getCalledValue()->stripPointerCasts());
+ dyn_cast<GlobalValue>(CB.getCalledOperand()->stripPointerCasts());
if (!Callee) {
US.updateRange(UnknownRange);
return false;
}
assert(isa<Function>(Callee) || isa<GlobalAlias>(Callee));
-
- ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
- for (ImmutableCallSite::arg_iterator A = B; A != E; ++A) {
- if (A->get() == V) {
- ConstantRange Offset = offsetFromAlloca(UI, Ptr);
- US.Calls.emplace_back(Callee, A - B, Offset);
- }
- }
-
+ US.Calls.emplace_back(Callee, ArgNo, offsetFrom(UI, Ptr));
break;
}
@@ -365,51 +398,52 @@ bool StackSafetyLocalAnalysis::analyzeAllUses(const Value *Ptr, UseInfo &US) {
return true;
}
-StackSafetyInfo StackSafetyLocalAnalysis::run() {
- StackSafetyInfo::FunctionInfo Info(&F);
+FunctionInfo<GlobalValue> StackSafetyLocalAnalysis::run() {
+ FunctionInfo<GlobalValue> Info;
assert(!F.isDeclaration() &&
"Can't run StackSafety on a function declaration");
LLVM_DEBUG(dbgs() << "[StackSafety] " << F.getName() << "\n");
- for (auto &I : instructions(F)) {
- if (auto AI = dyn_cast<AllocaInst>(&I)) {
- Info.Allocas.emplace_back(PointerSize, AI,
- getStaticAllocaAllocationSize(AI));
- AllocaInfo &AS = Info.Allocas.back();
- analyzeAllUses(AI, AS.Use);
- }
+ SmallVector<AllocaInst *, 64> Allocas;
+ for (auto &I : instructions(F))
+ if (auto *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ StackLifetime SL(F, Allocas, StackLifetime::LivenessType::Must);
+ SL.run();
+
+ for (auto *AI : Allocas) {
+ auto &UI = Info.Allocas.emplace(AI, PointerSize).first->second;
+ analyzeAllUses(AI, UI, SL);
}
- for (const Argument &A : make_range(F.arg_begin(), F.arg_end())) {
- Info.Params.emplace_back(PointerSize, &A);
- ParamInfo &PS = Info.Params.back();
- analyzeAllUses(&A, PS.Use);
+ for (Argument &A : make_range(F.arg_begin(), F.arg_end())) {
+    // Non-pointer and byval arguments are not going to be used in any global
+    // processing.
+ if (A.getType()->isPointerTy() && !A.hasByValAttr()) {
+ auto &UI = Info.Params.emplace(A.getArgNo(), PointerSize).first->second;
+ analyzeAllUses(&A, UI, SL);
+ }
}
+ LLVM_DEBUG(Info.print(dbgs(), F.getName(), &F));
LLVM_DEBUG(dbgs() << "[StackSafety] done\n");
- LLVM_DEBUG(Info.print(dbgs()));
- return StackSafetyInfo(std::move(Info));
+ return Info;
}
-class StackSafetyDataFlowAnalysis {
- using FunctionMap =
- std::map<const GlobalValue *, StackSafetyInfo::FunctionInfo>;
+template <typename CalleeTy> class StackSafetyDataFlowAnalysis {
+ using FunctionMap = std::map<const CalleeTy *, FunctionInfo<CalleeTy>>;
FunctionMap Functions;
- // Callee-to-Caller multimap.
- DenseMap<const GlobalValue *, SmallVector<const GlobalValue *, 4>> Callers;
- SetVector<const GlobalValue *> WorkList;
-
- unsigned PointerSize = 0;
const ConstantRange UnknownRange;
- ConstantRange getArgumentAccessRange(const GlobalValue *Callee,
- unsigned ParamNo) const;
- bool updateOneUse(UseInfo &US, bool UpdateToFullSet);
- void updateOneNode(const GlobalValue *Callee,
- StackSafetyInfo::FunctionInfo &FS);
- void updateOneNode(const GlobalValue *Callee) {
+ // Callee-to-Caller multimap.
+ DenseMap<const CalleeTy *, SmallVector<const CalleeTy *, 4>> Callers;
+ SetVector<const CalleeTy *> WorkList;
+
+ bool updateOneUse(UseInfo<CalleeTy> &US, bool UpdateToFullSet);
+ void updateOneNode(const CalleeTy *Callee, FunctionInfo<CalleeTy> &FS);
+ void updateOneNode(const CalleeTy *Callee) {
updateOneNode(Callee, Functions.find(Callee)->second);
}
void updateAllNodes() {
@@ -422,51 +456,46 @@ class StackSafetyDataFlowAnalysis {
#endif
public:
- StackSafetyDataFlowAnalysis(
- Module &M, std::function<const StackSafetyInfo &(Function &)> FI);
- StackSafetyGlobalInfo run();
-};
+ StackSafetyDataFlowAnalysis(uint32_t PointerBitWidth, FunctionMap Functions)
+ : Functions(std::move(Functions)),
+ UnknownRange(ConstantRange::getFull(PointerBitWidth)) {}
-StackSafetyDataFlowAnalysis::StackSafetyDataFlowAnalysis(
- Module &M, std::function<const StackSafetyInfo &(Function &)> FI)
- : PointerSize(M.getDataLayout().getPointerSizeInBits()),
- UnknownRange(PointerSize, true) {
- // Without ThinLTO, run the local analysis for every function in the TU and
- // then run the DFA.
- for (auto &F : M.functions())
- if (!F.isDeclaration())
- Functions.emplace(&F, FI(F));
- for (auto &A : M.aliases())
- if (isa<Function>(A.getBaseObject()))
- Functions.emplace(&A, StackSafetyInfo::FunctionInfo(&A));
-}
+ const FunctionMap &run();
-ConstantRange
-StackSafetyDataFlowAnalysis::getArgumentAccessRange(const GlobalValue *Callee,
- unsigned ParamNo) const {
- auto IT = Functions.find(Callee);
+ ConstantRange getArgumentAccessRange(const CalleeTy *Callee, unsigned ParamNo,
+ const ConstantRange &Offsets) const;
+};
+
+template <typename CalleeTy>
+ConstantRange StackSafetyDataFlowAnalysis<CalleeTy>::getArgumentAccessRange(
+ const CalleeTy *Callee, unsigned ParamNo,
+ const ConstantRange &Offsets) const {
+ auto FnIt = Functions.find(Callee);
// Unknown callee (outside of LTO domain or an indirect call).
- if (IT == Functions.end())
+ if (FnIt == Functions.end())
return UnknownRange;
- const StackSafetyInfo::FunctionInfo &FS = IT->second;
- // The definition of this symbol may not be the definition in this linkage
- // unit.
- if (!FS.IsDSOLocal() || FS.IsInterposable())
+ auto &FS = FnIt->second;
+ auto ParamIt = FS.Params.find(ParamNo);
+ if (ParamIt == FS.Params.end())
return UnknownRange;
- if (ParamNo >= FS.Params.size()) // possibly vararg
+ auto &Access = ParamIt->second.Range;
+ if (Access.isEmptySet())
+ return Access;
+ if (Access.isFullSet())
return UnknownRange;
- return FS.Params[ParamNo].Use.Range;
+ return addOverflowNever(Access, Offsets);
}
-bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
- bool UpdateToFullSet) {
+template <typename CalleeTy>
+bool StackSafetyDataFlowAnalysis<CalleeTy>::updateOneUse(UseInfo<CalleeTy> &US,
+ bool UpdateToFullSet) {
bool Changed = false;
for (auto &CS : US.Calls) {
assert(!CS.Offset.isEmptySet() &&
"Param range can't be empty-set, invalid offset range");
- ConstantRange CalleeRange = getArgumentAccessRange(CS.Callee, CS.ParamNo);
- CalleeRange = CalleeRange.add(CS.Offset);
+ ConstantRange CalleeRange =
+ getArgumentAccessRange(CS.Callee, CS.ParamNo, CS.Offset);
if (!US.Range.contains(CalleeRange)) {
Changed = true;
if (UpdateToFullSet)
@@ -478,19 +507,18 @@ bool StackSafetyDataFlowAnalysis::updateOneUse(UseInfo &US,
return Changed;
}
-void StackSafetyDataFlowAnalysis::updateOneNode(
- const GlobalValue *Callee, StackSafetyInfo::FunctionInfo &FS) {
+template <typename CalleeTy>
+void StackSafetyDataFlowAnalysis<CalleeTy>::updateOneNode(
+ const CalleeTy *Callee, FunctionInfo<CalleeTy> &FS) {
bool UpdateToFullSet = FS.UpdateCount > StackSafetyMaxIterations;
bool Changed = false;
- for (auto &AS : FS.Allocas)
- Changed |= updateOneUse(AS.Use, UpdateToFullSet);
- for (auto &PS : FS.Params)
- Changed |= updateOneUse(PS.Use, UpdateToFullSet);
+ for (auto &KV : FS.Params)
+ Changed |= updateOneUse(KV.second, UpdateToFullSet);
if (Changed) {
LLVM_DEBUG(dbgs() << "=== update [" << FS.UpdateCount
- << (UpdateToFullSet ? ", full-set" : "") << "] "
- << FS.getName() << "\n");
+ << (UpdateToFullSet ? ", full-set" : "") << "] " << &FS
+ << "\n");
// Callers of this function may need updating.
for (auto &CallerID : Callers[Callee])
WorkList.insert(CallerID);
@@ -499,19 +527,14 @@ void StackSafetyDataFlowAnalysis::updateOneNode(
}
}
-void StackSafetyDataFlowAnalysis::runDataFlow() {
- Callers.clear();
- WorkList.clear();
-
- SmallVector<const GlobalValue *, 16> Callees;
+template <typename CalleeTy>
+void StackSafetyDataFlowAnalysis<CalleeTy>::runDataFlow() {
+ SmallVector<const CalleeTy *, 16> Callees;
for (auto &F : Functions) {
Callees.clear();
- StackSafetyInfo::FunctionInfo &FS = F.second;
- for (auto &AS : FS.Allocas)
- for (auto &CS : AS.Use.Calls)
- Callees.push_back(CS.Callee);
- for (auto &PS : FS.Params)
- for (auto &CS : PS.Use.Calls)
+ auto &FS = F.second;
+ for (auto &KV : FS.Params)
+ for (auto &CS : KV.second.Calls)
Callees.push_back(CS.Callee);
llvm::sort(Callees);
@@ -524,65 +547,284 @@ void StackSafetyDataFlowAnalysis::runDataFlow() {
updateAllNodes();
while (!WorkList.empty()) {
- const GlobalValue *Callee = WorkList.back();
+ const CalleeTy *Callee = WorkList.back();
WorkList.pop_back();
updateOneNode(Callee);
}
}
#ifndef NDEBUG
-void StackSafetyDataFlowAnalysis::verifyFixedPoint() {
+template <typename CalleeTy>
+void StackSafetyDataFlowAnalysis<CalleeTy>::verifyFixedPoint() {
WorkList.clear();
updateAllNodes();
assert(WorkList.empty());
}
#endif
-StackSafetyGlobalInfo StackSafetyDataFlowAnalysis::run() {
+template <typename CalleeTy>
+const typename StackSafetyDataFlowAnalysis<CalleeTy>::FunctionMap &
+StackSafetyDataFlowAnalysis<CalleeTy>::run() {
runDataFlow();
LLVM_DEBUG(verifyFixedPoint());
+ return Functions;
+}
- StackSafetyGlobalInfo SSI;
- for (auto &F : Functions)
- SSI.emplace(F.first, std::move(F.second));
- return SSI;
+FunctionSummary *resolveCallee(GlobalValueSummary *S) {
+ while (S) {
+ if (!S->isLive() || !S->isDSOLocal())
+ return nullptr;
+ if (FunctionSummary *FS = dyn_cast<FunctionSummary>(S))
+ return FS;
+ AliasSummary *AS = dyn_cast<AliasSummary>(S);
+ if (!AS)
+ return nullptr;
+ S = AS->getBaseObject();
+ if (S == AS)
+ return nullptr;
+ }
+ return nullptr;
}
-void print(const StackSafetyGlobalInfo &SSI, raw_ostream &O, const Module &M) {
- size_t Count = 0;
- for (auto &F : M.functions())
- if (!F.isDeclaration()) {
- SSI.find(&F)->second.print(O);
- O << "\n";
- ++Count;
+const Function *findCalleeInModule(const GlobalValue *GV) {
+ while (GV) {
+ if (GV->isDeclaration() || GV->isInterposable() || !GV->isDSOLocal())
+ return nullptr;
+ if (const Function *F = dyn_cast<Function>(GV))
+ return F;
+ const GlobalAlias *A = dyn_cast<GlobalAlias>(GV);
+ if (!A)
+ return nullptr;
+ GV = A->getBaseObject();
+ if (GV == A)
+ return nullptr;
+ }
+ return nullptr;
+}
+
+GlobalValueSummary *getGlobalValueSummary(const ModuleSummaryIndex *Index,
+ uint64_t ValueGUID) {
+ auto VI = Index->getValueInfo(ValueGUID);
+ if (!VI || VI.getSummaryList().empty())
+ return nullptr;
+ assert(VI.getSummaryList().size() == 1);
+ auto &Summary = VI.getSummaryList()[0];
+ return Summary.get();
+}
+
+const ConstantRange *findParamAccess(const FunctionSummary &FS,
+ uint32_t ParamNo) {
+ assert(FS.isLive());
+ assert(FS.isDSOLocal());
+ for (auto &PS : FS.paramAccesses())
+ if (ParamNo == PS.ParamNo)
+ return &PS.Use;
+ return nullptr;
+}
+
+void resolveAllCalls(UseInfo<GlobalValue> &Use,
+ const ModuleSummaryIndex *Index) {
+ ConstantRange FullSet(Use.Range.getBitWidth(), true);
+ for (auto &C : Use.Calls) {
+ const Function *F = findCalleeInModule(C.Callee);
+ if (F) {
+ C.Callee = F;
+ continue;
}
- for (auto &A : M.aliases()) {
- SSI.find(&A)->second.print(O);
- O << "\n";
- ++Count;
+
+ if (!Index)
+ return Use.updateRange(FullSet);
+ GlobalValueSummary *GVS = getGlobalValueSummary(Index, C.Callee->getGUID());
+
+ FunctionSummary *FS = resolveCallee(GVS);
+ if (!FS)
+ return Use.updateRange(FullSet);
+ const ConstantRange *Found = findParamAccess(*FS, C.ParamNo);
+ if (!Found)
+ return Use.updateRange(FullSet);
+ ConstantRange Access = Found->sextOrTrunc(Use.Range.getBitWidth());
+ Use.updateRange(addOverflowNever(Access, C.Offset));
+ C.Callee = nullptr;
}
- assert(Count == SSI.size() && "Unexpected functions in the result");
+
+ Use.Calls.erase(std::remove_if(Use.Calls.begin(), Use.Calls.end(),
+ [](auto &T) { return !T.Callee; }),
+ Use.Calls.end());
+}
+
+GVToSSI createGlobalStackSafetyInfo(
+ std::map<const GlobalValue *, FunctionInfo<GlobalValue>> Functions,
+ const ModuleSummaryIndex *Index) {
+ GVToSSI SSI;
+ if (Functions.empty())
+ return SSI;
+
+ // FIXME: Simplify printing and remove copying here.
+ auto Copy = Functions;
+
+ for (auto &FnKV : Copy)
+ for (auto &KV : FnKV.second.Params)
+ resolveAllCalls(KV.second, Index);
+
+ uint32_t PointerSize = Copy.begin()
+ ->first->getParent()
+ ->getDataLayout()
+ .getMaxPointerSizeInBits();
+ StackSafetyDataFlowAnalysis<GlobalValue> SSDFA(PointerSize, std::move(Copy));
+
+ for (auto &F : SSDFA.run()) {
+ auto FI = F.second;
+ auto &SrcF = Functions[F.first];
+ for (auto &KV : FI.Allocas) {
+ auto &A = KV.second;
+ resolveAllCalls(A, Index);
+ for (auto &C : A.Calls) {
+ A.updateRange(
+ SSDFA.getArgumentAccessRange(C.Callee, C.ParamNo, C.Offset));
+ }
+ // FIXME: This is needed only to preserve calls in print() results.
+ A.Calls = SrcF.Allocas.find(KV.first)->second.Calls;
+ }
+ for (auto &KV : FI.Params) {
+ auto &P = KV.second;
+ P.Calls = SrcF.Params.find(KV.first)->second.Calls;
+ }
+ SSI[F.first] = std::move(FI);
+ }
+
+ return SSI;
}
} // end anonymous namespace
StackSafetyInfo::StackSafetyInfo() = default;
+
+StackSafetyInfo::StackSafetyInfo(Function *F,
+ std::function<ScalarEvolution &()> GetSE)
+ : F(F), GetSE(GetSE) {}
+
StackSafetyInfo::StackSafetyInfo(StackSafetyInfo &&) = default;
-StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default;
-StackSafetyInfo::StackSafetyInfo(FunctionInfo &&Info)
- : Info(new FunctionInfo(std::move(Info))) {}
+StackSafetyInfo &StackSafetyInfo::operator=(StackSafetyInfo &&) = default;
StackSafetyInfo::~StackSafetyInfo() = default;
-void StackSafetyInfo::print(raw_ostream &O) const { Info->print(O); }
+const StackSafetyInfo::InfoTy &StackSafetyInfo::getInfo() const {
+ if (!Info) {
+ StackSafetyLocalAnalysis SSLA(*F, GetSE());
+ Info.reset(new InfoTy{SSLA.run()});
+ }
+ return *Info;
+}
+
+void StackSafetyInfo::print(raw_ostream &O) const {
+ getInfo().Info.print(O, F->getName(), dyn_cast<Function>(F));
+}
+
+const StackSafetyGlobalInfo::InfoTy &StackSafetyGlobalInfo::getInfo() const {
+ if (!Info) {
+ std::map<const GlobalValue *, FunctionInfo<GlobalValue>> Functions;
+ for (auto &F : M->functions()) {
+ if (!F.isDeclaration()) {
+ auto FI = GetSSI(F).getInfo().Info;
+ Functions.emplace(&F, std::move(FI));
+ }
+ }
+ Info.reset(new InfoTy{
+ createGlobalStackSafetyInfo(std::move(Functions), Index), {}});
+ for (auto &FnKV : Info->Info) {
+ for (auto &KV : FnKV.second.Allocas) {
+ ++NumAllocaTotal;
+ const AllocaInst *AI = KV.first;
+ if (getStaticAllocaSizeRange(*AI).contains(KV.second.Range)) {
+ Info->SafeAllocas.insert(AI);
+ ++NumAllocaStackSafe;
+ }
+ }
+ }
+ if (StackSafetyPrint)
+ print(errs());
+ }
+ return *Info;
+}
+
+std::vector<FunctionSummary::ParamAccess>
+StackSafetyInfo::getParamAccesses() const {
+ // Implementation transforms internal representation of parameter information
+ // into FunctionSummary format.
+ std::vector<FunctionSummary::ParamAccess> ParamAccesses;
+ for (const auto &KV : getInfo().Info.Params) {
+ auto &PS = KV.second;
+    // A parameter accessed at an arbitrary or unknown offset is represented as
+    // FullSet by StackSafety and treated as a parameter for which we have no
+    // StackSafety info at all, so drop it to reduce the summary size.
+ if (PS.Range.isFullSet())
+ continue;
+
+ ParamAccesses.emplace_back(KV.first, PS.Range);
+ FunctionSummary::ParamAccess &Param = ParamAccesses.back();
+
+ Param.Calls.reserve(PS.Calls.size());
+ for (auto &C : PS.Calls) {
+      // A parameter forwarded into another function at an arbitrary or unknown
+      // offset will make ParamAccess::Range FullSet anyway, so we can drop the
+      // entire parameter as we did above.
+ // TODO(vitalybuka): Return already filtered parameters from getInfo().
+ if (C.Offset.isFullSet()) {
+ ParamAccesses.pop_back();
+ break;
+ }
+ Param.Calls.emplace_back(C.ParamNo, C.Callee->getGUID(), C.Offset);
+ }
+ }
+ return ParamAccesses;
+}
+
+StackSafetyGlobalInfo::StackSafetyGlobalInfo() = default;
+
+StackSafetyGlobalInfo::StackSafetyGlobalInfo(
+ Module *M, std::function<const StackSafetyInfo &(Function &F)> GetSSI,
+ const ModuleSummaryIndex *Index)
+ : M(M), GetSSI(GetSSI), Index(Index) {
+ if (StackSafetyRun)
+ getInfo();
+}
+
+StackSafetyGlobalInfo::StackSafetyGlobalInfo(StackSafetyGlobalInfo &&) =
+ default;
+
+StackSafetyGlobalInfo &
+StackSafetyGlobalInfo::operator=(StackSafetyGlobalInfo &&) = default;
+
+StackSafetyGlobalInfo::~StackSafetyGlobalInfo() = default;
+
+bool StackSafetyGlobalInfo::isSafe(const AllocaInst &AI) const {
+ const auto &Info = getInfo();
+ return Info.SafeAllocas.count(&AI);
+}
+
+void StackSafetyGlobalInfo::print(raw_ostream &O) const {
+ auto &SSI = getInfo().Info;
+ if (SSI.empty())
+ return;
+ const Module &M = *SSI.begin()->first->getParent();
+ for (auto &F : M.functions()) {
+ if (!F.isDeclaration()) {
+ SSI.find(&F)->second.print(O, F.getName(), &F);
+ O << "\n";
+ }
+ }
+}
+
+LLVM_DUMP_METHOD void StackSafetyGlobalInfo::dump() const { print(dbgs()); }
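For reference, a minimal sketch of consuming the new StackSafetyGlobalInfo::isSafe() query, e.g. from an instrumentation pass deciding which allocas need no runtime checks (not part of the diff; the helper name is hypothetical):

// Sketch: count allocas whose every known access stays within bounds.
static unsigned countSafeAllocas(const Module &M,
                                 const StackSafetyGlobalInfo &SSGI) {
  unsigned Safe = 0;
  for (const Function &F : M)
    for (const Instruction &I : instructions(F))
      if (const auto *AI = dyn_cast<AllocaInst>(&I))
        if (SSGI.isSafe(*AI))
          ++Safe;
  return Safe;
}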
AnalysisKey StackSafetyAnalysis::Key;
StackSafetyInfo StackSafetyAnalysis::run(Function &F,
FunctionAnalysisManager &AM) {
- StackSafetyLocalAnalysis SSLA(F, AM.getResult<ScalarEvolutionAnalysis>(F));
- return SSLA.run();
+ return StackSafetyInfo(&F, [&AM, &F]() -> ScalarEvolution & {
+ return AM.getResult<ScalarEvolutionAnalysis>(F);
+ });
}
PreservedAnalyses StackSafetyPrinterPass::run(Function &F,
@@ -599,7 +841,7 @@ StackSafetyInfoWrapperPass::StackSafetyInfoWrapperPass() : FunctionPass(ID) {
}
void StackSafetyInfoWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const {
- AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addRequiredTransitive<ScalarEvolutionWrapperPass>();
AU.setPreservesAll();
}
@@ -608,9 +850,8 @@ void StackSafetyInfoWrapperPass::print(raw_ostream &O, const Module *M) const {
}
bool StackSafetyInfoWrapperPass::runOnFunction(Function &F) {
- StackSafetyLocalAnalysis SSLA(
- F, getAnalysis<ScalarEvolutionWrapperPass>().getSE());
- SSI = StackSafetyInfo(SSLA.run());
+ auto *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ SSI = {&F, [SE]() -> ScalarEvolution & { return *SE; }};
return false;
}
@@ -618,20 +859,20 @@ AnalysisKey StackSafetyGlobalAnalysis::Key;
StackSafetyGlobalInfo
StackSafetyGlobalAnalysis::run(Module &M, ModuleAnalysisManager &AM) {
+ // FIXME: Lookup Module Summary.
FunctionAnalysisManager &FAM =
AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
-
- StackSafetyDataFlowAnalysis SSDFA(
- M, [&FAM](Function &F) -> const StackSafetyInfo & {
- return FAM.getResult<StackSafetyAnalysis>(F);
- });
- return SSDFA.run();
+ return {&M,
+ [&FAM](Function &F) -> const StackSafetyInfo & {
+ return FAM.getResult<StackSafetyAnalysis>(F);
+ },
+ nullptr};
}
PreservedAnalyses StackSafetyGlobalPrinterPass::run(Module &M,
ModuleAnalysisManager &AM) {
OS << "'Stack Safety Analysis' for module '" << M.getName() << "'\n";
- print(AM.getResult<StackSafetyGlobalAnalysis>(M), OS, M);
+ AM.getResult<StackSafetyGlobalAnalysis>(M).print(OS);
return PreservedAnalyses::all();
}
@@ -643,25 +884,96 @@ StackSafetyGlobalInfoWrapperPass::StackSafetyGlobalInfoWrapperPass()
*PassRegistry::getPassRegistry());
}
+StackSafetyGlobalInfoWrapperPass::~StackSafetyGlobalInfoWrapperPass() = default;
+
void StackSafetyGlobalInfoWrapperPass::print(raw_ostream &O,
const Module *M) const {
- ::print(SSI, O, *M);
+ SSGI.print(O);
}
void StackSafetyGlobalInfoWrapperPass::getAnalysisUsage(
AnalysisUsage &AU) const {
+ AU.setPreservesAll();
AU.addRequired<StackSafetyInfoWrapperPass>();
}
bool StackSafetyGlobalInfoWrapperPass::runOnModule(Module &M) {
- StackSafetyDataFlowAnalysis SSDFA(
- M, [this](Function &F) -> const StackSafetyInfo & {
- return getAnalysis<StackSafetyInfoWrapperPass>(F).getResult();
- });
- SSI = SSDFA.run();
+ const ModuleSummaryIndex *ImportSummary = nullptr;
+ if (auto *IndexWrapperPass =
+ getAnalysisIfAvailable<ImmutableModuleSummaryIndexWrapperPass>())
+ ImportSummary = IndexWrapperPass->getIndex();
+
+ SSGI = {&M,
+ [this](Function &F) -> const StackSafetyInfo & {
+ return getAnalysis<StackSafetyInfoWrapperPass>(F).getResult();
+ },
+ ImportSummary};
return false;
}
+bool llvm::needsParamAccessSummary(const Module &M) {
+ if (StackSafetyRun)
+ return true;
+ for (auto &F : M.functions())
+ if (F.hasFnAttribute(Attribute::SanitizeMemTag))
+ return true;
+ return false;
+}
+
+void llvm::generateParamAccessSummary(ModuleSummaryIndex &Index) {
+ const ConstantRange FullSet(FunctionSummary::ParamAccess::RangeWidth, true);
+ std::map<const FunctionSummary *, FunctionInfo<FunctionSummary>> Functions;
+
+ // Convert the ModuleSummaryIndex to a FunctionMap
+ for (auto &GVS : Index) {
+ for (auto &GV : GVS.second.SummaryList) {
+ FunctionSummary *FS = dyn_cast<FunctionSummary>(GV.get());
+ if (!FS)
+ continue;
+ if (FS->isLive() && FS->isDSOLocal()) {
+ FunctionInfo<FunctionSummary> FI;
+ for (auto &PS : FS->paramAccesses()) {
+ auto &US =
+ FI.Params
+ .emplace(PS.ParamNo, FunctionSummary::ParamAccess::RangeWidth)
+ .first->second;
+ US.Range = PS.Use;
+ for (auto &Call : PS.Calls) {
+ assert(!Call.Offsets.isFullSet());
+ FunctionSummary *S = resolveCallee(
+ Index.findSummaryInModule(Call.Callee, FS->modulePath()));
+ if (!S) {
+ US.Range = FullSet;
+ US.Calls.clear();
+ break;
+ }
+ US.Calls.emplace_back(S, Call.ParamNo, Call.Offsets);
+ }
+ }
+ Functions.emplace(FS, std::move(FI));
+ }
+ // Reset the data for all summaries. For live and DSO-local functions it
+ // will be set back from the data flow results below. Anything else will not
+ // be accessed by the ThinLTO backend, so we can save on bitcode size.
+ FS->setParamAccesses({});
+ }
+ }
+ StackSafetyDataFlowAnalysis<FunctionSummary> SSDFA(
+ FunctionSummary::ParamAccess::RangeWidth, std::move(Functions));
+ for (auto &KV : SSDFA.run()) {
+ std::vector<FunctionSummary::ParamAccess> NewParams;
+ NewParams.reserve(KV.second.Params.size());
+ for (auto &Param : KV.second.Params) {
+ NewParams.emplace_back();
+ FunctionSummary::ParamAccess &New = NewParams.back();
+ New.ParamNo = Param.first;
+ New.Use = Param.second.Range; // Only range is needed.
+ }
+ const_cast<FunctionSummary *>(KV.first)->setParamAccesses(
+ std::move(NewParams));
+ }
+}
+
static const char LocalPassArg[] = "stack-safety-local";
static const char LocalPassName[] = "Stack Safety Local Analysis";
INITIALIZE_PASS_BEGIN(StackSafetyInfoWrapperPass, LocalPassArg, LocalPassName,
@@ -672,7 +984,8 @@ INITIALIZE_PASS_END(StackSafetyInfoWrapperPass, LocalPassArg, LocalPassName,
static const char GlobalPassName[] = "Stack Safety Analysis";
INITIALIZE_PASS_BEGIN(StackSafetyGlobalInfoWrapperPass, DEBUG_TYPE,
- GlobalPassName, false, false)
+ GlobalPassName, false, true)
INITIALIZE_PASS_DEPENDENCY(StackSafetyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ImmutableModuleSummaryIndexWrapperPass)
INITIALIZE_PASS_END(StackSafetyGlobalInfoWrapperPass, DEBUG_TYPE,
- GlobalPassName, false, false)
+ GlobalPassName, false, true)
diff --git a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp
index 8447dc87069d..ccf520dcea66 100644
--- a/llvm/lib/Analysis/SyncDependenceAnalysis.cpp
+++ b/llvm/lib/Analysis/SyncDependenceAnalysis.cpp
@@ -244,12 +244,12 @@ struct DivergencePropagator {
);
auto ItBeginRPO = FuncRPOT.begin();
+ auto ItEndRPO = FuncRPOT.end();
// skip until term (TODO RPOT won't let us start at @term directly)
- for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {}
-
- auto ItEndRPO = FuncRPOT.end();
- assert(ItBeginRPO != ItEndRPO);
+ for (; *ItBeginRPO != &RootBlock; ++ItBeginRPO) {
+ assert(ItBeginRPO != ItEndRPO && "Unable to find RootBlock");
+ }
// propagate definitions at the immediate successors of the node in RPO
auto ItBlockRPO = ItBeginRPO;
@@ -369,7 +369,7 @@ SyncDependenceAnalysis::join_blocks(const Instruction &Term) {
// compute all join points
DivergencePropagator Propagator{FuncRPOT, DT, PDT, LI};
const auto &TermBlock = *Term.getParent();
- auto JoinBlocks = Propagator.computeJoinPoints<succ_const_range>(
+ auto JoinBlocks = Propagator.computeJoinPoints<const_succ_range>(
TermBlock, successors(Term.getParent()), LI.getLoopFor(&TermBlock));
auto ItInserted = CachedBranchJoins.emplace(&Term, std::move(JoinBlocks));
diff --git a/llvm/lib/Analysis/SyntheticCountsUtils.cpp b/llvm/lib/Analysis/SyntheticCountsUtils.cpp
index 22766e5f07f5..a3edce76cd88 100644
--- a/llvm/lib/Analysis/SyntheticCountsUtils.cpp
+++ b/llvm/lib/Analysis/SyntheticCountsUtils.cpp
@@ -14,7 +14,6 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SCCIterator.h"
#include "llvm/Analysis/CallGraph.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
diff --git a/llvm/lib/Analysis/TFUtils.cpp b/llvm/lib/Analysis/TFUtils.cpp
new file mode 100644
index 000000000000..19e6d626e238
--- /dev/null
+++ b/llvm/lib/Analysis/TFUtils.cpp
@@ -0,0 +1,289 @@
+//===- TFUtils.cpp - TensorFlow evaluation utilities ----------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements utilities for interfacing with TensorFlow C APIs.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/Utils/TFUtils.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/ManagedStatic.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "tensorflow/c/c_api.h"
+#include "tensorflow/c/c_api_experimental.h"
+
+#include <cassert>
+
+using namespace llvm;
+
+namespace {
+
+using TFGraphPtr = std::unique_ptr<TF_Graph, decltype(&TF_DeleteGraph)>;
+using TFSessionOptionsPtr =
+ std::unique_ptr<TF_SessionOptions, decltype(&TF_DeleteSessionOptions)>;
+using TFStatusPtr = std::unique_ptr<TF_Status, decltype(&TF_DeleteStatus)>;
+
+struct TFInitializer {
+ TFInitializer() {
+ assert(!IsInitialized && "TFInitializer should be initialized only once");
+ int Argc = 1;
+ const char *Name = "";
+ const char **NamePtr = &Name;
+ TF_InitMain(Name, &Argc, const_cast<char ***>(&NamePtr));
+ IsInitialized = true;
+ }
+ bool IsInitialized = false;
+};
+
+llvm::ManagedStatic<TFInitializer> TFLibInitializer;
+
+bool ensureInitTF() { return TFLibInitializer->IsInitialized; }
+
+TFGraphPtr createTFGraph() {
+ return TFGraphPtr(TF_NewGraph(), &TF_DeleteGraph);
+}
+
+TFStatusPtr createTFStatus() {
+ return TFStatusPtr(TF_NewStatus(), &TF_DeleteStatus);
+}
+
+TFSessionOptionsPtr createTFSessionOptions() {
+ return TFSessionOptionsPtr(TF_NewSessionOptions(), &TF_DeleteSessionOptions);
+}
+} // namespace
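
Aside: all three aliases above rely on the unique_ptr-with-function-deleter idiom so that the TF_* handles are released automatically. A self-contained sketch of the same idiom, with a made-up CHandle standing in for the TensorFlow objects:

#include <memory>

// Stand-in for an opaque C handle and its create/destroy pair.
struct CHandle { int State = 0; };
static CHandle *createHandle() { return new CHandle(); }
static void destroyHandle(CHandle *H) { delete H; }

// As with TFGraphPtr above, the deleter type is part of the unique_ptr type,
// so every factory passes the deleter explicitly.
using CHandlePtr = std::unique_ptr<CHandle, decltype(&destroyHandle)>;

static CHandlePtr makeHandle() {
  return CHandlePtr(createHandle(), &destroyHandle);
}

int main() {
  CHandlePtr H = makeHandle(); // destroyHandle runs automatically at scope exit
  H->State = 1;
  return H->State == 1 ? 0 : 1;
}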
+
+namespace llvm {
+class EvaluationResultImpl {
+public:
+ EvaluationResultImpl(size_t OutputSize)
+ : OutputSize(OutputSize), Output(OutputSize) {}
+
+ ~EvaluationResultImpl() {
+ for (auto *P : Output)
+ if (P)
+ TF_DeleteTensor(P);
+ }
+
+ EvaluationResultImpl(const EvaluationResultImpl &) = delete;
+ EvaluationResultImpl(EvaluationResultImpl &&Other) = delete;
+ std::vector<TF_Tensor *> &getOutput() { return Output; }
+
+private:
+ const size_t OutputSize;
+ std::vector<TF_Tensor *> Output;
+};
+
+class TFModelEvaluatorImpl {
+public:
+ TFModelEvaluatorImpl(StringRef SavedModelPath,
+ const std::vector<std::string> &InputNames,
+ const std::vector<std::string> &OutputNames,
+ const char *Tags);
+
+ bool isValid() const { return IsValid; }
+ size_t OutputSize() const { return OutputFeed.size(); }
+
+ void evaluate(TF_Tensor **Output, TF_Status *Status) {
+ TF_SessionRun(Session, nullptr, InputFeed.data(), Input.data(),
+ Input.size(), OutputFeed.data(), Output, OutputFeed.size(),
+ nullptr, 0, nullptr, Status);
+ }
+
+ void initInput(size_t Index, TF_DataType Type,
+ const std::vector<int64_t> &Dimensions);
+ const std::vector<TF_Tensor *> &getInput() const { return Input; }
+
+ ~TFModelEvaluatorImpl();
+
+private:
+ /// The objects necessary for carrying out an evaluation of the SavedModel.
+ /// They are expensive to set up, and we maintain them across all the
+ /// evaluations of the model.
+ TF_Session *Session = nullptr;
+ TFGraphPtr Graph;
+ TFSessionOptionsPtr Options;
+
+ /// The specification of the input nodes.
+ std::vector<TF_Output> InputFeed;
+
+ /// The input tensors. They must match by index the corresponding InputFeed
+ /// value. We set up the tensors once and just mutate their scalars before
+ /// each evaluation. The input tensors keep their values after an evaluation.
+ std::vector<TF_Tensor *> Input;
+
+ /// The specification of the output nodes. When evaluating, the tensors in the
+ /// output tensor vector must match by index the corresponding element in the
+ /// OutputFeed.
+ std::vector<TF_Output> OutputFeed;
+
+ void invalidate() { IsValid = false; }
+
+ bool IsValid = true;
+
+ /// Reusable utility for ensuring we can bind the requested Name to a node in
+ /// the SavedModel Graph.
+ bool checkReportAndInvalidate(const TF_Output &Output, StringRef Name);
+};
+} // namespace llvm
+
+TFModelEvaluatorImpl::TFModelEvaluatorImpl(
+ StringRef SavedModelPath, const std::vector<std::string> &InputNames,
+ const std::vector<std::string> &OutputNames, const char *Tags)
+ : Graph(createTFGraph()), Options(createTFSessionOptions()),
+ InputFeed(InputNames.size()), Input(InputNames.size()),
+ OutputFeed(OutputNames.size()) {
+ if (!ensureInitTF()) {
+ errs() << "Tensorflow should have been initialized";
+ return;
+ }
+ auto Status = createTFStatus();
+
+ Session = TF_LoadSessionFromSavedModel(Options.get(), nullptr,
+ SavedModelPath.str().c_str(), &Tags, 1,
+ Graph.get(), nullptr, Status.get());
+ if (TF_GetCode(Status.get()) != TF_Code::TF_OK) {
+ errs() << TF_Message(Status.get());
+ invalidate();
+ }
+ for (size_t I = 0; I < InputNames.size(); ++I) {
+ InputFeed[I] = {
+ TF_GraphOperationByName(Graph.get(), (InputNames[I]).c_str()), 0};
+ if (!checkReportAndInvalidate(InputFeed[I], InputNames[I]))
+ return;
+ }
+ for (size_t I = 0; I < OutputNames.size(); ++I) {
+ OutputFeed[I] = {
+ TF_GraphOperationByName(Graph.get(), (OutputNames[I]).c_str()), 0};
+ if (!checkReportAndInvalidate(OutputFeed[I], OutputNames[I]))
+ return;
+ }
+}
+
+TFModelEvaluator::TFModelEvaluator(StringRef SavedModelPath,
+ const std::vector<std::string> &InputNames,
+ const std::vector<std::string> &OutputNames,
+ const char *Tags)
+ : Impl(new TFModelEvaluatorImpl(SavedModelPath, InputNames, OutputNames,
+ Tags)) {
+ if (!Impl->isValid())
+ Impl.reset();
+}
+
+TFModelEvaluatorImpl::~TFModelEvaluatorImpl() {
+ for (auto *T : Input) {
+ TF_DeleteTensor(T);
+ }
+ if (Session == nullptr)
+ return;
+ auto Status = createTFStatus();
+ TF_DeleteSession(Session, Status.get());
+ Session = nullptr;
+ if (TF_GetCode(Status.get()) != TF_Code::TF_OK)
+ errs() << "Could not delete TF session";
+}
+
+bool TFModelEvaluatorImpl::checkReportAndInvalidate(const TF_Output &Output,
+ StringRef Name) {
+ if (Output.oper)
+ return true;
+ errs() << "Could not find TF_Output named: " + Name;
+ IsValid = false;
+ return IsValid;
+}
+
+Optional<TFModelEvaluator::EvaluationResult> TFModelEvaluator::evaluate() {
+ if (!isValid())
+ return None;
+ std::unique_ptr<EvaluationResultImpl> Ret =
+ std::make_unique<EvaluationResultImpl>(Impl->OutputSize());
+ auto Status = createTFStatus();
+ Impl->evaluate(Ret->getOutput().data(), Status.get());
+ if (TF_GetCode(Status.get()) != TF_Code::TF_OK) {
+ errs() << TF_Message(Status.get());
+ Impl.reset();
+ return None;
+ }
+ return EvaluationResult(std::move(Ret));
+}
+
+void TFModelEvaluatorImpl::initInput(size_t Index, TF_DataType Type,
+ const std::vector<int64_t> &Dimensions) {
+ int64_t TotalSize = TF_DataTypeSize(Type);
+ for (auto &D : Dimensions)
+ TotalSize *= D;
+
+ Input[Index] =
+ TF_AllocateTensor(Type, Dimensions.data(), Dimensions.size(), TotalSize);
+ std::memset(TF_TensorData(Input[Index]), 0, TotalSize);
+}
+
+void *TFModelEvaluator::getUntypedInput(size_t Index) {
+ return TF_TensorData(Impl->getInput()[Index]);
+}
+
+TFModelEvaluator::EvaluationResult::EvaluationResult(
+ std::unique_ptr<EvaluationResultImpl> Impl)
+ : Impl(std::move(Impl)) {}
+
+TFModelEvaluator::EvaluationResult::EvaluationResult(EvaluationResult &&Other)
+ : Impl(std::move(Other.Impl)) {}
+
+void *TFModelEvaluator::EvaluationResult::getUntypedTensorValue(size_t Index) {
+ return TF_TensorData(Impl->getOutput()[Index]);
+}
+
+void TFModelEvaluator::initInput(size_t Index, int TypeIndex,
+ const std::vector<int64_t> &Dimensions) {
+ Impl->initInput(Index, static_cast<TF_DataType>(TypeIndex), Dimensions);
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<float>() {
+ return TF_FLOAT;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<double>() {
+ return TF_DOUBLE;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<int8_t>() {
+ return TF_INT8;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<uint8_t>() {
+ return TF_UINT8;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<int16_t>() {
+ return TF_INT16;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<uint16_t>() {
+ return TF_UINT16;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<int32_t>() {
+ return TF_INT32;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<uint32_t>() {
+ return TF_UINT32;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<int64_t>() {
+ return TF_INT64;
+}
+
+template <> int TFModelEvaluator::getModelTypeIndex<uint64_t>() {
+ return TF_UINT64;
+}
+
+TFModelEvaluator::EvaluationResult::~EvaluationResult() {}
+TFModelEvaluator::~TFModelEvaluator() {}
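
A hedged end-to-end sketch of how this API is meant to be driven; the saved-model path, node names, and "serve" tag are placeholders, and it assumes the getModelTypeIndex<> specializations are visible through the header:

#include "llvm/Analysis/Utils/TFUtils.h"
#include <cstring>
#include <string>
#include <vector>
using namespace llvm;

// Hypothetical caller feeding one float feature through a saved model.
bool runModelOnce(float Feature) {
  std::vector<std::string> InputNames{"input"};   // placeholder node name
  std::vector<std::string> OutputNames{"output"}; // placeholder node name
  TFModelEvaluator Evaluator("/path/to/saved_model", InputNames, OutputNames,
                             "serve");
  if (!Evaluator.isValid())
    return false;
  // One scalar float input of shape {1}; initInput zero-fills the tensor.
  Evaluator.initInput(0, Evaluator.getModelTypeIndex<float>(), {1});
  std::memcpy(Evaluator.getUntypedInput(0), &Feature, sizeof(Feature));
  auto Result = Evaluator.evaluate();
  if (!Result) // evaluation failed; the evaluator invalidated itself
    return false;
  float Out;
  std::memcpy(&Out, Result->getUntypedTensorValue(0), sizeof(Out));
  return Out > 0.0f;
}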
diff --git a/llvm/lib/Analysis/TargetLibraryInfo.cpp b/llvm/lib/Analysis/TargetLibraryInfo.cpp
index c7238db43aab..60cfb04634c4 100644
--- a/llvm/lib/Analysis/TargetLibraryInfo.cpp
+++ b/llvm/lib/Analysis/TargetLibraryInfo.cpp
@@ -69,11 +69,10 @@ static bool hasBcmp(const Triple &TT) {
static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
ArrayRef<StringLiteral> StandardNames) {
// Verify that the StandardNames array is in alphabetical order.
- assert(std::is_sorted(StandardNames.begin(), StandardNames.end(),
- [](StringRef LHS, StringRef RHS) {
- return LHS < RHS;
- }) &&
- "TargetLibraryInfoImpl function names must be sorted");
+ assert(
+ llvm::is_sorted(StandardNames,
+ [](StringRef LHS, StringRef RHS) { return LHS < RHS; }) &&
+ "TargetLibraryInfoImpl function names must be sorted");
// Set IO unlocked variants as unavailable
// Set them as available per system below
@@ -105,14 +104,12 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setShouldExtI32Return(ShouldExtI32Return);
TLI.setShouldSignExtI32Param(ShouldSignExtI32Param);
- if (T.getArch() == Triple::r600 ||
- T.getArch() == Triple::amdgcn)
+ if (T.isAMDGPU())
TLI.disableAllFunctions();
// There are no library implementations of memcpy and memset for AMD gpus and
// these can be difficult to lower in the backend.
- if (T.getArch() == Triple::r600 ||
- T.getArch() == Triple::amdgcn) {
+ if (T.isAMDGPU()) {
TLI.setUnavailable(LibFunc_memcpy);
TLI.setUnavailable(LibFunc_memset);
TLI.setUnavailable(LibFunc_memset_pattern16);
@@ -210,6 +207,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_logf);
TLI.setUnavailable(LibFunc_modff);
TLI.setUnavailable(LibFunc_powf);
+ TLI.setUnavailable(LibFunc_remainderf);
TLI.setUnavailable(LibFunc_sinf);
TLI.setUnavailable(LibFunc_sinhf);
TLI.setUnavailable(LibFunc_sqrtf);
@@ -239,6 +237,7 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_logl);
TLI.setUnavailable(LibFunc_modfl);
TLI.setUnavailable(LibFunc_powl);
+ TLI.setUnavailable(LibFunc_remainderl);
TLI.setUnavailable(LibFunc_sinl);
TLI.setUnavailable(LibFunc_sinhl);
TLI.setUnavailable(LibFunc_sqrtl);
@@ -470,6 +469,9 @@ static void initialize(TargetLibraryInfoImpl &TLI, const Triple &T,
TLI.setUnavailable(LibFunc_tmpfile64);
// Relaxed math functions are included in math-finite.h on Linux (GLIBC).
+ // Note that math-finite.h is no longer supported by top-of-tree GLIBC,
+ // so we keep these functions around just so that they're recognized by
+ // the ConstantFolder.
TLI.setUnavailable(LibFunc_acos_finite);
TLI.setUnavailable(LibFunc_acosf_finite);
TLI.setUnavailable(LibFunc_acosl_finite);
@@ -659,6 +661,11 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getParamType(1)->isPointerTy() &&
FTy.getParamType(2)->isPointerTy() &&
FTy.getReturnType()->isIntegerTy(32));
+ case LibFunc_strlen_chk:
+ --NumParams;
+ if (!IsSizeTTy(FTy.getParamType(NumParams)))
+ return false;
+ LLVM_FALLTHROUGH;
case LibFunc_strlen:
return (NumParams == 1 && FTy.getParamType(0)->isPointerTy() &&
FTy.getReturnType()->isIntegerTy());
@@ -893,6 +900,8 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
FTy.getParamType(1)->isPointerTy());
case LibFunc_write:
return (NumParams == 3 && FTy.getParamType(1)->isPointerTy());
+ case LibFunc_aligned_alloc:
+ return (NumParams == 2 && FTy.getReturnType()->isPointerTy());
case LibFunc_bcopy:
case LibFunc_bcmp:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy() &&
@@ -1209,6 +1218,14 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_ZdlPvSt11align_val_tRKSt9nothrow_t:
// void operator delete[](void*, align_val_t, nothrow)
case LibFunc_ZdaPvSt11align_val_tRKSt9nothrow_t:
+ // void operator delete(void*, unsigned int, align_val_t)
+ case LibFunc_ZdlPvjSt11align_val_t:
+ // void operator delete(void*, unsigned long, align_val_t)
+ case LibFunc_ZdlPvmSt11align_val_t:
+ // void operator delete[](void*, unsigned int, align_val_t);
+ case LibFunc_ZdaPvjSt11align_val_t:
+ // void operator delete[](void*, unsigned long, align_val_t);
+ case LibFunc_ZdaPvmSt11align_val_t:
return (NumParams == 3 && FTy.getParamType(0)->isPointerTy());
case LibFunc_memset_pattern16:
@@ -1332,6 +1349,9 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_round:
case LibFunc_roundf:
case LibFunc_roundl:
+ case LibFunc_roundeven:
+ case LibFunc_roundevenf:
+ case LibFunc_roundevenl:
case LibFunc_sin:
case LibFunc_sinf:
case LibFunc_sinh:
@@ -1374,6 +1394,9 @@ bool TargetLibraryInfoImpl::isValidProtoForLibFunc(const FunctionType &FTy,
case LibFunc_fmod:
case LibFunc_fmodf:
case LibFunc_fmodl:
+ case LibFunc_remainder:
+ case LibFunc_remainderf:
+ case LibFunc_remainderl:
case LibFunc_copysign:
case LibFunc_copysignf:
case LibFunc_copysignl:
@@ -1478,9 +1501,9 @@ bool TargetLibraryInfoImpl::getLibFunc(const Function &FDecl,
LibFunc &F) const {
// Intrinsics don't overlap w/libcalls; if our module has a large number of
// intrinsics, this ends up being an interesting compile time win since we
- // avoid string normalization and comparison.
+ // avoid string normalization and comparison.
if (FDecl.isIntrinsic()) return false;
-
+
const DataLayout *DL =
FDecl.getParent() ? &FDecl.getParent()->getDataLayout() : nullptr;
return getLibFunc(FDecl.getName(), F) &&
diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp
index f2c63f789d89..2f051e53790b 100644
--- a/llvm/lib/Analysis/TargetTransformInfo.cpp
+++ b/llvm/lib/Analysis/TargetTransformInfo.cpp
@@ -11,8 +11,8 @@
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/TargetTransformInfoImpl.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
@@ -42,18 +42,109 @@ struct NoTTIImpl : TargetTransformInfoImplCRTPBase<NoTTIImpl> {
explicit NoTTIImpl(const DataLayout &DL)
: TargetTransformInfoImplCRTPBase<NoTTIImpl>(DL) {}
};
-}
+} // namespace
bool HardwareLoopInfo::canAnalyze(LoopInfo &LI) {
// If the loop has irreducible control flow, it can not be converted to
// Hardware loop.
- LoopBlocksRPO RPOT(L);
+ LoopBlocksRPO RPOT(L);
RPOT.perform(&LI);
if (containsIrreducibleCFG<const BasicBlock *>(RPOT, LI))
return false;
return true;
}
+IntrinsicCostAttributes::IntrinsicCostAttributes(const IntrinsicInst &I) :
+ II(&I), RetTy(I.getType()), IID(I.getIntrinsicID()) {
+
+ FunctionType *FTy = I.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+ Arguments.insert(Arguments.begin(), I.arg_begin(), I.arg_end());
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&I))
+ FMF = FPMO->getFastMathFlags();
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+ const CallBase &CI) :
+ II(dyn_cast<IntrinsicInst>(&CI)), RetTy(CI.getType()), IID(Id) {
+
+ if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+ FMF = FPMO->getFastMathFlags();
+
+ FunctionType *FTy =
+ CI.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+ const CallBase &CI,
+ unsigned Factor) :
+ RetTy(CI.getType()), IID(Id), VF(Factor) {
+
+ if (auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+ FMF = FPMO->getFastMathFlags();
+
+ Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+ FunctionType *FTy =
+ CI.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id,
+ const CallBase &CI,
+ unsigned Factor,
+ unsigned ScalarCost) :
+ RetTy(CI.getType()), IID(Id), VF(Factor), ScalarizationCost(ScalarCost) {
+
+ if (const auto *FPMO = dyn_cast<FPMathOperator>(&CI))
+ FMF = FPMO->getFastMathFlags();
+
+ Arguments.insert(Arguments.begin(), CI.arg_begin(), CI.arg_end());
+ FunctionType *FTy =
+ CI.getCalledFunction()->getFunctionType();
+ ParamTys.insert(ParamTys.begin(), FTy->param_begin(), FTy->param_end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys,
+ FastMathFlags Flags) :
+ RetTy(RTy), IID(Id), FMF(Flags) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys,
+ FastMathFlags Flags,
+ unsigned ScalarCost) :
+ RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys,
+ FastMathFlags Flags,
+ unsigned ScalarCost,
+ const IntrinsicInst *I) :
+ II(I), RetTy(RTy), IID(Id), FMF(Flags), ScalarizationCost(ScalarCost) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *RTy,
+ ArrayRef<Type *> Tys) :
+ RetTy(RTy), IID(Id) {
+ ParamTys.insert(ParamTys.begin(), Tys.begin(), Tys.end());
+}
+
+IntrinsicCostAttributes::IntrinsicCostAttributes(Intrinsic::ID Id, Type *Ty,
+ ArrayRef<const Value *> Args)
+ : RetTy(Ty), IID(Id) {
+
+ Arguments.insert(Arguments.begin(), Args.begin(), Args.end());
+ ParamTys.reserve(Arguments.size());
+ for (unsigned Idx = 0, Size = Arguments.size(); Idx != Size; ++Idx)
+ ParamTys.push_back(Arguments[Idx]->getType());
+}
+
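
A hedged sketch of how a cost-model client might combine these new constructors with the CostKind-aware query further below; the intrinsic and types are illustrative, not taken from the patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

// Cost of llvm.umul.with.overflow.i64 under the reciprocal-throughput model.
static int costOfUMulOverflow(const TargetTransformInfo &TTI,
                              LLVMContext &Ctx) {
  Type *I64 = Type::getInt64Ty(Ctx);
  Type *I1 = Type::getInt1Ty(Ctx);
  // The intrinsic returns the {i64, i1} pair of product and overflow bit.
  Type *RetTy = StructType::get(Ctx, {I64, I1});
  IntrinsicCostAttributes ICA(Intrinsic::umul_with_overflow, RetTy, {I64, I64});
  return TTI.getIntrinsicInstrCost(ICA,
                                   TargetTransformInfo::TCK_RecipThroughput);
}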
bool HardwareLoopInfo::isHardwareLoopCandidate(ScalarEvolution &SE,
LoopInfo &LI, DominatorTree &DT,
bool ForceNestedLoop,
@@ -146,28 +237,6 @@ TargetTransformInfo &TargetTransformInfo::operator=(TargetTransformInfo &&RHS) {
return *this;
}
-int TargetTransformInfo::getOperationCost(unsigned Opcode, Type *Ty,
- Type *OpTy) const {
- int Cost = TTIImpl->getOperationCost(Opcode, Ty, OpTy);
- assert(Cost >= 0 && "TTI should not produce negative costs!");
- return Cost;
-}
-
-int TargetTransformInfo::getCallCost(FunctionType *FTy, int NumArgs,
- const User *U) const {
- int Cost = TTIImpl->getCallCost(FTy, NumArgs, U);
- assert(Cost >= 0 && "TTI should not produce negative costs!");
- return Cost;
-}
-
-int TargetTransformInfo::getCallCost(const Function *F,
- ArrayRef<const Value *> Arguments,
- const User *U) const {
- int Cost = TTIImpl->getCallCost(F, Arguments, U);
- assert(Cost >= 0 && "TTI should not produce negative costs!");
- return Cost;
-}
-
unsigned TargetTransformInfo::getInliningThresholdMultiplier() const {
return TTIImpl->getInliningThresholdMultiplier();
}
@@ -177,34 +246,23 @@ int TargetTransformInfo::getInlinerVectorBonusPercent() const {
}
int TargetTransformInfo::getGEPCost(Type *PointeeType, const Value *Ptr,
- ArrayRef<const Value *> Operands) const {
- return TTIImpl->getGEPCost(PointeeType, Ptr, Operands);
-}
-
-int TargetTransformInfo::getExtCost(const Instruction *I,
- const Value *Src) const {
- return TTIImpl->getExtCost(I, Src);
-}
-
-int TargetTransformInfo::getIntrinsicCost(
- Intrinsic::ID IID, Type *RetTy, ArrayRef<const Value *> Arguments,
- const User *U) const {
- int Cost = TTIImpl->getIntrinsicCost(IID, RetTy, Arguments, U);
- assert(Cost >= 0 && "TTI should not produce negative costs!");
- return Cost;
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) const {
+ return TTIImpl->getGEPCost(PointeeType, Ptr, Operands, CostKind);
}
-unsigned
-TargetTransformInfo::getEstimatedNumberOfCaseClusters(
+unsigned TargetTransformInfo::getEstimatedNumberOfCaseClusters(
const SwitchInst &SI, unsigned &JTSize, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI) const {
return TTIImpl->getEstimatedNumberOfCaseClusters(SI, JTSize, PSI, BFI);
}
int TargetTransformInfo::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) const {
- int Cost = TTIImpl->getUserCost(U, Operands);
- assert(Cost >= 0 && "TTI should not produce negative costs!");
+ ArrayRef<const Value *> Operands,
+ enum TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getUserCost(U, Operands, CostKind);
+ assert((CostKind == TTI::TCK_RecipThroughput || Cost >= 0) &&
+ "TTI should not produce negative costs!");
return Cost;
}
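
A hedged sketch of a caller using the new CostKind parameter, here asking for code size; the helper is hypothetical and only the getUserCost signature above comes from the patch:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Estimate an instruction's code-size cost with the CostKind-aware API.
static int instructionCodeSize(const TargetTransformInfo &TTI,
                               const Instruction &I) {
  SmallVector<const Value *, 4> Operands;
  for (const Value *Op : I.operand_values())
    Operands.push_back(Op);
  return TTI.getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize);
}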
@@ -212,6 +270,10 @@ bool TargetTransformInfo::hasBranchDivergence() const {
return TTIImpl->hasBranchDivergence();
}
+bool TargetTransformInfo::useGPUDivergenceAnalysis() const {
+ return TTIImpl->useGPUDivergenceAnalysis();
+}
+
bool TargetTransformInfo::isSourceOfDivergence(const Value *V) const {
return TTIImpl->isSourceOfDivergence(V);
}
@@ -225,12 +287,17 @@ unsigned TargetTransformInfo::getFlatAddressSpace() const {
}
bool TargetTransformInfo::collectFlatAddressOperands(
- SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const {
+ SmallVectorImpl<int> &OpIndexes, Intrinsic::ID IID) const {
return TTIImpl->collectFlatAddressOperands(OpIndexes, IID);
}
-bool TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
- IntrinsicInst *II, Value *OldV, Value *NewV) const {
+bool TargetTransformInfo::isNoopAddrSpaceCast(unsigned FromAS,
+ unsigned ToAS) const {
+ return TTIImpl->isNoopAddrSpaceCast(FromAS, ToAS);
+}
+
+Value *TargetTransformInfo::rewriteIntrinsicWithAddressSpace(
+ IntrinsicInst *II, Value *OldV, Value *NewV) const {
return TTIImpl->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
}
@@ -239,22 +306,32 @@ bool TargetTransformInfo::isLoweredToCall(const Function *F) const {
}
bool TargetTransformInfo::isHardwareLoopProfitable(
- Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
- TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
+ Loop *L, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *LibInfo, HardwareLoopInfo &HWLoopInfo) const {
return TTIImpl->isHardwareLoopProfitable(L, SE, AC, LibInfo, HWLoopInfo);
}
-bool TargetTransformInfo::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
- ScalarEvolution &SE, AssumptionCache &AC, TargetLibraryInfo *TLI,
- DominatorTree *DT, const LoopAccessInfo *LAI) const {
+bool TargetTransformInfo::preferPredicateOverEpilogue(
+ Loop *L, LoopInfo *LI, ScalarEvolution &SE, AssumptionCache &AC,
+ TargetLibraryInfo *TLI, DominatorTree *DT,
+ const LoopAccessInfo *LAI) const {
return TTIImpl->preferPredicateOverEpilogue(L, LI, SE, AC, TLI, DT, LAI);
}
+bool TargetTransformInfo::emitGetActiveLaneMask() const {
+ return TTIImpl->emitGetActiveLaneMask();
+}
+
void TargetTransformInfo::getUnrollingPreferences(
Loop *L, ScalarEvolution &SE, UnrollingPreferences &UP) const {
return TTIImpl->getUnrollingPreferences(L, SE, UP);
}
+void TargetTransformInfo::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ PeelingPreferences &PP) const {
+ return TTIImpl->getPeelingPreferences(L, SE, PP);
+}
+
bool TargetTransformInfo::isLegalAddImmediate(int64_t Imm) const {
return TTIImpl->isLegalAddImmediate(Imm);
}
@@ -265,8 +342,7 @@ bool TargetTransformInfo::isLegalICmpImmediate(int64_t Imm) const {
bool TargetTransformInfo::isLegalAddressingMode(Type *Ty, GlobalValue *BaseGV,
int64_t BaseOffset,
- bool HasBaseReg,
- int64_t Scale,
+ bool HasBaseReg, int64_t Scale,
unsigned AddrSpace,
Instruction *I) const {
return TTIImpl->isLegalAddressingMode(Ty, BaseGV, BaseOffset, HasBaseReg,
@@ -277,6 +353,10 @@ bool TargetTransformInfo::isLSRCostLess(LSRCost &C1, LSRCost &C2) const {
return TTIImpl->isLSRCostLess(C1, C2);
}
+bool TargetTransformInfo::isProfitableLSRChainElement(Instruction *I) const {
+ return TTIImpl->isProfitableLSRChainElement(I);
+}
+
bool TargetTransformInfo::canMacroFuseCmp() const {
return TTIImpl->canMacroFuseCmp();
}
@@ -297,12 +377,12 @@ bool TargetTransformInfo::shouldFavorBackedgeIndex(const Loop *L) const {
}
bool TargetTransformInfo::isLegalMaskedStore(Type *DataType,
- MaybeAlign Alignment) const {
+ Align Alignment) const {
return TTIImpl->isLegalMaskedStore(DataType, Alignment);
}
bool TargetTransformInfo::isLegalMaskedLoad(Type *DataType,
- MaybeAlign Alignment) const {
+ Align Alignment) const {
return TTIImpl->isLegalMaskedLoad(DataType, Alignment);
}
@@ -316,12 +396,12 @@ bool TargetTransformInfo::isLegalNTLoad(Type *DataType, Align Alignment) const {
}
bool TargetTransformInfo::isLegalMaskedGather(Type *DataType,
- MaybeAlign Alignment) const {
+ Align Alignment) const {
return TTIImpl->isLegalMaskedGather(DataType, Alignment);
}
bool TargetTransformInfo::isLegalMaskedScatter(Type *DataType,
- MaybeAlign Alignment) const {
+ Align Alignment) const {
return TTIImpl->isLegalMaskedScatter(DataType, Alignment);
}
@@ -348,8 +428,7 @@ bool TargetTransformInfo::prefersVectorizedAddressing() const {
int TargetTransformInfo::getScalingFactorCost(Type *Ty, GlobalValue *BaseGV,
int64_t BaseOffset,
- bool HasBaseReg,
- int64_t Scale,
+ bool HasBaseReg, int64_t Scale,
unsigned AddrSpace) const {
int Cost = TTIImpl->getScalingFactorCost(Ty, BaseGV, BaseOffset, HasBaseReg,
Scale, AddrSpace);
@@ -378,7 +457,8 @@ bool TargetTransformInfo::isTypeLegal(Type *Ty) const {
bool TargetTransformInfo::shouldBuildLookupTables() const {
return TTIImpl->shouldBuildLookupTables();
}
-bool TargetTransformInfo::shouldBuildLookupTablesForConstant(Constant *C) const {
+bool TargetTransformInfo::shouldBuildLookupTablesForConstant(
+ Constant *C) const {
return TTIImpl->shouldBuildLookupTablesForConstant(C);
}
@@ -386,14 +466,15 @@ bool TargetTransformInfo::useColdCCForColdCall(Function &F) const {
return TTIImpl->useColdCCForColdCall(F);
}
-unsigned TargetTransformInfo::
-getScalarizationOverhead(Type *Ty, bool Insert, bool Extract) const {
- return TTIImpl->getScalarizationOverhead(Ty, Insert, Extract);
+unsigned
+TargetTransformInfo::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract) const {
+ return TTIImpl->getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
-unsigned TargetTransformInfo::
-getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
- unsigned VF) const {
+unsigned TargetTransformInfo::getOperandsScalarizationOverhead(
+ ArrayRef<const Value *> Args, unsigned VF) const {
return TTIImpl->getOperandsScalarizationOverhead(Args, VF);
}
@@ -401,7 +482,8 @@ bool TargetTransformInfo::supportsEfficientVectorElementLoadStore() const {
return TTIImpl->supportsEfficientVectorElementLoadStore();
}
-bool TargetTransformInfo::enableAggressiveInterleaving(bool LoopHasReductions) const {
+bool TargetTransformInfo::enableAggressiveInterleaving(
+ bool LoopHasReductions) const {
return TTIImpl->enableAggressiveInterleaving(LoopHasReductions);
}
@@ -427,8 +509,8 @@ bool TargetTransformInfo::allowsMisalignedMemoryAccesses(LLVMContext &Context,
unsigned AddressSpace,
unsigned Alignment,
bool *Fast) const {
- return TTIImpl->allowsMisalignedMemoryAccesses(Context, BitWidth, AddressSpace,
- Alignment, Fast);
+ return TTIImpl->allowsMisalignedMemoryAccesses(Context, BitWidth,
+ AddressSpace, Alignment, Fast);
}
TargetTransformInfo::PopcntSupportKind
@@ -458,22 +540,27 @@ int TargetTransformInfo::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
return Cost;
}
-int TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty) const {
- int Cost = TTIImpl->getIntImmCost(Imm, Ty);
+int TargetTransformInfo::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getIntImmCost(Imm, Ty, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) const {
- int Cost = TTIImpl->getIntImmCostInst(Opcode, Idx, Imm, Ty);
+int
+TargetTransformInfo::getIntImmCostInst(unsigned Opcode, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) const {
- int Cost = TTIImpl->getIntImmCostIntrin(IID, Idx, Imm, Ty);
+int
+TargetTransformInfo::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -482,11 +569,12 @@ unsigned TargetTransformInfo::getNumberOfRegisters(unsigned ClassID) const {
return TTIImpl->getNumberOfRegisters(ClassID);
}
-unsigned TargetTransformInfo::getRegisterClassForType(bool Vector, Type *Ty) const {
+unsigned TargetTransformInfo::getRegisterClassForType(bool Vector,
+ Type *Ty) const {
return TTIImpl->getRegisterClassForType(Vector, Ty);
}
-const char* TargetTransformInfo::getRegisterClassName(unsigned ClassID) const {
+const char *TargetTransformInfo::getRegisterClassName(unsigned ClassID) const {
return TTIImpl->getRegisterClassName(ClassID);
}
@@ -516,13 +604,13 @@ unsigned TargetTransformInfo::getCacheLineSize() const {
return TTIImpl->getCacheLineSize();
}
-llvm::Optional<unsigned> TargetTransformInfo::getCacheSize(CacheLevel Level)
- const {
+llvm::Optional<unsigned>
+TargetTransformInfo::getCacheSize(CacheLevel Level) const {
return TTIImpl->getCacheSize(Level);
}
-llvm::Optional<unsigned> TargetTransformInfo::getCacheAssociativity(
- CacheLevel Level) const {
+llvm::Optional<unsigned>
+TargetTransformInfo::getCacheAssociativity(CacheLevel Level) const {
return TTIImpl->getCacheAssociativity(Level);
}
@@ -530,24 +618,32 @@ unsigned TargetTransformInfo::getPrefetchDistance() const {
return TTIImpl->getPrefetchDistance();
}
-unsigned TargetTransformInfo::getMinPrefetchStride() const {
- return TTIImpl->getMinPrefetchStride();
+unsigned TargetTransformInfo::getMinPrefetchStride(
+ unsigned NumMemAccesses, unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches, bool HasCall) const {
+ return TTIImpl->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
unsigned TargetTransformInfo::getMaxPrefetchIterationsAhead() const {
return TTIImpl->getMaxPrefetchIterationsAhead();
}
+bool TargetTransformInfo::enableWritePrefetching() const {
+ return TTIImpl->enableWritePrefetching();
+}
+
unsigned TargetTransformInfo::getMaxInterleaveFactor(unsigned VF) const {
return TTIImpl->getMaxInterleaveFactor(VF);
}
TargetTransformInfo::OperandValueKind
-TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
+TargetTransformInfo::getOperandInfo(const Value *V,
+ OperandValueProperties &OpProps) {
OperandValueKind OpInfo = OK_AnyValue;
OpProps = OP_None;
- if (auto *CI = dyn_cast<ConstantInt>(V)) {
+ if (const auto *CI = dyn_cast<ConstantInt>(V)) {
if (CI->getValue().isPowerOf2())
OpProps = OP_PowerOf2;
return OK_UniformConstantValue;
@@ -556,7 +652,7 @@ TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
// A broadcast shuffle creates a uniform value.
// TODO: Add support for non-zero index broadcasts.
// TODO: Add support for different source vector width.
- if (auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V))
+ if (const auto *ShuffleInst = dyn_cast<ShuffleVectorInst>(V))
if (ShuffleInst->isZeroEltSplat())
OpInfo = OK_UniformValue;
@@ -571,7 +667,7 @@ TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
if (auto *CI = dyn_cast<ConstantInt>(Splat))
if (CI->getValue().isPowerOf2())
OpProps = OP_PowerOf2;
- } else if (auto *CDS = dyn_cast<ConstantDataSequential>(V)) {
+ } else if (const auto *CDS = dyn_cast<ConstantDataSequential>(V)) {
OpProps = OP_PowerOf2;
for (unsigned I = 0, E = CDS->getNumElements(); I != E; ++I) {
if (auto *CI = dyn_cast<ConstantInt>(CDS->getElementAsConstant(I)))
@@ -592,28 +688,31 @@ TargetTransformInfo::getOperandInfo(Value *V, OperandValueProperties &OpProps) {
}
int TargetTransformInfo::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, OperandValueKind Opd1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ OperandValueKind Opd1Info,
OperandValueKind Opd2Info, OperandValueProperties Opd1PropInfo,
OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) const {
int Cost = TTIImpl->getArithmeticInstrCost(
- Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo, Args, CxtI);
+ Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo,
+ Args, CxtI);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, Type *Ty, int Index,
- Type *SubTp) const {
+int TargetTransformInfo::getShuffleCost(ShuffleKind Kind, VectorType *Ty,
+ int Index, VectorType *SubTp) const {
int Cost = TTIImpl->getShuffleCost(Kind, Ty, Index, SubTp);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst,
- Type *Src, const Instruction *I) const {
- assert ((I == nullptr || I->getOpcode() == Opcode) &&
- "Opcode should reflect passed instruction.");
- int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, I);
+int TargetTransformInfo::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) const {
+ assert((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCastInstrCost(Opcode, Dst, Src, CostKind, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -626,17 +725,20 @@ int TargetTransformInfo::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
return Cost;
}
-int TargetTransformInfo::getCFInstrCost(unsigned Opcode) const {
- int Cost = TTIImpl->getCFInstrCost(Opcode);
+int TargetTransformInfo::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getCFInstrCost(Opcode, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, const Instruction *I) const {
- assert ((I == nullptr || I->getOpcode() == Opcode) &&
- "Opcode should reflect passed instruction.");
- int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ Type *CondTy,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) const {
+ assert((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -649,65 +751,59 @@ int TargetTransformInfo::getVectorInstrCost(unsigned Opcode, Type *Val,
}
int TargetTransformInfo::getMemoryOpCost(unsigned Opcode, Type *Src,
- MaybeAlign Alignment,
- unsigned AddressSpace,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) const {
- assert ((I == nullptr || I->getOpcode() == Opcode) &&
- "Opcode should reflect passed instruction.");
- int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+ assert((I == nullptr || I->getOpcode() == Opcode) &&
+ "Opcode should reflect passed instruction.");
+ int Cost = TTIImpl->getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
- unsigned Alignment,
- unsigned AddressSpace) const {
+int TargetTransformInfo::getMaskedMemoryOpCost(
+ unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) const {
int Cost =
- TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ TTIImpl->getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- Value *Ptr, bool VariableMask,
- unsigned Alignment) const {
+int TargetTransformInfo::getGatherScatterOpCost(
+ unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) const {
int Cost = TTIImpl->getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
- Alignment);
+ Alignment, CostKind, I);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
- bool UseMaskForGaps) const {
- int Cost = TTIImpl->getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond,
- UseMaskForGaps);
- assert(Cost >= 0 && "TTI should not produce negative costs!");
- return Cost;
-}
-
-int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
- ScalarizationCostPassed);
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) const {
+ int Cost = TTIImpl->getInterleavedMemoryOpCost(
+ Opcode, VecTy, Factor, Indices, Alignment, AddressSpace, CostKind,
+ UseMaskForCond, UseMaskForGaps);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF, unsigned VF) const {
- int Cost = TTIImpl->getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+int
+TargetTransformInfo::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getIntrinsicInstrCost(ICA, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
int TargetTransformInfo::getCallInstrCost(Function *F, Type *RetTy,
- ArrayRef<Type *> Tys) const {
- int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys);
+ ArrayRef<Type *> Tys,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getCallInstrCost(F, RetTy, Tys, CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -730,18 +826,22 @@ int TargetTransformInfo::getMemcpyCost(const Instruction *I) const {
return Cost;
}
-int TargetTransformInfo::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm) const {
- int Cost = TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm);
+int TargetTransformInfo::getArithmeticReductionCost(unsigned Opcode,
+ VectorType *Ty,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind) const {
+ int Cost = TTIImpl->getArithmeticReductionCost(Opcode, Ty, IsPairwiseForm,
+ CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
-int TargetTransformInfo::getMinMaxReductionCost(Type *Ty, Type *CondTy,
- bool IsPairwiseForm,
- bool IsUnsigned) const {
+int TargetTransformInfo::getMinMaxReductionCost(
+ VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) const {
int Cost =
- TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned);
+ TTIImpl->getMinMaxReductionCost(Ty, CondTy, IsPairwiseForm, IsUnsigned,
+ CostKind);
assert(Cost >= 0 && "TTI should not produce negative costs!");
return Cost;
}
@@ -765,18 +865,19 @@ Value *TargetTransformInfo::getOrCreateResultFromMemIntrinsic(
return TTIImpl->getOrCreateResultFromMemIntrinsic(Inst, ExpectedType);
}
-Type *TargetTransformInfo::getMemcpyLoopLoweringType(LLVMContext &Context,
- Value *Length,
- unsigned SrcAlign,
- unsigned DestAlign) const {
- return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAlign,
- DestAlign);
+Type *TargetTransformInfo::getMemcpyLoopLoweringType(
+ LLVMContext &Context, Value *Length, unsigned SrcAddrSpace,
+ unsigned DestAddrSpace, unsigned SrcAlign, unsigned DestAlign) const {
+ return TTIImpl->getMemcpyLoopLoweringType(Context, Length, SrcAddrSpace,
+ DestAddrSpace, SrcAlign, DestAlign);
}
void TargetTransformInfo::getMemcpyLoopResidualLoweringType(
SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
- unsigned RemainingBytes, unsigned SrcAlign, unsigned DestAlign) const {
+ unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign) const {
TTIImpl->getMemcpyLoopResidualLoweringType(OpsOut, Context, RemainingBytes,
+ SrcAddrSpace, DestAddrSpace,
SrcAlign, DestAlign);
}
@@ -814,13 +915,13 @@ bool TargetTransformInfo::isLegalToVectorizeStore(StoreInst *SI) const {
}
bool TargetTransformInfo::isLegalToVectorizeLoadChain(
- unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const {
+ unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const {
return TTIImpl->isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment,
AddrSpace);
}
bool TargetTransformInfo::isLegalToVectorizeStoreChain(
- unsigned ChainSizeInBytes, unsigned Alignment, unsigned AddrSpace) const {
+ unsigned ChainSizeInBytes, Align Alignment, unsigned AddrSpace) const {
return TTIImpl->isLegalToVectorizeStoreChain(ChainSizeInBytes, Alignment,
AddrSpace);
}
@@ -839,8 +940,8 @@ unsigned TargetTransformInfo::getStoreVectorFactor(unsigned VF,
return TTIImpl->getStoreVectorFactor(VF, StoreSize, ChainSizeInBytes, VecTy);
}
-bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode,
- Type *Ty, ReductionFlags Flags) const {
+bool TargetTransformInfo::useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ ReductionFlags Flags) const {
return TTIImpl->useReductionIntrinsic(Opcode, Ty, Flags);
}
@@ -865,46 +966,21 @@ static bool matchPairwiseShuffleMask(ShuffleVectorInst *SI, bool IsLeft,
else if (!SI)
return false;
- SmallVector<int, 32> Mask(SI->getType()->getVectorNumElements(), -1);
+ SmallVector<int, 32> Mask(SI->getType()->getNumElements(), -1);
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right) depending on whether
// we look at the left or right side.
for (unsigned i = 0, e = (1 << Level), val = !IsLeft; i != e; ++i, val += 2)
Mask[i] = val;
- SmallVector<int, 16> ActualMask = SI->getShuffleMask();
+ ArrayRef<int> ActualMask = SI->getShuffleMask();
return Mask == ActualMask;
}
-namespace {
-/// Kind of the reduction data.
-enum ReductionKind {
- RK_None, /// Not a reduction.
- RK_Arithmetic, /// Binary reduction data.
- RK_MinMax, /// Min/max reduction data.
- RK_UnsignedMinMax, /// Unsigned min/max reduction data.
-};
-/// Contains opcode + LHS/RHS parts of the reduction operations.
-struct ReductionData {
- ReductionData() = delete;
- ReductionData(ReductionKind Kind, unsigned Opcode, Value *LHS, Value *RHS)
- : Opcode(Opcode), LHS(LHS), RHS(RHS), Kind(Kind) {
- assert(Kind != RK_None && "expected binary or min/max reduction only.");
- }
- unsigned Opcode = 0;
- Value *LHS = nullptr;
- Value *RHS = nullptr;
- ReductionKind Kind = RK_None;
- bool hasSameData(ReductionData &RD) const {
- return Kind == RD.Kind && Opcode == RD.Opcode;
- }
-};
-} // namespace
-
-static Optional<ReductionData> getReductionData(Instruction *I) {
+static Optional<TTI::ReductionData> getReductionData(Instruction *I) {
Value *L, *R;
if (m_BinOp(m_Value(L), m_Value(R)).match(I))
- return ReductionData(RK_Arithmetic, I->getOpcode(), L, R);
+ return TTI::ReductionData(TTI::RK_Arithmetic, I->getOpcode(), L, R);
if (auto *SI = dyn_cast<SelectInst>(I)) {
if (m_SMin(m_Value(L), m_Value(R)).match(SI) ||
m_SMax(m_Value(L), m_Value(R)).match(SI) ||
@@ -913,20 +989,20 @@ static Optional<ReductionData> getReductionData(Instruction *I) {
m_UnordFMin(m_Value(L), m_Value(R)).match(SI) ||
m_UnordFMax(m_Value(L), m_Value(R)).match(SI)) {
auto *CI = cast<CmpInst>(SI->getCondition());
- return ReductionData(RK_MinMax, CI->getOpcode(), L, R);
+ return TTI::ReductionData(TTI::RK_MinMax, CI->getOpcode(), L, R);
}
if (m_UMin(m_Value(L), m_Value(R)).match(SI) ||
m_UMax(m_Value(L), m_Value(R)).match(SI)) {
auto *CI = cast<CmpInst>(SI->getCondition());
- return ReductionData(RK_UnsignedMinMax, CI->getOpcode(), L, R);
+ return TTI::ReductionData(TTI::RK_UnsignedMinMax, CI->getOpcode(), L, R);
}
}
return llvm::None;
}
-static ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
- unsigned Level,
- unsigned NumLevels) {
+static TTI::ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
+ unsigned Level,
+ unsigned NumLevels) {
// Match one level of pairwise operations.
// %rdx.shuf.0.0 = shufflevector <4 x float> %rdx, <4 x float> undef,
// <4 x i32> <i32 0, i32 2 , i32 undef, i32 undef>
@@ -934,24 +1010,24 @@ static ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
// <4 x i32> <i32 1, i32 3, i32 undef, i32 undef>
// %bin.rdx.0 = fadd <4 x float> %rdx.shuf.0.0, %rdx.shuf.0.1
if (!I)
- return RK_None;
+ return TTI::RK_None;
assert(I->getType()->isVectorTy() && "Expecting a vector type");
- Optional<ReductionData> RD = getReductionData(I);
+ Optional<TTI::ReductionData> RD = getReductionData(I);
if (!RD)
- return RK_None;
+ return TTI::RK_None;
ShuffleVectorInst *LS = dyn_cast<ShuffleVectorInst>(RD->LHS);
if (!LS && Level)
- return RK_None;
+ return TTI::RK_None;
ShuffleVectorInst *RS = dyn_cast<ShuffleVectorInst>(RD->RHS);
if (!RS && Level)
- return RK_None;
+ return TTI::RK_None;
// On level 0 we can omit one shufflevector instruction.
if (!Level && !RS && !LS)
- return RK_None;
+ return TTI::RK_None;
// Shuffle inputs must match.
Value *NextLevelOpL = LS ? LS->getOperand(0) : nullptr;
@@ -960,7 +1036,7 @@ static ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
if (NextLevelOpR && NextLevelOpL) {
// If we have two shuffles their operands must match.
if (NextLevelOpL != NextLevelOpR)
- return RK_None;
+ return TTI::RK_None;
NextLevelOp = NextLevelOpL;
} else if (Level == 0 && (NextLevelOpR || NextLevelOpL)) {
@@ -971,46 +1047,48 @@ static ReductionKind matchPairwiseReductionAtLevel(Instruction *I,
// %NextLevelOpL = shufflevector %R, <1, undef ...>
// %BinOp = fadd %NextLevelOpL, %R
if (NextLevelOpL && NextLevelOpL != RD->RHS)
- return RK_None;
+ return TTI::RK_None;
else if (NextLevelOpR && NextLevelOpR != RD->LHS)
- return RK_None;
+ return TTI::RK_None;
NextLevelOp = NextLevelOpL ? RD->RHS : RD->LHS;
} else
- return RK_None;
+ return TTI::RK_None;
// Check that the next levels binary operation exists and matches with the
// current one.
if (Level + 1 != NumLevels) {
- Optional<ReductionData> NextLevelRD =
+ if (!isa<Instruction>(NextLevelOp))
+ return TTI::RK_None;
+ Optional<TTI::ReductionData> NextLevelRD =
getReductionData(cast<Instruction>(NextLevelOp));
if (!NextLevelRD || !RD->hasSameData(*NextLevelRD))
- return RK_None;
+ return TTI::RK_None;
}
// Shuffle mask for pairwise operation must match.
if (matchPairwiseShuffleMask(LS, /*IsLeft=*/true, Level)) {
if (!matchPairwiseShuffleMask(RS, /*IsLeft=*/false, Level))
- return RK_None;
+ return TTI::RK_None;
} else if (matchPairwiseShuffleMask(RS, /*IsLeft=*/true, Level)) {
if (!matchPairwiseShuffleMask(LS, /*IsLeft=*/false, Level))
- return RK_None;
+ return TTI::RK_None;
} else {
- return RK_None;
+ return TTI::RK_None;
}
if (++Level == NumLevels)
return RD->Kind;
// Match next level.
- return matchPairwiseReductionAtLevel(cast<Instruction>(NextLevelOp), Level,
+ return matchPairwiseReductionAtLevel(dyn_cast<Instruction>(NextLevelOp), Level,
NumLevels);
}
-static ReductionKind matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
- unsigned &Opcode, Type *&Ty) {
+TTI::ReductionKind TTI::matchPairwiseReduction(
+ const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) {
if (!EnableReduxCost)
- return RK_None;
+ return TTI::RK_None;
// Need to extract the first element.
ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
@@ -1018,19 +1096,19 @@ static ReductionKind matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
if (CI)
Idx = CI->getZExtValue();
if (Idx != 0)
- return RK_None;
+ return TTI::RK_None;
auto *RdxStart = dyn_cast<Instruction>(ReduxRoot->getOperand(0));
if (!RdxStart)
- return RK_None;
- Optional<ReductionData> RD = getReductionData(RdxStart);
+ return TTI::RK_None;
+ Optional<TTI::ReductionData> RD = getReductionData(RdxStart);
if (!RD)
- return RK_None;
+ return TTI::RK_None;
- Type *VecTy = RdxStart->getType();
- unsigned NumVecElems = VecTy->getVectorNumElements();
+ auto *VecTy = cast<VectorType>(RdxStart->getType());
+ unsigned NumVecElems = VecTy->getNumElements();
if (!isPowerOf2_32(NumVecElems))
- return RK_None;
+ return TTI::RK_None;
// We look for a sequence of shuffle, shuffle, add triples like the following
// that builds a pairwise reduction tree.
@@ -1051,8 +1129,8 @@ static ReductionKind matchPairwiseReduction(const ExtractElementInst *ReduxRoot,
// %bin.rdx8 = fadd <4 x float> %rdx.shuf.1.0, %rdx.shuf.1.1
// %r = extractelement <4 x float> %bin.rdx8, i32 0
if (matchPairwiseReductionAtLevel(RdxStart, 0, Log2_32(NumVecElems)) ==
- RK_None)
- return RK_None;
+ TTI::RK_None)
+ return TTI::RK_None;
Opcode = RD->Opcode;
Ty = VecTy;
@@ -1071,11 +1149,11 @@ getShuffleAndOtherOprd(Value *L, Value *R) {
return std::make_pair(L, S);
}
-static ReductionKind
-matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
- unsigned &Opcode, Type *&Ty) {
+TTI::ReductionKind TTI::matchVectorSplittingReduction(
+ const ExtractElementInst *ReduxRoot, unsigned &Opcode, VectorType *&Ty) {
+
if (!EnableReduxCost)
- return RK_None;
+ return TTI::RK_None;
// Need to extract the first element.
ConstantInt *CI = dyn_cast<ConstantInt>(ReduxRoot->getOperand(1));
@@ -1083,19 +1161,19 @@ matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
if (CI)
Idx = CI->getZExtValue();
if (Idx != 0)
- return RK_None;
+ return TTI::RK_None;
auto *RdxStart = dyn_cast<Instruction>(ReduxRoot->getOperand(0));
if (!RdxStart)
- return RK_None;
- Optional<ReductionData> RD = getReductionData(RdxStart);
+ return TTI::RK_None;
+ Optional<TTI::ReductionData> RD = getReductionData(RdxStart);
if (!RD)
- return RK_None;
+ return TTI::RK_None;
- Type *VecTy = ReduxRoot->getOperand(0)->getType();
- unsigned NumVecElems = VecTy->getVectorNumElements();
+ auto *VecTy = cast<VectorType>(ReduxRoot->getOperand(0)->getType());
+ unsigned NumVecElems = VecTy->getNumElements();
if (!isPowerOf2_32(NumVecElems))
- return RK_None;
+ return TTI::RK_None;
// We look for a sequence of shuffles and adds like the following, matching one
// fadd/shuffle-vector pair at a time.
@@ -1115,10 +1193,10 @@ matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
while (NumVecElemsRemain - 1) {
// Check for the right reduction operation.
if (!RdxOp)
- return RK_None;
- Optional<ReductionData> RDLevel = getReductionData(RdxOp);
+ return TTI::RK_None;
+ Optional<TTI::ReductionData> RDLevel = getReductionData(RdxOp);
if (!RDLevel || !RDLevel->hasSameData(*RD))
- return RK_None;
+ return TTI::RK_None;
Value *NextRdxOp;
ShuffleVectorInst *Shuffle;
@@ -1127,9 +1205,9 @@ matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
// Check that the current reduction operation and the shuffle use the same value.
if (Shuffle == nullptr)
- return RK_None;
+ return TTI::RK_None;
if (Shuffle->getOperand(0) != NextRdxOp)
- return RK_None;
+ return TTI::RK_None;
// Check that the shuffle masks match.
for (unsigned j = 0; j != MaskStart; ++j)
@@ -1137,9 +1215,9 @@ matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
// Fill the rest of the mask with -1 for undef.
std::fill(&ShuffleMask[MaskStart], ShuffleMask.end(), -1);
- SmallVector<int, 16> Mask = Shuffle->getShuffleMask();
+ ArrayRef<int> Mask = Shuffle->getShuffleMask();
if (ShuffleMask != Mask)
- return RK_None;
+ return TTI::RK_None;
RdxOp = dyn_cast<Instruction>(NextRdxOp);
NumVecElemsRemain /= 2;
@@ -1152,15 +1230,13 @@ matchVectorSplittingReduction(const ExtractElementInst *ReduxRoot,
}
int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+
switch (I->getOpcode()) {
case Instruction::GetElementPtr:
- return getUserCost(I);
-
case Instruction::Ret:
case Instruction::PHI:
- case Instruction::Br: {
- return getCFInstrCost(I->getOpcode());
- }
+ case Instruction::Br:
case Instruction::Add:
case Instruction::FAdd:
case Instruction::Sub:
@@ -1178,48 +1254,13 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
case Instruction::AShr:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
- TargetTransformInfo::OperandValueKind Op1VK, Op2VK;
- TargetTransformInfo::OperandValueProperties Op1VP, Op2VP;
- Op1VK = getOperandInfo(I->getOperand(0), Op1VP);
- Op2VK = getOperandInfo(I->getOperand(1), Op2VP);
- SmallVector<const Value *, 2> Operands(I->operand_values());
- return getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, Op2VK,
- Op1VP, Op2VP, Operands, I);
- }
- case Instruction::FNeg: {
- TargetTransformInfo::OperandValueKind Op1VK, Op2VK;
- TargetTransformInfo::OperandValueProperties Op1VP, Op2VP;
- Op1VK = getOperandInfo(I->getOperand(0), Op1VP);
- Op2VK = OK_AnyValue;
- Op2VP = OP_None;
- SmallVector<const Value *, 2> Operands(I->operand_values());
- return getArithmeticInstrCost(I->getOpcode(), I->getType(), Op1VK, Op2VK,
- Op1VP, Op2VP, Operands, I);
- }
- case Instruction::Select: {
- const SelectInst *SI = cast<SelectInst>(I);
- Type *CondTy = SI->getCondition()->getType();
- return getCmpSelInstrCost(I->getOpcode(), I->getType(), CondTy, I);
- }
+ case Instruction::Xor:
+ case Instruction::FNeg:
+ case Instruction::Select:
case Instruction::ICmp:
- case Instruction::FCmp: {
- Type *ValTy = I->getOperand(0)->getType();
- return getCmpSelInstrCost(I->getOpcode(), ValTy, I->getType(), I);
- }
- case Instruction::Store: {
- const StoreInst *SI = cast<StoreInst>(I);
- Type *ValTy = SI->getValueOperand()->getType();
- return getMemoryOpCost(I->getOpcode(), ValTy,
- MaybeAlign(SI->getAlignment()),
- SI->getPointerAddressSpace(), I);
- }
- case Instruction::Load: {
- const LoadInst *LI = cast<LoadInst>(I);
- return getMemoryOpCost(I->getOpcode(), I->getType(),
- MaybeAlign(LI->getAlignment()),
- LI->getPointerAddressSpace(), I);
- }
+ case Instruction::FCmp:
+ case Instruction::Store:
+ case Instruction::Load:
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -1232,113 +1273,13 @@ int TargetTransformInfo::getInstructionThroughput(const Instruction *I) const {
case Instruction::Trunc:
case Instruction::FPTrunc:
case Instruction::BitCast:
- case Instruction::AddrSpaceCast: {
- Type *SrcTy = I->getOperand(0)->getType();
- return getCastInstrCost(I->getOpcode(), I->getType(), SrcTy, I);
- }
- case Instruction::ExtractElement: {
- const ExtractElementInst * EEI = cast<ExtractElementInst>(I);
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
- unsigned Idx = -1;
- if (CI)
- Idx = CI->getZExtValue();
-
- // Try to match a reduction sequence (series of shufflevector and vector
- // adds followed by a extractelement).
- unsigned ReduxOpCode;
- Type *ReduxType;
-
- switch (matchVectorSplittingReduction(EEI, ReduxOpCode, ReduxType)) {
- case RK_Arithmetic:
- return getArithmeticReductionCost(ReduxOpCode, ReduxType,
- /*IsPairwiseForm=*/false);
- case RK_MinMax:
- return getMinMaxReductionCost(
- ReduxType, CmpInst::makeCmpResultType(ReduxType),
- /*IsPairwiseForm=*/false, /*IsUnsigned=*/false);
- case RK_UnsignedMinMax:
- return getMinMaxReductionCost(
- ReduxType, CmpInst::makeCmpResultType(ReduxType),
- /*IsPairwiseForm=*/false, /*IsUnsigned=*/true);
- case RK_None:
- break;
- }
-
- switch (matchPairwiseReduction(EEI, ReduxOpCode, ReduxType)) {
- case RK_Arithmetic:
- return getArithmeticReductionCost(ReduxOpCode, ReduxType,
- /*IsPairwiseForm=*/true);
- case RK_MinMax:
- return getMinMaxReductionCost(
- ReduxType, CmpInst::makeCmpResultType(ReduxType),
- /*IsPairwiseForm=*/true, /*IsUnsigned=*/false);
- case RK_UnsignedMinMax:
- return getMinMaxReductionCost(
- ReduxType, CmpInst::makeCmpResultType(ReduxType),
- /*IsPairwiseForm=*/true, /*IsUnsigned=*/true);
- case RK_None:
- break;
- }
-
- return getVectorInstrCost(I->getOpcode(),
- EEI->getOperand(0)->getType(), Idx);
- }
- case Instruction::InsertElement: {
- const InsertElementInst * IE = cast<InsertElementInst>(I);
- ConstantInt *CI = dyn_cast<ConstantInt>(IE->getOperand(2));
- unsigned Idx = -1;
- if (CI)
- Idx = CI->getZExtValue();
- return getVectorInstrCost(I->getOpcode(),
- IE->getType(), Idx);
- }
+ case Instruction::AddrSpaceCast:
+ case Instruction::ExtractElement:
+ case Instruction::InsertElement:
case Instruction::ExtractValue:
- return 0; // Model all ExtractValue nodes as free.
- case Instruction::ShuffleVector: {
- const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
- Type *Ty = Shuffle->getType();
- Type *SrcTy = Shuffle->getOperand(0)->getType();
-
- // TODO: Identify and add costs for insert subvector, etc.
- int SubIndex;
- if (Shuffle->isExtractSubvectorMask(SubIndex))
- return TTIImpl->getShuffleCost(SK_ExtractSubvector, SrcTy, SubIndex, Ty);
-
- if (Shuffle->changesLength())
- return -1;
-
- if (Shuffle->isIdentity())
- return 0;
-
- if (Shuffle->isReverse())
- return TTIImpl->getShuffleCost(SK_Reverse, Ty, 0, nullptr);
-
- if (Shuffle->isSelect())
- return TTIImpl->getShuffleCost(SK_Select, Ty, 0, nullptr);
-
- if (Shuffle->isTranspose())
- return TTIImpl->getShuffleCost(SK_Transpose, Ty, 0, nullptr);
-
- if (Shuffle->isZeroEltSplat())
- return TTIImpl->getShuffleCost(SK_Broadcast, Ty, 0, nullptr);
-
- if (Shuffle->isSingleSource())
- return TTIImpl->getShuffleCost(SK_PermuteSingleSrc, Ty, 0, nullptr);
-
- return TTIImpl->getShuffleCost(SK_PermuteTwoSrc, Ty, 0, nullptr);
- }
+ case Instruction::ShuffleVector:
case Instruction::Call:
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
- SmallVector<Value *, 4> Args(II->arg_operands());
-
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(II))
- FMF = FPMO->getFastMathFlags();
-
- return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(),
- Args, FMF);
- }
- return -1;
+ return getUserCost(I, CostKind);
default:
// We don't have any information on this instruction.
return -1;
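
A minimal sketch, not taken from this patch, of what a cost query looks like under the consolidated model above, where most opcodes are now answered by getUserCost() with an explicit TTI::TCK_RecipThroughput cost kind rather than per-opcode handling. The function name and the assumption that a TargetTransformInfo instance is already available from the usual analysis plumbing are illustrative only:

  #include "llvm/Analysis/TargetTransformInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Sketch only: mirrors the spirit of the rewritten getInstructionThroughput();
  // the per-opcode cost logic now lives behind getUserCost() with a cost kind.
  static int recipThroughputCost(const TargetTransformInfo &TTI,
                                 const Instruction &I) {
    return TTI.getUserCost(&I, TargetTransformInfo::TCK_RecipThroughput);
  }
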
diff --git a/llvm/lib/Analysis/TypeMetadataUtils.cpp b/llvm/lib/Analysis/TypeMetadataUtils.cpp
index 072d291f3f93..8735d56f907a 100644
--- a/llvm/lib/Analysis/TypeMetadataUtils.cpp
+++ b/llvm/lib/Analysis/TypeMetadataUtils.cpp
@@ -14,6 +14,7 @@
#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
@@ -37,10 +38,10 @@ findCallsAtConstantOffset(SmallVectorImpl<DevirtCallSite> &DevirtCalls,
if (isa<BitCastInst>(User)) {
findCallsAtConstantOffset(DevirtCalls, HasNonCallUses, User, Offset, CI,
DT);
- } else if (auto CI = dyn_cast<CallInst>(User)) {
- DevirtCalls.push_back({Offset, CI});
- } else if (auto II = dyn_cast<InvokeInst>(User)) {
- DevirtCalls.push_back({Offset, II});
+ } else if (auto *CI = dyn_cast<CallInst>(User)) {
+ DevirtCalls.push_back({Offset, *CI});
+ } else if (auto *II = dyn_cast<InvokeInst>(User)) {
+ DevirtCalls.push_back({Offset, *II});
} else if (HasNonCallUses) {
*HasNonCallUses = true;
}
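
A short usage sketch, not part of the patch, of walking the call sites collected above now that they carry CallBase references instead of the removed CallSite wrapper. The member names Offset and CB are assumed from the DevirtCallSite declaration in TypeMetadataUtils.h, which this diff does not show:

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/Analysis/TypeMetadataUtils.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  static void dumpDevirtCandidates(ArrayRef<DevirtCallSite> DevirtCalls) {
    for (const DevirtCallSite &DCS : DevirtCalls)
      // Each entry pairs a byte offset into the vtable with the call itself.
      errs() << "  offset " << DCS.Offset << ": " << DCS.CB << "\n";
  }
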
diff --git a/llvm/lib/Analysis/VFABIDemangling.cpp b/llvm/lib/Analysis/VFABIDemangling.cpp
index a331b95e818b..0192a216b2f7 100644
--- a/llvm/lib/Analysis/VFABIDemangling.cpp
+++ b/llvm/lib/Analysis/VFABIDemangling.cpp
@@ -70,6 +70,9 @@ ParseRet tryParseMask(StringRef &MangledName, bool &IsMasked) {
///
ParseRet tryParseVLEN(StringRef &ParseString, unsigned &VF, bool &IsScalable) {
if (ParseString.consume_front("x")) {
+ // Set VF to 0, to be later adjusted to a value greater than zero
+ // by looking at the signature of the vector function with
+ // `getECFromSignature`.
VF = 0;
IsScalable = true;
return ParseRet::OK;
@@ -78,6 +81,10 @@ ParseRet tryParseVLEN(StringRef &ParseString, unsigned &VF, bool &IsScalable) {
if (ParseString.consumeInteger(10, VF))
return ParseRet::Error;
+ // The token `0` is invalid for VLEN.
+ if (VF == 0)
+ return ParseRet::Error;
+
IsScalable = false;
return ParseRet::OK;
}
@@ -207,28 +214,6 @@ ParseRet tryParseLinearWithCompileTimeStep(StringRef &ParseString,
return ParseRet::None;
}
-/// The function looks for the following strings at the beginning of
-/// the input string `ParseString`:
-///
-/// "u" <number>
-///
-/// On success, it removes the parsed parameter from `ParseString`,
-/// sets `PKind` to the correspondent enum value, sets `Pos` to
-/// <number>, and return success. On a syntax error, it return a
-/// parsing error. If nothing is parsed, it returns None.
-ParseRet tryParseUniform(StringRef &ParseString, VFParamKind &PKind, int &Pos) {
- // "u" <Pos>
- const char *UniformToken = "u";
- if (ParseString.consume_front(UniformToken)) {
- PKind = VFABI::getVFParamKindFromString(UniformToken);
- if (ParseString.consumeInteger(10, Pos))
- return ParseRet::Error;
-
- return ParseRet::OK;
- }
- return ParseRet::None;
-}
-
/// Looks into the <parameters> part of the mangled name in search
/// for valid parameters at the beginning of the string
/// `ParseString`.
@@ -245,6 +230,12 @@ ParseRet tryParseParameter(StringRef &ParseString, VFParamKind &PKind,
return ParseRet::OK;
}
+ if (ParseString.consume_front("u")) {
+ PKind = VFParamKind::OMP_Uniform;
+ StepOrPos = 0;
+ return ParseRet::OK;
+ }
+
const ParseRet HasLinearRuntime =
tryParseLinearWithRuntimeStep(ParseString, PKind, StepOrPos);
if (HasLinearRuntime != ParseRet::None)
@@ -255,10 +246,6 @@ ParseRet tryParseParameter(StringRef &ParseString, VFParamKind &PKind,
if (HasLinearCompileTime != ParseRet::None)
return HasLinearCompileTime;
- const ParseRet HasUniform = tryParseUniform(ParseString, PKind, StepOrPos);
- if (HasUniform != ParseRet::None)
- return HasUniform;
-
return ParseRet::None;
}
@@ -287,11 +274,50 @@ ParseRet tryParseAlign(StringRef &ParseString, Align &Alignment) {
return ParseRet::None;
}
+#ifndef NDEBUG
+// Verify the assumption that all vectors in the signature of a vector
+// function have the same number of elements.
+bool verifyAllVectorsHaveSameWidth(FunctionType *Signature) {
+ SmallVector<VectorType *, 2> VecTys;
+ if (auto *RetTy = dyn_cast<VectorType>(Signature->getReturnType()))
+ VecTys.push_back(RetTy);
+ for (auto *Ty : Signature->params())
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ VecTys.push_back(VTy);
+
+ if (VecTys.size() <= 1)
+ return true;
+
+ assert(VecTys.size() > 1 && "Invalid number of elements.");
+ const ElementCount EC = VecTys[0]->getElementCount();
+ return llvm::all_of(
+ llvm::make_range(VecTys.begin() + 1, VecTys.end()),
+ [&EC](VectorType *VTy) { return (EC == VTy->getElementCount()); });
+}
+
+#endif // NDEBUG
+
+// Extract the VectorizationFactor from a given function signature,
+// under the assumption that all vectors have the same number of
+// elements, i.e. same ElementCount.Min.
+ElementCount getECFromSignature(FunctionType *Signature) {
+ assert(verifyAllVectorsHaveSameWidth(Signature) &&
+ "Invalid vector signature.");
+
+ if (auto *RetTy = dyn_cast<VectorType>(Signature->getReturnType()))
+ return RetTy->getElementCount();
+ for (auto *Ty : Signature->params())
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ return VTy->getElementCount();
+
+ return ElementCount(/*Min=*/1, /*Scalable=*/false);
+}
} // namespace
// Format of the ABI name:
// _ZGV<isa><mask><vlen><parameters>_<scalarname>[(<redirection>)]
-Optional<VFInfo> VFABI::tryDemangleForVFABI(StringRef MangledName) {
+Optional<VFInfo> VFABI::tryDemangleForVFABI(StringRef MangledName,
+ const Module &M) {
const StringRef OriginalName = MangledName;
// Assume there is no custom name <redirection>, and therefore the
// vector name consists of
@@ -402,8 +428,34 @@ Optional<VFInfo> VFABI::tryDemangleForVFABI(StringRef MangledName) {
assert(Parameters.back().ParamKind == VFParamKind::GlobalPredicate &&
"The global predicate must be the last parameter");
+ // Adjust the VF for scalable signatures. The EC.Min is not encoded
+ // in the name of the function, but it is encoded in the IR
+ // signature of the function. We need to extract this information
+ // because it is needed by the loop vectorizer, which reasons in
+ // terms of VectorizationFactor or ElementCount. In particular, we
+ // need to make sure that the VF field of the VFShape class is never
+ // set to 0.
+ if (IsScalable) {
+ const Function *F = M.getFunction(VectorName);
+ // The declaration of the function must be present in the module
+ // to be able to retrieve its signature.
+ if (!F)
+ return None;
+ const ElementCount EC = getECFromSignature(F->getFunctionType());
+ VF = EC.Min;
+ }
+
+ // Sanity checks.
+ // 1. We don't accept a vectorization factor of zero lanes.
+ // 2. We don't accept the demangling if the vector function is not
+ // present in the module.
+ if (VF == 0)
+ return None;
+ if (!M.getFunction(VectorName))
+ return None;
+
const VFShape Shape({VF, IsScalable, Parameters});
- return VFInfo({Shape, ScalarName, VectorName, ISA});
+ return VFInfo({Shape, std::string(ScalarName), std::string(VectorName), ISA});
}
VFParamKind VFABI::getVFParamKindFromString(const StringRef Token) {
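
The demangler now takes the Module so that, for scalable entries ("x" in the <vlen> token), the real element count can be read back from the vector function's IR signature, and so that mappings to functions missing from the module are rejected. A rough usage sketch under those assumptions; the header and field names (VectorUtils.h, ScalarName, VectorName, Shape.VF, Shape.IsScalable) reflect the VFABI declarations referenced by this patch but are not shown in this diff:

  #include "llvm/ADT/Optional.h"
  #include "llvm/ADT/StringRef.h"
  #include "llvm/Analysis/VectorUtils.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  static void reportMapping(StringRef MangledName, const Module &M) {
    // Returns None for a malformed name, a zero VF, or a vector function that
    // is not declared in M.
    if (Optional<VFInfo> Info = VFABI::tryDemangleForVFABI(MangledName, M))
      errs() << Info->ScalarName << " -> " << Info->VectorName
             << " (VF = " << Info->Shape.VF
             << (Info->Shape.IsScalable ? ", scalable" : "") << ")\n";
  }
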
diff --git a/llvm/lib/Analysis/ValueLattice.cpp b/llvm/lib/Analysis/ValueLattice.cpp
index a0115a0eec36..627166e2409d 100644
--- a/llvm/lib/Analysis/ValueLattice.cpp
+++ b/llvm/lib/Analysis/ValueLattice.cpp
@@ -10,13 +10,21 @@
namespace llvm {
raw_ostream &operator<<(raw_ostream &OS, const ValueLatticeElement &Val) {
- if (Val.isUndefined())
- return OS << "undefined";
+ if (Val.isUnknown())
+ return OS << "unknown";
+ if (Val.isUndef())
+ return OS << "undef";
if (Val.isOverdefined())
return OS << "overdefined";
if (Val.isNotConstant())
return OS << "notconstant<" << *Val.getNotConstant() << ">";
+
+ if (Val.isConstantRangeIncludingUndef())
+ return OS << "constantrange incl. undef <"
+ << Val.getConstantRange(true).getLower() << ", "
+ << Val.getConstantRange(true).getUpper() << ">";
+
if (Val.isConstantRange())
return OS << "constantrange<" << Val.getConstantRange().getLower() << ", "
<< Val.getConstantRange().getUpper() << ">";
diff --git a/llvm/lib/Analysis/ValueLatticeUtils.cpp b/llvm/lib/Analysis/ValueLatticeUtils.cpp
index 3f9287e26ce7..53638c351f72 100644
--- a/llvm/lib/Analysis/ValueLatticeUtils.cpp
+++ b/llvm/lib/Analysis/ValueLatticeUtils.cpp
@@ -28,16 +28,14 @@ bool llvm::canTrackGlobalVariableInterprocedurally(GlobalVariable *GV) {
if (GV->isConstant() || !GV->hasLocalLinkage() ||
!GV->hasDefinitiveInitializer())
return false;
- return !any_of(GV->users(), [&](User *U) {
- if (auto *Store = dyn_cast<StoreInst>(U)) {
- if (Store->getValueOperand() == GV || Store->isVolatile())
- return true;
- } else if (auto *Load = dyn_cast<LoadInst>(U)) {
- if (Load->isVolatile())
- return true;
- } else {
- return true;
- }
+ return all_of(GV->users(), [&](User *U) {
+ // Currently all users of a global variable have to be non-volatile loads
+ // or stores, and the global itself must not be the value being stored.
+ if (auto *Store = dyn_cast<StoreInst>(U))
+ return Store->getValueOperand() != GV && !Store->isVolatile();
+ if (auto *Load = dyn_cast<LoadInst>(U))
+ return !Load->isVolatile();
+
return false;
});
}
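
The inverted any_of above is rewritten as an all_of with the acceptance condition stated positively. For illustration only (a sketch, not code from the patch), the same condition expressed as a standalone predicate:

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/IR/GlobalVariable.h"
  #include "llvm/IR/Instructions.h"
  using namespace llvm;

  // A global can only be tracked interprocedurally if every use is a simple
  // non-volatile load, or a non-volatile store that does not use the global
  // itself as the stored value.
  static bool hasOnlyTrackableUses(const GlobalVariable &GV) {
    return all_of(GV.users(), [&](const User *U) {
      if (auto *Store = dyn_cast<StoreInst>(U))
        return Store->getValueOperand() != &GV && !Store->isVolatile();
      if (auto *Load = dyn_cast<LoadInst>(U))
        return !Load->isVolatile();
      return false; // Any other kind of user defeats tracking.
    });
  }
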
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index ad6765e2514b..43caaa62c2ec 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -24,6 +24,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/GuardUtils.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -34,7 +35,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -163,8 +163,61 @@ static const Instruction *safeCxtI(const Value *V, const Instruction *CxtI) {
return nullptr;
}
-static void computeKnownBits(const Value *V, KnownBits &Known,
- unsigned Depth, const Query &Q);
+static bool getShuffleDemandedElts(const ShuffleVectorInst *Shuf,
+ const APInt &DemandedElts,
+ APInt &DemandedLHS, APInt &DemandedRHS) {
+ // The length of scalable vectors is unknown at compile time, thus we
+ // cannot check their values
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return false;
+
+ int NumElts =
+ cast<VectorType>(Shuf->getOperand(0)->getType())->getNumElements();
+ int NumMaskElts = Shuf->getType()->getNumElements();
+ DemandedLHS = DemandedRHS = APInt::getNullValue(NumElts);
+ if (DemandedElts.isNullValue())
+ return true;
+ // Simple case of a shuffle with zeroinitializer.
+ if (all_of(Shuf->getShuffleMask(), [](int Elt) { return Elt == 0; })) {
+ DemandedLHS.setBit(0);
+ return true;
+ }
+ for (int i = 0; i != NumMaskElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ int M = Shuf->getMaskValue(i);
+ assert(M < (NumElts * 2) && "Invalid shuffle mask constant");
+
+ // For undef elements, we don't know anything about the common state of
+ // the shuffle result.
+ if (M == -1)
+ return false;
+ if (M < NumElts)
+ DemandedLHS.setBit(M % NumElts);
+ else
+ DemandedRHS.setBit(M % NumElts);
+ }
+
+ return true;
+}
+
+static void computeKnownBits(const Value *V, const APInt &DemandedElts,
+ KnownBits &Known, unsigned Depth, const Query &Q);
+
+static void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
+ const Query &Q) {
+ // FIXME: We currently have no way to represent the DemandedElts of a scalable
+ // vector
+ if (isa<ScalableVectorType>(V->getType())) {
+ Known.resetAll();
+ return;
+ }
+
+ auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
+ APInt DemandedElts =
+ FVTy ? APInt::getAllOnesValue(FVTy->getNumElements()) : APInt(1, 1);
+ computeKnownBits(V, DemandedElts, Known, Depth, Q);
+}
void llvm::computeKnownBits(const Value *V, KnownBits &Known,
const DataLayout &DL, unsigned Depth,
@@ -175,6 +228,18 @@ void llvm::computeKnownBits(const Value *V, KnownBits &Known,
Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE));
}
+void llvm::computeKnownBits(const Value *V, const APInt &DemandedElts,
+ KnownBits &Known, const DataLayout &DL,
+ unsigned Depth, AssumptionCache *AC,
+ const Instruction *CxtI, const DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE, bool UseInstrInfo) {
+ ::computeKnownBits(V, DemandedElts, Known, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE));
+}
+
+static KnownBits computeKnownBits(const Value *V, const APInt &DemandedElts,
+ unsigned Depth, const Query &Q);
+
static KnownBits computeKnownBits(const Value *V, unsigned Depth,
const Query &Q);
@@ -188,6 +253,17 @@ KnownBits llvm::computeKnownBits(const Value *V, const DataLayout &DL,
V, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE));
}
+KnownBits llvm::computeKnownBits(const Value *V, const APInt &DemandedElts,
+ const DataLayout &DL, unsigned Depth,
+ AssumptionCache *AC, const Instruction *CxtI,
+ const DominatorTree *DT,
+ OptimizationRemarkEmitter *ORE,
+ bool UseInstrInfo) {
+ return ::computeKnownBits(
+ V, DemandedElts, Depth,
+ Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo, ORE));
+}
+
bool llvm::haveNoCommonBitsSet(const Value *LHS, const Value *RHS,
const DataLayout &DL, AssumptionCache *AC,
const Instruction *CxtI, const DominatorTree *DT,
@@ -235,6 +311,9 @@ bool llvm::isKnownToBeAPowerOfTwo(const Value *V, const DataLayout &DL,
V, OrZero, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo));
}
+static bool isKnownNonZero(const Value *V, const APInt &DemandedElts,
+ unsigned Depth, const Query &Q);
+
static bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q);
bool llvm::isKnownNonZero(const Value *V, const DataLayout &DL, unsigned Depth,
@@ -295,8 +374,21 @@ bool llvm::MaskedValueIsZero(const Value *V, const APInt &Mask,
V, Mask, Depth, Query(DL, AC, safeCxtI(V, CxtI), DT, UseInstrInfo));
}
+static unsigned ComputeNumSignBits(const Value *V, const APInt &DemandedElts,
+ unsigned Depth, const Query &Q);
+
static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
- const Query &Q);
+ const Query &Q) {
+ // FIXME: We currently have no way to represent the DemandedElts of a scalable
+ // vector
+ if (isa<ScalableVectorType>(V->getType()))
+ return 1;
+
+ auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
+ APInt DemandedElts =
+ FVTy ? APInt::getAllOnesValue(FVTy->getNumElements()) : APInt(1, 1);
+ return ComputeNumSignBits(V, DemandedElts, Depth, Q);
+}
unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
unsigned Depth, AssumptionCache *AC,
@@ -307,26 +399,27 @@ unsigned llvm::ComputeNumSignBits(const Value *V, const DataLayout &DL,
}
static void computeKnownBitsAddSub(bool Add, const Value *Op0, const Value *Op1,
- bool NSW,
+ bool NSW, const APInt &DemandedElts,
KnownBits &KnownOut, KnownBits &Known2,
unsigned Depth, const Query &Q) {
- unsigned BitWidth = KnownOut.getBitWidth();
+ computeKnownBits(Op1, DemandedElts, KnownOut, Depth + 1, Q);
- // If an initial sequence of bits in the result is not needed, the
- // corresponding bits in the operands are not needed.
- KnownBits LHSKnown(BitWidth);
- computeKnownBits(Op0, LHSKnown, Depth + 1, Q);
- computeKnownBits(Op1, Known2, Depth + 1, Q);
+ // If one operand is unknown and we have no nowrap information,
+ // the result will be unknown independently of the second operand.
+ if (KnownOut.isUnknown() && !NSW)
+ return;
- KnownOut = KnownBits::computeForAddSub(Add, NSW, LHSKnown, Known2);
+ computeKnownBits(Op0, DemandedElts, Known2, Depth + 1, Q);
+ KnownOut = KnownBits::computeForAddSub(Add, NSW, Known2, KnownOut);
}
static void computeKnownBitsMul(const Value *Op0, const Value *Op1, bool NSW,
- KnownBits &Known, KnownBits &Known2,
- unsigned Depth, const Query &Q) {
+ const APInt &DemandedElts, KnownBits &Known,
+ KnownBits &Known2, unsigned Depth,
+ const Query &Q) {
unsigned BitWidth = Known.getBitWidth();
- computeKnownBits(Op1, Known, Depth + 1, Q);
- computeKnownBits(Op0, Known2, Depth + 1, Q);
+ computeKnownBits(Op1, DemandedElts, Known, Depth + 1, Q);
+ computeKnownBits(Op0, DemandedElts, Known2, Depth + 1, Q);
bool isKnownNegative = false;
bool isKnownNonNegative = false;
@@ -535,6 +628,29 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv,
// feeding the assume is trivially true, thus causing the removal of
// the assume).
+ if (Inv->getParent() == CxtI->getParent()) {
+ // If Inv and CxtI are in the same block, check if the assume (Inv) comes first
+ // in the BB.
+ if (Inv->comesBefore(CxtI))
+ return true;
+
+ // Don't let an assume affect itself - this would cause the problems
+ // `isEphemeralValueOf` is trying to prevent, and it would also make
+ // the loop below go out of bounds.
+ if (Inv == CxtI)
+ return false;
+
+ // The context comes first, but they're both in the same block.
+ // Make sure there is nothing in between that might interrupt
+ // the control flow, not even CxtI itself.
+ for (BasicBlock::const_iterator I(CxtI), IE(Inv); I != IE; ++I)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
+ return false;
+
+ return !isEphemeralValueOf(Inv, CxtI);
+ }
+
+ // Inv and CxtI are in different blocks.
if (DT) {
if (DT->dominates(Inv, CxtI))
return true;
@@ -543,37 +659,7 @@ bool llvm::isValidAssumeForContext(const Instruction *Inv,
return true;
}
- // With or without a DT, the only remaining case we will check is if the
- // instructions are in the same BB. Give up if that is not the case.
- if (Inv->getParent() != CxtI->getParent())
- return false;
-
- // If we have a dom tree, then we now know that the assume doesn't dominate
- // the other instruction. If we don't have a dom tree then we can check if
- // the assume is first in the BB.
- if (!DT) {
- // Search forward from the assume until we reach the context (or the end
- // of the block); the common case is that the assume will come first.
- for (auto I = std::next(BasicBlock::const_iterator(Inv)),
- IE = Inv->getParent()->end(); I != IE; ++I)
- if (&*I == CxtI)
- return true;
- }
-
- // Don't let an assume affect itself - this would cause the problems
- // `isEphemeralValueOf` is trying to prevent, and it would also make
- // the loop below go out of bounds.
- if (Inv == CxtI)
- return false;
-
- // The context comes first, but they're both in the same block.
- // Make sure there is nothing in between that might interrupt
- // the control flow, not even CxtI itself.
- for (BasicBlock::const_iterator I(CxtI), IE(Inv); I != IE; ++I)
- if (!isGuaranteedToTransferExecutionToSuccessor(&*I))
- return false;
-
- return !isEphemeralValueOf(Inv, CxtI);
+ return false;
}
static bool isKnownNonZeroFromAssume(const Value *V, const Query &Q) {
@@ -592,10 +678,6 @@ static bool isKnownNonZeroFromAssume(const Value *V, const Query &Q) {
CmpInst::Predicate Pred;
if (!match(Cmp, m_c_ICmp(Pred, m_V, m_Value(RHS))))
return false;
- // Canonicalize 'v' to be on the LHS of the comparison.
- if (Cmp->getOperand(1) != RHS)
- Pred = CmpInst::getSwappedPredicate(Pred);
-
// assume(v u> y) -> assume(v != 0)
if (Pred == ICmpInst::ICMP_UGT)
return true;
@@ -615,6 +697,16 @@ static bool isKnownNonZeroFromAssume(const Value *V, const Query &Q) {
return !TrueValues.contains(APInt::getNullValue(CI->getBitWidth()));
};
+ if (Q.CxtI && V->getType()->isPointerTy()) {
+ SmallVector<Attribute::AttrKind, 2> AttrKinds{Attribute::NonNull};
+ if (!NullPointerIsDefined(Q.CxtI->getFunction(),
+ V->getType()->getPointerAddressSpace()))
+ AttrKinds.push_back(Attribute::Dereferenceable);
+
+ if (getKnowledgeValidInContext(V, AttrKinds, Q.CxtI, Q.DT, Q.AC))
+ return true;
+ }
+
for (auto &AssumeVH : Q.AC->assumptionsFor(V)) {
if (!AssumeVH)
continue;
@@ -693,6 +785,7 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
if (!Cmp)
continue;
+ // Note that ptrtoint may change the bitwidth.
Value *A, *B;
auto m_V = m_CombineOr(m_Specific(V), m_PtrToInt(m_Specific(V)));
@@ -705,18 +798,18 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v = a)
if (match(Cmp, m_c_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
Known.Zero |= RHSKnown.Zero;
Known.One |= RHSKnown.One;
// assume(v & b = a)
} else if (match(Cmp,
m_c_ICmp(Pred, m_c_And(m_V, m_Value(B)), m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- KnownBits MaskKnown(BitWidth);
- computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+ KnownBits MaskKnown =
+ computeKnownBits(B, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in the mask that are known to be one, we can propagate
// known bits from the RHS to V.
@@ -726,10 +819,10 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Not(m_c_And(m_V, m_Value(B))),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- KnownBits MaskKnown(BitWidth);
- computeKnownBits(B, MaskKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+ KnownBits MaskKnown =
+ computeKnownBits(B, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in the mask that are known to be one, we can propagate
// inverted known bits from the RHS to V.
@@ -739,10 +832,10 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp,
m_c_ICmp(Pred, m_c_Or(m_V, m_Value(B)), m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- KnownBits BKnown(BitWidth);
- computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+ KnownBits BKnown =
+ computeKnownBits(B, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V.
@@ -752,10 +845,10 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Not(m_c_Or(m_V, m_Value(B))),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- KnownBits BKnown(BitWidth);
- computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+ KnownBits BKnown =
+ computeKnownBits(B, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V.
@@ -765,10 +858,10 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp,
m_c_ICmp(Pred, m_c_Xor(m_V, m_Value(B)), m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- KnownBits BKnown(BitWidth);
- computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+ KnownBits BKnown =
+ computeKnownBits(B, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in B that are known to be zero, we can propagate known
// bits from the RHS to V. For those bits in B that are known to be one,
@@ -781,10 +874,10 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Not(m_c_Xor(m_V, m_Value(B))),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
- KnownBits BKnown(BitWidth);
- computeKnownBits(B, BKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+ KnownBits BKnown =
+ computeKnownBits(B, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in B that are known to be zero, we can propagate
// inverted known bits from the RHS to V. For those bits in B that are
@@ -797,8 +890,9 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Shl(m_V, m_ConstantInt(C)),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
+
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
RHSKnown.Zero.lshrInPlace(C);
@@ -809,8 +903,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Not(m_Shl(m_V, m_ConstantInt(C))),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
RHSKnown.One.lshrInPlace(C);
@@ -821,8 +915,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Shr(m_V, m_ConstantInt(C)),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in RHS that are known, we can propagate them to known
// bits in V shifted to the right by C.
Known.Zero |= RHSKnown.Zero << C;
@@ -831,8 +925,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
} else if (match(Cmp, m_c_ICmp(Pred, m_Not(m_Shr(m_V, m_ConstantInt(C))),
m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT) && C < BitWidth) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// For those bits in RHS that are known, we can propagate them inverted
// to known bits in V shifted to the right by C.
Known.Zero |= RHSKnown.One << C;
@@ -843,8 +937,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v >=_s c) where c is non-negative
if (match(Cmp, m_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth + 1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth + 1, Query(Q, I)).anyextOrTrunc(BitWidth);
if (RHSKnown.isNonNegative()) {
// We know that the sign bit is zero.
@@ -856,8 +950,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v >_s c) where c is at least -1.
if (match(Cmp, m_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth + 1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth + 1, Query(Q, I)).anyextOrTrunc(BitWidth);
if (RHSKnown.isAllOnes() || RHSKnown.isNonNegative()) {
// We know that the sign bit is zero.
@@ -869,8 +963,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v <=_s c) where c is negative
if (match(Cmp, m_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth + 1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth + 1, Query(Q, I)).anyextOrTrunc(BitWidth);
if (RHSKnown.isNegative()) {
// We know that the sign bit is one.
@@ -882,8 +976,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v <_s c) where c is non-positive
if (match(Cmp, m_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
if (RHSKnown.isZero() || RHSKnown.isNegative()) {
// We know that the sign bit is one.
@@ -895,8 +989,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v <=_u c)
if (match(Cmp, m_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// Whatever high bits in c are zero are known to be zero.
Known.Zero.setHighBits(RHSKnown.countMinLeadingZeros());
@@ -906,8 +1000,8 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
// assume(v <_u c)
if (match(Cmp, m_ICmp(Pred, m_V, m_Value(A))) &&
isValidAssumeForContext(I, Q.CxtI, Q.DT)) {
- KnownBits RHSKnown(BitWidth);
- computeKnownBits(A, RHSKnown, Depth+1, Query(Q, I));
+ KnownBits RHSKnown =
+ computeKnownBits(A, Depth+1, Query(Q, I)).anyextOrTrunc(BitWidth);
// If the RHS is known zero, then this assumption must be wrong (nothing
// is unsigned less than zero). Signal a conflict and get out of here.
@@ -957,16 +1051,17 @@ static void computeKnownBitsFromAssume(const Value *V, KnownBits &Known,
/// amount. The results from calling KZF and KOF are conservatively combined for
/// all permitted shift amounts.
static void computeKnownBitsFromShiftOperator(
- const Operator *I, KnownBits &Known, KnownBits &Known2,
- unsigned Depth, const Query &Q,
+ const Operator *I, const APInt &DemandedElts, KnownBits &Known,
+ KnownBits &Known2, unsigned Depth, const Query &Q,
function_ref<APInt(const APInt &, unsigned)> KZF,
function_ref<APInt(const APInt &, unsigned)> KOF) {
unsigned BitWidth = Known.getBitWidth();
- if (auto *SA = dyn_cast<ConstantInt>(I->getOperand(1))) {
- unsigned ShiftAmt = SA->getLimitedValue(BitWidth-1);
+ computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q);
+ if (Known.isConstant()) {
+ unsigned ShiftAmt = Known.getConstant().getLimitedValue(BitWidth - 1);
- computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known, Depth + 1, Q);
Known.Zero = KZF(Known.Zero, ShiftAmt);
Known.One = KOF(Known.One, ShiftAmt);
// If the known bits conflict, this must be an overflowing left shift, so
@@ -978,11 +1073,10 @@ static void computeKnownBitsFromShiftOperator(
return;
}
- computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
-
// If the shift amount could be greater than or equal to the bit-width of the
// LHS, the value could be poison, but bail out because the check below is
- // expensive. TODO: Should we just carry on?
+ // expensive.
+ // TODO: Should we just carry on?
if (Known.getMaxValue().uge(BitWidth)) {
Known.resetAll();
return;
@@ -1006,12 +1100,13 @@ static void computeKnownBitsFromShiftOperator(
// Early exit if we can't constrain any well-defined shift amount.
if (!(ShiftAmtKZ & (PowerOf2Ceil(BitWidth) - 1)) &&
!(ShiftAmtKO & (PowerOf2Ceil(BitWidth) - 1))) {
- ShifterOperandIsNonZero = isKnownNonZero(I->getOperand(1), Depth + 1, Q);
+ ShifterOperandIsNonZero =
+ isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q);
if (!*ShifterOperandIsNonZero)
return;
}
- computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
Known.Zero.setAllBits();
Known.One.setAllBits();
@@ -1028,7 +1123,7 @@ static void computeKnownBitsFromShiftOperator(
if (ShiftAmt == 0) {
if (!ShifterOperandIsNonZero.hasValue())
ShifterOperandIsNonZero =
- isKnownNonZero(I->getOperand(1), Depth + 1, Q);
+ isKnownNonZero(I->getOperand(1), DemandedElts, Depth + 1, Q);
if (*ShifterOperandIsNonZero)
continue;
}
@@ -1043,11 +1138,13 @@ static void computeKnownBitsFromShiftOperator(
Known.setAllZero();
}
-static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
- unsigned Depth, const Query &Q) {
+static void computeKnownBitsFromOperator(const Operator *I,
+ const APInt &DemandedElts,
+ KnownBits &Known, unsigned Depth,
+ const Query &Q) {
unsigned BitWidth = Known.getBitWidth();
- KnownBits Known2(Known);
+ KnownBits Known2(BitWidth);
switch (I->getOpcode()) {
default: break;
case Instruction::Load:
@@ -1057,13 +1154,10 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
break;
case Instruction::And: {
// If either the LHS or the RHS are Zero, the result is zero.
- computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
- // Output known-1 bits are only known if set in both the LHS & RHS.
- Known.One &= Known2.One;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- Known.Zero |= Known2.Zero;
+ Known &= Known2;
// and(x, add (x, -1)) is a common idiom that always clears the low bit;
// here we handle the more general case of adding any odd number by
@@ -1074,36 +1168,28 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (!Known.Zero[0] && !Known.One[0] &&
match(I, m_c_BinOp(m_Value(X), m_Add(m_Deferred(X), m_Value(Y))))) {
Known2.resetAll();
- computeKnownBits(Y, Known2, Depth + 1, Q);
+ computeKnownBits(Y, DemandedElts, Known2, Depth + 1, Q);
if (Known2.countMinTrailingOnes() > 0)
Known.Zero.setBit(0);
}
break;
}
case Instruction::Or:
- computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
- case Instruction::Xor: {
- computeKnownBits(I->getOperand(1), Known, Depth + 1, Q);
- computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ case Instruction::Xor:
+ computeKnownBits(I->getOperand(1), DemandedElts, Known, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
- Known.Zero = std::move(KnownZeroOut);
+ Known ^= Known2;
break;
- }
case Instruction::Mul: {
bool NSW = Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(I));
- computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, Known,
- Known2, Depth, Q);
+ computeKnownBitsMul(I->getOperand(0), I->getOperand(1), NSW, DemandedElts,
+ Known, Known2, Depth, Q);
break;
}
case Instruction::UDiv: {
@@ -1207,9 +1293,9 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
Q.DL.getTypeSizeInBits(ScalarTy);
assert(SrcBitWidth && "SrcBitWidth can't be zero");
- Known = Known.zextOrTrunc(SrcBitWidth, false);
+ Known = Known.anyextOrTrunc(SrcBitWidth);
computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
- Known = Known.zextOrTrunc(BitWidth, true /* ExtendedBitsAreKnownZero */);
+ Known = Known.zextOrTrunc(BitWidth);
break;
}
case Instruction::BitCast: {
@@ -1254,7 +1340,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
return KOResult;
};
- computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
+ computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
+ KZF, KOF);
break;
}
case Instruction::LShr: {
@@ -1270,7 +1357,8 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
return KnownOne.lshr(ShiftAmt);
};
- computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
+ computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
+ KZF, KOF);
break;
}
case Instruction::AShr: {
@@ -1283,19 +1371,20 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
return KnownOne.ashr(ShiftAmt);
};
- computeKnownBitsFromShiftOperator(I, Known, Known2, Depth, Q, KZF, KOF);
+ computeKnownBitsFromShiftOperator(I, DemandedElts, Known, Known2, Depth, Q,
+ KZF, KOF);
break;
}
case Instruction::Sub: {
bool NSW = Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(I));
computeKnownBitsAddSub(false, I->getOperand(0), I->getOperand(1), NSW,
- Known, Known2, Depth, Q);
+ DemandedElts, Known, Known2, Depth, Q);
break;
}
case Instruction::Add: {
bool NSW = Q.IIQ.hasNoSignedWrap(cast<OverflowingBinaryOperator>(I));
computeKnownBitsAddSub(true, I->getOperand(0), I->getOperand(1), NSW,
- Known, Known2, Depth, Q);
+ DemandedElts, Known, Known2, Depth, Q);
break;
}
case Instruction::SRem:
@@ -1355,17 +1444,9 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
Known.Zero.setHighBits(Leaders);
break;
}
-
- case Instruction::Alloca: {
- const AllocaInst *AI = cast<AllocaInst>(I);
- unsigned Align = AI->getAlignment();
- if (Align == 0)
- Align = Q.DL.getABITypeAlignment(AI->getAllocatedType());
-
- if (Align > 0)
- Known.Zero.setLowBits(countTrailingZeros(Align));
+ case Instruction::Alloca:
+ Known.Zero.setLowBits(Log2(cast<AllocaInst>(I)->getAlign()));
break;
- }
case Instruction::GetElementPtr: {
// Analyze all of the subscripts of this getelementptr instruction
// to determine if we can prove known low zero bits.
@@ -1375,6 +1456,10 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
gep_type_iterator GTI = gep_type_begin(I);
for (unsigned i = 1, e = I->getNumOperands(); i != e; ++i, ++GTI) {
+ // TrailZ can only become smaller, short-circuit if we hit zero.
+ if (TrailZ == 0)
+ break;
+
Value *Index = I->getOperand(i);
if (StructType *STy = GTI.getStructTypeOrNull()) {
// Handle struct member offset arithmetic.
@@ -1400,7 +1485,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
break;
}
unsigned GEPOpiBits = Index->getType()->getScalarSizeInBits();
- uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy);
+ uint64_t TypeSize = Q.DL.getTypeAllocSize(IndexedTy).getKnownMinSize();
LocalKnown.Zero = LocalKnown.One = APInt(GEPOpiBits, 0);
computeKnownBits(Index, LocalKnown, Depth + 1, Q);
TrailZ = std::min(TrailZ,
@@ -1457,7 +1542,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
computeKnownBits(R, Known2, Depth + 1, RecQ);
// We need to take the minimum number of known bits
- KnownBits Known3(Known);
+ KnownBits Known3(BitWidth);
RecQ.CxtI = LInst;
computeKnownBits(L, Known3, Depth + 1, RecQ);
@@ -1549,7 +1634,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (MDNode *MD =
Q.IIQ.getMetadata(cast<Instruction>(I), LLVMContext::MD_range))
computeKnownBitsFromRangeMetadata(*MD, Known);
- if (const Value *RV = ImmutableCallSite(I).getReturnedArgOperand()) {
+ if (const Value *RV = cast<CallBase>(I)->getReturnedArgOperand()) {
computeKnownBits(RV, Known2, Depth + 1, Q);
Known.Zero |= Known2.Zero;
Known.One |= Known2.One;
@@ -1558,12 +1643,12 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
switch (II->getIntrinsicID()) {
default: break;
case Intrinsic::bitreverse:
- computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
Known.Zero |= Known2.Zero.reverseBits();
Known.One |= Known2.One.reverseBits();
break;
case Intrinsic::bswap:
- computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
+ computeKnownBits(I->getOperand(0), DemandedElts, Known2, Depth + 1, Q);
Known.Zero |= Known2.Zero.byteSwap();
Known.One |= Known2.One.byteSwap();
break;
@@ -1611,7 +1696,7 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
if (II->getIntrinsicID() == Intrinsic::fshr)
ShiftAmt = BitWidth - ShiftAmt;
- KnownBits Known3(Known);
+ KnownBits Known3(BitWidth);
computeKnownBits(I->getOperand(0), Known2, Depth + 1, Q);
computeKnownBits(I->getOperand(1), Known3, Depth + 1, Q);
@@ -1658,13 +1743,85 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
}
}
break;
- case Instruction::ExtractElement:
- // Look through extract element. At the moment we keep this simple and skip
- // tracking the specific element. But at least we might find information
- // valid for all elements of the vector (for example if vector is sign
- // extended, shifted, etc).
- computeKnownBits(I->getOperand(0), Known, Depth + 1, Q);
+ case Instruction::ShuffleVector: {
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(I);
+ // FIXME: Do we need to handle ConstantExpr involving shufflevectors?
+ if (!Shuf) {
+ Known.resetAll();
+ return;
+ }
+ // For undef elements, we don't know anything about the common state of
+ // the shuffle result.
+ APInt DemandedLHS, DemandedRHS;
+ if (!getShuffleDemandedElts(Shuf, DemandedElts, DemandedLHS, DemandedRHS)) {
+ Known.resetAll();
+ return;
+ }
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ if (!!DemandedLHS) {
+ const Value *LHS = Shuf->getOperand(0);
+ computeKnownBits(LHS, DemandedLHS, Known, Depth + 1, Q);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ }
+ if (!!DemandedRHS) {
+ const Value *RHS = Shuf->getOperand(1);
+ computeKnownBits(RHS, DemandedRHS, Known2, Depth + 1, Q);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ }
+ break;
+ }
+ case Instruction::InsertElement: {
+ const Value *Vec = I->getOperand(0);
+ const Value *Elt = I->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantInt>(I->getOperand(2));
+ // Early out if the index is non-constant or out-of-range.
+ unsigned NumElts = DemandedElts.getBitWidth();
+ if (!CIdx || CIdx->getValue().uge(NumElts)) {
+ Known.resetAll();
+ return;
+ }
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ unsigned EltIdx = CIdx->getZExtValue();
+ // Do we demand the inserted element?
+ if (DemandedElts[EltIdx]) {
+ computeKnownBits(Elt, Known, Depth + 1, Q);
+ // If we don't know any bits, early out.
+ if (Known.isUnknown())
+ break;
+ }
+ // We don't need the base vector element that has been inserted.
+ APInt DemandedVecElts = DemandedElts;
+ DemandedVecElts.clearBit(EltIdx);
+ if (!!DemandedVecElts) {
+ computeKnownBits(Vec, DemandedVecElts, Known2, Depth + 1, Q);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ }
break;
+ }
+ case Instruction::ExtractElement: {
+ // Look through extract element. If the index is non-constant or
+ // out-of-range, demand all elements; otherwise just the extracted element.
+ const Value *Vec = I->getOperand(0);
+ const Value *Idx = I->getOperand(1);
+ auto *CIdx = dyn_cast<ConstantInt>(Idx);
+ if (isa<ScalableVectorType>(Vec->getType())) {
+ // FIXME: there's probably *something* we can do with scalable vectors
+ Known.resetAll();
+ break;
+ }
+ unsigned NumElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ APInt DemandedVecElts = APInt::getAllOnesValue(NumElts);
+ if (CIdx && CIdx->getValue().ult(NumElts))
+ DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue());
+ computeKnownBits(Vec, DemandedVecElts, Known, Depth + 1, Q);
+ break;
+ }
case Instruction::ExtractValue:
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I->getOperand(0))) {
const ExtractValueInst *EVI = cast<ExtractValueInst>(I);
@@ -1675,28 +1832,38 @@ static void computeKnownBitsFromOperator(const Operator *I, KnownBits &Known,
case Intrinsic::uadd_with_overflow:
case Intrinsic::sadd_with_overflow:
computeKnownBitsAddSub(true, II->getArgOperand(0),
- II->getArgOperand(1), false, Known, Known2,
- Depth, Q);
+ II->getArgOperand(1), false, DemandedElts,
+ Known, Known2, Depth, Q);
break;
case Intrinsic::usub_with_overflow:
case Intrinsic::ssub_with_overflow:
computeKnownBitsAddSub(false, II->getArgOperand(0),
- II->getArgOperand(1), false, Known, Known2,
- Depth, Q);
+ II->getArgOperand(1), false, DemandedElts,
+ Known, Known2, Depth, Q);
break;
case Intrinsic::umul_with_overflow:
case Intrinsic::smul_with_overflow:
computeKnownBitsMul(II->getArgOperand(0), II->getArgOperand(1), false,
- Known, Known2, Depth, Q);
+ DemandedElts, Known, Known2, Depth, Q);
break;
}
}
}
+ break;
}
}
/// Determine which bits of V are known to be either zero or one and return
/// them.
+KnownBits computeKnownBits(const Value *V, const APInt &DemandedElts,
+ unsigned Depth, const Query &Q) {
+ KnownBits Known(getBitWidth(V->getType(), Q.DL));
+ computeKnownBits(V, DemandedElts, Known, Depth, Q);
+ return Known;
+}
+
+/// Determine which bits of V are known to be either zero or one and return
+/// them.
KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
KnownBits Known(getBitWidth(V->getType(), Q.DL));
computeKnownBits(V, Known, Depth, Q);
@@ -1717,23 +1884,44 @@ KnownBits computeKnownBits(const Value *V, unsigned Depth, const Query &Q) {
/// type, and vectors of integers. In the case
/// where V is a vector, known zero, and known one values are the
/// same width as the vector element, and the bit is set only if it is true
-/// for all of the elements in the vector.
-void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
- const Query &Q) {
+/// for all of the demanded elements in the vector specified by DemandedElts.
+void computeKnownBits(const Value *V, const APInt &DemandedElts,
+ KnownBits &Known, unsigned Depth, const Query &Q) {
+ if (!DemandedElts || isa<ScalableVectorType>(V->getType())) {
+ // No demanded elts or V is a scalable vector, better to assume we don't
+ // know anything.
+ Known.resetAll();
+ return;
+ }
+
assert(V && "No Value?");
assert(Depth <= MaxDepth && "Limit Search Depth");
+
+#ifndef NDEBUG
+ Type *Ty = V->getType();
unsigned BitWidth = Known.getBitWidth();
- assert((V->getType()->isIntOrIntVectorTy(BitWidth) ||
- V->getType()->isPtrOrPtrVectorTy()) &&
+ assert((Ty->isIntOrIntVectorTy(BitWidth) || Ty->isPtrOrPtrVectorTy()) &&
"Not integer or pointer type!");
- Type *ScalarTy = V->getType()->getScalarType();
- unsigned ExpectedWidth = ScalarTy->isPointerTy() ?
- Q.DL.getPointerTypeSizeInBits(ScalarTy) : Q.DL.getTypeSizeInBits(ScalarTy);
- assert(ExpectedWidth == BitWidth && "V and Known should have same BitWidth");
- (void)BitWidth;
- (void)ExpectedWidth;
+ if (auto *FVTy = dyn_cast<FixedVectorType>(Ty)) {
+ assert(
+ FVTy->getNumElements() == DemandedElts.getBitWidth() &&
+ "DemandedElt width should equal the fixed vector number of elements");
+ } else {
+ assert(DemandedElts == APInt(1, 1) &&
+ "DemandedElt width should be 1 for scalars");
+ }
+
+ Type *ScalarTy = Ty->getScalarType();
+ if (ScalarTy->isPointerTy()) {
+ assert(BitWidth == Q.DL.getPointerTypeSizeInBits(ScalarTy) &&
+ "V and Known should have same BitWidth");
+ } else {
+ assert(BitWidth == Q.DL.getTypeSizeInBits(ScalarTy) &&
+ "V and Known should have same BitWidth");
+ }
+#endif
const APInt *C;
if (match(V, m_APInt(C))) {
@@ -1749,12 +1937,14 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
}
// Handle a constant vector by taking the intersection of the known bits of
// each element.
- if (const ConstantDataSequential *CDS = dyn_cast<ConstantDataSequential>(V)) {
- // We know that CDS must be a vector of integers. Take the intersection of
+ if (const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(V)) {
+ // We know that CDV must be a vector of integers. Take the intersection of
// each element.
Known.Zero.setAllBits(); Known.One.setAllBits();
- for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i) {
- APInt Elt = CDS->getElementAsAPInt(i);
+ for (unsigned i = 0, e = CDV->getNumElements(); i != e; ++i) {
+ if (!DemandedElts[i])
+ continue;
+ APInt Elt = CDV->getElementAsAPInt(i);
Known.Zero &= ~Elt;
Known.One &= Elt;
}
@@ -1766,6 +1956,8 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
// each element.
Known.Zero.setAllBits(); Known.One.setAllBits();
for (unsigned i = 0, e = CV->getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i])
+ continue;
Constant *Element = CV->getAggregateElement(i);
auto *ElementCI = dyn_cast_or_null<ConstantInt>(Element);
if (!ElementCI) {
@@ -1804,13 +1996,12 @@ void computeKnownBits(const Value *V, KnownBits &Known, unsigned Depth,
}
if (const Operator *I = dyn_cast<Operator>(V))
- computeKnownBitsFromOperator(I, Known, Depth, Q);
+ computeKnownBitsFromOperator(I, DemandedElts, Known, Depth, Q);
// Aligned pointers have trailing zeros - refine Known.Zero set
- if (V->getType()->isPointerTy()) {
- const MaybeAlign Align = V->getPointerAlignment(Q.DL);
- if (Align)
- Known.Zero.setLowBits(countTrailingZeros(Align->value()));
+ if (isa<PointerType>(V->getType())) {
+ Align Alignment = V->getPointerAlignment(Q.DL);
+ Known.Zero.setLowBits(countTrailingZeros(Alignment.value()));
}
// computeKnownBitsFromAssume strictly refines Known.
@@ -1960,7 +2151,7 @@ static bool isGEPKnownNonNull(const GEPOperator *GEP, unsigned Depth,
}
// If we have a zero-sized type, the index doesn't matter. Keep looping.
- if (Q.DL.getTypeAllocSize(GTI.getIndexedType()) == 0)
+ if (Q.DL.getTypeAllocSize(GTI.getIndexedType()).getKnownMinSize() == 0)
continue;
// Fast path the constant operand case both for efficiency and so we don't
@@ -2004,11 +2195,11 @@ static bool isKnownNonNullFromDominatingCondition(const Value *V,
// If the value is used as an argument to a call or invoke, then argument
// attributes may provide an answer about null-ness.
- if (auto CS = ImmutableCallSite(U))
- if (auto *CalledFunc = CS.getCalledFunction())
+ if (const auto *CB = dyn_cast<CallBase>(U))
+ if (auto *CalledFunc = CB->getCalledFunction())
for (const Argument &Arg : CalledFunc->args())
- if (CS.getArgOperand(Arg.getArgNo()) == V &&
- Arg.hasNonNullAttr() && DT->dominates(CS.getInstruction(), CtxI))
+ if (CB->getArgOperand(Arg.getArgNo()) == V &&
+ Arg.hasNonNullAttr() && DT->dominates(CB, CtxI))
return true;
// If the value is used as a load/store, then the pointer must be non null.
@@ -2088,12 +2279,18 @@ static bool rangeMetadataExcludesValue(const MDNode* Ranges, const APInt& Value)
}
/// Return true if the given value is known to be non-zero when defined. For
-/// vectors, return true if every element is known to be non-zero when
+/// vectors, return true if every demanded element is known to be non-zero when
/// defined. For pointers, if the context instruction and dominator tree are
/// specified, perform context-sensitive analysis and return true if the
/// pointer couldn't possibly be null at the specified instruction.
/// Supports values with integer or pointer type and vectors of integers.
-bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
+bool isKnownNonZero(const Value *V, const APInt &DemandedElts, unsigned Depth,
+ const Query &Q) {
+ // FIXME: We currently have no way to represent the DemandedElts of a scalable
+ // vector
+ if (isa<ScalableVectorType>(V->getType()))
+ return false;
+
if (auto *C = dyn_cast<Constant>(V)) {
if (C->isNullValue())
return false;
@@ -2112,8 +2309,10 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// For constant vectors, check that all elements are undefined or known
// non-zero to determine that the whole vector is known non-zero.
- if (auto *VecTy = dyn_cast<VectorType>(C->getType())) {
+ if (auto *VecTy = dyn_cast<FixedVectorType>(C->getType())) {
for (unsigned i = 0, e = VecTy->getNumElements(); i != e; ++i) {
+ if (!DemandedElts[i])
+ continue;
Constant *Elt = C->getAggregateElement(i);
if (!Elt || Elt->isNullValue())
return false;
@@ -2161,7 +2360,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// A byval, inalloca, or nonnull argument is never null.
if (const Argument *A = dyn_cast<Argument>(V))
- if (A->hasByValOrInAllocaAttr() || A->hasNonNullAttr())
+ if (A->hasPassPointeeByValueAttr() || A->hasNonNullAttr())
return true;
// A Load tagged with nonnull metadata is never null.
@@ -2214,7 +2413,8 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// X | Y != 0 if X != 0 or Y != 0.
Value *X = nullptr, *Y = nullptr;
if (match(V, m_Or(m_Value(X), m_Value(Y))))
- return isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q);
+ return isKnownNonZero(X, DemandedElts, Depth, Q) ||
+ isKnownNonZero(Y, DemandedElts, Depth, Q);
// ext X != 0 if X != 0.
if (isa<SExtInst>(V) || isa<ZExtInst>(V))
@@ -2229,7 +2429,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
return isKnownNonZero(X, Depth, Q);
KnownBits Known(BitWidth);
- computeKnownBits(X, Known, Depth, Q);
+ computeKnownBits(X, DemandedElts, Known, Depth, Q);
if (Known.One[0])
return true;
}
@@ -2241,7 +2441,7 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (BO->isExact())
return isKnownNonZero(X, Depth, Q);
- KnownBits Known = computeKnownBits(X, Depth, Q);
+ KnownBits Known = computeKnownBits(X, DemandedElts, Depth, Q);
if (Known.isNegative())
return true;
@@ -2255,22 +2455,23 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
return true;
// Are all the bits to be shifted out known zero?
if (Known.countMinTrailingZeros() >= ShiftVal)
- return isKnownNonZero(X, Depth, Q);
+ return isKnownNonZero(X, DemandedElts, Depth, Q);
}
}
// div exact can only produce a zero if the dividend is zero.
else if (match(V, m_Exact(m_IDiv(m_Value(X), m_Value())))) {
- return isKnownNonZero(X, Depth, Q);
+ return isKnownNonZero(X, DemandedElts, Depth, Q);
}
// X + Y.
else if (match(V, m_Add(m_Value(X), m_Value(Y)))) {
- KnownBits XKnown = computeKnownBits(X, Depth, Q);
- KnownBits YKnown = computeKnownBits(Y, Depth, Q);
+ KnownBits XKnown = computeKnownBits(X, DemandedElts, Depth, Q);
+ KnownBits YKnown = computeKnownBits(Y, DemandedElts, Depth, Q);
// If X and Y are both non-negative (as signed values) then their sum is not
// zero unless both X and Y are zero.
if (XKnown.isNonNegative() && YKnown.isNonNegative())
- if (isKnownNonZero(X, Depth, Q) || isKnownNonZero(Y, Depth, Q))
+ if (isKnownNonZero(X, DemandedElts, Depth, Q) ||
+ isKnownNonZero(Y, DemandedElts, Depth, Q))
return true;
// If X and Y are both negative (as signed values) then their sum is not
@@ -2301,13 +2502,14 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
// If X and Y are non-zero then so is X * Y as long as the multiplication
// does not overflow.
if ((Q.IIQ.hasNoSignedWrap(BO) || Q.IIQ.hasNoUnsignedWrap(BO)) &&
- isKnownNonZero(X, Depth, Q) && isKnownNonZero(Y, Depth, Q))
+ isKnownNonZero(X, DemandedElts, Depth, Q) &&
+ isKnownNonZero(Y, DemandedElts, Depth, Q))
return true;
}
// (C ? X : Y) != 0 if X != 0 and Y != 0.
else if (const SelectInst *SI = dyn_cast<SelectInst>(V)) {
- if (isKnownNonZero(SI->getTrueValue(), Depth, Q) &&
- isKnownNonZero(SI->getFalseValue(), Depth, Q))
+ if (isKnownNonZero(SI->getTrueValue(), DemandedElts, Depth, Q) &&
+ isKnownNonZero(SI->getFalseValue(), DemandedElts, Depth, Q))
return true;
}
// PHI
@@ -2337,12 +2539,35 @@ bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
if (AllNonZeroConstants)
return true;
}
+ // ExtractElement
+ else if (const auto *EEI = dyn_cast<ExtractElementInst>(V)) {
+ const Value *Vec = EEI->getVectorOperand();
+ const Value *Idx = EEI->getIndexOperand();
+ auto *CIdx = dyn_cast<ConstantInt>(Idx);
+ unsigned NumElts = cast<FixedVectorType>(Vec->getType())->getNumElements();
+ APInt DemandedVecElts = APInt::getAllOnesValue(NumElts);
+ if (CIdx && CIdx->getValue().ult(NumElts))
+ DemandedVecElts = APInt::getOneBitSet(NumElts, CIdx->getZExtValue());
+ return isKnownNonZero(Vec, DemandedVecElts, Depth, Q);
+ }
KnownBits Known(BitWidth);
- computeKnownBits(V, Known, Depth, Q);
+ computeKnownBits(V, DemandedElts, Known, Depth, Q);
return Known.One != 0;
}
+bool isKnownNonZero(const Value *V, unsigned Depth, const Query &Q) {
+ // FIXME: We currently have no way to represent the DemandedElts of a scalable
+ // vector
+ if (isa<ScalableVectorType>(V->getType()))
+ return false;
+
+ auto *FVTy = dyn_cast<FixedVectorType>(V->getType());
+ APInt DemandedElts =
+ FVTy ? APInt::getAllOnesValue(FVTy->getNumElements()) : APInt(1, 1);
+ return isKnownNonZero(V, DemandedElts, Depth, Q);
+}
+
/// Return true if V2 == V1 + X, where X is known non-zero.
static bool isAddOfNonZero(const Value *V1, const Value *V2, const Query &Q) {
const BinaryOperator *BO = dyn_cast<BinaryOperator>(V1);
@@ -2433,14 +2658,17 @@ static bool isSignedMinMaxClamp(const Value *Select, const Value *&In,
/// or if any element was not analyzed; otherwise, return the count for the
/// element with the minimum number of sign bits.
static unsigned computeNumSignBitsVectorConstant(const Value *V,
+ const APInt &DemandedElts,
unsigned TyBits) {
const auto *CV = dyn_cast<Constant>(V);
- if (!CV || !CV->getType()->isVectorTy())
+ if (!CV || !isa<FixedVectorType>(CV->getType()))
return 0;
unsigned MinSignBits = TyBits;
- unsigned NumElts = CV->getType()->getVectorNumElements();
+ unsigned NumElts = cast<FixedVectorType>(CV->getType())->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
+ if (!DemandedElts[i])
+ continue;
// If we find a non-ConstantInt, bail out.
auto *Elt = dyn_cast_or_null<ConstantInt>(CV->getAggregateElement(i));
if (!Elt)
@@ -2452,12 +2680,13 @@ static unsigned computeNumSignBitsVectorConstant(const Value *V,
return MinSignBits;
}
-static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
- const Query &Q);
+static unsigned ComputeNumSignBitsImpl(const Value *V,
+ const APInt &DemandedElts,
+ unsigned Depth, const Query &Q);
-static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
- const Query &Q) {
- unsigned Result = ComputeNumSignBitsImpl(V, Depth, Q);
+static unsigned ComputeNumSignBits(const Value *V, const APInt &DemandedElts,
+ unsigned Depth, const Query &Q) {
+ unsigned Result = ComputeNumSignBitsImpl(V, DemandedElts, Depth, Q);
assert(Result > 0 && "At least one sign bit needs to be present!");
return Result;
}
@@ -2467,16 +2696,36 @@ static unsigned ComputeNumSignBits(const Value *V, unsigned Depth,
/// (itself), but other cases can give us information. For example, immediately
/// after an "ashr X, 2", we know that the top 3 bits are all equal to each
/// other, so we return 3. For vectors, return the number of sign bits for the
-/// vector element with the minimum number of known sign bits.
-static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
- const Query &Q) {
+/// vector element with the minimum number of known sign bits of the demanded
+/// elements in the vector specified by DemandedElts.
+static unsigned ComputeNumSignBitsImpl(const Value *V,
+ const APInt &DemandedElts,
+ unsigned Depth, const Query &Q) {
+ Type *Ty = V->getType();
+
+ // FIXME: We currently have no way to represent the DemandedElts of a scalable
+ // vector
+ if (isa<ScalableVectorType>(Ty))
+ return 1;
+
+#ifndef NDEBUG
assert(Depth <= MaxDepth && "Limit Search Depth");
+ if (auto *FVTy = dyn_cast<FixedVectorType>(Ty)) {
+ assert(
+ FVTy->getNumElements() == DemandedElts.getBitWidth() &&
+ "DemandedElt width should equal the fixed vector number of elements");
+ } else {
+ assert(DemandedElts == APInt(1, 1) &&
+ "DemandedElt width should be 1 for scalars");
+ }
+#endif
+
// We return the minimum number of sign bits that are guaranteed to be present
// in V, so for undef we have to conservatively return 1. We don't have the
// same behavior for poison though -- that's a FIXME today.
- Type *ScalarTy = V->getType()->getScalarType();
+ Type *ScalarTy = Ty->getScalarType();
unsigned TyBits = ScalarTy->isPointerTy() ?
Q.DL.getPointerTypeSizeInBits(ScalarTy) :
Q.DL.getTypeSizeInBits(ScalarTy);
@@ -2702,40 +2951,37 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
return ComputeNumSignBits(U->getOperand(0), Depth + 1, Q);
case Instruction::ShuffleVector: {
- // TODO: This is copied almost directly from the SelectionDAG version of
- // ComputeNumSignBits. It would be better if we could share common
- // code. If not, make sure that changes are translated to the DAG.
-
// Collect the minimum number of sign bits that are shared by every vector
// element referenced by the shuffle.
- auto *Shuf = cast<ShuffleVectorInst>(U);
- int NumElts = Shuf->getOperand(0)->getType()->getVectorNumElements();
- int NumMaskElts = Shuf->getMask()->getType()->getVectorNumElements();
- APInt DemandedLHS(NumElts, 0), DemandedRHS(NumElts, 0);
- for (int i = 0; i != NumMaskElts; ++i) {
- int M = Shuf->getMaskValue(i);
- assert(M < NumElts * 2 && "Invalid shuffle mask constant");
- // For undef elements, we don't know anything about the common state of
- // the shuffle result.
- if (M == -1)
- return 1;
- if (M < NumElts)
- DemandedLHS.setBit(M % NumElts);
- else
- DemandedRHS.setBit(M % NumElts);
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(U);
+ if (!Shuf) {
+ // FIXME: Add support for shufflevector constant expressions.
+ return 1;
}
+ APInt DemandedLHS, DemandedRHS;
+ // For undef elements, we don't know anything about the common state of
+ // the shuffle result.
+ if (!getShuffleDemandedElts(Shuf, DemandedElts, DemandedLHS, DemandedRHS))
+ return 1;
Tmp = std::numeric_limits<unsigned>::max();
- if (!!DemandedLHS)
- Tmp = ComputeNumSignBits(Shuf->getOperand(0), Depth + 1, Q);
+ if (!!DemandedLHS) {
+ const Value *LHS = Shuf->getOperand(0);
+ Tmp = ComputeNumSignBits(LHS, DemandedLHS, Depth + 1, Q);
+ }
+ // If we don't know anything, early out and try computeKnownBits
+ // fall-back.
+ if (Tmp == 1)
+ break;
if (!!DemandedRHS) {
- Tmp2 = ComputeNumSignBits(Shuf->getOperand(1), Depth + 1, Q);
+ const Value *RHS = Shuf->getOperand(1);
+ Tmp2 = ComputeNumSignBits(RHS, DemandedRHS, Depth + 1, Q);
Tmp = std::min(Tmp, Tmp2);
}
// If we don't know anything, early out and try computeKnownBits
// fall-back.
if (Tmp == 1)
break;
- assert(Tmp <= V->getType()->getScalarSizeInBits() &&
+ assert(Tmp <= Ty->getScalarSizeInBits() &&
"Failed to determine minimum sign bits");
return Tmp;
}
@@ -2747,11 +2993,12 @@ static unsigned ComputeNumSignBitsImpl(const Value *V, unsigned Depth,
// If we can examine all elements of a vector constant successfully, we're
// done (we can't do any better than that). If not, keep trying.
- if (unsigned VecSignBits = computeNumSignBitsVectorConstant(V, TyBits))
+ if (unsigned VecSignBits =
+ computeNumSignBitsVectorConstant(V, DemandedElts, TyBits))
return VecSignBits;
KnownBits Known(TyBits);
- computeKnownBits(V, Known, Depth, Q);
+ computeKnownBits(V, DemandedElts, Known, Depth, Q);
// If we know that the sign bit is either zero or one, determine the number of
// identical bits in the top of the input value.
@@ -2877,30 +3124,23 @@ bool llvm::ComputeMultiple(Value *V, unsigned Base, Value *&Multiple,
return false;
}
-Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
+Intrinsic::ID llvm::getIntrinsicForCallSite(const CallBase &CB,
const TargetLibraryInfo *TLI) {
- const Function *F = ICS.getCalledFunction();
+ const Function *F = CB.getCalledFunction();
if (!F)
return Intrinsic::not_intrinsic;
if (F->isIntrinsic())
return F->getIntrinsicID();
- if (!TLI)
- return Intrinsic::not_intrinsic;
-
+ // We are going to infer semantics of a library function based on mapping it
+ // to an LLVM intrinsic. Check that the library function is available from
+ // this call site and in this environment.
LibFunc Func;
- // We're going to make assumptions on the semantics of the functions, check
- // that the target knows that it's available in this environment and it does
- // not have local linkage.
- if (!F || F->hasLocalLinkage() || !TLI->getLibFunc(*F, Func))
+ if (F->hasLocalLinkage() || !TLI || !TLI->getLibFunc(CB, Func) ||
+ !CB.onlyReadsMemory())
return Intrinsic::not_intrinsic;
- if (!ICS.onlyReadsMemory())
- return Intrinsic::not_intrinsic;
-
- // Otherwise check if we have a call to a function that can be turned into a
- // vector intrinsic.
switch (Func) {
default:
break;
@@ -2972,6 +3212,10 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
case LibFunc_roundf:
case LibFunc_roundl:
return Intrinsic::round;
+ case LibFunc_roundeven:
+ case LibFunc_roundevenf:
+ case LibFunc_roundevenl:
+ return Intrinsic::roundeven;
case LibFunc_pow:
case LibFunc_powf:
case LibFunc_powl:
@@ -2987,6 +3231,9 @@ Intrinsic::ID llvm::getIntrinsicForCallSite(ImmutableCallSite ICS,
/// Return true if we can prove that the specified FP value is never equal to
/// -0.0.
+/// NOTE: Do not check 'nsz' here because that fast-math-flag does not guarantee
+/// that a value is not -0.0. It only guarantees that -0.0 may be treated
+/// the same as +0.0 in floating-point ops.
///
/// NOTE: this function will need to be revisited when we support non-default
/// rounding modes!
@@ -3003,11 +3250,6 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
if (!Op)
return false;
- // Check if the nsz fast-math flag is set.
- if (auto *FPO = dyn_cast<FPMathOperator>(Op))
- if (FPO->hasNoSignedZeros())
- return true;
-
// (fadd x, 0.0) is guaranteed to return +0.0, not -0.0.
if (match(Op, m_FAdd(m_Value(), m_PosZeroFP())))
return true;
@@ -3017,7 +3259,7 @@ bool llvm::CannotBeNegativeZero(const Value *V, const TargetLibraryInfo *TLI,
return true;
if (auto *Call = dyn_cast<CallInst>(Op)) {
- Intrinsic::ID IID = getIntrinsicForCallSite(Call, TLI);
+ Intrinsic::ID IID = getIntrinsicForCallSite(*Call, TLI);
switch (IID) {
default:
break;
@@ -3053,8 +3295,8 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
// Handle vector of constants.
if (auto *CV = dyn_cast<Constant>(V)) {
- if (CV->getType()->isVectorTy()) {
- unsigned NumElts = CV->getType()->getVectorNumElements();
+ if (auto *CVFVTy = dyn_cast<FixedVectorType>(CV->getType())) {
+ unsigned NumElts = CVFVTy->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
if (!CFP)
@@ -3083,14 +3325,15 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
case Instruction::UIToFP:
return true;
case Instruction::FMul:
- // x*x is always non-negative or a NaN.
+ case Instruction::FDiv:
+ // X * X is always non-negative or a NaN.
+ // X / X is always exactly 1.0 or a NaN.
if (I->getOperand(0) == I->getOperand(1) &&
(!SignBitOnly || cast<FPMathOperator>(I)->hasNoNaNs()))
return true;
LLVM_FALLTHROUGH;
case Instruction::FAdd:
- case Instruction::FDiv:
case Instruction::FRem:
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
Depth + 1) &&
@@ -3114,17 +3357,32 @@ static bool cannotBeOrderedLessThanZeroImpl(const Value *V,
Depth + 1);
case Instruction::Call:
const auto *CI = cast<CallInst>(I);
- Intrinsic::ID IID = getIntrinsicForCallSite(CI, TLI);
+ Intrinsic::ID IID = getIntrinsicForCallSite(*CI, TLI);
switch (IID) {
default:
break;
- case Intrinsic::maxnum:
- return (isKnownNeverNaN(I->getOperand(0), TLI) &&
- cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI,
- SignBitOnly, Depth + 1)) ||
- (isKnownNeverNaN(I->getOperand(1), TLI) &&
- cannotBeOrderedLessThanZeroImpl(I->getOperand(1), TLI,
- SignBitOnly, Depth + 1));
+ case Intrinsic::maxnum: {
+ Value *V0 = I->getOperand(0), *V1 = I->getOperand(1);
+ auto isPositiveNum = [&](Value *V) {
+ if (SignBitOnly) {
+ // With SignBitOnly, this is tricky because the result of
+ // maxnum(+0.0, -0.0) is unspecified. Just check if the operand is
+ // a constant strictly greater than 0.0.
+ const APFloat *C;
+ return match(V, m_APFloat(C)) &&
+ *C > APFloat::getZero(C->getSemantics());
+ }
+
+ // -0.0 compares equal to 0.0, so if this operand is at least -0.0,
+ // maxnum can't be ordered-less-than-zero.
+ return isKnownNeverNaN(V, TLI) &&
+ cannotBeOrderedLessThanZeroImpl(V, TLI, false, Depth + 1);
+ };
+
+ // TODO: This could be improved. We could also check that neither operand
+ // has its sign bit set (and at least 1 is not-NAN?).
+ return isPositiveNum(V0) || isPositiveNum(V1);
+ }
case Intrinsic::maximum:
return cannotBeOrderedLessThanZeroImpl(I->getOperand(0), TLI, SignBitOnly,
@@ -3225,24 +3483,26 @@ bool llvm::isKnownNeverInfinity(const Value *V, const TargetLibraryInfo *TLI,
}
}
- // Bail out for constant expressions, but try to handle vector constants.
- if (!V->getType()->isVectorTy() || !isa<Constant>(V))
- return false;
-
- // For vectors, verify that each element is not infinity.
- unsigned NumElts = V->getType()->getVectorNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
- if (!Elt)
- return false;
- if (isa<UndefValue>(Elt))
- continue;
- auto *CElt = dyn_cast<ConstantFP>(Elt);
- if (!CElt || CElt->isInfinity())
- return false;
+ // Try to handle fixed width vector constants
+ if (isa<FixedVectorType>(V->getType()) && isa<Constant>(V)) {
+ // For vectors, verify that each element is not infinity.
+ unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (isa<UndefValue>(Elt))
+ continue;
+ auto *CElt = dyn_cast<ConstantFP>(Elt);
+ if (!CElt || CElt->isInfinity())
+ return false;
+ }
+ // All elements were confirmed non-infinity or undefined.
+ return true;
}
- // All elements were confirmed non-infinity or undefined.
- return true;
+
+ // Was not able to prove that V never contains infinity
+ return false;
}
bool llvm::isKnownNeverNaN(const Value *V, const TargetLibraryInfo *TLI,
@@ -3312,6 +3572,7 @@ bool llvm::isKnownNeverNaN(const Value *V, const TargetLibraryInfo *TLI,
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
+ case Intrinsic::roundeven:
return isKnownNeverNaN(II->getArgOperand(0), TLI, Depth + 1);
case Intrinsic::sqrt:
return isKnownNeverNaN(II->getArgOperand(0), TLI, Depth + 1) &&
@@ -3326,24 +3587,26 @@ bool llvm::isKnownNeverNaN(const Value *V, const TargetLibraryInfo *TLI,
}
}
- // Bail out for constant expressions, but try to handle vector constants.
- if (!V->getType()->isVectorTy() || !isa<Constant>(V))
- return false;
-
- // For vectors, verify that each element is not NaN.
- unsigned NumElts = V->getType()->getVectorNumElements();
- for (unsigned i = 0; i != NumElts; ++i) {
- Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
- if (!Elt)
- return false;
- if (isa<UndefValue>(Elt))
- continue;
- auto *CElt = dyn_cast<ConstantFP>(Elt);
- if (!CElt || CElt->isNaN())
- return false;
+ // Try to handle fixed width vector constants
+ if (isa<FixedVectorType>(V->getType()) && isa<Constant>(V)) {
+ // For vectors, verify that each element is not NaN.
+ unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
+ for (unsigned i = 0; i != NumElts; ++i) {
+ Constant *Elt = cast<Constant>(V)->getAggregateElement(i);
+ if (!Elt)
+ return false;
+ if (isa<UndefValue>(Elt))
+ continue;
+ auto *CElt = dyn_cast<ConstantFP>(Elt);
+ if (!CElt || CElt->isNaN())
+ return false;
+ }
+ // All elements were confirmed not-NaN or undefined.
+ return true;
}
- // All elements were confirmed not-NaN or undefined.
- return true;
+
+ // Was not able to prove that V never contains NaN
+ return false;
}
Value *llvm::isBytewiseValue(Value *V, const DataLayout &DL) {
@@ -3359,8 +3622,8 @@ Value *llvm::isBytewiseValue(Value *V, const DataLayout &DL) {
if (isa<UndefValue>(V))
return UndefInt8;
- const uint64_t Size = DL.getTypeStoreSize(V->getType());
- if (!Size)
+ // Return Undef for zero-sized type.
+ if (!DL.getTypeStoreSize(V->getType()).isNonZero())
return UndefInt8;
Constant *C = dyn_cast<Constant>(V);
@@ -3678,7 +3941,7 @@ bool llvm::getConstantDataArrayInfo(const Value *V,
Array = nullptr;
} else {
const DataLayout &DL = GV->getParent()->getDataLayout();
- uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy);
+ uint64_t SizeInBytes = DL.getTypeStoreSize(GVTy).getFixedSize();
uint64_t Length = SizeInBytes / (ElementSize / 8);
if (Length <= Offset)
return false;
@@ -3839,12 +4102,17 @@ llvm::getArgumentAliasingToReturnedPointer(const CallBase *Call,
bool llvm::isIntrinsicReturningPointerAliasingArgumentWithoutCapturing(
const CallBase *Call, bool MustPreserveNullness) {
- return Call->getIntrinsicID() == Intrinsic::launder_invariant_group ||
- Call->getIntrinsicID() == Intrinsic::strip_invariant_group ||
- Call->getIntrinsicID() == Intrinsic::aarch64_irg ||
- Call->getIntrinsicID() == Intrinsic::aarch64_tagp ||
- (!MustPreserveNullness &&
- Call->getIntrinsicID() == Intrinsic::ptrmask);
+ switch (Call->getIntrinsicID()) {
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::strip_invariant_group:
+ case Intrinsic::aarch64_irg:
+ case Intrinsic::aarch64_tagp:
+ return true;
+ case Intrinsic::ptrmask:
+ return !MustPreserveNullness;
+ default:
+ return false;
+ }
}
/// \p PN defines a loop-variant pointer to an object. Check if the
@@ -3884,15 +4152,20 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
} else if (Operator::getOpcode(V) == Instruction::BitCast ||
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
V = cast<Operator>(V)->getOperand(0);
+ if (!V->getType()->isPointerTy())
+ return V;
} else if (GlobalAlias *GA = dyn_cast<GlobalAlias>(V)) {
if (GA->isInterposable())
return V;
V = GA->getAliasee();
- } else if (isa<AllocaInst>(V)) {
- // An alloca can't be further simplified.
- return V;
} else {
- if (auto *Call = dyn_cast<CallBase>(V)) {
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ // Look through single-arg phi nodes created by LCSSA.
+ if (PHI->getNumIncomingValues() == 1) {
+ V = PHI->getIncomingValue(0);
+ continue;
+ }
+ } else if (auto *Call = dyn_cast<CallBase>(V)) {
// CaptureTracking can know about special capturing properties of some
// intrinsics like launder.invariant.group, that can't be expressed with
// the attributes, but have properties like returning aliasing pointer.
@@ -3908,14 +4181,6 @@ Value *llvm::GetUnderlyingObject(Value *V, const DataLayout &DL,
}
}
- // See if InstructionSimplify knows any relevant tricks.
- if (Instruction *I = dyn_cast<Instruction>(V))
- // TODO: Acquire a DominatorTree and AssumptionCache and use them.
- if (Value *Simplified = SimplifyInstruction(I, {DL, I})) {
- V = Simplified;
- continue;
- }
-
return V;
}
assert(V->getType()->isPointerTy() && "Unexpected operand type!");
@@ -4309,6 +4574,16 @@ OverflowResult llvm::computeOverflowForUnsignedSub(const Value *LHS,
AssumptionCache *AC,
const Instruction *CxtI,
const DominatorTree *DT) {
+ // Checking for conditions implied by dominating conditions may be expensive.
+ // Limit it to usub_with_overflow calls for now.
+ if (match(CxtI,
+ m_Intrinsic<Intrinsic::usub_with_overflow>(m_Value(), m_Value())))
+ if (auto C =
+ isImpliedByDomCondition(CmpInst::ICMP_UGE, LHS, RHS, CxtI, DL)) {
+ if (*C)
+ return OverflowResult::NeverOverflows;
+ return OverflowResult::AlwaysOverflowsLow;
+ }
ConstantRange LHSRange = computeConstantRangeIncludingKnownBits(
LHS, /*ForSigned=*/false, DL, /*Depth=*/0, AC, CxtI, DT);
ConstantRange RHSRange = computeConstantRangeIncludingKnownBits(
@@ -4385,7 +4660,100 @@ bool llvm::isOverflowIntrinsicNoWrap(const WithOverflowInst *WO,
return llvm::any_of(GuardingBranches, AllUsesGuardedByBranch);
}
-bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V) {
+bool llvm::canCreatePoison(const Instruction *I) {
+ // See whether I has flags that may create poison
+ if (isa<OverflowingBinaryOperator>(I) &&
+ (I->hasNoSignedWrap() || I->hasNoUnsignedWrap()))
+ return true;
+ if (isa<PossiblyExactOperator>(I) && I->isExact())
+ return true;
+ if (auto *FP = dyn_cast<FPMathOperator>(I)) {
+ auto FMF = FP->getFastMathFlags();
+ if (FMF.noNaNs() || FMF.noInfs())
+ return true;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
+ if (GEP->isInBounds())
+ return true;
+
+ unsigned Opcode = I->getOpcode();
+
+ // Check whether opcode is a poison-generating operation
+ switch (Opcode) {
+ case Instruction::Shl:
+ case Instruction::AShr:
+ case Instruction::LShr: {
+ // Shifts return poison if the shift amount equals or exceeds the bit width.
+ if (auto *C = dyn_cast<Constant>(I->getOperand(1))) {
+ SmallVector<Constant *, 4> ShiftAmounts;
+ if (auto *FVTy = dyn_cast<FixedVectorType>(C->getType())) {
+ unsigned NumElts = FVTy->getNumElements();
+ for (unsigned i = 0; i < NumElts; ++i)
+ ShiftAmounts.push_back(C->getAggregateElement(i));
+ } else if (isa<ScalableVectorType>(C->getType()))
+ return true; // Can't tell, just return true to be safe
+ else
+ ShiftAmounts.push_back(C);
+
+ bool Safe = llvm::all_of(ShiftAmounts, [](Constant *C) {
+ auto *CI = dyn_cast<ConstantInt>(C);
+ return CI && CI->getZExtValue() < C->getType()->getIntegerBitWidth();
+ });
+ return !Safe;
+ }
+ return true;
+ }
+ case Instruction::FPToSI:
+ case Instruction::FPToUI:
+ // fptosi/ui yields poison if the resulting value does not fit in the
+ // destination type.
+ return true;
+ case Instruction::Call:
+ case Instruction::CallBr:
+ case Instruction::Invoke:
+ // Function calls can return a poison value even if args are non-poison
+ // values.
+ return true;
+ case Instruction::InsertElement:
+ case Instruction::ExtractElement: {
+ // If index exceeds the length of the vector, it returns poison
+ auto *VTy = cast<VectorType>(I->getOperand(0)->getType());
+ unsigned IdxOp = I->getOpcode() == Instruction::InsertElement ? 2 : 1;
+ auto *Idx = dyn_cast<ConstantInt>(I->getOperand(IdxOp));
+ if (!Idx || Idx->getZExtValue() >= VTy->getElementCount().Min)
+ return true;
+ return false;
+ }
+ case Instruction::FNeg:
+ case Instruction::PHI:
+ case Instruction::Select:
+ case Instruction::URem:
+ case Instruction::SRem:
+ case Instruction::ShuffleVector:
+ case Instruction::ExtractValue:
+ case Instruction::InsertValue:
+ case Instruction::Freeze:
+ case Instruction::ICmp:
+ case Instruction::FCmp:
+ case Instruction::GetElementPtr:
+ return false;
+ default:
+ if (isa<CastInst>(I))
+ return false;
+ else if (isa<BinaryOperator>(I))
+ return false;
+ // Be conservative and return true.
+ return true;
+ }
+}
+
+bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V,
+ const Instruction *CtxI,
+ const DominatorTree *DT,
+ unsigned Depth) {
+ if (Depth >= MaxDepth)
+ return false;
+
// If the value is a freeze instruction, then it can never
// be undef or poison.
if (isa<FreezeInst>(V))
@@ -4393,10 +4761,100 @@ bool llvm::isGuaranteedNotToBeUndefOrPoison(const Value *V) {
// TODO: Some instructions are guaranteed to return neither undef
// nor poison if their arguments are not poison/undef.
- // TODO: Deal with other Constant subclasses.
- if (isa<ConstantInt>(V) || isa<GlobalVariable>(V))
+ if (auto *C = dyn_cast<Constant>(V)) {
+ // TODO: We can analyze ConstExpr by opcode to determine if there is any
+ // possibility of poison.
+ if (isa<UndefValue>(C) || isa<ConstantExpr>(C))
+ return false;
+
+ if (isa<ConstantInt>(C) || isa<GlobalVariable>(C) || isa<ConstantFP>(V) ||
+ isa<ConstantPointerNull>(C) || isa<Function>(C))
+ return true;
+
+ if (C->getType()->isVectorTy())
+ return !C->containsUndefElement() && !C->containsConstantExpression();
+
+ // TODO: Recursively analyze aggregates or other constants.
+ return false;
+ }
+
+ // Strip cast operations from a pointer value.
+ // Note that stripPointerCastsSameRepresentation can strip off getelementptr
+ // inbounds with zero offset. To guarantee that the result isn't poison, the
+ // stripped pointer is checked: it has to point into an allocated object or
+ // be null, so that an `inbounds` getelementptr with a zero offset could not
+ // have produced poison.
+ // It can also strip off addrspacecasts that do not change the bit
+ // representation; we treat such an addrspacecast as a no-op.
+ auto *StrippedV = V->stripPointerCastsSameRepresentation();
+ if (isa<AllocaInst>(StrippedV) || isa<GlobalVariable>(StrippedV) ||
+ isa<Function>(StrippedV) || isa<ConstantPointerNull>(StrippedV))
return true;
+ auto OpCheck = [&](const Value *V) {
+ return isGuaranteedNotToBeUndefOrPoison(V, CtxI, DT, Depth + 1);
+ };
+
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ switch (I->getOpcode()) {
+ case Instruction::GetElementPtr: {
+ auto *GEPI = dyn_cast<GetElementPtrInst>(I);
+ if (!GEPI->isInBounds() && llvm::all_of(GEPI->operands(), OpCheck))
+ return true;
+ break;
+ }
+ case Instruction::FCmp: {
+ auto *FI = dyn_cast<FCmpInst>(I);
+ if (FI->getFastMathFlags().none() &&
+ llvm::all_of(FI->operands(), OpCheck))
+ return true;
+ break;
+ }
+ case Instruction::BitCast:
+ case Instruction::PHI:
+ case Instruction::ICmp:
+ if (llvm::all_of(I->operands(), OpCheck))
+ return true;
+ break;
+ default:
+ break;
+ }
+
+ if (programUndefinedIfPoison(I) && I->getType()->isIntegerTy(1))
+ // Note: once we have an agreement that poison is a value-wise concept,
+ // we can remove the isIntegerTy(1) constraint.
+ return true;
+ }
+
+ // CxtI may be null or a cloned instruction.
+ if (!CtxI || !CtxI->getParent() || !DT)
+ return false;
+
+ auto *DNode = DT->getNode(CtxI->getParent());
+ if (!DNode)
+ // Unreachable block
+ return false;
+
+ // If V is used as a branch condition before reaching CtxI, V cannot be
+ // undef or poison.
+ // br V, BB1, BB2
+ // BB1:
+ // CtxI ; V cannot be undef or poison here
+ auto *Dominator = DNode->getIDom();
+ while (Dominator) {
+ auto *TI = Dominator->getBlock()->getTerminator();
+
+ if (auto BI = dyn_cast<BranchInst>(TI)) {
+ if (BI->isConditional() && BI->getCondition() == V)
+ return true;
+ } else if (auto SI = dyn_cast<SwitchInst>(TI)) {
+ if (SI->getCondition() == V)
+ return true;
+ }
+
+ Dominator = Dominator->getIDom();
+ }
+
return false;
}
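A minimal sketch of how a client might combine the two new helpers above; the function name and the speculation policy are hypothetical, not part of this patch, and the usual ValueTracking/Dominators headers (with the header's defaulted Depth parameter) are assumed:

    // Hypothetical check (sketch): an instruction is safe to speculate with
    // respect to poison only if it cannot create poison by itself and all of
    // its operands are known not to be undef or poison at this point.
    static bool safeToSpeculateWrtPoison(const Instruction &I,
                                         const DominatorTree &DT) {
      if (canCreatePoison(&I))
        return false;
      return llvm::all_of(I.operands(), [&](const Value *Op) {
        return isGuaranteedNotToBeUndefOrPoison(Op, &I, &DT);
      });
    }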
@@ -4436,14 +4894,14 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
return false;
// Calls can throw, or contain an infinite loop, or kill the process.
- if (auto CS = ImmutableCallSite(I)) {
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
// Call sites that throw have implicit non-local control flow.
- if (!CS.doesNotThrow())
+ if (!CB->doesNotThrow())
return false;
// A function which doesn't throw and has the "willreturn" attribute will
// always return.
- if (CS.hasFnAttr(Attribute::WillReturn))
+ if (CB->hasFnAttr(Attribute::WillReturn))
return true;
// Non-throwing call sites can loop infinitely, call exit/pthread_exit
@@ -4462,7 +4920,7 @@ bool llvm::isGuaranteedToTransferExecutionToSuccessor(const Instruction *I) {
// FIXME: This isn't aggressive enough; a call which only writes to a global
// is guaranteed to return.
- return CS.onlyReadsMemory() || CS.onlyAccessesArgMemory();
+ return CB->onlyReadsMemory() || CB->onlyAccessesArgMemory();
}
// Other instructions return normally.
@@ -4493,41 +4951,28 @@ bool llvm::isGuaranteedToExecuteForEveryIteration(const Instruction *I,
llvm_unreachable("Instruction not contained in its own parent basic block.");
}
-bool llvm::propagatesFullPoison(const Instruction *I) {
- // TODO: This should include all instructions apart from phis, selects and
- // call-like instructions.
+bool llvm::propagatesPoison(const Instruction *I) {
switch (I->getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Xor:
- case Instruction::Trunc:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- case Instruction::Mul:
- case Instruction::Shl:
- case Instruction::GetElementPtr:
- // These operations all propagate poison unconditionally. Note that poison
- // is not any particular value, so xor or subtraction of poison with
- // itself still yields poison, not zero.
- return true;
-
- case Instruction::AShr:
- case Instruction::SExt:
- // For these operations, one bit of the input is replicated across
- // multiple output bits. A replicated poison bit is still poison.
- return true;
-
+ case Instruction::Freeze:
+ case Instruction::Select:
+ case Instruction::PHI:
+ case Instruction::Call:
+ case Instruction::Invoke:
+ return false;
case Instruction::ICmp:
- // Comparing poison with any value yields poison. This is why, for
- // instance, x s< (x +nsw 1) can be folded to true.
+ case Instruction::FCmp:
+ case Instruction::GetElementPtr:
return true;
-
default:
+ if (isa<BinaryOperator>(I) || isa<UnaryOperator>(I) || isa<CastInst>(I))
+ return true;
+
+ // Be conservative and return false.
return false;
}
}
-const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) {
+const Value *llvm::getGuaranteedNonPoisonOp(const Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Store:
return cast<StoreInst>(I)->getPointerOperand();
@@ -4547,23 +4992,30 @@ const Value *llvm::getGuaranteedNonFullPoisonOp(const Instruction *I) {
case Instruction::SRem:
return I->getOperand(1);
+ case Instruction::Call:
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::assume:
+ return II->getArgOperand(0);
+ default:
+ return nullptr;
+ }
+ }
+ return nullptr;
+
default:
- // Note: It's really tempting to think that a conditional branch or
- // switch should be listed here, but that's incorrect. It's not
- // branching off of poison which is UB, it is executing a side effecting
- // instruction which follows the branch.
return nullptr;
}
}
bool llvm::mustTriggerUB(const Instruction *I,
const SmallSet<const Value *, 16>& KnownPoison) {
- auto *NotPoison = getGuaranteedNonFullPoisonOp(I);
+ auto *NotPoison = getGuaranteedNonPoisonOp(I);
return (NotPoison && KnownPoison.count(NotPoison));
}
-bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) {
+bool llvm::programUndefinedIfPoison(const Instruction *PoisonI) {
// We currently only look for uses of poison values within the same basic
// block, as that makes it easier to guarantee that the uses will be
// executed given that PoisonI is executed.
@@ -4596,7 +5048,7 @@ bool llvm::programUndefinedIfFullPoison(const Instruction *PoisonI) {
if (YieldsPoison.count(&I)) {
for (const User *User : I.users()) {
const Instruction *UserI = cast<Instruction>(User);
- if (propagatesFullPoison(UserI))
+ if (propagatesPoison(UserI))
YieldsPoison.insert(User);
}
}
@@ -4633,6 +5085,9 @@ static bool isKnownNonNaN(const Value *V, FastMathFlags FMF) {
return true;
}
+ if (isa<ConstantAggregateZero>(V))
+ return true;
+
return false;
}
@@ -4689,7 +5144,7 @@ static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred,
if (match(FalseVal,
m_CombineOr(m_OrdFMin(m_Specific(CmpLHS), m_APFloat(FC2)),
m_UnordFMin(m_Specific(CmpLHS), m_APFloat(FC2)))) &&
- FC1->compare(*FC2) == APFloat::cmpResult::cmpLessThan)
+ *FC1 < *FC2)
return {SPF_FMAXNUM, SPNB_RETURNS_ANY, false};
break;
case CmpInst::FCMP_OGT:
@@ -4699,7 +5154,7 @@ static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred,
if (match(FalseVal,
m_CombineOr(m_OrdFMax(m_Specific(CmpLHS), m_APFloat(FC2)),
m_UnordFMax(m_Specific(CmpLHS), m_APFloat(FC2)))) &&
- FC1->compare(*FC2) == APFloat::cmpResult::cmpGreaterThan)
+ *FC1 > *FC2)
return {SPF_FMINNUM, SPNB_RETURNS_ANY, false};
break;
default:
@@ -4840,6 +5295,21 @@ static SelectPatternResult matchMinMaxOfMinMax(CmpInst::Predicate Pred,
return {SPF_UNKNOWN, SPNB_NA, false};
}
+/// If the input value is the result of a 'not' op, constant integer, or vector
+/// splat of a constant integer, return the bitwise-not source value.
+/// TODO: This could be extended to handle non-splat vector integer constants.
+static Value *getNotValue(Value *V) {
+ Value *NotV;
+ if (match(V, m_Not(m_Value(NotV))))
+ return NotV;
+
+ const APInt *C;
+ if (match(V, m_APInt(C)))
+ return ConstantInt::get(V->getType(), ~(*C));
+
+ return nullptr;
+}
+
/// Match non-obvious integer minimum and maximum sequences.
static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
Value *CmpLHS, Value *CmpRHS,
@@ -4858,6 +5328,31 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
if (SPR.Flavor != SelectPatternFlavor::SPF_UNKNOWN)
return SPR;
+ // Look through 'not' ops to find disguised min/max.
+ // (X > Y) ? ~X : ~Y ==> (~X < ~Y) ? ~X : ~Y ==> MIN(~X, ~Y)
+ // (X < Y) ? ~X : ~Y ==> (~X > ~Y) ? ~X : ~Y ==> MAX(~X, ~Y)
+ if (CmpLHS == getNotValue(TrueVal) && CmpRHS == getNotValue(FalseVal)) {
+ switch (Pred) {
+ case CmpInst::ICMP_SGT: return {SPF_SMIN, SPNB_NA, false};
+ case CmpInst::ICMP_SLT: return {SPF_SMAX, SPNB_NA, false};
+ case CmpInst::ICMP_UGT: return {SPF_UMIN, SPNB_NA, false};
+ case CmpInst::ICMP_ULT: return {SPF_UMAX, SPNB_NA, false};
+ default: break;
+ }
+ }
+
+ // (X > Y) ? ~Y : ~X ==> (~X < ~Y) ? ~Y : ~X ==> MAX(~Y, ~X)
+ // (X < Y) ? ~Y : ~X ==> (~X > ~Y) ? ~Y : ~X ==> MIN(~Y, ~X)
+ if (CmpLHS == getNotValue(FalseVal) && CmpRHS == getNotValue(TrueVal)) {
+ switch (Pred) {
+ case CmpInst::ICMP_SGT: return {SPF_SMAX, SPNB_NA, false};
+ case CmpInst::ICMP_SLT: return {SPF_SMIN, SPNB_NA, false};
+ case CmpInst::ICMP_UGT: return {SPF_UMAX, SPNB_NA, false};
+ case CmpInst::ICMP_ULT: return {SPF_UMIN, SPNB_NA, false};
+ default: break;
+ }
+ }
+
if (Pred != CmpInst::ICMP_SGT && Pred != CmpInst::ICMP_SLT)
return {SPF_UNKNOWN, SPNB_NA, false};
@@ -4898,19 +5393,6 @@ static SelectPatternResult matchMinMax(CmpInst::Predicate Pred,
return {CmpLHS == FalseVal ? SPF_UMAX : SPF_UMIN, SPNB_NA, false};
}
- // Look through 'not' ops to find disguised signed min/max.
- // (X >s C) ? ~X : ~C ==> (~X <s ~C) ? ~X : ~C ==> SMIN(~X, ~C)
- // (X <s C) ? ~X : ~C ==> (~X >s ~C) ? ~X : ~C ==> SMAX(~X, ~C)
- if (match(TrueVal, m_Not(m_Specific(CmpLHS))) &&
- match(FalseVal, m_APInt(C2)) && ~(*C1) == *C2)
- return {Pred == CmpInst::ICMP_SGT ? SPF_SMIN : SPF_SMAX, SPNB_NA, false};
-
- // (X >s C) ? ~C : ~X ==> (~X <s ~C) ? ~C : ~X ==> SMAX(~C, ~X)
- // (X <s C) ? ~C : ~X ==> (~X >s ~C) ? ~C : ~X ==> SMIN(~C, ~X)
- if (match(FalseVal, m_Not(m_Specific(CmpLHS))) &&
- match(TrueVal, m_APInt(C2)) && ~(*C1) == *C2)
- return {Pred == CmpInst::ICMP_SGT ? SPF_SMAX : SPF_SMIN, SPNB_NA, false};
-
return {SPF_UNKNOWN, SPNB_NA, false};
}
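The generalized matching above rests on bitwise-not reversing integer order, so a compare of X and Y that selects between ~X and ~Y is really a min/max of the inverted values. A small standalone check of that identity, independent of LLVM:

    #include <algorithm>
    #include <cassert>
    #include <cstdint>

    int main() {
      // ~X == UINT32_MAX - X, so X > Y implies ~X < ~Y; hence
      // (X > Y) ? ~X : ~Y is exactly std::min(~X, ~Y).
      for (uint32_t X : {0u, 1u, 7u, 100u, 0xFFFFFFFFu})
        for (uint32_t Y : {0u, 3u, 7u, 250u, 0xFFFFFFFEu}) {
          uint32_t Sel = (X > Y) ? ~X : ~Y;
          assert(Sel == std::min(~X, ~Y));
        }
      return 0;
    }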
@@ -5445,20 +5927,18 @@ isImpliedCondMatchingImmOperands(CmpInst::Predicate APred,
/// Return true if LHS implies RHS is true. Return false if LHS implies RHS is
/// false. Otherwise, return None if we can't infer anything.
static Optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
- const ICmpInst *RHS,
+ CmpInst::Predicate BPred,
+ const Value *BLHS, const Value *BRHS,
const DataLayout &DL, bool LHSIsTrue,
unsigned Depth) {
Value *ALHS = LHS->getOperand(0);
Value *ARHS = LHS->getOperand(1);
+
// The rest of the logic assumes the LHS condition is true. If that's not the
// case, invert the predicate to make it so.
- ICmpInst::Predicate APred =
+ CmpInst::Predicate APred =
LHSIsTrue ? LHS->getPredicate() : LHS->getInversePredicate();
- Value *BLHS = RHS->getOperand(0);
- Value *BRHS = RHS->getOperand(1);
- ICmpInst::Predicate BPred = RHS->getPredicate();
-
// Can we infer anything when the two compares have matching operands?
bool AreSwappedOps;
if (isMatchingOps(ALHS, ARHS, BLHS, BRHS, AreSwappedOps)) {
@@ -5489,10 +5969,11 @@ static Optional<bool> isImpliedCondICmps(const ICmpInst *LHS,
/// Return true if LHS implies RHS is true. Return false if LHS implies RHS is
/// false. Otherwise, return None if we can't infer anything. We expect the
/// RHS to be an icmp and the LHS to be an 'and' or an 'or' instruction.
-static Optional<bool> isImpliedCondAndOr(const BinaryOperator *LHS,
- const ICmpInst *RHS,
- const DataLayout &DL, bool LHSIsTrue,
- unsigned Depth) {
+static Optional<bool>
+isImpliedCondAndOr(const BinaryOperator *LHS, CmpInst::Predicate RHSPred,
+ const Value *RHSOp0, const Value *RHSOp1,
+ const DataLayout &DL, bool LHSIsTrue, unsigned Depth) {
// The LHS must be an 'or' or an 'and' instruction.
assert((LHS->getOpcode() == Instruction::And ||
LHS->getOpcode() == Instruction::Or) &&
@@ -5507,36 +5988,33 @@ static Optional<bool> isImpliedCondAndOr(const BinaryOperator *LHS,
if ((!LHSIsTrue && match(LHS, m_Or(m_Value(ALHS), m_Value(ARHS)))) ||
(LHSIsTrue && match(LHS, m_And(m_Value(ALHS), m_Value(ARHS))))) {
// FIXME: Make this non-recursive.
- if (Optional<bool> Implication =
- isImpliedCondition(ALHS, RHS, DL, LHSIsTrue, Depth + 1))
+ if (Optional<bool> Implication = isImpliedCondition(
+ ALHS, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue, Depth + 1))
return Implication;
- if (Optional<bool> Implication =
- isImpliedCondition(ARHS, RHS, DL, LHSIsTrue, Depth + 1))
+ if (Optional<bool> Implication = isImpliedCondition(
+ ARHS, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue, Depth + 1))
return Implication;
return None;
}
return None;
}
-Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
- const DataLayout &DL, bool LHSIsTrue,
- unsigned Depth) {
+Optional<bool>
+llvm::isImpliedCondition(const Value *LHS, CmpInst::Predicate RHSPred,
+ const Value *RHSOp0, const Value *RHSOp1,
+ const DataLayout &DL, bool LHSIsTrue, unsigned Depth) {
// Bail out when we hit the limit.
if (Depth == MaxDepth)
return None;
// A mismatch occurs when we compare a scalar cmp to a vector cmp, for
// example.
- if (LHS->getType() != RHS->getType())
+ if (RHSOp0->getType()->isVectorTy() != LHS->getType()->isVectorTy())
return None;
Type *OpTy = LHS->getType();
assert(OpTy->isIntOrIntVectorTy(1) && "Expected integer type only!");
- // LHS ==> RHS by definition
- if (LHS == RHS)
- return LHSIsTrue;
-
// FIXME: Extending the code below to handle vectors.
if (OpTy->isVectorTy())
return None;
@@ -5545,51 +6023,87 @@ Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
// Both LHS and RHS are icmps.
const ICmpInst *LHSCmp = dyn_cast<ICmpInst>(LHS);
- const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS);
- if (LHSCmp && RHSCmp)
- return isImpliedCondICmps(LHSCmp, RHSCmp, DL, LHSIsTrue, Depth);
+ if (LHSCmp)
+ return isImpliedCondICmps(LHSCmp, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue,
+ Depth);
- // The LHS should be an 'or' or an 'and' instruction. We expect the RHS to be
- // an icmp. FIXME: Add support for and/or on the RHS.
+ /// The LHS should be an 'or' or an 'and' instruction. We expect the RHS to
+ /// be an icmp. FIXME: Add support for and/or on the RHS.
const BinaryOperator *LHSBO = dyn_cast<BinaryOperator>(LHS);
- if (LHSBO && RHSCmp) {
+ if (LHSBO) {
if ((LHSBO->getOpcode() == Instruction::And ||
LHSBO->getOpcode() == Instruction::Or))
- return isImpliedCondAndOr(LHSBO, RHSCmp, DL, LHSIsTrue, Depth);
+ return isImpliedCondAndOr(LHSBO, RHSPred, RHSOp0, RHSOp1, DL, LHSIsTrue,
+ Depth);
}
return None;
}
-Optional<bool> llvm::isImpliedByDomCondition(const Value *Cond,
- const Instruction *ContextI,
- const DataLayout &DL) {
- assert(Cond->getType()->isIntOrIntVectorTy(1) && "Condition must be bool");
+Optional<bool> llvm::isImpliedCondition(const Value *LHS, const Value *RHS,
+ const DataLayout &DL, bool LHSIsTrue,
+ unsigned Depth) {
+ // LHS ==> RHS by definition
+ if (LHS == RHS)
+ return LHSIsTrue;
+
+ const ICmpInst *RHSCmp = dyn_cast<ICmpInst>(RHS);
+ if (RHSCmp)
+ return isImpliedCondition(LHS, RHSCmp->getPredicate(),
+ RHSCmp->getOperand(0), RHSCmp->getOperand(1), DL,
+ LHSIsTrue, Depth);
+ return None;
+}
+
+// Returns a pair (Condition, ConditionIsTrue), where Condition is a branch
+// condition dominating ContextI, or nullptr if no such condition is found.
+static std::pair<Value *, bool>
+getDomPredecessorCondition(const Instruction *ContextI) {
if (!ContextI || !ContextI->getParent())
- return None;
+ return {nullptr, false};
// TODO: This is a poor/cheap way to determine dominance. Should we use a
// dominator tree (eg, from a SimplifyQuery) instead?
const BasicBlock *ContextBB = ContextI->getParent();
const BasicBlock *PredBB = ContextBB->getSinglePredecessor();
if (!PredBB)
- return None;
+ return {nullptr, false};
// We need a conditional branch in the predecessor.
Value *PredCond;
BasicBlock *TrueBB, *FalseBB;
if (!match(PredBB->getTerminator(), m_Br(m_Value(PredCond), TrueBB, FalseBB)))
- return None;
+ return {nullptr, false};
// The branch should get simplified. Don't bother simplifying this condition.
if (TrueBB == FalseBB)
- return None;
+ return {nullptr, false};
assert((TrueBB == ContextBB || FalseBB == ContextBB) &&
"Predecessor block does not point to successor?");
// Is this condition implied by the predecessor condition?
- bool CondIsTrue = TrueBB == ContextBB;
- return isImpliedCondition(PredCond, Cond, DL, CondIsTrue);
+ return {PredCond, TrueBB == ContextBB};
+}
+
+Optional<bool> llvm::isImpliedByDomCondition(const Value *Cond,
+ const Instruction *ContextI,
+ const DataLayout &DL) {
+ assert(Cond->getType()->isIntOrIntVectorTy(1) && "Condition must be bool");
+ auto PredCond = getDomPredecessorCondition(ContextI);
+ if (PredCond.first)
+ return isImpliedCondition(PredCond.first, Cond, DL, PredCond.second);
+ return None;
+}
+
+Optional<bool> llvm::isImpliedByDomCondition(CmpInst::Predicate Pred,
+ const Value *LHS, const Value *RHS,
+ const Instruction *ContextI,
+ const DataLayout &DL) {
+ auto PredCond = getDomPredecessorCondition(ContextI);
+ if (PredCond.first)
+ return isImpliedCondition(PredCond.first, Pred, LHS, RHS, DL,
+ PredCond.second);
+ return None;
}
static void setLimitsForBinOp(const BinaryOperator &BO, APInt &Lower,
@@ -5861,9 +6375,15 @@ static void setLimitsForSelectPattern(const SelectInst &SI, APInt &Lower,
}
}
-ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo) {
+ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo,
+ AssumptionCache *AC,
+ const Instruction *CtxI,
+ unsigned Depth) {
assert(V->getType()->isIntOrIntVectorTy() && "Expected integer instruction");
+ if (Depth == MaxDepth)
+ return ConstantRange::getFull(V->getType()->getScalarSizeInBits());
+
const APInt *C;
if (match(V, m_APInt(C)))
return ConstantRange(*C);
@@ -5885,6 +6405,31 @@ ConstantRange llvm::computeConstantRange(const Value *V, bool UseInstrInfo) {
if (auto *Range = IIQ.getMetadata(I, LLVMContext::MD_range))
CR = CR.intersectWith(getConstantRangeFromMetadata(*Range));
+ if (CtxI && AC) {
+ // Try to restrict the range based on information from assumptions.
+ for (auto &AssumeVH : AC->assumptionsFor(V)) {
+ if (!AssumeVH)
+ continue;
+ CallInst *I = cast<CallInst>(AssumeVH);
+ assert(I->getParent()->getParent() == CtxI->getParent()->getParent() &&
+ "Got assumption for the wrong function!");
+ assert(I->getCalledFunction()->getIntrinsicID() == Intrinsic::assume &&
+ "must be an assume intrinsic");
+
+ if (!isValidAssumeForContext(I, CtxI, nullptr))
+ continue;
+ Value *Arg = I->getArgOperand(0);
+ ICmpInst *Cmp = dyn_cast<ICmpInst>(Arg);
+ // Currently we just use information from comparisons.
+ if (!Cmp || Cmp->getOperand(0) != V)
+ continue;
+ ConstantRange RHS = computeConstantRange(Cmp->getOperand(1), UseInstrInfo,
+ AC, I, Depth + 1);
+ CR = CR.intersectWith(
+ ConstantRange::makeSatisfyingICmpRegion(Cmp->getPredicate(), RHS));
+ }
+ }
+
return CR;
}
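For reference, a sketch of the range algebra performed by the assumption loop above, written against the public ConstantRange API (the concrete bit width and constant are illustrative, and the usual ConstantRange/InstrTypes headers are assumed):

    // An `llvm.assume(icmp ult %v, 42)` contributes the region satisfying
    // "x u< 42"; the current range is then intersected with it.
    ConstantRange CurrentCR = ConstantRange::getFull(32);   // stand-in for CR
    ConstantRange RHS(APInt(32, 42));                       // the constant 42
    ConstantRange Satisfying =
        ConstantRange::makeSatisfyingICmpRegion(CmpInst::ICMP_ULT, RHS);
    CurrentCR = CurrentCR.intersectWith(Satisfying);        // now [0, 42)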
@@ -5910,10 +6455,12 @@ getOffsetFromIndex(const GEPOperator *GEP, unsigned Idx, const DataLayout &DL) {
continue;
}
- // Otherwise, we have a sequential type like an array or vector. Multiply
- // the index by the ElementSize.
- uint64_t Size = DL.getTypeAllocSize(GTI.getIndexedType());
- Offset += Size * OpC->getSExtValue();
+ // Otherwise, we have a sequential type like an array or fixed-length
+ // vector. Multiply the index by the ElementSize.
+ TypeSize Size = DL.getTypeAllocSize(GTI.getIndexedType());
+ if (Size.isScalable())
+ return None;
+ Offset += Size.getFixedSize() * OpC->getSExtValue();
}
return Offset;
diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index c45ab941a142..23531b65ea32 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -78,6 +78,7 @@ bool llvm::isTriviallyVectorizable(Intrinsic::ID ID) {
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
+ case Intrinsic::roundeven:
case Intrinsic::pow:
case Intrinsic::fma:
case Intrinsic::fmuladd:
@@ -112,7 +113,7 @@ bool llvm::hasVectorInstrinsicScalarOpd(Intrinsic::ID ID,
/// its ID; if it cannot find one, return not_intrinsic.
Intrinsic::ID llvm::getVectorIntrinsicIDForCall(const CallInst *CI,
const TargetLibraryInfo *TLI) {
- Intrinsic::ID ID = getIntrinsicForCallSite(CI, TLI);
+ Intrinsic::ID ID = getIntrinsicForCallSite(*CI, TLI);
if (ID == Intrinsic::not_intrinsic)
return Intrinsic::not_intrinsic;
@@ -262,9 +263,12 @@ Value *llvm::getStrideFromPointer(Value *Ptr, ScalarEvolution *SE, Loop *Lp) {
Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
assert(V->getType()->isVectorTy() && "Not looking at a vector?");
VectorType *VTy = cast<VectorType>(V->getType());
- unsigned Width = VTy->getNumElements();
- if (EltNo >= Width) // Out of range access.
- return UndefValue::get(VTy->getElementType());
+ // For a fixed-length vector, return undef for an out-of-range access.
+ if (auto *FVTy = dyn_cast<FixedVectorType>(VTy)) {
+ unsigned Width = FVTy->getNumElements();
+ if (EltNo >= Width)
+ return UndefValue::get(FVTy->getElementType());
+ }
if (Constant *C = dyn_cast<Constant>(V))
return C->getAggregateElement(EltNo);
@@ -285,8 +289,11 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
return findScalarElement(III->getOperand(0), EltNo);
}
- if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V)) {
- unsigned LHSWidth = SVI->getOperand(0)->getType()->getVectorNumElements();
+ ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(V);
+ // Restrict the following transformation to fixed-length vectors.
+ if (SVI && isa<FixedVectorType>(SVI->getType())) {
+ unsigned LHSWidth =
+ cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
int InEl = SVI->getMaskValue(EltNo);
if (InEl < 0)
return UndefValue::get(VTy->getElementType());
@@ -307,6 +314,24 @@ Value *llvm::findScalarElement(Value *V, unsigned EltNo) {
return nullptr;
}
+int llvm::getSplatIndex(ArrayRef<int> Mask) {
+ int SplatIndex = -1;
+ for (int M : Mask) {
+ // Ignore invalid (undefined) mask elements.
+ if (M < 0)
+ continue;
+
+ // There can be only 1 non-negative mask element value if this is a splat.
+ if (SplatIndex != -1 && SplatIndex != M)
+ return -1;
+
+ // Initialize the splat index to the 1st non-negative mask element.
+ SplatIndex = M;
+ }
+ assert((SplatIndex == -1 || SplatIndex >= 0) && "Negative index?");
+ return SplatIndex;
+}
+
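A brief usage sketch for the new helper (assuming the matching declaration in llvm/Analysis/VectorUtils.h; the masks are illustrative):

    // Every defined lane picks element 2, so the splat index is 2.
    int A = getSplatIndex({2, -1, 2, 2});    // == 2
    // Two distinct defined indices, so this is not a splat.
    int B = getSplatIndex({0, 1, 0, 1});     // == -1
    // An all-undef mask also reports -1: there is no defined splat lane.
    int C = getSplatIndex({-1, -1, -1, -1}); // == -1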
/// Get splat value if the input is a splat vector or return nullptr.
/// This function is not fully general. It checks only 2 cases:
/// the input value is (1) a splat constant vector or (2) a sequence
@@ -318,9 +343,9 @@ const llvm::Value *llvm::getSplatValue(const Value *V) {
// shuf (inselt ?, Splat, 0), ?, <0, undef, 0, ...>
Value *Splat;
- if (match(V, m_ShuffleVector(m_InsertElement(m_Value(), m_Value(Splat),
- m_ZeroInt()),
- m_Value(), m_ZeroInt())))
+ if (match(V,
+ m_Shuffle(m_InsertElt(m_Value(), m_Value(Splat), m_ZeroInt()),
+ m_Value(), m_ZeroMask())))
return Splat;
return nullptr;
@@ -330,21 +355,32 @@ const llvm::Value *llvm::getSplatValue(const Value *V) {
// adjusted if needed.
const unsigned MaxDepth = 6;
-bool llvm::isSplatValue(const Value *V, unsigned Depth) {
+bool llvm::isSplatValue(const Value *V, int Index, unsigned Depth) {
assert(Depth <= MaxDepth && "Limit Search Depth");
if (isa<VectorType>(V->getType())) {
if (isa<UndefValue>(V))
return true;
- // FIXME: Constant splat analysis does not allow undef elements.
+ // FIXME: We can allow undefs, but if Index was specified, we may want to
+ // check that the constant is defined at that index.
if (auto *C = dyn_cast<Constant>(V))
return C->getSplatValue() != nullptr;
}
- // FIXME: Constant splat analysis does not allow undef elements.
- Constant *Mask;
- if (match(V, m_ShuffleVector(m_Value(), m_Value(), m_Constant(Mask))))
- return Mask->getSplatValue() != nullptr;
+ if (auto *Shuf = dyn_cast<ShuffleVectorInst>(V)) {
+ // FIXME: We can safely allow undefs here. If Index was specified, we will
+ // check that the mask elt is defined at the required index.
+ if (!is_splat(Shuf->getShuffleMask()))
+ return false;
+
+ // Match any index.
+ if (Index == -1)
+ return true;
+
+ // Match a specific element. The mask should be defined at and match the
+ // specified index.
+ return Shuf->getMaskValue(Index) == Index;
+ }
// The remaining tests are all recursive, so bail out if we hit the limit.
if (Depth++ == MaxDepth)
@@ -353,18 +389,91 @@ bool llvm::isSplatValue(const Value *V, unsigned Depth) {
// If both operands of a binop are splats, the result is a splat.
Value *X, *Y, *Z;
if (match(V, m_BinOp(m_Value(X), m_Value(Y))))
- return isSplatValue(X, Depth) && isSplatValue(Y, Depth);
+ return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth);
// If all operands of a select are splats, the result is a splat.
if (match(V, m_Select(m_Value(X), m_Value(Y), m_Value(Z))))
- return isSplatValue(X, Depth) && isSplatValue(Y, Depth) &&
- isSplatValue(Z, Depth);
+ return isSplatValue(X, Index, Depth) && isSplatValue(Y, Index, Depth) &&
+ isSplatValue(Z, Index, Depth);
// TODO: Add support for unary ops (fneg), casts, intrinsics (overflow ops).
return false;
}
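A hedged sketch of the extended isSplatValue interface, assuming the header declares it as (const Value *V, int Index, unsigned Depth) to match the definition above. For a constant splat, the current code answers true regardless of Index (see the FIXME):

    #include "llvm/Analysis/VectorUtils.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"

    // <4 x i32> <i32 7, i32 7, i32 7, i32 7> is a splat at every index.
    static bool constantSplatExample(llvm::LLVMContext &Ctx) {
      llvm::Constant *Seven =
          llvm::ConstantInt::get(llvm::Type::getInt32Ty(Ctx), 7);
      llvm::Constant *Vec = llvm::ConstantDataVector::getSplat(4, Seven);
      return llvm::isSplatValue(Vec, /*Index=*/-1, /*Depth=*/0) &&
             llvm::isSplatValue(Vec, /*Index=*/2, /*Depth=*/0);
    }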
+void llvm::narrowShuffleMaskElts(int Scale, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &ScaledMask) {
+ assert(Scale > 0 && "Unexpected scaling factor");
+
+ // Fast-path: if no scaling, then it is just a copy.
+ if (Scale == 1) {
+ ScaledMask.assign(Mask.begin(), Mask.end());
+ return;
+ }
+
+ ScaledMask.clear();
+ for (int MaskElt : Mask) {
+ if (MaskElt >= 0) {
+ assert(((uint64_t)Scale * MaskElt + (Scale - 1)) <=
+ std::numeric_limits<int32_t>::max() &&
+ "Overflowed 32-bits");
+ }
+ for (int SliceElt = 0; SliceElt != Scale; ++SliceElt)
+ ScaledMask.push_back(MaskElt < 0 ? MaskElt : Scale * MaskElt + SliceElt);
+ }
+}
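A small sketch of the mask-narrowing helper above, with illustrative values (Scale = 2 splits every mask element into two consecutive narrow elements, and undef stays undef):

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/VectorUtils.h"

    static bool narrowMaskExample() {
      llvm::SmallVector<int, 8> Narrow;
      llvm::narrowShuffleMaskElts(2, {1, -1}, Narrow);  // -> <2, 3, -1, -1>
      const int Expected[] = {2, 3, -1, -1};
      return llvm::ArrayRef<int>(Narrow).equals(Expected);
    }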
+
+bool llvm::widenShuffleMaskElts(int Scale, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &ScaledMask) {
+ assert(Scale > 0 && "Unexpected scaling factor");
+
+ // Fast-path: if no scaling, then it is just a copy.
+ if (Scale == 1) {
+ ScaledMask.assign(Mask.begin(), Mask.end());
+ return true;
+ }
+
+ // We must map the original elements down evenly to a type with fewer elements.
+ int NumElts = Mask.size();
+ if (NumElts % Scale != 0)
+ return false;
+
+ ScaledMask.clear();
+ ScaledMask.reserve(NumElts / Scale);
+
+ // Step through the input mask by splitting into Scale-sized slices.
+ do {
+ ArrayRef<int> MaskSlice = Mask.take_front(Scale);
+ assert((int)MaskSlice.size() == Scale && "Expected Scale-sized slice.");
+
+ // The first element of the slice determines how we evaluate this slice.
+ int SliceFront = MaskSlice.front();
+ if (SliceFront < 0) {
+ // Negative values (undef or other "sentinel" values) must be equal across
+ // the entire slice.
+ if (!is_splat(MaskSlice))
+ return false;
+ ScaledMask.push_back(SliceFront);
+ } else {
+ // A positive mask element must be cleanly divisible by Scale.
+ if (SliceFront % Scale != 0)
+ return false;
+ // Elements of the slice must be consecutive.
+ for (int i = 1; i < Scale; ++i)
+ if (MaskSlice[i] != SliceFront + i)
+ return false;
+ ScaledMask.push_back(SliceFront / Scale);
+ }
+ Mask = Mask.drop_front(Scale);
+ } while (!Mask.empty());
+
+ assert((int)ScaledMask.size() * Scale == NumElts && "Unexpected scaled mask");
+
+ // All elements of the original mask can be scaled down to map to the elements
+ // of a mask with wider elements.
+ return true;
+}
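The widening direction only succeeds when each Scale-sized slice of the mask is either a uniform sentinel or a run of consecutive elements starting at a multiple of Scale; a sketch with illustrative masks:

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/VectorUtils.h"

    static bool widenMaskExample() {
      llvm::SmallVector<int, 8> Wide;
      // <0, 1, 6, 7> widens cleanly by 2 to <0, 3>.
      if (!llvm::widenShuffleMaskElts(2, {0, 1, 6, 7}, Wide))
        return false;
      const int Expected[] = {0, 3};
      if (!llvm::ArrayRef<int>(Wide).equals(Expected))
        return false;
      // <0, 1, 2, 2> fails: the slice <2, 2> is not consecutive.
      llvm::SmallVector<int, 8> Rejected;
      return !llvm::widenShuffleMaskElts(2, {0, 1, 2, 2}, Rejected);
    }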
+
MapVector<Instruction *, uint64_t>
llvm::computeMinimumValueSizes(ArrayRef<BasicBlock *> Blocks, DemandedBits &DB,
const TargetTransformInfo *TTI) {
@@ -636,7 +745,7 @@ Instruction *llvm::propagateMetadata(Instruction *Inst, ArrayRef<Value *> VL) {
}
Constant *
-llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
+llvm::createBitMaskForGaps(IRBuilderBase &Builder, unsigned VF,
const InterleaveGroup<Instruction> &Group) {
// All 1's means mask is not needed.
if (Group.getNumMembers() == Group.getFactor())
@@ -655,52 +764,52 @@ llvm::createBitMaskForGaps(IRBuilder<> &Builder, unsigned VF,
return ConstantVector::get(Mask);
}
-Constant *llvm::createReplicatedMask(IRBuilder<> &Builder,
- unsigned ReplicationFactor, unsigned VF) {
- SmallVector<Constant *, 16> MaskVec;
+llvm::SmallVector<int, 16>
+llvm::createReplicatedMask(unsigned ReplicationFactor, unsigned VF) {
+ SmallVector<int, 16> MaskVec;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < ReplicationFactor; j++)
- MaskVec.push_back(Builder.getInt32(i));
+ MaskVec.push_back(i);
- return ConstantVector::get(MaskVec);
+ return MaskVec;
}
-Constant *llvm::createInterleaveMask(IRBuilder<> &Builder, unsigned VF,
- unsigned NumVecs) {
- SmallVector<Constant *, 16> Mask;
+llvm::SmallVector<int, 16> llvm::createInterleaveMask(unsigned VF,
+ unsigned NumVecs) {
+ SmallVector<int, 16> Mask;
for (unsigned i = 0; i < VF; i++)
for (unsigned j = 0; j < NumVecs; j++)
- Mask.push_back(Builder.getInt32(j * VF + i));
+ Mask.push_back(j * VF + i);
- return ConstantVector::get(Mask);
+ return Mask;
}
-Constant *llvm::createStrideMask(IRBuilder<> &Builder, unsigned Start,
- unsigned Stride, unsigned VF) {
- SmallVector<Constant *, 16> Mask;
+llvm::SmallVector<int, 16>
+llvm::createStrideMask(unsigned Start, unsigned Stride, unsigned VF) {
+ SmallVector<int, 16> Mask;
for (unsigned i = 0; i < VF; i++)
- Mask.push_back(Builder.getInt32(Start + i * Stride));
+ Mask.push_back(Start + i * Stride);
- return ConstantVector::get(Mask);
+ return Mask;
}
-Constant *llvm::createSequentialMask(IRBuilder<> &Builder, unsigned Start,
- unsigned NumInts, unsigned NumUndefs) {
- SmallVector<Constant *, 16> Mask;
+llvm::SmallVector<int, 16> llvm::createSequentialMask(unsigned Start,
+ unsigned NumInts,
+ unsigned NumUndefs) {
+ SmallVector<int, 16> Mask;
for (unsigned i = 0; i < NumInts; i++)
- Mask.push_back(Builder.getInt32(Start + i));
+ Mask.push_back(Start + i);
- Constant *Undef = UndefValue::get(Builder.getInt32Ty());
for (unsigned i = 0; i < NumUndefs; i++)
- Mask.push_back(Undef);
+ Mask.push_back(-1);
- return ConstantVector::get(Mask);
+ return Mask;
}
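With the IRBuilder parameter dropped, the helpers above return plain integer masks; a sketch of the shapes they produce (values are illustrative):

    // createReplicatedMask(3, 2)    -> <0, 0, 0, 1, 1, 1>
    // createInterleaveMask(4, 2)    -> <0, 4, 1, 5, 2, 6, 3, 7>
    // createStrideMask(1, 2, 4)     -> <1, 3, 5, 7>
    // createSequentialMask(0, 3, 2) -> <0, 1, 2, -1, -1>  (-1 marks undef)
    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/SmallVector.h"
    #include "llvm/Analysis/VectorUtils.h"

    static bool sequentialMaskExample() {
      llvm::SmallVector<int, 16> Seq = llvm::createSequentialMask(0, 3, 2);
      const int Expected[] = {0, 1, 2, -1, -1};
      return llvm::ArrayRef<int>(Seq).equals(Expected);
    }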
/// A helper function for concatenating vectors. This function concatenates two
/// vectors having the same element type. If the second vector has fewer
/// elements than the first, it is padded with undefs.
-static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
+static Value *concatenateTwoVectors(IRBuilderBase &Builder, Value *V1,
Value *V2) {
VectorType *VecTy1 = dyn_cast<VectorType>(V1->getType());
VectorType *VecTy2 = dyn_cast<VectorType>(V2->getType());
@@ -714,16 +823,17 @@ static Value *concatenateTwoVectors(IRBuilder<> &Builder, Value *V1,
if (NumElts1 > NumElts2) {
// Extend with UNDEFs.
- Constant *ExtMask =
- createSequentialMask(Builder, 0, NumElts2, NumElts1 - NumElts2);
- V2 = Builder.CreateShuffleVector(V2, UndefValue::get(VecTy2), ExtMask);
+ V2 = Builder.CreateShuffleVector(
+ V2, UndefValue::get(VecTy2),
+ createSequentialMask(0, NumElts2, NumElts1 - NumElts2));
}
- Constant *Mask = createSequentialMask(Builder, 0, NumElts1 + NumElts2, 0);
- return Builder.CreateShuffleVector(V1, V2, Mask);
+ return Builder.CreateShuffleVector(
+ V1, V2, createSequentialMask(0, NumElts1 + NumElts2, 0));
}
-Value *llvm::concatenateVectors(IRBuilder<> &Builder, ArrayRef<Value *> Vecs) {
+Value *llvm::concatenateVectors(IRBuilderBase &Builder,
+ ArrayRef<Value *> Vecs) {
unsigned NumVecs = Vecs.size();
assert(NumVecs > 1 && "Should be at least two vectors");
@@ -756,8 +866,9 @@ bool llvm::maskIsAllZeroOrUndef(Value *Mask) {
return false;
if (ConstMask->isNullValue() || isa<UndefValue>(ConstMask))
return true;
- for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
- ++I) {
+ for (unsigned I = 0,
+ E = cast<VectorType>(ConstMask->getType())->getNumElements();
+ I != E; ++I) {
if (auto *MaskElt = ConstMask->getAggregateElement(I))
if (MaskElt->isNullValue() || isa<UndefValue>(MaskElt))
continue;
@@ -773,8 +884,9 @@ bool llvm::maskIsAllOneOrUndef(Value *Mask) {
return false;
if (ConstMask->isAllOnesValue() || isa<UndefValue>(ConstMask))
return true;
- for (unsigned I = 0, E = ConstMask->getType()->getVectorNumElements(); I != E;
- ++I) {
+ for (unsigned I = 0,
+ E = cast<VectorType>(ConstMask->getType())->getNumElements();
+ I != E; ++I) {
if (auto *MaskElt = ConstMask->getAggregateElement(I))
if (MaskElt->isAllOnesValue() || isa<UndefValue>(MaskElt))
continue;
@@ -835,13 +947,8 @@ void InterleavedAccessInfo::collectConstStrideAccesses(
const SCEV *Scev = replaceSymbolicStrideSCEV(PSE, Strides, Ptr);
PointerType *PtrTy = cast<PointerType>(Ptr->getType());
uint64_t Size = DL.getTypeAllocSize(PtrTy->getElementType());
-
- // An alignment of 0 means target ABI alignment.
- MaybeAlign Alignment = MaybeAlign(getLoadStoreAlignment(&I));
- if (!Alignment)
- Alignment = Align(DL.getABITypeAlignment(PtrTy->getElementType()));
-
- AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size, *Alignment);
+ AccessStrideInfo[&I] = StrideDescriptor(Stride, Scev, Size,
+ getLoadStoreAlignment(&I));
}
}
@@ -922,7 +1029,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
// create a group for B, we continue with the bottom-up algorithm to ensure
// we don't break any of B's dependences.
InterleaveGroup<Instruction> *Group = nullptr;
- if (isStrided(DesB.Stride) &&
+ if (isStrided(DesB.Stride) &&
(!isPredicated(B->getParent()) || EnablePredicatedInterleavedMemAccesses)) {
Group = getInterleaveGroup(B);
if (!Group) {
@@ -1023,8 +1130,8 @@ void InterleavedAccessInfo::analyzeInterleaving(
// All members of a predicated interleave-group must have the same predicate,
// and currently must reside in the same BB.
- BasicBlock *BlockA = A->getParent();
- BasicBlock *BlockB = B->getParent();
+ BasicBlock *BlockA = A->getParent();
+ BasicBlock *BlockB = B->getParent();
if ((isPredicated(BlockA) || isPredicated(BlockB)) &&
(!EnablePredicatedInterleavedMemAccesses || BlockA != BlockB))
continue;
@@ -1127,22 +1234,23 @@ void InterleavedAccessInfo::invalidateGroupsRequiringScalarEpilogue() {
if (!requiresScalarEpilogue())
return;
- // Avoid releasing a Group twice.
- SmallPtrSet<InterleaveGroup<Instruction> *, 4> DelSet;
- for (auto &I : InterleaveGroupMap) {
- InterleaveGroup<Instruction> *Group = I.second;
- if (Group->requiresScalarEpilogue())
- DelSet.insert(Group);
- }
- for (auto *Ptr : DelSet) {
+ bool ReleasedGroup = false;
+ // Release groups requiring scalar epilogues. Note that this also removes them
+ // from InterleaveGroups.
+ for (auto *Group : make_early_inc_range(InterleaveGroups)) {
+ if (!Group->requiresScalarEpilogue())
+ continue;
LLVM_DEBUG(
dbgs()
<< "LV: Invalidate candidate interleaved group due to gaps that "
"require a scalar epilogue (not allowed under optsize) and cannot "
"be masked (not enabled). \n");
- releaseGroup(Ptr);
+ releaseGroup(Group);
+ ReleasedGroup = true;
}
-
+ assert(ReleasedGroup && "At least one group must be invalidated, as a "
+ "scalar epilogue was required");
+ (void)ReleasedGroup;
RequiresScalarEpilogue = false;
}
@@ -1161,6 +1269,18 @@ void InterleaveGroup<Instruction>::addMetadata(Instruction *NewInst) const {
}
}
+std::string VFABI::mangleTLIVectorName(StringRef VectorName,
+ StringRef ScalarName, unsigned numArgs,
+ unsigned VF) {
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ Out << "_ZGV" << VFABI::_LLVM_ << "N" << VF;
+ for (unsigned I = 0; I < numArgs; ++I)
+ Out << "v";
+ Out << "_" << ScalarName << "(" << VectorName << ")";
+ return std::string(Out.str());
+}
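A sketch of the string produced by the new VFABI::mangleTLIVectorName helper; the sin / vec_sin names are hypothetical, and VFABI::_LLVM_ is assumed to expand to the "_LLVM_" ISA token:

    #include "llvm/Analysis/VectorUtils.h"
    #include <string>

    static std::string exampleMangling() {
      // "_ZGV" + ISA token + "N4" + one "v" per argument + "_sin(vec_sin)"
      return llvm::VFABI::mangleTLIVectorName("vec_sin", "sin", /*numArgs=*/1,
                                              /*VF=*/4);
      // expected: "_ZGV_LLVM_N4v_sin(vec_sin)"
    }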
+
void VFABI::getVectorVariantNames(
const CallInst &CI, SmallVectorImpl<std::string> &VariantMappings) {
const StringRef S =
@@ -1174,12 +1294,13 @@ void VFABI::getVectorVariantNames(
for (auto &S : SetVector<StringRef>(ListAttr.begin(), ListAttr.end())) {
#ifndef NDEBUG
- Optional<VFInfo> Info = VFABI::tryDemangleForVFABI(S);
+ LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << S << "'\n");
+ Optional<VFInfo> Info = VFABI::tryDemangleForVFABI(S, *(CI.getModule()));
assert(Info.hasValue() && "Invalid name for a VFABI variant.");
assert(CI.getModule()->getFunction(Info.getValue().VectorName) &&
"Vector function is missing.");
#endif
- VariantMappings.push_back(S);
+ VariantMappings.push_back(std::string(S));
}
}
diff --git a/llvm/lib/Analysis/models/inliner/saved_model.pbtxt b/llvm/lib/Analysis/models/inliner/saved_model.pbtxt
new file mode 100644
index 000000000000..ec522a8b7c35
--- /dev/null
+++ b/llvm/lib/Analysis/models/inliner/saved_model.pbtxt
@@ -0,0 +1,32634 @@
+saved_model_schema_version: 1
+meta_graphs {
+ meta_info_def {
+ stripped_op_list {
+ op {
+ name: "Const"
+ output_arg {
+ name: "output"
+ type_attr: "dtype"
+ }
+ attr {
+ name: "value"
+ type: "tensor"
+ }
+ attr {
+ name: "dtype"
+ type: "type"
+ }
+ }
+ op {
+ name: "NoOp"
+ }
+ op {
+ name: "PartitionedCall"
+ input_arg {
+ name: "args"
+ type_list_attr: "Tin"
+ }
+ output_arg {
+ name: "output"
+ type_list_attr: "Tout"
+ }
+ attr {
+ name: "Tin"
+ type: "list(type)"
+ has_minimum: true
+ }
+ attr {
+ name: "Tout"
+ type: "list(type)"
+ has_minimum: true
+ }
+ attr {
+ name: "f"
+ type: "func"
+ }
+ attr {
+ name: "config"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ attr {
+ name: "config_proto"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ attr {
+ name: "executor_type"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ }
+ op {
+ name: "Placeholder"
+ output_arg {
+ name: "output"
+ type_attr: "dtype"
+ }
+ attr {
+ name: "dtype"
+ type: "type"
+ }
+ attr {
+ name: "shape"
+ type: "shape"
+ default_value {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ op {
+ name: "ReadVariableOp"
+ input_arg {
+ name: "resource"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "value"
+ type_attr: "dtype"
+ }
+ attr {
+ name: "dtype"
+ type: "type"
+ }
+ is_stateful: true
+ }
+ op {
+ name: "StatefulPartitionedCall"
+ input_arg {
+ name: "args"
+ type_list_attr: "Tin"
+ }
+ output_arg {
+ name: "output"
+ type_list_attr: "Tout"
+ }
+ attr {
+ name: "Tin"
+ type: "list(type)"
+ has_minimum: true
+ }
+ attr {
+ name: "Tout"
+ type: "list(type)"
+ has_minimum: true
+ }
+ attr {
+ name: "f"
+ type: "func"
+ }
+ attr {
+ name: "config"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ attr {
+ name: "config_proto"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ attr {
+ name: "executor_type"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ is_stateful: true
+ }
+ op {
+ name: "VarHandleOp"
+ output_arg {
+ name: "resource"
+ type: DT_RESOURCE
+ }
+ attr {
+ name: "container"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ attr {
+ name: "shared_name"
+ type: "string"
+ default_value {
+ s: ""
+ }
+ }
+ attr {
+ name: "dtype"
+ type: "type"
+ }
+ attr {
+ name: "shape"
+ type: "shape"
+ }
+ attr {
+ name: "allowed_devices"
+ type: "list(string)"
+ default_value {
+ list {
+ }
+ }
+ }
+ is_stateful: true
+ }
+ }
+ tags: "serve"
+ tensorflow_version: "1.15.0"
+ tensorflow_git_version: "unknown"
+ stripped_default_attrs: true
+ }
+ graph_def {
+ node {
+ name: "train_step"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "train_step"
+ }
+ }
+ }
+ node {
+ name: "train_step/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "train_step"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense/kernel"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 34
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "QNetwork/EncodingNetwork/dense/kernel"
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense/kernel/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense/kernel"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 34
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense/bias"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "QNetwork/EncodingNetwork/dense/bias"
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense/bias/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense/bias"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense_1/kernel"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 100
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "QNetwork/EncodingNetwork/dense_1/kernel"
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense_1/kernel/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense_1/kernel"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 100
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense_1/bias"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "QNetwork/EncodingNetwork/dense_1/bias"
+ }
+ }
+ }
+ node {
+ name: "QNetwork/EncodingNetwork/dense_1/bias/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense_1/bias"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node {
+ name: "QNetwork/dense_2/kernel"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 40
+ }
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "QNetwork/dense_2/kernel"
+ }
+ }
+ }
+ node {
+ name: "QNetwork/dense_2/kernel/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "QNetwork/dense_2/kernel"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 40
+ }
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node {
+ name: "QNetwork/dense_2/bias"
+ op: "VarHandleOp"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ attr {
+ key: "shared_name"
+ value {
+ s: "QNetwork/dense_2/bias"
+ }
+ }
+ }
+ node {
+ name: "QNetwork/dense_2/bias/Read/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "QNetwork/dense_2/bias"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ }
+ node {
+ name: "NoOp"
+ op: "NoOp"
+ }
+ node {
+ name: "Const"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ }
+ string_val: "\nu\n\023\010\001\022\017_time_step_spec\n\024\010\002\022\020_trajectory_spec\n\023\010\003\022\017_wrapped_policy\n\016\010\004\022\ntrain_step\n\023\010\005\022\017model_variables\n\016\010\006\022\nsignatures\n\030\n\017\010\007\022\013observation\n\005\010\007\022\0013\n\030\n\017\010\007\022\013observation\n\005\010\007\022\0011\n;\n\016\010\010\022\n_q_network\n\023\010\001\022\017_time_step_spec\n\024\010\t\022\020_trajectory_spec\nE\022C\n\016VARIABLE_VALUE\022\ntrain_step\032%train_step/.ATTRIBUTES/VARIABLE_VALUE\n*\n\005\010\n\022\0010\n\005\010\013\022\0011\n\005\010\014\022\0012\n\005\010\r\022\0013\n\005\010\016\022\0014\n\005\010\017\022\0015\n\000\n\000\n\214\001\n\026\010\020\022\022_input_tensor_spec\n\014\010\021\022\010_encoder\n\022\010\022\022\016_q_value_layer\n\r\010\023\022\tvariables\n\031\010\024\022\025regularization_losses\n\027\010\025\022\023trainable_variables\n\r\010\026\022\tkeras_api\n\030\n\017\010\007\022\013observation\n\005\010\007\022\0011\ng\022e\n\016VARIABLE_VALUE\022%QNetwork/EncodingNetwork/dense/kernel\032,model_variables/0/.ATTRIBUTES/VARIABLE_VALUE\ne\022c\n\016VARIABLE_VALUE\022#QNetwork/EncodingNetwork/dense/bias\032,model_variables/1/.ATTRIBUTES/VARIABLE_VALUE\ni\022g\n\016VARIABLE_VALUE\022\'QNetwork/EncodingNetwork/dense_1/kernel\032,model_variables/2/.ATTRIBUTES/VARIABLE_VALUE\ng\022e\n\016VARIABLE_VALUE\022%QNetwork/EncodingNetwork/dense_1/bias\032,model_variables/3/.ATTRIBUTES/VARIABLE_VALUE\nY\022W\n\016VARIABLE_VALUE\022\027QNetwork/dense_2/kernel\032,model_variables/4/.ATTRIBUTES/VARIABLE_VALUE\nW\022U\n\016VARIABLE_VALUE\022\025QNetwork/dense_2/bias\032,model_variables/5/.ATTRIBUTES/VARIABLE_VALUE\n\000\n\334\001\n\026\010\027\022\022_input_tensor_spec\n\027\010\030\022\023_preprocessing_nest\n\036\010\031\022\032_flat_preprocessing_layers\n\033\010\032\022\027_preprocessing_combiner\n\032\010\033\022\026_postprocessing_layers\n\r\010\034\022\tvariables\n\031\010\035\022\025regularization_losses\n\027\010\036\022\023trainable_variables\n\r\010\037\022\tkeras_api\nh\n\n\010\016\022\006kernel\n\010\010\017\022\004bias\n\r\010 
\022\tvariables\n\031\010!\022\025regularization_losses\n\027\010\"\022\023trainable_variables\n\r\010#\022\tkeras_api\n*\n\005\010\n\022\0010\n\005\010\013\022\0011\n\005\010\014\022\0012\n\005\010\r\022\0013\n\005\010\016\022\0014\n\005\010\017\022\0015\n\000\n*\n\005\010\n\022\0010\n\005\010\013\022\0011\n\005\010\014\022\0012\n\005\010\r\022\0013\n\005\010\016\022\0014\n\005\010\017\022\0015\n\255\001\n\021\010$\022\rlayer_metrics\n\r\010\023\022\tvariables\n\037\010%\022\033layer_regularization_losses\n\013\010&\022\007metrics\n\n\010\'\022\006layers\n\031\010\024\022\025regularization_losses\n\033\010(\022\027non_trainable_variables\n\027\010\025\022\023trainable_variables\n\000\n\000\nV\n\005\010)\022\0010\n\005\010*\022\0011\n\005\010+\022\0012\n\005\010,\022\0013\n\005\010-\022\0014\n\005\010.\022\0015\n\005\010/\022\0016\n\005\0100\022\0017\n\005\0101\022\0018\n\005\0102\022\0019\n\006\0103\022\00210\n\006\0104\022\00211\nR\n\r\0105\022\tvariables\n\031\0106\022\025regularization_losses\n\027\0107\022\023trainable_variables\n\r\0108\022\tkeras_api\n\025\n\005\0109\022\0010\n\005\010:\022\0011\n\005\010;\022\0012\n\034\n\005\010\n\022\0010\n\005\010\013\022\0011\n\005\010\014\022\0012\n\005\010\r\022\0013\n\000\n\034\n\005\010\n\022\0010\n\005\010\013\022\0011\n\005\010\014\022\0012\n\005\010\r\022\0013\n\255\001\n\021\010<\022\rlayer_metrics\n\r\010\034\022\tvariables\n\037\010=\022\033layer_regularization_losses\n\013\010>\022\007metrics\n\n\010?\022\006layers\n\031\010\035\022\025regularization_losses\n\033\010@\022\027non_trainable_variables\n\027\010\036\022\023trainable_variables\n\016\n\005\010\016\022\0010\n\005\010\017\022\0011\n\000\n\016\n\005\010\016\022\0010\n\005\010\017\022\0011\n\255\001\n\021\010A\022\rlayer_metrics\n\r\010 
\022\tvariables\n\037\010B\022\033layer_regularization_losses\n\013\010C\022\007metrics\n\n\010D\022\006layers\n\031\010!\022\025regularization_losses\n\033\010E\022\027non_trainable_variables\n\027\010\"\022\023trainable_variables\n\000\n\000\n\000\n\016\n\005\010\021\022\0010\n\005\010\022\022\0011\n\000\nR\n\r\010F\022\tvariables\n\031\010G\022\025regularization_losses\n\027\010H\022\023trainable_variables\n\r\010I\022\tkeras_api\nR\n\r\010J\022\tvariables\n\031\010K\022\025regularization_losses\n\027\010L\022\023trainable_variables\n\r\010M\022\tkeras_api\nR\n\r\010N\022\tvariables\n\031\010O\022\025regularization_losses\n\027\010P\022\023trainable_variables\n\r\010Q\022\tkeras_api\nR\n\r\010R\022\tvariables\n\031\010S\022\025regularization_losses\n\027\010T\022\023trainable_variables\n\r\010U\022\tkeras_api\nR\n\r\010V\022\tvariables\n\031\010W\022\025regularization_losses\n\027\010X\022\023trainable_variables\n\r\010Y\022\tkeras_api\nR\n\r\010Z\022\tvariables\n\031\010[\022\025regularization_losses\n\027\010\\\022\023trainable_variables\n\r\010]\022\tkeras_api\nR\n\r\010^\022\tvariables\n\031\010_\022\025regularization_losses\n\027\010`\022\023trainable_variables\n\r\010a\022\tkeras_api\nR\n\r\010b\022\tvariables\n\031\010c\022\025regularization_losses\n\027\010d\022\023trainable_variables\n\r\010e\022\tkeras_api\nR\n\r\010f\022\tvariables\n\031\010g\022\025regularization_losses\n\027\010h\022\023trainable_variables\n\r\010i\022\tkeras_api\nR\n\r\010j\022\tvariables\n\031\010k\022\025regularization_losses\n\027\010l\022\023trainable_variables\n\r\010m\022\tkeras_api\nR\n\r\010n\022\tvariables\n\031\010o\022\025regularization_losses\n\027\010p\022\023trainable_variables\n\r\010q\022\tkeras_api\nR\n\r\010r\022\tvariables\n\031\010s\022\025regularization_losses\n\027\010t\022\023trainable_variables\n\r\010u\022\tkeras_api\n\000\n\000\n\000\n\255\001\n\021\010v\022\rlayer_metrics\n\r\0105\022\tvariables\n\037\010w\022\033layer_regularization_losses\n\013\010x\022\007metrics\n\n\010y\022\006layers\n\031\0106\022\025regularization_losses\n\033\010z\022\027non_trainable_variables\n\027\0107\022\023trainable_variables\nR\n\r\010{\022\tvariables\n\031\010|\022\025regularization_losses\n\027\010}\022\023trainable_variables\n\r\010~\022\tkeras_api\nk\n\n\010\n\022\006kernel\n\010\010\013\022\004bias\n\r\010\177\022\tvariables\n\032\010\200\001\022\025regularization_losses\n\030\010\201\001\022\023trainable_variables\n\016\010\202\001\022\tkeras_api\nl\n\n\010\014\022\006kernel\n\010\010\r\022\004bias\n\016\010\203\001\022\tvariables\n\032\010\204\001\022\025regularization_losses\n\030\010\205\001\022\023trainable_variables\n\016\010\206\001\022\tkeras_api\n\000\n\000\n\000\nv\n\005\010)\022\0010\n\005\010*\022\0011\n\005\010+\022\0012\n\005\010,\022\0013\n\005\010-\022\0014\n\005\010.\022\0015\n\005\010/\022\0016\n\005\0100\022\0017\n\005\0101\022\0018\n\005\0102\022\0019\n\006\0103\022\00210\n\006\0104\022\00211\n\006\010\032\022\00212\n\006\0109\022\00213\n\006\010:\022\00214\n\006\010;\022\00215\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\262\001\n\022\010\207\001\022\rlayer_metrics\n\r\010F\022\tvariables\n \010\210\001\022\033layer_regularization_losses\n\014\010\211\001\022\007metrics\n\013\010\212\001\022\006layers\n\031\010G\022\025regularization_losses\n\034\010\213\001\022\027non_trainable_variables\n\027\010H\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\214\001\022\rlayer_metrics\n\r\010J\022\tvariables\n 
\010\215\001\022\033layer_regularization_losses\n\014\010\216\001\022\007metrics\n\013\010\217\001\022\006layers\n\031\010K\022\025regularization_losses\n\034\010\220\001\022\027non_trainable_variables\n\027\010L\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\221\001\022\rlayer_metrics\n\r\010N\022\tvariables\n \010\222\001\022\033layer_regularization_losses\n\014\010\223\001\022\007metrics\n\013\010\224\001\022\006layers\n\031\010O\022\025regularization_losses\n\034\010\225\001\022\027non_trainable_variables\n\027\010P\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\226\001\022\rlayer_metrics\n\r\010R\022\tvariables\n \010\227\001\022\033layer_regularization_losses\n\014\010\230\001\022\007metrics\n\013\010\231\001\022\006layers\n\031\010S\022\025regularization_losses\n\034\010\232\001\022\027non_trainable_variables\n\027\010T\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\233\001\022\rlayer_metrics\n\r\010V\022\tvariables\n \010\234\001\022\033layer_regularization_losses\n\014\010\235\001\022\007metrics\n\013\010\236\001\022\006layers\n\031\010W\022\025regularization_losses\n\034\010\237\001\022\027non_trainable_variables\n\027\010X\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\240\001\022\rlayer_metrics\n\r\010Z\022\tvariables\n \010\241\001\022\033layer_regularization_losses\n\014\010\242\001\022\007metrics\n\013\010\243\001\022\006layers\n\031\010[\022\025regularization_losses\n\034\010\244\001\022\027non_trainable_variables\n\027\010\\\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\245\001\022\rlayer_metrics\n\r\010^\022\tvariables\n \010\246\001\022\033layer_regularization_losses\n\014\010\247\001\022\007metrics\n\013\010\250\001\022\006layers\n\031\010_\022\025regularization_losses\n\034\010\251\001\022\027non_trainable_variables\n\027\010`\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\252\001\022\rlayer_metrics\n\r\010b\022\tvariables\n \010\253\001\022\033layer_regularization_losses\n\014\010\254\001\022\007metrics\n\013\010\255\001\022\006layers\n\031\010c\022\025regularization_losses\n\034\010\256\001\022\027non_trainable_variables\n\027\010d\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\257\001\022\rlayer_metrics\n\r\010f\022\tvariables\n \010\260\001\022\033layer_regularization_losses\n\014\010\261\001\022\007metrics\n\013\010\262\001\022\006layers\n\031\010g\022\025regularization_losses\n\034\010\263\001\022\027non_trainable_variables\n\027\010h\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\264\001\022\rlayer_metrics\n\r\010j\022\tvariables\n \010\265\001\022\033layer_regularization_losses\n\014\010\266\001\022\007metrics\n\013\010\267\001\022\006layers\n\031\010k\022\025regularization_losses\n\034\010\270\001\022\027non_trainable_variables\n\027\010l\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\271\001\022\rlayer_metrics\n\r\010n\022\tvariables\n \010\272\001\022\033layer_regularization_losses\n\014\010\273\001\022\007metrics\n\013\010\274\001\022\006layers\n\031\010o\022\025regularization_losses\n\034\010\275\001\022\027non_trainable_variables\n\027\010p\022\023trainable_variables\n\000\n\000\n\000\n\262\001\n\022\010\276\001\022\rlayer_metrics\n\r\010r\022\tvariables\n 
\010\277\001\022\033layer_regularization_losses\n\014\010\300\001\022\007metrics\n\013\010\301\001\022\006layers\n\031\010s\022\025regularization_losses\n\034\010\302\001\022\027non_trainable_variables\n\027\010t\022\023trainable_variables\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\262\001\n\022\010\303\001\022\rlayer_metrics\n\r\010{\022\tvariables\n \010\304\001\022\033layer_regularization_losses\n\014\010\305\001\022\007metrics\n\013\010\306\001\022\006layers\n\031\010|\022\025regularization_losses\n\034\010\307\001\022\027non_trainable_variables\n\027\010}\022\023trainable_variables\n\016\n\005\010\n\022\0010\n\005\010\013\022\0011\n\000\n\016\n\005\010\n\022\0010\n\005\010\013\022\0011\n\264\001\n\022\010\310\001\022\rlayer_metrics\n\r\010\177\022\tvariables\n \010\311\001\022\033layer_regularization_losses\n\014\010\312\001\022\007metrics\n\013\010\313\001\022\006layers\n\032\010\200\001\022\025regularization_losses\n\034\010\314\001\022\027non_trainable_variables\n\030\010\201\001\022\023trainable_variables\n\016\n\005\010\014\022\0010\n\005\010\r\022\0011\n\000\n\016\n\005\010\014\022\0010\n\005\010\r\022\0011\n\265\001\n\022\010\315\001\022\rlayer_metrics\n\016\010\203\001\022\tvariables\n \010\316\001\022\033layer_regularization_losses\n\014\010\317\001\022\007metrics\n\013\010\320\001\022\006layers\n\032\010\204\001\022\025regularization_losses\n\034\010\321\001\022\027non_trainable_variables\n\030\010\205\001\022\023trainable_variables\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000\n\000"
+ }
+ }
+ }
+ }
+ node {
+ name: "action_callee_basic_block_count"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_callee_conditionally_executed_blocks"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_callee_users"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_caller_basic_block_count"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_caller_conditionally_executed_blocks"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_caller_users"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_callsite_height"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_cost_estimate"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_discount"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_edge_count"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_inlining_default"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_node_count"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_nr_ctant_params"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_reward"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "action_step_type"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ node {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "action_callee_basic_block_count"
+ input: "action_callee_conditionally_executed_blocks"
+ input: "action_callee_users"
+ input: "action_caller_basic_block_count"
+ input: "action_caller_conditionally_executed_blocks"
+ input: "action_caller_users"
+ input: "action_callsite_height"
+ input: "action_cost_estimate"
+ input: "action_discount"
+ input: "action_edge_count"
+ input: "action_inlining_default"
+ input: "action_node_count"
+ input: "action_nr_ctant_params"
+ input: "action_reward"
+ input: "action_step_type"
+ input: "QNetwork/EncodingNetwork/dense/kernel"
+ input: "QNetwork/EncodingNetwork/dense/bias"
+ input: "QNetwork/EncodingNetwork/dense_1/kernel"
+ input: "QNetwork/EncodingNetwork/dense_1/bias"
+ input: "QNetwork/dense_2/kernel"
+ input: "QNetwork/dense_2/bias"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_FLOAT
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_FLOAT
+ type: DT_INT32
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 15
+ i: 16
+ i: 17
+ i: 18
+ i: 19
+ i: 20
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_signature_wrapper_4619026"
+ }
+ }
+ }
+ }
+ node {
+ name: "PartitionedCall"
+ op: "PartitionedCall"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_signature_wrapper_4619033"
+ }
+ }
+ }
+ }
+ node {
+ name: "StatefulPartitionedCall_1"
+ op: "StatefulPartitionedCall"
+ input: "train_step"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 0
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_signature_wrapper_4619048"
+ }
+ }
+ }
+ }
+ node {
+ name: "saver_filename"
+ op: "Placeholder"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "shape"
+ value {
+ shape {
+ }
+ }
+ }
+ }
+ node {
+ name: "StatefulPartitionedCall_2"
+ op: "StatefulPartitionedCall"
+ input: "saver_filename"
+ input: "train_step/Read/ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense/kernel/Read/ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense/bias/Read/ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense_1/kernel/Read/ReadVariableOp"
+ input: "QNetwork/EncodingNetwork/dense_1/bias/Read/ReadVariableOp"
+ input: "QNetwork/dense_2/kernel/Read/ReadVariableOp"
+ input: "QNetwork/dense_2/bias/Read/ReadVariableOp"
+ input: "Const"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_STRING
+ type: DT_INT64
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_STRING
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_STRING
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference__traced_save_4619143"
+ }
+ }
+ }
+ }
+ node {
+ name: "StatefulPartitionedCall_3"
+ op: "StatefulPartitionedCall"
+ input: "saver_filename"
+ input: "train_step"
+ input: "QNetwork/EncodingNetwork/dense/kernel"
+ input: "QNetwork/EncodingNetwork/dense/bias"
+ input: "QNetwork/EncodingNetwork/dense_1/kernel"
+ input: "QNetwork/EncodingNetwork/dense_1/bias"
+ input: "QNetwork/dense_2/kernel"
+ input: "QNetwork/dense_2/bias"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_STRING
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_STRING
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference__traced_restore_4619176"
+ }
+ }
+ }
+ }
+ library {
+ function {
+ signature {
+ name: "__inference_signature_wrapper_4619048"
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "unknown"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 0
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_function_with_signature_4619040"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_function_with_signature_4619029"
+ }
+ node_def {
+ name: "PartitionedCall"
+ op: "PartitionedCall"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_function_722"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "PartitionedCall"
+ }
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_action_931"
+ input_arg {
+ name: "time_step"
+ type: DT_INT32
+ }
+ input_arg {
+ name: "time_step_1"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "time_step_2"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "time_step_3"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_4"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_5"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_6"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_7"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_8"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_9"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_10"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_11"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_12"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_13"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_14"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "qnetwork_encodingnetwork_dense_matmul_readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "qnetwork_encodingnetwork_dense_biasadd_readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "qnetwork_encodingnetwork_dense_1_matmul_readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "qnetwork_encodingnetwork_dense_1_biasadd_readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "qnetwork_dense_2_matmul_readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "qnetwork_dense_2_biasadd_readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_3"
+ input: "QNetwork/EncodingNetwork/lambda/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 10
+ f: 10
+ f: 11
+ f: 12
+ f: 13
+ f: 14
+ f: 14
+ f: 14
+ f: 16
+ f: 17
+ f: 19
+ f: 23
+ f: 27
+ f: 39
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_4"
+ input: "QNetwork/EncodingNetwork/lambda_1/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_1/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 3
+ f: 3
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 9
+ f: 10
+ f: 10
+ f: 10
+ f: 12
+ f: 12
+ f: 12
+ f: 14
+ f: 14
+ f: 18
+ f: 20
+ f: 23
+ f: 30
+ f: 41
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_1/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_1/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_1/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_1/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_1/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_1/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_1/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_1/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_1/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_1/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_1/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_1/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_5"
+ input: "QNetwork/EncodingNetwork/lambda_2/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_2/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 23
+ f: 23
+ f: 23
+ f: 24
+ f: 24
+ f: 24
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 26
+ f: 26
+ f: 26
+ f: 27
+ f: 27
+ f: 27
+ f: 27
+ f: 28
+ f: 28
+ f: 29
+ f: 29
+ f: 29
+ f: 29
+ f: 30
+ f: 30
+ f: 31
+ f: 31
+ f: 31
+ f: 31
+ f: 32
+ f: 32
+ f: 33
+ f: 33
+ f: 33
+ f: 34
+ f: 34
+ f: 34
+ f: 34
+ f: 35
+ f: 35
+ f: 36
+ f: 36
+ f: 37
+ f: 37
+ f: 37
+ f: 38
+ f: 38
+ f: 39
+ f: 39
+ f: 40
+ f: 40
+ f: 41
+ f: 41
+ f: 41
+ f: 42
+ f: 43
+ f: 43
+ f: 44
+ f: 44
+ f: 45
+ f: 45
+ f: 46
+ f: 46
+ f: 46
+ f: 47
+ f: 47
+ f: 48
+ f: 49
+ f: 49
+ f: 50
+ f: 50
+ f: 51
+ f: 52
+ f: 53
+ f: 53
+ f: 54
+ f: 55
+ f: 56
+ f: 57
+ f: 57
+ f: 58
+ f: 59
+ f: 60
+ f: 61
+ f: 61
+ f: 63
+ f: 63
+ f: 64
+ f: 65
+ f: 66
+ f: 67
+ f: 67
+ f: 69
+ f: 70
+ f: 71
+ f: 72
+ f: 73
+ f: 74
+ f: 75
+ f: 77
+ f: 78
+ f: 79
+ f: 80
+ f: 81
+ f: 82
+ f: 83
+ f: 85
+ f: 86
+ f: 88
+ f: 89
+ f: 91
+ f: 92
+ f: 94
+ f: 96
+ f: 97
+ f: 99
+ f: 100
+ f: 101
+ f: 103
+ f: 105
+ f: 107
+ f: 109
+ f: 111
+ f: 113
+ f: 115
+ f: 118
+ f: 121
+ f: 123
+ f: 126
+ f: 128
+ f: 130
+ f: 133
+ f: 135
+ f: 137
+ f: 140
+ f: 143
+ f: 146
+ f: 148
+ f: 151
+ f: 154
+ f: 157
+ f: 161
+ f: 163
+ f: 166
+ f: 169
+ f: 173
+ f: 178
+ f: 183
+ f: 189
+ f: 193
+ f: 197
+ f: 202
+ f: 208
+ f: 213
+ f: 218
+ f: 223
+ f: 228
+ f: 233
+ f: 239
+ f: 245
+ f: 250
+ f: 257
+ f: 262
+ f: 269
+ f: 277
+ f: 284
+ f: 292
+ f: 300
+ f: 308
+ f: 319
+ f: 329
+ f: 340
+ f: 349
+ f: 359
+ f: 371
+ f: 382
+ f: 394
+ f: 410
+ f: 423
+ f: 435
+ f: 445
+ f: 462
+ f: 480
+ f: 492
+ f: 506
+ f: 519
+ f: 536
+ f: 557
+ f: 577
+ f: 598
+ f: 622
+ f: 655
+ f: 679
+ f: 707
+ f: 733
+ f: 751
+ f: 787
+ f: 814
+ f: 847
+ f: 897
+ f: 934
+ f: 997
+ f: 1062
+ f: 1111
+ f: 1181
+ f: 1275
+ f: 1385
+ f: 1465
+ f: 1603
+ f: 1769
+ f: 2057
+ f: 2257
+ f: 2803
+ f: 3468
+ f: 4417
+ f: 6538
+ f: 16126
+ f: 23446
+ f: 33536
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_2/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_2/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_2/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_2/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_2/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_2/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_2/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_2/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_2/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_2/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_2/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_2/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_6"
+ input: "QNetwork/EncodingNetwork/lambda_3/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_3/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 23
+ f: 23
+ f: 23
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 27
+ f: 27
+ f: 27
+ f: 27
+ f: 27
+ f: 28
+ f: 28
+ f: 28
+ f: 29
+ f: 29
+ f: 29
+ f: 29
+ f: 30
+ f: 30
+ f: 30
+ f: 31
+ f: 31
+ f: 31
+ f: 32
+ f: 32
+ f: 32
+ f: 33
+ f: 33
+ f: 33
+ f: 34
+ f: 34
+ f: 34
+ f: 34
+ f: 35
+ f: 35
+ f: 35
+ f: 36
+ f: 36
+ f: 36
+ f: 37
+ f: 37
+ f: 37
+ f: 38
+ f: 38
+ f: 38
+ f: 38
+ f: 39
+ f: 39
+ f: 40
+ f: 40
+ f: 41
+ f: 41
+ f: 42
+ f: 43
+ f: 43
+ f: 44
+ f: 45
+ f: 45
+ f: 46
+ f: 47
+ f: 47
+ f: 48
+ f: 49
+ f: 49
+ f: 50
+ f: 50
+ f: 52
+ f: 52
+ f: 53
+ f: 54
+ f: 55
+ f: 55
+ f: 57
+ f: 58
+ f: 59
+ f: 60
+ f: 62
+ f: 64
+ f: 65
+ f: 66
+ f: 68
+ f: 70
+ f: 70
+ f: 70
+ f: 70
+ f: 70
+ f: 71
+ f: 73
+ f: 75
+ f: 76
+ f: 78
+ f: 81
+ f: 84
+ f: 86
+ f: 90
+ f: 94
+ f: 98
+ f: 101
+ f: 106
+ f: 111
+ f: 117
+ f: 123
+ f: 130
+ f: 138
+ f: 146
+ f: 157
+ f: 163
+ f: 176
+ f: 187
+ f: 198
+ f: 214
+ f: 227
+ f: 252
+ f: 280
+ f: 327
+ f: 395
+ f: 506
+ f: 671
+ f: 1025
+ f: 1971
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_3/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_3/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_3/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_3/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_3/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_3/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_3/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_3/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_3/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_3/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_3/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_3/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_7"
+ input: "QNetwork/EncodingNetwork/lambda_4/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_4/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 5
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 11
+ f: 11
+ f: 11
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 13
+ f: 13
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 19
+ f: 19
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 21
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 25
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 27
+ f: 28
+ f: 28
+ f: 28
+ f: 28
+ f: 28
+ f: 29
+ f: 30
+ f: 30
+ f: 30
+ f: 30
+ f: 30
+ f: 30
+ f: 31
+ f: 32
+ f: 32
+ f: 32
+ f: 32
+ f: 32
+ f: 34
+ f: 34
+ f: 34
+ f: 34
+ f: 34
+ f: 34
+ f: 35
+ f: 36
+ f: 36
+ f: 36
+ f: 37
+ f: 38
+ f: 38
+ f: 38
+ f: 39
+ f: 40
+ f: 40
+ f: 41
+ f: 42
+ f: 42
+ f: 43
+ f: 44
+ f: 44
+ f: 46
+ f: 46
+ f: 47
+ f: 48
+ f: 48
+ f: 50
+ f: 50
+ f: 52
+ f: 52
+ f: 54
+ f: 55
+ f: 55
+ f: 56
+ f: 57
+ f: 58
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 62
+ f: 62
+ f: 64
+ f: 65
+ f: 66
+ f: 68
+ f: 70
+ f: 72
+ f: 74
+ f: 77
+ f: 80
+ f: 82
+ f: 86
+ f: 89
+ f: 92
+ f: 96
+ f: 99
+ f: 104
+ f: 108
+ f: 114
+ f: 119
+ f: 125
+ f: 131
+ f: 139
+ f: 146
+ f: 157
+ f: 167
+ f: 176
+ f: 188
+ f: 198
+ f: 215
+ f: 236
+ f: 262
+ f: 306
+ f: 376
+ f: 462
+ f: 596
+ f: 942
+ f: 1428
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_4/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_4/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_4/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_4/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_4/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_4/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_4/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_4/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_4/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_4/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_4/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_4/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_8"
+ input: "QNetwork/EncodingNetwork/lambda_5/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_5/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 10
+ f: 10
+ f: 11
+ f: 11
+ f: 12
+ f: 13
+ f: 14
+ f: 15
+ f: 16
+ f: 18
+ f: 20
+ f: 23
+ f: 29
+ f: 38
+ f: 60
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_5/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_5/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_5/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_5/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_5/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_5/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_5/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_5/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_5/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_5/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_5/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_5/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_9"
+ input: "QNetwork/EncodingNetwork/lambda_6/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_6/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 3
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 4
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 6
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 7
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 8
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 9
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 11
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 12
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 13
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 14
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 16
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 17
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 18
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 19
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 21
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 22
+ f: 23
+ f: 23
+ f: 23
+ f: 23
+ f: 23
+ f: 23
+ f: 23
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 24
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 26
+ f: 26
+ f: 26
+ f: 26
+ f: 27
+ f: 27
+ f: 27
+ f: 28
+ f: 28
+ f: 28
+ f: 29
+ f: 29
+ f: 30
+ f: 30
+ f: 30
+ f: 31
+ f: 31
+ f: 32
+ f: 32
+ f: 33
+ f: 33
+ f: 34
+ f: 35
+ f: 37
+ f: 38
+ f: 40
+ f: 46
+ f: 51
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_6/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_6/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_6/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_6/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_6/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_6/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_6/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_6/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_6/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_6/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_6/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_6/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_10"
+ input: "QNetwork/EncodingNetwork/lambda_7/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_7/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: -15035
+ f: -15030
+ f: -15025
+ f: -15000
+ f: -14985
+ f: -14945
+ f: -14745
+ f: -70
+ f: -55
+ f: -55
+ f: -50
+ f: -50
+ f: -50
+ f: -45
+ f: -45
+ f: -45
+ f: -45
+ f: -45
+ f: -45
+ f: -45
+ f: -45
+ f: -45
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -40
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -35
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -30
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -25
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -20
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -15
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -10
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: -5
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 5
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 10
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 15
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 20
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 25
+ f: 30
+ f: 30
+ f: 30
+ f: 30
+ f: 30
+ f: 30
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 35
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 40
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 45
+ f: 50
+ f: 50
+ f: 50
+ f: 50
+ f: 50
+ f: 50
+ f: 50
+ f: 50
+ f: 50
+ f: 55
+ f: 55
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 60
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 65
+ f: 70
+ f: 70
+ f: 70
+ f: 70
+ f: 70
+ f: 70
+ f: 70
+ f: 75
+ f: 75
+ f: 80
+ f: 80
+ f: 80
+ f: 85
+ f: 85
+ f: 85
+ f: 90
+ f: 90
+ f: 90
+ f: 90
+ f: 95
+ f: 95
+ f: 100
+ f: 100
+ f: 105
+ f: 110
+ f: 115
+ f: 120
+ f: 125
+ f: 125
+ f: 130
+ f: 140
+ f: 140
+ f: 145
+ f: 150
+ f: 155
+ f: 160
+ f: 160
+ f: 165
+ f: 170
+ f: 175
+ f: 180
+ f: 190
+ f: 200
+ f: 210
+ f: 215
+ f: 220
+ f: 220
+ f: 230
+ f: 235
+ f: 245
+ f: 250
+ f: 260
+ f: 275
+ f: 290
+ f: 305
+ f: 325
+ f: 350
+ f: 370
+ f: 390
+ f: 425
+ f: 460
+ f: 500
+ f: 560
+ f: 650
+ f: 790
+ f: 1025
+ f: 1600
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_7/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_7/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_7/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_7/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_7/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_7/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_7/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_7/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_7/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_7/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_7/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_7/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_11"
+ input: "QNetwork/EncodingNetwork/lambda_8/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_8/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 18
+ f: 29
+ f: 39
+ f: 48
+ f: 57
+ f: 64
+ f: 70
+ f: 76
+ f: 82
+ f: 87
+ f: 92
+ f: 97
+ f: 101
+ f: 105
+ f: 109
+ f: 113
+ f: 116
+ f: 120
+ f: 123
+ f: 127
+ f: 130
+ f: 134
+ f: 137
+ f: 140
+ f: 143
+ f: 146
+ f: 149
+ f: 152
+ f: 156
+ f: 159
+ f: 162
+ f: 165
+ f: 168
+ f: 171
+ f: 174
+ f: 177
+ f: 180
+ f: 183
+ f: 186
+ f: 188
+ f: 191
+ f: 194
+ f: 197
+ f: 200
+ f: 203
+ f: 205
+ f: 208
+ f: 211
+ f: 214
+ f: 217
+ f: 219
+ f: 222
+ f: 225
+ f: 228
+ f: 231
+ f: 233
+ f: 236
+ f: 239
+ f: 242
+ f: 244
+ f: 247
+ f: 250
+ f: 253
+ f: 255
+ f: 258
+ f: 261
+ f: 264
+ f: 266
+ f: 269
+ f: 272
+ f: 275
+ f: 278
+ f: 280
+ f: 283
+ f: 286
+ f: 289
+ f: 292
+ f: 294
+ f: 297
+ f: 300
+ f: 303
+ f: 305
+ f: 308
+ f: 311
+ f: 314
+ f: 317
+ f: 319
+ f: 322
+ f: 325
+ f: 327
+ f: 330
+ f: 333
+ f: 336
+ f: 339
+ f: 341
+ f: 344
+ f: 347
+ f: 350
+ f: 353
+ f: 355
+ f: 358
+ f: 361
+ f: 364
+ f: 367
+ f: 370
+ f: 373
+ f: 375
+ f: 378
+ f: 381
+ f: 384
+ f: 387
+ f: 390
+ f: 393
+ f: 396
+ f: 399
+ f: 401
+ f: 404
+ f: 407
+ f: 410
+ f: 413
+ f: 416
+ f: 419
+ f: 422
+ f: 425
+ f: 428
+ f: 431
+ f: 434
+ f: 437
+ f: 440
+ f: 443
+ f: 446
+ f: 449
+ f: 452
+ f: 455
+ f: 458
+ f: 461
+ f: 464
+ f: 467
+ f: 470
+ f: 473
+ f: 476
+ f: 479
+ f: 483
+ f: 486
+ f: 489
+ f: 492
+ f: 495
+ f: 498
+ f: 501
+ f: 504
+ f: 507
+ f: 511
+ f: 514
+ f: 517
+ f: 520
+ f: 523
+ f: 526
+ f: 530
+ f: 533
+ f: 536
+ f: 539
+ f: 542
+ f: 545
+ f: 549
+ f: 552
+ f: 555
+ f: 558
+ f: 562
+ f: 565
+ f: 569
+ f: 572
+ f: 575
+ f: 579
+ f: 582
+ f: 585
+ f: 589
+ f: 592
+ f: 595
+ f: 599
+ f: 602
+ f: 605
+ f: 609
+ f: 612
+ f: 616
+ f: 620
+ f: 623
+ f: 626
+ f: 630
+ f: 634
+ f: 637
+ f: 641
+ f: 644
+ f: 648
+ f: 651
+ f: 655
+ f: 658
+ f: 662
+ f: 665
+ f: 669
+ f: 672
+ f: 676
+ f: 680
+ f: 683
+ f: 687
+ f: 691
+ f: 694
+ f: 698
+ f: 702
+ f: 705
+ f: 709
+ f: 712
+ f: 716
+ f: 720
+ f: 724
+ f: 727
+ f: 731
+ f: 735
+ f: 739
+ f: 742
+ f: 746
+ f: 750
+ f: 754
+ f: 758
+ f: 761
+ f: 765
+ f: 769
+ f: 773
+ f: 777
+ f: 780
+ f: 784
+ f: 788
+ f: 792
+ f: 796
+ f: 800
+ f: 804
+ f: 808
+ f: 812
+ f: 816
+ f: 820
+ f: 823
+ f: 828
+ f: 832
+ f: 836
+ f: 840
+ f: 844
+ f: 848
+ f: 852
+ f: 856
+ f: 860
+ f: 864
+ f: 868
+ f: 873
+ f: 877
+ f: 881
+ f: 885
+ f: 889
+ f: 893
+ f: 897
+ f: 902
+ f: 906
+ f: 910
+ f: 914
+ f: 919
+ f: 923
+ f: 927
+ f: 931
+ f: 935
+ f: 940
+ f: 944
+ f: 948
+ f: 953
+ f: 957
+ f: 962
+ f: 966
+ f: 970
+ f: 975
+ f: 979
+ f: 984
+ f: 988
+ f: 993
+ f: 997
+ f: 1002
+ f: 1006
+ f: 1011
+ f: 1015
+ f: 1020
+ f: 1024
+ f: 1029
+ f: 1034
+ f: 1038
+ f: 1043
+ f: 1047
+ f: 1052
+ f: 1057
+ f: 1062
+ f: 1066
+ f: 1071
+ f: 1076
+ f: 1081
+ f: 1086
+ f: 1090
+ f: 1095
+ f: 1100
+ f: 1105
+ f: 1110
+ f: 1114
+ f: 1119
+ f: 1124
+ f: 1129
+ f: 1134
+ f: 1139
+ f: 1144
+ f: 1149
+ f: 1154
+ f: 1159
+ f: 1164
+ f: 1169
+ f: 1174
+ f: 1179
+ f: 1184
+ f: 1189
+ f: 1194
+ f: 1199
+ f: 1204
+ f: 1209
+ f: 1215
+ f: 1220
+ f: 1225
+ f: 1230
+ f: 1235
+ f: 1241
+ f: 1246
+ f: 1251
+ f: 1257
+ f: 1262
+ f: 1267
+ f: 1273
+ f: 1278
+ f: 1284
+ f: 1289
+ f: 1294
+ f: 1300
+ f: 1305
+ f: 1311
+ f: 1316
+ f: 1322
+ f: 1327
+ f: 1333
+ f: 1338
+ f: 1344
+ f: 1350
+ f: 1355
+ f: 1361
+ f: 1367
+ f: 1372
+ f: 1378
+ f: 1383
+ f: 1389
+ f: 1395
+ f: 1401
+ f: 1407
+ f: 1413
+ f: 1418
+ f: 1424
+ f: 1430
+ f: 1436
+ f: 1442
+ f: 1448
+ f: 1454
+ f: 1459
+ f: 1465
+ f: 1472
+ f: 1477
+ f: 1483
+ f: 1489
+ f: 1495
+ f: 1501
+ f: 1507
+ f: 1514
+ f: 1520
+ f: 1526
+ f: 1532
+ f: 1538
+ f: 1545
+ f: 1551
+ f: 1557
+ f: 1564
+ f: 1570
+ f: 1576
+ f: 1583
+ f: 1589
+ f: 1596
+ f: 1602
+ f: 1608
+ f: 1615
+ f: 1621
+ f: 1628
+ f: 1634
+ f: 1641
+ f: 1647
+ f: 1654
+ f: 1661
+ f: 1667
+ f: 1674
+ f: 1681
+ f: 1687
+ f: 1694
+ f: 1701
+ f: 1708
+ f: 1715
+ f: 1722
+ f: 1729
+ f: 1735
+ f: 1742
+ f: 1749
+ f: 1756
+ f: 1763
+ f: 1770
+ f: 1777
+ f: 1784
+ f: 1791
+ f: 1798
+ f: 1806
+ f: 1812
+ f: 1820
+ f: 1827
+ f: 1835
+ f: 1841
+ f: 1849
+ f: 1856
+ f: 1863
+ f: 1871
+ f: 1878
+ f: 1885
+ f: 1893
+ f: 1901
+ f: 1908
+ f: 1915
+ f: 1923
+ f: 1930
+ f: 1938
+ f: 1946
+ f: 1953
+ f: 1961
+ f: 1969
+ f: 1976
+ f: 1984
+ f: 1992
+ f: 2000
+ f: 2007
+ f: 2015
+ f: 2023
+ f: 2031
+ f: 2039
+ f: 2047
+ f: 2055
+ f: 2063
+ f: 2071
+ f: 2079
+ f: 2087
+ f: 2095
+ f: 2104
+ f: 2112
+ f: 2120
+ f: 2128
+ f: 2137
+ f: 2146
+ f: 2154
+ f: 2162
+ f: 2171
+ f: 2179
+ f: 2188
+ f: 2197
+ f: 2205
+ f: 2214
+ f: 2223
+ f: 2232
+ f: 2241
+ f: 2250
+ f: 2258
+ f: 2268
+ f: 2277
+ f: 2285
+ f: 2294
+ f: 2304
+ f: 2313
+ f: 2322
+ f: 2331
+ f: 2340
+ f: 2350
+ f: 2359
+ f: 2368
+ f: 2378
+ f: 2388
+ f: 2397
+ f: 2407
+ f: 2416
+ f: 2426
+ f: 2436
+ f: 2446
+ f: 2455
+ f: 2465
+ f: 2475
+ f: 2485
+ f: 2495
+ f: 2505
+ f: 2515
+ f: 2525
+ f: 2535
+ f: 2545
+ f: 2556
+ f: 2566
+ f: 2577
+ f: 2587
+ f: 2598
+ f: 2609
+ f: 2620
+ f: 2631
+ f: 2641
+ f: 2652
+ f: 2663
+ f: 2674
+ f: 2685
+ f: 2696
+ f: 2708
+ f: 2719
+ f: 2730
+ f: 2742
+ f: 2753
+ f: 2764
+ f: 2776
+ f: 2788
+ f: 2799
+ f: 2811
+ f: 2823
+ f: 2835
+ f: 2847
+ f: 2858
+ f: 2870
+ f: 2882
+ f: 2894
+ f: 2906
+ f: 2919
+ f: 2931
+ f: 2943
+ f: 2956
+ f: 2968
+ f: 2981
+ f: 2994
+ f: 3006
+ f: 3019
+ f: 3032
+ f: 3045
+ f: 3058
+ f: 3070
+ f: 3083
+ f: 3096
+ f: 3109
+ f: 3121
+ f: 3134
+ f: 3148
+ f: 3161
+ f: 3174
+ f: 3187
+ f: 3200
+ f: 3214
+ f: 3228
+ f: 3242
+ f: 3255
+ f: 3268
+ f: 3283
+ f: 3297
+ f: 3310
+ f: 3325
+ f: 3340
+ f: 3353
+ f: 3368
+ f: 3383
+ f: 3398
+ f: 3412
+ f: 3427
+ f: 3442
+ f: 3457
+ f: 3471
+ f: 3487
+ f: 3502
+ f: 3516
+ f: 3531
+ f: 3546
+ f: 3561
+ f: 3577
+ f: 3593
+ f: 3608
+ f: 3625
+ f: 3641
+ f: 3657
+ f: 3673
+ f: 3690
+ f: 3706
+ f: 3722
+ f: 3738
+ f: 3755
+ f: 3772
+ f: 3789
+ f: 3805
+ f: 3823
+ f: 3839
+ f: 3856
+ f: 3873
+ f: 3891
+ f: 3908
+ f: 3926
+ f: 3944
+ f: 3960
+ f: 3977
+ f: 3995
+ f: 4013
+ f: 4031
+ f: 4048
+ f: 4067
+ f: 4085
+ f: 4104
+ f: 4122
+ f: 4140
+ f: 4159
+ f: 4177
+ f: 4196
+ f: 4215
+ f: 4234
+ f: 4253
+ f: 4272
+ f: 4291
+ f: 4311
+ f: 4332
+ f: 4351
+ f: 4371
+ f: 4391
+ f: 4412
+ f: 4433
+ f: 4454
+ f: 4474
+ f: 4496
+ f: 4518
+ f: 4538
+ f: 4558
+ f: 4579
+ f: 4601
+ f: 4619
+ f: 4640
+ f: 4662
+ f: 4684
+ f: 4706
+ f: 4728
+ f: 4751
+ f: 4771
+ f: 4794
+ f: 4818
+ f: 4840
+ f: 4863
+ f: 4887
+ f: 4910
+ f: 4933
+ f: 4956
+ f: 4980
+ f: 5004
+ f: 5028
+ f: 5052
+ f: 5076
+ f: 5100
+ f: 5125
+ f: 5152
+ f: 5175
+ f: 5200
+ f: 5226
+ f: 5251
+ f: 5278
+ f: 5304
+ f: 5329
+ f: 5354
+ f: 5381
+ f: 5407
+ f: 5433
+ f: 5460
+ f: 5488
+ f: 5516
+ f: 5544
+ f: 5573
+ f: 5600
+ f: 5628
+ f: 5656
+ f: 5684
+ f: 5713
+ f: 5741
+ f: 5771
+ f: 5799
+ f: 5830
+ f: 5860
+ f: 5891
+ f: 5921
+ f: 5951
+ f: 5980
+ f: 6010
+ f: 6041
+ f: 6073
+ f: 6105
+ f: 6133
+ f: 6163
+ f: 6195
+ f: 6227
+ f: 6258
+ f: 6291
+ f: 6322
+ f: 6356
+ f: 6390
+ f: 6424
+ f: 6457
+ f: 6491
+ f: 6527
+ f: 6561
+ f: 6596
+ f: 6631
+ f: 6665
+ f: 6701
+ f: 6736
+ f: 6771
+ f: 6805
+ f: 6840
+ f: 6877
+ f: 6911
+ f: 6947
+ f: 6985
+ f: 7022
+ f: 7059
+ f: 7097
+ f: 7135
+ f: 7174
+ f: 7212
+ f: 7251
+ f: 7289
+ f: 7327
+ f: 7366
+ f: 7406
+ f: 7447
+ f: 7486
+ f: 7525
+ f: 7566
+ f: 7606
+ f: 7646
+ f: 7688
+ f: 7728
+ f: 7771
+ f: 7814
+ f: 7859
+ f: 7901
+ f: 7949
+ f: 7992
+ f: 8036
+ f: 8082
+ f: 8127
+ f: 8173
+ f: 8218
+ f: 8262
+ f: 8309
+ f: 8353
+ f: 8397
+ f: 8444
+ f: 8489
+ f: 8539
+ f: 8585
+ f: 8632
+ f: 8682
+ f: 8727
+ f: 8777
+ f: 8828
+ f: 8879
+ f: 8929
+ f: 8982
+ f: 9037
+ f: 9087
+ f: 9140
+ f: 9193
+ f: 9250
+ f: 9305
+ f: 9361
+ f: 9418
+ f: 9475
+ f: 9532
+ f: 9589
+ f: 9644
+ f: 9699
+ f: 9758
+ f: 9818
+ f: 9875
+ f: 9935
+ f: 9997
+ f: 10057
+ f: 10117
+ f: 10174
+ f: 10232
+ f: 10296
+ f: 10356
+ f: 10419
+ f: 10482
+ f: 10546
+ f: 10608
+ f: 10670
+ f: 10729
+ f: 10790
+ f: 10855
+ f: 10920
+ f: 10990
+ f: 11054
+ f: 11118
+ f: 11181
+ f: 11248
+ f: 11316
+ f: 11385
+ f: 11454
+ f: 11526
+ f: 11597
+ f: 11667
+ f: 11740
+ f: 11820
+ f: 11897
+ f: 11973
+ f: 12046
+ f: 12126
+ f: 12204
+ f: 12287
+ f: 12370
+ f: 12456
+ f: 12538
+ f: 12627
+ f: 12714
+ f: 12799
+ f: 12883
+ f: 12971
+ f: 13062
+ f: 13154
+ f: 13233
+ f: 13328
+ f: 13418
+ f: 13511
+ f: 13607
+ f: 13709
+ f: 13806
+ f: 13903
+ f: 14002
+ f: 14104
+ f: 14200
+ f: 14288
+ f: 14391
+ f: 14488
+ f: 14590
+ f: 14698
+ f: 14808
+ f: 14910
+ f: 15020
+ f: 15126
+ f: 15238
+ f: 15347
+ f: 15456
+ f: 15574
+ f: 15692
+ f: 15786
+ f: 15896
+ f: 16016
+ f: 16136
+ f: 16250
+ f: 16352
+ f: 16474
+ f: 16575
+ f: 16702
+ f: 16835
+ f: 16965
+ f: 17096
+ f: 17232
+ f: 17370
+ f: 17443
+ f: 17581
+ f: 17719
+ f: 17864
+ f: 17976
+ f: 18116
+ f: 18250
+ f: 18396
+ f: 18540
+ f: 18690
+ f: 18840
+ f: 18989
+ f: 19136
+ f: 19294
+ f: 19445
+ f: 19589
+ f: 19750
+ f: 19905
+ f: 20064
+ f: 20191
+ f: 20325
+ f: 20497
+ f: 20662
+ f: 20833
+ f: 20981
+ f: 21152
+ f: 21334
+ f: 21510
+ f: 21642
+ f: 21821
+ f: 22001
+ f: 22186
+ f: 22379
+ f: 22568
+ f: 22770
+ f: 22958
+ f: 23162
+ f: 23360
+ f: 23524
+ f: 23737
+ f: 23960
+ f: 24175
+ f: 24395
+ f: 24631
+ f: 24865
+ f: 25091
+ f: 25327
+ f: 25580
+ f: 25833
+ f: 26089
+ f: 26361
+ f: 26636
+ f: 26889
+ f: 27155
+ f: 27436
+ f: 27715
+ f: 28003
+ f: 28303
+ f: 28600
+ f: 28916
+ f: 29223
+ f: 29553
+ f: 29884
+ f: 30200
+ f: 30538
+ f: 30868
+ f: 31211
+ f: 31548
+ f: 31881
+ f: 32253
+ f: 32605
+ f: 32980
+ f: 33385
+ f: 33805
+ f: 34254
+ f: 34723
+ f: 35167
+ f: 35666
+ f: 36125
+ f: 36652
+ f: 37177
+ f: 37739
+ f: 38321
+ f: 38932
+ f: 39640
+ f: 40337
+ f: 41000
+ f: 41626
+ f: 42385
+ f: 43122
+ f: 43890
+ f: 44687
+ f: 45609
+ f: 46520
+ f: 47489
+ f: 48432
+ f: 49458
+ f: 50511
+ f: 51561
+ f: 52568
+ f: 53676
+ f: 54936
+ f: 56071
+ f: 57302
+ f: 58513
+ f: 59800
+ f: 61192
+ f: 62702
+ f: 64205
+ f: 65868
+ f: 67780
+ f: 69960
+ f: 72330
+ f: 74918
+ f: 77540
+ f: 80344
+ f: 83727
+ f: 87662
+ f: 93589
+ f: 101441
+ f: 110544
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_8/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_8/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_8/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_8/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_8/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_8/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_8/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_8/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_8/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_8/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_8/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_8/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_9/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_9/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_9/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_12"
+ input: "QNetwork/EncodingNetwork/lambda_9/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_9/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_9/zeros_like"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ float_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_9/zeros_like"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_13"
+ input: "QNetwork/EncodingNetwork/lambda_10/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_10/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 13
+ f: 38
+ f: 56
+ f: 70
+ f: 82
+ f: 94
+ f: 104
+ f: 114
+ f: 123
+ f: 131
+ f: 139
+ f: 148
+ f: 152
+ f: 153
+ f: 158
+ f: 163
+ f: 170
+ f: 174
+ f: 178
+ f: 180
+ f: 183
+ f: 186
+ f: 188
+ f: 190
+ f: 192
+ f: 196
+ f: 198
+ f: 201
+ f: 205
+ f: 208
+ f: 212
+ f: 215
+ f: 219
+ f: 221
+ f: 225
+ f: 227
+ f: 229
+ f: 232
+ f: 233
+ f: 236
+ f: 239
+ f: 242
+ f: 245
+ f: 248
+ f: 250
+ f: 252
+ f: 254
+ f: 256
+ f: 259
+ f: 261
+ f: 264
+ f: 267
+ f: 270
+ f: 272
+ f: 275
+ f: 278
+ f: 280
+ f: 283
+ f: 285
+ f: 287
+ f: 290
+ f: 293
+ f: 295
+ f: 297
+ f: 300
+ f: 303
+ f: 305
+ f: 308
+ f: 311
+ f: 313
+ f: 316
+ f: 319
+ f: 322
+ f: 325
+ f: 329
+ f: 331
+ f: 333
+ f: 336
+ f: 338
+ f: 340
+ f: 343
+ f: 345
+ f: 347
+ f: 347
+ f: 349
+ f: 351
+ f: 353
+ f: 355
+ f: 357
+ f: 359
+ f: 361
+ f: 363
+ f: 365
+ f: 368
+ f: 369
+ f: 371
+ f: 373
+ f: 375
+ f: 377
+ f: 380
+ f: 382
+ f: 385
+ f: 387
+ f: 389
+ f: 391
+ f: 394
+ f: 396
+ f: 398
+ f: 400
+ f: 403
+ f: 405
+ f: 408
+ f: 410
+ f: 412
+ f: 415
+ f: 417
+ f: 420
+ f: 422
+ f: 425
+ f: 427
+ f: 429
+ f: 432
+ f: 434
+ f: 437
+ f: 439
+ f: 442
+ f: 444
+ f: 446
+ f: 449
+ f: 451
+ f: 454
+ f: 456
+ f: 458
+ f: 461
+ f: 463
+ f: 466
+ f: 469
+ f: 472
+ f: 474
+ f: 476
+ f: 479
+ f: 482
+ f: 483
+ f: 486
+ f: 489
+ f: 492
+ f: 495
+ f: 498
+ f: 500
+ f: 503
+ f: 505
+ f: 508
+ f: 510
+ f: 513
+ f: 516
+ f: 519
+ f: 522
+ f: 524
+ f: 528
+ f: 530
+ f: 533
+ f: 536
+ f: 539
+ f: 541
+ f: 544
+ f: 547
+ f: 550
+ f: 553
+ f: 556
+ f: 559
+ f: 561
+ f: 563
+ f: 567
+ f: 570
+ f: 572
+ f: 575
+ f: 577
+ f: 580
+ f: 584
+ f: 586
+ f: 589
+ f: 592
+ f: 595
+ f: 598
+ f: 601
+ f: 605
+ f: 607
+ f: 611
+ f: 613
+ f: 617
+ f: 620
+ f: 623
+ f: 626
+ f: 629
+ f: 632
+ f: 635
+ f: 639
+ f: 642
+ f: 645
+ f: 648
+ f: 651
+ f: 654
+ f: 657
+ f: 660
+ f: 662
+ f: 666
+ f: 669
+ f: 672
+ f: 676
+ f: 679
+ f: 682
+ f: 685
+ f: 688
+ f: 690
+ f: 693
+ f: 696
+ f: 699
+ f: 702
+ f: 705
+ f: 709
+ f: 712
+ f: 714
+ f: 718
+ f: 721
+ f: 724
+ f: 726
+ f: 728
+ f: 729
+ f: 731
+ f: 734
+ f: 737
+ f: 741
+ f: 745
+ f: 748
+ f: 750
+ f: 753
+ f: 756
+ f: 760
+ f: 763
+ f: 766
+ f: 770
+ f: 773
+ f: 776
+ f: 779
+ f: 782
+ f: 786
+ f: 788
+ f: 793
+ f: 796
+ f: 798
+ f: 802
+ f: 805
+ f: 808
+ f: 811
+ f: 815
+ f: 818
+ f: 820
+ f: 824
+ f: 827
+ f: 829
+ f: 832
+ f: 835
+ f: 838
+ f: 842
+ f: 846
+ f: 849
+ f: 854
+ f: 857
+ f: 860
+ f: 864
+ f: 867
+ f: 871
+ f: 875
+ f: 879
+ f: 882
+ f: 887
+ f: 890
+ f: 893
+ f: 897
+ f: 901
+ f: 905
+ f: 908
+ f: 911
+ f: 915
+ f: 918
+ f: 921
+ f: 925
+ f: 929
+ f: 932
+ f: 934
+ f: 937
+ f: 940
+ f: 943
+ f: 946
+ f: 950
+ f: 953
+ f: 956
+ f: 961
+ f: 965
+ f: 969
+ f: 973
+ f: 976
+ f: 980
+ f: 982
+ f: 985
+ f: 990
+ f: 994
+ f: 997
+ f: 1001
+ f: 1005
+ f: 1007
+ f: 1010
+ f: 1014
+ f: 1018
+ f: 1022
+ f: 1025
+ f: 1028
+ f: 1033
+ f: 1035
+ f: 1038
+ f: 1042
+ f: 1047
+ f: 1052
+ f: 1056
+ f: 1060
+ f: 1063
+ f: 1067
+ f: 1071
+ f: 1075
+ f: 1079
+ f: 1083
+ f: 1086
+ f: 1088
+ f: 1092
+ f: 1097
+ f: 1102
+ f: 1106
+ f: 1109
+ f: 1113
+ f: 1117
+ f: 1120
+ f: 1125
+ f: 1129
+ f: 1134
+ f: 1137
+ f: 1142
+ f: 1146
+ f: 1150
+ f: 1151
+ f: 1155
+ f: 1159
+ f: 1162
+ f: 1166
+ f: 1170
+ f: 1174
+ f: 1177
+ f: 1181
+ f: 1185
+ f: 1188
+ f: 1193
+ f: 1196
+ f: 1203
+ f: 1207
+ f: 1212
+ f: 1214
+ f: 1217
+ f: 1220
+ f: 1222
+ f: 1222
+ f: 1226
+ f: 1229
+ f: 1233
+ f: 1237
+ f: 1241
+ f: 1246
+ f: 1250
+ f: 1253
+ f: 1257
+ f: 1262
+ f: 1267
+ f: 1272
+ f: 1278
+ f: 1283
+ f: 1287
+ f: 1293
+ f: 1297
+ f: 1301
+ f: 1304
+ f: 1309
+ f: 1315
+ f: 1320
+ f: 1325
+ f: 1329
+ f: 1333
+ f: 1336
+ f: 1341
+ f: 1344
+ f: 1348
+ f: 1351
+ f: 1357
+ f: 1363
+ f: 1368
+ f: 1374
+ f: 1379
+ f: 1383
+ f: 1386
+ f: 1391
+ f: 1395
+ f: 1399
+ f: 1403
+ f: 1407
+ f: 1410
+ f: 1415
+ f: 1418
+ f: 1423
+ f: 1428
+ f: 1432
+ f: 1436
+ f: 1438
+ f: 1442
+ f: 1446
+ f: 1450
+ f: 1454
+ f: 1462
+ f: 1467
+ f: 1472
+ f: 1477
+ f: 1483
+ f: 1488
+ f: 1492
+ f: 1496
+ f: 1503
+ f: 1508
+ f: 1513
+ f: 1518
+ f: 1520
+ f: 1526
+ f: 1531
+ f: 1534
+ f: 1538
+ f: 1542
+ f: 1546
+ f: 1552
+ f: 1558
+ f: 1564
+ f: 1568
+ f: 1573
+ f: 1578
+ f: 1581
+ f: 1590
+ f: 1596
+ f: 1601
+ f: 1606
+ f: 1611
+ f: 1616
+ f: 1622
+ f: 1629
+ f: 1634
+ f: 1640
+ f: 1647
+ f: 1651
+ f: 1657
+ f: 1660
+ f: 1665
+ f: 1672
+ f: 1678
+ f: 1686
+ f: 1692
+ f: 1698
+ f: 1704
+ f: 1709
+ f: 1714
+ f: 1719
+ f: 1724
+ f: 1730
+ f: 1737
+ f: 1744
+ f: 1751
+ f: 1755
+ f: 1761
+ f: 1764
+ f: 1772
+ f: 1778
+ f: 1784
+ f: 1789
+ f: 1799
+ f: 1804
+ f: 1811
+ f: 1819
+ f: 1825
+ f: 1830
+ f: 1838
+ f: 1849
+ f: 1858
+ f: 1862
+ f: 1868
+ f: 1872
+ f: 1878
+ f: 1885
+ f: 1888
+ f: 1892
+ f: 1897
+ f: 1902
+ f: 1907
+ f: 1919
+ f: 1926
+ f: 1932
+ f: 1936
+ f: 1941
+ f: 1946
+ f: 1952
+ f: 1960
+ f: 1968
+ f: 1977
+ f: 1985
+ f: 1992
+ f: 1997
+ f: 2006
+ f: 2012
+ f: 2018
+ f: 2026
+ f: 2034
+ f: 2044
+ f: 2050
+ f: 2057
+ f: 2064
+ f: 2069
+ f: 2075
+ f: 2082
+ f: 2091
+ f: 2098
+ f: 2107
+ f: 2122
+ f: 2126
+ f: 2135
+ f: 2146
+ f: 2149
+ f: 2157
+ f: 2163
+ f: 2172
+ f: 2178
+ f: 2184
+ f: 2191
+ f: 2198
+ f: 2208
+ f: 2216
+ f: 2223
+ f: 2235
+ f: 2242
+ f: 2252
+ f: 2263
+ f: 2272
+ f: 2277
+ f: 2288
+ f: 2296
+ f: 2306
+ f: 2311
+ f: 2318
+ f: 2323
+ f: 2334
+ f: 2341
+ f: 2356
+ f: 2366
+ f: 2373
+ f: 2379
+ f: 2386
+ f: 2407
+ f: 2416
+ f: 2423
+ f: 2432
+ f: 2438
+ f: 2448
+ f: 2453
+ f: 2464
+ f: 2473
+ f: 2473
+ f: 2481
+ f: 2492
+ f: 2504
+ f: 2511
+ f: 2523
+ f: 2529
+ f: 2537
+ f: 2545
+ f: 2556
+ f: 2566
+ f: 2575
+ f: 2584
+ f: 2592
+ f: 2602
+ f: 2613
+ f: 2624
+ f: 2636
+ f: 2643
+ f: 2647
+ f: 2652
+ f: 2664
+ f: 2675
+ f: 2688
+ f: 2693
+ f: 2702
+ f: 2709
+ f: 2722
+ f: 2739
+ f: 2754
+ f: 2766
+ f: 2776
+ f: 2786
+ f: 2799
+ f: 2810
+ f: 2832
+ f: 2840
+ f: 2849
+ f: 2860
+ f: 2873
+ f: 2889
+ f: 2908
+ f: 2914
+ f: 2926
+ f: 2939
+ f: 2950
+ f: 2961
+ f: 2969
+ f: 2978
+ f: 2990
+ f: 2999
+ f: 3023
+ f: 3032
+ f: 3049
+ f: 3066
+ f: 3085
+ f: 3101
+ f: 3107
+ f: 3117
+ f: 3129
+ f: 3144
+ f: 3167
+ f: 3190
+ f: 3212
+ f: 3229
+ f: 3238
+ f: 3264
+ f: 3293
+ f: 3302
+ f: 3309
+ f: 3314
+ f: 3323
+ f: 3344
+ f: 3352
+ f: 3362
+ f: 3390
+ f: 3400
+ f: 3411
+ f: 3435
+ f: 3456
+ f: 3470
+ f: 3485
+ f: 3498
+ f: 3505
+ f: 3519
+ f: 3539
+ f: 3545
+ f: 3545
+ f: 3560
+ f: 3576
+ f: 3597
+ f: 3607
+ f: 3621
+ f: 3641
+ f: 3665
+ f: 3679
+ f: 3701
+ f: 3714
+ f: 3733
+ f: 3741
+ f: 3745
+ f: 3757
+ f: 3773
+ f: 3787
+ f: 3795
+ f: 3805
+ f: 3822
+ f: 3835
+ f: 3844
+ f: 3861
+ f: 3872
+ f: 3878
+ f: 3897
+ f: 3919
+ f: 3941
+ f: 3971
+ f: 4004
+ f: 4014
+ f: 4019
+ f: 4061
+ f: 4068
+ f: 4089
+ f: 4108
+ f: 4117
+ f: 4125
+ f: 4146
+ f: 4165
+ f: 4194
+ f: 4204
+ f: 4224
+ f: 4236
+ f: 4263
+ f: 4290
+ f: 4301
+ f: 4319
+ f: 4326
+ f: 4347
+ f: 4369
+ f: 4386
+ f: 4413
+ f: 4435
+ f: 4451
+ f: 4451
+ f: 4451
+ f: 4476
+ f: 4500
+ f: 4539
+ f: 4579
+ f: 4592
+ f: 4600
+ f: 4622
+ f: 4650
+ f: 4683
+ f: 4714
+ f: 4742
+ f: 4755
+ f: 4771
+ f: 4788
+ f: 4816
+ f: 4828
+ f: 4831
+ f: 4831
+ f: 4831
+ f: 4843
+ f: 4852
+ f: 4865
+ f: 4896
+ f: 4915
+ f: 4931
+ f: 4952
+ f: 4965
+ f: 4983
+ f: 5007
+ f: 5043
+ f: 5061
+ f: 5081
+ f: 5095
+ f: 5122
+ f: 5143
+ f: 5171
+ f: 5204
+ f: 5226
+ f: 5233
+ f: 5250
+ f: 5281
+ f: 5320
+ f: 5323
+ f: 5328
+ f: 5345
+ f: 5374
+ f: 5413
+ f: 5466
+ f: 5492
+ f: 5524
+ f: 5555
+ f: 5567
+ f: 5610
+ f: 5676
+ f: 5701
+ f: 5716
+ f: 5744
+ f: 5768
+ f: 5795
+ f: 5818
+ f: 5854
+ f: 5906
+ f: 5934
+ f: 5960
+ f: 5975
+ f: 5993
+ f: 6025
+ f: 6034
+ f: 6051
+ f: 6082
+ f: 6106
+ f: 6125
+ f: 6159
+ f: 6187
+ f: 6242
+ f: 6287
+ f: 6311
+ f: 6332
+ f: 6348
+ f: 6358
+ f: 6368
+ f: 6377
+ f: 6402
+ f: 6407
+ f: 6428
+ f: 6450
+ f: 6475
+ f: 6498
+ f: 6505
+ f: 6533
+ f: 6565
+ f: 6580
+ f: 6595
+ f: 6611
+ f: 6654
+ f: 6658
+ f: 6705
+ f: 6751
+ f: 6786
+ f: 6828
+ f: 6876
+ f: 6896
+ f: 6948
+ f: 6964
+ f: 7065
+ f: 7082
+ f: 7118
+ f: 7184
+ f: 7214
+ f: 7271
+ f: 7310
+ f: 7357
+ f: 7405
+ f: 7506
+ f: 7613
+ f: 7641
+ f: 7675
+ f: 7720
+ f: 7781
+ f: 7833
+ f: 7860
+ f: 7898
+ f: 7929
+ f: 8044
+ f: 8104
+ f: 8148
+ f: 8236
+ f: 8273
+ f: 8313
+ f: 8349
+ f: 8381
+ f: 8409
+ f: 8498
+ f: 8507
+ f: 8524
+ f: 8570
+ f: 8607
+ f: 8630
+ f: 8637
+ f: 8675
+ f: 8700
+ f: 8714
+ f: 8734
+ f: 8776
+ f: 8836
+ f: 8854
+ f: 8867
+ f: 8868
+ f: 9065
+ f: 9113
+ f: 9121
+ f: 9241
+ f: 9357
+ f: 9360
+ f: 9585
+ f: 9613
+ f: 9684
+ f: 9727
+ f: 9751
+ f: 9777
+ f: 9802
+ f: 9889
+ f: 9903
+ f: 9914
+ f: 9978
+ f: 10061
+ f: 10192
+ f: 10213
+ f: 10345
+ f: 10369
+ f: 10404
+ f: 10430
+ f: 10471
+ f: 10481
+ f: 10489
+ f: 10492
+ f: 10494
+ f: 10524
+ f: 10554
+ f: 10557
+ f: 10560
+ f: 10562
+ f: 10641
+ f: 10716
+ f: 10842
+ f: 10897
+ f: 10967
+ f: 11053
+ f: 11128
+ f: 11137
+ f: 11328
+ f: 11336
+ f: 11401
+ f: 11532
+ f: 11573
+ f: 11860
+ f: 11880
+ f: 12013
+ f: 12305
+ f: 12358
+ f: 12386
+ f: 12404
+ f: 12456
+ f: 12456
+ f: 12476
+ f: 12615
+ f: 12677
+ f: 12981
+ f: 13094
+ f: 13197
+ f: 13708
+ f: 13717
+ f: 13788
+ f: 14049
+ f: 14112
+ f: 14224
+ f: 14257
+ f: 14681
+ f: 14901
+ f: 15006
+ f: 15071
+ f: 15100
+ f: 15248
+ f: 15669
+ f: 15877
+ f: 15953
+ f: 15953
+ f: 16066
+ f: 16072
+ f: 16271
+ f: 16292
+ f: 16386
+ f: 16490
+ f: 16633
+ f: 16670
+ f: 16834
+ f: 16896
+ f: 17543
+ f: 17693
+ f: 17800
+ f: 17859
+ f: 18397
+ f: 18811
+ f: 18826
+ f: 18971
+ f: 19304
+ f: 19319
+ f: 19695
+ f: 20378
+ f: 20865
+ f: 21313
+ f: 21330
+ f: 22321
+ f: 22760
+ f: 22770
+ f: 23783
+ f: 23785
+ f: 24525
+ f: 24844
+ f: 24848
+ f: 24964
+ f: 24966
+ f: 27468
+ f: 27478
+ f: 27555
+ f: 27555
+ f: 28215
+ f: 28219
+ f: 28336
+ f: 28490
+ f: 30213
+ f: 30228
+ f: 30242
+ f: 34116
+ f: 43518
+ f: 43518
+ f: 43518
+ f: 43852
+ f: 43852
+ f: 43852
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_10/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_10/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_10/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_10/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_10/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_10/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_10/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_10/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_10/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_10/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_10/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_10/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/expand_dims/ExpandDims/dim"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/expand_dims/ExpandDims/dim"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/expand_dims/ExpandDims"
+ op: "ExpandDims"
+ input: "time_step_14"
+ input: "QNetwork/EncodingNetwork/lambda_11/expand_dims/ExpandDims/dim:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/expand_dims/ExpandDims"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/Bucketize"
+ op: "Bucketize"
+ input: "QNetwork/EncodingNetwork/lambda_11/expand_dims/ExpandDims:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "boundaries"
+ value {
+ list {
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 0
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 1
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 2
+ f: 3
+ f: 4
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/Bucketize"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/Cast"
+ op: "Cast"
+ input: "QNetwork/EncodingNetwork/lambda_11/Bucketize:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/Cast"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/truediv/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ }
+ float_val: 999
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/truediv/y"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/truediv"
+ op: "RealDiv"
+ input: "QNetwork/EncodingNetwork/lambda_11/Cast:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_11/truediv/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/truediv"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/Sqrt"
+ op: "Sqrt"
+ input: "QNetwork/EncodingNetwork/lambda_11/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/Sqrt"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/mul"
+ op: "Mul"
+ input: "QNetwork/EncodingNetwork/lambda_11/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_11/truediv:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/mul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/lambda_11/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda_11/truediv:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_11/Sqrt:y:0"
+ input: "QNetwork/EncodingNetwork/lambda_11/mul:z:0"
+ input: "QNetwork/EncodingNetwork/lambda_11/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 3
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/lambda_11/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/concatenate/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/concatenate/concat/axis"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/concatenate/concat"
+ op: "ConcatV2"
+ input: "QNetwork/EncodingNetwork/lambda/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_1/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_2/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_3/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_4/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_5/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_6/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_7/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_8/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_9/zeros_like:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_10/concat:output:0"
+ input: "QNetwork/EncodingNetwork/lambda_11/concat:output:0"
+ input: "QNetwork/EncodingNetwork/concatenate/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 12
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 34
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/concatenate/concat"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/flatten/Const"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 2
+ }
+ }
+ tensor_content: "\377\377\377\377\"\000\000\000"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/flatten/Const"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/flatten/Reshape"
+ op: "Reshape"
+ input: "QNetwork/EncodingNetwork/concatenate/concat:output:0"
+ input: "QNetwork/EncodingNetwork/flatten/Const:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 34
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/flatten/Reshape"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense/MatMul/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "qnetwork_encodingnetwork_dense_matmul_readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 34
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense/MatMul/ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense/MatMul"
+ op: "MatMul"
+ input: "QNetwork/EncodingNetwork/flatten/Reshape:output:0"
+ input: "QNetwork/EncodingNetwork/dense/MatMul/ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense/MatMul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense/BiasAdd/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "qnetwork_encodingnetwork_dense_biasadd_readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense/BiasAdd/ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense/BiasAdd"
+ op: "BiasAdd"
+ input: "QNetwork/EncodingNetwork/dense/MatMul:product:0"
+ input: "QNetwork/EncodingNetwork/dense/BiasAdd/ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense/BiasAdd"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense/Relu"
+ op: "Relu"
+ input: "QNetwork/EncodingNetwork/dense/BiasAdd:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense/Relu"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense_1/MatMul/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "qnetwork_encodingnetwork_dense_1_matmul_readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 100
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense_1/MatMul/ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense_1/MatMul"
+ op: "MatMul"
+ input: "QNetwork/EncodingNetwork/dense/Relu:activations:0"
+ input: "QNetwork/EncodingNetwork/dense_1/MatMul/ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense_1/MatMul"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense_1/BiasAdd/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "qnetwork_encodingnetwork_dense_1_biasadd_readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense_1/BiasAdd/ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense_1/BiasAdd"
+ op: "BiasAdd"
+ input: "QNetwork/EncodingNetwork/dense_1/MatMul:product:0"
+ input: "QNetwork/EncodingNetwork/dense_1/BiasAdd/ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense_1/BiasAdd"
+ }
+ }
+ node_def {
+ name: "QNetwork/EncodingNetwork/dense_1/Relu"
+ op: "Relu"
+ input: "QNetwork/EncodingNetwork/dense_1/BiasAdd:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/EncodingNetwork/dense_1/Relu"
+ }
+ }
+ node_def {
+ name: "QNetwork/dense_2/MatMul/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "qnetwork_dense_2_matmul_readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 40
+ }
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/dense_2/MatMul/ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "QNetwork/dense_2/MatMul"
+ op: "MatMul"
+ input: "QNetwork/EncodingNetwork/dense_1/Relu:activations:0"
+ input: "QNetwork/dense_2/MatMul/ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/dense_2/MatMul"
+ }
+ }
+ node_def {
+ name: "QNetwork/dense_2/BiasAdd/ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "qnetwork_dense_2_biasadd_readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/dense_2/BiasAdd/ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "QNetwork/dense_2/BiasAdd"
+ op: "BiasAdd"
+ input: "QNetwork/dense_2/MatMul:product:0"
+ input: "QNetwork/dense_2/BiasAdd/ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "QNetwork/dense_2/BiasAdd"
+ }
+ }
+ node_def {
+ name: "ShiftedCategorical_1/mode/ArgMax/dimension"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: -1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ShiftedCategorical_1/mode/ArgMax/dimension"
+ }
+ }
+ node_def {
+ name: "ShiftedCategorical_1/mode/ArgMax"
+ op: "ArgMax"
+ input: "QNetwork/dense_2/BiasAdd:output:0"
+ input: "ShiftedCategorical_1/mode/ArgMax/dimension:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ShiftedCategorical_1/mode/ArgMax"
+ }
+ }
+ node_def {
+ name: "add/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT64
+ tensor_shape {
+ }
+ int64_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "add/y"
+ }
+ }
+ node_def {
+ name: "add"
+ op: "AddV2"
+ input: "ShiftedCategorical_1/mode/ArgMax:output:0"
+ input: "add/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "add"
+ }
+ }
+ node_def {
+ name: "Deterministic/atol"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT64
+ tensor_shape {
+ }
+ int64_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic/atol"
+ }
+ }
+ node_def {
+ name: "Deterministic/rtol"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT64
+ tensor_shape {
+ }
+ int64_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic/rtol"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/sample_shape/x"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_FLOAT
+ tensor_shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/sample_shape/x"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/sample_shape"
+ op: "Cast"
+ input: "Deterministic_1/sample/sample_shape/x:output:0"
+ attr {
+ key: "DstT"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "SrcT"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/sample_shape"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/Shape"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ int_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/Shape"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/Shape_1"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/Shape_1"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/Shape_2"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/Shape_2"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/BroadcastArgs"
+ op: "BroadcastArgs"
+ input: "Deterministic_1/sample/Shape_1:output:0"
+ input: "Deterministic_1/sample/Shape_2:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/BroadcastArgs"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/BroadcastArgs_1"
+ op: "BroadcastArgs"
+ input: "Deterministic_1/sample/Shape:output:0"
+ input: "Deterministic_1/sample/BroadcastArgs:r0:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/BroadcastArgs_1"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/Const"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/Const"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/concat/values_0"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ int_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/concat/values_0"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/concat/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/concat/axis"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/concat"
+ op: "ConcatV2"
+ input: "Deterministic_1/sample/concat/values_0:output:0"
+ input: "Deterministic_1/sample/BroadcastArgs_1:r0:0"
+ input: "Deterministic_1/sample/Const:output:0"
+ input: "Deterministic_1/sample/concat/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 3
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/concat"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/BroadcastTo"
+ op: "BroadcastTo"
+ input: "add:z:0"
+ input: "Deterministic_1/sample/concat:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/BroadcastTo"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/Shape_3"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 2
+ }
+ }
+ tensor_content: "\001\000\000\000\001\000\000\000"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/Shape_3"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/strided_slice/stack"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ int_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/strided_slice/stack"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/strided_slice/stack_1"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ int_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/strided_slice/stack_1"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/strided_slice/stack_2"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ int_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/strided_slice/stack_2"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/strided_slice"
+ op: "StridedSlice"
+ input: "Deterministic_1/sample/Shape_3:output:0"
+ input: "Deterministic_1/sample/strided_slice/stack:output:0"
+ input: "Deterministic_1/sample/strided_slice/stack_1:output:0"
+ input: "Deterministic_1/sample/strided_slice/stack_2:output:0"
+ attr {
+ key: "Index"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "end_mask"
+ value {
+ i: 1
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/strided_slice"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/concat_1/axis"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/concat_1/axis"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/concat_1"
+ op: "ConcatV2"
+ input: "Deterministic_1/sample/sample_shape:y:0"
+ input: "Deterministic_1/sample/strided_slice:output:0"
+ input: "Deterministic_1/sample/concat_1/axis:output:0"
+ attr {
+ key: "N"
+ value {
+ i: 2
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/concat_1"
+ }
+ }
+ node_def {
+ name: "Deterministic_1/sample/Reshape"
+ op: "Reshape"
+ input: "Deterministic_1/sample/BroadcastTo:output:0"
+ input: "Deterministic_1/sample/concat_1:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Deterministic_1/sample/Reshape"
+ }
+ }
+ node_def {
+ name: "clip_by_value/Minimum/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT64
+ tensor_shape {
+ }
+ int64_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "clip_by_value/Minimum/y"
+ }
+ }
+ node_def {
+ name: "clip_by_value/Minimum"
+ op: "Minimum"
+ input: "Deterministic_1/sample/Reshape:output:0"
+ input: "clip_by_value/Minimum/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "clip_by_value/Minimum"
+ }
+ }
+ node_def {
+ name: "clip_by_value/y"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT64
+ tensor_shape {
+ }
+ int64_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "clip_by_value/y"
+ }
+ }
+ node_def {
+ name: "clip_by_value"
+ op: "Maximum"
+ input: "clip_by_value/Minimum:z:0"
+ input: "clip_by_value/y:output:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "clip_by_value"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "clip_by_value:z:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 9
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 10
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 11
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 12
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 13
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 14
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 15
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 16
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 17
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 18
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 19
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 20
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_signature_wrapper_4619033"
+ }
+ node_def {
+ name: "PartitionedCall"
+ op: "PartitionedCall"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_function_with_signature_4619029"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "PartitionedCall"
+ }
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference__traced_save_4619143"
+ input_arg {
+ name: "file_prefix"
+ type: DT_STRING
+ }
+ input_arg {
+ name: "savev2_train_step_read_readvariableop"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "savev2_qnetwork_encodingnetwork_dense_kernel_read_readvariableop"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "savev2_qnetwork_encodingnetwork_dense_bias_read_readvariableop"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "savev2_qnetwork_encodingnetwork_dense_1_kernel_read_readvariableop"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "savev2_qnetwork_encodingnetwork_dense_1_bias_read_readvariableop"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "savev2_qnetwork_dense_2_kernel_read_readvariableop"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "savev2_qnetwork_dense_2_bias_read_readvariableop"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "savev2_1_const"
+ type: DT_STRING
+ }
+ output_arg {
+ name: "identity_1"
+ type: DT_STRING
+ }
+ is_stateful: true
+ control_output: "MergeV2Checkpoints"
+ control_output: "SaveV2"
+ control_output: "SaveV2_1"
+ }
+ node_def {
+ name: "StaticRegexFullMatch"
+ op: "StaticRegexFullMatch"
+ input: "file_prefix"
+ device: "/device:CPU:*"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "pattern"
+ value {
+ s: "^s3://.*"
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StaticRegexFullMatch"
+ }
+ }
+ node_def {
+ name: "Const"
+ op: "Const"
+ device: "/device:CPU:*"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ }
+ string_val: ".part"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Const"
+ }
+ }
+ node_def {
+ name: "Const_1"
+ op: "Const"
+ device: "/device:CPU:*"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ }
+ string_val: "_temp_f4c8d2e64931472295be68a11e57e937/part"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Const_1"
+ }
+ }
+ node_def {
+ name: "Select"
+ op: "Select"
+ input: "StaticRegexFullMatch:output:0"
+ input: "Const:output:0"
+ input: "Const_1:output:0"
+ device: "/device:CPU:*"
+ attr {
+ key: "T"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Select"
+ }
+ }
+ node_def {
+ name: "StringJoin"
+ op: "StringJoin"
+ input: "file_prefix"
+ input: "Select:output:0"
+ device: "/device:CPU:*"
+ attr {
+ key: "N"
+ value {
+ i: 2
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StringJoin"
+ }
+ }
+ node_def {
+ name: "num_shards"
+ op: "Const"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 2
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "num_shards"
+ }
+ }
+ node_def {
+ name: "ShardedFilename/shard"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 0
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ShardedFilename/shard"
+ }
+ }
+ node_def {
+ name: "ShardedFilename"
+ op: "ShardedFilename"
+ input: "StringJoin:output:0"
+ input: "ShardedFilename/shard:output:0"
+ input: "num_shards:output:0"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ShardedFilename"
+ }
+ }
+ node_def {
+ name: "SaveV2/tensor_names"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 7
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 7
+ }
+ }
+ string_val: "train_step/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/0/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/1/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/2/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/3/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/4/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/5/.ATTRIBUTES/VARIABLE_VALUE"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "SaveV2/tensor_names"
+ }
+ }
+ node_def {
+ name: "SaveV2/shape_and_slices"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 7
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 7
+ }
+ }
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "SaveV2/shape_and_slices"
+ }
+ }
+ node_def {
+ name: "SaveV2"
+ op: "SaveV2"
+ input: "ShardedFilename:filename:0"
+ input: "SaveV2/tensor_names:output:0"
+ input: "SaveV2/shape_and_slices:output:0"
+ input: "savev2_train_step_read_readvariableop"
+ input: "savev2_qnetwork_encodingnetwork_dense_kernel_read_readvariableop"
+ input: "savev2_qnetwork_encodingnetwork_dense_bias_read_readvariableop"
+ input: "savev2_qnetwork_encodingnetwork_dense_1_kernel_read_readvariableop"
+ input: "savev2_qnetwork_encodingnetwork_dense_1_bias_read_readvariableop"
+ input: "savev2_qnetwork_dense_2_kernel_read_readvariableop"
+ input: "savev2_qnetwork_dense_2_bias_read_readvariableop"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtypes"
+ value {
+ list {
+ type: DT_INT64
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "SaveV2"
+ }
+ }
+ node_def {
+ name: "ShardedFilename_1/shard"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT32
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_INT32
+ tensor_shape {
+ }
+ int_val: 1
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ShardedFilename_1/shard"
+ }
+ }
+ node_def {
+ name: "ShardedFilename_1"
+ op: "ShardedFilename"
+ input: "StringJoin:output:0"
+ input: "ShardedFilename_1/shard:output:0"
+ input: "num_shards:output:0"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ShardedFilename_1"
+ }
+ }
+ node_def {
+ name: "SaveV2_1/tensor_names"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ string_val: "_CHECKPOINTABLE_OBJECT_GRAPH"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "SaveV2_1/tensor_names"
+ }
+ }
+ node_def {
+ name: "SaveV2_1/shape_and_slices"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ string_val: ""
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "SaveV2_1/shape_and_slices"
+ }
+ }
+ node_def {
+ name: "SaveV2_1"
+ op: "SaveV2"
+ input: "ShardedFilename_1:filename:0"
+ input: "SaveV2_1/tensor_names:output:0"
+ input: "SaveV2_1/shape_and_slices:output:0"
+ input: "savev2_1_const"
+ input: "^SaveV2"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtypes"
+ value {
+ list {
+ type: DT_STRING
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "SaveV2_1"
+ }
+ }
+ node_def {
+ name: "MergeV2Checkpoints/checkpoint_prefixes"
+ op: "Pack"
+ input: "ShardedFilename:filename:0"
+ input: "ShardedFilename_1:filename:0"
+ input: "^SaveV2"
+ input: "^SaveV2_1"
+ device: "/device:CPU:0"
+ attr {
+ key: "N"
+ value {
+ i: 2
+ }
+ }
+ attr {
+ key: "T"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "MergeV2Checkpoints/checkpoint_prefixes"
+ }
+ }
+ node_def {
+ name: "MergeV2Checkpoints"
+ op: "MergeV2Checkpoints"
+ input: "MergeV2Checkpoints/checkpoint_prefixes:output:0"
+ input: "file_prefix"
+ input: "^SaveV2_1"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "MergeV2Checkpoints"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "file_prefix"
+ input: "^MergeV2Checkpoints"
+ device: "/device:CPU:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ node_def {
+ name: "Identity_1"
+ op: "Identity"
+ input: "Identity:output:0"
+ input: "^MergeV2Checkpoints"
+ input: "^SaveV2"
+ input: "^SaveV2_1"
+ attr {
+ key: "T"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_1"
+ }
+ }
+ ret {
+ key: "identity_1"
+ value: "Identity_1:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ }
+ shape {
+ }
+ shape {
+ dim {
+ size: 34
+ }
+ dim {
+ size: 100
+ }
+ }
+ shape {
+ dim {
+ size: 100
+ }
+ }
+ shape {
+ dim {
+ size: 100
+ }
+ dim {
+ size: 40
+ }
+ }
+ shape {
+ dim {
+ size: 40
+ }
+ }
+ shape {
+ dim {
+ size: 40
+ }
+ dim {
+ size: 2
+ }
+ }
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ shape {
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "MergeV2Checkpoints"
+ value: "MergeV2Checkpoints"
+ }
+ control_ret {
+ key: "SaveV2"
+ value: "SaveV2"
+ }
+ control_ret {
+ key: "SaveV2_1"
+ value: "SaveV2_1"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "file_prefix"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 34
+ }
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 100
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 100
+ }
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 40
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 40
+ }
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_function_722"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_signature_wrapper_4619026"
+ input_arg {
+ name: "callee_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callee_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callee_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callsite_height"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "cost_estimate"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "discount"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "edge_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "inlining_default"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "node_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "nr_ctant_params"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "reward"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "step_type"
+ type: DT_INT32
+ }
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_0"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_1"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_2"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_3"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_4"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "step_type"
+ input: "reward"
+ input: "discount"
+ input: "callee_basic_block_count"
+ input: "callee_conditionally_executed_blocks"
+ input: "callee_users"
+ input: "caller_basic_block_count"
+ input: "caller_conditionally_executed_blocks"
+ input: "caller_users"
+ input: "callsite_height"
+ input: "cost_estimate"
+ input: "edge_count"
+ input: "inlining_default"
+ input: "node_count"
+ input: "nr_ctant_params"
+ input: "unknown"
+ input: "unknown_0"
+ input: "unknown_1"
+ input: "unknown_2"
+ input: "unknown_3"
+ input: "unknown_4"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_INT32
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 15
+ i: 16
+ i: 17
+ i: 18
+ i: 19
+ i: 20
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_function_with_signature_4618993"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callsite_height"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "cost_estimate"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "discount"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 9
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "edge_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 10
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "inlining_default"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 11
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "node_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 12
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "nr_ctant_params"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 13
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "reward"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 14
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "step_type"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 15
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 16
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 17
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 18
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 19
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 20
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_function_with_signature_4618993"
+ input_arg {
+ name: "step_type"
+ type: DT_INT32
+ }
+ input_arg {
+ name: "reward"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "discount"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "callee_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callee_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callee_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callsite_height"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "cost_estimate"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "edge_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "inlining_default"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "node_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "nr_ctant_params"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_0"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_1"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_2"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_3"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_4"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "step_type"
+ input: "reward"
+ input: "discount"
+ input: "callee_basic_block_count"
+ input: "callee_conditionally_executed_blocks"
+ input: "callee_users"
+ input: "caller_basic_block_count"
+ input: "caller_conditionally_executed_blocks"
+ input: "caller_users"
+ input: "callsite_height"
+ input: "cost_estimate"
+ input: "edge_count"
+ input: "inlining_default"
+ input: "node_count"
+ input: "nr_ctant_params"
+ input: "unknown"
+ input: "unknown_0"
+ input: "unknown_1"
+ input: "unknown_2"
+ input: "unknown_3"
+ input: "unknown_4"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_INT32
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 15
+ i: 16
+ i: 17
+ i: 18
+ i: 19
+ i: 20
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_polymorphic_action_fn_4618978"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "step_type"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "reward"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "discount"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 9
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callsite_height"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 10
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "cost_estimate"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 11
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "edge_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 12
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "inlining_default"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 13
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "node_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 14
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "nr_ctant_params"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 15
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 16
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 17
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 18
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 19
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 20
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_polymorphic_action_fn_4619080"
+ input_arg {
+ name: "time_step_step_type"
+ type: DT_INT32
+ }
+ input_arg {
+ name: "time_step_reward"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "time_step_discount"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "time_step_observation_callee_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_callee_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_callee_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_caller_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_caller_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_caller_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_callsite_height"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_cost_estimate"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_edge_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_inlining_default"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_node_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_observation_nr_ctant_params"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_0"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_1"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_2"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_3"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_4"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "time_step_step_type"
+ input: "time_step_reward"
+ input: "time_step_discount"
+ input: "time_step_observation_callee_basic_block_count"
+ input: "time_step_observation_callee_conditionally_executed_blocks"
+ input: "time_step_observation_callee_users"
+ input: "time_step_observation_caller_basic_block_count"
+ input: "time_step_observation_caller_conditionally_executed_blocks"
+ input: "time_step_observation_caller_users"
+ input: "time_step_observation_callsite_height"
+ input: "time_step_observation_cost_estimate"
+ input: "time_step_observation_edge_count"
+ input: "time_step_observation_inlining_default"
+ input: "time_step_observation_node_count"
+ input: "time_step_observation_nr_ctant_params"
+ input: "unknown"
+ input: "unknown_0"
+ input: "unknown_1"
+ input: "unknown_2"
+ input: "unknown_3"
+ input: "unknown_4"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_INT32
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 15
+ i: 16
+ i: 17
+ i: 18
+ i: 19
+ i: 20
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_action_931"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/step_type"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/reward"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/discount"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/callee_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/callee_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/callee_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/caller_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/caller_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/caller_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 9
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/callsite_height"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 10
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/cost_estimate"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 11
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/edge_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 12
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/inlining_default"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 13
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/node_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 14
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step/observation/nr_ctant_params"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 15
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 16
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 17
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 18
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 19
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 20
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_function_with_signature_4619040"
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "unknown"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 0
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_<lambda>_728"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_polymorphic_action_fn_4618978"
+ input_arg {
+ name: "time_step"
+ type: DT_INT32
+ }
+ input_arg {
+ name: "time_step_1"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "time_step_2"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "time_step_3"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_4"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_5"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_6"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_7"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_8"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_9"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_10"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_11"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_12"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_13"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "time_step_14"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_0"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_1"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_2"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_3"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_4"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "time_step"
+ input: "time_step_1"
+ input: "time_step_2"
+ input: "time_step_3"
+ input: "time_step_4"
+ input: "time_step_5"
+ input: "time_step_6"
+ input: "time_step_7"
+ input: "time_step_8"
+ input: "time_step_9"
+ input: "time_step_10"
+ input: "time_step_11"
+ input: "time_step_12"
+ input: "time_step_13"
+ input: "time_step_14"
+ input: "unknown"
+ input: "unknown_0"
+ input: "unknown_1"
+ input: "unknown_2"
+ input: "unknown_3"
+ input: "unknown_4"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_INT32
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 15
+ i: 16
+ i: 17
+ i: 18
+ i: 19
+ i: 20
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_action_931"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 9
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 10
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 11
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 12
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 13
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 14
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "time_step"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 15
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 16
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 17
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 18
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 19
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 20
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_polymorphic_action_fn_946"
+ input_arg {
+ name: "step_type"
+ type: DT_INT32
+ }
+ input_arg {
+ name: "reward"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "discount"
+ type: DT_FLOAT
+ }
+ input_arg {
+ name: "callee_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callee_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callee_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_basic_block_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_conditionally_executed_blocks"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "caller_users"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "callsite_height"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "cost_estimate"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "edge_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "inlining_default"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "node_count"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "nr_ctant_params"
+ type: DT_INT64
+ }
+ input_arg {
+ name: "unknown"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_0"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_1"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_2"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_3"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "unknown_4"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ control_output: "StatefulPartitionedCall"
+ }
+ node_def {
+ name: "StatefulPartitionedCall"
+ op: "StatefulPartitionedCall"
+ input: "step_type"
+ input: "reward"
+ input: "discount"
+ input: "callee_basic_block_count"
+ input: "callee_conditionally_executed_blocks"
+ input: "callee_users"
+ input: "caller_basic_block_count"
+ input: "caller_conditionally_executed_blocks"
+ input: "caller_users"
+ input: "callsite_height"
+ input: "cost_estimate"
+ input: "edge_count"
+ input: "inlining_default"
+ input: "node_count"
+ input: "nr_ctant_params"
+ input: "unknown"
+ input: "unknown_0"
+ input: "unknown_1"
+ input: "unknown_2"
+ input: "unknown_3"
+ input: "unknown_4"
+ attr {
+ key: "Tin"
+ value {
+ list {
+ type: DT_INT32
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_INT64
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ type: DT_RESOURCE
+ }
+ }
+ }
+ attr {
+ key: "Tout"
+ value {
+ list {
+ type: DT_INT64
+ }
+ }
+ }
+ attr {
+ key: "_collective_manager_ids"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_read_only_resource_inputs"
+ value {
+ list {
+ i: 15
+ i: 16
+ i: 17
+ i: 18
+ i: 19
+ i: 20
+ }
+ }
+ }
+ attr {
+ key: "config_proto"
+ value {
+ s: "\n\007\n\003CPU\020\001\n\007\n\003GPU\020\0012\005*\0010J\0008\001"
+ }
+ }
+ attr {
+ key: "f"
+ value {
+ func {
+ name: "__inference_action_931"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "StatefulPartitionedCall"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "StatefulPartitionedCall:output:0"
+ input: "^StatefulPartitionedCall"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "StatefulPartitionedCall"
+ value: "StatefulPartitionedCall"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "step_type"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "reward"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "discount"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callee_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_basic_block_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_conditionally_executed_blocks"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 8
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "caller_users"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 9
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "callsite_height"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 10
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "cost_estimate"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 11
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "edge_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 12
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "inlining_default"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 13
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "node_count"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 14
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "nr_ctant_params"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 15
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 16
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 17
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 18
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 19
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 20
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference__traced_restore_4619176"
+ input_arg {
+ name: "file_prefix"
+ type: DT_STRING
+ }
+ input_arg {
+ name: "assignvariableop_train_step"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "assignvariableop_1_qnetwork_encodingnetwork_dense_kernel"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "assignvariableop_2_qnetwork_encodingnetwork_dense_bias"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "assignvariableop_3_qnetwork_encodingnetwork_dense_1_kernel"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "assignvariableop_4_qnetwork_encodingnetwork_dense_1_bias"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "assignvariableop_5_qnetwork_dense_2_kernel"
+ type: DT_RESOURCE
+ }
+ input_arg {
+ name: "assignvariableop_6_qnetwork_dense_2_bias"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity_8"
+ type: DT_STRING
+ }
+ is_stateful: true
+ control_output: "AssignVariableOp"
+ control_output: "AssignVariableOp_1"
+ control_output: "AssignVariableOp_2"
+ control_output: "AssignVariableOp_3"
+ control_output: "AssignVariableOp_4"
+ control_output: "AssignVariableOp_5"
+ control_output: "AssignVariableOp_6"
+ control_output: "RestoreV2"
+ control_output: "RestoreV2_1"
+ }
+ node_def {
+ name: "RestoreV2/tensor_names"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 7
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 7
+ }
+ }
+ string_val: "train_step/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/0/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/1/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/2/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/3/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/4/.ATTRIBUTES/VARIABLE_VALUE"
+ string_val: "model_variables/5/.ATTRIBUTES/VARIABLE_VALUE"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "RestoreV2/tensor_names"
+ }
+ }
+ node_def {
+ name: "RestoreV2/shape_and_slices"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 7
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 7
+ }
+ }
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ string_val: ""
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "RestoreV2/shape_and_slices"
+ }
+ }
+ node_def {
+ name: "RestoreV2"
+ op: "RestoreV2"
+ input: "file_prefix"
+ input: "RestoreV2/tensor_names:output:0"
+ input: "RestoreV2/shape_and_slices:output:0"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtypes"
+ value {
+ list {
+ type: DT_INT64
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ type: DT_FLOAT
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "RestoreV2"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "RestoreV2:tensors:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp"
+ op: "AssignVariableOp"
+ input: "assignvariableop_train_step"
+ input: "Identity:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp"
+ }
+ }
+ node_def {
+ name: "Identity_1"
+ op: "Identity"
+ input: "RestoreV2:tensors:1"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_1"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp_1"
+ op: "AssignVariableOp"
+ input: "assignvariableop_1_qnetwork_encodingnetwork_dense_kernel"
+ input: "Identity_1:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp_1"
+ }
+ }
+ node_def {
+ name: "Identity_2"
+ op: "Identity"
+ input: "RestoreV2:tensors:2"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_2"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp_2"
+ op: "AssignVariableOp"
+ input: "assignvariableop_2_qnetwork_encodingnetwork_dense_bias"
+ input: "Identity_2:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp_2"
+ }
+ }
+ node_def {
+ name: "Identity_3"
+ op: "Identity"
+ input: "RestoreV2:tensors:3"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_3"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp_3"
+ op: "AssignVariableOp"
+ input: "assignvariableop_3_qnetwork_encodingnetwork_dense_1_kernel"
+ input: "Identity_3:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp_3"
+ }
+ }
+ node_def {
+ name: "Identity_4"
+ op: "Identity"
+ input: "RestoreV2:tensors:4"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_4"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp_4"
+ op: "AssignVariableOp"
+ input: "assignvariableop_4_qnetwork_encodingnetwork_dense_1_bias"
+ input: "Identity_4:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp_4"
+ }
+ }
+ node_def {
+ name: "Identity_5"
+ op: "Identity"
+ input: "RestoreV2:tensors:5"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_5"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp_5"
+ op: "AssignVariableOp"
+ input: "assignvariableop_5_qnetwork_dense_2_kernel"
+ input: "Identity_5:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp_5"
+ }
+ }
+ node_def {
+ name: "Identity_6"
+ op: "Identity"
+ input: "RestoreV2:tensors:6"
+ attr {
+ key: "T"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_6"
+ }
+ }
+ node_def {
+ name: "AssignVariableOp_6"
+ op: "AssignVariableOp"
+ input: "assignvariableop_6_qnetwork_dense_2_bias"
+ input: "Identity_6:output:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_FLOAT
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "AssignVariableOp_6"
+ }
+ }
+ node_def {
+ name: "RestoreV2_1/tensor_names"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ string_val: "_CHECKPOINTABLE_OBJECT_GRAPH"
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "RestoreV2_1/tensor_names"
+ }
+ }
+ node_def {
+ name: "RestoreV2_1/shape_and_slices"
+ op: "Const"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "value"
+ value {
+ tensor {
+ dtype: DT_STRING
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ string_val: ""
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "RestoreV2_1/shape_and_slices"
+ }
+ }
+ node_def {
+ name: "RestoreV2_1"
+ op: "RestoreV2"
+ input: "file_prefix"
+ input: "RestoreV2_1/tensor_names:output:0"
+ input: "RestoreV2_1/shape_and_slices:output:0"
+ input: "^RestoreV2"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtypes"
+ value {
+ list {
+ type: DT_STRING
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "RestoreV2_1"
+ }
+ }
+ node_def {
+ name: "NoOp"
+ op: "NoOp"
+ device: "/device:CPU:0"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "NoOp"
+ }
+ }
+ node_def {
+ name: "Identity_7"
+ op: "Identity"
+ input: "file_prefix"
+ input: "^AssignVariableOp"
+ input: "^AssignVariableOp_1"
+ input: "^AssignVariableOp_2"
+ input: "^AssignVariableOp_3"
+ input: "^AssignVariableOp_4"
+ input: "^AssignVariableOp_5"
+ input: "^AssignVariableOp_6"
+ input: "^NoOp"
+ device: "/device:CPU:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_7"
+ }
+ }
+ node_def {
+ name: "Identity_8"
+ op: "Identity"
+ input: "Identity_7:output:0"
+ input: "^AssignVariableOp"
+ input: "^AssignVariableOp_1"
+ input: "^AssignVariableOp_2"
+ input: "^AssignVariableOp_3"
+ input: "^AssignVariableOp_4"
+ input: "^AssignVariableOp_5"
+ input: "^AssignVariableOp_6"
+ input: "^RestoreV2"
+ input: "^RestoreV2_1"
+ attr {
+ key: "T"
+ value {
+ type: DT_STRING
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity_8"
+ }
+ }
+ ret {
+ key: "identity_8"
+ value: "Identity_8:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ control_ret {
+ key: "AssignVariableOp"
+ value: "AssignVariableOp"
+ }
+ control_ret {
+ key: "AssignVariableOp_1"
+ value: "AssignVariableOp_1"
+ }
+ control_ret {
+ key: "AssignVariableOp_2"
+ value: "AssignVariableOp_2"
+ }
+ control_ret {
+ key: "AssignVariableOp_3"
+ value: "AssignVariableOp_3"
+ }
+ control_ret {
+ key: "AssignVariableOp_4"
+ value: "AssignVariableOp_4"
+ }
+ control_ret {
+ key: "AssignVariableOp_5"
+ value: "AssignVariableOp_5"
+ }
+ control_ret {
+ key: "AssignVariableOp_6"
+ value: "AssignVariableOp_6"
+ }
+ control_ret {
+ key: "RestoreV2"
+ value: "RestoreV2"
+ }
+ control_ret {
+ key: "RestoreV2_1"
+ value: "RestoreV2_1"
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "_user_specified_name"
+ value {
+ s: "file_prefix"
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 1
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 2
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 3
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 4
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 5
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 6
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 7
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ function {
+ signature {
+ name: "__inference_<lambda>_728"
+ input_arg {
+ name: "readvariableop_resource"
+ type: DT_RESOURCE
+ }
+ output_arg {
+ name: "identity"
+ type: DT_INT64
+ }
+ is_stateful: true
+ }
+ node_def {
+ name: "ReadVariableOp"
+ op: "ReadVariableOp"
+ input: "readvariableop_resource"
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ attr {
+ key: "dtype"
+ value {
+ type: DT_INT64
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "ReadVariableOp"
+ }
+ }
+ node_def {
+ name: "Identity"
+ op: "Identity"
+ input: "ReadVariableOp:value:0"
+ attr {
+ key: "T"
+ value {
+ type: DT_INT64
+ }
+ }
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ experimental_debug_info {
+ original_node_names: "Identity"
+ }
+ }
+ ret {
+ key: "identity"
+ value: "Identity:output:0"
+ }
+ attr {
+ key: "_input_shapes"
+ value {
+ list {
+ shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ arg_attr {
+ key: 0
+ value {
+ attr {
+ key: "_output_shapes"
+ value {
+ list {
+ shape {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ versions {
+ producer: 357
+ min_consumer: 12
+ }
+ }
+ saver_def {
+ filename_tensor_name: "saver_filename:0"
+ save_tensor_name: "StatefulPartitionedCall_2:0"
+ restore_op_name: "StatefulPartitionedCall_3"
+ version: V2
+ }
+ collection_def {
+ key: "saved_model_main_op"
+ value {
+ node_list {
+ value: "NoOp"
+ }
+ }
+ }
+ signature_def {
+ key: "__saved_model_init_op"
+ value {
+ outputs {
+ key: "__saved_model_init_op"
+ value {
+ name: "NoOp"
+ tensor_shape {
+ unknown_rank: true
+ }
+ }
+ }
+ }
+ }
+ signature_def {
+ key: "action"
+ value {
+ inputs {
+ key: "callee_basic_block_count"
+ value {
+ name: "action_callee_basic_block_count:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "callee_conditionally_executed_blocks"
+ value {
+ name: "action_callee_conditionally_executed_blocks:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "callee_users"
+ value {
+ name: "action_callee_users:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "caller_basic_block_count"
+ value {
+ name: "action_caller_basic_block_count:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "caller_conditionally_executed_blocks"
+ value {
+ name: "action_caller_conditionally_executed_blocks:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "caller_users"
+ value {
+ name: "action_caller_users:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "callsite_height"
+ value {
+ name: "action_callsite_height:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "cost_estimate"
+ value {
+ name: "action_cost_estimate:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "discount"
+ value {
+ name: "action_discount:0"
+ dtype: DT_FLOAT
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "edge_count"
+ value {
+ name: "action_edge_count:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "inlining_default"
+ value {
+ name: "action_inlining_default:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "node_count"
+ value {
+ name: "action_node_count:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "nr_ctant_params"
+ value {
+ name: "action_nr_ctant_params:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "reward"
+ value {
+ name: "action_reward:0"
+ dtype: DT_FLOAT
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ inputs {
+ key: "step_type"
+ value {
+ name: "action_step_type:0"
+ dtype: DT_INT32
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ outputs {
+ key: "inlining_decision"
+ value {
+ name: "StatefulPartitionedCall:0"
+ dtype: DT_INT64
+ tensor_shape {
+ dim {
+ size: 1
+ }
+ }
+ }
+ }
+ method_name: "tensorflow/serving/predict"
+ }
+ }
+ signature_def {
+ key: "get_initial_state"
+ value {
+ method_name: "tensorflow/serving/predict"
+ }
+ }
+ signature_def {
+ key: "get_train_step"
+ value {
+ outputs {
+ key: "int64"
+ value {
+ name: "StatefulPartitionedCall_1:0"
+ dtype: DT_INT64
+ tensor_shape {
+ }
+ }
+ }
+ method_name: "tensorflow/serving/predict"
+ }
+ }
+ object_graph_def {
+ nodes {
+ children {
+ node_id: 1
+ local_name: "_time_step_spec"
+ }
+ children {
+ node_id: 2
+ local_name: "_trajectory_spec"
+ }
+ children {
+ node_id: 3
+ local_name: "_wrapped_policy"
+ }
+ children {
+ node_id: 4
+ local_name: "train_step"
+ }
+ children {
+ node_id: 5
+ local_name: "model_variables"
+ }
+ children {
+ node_id: 6
+ local_name: "signatures"
+ }
+ children {
+ node_id: 210
+ local_name: "action"
+ }
+ children {
+ node_id: 211
+ local_name: "get_initial_state"
+ }
+ children {
+ node_id: 212
+ local_name: "get_train_step"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 7
+ local_name: "observation"
+ }
+ children {
+ node_id: 7
+ local_name: "3"
+ }
+ user_object {
+ identifier: "trackable_tuple_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 7
+ local_name: "observation"
+ }
+ children {
+ node_id: 7
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_tuple_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 8
+ local_name: "_q_network"
+ }
+ children {
+ node_id: 1
+ local_name: "_time_step_spec"
+ }
+ children {
+ node_id: 9
+ local_name: "_trajectory_spec"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_INT64
+ shape {
+ }
+ name: "train_step"
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ children {
+ node_id: 12
+ local_name: "2"
+ }
+ children {
+ node_id: 13
+ local_name: "3"
+ }
+ children {
+ node_id: 14
+ local_name: "4"
+ }
+ children {
+ node_id: 15
+ local_name: "5"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 213
+ local_name: "action"
+ }
+ children {
+ node_id: 214
+ local_name: "get_initial_state"
+ }
+ children {
+ node_id: 215
+ local_name: "get_train_step"
+ }
+ user_object {
+ identifier: "signature_map"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 16
+ local_name: "_input_tensor_spec"
+ }
+ children {
+ node_id: 17
+ local_name: "_encoder"
+ }
+ children {
+ node_id: 18
+ local_name: "_q_value_layer"
+ }
+ children {
+ node_id: 19
+ local_name: "variables"
+ }
+ children {
+ node_id: 20
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 21
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 22
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 216
+ local_name: "__call__"
+ }
+ children {
+ node_id: 217
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_network"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"QNetwork\", \"name\": \"QNetwork\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"config\": {\"layer was saved without config\": true}, \"is_graph_network\": false}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 7
+ local_name: "observation"
+ }
+ children {
+ node_id: 7
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_tuple_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_FLOAT
+ shape {
+ dim {
+ size: 34
+ }
+ dim {
+ size: 100
+ }
+ }
+ trainable: true
+ name: "QNetwork/EncodingNetwork/dense/kernel"
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_FLOAT
+ shape {
+ dim {
+ size: 100
+ }
+ }
+ trainable: true
+ name: "QNetwork/EncodingNetwork/dense/bias"
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_FLOAT
+ shape {
+ dim {
+ size: 100
+ }
+ dim {
+ size: 40
+ }
+ }
+ trainable: true
+ name: "QNetwork/EncodingNetwork/dense_1/kernel"
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_FLOAT
+ shape {
+ dim {
+ size: 40
+ }
+ }
+ trainable: true
+ name: "QNetwork/EncodingNetwork/dense_1/bias"
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_FLOAT
+ shape {
+ dim {
+ size: 40
+ }
+ dim {
+ size: 2
+ }
+ }
+ trainable: true
+ name: "QNetwork/dense_2/kernel"
+ }
+ }
+ nodes {
+ variable {
+ dtype: DT_FLOAT
+ shape {
+ dim {
+ size: 2
+ }
+ }
+ trainable: true
+ name: "QNetwork/dense_2/bias"
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 23
+ local_name: "_input_tensor_spec"
+ }
+ children {
+ node_id: 24
+ local_name: "_preprocessing_nest"
+ }
+ children {
+ node_id: 25
+ local_name: "_flat_preprocessing_layers"
+ }
+ children {
+ node_id: 26
+ local_name: "_preprocessing_combiner"
+ }
+ children {
+ node_id: 27
+ local_name: "_postprocessing_layers"
+ }
+ children {
+ node_id: 28
+ local_name: "variables"
+ }
+ children {
+ node_id: 29
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 30
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 31
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 218
+ local_name: "__call__"
+ }
+ children {
+ node_id: 219
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_network"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"EncodingNetwork\", \"name\": \"EncodingNetwork\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"config\": {\"layer was saved without config\": true}, \"is_graph_network\": false}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 14
+ local_name: "kernel"
+ }
+ children {
+ node_id: 15
+ local_name: "bias"
+ }
+ children {
+ node_id: 32
+ local_name: "variables"
+ }
+ children {
+ node_id: 33
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 34
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 35
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 220
+ local_name: "__call__"
+ }
+ children {
+ node_id: 221
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Dense\", \"name\": \"dense_2\", \"trainable\": true, \"expects_training_arg\": false, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"dense_2\", \"trainable\": true, \"dtype\": \"float32\", \"units\": 2, \"activation\": \"linear\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"RandomUniform\", \"config\": {\"minval\": -0.03, \"maxval\": 0.03, \"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Constant\", \"config\": {\"value\": -0.2, \"dtype\": \"float32\"}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"input_spec\": {\"class_name\": \"InputSpec\", \"config\": {\"dtype\": null, \"shape\": null, \"ndim\": null, \"max_ndim\": null, \"min_ndim\": 2, \"axes\": {\"-1\": 40}}}, \"build_input_shape\": {\"class_name\": \"TensorShape\", \"items\": [0, 40]}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ children {
+ node_id: 12
+ local_name: "2"
+ }
+ children {
+ node_id: 13
+ local_name: "3"
+ }
+ children {
+ node_id: 14
+ local_name: "4"
+ }
+ children {
+ node_id: 15
+ local_name: "5"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ children {
+ node_id: 12
+ local_name: "2"
+ }
+ children {
+ node_id: 13
+ local_name: "3"
+ }
+ children {
+ node_id: 14
+ local_name: "4"
+ }
+ children {
+ node_id: 15
+ local_name: "5"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 36
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 19
+ local_name: "variables"
+ }
+ children {
+ node_id: 37
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 38
+ local_name: "metrics"
+ }
+ children {
+ node_id: 39
+ local_name: "layers"
+ }
+ children {
+ node_id: 20
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 40
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 21
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 216
+ local_name: "__call__"
+ }
+ children {
+ node_id: 217
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 217
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 41
+ local_name: "0"
+ }
+ children {
+ node_id: 42
+ local_name: "1"
+ }
+ children {
+ node_id: 43
+ local_name: "2"
+ }
+ children {
+ node_id: 44
+ local_name: "3"
+ }
+ children {
+ node_id: 45
+ local_name: "4"
+ }
+ children {
+ node_id: 46
+ local_name: "5"
+ }
+ children {
+ node_id: 47
+ local_name: "6"
+ }
+ children {
+ node_id: 48
+ local_name: "7"
+ }
+ children {
+ node_id: 49
+ local_name: "8"
+ }
+ children {
+ node_id: 50
+ local_name: "9"
+ }
+ children {
+ node_id: 51
+ local_name: "10"
+ }
+ children {
+ node_id: 52
+ local_name: "11"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 53
+ local_name: "variables"
+ }
+ children {
+ node_id: 54
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 55
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 56
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 222
+ local_name: "__call__"
+ }
+ children {
+ node_id: 223
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Concatenate\", \"name\": \"concatenate\", \"trainable\": true, \"expects_training_arg\": false, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"concatenate\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}, \"build_input_shape\": [{\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 1]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}, {\"class_name\": \"TensorShape\", \"items\": [0, 3]}]}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 57
+ local_name: "0"
+ }
+ children {
+ node_id: 58
+ local_name: "1"
+ }
+ children {
+ node_id: 59
+ local_name: "2"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ children {
+ node_id: 12
+ local_name: "2"
+ }
+ children {
+ node_id: 13
+ local_name: "3"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ children {
+ node_id: 12
+ local_name: "2"
+ }
+ children {
+ node_id: 13
+ local_name: "3"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 60
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 28
+ local_name: "variables"
+ }
+ children {
+ node_id: 61
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 62
+ local_name: "metrics"
+ }
+ children {
+ node_id: 63
+ local_name: "layers"
+ }
+ children {
+ node_id: 29
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 64
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 30
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 218
+ local_name: "__call__"
+ }
+ children {
+ node_id: 219
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 219
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 14
+ local_name: "0"
+ }
+ children {
+ node_id: 15
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 14
+ local_name: "0"
+ }
+ children {
+ node_id: 15
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 65
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 32
+ local_name: "variables"
+ }
+ children {
+ node_id: 66
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 67
+ local_name: "metrics"
+ }
+ children {
+ node_id: 68
+ local_name: "layers"
+ }
+ children {
+ node_id: 33
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 69
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 34
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 220
+ local_name: "__call__"
+ }
+ children {
+ node_id: 221
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 221
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 17
+ local_name: "0"
+ }
+ children {
+ node_id: 18
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 70
+ local_name: "variables"
+ }
+ children {
+ node_id: 71
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 72
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 73
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 224
+ local_name: "__call__"
+ }
+ children {
+ node_id: 225
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 
1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 11.0, 12.0, 13.0, 14.0, 14.0, 14.0, 16.0, 17.0, 19.0, 23.0, 27.0, 39.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 74
+ local_name: "variables"
+ }
+ children {
+ node_id: 75
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 76
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 77
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 226
+ local_name: "__call__"
+ }
+ children {
+ node_id: 227
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_1\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_1\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 8.0, 8.0, 8.0, 8.0, 9.0, 10.0, 10.0, 10.0, 12.0, 12.0, 12.0, 14.0, 14.0, 18.0, 20.0, 23.0, 30.0, 41.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 78
+ local_name: "variables"
+ }
+ children {
+ node_id: 79
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 80
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 81
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 228
+ local_name: "__call__"
+ }
+ children {
+ node_id: 229
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_2\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_2\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 
4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 18.0, 18.0, 18.0, 18.0, 18.0, 19.0, 19.0, 19.0, 19.0, 19.0, 20.0, 20.0, 20.0, 20.0, 20.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 26.0, 26.0, 26.0, 27.0, 27.0, 27.0, 27.0, 28.0, 28.0, 29.0, 29.0, 29.0, 29.0, 30.0, 30.0, 31.0, 31.0, 31.0, 31.0, 32.0, 32.0, 33.0, 33.0, 33.0, 34.0, 34.0, 34.0, 34.0, 35.0, 35.0, 36.0, 36.0, 37.0, 37.0, 37.0, 38.0, 38.0, 39.0, 39.0, 40.0, 40.0, 41.0, 41.0, 41.0, 42.0, 43.0, 43.0, 44.0, 44.0, 45.0, 45.0, 46.0, 46.0, 46.0, 47.0, 47.0, 48.0, 49.0, 49.0, 50.0, 50.0, 51.0, 52.0, 53.0, 53.0, 54.0, 55.0, 56.0, 57.0, 57.0, 58.0, 59.0, 60.0, 61.0, 61.0, 63.0, 63.0, 64.0, 65.0, 66.0, 67.0, 67.0, 69.0, 70.0, 71.0, 72.0, 73.0, 74.0, 75.0, 77.0, 78.0, 79.0, 80.0, 81.0, 82.0, 83.0, 85.0, 86.0, 88.0, 89.0, 91.0, 92.0, 94.0, 96.0, 97.0, 99.0, 100.0, 101.0, 103.0, 105.0, 107.0, 109.0, 111.0, 113.0, 115.0, 118.0, 121.0, 123.0, 126.0, 128.0, 130.0, 133.0, 135.0, 137.0, 140.0, 143.0, 146.0, 148.0, 151.0, 154.0, 157.0, 161.0, 163.0, 166.0, 169.0, 173.0, 178.0, 183.0, 189.0, 193.0, 197.0, 202.0, 208.0, 213.0, 218.0, 223.0, 228.0, 233.0, 239.0, 245.0, 250.0, 257.0, 262.0, 269.0, 277.0, 284.0, 292.0, 300.0, 308.0, 319.0, 329.0, 340.0, 349.0, 359.0, 371.0, 382.0, 394.0, 410.0, 423.0, 435.0, 445.0, 462.0, 480.0, 492.0, 506.0, 519.0, 536.0, 557.0, 577.0, 598.0, 622.0, 655.0, 679.0, 707.0, 733.0, 751.0, 787.0, 814.0, 847.0, 897.0, 934.0, 997.0, 1062.0, 1111.0, 1181.0, 1275.0, 1385.0, 1465.0, 1603.0, 1769.0, 2057.0, 2257.0, 2803.0, 3468.0, 4417.0, 6538.0, 16126.0, 23446.0, 33536.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 82
+ local_name: "variables"
+ }
+ children {
+ node_id: 83
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 84
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 85
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 230
+ local_name: "__call__"
+ }
+ children {
+ node_id: 231
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_3\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_3\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 
3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 20.0, 20.0, 20.0, 20.0, 20.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 25.0, 25.0, 25.0, 25.0, 25.0, 26.0, 26.0, 26.0, 26.0, 27.0, 27.0, 27.0, 27.0, 27.0, 28.0, 28.0, 28.0, 29.0, 29.0, 29.0, 29.0, 30.0, 30.0, 30.0, 31.0, 31.0, 31.0, 32.0, 32.0, 32.0, 33.0, 33.0, 33.0, 34.0, 34.0, 34.0, 34.0, 35.0, 35.0, 35.0, 36.0, 36.0, 36.0, 37.0, 37.0, 37.0, 38.0, 38.0, 38.0, 38.0, 39.0, 39.0, 40.0, 40.0, 41.0, 41.0, 42.0, 43.0, 43.0, 44.0, 45.0, 45.0, 46.0, 47.0, 47.0, 48.0, 49.0, 49.0, 50.0, 50.0, 52.0, 52.0, 53.0, 54.0, 55.0, 55.0, 57.0, 58.0, 59.0, 60.0, 62.0, 64.0, 65.0, 66.0, 68.0, 70.0, 70.0, 70.0, 70.0, 70.0, 71.0, 73.0, 75.0, 76.0, 78.0, 81.0, 84.0, 86.0, 90.0, 94.0, 98.0, 101.0, 106.0, 111.0, 117.0, 123.0, 130.0, 138.0, 146.0, 157.0, 163.0, 176.0, 187.0, 198.0, 214.0, 227.0, 252.0, 280.0, 327.0, 395.0, 506.0, 671.0, 1025.0, 1971.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 86
+ local_name: "variables"
+ }
+ children {
+ node_id: 87
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 88
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 89
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 232
+ local_name: "__call__"
+ }
+ children {
+ node_id: 233
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_4\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_4\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 
2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 13.0, 13.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 19.0, 19.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 21.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 24.0, 25.0, 26.0, 26.0, 26.0, 26.0, 26.0, 26.0, 26.0, 26.0, 26.0, 26.0, 27.0, 28.0, 28.0, 28.0, 28.0, 28.0, 29.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 31.0, 32.0, 32.0, 32.0, 32.0, 32.0, 34.0, 34.0, 34.0, 34.0, 34.0, 34.0, 35.0, 36.0, 36.0, 36.0, 37.0, 38.0, 38.0, 38.0, 39.0, 40.0, 40.0, 41.0, 42.0, 42.0, 43.0, 44.0, 44.0, 46.0, 46.0, 47.0, 48.0, 48.0, 50.0, 50.0, 52.0, 52.0, 54.0, 55.0, 55.0, 56.0, 57.0, 58.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 62.0, 62.0, 64.0, 65.0, 66.0, 68.0, 70.0, 72.0, 74.0, 77.0, 80.0, 82.0, 86.0, 89.0, 92.0, 96.0, 99.0, 104.0, 108.0, 114.0, 119.0, 125.0, 131.0, 139.0, 146.0, 157.0, 167.0, 176.0, 188.0, 198.0, 215.0, 236.0, 262.0, 306.0, 376.0, 462.0, 596.0, 942.0, 1428.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 90
+ local_name: "variables"
+ }
+ children {
+ node_id: 91
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 92
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 93
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 234
+ local_name: "__call__"
+ }
+ children {
+ node_id: 235
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_5\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_5\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 
2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 11.0, 11.0, 12.0, 13.0, 14.0, 15.0, 16.0, 18.0, 20.0, 23.0, 29.0, 38.0, 60.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 94
+ local_name: "variables"
+ }
+ children {
+ node_id: 95
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 96
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 97
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 236
+ local_name: "__call__"
+ }
+ children {
+ node_id: 237
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_6\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_6\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 3.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 4.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 
6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 6.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 7.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 8.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 9.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 11.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 12.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 13.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 14.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 16.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 17.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 18.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 19.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 21.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 22.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 23.0, 24.0, 24.0, 24.0, 24.0, 24.0, 25.0, 25.0, 25.0, 25.0, 25.0, 26.0, 26.0, 26.0, 26.0, 27.0, 27.0, 27.0, 28.0, 28.0, 28.0, 29.0, 29.0, 30.0, 30.0, 30.0, 31.0, 31.0, 32.0, 32.0, 33.0, 33.0, 34.0, 35.0, 37.0, 38.0, 40.0, 46.0, 51.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 98
+ local_name: "variables"
+ }
+ children {
+ node_id: 99
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 100
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 101
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 238
+ local_name: "__call__"
+ }
+ children {
+ node_id: 239
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_7\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_7\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [-15035.0, -15030.0, -15025.0, -15000.0, -14985.0, -14945.0, -14745.0, -70.0, -55.0, -55.0, -50.0, -50.0, -50.0, -45.0, -45.0, -45.0, -45.0, -45.0, -45.0, -45.0, -45.0, -45.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -40.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, 
-35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -35.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -30.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -25.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -20.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -15.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -10.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, -5.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 5.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 10.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 15.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 20.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 25.0, 30.0, 30.0, 30.0, 30.0, 30.0, 30.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 35.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 40.0, 
40.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 45.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 50.0, 55.0, 55.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 60.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 65.0, 70.0, 70.0, 70.0, 70.0, 70.0, 70.0, 70.0, 75.0, 75.0, 80.0, 80.0, 80.0, 85.0, 85.0, 85.0, 90.0, 90.0, 90.0, 90.0, 95.0, 95.0, 100.0, 100.0, 105.0, 110.0, 115.0, 120.0, 125.0, 125.0, 130.0, 140.0, 140.0, 145.0, 150.0, 155.0, 160.0, 160.0, 165.0, 170.0, 175.0, 180.0, 190.0, 200.0, 210.0, 215.0, 220.0, 220.0, 230.0, 235.0, 245.0, 250.0, 260.0, 275.0, 290.0, 305.0, 325.0, 350.0, 370.0, 390.0, 425.0, 460.0, 500.0, 560.0, 650.0, 790.0, 1025.0, 1600.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 102
+ local_name: "variables"
+ }
+ children {
+ node_id: 103
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 104
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 105
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 240
+ local_name: "__call__"
+ }
+ children {
+ node_id: 241
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_8\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_8\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [18.0, 29.0, 39.0, 48.0, 57.0, 64.0, 70.0, 76.0, 82.0, 87.0, 92.0, 97.0, 101.0, 105.0, 109.0, 113.0, 116.0, 120.0, 123.0, 127.0, 130.0, 134.0, 137.0, 140.0, 143.0, 146.0, 149.0, 152.0, 156.0, 159.0, 162.0, 165.0, 168.0, 171.0, 174.0, 177.0, 180.0, 183.0, 186.0, 188.0, 191.0, 194.0, 197.0, 200.0, 203.0, 205.0, 208.0, 211.0, 214.0, 217.0, 219.0, 222.0, 225.0, 228.0, 231.0, 233.0, 236.0, 239.0, 242.0, 244.0, 247.0, 250.0, 253.0, 255.0, 258.0, 261.0, 264.0, 266.0, 269.0, 272.0, 275.0, 278.0, 280.0, 283.0, 286.0, 289.0, 292.0, 294.0, 297.0, 300.0, 303.0, 305.0, 308.0, 311.0, 314.0, 317.0, 319.0, 322.0, 325.0, 327.0, 330.0, 333.0, 336.0, 339.0, 341.0, 344.0, 347.0, 350.0, 353.0, 355.0, 358.0, 361.0, 364.0, 367.0, 370.0, 373.0, 375.0, 378.0, 381.0, 384.0, 387.0, 390.0, 393.0, 396.0, 399.0, 401.0, 404.0, 407.0, 410.0, 413.0, 416.0, 419.0, 422.0, 425.0, 428.0, 431.0, 434.0, 437.0, 440.0, 443.0, 446.0, 449.0, 452.0, 455.0, 458.0, 461.0, 464.0, 467.0, 470.0, 473.0, 476.0, 479.0, 483.0, 486.0, 489.0, 492.0, 495.0, 498.0, 501.0, 504.0, 507.0, 511.0, 514.0, 517.0, 520.0, 523.0, 526.0, 530.0, 533.0, 536.0, 539.0, 542.0, 545.0, 549.0, 552.0, 555.0, 558.0, 562.0, 565.0, 569.0, 572.0, 575.0, 579.0, 582.0, 585.0, 589.0, 592.0, 595.0, 599.0, 602.0, 605.0, 609.0, 612.0, 616.0, 620.0, 623.0, 626.0, 630.0, 634.0, 637.0, 641.0, 644.0, 648.0, 651.0, 655.0, 658.0, 662.0, 665.0, 669.0, 672.0, 676.0, 680.0, 683.0, 687.0, 691.0, 694.0, 698.0, 702.0, 705.0, 709.0, 712.0, 716.0, 720.0, 724.0, 727.0, 731.0, 735.0, 739.0, 742.0, 746.0, 750.0, 754.0, 758.0, 761.0, 765.0, 769.0, 773.0, 777.0, 780.0, 784.0, 788.0, 792.0, 796.0, 800.0, 804.0, 808.0, 812.0, 816.0, 820.0, 823.0, 828.0, 832.0, 836.0, 840.0, 844.0, 848.0, 852.0, 856.0, 860.0, 864.0, 868.0, 873.0, 877.0, 881.0, 885.0, 889.0, 893.0, 897.0, 902.0, 906.0, 910.0, 914.0, 919.0, 923.0, 927.0, 931.0, 935.0, 940.0, 944.0, 948.0, 953.0, 957.0, 962.0, 966.0, 970.0, 975.0, 979.0, 984.0, 988.0, 993.0, 997.0, 1002.0, 1006.0, 1011.0, 1015.0, 1020.0, 1024.0, 1029.0, 1034.0, 1038.0, 1043.0, 1047.0, 1052.0, 1057.0, 1062.0, 1066.0, 1071.0, 1076.0, 1081.0, 1086.0, 1090.0, 1095.0, 1100.0, 1105.0, 1110.0, 1114.0, 1119.0, 1124.0, 1129.0, 1134.0, 1139.0, 1144.0, 1149.0, 1154.0, 1159.0, 1164.0, 1169.0, 1174.0, 1179.0, 1184.0, 1189.0, 1194.0, 1199.0, 1204.0, 1209.0, 
1215.0, 1220.0, 1225.0, 1230.0, 1235.0, 1241.0, 1246.0, 1251.0, 1257.0, 1262.0, 1267.0, 1273.0, 1278.0, 1284.0, 1289.0, 1294.0, 1300.0, 1305.0, 1311.0, 1316.0, 1322.0, 1327.0, 1333.0, 1338.0, 1344.0, 1350.0, 1355.0, 1361.0, 1367.0, 1372.0, 1378.0, 1383.0, 1389.0, 1395.0, 1401.0, 1407.0, 1413.0, 1418.0, 1424.0, 1430.0, 1436.0, 1442.0, 1448.0, 1454.0, 1459.0, 1465.0, 1472.0, 1477.0, 1483.0, 1489.0, 1495.0, 1501.0, 1507.0, 1514.0, 1520.0, 1526.0, 1532.0, 1538.0, 1545.0, 1551.0, 1557.0, 1564.0, 1570.0, 1576.0, 1583.0, 1589.0, 1596.0, 1602.0, 1608.0, 1615.0, 1621.0, 1628.0, 1634.0, 1641.0, 1647.0, 1654.0, 1661.0, 1667.0, 1674.0, 1681.0, 1687.0, 1694.0, 1701.0, 1708.0, 1715.0, 1722.0, 1729.0, 1735.0, 1742.0, 1749.0, 1756.0, 1763.0, 1770.0, 1777.0, 1784.0, 1791.0, 1798.0, 1806.0, 1812.0, 1820.0, 1827.0, 1835.0, 1841.0, 1849.0, 1856.0, 1863.0, 1871.0, 1878.0, 1885.0, 1893.0, 1901.0, 1908.0, 1915.0, 1923.0, 1930.0, 1938.0, 1946.0, 1953.0, 1961.0, 1969.0, 1976.0, 1984.0, 1992.0, 2000.0, 2007.0, 2015.0, 2023.0, 2031.0, 2039.0, 2047.0, 2055.0, 2063.0, 2071.0, 2079.0, 2087.0, 2095.0, 2104.0, 2112.0, 2120.0, 2128.0, 2137.0, 2146.0, 2154.0, 2162.0, 2171.0, 2179.0, 2188.0, 2197.0, 2205.0, 2214.0, 2223.0, 2232.0, 2241.0, 2250.0, 2258.0, 2268.0, 2277.0, 2285.0, 2294.0, 2304.0, 2313.0, 2322.0, 2331.0, 2340.0, 2350.0, 2359.0, 2368.0, 2378.0, 2388.0, 2397.0, 2407.0, 2416.0, 2426.0, 2436.0, 2446.0, 2455.0, 2465.0, 2475.0, 2485.0, 2495.0, 2505.0, 2515.0, 2525.0, 2535.0, 2545.0, 2556.0, 2566.0, 2577.0, 2587.0, 2598.0, 2609.0, 2620.0, 2631.0, 2641.0, 2652.0, 2663.0, 2674.0, 2685.0, 2696.0, 2708.0, 2719.0, 2730.0, 2742.0, 2753.0, 2764.0, 2776.0, 2788.0, 2799.0, 2811.0, 2823.0, 2835.0, 2847.0, 2858.0, 2870.0, 2882.0, 2894.0, 2906.0, 2919.0, 2931.0, 2943.0, 2956.0, 2968.0, 2981.0, 2994.0, 3006.0, 3019.0, 3032.0, 3045.0, 3058.0, 3070.0, 3083.0, 3096.0, 3109.0, 3121.0, 3134.0, 3148.0, 3161.0, 3174.0, 3187.0, 3200.0, 3214.0, 3228.0, 3242.0, 3255.0, 3268.0, 3283.0, 3297.0, 3310.0, 3325.0, 3340.0, 3353.0, 3368.0, 3383.0, 3398.0, 3412.0, 3427.0, 3442.0, 3457.0, 3471.0, 3487.0, 3502.0, 3516.0, 3531.0, 3546.0, 3561.0, 3577.0, 3593.0, 3608.0, 3625.0, 3641.0, 3657.0, 3673.0, 3690.0, 3706.0, 3722.0, 3738.0, 3755.0, 3772.0, 3789.0, 3805.0, 3823.0, 3839.0, 3856.0, 3873.0, 3891.0, 3908.0, 3926.0, 3944.0, 3960.0, 3977.0, 3995.0, 4013.0, 4031.0, 4048.0, 4067.0, 4085.0, 4104.0, 4122.0, 4140.0, 4159.0, 4177.0, 4196.0, 4215.0, 4234.0, 4253.0, 4272.0, 4291.0, 4311.0, 4332.0, 4351.0, 4371.0, 4391.0, 4412.0, 4433.0, 4454.0, 4474.0, 4496.0, 4518.0, 4538.0, 4558.0, 4579.0, 4601.0, 4619.0, 4640.0, 4662.0, 4684.0, 4706.0, 4728.0, 4751.0, 4771.0, 4794.0, 4818.0, 4840.0, 4863.0, 4887.0, 4910.0, 4933.0, 4956.0, 4980.0, 5004.0, 5028.0, 5052.0, 5076.0, 5100.0, 5125.0, 5152.0, 5175.0, 5200.0, 5226.0, 5251.0, 5278.0, 5304.0, 5329.0, 5354.0, 5381.0, 5407.0, 5433.0, 5460.0, 5488.0, 5516.0, 5544.0, 5573.0, 5600.0, 5628.0, 5656.0, 5684.0, 5713.0, 5741.0, 5771.0, 5799.0, 5830.0, 5860.0, 5891.0, 5921.0, 5951.0, 5980.0, 6010.0, 6041.0, 6073.0, 6105.0, 6133.0, 6163.0, 6195.0, 6227.0, 6258.0, 6291.0, 6322.0, 6356.0, 6390.0, 6424.0, 6457.0, 6491.0, 6527.0, 6561.0, 6596.0, 6631.0, 6665.0, 6701.0, 6736.0, 6771.0, 6805.0, 6840.0, 6877.0, 6911.0, 6947.0, 6985.0, 7022.0, 7059.0, 7097.0, 7135.0, 7174.0, 7212.0, 7251.0, 7289.0, 7327.0, 7366.0, 7406.0, 7447.0, 7486.0, 7525.0, 7566.0, 7606.0, 7646.0, 7688.0, 7728.0, 7771.0, 7814.0, 7859.0, 7901.0, 7949.0, 7992.0, 8036.0, 8082.0, 8127.0, 8173.0, 8218.0, 8262.0, 8309.0, 8353.0, 8397.0, 8444.0, 8489.0, 8539.0, 8585.0, 
8632.0, 8682.0, 8727.0, 8777.0, 8828.0, 8879.0, 8929.0, 8982.0, 9037.0, 9087.0, 9140.0, 9193.0, 9250.0, 9305.0, 9361.0, 9418.0, 9475.0, 9532.0, 9589.0, 9644.0, 9699.0, 9758.0, 9818.0, 9875.0, 9935.0, 9997.0, 10057.0, 10117.0, 10174.0, 10232.0, 10296.0, 10356.0, 10419.0, 10482.0, 10546.0, 10608.0, 10670.0, 10729.0, 10790.0, 10855.0, 10920.0, 10990.0, 11054.0, 11118.0, 11181.0, 11248.0, 11316.0, 11385.0, 11454.0, 11526.0, 11597.0, 11667.0, 11740.0, 11820.0, 11897.0, 11973.0, 12046.0, 12126.0, 12204.0, 12287.0, 12370.0, 12456.0, 12538.0, 12627.0, 12714.0, 12799.0, 12883.0, 12971.0, 13062.0, 13154.0, 13233.0, 13328.0, 13418.0, 13511.0, 13607.0, 13709.0, 13806.0, 13903.0, 14002.0, 14104.0, 14200.0, 14288.0, 14391.0, 14488.0, 14590.0, 14698.0, 14808.0, 14910.0, 15020.0, 15126.0, 15238.0, 15347.0, 15456.0, 15574.0, 15692.0, 15786.0, 15896.0, 16016.0, 16136.0, 16250.0, 16352.0, 16474.0, 16575.0, 16702.0, 16835.0, 16965.0, 17096.0, 17232.0, 17370.0, 17443.0, 17581.0, 17719.0, 17864.0, 17976.0, 18116.0, 18250.0, 18396.0, 18540.0, 18690.0, 18840.0, 18989.0, 19136.0, 19294.0, 19445.0, 19589.0, 19750.0, 19905.0, 20064.0, 20191.0, 20325.0, 20497.0, 20662.0, 20833.0, 20981.0, 21152.0, 21334.0, 21510.0, 21642.0, 21821.0, 22001.0, 22186.0, 22379.0, 22568.0, 22770.0, 22958.0, 23162.0, 23360.0, 23524.0, 23737.0, 23960.0, 24175.0, 24395.0, 24631.0, 24865.0, 25091.0, 25327.0, 25580.0, 25833.0, 26089.0, 26361.0, 26636.0, 26889.0, 27155.0, 27436.0, 27715.0, 28003.0, 28303.0, 28600.0, 28916.0, 29223.0, 29553.0, 29884.0, 30200.0, 30538.0, 30868.0, 31211.0, 31548.0, 31881.0, 32253.0, 32605.0, 32980.0, 33385.0, 33805.0, 34254.0, 34723.0, 35167.0, 35666.0, 36125.0, 36652.0, 37177.0, 37739.0, 38321.0, 38932.0, 39640.0, 40337.0, 41000.0, 41626.0, 42385.0, 43122.0, 43890.0, 44687.0, 45609.0, 46520.0, 47489.0, 48432.0, 49458.0, 50511.0, 51561.0, 52568.0, 53676.0, 54936.0, 56071.0, 57302.0, 58513.0, 59800.0, 61192.0, 62702.0, 64205.0, 65868.0, 67780.0, 69960.0, 72330.0, 74918.0, 77540.0, 80344.0, 83727.0, 87662.0, 93589.0, 101441.0, 110544.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 106
+ local_name: "variables"
+ }
+ children {
+ node_id: 107
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 108
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 109
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 242
+ local_name: "__call__"
+ }
+ children {
+ node_id: 243
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_9\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_9\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAgAAAAQAAAATAAAAcxgAAACIAHwAgwF9AXQAagF8AXQAagJkAY0CUwApAk4pAdoF\\nZHR5cGUpA9oCdGbaCnplcm9zX2xpa2XaB2Zsb2F0MzIpAtoDb2Jz2gxleHBhbmRlZF9vYnMpAdoO\\nZXhwYW5kX2RpbXNfb3CpAPr0L2V4cG9ydC9oZGEzL2JvcmdsZXQvbG9jYWxfcmFtX2ZzX2RpcnMv\\nMC55dW5kaV9tdXBwZXRfMF8xMjI3MDgzMy4xMy55dW5kaS4xOTQ3MzE0MTc5NjEuOGY0ZjlmOThj\\nYjdhMzA1NS9idWlsZF90YXJnZXRfdHJhaW5fcGFyX2Q5NzU3NTM3MDE2YTJlYjgvdHJhaW4ucGFy\\nL2dvb2dsZTMvbGVhcm5pbmcvc21hcnRjaG9pY2VzL3Jlc2VhcmNoL2NsaWVudHMvY29tcGlsZXJf\\nb3B0L3BvbGljeV90cmFpbmluZy9mZWF0dXJlX29wcy5wedoPZGlzY2FyZF9mZWF0dXJlJwAAAHME\\nAAAAAAEIAQ==\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 110
+ local_name: "variables"
+ }
+ children {
+ node_id: 111
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 112
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 113
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 244
+ local_name: "__call__"
+ }
+ children {
+ node_id: 245
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_10\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_10\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [13.0, 38.0, 56.0, 70.0, 82.0, 94.0, 104.0, 114.0, 123.0, 131.0, 139.0, 148.0, 152.0, 153.0, 158.0, 163.0, 170.0, 174.0, 178.0, 180.0, 183.0, 186.0, 188.0, 190.0, 192.0, 196.0, 198.0, 201.0, 205.0, 208.0, 212.0, 215.0, 219.0, 221.0, 225.0, 227.0, 229.0, 232.0, 233.0, 236.0, 239.0, 242.0, 245.0, 248.0, 250.0, 252.0, 254.0, 256.0, 259.0, 261.0, 264.0, 267.0, 270.0, 272.0, 275.0, 278.0, 280.0, 283.0, 285.0, 287.0, 290.0, 293.0, 295.0, 297.0, 300.0, 303.0, 305.0, 308.0, 311.0, 313.0, 316.0, 319.0, 322.0, 325.0, 329.0, 331.0, 333.0, 336.0, 338.0, 340.0, 343.0, 345.0, 347.0, 347.0, 349.0, 351.0, 353.0, 355.0, 357.0, 359.0, 361.0, 363.0, 365.0, 368.0, 369.0, 371.0, 373.0, 375.0, 377.0, 380.0, 382.0, 385.0, 387.0, 389.0, 391.0, 394.0, 396.0, 398.0, 400.0, 403.0, 405.0, 408.0, 410.0, 412.0, 415.0, 417.0, 420.0, 422.0, 425.0, 427.0, 429.0, 432.0, 434.0, 437.0, 439.0, 442.0, 444.0, 446.0, 449.0, 451.0, 454.0, 456.0, 458.0, 461.0, 463.0, 466.0, 469.0, 472.0, 474.0, 476.0, 479.0, 482.0, 483.0, 486.0, 489.0, 492.0, 495.0, 498.0, 500.0, 503.0, 505.0, 508.0, 510.0, 513.0, 516.0, 519.0, 522.0, 524.0, 528.0, 530.0, 533.0, 536.0, 539.0, 541.0, 544.0, 547.0, 550.0, 553.0, 556.0, 559.0, 561.0, 563.0, 567.0, 570.0, 572.0, 575.0, 577.0, 580.0, 584.0, 586.0, 589.0, 592.0, 595.0, 598.0, 601.0, 605.0, 607.0, 611.0, 613.0, 617.0, 620.0, 623.0, 626.0, 629.0, 632.0, 635.0, 639.0, 642.0, 645.0, 648.0, 651.0, 654.0, 657.0, 660.0, 662.0, 666.0, 669.0, 672.0, 676.0, 679.0, 682.0, 685.0, 688.0, 690.0, 693.0, 696.0, 699.0, 702.0, 705.0, 709.0, 712.0, 714.0, 718.0, 721.0, 724.0, 726.0, 728.0, 729.0, 731.0, 734.0, 737.0, 741.0, 745.0, 748.0, 750.0, 753.0, 756.0, 760.0, 763.0, 766.0, 770.0, 773.0, 776.0, 779.0, 782.0, 786.0, 788.0, 793.0, 796.0, 798.0, 802.0, 805.0, 808.0, 811.0, 815.0, 818.0, 820.0, 824.0, 827.0, 829.0, 832.0, 835.0, 838.0, 842.0, 846.0, 849.0, 854.0, 857.0, 860.0, 864.0, 867.0, 871.0, 875.0, 879.0, 882.0, 887.0, 890.0, 893.0, 897.0, 901.0, 905.0, 908.0, 911.0, 915.0, 918.0, 921.0, 925.0, 929.0, 932.0, 934.0, 937.0, 940.0, 943.0, 946.0, 950.0, 953.0, 956.0, 961.0, 965.0, 969.0, 973.0, 976.0, 980.0, 982.0, 985.0, 990.0, 994.0, 997.0, 1001.0, 1005.0, 1007.0, 1010.0, 1014.0, 1018.0, 1022.0, 1025.0, 1028.0, 1033.0, 1035.0, 1038.0, 1042.0, 1047.0, 1052.0, 1056.0, 1060.0, 1063.0, 1067.0, 1071.0, 
1075.0, 1079.0, 1083.0, 1086.0, 1088.0, 1092.0, 1097.0, 1102.0, 1106.0, 1109.0, 1113.0, 1117.0, 1120.0, 1125.0, 1129.0, 1134.0, 1137.0, 1142.0, 1146.0, 1150.0, 1151.0, 1155.0, 1159.0, 1162.0, 1166.0, 1170.0, 1174.0, 1177.0, 1181.0, 1185.0, 1188.0, 1193.0, 1196.0, 1203.0, 1207.0, 1212.0, 1214.0, 1217.0, 1220.0, 1222.0, 1222.0, 1226.0, 1229.0, 1233.0, 1237.0, 1241.0, 1246.0, 1250.0, 1253.0, 1257.0, 1262.0, 1267.0, 1272.0, 1278.0, 1283.0, 1287.0, 1293.0, 1297.0, 1301.0, 1304.0, 1309.0, 1315.0, 1320.0, 1325.0, 1329.0, 1333.0, 1336.0, 1341.0, 1344.0, 1348.0, 1351.0, 1357.0, 1363.0, 1368.0, 1374.0, 1379.0, 1383.0, 1386.0, 1391.0, 1395.0, 1399.0, 1403.0, 1407.0, 1410.0, 1415.0, 1418.0, 1423.0, 1428.0, 1432.0, 1436.0, 1438.0, 1442.0, 1446.0, 1450.0, 1454.0, 1462.0, 1467.0, 1472.0, 1477.0, 1483.0, 1488.0, 1492.0, 1496.0, 1503.0, 1508.0, 1513.0, 1518.0, 1520.0, 1526.0, 1531.0, 1534.0, 1538.0, 1542.0, 1546.0, 1552.0, 1558.0, 1564.0, 1568.0, 1573.0, 1578.0, 1581.0, 1590.0, 1596.0, 1601.0, 1606.0, 1611.0, 1616.0, 1622.0, 1629.0, 1634.0, 1640.0, 1647.0, 1651.0, 1657.0, 1660.0, 1665.0, 1672.0, 1678.0, 1686.0, 1692.0, 1698.0, 1704.0, 1709.0, 1714.0, 1719.0, 1724.0, 1730.0, 1737.0, 1744.0, 1751.0, 1755.0, 1761.0, 1764.0, 1772.0, 1778.0, 1784.0, 1789.0, 1799.0, 1804.0, 1811.0, 1819.0, 1825.0, 1830.0, 1838.0, 1849.0, 1858.0, 1862.0, 1868.0, 1872.0, 1878.0, 1885.0, 1888.0, 1892.0, 1897.0, 1902.0, 1907.0, 1919.0, 1926.0, 1932.0, 1936.0, 1941.0, 1946.0, 1952.0, 1960.0, 1968.0, 1977.0, 1985.0, 1992.0, 1997.0, 2006.0, 2012.0, 2018.0, 2026.0, 2034.0, 2044.0, 2050.0, 2057.0, 2064.0, 2069.0, 2075.0, 2082.0, 2091.0, 2098.0, 2107.0, 2122.0, 2126.0, 2135.0, 2146.0, 2149.0, 2157.0, 2163.0, 2172.0, 2178.0, 2184.0, 2191.0, 2198.0, 2208.0, 2216.0, 2223.0, 2235.0, 2242.0, 2252.0, 2263.0, 2272.0, 2277.0, 2288.0, 2296.0, 2306.0, 2311.0, 2318.0, 2323.0, 2334.0, 2341.0, 2356.0, 2366.0, 2373.0, 2379.0, 2386.0, 2407.0, 2416.0, 2423.0, 2432.0, 2438.0, 2448.0, 2453.0, 2464.0, 2473.0, 2473.0, 2481.0, 2492.0, 2504.0, 2511.0, 2523.0, 2529.0, 2537.0, 2545.0, 2556.0, 2566.0, 2575.0, 2584.0, 2592.0, 2602.0, 2613.0, 2624.0, 2636.0, 2643.0, 2647.0, 2652.0, 2664.0, 2675.0, 2688.0, 2693.0, 2702.0, 2709.0, 2722.0, 2739.0, 2754.0, 2766.0, 2776.0, 2786.0, 2799.0, 2810.0, 2832.0, 2840.0, 2849.0, 2860.0, 2873.0, 2889.0, 2908.0, 2914.0, 2926.0, 2939.0, 2950.0, 2961.0, 2969.0, 2978.0, 2990.0, 2999.0, 3023.0, 3032.0, 3049.0, 3066.0, 3085.0, 3101.0, 3107.0, 3117.0, 3129.0, 3144.0, 3167.0, 3190.0, 3212.0, 3229.0, 3238.0, 3264.0, 3293.0, 3302.0, 3309.0, 3314.0, 3323.0, 3344.0, 3352.0, 3362.0, 3390.0, 3400.0, 3411.0, 3435.0, 3456.0, 3470.0, 3485.0, 3498.0, 3505.0, 3519.0, 3539.0, 3545.0, 3545.0, 3560.0, 3576.0, 3597.0, 3607.0, 3621.0, 3641.0, 3665.0, 3679.0, 3701.0, 3714.0, 3733.0, 3741.0, 3745.0, 3757.0, 3773.0, 3787.0, 3795.0, 3805.0, 3822.0, 3835.0, 3844.0, 3861.0, 3872.0, 3878.0, 3897.0, 3919.0, 3941.0, 3971.0, 4004.0, 4014.0, 4019.0, 4061.0, 4068.0, 4089.0, 4108.0, 4117.0, 4125.0, 4146.0, 4165.0, 4194.0, 4204.0, 4224.0, 4236.0, 4263.0, 4290.0, 4301.0, 4319.0, 4326.0, 4347.0, 4369.0, 4386.0, 4413.0, 4435.0, 4451.0, 4451.0, 4451.0, 4476.0, 4500.0, 4539.0, 4579.0, 4592.0, 4600.0, 4622.0, 4650.0, 4683.0, 4714.0, 4742.0, 4755.0, 4771.0, 4788.0, 4816.0, 4828.0, 4831.0, 4831.0, 4831.0, 4843.0, 4852.0, 4865.0, 4896.0, 4915.0, 4931.0, 4952.0, 4965.0, 4983.0, 5007.0, 5043.0, 5061.0, 5081.0, 5095.0, 5122.0, 5143.0, 5171.0, 5204.0, 5226.0, 5233.0, 5250.0, 5281.0, 5320.0, 5323.0, 5328.0, 5345.0, 5374.0, 5413.0, 5466.0, 5492.0, 5524.0, 5555.0, 5567.0, 5610.0, 
5676.0, 5701.0, 5716.0, 5744.0, 5768.0, 5795.0, 5818.0, 5854.0, 5906.0, 5934.0, 5960.0, 5975.0, 5993.0, 6025.0, 6034.0, 6051.0, 6082.0, 6106.0, 6125.0, 6159.0, 6187.0, 6242.0, 6287.0, 6311.0, 6332.0, 6348.0, 6358.0, 6368.0, 6377.0, 6402.0, 6407.0, 6428.0, 6450.0, 6475.0, 6498.0, 6505.0, 6533.0, 6565.0, 6580.0, 6595.0, 6611.0, 6654.0, 6658.0, 6705.0, 6751.0, 6786.0, 6828.0, 6876.0, 6896.0, 6948.0, 6964.0, 7065.0, 7082.0, 7118.0, 7184.0, 7214.0, 7271.0, 7310.0, 7357.0, 7405.0, 7506.0, 7613.0, 7641.0, 7675.0, 7720.0, 7781.0, 7833.0, 7860.0, 7898.0, 7929.0, 8044.0, 8104.0, 8148.0, 8236.0, 8273.0, 8313.0, 8349.0, 8381.0, 8409.0, 8498.0, 8507.0, 8524.0, 8570.0, 8607.0, 8630.0, 8637.0, 8675.0, 8700.0, 8714.0, 8734.0, 8776.0, 8836.0, 8854.0, 8867.0, 8868.0, 9065.0, 9113.0, 9121.0, 9241.0, 9357.0, 9360.0, 9585.0, 9613.0, 9684.0, 9727.0, 9751.0, 9777.0, 9802.0, 9889.0, 9903.0, 9914.0, 9978.0, 10061.0, 10192.0, 10213.0, 10345.0, 10369.0, 10404.0, 10430.0, 10471.0, 10481.0, 10489.0, 10492.0, 10494.0, 10524.0, 10554.0, 10557.0, 10560.0, 10562.0, 10641.0, 10716.0, 10842.0, 10897.0, 10967.0, 11053.0, 11128.0, 11137.0, 11328.0, 11336.0, 11401.0, 11532.0, 11573.0, 11860.0, 11880.0, 12013.0, 12305.0, 12358.0, 12386.0, 12404.0, 12456.0, 12456.0, 12476.0, 12615.0, 12677.0, 12981.0, 13094.0, 13197.0, 13708.0, 13717.0, 13788.0, 14049.0, 14112.0, 14224.0, 14257.0, 14681.0, 14901.0, 15006.0, 15071.0, 15100.0, 15248.0, 15669.0, 15877.0, 15953.0, 15953.0, 16066.0, 16072.0, 16271.0, 16292.0, 16386.0, 16490.0, 16633.0, 16670.0, 16834.0, 16896.0, 17543.0, 17693.0, 17800.0, 17859.0, 18397.0, 18811.0, 18826.0, 18971.0, 19304.0, 19319.0, 19695.0, 20378.0, 20865.0, 21313.0, 21330.0, 22321.0, 22760.0, 22770.0, 23783.0, 23785.0, 24525.0, 24844.0, 24848.0, 24964.0, 24966.0, 27468.0, 27478.0, 27555.0, 27555.0, 28215.0, 28219.0, 28336.0, 28490.0, 30213.0, 30228.0, 30242.0, 34116.0, 43518.0, 43518.0, 43518.0, 43852.0, 43852.0, 43852.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 114
+ local_name: "variables"
+ }
+ children {
+ node_id: 115
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 116
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 117
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 246
+ local_name: "__call__"
+ }
+ children {
+ node_id: 247
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Lambda\", \"name\": \"lambda_11\", \"trainable\": true, \"expects_training_arg\": true, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"lambda_11\", \"trainable\": true, \"dtype\": \"float32\", \"function\": {\"class_name\": \"__tuple__\", \"items\": [\"4wEAAAAAAAAAAwAAAAUAAAATAAAAc0QAAACIAHwAgwF9AXQAagF0AmoDfAGIAYMCdABqBIMCdAWI\\nAYMBGwB9AnQAagZ8AnQAagd8AoMBfAJ8AhQAZwNkA2QCjQJTACkETukBAAAAKQHaBGF4aXPp////\\n/ykI2gJ0ZtoEY2FzdNoOY29udHJpYl9sYXllcnPaCWJ1Y2tldGl6ZdoHZmxvYXQzMtoDbGVu2gZj\\nb25jYXTaBHNxcnQpA9oDb2Jz2gxleHBhbmRlZF9vYnPaAXgpAtoOZXhwYW5kX2RpbXNfb3DaCHF1\\nYW50aWxlqQD69C9leHBvcnQvaGRhMy9ib3JnbGV0L2xvY2FsX3JhbV9mc19kaXJzLzAueXVuZGlf\\nbXVwcGV0XzBfMTIyNzA4MzMuMTMueXVuZGkuMTk0NzMxNDE3OTYxLjhmNGY5Zjk4Y2I3YTMwNTUv\\nYnVpbGRfdGFyZ2V0X3RyYWluX3Bhcl9kOTc1NzUzNzAxNmEyZWI4L3RyYWluLnBhci9nb29nbGUz\\nL2xlYXJuaW5nL3NtYXJ0Y2hvaWNlcy9yZXNlYXJjaC9jbGllbnRzL2NvbXBpbGVyX29wdC9wb2xp\\nY3lfdHJhaW5pbmcvZmVhdHVyZV9vcHMucHnaDW5vcm1hbGl6YXRpb24wAAAAcwoAAAAAAQgBBAEK\\nARAB\\n\", null, {\"class_name\": \"__tuple__\", \"items\": [{\"class_name\": \"ExpandDims\", \"config\": {\"name\": \"expand_dims\", \"trainable\": true, \"dtype\": \"float32\", \"axis\": -1}}, [0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 
0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 2.0, 3.0, 4.0]]}]}, \"function_type\": \"lambda\", \"module\": \"google3.learning.smartchoices.research.clients.compiler_opt.policy_training.feature_ops\", \"output_shape\": null, \"output_shape_type\": \"raw\", \"output_shape_module\": null, \"arguments\": {}}}"
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 118
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 53
+ local_name: "variables"
+ }
+ children {
+ node_id: 119
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 120
+ local_name: "metrics"
+ }
+ children {
+ node_id: 121
+ local_name: "layers"
+ }
+ children {
+ node_id: 54
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 122
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 55
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 222
+ local_name: "__call__"
+ }
+ children {
+ node_id: 223
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 223
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 123
+ local_name: "variables"
+ }
+ children {
+ node_id: 124
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 125
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 126
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 248
+ local_name: "__call__"
+ }
+ children {
+ node_id: 249
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Flatten\", \"name\": \"flatten\", \"trainable\": true, \"expects_training_arg\": false, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"flatten\", \"trainable\": true, \"dtype\": \"float32\", \"data_format\": \"channels_last\"}, \"input_spec\": {\"class_name\": \"InputSpec\", \"config\": {\"dtype\": null, \"shape\": null, \"ndim\": null, \"max_ndim\": null, \"min_ndim\": 1, \"axes\": {}}}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "kernel"
+ }
+ children {
+ node_id: 11
+ local_name: "bias"
+ }
+ children {
+ node_id: 127
+ local_name: "variables"
+ }
+ children {
+ node_id: 128
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 129
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 130
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 250
+ local_name: "__call__"
+ }
+ children {
+ node_id: 251
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Dense\", \"name\": \"dense\", \"trainable\": true, \"expects_training_arg\": false, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"dense\", \"trainable\": true, \"dtype\": \"float32\", \"units\": 100, \"activation\": \"relu\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"VarianceScaling\", \"config\": {\"scale\": 2.0, \"mode\": \"fan_in\", \"distribution\": \"truncated_normal\", \"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"input_spec\": {\"class_name\": \"InputSpec\", \"config\": {\"dtype\": null, \"shape\": null, \"ndim\": null, \"max_ndim\": null, \"min_ndim\": 2, \"axes\": {\"-1\": 34}}}, \"build_input_shape\": {\"class_name\": \"TensorShape\", \"items\": [0, 34]}}"
+ }
+ }
+ nodes {
+ children {
+ node_id: 12
+ local_name: "kernel"
+ }
+ children {
+ node_id: 13
+ local_name: "bias"
+ }
+ children {
+ node_id: 131
+ local_name: "variables"
+ }
+ children {
+ node_id: 132
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 133
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 134
+ local_name: "keras_api"
+ }
+ children {
+ node_id: 252
+ local_name: "__call__"
+ }
+ children {
+ node_id: 253
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ user_object {
+ identifier: "_tf_keras_layer"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ metadata: "{\"class_name\": \"Dense\", \"name\": \"dense_1\", \"trainable\": true, \"expects_training_arg\": false, \"dtype\": \"float32\", \"batch_input_shape\": null, \"stateful\": false, \"config\": {\"name\": \"dense_1\", \"trainable\": true, \"dtype\": \"float32\", \"units\": 40, \"activation\": \"relu\", \"use_bias\": true, \"kernel_initializer\": {\"class_name\": \"VarianceScaling\", \"config\": {\"scale\": 2.0, \"mode\": \"fan_in\", \"distribution\": \"truncated_normal\", \"seed\": null, \"dtype\": \"float32\"}}, \"bias_initializer\": {\"class_name\": \"Zeros\", \"config\": {}}, \"kernel_regularizer\": null, \"bias_regularizer\": null, \"activity_regularizer\": null, \"kernel_constraint\": null, \"bias_constraint\": null}, \"input_spec\": {\"class_name\": \"InputSpec\", \"config\": {\"dtype\": null, \"shape\": null, \"ndim\": null, \"max_ndim\": null, \"min_ndim\": 2, \"axes\": {\"-1\": 100}}}, \"build_input_shape\": {\"class_name\": \"TensorShape\", \"items\": [0, 100]}}"
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 41
+ local_name: "0"
+ }
+ children {
+ node_id: 42
+ local_name: "1"
+ }
+ children {
+ node_id: 43
+ local_name: "2"
+ }
+ children {
+ node_id: 44
+ local_name: "3"
+ }
+ children {
+ node_id: 45
+ local_name: "4"
+ }
+ children {
+ node_id: 46
+ local_name: "5"
+ }
+ children {
+ node_id: 47
+ local_name: "6"
+ }
+ children {
+ node_id: 48
+ local_name: "7"
+ }
+ children {
+ node_id: 49
+ local_name: "8"
+ }
+ children {
+ node_id: 50
+ local_name: "9"
+ }
+ children {
+ node_id: 51
+ local_name: "10"
+ }
+ children {
+ node_id: 52
+ local_name: "11"
+ }
+ children {
+ node_id: 26
+ local_name: "12"
+ }
+ children {
+ node_id: 57
+ local_name: "13"
+ }
+ children {
+ node_id: 58
+ local_name: "14"
+ }
+ children {
+ node_id: 59
+ local_name: "15"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 135
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 70
+ local_name: "variables"
+ }
+ children {
+ node_id: 136
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 137
+ local_name: "metrics"
+ }
+ children {
+ node_id: 138
+ local_name: "layers"
+ }
+ children {
+ node_id: 71
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 139
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 72
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 224
+ local_name: "__call__"
+ }
+ children {
+ node_id: 225
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 225
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 140
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 74
+ local_name: "variables"
+ }
+ children {
+ node_id: 141
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 142
+ local_name: "metrics"
+ }
+ children {
+ node_id: 143
+ local_name: "layers"
+ }
+ children {
+ node_id: 75
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 144
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 76
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 226
+ local_name: "__call__"
+ }
+ children {
+ node_id: 227
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 227
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 145
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 78
+ local_name: "variables"
+ }
+ children {
+ node_id: 146
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 147
+ local_name: "metrics"
+ }
+ children {
+ node_id: 148
+ local_name: "layers"
+ }
+ children {
+ node_id: 79
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 149
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 80
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 228
+ local_name: "__call__"
+ }
+ children {
+ node_id: 229
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 229
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 150
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 82
+ local_name: "variables"
+ }
+ children {
+ node_id: 151
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 152
+ local_name: "metrics"
+ }
+ children {
+ node_id: 153
+ local_name: "layers"
+ }
+ children {
+ node_id: 83
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 154
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 84
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 230
+ local_name: "__call__"
+ }
+ children {
+ node_id: 231
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 231
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 155
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 86
+ local_name: "variables"
+ }
+ children {
+ node_id: 156
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 157
+ local_name: "metrics"
+ }
+ children {
+ node_id: 158
+ local_name: "layers"
+ }
+ children {
+ node_id: 87
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 159
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 88
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 232
+ local_name: "__call__"
+ }
+ children {
+ node_id: 233
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 233
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 160
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 90
+ local_name: "variables"
+ }
+ children {
+ node_id: 161
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 162
+ local_name: "metrics"
+ }
+ children {
+ node_id: 163
+ local_name: "layers"
+ }
+ children {
+ node_id: 91
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 164
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 92
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 234
+ local_name: "__call__"
+ }
+ children {
+ node_id: 235
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 235
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 165
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 94
+ local_name: "variables"
+ }
+ children {
+ node_id: 166
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 167
+ local_name: "metrics"
+ }
+ children {
+ node_id: 168
+ local_name: "layers"
+ }
+ children {
+ node_id: 95
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 169
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 96
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 236
+ local_name: "__call__"
+ }
+ children {
+ node_id: 237
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 237
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 170
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 98
+ local_name: "variables"
+ }
+ children {
+ node_id: 171
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 172
+ local_name: "metrics"
+ }
+ children {
+ node_id: 173
+ local_name: "layers"
+ }
+ children {
+ node_id: 99
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 174
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 100
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 238
+ local_name: "__call__"
+ }
+ children {
+ node_id: 239
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 239
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 175
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 102
+ local_name: "variables"
+ }
+ children {
+ node_id: 176
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 177
+ local_name: "metrics"
+ }
+ children {
+ node_id: 178
+ local_name: "layers"
+ }
+ children {
+ node_id: 103
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 179
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 104
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 240
+ local_name: "__call__"
+ }
+ children {
+ node_id: 241
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 241
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 180
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 106
+ local_name: "variables"
+ }
+ children {
+ node_id: 181
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 182
+ local_name: "metrics"
+ }
+ children {
+ node_id: 183
+ local_name: "layers"
+ }
+ children {
+ node_id: 107
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 184
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 108
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 242
+ local_name: "__call__"
+ }
+ children {
+ node_id: 243
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 243
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 185
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 110
+ local_name: "variables"
+ }
+ children {
+ node_id: 186
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 187
+ local_name: "metrics"
+ }
+ children {
+ node_id: 188
+ local_name: "layers"
+ }
+ children {
+ node_id: 111
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 189
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 112
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 244
+ local_name: "__call__"
+ }
+ children {
+ node_id: 245
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 245
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 190
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 114
+ local_name: "variables"
+ }
+ children {
+ node_id: 191
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 192
+ local_name: "metrics"
+ }
+ children {
+ node_id: 193
+ local_name: "layers"
+ }
+ children {
+ node_id: 115
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 194
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 116
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 246
+ local_name: "__call__"
+ }
+ children {
+ node_id: 247
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 247
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 195
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 123
+ local_name: "variables"
+ }
+ children {
+ node_id: 196
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 197
+ local_name: "metrics"
+ }
+ children {
+ node_id: 198
+ local_name: "layers"
+ }
+ children {
+ node_id: 124
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 199
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 125
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 248
+ local_name: "__call__"
+ }
+ children {
+ node_id: 249
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 249
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 10
+ local_name: "0"
+ }
+ children {
+ node_id: 11
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 200
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 127
+ local_name: "variables"
+ }
+ children {
+ node_id: 201
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 202
+ local_name: "metrics"
+ }
+ children {
+ node_id: 203
+ local_name: "layers"
+ }
+ children {
+ node_id: 128
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 204
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 129
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 250
+ local_name: "__call__"
+ }
+ children {
+ node_id: 251
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 251
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 12
+ local_name: "0"
+ }
+ children {
+ node_id: 13
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 12
+ local_name: "0"
+ }
+ children {
+ node_id: 13
+ local_name: "1"
+ }
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ children {
+ node_id: 205
+ local_name: "layer_metrics"
+ }
+ children {
+ node_id: 131
+ local_name: "variables"
+ }
+ children {
+ node_id: 206
+ local_name: "layer_regularization_losses"
+ }
+ children {
+ node_id: 207
+ local_name: "metrics"
+ }
+ children {
+ node_id: 208
+ local_name: "layers"
+ }
+ children {
+ node_id: 132
+ local_name: "regularization_losses"
+ }
+ children {
+ node_id: 209
+ local_name: "non_trainable_variables"
+ }
+ children {
+ node_id: 133
+ local_name: "trainable_variables"
+ }
+ children {
+ node_id: 252
+ local_name: "__call__"
+ }
+ children {
+ node_id: 253
+ local_name: "call_and_return_all_conditional_losses"
+ }
+ children {
+ node_id: 253
+ local_name: "call_and_return_conditional_losses"
+ }
+ user_object {
+ identifier: "_generic_user_object"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_dict_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ user_object {
+ identifier: "trackable_list_wrapper"
+ version {
+ producer: 1
+ min_consumer: 1
+ }
+ }
+ }
+ nodes {
+ function {
+ concrete_functions: "__inference_polymorphic_action_fn_4619080"
+ concrete_functions: "__inference_polymorphic_action_fn_946"
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "time_step"
+ }
+ values {
+ string_value: "policy_state"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ tuple_value {
+ values {
+ tuple_value {
+ }
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ concrete_functions: "__inference_function_722"
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ bare_concrete_function {
+ concrete_function_name: "__inference_<lambda>_728"
+ }
+ }
+ nodes {
+ bare_concrete_function {
+ concrete_function_name: "__inference_signature_wrapper_4619026"
+ argument_keywords: "callee_basic_block_count"
+ argument_keywords: "callee_conditionally_executed_blocks"
+ argument_keywords: "callee_users"
+ argument_keywords: "caller_basic_block_count"
+ argument_keywords: "caller_conditionally_executed_blocks"
+ argument_keywords: "caller_users"
+ argument_keywords: "callsite_height"
+ argument_keywords: "cost_estimate"
+ argument_keywords: "discount"
+ argument_keywords: "edge_count"
+ argument_keywords: "inlining_default"
+ argument_keywords: "node_count"
+ argument_keywords: "nr_ctant_params"
+ argument_keywords: "reward"
+ argument_keywords: "step_type"
+ }
+ }
+ nodes {
+ bare_concrete_function {
+ concrete_function_name: "__inference_signature_wrapper_4619033"
+ }
+ }
+ nodes {
+ bare_concrete_function {
+ concrete_function_name: "__inference_signature_wrapper_4619048"
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "observation"
+ }
+ values {
+ string_value: "step_type"
+ }
+ values {
+ string_value: "network_state"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "observation"
+ }
+ values {
+ string_value: "step_type"
+ }
+ values {
+ string_value: "network_state"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "observation"
+ }
+ values {
+ string_value: "step_type"
+ }
+ values {
+ string_value: "network_state"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "observation"
+ }
+ values {
+ string_value: "step_type"
+ }
+ values {
+ string_value: "network_state"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ values {
+ string_value: "mask"
+ }
+ values {
+ string_value: "training"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ list_value {
+ values {
+ none_value {
+ }
+ }
+ values {
+ bool_value: false
+ }
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ dict_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ nodes {
+ function {
+ function_spec {
+ fullargspec {
+ named_tuple_value {
+ name: "FullArgSpec"
+ values {
+ key: "args"
+ value {
+ list_value {
+ values {
+ string_value: "self"
+ }
+ values {
+ string_value: "inputs"
+ }
+ }
+ }
+ }
+ values {
+ key: "varargs"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "varkw"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "defaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlyargs"
+ value {
+ list_value {
+ }
+ }
+ }
+ values {
+ key: "kwonlydefaults"
+ value {
+ none_value {
+ }
+ }
+ }
+ values {
+ key: "annotations"
+ value {
+ dict_value {
+ }
+ }
+ }
+ }
+ }
+ is_method: true
+ input_signature {
+ none_value {
+ }
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_<lambda>_728"
+ value {
+ bound_inputs: 4
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ dict_value {
+ }
+ }
+ }
+ }
+ output_signature {
+ tensor_spec_value {
+ shape {
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_function_722"
+ value {
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ dict_value {
+ }
+ }
+ }
+ }
+ output_signature {
+ tuple_value {
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_polymorphic_action_fn_4619080"
+ value {
+ bound_inputs: 10
+ bound_inputs: 11
+ bound_inputs: 12
+ bound_inputs: 13
+ bound_inputs: 14
+ bound_inputs: 15
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ values {
+ named_tuple_value {
+ name: "TimeStep"
+ values {
+ key: "step_type"
+ value {
+ tensor_spec_value {
+ name: "time_step/step_type"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT32
+ }
+ }
+ }
+ values {
+ key: "reward"
+ value {
+ tensor_spec_value {
+ name: "time_step/reward"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_FLOAT
+ }
+ }
+ }
+ values {
+ key: "discount"
+ value {
+ tensor_spec_value {
+ name: "time_step/discount"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_FLOAT
+ }
+ }
+ }
+ values {
+ key: "observation"
+ value {
+ dict_value {
+ fields {
+ key: "callee_basic_block_count"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/callee_basic_block_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callee_conditionally_executed_blocks"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/callee_conditionally_executed_blocks"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callee_users"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/callee_users"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_basic_block_count"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/caller_basic_block_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_conditionally_executed_blocks"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/caller_conditionally_executed_blocks"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_users"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/caller_users"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callsite_height"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/callsite_height"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "cost_estimate"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/cost_estimate"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "edge_count"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/edge_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "inlining_default"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/inlining_default"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "node_count"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/node_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "nr_ctant_params"
+ value {
+ tensor_spec_value {
+ name: "time_step/observation/nr_ctant_params"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ values {
+ tuple_value {
+ }
+ }
+ }
+ }
+ values {
+ dict_value {
+ }
+ }
+ }
+ }
+ output_signature {
+ named_tuple_value {
+ name: "PolicyStep"
+ values {
+ key: "action"
+ value {
+ tensor_spec_value {
+ name: "action"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ values {
+ key: "state"
+ value {
+ tuple_value {
+ }
+ }
+ }
+ values {
+ key: "info"
+ value {
+ tuple_value {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_polymorphic_action_fn_946"
+ value {
+ bound_inputs: 10
+ bound_inputs: 11
+ bound_inputs: 12
+ bound_inputs: 13
+ bound_inputs: 14
+ bound_inputs: 15
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ values {
+ named_tuple_value {
+ name: "TimeStep"
+ values {
+ key: "step_type"
+ value {
+ tensor_spec_value {
+ name: "step_type"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT32
+ }
+ }
+ }
+ values {
+ key: "reward"
+ value {
+ tensor_spec_value {
+ name: "reward"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_FLOAT
+ }
+ }
+ }
+ values {
+ key: "discount"
+ value {
+ tensor_spec_value {
+ name: "discount"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_FLOAT
+ }
+ }
+ }
+ values {
+ key: "observation"
+ value {
+ dict_value {
+ fields {
+ key: "callee_basic_block_count"
+ value {
+ tensor_spec_value {
+ name: "callee_basic_block_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callee_conditionally_executed_blocks"
+ value {
+ tensor_spec_value {
+ name: "callee_conditionally_executed_blocks"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callee_users"
+ value {
+ tensor_spec_value {
+ name: "callee_users"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_basic_block_count"
+ value {
+ tensor_spec_value {
+ name: "caller_basic_block_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_conditionally_executed_blocks"
+ value {
+ tensor_spec_value {
+ name: "caller_conditionally_executed_blocks"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_users"
+ value {
+ tensor_spec_value {
+ name: "caller_users"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callsite_height"
+ value {
+ tensor_spec_value {
+ name: "callsite_height"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "cost_estimate"
+ value {
+ tensor_spec_value {
+ name: "cost_estimate"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "edge_count"
+ value {
+ tensor_spec_value {
+ name: "edge_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "inlining_default"
+ value {
+ tensor_spec_value {
+ name: "inlining_default"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "node_count"
+ value {
+ tensor_spec_value {
+ name: "node_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "nr_ctant_params"
+ value {
+ tensor_spec_value {
+ name: "nr_ctant_params"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ values {
+ tuple_value {
+ }
+ }
+ }
+ }
+ values {
+ dict_value {
+ }
+ }
+ }
+ }
+ output_signature {
+ named_tuple_value {
+ name: "PolicyStep"
+ values {
+ key: "action"
+ value {
+ tensor_spec_value {
+ name: "action"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ values {
+ key: "state"
+ value {
+ tuple_value {
+ }
+ }
+ }
+ values {
+ key: "info"
+ value {
+ tuple_value {
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_signature_wrapper_4619026"
+ value {
+ bound_inputs: 10
+ bound_inputs: 11
+ bound_inputs: 12
+ bound_inputs: 13
+ bound_inputs: 14
+ bound_inputs: 15
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ dict_value {
+ fields {
+ key: "callee_basic_block_count"
+ value {
+ tensor_spec_value {
+ name: "callee_basic_block_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callee_conditionally_executed_blocks"
+ value {
+ tensor_spec_value {
+ name: "callee_conditionally_executed_blocks"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callee_users"
+ value {
+ tensor_spec_value {
+ name: "callee_users"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_basic_block_count"
+ value {
+ tensor_spec_value {
+ name: "caller_basic_block_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_conditionally_executed_blocks"
+ value {
+ tensor_spec_value {
+ name: "caller_conditionally_executed_blocks"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "caller_users"
+ value {
+ tensor_spec_value {
+ name: "caller_users"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "callsite_height"
+ value {
+ tensor_spec_value {
+ name: "callsite_height"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "cost_estimate"
+ value {
+ tensor_spec_value {
+ name: "cost_estimate"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "discount"
+ value {
+ tensor_spec_value {
+ name: "discount"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_FLOAT
+ }
+ }
+ }
+ fields {
+ key: "edge_count"
+ value {
+ tensor_spec_value {
+ name: "edge_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "inlining_default"
+ value {
+ tensor_spec_value {
+ name: "inlining_default"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "node_count"
+ value {
+ tensor_spec_value {
+ name: "node_count"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "nr_ctant_params"
+ value {
+ tensor_spec_value {
+ name: "nr_ctant_params"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ fields {
+ key: "reward"
+ value {
+ tensor_spec_value {
+ name: "reward"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_FLOAT
+ }
+ }
+ }
+ fields {
+ key: "step_type"
+ value {
+ tensor_spec_value {
+ name: "step_type"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT32
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ output_signature {
+ dict_value {
+ fields {
+ key: "inlining_decision"
+ value {
+ tensor_spec_value {
+ name: "inlining_decision"
+ shape {
+ dim {
+ size: 1
+ }
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_signature_wrapper_4619033"
+ value {
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ dict_value {
+ }
+ }
+ }
+ }
+ output_signature {
+ dict_value {
+ }
+ }
+ }
+ }
+ concrete_functions {
+ key: "__inference_signature_wrapper_4619048"
+ value {
+ bound_inputs: 4
+ canonicalized_input_signature {
+ tuple_value {
+ values {
+ tuple_value {
+ }
+ }
+ values {
+ dict_value {
+ }
+ }
+ }
+ }
+ output_signature {
+ dict_value {
+ fields {
+ key: "int64"
+ value {
+ tensor_spec_value {
+ name: "int64"
+ shape {
+ }
+ dtype: DT_INT64
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+}
+
diff --git a/llvm/lib/Analysis/models/inliner/variables/variables.data-00000-of-00001 b/llvm/lib/Analysis/models/inliner/variables/variables.data-00000-of-00001
new file mode 100644
index 000000000000..ee7d7060867e
--- /dev/null
+++ b/llvm/lib/Analysis/models/inliner/variables/variables.data-00000-of-00001
Binary files differ
diff --git a/llvm/lib/Analysis/models/inliner/variables/variables.index b/llvm/lib/Analysis/models/inliner/variables/variables.index
new file mode 100644
index 000000000000..7e0c10c1780e
--- /dev/null
+++ b/llvm/lib/Analysis/models/inliner/variables/variables.index
Binary files differ
diff --git a/llvm/lib/AsmParser/LLLexer.cpp b/llvm/lib/AsmParser/LLLexer.cpp
index d96b5e0bff5a..777ce3abdddd 100644
--- a/llvm/lib/AsmParser/LLLexer.cpp
+++ b/llvm/lib/AsmParser/LLLexer.cpp
@@ -658,15 +658,19 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(noinline);
KEYWORD(norecurse);
KEYWORD(nonlazybind);
+ KEYWORD(nomerge);
KEYWORD(nonnull);
KEYWORD(noredzone);
KEYWORD(noreturn);
KEYWORD(nosync);
KEYWORD(nocf_check);
+ KEYWORD(noundef);
KEYWORD(nounwind);
+ KEYWORD(null_pointer_is_valid);
KEYWORD(optforfuzzing);
KEYWORD(optnone);
KEYWORD(optsize);
+ KEYWORD(preallocated);
KEYWORD(readnone);
KEYWORD(readonly);
KEYWORD(returned);
@@ -738,6 +742,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(name);
KEYWORD(summaries);
KEYWORD(flags);
+ KEYWORD(blockcount);
KEYWORD(linkage);
KEYWORD(notEligibleToImport);
KEYWORD(live);
@@ -754,6 +759,8 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(alwaysInline);
KEYWORD(calls);
KEYWORD(callee);
+ KEYWORD(params);
+ KEYWORD(param);
KEYWORD(hotness);
KEYWORD(unknown);
KEYWORD(hot);
@@ -788,6 +795,7 @@ lltok::Kind LLLexer::LexIdentifier() {
KEYWORD(sizeM1);
KEYWORD(bitMask);
KEYWORD(inlineBits);
+ KEYWORD(vcall_visibility);
KEYWORD(wpdResolutions);
KEYWORD(wpdRes);
KEYWORD(indir);
@@ -817,6 +825,7 @@ lltok::Kind LLLexer::LexIdentifier() {
TYPEKEYWORD("void", Type::getVoidTy(Context));
TYPEKEYWORD("half", Type::getHalfTy(Context));
+ TYPEKEYWORD("bfloat", Type::getBFloatTy(Context));
TYPEKEYWORD("float", Type::getFloatTy(Context));
TYPEKEYWORD("double", Type::getDoubleTy(Context));
TYPEKEYWORD("x86_fp80", Type::getX86_FP80Ty(Context));
@@ -982,11 +991,13 @@ lltok::Kind LLLexer::LexIdentifier() {
/// HexFP128Constant 0xL[0-9A-Fa-f]+
/// HexPPC128Constant 0xM[0-9A-Fa-f]+
/// HexHalfConstant 0xH[0-9A-Fa-f]+
+/// HexBFloatConstant 0xR[0-9A-Fa-f]+
lltok::Kind LLLexer::Lex0x() {
CurPtr = TokStart + 2;
char Kind;
- if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H') {
+ if ((CurPtr[0] >= 'K' && CurPtr[0] <= 'M') || CurPtr[0] == 'H' ||
+ CurPtr[0] == 'R') {
Kind = *CurPtr++;
} else {
Kind = 'J';
@@ -1004,7 +1015,7 @@ lltok::Kind LLLexer::Lex0x() {
if (Kind == 'J') {
// HexFPConstant - Floating point constant represented in IEEE format as a
// hexadecimal number for when exponential notation is not precise enough.
- // Half, Float, and double only.
+ // Half, BFloat, Float, and double only.
APFloatVal = APFloat(APFloat::IEEEdouble(),
APInt(64, HexIntToVal(TokStart + 2, CurPtr)));
return lltok::APFloat;
@@ -1032,6 +1043,11 @@ lltok::Kind LLLexer::Lex0x() {
APFloatVal = APFloat(APFloat::IEEEhalf(),
APInt(16,HexIntToVal(TokStart+3, CurPtr)));
return lltok::APFloat;
+ case 'R':
+ // Brain floating point
+ APFloatVal = APFloat(APFloat::BFloat(),
+ APInt(16, HexIntToVal(TokStart + 3, CurPtr)));
+ return lltok::APFloat;
}
}
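(Illustrative aside, not part of the patch: the lexer hunks above add "bfloat" as a type keyword and accept a 0xR-prefixed 16-bit hexadecimal form for brain-float constants, alongside the new nomerge, noundef, null_pointer_is_valid and preallocated attribute keywords. A minimal .ll sketch of the bfloat syntax this is meant to admit, assuming the matching parser/ConvertValIDToValue support later in this patch:

    define bfloat @one() {
      ; 0x3F80 is 1.0 in bfloat16: sign 0, exponent 127, mantissa 0
      ret bfloat 0xR3F80
    }
)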
diff --git a/llvm/lib/AsmParser/LLLexer.h b/llvm/lib/AsmParser/LLLexer.h
index 4d3a2920e937..c97d9781c33b 100644
--- a/llvm/lib/AsmParser/LLLexer.h
+++ b/llvm/lib/AsmParser/LLLexer.h
@@ -16,13 +16,13 @@
#include "LLToken.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/APSInt.h"
-#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/SMLoc.h"
#include <string>
namespace llvm {
- class MemoryBuffer;
class Type;
class SMDiagnostic;
+ class SourceMgr;
class LLVMContext;
class LLLexer {
diff --git a/llvm/lib/AsmParser/LLParser.cpp b/llvm/lib/AsmParser/LLParser.cpp
index 1a17f633ae16..c9f21ee83826 100644
--- a/llvm/lib/AsmParser/LLParser.cpp
+++ b/llvm/lib/AsmParser/LLParser.cpp
@@ -11,9 +11,10 @@
//===----------------------------------------------------------------------===//
#include "LLParser.h"
+#include "LLToken.h"
+#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/None.h"
-#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/AsmParser/SlotMapping.h"
@@ -23,6 +24,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Comdat.h"
+#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
@@ -30,14 +32,10 @@
#include "llvm/IR/GlobalIFunc.h"
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/Instruction.h"
-#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/Operator.h"
-#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/IR/ValueSymbolTable.h"
#include "llvm/Support/Casting.h"
@@ -61,7 +59,8 @@ static std::string getTypeString(Type *T) {
}
/// Run: module ::= toplevelentity*
-bool LLParser::Run() {
+bool LLParser::Run(bool UpgradeDebugInfo,
+ DataLayoutCallbackTy DataLayoutCallback) {
// Prime the lexer.
Lex.Lex();
@@ -70,7 +69,15 @@ bool LLParser::Run() {
Lex.getLoc(),
"Can't read textual IR with a Context that discards named Values");
- return ParseTopLevelEntities() || ValidateEndOfModule() ||
+ if (M) {
+ if (ParseTargetDefinitions())
+ return true;
+
+ if (auto LayoutOverride = DataLayoutCallback(M->getTargetTriple()))
+ M->setDataLayout(*LayoutOverride);
+ }
+
+ return ParseTopLevelEntities() || ValidateEndOfModule(UpgradeDebugInfo) ||
ValidateEndOfIndex();
}
@@ -118,7 +125,7 @@ void LLParser::restoreParsingState(const SlotMapping *Slots) {
/// ValidateEndOfModule - Do final validity and sanity checks at the end of the
/// module.
-bool LLParser::ValidateEndOfModule() {
+bool LLParser::ValidateEndOfModule(bool UpgradeDebugInfo) {
if (!M)
return false;
// Handle any function attribute group forward references.
@@ -294,6 +301,23 @@ bool LLParser::ValidateEndOfIndex() {
// Top-Level Entities
//===----------------------------------------------------------------------===//
+bool LLParser::ParseTargetDefinitions() {
+ while (true) {
+ switch (Lex.getKind()) {
+ case lltok::kw_target:
+ if (ParseTargetDefinition())
+ return true;
+ break;
+ case lltok::kw_source_filename:
+ if (ParseSourceFileName())
+ return true;
+ break;
+ default:
+ return false;
+ }
+ }
+}
+
bool LLParser::ParseTopLevelEntities() {
// If there is no Module, then parse just the summary index entries.
if (!M) {
@@ -322,11 +346,6 @@ bool LLParser::ParseTopLevelEntities() {
case lltok::kw_declare: if (ParseDeclare()) return true; break;
case lltok::kw_define: if (ParseDefine()) return true; break;
case lltok::kw_module: if (ParseModuleAsm()) return true; break;
- case lltok::kw_target: if (ParseTargetDefinition()) return true; break;
- case lltok::kw_source_filename:
- if (ParseSourceFileName())
- return true;
- break;
case lltok::kw_deplibs: if (ParseDepLibs()) return true; break;
case lltok::LocalVarID: if (ParseUnnamedType()) return true; break;
case lltok::LocalVar: if (ParseNamedType()) return true; break;
@@ -383,8 +402,7 @@ bool LLParser::ParseTargetDefinition() {
if (ParseToken(lltok::equal, "expected '=' after target datalayout") ||
ParseStringConstant(Str))
return true;
- if (DataLayoutStr.empty())
- M->setDataLayout(Str);
+ M->setDataLayout(Str);
return false;
}
}
@@ -835,6 +853,12 @@ bool LLParser::ParseSummaryEntry() {
case lltok::kw_typeidCompatibleVTable:
result = ParseTypeIdCompatibleVtableEntry(SummaryID);
break;
+ case lltok::kw_flags:
+ result = ParseSummaryIndexFlags();
+ break;
+ case lltok::kw_blockcount:
+ result = ParseBlockCount();
+ break;
default:
result = Error(Lex.getLoc(), "unexpected summary kind");
break;
@@ -1286,12 +1310,15 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
B.addAttribute(Attribute::NoImplicitFloat); break;
case lltok::kw_noinline: B.addAttribute(Attribute::NoInline); break;
case lltok::kw_nonlazybind: B.addAttribute(Attribute::NonLazyBind); break;
+ case lltok::kw_nomerge: B.addAttribute(Attribute::NoMerge); break;
case lltok::kw_noredzone: B.addAttribute(Attribute::NoRedZone); break;
case lltok::kw_noreturn: B.addAttribute(Attribute::NoReturn); break;
case lltok::kw_nosync: B.addAttribute(Attribute::NoSync); break;
case lltok::kw_nocf_check: B.addAttribute(Attribute::NoCfCheck); break;
case lltok::kw_norecurse: B.addAttribute(Attribute::NoRecurse); break;
case lltok::kw_nounwind: B.addAttribute(Attribute::NoUnwind); break;
+ case lltok::kw_null_pointer_is_valid:
+ B.addAttribute(Attribute::NullPointerIsValid); break;
case lltok::kw_optforfuzzing:
B.addAttribute(Attribute::OptForFuzzing); break;
case lltok::kw_optnone: B.addAttribute(Attribute::OptimizeNone); break;
@@ -1325,6 +1352,13 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_uwtable: B.addAttribute(Attribute::UWTable); break;
case lltok::kw_willreturn: B.addAttribute(Attribute::WillReturn); break;
case lltok::kw_writeonly: B.addAttribute(Attribute::WriteOnly); break;
+ case lltok::kw_preallocated: {
+ Type *Ty;
+ if (ParsePreallocated(Ty))
+ return true;
+ B.addPreallocatedAttr(Ty);
+ break;
+ }
// Error handling.
case lltok::kw_inreg:
@@ -1340,6 +1374,7 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
case lltok::kw_inalloca:
case lltok::kw_nest:
case lltok::kw_noalias:
+ case lltok::kw_noundef:
case lltok::kw_nocapture:
case lltok::kw_nonnull:
case lltok::kw_returned:
@@ -1353,7 +1388,9 @@ bool LLParser::ParseFnAttributeValuePairs(AttrBuilder &B,
break;
}
- Lex.Lex();
+ // ParsePreallocated() consumes token
+ if (Token != lltok::kw_preallocated)
+ Lex.Lex();
}
}
@@ -1605,7 +1642,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
}
case lltok::kw_align: {
MaybeAlign Alignment;
- if (ParseOptionalAlignment(Alignment))
+ if (ParseOptionalAlignment(Alignment, true))
return true;
B.addAlignmentAttr(Alignment);
continue;
@@ -1617,6 +1654,13 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
B.addByValAttr(Ty);
continue;
}
+ case lltok::kw_preallocated: {
+ Type *Ty;
+ if (ParsePreallocated(Ty))
+ return true;
+ B.addPreallocatedAttr(Ty);
+ continue;
+ }
case lltok::kw_dereferenceable: {
uint64_t Bytes;
if (ParseOptionalDerefAttrBytes(lltok::kw_dereferenceable, Bytes))
@@ -1634,6 +1678,9 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_inalloca: B.addAttribute(Attribute::InAlloca); break;
case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break;
case lltok::kw_nest: B.addAttribute(Attribute::Nest); break;
+ case lltok::kw_noundef:
+ B.addAttribute(Attribute::NoUndef);
+ break;
case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break;
case lltok::kw_nocapture: B.addAttribute(Attribute::NoCapture); break;
case lltok::kw_nofree: B.addAttribute(Attribute::NoFree); break;
@@ -1662,6 +1709,7 @@ bool LLParser::ParseOptionalParamAttrs(AttrBuilder &B) {
case lltok::kw_noimplicitfloat:
case lltok::kw_noinline:
case lltok::kw_nonlazybind:
+ case lltok::kw_nomerge:
case lltok::kw_noredzone:
case lltok::kw_noreturn:
case lltok::kw_nocf_check:
@@ -1730,6 +1778,9 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
}
case lltok::kw_inreg: B.addAttribute(Attribute::InReg); break;
case lltok::kw_noalias: B.addAttribute(Attribute::NoAlias); break;
+ case lltok::kw_noundef:
+ B.addAttribute(Attribute::NoUndef);
+ break;
case lltok::kw_nonnull: B.addAttribute(Attribute::NonNull); break;
case lltok::kw_signext: B.addAttribute(Attribute::SExt); break;
case lltok::kw_zeroext: B.addAttribute(Attribute::ZExt); break;
@@ -1761,6 +1812,7 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_noimplicitfloat:
case lltok::kw_noinline:
case lltok::kw_nonlazybind:
+ case lltok::kw_nomerge:
case lltok::kw_noredzone:
case lltok::kw_noreturn:
case lltok::kw_nocf_check:
@@ -1784,10 +1836,15 @@ bool LLParser::ParseOptionalReturnAttrs(AttrBuilder &B) {
case lltok::kw_uwtable:
HaveError |= Error(Lex.getLoc(), "invalid use of function-only attribute");
break;
-
case lltok::kw_readnone:
case lltok::kw_readonly:
HaveError |= Error(Lex.getLoc(), "invalid use of attribute on return type");
+ break;
+ case lltok::kw_preallocated:
+ HaveError |=
+ Error(Lex.getLoc(),
+ "invalid use of parameter-only/call site-only attribute");
+ break;
}
Lex.Lex();
@@ -2077,14 +2134,26 @@ bool LLParser::ParseOptionalFunctionMetadata(Function &F) {
/// ParseOptionalAlignment
/// ::= /* empty */
/// ::= 'align' 4
-bool LLParser::ParseOptionalAlignment(MaybeAlign &Alignment) {
+bool LLParser::ParseOptionalAlignment(MaybeAlign &Alignment, bool AllowParens) {
Alignment = None;
if (!EatIfPresent(lltok::kw_align))
return false;
LocTy AlignLoc = Lex.getLoc();
uint32_t Value = 0;
+
+ LocTy ParenLoc = Lex.getLoc();
+ bool HaveParens = false;
+ if (AllowParens) {
+ if (EatIfPresent(lltok::lparen))
+ HaveParens = true;
+ }
+
if (ParseUInt32(Value))
return true;
+
+ if (HaveParens && !EatIfPresent(lltok::rparen))
+ return Error(ParenLoc, "expected ')'");
+
if (!isPowerOf2_32(Value))
return Error(AlignLoc, "alignment is not a power of two");
if (Value > Value::MaximumAlignment)
@@ -2499,6 +2568,21 @@ bool LLParser::ParseByValWithOptionalType(Type *&Result) {
return false;
}
+/// ParsePreallocated
+/// ::= preallocated(<ty>)
+bool LLParser::ParsePreallocated(Type *&Result) {
+ Result = nullptr;
+ if (!EatIfPresent(lltok::kw_preallocated))
+ return true;
+ if (!EatIfPresent(lltok::lparen))
+ return Error(Lex.getLoc(), "expected '('");
+ if (ParseType(Result))
+ return true;
+ if (!EatIfPresent(lltok::rparen))
+ return Error(Lex.getLoc(), "expected ')'");
+ return false;
+}
+
/// ParseOptionalOperandBundles
/// ::= /*empty*/
/// ::= '[' OperandBundle [, OperandBundle ]* ']'
@@ -3416,7 +3500,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
ID.Kind = ValID::t_Constant;
return false;
}
-
+
// Unary Operators.
case lltok::kw_fneg: {
unsigned Opc = Lex.getUIntVal();
@@ -3426,7 +3510,7 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
ParseGlobalTypeAndValue(Val) ||
ParseToken(lltok::rparen, "expected ')' in unary constantexpr"))
return true;
-
+
// Check that the type is valid for the operator.
switch (Opc) {
case Instruction::FNeg:
@@ -3586,15 +3670,17 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
"explicit pointee type doesn't match operand's pointee type");
unsigned GEPWidth =
- BaseType->isVectorTy() ? BaseType->getVectorNumElements() : 0;
+ BaseType->isVectorTy()
+ ? cast<FixedVectorType>(BaseType)->getNumElements()
+ : 0;
ArrayRef<Constant *> Indices(Elts.begin() + 1, Elts.end());
for (Constant *Val : Indices) {
Type *ValTy = Val->getType();
if (!ValTy->isIntOrIntVectorTy())
return Error(ID.Loc, "getelementptr index must be an integer");
- if (ValTy->isVectorTy()) {
- unsigned ValNumEl = ValTy->getVectorNumElements();
+ if (auto *ValVTy = dyn_cast<VectorType>(ValTy)) {
+ unsigned ValNumEl = cast<FixedVectorType>(ValVTy)->getNumElements();
if (GEPWidth && (ValNumEl != GEPWidth))
return Error(
ID.Loc,
@@ -3633,8 +3719,9 @@ bool LLParser::ParseValID(ValID &ID, PerFunctionState *PFS) {
return Error(ID.Loc, "expected three operands to shufflevector");
if (!ShuffleVectorInst::isValidOperands(Elts[0], Elts[1], Elts[2]))
return Error(ID.Loc, "invalid operands to shufflevector");
- ID.ConstantVal =
- ConstantExpr::getShuffleVector(Elts[0], Elts[1],Elts[2]);
+ SmallVector<int, 16> Mask;
+ ShuffleVectorInst::getShuffleMask(cast<Constant>(Elts[2]), Mask);
+ ID.ConstantVal = ConstantExpr::getShuffleVector(Elts[0], Elts[1], Mask);
} else if (Opc == Instruction::ExtractElement) {
if (Elts.size() != 2)
return Error(ID.Loc, "expected two operands to extractelement");
@@ -3695,7 +3782,7 @@ bool LLParser::parseOptionalComdat(StringRef GlobalName, Comdat *&C) {
} else {
if (GlobalName.empty())
return TokError("comdat cannot be unnamed");
- C = getComdat(GlobalName, KwLoc);
+ C = getComdat(std::string(GlobalName), KwLoc);
}
return false;
@@ -3867,6 +3954,10 @@ struct DISPFlagField : public MDFieldImpl<DISubprogram::DISPFlags> {
DISPFlagField() : MDFieldImpl(DISubprogram::SPFlagZero) {}
};
+struct MDAPSIntField : public MDFieldImpl<APSInt> {
+ MDAPSIntField() : ImplTy(APSInt()) {}
+};
+
struct MDSignedField : public MDFieldImpl<int64_t> {
int64_t Min;
int64_t Max;
@@ -3946,6 +4037,16 @@ struct MDSignedOrUnsignedField
namespace llvm {
template <>
+bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDAPSIntField &Result) {
+ if (Lex.getKind() != lltok::APSInt)
+ return TokError("expected integer");
+
+ Result.assign(Lex.getAPSIntVal());
+ Lex.Lex();
+ return false;
+}
+
+template <>
bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
MDUnsignedField &Result) {
if (Lex.getKind() != lltok::APSInt || Lex.getAPSIntVal().isSigned())
@@ -4277,27 +4378,6 @@ bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
}
template <>
-bool LLParser::ParseMDField(LocTy Loc, StringRef Name,
- MDSignedOrUnsignedField &Result) {
- if (Lex.getKind() != lltok::APSInt)
- return false;
-
- if (Lex.getAPSIntVal().isSigned()) {
- MDSignedField Res = Result.A;
- if (ParseMDField(Loc, Name, Res))
- return true;
- Result.assign(Res);
- return false;
- }
-
- MDUnsignedField Res = Result.B;
- if (ParseMDField(Loc, Name, Res))
- return true;
- Result.assign(Res);
- return false;
-}
-
-template <>
bool LLParser::ParseMDField(LocTy Loc, StringRef Name, MDStringField &Result) {
LocTy ValueLoc = Lex.getLoc();
std::string S;
@@ -4446,21 +4526,41 @@ bool LLParser::ParseGenericDINode(MDNode *&Result, bool IsDistinct) {
/// ParseDISubrange:
/// ::= !DISubrange(count: 30, lowerBound: 2)
/// ::= !DISubrange(count: !node, lowerBound: 2)
+/// ::= !DISubrange(lowerBound: !node1, upperBound: !node2, stride: !node3)
bool LLParser::ParseDISubrange(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
- REQUIRED(count, MDSignedOrMDField, (-1, -1, INT64_MAX, false)); \
- OPTIONAL(lowerBound, MDSignedField, );
+ OPTIONAL(count, MDSignedOrMDField, (-1, -1, INT64_MAX, false)); \
+ OPTIONAL(lowerBound, MDSignedOrMDField, ); \
+ OPTIONAL(upperBound, MDSignedOrMDField, ); \
+ OPTIONAL(stride, MDSignedOrMDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
+ Metadata *Count = nullptr;
+ Metadata *LowerBound = nullptr;
+ Metadata *UpperBound = nullptr;
+ Metadata *Stride = nullptr;
if (count.isMDSignedField())
- Result = GET_OR_DISTINCT(
- DISubrange, (Context, count.getMDSignedValue(), lowerBound.Val));
+ Count = ConstantAsMetadata::get(ConstantInt::getSigned(
+ Type::getInt64Ty(Context), count.getMDSignedValue()));
else if (count.isMDField())
- Result = GET_OR_DISTINCT(
- DISubrange, (Context, count.getMDFieldValue(), lowerBound.Val));
- else
- return true;
+ Count = count.getMDFieldValue();
+
+ auto convToMetadata = [&](MDSignedOrMDField Bound) -> Metadata * {
+ if (Bound.isMDSignedField())
+ return ConstantAsMetadata::get(ConstantInt::getSigned(
+ Type::getInt64Ty(Context), Bound.getMDSignedValue()));
+ if (Bound.isMDField())
+ return Bound.getMDFieldValue();
+ return nullptr;
+ };
+
+ LowerBound = convToMetadata(lowerBound);
+ UpperBound = convToMetadata(upperBound);
+ Stride = convToMetadata(stride);
+
+ Result = GET_OR_DISTINCT(DISubrange,
+ (Context, Count, LowerBound, UpperBound, Stride));
return false;
}
@@ -4470,17 +4570,20 @@ bool LLParser::ParseDISubrange(MDNode *&Result, bool IsDistinct) {
bool LLParser::ParseDIEnumerator(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(name, MDStringField, ); \
- REQUIRED(value, MDSignedOrUnsignedField, ); \
+ REQUIRED(value, MDAPSIntField, ); \
OPTIONAL(isUnsigned, MDBoolField, (false));
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- if (isUnsigned.Val && value.isMDSignedField())
+ if (isUnsigned.Val && value.Val.isNegative())
return TokError("unsigned enumerator with negative value");
- int64_t Value = value.isMDSignedField()
- ? value.getMDSignedValue()
- : static_cast<int64_t>(value.getMDUnsignedValue());
+ APSInt Value(value.Val);
+ // Add a leading zero so that unsigned values with the msb set are not
+ // mistaken for negative values when used for signed enumerators.
+ if (!isUnsigned.Val && value.Val.isUnsigned() && value.Val.isSignBitSet())
+ Value = Value.zext(Value.getBitWidth() + 1);
+
Result =
GET_OR_DISTINCT(DIEnumerator, (Context, Value, isUnsigned.Val, name.Val));
@@ -4557,7 +4660,8 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
OPTIONAL(vtableHolder, MDField, ); \
OPTIONAL(templateParams, MDField, ); \
OPTIONAL(identifier, MDStringField, ); \
- OPTIONAL(discriminator, MDField, );
+ OPTIONAL(discriminator, MDField, ); \
+ OPTIONAL(dataLocation, MDField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4566,8 +4670,8 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
if (auto *CT = DICompositeType::buildODRType(
Context, *identifier.Val, tag.Val, name.Val, file.Val, line.Val,
scope.Val, baseType.Val, size.Val, align.Val, offset.Val, flags.Val,
- elements.Val, runtimeLang.Val, vtableHolder.Val,
- templateParams.Val, discriminator.Val)) {
+ elements.Val, runtimeLang.Val, vtableHolder.Val, templateParams.Val,
+ discriminator.Val, dataLocation.Val)) {
Result = CT;
return false;
}
@@ -4579,7 +4683,7 @@ bool LLParser::ParseDICompositeType(MDNode *&Result, bool IsDistinct) {
(Context, tag.Val, name.Val, file.Val, line.Val, scope.Val, baseType.Val,
size.Val, align.Val, offset.Val, flags.Val, elements.Val,
runtimeLang.Val, vtableHolder.Val, templateParams.Val, identifier.Val,
- discriminator.Val));
+ discriminator.Val, dataLocation.Val));
return false;
}
@@ -4633,7 +4737,8 @@ bool LLParser::ParseDIFile(MDNode *&Result, bool IsDistinct) {
/// isOptimized: true, flags: "-O2", runtimeVersion: 1,
/// splitDebugFilename: "abc.debug",
/// emissionKind: FullDebug, enums: !1, retainedTypes: !2,
-/// globals: !4, imports: !5, macros: !6, dwoId: 0x0abcd)
+/// globals: !4, imports: !5, macros: !6, dwoId: 0x0abcd,
+/// sysroot: "/", sdk: "MacOSX.sdk")
bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
if (!IsDistinct)
return Lex.Error("missing 'distinct', required for !DICompileUnit");
@@ -4656,7 +4761,9 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
OPTIONAL(splitDebugInlining, MDBoolField, = true); \
OPTIONAL(debugInfoForProfiling, MDBoolField, = false); \
OPTIONAL(nameTableKind, NameTableKindField, ); \
- OPTIONAL(debugBaseAddress, MDBoolField, = false);
+ OPTIONAL(rangesBaseAddress, MDBoolField, = false); \
+ OPTIONAL(sysroot, MDStringField, ); \
+ OPTIONAL(sdk, MDStringField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4665,7 +4772,7 @@ bool LLParser::ParseDICompileUnit(MDNode *&Result, bool IsDistinct) {
runtimeVersion.Val, splitDebugFilename.Val, emissionKind.Val, enums.Val,
retainedTypes.Val, globals.Val, imports.Val, macros.Val, dwoId.Val,
splitDebugInlining.Val, debugInfoForProfiling.Val, nameTableKind.Val,
- debugBaseAddress.Val);
+ rangesBaseAddress.Val, sysroot.Val, sdk.Val);
return false;
}
@@ -4762,7 +4869,7 @@ bool LLParser::ParseDICommonBlock(MDNode *&Result, bool IsDistinct) {
OPTIONAL(declaration, MDField, ); \
OPTIONAL(name, MDStringField, ); \
OPTIONAL(file, MDField, ); \
- OPTIONAL(line, LineField, );
+ OPTIONAL(line, LineField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
@@ -4820,51 +4927,60 @@ bool LLParser::ParseDIMacroFile(MDNode *&Result, bool IsDistinct) {
}
/// ParseDIModule:
-/// ::= !DIModule(scope: !0, name: "SomeModule", configMacros: "-DNDEBUG",
-/// includePath: "/usr/include", sysroot: "/")
+/// ::= !DIModule(scope: !0, name: "SomeModule", configMacros:
+/// "-DNDEBUG", includePath: "/usr/include", apinotes: "module.apinotes",
+/// file: !1, line: 4)
bool LLParser::ParseDIModule(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
REQUIRED(scope, MDField, ); \
REQUIRED(name, MDStringField, ); \
OPTIONAL(configMacros, MDStringField, ); \
OPTIONAL(includePath, MDStringField, ); \
- OPTIONAL(sysroot, MDStringField, );
+ OPTIONAL(apinotes, MDStringField, ); \
+ OPTIONAL(file, MDField, ); \
+ OPTIONAL(line, LineField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DIModule, (Context, scope.Val, name.Val,
- configMacros.Val, includePath.Val, sysroot.Val));
+ Result = GET_OR_DISTINCT(DIModule, (Context, file.Val, scope.Val, name.Val,
+ configMacros.Val, includePath.Val,
+ apinotes.Val, line.Val));
return false;
}
/// ParseDITemplateTypeParameter:
-/// ::= !DITemplateTypeParameter(name: "Ty", type: !1)
+/// ::= !DITemplateTypeParameter(name: "Ty", type: !1, defaulted: false)
bool LLParser::ParseDITemplateTypeParameter(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
OPTIONAL(name, MDStringField, ); \
- REQUIRED(type, MDField, );
+ REQUIRED(type, MDField, ); \
+ OPTIONAL(defaulted, MDBoolField, );
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result =
- GET_OR_DISTINCT(DITemplateTypeParameter, (Context, name.Val, type.Val));
+ Result = GET_OR_DISTINCT(DITemplateTypeParameter,
+ (Context, name.Val, type.Val, defaulted.Val));
return false;
}
/// ParseDITemplateValueParameter:
/// ::= !DITemplateValueParameter(tag: DW_TAG_template_value_parameter,
-/// name: "V", type: !1, value: i32 7)
+/// name: "V", type: !1, defaulted: false,
+/// value: i32 7)
bool LLParser::ParseDITemplateValueParameter(MDNode *&Result, bool IsDistinct) {
#define VISIT_MD_FIELDS(OPTIONAL, REQUIRED) \
OPTIONAL(tag, DwarfTagField, (dwarf::DW_TAG_template_value_parameter)); \
OPTIONAL(name, MDStringField, ); \
OPTIONAL(type, MDField, ); \
+ OPTIONAL(defaulted, MDBoolField, ); \
REQUIRED(value, MDField, );
+
PARSE_MD_FIELDS();
#undef VISIT_MD_FIELDS
- Result = GET_OR_DISTINCT(DITemplateValueParameter,
- (Context, tag.Val, name.Val, type.Val, value.Val));
+ Result = GET_OR_DISTINCT(
+ DITemplateValueParameter,
+ (Context, tag.Val, name.Val, type.Val, defaulted.Val, value.Val));
return false;
}
@@ -5174,13 +5290,16 @@ bool LLParser::ConvertValIDToValue(Type *Ty, ValID &ID, Value *&V,
!ConstantFP::isValueValidForType(Ty, ID.APFloatVal))
return Error(ID.Loc, "floating point constant invalid for type");
- // The lexer has no type info, so builds all half, float, and double FP
- // constants as double. Fix this here. Long double does not need this.
+ // The lexer has no type info, so builds all half, bfloat, float, and double
+ // FP constants as double. Fix this here. Long double does not need this.
if (&ID.APFloatVal.getSemantics() == &APFloat::IEEEdouble()) {
bool Ignored;
if (Ty->isHalfTy())
ID.APFloatVal.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven,
&Ignored);
+ else if (Ty->isBFloatTy())
+ ID.APFloatVal.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven,
+ &Ignored);
else if (Ty->isFloatTy())
ID.APFloatVal.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
&Ignored);
@@ -5545,7 +5664,7 @@ bool LLParser::PerFunctionState::resolveForwardRefBlockAddresses() {
ValID ID;
if (FunctionNumber == -1) {
ID.Kind = ValID::t_GlobalName;
- ID.StrVal = F.getName();
+ ID.StrVal = std::string(F.getName());
} else {
ID.Kind = ValID::t_GlobalID;
ID.UIntVal = FunctionNumber;
@@ -6419,9 +6538,6 @@ bool LLParser::ParseCallBr(Instruction *&Inst, PerFunctionState &PFS) {
/*IsCall=*/true))
return true;
- if (isa<InlineAsm>(Callee) && !Ty->getReturnType()->isVoidTy())
- return Error(RetTypeLoc, "asm-goto outputs not supported");
-
// Set up the Attribute for the function.
SmallVector<Value *, 8> Args;
SmallVector<AttributeSet, 8> ArgAttrs;
@@ -6868,9 +6984,11 @@ bool LLParser::ParseCall(Instruction *&Inst, PerFunctionState &PFS,
CI->setTailCallKind(TCK);
CI->setCallingConv(CC);
if (FMF.any()) {
- if (!isa<FPMathOperator>(CI))
+ if (!isa<FPMathOperator>(CI)) {
+ CI->deleteValue();
return Error(CallLoc, "fast-math-flags specified for call without "
"floating-point scalar or vector return type");
+ }
CI->setFastMathFlags(FMF);
}
CI->setAttributes(PAL);
@@ -6937,7 +7055,12 @@ int LLParser::ParseAlloc(Instruction *&Inst, PerFunctionState &PFS) {
if (Size && !Size->getType()->isIntegerTy())
return Error(SizeLoc, "element count must have integer type");
- AllocaInst *AI = new AllocaInst(Ty, AddrSpace, Size, Alignment);
+ SmallPtrSet<Type *, 4> Visited;
+ if (!Alignment && !Ty->isSized(&Visited))
+ return Error(TyLoc, "Cannot allocate unsized type");
+ if (!Alignment)
+ Alignment = M->getDataLayout().getPrefTypeAlign(Ty);
+ AllocaInst *AI = new AllocaInst(Ty, AddrSpace, Size, *Alignment);
AI->setUsedWithInAlloca(IsInAlloca);
AI->setSwiftError(IsSwiftError);
Inst = AI;
@@ -6987,8 +7110,12 @@ int LLParser::ParseLoad(Instruction *&Inst, PerFunctionState &PFS) {
if (Ty != cast<PointerType>(Val->getType())->getElementType())
return Error(ExplicitTypeLoc,
"explicit pointee type doesn't match operand's pointee type");
-
- Inst = new LoadInst(Ty, Val, "", isVolatile, Alignment, Ordering, SSID);
+ SmallPtrSet<Type *, 4> Visited;
+ if (!Alignment && !Ty->isSized(&Visited))
+ return Error(ExplicitTypeLoc, "loading unsized types is not allowed");
+ if (!Alignment)
+ Alignment = M->getDataLayout().getABITypeAlign(Ty);
+ Inst = new LoadInst(Ty, Val, "", isVolatile, *Alignment, Ordering, SSID);
return AteExtraComma ? InstExtraComma : InstNormal;
}
@@ -7034,8 +7161,13 @@ int LLParser::ParseStore(Instruction *&Inst, PerFunctionState &PFS) {
if (Ordering == AtomicOrdering::Acquire ||
Ordering == AtomicOrdering::AcquireRelease)
return Error(Loc, "atomic store cannot use Acquire ordering");
+ SmallPtrSet<Type *, 4> Visited;
+ if (!Alignment && !Val->getType()->isSized(&Visited))
+ return Error(Loc, "storing unsized types is not allowed");
+ if (!Alignment)
+ Alignment = M->getDataLayout().getABITypeAlign(Val->getType());
- Inst = new StoreInst(Val, Ptr, isVolatile, Alignment, Ordering, SSID);
+ Inst = new StoreInst(Val, Ptr, isVolatile, *Alignment, Ordering, SSID);
return AteExtraComma ? InstExtraComma : InstNormal;
}
@@ -7084,8 +7216,13 @@ int LLParser::ParseCmpXchg(Instruction *&Inst, PerFunctionState &PFS) {
return Error(NewLoc, "new value and pointer type do not match");
if (!New->getType()->isFirstClassType())
return Error(NewLoc, "cmpxchg operand must be a first class value");
+
+ Align Alignment(
+ PFS.getFunction().getParent()->getDataLayout().getTypeStoreSize(
+ Cmp->getType()));
+
AtomicCmpXchgInst *CXI = new AtomicCmpXchgInst(
- Ptr, Cmp, New, SuccessOrdering, FailureOrdering, SSID);
+ Ptr, Cmp, New, Alignment, SuccessOrdering, FailureOrdering, SSID);
CXI->setVolatile(isVolatile);
CXI->setWeak(isWeak);
Inst = CXI;
@@ -7169,9 +7306,11 @@ int LLParser::ParseAtomicRMW(Instruction *&Inst, PerFunctionState &PFS) {
if (Size < 8 || (Size & (Size - 1)))
return Error(ValLoc, "atomicrmw operand must be power-of-two byte-sized"
" integer");
-
+ Align Alignment(
+ PFS.getFunction().getParent()->getDataLayout().getTypeStoreSize(
+ Val->getType()));
AtomicRMWInst *RMWI =
- new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
+ new AtomicRMWInst(Operation, Ptr, Val, Alignment, Ordering, SSID);
RMWI->setVolatile(isVolatile);
Inst = RMWI;
return AteExtraComma ? InstExtraComma : InstNormal;
@@ -7223,8 +7362,9 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
bool AteExtraComma = false;
// GEP returns a vector of pointers if at least one of parameters is a vector.
// All vector parameters should have the same vector width.
- unsigned GEPWidth = BaseType->isVectorTy() ?
- BaseType->getVectorNumElements() : 0;
+ ElementCount GEPWidth = BaseType->isVectorTy()
+ ? cast<VectorType>(BaseType)->getElementCount()
+ : ElementCount(0, false);
while (EatIfPresent(lltok::comma)) {
if (Lex.getKind() == lltok::MetadataVar) {
@@ -7235,9 +7375,9 @@ int LLParser::ParseGetElementPtr(Instruction *&Inst, PerFunctionState &PFS) {
if (!Val->getType()->isIntOrIntVectorTy())
return Error(EltLoc, "getelementptr index must be an integer");
- if (Val->getType()->isVectorTy()) {
- unsigned ValNumEl = Val->getType()->getVectorNumElements();
- if (GEPWidth && GEPWidth != ValNumEl)
+ if (auto *ValVTy = dyn_cast<VectorType>(Val->getType())) {
+ ElementCount ValNumEl = ValVTy->getElementCount();
+ if (GEPWidth != ElementCount(0, false) && GEPWidth != ValNumEl)
return Error(EltLoc,
"getelementptr vector index has a wrong number of elements");
GEPWidth = ValNumEl;
@@ -7659,6 +7799,9 @@ bool LLParser::ParseTypeTestResolution(TypeTestResolution &TTRes) {
return true;
switch (Lex.getKind()) {
+ case lltok::kw_unknown:
+ TTRes.TheKind = TypeTestResolution::Unknown;
+ break;
case lltok::kw_unsat:
TTRes.TheKind = TypeTestResolution::Unsat;
break;
@@ -7991,6 +8134,36 @@ void LLParser::AddGlobalValueToIndex(
}
}
+/// ParseSummaryIndexFlags
+/// ::= 'flags' ':' UInt64
+bool LLParser::ParseSummaryIndexFlags() {
+ assert(Lex.getKind() == lltok::kw_flags);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+ uint64_t Flags;
+ if (ParseUInt64(Flags))
+ return true;
+ Index->setFlags(Flags);
+ return false;
+}
+
+/// ParseBlockCount
+/// ::= 'blockcount' ':' UInt64
+bool LLParser::ParseBlockCount() {
+ assert(Lex.getKind() == lltok::kw_blockcount);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+ uint64_t BlockCount;
+ if (ParseUInt64(BlockCount))
+ return true;
+ Index->setBlockCount(BlockCount);
+ return false;
+}
+
/// ParseGVEntry
/// ::= 'gv' ':' '(' ('name' ':' STRINGCONSTANT | 'guid' ':' UInt64)
/// [',' 'summaries' ':' Summary[',' Summary]* ]? ')'
@@ -8039,12 +8212,10 @@ bool LLParser::ParseGVEntry(unsigned ID) {
// Have a list of summaries
if (ParseToken(lltok::kw_summaries, "expected 'summaries' here") ||
- ParseToken(lltok::colon, "expected ':' here"))
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
return true;
-
do {
- if (ParseToken(lltok::lparen, "expected '(' here"))
- return true;
switch (Lex.getKind()) {
case lltok::kw_function:
if (ParseFunctionSummary(Name, GUID, ID))
@@ -8061,11 +8232,10 @@ bool LLParser::ParseGVEntry(unsigned ID) {
default:
return Error(Lex.getLoc(), "expected summary type");
}
- if (ParseToken(lltok::rparen, "expected ')' here"))
- return true;
} while (EatIfPresent(lltok::comma));
- if (ParseToken(lltok::rparen, "expected ')' here"))
+ if (ParseToken(lltok::rparen, "expected ')' here") ||
+ ParseToken(lltok::rparen, "expected ')' here"))
return true;
return false;
@@ -8074,7 +8244,8 @@ bool LLParser::ParseGVEntry(unsigned ID) {
/// FunctionSummary
/// ::= 'function' ':' '(' 'module' ':' ModuleReference ',' GVFlags
/// ',' 'insts' ':' UInt32 [',' OptionalFFlags]? [',' OptionalCalls]?
-/// [',' OptionalTypeIdInfo]? [',' OptionalRefs]? ')'
+/// [',' OptionalTypeIdInfo]? [',' OptionalParamAccesses]?
+/// [',' OptionalRefs]? ')'
bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
unsigned ID) {
assert(Lex.getKind() == lltok::kw_function);
@@ -8087,6 +8258,7 @@ bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
unsigned InstCount;
std::vector<FunctionSummary::EdgeTy> Calls;
FunctionSummary::TypeIdInfo TypeIdInfo;
+ std::vector<FunctionSummary::ParamAccess> ParamAccesses;
std::vector<ValueInfo> Refs;
// Default is all-zeros (conservative values).
FunctionSummary::FFlags FFlags = {};
@@ -8118,6 +8290,10 @@ bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
if (ParseOptionalRefs(Refs))
return true;
break;
+ case lltok::kw_params:
+ if (ParseOptionalParamAccesses(ParamAccesses))
+ return true;
+ break;
default:
return Error(Lex.getLoc(), "expected optional function summary field");
}
@@ -8132,7 +8308,8 @@ bool LLParser::ParseFunctionSummary(std::string Name, GlobalValue::GUID GUID,
std::move(TypeIdInfo.TypeTestAssumeVCalls),
std::move(TypeIdInfo.TypeCheckedLoadVCalls),
std::move(TypeIdInfo.TypeTestAssumeConstVCalls),
- std::move(TypeIdInfo.TypeCheckedLoadConstVCalls));
+ std::move(TypeIdInfo.TypeCheckedLoadConstVCalls),
+ std::move(ParamAccesses));
FS->setModulePath(ModulePath);
@@ -8155,7 +8332,9 @@ bool LLParser::ParseVariableSummary(std::string Name, GlobalValue::GUID GUID,
/*Linkage=*/GlobalValue::ExternalLinkage, /*NotEligibleToImport=*/false,
/*Live=*/false, /*IsLocal=*/false, /*CanAutoHide=*/false);
GlobalVarSummary::GVarFlags GVarFlags(/*ReadOnly*/ false,
- /* WriteOnly */ false);
+ /* WriteOnly */ false,
+ /* Constant */ false,
+ GlobalObject::VCallVisibilityPublic);
std::vector<ValueInfo> Refs;
VTableFuncList VTableFuncs;
if (ParseToken(lltok::colon, "expected ':' here") ||
@@ -8479,13 +8658,133 @@ bool LLParser::ParseOptionalVTableFuncs(VTableFuncList &VTableFuncs) {
return false;
}
+/// ParamNo := 'param' ':' UInt64
+bool LLParser::ParseParamNo(uint64_t &ParamNo) {
+ if (ParseToken(lltok::kw_param, "expected 'param' here") ||
+ ParseToken(lltok::colon, "expected ':' here") || ParseUInt64(ParamNo))
+ return true;
+ return false;
+}
+
+/// ParamAccessOffset := 'offset' ':' '[' APSINTVAL ',' APSINTVAL ']'
+bool LLParser::ParseParamAccessOffset(ConstantRange &Range) {
+ APSInt Lower;
+ APSInt Upper;
+ auto ParseAPSInt = [&](APSInt &Val) {
+ if (Lex.getKind() != lltok::APSInt)
+ return TokError("expected integer");
+ Val = Lex.getAPSIntVal();
+ Val = Val.extOrTrunc(FunctionSummary::ParamAccess::RangeWidth);
+ Val.setIsSigned(true);
+ Lex.Lex();
+ return false;
+ };
+ if (ParseToken(lltok::kw_offset, "expected 'offset' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lsquare, "expected '[' here") || ParseAPSInt(Lower) ||
+ ParseToken(lltok::comma, "expected ',' here") || ParseAPSInt(Upper) ||
+ ParseToken(lltok::rsquare, "expected ']' here"))
+ return true;
+
+ ++Upper;
+ Range =
+ (Lower == Upper && !Lower.isMaxValue())
+ ? ConstantRange::getEmpty(FunctionSummary::ParamAccess::RangeWidth)
+ : ConstantRange(Lower, Upper);
+
+ return false;
+}
+
+/// ParamAccessCall
+/// := '(' 'callee' ':' GVReference ',' ParamNo ',' ParamAccessOffset ')'
+bool LLParser::ParseParamAccessCall(FunctionSummary::ParamAccess::Call &Call) {
+ if (ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseToken(lltok::kw_callee, "expected 'callee' here") ||
+ ParseToken(lltok::colon, "expected ':' here"))
+ return true;
+
+ unsigned GVId;
+ ValueInfo VI;
+ if (ParseGVReference(VI, GVId))
+ return true;
+
+ Call.Callee = VI.getGUID();
+
+ if (ParseToken(lltok::comma, "expected ',' here") ||
+ ParseParamNo(Call.ParamNo) ||
+ ParseToken(lltok::comma, "expected ',' here") ||
+ ParseParamAccessOffset(Call.Offsets))
+ return true;
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+/// ParamAccess
+/// := '(' ParamNo ',' ParamAccessOffset [',' OptionalParamAccessCalls]? ')'
+/// OptionalParamAccessCalls := '(' Call [',' Call]* ')'
+bool LLParser::ParseParamAccess(FunctionSummary::ParamAccess &Param) {
+ if (ParseToken(lltok::lparen, "expected '(' here") ||
+ ParseParamNo(Param.ParamNo) ||
+ ParseToken(lltok::comma, "expected ',' here") ||
+ ParseParamAccessOffset(Param.Use))
+ return true;
+
+ if (EatIfPresent(lltok::comma)) {
+ if (ParseToken(lltok::kw_calls, "expected 'calls' here") ||
+ ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+ do {
+ FunctionSummary::ParamAccess::Call Call;
+ if (ParseParamAccessCall(Call))
+ return true;
+ Param.Calls.push_back(Call);
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+ }
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
+/// OptionalParamAccesses
+/// := 'params' ':' '(' ParamAccess [',' ParamAccess]* ')'
+bool LLParser::ParseOptionalParamAccesses(
+ std::vector<FunctionSummary::ParamAccess> &Params) {
+ assert(Lex.getKind() == lltok::kw_params);
+ Lex.Lex();
+
+ if (ParseToken(lltok::colon, "expected ':' here") ||
+ ParseToken(lltok::lparen, "expected '(' here"))
+ return true;
+
+ do {
+ FunctionSummary::ParamAccess ParamAccess;
+ if (ParseParamAccess(ParamAccess))
+ return true;
+ Params.push_back(ParamAccess);
+ } while (EatIfPresent(lltok::comma));
+
+ if (ParseToken(lltok::rparen, "expected ')' here"))
+ return true;
+
+ return false;
+}
+
/// OptionalRefs
/// := 'refs' ':' '(' GVReference [',' GVReference]* ')'
bool LLParser::ParseOptionalRefs(std::vector<ValueInfo> &Refs) {
assert(Lex.getKind() == lltok::kw_refs);
Lex.Lex();
- if (ParseToken(lltok::colon, "expected ':' in refs") |
+ if (ParseToken(lltok::colon, "expected ':' in refs") ||
ParseToken(lltok::lparen, "expected '(' in refs"))
return true;
@@ -8827,7 +9126,8 @@ bool LLParser::ParseGVFlags(GlobalValueSummary::GVFlags &GVFlags) {
/// GVarFlags
/// ::= 'varFlags' ':' '(' 'readonly' ':' Flag
-/// ',' 'writeonly' ':' Flag ')'
+/// ',' 'writeonly' ':' Flag
+/// ',' 'constant' ':' Flag ')'
bool LLParser::ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags) {
assert(Lex.getKind() == lltok::kw_varFlags);
Lex.Lex();
@@ -8856,6 +9156,16 @@ bool LLParser::ParseGVarFlags(GlobalVarSummary::GVarFlags &GVarFlags) {
return true;
GVarFlags.MaybeWriteOnly = Flag;
break;
+ case lltok::kw_constant:
+ if (ParseRest(Flag))
+ return true;
+ GVarFlags.Constant = Flag;
+ break;
+ case lltok::kw_vcall_visibility:
+ if (ParseRest(Flag))
+ return true;
+ GVarFlags.VCallVisibility = Flag;
+ break;
default:
return Error(Lex.getLoc(), "expected gvar flag type");
}
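Taken together, the grammar comments above add a 'params' field to function summaries and extend 'varFlags' for variable summaries. The fragments below are a hypothetical illustration assembled from those productions rather than copied from a test in this patch; the surrounding module/flags fields are elided with '...'. Note that, as ParseParamAccess actually requires, the nested call list is introduced by the 'calls:' keyword even though the short grammar comment for OptionalParamAccessCalls leaves it out.

  ; function summary fragment: two parameter accesses; the second one also
  ; flows into argument 0 of the callee referenced as ^1
  ... insts: 2, params: ((param: 0, offset: [0, 3]),
                         (param: 1, offset: [-8, 7],
                          calls: ((callee: ^1, param: 0, offset: [0, 3])))) ...

  ; variable summary fragment with the extended varFlags
  ... varFlags: (readonly: 0, writeonly: 0, constant: 1, vcall_visibility: 0) ...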
diff --git a/llvm/lib/AsmParser/LLParser.h b/llvm/lib/AsmParser/LLParser.h
index cf2121dcc70a..ebd8655dc35e 100644
--- a/llvm/lib/AsmParser/LLParser.h
+++ b/llvm/lib/AsmParser/LLParser.h
@@ -16,18 +16,16 @@
#include "LLLexer.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/AsmParser/Parser.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Type.h"
-#include "llvm/IR/ValueHandle.h"
#include <map>
namespace llvm {
class Module;
- class OpaqueType;
class Function;
class Value;
class BasicBlock;
@@ -38,7 +36,6 @@ namespace llvm {
class MDString;
class MDNode;
struct SlotMapping;
- class StructType;
/// ValID - Represents a reference of a definition of some sort with no type.
/// There are several cases where we have to parse the value but where the
@@ -160,23 +157,17 @@ namespace llvm {
/// UpgradeDebuginfo so it can generate broken bitcode.
bool UpgradeDebugInfo;
- /// DataLayout string to override that in LLVM assembly.
- StringRef DataLayoutStr;
-
std::string SourceFileName;
public:
LLParser(StringRef F, SourceMgr &SM, SMDiagnostic &Err, Module *M,
ModuleSummaryIndex *Index, LLVMContext &Context,
- SlotMapping *Slots = nullptr, bool UpgradeDebugInfo = true,
- StringRef DataLayoutString = "")
+ SlotMapping *Slots = nullptr)
: Context(Context), Lex(F, SM, Err, Context), M(M), Index(Index),
- Slots(Slots), BlockAddressPFS(nullptr),
- UpgradeDebugInfo(UpgradeDebugInfo), DataLayoutStr(DataLayoutString) {
- if (!DataLayoutStr.empty())
- M->setDataLayout(DataLayoutStr);
- }
- bool Run();
+ Slots(Slots), BlockAddressPFS(nullptr) {}
+ bool Run(
+ bool UpgradeDebugInfo,
+ DataLayoutCallbackTy DataLayoutCallback = [](Module *) {});
bool parseStandaloneConstantValue(Constant *&C, const SlotMapping *Slots);
@@ -281,7 +272,8 @@ namespace llvm {
void ParseOptionalVisibility(unsigned &Res);
void ParseOptionalDLLStorageClass(unsigned &Res);
bool ParseOptionalCallingConv(unsigned &CC);
- bool ParseOptionalAlignment(MaybeAlign &Alignment);
+ bool ParseOptionalAlignment(MaybeAlign &Alignment,
+ bool AllowParens = false);
bool ParseOptionalDerefAttrBytes(lltok::Kind AttrKind, uint64_t &Bytes);
bool ParseScopeAndOrdering(bool isAtomic, SyncScope::ID &SSID,
AtomicOrdering &Ordering);
@@ -306,8 +298,9 @@ namespace llvm {
// Top-Level Entities
bool ParseTopLevelEntities();
- bool ValidateEndOfModule();
+ bool ValidateEndOfModule(bool UpgradeDebugInfo);
bool ValidateEndOfIndex();
+ bool ParseTargetDefinitions();
bool ParseTargetDefinition();
bool ParseModuleAsm();
bool ParseSourceFileName();
@@ -340,6 +333,7 @@ namespace llvm {
std::vector<unsigned> &FwdRefAttrGrps,
bool inAttrGrp, LocTy &BuiltinLoc);
bool ParseByValWithOptionalType(Type *&Result);
+ bool ParsePreallocated(Type *&Result);
// Module Summary Index Parsing.
bool SkipModuleSummaryEntry();
@@ -347,6 +341,8 @@ namespace llvm {
bool ParseModuleEntry(unsigned ID);
bool ParseModuleReference(StringRef &ModulePath);
bool ParseGVReference(ValueInfo &VI, unsigned &GVId);
+ bool ParseSummaryIndexFlags();
+ bool ParseBlockCount();
bool ParseGVEntry(unsigned ID);
bool ParseFunctionSummary(std::string Name, GlobalValue::GUID, unsigned ID);
bool ParseVariableSummary(std::string Name, GlobalValue::GUID, unsigned ID);
@@ -370,6 +366,12 @@ namespace llvm {
bool ParseVFuncId(FunctionSummary::VFuncId &VFuncId,
IdToIndexMapType &IdToIndexMap, unsigned Index);
bool ParseOptionalVTableFuncs(VTableFuncList &VTableFuncs);
+ bool ParseOptionalParamAccesses(
+ std::vector<FunctionSummary::ParamAccess> &Params);
+ bool ParseParamNo(uint64_t &ParamNo);
+ bool ParseParamAccess(FunctionSummary::ParamAccess &Param);
+ bool ParseParamAccessCall(FunctionSummary::ParamAccess::Call &Call);
+ bool ParseParamAccessOffset(ConstantRange &range);
bool ParseOptionalRefs(std::vector<ValueInfo> &Refs);
bool ParseTypeIdEntry(unsigned ID);
bool ParseTypeIdSummary(TypeIdSummary &TIS);
diff --git a/llvm/lib/AsmParser/LLToken.h b/llvm/lib/AsmParser/LLToken.h
index e430e0f6faa0..0fb3bae77dd3 100644
--- a/llvm/lib/AsmParser/LLToken.h
+++ b/llvm/lib/AsmParser/LLToken.h
@@ -196,6 +196,7 @@ enum Kind {
kw_naked,
kw_nest,
kw_noalias,
+ kw_noundef,
kw_nobuiltin,
kw_nocapture,
kw_noduplicate,
@@ -204,15 +205,18 @@ enum Kind {
kw_noinline,
kw_norecurse,
kw_nonlazybind,
+ kw_nomerge,
kw_nonnull,
kw_noredzone,
kw_noreturn,
kw_nosync,
kw_nocf_check,
kw_nounwind,
+ kw_null_pointer_is_valid,
kw_optforfuzzing,
kw_optnone,
kw_optsize,
+ kw_preallocated,
kw_readnone,
kw_readonly,
kw_returned,
@@ -371,6 +375,7 @@ enum Kind {
kw_name,
kw_summaries,
kw_flags,
+ kw_blockcount,
kw_linkage,
kw_notEligibleToImport,
kw_live,
@@ -387,6 +392,8 @@ enum Kind {
kw_alwaysInline,
kw_calls,
kw_callee,
+ kw_params,
+ kw_param,
kw_hotness,
kw_unknown,
kw_hot,
@@ -421,6 +428,7 @@ enum Kind {
kw_sizeM1,
kw_bitMask,
kw_inlineBits,
+ kw_vcall_visibility,
kw_wpdResolutions,
kw_wpdRes,
kw_indir,
diff --git a/llvm/lib/AsmParser/Parser.cpp b/llvm/lib/AsmParser/Parser.cpp
index b7f552a6fccb..8147620181f9 100644
--- a/llvm/lib/AsmParser/Parser.cpp
+++ b/llvm/lib/AsmParser/Parser.cpp
@@ -17,44 +17,50 @@
#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/SourceMgr.h"
-#include "llvm/Support/raw_ostream.h"
#include <cstring>
#include <system_error>
+
using namespace llvm;
-bool llvm::parseAssemblyInto(MemoryBufferRef F, Module *M,
- ModuleSummaryIndex *Index, SMDiagnostic &Err,
- SlotMapping *Slots, bool UpgradeDebugInfo,
- StringRef DataLayoutString) {
+static bool parseAssemblyInto(MemoryBufferRef F, Module *M,
+ ModuleSummaryIndex *Index, SMDiagnostic &Err,
+ SlotMapping *Slots, bool UpgradeDebugInfo,
+ DataLayoutCallbackTy DataLayoutCallback) {
SourceMgr SM;
std::unique_ptr<MemoryBuffer> Buf = MemoryBuffer::getMemBuffer(F);
SM.AddNewSourceBuffer(std::move(Buf), SMLoc());
LLVMContext Context;
return LLParser(F.getBuffer(), SM, Err, M, Index,
- M ? M->getContext() : Context, Slots, UpgradeDebugInfo,
- DataLayoutString)
- .Run();
+ M ? M->getContext() : Context, Slots)
+ .Run(UpgradeDebugInfo, DataLayoutCallback);
+}
+
+bool llvm::parseAssemblyInto(MemoryBufferRef F, Module *M,
+ ModuleSummaryIndex *Index, SMDiagnostic &Err,
+ SlotMapping *Slots,
+ DataLayoutCallbackTy DataLayoutCallback) {
+ return ::parseAssemblyInto(F, M, Index, Err, Slots,
+ /*UpgradeDebugInfo*/ true, DataLayoutCallback);
}
std::unique_ptr<Module>
llvm::parseAssembly(MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
- SlotMapping *Slots, bool UpgradeDebugInfo,
- StringRef DataLayoutString) {
+ SlotMapping *Slots,
+ DataLayoutCallbackTy DataLayoutCallback) {
std::unique_ptr<Module> M =
std::make_unique<Module>(F.getBufferIdentifier(), Context);
- if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, UpgradeDebugInfo,
- DataLayoutString))
+ if (parseAssemblyInto(F, M.get(), nullptr, Err, Slots, DataLayoutCallback))
return nullptr;
return M;
}
-std::unique_ptr<Module>
-llvm::parseAssemblyFile(StringRef Filename, SMDiagnostic &Err,
- LLVMContext &Context, SlotMapping *Slots,
- bool UpgradeDebugInfo, StringRef DataLayoutString) {
+std::unique_ptr<Module> llvm::parseAssemblyFile(StringRef Filename,
+ SMDiagnostic &Err,
+ LLVMContext &Context,
+ SlotMapping *Slots) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = FileOrErr.getError()) {
@@ -63,28 +69,40 @@ llvm::parseAssemblyFile(StringRef Filename, SMDiagnostic &Err,
return nullptr;
}
- return parseAssembly(FileOrErr.get()->getMemBufferRef(), Err, Context, Slots,
- UpgradeDebugInfo, DataLayoutString);
+ return parseAssembly(FileOrErr.get()->getMemBufferRef(), Err, Context, Slots);
}
-ParsedModuleAndIndex llvm::parseAssemblyWithIndex(
- MemoryBufferRef F, SMDiagnostic &Err, LLVMContext &Context,
- SlotMapping *Slots, bool UpgradeDebugInfo, StringRef DataLayoutString) {
+static ParsedModuleAndIndex
+parseAssemblyWithIndex(MemoryBufferRef F, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots,
+ bool UpgradeDebugInfo,
+ DataLayoutCallbackTy DataLayoutCallback) {
std::unique_ptr<Module> M =
std::make_unique<Module>(F.getBufferIdentifier(), Context);
std::unique_ptr<ModuleSummaryIndex> Index =
std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/true);
if (parseAssemblyInto(F, M.get(), Index.get(), Err, Slots, UpgradeDebugInfo,
- DataLayoutString))
+ DataLayoutCallback))
return {nullptr, nullptr};
return {std::move(M), std::move(Index)};
}
-ParsedModuleAndIndex llvm::parseAssemblyFileWithIndex(
- StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
- SlotMapping *Slots, bool UpgradeDebugInfo, StringRef DataLayoutString) {
+ParsedModuleAndIndex llvm::parseAssemblyWithIndex(MemoryBufferRef F,
+ SMDiagnostic &Err,
+ LLVMContext &Context,
+ SlotMapping *Slots) {
+ return ::parseAssemblyWithIndex(F, Err, Context, Slots,
+ /*UpgradeDebugInfo*/ true,
+ [](StringRef) { return None; });
+}
+
+static ParsedModuleAndIndex
+parseAssemblyFileWithIndex(StringRef Filename, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots,
+ bool UpgradeDebugInfo,
+ DataLayoutCallbackTy DataLayoutCallback) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = FileOrErr.getError()) {
@@ -95,16 +113,32 @@ ParsedModuleAndIndex llvm::parseAssemblyFileWithIndex(
return parseAssemblyWithIndex(FileOrErr.get()->getMemBufferRef(), Err,
Context, Slots, UpgradeDebugInfo,
- DataLayoutString);
+ DataLayoutCallback);
}
-std::unique_ptr<Module>
-llvm::parseAssemblyString(StringRef AsmString, SMDiagnostic &Err,
- LLVMContext &Context, SlotMapping *Slots,
- bool UpgradeDebugInfo, StringRef DataLayoutString) {
+ParsedModuleAndIndex
+llvm::parseAssemblyFileWithIndex(StringRef Filename, SMDiagnostic &Err,
+ LLVMContext &Context, SlotMapping *Slots,
+ DataLayoutCallbackTy DataLayoutCallback) {
+ return ::parseAssemblyFileWithIndex(Filename, Err, Context, Slots,
+ /*UpgradeDebugInfo*/ true,
+ DataLayoutCallback);
+}
+
+ParsedModuleAndIndex llvm::parseAssemblyFileWithIndexNoUpgradeDebugInfo(
+ StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
+ SlotMapping *Slots, DataLayoutCallbackTy DataLayoutCallback) {
+ return ::parseAssemblyFileWithIndex(Filename, Err, Context, Slots,
+ /*UpgradeDebugInfo*/ false,
+ DataLayoutCallback);
+}
+
+std::unique_ptr<Module> llvm::parseAssemblyString(StringRef AsmString,
+ SMDiagnostic &Err,
+ LLVMContext &Context,
+ SlotMapping *Slots) {
MemoryBufferRef F(AsmString, "<string>");
- return parseAssembly(F, Err, Context, Slots, UpgradeDebugInfo,
- DataLayoutString);
+ return parseAssembly(F, Err, Context, Slots);
}
static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F,
@@ -117,7 +151,8 @@ static bool parseSummaryIndexAssemblyInto(MemoryBufferRef F,
// The parser holds a reference to a context that is unused when parsing the
// index, but we need to initialize it.
LLVMContext unusedContext;
- return LLParser(F.getBuffer(), SM, Err, nullptr, &Index, unusedContext).Run();
+ return LLParser(F.getBuffer(), SM, Err, nullptr, &Index, unusedContext)
+ .Run(true, [](StringRef) { return None; });
}
std::unique_ptr<ModuleSummaryIndex>
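The rewritten entry points above replace the old DataLayoutString parameter with a DataLayoutCallbackTy callback. A minimal sketch of how a caller might use it, assuming (as the defaulted lambdas in this patch suggest) that the callback receives the module's target triple and returns an optional datalayout string to override whatever the assembly specifies:

  #include "llvm/ADT/Optional.h"
  #include "llvm/AsmParser/Parser.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  #include "llvm/Support/MemoryBuffer.h"
  #include "llvm/Support/SourceMgr.h"

  static std::unique_ptr<llvm::Module>
  parseWithLayoutOverride(llvm::StringRef Asm, llvm::LLVMContext &Ctx) {
    llvm::SMDiagnostic Err;
    llvm::MemoryBufferRef Buf(Asm, "<string>");
    // Override the datalayout for x86_64 triples only (illustrative layout
    // string); return None to keep whatever the parsed module already has.
    auto Layout = [](llvm::StringRef Triple) -> llvm::Optional<std::string> {
      if (Triple.startswith("x86_64-"))
        return std::string("e-m:e-i64:64-f80:128-n8:16:32:64-S128");
      return llvm::None;
    };
    return llvm::parseAssembly(Buf, Err, Ctx, /*Slots=*/nullptr, Layout);
  }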
diff --git a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
index d927171d556c..cd1d872cc219 100644
--- a/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
+++ b/llvm/lib/BinaryFormat/AMDGPUMetadataVerifier.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/AMDGPUMetadataVerifier.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/AMDGPUMetadata.h"
namespace llvm {
@@ -126,25 +127,6 @@ bool MetadataVerifier::verifyKernelArgs(msgpack::DocNode &Node) {
.Default(false);
}))
return false;
- if (!verifyScalarEntry(ArgsMap, ".value_type", true,
- msgpack::Type::String,
- [](msgpack::DocNode &SNode) {
- return StringSwitch<bool>(SNode.getString())
- .Case("struct", true)
- .Case("i8", true)
- .Case("u8", true)
- .Case("i16", true)
- .Case("u16", true)
- .Case("f16", true)
- .Case("i32", true)
- .Case("u32", true)
- .Case("f32", true)
- .Case("i64", true)
- .Case("u64", true)
- .Case("f64", true)
- .Default(false);
- }))
- return false;
if (!verifyIntegerEntry(ArgsMap, ".pointee_align", false))
return false;
if (!verifyScalarEntry(ArgsMap, ".address_space", false,
diff --git a/llvm/lib/BinaryFormat/Dwarf.cpp b/llvm/lib/BinaryFormat/Dwarf.cpp
index 9ca3317418ce..a497c16685c1 100644
--- a/llvm/lib/BinaryFormat/Dwarf.cpp
+++ b/llvm/lib/BinaryFormat/Dwarf.cpp
@@ -477,6 +477,23 @@ unsigned llvm::dwarf::getMacinfo(StringRef MacinfoString) {
.Default(DW_MACINFO_invalid);
}
+StringRef llvm::dwarf::MacroString(unsigned Encoding) {
+ switch (Encoding) {
+ default:
+ return StringRef();
+#define HANDLE_DW_MACRO(ID, NAME) \
+ case DW_MACRO_##NAME: \
+ return "DW_MACRO_" #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ }
+}
+
+unsigned llvm::dwarf::getMacro(StringRef MacroString) {
+ return StringSwitch<unsigned>(MacroString)
+#define HANDLE_DW_MACRO(ID, NAME) .Case("DW_MACRO_" #NAME, ID)
+#include "llvm/BinaryFormat/Dwarf.def"
+ .Default(DW_MACINFO_invalid);
+}
StringRef llvm::dwarf::RangeListEncodingString(unsigned Encoding) {
switch (Encoding) {
default:
@@ -753,7 +770,23 @@ bool llvm::dwarf::isValidFormForVersion(Form F, unsigned Version,
return ExtensionsOk;
}
+StringRef llvm::dwarf::FormatString(DwarfFormat Format) {
+ switch (Format) {
+ case DWARF32:
+ return "DWARF32";
+ case DWARF64:
+ return "DWARF64";
+ }
+ return StringRef();
+}
+
+StringRef llvm::dwarf::FormatString(bool IsDWARF64) {
+ return FormatString(IsDWARF64 ? DWARF64 : DWARF32);
+}
+
constexpr char llvm::dwarf::EnumTraits<Attribute>::Type[];
constexpr char llvm::dwarf::EnumTraits<Form>::Type[];
constexpr char llvm::dwarf::EnumTraits<Index>::Type[];
constexpr char llvm::dwarf::EnumTraits<Tag>::Type[];
+constexpr char llvm::dwarf::EnumTraits<LineNumberOps>::Type[];
+constexpr char llvm::dwarf::EnumTraits<LocationAtom>::Type[];
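A small, purely illustrative usage sketch of the helpers added above (MacroString/getMacro and the FormatString overloads):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/BinaryFormat/Dwarf.h"
  #include "llvm/Support/raw_ostream.h"

  void describeMacroEntry(unsigned Encoding, bool IsDWARF64) {
    // MacroString returns an empty StringRef for unknown encodings, and
    // getMacro round-trips a "DW_MACRO_*" name back to its value.
    llvm::StringRef Name = llvm::dwarf::MacroString(Encoding);
    unsigned Value = llvm::dwarf::getMacro(Name);
    llvm::outs() << Name << " (" << Value << ") in a "
                 << llvm::dwarf::FormatString(IsDWARF64) << " unit\n";
  }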
diff --git a/llvm/lib/BinaryFormat/MachO.cpp b/llvm/lib/BinaryFormat/MachO.cpp
new file mode 100644
index 000000000000..2b9eb8025521
--- /dev/null
+++ b/llvm/lib/BinaryFormat/MachO.cpp
@@ -0,0 +1,109 @@
+//===-- llvm/BinaryFormat/MachO.cpp - The MachO file format -----*- C++/-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/Support/ARMTargetParser.h"
+
+using namespace llvm;
+
+static MachO::CPUSubTypeX86 getX86SubType(const Triple &T) {
+ assert(T.isX86());
+ if (T.isArch32Bit())
+ return MachO::CPU_SUBTYPE_I386_ALL;
+
+ assert(T.isArch64Bit());
+ if (T.getArchName() == "x86_64h")
+ return MachO::CPU_SUBTYPE_X86_64_H;
+ return MachO::CPU_SUBTYPE_X86_64_ALL;
+}
+
+static MachO::CPUSubTypeARM getARMSubType(const Triple &T) {
+ assert(T.isARM() || T.isThumb());
+ StringRef Arch = T.getArchName();
+ ARM::ArchKind AK = ARM::parseArch(Arch);
+ switch (AK) {
+ default:
+ return MachO::CPU_SUBTYPE_ARM_V7;
+ case ARM::ArchKind::ARMV4T:
+ return MachO::CPU_SUBTYPE_ARM_V4T;
+ case ARM::ArchKind::ARMV5T:
+ case ARM::ArchKind::ARMV5TE:
+ case ARM::ArchKind::ARMV5TEJ:
+ return MachO::CPU_SUBTYPE_ARM_V5;
+ case ARM::ArchKind::ARMV6:
+ case ARM::ArchKind::ARMV6K:
+ return MachO::CPU_SUBTYPE_ARM_V6;
+ case ARM::ArchKind::ARMV7A:
+ return MachO::CPU_SUBTYPE_ARM_V7;
+ case ARM::ArchKind::ARMV7S:
+ return MachO::CPU_SUBTYPE_ARM_V7S;
+ case ARM::ArchKind::ARMV7K:
+ return MachO::CPU_SUBTYPE_ARM_V7K;
+ case ARM::ArchKind::ARMV6M:
+ return MachO::CPU_SUBTYPE_ARM_V6M;
+ case ARM::ArchKind::ARMV7M:
+ return MachO::CPU_SUBTYPE_ARM_V7M;
+ case ARM::ArchKind::ARMV7EM:
+ return MachO::CPU_SUBTYPE_ARM_V7EM;
+ }
+}
+
+static MachO::CPUSubTypeARM64 getARM64SubType(const Triple &T) {
+ assert(T.isAArch64() || T.getArch() == Triple::aarch64_32);
+ if (T.isArch32Bit())
+ return (MachO::CPUSubTypeARM64)MachO::CPU_SUBTYPE_ARM64_32_V8;
+ if (T.getArchName() == "arm64e")
+ return MachO::CPU_SUBTYPE_ARM64E;
+
+ return MachO::CPU_SUBTYPE_ARM64_ALL;
+}
+
+static MachO::CPUSubTypePowerPC getPowerPCSubType(const Triple &T) {
+ return MachO::CPU_SUBTYPE_POWERPC_ALL;
+}
+
+static Error unsupported(const char *Str, const Triple &T) {
+ return createStringError(std::errc::invalid_argument,
+ "Unsupported triple for mach-o cpu %s: %s", Str,
+ T.str().c_str());
+}
+
+Expected<uint32_t> MachO::getCPUType(const Triple &T) {
+ if (!T.isOSBinFormatMachO())
+ return unsupported("type", T);
+ if (T.isX86() && T.isArch32Bit())
+ return MachO::CPU_TYPE_X86;
+ if (T.isX86() && T.isArch64Bit())
+ return MachO::CPU_TYPE_X86_64;
+ if (T.isARM() || T.isThumb())
+ return MachO::CPU_TYPE_ARM;
+ if (T.isAArch64())
+ return MachO::CPU_TYPE_ARM64;
+ if (T.getArch() == Triple::aarch64_32)
+ return MachO::CPU_TYPE_ARM64_32;
+ if (T.getArch() == Triple::ppc)
+ return MachO::CPU_TYPE_POWERPC;
+ if (T.getArch() == Triple::ppc64)
+ return MachO::CPU_TYPE_POWERPC64;
+ return unsupported("type", T);
+}
+
+Expected<uint32_t> MachO::getCPUSubType(const Triple &T) {
+ if (!T.isOSBinFormatMachO())
+ return unsupported("subtype", T);
+ if (T.isX86())
+ return getX86SubType(T);
+ if (T.isARM() || T.isThumb())
+ return getARMSubType(T);
+ if (T.isAArch64() || T.getArch() == Triple::aarch64_32)
+ return getARM64SubType(T);
+ if (T.getArch() == Triple::ppc || T.getArch() == Triple::ppc64)
+ return getPowerPCSubType(T);
+ return unsupported("subtype", T);
+}
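The new file above exposes MachO::getCPUType and MachO::getCPUSubType. A short usage sketch, with the signatures taken directly from this patch:

  #include "llvm/ADT/Triple.h"
  #include "llvm/BinaryFormat/MachO.h"
  #include "llvm/Support/Error.h"
  #include <utility>

  // Map a target triple to its Mach-O (cputype, cpusubtype) pair, forwarding
  // the error for triples the helpers reject (e.g. non-Mach-O formats).
  static llvm::Expected<std::pair<uint32_t, uint32_t>>
  getMachOCPUPair(const llvm::Triple &T) {
    llvm::Expected<uint32_t> Type = llvm::MachO::getCPUType(T);
    if (!Type)
      return Type.takeError();
    llvm::Expected<uint32_t> SubType = llvm::MachO::getCPUSubType(T);
    if (!SubType)
      return SubType.takeError();
    return std::make_pair(*Type, *SubType);
  }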
diff --git a/llvm/lib/BinaryFormat/Magic.cpp b/llvm/lib/BinaryFormat/Magic.cpp
index bbcbbabededb..61b1504e59b0 100644
--- a/llvm/lib/BinaryFormat/Magic.cpp
+++ b/llvm/lib/BinaryFormat/Magic.cpp
@@ -7,7 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/Magic.h"
-
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
diff --git a/llvm/lib/BinaryFormat/MsgPackDocument.cpp b/llvm/lib/BinaryFormat/MsgPackDocument.cpp
index e12c54a37ad0..53720c542e14 100644
--- a/llvm/lib/BinaryFormat/MsgPackDocument.cpp
+++ b/llvm/lib/BinaryFormat/MsgPackDocument.cpp
@@ -40,46 +40,100 @@ DocNode &MapDocNode::operator[](StringRef S) {
/// Member access for MapDocNode.
DocNode &MapDocNode::operator[](DocNode Key) {
assert(!Key.isEmpty());
- MapTy::value_type Entry(Key, DocNode());
- auto ItAndInserted = Map->insert(Entry);
- if (ItAndInserted.second) {
+ DocNode &N = (*Map)[Key];
+ if (N.isEmpty()) {
// Ensure a new element has its KindAndDoc initialized.
- ItAndInserted.first->second = getDocument()->getNode();
+ N = getDocument()->getEmptyNode();
}
- return ItAndInserted.first->second;
+ return N;
+}
+
+/// Member access for MapDocNode for integer key.
+DocNode &MapDocNode::operator[](int Key) {
+ return (*this)[getDocument()->getNode(Key)];
+}
+DocNode &MapDocNode::operator[](unsigned Key) {
+ return (*this)[getDocument()->getNode(Key)];
+}
+DocNode &MapDocNode::operator[](int64_t Key) {
+ return (*this)[getDocument()->getNode(Key)];
+}
+DocNode &MapDocNode::operator[](uint64_t Key) {
+ return (*this)[getDocument()->getNode(Key)];
}
/// Array element access. This extends the array if necessary.
DocNode &ArrayDocNode::operator[](size_t Index) {
if (size() <= Index) {
// Ensure new elements have their KindAndDoc initialized.
- Array->resize(Index + 1, getDocument()->getNode());
+ Array->resize(Index + 1, getDocument()->getEmptyNode());
}
return (*Array)[Index];
}
+// Convenience assignment operators. This only works if the destination
+// DocNode has an associated Document, i.e. it was not constructed using the
+// default constructor. The string one does not copy, so the string must
+// remain valid for the lifetime of the Document. Use fromString to avoid
+// that restriction.
+DocNode &DocNode::operator=(StringRef Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
+DocNode &DocNode::operator=(bool Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
+DocNode &DocNode::operator=(int Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
+DocNode &DocNode::operator=(unsigned Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
+DocNode &DocNode::operator=(int64_t Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
+DocNode &DocNode::operator=(uint64_t Val) {
+ *this = getDocument()->getNode(Val);
+ return *this;
+}
+
// A level in the document reading stack.
struct StackLevel {
+ StackLevel(DocNode Node, size_t StartIndex, size_t Length,
+ DocNode *MapEntry = nullptr)
+ : Node(Node), Index(StartIndex), End(StartIndex + Length),
+ MapEntry(MapEntry) {}
DocNode Node;
- size_t Length;
+ size_t Index;
+ size_t End;
// Points to map entry when we have just processed a map key.
DocNode *MapEntry;
+ DocNode MapKey;
};
-// Read a document from a binary msgpack blob.
+// Read a document from a binary msgpack blob, merging into anything already in
+// the Document.
// The blob data must remain valid for the lifetime of this Document (because a
// string object in the document contains a StringRef into the original blob).
// If Multi, then this sets root to an array and adds top-level objects to it.
// If !Multi, then it only reads a single top-level object, even if there are
// more, and sets root to that.
-// Returns false if failed due to illegal format.
-bool Document::readFromBlob(StringRef Blob, bool Multi) {
+// Returns false if failed due to illegal format or merge error.
+
+bool Document::readFromBlob(
+ StringRef Blob, bool Multi,
+ function_ref<int(DocNode *DestNode, DocNode SrcNode, DocNode MapKey)>
+ Merger) {
msgpack::Reader MPReader(Blob);
SmallVector<StackLevel, 4> Stack;
if (Multi) {
// Create the array for multiple top-level objects.
Root = getArrayNode();
- Stack.push_back(StackLevel({Root, (size_t)-1, nullptr}));
+ Stack.push_back(StackLevel(Root, 0, (size_t)-1));
}
do {
// On to next element (or key if doing a map key next).
@@ -124,29 +178,47 @@ bool Document::readFromBlob(StringRef Blob, bool Multi) {
}
// Store it.
+ DocNode *DestNode = nullptr;
if (Stack.empty())
- Root = Node;
+ DestNode = &Root;
else if (Stack.back().Node.getKind() == Type::Array) {
// Reading an array entry.
auto &Array = Stack.back().Node.getArray();
- Array.push_back(Node);
+ DestNode = &Array[Stack.back().Index++];
} else {
auto &Map = Stack.back().Node.getMap();
if (!Stack.back().MapEntry) {
// Reading a map key.
+ Stack.back().MapKey = Node;
Stack.back().MapEntry = &Map[Node];
- } else {
- // Reading the value for the map key read in the last iteration.
- *Stack.back().MapEntry = Node;
- Stack.back().MapEntry = nullptr;
+ continue;
}
+ // Reading the value for the map key read in the last iteration.
+ DestNode = Stack.back().MapEntry;
+ Stack.back().MapEntry = nullptr;
+ ++Stack.back().Index;
}
+ int MergeResult = 0;
+ if (!DestNode->isEmpty()) {
+ // In a merge, there is already a value at this position. Call the
+ // callback to attempt to resolve the conflict. The resolution must result
+ // in an array or map if Node is an array or map respectively.
+ DocNode MapKey = !Stack.empty() && !Stack.back().MapKey.isEmpty()
+ ? Stack.back().MapKey
+ : getNode();
+ MergeResult = Merger(DestNode, Node, MapKey);
+ if (MergeResult < 0)
+ return false; // Merge conflict resolution failed
+ assert(!((Node.isMap() && !DestNode->isMap()) ||
+ (Node.isArray() && !DestNode->isArray())));
+ } else
+ *DestNode = Node;
// See if we're starting a new array or map.
- switch (Node.getKind()) {
+ switch (DestNode->getKind()) {
case msgpack::Type::Array:
case msgpack::Type::Map:
- Stack.push_back(StackLevel({Node, Obj.Length, nullptr}));
+ Stack.push_back(StackLevel(*DestNode, MergeResult, Obj.Length, nullptr));
break;
default:
break;
@@ -154,14 +226,10 @@ bool Document::readFromBlob(StringRef Blob, bool Multi) {
// Pop finished stack levels.
while (!Stack.empty()) {
- if (Stack.back().Node.getKind() == msgpack::Type::Array) {
- if (Stack.back().Node.getArray().size() != Stack.back().Length)
- break;
- } else {
- if (Stack.back().MapEntry ||
- Stack.back().Node.getMap().size() != Stack.back().Length)
- break;
- }
+ if (Stack.back().MapEntry)
+ break;
+ if (Stack.back().Index != Stack.back().End)
+ break;
Stack.pop_back();
}
} while (!Stack.empty());
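readFromBlob now takes a merge callback that is invoked when an incoming node lands on a position that already holds a value. Reading the code above: a negative return aborts the whole read, and a non-negative return accepts the resolution (for arrays it becomes the index at which the incoming elements continue). A hedged sketch of one possible policy, not necessarily the one any in-tree caller uses:

  #include "llvm/BinaryFormat/MsgPackDocument.h"

  static bool mergeBlob(llvm::msgpack::Document &Doc, llvm::StringRef Blob) {
    return Doc.readFromBlob(
        Blob, /*Multi=*/false,
        [](llvm::msgpack::DocNode *Dest, llvm::msgpack::DocNode Src,
           llvm::msgpack::DocNode MapKey) -> int {
          (void)MapKey; // key of the map entry being merged; unused here
          // Append incoming array elements after the existing ones.
          if (Src.isArray() && Dest->isArray())
            return (int)Dest->getArray().size();
          // Merge maps in place; new keys are added to the existing map.
          if (Src.isMap() && Dest->isMap())
            return 0;
          // Mismatched kinds cannot be reconciled here; fail the merge.
          if (Src.isArray() || Src.isMap() || Dest->isArray() || Dest->isMap())
            return -1;
          // For two conflicting scalars, keep the value already there
          // (returning non-negative without touching *Dest does that).
          return 0;
        });
  }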
diff --git a/llvm/lib/BinaryFormat/Wasm.cpp b/llvm/lib/BinaryFormat/Wasm.cpp
index d46be481edb3..88608168783b 100644
--- a/llvm/lib/BinaryFormat/Wasm.cpp
+++ b/llvm/lib/BinaryFormat/Wasm.cpp
@@ -39,9 +39,13 @@ std::string llvm::wasm::relocTypetoString(uint32_t Type) {
bool llvm::wasm::relocTypeHasAddend(uint32_t Type) {
switch (Type) {
case R_WASM_MEMORY_ADDR_LEB:
+ case R_WASM_MEMORY_ADDR_LEB64:
case R_WASM_MEMORY_ADDR_SLEB:
+ case R_WASM_MEMORY_ADDR_SLEB64:
case R_WASM_MEMORY_ADDR_REL_SLEB:
+ case R_WASM_MEMORY_ADDR_REL_SLEB64:
case R_WASM_MEMORY_ADDR_I32:
+ case R_WASM_MEMORY_ADDR_I64:
case R_WASM_FUNCTION_OFFSET_I32:
case R_WASM_SECTION_OFFSET_I32:
return true;
diff --git a/llvm/lib/BinaryFormat/XCOFF.cpp b/llvm/lib/BinaryFormat/XCOFF.cpp
index 29ccbaea3584..3c8a2cdbc3aa 100644
--- a/llvm/lib/BinaryFormat/XCOFF.cpp
+++ b/llvm/lib/BinaryFormat/XCOFF.cpp
@@ -7,28 +7,72 @@
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/XCOFF.h"
+#include "llvm/ADT/StringRef.h"
using namespace llvm;
+#define SMC_CASE(A) \
+ case XCOFF::XMC_##A: \
+ return #A;
StringRef XCOFF::getMappingClassString(XCOFF::StorageMappingClass SMC) {
switch (SMC) {
- case XCOFF::XMC_DS:
- return "DS";
- case XCOFF::XMC_RW:
- return "RW";
- case XCOFF::XMC_PR:
- return "PR";
- case XCOFF::XMC_TC0:
- return "TC0";
- case XCOFF::XMC_BS:
- return "BS";
- case XCOFF::XMC_RO:
- return "RO";
- case XCOFF::XMC_UA:
- return "UA";
- case XCOFF::XMC_TC:
- return "TC";
- default:
- report_fatal_error("Unhandled storage-mapping class.");
+ SMC_CASE(PR)
+ SMC_CASE(RO)
+ SMC_CASE(DB)
+ SMC_CASE(GL)
+ SMC_CASE(XO)
+ SMC_CASE(SV)
+ SMC_CASE(SV64)
+ SMC_CASE(SV3264)
+ SMC_CASE(TI)
+ SMC_CASE(TB)
+ SMC_CASE(RW)
+ SMC_CASE(TC0)
+ SMC_CASE(TC)
+ SMC_CASE(TD)
+ SMC_CASE(DS)
+ SMC_CASE(UA)
+ SMC_CASE(BS)
+ SMC_CASE(UC)
+ SMC_CASE(TL)
+ SMC_CASE(UL)
+ SMC_CASE(TE)
+#undef SMC_CASE
}
+
+ // TODO: need to add a test case for "Unknown" and other SMC.
+ return "Unknown";
+}
+
+#define RELOC_CASE(A) \
+ case XCOFF::A: \
+ return #A;
+StringRef XCOFF::getRelocationTypeString(XCOFF::RelocationType Type) {
+ switch (Type) {
+ RELOC_CASE(R_POS)
+ RELOC_CASE(R_RL)
+ RELOC_CASE(R_RLA)
+ RELOC_CASE(R_NEG)
+ RELOC_CASE(R_REL)
+ RELOC_CASE(R_TOC)
+ RELOC_CASE(R_TRL)
+ RELOC_CASE(R_TRLA)
+ RELOC_CASE(R_GL)
+ RELOC_CASE(R_TCL)
+ RELOC_CASE(R_REF)
+ RELOC_CASE(R_BA)
+ RELOC_CASE(R_BR)
+ RELOC_CASE(R_RBA)
+ RELOC_CASE(R_RBR)
+ RELOC_CASE(R_TLS)
+ RELOC_CASE(R_TLS_IE)
+ RELOC_CASE(R_TLS_LD)
+ RELOC_CASE(R_TLS_LE)
+ RELOC_CASE(R_TLSM)
+ RELOC_CASE(R_TLSML)
+ RELOC_CASE(R_TOCU)
+ RELOC_CASE(R_TOCL)
+ }
+ return "Unknown";
}
+#undef RELOC_CASE
diff --git a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
index e70caa83c8c1..2ce064c7685a 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeAnalyzer.cpp
@@ -130,7 +130,7 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID,
STRINGIFY_CODE(MODULE_CODE, DATALAYOUT)
STRINGIFY_CODE(MODULE_CODE, ASM)
STRINGIFY_CODE(MODULE_CODE, SECTIONNAME)
- STRINGIFY_CODE(MODULE_CODE, DEPLIB) // FIXME: Remove in 4.0
+ STRINGIFY_CODE(MODULE_CODE, DEPLIB) // Deprecated, present in old bitcode
STRINGIFY_CODE(MODULE_CODE, GLOBALVAR)
STRINGIFY_CODE(MODULE_CODE, FUNCTION)
STRINGIFY_CODE(MODULE_CODE, ALIAS)
@@ -305,6 +305,8 @@ static Optional<const char *> GetCodeName(unsigned CodeID, unsigned BlockID,
STRINGIFY_CODE(FS, CFI_FUNCTION_DECLS)
STRINGIFY_CODE(FS, TYPE_ID)
STRINGIFY_CODE(FS, TYPE_ID_METADATA)
+ STRINGIFY_CODE(FS, BLOCK_COUNT)
+ STRINGIFY_CODE(FS, PARAM_ACCESS)
}
case bitc::METADATA_ATTACHMENT_ID:
switch (CodeID) {
@@ -910,17 +912,14 @@ Error BitcodeAnalyzer::parseBlock(unsigned BlockID, unsigned IndentLevel,
Hasher.update(ArrayRef<uint8_t>(Ptr, BlockSize));
Hash = Hasher.result();
}
- SmallString<20> RecordedHash;
- RecordedHash.resize(20);
+ std::array<char, 20> RecordedHash;
int Pos = 0;
for (auto &Val : Record) {
assert(!(Val >> 32) && "Unexpected high bits set");
- RecordedHash[Pos++] = (Val >> 24) & 0xFF;
- RecordedHash[Pos++] = (Val >> 16) & 0xFF;
- RecordedHash[Pos++] = (Val >> 8) & 0xFF;
- RecordedHash[Pos++] = (Val >> 0) & 0xFF;
+ support::endian::write32be(&RecordedHash[Pos], Val);
+ Pos += 4;
}
- if (Hash == RecordedHash)
+ if (Hash == StringRef(RecordedHash.data(), RecordedHash.size()))
O->OS << " (match)";
else
O->OS << " (!mismatch!)";
diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
index 33464412edc5..659e26c2bd25 100644
--- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
+++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp
@@ -27,7 +27,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
@@ -153,8 +152,7 @@ static bool convertToString(ArrayRef<uint64_t> Record, unsigned Idx,
if (Idx > Record.size())
return true;
- for (unsigned i = Idx, e = Record.size(); i != e; ++i)
- Result += (char)Record[i];
+ Result.append(Record.begin() + Idx, Record.end());
return false;
}
@@ -578,8 +576,11 @@ public:
/// Main interface to parsing a bitcode buffer.
/// \returns true if an error occurred.
- Error parseBitcodeInto(Module *M, bool ShouldLazyLoadMetadata = false,
- bool IsImporting = false);
+ Error parseBitcodeInto(
+ Module *M, bool ShouldLazyLoadMetadata = false, bool IsImporting = false,
+ DataLayoutCallbackTy DataLayoutCallback = [](std::string) {
+ return None;
+ });
static uint64_t decodeSignRotatedValue(uint64_t V);
@@ -724,7 +725,9 @@ private:
/// a corresponding error code.
Error parseAlignmentValue(uint64_t Exponent, MaybeAlign &Alignment);
Error parseAttrKind(uint64_t Code, Attribute::AttrKind *Kind);
- Error parseModule(uint64_t ResumeBit, bool ShouldLazyLoadMetadata = false);
+ Error parseModule(
+ uint64_t ResumeBit, bool ShouldLazyLoadMetadata = false,
+ DataLayoutCallbackTy DataLayoutCallback = [](StringRef) { return None; });
Error parseComdatRecord(ArrayRef<uint64_t> Record);
Error parseGlobalVarRecord(ArrayRef<uint64_t> Record);
@@ -859,7 +862,7 @@ BitcodeReader::BitcodeReader(BitstreamCursor Stream, StringRef Strtab,
LLVMContext &Context)
: BitcodeReaderBase(std::move(Stream), Strtab), Context(Context),
ValueList(Context, Stream.SizeInBytes()) {
- this->ProducerIdentification = ProducerIdentification;
+ this->ProducerIdentification = std::string(ProducerIdentification);
}
Error BitcodeReader::materializeForwardReferencedFunctions() {
@@ -985,8 +988,10 @@ static GlobalValueSummary::GVFlags getDecodedGVSummaryFlags(uint64_t RawFlags,
// Decode the flags for GlobalVariable in the summary
static GlobalVarSummary::GVarFlags getDecodedGVarFlags(uint64_t RawFlags) {
- return GlobalVarSummary::GVarFlags((RawFlags & 0x1) ? true : false,
- (RawFlags & 0x2) ? true : false);
+ return GlobalVarSummary::GVarFlags(
+ (RawFlags & 0x1) ? true : false, (RawFlags & 0x2) ? true : false,
+ (RawFlags & 0x4) ? true : false,
+ (GlobalObject::VCallVisibility)(RawFlags >> 3));
}
static GlobalValue::VisibilityTypes getDecodedVisibility(unsigned Val) {
@@ -1216,6 +1221,8 @@ StructType *BitcodeReader::createIdentifiedStructType(LLVMContext &Context) {
static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
switch (Val) {
case Attribute::EndAttrKinds:
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
llvm_unreachable("Synthetic enumerators which should never get here");
case Attribute::None: return 0;
@@ -1281,25 +1288,10 @@ static uint64_t getRawAttributeMask(Attribute::AttrKind Val) {
return 1ULL << 62;
case Attribute::NoFree:
return 1ULL << 63;
- case Attribute::NoSync:
- llvm_unreachable("nosync attribute not supported in raw format");
- break;
- case Attribute::Dereferenceable:
- llvm_unreachable("dereferenceable attribute not supported in raw format");
- break;
- case Attribute::DereferenceableOrNull:
- llvm_unreachable("dereferenceable_or_null attribute not supported in raw "
- "format");
- break;
- case Attribute::ArgMemOnly:
- llvm_unreachable("argmemonly attribute not supported in raw format");
- break;
- case Attribute::AllocSize:
- llvm_unreachable("allocsize not supported in raw format");
- break;
- case Attribute::SanitizeMemTag:
- llvm_unreachable("sanitize_memtag attribute not supported in raw format");
- break;
+ default:
+ // Other attributes are not supported in the raw format,
+ // as we ran out of space.
+ return 0;
}
llvm_unreachable("Unsupported attribute type");
}
@@ -1309,13 +1301,6 @@ static void addRawAttributeValue(AttrBuilder &B, uint64_t Val) {
for (Attribute::AttrKind I = Attribute::None; I != Attribute::EndAttrKinds;
I = Attribute::AttrKind(I + 1)) {
- if (I == Attribute::SanitizeMemTag ||
- I == Attribute::Dereferenceable ||
- I == Attribute::DereferenceableOrNull ||
- I == Attribute::ArgMemOnly ||
- I == Attribute::AllocSize ||
- I == Attribute::NoSync)
- continue;
if (uint64_t A = (Val & getRawAttributeMask(I))) {
if (I == Attribute::Alignment)
B.addAlignmentAttr(1ULL << ((A >> 16) - 1));
@@ -1332,8 +1317,6 @@ static void addRawAttributeValue(AttrBuilder &B, uint64_t Val) {
/// 'encodeLLVMAttributesForBitcode'.
static void decodeLLVMAttributesForBitcode(AttrBuilder &B,
uint64_t EncodedAttrs) {
- // FIXME: Remove in 4.0.
-
// The alignment is stored as a 16-bit raw value from bits 31--16. We shift
// the bits above 31 down by 11 bits.
unsigned Alignment = (EncodedAttrs & (0xffffULL << 16)) >> 16;
@@ -1384,7 +1367,7 @@ Error BitcodeReader::parseAttributeBlock() {
default: // Default behavior: ignore.
break;
case bitc::PARAMATTR_CODE_ENTRY_OLD: // ENTRY: [paramidx0, attr0, ...]
- // FIXME: Remove in 4.0.
+ // Deprecated, but still needed to read old bitcode files.
if (Record.size() & 1)
return error("Invalid record");
@@ -1461,6 +1444,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::NoInline;
case bitc::ATTR_KIND_NO_RECURSE:
return Attribute::NoRecurse;
+ case bitc::ATTR_KIND_NO_MERGE:
+ return Attribute::NoMerge;
case bitc::ATTR_KIND_NON_LAZY_BIND:
return Attribute::NonLazyBind;
case bitc::ATTR_KIND_NON_NULL:
@@ -1481,6 +1466,8 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::NoCfCheck;
case bitc::ATTR_KIND_NO_UNWIND:
return Attribute::NoUnwind;
+ case bitc::ATTR_KIND_NULL_POINTER_IS_VALID:
+ return Attribute::NullPointerIsValid;
case bitc::ATTR_KIND_OPT_FOR_FUZZING:
return Attribute::OptForFuzzing;
case bitc::ATTR_KIND_OPTIMIZE_FOR_SIZE:
@@ -1541,6 +1528,10 @@ static Attribute::AttrKind getAttrFromCode(uint64_t Code) {
return Attribute::ImmArg;
case bitc::ATTR_KIND_SANITIZE_MEMTAG:
return Attribute::SanitizeMemTag;
+ case bitc::ATTR_KIND_PREALLOCATED:
+ return Attribute::Preallocated;
+ case bitc::ATTR_KIND_NOUNDEF:
+ return Attribute::NoUndef;
}
}
@@ -1656,12 +1647,15 @@ Error BitcodeReader::parseAttributeGroupBlock() {
Attribute::AttrKind Kind;
if (Error Err = parseAttrKind(Record[++i], &Kind))
return Err;
- if (Kind == Attribute::ByVal)
+ if (Kind == Attribute::ByVal) {
B.addByValAttr(HasType ? getTypeByID(Record[++i]) : nullptr);
+ } else if (Kind == Attribute::Preallocated) {
+ B.addPreallocatedAttr(getTypeByID(Record[++i]));
+ }
}
}
- UpgradeFramePointerAttributes(B);
+ UpgradeAttributes(B);
MAttributeGroups[GrpID] = AttributeList::get(Context, Idx, B);
break;
}
@@ -1727,6 +1721,9 @@ Error BitcodeReader::parseTypeTableBody() {
case bitc::TYPE_CODE_HALF: // HALF
ResultTy = Type::getHalfTy(Context);
break;
+ case bitc::TYPE_CODE_BFLOAT: // BFLOAT
+ ResultTy = Type::getBFloatTy(Context);
+ break;
case bitc::TYPE_CODE_FLOAT: // FLOAT
ResultTy = Type::getFloatTy(Context);
break;
@@ -1780,7 +1777,7 @@ Error BitcodeReader::parseTypeTableBody() {
break;
}
case bitc::TYPE_CODE_FUNCTION_OLD: {
- // FIXME: attrid is dead, remove it in LLVM 4.0
+ // Deprecated, but still needed to read old bitcode files.
// FUNCTION: [vararg, attrid, retty, paramty x N]
if (Record.size() < 3)
return error("Invalid record");
@@ -2314,7 +2311,7 @@ Error BitcodeReader::resolveGlobalAndIndirectSymbolInits() {
return Error::success();
}
-static APInt readWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
+APInt llvm::readWideAPInt(ArrayRef<uint64_t> Vals, unsigned TypeBits) {
SmallVector<uint64_t, 8> Words(Vals.size());
transform(Vals, Words.begin(),
BitcodeReader::decodeSignRotatedValue);
@@ -2333,6 +2330,16 @@ Error BitcodeReader::parseConstants() {
Type *CurFullTy = Type::getInt32Ty(Context);
unsigned NextCstNo = ValueList.size();
+ struct DelayedShufTy {
+ VectorType *OpTy;
+ VectorType *RTy;
+ Type *CurFullTy;
+ uint64_t Op0Idx;
+ uint64_t Op1Idx;
+ uint64_t Op2Idx;
+ unsigned CstNo;
+ };
+ std::vector<DelayedShufTy> DelayedShuffles;
while (true) {
Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
if (!MaybeEntry)
@@ -2344,11 +2351,35 @@ Error BitcodeReader::parseConstants() {
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
+ // Once all the constants have been read, go through and resolve forward
+ // references.
+ //
+ // We have to treat shuffles specially because they don't have three
+ // operands anymore. We need to convert the shuffle mask into an array,
+ // and we can't convert a forward reference.
+ for (auto &DelayedShuffle : DelayedShuffles) {
+ VectorType *OpTy = DelayedShuffle.OpTy;
+ VectorType *RTy = DelayedShuffle.RTy;
+ uint64_t Op0Idx = DelayedShuffle.Op0Idx;
+ uint64_t Op1Idx = DelayedShuffle.Op1Idx;
+ uint64_t Op2Idx = DelayedShuffle.Op2Idx;
+ uint64_t CstNo = DelayedShuffle.CstNo;
+ Constant *Op0 = ValueList.getConstantFwdRef(Op0Idx, OpTy);
+ Constant *Op1 = ValueList.getConstantFwdRef(Op1Idx, OpTy);
+ Type *ShufTy =
+ VectorType::get(Type::getInt32Ty(Context), RTy->getElementCount());
+ Constant *Op2 = ValueList.getConstantFwdRef(Op2Idx, ShufTy);
+ if (!ShuffleVectorInst::isValidOperands(Op0, Op1, Op2))
+ return error("Invalid shufflevector operands");
+ SmallVector<int, 16> Mask;
+ ShuffleVectorInst::getShuffleMask(Op2, Mask);
+ Value *V = ConstantExpr::getShuffleVector(Op0, Op1, Mask);
+ ValueList.assignValue(V, CstNo, DelayedShuffle.CurFullTy);
+ }
+
if (NextCstNo != ValueList.size())
return error("Invalid constant reference");
- // Once all the constants have been read, go through and resolve forward
- // references.
ValueList.resolveConstantForwardRefs();
return Error::success();
case BitstreamEntry::Record:
@@ -2404,6 +2435,9 @@ Error BitcodeReader::parseConstants() {
if (CurTy->isHalfTy())
V = ConstantFP::get(Context, APFloat(APFloat::IEEEhalf(),
APInt(16, (uint16_t)Record[0])));
+ else if (CurTy->isBFloatTy())
+ V = ConstantFP::get(Context, APFloat(APFloat::BFloat(),
+ APInt(16, (uint32_t)Record[0])));
else if (CurTy->isFloatTy())
V = ConstantFP::get(Context, APFloat(APFloat::IEEEsingle(),
APInt(32, (uint32_t)Record[0])));
@@ -2469,7 +2503,11 @@ Error BitcodeReader::parseConstants() {
if (Record.empty())
return error("Invalid record");
- Type *EltTy = cast<SequentialType>(CurTy)->getElementType();
+ Type *EltTy;
+ if (auto *Array = dyn_cast<ArrayType>(CurTy))
+ EltTy = Array->getElementType();
+ else
+ EltTy = cast<VectorType>(CurTy)->getElementType();
if (EltTy->isIntegerTy(8)) {
SmallVector<uint8_t, 16> Elts(Record.begin(), Record.end());
if (isa<VectorType>(CurTy))
@@ -2497,21 +2535,27 @@ Error BitcodeReader::parseConstants() {
} else if (EltTy->isHalfTy()) {
SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end());
if (isa<VectorType>(CurTy))
- V = ConstantDataVector::getFP(Context, Elts);
+ V = ConstantDataVector::getFP(EltTy, Elts);
+ else
+ V = ConstantDataArray::getFP(EltTy, Elts);
+ } else if (EltTy->isBFloatTy()) {
+ SmallVector<uint16_t, 16> Elts(Record.begin(), Record.end());
+ if (isa<VectorType>(CurTy))
+ V = ConstantDataVector::getFP(EltTy, Elts);
else
- V = ConstantDataArray::getFP(Context, Elts);
+ V = ConstantDataArray::getFP(EltTy, Elts);
} else if (EltTy->isFloatTy()) {
SmallVector<uint32_t, 16> Elts(Record.begin(), Record.end());
if (isa<VectorType>(CurTy))
- V = ConstantDataVector::getFP(Context, Elts);
+ V = ConstantDataVector::getFP(EltTy, Elts);
else
- V = ConstantDataArray::getFP(Context, Elts);
+ V = ConstantDataArray::getFP(EltTy, Elts);
} else if (EltTy->isDoubleTy()) {
SmallVector<uint64_t, 16> Elts(Record.begin(), Record.end());
if (isa<VectorType>(CurTy))
- V = ConstantDataVector::getFP(Context, Elts);
+ V = ConstantDataVector::getFP(EltTy, Elts);
else
- V = ConstantDataArray::getFP(Context, Elts);
+ V = ConstantDataArray::getFP(EltTy, Elts);
} else {
return error("Invalid type for value");
}
@@ -2629,12 +2673,13 @@ Error BitcodeReader::parseConstants() {
Type *SelectorTy = Type::getInt1Ty(Context);
- // The selector might be an i1 or an <n x i1>
+ // The selector might be an i1, an <n x i1>, or a <vscale x n x i1>
// Get the type from the ValueList before getting a forward ref.
if (VectorType *VTy = dyn_cast<VectorType>(CurTy))
if (Value *V = ValueList[Record[0]])
if (SelectorTy != V->getType())
- SelectorTy = VectorType::get(SelectorTy, VTy->getNumElements());
+ SelectorTy = VectorType::get(SelectorTy,
+ VTy->getElementCount());
V = ConstantExpr::getSelect(ValueList.getConstantFwdRef(Record[0],
SelectorTy),
@@ -2657,8 +2702,10 @@ Error BitcodeReader::parseConstants() {
if (!IdxTy)
return error("Invalid record");
Op1 = ValueList.getConstantFwdRef(Record[3], IdxTy);
- } else // TODO: Remove with llvm 4.0
+ } else {
+ // Deprecated, but still needed to read old bitcode files.
Op1 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context));
+ }
if (!Op1)
return error("Invalid record");
V = ConstantExpr::getExtractElement(Op0, Op1);
@@ -2678,8 +2725,10 @@ Error BitcodeReader::parseConstants() {
if (!IdxTy)
return error("Invalid record");
Op2 = ValueList.getConstantFwdRef(Record[3], IdxTy);
- } else // TODO: Remove with llvm 4.0
+ } else {
+ // Deprecated, but still needed to read old bitcode files.
Op2 = ValueList.getConstantFwdRef(Record[2], Type::getInt32Ty(Context));
+ }
if (!Op2)
return error("Invalid record");
V = ConstantExpr::getInsertElement(Op0, Op1, Op2);
@@ -2689,13 +2738,10 @@ Error BitcodeReader::parseConstants() {
VectorType *OpTy = dyn_cast<VectorType>(CurTy);
if (Record.size() < 3 || !OpTy)
return error("Invalid record");
- Constant *Op0 = ValueList.getConstantFwdRef(Record[0], OpTy);
- Constant *Op1 = ValueList.getConstantFwdRef(Record[1], OpTy);
- Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
- OpTy->getNumElements());
- Constant *Op2 = ValueList.getConstantFwdRef(Record[2], ShufTy);
- V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
- break;
+ DelayedShuffles.push_back(
+ {OpTy, OpTy, CurFullTy, Record[0], Record[1], Record[2], NextCstNo});
+ ++NextCstNo;
+ continue;
}
case bitc::CST_CODE_CE_SHUFVEC_EX: { // [opty, opval, opval, opval]
VectorType *RTy = dyn_cast<VectorType>(CurTy);
@@ -2703,13 +2749,10 @@ Error BitcodeReader::parseConstants() {
dyn_cast_or_null<VectorType>(getTypeByID(Record[0]));
if (Record.size() < 4 || !RTy || !OpTy)
return error("Invalid record");
- Constant *Op0 = ValueList.getConstantFwdRef(Record[1], OpTy);
- Constant *Op1 = ValueList.getConstantFwdRef(Record[2], OpTy);
- Type *ShufTy = VectorType::get(Type::getInt32Ty(Context),
- RTy->getNumElements());
- Constant *Op2 = ValueList.getConstantFwdRef(Record[3], ShufTy);
- V = ConstantExpr::getShuffleVector(Op0, Op1, Op2);
- break;
+ DelayedShuffles.push_back(
+ {OpTy, RTy, CurFullTy, Record[1], Record[2], Record[3], NextCstNo});
+ ++NextCstNo;
+ continue;
}
case bitc::CST_CODE_CE_CMP: { // CE_CMP: [opty, opval, opval, pred]
if (Record.size() < 4)
@@ -2727,7 +2770,7 @@ Error BitcodeReader::parseConstants() {
break;
}
// This maintains backward compatibility, pre-asm dialect keywords.
- // FIXME: Remove with the 4.0 release.
+ // Deprecated, but still needed to read old bitcode files.
case bitc::CST_CODE_INLINEASM_OLD: {
if (Record.size() < 2)
return error("Invalid record");
@@ -2967,6 +3010,7 @@ Error BitcodeReader::globalCleanup() {
return error("Malformed global initializer set");
// Look for intrinsic functions which need to be upgraded at some point
+ // and functions that need to have their function attributes upgraded.
for (Function &F : *TheModule) {
MDLoader->upgradeDebugIntrinsics(F);
Function *NewFn;
@@ -2977,6 +3021,8 @@ Error BitcodeReader::globalCleanup() {
// loaded in the same LLVMContext (LTO scenario). In this case we should
// remangle intrinsics names as well.
RemangledIntrinsics[&F] = Remangled.getValue();
+ // Look for functions that rely on old function attribute behavior.
+ UpgradeFunctionAttributes(F);
}
// Look for global variables which need to be renamed.
@@ -3125,8 +3171,8 @@ Error BitcodeReader::parseGlobalVarRecord(ArrayRef<uint64_t> Record) {
}
GlobalValue::VisibilityTypes Visibility = GlobalValue::DefaultVisibility;
// Local linkage must have default visibility.
+ // auto-upgrade `hidden` and `protected` for old bitcode.
if (Record.size() > 6 && !GlobalValue::isLocalLinkage(Linkage))
- // FIXME: Change to an error if non-default in 4.0.
Visibility = getDecodedVisibility(Record[6]);
GlobalVariable::ThreadLocalMode TLM = GlobalVariable::NotThreadLocal;
@@ -3255,8 +3301,8 @@ Error BitcodeReader::parseFunctionRecord(ArrayRef<uint64_t> Record) {
Func->setSection(SectionTable[Record[6] - 1]);
}
// Local linkage must have default visibility.
+ // auto-upgrade `hidden` and `protected` for old bitcode.
if (!Func->hasLocalLinkage())
- // FIXME: Change to an error if non-default in 4.0.
Func->setVisibility(getDecodedVisibility(Record[7]));
if (Record.size() > 8 && Record[8]) {
if (Record[8] - 1 >= GCTable.size())
@@ -3363,12 +3409,11 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
assert(NewGA->getValueType() == flattenPointerTypes(FullTy) &&
"Incorrect fully structured type provided for GlobalIndirectSymbol");
- // Old bitcode files didn't have visibility field.
// Local linkage must have default visibility.
+ // auto-upgrade `hidden` and `protected` for old bitcode.
if (OpNum != Record.size()) {
auto VisInd = OpNum++;
if (!NewGA->hasLocalLinkage())
- // FIXME: Change to an error if non-default in 4.0.
NewGA->setVisibility(getDecodedVisibility(Record[VisInd]));
}
if (BitCode == bitc::MODULE_CODE_ALIAS ||
@@ -3402,7 +3447,8 @@ Error BitcodeReader::parseGlobalIndirectSymbolRecord(
}
Error BitcodeReader::parseModule(uint64_t ResumeBit,
- bool ShouldLazyLoadMetadata) {
+ bool ShouldLazyLoadMetadata,
+ DataLayoutCallbackTy DataLayoutCallback) {
if (ResumeBit) {
if (Error JumpFailed = Stream.JumpToBit(ResumeBit))
return JumpFailed;
@@ -3411,6 +3457,26 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
SmallVector<uint64_t, 64> Record;
+ // Parts of bitcode parsing depend on the datalayout. Make sure we
+ // finalize the datalayout before we run any of that code.
+ bool ResolvedDataLayout = false;
+ auto ResolveDataLayout = [&] {
+ if (ResolvedDataLayout)
+ return;
+
+ // datalayout and triple can't be parsed after this point.
+ ResolvedDataLayout = true;
+
+ // Upgrade data layout string.
+ std::string DL = llvm::UpgradeDataLayoutString(
+ TheModule->getDataLayoutStr(), TheModule->getTargetTriple());
+ TheModule->setDataLayout(DL);
+
+ if (auto LayoutOverride =
+ DataLayoutCallback(TheModule->getTargetTriple()))
+ TheModule->setDataLayout(*LayoutOverride);
+ };
+
// Read all the records for this module.
while (true) {
Expected<llvm::BitstreamEntry> MaybeEntry = Stream.advance();
@@ -3422,6 +3488,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
case BitstreamEntry::Error:
return error("Malformed block");
case BitstreamEntry::EndBlock:
+ ResolveDataLayout();
return globalCleanup();
case BitstreamEntry::SubBlock:
@@ -3486,6 +3553,8 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
return Err;
break;
case bitc::FUNCTION_BLOCK_ID:
+ ResolveDataLayout();
+
// If this is the first function body we've seen, reverse the
// FunctionsWithBodies list.
if (!SeenFirstFunctionBody) {
@@ -3572,6 +3641,8 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
break;
}
case bitc::MODULE_CODE_TRIPLE: { // TRIPLE: [strchr x N]
+ if (ResolvedDataLayout)
+ return error("target triple too late in module");
std::string S;
if (convertToString(Record, 0, S))
return error("Invalid record");
@@ -3579,6 +3650,8 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
break;
}
case bitc::MODULE_CODE_DATALAYOUT: { // DATALAYOUT: [strchr x N]
+ if (ResolvedDataLayout)
+ return error("datalayout too late in module");
std::string S;
if (convertToString(Record, 0, S))
return error("Invalid record");
@@ -3593,7 +3666,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
break;
}
case bitc::MODULE_CODE_DEPLIB: { // DEPLIB: [strchr x N]
- // FIXME: Remove in 4.0.
+ // Deprecated, but still needed to read old bitcode files.
std::string S;
if (convertToString(Record, 0, S))
return error("Invalid record");
@@ -3623,6 +3696,7 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
return Err;
break;
case bitc::MODULE_CODE_FUNCTION:
+ ResolveDataLayout();
if (Error Err = parseFunctionRecord(Record))
return Err;
break;
@@ -3650,20 +3724,16 @@ Error BitcodeReader::parseModule(uint64_t ResumeBit,
break;
}
Record.clear();
-
- // Upgrade data layout string.
- std::string DL = llvm::UpgradeDataLayoutString(
- TheModule->getDataLayoutStr(), TheModule->getTargetTriple());
- TheModule->setDataLayout(DL);
}
}
Error BitcodeReader::parseBitcodeInto(Module *M, bool ShouldLazyLoadMetadata,
- bool IsImporting) {
+ bool IsImporting,
+ DataLayoutCallbackTy DataLayoutCallback) {
TheModule = M;
MDLoader = MetadataLoader(Stream, *M, ValueList, IsImporting,
[&](unsigned ID) { return getTypeByID(ID); });
- return parseModule(0, ShouldLazyLoadMetadata);
+ return parseModule(0, ShouldLazyLoadMetadata, DataLayoutCallback);
}
Error BitcodeReader::typeCheckLoadStoreInst(Type *ValType, Type *PtrType) {
@@ -4135,7 +4205,7 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
if (!Vec->getType()->isVectorTy())
return error("Invalid type for value");
I = ExtractElementInst::Create(Vec, Idx);
- FullTy = FullTy->getVectorElementType();
+ FullTy = cast<VectorType>(FullTy)->getElementType();
InstructionList.push_back(I);
break;
}
@@ -4167,9 +4237,11 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
return error("Invalid record");
if (!Vec1->getType()->isVectorTy() || !Vec2->getType()->isVectorTy())
return error("Invalid type for value");
+
I = new ShuffleVectorInst(Vec1, Vec2, Mask);
- FullTy = VectorType::get(FullTy->getVectorElementType(),
- Mask->getType()->getVectorNumElements());
+ FullTy =
+ VectorType::get(cast<VectorType>(FullTy)->getElementType(),
+ cast<VectorType>(Mask->getType())->getElementCount());
InstructionList.push_back(I);
break;
}
@@ -4763,7 +4835,13 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
const DataLayout &DL = TheModule->getDataLayout();
unsigned AS = DL.getAllocaAddrSpace();
- AllocaInst *AI = new AllocaInst(Ty, AS, Size, Align);
+ SmallPtrSet<Type *, 4> Visited;
+ if (!Align && !Ty->isSized(&Visited))
+ return error("alloca of unsized type");
+ if (!Align)
+ Align = DL.getPrefTypeAlign(Ty);
+
+ AllocaInst *AI = new AllocaInst(Ty, AS, Size, *Align);
AI->setUsedWithInAlloca(InAlloca);
AI->setSwiftError(SwiftError);
I = AI;
@@ -4794,7 +4872,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
MaybeAlign Align;
if (Error Err = parseAlignmentValue(Record[OpNum], Align))
return Err;
- I = new LoadInst(Ty, Op, "", Record[OpNum + 1], Align);
+ SmallPtrSet<Type *, 4> Visited;
+ if (!Align && !Ty->isSized(&Visited))
+ return error("load of unsized type");
+ if (!Align)
+ Align = TheModule->getDataLayout().getABITypeAlign(Ty);
+ I = new LoadInst(Ty, Op, "", Record[OpNum + 1], *Align);
InstructionList.push_back(I);
break;
}
@@ -4831,7 +4914,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
MaybeAlign Align;
if (Error Err = parseAlignmentValue(Record[OpNum], Align))
return Err;
- I = new LoadInst(Ty, Op, "", Record[OpNum + 1], Align, Ordering, SSID);
+ if (!Align)
+ return error("Alignment missing from atomic load");
+ I = new LoadInst(Ty, Op, "", Record[OpNum + 1], *Align, Ordering, SSID);
InstructionList.push_back(I);
break;
}
@@ -4853,7 +4938,12 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
MaybeAlign Align;
if (Error Err = parseAlignmentValue(Record[OpNum], Align))
return Err;
- I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align);
+ SmallPtrSet<Type *, 4> Visited;
+ if (!Align && !Val->getType()->isSized(&Visited))
+ return error("store of unsized type");
+ if (!Align)
+ Align = TheModule->getDataLayout().getABITypeAlign(Val->getType());
+ I = new StoreInst(Val, Ptr, Record[OpNum + 1], *Align);
InstructionList.push_back(I);
break;
}
@@ -4886,7 +4976,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
MaybeAlign Align;
if (Error Err = parseAlignmentValue(Record[OpNum], Align))
return Err;
- I = new StoreInst(Val, Ptr, Record[OpNum + 1], Align, Ordering, SSID);
+ if (!Align)
+ return error("Alignment missing from atomic store");
+ I = new StoreInst(Val, Ptr, Record[OpNum + 1], *Align, Ordering, SSID);
InstructionList.push_back(I);
break;
}
@@ -4930,8 +5022,10 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
else
FailureOrdering = getDecodedOrdering(Record[OpNum + 3]);
- I = new AtomicCmpXchgInst(Ptr, Cmp, New, SuccessOrdering, FailureOrdering,
- SSID);
+ Align Alignment(
+ TheModule->getDataLayout().getTypeStoreSize(Cmp->getType()));
+ I = new AtomicCmpXchgInst(Ptr, Cmp, New, Alignment, SuccessOrdering,
+ FailureOrdering, SSID);
FullTy = StructType::get(Context, {FullTy, Type::getInt1Ty(Context)});
cast<AtomicCmpXchgInst>(I)->setVolatile(Record[OpNum]);
@@ -4968,7 +5062,9 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
Ordering == AtomicOrdering::Unordered)
return error("Invalid record");
SyncScope::ID SSID = getDecodedSyncScopeID(Record[OpNum + 3]);
- I = new AtomicRMWInst(Operation, Ptr, Val, Ordering, SSID);
+ Align Alignment(
+ TheModule->getDataLayout().getTypeStoreSize(Val->getType()));
+ I = new AtomicRMWInst(Operation, Ptr, Val, Alignment, Ordering, SSID);
FullTy = getPointerElementFlatType(FullTy);
cast<AtomicRMWInst>(I)->setVolatile(Record[OpNum+1]);
InstructionList.push_back(I);
@@ -5161,8 +5257,8 @@ Error BitcodeReader::parseFunctionBody(Function *F) {
!FullTy->isPointerTy() && !isa<StructType>(FullTy) &&
!isa<ArrayType>(FullTy) &&
(!isa<VectorType>(FullTy) ||
- FullTy->getVectorElementType()->isFloatingPointTy() ||
- FullTy->getVectorElementType()->isIntegerTy()) &&
+ cast<VectorType>(FullTy)->getElementType()->isFloatingPointTy() ||
+ cast<VectorType>(FullTy)->getElementType()->isIntegerTy()) &&
"Structured types must be assigned with corresponding non-opaque "
"pointer type");
}
@@ -5277,7 +5373,7 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
for (auto UI = I.first->materialized_user_begin(), UE = I.first->user_end();
UI != UE;)
// Don't expect any other users than call sites
- CallSite(*UI++).setCalledFunction(I.second);
+ cast<CallBase>(*UI++)->setCalledFunction(I.second);
// Finish fn->subprogram upgrade for materialized functions.
if (DISubprogram *SP = MDLoader->lookupSubprogramForFunction(F))
@@ -5294,6 +5390,9 @@ Error BitcodeReader::materialize(GlobalValue *GV) {
}
}
+ // Look for functions that rely on old function attribute behavior.
+ UpgradeFunctionAttributes(*F);
+
// Bring in any functions that this function forward-referenced via
// blockaddresses.
return materializeForwardReferencedFunctions();
@@ -5722,6 +5821,41 @@ static void parseTypeIdSummaryRecord(ArrayRef<uint64_t> Record,
parseWholeProgramDevirtResolution(Record, Strtab, Slot, TypeId);
}
+static std::vector<FunctionSummary::ParamAccess>
+parseParamAccesses(ArrayRef<uint64_t> Record) {
+ auto ReadRange = [&]() {
+ APInt Lower(FunctionSummary::ParamAccess::RangeWidth,
+ BitcodeReader::decodeSignRotatedValue(Record.front()));
+ Record = Record.drop_front();
+ APInt Upper(FunctionSummary::ParamAccess::RangeWidth,
+ BitcodeReader::decodeSignRotatedValue(Record.front()));
+ Record = Record.drop_front();
+ ConstantRange Range{Lower, Upper};
+ assert(!Range.isFullSet());
+ assert(!Range.isUpperSignWrapped());
+ return Range;
+ };
+
+ std::vector<FunctionSummary::ParamAccess> PendingParamAccesses;
+ while (!Record.empty()) {
+ PendingParamAccesses.emplace_back();
+ FunctionSummary::ParamAccess &ParamAccess = PendingParamAccesses.back();
+ ParamAccess.ParamNo = Record.front();
+ Record = Record.drop_front();
+ ParamAccess.Use = ReadRange();
+ ParamAccess.Calls.resize(Record.front());
+ Record = Record.drop_front();
+ for (auto &Call : ParamAccess.Calls) {
+ Call.ParamNo = Record.front();
+ Record = Record.drop_front();
+ Call.Callee = Record.front();
+ Record = Record.drop_front();
+ Call.Offsets = ReadRange();
+ }
+ }
+ return PendingParamAccesses;
+}
+
void ModuleSummaryIndexBitcodeReader::parseTypeIdCompatibleVtableInfo(
ArrayRef<uint64_t> Record, size_t &Slot,
TypeIdCompatibleVtableInfo &TypeId) {
@@ -5799,6 +5933,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
PendingTypeCheckedLoadVCalls;
std::vector<FunctionSummary::ConstVCall> PendingTypeTestAssumeConstVCalls,
PendingTypeCheckedLoadConstVCalls;
+ std::vector<FunctionSummary::ParamAccess> PendingParamAccesses;
while (true) {
Expected<BitstreamEntry> MaybeEntry = Stream.advanceSkippingSubblocks();
@@ -5832,35 +5967,7 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
default: // Default behavior: ignore.
break;
case bitc::FS_FLAGS: { // [flags]
- uint64_t Flags = Record[0];
- // Scan flags.
- assert(Flags <= 0x3f && "Unexpected bits in flag");
-
- // 1 bit: WithGlobalValueDeadStripping flag.
- // Set on combined index only.
- if (Flags & 0x1)
- TheIndex.setWithGlobalValueDeadStripping();
- // 1 bit: SkipModuleByDistributedBackend flag.
- // Set on combined index only.
- if (Flags & 0x2)
- TheIndex.setSkipModuleByDistributedBackend();
- // 1 bit: HasSyntheticEntryCounts flag.
- // Set on combined index only.
- if (Flags & 0x4)
- TheIndex.setHasSyntheticEntryCounts();
- // 1 bit: DisableSplitLTOUnit flag.
- // Set on per module indexes. It is up to the client to validate
- // the consistency of this flag across modules being linked.
- if (Flags & 0x8)
- TheIndex.setEnableSplitLTOUnit();
- // 1 bit: PartiallySplitLTOUnits flag.
- // Set on combined index only.
- if (Flags & 0x10)
- TheIndex.setPartiallySplitLTOUnits();
- // 1 bit: WithAttributePropagation flag.
- // Set on combined index only.
- if (Flags & 0x20)
- TheIndex.setWithAttributePropagation();
+ TheIndex.setFlags(Record[0]);
break;
}
case bitc::FS_VALUE_GUID: { // [valueid, refguid]
@@ -5925,7 +6032,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
std::move(PendingTypeTestAssumeVCalls),
std::move(PendingTypeCheckedLoadVCalls),
std::move(PendingTypeTestAssumeConstVCalls),
- std::move(PendingTypeCheckedLoadConstVCalls));
+ std::move(PendingTypeCheckedLoadConstVCalls),
+ std::move(PendingParamAccesses));
auto VIAndOriginalGUID = getValueInfoFromValueId(ValueID);
FS->setModulePath(getThisModule()->first());
FS->setOriginalName(VIAndOriginalGUID.second);
@@ -5965,7 +6073,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
uint64_t RawFlags = Record[1];
unsigned RefArrayStart = 2;
GlobalVarSummary::GVarFlags GVF(/* ReadOnly */ false,
- /* WriteOnly */ false);
+ /* WriteOnly */ false,
+ /* Constant */ false,
+ GlobalObject::VCallVisibilityPublic);
auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
if (Version >= 5) {
GVF = getDecodedGVarFlags(Record[2]);
@@ -6065,7 +6175,8 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
std::move(PendingTypeTestAssumeVCalls),
std::move(PendingTypeCheckedLoadVCalls),
std::move(PendingTypeTestAssumeConstVCalls),
- std::move(PendingTypeCheckedLoadConstVCalls));
+ std::move(PendingTypeCheckedLoadConstVCalls),
+ std::move(PendingParamAccesses));
LastSeenSummary = FS.get();
LastSeenGUID = VI.getGUID();
FS->setModulePath(ModuleIdMap[ModuleId]);
@@ -6101,7 +6212,9 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
uint64_t RawFlags = Record[2];
unsigned RefArrayStart = 3;
GlobalVarSummary::GVarFlags GVF(/* ReadOnly */ false,
- /* WriteOnly */ false);
+ /* WriteOnly */ false,
+ /* Constant */ false,
+ GlobalObject::VCallVisibilityPublic);
auto Flags = getDecodedGVSummaryFlags(RawFlags, Version);
if (Version >= 5) {
GVF = getDecodedGVarFlags(Record[3]);
@@ -6181,6 +6294,15 @@ Error ModuleSummaryIndexBitcodeReader::parseEntireSummary(unsigned ID) {
case bitc::FS_TYPE_ID_METADATA:
parseTypeIdCompatibleVtableSummaryRecord(Record);
break;
+
+ case bitc::FS_BLOCK_COUNT:
+ TheIndex.addBlockCount(Record[0]);
+ break;
+
+ case bitc::FS_PARAM_ACCESS: {
+ PendingParamAccesses = parseParamAccesses(Record);
+ break;
+ }
}
}
llvm_unreachable("Exit infinite loop");
@@ -6452,7 +6574,8 @@ llvm::getBitcodeFileContents(MemoryBufferRef Buffer) {
/// everything.
Expected<std::unique_ptr<Module>>
BitcodeModule::getModuleImpl(LLVMContext &Context, bool MaterializeAll,
- bool ShouldLazyLoadMetadata, bool IsImporting) {
+ bool ShouldLazyLoadMetadata, bool IsImporting,
+ DataLayoutCallbackTy DataLayoutCallback) {
BitstreamCursor Stream(Buffer);
std::string ProducerIdentification;
@@ -6477,8 +6600,8 @@ BitcodeModule::getModuleImpl(LLVMContext &Context, bool MaterializeAll,
M->setMaterializer(R);
// Delay parsing Metadata if ShouldLazyLoadMetadata is true.
- if (Error Err =
- R->parseBitcodeInto(M.get(), ShouldLazyLoadMetadata, IsImporting))
+ if (Error Err = R->parseBitcodeInto(M.get(), ShouldLazyLoadMetadata,
+ IsImporting, DataLayoutCallback))
return std::move(Err);
if (MaterializeAll) {
@@ -6496,7 +6619,8 @@ BitcodeModule::getModuleImpl(LLVMContext &Context, bool MaterializeAll,
Expected<std::unique_ptr<Module>>
BitcodeModule::getLazyModule(LLVMContext &Context, bool ShouldLazyLoadMetadata,
bool IsImporting) {
- return getModuleImpl(Context, false, ShouldLazyLoadMetadata, IsImporting);
+ return getModuleImpl(Context, false, ShouldLazyLoadMetadata, IsImporting,
+ [](StringRef) { return None; });
}
// Parse the specified bitcode buffer and merge the index into CombinedIndex.
@@ -6662,19 +6786,21 @@ Expected<std::unique_ptr<Module>> llvm::getOwningLazyBitcodeModule(
}
Expected<std::unique_ptr<Module>>
-BitcodeModule::parseModule(LLVMContext &Context) {
- return getModuleImpl(Context, true, false, false);
+BitcodeModule::parseModule(LLVMContext &Context,
+ DataLayoutCallbackTy DataLayoutCallback) {
+ return getModuleImpl(Context, true, false, false, DataLayoutCallback);
// TODO: Restore the use-lists to the in-memory state when the bitcode was
// written. We must defer until the Module has been fully materialized.
}
-Expected<std::unique_ptr<Module>> llvm::parseBitcodeFile(MemoryBufferRef Buffer,
- LLVMContext &Context) {
+Expected<std::unique_ptr<Module>>
+llvm::parseBitcodeFile(MemoryBufferRef Buffer, LLVMContext &Context,
+ DataLayoutCallbackTy DataLayoutCallback) {
Expected<BitcodeModule> BM = getSingleModule(Buffer);
if (!BM)
return BM.takeError();
- return BM->parseModule(Context);
+ return BM->parseModule(Context, DataLayoutCallback);
}
Expected<std::string> llvm::getBitcodeTargetTriple(MemoryBufferRef Buffer) {
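An aside on the encoding used by parseParamAccesses above: decodeSignRotatedValue undoes the usual bitcode sign-rotation, where the sign is folded into bit 0 so that small magnitudes stay small under VBR. A minimal round-trip sketch follows; it is illustrative only, encodeSignRotated/decodeSignRotated are made-up names, and the decode side mirrors decodeSignRotatedValue.

// Sign-rotation round trip: non-negative x -> x<<1, negative x -> ((-x)<<1)|1.
static uint64_t encodeSignRotated(int64_t I) {
  uint64_t U = I;
  return I >= 0 ? U << 1 : ((-U) << 1) | 1;
}
static uint64_t decodeSignRotated(uint64_t V) {
  if ((V & 1) == 0)
    return V >> 1;      // even: non-negative value
  if (V != 1)
    return -(V >> 1);   // odd: negated magnitude
  return 1ULL << 63;    // "-0" stands for INT64_MIN
}
// e.g. encodeSignRotated(-3) == 7 and decodeSignRotated(7) == uint64_t(-3).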
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
index d16c3b0ff59d..a8bf579bd180 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.cpp
@@ -1132,7 +1132,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_OLD_FN_NODE: {
- // FIXME: Remove in 4.0.
+ // Deprecated, but still needed to read old bitcode files.
// This is a LocalAsMetadata record, the only type of function-local
// metadata.
if (Record.size() % 2 == 1)
@@ -1162,7 +1162,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_OLD_NODE: {
- // FIXME: Remove in 4.0.
+ // Deprecated, but still needed to read old bitcode files.
if (Record.size() % 2 == 1)
return error("Invalid record");
@@ -1258,14 +1258,24 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
// Operand 'count' is interpreted as:
// - Signed integer (version 0)
// - Metadata node (version 1)
+ // Operand 'lowerBound' is interpreted as:
+ // - Signed integer (version 0 and 1)
+ // - Metadata node (version 2)
+ // Operands 'upperBound' and 'stride' are interpreted as:
+ // - Metadata node (version 2)
switch (Record[0] >> 1) {
case 0:
Val = GET_OR_DISTINCT(DISubrange,
- (Context, Record[1], unrotateSign(Record.back())));
+ (Context, Record[1], unrotateSign(Record[2])));
break;
case 1:
Val = GET_OR_DISTINCT(DISubrange, (Context, getMDOrNull(Record[1]),
- unrotateSign(Record.back())));
+ unrotateSign(Record[2])));
+ break;
+ case 2:
+ Val = GET_OR_DISTINCT(
+ DISubrange, (Context, getMDOrNull(Record[1]), getMDOrNull(Record[2]),
+ getMDOrNull(Record[3]), getMDOrNull(Record[4])));
break;
default:
return error("Invalid record: Unsupported version of DISubrange");
@@ -1277,14 +1287,24 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_ENUMERATOR: {
- if (Record.size() != 3)
+ if (Record.size() < 3)
return error("Invalid record");
IsDistinct = Record[0] & 1;
bool IsUnsigned = Record[0] & 2;
+ bool IsBigInt = Record[0] & 4;
+ APInt Value;
+
+ if (IsBigInt) {
+ const uint64_t BitWidth = Record[1];
+ const size_t NumWords = Record.size() - 3;
+ Value = readWideAPInt(makeArrayRef(&Record[3], NumWords), BitWidth);
+ } else
+ Value = APInt(64, unrotateSign(Record[1]), !IsUnsigned);
+
MetadataList.assignValue(
- GET_OR_DISTINCT(DIEnumerator, (Context, unrotateSign(Record[1]),
- IsUnsigned, getMDString(Record[2]))),
+ GET_OR_DISTINCT(DIEnumerator,
+ (Context, Value, IsUnsigned, getMDString(Record[2]))),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1330,7 +1350,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_COMPOSITE_TYPE: {
- if (Record.size() < 16 || Record.size() > 17)
+ if (Record.size() < 16 || Record.size() > 18)
return error("Invalid record");
// If we have a UUID and this is not a forward declaration, lookup the
@@ -1354,6 +1374,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Metadata *VTableHolder = nullptr;
Metadata *TemplateParams = nullptr;
Metadata *Discriminator = nullptr;
+ Metadata *DataLocation = nullptr;
auto *Identifier = getMDString(Record[15]);
// If this module is being parsed so that it can be ThinLTO imported
// into another module, composite types only need to be imported
@@ -1376,13 +1397,15 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
TemplateParams = getMDOrNull(Record[14]);
if (Record.size() > 16)
Discriminator = getMDOrNull(Record[16]);
+ if (Record.size() > 17)
+ DataLocation = getMDOrNull(Record[17]);
}
DICompositeType *CT = nullptr;
if (Identifier)
CT = DICompositeType::buildODRType(
Context, *Identifier, Tag, Name, File, Line, Scope, BaseType,
SizeInBits, AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, Discriminator);
+ VTableHolder, TemplateParams, Discriminator, DataLocation);
// Create a node if we didn't get a lazy ODR type.
if (!CT)
@@ -1390,7 +1413,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
(Context, Tag, Name, File, Line, Scope, BaseType,
SizeInBits, AlignInBits, OffsetInBits, Flags,
Elements, RuntimeLang, VTableHolder, TemplateParams,
- Identifier, Discriminator));
+ Identifier, Discriminator, DataLocation));
if (!IsNotUsedInTypeRef && Identifier)
MetadataList.addTypeRef(*Identifier, *cast<DICompositeType>(CT));
@@ -1418,15 +1441,19 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
}
case bitc::METADATA_MODULE: {
- if (Record.size() != 6)
+ if (Record.size() < 5 || Record.size() > 8)
return error("Invalid record");
+ unsigned Offset = Record.size() >= 7 ? 2 : 1;
IsDistinct = Record[0];
MetadataList.assignValue(
- GET_OR_DISTINCT(DIModule,
- (Context, getMDOrNull(Record[1]),
- getMDString(Record[2]), getMDString(Record[3]),
- getMDString(Record[4]), getMDString(Record[5]))),
+ GET_OR_DISTINCT(
+ DIModule,
+ (Context, Record.size() >= 7 ? getMDOrNull(Record[1]) : nullptr,
+ getMDOrNull(Record[0 + Offset]), getMDString(Record[1 + Offset]),
+ getMDString(Record[2 + Offset]), getMDString(Record[3 + Offset]),
+ getMDString(Record[4 + Offset]),
+ Record.size() <= 7 ? 0 : Record[7])),
NextMetadataNo);
NextMetadataNo++;
break;
@@ -1457,7 +1484,7 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_COMPILE_UNIT: {
- if (Record.size() < 14 || Record.size() > 19)
+ if (Record.size() < 14 || Record.size() > 22)
return error("Invalid record");
// Ignore Record[0], which indicates whether this compile unit is
@@ -1473,7 +1500,9 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
Record.size() <= 16 ? true : Record[16],
Record.size() <= 17 ? false : Record[17],
Record.size() <= 18 ? 0 : Record[18],
- Record.size() <= 19 ? 0 : Record[19]);
+ Record.size() <= 19 ? 0 : Record[19],
+ Record.size() <= 20 ? nullptr : getMDString(Record[20]),
+ Record.size() <= 21 ? nullptr : getMDString(Record[21]));
MetadataList.assignValue(CU, NextMetadataNo);
NextMetadataNo++;
@@ -1667,27 +1696,34 @@ Error MetadataLoader::MetadataLoaderImpl::parseOneMetadata(
break;
}
case bitc::METADATA_TEMPLATE_TYPE: {
- if (Record.size() != 3)
+ if (Record.size() < 3 || Record.size() > 4)
return error("Invalid record");
IsDistinct = Record[0];
- MetadataList.assignValue(GET_OR_DISTINCT(DITemplateTypeParameter,
- (Context, getMDString(Record[1]),
- getDITypeRefOrNull(Record[2]))),
- NextMetadataNo);
+ MetadataList.assignValue(
+ GET_OR_DISTINCT(DITemplateTypeParameter,
+ (Context, getMDString(Record[1]),
+ getDITypeRefOrNull(Record[2]),
+ (Record.size() == 4) ? getMDOrNull(Record[3])
+ : getMDOrNull(false))),
+ NextMetadataNo);
NextMetadataNo++;
break;
}
case bitc::METADATA_TEMPLATE_VALUE: {
- if (Record.size() != 5)
+ if (Record.size() < 5 || Record.size() > 6)
return error("Invalid record");
IsDistinct = Record[0];
+
MetadataList.assignValue(
- GET_OR_DISTINCT(DITemplateValueParameter,
- (Context, Record[1], getMDString(Record[2]),
- getDITypeRefOrNull(Record[3]),
- getMDOrNull(Record[4]))),
+ GET_OR_DISTINCT(
+ DITemplateValueParameter,
+ (Context, Record[1], getMDString(Record[2]),
+ getDITypeRefOrNull(Record[3]),
+ (Record.size() == 6) ? getMDOrNull(Record[4]) : getMDOrNull(false),
+ (Record.size() == 6) ? getMDOrNull(Record[5])
+ : getMDOrNull(Record[4]))),
NextMetadataNo);
NextMetadataNo++;
break;
diff --git a/llvm/lib/Bitcode/Reader/MetadataLoader.h b/llvm/lib/Bitcode/Reader/MetadataLoader.h
index fe2b20273249..709800850f0d 100644
--- a/llvm/lib/Bitcode/Reader/MetadataLoader.h
+++ b/llvm/lib/Bitcode/Reader/MetadataLoader.h
@@ -23,11 +23,9 @@ namespace llvm {
class BitcodeReaderValueList;
class BitstreamCursor;
class DISubprogram;
-class Error;
class Function;
class Instruction;
class Metadata;
-class MDNode;
class Module;
class Type;
diff --git a/llvm/lib/Bitcode/Reader/ValueList.cpp b/llvm/lib/Bitcode/Reader/ValueList.cpp
index 431995fd40ac..63a206eeb022 100644
--- a/llvm/lib/Bitcode/Reader/ValueList.cpp
+++ b/llvm/lib/Bitcode/Reader/ValueList.cpp
@@ -220,6 +220,6 @@ void BitcodeReaderValueList::resolveConstantForwardRefs() {
// Update all ValueHandles, they should be the only users at this point.
Placeholder->replaceAllUsesWith(RealVal);
- Placeholder->deleteValue();
+ delete cast<ConstantPlaceHolder>(Placeholder);
}
}
diff --git a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
index dcff7c421fc4..9c15a5f9f193 100644
--- a/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
+++ b/llvm/lib/Bitcode/Writer/BitcodeWriter.cpp
@@ -31,7 +31,6 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -374,7 +373,7 @@ private:
void writeModuleConstants();
bool pushValueAndType(const Value *V, unsigned InstID,
SmallVectorImpl<unsigned> &Vals);
- void writeOperandBundles(ImmutableCallSite CS, unsigned InstID);
+ void writeOperandBundles(const CallBase &CB, unsigned InstID);
void pushValue(const Value *V, unsigned InstID,
SmallVectorImpl<unsigned> &Vals);
void pushValueSigned(const Value *V, unsigned InstID,
@@ -648,6 +647,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_NO_INLINE;
case Attribute::NoRecurse:
return bitc::ATTR_KIND_NO_RECURSE;
+ case Attribute::NoMerge:
+ return bitc::ATTR_KIND_NO_MERGE;
case Attribute::NonLazyBind:
return bitc::ATTR_KIND_NON_LAZY_BIND;
case Attribute::NonNull:
@@ -666,6 +667,8 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_NOCF_CHECK;
case Attribute::NoUnwind:
return bitc::ATTR_KIND_NO_UNWIND;
+ case Attribute::NullPointerIsValid:
+ return bitc::ATTR_KIND_NULL_POINTER_IS_VALID;
case Attribute::OptForFuzzing:
return bitc::ATTR_KIND_OPT_FOR_FUZZING;
case Attribute::OptimizeForSize:
@@ -726,10 +729,17 @@ static uint64_t getAttrKindEncoding(Attribute::AttrKind Kind) {
return bitc::ATTR_KIND_IMMARG;
case Attribute::SanitizeMemTag:
return bitc::ATTR_KIND_SANITIZE_MEMTAG;
+ case Attribute::Preallocated:
+ return bitc::ATTR_KIND_PREALLOCATED;
+ case Attribute::NoUndef:
+ return bitc::ATTR_KIND_NOUNDEF;
case Attribute::EndAttrKinds:
llvm_unreachable("Can not encode end-attribute kinds marker.");
case Attribute::None:
llvm_unreachable("Can not encode none-attribute.");
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
+ llvm_unreachable("Trying to encode EmptyKey/TombstoneKey");
}
llvm_unreachable("Trying to encode unknown attribute");
@@ -875,6 +885,7 @@ void ModuleBitcodeWriter::writeTypeTable() {
switch (T->getTypeID()) {
case Type::VoidTyID: Code = bitc::TYPE_CODE_VOID; break;
case Type::HalfTyID: Code = bitc::TYPE_CODE_HALF; break;
+ case Type::BFloatTyID: Code = bitc::TYPE_CODE_BFLOAT; break;
case Type::FloatTyID: Code = bitc::TYPE_CODE_FLOAT; break;
case Type::DoubleTyID: Code = bitc::TYPE_CODE_DOUBLE; break;
case Type::X86_FP80TyID: Code = bitc::TYPE_CODE_X86_FP80; break;
@@ -946,15 +957,16 @@ void ModuleBitcodeWriter::writeTypeTable() {
AbbrevToUse = ArrayAbbrev;
break;
}
- case Type::VectorTyID: {
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
VectorType *VT = cast<VectorType>(T);
// VECTOR [numelts, eltty] or
// [numelts, eltty, scalable]
Code = bitc::TYPE_CODE_VECTOR;
- TypeVals.push_back(VT->getNumElements());
+ TypeVals.push_back(VT->getElementCount().Min);
TypeVals.push_back(VE.getTypeID(VT->getElementType()));
- if (VT->isScalable())
- TypeVals.push_back(VT->isScalable());
+ if (isa<ScalableVectorType>(VT))
+ TypeVals.push_back(true);
break;
}
}
@@ -1028,7 +1040,8 @@ static uint64_t getEncodedGVSummaryFlags(GlobalValueSummary::GVFlags Flags) {
}
static uint64_t getEncodedGVarFlags(GlobalVarSummary::GVarFlags Flags) {
- uint64_t RawFlags = Flags.MaybeReadOnly | (Flags.MaybeWriteOnly << 1);
+ uint64_t RawFlags = Flags.MaybeReadOnly | (Flags.MaybeWriteOnly << 1) |
+ (Flags.Constant << 2) | Flags.VCallVisibility << 3;
return RawFlags;
}
@@ -1168,12 +1181,12 @@ void ModuleBitcodeWriter::writeModuleInfo() {
std::map<std::string, unsigned> GCMap;
unsigned MaxAlignment = 0;
unsigned MaxGlobalType = 0;
- for (const GlobalValue &GV : M.globals()) {
+ for (const GlobalVariable &GV : M.globals()) {
MaxAlignment = std::max(MaxAlignment, GV.getAlignment());
MaxGlobalType = std::max(MaxGlobalType, VE.getTypeID(GV.getValueType()));
if (GV.hasSection()) {
// Give section names unique ID's.
- unsigned &Entry = SectionMap[GV.getSection()];
+ unsigned &Entry = SectionMap[std::string(GV.getSection())];
if (!Entry) {
writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME, GV.getSection(),
0 /*TODO*/);
@@ -1185,7 +1198,7 @@ void ModuleBitcodeWriter::writeModuleInfo() {
MaxAlignment = std::max(MaxAlignment, F.getAlignment());
if (F.hasSection()) {
// Give section names unique ID's.
- unsigned &Entry = SectionMap[F.getSection()];
+ unsigned &Entry = SectionMap[std::string(F.getSection())];
if (!Entry) {
writeStringRecord(Stream, bitc::MODULE_CODE_SECTIONNAME, F.getSection(),
0 /*TODO*/);
@@ -1275,7 +1288,8 @@ void ModuleBitcodeWriter::writeModuleInfo() {
(VE.getValueID(GV.getInitializer()) + 1));
Vals.push_back(getEncodedLinkage(GV));
Vals.push_back(Log2_32(GV.getAlignment())+1);
- Vals.push_back(GV.hasSection() ? SectionMap[GV.getSection()] : 0);
+ Vals.push_back(GV.hasSection() ? SectionMap[std::string(GV.getSection())]
+ : 0);
if (GV.isThreadLocal() ||
GV.getVisibility() != GlobalValue::DefaultVisibility ||
GV.getUnnamedAddr() != GlobalValue::UnnamedAddr::None ||
@@ -1320,7 +1334,8 @@ void ModuleBitcodeWriter::writeModuleInfo() {
Vals.push_back(getEncodedLinkage(F));
Vals.push_back(VE.getAttributeListID(F.getAttributes()));
Vals.push_back(Log2_32(F.getAlignment())+1);
- Vals.push_back(F.hasSection() ? SectionMap[F.getSection()] : 0);
+ Vals.push_back(F.hasSection() ? SectionMap[std::string(F.getSection())]
+ : 0);
Vals.push_back(getEncodedVisibility(F));
Vals.push_back(F.hasGC() ? GCMap[F.getGC()] : 0);
Vals.push_back(getEncodedUnnamedAddr(F));
@@ -1506,29 +1521,46 @@ void ModuleBitcodeWriter::writeGenericDINode(const GenericDINode *N,
Record.clear();
}
-static uint64_t rotateSign(int64_t I) {
- uint64_t U = I;
- return I < 0 ? ~(U << 1) : U << 1;
-}
-
void ModuleBitcodeWriter::writeDISubrange(const DISubrange *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
- const uint64_t Version = 1 << 1;
+ const uint64_t Version = 2 << 1;
Record.push_back((uint64_t)N->isDistinct() | Version);
Record.push_back(VE.getMetadataOrNullID(N->getRawCountNode()));
- Record.push_back(rotateSign(N->getLowerBound()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawLowerBound()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawUpperBound()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawStride()));
Stream.EmitRecord(bitc::METADATA_SUBRANGE, Record, Abbrev);
Record.clear();
}
+static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
+ if ((int64_t)V >= 0)
+ Vals.push_back(V << 1);
+ else
+ Vals.push_back((-V << 1) | 1);
+}
+
+static void emitWideAPInt(SmallVectorImpl<uint64_t> &Vals, const APInt &A) {
+ // We have an arbitrary precision integer value to write whose
+ // bit width is > 64. However, in canonical unsigned integer
+ // format it is likely that the high bits are going to be zero.
+ // So, we only write the number of active words.
+ unsigned NumWords = A.getActiveWords();
+ const uint64_t *RawData = A.getRawData();
+ for (unsigned i = 0; i < NumWords; i++)
+ emitSignedInt64(Vals, RawData[i]);
+}
+
void ModuleBitcodeWriter::writeDIEnumerator(const DIEnumerator *N,
SmallVectorImpl<uint64_t> &Record,
unsigned Abbrev) {
- Record.push_back((N->isUnsigned() << 1) | N->isDistinct());
- Record.push_back(rotateSign(N->getValue()));
+ const uint64_t IsBigInt = 1 << 2;
+ Record.push_back(IsBigInt | (N->isUnsigned() << 1) | N->isDistinct());
+ Record.push_back(N->getValue().getBitWidth());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
+ emitWideAPInt(Record, N->getValue());
Stream.EmitRecord(bitc::METADATA_ENUMERATOR, Record, Abbrev);
Record.clear();
@@ -1597,6 +1629,7 @@ void ModuleBitcodeWriter::writeDICompositeType(
Record.push_back(VE.getMetadataOrNullID(N->getTemplateParams().get()));
Record.push_back(VE.getMetadataOrNullID(N->getRawIdentifier()));
Record.push_back(VE.getMetadataOrNullID(N->getDiscriminator()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawDataLocation()));
Stream.EmitRecord(bitc::METADATA_COMPOSITE_TYPE, Record, Abbrev);
Record.clear();
@@ -1661,6 +1694,9 @@ void ModuleBitcodeWriter::writeDICompileUnit(const DICompileUnit *N,
Record.push_back(N->getSplitDebugInlining());
Record.push_back(N->getDebugInfoForProfiling());
Record.push_back((unsigned)N->getNameTableKind());
+ Record.push_back(N->getRangesBaseAddress());
+ Record.push_back(VE.getMetadataOrNullID(N->getRawSysRoot()));
+ Record.push_back(VE.getMetadataOrNullID(N->getRawSDK()));
Stream.EmitRecord(bitc::METADATA_COMPILE_UNIT, Record, Abbrev);
Record.clear();
@@ -1776,6 +1812,7 @@ void ModuleBitcodeWriter::writeDIModule(const DIModule *N,
Record.push_back(N->isDistinct());
for (auto &I : N->operands())
Record.push_back(VE.getMetadataOrNullID(I));
+ Record.push_back(N->getLineNo());
Stream.EmitRecord(bitc::METADATA_MODULE, Record, Abbrev);
Record.clear();
@@ -1787,6 +1824,7 @@ void ModuleBitcodeWriter::writeDITemplateTypeParameter(
Record.push_back(N->isDistinct());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getType()));
+ Record.push_back(N->isDefault());
Stream.EmitRecord(bitc::METADATA_TEMPLATE_TYPE, Record, Abbrev);
Record.clear();
@@ -1799,6 +1837,7 @@ void ModuleBitcodeWriter::writeDITemplateValueParameter(
Record.push_back(N->getTag());
Record.push_back(VE.getMetadataOrNullID(N->getRawName()));
Record.push_back(VE.getMetadataOrNullID(N->getType()));
+ Record.push_back(N->isDefault());
Record.push_back(VE.getMetadataOrNullID(N->getValue()));
Stream.EmitRecord(bitc::METADATA_TEMPLATE_VALUE, Record, Abbrev);
@@ -2258,13 +2297,6 @@ void ModuleBitcodeWriter::writeSyncScopeNames() {
Stream.ExitBlock();
}
-static void emitSignedInt64(SmallVectorImpl<uint64_t> &Vals, uint64_t V) {
- if ((int64_t)V >= 0)
- Vals.push_back(V << 1);
- else
- Vals.push_back((-V << 1) | 1);
-}
-
void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
bool isGlobal) {
if (FirstVal == LastVal) return;
@@ -2351,21 +2383,14 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
Code = bitc::CST_CODE_INTEGER;
AbbrevToUse = CONSTANTS_INTEGER_ABBREV;
} else { // Wide integers, > 64 bits in size.
- // We have an arbitrary precision integer value to write whose
- // bit width is > 64. However, in canonical unsigned integer
- // format it is likely that the high bits are going to be zero.
- // So, we only write the number of active words.
- unsigned NWords = IV->getValue().getActiveWords();
- const uint64_t *RawWords = IV->getValue().getRawData();
- for (unsigned i = 0; i != NWords; ++i) {
- emitSignedInt64(Record, RawWords[i]);
- }
+ emitWideAPInt(Record, IV->getValue());
Code = bitc::CST_CODE_WIDE_INTEGER;
}
} else if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
Code = bitc::CST_CODE_FLOAT;
Type *Ty = CFP->getType();
- if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) {
+ if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() ||
+ Ty->isDoubleTy()) {
Record.push_back(CFP->getValueAPF().bitcastToAPInt().getZExtValue());
} else if (Ty->isX86_FP80Ty()) {
// api needed to prevent premature destruction
@@ -2412,7 +2437,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
} else if (const ConstantDataSequential *CDS =
dyn_cast<ConstantDataSequential>(C)) {
Code = bitc::CST_CODE_DATA;
- Type *EltTy = CDS->getType()->getElementType();
+ Type *EltTy = CDS->getElementType();
if (isa<IntegerType>(EltTy)) {
for (unsigned i = 0, e = CDS->getNumElements(); i != e; ++i)
Record.push_back(CDS->getElementAsInteger(i));
@@ -2504,7 +2529,7 @@ void ModuleBitcodeWriter::writeConstants(unsigned FirstVal, unsigned LastVal,
}
Record.push_back(VE.getValueID(C->getOperand(0)));
Record.push_back(VE.getValueID(C->getOperand(1)));
- Record.push_back(VE.getValueID(C->getOperand(2)));
+ Record.push_back(VE.getValueID(CE->getShuffleMaskForBitcode()));
break;
case Instruction::ICmp:
case Instruction::FCmp:
@@ -2566,10 +2591,10 @@ bool ModuleBitcodeWriter::pushValueAndType(const Value *V, unsigned InstID,
return false;
}
-void ModuleBitcodeWriter::writeOperandBundles(ImmutableCallSite CS,
+void ModuleBitcodeWriter::writeOperandBundles(const CallBase &CS,
unsigned InstID) {
SmallVector<unsigned, 64> Record;
- LLVMContext &C = CS.getInstruction()->getContext();
+ LLVMContext &C = CS.getContext();
for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
const auto &Bundle = CS.getOperandBundleAt(i);
@@ -2691,7 +2716,8 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Code = bitc::FUNC_CODE_INST_SHUFFLEVEC;
pushValueAndType(I.getOperand(0), InstID, Vals);
pushValue(I.getOperand(1), InstID, Vals);
- pushValue(I.getOperand(2), InstID, Vals);
+ pushValue(cast<ShuffleVectorInst>(I).getShuffleMaskForBitcode(), InstID,
+ Vals);
break;
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -2756,11 +2782,11 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
case Instruction::Invoke: {
const InvokeInst *II = cast<InvokeInst>(&I);
- const Value *Callee = II->getCalledValue();
+ const Value *Callee = II->getCalledOperand();
FunctionType *FTy = II->getFunctionType();
if (II->hasOperandBundles())
- writeOperandBundles(II, InstID);
+ writeOperandBundles(*II, InstID);
Code = bitc::FUNC_CODE_INST_INVOKE;
@@ -2832,11 +2858,11 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
}
case Instruction::CallBr: {
const CallBrInst *CBI = cast<CallBrInst>(&I);
- const Value *Callee = CBI->getCalledValue();
+ const Value *Callee = CBI->getCalledOperand();
FunctionType *FTy = CBI->getFunctionType();
if (CBI->hasOperandBundles())
- writeOperandBundles(CBI, InstID);
+ writeOperandBundles(*CBI, InstID);
Code = bitc::FUNC_CODE_INST_CALLBR;
@@ -2993,7 +3019,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
FunctionType *FTy = CI.getFunctionType();
if (CI.hasOperandBundles())
- writeOperandBundles(&CI, InstID);
+ writeOperandBundles(CI, InstID);
Code = bitc::FUNC_CODE_INST_CALL;
@@ -3010,7 +3036,7 @@ void ModuleBitcodeWriter::writeInstruction(const Instruction &I,
Vals.push_back(Flags);
Vals.push_back(VE.getTypeID(FTy));
- pushValueAndType(CI.getCalledValue(), InstID, Vals); // Callee
+ pushValueAndType(CI.getCalledOperand(), InstID, Vals); // Callee
// Emit value #'s for the fixed parameters.
for (unsigned i = 0, e = FTy->getNumParams(); i != e; ++i) {
@@ -3552,6 +3578,29 @@ static void writeFunctionTypeMetadataRecords(BitstreamWriter &Stream,
FS->type_test_assume_const_vcalls());
WriteConstVCallVec(bitc::FS_TYPE_CHECKED_LOAD_CONST_VCALL,
FS->type_checked_load_const_vcalls());
+
+ auto WriteRange = [&](ConstantRange Range) {
+ Range = Range.sextOrTrunc(FunctionSummary::ParamAccess::RangeWidth);
+ assert(Range.getLower().getNumWords() == 1);
+ assert(Range.getUpper().getNumWords() == 1);
+ emitSignedInt64(Record, *Range.getLower().getRawData());
+ emitSignedInt64(Record, *Range.getUpper().getRawData());
+ };
+
+ if (!FS->paramAccesses().empty()) {
+ Record.clear();
+ for (auto &Arg : FS->paramAccesses()) {
+ Record.push_back(Arg.ParamNo);
+ WriteRange(Arg.Use);
+ Record.push_back(Arg.Calls.size());
+ for (auto &Call : Arg.Calls) {
+ Record.push_back(Call.ParamNo);
+ Record.push_back(Call.Callee);
+ WriteRange(Call.Offsets);
+ }
+ }
+ Stream.EmitRecord(bitc::FS_PARAM_ACCESS, Record);
+ }
}
/// Collect type IDs from type tests used by function.
@@ -3882,6 +3931,9 @@ void ModuleBitcodeWriterBase::writePerModuleGlobalValueSummary() {
NameVals.clear();
}
+ Stream.EmitRecord(bitc::FS_BLOCK_COUNT,
+ ArrayRef<uint64_t>{Index->getBlockCount()});
+
Stream.ExitBlock();
}
@@ -3893,20 +3945,7 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
ArrayRef<uint64_t>{ModuleSummaryIndex::BitcodeSummaryVersion});
// Write the index flags.
- uint64_t Flags = 0;
- if (Index.withGlobalValueDeadStripping())
- Flags |= 0x1;
- if (Index.skipModuleByDistributedBackend())
- Flags |= 0x2;
- if (Index.hasSyntheticEntryCounts())
- Flags |= 0x4;
- if (Index.enableSplitLTOUnit())
- Flags |= 0x8;
- if (Index.partiallySplitLTOUnits())
- Flags |= 0x10;
- if (Index.withAttributePropagation())
- Flags |= 0x20;
- Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef<uint64_t>{Flags});
+ Stream.EmitRecord(bitc::FS_FLAGS, ArrayRef<uint64_t>{Index.getFlags()});
for (const auto &GVI : valueIds()) {
Stream.EmitRecord(bitc::FS_VALUE_GUID,
@@ -4178,6 +4217,9 @@ void IndexBitcodeWriter::writeCombinedGlobalValueSummary() {
}
}
+ Stream.EmitRecord(bitc::FS_BLOCK_COUNT,
+ ArrayRef<uint64_t>{Index.getBlockCount()});
+
Stream.ExitBlock();
}
@@ -4200,7 +4242,7 @@ static void writeIdentificationBlock(BitstreamWriter &Stream) {
Abbv->Add(BitCodeAbbrevOp(bitc::IDENTIFICATION_CODE_EPOCH));
Abbv->Add(BitCodeAbbrevOp(BitCodeAbbrevOp::VBR, 6));
auto EpochAbbrev = Stream.EmitAbbrev(std::move(Abbv));
- SmallVector<unsigned, 1> Vals = {bitc::BITCODE_CURRENT_EPOCH};
+ constexpr std::array<unsigned, 1> Vals = {{bitc::BITCODE_CURRENT_EPOCH}};
Stream.EmitRecord(bitc::IDENTIFICATION_CODE_EPOCH, Vals, EpochAbbrev);
Stream.ExitBlock();
}
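Putting the FS_PARAM_ACCESS writer above next to parseParamAccesses on the reader side, the flat record shape is, per parameter, [ParamNo, UseLower, UseUpper, NumCalls] followed by [ParamNo, Callee, OffLower, OffUpper] for each call. A small sketch of the resulting record size, assuming only what the two hunks show:

// Entries one FS_PARAM_ACCESS record occupies: 4 per parameter plus 4 per call.
static size_t paramAccessRecordSize(
    ArrayRef<FunctionSummary::ParamAccess> Params) {
  size_t N = 0;
  for (const FunctionSummary::ParamAccess &P : Params)
    N += 4 + 4 * P.Calls.size(); // ParamNo + 2 range words + count, then calls
  return N;
}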
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
index f59c906c7b75..8bdddc27e95a 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.cpp
@@ -88,11 +88,16 @@ static void orderValue(const Value *V, OrderMap &OM) {
if (OM.lookup(V).first)
return;
- if (const Constant *C = dyn_cast<Constant>(V))
- if (C->getNumOperands() && !isa<GlobalValue>(C))
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ if (C->getNumOperands() && !isa<GlobalValue>(C)) {
for (const Value *Op : C->operands())
if (!isa<BasicBlock>(Op) && !isa<GlobalValue>(Op))
orderValue(Op, OM);
+ if (auto *CE = dyn_cast<ConstantExpr>(C))
+ if (CE->getOpcode() == Instruction::ShuffleVector)
+ orderValue(CE->getShuffleMaskForBitcode(), OM);
+ }
+ }
// Note: we cannot cache this lookup above, since inserting into the map
// changes the map's size, and thus affects the other IDs.
@@ -155,11 +160,14 @@ static OrderMap orderModule(const Module &M) {
for (const Argument &A : F.args())
orderValue(&A, OM);
for (const BasicBlock &BB : F)
- for (const Instruction &I : BB)
+ for (const Instruction &I : BB) {
for (const Value *Op : I.operands())
if ((isa<Constant>(*Op) && !isa<GlobalValue>(*Op)) ||
isa<InlineAsm>(*Op))
orderValue(Op, OM);
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+ orderValue(SVI->getShuffleMaskForBitcode(), OM);
+ }
for (const BasicBlock &BB : F)
for (const Instruction &I : BB)
orderValue(&I, OM);
@@ -223,9 +231,9 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
return LU->getOperandNo() > RU->getOperandNo();
});
- if (std::is_sorted(
- List.begin(), List.end(),
- [](const Entry &L, const Entry &R) { return L.second < R.second; }))
+ if (llvm::is_sorted(List, [](const Entry &L, const Entry &R) {
+ return L.second < R.second;
+ }))
// Order is already correct.
return;
@@ -250,11 +258,17 @@ static void predictValueUseListOrder(const Value *V, const Function *F,
predictValueUseListOrderImpl(V, F, IDPair.first, OM, Stack);
// Recursive descent into constants.
- if (const Constant *C = dyn_cast<Constant>(V))
- if (C->getNumOperands()) // Visit GlobalValues.
+ if (const Constant *C = dyn_cast<Constant>(V)) {
+ if (C->getNumOperands()) { // Visit GlobalValues.
for (const Value *Op : C->operands())
if (isa<Constant>(Op)) // Visit GlobalValues.
predictValueUseListOrder(Op, F, OM, Stack);
+ if (auto *CE = dyn_cast<ConstantExpr>(C))
+ if (CE->getOpcode() == Instruction::ShuffleVector)
+ predictValueUseListOrder(CE->getShuffleMaskForBitcode(), F, OM,
+ Stack);
+ }
+ }
}
static UseListOrderStack predictUseListOrder(const Module &M) {
@@ -279,10 +293,14 @@ static UseListOrderStack predictUseListOrder(const Module &M) {
for (const Argument &A : F.args())
predictValueUseListOrder(&A, &F, OM, Stack);
for (const BasicBlock &BB : F)
- for (const Instruction &I : BB)
+ for (const Instruction &I : BB) {
for (const Value *Op : I.operands())
if (isa<Constant>(*Op) || isa<InlineAsm>(*Op)) // Visit GlobalValues.
predictValueUseListOrder(Op, &F, OM, Stack);
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+ predictValueUseListOrder(SVI->getShuffleMaskForBitcode(), &F, OM,
+ Stack);
+ }
for (const BasicBlock &BB : F)
for (const Instruction &I : BB)
predictValueUseListOrder(&I, &F, OM, Stack);
@@ -413,6 +431,8 @@ ValueEnumerator::ValueEnumerator(const Module &M,
EnumerateMetadata(&F, MD->getMetadata());
}
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+ EnumerateType(SVI->getShuffleMaskForBitcode()->getType());
EnumerateType(I.getType());
if (const auto *Call = dyn_cast<CallBase>(&I))
EnumerateAttributes(Call->getAttributes());
@@ -836,6 +856,9 @@ void ValueEnumerator::EnumerateValue(const Value *V) {
I != E; ++I)
if (!isa<BasicBlock>(*I)) // Don't enumerate BB operand to BlockAddress.
EnumerateValue(*I);
+ if (auto *CE = dyn_cast<ConstantExpr>(C))
+ if (CE->getOpcode() == Instruction::ShuffleVector)
+ EnumerateValue(CE->getShuffleMaskForBitcode());
// Finally, add the value. Doing this could make the ValueID reference be
// dangling, don't reuse it.
@@ -913,6 +936,9 @@ void ValueEnumerator::EnumerateOperandType(const Value *V) {
EnumerateOperandType(Op);
}
+ if (auto *CE = dyn_cast<ConstantExpr>(C))
+ if (CE->getOpcode() == Instruction::ShuffleVector)
+ EnumerateOperandType(CE->getShuffleMaskForBitcode());
}
void ValueEnumerator::EnumerateAttributes(AttributeList PAL) {
@@ -958,11 +984,14 @@ void ValueEnumerator::incorporateFunction(const Function &F) {
// Add all function-level constants to the value table.
for (const BasicBlock &BB : F) {
- for (const Instruction &I : BB)
+ for (const Instruction &I : BB) {
for (const Use &OI : I.operands()) {
if ((isa<Constant>(OI) && !isa<GlobalValue>(OI)) || isa<InlineAsm>(OI))
EnumerateValue(OI);
}
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+ EnumerateValue(SVI->getShuffleMaskForBitcode());
+ }
BasicBlocks.push_back(&BB);
ValueMap[&BB] = BasicBlocks.size();
}
diff --git a/llvm/lib/Bitcode/Writer/ValueEnumerator.h b/llvm/lib/Bitcode/Writer/ValueEnumerator.h
index 112f0b4a1dc4..3c3bd0d9fdc7 100644
--- a/llvm/lib/Bitcode/Writer/ValueEnumerator.h
+++ b/llvm/lib/Bitcode/Writer/ValueEnumerator.h
@@ -17,8 +17,6 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Type.h"
#include "llvm/IR/UseListOrder.h"
#include <cassert>
#include <cstdint>
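Every traversal in ValueEnumerator.cpp gains the same extra step because the shuffle mask is no longer an ordinary operand of ShuffleVectorInst; it is exposed for serialization as a separate Constant through getShuffleMaskForBitcode(). The pattern, shown with a hypothetical visit() callback:

for (const Instruction &I : BB) {
  for (const Value *Op : I.operands())      // sees the two input vectors only
    visit(Op);
  if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
    visit(SVI->getShuffleMaskForBitcode()); // the mask constant is added by hand
}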
diff --git a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp
index 92d7c91a1d35..2739137c1e44 100644
--- a/llvm/lib/Bitstream/Reader/BitstreamReader.cpp
+++ b/llvm/lib/Bitstream/Reader/BitstreamReader.cpp
@@ -214,6 +214,7 @@ Expected<unsigned> BitstreamCursor::readRecord(unsigned AbbrevID,
if (!MaybeNumElts)
return MaybeNumElts.takeError();
uint32_t NumElts = MaybeNumElts.get();
+ Vals.reserve(Vals.size() + NumElts);
for (unsigned i = 0; i != NumElts; ++i)
if (Expected<uint64_t> MaybeVal = ReadVBR64(6))
@@ -263,6 +264,7 @@ Expected<unsigned> BitstreamCursor::readRecord(unsigned AbbrevID,
if (!MaybeNumElts)
return MaybeNumElts.takeError();
uint32_t NumElts = MaybeNumElts.get();
+ Vals.reserve(Vals.size() + NumElts);
// Get the element encoding.
if (i + 2 != e)
@@ -334,8 +336,8 @@ Expected<unsigned> BitstreamCursor::readRecord(unsigned AbbrevID,
*Blob = StringRef(Ptr, NumElts);
} else {
// Otherwise, unpack into Vals with zero extension.
- for (; NumElts; --NumElts)
- Vals.push_back((unsigned char)*Ptr++);
+ auto *UPtr = reinterpret_cast<const unsigned char *>(Ptr);
+ Vals.append(UPtr, UPtr + NumElts);
}
}
@@ -458,21 +460,15 @@ BitstreamCursor::ReadBlockInfoBlock(bool ReadBlockInfoNames) {
return None;
if (!ReadBlockInfoNames)
break; // Ignore name.
- std::string Name;
- for (unsigned i = 0, e = Record.size(); i != e; ++i)
- Name += (char)Record[i];
- CurBlockInfo->Name = Name;
+ CurBlockInfo->Name = std::string(Record.begin(), Record.end());
break;
}
case bitc::BLOCKINFO_CODE_SETRECORDNAME: {
if (!CurBlockInfo) return None;
if (!ReadBlockInfoNames)
break; // Ignore name.
- std::string Name;
- for (unsigned i = 1, e = Record.size(); i != e; ++i)
- Name += (char)Record[i];
- CurBlockInfo->RecordNames.push_back(std::make_pair((unsigned)Record[0],
- Name));
+ CurBlockInfo->RecordNames.emplace_back(
+ (unsigned)Record[0], std::string(Record.begin() + 1, Record.end()));
break;
}
}
diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
index f64b775a8b77..acf8553f7205 100644
--- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.cpp
@@ -15,7 +15,6 @@
#include "AggressiveAntiDepBreaker.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -28,7 +27,6 @@
#include "llvm/CodeGen/ScheduleDAG.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
@@ -36,10 +34,7 @@
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
-#include <map>
-#include <set>
#include <utility>
-#include <vector>
using namespace llvm;
@@ -1011,3 +1006,9 @@ unsigned AggressiveAntiDepBreaker::BreakAntiDependencies(
return Broken;
}
+
+AntiDepBreaker *llvm::createAggressiveAntiDepBreaker(
+ MachineFunction &MFi, const RegisterClassInfo &RCI,
+ TargetSubtargetInfo::RegClassVector &CriticalPathRCs) {
+ return new AggressiveAntiDepBreaker(MFi, RCI, CriticalPathRCs);
+}
diff --git a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
index 0cf2e6d78f7f..419cb7626945 100644
--- a/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
+++ b/llvm/lib/CodeGen/AggressiveAntiDepBreaker.h
@@ -16,8 +16,8 @@
#ifndef LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
#define LLVM_LIB_CODEGEN_AGGRESSIVEANTIDEPBREAKER_H
-#include "AntiDepBreaker.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/Compiler.h"
#include <map>
diff --git a/llvm/lib/CodeGen/AllocationOrder.h b/llvm/lib/CodeGen/AllocationOrder.h
index 9247dd844936..fa0690ab4ea5 100644
--- a/llvm/lib/CodeGen/AllocationOrder.h
+++ b/llvm/lib/CodeGen/AllocationOrder.h
@@ -17,8 +17,9 @@
#define LLVM_LIB_CODEGEN_ALLOCATIONORDER_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCRegister.h"
namespace llvm {
diff --git a/llvm/lib/CodeGen/Analysis.cpp b/llvm/lib/CodeGen/Analysis.cpp
index 1632895fe5fa..7da28ffec85c 100644
--- a/llvm/lib/CodeGen/Analysis.cpp
+++ b/llvm/lib/CodeGen/Analysis.cpp
@@ -25,6 +25,7 @@
#include "llvm/IR/Module.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/GlobalStatus.h"
using namespace llvm;
@@ -312,8 +313,8 @@ static const Value *getNoopInput(const Value *V,
DataBits = std::min((uint64_t)DataBits,
I->getType()->getPrimitiveSizeInBits().getFixedSize());
NoopInput = Op;
- } else if (auto CS = ImmutableCallSite(I)) {
- const Value *ReturnedOp = CS.getReturnedArgOperand();
+ } else if (auto *CB = dyn_cast<CallBase>(I)) {
+ const Value *ReturnedOp = CB->getReturnedArgOperand();
if (ReturnedOp && isNoopBitcast(ReturnedOp->getType(), I->getType(), TLI))
NoopInput = ReturnedOp;
} else if (const InsertValueInst *IVI = dyn_cast<InsertValueInst>(V)) {
@@ -395,7 +396,7 @@ static bool slotOnlyDiscardsData(const Value *RetVal, const Value *CallVal,
/// For an aggregate type, determine whether a given index is within bounds or
/// not.
-static bool indexReallyValid(CompositeType *T, unsigned Idx) {
+static bool indexReallyValid(Type *T, unsigned Idx) {
if (ArrayType *AT = dyn_cast<ArrayType>(T))
return Idx < AT->getNumElements();
@@ -419,7 +420,7 @@ static bool indexReallyValid(CompositeType *T, unsigned Idx) {
/// function again on a finished iterator will repeatedly return
/// false. SubTypes.back()->getTypeAtIndex(Path.back()) is either an empty
/// aggregate or a non-aggregate
-static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
+static bool advanceToNextLeafType(SmallVectorImpl<Type *> &SubTypes,
SmallVectorImpl<unsigned> &Path) {
// First march back up the tree until we can successfully increment one of the
// coordinates in Path.
@@ -435,16 +436,16 @@ static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
// We know there's *some* valid leaf now, so march back down the tree picking
// out the left-most element at each node.
++Path.back();
- Type *DeeperType = SubTypes.back()->getTypeAtIndex(Path.back());
+ Type *DeeperType =
+ ExtractValueInst::getIndexedType(SubTypes.back(), Path.back());
while (DeeperType->isAggregateType()) {
- CompositeType *CT = cast<CompositeType>(DeeperType);
- if (!indexReallyValid(CT, 0))
+ if (!indexReallyValid(DeeperType, 0))
return true;
- SubTypes.push_back(CT);
+ SubTypes.push_back(DeeperType);
Path.push_back(0);
- DeeperType = CT->getTypeAtIndex(0U);
+ DeeperType = ExtractValueInst::getIndexedType(DeeperType, 0);
}
return true;
@@ -460,17 +461,15 @@ static bool advanceToNextLeafType(SmallVectorImpl<CompositeType *> &SubTypes,
/// For example, if Next was {[0 x i64], {{}, i32, {}}, i32} then we would setup
/// Path as [1, 1] and SubTypes as [Next, {{}, i32, {}}] to represent the first
/// i32 in that type.
-static bool firstRealType(Type *Next,
- SmallVectorImpl<CompositeType *> &SubTypes,
+static bool firstRealType(Type *Next, SmallVectorImpl<Type *> &SubTypes,
SmallVectorImpl<unsigned> &Path) {
// First initialise the iterator components to the first "leaf" node
// (i.e. node with no valid sub-type at any index, so {} does count as a leaf
// despite nominally being an aggregate).
- while (Next->isAggregateType() &&
- indexReallyValid(cast<CompositeType>(Next), 0)) {
- SubTypes.push_back(cast<CompositeType>(Next));
+ while (Type *FirstInner = ExtractValueInst::getIndexedType(Next, 0)) {
+ SubTypes.push_back(Next);
Path.push_back(0);
- Next = cast<CompositeType>(Next)->getTypeAtIndex(0U);
+ Next = FirstInner;
}
// If there's no Path now, Next was originally scalar already (or empty
@@ -480,7 +479,8 @@ static bool firstRealType(Type *Next,
// Otherwise, use normal iteration to keep looking through the tree until we
// find a non-aggregate type.
- while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType()) {
+ while (ExtractValueInst::getIndexedType(SubTypes.back(), Path.back())
+ ->isAggregateType()) {
if (!advanceToNextLeafType(SubTypes, Path))
return false;
}
@@ -490,14 +490,15 @@ static bool firstRealType(Type *Next,
/// Set the iterator data-structures to the next non-empty, non-aggregate
/// subtype.
-static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
+static bool nextRealType(SmallVectorImpl<Type *> &SubTypes,
SmallVectorImpl<unsigned> &Path) {
do {
if (!advanceToNextLeafType(SubTypes, Path))
return false;
assert(!Path.empty() && "found a leaf but didn't set the path?");
- } while (SubTypes.back()->getTypeAtIndex(Path.back())->isAggregateType());
+ } while (ExtractValueInst::getIndexedType(SubTypes.back(), Path.back())
+ ->isAggregateType());
return true;
}
@@ -509,9 +510,8 @@ static bool nextRealType(SmallVectorImpl<CompositeType *> &SubTypes,
/// between it and the return.
///
/// This function only tests target-independent requirements.
-bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
- const Instruction *I = CS.getInstruction();
- const BasicBlock *ExitBB = I->getParent();
+bool llvm::isInTailCallPosition(const CallBase &Call, const TargetMachine &TM) {
+ const BasicBlock *ExitBB = Call.getParent();
const Instruction *Term = ExitBB->getTerminator();
const ReturnInst *Ret = dyn_cast<ReturnInst>(Term);
@@ -525,33 +525,32 @@ bool llvm::isInTailCallPosition(ImmutableCallSite CS, const TargetMachine &TM) {
// been fully understood.
if (!Ret &&
((!TM.Options.GuaranteedTailCallOpt &&
- CS.getCallingConv() != CallingConv::Tail) || !isa<UnreachableInst>(Term)))
+ Call.getCallingConv() != CallingConv::Tail) || !isa<UnreachableInst>(Term)))
return false;
// If I will have a chain, make sure no other instruction that will have a
// chain interposes between I and the return.
- if (I->mayHaveSideEffects() || I->mayReadFromMemory() ||
- !isSafeToSpeculativelyExecute(I))
- for (BasicBlock::const_iterator BBI = std::prev(ExitBB->end(), 2);; --BBI) {
- if (&*BBI == I)
- break;
- // Debug info intrinsics do not get in the way of tail call optimization.
- if (isa<DbgInfoIntrinsic>(BBI))
+ // Check for all calls including speculatable functions.
+ for (BasicBlock::const_iterator BBI = std::prev(ExitBB->end(), 2);; --BBI) {
+ if (&*BBI == &Call)
+ break;
+ // Debug info intrinsics do not get in the way of tail call optimization.
+ if (isa<DbgInfoIntrinsic>(BBI))
+ continue;
+ // A lifetime end or assume intrinsic should not stop tail call
+ // optimization.
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
+ if (II->getIntrinsicID() == Intrinsic::lifetime_end ||
+ II->getIntrinsicID() == Intrinsic::assume)
continue;
- // A lifetime end or assume intrinsic should not stop tail call
- // optimization.
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(BBI))
- if (II->getIntrinsicID() == Intrinsic::lifetime_end ||
- II->getIntrinsicID() == Intrinsic::assume)
- continue;
- if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
- !isSafeToSpeculativelyExecute(&*BBI))
- return false;
- }
+ if (BBI->mayHaveSideEffects() || BBI->mayReadFromMemory() ||
+ !isSafeToSpeculativelyExecute(&*BBI))
+ return false;
+ }
const Function *F = ExitBB->getParent();
return returnTypeIsEligibleForTailCall(
- F, I, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering());
+ F, &Call, Ret, *TM.getSubtargetImpl(*F)->getTargetLowering());
}
bool llvm::attributesPermitTailCall(const Function *F, const Instruction *I,
@@ -669,7 +668,7 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
}
SmallVector<unsigned, 4> RetPath, CallPath;
- SmallVector<CompositeType *, 4> RetSubTypes, CallSubTypes;
+ SmallVector<Type *, 4> RetSubTypes, CallSubTypes;
bool RetEmpty = !firstRealType(RetVal->getType(), RetSubTypes, RetPath);
bool CallEmpty = !firstRealType(CallVal->getType(), CallSubTypes, CallPath);
@@ -692,7 +691,8 @@ bool llvm::returnTypeIsEligibleForTailCall(const Function *F,
// We've exhausted the values produced by the tail call instruction, the
// rest are essentially undef. The type doesn't really matter, but we need
// *something*.
- Type *SlotType = RetSubTypes.back()->getTypeAtIndex(RetPath.back());
+ Type *SlotType =
+ ExtractValueInst::getIndexedType(RetSubTypes.back(), RetPath.back());
CallVal = UndefValue::get(SlotType);
}
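The rewrite above relies on ExtractValueInst::getIndexedType returning null for an invalid or non-aggregate index, which is what lets these helpers drop CompositeType. A small usage sketch, assuming the usual type uniquing within an LLVMContext:

LLVMContext Ctx;
Type *STy = StructType::get(Type::getInt32Ty(Ctx),
                            ArrayType::get(Type::getFloatTy(Ctx), 2));
// Valid index: yields the nested aggregate type.
assert(ExtractValueInst::getIndexedType(STy, {1u}) ==
       ArrayType::get(Type::getFloatTy(Ctx), 2));
// Out-of-range index: yields nullptr, which the loops above treat as "no leaf".
assert(!ExtractValueInst::getIndexedType(STy, {2u}));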
diff --git a/llvm/lib/CodeGen/AntiDepBreaker.h b/llvm/lib/CodeGen/AntiDepBreaker.h
deleted file mode 100644
index b11148595136..000000000000
--- a/llvm/lib/CodeGen/AntiDepBreaker.h
+++ /dev/null
@@ -1,87 +0,0 @@
-//===- llvm/CodeGen/AntiDepBreaker.h - Anti-Dependence Breaking -*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the AntiDepBreaker class, which implements
-// anti-dependence breaking heuristics for post-register-allocation scheduling.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
-#define LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
-
-#include "llvm/ADT/iterator_range.h"
-#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineInstr.h"
-#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/ScheduleDAG.h"
-#include "llvm/Support/Compiler.h"
-#include <cassert>
-#include <utility>
-#include <vector>
-
-namespace llvm {
-
-/// This class works in conjunction with the post-RA scheduler to rename
-/// registers to break register anti-dependencies (WAR hazards).
-class LLVM_LIBRARY_VISIBILITY AntiDepBreaker {
-public:
- using DbgValueVector =
- std::vector<std::pair<MachineInstr *, MachineInstr *>>;
-
- virtual ~AntiDepBreaker();
-
- /// Initialize anti-dep breaking for a new basic block.
- virtual void StartBlock(MachineBasicBlock *BB) = 0;
-
- /// Identifiy anti-dependencies within a basic-block region and break them by
- /// renaming registers. Return the number of anti-dependencies broken.
- virtual unsigned BreakAntiDependencies(const std::vector<SUnit> &SUnits,
- MachineBasicBlock::iterator Begin,
- MachineBasicBlock::iterator End,
- unsigned InsertPosIndex,
- DbgValueVector &DbgValues) = 0;
-
- /// Update liveness information to account for the current
- /// instruction, which will not be scheduled.
- virtual void Observe(MachineInstr &MI, unsigned Count,
- unsigned InsertPosIndex) = 0;
-
- /// Finish anti-dep breaking for a basic block.
- virtual void FinishBlock() = 0;
-
- /// Update DBG_VALUE if dependency breaker is updating
- /// other machine instruction to use NewReg.
- void UpdateDbgValue(MachineInstr &MI, unsigned OldReg, unsigned NewReg) {
- assert(MI.isDebugValue() && "MI is not DBG_VALUE!");
- if (MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == OldReg)
- MI.getOperand(0).setReg(NewReg);
- }
-
- /// Update all DBG_VALUE instructions that may be affected by the dependency
- /// breaker's update of ParentMI to use NewReg.
- void UpdateDbgValues(const DbgValueVector &DbgValues, MachineInstr *ParentMI,
- unsigned OldReg, unsigned NewReg) {
- // The following code is dependent on the order in which the DbgValues are
- // constructed in ScheduleDAGInstrs::buildSchedGraph.
- MachineInstr *PrevDbgMI = nullptr;
- for (const auto &DV : make_range(DbgValues.crbegin(), DbgValues.crend())) {
- MachineInstr *PrevMI = DV.second;
- if ((PrevMI == ParentMI) || (PrevMI == PrevDbgMI)) {
- MachineInstr *DbgMI = DV.first;
- UpdateDbgValue(*DbgMI, OldReg, NewReg);
- PrevDbgMI = DbgMI;
- } else if (PrevDbgMI) {
- break; // If no match and already found a DBG_VALUE, we're done.
- }
- }
- }
-};
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_CODEGEN_ANTIDEPBREAKER_H
diff --git a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
index f6ef85a5b78f..b634b24377fe 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/ARMException.cpp
@@ -46,12 +46,12 @@ void ARMException::beginFunction(const MachineFunction *MF) {
if (MoveType == AsmPrinter::CFI_M_Debug) {
if (!hasEmittedCFISections) {
if (Asm->needsOnlyDebugCFIMoves())
- Asm->OutStreamer->EmitCFISections(false, true);
+ Asm->OutStreamer->emitCFISections(false, true);
hasEmittedCFISections = true;
}
shouldEmitCFI = true;
- Asm->OutStreamer->EmitCFIStartProc(false);
+ Asm->OutStreamer->emitCFIStartProc(false);
}
}
@@ -75,7 +75,7 @@ void ARMException::endFunction(const MachineFunction *MF) {
// Emit references to personality.
if (Per) {
MCSymbol *PerSym = Asm->getSymbol(Per);
- Asm->OutStreamer->EmitSymbolAttribute(PerSym, MCSA_Global);
+ Asm->OutStreamer->emitSymbolAttribute(PerSym, MCSA_Global);
ATS.emitPersonality(PerSym);
}
@@ -109,10 +109,10 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding,
for (const GlobalValue *GV : reverse(TypeInfos)) {
if (VerboseAsm)
Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
- Asm->EmitTTypeReference(GV, TTypeEncoding);
+ Asm->emitTTypeReference(GV, TTypeEncoding);
}
- Asm->OutStreamer->EmitLabel(TTBaseLabel);
+ Asm->OutStreamer->emitLabel(TTBaseLabel);
// Emit the Exception Specifications.
if (VerboseAsm && !FilterIds.empty()) {
@@ -129,7 +129,7 @@ void ARMException::emitTypeInfos(unsigned TTypeEncoding,
Asm->OutStreamer->AddComment("FilterInfo " + Twine(Entry));
}
- Asm->EmitTTypeReference((TypeID == 0 ? nullptr : TypeInfos[TypeID - 1]),
+ Asm->emitTTypeReference((TypeID == 0 ? nullptr : TypeInfos[TypeID - 1]),
TTypeEncoding);
}
}
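
The ARMException changes are part of a tree-wide rename of MCStreamer and AsmPrinter entry points from EmitXxx to emitXxx. A small sketch of the new spellings, assuming a live MCStreamer and an MCSymbol obtained from a real AsmPrinter; the fragment is not meant to run on its own:

// Sketch only: the lowercase MCStreamer entry points used in the hunk above.
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"

using namespace llvm;

void emitPersonalityRef(MCStreamer &Streamer, MCSymbol *PerSym) {
  // Previously: Streamer.EmitSymbolAttribute(PerSym, MCSA_Global);
  Streamer.emitSymbolAttribute(PerSym, MCSA_Global);
  // Previously: Streamer.EmitCFIStartProc(false);
  Streamer.emitCFIStartProc(/*IsSimple=*/false);
}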
diff --git a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
index b1b7921ea976..dea0227f7578 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AccelTable.cpp
@@ -271,7 +271,7 @@ void AccelTableWriter::emitOffsets(const MCSymbol *Base) const {
continue;
PrevHash = HashValue;
Asm->OutStreamer->AddComment("Offset in Bucket " + Twine(i));
- Asm->EmitLabelDifference(Hash->Sym, Base, sizeof(uint32_t));
+ Asm->emitLabelDifference(Hash->Sym, Base, sizeof(uint32_t));
}
}
}
@@ -337,7 +337,7 @@ void AppleAccelTableWriter::emitData() const {
PrevHash != Hash->HashValue)
Asm->emitInt32(0);
// Remember to emit the label for our offset.
- Asm->OutStreamer->EmitLabel(Hash->Sym);
+ Asm->OutStreamer->emitLabel(Hash->Sym);
Asm->OutStreamer->AddComment(Hash->Name.getString());
Asm->emitDwarfStringOffset(Hash->Name);
Asm->OutStreamer->AddComment("Num DIEs");
@@ -368,9 +368,9 @@ void Dwarf5AccelTableWriter<DataT>::Header::emit(
AsmPrinter *Asm = Ctx.Asm;
Asm->OutStreamer->AddComment("Header: unit length");
- Asm->EmitLabelDifference(Ctx.ContributionEnd, Ctx.ContributionStart,
+ Asm->emitLabelDifference(Ctx.ContributionEnd, Ctx.ContributionStart,
sizeof(uint32_t));
- Asm->OutStreamer->EmitLabel(Ctx.ContributionStart);
+ Asm->OutStreamer->emitLabel(Ctx.ContributionStart);
Asm->OutStreamer->AddComment("Header: version");
Asm->emitInt16(Version);
Asm->OutStreamer->AddComment("Header: padding");
@@ -386,12 +386,12 @@ void Dwarf5AccelTableWriter<DataT>::Header::emit(
Asm->OutStreamer->AddComment("Header: name count");
Asm->emitInt32(NameCount);
Asm->OutStreamer->AddComment("Header: abbreviation table size");
- Asm->EmitLabelDifference(Ctx.AbbrevEnd, Ctx.AbbrevStart, sizeof(uint32_t));
+ Asm->emitLabelDifference(Ctx.AbbrevEnd, Ctx.AbbrevStart, sizeof(uint32_t));
Asm->OutStreamer->AddComment("Header: augmentation string size");
assert(AugmentationStringSize % 4 == 0);
Asm->emitInt32(AugmentationStringSize);
Asm->OutStreamer->AddComment("Header: augmentation string");
- Asm->OutStreamer->EmitBytes({AugmentationString, AugmentationStringSize});
+ Asm->OutStreamer->emitBytes({AugmentationString, AugmentationStringSize});
}
template <typename DataT>
@@ -453,23 +453,23 @@ void Dwarf5AccelTableWriter<DataT>::emitStringOffsets() const {
template <typename DataT>
void Dwarf5AccelTableWriter<DataT>::emitAbbrevs() const {
- Asm->OutStreamer->EmitLabel(AbbrevStart);
+ Asm->OutStreamer->emitLabel(AbbrevStart);
for (const auto &Abbrev : Abbreviations) {
Asm->OutStreamer->AddComment("Abbrev code");
assert(Abbrev.first != 0);
- Asm->EmitULEB128(Abbrev.first);
+ Asm->emitULEB128(Abbrev.first);
Asm->OutStreamer->AddComment(dwarf::TagString(Abbrev.first));
- Asm->EmitULEB128(Abbrev.first);
+ Asm->emitULEB128(Abbrev.first);
for (const auto &AttrEnc : Abbrev.second) {
- Asm->EmitULEB128(AttrEnc.Index, dwarf::IndexString(AttrEnc.Index).data());
- Asm->EmitULEB128(AttrEnc.Form,
+ Asm->emitULEB128(AttrEnc.Index, dwarf::IndexString(AttrEnc.Index).data());
+ Asm->emitULEB128(AttrEnc.Form,
dwarf::FormEncodingString(AttrEnc.Form).data());
}
- Asm->EmitULEB128(0, "End of abbrev");
- Asm->EmitULEB128(0, "End of abbrev");
+ Asm->emitULEB128(0, "End of abbrev");
+ Asm->emitULEB128(0, "End of abbrev");
}
- Asm->EmitULEB128(0, "End of abbrev list");
- Asm->OutStreamer->EmitLabel(AbbrevEnd);
+ Asm->emitULEB128(0, "End of abbrev list");
+ Asm->OutStreamer->emitLabel(AbbrevEnd);
}
template <typename DataT>
@@ -478,13 +478,13 @@ void Dwarf5AccelTableWriter<DataT>::emitEntry(const DataT &Entry) const {
assert(AbbrevIt != Abbreviations.end() &&
"Why wasn't this abbrev generated?");
- Asm->EmitULEB128(AbbrevIt->first, "Abbreviation code");
+ Asm->emitULEB128(AbbrevIt->first, "Abbreviation code");
for (const auto &AttrEnc : AbbrevIt->second) {
Asm->OutStreamer->AddComment(dwarf::IndexString(AttrEnc.Index));
switch (AttrEnc.Index) {
case dwarf::DW_IDX_compile_unit: {
DIEInteger ID(getCUIndexForEntry(Entry));
- ID.EmitValue(Asm, AttrEnc.Form);
+ ID.emitValue(Asm, AttrEnc.Form);
break;
}
case dwarf::DW_IDX_die_offset:
@@ -498,11 +498,11 @@ void Dwarf5AccelTableWriter<DataT>::emitEntry(const DataT &Entry) const {
}
template <typename DataT> void Dwarf5AccelTableWriter<DataT>::emitData() const {
- Asm->OutStreamer->EmitLabel(EntryPool);
+ Asm->OutStreamer->emitLabel(EntryPool);
for (auto &Bucket : Contents.getBuckets()) {
for (auto *Hash : Bucket) {
// Remember to emit the label for our offset.
- Asm->OutStreamer->EmitLabel(Hash->Sym);
+ Asm->OutStreamer->emitLabel(Hash->Sym);
for (const auto *Value : Hash->Values)
emitEntry(*static_cast<const DataT *>(Value));
Asm->OutStreamer->AddComment("End of list: " + Hash->Name.getString());
@@ -537,8 +537,8 @@ template <typename DataT> void Dwarf5AccelTableWriter<DataT>::emit() const {
emitOffsets(EntryPool);
emitAbbrevs();
emitData();
- Asm->OutStreamer->EmitValueToAlignment(4, 0);
- Asm->OutStreamer->EmitLabel(ContributionEnd);
+ Asm->OutStreamer->emitValueToAlignment(4, 0);
+ Asm->OutStreamer->emitLabel(ContributionEnd);
}
void llvm::emitAppleAccelTableImpl(AsmPrinter *Asm, AccelTableBase &Contents,
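
The accelerator-table writer relies on AsmPrinter's ULEB128 helper, whose optional description string becomes a verbose-assembly comment. A minimal sketch, assuming an initialized AsmPrinter:

// Sketch only: the renamed ULEB128 helper used to terminate the abbrev list.
#include "llvm/CodeGen/AsmPrinter.h"

using namespace llvm;

void emitListTerminator(AsmPrinter &Asm) {
  // Encodes the value 0 as ULEB128; the string is emitted as an end-of-line
  // comment when verbose assembly output is enabled.
  Asm.emitULEB128(0, "End of abbrev list");
}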
diff --git a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
index f11c7de5ed8a..883aaf5aefc4 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AddressPool.cpp
@@ -30,9 +30,9 @@ MCSymbol *AddressPool::emitHeader(AsmPrinter &Asm, MCSection *Section) {
MCSymbol *EndLabel = Asm.createTempSymbol(Prefix + "end");
Asm.OutStreamer->AddComment("Length of contribution");
- Asm.EmitLabelDifference(EndLabel, BeginLabel,
+ Asm.emitLabelDifference(EndLabel, BeginLabel,
4); // TODO: Support DWARF64 format.
- Asm.OutStreamer->EmitLabel(BeginLabel);
+ Asm.OutStreamer->emitLabel(BeginLabel);
Asm.OutStreamer->AddComment("DWARF version number");
Asm.emitInt16(Asm.getDwarfVersion());
Asm.OutStreamer->AddComment("Address size");
@@ -58,7 +58,7 @@ void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
// Define the symbol that marks the start of the contribution.
// It is referenced via DW_AT_addr_base.
- Asm.OutStreamer->EmitLabel(AddressTableBaseSym);
+ Asm.OutStreamer->emitLabel(AddressTableBaseSym);
// Order the address pool entries by ID
SmallVector<const MCExpr *, 64> Entries(Pool.size());
@@ -70,8 +70,8 @@ void AddressPool::emit(AsmPrinter &Asm, MCSection *AddrSection) {
: MCSymbolRefExpr::create(I.first, Asm.OutContext);
for (const MCExpr *Entry : Entries)
- Asm.OutStreamer->EmitValue(Entry, Asm.getDataLayout().getPointerSize());
+ Asm.OutStreamer->emitValue(Entry, Asm.getDataLayout().getPointerSize());
if (EndLabel)
- Asm.OutStreamer->EmitLabel(EndLabel);
+ Asm.OutStreamer->emitLabel(EndLabel);
}
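
The AddressPool header above follows the usual DWARF pattern of a 4-byte unit length computed as a label difference. A sketch of that pattern, assuming an initialized AsmPrinter; the temp-symbol names are illustrative:

// Sketch only: length-of-contribution pattern from the hunk above.
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/MC/MCStreamer.h"

using namespace llvm;

void emitLengthPrefixedHeader(AsmPrinter &Asm) {
  MCSymbol *Begin = Asm.createTempSymbol("pool_begin");
  MCSymbol *End = Asm.createTempSymbol("pool_end");
  // 4-byte (DWARF32) unit length = End - Begin.
  Asm.emitLabelDifference(End, Begin, 4);
  Asm.OutStreamer->emitLabel(Begin);
  // ... contribution body would be emitted here ...
  Asm.OutStreamer->emitLabel(End);
}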
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
index 6f9aa4dd79fd..f8f7b74baf91 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinter.cpp
@@ -31,16 +31,13 @@
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
-#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/GCMetadata.h"
#include "llvm/CodeGen/GCMetadataPrinter.h"
#include "llvm/CodeGen/GCStrategy.h"
-#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -55,7 +52,6 @@
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
-#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
@@ -81,7 +77,6 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
-#include "llvm/IR/RemarkStreamer.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Value.h"
#include "llvm/MC/MCAsmInfo.h"
@@ -106,6 +101,7 @@
#include "llvm/Pass.h"
#include "llvm/Remarks/Remark.h"
#include "llvm/Remarks/RemarkFormat.h"
+#include "llvm/Remarks/RemarkStreamer.h"
#include "llvm/Remarks/RemarkStringTable.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -161,11 +157,11 @@ static gcp_map_type &getGCMap(void *&P) {
/// getGVAlignment - Return the alignment to use for the specified global
/// value. This rounds up to the preferred alignment if possible and legal.
-Align AsmPrinter::getGVAlignment(const GlobalValue *GV, const DataLayout &DL,
+Align AsmPrinter::getGVAlignment(const GlobalObject *GV, const DataLayout &DL,
Align InAlign) {
Align Alignment;
if (const GlobalVariable *GVar = dyn_cast<GlobalVariable>(GV))
- Alignment = Align(DL.getPreferredAlignment(GVar));
+ Alignment = DL.getPreferredAlign(GVar);
// If InAlign is specified, round it to it.
if (InAlign > Alignment)
@@ -231,7 +227,7 @@ const MCSubtargetInfo &AsmPrinter::getSubtargetInfo() const {
}
void AsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
- S.EmitInstruction(Inst, getSubtargetInfo());
+ S.emitInstruction(Inst, getSubtargetInfo());
}
void AsmPrinter::emitInitialRawDwarfLocDirective(const MachineFunction &MF) {
@@ -248,11 +244,8 @@ const MCSection *AsmPrinter::getCurrentSection() const {
void AsmPrinter::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addRequired<MachineOptimizationRemarkEmitterPass>();
AU.addRequired<GCModuleInfo>();
- AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
- AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
bool AsmPrinter::doInitialization(Module &M) {
@@ -277,16 +270,16 @@ bool AsmPrinter::doInitialization(Module &M) {
// use the directive, where it would need the same conditionalization
// anyway.
const Triple &Target = TM.getTargetTriple();
- OutStreamer->EmitVersionForTarget(Target, M.getSDKVersion());
+ OutStreamer->emitVersionForTarget(Target, M.getSDKVersion());
// Allow the target to emit any magic that it wants at the start of the file.
- EmitStartOfAsmFile(M);
+ emitStartOfAsmFile(M);
// Very minimal debug info. It is ignored if we emit actual debug info. If we
// don't, this at least helps the user find where a global came from.
if (MAI->hasSingleParameterDotFile()) {
// .file "foo.c"
- OutStreamer->EmitFileDirective(
+ OutStreamer->emitFileDirective(
llvm::sys::path::filename(M.getSourceFileName()));
}
@@ -305,21 +298,21 @@ bool AsmPrinter::doInitialization(Module &M) {
TM.getTargetFeatureString()));
OutStreamer->AddComment("Start of file scope inline assembly");
OutStreamer->AddBlankLine();
- EmitInlineAsm(M.getModuleInlineAsm()+"\n",
+ emitInlineAsm(M.getModuleInlineAsm() + "\n",
OutContext.getSubtargetCopy(*STI), TM.Options.MCOptions);
OutStreamer->AddComment("End of file scope inline assembly");
OutStreamer->AddBlankLine();
}
if (MAI->doesSupportDebugInformation()) {
- bool EmitCodeView = MMI->getModule()->getCodeViewFlag();
+ bool EmitCodeView = M.getCodeViewFlag();
if (EmitCodeView && TM.getTargetTriple().isOSWindows()) {
Handlers.emplace_back(std::make_unique<CodeViewDebug>(this),
DbgTimerName, DbgTimerDescription,
CodeViewLineTablesGroupName,
CodeViewLineTablesGroupDescription);
}
- if (!EmitCodeView || MMI->getModule()->getDwarfVersion()) {
+ if (!EmitCodeView || M.getDwarfVersion()) {
DD = new DwarfDebug(this, &M);
DD->beginModule();
Handlers.emplace_back(std::unique_ptr<DwarfDebug>(DD), DbgTimerName,
@@ -382,8 +375,7 @@ bool AsmPrinter::doInitialization(Module &M) {
DWARFGroupDescription);
// Emit tables for any value of cfguard flag (i.e. cfguard=1 or cfguard=2).
- if (mdconst::extract_or_null<ConstantInt>(
- MMI->getModule()->getModuleFlag("cfguard")))
+ if (mdconst::extract_or_null<ConstantInt>(M.getModuleFlag("cfguard")))
Handlers.emplace_back(std::make_unique<WinCFGuard>(this), CFGuardName,
CFGuardDescription, DWARFGroupName,
DWARFGroupDescription);
@@ -397,7 +389,7 @@ static bool canBeHidden(const GlobalValue *GV, const MCAsmInfo &MAI) {
return GV->canBeOmittedFromSymbolTable();
}
-void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
+void AsmPrinter::emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
GlobalValue::LinkageTypes Linkage = GV->getLinkage();
switch (Linkage) {
case GlobalValue::CommonLinkage:
@@ -407,35 +399,31 @@ void AsmPrinter::EmitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const {
case GlobalValue::WeakODRLinkage:
if (MAI->hasWeakDefDirective()) {
// .globl _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
if (!canBeHidden(GV, *MAI))
// .weak_definition _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_WeakDefinition);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_WeakDefinition);
else
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_WeakDefAutoPrivate);
- } else if (MAI->hasLinkOnceDirective()) {
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_WeakDefAutoPrivate);
+ } else if (MAI->avoidWeakIfComdat() && GV->hasComdat()) {
// .globl _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
//NOTE: linkonce is handled by the section the symbol was assigned to.
} else {
// .weak _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Weak);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Weak);
}
return;
case GlobalValue::ExternalLinkage:
- // If external, declare as a global symbol: .globl _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
return;
case GlobalValue::PrivateLinkage:
- return;
case GlobalValue::InternalLinkage:
- if (MAI->hasDotLGloblDirective())
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_LGlobal);
return;
- case GlobalValue::AppendingLinkage:
- case GlobalValue::AvailableExternallyLinkage:
case GlobalValue::ExternalWeakLinkage:
+ case GlobalValue::AvailableExternallyLinkage:
+ case GlobalValue::AppendingLinkage:
llvm_unreachable("Should never emit this");
}
llvm_unreachable("Unknown linkage type!");
@@ -450,8 +438,27 @@ MCSymbol *AsmPrinter::getSymbol(const GlobalValue *GV) const {
return TM.getSymbol(GV);
}
+MCSymbol *AsmPrinter::getSymbolPreferLocal(const GlobalValue &GV) const {
+ // On ELF, use .Lfoo$local if GV is a non-interposable GlobalObject with an
+ // exact definition (intersection of GlobalValue::hasExactDefinition() and
+ // !isInterposable()). These linkages include: external, appending, internal,
+ // private. It may be profitable to use a local alias for external. The
+ // assembler would otherwise be conservative and assume a global default
+ // visibility symbol can be interposable, even if the code generator already
+ // assumed it.
+ if (TM.getTargetTriple().isOSBinFormatELF() && GV.canBenefitFromLocalAlias()) {
+ const Module &M = *GV.getParent();
+ if (TM.getRelocationModel() != Reloc::Static &&
+ M.getPIELevel() == PIELevel::Default)
+ if (GV.isDSOLocal() || (TM.getTargetTriple().isX86() &&
+ GV.getParent()->noSemanticInterposition()))
+ return getSymbolWithGlobalValueBase(&GV, "$local");
+ }
+ return TM.getSymbol(&GV);
+}
+
/// EmitGlobalVariable - Emit the specified global variable to the .s file.
-void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void AsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
bool IsEmuTLSVar = TM.useEmulatedTLS() && GV->isThreadLocal();
assert(!(IsEmuTLSVar && GV->hasCommonLinkage()) &&
"No emulated TLS variables in the common section");
@@ -463,7 +470,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
if (GV->hasInitializer()) {
// Check to see if this is a special global used by LLVM, if so, emit it.
- if (EmitSpecialLLVMGlobal(GV))
+ if (emitSpecialLLVMGlobal(GV))
return;
// Skip the emission of global equivalents. The symbol can be emitted later
@@ -486,7 +493,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// getOrCreateEmuTLSControlSym only creates the symbol with name and default
// attributes.
// GV's or GVSym's attributes will be used for the EmittedSym.
- EmitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration());
+ emitVisibility(EmittedSym, GV->getVisibility(), !GV->isDeclaration());
if (!GV->hasInitializer()) // External globals require no extra code.
return;
@@ -497,7 +504,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
"' is already defined");
if (MAI->hasDotTypeDotSizeDirective())
- OutStreamer->EmitSymbolAttribute(EmittedSym, MCSA_ELF_TypeObject);
+ OutStreamer->emitSymbolAttribute(EmittedSym, MCSA_ELF_TypeObject);
SectionKind GVKind = TargetLoweringObjectFile::getKindForGlobal(GV, TM);
@@ -522,7 +529,7 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// .comm _foo, 42, 4
const bool SupportsAlignment =
getObjFileLowering().getCommDirectiveSupportsAlignment();
- OutStreamer->EmitCommonSymbol(GVSym, Size,
+ OutStreamer->emitCommonSymbol(GVSym, Size,
SupportsAlignment ? Alignment.value() : 0);
return;
}
@@ -536,9 +543,9 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
TheSection->isVirtualSection()) {
if (Size == 0)
Size = 1; // zerofill of 0 bytes is undefined.
- EmitLinkage(GV, GVSym);
+ emitLinkage(GV, GVSym);
// .zerofill __DATA, __bss, _foo, 400, 5
- OutStreamer->EmitZerofill(TheSection, GVSym, Size, Alignment.value());
+ OutStreamer->emitZerofill(TheSection, GVSym, Size, Alignment.value());
return;
}
@@ -557,16 +564,16 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
// Prefer to simply fall back to .local / .comm in this case.
if (MAI->getLCOMMDirectiveAlignmentType() != LCOMM::NoAlignment) {
// .lcomm _foo, 42
- OutStreamer->EmitLocalCommonSymbol(GVSym, Size, Alignment.value());
+ OutStreamer->emitLocalCommonSymbol(GVSym, Size, Alignment.value());
return;
}
// .local _foo
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Local);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Local);
// .comm _foo, 42, 4
const bool SupportsAlignment =
getObjFileLowering().getCommDirectiveSupportsAlignment();
- OutStreamer->EmitCommonSymbol(GVSym, Size,
+ OutStreamer->emitCommonSymbol(GVSym, Size,
SupportsAlignment ? Alignment.value() : 0);
return;
}
@@ -588,14 +595,14 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
if (GVKind.isThreadBSS()) {
TheSection = getObjFileLowering().getTLSBSSSection();
- OutStreamer->EmitTBSSSymbol(TheSection, MangSym, Size, Alignment.value());
+ OutStreamer->emitTBSSSymbol(TheSection, MangSym, Size, Alignment.value());
} else if (GVKind.isThreadData()) {
OutStreamer->SwitchSection(TheSection);
- EmitAlignment(Alignment, GV);
- OutStreamer->EmitLabel(MangSym);
+ emitAlignment(Alignment, GV);
+ OutStreamer->emitLabel(MangSym);
- EmitGlobalConstant(GV->getParent()->getDataLayout(),
+ emitGlobalConstant(GV->getParent()->getDataLayout(),
GV->getInitializer());
}
@@ -606,18 +613,18 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer->SwitchSection(TLVSect);
// Emit the linkage here.
- EmitLinkage(GV, GVSym);
- OutStreamer->EmitLabel(GVSym);
+ emitLinkage(GV, GVSym);
+ OutStreamer->emitLabel(GVSym);
// Three pointers in size:
// - __tlv_bootstrap - used to make sure support exists
// - spare pointer, used when mapped by the runtime
// - pointer to mangled symbol above with initializer
unsigned PtrSize = DL.getPointerTypeSize(GV->getType());
- OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
+ OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("_tlv_bootstrap"),
PtrSize);
- OutStreamer->EmitIntValue(0, PtrSize);
- OutStreamer->EmitSymbolValue(MangSym, PtrSize);
+ OutStreamer->emitIntValue(0, PtrSize);
+ OutStreamer->emitSymbolValue(MangSym, PtrSize);
OutStreamer->AddBlankLine();
return;
@@ -627,12 +634,15 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
OutStreamer->SwitchSection(TheSection);
- EmitLinkage(GV, EmittedInitSym);
- EmitAlignment(Alignment, GV);
+ emitLinkage(GV, EmittedInitSym);
+ emitAlignment(Alignment, GV);
- OutStreamer->EmitLabel(EmittedInitSym);
+ OutStreamer->emitLabel(EmittedInitSym);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(*GV);
+ if (LocalAlias != EmittedInitSym)
+ OutStreamer->emitLabel(LocalAlias);
- EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
if (MAI->hasDotTypeDotSizeDirective())
// .size foo, 42
@@ -646,13 +656,15 @@ void AsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
///
/// \p Value - The value to emit.
/// \p Size - The size of the integer (in bytes) to emit.
-void AsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const {
- OutStreamer->EmitValue(Value, Size);
+void AsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
+ OutStreamer->emitValue(Value, Size);
}
+void AsmPrinter::emitFunctionHeaderComment() {}
+
/// EmitFunctionHeader - This method emits the header for the current
/// function.
-void AsmPrinter::EmitFunctionHeader() {
+void AsmPrinter::emitFunctionHeader() {
const Function &F = MF->getFunction();
if (isVerbose())
@@ -661,29 +673,32 @@ void AsmPrinter::EmitFunctionHeader() {
<< GlobalValue::dropLLVMManglingEscape(F.getName()) << '\n';
// Print out constants referenced by the function
- EmitConstantPool();
+ emitConstantPool();
// Print the 'header' of function.
- OutStreamer->SwitchSection(getObjFileLowering().SectionForGlobal(&F, TM));
- EmitVisibility(CurrentFnSym, F.getVisibility());
+ MF->setSection(getObjFileLowering().SectionForGlobal(&F, TM));
+ OutStreamer->SwitchSection(MF->getSection());
+
+ if (!MAI->hasVisibilityOnlyWithLinkage())
+ emitVisibility(CurrentFnSym, F.getVisibility());
- if (MAI->needsFunctionDescriptors() &&
- F.getLinkage() != GlobalValue::InternalLinkage)
- EmitLinkage(&F, CurrentFnDescSym);
+ if (MAI->needsFunctionDescriptors())
+ emitLinkage(&F, CurrentFnDescSym);
- EmitLinkage(&F, CurrentFnSym);
+ emitLinkage(&F, CurrentFnSym);
if (MAI->hasFunctionAlignment())
- EmitAlignment(MF->getAlignment(), &F);
+ emitAlignment(MF->getAlignment(), &F);
if (MAI->hasDotTypeDotSizeDirective())
- OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_ELF_TypeFunction);
+ OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_ELF_TypeFunction);
if (F.hasFnAttribute(Attribute::Cold))
- OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_Cold);
+ OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_Cold);
if (isVerbose()) {
F.printAsOperand(OutStreamer->GetCommentOS(),
/*PrintType=*/false, F.getParent());
+ emitFunctionHeaderComment();
OutStreamer->GetCommentOS() << '\n';
}
@@ -695,44 +710,57 @@ void AsmPrinter::EmitFunctionHeader() {
// and use the .alt_entry attribute to mark the function's real entry point
// as an alternative entry point to the prefix-data symbol.
MCSymbol *PrefixSym = OutContext.createLinkerPrivateTempSymbol();
- OutStreamer->EmitLabel(PrefixSym);
+ OutStreamer->emitLabel(PrefixSym);
- EmitGlobalConstant(F.getParent()->getDataLayout(), F.getPrefixData());
+ emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrefixData());
// Emit an .alt_entry directive for the actual function symbol.
- OutStreamer->EmitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
+ OutStreamer->emitSymbolAttribute(CurrentFnSym, MCSA_AltEntry);
} else {
- EmitGlobalConstant(F.getParent()->getDataLayout(), F.getPrefixData());
+ emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrefixData());
}
}
+ // Emit M NOPs for -fpatchable-function-entry=N,M where M>0. We arbitrarily
+ // place prefix data before NOPs.
+ unsigned PatchableFunctionPrefix = 0;
+ unsigned PatchableFunctionEntry = 0;
+ (void)F.getFnAttribute("patchable-function-prefix")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionPrefix);
+ (void)F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionEntry);
+ if (PatchableFunctionPrefix) {
+ CurrentPatchableFunctionEntrySym =
+ OutContext.createLinkerPrivateTempSymbol();
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ emitNops(PatchableFunctionPrefix);
+ } else if (PatchableFunctionEntry) {
+ // May be reassigned when emitting the body, to reference the label after
+ // the initial BTI (AArch64) or endbr32/endbr64 (x86).
+ CurrentPatchableFunctionEntrySym = CurrentFnBegin;
+ }
+
// Emit the function descriptor. This is a virtual function to allow targets
- // to emit their specific function descriptor.
+ // to emit their specific function descriptor. Right now it is only used by
+ // the AIX target. The PowerPC 64-bit V1 ELF target also uses function
+ // descriptors and should be converted to use this hook as well.
if (MAI->needsFunctionDescriptors())
- EmitFunctionDescriptor();
+ emitFunctionDescriptor();
// Emit the CurrentFnSym. This is a virtual function to allow targets to do
// their wild and crazy things as required.
- EmitFunctionEntryLabel();
-
- // If the function had address-taken blocks that got deleted, then we have
- // references to the dangling symbols. Emit them at the start of the function
- // so that we don't get references to undefined symbols.
- std::vector<MCSymbol*> DeadBlockSyms;
- MMI->takeDeletedSymbolsForFunction(&F, DeadBlockSyms);
- for (unsigned i = 0, e = DeadBlockSyms.size(); i != e; ++i) {
- OutStreamer->AddComment("Address taken block that was later removed");
- OutStreamer->EmitLabel(DeadBlockSyms[i]);
- }
+ emitFunctionEntryLabel();
if (CurrentFnBegin) {
if (MAI->useAssignmentForEHBegin()) {
MCSymbol *CurPos = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(CurPos);
- OutStreamer->EmitAssignment(CurrentFnBegin,
+ OutStreamer->emitLabel(CurPos);
+ OutStreamer->emitAssignment(CurrentFnBegin,
MCSymbolRefExpr::create(CurPos, OutContext));
} else {
- OutStreamer->EmitLabel(CurrentFnBegin);
+ OutStreamer->emitLabel(CurrentFnBegin);
}
}
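
Earlier in this hunk, -fpatchable-function-entry=N,M is handled by reading two string function attributes. A standalone sketch of that parsing pattern; the attribute names come from the hunk, and readPatchableCounts is a hypothetical helper:

// Sketch only: attribute parsing for patchable-function prefix/entry NOPs.
#include "llvm/IR/Function.h"

using namespace llvm;

void readPatchableCounts(const Function &F, unsigned &Prefix, unsigned &Entry) {
  Prefix = Entry = 0;
  // StringRef::getAsInteger returns true on failure and leaves the result
  // untouched, so a missing attribute keeps the default of 0.
  (void)F.getFnAttribute("patchable-function-prefix")
      .getValueAsString()
      .getAsInteger(10, Prefix);
  (void)F.getFnAttribute("patchable-function-entry")
      .getValueAsString()
      .getAsInteger(10, Entry);
}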
@@ -745,12 +773,12 @@ void AsmPrinter::EmitFunctionHeader() {
// Emit the prologue data.
if (F.hasPrologueData())
- EmitGlobalConstant(F.getParent()->getDataLayout(), F.getPrologueData());
+ emitGlobalConstant(F.getParent()->getDataLayout(), F.getPrologueData());
}
/// EmitFunctionEntryLabel - Emit the label that is the entrypoint for the
/// function. This can be overridden by targets as required to do custom stuff.
-void AsmPrinter::EmitFunctionEntryLabel() {
+void AsmPrinter::emitFunctionEntryLabel() {
CurrentFnSym->redefineIfPossible();
// The function label could have already been emitted if two symbols end up
@@ -762,7 +790,13 @@ void AsmPrinter::EmitFunctionEntryLabel() {
report_fatal_error("'" + Twine(CurrentFnSym->getName()) +
"' label emitted multiple times to assembly file");
- return OutStreamer->EmitLabel(CurrentFnSym);
+ OutStreamer->emitLabel(CurrentFnSym);
+
+ if (TM.getTargetTriple().isOSBinFormatELF()) {
+ MCSymbol *Sym = getSymbolPreferLocal(MF->getFunction());
+ if (Sym != CurrentFnSym)
+ OutStreamer->emitLabel(Sym);
+ }
}
/// emitComments - Pretty-print comments for instructions.
@@ -842,7 +876,7 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
OS << " <- ";
// The second operand is only an offset if it's an immediate.
- bool MemLoc = MI->getOperand(0).isReg() && MI->getOperand(1).isImm();
+ bool MemLoc = MI->isIndirectDebugValue();
int64_t Offset = MemLoc ? MI->getOperand(1).getImm() : 0;
const DIExpression *Expr = MI->getDebugExpression();
if (Expr->getNumElements()) {
@@ -861,11 +895,11 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
}
// Register or immediate value. Register 0 means undef.
- if (MI->getOperand(0).isFPImm()) {
- APFloat APF = APFloat(MI->getOperand(0).getFPImm()->getValueAPF());
- if (MI->getOperand(0).getFPImm()->getType()->isFloatTy()) {
+ if (MI->getDebugOperand(0).isFPImm()) {
+ APFloat APF = APFloat(MI->getDebugOperand(0).getFPImm()->getValueAPF());
+ if (MI->getDebugOperand(0).getFPImm()->getType()->isFloatTy()) {
OS << (double)APF.convertToFloat();
- } else if (MI->getOperand(0).getFPImm()->getType()->isDoubleTy()) {
+ } else if (MI->getDebugOperand(0).getFPImm()->getType()->isDoubleTy()) {
OS << APF.convertToDouble();
} else {
// There is no good way to print long double. Convert a copy to
@@ -875,23 +909,23 @@ static bool emitDebugValueComment(const MachineInstr *MI, AsmPrinter &AP) {
&ignored);
OS << "(long double) " << APF.convertToDouble();
}
- } else if (MI->getOperand(0).isImm()) {
- OS << MI->getOperand(0).getImm();
- } else if (MI->getOperand(0).isCImm()) {
- MI->getOperand(0).getCImm()->getValue().print(OS, false /*isSigned*/);
- } else if (MI->getOperand(0).isTargetIndex()) {
- auto Op = MI->getOperand(0);
+ } else if (MI->getDebugOperand(0).isImm()) {
+ OS << MI->getDebugOperand(0).getImm();
+ } else if (MI->getDebugOperand(0).isCImm()) {
+ MI->getDebugOperand(0).getCImm()->getValue().print(OS, false /*isSigned*/);
+ } else if (MI->getDebugOperand(0).isTargetIndex()) {
+ auto Op = MI->getDebugOperand(0);
OS << "!target-index(" << Op.getIndex() << "," << Op.getOffset() << ")";
return true;
} else {
- unsigned Reg;
- if (MI->getOperand(0).isReg()) {
- Reg = MI->getOperand(0).getReg();
+ Register Reg;
+ if (MI->getDebugOperand(0).isReg()) {
+ Reg = MI->getDebugOperand(0).getReg();
} else {
- assert(MI->getOperand(0).isFI() && "Unknown operand type");
+ assert(MI->getDebugOperand(0).isFI() && "Unknown operand type");
const TargetFrameLowering *TFI = AP.MF->getSubtarget().getFrameLowering();
- Offset += TFI->getFrameIndexReference(*AP.MF,
- MI->getOperand(0).getIndex(), Reg);
+ Offset += TFI->getFrameIndexReference(
+ *AP.MF, MI->getDebugOperand(0).getIndex(), Reg);
MemLoc = true;
}
if (Reg == 0) {
@@ -985,7 +1019,7 @@ void AsmPrinter::emitFrameAlloc(const MachineInstr &MI) {
int FrameOffset = MI.getOperand(1).getImm();
// Emit a symbol assignment.
- OutStreamer->EmitAssignment(FrameAllocSym,
+ OutStreamer->emitAssignment(FrameAllocSym,
MCConstantExpr::create(FrameOffset, OutContext));
}
@@ -1008,15 +1042,15 @@ void AsmPrinter::emitStackSizeSection(const MachineFunction &MF) {
const MCSymbol *FunctionSymbol = getFunctionBegin();
uint64_t StackSize = FrameInfo.getStackSize();
- OutStreamer->EmitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
- OutStreamer->EmitULEB128IntValue(StackSize);
+ OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+ OutStreamer->emitULEB128IntValue(StackSize);
OutStreamer->PopSection();
}
-static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF,
- MachineModuleInfo *MMI) {
- if (!MF.getLandingPads().empty() || MF.hasEHFunclets() || MMI->hasDebugInfo())
+static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF) {
+ MachineModuleInfo &MMI = MF.getMMI();
+ if (!MF.getLandingPads().empty() || MF.hasEHFunclets() || MMI.hasDebugInfo())
return true;
// We might emit an EH table that uses function begin and end labels even if
@@ -1029,11 +1063,11 @@ static bool needFuncLabelsForEHOrDebugInfo(const MachineFunction &MF,
/// EmitFunctionBody - This method emits the body and trailer for a
/// function.
-void AsmPrinter::EmitFunctionBody() {
- EmitFunctionHeader();
+void AsmPrinter::emitFunctionBody() {
+ emitFunctionHeader();
// Emit target-specific gunk before the function body.
- EmitFunctionBodyStart();
+ emitFunctionBodyStart();
bool ShouldPrintDebugScopes = MMI->hasDebugInfo();
@@ -1058,9 +1092,10 @@ void AsmPrinter::EmitFunctionBody() {
// Print out code for the function.
bool HasAnyRealCode = false;
int NumInstsInFunction = 0;
+
for (auto &MBB : *MF) {
// Print a label for the basic block.
- EmitBasicBlockStart(MBB);
+ emitBasicBlockStart(MBB);
for (auto &MI : MBB) {
// Print the assembly for the instruction.
if (!MI.isPosition() && !MI.isImplicitDef() && !MI.isKill() &&
@@ -1071,7 +1106,7 @@ void AsmPrinter::EmitFunctionBody() {
// If there is a pre-instruction symbol, emit a label for it here.
if (MCSymbol *S = MI.getPreInstrSymbol())
- OutStreamer->EmitLabel(S);
+ OutStreamer->emitLabel(S);
if (ShouldPrintDebugScopes) {
for (const HandlerInfo &HI : Handlers) {
@@ -1095,22 +1130,22 @@ void AsmPrinter::EmitFunctionBody() {
case TargetOpcode::ANNOTATION_LABEL:
case TargetOpcode::EH_LABEL:
case TargetOpcode::GC_LABEL:
- OutStreamer->EmitLabel(MI.getOperand(0).getMCSymbol());
+ OutStreamer->emitLabel(MI.getOperand(0).getMCSymbol());
break;
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR:
- EmitInlineAsm(&MI);
+ emitInlineAsm(&MI);
break;
case TargetOpcode::DBG_VALUE:
if (isVerbose()) {
if (!emitDebugValueComment(&MI, *this))
- EmitInstruction(&MI);
+ emitInstruction(&MI);
}
break;
case TargetOpcode::DBG_LABEL:
if (isVerbose()) {
if (!emitDebugLabelComment(&MI, *this))
- EmitInstruction(&MI);
+ emitInstruction(&MI);
}
break;
case TargetOpcode::IMPLICIT_DEF:
@@ -1120,13 +1155,13 @@ void AsmPrinter::EmitFunctionBody() {
if (isVerbose()) emitKill(&MI, *this);
break;
default:
- EmitInstruction(&MI);
+ emitInstruction(&MI);
break;
}
// If there is a post-instruction symbol, emit a label for it here.
if (MCSymbol *S = MI.getPostInstrSymbol())
- OutStreamer->EmitLabel(S);
+ OutStreamer->emitLabel(S);
if (ShouldPrintDebugScopes) {
for (const HandlerInfo &HI : Handlers) {
@@ -1138,7 +1173,44 @@ void AsmPrinter::EmitFunctionBody() {
}
}
- EmitBasicBlockEnd(MBB);
+ // We need a temporary symbol for the end of this basic block, if either we
+ // have BBLabels enabled and we want to emit size directive for the BBs, or
+ // if this basic blocks marks the end of a section (except the section
+ // containing the entry basic block as the end symbol for that section is
+ // CurrentFnEnd).
+ MCSymbol *CurrentBBEnd = nullptr;
+ if ((MAI->hasDotTypeDotSizeDirective() && MF->hasBBLabels()) ||
+ (MBB.isEndSection() && !MBB.sameSection(&MF->front()))) {
+ CurrentBBEnd = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(CurrentBBEnd);
+ }
+
+ // Helper for emitting the size directive associated with a basic block
+ // symbol.
+ auto emitELFSizeDirective = [&](MCSymbol *SymForSize) {
+ assert(CurrentBBEnd && "Basicblock end symbol not set!");
+ const MCExpr *SizeExp = MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurrentBBEnd, OutContext),
+ MCSymbolRefExpr::create(SymForSize, OutContext), OutContext);
+ OutStreamer->emitELFSize(SymForSize, SizeExp);
+ };
+
+ // Emit size directive for the size of each basic block, if BBLabels is
+ // enabled.
+ if (MAI->hasDotTypeDotSizeDirective() && MF->hasBBLabels())
+ emitELFSizeDirective(MBB.getSymbol());
+
+ // Emit size directive for the size of each basic block section once we
+ // get to the end of that section.
+ if (MBB.isEndSection()) {
+ if (!MBB.sameSection(&MF->front())) {
+ if (MAI->hasDotTypeDotSizeDirective())
+ emitELFSizeDirective(CurrentSectionBeginSym);
+ MBBSectionRanges[MBB.getSectionIDNum()] =
+ MBBSectionRange{CurrentSectionBeginSym, CurrentBBEnd};
+ }
+ }
+ emitBasicBlockEnd(MBB);
}
EmittedInsts += NumInstsInFunction;
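
The per-block and per-section .size directives added above are built as symbol-difference expressions. A brief sketch of that MCExpr construction, assuming an MCContext, an MCStreamer, and two labels that have already been emitted:

// Sketch only: the size-expression construction used by the lambda above.
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"

using namespace llvm;

void emitSizeOfRange(MCStreamer &Streamer, MCContext &Ctx,
                     MCSymbol *Begin, MCSymbol *End) {
  const MCExpr *Size = MCBinaryExpr::createSub(
      MCSymbolRefExpr::create(End, Ctx),
      MCSymbolRefExpr::create(Begin, Ctx), Ctx);
  Streamer.emitELFSize(Begin, Size); // .size Begin, End-Begin
}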
@@ -1167,10 +1239,13 @@ void AsmPrinter::EmitFunctionBody() {
// unspecified.
if (Noop.getOpcode()) {
OutStreamer->AddComment("avoids zero-length function");
- OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+ emitNops(1);
}
}
+ // Switch to the original section in case basic block sections was used.
+ OutStreamer->SwitchSection(MF->getSection());
+
const Function &F = MF->getFunction();
for (const auto &BB : F) {
if (!BB.hasAddressTaken())
@@ -1179,17 +1254,17 @@ void AsmPrinter::EmitFunctionBody() {
if (Sym->isDefined())
continue;
OutStreamer->AddComment("Address of block that was removed by CodeGen");
- OutStreamer->EmitLabel(Sym);
+ OutStreamer->emitLabel(Sym);
}
// Emit target-specific gunk after the function body.
- EmitFunctionBodyEnd();
+ emitFunctionBodyEnd();
- if (needFuncLabelsForEHOrDebugInfo(*MF, MMI) ||
+ if (needFuncLabelsForEHOrDebugInfo(*MF) ||
MAI->hasDotTypeDotSizeDirective()) {
// Create a symbol for the end of function.
CurrentFnEnd = createTempSymbol("func_end");
- OutStreamer->EmitLabel(CurrentFnEnd);
+ OutStreamer->emitLabel(CurrentFnEnd);
}
// If the target wants a .size directive for the size of the function, emit
@@ -1209,8 +1284,11 @@ void AsmPrinter::EmitFunctionBody() {
HI.Handler->markFunctionEnd();
}
+ MBBSectionRanges[MF->front().getSectionIDNum()] =
+ MBBSectionRange{CurrentFnBegin, CurrentFnEnd};
+
// Print out jump tables referenced by the function.
- EmitJumpTableInfo();
+ emitJumpTableInfo();
// Emit post-function debug and/or EH information.
for (const HandlerInfo &HI : Handlers) {
@@ -1306,7 +1384,7 @@ void AsmPrinter::emitGlobalGOTEquivs() {
GlobalGOTEquivs.clear();
for (auto *GV : FailedCandidates)
- EmitGlobalVariable(GV);
+ emitGlobalVariable(GV);
}
void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
@@ -1314,9 +1392,9 @@ void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
MCSymbol *Name = getSymbol(&GIS);
if (GIS.hasExternalLinkage() || !MAI->getWeakRefDirective())
- OutStreamer->EmitSymbolAttribute(Name, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(Name, MCSA_Global);
else if (GIS.hasWeakLinkage() || GIS.hasLinkOnceLinkage())
- OutStreamer->EmitSymbolAttribute(Name, MCSA_WeakReference);
+ OutStreamer->emitSymbolAttribute(Name, MCSA_WeakReference);
else
assert(GIS.hasLocalLinkage() && "Invalid alias or ifunc linkage");
@@ -1333,19 +1411,22 @@ void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
// Set the symbol type to function if the alias has a function type.
// This affects codegen when the aliasee is not a function.
if (IsFunction)
- OutStreamer->EmitSymbolAttribute(Name, isa<GlobalIFunc>(GIS)
+ OutStreamer->emitSymbolAttribute(Name, isa<GlobalIFunc>(GIS)
? MCSA_ELF_TypeIndFunction
: MCSA_ELF_TypeFunction);
- EmitVisibility(Name, GIS.getVisibility());
+ emitVisibility(Name, GIS.getVisibility());
const MCExpr *Expr = lowerConstant(GIS.getIndirectSymbol());
if (isa<GlobalAlias>(&GIS) && MAI->hasAltEntry() && isa<MCBinaryExpr>(Expr))
- OutStreamer->EmitSymbolAttribute(Name, MCSA_AltEntry);
+ OutStreamer->emitSymbolAttribute(Name, MCSA_AltEntry);
// Emit the directives as assignments aka .set:
- OutStreamer->EmitAssignment(Name, Expr);
+ OutStreamer->emitAssignment(Name, Expr);
+ MCSymbol *LocalAlias = getSymbolPreferLocal(GIS);
+ if (LocalAlias != Name)
+ OutStreamer->emitAssignment(LocalAlias, Expr);
if (auto *GA = dyn_cast<GlobalAlias>(&GIS)) {
// If the aliasee does not correspond to a symbol in the output, i.e. the
@@ -1363,7 +1444,7 @@ void AsmPrinter::emitGlobalIndirectSymbol(Module &M,
}
}
-void AsmPrinter::emitRemarksSection(RemarkStreamer &RS) {
+void AsmPrinter::emitRemarksSection(remarks::RemarkStreamer &RS) {
if (!RS.needsSection())
return;
@@ -1388,7 +1469,7 @@ void AsmPrinter::emitRemarksSection(RemarkStreamer &RS) {
OutContext.getObjectFileInfo()->getRemarksSection();
OutStreamer->SwitchSection(RemarksSection);
- OutStreamer->EmitBinaryData(OS.str());
+ OutStreamer->emitBinaryData(OS.str());
}
bool AsmPrinter::doFinalization(Module &M) {
@@ -1405,31 +1486,51 @@ bool AsmPrinter::doFinalization(Module &M) {
// Emit global variables.
for (const auto &G : M.globals())
- EmitGlobalVariable(&G);
+ emitGlobalVariable(&G);
// Emit remaining GOT equivalent globals.
emitGlobalGOTEquivs();
- // Emit visibility info for declarations
+ const TargetLoweringObjectFile &TLOF = getObjFileLowering();
+
+ // Emit linkage(XCOFF) and visibility info for declarations
for (const Function &F : M) {
if (!F.isDeclarationForLinker())
continue;
- GlobalValue::VisibilityTypes V = F.getVisibility();
- if (V == GlobalValue::DefaultVisibility)
- continue;
MCSymbol *Name = getSymbol(&F);
- EmitVisibility(Name, V, false);
+ // Function getSymbol gives us the function descriptor symbol for XCOFF.
+
+ if (!TM.getTargetTriple().isOSBinFormatXCOFF()) {
+ GlobalValue::VisibilityTypes V = F.getVisibility();
+ if (V == GlobalValue::DefaultVisibility)
+ continue;
+
+ emitVisibility(Name, V, false);
+ continue;
+ }
+
+ if (F.isIntrinsic())
+ continue;
+
+ // Handle the XCOFF case.
+ // Variable `Name` is the function descriptor symbol (see above). Get the
+ // function entry point symbol.
+ MCSymbol *FnEntryPointSym = TLOF.getFunctionEntryPointSymbol(&F, TM);
+ if (cast<MCSymbolXCOFF>(FnEntryPointSym)->hasRepresentedCsectSet())
+ // Emit linkage for the function entry point.
+ emitLinkage(&F, FnEntryPointSym);
+
+ // Emit linkage for the function descriptor.
+ emitLinkage(&F, Name);
}
// Emit the remarks section contents.
// FIXME: Figure out when is the safest time to emit this section. It should
// not come after debug info.
- if (RemarkStreamer *RS = M.getContext().getRemarkStreamer())
+ if (remarks::RemarkStreamer *RS = M.getContext().getMainRemarkStreamer())
emitRemarksSection(*RS);
- const TargetLoweringObjectFile &TLOF = getObjFileLowering();
-
TLOF.emitModuleMetadata(*OutStreamer, M);
if (TM.getTargetTriple().isOSBinFormatELF()) {
@@ -1441,10 +1542,10 @@ bool AsmPrinter::doFinalization(Module &M) {
OutStreamer->SwitchSection(TLOF.getDataSection());
const DataLayout &DL = M.getDataLayout();
- EmitAlignment(Align(DL.getPointerSize()));
+ emitAlignment(Align(DL.getPointerSize()));
for (const auto &Stub : Stubs) {
- OutStreamer->EmitLabel(Stub.first);
- OutStreamer->EmitSymbolValue(Stub.second.getPointer(),
+ OutStreamer->emitLabel(Stub.first);
+ OutStreamer->emitSymbolValue(Stub.second.getPointer(),
DL.getPointerSize());
}
}
@@ -1468,10 +1569,10 @@ bool AsmPrinter::doFinalization(Module &M) {
COFF::IMAGE_SCN_LNK_COMDAT,
SectionKind::getReadOnly(), Stub.first->getName(),
COFF::IMAGE_COMDAT_SELECT_ANY));
- EmitAlignment(Align(DL.getPointerSize()));
- OutStreamer->EmitSymbolAttribute(Stub.first, MCSA_Global);
- OutStreamer->EmitLabel(Stub.first);
- OutStreamer->EmitSymbolValue(Stub.second.getPointer(),
+ emitAlignment(Align(DL.getPointerSize()));
+ OutStreamer->emitSymbolAttribute(Stub.first, MCSA_Global);
+ OutStreamer->emitLabel(Stub.first);
+ OutStreamer->emitSymbolValue(Stub.second.getPointer(),
DL.getPointerSize());
}
}
@@ -1497,7 +1598,7 @@ bool AsmPrinter::doFinalization(Module &M) {
for (const auto &GO : M.global_objects()) {
if (!GO.hasExternalWeakLinkage())
continue;
- OutStreamer->EmitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference);
+ OutStreamer->emitSymbolAttribute(getSymbol(&GO), MCSA_WeakReference);
}
}
@@ -1528,25 +1629,25 @@ bool AsmPrinter::doFinalization(Module &M) {
MP->finishAssembly(M, *MI, *this);
// Emit llvm.ident metadata in an '.ident' directive.
- EmitModuleIdents(M);
+ emitModuleIdents(M);
// Emit bytes for llvm.commandline metadata.
- EmitModuleCommandLines(M);
+ emitModuleCommandLines(M);
// Emit __morestack address if needed for indirect calls.
if (MMI->usesMorestackAddr()) {
- unsigned Align = 1;
+ Align Alignment(1);
MCSection *ReadOnlySection = getObjFileLowering().getSectionForConstant(
getDataLayout(), SectionKind::getReadOnly(),
- /*C=*/nullptr, Align);
+ /*C=*/nullptr, Alignment);
OutStreamer->SwitchSection(ReadOnlySection);
MCSymbol *AddrSymbol =
OutContext.getOrCreateSymbol(StringRef("__morestack_addr"));
- OutStreamer->EmitLabel(AddrSymbol);
+ OutStreamer->emitLabel(AddrSymbol);
unsigned PtrSize = MAI->getCodePointerSize();
- OutStreamer->EmitSymbolValue(GetExternalSymbolSymbol("__morestack"),
+ OutStreamer->emitSymbolValue(GetExternalSymbolSymbol("__morestack"),
PtrSize);
}
@@ -1578,7 +1679,7 @@ bool AsmPrinter::doFinalization(Module &M) {
OS.flush();
if (!Flags.empty()) {
OutStreamer->SwitchSection(TLOF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
+ OutStreamer->emitBytes(Flags);
}
Flags.clear();
}
@@ -1604,7 +1705,7 @@ bool AsmPrinter::doFinalization(Module &M) {
if (!Flags.empty()) {
OutStreamer->SwitchSection(TLOF.getDrectveSection());
- OutStreamer->EmitBytes(Flags);
+ OutStreamer->emitBytes(Flags);
}
Flags.clear();
}
@@ -1614,12 +1715,12 @@ bool AsmPrinter::doFinalization(Module &M) {
if (TM.Options.EmitAddrsig) {
// Emit address-significance attributes for all globals.
- OutStreamer->EmitAddrsig();
+ OutStreamer->emitAddrsig();
for (const GlobalValue &GV : M.global_values())
if (!GV.use_empty() && !GV.isThreadLocal() &&
!GV.hasDLLImportStorageClass() && !GV.getName().startswith("llvm.") &&
!GV.hasAtLeastLocalUnnamedAddr())
- OutStreamer->EmitAddrsigSym(getSymbol(&GV));
+ OutStreamer->emitAddrsigSym(getSymbol(&GV));
}
// Emit symbol partition specifications (ELF only).
@@ -1630,11 +1731,12 @@ bool AsmPrinter::doFinalization(Module &M) {
GV.getVisibility() != GlobalValue::DefaultVisibility)
continue;
- OutStreamer->SwitchSection(OutContext.getELFSection(
- ".llvm_sympart", ELF::SHT_LLVM_SYMPART, 0, 0, "", ++UniqueID));
- OutStreamer->EmitBytes(GV.getPartition());
- OutStreamer->EmitZeros(1);
- OutStreamer->EmitValue(
+ OutStreamer->SwitchSection(
+ OutContext.getELFSection(".llvm_sympart", ELF::SHT_LLVM_SYMPART, 0, 0,
+ "", ++UniqueID, nullptr));
+ OutStreamer->emitBytes(GV.getPartition());
+ OutStreamer->emitZeros(1);
+ OutStreamer->emitValue(
MCSymbolRefExpr::create(getSymbol(&GV), OutContext),
MAI->getCodePointerSize());
}
@@ -1642,7 +1744,7 @@ bool AsmPrinter::doFinalization(Module &M) {
// Allow the target to emit any magic that it wants at the end of the file,
// after everything else has gone out.
- EmitEndOfAsmFile(M);
+ emitEndOfAsmFile(M);
MMI = nullptr;
@@ -1665,30 +1767,31 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
const Function &F = MF.getFunction();
// Get the function symbol.
- if (MAI->needsFunctionDescriptors()) {
- assert(TM.getTargetTriple().isOSAIX() && "Function descriptor is only"
- " supported on AIX.");
+ if (!MAI->needsFunctionDescriptors()) {
+ CurrentFnSym = getSymbol(&MF.getFunction());
+ } else {
+ assert(TM.getTargetTriple().isOSAIX() &&
+ "Only AIX uses the function descriptor hooks.");
+ // AIX is unique here in that the name of the symbol emitted for the
+ // function body does not have the same name as the source function's
+ // C-linkage name.
assert(CurrentFnDescSym && "The function descriptor symbol needs to be"
- " initalized first.");
+ " initalized first.");
// Get the function entry point symbol.
- CurrentFnSym =
- OutContext.getOrCreateSymbol("." + CurrentFnDescSym->getName());
-
- MCSectionXCOFF *FnEntryPointSec =
- cast<MCSectionXCOFF>(getObjFileLowering().SectionForGlobal(&F, TM));
- // Set the containing csect.
- cast<MCSymbolXCOFF>(CurrentFnSym)->setContainingCsect(FnEntryPointSec);
- } else {
- CurrentFnSym = getSymbol(&MF.getFunction());
+ CurrentFnSym = getObjFileLowering().getFunctionEntryPointSymbol(&F, TM);
}
CurrentFnSymForSize = CurrentFnSym;
CurrentFnBegin = nullptr;
+ CurrentSectionBeginSym = nullptr;
+ MBBSectionRanges.clear();
CurExceptionSym = nullptr;
bool NeedsLocalForSize = MAI->needsLocalForSize();
if (F.hasFnAttribute("patchable-function-entry") ||
- needFuncLabelsForEHOrDebugInfo(MF, MMI) || NeedsLocalForSize ||
+ F.hasFnAttribute("function-instrument") ||
+ F.hasFnAttribute("xray-instruction-threshold") ||
+ needFuncLabelsForEHOrDebugInfo(MF) || NeedsLocalForSize ||
MF.getTarget().Options.EmitStackSizeSection) {
CurrentFnBegin = createTempSymbol("func_begin");
if (NeedsLocalForSize)
@@ -1696,13 +1799,6 @@ void AsmPrinter::SetupMachineFunction(MachineFunction &MF) {
}
ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
- PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- MBFI = (PSI && PSI->hasProfileSummary()) ?
- // ORE conditionally computes MBFI. If available, use it, otherwise
- // request it.
- (ORE->getBFI() ? ORE->getBFI() :
- &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()) :
- nullptr;
}
namespace {
@@ -1710,10 +1806,10 @@ namespace {
// Keep track the alignment, constpool entries per Section.
struct SectionCPs {
MCSection *S;
- unsigned Alignment;
+ Align Alignment;
SmallVector<unsigned, 4> CPEs;
- SectionCPs(MCSection *s, unsigned a) : S(s), Alignment(a) {}
+ SectionCPs(MCSection *s, Align a) : S(s), Alignment(a) {}
};
} // end anonymous namespace
@@ -1722,7 +1818,7 @@ namespace {
/// representations of the constants in the constant pool MCP. This is
/// used to print out constants which have been "spilled to memory" by
/// the code generator.
-void AsmPrinter::EmitConstantPool() {
+void AsmPrinter::emitConstantPool() {
const MachineConstantPool *MCP = MF->getConstantPool();
const std::vector<MachineConstantPoolEntry> &CP = MCP->getConstants();
if (CP.empty()) return;
@@ -1732,7 +1828,7 @@ void AsmPrinter::EmitConstantPool() {
SmallVector<SectionCPs, 4> CPSections;
for (unsigned i = 0, e = CP.size(); i != e; ++i) {
const MachineConstantPoolEntry &CPE = CP[i];
- unsigned Align = CPE.getAlignment();
+ Align Alignment = CPE.getAlign();
SectionKind Kind = CPE.getSectionKind(&getDataLayout());
@@ -1740,8 +1836,8 @@ void AsmPrinter::EmitConstantPool() {
if (!CPE.isMachineConstantPoolEntry())
C = CPE.Val.ConstVal;
- MCSection *S = getObjFileLowering().getSectionForConstant(getDataLayout(),
- Kind, C, Align);
+ MCSection *S = getObjFileLowering().getSectionForConstant(
+ getDataLayout(), Kind, C, Alignment);
// The number of sections are small, just do a linear search from the
// last section to the first.
@@ -1755,11 +1851,11 @@ void AsmPrinter::EmitConstantPool() {
}
if (!Found) {
SecIdx = CPSections.size();
- CPSections.push_back(SectionCPs(S, Align));
+ CPSections.push_back(SectionCPs(S, Alignment));
}
- if (Align > CPSections[SecIdx].Alignment)
- CPSections[SecIdx].Alignment = Align;
+ if (Alignment > CPSections[SecIdx].Alignment)
+ CPSections[SecIdx].Alignment = Alignment;
CPSections[SecIdx].CPEs.push_back(i);
}
@@ -1773,14 +1869,9 @@ void AsmPrinter::EmitConstantPool() {
if (!Sym->isUndefined())
continue;
- if (TM.getTargetTriple().isOSBinFormatXCOFF()) {
- cast<MCSymbolXCOFF>(Sym)->setContainingCsect(
- cast<MCSectionXCOFF>(CPSections[i].S));
- }
-
if (CurSection != CPSections[i].S) {
OutStreamer->SwitchSection(CPSections[i].S);
- EmitAlignment(Align(CPSections[i].Alignment));
+ emitAlignment(Align(CPSections[i].Alignment));
CurSection = CPSections[i].S;
Offset = 0;
}
@@ -1788,25 +1879,24 @@ void AsmPrinter::EmitConstantPool() {
MachineConstantPoolEntry CPE = CP[CPI];
// Emit inter-object padding for alignment.
- unsigned AlignMask = CPE.getAlignment() - 1;
- unsigned NewOffset = (Offset + AlignMask) & ~AlignMask;
- OutStreamer->EmitZeros(NewOffset - Offset);
+ unsigned NewOffset = alignTo(Offset, CPE.getAlign());
+ OutStreamer->emitZeros(NewOffset - Offset);
Type *Ty = CPE.getType();
Offset = NewOffset + getDataLayout().getTypeAllocSize(Ty);
- OutStreamer->EmitLabel(Sym);
+ OutStreamer->emitLabel(Sym);
if (CPE.isMachineConstantPoolEntry())
- EmitMachineConstantPoolValue(CPE.Val.MachineCPVal);
+ emitMachineConstantPoolValue(CPE.Val.MachineCPVal);
else
- EmitGlobalConstant(getDataLayout(), CPE.Val.ConstVal);
+ emitGlobalConstant(getDataLayout(), CPE.Val.ConstVal);
}
}
}
-/// EmitJumpTableInfo - Print assembly representations of the jump tables used
-/// by the current function to the current output stream.
-void AsmPrinter::EmitJumpTableInfo() {
+// Print assembly representations of the jump tables used by the current
+// function.
+void AsmPrinter::emitJumpTableInfo() {
const DataLayout &DL = MF->getDataLayout();
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
if (!MJTI) return;
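
The constant-pool padding above replaces the manual mask arithmetic with llvm::alignTo over the new Align type. A tiny sketch showing the two forms agree; the values are illustrative:

// Sketch only: old mask arithmetic vs. the alignTo call used above.
#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  unsigned Offset = 10;
  llvm::Align A(8);

  unsigned Mask = A.value() - 1;
  unsigned Old = (Offset + Mask) & ~Mask; // pattern removed in the hunk

  unsigned New = static_cast<unsigned>(llvm::alignTo(Offset, A)); // replacement

  assert(Old == New && New == 16);
  return Old == New ? 0 : 1;
}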
@@ -1827,12 +1917,12 @@ void AsmPrinter::EmitJumpTableInfo() {
OutStreamer->SwitchSection(ReadOnlySection);
}
- EmitAlignment(Align(MJTI->getEntryAlignment(DL)));
+ emitAlignment(Align(MJTI->getEntryAlignment(DL)));
// Jump tables in code sections are marked with a data_region directive
// where that's supported.
if (!JTInDiffSection)
- OutStreamer->EmitDataRegion(MCDR_DataRegionJT32);
+ OutStreamer->emitDataRegion(MCDR_DataRegionJT32);
for (unsigned JTI = 0, e = JT.size(); JTI != e; ++JTI) {
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
@@ -1855,7 +1945,7 @@ void AsmPrinter::EmitJumpTableInfo() {
// .set LJTSet, LBB32-base
const MCExpr *LHS =
MCSymbolRefExpr::create(MBB->getSymbol(), OutContext);
- OutStreamer->EmitAssignment(GetJTSetSymbol(JTI, MBB->getNumber()),
+ OutStreamer->emitAssignment(GetJTSetSymbol(JTI, MBB->getNumber()),
MCBinaryExpr::createSub(LHS, Base,
OutContext));
}
@@ -1869,25 +1959,21 @@ void AsmPrinter::EmitJumpTableInfo() {
// FIXME: This doesn't have to have any specific name, just any randomly
// named and numbered local label started with 'l' would work. Simplify
// GetJTISymbol.
- OutStreamer->EmitLabel(GetJTISymbol(JTI, true));
+ OutStreamer->emitLabel(GetJTISymbol(JTI, true));
MCSymbol* JTISymbol = GetJTISymbol(JTI);
- if (TM.getTargetTriple().isOSBinFormatXCOFF()) {
- cast<MCSymbolXCOFF>(JTISymbol)->setContainingCsect(
- cast<MCSectionXCOFF>(TLOF.getSectionForJumpTable(F, TM)));
- }
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
for (unsigned ii = 0, ee = JTBBs.size(); ii != ee; ++ii)
- EmitJumpTableEntry(MJTI, JTBBs[ii], JTI);
+ emitJumpTableEntry(MJTI, JTBBs[ii], JTI);
}
if (!JTInDiffSection)
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
}
/// EmitJumpTableEntry - Emit a jump table entry for the specified MBB to the
/// current stream.
-void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
+void AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB,
unsigned UID) const {
assert(MBB && MBB->getNumber() >= 0 && "Invalid basic block");
@@ -1909,7 +1995,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
// with a relocation as gp-relative, e.g.:
// .gprel32 LBB123
MCSymbol *MBBSym = MBB->getSymbol();
- OutStreamer->EmitGPRel32Value(MCSymbolRefExpr::create(MBBSym, OutContext));
+ OutStreamer->emitGPRel32Value(MCSymbolRefExpr::create(MBBSym, OutContext));
return;
}
@@ -1918,7 +2004,7 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
// with a relocation as gp-relative, e.g.:
// .gpdword LBB123
MCSymbol *MBBSym = MBB->getSymbol();
- OutStreamer->EmitGPRel64Value(MCSymbolRefExpr::create(MBBSym, OutContext));
+ OutStreamer->emitGPRel64Value(MCSymbolRefExpr::create(MBBSym, OutContext));
return;
}
@@ -1946,16 +2032,16 @@ void AsmPrinter::EmitJumpTableEntry(const MachineJumpTableInfo *MJTI,
assert(Value && "Unknown entry kind!");
unsigned EntrySize = MJTI->getEntrySize(getDataLayout());
- OutStreamer->EmitValue(Value, EntrySize);
+ OutStreamer->emitValue(Value, EntrySize);
}
/// EmitSpecialLLVMGlobal - Check to see if the specified global is a
/// special global used by LLVM. If so, emit it and return true, otherwise
/// do nothing and return false.
-bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
+bool AsmPrinter::emitSpecialLLVMGlobal(const GlobalVariable *GV) {
if (GV->getName() == "llvm.used") {
if (MAI->hasNoDeadStrip()) // No need to emit this at all.
- EmitLLVMUsedList(cast<ConstantArray>(GV->getInitializer()));
+ emitLLVMUsedList(cast<ConstantArray>(GV->getInitializer()));
return true;
}
@@ -1969,14 +2055,14 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
assert(GV->hasInitializer() && "Not a special LLVM global!");
if (GV->getName() == "llvm.global_ctors") {
- EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ emitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
/* isCtor */ true);
return true;
}
if (GV->getName() == "llvm.global_dtors") {
- EmitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
+ emitXXStructorList(GV->getParent()->getDataLayout(), GV->getInitializer(),
/* isCtor */ false);
return true;
@@ -1987,13 +2073,13 @@ bool AsmPrinter::EmitSpecialLLVMGlobal(const GlobalVariable *GV) {
/// EmitLLVMUsedList - For targets that define a MAI::UsedDirective, mark each
/// global in the specified llvm.used list.
-void AsmPrinter::EmitLLVMUsedList(const ConstantArray *InitList) {
+void AsmPrinter::emitLLVMUsedList(const ConstantArray *InitList) {
// Should be an array of 'i8*'.
for (unsigned i = 0, e = InitList->getNumOperands(); i != e; ++i) {
const GlobalValue *GV =
dyn_cast<GlobalValue>(InitList->getOperand(i)->stripPointerCasts());
if (GV)
- OutStreamer->EmitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
+ OutStreamer->emitSymbolAttribute(getSymbol(GV), MCSA_NoDeadStrip);
}
}
@@ -2011,27 +2097,16 @@ struct Structor {
/// EmitXXStructorList - Emit the ctor or dtor list taking into account the init
/// priority.
-void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
+void AsmPrinter::emitXXStructorList(const DataLayout &DL, const Constant *List,
bool isCtor) {
// Should be an array of '{ i32, void ()*, i8* }' structs. The first value is the
// init priority.
if (!isa<ConstantArray>(List)) return;
- // Sanity check the structors list.
- const ConstantArray *InitList = dyn_cast<ConstantArray>(List);
- if (!InitList) return; // Not an array!
- StructType *ETy = dyn_cast<StructType>(InitList->getType()->getElementType());
- if (!ETy || ETy->getNumElements() != 3 ||
- !isa<IntegerType>(ETy->getTypeAtIndex(0U)) ||
- !isa<PointerType>(ETy->getTypeAtIndex(1U)) ||
- !isa<PointerType>(ETy->getTypeAtIndex(2U)))
- return; // Not (int, ptr, ptr).
-
// Gather the structors in a form that's convenient for sorting by priority.
SmallVector<Structor, 8> Structors;
- for (Value *O : InitList->operands()) {
- ConstantStruct *CS = dyn_cast<ConstantStruct>(O);
- if (!CS) continue; // Malformed.
+ for (Value *O : cast<ConstantArray>(List)->operands()) {
+ auto *CS = cast<ConstantStruct>(O);
if (CS->getOperand(1)->isNullValue())
break; // Found a null terminator, skip the rest.
ConstantInt *Priority = dyn_cast<ConstantInt>(CS->getOperand(0));
@@ -2069,12 +2144,12 @@ void AsmPrinter::EmitXXStructorList(const DataLayout &DL, const Constant *List,
: Obj.getStaticDtorSection(S.Priority, KeySym));
OutStreamer->SwitchSection(OutputSection);
if (OutStreamer->getCurrentSection() != OutStreamer->getPreviousSection())
- EmitAlignment(Align);
- EmitXXStructor(DL, S.Func);
+ emitAlignment(Align);
+ emitXXStructor(DL, S.Func);
}
}
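
The structor list walked above is an array of { i32 priority, void ()* func, i8* associated-data } entries; the hunk drops the redundant shape check and gathers the entries for a priority sort. A simplified stand-in (plain C++ types, not the LLVM IR classes) showing the gather-and-sort step:

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    // Stand-in for one llvm.global_ctors entry: { i32 priority, void ()* fn, i8* data }.
    struct Structor {
      int Priority;
      void (*Func)();
    };

    static void initA() { std::puts("A"); }
    static void initB() { std::puts("B"); }

    int main() {
      std::vector<Structor> Structors = {{65535, initA}, {101, initB}};
      // Lower priority numbers run first, matching how the ctor list is ordered.
      std::stable_sort(Structors.begin(), Structors.end(),
                       [](const Structor &L, const Structor &R) {
                         return L.Priority < R.Priority;
                       });
      for (const Structor &S : Structors)
        S.Func(); // prints "B" then "A"
    }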
-void AsmPrinter::EmitModuleIdents(Module &M) {
+void AsmPrinter::emitModuleIdents(Module &M) {
if (!MAI->hasIdentDirective())
return;
@@ -2084,12 +2159,12 @@ void AsmPrinter::EmitModuleIdents(Module &M) {
assert(N->getNumOperands() == 1 &&
"llvm.ident metadata entry can have only one operand");
const MDString *S = cast<MDString>(N->getOperand(0));
- OutStreamer->EmitIdent(S->getString());
+ OutStreamer->emitIdent(S->getString());
}
}
}
-void AsmPrinter::EmitModuleCommandLines(Module &M) {
+void AsmPrinter::emitModuleCommandLines(Module &M) {
MCSection *CommandLine = getObjFileLowering().getSectionForCommandLines();
if (!CommandLine)
return;
@@ -2100,14 +2175,14 @@ void AsmPrinter::EmitModuleCommandLines(Module &M) {
OutStreamer->PushSection();
OutStreamer->SwitchSection(CommandLine);
- OutStreamer->EmitZeros(1);
+ OutStreamer->emitZeros(1);
for (unsigned i = 0, e = NMD->getNumOperands(); i != e; ++i) {
const MDNode *N = NMD->getOperand(i);
assert(N->getNumOperands() == 1 &&
"llvm.commandline metadata entry can have only one operand");
const MDString *S = cast<MDString>(N->getOperand(0));
- OutStreamer->EmitBytes(S->getString());
- OutStreamer->EmitZeros(1);
+ OutStreamer->emitBytes(S->getString());
+ OutStreamer->emitZeros(1);
}
OutStreamer->PopSection();
}
@@ -2118,29 +2193,23 @@ void AsmPrinter::EmitModuleCommandLines(Module &M) {
/// Emit a byte directive and value.
///
-void AsmPrinter::emitInt8(int Value) const {
- OutStreamer->EmitIntValue(Value, 1);
-}
+void AsmPrinter::emitInt8(int Value) const { OutStreamer->emitInt8(Value); }
/// Emit a short directive and value.
-void AsmPrinter::emitInt16(int Value) const {
- OutStreamer->EmitIntValue(Value, 2);
-}
+void AsmPrinter::emitInt16(int Value) const { OutStreamer->emitInt16(Value); }
/// Emit a long directive and value.
-void AsmPrinter::emitInt32(int Value) const {
- OutStreamer->EmitIntValue(Value, 4);
-}
+void AsmPrinter::emitInt32(int Value) const { OutStreamer->emitInt32(Value); }
/// Emit a long long directive and value.
void AsmPrinter::emitInt64(uint64_t Value) const {
- OutStreamer->EmitIntValue(Value, 8);
+ OutStreamer->emitInt64(Value);
}
/// Emit something like ".long Hi-Lo" where the size in bytes of the directive
/// is specified by Size and Hi/Lo specify the labels. This implicitly uses
/// .set if it avoids relocations.
-void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
+void AsmPrinter::emitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
unsigned Size) const {
OutStreamer->emitAbsoluteSymbolDiff(Hi, Lo, Size);
}
@@ -2148,13 +2217,13 @@ void AsmPrinter::EmitLabelDifference(const MCSymbol *Hi, const MCSymbol *Lo,
/// EmitLabelPlusOffset - Emit something like ".long Label+Offset"
/// where the size in bytes of the directive is specified by Size and Label
/// specifies the label. This implicitly uses .set if it is available.
-void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
+void AsmPrinter::emitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
unsigned Size,
bool IsSectionRelative) const {
if (MAI->needsDwarfSectionOffsetDirective() && IsSectionRelative) {
OutStreamer->EmitCOFFSecRel32(Label, Offset);
if (Size > 4)
- OutStreamer->EmitZeros(Size - 4);
+ OutStreamer->emitZeros(Size - 4);
return;
}
@@ -2164,7 +2233,7 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
Expr = MCBinaryExpr::createAdd(
Expr, MCConstantExpr::create(Offset, OutContext), OutContext);
- OutStreamer->EmitValue(Expr, Size);
+ OutStreamer->emitValue(Expr, Size);
}
//===----------------------------------------------------------------------===//
@@ -2173,17 +2242,17 @@ void AsmPrinter::EmitLabelPlusOffset(const MCSymbol *Label, uint64_t Offset,
// two boundary. If a global value is specified, and if that global has
// an explicit alignment requested, it will override the alignment request
// if required for correctness.
-void AsmPrinter::EmitAlignment(Align Alignment, const GlobalObject *GV) const {
+void AsmPrinter::emitAlignment(Align Alignment, const GlobalObject *GV) const {
if (GV)
Alignment = getGVAlignment(GV, GV->getParent()->getDataLayout(), Alignment);
- if (Alignment == Align::None())
+ if (Alignment == Align(1))
return; // 1-byte aligned: no need to emit alignment.
if (getCurrentSection()->getKind().isText())
- OutStreamer->EmitCodeAlignment(Alignment.value());
+ OutStreamer->emitCodeAlignment(Alignment.value());
else
- OutStreamer->EmitValueToAlignment(Alignment.value());
+ OutStreamer->emitValueToAlignment(Alignment.value());
}
//===----------------------------------------------------------------------===//
@@ -2211,23 +2280,22 @@ const MCExpr *AsmPrinter::lowerConstant(const Constant *CV) {
}
switch (CE->getOpcode()) {
- default:
+ default: {
// If the code isn't optimized, there may be outstanding folding
// opportunities. Attempt to fold the expression using DataLayout as a
// last resort before giving up.
- if (Constant *C = ConstantFoldConstant(CE, getDataLayout()))
- if (C != CE)
- return lowerConstant(C);
+ Constant *C = ConstantFoldConstant(CE, getDataLayout());
+ if (C != CE)
+ return lowerConstant(C);
// Otherwise report the problem to the user.
- {
- std::string S;
- raw_string_ostream OS(S);
- OS << "Unsupported expression in static initializer: ";
- CE->printAsOperand(OS, /*PrintType=*/false,
- !MF ? nullptr : MF->getFunction().getParent());
- report_fatal_error(OS.str());
- }
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/false,
+ !MF ? nullptr : MF->getFunction().getParent());
+ report_fatal_error(OS.str());
+ }
case Instruction::GetElementPtr: {
// Generate a symbolic expression for the byte address
APInt OffsetAI(getDataLayout().getPointerTypeSizeInBits(CE->getType()), 0);
@@ -2413,7 +2481,7 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
// If this can be emitted with .ascii/.asciz, emit it as such.
if (CDS->isString())
- return AP.OutStreamer->EmitBytes(CDS->getAsString());
+ return AP.OutStreamer->emitBytes(CDS->getAsString());
// Otherwise, emit the values in successive locations.
unsigned ElementByteSize = CDS->getElementByteSize();
@@ -2422,7 +2490,7 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
if (AP.isVerbose())
AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n",
CDS->getElementAsInteger(i));
- AP.OutStreamer->EmitIntValue(CDS->getElementAsInteger(i),
+ AP.OutStreamer->emitIntValue(CDS->getElementAsInteger(i),
ElementByteSize);
}
} else {
@@ -2432,11 +2500,11 @@ static void emitGlobalConstantDataSequential(const DataLayout &DL,
}
unsigned Size = DL.getTypeAllocSize(CDS->getType());
- unsigned EmittedSize = DL.getTypeAllocSize(CDS->getType()->getElementType()) *
- CDS->getNumElements();
+ unsigned EmittedSize =
+ DL.getTypeAllocSize(CDS->getElementType()) * CDS->getNumElements();
assert(EmittedSize <= Size && "Size cannot be less than EmittedSize!");
if (unsigned Padding = Size - EmittedSize)
- AP.OutStreamer->EmitZeros(Padding);
+ AP.OutStreamer->emitZeros(Padding);
}
static void emitGlobalConstantArray(const DataLayout &DL,
@@ -2467,7 +2535,7 @@ static void emitGlobalConstantVector(const DataLayout &DL,
unsigned EmittedSize = DL.getTypeAllocSize(CV->getType()->getElementType()) *
CV->getType()->getNumElements();
if (unsigned Padding = Size - EmittedSize)
- AP.OutStreamer->EmitZeros(Padding);
+ AP.OutStreamer->emitZeros(Padding);
}
static void emitGlobalConstantStruct(const DataLayout &DL,
@@ -2492,7 +2560,7 @@ static void emitGlobalConstantStruct(const DataLayout &DL,
// Insert padding - this may include padding to increase the size of the
// current field up to the ABI size (if the struct is not packed) as well
// as padding to ensure that the next field starts at the right offset.
- AP.OutStreamer->EmitZeros(PadSize);
+ AP.OutStreamer->emitZeros(PadSize);
}
assert(SizeSoFar == Layout->getSizeInBytes() &&
"Layout of constant struct may be incorrect!");
@@ -2524,22 +2592,22 @@ static void emitGlobalConstantFP(APFloat APF, Type *ET, AsmPrinter &AP) {
int Chunk = API.getNumWords() - 1;
if (TrailingBytes)
- AP.OutStreamer->EmitIntValue(p[Chunk--], TrailingBytes);
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk--], TrailingBytes);
for (; Chunk >= 0; --Chunk)
- AP.OutStreamer->EmitIntValue(p[Chunk], sizeof(uint64_t));
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk], sizeof(uint64_t));
} else {
unsigned Chunk;
for (Chunk = 0; Chunk < NumBytes / sizeof(uint64_t); ++Chunk)
- AP.OutStreamer->EmitIntValue(p[Chunk], sizeof(uint64_t));
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk], sizeof(uint64_t));
if (TrailingBytes)
- AP.OutStreamer->EmitIntValue(p[Chunk], TrailingBytes);
+ AP.OutStreamer->emitIntValueInHexWithPadding(p[Chunk], TrailingBytes);
}
// Emit the tail padding for the long double.
const DataLayout &DL = AP.getDataLayout();
- AP.OutStreamer->EmitZeros(DL.getTypeAllocSize(ET) - DL.getTypeStoreSize(ET));
+ AP.OutStreamer->emitZeros(DL.getTypeAllocSize(ET) - DL.getTypeStoreSize(ET));
}
static void emitGlobalConstantFP(const ConstantFP *CFP, AsmPrinter &AP) {
@@ -2570,9 +2638,10 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
// [chunk1][chunk2] ... [chunkN].
// The most significant chunk is chunkN and it should be emitted first.
// However, due to the alignment issue chunkN contains useless bits.
- // Realign the chunks so that they contain only useless information:
+ // Realign the chunks so that they contain only useful information:
// ExtraBits 0 1 (BitWidth / 64) - 1
// chu[nk1 chu][nk2 chu] ... [nkN-1 chunkN]
+ ExtraBitsSize = alignTo(ExtraBitsSize, 8);
ExtraBits = Realigned.getRawData()[0] &
(((uint64_t)-1) >> (64 - ExtraBitsSize));
Realigned.lshrInPlace(ExtraBitsSize);
@@ -2586,19 +2655,19 @@ static void emitGlobalConstantLargeInt(const ConstantInt *CI, AsmPrinter &AP) {
const uint64_t *RawData = Realigned.getRawData();
for (unsigned i = 0, e = BitWidth / 64; i != e; ++i) {
uint64_t Val = DL.isBigEndian() ? RawData[e - i - 1] : RawData[i];
- AP.OutStreamer->EmitIntValue(Val, 8);
+ AP.OutStreamer->emitIntValue(Val, 8);
}
if (ExtraBitsSize) {
// Emit the extra bits after the 64-bits chunks.
// Emit a directive that fills the expected size.
- uint64_t Size = AP.getDataLayout().getTypeAllocSize(CI->getType());
+ uint64_t Size = AP.getDataLayout().getTypeStoreSize(CI->getType());
Size -= (BitWidth / 64) * 8;
assert(Size && Size * 8 >= ExtraBitsSize &&
(ExtraBits & (((uint64_t)-1) >> (64 - ExtraBitsSize)))
== ExtraBits && "Directive too small for extra bits.");
- AP.OutStreamer->EmitIntValue(ExtraBits, Size);
+ AP.OutStreamer->emitIntValue(ExtraBits, Size);
}
}
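
The chunk realignment above handles integers whose bit width is not a multiple of 64: full 64-bit words are emitted first and the leftover bits go out through a directive sized from the store size (with the extra-bits count now rounded up to a byte boundary). A worked example for an i72 constant, as a standalone sketch with illustrative values:

    #include <cassert>
    #include <cstdint>

    int main() {
      unsigned BitWidth = 72;
      unsigned ExtraBitsSize = BitWidth % 64;              // 8 leftover bits
      ExtraBitsSize = (ExtraBitsSize + 7) & ~7u;           // round up to a byte boundary
      unsigned FullWords = BitWidth / 64;                  // one full 64-bit chunk
      unsigned StoreSize = (BitWidth + 7) / 8;             // 9 bytes actually stored
      unsigned TailDirectiveSize = StoreSize - FullWords * 8; // 1-byte tail directive
      assert(ExtraBitsSize == 8 && FullWords == 1 && TailDirectiveSize == 1);
      uint64_t TopChunk = 0xAB;                            // pretend raw data of the top word
      uint64_t ExtraBits = TopChunk & ((~0ull) >> (64 - ExtraBitsSize));
      assert(ExtraBits == 0xAB);                           // only the useful bits remain
    }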
@@ -2705,30 +2774,32 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
BaseCV = dyn_cast<Constant>(CV->user_back());
if (isa<ConstantAggregateZero>(CV) || isa<UndefValue>(CV))
- return AP.OutStreamer->EmitZeros(Size);
+ return AP.OutStreamer->emitZeros(Size);
if (const ConstantInt *CI = dyn_cast<ConstantInt>(CV)) {
- switch (Size) {
- case 1:
- case 2:
- case 4:
- case 8:
+ const uint64_t StoreSize = DL.getTypeStoreSize(CV->getType());
+
+ if (StoreSize < 8) {
if (AP.isVerbose())
AP.OutStreamer->GetCommentOS() << format("0x%" PRIx64 "\n",
CI->getZExtValue());
- AP.OutStreamer->EmitIntValue(CI->getZExtValue(), Size);
- return;
- default:
+ AP.OutStreamer->emitIntValue(CI->getZExtValue(), StoreSize);
+ } else {
emitGlobalConstantLargeInt(CI, AP);
- return;
}
+
+ // Emit tail padding if needed
+ if (Size != StoreSize)
+ AP.OutStreamer->emitZeros(Size - StoreSize);
+
+ return;
}
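
The new ConstantInt path keys off the store size and pads up to the alloc size, instead of only handling the power-of-two sizes 1/2/4/8. Illustrative numbers for an i24 global under a typical data layout (the exact alloc size depends on the target's DataLayout):

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t AllocSize = 4; // space the symbol occupies, including padding
      uint64_t StoreSize = 3; // bytes that actually hold the value
      // Emit StoreSize bytes of payload, then pad out to AllocSize.
      uint64_t TailPadding = AllocSize - StoreSize;
      assert(TailPadding == 1);
    }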
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(CV))
return emitGlobalConstantFP(CFP, AP);
if (isa<ConstantPointerNull>(CV)) {
- AP.OutStreamer->EmitIntValue(0, Size);
+ AP.OutStreamer->emitIntValue(0, Size);
return;
}
@@ -2752,7 +2823,7 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
// to emit the value in chunks. Try to constant fold the value and emit it
// that way.
Constant *New = ConstantFoldConstant(CE, DL);
- if (New && New != CE)
+ if (New != CE)
return emitGlobalConstantImpl(DL, New, AP);
}
}
@@ -2770,22 +2841,22 @@ static void emitGlobalConstantImpl(const DataLayout &DL, const Constant *CV,
if (AP.getObjFileLowering().supportIndirectSymViaGOTPCRel())
handleIndirectSymViaGOTPCRel(AP, &ME, BaseCV, Offset);
- AP.OutStreamer->EmitValue(ME, Size);
+ AP.OutStreamer->emitValue(ME, Size);
}
/// EmitGlobalConstant - Print a general LLVM constant to the .s file.
-void AsmPrinter::EmitGlobalConstant(const DataLayout &DL, const Constant *CV) {
+void AsmPrinter::emitGlobalConstant(const DataLayout &DL, const Constant *CV) {
uint64_t Size = DL.getTypeAllocSize(CV->getType());
if (Size)
emitGlobalConstantImpl(DL, CV, *this);
else if (MAI->hasSubsectionsViaSymbols()) {
// If the global has zero size, emit a single byte so that two labels don't
// look like they are at the same location.
- OutStreamer->EmitIntValue(0, 1);
+ OutStreamer->emitIntValue(0, 1);
}
}
-void AsmPrinter::EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+void AsmPrinter::emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
// Target doesn't support this yet!
llvm_unreachable("Target does not support EmitMachineConstantPoolValue");
}
@@ -2797,6 +2868,13 @@ void AsmPrinter::printOffset(int64_t Offset, raw_ostream &OS) const {
OS << Offset;
}
+void AsmPrinter::emitNops(unsigned N) {
+ MCInst Nop;
+ MF->getSubtarget().getInstrInfo()->getNoop(Nop);
+ for (; N; --N)
+ EmitToStreamer(*OutStreamer, Nop);
+}
+
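
A rough sketch of what the new emitNops() helper provides: ask the target for its canonical no-op and stream it N times. The 0x90 byte below is just the single-byte x86 nop used for illustration; the real helper queries the target's InstrInfo rather than hard-coding an encoding:

    #include <cstdio>
    #include <vector>

    static void emitNopsSketch(std::vector<unsigned char> &Out, unsigned N) {
      for (; N; --N)
        Out.push_back(0x90); // illustrative x86 nop; targets supply their own
    }

    int main() {
      std::vector<unsigned char> Buffer;
      emitNopsSketch(Buffer, 4);
      std::printf("emitted %zu nop bytes\n", Buffer.size());
    }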
//===----------------------------------------------------------------------===//
// Symbol Lowering Routines.
//===----------------------------------------------------------------------===//
@@ -2822,12 +2900,13 @@ MCSymbol *AsmPrinter::GetCPISymbol(unsigned CPID) const {
const DataLayout &DL = MF->getDataLayout();
SectionKind Kind = CPE.getSectionKind(&DL);
const Constant *C = CPE.Val.ConstVal;
- unsigned Align = CPE.Alignment;
+ Align Alignment = CPE.Alignment;
if (const MCSectionCOFF *S = dyn_cast<MCSectionCOFF>(
- getObjFileLowering().getSectionForConstant(DL, Kind, C, Align))) {
+ getObjFileLowering().getSectionForConstant(DL, Kind, C,
+ Alignment))) {
if (MCSymbol *Sym = S->getCOMDATSymbol()) {
if (Sym->isUndefined())
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Global);
return Sym;
}
}
@@ -2929,10 +3008,10 @@ static void emitBasicBlockLoopComments(const MachineBasicBlock &MBB,
PrintChildLoopComment(OS, Loop, AP.getFunctionNumber());
}
-/// EmitBasicBlockStart - This method prints the label for the specified
+/// emitBasicBlockStart - This method prints the label for the specified
/// MachineBasicBlock, an alignment (if present) and a comment describing
/// it if appropriate.
-void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
+void AsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
// End the previous funclet and start a new one.
if (MBB.isEHFuncletEntry()) {
for (const HandlerInfo &HI : Handlers) {
@@ -2943,8 +3022,8 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
// Emit an alignment directive for this block, if needed.
const Align Alignment = MBB.getAlignment();
- if (Alignment != Align::None())
- EmitAlignment(Alignment);
+ if (Alignment != Align(1))
+ emitAlignment(Alignment);
// If the block has its address taken, emit any labels that were used to
// reference the block. It is possible that there is more than one label
@@ -2959,7 +3038,7 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
// their corresponding BB's address taken in IR
if (BB->hasAddressTaken())
for (MCSymbol *Sym : MMI->getAddrLabelSymbolToEmit(BB))
- OutStreamer->EmitLabel(Sym);
+ OutStreamer->emitLabel(Sym);
}
// Print some verbose block comments.
@@ -2976,25 +3055,44 @@ void AsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
emitBasicBlockLoopComments(MBB, MLI, *this);
}
- // Print the main label for the block.
if (MBB.pred_empty() ||
- (isBlockOnlyReachableByFallthrough(&MBB) && !MBB.isEHFuncletEntry() &&
- !MBB.hasLabelMustBeEmitted())) {
+ (!MF->hasBBLabels() && isBlockOnlyReachableByFallthrough(&MBB) &&
+ !MBB.isEHFuncletEntry() && !MBB.hasLabelMustBeEmitted())) {
if (isVerbose()) {
// NOTE: Want this comment at start of line, don't emit with AddComment.
OutStreamer->emitRawComment(" %bb." + Twine(MBB.getNumber()) + ":",
false);
}
} else {
- if (isVerbose() && MBB.hasLabelMustBeEmitted())
+ if (isVerbose() && MBB.hasLabelMustBeEmitted()) {
OutStreamer->AddComment("Label of block must be emitted");
- OutStreamer->EmitLabel(MBB.getSymbol());
+ }
+ auto *BBSymbol = MBB.getSymbol();
+ // Switch to a new section if this basic block must begin a section.
+ if (MBB.isBeginSection()) {
+ OutStreamer->SwitchSection(
+ getObjFileLowering().getSectionForMachineBasicBlock(MF->getFunction(),
+ MBB, TM));
+ CurrentSectionBeginSym = BBSymbol;
+ }
+ OutStreamer->emitLabel(BBSymbol);
+ // With BB sections, each basic block must handle CFI information on its own
+ // if it begins a section.
+ if (MBB.isBeginSection())
+ for (const HandlerInfo &HI : Handlers)
+ HI.Handler->beginBasicBlock(MBB);
}
}
-void AsmPrinter::EmitBasicBlockEnd(const MachineBasicBlock &MBB) {}
+void AsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
+ // Check if CFI information needs to be updated for this MBB with basic block
+ // sections.
+ if (MBB.isEndSection())
+ for (const HandlerInfo &HI : Handlers)
+ HI.Handler->endBasicBlock(MBB);
+}
-void AsmPrinter::EmitVisibility(MCSymbol *Sym, unsigned Visibility,
+void AsmPrinter::emitVisibility(MCSymbol *Sym, unsigned Visibility,
bool IsDefinition) const {
MCSymbolAttr Attr = MCSA_Invalid;
@@ -3012,7 +3110,7 @@ void AsmPrinter::EmitVisibility(MCSymbol *Sym, unsigned Visibility,
}
if (Attr != MCSA_Invalid)
- OutStreamer->EmitSymbolAttribute(Sym, Attr);
+ OutStreamer->emitSymbolAttribute(Sym, Attr);
}
/// isBlockOnlyReachableByFallthough - Return true if the basic block has
@@ -3020,6 +3118,10 @@ void AsmPrinter::EmitVisibility(MCSymbol *Sym, unsigned Visibility,
/// the predecessor and this block is a fall-through.
bool AsmPrinter::
isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB) const {
+ // With BasicBlock Sections, beginning of the section is not a fallthrough.
+ if (MBB->isBeginSection())
+ return false;
+
// If this is a landing pad, it isn't a fall through. If it has no preds,
// then nothing falls through to it.
if (MBB->isEHPad() || MBB->pred_empty())
@@ -3069,11 +3171,10 @@ GCMetadataPrinter *AsmPrinter::GetOrCreateGCPrinter(GCStrategy &S) {
auto Name = S.getName();
- for (GCMetadataPrinterRegistry::iterator
- I = GCMetadataPrinterRegistry::begin(),
- E = GCMetadataPrinterRegistry::end(); I != E; ++I)
- if (Name == I->getName()) {
- std::unique_ptr<GCMetadataPrinter> GMP = I->instantiate();
+ for (const GCMetadataPrinterRegistry::entry &GCMetaPrinter :
+ GCMetadataPrinterRegistry::entries())
+ if (Name == GCMetaPrinter.getName()) {
+ std::unique_ptr<GCMetadataPrinter> GMP = GCMetaPrinter.instantiate();
GMP->S = &S;
auto IterBool = GCMap.insert(std::make_pair(&S, std::move(GMP)));
return IterBool.first->second.get();
@@ -3111,18 +3212,15 @@ void AsmPrinterHandler::markFunctionEnd() {}
// In the binary's "xray_instr_map" section, an array of these function entries
// describes each instrumentation point. When XRay patches your code, the index
// into this table will be given to your handler as a patch point identifier.
-void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out,
- const MCSymbol *CurrentFnSym) const {
- Out->EmitSymbolValue(Sled, Bytes);
- Out->EmitSymbolValue(CurrentFnSym, Bytes);
+void AsmPrinter::XRayFunctionEntry::emit(int Bytes, MCStreamer *Out) const {
auto Kind8 = static_cast<uint8_t>(Kind);
- Out->EmitBinaryData(StringRef(reinterpret_cast<const char *>(&Kind8), 1));
- Out->EmitBinaryData(
+ Out->emitBinaryData(StringRef(reinterpret_cast<const char *>(&Kind8), 1));
+ Out->emitBinaryData(
StringRef(reinterpret_cast<const char *>(&AlwaysInstrument), 1));
- Out->EmitBinaryData(StringRef(reinterpret_cast<const char *>(&Version), 1));
+ Out->emitBinaryData(StringRef(reinterpret_cast<const char *>(&Version), 1));
auto Padding = (4 * Bytes) - ((2 * Bytes) + 3);
assert(Padding >= 0 && "Instrumentation map entry > 4 * Word Size");
- Out->EmitZeros(Padding);
+ Out->emitZeros(Padding);
}
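
The padding expression above follows from the sled record layout: two address-sized words (now emitted by the caller), then three metadata bytes (kind, always-instrument, version), padded out to four words. A quick check of that arithmetic for a 64-bit target (illustrative):

    #include <cassert>

    int main() {
      int WordSizeBytes = 8;                 // 64-bit target
      int AddressBytes  = 2 * WordSizeBytes; // sled word + function word
      int MetadataBytes = 3;                 // kind, always-instrument, version
      int Padding = 4 * WordSizeBytes - (AddressBytes + MetadataBytes);
      assert(Padding == 13);                 // entry is padded to 4 words (32 bytes)
    }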
void AsmPrinter::emitXRayTable() {
@@ -3133,28 +3231,34 @@ void AsmPrinter::emitXRayTable() {
const Function &F = MF->getFunction();
MCSection *InstMap = nullptr;
MCSection *FnSledIndex = nullptr;
- if (MF->getSubtarget().getTargetTriple().isOSBinFormatELF()) {
- auto Associated = dyn_cast<MCSymbolELF>(CurrentFnSym);
- assert(Associated != nullptr);
- auto Flags = ELF::SHF_WRITE | ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER;
- std::string GroupName;
+ const Triple &TT = TM.getTargetTriple();
+ // Use PC-relative addresses on all targets except MIPS (MIPS64 cannot use
+ // PC-relative addresses because R_MIPS_PC64 does not exist).
+ bool PCRel = !TT.isMIPS();
+ if (TT.isOSBinFormatELF()) {
+ auto LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
+ auto Flags = ELF::SHF_ALLOC | ELF::SHF_LINK_ORDER;
+ if (!PCRel)
+ Flags |= ELF::SHF_WRITE;
+ StringRef GroupName;
if (F.hasComdat()) {
Flags |= ELF::SHF_GROUP;
GroupName = F.getComdat()->getName();
}
-
- auto UniqueID = ++XRayFnUniqueID;
- InstMap =
- OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS, Flags, 0,
- GroupName, UniqueID, Associated);
- FnSledIndex =
- OutContext.getELFSection("xray_fn_idx", ELF::SHT_PROGBITS, Flags, 0,
- GroupName, UniqueID, Associated);
+ InstMap = OutContext.getELFSection("xray_instr_map", ELF::SHT_PROGBITS,
+ Flags, 0, GroupName,
+ MCSection::NonUniqueID, LinkedToSym);
+
+ if (!TM.Options.XRayOmitFunctionIndex)
+ FnSledIndex = OutContext.getELFSection(
+ "xray_fn_idx", ELF::SHT_PROGBITS, Flags | ELF::SHF_WRITE, 0,
+ GroupName, MCSection::NonUniqueID, LinkedToSym);
} else if (MF->getSubtarget().getTargetTriple().isOSBinFormatMachO()) {
InstMap = OutContext.getMachOSection("__DATA", "xray_instr_map", 0,
SectionKind::getReadOnlyWithRel());
- FnSledIndex = OutContext.getMachOSection("__DATA", "xray_fn_idx", 0,
- SectionKind::getReadOnlyWithRel());
+ if (!TM.Options.XRayOmitFunctionIndex)
+ FnSledIndex = OutContext.getMachOSection(
+ "__DATA", "xray_fn_idx", 0, SectionKind::getReadOnlyWithRel());
} else {
llvm_unreachable("Unsupported target");
}
@@ -3164,23 +3268,46 @@ void AsmPrinter::emitXRayTable() {
// Now we switch to the instrumentation map section. Because this is done
// per-function, we are able to create an index entry that will represent the
// range of sleds associated with a function.
+ auto &Ctx = OutContext;
MCSymbol *SledsStart = OutContext.createTempSymbol("xray_sleds_start", true);
OutStreamer->SwitchSection(InstMap);
- OutStreamer->EmitLabel(SledsStart);
- for (const auto &Sled : Sleds)
- Sled.emit(WordSizeBytes, OutStreamer.get(), CurrentFnSym);
+ OutStreamer->emitLabel(SledsStart);
+ for (const auto &Sled : Sleds) {
+ if (PCRel) {
+ MCSymbol *Dot = Ctx.createTempSymbol();
+ OutStreamer->emitLabel(Dot);
+ OutStreamer->emitValueImpl(
+ MCBinaryExpr::createSub(MCSymbolRefExpr::create(Sled.Sled, Ctx),
+ MCSymbolRefExpr::create(Dot, Ctx), Ctx),
+ WordSizeBytes);
+ OutStreamer->emitValueImpl(
+ MCBinaryExpr::createSub(
+ MCSymbolRefExpr::create(CurrentFnBegin, Ctx),
+ MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(Dot, Ctx),
+ MCConstantExpr::create(WordSizeBytes, Ctx), Ctx),
+ Ctx),
+ WordSizeBytes);
+ } else {
+ OutStreamer->emitSymbolValue(Sled.Sled, WordSizeBytes);
+ OutStreamer->emitSymbolValue(CurrentFnSym, WordSizeBytes);
+ }
+ Sled.emit(WordSizeBytes, OutStreamer.get());
+ }
MCSymbol *SledsEnd = OutContext.createTempSymbol("xray_sleds_end", true);
- OutStreamer->EmitLabel(SledsEnd);
+ OutStreamer->emitLabel(SledsEnd);
// We then emit a single entry in the index per function. We use the symbols
// that bound the instrumentation map as the range for a specific function.
// Each entry here will be 2 * word size aligned, as we're writing down two
// pointers. This should work for both 32-bit and 64-bit platforms.
- OutStreamer->SwitchSection(FnSledIndex);
- OutStreamer->EmitCodeAlignment(2 * WordSizeBytes);
- OutStreamer->EmitSymbolValue(SledsStart, WordSizeBytes, false);
- OutStreamer->EmitSymbolValue(SledsEnd, WordSizeBytes, false);
- OutStreamer->SwitchSection(PrevSection);
+ if (FnSledIndex) {
+ OutStreamer->SwitchSection(FnSledIndex);
+ OutStreamer->emitCodeAlignment(2 * WordSizeBytes);
+ OutStreamer->emitSymbolValue(SledsStart, WordSizeBytes, false);
+ OutStreamer->emitSymbolValue(SledsEnd, WordSizeBytes, false);
+ OutStreamer->SwitchSection(PrevSection);
+ }
Sleds.clear();
}
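
With the PC-relative form above, each map entry stores "Sled - ." and "FnBegin - (. + WordSizeBytes)", so the section needs no absolute relocations and the runtime can reconstruct the addresses from the entry's own location. A sketch of that arithmetic with hypothetical addresses (plain C++, illustrative only):

    #include <cassert>
    #include <cstdint>

    int main() {
      // Hypothetical addresses for one 64-bit (WordSizeBytes == 8) map entry.
      uint64_t EntryAddr = 0x500000; // '.', the address of the entry itself
      uint64_t SledAddr  = 0x401230; // the patchable sled inside the function
      uint64_t FnBegin   = 0x401200; // CurrentFnBegin
      int64_t Word0 = static_cast<int64_t>(SledAddr - EntryAddr);       // "Sled - ."
      int64_t Word1 = static_cast<int64_t>(FnBegin - (EntryAddr + 8));  // "FnBegin - (. + 8)"
      // The absolute addresses are recoverable from the entry's location.
      assert(EntryAddr + Word0 == SledAddr);
      assert((EntryAddr + 8) + Word1 == FnBegin);
    }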
@@ -3199,36 +3326,36 @@ void AsmPrinter::recordSled(MCSymbol *Sled, const MachineInstr &MI,
void AsmPrinter::emitPatchableFunctionEntries() {
const Function &F = MF->getFunction();
- if (!F.hasFnAttribute("patchable-function-entry"))
+ unsigned PatchableFunctionPrefix = 0, PatchableFunctionEntry = 0;
+ (void)F.getFnAttribute("patchable-function-prefix")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionPrefix);
+ (void)F.getFnAttribute("patchable-function-entry")
+ .getValueAsString()
+ .getAsInteger(10, PatchableFunctionEntry);
+ if (!PatchableFunctionPrefix && !PatchableFunctionEntry)
return;
const unsigned PointerSize = getPointerSize();
if (TM.getTargetTriple().isOSBinFormatELF()) {
auto Flags = ELF::SHF_WRITE | ELF::SHF_ALLOC;
+ const MCSymbolELF *LinkedToSym = nullptr;
+ StringRef GroupName;
- // As of binutils 2.33, GNU as does not support section flag "o" or linkage
- // field "unique". Use SHF_LINK_ORDER if we are using the integrated
- // assembler.
+ // GNU as < 2.35 did not support section flag 'o'. Use SHF_LINK_ORDER only
+ // if we are using the integrated assembler.
if (MAI->useIntegratedAssembler()) {
Flags |= ELF::SHF_LINK_ORDER;
- std::string GroupName;
if (F.hasComdat()) {
Flags |= ELF::SHF_GROUP;
GroupName = F.getComdat()->getName();
}
- MCSection *Section = getObjFileLowering().SectionForGlobal(&F, TM);
- unsigned UniqueID =
- PatchableFunctionEntryID
- .try_emplace(Section, PatchableFunctionEntryID.size())
- .first->second;
- OutStreamer->SwitchSection(OutContext.getELFSection(
- "__patchable_function_entries", ELF::SHT_PROGBITS, Flags, 0,
- GroupName, UniqueID, cast<MCSymbolELF>(CurrentFnSym)));
- } else {
- OutStreamer->SwitchSection(OutContext.getELFSection(
- "__patchable_function_entries", ELF::SHT_PROGBITS, Flags));
+ LinkedToSym = cast<MCSymbolELF>(CurrentFnSym);
}
- EmitAlignment(Align(PointerSize));
- OutStreamer->EmitSymbolValue(CurrentFnBegin, PointerSize);
+ OutStreamer->SwitchSection(OutContext.getELFSection(
+ "__patchable_function_entries", ELF::SHT_PROGBITS, Flags, 0, GroupName,
+ MCSection::NonUniqueID, LinkedToSym));
+ emitAlignment(Align(PointerSize));
+ OutStreamer->emitSymbolValue(CurrentPatchableFunctionEntrySym, PointerSize);
}
}
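
The rewritten prologue reads both "patchable-function-prefix" and "patchable-function-entry" as decimal strings and leaves the counts at zero when the attribute is absent or unparsable. A minimal stand-in for that parsing step (not the LLVM Attribute API, just the intended behaviour):

    #include <cassert>
    #include <cstdlib>
    #include <string>

    // A decimal attribute value, or an empty string when the attribute is absent.
    static unsigned parseCount(const std::string &AttrValue) {
      return AttrValue.empty()
                 ? 0u
                 : static_cast<unsigned>(std::strtoul(AttrValue.c_str(), nullptr, 10));
    }

    int main() {
      unsigned PatchableFunctionPrefix = parseCount("");  // attribute not present
      unsigned PatchableFunctionEntry  = parseCount("2"); // "patchable-function-entry"="2"
      // Nothing is emitted when both counts are zero, as in the patch.
      assert(PatchableFunctionPrefix == 0 && PatchableFunctionEntry == 2);
    }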
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
index 992e44d95306..d81a9be26d39 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterDwarf.cpp
@@ -36,22 +36,23 @@ using namespace llvm;
//===----------------------------------------------------------------------===//
/// EmitSLEB128 - emit the specified signed leb128 value.
-void AsmPrinter::EmitSLEB128(int64_t Value, const char *Desc) const {
+void AsmPrinter::emitSLEB128(int64_t Value, const char *Desc) const {
if (isVerbose() && Desc)
OutStreamer->AddComment(Desc);
- OutStreamer->EmitSLEB128IntValue(Value);
+ OutStreamer->emitSLEB128IntValue(Value);
}
-void AsmPrinter::EmitULEB128(uint64_t Value, const char *Desc, unsigned PadTo) const {
+void AsmPrinter::emitULEB128(uint64_t Value, const char *Desc,
+ unsigned PadTo) const {
if (isVerbose() && Desc)
OutStreamer->AddComment(Desc);
- OutStreamer->EmitULEB128IntValue(Value, PadTo);
+ OutStreamer->emitULEB128IntValue(Value, PadTo);
}
/// Emit something like ".uleb128 Hi-Lo".
-void AsmPrinter::EmitLabelDifferenceAsULEB128(const MCSymbol *Hi,
+void AsmPrinter::emitLabelDifferenceAsULEB128(const MCSymbol *Hi,
const MCSymbol *Lo) const {
OutStreamer->emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
}
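
For reference, a minimal sketch of the LEB128 byte format these helpers hand to the streamer, including the PadTo behaviour (pad with 0x80 continuation bytes and a final 0x00 so the padded form still decodes to the same value). This is a simplified re-implementation for illustration, not the library routine itself:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    static std::vector<uint8_t> encodeULEB128Sketch(uint64_t Value, unsigned PadTo = 0) {
      std::vector<uint8_t> Out;
      do {
        uint8_t Byte = Value & 0x7f;
        Value >>= 7;
        if (Value != 0 || Out.size() + 1 < PadTo)
          Byte |= 0x80; // more bytes follow
        Out.push_back(Byte);
      } while (Value != 0);
      while (Out.size() < PadTo)
        Out.push_back(Out.size() + 1 == PadTo ? 0x00 : 0x80);
      return Out;
    }

    int main() {
      // 300 = 0b10_0101100 -> low 7 bits with continuation bit, then the rest.
      assert((encodeULEB128Sketch(300) == std::vector<uint8_t>{0xAC, 0x02}));
      assert(encodeULEB128Sketch(1, 4).size() == 4); // padded form for patching
    }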
@@ -105,7 +106,7 @@ static const char *DecodeDWARFEncoding(unsigned Encoding) {
/// encoding. If verbose assembly output is enabled, we output comments
/// describing the encoding. Desc is an optional string saying what the
/// encoding is specifying (e.g. "LSDA").
-void AsmPrinter::EmitEncodingByte(unsigned Val, const char *Desc) const {
+void AsmPrinter::emitEncodingByte(unsigned Val, const char *Desc) const {
if (isVerbose()) {
if (Desc)
OutStreamer->AddComment(Twine(Desc) + " Encoding = " +
@@ -114,7 +115,7 @@ void AsmPrinter::EmitEncodingByte(unsigned Val, const char *Desc) const {
OutStreamer->AddComment(Twine("Encoding = ") + DecodeDWARFEncoding(Val));
}
- OutStreamer->EmitIntValue(Val, 1);
+ OutStreamer->emitIntValue(Val, 1);
}
/// GetSizeOfEncodedValue - Return the size of the encoding in bytes.
@@ -136,16 +137,16 @@ unsigned AsmPrinter::GetSizeOfEncodedValue(unsigned Encoding) const {
}
}
-void AsmPrinter::EmitTTypeReference(const GlobalValue *GV,
+void AsmPrinter::emitTTypeReference(const GlobalValue *GV,
unsigned Encoding) const {
if (GV) {
const TargetLoweringObjectFile &TLOF = getObjFileLowering();
const MCExpr *Exp =
TLOF.getTTypeGlobalReference(GV, Encoding, TM, MMI, *OutStreamer);
- OutStreamer->EmitValue(Exp, GetSizeOfEncodedValue(Encoding));
+ OutStreamer->emitValue(Exp, GetSizeOfEncodedValue(Encoding));
} else
- OutStreamer->EmitIntValue(0, GetSizeOfEncodedValue(Encoding));
+ OutStreamer->emitIntValue(0, GetSizeOfEncodedValue(Encoding));
}
void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label,
@@ -159,13 +160,13 @@ void AsmPrinter::emitDwarfSymbolReference(const MCSymbol *Label,
// If the format uses relocations with dwarf, refer to the symbol directly.
if (MAI->doesDwarfUseRelocationsAcrossSections()) {
- OutStreamer->EmitSymbolValue(Label, 4);
+ OutStreamer->emitSymbolValue(Label, 4);
return;
}
}
// Otherwise, emit it as a label difference from the start of the section.
- EmitLabelDifference(Label, Label->getSection().getBeginSymbol(), 4);
+ emitLabelDifference(Label, Label->getSection().getBeginSymbol(), 4);
}
void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const {
@@ -179,27 +180,26 @@ void AsmPrinter::emitDwarfStringOffset(DwarfStringPoolEntry S) const {
emitInt32(S.Offset);
}
-void AsmPrinter::EmitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const {
- EmitLabelPlusOffset(Label, Offset, MAI->getCodePointerSize());
+void AsmPrinter::emitDwarfOffset(const MCSymbol *Label, uint64_t Offset) const {
+ // TODO: Support DWARF64
+ emitLabelPlusOffset(Label, Offset, 4);
}
-void AsmPrinter::EmitCallSiteOffset(const MCSymbol *Hi,
- const MCSymbol *Lo,
+void AsmPrinter::emitCallSiteOffset(const MCSymbol *Hi, const MCSymbol *Lo,
unsigned Encoding) const {
// The least significant 3 bits specify the width of the encoding
if ((Encoding & 0x7) == dwarf::DW_EH_PE_uleb128)
- EmitLabelDifferenceAsULEB128(Hi, Lo);
+ emitLabelDifferenceAsULEB128(Hi, Lo);
else
- EmitLabelDifference(Hi, Lo, GetSizeOfEncodedValue(Encoding));
+ emitLabelDifference(Hi, Lo, GetSizeOfEncodedValue(Encoding));
}
-void AsmPrinter::EmitCallSiteValue(uint64_t Value,
- unsigned Encoding) const {
+void AsmPrinter::emitCallSiteValue(uint64_t Value, unsigned Encoding) const {
// The least significant 3 bits specify the width of the encoding
if ((Encoding & 0x7) == dwarf::DW_EH_PE_uleb128)
- EmitULEB128(Value);
+ emitULEB128(Value);
else
- OutStreamer->EmitIntValue(Value, GetSizeOfEncodedValue(Encoding));
+ OutStreamer->emitIntValue(Value, GetSizeOfEncodedValue(Encoding));
}
//===----------------------------------------------------------------------===//
@@ -211,40 +211,43 @@ void AsmPrinter::emitCFIInstruction(const MCCFIInstruction &Inst) const {
default:
llvm_unreachable("Unexpected instruction");
case MCCFIInstruction::OpDefCfaOffset:
- OutStreamer->EmitCFIDefCfaOffset(Inst.getOffset());
+ OutStreamer->emitCFIDefCfaOffset(Inst.getOffset());
break;
case MCCFIInstruction::OpAdjustCfaOffset:
- OutStreamer->EmitCFIAdjustCfaOffset(Inst.getOffset());
+ OutStreamer->emitCFIAdjustCfaOffset(Inst.getOffset());
break;
case MCCFIInstruction::OpDefCfa:
- OutStreamer->EmitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
+ OutStreamer->emitCFIDefCfa(Inst.getRegister(), Inst.getOffset());
break;
case MCCFIInstruction::OpDefCfaRegister:
- OutStreamer->EmitCFIDefCfaRegister(Inst.getRegister());
+ OutStreamer->emitCFIDefCfaRegister(Inst.getRegister());
break;
case MCCFIInstruction::OpOffset:
- OutStreamer->EmitCFIOffset(Inst.getRegister(), Inst.getOffset());
+ OutStreamer->emitCFIOffset(Inst.getRegister(), Inst.getOffset());
break;
case MCCFIInstruction::OpRegister:
- OutStreamer->EmitCFIRegister(Inst.getRegister(), Inst.getRegister2());
+ OutStreamer->emitCFIRegister(Inst.getRegister(), Inst.getRegister2());
break;
case MCCFIInstruction::OpWindowSave:
- OutStreamer->EmitCFIWindowSave();
+ OutStreamer->emitCFIWindowSave();
break;
case MCCFIInstruction::OpNegateRAState:
- OutStreamer->EmitCFINegateRAState();
+ OutStreamer->emitCFINegateRAState();
break;
case MCCFIInstruction::OpSameValue:
- OutStreamer->EmitCFISameValue(Inst.getRegister());
+ OutStreamer->emitCFISameValue(Inst.getRegister());
break;
case MCCFIInstruction::OpGnuArgsSize:
- OutStreamer->EmitCFIGnuArgsSize(Inst.getOffset());
+ OutStreamer->emitCFIGnuArgsSize(Inst.getOffset());
break;
case MCCFIInstruction::OpEscape:
- OutStreamer->EmitCFIEscape(Inst.getValues());
+ OutStreamer->emitCFIEscape(Inst.getValues());
break;
case MCCFIInstruction::OpRestore:
- OutStreamer->EmitCFIRestore(Inst.getRegister());
+ OutStreamer->emitCFIRestore(Inst.getRegister());
+ break;
+ case MCCFIInstruction::OpUndefined:
+ OutStreamer->emitCFIUndefined(Inst.getRegister());
break;
}
}
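
In asm output, each of the streamer calls above ends up as a .cfi_* directive; the patch also starts handling OpUndefined. An illustrative mapping table for a few of the cases (plain C++, directive spellings as printed by the assembly streamer):

    #include <cstdio>

    int main() {
      struct { const char *Op; const char *Directive; } Map[] = {
          {"OpDefCfaOffset",   ".cfi_def_cfa_offset <offset>"},
          {"OpDefCfaRegister", ".cfi_def_cfa_register <reg>"},
          {"OpOffset",         ".cfi_offset <reg>, <offset>"},
          {"OpUndefined",      ".cfi_undefined <reg>"}, // newly handled here
      };
      for (const auto &Entry : Map)
        std::printf("%-18s -> %s\n", Entry.Op, Entry.Directive);
    }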
@@ -256,7 +259,7 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
Twine::utohexstr(Die.getOffset()) + ":0x" +
Twine::utohexstr(Die.getSize()) + " " +
dwarf::TagString(Die.getTag()));
- EmitULEB128(Die.getAbbrevNumber());
+ emitULEB128(Die.getAbbrevNumber());
// Emit the DIE attribute values.
for (const auto &V : Die.values()) {
@@ -271,7 +274,7 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
}
// Emit an attribute using the defined form.
- V.EmitValue(this);
+ V.emitValue(this);
}
// Emit the DIE children if any.
@@ -286,7 +289,7 @@ void AsmPrinter::emitDwarfDIE(const DIE &Die) const {
void AsmPrinter::emitDwarfAbbrev(const DIEAbbrev &Abbrev) const {
// Emit the abbreviations code (base 1 index.)
- EmitULEB128(Abbrev.getNumber(), "Abbreviation Code");
+ emitULEB128(Abbrev.getNumber(), "Abbreviation Code");
// Emit the abbreviations data.
Abbrev.Emit(this);
diff --git a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
index c631cc5360b8..538107cecd8b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/AsmPrinterInlineAsm.cpp
@@ -106,7 +106,7 @@ unsigned AsmPrinter::addInlineAsmDiagBuffer(StringRef AsmStr,
/// EmitInlineAsm - Emit a blob of inline asm to the output streamer.
-void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
+void AsmPrinter::emitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
const MCTargetOptions &MCOptions,
const MDNode *LocMDNode,
InlineAsm::AsmDialect Dialect) const {
@@ -127,7 +127,7 @@ void AsmPrinter::EmitInlineAsm(StringRef Str, const MCSubtargetInfo &STI,
if (!MCAI->useIntegratedAssembler() &&
!OutStreamer->isIntegratedAssemblerRequired()) {
emitInlineAsmStart();
- OutStreamer->EmitRawText(Str);
+ OutStreamer->emitRawText(Str);
emitInlineAsmEnd(STI, nullptr);
return;
}
@@ -489,9 +489,9 @@ static void EmitGCCInlineAsmStr(const char *AsmStr, const MachineInstr *MI,
OS << '\n' << (char)0; // null terminate string.
}
-/// EmitInlineAsm - This method formats and emits the specified machine
-/// instruction that is an inline asm.
-void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
+/// This method formats and emits the specified machine instruction that is an
+/// inline asm.
+void AsmPrinter::emitInlineAsm(const MachineInstr *MI) const {
assert(MI->isInlineAsm() && "printInlineAsm only works on inline asms");
// Count the number of register definitions to find the asm string.
@@ -584,7 +584,7 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
SrcMgr.PrintMessage(Loc, SourceMgr::DK_Note, Note);
}
- EmitInlineAsm(OS.str(), getSubtargetInfo(), TM.Options.MCOptions, LocMD,
+ emitInlineAsm(OS.str(), getSubtargetInfo(), TM.Options.MCOptions, LocMD,
MI->getInlineAsmDialect());
// Emit the #NOAPP end marker. This has to happen even if verbose-asm isn't
@@ -592,7 +592,6 @@ void AsmPrinter::EmitInlineAsm(const MachineInstr *MI) const {
OutStreamer->emitRawComment(MAI->getInlineAsmEnd());
}
-
/// PrintSpecial - Print information related to the specified machine instr
/// that is independent of the operand, and may be independent of the instr
/// itself. This can be useful for portably encoding the comment character
diff --git a/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h b/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
index 09f7496cd4ef..90929a217368 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
+++ b/llvm/lib/CodeGen/AsmPrinter/ByteStreamer.h
@@ -30,8 +30,9 @@ class ByteStreamer {
public:
// For now we're just handling the calls we need for dwarf emission/hashing.
virtual void EmitInt8(uint8_t Byte, const Twine &Comment = "") = 0;
- virtual void EmitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0;
- virtual void EmitULEB128(uint64_t DWord, const Twine &Comment = "", unsigned PadTo = 0) = 0;
+ virtual void emitSLEB128(uint64_t DWord, const Twine &Comment = "") = 0;
+ virtual void emitULEB128(uint64_t DWord, const Twine &Comment = "",
+ unsigned PadTo = 0) = 0;
};
class APByteStreamer final : public ByteStreamer {
@@ -44,13 +45,14 @@ public:
AP.OutStreamer->AddComment(Comment);
AP.emitInt8(Byte);
}
- void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
AP.OutStreamer->AddComment(Comment);
- AP.EmitSLEB128(DWord);
+ AP.emitSLEB128(DWord);
}
- void EmitULEB128(uint64_t DWord, const Twine &Comment, unsigned PadTo) override {
+ void emitULEB128(uint64_t DWord, const Twine &Comment,
+ unsigned PadTo) override {
AP.OutStreamer->AddComment(Comment);
- AP.EmitULEB128(DWord);
+ AP.emitULEB128(DWord, nullptr, PadTo);
}
};
@@ -62,10 +64,11 @@ class HashingByteStreamer final : public ByteStreamer {
void EmitInt8(uint8_t Byte, const Twine &Comment) override {
Hash.update(Byte);
}
- void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
Hash.addSLEB128(DWord);
}
- void EmitULEB128(uint64_t DWord, const Twine &Comment, unsigned PadTo) override {
+ void emitULEB128(uint64_t DWord, const Twine &Comment,
+ unsigned PadTo) override {
Hash.addULEB128(DWord);
}
};
@@ -90,7 +93,7 @@ public:
if (GenerateComments)
Comments.push_back(Comment.str());
}
- void EmitSLEB128(uint64_t DWord, const Twine &Comment) override {
+ void emitSLEB128(uint64_t DWord, const Twine &Comment) override {
raw_svector_ostream OSE(Buffer);
unsigned Length = encodeSLEB128(DWord, OSE);
if (GenerateComments) {
@@ -102,7 +105,8 @@ public:
}
}
- void EmitULEB128(uint64_t DWord, const Twine &Comment, unsigned PadTo) override {
+ void emitULEB128(uint64_t DWord, const Twine &Comment,
+ unsigned PadTo) override {
raw_svector_ostream OSE(Buffer);
unsigned Length = encodeULEB128(DWord, OSE, PadTo);
if (GenerateComments) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
index 62ad356e7f8f..3f053c7a38c7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.cpp
@@ -101,27 +101,27 @@ public:
CVMCAdapter(MCStreamer &OS, TypeCollection &TypeTable)
: OS(&OS), TypeTable(TypeTable) {}
- void EmitBytes(StringRef Data) { OS->EmitBytes(Data); }
+ void emitBytes(StringRef Data) override { OS->emitBytes(Data); }
- void EmitIntValue(uint64_t Value, unsigned Size) {
- OS->EmitIntValueInHex(Value, Size);
+ void emitIntValue(uint64_t Value, unsigned Size) override {
+ OS->emitIntValueInHex(Value, Size);
}
- void EmitBinaryData(StringRef Data) { OS->EmitBinaryData(Data); }
+ void emitBinaryData(StringRef Data) override { OS->emitBinaryData(Data); }
- void AddComment(const Twine &T) { OS->AddComment(T); }
+ void AddComment(const Twine &T) override { OS->AddComment(T); }
- void AddRawComment(const Twine &T) { OS->emitRawComment(T); }
+ void AddRawComment(const Twine &T) override { OS->emitRawComment(T); }
- bool isVerboseAsm() { return OS->isVerboseAsm(); }
+ bool isVerboseAsm() override { return OS->isVerboseAsm(); }
- std::string getTypeName(TypeIndex TI) {
+ std::string getTypeName(TypeIndex TI) override {
std::string TypeName;
if (!TI.isNoneType()) {
if (TI.isSimple())
- TypeName = TypeIndex::simpleTypeName(TI);
+ TypeName = std::string(TypeIndex::simpleTypeName(TI));
else
- TypeName = TypeTable.getTypeName(TI);
+ TypeName = std::string(TypeTable.getTypeName(TI));
}
return TypeName;
}
@@ -183,7 +183,7 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
if (Dir.startswith("/") || Filename.startswith("/")) {
if (llvm::sys::path::is_absolute(Filename, llvm::sys::path::Style::posix))
return Filename;
- Filepath = Dir;
+ Filepath = std::string(Dir);
if (Dir.back() != '/')
Filepath += '/';
Filepath += Filename;
@@ -195,7 +195,7 @@ StringRef CodeViewDebug::getFullFilepath(const DIFile *File) {
// that would increase the IR size and probably not needed for other users.
// For now, just concatenate and canonicalize the path here.
if (Filename.find(':') == 1)
- Filepath = Filename;
+ Filepath = std::string(Filename);
else
Filepath = (Dir + "\\" + Filename).str();
@@ -250,8 +250,15 @@ unsigned CodeViewDebug::maybeRecordFile(const DIFile *F) {
ChecksumAsBytes = ArrayRef<uint8_t>(
reinterpret_cast<const uint8_t *>(CKMem), Checksum.size());
switch (F->getChecksum()->Kind) {
- case DIFile::CSK_MD5: CSKind = FileChecksumKind::MD5; break;
- case DIFile::CSK_SHA1: CSKind = FileChecksumKind::SHA1; break;
+ case DIFile::CSK_MD5:
+ CSKind = FileChecksumKind::MD5;
+ break;
+ case DIFile::CSK_SHA1:
+ CSKind = FileChecksumKind::SHA1;
+ break;
+ case DIFile::CSK_SHA256:
+ CSKind = FileChecksumKind::SHA256;
+ break;
}
}
bool Success = OS.EmitCVFileDirective(NextId, FullPath, ChecksumAsBytes,
@@ -303,12 +310,19 @@ static StringRef getPrettyScopeName(const DIScope *Scope) {
return StringRef();
}
-static const DISubprogram *getQualifiedNameComponents(
+const DISubprogram *CodeViewDebug::collectParentScopeNames(
const DIScope *Scope, SmallVectorImpl<StringRef> &QualifiedNameComponents) {
const DISubprogram *ClosestSubprogram = nullptr;
while (Scope != nullptr) {
if (ClosestSubprogram == nullptr)
ClosestSubprogram = dyn_cast<DISubprogram>(Scope);
+
+ // If a type appears in a scope chain, make sure it gets emitted. The
+ // frontend will be responsible for deciding if this should be a forward
+ // declaration or a complete type.
+ if (const auto *Ty = dyn_cast<DICompositeType>(Scope))
+ DeferredCompleteTypes.push_back(Ty);
+
StringRef ScopeName = getPrettyScopeName(Scope);
if (!ScopeName.empty())
QualifiedNameComponents.push_back(ScopeName);
@@ -317,24 +331,18 @@ static const DISubprogram *getQualifiedNameComponents(
return ClosestSubprogram;
}
-static std::string getQualifiedName(ArrayRef<StringRef> QualifiedNameComponents,
+static std::string formatNestedName(ArrayRef<StringRef> QualifiedNameComponents,
StringRef TypeName) {
std::string FullyQualifiedName;
for (StringRef QualifiedNameComponent :
llvm::reverse(QualifiedNameComponents)) {
- FullyQualifiedName.append(QualifiedNameComponent);
+ FullyQualifiedName.append(std::string(QualifiedNameComponent));
FullyQualifiedName.append("::");
}
- FullyQualifiedName.append(TypeName);
+ FullyQualifiedName.append(std::string(TypeName));
return FullyQualifiedName;
}
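
Since the scope components are collected innermost-first while walking out of the scope chain, formatNestedName joins them in reverse before appending the type name. A small standalone sketch of that behaviour:

    #include <cassert>
    #include <string>
    #include <vector>

    static std::string formatNestedNameSketch(const std::vector<std::string> &Components,
                                              const std::string &TypeName) {
      std::string FullyQualifiedName;
      for (auto It = Components.rbegin(); It != Components.rend(); ++It)
        FullyQualifiedName += *It + "::";
      return FullyQualifiedName + TypeName;
    }

    int main() {
      // Collected while walking outward: Inner first, then Outer.
      assert(formatNestedNameSketch({"Inner", "Outer"}, "Widget") ==
             "Outer::Inner::Widget");
    }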
-static std::string getFullyQualifiedName(const DIScope *Scope, StringRef Name) {
- SmallVector<StringRef, 5> QualifiedNameComponents;
- getQualifiedNameComponents(Scope, QualifiedNameComponents);
- return getQualifiedName(QualifiedNameComponents, Name);
-}
-
struct CodeViewDebug::TypeLoweringScope {
TypeLoweringScope(CodeViewDebug &CVD) : CVD(CVD) { ++CVD.TypeEmissionLevel; }
~TypeLoweringScope() {
@@ -347,7 +355,18 @@ struct CodeViewDebug::TypeLoweringScope {
CodeViewDebug &CVD;
};
-static std::string getFullyQualifiedName(const DIScope *Ty) {
+std::string CodeViewDebug::getFullyQualifiedName(const DIScope *Scope,
+ StringRef Name) {
+ // Ensure types in the scope chain are emitted as soon as possible.
+ // This can create otherwise a situation where S_UDTs are emitted while
+ // looping in emitDebugInfoForUDTs.
+ TypeLoweringScope S(*this);
+ SmallVector<StringRef, 5> QualifiedNameComponents;
+ collectParentScopeNames(Scope, QualifiedNameComponents);
+ return formatNestedName(QualifiedNameComponents, Name);
+}
+
+std::string CodeViewDebug::getFullyQualifiedName(const DIScope *Ty) {
const DIScope *Scope = Ty->getScope();
return getFullyQualifiedName(Scope, getPrettyScopeName(Ty));
}
@@ -418,10 +437,11 @@ getFunctionOptions(const DISubroutineType *Ty,
ReturnTy = TypeArray[0];
}
- if (auto *ReturnDCTy = dyn_cast_or_null<DICompositeType>(ReturnTy)) {
- if (isNonTrivial(ReturnDCTy))
+ // Add CxxReturnUdt option to functions that return nontrivial record types
+ // or methods that return record types.
+ if (auto *ReturnDCTy = dyn_cast_or_null<DICompositeType>(ReturnTy))
+ if (isNonTrivial(ReturnDCTy) || ClassTy)
FO |= FunctionOptions::CxxReturnUdt;
- }
// DISubroutineType is unnamed. Use DISubprogram's i.e. SPName in comparison.
if (ClassTy && isNonTrivial(ClassTy) && SPName == ClassTy->getName()) {
@@ -543,15 +563,15 @@ void CodeViewDebug::maybeRecordLocation(const DebugLoc &DL,
addLocIfNotPresent(CurFn->ChildSites, Loc);
}
- OS.EmitCVLocDirective(FuncId, FileId, DL.getLine(), DL.getCol(),
+ OS.emitCVLocDirective(FuncId, FileId, DL.getLine(), DL.getCol(),
/*PrologueEnd=*/false, /*IsStmt=*/false,
DL->getFilename(), SMLoc());
}
void CodeViewDebug::emitCodeViewMagicVersion() {
- OS.EmitValueToAlignment(4);
+ OS.emitValueToAlignment(4);
OS.AddComment("Debug section magic");
- OS.EmitIntValue(COFF::DEBUG_SECTION_MAGIC, 4);
+ OS.emitInt32(COFF::DEBUG_SECTION_MAGIC);
}
void CodeViewDebug::endModule() {
@@ -600,11 +620,11 @@ void CodeViewDebug::endModule() {
// This subsection holds a file index to offset in string table table.
OS.AddComment("File index to string table offset subsection");
- OS.EmitCVFileChecksumsDirective();
+ OS.emitCVFileChecksumsDirective();
// This subsection holds the string table.
OS.AddComment("String table");
- OS.EmitCVStringTableDirective();
+ OS.emitCVStringTableDirective();
// Emit S_BUILDINFO, which points to LF_BUILDINFO. Put this in its own symbol
// subsection in the generic .debug$S section at the end. There is no
@@ -631,7 +651,7 @@ emitNullTerminatedSymbolName(MCStreamer &OS, StringRef S,
SmallString<32> NullTerminatedString(
S.take_front(MaxRecordLength - MaxFixedRecordLength - 1));
NullTerminatedString.push_back('\0');
- OS.EmitBytes(NullTerminatedString);
+ OS.emitBytes(NullTerminatedString);
}
void CodeViewDebug::emitTypeInformation() {
@@ -674,13 +694,13 @@ void CodeViewDebug::emitTypeGlobalHashes() {
// hardcoded to version 0, SHA1.
OS.SwitchSection(Asm->getObjFileLowering().getCOFFGlobalTypeHashesSection());
- OS.EmitValueToAlignment(4);
+ OS.emitValueToAlignment(4);
OS.AddComment("Magic");
- OS.EmitIntValue(COFF::DEBUG_HASHES_SECTION_MAGIC, 4);
+ OS.emitInt32(COFF::DEBUG_HASHES_SECTION_MAGIC);
OS.AddComment("Section Version");
- OS.EmitIntValue(0, 2);
+ OS.emitInt16(0);
OS.AddComment("Hash Algorithm");
- OS.EmitIntValue(uint16_t(GlobalTypeHashAlg::SHA1_8), 2);
+ OS.emitInt16(uint16_t(GlobalTypeHashAlg::SHA1_8));
TypeIndex TI(TypeIndex::FirstNonSimpleIndex);
for (const auto &GHR : TypeTable.hashes()) {
@@ -696,7 +716,7 @@ void CodeViewDebug::emitTypeGlobalHashes() {
assert(GHR.Hash.size() == 8);
StringRef S(reinterpret_cast<const char *>(GHR.Hash.data()),
GHR.Hash.size());
- OS.EmitBinaryData(S);
+ OS.emitBinaryData(S);
}
}
@@ -775,16 +795,16 @@ void CodeViewDebug::emitCompilerInformation() {
// TODO: Figure out which other flags need to be set.
OS.AddComment("Flags and language");
- OS.EmitIntValue(Flags, 4);
+ OS.emitInt32(Flags);
OS.AddComment("CPUType");
- OS.EmitIntValue(static_cast<uint64_t>(TheCPU), 2);
+ OS.emitInt16(static_cast<uint64_t>(TheCPU));
StringRef CompilerVersion = CU->getProducer();
Version FrontVer = parseVersion(CompilerVersion);
OS.AddComment("Frontend version");
for (int N = 0; N < 4; ++N)
- OS.EmitIntValue(FrontVer.Part[N], 2);
+ OS.emitInt16(FrontVer.Part[N]);
// Some Microsoft tools, like Binscope, expect a backend version number of at
// least 8.something, so we'll coerce the LLVM version into a form that
@@ -797,7 +817,7 @@ void CodeViewDebug::emitCompilerInformation() {
Version BackVer = {{ Major, 0, 0, 0 }};
OS.AddComment("Backend version");
for (int N = 0; N < 4; ++N)
- OS.EmitIntValue(BackVer.Part[N], 2);
+ OS.emitInt16(BackVer.Part[N]);
OS.AddComment("Null-terminated compiler version string");
emitNullTerminatedSymbolName(OS, CompilerVersion);
@@ -841,7 +861,7 @@ void CodeViewDebug::emitBuildInfo() {
MCSymbol *BISubsecEnd = beginCVSubsection(DebugSubsectionKind::Symbols);
MCSymbol *BIEnd = beginSymbolRecord(SymbolKind::S_BUILDINFO);
OS.AddComment("LF_BUILDINFO index");
- OS.EmitIntValue(BuildInfoIndex.getIndex(), 4);
+ OS.emitInt32(BuildInfoIndex.getIndex());
endSymbolRecord(BIEnd);
endCVSubsection(BISubsecEnd);
}
@@ -858,7 +878,7 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
// for instance, will display a warning that the breakpoints are not valid if
// the pdb does not match the source.
OS.AddComment("Inlinee lines signature");
- OS.EmitIntValue(unsigned(InlineeLinesSignature::Normal), 4);
+ OS.emitInt32(unsigned(InlineeLinesSignature::Normal));
for (const DISubprogram *SP : InlinedSubprograms) {
assert(TypeIndices.count({SP, nullptr}));
@@ -870,11 +890,11 @@ void CodeViewDebug::emitInlineeLinesSubsection() {
SP->getFilename() + Twine(':') + Twine(SP->getLine()));
OS.AddBlankLine();
OS.AddComment("Type index of inlined function");
- OS.EmitIntValue(InlineeIdx.getIndex(), 4);
+ OS.emitInt32(InlineeIdx.getIndex());
OS.AddComment("Offset into filechecksum table");
- OS.EmitCVFileChecksumOffsetDirective(FileId);
+ OS.emitCVFileChecksumOffsetDirective(FileId);
OS.AddComment("Starting line number");
- OS.EmitIntValue(SP->getLine(), 4);
+ OS.emitInt32(SP->getLine());
}
endCVSubsection(InlineEnd);
@@ -890,16 +910,16 @@ void CodeViewDebug::emitInlinedCallSite(const FunctionInfo &FI,
MCSymbol *InlineEnd = beginSymbolRecord(SymbolKind::S_INLINESITE);
OS.AddComment("PtrParent");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("PtrEnd");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Inlinee type index");
- OS.EmitIntValue(InlineeIdx.getIndex(), 4);
+ OS.emitInt32(InlineeIdx.getIndex());
unsigned FileId = maybeRecordFile(Site.Inlinee->getFile());
unsigned StartLineNum = Site.Inlinee->getLine();
- OS.EmitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum,
+ OS.emitCVInlineLinetableDirective(Site.SiteFuncId, FileId, StartLineNum,
FI.Begin, FI.End);
endSymbolRecord(InlineEnd);
@@ -943,7 +963,8 @@ void CodeViewDebug::switchToDebugSectionForSymbol(const MCSymbol *GVSym) {
void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
FunctionInfo &FI,
const MCSymbol *Fn) {
- std::string FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
+ std::string FuncName =
+ std::string(GlobalValue::dropLLVMManglingEscape(GV->getName()));
const ThunkOrdinal ordinal = ThunkOrdinal::Standard; // Only supported kind.
OS.AddComment("Symbol subsection for " + Twine(FuncName));
@@ -952,11 +973,11 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
// Emit S_THUNK32
MCSymbol *ThunkRecordEnd = beginSymbolRecord(SymbolKind::S_THUNK32);
OS.AddComment("PtrParent");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("PtrEnd");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("PtrNext");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Thunk section relative address");
OS.EmitCOFFSecRel32(Fn, /*Offset=*/0);
OS.AddComment("Thunk section index");
@@ -964,7 +985,7 @@ void CodeViewDebug::emitDebugInfoForThunk(const Function *GV,
OS.AddComment("Code size");
OS.emitAbsoluteSymbolDiff(FI.End, Fn, 2);
OS.AddComment("Ordinal");
- OS.EmitIntValue(unsigned(ordinal), 1);
+ OS.emitInt8(unsigned(ordinal));
OS.AddComment("Function name");
emitNullTerminatedSymbolName(OS, FuncName);
// Additional fields specific to the thunk ordinal would go here.
@@ -1006,7 +1027,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
// If our DISubprogram name is empty, use the mangled name.
if (FuncName.empty())
- FuncName = GlobalValue::dropLLVMManglingEscape(GV->getName());
+ FuncName = std::string(GlobalValue::dropLLVMManglingEscape(GV->getName()));
// Emit FPO data, but only on 32-bit x86. No other platforms use it.
if (Triple(MMI->getModule()->getTargetTriple()).getArch() == Triple::x86)
@@ -1022,27 +1043,27 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
// These fields are filled in by tools like CVPACK which run after the fact.
OS.AddComment("PtrParent");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("PtrEnd");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("PtrNext");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
// This is the important bit that tells the debugger where the function
// code is located and what's its size:
OS.AddComment("Code size");
OS.emitAbsoluteSymbolDiff(FI.End, Fn, 4);
OS.AddComment("Offset after prologue");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Offset before epilogue");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Function type index");
- OS.EmitIntValue(getFuncIdForSubprogram(GV->getSubprogram()).getIndex(), 4);
+ OS.emitInt32(getFuncIdForSubprogram(GV->getSubprogram()).getIndex());
OS.AddComment("Function section relative address");
OS.EmitCOFFSecRel32(Fn, /*Offset=*/0);
OS.AddComment("Function section index");
OS.EmitCOFFSectionIndex(Fn);
OS.AddComment("Flags");
- OS.EmitIntValue(0, 1);
+ OS.emitInt8(0);
// Emit the function display name as a null-terminated string.
OS.AddComment("Function name");
// Truncate the name so we won't overflow the record length field.
@@ -1052,19 +1073,19 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
MCSymbol *FrameProcEnd = beginSymbolRecord(SymbolKind::S_FRAMEPROC);
// Subtract out the CSR size since MSVC excludes that and we include it.
OS.AddComment("FrameSize");
- OS.EmitIntValue(FI.FrameSize - FI.CSRSize, 4);
+ OS.emitInt32(FI.FrameSize - FI.CSRSize);
OS.AddComment("Padding");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Offset of padding");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Bytes of callee saved registers");
- OS.EmitIntValue(FI.CSRSize, 4);
+ OS.emitInt32(FI.CSRSize);
OS.AddComment("Exception handler offset");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
OS.AddComment("Exception handler section");
- OS.EmitIntValue(0, 2);
+ OS.emitInt16(0);
OS.AddComment("Flags (defines frame register)");
- OS.EmitIntValue(uint32_t(FI.FrameProcOpts), 4);
+ OS.emitInt32(uint32_t(FI.FrameProcOpts));
endSymbolRecord(FrameProcEnd);
emitLocalVariableList(FI, FI.Locals);
@@ -1088,13 +1109,13 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
OS.EmitCOFFSecRel32(Label, /*Offset=*/0);
// FIXME: Make sure we don't overflow the max record size.
OS.EmitCOFFSectionIndex(Label);
- OS.EmitIntValue(Strs->getNumOperands(), 2);
+ OS.emitInt16(Strs->getNumOperands());
for (Metadata *MD : Strs->operands()) {
// MDStrings are null terminated, so we can do EmitBytes and get the
// nice .asciz directive.
StringRef Str = cast<MDString>(MD)->getString();
assert(Str.data()[Str.size()] == '\0' && "non-nullterminated MDString");
- OS.EmitBytes(StringRef(Str.data(), Str.size() + 1));
+ OS.emitBytes(StringRef(Str.data(), Str.size() + 1));
}
endSymbolRecord(AnnotEnd);
}
@@ -1111,7 +1132,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
OS.AddComment("Call instruction length");
OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2);
OS.AddComment("Type index");
- OS.EmitIntValue(getCompleteTypeIndex(DITy).getIndex(), 4);
+ OS.emitInt32(getCompleteTypeIndex(DITy).getIndex());
endSymbolRecord(HeapAllocEnd);
}
@@ -1124,7 +1145,7 @@ void CodeViewDebug::emitDebugInfoForFunction(const Function *GV,
endCVSubsection(SymbolsEnd);
// We have an assembler directive that takes care of the whole line table.
- OS.EmitCVLinetableDirective(FI.FuncId, Fn, FI.End);
+ OS.emitCVLinetableDirective(FI.FuncId, Fn, FI.End);
}
CodeViewDebug::LocalVarDefRange
@@ -1173,7 +1194,7 @@ void CodeViewDebug::collectVariableInfoFromMFTable(
}
// Get the frame register used and the offset.
- unsigned FrameReg = 0;
+ Register FrameReg;
int FrameOffset = TFI->getFrameIndexReference(*Asm->MF, VI.Slot, FrameReg);
uint16_t CVReg = TRI->getCodeViewRegNum(FrameReg);
@@ -1468,12 +1489,12 @@ void CodeViewDebug::addToUDTs(const DIType *Ty) {
if (!shouldEmitUdt(Ty))
return;
- SmallVector<StringRef, 5> QualifiedNameComponents;
+ SmallVector<StringRef, 5> ParentScopeNames;
const DISubprogram *ClosestSubprogram =
- getQualifiedNameComponents(Ty->getScope(), QualifiedNameComponents);
+ collectParentScopeNames(Ty->getScope(), ParentScopeNames);
std::string FullyQualifiedName =
- getQualifiedName(QualifiedNameComponents, getPrettyScopeName(Ty));
+ formatNestedName(ParentScopeNames, getPrettyScopeName(Ty));
if (ClosestSubprogram == nullptr) {
GlobalUDTs.emplace_back(std::move(FullyQualifiedName), Ty);
@@ -1571,7 +1592,7 @@ TypeIndex CodeViewDebug::lowerTypeArray(const DICompositeType *Ty) {
assert(Element->getTag() == dwarf::DW_TAG_subrange_type);
const DISubrange *Subrange = cast<DISubrange>(Element);
- assert(Subrange->getLowerBound() == 0 &&
+ assert(!Subrange->getRawLowerBound() &&
"codeview doesn't support subranges with lower bounds");
int64_t Count = -1;
if (auto *CI = Subrange->getCount().dyn_cast<ConstantInt*>())
@@ -1767,11 +1788,12 @@ translatePtrToMemberRep(unsigned SizeInBytes, bool IsPMF, unsigned Flags) {
TypeIndex CodeViewDebug::lowerTypeMemberPointer(const DIDerivedType *Ty,
PointerOptions PO) {
assert(Ty->getTag() == dwarf::DW_TAG_ptr_to_member_type);
+ bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
TypeIndex ClassTI = getTypeIndex(Ty->getClassType());
- TypeIndex PointeeTI = getTypeIndex(Ty->getBaseType(), Ty->getClassType());
+ TypeIndex PointeeTI =
+ getTypeIndex(Ty->getBaseType(), IsPMF ? Ty->getClassType() : nullptr);
PointerKind PK = getPointerSizeInBytes() == 8 ? PointerKind::Near64
: PointerKind::Near32;
- bool IsPMF = isa<DISubroutineType>(Ty->getBaseType());
PointerMode PM = IsPMF ? PointerMode::PointerToMemberFunction
: PointerMode::PointerToDataMember;
@@ -2063,7 +2085,7 @@ TypeIndex CodeViewDebug::lowerTypeEnum(const DICompositeType *Ty) {
// order, which is what MSVC does.
if (auto *Enumerator = dyn_cast_or_null<DIEnumerator>(Element)) {
EnumeratorRecord ER(MemberAccess::Public,
- APSInt::getUnsigned(Enumerator->getValue()),
+ APSInt(Enumerator->getValue(), true),
Enumerator->getName());
ContinuationBuilder.writeMemberType(ER);
EnumeratorCount++;
@@ -2248,7 +2270,7 @@ TypeIndex CodeViewDebug::lowerCompleteTypeClass(const DICompositeType *Ty) {
// MSVC appears to set this flag by searching any destructor or method with
// FunctionOptions::Constructor among the emitted members. Clang AST has all
- // the members, however special member functions are not yet emitted into
+ // the members, however special member functions are not yet emitted into
// debug information. For now checking a class's non-triviality seems enough.
// FIXME: not true for a nested unnamed struct.
if (isNonTrivial(Ty))
@@ -2625,9 +2647,9 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
TypeIndex TI = Var.UseReferenceType
? getTypeIndexForReferenceTo(Var.DIVar->getType())
: getCompleteTypeIndex(Var.DIVar->getType());
- OS.EmitIntValue(TI.getIndex(), 4);
+ OS.emitInt32(TI.getIndex());
OS.AddComment("Flags");
- OS.EmitIntValue(static_cast<uint16_t>(Flags), 2);
+ OS.emitInt16(static_cast<uint16_t>(Flags));
// Truncate the name so we won't overflow the record length field.
emitNullTerminatedSymbolName(OS, Var.DIVar->getName());
endSymbolRecord(LocalEnd);
@@ -2660,7 +2682,7 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
: (EncFP == FI.EncodedLocalFramePtrReg))) {
DefRangeFramePointerRelHeader DRHdr;
DRHdr.Offset = Offset;
- OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
+ OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr);
} else {
uint16_t RegRelFlags = 0;
if (DefRange.IsSubfield) {
@@ -2672,7 +2694,7 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
DRHdr.Register = Reg;
DRHdr.Flags = RegRelFlags;
DRHdr.BasePointerOffset = Offset;
- OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
+ OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr);
}
} else {
assert(DefRange.DataOffset == 0 && "unexpected offset into register");
@@ -2681,12 +2703,12 @@ void CodeViewDebug::emitLocalVariable(const FunctionInfo &FI,
DRHdr.Register = DefRange.CVRegister;
DRHdr.MayHaveNoName = 0;
DRHdr.OffsetInParent = DefRange.StructOffset;
- OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
+ OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr);
} else {
DefRangeRegisterHeader DRHdr;
DRHdr.Register = DefRange.CVRegister;
DRHdr.MayHaveNoName = 0;
- OS.EmitCVDefRangeDirective(DefRange.Ranges, DRHdr);
+ OS.emitCVDefRangeDirective(DefRange.Ranges, DRHdr);
}
}
}
@@ -2704,9 +2726,9 @@ void CodeViewDebug::emitLexicalBlock(const LexicalBlock &Block,
const FunctionInfo& FI) {
MCSymbol *RecordEnd = beginSymbolRecord(SymbolKind::S_BLOCK32);
OS.AddComment("PtrParent");
- OS.EmitIntValue(0, 4); // PtrParent
+ OS.emitInt32(0); // PtrParent
OS.AddComment("PtrEnd");
- OS.EmitIntValue(0, 4); // PtrEnd
+ OS.emitInt32(0); // PtrEnd
OS.AddComment("Code size");
OS.emitAbsoluteSymbolDiff(Block.End, Block.Begin, 4); // Code Size
OS.AddComment("Function section relative address");
@@ -2914,17 +2936,17 @@ void CodeViewDebug::beginInstruction(const MachineInstr *MI) {
MCSymbol *CodeViewDebug::beginCVSubsection(DebugSubsectionKind Kind) {
MCSymbol *BeginLabel = MMI->getContext().createTempSymbol(),
*EndLabel = MMI->getContext().createTempSymbol();
- OS.EmitIntValue(unsigned(Kind), 4);
+ OS.emitInt32(unsigned(Kind));
OS.AddComment("Subsection size");
OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 4);
- OS.EmitLabel(BeginLabel);
+ OS.emitLabel(BeginLabel);
return EndLabel;
}
void CodeViewDebug::endCVSubsection(MCSymbol *EndLabel) {
- OS.EmitLabel(EndLabel);
+ OS.emitLabel(EndLabel);
// Every subsection must be aligned to a 4-byte boundary.
- OS.EmitValueToAlignment(4);
+ OS.emitValueToAlignment(4);
}
static StringRef getSymbolName(SymbolKind SymKind) {
@@ -2939,10 +2961,10 @@ MCSymbol *CodeViewDebug::beginSymbolRecord(SymbolKind SymKind) {
*EndLabel = MMI->getContext().createTempSymbol();
OS.AddComment("Record length");
OS.emitAbsoluteSymbolDiff(EndLabel, BeginLabel, 2);
- OS.EmitLabel(BeginLabel);
+ OS.emitLabel(BeginLabel);
if (OS.isVerboseAsm())
OS.AddComment("Record kind: " + getSymbolName(SymKind));
- OS.EmitIntValue(unsigned(SymKind), 2);
+ OS.emitInt16(unsigned(SymKind));
return EndLabel;
}
@@ -2951,27 +2973,31 @@ void CodeViewDebug::endSymbolRecord(MCSymbol *SymEnd) {
// an extra copy of every symbol record in LLD. This increases object file
// size by less than 1% in the clang build, and is compatible with the Visual
// C++ linker.
- OS.EmitValueToAlignment(4);
- OS.EmitLabel(SymEnd);
+ OS.emitValueToAlignment(4);
+ OS.emitLabel(SymEnd);
}
void CodeViewDebug::emitEndSymbolRecord(SymbolKind EndKind) {
OS.AddComment("Record length");
- OS.EmitIntValue(2, 2);
+ OS.emitInt16(2);
if (OS.isVerboseAsm())
OS.AddComment("Record kind: " + getSymbolName(EndKind));
- OS.EmitIntValue(unsigned(EndKind), 2); // Record Kind
+ OS.emitInt16(uint16_t(EndKind)); // Record Kind
}
void CodeViewDebug::emitDebugInfoForUDTs(
- ArrayRef<std::pair<std::string, const DIType *>> UDTs) {
+ const std::vector<std::pair<std::string, const DIType *>> &UDTs) {
+#ifndef NDEBUG
+ size_t OriginalSize = UDTs.size();
+#endif
for (const auto &UDT : UDTs) {
const DIType *T = UDT.second;
assert(shouldEmitUdt(T));
-
MCSymbol *UDTRecordEnd = beginSymbolRecord(SymbolKind::S_UDT);
OS.AddComment("Type");
- OS.EmitIntValue(getCompleteTypeIndex(T).getIndex(), 4);
+ OS.emitInt32(getCompleteTypeIndex(T).getIndex());
+ assert(OriginalSize == UDTs.size() &&
+ "getCompleteTypeIndex found new UDTs!");
emitNullTerminatedSymbolName(OS, UDT.first);
endSymbolRecord(UDTRecordEnd);
}
@@ -3075,6 +3101,14 @@ void CodeViewDebug::emitGlobalVariableList(ArrayRef<CVGlobalVariable> Globals) {
void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
const DIGlobalVariable *DIGV = CVGV.DIGV;
+
+ const DIScope *Scope = DIGV->getScope();
+ // For static data members, get the scope from the declaration.
+ if (const auto *MemberDecl = dyn_cast_or_null<DIDerivedType>(
+ DIGV->getRawStaticDataMemberDeclaration()))
+ Scope = MemberDecl->getScope();
+ std::string QualifiedName = getFullyQualifiedName(Scope, DIGV->getName());
+
if (const GlobalVariable *GV =
CVGV.GVInfo.dyn_cast<const GlobalVariable *>()) {
// DataSym record, see SymbolRecord.h for more info. Thread local data
@@ -3087,18 +3121,16 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
: SymbolKind::S_GDATA32);
MCSymbol *DataEnd = beginSymbolRecord(DataSym);
OS.AddComment("Type");
- OS.EmitIntValue(getCompleteTypeIndex(DIGV->getType()).getIndex(), 4);
+ OS.emitInt32(getCompleteTypeIndex(DIGV->getType()).getIndex());
OS.AddComment("DataOffset");
OS.EmitCOFFSecRel32(GVSym, /*Offset=*/0);
OS.AddComment("Segment");
OS.EmitCOFFSectionIndex(GVSym);
OS.AddComment("Name");
const unsigned LengthOfDataRecord = 12;
- emitNullTerminatedSymbolName(OS, DIGV->getName(), LengthOfDataRecord);
+ emitNullTerminatedSymbolName(OS, QualifiedName, LengthOfDataRecord);
endSymbolRecord(DataEnd);
} else {
- // FIXME: Currently this only emits the global variables in the IR metadata.
- // This should also emit enums and static data members.
const DIExpression *DIE = CVGV.GVInfo.get<const DIExpression *>();
assert(DIE->isConstant() &&
"Global constant variables must contain a constant expression.");
@@ -3106,7 +3138,7 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
MCSymbol *SConstantEnd = beginSymbolRecord(SymbolKind::S_CONSTANT);
OS.AddComment("Type");
- OS.EmitIntValue(getTypeIndex(DIGV->getType()).getIndex(), 4);
+ OS.emitInt32(getTypeIndex(DIGV->getType()).getIndex());
OS.AddComment("Value");
// Encoded integers shouldn't need more than 10 bytes.
@@ -3115,16 +3147,10 @@ void CodeViewDebug::emitDebugInfoForGlobal(const CVGlobalVariable &CVGV) {
CodeViewRecordIO IO(Writer);
cantFail(IO.mapEncodedInteger(Val));
StringRef SRef((char *)data, Writer.getOffset());
- OS.EmitBinaryData(SRef);
+ OS.emitBinaryData(SRef);
OS.AddComment("Name");
- const DIScope *Scope = DIGV->getScope();
- // For static data members, get the scope from the declaration.
- if (const auto *MemberDecl = dyn_cast_or_null<DIDerivedType>(
- DIGV->getRawStaticDataMemberDeclaration()))
- Scope = MemberDecl->getScope();
- emitNullTerminatedSymbolName(OS,
- getFullyQualifiedName(Scope, DIGV->getName()));
+ emitNullTerminatedSymbolName(OS, QualifiedName);
endSymbolRecord(SConstantEnd);
}
}
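
Most of the CodeViewDebug.cpp hunks above follow one mechanical pattern: the generic EmitIntValue(Value, Size) streamer call becomes one of the sized, lowercase helpers (emitInt8/emitInt16/emitInt32), and the remaining Emit* directives move to their emit* spellings. The stand-alone C++ sketch below only illustrates why the sized wrappers read better at the call site; the Emitter type is a made-up stand-in for illustration, not LLVM's MCStreamer interface.

#include <cstdint>
#include <cstdio>

// Stand-in emitter: the sized helpers are thin wrappers over the generic
// size-plus-value call, so the byte width is visible in the call itself.
struct Emitter {
  void emitIntValue(uint64_t Value, unsigned Size) {
    std::printf("emit %u-byte value 0x%llx\n", Size, (unsigned long long)Value);
  }
  void emitInt8(uint64_t V) { emitIntValue(V, 1); }
  void emitInt16(uint64_t V) { emitIntValue(V, 2); }
  void emitInt32(uint64_t V) { emitIntValue(V, 4); }
};

int main() {
  Emitter OS;
  OS.emitIntValue(0x1234, 4); // old style: width passed as a trailing argument
  OS.emitInt32(0x1234);       // new style: width encoded in the helper name
  return 0;
}
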
diff --git a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
index b56b9047e1a9..82f0293874d0 100644
--- a/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/CodeViewDebug.h
@@ -310,8 +310,8 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
void emitDebugInfoForRetainedTypes();
- void
- emitDebugInfoForUDTs(ArrayRef<std::pair<std::string, const DIType *>> UDTs);
+ void emitDebugInfoForUDTs(
+ const std::vector<std::pair<std::string, const DIType *>> &UDTs);
void emitDebugInfoForGlobals();
void emitGlobalVariableList(ArrayRef<CVGlobalVariable> Globals);
@@ -443,6 +443,15 @@ class LLVM_LIBRARY_VISIBILITY CodeViewDebug : public DebugHandlerBase {
codeview::TypeIndex TI,
const DIType *ClassTy = nullptr);
+ /// Collect the names of parent scopes, innermost to outermost. Return the
+ /// innermost subprogram scope if present. Ensure that parent type scopes are
+ /// inserted into the type table.
+ const DISubprogram *
+ collectParentScopeNames(const DIScope *Scope,
+ SmallVectorImpl<StringRef> &ParentScopeNames);
+ std::string getFullyQualifiedName(const DIScope *Scope, StringRef Name);
+ std::string getFullyQualifiedName(const DIScope *Scope);
+
unsigned getPointerSizeInBytes();
protected:
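
The collectParentScopeNames/getFullyQualifiedName declarations above walk a type's enclosing scopes (innermost first) and join them into a nested name for the CodeView record. A minimal sketch of just the joining step is shown below, assuming the scope names are already collected as plain strings; the real code walks DIScope nodes and also registers parent type scopes in the type table.

#include <cstdio>
#include <string>
#include <vector>

// Join parent scope names (collected innermost first) with the leaf name,
// e.g. {"Inner", "Outer"} + "Ty" -> "Outer::Inner::Ty".
static std::string formatNestedName(const std::vector<std::string> &ParentScopeNames,
                                    const std::string &Name) {
  std::string Result;
  for (auto It = ParentScopeNames.rbegin(); It != ParentScopeNames.rend(); ++It)
    Result += *It + "::";
  return Result + Name;
}

int main() {
  std::printf("%s\n", formatNestedName({"Inner", "Outer"}, "Ty").c_str());
  return 0;
}
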
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
index 84b86a71fa5f..edf82fbed650 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIE.cpp
@@ -67,17 +67,17 @@ void DIEAbbrev::Profile(FoldingSetNodeID &ID) const {
///
void DIEAbbrev::Emit(const AsmPrinter *AP) const {
// Emit its Dwarf tag type.
- AP->EmitULEB128(Tag, dwarf::TagString(Tag).data());
+ AP->emitULEB128(Tag, dwarf::TagString(Tag).data());
// Emit whether it has children DIEs.
- AP->EmitULEB128((unsigned)Children, dwarf::ChildrenString(Children).data());
+ AP->emitULEB128((unsigned)Children, dwarf::ChildrenString(Children).data());
// For each attribute description.
for (unsigned i = 0, N = Data.size(); i < N; ++i) {
const DIEAbbrevData &AttrData = Data[i];
// Emit attribute type.
- AP->EmitULEB128(AttrData.getAttribute(),
+ AP->emitULEB128(AttrData.getAttribute(),
dwarf::AttributeString(AttrData.getAttribute()).data());
// Emit form type.
@@ -92,17 +92,17 @@ void DIEAbbrev::Emit(const AsmPrinter *AP) const {
llvm_unreachable("Invalid form for specified DWARF version");
}
#endif
- AP->EmitULEB128(AttrData.getForm(),
+ AP->emitULEB128(AttrData.getForm(),
dwarf::FormEncodingString(AttrData.getForm()).data());
// Emit value for DW_FORM_implicit_const.
if (AttrData.getForm() == dwarf::DW_FORM_implicit_const)
- AP->EmitSLEB128(AttrData.getValue());
+ AP->emitSLEB128(AttrData.getValue());
}
// Mark end of abbreviation.
- AP->EmitULEB128(0, "EOM(1)");
- AP->EmitULEB128(0, "EOM(2)");
+ AP->emitULEB128(0, "EOM(1)");
+ AP->emitULEB128(0, "EOM(2)");
}
LLVM_DUMP_METHOD
@@ -325,13 +325,13 @@ DIEUnit::DIEUnit(uint16_t V, uint8_t A, dwarf::Tag UnitTag)
"expected a unit TAG");
}
-void DIEValue::EmitValue(const AsmPrinter *AP) const {
+void DIEValue::emitValue(const AsmPrinter *AP) const {
switch (Ty) {
case isNone:
llvm_unreachable("Expected valid DIEValue");
#define HANDLE_DIEVALUE(T) \
case is##T: \
- getDIE##T().EmitValue(AP, Form); \
+ getDIE##T().emitValue(AP, Form); \
break;
#include "llvm/CodeGen/DIEValue.def"
}
@@ -374,7 +374,7 @@ LLVM_DUMP_METHOD void DIEValue::dump() const {
/// EmitValue - Emit integer of appropriate size.
///
-void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+void DIEInteger::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_implicit_const:
case dwarf::DW_FORM_flag_present:
@@ -409,7 +409,7 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
case dwarf::DW_FORM_strp_sup:
case dwarf::DW_FORM_addr:
case dwarf::DW_FORM_ref_addr:
- Asm->OutStreamer->EmitIntValue(Integer, SizeOf(Asm, Form));
+ Asm->OutStreamer->emitIntValue(Integer, SizeOf(Asm, Form));
return;
case dwarf::DW_FORM_GNU_str_index:
case dwarf::DW_FORM_GNU_addr_index:
@@ -418,10 +418,10 @@ void DIEInteger::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
case dwarf::DW_FORM_addrx:
case dwarf::DW_FORM_rnglistx:
case dwarf::DW_FORM_udata:
- Asm->EmitULEB128(Integer);
+ Asm->emitULEB128(Integer);
return;
case dwarf::DW_FORM_sdata:
- Asm->EmitSLEB128(Integer);
+ Asm->emitSLEB128(Integer);
return;
default: llvm_unreachable("DIE Value form not supported yet");
}
@@ -465,8 +465,8 @@ void DIEInteger::print(raw_ostream &O) const {
/// EmitValue - Emit expression value.
///
-void DIEExpr::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->EmitDebugValue(Expr, SizeOf(AP, Form));
+void DIEExpr::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->emitDebugValue(Expr, SizeOf(AP, Form));
}
/// SizeOf - Determine size of expression value in bytes.
@@ -487,12 +487,11 @@ void DIEExpr::print(raw_ostream &O) const { O << "Expr: " << *Expr; }
/// EmitValue - Emit label value.
///
-void DIELabel::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->EmitLabelReference(Label, SizeOf(AP, Form),
- Form == dwarf::DW_FORM_strp ||
- Form == dwarf::DW_FORM_sec_offset ||
- Form == dwarf::DW_FORM_ref_addr ||
- Form == dwarf::DW_FORM_data4);
+void DIELabel::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->emitLabelReference(
+ Label, SizeOf(AP, Form),
+ Form == dwarf::DW_FORM_strp || Form == dwarf::DW_FORM_sec_offset ||
+ Form == dwarf::DW_FORM_ref_addr || Form == dwarf::DW_FORM_data4);
}
/// SizeOf - Determine size of label value in bytes.
@@ -511,10 +510,10 @@ void DIELabel::print(raw_ostream &O) const { O << "Lbl: " << Label->getName(); }
// DIEBaseTypeRef Implementation
//===----------------------------------------------------------------------===//
-void DIEBaseTypeRef::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+void DIEBaseTypeRef::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
uint64_t Offset = CU->ExprRefedBaseTypes[Index].Die->getOffset();
assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit");
- AP->EmitULEB128(Offset, nullptr, ULEB128PadSize);
+ AP->emitULEB128(Offset, nullptr, ULEB128PadSize);
}
unsigned DIEBaseTypeRef::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
@@ -530,8 +529,8 @@ void DIEBaseTypeRef::print(raw_ostream &O) const { O << "BaseTypeRef: " << Index
/// EmitValue - Emit delta value.
///
-void DIEDelta::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
- AP->EmitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
+void DIEDelta::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+ AP->emitLabelDifference(LabelHi, LabelLo, SizeOf(AP, Form));
}
/// SizeOf - Determine size of delta value in bytes.
@@ -554,7 +553,7 @@ void DIEDelta::print(raw_ostream &O) const {
/// EmitValue - Emit string value.
///
-void DIEString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+void DIEString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
// Index of string in symbol table.
switch (Form) {
case dwarf::DW_FORM_GNU_str_index:
@@ -563,13 +562,13 @@ void DIEString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
case dwarf::DW_FORM_strx2:
case dwarf::DW_FORM_strx3:
case dwarf::DW_FORM_strx4:
- DIEInteger(S.getIndex()).EmitValue(AP, Form);
+ DIEInteger(S.getIndex()).emitValue(AP, Form);
return;
case dwarf::DW_FORM_strp:
if (AP->MAI->doesDwarfUseRelocationsAcrossSections())
- DIELabel(S.getSymbol()).EmitValue(AP, Form);
+ DIELabel(S.getSymbol()).emitValue(AP, Form);
else
- DIEInteger(S.getOffset()).EmitValue(AP, Form);
+ DIEInteger(S.getOffset()).emitValue(AP, Form);
return;
default:
llvm_unreachable("Expected valid string form");
@@ -605,9 +604,9 @@ void DIEString::print(raw_ostream &O) const {
//===----------------------------------------------------------------------===//
// DIEInlineString Implementation
//===----------------------------------------------------------------------===//
-void DIEInlineString::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+void DIEInlineString::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_string) {
- AP->OutStreamer->EmitBytes(S);
+ AP->OutStreamer->emitBytes(S);
AP->emitInt8(0);
return;
}
@@ -630,18 +629,18 @@ void DIEInlineString::print(raw_ostream &O) const {
/// EmitValue - Emit debug information entry offset.
///
-void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+void DIEEntry::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
switch (Form) {
case dwarf::DW_FORM_ref1:
case dwarf::DW_FORM_ref2:
case dwarf::DW_FORM_ref4:
case dwarf::DW_FORM_ref8:
- AP->OutStreamer->EmitIntValue(Entry->getOffset(), SizeOf(AP, Form));
+ AP->OutStreamer->emitIntValue(Entry->getOffset(), SizeOf(AP, Form));
return;
case dwarf::DW_FORM_ref_udata:
- AP->EmitULEB128(Entry->getOffset());
+ AP->emitULEB128(Entry->getOffset());
return;
case dwarf::DW_FORM_ref_addr: {
@@ -649,11 +648,11 @@ void DIEEntry::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
unsigned Addr = Entry->getDebugSectionOffset();
if (const MCSymbol *SectionSym =
Entry->getUnit()->getCrossSectionRelativeBaseAddress()) {
- AP->EmitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true);
+ AP->emitLabelPlusOffset(SectionSym, Addr, SizeOf(AP, Form), true);
return;
}
- AP->OutStreamer->EmitIntValue(Addr, SizeOf(AP, Form));
+ AP->OutStreamer->emitIntValue(Addr, SizeOf(AP, Form));
return;
}
default:
@@ -711,7 +710,7 @@ unsigned DIELoc::ComputeSize(const AsmPrinter *AP) const {
/// EmitValue - Emit location data.
///
-void DIELoc::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+void DIELoc::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
default: llvm_unreachable("Improper form for block");
case dwarf::DW_FORM_block1: Asm->emitInt8(Size); break;
@@ -719,11 +718,12 @@ void DIELoc::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
case dwarf::DW_FORM_block4: Asm->emitInt32(Size); break;
case dwarf::DW_FORM_block:
case dwarf::DW_FORM_exprloc:
- Asm->EmitULEB128(Size); break;
+ Asm->emitULEB128(Size);
+ break;
}
for (const auto &V : values())
- V.EmitValue(Asm);
+ V.emitValue(Asm);
}
/// SizeOf - Determine size of location data in bytes.
@@ -762,19 +762,21 @@ unsigned DIEBlock::ComputeSize(const AsmPrinter *AP) const {
/// EmitValue - Emit block data.
///
-void DIEBlock::EmitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
+void DIEBlock::emitValue(const AsmPrinter *Asm, dwarf::Form Form) const {
switch (Form) {
default: llvm_unreachable("Improper form for block");
case dwarf::DW_FORM_block1: Asm->emitInt8(Size); break;
case dwarf::DW_FORM_block2: Asm->emitInt16(Size); break;
case dwarf::DW_FORM_block4: Asm->emitInt32(Size); break;
- case dwarf::DW_FORM_block: Asm->EmitULEB128(Size); break;
+ case dwarf::DW_FORM_block:
+ Asm->emitULEB128(Size);
+ break;
case dwarf::DW_FORM_string: break;
case dwarf::DW_FORM_data16: break;
}
for (const auto &V : values())
- V.EmitValue(Asm);
+ V.emitValue(Asm);
}
/// SizeOf - Determine size of block data in bytes.
@@ -811,9 +813,9 @@ unsigned DIELocList::SizeOf(const AsmPrinter *AP, dwarf::Form Form) const {
/// EmitValue - Emit label value.
///
-void DIELocList::EmitValue(const AsmPrinter *AP, dwarf::Form Form) const {
+void DIELocList::emitValue(const AsmPrinter *AP, dwarf::Form Form) const {
if (Form == dwarf::DW_FORM_loclistx) {
- AP->EmitULEB128(Index);
+ AP->emitULEB128(Index);
return;
}
DwarfDebug *DD = AP->getDwarfDebug();
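
Several of the DIE forms handled above (DW_FORM_udata, DW_FORM_sdata, DW_FORM_ref_udata, and the DW_FORM_block/DW_FORM_exprloc sizes) go through emitULEB128/emitSLEB128. The self-contained sketch below is a reminder of what those helpers encode; it is illustrative only and is not the AsmPrinter implementation.

#include <cstdint>
#include <cstdio>
#include <vector>

// ULEB128: 7 value bits per byte, high bit set on every byte except the last.
static std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80; // more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
  return Out;
}

// SLEB128: same framing, but the sign is carried by bit 6 of the last byte.
static std::vector<uint8_t> encodeSLEB128(int64_t Value) {
  std::vector<uint8_t> Out;
  bool More = true;
  while (More) {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7; // arithmetic shift preserves the sign
    if ((Value == 0 && !(Byte & 0x40)) || (Value == -1 && (Byte & 0x40)))
      More = false;
    else
      Byte |= 0x80;
    Out.push_back(Byte);
  }
  return Out;
}

int main() {
  for (uint8_t B : encodeULEB128(624485)) // DWARF spec example: e5 8e 26
    std::printf("%02x ", B);
  std::printf("\n");
  for (uint8_t B : encodeSLEB128(-123456)) // c0 bb 78
    std::printf("%02x ", B);
  std::printf("\n");
  return 0;
}
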
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
index bfac8850a2a6..f26ef63eedec 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.cpp
@@ -17,10 +17,8 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/AsmPrinter.h"
-#include "llvm/CodeGen/DIE.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
-#include "llvm/Support/MD5.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -224,8 +222,9 @@ void DIEHash::hashLocList(const DIELocList &LocList) {
HashingByteStreamer Streamer(*this);
DwarfDebug &DD = *AP->getDwarfDebug();
const DebugLocStream &Locs = DD.getDebugLocs();
- for (const auto &Entry : Locs.getEntries(Locs.getList(LocList.getValue())))
- DD.emitDebugLocEntry(Streamer, Entry, nullptr);
+ const DebugLocStream::List &List = Locs.getList(LocList.getValue());
+ for (const DebugLocStream::Entry &Entry : Locs.getEntries(List))
+ DD.emitDebugLocEntry(Streamer, Entry, List.CU);
}
// Hash an individual attribute \param Attr based on the type of attribute and
@@ -361,7 +360,7 @@ void DIEHash::computeHash(const DIE &Die) {
for (auto &C : Die.children()) {
// 7.27 Step 7
// If C is a nested type entry or a member function entry, ...
- if (isType(C.getTag()) || C.getTag() == dwarf::DW_TAG_subprogram) {
+ if (isType(C.getTag()) || (C.getTag() == dwarf::DW_TAG_subprogram && isType(C.getParent()->getTag()))) {
StringRef Name = getDIEStringAttr(C, dwarf::DW_AT_name);
// ... and has a DW_AT_name attribute
if (!Name.empty()) {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DIEHash.h b/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
index 2e49514c98be..1a69f6772873 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DIEHash.h
@@ -20,7 +20,6 @@
namespace llvm {
class AsmPrinter;
-class CompileUnit;
/// An object containing the capability of hashing and adding hash
/// attributes onto a DIE.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
index 170fc8b6d49f..584b7614915d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DbgEntityHistoryCalculator.cpp
@@ -47,7 +47,8 @@ static Register isDescribedByReg(const MachineInstr &MI) {
return 0;
// If location of variable is described using a register (directly or
// indirectly), this register is always a first operand.
- return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : Register();
+ return MI.getDebugOperand(0).isReg() ? MI.getDebugOperand(0).getReg()
+ : Register();
}
bool DbgValueHistoryMap::startDbgValue(InlinedEntity Var,
diff --git a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
index 22f458e4b03e..880791a06d93 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DebugHandlerBase.cpp
@@ -32,9 +32,9 @@ DbgVariableLocation::extractFromMachineInstruction(
DbgVariableLocation Location;
if (!Instruction.isDebugValue())
return None;
- if (!Instruction.getOperand(0).isReg())
+ if (!Instruction.getDebugOperand(0).isReg())
return None;
- Location.Register = Instruction.getOperand(0).getReg();
+ Location.Register = Instruction.getDebugOperand(0).getReg();
Location.FragmentInfo.reset();
// We only handle expressions generated by DIExpression::appendOffset,
// which doesn't require a full stack machine.
@@ -124,21 +124,6 @@ MCSymbol *DebugHandlerBase::getLabelAfterInsn(const MachineInstr *MI) {
return LabelsAfterInsn.lookup(MI);
}
-// Return the function-local offset of an instruction.
-const MCExpr *
-DebugHandlerBase::getFunctionLocalOffsetAfterInsn(const MachineInstr *MI) {
- MCContext &MC = Asm->OutContext;
-
- MCSymbol *Start = Asm->getFunctionBegin();
- const auto *StartRef = MCSymbolRefExpr::create(Start, MC);
-
- MCSymbol *AfterInsn = getLabelAfterInsn(MI);
- assert(AfterInsn && "Expected label after instruction");
- const auto *AfterRef = MCSymbolRefExpr::create(AfterInsn, MC);
-
- return MCBinaryExpr::createSub(AfterRef, StartRef, MC);
-}
-
/// If this type is derived from a base type then return base type size.
uint64_t DebugHandlerBase::getBaseTypeSize(const DIType *Ty) {
assert(Ty);
@@ -215,7 +200,7 @@ void DebugHandlerBase::beginFunction(const MachineFunction *MF) {
continue;
auto IsDescribedByReg = [](const MachineInstr *MI) {
- return MI->getOperand(0).isReg() && MI->getOperand(0).getReg();
+ return MI->getDebugOperand(0).isReg() && MI->getDebugOperand(0).getReg();
};
// The first mention of a function argument gets the CurrentFnBegin label,
@@ -297,7 +282,7 @@ void DebugHandlerBase::beginInstruction(const MachineInstr *MI) {
if (!PrevLabel) {
PrevLabel = MMI->getContext().createTempSymbol();
- Asm->OutStreamer->EmitLabel(PrevLabel);
+ Asm->OutStreamer->emitLabel(PrevLabel);
}
I->second = PrevLabel;
}
@@ -329,7 +314,7 @@ void DebugHandlerBase::endInstruction() {
// We need a label after this instruction.
if (!PrevLabel) {
PrevLabel = MMI->getContext().createTempSymbol();
- Asm->OutStreamer->EmitLabel(PrevLabel);
+ Asm->OutStreamer->emitLabel(PrevLabel);
}
I->second = PrevLabel;
}
@@ -342,3 +327,17 @@ void DebugHandlerBase::endFunction(const MachineFunction *MF) {
LabelsBeforeInsn.clear();
LabelsAfterInsn.clear();
}
+
+void DebugHandlerBase::beginBasicBlock(const MachineBasicBlock &MBB) {
+ if (!MBB.isBeginSection())
+ return;
+
+ PrevLabel = MBB.getSymbol();
+}
+
+void DebugHandlerBase::endBasicBlock(const MachineBasicBlock &MBB) {
+ if (!MBB.isEndSection())
+ return;
+
+ PrevLabel = nullptr;
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
index facbf22946e4..11ed1062f77e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCFIException.cpp
@@ -47,8 +47,8 @@ void DwarfCFIExceptionBase::markFunctionEnd() {
}
void DwarfCFIExceptionBase::endFragment() {
- if (shouldEmitCFI)
- Asm->OutStreamer->EmitCFIEndProc();
+ if (shouldEmitCFI && !Asm->MF->hasBBSections())
+ Asm->OutStreamer->emitCFIEndProc();
}
DwarfCFIException::DwarfCFIException(AsmPrinter *A)
@@ -133,13 +133,13 @@ void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB,
if (!hasEmittedCFISections) {
if (Asm->needsOnlyDebugCFIMoves())
- Asm->OutStreamer->EmitCFISections(false, true);
+ Asm->OutStreamer->emitCFISections(false, true);
else if (Asm->TM.Options.ForceDwarfFrameSection)
- Asm->OutStreamer->EmitCFISections(true, true);
+ Asm->OutStreamer->emitCFISections(true, true);
hasEmittedCFISections = true;
}
- Asm->OutStreamer->EmitCFIStartProc(/*IsSimple=*/false);
+ Asm->OutStreamer->emitCFIStartProc(/*IsSimple=*/false);
// Indicate personality routine, if any.
if (!shouldEmitPersonality)
@@ -157,11 +157,11 @@ void DwarfCFIException::beginFragment(const MachineBasicBlock *MBB,
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
unsigned PerEncoding = TLOF.getPersonalityEncoding();
const MCSymbol *Sym = TLOF.getCFIPersonalitySymbol(P, Asm->TM, MMI);
- Asm->OutStreamer->EmitCFIPersonality(Sym, PerEncoding);
+ Asm->OutStreamer->emitCFIPersonality(Sym, PerEncoding);
// Provide LSDA information.
if (shouldEmitLSDA)
- Asm->OutStreamer->EmitCFILsda(ESP(Asm), TLOF.getLSDAEncoding());
+ Asm->OutStreamer->emitCFILsda(ESP(Asm), TLOF.getLSDAEncoding());
}
/// endFunction - Gather and emit post-function exception information.
@@ -172,3 +172,12 @@ void DwarfCFIException::endFunction(const MachineFunction *MF) {
emitExceptionTable();
}
+
+void DwarfCFIException::beginBasicBlock(const MachineBasicBlock &MBB) {
+ beginFragment(&MBB, getExceptionSym);
+}
+
+void DwarfCFIException::endBasicBlock(const MachineBasicBlock &MBB) {
+ if (shouldEmitCFI)
+ Asm->OutStreamer->emitCFIEndProc();
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
index 38011102c7b3..296c380ae550 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp
@@ -37,6 +37,7 @@
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/MC/MachineLocation.h"
#include "llvm/Support/Casting.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
@@ -113,8 +114,9 @@ unsigned DwarfCompileUnit::getOrCreateSourceID(const DIFile *File) {
// extend .file to support this.
unsigned CUID = Asm->OutStreamer->hasRawTextSupport() ? 0 : getUniqueID();
if (!File)
- return Asm->OutStreamer->EmitDwarfFileDirective(0, "", "", None, None, CUID);
- return Asm->OutStreamer->EmitDwarfFileDirective(
+ return Asm->OutStreamer->emitDwarfFileDirective(0, "", "", None, None,
+ CUID);
+ return Asm->OutStreamer->emitDwarfFileDirective(
0, File->getDirectory(), File->getFilename(), getMD5AsBytes(File),
File->getSource(), CUID);
}
@@ -154,7 +156,8 @@ DIE *DwarfCompileUnit::getOrCreateGlobalVariableDIE(
DeclContext = GV->getScope();
// Add name and type.
addString(*VariableDIE, dwarf::DW_AT_name, GV->getDisplayName());
- addType(*VariableDIE, GTy);
+ if (GTy)
+ addType(*VariableDIE, GTy);
// Add scoping info.
if (!GV->isLocalToUnit())
@@ -328,6 +331,8 @@ DIE *DwarfCompileUnit::getOrCreateCommonBlock(
}
void DwarfCompileUnit::addRange(RangeSpan Range) {
+ DD->insertSectionLabel(Range.Begin);
+
bool SameAsPrevCU = this == DD->getPrevCU();
DD->setPrevCU(this);
// If we have no current ranges just add the range and return, otherwise,
@@ -348,8 +353,6 @@ void DwarfCompileUnit::initStmtList() {
if (CUNode->isDebugDirectivesOnly())
return;
- // Define start line table label for each Compile Unit.
- MCSymbol *LineTableStartSym;
const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
if (DD->useSectionsAsReferences()) {
LineTableStartSym = TLOF.getDwarfLineSection()->getBeginSymbol();
@@ -363,13 +366,14 @@ void DwarfCompileUnit::initStmtList() {
// left in the skeleton CU and so not included.
// The line table entries are not always emitted in assembly, so it
// is not okay to use line_table_start here.
- StmtListValue =
addSectionLabel(getUnitDie(), dwarf::DW_AT_stmt_list, LineTableStartSym,
TLOF.getDwarfLineSection()->getBeginSymbol());
}
void DwarfCompileUnit::applyStmtList(DIE &D) {
- D.addValue(DIEValueAllocator, *StmtListValue);
+ const TargetLoweringObjectFile &TLOF = Asm->getObjFileLowering();
+ addSectionLabel(D, dwarf::DW_AT_stmt_list, LineTableStartSym,
+ TLOF.getDwarfLineSection()->getBeginSymbol());
}
void DwarfCompileUnit::attachLowHighPC(DIE &D, const MCSymbol *Begin,
@@ -392,7 +396,14 @@ void DwarfCompileUnit::attachLowHighPC(DIE &D, const MCSymbol *Begin,
DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
DIE *SPDie = getOrCreateSubprogramDIE(SP, includeMinimalInlineScopes());
- attachLowHighPC(*SPDie, Asm->getFunctionBegin(), Asm->getFunctionEnd());
+ SmallVector<RangeSpan, 2> BB_List;
+ // If basic block sections are on, ranges for each basic block section have
+ // to be emitted separately.
+ for (const auto &R : Asm->MBBSectionRanges)
+ BB_List.push_back({R.second.BeginLabel, R.second.EndLabel});
+
+ attachRangesOrLowHighPC(*SPDie, BB_List);
+
if (DD->useAppleExtensionAttributes() &&
!DD->getCurrentFunction()->getTarget().Options.DisableFramePointerElim(
*DD->getCurrentFunction()))
@@ -400,15 +411,60 @@ DIE &DwarfCompileUnit::updateSubprogramScopeDIE(const DISubprogram *SP) {
// Only include DW_AT_frame_base in full debug info
if (!includeMinimalInlineScopes()) {
- if (Asm->MF->getTarget().getTargetTriple().isNVPTX()) {
+ const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
+ TargetFrameLowering::DwarfFrameBase FrameBase =
+ TFI->getDwarfFrameBase(*Asm->MF);
+ switch (FrameBase.Kind) {
+ case TargetFrameLowering::DwarfFrameBase::Register: {
+ if (Register::isPhysicalRegister(FrameBase.Location.Reg)) {
+ MachineLocation Location(FrameBase.Location.Reg);
+ addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+ }
+ break;
+ }
+ case TargetFrameLowering::DwarfFrameBase::CFA: {
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_call_frame_cfa);
addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
- } else {
- const TargetRegisterInfo *RI = Asm->MF->getSubtarget().getRegisterInfo();
- MachineLocation Location(RI->getFrameRegister(*Asm->MF));
- if (Register::isPhysicalRegister(Location.getReg()))
- addAddress(*SPDie, dwarf::DW_AT_frame_base, Location);
+ break;
+ }
+ case TargetFrameLowering::DwarfFrameBase::WasmFrameBase: {
+ // FIXME: duplicated from Target/WebAssembly/WebAssembly.h, since we don't
+ // want to depend on target-specific headers in this code.
+ const unsigned TI_GLOBAL_RELOC = 3;
+ if (FrameBase.Location.WasmLoc.Kind == TI_GLOBAL_RELOC) {
+ // These need to be relocatable.
+ assert(FrameBase.Location.WasmLoc.Index == 0); // Only SP so far.
+ auto SPSym = cast<MCSymbolWasm>(
+ Asm->GetExternalSymbolSymbol("__stack_pointer"));
+ // FIXME: this repeats what WebAssemblyMCInstLower::
+ // GetExternalSymbolSymbol does, since if there's no code that
+ // refers to this symbol, we have to set it here.
+ SPSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
+ SPSym->setGlobalType(wasm::WasmGlobalType{
+ uint8_t(Asm->getSubtargetInfo().getTargetTriple().getArch() ==
+ Triple::wasm64
+ ? wasm::WASM_TYPE_I64
+ : wasm::WASM_TYPE_I32),
+ true});
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_WASM_location);
+ addSInt(*Loc, dwarf::DW_FORM_sdata, FrameBase.Location.WasmLoc.Kind);
+ addLabel(*Loc, dwarf::DW_FORM_udata, SPSym);
+ DD->addArangeLabel(SymbolCU(this, SPSym));
+ addUInt(*Loc, dwarf::DW_FORM_data1, dwarf::DW_OP_stack_value);
+ addBlock(*SPDie, dwarf::DW_AT_frame_base, Loc);
+ } else {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
+ DIExpressionCursor Cursor({});
+ DwarfExpr.addWasmLocation(FrameBase.Location.WasmLoc.Kind,
+ FrameBase.Location.WasmLoc.Index);
+ DwarfExpr.addExpression(std::move(Cursor));
+ addBlock(*SPDie, dwarf::DW_AT_frame_base, DwarfExpr.finalize());
+ }
+ break;
+ }
}
}
@@ -521,9 +577,33 @@ void DwarfCompileUnit::attachRangesOrLowHighPC(
DIE &Die, const SmallVectorImpl<InsnRange> &Ranges) {
SmallVector<RangeSpan, 2> List;
List.reserve(Ranges.size());
- for (const InsnRange &R : Ranges)
- List.push_back(
- {DD->getLabelBeforeInsn(R.first), DD->getLabelAfterInsn(R.second)});
+ for (const InsnRange &R : Ranges) {
+ auto *BeginLabel = DD->getLabelBeforeInsn(R.first);
+ auto *EndLabel = DD->getLabelAfterInsn(R.second);
+
+ const auto *BeginMBB = R.first->getParent();
+ const auto *EndMBB = R.second->getParent();
+
+ const auto *MBB = BeginMBB;
+ // Basic block sections allow basic block subsets to be placed in unique
+ // sections. For each section, the begin and end label must be added to the
+ // list. If there is more than one range, debug ranges must be used.
+ // Otherwise, low/high PC can be used.
+ // FIXME: Debug Info Emission depends on block order and this assumes that
+ // the order of blocks will be frozen beyond this point.
+ do {
+ if (MBB->sameSection(EndMBB) || MBB->isEndSection()) {
+ auto MBBSectionRange = Asm->MBBSectionRanges[MBB->getSectionIDNum()];
+ List.push_back(
+ {MBB->sameSection(BeginMBB) ? BeginLabel
+ : MBBSectionRange.BeginLabel,
+ MBB->sameSection(EndMBB) ? EndLabel : MBBSectionRange.EndLabel});
+ }
+ if (MBB->sameSection(EndMBB))
+ break;
+ MBB = MBB->getNextNode();
+ } while (true);
+ }
attachRangesOrLowHighPC(Die, std::move(List));
}
@@ -654,7 +734,7 @@ DIE *DwarfCompileUnit::constructVariableDIEImpl(const DbgVariable &DV,
DIELoc *Loc = new (DIEValueAllocator) DIELoc;
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
for (auto &Fragment : DV.getFrameIndexExprs()) {
- unsigned FrameReg = 0;
+ Register FrameReg;
const DIExpression *Expr = Fragment.Expr;
const TargetFrameLowering *TFI = Asm->MF->getSubtarget().getFrameLowering();
int Offset = TFI->getFrameIndexReference(*Asm->MF, Fragment.FI, FrameReg);
@@ -719,11 +799,22 @@ static SmallVector<const DIVariable *, 2> dependencies(DbgVariable *Var) {
auto *Array = dyn_cast<DICompositeType>(Var->getType());
if (!Array || Array->getTag() != dwarf::DW_TAG_array_type)
return Result;
+ if (auto *DLVar = Array->getDataLocation())
+ Result.push_back(DLVar);
for (auto *El : Array->getElements()) {
if (auto *Subrange = dyn_cast<DISubrange>(El)) {
- auto Count = Subrange->getCount();
- if (auto *Dependency = Count.dyn_cast<DIVariable *>())
- Result.push_back(Dependency);
+ if (auto Count = Subrange->getCount())
+ if (auto *Dependency = Count.dyn_cast<DIVariable *>())
+ Result.push_back(Dependency);
+ if (auto LB = Subrange->getLowerBound())
+ if (auto *Dependency = LB.dyn_cast<DIVariable *>())
+ Result.push_back(Dependency);
+ if (auto UB = Subrange->getUpperBound())
+ if (auto *Dependency = UB.dyn_cast<DIVariable *>())
+ Result.push_back(Dependency);
+ if (auto ST = Subrange->getStride())
+ if (auto *Dependency = ST.dyn_cast<DIVariable *>())
+ Result.push_back(Dependency);
}
}
return Result;
@@ -904,13 +995,12 @@ void DwarfCompileUnit::constructAbstractSubprogramScopeDIE(
ContextCU->addDIEEntry(*AbsDef, dwarf::DW_AT_object_pointer, *ObjectPointer);
}
-/// Whether to use the GNU analog for a DWARF5 tag, attribute, or location atom.
-static bool useGNUAnalogForDwarf5Feature(DwarfDebug *DD) {
+bool DwarfCompileUnit::useGNUAnalogForDwarf5Feature() const {
return DD->getDwarfVersion() == 4 && DD->tuneForGDB();
}
dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const {
- if (!useGNUAnalogForDwarf5Feature(DD))
+ if (!useGNUAnalogForDwarf5Feature())
return Tag;
switch (Tag) {
case dwarf::DW_TAG_call_site:
@@ -924,7 +1014,7 @@ dwarf::Tag DwarfCompileUnit::getDwarf5OrGNUTag(dwarf::Tag Tag) const {
dwarf::Attribute
DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const {
- if (!useGNUAnalogForDwarf5Feature(DD))
+ if (!useGNUAnalogForDwarf5Feature())
return Attr;
switch (Attr) {
case dwarf::DW_AT_call_all_calls:
@@ -933,7 +1023,7 @@ DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const {
return dwarf::DW_AT_GNU_call_site_target;
case dwarf::DW_AT_call_origin:
return dwarf::DW_AT_abstract_origin;
- case dwarf::DW_AT_call_pc:
+ case dwarf::DW_AT_call_return_pc:
return dwarf::DW_AT_low_pc;
case dwarf::DW_AT_call_value:
return dwarf::DW_AT_GNU_call_site_value;
@@ -946,7 +1036,7 @@ DwarfCompileUnit::getDwarf5OrGNUAttr(dwarf::Attribute Attr) const {
dwarf::LocationAtom
DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
- if (!useGNUAnalogForDwarf5Feature(DD))
+ if (!useGNUAnalogForDwarf5Feature())
return Loc;
switch (Loc) {
case dwarf::DW_OP_entry_value:
@@ -956,9 +1046,12 @@ DwarfCompileUnit::getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const {
}
}
-DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
- DIE &ScopeDIE, const DISubprogram *CalleeSP, bool IsTail,
- const MCSymbol *PCAddr, const MCExpr *PCOffset, unsigned CallReg) {
+DIE &DwarfCompileUnit::constructCallSiteEntryDIE(DIE &ScopeDIE,
+ DIE *CalleeDIE,
+ bool IsTail,
+ const MCSymbol *PCAddr,
+ const MCSymbol *CallAddr,
+ unsigned CallReg) {
// Insert a call site entry DIE within ScopeDIE.
DIE &CallSiteDIE = createAndAddDIE(getDwarf5OrGNUTag(dwarf::DW_TAG_call_site),
ScopeDIE, nullptr);
@@ -968,24 +1061,41 @@ DIE &DwarfCompileUnit::constructCallSiteEntryDIE(
addAddress(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_target),
MachineLocation(CallReg));
} else {
- DIE *CalleeDIE = getDIE(CalleeSP);
- assert(CalleeDIE && "Could not find DIE for call site entry origin");
+ assert(CalleeDIE && "No DIE for call site entry origin");
addDIEEntry(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_origin),
*CalleeDIE);
}
- if (IsTail)
+ if (IsTail) {
// Attach DW_AT_call_tail_call to tail calls for standards compliance.
addFlag(CallSiteDIE, getDwarf5OrGNUAttr(dwarf::DW_AT_call_tail_call));
+ // Attach the address of the branch instruction to allow the debugger to
+ // show where the tail call occurred. This attribute has no GNU analog.
+ //
+ // GDB works backwards from non-standard usage of DW_AT_low_pc (in DWARF4
+ // mode -- equivalently, in DWARF5 mode, DW_AT_call_return_pc) at tail-call
+ // site entries to figure out the PC of tail-calling branch instructions.
+ // This means it doesn't need the compiler to emit DW_AT_call_pc, so we
+ // don't emit it here.
+ //
+ // There's no need to tie non-GDB debuggers to this non-standardness, as it
+ // adds unnecessary complexity to the debugger. For non-GDB debuggers, emit
+ // the standard DW_AT_call_pc info.
+ if (!useGNUAnalogForDwarf5Feature())
+ addLabelAddress(CallSiteDIE, dwarf::DW_AT_call_pc, CallAddr);
+ }
+
// Attach the return PC to allow the debugger to disambiguate call paths
// from one function to another.
- if (DD->getDwarfVersion() == 4 && DD->tuneForGDB()) {
- assert(PCAddr && "Missing PC information for a call");
- addLabelAddress(CallSiteDIE, dwarf::DW_AT_low_pc, PCAddr);
- } else if (!IsTail || DD->tuneForGDB()) {
- assert(PCOffset && "Missing return PC information for a call");
- addAddressExpr(CallSiteDIE, dwarf::DW_AT_call_return_pc, PCOffset);
+ //
+ // The return PC is only really needed when the call /isn't/ a tail call, but
+ // GDB expects it in DWARF4 mode, even for tail calls (see the comment above
+ // the DW_AT_call_pc emission logic for an explanation).
+ if (!IsTail || useGNUAnalogForDwarf5Feature()) {
+ assert(PCAddr && "Missing return PC information for a call");
+ addLabelAddress(CallSiteDIE,
+ getDwarf5OrGNUAttr(dwarf::DW_AT_call_return_pc), PCAddr);
}
return CallSiteDIE;
@@ -1108,7 +1218,7 @@ void DwarfCompileUnit::emitHeader(bool UseOffsets) {
// Don't bother labeling the .dwo unit, as its offset isn't used.
if (!Skeleton && !DD->useSectionsAsReferences()) {
LabelBegin = Asm->createTempSymbol("cu_begin");
- Asm->OutStreamer->EmitLabel(LabelBegin);
+ Asm->OutStreamer->emitLabel(LabelBegin);
}
dwarf::UnitType UT = Skeleton ? dwarf::DW_UT_split_compile
@@ -1219,15 +1329,12 @@ void DwarfCompileUnit::addComplexAddress(const DbgVariable &DV, DIE &Die,
DIEDwarfExpression DwarfExpr(*Asm, *this, *Loc);
const DIExpression *DIExpr = DV.getSingleExpression();
DwarfExpr.addFragmentOffset(DIExpr);
- if (Location.isIndirect())
- DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.setLocation(Location, DIExpr);
DIExpressionCursor Cursor(DIExpr);
- if (DIExpr->isEntryValue()) {
- DwarfExpr.setEntryValueFlag();
+ if (DIExpr->isEntryValue())
DwarfExpr.beginEntryValueExpression(Cursor);
- }
const TargetRegisterInfo &TRI = *Asm->MF->getSubtarget().getRegisterInfo();
if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
@@ -1285,12 +1392,6 @@ void DwarfCompileUnit::addExpr(DIELoc &Die, dwarf::Form Form,
Die.addValue(DIEValueAllocator, (dwarf::Attribute)0, Form, DIEExpr(Expr));
}
-void DwarfCompileUnit::addAddressExpr(DIE &Die, dwarf::Attribute Attribute,
- const MCExpr *Expr) {
- Die.addValue(DIEValueAllocator, Attribute, dwarf::DW_FORM_addr,
- DIEExpr(Expr));
-}
-
void DwarfCompileUnit::applySubprogramAttributesToDefinition(
const DISubprogram *SP, DIE &SPDie) {
auto *SPDecl = SP->getDeclaration();
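
The attachRangesOrLowHighPC change in this file splits each scope's instruction range at basic block section boundaries, substituting the section's begin/end labels whenever a range crosses into another section. The toy sketch below models that splitting with plain structs instead of MachineBasicBlock and MBBSectionRanges, so the block and section names are invented for illustration.

#include <cstdio>
#include <string>
#include <vector>

// Toy stand-ins for machine basic blocks laid out across sections.
struct Block { std::string Label; int Section; };

// Split the block range [Begin, End] into one label pair per section, the way
// a contiguous scope range must be split when its blocks land in different
// basic block sections.
static std::vector<std::pair<std::string, std::string>>
splitBySection(const std::vector<Block> &Blocks, size_t Begin, size_t End) {
  std::vector<std::pair<std::string, std::string>> Ranges;
  size_t Start = Begin;
  for (size_t I = Begin; I <= End; ++I) {
    bool SectionEnds =
        (I == End) || (Blocks[I + 1].Section != Blocks[I].Section);
    if (SectionEnds) {
      Ranges.push_back({Blocks[Start].Label, Blocks[I].Label});
      Start = I + 1;
    }
  }
  return Ranges;
}

int main() {
  // bb2.cold was moved into its own section, so one scope range becomes three.
  std::vector<Block> Blocks = {{"bb0", 0}, {"bb1", 0}, {"bb2.cold", 1}, {"bb3", 0}};
  for (const auto &R : splitBySection(Blocks, 0, 3))
    std::printf("[%s, %s]\n", R.first.c_str(), R.second.c_str());
  return 0;
}
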
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
index 8491d078ed89..4ccd8c96dd0d 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfCompileUnit.h
@@ -47,9 +47,9 @@ class DwarfCompileUnit final : public DwarfUnit {
unsigned UniqueID;
bool HasRangeLists = false;
- /// The attribute index of DW_AT_stmt_list in the compile unit DIE, avoiding
- /// the need to search for it in applyStmtList.
- DIE::value_iterator StmtListValue;
+ /// The start of the unit line section; this is also
+ /// reused in applyStmtList.
+ MCSymbol *LineTableStartSym;
/// Skeleton unit associated with this unit.
DwarfCompileUnit *Skeleton = nullptr;
@@ -123,6 +123,9 @@ public:
/// Apply the DW_AT_stmt_list from this compile unit to the specified DIE.
void applyStmtList(DIE &D);
+ /// Get line table start symbol for this unit.
+ MCSymbol *getLineTableStartSym() const { return LineTableStartSym; }
+
/// A pair of GlobalVariable and DIExpression.
struct GlobalExpr {
const GlobalVariable *Var;
@@ -230,6 +233,10 @@ public:
void constructAbstractSubprogramScopeDIE(LexicalScope *Scope);
+ /// Whether to use the GNU analog for a DWARF5 tag, attribute, or location
+ /// atom. Only applicable when emitting otherwise DWARF4-compliant debug info.
+ bool useGNUAnalogForDwarf5Feature() const;
+
/// This takes a DWARF 5 tag and returns it or a GNU analog.
dwarf::Tag getDwarf5OrGNUTag(dwarf::Tag Tag) const;
@@ -240,19 +247,17 @@ public:
dwarf::LocationAtom getDwarf5OrGNULocationAtom(dwarf::LocationAtom Loc) const;
/// Construct a call site entry DIE describing a call within \p Scope to a
- /// callee described by \p CalleeSP.
+ /// callee described by \p CalleeDIE.
+ /// \p CalleeDIE is a declaration or definition subprogram DIE for the callee.
+ /// For indirect calls \p CalleeDIE is set to nullptr.
/// \p IsTail specifies whether the call is a tail call.
- /// \p PCAddr (used for GDB + DWARF 4 tuning) points to the PC value after
- /// the call instruction.
- /// \p PCOffset (used for cases other than GDB + DWARF 4 tuning) must be
- /// non-zero for non-tail calls (in the case of non-gdb tuning, since for
- /// GDB + DWARF 5 tuning we still generate PC info for tail calls) or be the
- /// function-local offset to PC value after the call instruction.
+ /// \p PCAddr points to the PC value after the call instruction.
+ /// \p CallAddr points to the PC value at the call instruction (or is null).
/// \p CallReg is a register location for an indirect call. For direct calls
/// the \p CallReg is set to 0.
- DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, const DISubprogram *CalleeSP,
- bool IsTail, const MCSymbol *PCAddr,
- const MCExpr *PCOffset, unsigned CallReg);
+ DIE &constructCallSiteEntryDIE(DIE &ScopeDIE, DIE *CalleeDIE, bool IsTail,
+ const MCSymbol *PCAddr,
+ const MCSymbol *CallAddr, unsigned CallReg);
/// Construct call site parameter DIEs for the \p CallSiteDIE. The \p Params
/// were collected by the \ref collectCallSiteParameters.
/// Note: The order of parameters does not matter, since debuggers recognize
@@ -340,9 +345,6 @@ public:
/// Add a Dwarf expression attribute data and value.
void addExpr(DIELoc &Die, dwarf::Form Form, const MCExpr *Expr);
- /// Add an attribute containing an address expression to \p Die.
- void addAddressExpr(DIE &Die, dwarf::Attribute Attribute, const MCExpr *Expr);
-
void applySubprogramAttributesToDefinition(const DISubprogram *SP,
DIE &SPDie);
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
index fa6800de7955..45ed5256deb9 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.cpp
@@ -95,6 +95,10 @@ static cl::opt<bool> UseDwarfRangesBaseAddressSpecifier(
"use-dwarf-ranges-base-address-specifier", cl::Hidden,
cl::desc("Use base address specifiers in debug_ranges"), cl::init(false));
+static cl::opt<bool> EmitDwarfDebugEntryValues(
+ "emit-debug-entry-values", cl::Hidden,
+ cl::desc("Emit the debug entry values"), cl::init(false));
+
static cl::opt<bool> GenerateARangeSection("generate-arange-section",
cl::Hidden,
cl::desc("Generate dwarf aranges"),
@@ -163,6 +167,11 @@ static cl::opt<LinkageNameOption>
"Abstract subprograms")),
cl::init(DefaultLinkageNames));
+static cl::opt<unsigned> LocationAnalysisSizeLimit(
+ "singlevarlocation-input-bb-limit",
+ cl::desc("Maximum block size to analyze for single-location variables"),
+ cl::init(30000), cl::Hidden);
+
static const char *const DWARFGroupName = "dwarf";
static const char *const DWARFGroupDescription = "DWARF Emission";
static const char *const DbgTimerName = "writer";
@@ -176,11 +185,11 @@ void DebugLocDwarfExpression::emitOp(uint8_t Op, const char *Comment) {
}
void DebugLocDwarfExpression::emitSigned(int64_t Value) {
- getActiveStreamer().EmitSLEB128(Value, Twine(Value));
+ getActiveStreamer().emitSLEB128(Value, Twine(Value));
}
void DebugLocDwarfExpression::emitUnsigned(uint64_t Value) {
- getActiveStreamer().EmitULEB128(Value, Twine(Value));
+ getActiveStreamer().emitULEB128(Value, Twine(Value));
}
void DebugLocDwarfExpression::emitData1(uint8_t Value) {
@@ -189,7 +198,7 @@ void DebugLocDwarfExpression::emitData1(uint8_t Value) {
void DebugLocDwarfExpression::emitBaseTypeRef(uint64_t Idx) {
assert(Idx < (1ULL << (ULEB128PadSize * 7)) && "Idx wont fit");
- getActiveStreamer().EmitULEB128(Idx, Twine(Idx), ULEB128PadSize);
+ getActiveStreamer().emitULEB128(Idx, Twine(Idx), ULEB128PadSize);
}
bool DebugLocDwarfExpression::isFrameRegister(const TargetRegisterInfo &TRI,
@@ -232,26 +241,26 @@ const DIType *DbgVariable::getType() const {
static DbgValueLoc getDebugLocValue(const MachineInstr *MI) {
const DIExpression *Expr = MI->getDebugExpression();
assert(MI->getNumOperands() == 4);
- if (MI->getOperand(0).isReg()) {
- auto RegOp = MI->getOperand(0);
- auto Op1 = MI->getOperand(1);
+ if (MI->getDebugOperand(0).isReg()) {
+ auto RegOp = MI->getDebugOperand(0);
+ auto Op1 = MI->getDebugOffset();
// If the second operand is an immediate, this is a
// register-indirect address.
assert((!Op1.isImm() || (Op1.getImm() == 0)) && "unexpected offset");
MachineLocation MLoc(RegOp.getReg(), Op1.isImm());
return DbgValueLoc(Expr, MLoc);
}
- if (MI->getOperand(0).isTargetIndex()) {
- auto Op = MI->getOperand(0);
+ if (MI->getDebugOperand(0).isTargetIndex()) {
+ auto Op = MI->getDebugOperand(0);
return DbgValueLoc(Expr,
TargetIndexLocation(Op.getIndex(), Op.getOffset()));
}
- if (MI->getOperand(0).isImm())
- return DbgValueLoc(Expr, MI->getOperand(0).getImm());
- if (MI->getOperand(0).isFPImm())
- return DbgValueLoc(Expr, MI->getOperand(0).getFPImm());
- if (MI->getOperand(0).isCImm())
- return DbgValueLoc(Expr, MI->getOperand(0).getCImm());
+ if (MI->getDebugOperand(0).isImm())
+ return DbgValueLoc(Expr, MI->getDebugOperand(0).getImm());
+ if (MI->getDebugOperand(0).isFPImm())
+ return DbgValueLoc(Expr, MI->getDebugOperand(0).getFPImm());
+ if (MI->getDebugOperand(0).isCImm())
+ return DbgValueLoc(Expr, MI->getDebugOperand(0).getCImm());
llvm_unreachable("Unexpected 4-operand DBG_VALUE instruction!");
}
@@ -419,6 +428,12 @@ DwarfDebug::DwarfDebug(AsmPrinter *A, Module *M)
// a monolithic string offsets table without any header.
UseSegmentedStringOffsetsTable = DwarfVersion >= 5;
+ // Emit call-site-param debug info for GDB and LLDB, if the target supports
+ // the debug entry values feature. It can also be enabled explicitly.
+ EmitDebugEntryValues = (Asm->TM.Options.ShouldEmitDebugEntryValues() &&
+ (tuneForGDB() || tuneForLLDB())) ||
+ EmitDwarfDebugEntryValues;
+
Asm->OutStreamer->getContext().setDwarfVersion(DwarfVersion);
}
@@ -548,11 +563,214 @@ DIE &DwarfDebug::constructSubprogramDefinitionDIE(const DISubprogram *SP) {
return *CU.getOrCreateSubprogramDIE(SP);
}
+/// Represents a parameter whose call site value can be described by applying a
+/// debug expression to a register in the forwarded register worklist.
+struct FwdRegParamInfo {
+ /// The described parameter register.
+ unsigned ParamReg;
+
+ /// Debug expression that has been built up when walking through the
+ /// instruction chain that produces the parameter's value.
+ const DIExpression *Expr;
+};
+
+/// Register worklist for finding call site values.
+using FwdRegWorklist = MapVector<unsigned, SmallVector<FwdRegParamInfo, 2>>;
+
+/// Append the expression \p Addition to \p Original and return the result.
+static const DIExpression *combineDIExpressions(const DIExpression *Original,
+ const DIExpression *Addition) {
+ std::vector<uint64_t> Elts = Addition->getElements().vec();
+ // Avoid multiple DW_OP_stack_values.
+ if (Original->isImplicit() && Addition->isImplicit())
+ erase_if(Elts, [](uint64_t Op) { return Op == dwarf::DW_OP_stack_value; });
+ const DIExpression *CombinedExpr =
+ (Elts.size() > 0) ? DIExpression::append(Original, Elts) : Original;
+ return CombinedExpr;
+}
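Illustrative sketch (editorial, not part of the patch) of what combineDIExpressions yields; the operands below are hypothetical:

    Original: !DIExpression(DW_OP_plus_uconst, 16)
    Addition: !DIExpression(DW_OP_constu, 4, DW_OP_minus, DW_OP_stack_value)
    Combined: !DIExpression(DW_OP_plus_uconst, 16, DW_OP_constu, 4,
                            DW_OP_minus, DW_OP_stack_value)

When both expressions are implicit, the DW_OP_stack_value contributed by the addition is dropped before appending, so the combined expression carries only one.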
+
+/// Emit call site parameter entries that are described by the given value and
+/// debug expression.
+template <typename ValT>
+static void finishCallSiteParams(ValT Val, const DIExpression *Expr,
+ ArrayRef<FwdRegParamInfo> DescribedParams,
+ ParamSet &Params) {
+ for (auto Param : DescribedParams) {
+ bool ShouldCombineExpressions = Expr && Param.Expr->getNumElements() > 0;
+
+    // TODO: Entry value operations cannot currently be combined with any
+    // other expressions, so we can't emit call site entries in those cases.
+ if (ShouldCombineExpressions && Expr->isEntryValue())
+ continue;
+
+ // If a parameter's call site value is produced by a chain of
+ // instructions we may have already created an expression for the
+ // parameter when walking through the instructions. Append that to the
+ // base expression.
+ const DIExpression *CombinedExpr =
+ ShouldCombineExpressions ? combineDIExpressions(Expr, Param.Expr)
+ : Expr;
+ assert((!CombinedExpr || CombinedExpr->isValid()) &&
+ "Combined debug expression is invalid");
+
+ DbgValueLoc DbgLocVal(CombinedExpr, Val);
+ DbgCallSiteParam CSParm(Param.ParamReg, DbgLocVal);
+ Params.push_back(CSParm);
+ ++NumCSParams;
+ }
+}
+
+/// Add \p Reg to the worklist, if it's not already present, and mark that the
+/// given parameter registers' values can (potentially) be described using
+/// that register and a debug expression.
+static void addToFwdRegWorklist(FwdRegWorklist &Worklist, unsigned Reg,
+ const DIExpression *Expr,
+ ArrayRef<FwdRegParamInfo> ParamsToAdd) {
+ auto I = Worklist.insert({Reg, {}});
+ auto &ParamsForFwdReg = I.first->second;
+ for (auto Param : ParamsToAdd) {
+ assert(none_of(ParamsForFwdReg,
+ [Param](const FwdRegParamInfo &D) {
+ return D.ParamReg == Param.ParamReg;
+ }) &&
+ "Same parameter described twice by forwarding reg");
+
+ // If a parameter's call site value is produced by a chain of
+ // instructions we may have already created an expression for the
+ // parameter when walking through the instructions. Append that to the
+ // new expression.
+ const DIExpression *CombinedExpr = combineDIExpressions(Expr, Param.Expr);
+ ParamsForFwdReg.push_back({Param.ParamReg, CombinedExpr});
+ }
+}
+
+/// Interpret values loaded into registers by \p CurMI.
+static void interpretValues(const MachineInstr *CurMI,
+ FwdRegWorklist &ForwardedRegWorklist,
+ ParamSet &Params) {
+
+ const MachineFunction *MF = CurMI->getMF();
+ const DIExpression *EmptyExpr =
+ DIExpression::get(MF->getFunction().getContext(), {});
+ const auto &TRI = *MF->getSubtarget().getRegisterInfo();
+ const auto &TII = *MF->getSubtarget().getInstrInfo();
+ const auto &TLI = *MF->getSubtarget().getTargetLowering();
+
+ // If an instruction defines more than one item in the worklist, we may run
+ // into situations where a worklist register's value is (potentially)
+ // described by the previous value of another register that is also defined
+ // by that instruction.
+ //
+ // This can for example occur in cases like this:
+ //
+ // $r1 = mov 123
+ // $r0, $r1 = mvrr $r1, 456
+ // call @foo, $r0, $r1
+ //
+ // When describing $r1's value for the mvrr instruction, we need to make sure
+ // that we don't finalize an entry value for $r0, as that is dependent on the
+ // previous value of $r1 (123 rather than 456).
+ //
+ // In order to not have to distinguish between those cases when finalizing
+ // entry values, we simply postpone adding new parameter registers to the
+ // worklist, by first keeping them in this temporary container until the
+ // instruction has been handled.
+ FwdRegWorklist TmpWorklistItems;
+
+ // If the MI is an instruction defining one or more parameters' forwarding
+ // registers, add those defines.
+ auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI,
+ SmallSetVector<unsigned, 4> &Defs) {
+ if (MI.isDebugInstr())
+ return;
+
+ for (const MachineOperand &MO : MI.operands()) {
+ if (MO.isReg() && MO.isDef() &&
+ Register::isPhysicalRegister(MO.getReg())) {
+ for (auto FwdReg : ForwardedRegWorklist)
+ if (TRI.regsOverlap(FwdReg.first, MO.getReg()))
+ Defs.insert(FwdReg.first);
+ }
+ }
+ };
+
+ // Set of worklist registers that are defined by this instruction.
+ SmallSetVector<unsigned, 4> FwdRegDefs;
+
+ getForwardingRegsDefinedByMI(*CurMI, FwdRegDefs);
+ if (FwdRegDefs.empty())
+ return;
+
+ for (auto ParamFwdReg : FwdRegDefs) {
+ if (auto ParamValue = TII.describeLoadedValue(*CurMI, ParamFwdReg)) {
+ if (ParamValue->first.isImm()) {
+ int64_t Val = ParamValue->first.getImm();
+ finishCallSiteParams(Val, ParamValue->second,
+ ForwardedRegWorklist[ParamFwdReg], Params);
+ } else if (ParamValue->first.isReg()) {
+ Register RegLoc = ParamValue->first.getReg();
+ unsigned SP = TLI.getStackPointerRegisterToSaveRestore();
+ Register FP = TRI.getFrameRegister(*MF);
+ bool IsSPorFP = (RegLoc == SP) || (RegLoc == FP);
+ if (TRI.isCalleeSavedPhysReg(RegLoc, *MF) || IsSPorFP) {
+ MachineLocation MLoc(RegLoc, /*IsIndirect=*/IsSPorFP);
+ finishCallSiteParams(MLoc, ParamValue->second,
+ ForwardedRegWorklist[ParamFwdReg], Params);
+ } else {
+ // ParamFwdReg was described by the non-callee saved register
+ // RegLoc. Mark that the call site values for the parameters are
+ // dependent on that register instead of ParamFwdReg. Since RegLoc
+ // may be a register that will be handled in this iteration, we
+ // postpone adding the items to the worklist, and instead keep them
+ // in a temporary container.
+ addToFwdRegWorklist(TmpWorklistItems, RegLoc, ParamValue->second,
+ ForwardedRegWorklist[ParamFwdReg]);
+ }
+ }
+ }
+ }
+
+ // Remove all registers that this instruction defines from the worklist.
+ for (auto ParamFwdReg : FwdRegDefs)
+ ForwardedRegWorklist.erase(ParamFwdReg);
+
+ // Now that we are done handling this instruction, add items from the
+ // temporary worklist to the real one.
+ for (auto New : TmpWorklistItems)
+ addToFwdRegWorklist(ForwardedRegWorklist, New.first, EmptyExpr, New.second);
+ TmpWorklistItems.clear();
+}
+
+static bool interpretNextInstr(const MachineInstr *CurMI,
+ FwdRegWorklist &ForwardedRegWorklist,
+ ParamSet &Params) {
+ // Skip bundle headers.
+ if (CurMI->isBundle())
+ return true;
+
+  // Stop if the next instruction is a call, since we cannot interpret the
+  // parameters' forwarding registers across it, or if we have already
+  // finished interpreting all of the parameters.
+ if (CurMI->isCall())
+ return false;
+
+ if (ForwardedRegWorklist.empty())
+ return false;
+
+  // Nothing to describe if the instruction has no operands (e.g. a NOP).
+ if (CurMI->getNumOperands() == 0)
+ return true;
+
+ interpretValues(CurMI, ForwardedRegWorklist, Params);
+
+ return true;
+}
+
/// Try to interpret values loaded into registers that forward parameters
/// for \p CallMI. Store parameters with interpreted value into \p Params.
static void collectCallSiteParameters(const MachineInstr *CallMI,
ParamSet &Params) {
- auto *MF = CallMI->getMF();
+ const MachineFunction *MF = CallMI->getMF();
auto CalleesMap = MF->getCallSitesInfo();
auto CallFwdRegsInfo = CalleesMap.find(CallMI);
@@ -560,18 +778,21 @@ static void collectCallSiteParameters(const MachineInstr *CallMI,
if (CallFwdRegsInfo == CalleesMap.end())
return;
- auto *MBB = CallMI->getParent();
- const auto &TRI = MF->getSubtarget().getRegisterInfo();
- const auto &TII = MF->getSubtarget().getInstrInfo();
- const auto &TLI = MF->getSubtarget().getTargetLowering();
+ const MachineBasicBlock *MBB = CallMI->getParent();
// Skip the call instruction.
auto I = std::next(CallMI->getReverseIterator());
- DenseSet<unsigned> ForwardedRegWorklist;
+ FwdRegWorklist ForwardedRegWorklist;
+
+ const DIExpression *EmptyExpr =
+ DIExpression::get(MF->getFunction().getContext(), {});
+
// Add all the forwarding registers into the ForwardedRegWorklist.
for (auto ArgReg : CallFwdRegsInfo->second) {
- bool InsertedReg = ForwardedRegWorklist.insert(ArgReg.Reg).second;
+ bool InsertedReg =
+ ForwardedRegWorklist.insert({ArgReg.Reg, {{ArgReg.Reg, EmptyExpr}}})
+ .second;
assert(InsertedReg && "Single register used to forward two arguments?");
(void)InsertedReg;
}
@@ -581,107 +802,29 @@ static void collectCallSiteParameters(const MachineInstr *CallMI,
// the describeLoadedValue()). For those remaining arguments in the working
// list, for which we do not describe a loaded value by
// the describeLoadedValue(), we try to generate an entry value expression
- // for their call site value desctipion, if the call is within the entry MBB.
- // The RegsForEntryValues maps a forwarding register into the register holding
- // the entry value.
+ // for their call site value description, if the call is within the entry MBB.
// TODO: Handle situations when call site parameter value can be described
- // as the entry value within basic blocks other then the first one.
+ // as the entry value within basic blocks other than the first one.
bool ShouldTryEmitEntryVals = MBB->getIterator() == MF->begin();
- DenseMap<unsigned, unsigned> RegsForEntryValues;
- // If the MI is an instruction defining one or more parameters' forwarding
- // registers, add those defines. We can currently only describe forwarded
- // registers that are explicitly defined, but keep track of implicit defines
- // also to remove those registers from the work list.
- auto getForwardingRegsDefinedByMI = [&](const MachineInstr &MI,
- SmallVectorImpl<unsigned> &Explicit,
- SmallVectorImpl<unsigned> &Implicit) {
- if (MI.isDebugInstr())
+  // Search for values loaded into the forwarding registers inside the call's delay slot.
+ if (CallMI->hasDelaySlot()) {
+ auto Suc = std::next(CallMI->getIterator());
+ // Only one-instruction delay slot is supported.
+ auto BundleEnd = llvm::getBundleEnd(CallMI->getIterator());
+ (void)BundleEnd;
+ assert(std::next(Suc) == BundleEnd &&
+ "More than one instruction in call delay slot");
+    // Try to interpret the value loaded by the delay slot instruction.
+ if (!interpretNextInstr(&*Suc, ForwardedRegWorklist, Params))
return;
-
- for (const MachineOperand &MO : MI.operands()) {
- if (MO.isReg() && MO.isDef() &&
- Register::isPhysicalRegister(MO.getReg())) {
- for (auto FwdReg : ForwardedRegWorklist) {
- if (TRI->regsOverlap(FwdReg, MO.getReg())) {
- if (MO.isImplicit())
- Implicit.push_back(FwdReg);
- else
- Explicit.push_back(FwdReg);
- }
- }
- }
- }
- };
-
- auto finishCallSiteParam = [&](DbgValueLoc DbgLocVal, unsigned Reg) {
- unsigned FwdReg = Reg;
- if (ShouldTryEmitEntryVals) {
- auto EntryValReg = RegsForEntryValues.find(Reg);
- if (EntryValReg != RegsForEntryValues.end())
- FwdReg = EntryValReg->second;
- }
-
- DbgCallSiteParam CSParm(FwdReg, DbgLocVal);
- Params.push_back(CSParm);
- ++NumCSParams;
- };
+ }
// Search for a loading value in forwarding registers.
for (; I != MBB->rend(); ++I) {
- // Skip bundle headers.
- if (I->isBundle())
- continue;
-
- // If the next instruction is a call we can not interpret parameter's
- // forwarding registers or we finished the interpretation of all parameters.
- if (I->isCall())
- return;
-
- if (ForwardedRegWorklist.empty())
+    // Try to interpret the values loaded by this instruction.
+ if (!interpretNextInstr(&*I, ForwardedRegWorklist, Params))
return;
-
- SmallVector<unsigned, 4> ExplicitFwdRegDefs;
- SmallVector<unsigned, 4> ImplicitFwdRegDefs;
- getForwardingRegsDefinedByMI(*I, ExplicitFwdRegDefs, ImplicitFwdRegDefs);
- if (ExplicitFwdRegDefs.empty() && ImplicitFwdRegDefs.empty())
- continue;
-
- // If the MI clobbers more then one forwarding register we must remove
- // all of them from the working list.
- for (auto Reg : concat<unsigned>(ExplicitFwdRegDefs, ImplicitFwdRegDefs))
- ForwardedRegWorklist.erase(Reg);
-
- for (auto ParamFwdReg : ExplicitFwdRegDefs) {
- if (auto ParamValue = TII->describeLoadedValue(*I, ParamFwdReg)) {
- if (ParamValue->first.isImm()) {
- int64_t Val = ParamValue->first.getImm();
- DbgValueLoc DbgLocVal(ParamValue->second, Val);
- finishCallSiteParam(DbgLocVal, ParamFwdReg);
- } else if (ParamValue->first.isReg()) {
- Register RegLoc = ParamValue->first.getReg();
- // TODO: For now, there is no use of describing the value loaded into the
- // register that is also the source registers (e.g. $r0 = add $r0, x).
- if (ParamFwdReg == RegLoc)
- continue;
-
- unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
- Register FP = TRI->getFrameRegister(*MF);
- bool IsSPorFP = (RegLoc == SP) || (RegLoc == FP);
- if (TRI->isCalleeSavedPhysReg(RegLoc, *MF) || IsSPorFP) {
- DbgValueLoc DbgLocVal(ParamValue->second,
- MachineLocation(RegLoc,
- /*IsIndirect=*/IsSPorFP));
- finishCallSiteParam(DbgLocVal, ParamFwdReg);
- // TODO: Add support for entry value plus an expression.
- } else if (ShouldTryEmitEntryVals &&
- ParamValue->second->getNumElements() == 0) {
- ForwardedRegWorklist.insert(RegLoc);
- RegsForEntryValues[RegLoc] = ParamFwdReg;
- }
- }
- }
- }
}
// Emit the call site parameter's value as an entry value.
@@ -690,15 +833,8 @@ static void collectCallSiteParameters(const MachineInstr *CallMI,
DIExpression *EntryExpr = DIExpression::get(
MF->getFunction().getContext(), {dwarf::DW_OP_LLVM_entry_value, 1});
for (auto RegEntry : ForwardedRegWorklist) {
- unsigned FwdReg = RegEntry;
- auto EntryValReg = RegsForEntryValues.find(RegEntry);
- if (EntryValReg != RegsForEntryValues.end())
- FwdReg = EntryValReg->second;
-
- DbgValueLoc DbgLocVal(EntryExpr, MachineLocation(RegEntry));
- DbgCallSiteParam CSParm(FwdReg, DbgLocVal);
- Params.push_back(CSParm);
- ++NumCSParams;
+ MachineLocation MLoc(RegEntry.first);
+ finishCallSiteParams(MLoc, EntryExpr, RegEntry.second, Params);
}
}
}
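Each (parameter register, value) pair collected above becomes a call site parameter child of the call site entry. A rough, hedged sketch of the DWARF 5 shape (the register number is hypothetical; pre-DWARF 5 tunings use the GNU analogues):

    DW_TAG_call_site_parameter
      DW_AT_location    (DW_OP_reg5)
      DW_AT_call_value  (DW_OP_entry_value(DW_OP_reg5))

DW_AT_location names the parameter's forwarding register; DW_AT_call_value holds the interpreted value, here an entry value of that same register as produced by the loop directly above.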
@@ -719,7 +855,25 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
assert(TII && "TargetInstrInfo not found: cannot label tail calls");
- bool ApplyGNUExtensions = getDwarfVersion() == 4 && tuneForGDB();
+
+ // Delay slot support check.
+ auto delaySlotSupported = [&](const MachineInstr &MI) {
+ if (!MI.isBundledWithSucc())
+ return false;
+ auto Suc = std::next(MI.getIterator());
+ auto CallInstrBundle = getBundleStart(MI.getIterator());
+ (void)CallInstrBundle;
+ auto DelaySlotBundle = getBundleStart(Suc);
+ (void)DelaySlotBundle;
+    // Ensure that the label after the call follows the delay slot instruction.
+ // Ex. CALL_INSTRUCTION {
+ // DELAY_SLOT_INSTRUCTION }
+ // LABEL_AFTER_CALL
+ assert(getLabelAfterInsn(&*CallInstrBundle) ==
+ getLabelAfterInsn(&*DelaySlotBundle) &&
+ "Call and its successor instruction don't have same label after.");
+ return true;
+ };
// Emit call site entries for each call or tail call in the function.
for (const MachineBasicBlock &MBB : MF) {
@@ -732,11 +886,16 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
// Skip instructions which aren't calls. Both calls and tail-calling jump
// instructions (e.g TAILJMPd64) are classified correctly here.
- if (!MI.isCall())
+ if (!MI.isCandidateForCallSiteEntry())
+ continue;
+
+ // Skip instructions marked as frame setup, as they are not interesting to
+ // the user.
+ if (MI.getFlag(MachineInstr::FrameSetup))
continue;
- // TODO: Add support for targets with delay slots (see: beginInstruction).
- if (MI.hasDelaySlot())
+      // Bail out if the call has a delay slot that is not supported, i.e. the
+      // delay slot instruction is not bundled with the call.
+ if (MI.hasDelaySlot() && !delaySlotSupported(*&MI))
return;
// If this is a direct call, find the callee's subprogram.
@@ -747,7 +906,7 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
continue;
unsigned CallReg = 0;
- const DISubprogram *CalleeSP = nullptr;
+ DIE *CalleeDIE = nullptr;
const Function *CalleeDecl = nullptr;
if (CalleeOp.isReg()) {
CallReg = CalleeOp.getReg();
@@ -757,18 +916,19 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
CalleeDecl = dyn_cast<Function>(CalleeOp.getGlobal());
if (!CalleeDecl || !CalleeDecl->getSubprogram())
continue;
- CalleeSP = CalleeDecl->getSubprogram();
+ const DISubprogram *CalleeSP = CalleeDecl->getSubprogram();
if (CalleeSP->isDefinition()) {
// Ensure that a subprogram DIE for the callee is available in the
// appropriate CU.
- constructSubprogramDefinitionDIE(CalleeSP);
+ CalleeDIE = &constructSubprogramDefinitionDIE(CalleeSP);
} else {
// Create the declaration DIE if it is missing. This is required to
// support compilation of old bitcode with an incomplete list of
// retained metadata.
- CU.getOrCreateSubprogramDIE(CalleeSP);
+ CalleeDIE = CU.getOrCreateSubprogramDIE(CalleeSP);
}
+ assert(CalleeDIE && "Must have a DIE for the callee");
}
// TODO: Omit call site entries for runtime calls (objc_msgSend, etc).
@@ -781,25 +941,21 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
const MachineInstr *TopLevelCallMI =
MI.isInsideBundle() ? &*getBundleStart(MI.getIterator()) : &MI;
- // For tail calls, for non-gdb tuning, no return PC information is needed.
- // For regular calls (and tail calls in GDB tuning), the return PC
- // is needed to disambiguate paths in the call graph which could lead to
- // some target function.
- const MCExpr *PCOffset =
- (IsTail && !tuneForGDB())
- ? nullptr
- : getFunctionLocalOffsetAfterInsn(TopLevelCallMI);
-
- // Return address of a call-like instruction for a normal call or a
- // jump-like instruction for a tail call. This is needed for
- // GDB + DWARF 4 tuning.
+ // For non-tail calls, the return PC is needed to disambiguate paths in
+ // the call graph which could lead to some target function. For tail
+ // calls, no return PC information is needed, unless tuning for GDB in
+ // DWARF4 mode in which case we fake a return PC for compatibility.
const MCSymbol *PCAddr =
- ApplyGNUExtensions
+ (!IsTail || CU.useGNUAnalogForDwarf5Feature())
? const_cast<MCSymbol *>(getLabelAfterInsn(TopLevelCallMI))
: nullptr;
- assert((IsTail || PCOffset || PCAddr) &&
- "Call without return PC information");
+ // For tail calls, it's necessary to record the address of the branch
+ // instruction so that the debugger can show where the tail call occurred.
+ const MCSymbol *CallAddr =
+ IsTail ? getLabelBeforeInsn(TopLevelCallMI) : nullptr;
+
+ assert((IsTail || PCAddr) && "Non-tail call without return PC");
LLVM_DEBUG(dbgs() << "CallSiteEntry: " << MF.getName() << " -> "
<< (CalleeDecl ? CalleeDecl->getName()
@@ -808,13 +964,11 @@ void DwarfDebug::constructCallSiteEntryDIEs(const DISubprogram &SP,
->getName(CallReg)))
<< (IsTail ? " [IsTail]" : "") << "\n");
- DIE &CallSiteDIE =
- CU.constructCallSiteEntryDIE(ScopeDIE, CalleeSP, IsTail, PCAddr,
- PCOffset, CallReg);
+ DIE &CallSiteDIE = CU.constructCallSiteEntryDIE(
+ ScopeDIE, CalleeDIE, IsTail, PCAddr, CallAddr, CallReg);
- // GDB and LLDB support call site parameter debug info.
- if (Asm->TM.Options.EnableDebugEntryValues &&
- (tuneForGDB() || tuneForLLDB())) {
+ // Optionally emit call-site-param debug info.
+ if (emitDebugEntryValues()) {
ParamSet Params;
// Try to interpret values of call site parameters.
collectCallSiteParameters(&MI, Params);
@@ -847,6 +1001,12 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit,
NewCU.addUInt(Die, dwarf::DW_AT_language, dwarf::DW_FORM_data2,
DIUnit->getSourceLanguage());
NewCU.addString(Die, dwarf::DW_AT_name, FN);
+ StringRef SysRoot = DIUnit->getSysRoot();
+ if (!SysRoot.empty())
+ NewCU.addString(Die, dwarf::DW_AT_LLVM_sysroot, SysRoot);
+ StringRef SDK = DIUnit->getSDK();
+ if (!SDK.empty())
+ NewCU.addString(Die, dwarf::DW_AT_APPLE_sdk, SDK);
// Add DW_str_offsets_base to the unit DIE, except for split units.
if (useSegmentedStringOffsetsTable() && !useSplitDwarf())
@@ -859,7 +1019,6 @@ void DwarfDebug::finishUnitAttributes(const DICompileUnit *DIUnit,
// skeleton CU and so we don't need to duplicate it here.
if (!CompilationDir.empty())
NewCU.addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
-
addGnuPubAttributes(NewCU, Die);
}
@@ -1175,8 +1334,7 @@ void DwarfDebug::finalizeModuleInfo() {
// We don't keep track of which addresses are used in which CU so this
// is a bit pessimistic under LTO.
- if ((!AddrPool.isEmpty() || TheCU.hasRangeLists()) &&
- (getDwarfVersion() >= 5 || HasSplitUnit))
+ if ((HasSplitUnit || getDwarfVersion() >= 5) && !AddrPool.isEmpty())
U.addAddrTableBase();
if (getDwarfVersion() >= 5) {
@@ -1192,18 +1350,31 @@ void DwarfDebug::finalizeModuleInfo() {
}
auto *CUNode = cast<DICompileUnit>(P.first);
- // If compile Unit has macros, emit "DW_AT_macro_info" attribute.
+ // If compile Unit has macros, emit "DW_AT_macro_info/DW_AT_macros"
+ // attribute.
if (CUNode->getMacros()) {
- if (useSplitDwarf())
- TheCU.addSectionDelta(TheCU.getUnitDie(), dwarf::DW_AT_macro_info,
+ if (getDwarfVersion() >= 5) {
+ if (useSplitDwarf())
+ TheCU.addSectionDelta(
+ TheCU.getUnitDie(), dwarf::DW_AT_macros, U.getMacroLabelBegin(),
+ TLOF.getDwarfMacroDWOSection()->getBeginSymbol());
+ else
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macros,
U.getMacroLabelBegin(),
- TLOF.getDwarfMacinfoDWOSection()->getBeginSymbol());
- else
- U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macro_info,
- U.getMacroLabelBegin(),
- TLOF.getDwarfMacinfoSection()->getBeginSymbol());
+ TLOF.getDwarfMacroSection()->getBeginSymbol());
+ } else {
+ if (useSplitDwarf())
+ TheCU.addSectionDelta(
+ TheCU.getUnitDie(), dwarf::DW_AT_macro_info,
+ U.getMacroLabelBegin(),
+ TLOF.getDwarfMacinfoDWOSection()->getBeginSymbol());
+ else
+ U.addSectionLabel(U.getUnitDie(), dwarf::DW_AT_macro_info,
+ U.getMacroLabelBegin(),
+ TLOF.getDwarfMacinfoSection()->getBeginSymbol());
+ }
+ }
}
- }
// Emit all frontend-produced Skeleton CUs, i.e., Clang modules.
for (auto *CUNode : MMI->getModule()->debug_compile_units())
@@ -1235,8 +1406,6 @@ void DwarfDebug::endModule() {
// Finalize the debug info for the module.
finalizeModuleInfo();
- emitDebugStr();
-
if (useSplitDwarf())
// Emit debug_loc.dwo/debug_loclists.dwo section.
emitDebugLocDWO();
@@ -1261,9 +1430,11 @@ void DwarfDebug::endModule() {
// Emit info into a debug macinfo.dwo section.
emitDebugMacinfoDWO();
else
- // Emit info into a debug macinfo section.
+ // Emit info into a debug macinfo/macro section.
emitDebugMacinfo();
+ emitDebugStr();
+
if (useSplitDwarf()) {
emitDebugStrDWO();
emitDebugInfoDWO();
@@ -1322,6 +1493,7 @@ void DwarfDebug::ensureAbstractEntityIsCreatedIfScoped(DwarfCompileUnit &CU,
void DwarfDebug::collectVariableInfoFromMFTable(
DwarfCompileUnit &TheCU, DenseSet<InlinedEntity> &Processed) {
SmallDenseMap<InlinedEntity, DbgVariable *> MFVars;
+ LLVM_DEBUG(dbgs() << "DwarfDebug: collecting variables from MF side table\n");
for (const auto &VI : Asm->MF->getVariableDbgInfo()) {
if (!VI.Var)
continue;
@@ -1333,13 +1505,18 @@ void DwarfDebug::collectVariableInfoFromMFTable(
LexicalScope *Scope = LScopes.findLexicalScope(VI.Loc);
// If variable scope is not found then skip this variable.
- if (!Scope)
+ if (!Scope) {
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << VI.Var->getName()
+ << ", no variable scope found\n");
continue;
+ }
ensureAbstractEntityIsCreatedIfScoped(TheCU, Var.first, Scope->getScopeNode());
auto RegVar = std::make_unique<DbgVariable>(
cast<DILocalVariable>(Var.first), Var.second);
RegVar->initializeMMI(VI.Expr, VI.Slot);
+ LLVM_DEBUG(dbgs() << "Created DbgVariable for " << VI.Var->getName()
+ << "\n");
if (DbgVariable *DbgVar = MFVars.lookup(Var))
DbgVar->addMMIEntry(*RegVar);
else if (InfoHolder.addScopeVariable(Scope, RegVar.get())) {
@@ -1367,11 +1544,20 @@ static bool validThroughout(LexicalScopes &LScopes,
if (LSRange.size() == 0)
return false;
+
// Determine if the DBG_VALUE is valid at the beginning of its lexical block.
const MachineInstr *LScopeBegin = LSRange.front().first;
// Early exit if the lexical scope begins outside of the current block.
if (LScopeBegin->getParent() != MBB)
return false;
+
+ // If there are instructions belonging to our scope in another block, and
+ // we're not a constant (see DWARF2 comment below), then we can't be
+ // validThroughout.
+ const MachineInstr *LScopeEnd = LSRange.back().second;
+ if (RangeEnd && LScopeEnd->getParent() != MBB)
+ return false;
+
MachineBasicBlock::const_reverse_iterator Pred(DbgValue);
for (++Pred; Pred != MBB->rend(); ++Pred) {
if (Pred->getFlag(MachineInstr::FrameSetup))
@@ -1392,19 +1578,35 @@ static bool validThroughout(LexicalScopes &LScopes,
if (!RangeEnd)
return true;
- // Fail if there are instructions belonging to our scope in another block.
- const MachineInstr *LScopeEnd = LSRange.back().second;
- if (LScopeEnd->getParent() != MBB)
- return false;
-
// Single, constant DBG_VALUEs in the prologue are promoted to be live
// throughout the function. This is a hack, presumably for DWARF v2 and not
// necessarily correct. It would be much better to use a dbg.declare instead
// if we know the constant is live throughout the scope.
- if (DbgValue->getOperand(0).isImm() && MBB->pred_empty())
+ if (DbgValue->getDebugOperand(0).isImm() && MBB->pred_empty())
return true;
- return false;
+ // Now check for situations where an "open-ended" DBG_VALUE isn't enough to
+ // determine eligibility for a single location, e.g. nested scopes, inlined
+ // functions.
+ // FIXME: For now we just handle a simple (but common) case where the scope
+ // is contained in MBB. We could be smarter here.
+ //
+ // At this point we know that our scope ends in MBB. So, if RangeEnd exists
+ // outside of the block we can ignore it; the location is just leaking outside
+ // its scope.
+ assert(LScopeEnd->getParent() == MBB && "Scope ends outside MBB");
+ if (RangeEnd->getParent() != DbgValue->getParent())
+ return true;
+
+  // The location range and the variable's enclosing scope are both contained
+  // within MBB; test whether the location terminates before the end of the scope.
+ for (auto I = RangeEnd->getIterator(); I != MBB->end(); ++I)
+ if (&*I == LScopeEnd)
+ return false;
+
+ // There's a single location which starts at the scope start, and ends at or
+ // after the scope end.
+ return true;
}
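A hedged sketch of the new single-block check; the block contents and registers are hypothetical:

    bb.0:
      DBG_VALUE $edi, ...    <- DbgValue: the open-ended location begins here
      ...
      $edi = MOV32ri 0       <- RangeEnd: a clobber (or new DBG_VALUE) ends it
      ...
      <last instruction of the lexical scope>   <- LScopeEnd

If the forward scan from RangeEnd reaches LScopeEnd within bb.0, the location does not cover the whole scope and validThroughout returns false; when RangeEnd lies in a different block, the location merely leaks past the scope and the single-location form is still acceptable.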
/// Build the location list for all DBG_VALUEs in the function that
@@ -1440,8 +1642,10 @@ static bool validThroughout(LexicalScopes &LScopes,
// [1-3) [(reg0, fragment 0, 32), (reg1, fragment 32, 32)]
// [3-4) [(reg1, fragment 32, 32), (123, fragment 64, 32)]
// [4-) [(@g, fragment 0, 96)]
-bool DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
- const DbgValueHistoryMap::Entries &Entries) {
+bool DwarfDebug::buildLocationList(
+ SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::Entries &Entries,
+ DenseSet<const MachineBasicBlock *> &VeryLargeBlocks) {
using OpenRange =
std::pair<DbgValueHistoryMap::EntryIndex, DbgValueLoc>;
SmallVector<OpenRange, 4> OpenRanges;
@@ -1467,7 +1671,8 @@ bool DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
const MCSymbol *EndLabel;
if (std::next(EI) == Entries.end()) {
- EndLabel = Asm->getFunctionEnd();
+ const MachineBasicBlock &EndMBB = Asm->MF->back();
+ EndLabel = Asm->MBBSectionRanges[EndMBB.getSectionIDNum()].EndLabel;
if (EI->isClobber())
EndMI = EI->getInstr();
}
@@ -1536,8 +1741,14 @@ bool DwarfDebug::buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
DebugLoc.pop_back();
}
- return DebugLoc.size() == 1 && isSafeForSingleLocation &&
- validThroughout(LScopes, StartDebugMI, EndMI);
+ // If there's a single entry, safe for a single location, and not part of
+ // an over-sized basic block, then ask validThroughout whether this
+ // location can be represented as a single variable location.
+ if (DebugLoc.size() != 1 || !isSafeForSingleLocation)
+ return false;
+ if (VeryLargeBlocks.count(StartDebugMI->getParent()))
+ return false;
+ return validThroughout(LScopes, StartDebugMI, EndMI);
}
DbgEntity *DwarfDebug::createConcreteEntity(DwarfCompileUnit &TheCU,
@@ -1569,6 +1780,13 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
// Grab the variable info that was squirreled away in the MMI side-table.
collectVariableInfoFromMFTable(TheCU, Processed);
+ // Identify blocks that are unreasonably sized, so that we can later
+ // skip lexical scope analysis over them.
+ DenseSet<const MachineBasicBlock *> VeryLargeBlocks;
+ for (const auto &MBB : *CurFn)
+ if (MBB.size() > LocationAnalysisSizeLimit)
+ VeryLargeBlocks.insert(&MBB);
+
for (const auto &I : DbgValues) {
InlinedEntity IV = I.first;
if (Processed.count(IV))
@@ -1605,7 +1823,8 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
if (HistSize == 1 || SingleValueWithClobber) {
const auto *End =
SingleValueWithClobber ? HistoryMapEntries[1].getInstr() : nullptr;
- if (validThroughout(LScopes, MInsn, End)) {
+ if (VeryLargeBlocks.count(MInsn->getParent()) == 0 &&
+ validThroughout(LScopes, MInsn, End)) {
RegVar->initializeDbgValue(MInsn);
continue;
}
@@ -1620,7 +1839,8 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
// Build the location list for this variable.
SmallVector<DebugLocEntry, 8> Entries;
- bool isValidSingleLocation = buildLocationList(Entries, HistoryMapEntries);
+ bool isValidSingleLocation =
+ buildLocationList(Entries, HistoryMapEntries, VeryLargeBlocks);
// Check whether buildLocationList managed to merge all locations to one
// that is valid throughout the variable's scope. If so, produce single
@@ -1689,11 +1909,45 @@ void DwarfDebug::collectEntityInfo(DwarfCompileUnit &TheCU,
// Process beginning of an instruction.
void DwarfDebug::beginInstruction(const MachineInstr *MI) {
+ const MachineFunction &MF = *MI->getMF();
+ const auto *SP = MF.getFunction().getSubprogram();
+ bool NoDebug =
+ !SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug;
+
+ // Delay slot support check.
+ auto delaySlotSupported = [](const MachineInstr &MI) {
+ if (!MI.isBundledWithSucc())
+ return false;
+ auto Suc = std::next(MI.getIterator());
+ (void)Suc;
+    // Ensure that the delay slot instruction is the successor of the call instruction.
+ // Ex. CALL_INSTRUCTION {
+ // DELAY_SLOT_INSTRUCTION }
+ assert(Suc->isBundledWithPred() &&
+ "Call bundle instructions are out of order");
+ return true;
+ };
+
+ // When describing calls, we need a label for the call instruction.
+ if (!NoDebug && SP->areAllCallsDescribed() &&
+ MI->isCandidateForCallSiteEntry(MachineInstr::AnyInBundle) &&
+ (!MI->hasDelaySlot() || delaySlotSupported(*MI))) {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ bool IsTail = TII->isTailCall(*MI);
+ // For tail calls, we need the address of the branch instruction for
+ // DW_AT_call_pc.
+ if (IsTail)
+ requestLabelBeforeInsn(MI);
+ // For non-tail calls, we need the return address for the call for
+ // DW_AT_call_return_pc. Under GDB tuning, this information is needed for
+ // tail calls as well.
+ requestLabelAfterInsn(MI);
+ }
+
DebugHandlerBase::beginInstruction(MI);
assert(CurMI);
- const auto *SP = MI->getMF()->getFunction().getSubprogram();
- if (!SP || SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
+ if (NoDebug)
return;
// Check if source location changes, but ignore DBG_VALUE and CFI locations.
@@ -1707,11 +1961,6 @@ void DwarfDebug::beginInstruction(const MachineInstr *MI) {
unsigned LastAsmLine =
Asm->OutStreamer->getContext().getCurrentDwarfLoc().getLine();
- // Request a label after the call in order to emit AT_return_pc information
- // in call site entries. TODO: Add support for targets with delay slots.
- if (SP->areAllCallsDescribed() && MI->isCall() && !MI->hasDelaySlot())
- requestLabelAfterInsn(MI);
-
if (DL == PrevInstLoc) {
// If we have an ongoing unspecified location, nothing to do here.
if (!DL)
@@ -1810,7 +2059,7 @@ static void recordSourceLine(AsmPrinter &Asm, unsigned Line, unsigned Col,
FileNo = static_cast<DwarfCompileUnit &>(*DCUs[CUID])
.getOrCreateSourceID(Scope->getFile());
}
- Asm.OutStreamer->EmitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
+ Asm.OutStreamer->emitDwarfLocDirective(FileNo, Line, Col, Flags, 0,
Discriminator, Fn);
}
@@ -1842,9 +2091,6 @@ void DwarfDebug::beginFunctionImpl(const MachineFunction *MF) {
if (SP->getUnit()->getEmissionKind() == DICompileUnit::NoDebug)
return;
- SectionLabels.insert(std::make_pair(&Asm->getFunctionBegin()->getSection(),
- Asm->getFunctionBegin()));
-
DwarfCompileUnit &CU = getOrCreateDwarfCompileUnit(SP->getUnit());
// Set DwarfDwarfCompileUnitID in MCContext to the Compile Unit this function
@@ -1892,7 +2138,9 @@ void DwarfDebug::endFunctionImpl(const MachineFunction *MF) {
collectEntityInfo(TheCU, SP, Processed);
// Add the range of this function to the list of ranges for the CU.
- TheCU.addRange({Asm->getFunctionBegin(), Asm->getFunctionEnd()});
+ // With basic block sections, add ranges for all basic block sections.
+ for (const auto &R : Asm->MBBSectionRanges)
+ TheCU.addRange({R.second.BeginLabel, R.second.EndLabel});
// Under -gmlt, skip building the subprogram if there are no inlined
// subroutines inside it. But with -fdebug-info-for-profiling, the subprogram
@@ -2121,7 +2369,7 @@ void DwarfDebug::emitDebugPubSections() {
void DwarfDebug::emitSectionReference(const DwarfCompileUnit &CU) {
if (useSectionsAsReferences())
- Asm->EmitDwarfOffset(CU.getSection()->getBeginSymbol(),
+ Asm->emitDwarfOffset(CU.getSection()->getBeginSymbol(),
CU.getDebugSectionOffset());
else
Asm->emitDwarfSymbolReference(CU.getLabelBegin());
@@ -2137,9 +2385,9 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
Asm->OutStreamer->AddComment("Length of Public " + Name + " Info");
MCSymbol *BeginLabel = Asm->createTempSymbol("pub" + Name + "_begin");
MCSymbol *EndLabel = Asm->createTempSymbol("pub" + Name + "_end");
- Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
+ Asm->emitLabelDifference(EndLabel, BeginLabel, 4);
- Asm->OutStreamer->EmitLabel(BeginLabel);
+ Asm->OutStreamer->emitLabel(BeginLabel);
Asm->OutStreamer->AddComment("DWARF Version");
Asm->emitInt16(dwarf::DW_PUBNAMES_VERSION);
@@ -2167,12 +2415,12 @@ void DwarfDebug::emitDebugPubSection(bool GnuStyle, StringRef Name,
}
Asm->OutStreamer->AddComment("External Name");
- Asm->OutStreamer->EmitBytes(StringRef(Name, GI.getKeyLength() + 1));
+ Asm->OutStreamer->emitBytes(StringRef(Name, GI.getKeyLength() + 1));
}
Asm->OutStreamer->AddComment("End Mark");
Asm->emitInt32(0);
- Asm->OutStreamer->EmitLabel(EndLabel);
+ Asm->OutStreamer->emitLabel(EndLabel);
}
/// Emit null-terminated strings into a debug str section.
@@ -2203,7 +2451,7 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
DWARFDataExtractor Data(StringRef(DebugLocs.getBytes(Entry).data(),
DebugLocs.getBytes(Entry).size()),
Asm->getDataLayout().isLittleEndian(), PtrSize);
- DWARFExpression Expr(Data, getDwarfVersion(), PtrSize);
+ DWARFExpression Expr(Data, PtrSize, Asm->OutContext.getDwarfFormat());
using Encoding = DWARFExpression::Operation::Encoding;
uint64_t Offset = 0;
@@ -2216,18 +2464,14 @@ void DwarfDebug::emitDebugLocEntry(ByteStreamer &Streamer,
if (Op.getDescription().Op[I] == Encoding::SizeNA)
continue;
if (Op.getDescription().Op[I] == Encoding::BaseTypeRef) {
- if (CU) {
- uint64_t Offset = CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die->getOffset();
- assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit");
- Asm->EmitULEB128(Offset, nullptr, ULEB128PadSize);
- } else {
- // Emit a reference to the 'generic type'.
- Asm->EmitULEB128(0, nullptr, ULEB128PadSize);
- }
- // Make sure comments stay aligned.
- for (unsigned J = 0; J < ULEB128PadSize; ++J)
- if (Comment != End)
- Comment++;
+ uint64_t Offset =
+ CU->ExprRefedBaseTypes[Op.getRawOperand(I)].Die->getOffset();
+ assert(Offset < (1ULL << (ULEB128PadSize * 7)) && "Offset wont fit");
+ Streamer.emitULEB128(Offset, "", ULEB128PadSize);
+ // Make sure comments stay aligned.
+ for (unsigned J = 0; J < ULEB128PadSize; ++J)
+ if (Comment != End)
+ Comment++;
} else {
for (uint64_t J = Offset; J < Op.getOperandEndOffset(I); ++J)
Streamer.EmitInt8(Data.getData()[J], Comment != End ? *(Comment++) : "");
@@ -2253,14 +2497,11 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
DwarfExpr.addUnsignedConstant(Value.getInt());
} else if (Value.isLocation()) {
MachineLocation Location = Value.getLoc();
- if (Location.isIndirect())
- DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.setLocation(Location, DIExpr);
DIExpressionCursor Cursor(DIExpr);
- if (DIExpr->isEntryValue()) {
- DwarfExpr.setEntryValueFlag();
+ if (DIExpr->isEntryValue())
DwarfExpr.beginEntryValueExpression(Cursor);
- }
const TargetRegisterInfo &TRI = *AP.MF->getSubtarget().getRegisterInfo();
if (!DwarfExpr.addMachineRegExpression(TRI, Cursor, Location.getReg()))
@@ -2270,7 +2511,7 @@ void DwarfDebug::emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
TargetIndexLocation Loc = Value.getTargetIndexLocation();
// TODO TargetIndexLocation is a target-independent. Currently only the WebAssembly-specific
// encoding is supported.
- DwarfExpr.addWasmLocation(Loc.Index, Loc.Offset);
+ DwarfExpr.addWasmLocation(Loc.Index, static_cast<uint64_t>(Loc.Offset));
} else if (Value.isConstantFP()) {
APInt RawBytes = Value.getConstantFP()->getValueAPF().bitcastToAPInt();
DwarfExpr.addUnsignedConstant(RawBytes);
@@ -2294,8 +2535,7 @@ void DebugLocEntry::finalize(const AsmPrinter &AP,
assert(llvm::all_of(Values, [](DbgValueLoc P) {
return P.isFragment();
}) && "all values are expected to be fragments");
- assert(std::is_sorted(Values.begin(), Values.end()) &&
- "fragments are expected to be sorted");
+ assert(llvm::is_sorted(Values) && "fragments are expected to be sorted");
for (auto Fragment : Values)
DwarfDebug::emitDebugLocValue(AP, BT, Fragment, DwarfExpr);
@@ -2314,7 +2554,7 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry,
// Emit the size.
Asm->OutStreamer->AddComment("Loc expr size");
if (getDwarfVersion() >= 5)
- Asm->EmitULEB128(DebugLocs.getBytes(Entry).size());
+ Asm->emitULEB128(DebugLocs.getBytes(Entry).size());
else if (DebugLocs.getBytes(Entry).size() <= std::numeric_limits<uint16_t>::max())
Asm->emitInt16(DebugLocs.getBytes(Entry).size());
else {
@@ -2328,41 +2568,19 @@ void DwarfDebug::emitDebugLocEntryLocation(const DebugLocStream::Entry &Entry,
emitDebugLocEntry(Streamer, Entry, CU);
}
-// Emit the common part of the DWARF 5 range/locations list tables header.
-static void emitListsTableHeaderStart(AsmPrinter *Asm,
- MCSymbol *TableStart,
- MCSymbol *TableEnd) {
- // Build the table header, which starts with the length field.
- Asm->OutStreamer->AddComment("Length");
- Asm->EmitLabelDifference(TableEnd, TableStart, 4);
- Asm->OutStreamer->EmitLabel(TableStart);
- // Version number (DWARF v5 and later).
- Asm->OutStreamer->AddComment("Version");
- Asm->emitInt16(Asm->OutStreamer->getContext().getDwarfVersion());
- // Address size.
- Asm->OutStreamer->AddComment("Address size");
- Asm->emitInt8(Asm->MAI->getCodePointerSize());
- // Segment selector size.
- Asm->OutStreamer->AddComment("Segment selector size");
- Asm->emitInt8(0);
-}
-
 // Emit the header of a DWARF 5 range list table. Returns the symbol
// that designates the end of the table for the caller to emit when the table is
// complete.
static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
const DwarfFile &Holder) {
- MCSymbol *TableStart = Asm->createTempSymbol("debug_rnglist_table_start");
- MCSymbol *TableEnd = Asm->createTempSymbol("debug_rnglist_table_end");
- emitListsTableHeaderStart(Asm, TableStart, TableEnd);
+ MCSymbol *TableEnd = mcdwarf::emitListsTableHeaderStart(*Asm->OutStreamer);
Asm->OutStreamer->AddComment("Offset entry count");
Asm->emitInt32(Holder.getRangeLists().size());
- Asm->OutStreamer->EmitLabel(Holder.getRnglistsTableBaseSym());
+ Asm->OutStreamer->emitLabel(Holder.getRnglistsTableBaseSym());
for (const RangeSpanList &List : Holder.getRangeLists())
- Asm->EmitLabelDifference(List.Label, Holder.getRnglistsTableBaseSym(),
- 4);
+ Asm->emitLabelDifference(List.Label, Holder.getRnglistsTableBaseSym(), 4);
return TableEnd;
}
@@ -2372,18 +2590,16 @@ static MCSymbol *emitRnglistsTableHeader(AsmPrinter *Asm,
// complete.
static MCSymbol *emitLoclistsTableHeader(AsmPrinter *Asm,
const DwarfDebug &DD) {
- MCSymbol *TableStart = Asm->createTempSymbol("debug_loclist_table_start");
- MCSymbol *TableEnd = Asm->createTempSymbol("debug_loclist_table_end");
- emitListsTableHeaderStart(Asm, TableStart, TableEnd);
+ MCSymbol *TableEnd = mcdwarf::emitListsTableHeaderStart(*Asm->OutStreamer);
const auto &DebugLocs = DD.getDebugLocs();
Asm->OutStreamer->AddComment("Offset entry count");
Asm->emitInt32(DebugLocs.getLists().size());
- Asm->OutStreamer->EmitLabel(DebugLocs.getSym());
+ Asm->OutStreamer->emitLabel(DebugLocs.getSym());
for (const auto &List : DebugLocs.getLists())
- Asm->EmitLabelDifference(List.Label, DebugLocs.getSym(), 4);
+ Asm->emitLabelDifference(List.Label, DebugLocs.getSym(), 4);
return TableEnd;
}
@@ -2401,7 +2617,7 @@ static void emitRangeList(
bool UseDwarf5 = DD.getDwarfVersion() >= 5;
// Emit our symbol so we can find the beginning of the range.
- Asm->OutStreamer->EmitLabel(Sym);
+ Asm->OutStreamer->emitLabel(Sym);
// Gather all the ranges that apply to the same section so they can share
// a base address entry.
@@ -2420,9 +2636,9 @@ static void emitRangeList(
if (!UseDwarf5) {
Base = NewBase;
BaseIsSet = true;
- Asm->OutStreamer->EmitIntValue(-1, Size);
+ Asm->OutStreamer->emitIntValue(-1, Size);
Asm->OutStreamer->AddComment(" base address");
- Asm->OutStreamer->EmitSymbolValue(Base, Size);
+ Asm->OutStreamer->emitSymbolValue(Base, Size);
} else if (NewBase != Begin || P.second.size() > 1) {
// Only use a base address if
// * the existing pool address doesn't match (NewBase != Begin)
@@ -2432,13 +2648,13 @@ static void emitRangeList(
Asm->OutStreamer->AddComment(StringifyEnum(BaseAddressx));
Asm->emitInt8(BaseAddressx);
Asm->OutStreamer->AddComment(" base address index");
- Asm->EmitULEB128(DD.getAddressPool().getIndex(Base));
+ Asm->emitULEB128(DD.getAddressPool().getIndex(Base));
}
} else if (BaseIsSet && !UseDwarf5) {
BaseIsSet = false;
assert(!Base);
- Asm->OutStreamer->EmitIntValue(-1, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
+ Asm->OutStreamer->emitIntValue(-1, Size);
+ Asm->OutStreamer->emitIntValue(0, Size);
}
for (const auto *RS : P.second) {
@@ -2452,23 +2668,23 @@ static void emitRangeList(
Asm->OutStreamer->AddComment(StringifyEnum(OffsetPair));
Asm->emitInt8(OffsetPair);
Asm->OutStreamer->AddComment(" starting offset");
- Asm->EmitLabelDifferenceAsULEB128(Begin, Base);
+ Asm->emitLabelDifferenceAsULEB128(Begin, Base);
Asm->OutStreamer->AddComment(" ending offset");
- Asm->EmitLabelDifferenceAsULEB128(End, Base);
+ Asm->emitLabelDifferenceAsULEB128(End, Base);
} else {
- Asm->EmitLabelDifference(Begin, Base, Size);
- Asm->EmitLabelDifference(End, Base, Size);
+ Asm->emitLabelDifference(Begin, Base, Size);
+ Asm->emitLabelDifference(End, Base, Size);
}
} else if (UseDwarf5) {
Asm->OutStreamer->AddComment(StringifyEnum(StartxLength));
Asm->emitInt8(StartxLength);
Asm->OutStreamer->AddComment(" start index");
- Asm->EmitULEB128(DD.getAddressPool().getIndex(Begin));
+ Asm->emitULEB128(DD.getAddressPool().getIndex(Begin));
Asm->OutStreamer->AddComment(" length");
- Asm->EmitLabelDifferenceAsULEB128(End, Begin);
+ Asm->emitLabelDifferenceAsULEB128(End, Begin);
} else {
- Asm->OutStreamer->EmitSymbolValue(Begin, Size);
- Asm->OutStreamer->EmitSymbolValue(End, Size);
+ Asm->OutStreamer->emitSymbolValue(Begin, Size);
+ Asm->OutStreamer->emitSymbolValue(End, Size);
}
EmitPayload(*RS);
}
@@ -2479,8 +2695,8 @@ static void emitRangeList(
Asm->emitInt8(EndOfList);
} else {
// Terminate the list with two 0 values.
- Asm->OutStreamer->EmitIntValue(0, Size);
- Asm->OutStreamer->EmitIntValue(0, Size);
+ Asm->OutStreamer->emitIntValue(0, Size);
+ Asm->OutStreamer->emitIntValue(0, Size);
}
}
@@ -2510,7 +2726,7 @@ void DwarfDebug::emitDebugLocImpl(MCSection *Sec) {
emitLocList(*this, Asm, List);
if (TableEnd)
- Asm->OutStreamer->EmitLabel(TableEnd);
+ Asm->OutStreamer->emitLabel(TableEnd);
}
// Emit locations into the .debug_loc/.debug_loclists section.
@@ -2533,7 +2749,7 @@ void DwarfDebug::emitDebugLocDWO() {
for (const auto &List : DebugLocs.getLists()) {
Asm->OutStreamer->SwitchSection(
Asm->getObjFileLowering().getDwarfLocDWOSection());
- Asm->OutStreamer->EmitLabel(List.Label);
+ Asm->OutStreamer->emitLabel(List.Label);
for (const auto &Entry : DebugLocs.getEntries(List)) {
// GDB only supports startx_length in pre-standard split-DWARF.
@@ -2541,14 +2757,15 @@ void DwarfDebug::emitDebugLocDWO() {
// offset_pair, so the implementations can't really share much since they
// need to use different representations)
// * as of October 2018, at least
- // Ideally/in v5, this could use SectionLabels to reuse existing addresses
- // in the address pool to minimize object size/relocations.
+ //
+ // In v5 (see emitLocList), this uses SectionLabels to reuse existing
+ // addresses in the address pool to minimize object size/relocations.
Asm->emitInt8(dwarf::DW_LLE_startx_length);
unsigned idx = AddrPool.getIndex(Entry.Begin);
- Asm->EmitULEB128(idx);
+ Asm->emitULEB128(idx);
// Also the pre-standard encoding is slightly different, emitting this as
// an address-length entry here, but its a ULEB128 in DWARFv5 loclists.
- Asm->EmitLabelDifference(Entry.End, Entry.Begin, 4);
+ Asm->emitLabelDifference(Entry.End, Entry.Begin, 4);
emitDebugLocEntryLocation(Entry, List.CU);
}
Asm->emitInt8(dwarf::DW_LLE_end_of_list);
@@ -2693,11 +2910,11 @@ void DwarfDebug::emitDebugARanges() {
Asm->OutStreamer->emitFill(Padding, 0xff);
for (const ArangeSpan &Span : List) {
- Asm->EmitLabelReference(Span.Start, PtrSize);
+ Asm->emitLabelReference(Span.Start, PtrSize);
// Calculate the size as being from the span start to it's end.
if (Span.End) {
- Asm->EmitLabelDifference(Span.End, Span.Start, PtrSize);
+ Asm->emitLabelDifference(Span.End, Span.Start, PtrSize);
} else {
// For symbols without an end marker (e.g. common), we
// write a single arange entry containing just that one symbol.
@@ -2705,13 +2922,13 @@ void DwarfDebug::emitDebugARanges() {
if (Size == 0)
Size = 1;
- Asm->OutStreamer->EmitIntValue(Size, PtrSize);
+ Asm->OutStreamer->emitIntValue(Size, PtrSize);
}
}
Asm->OutStreamer->AddComment("ARange terminator");
- Asm->OutStreamer->EmitIntValue(0, PtrSize);
- Asm->OutStreamer->EmitIntValue(0, PtrSize);
+ Asm->OutStreamer->emitIntValue(0, PtrSize);
+ Asm->OutStreamer->emitIntValue(0, PtrSize);
}
}
@@ -2747,7 +2964,7 @@ void DwarfDebug::emitDebugRangesImpl(const DwarfFile &Holder, MCSection *Section
emitRangeList(*this, Asm, List);
if (TableEnd)
- Asm->OutStreamer->EmitLabel(TableEnd);
+ Asm->OutStreamer->emitLabel(TableEnd);
}
/// Emit address ranges into the .debug_ranges section or into the DWARF v5
@@ -2766,6 +2983,27 @@ void DwarfDebug::emitDebugRangesDWO() {
Asm->getObjFileLowering().getDwarfRnglistsDWOSection());
}
+/// Emit the header of a DWARF 5 macro section.
+static void emitMacroHeader(AsmPrinter *Asm, const DwarfDebug &DD,
+ const DwarfCompileUnit &CU) {
+ enum HeaderFlagMask {
+#define HANDLE_MACRO_FLAG(ID, NAME) MACRO_FLAG_##NAME = ID,
+#include "llvm/BinaryFormat/Dwarf.def"
+ };
+ uint8_t Flags = 0;
+ Asm->OutStreamer->AddComment("Macro information version");
+ Asm->emitInt16(5);
+  // Only the debug_line_offset flag is set here: the offset-size flag is left
+  // clear since we only support DWARF32, and the line table offset should
+  // normally be present.
+ // FIXME: Add support for DWARF64.
+ Flags |= MACRO_FLAG_DEBUG_LINE_OFFSET;
+ Asm->OutStreamer->AddComment("Flags: 32 bit, debug_line_offset present");
+ Asm->emitInt8(Flags);
+ Asm->OutStreamer->AddComment("debug_line_offset");
+ Asm->OutStreamer->emitSymbolValue(CU.getLineTableStartSym(), /*Size=*/4);
+}
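For a DWARF32 unit this header amounts to (a hedged sketch; the line-table symbol is whatever getLineTableStartSym() returns for the CU):

    version           : 2 bytes = 5
    flags             : 1 byte  = 0x02   (debug_line_offset present, 32-bit offsets)
    debug_line_offset : 4 bytes = offset of the CU's line table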
+
void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
for (auto *MN : Nodes) {
if (auto *M = dyn_cast<DIMacro>(MN))
@@ -2778,26 +3016,72 @@ void DwarfDebug::handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U) {
}
void DwarfDebug::emitMacro(DIMacro &M) {
- Asm->EmitULEB128(M.getMacinfoType());
- Asm->EmitULEB128(M.getLine());
StringRef Name = M.getName();
StringRef Value = M.getValue();
- Asm->OutStreamer->EmitBytes(Name);
- if (!Value.empty()) {
- // There should be one space between macro name and macro value.
- Asm->emitInt8(' ');
- Asm->OutStreamer->EmitBytes(Value);
+ bool UseMacro = getDwarfVersion() >= 5;
+
+ if (UseMacro) {
+ unsigned Type = M.getMacinfoType() == dwarf::DW_MACINFO_define
+ ? dwarf::DW_MACRO_define_strx
+ : dwarf::DW_MACRO_undef_strx;
+ Asm->OutStreamer->AddComment(dwarf::MacroString(Type));
+ Asm->emitULEB128(Type);
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(M.getLine());
+ Asm->OutStreamer->AddComment("Macro String");
+ if (!Value.empty())
+ Asm->emitULEB128(this->InfoHolder.getStringPool()
+ .getIndexedEntry(*Asm, (Name + " " + Value).str())
+ .getIndex());
+ else
+ // DW_MACRO_undef_strx doesn't have a value, so just emit the macro
+ // string.
+ Asm->emitULEB128(this->InfoHolder.getStringPool()
+ .getIndexedEntry(*Asm, (Name).str())
+ .getIndex());
+ } else {
+ Asm->OutStreamer->AddComment(dwarf::MacinfoString(M.getMacinfoType()));
+ Asm->emitULEB128(M.getMacinfoType());
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(M.getLine());
+ Asm->OutStreamer->AddComment("Macro String");
+ Asm->OutStreamer->emitBytes(Name);
+ if (!Value.empty()) {
+ // There should be one space between macro name and macro value.
+ Asm->emitInt8(' ');
+ Asm->OutStreamer->AddComment("Macro Value=");
+ Asm->OutStreamer->emitBytes(Value);
+ }
+ Asm->emitInt8('\0');
}
- Asm->emitInt8('\0');
+}
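For example (hedged; the string indices are hypothetical), "#define FOO 1" on line 3 followed by "#undef FOO" on line 9 would be encoded on the DWARF 5 path roughly as:

    DW_MACRO_define_strx  line: 3  index: 7   ->  "FOO 1"
    DW_MACRO_undef_strx   line: 9  index: 8   ->  "FOO"

where each index refers into the string offsets table via the entry returned by getIndexedEntry() above.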
+
+void DwarfDebug::emitMacroFileImpl(
+ DIMacroFile &F, DwarfCompileUnit &U, unsigned StartFile, unsigned EndFile,
+ StringRef (*MacroFormToString)(unsigned Form)) {
+
+ Asm->OutStreamer->AddComment(MacroFormToString(StartFile));
+ Asm->emitULEB128(StartFile);
+ Asm->OutStreamer->AddComment("Line Number");
+ Asm->emitULEB128(F.getLine());
+ Asm->OutStreamer->AddComment("File Number");
+ Asm->emitULEB128(U.getOrCreateSourceID(F.getFile()));
+ handleMacroNodes(F.getElements(), U);
+ Asm->OutStreamer->AddComment(MacroFormToString(EndFile));
+ Asm->emitULEB128(EndFile);
}
void DwarfDebug::emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U) {
+  // The DWARF v5 .debug_macro and DWARF v4 .debug_macinfo forms share some
+  // common encodings, so for readability/uniformity we pass the start/end-file
+  // forms explicitly.
assert(F.getMacinfoType() == dwarf::DW_MACINFO_start_file);
- Asm->EmitULEB128(dwarf::DW_MACINFO_start_file);
- Asm->EmitULEB128(F.getLine());
- Asm->EmitULEB128(U.getOrCreateSourceID(F.getFile()));
- handleMacroNodes(F.getElements(), U);
- Asm->EmitULEB128(dwarf::DW_MACINFO_end_file);
+ bool UseMacro = getDwarfVersion() >= 5;
+ if (UseMacro)
+ emitMacroFileImpl(F, U, dwarf::DW_MACRO_start_file,
+ dwarf::DW_MACRO_end_file, dwarf::MacroString);
+ else
+ emitMacroFileImpl(F, U, dwarf::DW_MACINFO_start_file,
+ dwarf::DW_MACINFO_end_file, dwarf::MacinfoString);
}
void DwarfDebug::emitDebugMacinfoImpl(MCSection *Section) {
@@ -2810,20 +3094,28 @@ void DwarfDebug::emitDebugMacinfoImpl(MCSection *Section) {
if (Macros.empty())
continue;
Asm->OutStreamer->SwitchSection(Section);
- Asm->OutStreamer->EmitLabel(U.getMacroLabelBegin());
+ Asm->OutStreamer->emitLabel(U.getMacroLabelBegin());
+ if (getDwarfVersion() >= 5)
+ emitMacroHeader(Asm, *this, U);
handleMacroNodes(Macros, U);
Asm->OutStreamer->AddComment("End Of Macro List Mark");
Asm->emitInt8(0);
}
}
-/// Emit macros into a debug macinfo section.
+/// Emit macros into a debug macinfo/macro section.
void DwarfDebug::emitDebugMacinfo() {
- emitDebugMacinfoImpl(Asm->getObjFileLowering().getDwarfMacinfoSection());
+ auto &ObjLower = Asm->getObjFileLowering();
+ emitDebugMacinfoImpl(getDwarfVersion() >= 5
+ ? ObjLower.getDwarfMacroSection()
+ : ObjLower.getDwarfMacinfoSection());
}
void DwarfDebug::emitDebugMacinfoDWO() {
- emitDebugMacinfoImpl(Asm->getObjFileLowering().getDwarfMacinfoDWOSection());
+ auto &ObjLower = Asm->getObjFileLowering();
+ emitDebugMacinfoImpl(getDwarfVersion() >= 5
+ ? ObjLower.getDwarfMacroDWOSection()
+ : ObjLower.getDwarfMacinfoDWOSection());
}
// DWARF5 Experimental Separate Dwarf emitters.
@@ -2833,7 +3125,6 @@ void DwarfDebug::initSkeletonUnit(const DwarfUnit &U, DIE &Die,
if (!CompilationDir.empty())
NewU->addString(Die, dwarf::DW_AT_comp_dir, CompilationDir);
-
addGnuPubAttributes(*NewU, Die);
SkeletonHolder.addUnit(std::move(NewU));
@@ -3087,3 +3378,8 @@ uint16_t DwarfDebug::getDwarfVersion() const {
const MCSymbol *DwarfDebug::getSectionLabel(const MCSection *S) {
return SectionLabels.find(S)->second;
}
+void DwarfDebug::insertSectionLabel(const MCSymbol *S) {
+ if (SectionLabels.insert(std::make_pair(&S->getSection(), S)).second)
+ if (useSplitDwarf() || getDwarfVersion() >= 5)
+ AddrPool.getIndex(S);
+}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
index fd82b1f98055..ad2f2f3edd8e 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfDebug.h
@@ -49,7 +49,6 @@ namespace llvm {
class AsmPrinter;
class ByteStreamer;
-class DebugLocEntry;
class DIE;
class DwarfCompileUnit;
class DwarfExpression;
@@ -59,7 +58,6 @@ class LexicalScope;
class MachineFunction;
class MCSection;
class MCSymbol;
-class MDNode;
class Module;
//===----------------------------------------------------------------------===//
@@ -327,7 +325,7 @@ class DwarfDebug : public DebugHandlerBase {
const MachineFunction *CurFn = nullptr;
/// If nonnull, stores the CU in which the previous subprogram was contained.
- const DwarfCompileUnit *PrevCU;
+ const DwarfCompileUnit *PrevCU = nullptr;
/// As an optimization, there is no need to emit an entry in the directory
/// table for the same directory as DW_AT_comp_dir.
@@ -386,6 +384,11 @@ class DwarfDebug : public DebugHandlerBase {
/// a monolithic sequence of string offsets.
bool UseSegmentedStringOffsetsTable;
+ /// Enable production of call site parameters needed to print the debug entry
+ /// values. Useful for testing purposes when a debugger does not support the
+ /// feature yet.
+ bool EmitDebugEntryValues;
+
/// Separated Dwarf Variables
/// In general these will all be for bits that are left in the
/// original object file, rather than things that are meant
@@ -523,6 +526,9 @@ class DwarfDebug : public DebugHandlerBase {
void emitDebugMacinfoImpl(MCSection *Section);
void emitMacro(DIMacro &M);
void emitMacroFile(DIMacroFile &F, DwarfCompileUnit &U);
+ void emitMacroFileImpl(DIMacroFile &F, DwarfCompileUnit &U,
+ unsigned StartFile, unsigned EndFile,
+ StringRef (*MacroFormToString)(unsigned Form));
void handleMacroNodes(DIMacroNodeArray Nodes, DwarfCompileUnit &U);
/// DWARF 5 Experimental Split Dwarf Emitters
@@ -586,8 +592,10 @@ class DwarfDebug : public DebugHandlerBase {
/// function that describe the same variable. If the resulting
/// list has only one entry that is valid for the entire variable's
/// scope, return true.
- bool buildLocationList(SmallVectorImpl<DebugLocEntry> &DebugLoc,
- const DbgValueHistoryMap::Entries &Entries);
+ bool buildLocationList(
+ SmallVectorImpl<DebugLocEntry> &DebugLoc,
+ const DbgValueHistoryMap::Entries &Entries,
+ DenseSet<const MachineBasicBlock *> &VeryLargeBlocks);
/// Collect variable information from the side table maintained by MF.
void collectVariableInfoFromMFTable(DwarfCompileUnit &TheCU,
@@ -634,7 +642,6 @@ public:
void addDwarfTypeUnitType(DwarfCompileUnit &CU, StringRef Identifier,
DIE &Die, const DICompositeType *CTy);
- friend class NonTypeUnitContext;
class NonTypeUnitContext {
DwarfDebug *DD;
decltype(DwarfDebug::TypeUnitsUnderConstruction) TypeUnitsUnderConstruction;
@@ -708,6 +715,10 @@ public:
return UseSegmentedStringOffsetsTable;
}
+ bool emitDebugEntryValues() const {
+ return EmitDebugEntryValues;
+ }
+
bool shareAcrossDWOCUs() const;
/// Returns the Dwarf Version.
@@ -768,6 +779,7 @@ public:
void addSectionLabel(const MCSymbol *Sym);
const MCSymbol *getSectionLabel(const MCSection *S);
+ void insertSectionLabel(const MCSymbol *S);
static void emitDebugLocValue(const AsmPrinter &AP, const DIBasicType *BT,
const DbgValueLoc &Value,
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfException.h b/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
index 24bbf58b91ec..c2956380438f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfException.h
@@ -66,6 +66,9 @@ public:
void beginFragment(const MachineBasicBlock *MBB,
ExceptionSymbolProvider ESP) override;
+
+ void beginBasicBlock(const MachineBasicBlock &MBB) override;
+ void endBasicBlock(const MachineBasicBlock &MBB) override;
};
class LLVM_LIBRARY_VISIBILITY ARMException : public DwarfCFIExceptionBase {
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
index 310647f15a5e..d4762121d105 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.cpp
@@ -40,12 +40,12 @@ void DwarfExpression::emitConstu(uint64_t Value) {
}
void DwarfExpression::addReg(int DwarfReg, const char *Comment) {
- assert(DwarfReg >= 0 && "invalid negative dwarf register number");
- assert((isUnknownLocation() || isRegisterLocation()) &&
- "location description already locked down");
- LocationKind = Register;
- if (DwarfReg < 32) {
- emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
+ assert(DwarfReg >= 0 && "invalid negative dwarf register number");
+ assert((isUnknownLocation() || isRegisterLocation()) &&
+ "location description already locked down");
+ LocationKind = Register;
+ if (DwarfReg < 32) {
+ emitOp(dwarf::DW_OP_reg0 + DwarfReg, Comment);
} else {
emitOp(dwarf::DW_OP_regx, Comment);
emitUnsigned(DwarfReg);
@@ -100,7 +100,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
unsigned MachineReg, unsigned MaxSize) {
if (!llvm::Register::isPhysicalRegister(MachineReg)) {
if (isFrameRegister(TRI, MachineReg)) {
- DwarfRegs.push_back({-1, 0, nullptr});
+ DwarfRegs.push_back(Register::createRegister(-1, nullptr));
return true;
}
return false;
@@ -110,7 +110,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
// If this is a valid register number, emit it.
if (Reg >= 0) {
- DwarfRegs.push_back({Reg, 0, nullptr});
+ DwarfRegs.push_back(Register::createRegister(Reg, nullptr));
return true;
}
@@ -122,7 +122,7 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
unsigned Idx = TRI.getSubRegIndex(*SR, MachineReg);
unsigned Size = TRI.getSubRegIdxSize(Idx);
unsigned RegOffset = TRI.getSubRegIdxOffset(Idx);
- DwarfRegs.push_back({Reg, 0, "super-register"});
+ DwarfRegs.push_back(Register::createRegister(Reg, "super-register"));
// Use a DW_OP_bit_piece to describe the sub-register.
setSubRegisterPiece(Size, RegOffset);
return true;
@@ -149,8 +149,8 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
if (Reg < 0)
continue;
- // Intersection between the bits we already emitted and the bits
- // covered by this subregister.
+ // Used to build the intersection between the bits we already
+ // emitted and the bits covered by this subregister.
SmallBitVector CurSubReg(RegSize, false);
CurSubReg.set(Offset, Offset + Size);
@@ -159,10 +159,13 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
if (Offset < MaxSize && CurSubReg.test(Coverage)) {
// Emit a piece for any gap in the coverage.
if (Offset > CurPos)
- DwarfRegs.push_back(
- {-1, Offset - CurPos, "no DWARF register encoding"});
- DwarfRegs.push_back(
- {Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"});
+ DwarfRegs.push_back(Register::createSubRegister(
+ -1, Offset - CurPos, "no DWARF register encoding"));
+ if (Offset == 0 && Size >= MaxSize)
+ DwarfRegs.push_back(Register::createRegister(Reg, "sub-register"));
+ else
+ DwarfRegs.push_back(Register::createSubRegister(
+ Reg, std::min<unsigned>(Size, MaxSize - Offset), "sub-register"));
}
// Mark it as emitted.
Coverage.set(Offset, Offset + Size);
@@ -173,7 +176,8 @@ bool DwarfExpression::addMachineReg(const TargetRegisterInfo &TRI,
return false;
// Found a partial or complete DWARF encoding.
if (CurPos < RegSize)
- DwarfRegs.push_back({-1, RegSize - CurPos, "no DWARF register encoding"});
+ DwarfRegs.push_back(Register::createSubRegister(
+ -1, RegSize - CurPos, "no DWARF register encoding"));
return true;
}
@@ -233,8 +237,17 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
// If the register can only be described by a complex expression (i.e.,
// multiple subregisters) it doesn't safely compose with another complex
// expression. For example, it is not possible to apply a DW_OP_deref
- // operation to multiple DW_OP_pieces.
- if (HasComplexExpression && DwarfRegs.size() > 1) {
+ // operation to multiple DW_OP_pieces, since composite location descriptions
+ // do not push anything on the DWARF stack.
+ //
+ // DW_OP_entry_value operations can only hold a DWARF expression or a
+ // register location description, so we can't emit a single entry value
+ // covering a composite location description. In the future we may want to
+ // emit entry value operations for each register location in the composite
+ // location, but until that is supported do not emit anything.
+ if ((HasComplexExpression || IsEmittingEntryValue) && DwarfRegs.size() > 1) {
+ if (IsEmittingEntryValue)
+ cancelEntryValue();
DwarfRegs.clear();
LocationKind = Unknown;
return false;
@@ -244,18 +257,19 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
// a call site parameter expression and if that expression is just a register
// location, emit it with addBReg and offset 0, because we should emit a DWARF
// expression representing a value, rather than a location.
- if (!isMemoryLocation() && !HasComplexExpression && (!isParameterValue() ||
- isEntryValue())) {
+ if (!isMemoryLocation() && !HasComplexExpression &&
+ (!isParameterValue() || isEntryValue())) {
for (auto &Reg : DwarfRegs) {
if (Reg.DwarfRegNo >= 0)
addReg(Reg.DwarfRegNo, Reg.Comment);
- addOpPiece(Reg.Size);
+ addOpPiece(Reg.SubRegSize);
}
if (isEntryValue())
finalizeEntryValue();
- if (isEntryValue() && !isParameterValue() && DwarfVersion >= 4)
+ if (isEntryValue() && !isIndirect() && !isParameterValue() &&
+ DwarfVersion >= 4)
emitOp(dwarf::DW_OP_stack_value);
DwarfRegs.clear();
@@ -276,7 +290,7 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
auto Reg = DwarfRegs[0];
bool FBReg = isFrameRegister(TRI, MachineReg);
int SignedOffset = 0;
- assert(Reg.Size == 0 && "subregister has same size as superregister");
+ assert(!Reg.isSubRegister() && "full register expected");
// Pattern-match combinations for which more efficient representations exist.
// [Reg, DW_OP_plus_uconst, Offset] --> [DW_OP_breg, Offset].
@@ -314,6 +328,25 @@ bool DwarfExpression::addMachineRegExpression(const TargetRegisterInfo &TRI,
return true;
}
+void DwarfExpression::setEntryValueFlags(const MachineLocation &Loc) {
+ LocationFlags |= EntryValue;
+ if (Loc.isIndirect())
+ LocationFlags |= Indirect;
+}
+
+void DwarfExpression::setLocation(const MachineLocation &Loc,
+ const DIExpression *DIExpr) {
+ if (Loc.isIndirect())
+ // Do not treat entry value descriptions of indirect parameters as memory
+ // locations. This allows DwarfExpression::addReg() to add DW_OP_regN to an
+ // entry value description.
+ if (!DIExpr->isEntryValue())
+ setMemoryLocationKind();
+
+ if (DIExpr->isEntryValue())
+ setEntryValueFlags(Loc);
+}
+
void DwarfExpression::beginEntryValueExpression(
DIExpressionCursor &ExprCursor) {
auto Op = ExprCursor.take();
@@ -325,7 +358,6 @@ void DwarfExpression::beginEntryValueExpression(
assert(Op->getArg(0) == 1 &&
"Can currently only emit entry values covering a single operation");
- emitOp(CU.getDwarf5OrGNULocationAtom(dwarf::DW_OP_entry_value));
IsEmittingEntryValue = true;
enableTemporaryBuffer();
}
@@ -334,6 +366,8 @@ void DwarfExpression::finalizeEntryValue() {
assert(IsEmittingEntryValue && "Entry value not open?");
disableTemporaryBuffer();
+ emitOp(CU.getDwarf5OrGNULocationAtom(dwarf::DW_OP_entry_value));
+
// Emit the entry value's size operand.
unsigned Size = getTemporaryBufferSize();
emitUnsigned(Size);
@@ -344,7 +378,35 @@ void DwarfExpression::finalizeEntryValue() {
IsEmittingEntryValue = false;
}
-/// Assuming a well-formed expression, match "DW_OP_deref* DW_OP_LLVM_fragment?".
+void DwarfExpression::cancelEntryValue() {
+ assert(IsEmittingEntryValue && "Entry value not open?");
+ disableTemporaryBuffer();
+
+ // The temporary buffer can't be emptied, so for now just assert that nothing
+ // has been emitted to it.
+ assert(getTemporaryBufferSize() == 0 &&
+ "Began emitting entry value block before cancelling entry value");
+
+ IsEmittingEntryValue = false;
+}
+
+unsigned DwarfExpression::getOrCreateBaseType(unsigned BitSize,
+ dwarf::TypeKind Encoding) {
+ // Reuse the base_type if we already have one in this CU otherwise we
+ // create a new one.
+ unsigned I = 0, E = CU.ExprRefedBaseTypes.size();
+ for (; I != E; ++I)
+ if (CU.ExprRefedBaseTypes[I].BitSize == BitSize &&
+ CU.ExprRefedBaseTypes[I].Encoding == Encoding)
+ break;
+
+ if (I == E)
+ CU.ExprRefedBaseTypes.emplace_back(BitSize, Encoding);
+ return I;
+}
+
+/// Assuming a well-formed expression, match "DW_OP_deref*
+/// DW_OP_LLVM_fragment?".
static bool isMemoryLocation(DIExpressionCursor ExprCursor) {
while (ExprCursor) {
auto Op = ExprCursor.take();
@@ -361,6 +423,10 @@ static bool isMemoryLocation(DIExpressionCursor ExprCursor) {
void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
unsigned FragmentOffsetInBits) {
+ // Entry values can currently only cover the initial register location,
+ // and not any other parts of the following DWARF expression.
+ assert(!IsEmittingEntryValue && "Can't emit entry value around expression");
+
// If we need to mask out a subregister, do it now, unless the next
// operation would emit an OpPiece anyway.
auto N = ExprCursor.peek();
@@ -431,6 +497,7 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
case dwarf::DW_OP_lit0:
case dwarf::DW_OP_not:
case dwarf::DW_OP_dup:
+ case dwarf::DW_OP_push_object_address:
emitOp(OpNum);
break;
case dwarf::DW_OP_deref:
@@ -451,24 +518,13 @@ void DwarfExpression::addExpression(DIExpressionCursor &&ExprCursor,
dwarf::TypeKind Encoding = static_cast<dwarf::TypeKind>(Op->getArg(1));
if (DwarfVersion >= 5) {
emitOp(dwarf::DW_OP_convert);
- // Reuse the base_type if we already have one in this CU otherwise we
- // create a new one.
- unsigned I = 0, E = CU.ExprRefedBaseTypes.size();
- for (; I != E; ++I)
- if (CU.ExprRefedBaseTypes[I].BitSize == BitSize &&
- CU.ExprRefedBaseTypes[I].Encoding == Encoding)
- break;
-
- if (I == E)
- CU.ExprRefedBaseTypes.emplace_back(BitSize, Encoding);
-
// If targeting a location list, simply emit the index into the raw
// byte stream as ULEB128; DwarfDebug::emitDebugLocEntry has been
// fitted with means to extract it later.
// If targeting an inlined DW_AT_location, insert a DIEBaseTypeRef
// (containing the index and a resolve mechanism during emit) into the
// DIE value list.
- emitBaseTypeRef(I);
+ emitBaseTypeRef(getOrCreateBaseType(BitSize, Encoding));
} else {
if (PrevConvertOp && PrevConvertOp->getArg(0) < BitSize) {
if (Encoding == dwarf::DW_ATE_signed)
@@ -573,10 +629,10 @@ void DwarfExpression::emitLegacyZExt(unsigned FromBits) {
emitOp(dwarf::DW_OP_and);
}
-void DwarfExpression::addWasmLocation(unsigned Index, int64_t Offset) {
+void DwarfExpression::addWasmLocation(unsigned Index, uint64_t Offset) {
assert(LocationKind == Implicit || LocationKind == Unknown);
LocationKind = Implicit;
emitOp(dwarf::DW_OP_WASM_location);
emitUnsigned(Index);
- emitSigned(Offset);
+ emitUnsigned(Offset);
}
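Two of the changes above are worth noting together: composite locations (more than one Register entry) now also bail out of entry-value emission via cancelEntryValue(), since DW_OP_entry_value cannot wrap multiple DW_OP_pieces; and the base-type deduplication that used to live inline in the DW_OP_convert case is now getOrCreateBaseType(). The latter is a plain scan-then-append over the CU's (BitSize, Encoding) pairs; a self-contained sketch of the same shape, with a std::vector standing in for CU.ExprRefedBaseTypes:

#include <vector>

// Stand-in for the per-CU list of base types referenced from expressions.
struct BaseTypeInfo {
  unsigned BitSize;
  unsigned Encoding;
};

// Return the index of a matching entry, appending a new one on a miss.
// Same linear-probe-then-emplace shape as getOrCreateBaseType in the patch.
unsigned getOrCreateBaseType(std::vector<BaseTypeInfo> &Types,
                             unsigned BitSize, unsigned Encoding) {
  for (unsigned I = 0, E = unsigned(Types.size()); I != E; ++I)
    if (Types[I].BitSize == BitSize && Types[I].Encoding == Encoding)
      return I;
  Types.push_back({BitSize, Encoding});
  return unsigned(Types.size()) - 1;
}

The returned index is what DW_OP_convert references through emitBaseTypeRef, so reusing entries avoids emitting duplicate base type DIEs per compile unit.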
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
index 46c07b1d5b6b..757b17511453 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfExpression.h
@@ -30,6 +30,7 @@ class APInt;
class DwarfCompileUnit;
class DIELoc;
class TargetRegisterInfo;
+class MachineLocation;
/// Holds a DIExpression and keeps track of how many operands have been consumed
/// so far.
@@ -107,8 +108,21 @@ protected:
/// Holds information about all subregisters comprising a register location.
struct Register {
int DwarfRegNo;
- unsigned Size;
+ unsigned SubRegSize;
const char *Comment;
+
+ /// Create a full register, no extra DW_OP_piece operators necessary.
+ static Register createRegister(int RegNo, const char *Comment) {
+ return {RegNo, 0, Comment};
+ }
+
+ /// Create a subregister that needs a DW_OP_piece operator with SizeInBits.
+ static Register createSubRegister(int RegNo, unsigned SizeInBits,
+ const char *Comment) {
+ return {RegNo, SizeInBits, Comment};
+ }
+
+ bool isSubRegister() const { return SubRegSize; }
};
/// Whether we are currently emitting an entry value operation.
@@ -129,37 +143,31 @@ protected:
/// The kind of location description being produced.
enum { Unknown = 0, Register, Memory, Implicit };
- /// The flags of location description being produced.
- enum { EntryValue = 1, CallSiteParamValue };
+ /// Additional location flags which may be combined with any location kind.
+ /// Currently, entry values are not supported for the Memory location kind.
+ enum { EntryValue = 1 << 0, Indirect = 1 << 1, CallSiteParamValue = 1 << 2 };
unsigned LocationKind : 3;
- unsigned LocationFlags : 2;
+ unsigned LocationFlags : 3;
unsigned DwarfVersion : 4;
public:
- bool isUnknownLocation() const {
- return LocationKind == Unknown;
- }
+ /// Set the location (\p Loc) and \ref DIExpression (\p DIExpr) to describe.
+ void setLocation(const MachineLocation &Loc, const DIExpression *DIExpr);
- bool isMemoryLocation() const {
- return LocationKind == Memory;
- }
+ bool isUnknownLocation() const { return LocationKind == Unknown; }
- bool isRegisterLocation() const {
- return LocationKind == Register;
- }
+ bool isMemoryLocation() const { return LocationKind == Memory; }
- bool isImplicitLocation() const {
- return LocationKind == Implicit;
- }
+ bool isRegisterLocation() const { return LocationKind == Register; }
- bool isEntryValue() const {
- return LocationFlags & EntryValue;
- }
+ bool isImplicitLocation() const { return LocationKind == Implicit; }
- bool isParameterValue() {
- return LocationFlags & CallSiteParamValue;
- }
+ bool isEntryValue() const { return LocationFlags & EntryValue; }
+
+ bool isIndirect() const { return LocationFlags & Indirect; }
+
+ bool isParameterValue() { return LocationFlags & CallSiteParamValue; }
Optional<uint8_t> TagOffset;
@@ -209,7 +217,8 @@ protected:
/// Return whether the given machine register is the frame register in the
/// current function.
- virtual bool isFrameRegister(const TargetRegisterInfo &TRI, unsigned MachineReg) = 0;
+ virtual bool isFrameRegister(const TargetRegisterInfo &TRI,
+ unsigned MachineReg) = 0;
/// Emit a DW_OP_reg operation. Note that this is only legal inside a DWARF
/// register location description.
@@ -267,6 +276,9 @@ protected:
/// DWARF block which has been emitted to the temporary buffer.
void finalizeEntryValue();
+ /// Cancel the emission of an entry value.
+ void cancelEntryValue();
+
~DwarfExpression() = default;
public:
@@ -294,14 +306,10 @@ public:
}
/// Lock this down to become an entry value location.
- void setEntryValueFlag() {
- LocationFlags |= EntryValue;
- }
+ void setEntryValueFlags(const MachineLocation &Loc);
/// Lock this down to become a call site parameter location.
- void setCallSiteParamValueFlag() {
- LocationFlags |= CallSiteParamValue;
- }
+ void setCallSiteParamValueFlag() { LocationFlags |= CallSiteParamValue; }
/// Emit a machine register location. As an optimization this may also consume
/// the prefix of a DwarfExpression if a more efficient representation for
@@ -323,6 +331,10 @@ public:
/// any operands here.
void beginEntryValueExpression(DIExpressionCursor &ExprCursor);
+ /// Return the index of a base type with the given properties and
+ /// create one if necessary.
+ unsigned getOrCreateBaseType(unsigned BitSize, dwarf::TypeKind Encoding);
+
/// Emit all remaining operations in the DIExpressionCursor.
///
/// \param FragmentOffsetInBits If this is one fragment out of multiple
@@ -340,7 +352,7 @@ public:
/// Emit location information expressed via WebAssembly location + offset
/// The Index is an identifier for locals, globals or operand stack.
- void addWasmLocation(unsigned Index, int64_t Offset);
+ void addWasmLocation(unsigned Index, uint64_t Offset);
};
/// DwarfExpression implementation for .debug_loc entries.
@@ -374,6 +386,7 @@ class DebugLocDwarfExpression final : public DwarfExpression {
bool isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) override;
+
public:
DebugLocDwarfExpression(unsigned DwarfVersion, BufferByteStreamer &BS,
DwarfCompileUnit &CU)
@@ -403,6 +416,7 @@ class DIEDwarfExpression final : public DwarfExpression {
bool isFrameRegister(const TargetRegisterInfo &TRI,
unsigned MachineReg) override;
+
public:
DIEDwarfExpression(const AsmPrinter &AP, DwarfCompileUnit &CU, DIELoc &DIE);
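The reworked Register struct makes the full-register/sub-register distinction explicit: a SubRegSize of 0 means no DW_OP_piece is needed, anything else records how many bits the piece covers, and the two factories keep call sites from encoding that convention by hand. A sketch of the idea with a simplified struct and hypothetical emit callbacks (the real consumers are addMachineReg and addMachineRegExpression above):

#include <vector>

// Simplified mirror of DwarfExpression::Register; the Comment field is omitted.
struct Reg {
  int DwarfRegNo;            // -1 means "no DWARF encoding for this range"
  unsigned SubRegSizeInBits; // 0 => whole register, no DW_OP_piece needed

  static Reg createRegister(int RegNo) { return {RegNo, 0}; }
  static Reg createSubRegister(int RegNo, unsigned SizeInBits) {
    return {RegNo, SizeInBits};
  }
  bool isSubRegister() const { return SubRegSizeInBits != 0; }
};

// Hypothetical walker: DW_OP_regN for each entry, plus a DW_OP_piece only
// where a sub-register sized chunk (or an unencodable gap) was recorded.
template <typename EmitReg, typename EmitPiece>
void emitRegisterList(const std::vector<Reg> &Regs, EmitReg emitReg,
                      EmitPiece emitPiece) {
  for (const Reg &R : Regs) {
    if (R.DwarfRegNo >= 0)
      emitReg(R.DwarfRegNo);
    if (R.isSubRegister())
      emitPiece(R.SubRegSizeInBits);
  }
}

This is also why the assert in addMachineRegExpression changed from checking Reg.Size == 0 to !Reg.isSubRegister(): the single-register fast path must be handed a whole register, never a piece.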
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
index e5c4db58f477..812e6383288f 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfFile.cpp
@@ -53,7 +53,7 @@ void DwarfFile::emitUnit(DwarfUnit *TheU, bool UseOffsets) {
Asm->emitDwarfDIE(TheU->getUnitDie());
if (MCSymbol *EndLabel = TheU->getEndLabel())
- Asm->OutStreamer->EmitLabel(EndLabel);
+ Asm->OutStreamer->emitLabel(EndLabel);
}
// Compute the size and offset for each DIE.
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
index 2a76dcb1b082..a43929d8e8f7 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfStringPool.cpp
@@ -71,7 +71,7 @@ void DwarfStringPool::emitStringOffsetsTableHeader(AsmPrinter &Asm,
// referenced by most unit headers via DW_AT_str_offsets_base.
// Split units do not use the attribute.
if (StartSym)
- Asm.OutStreamer->EmitLabel(StartSym);
+ Asm.OutStreamer->emitLabel(StartSym);
}
void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
@@ -100,12 +100,12 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
// Emit a label for reference from debug information entries.
if (ShouldCreateSymbols)
- Asm.OutStreamer->EmitLabel(Entry->getValue().Symbol);
+ Asm.OutStreamer->emitLabel(Entry->getValue().Symbol);
// Emit the string itself with a terminating null byte.
Asm.OutStreamer->AddComment("string offset=" +
Twine(Entry->getValue().Offset));
- Asm.OutStreamer->EmitBytes(
+ Asm.OutStreamer->emitBytes(
StringRef(Entry->getKeyData(), Entry->getKeyLength() + 1));
}
@@ -125,6 +125,6 @@ void DwarfStringPool::emit(AsmPrinter &Asm, MCSection *StrSection,
if (UseRelativeOffsets)
Asm.emitDwarfStringOffset(Entry->getValue());
else
- Asm.OutStreamer->EmitIntValue(Entry->getValue().Offset, size);
+ Asm.OutStreamer->emitIntValue(Entry->getValue().Offset, size);
}
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
index 1aba956c48de..e958f38e486b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.cpp
@@ -1045,6 +1045,8 @@ void DwarfUnit::constructTemplateTypeParameterDIE(
addType(ParamDIE, TP->getType());
if (!TP->getName().empty())
addString(ParamDIE, dwarf::DW_AT_name, TP->getName());
+ if (TP->isDefault() && (DD->getDwarfVersion() >= 5))
+ addFlag(ParamDIE, dwarf::DW_AT_default_value);
}
void DwarfUnit::constructTemplateValueParameterDIE(
@@ -1057,6 +1059,8 @@ void DwarfUnit::constructTemplateValueParameterDIE(
addType(ParamDIE, VP->getType());
if (!VP->getName().empty())
addString(ParamDIE, dwarf::DW_AT_name, VP->getName());
+ if (VP->isDefault() && (DD->getDwarfVersion() >= 5))
+ addFlag(ParamDIE, dwarf::DW_AT_default_value);
if (Metadata *Val = VP->getValue()) {
if (ConstantInt *CI = mdconst::dyn_extract<ConstantInt>(Val))
addConstantValue(ParamDIE, CI, VP->getType());
@@ -1122,8 +1126,13 @@ DIE *DwarfUnit::getOrCreateModule(const DIModule *M) {
M->getConfigurationMacros());
if (!M->getIncludePath().empty())
addString(MDie, dwarf::DW_AT_LLVM_include_path, M->getIncludePath());
- if (!M->getSysRoot().empty())
- addString(MDie, dwarf::DW_AT_LLVM_sysroot, M->getSysRoot());
+ if (!M->getAPINotesFile().empty())
+ addString(MDie, dwarf::DW_AT_LLVM_apinotes, M->getAPINotesFile());
+ if (M->getFile())
+ addUInt(MDie, dwarf::DW_AT_decl_file, None,
+ getOrCreateSourceID(M->getFile()));
+ if (M->getLineNo())
+ addUInt(MDie, dwarf::DW_AT_decl_line, None, M->getLineNo());
return &MDie;
}
@@ -1165,6 +1174,14 @@ bool DwarfUnit::applySubprogramDefinitionAttributes(const DISubprogram *SP,
DIE *DeclDie = nullptr;
StringRef DeclLinkageName;
if (auto *SPDecl = SP->getDeclaration()) {
+ DITypeRefArray DeclArgs, DefinitionArgs;
+ DeclArgs = SPDecl->getType()->getTypeArray();
+ DefinitionArgs = SP->getType()->getTypeArray();
+
+ if (DeclArgs.size() && DefinitionArgs.size())
+ if (DefinitionArgs[0] != NULL && DeclArgs[0] != DefinitionArgs[0])
+ addType(SPDie, DefinitionArgs[0]);
+
DeclDie = getDIE(SPDecl);
assert(DeclDie && "This DIE should've already been constructed when the "
"definition DIE was created in "
@@ -1332,20 +1349,40 @@ void DwarfUnit::constructSubrangeDIE(DIE &Buffer, const DISubrange *SR,
// C/C++. The Count value is the number of elements. Values are 64 bit. If
// Count == -1 then the array is unbounded and we do not emit
// DW_AT_lower_bound and DW_AT_count attributes.
- int64_t LowerBound = SR->getLowerBound();
int64_t DefaultLowerBound = getDefaultLowerBound();
int64_t Count = -1;
if (auto *CI = SR->getCount().dyn_cast<ConstantInt*>())
Count = CI->getSExtValue();
- if (DefaultLowerBound == -1 || LowerBound != DefaultLowerBound)
- addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, None, LowerBound);
+ auto addBoundTypeEntry = [&](dwarf::Attribute Attr,
+ DISubrange::BoundType Bound) -> void {
+ if (auto *BV = Bound.dyn_cast<DIVariable *>()) {
+ if (auto *VarDIE = getDIE(BV))
+ addDIEEntry(DW_Subrange, Attr, *VarDIE);
+ } else if (auto *BE = Bound.dyn_cast<DIExpression *>()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(BE);
+ addBlock(DW_Subrange, Attr, DwarfExpr.finalize());
+ } else if (auto *BI = Bound.dyn_cast<ConstantInt *>()) {
+ if (Attr != dwarf::DW_AT_lower_bound || DefaultLowerBound == -1 ||
+ BI->getSExtValue() != DefaultLowerBound)
+ addSInt(DW_Subrange, Attr, dwarf::DW_FORM_sdata, BI->getSExtValue());
+ }
+ };
+
+ addBoundTypeEntry(dwarf::DW_AT_lower_bound, SR->getLowerBound());
if (auto *CV = SR->getCount().dyn_cast<DIVariable*>()) {
if (auto *CountVarDIE = getDIE(CV))
addDIEEntry(DW_Subrange, dwarf::DW_AT_count, *CountVarDIE);
} else if (Count != -1)
addUInt(DW_Subrange, dwarf::DW_AT_count, None, Count);
+
+ addBoundTypeEntry(dwarf::DW_AT_upper_bound, SR->getUpperBound());
+
+ addBoundTypeEntry(dwarf::DW_AT_byte_stride, SR->getStride());
}
DIE *DwarfUnit::getIndexTyDie() {
@@ -1397,6 +1434,17 @@ void DwarfUnit::constructArrayTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
CTy->getSizeInBits() / CHAR_BIT);
}
+ if (DIVariable *Var = CTy->getDataLocation()) {
+ if (auto *VarDIE = getDIE(Var))
+ addDIEEntry(Buffer, dwarf::DW_AT_data_location, *VarDIE);
+ } else if (DIExpression *Expr = CTy->getDataLocationExp()) {
+ DIELoc *Loc = new (DIEValueAllocator) DIELoc;
+ DIEDwarfExpression DwarfExpr(*Asm, getCU(), *Loc);
+ DwarfExpr.setMemoryLocationKind();
+ DwarfExpr.addExpression(Expr);
+ addBlock(Buffer, dwarf::DW_AT_data_location, DwarfExpr.finalize());
+ }
+
// Emit the element type.
addType(Buffer, CTy->getBaseType());
@@ -1437,8 +1485,7 @@ void DwarfUnit::constructEnumTypeDIE(DIE &Buffer, const DICompositeType *CTy) {
DIE &Enumerator = createAndAddDIE(dwarf::DW_TAG_enumerator, Buffer);
StringRef Name = Enum->getName();
addString(Enumerator, dwarf::DW_AT_name, Name);
- auto Value = static_cast<uint64_t>(Enum->getValue());
- addConstantValue(Enumerator, IsUnsigned, Value);
+ addConstantValue(Enumerator, Enum->getValue(), IsUnsigned);
if (IndexEnumerators)
addGlobalName(Name, Enumerator, Context);
}
@@ -1622,8 +1669,8 @@ void DwarfUnit::emitCommonHeader(bool UseOffsets, dwarf::UnitType UT) {
StringRef Prefix = isDwoUnit() ? "debug_info_dwo_" : "debug_info_";
MCSymbol *BeginLabel = Asm->createTempSymbol(Prefix + "start");
EndLabel = Asm->createTempSymbol(Prefix + "end");
- Asm->EmitLabelDifference(EndLabel, BeginLabel, 4);
- Asm->OutStreamer->EmitLabel(BeginLabel);
+ Asm->emitLabelDifference(EndLabel, BeginLabel, 4);
+ Asm->OutStreamer->emitLabel(BeginLabel);
} else
Asm->emitInt32(getHeaderSize() + getUnitDie().getSize());
@@ -1661,10 +1708,10 @@ void DwarfTypeUnit::emitHeader(bool UseOffsets) {
DD->useSplitDwarf() ? dwarf::DW_UT_split_type
: dwarf::DW_UT_type);
Asm->OutStreamer->AddComment("Type Signature");
- Asm->OutStreamer->EmitIntValue(TypeSignature, sizeof(TypeSignature));
+ Asm->OutStreamer->emitIntValue(TypeSignature, sizeof(TypeSignature));
Asm->OutStreamer->AddComment("Type DIE Offset");
// In a skeleton type unit there is no type DIE so emit a zero offset.
- Asm->OutStreamer->EmitIntValue(Ty ? Ty->getOffset() : 0,
+ Asm->OutStreamer->emitIntValue(Ty ? Ty->getOffset() : 0,
sizeof(Ty->getOffset()));
}
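The subrange change above generalizes array bounds: lower bound, upper bound, and byte stride each go through one lambda that accepts a bound which may be a DIVariable (emitted as a DIE reference), a DIExpression (emitted as a DWARF location block), or a ConstantInt (emitted as DW_FORM_sdata, with the language's default lower bound suppressed). A self-contained sketch of the same three-way dispatch, with std::variant standing in for the metadata PointerUnion and printing in place of DIE construction:

#include <cstdint>
#include <iostream>
#include <string>
#include <variant>

// Stand-ins for DIVariable*, DIExpression* and ConstantInt*.
struct VarRef  { std::string Name; };
struct ExprRef { std::string Expr; };
using Bound = std::variant<std::monostate, VarRef, ExprRef, int64_t>;

// Emit one bound attribute, mirroring the addBoundTypeEntry lambda:
// variables become DIE references, expressions become location blocks,
// constants are emitted unless they equal the default lower bound.
void emitBound(const char *Attr, const Bound &B, int64_t DefaultLowerBound) {
  if (auto *V = std::get_if<VarRef>(&B))
    std::cout << Attr << " -> ref " << V->Name << '\n';
  else if (auto *E = std::get_if<ExprRef>(&B))
    std::cout << Attr << " -> expr " << E->Expr << '\n';
  else if (auto *C = std::get_if<int64_t>(&B)) {
    if (std::string(Attr) != "DW_AT_lower_bound" || DefaultLowerBound == -1 ||
        *C != DefaultLowerBound)
      std::cout << Attr << " -> const " << *C << '\n';
  }
  // monostate: bound not present, emit nothing.
}

int main() {
  emitBound("DW_AT_lower_bound", Bound{int64_t{0}}, /*DefaultLowerBound=*/0); // suppressed
  emitBound("DW_AT_upper_bound", Bound{VarRef{"ubound_var"}}, 0);
  emitBound("DW_AT_byte_stride", Bound{ExprRef{"DW_OP_push_object_address ..."}}, 0);
}

DW_AT_count keeps its separate handling, as in the hunk, since a count of -1 still means an unbounded array.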
diff --git a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
index 46c52a1faf4b..34f3a34ed336 100644
--- a/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
+++ b/llvm/lib/CodeGen/AsmPrinter/DwarfUnit.h
@@ -27,7 +27,6 @@
namespace llvm {
-class MachineLocation;
class MachineOperand;
class ConstantInt;
class ConstantFP;
diff --git a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
index 31dfaaac836e..99ee4567fa58 100644
--- a/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/EHStreamer.cpp
@@ -426,18 +426,18 @@ MCSymbol *EHStreamer::emitExceptionTable() {
// EHABI). In this case LSDASection will be NULL.
if (LSDASection)
Asm->OutStreamer->SwitchSection(LSDASection);
- Asm->EmitAlignment(Align(4));
+ Asm->emitAlignment(Align(4));
// Emit the LSDA.
MCSymbol *GCCETSym =
Asm->OutContext.getOrCreateSymbol(Twine("GCC_except_table")+
Twine(Asm->getFunctionNumber()));
- Asm->OutStreamer->EmitLabel(GCCETSym);
- Asm->OutStreamer->EmitLabel(Asm->getCurExceptionSym());
+ Asm->OutStreamer->emitLabel(GCCETSym);
+ Asm->OutStreamer->emitLabel(Asm->getCurExceptionSym());
// Emit the LSDA header.
- Asm->EmitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
- Asm->EmitEncodingByte(TTypeEncoding, "@TType");
+ Asm->emitEncodingByte(dwarf::DW_EH_PE_omit, "@LPStart");
+ Asm->emitEncodingByte(TTypeEncoding, "@TType");
MCSymbol *TTBaseLabel = nullptr;
if (HaveTTData) {
@@ -447,8 +447,8 @@ MCSymbol *EHStreamer::emitExceptionTable() {
// the type table. See PR35809 or GNU as bug 4029.
MCSymbol *TTBaseRefLabel = Asm->createTempSymbol("ttbaseref");
TTBaseLabel = Asm->createTempSymbol("ttbase");
- Asm->EmitLabelDifferenceAsULEB128(TTBaseLabel, TTBaseRefLabel);
- Asm->OutStreamer->EmitLabel(TTBaseRefLabel);
+ Asm->emitLabelDifferenceAsULEB128(TTBaseLabel, TTBaseRefLabel);
+ Asm->OutStreamer->emitLabel(TTBaseRefLabel);
}
bool VerboseAsm = Asm->OutStreamer->isVerboseAsm();
@@ -456,9 +456,9 @@ MCSymbol *EHStreamer::emitExceptionTable() {
// Emit the landing pad call site table.
MCSymbol *CstBeginLabel = Asm->createTempSymbol("cst_begin");
MCSymbol *CstEndLabel = Asm->createTempSymbol("cst_end");
- Asm->EmitEncodingByte(CallSiteEncoding, "Call site");
- Asm->EmitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
- Asm->OutStreamer->EmitLabel(CstBeginLabel);
+ Asm->emitEncodingByte(CallSiteEncoding, "Call site");
+ Asm->emitLabelDifferenceAsULEB128(CstEndLabel, CstBeginLabel);
+ Asm->OutStreamer->emitLabel(CstBeginLabel);
// SjLj / Wasm Exception handling
if (IsSJLJ || IsWasm) {
@@ -472,7 +472,7 @@ MCSymbol *EHStreamer::emitExceptionTable() {
Asm->OutStreamer->AddComment(">> Call Site " + Twine(idx) + " <<");
Asm->OutStreamer->AddComment(" On exception at call site "+Twine(idx));
}
- Asm->EmitULEB128(idx);
+ Asm->emitULEB128(idx);
// Offset of the first associated action record, relative to the start of
// the action table. This value is biased by 1 (1 indicates the start of
@@ -484,7 +484,7 @@ MCSymbol *EHStreamer::emitExceptionTable() {
Asm->OutStreamer->AddComment(" Action: " +
Twine((S.Action - 1) / 2 + 1));
}
- Asm->EmitULEB128(S.Action);
+ Asm->emitULEB128(S.Action);
}
} else {
// Itanium LSDA exception handling
@@ -524,23 +524,23 @@ MCSymbol *EHStreamer::emitExceptionTable() {
// Offset of the call site relative to the start of the procedure.
if (VerboseAsm)
Asm->OutStreamer->AddComment(">> Call Site " + Twine(++Entry) + " <<");
- Asm->EmitCallSiteOffset(BeginLabel, EHFuncBeginSym, CallSiteEncoding);
+ Asm->emitCallSiteOffset(BeginLabel, EHFuncBeginSym, CallSiteEncoding);
if (VerboseAsm)
Asm->OutStreamer->AddComment(Twine(" Call between ") +
BeginLabel->getName() + " and " +
EndLabel->getName());
- Asm->EmitCallSiteOffset(EndLabel, BeginLabel, CallSiteEncoding);
+ Asm->emitCallSiteOffset(EndLabel, BeginLabel, CallSiteEncoding);
// Offset of the landing pad relative to the start of the procedure.
if (!S.LPad) {
if (VerboseAsm)
Asm->OutStreamer->AddComment(" has no landing pad");
- Asm->EmitCallSiteValue(0, CallSiteEncoding);
+ Asm->emitCallSiteValue(0, CallSiteEncoding);
} else {
if (VerboseAsm)
Asm->OutStreamer->AddComment(Twine(" jumps to ") +
S.LPad->LandingPadLabel->getName());
- Asm->EmitCallSiteOffset(S.LPad->LandingPadLabel, EHFuncBeginSym,
+ Asm->emitCallSiteOffset(S.LPad->LandingPadLabel, EHFuncBeginSym,
CallSiteEncoding);
}
@@ -554,10 +554,10 @@ MCSymbol *EHStreamer::emitExceptionTable() {
Asm->OutStreamer->AddComment(" On action: " +
Twine((S.Action - 1) / 2 + 1));
}
- Asm->EmitULEB128(S.Action);
+ Asm->emitULEB128(S.Action);
}
}
- Asm->OutStreamer->EmitLabel(CstEndLabel);
+ Asm->OutStreamer->emitLabel(CstEndLabel);
// Emit the Action Table.
int Entry = 0;
@@ -584,7 +584,7 @@ MCSymbol *EHStreamer::emitExceptionTable() {
else
Asm->OutStreamer->AddComment(" Cleanup");
}
- Asm->EmitSLEB128(Action.ValueForTypeID);
+ Asm->emitSLEB128(Action.ValueForTypeID);
// Action Record
//
@@ -598,15 +598,15 @@ MCSymbol *EHStreamer::emitExceptionTable() {
Asm->OutStreamer->AddComment(" Continue to action "+Twine(NextAction));
}
}
- Asm->EmitSLEB128(Action.NextAction);
+ Asm->emitSLEB128(Action.NextAction);
}
if (HaveTTData) {
- Asm->EmitAlignment(Align(4));
+ Asm->emitAlignment(Align(4));
emitTypeInfos(TTypeEncoding, TTBaseLabel);
}
- Asm->EmitAlignment(Align(4));
+ Asm->emitAlignment(Align(4));
return GCCETSym;
}
@@ -629,10 +629,10 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
TypeInfos.rend())) {
if (VerboseAsm)
Asm->OutStreamer->AddComment("TypeInfo " + Twine(Entry--));
- Asm->EmitTTypeReference(GV, TTypeEncoding);
+ Asm->emitTTypeReference(GV, TTypeEncoding);
}
- Asm->OutStreamer->EmitLabel(TTBaseLabel);
+ Asm->OutStreamer->emitLabel(TTBaseLabel);
// Emit the Exception Specifications.
if (VerboseAsm && !FilterIds.empty()) {
@@ -649,6 +649,6 @@ void EHStreamer::emitTypeInfos(unsigned TTypeEncoding, MCSymbol *TTBaseLabel) {
Asm->OutStreamer->AddComment("FilterInfo " + Twine(Entry));
}
- Asm->EmitULEB128(TypeID);
+ Asm->emitULEB128(TypeID);
}
}
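The EHStreamer hunks are almost entirely the EmitULEB128/EmitSLEB128/EmitLabel to emitULEB128/emitSLEB128/emitLabel renames, but the table they build leans heavily on LEB128: call-site indices and action offsets go out as ULEB128, type IDs and next-action links as SLEB128. For reference, a minimal encoder for both forms; this is a sketch of the wire format those calls produce, not LLVM's own implementation:

#include <cstdint>
#include <vector>

// Unsigned LEB128: 7 bits per byte, high bit marks continuation.
std::vector<uint8_t> encodeULEB128(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (Value != 0);
  return Out;
}

// Signed LEB128: same framing, but stop once the remaining bits are all
// sign bits and the last byte already carries the right sign bit.
// (Relies on the usual arithmetic right shift for negative values.)
std::vector<uint8_t> encodeSLEB128(int64_t Value) {
  std::vector<uint8_t> Out;
  bool More = true;
  while (More) {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if ((Value == 0 && !(Byte & 0x40)) || (Value == -1 && (Byte & 0x40)))
      More = false;
    else
      Byte |= 0x80;
    Out.push_back(Byte);
  }
  return Out;
}

For example, encodeULEB128(300) yields the two bytes 0xac 0x02, and encodeSLEB128(-1) is the single byte 0x7f.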
diff --git a/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
index 3849644d1584..59a84e6f2d7b 100644
--- a/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/ErlangGCPrinter.cpp
@@ -72,7 +72,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
**/
// Align to address width.
- AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
+ AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
// Emit PointCount.
OS.AddComment("safe point count");
@@ -84,7 +84,7 @@ void ErlangGCPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
// Emit the address of the safe point.
OS.AddComment("safe point address");
MCSymbol *Label = PI->Label;
- AP.EmitLabelPlusOffset(Label /*Hi*/, 0 /*Offset*/, 4 /*Size*/);
+ AP.emitLabelPlusOffset(Label /*Hi*/, 0 /*Offset*/, 4 /*Size*/);
}
// Stack information never change in safe points! Only print info from the
diff --git a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
index b4eda5fa8c58..8fa83f515910 100644
--- a/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/OcamlGCPrinter.cpp
@@ -66,8 +66,8 @@ static void EmitCamlGlobal(const Module &M, AsmPrinter &AP, const char *Id) {
MCSymbol *Sym = AP.OutContext.getOrCreateSymbol(TmpStr);
- AP.OutStreamer->EmitSymbolAttribute(Sym, MCSA_Global);
- AP.OutStreamer->EmitLabel(Sym);
+ AP.OutStreamer->emitSymbolAttribute(Sym, MCSA_Global);
+ AP.OutStreamer->emitLabel(Sym);
}
void OcamlGCMetadataPrinter::beginAssembly(Module &M, GCModuleInfo &Info,
@@ -106,7 +106,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
EmitCamlGlobal(M, AP, "data_end");
// FIXME: Why does ocaml emit this??
- AP.OutStreamer->EmitIntValue(0, IntPtrSize);
+ AP.OutStreamer->emitIntValue(0, IntPtrSize);
AP.OutStreamer->SwitchSection(AP.getObjFileLowering().getDataSection());
EmitCamlGlobal(M, AP, "frametable");
@@ -129,7 +129,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
report_fatal_error(" Too much descriptor for ocaml GC");
}
AP.emitInt16(NumDescriptors);
- AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
+ AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
for (GCModuleInfo::FuncInfoVec::iterator I = Info.funcinfo_begin(),
IE = Info.funcinfo_end();
@@ -164,7 +164,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
Twine(LiveCount) + " >= 65536.");
}
- AP.OutStreamer->EmitSymbolValue(J->Label, IntPtrSize);
+ AP.OutStreamer->emitSymbolValue(J->Label, IntPtrSize);
AP.emitInt16(FrameSize);
AP.emitInt16(LiveCount);
@@ -180,7 +180,7 @@ void OcamlGCMetadataPrinter::finishAssembly(Module &M, GCModuleInfo &Info,
AP.emitInt16(K->StackOffset);
}
- AP.EmitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
+ AP.emitAlignment(IntPtrSize == 4 ? Align(4) : Align(8));
}
}
}
diff --git a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
index 444b0ed17b6d..baef4d2cc849 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WasmException.cpp
@@ -27,7 +27,7 @@ void WasmException::endModule() {
Mangler::getNameWithPrefix(NameStr, "__cpp_exception", Asm->getDataLayout());
if (Asm->OutContext.lookupSymbol(NameStr)) {
MCSymbol *ExceptionSym = Asm->GetExternalSymbolSymbol("__cpp_exception");
- Asm->OutStreamer->EmitLabel(ExceptionSym);
+ Asm->OutStreamer->emitLabel(ExceptionSym);
}
}
@@ -58,7 +58,7 @@ void WasmException::endFunction(const MachineFunction *MF) {
// end marker and set the size as the difference between the start and the end
// marker.
MCSymbol *LSDAEndLabel = Asm->createTempSymbol("GCC_except_table_end");
- Asm->OutStreamer->EmitLabel(LSDAEndLabel);
+ Asm->OutStreamer->emitLabel(LSDAEndLabel);
MCContext &OutContext = Asm->OutStreamer->getContext();
const MCExpr *SizeExp = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(LSDAEndLabel, OutContext),
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
index 0398675577cd..cd8077e7d548 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.cpp
@@ -34,6 +34,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/FormattedStream.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
using namespace llvm;
@@ -203,11 +204,11 @@ void WinException::beginFunclet(const MachineBasicBlock &MBB,
// We want our funclet's entry point to be aligned such that no nops will be
// present after the label.
- Asm->EmitAlignment(std::max(Asm->MF->getAlignment(), MBB.getAlignment()),
+ Asm->emitAlignment(std::max(Asm->MF->getAlignment(), MBB.getAlignment()),
&F);
// Now that we've emitted the alignment directive, point at our funclet.
- Asm->OutStreamer->EmitLabel(Sym);
+ Asm->OutStreamer->emitLabel(Sym);
}
// Mark 'Sym' as starting our funclet.
@@ -276,7 +277,7 @@ void WinException::endFuncletImpl() {
StringRef FuncLinkageName = GlobalValue::dropLLVMManglingEscape(F.getName());
MCSymbol *FuncInfoXData = Asm->OutContext.getOrCreateSymbol(
Twine("$cppxdata$", FuncLinkageName));
- Asm->OutStreamer->EmitValue(create32bitRef(FuncInfoXData), 4);
+ Asm->OutStreamer->emitValue(create32bitRef(FuncInfoXData), 4);
} else if (Per == EHPersonality::MSVC_Win64SEH && MF->hasEHFunclets() &&
!CurrentFuncletEntry->isEHFuncletEntry()) {
// If this is the parent function in Win64 SEH, emit the LSDA immediately
@@ -336,7 +337,7 @@ const MCExpr *WinException::getOffsetPlusOne(const MCSymbol *OffsetOf,
int WinException::getFrameIndexOffset(int FrameIndex,
const WinEHFuncInfo &FuncInfo) {
const TargetFrameLowering &TFI = *Asm->MF->getSubtarget().getFrameLowering();
- unsigned UnusedReg;
+ Register UnusedReg;
if (Asm->MAI->usesWindowsCFI()) {
int Offset =
TFI.getFrameIndexReferencePreferSP(*Asm->MF, FrameIndex, UnusedReg,
@@ -566,7 +567,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
const MCExpr *MCOffset =
MCConstantExpr::create(FuncInfo.SEHSetFrameOffset, Ctx);
- Asm->OutStreamer->EmitAssignment(ParentFrameOffset, MCOffset);
+ Asm->OutStreamer->emitAssignment(ParentFrameOffset, MCOffset);
}
// Use the assembler to compute the number of table entries through label
@@ -579,9 +580,9 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
const MCExpr *EntrySize = MCConstantExpr::create(16, Ctx);
const MCExpr *EntryCount = MCBinaryExpr::createDiv(LabelDiff, EntrySize, Ctx);
AddComment("Number of call sites");
- OS.EmitValue(EntryCount, 4);
+ OS.emitValue(EntryCount, 4);
- OS.EmitLabel(TableBegin);
+ OS.emitLabel(TableBegin);
// Iterate over all the invoke try ranges. Unlike MSVC, LLVM currently only
// models exceptions from invokes. LLVM also allows arbitrary reordering of
@@ -609,7 +610,7 @@ void WinException::emitCSpecificHandlerTable(const MachineFunction *MF) {
LastEHState = StateChange.NewState;
}
- OS.EmitLabel(TableEnd);
+ OS.emitLabel(TableEnd);
}
void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
@@ -641,14 +642,14 @@ void WinException::emitSEHActionsForRange(const WinEHFuncInfo &FuncInfo,
}
AddComment("LabelStart");
- OS.EmitValue(getLabel(BeginLabel), 4);
+ OS.emitValue(getLabel(BeginLabel), 4);
AddComment("LabelEnd");
- OS.EmitValue(getLabel(EndLabel), 4);
+ OS.emitValue(getLabel(EndLabel), 4);
AddComment(UME.IsFinally ? "FinallyFunclet" : UME.Filter ? "FilterFunction"
: "CatchAll");
- OS.EmitValue(FilterOrFinally, 4);
+ OS.emitValue(FilterOrFinally, 4);
AddComment(UME.IsFinally ? "Null" : "ExceptionHandler");
- OS.EmitValue(ExceptOrNull, 4);
+ OS.emitValue(ExceptOrNull, 4);
assert(UME.ToState < State && "states should decrease");
State = UME.ToState;
@@ -713,55 +714,55 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
// EHFlags & 1 -> Synchronous exceptions only, no async exceptions.
// EHFlags & 2 -> ???
// EHFlags & 4 -> The function is noexcept(true), unwinding can't continue.
- OS.EmitValueToAlignment(4);
- OS.EmitLabel(FuncInfoXData);
+ OS.emitValueToAlignment(4);
+ OS.emitLabel(FuncInfoXData);
AddComment("MagicNumber");
- OS.EmitIntValue(0x19930522, 4);
+ OS.emitInt32(0x19930522);
AddComment("MaxState");
- OS.EmitIntValue(FuncInfo.CxxUnwindMap.size(), 4);
+ OS.emitInt32(FuncInfo.CxxUnwindMap.size());
AddComment("UnwindMap");
- OS.EmitValue(create32bitRef(UnwindMapXData), 4);
+ OS.emitValue(create32bitRef(UnwindMapXData), 4);
AddComment("NumTryBlocks");
- OS.EmitIntValue(FuncInfo.TryBlockMap.size(), 4);
+ OS.emitInt32(FuncInfo.TryBlockMap.size());
AddComment("TryBlockMap");
- OS.EmitValue(create32bitRef(TryBlockMapXData), 4);
+ OS.emitValue(create32bitRef(TryBlockMapXData), 4);
AddComment("IPMapEntries");
- OS.EmitIntValue(IPToStateTable.size(), 4);
+ OS.emitInt32(IPToStateTable.size());
AddComment("IPToStateXData");
- OS.EmitValue(create32bitRef(IPToStateXData), 4);
+ OS.emitValue(create32bitRef(IPToStateXData), 4);
if (Asm->MAI->usesWindowsCFI()) {
AddComment("UnwindHelp");
- OS.EmitIntValue(UnwindHelpOffset, 4);
+ OS.emitInt32(UnwindHelpOffset);
}
AddComment("ESTypeList");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
AddComment("EHFlags");
- OS.EmitIntValue(1, 4);
+ OS.emitInt32(1);
// UnwindMapEntry {
// int32_t ToState;
// void (*Action)();
// };
if (UnwindMapXData) {
- OS.EmitLabel(UnwindMapXData);
+ OS.emitLabel(UnwindMapXData);
for (const CxxUnwindMapEntry &UME : FuncInfo.CxxUnwindMap) {
MCSymbol *CleanupSym =
getMCSymbolForMBB(Asm, UME.Cleanup.dyn_cast<MachineBasicBlock *>());
AddComment("ToState");
- OS.EmitIntValue(UME.ToState, 4);
+ OS.emitInt32(UME.ToState);
AddComment("Action");
- OS.EmitValue(create32bitRef(CleanupSym), 4);
+ OS.emitValue(create32bitRef(CleanupSym), 4);
}
}
@@ -773,7 +774,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
// HandlerType *HandlerArray;
// };
if (TryBlockMapXData) {
- OS.EmitLabel(TryBlockMapXData);
+ OS.emitLabel(TryBlockMapXData);
SmallVector<MCSymbol *, 1> HandlerMaps;
for (size_t I = 0, E = FuncInfo.TryBlockMap.size(); I != E; ++I) {
const WinEHTryBlockMapEntry &TBME = FuncInfo.TryBlockMap[I];
@@ -795,19 +796,19 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
"bad trymap interval");
AddComment("TryLow");
- OS.EmitIntValue(TBME.TryLow, 4);
+ OS.emitInt32(TBME.TryLow);
AddComment("TryHigh");
- OS.EmitIntValue(TBME.TryHigh, 4);
+ OS.emitInt32(TBME.TryHigh);
AddComment("CatchHigh");
- OS.EmitIntValue(TBME.CatchHigh, 4);
+ OS.emitInt32(TBME.CatchHigh);
AddComment("NumCatches");
- OS.EmitIntValue(TBME.HandlerArray.size(), 4);
+ OS.emitInt32(TBME.HandlerArray.size());
AddComment("HandlerArray");
- OS.EmitValue(create32bitRef(HandlerMapXData), 4);
+ OS.emitValue(create32bitRef(HandlerMapXData), 4);
}
// All funclets use the same parent frame offset currently.
@@ -829,7 +830,7 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
// void (*Handler)();
// int32_t ParentFrameOffset; // x64 and AArch64 only
// };
- OS.EmitLabel(HandlerMapXData);
+ OS.emitLabel(HandlerMapXData);
for (const WinEHHandlerType &HT : TBME.HandlerArray) {
// Get the frame escape label with the offset of the catch object. If
// the index is INT_MAX, then there is no catch object, and we should
@@ -847,20 +848,20 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
getMCSymbolForMBB(Asm, HT.Handler.dyn_cast<MachineBasicBlock *>());
AddComment("Adjectives");
- OS.EmitIntValue(HT.Adjectives, 4);
+ OS.emitInt32(HT.Adjectives);
AddComment("Type");
- OS.EmitValue(create32bitRef(HT.TypeDescriptor), 4);
+ OS.emitValue(create32bitRef(HT.TypeDescriptor), 4);
AddComment("CatchObjOffset");
- OS.EmitValue(FrameAllocOffsetRef, 4);
+ OS.emitValue(FrameAllocOffsetRef, 4);
AddComment("Handler");
- OS.EmitValue(create32bitRef(HandlerSym), 4);
+ OS.emitValue(create32bitRef(HandlerSym), 4);
if (shouldEmitPersonality) {
AddComment("ParentFrameOffset");
- OS.EmitIntValue(ParentFrameOffset, 4);
+ OS.emitInt32(ParentFrameOffset);
}
}
}
@@ -871,12 +872,12 @@ void WinException::emitCXXFrameHandler3Table(const MachineFunction *MF) {
// int32_t State;
// };
if (IPToStateXData) {
- OS.EmitLabel(IPToStateXData);
+ OS.emitLabel(IPToStateXData);
for (auto &IPStatePair : IPToStateTable) {
AddComment("IP");
- OS.EmitValue(IPStatePair.first, 4);
+ OS.emitValue(IPStatePair.first, 4);
AddComment("ToState");
- OS.EmitIntValue(IPStatePair.second, 4);
+ OS.emitInt32(IPStatePair.second);
}
}
}
@@ -956,7 +957,7 @@ void WinException::emitEHRegistrationOffsetLabel(const WinEHFuncInfo &FuncInfo,
MCContext &Ctx = Asm->OutContext;
MCSymbol *ParentFrameOffset =
Ctx.getOrCreateParentFrameOffsetSymbol(FLinkageName);
- Asm->OutStreamer->EmitAssignment(ParentFrameOffset,
+ Asm->OutStreamer->emitAssignment(ParentFrameOffset,
MCConstantExpr::create(Offset, Ctx));
}
@@ -979,8 +980,8 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
// Emit the __ehtable label that we use for llvm.x86.seh.lsda.
MCSymbol *LSDALabel = Asm->OutContext.getOrCreateLSDASymbol(FLinkageName);
- OS.EmitValueToAlignment(4);
- OS.EmitLabel(LSDALabel);
+ OS.emitValueToAlignment(4);
+ OS.emitLabel(LSDALabel);
const auto *Per = cast<Function>(F.getPersonalityFn()->stripPointerCasts());
StringRef PerName = Per->getName();
@@ -1011,7 +1012,7 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
int GSCookieOffset = -2;
const MachineFrameInfo &MFI = MF->getFrameInfo();
if (MFI.hasStackProtectorIndex()) {
- unsigned UnusedReg;
+ Register UnusedReg;
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
int SSPIdx = MFI.getStackProtectorIndex();
GSCookieOffset = TFI->getFrameIndexReference(*MF, SSPIdx, UnusedReg);
@@ -1021,20 +1022,20 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
// TODO(etienneb): Get rid of this value and change it for an assertion.
int EHCookieOffset = 9999;
if (FuncInfo.EHGuardFrameIndex != INT_MAX) {
- unsigned UnusedReg;
+ Register UnusedReg;
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
int EHGuardIdx = FuncInfo.EHGuardFrameIndex;
EHCookieOffset = TFI->getFrameIndexReference(*MF, EHGuardIdx, UnusedReg);
}
AddComment("GSCookieOffset");
- OS.EmitIntValue(GSCookieOffset, 4);
+ OS.emitInt32(GSCookieOffset);
AddComment("GSCookieXOROffset");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
AddComment("EHCookieOffset");
- OS.EmitIntValue(EHCookieOffset, 4);
+ OS.emitInt32(EHCookieOffset);
AddComment("EHCookieXOROffset");
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
BaseState = -2;
}
@@ -1047,11 +1048,11 @@ void WinException::emitExceptHandlerTable(const MachineFunction *MF) {
// _except_handler4 it's -2. Do that replacement here if necessary.
int ToState = UME.ToState == -1 ? BaseState : UME.ToState;
AddComment("ToState");
- OS.EmitIntValue(ToState, 4);
+ OS.emitInt32(ToState);
AddComment(UME.IsFinally ? "Null" : "FilterFunction");
- OS.EmitValue(create32bitRef(UME.Filter), 4);
+ OS.emitValue(create32bitRef(UME.Filter), 4);
AddComment(UME.IsFinally ? "FinallyFunclet" : "ExceptionHandler");
- OS.EmitValue(create32bitRef(ExceptOrFinally), 4);
+ OS.emitValue(create32bitRef(ExceptOrFinally), 4);
}
}
@@ -1124,9 +1125,9 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
// Write out a sentinel indicating the end of the standard (Windows) xdata
// and the start of the additional (CLR) info.
- OS.EmitIntValue(0xffffffff, 4);
+ OS.emitInt32(0xffffffff);
// Write out the number of funclets
- OS.EmitIntValue(NumStates, 4);
+ OS.emitInt32(NumStates);
// Walk the machine blocks/instrs, computing and emitting a few things:
// 1. Emit a list of the offsets to each handler entry, in lexical order.
@@ -1164,7 +1165,7 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
}
// Emit the function/funclet end and, if this is a funclet (and not the
// root function), record it in the EndSymbolMap.
- OS.EmitValue(getOffset(EndSymbol, FuncBeginSym), 4);
+ OS.emitValue(getOffset(EndSymbol, FuncBeginSym), 4);
if (FuncletState != NullState) {
// Record the end of the handler.
EndSymbolMap[FuncletState] = EndSymbol;
@@ -1217,7 +1218,7 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
}
// Now emit the clause info, starting with the number of clauses.
- OS.EmitIntValue(Clauses.size(), 4);
+ OS.emitInt32(Clauses.size());
for (ClrClause &Clause : Clauses) {
// Emit a CORINFO_EH_CLAUSE :
/*
@@ -1299,18 +1300,18 @@ void WinException::emitCLRExceptionTable(const MachineFunction *MF) {
assert(Clause.EnclosingState > MinClauseMap[Clause.State]);
Flags |= 8;
}
- OS.EmitIntValue(Flags, 4);
+ OS.emitInt32(Flags);
// Write the clause start/end
- OS.EmitValue(ClauseBegin, 4);
- OS.EmitValue(ClauseEnd, 4);
+ OS.emitValue(ClauseBegin, 4);
+ OS.emitValue(ClauseEnd, 4);
// Write out the handler start/end
- OS.EmitValue(HandlerBegin, 4);
- OS.EmitValue(HandlerEnd, 4);
+ OS.emitValue(HandlerBegin, 4);
+ OS.emitValue(HandlerEnd, 4);
// Write out the type token or filter offset
assert(Entry.HandlerType != ClrHandlerType::Filter && "NYI: filters");
- OS.EmitIntValue(Entry.TypeToken, 4);
+ OS.emitInt32(Entry.TypeToken);
}
}
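Besides the streamer renames, the 4-byte fields in the C++ EH tables switch from emitIntValue(X, 4) to the emitInt32(X) shorthand; the record layouts spelled out in the comments are unchanged. As one example, a plain mirror of the UnwindMapEntry record described above (illustrative only; the real table is emitted field by field, with Action written as a 32-bit image-relative reference):

#include <cstdint>

// Mirrors the "UnwindMapEntry" layout from the xdata comments: the state to
// unwind to, plus the cleanup funclet to run on the way there.
struct UnwindMapEntry {
  int32_t ToState;  // AddComment("ToState"); emitInt32(UME.ToState)
  uint32_t Action;  // AddComment("Action"); 32-bit ref to the cleanup symbol
};

static_assert(sizeof(UnwindMapEntry) == 8, "two 4-byte fields");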
diff --git a/llvm/lib/CodeGen/AsmPrinter/WinException.h b/llvm/lib/CodeGen/AsmPrinter/WinException.h
index dc5036302131..8bd5d1bc6d2a 100644
--- a/llvm/lib/CodeGen/AsmPrinter/WinException.h
+++ b/llvm/lib/CodeGen/AsmPrinter/WinException.h
@@ -16,12 +16,10 @@
#include "EHStreamer.h"
namespace llvm {
-class Function;
class GlobalValue;
class MachineFunction;
class MCExpr;
class MCSection;
-class Value;
struct WinEHFuncInfo;
class LLVM_LIBRARY_VISIBILITY WinException : public EHStreamer {
diff --git a/llvm/lib/CodeGen/AtomicExpandPass.cpp b/llvm/lib/CodeGen/AtomicExpandPass.cpp
index 37a50cde6391..a5030305435c 100644
--- a/llvm/lib/CodeGen/AtomicExpandPass.cpp
+++ b/llvm/lib/CodeGen/AtomicExpandPass.cpp
@@ -89,7 +89,7 @@ namespace {
AtomicRMWInst *I,
TargetLoweringBase::AtomicExpansionKind ExpansionKind);
AtomicRMWInst *widenPartwordAtomicRMW(AtomicRMWInst *AI);
- void expandPartwordCmpXchg(AtomicCmpXchgInst *I);
+ bool expandPartwordCmpXchg(AtomicCmpXchgInst *I);
void expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI);
void expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI);
@@ -105,7 +105,7 @@ namespace {
bool isIdempotentRMW(AtomicRMWInst *RMWI);
bool simplifyIdempotentRMW(AtomicRMWInst *RMWI);
- bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, unsigned Align,
+ bool expandAtomicOpToLibcall(Instruction *I, unsigned Size, Align Alignment,
Value *PointerOperand, Value *ValueOperand,
Value *CASExpected, AtomicOrdering Ordering,
AtomicOrdering Ordering2,
@@ -152,47 +152,15 @@ static unsigned getAtomicOpSize(AtomicCmpXchgInst *CASI) {
return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
}
-// Helper functions to retrieve the alignment of atomic instructions.
-static unsigned getAtomicOpAlign(LoadInst *LI) {
- unsigned Align = LI->getAlignment();
- // In the future, if this IR restriction is relaxed, we should
- // return DataLayout::getABITypeAlignment when there's no align
- // value.
- assert(Align != 0 && "An atomic LoadInst always has an explicit alignment");
- return Align;
-}
-
-static unsigned getAtomicOpAlign(StoreInst *SI) {
- unsigned Align = SI->getAlignment();
- // In the future, if this IR restriction is relaxed, we should
- // return DataLayout::getABITypeAlignment when there's no align
- // value.
- assert(Align != 0 && "An atomic StoreInst always has an explicit alignment");
- return Align;
-}
-
-static unsigned getAtomicOpAlign(AtomicRMWInst *RMWI) {
- // TODO(PR27168): This instruction has no alignment attribute, but unlike the
- // default alignment for load/store, the default here is to assume
- // it has NATURAL alignment, not DataLayout-specified alignment.
- const DataLayout &DL = RMWI->getModule()->getDataLayout();
- return DL.getTypeStoreSize(RMWI->getValOperand()->getType());
-}
-
-static unsigned getAtomicOpAlign(AtomicCmpXchgInst *CASI) {
- // TODO(PR27168): same comment as above.
- const DataLayout &DL = CASI->getModule()->getDataLayout();
- return DL.getTypeStoreSize(CASI->getCompareOperand()->getType());
-}
-
// Determine if a particular atomic operation has a supported size,
// and is of appropriate alignment, to be passed through for target
// lowering. (Versus turning into a __atomic libcall)
template <typename Inst>
static bool atomicSizeSupported(const TargetLowering *TLI, Inst *I) {
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
- return Align >= Size && Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
+ Align Alignment = I->getAlign();
+ return Alignment >= Size &&
+ Size <= TLI->getMaxAtomicSizeInBitsSupported() / 8;
}
bool AtomicExpand::runOnFunction(Function &F) {
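With the per-instruction getAtomicOpAlign helpers deleted, the gate above reduces to "at least naturally aligned and no wider than the target's maximum inline atomic width"; anything that fails it is expanded to the __atomic_* libcalls instead. A sketch of that predicate on plain integers, with the TargetLowering query abstracted into a parameter:

#include <cstdint>

// Sketch of atomicSizeSupported: the access must be at least as aligned as
// it is wide, and no wider than what the target can lower without a libcall.
bool atomicSizeSupported(uint64_t SizeInBytes, uint64_t AlignInBytes,
                         uint64_t MaxAtomicSizeInBits) {
  return AlignInBytes >= SizeInBytes &&
         SizeInBytes <= MaxAtomicSizeInBits / 8;
}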
@@ -383,7 +351,7 @@ LoadInst *AtomicExpand::convertAtomicLoadToIntegerType(LoadInst *LI) {
Value *NewAddr = Builder.CreateBitCast(Addr, PT);
auto *NewLI = Builder.CreateLoad(NewTy, NewAddr);
- NewLI->setAlignment(MaybeAlign(LI->getAlignment()));
+ NewLI->setAlignment(LI->getAlign());
NewLI->setVolatile(LI->isVolatile());
NewLI->setAtomic(LI->getOrdering(), LI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *LI << " with " << *NewLI << "\n");
@@ -470,7 +438,7 @@ StoreInst *AtomicExpand::convertAtomicStoreToIntegerType(StoreInst *SI) {
Value *NewAddr = Builder.CreateBitCast(Addr, PT);
StoreInst *NewSI = Builder.CreateStore(NewVal, NewAddr);
- NewSI->setAlignment(MaybeAlign(SI->getAlignment()));
+ NewSI->setAlignment(SI->getAlign());
NewSI->setVolatile(SI->isVolatile());
NewSI->setAtomic(SI->getOrdering(), SI->getSyncScopeID());
LLVM_DEBUG(dbgs() << "Replaced " << *SI << " with " << *NewSI << "\n");
@@ -570,8 +538,8 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
unsigned MinCASSize = TLI->getMinCmpXchgSizeInBits() / 8;
unsigned ValueSize = getAtomicOpSize(AI);
if (ValueSize < MinCASSize) {
- llvm_unreachable(
- "MinCmpXchgSizeInBits not yet supported for LL/SC architectures.");
+ expandPartwordAtomicRMW(AI,
+ TargetLoweringBase::AtomicExpansionKind::LLSC);
} else {
auto PerformOp = [&](IRBuilder<> &Builder, Value *Loaded) {
return performAtomicOp(AI->getOperation(), Builder, Loaded,
@@ -608,16 +576,43 @@ bool AtomicExpand::tryExpandAtomicRMW(AtomicRMWInst *AI) {
namespace {
-/// Result values from createMaskInstrs helper.
struct PartwordMaskValues {
- Type *WordType;
- Type *ValueType;
- Value *AlignedAddr;
- Value *ShiftAmt;
- Value *Mask;
- Value *Inv_Mask;
+ // These three fields are guaranteed to be set by createMaskInstrs.
+ Type *WordType = nullptr;
+ Type *ValueType = nullptr;
+ Value *AlignedAddr = nullptr;
+ // The remaining fields can be null.
+ Value *ShiftAmt = nullptr;
+ Value *Mask = nullptr;
+ Value *Inv_Mask = nullptr;
};
+LLVM_ATTRIBUTE_UNUSED
+raw_ostream &operator<<(raw_ostream &O, const PartwordMaskValues &PMV) {
+ auto PrintObj = [&O](auto *V) {
+ if (V)
+ O << *V;
+ else
+ O << "nullptr";
+ O << '\n';
+ };
+ O << "PartwordMaskValues {\n";
+ O << " WordType: ";
+ PrintObj(PMV.WordType);
+ O << " ValueType: ";
+ PrintObj(PMV.ValueType);
+ O << " AlignedAddr: ";
+ PrintObj(PMV.AlignedAddr);
+ O << " ShiftAmt: ";
+ PrintObj(PMV.ShiftAmt);
+ O << " Mask: ";
+ PrintObj(PMV.Mask);
+ O << " Inv_Mask: ";
+ PrintObj(PMV.Inv_Mask);
+ O << "}\n";
+ return O;
+}
+
} // end anonymous namespace
/// This is a helper function which builds instructions to provide
@@ -638,48 +633,74 @@ struct PartwordMaskValues {
/// Inv_Mask: The inverse of Mask.
static PartwordMaskValues createMaskInstrs(IRBuilder<> &Builder, Instruction *I,
Type *ValueType, Value *Addr,
- unsigned WordSize) {
- PartwordMaskValues Ret;
+ unsigned MinWordSize) {
+ PartwordMaskValues PMV;
- BasicBlock *BB = I->getParent();
- Function *F = BB->getParent();
Module *M = I->getModule();
-
- LLVMContext &Ctx = F->getContext();
+ LLVMContext &Ctx = M->getContext();
const DataLayout &DL = M->getDataLayout();
-
unsigned ValueSize = DL.getTypeStoreSize(ValueType);
- assert(ValueSize < WordSize);
+ PMV.ValueType = ValueType;
+ PMV.WordType = MinWordSize > ValueSize ? Type::getIntNTy(Ctx, MinWordSize * 8)
+ : ValueType;
+ if (PMV.ValueType == PMV.WordType) {
+ PMV.AlignedAddr = Addr;
+ return PMV;
+ }
- Ret.ValueType = ValueType;
- Ret.WordType = Type::getIntNTy(Ctx, WordSize * 8);
+ assert(ValueSize < MinWordSize);
Type *WordPtrType =
- Ret.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
+ PMV.WordType->getPointerTo(Addr->getType()->getPointerAddressSpace());
Value *AddrInt = Builder.CreatePtrToInt(Addr, DL.getIntPtrType(Ctx));
- Ret.AlignedAddr = Builder.CreateIntToPtr(
- Builder.CreateAnd(AddrInt, ~(uint64_t)(WordSize - 1)), WordPtrType,
+ PMV.AlignedAddr = Builder.CreateIntToPtr(
+ Builder.CreateAnd(AddrInt, ~(uint64_t)(MinWordSize - 1)), WordPtrType,
"AlignedAddr");
- Value *PtrLSB = Builder.CreateAnd(AddrInt, WordSize - 1, "PtrLSB");
+ Value *PtrLSB = Builder.CreateAnd(AddrInt, MinWordSize - 1, "PtrLSB");
if (DL.isLittleEndian()) {
// turn bytes into bits
- Ret.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
+ PMV.ShiftAmt = Builder.CreateShl(PtrLSB, 3);
} else {
// turn bytes into bits, and count from the other side.
- Ret.ShiftAmt =
- Builder.CreateShl(Builder.CreateXor(PtrLSB, WordSize - ValueSize), 3);
+ PMV.ShiftAmt = Builder.CreateShl(
+ Builder.CreateXor(PtrLSB, MinWordSize - ValueSize), 3);
}
- Ret.ShiftAmt = Builder.CreateTrunc(Ret.ShiftAmt, Ret.WordType, "ShiftAmt");
- Ret.Mask = Builder.CreateShl(
- ConstantInt::get(Ret.WordType, (1 << (ValueSize * 8)) - 1), Ret.ShiftAmt,
+ PMV.ShiftAmt = Builder.CreateTrunc(PMV.ShiftAmt, PMV.WordType, "ShiftAmt");
+ PMV.Mask = Builder.CreateShl(
+ ConstantInt::get(PMV.WordType, (1 << (ValueSize * 8)) - 1), PMV.ShiftAmt,
"Mask");
- Ret.Inv_Mask = Builder.CreateNot(Ret.Mask, "Inv_Mask");
+ PMV.Inv_Mask = Builder.CreateNot(PMV.Mask, "Inv_Mask");
+ return PMV;
+}
+
+static Value *extractMaskedValue(IRBuilder<> &Builder, Value *WideWord,
+ const PartwordMaskValues &PMV) {
+ assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
+ if (PMV.WordType == PMV.ValueType)
+ return WideWord;
+
+ Value *Shift = Builder.CreateLShr(WideWord, PMV.ShiftAmt, "shifted");
+ Value *Trunc = Builder.CreateTrunc(Shift, PMV.ValueType, "extracted");
+ return Trunc;
+}
- return Ret;
+static Value *insertMaskedValue(IRBuilder<> &Builder, Value *WideWord,
+ Value *Updated, const PartwordMaskValues &PMV) {
+ assert(WideWord->getType() == PMV.WordType && "Widened type mismatch");
+ assert(Updated->getType() == PMV.ValueType && "Value type mismatch");
+ if (PMV.WordType == PMV.ValueType)
+ return Updated;
+
+ Value *ZExt = Builder.CreateZExt(Updated, PMV.WordType, "extended");
+ Value *Shift =
+ Builder.CreateShl(ZExt, PMV.ShiftAmt, "shifted", /*HasNUW*/ true);
+ Value *And = Builder.CreateAnd(WideWord, PMV.Inv_Mask, "unmasked");
+ Value *Or = Builder.CreateOr(And, Shift, "inserted");
+ return Or;
}
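// Editorial sketch (not part of this patch): the same shift/mask arithmetic
// that extractMaskedValue and insertMaskedValue emit as IR, written as plain
// C++ for a hypothetical 1-byte atomic at byte offset 2 of a little-endian
// 32-bit word, so ShiftAmt = 2 * 8.
#include <cassert>
#include <cstdint>

int main() {
  uint32_t Word = 0xAABBCCDD;        // word-sized memory contents
  unsigned ShiftAmt = 2 * 8;         // PtrLSB == 2, turned into bits
  uint32_t Mask = 0xFFu << ShiftAmt; // selects the addressed byte
  uint32_t InvMask = ~Mask;

  // extractMaskedValue: shift down, then truncate to the value type.
  uint8_t Extracted = uint8_t(Word >> ShiftAmt);
  assert(Extracted == 0xBB);

  // insertMaskedValue: zero-extend, shift up, and splice into the word.
  uint8_t Updated = 0x11;
  uint32_t NewWord = (Word & InvMask) | (uint32_t(Updated) << ShiftAmt);
  assert(NewWord == 0xAA11CCDD);
  return 0;
}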
/// Emit IR to implement a masked version of a given atomicrmw
@@ -719,13 +740,9 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
// Finally, comparison ops will operate on the full value, so
// truncate down to the original size, and expand out again after
// doing the operation.
- Value *Loaded_Shiftdown = Builder.CreateTrunc(
- Builder.CreateLShr(Loaded, PMV.ShiftAmt), PMV.ValueType);
- Value *NewVal = performAtomicOp(Op, Builder, Loaded_Shiftdown, Inc);
- Value *NewVal_Shiftup = Builder.CreateShl(
- Builder.CreateZExt(NewVal, PMV.WordType), PMV.ShiftAmt);
- Value *Loaded_MaskOut = Builder.CreateAnd(Loaded, PMV.Inv_Mask);
- Value *FinalVal = Builder.CreateOr(Loaded_MaskOut, NewVal_Shiftup);
+ Value *Loaded_Extract = extractMaskedValue(Builder, Loaded, PMV);
+ Value *NewVal = performAtomicOp(Op, Builder, Loaded_Extract, Inc);
+ Value *FinalVal = insertMaskedValue(Builder, Loaded, NewVal, PMV);
return FinalVal;
}
default:
@@ -738,12 +755,10 @@ static Value *performMaskedAtomicOp(AtomicRMWInst::BinOp Op,
///
/// It will create an LL/SC or cmpxchg loop, as appropriate, the same
/// way as a typical atomicrmw expansion. The only difference here is
-/// that the operation inside of the loop must operate only upon a
+/// that the operation inside of the loop may operate upon only a
/// part of the value.
void AtomicExpand::expandPartwordAtomicRMW(
AtomicRMWInst *AI, TargetLoweringBase::AtomicExpansionKind ExpansionKind) {
- assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg);
-
AtomicOrdering MemOpOrder = AI->getOrdering();
IRBuilder<> Builder(AI);
@@ -761,13 +776,18 @@ void AtomicExpand::expandPartwordAtomicRMW(
ValOperand_Shifted, AI->getValOperand(), PMV);
};
- // TODO: When we're ready to support LLSC conversions too, use
- // insertRMWLLSCLoop here for ExpansionKind==LLSC.
- Value *OldResult =
- insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
- PerformPartwordOp, createCmpXchgInstFun);
- Value *FinalOldResult = Builder.CreateTrunc(
- Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
+ Value *OldResult;
+ if (ExpansionKind == TargetLoweringBase::AtomicExpansionKind::CmpXChg) {
+ OldResult =
+ insertRMWCmpXchgLoop(Builder, PMV.WordType, PMV.AlignedAddr, MemOpOrder,
+ PerformPartwordOp, createCmpXchgInstFun);
+ } else {
+ assert(ExpansionKind == TargetLoweringBase::AtomicExpansionKind::LLSC);
+ OldResult = insertRMWLLSCLoop(Builder, PMV.WordType, PMV.AlignedAddr,
+ MemOpOrder, PerformPartwordOp);
+ }
+
+ Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
AI->replaceAllUsesWith(FinalOldResult);
AI->eraseFromParent();
}
@@ -800,14 +820,13 @@ AtomicRMWInst *AtomicExpand::widenPartwordAtomicRMW(AtomicRMWInst *AI) {
AtomicRMWInst *NewAI = Builder.CreateAtomicRMW(Op, PMV.AlignedAddr,
NewOperand, AI->getOrdering());
- Value *FinalOldResult = Builder.CreateTrunc(
- Builder.CreateLShr(NewAI, PMV.ShiftAmt), PMV.ValueType);
+ Value *FinalOldResult = extractMaskedValue(Builder, NewAI, PMV);
AI->replaceAllUsesWith(FinalOldResult);
AI->eraseFromParent();
return NewAI;
}
-void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
+bool AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
// The basic idea here is that we're expanding a cmpxchg of a
// smaller memory size up to a word-sized cmpxchg. To do this, we
// need to add a retry-loop for strong cmpxchg, so that
@@ -923,14 +942,14 @@ void AtomicExpand::expandPartwordCmpXchg(AtomicCmpXchgInst *CI) {
// partword.cmpxchg.end:
Builder.SetInsertPoint(CI);
- Value *FinalOldVal = Builder.CreateTrunc(
- Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
+ Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
Value *Res = UndefValue::get(CI->getType());
Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
Res = Builder.CreateInsertValue(Res, Success, 1);
CI->replaceAllUsesWith(Res);
CI->eraseFromParent();
+ return true;
}
void AtomicExpand::expandAtomicOpToLLSC(
@@ -965,8 +984,7 @@ void AtomicExpand::expandAtomicRMWToMaskedIntrinsic(AtomicRMWInst *AI) {
Value *OldResult = TLI->emitMaskedAtomicRMWIntrinsic(
Builder, AI, PMV.AlignedAddr, ValOperand_Shifted, PMV.Mask, PMV.ShiftAmt,
AI->getOrdering());
- Value *FinalOldResult = Builder.CreateTrunc(
- Builder.CreateLShr(OldResult, PMV.ShiftAmt), PMV.ValueType);
+ Value *FinalOldResult = extractMaskedValue(Builder, OldResult, PMV);
AI->replaceAllUsesWith(FinalOldResult);
AI->eraseFromParent();
}
@@ -987,9 +1005,7 @@ void AtomicExpand::expandAtomicCmpXchgToMaskedIntrinsic(AtomicCmpXchgInst *CI) {
Value *OldVal = TLI->emitMaskedAtomicCmpXchgIntrinsic(
Builder, CI, PMV.AlignedAddr, CmpVal_Shifted, NewVal_Shifted, PMV.Mask,
CI->getSuccessOrdering());
- Value *FinalOldVal = Builder.CreateTrunc(
- Builder.CreateLShr(OldVal, PMV.ShiftAmt), PMV.ValueType);
-
+ Value *FinalOldVal = extractMaskedValue(Builder, OldVal, PMV);
Value *Res = UndefValue::get(CI->getType());
Res = Builder.CreateInsertValue(Res, FinalOldVal, 0);
Value *Success = Builder.CreateICmpEQ(
@@ -1126,24 +1142,28 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
//
// The full expansion we produce is:
// [...]
+ // %aligned.addr = ...
// cmpxchg.start:
- // %unreleasedload = @load.linked(%addr)
- // %should_store = icmp eq %unreleasedload, %desired
- // br i1 %should_store, label %cmpxchg.fencedstore,
+ // %unreleasedload = @load.linked(%aligned.addr)
+ // %unreleasedload.extract = extract value from %unreleasedload
+ // %should_store = icmp eq %unreleasedload.extract, %desired
+ // br i1 %should_store, label %cmpxchg.releasingstore,
// label %cmpxchg.nostore
// cmpxchg.releasingstore:
// fence?
// br label cmpxchg.trystore
// cmpxchg.trystore:
- // %loaded.trystore = phi [%unreleasedload, %releasingstore],
+ // %loaded.trystore = phi [%unreleasedload, %cmpxchg.releasingstore],
// [%releasedload, %cmpxchg.releasedload]
- // %stored = @store_conditional(%new, %addr)
+ // %updated.new = insert %new into %loaded.trystore
+ // %stored = @store_conditional(%updated.new, %aligned.addr)
// %success = icmp eq i32 %stored, 0
// br i1 %success, label %cmpxchg.success,
// label %cmpxchg.releasedload/%cmpxchg.failure
// cmpxchg.releasedload:
- // %releasedload = @load.linked(%addr)
- // %should_store = icmp eq %releasedload, %desired
+ // %releasedload = @load.linked(%aligned.addr)
+ // %releasedload.extract = extract value from %releasedload
+ // %should_store = icmp eq %releasedload.extract, %desired
// br i1 %should_store, label %cmpxchg.trystore,
// label %cmpxchg.failure
// cmpxchg.success:
@@ -1159,9 +1179,10 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// fence?
// br label %cmpxchg.end
// cmpxchg.end:
- // %loaded = phi [%loaded.nostore, %cmpxchg.failure],
- // [%loaded.trystore, %cmpxchg.trystore]
+ // %loaded.exit = phi [%loaded.nostore, %cmpxchg.failure],
+ // [%loaded.trystore, %cmpxchg.trystore]
// %success = phi i1 [true, %cmpxchg.success], [false, %cmpxchg.failure]
+ // %loaded = extract value from %loaded.exit
// %restmp = insertvalue { iN, i1 } undef, iN %loaded, 0
// %res = insertvalue { iN, i1 } %restmp, i1 %success, 1
// [...]
@@ -1187,13 +1208,20 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(BB);
if (ShouldInsertFencesForAtomic && UseUnconditionalReleaseBarrier)
TLI->emitLeadingFence(Builder, CI, SuccessOrder);
+
+ PartwordMaskValues PMV =
+ createMaskInstrs(Builder, CI, CI->getCompareOperand()->getType(), Addr,
+ TLI->getMinCmpXchgSizeInBits() / 8);
Builder.CreateBr(StartBB);
// Start the main loop block now that we've taken care of the preliminaries.
Builder.SetInsertPoint(StartBB);
- Value *UnreleasedLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
+ Value *UnreleasedLoad =
+ TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
+ Value *UnreleasedLoadExtract =
+ extractMaskedValue(Builder, UnreleasedLoad, PMV);
Value *ShouldStore = Builder.CreateICmpEQ(
- UnreleasedLoad, CI->getCompareOperand(), "should_store");
+ UnreleasedLoadExtract, CI->getCompareOperand(), "should_store");
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
@@ -1205,8 +1233,13 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(TryStoreBB);
Builder.SetInsertPoint(TryStoreBB);
- Value *StoreSuccess = TLI->emitStoreConditional(
- Builder, CI->getNewValOperand(), Addr, MemOpOrder);
+ PHINode *LoadedTryStore =
+ Builder.CreatePHI(PMV.WordType, 2, "loaded.trystore");
+ LoadedTryStore->addIncoming(UnreleasedLoad, ReleasingStoreBB);
+ Value *NewValueInsert =
+ insertMaskedValue(Builder, LoadedTryStore, CI->getNewValOperand(), PMV);
+ Value *StoreSuccess =
+ TLI->emitStoreConditional(Builder, NewValueInsert, Addr, MemOpOrder);
StoreSuccess = Builder.CreateICmpEQ(
StoreSuccess, ConstantInt::get(Type::getInt32Ty(Ctx), 0), "success");
BasicBlock *RetryBB = HasReleasedLoadBB ? ReleasedLoadBB : StartBB;
@@ -1216,13 +1249,16 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.SetInsertPoint(ReleasedLoadBB);
Value *SecondLoad;
if (HasReleasedLoadBB) {
- SecondLoad = TLI->emitLoadLinked(Builder, Addr, MemOpOrder);
- ShouldStore = Builder.CreateICmpEQ(SecondLoad, CI->getCompareOperand(),
- "should_store");
+ SecondLoad = TLI->emitLoadLinked(Builder, PMV.AlignedAddr, MemOpOrder);
+ Value *SecondLoadExtract = extractMaskedValue(Builder, SecondLoad, PMV);
+ ShouldStore = Builder.CreateICmpEQ(SecondLoadExtract,
+ CI->getCompareOperand(), "should_store");
// If the cmpxchg doesn't actually need any ordering when it fails, we can
// jump straight past that fence instruction (if it exists).
Builder.CreateCondBr(ShouldStore, TryStoreBB, NoStoreBB);
+ // Update PHI node in TryStoreBB.
+ LoadedTryStore->addIncoming(SecondLoad, ReleasedLoadBB);
} else
Builder.CreateUnreachable();
@@ -1234,6 +1270,12 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(ExitBB);
Builder.SetInsertPoint(NoStoreBB);
+ PHINode *LoadedNoStore =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.nostore");
+ LoadedNoStore->addIncoming(UnreleasedLoad, StartBB);
+ if (HasReleasedLoadBB)
+ LoadedNoStore->addIncoming(SecondLoad, ReleasedLoadBB);
+
// In the failing case, where we don't execute the store-conditional, the
// target might want to balance out the load-linked with a dedicated
// instruction (e.g., on ARM, clearing the exclusive monitor).
@@ -1241,6 +1283,11 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
Builder.CreateBr(FailureBB);
Builder.SetInsertPoint(FailureBB);
+ PHINode *LoadedFailure =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.failure");
+ LoadedFailure->addIncoming(LoadedNoStore, NoStoreBB);
+ if (CI->isWeak())
+ LoadedFailure->addIncoming(LoadedTryStore, TryStoreBB);
if (ShouldInsertFencesForAtomic)
TLI->emitTrailingFence(Builder, CI, FailureOrder);
Builder.CreateBr(ExitBB);
@@ -1250,32 +1297,20 @@ bool AtomicExpand::expandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
// subsequent "icmp eq/ne %loaded, %oldval" into a use of an appropriate
// PHI.
Builder.SetInsertPoint(ExitBB, ExitBB->begin());
- PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2);
+ PHINode *LoadedExit =
+ Builder.CreatePHI(UnreleasedLoad->getType(), 2, "loaded.exit");
+ LoadedExit->addIncoming(LoadedTryStore, SuccessBB);
+ LoadedExit->addIncoming(LoadedFailure, FailureBB);
+ PHINode *Success = Builder.CreatePHI(Type::getInt1Ty(Ctx), 2, "success");
Success->addIncoming(ConstantInt::getTrue(Ctx), SuccessBB);
Success->addIncoming(ConstantInt::getFalse(Ctx), FailureBB);
- // Setup the builder so we can create any PHIs we need.
- Value *Loaded;
- if (!HasReleasedLoadBB)
- Loaded = UnreleasedLoad;
- else {
- Builder.SetInsertPoint(TryStoreBB, TryStoreBB->begin());
- PHINode *TryStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
- TryStoreLoaded->addIncoming(UnreleasedLoad, ReleasingStoreBB);
- TryStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
-
- Builder.SetInsertPoint(NoStoreBB, NoStoreBB->begin());
- PHINode *NoStoreLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
- NoStoreLoaded->addIncoming(UnreleasedLoad, StartBB);
- NoStoreLoaded->addIncoming(SecondLoad, ReleasedLoadBB);
-
- Builder.SetInsertPoint(ExitBB, ++ExitBB->begin());
- PHINode *ExitLoaded = Builder.CreatePHI(UnreleasedLoad->getType(), 2);
- ExitLoaded->addIncoming(TryStoreLoaded, SuccessBB);
- ExitLoaded->addIncoming(NoStoreLoaded, FailureBB);
-
- Loaded = ExitLoaded;
- }
+ // This is the "exit value" from the cmpxchg expansion. It may be of
+ // a type wider than the one in the cmpxchg instruction.
+ Value *LoadedFull = LoadedExit;
+
+ Builder.SetInsertPoint(ExitBB, std::next(Success->getIterator()));
+ Value *Loaded = extractMaskedValue(Builder, LoadedFull, PMV);
// Look for any users of the cmpxchg that are just comparing the loaded value
// against the desired one, and replace them with the CFG-derived version.
@@ -1377,7 +1412,7 @@ Value *AtomicExpand::insertRMWCmpXchgLoop(
Builder.SetInsertPoint(BB);
LoadInst *InitLoaded = Builder.CreateLoad(ResultTy, Addr);
// Atomics require at least natural alignment.
- InitLoaded->setAlignment(MaybeAlign(ResultTy->getPrimitiveSizeInBits() / 8));
+ InitLoaded->setAlignment(Align(ResultTy->getPrimitiveSizeInBits() / 8));
Builder.CreateBr(LoopBB);
// Start the main loop block now that we've taken care of the preliminaries.
@@ -1414,11 +1449,9 @@ bool AtomicExpand::tryExpandAtomicCmpXchg(AtomicCmpXchgInst *CI) {
llvm_unreachable("Unhandled case in tryExpandAtomicCmpXchg");
case TargetLoweringBase::AtomicExpansionKind::None:
if (ValueSize < MinCASSize)
- expandPartwordCmpXchg(CI);
+ return expandPartwordCmpXchg(CI);
return false;
case TargetLoweringBase::AtomicExpansionKind::LLSC: {
- assert(ValueSize >= MinCASSize &&
- "MinCmpXchgSizeInBits not yet supported for LL/SC expansions.");
return expandAtomicCmpXchg(CI);
}
case TargetLoweringBase::AtomicExpansionKind::MaskedIntrinsic:
@@ -1449,7 +1482,7 @@ bool llvm::expandAtomicRMWToCmpXchg(AtomicRMWInst *AI,
// must be one of the potentially-specialized sizes, and the value
// type must actually exist in C on the target (otherwise, the
// function wouldn't actually be defined.)
-static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
+static bool canUseSizedAtomicCall(unsigned Size, Align Alignment,
const DataLayout &DL) {
// TODO: "LargestSize" is an approximation for "largest type that
// you can express in C". It seems to be the case that int128 is
@@ -1459,7 +1492,7 @@ static bool canUseSizedAtomicCall(unsigned Size, unsigned Align,
// really be some more reliable way in LLVM of determining integer
// sizes which are valid in the target's C ABI...
unsigned LargestSize = DL.getLargestLegalIntTypeSizeInBits() >= 64 ? 16 : 8;
- return Align >= Size &&
+ return Alignment >= Size &&
(Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
Size <= LargestSize;
}
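// Editorial sketch (not part of this patch): the same predicate with
// LargestSize pinned to 16, i.e. a target whose largest legal integer is at
// least 64 bits. A "true" result means a sized call such as __atomic_load_4
// can be used; "false" falls back to the generic, pointer-based __atomic_* call.
#include <cassert>

static bool canUseSizedCallSketch(unsigned Size, unsigned Alignment) {
  const unsigned LargestSize = 16; // assumption: int128 exists in the C ABI
  return Alignment >= Size &&
         (Size == 1 || Size == 2 || Size == 4 || Size == 8 || Size == 16) &&
         Size <= LargestSize;
}

int main() {
  assert(canUseSizedCallSketch(4, 4));  // e.g. __atomic_load_4 / __atomic_store_4
  assert(!canUseSizedCallSketch(4, 2)); // under-aligned: generic libcall
  assert(!canUseSizedCallSketch(3, 4)); // non-power-of-two size: generic libcall
}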
@@ -1469,10 +1502,9 @@ void AtomicExpand::expandAtomicLoadToLibcall(LoadInst *I) {
RTLIB::ATOMIC_LOAD, RTLIB::ATOMIC_LOAD_1, RTLIB::ATOMIC_LOAD_2,
RTLIB::ATOMIC_LOAD_4, RTLIB::ATOMIC_LOAD_8, RTLIB::ATOMIC_LOAD_16};
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool expanded = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), nullptr, nullptr,
+ I, Size, I->getAlign(), I->getPointerOperand(), nullptr, nullptr,
I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
(void)expanded;
assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Load");
@@ -1483,11 +1515,10 @@ void AtomicExpand::expandAtomicStoreToLibcall(StoreInst *I) {
RTLIB::ATOMIC_STORE, RTLIB::ATOMIC_STORE_1, RTLIB::ATOMIC_STORE_2,
RTLIB::ATOMIC_STORE_4, RTLIB::ATOMIC_STORE_8, RTLIB::ATOMIC_STORE_16};
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool expanded = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), I->getValueOperand(), nullptr,
- I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getValueOperand(),
+ nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
(void)expanded;
assert(expanded && "expandAtomicOpToLibcall shouldn't fail for Store");
}
@@ -1498,10 +1529,9 @@ void AtomicExpand::expandAtomicCASToLibcall(AtomicCmpXchgInst *I) {
RTLIB::ATOMIC_COMPARE_EXCHANGE_2, RTLIB::ATOMIC_COMPARE_EXCHANGE_4,
RTLIB::ATOMIC_COMPARE_EXCHANGE_8, RTLIB::ATOMIC_COMPARE_EXCHANGE_16};
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool expanded = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), I->getNewValOperand(),
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getNewValOperand(),
I->getCompareOperand(), I->getSuccessOrdering(), I->getFailureOrdering(),
Libcalls);
(void)expanded;
@@ -1571,13 +1601,12 @@ void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
ArrayRef<RTLIB::Libcall> Libcalls = GetRMWLibcall(I->getOperation());
unsigned Size = getAtomicOpSize(I);
- unsigned Align = getAtomicOpAlign(I);
bool Success = false;
if (!Libcalls.empty())
Success = expandAtomicOpToLibcall(
- I, Size, Align, I->getPointerOperand(), I->getValOperand(), nullptr,
- I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
+ I, Size, I->getAlign(), I->getPointerOperand(), I->getValOperand(),
+ nullptr, I->getOrdering(), AtomicOrdering::NotAtomic, Libcalls);
// The expansion failed: either there were no libcalls at all for
// the operation (min/max), or there were only size-specialized
@@ -1608,7 +1637,7 @@ void AtomicExpand::expandAtomicRMWToLibcall(AtomicRMWInst *I) {
// 'I' are extracted from the Instruction subclass by the
// caller. Depending on the particular call, some will be null.
bool AtomicExpand::expandAtomicOpToLibcall(
- Instruction *I, unsigned Size, unsigned Align, Value *PointerOperand,
+ Instruction *I, unsigned Size, Align Alignment, Value *PointerOperand,
Value *ValueOperand, Value *CASExpected, AtomicOrdering Ordering,
AtomicOrdering Ordering2, ArrayRef<RTLIB::Libcall> Libcalls) {
assert(Libcalls.size() == 6);
@@ -1619,10 +1648,10 @@ bool AtomicExpand::expandAtomicOpToLibcall(
IRBuilder<> Builder(I);
IRBuilder<> AllocaBuilder(&I->getFunction()->getEntryBlock().front());
- bool UseSizedLibcall = canUseSizedAtomicCall(Size, Align, DL);
+ bool UseSizedLibcall = canUseSizedAtomicCall(Size, Alignment, DL);
Type *SizedIntTy = Type::getIntNTy(Ctx, Size * 8);
- unsigned AllocaAlignment = DL.getPrefTypeAlignment(SizedIntTy);
+ const Align AllocaAlignment = DL.getPrefTypeAlign(SizedIntTy);
// TODO: the "order" argument type is "int", not int32. So
// getInt32Ty may be wrong if the arch uses e.g. 16-bit ints.
@@ -1712,7 +1741,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// 'expected' argument, if present.
if (CASExpected) {
AllocaCASExpected = AllocaBuilder.CreateAlloca(CASExpected->getType());
- AllocaCASExpected->setAlignment(MaybeAlign(AllocaAlignment));
+ AllocaCASExpected->setAlignment(AllocaAlignment);
unsigned AllocaAS = AllocaCASExpected->getType()->getPointerAddressSpace();
AllocaCASExpected_i8 =
@@ -1731,7 +1760,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
Args.push_back(IntValue);
} else {
AllocaValue = AllocaBuilder.CreateAlloca(ValueOperand->getType());
- AllocaValue->setAlignment(MaybeAlign(AllocaAlignment));
+ AllocaValue->setAlignment(AllocaAlignment);
AllocaValue_i8 =
Builder.CreateBitCast(AllocaValue, Type::getInt8PtrTy(Ctx));
Builder.CreateLifetimeStart(AllocaValue_i8, SizeVal64);
@@ -1743,7 +1772,7 @@ bool AtomicExpand::expandAtomicOpToLibcall(
// 'ret' argument.
if (!CASExpected && HasResult && !UseSizedLibcall) {
AllocaResult = AllocaBuilder.CreateAlloca(I->getType());
- AllocaResult->setAlignment(MaybeAlign(AllocaAlignment));
+ AllocaResult->setAlignment(AllocaAlignment);
unsigned AllocaAS = AllocaResult->getType()->getPointerAddressSpace();
AllocaResult_i8 =
Builder.CreateBitCast(AllocaResult, Type::getInt8PtrTy(Ctx, AllocaAS));
diff --git a/llvm/lib/CodeGen/BBSectionsPrepare.cpp b/llvm/lib/CodeGen/BBSectionsPrepare.cpp
new file mode 100644
index 000000000000..a35c4d813acc
--- /dev/null
+++ b/llvm/lib/CodeGen/BBSectionsPrepare.cpp
@@ -0,0 +1,457 @@
+//===-- BBSectionsPrepare.cpp ---=========---------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// BBSectionsPrepare implementation.
+//
+// The purpose of this pass is to assign sections to basic blocks when
+// the -fbasic-block-sections= option is used. Further, with profile
+// information, only the subset of basic blocks with profiles is placed in
+// separate sections and the rest are grouped in a cold section. The exception
+// handling blocks are treated specially to ensure they are all in one section.
+//
+// Basic Block Sections
+// ====================
+//
+// With the option -fbasic-block-sections=list, every function may be split
+// into clusters of basic blocks. Every cluster will be emitted into a separate
+// section with its basic blocks sequenced in the given order. To get the best
+// performance, the clusters must form an optimal BB layout for the
+// function. Every cluster's section is labeled with a symbol to allow the
+// linker to reorder the sections in any arbitrary sequence. A global order of
+// these sections would encapsulate the function layout.
+//
+// There are a couple of challenges to be addressed:
+//
+// 1. The last basic block of every cluster should not have any implicit
+// fallthrough to its next basic block, as it can be reordered by the linker.
+// The compiler should make these fallthroughs explicit by adding
+// unconditional jumps.
+//
+// 2. All inter-cluster branch targets would now need to be resolved by the
+// linker as they cannot be calculated during compile time. This is done
+// using static relocations. Further, the compiler tries to use short branch
+// instructions on some ISAs for small branch offsets. This is not possible
+// for inter-cluster branches as the offset is not determined at compile
+// time, and therefore, long branch instructions have to be used for those.
+//
+// 3. Debug Information (DebugInfo) and Call Frame Information (CFI) emission
+// needs special handling with basic block sections. DebugInfo needs to be
+// emitted with more relocations as basic block sections can break a
+// function into potentially several disjoint pieces, and CFI needs to be
+// emitted per cluster. This also bloats the object file and binary sizes.
+//
+// Basic Block Labels
+// ==================
+//
+// With -fbasic-block-sections=labels, or when a basic block is placed in a
+// unique section, it is labelled with a symbol. This allows easy mapping of
+// virtual addresses from PMU profiles back to the corresponding basic blocks.
+// Since the number of basic blocks is large, the labeling bloats the symbol
+// table sizes and the string table sizes significantly. While the binary size
+// does increase, it does not affect performance as the symbol table is not
+// loaded in memory during run-time. The string table size bloat is kept very
+// minimal using a unary naming scheme that uses string suffix compression. The
+// basic blocks for function foo are named "a.BB.foo", "aa.BB.foo", ... This
+// turns out to be very good for string table sizes and the bloat in the string
+// table size for a very large binary is ~8 %. The naming also allows using
+// the --symbol-ordering-file option in LLD to arbitrarily reorder the
+// sections.
+//
+//===----------------------------------------------------------------------===//
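// Editorial sketch (not part of this patch): the unary naming scheme described
// above, with one 'a' per basic block so every label ends in the same
// ".BB.<function>" suffix and benefits from string-table suffix compression.
#include <iostream>
#include <string>

static std::string bbLabelSketch(unsigned BBNumber, const std::string &Func) {
  return std::string(BBNumber, 'a') + ".BB." + Func;
}

int main() {
  for (unsigned I = 1; I <= 3; ++I)
    std::cout << bbLabelSketch(I, "foo") << '\n'; // a.BB.foo, aa.BB.foo, aaa.BB.foo
}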
+
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/LineIterator.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Target/TargetMachine.h"
+
+using llvm::SmallSet;
+using llvm::SmallVector;
+using llvm::StringMap;
+using llvm::StringRef;
+using namespace llvm;
+
+namespace {
+
+// This struct represents the cluster information for a machine basic block.
+struct BBClusterInfo {
+ // MachineBasicBlock ID.
+ unsigned MBBNumber;
+ // Cluster ID this basic block belongs to.
+ unsigned ClusterID;
+ // Position of basic block within the cluster.
+ unsigned PositionInCluster;
+};
+
+using ProgramBBClusterInfoMapTy = StringMap<SmallVector<BBClusterInfo, 4>>;
+
+class BBSectionsPrepare : public MachineFunctionPass {
+public:
+ static char ID;
+
+ // This contains the basic-block-sections profile.
+ const MemoryBuffer *MBuf = nullptr;
+
+ // This encapsulates the BB cluster information for the whole program.
+ //
+ // For every function name, it contains the cluster information for (all or
+ // some of) its basic blocks. The cluster information for every basic block
+ // includes its cluster ID along with the position of the basic block in that
+ // cluster.
+ ProgramBBClusterInfoMapTy ProgramBBClusterInfo;
+
+ // Some functions have alias names. We use this map to find the main alias
+ // name for which we have a mapping in ProgramBBClusterInfo.
+ StringMap<StringRef> FuncAliasMap;
+
+ BBSectionsPrepare(const MemoryBuffer *Buf)
+ : MachineFunctionPass(ID), MBuf(Buf) {
+ initializeBBSectionsPreparePass(*PassRegistry::getPassRegistry());
+ }
+
+ BBSectionsPrepare() : MachineFunctionPass(ID) {
+ initializeBBSectionsPreparePass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override {
+ return "Basic Block Sections Analysis";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+ /// Read the basic block profiles here, if available.
+ bool doInitialization(Module &M) override;
+
+ /// Identify basic blocks that need separate sections and prepare to emit them
+ /// accordingly.
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // end anonymous namespace
+
+char BBSectionsPrepare::ID = 0;
+INITIALIZE_PASS(BBSectionsPrepare, "bbsections-prepare",
+ "Prepares for basic block sections, by splitting functions "
+ "into clusters of basic blocks.",
+ false, false)
+
+// This function updates and optimizes the branching instructions of every basic
+// block in a given function to account for changes in the layout.
+static void updateBranches(
+ MachineFunction &MF,
+ const SmallVector<MachineBasicBlock *, 4> &PreLayoutFallThroughs) {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ SmallVector<MachineOperand, 4> Cond;
+ for (auto &MBB : MF) {
+ auto NextMBBI = std::next(MBB.getIterator());
+ auto *FTMBB = PreLayoutFallThroughs[MBB.getNumber()];
+ // If this block had a fallthrough before, we need an explicit unconditional
+ // branch to that block if either
+ // 1- the block ends a section, which means its next block may be
+ // reordered by the linker, or
+ // 2- the fallthrough block is not adjacent to the block in the new
+ // order.
+ if (FTMBB && (MBB.isEndSection() || &*NextMBBI != FTMBB))
+ TII->insertUnconditionalBranch(MBB, FTMBB, MBB.findBranchDebugLoc());
+
+ // We do not optimize branches for machine basic blocks ending sections, as
+ // their adjacent block might be reordered by the linker.
+ if (MBB.isEndSection())
+ continue;
+
+ // It might be possible to optimize branches by flipping the branch
+ // condition.
+ Cond.clear();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (TII->analyzeBranch(MBB, TBB, FBB, Cond))
+ continue;
+ MBB.updateTerminator(FTMBB);
+ }
+}
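// Editorial sketch (not part of this patch): the two conditions above as a
// standalone predicate. HadFallThrough corresponds to a non-null entry in
// PreLayoutFallThroughs, EndsSection to MBB.isEndSection(), and
// FallThroughIsNext to the fallthrough block being adjacent in the new layout.
#include <cassert>

static bool needsExplicitBranchSketch(bool HadFallThrough, bool EndsSection,
                                      bool FallThroughIsNext) {
  return HadFallThrough && (EndsSection || !FallThroughIsNext);
}

int main() {
  assert(needsExplicitBranchSketch(true, true, true));    // ends a section: branch
  assert(needsExplicitBranchSketch(true, false, false));  // moved away: branch
  assert(!needsExplicitBranchSketch(true, false, true));  // still adjacent: no branch
  assert(!needsExplicitBranchSketch(false, true, false)); // no fallthrough: no branch
}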
+
+// This function provides the BBCluster information associated with a function.
+// Returns true if a valid association exists and false otherwise.
+static bool getBBClusterInfoForFunction(
+ const MachineFunction &MF, const StringMap<StringRef> FuncAliasMap,
+ const ProgramBBClusterInfoMapTy &ProgramBBClusterInfo,
+ std::vector<Optional<BBClusterInfo>> &V) {
+ // Get the main alias name for the function.
+ auto FuncName = MF.getName();
+ auto R = FuncAliasMap.find(FuncName);
+ StringRef AliasName = R == FuncAliasMap.end() ? FuncName : R->second;
+
+ // Find the associated cluster information.
+ auto P = ProgramBBClusterInfo.find(AliasName);
+ if (P == ProgramBBClusterInfo.end())
+ return false;
+
+ if (P->second.empty()) {
+ // This indicates that sections are desired for all basic blocks of this
+ // function. We clear the BBClusterInfo vector to denote this.
+ V.clear();
+ return true;
+ }
+
+ V.resize(MF.getNumBlockIDs());
+ for (auto bbClusterInfo : P->second) {
+ // Bail out if the cluster information contains invalid MBB numbers.
+ if (bbClusterInfo.MBBNumber >= MF.getNumBlockIDs())
+ return false;
+ V[bbClusterInfo.MBBNumber] = bbClusterInfo;
+ }
+ return true;
+}
+
+// This function sorts basic blocks according to the cluster's information.
+// All explicitly specified clusters of basic blocks will be ordered
+// accordingly. All non-specified BBs go into a separate "Cold" section.
+// Additionally, if exception handling landing pads end up in more than one
+// cluster, they are moved into a single "Exception" section. Eventually,
+// clusters are ordered in increasing order of their IDs, with the "Exception"
+// and "Cold" sections succeeding all other clusters.
+// FuncBBClusterInfo represents the cluster information for basic blocks. If it
+// is empty, unique sections are used for all basic blocks in the function.
+static bool assignSectionsAndSortBasicBlocks(
+ MachineFunction &MF,
+ const std::vector<Optional<BBClusterInfo>> &FuncBBClusterInfo) {
+ assert(MF.hasBBSections() && "BB Sections is not set for function.");
+ // This variable stores the section ID of the cluster containing eh_pads (if
+ // all eh_pads are in one cluster). If more than one cluster contains eh_pads,
+ // we set it equal to ExceptionSectionID.
+ Optional<MBBSectionID> EHPadsSectionID;
+
+ for (auto &MBB : MF) {
+ // With the 'all' option, every basic block is placed in a unique section.
+ // With the 'list' option, every basic block is placed in a section
+ // associated with its cluster, unless we want individual unique sections
+ // for every basic block in this function (if FuncBBClusterInfo is empty).
+ if (MF.getTarget().getBBSectionsType() == llvm::BasicBlockSection::All ||
+ FuncBBClusterInfo.empty()) {
+ // If unique sections are desired for all basic blocks of the function, we
+ // set every basic block's section ID equal to its number (basic block
+ // id). This further ensures that basic blocks are ordered canonically.
+ MBB.setSectionID({static_cast<unsigned int>(MBB.getNumber())});
+ } else if (FuncBBClusterInfo[MBB.getNumber()].hasValue())
+ MBB.setSectionID(FuncBBClusterInfo[MBB.getNumber()]->ClusterID);
+ else {
+ // BB goes into the special cold section if it is not specified in the
+ // cluster info map.
+ MBB.setSectionID(MBBSectionID::ColdSectionID);
+ }
+
+ if (MBB.isEHPad() && EHPadsSectionID != MBB.getSectionID() &&
+ EHPadsSectionID != MBBSectionID::ExceptionSectionID) {
+ // If we already have one cluster containing eh_pads, this must be updated
+ // to ExceptionSectionID. Otherwise, we set it equal to the current
+ // section ID.
+ EHPadsSectionID = EHPadsSectionID.hasValue()
+ ? MBBSectionID::ExceptionSectionID
+ : MBB.getSectionID();
+ }
+ }
+
+ // If EHPads are in more than one section, this places all of them in the
+ // special exception section.
+ if (EHPadsSectionID == MBBSectionID::ExceptionSectionID)
+ for (auto &MBB : MF)
+ if (MBB.isEHPad())
+ MBB.setSectionID(EHPadsSectionID.getValue());
+
+ SmallVector<MachineBasicBlock *, 4> PreLayoutFallThroughs(
+ MF.getNumBlockIDs());
+ for (auto &MBB : MF)
+ PreLayoutFallThroughs[MBB.getNumber()] = MBB.getFallThrough();
+
+ // We make sure that the cluster including the entry basic block precedes all
+ // other clusters.
+ auto EntryBBSectionID = MF.front().getSectionID();
+
+ // Helper function for ordering BB sections as follows:
+ // * Entry section (section including the entry block).
+ // * Regular sections (in increasing order of their Number).
+ // ...
+ // * Exception section
+ // * Cold section
+ auto MBBSectionOrder = [EntryBBSectionID](const MBBSectionID &LHS,
+ const MBBSectionID &RHS) {
+ // We make sure that the section containing the entry block precedes all the
+ // other sections.
+ if (LHS == EntryBBSectionID || RHS == EntryBBSectionID)
+ return LHS == EntryBBSectionID;
+ return LHS.Type == RHS.Type ? LHS.Number < RHS.Number : LHS.Type < RHS.Type;
+ };
+
+ // We sort all basic blocks to make sure the basic blocks of every cluster are
+ // contiguous and ordered accordingly. Furthermore, clusters are ordered in
+ // increasing order of their section IDs, with the exception and the
+ // cold section placed at the end of the function.
+ MF.sort([&](MachineBasicBlock &X, MachineBasicBlock &Y) {
+ auto XSectionID = X.getSectionID();
+ auto YSectionID = Y.getSectionID();
+ if (XSectionID != YSectionID)
+ return MBBSectionOrder(XSectionID, YSectionID);
+ // If the two basic blocks are in the same section, the order is decided by
+ // their position within the section.
+ if (XSectionID.Type == MBBSectionID::SectionType::Default)
+ return FuncBBClusterInfo[X.getNumber()]->PositionInCluster <
+ FuncBBClusterInfo[Y.getNumber()]->PositionInCluster;
+ return X.getNumber() < Y.getNumber();
+ });
+
+ // Set IsBeginSection and IsEndSection according to the assigned section IDs.
+ MF.assignBeginEndSections();
+
+ // After reordering basic blocks, we must update basic block branches to
+ // insert explicit fallthrough branches when required and optimize branches
+ // when possible.
+ updateBranches(MF, PreLayoutFallThroughs);
+
+ return true;
+}
+
+bool BBSectionsPrepare::runOnMachineFunction(MachineFunction &MF) {
+ auto BBSectionsType = MF.getTarget().getBBSectionsType();
+ assert(BBSectionsType != BasicBlockSection::None &&
+ "BB Sections not enabled!");
+ // Renumber blocks before sorting them for basic block sections. This is
+ // useful during sorting: basic blocks in the same section will retain their
+ // default order. This renumbering should also be done for basic block
+ // labels to match the profiles with the correct blocks.
+ MF.RenumberBlocks();
+
+ if (BBSectionsType == BasicBlockSection::Labels) {
+ MF.setBBSectionsType(BBSectionsType);
+ MF.createBBLabels();
+ return true;
+ }
+
+ std::vector<Optional<BBClusterInfo>> FuncBBClusterInfo;
+ if (BBSectionsType == BasicBlockSection::List &&
+ !getBBClusterInfoForFunction(MF, FuncAliasMap, ProgramBBClusterInfo,
+ FuncBBClusterInfo))
+ return true;
+ MF.setBBSectionsType(BBSectionsType);
+ MF.createBBLabels();
+ assignSectionsAndSortBasicBlocks(MF, FuncBBClusterInfo);
+ return true;
+}
+
+// Basic Block Sections can be enabled for a subset of machine basic blocks.
+// This is done by passing a file containing names of functions for which basic
+// block sections are desired. Additionally, machine basic block ids of the
+// functions can also be specified for a finer granularity. Moreover, a cluster
+// of basic blocks could be assigned to the same section.
+// A file with basic block sections for all of function main and three blocks
+// for function foo (of which 1 and 2 are placed in a cluster) looks like this:
+// ----------------------------
+// list.txt:
+// !main
+// !foo
+// !!1 2
+// !!4
+static Error getBBClusterInfo(const MemoryBuffer *MBuf,
+ ProgramBBClusterInfoMapTy &ProgramBBClusterInfo,
+ StringMap<StringRef> &FuncAliasMap) {
+ assert(MBuf);
+ line_iterator LineIt(*MBuf, /*SkipBlanks=*/true, /*CommentMarker=*/'#');
+
+ auto invalidProfileError = [&](auto Message) {
+ return make_error<StringError>(
+ Twine("Invalid profile " + MBuf->getBufferIdentifier() + " at line " +
+ Twine(LineIt.line_number()) + ": " + Message),
+ inconvertibleErrorCode());
+ };
+
+ auto FI = ProgramBBClusterInfo.end();
+
+ // Current cluster ID corresponding to this function.
+ unsigned CurrentCluster = 0;
+ // Current position in the current cluster.
+ unsigned CurrentPosition = 0;
+
+ // Temporary set to ensure every basic block ID appears once in the clusters
+ // of a function.
+ SmallSet<unsigned, 4> FuncBBIDs;
+
+ for (; !LineIt.is_at_eof(); ++LineIt) {
+ StringRef S(*LineIt);
+ if (S[0] == '@')
+ continue;
+ // Check for the leading "!"
+ if (!S.consume_front("!") || S.empty())
+ break;
+ // Check for a second "!", which indicates a cluster of basic blocks.
+ if (S.consume_front("!")) {
+ if (FI == ProgramBBClusterInfo.end())
+ return invalidProfileError(
+ "Cluster list does not follow a function name specifier.");
+ SmallVector<StringRef, 4> BBIndexes;
+ S.split(BBIndexes, ' ');
+ // Reset current cluster position.
+ CurrentPosition = 0;
+ for (auto BBIndexStr : BBIndexes) {
+ unsigned long long BBIndex;
+ if (getAsUnsignedInteger(BBIndexStr, 10, BBIndex))
+ return invalidProfileError(Twine("Unsigned integer expected: '") +
+ BBIndexStr + "'.");
+ if (!FuncBBIDs.insert(BBIndex).second)
+ return invalidProfileError(Twine("Duplicate basic block id found '") +
+ BBIndexStr + "'.");
+ if (!BBIndex && CurrentPosition)
+ return invalidProfileError("Entry BB (0) does not begin a cluster.");
+
+ FI->second.emplace_back(BBClusterInfo{
+ ((unsigned)BBIndex), CurrentCluster, CurrentPosition++});
+ }
+ CurrentCluster++;
+ } else { // This is a function name specifier.
+ // Function aliases are separated using '/'. We use the first function
+ // name for the cluster info mapping and delegate all other aliases to
+ // this one.
+ SmallVector<StringRef, 4> Aliases;
+ S.split(Aliases, '/');
+ for (size_t i = 1; i < Aliases.size(); ++i)
+ FuncAliasMap.try_emplace(Aliases[i], Aliases.front());
+
+ // Prepare for parsing clusters of this function name.
+ // Start a new cluster map for this function name.
+ FI = ProgramBBClusterInfo.try_emplace(Aliases.front()).first;
+ CurrentCluster = 0;
+ FuncBBIDs.clear();
+ }
+ }
+ return Error::success();
+}
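// Editorial sketch (not part of this patch): for the sample profile shown
// above getBBClusterInfo,
//   !main
//   !foo
//   !!1 2
//   !!4
// the parse conceptually yields (fields are {MBBNumber, ClusterID,
// PositionInCluster}):
//   ProgramBBClusterInfo["main"] = {}                    // empty: sections for every block
//   ProgramBBClusterInfo["foo"]  = { {1, 0, 0}, {2, 0, 1}, {4, 1, 0} }
// so blocks 1 and 2 of foo form cluster 0, block 4 forms cluster 1, and any
// unlisted block of foo is later placed in the cold section by
// assignSectionsAndSortBasicBlocks.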
+
+bool BBSectionsPrepare::doInitialization(Module &M) {
+ if (!MBuf)
+ return false;
+ if (auto Err = getBBClusterInfo(MBuf, ProgramBBClusterInfo, FuncAliasMap))
+ report_fatal_error(std::move(Err));
+ return false;
+}
+
+void BBSectionsPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+MachineFunctionPass *
+llvm::createBBSectionsPreparePass(const MemoryBuffer *Buf) {
+ return new BBSectionsPrepare(Buf);
+}
diff --git a/llvm/lib/CodeGen/BranchFolding.cpp b/llvm/lib/CodeGen/BranchFolding.cpp
index 4b9c50aeb1d3..c6d5aa37834f 100644
--- a/llvm/lib/CodeGen/BranchFolding.cpp
+++ b/llvm/lib/CodeGen/BranchFolding.cpp
@@ -40,6 +40,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -129,15 +130,13 @@ bool BranchFolderPass::runOnMachineFunction(MachineFunction &MF) {
// HW that requires structurized CFG.
bool EnableTailMerge = !MF.getTarget().requiresStructuredCFG() &&
PassConfig->getEnableTailMerge();
- BranchFolder::MBFIWrapper MBBFreqInfo(
+ MBFIWrapper MBBFreqInfo(
getAnalysis<MachineBlockFrequencyInfo>());
BranchFolder Folder(EnableTailMerge, /*CommonHoist=*/true, MBBFreqInfo,
getAnalysis<MachineBranchProbabilityInfo>(),
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI());
- auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
- return Folder.OptimizeFunction(
- MF, MF.getSubtarget().getInstrInfo(), MF.getSubtarget().getRegisterInfo(),
- MMIWP ? &MMIWP->getMMI() : nullptr);
+ return Folder.OptimizeFunction(MF, MF.getSubtarget().getInstrInfo(),
+ MF.getSubtarget().getRegisterInfo());
}
BranchFolder::BranchFolder(bool defaultEnableTailMerge, bool CommonHoist,
@@ -170,7 +169,7 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
// Update call site info.
std::for_each(MBB->begin(), MBB->end(), [MF](const MachineInstr &MI) {
- if (MI.isCall(MachineInstr::IgnoreBundle))
+ if (MI.shouldUpdateCallSiteInfo())
MF->eraseCallSiteInfo(&MI);
});
// Remove the block.
@@ -183,7 +182,6 @@ void BranchFolder::RemoveDeadBlock(MachineBasicBlock *MBB) {
bool BranchFolder::OptimizeFunction(MachineFunction &MF,
const TargetInstrInfo *tii,
const TargetRegisterInfo *tri,
- MachineModuleInfo *mmi,
MachineLoopInfo *mli, bool AfterPlacement) {
if (!tii) return false;
@@ -193,7 +191,6 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
AfterBlockPlacement = AfterPlacement;
TII = tii;
TRI = tri;
- MMI = mmi;
MLI = mli;
this->MRI = &MRI;
@@ -201,14 +198,7 @@ bool BranchFolder::OptimizeFunction(MachineFunction &MF,
if (!UpdateLiveIns)
MRI.invalidateLiveness();
- // Fix CFG. The later algorithms expect it to be right.
bool MadeChange = false;
- for (MachineBasicBlock &MBB : MF) {
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
- SmallVector<MachineOperand, 4> Cond;
- if (!TII->analyzeBranch(MBB, TBB, FBB, Cond, true))
- MadeChange |= MBB.CorrectExtraCFGEdges(TBB, FBB, !Cond.empty());
- }
// Recalculate EH scope membership.
EHScopeMembership = getEHScopeMembership(MF);
@@ -354,6 +344,9 @@ static unsigned ComputeCommonTailLength(MachineBasicBlock *MBB1,
MBBI1->isInlineAsm()) {
break;
}
+ if (MBBI1->getFlag(MachineInstr::NoMerge) ||
+ MBBI2->getFlag(MachineInstr::NoMerge))
+ break;
++TailLen;
I1 = MBBI1;
I2 = MBBI2;
@@ -501,42 +494,6 @@ BranchFolder::MergePotentialsElt::operator<(const MergePotentialsElt &o) const {
#endif
}
-BlockFrequency
-BranchFolder::MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
- auto I = MergedBBFreq.find(MBB);
-
- if (I != MergedBBFreq.end())
- return I->second;
-
- return MBFI.getBlockFreq(MBB);
-}
-
-void BranchFolder::MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
- BlockFrequency F) {
- MergedBBFreq[MBB] = F;
-}
-
-raw_ostream &
-BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
- const MachineBasicBlock *MBB) const {
- return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
-}
-
-raw_ostream &
-BranchFolder::MBFIWrapper::printBlockFreq(raw_ostream &OS,
- const BlockFrequency Freq) const {
- return MBFI.printBlockFreq(OS, Freq);
-}
-
-void BranchFolder::MBFIWrapper::view(const Twine &Name, bool isSimple) {
- MBFI.view(Name, isSimple);
-}
-
-uint64_t
-BranchFolder::MBFIWrapper::getEntryFreq() const {
- return MBFI.getEntryFreq();
-}
-
/// CountTerminators - Count the number of terminators in the given
/// block and set I to the position of the first non-terminator, if there
/// is one, or MBB->end() otherwise.
@@ -591,7 +548,7 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
MachineBasicBlock *PredBB,
DenseMap<const MachineBasicBlock *, int> &EHScopeMembership,
bool AfterPlacement,
- BranchFolder::MBFIWrapper &MBBFreqInfo,
+ MBFIWrapper &MBBFreqInfo,
ProfileSummaryInfo *PSI) {
// It is never profitable to tail-merge blocks from two different EH scopes.
if (!EHScopeMembership.empty()) {
@@ -691,8 +648,8 @@ ProfitableToMerge(MachineBasicBlock *MBB1, MachineBasicBlock *MBB2,
MachineFunction *MF = MBB1->getParent();
bool OptForSize =
MF->getFunction().hasOptSize() ||
- (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo.getMBFI()) &&
- llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo.getMBFI()));
+ (llvm::shouldOptimizeForSize(MBB1, PSI, &MBBFreqInfo) &&
+ llvm::shouldOptimizeForSize(MBB2, PSI, &MBBFreqInfo));
return EffectiveTailLen >= 2 && OptForSize &&
(FullBlockTail1 || FullBlockTail2);
}
@@ -900,7 +857,7 @@ void BranchFolder::mergeCommonTails(unsigned commonTailIndex) {
LiveRegs.clear();
LiveRegs.addLiveOuts(*Pred);
MachineBasicBlock::iterator InsertBefore = Pred->getFirstTerminator();
- for (unsigned Reg : NewLiveIns) {
+ for (Register Reg : NewLiveIns) {
if (!LiveRegs.available(*MRI, Reg))
continue;
DebugLoc DL;
@@ -963,10 +920,10 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
continue;
}
- // If one of the blocks is the entire common tail (and not the entry
- // block, which we can't jump to), we can treat all blocks with this same
- // tail at once. Use PredBB if that is one of the possibilities, as that
- // will not introduce any extra branches.
+ // If one of the blocks is the entire common tail (and is not the entry
+ // block/an EH pad, which we can't jump to), we can treat all blocks with
+ // this same tail at once. Use PredBB if that is one of the possibilities,
+ // as that will not introduce any extra branches.
MachineBasicBlock *EntryBB =
&MergePotentials.front().getBlock()->getParent()->front();
unsigned commonTailIndex = SameTails.size();
@@ -974,19 +931,21 @@ bool BranchFolder::TryTailMergeBlocks(MachineBasicBlock *SuccBB,
// into the other.
if (SameTails.size() == 2 &&
SameTails[0].getBlock()->isLayoutSuccessor(SameTails[1].getBlock()) &&
- SameTails[1].tailIsWholeBlock())
+ SameTails[1].tailIsWholeBlock() && !SameTails[1].getBlock()->isEHPad())
commonTailIndex = 1;
else if (SameTails.size() == 2 &&
SameTails[1].getBlock()->isLayoutSuccessor(
- SameTails[0].getBlock()) &&
- SameTails[0].tailIsWholeBlock())
+ SameTails[0].getBlock()) &&
+ SameTails[0].tailIsWholeBlock() &&
+ !SameTails[0].getBlock()->isEHPad())
commonTailIndex = 0;
else {
// Otherwise just pick one, favoring the fall-through predecessor if
// there is one.
for (unsigned i = 0, e = SameTails.size(); i != e; ++i) {
MachineBasicBlock *MBB = SameTails[i].getBlock();
- if (MBB == EntryBB && SameTails[i].tailIsWholeBlock())
+ if ((MBB == EntryBB || MBB->isEHPad()) &&
+ SameTails[i].tailIsWholeBlock())
continue;
if (MBB == PredBB) {
commonTailIndex = i;
@@ -1124,8 +1083,9 @@ bool BranchFolder::TailMergeBlocks(MachineFunction &MF) {
if (!UniquePreds.insert(PBB).second)
continue;
- // Skip blocks which may jump to a landing pad. Can't tail merge these.
- if (PBB->hasEHPadSuccessor())
+ // Skip blocks which may jump to a landing pad or jump from an asm blob.
+ // Can't tail merge these.
+ if (PBB->hasEHPadSuccessor() || PBB->mayHaveInlineAsmBr())
continue;
// After block placement, only consider predecessors that belong to the
@@ -1371,6 +1331,13 @@ ReoptimizeBlock:
SameEHScope = MBBEHScope->second == FallThroughEHScope->second;
}
+ // Analyze the branch in the current block. As a side-effect, this may cause
+ // the block to become empty.
+ MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
+ SmallVector<MachineOperand, 4> CurCond;
+ bool CurUnAnalyzable =
+ TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
+
// If this block is empty, make everyone use its fall-through, not the block
// explicitly. Landing pads should not do this since the landing-pad table
// points to this block. Blocks with their addresses taken shouldn't be
@@ -1413,10 +1380,6 @@ ReoptimizeBlock:
bool PriorUnAnalyzable =
TII->analyzeBranch(PrevBB, PriorTBB, PriorFBB, PriorCond, true);
if (!PriorUnAnalyzable) {
- // If the CFG for the prior block has extra edges, remove them.
- MadeChange |= PrevBB.CorrectExtraCFGEdges(PriorTBB, PriorFBB,
- !PriorCond.empty());
-
// If the previous branch is conditional and both conditions go to the same
// destination, remove the branch, replacing it with an unconditional one or
// a fall-through.
@@ -1437,7 +1400,7 @@ ReoptimizeBlock:
// has been used, but it can happen if tail merging splits a fall-through
// predecessor of a block.
// This has to check PrevBB->succ_size() because EH edges are ignored by
- // AnalyzeBranch.
+ // analyzeBranch.
if (PriorCond.empty() && !PriorTBB && MBB->pred_size() == 1 &&
PrevBB.succ_size() == 1 &&
!MBB->hasAddressTaken() && !MBB->isEHPad()) {
@@ -1547,7 +1510,7 @@ ReoptimizeBlock:
bool OptForSize =
MF.getFunction().hasOptSize() ||
- llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo.getMBFI());
+ llvm::shouldOptimizeForSize(MBB, PSI, &MBBFreqInfo);
if (!IsEmptyBlock(MBB) && MBB->pred_size() == 1 && OptForSize) {
// Changing "Jcc foo; foo: jmp bar;" into "Jcc bar;" might change the branch
// direction, thereby defeating careful block placement and regressing
@@ -1584,15 +1547,7 @@ ReoptimizeBlock:
}
}
- // Analyze the branch in the current block.
- MachineBasicBlock *CurTBB = nullptr, *CurFBB = nullptr;
- SmallVector<MachineOperand, 4> CurCond;
- bool CurUnAnalyzable =
- TII->analyzeBranch(*MBB, CurTBB, CurFBB, CurCond, true);
if (!CurUnAnalyzable) {
- // If the CFG for the prior block has extra edges, remove them.
- MadeChange |= MBB->CorrectExtraCFGEdges(CurTBB, CurFBB, !CurCond.empty());
-
// If this is a two-way branch, and the FBB branches to this block, reverse
// the condition so the single-basic-block loop is faster. Instead of:
// Loop: xxx; jcc Out; jmp Loop
@@ -1669,7 +1624,7 @@ ReoptimizeBlock:
PMBB->ReplaceUsesOfBlockWith(MBB, CurTBB);
// If this change resulted in PMBB ending in a conditional
// branch where both conditions go to the same destination,
- // change this to an unconditional branch (and fix the CFG).
+ // change this to an unconditional branch.
MachineBasicBlock *NewCurTBB = nullptr, *NewCurFBB = nullptr;
SmallVector<MachineOperand, 4> NewCurCond;
bool NewCurUnAnalyzable = TII->analyzeBranch(
@@ -1681,7 +1636,6 @@ ReoptimizeBlock:
TII->insertBranch(*PMBB, NewCurTBB, nullptr, NewCurCond, pdl);
MadeChange = true;
++NumBranchOpts;
- PMBB->CorrectExtraCFGEdges(NewCurTBB, nullptr, false);
}
}
}
@@ -1712,13 +1666,15 @@ ReoptimizeBlock:
if (!MBB->isEHPad()) {
// Check all the predecessors of this block. If one of them has no fall
- // throughs, move this block right after it.
+ // throughs, and analyzeBranch thinks it _could_ fallthrough to this
+ // block, move this block right after it.
for (MachineBasicBlock *PredBB : MBB->predecessors()) {
// Analyze the branch at the end of the pred.
MachineBasicBlock *PredTBB = nullptr, *PredFBB = nullptr;
SmallVector<MachineOperand, 4> PredCond;
if (PredBB != MBB && !PredBB->canFallThrough() &&
!TII->analyzeBranch(*PredBB, PredTBB, PredFBB, PredCond, true) &&
+ (PredTBB == MBB || PredFBB == MBB) &&
(!CurFallsThru || !CurTBB || !CurFBB) &&
(!CurFallsThru || MBB->getNumber() >= PredBB->getNumber())) {
// If the current block doesn't fall through, just move it.
@@ -1744,21 +1700,24 @@ ReoptimizeBlock:
}
if (!CurFallsThru) {
- // Check all successors to see if we can move this block before it.
- for (MachineBasicBlock *SuccBB : MBB->successors()) {
- // Analyze the branch at the end of the block before the succ.
- MachineFunction::iterator SuccPrev = --SuccBB->getIterator();
-
- // If this block doesn't already fall-through to that successor, and if
- // the succ doesn't already have a block that can fall through into it,
- // and if the successor isn't an EH destination, we can arrange for the
- // fallthrough to happen.
- if (SuccBB != MBB && &*SuccPrev != MBB &&
- !SuccPrev->canFallThrough() && !CurUnAnalyzable &&
- !SuccBB->isEHPad()) {
- MBB->moveBefore(SuccBB);
- MadeChange = true;
- goto ReoptimizeBlock;
+ // Check analyzable branch-successors to see if we can move this block
+ // before one.
+ if (!CurUnAnalyzable) {
+ for (MachineBasicBlock *SuccBB : {CurFBB, CurTBB}) {
+ if (!SuccBB)
+ continue;
+ // Analyze the branch at the end of the block before the succ.
+ MachineFunction::iterator SuccPrev = --SuccBB->getIterator();
+
+ // If this block doesn't already fall-through to that successor, and
+ // if the succ doesn't already have a block that can fall through into
+ // it, we can arrange for the fallthrough to happen.
+ if (SuccBB != MBB && &*SuccPrev != MBB &&
+ !SuccPrev->canFallThrough()) {
+ MBB->moveBefore(SuccBB);
+ MadeChange = true;
+ goto ReoptimizeBlock;
+ }
}
}
@@ -1817,9 +1776,9 @@ static MachineBasicBlock *findFalseBlock(MachineBasicBlock *BB,
}
template <class Container>
-static void addRegAndItsAliases(unsigned Reg, const TargetRegisterInfo *TRI,
+static void addRegAndItsAliases(Register Reg, const TargetRegisterInfo *TRI,
Container &Set) {
- if (Register::isPhysicalRegister(Reg)) {
+ if (Reg.isPhysical()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
Set.insert(*AI);
} else {
@@ -1838,8 +1797,8 @@ static
MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
const TargetInstrInfo *TII,
const TargetRegisterInfo *TRI,
- SmallSet<unsigned,4> &Uses,
- SmallSet<unsigned,4> &Defs) {
+ SmallSet<Register, 4> &Uses,
+ SmallSet<Register, 4> &Defs) {
MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
if (!TII->isUnpredicatedTerminator(*Loc))
return MBB->end();
@@ -1875,8 +1834,7 @@ MachineBasicBlock::iterator findHoistingInsertPosAndDeps(MachineBasicBlock *MBB,
// The terminator is probably a conditional branch, try not to separate the
// branch from condition setting instruction.
- MachineBasicBlock::iterator PI =
- skipDebugInstructionsBackward(std::prev(Loc), MBB->begin());
+ MachineBasicBlock::iterator PI = prev_nodbg(Loc, MBB->begin());
bool IsDef = false;
for (const MachineOperand &MO : PI->operands()) {
@@ -1951,14 +1909,14 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
// Find a suitable position to hoist the common instructions to. Also figure
// out which registers are used or defined by instructions from the insertion
// point to the end of the block.
- SmallSet<unsigned, 4> Uses, Defs;
+ SmallSet<Register, 4> Uses, Defs;
MachineBasicBlock::iterator Loc =
findHoistingInsertPosAndDeps(MBB, TII, TRI, Uses, Defs);
if (Loc == MBB->end())
return false;
bool HasDups = false;
- SmallSet<unsigned, 4> ActiveDefsSet, AllDefsSet;
+ SmallSet<Register, 4> ActiveDefsSet, AllDefsSet;
MachineBasicBlock::iterator TIB = TBB->begin();
MachineBasicBlock::iterator FIB = FBB->begin();
MachineBasicBlock::iterator TIE = TBB->end();
@@ -2042,7 +2000,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!AllDefsSet.count(Reg)) {
continue;
}
- if (Register::isPhysicalRegister(Reg)) {
+ if (Reg.isPhysical()) {
for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI)
ActiveDefsSet.erase(*AI);
} else {
@@ -2055,7 +2013,7 @@ bool BranchFolder::HoistCommonCodeInSuccs(MachineBasicBlock *MBB) {
if (!MO.isReg() || !MO.isDef() || MO.isDead())
continue;
Register Reg = MO.getReg();
- if (!Reg || Register::isVirtualRegister(Reg))
+ if (!Reg || Reg.isVirtual())
continue;
addRegAndItsAliases(Reg, TRI, ActiveDefsSet);
addRegAndItsAliases(Reg, TRI, AllDefsSet);
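
The hunks above switch the hoisting Uses/Defs sets from unsigned to Register while keeping the rule that a physical register pulls its whole alias set into the set. As a rough illustration of that expansion only (a made-up alias table stands in for MCRegAliasIterator, plain integers stand in for Register; none of these names are the LLVM API), a minimal standalone C++ sketch:

#include <cstdio>
#include <map>
#include <set>
#include <vector>

// Toy register numbering: values below kFirstVirtual are "physical".
constexpr unsigned kFirstVirtual = 1000;

// Hypothetical alias table, e.g. a 32-bit register aliasing its subregisters.
static const std::map<unsigned, std::vector<unsigned>> kAliases = {
    {1, {1, 11, 21}}, {2, {2, 12, 22}}};

// Mirrors the idea of addRegAndItsAliases: physical registers add every
// alias, virtual registers are inserted as-is.
static void addRegAndAliases(unsigned Reg, std::set<unsigned> &Set) {
  if (Reg < kFirstVirtual) {
    for (unsigned Alias : kAliases.at(Reg))
      Set.insert(Alias);
  } else {
    Set.insert(Reg);
  }
}

int main() {
  std::set<unsigned> Uses;
  addRegAndAliases(1, Uses);    // physical: adds 1, 11, 21
  addRegAndAliases(1005, Uses); // virtual: adds just 1005
  printf("%zu registers tracked\n", Uses.size());
}
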
diff --git a/llvm/lib/CodeGen/BranchFolding.h b/llvm/lib/CodeGen/BranchFolding.h
index 7a4c68ea09f5..49c6bcae2db4 100644
--- a/llvm/lib/CodeGen/BranchFolding.h
+++ b/llvm/lib/CodeGen/BranchFolding.h
@@ -13,7 +13,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/Support/BlockFrequency.h"
#include "llvm/Support/Compiler.h"
#include <cstdint>
#include <vector>
@@ -21,21 +20,18 @@
namespace llvm {
class BasicBlock;
-class MachineBlockFrequencyInfo;
class MachineBranchProbabilityInfo;
class MachineFunction;
class MachineLoopInfo;
class MachineModuleInfo;
class MachineRegisterInfo;
+class MBFIWrapper;
class ProfileSummaryInfo;
-class raw_ostream;
class TargetInstrInfo;
class TargetRegisterInfo;
class LLVM_LIBRARY_VISIBILITY BranchFolder {
public:
- class MBFIWrapper;
-
explicit BranchFolder(bool defaultEnableTailMerge,
bool CommonHoist,
MBFIWrapper &FreqInfo,
@@ -49,7 +45,7 @@ class TargetRegisterInfo;
/// given function. Block placement changes the layout and may create new
/// tail merging opportunities.
bool OptimizeFunction(MachineFunction &MF, const TargetInstrInfo *tii,
- const TargetRegisterInfo *tri, MachineModuleInfo *mmi,
+ const TargetRegisterInfo *tri,
MachineLoopInfo *mli = nullptr,
bool AfterPlacement = false);
@@ -128,32 +124,9 @@ class TargetRegisterInfo;
const TargetInstrInfo *TII;
const MachineRegisterInfo *MRI;
const TargetRegisterInfo *TRI;
- MachineModuleInfo *MMI;
MachineLoopInfo *MLI;
LivePhysRegs LiveRegs;
- public:
- /// This class keeps track of branch frequencies of newly created
- /// blocks and tail-merged blocks.
- class MBFIWrapper {
- public:
- MBFIWrapper(const MachineBlockFrequencyInfo &I) : MBFI(I) {}
-
- BlockFrequency getBlockFreq(const MachineBasicBlock *MBB) const;
- void setBlockFreq(const MachineBasicBlock *MBB, BlockFrequency F);
- raw_ostream &printBlockFreq(raw_ostream &OS,
- const MachineBasicBlock *MBB) const;
- raw_ostream &printBlockFreq(raw_ostream &OS,
- const BlockFrequency Freq) const;
- void view(const Twine &Name, bool isSimple = true);
- uint64_t getEntryFreq() const;
- const MachineBlockFrequencyInfo &getMBFI() { return MBFI; }
-
- private:
- const MachineBlockFrequencyInfo &MBFI;
- DenseMap<const MachineBasicBlock *, BlockFrequency> MergedBBFreq;
- };
-
private:
MBFIWrapper &MBBFreqInfo;
const MachineBranchProbabilityInfo &MBPI;
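
The header change removes the nested MBFIWrapper and forward-declares a standalone class instead. Per the deleted code, the wrapper answers block-frequency queries from an override map for blocks the branch folder created or tail-merged, and otherwise defers to the underlying analysis. A minimal standalone sketch of that pattern, assuming toy types (ToyBlock, ToyMBFI and FreqWrapper are invented names, not the real LLVM classes):

#include <cstdint>
#include <cstdio>
#include <unordered_map>

struct ToyBlock { int Number; };

struct ToyMBFI {
  uint64_t getBlockFreq(const ToyBlock *BB) const {
    return 100 + BB->Number; // pretend analysis result
  }
};

// Answer queries from an override map first, fall back to the analysis.
class FreqWrapper {
  const ToyMBFI &MBFI;
  std::unordered_map<const ToyBlock *, uint64_t> Overrides;

public:
  explicit FreqWrapper(const ToyMBFI &I) : MBFI(I) {}
  uint64_t getBlockFreq(const ToyBlock *BB) const {
    auto It = Overrides.find(BB);
    return It != Overrides.end() ? It->second : MBFI.getBlockFreq(BB);
  }
  void setBlockFreq(const ToyBlock *BB, uint64_t F) { Overrides[BB] = F; }
};

int main() {
  ToyMBFI MBFI;
  ToyBlock NewBB{7};
  FreqWrapper W(MBFI);
  printf("before: %llu\n", (unsigned long long)W.getBlockFreq(&NewBB));
  W.setBlockFreq(&NewBB, 42); // record a frequency for a freshly created block
  printf("after: %llu\n", (unsigned long long)W.getBlockFreq(&NewBB));
}
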
diff --git a/llvm/lib/CodeGen/BranchRelaxation.cpp b/llvm/lib/CodeGen/BranchRelaxation.cpp
index f05517d178ae..5a3ec1a36f96 100644
--- a/llvm/lib/CodeGen/BranchRelaxation.cpp
+++ b/llvm/lib/CodeGen/BranchRelaxation.cpp
@@ -67,16 +67,13 @@ class BranchRelaxation : public MachineFunctionPass {
unsigned postOffset(const MachineBasicBlock &MBB) const {
const unsigned PO = Offset + Size;
const Align Alignment = MBB.getAlignment();
- if (Alignment == 1)
- return PO;
-
const Align ParentAlign = MBB.getParent()->getAlignment();
if (Alignment <= ParentAlign)
- return PO + offsetToAlignment(PO, Alignment);
+ return alignTo(PO, Alignment);
// The alignment of this MBB is larger than the function's alignment, so we
// can't tell whether or not it will insert nops. Assume that it will.
- return PO + Alignment.value() + offsetToAlignment(PO, Alignment);
+ return alignTo(PO, Alignment) + Alignment.value() - ParentAlign.value();
}
};
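
The new postOffset formula tightens the worst-case padding estimate: when a block is aligned more strictly than its function, at most Alignment - ParentAlign bytes of padding are unknown, instead of a full extra Alignment on top of the aligned offset. A standalone arithmetic sketch with made-up values (PO, BlockAlign, FuncAlign are illustrative only):

#include <cstdint>
#include <cstdio>

// Round Value up to the next multiple of Align (Align is a power of two).
static uint64_t alignTo(uint64_t Value, uint64_t Align) {
  return (Value + Align - 1) & ~(Align - 1);
}

int main() {
  const uint64_t PO = 100;        // offset + size of the layout predecessor
  const uint64_t BlockAlign = 16; // alignment requested for this MBB
  const uint64_t FuncAlign = 4;   // alignment of the enclosing function

  // Old worst-case bound: aligned offset plus a full extra BlockAlign.
  uint64_t oldBound = alignTo(PO, BlockAlign) + BlockAlign;
  // New worst-case bound: FuncAlign bytes are already guaranteed, so only
  // BlockAlign - FuncAlign bytes of possible nop padding remain unknown.
  uint64_t newBound = alignTo(PO, BlockAlign) + BlockAlign - FuncAlign;

  printf("old=%llu new=%llu\n", (unsigned long long)oldBound,
         (unsigned long long)newBound); // old=128 new=124
  return 0;
}
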
@@ -129,7 +126,6 @@ void BranchRelaxation::verify() {
unsigned PrevNum = MF->begin()->getNumber();
for (MachineBasicBlock &MBB : *MF) {
const unsigned Num = MBB.getNumber();
- assert(isAligned(MBB.getAlignment(), BlockInfo[Num].Offset));
assert(!Num || BlockInfo[PrevNum].postOffset(MBB) <= BlockInfo[Num].Offset);
assert(BlockInfo[Num].Size == computeBlockSize(MBB));
PrevNum = Num;
@@ -195,10 +191,9 @@ unsigned BranchRelaxation::getInstrOffset(const MachineInstr &MI) const {
void BranchRelaxation::adjustBlockOffsets(MachineBasicBlock &Start) {
unsigned PrevNum = Start.getNumber();
- for (auto &MBB : make_range(MachineFunction::iterator(Start), MF->end())) {
+ for (auto &MBB :
+ make_range(std::next(MachineFunction::iterator(Start)), MF->end())) {
unsigned Num = MBB.getNumber();
- if (!Num) // block zero is never changed from offset zero.
- continue;
// Get the offset and known bits at the end of the layout predecessor.
// Include the alignment of the current block.
BlockInfo[Num].Offset = BlockInfo[PrevNum].postOffset(MBB);
@@ -250,8 +245,7 @@ MachineBasicBlock *BranchRelaxation::splitBlockBeforeInstr(MachineInstr &MI,
// Cleanup potential unconditional branch to successor block.
// Note that updateTerminator may change the size of the blocks.
- NewBB->updateTerminator();
- OrigBB->updateTerminator();
+ OrigBB->updateTerminator(NewBB);
// Figure out how large the OrigBB is. As the first half of the original
// block, it cannot contain a tablejump. The size includes
diff --git a/llvm/lib/CodeGen/BreakFalseDeps.cpp b/llvm/lib/CodeGen/BreakFalseDeps.cpp
index 9bae9d36add1..b01a264dd97d 100644
--- a/llvm/lib/CodeGen/BreakFalseDeps.cpp
+++ b/llvm/lib/CodeGen/BreakFalseDeps.cpp
@@ -106,9 +106,18 @@ FunctionPass *llvm::createBreakFalseDeps() { return new BreakFalseDeps(); }
bool BreakFalseDeps::pickBestRegisterForUndef(MachineInstr *MI, unsigned OpIdx,
unsigned Pref) {
+
+ // We can't change tied operands.
+ if (MI->isRegTiedToDefOperand(OpIdx))
+ return false;
+
MachineOperand &MO = MI->getOperand(OpIdx);
assert(MO.isUndef() && "Expected undef machine operand");
+ // We can't change registers that aren't renamable.
+ if (!MO.isRenamable())
+ return false;
+
Register OriginalReg = MO.getReg();
// Update only undef operands that have reg units that are mapped to one root.
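
The two early-outs added above mean an undef operand is only considered for a different register when it is neither tied to a def nor marked non-renamable. A toy predicate capturing just that gating logic (ToyOperand and mayPickDifferentRegister are invented for illustration, not the pass's API):

#include <cstdio>

// Simplified stand-in for a machine operand.
struct ToyOperand {
  bool TiedToDef; // tied to a def, must keep its register
  bool Renamable; // the register may be swapped for another one
  bool Undef;     // value is undefined, only the false dependency matters
};

// Mirrors the new early-outs in pickBestRegisterForUndef.
static bool mayPickDifferentRegister(const ToyOperand &MO) {
  if (MO.TiedToDef)
    return false;
  if (!MO.Renamable)
    return false;
  return MO.Undef;
}

int main() {
  ToyOperand Free{false, true, true}, Tied{true, true, true};
  printf("%d %d\n", mayPickDifferentRegister(Free),
         mayPickDifferentRegister(Tied)); // 1 0
}
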
diff --git a/llvm/lib/CodeGen/CFIInstrInserter.cpp b/llvm/lib/CodeGen/CFIInstrInserter.cpp
index ef548c84d3c0..23c7fea01f28 100644
--- a/llvm/lib/CodeGen/CFIInstrInserter.cpp
+++ b/llvm/lib/CodeGen/CFIInstrInserter.cpp
@@ -18,6 +18,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/SetOperations.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
@@ -76,15 +78,32 @@ class CFIInstrInserter : public MachineFunctionPass {
unsigned IncomingCFARegister = 0;
/// Value of cfa register valid at basic block exit.
unsigned OutgoingCFARegister = 0;
+ /// Set of callee saved registers saved at basic block entry.
+ BitVector IncomingCSRSaved;
+ /// Set of callee saved registers saved at basic block exit.
+ BitVector OutgoingCSRSaved;
/// If in/out cfa offset and register values for this block have already
/// been set or not.
bool Processed = false;
};
+#define INVALID_REG UINT_MAX
+#define INVALID_OFFSET INT_MAX
+ /// Contains the location where a CSR register is saved.
+ struct CSRSavedLocation {
+ CSRSavedLocation(Optional<unsigned> R, Optional<int> O)
+ : Reg(R), Offset(O) {}
+ Optional<unsigned> Reg;
+ Optional<int> Offset;
+ };
+
/// Contains cfa offset and register values valid at entry and exit of basic
/// blocks.
std::vector<MBBCFAInfo> MBBVector;
+ /// Map the callee save registers to the locations where they are saved.
+ SmallDenseMap<unsigned, CSRSavedLocation, 16> CSRLocMap;
+
/// Calculate cfa offset and register values valid at entry and exit for all
/// basic blocks in a function.
void calculateCFAInfo(MachineFunction &MF);
@@ -105,10 +124,11 @@ class CFIInstrInserter : public MachineFunctionPass {
/// if needed. The negated value is needed when creating CFI instructions that
/// set absolute offset.
int getCorrectCFAOffset(MachineBasicBlock *MBB) {
- return -MBBVector[MBB->getNumber()].IncomingCFAOffset;
+ return MBBVector[MBB->getNumber()].IncomingCFAOffset;
}
- void report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ);
+ void reportCFAError(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ);
+ void reportCSRError(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ);
/// Go through each MBB in a function and check that outgoing offset and
/// register of its predecessors match incoming offset and register of that
/// MBB, as well as that incoming offset and register of its successors match
@@ -132,6 +152,8 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
// function.
unsigned InitialRegister =
MF.getSubtarget().getFrameLowering()->getInitialCFARegister(MF);
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ unsigned NumRegs = TRI.getNumRegs();
// Initialize MBBMap.
for (MachineBasicBlock &MBB : MF) {
@@ -141,17 +163,17 @@ void CFIInstrInserter::calculateCFAInfo(MachineFunction &MF) {
MBBInfo.OutgoingCFAOffset = InitialOffset;
MBBInfo.IncomingCFARegister = InitialRegister;
MBBInfo.OutgoingCFARegister = InitialRegister;
+ MBBInfo.IncomingCSRSaved.resize(NumRegs);
+ MBBInfo.OutgoingCSRSaved.resize(NumRegs);
MBBVector[MBB.getNumber()] = MBBInfo;
}
+ CSRLocMap.clear();
// Set in/out cfa info for all blocks in the function. This traversal is based
// on the assumption that the first block in the function is the entry block
// i.e. that it has initial cfa offset and register values as incoming CFA
// information.
- for (MachineBasicBlock &MBB : MF) {
- if (MBBVector[MBB.getNumber()].Processed) continue;
- updateSuccCFAInfo(MBBVector[MBB.getNumber()]);
- }
+ updateSuccCFAInfo(MBBVector[MF.front().getNumber()]);
}
void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
@@ -159,12 +181,17 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
int SetOffset = MBBInfo.IncomingCFAOffset;
// Outgoing cfa register set by the block.
unsigned SetRegister = MBBInfo.IncomingCFARegister;
- const std::vector<MCCFIInstruction> &Instrs =
- MBBInfo.MBB->getParent()->getFrameInstructions();
+ MachineFunction *MF = MBBInfo.MBB->getParent();
+ const std::vector<MCCFIInstruction> &Instrs = MF->getFrameInstructions();
+ const TargetRegisterInfo &TRI = *MF->getSubtarget().getRegisterInfo();
+ unsigned NumRegs = TRI.getNumRegs();
+ BitVector CSRSaved(NumRegs), CSRRestored(NumRegs);
// Determine cfa offset and register set by the block.
for (MachineInstr &MI : *MBBInfo.MBB) {
if (MI.isCFIInstruction()) {
+ Optional<unsigned> CSRReg;
+ Optional<int> CSROffset;
unsigned CFIIndex = MI.getOperand(0).getCFIIndex();
const MCCFIInstruction &CFI = Instrs[CFIIndex];
switch (CFI.getOperation()) {
@@ -181,6 +208,18 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
SetRegister = CFI.getRegister();
SetOffset = CFI.getOffset();
break;
+ case MCCFIInstruction::OpOffset:
+ CSROffset = CFI.getOffset();
+ break;
+ case MCCFIInstruction::OpRegister:
+ CSRReg = CFI.getRegister2();
+ break;
+ case MCCFIInstruction::OpRelOffset:
+ CSROffset = CFI.getOffset() - SetOffset;
+ break;
+ case MCCFIInstruction::OpRestore:
+ CSRRestored.set(CFI.getRegister());
+ break;
case MCCFIInstruction::OpRememberState:
// TODO: Add support for handling cfi_remember_state.
#ifndef NDEBUG
@@ -198,18 +237,24 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
#endif
break;
// Other CFI directives do not affect CFA value.
+ case MCCFIInstruction::OpUndefined:
case MCCFIInstruction::OpSameValue:
- case MCCFIInstruction::OpOffset:
- case MCCFIInstruction::OpRelOffset:
case MCCFIInstruction::OpEscape:
- case MCCFIInstruction::OpRestore:
- case MCCFIInstruction::OpUndefined:
- case MCCFIInstruction::OpRegister:
case MCCFIInstruction::OpWindowSave:
case MCCFIInstruction::OpNegateRAState:
case MCCFIInstruction::OpGnuArgsSize:
break;
}
+ if (CSRReg || CSROffset) {
+ auto It = CSRLocMap.find(CFI.getRegister());
+ if (It == CSRLocMap.end()) {
+ CSRLocMap.insert(
+ {CFI.getRegister(), CSRSavedLocation(CSRReg, CSROffset)});
+ } else if (It->second.Reg != CSRReg || It->second.Offset != CSROffset) {
+ llvm_unreachable("Different saved locations for the same CSR");
+ }
+ CSRSaved.set(CFI.getRegister());
+ }
}
}
@@ -218,6 +263,11 @@ void CFIInstrInserter::calculateOutgoingCFAInfo(MBBCFAInfo &MBBInfo) {
// Update outgoing CFA info.
MBBInfo.OutgoingCFAOffset = SetOffset;
MBBInfo.OutgoingCFARegister = SetRegister;
+
+ // Update outgoing CSR info.
+ MBBInfo.OutgoingCSRSaved = MBBInfo.IncomingCSRSaved;
+ MBBInfo.OutgoingCSRSaved |= CSRSaved;
+ MBBInfo.OutgoingCSRSaved.reset(CSRRestored);
}
void CFIInstrInserter::updateSuccCFAInfo(MBBCFAInfo &MBBInfo) {
@@ -227,15 +277,13 @@ void CFIInstrInserter::updateSuccCFAInfo(MBBCFAInfo &MBBInfo) {
do {
MachineBasicBlock *Current = Stack.pop_back_val();
MBBCFAInfo &CurrentInfo = MBBVector[Current->getNumber()];
- if (CurrentInfo.Processed)
- continue;
-
calculateOutgoingCFAInfo(CurrentInfo);
for (auto *Succ : CurrentInfo.MBB->successors()) {
MBBCFAInfo &SuccInfo = MBBVector[Succ->getNumber()];
if (!SuccInfo.Processed) {
SuccInfo.IncomingCFAOffset = CurrentInfo.OutgoingCFAOffset;
SuccInfo.IncomingCFARegister = CurrentInfo.OutgoingCFARegister;
+ SuccInfo.IncomingCSRSaved = CurrentInfo.OutgoingCSRSaved;
Stack.push_back(Succ);
}
}
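
The CSR tracking added above reduces to simple bit-set algebra per block: start from the registers already saved on entry, add those saved by the block's CFI directives, drop those restored, and hand the result to each successor as its incoming set. A standalone sketch of that update using std::bitset (register numbers and set sizes are made up):

#include <bitset>
#include <cstdio>

int main() {
  constexpr int NumRegs = 64; // toy register count
  std::bitset<NumRegs> Incoming, Saved, Restored;

  Incoming.set(6); // CSR 6 already saved on entry to the block
  Saved.set(3);    // .cfi_offset for CSR 3 seen inside this block
  Restored.set(6); // .cfi_restore for CSR 6 seen inside this block

  // Same update as calculateOutgoingCFAInfo: incoming, plus newly saved,
  // minus restored. Successors take this as their incoming set.
  std::bitset<NumRegs> Outgoing = (Incoming | Saved) & ~Restored;

  printf("outgoing has CSR 3: %d, CSR 6: %d\n", (int)Outgoing[3],
         (int)Outgoing[6]); // 1, 0
}
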
@@ -255,29 +303,31 @@ bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
auto MBBI = MBBInfo.MBB->begin();
DebugLoc DL = MBBInfo.MBB->findDebugLoc(MBBI);
- if (PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset) {
+ // If the current MBB will be placed in a unique section, a full DefCfa
+ // must be emitted.
+ const bool ForceFullCFA = MBB.isBeginSection();
+
+ if ((PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset &&
+ PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) ||
+ ForceFullCFA) {
// If both outgoing offset and register of a previous block don't match
- // incoming offset and register of this block, add a def_cfa instruction
- // with the correct offset and register for this block.
- if (PrevMBBInfo->OutgoingCFARegister != MBBInfo.IncomingCFARegister) {
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
- nullptr, MBBInfo.IncomingCFARegister, getCorrectCFAOffset(&MBB)));
- BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- // If outgoing offset of a previous block doesn't match incoming offset
- // of this block, add a def_cfa_offset instruction with the correct
- // offset for this block.
- } else {
- unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(
- nullptr, getCorrectCFAOffset(&MBB)));
- BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
- }
+ // incoming offset and register of this block, or if this block begins a
+ // section, add a def_cfa instruction with the correct offset and
+ // register for this block.
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+ nullptr, MBBInfo.IncomingCFARegister, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ } else if (PrevMBBInfo->OutgoingCFAOffset != MBBInfo.IncomingCFAOffset) {
+ // If outgoing offset of a previous block doesn't match incoming offset
+ // of this block, add a def_cfa_offset instruction with the correct
+ // offset for this block.
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(
+ nullptr, getCorrectCFAOffset(&MBB)));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
InsertedCFIInstr = true;
- // If outgoing register of a previous block doesn't match incoming
- // register of this block, add a def_cfa_register instruction with the
- // correct register for this block.
} else if (PrevMBBInfo->OutgoingCFARegister !=
MBBInfo.IncomingCFARegister) {
unsigned CFIIndex =
@@ -287,12 +337,53 @@ bool CFIInstrInserter::insertCFIInstrs(MachineFunction &MF) {
.addCFIIndex(CFIIndex);
InsertedCFIInstr = true;
}
+
+ if (ForceFullCFA) {
+ MF.getSubtarget().getFrameLowering()->emitCalleeSavedFrameMoves(
+ *MBBInfo.MBB, MBBI);
+ InsertedCFIInstr = true;
+ PrevMBBInfo = &MBBInfo;
+ continue;
+ }
+
+ BitVector SetDifference = PrevMBBInfo->OutgoingCSRSaved;
+ SetDifference.reset(MBBInfo.IncomingCSRSaved);
+ for (int Reg : SetDifference.set_bits()) {
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::createRestore(nullptr, Reg));
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+
+ SetDifference = MBBInfo.IncomingCSRSaved;
+ SetDifference.reset(PrevMBBInfo->OutgoingCSRSaved);
+ for (int Reg : SetDifference.set_bits()) {
+ auto it = CSRLocMap.find(Reg);
+ assert(it != CSRLocMap.end() && "Reg should have an entry in CSRLocMap");
+ unsigned CFIIndex;
+ CSRSavedLocation RO = it->second;
+ if (!RO.Reg && RO.Offset) {
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createOffset(nullptr, Reg, *RO.Offset));
+ } else if (RO.Reg && !RO.Offset) {
+ CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createRegister(nullptr, Reg, *RO.Reg));
+ } else {
+ llvm_unreachable("RO.Reg and RO.Offset cannot both be valid/invalid");
+ }
+ BuildMI(*MBBInfo.MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ InsertedCFIInstr = true;
+ }
+
PrevMBBInfo = &MBBInfo;
}
return InsertedCFIInstr;
}
-void CFIInstrInserter::report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ) {
+void CFIInstrInserter::reportCFAError(const MBBCFAInfo &Pred,
+ const MBBCFAInfo &Succ) {
errs() << "*** Inconsistent CFA register and/or offset between pred and succ "
"***\n";
errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
@@ -307,6 +398,22 @@ void CFIInstrInserter::report(const MBBCFAInfo &Pred, const MBBCFAInfo &Succ) {
<< " incoming CFA Offset:" << Succ.IncomingCFAOffset << "\n";
}
+void CFIInstrInserter::reportCSRError(const MBBCFAInfo &Pred,
+ const MBBCFAInfo &Succ) {
+ errs() << "*** Inconsistent CSR Saved between pred and succ in function "
+ << Pred.MBB->getParent()->getName() << " ***\n";
+ errs() << "Pred: " << Pred.MBB->getName() << " #" << Pred.MBB->getNumber()
+ << " outgoing CSR Saved: ";
+ for (int Reg : Pred.OutgoingCSRSaved.set_bits())
+ errs() << Reg << " ";
+ errs() << "\n";
+ errs() << "Succ: " << Succ.MBB->getName() << " #" << Succ.MBB->getNumber()
+ << " incoming CSR Saved: ";
+ for (int Reg : Succ.IncomingCSRSaved.set_bits())
+ errs() << Reg << " ";
+ errs() << "\n";
+}
+
unsigned CFIInstrInserter::verify(MachineFunction &MF) {
unsigned ErrorNum = 0;
for (auto *CurrMBB : depth_first(&MF)) {
@@ -321,7 +428,13 @@ unsigned CFIInstrInserter::verify(MachineFunction &MF) {
// we don't generate epilogues inside such blocks.
if (SuccMBBInfo.MBB->succ_empty() && !SuccMBBInfo.MBB->isReturnBlock())
continue;
- report(CurrMBBInfo, SuccMBBInfo);
+ reportCFAError(CurrMBBInfo, SuccMBBInfo);
+ ErrorNum++;
+ }
+ // Check that IncomingCSRSaved of every successor matches the
+ // OutgoingCSRSaved of CurrMBB
+ if (SuccMBBInfo.IncomingCSRSaved != CurrMBBInfo.OutgoingCSRSaved) {
+ reportCSRError(CurrMBBInfo, SuccMBBInfo);
ErrorNum++;
}
}
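
At a layout boundary, the inserter compares the previous block's outgoing saved set with the next block's incoming set: registers only in the former get a .cfi_restore, registers only in the latter get their original saving directive re-emitted from CSRLocMap. A small standalone model of those two set differences (toy sizes; printf stands in for BuildMI, and the offset text is a placeholder):

#include <bitset>
#include <cstdio>

int main() {
  constexpr int NumRegs = 16; // toy register count
  std::bitset<NumRegs> PrevOutgoing, CurIncoming;
  PrevOutgoing.set(3);
  PrevOutgoing.set(6);
  CurIncoming.set(6);
  CurIncoming.set(9);

  // Saved in the previous block but not expected here: emit a restore.
  std::bitset<NumRegs> NeedRestore = PrevOutgoing & ~CurIncoming;
  // Expected here but not saved by the previous block: re-emit the saving
  // directive recorded for that register.
  std::bitset<NumRegs> NeedSave = CurIncoming & ~PrevOutgoing;

  for (int R = 0; R < NumRegs; ++R) {
    if (NeedRestore[R])
      printf(".cfi_restore r%d\n", R);
    if (NeedSave[R])
      printf(".cfi_offset r%d, <offset from CSRLocMap>\n", R);
  }
}
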
diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp
index bf97aaee3665..5d6ee09c8438 100644
--- a/llvm/lib/CodeGen/CalcSpillWeights.cpp
+++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp
@@ -203,9 +203,10 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
};
std::set<CopyHint> CopyHints;
- for (MachineRegisterInfo::reg_instr_iterator
- I = mri.reg_instr_begin(li.reg), E = mri.reg_instr_end();
- I != E; ) {
+ for (MachineRegisterInfo::reg_instr_nodbg_iterator
+ I = mri.reg_instr_nodbg_begin(li.reg),
+ E = mri.reg_instr_nodbg_end();
+ I != E;) {
MachineInstr *mi = &*(I++);
// For local split artifacts, we are interested only in instructions between
@@ -215,7 +216,7 @@ float VirtRegAuxInfo::weightCalcHelper(LiveInterval &li, SlotIndex *start,
continue;
numInstr++;
- if (mi->isIdentityCopy() || mi->isImplicitDef() || mi->isDebugInstr())
+ if (mi->isIdentityCopy() || mi->isImplicitDef())
continue;
if (!visited.insert(mi).second)
continue;
diff --git a/llvm/lib/CodeGen/CallingConvLower.cpp b/llvm/lib/CodeGen/CallingConvLower.cpp
index a397039180a4..3d8c2c8b00aa 100644
--- a/llvm/lib/CodeGen/CallingConvLower.cpp
+++ b/llvm/lib/CodeGen/CallingConvLower.cpp
@@ -42,29 +42,27 @@ CCState::CCState(CallingConv::ID CC, bool isVarArg, MachineFunction &mf,
/// its parameter attribute.
void CCState::HandleByVal(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, int MinSize,
- int MinAlignment, ISD::ArgFlagsTy ArgFlags) {
- Align MinAlign(MinAlignment);
- Align Alignment(ArgFlags.getByValAlign());
+ Align MinAlign, ISD::ArgFlagsTy ArgFlags) {
+ Align Alignment = ArgFlags.getNonZeroByValAlign();
unsigned Size = ArgFlags.getByValSize();
if (MinSize > (int)Size)
Size = MinSize;
if (MinAlign > Alignment)
Alignment = MinAlign;
ensureMaxAlignment(Alignment);
- MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size,
- Alignment.value());
+ MF.getSubtarget().getTargetLowering()->HandleByVal(this, Size, Alignment);
Size = unsigned(alignTo(Size, MinAlign));
- unsigned Offset = AllocateStack(Size, Alignment.value());
+ unsigned Offset = AllocateStack(Size, Alignment);
addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
}
/// Mark a register and all of its aliases as allocated.
-void CCState::MarkAllocated(unsigned Reg) {
+void CCState::MarkAllocated(MCPhysReg Reg) {
for (MCRegAliasIterator AI(Reg, &TRI, true); AI.isValid(); ++AI)
- UsedRegs[*AI/32] |= 1 << (*AI&31);
+ UsedRegs[*AI / 32] |= 1 << (*AI & 31);
}
-bool CCState::IsShadowAllocatedReg(unsigned Reg) const {
+bool CCState::IsShadowAllocatedReg(MCRegister Reg) const {
if (!isAllocated(Reg))
return false;
@@ -276,18 +274,14 @@ bool CCState::resultsCompatible(CallingConv::ID CalleeCC,
for (unsigned I = 0, E = RVLocs1.size(); I != E; ++I) {
const CCValAssign &Loc1 = RVLocs1[I];
const CCValAssign &Loc2 = RVLocs2[I];
- if (Loc1.getLocInfo() != Loc2.getLocInfo())
- return false;
- bool RegLoc1 = Loc1.isRegLoc();
- if (RegLoc1 != Loc2.isRegLoc())
+
+ if ( // Must both be in registers, or both in memory
+ Loc1.isRegLoc() != Loc2.isRegLoc() ||
+ // Must fill the same part of their locations
+ Loc1.getLocInfo() != Loc2.getLocInfo() ||
+ // Memory offset/register number must be the same
+ Loc1.getExtraInfo() != Loc2.getExtraInfo())
return false;
- if (RegLoc1) {
- if (Loc1.getLocReg() != Loc2.getLocReg())
- return false;
- } else {
- if (Loc1.getLocMemOffset() != Loc2.getLocMemOffset())
- return false;
- }
}
return true;
}
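
The rewritten check relies on getExtraInfo() covering both cases: for a register location it is the register number, for a memory location the stack offset, so one comparison replaces the old register/memory branches. A toy model of that unified predicate (ToyLoc and locsCompatible are invented for illustration, not CCValAssign itself):

#include <cstdint>
#include <cstdio>

// Either a register or a stack slot, plus how the value is placed there.
// Extra plays the role of getExtraInfo(): register number or memory offset.
struct ToyLoc {
  bool IsReg;
  int LocInfo;
  uint64_t Extra;
};

// Two return-value locations are compatible only if all three fields agree.
static bool locsCompatible(const ToyLoc &A, const ToyLoc &B) {
  return A.IsReg == B.IsReg && A.LocInfo == B.LocInfo && A.Extra == B.Extra;
}

int main() {
  ToyLoc InReg{true, 0, /*reg=*/5}, InOtherReg{true, 0, /*reg=*/6};
  ToyLoc OnStack{false, 0, /*offset=*/16};
  printf("%d %d\n", locsCompatible(InReg, InOtherReg),
         locsCompatible(InReg, OnStack)); // 0 0
}
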
diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp
index 20fc67cc66ae..7a8c022c82da 100644
--- a/llvm/lib/CodeGen/CodeGen.cpp
+++ b/llvm/lib/CodeGen/CodeGen.cpp
@@ -20,12 +20,14 @@ using namespace llvm;
/// initializeCodeGen - Initialize all passes linked into the CodeGen library.
void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeAtomicExpandPass(Registry);
+ initializeBBSectionsPreparePass(Registry);
initializeBranchFolderPassPass(Registry);
initializeBranchRelaxationPass(Registry);
initializeCFGuardLongjmpPass(Registry);
initializeCFIInstrInserterPass(Registry);
initializeCodeGenPreparePass(Registry);
initializeDeadMachineInstructionElimPass(Registry);
+ initializeDebugifyMachineModulePass(Registry);
initializeDetectDeadLanesPass(Registry);
initializeDwarfEHPreparePass(Registry);
initializeEarlyIfConverterPass(Registry);
@@ -37,6 +39,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeFEntryInserterPass(Registry);
initializeFinalizeISelPass(Registry);
initializeFinalizeMachineBundlesPass(Registry);
+ initializeFixupStatepointCallerSavedPass(Registry);
initializeFuncletLayoutPass(Registry);
initializeGCMachineCodeAnalysisPass(Registry);
initializeGCModuleInfoPass(Registry);
@@ -97,11 +100,13 @@ void llvm::initializeCodeGen(PassRegistry &Registry) {
initializeSafeStackLegacyPassPass(Registry);
initializeScalarizeMaskedMemIntrinPass(Registry);
initializeShrinkWrapPass(Registry);
+ initializeSjLjEHPreparePass(Registry);
initializeSlotIndexesPass(Registry);
initializeStackColoringPass(Registry);
initializeStackMapLivenessPass(Registry);
initializeStackProtectorPass(Registry);
initializeStackSlotColoringPass(Registry);
+ initializeStripDebugMachineModulePass(Registry);
initializeTailDuplicatePass(Registry);
initializeTargetPassConfigPass(Registry);
initializeTwoAddressInstructionPassPass(Registry);
diff --git a/llvm/lib/CodeGen/CodeGenPrepare.cpp b/llvm/lib/CodeGen/CodeGenPrepare.cpp
index f05afd058746..e8b8e6c93cf0 100644
--- a/llvm/lib/CodeGen/CodeGenPrepare.cpp
+++ b/llvm/lib/CodeGen/CodeGenPrepare.cpp
@@ -43,7 +43,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -61,7 +60,6 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/IntrinsicsAArch64.h"
-#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
@@ -178,6 +176,17 @@ static cl::opt<bool> ProfileGuidedSectionPrefix(
"profile-guided-section-prefix", cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::desc("Use profile info to add section prefix for hot/cold functions"));
+static cl::opt<bool> ProfileUnknownInSpecialSection(
+ "profile-unknown-in-special-section", cl::Hidden, cl::init(false),
+ cl::ZeroOrMore,
+ cl::desc("In profiling mode like sampleFDO, if a function doesn't have "
+ "profile, we cannot tell the function is cold for sure because "
+ "it may be a function newly added without ever being sampled. "
+ "With the flag enabled, compiler can put such profile unknown "
+ "functions into a special section, so runtime system can choose "
+ "to handle it in a different way than .text section, to save "
+ "RAM for example. "));
+
static cl::opt<unsigned> FreqRatioToSkipMerge(
"cgp-freq-ratio-to-skip-merge", cl::Hidden, cl::init(2),
cl::desc("Skip merging empty blocks if (frequency of empty block) / "
@@ -230,6 +239,15 @@ static cl::opt<bool> EnableICMP_EQToICMP_ST(
"cgp-icmp-eq2icmp-st", cl::Hidden, cl::init(false),
cl::desc("Enable ICMP_EQ to ICMP_S(L|G)T conversion."));
+static cl::opt<bool>
+ VerifyBFIUpdates("cgp-verify-bfi-updates", cl::Hidden, cl::init(false),
+ cl::desc("Enable BFI update verification for "
+ "CodeGenPrepare."));
+
+static cl::opt<bool> OptimizePhiTypes(
+ "cgp-optimize-phi-types", cl::Hidden, cl::init(false),
+ cl::desc("Enable converting phi types in CodeGenPrepare"));
+
namespace {
enum ExtType {
@@ -327,6 +345,7 @@ class TypePromotionTransaction;
// FIXME: When we can selectively preserve passes, preserve the domtree.
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
+ AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
}
@@ -368,12 +387,14 @@ class TypePromotionTransaction;
bool optimizeInst(Instruction *I, bool &ModifiedDT);
bool optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *AccessTy, unsigned AddrSpace);
+ bool optimizeGatherScatterInst(Instruction *MemoryInst, Value *Ptr);
bool optimizeInlineAsmInst(CallInst *CS);
bool optimizeCallInst(CallInst *CI, bool &ModifiedDT);
bool optimizeExt(Instruction *&I);
bool optimizeExtUses(Instruction *I);
bool optimizeLoadExt(LoadInst *Load);
bool optimizeShiftInst(BinaryOperator *BO);
+ bool optimizeFunnelShift(IntrinsicInst *Fsh);
bool optimizeSelectInst(SelectInst *SI);
bool optimizeShuffleVectorInst(ShuffleVectorInst *SVI);
bool optimizeSwitchInst(SwitchInst *SI);
@@ -389,20 +410,25 @@ class TypePromotionTransaction;
unsigned CreatedInstsCost = 0);
bool mergeSExts(Function &F);
bool splitLargeGEPOffsets();
+ bool optimizePhiType(PHINode *Inst, SmallPtrSetImpl<PHINode *> &Visited,
+ SmallPtrSetImpl<Instruction *> &DeletedInstrs);
+ bool optimizePhiTypes(Function &F);
bool performAddressTypePromotion(
Instruction *&Inst,
bool AllowPromotionWithoutCommonHeader,
bool HasPromoted, TypePromotionTransaction &TPT,
SmallVectorImpl<Instruction *> &SpeculativelyMovedExts);
bool splitBranchCondition(Function &F, bool &ModifiedDT);
- bool simplifyOffsetableRelocate(Instruction &I);
+ bool simplifyOffsetableRelocate(GCStatepointInst &I);
bool tryToSinkFreeOperands(Instruction *I);
- bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, CmpInst *Cmp,
+ bool replaceMathCmpWithIntrinsic(BinaryOperator *BO, Value *Arg0,
+ Value *Arg1, CmpInst *Cmp,
Intrinsic::ID IID);
bool optimizeCmp(CmpInst *Cmp, bool &ModifiedDT);
bool combineToUSubWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
bool combineToUAddWithOverflow(CmpInst *Cmp, bool &ModifiedDT);
+ void verifyBFIUpdates(Function &F);
};
} // end anonymous namespace
@@ -428,12 +454,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
InsertedInsts.clear();
PromotedInsts.clear();
- if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
- TM = &TPC->getTM<TargetMachine>();
- SubtargetInfo = TM->getSubtargetImpl(F);
- TLI = SubtargetInfo->getTargetLowering();
- TRI = SubtargetInfo->getRegisterInfo();
- }
+ TM = &getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
+ SubtargetInfo = TM->getSubtargetImpl(F);
+ TLI = SubtargetInfo->getTargetLowering();
+ TRI = SubtargetInfo->getRegisterInfo();
TLInfo = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
@@ -446,14 +470,16 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
F.setSectionPrefix(".hot");
else if (PSI->isFunctionColdInCallGraph(&F, *BFI))
F.setSectionPrefix(".unlikely");
+ else if (ProfileUnknownInSpecialSection && PSI->hasPartialSampleProfile() &&
+ PSI->isFunctionHotnessUnknown(F))
+ F.setSectionPrefix(".unknown");
}
/// This optimization identifies DIV instructions that can be
/// profitably bypassed and carried out with a shorter, faster divide.
- if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI &&
- TLI->isSlowDivBypassed()) {
+ if (!OptSize && !PSI->hasHugeWorkingSetSize() && TLI->isSlowDivBypassed()) {
const DenseMap<unsigned int, unsigned int> &BypassWidths =
- TLI->getBypassSlowDivWidths();
+ TLI->getBypassSlowDivWidths();
BasicBlock* BB = &*F.begin();
while (BB != nullptr) {
// bypassSlowDivision may create new BBs, but we don't want to reapply the
@@ -495,6 +521,10 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
MadeChange |= mergeSExts(F);
if (!LargeOffsetGEPMap.empty())
MadeChange |= splitLargeGEPOffsets();
+ MadeChange |= optimizePhiTypes(F);
+
+ if (MadeChange)
+ eliminateFallThrough(F);
// Really free removed instructions during promotion.
for (Instruction *I : RemovedInsts)
@@ -550,11 +580,11 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
}
if (!DisableGCOpts) {
- SmallVector<Instruction *, 2> Statepoints;
+ SmallVector<GCStatepointInst *, 2> Statepoints;
for (BasicBlock &BB : F)
for (Instruction &I : BB)
- if (isStatepoint(I))
- Statepoints.push_back(&I);
+ if (auto *SP = dyn_cast<GCStatepointInst>(&I))
+ Statepoints.push_back(SP);
for (auto &I : Statepoints)
EverMadeChange |= simplifyOffsetableRelocate(*I);
}
@@ -563,9 +593,23 @@ bool CodeGenPrepare::runOnFunction(Function &F) {
// preparatory transforms.
EverMadeChange |= placeDbgValues(F);
+#ifndef NDEBUG
+ if (VerifyBFIUpdates)
+ verifyBFIUpdates(F);
+#endif
+
return EverMadeChange;
}
+// Verify BFI has been updated correctly by recomputing BFI and comparing them.
+void LLVM_ATTRIBUTE_UNUSED CodeGenPrepare::verifyBFIUpdates(Function &F) {
+ DominatorTree NewDT(F);
+ LoopInfo NewLI(NewDT);
+ BranchProbabilityInfo NewBPI(F, NewLI, TLInfo);
+ BlockFrequencyInfo NewBFI(F, NewBPI, NewLI);
+ NewBFI.verifyMatch(*BFI);
+}
+
/// Merge basic blocks which are connected by a single edge, where one of the
/// basic blocks has a single successor pointing to the other basic block,
/// which has a single predecessor.
@@ -749,7 +793,7 @@ bool CodeGenPrepare::isMergingEmptyBlockProfitable(BasicBlock *BB,
BlockFrequency PredFreq = BFI->getBlockFreq(Pred);
BlockFrequency BBFreq = BFI->getBlockFreq(BB);
- for (auto SameValueBB : SameIncomingValueBBs)
+ for (auto *SameValueBB : SameIncomingValueBBs)
if (SameValueBB->getUniquePredecessor() == Pred &&
DestBB == findDestBlockOfMergeableEmptyBlock(SameValueBB))
BBFreq += BFI->getBlockFreq(SameValueBB);
@@ -925,7 +969,7 @@ static bool getGEPSmallConstantIntOffsetV(GetElementPtrInst *GEP,
SmallVectorImpl<Value *> &OffsetV) {
for (unsigned i = 1; i < GEP->getNumOperands(); i++) {
// Only accept small constant integer operands
- auto Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
+ auto *Op = dyn_cast<ConstantInt>(GEP->getOperand(i));
if (!Op || Op->getZExtValue() > 20)
return false;
}
@@ -949,7 +993,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
// be skipped by optimization and we do not care about them.
for (auto R = RelocatedBase->getParent()->getFirstInsertionPt();
&*R != RelocatedBase; ++R)
- if (auto RI = dyn_cast<GCRelocateInst>(R))
+ if (auto *RI = dyn_cast<GCRelocateInst>(R))
if (RI->getStatepoint() == RelocatedBase->getStatepoint())
if (RI->getBasePtrIndex() == RelocatedBase->getBasePtrIndex()) {
RelocatedBase->moveBefore(RI);
@@ -973,7 +1017,7 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
}
Value *Base = ToReplace->getBasePtr();
- auto Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
+ auto *Derived = dyn_cast<GetElementPtrInst>(ToReplace->getDerivedPtr());
if (!Derived || Derived->getPointerOperand() != Base)
continue;
@@ -1050,10 +1094,9 @@ simplifyRelocatesOffABase(GCRelocateInst *RelocatedBase,
// %base' = gc.relocate(%tok, i32 4, i32 4)
// %ptr' = gep %base' + 15
// %val = load %ptr'
-bool CodeGenPrepare::simplifyOffsetableRelocate(Instruction &I) {
+bool CodeGenPrepare::simplifyOffsetableRelocate(GCStatepointInst &I) {
bool MadeChange = false;
SmallVector<GCRelocateInst *, 2> AllRelocateCalls;
-
for (auto *U : I.users())
if (GCRelocateInst *Relocate = dyn_cast<GCRelocateInst>(U))
// Collect all the relocate calls associated with a statepoint
@@ -1187,6 +1230,7 @@ static bool OptimizeNoopCopyExpression(CastInst *CI, const TargetLowering &TLI,
}
bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
+ Value *Arg0, Value *Arg1,
CmpInst *Cmp,
Intrinsic::ID IID) {
if (BO->getParent() != Cmp->getParent()) {
@@ -1204,8 +1248,6 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
}
// We allow matching the canonical IR (add X, C) back to (usubo X, -C).
- Value *Arg0 = BO->getOperand(0);
- Value *Arg1 = BO->getOperand(1);
if (BO->getOpcode() == Instruction::Add &&
IID == Intrinsic::usub_with_overflow) {
assert(isa<Constant>(Arg1) && "Unexpected input for usubo");
@@ -1215,7 +1257,9 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
// Insert at the first instruction of the pair.
Instruction *InsertPt = nullptr;
for (Instruction &Iter : *Cmp->getParent()) {
- if (&Iter == BO || &Iter == Cmp) {
+ // If BO is an XOR, it is not guaranteed that it comes after both inputs to
+ // the overflow intrinsic are defined.
+ if ((BO->getOpcode() != Instruction::Xor && &Iter == BO) || &Iter == Cmp) {
InsertPt = &Iter;
break;
}
@@ -1224,12 +1268,16 @@ bool CodeGenPrepare::replaceMathCmpWithIntrinsic(BinaryOperator *BO,
IRBuilder<> Builder(InsertPt);
Value *MathOV = Builder.CreateBinaryIntrinsic(IID, Arg0, Arg1);
- Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
+ if (BO->getOpcode() != Instruction::Xor) {
+ Value *Math = Builder.CreateExtractValue(MathOV, 0, "math");
+ BO->replaceAllUsesWith(Math);
+ } else
+ assert(BO->hasOneUse() &&
+ "Patterns with XOr should use the BO only in the compare");
Value *OV = Builder.CreateExtractValue(MathOV, 1, "ov");
- BO->replaceAllUsesWith(Math);
Cmp->replaceAllUsesWith(OV);
- BO->eraseFromParent();
Cmp->eraseFromParent();
+ BO->eraseFromParent();
return true;
}
@@ -1269,12 +1317,17 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
bool &ModifiedDT) {
Value *A, *B;
BinaryOperator *Add;
- if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add))))
+ if (!match(Cmp, m_UAddWithOverflow(m_Value(A), m_Value(B), m_BinOp(Add)))) {
if (!matchUAddWithOverflowConstantEdgeCases(Cmp, Add))
return false;
+ // Set A and B in case we match matchUAddWithOverflowConstantEdgeCases.
+ A = Add->getOperand(0);
+ B = Add->getOperand(1);
+ }
if (!TLI->shouldFormOverflowOp(ISD::UADDO,
- TLI->getValueType(*DL, Add->getType())))
+ TLI->getValueType(*DL, Add->getType()),
+ Add->hasNUsesOrMore(2)))
return false;
// We don't want to move around uses of condition values this late, so we
@@ -1283,7 +1336,8 @@ bool CodeGenPrepare::combineToUAddWithOverflow(CmpInst *Cmp,
if (Add->getParent() != Cmp->getParent() && !Add->hasOneUse())
return false;
- if (!replaceMathCmpWithIntrinsic(Add, Cmp, Intrinsic::uadd_with_overflow))
+ if (!replaceMathCmpWithIntrinsic(Add, A, B, Cmp,
+ Intrinsic::uadd_with_overflow))
return false;
// Reset callers - do not crash by iterating over a dead instruction.
@@ -1341,10 +1395,12 @@ bool CodeGenPrepare::combineToUSubWithOverflow(CmpInst *Cmp,
return false;
if (!TLI->shouldFormOverflowOp(ISD::USUBO,
- TLI->getValueType(*DL, Sub->getType())))
+ TLI->getValueType(*DL, Sub->getType()),
+ Sub->hasNUsesOrMore(2)))
return false;
- if (!replaceMathCmpWithIntrinsic(Sub, Cmp, Intrinsic::usub_with_overflow))
+ if (!replaceMathCmpWithIntrinsic(Sub, Sub->getOperand(0), Sub->getOperand(1),
+ Cmp, Intrinsic::usub_with_overflow))
return false;
// Reset callers - do not crash by iterating over a dead instruction.
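
For context on what the uadd/usub overflow transforms target: for unsigned values the wrapped sum is smaller than either operand exactly when the addition overflowed, which is the icmp pattern CodeGenPrepare matches and folds into the overflow intrinsic. A standalone check of that equivalence (the GCC/Clang builtin __builtin_add_overflow stands in for llvm.uadd.with.overflow here):

#include <cstdint>
#include <cstdio>

int main() {
  uint32_t a = 0xFFFFFFF0u, b = 0x20u;
  uint32_t sum = a + b;  // wraps modulo 2^32
  bool ovCmp = sum < a;  // the compare pattern the pass recognizes

  uint32_t sum2;
  bool ovIntrin = __builtin_add_overflow(a, b, &sum2); // intrinsic analogue

  printf("sum=%u ovCmp=%d ovIntrin=%d\n", sum, ovCmp, ovIntrin); // both 1
  return 0;
}
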
@@ -1813,9 +1869,6 @@ static bool despeculateCountZeros(IntrinsicInst *CountZeros,
const TargetLowering *TLI,
const DataLayout *DL,
bool &ModifiedDT) {
- if (!TLI || !DL)
- return false;
-
// If a zero input is undefined, it doesn't make sense to despeculate that.
if (match(CountZeros->getOperand(1), m_One()))
return false;
@@ -1877,7 +1930,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// Lower inline assembly if we can.
// If we found an inline asm expession, and if the target knows how to
// lower it to normal LLVM code, do so now.
- if (TLI && isa<InlineAsm>(CI->getCalledValue())) {
+ if (CI->isInlineAsm()) {
if (TLI->ExpandInlineAsm(CI)) {
// Avoid invalidating the iterator.
CurInstIterator = BB->begin();
@@ -1894,7 +1947,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// Align the pointer arguments to this call if the target thinks it's a good
// idea
unsigned MinSize, PrefAlign;
- if (TLI && TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
+ if (TLI->shouldAlignPointerArgs(CI, MinSize, PrefAlign)) {
for (auto &Arg : CI->arg_operands()) {
// We want to align both objects whose address is used directly and
// objects whose address is used in casts and GEPs, though it only makes
@@ -1912,7 +1965,7 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
AllocaInst *AI;
if ((AI = dyn_cast<AllocaInst>(Val)) && AI->getAlignment() < PrefAlign &&
DL->getTypeAllocSize(AI->getAllocatedType()) >= MinSize + Offset2)
- AI->setAlignment(MaybeAlign(PrefAlign));
+ AI->setAlignment(Align(PrefAlign));
// Global variables can only be aligned if they are defined in this
// object (i.e. they are uniquely initialized in this object), and
// over-aligning global variables that have an explicit section is
@@ -1927,12 +1980,14 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// If this is a memcpy (or similar) then we may be able to improve the
// alignment
if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(CI)) {
- unsigned DestAlign = getKnownAlignment(MI->getDest(), *DL);
- if (DestAlign > MI->getDestAlignment())
+ Align DestAlign = getKnownAlignment(MI->getDest(), *DL);
+ MaybeAlign MIDestAlign = MI->getDestAlign();
+ if (!MIDestAlign || DestAlign > *MIDestAlign)
MI->setDestAlignment(DestAlign);
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
- unsigned SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
- if (SrcAlign > MTI->getSourceAlignment())
+ MaybeAlign MTISrcAlign = MTI->getSourceAlign();
+ Align SrcAlign = getKnownAlignment(MTI->getSource(), *DL);
+ if (!MTISrcAlign || SrcAlign > *MTISrcAlign)
MTI->setSourceAlignment(SrcAlign);
}
}
@@ -1942,8 +1997,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// cold block. This interacts with our handling for loads and stores to
// ensure that we can fold all uses of a potential addressing computation
// into their uses. TODO: generalize this to work over profiling data
- bool OptForSize = OptSize || llvm::shouldOptimizeForSize(BB, PSI, BFI.get());
- if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
+ if (CI->hasFnAttr(Attribute::Cold) &&
+ !OptSize && !llvm::shouldOptimizeForSize(BB, PSI, BFI.get()))
for (auto &Arg : CI->arg_operands()) {
if (!Arg->getType()->isPointerTy())
continue;
@@ -1955,10 +2010,15 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
if (II) {
switch (II->getIntrinsicID()) {
default: break;
+ case Intrinsic::assume: {
+ II->eraseFromParent();
+ return true;
+ }
+
case Intrinsic::experimental_widenable_condition: {
// Give up on future widening oppurtunties so that we can fold away dead
// paths and merge blocks before going into block-local instruction
- // selection.
+ // selection.
if (II->use_empty()) {
II->eraseFromParent();
return true;
@@ -2008,21 +2068,43 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
case Intrinsic::ctlz:
// If counting zeros is expensive, try to avoid it.
return despeculateCountZeros(II, TLI, DL, ModifiedDT);
+ case Intrinsic::fshl:
+ case Intrinsic::fshr:
+ return optimizeFunnelShift(II);
case Intrinsic::dbg_value:
return fixupDbgValue(II);
+ case Intrinsic::vscale: {
+ // If datalayout has no special restrictions on vector data layout,
+ // replace `llvm.vscale` by an equivalent constant expression
+ // to benefit from cheap constant propagation.
+ Type *ScalableVectorTy =
+ VectorType::get(Type::getInt8Ty(II->getContext()), 1, true);
+ if (DL->getTypeAllocSize(ScalableVectorTy).getKnownMinSize() == 8) {
+ auto *Null = Constant::getNullValue(ScalableVectorTy->getPointerTo());
+ auto *One = ConstantInt::getSigned(II->getType(), 1);
+ auto *CGep =
+ ConstantExpr::getGetElementPtr(ScalableVectorTy, Null, One);
+ II->replaceAllUsesWith(ConstantExpr::getPtrToInt(CGep, II->getType()));
+ II->eraseFromParent();
+ return true;
+ }
+ break;
}
-
- if (TLI) {
- SmallVector<Value*, 2> PtrOps;
- Type *AccessTy;
- if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
- while (!PtrOps.empty()) {
- Value *PtrVal = PtrOps.pop_back_val();
- unsigned AS = PtrVal->getType()->getPointerAddressSpace();
- if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
- return true;
- }
+ case Intrinsic::masked_gather:
+ return optimizeGatherScatterInst(II, II->getArgOperand(0));
+ case Intrinsic::masked_scatter:
+ return optimizeGatherScatterInst(II, II->getArgOperand(1));
}
+
+ SmallVector<Value *, 2> PtrOps;
+ Type *AccessTy;
+ if (TLI->getAddrModeArguments(II, PtrOps, AccessTy))
+ while (!PtrOps.empty()) {
+ Value *PtrVal = PtrOps.pop_back_val();
+ unsigned AS = PtrVal->getType()->getPointerAddressSpace();
+ if (optimizeMemoryInst(II, PtrVal, AccessTy, AS))
+ return true;
+ }
}
// From here on out we're working with named functions.
@@ -2033,7 +2115,8 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
// to fortified library functions (e.g. __memcpy_chk) that have the default
// "don't know" as the objectsize. Anything else should be left alone.
FortifiedLibCallSimplifier Simplifier(TLInfo, true);
- if (Value *V = Simplifier.optimizeCall(CI)) {
+ IRBuilder<> Builder(CI);
+ if (Value *V = Simplifier.optimizeCall(CI, Builder)) {
CI->replaceAllUsesWith(V);
CI->eraseFromParent();
return true;
@@ -2073,14 +2156,12 @@ bool CodeGenPrepare::optimizeCallInst(CallInst *CI, bool &ModifiedDT) {
/// ret i32 %tmp2
/// @endcode
bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT) {
- if (!TLI)
- return false;
-
ReturnInst *RetI = dyn_cast<ReturnInst>(BB->getTerminator());
if (!RetI)
return false;
PHINode *PN = nullptr;
+ ExtractValueInst *EVI = nullptr;
BitCastInst *BCI = nullptr;
Value *V = RetI->getReturnValue();
if (V) {
@@ -2088,6 +2169,14 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
if (BCI)
V = BCI->getOperand(0);
+ EVI = dyn_cast<ExtractValueInst>(V);
+ if (EVI) {
+ V = EVI->getOperand(0);
+ if (!std::all_of(EVI->idx_begin(), EVI->idx_end(),
+ [](unsigned idx) { return idx == 0; }))
+ return false;
+ }
+
PN = dyn_cast<PHINode>(V);
if (!PN)
return false;
@@ -2101,7 +2190,9 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
if (PN) {
BasicBlock::iterator BI = BB->begin();
// Skip over debug and the bitcast.
- do { ++BI; } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI);
+ do {
+ ++BI;
+ } while (isa<DbgInfoIntrinsic>(BI) || &*BI == BCI || &*BI == EVI);
if (&*BI != RetI)
return false;
} else {
@@ -2157,6 +2248,11 @@ bool CodeGenPrepare::dupRetToEnableTailCallOpts(BasicBlock *BB, bool &ModifiedDT
// Duplicate the return into TailCallBB.
(void)FoldReturnIntoUncondBranch(RetI, BB, TailCallBB);
+ assert(!VerifyBFIUpdates ||
+ BFI->getBlockFreq(BB) >= BFI->getBlockFreq(TailCallBB));
+ BFI->setBlockFreq(
+ BB,
+ (BFI->getBlockFreq(BB) - BFI->getBlockFreq(TailCallBB)).getFrequency());
ModifiedDT = Changed = true;
++NumRetsDup;
}
@@ -2354,6 +2450,9 @@ namespace {
/// This class provides transaction based operation on the IR.
/// Every change made through this class is recorded in the internal state and
/// can be undone (rollback) until commit is called.
+/// CGP does not check if instructions could be speculatively executed when
+/// moved. Preserving the original location would pessimize the debugging
+/// experience, as well as negatively impact the quality of sample PGO.
class TypePromotionTransaction {
/// This represents the common interface of the individual transaction.
/// Each class implements the logic for doing one specific modification on
@@ -2516,6 +2615,7 @@ class TypePromotionTransaction {
/// trunc Opnd to Ty.
TruncBuilder(Instruction *Opnd, Type *Ty) : TypePromotionAction(Opnd) {
IRBuilder<> Builder(Opnd);
+ Builder.SetCurrentDebugLocation(DebugLoc());
Val = Builder.CreateTrunc(Opnd, Ty, "promoted");
LLVM_DEBUG(dbgs() << "Do: TruncBuilder: " << *Val << "\n");
}
@@ -2568,6 +2668,7 @@ class TypePromotionTransaction {
ZExtBuilder(Instruction *InsertPt, Value *Opnd, Type *Ty)
: TypePromotionAction(InsertPt) {
IRBuilder<> Builder(InsertPt);
+ Builder.SetCurrentDebugLocation(DebugLoc());
Val = Builder.CreateZExt(Opnd, Ty, "promoted");
LLVM_DEBUG(dbgs() << "Do: ZExtBuilder: " << *Val << "\n");
}
@@ -2721,8 +2822,9 @@ public:
TypePromotionTransaction(SetOfInstrs &RemovedInsts)
: RemovedInsts(RemovedInsts) {}
- /// Advocate every changes made in that transaction.
- void commit();
+ /// Advocate every change made in that transaction. Return true if any change
+ /// happened.
+ bool commit();
/// Undo all the changes made after the given point.
void rollback(ConstRestorationPt Point);
@@ -2828,11 +2930,13 @@ TypePromotionTransaction::getRestorationPoint() const {
return !Actions.empty() ? Actions.back().get() : nullptr;
}
-void TypePromotionTransaction::commit() {
+bool TypePromotionTransaction::commit() {
for (CommitPt It = Actions.begin(), EndIt = Actions.end(); It != EndIt;
++It)
(*It)->commit();
+ bool Modified = !Actions.empty();
Actions.clear();
+ return Modified;
}
void TypePromotionTransaction::rollback(
@@ -3115,7 +3219,7 @@ public:
SmallPtrSet<Value *, 32> Visited;
WorkList.push_back(Val);
while (!WorkList.empty()) {
- auto P = WorkList.pop_back_val();
+ auto *P = WorkList.pop_back_val();
if (!Visited.insert(P).second)
continue;
if (auto *PI = dyn_cast<Instruction>(P))
@@ -3164,13 +3268,13 @@ public:
void destroyNewNodes(Type *CommonType) {
// For safe erasing, replace the uses with dummy value first.
- auto Dummy = UndefValue::get(CommonType);
- for (auto I : AllPhiNodes) {
+ auto *Dummy = UndefValue::get(CommonType);
+ for (auto *I : AllPhiNodes) {
I->replaceAllUsesWith(Dummy);
I->eraseFromParent();
}
AllPhiNodes.clear();
- for (auto I : AllSelectNodes) {
+ for (auto *I : AllSelectNodes) {
I->replaceAllUsesWith(Dummy);
I->eraseFromParent();
}
@@ -3511,7 +3615,7 @@ private:
// Must be a Phi node then.
auto *PHI = cast<PHINode>(V);
// Fill the Phi node with values from predecessors.
- for (auto B : predecessors(PHI->getParent())) {
+ for (auto *B : predecessors(PHI->getParent())) {
Value *PV = cast<PHINode>(Current)->getIncomingValueForBlock(B);
assert(Map.find(PV) != Map.end() && "No predecessor Value!");
PHI->addIncoming(ST.Get(Map[PV]), B);
@@ -3625,10 +3729,11 @@ bool AddressingModeMatcher::matchScaledValue(Value *ScaleReg, int64_t Scale,
// X*Scale + C*Scale to addr mode.
ConstantInt *CI = nullptr; Value *AddLHS = nullptr;
if (isa<Instruction>(ScaleReg) && // not a constant expr.
- match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI)))) {
+ match(ScaleReg, m_Add(m_Value(AddLHS), m_ConstantInt(CI))) &&
+ CI->getValue().isSignedIntN(64)) {
TestAddrMode.InBounds = false;
TestAddrMode.ScaledReg = AddLHS;
- TestAddrMode.BaseOffs += CI->getSExtValue()*TestAddrMode.Scale;
+ TestAddrMode.BaseOffs += CI->getSExtValue() * TestAddrMode.Scale;
// If this addressing mode is legal, commit it and remember that we folded
// this instruction.
@@ -3849,7 +3954,7 @@ bool TypePromotionHelper::canGetThrough(const Instruction *Inst,
// We can get through binary operator, if it is legal. In other words, the
// binary operator must have a nuw or nsw flag.
const BinaryOperator *BinOp = dyn_cast<BinaryOperator>(Inst);
- if (BinOp && isa<OverflowingBinaryOperator>(BinOp) &&
+ if (isa_and_nonnull<OverflowingBinaryOperator>(BinOp) &&
((!IsSExt && BinOp->hasNoUnsignedWrap()) ||
(IsSExt && BinOp->hasNoSignedWrap())))
return true;
@@ -4251,15 +4356,20 @@ bool AddressingModeMatcher::matchOperationAddr(User *AddrInst, unsigned Opcode,
cast<ConstantInt>(AddrInst->getOperand(i))->getZExtValue();
ConstantOffset += SL->getElementOffset(Idx);
} else {
- uint64_t TypeSize = DL.getTypeAllocSize(GTI.getIndexedType());
- if (ConstantInt *CI = dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
- const APInt &CVal = CI->getValue();
- if (CVal.getMinSignedBits() <= 64) {
- ConstantOffset += CVal.getSExtValue() * TypeSize;
- continue;
+ TypeSize TS = DL.getTypeAllocSize(GTI.getIndexedType());
+ if (TS.isNonZero()) {
+ // The optimisations below currently only work for fixed offsets.
+ if (TS.isScalable())
+ return false;
+ int64_t TypeSize = TS.getFixedSize();
+ if (ConstantInt *CI =
+ dyn_cast<ConstantInt>(AddrInst->getOperand(i))) {
+ const APInt &CVal = CI->getValue();
+ if (CVal.getMinSignedBits() <= 64) {
+ ConstantOffset += CVal.getSExtValue() * TypeSize;
+ continue;
+ }
}
- }
- if (TypeSize) { // Scales of zero don't do anything.
// We only allow one variable index at the moment.
if (VariableOperand != -1)
return false;
@@ -4422,11 +4532,13 @@ bool AddressingModeMatcher::matchAddr(Value *Addr, unsigned Depth) {
TypePromotionTransaction::ConstRestorationPt LastKnownGood =
TPT.getRestorationPoint();
if (ConstantInt *CI = dyn_cast<ConstantInt>(Addr)) {
- // Fold in immediates if legal for the target.
- AddrMode.BaseOffs += CI->getSExtValue();
- if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
- return true;
- AddrMode.BaseOffs -= CI->getSExtValue();
+ if (CI->getValue().isSignedIntN(64)) {
+ // Fold in immediates if legal for the target.
+ AddrMode.BaseOffs += CI->getSExtValue();
+ if (TLI.isLegalAddressingMode(DL, AddrMode, AccessTy, AddrSpace))
+ return true;
+ AddrMode.BaseOffs -= CI->getSExtValue();
+ }
} else if (GlobalValue *GV = dyn_cast<GlobalValue>(Addr)) {
// If this is a global variable, try to fold it into the addressing mode.
if (!AddrMode.BaseGV) {
@@ -4502,8 +4614,7 @@ static bool IsOperandAMemoryOperand(CallInst *CI, InlineAsm *IA, Value *OpVal,
const TargetRegisterInfo &TRI) {
const Function *F = CI->getFunction();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI,
- ImmutableCallSite(CI));
+ TLI.ParseConstraints(F->getParent()->getDataLayout(), &TRI, *CI);
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -4581,14 +4692,16 @@ static bool FindAllMemoryUses(
}
if (CallInst *CI = dyn_cast<CallInst>(UserI)) {
- // If this is a cold call, we can sink the addressing calculation into
- // the cold path. See optimizeCallInst
- bool OptForSize = OptSize ||
+ if (CI->hasFnAttr(Attribute::Cold)) {
+ // If this is a cold call, we can sink the addressing calculation into
+ // the cold path. See optimizeCallInst
+ bool OptForSize = OptSize ||
llvm::shouldOptimizeForSize(CI->getParent(), PSI, BFI);
- if (!OptForSize && CI->hasFnAttr(Attribute::Cold))
- continue;
+ if (!OptForSize)
+ continue;
+ }
- InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand());
if (!IA) return true;
// If this is a memory operand, we're cool, otherwise bail out.
@@ -4854,7 +4967,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
TPT.rollback(LastKnownGood);
return false;
}
- TPT.commit();
+ bool Modified = TPT.commit();
// Get the combined AddrMode (or the only AddrMode, if we only had one).
ExtAddrMode AddrMode = AddrModes.getAddrMode();
@@ -4868,7 +4981,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
})) {
LLVM_DEBUG(dbgs() << "CGP: Found local addrmode: " << AddrMode
<< "\n");
- return false;
+ return Modified;
}
// Insert this computation right after this user. Since our caller is
@@ -4891,7 +5004,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
if (SunkAddr->getType() != Addr->getType())
SunkAddr = Builder.CreatePointerCast(SunkAddr, Addr->getType());
} else if (AddrSinkUsingGEPs || (!AddrSinkUsingGEPs.getNumOccurrences() &&
- TM && SubtargetInfo->addrSinkUsingGEPs())) {
+ SubtargetInfo->addrSinkUsingGEPs())) {
// By default, we use the GEP-based method when AA is used later. This
// prevents new inttoptr/ptrtoint pairs from degrading AA capabilities.
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
@@ -4909,7 +5022,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
// We can't add more than one pointer together, nor can we scale a
// pointer (both of which seem meaningless).
if (ResultPtr || AddrMode.Scale != 1)
- return false;
+ return Modified;
ResultPtr = AddrMode.ScaledReg;
AddrMode.Scale = 0;
@@ -4926,12 +5039,12 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Type *ScaledRegTy = AddrMode.ScaledReg->getType();
if (cast<IntegerType>(IntPtrTy)->getBitWidth() >
cast<IntegerType>(ScaledRegTy)->getBitWidth())
- return false;
+ return Modified;
}
if (AddrMode.BaseGV) {
if (ResultPtr)
- return false;
+ return Modified;
ResultPtr = AddrMode.BaseGV;
}
@@ -4955,7 +5068,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
!AddrMode.BaseReg && !AddrMode.Scale && !AddrMode.BaseOffs) {
SunkAddr = Constant::getNullValue(Addr->getType());
} else if (!ResultPtr) {
- return false;
+ return Modified;
} else {
Type *I8PtrTy =
Builder.getInt8PtrTy(Addr->getType()->getPointerAddressSpace());
@@ -5040,7 +5153,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
(ScalePtrTy && DL->isNonIntegralPointerType(ScalePtrTy)) ||
(AddrMode.BaseGV &&
DL->isNonIntegralPointerType(AddrMode.BaseGV->getType())))
- return false;
+ return Modified;
LLVM_DEBUG(dbgs() << "CGP: SINKING nonlocal addrmode: " << AddrMode
<< " for " << *MemoryInst << "\n");
@@ -5080,7 +5193,7 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
Instruction *I = dyn_cast_or_null<Instruction>(Result);
if (I && (Result != AddrMode.BaseReg))
I->eraseFromParent();
- return false;
+ return Modified;
}
if (AddrMode.Scale != 1)
V = Builder.CreateMul(V, ConstantInt::get(IntPtrTy, AddrMode.Scale),
@@ -5142,6 +5255,119 @@ bool CodeGenPrepare::optimizeMemoryInst(Instruction *MemoryInst, Value *Addr,
return true;
}
+/// Rewrite GEP input to gather/scatter to enable SelectionDAGBuilder to find
+/// a uniform base to use for ISD::MGATHER/MSCATTER. SelectionDAGBuilder can
+/// only handle a 2 operand GEP in the same basic block or a splat constant
+/// vector. The 2 operands to the GEP must have a scalar pointer and a vector
+/// index.
+///
+/// If the existing GEP has a vector base pointer that is splat, we can look
+/// through the splat to find the scalar pointer. If we can't find a scalar
+/// pointer there's nothing we can do.
+///
+/// If we have a GEP with more than 2 indices where the middle indices are all
+/// zeroes, we can replace it with 2 GEPs where the second has 2 operands.
+///
+/// If the final index isn't a vector or is a splat, we can emit a scalar GEP
+/// followed by a GEP with an all zeroes vector index. This will enable
+/// SelectionDAGBuilder to use the scalar GEP as the uniform base and have a
+/// zero index.
+bool CodeGenPrepare::optimizeGatherScatterInst(Instruction *MemoryInst,
+ Value *Ptr) {
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP || !GEP->hasIndices())
+ return false;
+
+ // If the GEP and the gather/scatter aren't in the same BB, don't optimize.
+ // FIXME: We should support this by sinking the GEP.
+ if (MemoryInst->getParent() != GEP->getParent())
+ return false;
+
+ SmallVector<Value *, 2> Ops(GEP->op_begin(), GEP->op_end());
+
+ bool RewriteGEP = false;
+
+ if (Ops[0]->getType()->isVectorTy()) {
+ Ops[0] = const_cast<Value *>(getSplatValue(Ops[0]));
+ if (!Ops[0])
+ return false;
+ RewriteGEP = true;
+ }
+
+ unsigned FinalIndex = Ops.size() - 1;
+
+  // Ensure that all indices except the last are 0.
+ // FIXME: This isn't strictly required. All that's required is that they are
+ // all scalars or splats.
+ for (unsigned i = 1; i < FinalIndex; ++i) {
+ auto *C = dyn_cast<Constant>(Ops[i]);
+ if (!C)
+ return false;
+ if (isa<VectorType>(C->getType()))
+ C = C->getSplatValue();
+ auto *CI = dyn_cast_or_null<ConstantInt>(C);
+ if (!CI || !CI->isZero())
+ return false;
+ // Scalarize the index if needed.
+ Ops[i] = CI;
+ }
+
+ // Try to scalarize the final index.
+ if (Ops[FinalIndex]->getType()->isVectorTy()) {
+ if (Value *V = const_cast<Value *>(getSplatValue(Ops[FinalIndex]))) {
+ auto *C = dyn_cast<ConstantInt>(V);
+      // Don't scalarize an all-zeros vector.
+ if (!C || !C->isZero()) {
+ Ops[FinalIndex] = V;
+ RewriteGEP = true;
+ }
+ }
+ }
+
+  // If we made any changes or we have extra operands, we need to generate
+ // new instructions.
+ if (!RewriteGEP && Ops.size() == 2)
+ return false;
+
+ unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
+
+ IRBuilder<> Builder(MemoryInst);
+
+ Type *ScalarIndexTy = DL->getIndexType(Ops[0]->getType()->getScalarType());
+
+ Value *NewAddr;
+
+ // If the final index isn't a vector, emit a scalar GEP containing all ops
+  // and a vector GEP with an all-zeroes final index.
+ if (!Ops[FinalIndex]->getType()->isVectorTy()) {
+ NewAddr = Builder.CreateGEP(Ops[0], makeArrayRef(Ops).drop_front());
+ auto *IndexTy = FixedVectorType::get(ScalarIndexTy, NumElts);
+ NewAddr = Builder.CreateGEP(NewAddr, Constant::getNullValue(IndexTy));
+ } else {
+ Value *Base = Ops[0];
+ Value *Index = Ops[FinalIndex];
+
+ // Create a scalar GEP if there are more than 2 operands.
+ if (Ops.size() != 2) {
+ // Replace the last index with 0.
+ Ops[FinalIndex] = Constant::getNullValue(ScalarIndexTy);
+ Base = Builder.CreateGEP(Base, makeArrayRef(Ops).drop_front());
+ }
+
+ // Now create the GEP with scalar pointer and vector index.
+ NewAddr = Builder.CreateGEP(Base, Index);
+ }
+
+ MemoryInst->replaceUsesOfWith(Ptr, NewAddr);
+
+  // If the old address computation is now dead, recursively delete it and any
+  // operands that become trivially dead as a result.
+ if (Ptr->use_empty())
+ RecursivelyDeleteTriviallyDeadInstructions(Ptr, TLInfo);
+
+ return true;
+}
+
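For concreteness, a minimal sketch of the address shape the rewrite above aims for: a scalar base GEP followed by a vector GEP whose index is all zeroes, which SelectionDAGBuilder can then split into a uniform base plus index. The helper name and parameters below are invented for illustration and are not part of the patch.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Illustrative only: given an already-scalar base pointer, build the
// "gep ScalarBase, <N x iM> zeroinitializer" form that exposes a uniform base
// to ISD::MGATHER/MSCATTER lowering.
static Value *emitUniformBaseAddr(IRBuilder<> &Builder, const DataLayout &DL,
                                  Value *ScalarBase, unsigned NumElts) {
  Type *ScalarIdxTy = DL.getIndexType(ScalarBase->getType());
  auto *VecIdxTy = FixedVectorType::get(ScalarIdxTy, NumElts);
  return Builder.CreateGEP(ScalarBase, Constant::getNullValue(VecIdxTy));
}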
/// If there are any memory operands, use OptimizeMemoryInst to sink their
/// address computing into the block when possible / profitable.
bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
@@ -5150,7 +5376,7 @@ bool CodeGenPrepare::optimizeInlineAsmInst(CallInst *CS) {
const TargetRegisterInfo *TRI =
TM->getSubtargetImpl(*CS->getFunction())->getRegisterInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints =
- TLI->ParseConstraints(*DL, TRI, CS);
+ TLI->ParseConstraints(*DL, TRI, *CS);
unsigned ArgNo = 0;
for (unsigned i = 0, e = TargetConstraints.size(); i != e; ++i) {
TargetLowering::AsmOperandInfo &OpInfo = TargetConstraints[i];
@@ -5231,7 +5457,7 @@ bool CodeGenPrepare::tryToPromoteExts(
bool Promoted = false;
// Iterate over all the extensions to try to promote them.
- for (auto I : Exts) {
+ for (auto *I : Exts) {
// Early check if we directly have ext(load).
if (isa<LoadInst>(I->getOperand(0))) {
ProfitablyMovedExts.push_back(I);
@@ -5242,7 +5468,7 @@ bool CodeGenPrepare::tryToPromoteExts(
// this check inside the for loop is to catch the case where an extension
// is directly fed by a load because in such case the extension can be moved
// up without any promotion on its operands.
- if (!TLI || !TLI->enableExtLdPromotion() || DisableExtLdPromotion)
+ if (!TLI->enableExtLdPromotion() || DisableExtLdPromotion)
return false;
// Get the action to perform the promotion.
@@ -5292,7 +5518,7 @@ bool CodeGenPrepare::tryToPromoteExts(
SmallVector<Instruction *, 2> NewlyMovedExts;
(void)tryToPromoteExts(TPT, NewExts, NewlyMovedExts, TotalCreatedInstsCost);
bool NewPromoted = false;
- for (auto ExtInst : NewlyMovedExts) {
+ for (auto *ExtInst : NewlyMovedExts) {
Instruction *MovedExt = cast<Instruction>(ExtInst);
Value *ExtOperand = MovedExt->getOperand(0);
// If we have reached to a load, we need this extra profitability check
@@ -5358,9 +5584,9 @@ bool CodeGenPrepare::mergeSExts(Function &F) {
return Changed;
}
-// Spliting large data structures so that the GEPs accessing them can have
+// Splitting large data structures so that the GEPs accessing them can have
// smaller offsets so that they can be sunk to the same blocks as their users.
-// For example, a large struct starting from %base is splitted into two parts
+// For example, a large struct starting from %base is split into two parts
// where the second part starts from %new_base.
//
// Before:
@@ -5421,7 +5647,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
int64_t BaseOffset = LargeOffsetGEPs.begin()->second;
Value *NewBaseGEP = nullptr;
- auto LargeOffsetGEP = LargeOffsetGEPs.begin();
+ auto *LargeOffsetGEP = LargeOffsetGEPs.begin();
while (LargeOffsetGEP != LargeOffsetGEPs.end()) {
GetElementPtrInst *GEP = LargeOffsetGEP->first;
int64_t Offset = LargeOffsetGEP->second;
@@ -5435,7 +5661,7 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
GEP->getAddressSpace())) {
// We need to create a new base if the offset to the current base is
// too large to fit into the addressing mode. So, a very large struct
- // may be splitted into several parts.
+ // may be split into several parts.
BaseGEP = GEP;
BaseOffset = Offset;
NewBaseGEP = nullptr;
@@ -5506,6 +5732,155 @@ bool CodeGenPrepare::splitLargeGEPOffsets() {
return Changed;
}
+bool CodeGenPrepare::optimizePhiType(
+ PHINode *I, SmallPtrSetImpl<PHINode *> &Visited,
+ SmallPtrSetImpl<Instruction *> &DeletedInstrs) {
+  // We are looking for a collection of interconnected phi nodes that together
+ // only use loads/bitcasts and are used by stores/bitcasts, and the bitcasts
+ // are of the same type. Convert the whole set of nodes to the type of the
+ // bitcast.
+ Type *PhiTy = I->getType();
+ Type *ConvertTy = nullptr;
+ if (Visited.count(I) ||
+ (!I->getType()->isIntegerTy() && !I->getType()->isFloatingPointTy()))
+ return false;
+
+ SmallVector<Instruction *, 4> Worklist;
+ Worklist.push_back(cast<Instruction>(I));
+ SmallPtrSet<PHINode *, 4> PhiNodes;
+ PhiNodes.insert(I);
+ Visited.insert(I);
+ SmallPtrSet<Instruction *, 4> Defs;
+ SmallPtrSet<Instruction *, 4> Uses;
+
+ while (!Worklist.empty()) {
+ Instruction *II = Worklist.pop_back_val();
+
+ if (auto *Phi = dyn_cast<PHINode>(II)) {
+      // Handle Defs, which might also be PHIs.
+ for (Value *V : Phi->incoming_values()) {
+ if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi)) {
+ if (Visited.count(OpPhi))
+ return false;
+ PhiNodes.insert(OpPhi);
+ Visited.insert(OpPhi);
+ Worklist.push_back(OpPhi);
+ }
+ } else if (auto *OpLoad = dyn_cast<LoadInst>(V)) {
+ if (!Defs.count(OpLoad)) {
+ Defs.insert(OpLoad);
+ Worklist.push_back(OpLoad);
+ }
+ } else if (auto *OpEx = dyn_cast<ExtractElementInst>(V)) {
+ if (!Defs.count(OpEx)) {
+ Defs.insert(OpEx);
+ Worklist.push_back(OpEx);
+ }
+ } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
+ if (!ConvertTy)
+ ConvertTy = OpBC->getOperand(0)->getType();
+ if (OpBC->getOperand(0)->getType() != ConvertTy)
+ return false;
+ if (!Defs.count(OpBC)) {
+ Defs.insert(OpBC);
+ Worklist.push_back(OpBC);
+ }
+ } else if (!isa<UndefValue>(V))
+ return false;
+ }
+ }
+
+    // Handle uses, which might also be PHIs.
+ for (User *V : II->users()) {
+ if (auto *OpPhi = dyn_cast<PHINode>(V)) {
+ if (!PhiNodes.count(OpPhi)) {
+ if (Visited.count(OpPhi))
+ return false;
+ PhiNodes.insert(OpPhi);
+ Visited.insert(OpPhi);
+ Worklist.push_back(OpPhi);
+ }
+ } else if (auto *OpStore = dyn_cast<StoreInst>(V)) {
+ if (OpStore->getOperand(0) != II)
+ return false;
+ Uses.insert(OpStore);
+ } else if (auto *OpBC = dyn_cast<BitCastInst>(V)) {
+ if (!ConvertTy)
+ ConvertTy = OpBC->getType();
+ if (OpBC->getType() != ConvertTy)
+ return false;
+ Uses.insert(OpBC);
+ } else
+ return false;
+ }
+ }
+
+ if (!ConvertTy || !TLI->shouldConvertPhiType(PhiTy, ConvertTy))
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Converting " << *I << "\n and connected nodes to "
+ << *ConvertTy << "\n");
+
+ // Create all the new phi nodes of the new type, and bitcast any loads to the
+ // correct type.
+ ValueToValueMap ValMap;
+ ValMap[UndefValue::get(PhiTy)] = UndefValue::get(ConvertTy);
+ for (Instruction *D : Defs) {
+ if (isa<BitCastInst>(D))
+ ValMap[D] = D->getOperand(0);
+ else
+ ValMap[D] =
+ new BitCastInst(D, ConvertTy, D->getName() + ".bc", D->getNextNode());
+ }
+ for (PHINode *Phi : PhiNodes)
+ ValMap[Phi] = PHINode::Create(ConvertTy, Phi->getNumIncomingValues(),
+ Phi->getName() + ".tc", Phi);
+ // Pipe together all the PhiNodes.
+ for (PHINode *Phi : PhiNodes) {
+ PHINode *NewPhi = cast<PHINode>(ValMap[Phi]);
+ for (int i = 0, e = Phi->getNumIncomingValues(); i < e; i++)
+ NewPhi->addIncoming(ValMap[Phi->getIncomingValue(i)],
+ Phi->getIncomingBlock(i));
+ }
+ // And finally pipe up the stores and bitcasts
+ for (Instruction *U : Uses) {
+ if (isa<BitCastInst>(U)) {
+ DeletedInstrs.insert(U);
+ U->replaceAllUsesWith(ValMap[U->getOperand(0)]);
+ } else
+ U->setOperand(0,
+ new BitCastInst(ValMap[U->getOperand(0)], PhiTy, "bc", U));
+ }
+
+ // Save the removed phis to be deleted later.
+ for (PHINode *Phi : PhiNodes)
+ DeletedInstrs.insert(Phi);
+ return true;
+}
+
+bool CodeGenPrepare::optimizePhiTypes(Function &F) {
+ if (!OptimizePhiTypes)
+ return false;
+
+ bool Changed = false;
+ SmallPtrSet<PHINode *, 4> Visited;
+ SmallPtrSet<Instruction *, 4> DeletedInstrs;
+
+  // Attempt to optimize all the phis in the function to the correct type.
+ for (auto &BB : F)
+ for (auto &Phi : BB.phis())
+ Changed |= optimizePhiType(&Phi, Visited, DeletedInstrs);
+
+  // Remove any old phis that have been converted.
+ for (auto *I : DeletedInstrs) {
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ I->eraseFromParent();
+ }
+
+ return Changed;
+}
+
/// Return true, if an ext(load) can be formed from an extension in
/// \p MovedExts.
bool CodeGenPrepare::canFormExtLd(
@@ -5567,11 +5942,6 @@ bool CodeGenPrepare::canFormExtLd(
/// \p Inst[in/out] the extension may be modified during the process if some
/// promotions apply.
bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
- // ExtLoad formation and address type promotion infrastructure requires TLI to
- // be effective.
- if (!TLI)
- return false;
-
bool AllowPromotionWithoutCommonHeader = false;
/// See if it is an interesting sext operations for the address type
/// promotion before trying to promote it, e.g., the ones with the right
@@ -5596,16 +5966,8 @@ bool CodeGenPrepare::optimizeExt(Instruction *&Inst) {
if (canFormExtLd(SpeculativelyMovedExts, LI, ExtFedByLoad, HasPromoted)) {
assert(LI && ExtFedByLoad && "Expect a valid load and extension");
TPT.commit();
- // Move the extend into the same block as the load
+ // Move the extend into the same block as the load.
ExtFedByLoad->moveAfter(LI);
- // CGP does not check if the zext would be speculatively executed when moved
- // to the same basic block as the load. Preserving its original location
- // would pessimize the debugging experience, as well as negatively impact
- // the quality of sample pgo. We don't want to use "line 0" as that has a
- // size cost in the line-table section and logically the zext can be seen as
- // part of the load. Therefore we conservatively reuse the same debug
- // location for the load and the zext.
- ExtFedByLoad->setDebugLoc(LI->getDebugLoc());
++NumExtsMoved;
Inst = ExtFedByLoad;
return true;
@@ -5633,7 +5995,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
bool Promoted = false;
SmallPtrSet<Instruction *, 1> UnhandledExts;
bool AllSeenFirst = true;
- for (auto I : SpeculativelyMovedExts) {
+ for (auto *I : SpeculativelyMovedExts) {
Value *HeadOfChain = I->getOperand(0);
DenseMap<Value *, Instruction *>::iterator AlreadySeen =
SeenChainsForSExt.find(HeadOfChain);
@@ -5651,7 +6013,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
TPT.commit();
if (HasPromoted)
Promoted = true;
- for (auto I : SpeculativelyMovedExts) {
+ for (auto *I : SpeculativelyMovedExts) {
Value *HeadOfChain = I->getOperand(0);
SeenChainsForSExt[HeadOfChain] = nullptr;
ValToSExtendedUses[HeadOfChain].push_back(I);
@@ -5662,7 +6024,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
// This is the first chain visited from the header, keep the current chain
// as unhandled. Defer to promote this until we encounter another SExt
// chain derived from the same header.
- for (auto I : SpeculativelyMovedExts) {
+ for (auto *I : SpeculativelyMovedExts) {
Value *HeadOfChain = I->getOperand(0);
SeenChainsForSExt[HeadOfChain] = Inst;
}
@@ -5670,7 +6032,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
}
if (!AllSeenFirst && !UnhandledExts.empty())
- for (auto VisitedSExt : UnhandledExts) {
+ for (auto *VisitedSExt : UnhandledExts) {
if (RemovedInsts.count(VisitedSExt))
continue;
TypePromotionTransaction TPT(RemovedInsts);
@@ -5681,7 +6043,7 @@ bool CodeGenPrepare::performAddressTypePromotion(
TPT.commit();
if (HasPromoted)
Promoted = true;
- for (auto I : Chains) {
+ for (auto *I : Chains) {
Value *HeadOfChain = I->getOperand(0);
// Mark this as handled.
SeenChainsForSExt[HeadOfChain] = nullptr;
@@ -5701,7 +6063,7 @@ bool CodeGenPrepare::optimizeExtUses(Instruction *I) {
return false;
// Only do this xform if truncating is free.
- if (TLI && !TLI->isTruncateFree(I->getType(), Src->getType()))
+ if (!TLI->isTruncateFree(I->getType(), Src->getType()))
return false;
// Only safe to perform the optimization if the source is also defined in
@@ -5947,7 +6309,8 @@ static bool sinkSelectOperand(const TargetTransformInfo *TTI, Value *V) {
// If it's safe to speculatively execute, then it should not have side
// effects; therefore, it's safe to sink and possibly *not* execute.
return I && I->hasOneUse() && isSafeToSpeculativelyExecute(I) &&
- TTI->getUserCost(I) >= TargetTransformInfo::TCC_Expensive;
+ TTI->getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency) >=
+ TargetTransformInfo::TCC_Expensive;
}
/// Returns true if a SelectInst should be turned into an explicit branch.
@@ -6044,13 +6407,47 @@ bool CodeGenPrepare::optimizeShiftInst(BinaryOperator *Shift) {
return true;
}
+bool CodeGenPrepare::optimizeFunnelShift(IntrinsicInst *Fsh) {
+ Intrinsic::ID Opcode = Fsh->getIntrinsicID();
+ assert((Opcode == Intrinsic::fshl || Opcode == Intrinsic::fshr) &&
+ "Expected a funnel shift");
+
+ // If this is (1) a vector funnel shift, (2) shifts by scalars are cheaper
+ // than general vector shifts, and (3) the shift amount is select-of-splatted
+ // values, hoist the funnel shifts before the select:
+ // fsh Op0, Op1, (select Cond, TVal, FVal) -->
+ // select Cond, (fsh Op0, Op1, TVal), (fsh Op0, Op1, FVal)
+ //
+ // This is inverting a generic IR transform when we know that the cost of a
+ // general vector shift is more than the cost of 2 shift-by-scalars.
+ // We can't do this effectively in SDAG because we may not be able to
+ // determine if the select operands are splats from within a basic block.
+ Type *Ty = Fsh->getType();
+ if (!Ty->isVectorTy() || !TLI->isVectorShiftByScalarCheap(Ty))
+ return false;
+ Value *Cond, *TVal, *FVal;
+ if (!match(Fsh->getOperand(2),
+ m_OneUse(m_Select(m_Value(Cond), m_Value(TVal), m_Value(FVal)))))
+ return false;
+ if (!isSplatValue(TVal) || !isSplatValue(FVal))
+ return false;
+
+ IRBuilder<> Builder(Fsh);
+ Value *X = Fsh->getOperand(0), *Y = Fsh->getOperand(1);
+ Value *NewTVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, TVal });
+ Value *NewFVal = Builder.CreateIntrinsic(Opcode, Ty, { X, Y, FVal });
+ Value *NewSel = Builder.CreateSelect(Cond, NewTVal, NewFVal);
+ Fsh->replaceAllUsesWith(NewSel);
+ Fsh->eraseFromParent();
+ return true;
+}
+
/// If we have a SelectInst that will likely profit from branch prediction,
/// turn it into a branch.
bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
// If branch conversion isn't desirable, exit early.
- if (DisableSelectToBranch ||
- OptSize || llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()) ||
- !TLI)
+ if (DisableSelectToBranch || OptSize ||
+ llvm::shouldOptimizeForSize(SI->getParent(), PSI, BFI.get()))
return false;
// Find all consecutive select instructions that share the same condition.
@@ -6103,7 +6500,8 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
// Into:
// start:
// %cmp = cmp uge i32 %a, %b
- // br i1 %cmp, label %select.true, label %select.false
+ // %cmp.frozen = freeze %cmp
+ // br i1 %cmp.frozen, label %select.true, label %select.false
// select.true:
// br label %select.end
// select.false:
@@ -6111,6 +6509,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
// select.end:
// %sel = phi i32 [ %c, %select.true ], [ %d, %select.false ]
//
+ // %cmp should be frozen, otherwise it may introduce undefined behavior.
// In addition, we may sink instructions that produce %c or %d from
// the entry block into the destination(s) of the new branch.
// If the true or false blocks do not contain a sunken instruction, that
@@ -6122,6 +6521,7 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
BasicBlock *StartBlock = SI->getParent();
BasicBlock::iterator SplitPt = ++(BasicBlock::iterator(LastSI));
BasicBlock *EndBlock = StartBlock->splitBasicBlock(SplitPt, "select.end");
+ BFI->setBlockFreq(EndBlock, BFI->getBlockFreq(StartBlock).getFrequency());
// Delete the unconditional branch that was just created by the split.
StartBlock->getTerminator()->eraseFromParent();
@@ -6188,7 +6588,9 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
TT = TrueBlock;
FT = FalseBlock;
}
- IRBuilder<>(SI).CreateCondBr(SI->getCondition(), TT, FT, SI);
+ IRBuilder<> IB(SI);
+ auto *CondFr = IB.CreateFreeze(SI->getCondition(), SI->getName() + ".frozen");
+ IB.CreateCondBr(CondFr, TT, FT, SI);
SmallPtrSet<const Instruction *, 2> INS;
INS.insert(ASI.begin(), ASI.end());
@@ -6215,79 +6617,54 @@ bool CodeGenPrepare::optimizeSelectInst(SelectInst *SI) {
return true;
}
-static bool isBroadcastShuffle(ShuffleVectorInst *SVI) {
- SmallVector<int, 16> Mask(SVI->getShuffleMask());
- int SplatElem = -1;
- for (unsigned i = 0; i < Mask.size(); ++i) {
- if (SplatElem != -1 && Mask[i] != -1 && Mask[i] != SplatElem)
- return false;
- SplatElem = Mask[i];
- }
-
- return true;
-}
-
-/// Some targets have expensive vector shifts if the lanes aren't all the same
-/// (e.g. x86 only introduced "vpsllvd" and friends with AVX2). In these cases
-/// it's often worth sinking a shufflevector splat down to its use so that
-/// codegen can spot all lanes are identical.
+/// Some targets only accept certain types for splat inputs. For example, a VDUP
+/// in MVE takes a GPR (integer) register, and instructions that incorporate
+/// a VDUP (such as a VADD qd, qm, rm) also require a GPR register.
bool CodeGenPrepare::optimizeShuffleVectorInst(ShuffleVectorInst *SVI) {
- BasicBlock *DefBB = SVI->getParent();
-
- // Only do this xform if variable vector shifts are particularly expensive.
- if (!TLI || !TLI->isVectorShiftByScalarCheap(SVI->getType()))
+ if (!match(SVI, m_Shuffle(m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
return false;
-
- // We only expect better codegen by sinking a shuffle if we can recognise a
- // constant splat.
- if (!isBroadcastShuffle(SVI))
+ Type *NewType = TLI->shouldConvertSplatType(SVI);
+ if (!NewType)
return false;
- // InsertedShuffles - Only insert a shuffle in each block once.
- DenseMap<BasicBlock*, Instruction*> InsertedShuffles;
-
- bool MadeChange = false;
- for (User *U : SVI->users()) {
- Instruction *UI = cast<Instruction>(U);
-
- // Figure out which BB this ext is used in.
- BasicBlock *UserBB = UI->getParent();
- if (UserBB == DefBB) continue;
-
- // For now only apply this when the splat is used by a shift instruction.
- if (!UI->isShift()) continue;
-
- // Everything checks out, sink the shuffle if the user's block doesn't
- // already have a copy.
- Instruction *&InsertedShuffle = InsertedShuffles[UserBB];
+ auto *SVIVecType = cast<FixedVectorType>(SVI->getType());
+ assert(!NewType->isVectorTy() && "Expected a scalar type!");
+ assert(NewType->getScalarSizeInBits() == SVIVecType->getScalarSizeInBits() &&
+ "Expected a type of the same size!");
+ auto *NewVecType =
+ FixedVectorType::get(NewType, SVIVecType->getNumElements());
+
+ // Create a bitcast (shuffle (insert (bitcast(..))))
+ IRBuilder<> Builder(SVI->getContext());
+ Builder.SetInsertPoint(SVI);
+ Value *BC1 = Builder.CreateBitCast(
+ cast<Instruction>(SVI->getOperand(0))->getOperand(1), NewType);
+ Value *Insert = Builder.CreateInsertElement(UndefValue::get(NewVecType), BC1,
+ (uint64_t)0);
+ Value *Shuffle = Builder.CreateShuffleVector(
+ Insert, UndefValue::get(NewVecType), SVI->getShuffleMask());
+ Value *BC2 = Builder.CreateBitCast(Shuffle, SVIVecType);
+
+ SVI->replaceAllUsesWith(BC2);
+ RecursivelyDeleteTriviallyDeadInstructions(SVI);
+
+  // Also hoist the bitcast up to its operand if they are not in the same
+ // block.
+ if (auto *BCI = dyn_cast<Instruction>(BC1))
+ if (auto *Op = dyn_cast<Instruction>(BCI->getOperand(0)))
+ if (BCI->getParent() != Op->getParent() && !isa<PHINode>(Op) &&
+ !Op->isTerminator() && !Op->isEHPad())
+ BCI->moveAfter(Op);
- if (!InsertedShuffle) {
- BasicBlock::iterator InsertPt = UserBB->getFirstInsertionPt();
- assert(InsertPt != UserBB->end());
- InsertedShuffle =
- new ShuffleVectorInst(SVI->getOperand(0), SVI->getOperand(1),
- SVI->getOperand(2), "", &*InsertPt);
- InsertedShuffle->setDebugLoc(SVI->getDebugLoc());
- }
-
- UI->replaceUsesOfWith(SVI, InsertedShuffle);
- MadeChange = true;
- }
-
- // If we removed all uses, nuke the shuffle.
- if (SVI->use_empty()) {
- SVI->eraseFromParent();
- MadeChange = true;
- }
-
- return MadeChange;
+ return true;
}
bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
// If the operands of I can be folded into a target instruction together with
// I, duplicate and sink them.
SmallVector<Use *, 4> OpsToSink;
- if (!TLI || !TLI->shouldSinkOperands(I, OpsToSink))
+ if (!TLI->shouldSinkOperands(I, OpsToSink))
return false;
// OpsToSink can contain multiple uses in a use chain (e.g.
@@ -6340,9 +6717,6 @@ bool CodeGenPrepare::tryToSinkFreeOperands(Instruction *I) {
}
bool CodeGenPrepare::optimizeSwitchInst(SwitchInst *SI) {
- if (!TLI || !DL)
- return false;
-
Value *Cond = SI->getCondition();
Type *OldType = Cond->getType();
LLVMContext &Context = Cond->getContext();
@@ -6494,6 +6868,8 @@ class VectorPromoteHelper {
uint64_t ScalarCost =
TTI.getVectorInstrCost(Transition->getOpcode(), PromotedType, Index);
uint64_t VectorCost = StoreExtractCombineCost;
+ enum TargetTransformInfo::TargetCostKind CostKind =
+ TargetTransformInfo::TCK_RecipThroughput;
for (const auto &Inst : InstsToBePromoted) {
// Compute the cost.
// By construction, all instructions being promoted are arithmetic ones.
@@ -6509,8 +6885,9 @@ class VectorPromoteHelper {
!IsArg0Constant ? TargetTransformInfo::OK_UniformConstantValue
: TargetTransformInfo::OK_AnyValue;
ScalarCost += TTI.getArithmeticInstrCost(
- Inst->getOpcode(), Inst->getType(), Arg0OVK, Arg1OVK);
+ Inst->getOpcode(), Inst->getType(), CostKind, Arg0OVK, Arg1OVK);
VectorCost += TTI.getArithmeticInstrCost(Inst->getOpcode(), PromotedType,
+ CostKind,
Arg0OVK, Arg1OVK);
}
LLVM_DEBUG(
@@ -6539,19 +6916,23 @@ class VectorPromoteHelper {
UseSplat = true;
}
- unsigned End = getTransitionType()->getVectorNumElements();
+ ElementCount EC = cast<VectorType>(getTransitionType())->getElementCount();
if (UseSplat)
- return ConstantVector::getSplat(End, Val);
-
- SmallVector<Constant *, 4> ConstVec;
- UndefValue *UndefVal = UndefValue::get(Val->getType());
- for (unsigned Idx = 0; Idx != End; ++Idx) {
- if (Idx == ExtractIdx)
- ConstVec.push_back(Val);
- else
- ConstVec.push_back(UndefVal);
- }
- return ConstantVector::get(ConstVec);
+ return ConstantVector::getSplat(EC, Val);
+
+ if (!EC.Scalable) {
+ SmallVector<Constant *, 4> ConstVec;
+ UndefValue *UndefVal = UndefValue::get(Val->getType());
+ for (unsigned Idx = 0; Idx != EC.Min; ++Idx) {
+ if (Idx == ExtractIdx)
+ ConstVec.push_back(Val);
+ else
+ ConstVec.push_back(UndefVal);
+ }
+ return ConstantVector::get(ConstVec);
+ } else
+ llvm_unreachable(
+ "Generate scalable vector for non-splat is unimplemented");
}
/// Check if promoting to a vector type an operand at \p OperandIdx
@@ -6706,7 +7087,7 @@ void VectorPromoteHelper::promoteImpl(Instruction *ToBePromoted) {
/// has this feature and this is profitable.
bool CodeGenPrepare::optimizeExtractElementInst(Instruction *Inst) {
unsigned CombineCost = std::numeric_limits<unsigned>::max();
- if (DisableStoreExtract || !TLI ||
+ if (DisableStoreExtract ||
(!StressStoreExtract &&
!TLI->canCombineStoreAndExtract(Inst->getOperand(0)->getType(),
Inst->getOperand(1), CombineCost)))
@@ -6793,6 +7174,14 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
const TargetLowering &TLI) {
// Handle simple but common cases only.
Type *StoreType = SI.getValueOperand()->getType();
+
+ // The code below assumes shifting a value by <number of bits>,
+ // whereas scalable vectors would have to be shifted by
+  // <log2(vscale) + number of bits> in order to store the
+ // low/high parts. Bailing out for now.
+ if (isa<ScalableVectorType>(StoreType))
+ return false;
+
if (!DL.typeSizeEqualsStoreSize(StoreType) ||
DL.getTypeSizeInBits(StoreType) == 0)
return false;
@@ -6856,12 +7245,19 @@ static bool splitMergedValStore(StoreInst &SI, const DataLayout &DL,
Value *Addr = Builder.CreateBitCast(
SI.getOperand(1),
SplitStoreType->getPointerTo(SI.getPointerAddressSpace()));
- if ((IsLE && Upper) || (!IsLE && !Upper))
+ Align Alignment = SI.getAlign();
+ const bool IsOffsetStore = (IsLE && Upper) || (!IsLE && !Upper);
+ if (IsOffsetStore) {
Addr = Builder.CreateGEP(
SplitStoreType, Addr,
ConstantInt::get(Type::getInt32Ty(SI.getContext()), 1));
- Builder.CreateAlignedStore(
- V, Addr, Upper ? SI.getAlignment() / 2 : SI.getAlignment());
+
+ // When splitting the store in half, naturally one half will retain the
+ // alignment of the original wider store, regardless of whether it was
+ // over-aligned or not, while the other will require adjustment.
+ Alignment = commonAlignment(Alignment, HalfValBitSize / 8);
+ }
+ Builder.CreateAlignedStore(V, Addr, Alignment);
};
CreateSplitStore(LValue, false);
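A quick numeric check of the alignment logic above (my own example, not taken from the patch): splitting an 8-byte-aligned i64 store into two i32 halves leaves the half at byte offset 4 with only 4-byte alignment, which is exactly what commonAlignment computes.

#include "llvm/Support/Alignment.h"
#include <cassert>

// HalfValBitSize for an i64 split is 32, so the offset half sits 4 bytes in.
static void checkSplitStoreAlignment() {
  llvm::Align Orig(8);
  llvm::Align HalfAlign = llvm::commonAlignment(Orig, /*Offset=*/32 / 8);
  assert(HalfAlign == llvm::Align(4) &&
         "the offset half only inherits 4-byte alignment");
}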
@@ -6950,7 +7346,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
return false;
ConstantInt *GEPIIdx = cast<ConstantInt>(GEPI->getOperand(1));
// Check that GEPI is a cheap one.
- if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType())
+ if (TTI->getIntImmCost(GEPIIdx->getValue(), GEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency)
> TargetTransformInfo::TCC_Basic)
return false;
Value *GEPIOp = GEPI->getOperand(0);
@@ -6999,7 +7396,8 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
cast<ConstantInt>(UGEPI->getOperand(1))->getType())
return false;
ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
- if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType())
+ if (TTI->getIntImmCost(UGEPIIdx->getValue(), UGEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency)
> TargetTransformInfo::TCC_Basic)
return false;
UGEPIs.push_back(UGEPI);
@@ -7010,7 +7408,9 @@ static bool tryUnmergingGEPsAcrossIndirectBr(GetElementPtrInst *GEPI,
for (GetElementPtrInst *UGEPI : UGEPIs) {
ConstantInt *UGEPIIdx = cast<ConstantInt>(UGEPI->getOperand(1));
APInt NewIdx = UGEPIIdx->getValue() - GEPIIdx->getValue();
- unsigned ImmCost = TTI->getIntImmCost(NewIdx, GEPIIdx->getType());
+ unsigned ImmCost =
+ TTI->getIntImmCost(NewIdx, GEPIIdx->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
if (ImmCost > TargetTransformInfo::TCC_Basic)
return false;
}
@@ -7067,16 +7467,15 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
if (isa<Constant>(CI->getOperand(0)))
return false;
- if (TLI && OptimizeNoopCopyExpression(CI, *TLI, *DL))
+ if (OptimizeNoopCopyExpression(CI, *TLI, *DL))
return true;
if (isa<ZExtInst>(I) || isa<SExtInst>(I)) {
/// Sink a zext or sext into its user blocks if the target type doesn't
/// fit in one register
- if (TLI &&
- TLI->getTypeAction(CI->getContext(),
+ if (TLI->getTypeAction(CI->getContext(),
TLI->getValueType(*DL, CI->getType())) ==
- TargetLowering::TypeExpandInteger) {
+ TargetLowering::TypeExpandInteger) {
return SinkCast(CI);
} else {
bool MadeChange = optimizeExt(I);
@@ -7087,30 +7486,24 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
}
if (auto *Cmp = dyn_cast<CmpInst>(I))
- if (TLI && optimizeCmp(Cmp, ModifiedDT))
+ if (optimizeCmp(Cmp, ModifiedDT))
return true;
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
LI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
- if (TLI) {
- bool Modified = optimizeLoadExt(LI);
- unsigned AS = LI->getPointerAddressSpace();
- Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
- return Modified;
- }
- return false;
+ bool Modified = optimizeLoadExt(LI);
+ unsigned AS = LI->getPointerAddressSpace();
+ Modified |= optimizeMemoryInst(I, I->getOperand(0), LI->getType(), AS);
+ return Modified;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (TLI && splitMergedValStore(*SI, *DL, *TLI))
+ if (splitMergedValStore(*SI, *DL, *TLI))
return true;
SI->setMetadata(LLVMContext::MD_invariant_group, nullptr);
- if (TLI) {
- unsigned AS = SI->getPointerAddressSpace();
- return optimizeMemoryInst(I, SI->getOperand(1),
- SI->getOperand(0)->getType(), AS);
- }
- return false;
+ unsigned AS = SI->getPointerAddressSpace();
+ return optimizeMemoryInst(I, SI->getOperand(1),
+ SI->getOperand(0)->getType(), AS);
}
if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
@@ -7127,15 +7520,14 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
BinaryOperator *BinOp = dyn_cast<BinaryOperator>(I);
- if (BinOp && (BinOp->getOpcode() == Instruction::And) &&
- EnableAndCmpSinking && TLI)
+ if (BinOp && (BinOp->getOpcode() == Instruction::And) && EnableAndCmpSinking)
return sinkAndCmp0Expression(BinOp, *TLI, InsertedInsts);
// TODO: Move this into the switch on opcode - it handles shifts already.
if (BinOp && (BinOp->getOpcode() == Instruction::AShr ||
BinOp->getOpcode() == Instruction::LShr)) {
ConstantInt *CI = dyn_cast<ConstantInt>(BinOp->getOperand(1));
- if (TLI && CI && TLI->hasExtractBitsInsn())
+ if (CI && TLI->hasExtractBitsInsn())
if (OptimizeExtractBits(BinOp, CI, *TLI, *DL))
return true;
}
@@ -7158,6 +7550,35 @@ bool CodeGenPrepare::optimizeInst(Instruction *I, bool &ModifiedDT) {
return false;
}
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(I)) {
+    // freeze(icmp a, const) -> icmp (freeze a), const
+ // This helps generate efficient conditional jumps.
+ Instruction *CmpI = nullptr;
+ if (ICmpInst *II = dyn_cast<ICmpInst>(FI->getOperand(0)))
+ CmpI = II;
+ else if (FCmpInst *F = dyn_cast<FCmpInst>(FI->getOperand(0)))
+ CmpI = F->getFastMathFlags().none() ? F : nullptr;
+
+ if (CmpI && CmpI->hasOneUse()) {
+ auto Op0 = CmpI->getOperand(0), Op1 = CmpI->getOperand(1);
+ bool Const0 = isa<ConstantInt>(Op0) || isa<ConstantFP>(Op0) ||
+ isa<ConstantPointerNull>(Op0);
+ bool Const1 = isa<ConstantInt>(Op1) || isa<ConstantFP>(Op1) ||
+ isa<ConstantPointerNull>(Op1);
+ if (Const0 || Const1) {
+ if (!Const0 || !Const1) {
+ auto *F = new FreezeInst(Const0 ? Op1 : Op0, "", CmpI);
+ F->takeName(FI);
+ CmpI->setOperand(Const0 ? 1 : 0, F);
+ }
+ FI->replaceAllUsesWith(CmpI);
+ FI->eraseFromParent();
+ return true;
+ }
+ }
+ return false;
+ }
+
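Restating the freeze(icmp) rewrite above as a standalone sketch (my own illustration: it assumes an integer compare whose second operand is the constant, and it builds a fresh compare rather than mutating the existing one in place as the code above does):

#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// freeze(icmp pred %a, C)  ==>  icmp pred (freeze %a), C
// Freezing the non-constant operand instead of the i1 result keeps the
// compare visible to the branch lowering that follows.
static Value *sinkFreezeIntoICmp(FreezeInst *FI, ICmpInst *Cmp) {
  IRBuilder<> B(FI);
  Value *FrozenLHS = B.CreateFreeze(Cmp->getOperand(0), Cmp->getName() + ".fr");
  Value *NewCmp = B.CreateICmp(Cmp->getPredicate(), FrozenLHS,
                               Cmp->getOperand(1));
  FI->replaceAllUsesWith(NewCmp);
  return NewCmp;
}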
if (tryToSinkFreeOperands(I))
return true;
@@ -7214,7 +7635,7 @@ bool CodeGenPrepare::optimizeBlock(BasicBlock &BB, bool &ModifiedDT) {
}
bool MadeBitReverse = true;
- while (TLI && MadeBitReverse) {
+ while (MadeBitReverse) {
MadeBitReverse = false;
for (auto &I : reverse(BB)) {
if (makeBitReverse(I, *DL, *TLI)) {
@@ -7326,7 +7747,7 @@ static void scaleWeights(uint64_t &NewTrue, uint64_t &NewFalse) {
/// FIXME: Remove the (equivalent?) implementation in SelectionDAG.
///
bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
- if (!TM || !TM->Options.EnableFastISel || !TLI || TLI->isJumpExpensive())
+ if (!TM->Options.EnableFastISel || TLI->isJumpExpensive())
return false;
bool MadeChange = false;
@@ -7367,7 +7788,7 @@ bool CodeGenPrepare::splitBranchCondition(Function &F, bool &ModifiedDT) {
LLVM_DEBUG(dbgs() << "Before branch condition splitting\n"; BB.dump());
// Create a new BB.
- auto TmpBB =
+ auto *TmpBB =
BasicBlock::Create(BB.getContext(), BB.getName() + ".cond.split",
BB.getParent(), BB.getNextNode());
diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
new file mode 100644
index 000000000000..12dadf97e02c
--- /dev/null
+++ b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -0,0 +1,634 @@
+//===-- CommandFlags.cpp - Command Line Flags Interface ---------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains codegen-specific flags that are shared between different
+// command line tools. The tools "llc" and "opt" both use this file to prevent
+// flag duplication.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/CommandFlags.h"
+#include "llvm/IR/Module.h"
+#include "llvm/MC/SubtargetFeature.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Host.h"
+
+using namespace llvm;
+
+#define CGOPT(TY, NAME) \
+ static cl::opt<TY> *NAME##View; \
+ TY codegen::get##NAME() { \
+ assert(NAME##View && "RegisterCodeGenFlags not created."); \
+ return *NAME##View; \
+ }
+
+#define CGLIST(TY, NAME) \
+ static cl::list<TY> *NAME##View; \
+ std::vector<TY> codegen::get##NAME() { \
+ assert(NAME##View && "RegisterCodeGenFlags not created."); \
+ return *NAME##View; \
+ }
+
+#define CGOPT_EXP(TY, NAME) \
+ CGOPT(TY, NAME) \
+ Optional<TY> codegen::getExplicit##NAME() { \
+ if (NAME##View->getNumOccurrences()) { \
+ TY res = *NAME##View; \
+ return res; \
+ } \
+ return None; \
+ }
+
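To make the indirection concrete, a single instantiation such as CGOPT(bool, EnableUnsafeFPMath) expands to roughly the following (hand-expanded for illustration, assuming the file's own includes and using-declarations):

static cl::opt<bool> *EnableUnsafeFPMathView;
bool codegen::getEnableUnsafeFPMath() {
  assert(EnableUnsafeFPMathView && "RegisterCodeGenFlags not created.");
  return *EnableUnsafeFPMathView;
}

The cl::opt object itself is only constructed inside RegisterCodeGenFlags(), so tools that never instantiate that class neither register nor pay for these flags.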
+CGOPT(std::string, MArch)
+CGOPT(std::string, MCPU)
+CGLIST(std::string, MAttrs)
+CGOPT_EXP(Reloc::Model, RelocModel)
+CGOPT(ThreadModel::Model, ThreadModel)
+CGOPT_EXP(CodeModel::Model, CodeModel)
+CGOPT(ExceptionHandling, ExceptionModel)
+CGOPT_EXP(CodeGenFileType, FileType)
+CGOPT(FramePointer::FP, FramePointerUsage)
+CGOPT(bool, EnableUnsafeFPMath)
+CGOPT(bool, EnableNoInfsFPMath)
+CGOPT(bool, EnableNoNaNsFPMath)
+CGOPT(bool, EnableNoSignedZerosFPMath)
+CGOPT(bool, EnableNoTrappingFPMath)
+CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath)
+CGOPT(DenormalMode::DenormalModeKind, DenormalFP32Math)
+CGOPT(bool, EnableHonorSignDependentRoundingFPMath)
+CGOPT(FloatABI::ABIType, FloatABIForCalls)
+CGOPT(FPOpFusion::FPOpFusionMode, FuseFPOps)
+CGOPT(bool, DontPlaceZerosInBSS)
+CGOPT(bool, EnableGuaranteedTailCallOpt)
+CGOPT(bool, DisableTailCalls)
+CGOPT(bool, StackSymbolOrdering)
+CGOPT(unsigned, OverrideStackAlignment)
+CGOPT(bool, StackRealign)
+CGOPT(std::string, TrapFuncName)
+CGOPT(bool, UseCtors)
+CGOPT(bool, RelaxELFRelocations)
+CGOPT_EXP(bool, DataSections)
+CGOPT_EXP(bool, FunctionSections)
+CGOPT(std::string, BBSections)
+CGOPT(unsigned, TLSSize)
+CGOPT(bool, EmulatedTLS)
+CGOPT(bool, UniqueSectionNames)
+CGOPT(bool, UniqueBasicBlockSectionNames)
+CGOPT(EABI, EABIVersion)
+CGOPT(DebuggerKind, DebuggerTuningOpt)
+CGOPT(bool, EnableStackSizeSection)
+CGOPT(bool, EnableAddrsig)
+CGOPT(bool, EmitCallSiteInfo)
+CGOPT(bool, EnableDebugEntryValues)
+CGOPT(bool, ForceDwarfFrameSection)
+CGOPT(bool, XRayOmitFunctionIndex)
+
+codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
+#define CGBINDOPT(NAME) \
+ do { \
+ NAME##View = std::addressof(NAME); \
+ } while (0)
+
+ static cl::opt<std::string> MArch(
+ "march", cl::desc("Architecture to generate code for (see --version)"));
+ CGBINDOPT(MArch);
+
+ static cl::opt<std::string> MCPU(
+ "mcpu", cl::desc("Target a specific cpu type (-mcpu=help for details)"),
+ cl::value_desc("cpu-name"), cl::init(""));
+ CGBINDOPT(MCPU);
+
+ static cl::list<std::string> MAttrs(
+ "mattr", cl::CommaSeparated,
+ cl::desc("Target specific attributes (-mattr=help for details)"),
+ cl::value_desc("a1,+a2,-a3,..."));
+ CGBINDOPT(MAttrs);
+
+ static cl::opt<Reloc::Model> RelocModel(
+ "relocation-model", cl::desc("Choose relocation model"),
+ cl::values(
+ clEnumValN(Reloc::Static, "static", "Non-relocatable code"),
+ clEnumValN(Reloc::PIC_, "pic",
+ "Fully relocatable, position independent code"),
+ clEnumValN(Reloc::DynamicNoPIC, "dynamic-no-pic",
+ "Relocatable external references, non-relocatable code"),
+ clEnumValN(
+ Reloc::ROPI, "ropi",
+ "Code and read-only data relocatable, accessed PC-relative"),
+ clEnumValN(
+ Reloc::RWPI, "rwpi",
+ "Read-write data relocatable, accessed relative to static base"),
+ clEnumValN(Reloc::ROPI_RWPI, "ropi-rwpi",
+ "Combination of ropi and rwpi")));
+ CGBINDOPT(RelocModel);
+
+ static cl::opt<ThreadModel::Model> ThreadModel(
+ "thread-model", cl::desc("Choose threading model"),
+ cl::init(ThreadModel::POSIX),
+ cl::values(
+ clEnumValN(ThreadModel::POSIX, "posix", "POSIX thread model"),
+ clEnumValN(ThreadModel::Single, "single", "Single thread model")));
+ CGBINDOPT(ThreadModel);
+
+ static cl::opt<CodeModel::Model> CodeModel(
+ "code-model", cl::desc("Choose code model"),
+ cl::values(clEnumValN(CodeModel::Tiny, "tiny", "Tiny code model"),
+ clEnumValN(CodeModel::Small, "small", "Small code model"),
+ clEnumValN(CodeModel::Kernel, "kernel", "Kernel code model"),
+ clEnumValN(CodeModel::Medium, "medium", "Medium code model"),
+ clEnumValN(CodeModel::Large, "large", "Large code model")));
+ CGBINDOPT(CodeModel);
+
+ static cl::opt<ExceptionHandling> ExceptionModel(
+ "exception-model", cl::desc("exception model"),
+ cl::init(ExceptionHandling::None),
+ cl::values(
+ clEnumValN(ExceptionHandling::None, "default",
+ "default exception handling model"),
+ clEnumValN(ExceptionHandling::DwarfCFI, "dwarf",
+ "DWARF-like CFI based exception handling"),
+ clEnumValN(ExceptionHandling::SjLj, "sjlj",
+ "SjLj exception handling"),
+ clEnumValN(ExceptionHandling::ARM, "arm", "ARM EHABI exceptions"),
+ clEnumValN(ExceptionHandling::WinEH, "wineh",
+ "Windows exception model"),
+ clEnumValN(ExceptionHandling::Wasm, "wasm",
+ "WebAssembly exception handling")));
+ CGBINDOPT(ExceptionModel);
+
+ static cl::opt<CodeGenFileType> FileType(
+ "filetype", cl::init(CGFT_AssemblyFile),
+ cl::desc(
+ "Choose a file type (not all types are supported by all targets):"),
+ cl::values(
+ clEnumValN(CGFT_AssemblyFile, "asm", "Emit an assembly ('.s') file"),
+ clEnumValN(CGFT_ObjectFile, "obj",
+ "Emit a native object ('.o') file"),
+ clEnumValN(CGFT_Null, "null",
+ "Emit nothing, for performance testing")));
+ CGBINDOPT(FileType);
+
+ static cl::opt<FramePointer::FP> FramePointerUsage(
+ "frame-pointer",
+ cl::desc("Specify frame pointer elimination optimization"),
+ cl::init(FramePointer::None),
+ cl::values(
+ clEnumValN(FramePointer::All, "all",
+ "Disable frame pointer elimination"),
+ clEnumValN(FramePointer::NonLeaf, "non-leaf",
+ "Disable frame pointer elimination for non-leaf frame"),
+ clEnumValN(FramePointer::None, "none",
+ "Enable frame pointer elimination")));
+ CGBINDOPT(FramePointerUsage);
+
+ static cl::opt<bool> EnableUnsafeFPMath(
+ "enable-unsafe-fp-math",
+ cl::desc("Enable optimizations that may decrease FP precision"),
+ cl::init(false));
+ CGBINDOPT(EnableUnsafeFPMath);
+
+ static cl::opt<bool> EnableNoInfsFPMath(
+ "enable-no-infs-fp-math",
+ cl::desc("Enable FP math optimizations that assume no +-Infs"),
+ cl::init(false));
+ CGBINDOPT(EnableNoInfsFPMath);
+
+ static cl::opt<bool> EnableNoNaNsFPMath(
+ "enable-no-nans-fp-math",
+ cl::desc("Enable FP math optimizations that assume no NaNs"),
+ cl::init(false));
+ CGBINDOPT(EnableNoNaNsFPMath);
+
+ static cl::opt<bool> EnableNoSignedZerosFPMath(
+ "enable-no-signed-zeros-fp-math",
+ cl::desc("Enable FP math optimizations that assume "
+ "the sign of 0 is insignificant"),
+ cl::init(false));
+ CGBINDOPT(EnableNoSignedZerosFPMath);
+
+ static cl::opt<bool> EnableNoTrappingFPMath(
+ "enable-no-trapping-fp-math",
+ cl::desc("Enable setting the FP exceptions build "
+ "attribute not to use exceptions"),
+ cl::init(false));
+ CGBINDOPT(EnableNoTrappingFPMath);
+
+ static const auto DenormFlagEnumOptions =
+ cl::values(clEnumValN(DenormalMode::IEEE, "ieee",
+ "IEEE 754 denormal numbers"),
+ clEnumValN(DenormalMode::PreserveSign, "preserve-sign",
+ "the sign of a flushed-to-zero number is preserved "
+ "in the sign of 0"),
+ clEnumValN(DenormalMode::PositiveZero, "positive-zero",
+ "denormals are flushed to positive zero"));
+
+  // FIXME: Doesn't have a way to specify separate input and output modes.
+ static cl::opt<DenormalMode::DenormalModeKind> DenormalFPMath(
+ "denormal-fp-math",
+ cl::desc("Select which denormal numbers the code is permitted to require"),
+ cl::init(DenormalMode::IEEE),
+ DenormFlagEnumOptions);
+ CGBINDOPT(DenormalFPMath);
+
+ static cl::opt<DenormalMode::DenormalModeKind> DenormalFP32Math(
+ "denormal-fp-math-f32",
+ cl::desc("Select which denormal numbers the code is permitted to require for float"),
+ cl::init(DenormalMode::Invalid),
+ DenormFlagEnumOptions);
+ CGBINDOPT(DenormalFP32Math);
+
+ static cl::opt<bool> EnableHonorSignDependentRoundingFPMath(
+ "enable-sign-dependent-rounding-fp-math", cl::Hidden,
+ cl::desc("Force codegen to assume rounding mode can change dynamically"),
+ cl::init(false));
+ CGBINDOPT(EnableHonorSignDependentRoundingFPMath);
+
+ static cl::opt<FloatABI::ABIType> FloatABIForCalls(
+ "float-abi", cl::desc("Choose float ABI type"),
+ cl::init(FloatABI::Default),
+ cl::values(clEnumValN(FloatABI::Default, "default",
+ "Target default float ABI type"),
+ clEnumValN(FloatABI::Soft, "soft",
+ "Soft float ABI (implied by -soft-float)"),
+ clEnumValN(FloatABI::Hard, "hard",
+ "Hard float ABI (uses FP registers)")));
+ CGBINDOPT(FloatABIForCalls);
+
+ static cl::opt<FPOpFusion::FPOpFusionMode> FuseFPOps(
+ "fp-contract", cl::desc("Enable aggressive formation of fused FP ops"),
+ cl::init(FPOpFusion::Standard),
+ cl::values(
+ clEnumValN(FPOpFusion::Fast, "fast",
+ "Fuse FP ops whenever profitable"),
+ clEnumValN(FPOpFusion::Standard, "on", "Only fuse 'blessed' FP ops."),
+ clEnumValN(FPOpFusion::Strict, "off",
+ "Only fuse FP ops when the result won't be affected.")));
+ CGBINDOPT(FuseFPOps);
+
+ static cl::opt<bool> DontPlaceZerosInBSS(
+ "nozero-initialized-in-bss",
+ cl::desc("Don't place zero-initialized symbols into bss section"),
+ cl::init(false));
+ CGBINDOPT(DontPlaceZerosInBSS);
+
+ static cl::opt<bool> EnableGuaranteedTailCallOpt(
+ "tailcallopt",
+ cl::desc(
+ "Turn fastcc calls into tail calls by (potentially) changing ABI."),
+ cl::init(false));
+ CGBINDOPT(EnableGuaranteedTailCallOpt);
+
+ static cl::opt<bool> DisableTailCalls(
+ "disable-tail-calls", cl::desc("Never emit tail calls"), cl::init(false));
+ CGBINDOPT(DisableTailCalls);
+
+ static cl::opt<bool> StackSymbolOrdering(
+ "stack-symbol-ordering", cl::desc("Order local stack symbols."),
+ cl::init(true));
+ CGBINDOPT(StackSymbolOrdering);
+
+ static cl::opt<unsigned> OverrideStackAlignment(
+ "stack-alignment", cl::desc("Override default stack alignment"),
+ cl::init(0));
+ CGBINDOPT(OverrideStackAlignment);
+
+ static cl::opt<bool> StackRealign(
+ "stackrealign",
+ cl::desc("Force align the stack to the minimum alignment"),
+ cl::init(false));
+ CGBINDOPT(StackRealign);
+
+ static cl::opt<std::string> TrapFuncName(
+ "trap-func", cl::Hidden,
+ cl::desc("Emit a call to trap function rather than a trap instruction"),
+ cl::init(""));
+ CGBINDOPT(TrapFuncName);
+
+ static cl::opt<bool> UseCtors("use-ctors",
+ cl::desc("Use .ctors instead of .init_array."),
+ cl::init(false));
+ CGBINDOPT(UseCtors);
+
+ static cl::opt<bool> RelaxELFRelocations(
+ "relax-elf-relocations",
+ cl::desc(
+ "Emit GOTPCRELX/REX_GOTPCRELX instead of GOTPCREL on x86-64 ELF"),
+ cl::init(false));
+ CGBINDOPT(RelaxELFRelocations);
+
+ static cl::opt<bool> DataSections(
+ "data-sections", cl::desc("Emit data into separate sections"),
+ cl::init(false));
+ CGBINDOPT(DataSections);
+
+ static cl::opt<bool> FunctionSections(
+ "function-sections", cl::desc("Emit functions into separate sections"),
+ cl::init(false));
+ CGBINDOPT(FunctionSections);
+
+ static cl::opt<std::string> BBSections(
+ "basicblock-sections",
+ cl::desc("Emit basic blocks into separate sections"),
+ cl::value_desc("all | <function list (file)> | labels | none"),
+ cl::init("none"));
+ CGBINDOPT(BBSections);
+
+ static cl::opt<unsigned> TLSSize(
+ "tls-size", cl::desc("Bit size of immediate TLS offsets"), cl::init(0));
+ CGBINDOPT(TLSSize);
+
+ static cl::opt<bool> EmulatedTLS(
+ "emulated-tls", cl::desc("Use emulated TLS model"), cl::init(false));
+ CGBINDOPT(EmulatedTLS);
+
+ static cl::opt<bool> UniqueSectionNames(
+ "unique-section-names", cl::desc("Give unique names to every section"),
+ cl::init(true));
+ CGBINDOPT(UniqueSectionNames);
+
+ static cl::opt<bool> UniqueBasicBlockSectionNames(
+ "unique-bb-section-names",
+ cl::desc("Give unique names to every basic block section"),
+ cl::init(false));
+ CGBINDOPT(UniqueBasicBlockSectionNames);
+
+ static cl::opt<EABI> EABIVersion(
+ "meabi", cl::desc("Set EABI type (default depends on triple):"),
+ cl::init(EABI::Default),
+ cl::values(
+ clEnumValN(EABI::Default, "default", "Triple default EABI version"),
+ clEnumValN(EABI::EABI4, "4", "EABI version 4"),
+ clEnumValN(EABI::EABI5, "5", "EABI version 5"),
+ clEnumValN(EABI::GNU, "gnu", "EABI GNU")));
+ CGBINDOPT(EABIVersion);
+
+ static cl::opt<DebuggerKind> DebuggerTuningOpt(
+ "debugger-tune", cl::desc("Tune debug info for a particular debugger"),
+ cl::init(DebuggerKind::Default),
+ cl::values(
+ clEnumValN(DebuggerKind::GDB, "gdb", "gdb"),
+ clEnumValN(DebuggerKind::LLDB, "lldb", "lldb"),
+ clEnumValN(DebuggerKind::SCE, "sce", "SCE targets (e.g. PS4)")));
+ CGBINDOPT(DebuggerTuningOpt);
+
+ static cl::opt<bool> EnableStackSizeSection(
+ "stack-size-section",
+ cl::desc("Emit a section containing stack size metadata"),
+ cl::init(false));
+ CGBINDOPT(EnableStackSizeSection);
+
+ static cl::opt<bool> EnableAddrsig(
+ "addrsig", cl::desc("Emit an address-significance table"),
+ cl::init(false));
+ CGBINDOPT(EnableAddrsig);
+
+ static cl::opt<bool> EmitCallSiteInfo(
+ "emit-call-site-info",
+ cl::desc(
+ "Emit call site debug information, if debug information is enabled."),
+ cl::init(false));
+ CGBINDOPT(EmitCallSiteInfo);
+
+ static cl::opt<bool> EnableDebugEntryValues(
+ "debug-entry-values",
+ cl::desc("Enable debug info for the debug entry values."),
+ cl::init(false));
+ CGBINDOPT(EnableDebugEntryValues);
+
+ static cl::opt<bool> ForceDwarfFrameSection(
+ "force-dwarf-frame-section",
+ cl::desc("Always emit a debug frame section."), cl::init(false));
+ CGBINDOPT(ForceDwarfFrameSection);
+
+ static cl::opt<bool> XRayOmitFunctionIndex(
+ "no-xray-index", cl::desc("Don't emit xray_fn_idx section"),
+ cl::init(false));
+ CGBINDOPT(XRayOmitFunctionIndex);
+
+#undef CGBINDOPT
+
+ mc::RegisterMCTargetOptionsFlags();
+}
+
+llvm::BasicBlockSection
+codegen::getBBSectionsMode(llvm::TargetOptions &Options) {
+ if (getBBSections() == "all")
+ return BasicBlockSection::All;
+ else if (getBBSections() == "labels")
+ return BasicBlockSection::Labels;
+ else if (getBBSections() == "none")
+ return BasicBlockSection::None;
+ else {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> MBOrErr =
+ MemoryBuffer::getFile(getBBSections());
+ if (!MBOrErr) {
+ errs() << "Error loading basic block sections function list file: "
+ << MBOrErr.getError().message() << "\n";
+ } else {
+ Options.BBSectionsFuncListBuf = std::move(*MBOrErr);
+ }
+ return BasicBlockSection::List;
+ }
+}
+
+// Common utility function tightly tied to the options listed here. Initializes
+// a TargetOptions object with CodeGen flags and returns it.
+TargetOptions codegen::InitTargetOptionsFromCodeGenFlags() {
+ TargetOptions Options;
+ Options.AllowFPOpFusion = getFuseFPOps();
+ Options.UnsafeFPMath = getEnableUnsafeFPMath();
+ Options.NoInfsFPMath = getEnableNoInfsFPMath();
+ Options.NoNaNsFPMath = getEnableNoNaNsFPMath();
+ Options.NoSignedZerosFPMath = getEnableNoSignedZerosFPMath();
+ Options.NoTrappingFPMath = getEnableNoTrappingFPMath();
+
+ DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
+
+ // FIXME: Should have separate input and output flags
+ Options.setFPDenormalMode(DenormalMode(DenormKind, DenormKind));
+
+ Options.HonorSignDependentRoundingFPMathOption =
+ getEnableHonorSignDependentRoundingFPMath();
+ if (getFloatABIForCalls() != FloatABI::Default)
+ Options.FloatABIType = getFloatABIForCalls();
+ Options.NoZerosInBSS = getDontPlaceZerosInBSS();
+ Options.GuaranteedTailCallOpt = getEnableGuaranteedTailCallOpt();
+ Options.StackAlignmentOverride = getOverrideStackAlignment();
+ Options.StackSymbolOrdering = getStackSymbolOrdering();
+ Options.UseInitArray = !getUseCtors();
+ Options.RelaxELFRelocations = getRelaxELFRelocations();
+ Options.DataSections = getDataSections();
+ Options.FunctionSections = getFunctionSections();
+ Options.BBSections = getBBSectionsMode(Options);
+ Options.UniqueSectionNames = getUniqueSectionNames();
+ Options.UniqueBasicBlockSectionNames = getUniqueBasicBlockSectionNames();
+ Options.TLSSize = getTLSSize();
+ Options.EmulatedTLS = getEmulatedTLS();
+ Options.ExplicitEmulatedTLS = EmulatedTLSView->getNumOccurrences() > 0;
+ Options.ExceptionModel = getExceptionModel();
+ Options.EmitStackSizeSection = getEnableStackSizeSection();
+ Options.EmitAddrsig = getEnableAddrsig();
+ Options.EmitCallSiteInfo = getEmitCallSiteInfo();
+ Options.EnableDebugEntryValues = getEnableDebugEntryValues();
+ Options.ForceDwarfFrameSection = getForceDwarfFrameSection();
+ Options.XRayOmitFunctionIndex = getXRayOmitFunctionIndex();
+
+ Options.MCOptions = mc::InitMCTargetOptionsFromFlags();
+
+ Options.ThreadModel = getThreadModel();
+ Options.EABIVersion = getEABIVersion();
+ Options.DebuggerTuning = getDebuggerTuningOpt();
+
+ return Options;
+}
+
+std::string codegen::getCPUStr() {
+  // If user asked for the 'native' CPU, autodetect here. If autodetection fails,
+ // this will set the CPU to an empty string which tells the target to
+ // pick a basic default.
+ if (getMCPU() == "native")
+ return std::string(sys::getHostCPUName());
+
+ return getMCPU();
+}
+
+std::string codegen::getFeaturesStr() {
+ SubtargetFeatures Features;
+
+ // If user asked for the 'native' CPU, we need to autodetect features.
+ // This is necessary for x86 where the CPU might not support all the
+ // features the autodetected CPU name lists in the target. For example,
+ // not all Sandybridge processors support AVX.
+ if (getMCPU() == "native") {
+ StringMap<bool> HostFeatures;
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (auto &F : HostFeatures)
+ Features.AddFeature(F.first(), F.second);
+ }
+
+ for (auto const &MAttr : getMAttrs())
+ Features.AddFeature(MAttr);
+
+ return Features.getString();
+}
+
+std::vector<std::string> codegen::getFeatureList() {
+ SubtargetFeatures Features;
+
+ // If user asked for the 'native' CPU, we need to autodetect features.
+ // This is necessary for x86 where the CPU might not support all the
+ // features the autodetected CPU name lists in the target. For example,
+ // not all Sandybridge processors support AVX.
+ if (getMCPU() == "native") {
+ StringMap<bool> HostFeatures;
+ if (sys::getHostCPUFeatures(HostFeatures))
+ for (auto &F : HostFeatures)
+ Features.AddFeature(F.first(), F.second);
+ }
+
+ for (auto const &MAttr : getMAttrs())
+ Features.AddFeature(MAttr);
+
+ return Features.getFeatures();
+}
+
+void codegen::renderBoolStringAttr(AttrBuilder &B, StringRef Name, bool Val) {
+ B.addAttribute(Name, Val ? "true" : "false");
+}
+
+#define HANDLE_BOOL_ATTR(CL, AttrName) \
+ do { \
+ if (CL->getNumOccurrences() > 0 && !F.hasFnAttribute(AttrName)) \
+ renderBoolStringAttr(NewAttrs, AttrName, *CL); \
+ } while (0)
+
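Hand-expanding one use for clarity (illustration only), HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math") becomes:

do {
  // Only honor the flag when it was passed explicitly and the front end has
  // not already put the attribute on the function.
  if (EnableUnsafeFPMathView->getNumOccurrences() > 0 &&
      !F.hasFnAttribute("unsafe-fp-math"))
    renderBoolStringAttr(NewAttrs, "unsafe-fp-math", *EnableUnsafeFPMathView);
} while (0);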
+/// Set function attributes of function \p F based on CPU, Features, and command
+/// line flags.
+void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
+ Function &F) {
+ auto &Ctx = F.getContext();
+ AttributeList Attrs = F.getAttributes();
+ AttrBuilder NewAttrs;
+
+ if (!CPU.empty() && !F.hasFnAttribute("target-cpu"))
+ NewAttrs.addAttribute("target-cpu", CPU);
+ if (!Features.empty()) {
+ // Append the command line features to any that are already on the function.
+ StringRef OldFeatures =
+ F.getFnAttribute("target-features").getValueAsString();
+ if (OldFeatures.empty())
+ NewAttrs.addAttribute("target-features", Features);
+ else {
+ SmallString<256> Appended(OldFeatures);
+ Appended.push_back(',');
+ Appended.append(Features);
+ NewAttrs.addAttribute("target-features", Appended);
+ }
+ }
+ if (FramePointerUsageView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("frame-pointer")) {
+ if (getFramePointerUsage() == FramePointer::All)
+ NewAttrs.addAttribute("frame-pointer", "all");
+ else if (getFramePointerUsage() == FramePointer::NonLeaf)
+ NewAttrs.addAttribute("frame-pointer", "non-leaf");
+ else if (getFramePointerUsage() == FramePointer::None)
+ NewAttrs.addAttribute("frame-pointer", "none");
+ }
+ if (DisableTailCallsView->getNumOccurrences() > 0)
+ NewAttrs.addAttribute("disable-tail-calls",
+ toStringRef(getDisableTailCalls()));
+ if (getStackRealign())
+ NewAttrs.addAttribute("stackrealign");
+
+ HANDLE_BOOL_ATTR(EnableUnsafeFPMathView, "unsafe-fp-math");
+ HANDLE_BOOL_ATTR(EnableNoInfsFPMathView, "no-infs-fp-math");
+ HANDLE_BOOL_ATTR(EnableNoNaNsFPMathView, "no-nans-fp-math");
+ HANDLE_BOOL_ATTR(EnableNoSignedZerosFPMathView, "no-signed-zeros-fp-math");
+
+ if (DenormalFPMathView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("denormal-fp-math")) {
+ DenormalMode::DenormalModeKind DenormKind = getDenormalFPMath();
+
+ // FIXME: Command line flag should expose separate input/output modes.
+ NewAttrs.addAttribute("denormal-fp-math",
+ DenormalMode(DenormKind, DenormKind).str());
+ }
+
+ if (DenormalFP32MathView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("denormal-fp-math-f32")) {
+ // FIXME: Command line flag should expose separate input/output modes.
+ DenormalMode::DenormalModeKind DenormKind = getDenormalFP32Math();
+
+ NewAttrs.addAttribute(
+ "denormal-fp-math-f32",
+ DenormalMode(DenormKind, DenormKind).str());
+ }
+
+ if (TrapFuncNameView->getNumOccurrences() > 0)
+ for (auto &B : F)
+ for (auto &I : B)
+ if (auto *Call = dyn_cast<CallInst>(&I))
+ if (const auto *F = Call->getCalledFunction())
+ if (F->getIntrinsicID() == Intrinsic::debugtrap ||
+ F->getIntrinsicID() == Intrinsic::trap)
+ Call->addAttribute(
+ AttributeList::FunctionIndex,
+ Attribute::get(Ctx, "trap-func-name", getTrapFuncName()));
+
+ // Let NewAttrs override Attrs.
+ F.setAttributes(
+ Attrs.addAttributes(Ctx, AttributeList::FunctionIndex, NewAttrs));
+}
+
+/// Set function attributes of functions in Module M based on CPU,
+/// Features, and command line flags.
+void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
+ Module &M) {
+ for (Function &F : M)
+ setFunctionAttributes(CPU, Features, F);
+}
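
A minimal sketch of how these helpers fit together in a tool's driver, assuming the codegen flags have already been parsed and a Module M exists, and assuming the CPU helper whose tail appears above is codegen::getCPUStr() as in LLVM's CommandFlags; this is an illustration of the intended flow, not code from this patch.

// Hypothetical driver snippet: resolve "native", build the feature string,
// and stamp per-function "target-cpu" / "target-features" attributes.
std::string CPUStr = codegen::getCPUStr();           // "native" -> host CPU name
std::string FeaturesStr = codegen::getFeaturesStr(); // host features + -mattr values
codegen::setFunctionAttributes(CPUStr, FeaturesStr, M);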
diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
index 8d9d48402b31..7ae42b010261 100644
--- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
+++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.cpp
@@ -14,7 +14,6 @@
#include "CriticalAntiDepBreaker.h"
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -33,9 +32,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
-#include <map>
#include <utility>
-#include <vector>
using namespace llvm;
@@ -702,3 +699,9 @@ BreakAntiDependencies(const std::vector<SUnit> &SUnits,
return Broken;
}
+
+AntiDepBreaker *
+llvm::createCriticalAntiDepBreaker(MachineFunction &MFi,
+ const RegisterClassInfo &RCI) {
+ return new CriticalAntiDepBreaker(MFi, RCI);
+}
diff --git a/llvm/lib/CodeGen/CriticalAntiDepBreaker.h b/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
index 4e127ce525c8..640506b6e9ed 100644
--- a/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
+++ b/llvm/lib/CodeGen/CriticalAntiDepBreaker.h
@@ -15,8 +15,8 @@
#ifndef LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
#define LLVM_LIB_CODEGEN_CRITICALANTIDEPBREAKER_H
-#include "AntiDepBreaker.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
#include "llvm/Support/Compiler.h"
#include <map>
#include <vector>
diff --git a/llvm/lib/CodeGen/DwarfEHPrepare.cpp b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
index af347fd7e73d..c75c957bff8a 100644
--- a/llvm/lib/CodeGen/DwarfEHPrepare.cpp
+++ b/llvm/lib/CodeGen/DwarfEHPrepare.cpp
@@ -48,6 +48,7 @@ namespace {
// RewindFunction - _Unwind_Resume or the target equivalent.
FunctionCallee RewindFunction = nullptr;
+ CodeGenOpt::Level OptLevel;
DominatorTree *DT = nullptr;
const TargetLowering *TLI = nullptr;
@@ -61,7 +62,8 @@ namespace {
public:
static char ID; // Pass identification, replacement for typeid.
- DwarfEHPrepare() : FunctionPass(ID) {}
+ DwarfEHPrepare(CodeGenOpt::Level OptLevel = CodeGenOpt::Default)
+ : FunctionPass(ID), OptLevel(OptLevel) {}
bool runOnFunction(Function &Fn) override;
@@ -89,12 +91,15 @@ INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
INITIALIZE_PASS_END(DwarfEHPrepare, DEBUG_TYPE,
"Prepare DWARF exceptions", false, false)
-FunctionPass *llvm::createDwarfEHPass() { return new DwarfEHPrepare(); }
+FunctionPass *llvm::createDwarfEHPass(CodeGenOpt::Level OptLevel) {
+ return new DwarfEHPrepare(OptLevel);
+}
void DwarfEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetPassConfig>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.addRequired<DominatorTreeWrapperPass>();
+ if (OptLevel != CodeGenOpt::None)
+ AU.addRequired<DominatorTreeWrapperPass>();
}
/// GetExceptionObject - Return the exception object from the value passed into
@@ -202,7 +207,10 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
LLVMContext &Ctx = Fn.getContext();
- size_t ResumesLeft = pruneUnreachableResumes(Fn, Resumes, CleanupLPads);
+ size_t ResumesLeft = Resumes.size();
+ if (OptLevel != CodeGenOpt::None)
+ ResumesLeft = pruneUnreachableResumes(Fn, Resumes, CleanupLPads);
+
if (ResumesLeft == 0)
return true; // We pruned them all.
@@ -259,7 +267,8 @@ bool DwarfEHPrepare::InsertUnwindResumeCalls(Function &Fn) {
bool DwarfEHPrepare::runOnFunction(Function &Fn) {
const TargetMachine &TM =
getAnalysis<TargetPassConfig>().getTM<TargetMachine>();
- DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ DT = OptLevel != CodeGenOpt::None
+ ? &getAnalysis<DominatorTreeWrapperPass>().getDomTree() : nullptr;
TLI = TM.getSubtargetImpl(Fn)->getTargetLowering();
bool Changed = InsertUnwindResumeCalls(Fn);
DT = nullptr;
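
A small usage sketch, assuming a legacy pass pipeline PM and a TargetMachine *TM set up elsewhere; the point of the change above is that a pass constructed at CodeGenOpt::None no longer requests DominatorTree and skips pruneUnreachableResumes.

// Hypothetical pipeline setup (PM is an existing legacy::PassManager).
PM.add(createDwarfEHPass(TM->getOptLevel()));
// At -O0 (CodeGenOpt::None) the pass runs without a DominatorTree and emits
// _Unwind_Resume calls without first pruning unreachable resumes.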
diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp
index d45e424184d7..96d4efb856c1 100644
--- a/llvm/lib/CodeGen/EarlyIfConversion.cpp
+++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp
@@ -91,10 +91,10 @@ public:
/// The block containing phis after the if-then-else.
MachineBasicBlock *Tail;
- /// The 'true' conditional block as determined by AnalyzeBranch.
+ /// The 'true' conditional block as determined by analyzeBranch.
MachineBasicBlock *TBB;
- /// The 'false' conditional block as determined by AnalyzeBranch.
+ /// The 'false' conditional block as determined by analyzeBranch.
MachineBasicBlock *FBB;
/// isTriangle - When there is no 'else' block, either TBB or FBB will be
@@ -121,7 +121,7 @@ public:
SmallVector<PHIInfo, 8> PHIs;
private:
- /// The branch condition determined by AnalyzeBranch.
+ /// The branch condition determined by analyzeBranch.
SmallVector<MachineOperand, 4> Cond;
/// Instructions in Head that define values used by the conditional blocks.
@@ -486,18 +486,18 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB, bool Predicate) {
// This is weird, probably some sort of degenerate CFG.
if (!TBB) {
- LLVM_DEBUG(dbgs() << "AnalyzeBranch didn't find conditional branch.\n");
+ LLVM_DEBUG(dbgs() << "analyzeBranch didn't find conditional branch.\n");
return false;
}
// Make sure the analyzed branch is conditional; one of the successors
// could be a landing pad. (Empty landing pads can be generated on Windows.)
if (Cond.empty()) {
- LLVM_DEBUG(dbgs() << "AnalyzeBranch found an unconditional branch.\n");
+ LLVM_DEBUG(dbgs() << "analyzeBranch found an unconditional branch.\n");
return false;
}
- // AnalyzeBranch doesn't set FBB on a fall-through branch.
+ // analyzeBranch doesn't set FBB on a fall-through branch.
// Make sure it is always set.
FBB = TBB == Succ0 ? Succ1 : Succ0;
@@ -520,8 +520,9 @@ bool SSAIfConv::canConvertIf(MachineBasicBlock *MBB, bool Predicate) {
assert(Register::isVirtualRegister(PI.FReg) && "Bad PHI");
// Get target information.
- if (!TII->canInsertSelect(*Head, Cond, PI.TReg, PI.FReg,
- PI.CondCycles, PI.TCycles, PI.FCycles)) {
+ if (!TII->canInsertSelect(*Head, Cond, PI.PHI->getOperand(0).getReg(),
+ PI.TReg, PI.FReg, PI.CondCycles, PI.TCycles,
+ PI.FCycles)) {
LLVM_DEBUG(dbgs() << "Can't convert: " << *PI.PHI);
return false;
}
@@ -758,7 +759,7 @@ void updateDomTree(MachineDominatorTree *DomTree, const SSAIfConv &IfConv,
assert(Node != HeadNode && "Cannot erase the head node");
while (Node->getNumChildren()) {
assert(Node->getBlock() == IfConv.Tail && "Unexpected children");
- DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ DomTree->changeImmediateDominator(Node->back(), HeadNode);
}
DomTree->eraseNode(B);
}
diff --git a/llvm/lib/CodeGen/EdgeBundles.cpp b/llvm/lib/CodeGen/EdgeBundles.cpp
index dfaf7f584652..0b2ffda50a39 100644
--- a/llvm/lib/CodeGen/EdgeBundles.cpp
+++ b/llvm/lib/CodeGen/EdgeBundles.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/EdgeBundles.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/Passes.h"
diff --git a/llvm/lib/CodeGen/ExpandMemCmp.cpp b/llvm/lib/CodeGen/ExpandMemCmp.cpp
index a1adf4ef9820..9f85db9de884 100644
--- a/llvm/lib/CodeGen/ExpandMemCmp.cpp
+++ b/llvm/lib/CodeGen/ExpandMemCmp.cpp
@@ -23,7 +23,9 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -76,7 +78,7 @@ class MemCmpExpansion {
IRBuilder<> Builder;
// Represents the decomposition in blocks of the expansion. For example,
// comparing 33 bytes on X86+sse can be done with 2x16-byte loads and
- // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {32, 1}.
+  // 1x1-byte load, which would be represented as [{16, 0}, {16, 16}, {1, 32}].
struct LoadEntry {
LoadEntry(unsigned LoadSize, uint64_t Offset)
: LoadSize(LoadSize), Offset(Offset) {
@@ -103,8 +105,12 @@ class MemCmpExpansion {
Value *getMemCmpExpansionZeroCase();
Value *getMemCmpEqZeroOneBlock();
Value *getMemCmpOneBlock();
- Value *getPtrToElementAtOffset(Value *Source, Type *LoadSizeType,
- uint64_t OffsetBytes);
+ struct LoadPair {
+ Value *Lhs = nullptr;
+ Value *Rhs = nullptr;
+ };
+ LoadPair getLoadPair(Type *LoadSizeType, bool NeedsBSwap, Type *CmpSizeType,
+ unsigned OffsetBytes);
static LoadEntryVector
computeGreedyLoadSequence(uint64_t Size, llvm::ArrayRef<unsigned> LoadSizes,
@@ -261,18 +267,56 @@ void MemCmpExpansion::createResultBlock() {
EndBlock->getParent(), EndBlock);
}
-/// Return a pointer to an element of type `LoadSizeType` at offset
-/// `OffsetBytes`.
-Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source,
- Type *LoadSizeType,
- uint64_t OffsetBytes) {
+MemCmpExpansion::LoadPair MemCmpExpansion::getLoadPair(Type *LoadSizeType,
+ bool NeedsBSwap,
+ Type *CmpSizeType,
+ unsigned OffsetBytes) {
+ // Get the memory source at offset `OffsetBytes`.
+ Value *LhsSource = CI->getArgOperand(0);
+ Value *RhsSource = CI->getArgOperand(1);
+ Align LhsAlign = LhsSource->getPointerAlignment(DL);
+ Align RhsAlign = RhsSource->getPointerAlignment(DL);
if (OffsetBytes > 0) {
auto *ByteType = Type::getInt8Ty(CI->getContext());
- Source = Builder.CreateConstGEP1_64(
- ByteType, Builder.CreateBitCast(Source, ByteType->getPointerTo()),
+ LhsSource = Builder.CreateConstGEP1_64(
+ ByteType, Builder.CreateBitCast(LhsSource, ByteType->getPointerTo()),
+ OffsetBytes);
+ RhsSource = Builder.CreateConstGEP1_64(
+ ByteType, Builder.CreateBitCast(RhsSource, ByteType->getPointerTo()),
OffsetBytes);
+ LhsAlign = commonAlignment(LhsAlign, OffsetBytes);
+ RhsAlign = commonAlignment(RhsAlign, OffsetBytes);
+ }
+ LhsSource = Builder.CreateBitCast(LhsSource, LoadSizeType->getPointerTo());
+ RhsSource = Builder.CreateBitCast(RhsSource, LoadSizeType->getPointerTo());
+
+ // Create a constant or a load from the source.
+ Value *Lhs = nullptr;
+ if (auto *C = dyn_cast<Constant>(LhsSource))
+ Lhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
+ if (!Lhs)
+ Lhs = Builder.CreateAlignedLoad(LoadSizeType, LhsSource, LhsAlign);
+
+ Value *Rhs = nullptr;
+ if (auto *C = dyn_cast<Constant>(RhsSource))
+ Rhs = ConstantFoldLoadFromConstPtr(C, LoadSizeType, DL);
+ if (!Rhs)
+ Rhs = Builder.CreateAlignedLoad(LoadSizeType, RhsSource, RhsAlign);
+
+ // Swap bytes if required.
+ if (NeedsBSwap) {
+ Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
+ Intrinsic::bswap, LoadSizeType);
+ Lhs = Builder.CreateCall(Bswap, Lhs);
+ Rhs = Builder.CreateCall(Bswap, Rhs);
+ }
+
+ // Zero extend if required.
+ if (CmpSizeType != nullptr && CmpSizeType != LoadSizeType) {
+ Lhs = Builder.CreateZExt(Lhs, CmpSizeType);
+ Rhs = Builder.CreateZExt(Rhs, CmpSizeType);
}
- return Builder.CreateBitCast(Source, LoadSizeType->getPointerTo());
+ return {Lhs, Rhs};
}
// This function creates the IR instructions for loading and comparing 1 byte.
@@ -282,18 +326,10 @@ Value *MemCmpExpansion::getPtrToElementAtOffset(Value *Source,
void MemCmpExpansion::emitLoadCompareByteBlock(unsigned BlockIndex,
unsigned OffsetBytes) {
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
- Type *LoadSizeType = Type::getInt8Ty(CI->getContext());
- Value *Source1 =
- getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType, OffsetBytes);
- Value *Source2 =
- getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType, OffsetBytes);
-
- Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
- Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, Type::getInt32Ty(CI->getContext()));
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, Type::getInt32Ty(CI->getContext()));
- Value *Diff = Builder.CreateSub(LoadSrc1, LoadSrc2);
+ const LoadPair Loads =
+ getLoadPair(Type::getInt8Ty(CI->getContext()), /*NeedsBSwap=*/false,
+ Type::getInt32Ty(CI->getContext()), OffsetBytes);
+ Value *Diff = Builder.CreateSub(Loads.Lhs, Loads.Rhs);
PhiRes->addIncoming(Diff, LoadCmpBlocks[BlockIndex]);
@@ -340,41 +376,19 @@ Value *MemCmpExpansion::getCompareLoadPairs(unsigned BlockIndex,
: IntegerType::get(CI->getContext(), MaxLoadSize * 8);
for (unsigned i = 0; i < NumLoads; ++i, ++LoadIndex) {
const LoadEntry &CurLoadEntry = LoadSequence[LoadIndex];
-
- IntegerType *LoadSizeType =
- IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8);
-
- Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
- CurLoadEntry.Offset);
- Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
- CurLoadEntry.Offset);
-
- // Get a constant or load a value for each source address.
- Value *LoadSrc1 = nullptr;
- if (auto *Source1C = dyn_cast<Constant>(Source1))
- LoadSrc1 = ConstantFoldLoadFromConstPtr(Source1C, LoadSizeType, DL);
- if (!LoadSrc1)
- LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
-
- Value *LoadSrc2 = nullptr;
- if (auto *Source2C = dyn_cast<Constant>(Source2))
- LoadSrc2 = ConstantFoldLoadFromConstPtr(Source2C, LoadSizeType, DL);
- if (!LoadSrc2)
- LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
+ const LoadPair Loads = getLoadPair(
+ IntegerType::get(CI->getContext(), CurLoadEntry.LoadSize * 8),
+ /*NeedsBSwap=*/false, MaxLoadType, CurLoadEntry.Offset);
if (NumLoads != 1) {
- if (LoadSizeType != MaxLoadType) {
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
- }
// If we have multiple loads per block, we need to generate a composite
// comparison using xor+or.
- Diff = Builder.CreateXor(LoadSrc1, LoadSrc2);
+ Diff = Builder.CreateXor(Loads.Lhs, Loads.Rhs);
Diff = Builder.CreateZExt(Diff, MaxLoadType);
XorList.push_back(Diff);
} else {
// If there's only one load per block, we just compare the loaded values.
- Cmp = Builder.CreateICmpNE(LoadSrc1, LoadSrc2);
+ Cmp = Builder.CreateICmpNE(Loads.Lhs, Loads.Rhs);
}
}
@@ -451,35 +465,18 @@ void MemCmpExpansion::emitLoadCompareBlock(unsigned BlockIndex) {
Builder.SetInsertPoint(LoadCmpBlocks[BlockIndex]);
- Value *Source1 = getPtrToElementAtOffset(CI->getArgOperand(0), LoadSizeType,
- CurLoadEntry.Offset);
- Value *Source2 = getPtrToElementAtOffset(CI->getArgOperand(1), LoadSizeType,
- CurLoadEntry.Offset);
-
- // Load LoadSizeType from the base address.
- Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
- Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- if (DL.isLittleEndian()) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
- LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
- LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
- }
-
- if (LoadSizeType != MaxLoadType) {
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, MaxLoadType);
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, MaxLoadType);
- }
+ const LoadPair Loads =
+ getLoadPair(LoadSizeType, /*NeedsBSwap=*/DL.isLittleEndian(), MaxLoadType,
+ CurLoadEntry.Offset);
// Add the loaded values to the phi nodes for calculating memcmp result only
// if result is not used in a zero equality.
if (!IsUsedForZeroCmp) {
- ResBlock.PhiSrc1->addIncoming(LoadSrc1, LoadCmpBlocks[BlockIndex]);
- ResBlock.PhiSrc2->addIncoming(LoadSrc2, LoadCmpBlocks[BlockIndex]);
+ ResBlock.PhiSrc1->addIncoming(Loads.Lhs, LoadCmpBlocks[BlockIndex]);
+ ResBlock.PhiSrc2->addIncoming(Loads.Rhs, LoadCmpBlocks[BlockIndex]);
}
- Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, LoadSrc1, LoadSrc2);
+ Value *Cmp = Builder.CreateICmp(ICmpInst::ICMP_EQ, Loads.Lhs, Loads.Rhs);
BasicBlock *NextBB = (BlockIndex == (LoadCmpBlocks.size() - 1))
? EndBlock
: LoadCmpBlocks[BlockIndex + 1];
@@ -568,42 +565,27 @@ Value *MemCmpExpansion::getMemCmpEqZeroOneBlock() {
/// the compare, branch, and phi IR that is required in the general case.
Value *MemCmpExpansion::getMemCmpOneBlock() {
Type *LoadSizeType = IntegerType::get(CI->getContext(), Size * 8);
- Value *Source1 = CI->getArgOperand(0);
- Value *Source2 = CI->getArgOperand(1);
-
- // Cast source to LoadSizeType*.
- if (Source1->getType() != LoadSizeType)
- Source1 = Builder.CreateBitCast(Source1, LoadSizeType->getPointerTo());
- if (Source2->getType() != LoadSizeType)
- Source2 = Builder.CreateBitCast(Source2, LoadSizeType->getPointerTo());
-
- // Load LoadSizeType from the base address.
- Value *LoadSrc1 = Builder.CreateLoad(LoadSizeType, Source1);
- Value *LoadSrc2 = Builder.CreateLoad(LoadSizeType, Source2);
-
- if (DL.isLittleEndian() && Size != 1) {
- Function *Bswap = Intrinsic::getDeclaration(CI->getModule(),
- Intrinsic::bswap, LoadSizeType);
- LoadSrc1 = Builder.CreateCall(Bswap, LoadSrc1);
- LoadSrc2 = Builder.CreateCall(Bswap, LoadSrc2);
- }
+ bool NeedsBSwap = DL.isLittleEndian() && Size != 1;
+ // The i8 and i16 cases don't need compares. We zext the loaded values and
+ // subtract them to get the suitable negative, zero, or positive i32 result.
if (Size < 4) {
- // The i8 and i16 cases don't need compares. We zext the loaded values and
- // subtract them to get the suitable negative, zero, or positive i32 result.
- LoadSrc1 = Builder.CreateZExt(LoadSrc1, Builder.getInt32Ty());
- LoadSrc2 = Builder.CreateZExt(LoadSrc2, Builder.getInt32Ty());
- return Builder.CreateSub(LoadSrc1, LoadSrc2);
+ const LoadPair Loads =
+ getLoadPair(LoadSizeType, NeedsBSwap, Builder.getInt32Ty(),
+ /*Offset*/ 0);
+ return Builder.CreateSub(Loads.Lhs, Loads.Rhs);
}
+ const LoadPair Loads = getLoadPair(LoadSizeType, NeedsBSwap, LoadSizeType,
+ /*Offset*/ 0);
// The result of memcmp is negative, zero, or positive, so produce that by
// subtracting 2 extended compare bits: sub (ugt, ult).
// If a target prefers to use selects to get -1/0/1, they should be able
// to transform this later. The inverse transform (going from selects to math)
// may not be possible in the DAG because the selects got converted into
// branches before we got there.
- Value *CmpUGT = Builder.CreateICmpUGT(LoadSrc1, LoadSrc2);
- Value *CmpULT = Builder.CreateICmpULT(LoadSrc1, LoadSrc2);
+ Value *CmpUGT = Builder.CreateICmpUGT(Loads.Lhs, Loads.Rhs);
+ Value *CmpULT = Builder.CreateICmpULT(Loads.Lhs, Loads.Rhs);
Value *ZextUGT = Builder.CreateZExt(CmpUGT, Builder.getInt32Ty());
Value *ZextULT = Builder.CreateZExt(CmpULT, Builder.getInt32Ty());
return Builder.CreateSub(ZextUGT, ZextULT);
@@ -843,7 +825,7 @@ bool ExpandMemCmpPass::runOnBlock(
continue;
}
LibFunc Func;
- if (TLI->getLibFunc(ImmutableCallSite(CI), Func) &&
+ if (TLI->getLibFunc(*CI, Func) &&
(Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
expandMemCmp(CI, TTI, TL, &DL, PSI, BFI)) {
return true;
@@ -869,6 +851,9 @@ PreservedAnalyses ExpandMemCmpPass::runImpl(
++BBIt;
}
}
+ if (MadeChanges)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
return MadeChanges ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
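
To make the LoadEntry comment above concrete, here is a standalone sketch of the greedy (LoadSize, Offset) decomposition it describes. It ignores the max-load-count and overlapping-load handling of the real computeGreedyLoadSequence and exists only to reproduce the 33-byte example.

#include <cstdint>
#include <utility>
#include <vector>

// Greedily cover Size bytes with the allowed load sizes, largest first.
std::vector<std::pair<unsigned, uint64_t>>
greedyLoadSequence(uint64_t Size, const std::vector<unsigned> &LoadSizes) {
  std::vector<std::pair<unsigned, uint64_t>> Seq; // {LoadSize, Offset}
  uint64_t Offset = 0;
  for (unsigned LS : LoadSizes) // LoadSizes must be sorted in decreasing order
    while (Size >= LS) {
      Seq.push_back({LS, Offset});
      Offset += LS;
      Size -= LS;
    }
  return Seq;
}

// greedyLoadSequence(33, {16, 8, 4, 2, 1}) yields {16,0}, {16,16}, {1,32},
// the decomposition from the comment above.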
diff --git a/llvm/lib/CodeGen/ExpandReductions.cpp b/llvm/lib/CodeGen/ExpandReductions.cpp
index 4ccf1d2c8c50..45f21c1085dd 100644
--- a/llvm/lib/CodeGen/ExpandReductions.cpp
+++ b/llvm/lib/CodeGen/ExpandReductions.cpp
@@ -125,7 +125,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
if (!FMF.allowReassoc())
Rdx = getOrderedReduction(Builder, Acc, Vec, getOpcode(ID), MRK);
else {
- if (!isPowerOf2_32(Vec->getType()->getVectorNumElements()))
+ if (!isPowerOf2_32(
+ cast<FixedVectorType>(Vec->getType())->getNumElements()))
continue;
Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
@@ -146,7 +147,8 @@ bool expandReductions(Function &F, const TargetTransformInfo *TTI) {
case Intrinsic::experimental_vector_reduce_fmax:
case Intrinsic::experimental_vector_reduce_fmin: {
Value *Vec = II->getArgOperand(0);
- if (!isPowerOf2_32(Vec->getType()->getVectorNumElements()))
+ if (!isPowerOf2_32(
+ cast<FixedVectorType>(Vec->getType())->getNumElements()))
continue;
Rdx = getShuffleReduction(Builder, Vec, getOpcode(ID), MRK);
diff --git a/llvm/lib/CodeGen/FEntryInserter.cpp b/llvm/lib/CodeGen/FEntryInserter.cpp
index 4c0f30bce820..c2194929e2e7 100644
--- a/llvm/lib/CodeGen/FEntryInserter.cpp
+++ b/llvm/lib/CodeGen/FEntryInserter.cpp
@@ -35,8 +35,8 @@ struct FEntryInserter : public MachineFunctionPass {
}
bool FEntryInserter::runOnMachineFunction(MachineFunction &MF) {
- const std::string FEntryName =
- MF.getFunction().getFnAttribute("fentry-call").getValueAsString();
+ const std::string FEntryName = std::string(
+ MF.getFunction().getFnAttribute("fentry-call").getValueAsString());
if (FEntryName != "true")
return false;
diff --git a/llvm/lib/CodeGen/FaultMaps.cpp b/llvm/lib/CodeGen/FaultMaps.cpp
index de0b4fa87098..23560b4cd136 100644
--- a/llvm/lib/CodeGen/FaultMaps.cpp
+++ b/llvm/lib/CodeGen/FaultMaps.cpp
@@ -57,17 +57,17 @@ void FaultMaps::serializeToFaultMapSection() {
OS.SwitchSection(FaultMapSection);
// Emit a dummy symbol to force section inclusion.
- OS.EmitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_FaultMaps")));
+ OS.emitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_FaultMaps")));
LLVM_DEBUG(dbgs() << "********** Fault Map Output **********\n");
// Header
- OS.EmitIntValue(FaultMapVersion, 1); // Version.
- OS.EmitIntValue(0, 1); // Reserved.
- OS.EmitIntValue(0, 2); // Reserved.
+ OS.emitIntValue(FaultMapVersion, 1); // Version.
+ OS.emitIntValue(0, 1); // Reserved.
+ OS.emitInt16(0); // Reserved.
LLVM_DEBUG(dbgs() << WFMP << "#functions = " << FunctionInfos.size() << "\n");
- OS.EmitIntValue(FunctionInfos.size(), 4);
+ OS.emitInt32(FunctionInfos.size());
LLVM_DEBUG(dbgs() << WFMP << "functions:\n");
@@ -80,25 +80,25 @@ void FaultMaps::emitFunctionInfo(const MCSymbol *FnLabel,
MCStreamer &OS = *AP.OutStreamer;
LLVM_DEBUG(dbgs() << WFMP << " function addr: " << *FnLabel << "\n");
- OS.EmitSymbolValue(FnLabel, 8);
+ OS.emitSymbolValue(FnLabel, 8);
LLVM_DEBUG(dbgs() << WFMP << " #faulting PCs: " << FFI.size() << "\n");
- OS.EmitIntValue(FFI.size(), 4);
+ OS.emitInt32(FFI.size());
- OS.EmitIntValue(0, 4); // Reserved
+ OS.emitInt32(0); // Reserved
for (auto &Fault : FFI) {
LLVM_DEBUG(dbgs() << WFMP << " fault type: "
<< faultTypeToString(Fault.Kind) << "\n");
- OS.EmitIntValue(Fault.Kind, 4);
+ OS.emitInt32(Fault.Kind);
LLVM_DEBUG(dbgs() << WFMP << " faulting PC offset: "
<< *Fault.FaultingOffsetExpr << "\n");
- OS.EmitValue(Fault.FaultingOffsetExpr, 4);
+ OS.emitValue(Fault.FaultingOffsetExpr, 4);
LLVM_DEBUG(dbgs() << WFMP << " fault handler PC offset: "
<< *Fault.HandlerOffsetExpr << "\n");
- OS.EmitValue(Fault.HandlerOffsetExpr, 4);
+ OS.emitValue(Fault.HandlerOffsetExpr, 4);
}
}
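
Read back from the emit calls above, the fault-map section layout can be summarized with the following illustrative structs; the field names here are made up, and the authoritative definition remains the emission code and the FaultMaps documentation.

#include <cstdint>

struct FaultMapHeader {       // emitted once per section
  uint8_t  Version;           // emitIntValue(FaultMapVersion, 1)
  uint8_t  Reserved0;         // emitIntValue(0, 1)
  uint16_t Reserved1;         // emitInt16(0)
  uint32_t NumFunctions;      // emitInt32(FunctionInfos.size())
};

struct FaultMapFunctionInfo { // one per function, followed by its faults
  uint64_t FunctionAddress;   // emitSymbolValue(FnLabel, 8)
  uint32_t NumFaultingPCs;    // emitInt32(FFI.size())
  uint32_t Reserved;          // emitInt32(0)
};

struct FaultMapFault {        // NumFaultingPCs entries per function
  uint32_t Kind;              // emitInt32(Fault.Kind)
  uint32_t FaultingPCOffset;  // emitValue(FaultingOffsetExpr, 4)
  uint32_t HandlerPCOffset;   // emitValue(HandlerOffsetExpr, 4)
};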
diff --git a/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
new file mode 100644
index 000000000000..27319804049d
--- /dev/null
+++ b/llvm/lib/CodeGen/FixupStatepointCallerSaved.cpp
@@ -0,0 +1,311 @@
+//===-- FixupStatepointCallerSaved.cpp - Fixup caller saved registers ----===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// The deopt parameters of a statepoint instruction contain values that are
+/// meaningful to the runtime and must still be readable at the moment the
+/// call returns; in other words, they are "late reads" performed by the
+/// runtime. If we could express that notion to the register allocator, it
+/// would produce the right form for us. This pass exists because we cannot
+/// describe such a late read to the register allocator, which may therefore
+/// place a value in a register clobbered by the call. The pass forces a spill
+/// of such registers and replaces the corresponding statepoint operands with
+/// references to the added spill slots.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/StackMaps.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/Statepoint.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "fixup-statepoint-caller-saved"
+STATISTIC(NumSpilledRegisters, "Number of spilled registers");
+STATISTIC(NumSpillSlotsAllocated, "Number of spill slots allocated");
+STATISTIC(NumSpillSlotsExtended, "Number of spill slots extended");
+
+static cl::opt<bool> FixupSCSExtendSlotSize(
+ "fixup-scs-extend-slot-size", cl::Hidden, cl::init(false),
+    cl::desc("Allow spill in spill slot of greater size than register size"));
+
+namespace {
+
+class FixupStatepointCallerSaved : public MachineFunctionPass {
+public:
+ static char ID;
+
+ FixupStatepointCallerSaved() : MachineFunctionPass(ID) {
+ initializeFixupStatepointCallerSavedPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ StringRef getPassName() const override {
+ return "Fixup Statepoint Caller Saved";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // End anonymous namespace.
+
+char FixupStatepointCallerSaved::ID = 0;
+char &llvm::FixupStatepointCallerSavedID = FixupStatepointCallerSaved::ID;
+
+INITIALIZE_PASS_BEGIN(FixupStatepointCallerSaved, DEBUG_TYPE,
+ "Fixup Statepoint Caller Saved", false, false)
+INITIALIZE_PASS_END(FixupStatepointCallerSaved, DEBUG_TYPE,
+ "Fixup Statepoint Caller Saved", false, false)
+
+// Utility function to get the spill size of a register.
+static unsigned getRegisterSize(const TargetRegisterInfo &TRI, Register Reg) {
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+ return TRI.getSpillSize(*RC);
+}
+
+namespace {
+// Cache of the frame indexes used while rewriting a statepoint so that they
+// can be reused when processing the next statepoint instruction.
+// Two strategies are supported. One preserves the size of each spill slot;
+// the other extends spill slots so they can be shared across register sizes,
+// reducing their number and the total frame size, at the cost of reloads
+// performing an implicit any-extend.
+class FrameIndexesCache {
+private:
+ struct FrameIndexesPerSize {
+    // Frame indexes used while processing previous statepoints.
+    SmallVector<int, 8> Slots;
+    // Index of the first frame index in Slots that has not been used yet.
+ unsigned Index = 0;
+ };
+ MachineFrameInfo &MFI;
+ const TargetRegisterInfo &TRI;
+  // Maps a size to the list of frame indexes of that size. In
+  // FixupSCSExtendSlotSize mode the key 0 is used to keep all frame indexes.
+  // If the required spill slot is larger than the cached one, the cached
+  // slot's size is increased.
+ DenseMap<unsigned, FrameIndexesPerSize> Cache;
+
+public:
+ FrameIndexesCache(MachineFrameInfo &MFI, const TargetRegisterInfo &TRI)
+ : MFI(MFI), TRI(TRI) {}
+ // Reset the current state of used frame indexes. After invocation of
+ // this function all frame indexes are available for allocation.
+ void reset() {
+ for (auto &It : Cache)
+ It.second.Index = 0;
+ }
+ // Get frame index to spill the register.
+ int getFrameIndex(Register Reg) {
+ unsigned Size = getRegisterSize(TRI, Reg);
+ // In FixupSCSExtendSlotSize mode the bucket with 0 index is used
+ // for all sizes.
+ unsigned Bucket = FixupSCSExtendSlotSize ? 0 : Size;
+ FrameIndexesPerSize &Line = Cache[Bucket];
+ if (Line.Index < Line.Slots.size()) {
+ int FI = Line.Slots[Line.Index++];
+ // If all sizes are kept together we probably need to extend the
+ // spill slot size.
+ if (MFI.getObjectSize(FI) < Size) {
+ MFI.setObjectSize(FI, Size);
+ MFI.setObjectAlignment(FI, Align(Size));
+ NumSpillSlotsExtended++;
+ }
+ return FI;
+ }
+ int FI = MFI.CreateSpillStackObject(Size, Align(Size));
+ NumSpillSlotsAllocated++;
+ Line.Slots.push_back(FI);
+ ++Line.Index;
+ return FI;
+ }
+  // Sort all registers to spill in descending size order. In
+  // FixupSCSExtendSlotSize mode this minimizes the total frame size; in the
+  // default mode this step can be skipped.
+ void sortRegisters(SmallVectorImpl<Register> &Regs) {
+ if (!FixupSCSExtendSlotSize)
+ return;
+ llvm::sort(Regs.begin(), Regs.end(), [&](Register &A, Register &B) {
+ return getRegisterSize(TRI, A) > getRegisterSize(TRI, B);
+ });
+ }
+};
+
+// Describes the state of the statepoint instruction currently being processed.
+class StatepointState {
+private:
+  // The statepoint instruction being processed.
+ MachineInstr &MI;
+ MachineFunction &MF;
+ const TargetRegisterInfo &TRI;
+ const TargetInstrInfo &TII;
+ MachineFrameInfo &MFI;
+ // Mask with callee saved registers.
+ const uint32_t *Mask;
+  // Cache of frame indexes used while processing previous statepoints.
+ FrameIndexesCache &CacheFI;
+ // Operands with physical registers requiring spilling.
+ SmallVector<unsigned, 8> OpsToSpill;
+  // Set of registers to spill.
+ SmallVector<Register, 8> RegsToSpill;
+ // Map Register to Frame Slot index.
+ DenseMap<Register, int> RegToSlotIdx;
+
+public:
+ StatepointState(MachineInstr &MI, const uint32_t *Mask,
+ FrameIndexesCache &CacheFI)
+ : MI(MI), MF(*MI.getMF()), TRI(*MF.getSubtarget().getRegisterInfo()),
+ TII(*MF.getSubtarget().getInstrInfo()), MFI(MF.getFrameInfo()),
+ Mask(Mask), CacheFI(CacheFI) {}
+ // Return true if register is callee saved.
+ bool isCalleeSaved(Register Reg) { return (Mask[Reg / 32] >> Reg % 32) & 1; }
+  // Iterates over the statepoint meta args to find caller-saved registers.
+  // Also caches the size of the registers found.
+  // Returns true if any caller-saved registers were found.
+ bool findRegistersToSpill() {
+ SmallSet<Register, 8> VisitedRegs;
+ for (unsigned Idx = StatepointOpers(&MI).getVarIdx(),
+ EndIdx = MI.getNumOperands();
+ Idx < EndIdx; ++Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+ if (!MO.isReg() || MO.isImplicit())
+ continue;
+ Register Reg = MO.getReg();
+ assert(Reg.isPhysical() && "Only physical regs are expected");
+ if (isCalleeSaved(Reg))
+ continue;
+ if (VisitedRegs.insert(Reg).second)
+ RegsToSpill.push_back(Reg);
+ OpsToSpill.push_back(Idx);
+ }
+ CacheFI.sortRegisters(RegsToSpill);
+ return !RegsToSpill.empty();
+ }
+  // Spill all caller-saved registers right before the statepoint instruction
+  // and remember the frame index where each register is spilled.
+ void spillRegisters() {
+ for (Register Reg : RegsToSpill) {
+ int FI = CacheFI.getFrameIndex(Reg);
+ const TargetRegisterClass *RC = TRI.getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(*MI.getParent(), MI, Reg, true /*is_Kill*/, FI,
+ RC, &TRI);
+ NumSpilledRegisters++;
+ RegToSlotIdx[Reg] = FI;
+ }
+ }
+  // Rewrite the statepoint machine instruction, replacing caller-saved
+  // register operands with indirect memory locations (frame indexes).
+ void rewriteStatepoint() {
+ MachineInstr *NewMI =
+ MF.CreateMachineInstr(TII.get(MI.getOpcode()), MI.getDebugLoc(), true);
+ MachineInstrBuilder MIB(MF, NewMI);
+
+ // Add End marker.
+ OpsToSpill.push_back(MI.getNumOperands());
+ unsigned CurOpIdx = 0;
+
+ for (unsigned I = 0; I < MI.getNumOperands(); ++I) {
+ MachineOperand &MO = MI.getOperand(I);
+      if (I == OpsToSpill[CurOpIdx]) {
+        assert(MO.isReg() && "Should be register");
+        assert(MO.getReg().isPhysical() && "Should be physical register");
+        int FI = RegToSlotIdx[MO.getReg()];
+        MIB.addImm(StackMaps::IndirectMemRefOp);
+        MIB.addImm(getRegisterSize(TRI, MO.getReg()));
+        MIB.addFrameIndex(FI);
+        MIB.addImm(0);
+ ++CurOpIdx;
+ } else
+ MIB.add(MO);
+ }
+ assert(CurOpIdx == (OpsToSpill.size() - 1) && "Not all operands processed");
+ // Add mem operands.
+ NewMI->setMemRefs(MF, MI.memoperands());
+ for (auto It : RegToSlotIdx) {
+ int FrameIndex = It.second;
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+ auto *MMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ getRegisterSize(TRI, It.first),
+ MFI.getObjectAlign(FrameIndex));
+ NewMI->addMemOperand(MF, MMO);
+ }
+ // Insert new statepoint and erase old one.
+ MI.getParent()->insert(MI, NewMI);
+ MI.eraseFromParent();
+ }
+};
+
+class StatepointProcessor {
+private:
+ MachineFunction &MF;
+ const TargetRegisterInfo &TRI;
+ FrameIndexesCache CacheFI;
+
+public:
+ StatepointProcessor(MachineFunction &MF)
+ : MF(MF), TRI(*MF.getSubtarget().getRegisterInfo()),
+ CacheFI(MF.getFrameInfo(), TRI) {}
+
+ bool process(MachineInstr &MI) {
+ StatepointOpers SO(&MI);
+ uint64_t Flags = SO.getFlags();
+    // Do nothing for DeoptLiveIn statepoints; they support all registers.
+ if (Flags & (uint64_t)StatepointFlags::DeoptLiveIn)
+ return false;
+ CallingConv::ID CC = SO.getCallingConv();
+ const uint32_t *Mask = TRI.getCallPreservedMask(MF, CC);
+ CacheFI.reset();
+ StatepointState SS(MI, Mask, CacheFI);
+
+ if (!SS.findRegistersToSpill())
+ return false;
+
+ SS.spillRegisters();
+ SS.rewriteStatepoint();
+ return true;
+ }
+};
+} // namespace
+
+bool FixupStatepointCallerSaved::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const Function &F = MF.getFunction();
+ if (!F.hasGC())
+ return false;
+
+ SmallVector<MachineInstr *, 16> Statepoints;
+ for (MachineBasicBlock &BB : MF)
+ for (MachineInstr &I : BB)
+ if (I.getOpcode() == TargetOpcode::STATEPOINT)
+ Statepoints.push_back(&I);
+
+ if (Statepoints.empty())
+ return false;
+
+ bool Changed = false;
+ StatepointProcessor SPP(MF);
+ for (MachineInstr *I : Statepoints)
+ Changed |= SPP.process(*I);
+ return Changed;
+}
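
A standalone sketch of the slot-reuse policy implemented by FrameIndexesCache above, with plain integers standing in for MachineFrameInfo spill objects: in the default mode each register size gets its own bucket of reusable slots, while in FixupSCSExtendSlotSize mode all sizes share bucket 0 and an existing slot is grown on demand.

#include <algorithm>
#include <map>
#include <vector>

struct SlotCache {
  bool ExtendSlotSize = false;                     // mirrors -fixup-scs-extend-slot-size
  std::map<unsigned, std::vector<unsigned>> Slots; // bucket -> slot sizes
  std::map<unsigned, unsigned> Index;              // bucket -> next unused slot

  // Make every cached slot available again (called per statepoint).
  void reset() {
    for (auto &It : Index)
      It.second = 0;
  }

  // Return the size of the slot assigned to a register of size RegSize.
  unsigned assign(unsigned RegSize) {
    unsigned Bucket = ExtendSlotSize ? 0 : RegSize;
    auto &S = Slots[Bucket];
    auto &I = Index[Bucket];
    if (I < S.size()) {
      unsigned &SlotSize = S[I++];
      SlotSize = std::max(SlotSize, RegSize); // grow a shared slot if needed
      return SlotSize;
    }
    S.push_back(RegSize);                     // allocate a new slot
    ++I;
    return RegSize;
  }
};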
diff --git a/llvm/lib/CodeGen/GCMetadata.cpp b/llvm/lib/CodeGen/GCMetadata.cpp
index 600d662e0f99..7c96d838d992 100644
--- a/llvm/lib/CodeGen/GCMetadata.cpp
+++ b/llvm/lib/CodeGen/GCMetadata.cpp
@@ -153,7 +153,7 @@ GCStrategy *GCModuleInfo::getGCStrategy(const StringRef Name) {
for (auto& Entry : GCRegistry::entries()) {
if (Name == Entry.getName()) {
std::unique_ptr<GCStrategy> S = Entry.instantiate();
- S->Name = Name;
+ S->Name = std::string(Name);
GCStrategyMap[Name] = S.get();
GCStrategyList.push_back(std::move(S));
return GCStrategyList.back().get();
diff --git a/llvm/lib/CodeGen/GCRootLowering.cpp b/llvm/lib/CodeGen/GCRootLowering.cpp
index 90e5f32f53b3..c6730aa6b00d 100644
--- a/llvm/lib/CodeGen/GCRootLowering.cpp
+++ b/llvm/lib/CodeGen/GCRootLowering.cpp
@@ -57,7 +57,6 @@ public:
/// GCMetadata record for each function.
class GCMachineCodeAnalysis : public MachineFunctionPass {
GCFunctionInfo *FI;
- MachineModuleInfo *MMI;
const TargetInstrInfo *TII;
void FindSafePoints(MachineFunction &MF);
@@ -160,10 +159,9 @@ static bool InsertRootInitializers(Function &F, ArrayRef<AllocaInst *> Roots) {
for (AllocaInst *Root : Roots)
if (!InitedRoots.count(Root)) {
- StoreInst *SI = new StoreInst(
+ new StoreInst(
ConstantPointerNull::get(cast<PointerType>(Root->getAllocatedType())),
- Root);
- SI->insertAfter(Root);
+ Root, Root->getNextNode());
MadeChange = true;
}
@@ -189,12 +187,12 @@ bool LowerIntrinsics::runOnFunction(Function &F) {
/// need to be able to ensure each root has been initialized by the point the
/// first safepoint is reached. This really should have been done by the
/// frontend, but the old API made this non-obvious, so we do a potentially
-/// redundant store just in case.
+/// redundant store just in case.
bool LowerIntrinsics::DoLowering(Function &F, GCStrategy &S) {
SmallVector<AllocaInst *, 32> Roots;
bool MadeChange = false;
- for (BasicBlock &BB : F)
+ for (BasicBlock &BB : F)
for (BasicBlock::iterator II = BB.begin(), E = BB.end(); II != E;) {
IntrinsicInst *CI = dyn_cast<IntrinsicInst>(II++);
if (!CI)
@@ -250,7 +248,6 @@ GCMachineCodeAnalysis::GCMachineCodeAnalysis() : MachineFunctionPass(ID) {}
void GCMachineCodeAnalysis::getAnalysisUsage(AnalysisUsage &AU) const {
MachineFunctionPass::getAnalysisUsage(AU);
AU.setPreservesAll();
- AU.addRequired<MachineModuleInfoWrapperPass>();
AU.addRequired<GCModuleInfo>();
}
@@ -297,7 +294,7 @@ void GCMachineCodeAnalysis::FindStackOffsets(MachineFunction &MF) {
if (MF.getFrameInfo().isDeadObjectIndex(RI->Num)) {
RI = FI->removeStackRoot(RI);
} else {
- unsigned FrameReg; // FIXME: surely GCRoot ought to store the
+ Register FrameReg; // FIXME: surely GCRoot ought to store the
// register that the offset is from?
RI->StackOffset = TFI->getFrameIndexReference(MF, RI->Num, FrameReg);
++RI;
@@ -311,7 +308,6 @@ bool GCMachineCodeAnalysis::runOnMachineFunction(MachineFunction &MF) {
return false;
FI = &getAnalysis<GCModuleInfo>().getFunctionInfo(MF.getFunction());
- MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
TII = MF.getSubtarget().getInstrInfo();
// Find the size of the stack frame. There may be no correct static frame
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
index e6abfcdb92cb..c4d8777615d2 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEInfo.cpp
@@ -52,6 +52,7 @@ bool CSEConfigFull::shouldCSEOpc(unsigned Opc) {
case TargetOpcode::G_SREM:
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_IMPLICIT_DEF:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ANYEXT:
@@ -64,7 +65,7 @@ bool CSEConfigFull::shouldCSEOpc(unsigned Opc) {
}
bool CSEConfigConstantOnly::shouldCSEOpc(unsigned Opc) {
- return Opc == TargetOpcode::G_CONSTANT;
+ return Opc == TargetOpcode::G_CONSTANT || Opc == TargetOpcode::G_IMPLICIT_DEF;
}
std::unique_ptr<CSEConfigBase>
@@ -216,9 +217,6 @@ void GISelCSEInfo::handleRecordedInsts() {
}
bool GISelCSEInfo::shouldCSE(unsigned Opc) const {
- // Only GISel opcodes are CSEable
- if (!isPreISelGenericOpcode(Opc))
- return false;
assert(CSEOpt.get() && "CSEConfig not set");
return CSEOpt->shouldCSEOpc(Opc);
}
@@ -260,6 +258,39 @@ void GISelCSEInfo::releaseMemory() {
#endif
}
+Error GISelCSEInfo::verify() {
+#ifndef NDEBUG
+ handleRecordedInsts();
+  // For each instruction in the MI -> UMI map, profile the MI and make sure
+  // the UMI found for that profile is the one recorded in the map.
+ for (auto &It : InstrMapping) {
+ FoldingSetNodeID TmpID;
+ GISelInstProfileBuilder(TmpID, *MRI).addNodeID(It.first);
+ void *InsertPos;
+ UniqueMachineInstr *FoundNode =
+ CSEMap.FindNodeOrInsertPos(TmpID, InsertPos);
+ if (FoundNode != It.second)
+ return createStringError(std::errc::not_supported,
+ "CSEMap mismatch, InstrMapping has MIs without "
+ "corresponding Nodes in CSEMap");
+ }
+
+ // For every node in the CSEMap, make sure that the InstrMapping
+ // points to it.
+ for (auto It = CSEMap.begin(), End = CSEMap.end(); It != End; ++It) {
+ const UniqueMachineInstr &UMI = *It;
+ if (!InstrMapping.count(UMI.MI))
+ return createStringError(std::errc::not_supported,
+ "Node in CSE without InstrMapping", UMI.MI);
+
+ if (InstrMapping[UMI.MI] != &UMI)
+ return createStringError(std::make_error_code(std::errc::not_supported),
+ "Mismatch in CSE mapping");
+ }
+#endif
+ return Error::success();
+}
+
void GISelCSEInfo::print() {
LLVM_DEBUG(for (auto &It
: OpcodeHitTable) {
@@ -286,7 +317,7 @@ GISelInstProfileBuilder::addNodeIDOpcode(unsigned Opc) const {
}
const GISelInstProfileBuilder &
-GISelInstProfileBuilder::addNodeIDRegType(const LLT &Ty) const {
+GISelInstProfileBuilder::addNodeIDRegType(const LLT Ty) const {
uint64_t Val = Ty.getUniqueRAWLLTData();
ID.AddInteger(Val);
return *this;
@@ -311,13 +342,13 @@ GISelInstProfileBuilder::addNodeIDImmediate(int64_t Imm) const {
}
const GISelInstProfileBuilder &
-GISelInstProfileBuilder::addNodeIDRegNum(unsigned Reg) const {
+GISelInstProfileBuilder::addNodeIDRegNum(Register Reg) const {
ID.AddInteger(Reg);
return *this;
}
const GISelInstProfileBuilder &
-GISelInstProfileBuilder::addNodeIDRegType(const unsigned Reg) const {
+GISelInstProfileBuilder::addNodeIDRegType(const Register Reg) const {
addNodeIDMachineOperand(MachineOperand::CreateReg(Reg, false));
return *this;
}
@@ -344,12 +375,14 @@ const GISelInstProfileBuilder &GISelInstProfileBuilder::addNodeIDMachineOperand(
LLT Ty = MRI.getType(Reg);
if (Ty.isValid())
addNodeIDRegType(Ty);
- auto *RB = MRI.getRegBankOrNull(Reg);
- if (RB)
- addNodeIDRegType(RB);
- auto *RC = MRI.getRegClassOrNull(Reg);
- if (RC)
- addNodeIDRegType(RC);
+
+ if (const RegClassOrRegBank &RCOrRB = MRI.getRegClassOrRegBank(Reg)) {
+ if (const auto *RB = RCOrRB.dyn_cast<const RegisterBank *>())
+ addNodeIDRegType(RB);
+ else if (const auto *RC = RCOrRB.dyn_cast<const TargetRegisterClass *>())
+ addNodeIDRegType(RC);
+ }
+
assert(!MO.isImplicit() && "Unhandled case");
} else if (MO.isImm())
ID.AddInteger(MO.getImm());
@@ -369,6 +402,7 @@ GISelCSEInfo &
GISelCSEAnalysisWrapper::get(std::unique_ptr<CSEConfigBase> CSEOpt,
bool Recompute) {
if (!AlreadyComputed || Recompute) {
+ Info.releaseMemory();
Info.setCSEConfig(std::move(CSEOpt));
Info.analyze(*MF);
AlreadyComputed = true;
diff --git a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
index 51a74793f029..88173dc4d302 100644
--- a/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CSEMIRBuilder.cpp
@@ -129,7 +129,7 @@ CSEMIRBuilder::generateCopiesIfRequired(ArrayRef<DstOp> DstOps,
if (DstOps.size() == 1) {
const DstOp &Op = DstOps[0];
if (Op.getDstOpKind() == DstOp::DstType::Ty_Reg)
- return buildCopy(Op.getReg(), MIB->getOperand(0).getReg());
+ return buildCopy(Op.getReg(), MIB.getReg(0));
}
return MIB;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
index 4c2dbdd905f3..a7146515c4c9 100644
--- a/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CallLowering.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "call-lowering"
@@ -29,48 +30,50 @@ using namespace llvm;
void CallLowering::anchor() {}
-bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, ImmutableCallSite CS,
+bool CallLowering::lowerCall(MachineIRBuilder &MIRBuilder, const CallBase &CB,
ArrayRef<Register> ResRegs,
ArrayRef<ArrayRef<Register>> ArgRegs,
Register SwiftErrorVReg,
std::function<unsigned()> GetCalleeReg) const {
CallLoweringInfo Info;
- auto &DL = CS.getParent()->getParent()->getParent()->getDataLayout();
+ const DataLayout &DL = MIRBuilder.getDataLayout();
// First step is to marshall all the function's parameters into the correct
// physregs and memory locations. Gather the sequence of argument types that
// we'll pass to the assigner function.
unsigned i = 0;
- unsigned NumFixedArgs = CS.getFunctionType()->getNumParams();
- for (auto &Arg : CS.args()) {
+ unsigned NumFixedArgs = CB.getFunctionType()->getNumParams();
+ for (auto &Arg : CB.args()) {
ArgInfo OrigArg{ArgRegs[i], Arg->getType(), ISD::ArgFlagsTy{},
i < NumFixedArgs};
- setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CS);
+ setArgFlags(OrigArg, i + AttributeList::FirstArgIndex, DL, CB);
Info.OrigArgs.push_back(OrigArg);
++i;
}
- if (const Function *F = CS.getCalledFunction())
+ // Try looking through a bitcast from one function type to another.
+ // Commonly happens with calls to objc_msgSend().
+ const Value *CalleeV = CB.getCalledOperand()->stripPointerCasts();
+ if (const Function *F = dyn_cast<Function>(CalleeV))
Info.Callee = MachineOperand::CreateGA(F, 0);
else
Info.Callee = MachineOperand::CreateReg(GetCalleeReg(), false);
- Info.OrigRet = ArgInfo{ResRegs, CS.getType(), ISD::ArgFlagsTy{}};
+ Info.OrigRet = ArgInfo{ResRegs, CB.getType(), ISD::ArgFlagsTy{}};
if (!Info.OrigRet.Ty->isVoidTy())
- setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CS);
+ setArgFlags(Info.OrigRet, AttributeList::ReturnIndex, DL, CB);
- Info.KnownCallees =
- CS.getInstruction()->getMetadata(LLVMContext::MD_callees);
- Info.CallConv = CS.getCallingConv();
+ MachineFunction &MF = MIRBuilder.getMF();
+ Info.KnownCallees = CB.getMetadata(LLVMContext::MD_callees);
+ Info.CallConv = CB.getCallingConv();
Info.SwiftErrorVReg = SwiftErrorVReg;
- Info.IsMustTailCall = CS.isMustTailCall();
- Info.IsTailCall = CS.isTailCall() &&
- isInTailCallPosition(CS, MIRBuilder.getMF().getTarget()) &&
- (MIRBuilder.getMF()
- .getFunction()
- .getFnAttribute("disable-tail-calls")
- .getValueAsString() != "true");
- Info.IsVarArg = CS.getFunctionType()->isVarArg();
+ Info.IsMustTailCall = CB.isMustTailCall();
+ Info.IsTailCall =
+ CB.isTailCall() && isInTailCallPosition(CB, MF.getTarget()) &&
+ (MF.getFunction()
+ .getFnAttribute("disable-tail-calls")
+ .getValueAsString() != "true");
+ Info.IsVarArg = CB.getFunctionType()->isVarArg();
return lowerCall(MIRBuilder, Info);
}
@@ -94,10 +97,12 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
Flags.setSwiftError();
if (Attrs.hasAttribute(OpIdx, Attribute::ByVal))
Flags.setByVal();
+ if (Attrs.hasAttribute(OpIdx, Attribute::Preallocated))
+ Flags.setPreallocated();
if (Attrs.hasAttribute(OpIdx, Attribute::InAlloca))
Flags.setInAlloca();
- if (Flags.isByVal() || Flags.isInAlloca()) {
+ if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
Type *ElementTy = cast<PointerType>(Arg.Ty)->getElementType();
auto Ty = Attrs.getAttribute(OpIdx, Attribute::ByVal).getValueAsType();
@@ -105,16 +110,16 @@ void CallLowering::setArgFlags(CallLowering::ArgInfo &Arg, unsigned OpIdx,
// For ByVal, alignment should be passed from FE. BE will guess if
// this info is not there but there are cases it cannot get right.
- unsigned FrameAlign;
- if (FuncInfo.getParamAlignment(OpIdx - 2))
- FrameAlign = FuncInfo.getParamAlignment(OpIdx - 2);
+ Align FrameAlign;
+ if (auto ParamAlign = FuncInfo.getParamAlign(OpIdx - 2))
+ FrameAlign = *ParamAlign;
else
- FrameAlign = getTLI()->getByValTypeAlignment(ElementTy, DL);
- Flags.setByValAlign(Align(FrameAlign));
+ FrameAlign = Align(getTLI()->getByValTypeAlignment(ElementTy, DL));
+ Flags.setByValAlign(FrameAlign);
}
if (Attrs.hasAttribute(OpIdx, Attribute::Nest))
Flags.setNest();
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(Arg.Ty)));
+ Flags.setOrigAlign(DL.getABITypeAlign(Arg.Ty));
}
template void
@@ -123,9 +128,9 @@ CallLowering::setArgFlags<Function>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const Function &FuncInfo) const;
template void
-CallLowering::setArgFlags<CallInst>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
+CallLowering::setArgFlags<CallBase>(CallLowering::ArgInfo &Arg, unsigned OpIdx,
const DataLayout &DL,
- const CallInst &FuncInfo) const;
+ const CallBase &FuncInfo) const;
Register CallLowering::packRegs(ArrayRef<Register> SrcRegs, Type *PackedTy,
MachineIRBuilder &MIRBuilder) const {
@@ -157,7 +162,7 @@ void CallLowering::unpackRegs(ArrayRef<Register> DstRegs, Register SrcReg,
MachineIRBuilder &MIRBuilder) const {
assert(DstRegs.size() > 1 && "Nothing to unpack");
- const DataLayout &DL = MIRBuilder.getMF().getDataLayout();
+ const DataLayout &DL = MIRBuilder.getDataLayout();
SmallVector<LLT, 8> LLTs;
SmallVector<uint64_t, 8> Offsets;
@@ -189,11 +194,11 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
unsigned NumArgs = Args.size();
for (unsigned i = 0; i != NumArgs; ++i) {
- MVT CurVT = MVT::getVT(Args[i].Ty);
- if (Handler.assignArg(i, CurVT, CurVT, CCValAssign::Full, Args[i],
- Args[i].Flags[0], CCInfo)) {
- if (!CurVT.isValid())
- return false;
+ EVT CurVT = EVT::getEVT(Args[i].Ty);
+ if (!CurVT.isSimple() ||
+ Handler.assignArg(i, CurVT.getSimpleVT(), CurVT.getSimpleVT(),
+ CCValAssign::Full, Args[i], Args[i].Flags[0],
+ CCInfo)) {
MVT NewVT = TLI->getRegisterTypeForCallingConv(
F.getContext(), F.getCallingConv(), EVT(CurVT));
@@ -239,7 +244,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
if (Part == 0) {
Flags.setSplit();
} else {
- Flags.setOrigAlign(Align::None());
+ Flags.setOrigAlign(Align(1));
if (Part == NumParts - 1)
Flags.setSplitEnd();
}
@@ -272,7 +277,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
if (PartIdx == 0) {
Flags.setSplit();
} else {
- Flags.setOrigAlign(Align::None());
+ Flags.setOrigAlign(Align(1));
if (PartIdx == NumParts - 1)
Flags.setSplitEnd();
}
@@ -293,15 +298,21 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
assert(VA.getValNo() == i && "Location doesn't correspond to current arg");
if (VA.needsCustom()) {
- j += Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+ unsigned NumArgRegs =
+ Handler.assignCustomValue(Args[i], makeArrayRef(ArgLocs).slice(j));
+ if (!NumArgRegs)
+ return false;
+ j += NumArgRegs;
continue;
}
// FIXME: Pack registers if we have more than one.
Register ArgReg = Args[i].Regs[0];
- MVT OrigVT = MVT::getVT(Args[i].Ty);
- MVT VAVT = VA.getValVT();
+ EVT OrigVT = EVT::getEVT(Args[i].Ty);
+ EVT VAVT = VA.getValVT();
+ const LLT OrigTy = getLLTForType(*Args[i].Ty, DL);
+
if (VA.isRegLoc()) {
if (Handler.isIncomingArgumentHandler() && VAVT != OrigVT) {
if (VAVT.getSizeInBits() < OrigVT.getSizeInBits()) {
@@ -323,7 +334,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
MIRBuilder.buildMerge(Args[i].OrigRegs[0], Args[i].Regs);
continue;
}
- const LLT VATy(VAVT);
+ const LLT VATy(VAVT.getSimpleVT());
Register NewReg =
MIRBuilder.getMRI()->createGenericVirtualRegister(VATy);
Handler.assignValueToReg(NewReg, VA.getLocReg(), VA);
@@ -331,7 +342,6 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
// or do an unmerge to get the lower block of elements.
if (VATy.isVector() &&
VATy.getNumElements() > OrigVT.getVectorNumElements()) {
- const LLT OrigTy(OrigVT);
// Just handle the case where the VA type is 2 * original type.
if (VATy.getNumElements() != OrigVT.getVectorNumElements() * 2) {
LLVM_DEBUG(dbgs()
@@ -371,7 +381,7 @@ bool CallLowering::handleAssignments(CCState &CCInfo,
unsigned Offset = VA.getLocMemOffset();
MachinePointerInfo MPO;
Register StackAddr = Handler.getStackAddress(Size, Offset, MPO);
- Handler.assignValueToAddress(ArgReg, StackAddr, Size, MPO, VA);
+ Handler.assignValueToAddress(Args[i], StackAddr, Size, MPO, VA);
} else {
// FIXME: Support byvals and other weirdness
return false;
@@ -456,10 +466,19 @@ bool CallLowering::resultsCompatible(CallLoweringInfo &Info,
}
Register CallLowering::ValueHandler::extendRegister(Register ValReg,
- CCValAssign &VA) {
+ CCValAssign &VA,
+ unsigned MaxSizeBits) {
LLT LocTy{VA.getLocVT()};
- if (LocTy.getSizeInBits() == MRI.getType(ValReg).getSizeInBits())
+ LLT ValTy = MRI.getType(ValReg);
+ if (LocTy.getSizeInBits() == ValTy.getSizeInBits())
return ValReg;
+
+ if (LocTy.isScalar() && MaxSizeBits && MaxSizeBits < LocTy.getSizeInBits()) {
+ if (MaxSizeBits <= ValTy.getSizeInBits())
+ return ValReg;
+ LocTy = LLT::scalar(MaxSizeBits);
+ }
+
switch (VA.getLocInfo()) {
default: break;
case CCValAssign::Full:
@@ -469,7 +488,7 @@ Register CallLowering::ValueHandler::extendRegister(Register ValReg,
return ValReg;
case CCValAssign::AExt: {
auto MIB = MIRBuilder.buildAnyExt(LocTy, ValReg);
- return MIB->getOperand(0).getReg();
+ return MIB.getReg(0);
}
case CCValAssign::SExt: {
Register NewReg = MRI.createGenericVirtualRegister(LocTy);
diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
index a103e8e4e6e0..194961ae3b21 100644
--- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp
@@ -9,6 +9,8 @@
#include "llvm/CodeGen/GlobalISel/Combiner.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineDominators.h"
@@ -17,11 +19,13 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "gi-combiner"
using namespace llvm;
+using namespace MIPatternMatch;
// Option to allow testing of the combiner while no targets know about indexed
// addressing.
@@ -33,9 +37,10 @@ static cl::opt<bool>
CombinerHelper::CombinerHelper(GISelChangeObserver &Observer,
MachineIRBuilder &B, GISelKnownBits *KB,
- MachineDominatorTree *MDT)
+ MachineDominatorTree *MDT,
+ const LegalizerInfo *LI)
: Builder(B), MRI(Builder.getMF().getRegInfo()), Observer(Observer),
- KB(KB), MDT(MDT) {
+ KB(KB), MDT(MDT), LI(LI) {
(void)this->KB;
}
@@ -74,36 +79,7 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) {
return false;
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
-
- // Give up if either DstReg or SrcReg is a physical register.
- if (Register::isPhysicalRegister(DstReg) ||
- Register::isPhysicalRegister(SrcReg))
- return false;
-
- // Give up the types don't match.
- LLT DstTy = MRI.getType(DstReg);
- LLT SrcTy = MRI.getType(SrcReg);
- // Give up if one has a valid LLT, but the other doesn't.
- if (DstTy.isValid() != SrcTy.isValid())
- return false;
- // Give up if the types don't match.
- if (DstTy.isValid() && SrcTy.isValid() && DstTy != SrcTy)
- return false;
-
- // Get the register banks and classes.
- const RegisterBank *DstBank = MRI.getRegBankOrNull(DstReg);
- const RegisterBank *SrcBank = MRI.getRegBankOrNull(SrcReg);
- const TargetRegisterClass *DstRC = MRI.getRegClassOrNull(DstReg);
- const TargetRegisterClass *SrcRC = MRI.getRegClassOrNull(SrcReg);
-
- // Replace if the register constraints match.
- if ((SrcRC == DstRC) && (SrcBank == DstBank))
- return true;
- // Replace if DstReg has no constraints.
- if (!DstBank && !DstRC)
- return true;
-
- return false;
+ return canReplaceReg(DstReg, SrcReg, MRI);
}
void CombinerHelper::applyCombineCopy(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
@@ -294,7 +270,7 @@ namespace {
/// Select a preference between two uses. CurrentUse is the current preference
/// while *ForCandidate is attributes of the candidate under consideration.
PreferredTuple ChoosePreferredUse(PreferredTuple &CurrentUse,
- const LLT &TyForCandidate,
+ const LLT TyForCandidate,
unsigned OpcodeForCandidate,
MachineInstr *MIForCandidate) {
if (!CurrentUse.Ty.isValid()) {
@@ -428,10 +404,23 @@ bool CombinerHelper::matchCombineExtendingLoads(MachineInstr &MI,
? TargetOpcode::G_SEXT
: TargetOpcode::G_ZEXT;
Preferred = {LLT(), PreferredOpcode, nullptr};
- for (auto &UseMI : MRI.use_instructions(LoadValue.getReg())) {
+ for (auto &UseMI : MRI.use_nodbg_instructions(LoadValue.getReg())) {
if (UseMI.getOpcode() == TargetOpcode::G_SEXT ||
UseMI.getOpcode() == TargetOpcode::G_ZEXT ||
- UseMI.getOpcode() == TargetOpcode::G_ANYEXT) {
+ (UseMI.getOpcode() == TargetOpcode::G_ANYEXT)) {
+ // Check for legality.
+ if (LI) {
+ LegalityQuery::MemDesc MMDesc;
+ const auto &MMO = **MI.memoperands_begin();
+ MMDesc.SizeInBits = MMO.getSizeInBits();
+ MMDesc.AlignInBits = MMO.getAlign().value() * 8;
+ MMDesc.Ordering = MMO.getOrdering();
+ LLT UseTy = MRI.getType(UseMI.getOperand(0).getReg());
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ if (LI->getAction({MI.getOpcode(), {UseTy, SrcTy}, {MMDesc}}).Action !=
+ LegalizeActions::Legal)
+ continue;
+ }
Preferred = ChoosePreferredUse(Preferred,
MRI.getType(UseMI.getOperand(0).getReg()),
UseMI.getOpcode(), &UseMI);
@@ -498,7 +487,7 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
UseMI->getOpcode() == TargetOpcode::G_ANYEXT) {
Register UseDstReg = UseMI->getOperand(0).getReg();
MachineOperand &UseSrcMO = UseMI->getOperand(1);
- const LLT &UseDstTy = MRI.getType(UseDstReg);
+ const LLT UseDstTy = MRI.getType(UseDstReg);
if (UseDstReg != ChosenDstReg) {
if (Preferred.Ty == UseDstTy) {
// If the use has the same type as the preferred use, then merge
@@ -559,7 +548,10 @@ void CombinerHelper::applyCombineExtendingLoads(MachineInstr &MI,
Observer.changedInstr(MI);
}
-bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
+bool CombinerHelper::isPredecessor(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) {
+ assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&
+ "shouldn't consider debug uses");
assert(DefMI.getParent() == UseMI.getParent());
if (&DefMI == &UseMI)
return false;
@@ -572,7 +564,10 @@ bool CombinerHelper::isPredecessor(MachineInstr &DefMI, MachineInstr &UseMI) {
llvm_unreachable("Block must contain instructions");
}
-bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
+bool CombinerHelper::dominates(const MachineInstr &DefMI,
+ const MachineInstr &UseMI) {
+ assert(!DefMI.isDebugInstr() && !UseMI.isDebugInstr() &&
+ "shouldn't consider debug uses");
if (MDT)
return MDT->dominates(&DefMI, &UseMI);
else if (DefMI.getParent() != UseMI.getParent())
@@ -581,6 +576,24 @@ bool CombinerHelper::dominates(MachineInstr &DefMI, MachineInstr &UseMI) {
return isPredecessor(DefMI, UseMI);
}
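+// Match a G_SEXT_INREG whose source already carries at least (scalar width
+// minus the G_SEXT_INREG immediate) known sign bits; the apply hook below
+// then replaces the instruction with a plain COPY.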
+bool CombinerHelper::matchSextAlreadyExtended(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+ Register SrcReg = MI.getOperand(1).getReg();
+ unsigned SrcSignBits = KB->computeNumSignBits(SrcReg);
+ unsigned NumSextBits =
+ MRI.getType(MI.getOperand(0).getReg()).getScalarSizeInBits() -
+ MI.getOperand(2).getImm();
+ return SrcSignBits >= NumSextBits;
+}
+
+bool CombinerHelper::applySextAlreadyExtended(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SEXT_INREG);
+ MachineIRBuilder MIB(MI);
+ MIB.buildCopy(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
+ MI.eraseFromParent();
+ return true;
+}
+
bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
Register &Base, Register &Offset) {
auto &MF = *MI.getParent()->getParent();
@@ -599,7 +612,7 @@ bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
LLVM_DEBUG(dbgs() << "Searching for post-indexing opportunity for: " << MI);
- for (auto &Use : MRI.use_instructions(Base)) {
+ for (auto &Use : MRI.use_nodbg_instructions(Base)) {
if (Use.getOpcode() != TargetOpcode::G_PTR_ADD)
continue;
@@ -626,7 +639,8 @@ bool CombinerHelper::findPostIndexCandidate(MachineInstr &MI, Register &Addr,
// forming an indexed one.
bool MemOpDominatesAddrUses = true;
- for (auto &PtrAddUse : MRI.use_instructions(Use.getOperand(0).getReg())) {
+ for (auto &PtrAddUse :
+ MRI.use_nodbg_instructions(Use.getOperand(0).getReg())) {
if (!dominates(MI, PtrAddUse)) {
MemOpDominatesAddrUses = false;
break;
@@ -661,7 +675,7 @@ bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
Addr = MI.getOperand(1).getReg();
MachineInstr *AddrDef = getOpcodeDef(TargetOpcode::G_PTR_ADD, Addr, MRI);
- if (!AddrDef || MRI.hasOneUse(Addr))
+ if (!AddrDef || MRI.hasOneNonDBGUse(Addr))
return false;
Base = AddrDef->getOperand(1).getReg();
@@ -699,7 +713,7 @@ bool CombinerHelper::findPreIndexCandidate(MachineInstr &MI, Register &Addr,
// FIXME: check whether all uses of the base pointer are constant PtrAdds.
// That might allow us to end base's liveness here by adjusting the constant.
- for (auto &UseMI : MRI.use_instructions(Addr)) {
+ for (auto &UseMI : MRI.use_nodbg_instructions(Addr)) {
if (!dominates(MI, UseMI)) {
LLVM_DEBUG(dbgs() << " Skipping, does not dominate all addr uses.");
return false;
@@ -811,7 +825,7 @@ bool CombinerHelper::matchElideBrByInvertingCond(MachineInstr &MI) {
MachineInstr *CmpMI = MRI.getVRegDef(BrCond->getOperand(0).getReg());
if (!CmpMI || CmpMI->getOpcode() != TargetOpcode::G_ICMP ||
- !MRI.hasOneUse(CmpMI->getOperand(0).getReg()))
+ !MRI.hasOneNonDBGUse(CmpMI->getOperand(0).getReg()))
return false;
return true;
}
@@ -854,38 +868,32 @@ static bool shouldLowerMemFuncForSize(const MachineFunction &MF) {
// Returns a list of types to use for memory op lowering in MemOps. A partial
// port of findOptimalMemOpLowering in TargetLowering.
-static bool findGISelOptimalMemOpLowering(
- std::vector<LLT> &MemOps, unsigned Limit, uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- bool AllowOverlap, unsigned DstAS, unsigned SrcAS,
- const AttributeList &FuncAttributes, const TargetLowering &TLI) {
- // If 'SrcAlign' is zero, that means the memory operation does not need to
- // load the value, i.e. memset or memcpy from constant string. Otherwise,
- // it's the inferred alignment of the source. 'DstAlign', on the other hand,
- // is the specified alignment of the memory operation. If it is zero, that
- // means it's possible to change the alignment of the destination.
- // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
- // not need to be loaded.
- if (SrcAlign != 0 && SrcAlign < DstAlign)
+static bool findGISelOptimalMemOpLowering(std::vector<LLT> &MemOps,
+ unsigned Limit, const MemOp &Op,
+ unsigned DstAS, unsigned SrcAS,
+ const AttributeList &FuncAttributes,
+ const TargetLowering &TLI) {
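+ // For example, a 16-byte copy whose operands are at least 8-byte aligned is
+ // typically broken into two s64 operations; the exact types depend on the
+ // target's getOptimalMemOpLLT and its misaligned-access rules.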
+ if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
return false;
- LLT Ty = TLI.getOptimalMemOpLLT(Size, DstAlign, SrcAlign, IsMemset,
- ZeroMemset, MemcpyStrSrc, FuncAttributes);
+ LLT Ty = TLI.getOptimalMemOpLLT(Op, FuncAttributes);
if (Ty == LLT()) {
// Use the largest scalar type whose alignment constraints are satisfied.
// We only need to check DstAlign here as SrcAlign is always greater or
// equal to DstAlign (or zero).
Ty = LLT::scalar(64);
- while (DstAlign && DstAlign < Ty.getSizeInBytes() &&
- !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, DstAlign))
- Ty = LLT::scalar(Ty.getSizeInBytes());
+ if (Op.isFixedDstAlign())
+ while (Op.getDstAlign() < Ty.getSizeInBytes() &&
+ !TLI.allowsMisalignedMemoryAccesses(Ty, DstAS, Op.getDstAlign()))
+ Ty = LLT::scalar(Ty.getSizeInBytes());
assert(Ty.getSizeInBits() > 0 && "Could not find valid type");
// FIXME: check for the largest legal type we can load/store to.
}
unsigned NumMemOps = 0;
- while (Size != 0) {
+ uint64_t Size = Op.size();
+ while (Size) {
unsigned TySize = Ty.getSizeInBytes();
while (TySize > Size) {
// For now, only use non-vector load / store's for the left-over pieces.
@@ -903,9 +911,10 @@ static bool findGISelOptimalMemOpLowering(
bool Fast;
// Need to get a VT equivalent for allowMisalignedMemoryAccesses().
MVT VT = getMVTForLLT(Ty);
- if (NumMemOps && AllowOverlap && NewTySize < Size &&
+ if (NumMemOps && Op.allowOverlap() && NewTySize < Size &&
TLI.allowsMisalignedMemoryAccesses(
- VT, DstAS, DstAlign, MachineMemOperand::MONone, &Fast) &&
+ VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign().value() : 0,
+ MachineMemOperand::MONone, &Fast) &&
Fast)
TySize = Size;
else {
@@ -926,8 +935,8 @@ static bool findGISelOptimalMemOpLowering(
static Type *getTypeForLLT(LLT Ty, LLVMContext &C) {
if (Ty.isVector())
- return VectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
- Ty.getNumElements());
+ return FixedVectorType::get(IntegerType::get(C, Ty.getScalarSizeInBits()),
+ Ty.getNumElements());
return IntegerType::get(C, Ty.getSizeInBits());
}
@@ -942,12 +951,14 @@ static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
APInt SplatVal = APInt::getSplat(NumBits, Scalar);
return MIB.buildConstant(Ty, SplatVal).getReg(0);
}
- // FIXME: for vector types create a G_BUILD_VECTOR.
- if (Ty.isVector())
- return Register();
// Extend the byte value to the larger type, and then multiply by a magic
// value 0x010101... in order to replicate it across every byte.
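+ // For example, for a 32-bit memset value of 0xAB, the zero-extended byte
+ // 0x000000AB multiplied by 0x01010101 yields 0xABABABAB.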
+ // Unless it's zero, in which case just emit a larger G_CONSTANT 0.
+ if (ValVRegAndVal && ValVRegAndVal->Value == 0) {
+ return MIB.buildConstant(Ty, 0).getReg(0);
+ }
+
LLT ExtType = Ty.getScalarType();
auto ZExt = MIB.buildZExtOrTrunc(ExtType, Val);
if (NumBits > 8) {
@@ -956,13 +967,16 @@ static Register getMemsetValue(Register Val, LLT Ty, MachineIRBuilder &MIB) {
Val = MIB.buildMul(ExtType, ZExt, MagicMI).getReg(0);
}
- assert(ExtType == Ty && "Vector memset value type not supported yet");
+ // For vector types create a G_BUILD_VECTOR.
+ if (Ty.isVector())
+ Val = MIB.buildSplatVector(Ty, Val).getReg(0);
+
return Val;
}
-bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val,
- unsigned KnownLen, unsigned Align,
- bool IsVolatile) {
+bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst,
+ Register Val, unsigned KnownLen,
+ Align Alignment, bool IsVolatile) {
auto &MF = *MI.getParent()->getParent();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
auto &DL = MF.getDataLayout();
@@ -987,24 +1001,25 @@ bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val
auto ValVRegAndVal = getConstantVRegValWithLookThrough(Val, MRI);
bool IsZeroVal = ValVRegAndVal && ValVRegAndVal->Value == 0;
- if (!findGISelOptimalMemOpLowering(
- MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Align), 0,
- /*IsMemset=*/true,
- /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
- /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(), ~0u,
- MF.getFunction().getAttributes(), TLI))
+ if (!findGISelOptimalMemOpLowering(MemOps, Limit,
+ MemOp::Set(KnownLen, DstAlignCanChange,
+ Alignment,
+ /*IsZeroMemset=*/IsZeroVal,
+ /*IsVolatile=*/IsVolatile),
+ DstPtrInfo.getAddrSpace(), ~0u,
+ MF.getFunction().getAttributes(), TLI))
return false;
if (DstAlignCanChange) {
// Get an estimate of the type from the LLT.
Type *IRTy = getTypeForLLT(MemOps[0], C);
- unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
- if (NewAlign > Align) {
- Align = NewAlign;
+ Align NewAlign = DL.getABITypeAlign(IRTy);
+ if (NewAlign > Alignment) {
+ Alignment = NewAlign;
unsigned FI = FIDef->getOperand(1).getIndex();
// Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlignment(FI) < Align)
- MFI.setObjectAlignment(FI, Align);
+ if (MFI.getObjectAlign(FI) < Alignment)
+ MFI.setObjectAlignment(FI, Alignment);
}
}
@@ -1072,10 +1087,9 @@ bool CombinerHelper::optimizeMemset(MachineInstr &MI, Register Dst, Register Val
return true;
}
-
bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
Register Src, unsigned KnownLen,
- unsigned DstAlign, unsigned SrcAlign,
+ Align DstAlign, Align SrcAlign,
bool IsVolatile) {
auto &MF = *MI.getParent()->getParent();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
@@ -1087,7 +1101,7 @@ bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
bool DstAlignCanChange = false;
MachineFrameInfo &MFI = MF.getFrameInfo();
bool OptSize = shouldLowerMemFuncForSize(MF);
- unsigned Alignment = MinAlign(DstAlign, SrcAlign);
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
@@ -1106,32 +1120,30 @@ bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
MachinePointerInfo SrcPtrInfo = SrcMMO.getPointerInfo();
if (!findGISelOptimalMemOpLowering(
- MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
- SrcAlign,
- /*IsMemset=*/false,
- /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
- /*AllowOverlap=*/!IsVolatile, DstPtrInfo.getAddrSpace(),
- SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ IsVolatile),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
return false;
if (DstAlignCanChange) {
// Get an estimate of the type from the LLT.
Type *IRTy = getTypeForLLT(MemOps[0], C);
- unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
- while (NewAlign > Alignment &&
- DL.exceedsNaturalStackAlignment(Align(NewAlign)))
- NewAlign /= 2;
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
if (NewAlign > Alignment) {
Alignment = NewAlign;
unsigned FI = FIDef->getOperand(1).getIndex();
// Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlignment(FI) < Alignment)
+ if (MFI.getObjectAlign(FI) < Alignment)
MFI.setObjectAlignment(FI, Alignment);
}
}
@@ -1156,7 +1168,7 @@ bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
// Construct MMOs for the accesses.
auto *LoadMMO =
MF.getMachineMemOperand(&SrcMMO, CurrOffset, CopyTy.getSizeInBytes());
- auto *StoreMMO =
+ auto *StoreMMO =
MF.getMachineMemOperand(&DstMMO, CurrOffset, CopyTy.getSizeInBytes());
// Create the load.
@@ -1182,9 +1194,9 @@ bool CombinerHelper::optimizeMemcpy(MachineInstr &MI, Register Dst,
}
bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
- Register Src, unsigned KnownLen,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsVolatile) {
+ Register Src, unsigned KnownLen,
+ Align DstAlign, Align SrcAlign,
+ bool IsVolatile) {
auto &MF = *MI.getParent()->getParent();
const auto &TLI = *MF.getSubtarget().getTargetLowering();
auto &DL = MF.getDataLayout();
@@ -1195,7 +1207,7 @@ bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
bool DstAlignCanChange = false;
MachineFrameInfo &MFI = MF.getFrameInfo();
bool OptSize = shouldLowerMemFuncForSize(MF);
- unsigned Alignment = MinAlign(DstAlign, SrcAlign);
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
MachineInstr *FIDef = getOpcodeDef(TargetOpcode::G_FRAME_INDEX, Dst, MRI);
if (FIDef && !MFI.isFixedObjectIndex(FIDef->getOperand(1).getIndex()))
@@ -1213,32 +1225,30 @@ bool CombinerHelper::optimizeMemmove(MachineInstr &MI, Register Dst,
// to a bug in its findOptimalMemOpLowering implementation. For now do the
// same thing here.
if (!findGISelOptimalMemOpLowering(
- MemOps, Limit, KnownLen, (DstAlignCanChange ? 0 : Alignment),
- SrcAlign,
- /*IsMemset=*/false,
- /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
- /*AllowOverlap=*/false, DstPtrInfo.getAddrSpace(),
- SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes(), TLI))
+ MemOps, Limit,
+ MemOp::Copy(KnownLen, DstAlignCanChange, Alignment, SrcAlign,
+ /*IsVolatile*/ true),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MF.getFunction().getAttributes(), TLI))
return false;
if (DstAlignCanChange) {
// Get an estimate of the type from the LLT.
Type *IRTy = getTypeForLLT(MemOps[0], C);
- unsigned NewAlign = (unsigned)DL.getABITypeAlignment(IRTy);
+ Align NewAlign = DL.getABITypeAlign(IRTy);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
- while (NewAlign > Alignment &&
- DL.exceedsNaturalStackAlignment(Align(NewAlign)))
- NewAlign /= 2;
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
if (NewAlign > Alignment) {
Alignment = NewAlign;
unsigned FI = FIDef->getOperand(1).getIndex();
// Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlignment(FI) < Alignment)
+ if (MFI.getObjectAlign(FI) < Alignment)
MFI.setObjectAlignment(FI, Alignment);
}
}
@@ -1304,8 +1314,8 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
if (IsVolatile)
return false;
- unsigned DstAlign = MemOp->getBaseAlignment();
- unsigned SrcAlign = 0;
+ Align DstAlign = MemOp->getBaseAlign();
+ Align SrcAlign;
Register Dst = MI.getOperand(1).getReg();
Register Src = MI.getOperand(2).getReg();
Register Len = MI.getOperand(3).getReg();
@@ -1313,7 +1323,7 @@ bool CombinerHelper::tryCombineMemCpyFamily(MachineInstr &MI, unsigned MaxLen) {
if (ID != Intrinsic::memset) {
assert(MMOIt != MI.memoperands_end() && "Expected a second MMO on MI");
MemOp = *(++MMOIt);
- SrcAlign = MemOp->getBaseAlignment();
+ SrcAlign = MemOp->getBaseAlign();
}
// See if this is a constant length copy
@@ -1385,6 +1395,338 @@ bool CombinerHelper::applyPtrAddImmedChain(MachineInstr &MI,
return true;
}
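+// Fold a multiplication by a power of two into a left shift. For example,
+// with %c:_(s32) = G_CONSTANT i32 8, "%d:_(s32) = G_MUL %x, %c" becomes
+// "%d:_(s32) = G_SHL %x, %k" where %k is a fresh G_CONSTANT holding 3.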
+bool CombinerHelper::matchCombineMulToShl(MachineInstr &MI,
+ unsigned &ShiftVal) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
+ auto MaybeImmVal =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeImmVal || !isPowerOf2_64(MaybeImmVal->Value))
+ return false;
+ ShiftVal = Log2_64(MaybeImmVal->Value);
+ return true;
+}
+
+bool CombinerHelper::applyCombineMulToShl(MachineInstr &MI,
+ unsigned &ShiftVal) {
+ assert(MI.getOpcode() == TargetOpcode::G_MUL && "Expected a G_MUL");
+ MachineIRBuilder MIB(MI);
+ LLT ShiftTy = MRI.getType(MI.getOperand(0).getReg());
+ auto ShiftCst = MIB.buildConstant(ShiftTy, ShiftVal);
+ Observer.changingInstr(MI);
+ MI.setDesc(MIB.getTII().get(TargetOpcode::G_SHL));
+ MI.getOperand(2).setReg(ShiftCst.getReg(0));
+ Observer.changedInstr(MI);
+ return true;
+}
+
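+// Match shifts of a wide scalar by a constant amount of at least half the
+// type width, so the shift can be rewritten on the narrow halves produced by
+// G_UNMERGE_VALUES. For example, with TargetShiftSize = 32, a 64-bit shift by
+// 40 qualifies while a 64-bit shift by 16 does not.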
+bool CombinerHelper::matchCombineShiftToUnmerge(MachineInstr &MI,
+ unsigned TargetShiftSize,
+ unsigned &ShiftVal) {
+ assert((MI.getOpcode() == TargetOpcode::G_SHL ||
+ MI.getOpcode() == TargetOpcode::G_LSHR ||
+ MI.getOpcode() == TargetOpcode::G_ASHR) && "Expected a shift");
+
+ LLT Ty = MRI.getType(MI.getOperand(0).getReg());
+ if (Ty.isVector()) // TODO:
+ return false;
+
+ // Don't narrow further than the requested size.
+ unsigned Size = Ty.getSizeInBits();
+ if (Size <= TargetShiftSize)
+ return false;
+
+ auto MaybeImmVal =
+ getConstantVRegValWithLookThrough(MI.getOperand(2).getReg(), MRI);
+ if (!MaybeImmVal)
+ return false;
+
+ ShiftVal = MaybeImmVal->Value;
+ return ShiftVal >= Size / 2 && ShiftVal < Size;
+}
+
+bool CombinerHelper::applyCombineShiftToUnmerge(MachineInstr &MI,
+ const unsigned &ShiftVal) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(SrcReg);
+ unsigned Size = Ty.getSizeInBits();
+ unsigned HalfSize = Size / 2;
+ assert(ShiftVal >= HalfSize);
+
+ LLT HalfTy = LLT::scalar(HalfSize);
+
+ Builder.setInstr(MI);
+ auto Unmerge = Builder.buildUnmerge(HalfTy, SrcReg);
+ unsigned NarrowShiftAmt = ShiftVal - HalfSize;
+
+ if (MI.getOpcode() == TargetOpcode::G_LSHR) {
+ Register Narrowed = Unmerge.getReg(1);
+
+ // dst = G_LSHR s64:x, C for C >= 32
+ // =>
+ // lo, hi = G_UNMERGE_VALUES x
+ // dst = G_MERGE_VALUES (G_LSHR hi, C - 32), 0
+
+ if (NarrowShiftAmt != 0) {
+ Narrowed = Builder.buildLShr(HalfTy, Narrowed,
+ Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
+ }
+
+ auto Zero = Builder.buildConstant(HalfTy, 0);
+ Builder.buildMerge(DstReg, { Narrowed, Zero });
+ } else if (MI.getOpcode() == TargetOpcode::G_SHL) {
+ Register Narrowed = Unmerge.getReg(0);
+ // dst = G_SHL s64:x, C for C >= 32
+ // =>
+ // lo, hi = G_UNMERGE_VALUES x
+ // dst = G_MERGE_VALUES 0, (G_SHL hi, C - 32)
+ if (NarrowShiftAmt != 0) {
+ Narrowed = Builder.buildShl(HalfTy, Narrowed,
+ Builder.buildConstant(HalfTy, NarrowShiftAmt)).getReg(0);
+ }
+
+ auto Zero = Builder.buildConstant(HalfTy, 0);
+ Builder.buildMerge(DstReg, { Zero, Narrowed });
+ } else {
+ assert(MI.getOpcode() == TargetOpcode::G_ASHR);
+ auto Hi = Builder.buildAShr(
+ HalfTy, Unmerge.getReg(1),
+ Builder.buildConstant(HalfTy, HalfSize - 1));
+
+ if (ShiftVal == HalfSize) {
+ // (G_ASHR i64:x, 32) ->
+ // G_MERGE_VALUES hi_32(x), (G_ASHR hi_32(x), 31)
+ Builder.buildMerge(DstReg, { Unmerge.getReg(1), Hi });
+ } else if (ShiftVal == Size - 1) {
+ // Don't need a second shift.
+ // (G_ASHR i64:x, 63) ->
+ // %narrowed = (G_ASHR hi_32(x), 31)
+ // G_MERGE_VALUES %narrowed, %narrowed
+ Builder.buildMerge(DstReg, { Hi, Hi });
+ } else {
+ auto Lo = Builder.buildAShr(
+ HalfTy, Unmerge.getReg(1),
+ Builder.buildConstant(HalfTy, ShiftVal - HalfSize));
+
+ // (G_ASHR i64:x, C) ->, for C >= 32
+ // G_MERGE_VALUES (G_ASHR hi_32(x), C - 32), (G_ASHR hi_32(x), 31)
+ Builder.buildMerge(DstReg, { Lo, Hi });
+ }
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::tryCombineShiftToUnmerge(MachineInstr &MI,
+ unsigned TargetShiftAmount) {
+ unsigned ShiftAmt;
+ if (matchCombineShiftToUnmerge(MI, TargetShiftAmount, ShiftAmt)) {
+ applyCombineShiftToUnmerge(MI, ShiftAmt);
+ return true;
+ }
+
+ return false;
+}
+
+bool CombinerHelper::matchAnyExplicitUseIsUndef(MachineInstr &MI) {
+ return any_of(MI.explicit_uses(), [this](const MachineOperand &MO) {
+ return MO.isReg() &&
+ getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
+ });
+}
+
+bool CombinerHelper::matchAllExplicitUsesAreUndef(MachineInstr &MI) {
+ return all_of(MI.explicit_uses(), [this](const MachineOperand &MO) {
+ return !MO.isReg() ||
+ getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MO.getReg(), MRI);
+ });
+}
+
+bool CombinerHelper::matchUndefShuffleVectorMask(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ return all_of(Mask, [](int Elt) { return Elt < 0; });
+}
+
+bool CombinerHelper::matchUndefStore(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_STORE);
+ return getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, MI.getOperand(0).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::eraseInst(MachineInstr &MI) {
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::matchEqualDefs(const MachineOperand &MOP1,
+ const MachineOperand &MOP2) {
+ if (!MOP1.isReg() || !MOP2.isReg())
+ return false;
+ MachineInstr *I1 = getDefIgnoringCopies(MOP1.getReg(), MRI);
+ if (!I1)
+ return false;
+ MachineInstr *I2 = getDefIgnoringCopies(MOP2.getReg(), MRI);
+ if (!I2)
+ return false;
+
+ // Handle a case like this:
+ //
+ // %0:_(s64), %1:_(s64) = G_UNMERGE_VALUES %2:_(<2 x s64>)
+ //
+ // Even though %0 and %1 are produced by the same instruction they are not
+ // the same values.
+ if (I1 == I2)
+ return MOP1.getReg() == MOP2.getReg();
+
+ // If we have an instruction which loads or stores, we can't guarantee that
+ // it is identical.
+ //
+ // For example, we may have
+ //
+ // %x1 = G_LOAD %addr (load N from @somewhere)
+ // ...
+ // call @foo
+ // ...
+ // %x2 = G_LOAD %addr (load N from @somewhere)
+ // ...
+ // %or = G_OR %x1, %x2
+ //
+ // It's possible that @foo will modify whatever lives at the address we're
+ // loading from. To be safe, let's just assume that all loads and stores
+ // are different (unless we have something which is guaranteed to not
+ // change.)
+ if (I1->mayLoadOrStore() && !I1->isDereferenceableInvariantLoad(nullptr))
+ return false;
+
+ // Check for physical registers on the instructions first to avoid cases
+ // like this:
+ //
+ // %a = COPY $physreg
+ // ...
+ // SOMETHING implicit-def $physreg
+ // ...
+ // %b = COPY $physreg
+ //
+ // These copies are not equivalent.
+ if (any_of(I1->uses(), [](const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg().isPhysical();
+ })) {
+ // Check if we have a case like this:
+ //
+ // %a = COPY $physreg
+ // %b = COPY %a
+ //
+ // In this case, I1 and I2 will both be equal to %a = COPY $physreg.
+ // From that, we know that they must have the same value, since they must
+ // have come from the same COPY.
+ return I1->isIdenticalTo(*I2);
+ }
+
+ // We don't have any physical registers, so we don't necessarily need the
+ // same vreg defs.
+ //
+ // On the off-chance that there's some target instruction feeding into the
+ // instruction, let's use produceSameValue instead of isIdenticalTo.
+ return Builder.getTII().produceSameValue(*I1, *I2, &MRI);
+}
+
+bool CombinerHelper::matchConstantOp(const MachineOperand &MOP, int64_t C) {
+ if (!MOP.isReg())
+ return false;
+ // MIPatternMatch doesn't let us look through G_ZEXT etc.
+ auto ValAndVReg = getConstantVRegValWithLookThrough(MOP.getReg(), MRI);
+ return ValAndVReg && ValAndVReg->Value == C;
+}
+
+bool CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI,
+ unsigned OpIdx) {
+ assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?");
+ Register OldReg = MI.getOperand(0).getReg();
+ Register Replacement = MI.getOperand(OpIdx).getReg();
+ assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?");
+ MI.eraseFromParent();
+ replaceRegWith(MRI, OldReg, Replacement);
+ return true;
+}
+
+bool CombinerHelper::matchSelectSameVal(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SELECT);
+ // Match (cond ? x : x)
+ return matchEqualDefs(MI.getOperand(2), MI.getOperand(3)) &&
+ canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(2).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchBinOpSameVal(MachineInstr &MI) {
+ return matchEqualDefs(MI.getOperand(1), MI.getOperand(2)) &&
+ canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::matchOperandIsZero(MachineInstr &MI, unsigned OpIdx) {
+ return matchConstantOp(MI.getOperand(OpIdx), 0) &&
+ canReplaceReg(MI.getOperand(0).getReg(), MI.getOperand(OpIdx).getReg(),
+ MRI);
+}
+
+bool CombinerHelper::replaceInstWithFConstant(MachineInstr &MI, double C) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildFConstant(MI.getOperand(0), C);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::replaceInstWithConstant(MachineInstr &MI, int64_t C) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildConstant(MI.getOperand(0), C);
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::replaceInstWithUndef(MachineInstr &MI) {
+ assert(MI.getNumDefs() == 1 && "Expected only one def?");
+ Builder.setInstr(MI);
+ Builder.buildUndef(MI.getOperand(0));
+ MI.eraseFromParent();
+ return true;
+}
+
+bool CombinerHelper::matchSimplifyAddToSub(
+ MachineInstr &MI, std::tuple<Register, Register> &MatchInfo) {
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+ Register &NewLHS = std::get<0>(MatchInfo);
+ Register &NewRHS = std::get<1>(MatchInfo);
+
+ // Helper lambda to check for opportunities for
+ // ((0-A) + B) -> B - A
+ // (A + (0-B)) -> A - B
+ auto CheckFold = [&](Register &MaybeSub, Register &MaybeNewLHS) {
+ int64_t Cst;
+ if (!mi_match(MaybeSub, MRI, m_GSub(m_ICst(Cst), m_Reg(NewRHS))) ||
+ Cst != 0)
+ return false;
+ NewLHS = MaybeNewLHS;
+ return true;
+ };
+
+ return CheckFold(LHS, RHS) || CheckFold(RHS, LHS);
+}
+
+bool CombinerHelper::applySimplifyAddToSub(
+ MachineInstr &MI, std::tuple<Register, Register> &MatchInfo) {
+ Builder.setInstr(MI);
+ Register SubLHS, SubRHS;
+ std::tie(SubLHS, SubRHS) = MatchInfo;
+ Builder.buildSub(MI.getOperand(0).getReg(), SubLHS, SubRHS);
+ MI.eraseFromParent();
+ return true;
+}
+
bool CombinerHelper::tryCombine(MachineInstr &MI) {
if (tryCombineCopy(MI))
return true;
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp b/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
index 62b903c30b89..bdaa6378e901 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelChangeObserver.cpp
@@ -38,3 +38,11 @@ RAIIDelegateInstaller::RAIIDelegateInstaller(MachineFunction &MF,
}
RAIIDelegateInstaller::~RAIIDelegateInstaller() { MF.resetDelegate(Delegate); }
+
+RAIIMFObserverInstaller::RAIIMFObserverInstaller(MachineFunction &MF,
+ GISelChangeObserver &Observer)
+ : MF(MF) {
+ MF.setObserver(&Observer);
+}
+
+RAIIMFObserverInstaller::~RAIIMFObserverInstaller() { MF.setObserver(nullptr); }
diff --git a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
index 64023ecfad82..0e9c6e4fab9f 100644
--- a/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/GISelKnownBits.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------===//
#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -24,54 +25,50 @@ using namespace llvm;
char llvm::GISelKnownBitsAnalysis::ID = 0;
-INITIALIZE_PASS_BEGIN(GISelKnownBitsAnalysis, DEBUG_TYPE,
- "Analysis for ComputingKnownBits", false, true)
-INITIALIZE_PASS_END(GISelKnownBitsAnalysis, DEBUG_TYPE,
- "Analysis for ComputingKnownBits", false, true)
+INITIALIZE_PASS(GISelKnownBitsAnalysis, DEBUG_TYPE,
+ "Analysis for ComputingKnownBits", false, true)
-GISelKnownBits::GISelKnownBits(MachineFunction &MF)
+GISelKnownBits::GISelKnownBits(MachineFunction &MF, unsigned MaxDepth)
: MF(MF), MRI(MF.getRegInfo()), TL(*MF.getSubtarget().getTargetLowering()),
- DL(MF.getFunction().getParent()->getDataLayout()) {}
+ DL(MF.getFunction().getParent()->getDataLayout()), MaxDepth(MaxDepth) {}
-Align GISelKnownBits::inferAlignmentForFrameIdx(int FrameIdx, int Offset,
- const MachineFunction &MF) {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
- return commonAlignment(Align(MFI.getObjectAlignment(FrameIdx)), Offset);
- // TODO: How to handle cases with Base + Offset?
-}
-
-MaybeAlign GISelKnownBits::inferPtrAlignment(const MachineInstr &MI) {
- if (MI.getOpcode() == TargetOpcode::G_FRAME_INDEX) {
- int FrameIdx = MI.getOperand(1).getIndex();
- return inferAlignmentForFrameIdx(FrameIdx, 0, *MI.getMF());
+Align GISelKnownBits::computeKnownAlignment(Register R, unsigned Depth) {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case TargetOpcode::COPY:
+ return computeKnownAlignment(MI->getOperand(1).getReg(), Depth);
+ case TargetOpcode::G_FRAME_INDEX: {
+ int FrameIdx = MI->getOperand(1).getIndex();
+ return MF.getFrameInfo().getObjectAlign(FrameIdx);
+ }
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ default:
+ return TL.computeKnownAlignForTargetInstr(*this, R, MRI, Depth + 1);
}
- return None;
-}
-
-void GISelKnownBits::computeKnownBitsForFrameIndex(Register R, KnownBits &Known,
- const APInt &DemandedElts,
- unsigned Depth) {
- const MachineInstr &MI = *MRI.getVRegDef(R);
- computeKnownBitsForAlignment(Known, inferPtrAlignment(MI));
-}
-
-void GISelKnownBits::computeKnownBitsForAlignment(KnownBits &Known,
- MaybeAlign Alignment) {
- if (Alignment)
- // The low bits are known zero if the pointer is aligned.
- Known.Zero.setLowBits(Log2(Alignment));
}
KnownBits GISelKnownBits::getKnownBits(MachineInstr &MI) {
+ assert(MI.getNumExplicitDefs() == 1 &&
+ "expected single return generic instruction");
return getKnownBits(MI.getOperand(0).getReg());
}
KnownBits GISelKnownBits::getKnownBits(Register R) {
- KnownBits Known;
- LLT Ty = MRI.getType(R);
+ const LLT Ty = MRI.getType(R);
APInt DemandedElts =
Ty.isVector() ? APInt::getAllOnesValue(Ty.getNumElements()) : APInt(1, 1);
+ return getKnownBits(R, DemandedElts);
+}
+
+KnownBits GISelKnownBits::getKnownBits(Register R, const APInt &DemandedElts,
+ unsigned Depth) {
+ // For now, we only maintain the cache during one request.
+ assert(ComputeKnownBitsCache.empty() && "Cache should have been cleared");
+
+ KnownBits Known;
computeKnownBitsImpl(R, Known, DemandedElts);
+ ComputeKnownBitsCache.clear();
return Known;
}
@@ -87,6 +84,17 @@ APInt GISelKnownBits::getKnownZeroes(Register R) {
APInt GISelKnownBits::getKnownOnes(Register R) { return getKnownBits(R).One; }
+LLVM_ATTRIBUTE_UNUSED static void
+dumpResult(const MachineInstr &MI, const KnownBits &Known, unsigned Depth) {
+ dbgs() << "[" << Depth << "] Compute known bits: " << MI << "[" << Depth
+ << "] Computed for: " << MI << "[" << Depth << "] Known: 0x"
+ << (Known.Zero | Known.One).toString(16, false) << "\n"
+ << "[" << Depth << "] Zero: 0x" << Known.Zero.toString(16, false)
+ << "\n"
+ << "[" << Depth << "] One: 0x" << Known.One.toString(16, false)
+ << "\n";
+}
+
void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
const APInt &DemandedElts,
unsigned Depth) {
@@ -104,12 +112,28 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
}
unsigned BitWidth = DstTy.getSizeInBits();
+ auto CacheEntry = ComputeKnownBitsCache.find(R);
+ if (CacheEntry != ComputeKnownBitsCache.end()) {
+ Known = CacheEntry->second;
+ LLVM_DEBUG(dbgs() << "Cache hit at ");
+ LLVM_DEBUG(dumpResult(MI, Known, Depth));
+ assert(Known.getBitWidth() == BitWidth && "Cache entry size doesn't match");
+ return;
+ }
Known = KnownBits(BitWidth); // Don't know anything
if (DstTy.isVector())
return; // TODO: Handle vectors.
- if (Depth == getMaxDepth())
+ // Depth may get bigger than max depth if it gets passed to a different
+ // GISelKnownBits object.
+ // This may happen when, say, a generic part uses a GISelKnownBits object
+ // with some max depth, but then we hit TL.computeKnownBitsForTargetInstr
+ // which creates a new GISelKnownBits object with a different and smaller
+ // depth. If we just check for equality, we would never exit if the depth
+ // that is passed down to the target specific GISelKnownBits object is
+ // already bigger than its max depth.
+ if (Depth >= getMaxDepth())
return;
if (!DemandedElts)
@@ -122,20 +146,53 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
TL.computeKnownBitsForTargetInstr(*this, R, Known, DemandedElts, MRI,
Depth);
break;
- case TargetOpcode::COPY: {
- MachineOperand Dst = MI.getOperand(0);
- MachineOperand Src = MI.getOperand(1);
- // Look through trivial copies but don't look through trivial copies of the
- // form `%1:(s32) = OP %0:gpr32` known-bits analysis is currently unable to
- // determine the bit width of a register class.
- //
- // We can't use NoSubRegister by name as it's defined by each target but
- // it's always defined to be 0 by tablegen.
- if (Dst.getSubReg() == 0 /*NoSubRegister*/ && Src.getReg().isVirtual() &&
- Src.getSubReg() == 0 /*NoSubRegister*/ &&
- MRI.getType(Src.getReg()).isValid()) {
- // Don't increment Depth for this one since we didn't do any work.
- computeKnownBitsImpl(Src.getReg(), Known, DemandedElts, Depth);
+ case TargetOpcode::COPY:
+ case TargetOpcode::G_PHI:
+ case TargetOpcode::PHI: {
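+ // Start from the "all bits known" state and intersect (&=) with the known
+ // bits of each incoming register below; whatever survives the intersection
+ // is known for the PHI (or COPY) result.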
+ Known.One = APInt::getAllOnesValue(BitWidth);
+ Known.Zero = APInt::getAllOnesValue(BitWidth);
+ // Destination registers should not have subregisters at this
+ // point of the pipeline, otherwise the main live-range will be
+ // defined more than once, which is against SSA.
+ assert(MI.getOperand(0).getSubReg() == 0 && "Is this code in SSA?");
+ // Record in the cache that we know nothing for MI.
+ // This will get updated later and in the meantime, if we reach that
+ // phi again, because of a loop, we will cut the search thanks to this
+ // cache entry.
+ // We could actually build up more information on the phi by not cutting
+ // the search, but that additional information is more a side effect
+ // than an intended choice.
+ // Therefore, for now, save on compile time until we derive a proper way
+ // to derive known bits for PHIs within loops.
+ ComputeKnownBitsCache[R] = KnownBits(BitWidth);
+ // PHI's operand are a mix of registers and basic blocks interleaved.
+ // We only care about the register ones.
+ for (unsigned Idx = 1; Idx < MI.getNumOperands(); Idx += 2) {
+ const MachineOperand &Src = MI.getOperand(Idx);
+ Register SrcReg = Src.getReg();
+ // Look through trivial copies and phis but don't look through trivial
+ // copies or phis of the form `%1:(s32) = OP %0:gpr32`, since known-bits
+ // analysis is currently unable to determine the bit width of a
+ // register class.
+ //
+ // We can't use NoSubRegister by name as it's defined by each target but
+ // it's always defined to be 0 by tablegen.
+ if (SrcReg.isVirtual() && Src.getSubReg() == 0 /*NoSubRegister*/ &&
+ MRI.getType(SrcReg).isValid()) {
+ // For COPYs we don't do anything, don't increase the depth.
+ computeKnownBitsImpl(SrcReg, Known2, DemandedElts,
+ Depth + (Opcode != TargetOpcode::COPY));
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
+ // If we reach a point where we don't know anything
+ // just stop looking through the operands.
+ if (Known.One == 0 && Known.Zero == 0)
+ break;
+ } else {
+ // We know nothing.
+ Known = KnownBits(BitWidth);
+ break;
+ }
}
break;
}
@@ -148,22 +205,17 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
break;
}
case TargetOpcode::G_FRAME_INDEX: {
- computeKnownBitsForFrameIndex(R, Known, DemandedElts);
+ int FrameIdx = MI.getOperand(1).getIndex();
+ TL.computeKnownBitsForFrameIndex(FrameIdx, Known, MF);
break;
}
case TargetOpcode::G_SUB: {
- // If low bits are known to be zero in both operands, then we know they are
- // going to be 0 in the result. Both addition and complement operations
- // preserve the low zero bits.
- computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
Depth + 1);
- unsigned KnownZeroLow = Known2.countMinTrailingZeros();
- if (KnownZeroLow == 0)
- break;
computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
Depth + 1);
- KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
- Known.Zero.setLowBits(KnownZeroLow);
+ Known = KnownBits::computeForAddSub(/*Add*/ false, /*NSW*/ false, Known,
+ Known2);
break;
}
case TargetOpcode::G_XOR: {
@@ -172,11 +224,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
Depth + 1);
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
- Known.Zero = KnownZeroOut;
+ Known ^= Known2;
break;
}
case TargetOpcode::G_PTR_ADD: {
@@ -187,24 +235,12 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
LLVM_FALLTHROUGH;
}
case TargetOpcode::G_ADD: {
- // Output known-0 bits are known if clear or set in both the low clear bits
- // common to both LHS & RHS. For example, 8+(X<<3) is known to have the
- // low 3 bits clear.
- // Output known-0 bits are also known if the top bits of each input are
- // known to be clear. For example, if one input has the top 10 bits clear
- // and the other has the top 8 bits clear, we know the top 7 bits of the
- // output must be clear.
- computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
+ computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
Depth + 1);
- unsigned KnownZeroHigh = Known2.countMinLeadingZeros();
- unsigned KnownZeroLow = Known2.countMinTrailingZeros();
computeKnownBitsImpl(MI.getOperand(2).getReg(), Known2, DemandedElts,
Depth + 1);
- KnownZeroHigh = std::min(KnownZeroHigh, Known2.countMinLeadingZeros());
- KnownZeroLow = std::min(KnownZeroLow, Known2.countMinTrailingZeros());
- Known.Zero.setLowBits(KnownZeroLow);
- if (KnownZeroHigh > 1)
- Known.Zero.setHighBits(KnownZeroHigh - 1);
+ Known =
+ KnownBits::computeForAddSub(/*Add*/ true, /*NSW*/ false, Known, Known2);
break;
}
case TargetOpcode::G_AND: {
@@ -214,10 +250,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
Depth + 1);
- // Output known-1 bits are only known if set in both the LHS & RHS.
- Known.One &= Known2.One;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- Known.Zero |= Known2.Zero;
+ Known &= Known2;
break;
}
case TargetOpcode::G_OR: {
@@ -227,10 +260,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
computeKnownBitsImpl(MI.getOperand(1).getReg(), Known2, DemandedElts,
Depth + 1);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
}
case TargetOpcode::G_MUL: {
@@ -287,7 +317,7 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
case TargetOpcode::G_ANYEXT: {
computeKnownBitsImpl(MI.getOperand(1).getReg(), Known, DemandedElts,
Depth + 1);
- Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
+ Known = Known.zext(BitWidth);
break;
}
case TargetOpcode::G_LOAD: {
@@ -353,9 +383,9 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
? DL.getIndexSizeInBits(SrcTy.getAddressSpace())
: SrcTy.getSizeInBits();
assert(SrcBitWidth && "SrcBitWidth can't be zero");
- Known = Known.zextOrTrunc(SrcBitWidth, true);
+ Known = Known.zextOrTrunc(SrcBitWidth);
computeKnownBitsImpl(SrcReg, Known, DemandedElts, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth, true);
+ Known = Known.zextOrTrunc(BitWidth);
if (BitWidth > SrcBitWidth)
Known.Zero.setBitsFrom(SrcBitWidth);
break;
@@ -363,14 +393,10 @@ void GISelKnownBits::computeKnownBitsImpl(Register R, KnownBits &Known,
}
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
- LLVM_DEBUG(dbgs() << "[" << Depth << "] Compute known bits: " << MI << "["
- << Depth << "] Computed for: " << MI << "[" << Depth
- << "] Known: 0x"
- << (Known.Zero | Known.One).toString(16, false) << "\n"
- << "[" << Depth << "] Zero: 0x"
- << Known.Zero.toString(16, false) << "\n"
- << "[" << Depth << "] One: 0x"
- << Known.One.toString(16, false) << "\n");
+ LLVM_DEBUG(dumpResult(MI, Known, Depth));
+
+ // Update the cache.
+ ComputeKnownBitsCache[R] = Known;
}
unsigned GISelKnownBits::computeNumSignBits(Register R,
@@ -389,6 +415,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
return 1; // No demanded elts, better to assume we don't know anything.
LLT DstTy = MRI.getType(R);
+ const unsigned TyBits = DstTy.getScalarSizeInBits();
// Handle the case where this is called on a register that does not have a
// type constraint. This is unlikely to occur except by looking through copies
@@ -397,6 +424,7 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
if (!DstTy.isValid())
return 1;
+ unsigned FirstAnswer = 1;
switch (Opcode) {
case TargetOpcode::COPY: {
MachineOperand &Src = MI.getOperand(1);
@@ -414,6 +442,16 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
unsigned Tmp = DstTy.getScalarSizeInBits() - SrcTy.getScalarSizeInBits();
return computeNumSignBits(Src, DemandedElts, Depth + 1) + Tmp;
}
+ case TargetOpcode::G_SEXTLOAD: {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+ // TODO: add vector support
+ if (Ty.isVector())
+ break;
+ if (MI.hasOneMemOperand())
+ return Ty.getSizeInBits() - (*MI.memoperands_begin())->getSizeInBits();
+ break;
+ }
case TargetOpcode::G_TRUNC: {
Register Src = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(Src);
@@ -426,13 +464,34 @@ unsigned GISelKnownBits::computeNumSignBits(Register R,
return NumSrcSignBits - (NumSrcBits - DstTyBits);
break;
}
- default:
+ case TargetOpcode::G_INTRINSIC:
+ case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
+ default: {
+ unsigned NumBits =
+ TL.computeNumSignBitsForTargetInstr(*this, R, DemandedElts, MRI, Depth);
+ if (NumBits > 1)
+ FirstAnswer = std::max(FirstAnswer, NumBits);
break;
}
+ }
+
+ // Finally, if we can prove that the top bits of the result are 0's or 1's,
+ // use this information.
+ KnownBits Known = getKnownBits(R, DemandedElts, Depth);
+ APInt Mask;
+ if (Known.isNonNegative()) { // sign bit is 0
+ Mask = Known.Zero;
+ } else if (Known.isNegative()) { // sign bit is 1;
+ Mask = Known.One;
+ } else {
+ // Nothing known.
+ return FirstAnswer;
+ }
- // TODO: Handle target instructions
- // TODO: Fall back to known bits
- return 1;
+ // Okay, we know that the sign bit in Mask is set. Use CLO to determine
+ // the number of identical bits in the top of the input value.
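+ // For example, for a 32-bit value whose top 16 bits are known to be zero,
+ // Mask = Known.Zero = 0xffff0000 has 16 leading ones, so we report at least
+ // 16 sign bits.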
+ Mask <<= Mask.getBitWidth() - TyBits;
+ return std::max(FirstAnswer, Mask.countLeadingOnes());
}
unsigned GISelKnownBits::computeNumSignBits(Register R, unsigned Depth) {
diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
index 17eca2b0301c..8f6643b2f193 100644
--- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp
@@ -16,12 +16,13 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
+#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/Analysis.h"
-#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/LowLevelType.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -47,7 +48,6 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GetElementPtrTypeIterator.h"
#include "llvm/IR/InlineAsm.h"
-#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
@@ -232,46 +232,35 @@ int IRTranslator::getOrCreateFrameIndex(const AllocaInst &AI) {
// Always allocate at least one byte.
Size = std::max<uint64_t>(Size, 1u);
- unsigned Alignment = AI.getAlignment();
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(AI.getAllocatedType());
-
int &FI = FrameIndices[&AI];
- FI = MF->getFrameInfo().CreateStackObject(Size, Alignment, false, &AI);
+ FI = MF->getFrameInfo().CreateStackObject(Size, AI.getAlign(), false, &AI);
return FI;
}
-unsigned IRTranslator::getMemOpAlignment(const Instruction &I) {
- unsigned Alignment = 0;
- Type *ValTy = nullptr;
- if (const StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- Alignment = SI->getAlignment();
- ValTy = SI->getValueOperand()->getType();
- } else if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- Alignment = LI->getAlignment();
- ValTy = LI->getType();
- } else if (const AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
+Align IRTranslator::getMemOpAlign(const Instruction &I) {
+ if (const StoreInst *SI = dyn_cast<StoreInst>(&I))
+ return SI->getAlign();
+ if (const LoadInst *LI = dyn_cast<LoadInst>(&I)) {
+ return LI->getAlign();
+ }
+ if (const AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
// TODO(PR27168): This instruction has no alignment attribute, but unlike
// the default alignment for load/store, the default here is to assume
// it has NATURAL alignment, not DataLayout-specified alignment.
const DataLayout &DL = AI->getModule()->getDataLayout();
- Alignment = DL.getTypeStoreSize(AI->getCompareOperand()->getType());
- ValTy = AI->getCompareOperand()->getType();
- } else if (const AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
+ return Align(DL.getTypeStoreSize(AI->getCompareOperand()->getType()));
+ }
+ if (const AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
// TODO(PR27168): This instruction has no alignment attribute, but unlike
// the default alignment for load/store, the default here is to assume
// it has NATURAL alignment, not DataLayout-specified alignment.
const DataLayout &DL = AI->getModule()->getDataLayout();
- Alignment = DL.getTypeStoreSize(AI->getValOperand()->getType());
- ValTy = AI->getType();
- } else {
- OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
- R << "unable to translate memop: " << ore::NV("Opcode", &I);
- reportTranslationError(*MF, *TPC, *ORE, R);
- return 1;
+ return Align(DL.getTypeStoreSize(AI->getValOperand()->getType()));
}
-
- return Alignment ? Alignment : DL->getABITypeAlignment(ValTy);
+ OptimizationRemarkMissed R("gisel-irtranslator", "", &I);
+ R << "unable to translate memop: " << ore::NV("Opcode", &I);
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return Align(1);
}
MachineBasicBlock &IRTranslator::getMBB(const BasicBlock &BB) {
@@ -316,7 +305,7 @@ bool IRTranslator::translateFSub(const User &U, MachineIRBuilder &MIRBuilder) {
Flags = MachineInstr::copyFlagsFromInstruction(I);
}
// Negate the last operand of the FSUB
- MIRBuilder.buildInstr(TargetOpcode::G_FNEG, {Res}, {Op1}, Flags);
+ MIRBuilder.buildFNeg(Res, Op1, Flags);
return true;
}
return translateBinaryOp(TargetOpcode::G_FSUB, U, MIRBuilder);
@@ -330,7 +319,7 @@ bool IRTranslator::translateFNeg(const User &U, MachineIRBuilder &MIRBuilder) {
const Instruction &I = cast<Instruction>(U);
Flags = MachineInstr::copyFlagsFromInstruction(I);
}
- MIRBuilder.buildInstr(TargetOpcode::G_FNEG, {Res}, {Op0}, Flags);
+ MIRBuilder.buildFNeg(Res, Op0, Flags);
return true;
}
@@ -353,8 +342,8 @@ bool IRTranslator::translateCompare(const User &U,
Res, getOrCreateVReg(*Constant::getAllOnesValue(U.getType())));
else {
assert(CI && "Instruction should be CmpInst");
- MIRBuilder.buildInstr(TargetOpcode::G_FCMP, {Res}, {Pred, Op0, Op1},
- MachineInstr::copyFlagsFromInstruction(*CI));
+ MIRBuilder.buildFCmp(Pred, Res, Op0, Op1,
+ MachineInstr::copyFlagsFromInstruction(*CI));
}
return true;
@@ -603,7 +592,7 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
Cond =
MIB.buildICmp(CmpInst::ICMP_SLE, i1Ty, CmpOpReg, CondRHS).getReg(0);
} else {
- const LLT &CmpTy = MRI->getType(CmpOpReg);
+ const LLT CmpTy = MRI->getType(CmpOpReg);
auto Sub = MIB.buildSub({CmpTy}, CmpOpReg, CondLHS);
auto Diff = MIB.buildConstant(CmpTy, High - Low);
Cond = MIB.buildICmp(CmpInst::ICMP_ULE, i1Ty, Sub, Diff).getReg(0);
@@ -631,8 +620,7 @@ void IRTranslator::emitSwitchCase(SwitchCG::CaseBlock &CB,
if (CB.TrueBB == CB.ThisBB->getNextNode()) {
std::swap(CB.TrueBB, CB.FalseBB);
auto True = MIB.buildConstant(i1Ty, 1);
- Cond = MIB.buildInstr(TargetOpcode::G_XOR, {i1Ty}, {Cond, True}, None)
- .getReg(0);
+ Cond = MIB.buildXor(i1Ty, Cond, True).getReg(0);
}
MIB.buildBrCond(Cond, *CB.TrueBB);
@@ -842,9 +830,16 @@ bool IRTranslator::translateIndirectBr(const User &U,
MIRBuilder.buildBrIndirect(Tgt);
// Link successors.
+ SmallPtrSet<const BasicBlock *, 32> AddedSuccessors;
MachineBasicBlock &CurBB = MIRBuilder.getMBB();
- for (const BasicBlock *Succ : successors(&BrInst))
+ for (const BasicBlock *Succ : successors(&BrInst)) {
+ // It's legal for indirectbr instructions to have duplicate blocks in the
+ // destination list. We don't allow this in MIR. Skip anything that's
+ // already a successor.
+ if (!AddedSuccessors.insert(Succ).second)
+ continue;
CurBB.addSuccessor(&getMBB(*Succ));
+ }
return true;
}
@@ -859,11 +854,6 @@ static bool isSwiftError(const Value *V) {
bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
const LoadInst &LI = cast<LoadInst>(U);
-
- auto Flags = LI.isVolatile() ? MachineMemOperand::MOVolatile
- : MachineMemOperand::MONone;
- Flags |= MachineMemOperand::MOLoad;
-
if (DL->getTypeStoreSize(LI.getType()) == 0)
return true;
@@ -882,6 +872,9 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
return true;
}
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ MachineMemOperand::Flags Flags = TLI.getLoadMemOperandFlags(LI, *DL);
+
const MDNode *Ranges =
Regs.size() == 1 ? LI.getMetadata(LLVMContext::MD_range) : nullptr;
for (unsigned i = 0; i < Regs.size(); ++i) {
@@ -889,12 +882,12 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(LI.getPointerOperand(), Offsets[i] / 8);
- unsigned BaseAlign = getMemOpAlignment(LI);
+ Align BaseAlign = getMemOpAlign(LI);
AAMDNodes AAMetadata;
LI.getAAMetadata(AAMetadata);
auto MMO = MF->getMachineMemOperand(
- Ptr, Flags, (MRI->getType(Regs[i]).getSizeInBits() + 7) / 8,
- MinAlign(BaseAlign, Offsets[i] / 8), AAMetadata, Ranges,
+ Ptr, Flags, MRI->getType(Regs[i]).getSizeInBytes(),
+ commonAlignment(BaseAlign, Offsets[i] / 8), AAMetadata, Ranges,
LI.getSyncScopeID(), LI.getOrdering());
MIRBuilder.buildLoad(Regs[i], Addr, *MMO);
}
@@ -904,10 +897,6 @@ bool IRTranslator::translateLoad(const User &U, MachineIRBuilder &MIRBuilder) {
bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
const StoreInst &SI = cast<StoreInst>(U);
- auto Flags = SI.isVolatile() ? MachineMemOperand::MOVolatile
- : MachineMemOperand::MONone;
- Flags |= MachineMemOperand::MOStore;
-
if (DL->getTypeStoreSize(SI.getValueOperand()->getType()) == 0)
return true;
@@ -927,17 +916,20 @@ bool IRTranslator::translateStore(const User &U, MachineIRBuilder &MIRBuilder) {
return true;
}
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ MachineMemOperand::Flags Flags = TLI.getStoreMemOperandFlags(SI, *DL);
+
for (unsigned i = 0; i < Vals.size(); ++i) {
Register Addr;
MIRBuilder.materializePtrAdd(Addr, Base, OffsetTy, Offsets[i] / 8);
MachinePointerInfo Ptr(SI.getPointerOperand(), Offsets[i] / 8);
- unsigned BaseAlign = getMemOpAlignment(SI);
+ Align BaseAlign = getMemOpAlign(SI);
AAMDNodes AAMetadata;
SI.getAAMetadata(AAMetadata);
auto MMO = MF->getMachineMemOperand(
- Ptr, Flags, (MRI->getType(Vals[i]).getSizeInBits() + 7) / 8,
- MinAlign(BaseAlign, Offsets[i] / 8), AAMetadata, nullptr,
+ Ptr, Flags, MRI->getType(Vals[i]).getSizeInBytes(),
+ commonAlignment(BaseAlign, Offsets[i] / 8), AAMetadata, nullptr,
SI.getSyncScopeID(), SI.getOrdering());
MIRBuilder.buildStore(Vals[i], Addr, *MMO);
}
@@ -1010,36 +1002,39 @@ bool IRTranslator::translateSelect(const User &U,
ArrayRef<Register> Op0Regs = getOrCreateVRegs(*U.getOperand(1));
ArrayRef<Register> Op1Regs = getOrCreateVRegs(*U.getOperand(2));
- const SelectInst &SI = cast<SelectInst>(U);
uint16_t Flags = 0;
- if (const CmpInst *Cmp = dyn_cast<CmpInst>(SI.getCondition()))
- Flags = MachineInstr::copyFlagsFromInstruction(*Cmp);
+ if (const SelectInst *SI = dyn_cast<SelectInst>(&U))
+ Flags = MachineInstr::copyFlagsFromInstruction(*SI);
for (unsigned i = 0; i < ResRegs.size(); ++i) {
- MIRBuilder.buildInstr(TargetOpcode::G_SELECT, {ResRegs[i]},
- {Tst, Op0Regs[i], Op1Regs[i]}, Flags);
+ MIRBuilder.buildSelect(ResRegs[i], Tst, Op0Regs[i], Op1Regs[i], Flags);
}
return true;
}
+bool IRTranslator::translateCopy(const User &U, const Value &V,
+ MachineIRBuilder &MIRBuilder) {
+ Register Src = getOrCreateVReg(V);
+ auto &Regs = *VMap.getVRegs(U);
+ if (Regs.empty()) {
+ Regs.push_back(Src);
+ VMap.getOffsets(U)->push_back(0);
+ } else {
+ // If we already assigned a vreg for this instruction, we can't change that.
+ // Emit a copy to satisfy the users we already emitted.
+ MIRBuilder.buildCopy(Regs[0], Src);
+ }
+ return true;
+}
+
bool IRTranslator::translateBitCast(const User &U,
MachineIRBuilder &MIRBuilder) {
// If we're bitcasting to the source type, we can reuse the source vreg.
if (getLLTForType(*U.getOperand(0)->getType(), *DL) ==
- getLLTForType(*U.getType(), *DL)) {
- Register SrcReg = getOrCreateVReg(*U.getOperand(0));
- auto &Regs = *VMap.getVRegs(U);
- // If we already assigned a vreg for this bitcast, we can't change that.
- // Emit a copy to satisfy the users we already emitted.
- if (!Regs.empty())
- MIRBuilder.buildCopy(Regs[0], SrcReg);
- else {
- Regs.push_back(SrcReg);
- VMap.getOffsets(U)->push_back(0);
- }
- return true;
- }
+ getLLTForType(*U.getType(), *DL))
+ return translateCopy(U, *U.getOperand(0), MIRBuilder);
+
return translateCast(TargetOpcode::G_BITCAST, U, MIRBuilder);
}
@@ -1053,10 +1048,6 @@ bool IRTranslator::translateCast(unsigned Opcode, const User &U,
bool IRTranslator::translateGetElementPtr(const User &U,
MachineIRBuilder &MIRBuilder) {
- // FIXME: support vector GEPs.
- if (U.getType()->isVectorTy())
- return false;
-
Value &Op0 = *U.getOperand(0);
Register BaseReg = getOrCreateVReg(Op0);
Type *PtrIRTy = Op0.getType();
@@ -1064,6 +1055,24 @@ bool IRTranslator::translateGetElementPtr(const User &U,
Type *OffsetIRTy = DL->getIntPtrType(PtrIRTy);
LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ // Normalize Vector GEP - all scalar operands should be converted to splat
+ // vectors.
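+ // For example, for "getelementptr i32, i32* %p, <4 x i64> %idx" the scalar
+ // base %p is splat into a <4 x p0> vector so the per-lane pointer arithmetic
+ // below produces a vector of pointers.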
+ unsigned VectorWidth = 0;
+ if (auto *VT = dyn_cast<VectorType>(U.getType()))
+ VectorWidth = cast<FixedVectorType>(VT)->getNumElements();
+
+ // We might need to splat the base pointer into a vector if the offsets
+ // are vectors.
+ if (VectorWidth && !PtrTy.isVector()) {
+ BaseReg =
+ MIRBuilder.buildSplatVector(LLT::vector(VectorWidth, PtrTy), BaseReg)
+ .getReg(0);
+ PtrIRTy = FixedVectorType::get(PtrIRTy, VectorWidth);
+ PtrTy = getLLTForType(*PtrIRTy, *DL);
+ OffsetIRTy = DL->getIntPtrType(PtrIRTy);
+ OffsetTy = getLLTForType(*OffsetIRTy, *DL);
+ }
+
int64_t Offset = 0;
for (gep_type_iterator GTI = gep_type_begin(&U), E = gep_type_end(&U);
GTI != E; ++GTI) {
@@ -1083,7 +1092,6 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
if (Offset != 0) {
- LLT OffsetTy = getLLTForType(*OffsetIRTy, *DL);
auto OffsetMIB = MIRBuilder.buildConstant({OffsetTy}, Offset);
BaseReg = MIRBuilder.buildPtrAdd(PtrTy, BaseReg, OffsetMIB.getReg(0))
.getReg(0);
@@ -1091,8 +1099,15 @@ bool IRTranslator::translateGetElementPtr(const User &U,
}
Register IdxReg = getOrCreateVReg(*Idx);
- if (MRI->getType(IdxReg) != OffsetTy)
+ LLT IdxTy = MRI->getType(IdxReg);
+ if (IdxTy != OffsetTy) {
+ if (!IdxTy.isVector() && VectorWidth) {
+ IdxReg = MIRBuilder.buildSplatVector(
+ OffsetTy.changeElementType(IdxTy), IdxReg).getReg(0);
+ }
+
IdxReg = MIRBuilder.buildSExtOrTrunc(OffsetTy, IdxReg).getReg(0);
+ }
// N = N + Idx * ElementSize;
// Avoid doing it for ElementSize of 1.
@@ -1101,7 +1116,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
auto ElementSizeMIB = MIRBuilder.buildConstant(
getLLTForType(*OffsetIRTy, *DL), ElementSize);
GepOffsetReg =
- MIRBuilder.buildMul(OffsetTy, ElementSizeMIB, IdxReg).getReg(0);
+ MIRBuilder.buildMul(OffsetTy, IdxReg, ElementSizeMIB).getReg(0);
} else
GepOffsetReg = IdxReg;
@@ -1111,7 +1126,7 @@ bool IRTranslator::translateGetElementPtr(const User &U,
if (Offset != 0) {
auto OffsetMIB =
- MIRBuilder.buildConstant(getLLTForType(*OffsetIRTy, *DL), Offset);
+ MIRBuilder.buildConstant(OffsetTy, Offset);
MIRBuilder.buildPtrAdd(getOrCreateVReg(U), BaseReg, OffsetMIB.getReg(0));
return true;
}
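For reference, a minimal plain-C++ sketch (not MIR; the names below are made up for illustration) of the address arithmetic the vector-GEP path above produces: the scalar base pointer is splat across the lanes and each lane then adds Idx[i] * ElementSize.

#include <array>
#include <cassert>
#include <cstdint>

int main() {
  uint64_t Base = 0x1000;          // scalar base pointer, splat across lanes
  unsigned ElementSize = 4;        // e.g. a GEP over i32
  std::array<int64_t, 4> Idx = {0, 1, 2, 7};
  std::array<uint64_t, 4> Addr;
  for (unsigned I = 0; I != Idx.size(); ++I)
    Addr[I] = Base + Idx[I] * ElementSize; // per-lane ptr_add of Idx * size
  assert(Addr[3] == 0x1000 + 7 * 4);
  return 0;
}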
@@ -1133,20 +1148,21 @@ bool IRTranslator::translateMemFunc(const CallInst &CI,
for (auto AI = CI.arg_begin(), AE = CI.arg_end(); std::next(AI) != AE; ++AI)
ICall.addUse(getOrCreateVReg(**AI));
- unsigned DstAlign = 0, SrcAlign = 0;
+ Align DstAlign;
+ Align SrcAlign;
unsigned IsVol =
cast<ConstantInt>(CI.getArgOperand(CI.getNumArgOperands() - 1))
->getZExtValue();
if (auto *MCI = dyn_cast<MemCpyInst>(&CI)) {
- DstAlign = std::max<unsigned>(MCI->getDestAlignment(), 1);
- SrcAlign = std::max<unsigned>(MCI->getSourceAlignment(), 1);
+ DstAlign = MCI->getDestAlign().valueOrOne();
+ SrcAlign = MCI->getSourceAlign().valueOrOne();
} else if (auto *MMI = dyn_cast<MemMoveInst>(&CI)) {
- DstAlign = std::max<unsigned>(MMI->getDestAlignment(), 1);
- SrcAlign = std::max<unsigned>(MMI->getSourceAlignment(), 1);
+ DstAlign = MMI->getDestAlign().valueOrOne();
+ SrcAlign = MMI->getSourceAlign().valueOrOne();
} else {
auto *MSI = cast<MemSetInst>(&CI);
- DstAlign = std::max<unsigned>(MSI->getDestAlignment(), 1);
+ DstAlign = MSI->getDestAlign().valueOrOne();
}
// We need to propagate the tail call flag from the IR inst as an argument.
@@ -1171,8 +1187,8 @@ void IRTranslator::getStackGuard(Register DstReg,
MachineIRBuilder &MIRBuilder) {
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
MRI->setRegClass(DstReg, TRI->getPointerRegClass(*MF));
- auto MIB = MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD);
- MIB.addDef(DstReg);
+ auto MIB =
+ MIRBuilder.buildInstr(TargetOpcode::LOAD_STACK_GUARD, {DstReg}, {});
auto &TLI = *MF->getSubtarget().getTargetLowering();
Value *Global = TLI.getSDagStackGuard(*MF->getFunction().getParent());
@@ -1184,18 +1200,16 @@ void IRTranslator::getStackGuard(Register DstReg,
MachineMemOperand::MODereferenceable;
MachineMemOperand *MemRef =
MF->getMachineMemOperand(MPInfo, Flags, DL->getPointerSizeInBits() / 8,
- DL->getPointerABIAlignment(0).value());
+ DL->getPointerABIAlignment(0));
MIB.setMemRefs({MemRef});
}
bool IRTranslator::translateOverflowIntrinsic(const CallInst &CI, unsigned Op,
MachineIRBuilder &MIRBuilder) {
ArrayRef<Register> ResRegs = getOrCreateVRegs(CI);
- MIRBuilder.buildInstr(Op)
- .addDef(ResRegs[0])
- .addDef(ResRegs[1])
- .addUse(getOrCreateVReg(*CI.getOperand(0)))
- .addUse(getOrCreateVReg(*CI.getOperand(1)));
+ MIRBuilder.buildInstr(
+ Op, {ResRegs[0], ResRegs[1]},
+ {getOrCreateVReg(*CI.getOperand(0)), getOrCreateVReg(*CI.getOperand(1))});
return true;
}
@@ -1206,8 +1220,12 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
break;
case Intrinsic::bswap:
return TargetOpcode::G_BSWAP;
- case Intrinsic::bitreverse:
+ case Intrinsic::bitreverse:
return TargetOpcode::G_BITREVERSE;
+ case Intrinsic::fshl:
+ return TargetOpcode::G_FSHL;
+ case Intrinsic::fshr:
+ return TargetOpcode::G_FSHR;
case Intrinsic::ceil:
return TargetOpcode::G_FCEIL;
case Intrinsic::cos:
@@ -1258,6 +1276,8 @@ unsigned IRTranslator::getSimpleIntrinsicOpcode(Intrinsic::ID ID) {
return TargetOpcode::G_INTRINSIC_TRUNC;
case Intrinsic::readcyclecounter:
return TargetOpcode::G_READCYCLECOUNTER;
+ case Intrinsic::ptrmask:
+ return TargetOpcode::G_PTRMASK;
}
return Intrinsic::not_intrinsic;
}
@@ -1282,6 +1302,51 @@ bool IRTranslator::translateSimpleIntrinsic(const CallInst &CI,
return true;
}
+// TODO: Include ConstrainedOps.def when all strict instructions are defined.
+static unsigned getConstrainedOpcode(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::experimental_constrained_fadd:
+ return TargetOpcode::G_STRICT_FADD;
+ case Intrinsic::experimental_constrained_fsub:
+ return TargetOpcode::G_STRICT_FSUB;
+ case Intrinsic::experimental_constrained_fmul:
+ return TargetOpcode::G_STRICT_FMUL;
+ case Intrinsic::experimental_constrained_fdiv:
+ return TargetOpcode::G_STRICT_FDIV;
+ case Intrinsic::experimental_constrained_frem:
+ return TargetOpcode::G_STRICT_FREM;
+ case Intrinsic::experimental_constrained_fma:
+ return TargetOpcode::G_STRICT_FMA;
+ case Intrinsic::experimental_constrained_sqrt:
+ return TargetOpcode::G_STRICT_FSQRT;
+ default:
+ return 0;
+ }
+}
+
+bool IRTranslator::translateConstrainedFPIntrinsic(
+ const ConstrainedFPIntrinsic &FPI, MachineIRBuilder &MIRBuilder) {
+ fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
+
+ unsigned Opcode = getConstrainedOpcode(FPI.getIntrinsicID());
+ if (!Opcode)
+ return false;
+
+ unsigned Flags = MachineInstr::copyFlagsFromInstruction(FPI);
+ if (EB == fp::ExceptionBehavior::ebIgnore)
+ Flags |= MachineInstr::NoFPExcept;
+
+ SmallVector<llvm::SrcOp, 4> VRegs;
+ VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(0)));
+ if (!FPI.isUnaryOp())
+ VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(1)));
+ if (FPI.isTernaryOp())
+ VRegs.push_back(getOrCreateVReg(*FPI.getArgOperand(2)));
+
+ MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(FPI)}, VRegs, Flags);
+ return true;
+}
+
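A hedged sketch of the flag handling in translateConstrainedFPIntrinsic, using hypothetical local enum values (the real ones are MachineInstr::NoFPExcept and fp::ExceptionBehavior): when exceptions are ignored, the instruction is marked as unable to raise FP exceptions.

#include <cassert>

// Stand-in values for illustration only; not the LLVM enumerators.
enum ExceptionBehavior { ebIgnore, ebMayTrap, ebStrict };
enum : unsigned { NoFPExcept = 1u << 0 };

static unsigned flagsFor(ExceptionBehavior EB, unsigned CopiedFlags) {
  unsigned Flags = CopiedFlags;
  if (EB == ebIgnore)
    Flags |= NoFPExcept; // exceptions are ignored, so the MI cannot trap
  return Flags;
}

int main() {
  assert(flagsFor(ebIgnore, 0) & NoFPExcept);
  assert(!(flagsFor(ebStrict, 0) & NoFPExcept));
  return 0;
}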
bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
MachineIRBuilder &MIRBuilder) {
@@ -1369,10 +1434,10 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
unsigned ListSize = TLI.getVaListSizeInBits(*DL) / 8;
// FIXME: Get alignment
- MIRBuilder.buildInstr(TargetOpcode::G_VASTART)
- .addUse(getOrCreateVReg(*Ptr))
- .addMemOperand(MF->getMachineMemOperand(
- MachinePointerInfo(Ptr), MachineMemOperand::MOStore, ListSize, 1));
+ MIRBuilder.buildInstr(TargetOpcode::G_VASTART, {}, {getOrCreateVReg(*Ptr)})
+ .addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Ptr),
+ MachineMemOperand::MOStore,
+ ListSize, Align(1)));
return true;
}
case Intrinsic::dbg_value: {
@@ -1385,7 +1450,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
if (!V) {
// Currently the optimizer can produce this; insert an undef to
// help debugging. Probably the optimizer should not do this.
- MIRBuilder.buildDirectDbgValue(0, DI.getVariable(), DI.getExpression());
+ MIRBuilder.buildIndirectDbgValue(0, DI.getVariable(), DI.getExpression());
} else if (const auto *CI = dyn_cast<Constant>(V)) {
MIRBuilder.buildConstDbgValue(*CI, DI.getVariable(), DI.getExpression());
} else {
@@ -1411,6 +1476,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
return translateOverflowIntrinsic(CI, TargetOpcode::G_UMULO, MIRBuilder);
case Intrinsic::smul_with_overflow:
return translateOverflowIntrinsic(CI, TargetOpcode::G_SMULO, MIRBuilder);
+ case Intrinsic::uadd_sat:
+ return translateBinaryOp(TargetOpcode::G_UADDSAT, CI, MIRBuilder);
+ case Intrinsic::sadd_sat:
+ return translateBinaryOp(TargetOpcode::G_SADDSAT, CI, MIRBuilder);
+ case Intrinsic::usub_sat:
+ return translateBinaryOp(TargetOpcode::G_USUBSAT, CI, MIRBuilder);
+ case Intrinsic::ssub_sat:
+ return translateBinaryOp(TargetOpcode::G_SSUBSAT, CI, MIRBuilder);
case Intrinsic::fmuladd: {
const TargetMachine &TM = MF->getTarget();
const TargetLowering &TLI = *MF->getSubtarget().getTargetLowering();
@@ -1423,14 +1496,14 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
TLI.getValueType(*DL, CI.getType()))) {
// TODO: Revisit this to see if we should move this part of the
// lowering to the combiner.
- MIRBuilder.buildInstr(TargetOpcode::G_FMA, {Dst}, {Op0, Op1, Op2},
- MachineInstr::copyFlagsFromInstruction(CI));
+ MIRBuilder.buildFMA(Dst, Op0, Op1, Op2,
+ MachineInstr::copyFlagsFromInstruction(CI));
} else {
LLT Ty = getLLTForType(*CI.getType(), *DL);
- auto FMul = MIRBuilder.buildInstr(TargetOpcode::G_FMUL, {Ty}, {Op0, Op1},
- MachineInstr::copyFlagsFromInstruction(CI));
- MIRBuilder.buildInstr(TargetOpcode::G_FADD, {Dst}, {FMul, Op2},
- MachineInstr::copyFlagsFromInstruction(CI));
+ auto FMul = MIRBuilder.buildFMul(
+ Ty, Op0, Op1, MachineInstr::copyFlagsFromInstruction(CI));
+ MIRBuilder.buildFAdd(Dst, FMul, Op2,
+ MachineInstr::copyFlagsFromInstruction(CI));
}
return true;
}
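A minimal sketch of the unfused fallback taken above when G_FMA is not used (plain C++, not the builder API): the product is rounded before the add, unlike a true fused multiply-add.

#include <cassert>

// Unfused fmuladd fallback: multiply first (G_FMUL), then add (G_FADD).
// A real fused multiply-add would round only once.
static double fmuladdFallback(double A, double B, double C) {
  double Mul = A * B;
  return Mul + C;
}

int main() {
  assert(fmuladdFallback(2.0, 3.0, 1.0) == 7.0);
  return 0;
}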
@@ -1468,7 +1541,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
*MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
MachineMemOperand::MOStore |
MachineMemOperand::MOVolatile,
- PtrTy.getSizeInBits() / 8, 8));
+ PtrTy.getSizeInBits() / 8, Align(8)));
return true;
}
case Intrinsic::stacksave: {
@@ -1508,9 +1581,8 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
: TargetOpcode::G_CTTZ_ZERO_UNDEF
: Cst->isZero() ? TargetOpcode::G_CTLZ
: TargetOpcode::G_CTLZ_ZERO_UNDEF;
- MIRBuilder.buildInstr(Opcode)
- .addDef(getOrCreateVReg(CI))
- .addUse(getOrCreateVReg(*CI.getArgOperand(0)));
+ MIRBuilder.buildInstr(Opcode, {getOrCreateVReg(CI)},
+ {getOrCreateVReg(*CI.getArgOperand(0))});
return true;
}
case Intrinsic::invariant_start: {
@@ -1526,54 +1598,63 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID,
case Intrinsic::sideeffect:
// Discard annotate attributes, assumptions, and artificial side-effects.
return true;
+ case Intrinsic::read_volatile_register:
case Intrinsic::read_register: {
Value *Arg = CI.getArgOperand(0);
- MIRBuilder.buildInstr(TargetOpcode::G_READ_REGISTER)
- .addDef(getOrCreateVReg(CI))
- .addMetadata(cast<MDNode>(cast<MetadataAsValue>(Arg)->getMetadata()));
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_READ_REGISTER, {getOrCreateVReg(CI)}, {})
+ .addMetadata(cast<MDNode>(cast<MetadataAsValue>(Arg)->getMetadata()));
+ return true;
+ }
+ case Intrinsic::write_register: {
+ Value *Arg = CI.getArgOperand(0);
+ MIRBuilder.buildInstr(TargetOpcode::G_WRITE_REGISTER)
+ .addMetadata(cast<MDNode>(cast<MetadataAsValue>(Arg)->getMetadata()))
+ .addUse(getOrCreateVReg(*CI.getArgOperand(1)));
return true;
}
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC:
+#include "llvm/IR/ConstrainedOps.def"
+ return translateConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(CI),
+ MIRBuilder);
+
}
return false;
}
-bool IRTranslator::translateInlineAsm(const CallInst &CI,
+bool IRTranslator::translateInlineAsm(const CallBase &CB,
MachineIRBuilder &MIRBuilder) {
- const InlineAsm &IA = cast<InlineAsm>(*CI.getCalledValue());
- if (!IA.getConstraintString().empty())
- return false;
- unsigned ExtraInfo = 0;
- if (IA.hasSideEffects())
- ExtraInfo |= InlineAsm::Extra_HasSideEffects;
- if (IA.getDialect() == InlineAsm::AD_Intel)
- ExtraInfo |= InlineAsm::Extra_AsmDialect;
+ const InlineAsmLowering *ALI = MF->getSubtarget().getInlineAsmLowering();
- MIRBuilder.buildInstr(TargetOpcode::INLINEASM)
- .addExternalSymbol(IA.getAsmString().c_str())
- .addImm(ExtraInfo);
+ if (!ALI) {
+ LLVM_DEBUG(
+ dbgs() << "Inline asm lowering is not supported for this target yet\n");
+ return false;
+ }
- return true;
+ return ALI->lowerInlineAsm(
+ MIRBuilder, CB, [&](const Value &Val) { return getOrCreateVRegs(Val); });
}
-bool IRTranslator::translateCallSite(const ImmutableCallSite &CS,
+bool IRTranslator::translateCallBase(const CallBase &CB,
MachineIRBuilder &MIRBuilder) {
- const Instruction &I = *CS.getInstruction();
- ArrayRef<Register> Res = getOrCreateVRegs(I);
+ ArrayRef<Register> Res = getOrCreateVRegs(CB);
SmallVector<ArrayRef<Register>, 8> Args;
Register SwiftInVReg = 0;
Register SwiftErrorVReg = 0;
- for (auto &Arg : CS.args()) {
+ for (auto &Arg : CB.args()) {
if (CLI->supportSwiftError() && isSwiftError(Arg)) {
assert(SwiftInVReg == 0 && "Expected only one swift error argument");
LLT Ty = getLLTForType(*Arg->getType(), *DL);
SwiftInVReg = MRI->createGenericVirtualRegister(Ty);
MIRBuilder.buildCopy(SwiftInVReg, SwiftError.getOrCreateVRegUseAt(
- &I, &MIRBuilder.getMBB(), Arg));
+ &CB, &MIRBuilder.getMBB(), Arg));
Args.emplace_back(makeArrayRef(SwiftInVReg));
SwiftErrorVReg =
- SwiftError.getOrCreateVRegDefAt(&I, &MIRBuilder.getMBB(), Arg);
+ SwiftError.getOrCreateVRegDefAt(&CB, &MIRBuilder.getMBB(), Arg);
continue;
}
Args.push_back(getOrCreateVRegs(*Arg));
@@ -1583,8 +1664,8 @@ bool IRTranslator::translateCallSite(const ImmutableCallSite &CS,
// optimize into tail calls. Instead, we defer that to selection where a final
// scan is done to check if any instructions are calls.
bool Success =
- CLI->lowerCall(MIRBuilder, CS, Res, Args, SwiftErrorVReg,
- [&]() { return getOrCreateVReg(*CS.getCalledValue()); });
+ CLI->lowerCall(MIRBuilder, CB, Res, Args, SwiftErrorVReg,
+ [&]() { return getOrCreateVReg(*CB.getCalledOperand()); });
// Check if we just inserted a tail call.
if (Success) {
@@ -1622,7 +1703,7 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
}
if (!F || !F->isIntrinsic() || ID == Intrinsic::not_intrinsic)
- return translateCallSite(&CI, MIRBuilder);
+ return translateCallBase(CI, MIRBuilder);
assert(ID != Intrinsic::not_intrinsic && "unknown intrinsic");
@@ -1670,14 +1751,12 @@ bool IRTranslator::translateCall(const User &U, MachineIRBuilder &MIRBuilder) {
TargetLowering::IntrinsicInfo Info;
// TODO: Add a GlobalISel version of getTgtMemIntrinsic.
if (TLI.getTgtMemIntrinsic(Info, CI, *MF, ID)) {
- MaybeAlign Align = Info.align;
- if (!Align)
- Align = MaybeAlign(
- DL->getABITypeAlignment(Info.memVT.getTypeForEVT(F->getContext())));
+ Align Alignment = Info.align.getValueOr(
+ DL->getABITypeAlign(Info.memVT.getTypeForEVT(F->getContext())));
uint64_t Size = Info.memVT.getStoreSize();
- MIB.addMemOperand(MF->getMachineMemOperand(
- MachinePointerInfo(Info.ptrVal), Info.flags, Size, Align->value()));
+ MIB.addMemOperand(MF->getMachineMemOperand(MachinePointerInfo(Info.ptrVal),
+ Info.flags, Size, Alignment));
}
return true;
@@ -1691,9 +1770,8 @@ bool IRTranslator::translateInvoke(const User &U,
const BasicBlock *ReturnBB = I.getSuccessor(0);
const BasicBlock *EHPadBB = I.getSuccessor(1);
- const Value *Callee = I.getCalledValue();
- const Function *Fn = dyn_cast<Function>(Callee);
- if (isa<InlineAsm>(Callee))
+ const Function *Fn = I.getCalledFunction();
+ if (I.isInlineAsm())
return false;
// FIXME: support invoking patchpoint and statepoint intrinsics.
@@ -1717,7 +1795,7 @@ bool IRTranslator::translateInvoke(const User &U,
MCSymbol *BeginSymbol = Context.createTempSymbol();
MIRBuilder.buildInstr(TargetOpcode::EH_LABEL).addSym(BeginSymbol);
- if (!translateCallSite(&I, MIRBuilder))
+ if (!translateCallBase(I, MIRBuilder))
return false;
MCSymbol *EndSymbol = Context.createTempSymbol();
@@ -1817,12 +1895,7 @@ bool IRTranslator::translateAlloca(const User &U,
return false;
// Now we're in the harder dynamic case.
- Type *Ty = AI.getAllocatedType();
- unsigned Align =
- std::max((unsigned)DL->getPrefTypeAlignment(Ty), AI.getAlignment());
-
Register NumElts = getOrCreateVReg(*AI.getArraySize());
-
Type *IntPtrIRTy = DL->getIntPtrType(AI.getType());
LLT IntPtrTy = getLLTForType(*IntPtrIRTy, *DL);
if (MRI->getType(NumElts) != IntPtrTy) {
@@ -1831,29 +1904,30 @@ bool IRTranslator::translateAlloca(const User &U,
NumElts = ExtElts;
}
+ Type *Ty = AI.getAllocatedType();
+
Register AllocSize = MRI->createGenericVirtualRegister(IntPtrTy);
Register TySize =
getOrCreateVReg(*ConstantInt::get(IntPtrIRTy, DL->getTypeAllocSize(Ty)));
MIRBuilder.buildMul(AllocSize, NumElts, TySize);
- unsigned StackAlign =
- MF->getSubtarget().getFrameLowering()->getStackAlignment();
- if (Align <= StackAlign)
- Align = 0;
-
// Round the size of the allocation up to the stack alignment size
// by adding SA-1 to the size. This doesn't overflow because we're computing
// an address inside an alloca.
- auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign - 1);
+ Align StackAlign = MF->getSubtarget().getFrameLowering()->getStackAlign();
+ auto SAMinusOne = MIRBuilder.buildConstant(IntPtrTy, StackAlign.value() - 1);
auto AllocAdd = MIRBuilder.buildAdd(IntPtrTy, AllocSize, SAMinusOne,
MachineInstr::NoUWrap);
auto AlignCst =
- MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign - 1));
+ MIRBuilder.buildConstant(IntPtrTy, ~(uint64_t)(StackAlign.value() - 1));
auto AlignedAlloc = MIRBuilder.buildAnd(IntPtrTy, AllocAdd, AlignCst);
- MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Align);
+ Align Alignment = std::max(AI.getAlign(), DL->getPrefTypeAlign(Ty));
+ if (Alignment <= StackAlign)
+ Alignment = Align(1);
+ MIRBuilder.buildDynStackAlloc(getOrCreateVReg(AI), AlignedAlloc, Alignment);
- MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, &AI);
+ MF->getFrameInfo().CreateVariableSizedObject(Alignment, &AI);
assert(MF->getFrameInfo().hasVarSizedObjects());
return true;
}
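The rounding above is the usual add-and-mask trick; a self-contained sketch, assuming a power-of-two stack alignment (which LLVM's Align type guarantees):

#include <cassert>
#include <cstdint>

// Round AllocSize up to a multiple of StackAlign by adding StackAlign - 1 and
// masking off the low bits, mirroring the buildAdd/buildAnd sequence above.
static uint64_t roundUpToStackAlign(uint64_t AllocSize, uint64_t StackAlign) {
  assert(StackAlign && (StackAlign & (StackAlign - 1)) == 0 &&
         "stack alignment must be a power of two");
  return (AllocSize + StackAlign - 1) & ~(StackAlign - 1);
}

int main() {
  assert(roundUpToStackAlign(20, 16) == 32);
  assert(roundUpToStackAlign(32, 16) == 32);
  return 0;
}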
@@ -1863,10 +1937,9 @@ bool IRTranslator::translateVAArg(const User &U, MachineIRBuilder &MIRBuilder) {
// we're completely discarding the i64/double distinction here (amongst
// others). Fortunately the ABIs I know of where that matters don't use va_arg
// anyway but that's not guaranteed.
- MIRBuilder.buildInstr(TargetOpcode::G_VAARG)
- .addDef(getOrCreateVReg(U))
- .addUse(getOrCreateVReg(*U.getOperand(0)))
- .addImm(DL->getABITypeAlignment(U.getType()));
+ MIRBuilder.buildInstr(TargetOpcode::G_VAARG, {getOrCreateVReg(U)},
+ {getOrCreateVReg(*U.getOperand(0)),
+ DL->getABITypeAlign(U.getType()).value()});
return true;
}
@@ -1874,17 +1947,8 @@ bool IRTranslator::translateInsertElement(const User &U,
MachineIRBuilder &MIRBuilder) {
// If it is a <1 x Ty> vector, use the scalar as it is
// not a legal vector type in LLT.
- if (U.getType()->getVectorNumElements() == 1) {
- Register Elt = getOrCreateVReg(*U.getOperand(1));
- auto &Regs = *VMap.getVRegs(U);
- if (Regs.empty()) {
- Regs.push_back(Elt);
- VMap.getOffsets(U)->push_back(0);
- } else {
- MIRBuilder.buildCopy(Regs[0], Elt);
- }
- return true;
- }
+ if (cast<FixedVectorType>(U.getType())->getNumElements() == 1)
+ return translateCopy(U, *U.getOperand(1), MIRBuilder);
Register Res = getOrCreateVReg(U);
Register Val = getOrCreateVReg(*U.getOperand(0));
@@ -1898,17 +1962,9 @@ bool IRTranslator::translateExtractElement(const User &U,
MachineIRBuilder &MIRBuilder) {
// If it is a <1 x Ty> vector, use the scalar as it is
// not a legal vector type in LLT.
- if (U.getOperand(0)->getType()->getVectorNumElements() == 1) {
- Register Elt = getOrCreateVReg(*U.getOperand(0));
- auto &Regs = *VMap.getVRegs(U);
- if (Regs.empty()) {
- Regs.push_back(Elt);
- VMap.getOffsets(U)->push_back(0);
- } else {
- MIRBuilder.buildCopy(Regs[0], Elt);
- }
- return true;
- }
+ if (cast<FixedVectorType>(U.getOperand(0)->getType())->getNumElements() == 1)
+ return translateCopy(U, *U.getOperand(0), MIRBuilder);
+
Register Res = getOrCreateVReg(U);
Register Val = getOrCreateVReg(*U.getOperand(0));
const auto &TLI = *MF->getSubtarget().getTargetLowering();
@@ -1924,8 +1980,8 @@ bool IRTranslator::translateExtractElement(const User &U,
if (!Idx)
Idx = getOrCreateVReg(*U.getOperand(1));
if (MRI->getType(Idx).getSizeInBits() != PreferredVecIdxWidth) {
- const LLT &VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
- Idx = MIRBuilder.buildSExtOrTrunc(VecIdxTy, Idx)->getOperand(0).getReg();
+ const LLT VecIdxTy = LLT::scalar(PreferredVecIdxWidth);
+ Idx = MIRBuilder.buildSExtOrTrunc(VecIdxTy, Idx).getReg(0);
}
MIRBuilder.buildExtractVectorElement(Res, Val, Idx);
return true;
@@ -1933,13 +1989,16 @@ bool IRTranslator::translateExtractElement(const User &U,
bool IRTranslator::translateShuffleVector(const User &U,
MachineIRBuilder &MIRBuilder) {
- SmallVector<int, 8> Mask;
- ShuffleVectorInst::getShuffleMask(cast<Constant>(U.getOperand(2)), Mask);
+ ArrayRef<int> Mask;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&U))
+ Mask = SVI->getShuffleMask();
+ else
+ Mask = cast<ConstantExpr>(U).getShuffleMask();
ArrayRef<int> MaskAlloc = MF->allocateShuffleMask(Mask);
- MIRBuilder.buildInstr(TargetOpcode::G_SHUFFLE_VECTOR)
- .addDef(getOrCreateVReg(U))
- .addUse(getOrCreateVReg(*U.getOperand(0)))
- .addUse(getOrCreateVReg(*U.getOperand(1)))
+ MIRBuilder
+ .buildInstr(TargetOpcode::G_SHUFFLE_VECTOR, {getOrCreateVReg(U)},
+ {getOrCreateVReg(*U.getOperand(0)),
+ getOrCreateVReg(*U.getOperand(1))})
.addShuffleMask(MaskAlloc);
return true;
}
@@ -1961,12 +2020,8 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
MachineIRBuilder &MIRBuilder) {
const AtomicCmpXchgInst &I = cast<AtomicCmpXchgInst>(U);
- if (I.isWeak())
- return false;
-
- auto Flags = I.isVolatile() ? MachineMemOperand::MOVolatile
- : MachineMemOperand::MONone;
- Flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
Type *ResType = I.getType();
Type *ValType = ResType->Type::getStructElementType(0);
@@ -1983,21 +2038,18 @@ bool IRTranslator::translateAtomicCmpXchg(const User &U,
MIRBuilder.buildAtomicCmpXchgWithSuccess(
OldValRes, SuccessRes, Addr, Cmp, NewVal,
- *MF->getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
- Flags, DL->getTypeStoreSize(ValType),
- getMemOpAlignment(I), AAMetadata, nullptr,
- I.getSyncScopeID(), I.getSuccessOrdering(),
- I.getFailureOrdering()));
+ *MF->getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags,
+ DL->getTypeStoreSize(ValType), getMemOpAlign(I), AAMetadata, nullptr,
+ I.getSyncScopeID(), I.getSuccessOrdering(), I.getFailureOrdering()));
return true;
}
bool IRTranslator::translateAtomicRMW(const User &U,
MachineIRBuilder &MIRBuilder) {
const AtomicRMWInst &I = cast<AtomicRMWInst>(U);
-
- auto Flags = I.isVolatile() ? MachineMemOperand::MOVolatile
- : MachineMemOperand::MONone;
- Flags |= MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, *DL);
Type *ResType = I.getType();
@@ -2057,8 +2109,8 @@ bool IRTranslator::translateAtomicRMW(const User &U,
Opcode, Res, Addr, Val,
*MF->getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
Flags, DL->getTypeStoreSize(ResType),
- getMemOpAlignment(I), AAMetadata,
- nullptr, I.getSyncScopeID(), I.getOrdering()));
+ getMemOpAlign(I), AAMetadata, nullptr,
+ I.getSyncScopeID(), I.getOrdering()));
return true;
}
@@ -2070,6 +2122,21 @@ bool IRTranslator::translateFence(const User &U,
return true;
}
+bool IRTranslator::translateFreeze(const User &U,
+ MachineIRBuilder &MIRBuilder) {
+ const ArrayRef<Register> DstRegs = getOrCreateVRegs(U);
+ const ArrayRef<Register> SrcRegs = getOrCreateVRegs(*U.getOperand(0));
+
+ assert(DstRegs.size() == SrcRegs.size() &&
+ "Freeze with different source and destination type?");
+
+ for (unsigned I = 0; I < DstRegs.size(); ++I) {
+ MIRBuilder.buildFreeze(DstRegs[I], SrcRegs[I]);
+ }
+
+ return true;
+}
+
void IRTranslator::finishPendingPhis() {
#ifndef NDEBUG
DILocationVerifier Verifier;
@@ -2122,6 +2189,10 @@ bool IRTranslator::translate(const Instruction &Inst) {
else
EntryBuilder->setDebugLoc(DebugLoc());
+ auto &TLI = *MF->getSubtarget().getTargetLowering();
+ if (TLI.fallBackToDAGISel(Inst))
+ return false;
+
switch (Inst.getOpcode()) {
#define HANDLE_INST(NUM, OPCODE, CLASS) \
case Instruction::OPCODE: \
@@ -2139,22 +2210,16 @@ bool IRTranslator::translate(const Constant &C, Register Reg) {
EntryBuilder->buildFConstant(Reg, *CF);
else if (isa<UndefValue>(C))
EntryBuilder->buildUndef(Reg);
- else if (isa<ConstantPointerNull>(C)) {
- // As we are trying to build a constant val of 0 into a pointer,
- // insert a cast to make them correct with respect to types.
- unsigned NullSize = DL->getTypeSizeInBits(C.getType());
- auto *ZeroTy = Type::getIntNTy(C.getContext(), NullSize);
- auto *ZeroVal = ConstantInt::get(ZeroTy, 0);
- Register ZeroReg = getOrCreateVReg(*ZeroVal);
- EntryBuilder->buildCast(Reg, ZeroReg);
- } else if (auto GV = dyn_cast<GlobalValue>(&C))
+ else if (isa<ConstantPointerNull>(C))
+ EntryBuilder->buildConstant(Reg, 0);
+ else if (auto GV = dyn_cast<GlobalValue>(&C))
EntryBuilder->buildGlobalValue(Reg, GV);
else if (auto CAZ = dyn_cast<ConstantAggregateZero>(&C)) {
if (!CAZ->getType()->isVectorTy())
return false;
// Return the scalar if it is a <1 x Ty> vector.
if (CAZ->getNumElements() == 1)
- return translate(*CAZ->getElementValue(0u), Reg);
+ return translateCopy(C, *CAZ->getElementValue(0u), *EntryBuilder.get());
SmallVector<Register, 4> Ops;
for (unsigned i = 0; i < CAZ->getNumElements(); ++i) {
Constant &Elt = *CAZ->getElementValue(i);
@@ -2164,7 +2229,8 @@ bool IRTranslator::translate(const Constant &C, Register Reg) {
} else if (auto CV = dyn_cast<ConstantDataVector>(&C)) {
// Return the scalar if it is a <1 x Ty> vector.
if (CV->getNumElements() == 1)
- return translate(*CV->getElementAsConstant(0), Reg);
+ return translateCopy(C, *CV->getElementAsConstant(0),
+ *EntryBuilder.get());
SmallVector<Register, 4> Ops;
for (unsigned i = 0; i < CV->getNumElements(); ++i) {
Constant &Elt = *CV->getElementAsConstant(i);
@@ -2182,7 +2248,7 @@ bool IRTranslator::translate(const Constant &C, Register Reg) {
}
} else if (auto CV = dyn_cast<ConstantVector>(&C)) {
if (CV->getNumOperands() == 1)
- return translate(*CV->getOperand(0), Reg);
+ return translateCopy(C, *CV->getOperand(0), *EntryBuilder.get());
SmallVector<Register, 4> Ops;
for (unsigned i = 0; i < CV->getNumOperands(); ++i) {
Ops.push_back(getOrCreateVReg(*CV->getOperand(i)));
@@ -2319,10 +2385,18 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
// Make our arguments/constants entry block fallthrough to the IR entry block.
EntryBB->addSuccessor(&getMBB(F.front()));
+ if (CLI->fallBackToDAGISel(F)) {
+ OptimizationRemarkMissed R("gisel-irtranslator", "GISelFailure",
+ F.getSubprogram(), &F.getEntryBlock());
+ R << "unable to lower function: " << ore::NV("Prototype", F.getType());
+ reportTranslationError(*MF, *TPC, *ORE, R);
+ return false;
+ }
+
// Lower the actual args into this basic block.
SmallVector<ArrayRef<Register>, 8> VRegArgs;
for (const Argument &Arg: F.args()) {
- if (DL->getTypeStoreSize(Arg.getType()) == 0)
+ if (DL->getTypeStoreSize(Arg.getType()).isZero())
continue; // Don't handle zero sized types.
ArrayRef<Register> VRegs = getOrCreateVRegs(Arg);
VRegArgs.push_back(VRegs);
@@ -2352,6 +2426,7 @@ bool IRTranslator::runOnMachineFunction(MachineFunction &CurMF) {
WrapperObserver.addObserver(&Verifier);
#endif // ifndef NDEBUG
RAIIDelegateInstaller DelInstall(*MF, &WrapperObserver);
+ RAIIMFObserverInstaller ObsInstall(*MF, WrapperObserver);
for (const BasicBlock *BB : RPOT) {
MachineBasicBlock &MBB = getMBB(*BB);
// Set the insertion point of all the following translations to
diff --git a/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
new file mode 100644
index 000000000000..2ce1d414e755
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/InlineAsmLowering.cpp
@@ -0,0 +1,667 @@
+//===-- lib/CodeGen/GlobalISel/InlineAsmLowering.cpp ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the lowering from LLVM IR inline asm to MIR INLINEASM
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
+#include "llvm/CodeGen/Analysis.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/Utils.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "inline-asm-lowering"
+
+using namespace llvm;
+
+void InlineAsmLowering::anchor() {}
+
+namespace {
+
+/// GISelAsmOperandInfo - This contains information for each constraint that we
+/// are lowering.
+class GISelAsmOperandInfo : public TargetLowering::AsmOperandInfo {
+public:
+ /// Regs - If this is a register or register class operand, this
+ /// contains the set of assigned registers corresponding to the operand.
+ SmallVector<Register, 1> Regs;
+
+ explicit GISelAsmOperandInfo(const TargetLowering::AsmOperandInfo &Info)
+ : TargetLowering::AsmOperandInfo(Info) {}
+};
+
+using GISelAsmOperandInfoVector = SmallVector<GISelAsmOperandInfo, 16>;
+
+class ExtraFlags {
+ unsigned Flags = 0;
+
+public:
+ explicit ExtraFlags(const CallBase &CB) {
+ const InlineAsm *IA = cast<InlineAsm>(CB.getCalledOperand());
+ if (IA->hasSideEffects())
+ Flags |= InlineAsm::Extra_HasSideEffects;
+ if (IA->isAlignStack())
+ Flags |= InlineAsm::Extra_IsAlignStack;
+ if (CB.isConvergent())
+ Flags |= InlineAsm::Extra_IsConvergent;
+ Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
+ }
+
+ void update(const TargetLowering::AsmOperandInfo &OpInfo) {
+ // Ideally, we would only check against memory constraints. However, the
+ // meaning of an Other constraint can be target-specific and we can't easily
+ // reason about it. Therefore, be conservative and set MayLoad/MayStore
+ // for Other constraints as well.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+ if (OpInfo.Type == InlineAsm::isInput)
+ Flags |= InlineAsm::Extra_MayLoad;
+ else if (OpInfo.Type == InlineAsm::isOutput)
+ Flags |= InlineAsm::Extra_MayStore;
+ else if (OpInfo.Type == InlineAsm::isClobber)
+ Flags |= (InlineAsm::Extra_MayLoad | InlineAsm::Extra_MayStore);
+ }
+ }
+
+ unsigned get() const { return Flags; }
+};
+
+} // namespace
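To illustrate how ExtraFlags composes the extra-info word, a hedged sketch with hypothetical bit values; the real constants are the InlineAsm::Extra_* enumerators and their exact values are not assumed here.

#include <cassert>

// Hypothetical bit assignments, only to show the composition pattern used by
// ExtraFlags above.
enum : unsigned {
  HasSideEffects = 1u << 0,
  IsAlignStack = 1u << 1,
  MayLoad = 1u << 2,
  MayStore = 1u << 3,
};

int main() {
  unsigned Flags = 0;
  bool SideEffects = true, HasMemInput = true, HasMemOutput = false;
  if (SideEffects)
    Flags |= HasSideEffects;
  if (HasMemInput)
    Flags |= MayLoad;  // input with a memory/other constraint
  if (HasMemOutput)
    Flags |= MayStore; // output with a memory/other constraint
  assert(Flags == (HasSideEffects | MayLoad));
  return 0;
}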
+
+/// Assign virtual/physical registers for the specified register operand.
+static void getRegistersForValue(MachineFunction &MF,
+ MachineIRBuilder &MIRBuilder,
+ GISelAsmOperandInfo &OpInfo,
+ GISelAsmOperandInfo &RefOpInfo) {
+
+ const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+
+ // No work to do for memory operations.
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory)
+ return;
+
+ // If this is a constraint for a single physreg, or a constraint for a
+ // register class, find it.
+ Register AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = TLI.getRegForInlineAsmConstraint(
+ &TRI, RefOpInfo.ConstraintCode, RefOpInfo.ConstraintVT);
+ // RC is unset only on failure. Return immediately.
+ if (!RC)
+ return;
+
+ // No need to allocate a matching input constraint since the constraint it's
+ // matching to has already been allocated.
+ if (OpInfo.isMatchingInputConstraint())
+ return;
+
+ // Initialize NumRegs.
+ unsigned NumRegs = 1;
+ if (OpInfo.ConstraintVT != MVT::Other)
+ NumRegs =
+ TLI.getNumRegisters(MF.getFunction().getContext(), OpInfo.ConstraintVT);
+
+ // If this is a constraint for a specific physical register, but the type of
+ // the operand requires more than one register to be passed, we allocate the
+ // required amount of physical registers, starting from the selected physical
+ // register.
+ // For this, first retrieve a register iterator for the given register class
+ TargetRegisterClass::iterator I = RC->begin();
+ MachineRegisterInfo &RegInfo = MF.getRegInfo();
+
+ // Advance the iterator to the assigned register (if set)
+ if (AssignedReg) {
+ for (; *I != AssignedReg; ++I)
+ assert(I != RC->end() && "AssignedReg should be a member of provided RC");
+ }
+
+ // Finally, assign the registers. If the AssignedReg isn't set, create virtual
+ // registers with the provided register class
+ for (; NumRegs; --NumRegs, ++I) {
+ assert(I != RC->end() && "Ran out of registers to allocate!");
+ Register R = AssignedReg ? Register(*I) : RegInfo.createVirtualRegister(RC);
+ OpInfo.Regs.push_back(R);
+ }
+}
+
+/// Return an integer indicating how general CT is.
+static unsigned getConstraintGenerality(TargetLowering::ConstraintType CT) {
+ switch (CT) {
+ case TargetLowering::C_Immediate:
+ case TargetLowering::C_Other:
+ case TargetLowering::C_Unknown:
+ return 0;
+ case TargetLowering::C_Register:
+ return 1;
+ case TargetLowering::C_RegisterClass:
+ return 2;
+ case TargetLowering::C_Memory:
+ return 3;
+ }
+ llvm_unreachable("Invalid constraint type");
+}
+
+static void chooseConstraint(TargetLowering::AsmOperandInfo &OpInfo,
+ const TargetLowering *TLI) {
+ assert(OpInfo.Codes.size() > 1 && "Doesn't have multiple constraint options");
+ unsigned BestIdx = 0;
+ TargetLowering::ConstraintType BestType = TargetLowering::C_Unknown;
+ int BestGenerality = -1;
+
+ // Loop over the options, keeping track of the most general one.
+ for (unsigned i = 0, e = OpInfo.Codes.size(); i != e; ++i) {
+ TargetLowering::ConstraintType CType =
+ TLI->getConstraintType(OpInfo.Codes[i]);
+
+ // Indirect 'other' or 'immediate' constraints are not allowed.
+ if (OpInfo.isIndirect && !(CType == TargetLowering::C_Memory ||
+ CType == TargetLowering::C_Register ||
+ CType == TargetLowering::C_RegisterClass))
+ continue;
+
+ // If this is an 'other' or 'immediate' constraint, see if the operand is
+ // valid for it. For example, on X86 we might have an 'rI' constraint. If
+ // the operand is an integer in the range [0..31] we want to use I (saving a
+ // load of a register), otherwise we must use 'r'.
+ if (CType == TargetLowering::C_Other ||
+ CType == TargetLowering::C_Immediate) {
+ assert(OpInfo.Codes[i].size() == 1 &&
+ "Unhandled multi-letter 'other' constraint");
+ // FIXME: prefer immediate constraints if the target allows it
+ }
+
+ // Things with matching constraints can only be registers, per gcc
+ // documentation. This mainly affects "g" constraints.
+ if (CType == TargetLowering::C_Memory && OpInfo.hasMatchingInput())
+ continue;
+
+ // This constraint letter is more general than the previous one, use it.
+ int Generality = getConstraintGenerality(CType);
+ if (Generality > BestGenerality) {
+ BestType = CType;
+ BestIdx = i;
+ BestGenerality = Generality;
+ }
+ }
+
+ OpInfo.ConstraintCode = OpInfo.Codes[BestIdx];
+ OpInfo.ConstraintType = BestType;
+}
+
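A standalone rendering of the "pick the most general constraint" rule implemented by getConstraintGenerality/chooseConstraint above; the letter-to-generality mapping here is an assumption for a few common letters, not the target hook.

#include <cassert>
#include <string>

// Assumed mapping for common letters; the real code asks
// TargetLowering::getConstraintType instead.
static unsigned generality(char C) {
  switch (C) {
  case 'i': return 0; // immediate / other
  case 'r': return 2; // register class
  case 'm': return 3; // memory
  default:  return 0;
  }
}

// Pick the most general option, as chooseConstraint does for multi-option
// constraint strings such as "ri" or "rm".
static char choose(const std::string &Codes) {
  char Best = Codes[0];
  for (char C : Codes)
    if (generality(C) > generality(Best))
      Best = C;
  return Best;
}

int main() {
  assert(choose("ri") == 'r');
  assert(choose("rm") == 'm');
  return 0;
}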
+static void computeConstraintToUse(const TargetLowering *TLI,
+ TargetLowering::AsmOperandInfo &OpInfo) {
+ assert(!OpInfo.Codes.empty() && "Must have at least one constraint");
+
+ // Single-letter constraints ('r') are very common.
+ if (OpInfo.Codes.size() == 1) {
+ OpInfo.ConstraintCode = OpInfo.Codes[0];
+ OpInfo.ConstraintType = TLI->getConstraintType(OpInfo.ConstraintCode);
+ } else {
+ chooseConstraint(OpInfo, TLI);
+ }
+
+ // 'X' matches anything.
+ if (OpInfo.ConstraintCode == "X" && OpInfo.CallOperandVal) {
+ // Labels and constants are handled elsewhere ('X' is the only thing
+ // that matches labels). For Functions, the type here is the type of
+ // the result, which is not what we want to look at; leave them alone.
+ Value *Val = OpInfo.CallOperandVal;
+ if (isa<BasicBlock>(Val) || isa<ConstantInt>(Val) || isa<Function>(Val))
+ return;
+
+ // Otherwise, try to resolve it to something we know about by looking at
+ // the actual operand type.
+ if (const char *Repl = TLI->LowerXConstraint(OpInfo.ConstraintVT)) {
+ OpInfo.ConstraintCode = Repl;
+ OpInfo.ConstraintType = TLI->getConstraintType(OpInfo.ConstraintCode);
+ }
+ }
+}
+
+static unsigned getNumOpRegs(const MachineInstr &I, unsigned OpIdx) {
+ unsigned Flag = I.getOperand(OpIdx).getImm();
+ return InlineAsm::getNumOperandRegisters(Flag);
+}
+
+static bool buildAnyextOrCopy(Register Dst, Register Src,
+ MachineIRBuilder &MIRBuilder) {
+ const TargetRegisterInfo *TRI =
+ MIRBuilder.getMF().getSubtarget().getRegisterInfo();
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+
+ auto SrcTy = MRI->getType(Src);
+ if (!SrcTy.isValid()) {
+ LLVM_DEBUG(dbgs() << "Source type for copy is not valid\n");
+ return false;
+ }
+ unsigned SrcSize = TRI->getRegSizeInBits(Src, *MRI);
+ unsigned DstSize = TRI->getRegSizeInBits(Dst, *MRI);
+
+ if (DstSize < SrcSize) {
+ LLVM_DEBUG(dbgs() << "Input can't fit in destination reg class\n");
+ return false;
+ }
+
+ // Attempt to anyext small scalar sources.
+ if (DstSize > SrcSize) {
+ if (!SrcTy.isScalar()) {
+ LLVM_DEBUG(dbgs() << "Can't extend non-scalar input to size of "
+ "destination register class\n");
+ return false;
+ }
+ Src = MIRBuilder.buildAnyExt(LLT::scalar(DstSize), Src).getReg(0);
+ }
+
+ MIRBuilder.buildCopy(Dst, Src);
+ return true;
+}
+
+bool InlineAsmLowering::lowerInlineAsm(
+ MachineIRBuilder &MIRBuilder, const CallBase &Call,
+ std::function<ArrayRef<Register>(const Value &Val)> GetOrCreateVRegs)
+ const {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
+
+ /// ConstraintOperands - Information about all of the constraints.
+ GISelAsmOperandInfoVector ConstraintOperands;
+
+ MachineFunction &MF = MIRBuilder.getMF();
+ const Function &F = MF.getFunction();
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+
+ MachineRegisterInfo *MRI = MIRBuilder.getMRI();
+
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(DL, TRI, Call);
+
+ ExtraFlags ExtraInfo(Call);
+ unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
+ unsigned ResNo = 0; // ResNo - The result number of the next output.
+ for (auto &T : TargetConstraints) {
+ ConstraintOperands.push_back(GISelAsmOperandInfo(T));
+ GISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
+
+ // Compute the value type for each operand.
+ if (OpInfo.Type == InlineAsm::isInput ||
+ (OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
+
+ OpInfo.CallOperandVal = const_cast<Value *>(Call.getArgOperand(ArgNo++));
+
+ if (isa<BasicBlock>(OpInfo.CallOperandVal)) {
+ LLVM_DEBUG(dbgs() << "Basic block input operands not supported yet\n");
+ return false;
+ }
+
+ Type *OpTy = OpInfo.CallOperandVal->getType();
+
+ // If this is an indirect operand, the operand is a pointer to the
+ // accessed type.
+ if (OpInfo.isIndirect) {
+ PointerType *PtrTy = dyn_cast<PointerType>(OpTy);
+ if (!PtrTy)
+ report_fatal_error("Indirect operand for inline asm not a pointer!");
+ OpTy = PtrTy->getElementType();
+ }
+
+ // FIXME: Support aggregate input operands
+ if (!OpTy->isSingleValueType()) {
+ LLVM_DEBUG(
+ dbgs() << "Aggregate input operands are not supported yet\n");
+ return false;
+ }
+
+ OpInfo.ConstraintVT = TLI->getValueType(DL, OpTy, true).getSimpleVT();
+
+ } else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(Call.getType())) {
+ OpInfo.ConstraintVT =
+ TLI->getSimpleValueType(DL, STy->getElementType(ResNo));
+ } else {
+ assert(ResNo == 0 && "Asm only has one result!");
+ OpInfo.ConstraintVT = TLI->getSimpleValueType(DL, Call.getType());
+ }
+ ++ResNo;
+ } else {
+ OpInfo.ConstraintVT = MVT::Other;
+ }
+
+ // Compute the constraint code and ConstraintType to use.
+ computeConstraintToUse(TLI, OpInfo);
+
+ // The selected constraint type might expose new sideeffects
+ ExtraInfo.update(OpInfo);
+ }
+
+ // At this point, all operand types are decided.
+ // Create the MachineInstr, but don't insert it yet, since lowering the
+ // input operands may still need to insert instructions before this one.
+ auto Inst = MIRBuilder.buildInstrNoInsert(TargetOpcode::INLINEASM)
+ .addExternalSymbol(IA->getAsmString().c_str())
+ .addImm(ExtraInfo.get());
+
+ // Starting from this operand: flag followed by register(s) will be added as
+ // operands to Inst for each constraint. Used for matching input constraints.
+ unsigned StartIdx = Inst->getNumOperands();
+
+ // Collects the output operands for later processing
+ GISelAsmOperandInfoVector OutputOperands;
+
+ for (auto &OpInfo : ConstraintOperands) {
+ GISelAsmOperandInfo &RefOpInfo =
+ OpInfo.isMatchingInputConstraint()
+ ? ConstraintOperands[OpInfo.getMatchedOperand()]
+ : OpInfo;
+
+ // Assign registers for register operands
+ getRegistersForValue(MF, MIRBuilder, OpInfo, RefOpInfo);
+
+ switch (OpInfo.Type) {
+ case InlineAsm::isOutput:
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+ unsigned ConstraintID =
+ TLI->getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ assert(ConstraintID != InlineAsm::Constraint_Unknown &&
+ "Failed to convert memory constraint code to constraint id.");
+
+ // Add information to the INLINEASM instruction to know about this
+ // output.
+ unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
+ Inst.addImm(OpFlags);
+ ArrayRef<Register> SourceRegs =
+ GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(
+ SourceRegs.size() == 1 &&
+ "Expected the memory output to fit into a single virtual register");
+ Inst.addReg(SourceRegs[0]);
+ } else {
+ // Otherwise, this outputs to a register (directly for C_Register /
+ // C_RegisterClass. Find a register that we can use.
+ assert(OpInfo.ConstraintType == TargetLowering::C_Register ||
+ OpInfo.ConstraintType == TargetLowering::C_RegisterClass);
+
+ if (OpInfo.Regs.empty()) {
+ LLVM_DEBUG(dbgs()
+ << "Couldn't allocate output register for constraint\n");
+ return false;
+ }
+
+ // Add information to the INLINEASM instruction to know that this
+ // register is set.
+ unsigned Flag = InlineAsm::getFlagWord(
+ OpInfo.isEarlyClobber ? InlineAsm::Kind_RegDefEarlyClobber
+ : InlineAsm::Kind_RegDef,
+ OpInfo.Regs.size());
+ if (OpInfo.Regs.front().isVirtual()) {
+ // Put the register class of the virtual registers in the flag word.
+ // That way, later passes can recompute register class constraints for
+ // inline assembly as well as normal instructions. Don't do this for
+ // tied operands that can use the regclass information from the def.
+ const TargetRegisterClass *RC = MRI->getRegClass(OpInfo.Regs.front());
+ Flag = InlineAsm::getFlagWordForRegClass(Flag, RC->getID());
+ }
+
+ Inst.addImm(Flag);
+
+ for (Register Reg : OpInfo.Regs) {
+ Inst.addReg(Reg,
+ RegState::Define | getImplRegState(Reg.isPhysical()) |
+ (OpInfo.isEarlyClobber ? RegState::EarlyClobber : 0));
+ }
+
+ // Remember this output operand for later processing
+ OutputOperands.push_back(OpInfo);
+ }
+
+ break;
+ case InlineAsm::isInput: {
+ if (OpInfo.isMatchingInputConstraint()) {
+ unsigned DefIdx = OpInfo.getMatchedOperand();
+ // Find operand with register def that corresponds to DefIdx.
+ unsigned InstFlagIdx = StartIdx;
+ for (unsigned i = 0; i < DefIdx; ++i)
+ InstFlagIdx += getNumOpRegs(*Inst, InstFlagIdx) + 1;
+ assert(getNumOpRegs(*Inst, InstFlagIdx) == 1 && "Wrong flag");
+
+ unsigned MatchedOperandFlag = Inst->getOperand(InstFlagIdx).getImm();
+ if (InlineAsm::isMemKind(MatchedOperandFlag)) {
+ LLVM_DEBUG(dbgs() << "Matching input constraint to mem operand not "
+ "supported. This should be target specific.\n");
+ return false;
+ }
+ if (!InlineAsm::isRegDefKind(MatchedOperandFlag) &&
+ !InlineAsm::isRegDefEarlyClobberKind(MatchedOperandFlag)) {
+ LLVM_DEBUG(dbgs() << "Unknown matching constraint\n");
+ return false;
+ }
+
+ // We want to tie input to register in next operand.
+ unsigned DefRegIdx = InstFlagIdx + 1;
+ Register Def = Inst->getOperand(DefRegIdx).getReg();
+
+ // Copy input to new vreg with same reg class as Def
+ const TargetRegisterClass *RC = MRI->getRegClass(Def);
+ ArrayRef<Register> SrcRegs = GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(SrcRegs.size() == 1 && "Single register is expected here");
+ Register Tmp = MRI->createVirtualRegister(RC);
+ if (!buildAnyextOrCopy(Tmp, SrcRegs[0], MIRBuilder))
+ return false;
+
+ // Add Flag and input register operand (Tmp) to Inst. Tie Tmp to Def.
+ unsigned UseFlag = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, 1);
+ unsigned Flag = InlineAsm::getFlagWordForMatchingOp(UseFlag, DefIdx);
+ Inst.addImm(Flag);
+ Inst.addReg(Tmp);
+ Inst->tieOperands(DefRegIdx, Inst->getNumOperands() - 1);
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Other &&
+ OpInfo.isIndirect) {
+ LLVM_DEBUG(dbgs() << "Indirect input operands with unknown constraint "
+ "not supported yet\n");
+ return false;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Immediate ||
+ OpInfo.ConstraintType == TargetLowering::C_Other) {
+
+ std::vector<MachineOperand> Ops;
+ if (!lowerAsmOperandForConstraint(OpInfo.CallOperandVal,
+ OpInfo.ConstraintCode, Ops,
+ MIRBuilder)) {
+ LLVM_DEBUG(dbgs() << "Don't support constraint: "
+ << OpInfo.ConstraintCode << " yet\n");
+ return false;
+ }
+
+ assert(Ops.size() > 0 &&
+ "Expected constraint to be lowered to at least one operand");
+
+ // Add information to the INLINEASM node to know about this input.
+ unsigned OpFlags =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Imm, Ops.size());
+ Inst.addImm(OpFlags);
+ Inst.add(Ops);
+ break;
+ }
+
+ if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
+
+ if (!OpInfo.isIndirect) {
+ LLVM_DEBUG(dbgs()
+ << "Cannot indirectify memory input operands yet\n");
+ return false;
+ }
+
+ assert(OpInfo.isIndirect && "Operand must be indirect to be a mem!");
+
+ unsigned ConstraintID =
+ TLI->getInlineAsmMemConstraint(OpInfo.ConstraintCode);
+ unsigned OpFlags = InlineAsm::getFlagWord(InlineAsm::Kind_Mem, 1);
+ OpFlags = InlineAsm::getFlagWordForMem(OpFlags, ConstraintID);
+ Inst.addImm(OpFlags);
+ ArrayRef<Register> SourceRegs =
+ GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(
+ SourceRegs.size() == 1 &&
+ "Expected the memory input to fit into a single virtual register");
+ Inst.addReg(SourceRegs[0]);
+ break;
+ }
+
+ assert((OpInfo.ConstraintType == TargetLowering::C_RegisterClass ||
+ OpInfo.ConstraintType == TargetLowering::C_Register) &&
+ "Unknown constraint type!");
+
+ if (OpInfo.isIndirect) {
+ LLVM_DEBUG(dbgs() << "Can't handle indirect register inputs yet "
+ "for constraint '"
+ << OpInfo.ConstraintCode << "'\n");
+ return false;
+ }
+
+ // Copy the input into the appropriate registers.
+ if (OpInfo.Regs.empty()) {
+ LLVM_DEBUG(
+ dbgs()
+ << "Couldn't allocate input register for register constraint\n");
+ return false;
+ }
+
+ unsigned NumRegs = OpInfo.Regs.size();
+ ArrayRef<Register> SourceRegs = GetOrCreateVRegs(*OpInfo.CallOperandVal);
+ assert(NumRegs == SourceRegs.size() &&
+ "Expected the number of input registers to match the number of "
+ "source registers");
+
+ if (NumRegs > 1) {
+ LLVM_DEBUG(dbgs() << "Input operands with multiple input registers are "
+ "not supported yet\n");
+ return false;
+ }
+
+ unsigned Flag = InlineAsm::getFlagWord(InlineAsm::Kind_RegUse, NumRegs);
+ Inst.addImm(Flag);
+ if (!buildAnyextOrCopy(OpInfo.Regs[0], SourceRegs[0], MIRBuilder))
+ return false;
+ Inst.addReg(OpInfo.Regs[0]);
+ break;
+ }
+
+ case InlineAsm::isClobber: {
+
+ unsigned NumRegs = OpInfo.Regs.size();
+ if (NumRegs > 0) {
+ unsigned Flag =
+ InlineAsm::getFlagWord(InlineAsm::Kind_Clobber, NumRegs);
+ Inst.addImm(Flag);
+
+ for (Register Reg : OpInfo.Regs) {
+ Inst.addReg(Reg, RegState::Define | RegState::EarlyClobber |
+ getImplRegState(Reg.isPhysical()));
+ }
+ }
+ break;
+ }
+ }
+ }
+
+ if (const MDNode *SrcLoc = Call.getMetadata("srcloc"))
+ Inst.addMetadata(SrcLoc);
+
+ // All inputs are handled, insert the instruction now
+ MIRBuilder.insertInstr(Inst);
+
+ // Finally, copy the output operands into the output registers
+ ArrayRef<Register> ResRegs = GetOrCreateVRegs(Call);
+ if (ResRegs.size() != OutputOperands.size()) {
+ LLVM_DEBUG(dbgs() << "Expected the number of output registers to match the "
+ "number of destination registers\n");
+ return false;
+ }
+ for (unsigned int i = 0, e = ResRegs.size(); i < e; i++) {
+ GISelAsmOperandInfo &OpInfo = OutputOperands[i];
+
+ if (OpInfo.Regs.empty())
+ continue;
+
+ switch (OpInfo.ConstraintType) {
+ case TargetLowering::C_Register:
+ case TargetLowering::C_RegisterClass: {
+ if (OpInfo.Regs.size() > 1) {
+ LLVM_DEBUG(dbgs() << "Output operands with multiple defining "
+ "registers are not supported yet\n");
+ return false;
+ }
+
+ Register SrcReg = OpInfo.Regs[0];
+ unsigned SrcSize = TRI->getRegSizeInBits(SrcReg, *MRI);
+ if (MRI->getType(ResRegs[i]).getSizeInBits() < SrcSize) {
+ // First copy the non-typed virtual register into a generic virtual
+ // register
+ Register Tmp1Reg =
+ MRI->createGenericVirtualRegister(LLT::scalar(SrcSize));
+ MIRBuilder.buildCopy(Tmp1Reg, SrcReg);
+ // Need to truncate the result of the register
+ MIRBuilder.buildTrunc(ResRegs[i], Tmp1Reg);
+ } else {
+ MIRBuilder.buildCopy(ResRegs[i], SrcReg);
+ }
+ break;
+ }
+ case TargetLowering::C_Immediate:
+ case TargetLowering::C_Other:
+ LLVM_DEBUG(
+ dbgs() << "Cannot lower target specific output constraints yet\n");
+ return false;
+ case TargetLowering::C_Memory:
+ break; // Already handled.
+ case TargetLowering::C_Unknown:
+ LLVM_DEBUG(dbgs() << "Unexpected unknown constraint\n");
+ return false;
+ }
+ }
+
+ return true;
+}
+
+bool InlineAsmLowering::lowerAsmOperandForConstraint(
+ Value *Val, StringRef Constraint, std::vector<MachineOperand> &Ops,
+ MachineIRBuilder &MIRBuilder) const {
+ if (Constraint.size() > 1)
+ return false;
+
+ char ConstraintLetter = Constraint[0];
+ switch (ConstraintLetter) {
+ default:
+ return false;
+ case 'i': // Simple Integer or Relocatable Constant
+ if (ConstantInt *CI = dyn_cast<ConstantInt>(Val)) {
+ assert(CI->getBitWidth() <= 64 &&
+ "expected immediate to fit into 64-bits");
+ // Boolean constants should be zero-extended, others are sign-extended
+ bool IsBool = CI->getBitWidth() == 1;
+ int64_t ExtVal = IsBool ? CI->getZExtValue() : CI->getSExtValue();
+ Ops.push_back(MachineOperand::CreateImm(ExtVal));
+ return true;
+ }
+ return false;
+ }
+}
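The zero- versus sign-extension policy for the 'i' constraint can be sketched without the LLVM API (a stand-in, not the ConstantInt interface): booleans are zero-extended so a true i1 becomes 1 rather than -1, while wider integers are sign-extended.

#include <cassert>
#include <cstdint>

// Stand-in for the ConstantInt handling above: zero-extend 1-bit values,
// sign-extend everything else. Bits is kept below 64 to keep the math simple.
static int64_t extendImm(uint64_t Raw, unsigned Bits) {
  assert(Bits >= 1 && Bits < 64 && "example keeps the math simple");
  uint64_t Mask = (1ull << Bits) - 1;
  uint64_t Val = Raw & Mask;
  if (Bits == 1)
    return static_cast<int64_t>(Val); // boolean: zero-extend
  uint64_t SignBit = 1ull << (Bits - 1);
  return static_cast<int64_t>(Val ^ SignBit) - static_cast<int64_t>(SignBit);
}

int main() {
  assert(extendImm(1, 1) == 1);      // i1 true -> 1, not -1
  assert(extendImm(0xFF, 8) == -1);  // i8 all-ones -> -1
  assert(extendImm(0x7F, 8) == 127);
  return 0;
}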
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
index 7c4fd2d140d3..f32278d07052 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelect.cpp
@@ -29,6 +29,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "instruction-select"
@@ -175,7 +176,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
auto DstRC = MRI.getRegClass(DstReg);
if (SrcRC == DstRC) {
MRI.replaceRegWith(DstReg, SrcReg);
- MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ MI.eraseFromParent();
}
}
}
@@ -222,9 +223,6 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
return false;
}
#endif
- auto &TLI = *MF.getSubtarget().getTargetLowering();
- TLI.finalizeLowering(MF);
-
// Determine if there are any calls in this machine function. Ported from
// SelectionDAG.
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -240,6 +238,9 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // FIXME: FinalizeISel pass calls finalizeLowering, so it's called twice.
+ auto &TLI = *MF.getSubtarget().getTargetLowering();
+ TLI.finalizeLowering(MF);
LLVM_DEBUG({
dbgs() << "Rules covered by selecting function: " << MF.getName() << ":";
@@ -248,11 +249,7 @@ bool InstructionSelect::runOnMachineFunction(MachineFunction &MF) {
dbgs() << "\n\n";
});
CoverageInfo.emit(CoveragePrefix,
- MF.getSubtarget()
- .getTargetLowering()
- ->getTargetMachine()
- .getTarget()
- .getBackendName());
+ TLI.getTargetMachine().getTarget().getBackendName());
// If we successfully selected the function nothing is going to use the vreg
// types after us (otherwise MIRPrinter would need them). Make sure the types
diff --git a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
index b9c90e69ddb2..2fedc034d315 100644
--- a/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/InstructionSelector.cpp
@@ -42,7 +42,7 @@ bool InstructionSelector::constrainOperandRegToRegClass(
MachineRegisterInfo &MRI = MF.getRegInfo();
return constrainOperandRegClass(MF, TRI, MRI, TII, RBI, I, RC,
- I.getOperand(OpIdx), OpIdx);
+ I.getOperand(OpIdx));
}
bool InstructionSelector::isOperandImmEqual(
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
index 601d50e9806f..a83742f2138f 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalityPredicates.cpp
@@ -80,22 +80,46 @@ LegalityPredicate LegalityPredicates::isPointer(unsigned TypeIdx,
};
}
-LegalityPredicate LegalityPredicates::narrowerThan(unsigned TypeIdx,
- unsigned Size) {
+LegalityPredicate LegalityPredicates::elementTypeIs(unsigned TypeIdx,
+ LLT EltTy) {
+ return [=](const LegalityQuery &Query) {
+ const LLT QueryTy = Query.Types[TypeIdx];
+ return QueryTy.isVector() && QueryTy.getElementType() == EltTy;
+ };
+}
+
+LegalityPredicate LegalityPredicates::scalarNarrowerThan(unsigned TypeIdx,
+ unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
return QueryTy.isScalar() && QueryTy.getSizeInBits() < Size;
};
}
-LegalityPredicate LegalityPredicates::widerThan(unsigned TypeIdx,
- unsigned Size) {
+LegalityPredicate LegalityPredicates::scalarWiderThan(unsigned TypeIdx,
+ unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
return QueryTy.isScalar() && QueryTy.getSizeInBits() > Size;
};
}
+LegalityPredicate LegalityPredicates::smallerThan(unsigned TypeIdx0,
+ unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() <
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+}
+
+LegalityPredicate LegalityPredicates::largerThan(unsigned TypeIdx0,
+ unsigned TypeIdx1) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx0].getSizeInBits() >
+ Query.Types[TypeIdx1].getSizeInBits();
+ };
+}
+
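The new predicates follow the same closure-over-the-query pattern as the existing ones; a plain C++ sketch with a simplified Query type (not the LegalizerInfo API) of how such combinators compose:

#include <cassert>
#include <functional>

// Simplified stand-ins: a Query that only records type sizes, and predicates
// built as closures, mirroring smallerThan/largerThan/sizeIs above.
struct Query {
  unsigned SizeInBits[2];
};
using Predicate = std::function<bool(const Query &)>;

static Predicate largerThan(unsigned TypeIdx0, unsigned TypeIdx1) {
  return [=](const Query &Q) {
    return Q.SizeInBits[TypeIdx0] > Q.SizeInBits[TypeIdx1];
  };
}

static Predicate sizeIs(unsigned TypeIdx, unsigned Size) {
  return [=](const Query &Q) { return Q.SizeInBits[TypeIdx] == Size; };
}

int main() {
  Query Q{{64, 32}};
  assert(largerThan(0, 1)(Q));
  assert(sizeIs(1, 32)(Q));
  return 0;
}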
LegalityPredicate LegalityPredicates::scalarOrEltNarrowerThan(unsigned TypeIdx,
unsigned Size) {
return [=](const LegalityQuery &Query) {
@@ -126,6 +150,12 @@ LegalityPredicate LegalityPredicates::sizeNotPow2(unsigned TypeIdx) {
};
}
+LegalityPredicate LegalityPredicates::sizeIs(unsigned TypeIdx, unsigned Size) {
+ return [=](const LegalityQuery &Query) {
+ return Query.Types[TypeIdx].getSizeInBits() == Size;
+ };
+}
+
LegalityPredicate LegalityPredicates::sameSize(unsigned TypeIdx0,
unsigned TypeIdx1) {
return [=](const LegalityQuery &Query) {
diff --git a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
index e789e4a333dc..1d7be54de3b0 100644
--- a/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Legalizer.cpp
@@ -21,6 +21,7 @@
#include "llvm/CodeGen/GlobalISel/GISelWorkList.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
+#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/CodeGen/MachineOptimizationRemarkEmitter.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -28,6 +29,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Error.h"
#include "llvm/Target/TargetMachine.h"
#include <iterator>
@@ -41,6 +43,29 @@ static cl::opt<bool>
cl::desc("Should enable CSE in Legalizer"),
cl::Optional, cl::init(false));
+enum class DebugLocVerifyLevel {
+ None,
+ Legalizations,
+ LegalizationsAndArtifactCombiners,
+};
+#ifndef NDEBUG
+static cl::opt<DebugLocVerifyLevel> VerifyDebugLocs(
+ "verify-legalizer-debug-locs",
+ cl::desc("Verify that debug locations are handled"),
+ cl::values(
+ clEnumValN(DebugLocVerifyLevel::None, "none", "No verification"),
+ clEnumValN(DebugLocVerifyLevel::Legalizations, "legalizations",
+ "Verify legalizations"),
+ clEnumValN(DebugLocVerifyLevel::LegalizationsAndArtifactCombiners,
+ "legalizations+artifactcombiners",
+ "Verify legalizations and artifact combines")),
+ cl::init(DebugLocVerifyLevel::Legalizations));
+#else
+// Always disable it for release builds by preventing the observer from being
+// installed.
+static const DebugLocVerifyLevel VerifyDebugLocs = DebugLocVerifyLevel::None;
+#endif
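+
+// Usage sketch (assumed llc invocation): with assertions enabled, the
+// option above can be tightened from its default of "legalizations" to
+// also check the artifact combiner, e.g.
+//
+//   llc -global-isel -verify-legalizer-debug-locs=legalizations+artifactcombiners foo.ll
+//
+// In release builds the constant above keeps the observer from ever being
+// installed.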
+
char Legalizer::ID = 0;
INITIALIZE_PASS_BEGIN(Legalizer, DEBUG_TYPE,
"Legalize the Machine IR a function's Machine IR", false,
@@ -108,7 +133,6 @@ public:
}
void createdInstr(MachineInstr &MI) override {
- LLVM_DEBUG(dbgs() << ".. .. New MI: " << MI);
LLVM_DEBUG(NewMIs.push_back(&MI));
createdOrChangedInstr(MI);
}
@@ -143,7 +167,9 @@ public:
Legalizer::MFResult
Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
ArrayRef<GISelChangeObserver *> AuxObservers,
+ LostDebugLocObserver &LocObserver,
MachineIRBuilder &MIRBuilder) {
+ MIRBuilder.setMF(MF);
MachineRegisterInfo &MRI = MF.getRegInfo();
// Populate worklists.
@@ -180,7 +206,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
// Now install the observer as the delegate to MF.
// This will keep all the observers notified about new insertions/deletions.
- RAIIDelegateInstaller DelInstall(MF, &WrapperObserver);
+ RAIIMFObsDelInstaller Installer(MF, WrapperObserver);
LegalizerHelper Helper(MF, LI, WrapperObserver, MIRBuilder);
LegalizationArtifactCombiner ArtCombiner(MIRBuilder, MRI, LI);
auto RemoveDeadInstFromLists = [&WrapperObserver](MachineInstr *DeadMI) {
@@ -199,6 +225,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
if (isTriviallyDead(MI, MRI)) {
LLVM_DEBUG(dbgs() << MI << "Is dead; erasing.\n");
MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ LocObserver.checkpoint(false);
continue;
}
@@ -224,6 +251,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
return {Changed, &MI};
}
WorkListObserver.printNewInstrs();
+ LocObserver.checkpoint();
Changed |= Res == LegalizerHelper::Legalized;
}
// Try to combine the instructions in RetryList again if there
@@ -238,6 +266,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
return {Changed, RetryList.front()};
}
}
+ LocObserver.checkpoint();
while (!ArtifactList.empty()) {
MachineInstr &MI = *ArtifactList.pop_back_val();
assert(isPreISelGenericOpcode(MI.getOpcode()) &&
@@ -246,6 +275,7 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
LLVM_DEBUG(dbgs() << MI << "Is dead\n");
RemoveDeadInstFromLists(&MI);
MI.eraseFromParentAndMarkDBGValuesForRemoval();
+ LocObserver.checkpoint(false);
continue;
}
SmallVector<MachineInstr *, 4> DeadInstructions;
@@ -258,6 +288,9 @@ Legalizer::legalizeMachineFunction(MachineFunction &MF, const LegalizerInfo &LI,
RemoveDeadInstFromLists(DeadMI);
DeadMI->eraseFromParentAndMarkDBGValuesForRemoval();
}
+ LocObserver.checkpoint(
+ VerifyDebugLocs ==
+ DebugLocVerifyLevel::LegalizationsAndArtifactCombiners);
Changed = true;
continue;
}
@@ -305,9 +338,14 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
// We want CSEInfo in addition to WorkListObserver to observe all changes.
AuxObservers.push_back(CSEInfo);
}
+ assert(!CSEInfo || !errorToBool(CSEInfo->verify()));
+ LostDebugLocObserver LocObserver(DEBUG_TYPE);
+ if (VerifyDebugLocs > DebugLocVerifyLevel::None)
+ AuxObservers.push_back(&LocObserver);
const LegalizerInfo &LI = *MF.getSubtarget().getLegalizerInfo();
- MFResult Result = legalizeMachineFunction(MF, LI, AuxObservers, *MIRBuilder);
+ MFResult Result =
+ legalizeMachineFunction(MF, LI, AuxObservers, LocObserver, *MIRBuilder);
if (Result.FailedOn) {
reportGISelFailure(MF, TPC, MORE, "gisel-legalize",
@@ -324,5 +362,33 @@ bool Legalizer::runOnMachineFunction(MachineFunction &MF) {
reportGISelFailure(MF, TPC, MORE, R);
return false;
}
+
+ if (LocObserver.getNumLostDebugLocs()) {
+ MachineOptimizationRemarkMissed R("gisel-legalize", "LostDebugLoc",
+ MF.getFunction().getSubprogram(),
+ /*MBB=*/&*MF.begin());
+ R << "lost "
+ << ore::NV("NumLostDebugLocs", LocObserver.getNumLostDebugLocs())
+ << " debug locations during pass";
+ reportGISelWarning(MF, TPC, MORE, R);
+ // Example remark:
+ // --- !Missed
+ // Pass: gisel-legalize
+ // Name: LostDebugLoc
+ // DebugLoc: { File: '.../legalize-urem.mir', Line: 1, Column: 0 }
+ // Function: test_urem_s32
+ // Args:
+ // - String: 'lost '
+ // - NumLostDebugLocs: '1'
+ // - String: ' debug locations during pass'
+ // ...
+ }
+
+ // If for some reason CSE was not enabled, make sure that we invalidate the
+ // CSEInfo object (as we currently declare that the analysis is preserved).
+ // The next time get() is called on the wrapper, it will force it to
+ // recompute the analysis.
+ if (!EnableCSE)
+ Wrapper.setComputed(false);
return Result.Changed;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
index 667e1a04dc34..da519f99ad7e 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerHelper.cpp
@@ -63,30 +63,48 @@ getNarrowTypeBreakDown(LLT OrigTy, LLT NarrowTy, LLT &LeftoverTy) {
return std::make_pair(NumParts, NumLeftover);
}
+static Type *getFloatTypeForLLT(LLVMContext &Ctx, LLT Ty) {
+
+ if (!Ty.isScalar())
+ return nullptr;
+
+ switch (Ty.getSizeInBits()) {
+ case 16:
+ return Type::getHalfTy(Ctx);
+ case 32:
+ return Type::getFloatTy(Ctx);
+ case 64:
+ return Type::getDoubleTy(Ctx);
+ case 128:
+ return Type::getFP128Ty(Ctx);
+ default:
+ return nullptr;
+ }
+}
+
LegalizerHelper::LegalizerHelper(MachineFunction &MF,
GISelChangeObserver &Observer,
MachineIRBuilder &Builder)
- : MIRBuilder(Builder), MRI(MF.getRegInfo()),
- LI(*MF.getSubtarget().getLegalizerInfo()), Observer(Observer) {
- MIRBuilder.setMF(MF);
+ : MIRBuilder(Builder), Observer(Observer), MRI(MF.getRegInfo()),
+ LI(*MF.getSubtarget().getLegalizerInfo()) {
MIRBuilder.setChangeObserver(Observer);
}
LegalizerHelper::LegalizerHelper(MachineFunction &MF, const LegalizerInfo &LI,
GISelChangeObserver &Observer,
MachineIRBuilder &B)
- : MIRBuilder(B), MRI(MF.getRegInfo()), LI(LI), Observer(Observer) {
- MIRBuilder.setMF(MF);
+ : MIRBuilder(B), Observer(Observer), MRI(MF.getRegInfo()), LI(LI) {
MIRBuilder.setChangeObserver(Observer);
}
LegalizerHelper::LegalizeResult
LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
- LLVM_DEBUG(dbgs() << "Legalizing: "; MI.print(dbgs()));
+ LLVM_DEBUG(dbgs() << "Legalizing: " << MI);
+
+ MIRBuilder.setInstrAndDebugLoc(MI);
if (MI.getOpcode() == TargetOpcode::G_INTRINSIC ||
MI.getOpcode() == TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS)
- return LI.legalizeIntrinsic(MI, MRI, MIRBuilder) ? Legalized
- : UnableToLegalize;
+ return LI.legalizeIntrinsic(*this, MI) ? Legalized : UnableToLegalize;
auto Step = LI.getAction(MI, MRI);
switch (Step.Action) {
case Legal:
@@ -101,6 +119,9 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
case WidenScalar:
LLVM_DEBUG(dbgs() << ".. Widen scalar\n");
return widenScalar(MI, Step.TypeIdx, Step.NewType);
+ case Bitcast:
+ LLVM_DEBUG(dbgs() << ".. Bitcast type\n");
+ return bitcast(MI, Step.TypeIdx, Step.NewType);
case Lower:
LLVM_DEBUG(dbgs() << ".. Lower\n");
return lower(MI, Step.TypeIdx, Step.NewType);
@@ -112,8 +133,7 @@ LegalizerHelper::legalizeInstrStep(MachineInstr &MI) {
return moreElementsVector(MI, Step.TypeIdx, Step.NewType);
case Custom:
LLVM_DEBUG(dbgs() << ".. Custom legalization\n");
- return LI.legalizeCustom(MI, MRI, MIRBuilder, Observer) ? Legalized
- : UnableToLegalize;
+ return LI.legalizeCustom(*this, MI) ? Legalized : UnableToLegalize;
default:
LLVM_DEBUG(dbgs() << ".. Unable to legalize\n");
return UnableToLegalize;
@@ -172,26 +192,6 @@ bool LegalizerHelper::extractParts(Register Reg, LLT RegTy,
return true;
}
-static LLT getGCDType(LLT OrigTy, LLT TargetTy) {
- if (OrigTy.isVector() && TargetTy.isVector()) {
- assert(OrigTy.getElementType() == TargetTy.getElementType());
- int GCD = greatestCommonDivisor(OrigTy.getNumElements(),
- TargetTy.getNumElements());
- return LLT::scalarOrVector(GCD, OrigTy.getElementType());
- }
-
- if (OrigTy.isVector() && !TargetTy.isVector()) {
- assert(OrigTy.getElementType() == TargetTy);
- return TargetTy;
- }
-
- assert(!OrigTy.isVector() && !TargetTy.isVector());
-
- int GCD = greatestCommonDivisor(OrigTy.getSizeInBits(),
- TargetTy.getSizeInBits());
- return LLT::scalar(GCD);
-}
-
void LegalizerHelper::insertParts(Register DstReg,
LLT ResultTy, LLT PartTy,
ArrayRef<Register> PartRegs,
@@ -237,92 +237,222 @@ void LegalizerHelper::insertParts(Register DstReg,
}
}
+/// Return the result registers of G_UNMERGE_VALUES \p MI in \p Regs
+static void getUnmergeResults(SmallVectorImpl<Register> &Regs,
+ const MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_UNMERGE_VALUES);
+
+ const int NumResults = MI.getNumOperands() - 1;
+ Regs.resize(NumResults);
+ for (int I = 0; I != NumResults; ++I)
+ Regs[I] = MI.getOperand(I).getReg();
+}
+
+LLT LegalizerHelper::extractGCDType(SmallVectorImpl<Register> &Parts, LLT DstTy,
+ LLT NarrowTy, Register SrcReg) {
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ LLT GCDTy = getGCDType(DstTy, getGCDType(SrcTy, NarrowTy));
+ if (SrcTy == GCDTy) {
+ // If the source already evenly divides the result type, we don't need to do
+ // anything.
+ Parts.push_back(SrcReg);
+ } else {
+ // Need to split into common type sized pieces.
+ auto Unmerge = MIRBuilder.buildUnmerge(GCDTy, SrcReg);
+ getUnmergeResults(Parts, *Unmerge);
+ }
+
+ return GCDTy;
+}
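+
+// Worked example (hypothetical types) for extractGCDType above: with
+// DstTy = s64, NarrowTy = s32 and a s48 source, the common type is
+// GCDTy = s16, so the source is split as
+//   %p0:_(s16), %p1:_(s16), %p2:_(s16) = G_UNMERGE_VALUES %src:_(s48)
+// and Parts receives the three s16 pieces.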
+
+LLT LegalizerHelper::buildLCMMergePieces(LLT DstTy, LLT NarrowTy, LLT GCDTy,
+ SmallVectorImpl<Register> &VRegs,
+ unsigned PadStrategy) {
+ LLT LCMTy = getLCMType(DstTy, NarrowTy);
+
+ int NumParts = LCMTy.getSizeInBits() / NarrowTy.getSizeInBits();
+ int NumSubParts = NarrowTy.getSizeInBits() / GCDTy.getSizeInBits();
+ int NumOrigSrc = VRegs.size();
+
+ Register PadReg;
+
+ // Get a value we can use to pad the source value if the sources won't evenly
+ // cover the result type.
+ if (NumOrigSrc < NumParts * NumSubParts) {
+ if (PadStrategy == TargetOpcode::G_ZEXT)
+ PadReg = MIRBuilder.buildConstant(GCDTy, 0).getReg(0);
+ else if (PadStrategy == TargetOpcode::G_ANYEXT)
+ PadReg = MIRBuilder.buildUndef(GCDTy).getReg(0);
+ else {
+ assert(PadStrategy == TargetOpcode::G_SEXT);
+
+ // Shift the sign bit of the low register through the high register.
+ auto ShiftAmt =
+ MIRBuilder.buildConstant(LLT::scalar(64), GCDTy.getSizeInBits() - 1);
+ PadReg = MIRBuilder.buildAShr(GCDTy, VRegs.back(), ShiftAmt).getReg(0);
+ }
+ }
+
+ // Registers for the final merge to be produced.
+ SmallVector<Register, 4> Remerge(NumParts);
+
+ // Registers needed for intermediate merges, which will be merged into a
+ // source for Remerge.
+ SmallVector<Register, 4> SubMerge(NumSubParts);
+
+ // Once we've fully read off the end of the original source bits, we can reuse
+ // the same high bits for remaining padding elements.
+ Register AllPadReg;
+
+ // Build merges to the LCM type to cover the original result type.
+ for (int I = 0; I != NumParts; ++I) {
+ bool AllMergePartsArePadding = true;
+
+ // Build the requested merges to the requested type.
+ for (int J = 0; J != NumSubParts; ++J) {
+ int Idx = I * NumSubParts + J;
+ if (Idx >= NumOrigSrc) {
+ SubMerge[J] = PadReg;
+ continue;
+ }
+
+ SubMerge[J] = VRegs[Idx];
+
+ // There are meaningful bits here we can't reuse later.
+ AllMergePartsArePadding = false;
+ }
+
+ // If we've filled up a complete piece with padding bits, we can directly
+ // emit the natural sized constant if applicable, rather than a merge of
+ // smaller constants.
+ if (AllMergePartsArePadding && !AllPadReg) {
+ if (PadStrategy == TargetOpcode::G_ANYEXT)
+ AllPadReg = MIRBuilder.buildUndef(NarrowTy).getReg(0);
+ else if (PadStrategy == TargetOpcode::G_ZEXT)
+ AllPadReg = MIRBuilder.buildConstant(NarrowTy, 0).getReg(0);
+
+ // If this is a sign extension, we can't materialize a trivial constant
+ // with the right type and have to produce a merge.
+ }
+
+ if (AllPadReg) {
+ // Avoid creating additional instructions if we're just adding additional
+ // copies of padding bits.
+ Remerge[I] = AllPadReg;
+ continue;
+ }
+
+ if (NumSubParts == 1)
+ Remerge[I] = SubMerge[0];
+ else
+ Remerge[I] = MIRBuilder.buildMerge(NarrowTy, SubMerge).getReg(0);
+
+ // In the sign extend padding case, re-use the first all-signbit merge.
+ if (AllMergePartsArePadding && !AllPadReg)
+ AllPadReg = Remerge[I];
+ }
+
+ VRegs = std::move(Remerge);
+ return LCMTy;
+}
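+
+// Worked example (hypothetical types) for buildLCMMergePieces above: with
+// DstTy = s64, NarrowTy = s32, GCDTy = s16, three s16 pieces in VRegs and
+// PadStrategy = G_ZEXT, we get LCMTy = s64, NumParts = 2, NumSubParts = 2.
+// One s16 zero is built as PadReg and the pieces are regrouped as
+//   Remerge[0] = G_MERGE_VALUES %v0, %v1
+//   Remerge[1] = G_MERGE_VALUES %v2, %zero
+// before VRegs is replaced with the two s32 results.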
+
+void LegalizerHelper::buildWidenedRemergeToDst(Register DstReg, LLT LCMTy,
+ ArrayRef<Register> RemergeRegs) {
+ LLT DstTy = MRI.getType(DstReg);
+
+ // Create the merge to the widened source, and extract the relevant bits into
+ // the result.
+
+ if (DstTy == LCMTy) {
+ MIRBuilder.buildMerge(DstReg, RemergeRegs);
+ return;
+ }
+
+ auto Remerge = MIRBuilder.buildMerge(LCMTy, RemergeRegs);
+ if (DstTy.isScalar() && LCMTy.isScalar()) {
+ MIRBuilder.buildTrunc(DstReg, Remerge);
+ return;
+ }
+
+ if (LCMTy.isVector()) {
+ MIRBuilder.buildExtract(DstReg, Remerge, 0);
+ return;
+ }
+
+ llvm_unreachable("unhandled case");
+}
+
static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
+#define RTLIBCASE(LibcallPrefix) \
+ do { \
+ switch (Size) { \
+ case 32: \
+ return RTLIB::LibcallPrefix##32; \
+ case 64: \
+ return RTLIB::LibcallPrefix##64; \
+ case 128: \
+ return RTLIB::LibcallPrefix##128; \
+ default: \
+ llvm_unreachable("unexpected size"); \
+ } \
+ } while (0)
+
+ assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
+
switch (Opcode) {
case TargetOpcode::G_SDIV:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- switch (Size) {
- case 32:
- return RTLIB::SDIV_I32;
- case 64:
- return RTLIB::SDIV_I64;
- case 128:
- return RTLIB::SDIV_I128;
- default:
- llvm_unreachable("unexpected size");
- }
+ RTLIBCASE(SDIV_I);
case TargetOpcode::G_UDIV:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- switch (Size) {
- case 32:
- return RTLIB::UDIV_I32;
- case 64:
- return RTLIB::UDIV_I64;
- case 128:
- return RTLIB::UDIV_I128;
- default:
- llvm_unreachable("unexpected size");
- }
+ RTLIBCASE(UDIV_I);
case TargetOpcode::G_SREM:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::SREM_I64 : RTLIB::SREM_I32;
+ RTLIBCASE(SREM_I);
case TargetOpcode::G_UREM:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::UREM_I64 : RTLIB::UREM_I32;
+ RTLIBCASE(UREM_I);
case TargetOpcode::G_CTLZ_ZERO_UNDEF:
- assert(Size == 32 && "Unsupported size");
- return RTLIB::CTLZ_I32;
+ RTLIBCASE(CTLZ_I);
case TargetOpcode::G_FADD:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::ADD_F64 : RTLIB::ADD_F32;
+ RTLIBCASE(ADD_F);
case TargetOpcode::G_FSUB:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::SUB_F64 : RTLIB::SUB_F32;
+ RTLIBCASE(SUB_F);
case TargetOpcode::G_FMUL:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::MUL_F64 : RTLIB::MUL_F32;
+ RTLIBCASE(MUL_F);
case TargetOpcode::G_FDIV:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::DIV_F64 : RTLIB::DIV_F32;
+ RTLIBCASE(DIV_F);
case TargetOpcode::G_FEXP:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::EXP_F64 : RTLIB::EXP_F32;
+ RTLIBCASE(EXP_F);
case TargetOpcode::G_FEXP2:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::EXP2_F64 : RTLIB::EXP2_F32;
+ RTLIBCASE(EXP2_F);
case TargetOpcode::G_FREM:
- return Size == 64 ? RTLIB::REM_F64 : RTLIB::REM_F32;
+ RTLIBCASE(REM_F);
case TargetOpcode::G_FPOW:
- return Size == 64 ? RTLIB::POW_F64 : RTLIB::POW_F32;
+ RTLIBCASE(POW_F);
case TargetOpcode::G_FMA:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::FMA_F64 : RTLIB::FMA_F32;
+ RTLIBCASE(FMA_F);
case TargetOpcode::G_FSIN:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- return Size == 128 ? RTLIB::SIN_F128
- : Size == 64 ? RTLIB::SIN_F64 : RTLIB::SIN_F32;
+ RTLIBCASE(SIN_F);
case TargetOpcode::G_FCOS:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- return Size == 128 ? RTLIB::COS_F128
- : Size == 64 ? RTLIB::COS_F64 : RTLIB::COS_F32;
+ RTLIBCASE(COS_F);
case TargetOpcode::G_FLOG10:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- return Size == 128 ? RTLIB::LOG10_F128
- : Size == 64 ? RTLIB::LOG10_F64 : RTLIB::LOG10_F32;
+ RTLIBCASE(LOG10_F);
case TargetOpcode::G_FLOG:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- return Size == 128 ? RTLIB::LOG_F128
- : Size == 64 ? RTLIB::LOG_F64 : RTLIB::LOG_F32;
+ RTLIBCASE(LOG_F);
case TargetOpcode::G_FLOG2:
- assert((Size == 32 || Size == 64 || Size == 128) && "Unsupported size");
- return Size == 128 ? RTLIB::LOG2_F128
- : Size == 64 ? RTLIB::LOG2_F64 : RTLIB::LOG2_F32;
+ RTLIBCASE(LOG2_F);
case TargetOpcode::G_FCEIL:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::CEIL_F64 : RTLIB::CEIL_F32;
+ RTLIBCASE(CEIL_F);
case TargetOpcode::G_FFLOOR:
- assert((Size == 32 || Size == 64) && "Unsupported size");
- return Size == 64 ? RTLIB::FLOOR_F64 : RTLIB::FLOOR_F32;
+ RTLIBCASE(FLOOR_F);
+ case TargetOpcode::G_FMINNUM:
+ RTLIBCASE(FMIN_F);
+ case TargetOpcode::G_FMAXNUM:
+ RTLIBCASE(FMAX_F);
+ case TargetOpcode::G_FSQRT:
+ RTLIBCASE(SQRT_F);
+ case TargetOpcode::G_FRINT:
+ RTLIBCASE(RINT_F);
+ case TargetOpcode::G_FNEARBYINT:
+ RTLIBCASE(NEARBYINT_F);
}
llvm_unreachable("Unknown libcall function");
}
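// For reference, RTLIBCASE(SDIV_I) expands to the same switch the removed
// code spelled out by hand, i.e. (sketch):
//
//   switch (Size) {
//   case 32: return RTLIB::SDIV_I32;
//   case 64: return RTLIB::SDIV_I64;
//   case 128: return RTLIB::SDIV_I128;
//   default: llvm_unreachable("unexpected size");
//   }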
@@ -330,7 +460,8 @@ static RTLIB::Libcall getRTLibDesc(unsigned Opcode, unsigned Size) {
/// True if an instruction is in tail position in its caller. Intended for
/// legalizing libcalls as tail calls when possible.
static bool isLibCallInTailPosition(MachineInstr &MI) {
- const Function &F = MI.getParent()->getParent()->getFunction();
+ MachineBasicBlock &MBB = *MI.getParent();
+ const Function &F = MBB.getParent()->getFunction();
// Conservatively require the attributes of the call to match those of
// the return. Ignore NoAlias and NonNull because they don't affect the
@@ -349,23 +480,22 @@ static bool isLibCallInTailPosition(MachineInstr &MI) {
// Only tail call if the following instruction is a standard return.
auto &TII = *MI.getMF()->getSubtarget().getInstrInfo();
- MachineInstr *Next = MI.getNextNode();
- if (!Next || TII.isTailCall(*Next) || !Next->isReturn())
+ auto Next = next_nodbg(MI.getIterator(), MBB.instr_end());
+ if (Next == MBB.instr_end() || TII.isTailCall(*Next) || !Next->isReturn())
return false;
return true;
}
LegalizerHelper::LegalizeResult
-llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
+llvm::createLibcall(MachineIRBuilder &MIRBuilder, const char *Name,
const CallLowering::ArgInfo &Result,
- ArrayRef<CallLowering::ArgInfo> Args) {
+ ArrayRef<CallLowering::ArgInfo> Args,
+ const CallingConv::ID CC) {
auto &CLI = *MIRBuilder.getMF().getSubtarget().getCallLowering();
- auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
- const char *Name = TLI.getLibcallName(Libcall);
CallLowering::CallLoweringInfo Info;
- Info.CallConv = TLI.getLibcallCallingConv(Libcall);
+ Info.CallConv = CC;
Info.Callee = MachineOperand::CreateES(Name);
Info.OrigRet = Result;
std::copy(Args.begin(), Args.end(), std::back_inserter(Info.OrigArgs));
@@ -375,6 +505,16 @@ llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
return LegalizerHelper::Legalized;
}
+LegalizerHelper::LegalizeResult
+llvm::createLibcall(MachineIRBuilder &MIRBuilder, RTLIB::Libcall Libcall,
+ const CallLowering::ArgInfo &Result,
+ ArrayRef<CallLowering::ArgInfo> Args) {
+ auto &TLI = *MIRBuilder.getMF().getSubtarget().getTargetLowering();
+ const char *Name = TLI.getLibcallName(Libcall);
+ const CallingConv::ID CC = TLI.getLibcallCallingConv(Libcall);
+ return createLibcall(MIRBuilder, Name, Result, Args, CC);
+}
+
// Useful for libcalls where all operands have the same type.
static LegalizerHelper::LegalizeResult
simpleLibcall(MachineInstr &MI, MachineIRBuilder &MIRBuilder, unsigned Size,
@@ -428,7 +568,7 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
}
const char *Name = TLI.getLibcallName(RTLibcall);
- MIRBuilder.setInstr(MI);
+ MIRBuilder.setInstrAndDebugLoc(MI);
CallLowering::CallLoweringInfo Info;
Info.CallConv = TLI.getLibcallCallingConv(RTLibcall);
@@ -443,14 +583,16 @@ llvm::createMemLibcall(MachineIRBuilder &MIRBuilder, MachineRegisterInfo &MRI,
if (Info.LoweredTailCall) {
assert(Info.IsTailCall && "Lowered tail call when it wasn't a tail call?");
- // We must have a return following the call to get past
+ // We must have a return following the call (or debug insts) to get past
// isLibCallInTailPosition.
- assert(MI.getNextNode() && MI.getNextNode()->isReturn() &&
- "Expected instr following MI to be a return?");
-
- // We lowered a tail call, so the call is now the return from the block.
- // Delete the old return.
- MI.getNextNode()->eraseFromParent();
+ do {
+ MachineInstr *Next = MI.getNextNode();
+ assert(Next && (Next->isReturn() || Next->isDebugInstr()) &&
+ "Expected instr following MI to be return or debug inst?");
+ // We lowered a tail call, so the call is now the return from the block.
+ // Delete the old return.
+ Next->eraseFromParent();
+ } while (MI.getNextNode());
}
return LegalizerHelper::Legalized;
@@ -492,8 +634,6 @@ LegalizerHelper::libcall(MachineInstr &MI) {
unsigned Size = LLTy.getSizeInBits();
auto &Ctx = MIRBuilder.getMF().getFunction().getContext();
- MIRBuilder.setInstr(MI);
-
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
@@ -523,37 +663,29 @@ LegalizerHelper::libcall(MachineInstr &MI) {
case TargetOpcode::G_FEXP:
case TargetOpcode::G_FEXP2:
case TargetOpcode::G_FCEIL:
- case TargetOpcode::G_FFLOOR: {
- if (Size > 64) {
- LLVM_DEBUG(dbgs() << "Size " << Size << " too large to legalize.\n");
+ case TargetOpcode::G_FFLOOR:
+ case TargetOpcode::G_FMINNUM:
+ case TargetOpcode::G_FMAXNUM:
+ case TargetOpcode::G_FSQRT:
+ case TargetOpcode::G_FRINT:
+ case TargetOpcode::G_FNEARBYINT: {
+ Type *HLTy = getFloatTypeForLLT(Ctx, LLTy);
+ if (!HLTy || (Size != 32 && Size != 64 && Size != 128)) {
+ LLVM_DEBUG(dbgs() << "No libcall available for size " << Size << ".\n");
return UnableToLegalize;
}
- Type *HLTy = Size == 64 ? Type::getDoubleTy(Ctx) : Type::getFloatTy(Ctx);
auto Status = simpleLibcall(MI, MIRBuilder, Size, HLTy);
if (Status != Legalized)
return Status;
break;
}
- case TargetOpcode::G_FPEXT: {
- // FIXME: Support other floating point types (half, fp128 etc)
- unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
- unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- if (ToSize != 64 || FromSize != 32)
- return UnableToLegalize;
- LegalizeResult Status = conversionLibcall(
- MI, MIRBuilder, Type::getDoubleTy(Ctx), Type::getFloatTy(Ctx));
- if (Status != Legalized)
- return Status;
- break;
- }
+ case TargetOpcode::G_FPEXT:
case TargetOpcode::G_FPTRUNC: {
- // FIXME: Support other floating point types (half, fp128 etc)
- unsigned FromSize = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
- unsigned ToSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- if (ToSize != 32 || FromSize != 64)
+ Type *FromTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(1).getReg()));
+ Type *ToTy = getFloatTypeForLLT(Ctx, MRI.getType(MI.getOperand(0).getReg()));
+ if (!FromTy || !ToTy)
return UnableToLegalize;
- LegalizeResult Status = conversionLibcall(
- MI, MIRBuilder, Type::getFloatTy(Ctx), Type::getDoubleTy(Ctx));
+ LegalizeResult Status = conversionLibcall(MI, MIRBuilder, ToTy, FromTy);
if (Status != Legalized)
return Status;
break;
@@ -597,8 +729,6 @@ LegalizerHelper::libcall(MachineInstr &MI) {
LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
unsigned TypeIdx,
LLT NarrowTy) {
- MIRBuilder.setInstr(MI);
-
uint64_t SizeOp0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
uint64_t NarrowSize = NarrowTy.getSizeInBits();
@@ -606,19 +736,34 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
default:
return UnableToLegalize;
case TargetOpcode::G_IMPLICIT_DEF: {
- // FIXME: add support for when SizeOp0 isn't an exact multiple of
- // NarrowSize.
- if (SizeOp0 % NarrowSize != 0)
- return UnableToLegalize;
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+
+ // If SizeOp0 is not an exact multiple of NarrowSize, emit
+ // G_ANYEXT(G_IMPLICIT_DEF). Cast result to vector if needed.
+ // FIXME: Although this would also be legal for the general case, it causes
+ // a lot of regressions in the emitted code (superfluous COPYs, artifact
+ // combines not being hit). This seems to be a problem related to the
+ // artifact combiner.
+ if (SizeOp0 % NarrowSize != 0) {
+ LLT ImplicitTy = NarrowTy;
+ if (DstTy.isVector())
+ ImplicitTy = LLT::vector(DstTy.getNumElements(), ImplicitTy);
+
+ Register ImplicitReg = MIRBuilder.buildUndef(ImplicitTy).getReg(0);
+ MIRBuilder.buildAnyExt(DstReg, ImplicitReg);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
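+
+ // Illustrative MIR (hypothetical types) for the non-multiple case above,
+ // e.g. a s48 G_IMPLICIT_DEF narrowed with NarrowTy = s32:
+ //   %tmp:_(s32) = G_IMPLICIT_DEF
+ //   %dst:_(s48) = G_ANYEXT %tmp(s32)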
+
int NumParts = SizeOp0 / NarrowSize;
SmallVector<Register, 2> DstRegs;
for (int i = 0; i < NumParts; ++i)
- DstRegs.push_back(
- MIRBuilder.buildUndef(NarrowTy)->getOperand(0).getReg());
+ DstRegs.push_back(MIRBuilder.buildUndef(NarrowTy).getReg(0));
- Register DstReg = MI.getOperand(0).getReg();
- if(MRI.getType(DstReg).isVector())
+ if (DstTy.isVector())
MIRBuilder.buildBuildVector(DstReg, DstRegs);
else
MIRBuilder.buildMerge(DstReg, DstRegs);
@@ -657,49 +802,10 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MI.eraseFromParent();
return Legalized;
}
- case TargetOpcode::G_SEXT: {
- if (TypeIdx != 0)
- return UnableToLegalize;
-
- Register SrcReg = MI.getOperand(1).getReg();
- LLT SrcTy = MRI.getType(SrcReg);
-
- // FIXME: support the general case where the requested NarrowTy may not be
- // the same as the source type. E.g. s128 = sext(s32)
- if ((SrcTy.getSizeInBits() != SizeOp0 / 2) ||
- SrcTy.getSizeInBits() != NarrowTy.getSizeInBits()) {
- LLVM_DEBUG(dbgs() << "Can't narrow sext to type " << NarrowTy << "\n");
- return UnableToLegalize;
- }
-
- // Shift the sign bit of the low register through the high register.
- auto ShiftAmt =
- MIRBuilder.buildConstant(LLT::scalar(64), NarrowTy.getSizeInBits() - 1);
- auto Shift = MIRBuilder.buildAShr(NarrowTy, SrcReg, ShiftAmt);
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {SrcReg, Shift.getReg(0)});
- MI.eraseFromParent();
- return Legalized;
- }
- case TargetOpcode::G_ZEXT: {
- if (TypeIdx != 0)
- return UnableToLegalize;
-
- LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
- uint64_t SizeOp1 = SrcTy.getSizeInBits();
- if (SizeOp0 % SizeOp1 != 0)
- return UnableToLegalize;
-
- // Generate a merge where the bottom bits are taken from the source, and
- // zero everything else.
- Register ZeroReg = MIRBuilder.buildConstant(SrcTy, 0).getReg(0);
- unsigned NumParts = SizeOp0 / SizeOp1;
- SmallVector<Register, 4> Srcs = {MI.getOperand(1).getReg()};
- for (unsigned Part = 1; Part < NumParts; ++Part)
- Srcs.push_back(ZeroReg);
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), Srcs);
- MI.eraseFromParent();
- return Legalized;
- }
+ case TargetOpcode::G_SEXT:
+ case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_ANYEXT:
+ return narrowScalarExt(MI, TypeIdx, NarrowTy);
case TargetOpcode::G_TRUNC: {
if (TypeIdx != 1)
return UnableToLegalize;
@@ -710,12 +816,15 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return UnableToLegalize;
}
- auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1).getReg());
- MIRBuilder.buildCopy(MI.getOperand(0).getReg(), Unmerge.getReg(0));
+ auto Unmerge = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
+ MIRBuilder.buildCopy(MI.getOperand(0), Unmerge.getReg(0));
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_FREEZE:
+ return reduceOperationWidth(MI, TypeIdx, NarrowTy);
+
case TargetOpcode::G_ADD: {
// FIXME: add support for when SizeOp0 isn't an exact multiple of
// NarrowSize.
@@ -779,7 +888,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
DstRegs.push_back(DstReg);
BorrowIn = BorrowOut;
}
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
MI.eraseFromParent();
return Legalized;
}
@@ -800,7 +909,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
if (8 * MMO.getSize() != DstTy.getSizeInBits()) {
Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
auto &MMO = **MI.memoperands_begin();
- MIRBuilder.buildLoad(TmpReg, MI.getOperand(1).getReg(), MMO);
+ MIRBuilder.buildLoad(TmpReg, MI.getOperand(1), MMO);
MIRBuilder.buildAnyExt(DstReg, TmpReg);
MI.eraseFromParent();
return Legalized;
@@ -819,12 +928,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
if (MMO.getSizeInBits() == NarrowSize) {
MIRBuilder.buildLoad(TmpReg, PtrReg, MMO);
} else {
- unsigned ExtLoad = ZExt ? TargetOpcode::G_ZEXTLOAD
- : TargetOpcode::G_SEXTLOAD;
- MIRBuilder.buildInstr(ExtLoad)
- .addDef(TmpReg)
- .addUse(PtrReg)
- .addMemOperand(&MMO);
+ MIRBuilder.buildLoadInstr(MI.getOpcode(), TmpReg, PtrReg, MMO);
}
if (ZExt)
@@ -853,7 +957,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
Register TmpReg = MRI.createGenericVirtualRegister(NarrowTy);
auto &MMO = **MI.memoperands_begin();
MIRBuilder.buildTrunc(TmpReg, SrcReg);
- MIRBuilder.buildStore(TmpReg, MI.getOperand(1).getReg(), MMO);
+ MIRBuilder.buildStore(TmpReg, MI.getOperand(1), MMO);
MI.eraseFromParent();
return Legalized;
}
@@ -885,8 +989,19 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
case TargetOpcode::G_CTTZ:
case TargetOpcode::G_CTTZ_ZERO_UNDEF:
case TargetOpcode::G_CTPOP:
- if (TypeIdx != 0)
- return UnableToLegalize; // TODO
+ if (TypeIdx == 1)
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CTLZ:
+ case TargetOpcode::G_CTLZ_ZERO_UNDEF:
+ return narrowScalarCTLZ(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_CTTZ:
+ case TargetOpcode::G_CTTZ_ZERO_UNDEF:
+ return narrowScalarCTTZ(MI, TypeIdx, NarrowTy);
+ case TargetOpcode::G_CTPOP:
+ return narrowScalarCTPOP(MI, TypeIdx, NarrowTy);
+ default:
+ return UnableToLegalize;
+ }
Observer.changingInstr(MI);
narrowScalarDst(MI, NarrowTy, 0, TargetOpcode::G_ZEXT);
@@ -910,10 +1025,8 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
return Legalized;
case TargetOpcode::G_PHI: {
unsigned NumParts = SizeOp0 / NarrowSize;
- SmallVector<Register, 2> DstRegs;
- SmallVector<SmallVector<Register, 2>, 2> SrcRegs;
- DstRegs.resize(NumParts);
- SrcRegs.resize(MI.getNumOperands() / 2);
+ SmallVector<Register, 2> DstRegs(NumParts);
+ SmallVector<SmallVector<Register, 2>, 2> SrcRegs(MI.getNumOperands() / 2);
Observer.changingInstr(MI);
for (unsigned i = 1; i < MI.getNumOperands(); i += 2) {
MachineBasicBlock &OpMBB = *MI.getOperand(i + 1).getMBB();
@@ -931,7 +1044,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MIB.addUse(SrcRegs[j / 2][i]).add(MI.getOperand(j + 1));
}
MIRBuilder.setInsertPt(MBB, MBB.getFirstNonPHI());
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
Observer.changedInstr(MI);
MI.eraseFromParent();
return Legalized;
@@ -955,11 +1068,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
Observer.changingInstr(MI);
Register LHSL = MRI.createGenericVirtualRegister(NarrowTy);
Register LHSH = MRI.createGenericVirtualRegister(NarrowTy);
- MIRBuilder.buildUnmerge({LHSL, LHSH}, MI.getOperand(2).getReg());
+ MIRBuilder.buildUnmerge({LHSL, LHSH}, MI.getOperand(2));
Register RHSL = MRI.createGenericVirtualRegister(NarrowTy);
Register RHSH = MRI.createGenericVirtualRegister(NarrowTy);
- MIRBuilder.buildUnmerge({RHSL, RHSH}, MI.getOperand(3).getReg());
+ MIRBuilder.buildUnmerge({RHSL, RHSH}, MI.getOperand(3));
CmpInst::Predicate Pred =
static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
@@ -970,14 +1083,14 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
MachineInstrBuilder XorH = MIRBuilder.buildXor(NarrowTy, LHSH, RHSH);
MachineInstrBuilder Or = MIRBuilder.buildOr(NarrowTy, XorL, XorH);
MachineInstrBuilder Zero = MIRBuilder.buildConstant(NarrowTy, 0);
- MIRBuilder.buildICmp(Pred, MI.getOperand(0).getReg(), Or, Zero);
+ MIRBuilder.buildICmp(Pred, MI.getOperand(0), Or, Zero);
} else {
MachineInstrBuilder CmpH = MIRBuilder.buildICmp(Pred, ResTy, LHSH, RHSH);
MachineInstrBuilder CmpHEQ =
MIRBuilder.buildICmp(CmpInst::Predicate::ICMP_EQ, ResTy, LHSH, RHSH);
MachineInstrBuilder CmpLU = MIRBuilder.buildICmp(
ICmpInst::getUnsignedPredicate(Pred), ResTy, LHSL, RHSL);
- MIRBuilder.buildSelect(MI.getOperand(0).getReg(), CmpHEQ, CmpLU, CmpH);
+ MIRBuilder.buildSelect(MI.getOperand(0), CmpHEQ, CmpLU, CmpH);
}
Observer.changedInstr(MI);
MI.eraseFromParent();
@@ -987,8 +1100,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
if (TypeIdx != 0)
return UnableToLegalize;
- if (!MI.getOperand(2).isImm())
- return UnableToLegalize;
int64_t SizeInBits = MI.getOperand(2).getImm();
// So long as the new type has more bits than the bits we're extending we
@@ -998,13 +1109,13 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
// We don't lose any non-extension bits by truncating the src and
// sign-extending the dst.
MachineOperand &MO1 = MI.getOperand(1);
- auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1.getReg());
- MO1.setReg(TruncMIB->getOperand(0).getReg());
+ auto TruncMIB = MIRBuilder.buildTrunc(NarrowTy, MO1);
+ MO1.setReg(TruncMIB.getReg(0));
MachineOperand &MO2 = MI.getOperand(0);
Register DstExt = MRI.createGenericVirtualRegister(NarrowTy);
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
- MIRBuilder.buildInstr(TargetOpcode::G_SEXT, {MO2.getReg()}, {DstExt});
+ MIRBuilder.buildSExt(MO2, DstExt);
MO2.setReg(DstExt);
Observer.changedInstr(MI);
return Legalized;
@@ -1031,12 +1142,11 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
}
// Explode the big arguments into smaller chunks.
- MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1).getReg());
+ MIRBuilder.buildUnmerge(SrcRegs, MI.getOperand(1));
Register AshrCstReg =
MIRBuilder.buildConstant(NarrowTy, NarrowTy.getScalarSizeInBits() - 1)
- ->getOperand(0)
- .getReg();
+ .getReg(0);
Register FullExtensionReg = 0;
Register PartialExtensionReg = 0;
@@ -1051,11 +1161,9 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
DstRegs.push_back(FullExtensionReg);
continue;
}
- DstRegs.push_back(MIRBuilder
- .buildInstr(TargetOpcode::G_ASHR, {NarrowTy},
- {PartialExtensionReg, AshrCstReg})
- ->getOperand(0)
- .getReg());
+ DstRegs.push_back(
+ MIRBuilder.buildAShr(NarrowTy, PartialExtensionReg, AshrCstReg)
+ .getReg(0));
FullExtensionReg = DstRegs.back();
} else {
DstRegs.push_back(
@@ -1063,8 +1171,7 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
.buildInstr(
TargetOpcode::G_SEXT_INREG, {NarrowTy},
{SrcRegs[i], SizeInBits % NarrowTy.getScalarSizeInBits()})
- ->getOperand(0)
- .getReg());
+ .getReg(0));
PartialExtensionReg = DstRegs.back();
}
}
@@ -1091,28 +1198,57 @@ LegalizerHelper::LegalizeResult LegalizerHelper::narrowScalar(MachineInstr &MI,
DstRegs.push_back(DstPart.getReg(0));
}
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), DstRegs);
+ MIRBuilder.buildMerge(MI.getOperand(0), DstRegs);
Observer.changedInstr(MI);
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_PTRMASK: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ narrowScalarSrc(MI, NarrowTy, 2);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
}
}
+Register LegalizerHelper::coerceToScalar(Register Val) {
+ LLT Ty = MRI.getType(Val);
+ if (Ty.isScalar())
+ return Val;
+
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ LLT NewTy = LLT::scalar(Ty.getSizeInBits());
+ if (Ty.isPointer()) {
+ if (DL.isNonIntegralAddressSpace(Ty.getAddressSpace()))
+ return Register();
+ return MIRBuilder.buildPtrToInt(NewTy, Val).getReg(0);
+ }
+
+ Register NewVal = Val;
+
+ assert(Ty.isVector());
+ LLT EltTy = Ty.getElementType();
+ if (EltTy.isPointer())
+ NewVal = MIRBuilder.buildPtrToInt(NewTy, NewVal).getReg(0);
+ return MIRBuilder.buildBitcast(NewTy, NewVal).getReg(0);
+}
+
void LegalizerHelper::widenScalarSrc(MachineInstr &MI, LLT WideTy,
unsigned OpIdx, unsigned ExtOpcode) {
MachineOperand &MO = MI.getOperand(OpIdx);
- auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO.getReg()});
- MO.setReg(ExtB->getOperand(0).getReg());
+ auto ExtB = MIRBuilder.buildInstr(ExtOpcode, {WideTy}, {MO});
+ MO.setReg(ExtB.getReg(0));
}
void LegalizerHelper::narrowScalarSrc(MachineInstr &MI, LLT NarrowTy,
unsigned OpIdx) {
MachineOperand &MO = MI.getOperand(OpIdx);
- auto ExtB = MIRBuilder.buildInstr(TargetOpcode::G_TRUNC, {NarrowTy},
- {MO.getReg()});
- MO.setReg(ExtB->getOperand(0).getReg());
+ auto ExtB = MIRBuilder.buildTrunc(NarrowTy, MO);
+ MO.setReg(ExtB.getReg(0));
}
void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
@@ -1120,7 +1256,7 @@ void LegalizerHelper::widenScalarDst(MachineInstr &MI, LLT WideTy,
MachineOperand &MO = MI.getOperand(OpIdx);
Register DstExt = MRI.createGenericVirtualRegister(WideTy);
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
- MIRBuilder.buildInstr(TruncOpcode, {MO.getReg()}, {DstExt});
+ MIRBuilder.buildInstr(TruncOpcode, {MO}, {DstExt});
MO.setReg(DstExt);
}
@@ -1129,7 +1265,7 @@ void LegalizerHelper::narrowScalarDst(MachineInstr &MI, LLT NarrowTy,
MachineOperand &MO = MI.getOperand(OpIdx);
Register DstTrunc = MRI.createGenericVirtualRegister(NarrowTy);
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
- MIRBuilder.buildInstr(ExtOpcode, {MO.getReg()}, {DstTrunc});
+ MIRBuilder.buildInstr(ExtOpcode, {MO}, {DstTrunc});
MO.setReg(DstTrunc);
}
@@ -1138,7 +1274,7 @@ void LegalizerHelper::moreElementsVectorDst(MachineInstr &MI, LLT WideTy,
MachineOperand &MO = MI.getOperand(OpIdx);
Register DstExt = MRI.createGenericVirtualRegister(WideTy);
MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
- MIRBuilder.buildExtract(MO.getReg(), DstExt, 0);
+ MIRBuilder.buildExtract(MO, DstExt, 0);
MO.setReg(DstExt);
}
@@ -1172,6 +1308,19 @@ void LegalizerHelper::moreElementsVectorSrc(MachineInstr &MI, LLT MoreTy,
MO.setReg(MoreReg);
}
+void LegalizerHelper::bitcastSrc(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
+ MachineOperand &Op = MI.getOperand(OpIdx);
+ Op.setReg(MIRBuilder.buildBitcast(CastTy, Op).getReg(0));
+}
+
+void LegalizerHelper::bitcastDst(MachineInstr &MI, LLT CastTy, unsigned OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ Register CastDst = MRI.createGenericVirtualRegister(CastTy);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+ MIRBuilder.buildBitcast(MO, CastDst);
+ MO.setReg(CastDst);
+}
+
LegalizerHelper::LegalizeResult
LegalizerHelper::widenScalarMergeValues(MachineInstr &MI, unsigned TypeIdx,
LLT WideTy) {
@@ -1300,10 +1449,10 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
if (TypeIdx != 0)
return UnableToLegalize;
- unsigned NumDst = MI.getNumOperands() - 1;
+ int NumDst = MI.getNumOperands() - 1;
Register SrcReg = MI.getOperand(NumDst).getReg();
LLT SrcTy = MRI.getType(SrcReg);
- if (!SrcTy.isScalar())
+ if (SrcTy.isVector())
return UnableToLegalize;
Register Dst0Reg = MI.getOperand(0).getReg();
@@ -1311,26 +1460,90 @@ LegalizerHelper::widenScalarUnmergeValues(MachineInstr &MI, unsigned TypeIdx,
if (!DstTy.isScalar())
return UnableToLegalize;
- unsigned NewSrcSize = NumDst * WideTy.getSizeInBits();
- LLT NewSrcTy = LLT::scalar(NewSrcSize);
- unsigned SizeDiff = WideTy.getSizeInBits() - DstTy.getSizeInBits();
+ if (WideTy.getSizeInBits() >= SrcTy.getSizeInBits()) {
+ if (SrcTy.isPointer()) {
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ if (DL.isNonIntegralAddressSpace(SrcTy.getAddressSpace())) {
+ LLVM_DEBUG(
+ dbgs() << "Not casting non-integral address space integer\n");
+ return UnableToLegalize;
+ }
+
+ SrcTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcReg = MIRBuilder.buildPtrToInt(SrcTy, SrcReg).getReg(0);
+ }
+
+ // Widen SrcTy to WideTy. This does not affect the result, but since the
+ // user requested this size, it is probably better handled than SrcTy and
+ // should reduce the total number of legalization artifacts.
+ if (WideTy.getSizeInBits() > SrcTy.getSizeInBits()) {
+ SrcTy = WideTy;
+ SrcReg = MIRBuilder.buildAnyExt(WideTy, SrcReg).getReg(0);
+ }
- auto WideSrc = MIRBuilder.buildZExt(NewSrcTy, SrcReg);
+ // There's no unmerge type to target. Directly extract the bits from the
+ // source type.
+ unsigned DstSize = DstTy.getSizeInBits();
- for (unsigned I = 1; I != NumDst; ++I) {
- auto ShiftAmt = MIRBuilder.buildConstant(NewSrcTy, SizeDiff * I);
- auto Shl = MIRBuilder.buildShl(NewSrcTy, WideSrc, ShiftAmt);
- WideSrc = MIRBuilder.buildOr(NewSrcTy, WideSrc, Shl);
+ MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
+ for (int I = 1; I != NumDst; ++I) {
+ auto ShiftAmt = MIRBuilder.buildConstant(SrcTy, DstSize * I);
+ auto Shr = MIRBuilder.buildLShr(SrcTy, SrcReg, ShiftAmt);
+ MIRBuilder.buildTrunc(MI.getOperand(I), Shr);
+ }
+
+ MI.eraseFromParent();
+ return Legalized;
}
- Observer.changingInstr(MI);
+ // Extend the source to a wider type.
+ LLT LCMTy = getLCMType(SrcTy, WideTy);
- MI.getOperand(NumDst).setReg(WideSrc->getOperand(0).getReg());
- for (unsigned I = 0; I != NumDst; ++I)
- widenScalarDst(MI, WideTy, I);
+ Register WideSrc = SrcReg;
+ if (LCMTy.getSizeInBits() != SrcTy.getSizeInBits()) {
+ // TODO: If this is an integral address space, cast to integer and anyext.
+ if (SrcTy.isPointer()) {
+ LLVM_DEBUG(dbgs() << "Widening pointer source types not implemented\n");
+ return UnableToLegalize;
+ }
- Observer.changedInstr(MI);
+ WideSrc = MIRBuilder.buildAnyExt(LCMTy, WideSrc).getReg(0);
+ }
+
+ auto Unmerge = MIRBuilder.buildUnmerge(WideTy, WideSrc);
+ // Create a sequence of unmerges to the original results. Since we may have
+ // widened the source, we will need to pad the results with dead defs to cover
+ // the source register.
+ // e.g. widen s16 to s32:
+ // %1:_(s16), %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0:_(s48)
+ //
+ // =>
+ // %4:_(s64) = G_ANYEXT %0:_(s48)
+ // %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %4 ; Requested unmerge
+ // %1:_(s16), %2:_(s16) = G_UNMERGE_VALUES %5 ; unpack to original regs
+ // %3:_(s16), dead %7 = G_UNMERGE_VALUES %6 ; original reg + extra dead def
+
+ const int NumUnmerge = Unmerge->getNumOperands() - 1;
+ const int PartsPerUnmerge = WideTy.getSizeInBits() / DstTy.getSizeInBits();
+
+ for (int I = 0; I != NumUnmerge; ++I) {
+ auto MIB = MIRBuilder.buildInstr(TargetOpcode::G_UNMERGE_VALUES);
+
+ for (int J = 0; J != PartsPerUnmerge; ++J) {
+ int Idx = I * PartsPerUnmerge + J;
+ if (Idx < NumDst)
+ MIB.addDef(MI.getOperand(Idx).getReg());
+ else {
+ // Create dead def for excess components.
+ MIB.addDef(MRI.createGenericVirtualRegister(DstTy));
+ }
+ }
+
+ MIB.addUse(Unmerge.getReg(I));
+ }
+
+ MI.eraseFromParent();
return Legalized;
}
@@ -1426,9 +1639,45 @@ LegalizerHelper::widenScalarInsert(MachineInstr &MI, unsigned TypeIdx,
}
LegalizerHelper::LegalizeResult
-LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
- MIRBuilder.setInstr(MI);
+LegalizerHelper::widenScalarAddSubSat(MachineInstr &MI, unsigned TypeIdx,
+ LLT WideTy) {
+ bool IsSigned = MI.getOpcode() == TargetOpcode::G_SADDSAT ||
+ MI.getOpcode() == TargetOpcode::G_SSUBSAT;
+ // We can convert this to:
+ // 1. Any extend iN to iM
+ // 2. SHL by M-N
+ // 3. [US][ADD|SUB]SAT
+ // 4. L/ASHR by M-N
+ //
+ // It may be more efficient to lower this to a min and a max operation in
+ // the higher precision arithmetic if the promoted operation isn't legal,
+ // but this decision is up to the target's lowering request.
+ Register DstReg = MI.getOperand(0).getReg();
+
+ unsigned NewBits = WideTy.getScalarSizeInBits();
+ unsigned SHLAmount = NewBits - MRI.getType(DstReg).getScalarSizeInBits();
+
+ auto LHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(1));
+ auto RHS = MIRBuilder.buildAnyExt(WideTy, MI.getOperand(2));
+ auto ShiftK = MIRBuilder.buildConstant(WideTy, SHLAmount);
+ auto ShiftL = MIRBuilder.buildShl(WideTy, LHS, ShiftK);
+ auto ShiftR = MIRBuilder.buildShl(WideTy, RHS, ShiftK);
+
+ auto WideInst = MIRBuilder.buildInstr(MI.getOpcode(), {WideTy},
+ {ShiftL, ShiftR}, MI.getFlags());
+
+ // Use a shift that will preserve the number of sign bits when the trunc is
+ // folded away.
+ auto Result = IsSigned ? MIRBuilder.buildAShr(WideTy, WideInst, ShiftK)
+ : MIRBuilder.buildLShr(WideTy, WideInst, ShiftK);
+ MIRBuilder.buildTrunc(DstReg, Result);
+ MI.eraseFromParent();
+ return Legalized;
+}
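+
+// Illustrative MIR (hypothetical types) for the steps above, widening a
+// G_UADDSAT on s8 to s32 (SHLAmount = 24):
+//   %a:_(s32) = G_ANYEXT %lhs(s8)
+//   %b:_(s32) = G_ANYEXT %rhs(s8)
+//   %k:_(s32) = G_CONSTANT i32 24
+//   %sa:_(s32) = G_SHL %a, %k
+//   %sb:_(s32) = G_SHL %b, %k
+//   %w:_(s32) = G_UADDSAT %sa, %sb
+//   %sr:_(s32) = G_LSHR %w, %k
+//   %dst:_(s8) = G_TRUNC %sr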
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
switch (MI.getOpcode()) {
default:
return UnableToLegalize;
@@ -1444,28 +1693,30 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
case TargetOpcode::G_USUBO: {
if (TypeIdx == 1)
return UnableToLegalize; // TODO
- auto LHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, {WideTy},
- {MI.getOperand(2).getReg()});
- auto RHSZext = MIRBuilder.buildInstr(TargetOpcode::G_ZEXT, {WideTy},
- {MI.getOperand(3).getReg()});
+ auto LHSZext = MIRBuilder.buildZExt(WideTy, MI.getOperand(2));
+ auto RHSZext = MIRBuilder.buildZExt(WideTy, MI.getOperand(3));
unsigned Opcode = MI.getOpcode() == TargetOpcode::G_UADDO
? TargetOpcode::G_ADD
: TargetOpcode::G_SUB;
// Do the arithmetic in the larger type.
auto NewOp = MIRBuilder.buildInstr(Opcode, {WideTy}, {LHSZext, RHSZext});
LLT OrigTy = MRI.getType(MI.getOperand(0).getReg());
- APInt Mask = APInt::getAllOnesValue(OrigTy.getSizeInBits());
- auto AndOp = MIRBuilder.buildInstr(
- TargetOpcode::G_AND, {WideTy},
- {NewOp, MIRBuilder.buildConstant(WideTy, Mask.getZExtValue())});
+ APInt Mask =
+ APInt::getLowBitsSet(WideTy.getSizeInBits(), OrigTy.getSizeInBits());
+ auto AndOp = MIRBuilder.buildAnd(
+ WideTy, NewOp, MIRBuilder.buildConstant(WideTy, Mask));
// There is no overflow if the AndOp is the same as NewOp.
- MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1).getReg(), NewOp,
- AndOp);
+ MIRBuilder.buildICmp(CmpInst::ICMP_NE, MI.getOperand(1), NewOp, AndOp);
// Now trunc the NewOp to the original result.
- MIRBuilder.buildTrunc(MI.getOperand(0).getReg(), NewOp);
+ MIRBuilder.buildTrunc(MI.getOperand(0), NewOp);
MI.eraseFromParent();
return Legalized;
}
+ case TargetOpcode::G_SADDSAT:
+ case TargetOpcode::G_SSUBSAT:
+ case TargetOpcode::G_UADDSAT:
+ case TargetOpcode::G_USUBSAT:
+ return widenScalarAddSubSat(MI, TypeIdx, WideTy);
case TargetOpcode::G_CTTZ:
case TargetOpcode::G_CTTZ_ZERO_UNDEF:
case TargetOpcode::G_CTLZ:
@@ -1500,9 +1751,8 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF) {
// The correct result is NewOp - (Difference in widety and current ty).
unsigned SizeDiff = WideTy.getSizeInBits() - CurTy.getSizeInBits();
- MIBNewOp = MIRBuilder.buildInstr(
- TargetOpcode::G_SUB, {WideTy},
- {MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff)});
+ MIBNewOp = MIRBuilder.buildSub(
+ WideTy, MIBNewOp, MIRBuilder.buildConstant(WideTy, SizeDiff));
}
MIRBuilder.buildZExtOrTrunc(MI.getOperand(0), MIBNewOp);
@@ -1525,10 +1775,7 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
LLT Ty = MRI.getType(DstReg);
unsigned DiffBits = WideTy.getScalarSizeInBits() - Ty.getScalarSizeInBits();
MIRBuilder.buildConstant(ShiftAmtReg, DiffBits);
- MIRBuilder.buildInstr(TargetOpcode::G_LSHR)
- .addDef(ShrReg)
- .addUse(DstExt)
- .addUse(ShiftAmtReg);
+ MIRBuilder.buildLShr(ShrReg, DstExt, ShiftAmtReg);
MIRBuilder.buildTrunc(DstReg, ShrReg);
Observer.changedInstr(MI);
@@ -1552,6 +1799,13 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
Observer.changedInstr(MI);
return Legalized;
}
+ case TargetOpcode::G_FREEZE:
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 1, TargetOpcode::G_ANYEXT);
+ widenScalarDst(MI, WideTy);
+ Observer.changedInstr(MI);
+ return Legalized;
+
case TargetOpcode::G_ADD:
case TargetOpcode::G_AND:
case TargetOpcode::G_MUL:
@@ -1844,9 +2098,10 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
// TODO: Probably should be zext
widenScalarSrc(MI, WideTy, 3, TargetOpcode::G_SEXT);
Observer.changedInstr(MI);
+ return Legalized;
}
- return Legalized;
+ return UnableToLegalize;
}
case TargetOpcode::G_FADD:
case TargetOpcode::G_FMUL:
@@ -1932,29 +2187,162 @@ LegalizerHelper::widenScalar(MachineInstr &MI, unsigned TypeIdx, LLT WideTy) {
widenScalarDst(MI, WideTy, 0, TargetOpcode::G_TRUNC);
Observer.changedInstr(MI);
return Legalized;
+ case TargetOpcode::G_PTRMASK: {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+ Observer.changingInstr(MI);
+ widenScalarSrc(MI, WideTy, 2, TargetOpcode::G_ZEXT);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ }
+}
+
+static void getUnmergePieces(SmallVectorImpl<Register> &Pieces,
+ MachineIRBuilder &B, Register Src, LLT Ty) {
+ auto Unmerge = B.buildUnmerge(Ty, Src);
+ for (int I = 0, E = Unmerge->getNumOperands() - 1; I != E; ++I)
+ Pieces.push_back(Unmerge.getReg(I));
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerBitcast(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+
+ if (SrcTy.isVector()) {
+ LLT SrcEltTy = SrcTy.getElementType();
+ SmallVector<Register, 8> SrcRegs;
+
+ if (DstTy.isVector()) {
+ int NumDstElt = DstTy.getNumElements();
+ int NumSrcElt = SrcTy.getNumElements();
+
+ LLT DstEltTy = DstTy.getElementType();
+ LLT DstCastTy = DstEltTy; // Intermediate bitcast result type
+ LLT SrcPartTy = SrcEltTy; // Original unmerge result type.
+
+ // If there's an element size mismatch, insert intermediate casts to match
+ // the result element type.
+ if (NumSrcElt < NumDstElt) { // Source element type is larger.
+ // %1:_(<4 x s8>) = G_BITCAST %0:_(<2 x s16>)
+ //
+ // =>
+ //
+ // %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
+ // %4:_(<2 x s8>) = G_BITCAST %2
+ // %5:_(<2 x s8>) = G_BITCAST %3
+ // %1:_(<4 x s8>) = G_CONCAT_VECTORS %4, %5
+ DstCastTy = LLT::vector(NumDstElt / NumSrcElt, DstEltTy);
+ SrcPartTy = SrcEltTy;
+ } else if (NumSrcElt > NumDstElt) { // Source element type is smaller.
+ //
+ // %1:_(<2 x s16>) = G_BITCAST %0:_(<4 x s8>)
+ //
+ // =>
+ //
+ // %2:_(<2 x s8>), %3:_(<2 x s8>) = G_UNMERGE_VALUES %0
+ // %4:_(s16) = G_BITCAST %2
+ // %5:_(s16) = G_BITCAST %3
+ // %1:_(<2 x s16>) = G_BUILD_VECTOR %4, %5
+ SrcPartTy = LLT::vector(NumSrcElt / NumDstElt, SrcEltTy);
+ DstCastTy = DstEltTy;
+ }
+
+ getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcPartTy);
+ for (Register &SrcReg : SrcRegs)
+ SrcReg = MIRBuilder.buildBitcast(DstCastTy, SrcReg).getReg(0);
+ } else
+ getUnmergePieces(SrcRegs, MIRBuilder, Src, SrcEltTy);
+
+ MIRBuilder.buildMerge(Dst, SrcRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ if (DstTy.isVector()) {
+ SmallVector<Register, 8> SrcRegs;
+ getUnmergePieces(SrcRegs, MIRBuilder, Src, DstTy.getElementType());
+ MIRBuilder.buildMerge(Dst, SrcRegs);
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
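+
+// Illustrative MIR (hypothetical types) for the scalar-to-vector branch of
+// lowerBitcast above:
+//   %1:_(<2 x s16>) = G_BITCAST %0:_(s32)
+// becomes, roughly,
+//   %2:_(s16), %3:_(s16) = G_UNMERGE_VALUES %0
+//   %1:_(<2 x s16>) = G_BUILD_VECTOR %2, %3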
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::bitcast(MachineInstr &MI, unsigned TypeIdx, LLT CastTy) {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_LOAD: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ bitcastDst(MI, CastTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_STORE: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Observer.changingInstr(MI);
+ bitcastSrc(MI, CastTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_SELECT: {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ if (MRI.getType(MI.getOperand(1).getReg()).isVector()) {
+ LLVM_DEBUG(
+ dbgs() << "bitcast action not implemented for vector select\n");
+ return UnableToLegalize;
+ }
+
+ Observer.changingInstr(MI);
+ bitcastSrc(MI, CastTy, 2);
+ bitcastSrc(MI, CastTy, 3);
+ bitcastDst(MI, CastTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_OR:
+ case TargetOpcode::G_XOR: {
+ Observer.changingInstr(MI);
+ bitcastSrc(MI, CastTy, 1);
+ bitcastSrc(MI, CastTy, 2);
+ bitcastDst(MI, CastTy, 0);
+ Observer.changedInstr(MI);
+ return Legalized;
+ }
+ default:
+ return UnableToLegalize;
}
}
LegalizerHelper::LegalizeResult
LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
using namespace TargetOpcode;
- MIRBuilder.setInstr(MI);
switch(MI.getOpcode()) {
default:
return UnableToLegalize;
+ case TargetOpcode::G_BITCAST:
+ return lowerBitcast(MI);
case TargetOpcode::G_SREM:
case TargetOpcode::G_UREM: {
- Register QuotReg = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV)
- .addDef(QuotReg)
- .addUse(MI.getOperand(1).getReg())
- .addUse(MI.getOperand(2).getReg());
-
- Register ProdReg = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildMul(ProdReg, QuotReg, MI.getOperand(2).getReg());
- MIRBuilder.buildSub(MI.getOperand(0).getReg(), MI.getOperand(1).getReg(),
- ProdReg);
+ auto Quot =
+ MIRBuilder.buildInstr(MI.getOpcode() == G_SREM ? G_SDIV : G_UDIV, {Ty},
+ {MI.getOperand(1), MI.getOperand(2)});
+
+ auto Prod = MIRBuilder.buildMul(Ty, Quot, MI.getOperand(2));
+ MIRBuilder.buildSub(MI.getOperand(0), MI.getOperand(1), Prod);
MI.eraseFromParent();
return Legalized;
}
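  // Illustrative MIR (hypothetical s32 operands) for the G_SREM lowering
  // above; G_UREM is analogous with G_UDIV:
  //   %q:_(s32) = G_SDIV %a, %b
  //   %p:_(s32) = G_MUL %q, %b
  //   %r:_(s32) = G_SUB %a, %p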
@@ -1970,36 +2358,30 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
Register LHS = MI.getOperand(2).getReg();
Register RHS = MI.getOperand(3).getReg();
- MIRBuilder.buildMul(Res, LHS, RHS);
-
unsigned Opcode = MI.getOpcode() == TargetOpcode::G_SMULO
? TargetOpcode::G_SMULH
: TargetOpcode::G_UMULH;
- Register HiPart = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildInstr(Opcode)
- .addDef(HiPart)
- .addUse(LHS)
- .addUse(RHS);
+ Observer.changingInstr(MI);
+ const auto &TII = MIRBuilder.getTII();
+ MI.setDesc(TII.get(TargetOpcode::G_MUL));
+ MI.RemoveOperand(1);
+ Observer.changedInstr(MI);
- Register Zero = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildConstant(Zero, 0);
+ MIRBuilder.setInsertPt(MIRBuilder.getMBB(), ++MIRBuilder.getInsertPt());
+
+ auto HiPart = MIRBuilder.buildInstr(Opcode, {Ty}, {LHS, RHS});
+ auto Zero = MIRBuilder.buildConstant(Ty, 0);
// For *signed* multiply, overflow is detected by checking:
// (hi != (lo >> bitwidth-1))
if (Opcode == TargetOpcode::G_SMULH) {
- Register Shifted = MRI.createGenericVirtualRegister(Ty);
- Register ShiftAmt = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildConstant(ShiftAmt, Ty.getSizeInBits() - 1);
- MIRBuilder.buildInstr(TargetOpcode::G_ASHR)
- .addDef(Shifted)
- .addUse(Res)
- .addUse(ShiftAmt);
+ auto ShiftAmt = MIRBuilder.buildConstant(Ty, Ty.getSizeInBits() - 1);
+ auto Shifted = MIRBuilder.buildAShr(Ty, Res, ShiftAmt);
MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Shifted);
} else {
MIRBuilder.buildICmp(CmpInst::ICMP_NE, Overflow, HiPart, Zero);
}
- MI.eraseFromParent();
return Legalized;
}
case TargetOpcode::G_FNEG: {
@@ -2008,31 +2390,16 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
if (Ty.isVector())
return UnableToLegalize;
Register Res = MI.getOperand(0).getReg();
- Type *ZeroTy;
LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
- switch (Ty.getSizeInBits()) {
- case 16:
- ZeroTy = Type::getHalfTy(Ctx);
- break;
- case 32:
- ZeroTy = Type::getFloatTy(Ctx);
- break;
- case 64:
- ZeroTy = Type::getDoubleTy(Ctx);
- break;
- case 128:
- ZeroTy = Type::getFP128Ty(Ctx);
- break;
- default:
- llvm_unreachable("unexpected floating-point type");
- }
+ Type *ZeroTy = getFloatTypeForLLT(Ctx, Ty);
+ if (!ZeroTy)
+ return UnableToLegalize;
ConstantFP &ZeroForNegation =
*cast<ConstantFP>(ConstantFP::getZeroValueForNegation(ZeroTy));
auto Zero = MIRBuilder.buildFConstant(Ty, ZeroForNegation);
Register SubByReg = MI.getOperand(1).getReg();
- Register ZeroReg = Zero->getOperand(0).getReg();
- MIRBuilder.buildInstr(TargetOpcode::G_FSUB, {Res}, {ZeroReg, SubByReg},
- MI.getFlags());
+ Register ZeroReg = Zero.getReg(0);
+ MIRBuilder.buildFSub(Res, ZeroReg, SubByReg, MI.getFlags());
MI.eraseFromParent();
return Legalized;
}
@@ -2046,13 +2413,15 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
Register Neg = MRI.createGenericVirtualRegister(Ty);
- MIRBuilder.buildInstr(TargetOpcode::G_FNEG).addDef(Neg).addUse(RHS);
- MIRBuilder.buildInstr(TargetOpcode::G_FADD, {Res}, {LHS, Neg}, MI.getFlags());
+ MIRBuilder.buildFNeg(Neg, RHS);
+ MIRBuilder.buildFAdd(Res, LHS, Neg, MI.getFlags());
MI.eraseFromParent();
return Legalized;
}
case TargetOpcode::G_FMAD:
return lowerFMad(MI);
+ case TargetOpcode::G_FFLOOR:
+ return lowerFFloor(MI);
case TargetOpcode::G_INTRINSIC_ROUND:
return lowerIntrinsicRound(MI);
case TargetOpcode::G_ATOMIC_CMPXCHG_WITH_SUCCESS: {
@@ -2089,7 +2458,7 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
// result values together, before truncating back down to the non-pow-2
// type.
// E.g. v1 = i24 load =>
- // v2 = i32 load (2 byte)
+ // v2 = i32 zextload (2 byte)
// v3 = i32 load (1 byte)
// v4 = i32 shl v3, 16
// v5 = i32 or v4, v2
@@ -2110,11 +2479,11 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
LLT AnyExtTy = LLT::scalar(AnyExtSize);
Register LargeLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
Register SmallLdReg = MRI.createGenericVirtualRegister(AnyExtTy);
- auto LargeLoad =
- MIRBuilder.buildLoad(LargeLdReg, PtrReg, *LargeMMO);
+ auto LargeLoad = MIRBuilder.buildLoadInstr(
+ TargetOpcode::G_ZEXTLOAD, LargeLdReg, PtrReg, *LargeMMO);
- auto OffsetCst =
- MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+ auto OffsetCst = MIRBuilder.buildConstant(
+ LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
auto SmallPtr =
MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst.getReg(0));
@@ -2186,8 +2555,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
// Generate the PtrAdd and truncating stores.
LLT PtrTy = MRI.getType(PtrReg);
- auto OffsetCst =
- MIRBuilder.buildConstant(LLT::scalar(64), LargeSplitSize / 8);
+ auto OffsetCst = MIRBuilder.buildConstant(
+ LLT::scalar(PtrTy.getSizeInBits()), LargeSplitSize / 8);
Register PtrAddReg = MRI.createGenericVirtualRegister(PtrTy);
auto SmallPtr =
MIRBuilder.buildPtrAdd(PtrAddReg, PtrReg, OffsetCst.getReg(0));
@@ -2226,12 +2595,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
Register LHS = MI.getOperand(2).getReg();
Register RHS = MI.getOperand(3).getReg();
Register CarryIn = MI.getOperand(4).getReg();
+ LLT Ty = MRI.getType(Res);
- Register TmpRes = MRI.createGenericVirtualRegister(Ty);
- Register ZExtCarryIn = MRI.createGenericVirtualRegister(Ty);
-
- MIRBuilder.buildAdd(TmpRes, LHS, RHS);
- MIRBuilder.buildZExt(ZExtCarryIn, CarryIn);
+ auto TmpRes = MIRBuilder.buildAdd(Ty, LHS, RHS);
+ auto ZExtCarryIn = MIRBuilder.buildZExt(Ty, CarryIn);
MIRBuilder.buildAdd(Res, TmpRes, ZExtCarryIn);
MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CarryOut, Res, LHS);
@@ -2256,17 +2623,15 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
Register LHS = MI.getOperand(2).getReg();
Register RHS = MI.getOperand(3).getReg();
Register BorrowIn = MI.getOperand(4).getReg();
+ const LLT CondTy = MRI.getType(BorrowOut);
+ const LLT Ty = MRI.getType(Res);
- Register TmpRes = MRI.createGenericVirtualRegister(Ty);
- Register ZExtBorrowIn = MRI.createGenericVirtualRegister(Ty);
- Register LHS_EQ_RHS = MRI.createGenericVirtualRegister(LLT::scalar(1));
- Register LHS_ULT_RHS = MRI.createGenericVirtualRegister(LLT::scalar(1));
-
- MIRBuilder.buildSub(TmpRes, LHS, RHS);
- MIRBuilder.buildZExt(ZExtBorrowIn, BorrowIn);
+ auto TmpRes = MIRBuilder.buildSub(Ty, LHS, RHS);
+ auto ZExtBorrowIn = MIRBuilder.buildZExt(Ty, BorrowIn);
MIRBuilder.buildSub(Res, TmpRes, ZExtBorrowIn);
- MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LHS_EQ_RHS, LHS, RHS);
- MIRBuilder.buildICmp(CmpInst::ICMP_ULT, LHS_ULT_RHS, LHS, RHS);
+
+ auto LHS_EQ_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, CondTy, LHS, RHS);
+ auto LHS_ULT_RHS = MIRBuilder.buildICmp(CmpInst::ICMP_ULT, CondTy, LHS, RHS);
MIRBuilder.buildSelect(BorrowOut, LHS_EQ_RHS, BorrowIn, LHS_ULT_RHS);
MI.eraseFromParent();
@@ -2278,6 +2643,10 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return lowerSITOFP(MI, TypeIdx, Ty);
case G_FPTOUI:
return lowerFPTOUI(MI, TypeIdx, Ty);
+ case G_FPTOSI:
+ return lowerFPTOSI(MI);
+ case G_FPTRUNC:
+ return lowerFPTRUNC(MI, TypeIdx, Ty);
case G_SMIN:
case G_SMAX:
case G_UMIN:
@@ -2288,6 +2657,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
case G_FMINNUM:
case G_FMAXNUM:
return lowerFMinNumMaxNum(MI);
+ case G_MERGE_VALUES:
+ return lowerMergeValues(MI);
case G_UNMERGE_VALUES:
return lowerUnmergeValues(MI);
case TargetOpcode::G_SEXT_INREG: {
@@ -2300,8 +2671,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
Register TmpRes = MRI.createGenericVirtualRegister(DstTy);
auto MIBSz = MIRBuilder.buildConstant(DstTy, DstTy.getScalarSizeInBits() - SizeInBits);
- MIRBuilder.buildInstr(TargetOpcode::G_SHL, {TmpRes}, {SrcReg, MIBSz->getOperand(0).getReg()});
- MIRBuilder.buildInstr(TargetOpcode::G_ASHR, {DstReg}, {TmpRes, MIBSz->getOperand(0).getReg()});
+ MIRBuilder.buildShl(TmpRes, SrcReg, MIBSz->getOperand(0));
+ MIRBuilder.buildAShr(DstReg, TmpRes, MIBSz->getOperand(0));
MI.eraseFromParent();
return Legalized;
}
@@ -2318,7 +2689,8 @@ LegalizerHelper::lower(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
case G_BITREVERSE:
return lowerBitreverse(MI);
case G_READ_REGISTER:
- return lowerReadRegister(MI);
+ case G_WRITE_REGISTER:
+ return lowerReadWriteRegister(MI);
}
}
@@ -2350,99 +2722,6 @@ LegalizerHelper::LegalizeResult LegalizerHelper::fewerElementsVectorImplicitDef(
return Legalized;
}
-LegalizerHelper::LegalizeResult
-LegalizerHelper::fewerElementsVectorBasic(MachineInstr &MI, unsigned TypeIdx,
- LLT NarrowTy) {
- const unsigned Opc = MI.getOpcode();
- const unsigned NumOps = MI.getNumOperands() - 1;
- const unsigned NarrowSize = NarrowTy.getSizeInBits();
- const Register DstReg = MI.getOperand(0).getReg();
- const unsigned Flags = MI.getFlags();
- const LLT DstTy = MRI.getType(DstReg);
- const unsigned Size = DstTy.getSizeInBits();
- const int NumParts = Size / NarrowSize;
- const LLT EltTy = DstTy.getElementType();
- const unsigned EltSize = EltTy.getSizeInBits();
- const unsigned BitsForNumParts = NarrowSize * NumParts;
-
- // Check if we have any leftovers. If we do, then only handle the case where
- // the leftover is one element.
- if (BitsForNumParts != Size && BitsForNumParts + EltSize != Size)
- return UnableToLegalize;
-
- if (BitsForNumParts != Size) {
- Register AccumDstReg = MRI.createGenericVirtualRegister(DstTy);
- MIRBuilder.buildUndef(AccumDstReg);
-
- // Handle the pieces which evenly divide into the requested type with
- // extract/op/insert sequence.
- for (unsigned Offset = 0; Offset < BitsForNumParts; Offset += NarrowSize) {
- SmallVector<SrcOp, 4> SrcOps;
- for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
- Register PartOpReg = MRI.createGenericVirtualRegister(NarrowTy);
- MIRBuilder.buildExtract(PartOpReg, MI.getOperand(I).getReg(), Offset);
- SrcOps.push_back(PartOpReg);
- }
-
- Register PartDstReg = MRI.createGenericVirtualRegister(NarrowTy);
- MIRBuilder.buildInstr(Opc, {PartDstReg}, SrcOps, Flags);
-
- Register PartInsertReg = MRI.createGenericVirtualRegister(DstTy);
- MIRBuilder.buildInsert(PartInsertReg, AccumDstReg, PartDstReg, Offset);
- AccumDstReg = PartInsertReg;
- }
-
- // Handle the remaining element sized leftover piece.
- SmallVector<SrcOp, 4> SrcOps;
- for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
- Register PartOpReg = MRI.createGenericVirtualRegister(EltTy);
- MIRBuilder.buildExtract(PartOpReg, MI.getOperand(I).getReg(),
- BitsForNumParts);
- SrcOps.push_back(PartOpReg);
- }
-
- Register PartDstReg = MRI.createGenericVirtualRegister(EltTy);
- MIRBuilder.buildInstr(Opc, {PartDstReg}, SrcOps, Flags);
- MIRBuilder.buildInsert(DstReg, AccumDstReg, PartDstReg, BitsForNumParts);
- MI.eraseFromParent();
-
- return Legalized;
- }
-
- SmallVector<Register, 2> DstRegs, Src0Regs, Src1Regs, Src2Regs;
-
- extractParts(MI.getOperand(1).getReg(), NarrowTy, NumParts, Src0Regs);
-
- if (NumOps >= 2)
- extractParts(MI.getOperand(2).getReg(), NarrowTy, NumParts, Src1Regs);
-
- if (NumOps >= 3)
- extractParts(MI.getOperand(3).getReg(), NarrowTy, NumParts, Src2Regs);
-
- for (int i = 0; i < NumParts; ++i) {
- Register DstReg = MRI.createGenericVirtualRegister(NarrowTy);
-
- if (NumOps == 1)
- MIRBuilder.buildInstr(Opc, {DstReg}, {Src0Regs[i]}, Flags);
- else if (NumOps == 2) {
- MIRBuilder.buildInstr(Opc, {DstReg}, {Src0Regs[i], Src1Regs[i]}, Flags);
- } else if (NumOps == 3) {
- MIRBuilder.buildInstr(Opc, {DstReg},
- {Src0Regs[i], Src1Regs[i], Src2Regs[i]}, Flags);
- }
-
- DstRegs.push_back(DstReg);
- }
-
- if (NarrowTy.isVector())
- MIRBuilder.buildConcatVectors(DstReg, DstRegs);
- else
- MIRBuilder.buildBuildVector(DstReg, DstRegs);
-
- MI.eraseFromParent();
- return Legalized;
-}
-
// Handle splitting vector operations which need to have the same number of
// elements in each type index, but each type index may have a different element
// type.
@@ -2482,7 +2761,6 @@ LegalizerHelper::fewerElementsVectorMultiEltType(
SmallVector<Register, 4> PartRegs, LeftoverRegs;
for (unsigned I = 1, E = MI.getNumOperands(); I != E; ++I) {
- LLT LeftoverTy;
Register SrcReg = MI.getOperand(I).getReg();
LLT SrcTyI = MRI.getType(SrcReg);
LLT NarrowTyI = LLT::scalarOrVector(NewNumElts, SrcTyI.getScalarType());
@@ -2571,9 +2849,8 @@ LegalizerHelper::fewerElementsVectorCasts(MachineInstr &MI, unsigned TypeIdx,
for (unsigned I = 0; I < NumParts; ++I) {
Register DstReg = MRI.createGenericVirtualRegister(NarrowTy0);
- MachineInstr *NewInst = MIRBuilder.buildInstr(MI.getOpcode())
- .addDef(DstReg)
- .addUse(SrcRegs[I]);
+ MachineInstr *NewInst =
+ MIRBuilder.buildInstr(MI.getOpcode(), {DstReg}, {SrcRegs[I]});
NewInst->setFlags(MI.getFlags());
DstRegs.push_back(DstReg);
@@ -2913,6 +3190,12 @@ LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx,
Register AddrReg = MI.getOperand(1).getReg();
LLT ValTy = MRI.getType(ValReg);
+ // FIXME: Do we need a distinct NarrowMemory legalize action?
+ if (ValTy.getSizeInBits() != 8 * MMO->getSize()) {
+ LLVM_DEBUG(dbgs() << "Can't narrow extload/truncstore\n");
+ return UnableToLegalize;
+ }
+
int NumParts = -1;
int NumLeftover = -1;
LLT LeftoverTy;
@@ -2981,14 +3264,147 @@ LegalizerHelper::reduceLoadStoreWidth(MachineInstr &MI, unsigned TypeIdx,
}
LegalizerHelper::LegalizeResult
+LegalizerHelper::reduceOperationWidth(MachineInstr &MI, unsigned int TypeIdx,
+ LLT NarrowTy) {
+ assert(TypeIdx == 0 && "only one type index expected");
+
+ const unsigned Opc = MI.getOpcode();
+ const int NumOps = MI.getNumOperands() - 1;
+ const Register DstReg = MI.getOperand(0).getReg();
+ const unsigned Flags = MI.getFlags();
+ const unsigned NarrowSize = NarrowTy.getSizeInBits();
+ const LLT NarrowScalarTy = LLT::scalar(NarrowSize);
+
+ assert(NumOps <= 3 && "expected instruction with 1 result and 1-3 sources");
+
+ // First, check whether we are narrowing (changing the element type) or
+ // reducing the number of vector elements.
+ const LLT DstTy = MRI.getType(DstReg);
+ const bool IsNarrow = NarrowTy.getScalarType() != DstTy.getScalarType();
+
+ SmallVector<Register, 8> ExtractedRegs[3];
+ SmallVector<Register, 8> Parts;
+
+ unsigned NarrowElts = NarrowTy.isVector() ? NarrowTy.getNumElements() : 1;
+
+ // Break down all the sources into NarrowTy pieces we can operate on. This may
+ // involve creating merges to a wider type, padded with undef.
+ for (int I = 0; I != NumOps; ++I) {
+ Register SrcReg = MI.getOperand(I + 1).getReg();
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ // The type to narrow SrcReg to. For narrowing, this is a smaller scalar.
+ // For fewerElements, this is a smaller vector with the same element type.
+ LLT OpNarrowTy;
+ if (IsNarrow) {
+ OpNarrowTy = NarrowScalarTy;
+
+ // In case of narrowing, we need to cast vectors to scalars for this to
+ // work properly
+ // FIXME: Can we do without the bitcast here if we're narrowing?
+ if (SrcTy.isVector()) {
+ SrcTy = LLT::scalar(SrcTy.getSizeInBits());
+ SrcReg = MIRBuilder.buildBitcast(SrcTy, SrcReg).getReg(0);
+ }
+ } else {
+ OpNarrowTy = LLT::scalarOrVector(NarrowElts, SrcTy.getScalarType());
+ }
+
+ LLT GCDTy = extractGCDType(ExtractedRegs[I], SrcTy, OpNarrowTy, SrcReg);
+
+ // Build a sequence of NarrowTy pieces in ExtractedRegs for this operand.
+ buildLCMMergePieces(SrcTy, OpNarrowTy, GCDTy, ExtractedRegs[I],
+ TargetOpcode::G_ANYEXT);
+ }
+
+ SmallVector<Register, 8> ResultRegs;
+
+ // Input operands for each sub-instruction.
+ SmallVector<SrcOp, 4> InputRegs(NumOps, Register());
+
+ int NumParts = ExtractedRegs[0].size();
+ const unsigned DstSize = DstTy.getSizeInBits();
+ const LLT DstScalarTy = LLT::scalar(DstSize);
+
+ // Narrowing needs to use scalar types
+ LLT DstLCMTy, NarrowDstTy;
+ if (IsNarrow) {
+ DstLCMTy = getLCMType(DstScalarTy, NarrowScalarTy);
+ NarrowDstTy = NarrowScalarTy;
+ } else {
+ DstLCMTy = getLCMType(DstTy, NarrowTy);
+ NarrowDstTy = NarrowTy;
+ }
+
+ // We widened the source registers to satisfy merge/unmerge size
+ // constraints. We'll have some extra fully undef parts.
+ const int NumRealParts = (DstSize + NarrowSize - 1) / NarrowSize;
+
+ for (int I = 0; I != NumRealParts; ++I) {
+ // Emit this instruction on each of the split pieces.
+ for (int J = 0; J != NumOps; ++J)
+ InputRegs[J] = ExtractedRegs[J][I];
+
+ auto Inst = MIRBuilder.buildInstr(Opc, {NarrowDstTy}, InputRegs, Flags);
+ ResultRegs.push_back(Inst.getReg(0));
+ }
+
+ // Fill out the widened result with undef instead of creating instructions
+ // with undef inputs.
+ int NumUndefParts = NumParts - NumRealParts;
+ if (NumUndefParts != 0)
+ ResultRegs.append(NumUndefParts,
+ MIRBuilder.buildUndef(NarrowDstTy).getReg(0));
+
+ // Extract the possibly padded result. Use a scratch register if we need to do
+ // a final bitcast, otherwise use the original result register.
+ Register MergeDstReg;
+ if (IsNarrow && DstTy.isVector())
+ MergeDstReg = MRI.createGenericVirtualRegister(DstScalarTy);
+ else
+ MergeDstReg = DstReg;
+
+ buildWidenedRemergeToDst(MergeDstReg, DstLCMTy, ResultRegs);
+
+ // Recast to vector if we narrowed a vector
+ if (IsNarrow && DstTy.isVector())
+ MIRBuilder.buildBitcast(DstReg, MergeDstReg);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::fewerElementsVectorSextInReg(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ int64_t Imm = MI.getOperand(2).getImm();
+
+ LLT DstTy = MRI.getType(DstReg);
+
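+ // Split the vector into NarrowTy pieces, apply G_SEXT_INREG to each piece,
+ // and remerge the pieces into the destination register.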
+ SmallVector<Register, 8> Parts;
+ LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
+ LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts);
+
+ for (Register &R : Parts)
+ R = MIRBuilder.buildSExtInReg(NarrowTy, R, Imm).getReg(0);
+
+ buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
LLT NarrowTy) {
using namespace TargetOpcode;
- MIRBuilder.setInstr(MI);
switch (MI.getOpcode()) {
case G_IMPLICIT_DEF:
return fewerElementsVectorImplicitDef(MI, TypeIdx, NarrowTy);
+ case G_TRUNC:
case G_AND:
case G_OR:
case G_XOR:
@@ -3038,7 +3454,14 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_FMAXNUM_IEEE:
case G_FMINIMUM:
case G_FMAXIMUM:
- return fewerElementsVectorBasic(MI, TypeIdx, NarrowTy);
+ case G_FSHL:
+ case G_FSHR:
+ case G_FREEZE:
+ case G_SADDSAT:
+ case G_SSUBSAT:
+ case G_UADDSAT:
+ case G_USUBSAT:
+ return reduceOperationWidth(MI, TypeIdx, NarrowTy);
case G_SHL:
case G_LSHR:
case G_ASHR:
@@ -3076,6 +3499,8 @@ LegalizerHelper::fewerElementsVector(MachineInstr &MI, unsigned TypeIdx,
case G_LOAD:
case G_STORE:
return reduceLoadStoreWidth(MI, TypeIdx, NarrowTy);
+ case G_SEXT_INREG:
+ return fewerElementsVectorSextInReg(MI, TypeIdx, NarrowTy);
default:
return UnableToLegalize;
}
@@ -3087,10 +3512,10 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
Register InL = MRI.createGenericVirtualRegister(HalfTy);
Register InH = MRI.createGenericVirtualRegister(HalfTy);
- MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1).getReg());
+ MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
if (Amt.isNullValue()) {
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {InL, InH});
+ MIRBuilder.buildMerge(MI.getOperand(0), {InL, InH});
MI.eraseFromParent();
return Legalized;
}
@@ -3163,7 +3588,7 @@ LegalizerHelper::narrowScalarShiftByConstant(MachineInstr &MI, const APInt &Amt,
}
}
- MIRBuilder.buildMerge(MI.getOperand(0).getReg(), {Lo.getReg(), Hi.getReg()});
+ MIRBuilder.buildMerge(MI.getOperand(0), {Lo, Hi});
MI.eraseFromParent();
return Legalized;
@@ -3211,7 +3636,7 @@ LegalizerHelper::narrowScalarShift(MachineInstr &MI, unsigned TypeIdx,
Register InL = MRI.createGenericVirtualRegister(HalfTy);
Register InH = MRI.createGenericVirtualRegister(HalfTy);
- MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1).getReg());
+ MIRBuilder.buildUnmerge({InL, InH}, MI.getOperand(1));
auto AmtExcess = MIRBuilder.buildSub(ShiftAmtTy, Amt, NewBits);
auto AmtLack = MIRBuilder.buildSub(ShiftAmtTy, NewBits, Amt);
@@ -3302,7 +3727,6 @@ LegalizerHelper::moreElementsVectorPhi(MachineInstr &MI, unsigned TypeIdx,
LegalizerHelper::LegalizeResult
LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
LLT MoreTy) {
- MIRBuilder.setInstr(MI);
unsigned Opc = MI.getOpcode();
switch (Opc) {
case TargetOpcode::G_IMPLICIT_DEF:
@@ -3349,6 +3773,7 @@ LegalizerHelper::moreElementsVector(MachineInstr &MI, unsigned TypeIdx,
Observer.changedInstr(MI);
return Legalized;
case TargetOpcode::G_INSERT:
+ case TargetOpcode::G_FREEZE:
if (TypeIdx != 0)
return UnableToLegalize;
Observer.changingInstr(MI);
@@ -3479,10 +3904,10 @@ LegalizerHelper::narrowScalarMul(MachineInstr &MI, LLT NarrowTy) {
bool IsMulHigh = MI.getOpcode() == TargetOpcode::G_UMULH;
unsigned DstTmpParts = NumDstParts * (IsMulHigh ? 2 : 1);
- SmallVector<Register, 2> Src1Parts, Src2Parts, DstTmpRegs;
+ SmallVector<Register, 2> Src1Parts, Src2Parts;
+ SmallVector<Register, 2> DstTmpRegs(DstTmpParts);
extractParts(Src1, NarrowTy, NumSrcParts, Src1Parts);
extractParts(Src2, NarrowTy, NumSrcParts, Src2Parts);
- DstTmpRegs.resize(DstTmpParts);
multiplyRegisters(DstTmpRegs, Src1Parts, Src2Parts, NarrowTy);
// Take only high half of registers if this is high mul.
@@ -3550,10 +3975,12 @@ LegalizerHelper::narrowScalarExtract(MachineInstr &MI, unsigned TypeIdx,
}
Register DstReg = MI.getOperand(0).getReg();
- if(MRI.getType(DstReg).isVector())
+ if (MRI.getType(DstReg).isVector())
MIRBuilder.buildBuildVector(DstReg, DstRegs);
- else
+ else if (DstRegs.size() > 1)
MIRBuilder.buildMerge(DstReg, DstRegs);
+ else
+ MIRBuilder.buildCopy(DstReg, DstRegs[0]);
MI.eraseFromParent();
return Legalized;
}
@@ -3657,14 +4084,14 @@ LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
auto Inst = MIRBuilder.buildInstr(MI.getOpcode(), {NarrowTy},
{Src0Regs[I], Src1Regs[I]});
- DstRegs.push_back(Inst->getOperand(0).getReg());
+ DstRegs.push_back(Inst.getReg(0));
}
for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
auto Inst = MIRBuilder.buildInstr(
MI.getOpcode(),
{LeftoverTy}, {Src0LeftoverRegs[I], Src1LeftoverRegs[I]});
- DstLeftoverRegs.push_back(Inst->getOperand(0).getReg());
+ DstLeftoverRegs.push_back(Inst.getReg(0));
}
insertParts(DstReg, DstTy, NarrowTy, DstRegs,
@@ -3675,6 +4102,28 @@ LegalizerHelper::narrowScalarBasic(MachineInstr &MI, unsigned TypeIdx,
}
LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarExt(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 0)
+ return UnableToLegalize;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ LLT DstTy = MRI.getType(DstReg);
+ if (DstTy.isVector())
+ return UnableToLegalize;
+
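+ // Break the source into pieces, let buildLCMMergePieces pad the result out to
+ // the destination width using the extension opcode, and remerge the pieces
+ // into the destination register.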
+ SmallVector<Register, 8> Parts;
+ LLT GCDTy = extractGCDType(Parts, DstTy, NarrowTy, SrcReg);
+ LLT LCMTy = buildLCMMergePieces(DstTy, NarrowTy, GCDTy, Parts, MI.getOpcode());
+ buildWidenedRemergeToDst(DstReg, LCMTy, Parts);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
LLT NarrowTy) {
if (TypeIdx != 0)
@@ -3704,13 +4153,13 @@ LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
for (unsigned I = 0, E = Src1Regs.size(); I != E; ++I) {
auto Select = MIRBuilder.buildSelect(NarrowTy,
CondReg, Src1Regs[I], Src2Regs[I]);
- DstRegs.push_back(Select->getOperand(0).getReg());
+ DstRegs.push_back(Select.getReg(0));
}
for (unsigned I = 0, E = Src1LeftoverRegs.size(); I != E; ++I) {
auto Select = MIRBuilder.buildSelect(
LeftoverTy, CondReg, Src1LeftoverRegs[I], Src2LeftoverRegs[I]);
- DstLeftoverRegs.push_back(Select->getOperand(0).getReg());
+ DstLeftoverRegs.push_back(Select.getReg(0));
}
insertParts(DstReg, DstTy, NarrowTy, DstRegs,
@@ -3721,6 +4170,103 @@ LegalizerHelper::narrowScalarSelect(MachineInstr &MI, unsigned TypeIdx,
}
LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTLZ(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+ if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+ const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTLZ_ZERO_UNDEF;
+
+ MachineIRBuilder &B = MIRBuilder;
+ auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
+ // ctlz(Hi:Lo) -> Hi == 0 ? (NarrowSize + ctlz(Lo)) : ctlz(Hi)
+ auto C_0 = B.buildConstant(NarrowTy, 0);
+ auto HiIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
+ UnmergeSrc.getReg(1), C_0);
+ auto LoCTLZ = IsUndef ?
+ B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0)) :
+ B.buildCTLZ(DstTy, UnmergeSrc.getReg(0));
+ auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
+ auto HiIsZeroCTLZ = B.buildAdd(DstTy, LoCTLZ, C_NarrowSize);
+ auto HiCTLZ = B.buildCTLZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1));
+ B.buildSelect(DstReg, HiIsZero, HiIsZeroCTLZ, HiCTLZ);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTTZ(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+ if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+ const bool IsUndef = MI.getOpcode() == TargetOpcode::G_CTTZ_ZERO_UNDEF;
+
+ MachineIRBuilder &B = MIRBuilder;
+ auto UnmergeSrc = B.buildUnmerge(NarrowTy, SrcReg);
+ // cttz(Hi:Lo) -> Lo == 0 ? (cttz(Hi) + NarrowSize) : cttz(Lo)
+ auto C_0 = B.buildConstant(NarrowTy, 0);
+ auto LoIsZero = B.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
+ UnmergeSrc.getReg(0), C_0);
+ auto HiCTTZ = IsUndef ?
+ B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(1)) :
+ B.buildCTTZ(DstTy, UnmergeSrc.getReg(1));
+ auto C_NarrowSize = B.buildConstant(DstTy, NarrowSize);
+ auto LoIsZeroCTTZ = B.buildAdd(DstTy, HiCTTZ, C_NarrowSize);
+ auto LoCTTZ = B.buildCTTZ_ZERO_UNDEF(DstTy, UnmergeSrc.getReg(0));
+ B.buildSelect(DstReg, LoIsZero, LoIsZeroCTTZ, LoCTTZ);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::narrowScalarCTPOP(MachineInstr &MI, unsigned TypeIdx,
+ LLT NarrowTy) {
+ if (TypeIdx != 1)
+ return UnableToLegalize;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
+ unsigned NarrowSize = NarrowTy.getSizeInBits();
+
+ if (SrcTy.isScalar() && SrcTy.getSizeInBits() == 2 * NarrowSize) {
+ auto UnmergeSrc = MIRBuilder.buildUnmerge(NarrowTy, MI.getOperand(1));
+
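+ // ctpop(Hi:Lo) -> ctpop(Hi) + ctpop(Lo)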
+ auto LoCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(0));
+ auto HiCTPOP = MIRBuilder.buildCTPOP(DstTy, UnmergeSrc.getReg(1));
+ MIRBuilder.buildAdd(DstReg, HiCTPOP, LoCTPOP);
+
+ MI.eraseFromParent();
+ return Legalized;
+ }
+
+ return UnableToLegalize;
+}
+
+LegalizerHelper::LegalizeResult
LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
unsigned Opc = MI.getOpcode();
auto &TII = *MI.getMF()->getSubtarget().getInstrInfo();
@@ -3739,18 +4285,20 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return Legalized;
}
case TargetOpcode::G_CTLZ: {
+ Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
- unsigned Len = Ty.getSizeInBits();
- if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {Ty, Ty}})) {
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+ unsigned Len = SrcTy.getSizeInBits();
+
+ if (isSupported({TargetOpcode::G_CTLZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
// If CTLZ_ZERO_UNDEF is supported, emit that and a select for zero.
- auto MIBCtlzZU = MIRBuilder.buildInstr(TargetOpcode::G_CTLZ_ZERO_UNDEF,
- {Ty}, {SrcReg});
- auto MIBZero = MIRBuilder.buildConstant(Ty, 0);
- auto MIBLen = MIRBuilder.buildConstant(Ty, Len);
- auto MIBICmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
- SrcReg, MIBZero);
- MIRBuilder.buildSelect(MI.getOperand(0).getReg(), MIBICmp, MIBLen,
- MIBCtlzZU);
+ auto CtlzZU = MIRBuilder.buildCTLZ_ZERO_UNDEF(DstTy, SrcReg);
+ auto ZeroSrc = MIRBuilder.buildConstant(SrcTy, 0);
+ auto ICmp = MIRBuilder.buildICmp(
+ CmpInst::ICMP_EQ, SrcTy.changeElementSize(1), SrcReg, ZeroSrc);
+ auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
+ MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CtlzZU);
MI.eraseFromParent();
return Legalized;
}
@@ -3768,16 +4316,14 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
Register Op = SrcReg;
unsigned NewLen = PowerOf2Ceil(Len);
for (unsigned i = 0; (1U << i) <= (NewLen / 2); ++i) {
- auto MIBShiftAmt = MIRBuilder.buildConstant(Ty, 1ULL << i);
- auto MIBOp = MIRBuilder.buildInstr(
- TargetOpcode::G_OR, {Ty},
- {Op, MIRBuilder.buildInstr(TargetOpcode::G_LSHR, {Ty},
- {Op, MIBShiftAmt})});
- Op = MIBOp->getOperand(0).getReg();
+ auto MIBShiftAmt = MIRBuilder.buildConstant(SrcTy, 1ULL << i);
+ auto MIBOp = MIRBuilder.buildOr(
+ SrcTy, Op, MIRBuilder.buildLShr(SrcTy, Op, MIBShiftAmt));
+ Op = MIBOp.getReg(0);
}
- auto MIBPop = MIRBuilder.buildInstr(TargetOpcode::G_CTPOP, {Ty}, {Op});
- MIRBuilder.buildInstr(TargetOpcode::G_SUB, {MI.getOperand(0).getReg()},
- {MIRBuilder.buildConstant(Ty, Len), MIBPop});
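+ // After the shift/or sequence every bit below the leading one is set, so
+ // ctlz(x) = Len - ctpop(x).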
+ auto MIBPop = MIRBuilder.buildCTPOP(DstTy, Op);
+ MIRBuilder.buildSub(MI.getOperand(0), MIRBuilder.buildConstant(DstTy, Len),
+ MIBPop);
MI.eraseFromParent();
return Legalized;
}
@@ -3789,19 +4335,21 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return Legalized;
}
case TargetOpcode::G_CTTZ: {
+ Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
- unsigned Len = Ty.getSizeInBits();
- if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {Ty, Ty}})) {
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ unsigned Len = SrcTy.getSizeInBits();
+ if (isSupported({TargetOpcode::G_CTTZ_ZERO_UNDEF, {DstTy, SrcTy}})) {
// If CTTZ_ZERO_UNDEF is legal or custom, emit that and a select with
// zero.
- auto MIBCttzZU = MIRBuilder.buildInstr(TargetOpcode::G_CTTZ_ZERO_UNDEF,
- {Ty}, {SrcReg});
- auto MIBZero = MIRBuilder.buildConstant(Ty, 0);
- auto MIBLen = MIRBuilder.buildConstant(Ty, Len);
- auto MIBICmp = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, LLT::scalar(1),
- SrcReg, MIBZero);
- MIRBuilder.buildSelect(MI.getOperand(0).getReg(), MIBICmp, MIBLen,
- MIBCttzZU);
+ auto CttzZU = MIRBuilder.buildCTTZ_ZERO_UNDEF(DstTy, SrcReg);
+ auto Zero = MIRBuilder.buildConstant(SrcTy, 0);
+ auto ICmp = MIRBuilder.buildICmp(
+ CmpInst::ICMP_EQ, DstTy.changeElementSize(1), SrcReg, Zero);
+ auto LenConst = MIRBuilder.buildConstant(DstTy, Len);
+ MIRBuilder.buildSelect(DstReg, ICmp, LenConst, CttzZU);
MI.eraseFromParent();
return Legalized;
}
@@ -3810,24 +4358,70 @@ LegalizerHelper::lowerBitCount(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
// { return 32 - nlz(~x & (x-1)); }
// Ref: "Hacker's Delight" by Henry Warren
auto MIBCstNeg1 = MIRBuilder.buildConstant(Ty, -1);
- auto MIBNot =
- MIRBuilder.buildInstr(TargetOpcode::G_XOR, {Ty}, {SrcReg, MIBCstNeg1});
- auto MIBTmp = MIRBuilder.buildInstr(
- TargetOpcode::G_AND, {Ty},
- {MIBNot, MIRBuilder.buildInstr(TargetOpcode::G_ADD, {Ty},
- {SrcReg, MIBCstNeg1})});
+ auto MIBNot = MIRBuilder.buildXor(Ty, SrcReg, MIBCstNeg1);
+ auto MIBTmp = MIRBuilder.buildAnd(
+ Ty, MIBNot, MIRBuilder.buildAdd(Ty, SrcReg, MIBCstNeg1));
if (!isSupported({TargetOpcode::G_CTPOP, {Ty, Ty}}) &&
isSupported({TargetOpcode::G_CTLZ, {Ty, Ty}})) {
auto MIBCstLen = MIRBuilder.buildConstant(Ty, Len);
- MIRBuilder.buildInstr(
- TargetOpcode::G_SUB, {MI.getOperand(0).getReg()},
- {MIBCstLen,
- MIRBuilder.buildInstr(TargetOpcode::G_CTLZ, {Ty}, {MIBTmp})});
+ MIRBuilder.buildSub(MI.getOperand(0), MIBCstLen,
+ MIRBuilder.buildCTLZ(Ty, MIBTmp));
MI.eraseFromParent();
return Legalized;
}
MI.setDesc(TII.get(TargetOpcode::G_CTPOP));
- MI.getOperand(1).setReg(MIBTmp->getOperand(0).getReg());
+ MI.getOperand(1).setReg(MIBTmp.getReg(0));
+ return Legalized;
+ }
+ case TargetOpcode::G_CTPOP: {
+ unsigned Size = Ty.getSizeInBits();
+ MachineIRBuilder &B = MIRBuilder;
+
+ // Count set bits in blocks of 2 bits. The default approach would be
+ // B2Count = { val & 0x55555555 } + { (val >> 1) & 0x55555555 }
+ // We use the following formula instead:
+ // B2Count = val - { (val >> 1) & 0x55555555 }
+ // since it gives the same result for 2-bit blocks using one instruction less.
+ auto C_1 = B.buildConstant(Ty, 1);
+ auto B2Set1LoTo1Hi = B.buildLShr(Ty, MI.getOperand(1).getReg(), C_1);
+ APInt B2Mask1HiTo0 = APInt::getSplat(Size, APInt(8, 0x55));
+ auto C_B2Mask1HiTo0 = B.buildConstant(Ty, B2Mask1HiTo0);
+ auto B2Count1Hi = B.buildAnd(Ty, B2Set1LoTo1Hi, C_B2Mask1HiTo0);
+ auto B2Count = B.buildSub(Ty, MI.getOperand(1).getReg(), B2Count1Hi);
+
+ // To get the count in blocks of 4, add the values from adjacent blocks of 2.
+ // B4Count = { B2Count & 0x33333333 } + { (B2Count >> 2) & 0x33333333 }
+ auto C_2 = B.buildConstant(Ty, 2);
+ auto B4Set2LoTo2Hi = B.buildLShr(Ty, B2Count, C_2);
+ APInt B4Mask2HiTo0 = APInt::getSplat(Size, APInt(8, 0x33));
+ auto C_B4Mask2HiTo0 = B.buildConstant(Ty, B4Mask2HiTo0);
+ auto B4HiB2Count = B.buildAnd(Ty, B4Set2LoTo2Hi, C_B4Mask2HiTo0);
+ auto B4LoB2Count = B.buildAnd(Ty, B2Count, C_B4Mask2HiTo0);
+ auto B4Count = B.buildAdd(Ty, B4HiB2Count, B4LoB2Count);
+
+ // For the count in blocks of 8 bits we don't have to mask the high 4 bits
+ // before the addition, since each count lies in the range {0,...,8} and fits
+ // in 4 bits. After the addition the high 4 bits still hold the count of set
+ // bits in the high 4-bit block; clear them to get the 8-bit result.
+ // B8Count = { B4Count + (B4Count >> 4) } & 0x0F0F0F0F
+ auto C_4 = B.buildConstant(Ty, 4);
+ auto B8HiB4Count = B.buildLShr(Ty, B4Count, C_4);
+ auto B8CountDirty4Hi = B.buildAdd(Ty, B8HiB4Count, B4Count);
+ APInt B8Mask4HiTo0 = APInt::getSplat(Size, APInt(8, 0x0F));
+ auto C_B8Mask4HiTo0 = B.buildConstant(Ty, B8Mask4HiTo0);
+ auto B8Count = B.buildAnd(Ty, B8CountDirty4Hi, C_B8Mask4HiTo0);
+
+ assert(Size <= 128 && "Scalar size is too large for CTPOP lower algorithm");
+ // 8 bits can hold the CTPOP result of a 128-bit integer or smaller. Multiplying
+ // by this bitmask sets the 8 most significant bits of ResTmp to the sum of all
+ // the 8-bit block counts.
+ auto MulMask = B.buildConstant(Ty, APInt::getSplat(Size, APInt(8, 0x01)));
+ auto ResTmp = B.buildMul(Ty, B8Count, MulMask);
+
+ // Shift the count result from the high 8 bits down to the low bits.
+ auto C_SizeM8 = B.buildConstant(Ty, Size - 8);
+ B.buildLShr(MI.getOperand(0).getReg(), ResTmp, C_SizeM8);
+
+ MI.eraseFromParent();
return Legalized;
}
}
@@ -3888,6 +4482,7 @@ LegalizerHelper::lowerU64ToF32BitOps(MachineInstr &MI) {
auto R = MIRBuilder.buildSelect(S32, RCmp, One, Select0);
MIRBuilder.buildAdd(Dst, V, R);
+ MI.eraseFromParent();
return Legalized;
}
@@ -3960,6 +4555,7 @@ LegalizerHelper::lowerSITOFP(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
auto SignNotZero = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, S,
MIRBuilder.buildConstant(S64, 0));
MIRBuilder.buildSelect(Dst, SignNotZero, RNeg, R);
+ MI.eraseFromParent();
return Legalized;
}
@@ -4010,6 +4606,195 @@ LegalizerHelper::lowerFPTOUI(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
return Legalized;
}
+LegalizerHelper::LegalizeResult LegalizerHelper::lowerFPTOSI(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ // FIXME: Only f32 to i64 conversions are supported.
+ if (SrcTy.getScalarType() != S32 || DstTy.getScalarType() != S64)
+ return UnableToLegalize;
+
+ // Expand f32 -> i64 conversion
+ // This algorithm comes from compiler-rt's implementation of fixsfdi:
+ // https://github.com/llvm/llvm-project/blob/master/compiler-rt/lib/builtins/fixsfdi.c
+
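+ // Extract the sign, exponent and mantissa fields, restore the implicit
+ // leading one, shift the mantissa left or right depending on the unbiased
+ // exponent, apply the sign, and clamp results with a negative exponent to
+ // zero.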
+ unsigned SrcEltBits = SrcTy.getScalarSizeInBits();
+
+ auto ExponentMask = MIRBuilder.buildConstant(SrcTy, 0x7F800000);
+ auto ExponentLoBit = MIRBuilder.buildConstant(SrcTy, 23);
+
+ auto AndExpMask = MIRBuilder.buildAnd(SrcTy, Src, ExponentMask);
+ auto ExponentBits = MIRBuilder.buildLShr(SrcTy, AndExpMask, ExponentLoBit);
+
+ auto SignMask = MIRBuilder.buildConstant(SrcTy,
+ APInt::getSignMask(SrcEltBits));
+ auto AndSignMask = MIRBuilder.buildAnd(SrcTy, Src, SignMask);
+ auto SignLowBit = MIRBuilder.buildConstant(SrcTy, SrcEltBits - 1);
+ auto Sign = MIRBuilder.buildAShr(SrcTy, AndSignMask, SignLowBit);
+ Sign = MIRBuilder.buildSExt(DstTy, Sign);
+
+ auto MantissaMask = MIRBuilder.buildConstant(SrcTy, 0x007FFFFF);
+ auto AndMantissaMask = MIRBuilder.buildAnd(SrcTy, Src, MantissaMask);
+ auto K = MIRBuilder.buildConstant(SrcTy, 0x00800000);
+
+ auto R = MIRBuilder.buildOr(SrcTy, AndMantissaMask, K);
+ R = MIRBuilder.buildZExt(DstTy, R);
+
+ auto Bias = MIRBuilder.buildConstant(SrcTy, 127);
+ auto Exponent = MIRBuilder.buildSub(SrcTy, ExponentBits, Bias);
+ auto SubExponent = MIRBuilder.buildSub(SrcTy, Exponent, ExponentLoBit);
+ auto ExponentSub = MIRBuilder.buildSub(SrcTy, ExponentLoBit, Exponent);
+
+ auto Shl = MIRBuilder.buildShl(DstTy, R, SubExponent);
+ auto Srl = MIRBuilder.buildLShr(DstTy, R, ExponentSub);
+
+ const LLT S1 = LLT::scalar(1);
+ auto CmpGt = MIRBuilder.buildICmp(CmpInst::ICMP_SGT,
+ S1, Exponent, ExponentLoBit);
+
+ R = MIRBuilder.buildSelect(DstTy, CmpGt, Shl, Srl);
+
+ auto XorSign = MIRBuilder.buildXor(DstTy, R, Sign);
+ auto Ret = MIRBuilder.buildSub(DstTy, XorSign, Sign);
+
+ auto ZeroSrcTy = MIRBuilder.buildConstant(SrcTy, 0);
+
+ auto ExponentLt0 = MIRBuilder.buildICmp(CmpInst::ICMP_SLT,
+ S1, Exponent, ZeroSrcTy);
+
+ auto ZeroDstTy = MIRBuilder.buildConstant(DstTy, 0);
+ MIRBuilder.buildSelect(Dst, ExponentLt0, ZeroDstTy, Ret);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+// f64 -> f16 conversion using round-to-nearest-even rounding mode.
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFPTRUNC_F64_TO_F16(MachineInstr &MI) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ if (MRI.getType(Src).isVector()) // TODO: Handle vectors directly.
+ return UnableToLegalize;
+
+ const unsigned ExpMask = 0x7ff;
+ const unsigned ExpBiasf64 = 1023;
+ const unsigned ExpBiasf16 = 15;
+ const LLT S32 = LLT::scalar(32);
+ const LLT S1 = LLT::scalar(1);
+
+ auto Unmerge = MIRBuilder.buildUnmerge(S32, Src);
+ Register U = Unmerge.getReg(0);
+ Register UH = Unmerge.getReg(1);
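+ // U holds the low 32 bits of the f64 value and UH the high 32 bits (sign,
+ // exponent and the upper part of the mantissa).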
+
+ auto E = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 20));
+ E = MIRBuilder.buildAnd(S32, E, MIRBuilder.buildConstant(S32, ExpMask));
+
+ // Subtract the fp64 exponent bias (1023) to get the real exponent and
+ // add the f16 bias (15) to get the biased exponent for the f16 format.
+ E = MIRBuilder.buildAdd(
+ S32, E, MIRBuilder.buildConstant(S32, -ExpBiasf64 + ExpBiasf16));
+
+ auto M = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 8));
+ M = MIRBuilder.buildAnd(S32, M, MIRBuilder.buildConstant(S32, 0xffe));
+
+ auto MaskedSig = MIRBuilder.buildAnd(S32, UH,
+ MIRBuilder.buildConstant(S32, 0x1ff));
+ MaskedSig = MIRBuilder.buildOr(S32, MaskedSig, U);
+
+ auto Zero = MIRBuilder.buildConstant(S32, 0);
+ auto SigCmpNE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, MaskedSig, Zero);
+ auto Lo40Set = MIRBuilder.buildZExt(S32, SigCmpNE0);
+ M = MIRBuilder.buildOr(S32, M, Lo40Set);
+
+ // (M != 0 ? 0x0200 : 0) | 0x7c00;
+ auto Bits0x200 = MIRBuilder.buildConstant(S32, 0x0200);
+ auto CmpM_NE0 = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1, M, Zero);
+ auto SelectCC = MIRBuilder.buildSelect(S32, CmpM_NE0, Bits0x200, Zero);
+
+ auto Bits0x7c00 = MIRBuilder.buildConstant(S32, 0x7c00);
+ auto I = MIRBuilder.buildOr(S32, SelectCC, Bits0x7c00);
+
+ // N = M | (E << 12);
+ auto EShl12 = MIRBuilder.buildShl(S32, E, MIRBuilder.buildConstant(S32, 12));
+ auto N = MIRBuilder.buildOr(S32, M, EShl12);
+
+ // B = clamp(1-E, 0, 13);
+ auto One = MIRBuilder.buildConstant(S32, 1);
+ auto OneSubExp = MIRBuilder.buildSub(S32, One, E);
+ auto B = MIRBuilder.buildSMax(S32, OneSubExp, Zero);
+ B = MIRBuilder.buildSMin(S32, B, MIRBuilder.buildConstant(S32, 13));
+
+ auto SigSetHigh = MIRBuilder.buildOr(S32, M,
+ MIRBuilder.buildConstant(S32, 0x1000));
+
+ auto D = MIRBuilder.buildLShr(S32, SigSetHigh, B);
+ auto D0 = MIRBuilder.buildShl(S32, D, B);
+
+ auto D0_NE_SigSetHigh = MIRBuilder.buildICmp(CmpInst::ICMP_NE, S1,
+ D0, SigSetHigh);
+ auto D1 = MIRBuilder.buildZExt(S32, D0_NE_SigSetHigh);
+ D = MIRBuilder.buildOr(S32, D, D1);
+
+ auto CmpELtOne = MIRBuilder.buildICmp(CmpInst::ICMP_SLT, S1, E, One);
+ auto V = MIRBuilder.buildSelect(S32, CmpELtOne, D, N);
+
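+ // Round to nearest even based on the two bits being shifted out and the bit
+ // that becomes the new LSB.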
+ auto VLow3 = MIRBuilder.buildAnd(S32, V, MIRBuilder.buildConstant(S32, 7));
+ V = MIRBuilder.buildLShr(S32, V, MIRBuilder.buildConstant(S32, 2));
+
+ auto VLow3Eq3 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1, VLow3,
+ MIRBuilder.buildConstant(S32, 3));
+ auto V0 = MIRBuilder.buildZExt(S32, VLow3Eq3);
+
+ auto VLow3Gt5 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1, VLow3,
+ MIRBuilder.buildConstant(S32, 5));
+ auto V1 = MIRBuilder.buildZExt(S32, VLow3Gt5);
+
+ V1 = MIRBuilder.buildOr(S32, V0, V1);
+ V = MIRBuilder.buildAdd(S32, V, V1);
+
+ auto CmpEGt30 = MIRBuilder.buildICmp(CmpInst::ICMP_SGT, S1,
+ E, MIRBuilder.buildConstant(S32, 30));
+ V = MIRBuilder.buildSelect(S32, CmpEGt30,
+ MIRBuilder.buildConstant(S32, 0x7c00), V);
+
+ auto CmpEGt1039 = MIRBuilder.buildICmp(CmpInst::ICMP_EQ, S1,
+ E, MIRBuilder.buildConstant(S32, 1039));
+ V = MIRBuilder.buildSelect(S32, CmpEGt1039, I, V);
+
+ // Extract the sign bit.
+ auto Sign = MIRBuilder.buildLShr(S32, UH, MIRBuilder.buildConstant(S32, 16));
+ Sign = MIRBuilder.buildAnd(S32, Sign, MIRBuilder.buildConstant(S32, 0x8000));
+
+ // Insert the sign bit.
+ V = MIRBuilder.buildOr(S32, Sign, V);
+
+ MIRBuilder.buildTrunc(Dst, V);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFPTRUNC(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src);
+ const LLT S64 = LLT::scalar(64);
+ const LLT S16 = LLT::scalar(16);
+
+ if (DstTy.getScalarType() == S16 && SrcTy.getScalarType() == S64)
+ return lowerFPTRUNC_F64_TO_F16(MI);
+
+ return UnableToLegalize;
+}
+
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
switch (Opc) {
case TargetOpcode::G_SMIN:
@@ -4063,7 +4848,7 @@ LegalizerHelper::lowerFCopySign(MachineInstr &MI, unsigned TypeIdx, LLT Ty) {
MachineInstr *Or;
if (Src0Ty == Src1Ty) {
- auto And1 = MIRBuilder.buildAnd(Src1Ty, Src0, SignBitMask);
+ auto And1 = MIRBuilder.buildAnd(Src1Ty, Src1, SignBitMask);
Or = MIRBuilder.buildOr(Dst, And0, And1);
} else if (Src0Size > Src1Size) {
auto ShiftAmt = MIRBuilder.buildConstant(Src0Ty, Src0Size - Src1Size);
@@ -4136,6 +4921,39 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerFMad(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
Register DstReg = MI.getOperand(0).getReg();
+ Register X = MI.getOperand(1).getReg();
+ const unsigned Flags = MI.getFlags();
+ const LLT Ty = MRI.getType(DstReg);
+ const LLT CondTy = Ty.changeElementSize(1);
+
+ // round(x) =>
+ // t = trunc(x);
+ // d = fabs(x - t);
+ // o = copysign(1.0f, x);
+ // return t + (d >= 0.5 ? o : 0.0);
+
+ auto T = MIRBuilder.buildIntrinsicTrunc(Ty, X, Flags);
+
+ auto Diff = MIRBuilder.buildFSub(Ty, X, T, Flags);
+ auto AbsDiff = MIRBuilder.buildFAbs(Ty, Diff, Flags);
+ auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
+ auto One = MIRBuilder.buildFConstant(Ty, 1.0);
+ auto Half = MIRBuilder.buildFConstant(Ty, 0.5);
+ auto SignOne = MIRBuilder.buildFCopysign(Ty, One, X);
+
+ auto Cmp = MIRBuilder.buildFCmp(CmpInst::FCMP_OGE, CondTy, AbsDiff, Half,
+ Flags);
+ auto Sel = MIRBuilder.buildSelect(Ty, Cmp, SignOne, Zero, Flags);
+
+ MIRBuilder.buildFAdd(DstReg, T, Sel, Flags);
+
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerFFloor(MachineInstr &MI) {
+ Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
unsigned Flags = MI.getFlags();
LLT Ty = MRI.getType(DstReg);
@@ -4145,8 +4963,8 @@ LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
// if (src < 0.0 && src != result)
// result += -1.0.
- auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
auto Trunc = MIRBuilder.buildIntrinsicTrunc(Ty, SrcReg, Flags);
+ auto Zero = MIRBuilder.buildFConstant(Ty, 0.0);
auto Lt0 = MIRBuilder.buildFCmp(CmpInst::FCMP_OLT, CondTy,
SrcReg, Zero, Flags);
@@ -4155,7 +4973,48 @@ LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
auto And = MIRBuilder.buildAnd(CondTy, Lt0, NeTrunc);
auto AddVal = MIRBuilder.buildSITOFP(Ty, And);
- MIRBuilder.buildFAdd(DstReg, Trunc, AddVal);
+ MIRBuilder.buildFAdd(DstReg, Trunc, AddVal, Flags);
+ MI.eraseFromParent();
+ return Legalized;
+}
+
+LegalizerHelper::LegalizeResult
+LegalizerHelper::lowerMergeValues(MachineInstr &MI) {
+ const unsigned NumOps = MI.getNumOperands();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(Src0Reg);
+ unsigned PartSize = SrcTy.getSizeInBits();
+
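+ // Build the wide result by zero-extending each source part, shifting it into
+ // its bit position and OR-ing it into the accumulated result.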
+ LLT WideTy = LLT::scalar(DstTy.getSizeInBits());
+ Register ResultReg = MIRBuilder.buildZExt(WideTy, Src0Reg).getReg(0);
+
+ for (unsigned I = 2; I != NumOps; ++I) {
+ const unsigned Offset = (I - 1) * PartSize;
+
+ Register SrcReg = MI.getOperand(I).getReg();
+ auto ZextInput = MIRBuilder.buildZExt(WideTy, SrcReg);
+
+ Register NextResult = I + 1 == NumOps && WideTy == DstTy ? DstReg :
+ MRI.createGenericVirtualRegister(WideTy);
+
+ auto ShiftAmt = MIRBuilder.buildConstant(WideTy, Offset);
+ auto Shl = MIRBuilder.buildShl(WideTy, ZextInput, ShiftAmt);
+ MIRBuilder.buildOr(NextResult, ResultReg, Shl);
+ ResultReg = NextResult;
+ }
+
+ if (DstTy.isPointer()) {
+ if (MIRBuilder.getDataLayout().isNonIntegralAddressSpace(
+ DstTy.getAddressSpace())) {
+ LLVM_DEBUG(dbgs() << "Not casting nonintegral address space\n");
+ return UnableToLegalize;
+ }
+
+ MIRBuilder.buildIntToPtr(DstReg, ResultReg);
+ }
+
MI.eraseFromParent();
return Legalized;
}
@@ -4163,34 +5022,31 @@ LegalizerHelper::lowerIntrinsicRound(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerUnmergeValues(MachineInstr &MI) {
const unsigned NumDst = MI.getNumOperands() - 1;
- const Register SrcReg = MI.getOperand(NumDst).getReg();
- LLT SrcTy = MRI.getType(SrcReg);
-
+ Register SrcReg = MI.getOperand(NumDst).getReg();
Register Dst0Reg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst0Reg);
+ if (DstTy.isPointer())
+ return UnableToLegalize; // TODO
+ SrcReg = coerceToScalar(SrcReg);
+ if (!SrcReg)
+ return UnableToLegalize;
// Expand scalarizing unmerge as bitcast to integer and shift.
- if (!DstTy.isVector() && SrcTy.isVector() &&
- SrcTy.getElementType() == DstTy) {
- LLT IntTy = LLT::scalar(SrcTy.getSizeInBits());
- Register Cast = MIRBuilder.buildBitcast(IntTy, SrcReg).getReg(0);
-
- MIRBuilder.buildTrunc(Dst0Reg, Cast);
-
- const unsigned DstSize = DstTy.getSizeInBits();
- unsigned Offset = DstSize;
- for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
- auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
- auto Shift = MIRBuilder.buildLShr(IntTy, Cast, ShiftAmt);
- MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
- }
+ LLT IntTy = MRI.getType(SrcReg);
- MI.eraseFromParent();
- return Legalized;
+ MIRBuilder.buildTrunc(Dst0Reg, SrcReg);
+
+ const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned Offset = DstSize;
+ for (unsigned I = 1; I != NumDst; ++I, Offset += DstSize) {
+ auto ShiftAmt = MIRBuilder.buildConstant(IntTy, Offset);
+ auto Shift = MIRBuilder.buildLShr(IntTy, SrcReg, ShiftAmt);
+ MIRBuilder.buildTrunc(MI.getOperand(I), Shift);
}
- return UnableToLegalize;
+ MI.eraseFromParent();
+ return Legalized;
}
LegalizerHelper::LegalizeResult
@@ -4251,16 +5107,19 @@ LegalizerHelper::lowerShuffleVector(MachineInstr &MI) {
LegalizerHelper::LegalizeResult
LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
+ const auto &MF = *MI.getMF();
+ const auto &TFI = *MF.getSubtarget().getFrameLowering();
+ if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp)
+ return UnableToLegalize;
+
Register Dst = MI.getOperand(0).getReg();
Register AllocSize = MI.getOperand(1).getReg();
- unsigned Align = MI.getOperand(2).getImm();
-
- const auto &MF = *MI.getMF();
- const auto &TLI = *MF.getSubtarget().getTargetLowering();
+ Align Alignment = assumeAligned(MI.getOperand(2).getImm());
LLT PtrTy = MRI.getType(Dst);
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+ const auto &TLI = *MF.getSubtarget().getTargetLowering();
Register SPReg = TLI.getStackPointerRegisterToSaveRestore();
auto SPTmp = MIRBuilder.buildCopy(PtrTy, SPReg);
SPTmp = MIRBuilder.buildCast(IntPtrTy, SPTmp);
@@ -4269,8 +5128,8 @@ LegalizerHelper::lowerDynStackAlloc(MachineInstr &MI) {
// have to generate an extra instruction to negate the alloc and then use
// G_PTR_ADD to add the negative offset.
auto Alloc = MIRBuilder.buildSub(IntPtrTy, SPTmp, AllocSize);
- if (Align) {
- APInt AlignMask(IntPtrTy.getSizeInBits(), Align, true);
+ if (Alignment > Align(1)) {
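+ // Round the decremented stack pointer down to the requested alignment by
+ // clearing its low bits.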
+ APInt AlignMask(IntPtrTy.getSizeInBits(), Alignment.value(), true);
AlignMask.negate();
auto AlignCst = MIRBuilder.buildConstant(IntPtrTy, AlignMask);
Alloc = MIRBuilder.buildAnd(IntPtrTy, Alloc, AlignCst);
@@ -4326,34 +5185,47 @@ LegalizerHelper::LegalizeResult LegalizerHelper::lowerInsert(MachineInstr &MI) {
LLT DstTy = MRI.getType(Src);
LLT InsertTy = MRI.getType(InsertSrc);
- if (InsertTy.isScalar() &&
- (DstTy.isScalar() ||
- (DstTy.isVector() && DstTy.getElementType() == InsertTy))) {
- LLT IntDstTy = DstTy;
- if (!DstTy.isScalar()) {
- IntDstTy = LLT::scalar(DstTy.getSizeInBits());
- Src = MIRBuilder.buildBitcast(IntDstTy, Src).getReg(0);
- }
+ if (InsertTy.isVector() ||
+ (DstTy.isVector() && DstTy.getElementType() != InsertTy))
+ return UnableToLegalize;
- Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
- if (Offset != 0) {
- auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
- ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
- }
+ const DataLayout &DL = MIRBuilder.getDataLayout();
+ if ((DstTy.isPointer() &&
+ DL.isNonIntegralAddressSpace(DstTy.getAddressSpace())) ||
+ (InsertTy.isPointer() &&
+ DL.isNonIntegralAddressSpace(InsertTy.getAddressSpace()))) {
+ LLVM_DEBUG(dbgs() << "Not casting non-integral address space integer\n");
+ return UnableToLegalize;
+ }
- APInt MaskVal = ~APInt::getBitsSet(DstTy.getSizeInBits(), Offset,
- InsertTy.getSizeInBits());
+ LLT IntDstTy = DstTy;
- auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
- auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
- auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
+ if (!DstTy.isScalar()) {
+ IntDstTy = LLT::scalar(DstTy.getSizeInBits());
+ Src = MIRBuilder.buildCast(IntDstTy, Src).getReg(0);
+ }
- MIRBuilder.buildBitcast(Dst, Or);
- MI.eraseFromParent();
- return Legalized;
+ if (!InsertTy.isScalar()) {
+ const LLT IntInsertTy = LLT::scalar(InsertTy.getSizeInBits());
+ InsertSrc = MIRBuilder.buildPtrToInt(IntInsertTy, InsertSrc).getReg(0);
}
- return UnableToLegalize;
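+ // Zero-extend the inserted value, shift it into position, clear the target
+ // bits in the destination with a mask, and OR the two together.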
+ Register ExtInsSrc = MIRBuilder.buildZExt(IntDstTy, InsertSrc).getReg(0);
+ if (Offset != 0) {
+ auto ShiftAmt = MIRBuilder.buildConstant(IntDstTy, Offset);
+ ExtInsSrc = MIRBuilder.buildShl(IntDstTy, ExtInsSrc, ShiftAmt).getReg(0);
+ }
+
+ APInt MaskVal = APInt::getBitsSetWithWrap(
+ DstTy.getSizeInBits(), Offset + InsertTy.getSizeInBits(), Offset);
+
+ auto Mask = MIRBuilder.buildConstant(IntDstTy, MaskVal);
+ auto MaskedSrc = MIRBuilder.buildAnd(IntDstTy, Src, Mask);
+ auto Or = MIRBuilder.buildOr(IntDstTy, MaskedSrc, ExtInsSrc);
+
+ MIRBuilder.buildCast(Dst, Or);
+ MI.eraseFromParent();
+ return Legalized;
}
LegalizerHelper::LegalizeResult
@@ -4397,7 +5269,7 @@ LegalizerHelper::lowerBswap(MachineInstr &MI) {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
const LLT Ty = MRI.getType(Src);
- unsigned SizeInBytes = Ty.getSizeInBytes();
+ unsigned SizeInBytes = (Ty.getScalarSizeInBits() + 7) / 8;
unsigned BaseShiftAmt = (SizeInBytes - 1) * 8;
// Swap most and least significant byte, set remaining bytes in Res to zero.
@@ -4470,20 +5342,29 @@ LegalizerHelper::lowerBitreverse(MachineInstr &MI) {
}
LegalizerHelper::LegalizeResult
-LegalizerHelper::lowerReadRegister(MachineInstr &MI) {
- Register Dst = MI.getOperand(0).getReg();
- const LLT Ty = MRI.getType(Dst);
- const MDString *RegStr = cast<MDString>(
- cast<MDNode>(MI.getOperand(1).getMetadata())->getOperand(0));
-
+LegalizerHelper::lowerReadWriteRegister(MachineInstr &MI) {
MachineFunction &MF = MIRBuilder.getMF();
const TargetSubtargetInfo &STI = MF.getSubtarget();
const TargetLowering *TLI = STI.getTargetLowering();
- Register Reg = TLI->getRegisterByName(RegStr->getString().data(), Ty, MF);
- if (!Reg.isValid())
+
+ bool IsRead = MI.getOpcode() == TargetOpcode::G_READ_REGISTER;
+ int NameOpIdx = IsRead ? 1 : 0;
+ int ValRegIndex = IsRead ? 0 : 1;
+
+ Register ValReg = MI.getOperand(ValRegIndex).getReg();
+ const LLT Ty = MRI.getType(ValReg);
+ const MDString *RegStr = cast<MDString>(
+ cast<MDNode>(MI.getOperand(NameOpIdx).getMetadata())->getOperand(0));
+
+ Register PhysReg = TLI->getRegisterByName(RegStr->getString().data(), Ty, MF);
+ if (!PhysReg.isValid())
return UnableToLegalize;
- MIRBuilder.buildCopy(Dst, Reg);
+ if (IsRead)
+ MIRBuilder.buildCopy(ValReg, PhysReg);
+ else
+ MIRBuilder.buildCopy(PhysReg, ValReg);
+
MI.eraseFromParent();
return Legalized;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
index 02f6b39e0905..4abd0c4df97a 100644
--- a/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/LegalizerInfo.cpp
@@ -59,6 +59,9 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, LegalizeAction Action) {
case MoreElements:
OS << "MoreElements";
break;
+ case Bitcast:
+ OS << "Bitcast";
+ break;
case Lower:
OS << "Lower";
break;
@@ -173,6 +176,9 @@ static bool mutationIsSane(const LegalizeRule &Rule,
return true;
}
+ case Bitcast: {
+ return OldTy != NewTy && OldTy.getSizeInBits() == NewTy.getSizeInBits();
+ }
default:
return true;
}
@@ -500,8 +506,7 @@ LegalizerInfo::getAction(const MachineInstr &MI,
SmallVector<LegalityQuery::MemDesc, 2> MemDescrs;
for (const auto &MMO : MI.memoperands())
MemDescrs.push_back({8 * MMO->getSize() /* in bits */,
- 8 * MMO->getAlignment(),
- MMO->getOrdering()});
+ 8 * MMO->getAlign().value(), MMO->getOrdering()});
return getAction({MI.getOpcode(), Types, MemDescrs});
}
@@ -519,12 +524,6 @@ bool LegalizerInfo::isLegalOrCustom(const MachineInstr &MI,
return Action == Legal || Action == Custom;
}
-bool LegalizerInfo::legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
- return false;
-}
-
LegalizerInfo::SizeAndActionsVec
LegalizerInfo::increaseToLargerTypesAndDecreaseToLargest(
const SizeAndActionsVec &v, LegalizeAction IncreaseAction,
@@ -575,6 +574,7 @@ LegalizerInfo::findAction(const SizeAndActionsVec &Vec, const uint32_t Size) {
LegalizeAction Action = Vec[VecIdx].second;
switch (Action) {
case Legal:
+ case Bitcast:
case Lower:
case Libcall:
case Custom:
@@ -681,12 +681,6 @@ LegalizerInfo::findVectorLegalAction(const InstrAspect &Aspect) const {
IntermediateType.getScalarSizeInBits())};
}
-bool LegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
- return true;
-}
-
unsigned LegalizerInfo::getExtOpcodeForWideningConstant(LLT SmallTy) const {
return SmallTy.isByteSized() ? TargetOpcode::G_SEXT : TargetOpcode::G_ZEXT;
}
diff --git a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
index 1c4a668e5f31..a07416d08614 100644
--- a/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Localizer.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
@@ -40,60 +41,6 @@ void Localizer::init(MachineFunction &MF) {
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(MF.getFunction());
}
-bool Localizer::shouldLocalize(const MachineInstr &MI) {
- // Assuming a spill and reload of a value has a cost of 1 instruction each,
- // this helper function computes the maximum number of uses we should consider
- // for remat. E.g. on arm64 global addresses take 2 insts to materialize. We
- // break even in terms of code size when the original MI has 2 users vs
- // choosing to potentially spill. Any more than 2 users we we have a net code
- // size increase. This doesn't take into account register pressure though.
- auto maxUses = [](unsigned RematCost) {
- // A cost of 1 means remats are basically free.
- if (RematCost == 1)
- return UINT_MAX;
- if (RematCost == 2)
- return 2U;
-
- // Remat is too expensive, only sink if there's one user.
- if (RematCost > 2)
- return 1U;
- llvm_unreachable("Unexpected remat cost");
- };
-
- // Helper to walk through uses and terminate if we've reached a limit. Saves
- // us spending time traversing uses if all we want to know is if it's >= min.
- auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
- unsigned NumUses = 0;
- auto UI = MRI->use_instr_nodbg_begin(Reg), UE = MRI->use_instr_nodbg_end();
- for (; UI != UE && NumUses < MaxUses; ++UI) {
- NumUses++;
- }
- // If we haven't reached the end yet then there are more than MaxUses users.
- return UI == UE;
- };
-
- switch (MI.getOpcode()) {
- default:
- return false;
- // Constants-like instructions should be close to their users.
- // We don't want long live-ranges for them.
- case TargetOpcode::G_CONSTANT:
- case TargetOpcode::G_FCONSTANT:
- case TargetOpcode::G_FRAME_INDEX:
- case TargetOpcode::G_INTTOPTR:
- return true;
- case TargetOpcode::G_GLOBAL_VALUE: {
- unsigned RematCost = TTI->getGISelRematGlobalCost();
- Register Reg = MI.getOperand(0).getReg();
- unsigned MaxUses = maxUses(RematCost);
- if (MaxUses == UINT_MAX)
- return true; // Remats are "free" so always localize.
- bool B = isUsesAtMost(Reg, MaxUses);
- return B;
- }
- }
-}
-
void Localizer::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<TargetTransformInfoWrapperPass>();
getSelectionDAGFallbackAnalysisUsage(AU);
@@ -119,9 +66,10 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
// we only localize instructions in the entry block here. This might change if
// we start doing CSE across blocks.
auto &MBB = MF.front();
+ auto &TL = *MF.getSubtarget().getTargetLowering();
for (auto RI = MBB.rbegin(), RE = MBB.rend(); RI != RE; ++RI) {
MachineInstr &MI = *RI;
- if (!shouldLocalize(MI))
+ if (!TL.shouldLocalize(MI, TTI))
continue;
LLVM_DEBUG(dbgs() << "Should localize: " << MI);
assert(MI.getDesc().getNumDefs() == 1 &&
@@ -138,8 +86,13 @@ bool Localizer::localizeInterBlock(MachineFunction &MF,
LLVM_DEBUG(MachineInstr &MIUse = *MOUse.getParent();
dbgs() << "Checking use: " << MIUse
<< " #Opd: " << MIUse.getOperandNo(&MOUse) << '\n');
- if (isLocalUse(MOUse, MI, InsertMBB))
+ if (isLocalUse(MOUse, MI, InsertMBB)) {
+ // Even if we're in the same block, if the block is very large we could
+ // still have many long live ranges. Try to do intra-block localization
+ // too.
+ LocalizedInstrs.insert(&MI);
continue;
+ }
LLVM_DEBUG(dbgs() << "Fixing non-local use\n");
Changed = true;
auto MBBAndReg = std::make_pair(InsertMBB, Reg);
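Editorial note: the per-opcode test removed above now goes through TargetLowering::shouldLocalize(MI, TTI) (see the call in localizeInterBlock), but the cost model it encodes is unchanged: how many uses are worth re-materializing for is capped by how many instructions the remat takes. Below is a minimal standalone sketch of that trade-off; the function and variable names are illustrative only and not part of the LLVM API.

#include <cassert>
#include <climits>

// Largest number of uses for which localizing (re-materializing next to each
// use) is still no worse in code size than one spill plus one reload, given
// the number of instructions needed to re-materialize the value.
static unsigned maxUsesForRemat(unsigned RematCost) {
  assert(RematCost >= 1 && "a remat always costs at least one instruction");
  if (RematCost == 1)
    return UINT_MAX; // effectively free: always localize
  if (RematCost == 2)
    return 2U;       // break-even against a spill/reload pair
  return 1U;         // too expensive: only sink to a single user
}

int main() { return maxUsesForRemat(2) == 2 ? 0 : 1; }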
diff --git a/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp b/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp
new file mode 100644
index 000000000000..6d606e5550f1
--- /dev/null
+++ b/llvm/lib/CodeGen/GlobalISel/LostDebugLocObserver.cpp
@@ -0,0 +1,113 @@
+//===----- llvm/CodeGen/GlobalISel/LostDebugLocObserver.cpp -----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// Tracks DebugLocs between checkpoints and verifies that they are transferred.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/GlobalISel/LostDebugLocObserver.h"
+
+using namespace llvm;
+
+#define LOC_DEBUG(X) DEBUG_WITH_TYPE(DebugType.str().c_str(), X)
+
+void LostDebugLocObserver::analyzeDebugLocations() {
+ if (LostDebugLocs.empty()) {
+ LOC_DEBUG(dbgs() << ".. No debug info was present\n");
+ return;
+ }
+ if (PotentialMIsForDebugLocs.empty()) {
+ LOC_DEBUG(
+ dbgs() << ".. No instructions to carry debug info (dead code?)\n");
+ return;
+ }
+
+ LOC_DEBUG(dbgs() << ".. Searching " << PotentialMIsForDebugLocs.size()
+ << " instrs for " << LostDebugLocs.size() << " locations\n");
+ SmallPtrSet<MachineInstr *, 4> FoundIn;
+ for (MachineInstr *MI : PotentialMIsForDebugLocs) {
+ if (!MI->getDebugLoc())
+ continue;
+ // Check this first in case there's a matching line-0 location on both input
+ // and output.
+ if (MI->getDebugLoc().getLine() == 0) {
+ LOC_DEBUG(
+ dbgs() << ".. Assuming line-0 location covers remainder (if any)\n");
+ return;
+ }
+ if (LostDebugLocs.erase(MI->getDebugLoc())) {
+ LOC_DEBUG(dbgs() << ".. .. found " << MI->getDebugLoc() << " in " << *MI);
+ FoundIn.insert(MI);
+ continue;
+ }
+ }
+ if (LostDebugLocs.empty())
+ return;
+
+ NumLostDebugLocs += LostDebugLocs.size();
+ LOC_DEBUG({
+ dbgs() << ".. Lost locations:\n";
+ for (const DebugLoc &Loc : LostDebugLocs) {
+ dbgs() << ".. .. ";
+ Loc.print(dbgs());
+ dbgs() << "\n";
+ }
+ dbgs() << ".. MIs with matched locations:\n";
+ for (MachineInstr *MI : FoundIn)
+ if (PotentialMIsForDebugLocs.erase(MI))
+ dbgs() << ".. .. " << *MI;
+ dbgs() << ".. Remaining MIs with unmatched/no locations:\n";
+ for (const MachineInstr *MI : PotentialMIsForDebugLocs)
+ dbgs() << ".. .. " << *MI;
+ });
+}
+
+void LostDebugLocObserver::checkpoint(bool CheckDebugLocs) {
+ if (CheckDebugLocs)
+ analyzeDebugLocations();
+ PotentialMIsForDebugLocs.clear();
+ LostDebugLocs.clear();
+}
+
+void LostDebugLocObserver::createdInstr(MachineInstr &MI) {
+ PotentialMIsForDebugLocs.insert(&MI);
+}
+
+static bool irTranslatorNeverAddsLocations(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return false;
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return true;
+ }
+}
+
+void LostDebugLocObserver::erasingInstr(MachineInstr &MI) {
+ if (irTranslatorNeverAddsLocations(MI.getOpcode()))
+ return;
+
+ PotentialMIsForDebugLocs.erase(&MI);
+ if (MI.getDebugLoc())
+ LostDebugLocs.insert(MI.getDebugLoc());
+}
+
+void LostDebugLocObserver::changingInstr(MachineInstr &MI) {
+ if (irTranslatorNeverAddsLocations(MI.getOpcode()))
+ return;
+
+ PotentialMIsForDebugLocs.erase(&MI);
+ if (MI.getDebugLoc())
+ LostDebugLocs.insert(MI.getDebugLoc());
+}
+
+void LostDebugLocObserver::changedInstr(MachineInstr &MI) {
+ PotentialMIsForDebugLocs.insert(&MI);
+}
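Editorial note: at each checkpoint the observer's bookkeeping reduces to a set difference: every location dropped from an erased or changed instruction must reappear on some instruction created or changed since the previous checkpoint, with a line-0 location treated as covering whatever remains. The sketch below is a deliberately simplified, self-contained model of that idea; plain integers stand in for DebugLocs and strings for MachineInstrs, and none of it is the LLVM API.

#include <iostream>
#include <set>
#include <string>
#include <utility>

struct TinyLocTracker {
  std::set<int> LostLocs;                                  // lines from erased/changed instrs
  std::set<std::pair<std::string, int>> PotentialCarriers; // (new/changed instr, its line)

  void erased(const std::string &Instr, int Line) {
    PotentialCarriers.erase({Instr, Line});
    if (Line != 0)
      LostLocs.insert(Line);
  }
  void created(const std::string &Instr, int Line) {
    PotentialCarriers.insert({Instr, Line});
  }
  // Returns how many recorded locations no surviving instruction carries.
  unsigned checkpoint() {
    for (const auto &Carrier : PotentialCarriers) {
      if (Carrier.second == 0) { // line 0 is assumed to cover the remainder
        LostLocs.clear();
        break;
      }
      LostLocs.erase(Carrier.second);
    }
    unsigned NumLost = static_cast<unsigned>(LostLocs.size());
    LostLocs.clear();
    PotentialCarriers.clear();
    return NumLost;
  }
};

int main() {
  TinyLocTracker T;
  T.erased("G_SEXT", 42); // legalized away; line 42 must survive somewhere
  T.created("G_SHL", 42); // the replacement carries it
  std::cout << "lost: " << T.checkpoint() << '\n'; // prints "lost: 0"
}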
diff --git a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
index 67d9dacda61b..10f696d6a3b3 100644
--- a/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/MachineIRBuilder.cpp
@@ -33,48 +33,10 @@ void MachineIRBuilder::setMF(MachineFunction &MF) {
State.Observer = nullptr;
}
-void MachineIRBuilder::setMBB(MachineBasicBlock &MBB) {
- State.MBB = &MBB;
- State.II = MBB.end();
- assert(&getMF() == MBB.getParent() &&
- "Basic block is in a different function");
-}
-
-void MachineIRBuilder::setInstr(MachineInstr &MI) {
- assert(MI.getParent() && "Instruction is not part of a basic block");
- setMBB(*MI.getParent());
- State.II = MI.getIterator();
-}
-
-void MachineIRBuilder::setCSEInfo(GISelCSEInfo *Info) { State.CSEInfo = Info; }
-
-void MachineIRBuilder::setInsertPt(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator II) {
- assert(MBB.getParent() == &getMF() &&
- "Basic block is in a different function");
- State.MBB = &MBB;
- State.II = II;
-}
-
-void MachineIRBuilder::recordInsertion(MachineInstr *InsertedInstr) const {
- if (State.Observer)
- State.Observer->createdInstr(*InsertedInstr);
-}
-
-void MachineIRBuilder::setChangeObserver(GISelChangeObserver &Observer) {
- State.Observer = &Observer;
-}
-
-void MachineIRBuilder::stopObservingChanges() { State.Observer = nullptr; }
-
//------------------------------------------------------------------------------
// Build instruction variants.
//------------------------------------------------------------------------------
-MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opcode) {
- return insertInstr(buildInstrNoInsert(Opcode));
-}
-
MachineInstrBuilder MachineIRBuilder::buildInstrNoInsert(unsigned Opcode) {
MachineInstrBuilder MIB = BuildMI(getMF(), getDL(), getTII().get(Opcode));
return MIB;
@@ -107,13 +69,9 @@ MachineIRBuilder::buildIndirectDbgValue(Register Reg, const MDNode *Variable,
assert(
cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
"Expected inlined-at fields to agree");
- // DBG_VALUE insts now carry IR-level indirection in their DIExpression
- // rather than encoding it in the instruction itself.
- const DIExpression *DIExpr = cast<DIExpression>(Expr);
- DIExpr = DIExpression::append(DIExpr, {dwarf::DW_OP_deref});
return insertInstr(BuildMI(getMF(), getDL(),
getTII().get(TargetOpcode::DBG_VALUE),
- /*IsIndirect*/ false, Reg, Variable, DIExpr));
+ /*IsIndirect*/ true, Reg, Variable, Expr));
}
MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
@@ -124,15 +82,11 @@ MachineInstrBuilder MachineIRBuilder::buildFIDbgValue(int FI,
assert(
cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
"Expected inlined-at fields to agree");
- // DBG_VALUE insts now carry IR-level indirection in their DIExpression
- // rather than encoding it in the instruction itself.
- const DIExpression *DIExpr = cast<DIExpression>(Expr);
- DIExpr = DIExpression::append(DIExpr, {dwarf::DW_OP_deref});
return buildInstr(TargetOpcode::DBG_VALUE)
.addFrameIndex(FI)
- .addReg(0)
+ .addImm(0)
.addMetadata(Variable)
- .addMetadata(DIExpr);
+ .addMetadata(Expr);
}
MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
@@ -143,7 +97,7 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
assert(
cast<DILocalVariable>(Variable)->isValidLocationForIntrinsic(getDL()) &&
"Expected inlined-at fields to agree");
- auto MIB = buildInstr(TargetOpcode::DBG_VALUE);
+ auto MIB = buildInstrNoInsert(TargetOpcode::DBG_VALUE);
if (auto *CI = dyn_cast<ConstantInt>(&C)) {
if (CI->getBitWidth() > 64)
MIB.addCImm(CI);
@@ -156,7 +110,8 @@ MachineInstrBuilder MachineIRBuilder::buildConstDbgValue(const Constant &C,
MIB.addReg(0U);
}
- return MIB.addReg(0).addMetadata(Variable).addMetadata(Expr);
+ MIB.addImm(0).addMetadata(Variable).addMetadata(Expr);
+ return insertInstr(MIB);
}
MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
@@ -170,12 +125,12 @@ MachineInstrBuilder MachineIRBuilder::buildDbgLabel(const MDNode *Label) {
MachineInstrBuilder MachineIRBuilder::buildDynStackAlloc(const DstOp &Res,
const SrcOp &Size,
- unsigned Align) {
+ Align Alignment) {
assert(Res.getLLTTy(*getMRI()).isPointer() && "expected ptr dst type");
auto MIB = buildInstr(TargetOpcode::G_DYN_STACKALLOC);
Res.addDefToMIB(*getMRI(), MIB);
Size.addSrcToMIB(MIB);
- MIB.addImm(Align);
+ MIB.addImm(Alignment.value());
return MIB;
}
@@ -207,14 +162,14 @@ MachineInstrBuilder MachineIRBuilder::buildJumpTable(const LLT PtrTy,
.addJumpTableIndex(JTI);
}
-void MachineIRBuilder::validateBinaryOp(const LLT &Res, const LLT &Op0,
- const LLT &Op1) {
+void MachineIRBuilder::validateBinaryOp(const LLT Res, const LLT Op0,
+ const LLT Op1) {
assert((Res.isScalar() || Res.isVector()) && "invalid operand type");
assert((Res == Op0 && Res == Op1) && "type mismatch");
}
-void MachineIRBuilder::validateShiftOp(const LLT &Res, const LLT &Op0,
- const LLT &Op1) {
+void MachineIRBuilder::validateShiftOp(const LLT Res, const LLT Op0,
+ const LLT Op1) {
assert((Res.isScalar() || Res.isVector()) && "invalid operand type");
assert((Res == Op0) && "type mismatch");
}
@@ -222,16 +177,16 @@ void MachineIRBuilder::validateShiftOp(const LLT &Res, const LLT &Op0,
MachineInstrBuilder MachineIRBuilder::buildPtrAdd(const DstOp &Res,
const SrcOp &Op0,
const SrcOp &Op1) {
- assert(Res.getLLTTy(*getMRI()).isPointer() &&
+ assert(Res.getLLTTy(*getMRI()).getScalarType().isPointer() &&
Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch");
- assert(Op1.getLLTTy(*getMRI()).isScalar() && "invalid offset type");
+ assert(Op1.getLLTTy(*getMRI()).getScalarType().isScalar() && "invalid offset type");
return buildInstr(TargetOpcode::G_PTR_ADD, {Res}, {Op0, Op1});
}
Optional<MachineInstrBuilder>
MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
- const LLT &ValueTy, uint64_t Value) {
+ const LLT ValueTy, uint64_t Value) {
assert(Res == 0 && "Res is a result argument");
assert(ValueTy.isScalar() && "invalid offset type");
@@ -245,17 +200,14 @@ MachineIRBuilder::materializePtrAdd(Register &Res, Register Op0,
return buildPtrAdd(Res, Op0, Cst.getReg(0));
}
-MachineInstrBuilder MachineIRBuilder::buildPtrMask(const DstOp &Res,
- const SrcOp &Op0,
- uint32_t NumBits) {
- assert(Res.getLLTTy(*getMRI()).isPointer() &&
- Res.getLLTTy(*getMRI()) == Op0.getLLTTy(*getMRI()) && "type mismatch");
-
- auto MIB = buildInstr(TargetOpcode::G_PTR_MASK);
- Res.addDefToMIB(*getMRI(), MIB);
- Op0.addSrcToMIB(MIB);
- MIB.addImm(NumBits);
- return MIB;
+MachineInstrBuilder MachineIRBuilder::buildMaskLowPtrBits(const DstOp &Res,
+ const SrcOp &Op0,
+ uint32_t NumBits) {
+ LLT PtrTy = Res.getLLTTy(*getMRI());
+ LLT MaskTy = LLT::scalar(PtrTy.getSizeInBits());
+ Register MaskReg = getMRI()->createGenericVirtualRegister(MaskTy);
+ buildConstant(MaskReg, maskTrailingZeros<uint64_t>(NumBits));
+ return buildPtrMask(Res, Op0, MaskReg);
}
MachineInstrBuilder MachineIRBuilder::buildBr(MachineBasicBlock &Dest) {
@@ -298,6 +250,7 @@ MachineInstrBuilder MachineIRBuilder::buildConstant(const DstOp &Res,
}
auto Const = buildInstr(TargetOpcode::G_CONSTANT);
+ Const->setDebugLoc(DebugLoc());
Res.addDefToMIB(*getMRI(), Const);
Const.addCImm(&Val);
return Const;
@@ -331,6 +284,7 @@ MachineInstrBuilder MachineIRBuilder::buildFConstant(const DstOp &Res,
}
auto Const = buildInstr(TargetOpcode::G_FCONSTANT);
+ Const->setDebugLoc(DebugLoc());
Res.addDefToMIB(*getMRI(), Const);
Const.addFPImm(&Val);
return Const;
@@ -385,6 +339,23 @@ MachineInstrBuilder MachineIRBuilder::buildLoadInstr(unsigned Opcode,
return MIB;
}
+MachineInstrBuilder MachineIRBuilder::buildLoadFromOffset(
+ const DstOp &Dst, const SrcOp &BasePtr,
+ MachineMemOperand &BaseMMO, int64_t Offset) {
+ LLT LoadTy = Dst.getLLTTy(*getMRI());
+ MachineMemOperand *OffsetMMO =
+ getMF().getMachineMemOperand(&BaseMMO, Offset, LoadTy.getSizeInBytes());
+
+ if (Offset == 0) // This may be a size or type changing load.
+ return buildLoad(Dst, BasePtr, *OffsetMMO);
+
+ LLT PtrTy = BasePtr.getLLTTy(*getMRI());
+ LLT OffsetTy = LLT::scalar(PtrTy.getSizeInBits());
+ auto ConstOffset = buildConstant(OffsetTy, Offset);
+ auto Ptr = buildPtrAdd(PtrTy, BasePtr, ConstOffset);
+ return buildLoad(Dst, Ptr, *OffsetMMO);
+}
+
MachineInstrBuilder MachineIRBuilder::buildStore(const SrcOp &Val,
const SrcOp &Addr,
MachineMemOperand &MMO) {
@@ -398,22 +369,6 @@ MachineInstrBuilder MachineIRBuilder::buildStore(const SrcOp &Val,
return MIB;
}
-MachineInstrBuilder MachineIRBuilder::buildUAddo(const DstOp &Res,
- const DstOp &CarryOut,
- const SrcOp &Op0,
- const SrcOp &Op1) {
- return buildInstr(TargetOpcode::G_UADDO, {Res, CarryOut}, {Op0, Op1});
-}
-
-MachineInstrBuilder MachineIRBuilder::buildUAdde(const DstOp &Res,
- const DstOp &CarryOut,
- const SrcOp &Op0,
- const SrcOp &Op1,
- const SrcOp &CarryIn) {
- return buildInstr(TargetOpcode::G_UADDE, {Res, CarryOut},
- {Op0, Op1, CarryIn});
-}
-
MachineInstrBuilder MachineIRBuilder::buildAnyExt(const DstOp &Res,
const SrcOp &Op) {
return buildInstr(TargetOpcode::G_ANYEXT, Res, Op);
@@ -537,7 +492,7 @@ void MachineIRBuilder::buildSequence(Register Res, ArrayRef<Register> Ops,
#ifndef NDEBUG
assert(Ops.size() == Indices.size() && "incompatible args");
assert(!Ops.empty() && "invalid trivial sequence");
- assert(std::is_sorted(Indices.begin(), Indices.end()) &&
+ assert(llvm::is_sorted(Indices) &&
"sequence offsets must be in ascending order");
assert(getMRI()->getType(Res).isValid() && "invalid operand type");
@@ -587,6 +542,13 @@ MachineInstrBuilder MachineIRBuilder::buildMerge(const DstOp &Res,
return buildInstr(TargetOpcode::G_MERGE_VALUES, Res, TmpVec);
}
+MachineInstrBuilder
+MachineIRBuilder::buildMerge(const DstOp &Res,
+ std::initializer_list<SrcOp> Ops) {
+ assert(Ops.size() > 1);
+ return buildInstr(TargetOpcode::G_MERGE_VALUES, Res, Ops);
+}
+
MachineInstrBuilder MachineIRBuilder::buildUnmerge(ArrayRef<LLT> Res,
const SrcOp &Op) {
// Unfortunately to convert from ArrayRef<LLT> to ArrayRef<DstOp>,
@@ -650,22 +612,20 @@ MachineIRBuilder::buildConcatVectors(const DstOp &Res, ArrayRef<Register> Ops) {
return buildInstr(TargetOpcode::G_CONCAT_VECTORS, Res, TmpVec);
}
-MachineInstrBuilder MachineIRBuilder::buildInsert(Register Res, Register Src,
- Register Op, unsigned Index) {
- assert(Index + getMRI()->getType(Op).getSizeInBits() <=
- getMRI()->getType(Res).getSizeInBits() &&
+MachineInstrBuilder MachineIRBuilder::buildInsert(const DstOp &Res,
+ const SrcOp &Src,
+ const SrcOp &Op,
+ unsigned Index) {
+ assert(Index + Op.getLLTTy(*getMRI()).getSizeInBits() <=
+ Res.getLLTTy(*getMRI()).getSizeInBits() &&
"insertion past the end of a register");
- if (getMRI()->getType(Res).getSizeInBits() ==
- getMRI()->getType(Op).getSizeInBits()) {
+ if (Res.getLLTTy(*getMRI()).getSizeInBits() ==
+ Op.getLLTTy(*getMRI()).getSizeInBits()) {
return buildCast(Res, Op);
}
- return buildInstr(TargetOpcode::G_INSERT)
- .addDef(Res)
- .addUse(Src)
- .addUse(Op)
- .addImm(Index);
+ return buildInstr(TargetOpcode::G_INSERT, Res, {Src, Op, uint64_t(Index)});
}
MachineInstrBuilder MachineIRBuilder::buildIntrinsic(Intrinsic::ID ID,
@@ -915,7 +875,7 @@ MachineIRBuilder::buildBlockAddress(Register Res, const BlockAddress *BA) {
return buildInstr(TargetOpcode::G_BLOCK_ADDR).addDef(Res).addBlockAddress(BA);
}
-void MachineIRBuilder::validateTruncExt(const LLT &DstTy, const LLT &SrcTy,
+void MachineIRBuilder::validateTruncExt(const LLT DstTy, const LLT SrcTy,
bool IsExtend) {
#ifndef NDEBUG
if (DstTy.isVector()) {
@@ -934,8 +894,8 @@ void MachineIRBuilder::validateTruncExt(const LLT &DstTy, const LLT &SrcTy,
#endif
}
-void MachineIRBuilder::validateSelectOp(const LLT &ResTy, const LLT &TstTy,
- const LLT &Op0Ty, const LLT &Op1Ty) {
+void MachineIRBuilder::validateSelectOp(const LLT ResTy, const LLT TstTy,
+ const LLT Op0Ty, const LLT Op1Ty) {
#ifndef NDEBUG
assert((ResTy.isScalar() || ResTy.isVector() || ResTy.isPointer()) &&
"invalid operand type");
@@ -978,7 +938,11 @@ MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc,
case TargetOpcode::G_SMIN:
case TargetOpcode::G_SMAX:
case TargetOpcode::G_UMIN:
- case TargetOpcode::G_UMAX: {
+ case TargetOpcode::G_UMAX:
+ case TargetOpcode::G_UADDSAT:
+ case TargetOpcode::G_SADDSAT:
+ case TargetOpcode::G_USUBSAT:
+ case TargetOpcode::G_SSUBSAT: {
// All these are binary ops.
assert(DstOps.size() == 1 && "Invalid Dst");
assert(SrcOps.size() == 2 && "Invalid Srcs");
@@ -1013,6 +977,13 @@ MachineInstrBuilder MachineIRBuilder::buildInstr(unsigned Opc,
SrcOps[0].getLLTTy(*getMRI()), false);
break;
}
+ case TargetOpcode::G_BITCAST: {
+ assert(DstOps.size() == 1 && "Invalid Dst");
+ assert(SrcOps.size() == 1 && "Invalid Srcs");
+ assert(DstOps[0].getLLTTy(*getMRI()).getSizeInBits() ==
+ SrcOps[0].getLLTTy(*getMRI()).getSizeInBits() && "invalid bitcast");
+ break;
+ }
case TargetOpcode::COPY:
assert(DstOps.size() == 1 && "Invalid Dst");
// If the caller wants to add a subreg source it has to be done separately
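Editorial note: one consequence of the buildPtrMask -> buildMaskLowPtrBits change above is that NumBits is no longer an immediate on the instruction; a G_CONSTANT holding maskTrailingZeros<uint64_t>(NumBits) is materialized and fed to G_PTR_MASK instead. The mask value is easy to sanity-check in isolation; the snippet below recomputes the LLVM helper in plain C++ so it stands alone, and the example value (NumBits = 4) is purely illustrative.

#include <cstdint>
#include <cstdio>

// Equivalent of llvm::maskTrailingZeros<uint64_t>(NumBits): the low NumBits
// are clear, all higher bits are set.
static uint64_t maskLowBitsCleared(unsigned NumBits) {
  if (NumBits >= 64)
    return 0;
  return ~((uint64_t(1) << NumBits) - 1);
}

int main() {
  // ANDing a pointer with this mask clears its low 4 bits, i.e. aligns it
  // down to a 16-byte boundary.
  std::printf("%#llx\n", (unsigned long long)maskLowBitsCleared(4));
  // Expected output: 0xfffffffffffffff0
}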
diff --git a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
index 98e48f5fc1d5..356e0e437d32 100644
--- a/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/RegBankSelect.cpp
@@ -693,6 +693,15 @@ bool RegBankSelect::runOnMachineFunction(MachineFunction &MF) {
if (isTargetSpecificOpcode(MI.getOpcode()) && !MI.isPreISelOpcode())
continue;
+ // Ignore inline asm instructions: they should use physical
+ // registers/regclasses
+ if (MI.isInlineAsm())
+ continue;
+
+ // Ignore debug info.
+ if (MI.isDebugInstr())
+ continue;
+
if (!assignInstr(MI)) {
reportGISelFailure(MF, *TPC, *MORE, "gisel-regbankselect",
"unable to map instruction", MI);
diff --git a/llvm/lib/CodeGen/GlobalISel/Utils.cpp b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
index eeec2a5d536a..8a7fb4fbbf2d 100644
--- a/llvm/lib/CodeGen/GlobalISel/Utils.cpp
+++ b/llvm/lib/CodeGen/GlobalISel/Utils.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/GlobalISel/Utils.h"
#include "llvm/ADT/APFloat.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/CodeGen/GlobalISel/GISelChangeObserver.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -27,9 +28,9 @@
using namespace llvm;
-unsigned llvm::constrainRegToClass(MachineRegisterInfo &MRI,
+Register llvm::constrainRegToClass(MachineRegisterInfo &MRI,
const TargetInstrInfo &TII,
- const RegisterBankInfo &RBI, unsigned Reg,
+ const RegisterBankInfo &RBI, Register Reg,
const TargetRegisterClass &RegClass) {
if (!RBI.constrainGenericRegister(Reg, RegClass, MRI))
return MRI.createVirtualRegister(&RegClass);
@@ -37,17 +38,16 @@ unsigned llvm::constrainRegToClass(MachineRegisterInfo &MRI,
return Reg;
}
-unsigned llvm::constrainOperandRegClass(
+Register llvm::constrainOperandRegClass(
const MachineFunction &MF, const TargetRegisterInfo &TRI,
MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
const RegisterBankInfo &RBI, MachineInstr &InsertPt,
- const TargetRegisterClass &RegClass, const MachineOperand &RegMO,
- unsigned OpIdx) {
+ const TargetRegisterClass &RegClass, const MachineOperand &RegMO) {
Register Reg = RegMO.getReg();
// Assume physical registers are properly constrained.
assert(Register::isVirtualRegister(Reg) && "PhysReg not implemented");
- unsigned ConstrainedReg = constrainRegToClass(MRI, TII, RBI, Reg, RegClass);
+ Register ConstrainedReg = constrainRegToClass(MRI, TII, RBI, Reg, RegClass);
// If we created a new virtual register because the class is not compatible
// then create a copy between the new and the old register.
if (ConstrainedReg != Reg) {
@@ -63,11 +63,20 @@ unsigned llvm::constrainOperandRegClass(
TII.get(TargetOpcode::COPY), Reg)
.addReg(ConstrainedReg);
}
+ } else {
+ if (GISelChangeObserver *Observer = MF.getObserver()) {
+ if (!RegMO.isDef()) {
+ MachineInstr *RegDef = MRI.getVRegDef(Reg);
+ Observer->changedInstr(*RegDef);
+ }
+ Observer->changingAllUsesOfReg(MRI, Reg);
+ Observer->finishedChangingAllUsesOfReg();
+ }
}
return ConstrainedReg;
}
-unsigned llvm::constrainOperandRegClass(
+Register llvm::constrainOperandRegClass(
const MachineFunction &MF, const TargetRegisterInfo &TRI,
MachineRegisterInfo &MRI, const TargetInstrInfo &TII,
const RegisterBankInfo &RBI, MachineInstr &InsertPt, const MCInstrDesc &II,
@@ -105,7 +114,7 @@ unsigned llvm::constrainOperandRegClass(
return Reg;
}
return constrainOperandRegClass(MF, TRI, MRI, TII, RBI, InsertPt, *RegClass,
- RegMO, OpIdx);
+ RegMO);
}
bool llvm::constrainSelectedInstRegOperands(MachineInstr &I,
@@ -155,6 +164,20 @@ bool llvm::constrainSelectedInstRegOperands(MachineInstr &I,
return true;
}
+bool llvm::canReplaceReg(Register DstReg, Register SrcReg,
+ MachineRegisterInfo &MRI) {
+ // Give up if either DstReg or SrcReg is a physical register.
+ if (DstReg.isPhysical() || SrcReg.isPhysical())
+ return false;
+ // Give up if the types don't match.
+ if (MRI.getType(DstReg) != MRI.getType(SrcReg))
+ return false;
+ // Replace if either DstReg has no constraints or the register
+ // constraints match.
+ return !MRI.getRegClassOrRegBank(DstReg) ||
+ MRI.getRegClassOrRegBank(DstReg) == MRI.getRegClassOrRegBank(SrcReg);
+}
+
bool llvm::isTriviallyDead(const MachineInstr &MI,
const MachineRegisterInfo &MRI) {
// If we can move an instruction, we can remove it. Otherwise, it has
@@ -175,22 +198,37 @@ bool llvm::isTriviallyDead(const MachineInstr &MI,
return true;
}
-void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
- MachineOptimizationRemarkEmitter &MORE,
- MachineOptimizationRemarkMissed &R) {
- MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
-
+static void reportGISelDiagnostic(DiagnosticSeverity Severity,
+ MachineFunction &MF,
+ const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ bool IsFatal = Severity == DS_Error &&
+ TPC.isGlobalISelAbortEnabled();
// Print the function name explicitly if we don't have a debug location (which
// makes the diagnostic less useful) or if we're going to emit a raw error.
- if (!R.getLocation().isValid() || TPC.isGlobalISelAbortEnabled())
+ if (!R.getLocation().isValid() || IsFatal)
R << (" (in function: " + MF.getName() + ")").str();
- if (TPC.isGlobalISelAbortEnabled())
+ if (IsFatal)
report_fatal_error(R.getMsg());
else
MORE.emit(R);
}
+void llvm::reportGISelWarning(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ reportGISelDiagnostic(DS_Warning, MF, TPC, MORE, R);
+}
+
+void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
+ MachineOptimizationRemarkEmitter &MORE,
+ MachineOptimizationRemarkMissed &R) {
+ MF.getProperties().set(MachineFunctionProperties::Property::FailedISel);
+ reportGISelDiagnostic(DS_Error, MF, TPC, MORE, R);
+}
+
void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
MachineOptimizationRemarkEmitter &MORE,
const char *PassName, StringRef Msg,
@@ -204,7 +242,7 @@ void llvm::reportGISelFailure(MachineFunction &MF, const TargetPassConfig &TPC,
reportGISelFailure(MF, TPC, MORE, R);
}
-Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
+Optional<int64_t> llvm::getConstantVRegVal(Register VReg,
const MachineRegisterInfo &MRI) {
Optional<ValueAndVReg> ValAndVReg =
getConstantVRegValWithLookThrough(VReg, MRI, /*LookThroughInstrs*/ false);
@@ -216,7 +254,7 @@ Optional<int64_t> llvm::getConstantVRegVal(unsigned VReg,
}
Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
- unsigned VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs,
+ Register VReg, const MachineRegisterInfo &MRI, bool LookThroughInstrs,
bool HandleFConstant) {
SmallVector<std::pair<unsigned, unsigned>, 4> SeenOpcodes;
MachineInstr *MI;
@@ -292,28 +330,51 @@ Optional<ValueAndVReg> llvm::getConstantVRegValWithLookThrough(
return ValueAndVReg{Val.getSExtValue(), VReg};
}
-const llvm::ConstantFP* llvm::getConstantFPVRegVal(unsigned VReg,
- const MachineRegisterInfo &MRI) {
+const llvm::ConstantFP *
+llvm::getConstantFPVRegVal(Register VReg, const MachineRegisterInfo &MRI) {
MachineInstr *MI = MRI.getVRegDef(VReg);
if (TargetOpcode::G_FCONSTANT != MI->getOpcode())
return nullptr;
return MI->getOperand(1).getFPImm();
}
-llvm::MachineInstr *llvm::getDefIgnoringCopies(Register Reg,
- const MachineRegisterInfo &MRI) {
+namespace {
+struct DefinitionAndSourceRegister {
+ llvm::MachineInstr *MI;
+ Register Reg;
+};
+} // namespace
+
+static llvm::Optional<DefinitionAndSourceRegister>
+getDefSrcRegIgnoringCopies(Register Reg, const MachineRegisterInfo &MRI) {
+ Register DefSrcReg = Reg;
auto *DefMI = MRI.getVRegDef(Reg);
auto DstTy = MRI.getType(DefMI->getOperand(0).getReg());
if (!DstTy.isValid())
- return nullptr;
+ return None;
while (DefMI->getOpcode() == TargetOpcode::COPY) {
Register SrcReg = DefMI->getOperand(1).getReg();
auto SrcTy = MRI.getType(SrcReg);
if (!SrcTy.isValid() || SrcTy != DstTy)
break;
DefMI = MRI.getVRegDef(SrcReg);
+ DefSrcReg = SrcReg;
}
- return DefMI;
+ return DefinitionAndSourceRegister{DefMI, DefSrcReg};
+}
+
+llvm::MachineInstr *llvm::getDefIgnoringCopies(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ Optional<DefinitionAndSourceRegister> DefSrcReg =
+ getDefSrcRegIgnoringCopies(Reg, MRI);
+ return DefSrcReg ? DefSrcReg->MI : nullptr;
+}
+
+Register llvm::getSrcRegIgnoringCopies(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ Optional<DefinitionAndSourceRegister> DefSrcReg =
+ getDefSrcRegIgnoringCopies(Reg, MRI);
+ return DefSrcReg ? DefSrcReg->Reg : Register();
}
llvm::MachineInstr *llvm::getOpcodeDef(unsigned Opcode, Register Reg,
@@ -335,54 +396,59 @@ APFloat llvm::getAPFloatFromSize(double Val, unsigned Size) {
return APF;
}
-Optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode, const unsigned Op1,
- const unsigned Op2,
+Optional<APInt> llvm::ConstantFoldBinOp(unsigned Opcode, const Register Op1,
+ const Register Op2,
const MachineRegisterInfo &MRI) {
- auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
auto MaybeOp2Cst = getConstantVRegVal(Op2, MRI);
- if (MaybeOp1Cst && MaybeOp2Cst) {
- LLT Ty = MRI.getType(Op1);
- APInt C1(Ty.getSizeInBits(), *MaybeOp1Cst, true);
- APInt C2(Ty.getSizeInBits(), *MaybeOp2Cst, true);
- switch (Opcode) {
- default:
+ if (!MaybeOp2Cst)
+ return None;
+
+ auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
+ if (!MaybeOp1Cst)
+ return None;
+
+ LLT Ty = MRI.getType(Op1);
+ APInt C1(Ty.getSizeInBits(), *MaybeOp1Cst, true);
+ APInt C2(Ty.getSizeInBits(), *MaybeOp2Cst, true);
+ switch (Opcode) {
+ default:
+ break;
+ case TargetOpcode::G_ADD:
+ return C1 + C2;
+ case TargetOpcode::G_AND:
+ return C1 & C2;
+ case TargetOpcode::G_ASHR:
+ return C1.ashr(C2);
+ case TargetOpcode::G_LSHR:
+ return C1.lshr(C2);
+ case TargetOpcode::G_MUL:
+ return C1 * C2;
+ case TargetOpcode::G_OR:
+ return C1 | C2;
+ case TargetOpcode::G_SHL:
+ return C1 << C2;
+ case TargetOpcode::G_SUB:
+ return C1 - C2;
+ case TargetOpcode::G_XOR:
+ return C1 ^ C2;
+ case TargetOpcode::G_UDIV:
+ if (!C2.getBoolValue())
break;
- case TargetOpcode::G_ADD:
- return C1 + C2;
- case TargetOpcode::G_AND:
- return C1 & C2;
- case TargetOpcode::G_ASHR:
- return C1.ashr(C2);
- case TargetOpcode::G_LSHR:
- return C1.lshr(C2);
- case TargetOpcode::G_MUL:
- return C1 * C2;
- case TargetOpcode::G_OR:
- return C1 | C2;
- case TargetOpcode::G_SHL:
- return C1 << C2;
- case TargetOpcode::G_SUB:
- return C1 - C2;
- case TargetOpcode::G_XOR:
- return C1 ^ C2;
- case TargetOpcode::G_UDIV:
- if (!C2.getBoolValue())
- break;
- return C1.udiv(C2);
- case TargetOpcode::G_SDIV:
- if (!C2.getBoolValue())
- break;
- return C1.sdiv(C2);
- case TargetOpcode::G_UREM:
- if (!C2.getBoolValue())
- break;
- return C1.urem(C2);
- case TargetOpcode::G_SREM:
- if (!C2.getBoolValue())
- break;
- return C1.srem(C2);
- }
+ return C1.udiv(C2);
+ case TargetOpcode::G_SDIV:
+ if (!C2.getBoolValue())
+ break;
+ return C1.sdiv(C2);
+ case TargetOpcode::G_UREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.urem(C2);
+ case TargetOpcode::G_SREM:
+ if (!C2.getBoolValue())
+ break;
+ return C1.srem(C2);
}
+
return None;
}
@@ -411,7 +477,19 @@ bool llvm::isKnownNeverNaN(Register Val, const MachineRegisterInfo &MRI,
return false;
}
-Optional<APInt> llvm::ConstantFoldExtOp(unsigned Opcode, const unsigned Op1,
+Align llvm::inferAlignFromPtrInfo(MachineFunction &MF,
+ const MachinePointerInfo &MPO) {
+ auto PSV = MPO.V.dyn_cast<const PseudoSourceValue *>();
+ if (auto FSPV = dyn_cast_or_null<FixedStackPseudoSourceValue>(PSV)) {
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ return commonAlignment(MFI.getObjectAlign(FSPV->getFrameIndex()),
+ MPO.Offset);
+ }
+
+ return Align(1);
+}
+
+Optional<APInt> llvm::ConstantFoldExtOp(unsigned Opcode, const Register Op1,
uint64_t Imm,
const MachineRegisterInfo &MRI) {
auto MaybeOp1Cst = getConstantVRegVal(Op1, MRI);
@@ -431,3 +509,55 @@ Optional<APInt> llvm::ConstantFoldExtOp(unsigned Opcode, const unsigned Op1,
void llvm::getSelectionDAGFallbackAnalysisUsage(AnalysisUsage &AU) {
AU.addPreserved<StackProtector>();
}
+
+LLT llvm::getLCMType(LLT Ty0, LLT Ty1) {
+ if (!Ty0.isVector() && !Ty1.isVector()) {
+ unsigned Mul = Ty0.getSizeInBits() * Ty1.getSizeInBits();
+ int GCDSize = greatestCommonDivisor(Ty0.getSizeInBits(),
+ Ty1.getSizeInBits());
+ return LLT::scalar(Mul / GCDSize);
+ }
+
+ if (Ty0.isVector() && !Ty1.isVector()) {
+ assert(Ty0.getElementType() == Ty1 && "not yet handled");
+ return Ty0;
+ }
+
+ if (Ty1.isVector() && !Ty0.isVector()) {
+ assert(Ty1.getElementType() == Ty0 && "not yet handled");
+ return Ty1;
+ }
+
+ if (Ty0.isVector() && Ty1.isVector()) {
+ assert(Ty0.getElementType() == Ty1.getElementType() && "not yet handled");
+
+ int GCDElts = greatestCommonDivisor(Ty0.getNumElements(),
+ Ty1.getNumElements());
+
+ int Mul = Ty0.getNumElements() * Ty1.getNumElements();
+ return LLT::vector(Mul / GCDElts, Ty0.getElementType());
+ }
+
+ llvm_unreachable("not yet handled");
+}
+
+LLT llvm::getGCDType(LLT OrigTy, LLT TargetTy) {
+ if (OrigTy.isVector() && TargetTy.isVector()) {
+ assert(OrigTy.getElementType() == TargetTy.getElementType());
+ int GCD = greatestCommonDivisor(OrigTy.getNumElements(),
+ TargetTy.getNumElements());
+ return LLT::scalarOrVector(GCD, OrigTy.getElementType());
+ }
+
+ if (OrigTy.isVector() && !TargetTy.isVector()) {
+ assert(OrigTy.getElementType() == TargetTy);
+ return TargetTy;
+ }
+
+ assert(!OrigTy.isVector() && !TargetTy.isVector() &&
+ "GCD type of vector and scalar not implemented");
+
+ int GCD = greatestCommonDivisor(OrigTy.getSizeInBits(),
+ TargetTy.getSizeInBits());
+ return LLT::scalar(GCD);
+}
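Editorial note: for pure scalars, the getLCMType() and getGCDType() helpers added above reduce to the least common multiple and greatest common divisor of the two bit widths. A quick standalone check of that arithmetic (no LLT involved; s32/s48 are chosen only as an example):

#include <cstdio>
#include <numeric>

int main() {
  unsigned A = 32, B = 48;        // bit widths of s32 and s48
  unsigned Gcd = std::gcd(A, B);  // 16 -> getGCDType(s32, s48) == s16
  unsigned Lcm = (A * B) / Gcd;   // 96 -> getLCMType(s32, s48) == s96
  std::printf("gcd=%u lcm=%u\n", Gcd, Lcm);
}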
diff --git a/llvm/lib/CodeGen/GlobalMerge.cpp b/llvm/lib/CodeGen/GlobalMerge.cpp
index 5870e20d4227..1e20c02ba160 100644
--- a/llvm/lib/CodeGen/GlobalMerge.cpp
+++ b/llvm/lib/CodeGen/GlobalMerge.cpp
@@ -83,6 +83,7 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/InitializePasses.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -463,7 +464,7 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
Type *Ty = Globals[j]->getValueType();
// Make sure we use the same alignment AsmPrinter would use.
- Align Alignment(DL.getPreferredAlignment(Globals[j]));
+ Align Alignment = DL.getPreferredAlign(Globals[j]);
unsigned Padding = alignTo(MergedSize, Alignment) - MergedSize;
MergedSize += Padding;
MergedSize += DL.getTypeAllocSize(Ty);
@@ -523,7 +524,8 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
const StructLayout *MergedLayout = DL.getStructLayout(MergedTy);
for (ssize_t k = i, idx = 0; k != j; k = GlobalSet.find_next(k), ++idx) {
GlobalValue::LinkageTypes Linkage = Globals[k]->getLinkage();
- std::string Name = Globals[k]->getName();
+ std::string Name(Globals[k]->getName());
+ GlobalValue::VisibilityTypes Visibility = Globals[k]->getVisibility();
GlobalValue::DLLStorageClassTypes DLLStorage =
Globals[k]->getDLLStorageClass();
@@ -549,6 +551,7 @@ bool GlobalMerge::doMerge(const SmallVectorImpl<GlobalVariable *> &Globals,
if (Linkage != GlobalValue::InternalLinkage || !IsMachO) {
GlobalAlias *GA = GlobalAlias::create(Tys[StructIdxs[idx]], AddrSpace,
Linkage, Name, GEP, &M);
+ GA->setVisibility(Visibility);
GA->setDLLStorageClass(DLLStorage);
}
diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp
index 65c2a37e5d43..0ba7e920e507 100644
--- a/llvm/lib/CodeGen/HardwareLoops.cpp
+++ b/llvm/lib/CodeGen/HardwareLoops.cpp
@@ -20,7 +20,7 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -35,7 +35,6 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
@@ -43,6 +42,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#define DEBUG_TYPE "hardware-loops"
@@ -245,14 +245,17 @@ bool HardwareLoops::runOnFunction(Function &F) {
// converted and the parent loop doesn't support containing a hardware loop.
bool HardwareLoops::TryConvertLoop(Loop *L) {
// Process nested loops first.
- for (Loop::iterator I = L->begin(), E = L->end(); I != E; ++I) {
- if (TryConvertLoop(*I)) {
- reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
- ORE, L);
- return true; // Stop search.
- }
+ bool AnyChanged = false;
+ for (Loop *SL : *L)
+ AnyChanged |= TryConvertLoop(SL);
+ if (AnyChanged) {
+ reportHWLoopFailure("nested hardware-loops not supported", "HWLoopNested",
+ ORE, L);
+ return true; // Stop search.
}
+ LLVM_DEBUG(dbgs() << "HWLoops: Loop " << L->getHeader()->getName() << "\n");
+
HardwareLoopInfo HWLoopInfo(L);
if (!HWLoopInfo.canAnalyze(*LI)) {
reportHWLoopFailure("cannot analyze loop, irreducible control flow",
@@ -476,9 +479,7 @@ Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) {
Function *DecFunc =
Intrinsic::getDeclaration(M, Intrinsic::loop_decrement_reg,
- { EltsRem->getType(), EltsRem->getType(),
- LoopDecrement->getType()
- });
+ { EltsRem->getType() });
Value *Ops[] = { EltsRem, LoopDecrement };
Value *Call = CondBuilder.CreateCall(DecFunc, Ops);
diff --git a/llvm/lib/CodeGen/IfConversion.cpp b/llvm/lib/CodeGen/IfConversion.cpp
index 7d64828aa482..1a5c5d685017 100644
--- a/llvm/lib/CodeGen/IfConversion.cpp
+++ b/llvm/lib/CodeGen/IfConversion.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -447,7 +448,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
TLI = ST.getTargetLowering();
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
- BranchFolder::MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
+ MBFIWrapper MBFI(getAnalysis<MachineBlockFrequencyInfo>());
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
ProfileSummaryInfo *PSI =
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
@@ -462,10 +463,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
if (!PreRegAlloc) {
// Tail merging tends to expose more if-conversion opportunities.
BranchFolder BF(true, false, MBFI, *MBPI, PSI);
- auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
- BFChange = BF.OptimizeFunction(
- MF, TII, ST.getRegisterInfo(),
- MMIWP ? &MMIWP->getMMI() : nullptr);
+ BFChange = BF.OptimizeFunction(MF, TII, ST.getRegisterInfo());
}
LLVM_DEBUG(dbgs() << "\nIfcvt: function (" << ++FnNum << ") \'"
@@ -604,10 +602,7 @@ bool IfConverter::runOnMachineFunction(MachineFunction &MF) {
if (MadeChange && IfCvtBranchFold) {
BranchFolder BF(false, false, MBFI, *MBPI, PSI);
- auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
- BF.OptimizeFunction(
- MF, TII, MF.getSubtarget().getRegisterInfo(),
- MMIWP ? &MMIWP->getMMI() : nullptr);
+ BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo());
}
MadeChange |= BFChange;
@@ -972,6 +967,11 @@ bool IfConverter::ValidDiamond(
FalseBBI.IsBeingAnalyzed || FalseBBI.IsDone)
return false;
+ // If the True and False BBs are equal we're dealing with a degenerate case
+ // that we don't treat as a diamond.
+ if (TrueBBI.BB == FalseBBI.BB)
+ return false;
+
MachineBasicBlock *TT = TrueBBI.TrueBB;
MachineBasicBlock *FT = FalseBBI.TrueBB;
@@ -1851,7 +1851,7 @@ bool IfConverter::IfConvertDiamondCommon(
while (NumDups1 != 0) {
// Since this instruction is going to be deleted, update call
// site info state if the instruction is call instruction.
- if (DI2->isCall(MachineInstr::IgnoreBundle))
+ if (DI2->shouldUpdateCallSiteInfo())
MBB2.getParent()->eraseCallSiteInfo(&*DI2);
++DI2;
@@ -1900,7 +1900,7 @@ bool IfConverter::IfConvertDiamondCommon(
// Since this instruction is going to be deleted, update call
// site info state if the instruction is call instruction.
- if (DI1->isCall(MachineInstr::IgnoreBundle))
+ if (DI1->shouldUpdateCallSiteInfo())
MBB1.getParent()->eraseCallSiteInfo(&*DI1);
// skip dbg_value instructions
@@ -2188,8 +2188,8 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
MachineInstr *MI = MF.CloneMachineInstr(&I);
// Make a copy of the call site info.
- if (MI->isCall(MachineInstr::IgnoreBundle))
- MF.copyCallSiteInfo(&I,MI);
+ if (I.isCandidateForCallSiteEntry())
+ MF.copyCallSiteInfo(&I, MI);
ToBBI.BB->insert(ToBBI.BB->end(), MI);
ToBBI.NonPredSize++;
@@ -2237,10 +2237,10 @@ void IfConverter::CopyAndPredicateBlock(BBInfo &ToBBI, BBInfo &FromBBI,
}
/// Move all instructions from FromBB to the end of ToBB. This will leave
-/// FromBB as an empty block, so remove all of its successor edges except for
-/// the fall-through edge. If AddEdges is true, i.e., when FromBBI's branch is
-/// being moved, add those successor edges to ToBBI and remove the old edge
-/// from ToBBI to FromBBI.
+/// FromBB as an empty block, so remove all of its successor edges and move it
+/// to the end of the function. If AddEdges is true, i.e., when FromBBI's
+/// branch is being moved, add those successor edges to ToBBI and remove the old
+/// edge from ToBBI to FromBBI.
void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
MachineBasicBlock &FromMBB = *FromBBI.BB;
assert(!FromMBB.hasAddressTaken() &&
@@ -2280,8 +2280,10 @@ void IfConverter::MergeBlocks(BBInfo &ToBBI, BBInfo &FromBBI, bool AddEdges) {
for (MachineBasicBlock *Succ : FromSuccs) {
// Fallthrough edge can't be transferred.
- if (Succ == FallThrough)
+ if (Succ == FallThrough) {
+ FromMBB.removeSuccessor(Succ);
continue;
+ }
auto NewProb = BranchProbability::getZero();
if (AddEdges) {
diff --git a/llvm/lib/CodeGen/ImplicitNullChecks.cpp b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
index 0bbedb0a5ea6..16c9bfc672af 100644
--- a/llvm/lib/CodeGen/ImplicitNullChecks.cpp
+++ b/llvm/lib/CodeGen/ImplicitNullChecks.cpp
@@ -364,12 +364,18 @@ ImplicitNullChecks::isSuitableMemoryOp(const MachineInstr &MI,
unsigned PointerReg,
ArrayRef<MachineInstr *> PrevInsts) {
int64_t Offset;
+ bool OffsetIsScalable;
const MachineOperand *BaseOp;
- if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) ||
+
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI) ||
!BaseOp->isReg() || BaseOp->getReg() != PointerReg)
return SR_Unsuitable;
+ // FIXME: This algorithm assumes instructions have fixed-size offsets.
+ if (OffsetIsScalable)
+ return SR_Unsuitable;
+
// We want the mem access to be issued at a sane offset from PointerReg,
// so that if PointerReg is null then the access reliably page faults.
if (!(MI.mayLoadOrStore() && !MI.isPredicable() &&
diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp
index ed3e159ac566..41eef2fed840 100644
--- a/llvm/lib/CodeGen/InlineSpiller.cpp
+++ b/llvm/lib/CodeGen/InlineSpiller.cpp
@@ -11,7 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#include "Spiller.h"
#include "SplitKit.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
@@ -24,8 +23,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -40,6 +39,8 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
+#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -113,10 +114,10 @@ class HoistSpillHelper : private LiveRangeEdit::Delegate {
/// This is the map from original register to a set containing all its
/// siblings. To hoist a spill to another BB, we need to find a live
/// sibling there and use it as the source of the new spill.
- DenseMap<unsigned, SmallSetVector<unsigned, 16>> Virt2SiblingsMap;
+ DenseMap<Register, SmallSetVector<Register, 16>> Virt2SiblingsMap;
bool isSpillCandBB(LiveInterval &OrigLI, VNInfo &OrigVNI,
- MachineBasicBlock &BB, unsigned &LiveReg);
+ MachineBasicBlock &BB, Register &LiveReg);
void rmRedundantSpills(
SmallPtrSet<MachineInstr *, 16> &Spills,
@@ -175,7 +176,7 @@ class InlineSpiller : public Spiller {
unsigned Original;
// All registers to spill to StackSlot, including the main register.
- SmallVector<unsigned, 8> RegsToSpill;
+ SmallVector<Register, 8> RegsToSpill;
// All COPY instructions to/from snippets.
// They are ignored since both operands refer to the same stack slot.
@@ -211,24 +212,24 @@ private:
bool isSnippet(const LiveInterval &SnipLI);
void collectRegsToSpill();
- bool isRegToSpill(unsigned Reg) { return is_contained(RegsToSpill, Reg); }
+ bool isRegToSpill(Register Reg) { return is_contained(RegsToSpill, Reg); }
- bool isSibling(unsigned Reg);
+ bool isSibling(Register Reg);
bool hoistSpillInsideBB(LiveInterval &SpillLI, MachineInstr &CopyMI);
void eliminateRedundantSpills(LiveInterval &LI, VNInfo *VNI);
void markValueUsed(LiveInterval*, VNInfo*);
- bool canGuaranteeAssignmentAfterRemat(unsigned VReg, MachineInstr &MI);
+ bool canGuaranteeAssignmentAfterRemat(Register VReg, MachineInstr &MI);
bool reMaterializeFor(LiveInterval &, MachineInstr &MI);
void reMaterializeAll();
- bool coalesceStackAccess(MachineInstr *MI, unsigned Reg);
+ bool coalesceStackAccess(MachineInstr *MI, Register Reg);
bool foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>>,
MachineInstr *LoadMI = nullptr);
- void insertReload(unsigned VReg, SlotIndex, MachineBasicBlock::iterator MI);
- void insertSpill(unsigned VReg, bool isKill, MachineBasicBlock::iterator MI);
+ void insertReload(Register VReg, SlotIndex, MachineBasicBlock::iterator MI);
+ void insertSpill(Register VReg, bool isKill, MachineBasicBlock::iterator MI);
- void spillAroundUses(unsigned Reg);
+ void spillAroundUses(Register Reg);
void spillAll();
};
@@ -258,21 +259,21 @@ Spiller *llvm::createInlineSpiller(MachineFunctionPass &pass,
/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register,
/// otherwise return 0.
-static unsigned isFullCopyOf(const MachineInstr &MI, unsigned Reg) {
+static Register isFullCopyOf(const MachineInstr &MI, Register Reg) {
if (!MI.isFullCopy())
- return 0;
+ return Register();
if (MI.getOperand(0).getReg() == Reg)
return MI.getOperand(1).getReg();
if (MI.getOperand(1).getReg() == Reg)
return MI.getOperand(0).getReg();
- return 0;
+ return Register();
}
/// isSnippet - Identify if a live interval is a snippet that should be spilled.
/// It is assumed that SnipLI is a virtual register with the same original as
/// Edit->getReg().
bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
- unsigned Reg = Edit->getReg();
+ Register Reg = Edit->getReg();
// A snippet is a tiny live range with only a single instruction using it
// besides copies to/from Reg or spills/fills. We accept:
@@ -316,7 +317,7 @@ bool InlineSpiller::isSnippet(const LiveInterval &SnipLI) {
/// collectRegsToSpill - Collect live range snippets that only have a single
/// real use.
void InlineSpiller::collectRegsToSpill() {
- unsigned Reg = Edit->getReg();
+ Register Reg = Edit->getReg();
// Main register always spills.
RegsToSpill.assign(1, Reg);
@@ -330,7 +331,7 @@ void InlineSpiller::collectRegsToSpill() {
for (MachineRegisterInfo::reg_instr_iterator
RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end(); RI != E; ) {
MachineInstr &MI = *RI++;
- unsigned SnipReg = isFullCopyOf(MI, Reg);
+ Register SnipReg = isFullCopyOf(MI, Reg);
if (!isSibling(SnipReg))
continue;
LiveInterval &SnipLI = LIS.getInterval(SnipReg);
@@ -345,8 +346,8 @@ void InlineSpiller::collectRegsToSpill() {
}
}
-bool InlineSpiller::isSibling(unsigned Reg) {
- return Register::isVirtualRegister(Reg) && VRM.getOriginal(Reg) == Original;
+bool InlineSpiller::isSibling(Register Reg) {
+ return Reg.isVirtual() && VRM.getOriginal(Reg) == Original;
}
/// It is beneficial to spill to earlier place in the same BB in case
@@ -431,7 +432,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
do {
LiveInterval *LI;
std::tie(LI, VNI) = WorkList.pop_back_val();
- unsigned Reg = LI->reg;
+ Register Reg = LI->reg;
LLVM_DEBUG(dbgs() << "Checking redundant spills for " << VNI->id << '@'
<< VNI->def << " in " << *LI << '\n');
@@ -455,7 +456,7 @@ void InlineSpiller::eliminateRedundantSpills(LiveInterval &SLI, VNInfo *VNI) {
continue;
// Follow sibling copies down the dominator tree.
- if (unsigned DstReg = isFullCopyOf(MI, Reg)) {
+ if (Register DstReg = isFullCopyOf(MI, Reg)) {
if (isSibling(DstReg)) {
LiveInterval &DstLI = LIS.getInterval(DstReg);
VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot());
@@ -517,7 +518,7 @@ void InlineSpiller::markValueUsed(LiveInterval *LI, VNInfo *VNI) {
} while (!WorkList.empty());
}
-bool InlineSpiller::canGuaranteeAssignmentAfterRemat(unsigned VReg,
+bool InlineSpiller::canGuaranteeAssignmentAfterRemat(Register VReg,
MachineInstr &MI) {
if (!RestrictStatepointRemat)
return true;
@@ -536,7 +537,19 @@ bool InlineSpiller::canGuaranteeAssignmentAfterRemat(unsigned VReg,
// At the moment, we only handle this for STATEPOINTs since they're the only
// pseudo op where we've seen this. If we start seeing other instructions
// with the same problem, we need to revisit this.
- return (MI.getOpcode() != TargetOpcode::STATEPOINT);
+ if (MI.getOpcode() != TargetOpcode::STATEPOINT)
+ return true;
+ // For STATEPOINTs we allow re-materialization for fixed arguments only,
+ // hoping that the number of physical registers is enough to cover all fixed
+ // arguments. If that turns out not to be true, we need to revisit this.
+ for (unsigned Idx = StatepointOpers(&MI).getVarIdx(),
+ EndIdx = MI.getNumOperands();
+ Idx < EndIdx; ++Idx) {
+ MachineOperand &MO = MI.getOperand(Idx);
+ if (MO.isReg() && MO.getReg() == VReg)
+ return false;
+ }
+ return true;
}
/// reMaterializeFor - Attempt to rematerialize before MI instead of reloading.
@@ -602,7 +615,7 @@ bool InlineSpiller::reMaterializeFor(LiveInterval &VirtReg, MachineInstr &MI) {
}
// Allocate a new register for the remat.
- unsigned NewVReg = Edit->createFrom(Original);
+ Register NewVReg = Edit->createFrom(Original);
// Finally we can rematerialize OrigMI before MI.
SlotIndex DefIdx =
@@ -641,7 +654,7 @@ void InlineSpiller::reMaterializeAll() {
// Try to remat before all uses of snippets.
bool anyRemat = false;
- for (unsigned Reg : RegsToSpill) {
+ for (Register Reg : RegsToSpill) {
LiveInterval &LI = LIS.getInterval(Reg);
for (MachineRegisterInfo::reg_bundle_iterator
RegI = MRI.reg_bundle_begin(Reg), E = MRI.reg_bundle_end();
@@ -662,7 +675,7 @@ void InlineSpiller::reMaterializeAll() {
return;
// Remove any values that were completely rematted.
- for (unsigned Reg : RegsToSpill) {
+ for (Register Reg : RegsToSpill) {
LiveInterval &LI = LIS.getInterval(Reg);
for (LiveInterval::vni_iterator I = LI.vni_begin(), E = LI.vni_end();
I != E; ++I) {
@@ -692,7 +705,7 @@ void InlineSpiller::reMaterializeAll() {
// So to get rid of unused reg, we need to check whether it has non-dbg
// reference instead of whether it has non-empty interval.
unsigned ResultPos = 0;
- for (unsigned Reg : RegsToSpill) {
+ for (Register Reg : RegsToSpill) {
if (MRI.reg_nodbg_empty(Reg)) {
Edit->eraseVirtReg(Reg);
continue;
@@ -714,9 +727,9 @@ void InlineSpiller::reMaterializeAll() {
//===----------------------------------------------------------------------===//
/// If MI is a load or store of StackSlot, it can be removed.
-bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, unsigned Reg) {
+bool InlineSpiller::coalesceStackAccess(MachineInstr *MI, Register Reg) {
int FI = 0;
- unsigned InstrReg = TII.isLoadFromStackSlot(*MI, FI);
+ Register InstrReg = TII.isLoadFromStackSlot(*MI, FI);
bool IsLoad = InstrReg;
if (!IsLoad)
InstrReg = TII.isStoreToStackSlot(*MI, FI);
@@ -750,7 +763,7 @@ static void dumpMachineInstrRangeWithSlotIndex(MachineBasicBlock::iterator B,
MachineBasicBlock::iterator E,
LiveIntervals const &LIS,
const char *const header,
- unsigned VReg =0) {
+ Register VReg = Register()) {
char NextLine = '\n';
char SlotIndent = '\t';
@@ -795,7 +808,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
return false;
bool WasCopy = MI->isCopy();
- unsigned ImpReg = 0;
+ Register ImpReg;
// Spill subregs if the target allows it.
// We always want to spill subregs for stackmap/patchpoint pseudos.
@@ -864,7 +877,8 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
HSpiller.rmFromMergeableSpills(*MI, FI))
--NumSpills;
LIS.ReplaceMachineInstrInMaps(*MI, *FoldMI);
- if (MI->isCall())
+ // Update the call site info.
+ if (MI->isCandidateForCallSiteEntry())
MI->getMF()->moveCallSiteInfo(MI, FoldMI);
MI->eraseFromParent();
@@ -898,7 +912,7 @@ foldMemoryOperand(ArrayRef<std::pair<MachineInstr *, unsigned>> Ops,
return true;
}
-void InlineSpiller::insertReload(unsigned NewVReg,
+void InlineSpiller::insertReload(Register NewVReg,
SlotIndex Idx,
MachineBasicBlock::iterator MI) {
MachineBasicBlock &MBB = *MI->getParent();
@@ -917,47 +931,51 @@ void InlineSpiller::insertReload(unsigned NewVReg,
/// Check if \p Def fully defines a VReg with an undefined value.
/// If that's the case, that means the value of VReg is actually
/// not relevant.
-static bool isFullUndefDef(const MachineInstr &Def) {
+static bool isRealSpill(const MachineInstr &Def) {
if (!Def.isImplicitDef())
- return false;
+ return true;
assert(Def.getNumOperands() == 1 &&
"Implicit def with more than one definition");
// We can say that the VReg defined by Def is undef, only if it is
// fully defined by Def. Otherwise, some of the lanes may not be
// undef and the value of the VReg matters.
- return !Def.getOperand(0).getSubReg();
+ return Def.getOperand(0).getSubReg();
}
/// insertSpill - Insert a spill of NewVReg after MI.
-void InlineSpiller::insertSpill(unsigned NewVReg, bool isKill,
+void InlineSpiller::insertSpill(Register NewVReg, bool isKill,
MachineBasicBlock::iterator MI) {
+ // Spills are not terminators, so inserting spills after terminators will
+ // violate invariants in MachineVerifier.
+ assert(!MI->isTerminator() && "Inserting a spill after a terminator");
MachineBasicBlock &MBB = *MI->getParent();
MachineInstrSpan MIS(MI, &MBB);
- bool IsRealSpill = true;
- if (isFullUndefDef(*MI)) {
+ MachineBasicBlock::iterator SpillBefore = std::next(MI);
+ bool IsRealSpill = isRealSpill(*MI);
+ if (IsRealSpill)
+ TII.storeRegToStackSlot(MBB, SpillBefore, NewVReg, isKill, StackSlot,
+ MRI.getRegClass(NewVReg), &TRI);
+ else
// Don't spill undef value.
// Anything works for undef, in particular keeping the memory
// uninitialized is a viable option and it saves code size and
// run time.
- BuildMI(MBB, std::next(MI), MI->getDebugLoc(), TII.get(TargetOpcode::KILL))
+ BuildMI(MBB, SpillBefore, MI->getDebugLoc(), TII.get(TargetOpcode::KILL))
.addReg(NewVReg, getKillRegState(isKill));
- IsRealSpill = false;
- } else
- TII.storeRegToStackSlot(MBB, std::next(MI), NewVReg, isKill, StackSlot,
- MRI.getRegClass(NewVReg), &TRI);
- LIS.InsertMachineInstrRangeInMaps(std::next(MI), MIS.end());
+ MachineBasicBlock::iterator Spill = std::next(MI);
+ LIS.InsertMachineInstrRangeInMaps(Spill, MIS.end());
- LLVM_DEBUG(dumpMachineInstrRangeWithSlotIndex(std::next(MI), MIS.end(), LIS,
- "spill"));
+ LLVM_DEBUG(
+ dumpMachineInstrRangeWithSlotIndex(Spill, MIS.end(), LIS, "spill"));
++NumSpills;
if (IsRealSpill)
- HSpiller.addToMergeableSpills(*std::next(MI), StackSlot, Original);
+ HSpiller.addToMergeableSpills(*Spill, StackSlot, Original);
}
/// spillAroundUses - insert spill code around each use of Reg.
-void InlineSpiller::spillAroundUses(unsigned Reg) {
+void InlineSpiller::spillAroundUses(Register Reg) {
LLVM_DEBUG(dbgs() << "spillAroundUses " << printReg(Reg) << '\n');
LiveInterval &OldLI = LIS.getInterval(Reg);
@@ -1000,7 +1018,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
Idx = VNI->def;
// Check for a sibling copy.
- unsigned SibReg = isFullCopyOf(*MI, Reg);
+ Register SibReg = isFullCopyOf(*MI, Reg);
if (SibReg && isSibling(SibReg)) {
// This may actually be a copy between snippets.
if (isRegToSpill(SibReg)) {
@@ -1029,7 +1047,7 @@ void InlineSpiller::spillAroundUses(unsigned Reg) {
// Create a new virtual register for spill/fill.
// FIXME: Infer regclass from instruction alone.
- unsigned NewVReg = Edit->createFrom(Reg);
+ Register NewVReg = Edit->createFrom(Reg);
if (RI.Reads)
insertReload(NewVReg, Idx, MI);
@@ -1070,13 +1088,13 @@ void InlineSpiller::spillAll() {
VRM.assignVirt2StackSlot(Edit->getReg(), StackSlot);
assert(StackInt->getNumValNums() == 1 && "Bad stack interval values");
- for (unsigned Reg : RegsToSpill)
+ for (Register Reg : RegsToSpill)
StackInt->MergeSegmentsInAsValue(LIS.getInterval(Reg),
StackInt->getValNumInfo(0));
LLVM_DEBUG(dbgs() << "Merged spilled regs: " << *StackInt << '\n');
// Spill around uses of all RegsToSpill.
- for (unsigned Reg : RegsToSpill)
+ for (Register Reg : RegsToSpill)
spillAroundUses(Reg);
// Hoisted spills may cause dead code.
@@ -1086,7 +1104,7 @@ void InlineSpiller::spillAll() {
}
// Finally delete the SnippetCopies.
- for (unsigned Reg : RegsToSpill) {
+ for (Register Reg : RegsToSpill) {
for (MachineRegisterInfo::reg_instr_iterator
RI = MRI.reg_instr_begin(Reg), E = MRI.reg_instr_end();
RI != E; ) {
@@ -1099,7 +1117,7 @@ void InlineSpiller::spillAll() {
}
// Delete all spilled registers.
- for (unsigned Reg : RegsToSpill)
+ for (Register Reg : RegsToSpill)
Edit->eraseVirtReg(Reg);
}
@@ -1168,18 +1186,18 @@ bool HoistSpillHelper::rmFromMergeableSpills(MachineInstr &Spill,
/// Check BB to see if it is a possible target BB to place a hoisted spill,
/// i.e., there should be a living sibling of OrigReg at the insert point.
bool HoistSpillHelper::isSpillCandBB(LiveInterval &OrigLI, VNInfo &OrigVNI,
- MachineBasicBlock &BB, unsigned &LiveReg) {
+ MachineBasicBlock &BB, Register &LiveReg) {
SlotIndex Idx;
- unsigned OrigReg = OrigLI.reg;
+ Register OrigReg = OrigLI.reg;
MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, BB);
if (MI != BB.end())
Idx = LIS.getInstructionIndex(*MI);
else
Idx = LIS.getMBBEndIdx(&BB).getPrevSlot();
- SmallSetVector<unsigned, 16> &Siblings = Virt2SiblingsMap[OrigReg];
+ SmallSetVector<Register, 16> &Siblings = Virt2SiblingsMap[OrigReg];
assert(OrigLI.getVNInfoAt(Idx) == &OrigVNI && "Unexpected VNI");
- for (auto const SibReg : Siblings) {
+ for (const Register &SibReg : Siblings) {
LiveInterval &LI = LIS.getInterval(SibReg);
VNInfo *VNI = LI.getVNInfoAt(Idx);
if (VNI) {
@@ -1288,10 +1306,7 @@ void HoistSpillHelper::getVisitOrders(
Orders.push_back(MDT.getBase().getNode(Root));
do {
MachineDomTreeNode *Node = Orders[idx++];
- const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
- unsigned NumChildren = Children.size();
- for (unsigned i = 0; i != NumChildren; ++i) {
- MachineDomTreeNode *Child = Children[i];
+ for (MachineDomTreeNode *Child : Node->children()) {
if (WorkSet.count(Child))
Orders.push_back(Child);
}
@@ -1359,10 +1374,7 @@ void HoistSpillHelper::runHoistSpills(
// Collect spills in subtree of current node (*RIt) to
// SpillsInSubTreeMap[*RIt].first.
- const std::vector<MachineDomTreeNode *> &Children = (*RIt)->getChildren();
- unsigned NumChildren = Children.size();
- for (unsigned i = 0; i != NumChildren; ++i) {
- MachineDomTreeNode *Child = Children[i];
+ for (MachineDomTreeNode *Child : (*RIt)->children()) {
if (SpillsInSubTreeMap.find(Child) == SpillsInSubTreeMap.end())
continue;
// The stmt "SpillsInSubTree = SpillsInSubTreeMap[*RIt].first" below
@@ -1388,7 +1400,7 @@ void HoistSpillHelper::runHoistSpills(
continue;
// Check whether Block is a possible candidate to insert spill.
- unsigned LiveReg = 0;
+ Register LiveReg;
if (!isSpillCandBB(OrigLI, OrigVNI, *Block, LiveReg))
continue;
@@ -1450,12 +1462,12 @@ void HoistSpillHelper::runHoistSpills(
/// inside its subtree to that node. In this way, we can get benefit locally
/// even if hoisting all the equal spills to one cold place is impossible.
void HoistSpillHelper::hoistAllSpills() {
- SmallVector<unsigned, 4> NewVRegs;
+ SmallVector<Register, 4> NewVRegs;
LiveRangeEdit Edit(nullptr, NewVRegs, MF, LIS, &VRM, this);
for (unsigned i = 0, e = MRI.getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = Register::index2VirtReg(i);
- unsigned Original = VRM.getPreSplitReg(Reg);
+ Register Reg = Register::index2VirtReg(i);
+ Register Original = VRM.getPreSplitReg(Reg);
if (!MRI.def_empty(Reg))
Virt2SiblingsMap[Original].insert(Reg);
}
@@ -1503,7 +1515,7 @@ void HoistSpillHelper::hoistAllSpills() {
// Insert hoisted spills.
for (auto const &Insert : SpillsToIns) {
MachineBasicBlock *BB = Insert.first;
- unsigned LiveReg = Insert.second;
+ Register LiveReg = Insert.second;
MachineBasicBlock::iterator MI = IPA.getLastInsertPointIter(OrigLI, *BB);
TII.storeRegToStackSlot(*BB, MI, LiveReg, false, Slot,
MRI.getRegClass(LiveReg), &TRI);
diff --git a/llvm/lib/CodeGen/InterferenceCache.h b/llvm/lib/CodeGen/InterferenceCache.h
index 50c6ac62d194..9019e9f61fa0 100644
--- a/llvm/lib/CodeGen/InterferenceCache.h
+++ b/llvm/lib/CodeGen/InterferenceCache.h
@@ -157,8 +157,6 @@ class LLVM_LIBRARY_VISIBILITY InterferenceCache {
Entry *get(unsigned PhysReg);
public:
- friend class Cursor;
-
InterferenceCache() = default;
~InterferenceCache() {
diff --git a/llvm/lib/CodeGen/InterleavedAccessPass.cpp b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
index 1f9b436378d2..c4d83547a06c 100644
--- a/llvm/lib/CodeGen/InterleavedAccessPass.cpp
+++ b/llvm/lib/CodeGen/InterleavedAccessPass.cpp
@@ -280,7 +280,7 @@ static bool isReInterleaveMask(ArrayRef<int> Mask, unsigned &Factor,
bool InterleavedAccess::lowerInterleavedLoad(
LoadInst *LI, SmallVector<Instruction *, 32> &DeadInsts) {
- if (!LI->isSimple())
+ if (!LI->isSimple() || isa<ScalableVectorType>(LI->getType()))
return false;
SmallVector<ShuffleVectorInst *, 4> Shuffles;
@@ -308,7 +308,8 @@ bool InterleavedAccess::lowerInterleavedLoad(
unsigned Factor, Index;
- unsigned NumLoadElements = LI->getType()->getVectorNumElements();
+ unsigned NumLoadElements =
+ cast<FixedVectorType>(LI->getType())->getNumElements();
// Check if the first shufflevector is DE-interleave shuffle.
if (!isDeInterleaveMask(Shuffles[0]->getShuffleMask(), Factor, Index,
MaxFactor, NumLoadElements))
@@ -421,12 +422,13 @@ bool InterleavedAccess::lowerInterleavedStore(
return false;
ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(SI->getValueOperand());
- if (!SVI || !SVI->hasOneUse())
+ if (!SVI || !SVI->hasOneUse() || isa<ScalableVectorType>(SVI->getType()))
return false;
// Check if the shufflevector is RE-interleave shuffle.
unsigned Factor;
- unsigned OpNumElts = SVI->getOperand(0)->getType()->getVectorNumElements();
+ unsigned OpNumElts =
+ cast<FixedVectorType>(SVI->getOperand(0)->getType())->getNumElements();
if (!isReInterleaveMask(SVI->getShuffleMask(), Factor, MaxFactor, OpNumElts))
return false;
diff --git a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
index 42691b8a6154..f7131926ee65 100644
--- a/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
+++ b/llvm/lib/CodeGen/InterleavedLoadCombinePass.cpp
@@ -673,9 +673,9 @@ public:
ElementInfo *EI;
/// Vector Type
- VectorType *const VTy;
+ FixedVectorType *const VTy;
- VectorInfo(VectorType *VTy)
+ VectorInfo(FixedVectorType *VTy)
: BB(nullptr), PV(nullptr), LIs(), Is(), SVI(nullptr), VTy(VTy) {
EI = new ElementInfo[VTy->getNumElements()];
}
@@ -735,7 +735,7 @@ public:
if (!Op)
return false;
- VectorType *VTy = dyn_cast<VectorType>(Op->getType());
+ FixedVectorType *VTy = dyn_cast<FixedVectorType>(Op->getType());
if (!VTy)
return false;
@@ -785,8 +785,8 @@ public:
/// \returns false if no sensible information can be gathered.
static bool computeFromSVI(ShuffleVectorInst *SVI, VectorInfo &Result,
const DataLayout &DL) {
- VectorType *ArgTy = dyn_cast<VectorType>(SVI->getOperand(0)->getType());
- assert(ArgTy && "ShuffleVector Operand is not a VectorType");
+ FixedVectorType *ArgTy =
+ cast<FixedVectorType>(SVI->getOperand(0)->getType());
// Compute the left hand vector information.
VectorInfo LHS(ArgTy);
@@ -1200,14 +1200,15 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
IRBuilder<> Builder(InsertionPoint);
Type *ETy = InterleavedLoad.front().SVI->getType()->getElementType();
unsigned ElementsPerSVI =
- InterleavedLoad.front().SVI->getType()->getNumElements();
- VectorType *ILTy = VectorType::get(ETy, Factor * ElementsPerSVI);
+ cast<FixedVectorType>(InterleavedLoad.front().SVI->getType())
+ ->getNumElements();
+ FixedVectorType *ILTy = FixedVectorType::get(ETy, Factor * ElementsPerSVI);
SmallVector<unsigned, 4> Indices;
for (unsigned i = 0; i < Factor; i++)
Indices.push_back(i);
InterleavedCost = TTI.getInterleavedMemoryOpCost(
- Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlignment(),
+ Instruction::Load, ILTy, Factor, Indices, InsertionPoint->getAlign(),
InsertionPoint->getPointerAddressSpace());
if (InterleavedCost >= InstructionCost) {
@@ -1220,7 +1221,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
"interleaved.wide.ptrcast");
// Create the wide load and update the MemorySSA.
- auto LI = Builder.CreateAlignedLoad(ILTy, CI, InsertionPoint->getAlignment(),
+ auto LI = Builder.CreateAlignedLoad(ILTy, CI, InsertionPoint->getAlign(),
"interleaved.wide.load");
auto MSSAU = MemorySSAUpdater(&MSSA);
MemoryUse *MSSALoad = cast<MemoryUse>(MSSAU.createMemoryAccessBefore(
@@ -1230,7 +1231,7 @@ bool InterleavedLoadCombineImpl::combine(std::list<VectorInfo> &InterleavedLoad,
// Create the final SVIs and replace all uses.
int i = 0;
for (auto &VI : InterleavedLoad) {
- SmallVector<uint32_t, 4> Mask;
+ SmallVector<int, 4> Mask;
for (unsigned j = 0; j < ElementsPerSVI; j++)
Mask.push_back(i + j * Factor);
@@ -1265,8 +1266,11 @@ bool InterleavedLoadCombineImpl::run() {
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
if (auto SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+ // We don't support scalable vectors in this pass.
+ if (isa<ScalableVectorType>(SVI->getType()))
+ continue;
- Candidates.emplace_back(SVI->getType());
+ Candidates.emplace_back(cast<FixedVectorType>(SVI->getType()));
if (!VectorInfo::computeFromSVI(SVI, Candidates.back(), DL)) {
Candidates.pop_back();
diff --git a/llvm/lib/CodeGen/IntrinsicLowering.cpp b/llvm/lib/CodeGen/IntrinsicLowering.cpp
index 4461a235d6c1..e37c21e76597 100644
--- a/llvm/lib/CodeGen/IntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/IntrinsicLowering.cpp
@@ -12,7 +12,6 @@
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -203,22 +202,21 @@ static Value *LowerCTLZ(LLVMContext &Context, Value *V, Instruction *IP) {
static void ReplaceFPIntrinsicWithCall(CallInst *CI, const char *Fname,
const char *Dname,
const char *LDname) {
- CallSite CS(CI);
switch (CI->getArgOperand(0)->getType()->getTypeID()) {
default: llvm_unreachable("Invalid type in intrinsic");
case Type::FloatTyID:
- ReplaceCallWith(Fname, CI, CS.arg_begin(), CS.arg_end(),
- Type::getFloatTy(CI->getContext()));
+ ReplaceCallWith(Fname, CI, CI->arg_begin(), CI->arg_end(),
+ Type::getFloatTy(CI->getContext()));
break;
case Type::DoubleTyID:
- ReplaceCallWith(Dname, CI, CS.arg_begin(), CS.arg_end(),
- Type::getDoubleTy(CI->getContext()));
+ ReplaceCallWith(Dname, CI, CI->arg_begin(), CI->arg_end(),
+ Type::getDoubleTy(CI->getContext()));
break;
case Type::X86_FP80TyID:
case Type::FP128TyID:
case Type::PPC_FP128TyID:
- ReplaceCallWith(LDname, CI, CS.arg_begin(), CS.arg_end(),
- CI->getArgOperand(0)->getType());
+ ReplaceCallWith(LDname, CI, CI->arg_begin(), CI->arg_end(),
+ CI->getArgOperand(0)->getType());
break;
}
}
@@ -230,7 +228,6 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
const Function *Callee = CI->getCalledFunction();
assert(Callee && "Cannot lower an indirect call!");
- CallSite CS(CI);
switch (Callee->getIntrinsicID()) {
case Intrinsic::not_intrinsic:
report_fatal_error("Cannot lower a call to a non-intrinsic function '"+
@@ -424,6 +421,10 @@ void IntrinsicLowering::LowerIntrinsicCall(CallInst *CI) {
ReplaceFPIntrinsicWithCall(CI, "roundf", "round", "roundl");
break;
}
+ case Intrinsic::roundeven: {
+ ReplaceFPIntrinsicWithCall(CI, "roundevenf", "roundeven", "roundevenl");
+ break;
+ }
case Intrinsic::copysign: {
ReplaceFPIntrinsicWithCall(CI, "copysignf", "copysign", "copysignl");
break;
diff --git a/llvm/lib/CodeGen/LLVMTargetMachine.cpp b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
index 50c178ff7598..b485f2cf7261 100644
--- a/llvm/lib/CodeGen/LLVMTargetMachine.cpp
+++ b/llvm/lib/CodeGen/LLVMTargetMachine.cpp
@@ -157,9 +157,6 @@ bool LLVMTargetMachine::addAsmPrinter(PassManagerBase &PM,
if (!MCE || !MAB)
return true;
- // Don't waste memory on names of temp labels.
- Context.setUseNamesOnTempLabels(false);
-
Triple T(getTargetTriple().str());
AsmStreamer.reset(getTarget().createMCObjectStreamer(
T, Context, std::unique_ptr<MCAsmBackend>(MAB),
diff --git a/llvm/lib/CodeGen/LexicalScopes.cpp b/llvm/lib/CodeGen/LexicalScopes.cpp
index ac3ef0e709f3..690b429832a5 100644
--- a/llvm/lib/CodeGen/LexicalScopes.cpp
+++ b/llvm/lib/CodeGen/LexicalScopes.cpp
@@ -44,6 +44,7 @@ void LexicalScopes::reset() {
AbstractScopeMap.clear();
InlinedLexicalScopeMap.clear();
AbstractScopesList.clear();
+ DominatedBlocks.clear();
}
/// initialize - Scan machine function and constuct lexical scope nest.
@@ -229,24 +230,24 @@ LexicalScopes::getOrCreateAbstractScope(const DILocalScope *Scope) {
return &I->second;
}
-/// constructScopeNest
+/// constructScopeNest - Traverse the Scope tree depth-first, storing
+/// traversal state in WorkStack and recording the depth-first
+/// numbering (setDFSIn, setDFSOut) for edge classification.
void LexicalScopes::constructScopeNest(LexicalScope *Scope) {
assert(Scope && "Unable to calculate scope dominance graph!");
- SmallVector<LexicalScope *, 4> WorkStack;
- WorkStack.push_back(Scope);
+ SmallVector<std::pair<LexicalScope *, size_t>, 4> WorkStack;
+ WorkStack.push_back(std::make_pair(Scope, 0));
unsigned Counter = 0;
while (!WorkStack.empty()) {
- LexicalScope *WS = WorkStack.back();
+ auto &ScopePosition = WorkStack.back();
+ LexicalScope *WS = ScopePosition.first;
+ size_t ChildNum = ScopePosition.second++;
const SmallVectorImpl<LexicalScope *> &Children = WS->getChildren();
- bool visitedChildren = false;
- for (auto &ChildScope : Children)
- if (!ChildScope->getDFSOut()) {
- WorkStack.push_back(ChildScope);
- visitedChildren = true;
- ChildScope->setDFSIn(++Counter);
- break;
- }
- if (!visitedChildren) {
+ if (ChildNum < Children.size()) {
+ auto &ChildScope = Children[ChildNum];
+ WorkStack.push_back(std::make_pair(ChildScope, 0));
+ ChildScope->setDFSIn(++Counter);
+ } else {
WorkStack.pop_back();
WS->setDFSOut(++Counter);
}
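
// A minimal standalone sketch of the iterative numbering scheme used above, on
// a hypothetical Node type rather than LexicalScope: each stack entry pairs a
// node with the index of the next child to visit, so a node is numbered once
// on the way down (DFSIn) and once after all of its children (DFSOut). Keeping
// the child index on the stack avoids rescanning the child list on every
// iteration, which the previous getDFSOut()-based search had to do.

#include <cassert>
#include <cstddef>
#include <utility>
#include <vector>

struct Node {
  std::vector<Node *> Children;
  unsigned DFSIn = 0, DFSOut = 0;
};

void numberDFS(Node *Root) {
  unsigned Counter = 0;
  // Each stack entry is {node, index of the next child to descend into}.
  std::vector<std::pair<Node *, std::size_t>> WorkStack{{Root, 0}};
  Root->DFSIn = ++Counter;
  while (!WorkStack.empty()) {
    auto &Top = WorkStack.back();
    Node *N = Top.first;
    std::size_t ChildNum = Top.second++;
    if (ChildNum < N->Children.size()) {
      Node *Child = N->Children[ChildNum];
      Child->DFSIn = ++Counter; // Numbered on the way down.
      WorkStack.push_back({Child, 0});
    } else {
      N->DFSOut = ++Counter; // Numbered once its children are exhausted.
      WorkStack.pop_back();
    }
  }
}

int main() {
  Node Leaf, Mid, Root;
  Mid.Children = {&Leaf};
  Root.Children = {&Mid};
  numberDFS(&Root);
  // Each node's [DFSIn, DFSOut] interval nests inside its parent's interval.
  assert(Root.DFSIn < Mid.DFSIn && Mid.DFSIn < Leaf.DFSIn);
  assert(Leaf.DFSOut < Mid.DFSOut && Mid.DFSOut < Root.DFSOut);
  return 0;
}
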
@@ -291,13 +292,17 @@ void LexicalScopes::getMachineBasicBlocks(
return;
}
+ // The scope ranges can cover multiple basic blocks in each span. Iterate over
+ // all blocks (in the order they are in the function) until we reach the one
+ // containing the end of the span.
SmallVectorImpl<InsnRange> &InsnRanges = Scope->getRanges();
for (auto &R : InsnRanges)
- MBBs.insert(R.first->getParent());
+ for (auto CurMBBIt = R.first->getParent()->getIterator(),
+ EndBBIt = std::next(R.second->getParent()->getIterator());
+ CurMBBIt != EndBBIt; CurMBBIt++)
+ MBBs.insert(&*CurMBBIt);
}
-/// dominates - Return true if DebugLoc's lexical scope dominates at least one
-/// machine instruction's lexical scope in a given machine basic block.
bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {
assert(MF && "Unexpected uninitialized LexicalScopes object!");
LexicalScope *Scope = getOrCreateLexicalScope(DL);
@@ -308,14 +313,18 @@ bool LexicalScopes::dominates(const DILocation *DL, MachineBasicBlock *MBB) {
if (Scope == CurrentFnLexicalScope && MBB->getParent() == MF)
return true;
- bool Result = false;
- for (auto &I : *MBB) {
- if (const DILocation *IDL = I.getDebugLoc())
- if (LexicalScope *IScope = getOrCreateLexicalScope(IDL))
- if (Scope->dominates(IScope))
- return true;
+ // Fetch all the blocks in DL's scope. Because the range / block list also
+ // contains any subscopes, any instruction that DL dominates can be found in
+ // the block set.
+ //
+ // Cache the set of fetched blocks to avoid repeatedly recomputing the set in
+ // the LiveDebugValues pass.
+ std::unique_ptr<BlockSetT> &Set = DominatedBlocks[DL];
+ if (!Set) {
+ Set = std::make_unique<BlockSetT>();
+ getMachineBasicBlocks(DL, *Set);
}
- return Result;
+ return Set->count(MBB) != 0;
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
diff --git a/llvm/lib/CodeGen/LiveDebugValues.cpp b/llvm/lib/CodeGen/LiveDebugValues.cpp
index 2226c10b49a4..07a275b546f6 100644
--- a/llvm/lib/CodeGen/LiveDebugValues.cpp
+++ b/llvm/lib/CodeGen/LiveDebugValues.cpp
@@ -6,32 +6,107 @@
//
//===----------------------------------------------------------------------===//
///
-/// This pass implements a data flow analysis that propagates debug location
-/// information by inserting additional DBG_VALUE insts into the machine
-/// instruction stream. Before running, each DBG_VALUE inst corresponds to a
-/// source assignment of a variable. Afterwards, a DBG_VALUE inst specifies a
-/// variable location for the current basic block (see SourceLevelDebugging.rst).
+/// \file LiveDebugValues.cpp
///
-/// This is a separate pass from DbgValueHistoryCalculator to facilitate
-/// testing and improve modularity.
+/// LiveDebugValues is an optimistic "available expressions" dataflow
+/// algorithm. The set of expressions is the set of machine locations
+/// (registers, spill slots, constants) in which a variable fragment might be
+/// located, qualified by a DIExpression and indirectness flag, while each
+/// variable is identified by a DebugVariable object. The availability of an
+/// expression begins when a DBG_VALUE instruction specifies the location of a
+/// DebugVariable, and continues until that location is clobbered or
+/// re-specified by a different DBG_VALUE for the same DebugVariable.
///
-/// Each variable location is represented by a VarLoc object that identifies the
-/// source variable, its current machine-location, and the DBG_VALUE inst that
-/// specifies the location. Each VarLoc is indexed in the (function-scope)
-/// VarLocMap, giving each VarLoc a unique index. Rather than operate directly
-/// on machine locations, the dataflow analysis in this pass identifies
-/// locations by their index in the VarLocMap, meaning all the variable
-/// locations in a block can be described by a sparse vector of VarLocMap
-/// indexes.
+/// The canonical "available expressions" problem doesn't have expression
+/// clobbering; instead, when a variable is re-assigned, any expressions using
+/// that variable are invalidated. LiveDebugValues can map onto "available
+/// expressions" by having every register represented by a variable, which is
+/// used in an expression that becomes available at a DBG_VALUE instruction.
+/// When the register is clobbered, its variable is effectively reassigned, and
+/// expressions computed from it become unavailable. A similar construct is
+/// needed when a DebugVariable has its location re-specified, to invalidate
+/// all other locations for that DebugVariable.
+///
+/// Using the dataflow analysis to compute the available expressions, we create
+/// a DBG_VALUE at the beginning of each block where the expression is
+/// live-in. This propagates variable locations into every basic block where
+/// the location can be determined, rather than only having DBG_VALUEs in blocks
+/// where locations are specified due to an assignment or some optimization.
+/// Movements of values between registers and spill slots are also annotated
+/// with DBG_VALUEs to track variable values between locations. All this allows
+/// DbgEntityHistoryCalculator to focus on only the locations within individual
+/// blocks, facilitating testing and improving modularity.
+///
+/// We follow an optimistic dataflow approach, with this lattice:
+///
+/// \verbatim
+/// ┬ "Unknown"
+/// |
+/// v
+/// True
+/// |
+/// v
+/// ⊥ False
+/// \endverbatim
+/// "True" signifies that the expression is available (and thus a
+/// DebugVariable's location is the corresponding register), while "False"
+/// signifies that the expression is unavailable. "Unknown"s never survive to
+/// the end of the analysis (see below).
+///
+/// Formally, all DebugVariable locations that are live-out of a block are
+/// initialized to \top. A block's live-in values take the meet of the lattice
+/// values of every predecessor's live-outs, except for the entry block, where
+/// all live-ins are \bot. The usual dataflow propagation occurs: the transfer
+/// function for a block assigns an expression for a DebugVariable to be "True"
+/// if a DBG_VALUE in the block specifies it; "False" if the location is
+/// clobbered; or the live-in value if it is unaffected by the block. We
+/// visit each block in reverse post-order until a fixed point is reached. The
+/// solution produced is maximal.
+///
+/// Intuitively, we start by assuming that every expression / variable location
+/// is at least "True", and then propagate "False" from the entry block and any
+/// clobbers until there are no more changes to make. This gives us an accurate
+/// solution because all incorrect locations will have a "False" propagated into
+/// them. It also gives us a solution that copes well with loops by assuming
+/// that variable locations are live through every loop, and then using the
+/// dataflow to remove those that are not.
+///
+/// Within LiveDebugValues: each variable location is represented by a
+/// VarLoc object that identifies the source variable, its current
+/// machine-location, and the DBG_VALUE inst that specifies the location. Each
+/// VarLoc is indexed in the (function-scope) \p VarLocMap, giving each VarLoc a
+/// unique index. Rather than operate directly on machine locations, the
+/// dataflow analysis in this pass identifies locations by their index in the
+/// VarLocMap, meaning all the variable locations in a block can be described
+/// by a sparse vector of VarLocMap indices.
+///
+/// All the storage for the dataflow analysis is local to the ExtendRanges
+/// method and passed down to helper methods. "OutLocs" and "InLocs" record the
+/// in and out lattice values for each block. "OpenRanges" maintains a list of
+/// variable locations and, with the "process" method, evaluates the transfer
+/// function of each block. "flushPendingLocs" installs DBG_VALUEs for each
+/// live-in location at the start of blocks, while "Transfers" records
+/// transfers of values between machine-locations.
+///
+/// We avoid explicitly representing the "Unknown" (\top) lattice value in the
+/// implementation. Instead, unvisited blocks implicitly have all lattice
+/// values set to "Unknown". After being visited, there will be a path back to
+/// the entry block where the lattice value is "False", and as the transfer
+/// function cannot make new "Unknown" locations, there are no scenarios where
+/// a block can have an "Unknown" location after being visited. Similarly, we
+/// don't enumerate all possible variable locations before exploring the
+/// function: when a new location is discovered, all blocks previously explored
+/// were implicitly "False" but unrecorded, and become explicitly "False" when
+/// a new VarLoc is created with its bit not set in predecessor InLocs or
+/// OutLocs.
///
//===----------------------------------------------------------------------===//
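
// A minimal sketch of the three-point lattice and its meet described in the
// header comment above; the enum and helper names here are hypothetical, not
// part of the pass.

#include <cassert>

// Unknown ("top") > True > False ("bottom").
enum class Avail { Unknown, True, False };

// Meet (greatest lower bound), used when joining predecessor live-outs.
Avail meet(Avail A, Avail B) {
  if (A == Avail::Unknown)
    return B; // Top is the identity of meet.
  if (B == Avail::Unknown)
    return A;
  if (A == Avail::False || B == Avail::False)
    return Avail::False; // Bottom absorbs everything.
  return Avail::True;
}

int main() {
  // An unvisited predecessor (Unknown) does not pull a live-through location
  // down; a visited predecessor lacking the location does.
  assert(meet(Avail::Unknown, Avail::True) == Avail::True);
  assert(meet(Avail::True, Avail::False) == Avail::False);
  return 0;
}
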
+#include "llvm/ADT/CoalescingBitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/SparseBitVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/UniqueVector.h"
#include "llvm/CodeGen/LexicalScopes.h"
@@ -64,6 +139,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -78,7 +154,18 @@ using namespace llvm;
#define DEBUG_TYPE "livedebugvalues"
STATISTIC(NumInserted, "Number of DBG_VALUE instructions inserted");
-STATISTIC(NumRemoved, "Number of DBG_VALUE instructions removed");
+
+// Options to prevent pathological compile-time behavior. If InputBBLimit and
+// InputDbgValueLimit are both exceeded, range extension is disabled.
+static cl::opt<unsigned> InputBBLimit(
+ "livedebugvalues-input-bb-limit",
+ cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"),
+ cl::init(10000), cl::Hidden);
+static cl::opt<unsigned> InputDbgValueLimit(
+ "livedebugvalues-input-dbg-value-limit",
+ cl::desc(
+ "Maximum input DBG_VALUE insts supported by debug range extension"),
+ cl::init(50000), cl::Hidden);
// If @MI is a DBG_VALUE with debug value described by a defined
// register, returns the number of this register. In the other case, returns 0.
@@ -87,7 +174,8 @@ static Register isDbgValueDescribedByReg(const MachineInstr &MI) {
assert(MI.getNumOperands() == 4 && "malformed DBG_VALUE");
// If location of variable is described using a register (directly
// or indirectly), this register is always a first operand.
- return MI.getOperand(0).isReg() ? MI.getOperand(0).getReg() : Register();
+ return MI.getDebugOperand(0).isReg() ? MI.getDebugOperand(0).getReg()
+ : Register();
}
/// If \p Op is a stack or frame register return true, otherwise return false.
@@ -101,7 +189,7 @@ static bool isRegOtherThanSPAndFP(const MachineOperand &Op,
const MachineFunction *MF = MI.getParent()->getParent();
const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
- unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
Register FP = TRI->getFrameRegister(*MF);
Register Reg = Op.getReg();
@@ -110,8 +198,72 @@ static bool isRegOtherThanSPAndFP(const MachineOperand &Op,
namespace {
+// Max out the number of statically allocated elements in DefinedRegsSet, as
+// this prevents fallback to std::set::count() operations.
using DefinedRegsSet = SmallSet<Register, 32>;
+using VarLocSet = CoalescingBitVector<uint64_t>;
+
+/// A type-checked pair of {Register Location (or 0), Index}, used to index
+/// into a \ref VarLocMap. This can be efficiently converted to a 64-bit int
+/// for insertion into a \ref VarLocSet, and efficiently converted back. The
+/// type-checker helps ensure that the conversions aren't lossy.
+///
+/// Why encode a location /into/ the VarLocMap index? This makes it possible
+/// to find the open VarLocs killed by a register def very quickly. This is a
+/// performance-critical operation for LiveDebugValues.
+struct LocIndex {
+ using u32_location_t = uint32_t;
+ using u32_index_t = uint32_t;
+
+ u32_location_t Location; // Physical registers live in the range [1;2^30) (see
+ // \ref MCRegister), so we have plenty of range left
+ // here to encode non-register locations.
+ u32_index_t Index;
+
+ /// The first location greater than 0 that is not reserved for VarLocs of
+ /// kind RegisterKind.
+ static constexpr u32_location_t kFirstInvalidRegLocation = 1 << 30;
+
+ /// A special location reserved for VarLocs of kind SpillLocKind.
+ static constexpr u32_location_t kSpillLocation = kFirstInvalidRegLocation;
+
+ /// A special location reserved for VarLocs of kind EntryValueBackupKind and
+ /// EntryValueCopyBackupKind.
+ static constexpr u32_location_t kEntryValueBackupLocation =
+ kFirstInvalidRegLocation + 1;
+
+ LocIndex(u32_location_t Location, u32_index_t Index)
+ : Location(Location), Index(Index) {}
+
+ uint64_t getAsRawInteger() const {
+ return (static_cast<uint64_t>(Location) << 32) | Index;
+ }
+
+ template<typename IntT> static LocIndex fromRawInteger(IntT ID) {
+ static_assert(std::is_unsigned<IntT>::value &&
+ sizeof(ID) == sizeof(uint64_t),
+ "Cannot convert raw integer to LocIndex");
+ return {static_cast<u32_location_t>(ID >> 32),
+ static_cast<u32_index_t>(ID)};
+ }
+
+ /// Get the start of the interval reserved for VarLocs of kind RegisterKind
+ /// which reside in \p Reg. The end is at rawIndexForReg(Reg+1)-1.
+ static uint64_t rawIndexForReg(uint32_t Reg) {
+ return LocIndex(Reg, 0).getAsRawInteger();
+ }
+
+ /// Return a range covering all set indices in the interval reserved for
+ /// \p Location in \p Set.
+ static auto indexRangeForLocation(const VarLocSet &Set,
+ u32_location_t Location) {
+ uint64_t Start = LocIndex(Location, 0).getAsRawInteger();
+ uint64_t End = LocIndex(Location + 1, 0).getAsRawInteger();
+ return Set.half_open_range(Start, End);
+ }
+};
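
// A simplified stand-in for the encoding above, showing why the location goes
// into the upper 32 bits: all VarLoc IDs for one location form a contiguous
// 64-bit interval, so "kill everything in register R" becomes a range query.
// std::set stands in here for the real CoalescingBitVector.

#include <cassert>
#include <cstdint>
#include <set>

static uint64_t rawIndex(uint32_t Location, uint32_t Index) {
  return (static_cast<uint64_t>(Location) << 32) | Index;
}

int main() {
  std::set<uint64_t> OpenVarLocs = {rawIndex(5, 0), rawIndex(5, 7),
                                    rawIndex(6, 2)};

  // Everything living in location 5 occupies the half-open interval
  // [rawIndex(5, 0), rawIndex(6, 0)), so a clobber of location 5 only walks
  // that range instead of every open VarLoc.
  unsigned Killed = 0;
  for (auto It = OpenVarLocs.lower_bound(rawIndex(5, 0));
       It != OpenVarLocs.end() && *It < rawIndex(6, 0); ++It)
    ++Killed;
  assert(Killed == 2);
  return 0;
}
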
+
class LiveDebugValues : public MachineFunctionPass {
private:
const TargetRegisterInfo *TRI;
@@ -119,28 +271,10 @@ private:
const TargetFrameLowering *TFI;
BitVector CalleeSavedRegs;
LexicalScopes LS;
+ VarLocSet::Allocator Alloc;
enum struct TransferKind { TransferCopy, TransferSpill, TransferRestore };
- /// Keeps track of lexical scopes associated with a user value's source
- /// location.
- class UserValueScopes {
- DebugLoc DL;
- LexicalScopes &LS;
- SmallPtrSet<const MachineBasicBlock *, 4> LBlocks;
-
- public:
- UserValueScopes(DebugLoc D, LexicalScopes &L) : DL(std::move(D)), LS(L) {}
-
- /// Return true if current scope dominates at least one machine
- /// instruction in a given machine basic block.
- bool dominates(MachineBasicBlock *MBB) {
- if (LBlocks.empty())
- LS.getMachineBasicBlocks(DL, LBlocks);
- return LBlocks.count(MBB) != 0 || LS.dominates(DL, MBB);
- }
- };
-
using FragmentInfo = DIExpression::FragmentInfo;
using OptFragmentInfo = Optional<DIExpression::FragmentInfo>;
@@ -154,6 +288,9 @@ private:
bool operator==(const SpillLoc &Other) const {
return SpillBase == Other.SpillBase && SpillOffset == Other.SpillOffset;
}
+ bool operator!=(const SpillLoc &Other) const {
+ return !(*this == Other);
+ }
};
/// Identity of the variable at this location.
@@ -166,7 +303,6 @@ private:
/// is moved.
const MachineInstr &MI;
- mutable UserValueScopes UVS;
enum VarLocKind {
InvalidKind = 0,
RegisterKind,
@@ -191,7 +327,7 @@ private:
VarLoc(const MachineInstr &MI, LexicalScopes &LS)
: Var(MI.getDebugVariable(), MI.getDebugExpression(),
MI.getDebugLoc()->getInlinedAt()),
- Expr(MI.getDebugExpression()), MI(MI), UVS(MI.getDebugLoc(), LS) {
+ Expr(MI.getDebugExpression()), MI(MI) {
static_assert((sizeof(Loc) == sizeof(uint64_t)),
"hash does not cover all members of Loc");
assert(MI.isDebugValue() && "not a DBG_VALUE");
@@ -199,15 +335,15 @@ private:
if (int RegNo = isDbgValueDescribedByReg(MI)) {
Kind = RegisterKind;
Loc.RegNo = RegNo;
- } else if (MI.getOperand(0).isImm()) {
+ } else if (MI.getDebugOperand(0).isImm()) {
Kind = ImmediateKind;
- Loc.Immediate = MI.getOperand(0).getImm();
- } else if (MI.getOperand(0).isFPImm()) {
+ Loc.Immediate = MI.getDebugOperand(0).getImm();
+ } else if (MI.getDebugOperand(0).isFPImm()) {
Kind = ImmediateKind;
- Loc.FPImm = MI.getOperand(0).getFPImm();
- } else if (MI.getOperand(0).isCImm()) {
+ Loc.FPImm = MI.getDebugOperand(0).getFPImm();
+ } else if (MI.getDebugOperand(0).isCImm()) {
Kind = ImmediateKind;
- Loc.CImm = MI.getOperand(0).getCImm();
+ Loc.CImm = MI.getDebugOperand(0).getCImm();
}
// We create the debug entry values from the factory functions rather than
@@ -218,7 +354,7 @@ private:
/// Take the variable and machine-location in DBG_VALUE MI, and build an
/// entry location using the given expression.
static VarLoc CreateEntryLoc(const MachineInstr &MI, LexicalScopes &LS,
- const DIExpression *EntryExpr, unsigned Reg) {
+ const DIExpression *EntryExpr, Register Reg) {
VarLoc VL(MI, LS);
assert(VL.Kind == RegisterKind);
VL.Kind = EntryValueKind;
@@ -247,7 +383,7 @@ private:
static VarLoc CreateEntryCopyBackupLoc(const MachineInstr &MI,
LexicalScopes &LS,
const DIExpression *EntryExpr,
- unsigned NewReg) {
+ Register NewReg) {
VarLoc VL(MI, LS);
assert(VL.Kind == RegisterKind);
VL.Kind = EntryValueCopyBackupKind;
@@ -259,7 +395,7 @@ private:
/// Copy the register location in DBG_VALUE MI, updating the register to
/// be NewReg.
static VarLoc CreateCopyLoc(const MachineInstr &MI, LexicalScopes &LS,
- unsigned NewReg) {
+ Register NewReg) {
VarLoc VL(MI, LS);
assert(VL.Kind == RegisterKind);
VL.Loc.RegNo = NewReg;
@@ -287,6 +423,7 @@ private:
const auto &IID = MI.getDesc();
const DILocalVariable *Var = MI.getDebugVariable();
const DIExpression *DIExpr = MI.getDebugExpression();
+ NumInserted++;
switch (Kind) {
case EntryValueKind:
@@ -294,8 +431,8 @@ private:
// expression. The register location of such DBG_VALUE is always the one
// from the entry DBG_VALUE, it does not matter if the entry value was
// copied in to another register due to some optimizations.
- return BuildMI(MF, DbgLoc, IID, Indirect, MI.getOperand(0).getReg(),
- Var, Expr);
+ return BuildMI(MF, DbgLoc, IID, Indirect,
+ MI.getDebugOperand(0).getReg(), Var, Expr);
case RegisterKind:
// Register locations are like the source DBG_VALUE, but with the
// register number from this VarLoc.
@@ -311,7 +448,7 @@ private:
return BuildMI(MF, DbgLoc, IID, true, Base, Var, SpillExpr);
}
case ImmediateKind: {
- MachineOperand MO = MI.getOperand(0);
+ MachineOperand MO = MI.getDebugOperand(0);
return BuildMI(MF, DbgLoc, IID, Indirect, MO, Var, DIExpr);
}
case EntryValueBackupKind:
@@ -357,41 +494,42 @@ private:
/// Determine whether the lexical scope of this value's debug location
/// dominates MBB.
- bool dominates(MachineBasicBlock &MBB) const { return UVS.dominates(&MBB); }
+ bool dominates(LexicalScopes &LS, MachineBasicBlock &MBB) const {
+ return LS.dominates(MI.getDebugLoc().get(), &MBB);
+ }
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
// TRI can be null.
void dump(const TargetRegisterInfo *TRI, raw_ostream &Out = dbgs()) const {
- dbgs() << "VarLoc(";
+ Out << "VarLoc(";
switch (Kind) {
case RegisterKind:
case EntryValueKind:
case EntryValueBackupKind:
case EntryValueCopyBackupKind:
- dbgs() << printReg(Loc.RegNo, TRI);
+ Out << printReg(Loc.RegNo, TRI);
break;
case SpillLocKind:
- dbgs() << printReg(Loc.SpillLocation.SpillBase, TRI);
- dbgs() << "[" << Loc.SpillLocation.SpillOffset << "]";
+ Out << printReg(Loc.SpillLocation.SpillBase, TRI);
+ Out << "[" << Loc.SpillLocation.SpillOffset << "]";
break;
case ImmediateKind:
- dbgs() << Loc.Immediate;
+ Out << Loc.Immediate;
break;
case InvalidKind:
llvm_unreachable("Invalid VarLoc in dump method");
}
- dbgs() << ", \"" << Var.getVariable()->getName() << "\", " << *Expr
- << ", ";
+ Out << ", \"" << Var.getVariable()->getName() << "\", " << *Expr << ", ";
if (Var.getInlinedAt())
- dbgs() << "!" << Var.getInlinedAt()->getMetadataID() << ")\n";
+ Out << "!" << Var.getInlinedAt()->getMetadataID() << ")\n";
else
- dbgs() << "(null))";
+ Out << "(null))";
if (isEntryBackupLoc())
- dbgs() << " (backup loc)\n";
+ Out << " (backup loc)\n";
else
- dbgs() << "\n";
+ Out << "\n";
}
#endif
@@ -407,12 +545,62 @@ private:
}
};
- using VarLocMap = UniqueVector<VarLoc>;
- using VarLocSet = SparseBitVector<>;
- using VarLocInMBB = SmallDenseMap<const MachineBasicBlock *, VarLocSet>;
+ /// VarLocMap is used for two things:
+ /// 1) Assigning a unique LocIndex to a VarLoc. This LocIndex can be used to
+ /// virtually insert a VarLoc into a VarLocSet.
+ /// 2) Given a LocIndex, look up the unique associated VarLoc.
+ class VarLocMap {
+ /// Map a VarLoc to an index within the vector reserved for its location
+ /// within Loc2Vars.
+ std::map<VarLoc, LocIndex::u32_index_t> Var2Index;
+
+ /// Map a location to a vector which holds VarLocs which live in that
+ /// location.
+ SmallDenseMap<LocIndex::u32_location_t, std::vector<VarLoc>> Loc2Vars;
+
+ /// Determine the 32-bit location reserved for \p VL, based on its kind.
+ static LocIndex::u32_location_t getLocationForVar(const VarLoc &VL) {
+ switch (VL.Kind) {
+ case VarLoc::RegisterKind:
+ assert((VL.Loc.RegNo < LocIndex::kFirstInvalidRegLocation) &&
+ "Physreg out of range?");
+ return VL.Loc.RegNo;
+ case VarLoc::SpillLocKind:
+ return LocIndex::kSpillLocation;
+ case VarLoc::EntryValueBackupKind:
+ case VarLoc::EntryValueCopyBackupKind:
+ return LocIndex::kEntryValueBackupLocation;
+ default:
+ return 0;
+ }
+ }
+
+ public:
+ /// Retrieve a unique LocIndex for \p VL.
+ LocIndex insert(const VarLoc &VL) {
+ LocIndex::u32_location_t Location = getLocationForVar(VL);
+ LocIndex::u32_index_t &Index = Var2Index[VL];
+ if (!Index) {
+ auto &Vars = Loc2Vars[Location];
+ Vars.push_back(VL);
+ Index = Vars.size();
+ }
+ return {Location, Index - 1};
+ }
+
+ /// Retrieve the unique VarLoc associated with \p ID.
+ const VarLoc &operator[](LocIndex ID) const {
+ auto LocIt = Loc2Vars.find(ID.Location);
+ assert(LocIt != Loc2Vars.end() && "Location not tracked");
+ return LocIt->second[ID.Index];
+ }
+ };
+
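
// A minimal sketch of the bucketed interning idea behind VarLocMap, with
// std::string standing in for VarLoc and a plain unsigned location bucket
// (a hypothetical simplification, not the pass's own types).

#include <cassert>
#include <cstdint>
#include <map>
#include <string>
#include <utility>
#include <vector>

struct Interner {
  std::map<std::string, uint32_t> Value2Index; // 1-based; 0 means "unseen".
  std::map<uint32_t, std::vector<std::string>> Bucket2Values;

  // Return a stable {bucket, index-within-bucket} ID; inserting the same
  // value twice yields the same ID.
  std::pair<uint32_t, uint32_t> insert(uint32_t Bucket, const std::string &V) {
    uint32_t &Index = Value2Index[V];
    if (!Index) {
      auto &Vals = Bucket2Values[Bucket];
      Vals.push_back(V);
      Index = Vals.size();
    }
    return {Bucket, Index - 1};
  }

  const std::string &lookup(uint32_t Bucket, uint32_t Index) const {
    return Bucket2Values.at(Bucket)[Index];
  }
};

int main() {
  Interner M;
  auto A = M.insert(5, "x in reg5");
  auto B = M.insert(5, "x in reg5"); // Same value, same ID.
  assert(A == B);
  assert(M.lookup(A.first, A.second) == "x in reg5");
  return 0;
}
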
+ using VarLocInMBB =
+ SmallDenseMap<const MachineBasicBlock *, std::unique_ptr<VarLocSet>>;
struct TransferDebugPair {
- MachineInstr *TransferInst; /// Instruction where this transfer occurs.
- unsigned LocationID; /// Location number for the transfer dest.
+ MachineInstr *TransferInst; ///< Instruction where this transfer occurs.
+ LocIndex LocationID; ///< Location number for the transfer dest.
};
using TransferMap = SmallVector<TransferDebugPair, 4>;
@@ -441,13 +629,14 @@ private:
class OpenRangesSet {
VarLocSet VarLocs;
// Map the DebugVariable to recent primary location ID.
- SmallDenseMap<DebugVariable, unsigned, 8> Vars;
+ SmallDenseMap<DebugVariable, LocIndex, 8> Vars;
// Map the DebugVariable to recent backup location ID.
- SmallDenseMap<DebugVariable, unsigned, 8> EntryValuesBackupVars;
+ SmallDenseMap<DebugVariable, LocIndex, 8> EntryValuesBackupVars;
OverlapMap &OverlappingFragments;
public:
- OpenRangesSet(OverlapMap &_OLapMap) : OverlappingFragments(_OLapMap) {}
+ OpenRangesSet(VarLocSet::Allocator &Alloc, OverlapMap &_OLapMap)
+ : VarLocs(Alloc), OverlappingFragments(_OLapMap) {}
const VarLocSet &getVarLocs() const { return VarLocs; }
@@ -459,17 +648,18 @@ private:
void erase(const VarLocSet &KillSet, const VarLocMap &VarLocIDs);
/// Insert a new range into the set.
- void insert(unsigned VarLocID, const VarLoc &VL);
+ void insert(LocIndex VarLocID, const VarLoc &VL);
/// Insert a set of ranges.
void insertFromLocSet(const VarLocSet &ToLoad, const VarLocMap &Map) {
- for (unsigned Id : ToLoad) {
- const VarLoc &VarL = Map[Id];
- insert(Id, VarL);
+ for (uint64_t ID : ToLoad) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VarL = Map[Idx];
+ insert(Idx, VarL);
}
}
- llvm::Optional<unsigned> getEntryValueBackup(DebugVariable Var);
+ llvm::Optional<LocIndex> getEntryValueBackup(DebugVariable Var);
/// Empty the set.
void clear() {
@@ -485,8 +675,57 @@ private:
"open ranges are inconsistent");
return VarLocs.empty();
}
+
+ /// Get an empty range of VarLoc IDs.
+ auto getEmptyVarLocRange() const {
+ return iterator_range<VarLocSet::const_iterator>(getVarLocs().end(),
+ getVarLocs().end());
+ }
+
+ /// Get all set IDs for VarLocs of kind RegisterKind in \p Reg.
+ auto getRegisterVarLocs(Register Reg) const {
+ return LocIndex::indexRangeForLocation(getVarLocs(), Reg);
+ }
+
+ /// Get all set IDs for VarLocs of kind SpillLocKind.
+ auto getSpillVarLocs() const {
+ return LocIndex::indexRangeForLocation(getVarLocs(),
+ LocIndex::kSpillLocation);
+ }
+
+ /// Get all set IDs for VarLocs of kind EntryValueBackupKind or
+ /// EntryValueCopyBackupKind.
+ auto getEntryValueBackupVarLocs() const {
+ return LocIndex::indexRangeForLocation(
+ getVarLocs(), LocIndex::kEntryValueBackupLocation);
+ }
};
+ /// Collect all VarLoc IDs from \p CollectFrom for VarLocs of kind
+ /// RegisterKind which are located in any reg in \p Regs. Insert collected IDs
+ /// into \p Collected.
+ void collectIDsForRegs(VarLocSet &Collected, const DefinedRegsSet &Regs,
+ const VarLocSet &CollectFrom) const;
+
+ /// Get the registers which are used by VarLocs of kind RegisterKind tracked
+ /// by \p CollectFrom.
+ void getUsedRegs(const VarLocSet &CollectFrom,
+ SmallVectorImpl<uint32_t> &UsedRegs) const;
+
+ VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB, VarLocInMBB &Locs) {
+ std::unique_ptr<VarLocSet> &VLS = Locs[MBB];
+ if (!VLS)
+ VLS = std::make_unique<VarLocSet>(Alloc);
+ return *VLS.get();
+ }
+
+ const VarLocSet &getVarLocsInMBB(const MachineBasicBlock *MBB,
+ const VarLocInMBB &Locs) const {
+ auto It = Locs.find(MBB);
+ assert(It != Locs.end() && "MBB not in map");
+ return *It->second.get();
+ }
+
/// Tests whether this instruction is a spill to a stack location.
bool isSpillInstruction(const MachineInstr &MI, MachineFunction *MF);
@@ -497,7 +736,7 @@ private:
/// TODO: Store optimization can fold spills into other stores (including
/// other spills). We do not handle this yet (more than one memory operand).
bool isLocationSpill(const MachineInstr &MI, MachineFunction *MF,
- unsigned &Reg);
+ Register &Reg);
/// Returns true if the given machine instruction is a debug value which we
/// can emit entry values for.
@@ -511,14 +750,14 @@ private:
/// and set \p Reg to the spilled register.
Optional<VarLoc::SpillLoc> isRestoreInstruction(const MachineInstr &MI,
MachineFunction *MF,
- unsigned &Reg);
+ Register &Reg);
/// Given a spill instruction, extract the register and offset used to
/// address the spill location in a target independent way.
VarLoc::SpillLoc extractSpillBaseRegAndOffset(const MachineInstr &MI);
void insertTransferDebugPair(MachineInstr &MI, OpenRangesSet &OpenRanges,
TransferMap &Transfers, VarLocMap &VarLocIDs,
- unsigned OldVarID, TransferKind Kind,
- unsigned NewReg = 0);
+ LocIndex OldVarID, TransferKind Kind,
+ Register NewReg = Register());
void transferDebugValue(const MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs);
@@ -528,7 +767,7 @@ private:
VarLocMap &VarLocIDs, const VarLoc &EntryVL);
void emitEntryValues(MachineInstr &MI, OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs, TransferMap &Transfers,
- SparseBitVector<> &KillSet);
+ VarLocSet &KillSet);
void recordEntryValue(const MachineInstr &MI,
const DefinedRegsSet &DefinedRegs,
OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs);
@@ -548,8 +787,7 @@ private:
bool join(MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
- SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks,
- VarLocInMBB &PendingInLocs);
+ SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks);
/// Create DBG_VALUE insts for inlocs that have been propagated but
/// had their instruction creation deferred.
@@ -617,8 +855,8 @@ void LiveDebugValues::OpenRangesSet::erase(const VarLoc &VL) {
auto *EraseFrom = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars;
auto It = EraseFrom->find(VarToErase);
if (It != EraseFrom->end()) {
- unsigned ID = It->second;
- VarLocs.reset(ID);
+ LocIndex ID = It->second;
+ VarLocs.reset(ID.getAsRawInteger());
EraseFrom->erase(It);
}
};
@@ -648,23 +886,23 @@ void LiveDebugValues::OpenRangesSet::erase(const VarLoc &VL) {
void LiveDebugValues::OpenRangesSet::erase(const VarLocSet &KillSet,
const VarLocMap &VarLocIDs) {
VarLocs.intersectWithComplement(KillSet);
- for (unsigned ID : KillSet) {
- const VarLoc *VL = &VarLocIDs[ID];
+ for (uint64_t ID : KillSet) {
+ const VarLoc *VL = &VarLocIDs[LocIndex::fromRawInteger(ID)];
auto *EraseFrom = VL->isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars;
EraseFrom->erase(VL->Var);
}
}
-void LiveDebugValues::OpenRangesSet::insert(unsigned VarLocID,
+void LiveDebugValues::OpenRangesSet::insert(LocIndex VarLocID,
const VarLoc &VL) {
auto *InsertInto = VL.isEntryBackupLoc() ? &EntryValuesBackupVars : &Vars;
- VarLocs.set(VarLocID);
+ VarLocs.set(VarLocID.getAsRawInteger());
InsertInto->insert({VL.Var, VarLocID});
}
/// Return the Loc ID of an entry value backup location, if it exists for the
/// variable.
-llvm::Optional<unsigned>
+llvm::Optional<LocIndex>
LiveDebugValues::OpenRangesSet::getEntryValueBackup(DebugVariable Var) {
auto It = EntryValuesBackupVars.find(Var);
if (It != EntryValuesBackupVars.end())
@@ -673,6 +911,57 @@ LiveDebugValues::OpenRangesSet::getEntryValueBackup(DebugVariable Var) {
return llvm::None;
}
+void LiveDebugValues::collectIDsForRegs(VarLocSet &Collected,
+ const DefinedRegsSet &Regs,
+ const VarLocSet &CollectFrom) const {
+ assert(!Regs.empty() && "Nothing to collect");
+ SmallVector<uint32_t, 32> SortedRegs;
+ for (Register Reg : Regs)
+ SortedRegs.push_back(Reg);
+ array_pod_sort(SortedRegs.begin(), SortedRegs.end());
+ auto It = CollectFrom.find(LocIndex::rawIndexForReg(SortedRegs.front()));
+ auto End = CollectFrom.end();
+ for (uint32_t Reg : SortedRegs) {
+ // The half-open interval [FirstIndexForReg, FirstInvalidIndex) contains all
+ // possible VarLoc IDs for VarLocs of kind RegisterKind which live in Reg.
+ uint64_t FirstIndexForReg = LocIndex::rawIndexForReg(Reg);
+ uint64_t FirstInvalidIndex = LocIndex::rawIndexForReg(Reg + 1);
+ It.advanceToLowerBound(FirstIndexForReg);
+
+ // Iterate through that half-open interval and collect all the set IDs.
+ for (; It != End && *It < FirstInvalidIndex; ++It)
+ Collected.set(*It);
+
+ if (It == End)
+ return;
+ }
+}
+
+void LiveDebugValues::getUsedRegs(const VarLocSet &CollectFrom,
+ SmallVectorImpl<uint32_t> &UsedRegs) const {
+ // All register-based VarLocs are assigned indices greater than or equal to
+ // FirstRegIndex.
+ uint64_t FirstRegIndex = LocIndex::rawIndexForReg(1);
+ uint64_t FirstInvalidIndex =
+ LocIndex::rawIndexForReg(LocIndex::kFirstInvalidRegLocation);
+ for (auto It = CollectFrom.find(FirstRegIndex),
+ End = CollectFrom.find(FirstInvalidIndex);
+ It != End;) {
+ // We found a VarLoc ID for a VarLoc that lives in a register. Figure out
+ // which register and add it to UsedRegs.
+ uint32_t FoundReg = LocIndex::fromRawInteger(*It).Location;
+ assert((UsedRegs.empty() || FoundReg != UsedRegs.back()) &&
+ "Duplicate used reg");
+ UsedRegs.push_back(FoundReg);
+
+ // Skip to the next /set/ register. Note that this finds a lower bound, so
+ // even if there aren't any VarLocs living in `FoundReg+1`, we're still
+ // guaranteed to move on to the next register (or to end()).
+ uint64_t NextRegIndex = LocIndex::rawIndexForReg(FoundReg + 1);
+ It.advanceToLowerBound(NextRegIndex);
+ }
+}
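
// A simplified sketch of the register-skipping loop above: std::set and
// lower_bound stand in for CoalescingBitVector and advanceToLowerBound, and
// the names are hypothetical. (The real code additionally stops at the first
// non-register location, kFirstInvalidRegLocation.)

#include <cassert>
#include <cstdint>
#include <set>
#include <vector>

// Register in the high 32 bits, per-register index in the low 32 bits.
static uint64_t rawIndexForReg(uint32_t Reg) {
  return static_cast<uint64_t>(Reg) << 32;
}

static std::vector<uint32_t> usedRegs(const std::set<uint64_t> &VarLocIDs) {
  std::vector<uint32_t> Regs;
  auto It = VarLocIDs.lower_bound(rawIndexForReg(1));
  while (It != VarLocIDs.end()) {
    uint32_t Reg = static_cast<uint32_t>(*It >> 32);
    Regs.push_back(Reg);
    // Jump straight past every other VarLoc in this register instead of
    // visiting each ID individually.
    It = VarLocIDs.lower_bound(rawIndexForReg(Reg + 1));
  }
  return Regs;
}

int main() {
  std::set<uint64_t> IDs = {rawIndexForReg(3) + 0, rawIndexForReg(3) + 9,
                            rawIndexForReg(7) + 1};
  assert((usedRegs(IDs) == std::vector<uint32_t>{3, 7}));
  return 0;
}
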
+
//===----------------------------------------------------------------------===//
// Debug Range Extension Implementation
//===----------------------------------------------------------------------===//
@@ -685,12 +974,14 @@ void LiveDebugValues::printVarLocInMBB(const MachineFunction &MF,
raw_ostream &Out) const {
Out << '\n' << msg << '\n';
for (const MachineBasicBlock &BB : MF) {
- const VarLocSet &L = V.lookup(&BB);
+ if (!V.count(&BB))
+ continue;
+ const VarLocSet &L = getVarLocsInMBB(&BB, V);
if (L.empty())
continue;
Out << "MBB: " << BB.getNumber() << ":\n";
- for (unsigned VLL : L) {
- const VarLoc &VL = VarLocIDs[VLL];
+ for (uint64_t VLL : L) {
+ const VarLoc &VL = VarLocIDs[LocIndex::fromRawInteger(VLL)];
Out << " Var: " << VL.Var.getVariable()->getName();
Out << " MI: ";
VL.dump(TRI, Out);
@@ -710,7 +1001,7 @@ LiveDebugValues::extractSpillBaseRegAndOffset(const MachineInstr &MI) {
"Inconsistent memory operand in spill instruction");
int FI = cast<FixedStackPseudoSourceValue>(PVal)->getFrameIndex();
const MachineBasicBlock *MBB = MI.getParent();
- unsigned Reg;
+ Register Reg;
int Offset = TFI->getFrameIndexReference(*MBB->getParent(), FI, Reg);
return {Reg, Offset};
}
@@ -730,7 +1021,7 @@ bool LiveDebugValues::removeEntryValue(const MachineInstr &MI,
// the entry value any more. In addition, if the debug expression from the
// DBG_VALUE is not empty, we can assume the parameter's value has changed
// indicating that we should stop tracking its entry value as well.
- if (!MI.getOperand(0).isReg() ||
+ if (!MI.getDebugOperand(0).isReg() ||
MI.getDebugExpression()->getNumElements() != 0)
return true;
@@ -738,7 +1029,7 @@ bool LiveDebugValues::removeEntryValue(const MachineInstr &MI,
// it means the parameter's value has not changed and we should be able to use
// its entry value.
bool TrySalvageEntryValue = false;
- Register Reg = MI.getOperand(0).getReg();
+ Register Reg = MI.getDebugOperand(0).getReg();
auto I = std::next(MI.getReverseIterator());
const MachineOperand *SrcRegOp, *DestRegOp;
if (I != MI.getParent()->rend()) {
@@ -757,13 +1048,10 @@ bool LiveDebugValues::removeEntryValue(const MachineInstr &MI,
}
if (TrySalvageEntryValue) {
- for (unsigned ID : OpenRanges.getVarLocs()) {
- const VarLoc &VL = VarLocIDs[ID];
- if (!VL.isEntryBackupLoc())
- continue;
-
+ for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
+ const VarLoc &VL = VarLocIDs[LocIndex::fromRawInteger(ID)];
if (VL.getEntryValueCopyBackupReg() == Reg &&
- VL.MI.getOperand(0).getReg() == SrcRegOp->getReg())
+ VL.MI.getDebugOperand(0).getReg() == SrcRegOp->getReg())
return false;
}
}
@@ -801,23 +1089,25 @@ void LiveDebugValues::transferDebugValue(const MachineInstr &MI,
}
}
- unsigned ID;
- if (isDbgValueDescribedByReg(MI) || MI.getOperand(0).isImm() ||
- MI.getOperand(0).isFPImm() || MI.getOperand(0).isCImm()) {
+ if (isDbgValueDescribedByReg(MI) || MI.getDebugOperand(0).isImm() ||
+ MI.getDebugOperand(0).isFPImm() || MI.getDebugOperand(0).isCImm()) {
// Use normal VarLoc constructor for registers and immediates.
VarLoc VL(MI, LS);
// End all previous ranges of VL.Var.
OpenRanges.erase(VL);
- ID = VarLocIDs.insert(VL);
+ LocIndex ID = VarLocIDs.insert(VL);
// Add the VarLoc to OpenRanges from this DBG_VALUE.
OpenRanges.insert(ID, VL);
} else if (MI.hasOneMemOperand()) {
llvm_unreachable("DBG_VALUE with mem operand encountered after regalloc?");
} else {
- // This must be an undefined location. We should leave OpenRanges closed.
- assert(MI.getOperand(0).isReg() && MI.getOperand(0).getReg() == 0 &&
+ // This must be an undefined location. If it has an open range, erase it.
+ assert(MI.getDebugOperand(0).isReg() &&
+ MI.getDebugOperand(0).getReg() == 0 &&
"Unexpected non-undef DBG_VALUE encountered");
+ VarLoc VL(MI, LS);
+ OpenRanges.erase(VL);
}
}
@@ -826,13 +1116,20 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI,
OpenRangesSet &OpenRanges,
VarLocMap &VarLocIDs,
TransferMap &Transfers,
- SparseBitVector<> &KillSet) {
- for (unsigned ID : KillSet) {
- if (!VarLocIDs[ID].Var.getVariable()->isParameter())
+ VarLocSet &KillSet) {
+ // Do not insert entry value locations after a terminator.
+ if (MI.isTerminator())
+ return;
+
+ for (uint64_t ID : KillSet) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ if (!VL.Var.getVariable()->isParameter())
continue;
- auto DebugVar = VarLocIDs[ID].Var;
- auto EntryValBackupID = OpenRanges.getEntryValueBackup(DebugVar);
+ auto DebugVar = VL.Var;
+ Optional<LocIndex> EntryValBackupID =
+ OpenRanges.getEntryValueBackup(DebugVar);
// If the parameter has the entry value backup, it means we should
// be able to use its entry value.
@@ -842,7 +1139,7 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI,
const VarLoc &EntryVL = VarLocIDs[*EntryValBackupID];
VarLoc EntryLoc =
VarLoc::CreateEntryLoc(EntryVL.MI, LS, EntryVL.Expr, EntryVL.Loc.RegNo);
- unsigned EntryValueID = VarLocIDs.insert(EntryLoc);
+ LocIndex EntryValueID = VarLocIDs.insert(EntryLoc);
Transfers.push_back({&MI, EntryValueID});
OpenRanges.insert(EntryValueID, EntryLoc);
}
@@ -855,12 +1152,12 @@ void LiveDebugValues::emitEntryValues(MachineInstr &MI,
/// otherwise it is variable's location on the stack.
void LiveDebugValues::insertTransferDebugPair(
MachineInstr &MI, OpenRangesSet &OpenRanges, TransferMap &Transfers,
- VarLocMap &VarLocIDs, unsigned OldVarID, TransferKind Kind,
- unsigned NewReg) {
+ VarLocMap &VarLocIDs, LocIndex OldVarID, TransferKind Kind,
+ Register NewReg) {
const MachineInstr *DebugInstr = &VarLocIDs[OldVarID].MI;
auto ProcessVarLoc = [&MI, &OpenRanges, &Transfers, &VarLocIDs](VarLoc &VL) {
- unsigned LocId = VarLocIDs.insert(VL);
+ LocIndex LocId = VarLocIDs.insert(VL);
// Close this variable's previous location range.
OpenRanges.erase(VL);
@@ -868,6 +1165,7 @@ void LiveDebugValues::insertTransferDebugPair(
// Record the new location as an open range, and a postponed transfer
// inserting a DBG_VALUE for this location.
OpenRanges.insert(LocId, VL);
+ assert(!MI.isTerminator() && "Cannot insert DBG_VALUE after terminator");
TransferDebugPair MIP = {&MI, LocId};
Transfers.push_back(MIP);
};
@@ -922,39 +1220,67 @@ void LiveDebugValues::insertTransferDebugPair(
void LiveDebugValues::transferRegisterDef(
MachineInstr &MI, OpenRangesSet &OpenRanges, VarLocMap &VarLocIDs,
TransferMap &Transfers) {
+
+ // Meta Instructions do not affect the debug liveness of any register they
+ // define.
+ if (MI.isMetaInstruction())
+ return;
+
MachineFunction *MF = MI.getMF();
const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
- unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
- SparseBitVector<> KillSet;
+ Register SP = TLI->getStackPointerRegisterToSaveRestore();
+
+ // Find the regs killed by MI, and find regmasks of preserved regs.
+ DefinedRegsSet DeadRegs;
+ SmallVector<const uint32_t *, 4> RegMasks;
for (const MachineOperand &MO : MI.operands()) {
- // Determine whether the operand is a register def. Assume that call
- // instructions never clobber SP, because some backends (e.g., AArch64)
- // never list SP in the regmask.
+ // Determine whether the operand is a register def.
if (MO.isReg() && MO.isDef() && MO.getReg() &&
Register::isPhysicalRegister(MO.getReg()) &&
!(MI.isCall() && MO.getReg() == SP)) {
// Remove ranges of all aliased registers.
for (MCRegAliasIterator RAI(MO.getReg(), TRI, true); RAI.isValid(); ++RAI)
- for (unsigned ID : OpenRanges.getVarLocs())
- if (VarLocIDs[ID].isDescribedByReg() == *RAI)
- KillSet.set(ID);
+ // FIXME: Can we break out of this loop early if no insertion occurs?
+ DeadRegs.insert(*RAI);
} else if (MO.isRegMask()) {
+ RegMasks.push_back(MO.getRegMask());
+ }
+ }
+
+ // Erase VarLocs which reside in one of the dead registers. For performance
+ // reasons, it's critical to not iterate over the full set of open VarLocs.
+ // Iterate over the set of dying/used regs instead.
+ if (!RegMasks.empty()) {
+ SmallVector<uint32_t, 32> UsedRegs;
+ getUsedRegs(OpenRanges.getVarLocs(), UsedRegs);
+ for (uint32_t Reg : UsedRegs) {
// Remove ranges of all clobbered registers. Register masks don't usually
- // list SP as preserved. While the debug info may be off for an
- // instruction or two around callee-cleanup calls, transferring the
- // DEBUG_VALUE across the call is still a better user experience.
- for (unsigned ID : OpenRanges.getVarLocs()) {
- unsigned Reg = VarLocIDs[ID].isDescribedByReg();
- if (Reg && Reg != SP && MO.clobbersPhysReg(Reg))
- KillSet.set(ID);
- }
+ // list SP as preserved. Assume that call instructions never clobber SP,
+ // because some backends (e.g., AArch64) never list SP in the regmask.
+ // While the debug info may be off for an instruction or two around
+ // callee-cleanup calls, transferring the DEBUG_VALUE across the call is
+ // still a better user experience.
+ if (Reg == SP)
+ continue;
+ bool AnyRegMaskKillsReg =
+ any_of(RegMasks, [Reg](const uint32_t *RegMask) {
+ return MachineOperand::clobbersPhysReg(RegMask, Reg);
+ });
+ if (AnyRegMaskKillsReg)
+ DeadRegs.insert(Reg);
}
}
+
+ if (DeadRegs.empty())
+ return;
+
+ VarLocSet KillSet(Alloc);
+ collectIDsForRegs(KillSet, DeadRegs, OpenRanges.getVarLocs());
OpenRanges.erase(KillSet, VarLocIDs);
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
auto &TM = TPC->getTM<TargetMachine>();
- if (TM.Options.EnableDebugEntryValues)
+ if (TM.Options.ShouldEmitDebugEntryValues())
emitEntryValues(MI, OpenRanges, VarLocIDs, Transfers, KillSet);
}
}
@@ -973,11 +1299,11 @@ bool LiveDebugValues::isSpillInstruction(const MachineInstr &MI,
}
bool LiveDebugValues::isLocationSpill(const MachineInstr &MI,
- MachineFunction *MF, unsigned &Reg) {
+ MachineFunction *MF, Register &Reg) {
if (!isSpillInstruction(MI, MF))
return false;
- auto isKilledReg = [&](const MachineOperand MO, unsigned &Reg) {
+ auto isKilledReg = [&](const MachineOperand MO, Register &Reg) {
if (!MO.isReg() || !MO.isUse()) {
Reg = 0;
return false;
@@ -999,7 +1325,7 @@ bool LiveDebugValues::isLocationSpill(const MachineInstr &MI,
// Skip next instruction that points to basic block end iterator.
if (MI.getParent()->end() == NextI)
continue;
- unsigned RegNext;
+ Register RegNext;
for (const MachineOperand &MONext : NextI->operands()) {
// Return true if we came across the register from the
// previous spill instruction that is killed in NextI.
@@ -1014,7 +1340,7 @@ bool LiveDebugValues::isLocationSpill(const MachineInstr &MI,
Optional<LiveDebugValues::VarLoc::SpillLoc>
LiveDebugValues::isRestoreInstruction(const MachineInstr &MI,
- MachineFunction *MF, unsigned &Reg) {
+ MachineFunction *MF, Register &Reg) {
if (!MI.hasOneMemOperand())
return None;
@@ -1040,7 +1366,7 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
TransferMap &Transfers) {
MachineFunction *MF = MI.getMF();
TransferKind TKind;
- unsigned Reg;
+ Register Reg;
Optional<VarLoc::SpillLoc> Loc;
LLVM_DEBUG(dbgs() << "Examining instruction: "; MI.dump(););
@@ -1048,12 +1374,14 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
// First, if there are any DBG_VALUEs pointing at a spill slot that is
// written to, then close the variable location. The value in memory
// will have changed.
- VarLocSet KillSet;
+ VarLocSet KillSet(Alloc);
if (isSpillInstruction(MI, MF)) {
Loc = extractSpillBaseRegAndOffset(MI);
- for (unsigned ID : OpenRanges.getVarLocs()) {
- const VarLoc &VL = VarLocIDs[ID];
- if (VL.Kind == VarLoc::SpillLocKind && VL.Loc.SpillLocation == *Loc) {
+ for (uint64_t ID : OpenRanges.getSpillVarLocs()) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ assert(VL.Kind == VarLoc::SpillLocKind && "Broken VarLocSet?");
+ if (VL.Loc.SpillLocation == *Loc) {
// This location is overwritten by the current instruction -- terminate
// the open range, and insert an explicit DBG_VALUE $noreg.
//
@@ -1066,7 +1394,7 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
// where they are located; it's best to fix handle overwrites now.
KillSet.set(ID);
VarLoc UndefVL = VarLoc::CreateCopyLoc(VL.MI, LS, 0);
- unsigned UndefLocID = VarLocIDs.insert(UndefVL);
+ LocIndex UndefLocID = VarLocIDs.insert(UndefVL);
Transfers.push_back({&MI, UndefLocID});
}
}
@@ -1089,20 +1417,31 @@ void LiveDebugValues::transferSpillOrRestoreInst(MachineInstr &MI,
<< "\n");
}
// Check if the register or spill location is the location of a debug value.
- for (unsigned ID : OpenRanges.getVarLocs()) {
- if (TKind == TransferKind::TransferSpill &&
- VarLocIDs[ID].isDescribedByReg() == Reg) {
+ auto TransferCandidates = OpenRanges.getEmptyVarLocRange();
+ if (TKind == TransferKind::TransferSpill)
+ TransferCandidates = OpenRanges.getRegisterVarLocs(Reg);
+ else if (TKind == TransferKind::TransferRestore)
+ TransferCandidates = OpenRanges.getSpillVarLocs();
+ for (uint64_t ID : TransferCandidates) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ if (TKind == TransferKind::TransferSpill) {
+ assert(VL.isDescribedByReg() == Reg && "Broken VarLocSet?");
LLVM_DEBUG(dbgs() << "Spilling Register " << printReg(Reg, TRI) << '('
- << VarLocIDs[ID].Var.getVariable()->getName() << ")\n");
- } else if (TKind == TransferKind::TransferRestore &&
- VarLocIDs[ID].Kind == VarLoc::SpillLocKind &&
- VarLocIDs[ID].Loc.SpillLocation == *Loc) {
+ << VL.Var.getVariable()->getName() << ")\n");
+ } else {
+ assert(TKind == TransferKind::TransferRestore &&
+ VL.Kind == VarLoc::SpillLocKind && "Broken VarLocSet?");
+ if (VL.Loc.SpillLocation != *Loc)
+ // The spill location is not the location of a debug value.
+ continue;
LLVM_DEBUG(dbgs() << "Restoring Register " << printReg(Reg, TRI) << '('
- << VarLocIDs[ID].Var.getVariable()->getName() << ")\n");
- } else
- continue;
- insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID, TKind,
+ << VL.Var.getVariable()->getName() << ")\n");
+ }
+ insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx, TKind,
Reg);
+ // FIXME: A comment should explain why it's correct to return early here,
+ // if that is in fact correct.
return;
}
}
@@ -1124,7 +1463,7 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
if (!DestRegOp->isDef())
return;
- auto isCalleeSavedReg = [&](unsigned Reg) {
+ auto isCalleeSavedReg = [&](Register Reg) {
for (MCRegAliasIterator RAI(Reg, TRI, true); RAI.isValid(); ++RAI)
if (CalleeSavedRegs.test(*RAI))
return true;
@@ -1146,17 +1485,19 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
  // a parameter describing only a moving of the value around, rather than
// modifying it, we are still able to use the entry value if needed.
if (isRegOtherThanSPAndFP(*DestRegOp, MI, TRI)) {
- for (unsigned ID : OpenRanges.getVarLocs()) {
- if (VarLocIDs[ID].getEntryValueBackupReg() == SrcReg) {
+ for (uint64_t ID : OpenRanges.getEntryValueBackupVarLocs()) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ const VarLoc &VL = VarLocIDs[Idx];
+ if (VL.getEntryValueBackupReg() == SrcReg) {
LLVM_DEBUG(dbgs() << "Copy of the entry value: "; MI.dump(););
- VarLoc EntryValLocCopyBackup = VarLoc::CreateEntryCopyBackupLoc(
- VarLocIDs[ID].MI, LS, VarLocIDs[ID].Expr, DestReg);
+ VarLoc EntryValLocCopyBackup =
+ VarLoc::CreateEntryCopyBackupLoc(VL.MI, LS, VL.Expr, DestReg);
// Stop tracking the original entry value.
- OpenRanges.erase(VarLocIDs[ID]);
+ OpenRanges.erase(VL);
// Start tracking the entry value copy.
- unsigned EntryValCopyLocID = VarLocIDs.insert(EntryValLocCopyBackup);
+ LocIndex EntryValCopyLocID = VarLocIDs.insert(EntryValLocCopyBackup);
OpenRanges.insert(EntryValCopyLocID, EntryValLocCopyBackup);
break;
}
@@ -1166,12 +1507,14 @@ void LiveDebugValues::transferRegisterCopy(MachineInstr &MI,
if (!SrcRegOp->isKill())
return;
- for (unsigned ID : OpenRanges.getVarLocs()) {
- if (VarLocIDs[ID].isDescribedByReg() == SrcReg) {
- insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, ID,
- TransferKind::TransferCopy, DestReg);
- return;
- }
+ for (uint64_t ID : OpenRanges.getRegisterVarLocs(SrcReg)) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ assert(VarLocIDs[Idx].isDescribedByReg() == SrcReg && "Broken VarLocSet?");
+ insertTransferDebugPair(MI, OpenRanges, Transfers, VarLocIDs, Idx,
+ TransferKind::TransferCopy, DestReg);
+ // FIXME: A comment should explain why it's correct to return early here,
+ // if that is in fact correct.
+ return;
}
}
@@ -1182,13 +1525,13 @@ bool LiveDebugValues::transferTerminator(MachineBasicBlock *CurMBB,
const VarLocMap &VarLocIDs) {
bool Changed = false;
- LLVM_DEBUG(for (unsigned ID
+ LLVM_DEBUG(for (uint64_t ID
: OpenRanges.getVarLocs()) {
// Copy OpenRanges to OutLocs, if not already present.
dbgs() << "Add to OutLocs in MBB #" << CurMBB->getNumber() << ": ";
- VarLocIDs[ID].dump(TRI);
+ VarLocIDs[LocIndex::fromRawInteger(ID)].dump(TRI);
});
- VarLocSet &VLS = OutLocs[CurMBB];
+ VarLocSet &VLS = getVarLocsInMBB(CurMBB, OutLocs);
Changed = VLS != OpenRanges.getVarLocs();
// New OutLocs set may be different due to spill, restore or register
// copy instruction processing.
@@ -1275,12 +1618,10 @@ bool LiveDebugValues::join(
MachineBasicBlock &MBB, VarLocInMBB &OutLocs, VarLocInMBB &InLocs,
const VarLocMap &VarLocIDs,
SmallPtrSet<const MachineBasicBlock *, 16> &Visited,
- SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks,
- VarLocInMBB &PendingInLocs) {
+ SmallPtrSetImpl<const MachineBasicBlock *> &ArtificialBlocks) {
LLVM_DEBUG(dbgs() << "join MBB: " << MBB.getNumber() << "\n");
- bool Changed = false;
- VarLocSet InLocsT; // Temporary incoming locations.
+ VarLocSet InLocsT(Alloc); // Temporary incoming locations.
// For all predecessors of this MBB, find the set of VarLocs that
// can be joined.
@@ -1303,16 +1644,20 @@ bool LiveDebugValues::join(
// Just copy over the Out locs to incoming locs for the first visited
// predecessor, and for all other predecessors join the Out locs.
+ VarLocSet &OutLocVLS = *OL->second.get();
if (!NumVisited)
- InLocsT = OL->second;
+ InLocsT = OutLocVLS;
else
- InLocsT &= OL->second;
+ InLocsT &= OutLocVLS;
LLVM_DEBUG({
if (!InLocsT.empty()) {
- for (auto ID : InLocsT)
+ for (uint64_t ID : InLocsT)
dbgs() << " gathered candidate incoming var: "
- << VarLocIDs[ID].Var.getVariable()->getName() << "\n";
+ << VarLocIDs[LocIndex::fromRawInteger(ID)]
+ .Var.getVariable()
+ ->getName()
+ << "\n";
}
});
@@ -1320,14 +1665,15 @@ bool LiveDebugValues::join(
}
// Filter out DBG_VALUES that are out of scope.
- VarLocSet KillSet;
+ VarLocSet KillSet(Alloc);
bool IsArtificial = ArtificialBlocks.count(&MBB);
if (!IsArtificial) {
- for (auto ID : InLocsT) {
- if (!VarLocIDs[ID].dominates(MBB)) {
+ for (uint64_t ID : InLocsT) {
+ LocIndex Idx = LocIndex::fromRawInteger(ID);
+ if (!VarLocIDs[Idx].dominates(LS, MBB)) {
KillSet.set(ID);
LLVM_DEBUG({
- auto Name = VarLocIDs[ID].Var.getVariable()->getName();
+ auto Name = VarLocIDs[Idx].Var.getVariable()->getName();
dbgs() << " killing " << Name << ", it doesn't dominate MBB\n";
});
}
@@ -1341,30 +1687,10 @@ bool LiveDebugValues::join(
assert((NumVisited || MBB.pred_empty()) &&
"Should have processed at least one predecessor");
- VarLocSet &ILS = InLocs[&MBB];
- VarLocSet &Pending = PendingInLocs[&MBB];
-
- // New locations will have DBG_VALUE insts inserted at the start of the
- // block, after location propagation has finished. Record the insertions
- // that we need to perform in the Pending set.
- VarLocSet Diff = InLocsT;
- Diff.intersectWithComplement(ILS);
- for (auto ID : Diff) {
- Pending.set(ID);
- ILS.set(ID);
- ++NumInserted;
- Changed = true;
- }
-
- // We may have lost locations by learning about a predecessor that either
- // loses or moves a variable. Find any locations in ILS that are not in the
- // new in-locations, and delete those.
- VarLocSet Removed = ILS;
- Removed.intersectWithComplement(InLocsT);
- for (auto ID : Removed) {
- Pending.reset(ID);
- ILS.reset(ID);
- ++NumRemoved;
+ VarLocSet &ILS = getVarLocsInMBB(&MBB, InLocs);
+ bool Changed = false;
+ if (ILS != InLocsT) {
+ ILS = InLocsT;
Changed = true;
}
@@ -1378,12 +1704,12 @@ void LiveDebugValues::flushPendingLocs(VarLocInMBB &PendingInLocs,
for (auto &Iter : PendingInLocs) {
// Map is keyed on a constant pointer, unwrap it so we can insert insts.
auto &MBB = const_cast<MachineBasicBlock &>(*Iter.first);
- VarLocSet &Pending = Iter.second;
+ VarLocSet &Pending = *Iter.second.get();
- for (unsigned ID : Pending) {
+ for (uint64_t ID : Pending) {
// The ID location is live-in to MBB -- work out what kind of machine
// location it is and create a DBG_VALUE.
- const VarLoc &DiffIt = VarLocIDs[ID];
+ const VarLoc &DiffIt = VarLocIDs[LocIndex::fromRawInteger(ID)];
if (DiffIt.isEntryBackupLoc())
continue;
MachineInstr *MI = DiffIt.BuildDbgValue(*MBB.getParent());
@@ -1411,25 +1737,21 @@ bool LiveDebugValues::isEntryValueCandidate(
if (MI.getDebugLoc()->getInlinedAt())
return false;
- // Do not consider indirect debug values (TODO: explain why).
- if (MI.isIndirectDebugValue())
- return false;
-
// Only consider parameters that are described using registers. Parameters
// that are passed on the stack are not yet supported, so ignore debug
// values that are described by the frame or stack pointer.
- if (!isRegOtherThanSPAndFP(MI.getOperand(0), MI, TRI))
+ if (!isRegOtherThanSPAndFP(MI.getDebugOperand(0), MI, TRI))
return false;
// If a parameter's value has been propagated from the caller, then the
// parameter's DBG_VALUE may be described using a register defined by some
// instruction in the entry block, in which case we shouldn't create an
// entry value.
- if (DefinedRegs.count(MI.getOperand(0).getReg()))
+ if (DefinedRegs.count(MI.getDebugOperand(0).getReg()))
return false;
  // TODO: Add support for parameters that have pre-existing debug expressions
- // (e.g. fragments, or indirect parameters using DW_OP_deref).
+ // (e.g. fragments).
if (MI.getDebugExpression()->getNumElements() > 0)
return false;
@@ -1454,7 +1776,7 @@ void LiveDebugValues::recordEntryValue(const MachineInstr &MI,
VarLocMap &VarLocIDs) {
if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>()) {
auto &TM = TPC->getTM<TargetMachine>();
- if (!TM.Options.EnableDebugEntryValues)
+ if (!TM.Options.ShouldEmitDebugEntryValues())
return;
}
@@ -1472,7 +1794,7 @@ void LiveDebugValues::recordEntryValue(const MachineInstr &MI,
DIExpression *NewExpr =
DIExpression::prepend(MI.getDebugExpression(), DIExpression::EntryValue);
VarLoc EntryValLocAsBackup = VarLoc::CreateEntryBackupLoc(MI, LS, NewExpr);
- unsigned EntryValLocID = VarLocIDs.insert(EntryValLocAsBackup);
+ LocIndex EntryValLocID = VarLocIDs.insert(EntryValLocAsBackup);
OpenRanges.insert(EntryValLocID, EntryValLocAsBackup);
}
@@ -1487,15 +1809,12 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
VarLocMap VarLocIDs; // Map VarLoc<>unique ID for use in bitvectors.
OverlapMap OverlapFragments; // Map of overlapping variable fragments.
- OpenRangesSet OpenRanges(OverlapFragments);
+ OpenRangesSet OpenRanges(Alloc, OverlapFragments);
// Ranges that are open until end of bb.
VarLocInMBB OutLocs; // Ranges that exist beyond bb.
VarLocInMBB InLocs; // Ranges that are incoming after joining.
TransferMap Transfers; // DBG_VALUEs associated with transfers (such as
// spills, copies and restores).
- VarLocInMBB PendingInLocs; // Ranges that are incoming after joining, but
- // that we have deferred creating DBG_VALUE insts
- // for immediately.
VarToFragments SeenFragments;
@@ -1526,14 +1845,10 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
}
// Initialize per-block structures and scan for fragment overlaps.
- for (auto &MBB : MF) {
- PendingInLocs[&MBB] = VarLocSet();
-
- for (auto &MI : MBB) {
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
if (MI.isDebugValue())
accumulateFragmentMap(MI, SeenFragments, OverlapFragments);
- }
- }
auto hasNonArtificialLocation = [](const MachineInstr &MI) -> bool {
if (const DebugLoc &DL = MI.getDebugLoc())
@@ -1555,6 +1870,22 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
Worklist.push(RPONumber);
++RPONumber;
}
+
+ if (RPONumber > InputBBLimit) {
+ unsigned NumInputDbgValues = 0;
+ for (auto &MBB : MF)
+ for (auto &MI : MBB)
+ if (MI.isDebugValue())
+ ++NumInputDbgValues;
+ if (NumInputDbgValues > InputDbgValueLimit) {
+ LLVM_DEBUG(dbgs() << "Disabling LiveDebugValues: " << MF.getName()
+ << " has " << RPONumber << " basic blocks and "
+ << NumInputDbgValues
+ << " input DBG_VALUEs, exceeding limits.\n");
+ return false;
+ }
+ }
+
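
The InputBBLimit and InputDbgValueLimit thresholds used in this early exit are cl::opt command-line options declared near the top of the file, outside the hunks shown here. A hedged sketch of what such declarations typically look like; the option strings and default values below are assumptions, not the actual ones:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

// Assumed names and defaults, for illustration only.
static cl::opt<unsigned> InputBBLimit(
    "livedebugvalues-input-bb-limit",
    cl::desc("Maximum input basic blocks before DBG_VALUE limit applies"),
    cl::init(10000), cl::Hidden);
static cl::opt<unsigned> InputDbgValueLimit(
    "livedebugvalues-input-dbg-value-limit",
    cl::desc("Maximum input DBG_VALUEs supported by debug range extension"),
    cl::init(50000), cl::Hidden);
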
// This is a standard "union of predecessor outs" dataflow problem.
// To solve it, we perform join() and process() using the two worklist method
// until the ranges converge.
@@ -1570,7 +1901,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
MachineBasicBlock *MBB = OrderToBB[Worklist.top()];
Worklist.pop();
MBBJoined = join(*MBB, OutLocs, InLocs, VarLocIDs, Visited,
- ArtificialBlocks, PendingInLocs);
+ ArtificialBlocks);
MBBJoined |= Visited.insert(MBB).second;
if (MBBJoined) {
MBBJoined = false;
@@ -1579,7 +1910,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
// examine spill, copy and restore instructions to see whether they
// operate with registers that correspond to user variables.
// First load any pending inlocs.
- OpenRanges.insertFromLocSet(PendingInLocs[MBB], VarLocIDs);
+ OpenRanges.insertFromLocSet(getVarLocsInMBB(MBB, InLocs), VarLocIDs);
for (auto &MI : *MBB)
process(MI, OpenRanges, VarLocIDs, Transfers);
OLChanged |= transferTerminator(MBB, OpenRanges, OutLocs, VarLocIDs);
@@ -1606,6 +1937,8 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
// Add any DBG_VALUE instructions created by location transfers.
for (auto &TR : Transfers) {
+ assert(!TR.TransferInst->isTerminator() &&
+ "Cannot insert DBG_VALUE after terminator");
MachineBasicBlock *MBB = TR.TransferInst->getParent();
const VarLoc &VL = VarLocIDs[TR.LocationID];
MachineInstr *MI = VL.BuildDbgValue(MF);
@@ -1615,7 +1948,7 @@ bool LiveDebugValues::ExtendRanges(MachineFunction &MF) {
// Deferred inlocs will not have had any DBG_VALUE insts created; do
// that now.
- flushPendingLocs(PendingInLocs, VarLocIDs);
+ flushPendingLocs(InLocs, VarLocIDs);
LLVM_DEBUG(printVarLocInMBB(MF, OutLocs, VarLocIDs, "Final OutLocs", dbgs()));
LLVM_DEBUG(printVarLocInMBB(MF, InLocs, VarLocIDs, "Final InLocs", dbgs()));
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.cpp b/llvm/lib/CodeGen/LiveDebugVariables.cpp
index 2cc547a6b741..158e873370b1 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.cpp
+++ b/llvm/lib/CodeGen/LiveDebugVariables.cpp
@@ -96,44 +96,49 @@ LiveDebugVariables::LiveDebugVariables() : MachineFunctionPass(ID) {
enum : unsigned { UndefLocNo = ~0U };
-/// Describes a location by number along with some flags about the original
-/// usage of the location.
-class DbgValueLocation {
+/// Describes a debug variable value by location number and expression along
+/// with some flags about the original usage of the location.
+class DbgVariableValue {
public:
- DbgValueLocation(unsigned LocNo)
- : LocNo(LocNo) {
- static_assert(sizeof(*this) == sizeof(unsigned), "bad bitfield packing");
- assert(locNo() == LocNo && "location truncation");
+ DbgVariableValue(unsigned LocNo, bool WasIndirect,
+ const DIExpression &Expression)
+ : LocNo(LocNo), WasIndirect(WasIndirect), Expression(&Expression) {
+ assert(getLocNo() == LocNo && "location truncation");
}
- DbgValueLocation() : LocNo(0) {}
+ DbgVariableValue() : LocNo(0), WasIndirect(0) {}
- unsigned locNo() const {
+ const DIExpression *getExpression() const { return Expression; }
+ unsigned getLocNo() const {
// Fix up the undef location number, which gets truncated.
return LocNo == INT_MAX ? UndefLocNo : LocNo;
}
- bool isUndef() const { return locNo() == UndefLocNo; }
+ bool getWasIndirect() const { return WasIndirect; }
+ bool isUndef() const { return getLocNo() == UndefLocNo; }
- DbgValueLocation changeLocNo(unsigned NewLocNo) const {
- return DbgValueLocation(NewLocNo);
+ DbgVariableValue changeLocNo(unsigned NewLocNo) const {
+ return DbgVariableValue(NewLocNo, WasIndirect, *Expression);
}
- friend inline bool operator==(const DbgValueLocation &LHS,
- const DbgValueLocation &RHS) {
- return LHS.LocNo == RHS.LocNo;
+ friend inline bool operator==(const DbgVariableValue &LHS,
+ const DbgVariableValue &RHS) {
+ return LHS.LocNo == RHS.LocNo && LHS.WasIndirect == RHS.WasIndirect &&
+ LHS.Expression == RHS.Expression;
}
- friend inline bool operator!=(const DbgValueLocation &LHS,
- const DbgValueLocation &RHS) {
+ friend inline bool operator!=(const DbgVariableValue &LHS,
+ const DbgVariableValue &RHS) {
return !(LHS == RHS);
}
private:
- unsigned LocNo;
+ unsigned LocNo : 31;
+ unsigned WasIndirect : 1;
+ const DIExpression *Expression = nullptr;
};
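
A small standalone illustration of the bitfield packing above: with LocNo held in 31 bits, the UndefLocNo sentinel (~0U) is truncated to 0x7fffffff (INT_MAX) on assignment, which is why getLocNo() maps INT_MAX back to UndefLocNo. Sketch only, independent of the LLVM types:

#include <cassert>
#include <climits>

constexpr unsigned UndefLocNoSketch = ~0U;

struct PackedValueSketch {
  unsigned LocNo : 31;
  unsigned WasIndirect : 1;
  unsigned getLocNo() const {
    // Undo the truncation of the undef sentinel.
    return LocNo == INT_MAX ? UndefLocNoSketch : LocNo;
  }
};

int main() {
  PackedValueSketch V;
  V.WasIndirect = 0;
  V.LocNo = UndefLocNoSketch; // Stores only the low 31 bits: 0x7fffffff.
  assert(V.LocNo == INT_MAX);
  assert(V.getLocNo() == UndefLocNoSketch);
  return 0;
}
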
-/// Map of where a user value is live, and its location.
-using LocMap = IntervalMap<SlotIndex, DbgValueLocation, 4>;
+/// Map of where a user value is live to that value.
+using LocMap = IntervalMap<SlotIndex, DbgVariableValue, 4>;
/// Map of stack slot offsets for spilled locations.
/// Non-spilled locations are not added to the map.
@@ -149,12 +154,12 @@ class LDVImpl;
/// holds part of a user variable. The part is identified by a byte offset.
///
/// UserValues are grouped into equivalence classes for easier searching. Two
-/// user values are related if they refer to the same variable, or if they are
-/// held by the same virtual register. The equivalence class is the transitive
-/// closure of that relation.
+/// user values are related if they are held by the same virtual register. The
+/// equivalence class is the transitive closure of that relation.
class UserValue {
const DILocalVariable *Variable; ///< The debug info variable we are part of.
- const DIExpression *Expression; ///< Any complex address expression.
+ /// The part of the variable we describe.
+ const Optional<DIExpression::FragmentInfo> Fragment;
DebugLoc dl; ///< The debug location for the variable. This is
///< used by dwarf writer to find lexical scope.
UserValue *leader; ///< Equivalence class leader.
@@ -166,23 +171,28 @@ class UserValue {
/// Map of slot indices where this value is live.
LocMap locInts;
- /// Insert a DBG_VALUE into MBB at Idx for LocNo.
+ /// Set of interval start indexes that have been trimmed to the
+ /// lexical scope.
+ SmallSet<SlotIndex, 2> trimmedDefs;
+
+ /// Insert a DBG_VALUE into MBB at Idx for DbgValue.
void insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
- SlotIndex StopIdx, DbgValueLocation Loc, bool Spilled,
- unsigned SpillOffset, LiveIntervals &LIS,
+ SlotIndex StopIdx, DbgVariableValue DbgValue,
+ bool Spilled, unsigned SpillOffset, LiveIntervals &LIS,
const TargetInstrInfo &TII,
const TargetRegisterInfo &TRI);
/// Replace OldLocNo ranges with NewRegs ranges where NewRegs
/// is live. Returns true if any changes were made.
- bool splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+ bool splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs,
LiveIntervals &LIS);
public:
/// Create a new UserValue.
- UserValue(const DILocalVariable *var, const DIExpression *expr, DebugLoc L,
+ UserValue(const DILocalVariable *var,
+ Optional<DIExpression::FragmentInfo> Fragment, DebugLoc L,
LocMap::Allocator &alloc)
- : Variable(var), Expression(expr), dl(std::move(L)), leader(this),
+ : Variable(var), Fragment(Fragment), dl(std::move(L)), leader(this),
locInts(alloc) {}
/// Get the leader of this value's equivalence class.
@@ -196,14 +206,6 @@ public:
/// Return the next UserValue in the equivalence class.
UserValue *getNext() const { return next; }
- /// Does this UserValue match the parameters?
- bool match(const DILocalVariable *Var, const DIExpression *Expr,
- const DILocation *IA) const {
- // FIXME: The fragment should be part of the equivalence class, but not
- // other things in the expression like stack values.
- return Var == Variable && Expr == Expression && dl->getInlinedAt() == IA;
- }
-
/// Merge equivalence classes.
static UserValue *merge(UserValue *L1, UserValue *L2) {
L2 = L2->getLeader();
@@ -261,33 +263,34 @@ public:
void removeLocationIfUnused(unsigned LocNo) {
// Bail out if LocNo still is used.
for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I) {
- DbgValueLocation Loc = I.value();
- if (Loc.locNo() == LocNo)
+ DbgVariableValue DbgValue = I.value();
+ if (DbgValue.getLocNo() == LocNo)
return;
}
// Remove the entry in the locations vector, and adjust all references to
// location numbers above the removed entry.
locations.erase(locations.begin() + LocNo);
for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
- DbgValueLocation Loc = I.value();
- if (!Loc.isUndef() && Loc.locNo() > LocNo)
- I.setValueUnchecked(Loc.changeLocNo(Loc.locNo() - 1));
+ DbgVariableValue DbgValue = I.value();
+ if (!DbgValue.isUndef() && DbgValue.getLocNo() > LocNo)
+ I.setValueUnchecked(DbgValue.changeLocNo(DbgValue.getLocNo() - 1));
}
}
/// Ensure that all virtual register locations are mapped.
void mapVirtRegs(LDVImpl *LDV);
- /// Add a definition point to this value.
- void addDef(SlotIndex Idx, const MachineOperand &LocMO) {
- DbgValueLocation Loc(getLocationNo(LocMO));
- // Add a singular (Idx,Idx) -> Loc mapping.
+ /// Add a definition point to this user value.
+ void addDef(SlotIndex Idx, const MachineOperand &LocMO, bool IsIndirect,
+ const DIExpression &Expr) {
+ DbgVariableValue DbgValue(getLocationNo(LocMO), IsIndirect, Expr);
+ // Add a singular (Idx,Idx) -> value mapping.
LocMap::iterator I = locInts.find(Idx);
if (!I.valid() || I.start() != Idx)
- I.insert(Idx, Idx.getNextSlot(), Loc);
+ I.insert(Idx, Idx.getNextSlot(), DbgValue);
else
// A later DBG_VALUE at the same SlotIndex overrides the old location.
- I.setValue(Loc);
+ I.setValue(DbgValue);
}
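
The insert-or-override pattern in addDef() is plain IntervalMap usage: find() positions an iterator at the first interval ending at or after the key, a fresh def gets a one-slot placeholder interval, and a later def at the same start index simply overwrites the mapped value. A toy sketch with integer keys, assuming nothing about the pass itself:

#include "llvm/ADT/IntervalMap.h"
using namespace llvm;

using ToyLocMap = IntervalMap<unsigned, unsigned, 4>;

static void addDefLike(ToyLocMap &Map, unsigned Idx, unsigned Value) {
  ToyLocMap::iterator I = Map.find(Idx);
  if (!I.valid() || I.start() != Idx)
    I.insert(Idx, Idx + 1, Value); // Singular placeholder from Idx to Idx + 1.
  else
    I.setValue(Value);             // Later def at the same index overrides.
}

// Usage:
//   ToyLocMap::Allocator Alloc;
//   ToyLocMap Map(Alloc);
//   addDefLike(Map, 5, 1);
//   addDefLike(Map, 5, 2); // The mapping starting at 5 now carries value 2.
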
/// Extend the current definition as far as possible down.
@@ -299,28 +302,27 @@ public:
/// data-flow analysis to propagate them beyond basic block boundaries.
///
/// \param Idx Starting point for the definition.
- /// \param Loc Location number to propagate.
+  /// \param DbgValue The value to propagate.
/// \param LR Restrict liveness to where LR has the value VNI. May be null.
/// \param VNI When LR is not null, this is the value to restrict to.
/// \param [out] Kills Append end points of VNI's live range to Kills.
/// \param LIS Live intervals analysis.
- void extendDef(SlotIndex Idx, DbgValueLocation Loc,
- LiveRange *LR, const VNInfo *VNI,
- SmallVectorImpl<SlotIndex> *Kills,
+ void extendDef(SlotIndex Idx, DbgVariableValue DbgValue, LiveRange *LR,
+ const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills,
LiveIntervals &LIS);
- /// The value in LI/LocNo may be copies to other registers. Determine if
+  /// The value in LI may be copied to other registers. Determine if
/// any of the copies are available at the kill points, and add defs if
/// possible.
///
/// \param LI Scan for copies of the value in LI->reg.
- /// \param LocNo Location number of LI->reg.
- /// \param Kills Points where the range of LocNo could be extended.
- /// \param [in,out] NewDefs Append (Idx, LocNo) of inserted defs here.
+ /// \param DbgValue Location number of LI->reg, and DIExpression.
+ /// \param Kills Points where the range of DbgValue could be extended.
+ /// \param [in,out] NewDefs Append (Idx, DbgValue) of inserted defs here.
void addDefsFromCopies(
- LiveInterval *LI, unsigned LocNo,
+ LiveInterval *LI, DbgVariableValue DbgValue,
const SmallVectorImpl<SlotIndex> &Kills,
- SmallVectorImpl<std::pair<SlotIndex, DbgValueLocation>> &NewDefs,
+ SmallVectorImpl<std::pair<SlotIndex, DbgVariableValue>> &NewDefs,
MachineRegisterInfo &MRI, LiveIntervals &LIS);
/// Compute the live intervals of all locations after collecting all their
@@ -330,7 +332,7 @@ public:
/// Replace OldReg ranges with NewRegs ranges where NewRegs is
/// live. Returns true if any changes were made.
- bool splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ bool splitRegister(Register OldReg, ArrayRef<Register> NewRegs,
LiveIntervals &LIS);
/// Rewrite virtual register locations according to the provided virtual
@@ -370,7 +372,7 @@ public:
: Label(label), dl(std::move(L)), loc(Idx) {}
/// Does this UserLabel match the parameters?
- bool match(const DILabel *L, const DILocation *IA,
+ bool matches(const DILabel *L, const DILocation *IA,
const SlotIndex Index) const {
return Label == L && dl->getInlinedAt() == IA && loc == Index;
}
@@ -408,16 +410,17 @@ class LDVImpl {
using VRMap = DenseMap<unsigned, UserValue *>;
VRMap virtRegToEqClass;
- /// Map user variable to eq class leader.
- using UVMap = DenseMap<const DILocalVariable *, UserValue *>;
+ /// Map to find existing UserValue instances.
+ using UVMap = DenseMap<DebugVariable, UserValue *>;
UVMap userVarMap;
/// Find or create a UserValue.
- UserValue *getUserValue(const DILocalVariable *Var, const DIExpression *Expr,
+ UserValue *getUserValue(const DILocalVariable *Var,
+ Optional<DIExpression::FragmentInfo> Fragment,
const DebugLoc &DL);
/// Find the EC leader for VirtReg or null.
- UserValue *lookupVirtReg(unsigned VirtReg);
+ UserValue *lookupVirtReg(Register VirtReg);
/// Add DBG_VALUE instruction to our maps.
///
@@ -467,10 +470,10 @@ public:
}
/// Map virtual register to an equivalence class.
- void mapVirtReg(unsigned VirtReg, UserValue *EC);
+ void mapVirtReg(Register VirtReg, UserValue *EC);
/// Replace all references to OldReg with NewRegs.
- void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs);
+ void splitRegister(Register OldReg, ArrayRef<Register> NewRegs);
/// Recreate DBG_VALUE instruction from data structures.
void emitDebugValues(VirtRegMap *VRM);
@@ -537,7 +540,9 @@ void UserValue::print(raw_ostream &OS, const TargetRegisterInfo *TRI) {
if (I.value().isUndef())
OS << "undef";
else {
- OS << I.value().locNo();
+ OS << I.value().getLocNo();
+ if (I.value().getWasIndirect())
+ OS << " ind";
}
}
for (unsigned i = 0, e = locations.size(); i != e; ++i) {
@@ -574,30 +579,27 @@ void UserValue::mapVirtRegs(LDVImpl *LDV) {
}
UserValue *LDVImpl::getUserValue(const DILocalVariable *Var,
- const DIExpression *Expr, const DebugLoc &DL) {
- UserValue *&Leader = userVarMap[Var];
- if (Leader) {
- UserValue *UV = Leader->getLeader();
- Leader = UV;
- for (; UV; UV = UV->getNext())
- if (UV->match(Var, Expr, DL->getInlinedAt()))
- return UV;
+ Optional<DIExpression::FragmentInfo> Fragment,
+ const DebugLoc &DL) {
+ // FIXME: Handle partially overlapping fragments. See
+ // https://reviews.llvm.org/D70121#1849741.
+ DebugVariable ID(Var, Fragment, DL->getInlinedAt());
+ UserValue *&UV = userVarMap[ID];
+ if (!UV) {
+ userValues.push_back(
+ std::make_unique<UserValue>(Var, Fragment, DL, allocator));
+ UV = userValues.back().get();
}
-
- userValues.push_back(
- std::make_unique<UserValue>(Var, Expr, DL, allocator));
- UserValue *UV = userValues.back().get();
- Leader = UserValue::merge(Leader, UV);
return UV;
}
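
The rewritten lookup works because DebugVariable folds the variable, its fragment, and the inlined-at location into a single key that already has a DenseMapInfo specialization, so find-or-create collapses to one map access. A hedged sketch of the same keying; the Entry type stands in for UserValue and is an assumption:

#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

struct Entry; // Placeholder for whatever per-variable object is cached.

static Entry *&lookupOrCreateSlot(DenseMap<DebugVariable, Entry *> &Map,
                                  const DILocalVariable *Var,
                                  Optional<DIExpression::FragmentInfo> Fragment,
                                  const DILocation *InlinedAt) {
  // Values that differ only in non-fragment expression parts share one slot,
  // matching the FIXME above about partially overlapping fragments.
  return Map[DebugVariable(Var, Fragment, InlinedAt)];
}
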
-void LDVImpl::mapVirtReg(unsigned VirtReg, UserValue *EC) {
+void LDVImpl::mapVirtReg(Register VirtReg, UserValue *EC) {
assert(Register::isVirtualRegister(VirtReg) && "Only map VirtRegs");
UserValue *&Leader = virtRegToEqClass[VirtReg];
Leader = UserValue::merge(Leader, EC);
}
-UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
+UserValue *LDVImpl::lookupVirtReg(Register VirtReg) {
if (UserValue *UV = virtRegToEqClass.lookup(VirtReg))
return UV->getLeader();
return nullptr;
@@ -606,8 +608,8 @@ UserValue *LDVImpl::lookupVirtReg(unsigned VirtReg) {
bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
// DBG_VALUE loc, offset, variable
if (MI.getNumOperands() != 4 ||
- !(MI.getOperand(1).isReg() || MI.getOperand(1).isImm()) ||
- !MI.getOperand(2).isMetadata()) {
+ !(MI.getDebugOffset().isReg() || MI.getDebugOffset().isImm()) ||
+ !MI.getDebugVariableOp().isMetadata()) {
LLVM_DEBUG(dbgs() << "Can't handle " << MI);
return false;
}
@@ -620,9 +622,9 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
// (and if the machine verifier is improved to catch this), then these checks
// could be removed or replaced by asserts.
bool Discard = false;
- if (MI.getOperand(0).isReg() &&
- Register::isVirtualRegister(MI.getOperand(0).getReg())) {
- const Register Reg = MI.getOperand(0).getReg();
+ if (MI.getDebugOperand(0).isReg() &&
+ Register::isVirtualRegister(MI.getDebugOperand(0).getReg())) {
+ const Register Reg = MI.getDebugOperand(0).getReg();
if (!LIS->hasInterval(Reg)) {
// The DBG_VALUE is described by a virtual register that does not have a
// live interval. Discard the DBG_VALUE.
@@ -646,18 +648,19 @@ bool LDVImpl::handleDebugValue(MachineInstr &MI, SlotIndex Idx) {
}
// Get or create the UserValue for (variable,offset) here.
- assert(!MI.getOperand(1).isImm() && "DBG_VALUE with indirect flag before "
- "LiveDebugVariables");
+ bool IsIndirect = MI.isDebugOffsetImm();
+ if (IsIndirect)
+ assert(MI.getDebugOffset().getImm() == 0 &&
+ "DBG_VALUE with nonzero offset");
const DILocalVariable *Var = MI.getDebugVariable();
const DIExpression *Expr = MI.getDebugExpression();
- UserValue *UV =
- getUserValue(Var, Expr, MI.getDebugLoc());
+ UserValue *UV = getUserValue(Var, Expr->getFragmentInfo(), MI.getDebugLoc());
if (!Discard)
- UV->addDef(Idx, MI.getOperand(0));
+ UV->addDef(Idx, MI.getDebugOperand(0), IsIndirect, *Expr);
else {
MachineOperand MO = MachineOperand::CreateReg(0U, false);
MO.setIsDebug();
- UV->addDef(Idx, MO);
+ UV->addDef(Idx, MO, false, *Expr);
}
return true;
}
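
For reference, the four-operand DBG_VALUE shape that handleDebugValue() expects is the one produced by the BuildMI overload used later in insertDebugValue(); the direct and indirect forms differ only in the offset operand. Sketch, assuming MBB, InsertPt, DL, TII, Reg, Var and Expr are in scope:

// Direct location:   DBG_VALUE $reg, $noreg, !var, !expr
BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::DBG_VALUE),
        /*IsIndirect=*/false, Reg, Var, Expr);

// Indirect location: DBG_VALUE $reg, 0, !var, !expr
// (the variable lives in memory at the address held in $reg)
BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::DBG_VALUE),
        /*IsIndirect=*/true, Reg, Var, Expr);
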
@@ -674,7 +677,7 @@ bool LDVImpl::handleDebugLabel(MachineInstr &MI, SlotIndex Idx) {
const DebugLoc &DL = MI.getDebugLoc();
bool Found = false;
for (auto const &L : userLabels) {
- if (L->match(Label, DL->getInlinedAt(), Idx)) {
+ if (L->matches(Label, DL->getInlinedAt(), Idx)) {
Found = true;
break;
}
@@ -720,7 +723,7 @@ bool LDVImpl::collectDebugValues(MachineFunction &mf) {
return Changed;
}
-void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR,
+void UserValue::extendDef(SlotIndex Idx, DbgVariableValue DbgValue, LiveRange *LR,
const VNInfo *VNI, SmallVectorImpl<SlotIndex> *Kills,
LiveIntervals &LIS) {
SlotIndex Start = Idx;
@@ -747,7 +750,7 @@ void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR,
if (I.valid() && I.start() <= Start) {
// Stop when meeting a different location or an already extended interval.
Start = Start.getNextSlot();
- if (I.value() != Loc || I.stop() != Start)
+ if (I.value() != DbgValue || I.stop() != Start)
return;
// This is a one-slot placeholder. Just skip it.
++I;
@@ -761,13 +764,13 @@ void UserValue::extendDef(SlotIndex Idx, DbgValueLocation Loc, LiveRange *LR,
Kills->push_back(Stop);
if (Start < Stop)
- I.insert(Start, Stop, Loc);
+ I.insert(Start, Stop, DbgValue);
}
void UserValue::addDefsFromCopies(
- LiveInterval *LI, unsigned LocNo,
+ LiveInterval *LI, DbgVariableValue DbgValue,
const SmallVectorImpl<SlotIndex> &Kills,
- SmallVectorImpl<std::pair<SlotIndex, DbgValueLocation>> &NewDefs,
+ SmallVectorImpl<std::pair<SlotIndex, DbgVariableValue>> &NewDefs,
MachineRegisterInfo &MRI, LiveIntervals &LIS) {
if (Kills.empty())
return;
@@ -791,11 +794,11 @@ void UserValue::addDefsFromCopies(
if (!Register::isVirtualRegister(DstReg))
continue;
- // Is LocNo extended to reach this copy? If not, another def may be blocking
- // it, or we are looking at a wrong value of LI.
+ // Is the value extended to reach this copy? If not, another def may be
+ // blocking it, or we are looking at a wrong value of LI.
SlotIndex Idx = LIS.getInstructionIndex(*MI);
LocMap::iterator I = locInts.find(Idx.getRegSlot(true));
- if (!I.valid() || I.value().locNo() != LocNo)
+ if (!I.valid() || I.value() != DbgValue)
continue;
if (!LIS.hasInterval(DstReg))
@@ -829,9 +832,9 @@ void UserValue::addDefsFromCopies(
MachineInstr *CopyMI = LIS.getInstructionFromIndex(DstVNI->def);
assert(CopyMI && CopyMI->isCopy() && "Bad copy value");
unsigned LocNo = getLocationNo(CopyMI->getOperand(0));
- DbgValueLocation NewLoc(LocNo);
- I.insert(Idx, Idx.getNextSlot(), NewLoc);
- NewDefs.push_back(std::make_pair(Idx, NewLoc));
+ DbgVariableValue NewValue = DbgValue.changeLocNo(LocNo);
+ I.insert(Idx, Idx.getNextSlot(), NewValue);
+ NewDefs.push_back(std::make_pair(Idx, NewValue));
break;
}
}
@@ -840,7 +843,7 @@ void UserValue::addDefsFromCopies(
void UserValue::computeIntervals(MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI,
LiveIntervals &LIS, LexicalScopes &LS) {
- SmallVector<std::pair<SlotIndex, DbgValueLocation>, 16> Defs;
+ SmallVector<std::pair<SlotIndex, DbgVariableValue>, 16> Defs;
// Collect all defs to be extended (Skipping undefs).
for (LocMap::const_iterator I = locInts.begin(); I.valid(); ++I)
@@ -850,11 +853,11 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
// Extend all defs, and possibly add new ones along the way.
for (unsigned i = 0; i != Defs.size(); ++i) {
SlotIndex Idx = Defs[i].first;
- DbgValueLocation Loc = Defs[i].second;
- const MachineOperand &LocMO = locations[Loc.locNo()];
+ DbgVariableValue DbgValue = Defs[i].second;
+ const MachineOperand &LocMO = locations[DbgValue.getLocNo()];
if (!LocMO.isReg()) {
- extendDef(Idx, Loc, nullptr, nullptr, nullptr, LIS);
+ extendDef(Idx, DbgValue, nullptr, nullptr, nullptr, LIS);
continue;
}
@@ -867,7 +870,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
VNI = LI->getVNInfoAt(Idx);
}
SmallVector<SlotIndex, 16> Kills;
- extendDef(Idx, Loc, LI, VNI, &Kills, LIS);
+ extendDef(Idx, DbgValue, LI, VNI, &Kills, LIS);
// FIXME: Handle sub-registers in addDefsFromCopies. The problem is that
// if the original location for example is %vreg0:sub_hi, and we find a
// full register copy in addDefsFromCopies (at the moment it only handles
@@ -877,7 +880,7 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
// sub-register in that regclass). For now, simply skip handling copies if
// a sub-register is involved.
if (LI && !LocMO.getSubReg())
- addDefsFromCopies(LI, Loc.locNo(), Kills, Defs, MRI, LIS);
+ addDefsFromCopies(LI, DbgValue, Kills, Defs, MRI, LIS);
continue;
}
@@ -910,11 +913,16 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
SlotIndex RStart = LIS.getInstructionIndex(*Range.first);
SlotIndex REnd = LIS.getInstructionIndex(*Range.second);
+ // Variable locations at the first instruction of a block should be
+ // based on the block's SlotIndex, not the first instruction's index.
+ if (Range.first == Range.first->getParent()->begin())
+ RStart = LIS.getSlotIndexes()->getIndexBefore(*Range.first);
+
// At the start of each iteration I has been advanced so that
// I.stop() >= PrevEnd. Check for overlap.
if (PrevEnd && I.start() < PrevEnd) {
SlotIndex IStop = I.stop();
- DbgValueLocation Loc = I.value();
+ DbgVariableValue DbgValue = I.value();
// Stop overlaps previous end - trim the end of the interval to
// the scope range.
@@ -922,9 +930,10 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
++I;
// If the interval also overlaps the start of the "next" (i.e.
- // current) range create a new interval for the remainder
+      // current) range, create a new interval for the remainder (which
+ // may be further trimmed).
if (RStart < IStop)
- I.insert(RStart, IStop, Loc);
+ I.insert(RStart, IStop, DbgValue);
}
// Advance I so that I.stop() >= RStart, and check for overlap.
@@ -932,6 +941,13 @@ void UserValue::computeIntervals(MachineRegisterInfo &MRI,
if (!I.valid())
return;
+ if (I.start() < RStart) {
+ // Interval start overlaps range - trim to the scope range.
+ I.setStartUnchecked(RStart);
+ // Remember that this interval was trimmed.
+ trimmedDefs.insert(RStart);
+ }
+
// The end of a lexical scope range is the last instruction in the
// range. To convert to an interval we need the index of the
// instruction after it.
@@ -1014,7 +1030,7 @@ LiveDebugVariables::~LiveDebugVariables() {
//===----------------------------------------------------------------------===//
bool
-UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
+UserValue::splitLocation(unsigned OldLocNo, ArrayRef<Register> NewRegs,
LiveIntervals& LIS) {
LLVM_DEBUG({
dbgs() << "Splitting Loc" << OldLocNo << '\t';
@@ -1044,7 +1060,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
break;
// Now LII->end > LocMapI.start(). Do we have an overlap?
- if (LocMapI.value().locNo() == OldLocNo && LII->start < LocMapI.stop()) {
+ if (LocMapI.value().getLocNo() == OldLocNo &&
+ LII->start < LocMapI.stop()) {
// Overlapping correct location. Allocate NewLocNo now.
if (NewLocNo == UndefLocNo) {
MachineOperand MO = MachineOperand::CreateReg(LI->reg, false);
@@ -1054,8 +1071,8 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
}
SlotIndex LStart = LocMapI.start();
- SlotIndex LStop = LocMapI.stop();
- DbgValueLocation OldLoc = LocMapI.value();
+ SlotIndex LStop = LocMapI.stop();
+ DbgVariableValue OldDbgValue = LocMapI.value();
// Trim LocMapI down to the LII overlap.
if (LStart < LII->start)
@@ -1064,17 +1081,17 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
LocMapI.setStopUnchecked(LII->end);
// Change the value in the overlap. This may trigger coalescing.
- LocMapI.setValue(OldLoc.changeLocNo(NewLocNo));
+ LocMapI.setValue(OldDbgValue.changeLocNo(NewLocNo));
- // Re-insert any removed OldLocNo ranges.
+ // Re-insert any removed OldDbgValue ranges.
if (LStart < LocMapI.start()) {
- LocMapI.insert(LStart, LocMapI.start(), OldLoc);
+ LocMapI.insert(LStart, LocMapI.start(), OldDbgValue);
++LocMapI;
assert(LocMapI.valid() && "Unexpected coalescing");
}
if (LStop > LocMapI.stop()) {
++LocMapI;
- LocMapI.insert(LII->end, LStop, OldLoc);
+ LocMapI.insert(LII->end, LStop, OldDbgValue);
--LocMapI;
}
}
@@ -1100,6 +1117,9 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
// register to the spill slot). So for a while we can have locations that map
// to virtual registers that have been removed from both the MachineFunction
// and from LiveIntervals.
+ //
+ // We may also just be using the location for a value with a different
+ // expression.
removeLocationIfUnused(OldLocNo);
LLVM_DEBUG({
@@ -1110,7 +1130,7 @@ UserValue::splitLocation(unsigned OldLocNo, ArrayRef<unsigned> NewRegs,
}
bool
-UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+UserValue::splitRegister(Register OldReg, ArrayRef<Register> NewRegs,
LiveIntervals &LIS) {
bool DidChange = false;
// Split locations referring to OldReg. Iterate backwards so splitLocation can
@@ -1125,7 +1145,7 @@ UserValue::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
return DidChange;
}
-void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
+void LDVImpl::splitRegister(Register OldReg, ArrayRef<Register> NewRegs) {
bool DidChange = false;
for (UserValue *UV = lookupVirtReg(OldReg); UV; UV = UV->getNext())
DidChange |= UV->splitRegister(OldReg, NewRegs, *LIS);
@@ -1140,7 +1160,7 @@ void LDVImpl::splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs) {
}
void LiveDebugVariables::
-splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs, LiveIntervals &LIS) {
+splitRegister(Register OldReg, ArrayRef<Register> NewRegs, LiveIntervals &LIS) {
if (pImpl)
static_cast<LDVImpl*>(pImpl)->splitRegister(OldReg, NewRegs);
}
@@ -1218,13 +1238,13 @@ void UserValue::rewriteLocations(VirtRegMap &VRM, const MachineFunction &MF,
// DBG_VALUE intervals with different vregs that were allocated to the same
// physical register.
for (LocMap::iterator I = locInts.begin(); I.valid(); ++I) {
- DbgValueLocation Loc = I.value();
+ DbgVariableValue DbgValue = I.value();
// Undef values don't exist in locations (and thus not in LocNoMap either)
// so skip over them. See getLocationNo().
- if (Loc.isUndef())
+ if (DbgValue.isUndef())
continue;
- unsigned NewLocNo = LocNoMap[Loc.locNo()];
- I.setValueUnchecked(Loc.changeLocNo(NewLocNo));
+ unsigned NewLocNo = LocNoMap[DbgValue.getLocNo()];
+ I.setValueUnchecked(DbgValue.changeLocNo(NewLocNo));
I.setStart(I.start());
}
}
@@ -1278,7 +1298,7 @@ findNextInsertLocation(MachineBasicBlock *MBB,
}
void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
- SlotIndex StopIdx, DbgValueLocation Loc,
+ SlotIndex StopIdx, DbgVariableValue DbgValue,
bool Spilled, unsigned SpillOffset,
LiveIntervals &LIS, const TargetInstrInfo &TII,
const TargetRegisterInfo &TRI) {
@@ -1288,12 +1308,14 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
MachineBasicBlock::iterator I = findInsertLocation(MBB, StartIdx, LIS);
// Undef values don't exist in locations so create new "noreg" register MOs
// for them. See getLocationNo().
- MachineOperand MO = !Loc.isUndef() ?
- locations[Loc.locNo()] :
- MachineOperand::CreateReg(/* Reg */ 0, /* isDef */ false, /* isImp */ false,
- /* isKill */ false, /* isDead */ false,
- /* isUndef */ false, /* isEarlyClobber */ false,
- /* SubReg */ 0, /* isDebug */ true);
+ MachineOperand MO =
+ !DbgValue.isUndef()
+ ? locations[DbgValue.getLocNo()]
+ : MachineOperand::CreateReg(
+ /* Reg */ 0, /* isDef */ false, /* isImp */ false,
+ /* isKill */ false, /* isDead */ false,
+ /* isUndef */ false, /* isEarlyClobber */ false,
+ /* SubReg */ 0, /* isDebug */ true);
++NumInsertedDebugValues;
@@ -1305,15 +1327,22 @@ void UserValue::insertDebugValue(MachineBasicBlock *MBB, SlotIndex StartIdx,
// original DBG_VALUE was indirect, we need to add DW_OP_deref to indicate
// that the original virtual register was a pointer. Also, add the stack slot
// offset for the spilled register to the expression.
- const DIExpression *Expr = Expression;
- if (Spilled)
- Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset, SpillOffset);
+ const DIExpression *Expr = DbgValue.getExpression();
+ uint8_t DIExprFlags = DIExpression::ApplyOffset;
+ bool IsIndirect = DbgValue.getWasIndirect();
+ if (Spilled) {
+ if (IsIndirect)
+ DIExprFlags |= DIExpression::DerefAfter;
+ Expr =
+ DIExpression::prepend(Expr, DIExprFlags, SpillOffset);
+ IsIndirect = true;
+ }
assert((!Spilled || MO.isFI()) && "a spilled location must be a frame index");
do {
BuildMI(*MBB, I, getDebugLoc(), TII.get(TargetOpcode::DBG_VALUE),
- Spilled, MO, Variable, Expr);
+ IsIndirect, MO, Variable, Expr);
// Continue and insert DBG_VALUES after every redefinition of register
// associated with the debug value within the range
@@ -1339,19 +1368,26 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
for (LocMap::const_iterator I = locInts.begin(); I.valid();) {
SlotIndex Start = I.start();
SlotIndex Stop = I.stop();
- DbgValueLocation Loc = I.value();
- auto SpillIt =
- !Loc.isUndef() ? SpillOffsets.find(Loc.locNo()) : SpillOffsets.end();
+ DbgVariableValue DbgValue = I.value();
+ auto SpillIt = !DbgValue.isUndef() ? SpillOffsets.find(DbgValue.getLocNo())
+ : SpillOffsets.end();
bool Spilled = SpillIt != SpillOffsets.end();
unsigned SpillOffset = Spilled ? SpillIt->second : 0;
- LLVM_DEBUG(dbgs() << "\t[" << Start << ';' << Stop << "):" << Loc.locNo());
+    // If the interval start was trimmed to the lexical scope, insert the
+ // DBG_VALUE at the previous index (otherwise it appears after the
+ // first instruction in the range).
+ if (trimmedDefs.count(Start))
+ Start = Start.getPrevIndex();
+
+ LLVM_DEBUG(dbgs() << "\t[" << Start << ';' << Stop
+ << "):" << DbgValue.getLocNo());
MachineFunction::iterator MBB = LIS.getMBBFromIndex(Start)->getIterator();
SlotIndex MBBEnd = LIS.getMBBEndIdx(&*MBB);
LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
- insertDebugValue(&*MBB, Start, Stop, Loc, Spilled, SpillOffset, LIS, TII,
- TRI);
+ insertDebugValue(&*MBB, Start, Stop, DbgValue, Spilled, SpillOffset, LIS,
+ TII, TRI);
// This interval may span multiple basic blocks.
// Insert a DBG_VALUE into each one.
while (Stop > MBBEnd) {
@@ -1361,8 +1397,8 @@ void UserValue::emitDebugValues(VirtRegMap *VRM, LiveIntervals &LIS,
break;
MBBEnd = LIS.getMBBEndIdx(&*MBB);
LLVM_DEBUG(dbgs() << ' ' << printMBBReference(*MBB) << '-' << MBBEnd);
- insertDebugValue(&*MBB, Start, Stop, Loc, Spilled, SpillOffset, LIS, TII,
- TRI);
+ insertDebugValue(&*MBB, Start, Stop, DbgValue, Spilled, SpillOffset, LIS,
+ TII, TRI);
}
LLVM_DEBUG(dbgs() << '\n');
if (MBB == MFEnd)
diff --git a/llvm/lib/CodeGen/LiveDebugVariables.h b/llvm/lib/CodeGen/LiveDebugVariables.h
index 0cbe10c6a422..74e738ec3e56 100644
--- a/llvm/lib/CodeGen/LiveDebugVariables.h
+++ b/llvm/lib/CodeGen/LiveDebugVariables.h
@@ -41,7 +41,7 @@ public:
/// splitRegister - Move any user variables in OldReg to the live ranges in
/// NewRegs where they are live. Mark the values as unavailable where no new
/// register is live.
- void splitRegister(unsigned OldReg, ArrayRef<unsigned> NewRegs,
+ void splitRegister(Register OldReg, ArrayRef<Register> NewRegs,
LiveIntervals &LIS);
/// emitDebugValues - Emit new DBG_VALUE instructions reflecting the changes
diff --git a/llvm/lib/CodeGen/LiveIntervalCalc.cpp b/llvm/lib/CodeGen/LiveIntervalCalc.cpp
new file mode 100644
index 000000000000..30c2d74a71c5
--- /dev/null
+++ b/llvm/lib/CodeGen/LiveIntervalCalc.cpp
@@ -0,0 +1,205 @@
+//===- LiveIntervalCalc.cpp - Calculate live interval --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Implementation of the LiveIntervalCalc class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/LiveIntervalCalc.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/MC/LaneBitmask.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <iterator>
+#include <tuple>
+#include <utility>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "regalloc"
+
+// Reserve an address that indicates a value that is known to be "undef".
+static VNInfo UndefVNI(0xbad, SlotIndex());
+
+static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
+ LiveRange &LR, const MachineOperand &MO) {
+ const MachineInstr &MI = *MO.getParent();
+ SlotIndex DefIdx =
+ Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
+
+ // Create the def in LR. This may find an existing def.
+ LR.createDeadDef(DefIdx, Alloc);
+}
+
+void LiveIntervalCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
+ const MachineRegisterInfo *MRI = getRegInfo();
+ SlotIndexes *Indexes = getIndexes();
+ VNInfo::Allocator *Alloc = getVNAlloc();
+
+ assert(MRI && Indexes && "call reset() first");
+
+ // Step 1: Create minimal live segments for every definition of Reg.
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // createDeadDef() will deduplicate.
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ unsigned Reg = LI.reg;
+ for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ if (!MO.isDef() && !MO.readsReg())
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (LI.hasSubRanges() || (SubReg != 0 && TrackSubRegs)) {
+ LaneBitmask SubMask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
+ : MRI->getMaxLaneMaskForVReg(Reg);
+ // If this is the first time we see a subregister def, initialize
+ // subranges by creating a copy of the main range.
+ if (!LI.hasSubRanges() && !LI.empty()) {
+ LaneBitmask ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
+ LI.createSubRangeFrom(*Alloc, ClassMask, LI);
+ }
+
+ LI.refineSubRanges(
+ *Alloc, SubMask,
+ [&MO, Indexes, Alloc](LiveInterval::SubRange &SR) {
+ if (MO.isDef())
+ createDeadDef(*Indexes, *Alloc, SR, MO);
+ },
+ *Indexes, TRI);
+ }
+
+ // Create the def in the main liverange. We do not have to do this if
+ // subranges are tracked as we recreate the main range later in this case.
+ if (MO.isDef() && !LI.hasSubRanges())
+ createDeadDef(*Indexes, *Alloc, LI, MO);
+ }
+
+  // We may have created empty live ranges for partially undefined uses; we
+ // can't keep them because we won't find defs in them later.
+ LI.removeEmptySubRanges();
+
+ const MachineFunction *MF = getMachineFunction();
+ MachineDominatorTree *DomTree = getDomTree();
+ // Step 2: Extend live segments to all uses, constructing SSA form as
+ // necessary.
+ if (LI.hasSubRanges()) {
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ LiveIntervalCalc SubLIC;
+ SubLIC.reset(MF, Indexes, DomTree, Alloc);
+ SubLIC.extendToUses(S, Reg, S.LaneMask, &LI);
+ }
+ LI.clear();
+ constructMainRangeFromSubranges(LI);
+ } else {
+ resetLiveOutMap();
+ extendToUses(LI, Reg, LaneBitmask::getAll());
+ }
+}
+
+void LiveIntervalCalc::constructMainRangeFromSubranges(LiveInterval &LI) {
+ // First create dead defs at all defs found in subranges.
+ LiveRange &MainRange = LI;
+ assert(MainRange.segments.empty() && MainRange.valnos.empty() &&
+ "Expect empty main liverange");
+
+ VNInfo::Allocator *Alloc = getVNAlloc();
+ for (const LiveInterval::SubRange &SR : LI.subranges()) {
+ for (const VNInfo *VNI : SR.valnos) {
+ if (!VNI->isUnused() && !VNI->isPHIDef())
+ MainRange.createDeadDef(VNI->def, *Alloc);
+ }
+ }
+ resetLiveOutMap();
+ extendToUses(MainRange, LI.reg, LaneBitmask::getAll(), &LI);
+}
+
+void LiveIntervalCalc::createDeadDefs(LiveRange &LR, Register Reg) {
+ const MachineRegisterInfo *MRI = getRegInfo();
+ SlotIndexes *Indexes = getIndexes();
+ VNInfo::Allocator *Alloc = getVNAlloc();
+ assert(MRI && Indexes && "call reset() first");
+
+ // Visit all def operands. If the same instruction has multiple defs of Reg,
+ // LR.createDeadDef() will deduplicate.
+ for (MachineOperand &MO : MRI->def_operands(Reg))
+ createDeadDef(*Indexes, *Alloc, LR, MO);
+}
+
+void LiveIntervalCalc::extendToUses(LiveRange &LR, Register Reg,
+ LaneBitmask Mask, LiveInterval *LI) {
+ const MachineRegisterInfo *MRI = getRegInfo();
+ SlotIndexes *Indexes = getIndexes();
+ SmallVector<SlotIndex, 4> Undefs;
+ if (LI != nullptr)
+ LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes);
+
+ // Visit all operands that read Reg. This may include partial defs.
+ bool IsSubRange = !Mask.all();
+ const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
+ for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
+ // Clear all kill flags. They will be reinserted after register allocation
+ // by LiveIntervals::addKillFlags().
+ if (MO.isUse())
+ MO.setIsKill(false);
+ // MO::readsReg returns "true" for subregister defs. This is for keeping
+ // liveness of the entire register (i.e. for the main range of the live
+ // interval). For subranges, definitions of non-overlapping subregisters
+ // do not count as uses.
+ if (!MO.readsReg() || (IsSubRange && MO.isDef()))
+ continue;
+
+ unsigned SubReg = MO.getSubReg();
+ if (SubReg != 0) {
+ LaneBitmask SLM = TRI.getSubRegIndexLaneMask(SubReg);
+ if (MO.isDef())
+ SLM = ~SLM;
+ // Ignore uses not reading the current (sub)range.
+ if ((SLM & Mask).none())
+ continue;
+ }
+
+ // Determine the actual place of the use.
+ const MachineInstr *MI = MO.getParent();
+ unsigned OpNo = (&MO - &MI->getOperand(0));
+ SlotIndex UseIdx;
+ if (MI->isPHI()) {
+ assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
+ // The actual place where a phi operand is used is the end of the pred
+ // MBB. PHI operands are paired: (Reg, PredMBB).
+ UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo + 1).getMBB());
+ } else {
+ // Check for early-clobber redefs.
+ bool isEarlyClobber = false;
+ unsigned DefIdx;
+ if (MO.isDef())
+ isEarlyClobber = MO.isEarlyClobber();
+ else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
+ // FIXME: This would be a lot easier if tied early-clobber uses also
+ // had an early-clobber flag.
+ isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
+ }
+ UseIdx = Indexes->getInstructionIndex(*MI).getRegSlot(isEarlyClobber);
+ }
+
+ // MI is reading Reg. We may have visited MI before if it happens to be
+  // reading Reg multiple times. That is OK; extend() is idempotent.
+ extend(LR, UseIdx, Reg, Undefs);
+ }
+}
\ No newline at end of file
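
A minimal sketch of how a client is expected to drive this class; it mirrors the LiveIntervals::computeVirtRegInterval() change in the hunks below and assumes it runs inside LiveIntervals where these accessors exist:

// reset() wires up the function-level analyses, calculate() does the work.
LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
LICalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
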
diff --git a/llvm/lib/CodeGen/LiveIntervals.cpp b/llvm/lib/CodeGen/LiveIntervals.cpp
index 9c80282bc59e..e8ee0599e1a2 100644
--- a/llvm/lib/CodeGen/LiveIntervals.cpp
+++ b/llvm/lib/CodeGen/LiveIntervals.cpp
@@ -21,7 +21,7 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
-#include "llvm/CodeGen/LiveRangeCalc.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -101,9 +101,7 @@ LiveIntervals::LiveIntervals() : MachineFunctionPass(ID) {
initializeLiveIntervalsPass(*PassRegistry::getPassRegistry());
}
-LiveIntervals::~LiveIntervals() {
- delete LRCalc;
-}
+LiveIntervals::~LiveIntervals() { delete LICalc; }
void LiveIntervals::releaseMemory() {
// Free the live intervals themselves.
@@ -131,8 +129,8 @@ bool LiveIntervals::runOnMachineFunction(MachineFunction &fn) {
Indexes = &getAnalysis<SlotIndexes>();
DomTree = &getAnalysis<MachineDominatorTree>();
- if (!LRCalc)
- LRCalc = new LiveRangeCalc();
+ if (!LICalc)
+ LICalc = new LiveIntervalCalc();
// Allocate space for all virtual registers.
VirtRegIntervals.resize(MRI->getNumVirtRegs());
@@ -192,10 +190,10 @@ LiveInterval* LiveIntervals::createInterval(unsigned reg) {
/// Compute the live interval of a virtual register, based on defs and uses.
bool LiveIntervals::computeVirtRegInterval(LiveInterval &LI) {
- assert(LRCalc && "LRCalc not initialized.");
+ assert(LICalc && "LICalc not initialized.");
assert(LI.empty() && "Should only compute empty intervals.");
- LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
- LRCalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LICalc->calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));
return computeDeadValues(LI, nullptr);
}
@@ -266,8 +264,8 @@ void LiveIntervals::computeRegMasks() {
/// aliasing registers. The range should be empty, or contain only dead
/// phi-defs from ABI blocks.
void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
- assert(LRCalc && "LRCalc not initialized.");
- LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ assert(LICalc && "LICalc not initialized.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
// The physregs aliasing Unit are the roots and their super-registers.
// Create all values as dead defs before extending to uses. Note that roots
@@ -281,7 +279,7 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
Super.isValid(); ++Super) {
unsigned Reg = *Super;
if (!MRI->reg_empty(Reg))
- LRCalc->createDeadDefs(LR, Reg);
+ LICalc->createDeadDefs(LR, Reg);
// A register unit is considered reserved if all its roots and all their
// super registers are reserved.
if (!MRI->isReserved(Reg))
@@ -300,7 +298,7 @@ void LiveIntervals::computeRegUnitRange(LiveRange &LR, unsigned Unit) {
Super.isValid(); ++Super) {
unsigned Reg = *Super;
if (!MRI->reg_empty(Reg))
- LRCalc->extendToUses(LR, Reg);
+ LICalc->extendToUses(LR, Reg);
}
}
}
@@ -623,10 +621,10 @@ void LiveIntervals::shrinkToUses(LiveInterval::SubRange &SR, unsigned Reg) {
void LiveIntervals::extendToIndices(LiveRange &LR,
ArrayRef<SlotIndex> Indices,
ArrayRef<SlotIndex> Undefs) {
- assert(LRCalc && "LRCalc not initialized.");
- LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ assert(LICalc && "LICalc not initialized.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
for (SlotIndex Idx : Indices)
- LRCalc->extend(LR, Idx, /*PhysReg=*/0, Undefs);
+ LICalc->extend(LR, Idx, /*PhysReg=*/0, Undefs);
}
void LiveIntervals::pruneValue(LiveRange &LR, SlotIndex Kill,
@@ -1013,6 +1011,20 @@ public:
}
}
updateRange(LI, Reg, LaneBitmask::getNone());
+      // If the main range has a hole and we are moving a subrange use across
+      // the hole, updateRange() cannot handle it properly, since it only gets
+      // the LiveRange and not the whole LiveInterval. As a result we may end
+      // up with a main range that does not cover all subranges. This is an
+      // extremely rare case, so check for it and reconstruct the main range.
+ for (LiveInterval::SubRange &S : LI.subranges()) {
+ if (LI.covers(S))
+ continue;
+ LI.clear();
+ LIS.constructMainRangeFromSubranges(LI);
+ break;
+ }
+
continue;
}
@@ -1344,7 +1356,7 @@ private:
OldIdxOut->start = NewIdxDef;
OldIdxVNI->def = NewIdxDef;
if (OldIdxIn != E && SlotIndex::isEarlierInstr(NewIdx, OldIdxIn->end))
- OldIdxIn->end = NewIdx.getRegSlot();
+ OldIdxIn->end = NewIdxDef;
}
} else if (OldIdxIn != E
&& SlotIndex::isEarlierInstr(NewIdxOut->start, NewIdx)
@@ -1480,13 +1492,43 @@ void LiveIntervals::handleMove(MachineInstr &MI, bool UpdateFlags) {
HME.updateAllRanges(&MI);
}
-void LiveIntervals::handleMoveIntoBundle(MachineInstr &MI,
- MachineInstr &BundleStart,
- bool UpdateFlags) {
- SlotIndex OldIndex = Indexes->getInstructionIndex(MI);
- SlotIndex NewIndex = Indexes->getInstructionIndex(BundleStart);
- HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
- HME.updateAllRanges(&MI);
+void LiveIntervals::handleMoveIntoNewBundle(MachineInstr &BundleStart,
+ bool UpdateFlags) {
+ assert((BundleStart.getOpcode() == TargetOpcode::BUNDLE) &&
+ "Bundle start is not a bundle");
+ SmallVector<SlotIndex, 16> ToProcess;
+ const SlotIndex NewIndex = Indexes->insertMachineInstrInMaps(BundleStart);
+ auto BundleEnd = getBundleEnd(BundleStart.getIterator());
+
+ auto I = BundleStart.getIterator();
+ I++;
+ while (I != BundleEnd) {
+ if (!Indexes->hasIndex(*I))
+ continue;
+ SlotIndex OldIndex = Indexes->getInstructionIndex(*I, true);
+ ToProcess.push_back(OldIndex);
+ Indexes->removeMachineInstrFromMaps(*I, true);
+ I++;
+ }
+ for (SlotIndex OldIndex : ToProcess) {
+ HMEditor HME(*this, *MRI, *TRI, OldIndex, NewIndex, UpdateFlags);
+ HME.updateAllRanges(&BundleStart);
+ }
+
+ // Fix up dead defs
+ const SlotIndex Index = getInstructionIndex(BundleStart);
+ for (unsigned Idx = 0, E = BundleStart.getNumOperands(); Idx != E; ++Idx) {
+ MachineOperand &MO = BundleStart.getOperand(Idx);
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isVirtual() && hasInterval(Reg) && !MO.isUndef()) {
+ LiveInterval &LI = getInterval(Reg);
+ LiveQueryResult LRQ = LI.Query(Index);
+ if (LRQ.isDeadDef())
+ MO.setIsDead();
+ }
+ }
}
void LiveIntervals::repairOldRegInRange(const MachineBasicBlock::iterator Begin,
@@ -1587,7 +1629,7 @@ void
LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
MachineBasicBlock::iterator Begin,
MachineBasicBlock::iterator End,
- ArrayRef<unsigned> OrigRegs) {
+ ArrayRef<Register> OrigRegs) {
// Find anchor points, which are at the beginning/end of blocks or at
// instructions that already have indexes.
while (Begin != MBB->begin() && !Indexes->hasIndex(*Begin))
@@ -1618,8 +1660,8 @@ LiveIntervals::repairIntervalsInRange(MachineBasicBlock *MBB,
}
}
- for (unsigned Reg : OrigRegs) {
- if (!Register::isVirtualRegister(Reg))
+ for (Register Reg : OrigRegs) {
+ if (!Reg.isVirtual())
continue;
LiveInterval &LI = getInterval(Reg);
@@ -1678,7 +1720,7 @@ void LiveIntervals::splitSeparateComponents(LiveInterval &LI,
}
void LiveIntervals::constructMainRangeFromSubranges(LiveInterval &LI) {
- assert(LRCalc && "LRCalc not initialized.");
- LRCalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
- LRCalc->constructMainRangeFromSubranges(LI);
+ assert(LICalc && "LICalc not initialized.");
+ LICalc->reset(MF, getSlotIndexes(), DomTree, &getVNInfoAllocator());
+ LICalc->constructMainRangeFromSubranges(LI);
}
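The bundle hook also changes shape above: instead of patching ranges for a single instruction moved into an existing bundle, handleMoveIntoNewBundle takes the freshly created BUNDLE header and folds the slot indexes of every bundled instruction into it. A minimal sketch of how a bundling pass might drive the new hook, assuming finalizeBundle() from MachineInstrBundle.h inserts the BUNDLE header immediately before FirstMI (the function and iterator names here are placeholders, not part of this patch):

#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include <iterator>

// Bundle [FirstMI, LastMI) and let LiveIntervals re-home the bundled
// instructions' slot indexes onto the new BUNDLE header.
static void bundleWithLiveness(llvm::LiveIntervals &LIS,
                               llvm::MachineBasicBlock &MBB,
                               llvm::MachineBasicBlock::instr_iterator FirstMI,
                               llvm::MachineBasicBlock::instr_iterator LastMI) {
  llvm::finalizeBundle(MBB, FirstMI, LastMI);       // emits the BUNDLE header
  llvm::MachineInstr &Header = *std::prev(FirstMI); // header precedes FirstMI
  LIS.handleMoveIntoNewBundle(Header, /*UpdateFlags=*/true);
}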
diff --git a/llvm/lib/CodeGen/LivePhysRegs.cpp b/llvm/lib/CodeGen/LivePhysRegs.cpp
index 7a5cffca3470..547970e7ab5d 100644
--- a/llvm/lib/CodeGen/LivePhysRegs.cpp
+++ b/llvm/lib/CodeGen/LivePhysRegs.cpp
@@ -276,6 +276,7 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
const MachineFunction &MF = *MBB.getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
// We walk through the block backwards and start with the live outs.
LivePhysRegs LiveRegs;
@@ -294,6 +295,18 @@ void llvm::recomputeLivenessFlags(MachineBasicBlock &MBB) {
assert(Register::isPhysicalRegister(Reg));
bool IsNotLive = LiveRegs.available(MRI, Reg);
+
+ // Special-case return instructions for cases when a return is not
+ // the last instruction in the block.
+ if (MI.isReturn() && MFI.isCalleeSavedInfoValid()) {
+ for (const CalleeSavedInfo &Info : MFI.getCalleeSavedInfo()) {
+ if (Info.getReg() == Reg) {
+ IsNotLive = !Info.isRestored();
+ break;
+ }
+ }
+ }
+
MO->setIsDead(IsNotLive);
}
diff --git a/llvm/lib/CodeGen/LiveRangeCalc.cpp b/llvm/lib/CodeGen/LiveRangeCalc.cpp
index 24b57be0da00..e9c9b70d29a9 100644
--- a/llvm/lib/CodeGen/LiveRangeCalc.cpp
+++ b/llvm/lib/CodeGen/LiveRangeCalc.cpp
@@ -1,4 +1,4 @@
-//===- LiveRangeCalc.cpp - Calculate live ranges --------------------------===//
+//===- LiveRangeCalc.cpp - Calculate live ranges -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -61,158 +61,6 @@ void LiveRangeCalc::reset(const MachineFunction *mf,
LiveIn.clear();
}
-static void createDeadDef(SlotIndexes &Indexes, VNInfo::Allocator &Alloc,
- LiveRange &LR, const MachineOperand &MO) {
- const MachineInstr &MI = *MO.getParent();
- SlotIndex DefIdx =
- Indexes.getInstructionIndex(MI).getRegSlot(MO.isEarlyClobber());
-
- // Create the def in LR. This may find an existing def.
- LR.createDeadDef(DefIdx, Alloc);
-}
-
-void LiveRangeCalc::calculate(LiveInterval &LI, bool TrackSubRegs) {
- assert(MRI && Indexes && "call reset() first");
-
- // Step 1: Create minimal live segments for every definition of Reg.
- // Visit all def operands. If the same instruction has multiple defs of Reg,
- // createDeadDef() will deduplicate.
- const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
- unsigned Reg = LI.reg;
- for (const MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
- if (!MO.isDef() && !MO.readsReg())
- continue;
-
- unsigned SubReg = MO.getSubReg();
- if (LI.hasSubRanges() || (SubReg != 0 && TrackSubRegs)) {
- LaneBitmask SubMask = SubReg != 0 ? TRI.getSubRegIndexLaneMask(SubReg)
- : MRI->getMaxLaneMaskForVReg(Reg);
- // If this is the first time we see a subregister def, initialize
- // subranges by creating a copy of the main range.
- if (!LI.hasSubRanges() && !LI.empty()) {
- LaneBitmask ClassMask = MRI->getMaxLaneMaskForVReg(Reg);
- LI.createSubRangeFrom(*Alloc, ClassMask, LI);
- }
-
- LI.refineSubRanges(*Alloc, SubMask,
- [&MO, this](LiveInterval::SubRange &SR) {
- if (MO.isDef())
- createDeadDef(*Indexes, *Alloc, SR, MO);
- },
- *Indexes, TRI);
- }
-
- // Create the def in the main liverange. We do not have to do this if
- // subranges are tracked as we recreate the main range later in this case.
- if (MO.isDef() && !LI.hasSubRanges())
- createDeadDef(*Indexes, *Alloc, LI, MO);
- }
-
- // We may have created empty live ranges for partially undefined uses, we
- // can't keep them because we won't find defs in them later.
- LI.removeEmptySubRanges();
-
- // Step 2: Extend live segments to all uses, constructing SSA form as
- // necessary.
- if (LI.hasSubRanges()) {
- for (LiveInterval::SubRange &S : LI.subranges()) {
- LiveRangeCalc SubLRC;
- SubLRC.reset(MF, Indexes, DomTree, Alloc);
- SubLRC.extendToUses(S, Reg, S.LaneMask, &LI);
- }
- LI.clear();
- constructMainRangeFromSubranges(LI);
- } else {
- resetLiveOutMap();
- extendToUses(LI, Reg, LaneBitmask::getAll());
- }
-}
-
-void LiveRangeCalc::constructMainRangeFromSubranges(LiveInterval &LI) {
- // First create dead defs at all defs found in subranges.
- LiveRange &MainRange = LI;
- assert(MainRange.segments.empty() && MainRange.valnos.empty() &&
- "Expect empty main liverange");
-
- for (const LiveInterval::SubRange &SR : LI.subranges()) {
- for (const VNInfo *VNI : SR.valnos) {
- if (!VNI->isUnused() && !VNI->isPHIDef())
- MainRange.createDeadDef(VNI->def, *Alloc);
- }
- }
- resetLiveOutMap();
- extendToUses(MainRange, LI.reg, LaneBitmask::getAll(), &LI);
-}
-
-void LiveRangeCalc::createDeadDefs(LiveRange &LR, unsigned Reg) {
- assert(MRI && Indexes && "call reset() first");
-
- // Visit all def operands. If the same instruction has multiple defs of Reg,
- // LR.createDeadDef() will deduplicate.
- for (MachineOperand &MO : MRI->def_operands(Reg))
- createDeadDef(*Indexes, *Alloc, LR, MO);
-}
-
-void LiveRangeCalc::extendToUses(LiveRange &LR, unsigned Reg, LaneBitmask Mask,
- LiveInterval *LI) {
- SmallVector<SlotIndex, 4> Undefs;
- if (LI != nullptr)
- LI->computeSubRangeUndefs(Undefs, Mask, *MRI, *Indexes);
-
- // Visit all operands that read Reg. This may include partial defs.
- bool IsSubRange = !Mask.all();
- const TargetRegisterInfo &TRI = *MRI->getTargetRegisterInfo();
- for (MachineOperand &MO : MRI->reg_nodbg_operands(Reg)) {
- // Clear all kill flags. They will be reinserted after register allocation
- // by LiveIntervals::addKillFlags().
- if (MO.isUse())
- MO.setIsKill(false);
- // MO::readsReg returns "true" for subregister defs. This is for keeping
- // liveness of the entire register (i.e. for the main range of the live
- // interval). For subranges, definitions of non-overlapping subregisters
- // do not count as uses.
- if (!MO.readsReg() || (IsSubRange && MO.isDef()))
- continue;
-
- unsigned SubReg = MO.getSubReg();
- if (SubReg != 0) {
- LaneBitmask SLM = TRI.getSubRegIndexLaneMask(SubReg);
- if (MO.isDef())
- SLM = ~SLM;
- // Ignore uses not reading the current (sub)range.
- if ((SLM & Mask).none())
- continue;
- }
-
- // Determine the actual place of the use.
- const MachineInstr *MI = MO.getParent();
- unsigned OpNo = (&MO - &MI->getOperand(0));
- SlotIndex UseIdx;
- if (MI->isPHI()) {
- assert(!MO.isDef() && "Cannot handle PHI def of partial register.");
- // The actual place where a phi operand is used is the end of the pred
- // MBB. PHI operands are paired: (Reg, PredMBB).
- UseIdx = Indexes->getMBBEndIdx(MI->getOperand(OpNo+1).getMBB());
- } else {
- // Check for early-clobber redefs.
- bool isEarlyClobber = false;
- unsigned DefIdx;
- if (MO.isDef())
- isEarlyClobber = MO.isEarlyClobber();
- else if (MI->isRegTiedToDefOperand(OpNo, &DefIdx)) {
- // FIXME: This would be a lot easier if tied early-clobber uses also
- // had an early-clobber flag.
- isEarlyClobber = MI->getOperand(DefIdx).isEarlyClobber();
- }
- UseIdx = Indexes->getInstructionIndex(*MI).getRegSlot(isEarlyClobber);
- }
-
- // MI is reading Reg. We may have visited MI before if it happens to be
- // reading Reg multiple times. That is OK, extend() is idempotent.
- extend(LR, UseIdx, Reg, Undefs);
- }
-}
-
void LiveRangeCalc::updateFromLiveIns() {
LiveRangeUpdater Updater;
for (const LiveInBlock &I : LiveIn) {
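Everything removed here (calculate, constructMainRangeFromSubranges, createDeadDefs, extendToUses) is the virtual-register-specific half of LiveRangeCalc; per the LiveIntervals hunk above it now sits behind a LiveIntervalCalc object. A rough sketch of the resulting split, assuming LiveIntervalCalc extends LiveRangeCalc (that relationship is inferred from the renames and is not shown in this hunk):

// Whole-interval computation for a virtual register now goes through
// LiveIntervalCalc, mirroring LiveIntervals::computeVirtRegInterval above.
LiveIntervalCalc LICalc;
LICalc.reset(MF, Indexes, DomTree, &VNInfoAlloc);
LICalc.calculate(LI, MRI->shouldTrackSubRegLiveness(LI.reg));

// The generic per-use machinery (extend, live-in propagation) stays in
// LiveRangeCalc and remains usable on plain LiveRanges.
LiveRangeCalc LRCalc;
LRCalc.reset(MF, Indexes, DomTree, &VNInfoAlloc);
LRCalc.extend(LR, UseIdx, /*PhysReg=*/0, /*Undefs=*/{});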
diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp
index 34bac082bcd7..9de77c19a23a 100644
--- a/llvm/lib/CodeGen/LiveRangeEdit.cpp
+++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp
@@ -12,6 +12,7 @@
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -30,7 +31,7 @@ STATISTIC(NumFracRanges, "Number of live ranges fractured by DCE");
void LiveRangeEdit::Delegate::anchor() { }
-LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg,
+LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(Register OldReg,
bool createSubRanges) {
Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
if (VRM)
@@ -51,7 +52,7 @@ LiveInterval &LiveRangeEdit::createEmptyIntervalFrom(unsigned OldReg,
return LI;
}
-unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
+Register LiveRangeEdit::createFrom(Register OldReg) {
Register VReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
if (VRM) {
VRM->setIsSplitFromReg(VReg, VRM->getOriginal(OldReg));
@@ -69,7 +70,7 @@ unsigned LiveRangeEdit::createFrom(unsigned OldReg) {
bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
const MachineInstr *DefMI,
- AliasAnalysis *aa) {
+ AAResults *aa) {
assert(DefMI && "Missing instruction");
ScannedRemattable = true;
if (!TII.isTriviallyReMaterializable(*DefMI, aa))
@@ -78,7 +79,7 @@ bool LiveRangeEdit::checkRematerializable(VNInfo *VNI,
return true;
}
-void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
+void LiveRangeEdit::scanRemattable(AAResults *aa) {
for (VNInfo *VNI : getParent().valnos) {
if (VNI->isUnused())
continue;
@@ -95,7 +96,7 @@ void LiveRangeEdit::scanRemattable(AliasAnalysis *aa) {
ScannedRemattable = true;
}
-bool LiveRangeEdit::anyRematerializable(AliasAnalysis *aa) {
+bool LiveRangeEdit::anyRematerializable(AAResults *aa) {
if (!ScannedRemattable)
scanRemattable(aa);
return !Remattable.empty();
@@ -177,7 +178,7 @@ SlotIndex LiveRangeEdit::rematerializeAt(MachineBasicBlock &MBB,
return LIS.getSlotIndexes()->insertMachineInstrInMaps(*MI, Late).getRegSlot();
}
-void LiveRangeEdit::eraseVirtReg(unsigned Reg) {
+void LiveRangeEdit::eraseVirtReg(Register Reg) {
if (TheDelegate && TheDelegate->LRE_CanEraseVirtReg(Reg))
LIS.removeInterval(Reg);
}
@@ -231,7 +232,8 @@ bool LiveRangeEdit::foldAsLoad(LiveInterval *LI,
return false;
LLVM_DEBUG(dbgs() << " folded: " << *FoldMI);
LIS.ReplaceMachineInstrInMaps(*UseMI, *FoldMI);
- if (UseMI->isCall())
+ // Update the call site info.
+ if (UseMI->shouldUpdateCallSiteInfo())
UseMI->getMF()->moveCallSiteInfo(UseMI, FoldMI);
UseMI->eraseFromParent();
DefMI->addRegisterDead(LI->reg, nullptr);
@@ -258,7 +260,7 @@ bool LiveRangeEdit::useIsKill(const LiveInterval &LI,
/// Find all live intervals that need to shrink, then remove the instruction.
void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
- AliasAnalysis *AA) {
+ AAResults *AA) {
assert(MI->allDefsAreDead() && "Def isn't really dead");
SlotIndex Idx = LIS.getInstructionIndex(*MI).getRegSlot();
@@ -381,7 +383,7 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
// Erase any virtregs that are now empty and unused. There may be <undef>
// uses around. Keep the empty live range in that case.
for (unsigned i = 0, e = RegsToErase.size(); i != e; ++i) {
- unsigned Reg = RegsToErase[i];
+ Register Reg = RegsToErase[i];
if (LIS.hasInterval(Reg) && MRI.reg_nodbg_empty(Reg)) {
ToShrink.remove(&LIS.getInterval(Reg));
eraseVirtReg(Reg);
@@ -390,8 +392,8 @@ void LiveRangeEdit::eliminateDeadDef(MachineInstr *MI, ToShrinkSet &ToShrink,
}
void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
- ArrayRef<unsigned> RegsBeingSpilled,
- AliasAnalysis *AA) {
+ ArrayRef<Register> RegsBeingSpilled,
+ AAResults *AA) {
ToShrinkSet ToShrink;
for (;;) {
@@ -450,8 +452,7 @@ void LiveRangeEdit::eliminateDeadDefs(SmallVectorImpl<MachineInstr *> &Dead,
// Keep track of new virtual registers created via
// MachineRegisterInfo::createVirtualRegister.
void
-LiveRangeEdit::MRI_NoteNewVirtualRegister(unsigned VReg)
-{
+LiveRangeEdit::MRI_NoteNewVirtualRegister(Register VReg) {
if (VRM)
VRM->grow();
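The LiveRangeEdit entry points switch from unsigned to Register and from the AliasAnalysis typedef to the underlying AAResults type, and foldAsLoad now keys the call-site-info move on shouldUpdateCallSiteInfo(). A small caller-side sketch with the new types (the helper and variable names below are illustrative only):

// LRE is an existing LiveRangeEdit, AA an AAResults* (may be null).
llvm::SmallVector<llvm::MachineInstr *, 8> Dead;
collectDeadInstructions(Dead);                    // hypothetical helper
llvm::SmallVector<llvm::Register, 4> Spilled = {OrigReg};
LRE.eliminateDeadDefs(Dead, Spilled, AA);         // ArrayRef<Register>, AAResults*
llvm::Register NewVReg = LRE.createFrom(OrigReg); // now returns Register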
diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp
index 2ebc8d7576d1..26439a656917 100644
--- a/llvm/lib/CodeGen/LiveRangeShrink.cpp
+++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp
@@ -234,8 +234,7 @@ bool LiveRangeShrink::runOnMachineFunction(MachineFunction &MF) {
MachineBasicBlock::iterator EndIter = std::next(MI.getIterator());
if (MI.getOperand(0).isReg())
for (; EndIter != MBB.end() && EndIter->isDebugValue() &&
- EndIter->getOperand(0).isReg() &&
- EndIter->getOperand(0).getReg() == MI.getOperand(0).getReg();
+ EndIter->getDebugOperandForReg(MI.getOperand(0).getReg());
++EndIter, ++Next)
IOM[&*EndIter] = NewOrder;
MBB.splice(I, &MBB, MI.getIterator(), EndIter);
diff --git a/llvm/lib/CodeGen/LiveVariables.cpp b/llvm/lib/CodeGen/LiveVariables.cpp
index 9bd55c6f750f..6610491dd111 100644
--- a/llvm/lib/CodeGen/LiveVariables.cpp
+++ b/llvm/lib/CodeGen/LiveVariables.cpp
@@ -806,3 +806,31 @@ void LiveVariables::addNewBlock(MachineBasicBlock *BB,
VI.AliveBlocks.set(NumNew);
}
}
+
+/// addNewBlock - Add a new basic block BB as an empty successor to DomBB. All
+/// variables that are live out of DomBB will be marked as passing live through
+/// BB. LiveInSets[BB] is *not* updated (because it is not needed during
+/// PHIElimination).
+void LiveVariables::addNewBlock(MachineBasicBlock *BB,
+ MachineBasicBlock *DomBB,
+ MachineBasicBlock *SuccBB,
+ std::vector<SparseBitVector<>> &LiveInSets) {
+ const unsigned NumNew = BB->getNumber();
+
+ SparseBitVector<> &BV = LiveInSets[SuccBB->getNumber()];
+ for (auto R = BV.begin(), E = BV.end(); R != E; R++) {
+ unsigned VirtReg = Register::index2VirtReg(*R);
+ LiveVariables::VarInfo &VI = getVarInfo(VirtReg);
+ VI.AliveBlocks.set(NumNew);
+ }
+ // All registers used by PHI nodes in SuccBB must be live through BB.
+ for (MachineBasicBlock::iterator BBI = SuccBB->begin(),
+ BBE = SuccBB->end();
+ BBI != BBE && BBI->isPHI(); ++BBI) {
+ for (unsigned i = 1, e = BBI->getNumOperands(); i != e; i += 2)
+ if (BBI->getOperand(i + 1).getMBB() == BB &&
+ BBI->getOperand(i).readsReg())
+ getVarInfo(BBI->getOperand(i).getReg())
+ .AliveBlocks.set(NumNew);
+ }
+}
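The new addNewBlock overload propagates liveness into a freshly inserted block from precomputed per-block live-in bit vectors instead of recomputing them, which is what PHI elimination wants when it splits many edges in a row. A sketch of the intended call shape, assuming the caller has already built the vectors (indexed by block number, holding virtual-register indexes):

// NewBB was just inserted on the PredBB -> SuccBB edge and is still empty.
std::vector<llvm::SparseBitVector<>> LiveInSets(MF.getNumBlockIDs());
// ... fill LiveInSets[B->getNumber()] for every block B beforehand ...
LV.addNewBlock(NewBB, /*DomBB=*/PredBB, /*SuccBB=*/SuccBB, LiveInSets);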
diff --git a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
index 5022726dc70a..6c5ef0255a08 100644
--- a/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
+++ b/llvm/lib/CodeGen/LocalStackSlotAllocation.cpp
@@ -79,11 +79,11 @@ namespace {
using StackObjSet = SmallSetVector<int, 8>;
void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx, int64_t &Offset,
- bool StackGrowsDown, unsigned &MaxAlign);
+ bool StackGrowsDown, Align &MaxAlign);
void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
SmallSet<int, 16> &ProtectedObjs,
MachineFrameInfo &MFI, bool StackGrowsDown,
- int64_t &Offset, unsigned &MaxAlign);
+ int64_t &Offset, Align &MaxAlign);
void calculateFrameObjectOffsets(MachineFunction &Fn);
bool insertFrameReferenceRegisters(MachineFunction &Fn);
@@ -140,22 +140,21 @@ bool LocalStackSlotPass::runOnMachineFunction(MachineFunction &MF) {
}
/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
-void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI,
- int FrameIdx, int64_t &Offset,
- bool StackGrowsDown,
- unsigned &MaxAlign) {
+void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+ int64_t &Offset, bool StackGrowsDown,
+ Align &MaxAlign) {
// If the stack grows down, add the object size to find the lowest address.
if (StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
+ Align Alignment = MFI.getObjectAlign(FrameIdx);
// If the alignment of this object is greater than that of the stack, then
// increase the stack alignment to match.
- MaxAlign = std::max(MaxAlign, Align);
+ MaxAlign = std::max(MaxAlign, Alignment);
// Adjust to alignment boundary.
- Offset = (Offset + Align - 1) / Align * Align;
+ Offset = alignTo(Offset, Alignment);
int64_t LocalOffset = StackGrowsDown ? -Offset : Offset;
LLVM_DEBUG(dbgs() << "Allocate FI(" << FrameIdx << ") to local offset "
@@ -173,11 +172,10 @@ void LocalStackSlotPass::AdjustStackOffset(MachineFrameInfo &MFI,
/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
/// those required to be close to the Stack Protector) to stack offsets.
-void LocalStackSlotPass::AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
- SmallSet<int, 16> &ProtectedObjs,
- MachineFrameInfo &MFI,
- bool StackGrowsDown, int64_t &Offset,
- unsigned &MaxAlign) {
+void LocalStackSlotPass::AssignProtectedObjSet(
+ const StackObjSet &UnassignedObjs, SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo &MFI, bool StackGrowsDown, int64_t &Offset,
+ Align &MaxAlign) {
for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
E = UnassignedObjs.end(); I != E; ++I) {
int i = *I;
@@ -195,7 +193,7 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
bool StackGrowsDown =
TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown;
int64_t Offset = 0;
- unsigned MaxAlign = 0;
+ Align MaxAlign;
// Make sure that the stack protector comes before the local variables on the
// stack.
@@ -262,7 +260,7 @@ void LocalStackSlotPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Remember how big this blob of stack space is
MFI.setLocalFrameSize(Offset);
- MFI.setLocalFrameMaxAlign(assumeAligned(MaxAlign));
+ MFI.setLocalFrameMaxAlign(MaxAlign);
}
static inline bool
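Switching MaxAlign to the Align type lets the pass drop the hand-rolled rounding; for the power-of-two alignments Align guarantees, alignTo() computes exactly the value the old expression did. A tiny self-contained check (assumes a non-negative offset, as in this pass):

#include "llvm/Support/Alignment.h"
#include <cassert>
#include <cstdint>

void checkRounding(int64_t Offset) {
  const llvm::Align Alignment(16);
  int64_t Old = (Offset + 16 - 1) / 16 * 16;      // formula that was removed
  int64_t New = llvm::alignTo(Offset, Alignment); // replacement helper
  assert(Old == New && "alignTo matches the manual rounding");
}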
diff --git a/llvm/lib/CodeGen/LowLevelType.cpp b/llvm/lib/CodeGen/LowLevelType.cpp
index 40dfa696a2b9..33752a1f9230 100644
--- a/llvm/lib/CodeGen/LowLevelType.cpp
+++ b/llvm/lib/CodeGen/LowLevelType.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
LLT llvm::getLLTForType(Type &Ty, const DataLayout &DL) {
if (auto VTy = dyn_cast<VectorType>(&Ty)) {
- auto NumElements = VTy->getNumElements();
+ auto NumElements = cast<FixedVectorType>(VTy)->getNumElements();
LLT ScalarTy = getLLTForType(*VTy->getElementType(), DL);
if (NumElements == 1)
return ScalarTy;
diff --git a/llvm/lib/CodeGen/LowerEmuTLS.cpp b/llvm/lib/CodeGen/LowerEmuTLS.cpp
index 529d478756d4..36b863178b47 100644
--- a/llvm/lib/CodeGen/LowerEmuTLS.cpp
+++ b/llvm/lib/CodeGen/LowerEmuTLS.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/Module.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -127,12 +128,7 @@ bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
return true;
Type *GVType = GV->getValueType();
- unsigned GVAlignment = GV->getAlignment();
- if (!GVAlignment) {
- // When LLVM IL declares a variable without alignment, use
- // the ABI default alignment for the type.
- GVAlignment = DL.getABITypeAlignment(GVType);
- }
+ Align GVAlignment = DL.getValueOrABITypeAlignment(GV->getAlign(), GVType);
// Define "__emutls_t.*" if there is InitValue
GlobalVariable *EmuTlsTmplVar = nullptr;
@@ -143,21 +139,20 @@ bool LowerEmuTLS::addEmuTlsVar(Module &M, const GlobalVariable *GV) {
assert(EmuTlsTmplVar && "Failed to create emualted TLS initializer");
EmuTlsTmplVar->setConstant(true);
EmuTlsTmplVar->setInitializer(const_cast<Constant*>(InitValue));
- EmuTlsTmplVar->setAlignment(Align(GVAlignment));
+ EmuTlsTmplVar->setAlignment(GVAlignment);
copyLinkageVisibility(M, GV, EmuTlsTmplVar);
}
// Define "__emutls_v.*" with initializer and alignment.
Constant *ElementValues[4] = {
ConstantInt::get(WordType, DL.getTypeStoreSize(GVType)),
- ConstantInt::get(WordType, GVAlignment),
- NullPtr, EmuTlsTmplVar ? EmuTlsTmplVar : NullPtr
- };
+ ConstantInt::get(WordType, GVAlignment.value()), NullPtr,
+ EmuTlsTmplVar ? EmuTlsTmplVar : NullPtr};
ArrayRef<Constant*> ElementValueArray(ElementValues, 4);
EmuTlsVar->setInitializer(
ConstantStruct::get(EmuTlsVarType, ElementValueArray));
- Align MaxAlignment(std::max(DL.getABITypeAlignment(WordType),
- DL.getABITypeAlignment(VoidPtrType)));
+ Align MaxAlignment =
+ std::max(DL.getABITypeAlign(WordType), DL.getABITypeAlign(VoidPtrType));
EmuTlsVar->setAlignment(MaxAlignment);
return true;
}
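getValueOrABITypeAlignment collapses the removed two-step query into one call: take the explicit alignment if the IR carries one, otherwise the ABI default for the type. A roughly equivalent hand-written form (a sketch, not part of this patch):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"

llvm::Align getEmuTlsVarAlign(const llvm::DataLayout &DL,
                              const llvm::GlobalVariable &GV) {
  if (llvm::MaybeAlign A = GV.getAlign())
    return *A;                                  // explicit `align N` on the IR
  return DL.getABITypeAlign(GV.getValueType()); // ABI default for the type
}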
diff --git a/llvm/lib/CodeGen/MBFIWrapper.cpp b/llvm/lib/CodeGen/MBFIWrapper.cpp
new file mode 100644
index 000000000000..5110f75ebb42
--- /dev/null
+++ b/llvm/lib/CodeGen/MBFIWrapper.cpp
@@ -0,0 +1,49 @@
+//===- MBFIWrapper.cpp - MachineBlockFrequencyInfo wrapper ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class keeps track of branch frequencies of newly created blocks and
+// tail-merged blocks. Used by TailDuplication and MachineBlockPlacement.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MBFIWrapper.h"
+#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
+
+using namespace llvm;
+
+BlockFrequency MBFIWrapper::getBlockFreq(const MachineBasicBlock *MBB) const {
+ auto I = MergedBBFreq.find(MBB);
+
+ if (I != MergedBBFreq.end())
+ return I->second;
+
+ return MBFI.getBlockFreq(MBB);
+}
+
+void MBFIWrapper::setBlockFreq(const MachineBasicBlock *MBB,
+ BlockFrequency F) {
+ MergedBBFreq[MBB] = F;
+}
+
+raw_ostream & MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const MachineBasicBlock *MBB) const {
+ return MBFI.printBlockFreq(OS, getBlockFreq(MBB));
+}
+
+raw_ostream & MBFIWrapper::printBlockFreq(raw_ostream &OS,
+ const BlockFrequency Freq) const {
+ return MBFI.printBlockFreq(OS, Freq);
+}
+
+void MBFIWrapper::view(const Twine &Name, bool isSimple) {
+ MBFI.view(Name, isSimple);
+}
+
+uint64_t MBFIWrapper::getEntryFreq() const {
+ return MBFI.getEntryFreq();
+}
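MBFIWrapper moves out of BranchFolding into its own translation unit. The point of the class is that layout and tail-merging passes can record frequencies for blocks they create or merge without mutating the underlying analysis; queries for untouched blocks fall through to MBFI. A short usage sketch (the block names are placeholders):

#include "llvm/CodeGen/MBFIWrapper.h"
#include "llvm/Support/Debug.h"

void noteMergedBlock(const llvm::MachineBlockFrequencyInfo &MBFI,
                     const llvm::MachineBasicBlock *PredBB,
                     const llvm::MachineBasicBlock *TailBB,
                     const llvm::MachineBasicBlock *MergedBB) {
  llvm::MBFIWrapper FreqInfo(MBFI);
  llvm::BlockFrequency Merged =
      FreqInfo.getBlockFreq(PredBB) + FreqInfo.getBlockFreq(TailBB);
  FreqInfo.setBlockFreq(MergedBB, Merged); // remembered locally, MBFI untouched
  FreqInfo.printBlockFreq(llvm::dbgs(), MergedBB) << "\n";
}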
diff --git a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
index 5ef907b88315..9eddb8626f60 100644
--- a/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
+++ b/llvm/lib/CodeGen/MIRCanonicalizerPass.cpp
@@ -107,7 +107,7 @@ rescheduleLexographically(std::vector<MachineInstr *> instructions,
II->print(OS);
OS.flush();
- // Trim the assignment, or start from the begining in the case of a store.
+ // Trim the assignment, or start from the beginning in the case of a store.
const size_t i = S.find("=");
StringInstrMap.push_back({(i == std::string::npos) ? S : S.substr(i), II});
}
@@ -138,7 +138,7 @@ static bool rescheduleCanonically(unsigned &PseudoIdempotentInstCount,
bool Changed = false;
- // Calculates the distance of MI from the begining of its parent BB.
+ // Calculates the distance of MI from the beginning of its parent BB.
auto getInstrIdx = [](const MachineInstr &MI) {
unsigned i = 0;
for (auto &CurMI : *MI.getParent()) {
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.cpp b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
index 5976f5da1569..98af46dc4872 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.cpp
@@ -11,12 +11,9 @@
//===----------------------------------------------------------------------===//
#include "MILexer.h"
-#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
#include <algorithm>
#include <cassert>
@@ -104,6 +101,20 @@ static Cursor skipComment(Cursor C) {
return C;
}
+/// Machine operands can have comments, enclosed between /* and */.
+/// This eats up all tokens, including /* and */.
+static Cursor skipMachineOperandComment(Cursor C) {
+ if (C.peek() != '/' || C.peek(1) != '*')
+ return C;
+
+ while (C.peek() != '*' || C.peek(1) != '/')
+ C.advance();
+
+ C.advance();
+ C.advance();
+ return C;
+}
+
/// Return true if the given character satisfies the following regular
/// expression: [-a-zA-Z$._0-9]
static bool isIdentifierChar(char C) {
@@ -246,6 +257,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("liveout", MIToken::kw_liveout)
.Case("address-taken", MIToken::kw_address_taken)
.Case("landing-pad", MIToken::kw_landing_pad)
+ .Case("ehfunclet-entry", MIToken::kw_ehfunclet_entry)
.Case("liveins", MIToken::kw_liveins)
.Case("successors", MIToken::kw_successors)
.Case("floatpred", MIToken::kw_floatpred)
@@ -254,6 +266,7 @@ static MIToken::TokenKind getIdentifierKind(StringRef Identifier) {
.Case("pre-instr-symbol", MIToken::kw_pre_instr_symbol)
.Case("post-instr-symbol", MIToken::kw_post_instr_symbol)
.Case("heap-alloc-marker", MIToken::kw_heap_alloc_marker)
+ .Case("bbsections", MIToken::kw_bbsections)
.Case("unknown-size", MIToken::kw_unknown_size)
.Default(MIToken::Identifier);
}
@@ -518,7 +531,7 @@ static Cursor maybeLexMCSymbol(Cursor C, MIToken &Token,
}
static bool isValidHexFloatingPointPrefix(char C) {
- return C == 'H' || C == 'K' || C == 'L' || C == 'M';
+ return C == 'H' || C == 'K' || C == 'L' || C == 'M' || C == 'R';
}
static Cursor lexFloatingPointLiteral(Cursor Range, Cursor C, MIToken &Token) {
@@ -691,6 +704,8 @@ StringRef llvm::lexMIToken(StringRef Source, MIToken &Token,
return C.remaining();
}
+ C = skipMachineOperandComment(C);
+
if (Cursor R = maybeLexMachineBasicBlock(C, Token, ErrorCallback))
return R.remaining();
if (Cursor R = maybeLexIdentifier(C, Token))
diff --git a/llvm/lib/CodeGen/MIRParser/MILexer.h b/llvm/lib/CodeGen/MIRParser/MILexer.h
index aaffe4a4c91b..ef16da94d21b 100644
--- a/llvm/lib/CodeGen/MIRParser/MILexer.h
+++ b/llvm/lib/CodeGen/MIRParser/MILexer.h
@@ -15,7 +15,6 @@
#define LLVM_LIB_CODEGEN_MIRPARSER_MILEXER_H
#include "llvm/ADT/APSInt.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringRef.h"
#include <string>
@@ -114,6 +113,7 @@ struct MIToken {
kw_liveout,
kw_address_taken,
kw_landing_pad,
+ kw_ehfunclet_entry,
kw_liveins,
kw_successors,
kw_floatpred,
@@ -122,6 +122,7 @@ struct MIToken {
kw_pre_instr_symbol,
kw_post_instr_symbol,
kw_heap_alloc_marker,
+ kw_bbsections,
kw_unknown_size,
// Named metadata keywords
diff --git a/llvm/lib/CodeGen/MIRParser/MIParser.cpp b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
index 076ca943788b..ded31cd08fb5 100644
--- a/llvm/lib/CodeGen/MIRParser/MIParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIParser.cpp
@@ -122,7 +122,7 @@ void PerTargetMIParsingState::initNames2Regs() {
}
bool PerTargetMIParsingState::getRegisterByName(StringRef RegName,
- unsigned &Reg) {
+ Register &Reg) {
initNames2Regs();
auto RegInfo = Names2Regs.find(RegName);
if (RegInfo == Names2Regs.end())
@@ -321,7 +321,7 @@ PerFunctionMIParsingState::PerFunctionMIParsingState(MachineFunction &MF,
: MF(MF), SM(&SM), IRSlots(IRSlots), Target(T) {
}
-VRegInfo &PerFunctionMIParsingState::getVRegInfo(unsigned Num) {
+VRegInfo &PerFunctionMIParsingState::getVRegInfo(Register Num) {
auto I = VRegInfos.insert(std::make_pair(Num, nullptr));
if (I.second) {
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -426,9 +426,9 @@ public:
bool parseBasicBlocks();
bool parse(MachineInstr *&MI);
bool parseStandaloneMBB(MachineBasicBlock *&MBB);
- bool parseStandaloneNamedRegister(unsigned &Reg);
+ bool parseStandaloneNamedRegister(Register &Reg);
bool parseStandaloneVirtualRegister(VRegInfo *&Info);
- bool parseStandaloneRegister(unsigned &Reg);
+ bool parseStandaloneRegister(Register &Reg);
bool parseStandaloneStackObject(int &FI);
bool parseStandaloneMDNode(MDNode *&Node);
@@ -439,10 +439,10 @@ public:
bool parseBasicBlockLiveins(MachineBasicBlock &MBB);
bool parseBasicBlockSuccessors(MachineBasicBlock &MBB);
- bool parseNamedRegister(unsigned &Reg);
+ bool parseNamedRegister(Register &Reg);
bool parseVirtualRegister(VRegInfo *&Info);
bool parseNamedVirtualRegister(VRegInfo *&Info);
- bool parseRegister(unsigned &Reg, VRegInfo *&VRegInfo);
+ bool parseRegister(Register &Reg, VRegInfo *&VRegInfo);
bool parseRegisterFlag(unsigned &Flags);
bool parseRegisterClassOrBank(VRegInfo &RegInfo);
bool parseSubRegisterIndex(unsigned &SubReg);
@@ -474,7 +474,7 @@ public:
bool parseDILocation(MDNode *&Expr);
bool parseMetadataOperand(MachineOperand &Dest);
bool parseCFIOffset(int &Offset);
- bool parseCFIRegister(unsigned &Reg);
+ bool parseCFIRegister(Register &Reg);
bool parseCFIEscapeValues(std::string& Values);
bool parseCFIOperand(MachineOperand &Dest);
bool parseIRBlock(BasicBlock *&BB, const Function &F);
@@ -495,6 +495,7 @@ public:
bool parseOffset(int64_t &Offset);
bool parseAlignment(unsigned &Alignment);
bool parseAddrspace(unsigned &Addrspace);
+ bool parseSectionID(Optional<MBBSectionID> &SID);
bool parseOperandsOffset(MachineOperand &Op);
bool parseIRValue(const Value *&V);
bool parseMemoryOperandFlag(MachineMemOperand::Flags &Flags);
@@ -562,7 +563,7 @@ MIParser::MIParser(PerFunctionMIParsingState &PFS, SMDiagnostic &Error,
void MIParser::lex(unsigned SkipChar) {
CurrentSource = lexMIToken(
- CurrentSource.data() + SkipChar, Token,
+ CurrentSource.slice(SkipChar, StringRef::npos), Token,
[this](StringRef::iterator Loc, const Twine &Msg) { error(Loc, Msg); });
}
@@ -619,6 +620,28 @@ bool MIParser::consumeIfPresent(MIToken::TokenKind TokenKind) {
return true;
}
+// Parse Machine Basic Block Section ID.
+bool MIParser::parseSectionID(Optional<MBBSectionID> &SID) {
+ assert(Token.is(MIToken::kw_bbsections));
+ lex();
+ if (Token.is(MIToken::IntegerLiteral)) {
+ unsigned Value = 0;
+ if (getUnsigned(Value))
+ return error("Unknown Section ID");
+ SID = MBBSectionID{Value};
+ } else {
+ const StringRef &S = Token.stringValue();
+ if (S == "Exception")
+ SID = MBBSectionID::ExceptionSectionID;
+ else if (S == "Cold")
+ SID = MBBSectionID::ColdSectionID;
+ else
+ return error("Unknown Section ID");
+ }
+ lex();
+ return false;
+}
+
bool MIParser::parseBasicBlockDefinition(
DenseMap<unsigned, MachineBasicBlock *> &MBBSlots) {
assert(Token.is(MIToken::MachineBasicBlockLabel));
@@ -630,6 +653,8 @@ bool MIParser::parseBasicBlockDefinition(
lex();
bool HasAddressTaken = false;
bool IsLandingPad = false;
+ bool IsEHFuncletEntry = false;
+ Optional<MBBSectionID> SectionID;
unsigned Alignment = 0;
BasicBlock *BB = nullptr;
if (consumeIfPresent(MIToken::lparen)) {
@@ -644,6 +669,10 @@ bool MIParser::parseBasicBlockDefinition(
IsLandingPad = true;
lex();
break;
+ case MIToken::kw_ehfunclet_entry:
+ IsEHFuncletEntry = true;
+ lex();
+ break;
case MIToken::kw_align:
if (parseAlignment(Alignment))
return true;
@@ -654,6 +683,10 @@ bool MIParser::parseBasicBlockDefinition(
return true;
lex();
break;
+ case MIToken::kw_bbsections:
+ if (parseSectionID(SectionID))
+ return true;
+ break;
default:
break;
}
@@ -683,6 +716,11 @@ bool MIParser::parseBasicBlockDefinition(
if (HasAddressTaken)
MBB->setHasAddressTaken();
MBB->setIsEHPad(IsLandingPad);
+ MBB->setIsEHFuncletEntry(IsEHFuncletEntry);
+ if (SectionID.hasValue()) {
+ MBB->setSectionID(SectionID.getValue());
+ MF.setBBSectionsType(BasicBlockSection::List);
+ }
return false;
}
@@ -740,7 +778,7 @@ bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) {
do {
if (Token.isNot(MIToken::NamedRegister))
return error("expected a named register");
- unsigned Reg = 0;
+ Register Reg;
if (parseNamedRegister(Reg))
return true;
lex();
@@ -750,10 +788,10 @@ bool MIParser::parseBasicBlockLiveins(MachineBasicBlock &MBB) {
if (Token.isNot(MIToken::IntegerLiteral) &&
Token.isNot(MIToken::HexLiteral))
return error("expected a lane mask");
- static_assert(sizeof(LaneBitmask::Type) == sizeof(unsigned),
+ static_assert(sizeof(LaneBitmask::Type) == sizeof(uint64_t),
"Use correct get-function for lane mask");
LaneBitmask::Type V;
- if (getUnsigned(V))
+ if (getUint64(V))
return error("invalid lane mask value");
Mask = LaneBitmask(V);
lex();
@@ -1048,7 +1086,7 @@ bool MIParser::parseStandaloneMBB(MachineBasicBlock *&MBB) {
return false;
}
-bool MIParser::parseStandaloneNamedRegister(unsigned &Reg) {
+bool MIParser::parseStandaloneNamedRegister(Register &Reg) {
lex();
if (Token.isNot(MIToken::NamedRegister))
return error("expected a named register");
@@ -1072,7 +1110,7 @@ bool MIParser::parseStandaloneVirtualRegister(VRegInfo *&Info) {
return false;
}
-bool MIParser::parseStandaloneRegister(unsigned &Reg) {
+bool MIParser::parseStandaloneRegister(Register &Reg) {
lex();
if (Token.isNot(MIToken::NamedRegister) &&
Token.isNot(MIToken::VirtualRegister))
@@ -1123,7 +1161,7 @@ static const char *printImplicitRegisterFlag(const MachineOperand &MO) {
}
static std::string getRegisterName(const TargetRegisterInfo *TRI,
- unsigned Reg) {
+ Register Reg) {
assert(Register::isPhysicalRegister(Reg) && "expected phys reg");
return StringRef(TRI->getName(Reg)).lower();
}
@@ -1223,7 +1261,7 @@ bool MIParser::parseInstruction(unsigned &OpCode, unsigned &Flags) {
return false;
}
-bool MIParser::parseNamedRegister(unsigned &Reg) {
+bool MIParser::parseNamedRegister(Register &Reg) {
assert(Token.is(MIToken::NamedRegister) && "Needs NamedRegister token");
StringRef Name = Token.stringValue();
if (PFS.Target.getRegisterByName(Name, Reg))
@@ -1251,7 +1289,7 @@ bool MIParser::parseVirtualRegister(VRegInfo *&Info) {
return false;
}
-bool MIParser::parseRegister(unsigned &Reg, VRegInfo *&Info) {
+bool MIParser::parseRegister(Register &Reg, VRegInfo *&Info) {
switch (Token.kind()) {
case MIToken::underscore:
Reg = 0;
@@ -1445,7 +1483,7 @@ bool MIParser::parseRegisterOperand(MachineOperand &Dest,
}
if (!Token.isRegister())
return error("expected a register after register flags");
- unsigned Reg;
+ Register Reg;
VRegInfo *RegInfo;
if (parseRegister(Reg, RegInfo))
return true;
@@ -2138,10 +2176,10 @@ bool MIParser::parseCFIOffset(int &Offset) {
return false;
}
-bool MIParser::parseCFIRegister(unsigned &Reg) {
+bool MIParser::parseCFIRegister(Register &Reg) {
if (Token.isNot(MIToken::NamedRegister))
return error("expected a cfi register");
- unsigned LLVMReg;
+ Register LLVMReg;
if (parseNamedRegister(LLVMReg))
return true;
const auto *TRI = MF.getSubtarget().getRegisterInfo();
@@ -2173,7 +2211,7 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) {
auto Kind = Token.kind();
lex();
int Offset;
- unsigned Reg;
+ Register Reg;
unsigned CFIIndex;
switch (Kind) {
case MIToken::kw_cfi_same_value:
@@ -2204,9 +2242,8 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) {
case MIToken::kw_cfi_def_cfa_offset:
if (parseCFIOffset(Offset))
return true;
- // NB: MCCFIInstruction::createDefCfaOffset negates the offset.
- CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
+ CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Offset));
break;
case MIToken::kw_cfi_adjust_cfa_offset:
if (parseCFIOffset(Offset))
@@ -2218,9 +2255,8 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) {
if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
parseCFIOffset(Offset))
return true;
- // NB: MCCFIInstruction::createDefCfa negates the offset.
CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfa(nullptr, Reg, -Offset));
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfa(nullptr, Reg, Offset));
break;
case MIToken::kw_cfi_remember_state:
CFIIndex = MF.addFrameInst(MCCFIInstruction::createRememberState(nullptr));
@@ -2239,7 +2275,7 @@ bool MIParser::parseCFIOperand(MachineOperand &Dest) {
CFIIndex = MF.addFrameInst(MCCFIInstruction::createUndefined(nullptr, Reg));
break;
case MIToken::kw_cfi_register: {
- unsigned Reg2;
+ Register Reg2;
if (parseCFIRegister(Reg) || expectAndConsume(MIToken::comma) ||
parseCFIRegister(Reg2))
return true;
@@ -2334,7 +2370,7 @@ bool MIParser::parseIntrinsicOperand(MachineOperand &Dest) {
if (Token.isNot(MIToken::NamedGlobalValue))
return error("expected syntax intrinsic(@llvm.whatever)");
- std::string Name = Token.stringValue();
+ std::string Name = std::string(Token.stringValue());
lex();
if (expectAndConsume(MIToken::rparen))
@@ -2469,7 +2505,7 @@ bool MIParser::parseCustomRegisterMaskOperand(MachineOperand &Dest) {
while (true) {
if (Token.isNot(MIToken::NamedRegister))
return error("expected a named register");
- unsigned Reg;
+ Register Reg;
if (parseNamedRegister(Reg))
return true;
lex();
@@ -2495,7 +2531,7 @@ bool MIParser::parseLiveoutRegisterMaskOperand(MachineOperand &Dest) {
while (true) {
if (Token.isNot(MIToken::NamedRegister))
return error("expected a named register");
- unsigned Reg;
+ Register Reg;
if (parseNamedRegister(Reg))
return true;
lex();
@@ -3060,8 +3096,8 @@ bool MIParser::parseMachineMemoryOperand(MachineMemOperand *&Dest) {
}
if (expectAndConsume(MIToken::rparen))
return true;
- Dest = MF.getMachineMemOperand(Ptr, Flags, Size, BaseAlignment, AAInfo, Range,
- SSID, Order, FailureOrder);
+ Dest = MF.getMachineMemOperand(Ptr, Flags, Size, Align(BaseAlignment), AAInfo,
+ Range, SSID, Order, FailureOrder);
return false;
}
@@ -3149,7 +3185,7 @@ MCSymbol *MIParser::getOrCreateMCSymbol(StringRef Name) {
bool MIParser::parseStringConstant(std::string &Result) {
if (Token.isNot(MIToken::StringConstant))
return error("expected string constant");
- Result = Token.stringValue();
+ Result = std::string(Token.stringValue());
lex();
return false;
}
@@ -3172,13 +3208,13 @@ bool llvm::parseMBBReference(PerFunctionMIParsingState &PFS,
}
bool llvm::parseRegisterReference(PerFunctionMIParsingState &PFS,
- unsigned &Reg, StringRef Src,
+ Register &Reg, StringRef Src,
SMDiagnostic &Error) {
return MIParser(PFS, Error, Src).parseStandaloneRegister(Reg);
}
bool llvm::parseNamedRegisterReference(PerFunctionMIParsingState &PFS,
- unsigned &Reg, StringRef Src,
+ Register &Reg, StringRef Src,
SMDiagnostic &Error) {
return MIParser(PFS, Error, Src).parseStandaloneNamedRegister(Reg);
}
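Besides the Register and uint64_t plumbing, the parser learns two new block attributes, ehfunclet-entry and bbsections, so a header such as bb.1 (bbsections Cold, ehfunclet-entry): now round-trips. What a successfully parsed header of that form ends up doing to the block, condensed from parseBasicBlockDefinition above:

// Effect of parsing "bb.1 (bbsections Cold, ehfunclet-entry):" (sketch).
MBB->setIsEHFuncletEntry(true);
MBB->setSectionID(llvm::MBBSectionID::ColdSectionID);
// Any explicit section ID switches the whole function to list-based sections.
MF.setBBSectionsType(llvm::BasicBlockSection::List);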
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 10157c746b46..2e0b0e745e9e 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -93,7 +93,8 @@ public:
/// file.
///
/// Return null if an error occurred.
- std::unique_ptr<Module> parseIRModule();
+ std::unique_ptr<Module>
+ parseIRModule(DataLayoutCallbackTy DataLayoutCallback);
/// Create an empty function with the given name.
Function *createDummyFunction(StringRef Name, Module &M);
@@ -216,13 +217,17 @@ void MIRParserImpl::reportDiagnostic(const SMDiagnostic &Diag) {
Context.diagnose(DiagnosticInfoMIRParser(Kind, Diag));
}
-std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
+std::unique_ptr<Module>
+MIRParserImpl::parseIRModule(DataLayoutCallbackTy DataLayoutCallback) {
if (!In.setCurrentDocument()) {
if (In.error())
return nullptr;
// Create an empty module when the MIR file is empty.
NoMIRDocuments = true;
- return std::make_unique<Module>(Filename, Context);
+ auto M = std::make_unique<Module>(Filename, Context);
+ if (auto LayoutOverride = DataLayoutCallback(M->getTargetTriple()))
+ M->setDataLayout(*LayoutOverride);
+ return M;
}
std::unique_ptr<Module> M;
@@ -232,7 +237,7 @@ std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
dyn_cast_or_null<yaml::BlockScalarNode>(In.getCurrentNode())) {
SMDiagnostic Error;
M = parseAssembly(MemoryBufferRef(BSN->getValue(), Filename), Error,
- Context, &IRSlots, /*UpgradeDebugInfo=*/false);
+ Context, &IRSlots, DataLayoutCallback);
if (!M) {
reportDiagnostic(diagFromBlockStringDiag(Error, BSN->getSourceRange()));
return nullptr;
@@ -243,6 +248,8 @@ std::unique_ptr<Module> MIRParserImpl::parseIRModule() {
} else {
// Create an new, empty module.
M = std::make_unique<Module>(Filename, Context);
+ if (auto LayoutOverride = DataLayoutCallback(M->getTargetTriple()))
+ M->setDataLayout(*LayoutOverride);
NoLLVMIR = true;
}
return M;
@@ -375,17 +382,17 @@ bool MIRParserImpl::initializeCallSiteInfo(
" is not a call instruction");
MachineFunction::CallSiteInfo CSInfo;
for (auto ArgRegPair : YamlCSInfo.ArgForwardingRegs) {
- unsigned Reg = 0;
+ Register Reg;
if (parseNamedRegisterReference(PFS, Reg, ArgRegPair.Reg.Value, Error))
return error(Error, ArgRegPair.Reg.SourceRange);
CSInfo.emplace_back(Reg, ArgRegPair.ArgNo);
}
- if (TM.Options.EnableDebugEntryValues)
+ if (TM.Options.EmitCallSiteInfo)
MF.addCallArgsForwardingRegs(&*CallI, std::move(CSInfo));
}
- if (YamlMF.CallSitesInfo.size() && !TM.Options.EnableDebugEntryValues)
+ if (YamlMF.CallSitesInfo.size() && !TM.Options.EmitCallSiteInfo)
return error(Twine("Call site info provided but not used"));
return false;
}
@@ -401,8 +408,7 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
Target.reset(new PerTargetMIParsingState(MF.getSubtarget()));
}
- if (YamlMF.Alignment)
- MF.setAlignment(Align(YamlMF.Alignment));
+ MF.setAlignment(YamlMF.Alignment.valueOrOne());
MF.setExposesReturnsTwice(YamlMF.ExposesReturnsTwice);
MF.setHasWinCFI(YamlMF.HasWinCFI);
@@ -438,6 +444,14 @@ MIRParserImpl::initializeMachineFunction(const yaml::MachineFunction &YamlMF,
diagFromBlockStringDiag(Error, YamlMF.Body.Value.SourceRange));
return true;
}
+ // Check Basic Block Section Flags.
+ if (MF.getTarget().getBBSectionsType() == BasicBlockSection::Labels) {
+ MF.createBBLabels();
+ MF.setBBSectionsType(BasicBlockSection::Labels);
+ } else if (MF.hasBBSections()) {
+ MF.createBBLabels();
+ MF.assignBeginEndSections();
+ }
PFS.SM = &SM;
// Initialize the frame information after creating all the MBBs so that the
@@ -550,10 +564,10 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
// Parse the liveins.
for (const auto &LiveIn : YamlMF.LiveIns) {
- unsigned Reg = 0;
+ Register Reg;
if (parseNamedRegisterReference(PFS, Reg, LiveIn.Register.Value, Error))
return error(Error, LiveIn.Register.SourceRange);
- unsigned VReg = 0;
+ Register VReg;
if (!LiveIn.VirtualRegister.Value.empty()) {
VRegInfo *Info;
if (parseVirtualRegisterReference(PFS, Info, LiveIn.VirtualRegister.Value,
@@ -569,7 +583,7 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
if (YamlMF.CalleeSavedRegisters) {
SmallVector<MCPhysReg, 16> CalleeSavedRegisters;
for (const auto &RegSource : YamlMF.CalleeSavedRegisters.getValue()) {
- unsigned Reg = 0;
+ Register Reg;
if (parseNamedRegisterReference(PFS, Reg, RegSource.Value, Error))
return error(Error, RegSource.SourceRange);
CalleeSavedRegisters.push_back(Reg);
@@ -587,7 +601,7 @@ bool MIRParserImpl::setupRegisterInfo(const PerFunctionMIParsingState &PFS,
bool Error = false;
// Create VRegs
auto populateVRegInfo = [&] (const VRegInfo &Info, Twine Name) {
- unsigned Reg = Info.VReg;
+ Register Reg = Info.VReg;
switch (Info.Kind) {
case VRegInfo::UNKNOWN:
error(Twine("Cannot determine class/bank of virtual register ") +
@@ -646,7 +660,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
MFI.setStackSize(YamlMFI.StackSize);
MFI.setOffsetAdjustment(YamlMFI.OffsetAdjustment);
if (YamlMFI.MaxAlignment)
- MFI.ensureMaxAlignment(YamlMFI.MaxAlignment);
+ MFI.ensureMaxAlignment(Align(YamlMFI.MaxAlignment));
MFI.setAdjustsStack(YamlMFI.AdjustsStack);
MFI.setHasCalls(YamlMFI.HasCalls);
if (YamlMFI.MaxCallFrameSize != ~0u)
@@ -683,7 +697,7 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
return error(Object.ID.SourceRange.Start,
Twine("StackID is not supported by target"));
MFI.setStackID(ObjectIdx, Object.StackID);
- MFI.setObjectAlignment(ObjectIdx, Object.Alignment);
+ MFI.setObjectAlignment(ObjectIdx, Object.Alignment.valueOrOne());
if (!PFS.FixedStackObjectSlots.insert(std::make_pair(Object.ID.Value,
ObjectIdx))
.second)
@@ -715,10 +729,11 @@ bool MIRParserImpl::initializeFrameInfo(PerFunctionMIParsingState &PFS,
return error(Object.ID.SourceRange.Start,
Twine("StackID is not supported by target"));
if (Object.Type == yaml::MachineStackObject::VariableSized)
- ObjectIdx = MFI.CreateVariableSizedObject(Object.Alignment, Alloca);
+ ObjectIdx =
+ MFI.CreateVariableSizedObject(Object.Alignment.valueOrOne(), Alloca);
else
ObjectIdx = MFI.CreateStackObject(
- Object.Size, Object.Alignment,
+ Object.Size, Object.Alignment.valueOrOne(),
Object.Type == yaml::MachineStackObject::SpillSlot, Alloca,
Object.StackID);
MFI.setObjectOffset(ObjectIdx, Object.Offset);
@@ -757,7 +772,7 @@ bool MIRParserImpl::parseCalleeSavedRegister(PerFunctionMIParsingState &PFS,
const yaml::StringValue &RegisterSource, bool IsRestored, int FrameIdx) {
if (RegisterSource.Value.empty())
return false;
- unsigned Reg = 0;
+ Register Reg;
SMDiagnostic Error;
if (parseNamedRegisterReference(PFS, Reg, RegisterSource.Value, Error))
return error(Error, RegisterSource.SourceRange);
@@ -830,10 +845,9 @@ bool MIRParserImpl::initializeConstantPool(PerFunctionMIParsingState &PFS,
parseConstantValue(YamlConstant.Value.Value, Error, M));
if (!Value)
return error(Error, YamlConstant.Value.SourceRange);
- unsigned Alignment =
- YamlConstant.Alignment
- ? YamlConstant.Alignment
- : M.getDataLayout().getPrefTypeAlignment(Value->getType());
+ const Align PrefTypeAlign =
+ M.getDataLayout().getPrefTypeAlign(Value->getType());
+ const Align Alignment = YamlConstant.Alignment.getValueOr(PrefTypeAlign);
unsigned Index = ConstantPool.getConstantPoolIndex(Value, Alignment);
if (!ConstantPoolSlots.insert(std::make_pair(YamlConstant.ID.Value, Index))
.second)
@@ -926,8 +940,9 @@ MIRParser::MIRParser(std::unique_ptr<MIRParserImpl> Impl)
MIRParser::~MIRParser() {}
-std::unique_ptr<Module> MIRParser::parseIRModule() {
- return Impl->parseIRModule();
+std::unique_ptr<Module>
+MIRParser::parseIRModule(DataLayoutCallbackTy DataLayoutCallback) {
+ return Impl->parseIRModule(DataLayoutCallback);
}
bool MIRParser::parseMachineFunctions(Module &M, MachineModuleInfo &MMI) {
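parseIRModule now threads a datalayout callback through to the embedded LLVM-IR parser and also applies it to the empty modules created for IR-less MIR files, replacing the old UpgradeDebugInfo=false argument. A caller-side sketch; the callback type is assumed to be a callable taking the module's target triple and returning an optional datalayout string, and the string below is purely illustrative:

#include "llvm/CodeGen/MIRParser/MIRParser.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/SourceMgr.h"

std::unique_ptr<llvm::Module> loadMIRWithLayout(llvm::LLVMContext &Ctx) {
  llvm::SMDiagnostic Err;
  std::unique_ptr<llvm::MIRParser> Parser =
      llvm::createMIRParserFromFile("input.mir", Err, Ctx);
  if (!Parser)
    return nullptr;
  return Parser->parseIRModule(
      [](llvm::StringRef TargetTriple) -> llvm::Optional<std::string> {
        return std::string("e-m:e-i64:64-n32:64"); // override for every module
      });
}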
diff --git a/llvm/lib/CodeGen/MIRPrinter.cpp b/llvm/lib/CodeGen/MIRPrinter.cpp
index e8cd3d60ccb1..fa23df6288e9 100644
--- a/llvm/lib/CodeGen/MIRPrinter.cpp
+++ b/llvm/lib/CodeGen/MIRPrinter.cpp
@@ -79,6 +79,9 @@ static cl::opt<bool> SimplifyMIR(
"simplify-mir", cl::Hidden,
cl::desc("Leave out unnecessary information when printing MIR"));
+static cl::opt<bool> PrintLocations("mir-debug-loc", cl::Hidden, cl::init(true),
+ cl::desc("Print MIR debug-locations"));
+
namespace {
/// This structure describes how to print out stack object references.
@@ -162,8 +165,9 @@ public:
void print(const MachineInstr &MI);
void printStackObjectReference(int FrameIndex);
void print(const MachineInstr &MI, unsigned OpIdx,
- const TargetRegisterInfo *TRI, bool ShouldPrintRegisterTies,
- LLT TypeToPrint, bool PrintDef = true);
+ const TargetRegisterInfo *TRI, const TargetInstrInfo *TII,
+ bool ShouldPrintRegisterTies, LLT TypeToPrint,
+ bool PrintDef = true);
};
} // end namespace llvm
@@ -197,7 +201,7 @@ void MIRPrinter::print(const MachineFunction &MF) {
yaml::MachineFunction YamlMF;
YamlMF.Name = MF.getName();
- YamlMF.Alignment = MF.getAlignment().value();
+ YamlMF.Alignment = MF.getAlignment();
YamlMF.ExposesReturnsTwice = MF.exposesReturnsTwice();
YamlMF.HasWinCFI = MF.hasWinCFI();
@@ -333,7 +337,7 @@ void MIRPrinter::convert(ModuleSlotTracker &MST,
YamlMFI.HasPatchPoint = MFI.hasPatchPoint();
YamlMFI.StackSize = MFI.getStackSize();
YamlMFI.OffsetAdjustment = MFI.getOffsetAdjustment();
- YamlMFI.MaxAlignment = MFI.getMaxAlignment();
+ YamlMFI.MaxAlignment = MFI.getMaxAlign().value();
YamlMFI.AdjustsStack = MFI.adjustsStack();
YamlMFI.HasCalls = MFI.hasCalls();
YamlMFI.MaxCallFrameSize = MFI.isMaxCallFrameSizeComputed()
@@ -372,7 +376,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF,
: yaml::FixedMachineStackObject::DefaultType;
YamlObject.Offset = MFI.getObjectOffset(I);
YamlObject.Size = MFI.getObjectSize(I);
- YamlObject.Alignment = MFI.getObjectAlignment(I);
+ YamlObject.Alignment = MFI.getObjectAlign(I);
YamlObject.StackID = (TargetStackID::Value)MFI.getStackID(I);
YamlObject.IsImmutable = MFI.isImmutableObjectIndex(I);
YamlObject.IsAliased = MFI.isAliasedObjectIndex(I);
@@ -390,8 +394,8 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF,
yaml::MachineStackObject YamlObject;
YamlObject.ID = ID;
if (const auto *Alloca = MFI.getObjectAllocation(I))
- YamlObject.Name.Value =
- Alloca->hasName() ? Alloca->getName() : "<unnamed alloca>";
+ YamlObject.Name.Value = std::string(
+ Alloca->hasName() ? Alloca->getName() : "<unnamed alloca>");
YamlObject.Type = MFI.isSpillSlotObjectIndex(I)
? yaml::MachineStackObject::SpillSlot
: MFI.isVariableSizedObjectIndex(I)
@@ -399,7 +403,7 @@ void MIRPrinter::convertStackObjects(yaml::MachineFunction &YMF,
: yaml::MachineStackObject::DefaultType;
YamlObject.Offset = MFI.getObjectOffset(I);
YamlObject.Size = MFI.getObjectSize(I);
- YamlObject.Alignment = MFI.getObjectAlignment(I);
+ YamlObject.Alignment = MFI.getObjectAlign(I);
YamlObject.StackID = (TargetStackID::Value)MFI.getStackID(I);
YMF.StackObjects.push_back(YamlObject);
@@ -513,7 +517,7 @@ void MIRPrinter::convert(yaml::MachineFunction &MF,
yaml::MachineConstantPoolValue YamlConstant;
YamlConstant.ID = ID++;
YamlConstant.Value = StrOS.str();
- YamlConstant.Alignment = Constant.getAlignment();
+ YamlConstant.Alignment = Constant.getAlign();
YamlConstant.IsTargetSpecific = Constant.isMachineConstantPoolEntry();
MF.Constants.push_back(YamlConstant);
@@ -629,11 +633,31 @@ void MIPrinter::print(const MachineBasicBlock &MBB) {
OS << "landing-pad";
HasAttributes = true;
}
- if (MBB.getAlignment() != Align::None()) {
+ if (MBB.isEHFuncletEntry()) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "ehfunclet-entry";
+ HasAttributes = true;
+ }
+ if (MBB.getAlignment() != Align(1)) {
OS << (HasAttributes ? ", " : " (");
OS << "align " << MBB.getAlignment().value();
HasAttributes = true;
}
+ if (MBB.getSectionID() != MBBSectionID(0)) {
+ OS << (HasAttributes ? ", " : " (");
+ OS << "bbsections ";
+ switch (MBB.getSectionID().Type) {
+ case MBBSectionID::SectionType::Exception:
+ OS << "Exception";
+ break;
+ case MBBSectionID::SectionType::Cold:
+ OS << "Cold";
+ break;
+ default:
+ OS << MBB.getSectionID().Number;
+ }
+ HasAttributes = true;
+ }
if (HasAttributes)
OS << ")";
OS << ":\n";
@@ -721,7 +745,7 @@ void MIPrinter::print(const MachineInstr &MI) {
++I) {
if (I)
OS << ", ";
- print(MI, I, TRI, ShouldPrintRegisterTies,
+ print(MI, I, TRI, TII, ShouldPrintRegisterTies,
MI.getTypeToPrint(I, PrintedTypes, MRI),
/*PrintDef=*/false);
}
@@ -754,6 +778,8 @@ void MIPrinter::print(const MachineInstr &MI) {
OS << "exact ";
if (MI.getFlag(MachineInstr::NoFPExcept))
OS << "nofpexcept ";
+ if (MI.getFlag(MachineInstr::NoMerge))
+ OS << "nomerge ";
OS << TII->getName(MI.getOpcode());
if (I < E)
@@ -763,7 +789,7 @@ void MIPrinter::print(const MachineInstr &MI) {
for (; I < E; ++I) {
if (NeedComma)
OS << ", ";
- print(MI, I, TRI, ShouldPrintRegisterTies,
+ print(MI, I, TRI, TII, ShouldPrintRegisterTies,
MI.getTypeToPrint(I, PrintedTypes, MRI));
NeedComma = true;
}
@@ -792,11 +818,13 @@ void MIPrinter::print(const MachineInstr &MI) {
NeedComma = true;
}
- if (const DebugLoc &DL = MI.getDebugLoc()) {
- if (NeedComma)
- OS << ',';
- OS << " debug-location ";
- DL->printAsOperand(OS, MST);
+ if (PrintLocations) {
+ if (const DebugLoc &DL = MI.getDebugLoc()) {
+ if (NeedComma)
+ OS << ',';
+ OS << " debug-location ";
+ DL->printAsOperand(OS, MST);
+ }
}
if (!MI.memoperands_empty()) {
@@ -822,11 +850,20 @@ void MIPrinter::printStackObjectReference(int FrameIndex) {
Operand.Name);
}
+static std::string formatOperandComment(std::string Comment) {
+ if (Comment.empty())
+ return Comment;
+ return std::string(" /* " + Comment + " */");
+}
+
void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
const TargetRegisterInfo *TRI,
+ const TargetInstrInfo *TII,
bool ShouldPrintRegisterTies, LLT TypeToPrint,
bool PrintDef) {
const MachineOperand &Op = MI.getOperand(OpIdx);
+ std::string MOComment = TII->createMIROperandComment(MI, Op, OpIdx, TRI);
+
switch (Op.getType()) {
case MachineOperand::MO_Immediate:
if (MI.isOperandSubregIdx(OpIdx)) {
@@ -858,6 +895,7 @@ void MIPrinter::print(const MachineInstr &MI, unsigned OpIdx,
const TargetIntrinsicInfo *TII = MI.getMF()->getTarget().getIntrinsicInfo();
Op.print(OS, MST, TypeToPrint, OpIdx, PrintDef, /*IsStandalone=*/false,
ShouldPrintRegisterTies, TiedOperandIdx, TRI, TII);
+ OS << formatOperandComment(MOComment);
break;
}
case MachineOperand::MO_FrameIndex:
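On the printing side, MIPrinter::print now also receives the TargetInstrInfo so it can ask the target for a per-operand comment, which is emitted as /* ... */ right after the operand and matches the comment-skipping added to MILexer above; an empty string means no comment is printed. A hypothetical target-side hook providing such a comment (the class and helper names are placeholders, and the signature is inferred from the TII->createMIROperandComment() call in MIPrinter::print):

// Hypothetical override in a target's TargetInstrInfo subclass.
std::string MyTargetInstrInfo::createMIROperandComment(
    const llvm::MachineInstr &MI, const llvm::MachineOperand &Op,
    unsigned OpIdx, const llvm::TargetRegisterInfo *TRI) const {
  // Decode an immediate predicate operand into something readable so the
  // printed MIR shows the raw value followed by a comment like /* CC::eq */.
  if (Op.isImm() && isPredicatedOpcode(MI.getOpcode())) // placeholder helper
    return decodePredicateImm(Op.getImm());             // placeholder helper
  return std::string();                                 // empty => no comment
}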
diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
index fcc40b26c527..54441301d65b 100644
--- a/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
+++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "MIRVRegNamerUtils.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/IR/Constants.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -69,6 +71,8 @@ std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) {
case MachineOperand::MO_TargetIndex:
return MO.getOffset() | (MO.getTargetFlags() << 16);
case MachineOperand::MO_FrameIndex:
+ case MachineOperand::MO_ConstantPoolIndex:
+ case MachineOperand::MO_JumpTableIndex:
return llvm::hash_value(MO);
// We could explicitly handle all the types of the MachineOperand,
@@ -79,8 +83,6 @@ std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) {
// TODO: Handle the following Index/ID/Predicate cases. They can
// be hashed on in a stable manner.
- case MachineOperand::MO_ConstantPoolIndex:
- case MachineOperand::MO_JumpTableIndex:
case MachineOperand::MO_CFIIndex:
case MachineOperand::MO_IntrinsicID:
case MachineOperand::MO_Predicate:
@@ -112,7 +114,7 @@ std::string VRegRenamer::getInstructionOpcodeHash(MachineInstr &MI) {
MIOperands.push_back((unsigned)Op->getOrdering());
MIOperands.push_back((unsigned)Op->getAddrSpace());
MIOperands.push_back((unsigned)Op->getSyncScopeID());
- MIOperands.push_back((unsigned)Op->getBaseAlignment());
+ MIOperands.push_back((unsigned)Op->getBaseAlign().value());
MIOperands.push_back((unsigned)Op->getFailureOrdering());
}
diff --git a/llvm/lib/CodeGen/MIRVRegNamerUtils.h b/llvm/lib/CodeGen/MIRVRegNamerUtils.h
index 0c0a71a13248..a059bc5333c6 100644
--- a/llvm/lib/CodeGen/MIRVRegNamerUtils.h
+++ b/llvm/lib/CodeGen/MIRVRegNamerUtils.h
@@ -17,15 +17,18 @@
#ifndef LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H
#define LLVM_LIB_CODEGEN_MIRVREGNAMERUTILS_H
-#include "llvm/ADT/PostOrderIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/MachineFunctionPass.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/CodeGen/Register.h"
+#include <map>
+#include <vector>
+#include <string>
namespace llvm {
+
+class MachineBasicBlock;
+class MachineInstr;
+class MachineRegisterInfo;
+class StringRef;
+
/// VRegRenamer - This class is used for renaming vregs in a machine basic
/// block according to semantics of the instruction.
class VRegRenamer {
@@ -71,6 +74,7 @@ class VRegRenamer {
/// Create a vreg with name and return it.
unsigned createVirtualRegisterWithLowerName(unsigned VReg, StringRef Name);
+
/// Linearly traverse the MachineBasicBlock and rename each instruction's
/// vreg definition based on the semantics of the instruction.
/// Names are as follows bb<BBNum>_hash_[0-9]+
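
The include trimming above relies on a standard pattern: VRegRenamer only
refers to these CodeGen types through pointers and references, so forward
declarations are enough and the full headers can move to the .cpp file. A
generic, self-contained illustration of the pattern (names here are not from
LLVM):

// A forward declaration suffices while Widget only stores a reference;
// code that calls into Gadget needs the full definition (normally pulled
// in by the .cpp file, inlined below to keep this example in one file).
class Gadget;

class Widget {
public:
  explicit Widget(Gadget &G) : G(G) {}
private:
  Gadget &G;
};

class Gadget {};

int main() {
  Gadget G;
  Widget W(G);
  (void)W;
  return 0;
}
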
diff --git a/llvm/lib/CodeGen/MachineBasicBlock.cpp b/llvm/lib/CodeGen/MachineBasicBlock.cpp
index f433c4b6c90b..2d4b60435d96 100644
--- a/llvm/lib/CodeGen/MachineBasicBlock.cpp
+++ b/llvm/lib/CodeGen/MachineBasicBlock.cpp
@@ -61,12 +61,42 @@ MCSymbol *MachineBasicBlock::getSymbol() const {
const MachineFunction *MF = getParent();
MCContext &Ctx = MF->getContext();
auto Prefix = Ctx.getAsmInfo()->getPrivateLabelPrefix();
+
assert(getNumber() >= 0 && "cannot get label for unreachable MBB");
- CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" +
- Twine(MF->getFunctionNumber()) +
- "_" + Twine(getNumber()));
- }
+ // We emit a non-temporary symbol for every basic block if we have BBLabels
+ // or -- with basic block sections -- when a basic block begins a section.
+ // With basic block symbols, we use a unary encoding which can
+ // compress the symbol names significantly. For basic block sections where
+ // this block is the first in a cluster, we use a non-temp descriptive name.
+  // Otherwise we fall back to using a temporary label.
+ if (MF->hasBBLabels()) {
+ auto Iter = MF->getBBSectionsSymbolPrefix().begin();
+ if (getNumber() < 0 ||
+ getNumber() >= (int)MF->getBBSectionsSymbolPrefix().size())
+ report_fatal_error("Unreachable MBB: " + Twine(getNumber()));
+ // The basic blocks for function foo are named a.BB.foo, aa.BB.foo, and
+ // so on.
+ std::string Prefix(Iter + 1, Iter + getNumber() + 1);
+ std::reverse(Prefix.begin(), Prefix.end());
+ CachedMCSymbol =
+ Ctx.getOrCreateSymbol(Twine(Prefix) + ".BB." + Twine(MF->getName()));
+ } else if (MF->hasBBSections() && isBeginSection()) {
+ SmallString<5> Suffix;
+ if (SectionID == MBBSectionID::ColdSectionID) {
+ Suffix += ".cold";
+ } else if (SectionID == MBBSectionID::ExceptionSectionID) {
+ Suffix += ".eh";
+ } else {
+ Suffix += "." + std::to_string(SectionID.Number);
+ }
+ CachedMCSymbol = Ctx.getOrCreateSymbol(MF->getName() + Suffix);
+ } else {
+ CachedMCSymbol = Ctx.getOrCreateSymbol(Twine(Prefix) + "BB" +
+ Twine(MF->getFunctionNumber()) +
+ "_" + Twine(getNumber()));
+ }
+ }
return CachedMCSymbol;
}
@@ -247,8 +277,16 @@ LLVM_DUMP_METHOD void MachineBasicBlock::dump() const {
}
#endif
+bool MachineBasicBlock::mayHaveInlineAsmBr() const {
+ for (const MachineBasicBlock *Succ : successors()) {
+ if (Succ->isInlineAsmBrIndirectTarget())
+ return true;
+ }
+ return false;
+}
+
bool MachineBasicBlock::isLegalToHoistInto() const {
- if (isReturnBlock() || hasEHPadSuccessor())
+ if (isReturnBlock() || hasEHPadSuccessor() || mayHaveInlineAsmBr())
return false;
return true;
}
@@ -326,7 +364,7 @@ void MachineBasicBlock::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "landing-pad";
HasAttributes = true;
}
- if (getAlignment() != Align::None()) {
+ if (getAlignment() != Align(1)) {
OS << (HasAttributes ? ", " : " (");
OS << "align " << Log2(getAlignment());
HasAttributes = true;
@@ -479,7 +517,7 @@ void MachineBasicBlock::sortUniqueLiveIns() {
LiveInVector::const_iterator J;
LiveInVector::iterator Out = LiveIns.begin();
for (; I != LiveIns.end(); ++Out, I = J) {
- unsigned PhysReg = I->PhysReg;
+ MCRegister PhysReg = I->PhysReg;
LaneBitmask LaneMask = I->LaneMask;
for (J = std::next(I); J != LiveIns.end() && J->PhysReg == PhysReg; ++J)
LaneMask |= J->LaneMask;
@@ -489,7 +527,7 @@ void MachineBasicBlock::sortUniqueLiveIns() {
LiveIns.erase(Out, LiveIns.end());
}
-unsigned
+Register
MachineBasicBlock::addLiveIn(MCRegister PhysReg, const TargetRegisterClass *RC) {
assert(getParent() && "MBB must be inserted in function");
assert(PhysReg.isPhysical() && "Expected physreg");
@@ -529,7 +567,11 @@ void MachineBasicBlock::moveAfter(MachineBasicBlock *NewBefore) {
getParent()->splice(++NewBefore->getIterator(), getIterator());
}
-void MachineBasicBlock::updateTerminator() {
+void MachineBasicBlock::updateTerminator(
+ MachineBasicBlock *PreviousLayoutSuccessor) {
+ LLVM_DEBUG(dbgs() << "Updating terminators on " << printMBBReference(*this)
+ << "\n");
+
const TargetInstrInfo *TII = getParent()->getSubtarget().getInstrInfo();
// A block with no successors has no concerns with fall-through edges.
if (this->succ_empty())
@@ -548,25 +590,21 @@ void MachineBasicBlock::updateTerminator() {
if (isLayoutSuccessor(TBB))
TII->removeBranch(*this);
} else {
- // The block has an unconditional fallthrough. If its successor is not its
- // layout successor, insert a branch. First we have to locate the only
- // non-landing-pad successor, as that is the fallthrough block.
- for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isEHPad())
- continue;
- assert(!TBB && "Found more than one non-landing-pad successor!");
- TBB = *SI;
- }
-
- // If there is no non-landing-pad successor, the block has no fall-through
- // edges to be concerned with.
- if (!TBB)
+ // The block has an unconditional fallthrough, or the end of the block is
+ // unreachable.
+
+ // Unfortunately, whether the end of the block is unreachable is not
+ // immediately obvious; we must fall back to checking the successor list,
+  // and assuming that if the passed-in block is in the successor list and
+ // not an EHPad, it must be the intended target.
+ if (!PreviousLayoutSuccessor || !isSuccessor(PreviousLayoutSuccessor) ||
+ PreviousLayoutSuccessor->isEHPad())
return;
- // Finally update the unconditional successor to be reached via a branch
- // if it would not be reached by fallthrough.
- if (!isLayoutSuccessor(TBB))
- TII->insertBranch(*this, TBB, nullptr, Cond, DL);
+ // If the unconditional successor block is not the current layout
+ // successor, insert a branch to jump to it.
+ if (!isLayoutSuccessor(PreviousLayoutSuccessor))
+ TII->insertBranch(*this, PreviousLayoutSuccessor, nullptr, Cond, DL);
}
return;
}
@@ -587,38 +625,20 @@ void MachineBasicBlock::updateTerminator() {
return;
}
- // Walk through the successors and find the successor which is not a landing
- // pad and is not the conditional branch destination (in TBB) as the
- // fallthrough successor.
- MachineBasicBlock *FallthroughBB = nullptr;
- for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) {
- if ((*SI)->isEHPad() || *SI == TBB)
- continue;
- assert(!FallthroughBB && "Found more than one fallthrough successor.");
- FallthroughBB = *SI;
- }
-
- if (!FallthroughBB) {
- if (canFallThrough()) {
- // We fallthrough to the same basic block as the conditional jump targets.
- // Remove the conditional jump, leaving unconditional fallthrough.
- // FIXME: This does not seem like a reasonable pattern to support, but it
- // has been seen in the wild coming out of degenerate ARM test cases.
- TII->removeBranch(*this);
-
- // Finally update the unconditional successor to be reached via a branch if
- // it would not be reached by fallthrough.
- if (!isLayoutSuccessor(TBB))
- TII->insertBranch(*this, TBB, nullptr, Cond, DL);
- return;
- }
+  // We now know we're going to fall through to PreviousLayoutSuccessor.
+ assert(PreviousLayoutSuccessor);
+ assert(!PreviousLayoutSuccessor->isEHPad());
+ assert(isSuccessor(PreviousLayoutSuccessor));
- // We enter here iff exactly one successor is TBB which cannot fallthrough
- // and the rest successors if any are EHPads. In this case, we need to
- // change the conditional branch into unconditional branch.
+ if (PreviousLayoutSuccessor == TBB) {
+ // We had a fallthrough to the same basic block as the conditional jump
+ // targets. Remove the conditional jump, leaving an unconditional
+ // fallthrough or an unconditional jump.
TII->removeBranch(*this);
- Cond.clear();
- TII->insertBranch(*this, TBB, nullptr, Cond, DL);
+ if (!isLayoutSuccessor(TBB)) {
+ Cond.clear();
+ TII->insertBranch(*this, TBB, nullptr, Cond, DL);
+ }
return;
}
@@ -627,14 +647,14 @@ void MachineBasicBlock::updateTerminator() {
if (TII->reverseBranchCondition(Cond)) {
// We can't reverse the condition, add an unconditional branch.
Cond.clear();
- TII->insertBranch(*this, FallthroughBB, nullptr, Cond, DL);
+ TII->insertBranch(*this, PreviousLayoutSuccessor, nullptr, Cond, DL);
return;
}
TII->removeBranch(*this);
- TII->insertBranch(*this, FallthroughBB, nullptr, Cond, DL);
- } else if (!isLayoutSuccessor(FallthroughBB)) {
+ TII->insertBranch(*this, PreviousLayoutSuccessor, nullptr, Cond, DL);
+ } else if (!isLayoutSuccessor(PreviousLayoutSuccessor)) {
TII->removeBranch(*this);
- TII->insertBranch(*this, TBB, FallthroughBB, Cond, DL);
+ TII->insertBranch(*this, TBB, PreviousLayoutSuccessor, Cond, DL);
}
}
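
A compact way to read the new contract for the "no terminator" case above: the
caller now supplies the block that used to follow in layout, and a branch is
inserted only when that block is a genuine non-EH successor that is no longer
the layout successor. A small decision sketch with booleans standing in for the
MachineBasicBlock queries (this is a model of the logic, not code from the
tree):

#include <iostream>

enum class Action { None, InsertUncondBranch };

// Models the unconditional-fallthrough branch of updateTerminator() after
// this change.
static Action fixupFallthrough(bool HasPrevLayoutSucc, bool IsSuccessor,
                               bool IsEHPad, bool IsStillLayoutSucc) {
  // End of block is unreachable, or the old fallthrough was an EH edge.
  if (!HasPrevLayoutSucc || !IsSuccessor || IsEHPad)
    return Action::None;
  // The old fallthrough is still laid out next; nothing to do.
  if (IsStillLayoutSucc)
    return Action::None;
  // Otherwise the old fallthrough moved away and needs an explicit branch.
  return Action::InsertUncondBranch;
}

int main() {
  bool NeedsBranch = fixupFallthrough(/*HasPrevLayoutSucc=*/true,
                                      /*IsSuccessor=*/true,
                                      /*IsEHPad=*/false,
                                      /*IsStillLayoutSucc=*/false) ==
                     Action::InsertUncondBranch;
  std::cout << NeedsBranch << "\n"; // 1: an explicit branch is required
}
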
@@ -871,12 +891,14 @@ bool MachineBasicBlock::canFallThrough() {
return getFallThrough() != nullptr;
}
-MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
- Pass &P) {
+MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(
+ MachineBasicBlock *Succ, Pass &P,
+ std::vector<SparseBitVector<>> *LiveInSets) {
if (!canSplitCriticalEdge(Succ))
return nullptr;
MachineFunction *MF = getParent();
+ MachineBasicBlock *PrevFallthrough = getNextNode();
DebugLoc DL; // FIXME: this is nowhere
MachineBasicBlock *NMBB = MF->CreateMachineBasicBlock();
@@ -898,7 +920,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
LiveVariables *LV = P.getAnalysisIfAvailable<LiveVariables>();
// Collect a list of virtual registers killed by the terminators.
- SmallVector<unsigned, 4> KilledRegs;
+ SmallVector<Register, 4> KilledRegs;
if (LV)
for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
I != E; ++I) {
@@ -918,7 +940,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
}
}
- SmallVector<unsigned, 4> UsedRegs;
+ SmallVector<Register, 4> UsedRegs;
if (LIS) {
for (instr_iterator I = getFirstInstrTerminator(), E = instr_end();
I != E; ++I) {
@@ -947,7 +969,11 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
Terminators.push_back(&*I);
}
- updateTerminator();
+ // Since we replaced all uses of Succ with NMBB, that should also be treated
+  // as the fallthrough successor.
+ if (Succ == PrevFallthrough)
+ PrevFallthrough = NMBB;
+ updateTerminator(PrevFallthrough);
if (Indexes) {
SmallVector<MachineInstr*, 4> NewTerminators;
@@ -992,7 +1018,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
if (LV) {
// Restore kills of virtual registers that were killed by the terminators.
while (!KilledRegs.empty()) {
- unsigned Reg = KilledRegs.pop_back_val();
+ Register Reg = KilledRegs.pop_back_val();
for (instr_iterator I = instr_end(), E = instr_begin(); I != E;) {
if (!(--I)->addRegisterKilled(Reg, TRI, /* AddIfNotFound= */ false))
continue;
@@ -1003,7 +1029,10 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
}
}
// Update relevant live-through information.
- LV->addNewBlock(NMBB, this, Succ);
+ if (LiveInSets != nullptr)
+ LV->addNewBlock(NMBB, this, Succ, *LiveInSets);
+ else
+ LV->addNewBlock(NMBB, this, Succ);
}
if (LIS) {
@@ -1022,7 +1051,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
SlotIndex EndIndex = Indexes->getMBBEndIdx(NMBB);
// Find the registers used from NMBB in PHIs in Succ.
- SmallSet<unsigned, 8> PHISrcRegs;
+ SmallSet<Register, 8> PHISrcRegs;
for (MachineBasicBlock::instr_iterator
I = Succ->instr_begin(), E = Succ->instr_end();
I != E && I->isPHI(); ++I) {
@@ -1045,7 +1074,7 @@ MachineBasicBlock *MachineBasicBlock::SplitCriticalEdge(MachineBasicBlock *Succ,
MachineRegisterInfo *MRI = &getParent()->getRegInfo();
for (unsigned i = 0, e = MRI->getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = Register::index2VirtReg(i);
+ Register Reg = Register::index2VirtReg(i);
if (PHISrcRegs.count(Reg) || !LIS->hasInterval(Reg))
continue;
@@ -1109,15 +1138,19 @@ bool MachineBasicBlock::canSplitCriticalEdge(
if (Succ->isEHPad())
return false;
- const MachineFunction *MF = getParent();
+ // Splitting the critical edge to a callbr's indirect block isn't advised.
+ // Don't do it in this generic function.
+ if (Succ->isInlineAsmBrIndirectTarget())
+ return false;
+ const MachineFunction *MF = getParent();
// Performance might be harmed on HW that implements branching using exec mask
// where both sides of the branches are always executed.
if (MF->getTarget().requiresStructuredCFG())
return false;
// We may need to update this's terminator, but we can't do that if
- // AnalyzeBranch fails. If this uses a jump table, we won't touch it.
+ // analyzeBranch fails. If this uses a jump table, we won't touch it.
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
@@ -1223,68 +1256,6 @@ void MachineBasicBlock::replacePhiUsesWith(MachineBasicBlock *Old,
}
}
-/// Various pieces of code can cause excess edges in the CFG to be inserted. If
-/// we have proven that MBB can only branch to DestA and DestB, remove any other
-/// MBB successors from the CFG. DestA and DestB can be null.
-///
-/// Besides DestA and DestB, retain other edges leading to LandingPads
-/// (currently there can be only one; we don't check or require that here).
-/// Note it is possible that DestA and/or DestB are LandingPads.
-bool MachineBasicBlock::CorrectExtraCFGEdges(MachineBasicBlock *DestA,
- MachineBasicBlock *DestB,
- bool IsCond) {
- // The values of DestA and DestB frequently come from a call to the
- // 'TargetInstrInfo::AnalyzeBranch' method. We take our meaning of the initial
- // values from there.
- //
- // 1. If both DestA and DestB are null, then the block ends with no branches
- // (it falls through to its successor).
- // 2. If DestA is set, DestB is null, and IsCond is false, then the block ends
- // with only an unconditional branch.
- // 3. If DestA is set, DestB is null, and IsCond is true, then the block ends
- // with a conditional branch that falls through to a successor (DestB).
- // 4. If DestA and DestB is set and IsCond is true, then the block ends with a
- // conditional branch followed by an unconditional branch. DestA is the
- // 'true' destination and DestB is the 'false' destination.
-
- bool Changed = false;
-
- MachineBasicBlock *FallThru = getNextNode();
-
- if (!DestA && !DestB) {
- // Block falls through to successor.
- DestA = FallThru;
- DestB = FallThru;
- } else if (DestA && !DestB) {
- if (IsCond)
- // Block ends in conditional jump that falls through to successor.
- DestB = FallThru;
- } else {
- assert(DestA && DestB && IsCond &&
- "CFG in a bad state. Cannot correct CFG edges");
- }
-
- // Remove superfluous edges. I.e., those which aren't destinations of this
- // basic block, duplicate edges, or landing pads.
- SmallPtrSet<const MachineBasicBlock*, 8> SeenMBBs;
- MachineBasicBlock::succ_iterator SI = succ_begin();
- while (SI != succ_end()) {
- const MachineBasicBlock *MBB = *SI;
- if (!SeenMBBs.insert(MBB).second ||
- (MBB != DestA && MBB != DestB && !MBB->isEHPad())) {
- // This is a superfluous edge, remove it.
- SI = removeSuccessor(SI);
- Changed = true;
- } else {
- ++SI;
- }
- }
-
- if (Changed)
- normalizeSuccProbs();
- return Changed;
-}
-
/// Find the next valid DebugLoc starting at MBBI, skipping any DBG_VALUE
/// instructions. Return UnknownLoc if there is none.
DebugLoc
@@ -1300,8 +1271,8 @@ MachineBasicBlock::findDebugLoc(instr_iterator MBBI) {
/// instructions. Return UnknownLoc if there is none.
DebugLoc MachineBasicBlock::findPrevDebugLoc(instr_iterator MBBI) {
if (MBBI == instr_begin()) return {};
- // Skip debug declarations, we don't want a DebugLoc from them.
- MBBI = skipDebugInstructionsBackward(std::prev(MBBI), instr_begin());
+ // Skip debug instructions, we don't want a DebugLoc from them.
+ MBBI = prev_nodbg(MBBI, instr_begin());
if (!MBBI->isDebugInstr()) return MBBI->getDebugLoc();
return {};
}
@@ -1383,7 +1354,7 @@ MachineBasicBlock::getProbabilityIterator(MachineBasicBlock::succ_iterator I) {
/// instructions after (searching just for defs) MI.
MachineBasicBlock::LivenessQueryResult
MachineBasicBlock::computeRegisterLiveness(const TargetRegisterInfo *TRI,
- unsigned Reg, const_iterator Before,
+ MCRegister Reg, const_iterator Before,
unsigned Neighborhood) const {
unsigned N = Neighborhood;
@@ -1503,3 +1474,7 @@ MachineBasicBlock::livein_iterator MachineBasicBlock::livein_begin() const {
"Liveness information is accurate");
return LiveIns.begin();
}
+
+const MBBSectionID MBBSectionID::ColdSectionID(MBBSectionID::SectionType::Cold);
+const MBBSectionID
+ MBBSectionID::ExceptionSectionID(MBBSectionID::SectionType::Exception);
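
The unary naming scheme used in getSymbol() above is easy to model: each block
contributes one type character ('a' normal, 'r' return, 'l' landing pad, 'L'
both), and block N's label is the reversed run of the characters for blocks 1
through N followed by ".BB." and the function name. A standalone sketch under
that reading, with the block types and the function name invented:

#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

// Rebuilds the label that getSymbol() would produce for block BlockNum, given
// the per-block prefix characters that createBBLabels() computes.
static std::string bbLabel(const std::vector<char> &Prefix, int BlockNum,
                           const std::string &FuncName) {
  std::string Name(Prefix.begin() + 1, Prefix.begin() + BlockNum + 1);
  std::reverse(Name.begin(), Name.end());
  return Name + ".BB." + FuncName;
}

int main() {
  std::vector<char> Prefix = {'a', 'a', 'l', 'r'}; // types of blocks 0..3
  for (int N = 1; N < 4; ++N)
    std::cout << bbLabel(Prefix, N, "foo") << "\n";
  // Prints: a.BB.foo, la.BB.foo, rla.BB.foo
}
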
diff --git a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
index d8ea3e0b9cf6..1168b01a835f 100644
--- a/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
+++ b/llvm/lib/CodeGen/MachineBlockFrequencyInfo.cpp
@@ -247,6 +247,12 @@ MachineBlockFrequencyInfo::isIrrLoopHeader(const MachineBasicBlock *MBB) {
return MBFI->isIrrLoopHeader(MBB);
}
+void MachineBlockFrequencyInfo::setBlockFreq(const MachineBasicBlock *MBB,
+ uint64_t Freq) {
+ assert(MBFI && "Expected analysis to be available");
+ MBFI->setBlockFreq(MBB, Freq);
+}
+
const MachineFunction *MachineBlockFrequencyInfo::getFunction() const {
return MBFI ? MBFI->getFunction() : nullptr;
}
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 30b98ec88c24..783d22fafee9 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -346,7 +346,7 @@ class MachineBlockPlacement : public MachineFunctionPass {
const MachineBranchProbabilityInfo *MBPI;
/// A handle to the function-wide block frequency pass.
- std::unique_ptr<BranchFolder::MBFIWrapper> MBFI;
+ std::unique_ptr<MBFIWrapper> MBFI;
/// A handle to the loop info.
MachineLoopInfo *MLI;
@@ -374,6 +374,9 @@ class MachineBlockPlacement : public MachineFunctionPass {
/// must be done inline.
TailDuplicator TailDup;
+ /// Partial tail duplication threshold.
+ BlockFrequency DupThreshold;
+
/// Allocator and owner of BlockChain structures.
///
/// We build BlockChains lazily while processing the loop structure of
@@ -399,6 +402,10 @@ class MachineBlockPlacement : public MachineFunctionPass {
SmallPtrSet<MachineBasicBlock *, 4> BlocksWithUnanalyzableExits;
#endif
+ /// Scale the DupThreshold according to basic block size.
+ BlockFrequency scaleThreshold(MachineBasicBlock *BB);
+ void initDupThreshold();
+
/// Decrease the UnscheduledPredecessors count for all blocks in chain, and
/// if the count goes to 0, add them to the appropriate work list.
void markChainSuccessors(
@@ -421,6 +428,11 @@ class MachineBlockPlacement : public MachineFunctionPass {
const MachineBasicBlock *BB, const MachineBasicBlock *Succ,
const BlockChain &Chain, const BlockFilterSet *BlockFilter,
BranchProbability SuccProb, BranchProbability HotProb);
+ bool isBestSuccessor(MachineBasicBlock *BB, MachineBasicBlock *Pred,
+ BlockFilterSet *BlockFilter);
+ void findDuplicateCandidates(SmallVectorImpl<MachineBasicBlock *> &Candidates,
+ MachineBasicBlock *BB,
+ BlockFilterSet *BlockFilter);
bool repeatedlyTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *&LPred,
const MachineBasicBlock *LoopHeaderBB,
@@ -1141,6 +1153,11 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
if (NumDup == 0)
return false;
+  // If profile information is available, findDuplicateCandidates can do a
+  // more precise benefit analysis.
+ if (F->getFunction().hasProfileData())
+ return true;
+
// This is mainly for function exit BB.
// The integrated tail duplication is really designed for increasing
// fallthrough from predecessors from Succ to its successors. We may need
@@ -1169,9 +1186,6 @@ bool MachineBlockPlacement::canTailDuplicateUnplacedPreds(
//
// A small number of extra duplication may not hurt too much. We need a better
// heuristic to handle it.
- //
- // FIXME: we should selectively tail duplicate a BB into part of its
- // predecessors.
if ((NumDup > Succ->succ_size()) || !Duplicate)
return false;
@@ -1556,7 +1570,7 @@ MachineBlockPlacement::selectBestSuccessor(
// For blocks with CFG violations, we may be able to lay them out anyway with
// tail-duplication. We keep this vector so we can perform the probability
// calculations the minimum number of times.
- SmallVector<std::tuple<BranchProbability, MachineBasicBlock *>, 4>
+ SmallVector<std::pair<BranchProbability, MachineBasicBlock *>, 4>
DupCandidates;
for (MachineBasicBlock *Succ : Successors) {
auto RealSuccProb = MBPI->getEdgeProbability(BB, Succ);
@@ -1570,7 +1584,7 @@ MachineBlockPlacement::selectBestSuccessor(
Chain, BlockFilter)) {
// If tail duplication would make Succ profitable, place it.
if (allowTailDupPlacement() && shouldTailDuplicate(Succ))
- DupCandidates.push_back(std::make_tuple(SuccProb, Succ));
+ DupCandidates.emplace_back(SuccProb, Succ);
continue;
}
@@ -1799,11 +1813,11 @@ void MachineBlockPlacement::buildChain(
// Placement may have changed tail duplication opportunities.
// Check for that now.
if (allowTailDupPlacement() && BestSucc && ShouldTailDup) {
- // If the chosen successor was duplicated into all its predecessors,
- // don't bother laying it out, just go round the loop again with BB as
- // the chain end.
- if (repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
- BlockFilter, PrevUnplacedBlockIt))
+ repeatedlyTailDuplicateBlock(BestSucc, BB, LoopHeaderBB, Chain,
+ BlockFilter, PrevUnplacedBlockIt);
+      // If the chosen successor was duplicated into BB, don't bother laying
+      // it out; just go round the loop again with BB as the chain end.
+ if (!BB->isSuccessor(BestSucc))
continue;
}
@@ -2082,8 +2096,7 @@ MachineBlockPlacement::findBestLoopTop(const MachineLoop &L,
// In practice this never happens though: there always seems to be a preheader
// that can fallthrough and that is also placed before the header.
bool OptForSize = F->getFunction().hasOptSize() ||
- llvm::shouldOptimizeForSize(L.getHeader(), PSI,
- &MBFI->getMBFI());
+ llvm::shouldOptimizeForSize(L.getHeader(), PSI, MBFI.get());
if (OptForSize)
return L.getHeader();
@@ -2616,7 +2629,7 @@ void MachineBlockPlacement::buildLoopChains(const MachineLoop &L) {
void MachineBlockPlacement::buildCFGChains() {
// Ensure that every BB in the function has an associated chain to simplify
// the assumptions of the remaining algorithm.
- SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+ SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
for (MachineFunction::iterator FI = F->begin(), FE = F->end(); FI != FE;
++FI) {
MachineBasicBlock *BB = &*FI;
@@ -2626,7 +2639,7 @@ void MachineBlockPlacement::buildCFGChains() {
// the exact fallthrough behavior for.
while (true) {
Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
if (!TII->analyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough())
break;
@@ -2690,6 +2703,20 @@ void MachineBlockPlacement::buildCFGChains() {
assert(!BadFunc && "Detected problems with the block placement.");
});
+ // Remember original layout ordering, so we can update terminators after
+ // reordering to point to the original layout successor.
+ SmallVector<MachineBasicBlock *, 4> OriginalLayoutSuccessors(
+ F->getNumBlockIDs());
+ {
+ MachineBasicBlock *LastMBB = nullptr;
+ for (auto &MBB : *F) {
+ if (LastMBB != nullptr)
+ OriginalLayoutSuccessors[LastMBB->getNumber()] = &MBB;
+ LastMBB = &MBB;
+ }
+ OriginalLayoutSuccessors[F->back().getNumber()] = nullptr;
+ }
+
// Splice the blocks into place.
MachineFunction::iterator InsertPos = F->begin();
LLVM_DEBUG(dbgs() << "[MBP] Function: " << F->getName() << "\n");
@@ -2711,7 +2738,7 @@ void MachineBlockPlacement::buildCFGChains() {
// than assert when the branch cannot be analyzed in order to remove this
// boiler plate.
Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
#ifndef NDEBUG
if (!BlocksWithUnanalyzableExits.count(PrevBB)) {
@@ -2747,15 +2774,18 @@ void MachineBlockPlacement::buildCFGChains() {
// TBB = FBB = nullptr;
// }
// }
- if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond))
- PrevBB->updateTerminator();
+ if (!TII->analyzeBranch(*PrevBB, TBB, FBB, Cond)) {
+ PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
+ }
}
// Fixup the last block.
Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
- if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond))
- F->back().updateTerminator();
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
+ if (!TII->analyzeBranch(F->back(), TBB, FBB, Cond)) {
+ MachineBasicBlock *PrevBB = &F->back();
+ PrevBB->updateTerminator(OriginalLayoutSuccessors[PrevBB->getNumber()]);
+ }
BlockWorkList.clear();
EHPadWorkList.clear();
@@ -2763,17 +2793,17 @@ void MachineBlockPlacement::buildCFGChains() {
void MachineBlockPlacement::optimizeBranches() {
BlockChain &FunctionChain = *BlockToChain[&F->front()];
- SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch.
+ SmallVector<MachineOperand, 4> Cond; // For analyzeBranch.
// Now that all the basic blocks in the chain have the proper layout,
- // make a final call to AnalyzeBranch with AllowModify set.
+ // make a final call to analyzeBranch with AllowModify set.
// Indeed, the target may be able to optimize the branches in a way we
// cannot because all branches may not be analyzable.
// E.g., the target may be able to remove an unconditional branch to
// a fallthrough when it occurs after predicated terminators.
for (MachineBasicBlock *ChainBB : FunctionChain) {
Cond.clear();
- MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For AnalyzeBranch.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr; // For analyzeBranch.
if (!TII->analyzeBranch(*ChainBB, TBB, FBB, Cond, /*AllowModify*/ true)) {
// If PrevBB has a two-way branch, try to re-order the branches
// such that we branch to the successor with higher probability first.
@@ -2789,7 +2819,6 @@ void MachineBlockPlacement::optimizeBranches() {
DebugLoc dl; // FIXME: this is nowhere
TII->removeBranch(*ChainBB);
TII->insertBranch(*ChainBB, FBB, TBB, Cond, dl);
- ChainBB->updateTerminator();
}
}
}
@@ -2841,7 +2870,7 @@ void MachineBlockPlacement::alignBlocks() {
continue;
// If the global profiles indicates so, don't align it.
- if (llvm::shouldOptimizeForSize(ChainBB, PSI, &MBFI->getMBFI()) &&
+ if (llvm::shouldOptimizeForSize(ChainBB, PSI, MBFI.get()) &&
!TLI->alignLoopsWithOptSize())
continue;
@@ -2901,10 +2930,7 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
// duplicated into is still small enough to be duplicated again.
// No need to call markBlockSuccessors in this case, as the blocks being
// duplicated from here on are already scheduled.
- // Note that DuplicatedToLPred always implies Removed.
- while (DuplicatedToLPred) {
- assert(Removed && "Block must have been removed to be duplicated into its "
- "layout predecessor.");
+ while (DuplicatedToLPred && Removed) {
MachineBasicBlock *DupBB, *DupPred;
// The removal callback causes Chain.end() to be updated when a block is
// removed. On the first pass through the loop, the chain end should be the
@@ -2943,8 +2969,7 @@ bool MachineBlockPlacement::repeatedlyTailDuplicateBlock(
/// chosen in the given order due to unnatural CFG
/// only needed if \p BB is removed and
/// \p PrevUnplacedBlockIt pointed to \p BB.
-/// \p DuplicatedToLPred - True if the block was duplicated into LPred. Will
-/// only be true if the block was removed.
+/// \p DuplicatedToLPred - True if the block was duplicated into LPred.
/// \return - True if the block was duplicated into all preds and removed.
bool MachineBlockPlacement::maybeTailDuplicateBlock(
MachineBasicBlock *BB, MachineBasicBlock *LPred,
@@ -3012,8 +3037,18 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
SmallVector<MachineBasicBlock *, 8> DuplicatedPreds;
bool IsSimple = TailDup.isSimpleBB(BB);
- TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred,
- &DuplicatedPreds, &RemovalCallbackRef);
+ SmallVector<MachineBasicBlock *, 8> CandidatePreds;
+ SmallVectorImpl<MachineBasicBlock *> *CandidatePtr = nullptr;
+ if (F->getFunction().hasProfileData()) {
+ // We can do partial duplication with precise profile information.
+ findDuplicateCandidates(CandidatePreds, BB, BlockFilter);
+ if (CandidatePreds.size() == 0)
+ return false;
+ if (CandidatePreds.size() < BB->pred_size())
+ CandidatePtr = &CandidatePreds;
+ }
+ TailDup.tailDuplicateAndUpdate(IsSimple, BB, LPred, &DuplicatedPreds,
+ &RemovalCallbackRef, CandidatePtr);
// Update UnscheduledPredecessors to reflect tail-duplication.
DuplicatedToLPred = false;
@@ -3036,6 +3071,191 @@ bool MachineBlockPlacement::maybeTailDuplicateBlock(
return Removed;
}
+// Count the number of actual machine instructions.
+static uint64_t countMBBInstruction(MachineBasicBlock *MBB) {
+ uint64_t InstrCount = 0;
+ for (MachineInstr &MI : *MBB) {
+ if (!MI.isPHI() && !MI.isMetaInstruction())
+ InstrCount += 1;
+ }
+ return InstrCount;
+}
+
+// The size cost of duplication is the instruction size of the duplicated block.
+// So we should scale the threshold accordingly. But the instruction size is not
+// available on all targets, so we use the number of instructions instead.
+BlockFrequency MachineBlockPlacement::scaleThreshold(MachineBasicBlock *BB) {
+ return DupThreshold.getFrequency() * countMBBInstruction(BB);
+}
+
+// Returns true if BB is Pred's best successor.
+bool MachineBlockPlacement::isBestSuccessor(MachineBasicBlock *BB,
+ MachineBasicBlock *Pred,
+ BlockFilterSet *BlockFilter) {
+ if (BB == Pred)
+ return false;
+ if (BlockFilter && !BlockFilter->count(Pred))
+ return false;
+ BlockChain *PredChain = BlockToChain[Pred];
+ if (PredChain && (Pred != *std::prev(PredChain->end())))
+ return false;
+
+ // Find the successor with largest probability excluding BB.
+ BranchProbability BestProb = BranchProbability::getZero();
+ for (MachineBasicBlock *Succ : Pred->successors())
+ if (Succ != BB) {
+ if (BlockFilter && !BlockFilter->count(Succ))
+ continue;
+ BlockChain *SuccChain = BlockToChain[Succ];
+ if (SuccChain && (Succ != *SuccChain->begin()))
+ continue;
+ BranchProbability SuccProb = MBPI->getEdgeProbability(Pred, Succ);
+ if (SuccProb > BestProb)
+ BestProb = SuccProb;
+ }
+
+ BranchProbability BBProb = MBPI->getEdgeProbability(Pred, BB);
+ if (BBProb <= BestProb)
+ return false;
+
+ // Compute the number of reduced taken branches if Pred falls through to BB
+ // instead of another successor. Then compare it with threshold.
+ BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+ BlockFrequency Gain = PredFreq * (BBProb - BestProb);
+ return Gain > scaleThreshold(BB);
+}
+
+// Find the predecessors of BB into which BB can be beneficially duplicated.
+void MachineBlockPlacement::findDuplicateCandidates(
+ SmallVectorImpl<MachineBasicBlock *> &Candidates,
+ MachineBasicBlock *BB,
+ BlockFilterSet *BlockFilter) {
+ MachineBasicBlock *Fallthrough = nullptr;
+ BranchProbability DefaultBranchProb = BranchProbability::getZero();
+ BlockFrequency BBDupThreshold(scaleThreshold(BB));
+ SmallVector<MachineBasicBlock *, 8> Preds(BB->pred_begin(), BB->pred_end());
+ SmallVector<MachineBasicBlock *, 8> Succs(BB->succ_begin(), BB->succ_end());
+
+ // Sort for highest frequency.
+ auto CmpSucc = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+ return MBPI->getEdgeProbability(BB, A) > MBPI->getEdgeProbability(BB, B);
+ };
+ auto CmpPred = [&](MachineBasicBlock *A, MachineBasicBlock *B) {
+ return MBFI->getBlockFreq(A) > MBFI->getBlockFreq(B);
+ };
+ llvm::stable_sort(Succs, CmpSucc);
+ llvm::stable_sort(Preds, CmpPred);
+
+ auto SuccIt = Succs.begin();
+ if (SuccIt != Succs.end()) {
+ DefaultBranchProb = MBPI->getEdgeProbability(BB, *SuccIt).getCompl();
+ }
+
+  // For each predecessor of BB, compute the benefit of duplicating BB into
+  // it; if the benefit is larger than the threshold, add the predecessor to
+  // Candidates.
+  //
+  // Suppose we have the following control flow.
+ //
+  //     PB1  PB2  PB3  PB4
+  //      \   |  /    /\
+  //       \  | /    /  \
+  //        \ |/    /    \
+  //         BB----/     OB
+  //        /\
+  //       /  \
+  //     SB1  SB2
+  //
+  // And it can be partially duplicated as
+  //
+  //   PB2+BB
+  //      |  PB1  PB3  PB4
+  //      |   |  /    /\
+  //      |   | /    /  \
+  //      |   |/    /    \
+  //      |   BB----/     OB
+  //      |\ /|
+  //      | X |
+  //      |/ \|
+  //     SB2  SB1
+ //
+ // The benefit of duplicating into a predecessor is defined as
+ // Orig_taken_branch - Duplicated_taken_branch
+ //
+  // The Orig_taken_branch is computed with the assumption that the
+  // predecessor jumps to BB and BB's most probable successor is laid out
+  // after BB.
+  //
+  // The Duplicated_taken_branch is computed with the assumption that BB is
+  // duplicated into PB and one successor is laid out after it (SB1 for PB1
+  // and SB2 for PB2 in our case). If there is no available successor, the
+  // combined block jumps to all of BB's successors, like PB3 in this example.
+ //
+  // If a predecessor has multiple successors, BB can't be duplicated into it,
+  // but it can still beneficially fall through to BB while BB is duplicated
+  // into the other predecessors.
+ for (MachineBasicBlock *Pred : Preds) {
+ BlockFrequency PredFreq = MBFI->getBlockFreq(Pred);
+
+ if (!TailDup.canTailDuplicate(BB, Pred)) {
+      // BB can't be duplicated into Pred, but it is still possible to lay
+      // BB out below Pred as its fallthrough.
+ if (!Fallthrough && isBestSuccessor(BB, Pred, BlockFilter)) {
+ Fallthrough = Pred;
+ if (SuccIt != Succs.end())
+ SuccIt++;
+ }
+ continue;
+ }
+
+ BlockFrequency OrigCost = PredFreq + PredFreq * DefaultBranchProb;
+ BlockFrequency DupCost;
+ if (SuccIt == Succs.end()) {
+ // Jump to all successors;
+ if (Succs.size() > 0)
+ DupCost += PredFreq;
+ } else {
+      // Fall through to *SuccIt, jump to all other successors.
+ DupCost += PredFreq;
+ DupCost -= PredFreq * MBPI->getEdgeProbability(BB, *SuccIt);
+ }
+
+ assert(OrigCost >= DupCost);
+ OrigCost -= DupCost;
+ if (OrigCost > BBDupThreshold) {
+ Candidates.push_back(Pred);
+ if (SuccIt != Succs.end())
+ SuccIt++;
+ }
+ }
+
+  // If no predecessor can optimally fall through to BB, we can turn one of
+  // the duplications into a fallthrough instead.
+ if (!Fallthrough) {
+ if ((Candidates.size() < Preds.size()) && (Candidates.size() > 0)) {
+ Candidates[0] = Candidates.back();
+ Candidates.pop_back();
+ }
+ }
+}
+
+void MachineBlockPlacement::initDupThreshold() {
+ DupThreshold = 0;
+ if (!F->getFunction().hasProfileData())
+ return;
+
+ BlockFrequency MaxFreq = 0;
+ for (MachineBasicBlock &MBB : *F) {
+ BlockFrequency Freq = MBFI->getBlockFreq(&MBB);
+ if (Freq > MaxFreq)
+ MaxFreq = Freq;
+ }
+
+  // FIXME: we may want to use profile counts instead of frequencies here,
+  // and this heuristic needs more fine tuning.
+ BranchProbability ThresholdProb(TailDupPlacementPenalty, 100);
+ DupThreshold = MaxFreq * ThresholdProb;
+}
+
bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
if (skipFunction(MF.getFunction()))
return false;
@@ -3046,7 +3266,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
F = &MF;
MBPI = &getAnalysis<MachineBranchProbabilityInfo>();
- MBFI = std::make_unique<BranchFolder::MBFIWrapper>(
+ MBFI = std::make_unique<MBFIWrapper>(
getAnalysis<MachineBlockFrequencyInfo>());
MLI = &getAnalysis<MachineLoopInfo>();
TII = MF.getSubtarget().getInstrInfo();
@@ -3054,6 +3274,8 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
MPDT = nullptr;
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ initDupThreshold();
+
// Initialize PreferredLoopExit to nullptr here since it may never be set if
// there are no MachineLoops.
PreferredLoopExit = nullptr;
@@ -3088,7 +3310,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
if (OptForSize)
TailDupSize = 1;
bool PreRegAlloc = false;
- TailDup.initMF(MF, PreRegAlloc, MBPI, &MBFI->getMBFI(), PSI,
+ TailDup.initMF(MF, PreRegAlloc, MBPI, MBFI.get(), PSI,
/* LayoutMode */ true, TailDupSize);
precomputeTriangleChains();
}
@@ -3107,9 +3329,7 @@ bool MachineBlockPlacement::runOnMachineFunction(MachineFunction &MF) {
BranchFolder BF(/*EnableTailMerge=*/true, /*CommonHoist=*/false, *MBFI,
*MBPI, PSI, TailMergeSize);
- auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
- if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(),
- MMIWP ? &MMIWP->getMMI() : nullptr, MLI,
+ if (BF.OptimizeFunction(MF, TII, MF.getSubtarget().getRegisterInfo(), MLI,
/*AfterPlacement=*/true)) {
// Redo the layout if tail merging creates/removes/moves blocks.
BlockToChain.clear();
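
A rough numeric walk-through of the profitability test in
findDuplicateCandidates() above, with plain doubles standing in for
BlockFrequency and BranchProbability; every number below is invented for
illustration:

#include <iostream>

int main() {
  // Assumptions: Pred executes 1000 times, BB's hottest unplaced successor is
  // taken with probability 0.7, BB has 3 real instructions, the hottest block
  // in the function runs 2000 times, and the duplication penalty knob is 2%.
  double PredFreq = 1000.0;
  double ProbToBestSucc = 0.7;
  double DefaultBranchProb = 1.0 - ProbToBestSucc;
  unsigned BBInstrCount = 3;
  double MaxFreq = 2000.0, PenaltyPercent = 2.0;

  // Before duplication: Pred jumps to BB, and BB then branches away from its
  // layout successor with probability DefaultBranchProb.
  double OrigCost = PredFreq + PredFreq * DefaultBranchProb;

  // After duplicating BB into Pred and falling through to the best successor.
  double DupCost = PredFreq - PredFreq * ProbToBestSucc;

  double DupThreshold = MaxFreq * (PenaltyPercent / 100.0);
  double BBDupThreshold = DupThreshold * BBInstrCount; // scaled by block size

  std::cout << "benefit = " << (OrigCost - DupCost)
            << ", threshold = " << BBDupThreshold << "\n";
  // benefit = 1300 - 300 = 1000, threshold = 120, so Pred is a candidate.
}
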
diff --git a/llvm/lib/CodeGen/MachineCSE.cpp b/llvm/lib/CodeGen/MachineCSE.cpp
index 9561a06ce8df..09531276bc10 100644
--- a/llvm/lib/CodeGen/MachineCSE.cpp
+++ b/llvm/lib/CodeGen/MachineCSE.cpp
@@ -747,9 +747,8 @@ bool MachineCSE::PerformCSE(MachineDomTreeNode *Node) {
do {
Node = WorkList.pop_back_val();
Scopes.push_back(Node);
- const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
- OpenChildren[Node] = Children.size();
- for (MachineDomTreeNode *Child : Children)
+ OpenChildren[Node] = Node->getNumChildren();
+ for (MachineDomTreeNode *Child : Node->children())
WorkList.push_back(Child);
} while (!WorkList.empty());
@@ -831,6 +830,13 @@ bool MachineCSE::ProcessBlockPRE(MachineDominatorTree *DT,
continue;
MachineInstr &NewMI =
TII->duplicate(*CMBB, CMBB->getFirstTerminator(), *MI);
+
+ // When hoisting, make sure we don't carry the debug location of
+ // the original instruction, as that's not correct and can cause
+ // unexpected jumps when debugging optimized code.
+ auto EmptyDL = DebugLoc();
+ NewMI.setDebugLoc(EmptyDL);
+
NewMI.getOperand(0).setReg(NewReg);
PREMap[MI] = CMBB;
@@ -855,8 +861,7 @@ bool MachineCSE::PerformSimplePRE(MachineDominatorTree *DT) {
BBs.push_back(DT->getRootNode());
do {
auto Node = BBs.pop_back_val();
- const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
- for (MachineDomTreeNode *Child : Children)
+ for (MachineDomTreeNode *Child : Node->children())
BBs.push_back(Child);
MachineBasicBlock *MBB = Node->getBlock();
diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp
index 73895bdf834f..f241435a0482 100644
--- a/llvm/lib/CodeGen/MachineCombiner.cpp
+++ b/llvm/lib/CodeGen/MachineCombiner.cpp
@@ -269,6 +269,8 @@ static CombinerObjective getCombinerObjective(MachineCombinerPattern P) {
case MachineCombinerPattern::REASSOC_AX_YB:
case MachineCombinerPattern::REASSOC_XA_BY:
case MachineCombinerPattern::REASSOC_XA_YB:
+ case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
+ case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
return CombinerObjective::MustReduceDepth;
default:
return CombinerObjective::Default;
@@ -406,12 +408,14 @@ bool MachineCombiner::preservesResourceLen(
<< ResLenBeforeCombine
<< " and after: " << ResLenAfterCombine << "\n";);
LLVM_DEBUG(
- ResLenAfterCombine <= ResLenBeforeCombine
+ ResLenAfterCombine <=
+ ResLenBeforeCombine + TII->getExtendResourceLenLimit()
? dbgs() << "\t\t As result it IMPROVES/PRESERVES Resource Length\n"
: dbgs() << "\t\t As result it DOES NOT improve/preserve Resource "
"Length\n");
- return ResLenAfterCombine <= ResLenBeforeCombine;
+ return ResLenAfterCombine <=
+ ResLenBeforeCombine + TII->getExtendResourceLenLimit();
}
/// \returns true when new instruction sequence should be generated
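
The functional change to preservesResourceLen() above amounts to added slack:
the combined sequence may now grow the critical resource length by up to a
target-specified limit instead of having to be no longer than before. A
one-function sketch of the relaxed predicate (the limit values in main are
invented; in tree the slack comes from
TargetInstrInfo::getExtendResourceLenLimit()):

#include <iostream>

// Accept the new sequence unless it lengthens resource usage by more than the
// allowed slack.
static bool preservesResourceLen(unsigned Before, unsigned After,
                                 unsigned ExtendLimit) {
  return After <= Before + ExtendLimit;
}

int main() {
  std::cout << preservesResourceLen(10, 10, 0) << "\n"; // 1: old behavior
  std::cout << preservesResourceLen(10, 11, 0) << "\n"; // 0: rejected before
  std::cout << preservesResourceLen(10, 11, 1) << "\n"; // 1: allowed now
}
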
diff --git a/llvm/lib/CodeGen/MachineCopyPropagation.cpp b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
index c316b167059b..70d6dcc2e3e2 100644
--- a/llvm/lib/CodeGen/MachineCopyPropagation.cpp
+++ b/llvm/lib/CodeGen/MachineCopyPropagation.cpp
@@ -51,6 +51,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/iterator_range.h"
@@ -113,7 +114,8 @@ public:
  // Since Reg might be a subreg of some registers, invalidating only Reg is
  // not enough. We have to find the COPYs that define Reg or the registers
  // defined by Reg and invalidate all of them.
- DenseSet<unsigned> RegsToInvalidate{Reg};
+ SmallSet<unsigned, 8> RegsToInvalidate;
+ RegsToInvalidate.insert(Reg);
for (MCRegUnitIterator RUI(Reg, &TRI); RUI.isValid(); ++RUI) {
auto I = Copies.find(*RUI);
if (I != Copies.end()) {
diff --git a/llvm/lib/CodeGen/MachineDebugify.cpp b/llvm/lib/CodeGen/MachineDebugify.cpp
new file mode 100644
index 000000000000..bf57ec0e8c28
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineDebugify.cpp
@@ -0,0 +1,172 @@
+//===- MachineDebugify.cpp - Attach synthetic debug info to everything ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This pass attaches synthetic debug info to everything. It can be used
+/// to create targeted tests for debug info preservation, or test for CodeGen
+/// differences with vs. without debug info.
+///
+/// This isn't intended to have feature parity with Debugify.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DIBuilder.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils/Debugify.h"
+
+#define DEBUG_TYPE "mir-debugify"
+
+using namespace llvm;
+
+namespace {
+bool applyDebugifyMetadataToMachineFunction(MachineModuleInfo &MMI,
+ DIBuilder &DIB, Function &F) {
+ MachineFunction *MaybeMF = MMI.getMachineFunction(F);
+ if (!MaybeMF)
+ return false;
+ MachineFunction &MF = *MaybeMF;
+ const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
+
+ DISubprogram *SP = F.getSubprogram();
+ assert(SP && "IR Debugify just created it?");
+
+ Module &M = *F.getParent();
+ LLVMContext &Ctx = M.getContext();
+
+ unsigned NextLine = SP->getLine();
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineInstr &MI : MBB) {
+ // This will likely emit line numbers beyond the end of the imagined
+ // source function and into subsequent ones. We don't do anything about
+ // that as it doesn't really matter to the compiler where the line is in
+ // the imaginary source code.
+ MI.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
+ }
+ }
+
+ // Find local variables defined by debugify. No attempt is made to match up
+ // MIR-level regs to the 'correct' IR-level variables: there isn't a simple
+ // way to do that, and it isn't necessary to find interesting CodeGen bugs.
+ // Instead, simply keep track of one variable per line. Later, we can insert
+ // DBG_VALUE insts that point to these local variables. Emitting DBG_VALUEs
+ // which cover a wide range of lines can help stress the debug info passes:
+ // if we can't do that, fall back to using the local variable which precedes
+ // all the others.
+ Function *DbgValF = M.getFunction("llvm.dbg.value");
+ DbgValueInst *EarliestDVI = nullptr;
+ DenseMap<unsigned, DILocalVariable *> Line2Var;
+ DIExpression *Expr = nullptr;
+ if (DbgValF) {
+ for (const Use &U : DbgValF->uses()) {
+ auto *DVI = dyn_cast<DbgValueInst>(U.getUser());
+ if (!DVI || DVI->getFunction() != &F)
+ continue;
+ unsigned Line = DVI->getDebugLoc().getLine();
+ assert(Line != 0 && "debugify should not insert line 0 locations");
+ Line2Var[Line] = DVI->getVariable();
+ if (!EarliestDVI || Line < EarliestDVI->getDebugLoc().getLine())
+ EarliestDVI = DVI;
+ Expr = DVI->getExpression();
+ }
+ }
+ if (Line2Var.empty())
+ return true;
+
+ // Now, try to insert a DBG_VALUE instruction after each real instruction.
+ // Do this by introducing debug uses of each register definition. If that is
+ // not possible (e.g. we have a phi or a meta instruction), emit a constant.
+ uint64_t NextImm = 0;
+ const MCInstrDesc &DbgValDesc = TII.get(TargetOpcode::DBG_VALUE);
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator FirstNonPHIIt = MBB.getFirstNonPHI();
+ for (auto I = MBB.begin(), E = MBB.end(); I != E; ) {
+ MachineInstr &MI = *I;
+ ++I;
+
+ // `I` may point to a DBG_VALUE created in the previous loop iteration.
+ if (MI.isDebugInstr())
+ continue;
+
+ // It's not allowed to insert DBG_VALUEs after a terminator.
+ if (MI.isTerminator())
+ continue;
+
+ // Find a suitable insertion point for the DBG_VALUE.
+ auto InsertBeforeIt = MI.isPHI() ? FirstNonPHIIt : I;
+
+ // Find a suitable local variable for the DBG_VALUE.
+ unsigned Line = MI.getDebugLoc().getLine();
+ if (!Line2Var.count(Line))
+ Line = EarliestDVI->getDebugLoc().getLine();
+ DILocalVariable *LocalVar = Line2Var[Line];
+ assert(LocalVar && "No variable for current line?");
+
+ // Emit DBG_VALUEs for register definitions.
+ SmallVector<MachineOperand *, 4> RegDefs;
+ for (MachineOperand &MO : MI.operands())
+ if (MO.isReg() && MO.isDef() && MO.getReg())
+ RegDefs.push_back(&MO);
+ for (MachineOperand *MO : RegDefs)
+ BuildMI(MBB, InsertBeforeIt, MI.getDebugLoc(), DbgValDesc,
+ /*IsIndirect=*/false, *MO, LocalVar, Expr);
+
+ // OK, failing that, emit a constant DBG_VALUE.
+ if (RegDefs.empty()) {
+ auto ImmOp = MachineOperand::CreateImm(NextImm++);
+ BuildMI(MBB, InsertBeforeIt, MI.getDebugLoc(), DbgValDesc,
+ /*IsIndirect=*/false, ImmOp, LocalVar, Expr);
+ }
+ }
+ }
+
+ return true;
+}
+
+/// ModulePass for attaching synthetic debug info to everything, used with the
+/// legacy module pass manager.
+struct DebugifyMachineModule : public ModulePass {
+ bool runOnModule(Module &M) override {
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return applyDebugifyMetadata(
+ M, M.functions(),
+ "ModuleDebugify: ", [&](DIBuilder &DIB, Function &F) -> bool {
+ return applyDebugifyMetadataToMachineFunction(MMI, DIB, F);
+ });
+ }
+
+ DebugifyMachineModule() : ModulePass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ static char ID; // Pass identification.
+};
+char DebugifyMachineModule::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(DebugifyMachineModule, DEBUG_TYPE,
+ "Machine Debugify Module", false, false)
+INITIALIZE_PASS_END(DebugifyMachineModule, DEBUG_TYPE,
+ "Machine Debugify Module", false, false)
+
+ModulePass *llvm::createDebugifyMachineModulePass() {
+ return new DebugifyMachineModule();
+}
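
One easy-to-miss detail in the pass above: when an instruction's line has no
debugify-created variable, the code falls back to the variable recorded for
the earliest dbg.value line. A simplified model of that lookup, with plain
strings standing in for DILocalVariable pointers:

#include <iostream>
#include <map>
#include <string>

// Prefer the variable recorded for the instruction's own line; otherwise use
// the variable from the earliest known line (which is guaranteed to exist
// once the map is non-empty).
static std::string varForLine(const std::map<unsigned, std::string> &Line2Var,
                              unsigned Line, unsigned EarliestLine) {
  auto It = Line2Var.find(Line);
  if (It == Line2Var.end())
    It = Line2Var.find(EarliestLine);
  return It->second;
}

int main() {
  std::map<unsigned, std::string> Line2Var = {{2, "var2"}, {5, "var5"}};
  std::cout << varForLine(Line2Var, 5, 2) << "\n"; // var5
  std::cout << varForLine(Line2Var, 9, 2) << "\n"; // var2 (fallback)
}
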
diff --git a/llvm/lib/CodeGen/MachineFrameInfo.cpp b/llvm/lib/CodeGen/MachineFrameInfo.cpp
index 22ab2c7a6d77..7ba27ff1c856 100644
--- a/llvm/lib/CodeGen/MachineFrameInfo.cpp
+++ b/llvm/lib/CodeGen/MachineFrameInfo.cpp
@@ -41,8 +41,9 @@ static inline Align clampStackAlignment(bool ShouldClamp, Align Alignment,
Align StackAlignment) {
if (!ShouldClamp || Alignment <= StackAlignment)
return Alignment;
- LLVM_DEBUG(dbgs() << "Warning: requested alignment " << Alignment.value()
- << " exceeds the stack alignment " << StackAlignment.value()
+ LLVM_DEBUG(dbgs() << "Warning: requested alignment " << DebugStr(Alignment)
+ << " exceeds the stack alignment "
+ << DebugStr(StackAlignment)
<< " when stack realignment is off" << '\n');
return StackAlignment;
}
@@ -89,7 +90,7 @@ int MachineFrameInfo::CreateFixedObject(uint64_t Size, int64_t SPOffset,
// stack needs realignment, we can't assume that the stack will in fact be
// aligned.
Align Alignment =
- commonAlignment(ForcedRealign ? Align::None() : StackAlignment, SPOffset);
+ commonAlignment(ForcedRealign ? Align(1) : StackAlignment, SPOffset);
Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
Objects.insert(Objects.begin(),
StackObject(Size, Alignment, SPOffset, IsImmutable,
@@ -102,7 +103,7 @@ int MachineFrameInfo::CreateFixedSpillStackObject(uint64_t Size,
int64_t SPOffset,
bool IsImmutable) {
Align Alignment =
- commonAlignment(ForcedRealign ? Align::None() : StackAlignment, SPOffset);
+ commonAlignment(ForcedRealign ? Align(1) : StackAlignment, SPOffset);
Alignment = clampStackAlignment(!StackRealignable, Alignment, StackAlignment);
Objects.insert(Objects.begin(),
StackObject(Size, Alignment, SPOffset, IsImmutable,
@@ -136,7 +137,7 @@ BitVector MachineFrameInfo::getPristineRegs(const MachineFunction &MF) const {
uint64_t MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
- unsigned MaxAlign = getMaxAlignment();
+ Align MaxAlign = getMaxAlign();
int64_t Offset = 0;
// This code is very, very similar to PEI::calculateFrameObjectOffsets().
@@ -155,11 +156,11 @@ uint64_t MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
if (isDeadObjectIndex(i) || getStackID(i) != TargetStackID::Default)
continue;
Offset += getObjectSize(i);
- unsigned Align = getObjectAlignment(i);
+ Align Alignment = getObjectAlign(i);
// Adjust to alignment boundary
- Offset = (Offset+Align-1)/Align*Align;
+ Offset = alignTo(Offset, Alignment);
- MaxAlign = std::max(Align, MaxAlign);
+ MaxAlign = std::max(Alignment, MaxAlign);
}
if (adjustsStack() && TFI->hasReservedCallFrame(MF))
@@ -170,20 +171,17 @@ uint64_t MachineFrameInfo::estimateStackSize(const MachineFunction &MF) const {
// ensure that the callee's frame or the alloca data is suitably aligned;
// otherwise, for leaf functions, align to the TransientStackAlignment
// value.
- unsigned StackAlign;
+ Align StackAlign;
if (adjustsStack() || hasVarSizedObjects() ||
(RegInfo->needsStackRealignment(MF) && getObjectIndexEnd() != 0))
- StackAlign = TFI->getStackAlignment();
+ StackAlign = TFI->getStackAlign();
else
- StackAlign = TFI->getTransientStackAlignment();
+ StackAlign = TFI->getTransientStackAlign();
// If the frame pointer is eliminated, all frame offsets will be relative to
// SP not FP. Align to MaxAlign so this works.
StackAlign = std::max(StackAlign, MaxAlign);
- unsigned AlignMask = StackAlign - 1;
- Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
-
- return (uint64_t)Offset;
+ return alignTo(Offset, StackAlign);
}
void MachineFrameInfo::computeMaxCallFrameSize(const MachineFunction &MF) {
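
Several hunks above replace open-coded rounding -- the division form
(Offset + Align - 1) / Align * Align and the mask form
(Offset + AlignMask) & ~AlignMask -- with alignTo(). A standalone check that
the two idioms and a local alignTo reimplementation agree for power-of-two
alignments (no LLVM headers are used here):

#include <cassert>
#include <cstdint>
#include <iostream>

// Round Offset up to the next multiple of Align (Align > 0).
static uint64_t alignTo(uint64_t Offset, uint64_t Align) {
  return (Offset + Align - 1) / Align * Align;
}

int main() {
  for (uint64_t Align : {1u, 4u, 16u}) {
    uint64_t Mask = Align - 1;
    for (uint64_t Offset = 0; Offset < 64; ++Offset) {
      uint64_t ByDiv = (Offset + Align - 1) / Align * Align;
      uint64_t ByMask = (Offset + Mask) & ~Mask; // valid for powers of two
      assert(alignTo(Offset, Align) == ByDiv && ByDiv == ByMask);
    }
  }
  std::cout << "all rounding forms agree\n";
}
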
diff --git a/llvm/lib/CodeGen/MachineFunction.cpp b/llvm/lib/CodeGen/MachineFunction.cpp
index 4612690644fe..6d45f08804ed 100644
--- a/llvm/lib/CodeGen/MachineFunction.cpp
+++ b/llvm/lib/CodeGen/MachineFunction.cpp
@@ -33,6 +33,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/PseudoSourceValue.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -71,6 +72,7 @@
#include <cstdint>
#include <iterator>
#include <string>
+#include <type_traits>
#include <utility>
#include <vector>
@@ -96,6 +98,7 @@ static const char *getPropertyName(MachineFunctionProperties::Property Prop) {
case P::RegBankSelected: return "RegBankSelected";
case P::Selected: return "Selected";
case P::TracksLiveness: return "TracksLiveness";
+ case P::TiedOpsRewritten: return "TiedOpsRewritten";
}
llvm_unreachable("Invalid machine function property");
}
@@ -128,11 +131,10 @@ static inline unsigned getFnStackAlignment(const TargetSubtargetInfo *STI,
const Function &F) {
if (F.hasFnAttribute(Attribute::StackAlignment))
return F.getFnStackAlignment();
- return STI->getFrameLowering()->getStackAlignment();
+ return STI->getFrameLowering()->getStackAlign().value();
}
-MachineFunction::MachineFunction(const Function &F,
- const LLVMTargetMachine &Target,
+MachineFunction::MachineFunction(Function &F, const LLVMTargetMachine &Target,
const TargetSubtargetInfo &STI,
unsigned FunctionNum, MachineModuleInfo &mmi)
: F(F), Target(Target), STI(&STI), Ctx(mmi.getContext()), MMI(mmi) {
@@ -170,7 +172,7 @@ void MachineFunction::init() {
F.hasFnAttribute(Attribute::StackAlignment));
if (F.hasFnAttribute(Attribute::StackAlignment))
- FrameInfo->ensureMaxAlignment(F.getFnStackAlignment());
+ FrameInfo->ensureMaxAlignment(*F.getFnStackAlign());
ConstantPool = new (Allocator) MachineConstantPool(getDataLayout());
Alignment = STI->getTargetLowering()->getMinFunctionAlignment();
@@ -271,18 +273,20 @@ getOrCreateJumpTableInfo(unsigned EntryKind) {
}
DenormalMode MachineFunction::getDenormalMode(const fltSemantics &FPType) const {
+ if (&FPType == &APFloat::IEEEsingle()) {
+ Attribute Attr = F.getFnAttribute("denormal-fp-math-f32");
+ StringRef Val = Attr.getValueAsString();
+ if (!Val.empty())
+ return parseDenormalFPAttribute(Val);
+
+ // If the f32 variant of the attribute isn't specified, try to use the
+ // generic one.
+ }
+
// TODO: Should probably avoid the connection to the IR and store directly
// in the MachineFunction.
Attribute Attr = F.getFnAttribute("denormal-fp-math");
-
- // FIXME: This should assume IEEE behavior on an unspecified
- // attribute. However, the one current user incorrectly assumes a non-IEEE
- // target by default.
- StringRef Val = Attr.getValueAsString();
- if (Val.empty())
- return DenormalMode::Invalid;
-
- return parseDenormalFPAttribute(Val);
+ return parseDenormalFPAttribute(Attr.getValueAsString());
}
/// Should we be emitting segmented stack stuff for the function
@@ -337,6 +341,49 @@ void MachineFunction::RenumberBlocks(MachineBasicBlock *MBB) {
MBBNumbering.resize(BlockNo);
}
+/// This is used with the -fbasic-block-sections or -fbasicblock-labels options.
+/// A unary encoding of basic block labels is done to keep ".strtab" sizes
+/// small.
+void MachineFunction::createBBLabels() {
+ const TargetInstrInfo *TII = getSubtarget().getInstrInfo();
+ this->BBSectionsSymbolPrefix.resize(getNumBlockIDs(), 'a');
+ for (auto MBBI = begin(), E = end(); MBBI != E; ++MBBI) {
+ assert(
+ (MBBI->getNumber() >= 0 && MBBI->getNumber() < (int)getNumBlockIDs()) &&
+ "BasicBlock number was out of range!");
+ // 'a' - Normal block.
+ // 'r' - Return block.
+ // 'l' - Landing Pad.
+ // 'L' - Return and landing pad.
+ bool isEHPad = MBBI->isEHPad();
+ bool isRetBlock = MBBI->isReturnBlock() && !TII->isTailCall(MBBI->back());
+ char type = 'a';
+ if (isEHPad && isRetBlock)
+ type = 'L';
+ else if (isEHPad)
+ type = 'l';
+ else if (isRetBlock)
+ type = 'r';
+ BBSectionsSymbolPrefix[MBBI->getNumber()] = type;
+ }
+}
+
+/// This method iterates over the basic blocks and assigns their IsBeginSection
+/// and IsEndSection fields. This must be called after MBB layout is finalized
+/// and the SectionIDs are assigned to MBBs.
+void MachineFunction::assignBeginEndSections() {
+ front().setIsBeginSection();
+ auto CurrentSectionID = front().getSectionID();
+ for (auto MBBI = std::next(begin()), E = end(); MBBI != E; ++MBBI) {
+ if (MBBI->getSectionID() == CurrentSectionID)
+ continue;
+ MBBI->setIsBeginSection();
+ std::prev(MBBI)->setIsEndSection();
+ CurrentSectionID = MBBI->getSectionID();
+ }
+ back().setIsEndSection();
+}
+
/// Allocate a new MachineInstr. Use this instead of `new MachineInstr'.
MachineInstr *MachineFunction::CreateMachineInstr(const MCInstrDesc &MCID,
const DebugLoc &DL,
@@ -370,6 +417,11 @@ MachineInstr &MachineFunction::CloneMachineInstrBundle(MachineBasicBlock &MBB,
break;
++I;
}
+ // Copy over call site info to the cloned instruction if needed. If Orig is in
+ // a bundle, copyCallSiteInfo takes care of finding the call instruction in
+ // the bundle.
+ if (Orig.shouldUpdateCallSiteInfo())
+ copyCallSiteInfo(&Orig, FirstClone);
return *FirstClone;
}
@@ -383,7 +435,7 @@ MachineFunction::DeleteMachineInstr(MachineInstr *MI) {
// be triggered during the implementation of support for the
// call site info of a new architecture. If the assertion is triggered,
// back trace will tell where to insert a call to updateCallSiteInfo().
- assert((!MI->isCall(MachineInstr::IgnoreBundle) ||
+ assert((!MI->isCandidateForCallSiteEntry() ||
CallSitesInfo.find(MI) == CallSitesInfo.end()) &&
"Call site info was not updated!");
// Strip it for parts. The operand array and the MI object itself are
@@ -414,7 +466,7 @@ MachineFunction::DeleteMachineBasicBlock(MachineBasicBlock *MBB) {
MachineMemOperand *MachineFunction::getMachineMemOperand(
MachinePointerInfo PtrInfo, MachineMemOperand::Flags f, uint64_t s,
- unsigned base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
+ Align base_alignment, const AAMDNodes &AAInfo, const MDNode *Ranges,
SyncScope::ID SSID, AtomicOrdering Ordering,
AtomicOrdering FailureOrdering) {
return new (Allocator)
@@ -429,13 +481,13 @@ MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
// If there is no pointer value, the offset isn't tracked so we need to adjust
// the base alignment.
- unsigned Align = PtrInfo.V.isNull()
- ? MinAlign(MMO->getBaseAlignment(), Offset)
- : MMO->getBaseAlignment();
+ Align Alignment = PtrInfo.V.isNull()
+ ? commonAlignment(MMO->getBaseAlign(), Offset)
+ : MMO->getBaseAlign();
return new (Allocator)
MachineMemOperand(PtrInfo.getWithOffset(Offset), MMO->getFlags(), Size,
- Align, AAMDNodes(), nullptr, MMO->getSyncScopeID(),
+ Alignment, AAMDNodes(), nullptr, MMO->getSyncScopeID(),
MMO->getOrdering(), MMO->getFailureOrdering());
}
@@ -446,18 +498,17 @@ MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
MachinePointerInfo(MMO->getValue(), MMO->getOffset()) :
MachinePointerInfo(MMO->getPseudoValue(), MMO->getOffset());
- return new (Allocator)
- MachineMemOperand(MPI, MMO->getFlags(), MMO->getSize(),
- MMO->getBaseAlignment(), AAInfo,
- MMO->getRanges(), MMO->getSyncScopeID(),
- MMO->getOrdering(), MMO->getFailureOrdering());
+ return new (Allocator) MachineMemOperand(
+ MPI, MMO->getFlags(), MMO->getSize(), MMO->getBaseAlign(), AAInfo,
+ MMO->getRanges(), MMO->getSyncScopeID(), MMO->getOrdering(),
+ MMO->getFailureOrdering());
}
MachineMemOperand *
MachineFunction::getMachineMemOperand(const MachineMemOperand *MMO,
MachineMemOperand::Flags Flags) {
return new (Allocator) MachineMemOperand(
- MMO->getPointerInfo(), Flags, MMO->getSize(), MMO->getBaseAlignment(),
+ MMO->getPointerInfo(), Flags, MMO->getSize(), MMO->getBaseAlign(),
MMO->getAAInfo(), MMO->getRanges(), MMO->getSyncScopeID(),
MMO->getOrdering(), MMO->getFailureOrdering());
}
@@ -608,10 +659,10 @@ void MachineFunction::viewCFGOnly() const
/// Add the specified physical register as a live-in value and
/// create a corresponding virtual register for it.
-unsigned MachineFunction::addLiveIn(unsigned PReg,
+Register MachineFunction::addLiveIn(MCRegister PReg,
const TargetRegisterClass *RC) {
MachineRegisterInfo &MRI = getRegInfo();
- unsigned VReg = MRI.getLiveInVirtReg(PReg);
+ Register VReg = MRI.getLiveInVirtReg(PReg);
if (VReg) {
const TargetRegisterClass *VRegRC = MRI.getRegClass(VReg);
(void)VRegRC;
@@ -853,28 +904,34 @@ try_next:;
MachineFunction::CallSiteInfoMap::iterator
MachineFunction::getCallSiteInfo(const MachineInstr *MI) {
- assert(MI->isCall() && "Call site info refers only to call instructions!");
+ assert(MI->isCandidateForCallSiteEntry() &&
+ "Call site info refers only to call (MI) candidates");
- if (!Target.Options.EnableDebugEntryValues)
+ if (!Target.Options.EmitCallSiteInfo)
return CallSitesInfo.end();
return CallSitesInfo.find(MI);
}
-void MachineFunction::moveCallSiteInfo(const MachineInstr *Old,
- const MachineInstr *New) {
- assert(New->isCall() && "Call site info refers only to call instructions!");
+/// Return the call machine instruction, or find the call within a bundle.
+static const MachineInstr *getCallInstr(const MachineInstr *MI) {
+ if (!MI->isBundle())
+ return MI;
- CallSiteInfoMap::iterator CSIt = getCallSiteInfo(Old);
- if (CSIt == CallSitesInfo.end())
- return;
+ for (auto &BMI : make_range(getBundleStart(MI->getIterator()),
+ getBundleEnd(MI->getIterator())))
+ if (BMI.isCandidateForCallSiteEntry())
+ return &BMI;
- CallSiteInfo CSInfo = std::move(CSIt->second);
- CallSitesInfo.erase(CSIt);
- CallSitesInfo[New] = CSInfo;
+ llvm_unreachable("Unexpected bundle without a call site candidate");
}
void MachineFunction::eraseCallSiteInfo(const MachineInstr *MI) {
- CallSiteInfoMap::iterator CSIt = getCallSiteInfo(MI);
+ assert(MI->shouldUpdateCallSiteInfo() &&
+ "Call site info refers only to call (MI) candidates or "
+ "candidates inside bundles");
+
+ const MachineInstr *CallMI = getCallInstr(MI);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(CallMI);
if (CSIt == CallSitesInfo.end())
return;
CallSitesInfo.erase(CSIt);
@@ -882,9 +939,15 @@ void MachineFunction::eraseCallSiteInfo(const MachineInstr *MI) {
void MachineFunction::copyCallSiteInfo(const MachineInstr *Old,
const MachineInstr *New) {
- assert(New->isCall() && "Call site info refers only to call instructions!");
+ assert(Old->shouldUpdateCallSiteInfo() &&
+ "Call site info refers only to call (MI) candidates or "
+ "candidates inside bundles");
+
+ if (!New->isCandidateForCallSiteEntry())
+ return eraseCallSiteInfo(Old);
- CallSiteInfoMap::iterator CSIt = getCallSiteInfo(Old);
+ const MachineInstr *OldCallMI = getCallInstr(Old);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI);
if (CSIt == CallSitesInfo.end())
return;
@@ -892,6 +955,25 @@ void MachineFunction::copyCallSiteInfo(const MachineInstr *Old,
CallSitesInfo[New] = CSInfo;
}
+void MachineFunction::moveCallSiteInfo(const MachineInstr *Old,
+ const MachineInstr *New) {
+ assert(Old->shouldUpdateCallSiteInfo() &&
+ "Call site info refers only to call (MI) candidates or "
+ "candidates inside bundles");
+
+ if (!New->isCandidateForCallSiteEntry())
+ return eraseCallSiteInfo(Old);
+
+ const MachineInstr *OldCallMI = getCallInstr(Old);
+ CallSiteInfoMap::iterator CSIt = getCallSiteInfo(OldCallMI);
+ if (CSIt == CallSitesInfo.end())
+ return;
+
+ CallSiteInfo CSInfo = std::move(CSIt->second);
+ CallSitesInfo.erase(CSIt);
+ CallSitesInfo[New] = CSInfo;
+}
+
/// \}
//===----------------------------------------------------------------------===//
@@ -1095,8 +1177,7 @@ static bool CanShareConstantPoolEntry(const Constant *A, const Constant *B,
/// Create a new entry in the constant pool or return an existing one.
/// User must specify the log2 of the minimum required alignment for the object.
unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
- unsigned Alignment) {
- assert(Alignment && "Alignment must be specified!");
+ Align Alignment) {
if (Alignment > PoolAlignment) PoolAlignment = Alignment;
// Check to see if we already have this constant.
@@ -1105,7 +1186,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
for (unsigned i = 0, e = Constants.size(); i != e; ++i)
if (!Constants[i].isMachineConstantPoolEntry() &&
CanShareConstantPoolEntry(Constants[i].Val.ConstVal, C, DL)) {
- if ((unsigned)Constants[i].getAlignment() < Alignment)
+ if (Constants[i].getAlign() < Alignment)
Constants[i].Alignment = Alignment;
return i;
}
@@ -1115,8 +1196,7 @@ unsigned MachineConstantPool::getConstantPoolIndex(const Constant *C,
}
unsigned MachineConstantPool::getConstantPoolIndex(MachineConstantPoolValue *V,
- unsigned Alignment) {
- assert(Alignment && "Alignment must be specified!");
+ Align Alignment) {
if (Alignment > PoolAlignment) PoolAlignment = Alignment;
// Check to see if we already have this constant.
@@ -1142,7 +1222,7 @@ void MachineConstantPool::print(raw_ostream &OS) const {
Constants[i].Val.MachineCPVal->print(OS);
else
Constants[i].Val.ConstVal->printAsOperand(OS, /*PrintType=*/false);
- OS << ", align=" << Constants[i].getAlignment();
+ OS << ", align=" << Constants[i].getAlign().value();
OS << "\n";
}
}
diff --git a/llvm/lib/CodeGen/MachineInstr.cpp b/llvm/lib/CodeGen/MachineInstr.cpp
index 08d786f8f12c..d4181591deab 100644
--- a/llvm/lib/CodeGen/MachineInstr.cpp
+++ b/llvm/lib/CodeGen/MachineInstr.cpp
@@ -61,6 +61,7 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/FormattedStream.h"
#include "llvm/Support/LowLevelTypeImpl.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -696,6 +697,26 @@ void MachineInstr::eraseFromBundle() {
getParent()->erase_instr(this);
}
+bool MachineInstr::isCandidateForCallSiteEntry(QueryType Type) const {
+ if (!isCall(Type))
+ return false;
+ switch (getOpcode()) {
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ case TargetOpcode::PATCHPOINT:
+ case TargetOpcode::STACKMAP:
+ case TargetOpcode::STATEPOINT:
+ return false;
+ }
+ return true;
+}
+
+bool MachineInstr::shouldUpdateCallSiteInfo() const {
+ if (isBundle())
+ return isCandidateForCallSiteEntry(MachineInstr::AnyInBundle);
+ return isCandidateForCallSiteEntry();
+}
+
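For context on how these new predicates are meant to be used, here is a minimal sketch; the helper name and the surrounding transform are hypothetical, and it simply mirrors the pattern used in the MachineLICM and CloneMachineInstr hunks of this patch:

#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
using namespace llvm;

// Hypothetical transform that replaces Old with an already-inserted Clone.
// shouldUpdateCallSiteInfo() is true for call-site candidates and for bundles
// that contain one, so it gates every CallSitesInfo update.
static void replaceAndForget(MachineFunction &MF, MachineInstr &Old,
                             MachineInstr &Clone) {
  if (Old.shouldUpdateCallSiteInfo())
    MF.moveCallSiteInfo(&Old, &Clone);
  Old.eraseFromParent();
}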
unsigned MachineInstr::getNumExplicitOperands() const {
unsigned NumOperands = MCID->getNumOperands();
if (!MCID->isVariadic())
@@ -813,11 +834,26 @@ const DILabel *MachineInstr::getDebugLabel() const {
return cast<DILabel>(getOperand(0).getMetadata());
}
+const MachineOperand &MachineInstr::getDebugVariableOp() const {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return getOperand(2);
+}
+
+MachineOperand &MachineInstr::getDebugVariableOp() {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return getOperand(2);
+}
+
const DILocalVariable *MachineInstr::getDebugVariable() const {
assert(isDebugValue() && "not a DBG_VALUE");
return cast<DILocalVariable>(getOperand(2).getMetadata());
}
+MachineOperand &MachineInstr::getDebugExpressionOp() {
+ assert(isDebugValue() && "not a DBG_VALUE");
+ return getOperand(3);
+}
+
const DIExpression *MachineInstr::getDebugExpression() const {
assert(isDebugValue() && "not a DBG_VALUE");
return cast<DIExpression>(getOperand(3).getMetadata());
@@ -1199,6 +1235,10 @@ bool MachineInstr::mayAlias(AAResults *AA, const MachineInstr &Other,
if (!mayStore() && !Other.mayStore())
return false;
+ // Both instructions must be memory operations to be able to alias.
+ if (!mayLoadOrStore() || !Other.mayLoadOrStore())
+ return false;
+
// Let the target decide if memory accesses cannot possibly overlap.
if (TII->areMemAccessesTriviallyDisjoint(*this, Other))
return false;
@@ -1449,6 +1489,37 @@ LLVM_DUMP_METHOD void MachineInstr::dump() const {
dbgs() << " ";
print(dbgs());
}
+
+LLVM_DUMP_METHOD void MachineInstr::dumprImpl(
+ const MachineRegisterInfo &MRI, unsigned Depth, unsigned MaxDepth,
+ SmallPtrSetImpl<const MachineInstr *> &AlreadySeenInstrs) const {
+ if (Depth >= MaxDepth)
+ return;
+ if (!AlreadySeenInstrs.insert(this).second)
+ return;
+ // PadToColumn always inserts at least one space.
+ // Don't mess up the alignment if we don't want any space.
+ if (Depth)
+ fdbgs().PadToColumn(Depth * 2);
+ print(fdbgs());
+ for (const MachineOperand &MO : operands()) {
+ if (!MO.isReg() || MO.isDef())
+ continue;
+ Register Reg = MO.getReg();
+ if (Reg.isPhysical())
+ continue;
+ const MachineInstr *NewMI = MRI.getUniqueVRegDef(Reg);
+ if (NewMI == nullptr)
+ continue;
+ NewMI->dumprImpl(MRI, Depth + 1, MaxDepth, AlreadySeenInstrs);
+ }
+}
+
+LLVM_DUMP_METHOD void MachineInstr::dumpr(const MachineRegisterInfo &MRI,
+ unsigned MaxDepth) const {
+ SmallPtrSet<const MachineInstr *, 16> AlreadySeenInstrs;
+ dumprImpl(MRI, 0, MaxDepth, AlreadySeenInstrs);
+}
#endif
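A brief usage note on the new dumpr helper. This is a fragment rather than a complete program (it assumes some MachineInstr &MI is in scope, and the MaxDepth value is illustrative), and it only works in builds where dump methods are compiled in:

// Given some MachineInstr &MI, e.g. in a debugger or temporary debug code:
const MachineRegisterInfo &MRI = MI.getMF()->getRegInfo();
// Prints MI, then recursively prints the unique virtual-register defs feeding
// its register uses, indenting by depth and skipping already-seen instructions,
// until the depth limit is reached.
MI.dumpr(MRI, /*MaxDepth=*/3);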
void MachineInstr::print(raw_ostream &OS, bool IsStandalone, bool SkipOpers,
@@ -1473,7 +1544,6 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
bool IsStandalone, bool SkipOpers, bool SkipDebugLoc,
bool AddNewLine, const TargetInstrInfo *TII) const {
// We can be a bit tidier if we know the MachineFunction.
- const MachineFunction *MF = nullptr;
const TargetRegisterInfo *TRI = nullptr;
const MachineRegisterInfo *MRI = nullptr;
const TargetIntrinsicInfo *IntrinsicInfo = nullptr;
@@ -1540,6 +1610,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
OS << "exact ";
if (getFlag(MachineInstr::NoFPExcept))
OS << "nofpexcept ";
+ if (getFlag(MachineInstr::NoMerge))
+ OS << "nomerge ";
// Print the opcode name.
if (TII)
@@ -1618,15 +1690,8 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
// Pretty print the inline asm operand descriptor.
OS << '$' << AsmOpCount++;
unsigned Flag = MO.getImm();
- switch (InlineAsm::getKind(Flag)) {
- case InlineAsm::Kind_RegUse: OS << ":[reguse"; break;
- case InlineAsm::Kind_RegDef: OS << ":[regdef"; break;
- case InlineAsm::Kind_RegDefEarlyClobber: OS << ":[regdef-ec"; break;
- case InlineAsm::Kind_Clobber: OS << ":[clobber"; break;
- case InlineAsm::Kind_Imm: OS << ":[imm"; break;
- case InlineAsm::Kind_Mem: OS << ":[mem"; break;
- default: OS << ":[??" << InlineAsm::getKind(Flag); break;
- }
+ OS << ":[";
+ OS << InlineAsm::getKindName(InlineAsm::getKind(Flag));
unsigned RCID = 0;
if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
@@ -1639,29 +1704,7 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
if (InlineAsm::isMemKind(Flag)) {
unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
- switch (MCID) {
- case InlineAsm::Constraint_es: OS << ":es"; break;
- case InlineAsm::Constraint_i: OS << ":i"; break;
- case InlineAsm::Constraint_m: OS << ":m"; break;
- case InlineAsm::Constraint_o: OS << ":o"; break;
- case InlineAsm::Constraint_v: OS << ":v"; break;
- case InlineAsm::Constraint_Q: OS << ":Q"; break;
- case InlineAsm::Constraint_R: OS << ":R"; break;
- case InlineAsm::Constraint_S: OS << ":S"; break;
- case InlineAsm::Constraint_T: OS << ":T"; break;
- case InlineAsm::Constraint_Um: OS << ":Um"; break;
- case InlineAsm::Constraint_Un: OS << ":Un"; break;
- case InlineAsm::Constraint_Uq: OS << ":Uq"; break;
- case InlineAsm::Constraint_Us: OS << ":Us"; break;
- case InlineAsm::Constraint_Ut: OS << ":Ut"; break;
- case InlineAsm::Constraint_Uv: OS << ":Uv"; break;
- case InlineAsm::Constraint_Uy: OS << ":Uy"; break;
- case InlineAsm::Constraint_X: OS << ":X"; break;
- case InlineAsm::Constraint_Z: OS << ":Z"; break;
- case InlineAsm::Constraint_ZC: OS << ":ZC"; break;
- case InlineAsm::Constraint_Zy: OS << ":Zy"; break;
- default: OS << ":?"; break;
- }
+ OS << ":" << InlineAsm::getMemConstraintName(MCID);
}
unsigned TiedTo = 0;
@@ -1758,21 +1801,13 @@ void MachineInstr::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
// Print extra comments for DEBUG_VALUE.
- if (isDebugValue() && getOperand(e - 2).isMetadata()) {
+ if (isDebugValue() && getDebugVariableOp().isMetadata()) {
if (!HaveSemi) {
OS << ";";
HaveSemi = true;
}
- auto *DV = cast<DILocalVariable>(getOperand(e - 2).getMetadata());
+ auto *DV = getDebugVariable();
OS << " line no:" << DV->getLine();
- if (auto *InlinedAt = debugLoc->getInlinedAt()) {
- DebugLoc InlinedAtDL(InlinedAt);
- if (InlinedAtDL && MF) {
- OS << " inlined @[ ";
- InlinedAtDL.print(OS);
- OS << " ]";
- }
- }
if (isIndirectDebugValue())
OS << " indirect";
}
@@ -2077,7 +2112,8 @@ static const DIExpression *computeExprForSpill(const MachineInstr &MI) {
const DIExpression *Expr = MI.getDebugExpression();
if (MI.isIndirectDebugValue()) {
- assert(MI.getOperand(1).getImm() == 0 && "DBG_VALUE with nonzero offset");
+ assert(MI.getDebugOffset().getImm() == 0 &&
+ "DBG_VALUE with nonzero offset");
Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
}
return Expr;
@@ -2097,9 +2133,9 @@ MachineInstr *llvm::buildDbgValueForSpill(MachineBasicBlock &BB,
void llvm::updateDbgValueForSpill(MachineInstr &Orig, int FrameIndex) {
const DIExpression *Expr = computeExprForSpill(Orig);
- Orig.getOperand(0).ChangeToFrameIndex(FrameIndex);
- Orig.getOperand(1).ChangeToImmediate(0U);
- Orig.getOperand(3).setMetadata(Expr);
+ Orig.getDebugOperand(0).ChangeToFrameIndex(FrameIndex);
+ Orig.getDebugOffset().ChangeToImmediate(0U);
+ Orig.getDebugExpressionOp().setMetadata(Expr);
}
void MachineInstr::collectDebugValues(
@@ -2113,8 +2149,7 @@ void MachineInstr::collectDebugValues(
DI != DE; ++DI) {
if (!DI->isDebugValue())
return;
- if (DI->getOperand(0).isReg() &&
- DI->getOperand(0).getReg() == MI.getOperand(0).getReg())
+ if (DI->getDebugOperandForReg(MI.getOperand(0).getReg()))
DbgValues.push_back(&*DI);
}
}
@@ -2126,26 +2161,25 @@ void MachineInstr::changeDebugValuesDefReg(Register Reg) {
if (!getOperand(0).isReg())
return;
- unsigned DefReg = getOperand(0).getReg();
+ Register DefReg = getOperand(0).getReg();
auto *MRI = getRegInfo();
for (auto &MO : MRI->use_operands(DefReg)) {
auto *DI = MO.getParent();
if (!DI->isDebugValue())
continue;
- if (DI->getOperand(0).isReg() &&
- DI->getOperand(0).getReg() == DefReg){
+ if (DI->getDebugOperandForReg(DefReg)) {
DbgValues.push_back(DI);
}
}
// Propagate Reg to debug value instructions.
for (auto *DBI : DbgValues)
- DBI->getOperand(0).setReg(Reg);
+ DBI->getDebugOperandForReg(DefReg)->setReg(Reg);
}
using MMOList = SmallVector<const MachineMemOperand *, 2>;
-static unsigned getSpillSlotSize(MMOList &Accesses,
+static unsigned getSpillSlotSize(const MMOList &Accesses,
const MachineFrameInfo &MFI) {
unsigned Size = 0;
for (auto A : Accesses)
diff --git a/llvm/lib/CodeGen/MachineInstrBundle.cpp b/llvm/lib/CodeGen/MachineInstrBundle.cpp
index 94865b0e9031..50456e489ea1 100644
--- a/llvm/lib/CodeGen/MachineInstrBundle.cpp
+++ b/llvm/lib/CodeGen/MachineInstrBundle.cpp
@@ -136,14 +136,14 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
BuildMI(MF, getDebugLoc(FirstMI, LastMI), TII->get(TargetOpcode::BUNDLE));
Bundle.prepend(MIB);
- SmallVector<unsigned, 32> LocalDefs;
- SmallSet<unsigned, 32> LocalDefSet;
- SmallSet<unsigned, 8> DeadDefSet;
- SmallSet<unsigned, 16> KilledDefSet;
- SmallVector<unsigned, 8> ExternUses;
- SmallSet<unsigned, 8> ExternUseSet;
- SmallSet<unsigned, 8> KilledUseSet;
- SmallSet<unsigned, 8> UndefUseSet;
+ SmallVector<Register, 32> LocalDefs;
+ SmallSet<Register, 32> LocalDefSet;
+ SmallSet<Register, 8> DeadDefSet;
+ SmallSet<Register, 16> KilledDefSet;
+ SmallVector<Register, 8> ExternUses;
+ SmallSet<Register, 8> ExternUseSet;
+ SmallSet<Register, 8> KilledUseSet;
+ SmallSet<Register, 8> UndefUseSet;
SmallVector<MachineOperand*, 4> Defs;
for (auto MII = FirstMI; MII != LastMI; ++MII) {
for (unsigned i = 0, e = MII->getNumOperands(); i != e; ++i) {
@@ -207,9 +207,9 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
Defs.clear();
}
- SmallSet<unsigned, 32> Added;
+ SmallSet<Register, 32> Added;
for (unsigned i = 0, e = LocalDefs.size(); i != e; ++i) {
- unsigned Reg = LocalDefs[i];
+ Register Reg = LocalDefs[i];
if (Added.insert(Reg).second) {
// If it's not live beyond end of the bundle, mark it dead.
bool isDead = DeadDefSet.count(Reg) || KilledDefSet.count(Reg);
@@ -219,7 +219,7 @@ void llvm::finalizeBundle(MachineBasicBlock &MBB,
}
for (unsigned i = 0, e = ExternUses.size(); i != e; ++i) {
- unsigned Reg = ExternUses[i];
+ Register Reg = ExternUses[i];
bool isKill = KilledUseSet.count(Reg);
bool isUndef = UndefUseSet.count(Reg);
MIB.addReg(Reg, getKillRegState(isKill) | getUndefRegState(isUndef) |
@@ -279,7 +279,7 @@ bool llvm::finalizeBundles(MachineFunction &MF) {
}
VirtRegInfo llvm::AnalyzeVirtRegInBundle(
- MachineInstr &MI, unsigned Reg,
+ MachineInstr &MI, Register Reg,
SmallVectorImpl<std::pair<MachineInstr *, unsigned>> *Ops) {
VirtRegInfo RI = {false, false, false};
for (MIBundleOperands O(MI); O.isValid(); ++O) {
@@ -308,13 +308,12 @@ VirtRegInfo llvm::AnalyzeVirtRegInBundle(
return RI;
}
-PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, unsigned Reg,
+PhysRegInfo llvm::AnalyzePhysRegInBundle(const MachineInstr &MI, Register Reg,
const TargetRegisterInfo *TRI) {
bool AllDefsDead = true;
PhysRegInfo PRI = {false, false, false, false, false, false, false, false};
- assert(Register::isPhysicalRegister(Reg) &&
- "analyzePhysReg not given a physical register!");
+ assert(Reg.isPhysical() && "analyzePhysReg not given a physical register!");
for (ConstMIBundleOperands O(MI); O.isValid(); ++O) {
const MachineOperand &MO = *O;
diff --git a/llvm/lib/CodeGen/MachineLICM.cpp b/llvm/lib/CodeGen/MachineLICM.cpp
index 462d4d3b3726..5e8a916b3b3b 100644
--- a/llvm/lib/CodeGen/MachineLICM.cpp
+++ b/llvm/lib/CodeGen/MachineLICM.cpp
@@ -635,6 +635,12 @@ void MachineLICMBase::HoistPostRA(MachineInstr *MI, unsigned Def) {
MachineBasicBlock *MBB = MI->getParent();
Preheader->splice(Preheader->getFirstTerminator(), MBB, MI);
+ // Since we are moving the instruction out of its basic block, we do not
+ // retain its debug location. Doing so would degrade the debugging
+ // experience and adversely affect the accuracy of profiling information.
+ assert(!MI->isDebugInstr() && "Should not hoist debug inst");
+ MI->setDebugLoc(DebugLoc());
+
// Add register to livein list to all the BBs in the current loop since a
// loop invariant must be kept live throughout the whole loop. This is
// important to ensure later passes do not scavenge the def register.
@@ -731,8 +737,7 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
continue;
Scopes.push_back(Node);
- const std::vector<MachineDomTreeNode*> &Children = Node->getChildren();
- unsigned NumChildren = Children.size();
+ unsigned NumChildren = Node->getNumChildren();
// Don't hoist things out of a large switch statement. This often causes
// code to be hoisted that wasn't going to be executed, and increases
@@ -741,13 +746,14 @@ void MachineLICMBase::HoistOutOfLoop(MachineDomTreeNode *HeaderN) {
NumChildren = 0;
OpenChildren[Node] = NumChildren;
- // Add children in reverse order as then the next popped worklist node is
- // the first child of this node. This means we ultimately traverse the
- // DOM tree in exactly the same order as if we'd recursed.
- for (int i = (int)NumChildren-1; i >= 0; --i) {
- MachineDomTreeNode *Child = Children[i];
- ParentMap[Child] = Node;
- WorkList.push_back(Child);
+ if (NumChildren) {
+ // Add children in reverse order as then the next popped worklist node is
+ // the first child of this node. This means we ultimately traverse the
+ // DOM tree in exactly the same order as if we'd recursed.
+ for (MachineDomTreeNode *Child : reverse(Node->children())) {
+ ParentMap[Child] = Node;
+ WorkList.push_back(Child);
+ }
}
}
@@ -829,7 +835,15 @@ void MachineLICMBase::SinkIntoLoop() {
}
if (!CanSink || !B || B == Preheader)
continue;
+
+ LLVM_DEBUG(dbgs() << "Sinking to " << printMBBReference(*B) << " from "
+ << printMBBReference(*I->getParent()) << ": " << *I);
B->splice(B->getFirstNonPHI(), Preheader, I);
+
+  // The instruction is moved out of its basic block, so do not retain the
+ // debug information.
+ assert(!I->isDebugInstr() && "Should not sink debug inst");
+ I->setDebugLoc(DebugLoc());
}
}
@@ -1367,6 +1381,11 @@ MachineInstr *MachineLICMBase::ExtractHoistableLoad(MachineInstr *MI) {
UpdateRegPressure(NewMIs[1]);
// Otherwise we successfully unfolded a load that we can hoist.
+
+ // Update the call site info.
+ if (MI->shouldUpdateCallSiteInfo())
+ MF.eraseCallSiteInfo(MI);
+
MI->eraseFromParent();
return NewMIs[0];
}
@@ -1519,6 +1538,7 @@ bool MachineLICMBase::Hoist(MachineInstr *MI, MachineBasicBlock *Preheader) {
// Since we are moving the instruction out of its basic block, we do not
// retain its debug location. Doing so would degrade the debugging
// experience and adversely affect the accuracy of profiling information.
+ assert(!MI->isDebugInstr() && "Should not hoist debug inst");
MI->setDebugLoc(DebugLoc());
// Update register pressure for BBs from header to this block.
diff --git a/llvm/lib/CodeGen/MachineLoopUtils.cpp b/llvm/lib/CodeGen/MachineLoopUtils.cpp
index cf30e28449cd..2295e1ca6d4e 100644
--- a/llvm/lib/CodeGen/MachineLoopUtils.cpp
+++ b/llvm/lib/CodeGen/MachineLoopUtils.cpp
@@ -42,8 +42,7 @@ MachineBasicBlock *llvm::PeelSingleBlockLoop(LoopPeelDirection Direction,
else
MF.insert(std::next(Loop->getIterator()), NewBB);
- // FIXME: Add DenseMapInfo trait for Register so we can use it as a key.
- DenseMap<unsigned, Register> Remaps;
+ DenseMap<Register, Register> Remaps;
auto InsertPt = NewBB->end();
for (MachineInstr &MI : *Loop) {
MachineInstr *NewMI = MF.CloneMachineInstr(&MI);
diff --git a/llvm/lib/CodeGen/MachineModuleInfo.cpp b/llvm/lib/CodeGen/MachineModuleInfo.cpp
index 0094a923e039..f866c7ca53c6 100644
--- a/llvm/lib/CodeGen/MachineModuleInfo.cpp
+++ b/llvm/lib/CodeGen/MachineModuleInfo.cpp
@@ -76,25 +76,11 @@ class MMIAddrLabelMap {
/// we get notified if a block is deleted or RAUWd.
std::vector<MMIAddrLabelMapCallbackPtr> BBCallbacks;
- /// This is a per-function list of symbols whose corresponding BasicBlock got
- /// deleted. These symbols need to be emitted at some point in the file, so
- /// AsmPrinter emits them after the function body.
- DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>
- DeletedAddrLabelsNeedingEmission;
-
public:
MMIAddrLabelMap(MCContext &context) : Context(context) {}
- ~MMIAddrLabelMap() {
- assert(DeletedAddrLabelsNeedingEmission.empty() &&
- "Some labels for deleted blocks never got emitted");
- }
-
ArrayRef<MCSymbol *> getAddrLabelSymbolToEmit(BasicBlock *BB);
- void takeDeletedSymbolsForFunction(Function *F,
- std::vector<MCSymbol*> &Result);
-
void UpdateForDeletedBlock(BasicBlock *BB);
void UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New);
};
@@ -119,33 +105,10 @@ ArrayRef<MCSymbol *> MMIAddrLabelMap::getAddrLabelSymbolToEmit(BasicBlock *BB) {
Entry.Index = BBCallbacks.size() - 1;
Entry.Fn = BB->getParent();
MCSymbol *Sym = Context.createTempSymbol(!BB->hasAddressTaken());
- if (Context.getObjectFileInfo()->getTargetTriple().isOSBinFormatXCOFF()) {
- MCSymbol *FnEntryPointSym =
- Context.lookupSymbol("." + Entry.Fn->getName());
- assert(FnEntryPointSym && "The function entry pointer symbol should have"
- " already been initialized.");
- MCSectionXCOFF *Csect =
- cast<MCSymbolXCOFF>(FnEntryPointSym)->getContainingCsect();
- cast<MCSymbolXCOFF>(Sym)->setContainingCsect(Csect);
- }
Entry.Symbols.push_back(Sym);
return Entry.Symbols;
}
-/// If we have any deleted symbols for F, return them.
-void MMIAddrLabelMap::
-takeDeletedSymbolsForFunction(Function *F, std::vector<MCSymbol*> &Result) {
- DenseMap<AssertingVH<Function>, std::vector<MCSymbol*>>::iterator I =
- DeletedAddrLabelsNeedingEmission.find(F);
-
- // If there are no entries for the function, just return.
- if (I == DeletedAddrLabelsNeedingEmission.end()) return;
-
- // Otherwise, take the list.
- std::swap(Result, I->second);
- DeletedAddrLabelsNeedingEmission.erase(I);
-}
-
void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
// If the block got deleted, there is no need for the symbol. If the symbol
// was already emitted, we can just forget about it, otherwise we need to
@@ -158,16 +121,8 @@ void MMIAddrLabelMap::UpdateForDeletedBlock(BasicBlock *BB) {
assert((BB->getParent() == nullptr || BB->getParent() == Entry.Fn) &&
"Block/parent mismatch");
- for (MCSymbol *Sym : Entry.Symbols) {
- if (Sym->isDefined())
- return;
-
- // If the block is not yet defined, we need to emit it at the end of the
- // function. Add the symbol to the DeletedAddrLabelsNeedingEmission list
- // for the containing Function. Since the block is being deleted, its
- // parent may already be removed, we have to get the function from 'Entry'.
- DeletedAddrLabelsNeedingEmission[Entry.Fn].push_back(Sym);
- }
+ assert(llvm::all_of(Entry.Symbols, [](MCSymbol *Sym) {
+ return Sym->isDefined(); }));
}
void MMIAddrLabelMap::UpdateForRAUWBlock(BasicBlock *Old, BasicBlock *New) {
@@ -252,15 +207,6 @@ MachineModuleInfo::getAddrLabelSymbolToEmit(const BasicBlock *BB) {
return AddrLabelSymbols->getAddrLabelSymbolToEmit(const_cast<BasicBlock*>(BB));
}
-void MachineModuleInfo::
-takeDeletedSymbolsForFunction(const Function *F,
- std::vector<MCSymbol*> &Result) {
- // If no blocks have had their addresses taken, we're done.
- if (!AddrLabelSymbols) return;
- return AddrLabelSymbols->
- takeDeletedSymbolsForFunction(const_cast<Function*>(F), Result);
-}
-
/// \name Exception Handling
/// \{
@@ -279,8 +225,7 @@ MachineModuleInfo::getMachineFunction(const Function &F) const {
return I != MachineFunctions.end() ? I->second.get() : nullptr;
}
-MachineFunction &
-MachineModuleInfo::getOrCreateMachineFunction(const Function &F) {
+MachineFunction &MachineModuleInfo::getOrCreateMachineFunction(Function &F) {
// Shortcut for the common case where a sequence of MachineFunctionPasses
// all query for the same Function.
if (LastRequest == &F)
diff --git a/llvm/lib/CodeGen/MachineOperand.cpp b/llvm/lib/CodeGen/MachineOperand.cpp
index 7b8f01100929..2b4fd654e46c 100644
--- a/llvm/lib/CodeGen/MachineOperand.cpp
+++ b/llvm/lib/CodeGen/MachineOperand.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/MemoryLocation.h"
@@ -24,6 +25,7 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRPrintingPasses.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/ModuleSlotTracker.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/Target/TargetIntrinsicInfo.h"
@@ -668,7 +670,7 @@ static void printCFI(raw_ostream &OS, const MCCFIInstruction &CFI,
size_t e = CFI.getValues().size() - 1;
for (size_t i = 0; i < e; ++i)
OS << format("0x%02x", uint8_t(CFI.getValues()[i])) << ", ";
- OS << format("0x%02x", uint8_t(CFI.getValues()[e])) << ", ";
+ OS << format("0x%02x", uint8_t(CFI.getValues()[e]));
}
break;
}
@@ -969,8 +971,7 @@ bool MachinePointerInfo::isDereferenceable(unsigned Size, LLVMContext &C,
return false;
return isDereferenceableAndAlignedPointer(
- BasePtr, Align::None(), APInt(DL.getPointerSizeInBits(), Offset + Size),
- DL);
+ BasePtr, Align(1), APInt(DL.getPointerSizeInBits(), Offset + Size), DL);
}
/// getConstantPool - Return a MachinePointerInfo record that refers to the
@@ -1004,17 +1005,16 @@ MachinePointerInfo MachinePointerInfo::getUnknownStack(MachineFunction &MF) {
}
MachineMemOperand::MachineMemOperand(MachinePointerInfo ptrinfo, Flags f,
- uint64_t s, uint64_t a,
+ uint64_t s, Align a,
const AAMDNodes &AAInfo,
const MDNode *Ranges, SyncScope::ID SSID,
AtomicOrdering Ordering,
AtomicOrdering FailureOrdering)
- : PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlignLog2(Log2_32(a) + 1),
- AAInfo(AAInfo), Ranges(Ranges) {
+ : PtrInfo(ptrinfo), Size(s), FlagVals(f), BaseAlign(a), AAInfo(AAInfo),
+ Ranges(Ranges) {
assert((PtrInfo.V.isNull() || PtrInfo.V.is<const PseudoSourceValue *>() ||
isa<PointerType>(PtrInfo.V.get<const Value *>()->getType())) &&
"invalid pointer value");
- assert(getBaseAlignment() == a && a != 0 && "Alignment is not a power of 2!");
assert((isLoad() || isStore()) && "Not a load/store!");
AtomicInfo.SSID = static_cast<unsigned>(SSID);
@@ -1032,7 +1032,7 @@ void MachineMemOperand::Profile(FoldingSetNodeID &ID) const {
ID.AddInteger(Size);
ID.AddPointer(getOpaqueValue());
ID.AddInteger(getFlags());
- ID.AddInteger(getBaseAlignment());
+ ID.AddInteger(getBaseAlign().value());
}
void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
@@ -1041,9 +1041,9 @@ void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
assert(MMO->getFlags() == getFlags() && "Flags mismatch!");
assert(MMO->getSize() == getSize() && "Size mismatch!");
- if (MMO->getBaseAlignment() >= getBaseAlignment()) {
+ if (MMO->getBaseAlign() >= getBaseAlign()) {
// Update the alignment value.
- BaseAlignLog2 = Log2_32(MMO->getBaseAlignment()) + 1;
+ BaseAlign = MMO->getBaseAlign();
// Also update the base and offset, because the new alignment may
// not be applicable with the old ones.
PtrInfo = MMO->PtrInfo;
@@ -1052,8 +1052,12 @@ void MachineMemOperand::refineAlignment(const MachineMemOperand *MMO) {
/// getAlignment - Return the minimum known alignment in bytes of the
/// actual memory reference.
-uint64_t MachineMemOperand::getAlignment() const {
- return MinAlign(getBaseAlignment(), getOffset());
+uint64_t MachineMemOperand::getAlignment() const { return getAlign().value(); }
+
+/// getAlign - Return the minimum known alignment in bytes of the
+/// actual memory reference.
+Align MachineMemOperand::getAlign() const {
+ return commonAlignment(getBaseAlign(), getOffset());
}
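To make the commonAlignment-based computation concrete, here is a small hedged sketch; the base alignments and offsets are chosen purely for illustration:

#include "llvm/Support/Alignment.h"
using namespace llvm;

// Same formula as MachineMemOperand::getAlign(): the largest alignment
// guaranteed by both the base alignment and the offset from that base.
Align exampleKnownAlign(Align BaseAlign, uint64_t Offset) {
  return commonAlignment(BaseAlign, Offset);
}
// exampleKnownAlign(Align(16), 0)  -> Align(16)
// exampleKnownAlign(Align(16), 4)  -> Align(4)
// exampleKnownAlign(Align(16), 24) -> Align(8)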
void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
@@ -1148,8 +1152,8 @@ void MachineMemOperand::print(raw_ostream &OS, ModuleSlotTracker &MST,
}
}
MachineOperand::printOperandOffset(OS, getOffset());
- if (getBaseAlignment() != getSize())
- OS << ", align " << getBaseAlignment();
+ if (getBaseAlign() != getSize())
+ OS << ", align " << getBaseAlign().value();
auto AAInfo = getAAInfo();
if (AAInfo.TBAA) {
OS << ", !tbaa ";
diff --git a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
index d656953f9115..dcb8e4073ea3 100644
--- a/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
+++ b/llvm/lib/CodeGen/MachineOptimizationRemarkEmitter.cpp
@@ -24,7 +24,7 @@ using namespace llvm;
DiagnosticInfoMIROptimization::MachineArgument::MachineArgument(
StringRef MKey, const MachineInstr &MI)
: Argument() {
- Key = MKey;
+ Key = std::string(MKey);
raw_string_ostream OS(Val);
MI.print(OS, /*IsStandalone=*/true, /*SkipOpers=*/false,
diff --git a/llvm/lib/CodeGen/MachineOutliner.cpp b/llvm/lib/CodeGen/MachineOutliner.cpp
index 3a9104bda0d1..f9d099e02995 100644
--- a/llvm/lib/CodeGen/MachineOutliner.cpp
+++ b/llvm/lib/CodeGen/MachineOutliner.cpp
@@ -56,6 +56,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineOutliner.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -69,9 +70,9 @@
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Mangler.h"
#include "llvm/InitializePasses.h"
-#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/SuffixTree.h"
#include "llvm/Support/raw_ostream.h"
#include <functional>
#include <tuple>
@@ -96,514 +97,15 @@ static cl::opt<bool> EnableLinkOnceODROutlining(
cl::desc("Enable the machine outliner on linkonceodr functions"),
cl::init(false));
-namespace {
-
-/// Represents an undefined index in the suffix tree.
-const unsigned EmptyIdx = -1;
-
-/// A node in a suffix tree which represents a substring or suffix.
-///
-/// Each node has either no children or at least two children, with the root
-/// being a exception in the empty tree.
-///
-/// Children are represented as a map between unsigned integers and nodes. If
-/// a node N has a child M on unsigned integer k, then the mapping represented
-/// by N is a proper prefix of the mapping represented by M. Note that this,
-/// although similar to a trie is somewhat different: each node stores a full
-/// substring of the full mapping rather than a single character state.
-///
-/// Each internal node contains a pointer to the internal node representing
-/// the same string, but with the first character chopped off. This is stored
-/// in \p Link. Each leaf node stores the start index of its respective
-/// suffix in \p SuffixIdx.
-struct SuffixTreeNode {
-
- /// The children of this node.
- ///
- /// A child existing on an unsigned integer implies that from the mapping
- /// represented by the current node, there is a way to reach another
- /// mapping by tacking that character on the end of the current string.
- DenseMap<unsigned, SuffixTreeNode *> Children;
-
- /// The start index of this node's substring in the main string.
- unsigned StartIdx = EmptyIdx;
-
- /// The end index of this node's substring in the main string.
- ///
- /// Every leaf node must have its \p EndIdx incremented at the end of every
- /// step in the construction algorithm. To avoid having to update O(N)
- /// nodes individually at the end of every step, the end index is stored
- /// as a pointer.
- unsigned *EndIdx = nullptr;
-
- /// For leaves, the start index of the suffix represented by this node.
- ///
- /// For all other nodes, this is ignored.
- unsigned SuffixIdx = EmptyIdx;
-
- /// For internal nodes, a pointer to the internal node representing
- /// the same sequence with the first character chopped off.
- ///
- /// This acts as a shortcut in Ukkonen's algorithm. One of the things that
- /// Ukkonen's algorithm does to achieve linear-time construction is
- /// keep track of which node the next insert should be at. This makes each
- /// insert O(1), and there are a total of O(N) inserts. The suffix link
- /// helps with inserting children of internal nodes.
- ///
- /// Say we add a child to an internal node with associated mapping S. The
- /// next insertion must be at the node representing S - its first character.
- /// This is given by the way that we iteratively build the tree in Ukkonen's
- /// algorithm. The main idea is to look at the suffixes of each prefix in the
- /// string, starting with the longest suffix of the prefix, and ending with
- /// the shortest. Therefore, if we keep pointers between such nodes, we can
- /// move to the next insertion point in O(1) time. If we don't, then we'd
- /// have to query from the root, which takes O(N) time. This would make the
- /// construction algorithm O(N^2) rather than O(N).
- SuffixTreeNode *Link = nullptr;
-
- /// The length of the string formed by concatenating the edge labels from the
- /// root to this node.
- unsigned ConcatLen = 0;
-
- /// Returns true if this node is a leaf.
- bool isLeaf() const { return SuffixIdx != EmptyIdx; }
-
- /// Returns true if this node is the root of its owning \p SuffixTree.
- bool isRoot() const { return StartIdx == EmptyIdx; }
-
- /// Return the number of elements in the substring associated with this node.
- size_t size() const {
-
- // Is it the root? If so, it's the empty string so return 0.
- if (isRoot())
- return 0;
-
- assert(*EndIdx != EmptyIdx && "EndIdx is undefined!");
-
- // Size = the number of elements in the string.
- // For example, [0 1 2 3] has length 4, not 3. 3-0 = 3, so we have 3-0+1.
- return *EndIdx - StartIdx + 1;
- }
-
- SuffixTreeNode(unsigned StartIdx, unsigned *EndIdx, SuffixTreeNode *Link)
- : StartIdx(StartIdx), EndIdx(EndIdx), Link(Link) {}
-
- SuffixTreeNode() {}
-};
-
-/// A data structure for fast substring queries.
-///
-/// Suffix trees represent the suffixes of their input strings in their leaves.
-/// A suffix tree is a type of compressed trie structure where each node
-/// represents an entire substring rather than a single character. Each leaf
-/// of the tree is a suffix.
-///
-/// A suffix tree can be seen as a type of state machine where each state is a
-/// substring of the full string. The tree is structured so that, for a string
-/// of length N, there are exactly N leaves in the tree. This structure allows
-/// us to quickly find repeated substrings of the input string.
-///
-/// In this implementation, a "string" is a vector of unsigned integers.
-/// These integers may result from hashing some data type. A suffix tree can
-/// contain 1 or many strings, which can then be queried as one large string.
-///
-/// The suffix tree is implemented using Ukkonen's algorithm for linear-time
-/// suffix tree construction. Ukkonen's algorithm is explained in more detail
-/// in the paper by Esko Ukkonen "On-line construction of suffix trees. The
-/// paper is available at
-///
-/// https://www.cs.helsinki.fi/u/ukkonen/SuffixT1withFigs.pdf
-class SuffixTree {
-public:
- /// Each element is an integer representing an instruction in the module.
- ArrayRef<unsigned> Str;
-
- /// A repeated substring in the tree.
- struct RepeatedSubstring {
- /// The length of the string.
- unsigned Length;
-
- /// The start indices of each occurrence.
- std::vector<unsigned> StartIndices;
- };
-
-private:
- /// Maintains each node in the tree.
- SpecificBumpPtrAllocator<SuffixTreeNode> NodeAllocator;
-
- /// The root of the suffix tree.
- ///
- /// The root represents the empty string. It is maintained by the
- /// \p NodeAllocator like every other node in the tree.
- SuffixTreeNode *Root = nullptr;
-
- /// Maintains the end indices of the internal nodes in the tree.
- ///
- /// Each internal node is guaranteed to never have its end index change
- /// during the construction algorithm; however, leaves must be updated at
- /// every step. Therefore, we need to store leaf end indices by reference
- /// to avoid updating O(N) leaves at every step of construction. Thus,
- /// every internal node must be allocated its own end index.
- BumpPtrAllocator InternalEndIdxAllocator;
-
- /// The end index of each leaf in the tree.
- unsigned LeafEndIdx = -1;
-
- /// Helper struct which keeps track of the next insertion point in
- /// Ukkonen's algorithm.
- struct ActiveState {
- /// The next node to insert at.
- SuffixTreeNode *Node = nullptr;
-
- /// The index of the first character in the substring currently being added.
- unsigned Idx = EmptyIdx;
-
- /// The length of the substring we have to add at the current step.
- unsigned Len = 0;
- };
-
- /// The point the next insertion will take place at in the
- /// construction algorithm.
- ActiveState Active;
-
- /// Allocate a leaf node and add it to the tree.
- ///
- /// \param Parent The parent of this node.
- /// \param StartIdx The start index of this node's associated string.
- /// \param Edge The label on the edge leaving \p Parent to this node.
- ///
- /// \returns A pointer to the allocated leaf node.
- SuffixTreeNode *insertLeaf(SuffixTreeNode &Parent, unsigned StartIdx,
- unsigned Edge) {
-
- assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
-
- SuffixTreeNode *N = new (NodeAllocator.Allocate())
- SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
- Parent.Children[Edge] = N;
-
- return N;
- }
-
- /// Allocate an internal node and add it to the tree.
- ///
- /// \param Parent The parent of this node. Only null when allocating the root.
- /// \param StartIdx The start index of this node's associated string.
- /// \param EndIdx The end index of this node's associated string.
- /// \param Edge The label on the edge leaving \p Parent to this node.
- ///
- /// \returns A pointer to the allocated internal node.
- SuffixTreeNode *insertInternalNode(SuffixTreeNode *Parent, unsigned StartIdx,
- unsigned EndIdx, unsigned Edge) {
-
- assert(StartIdx <= EndIdx && "String can't start after it ends!");
- assert(!(!Parent && StartIdx != EmptyIdx) &&
- "Non-root internal nodes must have parents!");
-
- unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
- SuffixTreeNode *N =
- new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
- if (Parent)
- Parent->Children[Edge] = N;
-
- return N;
- }
-
- /// Set the suffix indices of the leaves to the start indices of their
- /// respective suffixes.
- void setSuffixIndices() {
- // List of nodes we need to visit along with the current length of the
- // string.
- std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
-
- // Current node being visited.
- SuffixTreeNode *CurrNode = Root;
-
- // Sum of the lengths of the nodes down the path to the current one.
- unsigned CurrNodeLen = 0;
- ToVisit.push_back({CurrNode, CurrNodeLen});
- while (!ToVisit.empty()) {
- std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
- ToVisit.pop_back();
- CurrNode->ConcatLen = CurrNodeLen;
- for (auto &ChildPair : CurrNode->Children) {
- assert(ChildPair.second && "Node had a null child!");
- ToVisit.push_back(
- {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
- }
-
- // No children, so we are at the end of the string.
- if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
- CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
- }
- }
-
- /// Construct the suffix tree for the prefix of the input ending at
- /// \p EndIdx.
- ///
- /// Used to construct the full suffix tree iteratively. At the end of each
- /// step, the constructed suffix tree is either a valid suffix tree, or a
- /// suffix tree with implicit suffixes. At the end of the final step, the
- /// suffix tree is a valid tree.
- ///
- /// \param EndIdx The end index of the current prefix in the main string.
- /// \param SuffixesToAdd The number of suffixes that must be added
- /// to complete the suffix tree at the current phase.
- ///
- /// \returns The number of suffixes that have not been added at the end of
- /// this step.
- unsigned extend(unsigned EndIdx, unsigned SuffixesToAdd) {
- SuffixTreeNode *NeedsLink = nullptr;
-
- while (SuffixesToAdd > 0) {
-
- // Are we waiting to add anything other than just the last character?
- if (Active.Len == 0) {
- // If not, then say the active index is the end index.
- Active.Idx = EndIdx;
- }
-
- assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
-
- // The first character in the current substring we're looking at.
- unsigned FirstChar = Str[Active.Idx];
-
- // Have we inserted anything starting with FirstChar at the current node?
- if (Active.Node->Children.count(FirstChar) == 0) {
- // If not, then we can just insert a leaf and move too the next step.
- insertLeaf(*Active.Node, EndIdx, FirstChar);
-
- // The active node is an internal node, and we visited it, so it must
- // need a link if it doesn't have one.
- if (NeedsLink) {
- NeedsLink->Link = Active.Node;
- NeedsLink = nullptr;
- }
- } else {
- // There's a match with FirstChar, so look for the point in the tree to
- // insert a new node.
- SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
-
- unsigned SubstringLen = NextNode->size();
-
- // Is the current suffix we're trying to insert longer than the size of
- // the child we want to move to?
- if (Active.Len >= SubstringLen) {
- // If yes, then consume the characters we've seen and move to the next
- // node.
- Active.Idx += SubstringLen;
- Active.Len -= SubstringLen;
- Active.Node = NextNode;
- continue;
- }
-
- // Otherwise, the suffix we're trying to insert must be contained in the
- // next node we want to move to.
- unsigned LastChar = Str[EndIdx];
-
- // Is the string we're trying to insert a substring of the next node?
- if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
- // If yes, then we're done for this step. Remember our insertion point
- // and move to the next end index. At this point, we have an implicit
- // suffix tree.
- if (NeedsLink && !Active.Node->isRoot()) {
- NeedsLink->Link = Active.Node;
- NeedsLink = nullptr;
- }
+/// Number of times to re-run the outliner. This is not the total number of
+/// runs; the outliner always runs at least once. The default of 0 means the
+/// outliner runs once and is not rerun.
+static cl::opt<unsigned> OutlinerReruns(
+ "machine-outliner-reruns", cl::init(0), cl::Hidden,
+ cl::desc(
+ "Number of times to rerun the outliner after the initial outline"));
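To make the rerun numbering concrete, here is a small standalone re-statement of the outlined-function naming that the createOutlinedFunction hunk further down in this patch produces (a sketch, not the actual LLVM code path):

#include <string>

// Mirrors the FunctionName construction in createOutlinedFunction():
//   initial run:       OUTLINED_FUNCTION_<Name>
//   rerun k (k >= 1):  OUTLINED_FUNCTION_<k+1>_<Name>
std::string outlinedName(unsigned OutlineRepeatedNum, unsigned Name) {
  std::string FunctionName = "OUTLINED_FUNCTION_";
  if (OutlineRepeatedNum > 0)
    FunctionName += std::to_string(OutlineRepeatedNum + 1) + "_";
  FunctionName += std::to_string(Name);
  return FunctionName;
}
// outlinedName(0, 3) == "OUTLINED_FUNCTION_3"
// outlinedName(1, 3) == "OUTLINED_FUNCTION_2_3"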
- Active.Len++;
- break;
- }
-
- // The string we're trying to insert isn't a substring of the next node,
- // but matches up to a point. Split the node.
- //
- // For example, say we ended our search at a node n and we're trying to
- // insert ABD. Then we'll create a new node s for AB, reduce n to just
- // representing C, and insert a new leaf node l to represent d. This
- // allows us to ensure that if n was a leaf, it remains a leaf.
- //
- // | ABC ---split---> | AB
- // n s
- // C / \ D
- // n l
-
- // The node s from the diagram
- SuffixTreeNode *SplitNode =
- insertInternalNode(Active.Node, NextNode->StartIdx,
- NextNode->StartIdx + Active.Len - 1, FirstChar);
-
- // Insert the new node representing the new substring into the tree as
- // a child of the split node. This is the node l from the diagram.
- insertLeaf(*SplitNode, EndIdx, LastChar);
-
- // Make the old node a child of the split node and update its start
- // index. This is the node n from the diagram.
- NextNode->StartIdx += Active.Len;
- SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
-
- // SplitNode is an internal node, update the suffix link.
- if (NeedsLink)
- NeedsLink->Link = SplitNode;
-
- NeedsLink = SplitNode;
- }
-
- // We've added something new to the tree, so there's one less suffix to
- // add.
- SuffixesToAdd--;
-
- if (Active.Node->isRoot()) {
- if (Active.Len > 0) {
- Active.Len--;
- Active.Idx = EndIdx - SuffixesToAdd + 1;
- }
- } else {
- // Start the next phase at the next smallest suffix.
- Active.Node = Active.Node->Link;
- }
- }
-
- return SuffixesToAdd;
- }
-
-public:
- /// Construct a suffix tree from a sequence of unsigned integers.
- ///
- /// \param Str The string to construct the suffix tree for.
- SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
- Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
- Active.Node = Root;
-
- // Keep track of the number of suffixes we have to add of the current
- // prefix.
- unsigned SuffixesToAdd = 0;
-
- // Construct the suffix tree iteratively on each prefix of the string.
- // PfxEndIdx is the end index of the current prefix.
- // End is one past the last element in the string.
- for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End;
- PfxEndIdx++) {
- SuffixesToAdd++;
- LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
- SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
- }
-
- // Set the suffix indices of each leaf.
- assert(Root && "Root node can't be nullptr!");
- setSuffixIndices();
- }
-
- /// Iterator for finding all repeated substrings in the suffix tree.
- struct RepeatedSubstringIterator {
- private:
- /// The current node we're visiting.
- SuffixTreeNode *N = nullptr;
-
- /// The repeated substring associated with this node.
- RepeatedSubstring RS;
-
- /// The nodes left to visit.
- std::vector<SuffixTreeNode *> ToVisit;
-
- /// The minimum length of a repeated substring to find.
- /// Since we're outlining, we want at least two instructions in the range.
- /// FIXME: This may not be true for targets like X86 which support many
- /// instruction lengths.
- const unsigned MinLength = 2;
-
- /// Move the iterator to the next repeated substring.
- void advance() {
- // Clear the current state. If we're at the end of the range, then this
- // is the state we want to be in.
- RS = RepeatedSubstring();
- N = nullptr;
-
- // Each leaf node represents a repeat of a string.
- std::vector<SuffixTreeNode *> LeafChildren;
-
- // Continue visiting nodes until we find one which repeats more than once.
- while (!ToVisit.empty()) {
- SuffixTreeNode *Curr = ToVisit.back();
- ToVisit.pop_back();
- LeafChildren.clear();
-
- // Keep track of the length of the string associated with the node. If
- // it's too short, we'll quit.
- unsigned Length = Curr->ConcatLen;
-
- // Iterate over each child, saving internal nodes for visiting, and
- // leaf nodes in LeafChildren. Internal nodes represent individual
- // strings, which may repeat.
- for (auto &ChildPair : Curr->Children) {
- // Save all of this node's children for processing.
- if (!ChildPair.second->isLeaf())
- ToVisit.push_back(ChildPair.second);
-
- // It's not an internal node, so it must be a leaf. If we have a
- // long enough string, then save the leaf children.
- else if (Length >= MinLength)
- LeafChildren.push_back(ChildPair.second);
- }
-
- // The root never represents a repeated substring. If we're looking at
- // that, then skip it.
- if (Curr->isRoot())
- continue;
-
- // Do we have any repeated substrings?
- if (LeafChildren.size() >= 2) {
- // Yes. Update the state to reflect this, and then bail out.
- N = Curr;
- RS.Length = Length;
- for (SuffixTreeNode *Leaf : LeafChildren)
- RS.StartIndices.push_back(Leaf->SuffixIdx);
- break;
- }
- }
-
- // At this point, either NewRS is an empty RepeatedSubstring, or it was
- // set in the above loop. Similarly, N is either nullptr, or the node
- // associated with NewRS.
- }
-
- public:
- /// Return the current repeated substring.
- RepeatedSubstring &operator*() { return RS; }
-
- RepeatedSubstringIterator &operator++() {
- advance();
- return *this;
- }
-
- RepeatedSubstringIterator operator++(int I) {
- RepeatedSubstringIterator It(*this);
- advance();
- return It;
- }
-
- bool operator==(const RepeatedSubstringIterator &Other) {
- return N == Other.N;
- }
- bool operator!=(const RepeatedSubstringIterator &Other) {
- return !(*this == Other);
- }
-
- RepeatedSubstringIterator(SuffixTreeNode *N) : N(N) {
- // Do we have a non-null node?
- if (N) {
- // Yes. At the first step, we need to visit all of N's children.
- // Note: This means that we visit N last.
- ToVisit.push_back(N);
- advance();
- }
- }
- };
-
- typedef RepeatedSubstringIterator iterator;
- iterator begin() { return iterator(Root); }
- iterator end() { return iterator(nullptr); }
-};
+namespace {
/// Maps \p MachineInstrs to unsigned integers and stores the mappings.
struct InstructionMapper {
@@ -841,6 +343,9 @@ struct MachineOutliner : public ModulePass {
/// linkonceodr linkage.
bool OutlineFromLinkOnceODRs = false;
+ /// The current repeat number of machine outlining.
+ unsigned OutlineRepeatedNum = 0;
+
/// Set to true if the outliner should run on all functions in the module
/// considered safe for outlining.
/// Set to true by default for compatibility with llc's -run-pass option.
@@ -899,7 +404,7 @@ struct MachineOutliner : public ModulePass {
InstructionMapper &Mapper,
unsigned Name);
- /// Calls 'doOutline()'.
+ /// Calls 'doOutline()' 1 + OutlinerReruns times.
bool runOnModule(Module &M) override;
/// Construct a suffix tree on the instructions in \p M and outline repeated
@@ -1098,7 +603,10 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
// Create the function name. This should be unique.
// FIXME: We should have a better naming scheme. This should be stable,
// regardless of changes to the outliner's cost model/traversal order.
- std::string FunctionName = ("OUTLINED_FUNCTION_" + Twine(Name)).str();
+ std::string FunctionName = "OUTLINED_FUNCTION_";
+ if (OutlineRepeatedNum > 0)
+ FunctionName += std::to_string(OutlineRepeatedNum + 1) + "_";
+ FunctionName += std::to_string(Name);
// Create the function using an IR-level function.
LLVMContext &C = M.getContext();
@@ -1110,9 +618,6 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
F->setLinkage(GlobalValue::InternalLinkage);
F->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- // FIXME: Set nounwind, so we don't generate eh_frame? Haven't verified it's
- // necessary.
-
// Set optsize/minsize, so we don't insert padding between outlined
// functions.
F->addFnAttr(Attribute::OptimizeForSize);
@@ -1127,6 +632,12 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
if (ParentFn.hasFnAttribute("target-features"))
F->addFnAttr(ParentFn.getFnAttribute("target-features"));
+ // Set nounwind, so we don't generate eh_frame.
+ if (llvm::all_of(OF.Candidates, [](const outliner::Candidate &C) {
+ return C.getMF()->getFunction().hasFnAttribute(Attribute::NoUnwind);
+ }))
+ F->addFnAttr(Attribute::NoUnwind);
+
BasicBlock *EntryBB = BasicBlock::Create(C, "entry", F);
IRBuilder<> Builder(EntryBB);
Builder.CreateRetVoid();
@@ -1140,9 +651,17 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
// Insert the new function into the module.
MF.insert(MF.begin(), &MBB);
+ MachineFunction *OriginalMF = FirstCand.front()->getMF();
+ const std::vector<MCCFIInstruction> &Instrs =
+ OriginalMF->getFrameInstructions();
for (auto I = FirstCand.front(), E = std::next(FirstCand.back()); I != E;
++I) {
MachineInstr *NewMI = MF.CloneMachineInstr(&*I);
+ if (I->isCFIInstruction()) {
+ unsigned CFIIndex = NewMI->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = Instrs[CFIIndex];
+ (void)MF.addFrameInst(CFI);
+ }
NewMI->dropMemRefs(MF);
// Don't keep debug information for outlined instructions.
@@ -1150,12 +669,35 @@ MachineFunction *MachineOutliner::createOutlinedFunction(
MBB.insert(MBB.end(), NewMI);
}
- TII.buildOutlinedFrame(MBB, MF, OF);
-
- // Outlined functions shouldn't preserve liveness.
- MF.getProperties().reset(MachineFunctionProperties::Property::TracksLiveness);
+ // Set normal properties for a late MachineFunction.
+ MF.getProperties().reset(MachineFunctionProperties::Property::IsSSA);
+ MF.getProperties().set(MachineFunctionProperties::Property::NoPHIs);
+ MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
+ MF.getProperties().set(MachineFunctionProperties::Property::TracksLiveness);
MF.getRegInfo().freezeReservedRegs(MF);
+ // Compute the live-in set for the outlined function.
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
+ LivePhysRegs LiveIns(TRI);
+ for (auto &Cand : OF.Candidates) {
+ // Figure out live-ins at the first instruction.
+ MachineBasicBlock &OutlineBB = *Cand.front()->getParent();
+ LivePhysRegs CandLiveIns(TRI);
+ CandLiveIns.addLiveOuts(OutlineBB);
+ for (const MachineInstr &MI :
+ reverse(make_range(Cand.front(), OutlineBB.end())))
+ CandLiveIns.stepBackward(MI);
+
+ // The live-in set for the outlined function is the union of the live-ins
+ // from all the outlining points.
+ for (MCPhysReg Reg : make_range(CandLiveIns.begin(), CandLiveIns.end()))
+ LiveIns.addReg(Reg);
+ }
+ addLiveIns(MBB, LiveIns);
+
+ TII.buildOutlinedFrame(MBB, MF, OF);
+
// If there's a DISubprogram associated with this outlined function, then
// emit debug info for the outlined function.
if (DISubprogram *SP = getSubprogramOrNull(OF)) {
@@ -1245,31 +787,54 @@ bool MachineOutliner::outline(Module &M,
// make sure that the ranges we yank things out of aren't wrong.
if (MBB.getParent()->getProperties().hasProperty(
MachineFunctionProperties::Property::TracksLiveness)) {
- // Helper lambda for adding implicit def operands to the call
+ // The following code adds implicit def operands to the call
// instruction. It also updates call site information for moved
// code.
- auto CopyDefsAndUpdateCalls = [&CallInst](MachineInstr &MI) {
- for (MachineOperand &MOP : MI.operands()) {
- // Skip over anything that isn't a register.
- if (!MOP.isReg())
- continue;
-
- // If it's a def, add it to the call instruction.
- if (MOP.isDef())
- CallInst->addOperand(MachineOperand::CreateReg(
- MOP.getReg(), true, /* isDef = true */
- true /* isImp = true */));
- }
- if (MI.isCall())
- MI.getMF()->eraseCallSiteInfo(&MI);
- };
+ SmallSet<Register, 2> UseRegs, DefRegs;
// Copy over the defs in the outlined range.
// First inst in outlined range <-- Anything that's defined in this
// ... .. range has to be added as an
// implicit Last inst in outlined range <-- def to the call
// instruction. Also remove call site information for outlined block
- // of code.
- std::for_each(CallInst, std::next(EndIt), CopyDefsAndUpdateCalls);
+ // of code. Exposed uses of the outlined range are collected as well.
+ for (MachineBasicBlock::reverse_iterator
+ Iter = EndIt.getReverse(),
+ Last = std::next(CallInst.getReverse());
+ Iter != Last; Iter++) {
+ MachineInstr *MI = &*Iter;
+ for (MachineOperand &MOP : MI->operands()) {
+ // Skip over anything that isn't a register.
+ if (!MOP.isReg())
+ continue;
+
+ if (MOP.isDef()) {
+ // Record the def in DefRegs; using a set skips redundant registers.
+ DefRegs.insert(MOP.getReg());
+ if (UseRegs.count(MOP.getReg()))
+ // Since the register is already modeled as defined,
+ // it no longer needs to be in the use register set.
+ UseRegs.erase(MOP.getReg());
+ } else if (!MOP.isUndef()) {
+ // Any use of a register that is not undef is
+ // added to the use register set.
+ UseRegs.insert(MOP.getReg());
+ }
+ }
+ if (MI->isCandidateForCallSiteEntry())
+ MI->getMF()->eraseCallSiteInfo(MI);
+ }
+
+ for (const Register &I : DefRegs)
+ // If it's a def, add it to the call instruction.
+ CallInst->addOperand(
+ MachineOperand::CreateReg(I, true, /* isDef = true */
+ true /* isImp = true */));
+
+ for (const Register &I : UseRegs)
+ // If it's an exposed use, add it to the call instruction.
+ CallInst->addOperand(
+ MachineOperand::CreateReg(I, false, /* isDef = false */
+ true /* isImp = true */));
}
// Erase from the point after where the call was inserted up to, and
@@ -1289,7 +854,6 @@ bool MachineOutliner::outline(Module &M,
}
LLVM_DEBUG(dbgs() << "OutlinedSomething = " << OutlinedSomething << "\n";);
-
return OutlinedSomething;
}
@@ -1377,7 +941,7 @@ void MachineOutliner::emitInstrCountChangedRemark(
if (!MF)
continue;
- std::string Fname = F.getName();
+ std::string Fname = std::string(F.getName());
unsigned FnCountAfter = MF->getInstructionCount();
unsigned FnCountBefore = 0;
@@ -1424,8 +988,22 @@ bool MachineOutliner::runOnModule(Module &M) {
// Number to append to the current outlined function.
unsigned OutlinedFunctionNum = 0;
+ OutlineRepeatedNum = 0;
if (!doOutline(M, OutlinedFunctionNum))
return false;
+
+ for (unsigned I = 0; I < OutlinerReruns; ++I) {
+ OutlinedFunctionNum = 0;
+ OutlineRepeatedNum++;
+ if (!doOutline(M, OutlinedFunctionNum)) {
+ LLVM_DEBUG({
+ dbgs() << "Did not outline on iteration " << I + 2 << " out of "
+ << OutlinerReruns + 1 << "\n";
+ });
+ break;
+ }
+ }
+
return true;
}
@@ -1482,5 +1060,11 @@ bool MachineOutliner::doOutline(Module &M, unsigned &OutlinedFunctionNum) {
if (ShouldEmitSizeRemarks && OutlinedSomething)
emitInstrCountChangedRemark(M, MMI, FunctionToInstrCount);
+ LLVM_DEBUG({
+ if (!OutlinedSomething)
+ dbgs() << "Stopped outlining at iteration " << OutlineRepeatedNum
+ << " because no changes were found.\n";
+ });
+
return OutlinedSomething;
}
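The naming change above gives each outlining rerun its own namespace of outlined functions. A rough standalone sketch of the scheme, assuming OutlineRepeatedNum counts completed outlining passes as in the hunk above (illustrative only, not part of the patch):

#include <string>

// Mirrors the naming logic in createOutlinedFunction().
std::string outlinedFunctionName(unsigned OutlineRepeatedNum, unsigned Name) {
  std::string FunctionName = "OUTLINED_FUNCTION_";
  if (OutlineRepeatedNum > 0)
    FunctionName += std::to_string(OutlineRepeatedNum + 1) + "_"; // first rerun -> "2_"
  return FunctionName + std::to_string(Name);
}

// The first pass produces OUTLINED_FUNCTION_0, OUTLINED_FUNCTION_1, ...;
// the first rerun produces OUTLINED_FUNCTION_2_0, OUTLINED_FUNCTION_2_1, ...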
diff --git a/llvm/lib/CodeGen/MachinePipeliner.cpp b/llvm/lib/CodeGen/MachinePipeliner.cpp
index ef22caa877c9..ef4b02ca9e3e 100644
--- a/llvm/lib/CodeGen/MachinePipeliner.cpp
+++ b/llvm/lib/CodeGen/MachinePipeliner.cpp
@@ -217,6 +217,7 @@ bool MachinePipeliner::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
MLI = &getAnalysis<MachineLoopInfo>();
MDT = &getAnalysis<MachineDominatorTree>();
+ ORE = &getAnalysis<MachineOptimizationRemarkEmitterPass>().getORE();
TII = MF->getSubtarget().getInstrInfo();
RegClassInfo.runOnMachineFunction(*MF);
@@ -248,6 +249,12 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
setPragmaPipelineOptions(L);
if (!canPipelineLoop(L)) {
LLVM_DEBUG(dbgs() << "\n!!! Can not pipeline loop.\n");
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkMissed(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Failed to pipeline loop";
+ });
+
return Changed;
}
@@ -259,6 +266,9 @@ bool MachinePipeliner::scheduleLoop(MachineLoop &L) {
}
void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
+ // Reset the pragma state before processing the next loop.
+ disabledByPragma = false;
+
MachineBasicBlock *LBLK = L.getTopBlock();
if (LBLK == nullptr)
@@ -306,11 +316,24 @@ void MachinePipeliner::setPragmaPipelineOptions(MachineLoop &L) {
/// restricted to loops with a single basic block. Make sure that the
/// branch in the loop can be analyzed.
bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
- if (L.getNumBlocks() != 1)
+ if (L.getNumBlocks() != 1) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Not a single basic block: "
+ << ore::NV("NumBlocks", L.getNumBlocks());
+ });
return false;
+ }
- if (disabledByPragma)
+ if (disabledByPragma) {
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "Disabled by Pragma.";
+ });
return false;
+ }
// Check if the branch can't be understood because we can't do pipelining
// if that's the case.
@@ -318,25 +341,37 @@ bool MachinePipeliner::canPipelineLoop(MachineLoop &L) {
LI.FBB = nullptr;
LI.BrCond.clear();
if (TII->analyzeBranch(*L.getHeader(), LI.TBB, LI.FBB, LI.BrCond)) {
- LLVM_DEBUG(
- dbgs() << "Unable to analyzeBranch, can NOT pipeline current Loop\n");
+ LLVM_DEBUG(dbgs() << "Unable to analyzeBranch, can NOT pipeline Loop\n");
NumFailBranch++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "The branch can't be understood";
+ });
return false;
}
LI.LoopInductionVar = nullptr;
LI.LoopCompare = nullptr;
if (!TII->analyzeLoopForPipelining(L.getTopBlock())) {
- LLVM_DEBUG(
- dbgs() << "Unable to analyzeLoop, can NOT pipeline current Loop\n");
+ LLVM_DEBUG(dbgs() << "Unable to analyzeLoop, can NOT pipeline Loop\n");
NumFailLoop++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "The loop structure is not supported";
+ });
return false;
}
if (!L.getLoopPreheader()) {
- LLVM_DEBUG(
- dbgs() << "Preheader not found, can NOT pipeline current Loop\n");
+ LLVM_DEBUG(dbgs() << "Preheader not found, can NOT pipeline Loop\n");
NumFailPreheader++;
+ ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "canPipelineLoop",
+ L.getStartLoc(), L.getHeader())
+ << "No loop preheader found";
+ });
return false;
}
@@ -454,10 +489,13 @@ void SwingSchedulerDAG::schedule() {
// Can't schedule a loop without a valid MII.
if (MII == 0) {
- LLVM_DEBUG(
- dbgs()
- << "0 is not a valid Minimal Initiation Interval, can NOT schedule\n");
+ LLVM_DEBUG(dbgs() << "Invalid Minimal Initiation Interval: 0\n");
NumFailZeroMII++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Invalid Minimal Initiation Interval: 0";
+ });
return;
}
@@ -466,6 +504,14 @@ void SwingSchedulerDAG::schedule() {
LLVM_DEBUG(dbgs() << "MII > " << SwpMaxMii
<< ", we don't pipleline large loops\n");
NumFailLargeMaxMII++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Minimal Initiation Interval too large: "
+ << ore::NV("MII", (int)MII) << " > "
+ << ore::NV("SwpMaxMii", SwpMaxMii)
+ << ". Refer to -pipeliner-max-mii.";
+ });
return;
}
@@ -508,15 +554,24 @@ void SwingSchedulerDAG::schedule() {
if (!Scheduled){
LLVM_DEBUG(dbgs() << "No schedule found, return\n");
NumFailNoSchedule++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Unable to find schedule";
+ });
return;
}
unsigned numStages = Schedule.getMaxStageCount();
// No need to generate pipeline if there are no overlapped iterations.
if (numStages == 0) {
- LLVM_DEBUG(
- dbgs() << "No overlapped iterations, no need to generate pipeline\n");
+ LLVM_DEBUG(dbgs() << "No overlapped iterations, skip.\n");
NumFailZeroStage++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "No need to pipeline - no overlapped iterations in schedule.";
+ });
return;
}
// Check that the maximum stage count is less than user-defined limit.
@@ -524,9 +579,23 @@ void SwingSchedulerDAG::schedule() {
LLVM_DEBUG(dbgs() << "numStages:" << numStages << ">" << SwpMaxStages
<< " : too many stages, abort\n");
NumFailLargeMaxStage++;
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Too many stages in schedule: "
+ << ore::NV("numStages", (int)numStages) << " > "
+ << ore::NV("SwpMaxStages", SwpMaxStages)
+ << ". Refer to -pipeliner-max-stages.";
+ });
return;
}
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemark(DEBUG_TYPE, "schedule", Loop.getStartLoc(),
+ Loop.getHeader())
+ << "Pipelined successfully!";
+ });
+
// Generate the schedule as a ModuloSchedule.
DenseMap<MachineInstr *, int> Cycles, Stages;
std::vector<MachineInstr *> OrderedInsts;
@@ -693,9 +762,13 @@ void SwingSchedulerDAG::addLoopCarriedDependences(AliasAnalysis *AA) {
// offset, then mark the dependence as loop carried potentially.
const MachineOperand *BaseOp1, *BaseOp2;
int64_t Offset1, Offset2;
- if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1, TRI) &&
- TII->getMemOperandWithOffset(MI, BaseOp2, Offset2, TRI)) {
+ bool Offset1IsScalable, Offset2IsScalable;
+ if (TII->getMemOperandWithOffset(LdMI, BaseOp1, Offset1,
+ Offset1IsScalable, TRI) &&
+ TII->getMemOperandWithOffset(MI, BaseOp2, Offset2,
+ Offset2IsScalable, TRI)) {
if (BaseOp1->isIdenticalTo(*BaseOp2) &&
+ Offset1IsScalable == Offset2IsScalable &&
(int)Offset1 < (int)Offset2) {
assert(TII->areMemAccessesTriviallyDisjoint(LdMI, MI) &&
"What happened to the chain edge?");
@@ -802,7 +875,7 @@ void SwingSchedulerDAG::updatePhiDependences() {
if (!MI->isPHI()) {
SDep Dep(SU, SDep::Data, Reg);
Dep.setLatency(0);
- ST.adjustSchedDependency(SU, &I, Dep);
+ ST.adjustSchedDependency(SU, 0, &I, MI->getOperandNo(MOI), Dep);
I.addPred(Dep);
} else {
HasPhiUse = Reg;
@@ -905,7 +978,7 @@ namespace {
struct FuncUnitSorter {
const InstrItineraryData *InstrItins;
const MCSubtargetInfo *STI;
- DenseMap<unsigned, unsigned> Resources;
+ DenseMap<InstrStage::FuncUnits, unsigned> Resources;
FuncUnitSorter(const TargetSubtargetInfo &TSI)
: InstrItins(TSI.getInstrItineraryData()), STI(&TSI) {}
@@ -913,14 +986,15 @@ struct FuncUnitSorter {
// Compute the number of functional unit alternatives needed
// at each stage, and take the minimum value. We prioritize the
// instructions by the least number of choices first.
- unsigned minFuncUnits(const MachineInstr *Inst, unsigned &F) const {
+ unsigned minFuncUnits(const MachineInstr *Inst,
+ InstrStage::FuncUnits &F) const {
unsigned SchedClass = Inst->getDesc().getSchedClass();
unsigned min = UINT_MAX;
if (InstrItins && !InstrItins->isEmpty()) {
for (const InstrStage &IS :
make_range(InstrItins->beginStage(SchedClass),
InstrItins->endStage(SchedClass))) {
- unsigned funcUnits = IS.getUnits();
+ InstrStage::FuncUnits funcUnits = IS.getUnits();
unsigned numAlternatives = countPopulation(funcUnits);
if (numAlternatives < min) {
min = numAlternatives;
@@ -966,7 +1040,7 @@ struct FuncUnitSorter {
for (const InstrStage &IS :
make_range(InstrItins->beginStage(SchedClass),
InstrItins->endStage(SchedClass))) {
- unsigned FuncUnits = IS.getUnits();
+ InstrStage::FuncUnits FuncUnits = IS.getUnits();
if (countPopulation(FuncUnits) == 1)
Resources[FuncUnits]++;
}
@@ -994,7 +1068,7 @@ struct FuncUnitSorter {
/// Return true if IS1 has less priority than IS2.
bool operator()(const MachineInstr *IS1, const MachineInstr *IS2) const {
- unsigned F1 = 0, F2 = 0;
+ InstrStage::FuncUnits F1 = 0, F2 = 0;
unsigned MFUs1 = minFuncUnits(IS1, F1);
unsigned MFUs2 = minFuncUnits(IS2, F2);
if (MFUs1 == MFUs2)
@@ -1072,7 +1146,7 @@ unsigned SwingSchedulerDAG::calculateResMII() {
}
}
int Resmii = Resources.size();
- LLVM_DEBUG(dbgs() << "Retrun Res MII:" << Resmii << "\n");
+ LLVM_DEBUG(dbgs() << "Return Res MII:" << Resmii << "\n");
// Delete the memory for each of the DFAs that were created earlier.
for (ResourceManager *RI : Resources) {
ResourceManager *D = RI;
@@ -2044,9 +2118,16 @@ bool SwingSchedulerDAG::schedulePipeline(SMSchedule &Schedule) {
LLVM_DEBUG(dbgs() << "Schedule Found? " << scheduleFound << " (II=" << II
<< ")\n");
- if (scheduleFound)
+ if (scheduleFound) {
Schedule.finalizeSchedule(this);
- else
+ Pass.ORE->emit([&]() {
+ return MachineOptimizationRemarkAnalysis(
+ DEBUG_TYPE, "schedule", Loop.getStartLoc(), Loop.getHeader())
+ << "Schedule found with Initiation Interval: " << ore::NV("II", II)
+ << ", MaxStageCount: "
+ << ore::NV("MaxStageCount", Schedule.getMaxStageCount());
+ });
+ } else
Schedule.reset();
return scheduleFound && Schedule.getMaxStageCount() > 0;
@@ -2058,7 +2139,12 @@ bool SwingSchedulerDAG::computeDelta(MachineInstr &MI, unsigned &Delta) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineOperand *BaseOp;
int64_t Offset;
- if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+ bool OffsetIsScalable;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
+ return false;
+
+ // FIXME: This algorithm assumes instructions have fixed-size offsets.
+ if (OffsetIsScalable)
return false;
if (!BaseOp->isReg())
@@ -2236,11 +2322,17 @@ bool SwingSchedulerDAG::isLoopCarriedDep(SUnit *Source, const SDep &Dep,
const MachineOperand *BaseOpS, *BaseOpD;
int64_t OffsetS, OffsetD;
+ bool OffsetSIsScalable, OffsetDIsScalable;
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
- if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, TRI) ||
- !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, TRI))
+ if (!TII->getMemOperandWithOffset(*SI, BaseOpS, OffsetS, OffsetSIsScalable,
+ TRI) ||
+ !TII->getMemOperandWithOffset(*DI, BaseOpD, OffsetD, OffsetDIsScalable,
+ TRI))
return true;
+ assert(!OffsetSIsScalable && !OffsetDIsScalable &&
+ "Expected offsets to be byte offsets");
+
if (!BaseOpS->isIdenticalTo(*BaseOpD))
return true;
@@ -2352,7 +2444,7 @@ int SMSchedule::earliestCycleInChain(const SDep &Dep) {
continue;
EarlyCycle = std::min(EarlyCycle, it->second);
for (const auto &PI : PrevSU->Preds)
- if (PI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
+ if (PI.getKind() == SDep::Order || PI.getKind() == SDep::Output)
Worklist.push_back(PI);
Visited.insert(PrevSU);
}
@@ -2375,7 +2467,7 @@ int SMSchedule::latestCycleInChain(const SDep &Dep) {
continue;
LateCycle = std::max(LateCycle, it->second);
for (const auto &SI : SuccSU->Succs)
- if (SI.getKind() == SDep::Order || Dep.getKind() == SDep::Output)
+ if (SI.getKind() == SDep::Order || SI.getKind() == SDep::Output)
Worklist.push_back(SI);
Visited.insert(SuccSU);
}
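All of the new pipeliner diagnostics above go through the remark emitter's emit() with a lambda, so the remark object is only constructed when remark output is actually requested. A minimal sketch of that pattern, with the pass context and message text assumed rather than taken from this patch:

// Assumes a pass that already holds ORE, a Loop, and a DEBUG_TYPE
// ("pipeliner" / "schedule" in the hunks above).
ORE->emit([&]() {
  return MachineOptimizationRemarkAnalysis(DEBUG_TYPE, "schedule",
                                           Loop.getStartLoc(), Loop.getHeader())
         << "Example message with a value: " << ore::NV("Key", 42);
});

These remarks should then be visible through llc's -pass-remarks-analysis / -pass-remarks-missed filters matching the pass name; the exact flag spelling is an assumption here, not something added by this patch.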
diff --git a/llvm/lib/CodeGen/MachineRegisterInfo.cpp b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
index b88d4ea462ef..4c733738840a 100644
--- a/llvm/lib/CodeGen/MachineRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/MachineRegisterInfo.cpp
@@ -55,18 +55,18 @@ MachineRegisterInfo::MachineRegisterInfo(MachineFunction *MF)
/// setRegClass - Set the register class of the specified virtual register.
///
void
-MachineRegisterInfo::setRegClass(unsigned Reg, const TargetRegisterClass *RC) {
+MachineRegisterInfo::setRegClass(Register Reg, const TargetRegisterClass *RC) {
assert(RC && RC->isAllocatable() && "Invalid RC for virtual register");
VRegInfo[Reg].first = RC;
}
-void MachineRegisterInfo::setRegBank(unsigned Reg,
+void MachineRegisterInfo::setRegBank(Register Reg,
const RegisterBank &RegBank) {
VRegInfo[Reg].first = &RegBank;
}
static const TargetRegisterClass *
-constrainRegClass(MachineRegisterInfo &MRI, unsigned Reg,
+constrainRegClass(MachineRegisterInfo &MRI, Register Reg,
const TargetRegisterClass *OldRC,
const TargetRegisterClass *RC, unsigned MinNumRegs) {
if (OldRC == RC)
@@ -82,15 +82,15 @@ constrainRegClass(MachineRegisterInfo &MRI, unsigned Reg,
}
const TargetRegisterClass *
-MachineRegisterInfo::constrainRegClass(unsigned Reg,
+MachineRegisterInfo::constrainRegClass(Register Reg,
const TargetRegisterClass *RC,
unsigned MinNumRegs) {
return ::constrainRegClass(*this, Reg, getRegClass(Reg), RC, MinNumRegs);
}
bool
-MachineRegisterInfo::constrainRegAttrs(unsigned Reg,
- unsigned ConstrainingReg,
+MachineRegisterInfo::constrainRegAttrs(Register Reg,
+ Register ConstrainingReg,
unsigned MinNumRegs) {
const LLT RegTy = getType(Reg);
const LLT ConstrainingRegTy = getType(ConstrainingReg);
@@ -119,7 +119,7 @@ MachineRegisterInfo::constrainRegAttrs(unsigned Reg,
}
bool
-MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
+MachineRegisterInfo::recomputeRegClass(Register Reg) {
const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
const TargetRegisterClass *OldRC = getRegClass(Reg);
const TargetRegisterClass *NewRC =
@@ -143,8 +143,8 @@ MachineRegisterInfo::recomputeRegClass(unsigned Reg) {
return true;
}
-unsigned MachineRegisterInfo::createIncompleteVirtualRegister(StringRef Name) {
- unsigned Reg = Register::index2VirtReg(getNumVirtRegs());
+Register MachineRegisterInfo::createIncompleteVirtualRegister(StringRef Name) {
+ Register Reg = Register::index2VirtReg(getNumVirtRegs());
VRegInfo.grow(Reg);
RegAllocHints.grow(Reg);
insertVRegByName(Name, Reg);
@@ -162,7 +162,7 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
"Virtual register RegClass must be allocatable.");
// New virtual register number.
- unsigned Reg = createIncompleteVirtualRegister(Name);
+ Register Reg = createIncompleteVirtualRegister(Name);
VRegInfo[Reg].first = RegClass;
if (TheDelegate)
TheDelegate->MRI_NoteNewVirtualRegister(Reg);
@@ -171,7 +171,7 @@ MachineRegisterInfo::createVirtualRegister(const TargetRegisterClass *RegClass,
Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
StringRef Name) {
- unsigned Reg = createIncompleteVirtualRegister(Name);
+ Register Reg = createIncompleteVirtualRegister(Name);
VRegInfo[Reg].first = VRegInfo[VReg].first;
setType(Reg, getType(VReg));
if (TheDelegate)
@@ -179,7 +179,7 @@ Register MachineRegisterInfo::cloneVirtualRegister(Register VReg,
return Reg;
}
-void MachineRegisterInfo::setType(unsigned VReg, LLT Ty) {
+void MachineRegisterInfo::setType(Register VReg, LLT Ty) {
VRegToType.grow(VReg);
VRegToType[VReg] = Ty;
}
@@ -187,7 +187,7 @@ void MachineRegisterInfo::setType(unsigned VReg, LLT Ty) {
Register
MachineRegisterInfo::createGenericVirtualRegister(LLT Ty, StringRef Name) {
// New virtual register number.
- unsigned Reg = createIncompleteVirtualRegister(Name);
+ Register Reg = createIncompleteVirtualRegister(Name);
// FIXME: Should we use a dummy register class?
VRegInfo[Reg].first = static_cast<RegisterBank *>(nullptr);
setType(Reg, Ty);
@@ -202,7 +202,7 @@ void MachineRegisterInfo::clearVirtRegTypes() { VRegToType.clear(); }
void MachineRegisterInfo::clearVirtRegs() {
#ifndef NDEBUG
for (unsigned i = 0, e = getNumVirtRegs(); i != e; ++i) {
- unsigned Reg = Register::index2VirtReg(i);
+ Register Reg = Register::index2VirtReg(i);
if (!VRegInfo[Reg].second)
continue;
verifyUseList(Reg);
@@ -214,7 +214,7 @@ void MachineRegisterInfo::clearVirtRegs() {
I.second = 0;
}
-void MachineRegisterInfo::verifyUseList(unsigned Reg) const {
+void MachineRegisterInfo::verifyUseList(Register Reg) const {
#ifndef NDEBUG
bool Valid = true;
for (MachineOperand &M : reg_operands(Reg)) {
@@ -377,7 +377,7 @@ void MachineRegisterInfo::moveOperands(MachineOperand *Dst,
/// except that it also changes any definitions of the register as well.
/// If ToReg is a physical register we apply the sub register to obtain the
/// final/proper physical register.
-void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
+void MachineRegisterInfo::replaceRegWith(Register FromReg, Register ToReg) {
assert(FromReg != ToReg && "Cannot replace a reg with itself");
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
@@ -397,7 +397,7 @@ void MachineRegisterInfo::replaceRegWith(unsigned FromReg, unsigned ToReg) {
/// getVRegDef - Return the machine instr that defines the specified virtual
/// register or null if none is found. This assumes that the code is in SSA
/// form, so there should only be one definition.
-MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
+MachineInstr *MachineRegisterInfo::getVRegDef(Register Reg) const {
// Since we are in SSA form, we can use the first definition.
def_instr_iterator I = def_instr_begin(Reg);
assert((I.atEnd() || std::next(I) == def_instr_end()) &&
@@ -408,7 +408,7 @@ MachineInstr *MachineRegisterInfo::getVRegDef(unsigned Reg) const {
/// getUniqueVRegDef - Return the unique machine instr that defines the
/// specified virtual register or null if none is found. If there are
/// multiple definitions or no definition, return null.
-MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const {
+MachineInstr *MachineRegisterInfo::getUniqueVRegDef(Register Reg) const {
if (def_empty(Reg)) return nullptr;
def_instr_iterator I = def_instr_begin(Reg);
if (std::next(I) != def_instr_end())
@@ -416,14 +416,14 @@ MachineInstr *MachineRegisterInfo::getUniqueVRegDef(unsigned Reg) const {
return &*I;
}
-bool MachineRegisterInfo::hasOneNonDBGUse(unsigned RegNo) const {
+bool MachineRegisterInfo::hasOneNonDBGUse(Register RegNo) const {
use_nodbg_iterator UI = use_nodbg_begin(RegNo);
if (UI == use_nodbg_end())
return false;
return ++UI == use_nodbg_end();
}
-bool MachineRegisterInfo::hasOneNonDBGUser(unsigned RegNo) const {
+bool MachineRegisterInfo::hasOneNonDBGUser(Register RegNo) const {
use_instr_nodbg_iterator UI = use_instr_nodbg_begin(RegNo);
if (UI == use_instr_nodbg_end())
return false;
@@ -434,34 +434,34 @@ bool MachineRegisterInfo::hasOneNonDBGUser(unsigned RegNo) const {
/// clear the kill flag from the MachineOperand. This function is used by
/// optimization passes which extend register lifetimes and need only
/// preserve conservative kill flag information.
-void MachineRegisterInfo::clearKillFlags(unsigned Reg) const {
+void MachineRegisterInfo::clearKillFlags(Register Reg) const {
for (MachineOperand &MO : use_operands(Reg))
MO.setIsKill(false);
}
-bool MachineRegisterInfo::isLiveIn(unsigned Reg) const {
+bool MachineRegisterInfo::isLiveIn(Register Reg) const {
for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
- if (I->first == Reg || I->second == Reg)
+ if ((Register)I->first == Reg || I->second == Reg)
return true;
return false;
}
/// getLiveInPhysReg - If VReg is a live-in virtual register, return the
/// corresponding live-in physical register.
-unsigned MachineRegisterInfo::getLiveInPhysReg(unsigned VReg) const {
+MCRegister MachineRegisterInfo::getLiveInPhysReg(Register VReg) const {
for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
if (I->second == VReg)
return I->first;
- return 0;
+ return MCRegister();
}
/// getLiveInVirtReg - If PReg is a live-in physical register, return the
/// corresponding live-in virtual register.
-unsigned MachineRegisterInfo::getLiveInVirtReg(unsigned PReg) const {
+Register MachineRegisterInfo::getLiveInVirtReg(MCRegister PReg) const {
for (livein_iterator I = livein_begin(), E = livein_end(); I != E; ++I)
if (I->first == PReg)
return I->second;
- return 0;
+ return Register();
}
/// EmitLiveInCopies - Emit copies to initialize livein virtual registers
@@ -496,7 +496,7 @@ MachineRegisterInfo::EmitLiveInCopies(MachineBasicBlock *EntryMBB,
}
}
-LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {
+LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(Register Reg) const {
// Lane masks are only defined for vregs.
assert(Register::isVirtualRegister(Reg));
const TargetRegisterClass &TRC = *getRegClass(Reg);
@@ -504,7 +504,7 @@ LaneBitmask MachineRegisterInfo::getMaxLaneMaskForVReg(unsigned Reg) const {
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(unsigned Reg) const {
+LLVM_DUMP_METHOD void MachineRegisterInfo::dumpUses(Register Reg) const {
for (MachineInstr &I : use_instructions(Reg))
I.dump();
}
@@ -516,7 +516,7 @@ void MachineRegisterInfo::freezeReservedRegs(const MachineFunction &MF) {
"Invalid ReservedRegs vector from target");
}
-bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+bool MachineRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
assert(Register::isPhysicalRegister(PhysReg));
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
@@ -533,7 +533,7 @@ bool MachineRegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
}
bool
-MachineRegisterInfo::isCallerPreservedOrConstPhysReg(unsigned PhysReg) const {
+MachineRegisterInfo::isCallerPreservedOrConstPhysReg(MCRegister PhysReg) const {
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
return isConstantPhysReg(PhysReg) ||
TRI->isCallerPreservedPhysReg(PhysReg, *MF);
@@ -542,7 +542,7 @@ MachineRegisterInfo::isCallerPreservedOrConstPhysReg(unsigned PhysReg) const {
/// markUsesInDebugValueAsUndef - Mark every DBG_VALUE referencing the
/// specified register as undefined which causes the DBG_VALUE to be
/// deleted during LiveDebugVariables analysis.
-void MachineRegisterInfo::markUsesInDebugValueAsUndef(unsigned Reg) const {
+void MachineRegisterInfo::markUsesInDebugValueAsUndef(Register Reg) const {
// Mark any DBG_VALUE that uses Reg as undef (but don't delete it.)
MachineRegisterInfo::use_instr_iterator nextI;
for (use_instr_iterator I = use_instr_begin(Reg), E = use_instr_end();
@@ -550,7 +550,7 @@ void MachineRegisterInfo::markUsesInDebugValueAsUndef(unsigned Reg) const {
nextI = std::next(I); // I is invalidated by the setReg
MachineInstr *UseMI = &*I;
if (UseMI->isDebugValue())
- UseMI->getOperand(0).setReg(0U);
+ UseMI->getDebugOperandForReg(Reg)->setReg(0U);
}
}
@@ -583,7 +583,7 @@ static bool isNoReturnDef(const MachineOperand &MO) {
!Called->hasFnAttribute(Attribute::NoUnwind));
}
-bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg,
+bool MachineRegisterInfo::isPhysRegModified(MCRegister PhysReg,
bool SkipNoReturnDef) const {
if (UsedPhysRegMask.test(PhysReg))
return true;
@@ -598,7 +598,7 @@ bool MachineRegisterInfo::isPhysRegModified(unsigned PhysReg,
return false;
}
-bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {
+bool MachineRegisterInfo::isPhysRegUsed(MCRegister PhysReg) const {
if (UsedPhysRegMask.test(PhysReg))
return true;
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
@@ -610,7 +610,7 @@ bool MachineRegisterInfo::isPhysRegUsed(unsigned PhysReg) const {
return false;
}
-void MachineRegisterInfo::disableCalleeSavedRegister(unsigned Reg) {
+void MachineRegisterInfo::disableCalleeSavedRegister(MCRegister Reg) {
const TargetRegisterInfo *TRI = getTargetRegisterInfo();
assert(Reg && (Reg < TRI->getNumRegs()) &&
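The MachineRegisterInfo changes above are part of the broader unsigned-to-Register migration: Register is a thin wrapper around the raw register number that keeps the virtual/physical distinction explicit while still converting to unsigned where older APIs expect it. A small illustrative sketch (the function and its uses are made up):

#include "llvm/CodeGen/Register.h"
#include <cassert>
using namespace llvm;

// Illustrative only: shows the Register queries the migrated code relies on.
unsigned classify(Register R) {
  if (!R)                        // a default-constructed Register() is "no register"
    return 0;
  if (R.isVirtual())             // virtual registers index per-function tables
    return Register::virtReg2Index(R);
  assert(R.isPhysical() && "other register kinds not handled in this sketch");
  return R;                      // implicit conversion back to the raw number
}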
diff --git a/llvm/lib/CodeGen/MachineSSAUpdater.cpp b/llvm/lib/CodeGen/MachineSSAUpdater.cpp
index 258a5f9e0482..b12557d6d326 100644
--- a/llvm/lib/CodeGen/MachineSSAUpdater.cpp
+++ b/llvm/lib/CodeGen/MachineSSAUpdater.cpp
@@ -34,7 +34,7 @@ using namespace llvm;
#define DEBUG_TYPE "machine-ssaupdater"
-using AvailableValsTy = DenseMap<MachineBasicBlock *, unsigned>;
+using AvailableValsTy = DenseMap<MachineBasicBlock *, Register>;
static AvailableValsTy &getAvailableVals(void *AV) {
return *static_cast<AvailableValsTy*>(AV);
@@ -51,7 +51,7 @@ MachineSSAUpdater::~MachineSSAUpdater() {
/// Initialize - Reset this object to get ready for a new set of SSA
/// updates. ProtoValue is the value used to name PHI nodes.
-void MachineSSAUpdater::Initialize(unsigned V) {
+void MachineSSAUpdater::Initialize(Register V) {
if (!AV)
AV = new AvailableValsTy();
else
@@ -69,25 +69,25 @@ bool MachineSSAUpdater::HasValueForBlock(MachineBasicBlock *BB) const {
/// AddAvailableValue - Indicate that a rewritten value is available in the
/// specified block with the specified value.
-void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, unsigned V) {
+void MachineSSAUpdater::AddAvailableValue(MachineBasicBlock *BB, Register V) {
getAvailableVals(AV)[BB] = V;
}
/// GetValueAtEndOfBlock - Construct SSA form, materializing a value that is
/// live at the end of the specified block.
-unsigned MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
+Register MachineSSAUpdater::GetValueAtEndOfBlock(MachineBasicBlock *BB) {
return GetValueAtEndOfBlockInternal(BB);
}
static
-unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
- SmallVectorImpl<std::pair<MachineBasicBlock *, unsigned>> &PredValues) {
+Register LookForIdenticalPHI(MachineBasicBlock *BB,
+ SmallVectorImpl<std::pair<MachineBasicBlock *, Register>> &PredValues) {
if (BB->empty())
- return 0;
+ return Register();
MachineBasicBlock::iterator I = BB->begin();
if (!I->isPHI())
- return 0;
+ return Register();
AvailableValsTy AVals;
for (unsigned i = 0, e = PredValues.size(); i != e; ++i)
@@ -106,7 +106,7 @@ unsigned LookForIdenticalPHI(MachineBasicBlock *BB,
return I->getOperand(0).getReg();
++I;
}
- return 0;
+ return Register();
}
/// InsertNewDef - Insert an empty PHI or IMPLICIT_DEF instruction which define
@@ -140,7 +140,7 @@ MachineInstrBuilder InsertNewDef(unsigned Opcode,
/// their respective blocks. However, the use of X happens in the *middle* of
/// a block. Because of this, we need to insert a new PHI node in SomeBB to
/// merge the appropriate values, and this value isn't live out of the block.
-unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
+Register MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
// If there is no definition of the renamed variable in this block, just use
// GetValueAtEndOfBlock to do our work.
if (!HasValueForBlock(BB))
@@ -157,14 +157,14 @@ unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
// Otherwise, we have the hard case. Get the live-in values for each
// predecessor.
- SmallVector<std::pair<MachineBasicBlock*, unsigned>, 8> PredValues;
- unsigned SingularValue = 0;
+ SmallVector<std::pair<MachineBasicBlock*, Register>, 8> PredValues;
+ Register SingularValue;
bool isFirstPred = true;
for (MachineBasicBlock::pred_iterator PI = BB->pred_begin(),
E = BB->pred_end(); PI != E; ++PI) {
MachineBasicBlock *PredBB = *PI;
- unsigned PredVal = GetValueAtEndOfBlockInternal(PredBB);
+ Register PredVal = GetValueAtEndOfBlockInternal(PredBB);
PredValues.push_back(std::make_pair(PredBB, PredVal));
// Compute SingularValue.
@@ -172,15 +172,15 @@ unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
SingularValue = PredVal;
isFirstPred = false;
} else if (PredVal != SingularValue)
- SingularValue = 0;
+ SingularValue = Register();
}
// Otherwise, if all the merged values are the same, just use it.
- if (SingularValue != 0)
+ if (SingularValue)
return SingularValue;
// If an identical PHI is already in BB, just reuse it.
- unsigned DupPHI = LookForIdenticalPHI(BB, PredValues);
+ Register DupPHI = LookForIdenticalPHI(BB, PredValues);
if (DupPHI)
return DupPHI;
@@ -204,7 +204,7 @@ unsigned MachineSSAUpdater::GetValueInMiddleOfBlock(MachineBasicBlock *BB) {
if (InsertedPHIs) InsertedPHIs->push_back(InsertedPHI);
LLVM_DEBUG(dbgs() << " Inserted PHI: " << *InsertedPHI << "\n");
- return InsertedPHI->getOperand(0).getReg();
+ return InsertedPHI.getReg(0);
}
static
@@ -222,7 +222,7 @@ MachineBasicBlock *findCorrespondingPred(const MachineInstr *MI,
/// which use their value in the corresponding predecessor.
void MachineSSAUpdater::RewriteUse(MachineOperand &U) {
MachineInstr *UseMI = U.getParent();
- unsigned NewVR = 0;
+ Register NewVR;
if (UseMI->isPHI()) {
MachineBasicBlock *SourceBB = findCorrespondingPred(UseMI, &U);
NewVR = GetValueAtEndOfBlockInternal(SourceBB);
@@ -241,7 +241,7 @@ template<>
class SSAUpdaterTraits<MachineSSAUpdater> {
public:
using BlkT = MachineBasicBlock;
- using ValT = unsigned;
+ using ValT = Register;
using PhiT = MachineInstr;
using BlkSucc_iterator = MachineBasicBlock::succ_iterator;
@@ -288,7 +288,7 @@ public:
/// GetUndefVal - Create an IMPLICIT_DEF instruction with a new register.
/// Add it into the specified block and return the register.
- static unsigned GetUndefVal(MachineBasicBlock *BB,
+ static Register GetUndefVal(MachineBasicBlock *BB,
MachineSSAUpdater *Updater) {
// Insert an implicit_def to represent an undef value.
MachineInstr *NewDef = InsertNewDef(TargetOpcode::IMPLICIT_DEF,
@@ -300,7 +300,7 @@ public:
/// CreateEmptyPHI - Create a PHI instruction that defines a new register.
/// Add it into the specified block and return the register.
- static unsigned CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
+ static Register CreateEmptyPHI(MachineBasicBlock *BB, unsigned NumPreds,
MachineSSAUpdater *Updater) {
MachineBasicBlock::iterator Loc = BB->empty() ? BB->end() : BB->begin();
MachineInstr *PHI = InsertNewDef(TargetOpcode::PHI, BB, Loc,
@@ -311,7 +311,7 @@ public:
/// AddPHIOperand - Add the specified value as an operand of the PHI for
/// the specified predecessor block.
- static void AddPHIOperand(MachineInstr *PHI, unsigned Val,
+ static void AddPHIOperand(MachineInstr *PHI, Register Val,
MachineBasicBlock *Pred) {
MachineInstrBuilder(*Pred->getParent(), PHI).addReg(Val).addMBB(Pred);
}
@@ -325,13 +325,13 @@ public:
/// ValueIsPHI - Check if the instruction that defines the specified register
/// is a PHI instruction.
- static MachineInstr *ValueIsPHI(unsigned Val, MachineSSAUpdater *Updater) {
+ static MachineInstr *ValueIsPHI(Register Val, MachineSSAUpdater *Updater) {
return InstrIsPHI(Updater->MRI->getVRegDef(Val));
}
/// ValueIsNewPHI - Like ValueIsPHI but also check if the PHI has no source
/// operands, i.e., it was just added.
- static MachineInstr *ValueIsNewPHI(unsigned Val, MachineSSAUpdater *Updater) {
+ static MachineInstr *ValueIsNewPHI(Register Val, MachineSSAUpdater *Updater) {
MachineInstr *PHI = ValueIsPHI(Val, Updater);
if (PHI && PHI->getNumOperands() <= 1)
return PHI;
@@ -340,7 +340,7 @@ public:
/// GetPHIValue - For the specified PHI instruction, return the register
/// that it defines.
- static unsigned GetPHIValue(MachineInstr *PHI) {
+ static Register GetPHIValue(MachineInstr *PHI) {
return PHI->getOperand(0).getReg();
}
};
@@ -351,9 +351,9 @@ public:
/// for the specified BB and if so, return it. If not, construct SSA form by
/// first calculating the required placement of PHIs and then inserting new
/// PHIs where needed.
-unsigned MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
+Register MachineSSAUpdater::GetValueAtEndOfBlockInternal(MachineBasicBlock *BB){
AvailableValsTy &AvailableVals = getAvailableVals(AV);
- if (unsigned V = AvailableVals[BB])
+ if (Register V = AvailableVals[BB])
return V;
SSAUpdaterImpl<MachineSSAUpdater> Impl(this, &AvailableVals, InsertedPHIs);
diff --git a/llvm/lib/CodeGen/MachineScheduler.cpp b/llvm/lib/CodeGen/MachineScheduler.cpp
index e42701b9c6ca..cf75d531deb2 100644
--- a/llvm/lib/CodeGen/MachineScheduler.cpp
+++ b/llvm/lib/CodeGen/MachineScheduler.cpp
@@ -1471,41 +1471,48 @@ namespace {
class BaseMemOpClusterMutation : public ScheduleDAGMutation {
struct MemOpInfo {
SUnit *SU;
- const MachineOperand *BaseOp;
+ SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
-
- MemOpInfo(SUnit *su, const MachineOperand *Op, int64_t ofs)
- : SU(su), BaseOp(Op), Offset(ofs) {}
-
- bool operator<(const MemOpInfo &RHS) const {
- if (BaseOp->getType() != RHS.BaseOp->getType())
- return BaseOp->getType() < RHS.BaseOp->getType();
-
- if (BaseOp->isReg())
- return std::make_tuple(BaseOp->getReg(), Offset, SU->NodeNum) <
- std::make_tuple(RHS.BaseOp->getReg(), RHS.Offset,
- RHS.SU->NodeNum);
- if (BaseOp->isFI()) {
- const MachineFunction &MF =
- *BaseOp->getParent()->getParent()->getParent();
+ unsigned Width;
+
+ MemOpInfo(SUnit *SU, ArrayRef<const MachineOperand *> BaseOps,
+ int64_t Offset, unsigned Width)
+ : SU(SU), BaseOps(BaseOps.begin(), BaseOps.end()), Offset(Offset),
+ Width(Width) {}
+
+ static bool Compare(const MachineOperand *const &A,
+ const MachineOperand *const &B) {
+ if (A->getType() != B->getType())
+ return A->getType() < B->getType();
+ if (A->isReg())
+ return A->getReg() < B->getReg();
+ if (A->isFI()) {
+ const MachineFunction &MF = *A->getParent()->getParent()->getParent();
const TargetFrameLowering &TFI = *MF.getSubtarget().getFrameLowering();
bool StackGrowsDown = TFI.getStackGrowthDirection() ==
TargetFrameLowering::StackGrowsDown;
- // Can't use tuple comparison here since we might need to use a
- // different order when the stack grows down.
- if (BaseOp->getIndex() != RHS.BaseOp->getIndex())
- return StackGrowsDown ? BaseOp->getIndex() > RHS.BaseOp->getIndex()
- : BaseOp->getIndex() < RHS.BaseOp->getIndex();
-
- if (Offset != RHS.Offset)
- return Offset < RHS.Offset;
-
- return SU->NodeNum < RHS.SU->NodeNum;
+ return StackGrowsDown ? A->getIndex() > B->getIndex()
+ : A->getIndex() < B->getIndex();
}
llvm_unreachable("MemOpClusterMutation only supports register or frame "
"index bases.");
}
+
+ bool operator<(const MemOpInfo &RHS) const {
+ // FIXME: Don't compare everything twice. Maybe use C++20 three-way
+ // comparison instead when it's available.
+ if (std::lexicographical_compare(BaseOps.begin(), BaseOps.end(),
+ RHS.BaseOps.begin(), RHS.BaseOps.end(),
+ Compare))
+ return true;
+ if (std::lexicographical_compare(RHS.BaseOps.begin(), RHS.BaseOps.end(),
+ BaseOps.begin(), BaseOps.end(), Compare))
+ return false;
+ if (Offset != RHS.Offset)
+ return Offset < RHS.Offset;
+ return SU->NodeNum < RHS.SU->NodeNum;
+ }
};
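The FIXME above asks for a C++20 three-way comparison so the base-operand lists are not walked twice. A hedged sketch of what that could look like once C++20 is allowed, reusing the MemOpInfo::Compare helper defined above (not part of this patch):

#include <algorithm>
#include <compare>

// Hypothetical replacement for MemOpInfo::operator<; assumes the surrounding
// MemOpInfo definition. Each base-operand list is traversed only once.
std::weak_ordering compareMemOpInfo(const MemOpInfo &LHS, const MemOpInfo &RHS) {
  auto CmpOps = [](const MachineOperand *A, const MachineOperand *B) {
    if (MemOpInfo::Compare(A, B))
      return std::weak_ordering::less;
    if (MemOpInfo::Compare(B, A))
      return std::weak_ordering::greater;
    return std::weak_ordering::equivalent;
  };
  if (auto C = std::lexicographical_compare_three_way(
          LHS.BaseOps.begin(), LHS.BaseOps.end(), RHS.BaseOps.begin(),
          RHS.BaseOps.end(), CmpOps);
      C != 0)
    return C;
  if (auto C = LHS.Offset <=> RHS.Offset; C != 0)
    return C;
  return LHS.SU->NodeNum <=> RHS.SU->NodeNum;
}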
const TargetInstrInfo *TII;
@@ -1560,41 +1567,78 @@ void BaseMemOpClusterMutation::clusterNeighboringMemOps(
ArrayRef<SUnit *> MemOps, ScheduleDAGInstrs *DAG) {
SmallVector<MemOpInfo, 32> MemOpRecords;
for (SUnit *SU : MemOps) {
- const MachineOperand *BaseOp;
+ const MachineInstr &MI = *SU->getInstr();
+ SmallVector<const MachineOperand *, 4> BaseOps;
int64_t Offset;
- if (TII->getMemOperandWithOffset(*SU->getInstr(), BaseOp, Offset, TRI))
- MemOpRecords.push_back(MemOpInfo(SU, BaseOp, Offset));
+ bool OffsetIsScalable;
+ unsigned Width;
+ if (TII->getMemOperandsWithOffsetWidth(MI, BaseOps, Offset,
+ OffsetIsScalable, Width, TRI)) {
+ MemOpRecords.push_back(MemOpInfo(SU, BaseOps, Offset, Width));
+
+ LLVM_DEBUG(dbgs() << "Num BaseOps: " << BaseOps.size() << ", Offset: "
+ << Offset << ", OffsetIsScalable: " << OffsetIsScalable
+ << ", Width: " << Width << "\n");
+ }
+#ifndef NDEBUG
+ for (auto *Op : BaseOps)
+ assert(Op);
+#endif
}
if (MemOpRecords.size() < 2)
return;
llvm::sort(MemOpRecords);
+
+ // At this point, the `MemOpRecords` array holds at least two mem ops. Try to
+ // cluster the mem ops collected within `MemOpRecords`.
unsigned ClusterLength = 1;
+ unsigned CurrentClusterBytes = MemOpRecords[0].Width;
for (unsigned Idx = 0, End = MemOpRecords.size(); Idx < (End - 1); ++Idx) {
- SUnit *SUa = MemOpRecords[Idx].SU;
- SUnit *SUb = MemOpRecords[Idx+1].SU;
+ // The decision to cluster mem ops is made by target-dependent logic.
+ auto MemOpa = MemOpRecords[Idx];
+ auto MemOpb = MemOpRecords[Idx + 1];
+ ++ClusterLength;
+ CurrentClusterBytes += MemOpb.Width;
+ if (!TII->shouldClusterMemOps(MemOpa.BaseOps, MemOpb.BaseOps, ClusterLength,
+ CurrentClusterBytes)) {
+ // The current pair of mem ops could not be clustered; reset the cluster
+ // length and move on to the next pair.
+ ClusterLength = 1;
+ CurrentClusterBytes = MemOpb.Width;
+ continue;
+ }
+
+ SUnit *SUa = MemOpa.SU;
+ SUnit *SUb = MemOpb.SU;
if (SUa->NodeNum > SUb->NodeNum)
std::swap(SUa, SUb);
- if (TII->shouldClusterMemOps(*MemOpRecords[Idx].BaseOp,
- *MemOpRecords[Idx + 1].BaseOp,
- ClusterLength) &&
- DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
- LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
- << SUb->NodeNum << ")\n");
- // Copy successor edges from SUa to SUb. Interleaving computation
- // dependent on SUa can prevent load combining due to register reuse.
- // Predecessor edges do not need to be copied from SUb to SUa since nearby
- // loads should have effectively the same inputs.
- for (const SDep &Succ : SUa->Succs) {
- if (Succ.getSUnit() == SUb)
- continue;
- LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
- << ")\n");
- DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
- }
- ++ClusterLength;
- } else
+
+ // FIXME: Is this check really required?
+ if (!DAG->addEdge(SUb, SDep(SUa, SDep::Cluster))) {
ClusterLength = 1;
+ CurrentClusterBytes = MemOpb.Width;
+ continue;
+ }
+
+ LLVM_DEBUG(dbgs() << "Cluster ld/st SU(" << SUa->NodeNum << ") - SU("
+ << SUb->NodeNum << ")\n");
+
+ // Copy successor edges from SUa to SUb. Interleaving computation
+ // dependent on SUa can prevent load combining due to register reuse.
+ // Predecessor edges do not need to be copied from SUb to SUa since
+ // nearby loads should have effectively the same inputs.
+ for (const SDep &Succ : SUa->Succs) {
+ if (Succ.getSUnit() == SUb)
+ continue;
+ LLVM_DEBUG(dbgs() << " Copy Succ SU(" << Succ.getSUnit()->NodeNum
+ << ")\n");
+ DAG->addEdge(Succ.getSUnit(), SDep(SUb, SDep::Artificial));
+ }
+
+ LLVM_DEBUG(dbgs() << " Curr cluster length: " << ClusterLength
+ << ", Curr cluster bytes: " << CurrentClusterBytes
+ << "\n");
}
}
@@ -1609,7 +1653,7 @@ void BaseMemOpClusterMutation::apply(ScheduleDAGInstrs *DAG) {
unsigned ChainPredID = DAG->SUnits.size();
for (const SDep &Pred : SU.Preds) {
- if (Pred.isCtrl()) {
+ if (Pred.isCtrl() && !Pred.isArtificial()) {
ChainPredID = Pred.getSUnit()->NodeNum;
break;
}
@@ -2389,16 +2433,14 @@ SUnit *SchedBoundary::pickOnlyChoice() {
if (CheckPending)
releasePending();
- if (CurrMOps > 0) {
- // Defer any ready instrs that now have a hazard.
- for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
- if (checkHazard(*I)) {
- Pending.push(*I);
- I = Available.remove(I);
- continue;
- }
- ++I;
+ // Defer any ready instrs that now have a hazard.
+ for (ReadyQueue::iterator I = Available.begin(); I != Available.end();) {
+ if (checkHazard(*I)) {
+ Pending.push(*I);
+ I = Available.remove(I);
+ continue;
}
+ ++I;
}
for (unsigned i = 0; Available.empty(); ++i) {
// FIXME: Re-enable assert once PR20057 is resolved.
@@ -2720,6 +2762,9 @@ void GenericScheduler::initialize(ScheduleDAGMI *dag) {
SchedModel = DAG->getSchedModel();
TRI = DAG->TRI;
+ if (RegionPolicy.ComputeDFSResult)
+ DAG->computeDFSResult();
+
Rem.init(DAG, SchedModel);
Top.init(DAG, SchedModel, &Rem);
Bot.init(DAG, SchedModel, &Rem);
@@ -3684,7 +3729,7 @@ struct DOTGraphTraits<ScheduleDAGMI*> : public DefaultDOTGraphTraits {
DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
- return G->MF.getName();
+ return std::string(G->MF.getName());
}
static bool renderGraphFromBottomUp() {
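With the clustering rewrite above, the target hook now receives the full list of base operands for both accesses plus a running cluster length and byte count, instead of a single base operand each. A hedged sketch of how a target might implement the new-style hook; the target name, thresholds, and checks are illustrative, and the signature is inferred from the call site above:

// Assumes a TargetInstrInfo subclass for a hypothetical target.
bool MyTargetInstrInfo::shouldClusterMemOps(
    ArrayRef<const MachineOperand *> BaseOps1,
    ArrayRef<const MachineOperand *> BaseOps2, unsigned ClusterSize,
    unsigned NumBytes) const {
  // Only consider simple single-base accesses off the same register.
  if (BaseOps1.size() != 1 || BaseOps2.size() != 1)
    return false;
  const MachineOperand *Base1 = BaseOps1.front();
  const MachineOperand *Base2 = BaseOps2.front();
  if (!Base1->isReg() || !Base2->isReg() ||
      Base1->getReg() != Base2->getReg())
    return false;
  // Cap both the number of clustered accesses and the bytes they cover so the
  // scheduler keeps some freedom (the limits here are made up).
  return ClusterSize <= 4 && NumBytes <= 64;
}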
diff --git a/llvm/lib/CodeGen/MachineSink.cpp b/llvm/lib/CodeGen/MachineSink.cpp
index a4ba197b7a1d..5f958bbc31b7 100644
--- a/llvm/lib/CodeGen/MachineSink.cpp
+++ b/llvm/lib/CodeGen/MachineSink.cpp
@@ -91,7 +91,7 @@ namespace {
MachineDominatorTree *DT; // Machine dominator tree
MachinePostDominatorTree *PDT; // Machine post dominator tree
MachineLoopInfo *LI;
- const MachineBlockFrequencyInfo *MBFI;
+ MachineBlockFrequencyInfo *MBFI;
const MachineBranchProbabilityInfo *MBPI;
AliasAnalysis *AA;
@@ -269,30 +269,26 @@ MachineSinking::AllUsesDominatedByBlock(unsigned Reg,
// into and they are all PHI nodes. In this case, machine-sink must break
// the critical edge first. e.g.
//
- // %bb.1: derived from LLVM BB %bb4.preheader
+ // %bb.1:
// Predecessors according to CFG: %bb.0
// ...
- // %reg16385 = DEC64_32r %reg16437, implicit-def dead %eflags
+ // %def = DEC64_32r %x, implicit-def dead %eflags
// ...
// JE_4 <%bb.37>, implicit %eflags
// Successors according to CFG: %bb.37 %bb.2
//
- // %bb.2: derived from LLVM BB %bb.nph
- // Predecessors according to CFG: %bb.0 %bb.1
- // %reg16386 = PHI %reg16434, %bb.0, %reg16385, %bb.1
- BreakPHIEdge = true;
- for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
- MachineInstr *UseInst = MO.getParent();
- unsigned OpNo = &MO - &UseInst->getOperand(0);
- MachineBasicBlock *UseBlock = UseInst->getParent();
- if (!(UseBlock == MBB && UseInst->isPHI() &&
- UseInst->getOperand(OpNo+1).getMBB() == DefMBB)) {
- BreakPHIEdge = false;
- break;
- }
- }
- if (BreakPHIEdge)
+ // %bb.2:
+ // %p = PHI %y, %bb.0, %def, %bb.1
+ if (all_of(MRI->use_nodbg_operands(Reg), [&](MachineOperand &MO) {
+ MachineInstr *UseInst = MO.getParent();
+ unsigned OpNo = UseInst->getOperandNo(&MO);
+ MachineBasicBlock *UseBlock = UseInst->getParent();
+ return UseBlock == MBB && UseInst->isPHI() &&
+ UseInst->getOperand(OpNo + 1).getMBB() == DefMBB;
+ })) {
+ BreakPHIEdge = true;
return true;
+ }
for (MachineOperand &MO : MRI->use_nodbg_operands(Reg)) {
// Determine the block of the use.
@@ -351,6 +347,11 @@ bool MachineSinking::runOnMachineFunction(MachineFunction &MF) {
<< printMBBReference(*Pair.first) << " -- "
<< printMBBReference(*NewSucc) << " -- "
<< printMBBReference(*Pair.second) << '\n');
+ if (MBFI) {
+ auto NewSuccFreq = MBFI->getBlockFreq(Pair.first) *
+ MBPI->getEdgeProbability(Pair.first, NewSucc);
+ MBFI->setBlockFreq(NewSucc, NewSuccFreq.getFrequency());
+ }
MadeChange = true;
++NumSplit;
} else
@@ -431,7 +432,7 @@ void MachineSinking::ProcessDbgInst(MachineInstr &MI) {
MI.getDebugLoc()->getInlinedAt());
bool SeenBefore = SeenDbgVars.count(Var) != 0;
- MachineOperand &MO = MI.getOperand(0);
+ MachineOperand &MO = MI.getDebugOperand(0);
if (MO.isReg() && MO.getReg().isVirtual())
SeenDbgUsers[MO.getReg()].push_back(SeenDbgUser(&MI, SeenBefore));
@@ -622,14 +623,13 @@ MachineSinking::GetAllSortedSuccessors(MachineInstr &MI, MachineBasicBlock *MBB,
// if () {} else {}
// use x
//
- const std::vector<MachineDomTreeNode *> &Children =
- DT->getNode(MBB)->getChildren();
- for (const auto &DTChild : Children)
+ for (MachineDomTreeNode *DTChild : DT->getNode(MBB)->children()) {
// DomTree children of MBB that have MBB as immediate dominator are added.
if (DTChild->getIDom()->getBlock() == MI.getParent() &&
// Skip MBBs already added to the AllSuccs vector above.
!MBB->isSuccessor(DTChild->getBlock()))
AllSuccs.push_back(DTChild->getBlock());
+ }
// Sort Successors according to their loop depth or block frequency info.
llvm::stable_sort(
@@ -733,6 +733,13 @@ MachineSinking::FindSuccToSinkTo(MachineInstr &MI, MachineBasicBlock *MBB,
if (SuccToSinkTo && SuccToSinkTo->isEHPad())
return nullptr;
+ // It ought to be okay to sink instructions into an INLINEASM_BR target, but
+ // only if we make sure that MI occurs _before_ an INLINEASM_BR instruction in
+ // the source block (which this code does not yet do). So for now, forbid
+ // doing so.
+ if (SuccToSinkTo && SuccToSinkTo->isInlineAsmBrIndirectTarget())
+ return nullptr;
+
return SuccToSinkTo;
}
@@ -764,7 +771,8 @@ static bool SinkingPreventsImplicitNullCheck(MachineInstr &MI,
const MachineOperand *BaseOp;
int64_t Offset;
- if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+ bool OffsetIsScalable;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
return false;
if (!BaseOp->isReg())
@@ -794,7 +802,7 @@ static bool attemptDebugCopyProp(MachineInstr &SinkInst, MachineInstr &DbgMI) {
// Copy DBG_VALUE operand and set the original to undef. We then check to
// see whether this is something that can be copy-forwarded. If it isn't,
// continue around the loop.
- MachineOperand DbgMO = DbgMI.getOperand(0);
+ MachineOperand &DbgMO = DbgMI.getDebugOperand(0);
const MachineOperand *SrcMO = nullptr, *DstMO = nullptr;
auto CopyOperands = TII.isCopyInstr(SinkInst);
@@ -828,8 +836,8 @@ static bool attemptDebugCopyProp(MachineInstr &SinkInst, MachineInstr &DbgMI) {
if (PostRA && DbgMO.getReg() != DstMO->getReg())
return false;
- DbgMI.getOperand(0).setReg(SrcMO->getReg());
- DbgMI.getOperand(0).setSubReg(SrcMO->getSubReg());
+ DbgMO.setReg(SrcMO->getReg());
+ DbgMO.setSubReg(SrcMO->getSubReg());
return true;
}
@@ -864,7 +872,7 @@ static void performSink(MachineInstr &MI, MachineBasicBlock &SuccToSinkTo,
SuccToSinkTo.insert(InsertPos, NewDbgMI);
if (!attemptDebugCopyProp(MI, *DbgMI))
- DbgMI->getOperand(0).setReg(0);
+ DbgMI->setDebugValueUndef();
}
}
@@ -998,7 +1006,7 @@ bool MachineSinking::SinkInstruction(MachineInstr &MI, bool &SawStore,
// This DBG_VALUE would re-order assignments. If we can't copy-propagate
// it, it can't be recovered. Set it undef.
if (!attemptDebugCopyProp(MI, *DbgMI))
- DbgMI->getOperand(0).setReg(0);
+ DbgMI->setDebugValueUndef();
} else {
DbgUsersToSink.push_back(DbgMI);
}
@@ -1047,7 +1055,7 @@ void MachineSinking::SalvageUnsunkDebugUsersOfCopy(
if (User.getParent() == MI.getParent())
continue;
- assert(User.getOperand(0).isReg() &&
+ assert(User.getDebugOperand(0).isReg() &&
"DBG_VALUE user of vreg, but non reg operand?");
DbgDefUsers.push_back(&User);
}
@@ -1056,8 +1064,8 @@ void MachineSinking::SalvageUnsunkDebugUsersOfCopy(
// Point the users of this copy that are no longer dominated, at the source
// of the copy.
for (auto *User : DbgDefUsers) {
- User->getOperand(0).setReg(MI.getOperand(1).getReg());
- User->getOperand(0).setSubReg(MI.getOperand(1).getSubReg());
+ User->getDebugOperand(0).setReg(MI.getOperand(1).getReg());
+ User->getDebugOperand(0).setSubReg(MI.getOperand(1).getSubReg());
}
}
@@ -1303,7 +1311,7 @@ bool PostRAMachineSinking::tryToSinkCopy(MachineBasicBlock &CurBB,
// We must sink this DBG_VALUE if its operand is sunk. To avoid searching
// for DBG_VALUEs later, record them when they're encountered.
if (MI->isDebugValue()) {
- auto &MO = MI->getOperand(0);
+ auto &MO = MI->getDebugOperand(0);
if (MO.isReg() && Register::isPhysicalRegister(MO.getReg())) {
// Bail if we can already tell the sink would be rejected, rather
// than needlessly accumulating lots of DBG_VALUEs.
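Several of the MachineSink changes above switch from getOperand(0) to the debug-operand accessors and from manually zeroing the register to setDebugValueUndef(). A small illustrative helper in the same spirit (not from the patch):

// Illustrative only: makes a DBG_VALUE undef when it refers to a given
// register, using the accessors the patch switches to.
static void dropDebugLocationFor(MachineInstr &DbgMI, Register Reg) {
  assert(DbgMI.isDebugValue() && "expected a DBG_VALUE");
  MachineOperand &MO = DbgMI.getDebugOperand(0);
  if (MO.isReg() && MO.getReg() == Reg)
    DbgMI.setDebugValueUndef(); // drop the location rather than mis-describe it
}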
diff --git a/llvm/lib/CodeGen/MachineSizeOpts.cpp b/llvm/lib/CodeGen/MachineSizeOpts.cpp
index aff67f9cfd55..584d43b42004 100644
--- a/llvm/lib/CodeGen/MachineSizeOpts.cpp
+++ b/llvm/lib/CodeGen/MachineSizeOpts.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/MachineSizeOpts.h"
+#include "llvm/CodeGen/MBFIWrapper.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -23,6 +24,7 @@ extern cl::opt<bool> ForcePGSO;
extern cl::opt<int> PgsoCutoffInstrProf;
extern cl::opt<int> PgsoCutoffSampleProf;
+namespace {
namespace machine_size_opts_detail {
/// Like ProfileSummaryInfo::isColdBlock but for MachineBasicBlock.
@@ -33,6 +35,13 @@ bool isColdBlock(const MachineBasicBlock *MBB,
return Count && PSI->isColdCount(*Count);
}
+bool isColdBlock(BlockFrequency BlockFreq,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ auto Count = MBFI->getProfileCountFromFreq(BlockFreq.getFrequency());
+ return Count && PSI->isColdCount(*Count);
+}
+
/// Like ProfileSummaryInfo::isHotBlockNthPercentile but for MachineBasicBlock.
static bool isHotBlockNthPercentile(int PercentileCutoff,
const MachineBasicBlock *MBB,
@@ -42,6 +51,30 @@ static bool isHotBlockNthPercentile(int PercentileCutoff,
return Count && PSI->isHotCountNthPercentile(PercentileCutoff, *Count);
}
+static bool isHotBlockNthPercentile(int PercentileCutoff,
+ BlockFrequency BlockFreq,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ auto Count = MBFI->getProfileCountFromFreq(BlockFreq.getFrequency());
+ return Count && PSI->isHotCountNthPercentile(PercentileCutoff, *Count);
+}
+
+static bool isColdBlockNthPercentile(int PercentileCutoff,
+ const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ auto Count = MBFI->getBlockProfileCount(MBB);
+ return Count && PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
+}
+
+static bool isColdBlockNthPercentile(int PercentileCutoff,
+ BlockFrequency BlockFreq,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ auto Count = MBFI->getProfileCountFromFreq(BlockFreq.getFrequency());
+ return Count && PSI->isColdCountNthPercentile(PercentileCutoff, *Count);
+}
+
/// Like ProfileSummaryInfo::isFunctionColdInCallGraph but for
/// MachineFunction.
bool isFunctionColdInCallGraph(
@@ -73,9 +106,21 @@ bool isFunctionHotInCallGraphNthPercentile(
return true;
return false;
}
+
+bool isFunctionColdInCallGraphNthPercentile(
+ int PercentileCutoff, const MachineFunction *MF, ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo &MBFI) {
+ if (auto FunctionCount = MF->getFunction().getEntryCount())
+ if (!PSI->isColdCountNthPercentile(PercentileCutoff,
+ FunctionCount.getCount()))
+ return false;
+ for (const auto &MBB : *MF)
+ if (!isColdBlockNthPercentile(PercentileCutoff, &MBB, PSI, &MBFI))
+ return false;
+ return true;
+}
} // namespace machine_size_opts_detail
-namespace {
struct MachineBasicBlockBFIAdapter {
static bool isFunctionColdInCallGraph(const MachineFunction *MF,
ProfileSummaryInfo *PSI,
@@ -90,11 +135,22 @@ struct MachineBasicBlockBFIAdapter {
return machine_size_opts_detail::isFunctionHotInCallGraphNthPercentile(
CutOff, MF, PSI, MBFI);
}
+ static bool isFunctionColdInCallGraphNthPercentile(
+ int CutOff, const MachineFunction *MF, ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo &MBFI) {
+ return machine_size_opts_detail::isFunctionColdInCallGraphNthPercentile(
+ CutOff, MF, PSI, MBFI);
+ }
static bool isColdBlock(const MachineBasicBlock *MBB,
ProfileSummaryInfo *PSI,
const MachineBlockFrequencyInfo *MBFI) {
return machine_size_opts_detail::isColdBlock(MBB, PSI, MBFI);
}
+ static bool isColdBlock(BlockFrequency BlockFreq,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return machine_size_opts_detail::isColdBlock(BlockFreq, PSI, MBFI);
+ }
static bool isHotBlockNthPercentile(int CutOff,
const MachineBasicBlock *MBB,
ProfileSummaryInfo *PSI,
@@ -102,6 +158,25 @@ struct MachineBasicBlockBFIAdapter {
return machine_size_opts_detail::isHotBlockNthPercentile(
CutOff, MBB, PSI, MBFI);
}
+ static bool isHotBlockNthPercentile(int CutOff,
+ BlockFrequency BlockFreq,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return machine_size_opts_detail::isHotBlockNthPercentile(
+ CutOff, BlockFreq, PSI, MBFI);
+ }
+ static bool isColdBlockNthPercentile(int CutOff, const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return machine_size_opts_detail::isColdBlockNthPercentile(CutOff, MBB, PSI,
+ MBFI);
+ }
+ static bool isColdBlockNthPercentile(int CutOff, BlockFrequency BlockFreq,
+ ProfileSummaryInfo *PSI,
+ const MachineBlockFrequencyInfo *MBFI) {
+ return machine_size_opts_detail::isColdBlockNthPercentile(CutOff, BlockFreq,
+ PSI, MBFI);
+ }
};
} // end anonymous namespace
@@ -117,6 +192,19 @@ bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB,
ProfileSummaryInfo *PSI,
const MachineBlockFrequencyInfo *MBFI,
PGSOQueryType QueryType) {
+ assert(MBB);
return shouldOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>(
MBB, PSI, MBFI, QueryType);
}
+
+bool llvm::shouldOptimizeForSize(const MachineBasicBlock *MBB,
+ ProfileSummaryInfo *PSI,
+ MBFIWrapper *MBFIW,
+ PGSOQueryType QueryType) {
+ assert(MBB);
+ if (!PSI || !MBFIW)
+ return false;
+ BlockFrequency BlockFreq = MBFIW->getBlockFreq(MBB);
+ return shouldOptimizeForSizeImpl<MachineBasicBlockBFIAdapter>(
+ BlockFreq, PSI, &MBFIW->getMBFI(), QueryType);
+}
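
The new BlockFrequency-based helpers above all reduce to the same question: does the block's profile count clear the count threshold that ProfileSummaryInfo derives from a percentile cutoff of the profile? As a standalone sketch of that idea only (plain C++, invented names, not ProfileSummaryInfo's actual implementation), assuming the cutoff is expressed in hundredths of a percent (e.g. 990000 for 99%):

    #include <algorithm>
    #include <cstdint>
    #include <functional>
    #include <iostream>
    #include <numeric>
    #include <vector>

    // Hypothetical helper: the count at which the hottest blocks together
    // cover `Cutoff` (hundredths of a percent) of all executed counts.
    static uint64_t countAtPercentile(std::vector<uint64_t> Counts,
                                      unsigned Cutoff) {
      std::sort(Counts.begin(), Counts.end(), std::greater<uint64_t>());
      uint64_t Total = std::accumulate(Counts.begin(), Counts.end(), uint64_t(0));
      uint64_t Needed =
          static_cast<uint64_t>(static_cast<double>(Total) * Cutoff / 1000000.0);
      uint64_t Seen = 0;
      for (uint64_t C : Counts) {
        Seen += C;
        if (Seen >= Needed)
          return C; // last count needed to reach the requested coverage
      }
      return 0;
    }

    int main() {
      std::vector<uint64_t> BlockCounts = {10000, 4000, 50, 3, 1};
      uint64_t Threshold = countAtPercentile(BlockCounts, /*Cutoff=*/990000);
      uint64_t BlockCount = 3;
      // For this sketch, counts below the threshold are treated as cold.
      std::cout << "threshold=" << Threshold << ", block is "
                << (BlockCount < Threshold ? "cold" : "hot") << "\n";
    }

The real helpers read precomputed thresholds from the module's profile summary rather than recomputing them per query; the sketch only shows the classification step.
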
diff --git a/llvm/lib/CodeGen/MachineStripDebug.cpp b/llvm/lib/CodeGen/MachineStripDebug.cpp
new file mode 100644
index 000000000000..a1cb12f91275
--- /dev/null
+++ b/llvm/lib/CodeGen/MachineStripDebug.cpp
@@ -0,0 +1,111 @@
+//===- MachineStripDebug.cpp - Strip debug info ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This removes debug info from everything. It can be used to ensure
+/// tests can be debugified without affecting the output MIR.
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/IR/DebugInfo.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Debugify.h"
+
+#define DEBUG_TYPE "mir-strip-debug"
+
+using namespace llvm;
+
+namespace {
+cl::opt<bool>
+ OnlyDebugifiedDefault("mir-strip-debugify-only",
+ cl::desc("Should mir-strip-debug only strip debug "
+ "info from debugified modules by default"),
+ cl::init(true));
+
+struct StripDebugMachineModule : public ModulePass {
+ bool runOnModule(Module &M) override {
+ if (OnlyDebugified) {
+ NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
+ if (!DebugifyMD) {
+ LLVM_DEBUG(dbgs() << "Not stripping debug info"
+ " (debugify metadata not found)?\n");
+ return false;
+ }
+ }
+
+ MachineModuleInfo &MMI =
+ getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+
+ bool Changed = false;
+ for (Function &F : M.functions()) {
+ MachineFunction *MaybeMF = MMI.getMachineFunction(F);
+ if (!MaybeMF)
+ continue;
+ MachineFunction &MF = *MaybeMF;
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end();
+ I != E;) {
+ if (I->isDebugInstr()) {
+ // FIXME: We should remove all of them. However, AArch64 emits an
+ // invalid `DBG_VALUE $lr` with only one operand instead of
+ // the usual three and has a test that depends on its
+ // preservation. Preserve it for now.
+ if (I->getNumOperands() > 1) {
+ LLVM_DEBUG(dbgs() << "Removing debug instruction " << *I);
+ I = MBB.erase(I);
+ Changed |= true;
+ continue;
+ }
+ }
+ if (I->getDebugLoc()) {
+ LLVM_DEBUG(dbgs() << "Removing location " << *I);
+ I->setDebugLoc(DebugLoc());
+ Changed |= true;
+ ++I;
+ continue;
+ }
+ LLVM_DEBUG(dbgs() << "Keeping " << *I);
+ ++I;
+ }
+ }
+ }
+
+ Changed |= stripDebugifyMetadata(M);
+
+ return Changed;
+ }
+
+ StripDebugMachineModule() : StripDebugMachineModule(OnlyDebugifiedDefault) {}
+ StripDebugMachineModule(bool OnlyDebugified)
+ : ModulePass(ID), OnlyDebugified(OnlyDebugified) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineModuleInfoWrapperPass>();
+ AU.addPreserved<MachineModuleInfoWrapperPass>();
+ AU.setPreservesCFG();
+ }
+
+ static char ID; // Pass identification.
+
+protected:
+ bool OnlyDebugified;
+};
+char StripDebugMachineModule::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS_BEGIN(StripDebugMachineModule, DEBUG_TYPE,
+ "Machine Strip Debug Module", false, false)
+INITIALIZE_PASS_END(StripDebugMachineModule, DEBUG_TYPE,
+ "Machine Strip Debug Module", false, false)
+
+ModulePass *llvm::createStripDebugMachineModulePass(bool OnlyDebugified) {
+ return new StripDebugMachineModule(OnlyDebugified);
+}
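
The stripping loop in MachineStripDebug above advances the iterator via the value returned by erase() so removal never invalidates the walk. A minimal sketch of that erase-while-iterating pattern, on a std::list standing in for the MBB instruction list (illustration only):

    #include <iostream>
    #include <list>

    int main() {
      // Negative values play the role of debug instructions in this sketch.
      std::list<int> Instrs = {1, -2, 3, -4, 5};
      bool Changed = false;
      for (auto I = Instrs.begin(), E = Instrs.end(); I != E;) {
        if (*I < 0) {
          I = Instrs.erase(I); // erase returns the iterator past the removed element
          Changed = true;
          continue;
        }
        ++I; // only advance when nothing was erased
      }
      std::cout << "changed=" << Changed << ", remaining=" << Instrs.size() << "\n";
    }
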
diff --git a/llvm/lib/CodeGen/MachineVerifier.cpp b/llvm/lib/CodeGen/MachineVerifier.cpp
index 6c0402df8489..c1a2c4e0bc6e 100644
--- a/llvm/lib/CodeGen/MachineVerifier.cpp
+++ b/llvm/lib/CodeGen/MachineVerifier.cpp
@@ -16,16 +16,15 @@
// Register live intervals: Registers must be defined only once, and must be
// defined before use.
//
-// The machine code verifier is enabled from LLVMTargetMachine.cpp with the
-// command-line option -verify-machineinstrs, or by defining the environment
-// variable LLVM_VERIFY_MACHINEINSTRS to the name of a file that will receive
-// the verifier errors.
+// The machine code verifier is enabled with the command-line option
+// -verify-machineinstrs.
//===----------------------------------------------------------------------===//
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -35,8 +34,8 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveStacks.h"
#include "llvm/CodeGen/LiveVariables.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -157,25 +156,6 @@ namespace {
BBInfo() = default;
- // Add register to vregsPassed if it belongs there. Return true if
- // anything changed.
- bool addPassed(unsigned Reg) {
- if (!Register::isVirtualRegister(Reg))
- return false;
- if (regsKilled.count(Reg) || regsLiveOut.count(Reg))
- return false;
- return vregsPassed.insert(Reg).second;
- }
-
- // Same for a full set.
- bool addPassed(const RegSet &RS) {
- bool changed = false;
- for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
- if (addPassed(*I))
- changed = true;
- return changed;
- }
-
// Add register to vregsRequired if it belongs there. Return true if
// anything changed.
bool addRequired(unsigned Reg) {
@@ -188,20 +168,18 @@ namespace {
// Same for a full set.
bool addRequired(const RegSet &RS) {
- bool changed = false;
- for (RegSet::const_iterator I = RS.begin(), E = RS.end(); I != E; ++I)
- if (addRequired(*I))
- changed = true;
- return changed;
+ bool Changed = false;
+ for (unsigned Reg : RS)
+ Changed |= addRequired(Reg);
+ return Changed;
}
// Same for a full map.
bool addRequired(const RegMap &RM) {
- bool changed = false;
- for (RegMap::const_iterator I = RM.begin(), E = RM.end(); I != E; ++I)
- if (addRequired(I->first))
- changed = true;
- return changed;
+ bool Changed = false;
+ for (const auto &I : RM)
+ Changed |= addRequired(I.first);
+ return Changed;
}
// Live-out registers are either in regsLiveOut or vregsPassed.
@@ -236,7 +214,6 @@ namespace {
void verifyPreISelGenericInstruction(const MachineInstr *MI);
void visitMachineInstrBefore(const MachineInstr *MI);
void visitMachineOperand(const MachineOperand *MO, unsigned MONum);
- void visitMachineInstrAfter(const MachineInstr *MI);
void visitMachineBundleAfter(const MachineInstr *MI);
void visitMachineBasicBlockAfter(const MachineBasicBlock *MBB);
void visitMachineFunctionAfter();
@@ -376,13 +353,11 @@ unsigned MachineVerifier::verify(MachineFunction &MF) {
if (isFunctionFailedISel)
return foundErrors;
- isFunctionRegBankSelected =
- !isFunctionFailedISel &&
- MF.getProperties().hasProperty(
- MachineFunctionProperties::Property::RegBankSelected);
- isFunctionSelected = !isFunctionFailedISel &&
- MF.getProperties().hasProperty(
- MachineFunctionProperties::Property::Selected);
+ isFunctionRegBankSelected = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::RegBankSelected);
+ isFunctionSelected = MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected);
+
LiveVars = nullptr;
LiveInts = nullptr;
LiveStks = nullptr;
@@ -401,43 +376,40 @@ unsigned MachineVerifier::verify(MachineFunction &MF) {
verifyProperties(MF);
visitMachineFunctionBefore();
- for (MachineFunction::const_iterator MFI = MF.begin(), MFE = MF.end();
- MFI!=MFE; ++MFI) {
- visitMachineBasicBlockBefore(&*MFI);
+ for (const MachineBasicBlock &MBB : MF) {
+ visitMachineBasicBlockBefore(&MBB);
// Keep track of the current bundle header.
const MachineInstr *CurBundle = nullptr;
// Do we expect the next instruction to be part of the same bundle?
bool InBundle = false;
- for (MachineBasicBlock::const_instr_iterator MBBI = MFI->instr_begin(),
- MBBE = MFI->instr_end(); MBBI != MBBE; ++MBBI) {
- if (MBBI->getParent() != &*MFI) {
- report("Bad instruction parent pointer", &*MFI);
- errs() << "Instruction: " << *MBBI;
+ for (const MachineInstr &MI : MBB.instrs()) {
+ if (MI.getParent() != &MBB) {
+ report("Bad instruction parent pointer", &MBB);
+ errs() << "Instruction: " << MI;
continue;
}
// Check for consistent bundle flags.
- if (InBundle && !MBBI->isBundledWithPred())
+ if (InBundle && !MI.isBundledWithPred())
report("Missing BundledPred flag, "
"BundledSucc was set on predecessor",
- &*MBBI);
- if (!InBundle && MBBI->isBundledWithPred())
+ &MI);
+ if (!InBundle && MI.isBundledWithPred())
report("BundledPred flag is set, "
"but BundledSucc not set on predecessor",
- &*MBBI);
+ &MI);
// Is this a bundle header?
- if (!MBBI->isInsideBundle()) {
+ if (!MI.isInsideBundle()) {
if (CurBundle)
visitMachineBundleAfter(CurBundle);
- CurBundle = &*MBBI;
+ CurBundle = &MI;
visitMachineBundleBefore(CurBundle);
} else if (!CurBundle)
- report("No bundle header", &*MBBI);
- visitMachineInstrBefore(&*MBBI);
- for (unsigned I = 0, E = MBBI->getNumOperands(); I != E; ++I) {
- const MachineInstr &MI = *MBBI;
+ report("No bundle header", &MI);
+ visitMachineInstrBefore(&MI);
+ for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = MI.getOperand(I);
if (Op.getParent() != &MI) {
// Make sure to use correct addOperand / RemoveOperand / ChangeTo
@@ -448,16 +420,14 @@ unsigned MachineVerifier::verify(MachineFunction &MF) {
visitMachineOperand(&Op, I);
}
- visitMachineInstrAfter(&*MBBI);
-
// Was this the last bundled instruction?
- InBundle = MBBI->isBundledWithSucc();
+ InBundle = MI.isBundledWithSucc();
}
if (CurBundle)
visitMachineBundleAfter(CurBundle);
if (InBundle)
- report("BundledSucc flag set on last instruction in block", &MFI->back());
- visitMachineBasicBlockAfter(&*MFI);
+ report("BundledSucc flag set on last instruction in block", &MBB.back());
+ visitMachineBasicBlockAfter(&MBB);
}
visitMachineFunctionAfter();
@@ -568,9 +538,8 @@ void MachineVerifier::markReachable(const MachineBasicBlock *MBB) {
BBInfo &MInfo = MBBInfoMap[MBB];
if (!MInfo.reachable) {
MInfo.reachable = true;
- for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
- SuE = MBB->succ_end(); SuI != SuE; ++SuI)
- markReachable(*SuI);
+ for (const MachineBasicBlock *Succ : MBB->successors())
+ markReachable(Succ);
}
}
@@ -604,16 +573,6 @@ void MachineVerifier::visitMachineFunctionBefore() {
verifyStackFrame();
}
-// Does iterator point to a and b as the first two elements?
-static bool matchPair(MachineBasicBlock::const_succ_iterator i,
- const MachineBasicBlock *a, const MachineBasicBlock *b) {
- if (*i == a)
- return *++i == b;
- if (*i == b)
- return *++i == a;
- return false;
-}
-
void
MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
FirstTerminator = nullptr;
@@ -633,29 +592,27 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
}
// Count the number of landing pad successors.
- SmallPtrSet<MachineBasicBlock*, 4> LandingPadSuccs;
- for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
- E = MBB->succ_end(); I != E; ++I) {
- if ((*I)->isEHPad())
- LandingPadSuccs.insert(*I);
- if (!FunctionBlocks.count(*I))
+ SmallPtrSet<const MachineBasicBlock*, 4> LandingPadSuccs;
+ for (const auto *succ : MBB->successors()) {
+ if (succ->isEHPad())
+ LandingPadSuccs.insert(succ);
+ if (!FunctionBlocks.count(succ))
report("MBB has successor that isn't part of the function.", MBB);
- if (!MBBInfoMap[*I].Preds.count(MBB)) {
+ if (!MBBInfoMap[succ].Preds.count(MBB)) {
report("Inconsistent CFG", MBB);
errs() << "MBB is not in the predecessor list of the successor "
- << printMBBReference(*(*I)) << ".\n";
+ << printMBBReference(*succ) << ".\n";
}
}
// Check the predecessor list.
- for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
- E = MBB->pred_end(); I != E; ++I) {
- if (!FunctionBlocks.count(*I))
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (!FunctionBlocks.count(Pred))
report("MBB has predecessor that isn't part of the function.", MBB);
- if (!MBBInfoMap[*I].Succs.count(MBB)) {
+ if (!MBBInfoMap[Pred].Succs.count(MBB)) {
report("Inconsistent CFG", MBB);
errs() << "MBB is not in the successor list of the predecessor "
- << printMBBReference(*(*I)) << ".\n";
+ << printMBBReference(*Pred) << ".\n";
}
}
@@ -669,32 +626,15 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
!isScopedEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
report("MBB has more than one landing pad successor", MBB);
- // Call AnalyzeBranch. If it succeeds, there several more conditions to check.
+ // Call analyzeBranch. If it succeeds, there are several more conditions to check.
MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
SmallVector<MachineOperand, 4> Cond;
if (!TII->analyzeBranch(*const_cast<MachineBasicBlock *>(MBB), TBB, FBB,
Cond)) {
- // Ok, AnalyzeBranch thinks it knows what's going on with this block. Let's
+ // Ok, analyzeBranch thinks it knows what's going on with this block. Let's
// check whether its answers match up with reality.
if (!TBB && !FBB) {
// Block falls through to its successor.
- MachineFunction::const_iterator MBBI = MBB->getIterator();
- ++MBBI;
- if (MBBI == MF->end()) {
- // It's possible that the block legitimately ends with a noreturn
- // call or an unreachable, in which case it won't actually fall
- // out the bottom of the function.
- } else if (MBB->succ_size() == LandingPadSuccs.size()) {
- // It's possible that the block legitimately ends with a noreturn
- // call or an unreachable, in which case it won't actually fall
- // out of the block.
- } else if (MBB->succ_size() != 1+LandingPadSuccs.size()) {
- report("MBB exits via unconditional fall-through but doesn't have "
- "exactly one CFG successor!", MBB);
- } else if (!MBB->isSuccessor(&*MBBI)) {
- report("MBB exits via unconditional fall-through but its successor "
- "differs from its CFG successor!", MBB);
- }
if (!MBB->empty() && MBB->back().isBarrier() &&
!TII->isPredicated(MBB->back())) {
report("MBB exits via unconditional fall-through but ends with a "
@@ -706,17 +646,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
}
} else if (TBB && !FBB && Cond.empty()) {
// Block unconditionally branches somewhere.
- // If the block has exactly one successor, that happens to be a
- // landingpad, accept it as valid control flow.
- if (MBB->succ_size() != 1+LandingPadSuccs.size() &&
- (MBB->succ_size() != 1 || LandingPadSuccs.size() != 1 ||
- *MBB->succ_begin() != *LandingPadSuccs.begin())) {
- report("MBB exits via unconditional branch but doesn't have "
- "exactly one CFG successor!", MBB);
- } else if (!MBB->isSuccessor(TBB)) {
- report("MBB exits via unconditional branch but the CFG "
- "successor doesn't match the actual successor!", MBB);
- }
if (MBB->empty()) {
report("MBB exits via unconditional branch but doesn't contain "
"any instructions!", MBB);
@@ -729,25 +658,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
}
} else if (TBB && !FBB && !Cond.empty()) {
// Block conditionally branches somewhere, otherwise falls through.
- MachineFunction::const_iterator MBBI = MBB->getIterator();
- ++MBBI;
- if (MBBI == MF->end()) {
- report("MBB conditionally falls through out of function!", MBB);
- } else if (MBB->succ_size() == 1) {
- // A conditional branch with only one successor is weird, but allowed.
- if (&*MBBI != TBB)
- report("MBB exits via conditional branch/fall-through but only has "
- "one CFG successor!", MBB);
- else if (TBB != *MBB->succ_begin())
- report("MBB exits via conditional branch/fall-through but the CFG "
- "successor don't match the actual successor!", MBB);
- } else if (MBB->succ_size() != 2) {
- report("MBB exits via conditional branch/fall-through but doesn't have "
- "exactly two CFG successors!", MBB);
- } else if (!matchPair(MBB->succ_begin(), TBB, &*MBBI)) {
- report("MBB exits via conditional branch/fall-through but the CFG "
- "successors don't match the actual successors!", MBB);
- }
if (MBB->empty()) {
report("MBB exits via conditional branch/fall-through but doesn't "
"contain any instructions!", MBB);
@@ -761,21 +671,6 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
} else if (TBB && FBB) {
// Block conditionally branches somewhere, otherwise branches
// somewhere else.
- if (MBB->succ_size() == 1) {
- // A conditional branch with only one successor is weird, but allowed.
- if (FBB != TBB)
- report("MBB exits via conditional branch/branch through but only has "
- "one CFG successor!", MBB);
- else if (TBB != *MBB->succ_begin())
- report("MBB exits via conditional branch/branch through but the CFG "
- "successor don't match the actual successor!", MBB);
- } else if (MBB->succ_size() != 2) {
- report("MBB exits via conditional branch/branch but doesn't have "
- "exactly two CFG successors!", MBB);
- } else if (!matchPair(MBB->succ_begin(), TBB, FBB)) {
- report("MBB exits via conditional branch/branch but the CFG "
- "successors don't match the actual successors!", MBB);
- }
if (MBB->empty()) {
report("MBB exits via conditional branch/branch but doesn't "
"contain any instructions!", MBB);
@@ -791,7 +686,54 @@ MachineVerifier::visitMachineBasicBlockBefore(const MachineBasicBlock *MBB) {
"condition!", MBB);
}
} else {
- report("AnalyzeBranch returned invalid data!", MBB);
+ report("analyzeBranch returned invalid data!", MBB);
+ }
+
+ // Now check that the successors match up with the answers reported by
+ // analyzeBranch.
+ if (TBB && !MBB->isSuccessor(TBB))
+ report("MBB exits via jump or conditional branch, but its target isn't a "
+ "CFG successor!",
+ MBB);
+ if (FBB && !MBB->isSuccessor(FBB))
+ report("MBB exits via conditional branch, but its target isn't a CFG "
+ "successor!",
+ MBB);
+
+ // There might be a fallthrough to the next block if there's either no
+ // unconditional true branch, or if there's a condition, and one of the
+ // branches is missing.
+ bool Fallthrough = !TBB || (!Cond.empty() && !FBB);
+
+ // A conditional fallthrough must be an actual CFG successor, not
+ // unreachable. (Conversely, an unconditional fallthrough might not really
+ // be a successor, because the block might end in unreachable.)
+ if (!Cond.empty() && !FBB) {
+ MachineFunction::const_iterator MBBI = std::next(MBB->getIterator());
+ if (MBBI == MF->end()) {
+ report("MBB conditionally falls through out of function!", MBB);
+ } else if (!MBB->isSuccessor(&*MBBI))
+ report("MBB exits via conditional branch/fall-through but the CFG "
+ "successors don't match the actual successors!",
+ MBB);
+ }
+
+ // Verify that there aren't any extra un-accounted-for successors.
+ for (const MachineBasicBlock *SuccMBB : MBB->successors()) {
+ // If this successor is one of the branch targets, it's okay.
+ if (SuccMBB == TBB || SuccMBB == FBB)
+ continue;
+ // If we might have a fallthrough, and the successor is the fallthrough
+ // block, that's also ok.
+ if (Fallthrough && SuccMBB == MBB->getNextNode())
+ continue;
+ // Also accept successors which are for exception-handling or might be
+ // inlineasm_br targets.
+ if (SuccMBB->isEHPad() || SuccMBB->isInlineAsmBrIndirectTarget())
+ continue;
+ report("MBB has unexpected successors which are not branch targets, "
+ "fallthrough, EHPads, or inlineasm_br targets.",
+ MBB);
}
}
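
After this hunk, the verifier's per-case successor checks collapse into one rule: each branch target reported by analyzeBranch must be a CFG successor, and every successor must be a branch target, the fallthrough block, or an EH-pad/inlineasm_br target. A toy version of that cross-check on an invented Block type (EH-pad and inlineasm_br successors omitted for brevity):

    #include <algorithm>
    #include <iostream>
    #include <vector>

    struct Block {
      std::vector<const Block *> Succs;
      bool isSuccessor(const Block *B) const {
        return std::find(Succs.begin(), Succs.end(), B) != Succs.end();
      }
    };

    // TBB/FBB mirror analyzeBranch's outputs; HasCond says whether a
    // condition was reported. NextBlock is the layout successor.
    static bool successorsMatchBranchInfo(const Block &MBB, const Block *TBB,
                                          const Block *FBB, bool HasCond,
                                          const Block *NextBlock) {
      if (TBB && !MBB.isSuccessor(TBB))
        return false; // branch target is not a CFG successor
      if (FBB && !MBB.isSuccessor(FBB))
        return false;
      bool Fallthrough = !TBB || (HasCond && !FBB);
      for (const Block *S : MBB.Succs) {
        if (S == TBB || S == FBB)
          continue;
        if (Fallthrough && S == NextBlock)
          continue;
        return false; // unexpected successor
      }
      return true;
    }

    int main() {
      Block A, B, C;
      A.Succs = {&B, &C};
      // Conditional branch to B, falling through into C, the next block in layout.
      std::cout << successorsMatchBranchInfo(A, &B, nullptr, /*HasCond=*/true, &C)
                << "\n";
    }
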
@@ -839,7 +781,7 @@ void MachineVerifier::visitMachineBundleBefore(const MachineInstr *MI) {
if (MI->isTerminator() && !TII->isPredicated(*MI)) {
if (!FirstTerminator)
FirstTerminator = MI;
- } else if (FirstTerminator && !MI->isDebugEntryValue()) {
+ } else if (FirstTerminator) {
report("Non-terminator instruction after the first terminator", MI);
errs() << "First terminator was:\t" << *FirstTerminator;
}
@@ -920,6 +862,23 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
const MCInstrDesc &MCID = MI->getDesc();
unsigned NumOps = MI->getNumOperands();
+ // Branches must reference a basic block if they are not indirect
+ if (MI->isBranch() && !MI->isIndirectBranch()) {
+ bool HasMBB = false;
+ for (const MachineOperand &Op : MI->operands()) {
+ if (Op.isMBB()) {
+ HasMBB = true;
+ break;
+ }
+ }
+
+ if (!HasMBB) {
+ report("Branch instruction is missing a basic block operand or "
+ "isIndirectBranch property",
+ MI);
+ }
+ }
+
// Check types.
SmallVector<LLT, 4> Types;
for (unsigned I = 0, E = std::min(MCID.getNumOperands(), NumOps);
@@ -972,9 +931,6 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT: {
- if (MI->getNumOperands() < MCID.getNumOperands())
- break;
-
LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
if (DstTy.isVector())
report("Instruction cannot use a vector result type", MI);
@@ -1062,6 +1018,10 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
if (SrcTy.getSizeInBits() != DstTy.getSizeInBits())
report("bitcast sizes must match", MI);
+
+ if (SrcTy == DstTy)
+ report("bitcast must change the type", MI);
+
break;
}
case TargetOpcode::G_INTTOPTR:
@@ -1115,6 +1075,22 @@ void MachineVerifier::verifyPreISelGenericInstruction(const MachineInstr *MI) {
// TODO: Is the offset allowed to be a scalar with a vector?
break;
}
+ case TargetOpcode::G_PTRMASK: {
+ LLT DstTy = MRI->getType(MI->getOperand(0).getReg());
+ LLT SrcTy = MRI->getType(MI->getOperand(1).getReg());
+ LLT MaskTy = MRI->getType(MI->getOperand(2).getReg());
+ if (!DstTy.isValid() || !SrcTy.isValid() || !MaskTy.isValid())
+ break;
+
+ if (!DstTy.getScalarType().isPointer())
+ report("ptrmask result type must be a pointer", MI);
+
+ if (!MaskTy.getScalarType().isScalar())
+ report("ptrmask mask type must be an integer", MI);
+
+ verifyVectorElementMatch(DstTy, MaskTy, MI);
+ break;
+ }
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
@@ -1485,13 +1461,18 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
if (MI->isInlineAsm())
verifyInlineAsm(MI);
+ // A fully-formed DBG_VALUE must have a location. Ignore partially formed
+ // DBG_VALUEs: these are convenient to use in tests, but should never get
+ // generated.
+ if (MI->isDebugValue() && MI->getNumOperands() == 4)
+ if (!MI->getDebugLoc())
+ report("Missing DebugLoc for debug instruction", MI);
+
// Check the MachineMemOperands for basic consistency.
- for (MachineInstr::mmo_iterator I = MI->memoperands_begin(),
- E = MI->memoperands_end();
- I != E; ++I) {
- if ((*I)->isLoad() && !MI->mayLoad())
+ for (MachineMemOperand *Op : MI->memoperands()) {
+ if (Op->isLoad() && !MI->mayLoad())
report("Missing mayLoad flag", MI);
- if ((*I)->isStore() && !MI->mayStore())
+ if (Op->isStore() && !MI->mayStore())
report("Missing mayStore flag", MI);
}
@@ -1552,26 +1533,27 @@ void MachineVerifier::visitMachineInstrBefore(const MachineInstr *MI) {
}
break;
}
- case TargetOpcode::STATEPOINT:
- if (!MI->getOperand(StatepointOpers::IDPos).isImm() ||
- !MI->getOperand(StatepointOpers::NBytesPos).isImm() ||
- !MI->getOperand(StatepointOpers::NCallArgsPos).isImm())
+ case TargetOpcode::STATEPOINT: {
+ StatepointOpers SO(MI);
+ if (!MI->getOperand(SO.getIDPos()).isImm() ||
+ !MI->getOperand(SO.getNBytesPos()).isImm() ||
+ !MI->getOperand(SO.getNCallArgsPos()).isImm()) {
report("meta operands to STATEPOINT not constant!", MI);
- break;
+ break;
+ }
auto VerifyStackMapConstant = [&](unsigned Offset) {
- if (!MI->getOperand(Offset).isImm() ||
- MI->getOperand(Offset).getImm() != StackMaps::ConstantOp ||
- !MI->getOperand(Offset + 1).isImm())
+ if (!MI->getOperand(Offset - 1).isImm() ||
+ MI->getOperand(Offset - 1).getImm() != StackMaps::ConstantOp ||
+ !MI->getOperand(Offset).isImm())
report("stack map constant to STATEPOINT not well formed!", MI);
};
- const unsigned VarStart = StatepointOpers(MI).getVarIdx();
- VerifyStackMapConstant(VarStart + StatepointOpers::CCOffset);
- VerifyStackMapConstant(VarStart + StatepointOpers::FlagsOffset);
- VerifyStackMapConstant(VarStart + StatepointOpers::NumDeoptOperandsOffset);
+ VerifyStackMapConstant(SO.getCCIdx());
+ VerifyStackMapConstant(SO.getFlagsIdx());
+ VerifyStackMapConstant(SO.getNumDeoptArgsIdx());
// TODO: verify we have properly encoded deopt arguments
- break;
+ } break;
}
}
@@ -1599,7 +1581,7 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
bool IsOptional = MI->isVariadic() && MONum == MCID.getNumOperands() - 1;
if (!IsOptional) {
if (MO->isReg()) {
- if (MO->isDef() && !MCOI.isOptionalDef())
+ if (MO->isDef() && !MCOI.isOptionalDef() && !MCID.variadicOpsAreDefs())
report("Explicit operand marked as def", MO, MONum);
if (MO->isImplicit())
report("Explicit operand marked as implicit", MO, MONum);
@@ -1668,10 +1650,17 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
}
}
- // Verify two-address constraints after leaving SSA form.
+ // Verify two-address constraints after the twoaddressinstruction pass.
+ // Both twoaddressinstruction pass and phi-node-elimination pass call
+ // MRI->leaveSSA() to set MF as NoSSA, we should do the verification after
+ // twoaddressinstruction pass not after phi-node-elimination pass. So we
+ // shouldn't use the NoSSA as the condition, we should based on
+ // TiedOpsRewritten property to verify two-address constraints, this
+ // property will be set in twoaddressinstruction pass.
unsigned DefIdx;
- if (!MRI->isSSA() && MO->isUse() &&
- MI->isRegTiedToDefOperand(MONum, &DefIdx) &&
+ if (MF->getProperties().hasProperty(
+ MachineFunctionProperties::Property::TiedOpsRewritten) &&
+ MO->isUse() && MI->isRegTiedToDefOperand(MONum, &DefIdx) &&
Reg != MI->getOperand(DefIdx).getReg())
report("Two-address instruction operands must be identical", MO, MONum);
@@ -1709,6 +1698,15 @@ MachineVerifier::visitMachineOperand(const MachineOperand *MO, unsigned MONum) {
if (!RC) {
// This is a generic virtual register.
+ // Do not allow undef uses for generic virtual registers. This ensures
+ // getVRegDef can never fail and return null on a generic register.
+ //
+ // FIXME: This restriction should probably be broadened to all SSA
+ // MIR. However, DetectDeadLanes/ProcessImplicitDefs technically still
+ // run on the SSA function just before phi elimination.
+ if (MO->isUndef())
+ report("Generic virtual register use cannot be undef", MO, MONum);
+
// If we're post-Select, we can't have gvregs anymore.
if (isFunctionSelected) {
report("Generic virtual register invalid in a Selected function",
@@ -2088,8 +2086,6 @@ void MachineVerifier::checkLiveness(const MachineOperand *MO, unsigned MONum) {
}
}
-void MachineVerifier::visitMachineInstrAfter(const MachineInstr *MI) {}
-
// This function gets called after visiting all instructions in a bundle. The
// argument points to the bundle header.
// Normal stand-alone instructions are also considered 'bundles', and this
@@ -2101,10 +2097,10 @@ void MachineVerifier::visitMachineBundleAfter(const MachineInstr *MI) {
// Kill any masked registers.
while (!regMasks.empty()) {
const uint32_t *Mask = regMasks.pop_back_val();
- for (RegSet::iterator I = regsLive.begin(), E = regsLive.end(); I != E; ++I)
- if (Register::isPhysicalRegister(*I) &&
- MachineOperand::clobbersPhysReg(Mask, *I))
- regsDead.push_back(*I);
+ for (unsigned Reg : regsLive)
+ if (Register::isPhysicalRegister(Reg) &&
+ MachineOperand::clobbersPhysReg(Mask, Reg))
+ regsDead.push_back(Reg);
}
set_subtract(regsLive, regsDead); regsDead.clear();
set_union(regsLive, regsDefined); regsDefined.clear();
@@ -2126,40 +2122,171 @@ MachineVerifier::visitMachineBasicBlockAfter(const MachineBasicBlock *MBB) {
}
}
+namespace {
+// This implements a set of registers that serves as a filter: can filter other
+// sets by passing through elements not in the filter and blocking those that
+// are. Any filter implicitly includes the full set of physical registers upon
+// creation, thus filtering them all out. The filter itself as a set only grows,
+// and needs to be as efficient as possible.
+struct VRegFilter {
+ // Add elements to the filter itself. \pre Input set \p FromRegSet must have
+ // no duplicates. Both virtual and physical registers are fine.
+ template <typename RegSetT> void add(const RegSetT &FromRegSet) {
+ SmallVector<unsigned, 0> VRegsBuffer;
+ filterAndAdd(FromRegSet, VRegsBuffer);
+ }
+ // Filter \p FromRegSet through the filter and append passed elements into \p
+ // ToVRegs. All elements appended are then added to the filter itself.
+ // \returns true if anything changed.
+ template <typename RegSetT>
+ bool filterAndAdd(const RegSetT &FromRegSet,
+ SmallVectorImpl<unsigned> &ToVRegs) {
+ unsigned SparseUniverse = Sparse.size();
+ unsigned NewSparseUniverse = SparseUniverse;
+ unsigned NewDenseSize = Dense.size();
+ size_t Begin = ToVRegs.size();
+ for (unsigned Reg : FromRegSet) {
+ if (!Register::isVirtualRegister(Reg))
+ continue;
+ unsigned Index = Register::virtReg2Index(Reg);
+ if (Index < SparseUniverseMax) {
+ if (Index < SparseUniverse && Sparse.test(Index))
+ continue;
+ NewSparseUniverse = std::max(NewSparseUniverse, Index + 1);
+ } else {
+ if (Dense.count(Reg))
+ continue;
+ ++NewDenseSize;
+ }
+ ToVRegs.push_back(Reg);
+ }
+ size_t End = ToVRegs.size();
+ if (Begin == End)
+ return false;
+ // Reserving space in sets once performs better than doing so continuously
+ // and pays easily for double look-ups (even in Dense with SparseUniverseMax
+ // tuned all the way down) and double iteration (the second one is over a
+ // SmallVector, which is a lot cheaper compared to DenseSet or BitVector).
+ Sparse.resize(NewSparseUniverse);
+ Dense.reserve(NewDenseSize);
+ for (unsigned I = Begin; I < End; ++I) {
+ unsigned Reg = ToVRegs[I];
+ unsigned Index = Register::virtReg2Index(Reg);
+ if (Index < SparseUniverseMax)
+ Sparse.set(Index);
+ else
+ Dense.insert(Reg);
+ }
+ return true;
+ }
+
+private:
+ static constexpr unsigned SparseUniverseMax = 10 * 1024 * 8;
+ // VRegs indexed within SparseUniverseMax are tracked by Sparse, those beyond
+ // are tracked by Dense. The only purpose of the threshold and the Dense set
+ // is to have a reasonably growing memory usage in pathological cases (large
+ // number of very sparse VRegFilter instances live at the same time). In
+ // practice even in the worst-by-execution-time cases having all elements
+ // tracked by Sparse (very large SparseUniverseMax scenario) tends to be more
+ // space efficient than if tracked by Dense. The threshold is set to keep the
+ // worst-case memory usage within 2x of figures determined empirically for
+ // the "all Dense" scenario in such worst-by-execution-time cases.
+ BitVector Sparse;
+ DenseSet<unsigned> Dense;
+};
+
+// Implements both a transfer function and a (binary, in-place) join operator
+// for a dataflow over register sets with set union join and filtering transfer
+// (out_b = in_b \ filter_b). filter_b is expected to be set-up ahead of time.
+// Maintains out_b as its state, allowing for O(n) iteration over it at any
+// time, where n is the size of the set (as opposed to O(U) where U is the
+// universe). filter_b implicitly contains all physical registers at all times.
+class FilteringVRegSet {
+ VRegFilter Filter;
+ SmallVector<unsigned, 0> VRegs;
+
+public:
+ // Set-up the filter_b. \pre Input register set \p RS must have no duplicates.
+ // Both virtual and physical registers are fine.
+ template <typename RegSetT> void addToFilter(const RegSetT &RS) {
+ Filter.add(RS);
+ }
+ // Passes \p RS through the filter_b (transfer function) and adds what's left
+ // to itself (out_b).
+ template <typename RegSetT> bool add(const RegSetT &RS) {
+ // Double-duty the Filter: to maintain VRegs a set (and the join operation
+ // a set union) just add everything being added here to the Filter as well.
+ return Filter.filterAndAdd(RS, VRegs);
+ }
+ using const_iterator = decltype(VRegs)::const_iterator;
+ const_iterator begin() const { return VRegs.begin(); }
+ const_iterator end() const { return VRegs.end(); }
+ size_t size() const { return VRegs.size(); }
+};
+} // namespace
+
// Calculate the largest possible vregsPassed sets. These are the registers that
// can pass through an MBB live, but may not be live every time. It is assumed
// that all vregsPassed sets are empty before the call.
void MachineVerifier::calcRegsPassed() {
+ // This is a forward dataflow, computed in RPO order. A standard map serves as
+ // a priority queue (sorted by RPO number), a deduplicating worklist, and an
+ // RPO-number-to-MBB mapping all at once.
+ std::map<unsigned, const MachineBasicBlock *> RPOWorklist;
+ DenseMap<const MachineBasicBlock *, unsigned> RPONumbers;
+ if (MF->empty()) {
+ // ReversePostOrderTraversal doesn't handle empty functions.
+ return;
+ }
+ std::vector<FilteringVRegSet> VRegsPassedSets(MF->size());
+ for (const MachineBasicBlock *MBB :
+ ReversePostOrderTraversal<const MachineFunction *>(MF)) {
+ // Careful with the evaluation order, fetch next number before allocating.
+ unsigned Number = RPONumbers.size();
+ RPONumbers[MBB] = Number;
+ // Set-up the transfer functions for all blocks.
+ const BBInfo &MInfo = MBBInfoMap[MBB];
+ VRegsPassedSets[Number].addToFilter(MInfo.regsKilled);
+ VRegsPassedSets[Number].addToFilter(MInfo.regsLiveOut);
+ }
// First push live-out regs to successors' vregsPassed. Remember the MBBs that
// have any vregsPassed.
- SmallPtrSet<const MachineBasicBlock*, 8> todo;
- for (const auto &MBB : *MF) {
- BBInfo &MInfo = MBBInfoMap[&MBB];
+ for (const MachineBasicBlock &MBB : *MF) {
+ const BBInfo &MInfo = MBBInfoMap[&MBB];
if (!MInfo.reachable)
continue;
- for (MachineBasicBlock::const_succ_iterator SuI = MBB.succ_begin(),
- SuE = MBB.succ_end(); SuI != SuE; ++SuI) {
- BBInfo &SInfo = MBBInfoMap[*SuI];
- if (SInfo.addPassed(MInfo.regsLiveOut))
- todo.insert(*SuI);
- }
- }
-
- // Iteratively push vregsPassed to successors. This will converge to the same
- // final state regardless of DenseSet iteration order.
- while (!todo.empty()) {
- const MachineBasicBlock *MBB = *todo.begin();
- todo.erase(MBB);
- BBInfo &MInfo = MBBInfoMap[MBB];
- for (MachineBasicBlock::const_succ_iterator SuI = MBB->succ_begin(),
- SuE = MBB->succ_end(); SuI != SuE; ++SuI) {
- if (*SuI == MBB)
+ for (const MachineBasicBlock *Succ : MBB.successors()) {
+ unsigned SuccNumber = RPONumbers[Succ];
+ FilteringVRegSet &SuccSet = VRegsPassedSets[SuccNumber];
+ if (SuccSet.add(MInfo.regsLiveOut))
+ RPOWorklist.emplace(SuccNumber, Succ);
+ }
+ }
+
+ // Iteratively push vregsPassed to successors.
+ while (!RPOWorklist.empty()) {
+ auto Next = RPOWorklist.begin();
+ const MachineBasicBlock *MBB = Next->second;
+ RPOWorklist.erase(Next);
+ FilteringVRegSet &MSet = VRegsPassedSets[RPONumbers[MBB]];
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ if (Succ == MBB)
continue;
- BBInfo &SInfo = MBBInfoMap[*SuI];
- if (SInfo.addPassed(MInfo.vregsPassed))
- todo.insert(*SuI);
+ unsigned SuccNumber = RPONumbers[Succ];
+ FilteringVRegSet &SuccSet = VRegsPassedSets[SuccNumber];
+ if (SuccSet.add(MSet))
+ RPOWorklist.emplace(SuccNumber, Succ);
}
}
+ // Copy the results back to BBInfos.
+ for (const MachineBasicBlock &MBB : *MF) {
+ BBInfo &MInfo = MBBInfoMap[&MBB];
+ if (!MInfo.reachable)
+ continue;
+ const FilteringVRegSet &MSet = VRegsPassedSets[RPONumbers[&MBB]];
+ MInfo.vregsPassed.reserve(MSet.size());
+ MInfo.vregsPassed.insert(MSet.begin(), MSet.end());
+ }
}
// Calculate the set of virtual registers that must be passed through each basic
@@ -2170,11 +2297,10 @@ void MachineVerifier::calcRegsRequired() {
SmallPtrSet<const MachineBasicBlock*, 8> todo;
for (const auto &MBB : *MF) {
BBInfo &MInfo = MBBInfoMap[&MBB];
- for (MachineBasicBlock::const_pred_iterator PrI = MBB.pred_begin(),
- PrE = MBB.pred_end(); PrI != PrE; ++PrI) {
- BBInfo &PInfo = MBBInfoMap[*PrI];
+ for (const MachineBasicBlock *Pred : MBB.predecessors()) {
+ BBInfo &PInfo = MBBInfoMap[Pred];
if (PInfo.addRequired(MInfo.vregsLiveIn))
- todo.insert(*PrI);
+ todo.insert(Pred);
}
}
@@ -2184,13 +2310,12 @@ void MachineVerifier::calcRegsRequired() {
const MachineBasicBlock *MBB = *todo.begin();
todo.erase(MBB);
BBInfo &MInfo = MBBInfoMap[MBB];
- for (MachineBasicBlock::const_pred_iterator PrI = MBB->pred_begin(),
- PrE = MBB->pred_end(); PrI != PrE; ++PrI) {
- if (*PrI == MBB)
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (Pred == MBB)
continue;
- BBInfo &SInfo = MBBInfoMap[*PrI];
+ BBInfo &SInfo = MBBInfoMap[Pred];
if (SInfo.addRequired(MInfo.vregsRequired))
- todo.insert(*PrI);
+ todo.insert(Pred);
}
}
}
@@ -2274,23 +2399,19 @@ void MachineVerifier::visitMachineFunctionAfter() {
// Check for killed virtual registers that should be live out.
for (const auto &MBB : *MF) {
BBInfo &MInfo = MBBInfoMap[&MBB];
- for (RegSet::iterator
- I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
- ++I)
- if (MInfo.regsKilled.count(*I)) {
+ for (unsigned VReg : MInfo.vregsRequired)
+ if (MInfo.regsKilled.count(VReg)) {
report("Virtual register killed in block, but needed live out.", &MBB);
- errs() << "Virtual register " << printReg(*I)
+ errs() << "Virtual register " << printReg(VReg)
<< " is used after the block.\n";
}
}
if (!MF->empty()) {
BBInfo &MInfo = MBBInfoMap[&MF->front()];
- for (RegSet::iterator
- I = MInfo.vregsRequired.begin(), E = MInfo.vregsRequired.end(); I != E;
- ++I) {
+ for (unsigned VReg : MInfo.vregsRequired) {
report("Virtual register defs don't dominate all uses.", MF);
- report_context_vreg(*I);
+ report_context_vreg(VReg);
}
}
@@ -2652,9 +2773,8 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
VNI->def == LiveInts->getMBBStartIdx(&*MFI);
// Check that VNI is live-out of all predecessors.
- for (MachineBasicBlock::const_pred_iterator PI = MFI->pred_begin(),
- PE = MFI->pred_end(); PI != PE; ++PI) {
- SlotIndex PEnd = LiveInts->getMBBEndIdx(*PI);
+ for (const MachineBasicBlock *Pred : MFI->predecessors()) {
+ SlotIndex PEnd = LiveInts->getMBBEndIdx(Pred);
const VNInfo *PVNI = LR.getVNInfoBefore(PEnd);
// All predecessors must have a live-out value. However for a phi
@@ -2662,9 +2782,9 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
// only one of the subregisters (not necessarily the current one) needs to
// be defined.
if (!PVNI && (LaneMask.none() || !IsPHI)) {
- if (LiveRangeCalc::isJointlyDominated(*PI, Undefs, *Indexes))
+ if (LiveRangeCalc::isJointlyDominated(Pred, Undefs, *Indexes))
continue;
- report("Register not marked live out of predecessor", *PI);
+ report("Register not marked live out of predecessor", Pred);
report_context(LR, Reg, LaneMask);
report_context(*VNI);
errs() << " live into " << printMBBReference(*MFI) << '@'
@@ -2675,10 +2795,10 @@ void MachineVerifier::verifyLiveRangeSegment(const LiveRange &LR,
// Only PHI-defs can take different predecessor values.
if (!IsPHI && PVNI != VNI) {
- report("Different value live out of predecessor", *PI);
+ report("Different value live out of predecessor", Pred);
report_context(LR, Reg, LaneMask);
errs() << "Valno #" << PVNI->id << " live out of "
- << printMBBReference(*(*PI)) << '@' << PEnd << "\nValno #"
+ << printMBBReference(*Pred) << '@' << PEnd << "\nValno #"
<< VNI->id << " live into " << printMBBReference(*MFI) << '@'
<< LiveInts->getMBBStartIdx(&*MFI) << '\n';
}
@@ -2734,10 +2854,9 @@ void MachineVerifier::verifyLiveInterval(const LiveInterval &LI) {
report_context(LI);
for (unsigned comp = 0; comp != NumComp; ++comp) {
errs() << comp << ": valnos";
- for (LiveInterval::const_vni_iterator I = LI.vni_begin(),
- E = LI.vni_end(); I!=E; ++I)
- if (comp == ConEQ.getEqClass(*I))
- errs() << ' ' << (*I)->id;
+ for (const VNInfo *I : LI.valnos)
+ if (comp == ConEQ.getEqClass(I))
+ errs() << ' ' << I->id;
errs() << '\n';
}
}
@@ -2824,15 +2943,14 @@ void MachineVerifier::verifyStackFrame() {
// Make sure the exit state of any predecessor is consistent with the entry
// state.
- for (MachineBasicBlock::const_pred_iterator I = MBB->pred_begin(),
- E = MBB->pred_end(); I != E; ++I) {
- if (Reachable.count(*I) &&
- (SPState[(*I)->getNumber()].ExitValue != BBState.EntryValue ||
- SPState[(*I)->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
+ for (const MachineBasicBlock *Pred : MBB->predecessors()) {
+ if (Reachable.count(Pred) &&
+ (SPState[Pred->getNumber()].ExitValue != BBState.EntryValue ||
+ SPState[Pred->getNumber()].ExitIsSetup != BBState.EntryIsSetup)) {
report("The exit stack state of a predecessor is inconsistent.", MBB);
- errs() << "Predecessor " << printMBBReference(*(*I))
- << " has exit state (" << SPState[(*I)->getNumber()].ExitValue
- << ", " << SPState[(*I)->getNumber()].ExitIsSetup << "), while "
+ errs() << "Predecessor " << printMBBReference(*Pred)
+ << " has exit state (" << SPState[Pred->getNumber()].ExitValue
+ << ", " << SPState[Pred->getNumber()].ExitIsSetup << "), while "
<< printMBBReference(*MBB) << " has entry state ("
<< BBState.EntryValue << ", " << BBState.EntryIsSetup << ").\n";
}
@@ -2840,15 +2958,14 @@ void MachineVerifier::verifyStackFrame() {
// Make sure the entry state of any successor is consistent with the exit
// state.
- for (MachineBasicBlock::const_succ_iterator I = MBB->succ_begin(),
- E = MBB->succ_end(); I != E; ++I) {
- if (Reachable.count(*I) &&
- (SPState[(*I)->getNumber()].EntryValue != BBState.ExitValue ||
- SPState[(*I)->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
+ for (const MachineBasicBlock *Succ : MBB->successors()) {
+ if (Reachable.count(Succ) &&
+ (SPState[Succ->getNumber()].EntryValue != BBState.ExitValue ||
+ SPState[Succ->getNumber()].EntryIsSetup != BBState.ExitIsSetup)) {
report("The entry stack state of a successor is inconsistent.", MBB);
- errs() << "Successor " << printMBBReference(*(*I))
- << " has entry state (" << SPState[(*I)->getNumber()].EntryValue
- << ", " << SPState[(*I)->getNumber()].EntryIsSetup << "), while "
+ errs() << "Successor " << printMBBReference(*Succ)
+ << " has entry state (" << SPState[Succ->getNumber()].EntryValue
+ << ", " << SPState[Succ->getNumber()].EntryIsSetup << "), while "
<< printMBBReference(*MBB) << " has exit state ("
<< BBState.ExitValue << ", " << BBState.ExitIsSetup << ").\n";
}
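
The VRegFilter introduced above for calcRegsPassed keeps virtual-register indices below a fixed threshold in a bit vector and pushes everything beyond it into a hash set, so membership tests stay cheap while worst-case memory stays bounded. A compact sketch of that sparse/dense split using standard containers (the threshold below is arbitrary, and the batched resize/reserve of the real filter is omitted):

    #include <cstddef>
    #include <iostream>
    #include <unordered_set>
    #include <vector>

    class SparseDenseFilter {
      static constexpr size_t SparseMax = 1024; // illustrative threshold only
      std::vector<bool> Sparse;                 // indices < SparseMax
      std::unordered_set<size_t> Dense;         // everything else

    public:
      // Returns true if Index was not already present (i.e. it "passes" the
      // filter) and records it so later queries are blocked.
      bool filterAndAdd(size_t Index) {
        if (Index < SparseMax) {
          if (Index >= Sparse.size())
            Sparse.resize(Index + 1);
          if (Sparse[Index])
            return false;
          Sparse[Index] = true;
          return true;
        }
        return Dense.insert(Index).second;
      }
    };

    int main() {
      SparseDenseFilter F;
      std::cout << F.filterAndAdd(5) << F.filterAndAdd(5)
                << F.filterAndAdd(70000) << F.filterAndAdd(70000)
                << "\n"; // prints 1010
    }

Unlike this sketch, the filter in the patch sizes its sparse and dense storage in one batch per filterAndAdd call, which is where most of its speed comes from.
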
diff --git a/llvm/lib/CodeGen/ModuloSchedule.cpp b/llvm/lib/CodeGen/ModuloSchedule.cpp
index 163e52d9199d..d85b1b7988ce 100644
--- a/llvm/lib/CodeGen/ModuloSchedule.cpp
+++ b/llvm/lib/CodeGen/ModuloSchedule.cpp
@@ -8,6 +8,7 @@
#include "llvm/CodeGen/ModuloSchedule.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
@@ -420,7 +421,7 @@ void ModuloScheduleExpander::generateExistingPhis(
unsigned NewReg = 0;
unsigned AccessStage = (LoopValStage != -1) ? LoopValStage : StageScheduled;
// In the epilog, we may need to look back one stage to get the correct
- // Phi name because the epilog and prolog blocks execute the same stage.
+ // Phi name, because the epilog and prolog blocks execute the same stage.
// The correct name is from the previous block only when the Phi has
// been completely scheduled prior to the epilog, and Phi value is not
// needed in multiple stages.
@@ -913,7 +914,12 @@ bool ModuloScheduleExpander::computeDelta(MachineInstr &MI, unsigned &Delta) {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const MachineOperand *BaseOp;
int64_t Offset;
- if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+ bool OffsetIsScalable;
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable, TRI))
+ return false;
+
+ // FIXME: This algorithm assumes instructions have fixed-size offsets.
+ if (OffsetIsScalable)
return false;
if (!BaseOp->isReg())
@@ -1435,11 +1441,15 @@ Register KernelRewriter::remapUse(Register Reg, MachineInstr &MI) {
// immediately prior to pruning.
auto RC = MRI.getRegClass(Reg);
Register R = MRI.createVirtualRegister(RC);
- BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
- .addReg(IllegalPhiDefault.getValue())
- .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
- .addReg(LoopReg)
- .addMBB(BB); // Block choice is arbitrary and has no effect.
+ MachineInstr *IllegalPhi =
+ BuildMI(*BB, MI, DebugLoc(), TII->get(TargetOpcode::PHI), R)
+ .addReg(IllegalPhiDefault.getValue())
+ .addMBB(PreheaderBB) // Block choice is arbitrary and has no effect.
+ .addReg(LoopReg)
+ .addMBB(BB); // Block choice is arbitrary and has no effect.
+ // Illegal phi should belong to the producer stage so that it can be
+ // filtered correctly during peeling.
+ S.setStage(IllegalPhi, LoopProducerStage);
return R;
}
@@ -1620,18 +1630,21 @@ void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
MachineInstr *MI = &*I++;
if (MI->isPHI()) {
// This is an illegal PHI. If we move any instructions using an illegal
- // PHI, we need to create a legal Phi
- Register PhiR = MI->getOperand(0).getReg();
- auto RC = MRI.getRegClass(PhiR);
- Register NR = MRI.createVirtualRegister(RC);
- MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(), DebugLoc(),
- TII->get(TargetOpcode::PHI), NR)
- .addReg(PhiR)
- .addMBB(SourceBB);
- BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
- CanonicalMIs[NI] = CanonicalMIs[MI];
- Remaps[PhiR] = NR;
- continue;
+ // PHI, we need to create a legal Phi.
+ if (getStage(MI) != Stage) {
+ // The legal Phi is not necessary if the illegal phi's stage
+ // is being moved.
+ Register PhiR = MI->getOperand(0).getReg();
+ auto RC = MRI.getRegClass(PhiR);
+ Register NR = MRI.createVirtualRegister(RC);
+ MachineInstr *NI = BuildMI(*DestBB, DestBB->getFirstNonPHI(),
+ DebugLoc(), TII->get(TargetOpcode::PHI), NR)
+ .addReg(PhiR)
+ .addMBB(SourceBB);
+ BlockMIs[{DestBB, CanonicalMIs[MI]}] = NI;
+ CanonicalMIs[NI] = CanonicalMIs[MI];
+ Remaps[PhiR] = NR;
+ }
}
if (getStage(MI) != Stage)
continue;
@@ -1649,8 +1662,8 @@ void PeelingModuloScheduleExpander::moveStageBetweenBlocks(
// we don't need the phi anymore.
if (getStage(Def) == Stage) {
Register PhiReg = MI.getOperand(0).getReg();
- MRI.replaceRegWith(MI.getOperand(0).getReg(),
- Def->getOperand(0).getReg());
+ assert(Def->findRegisterDefOperandIdx(MI.getOperand(1).getReg()) != -1);
+ MRI.replaceRegWith(MI.getOperand(0).getReg(), MI.getOperand(1).getReg());
MI.getOperand(0).setReg(PhiReg);
PhiToDelete.push_back(&MI);
}
@@ -1698,16 +1711,17 @@ PeelingModuloScheduleExpander::getPhiCanonicalReg(MachineInstr *CanonicalPhi,
MachineInstr *Phi) {
unsigned distance = PhiNodeLoopIteration[Phi];
MachineInstr *CanonicalUse = CanonicalPhi;
+ Register CanonicalUseReg = CanonicalUse->getOperand(0).getReg();
for (unsigned I = 0; I < distance; ++I) {
assert(CanonicalUse->isPHI());
assert(CanonicalUse->getNumOperands() == 5);
unsigned LoopRegIdx = 3, InitRegIdx = 1;
if (CanonicalUse->getOperand(2).getMBB() == CanonicalUse->getParent())
std::swap(LoopRegIdx, InitRegIdx);
- CanonicalUse =
- MRI.getVRegDef(CanonicalUse->getOperand(LoopRegIdx).getReg());
+ CanonicalUseReg = CanonicalUse->getOperand(LoopRegIdx).getReg();
+ CanonicalUse = MRI.getVRegDef(CanonicalUseReg);
}
- return CanonicalUse->getOperand(0).getReg();
+ return CanonicalUseReg;
}
void PeelingModuloScheduleExpander::peelPrologAndEpilogs() {
@@ -1933,7 +1947,7 @@ void PeelingModuloScheduleExpander::fixupBranches() {
SmallVector<MachineOperand, 4> Cond;
TII->removeBranch(*Prolog);
Optional<bool> StaticallyGreater =
- Info->createTripCountGreaterCondition(TC, *Prolog, Cond);
+ LoopInfo->createTripCountGreaterCondition(TC, *Prolog, Cond);
if (!StaticallyGreater.hasValue()) {
LLVM_DEBUG(dbgs() << "Dynamic: TC > " << TC << "\n");
// Dynamically branch based on Cond.
@@ -1961,10 +1975,10 @@ void PeelingModuloScheduleExpander::fixupBranches() {
}
if (!KernelDisposed) {
- Info->adjustTripCount(-(Schedule.getNumStages() - 1));
- Info->setPreheader(Prologs.back());
+ LoopInfo->adjustTripCount(-(Schedule.getNumStages() - 1));
+ LoopInfo->setPreheader(Prologs.back());
} else {
- Info->disposed();
+ LoopInfo->disposed();
}
}
@@ -1977,8 +1991,8 @@ void PeelingModuloScheduleExpander::expand() {
BB = Schedule.getLoop()->getTopBlock();
Preheader = Schedule.getLoop()->getLoopPreheader();
LLVM_DEBUG(Schedule.dump());
- Info = TII->analyzeLoopForPipelining(BB);
- assert(Info);
+ LoopInfo = TII->analyzeLoopForPipelining(BB);
+ assert(LoopInfo);
rewriteKernel();
peelPrologAndEpilogs();
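
In computeDelta above, the new OffsetIsScalable output of getMemOperandWithOffset makes the expander bail out when an offset is not a fixed byte count, since the delta arithmetic only holds for fixed-size offsets. A hedged sketch of that shape, with an invented MemAccess record standing in for the real target query:

    #include <cstdint>
    #include <iostream>
    #include <optional>

    // Invented stand-in for what a target hook might report about an access.
    struct MemAccess {
      int BaseReg;
      int64_t Offset;
      bool OffsetIsScalable; // e.g. scalable-vector addressing
    };

    // Delta between two accesses off the same base, or nothing if it cannot
    // be computed as a plain byte distance.
    static std::optional<int64_t> computeDelta(const MemAccess &A,
                                               const MemAccess &B) {
      if (A.OffsetIsScalable || B.OffsetIsScalable)
        return std::nullopt; // fixed-size reasoning does not apply
      if (A.BaseReg != B.BaseReg)
        return std::nullopt;
      return B.Offset - A.Offset;
    }

    int main() {
      MemAccess A{/*BaseReg=*/1, /*Offset=*/16, /*OffsetIsScalable=*/false};
      MemAccess B{1, 48, false};
      if (auto D = computeDelta(A, B))
        std::cout << "delta=" << *D << "\n"; // delta=32
    }
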
diff --git a/llvm/lib/CodeGen/PHIElimination.cpp b/llvm/lib/CodeGen/PHIElimination.cpp
index 4dd4c4b1084e..311b87fa9e3b 100644
--- a/llvm/lib/CodeGen/PHIElimination.cpp
+++ b/llvm/lib/CodeGen/PHIElimination.cpp
@@ -96,7 +96,8 @@ namespace {
/// Split critical edges where necessary for good coalescer performance.
bool SplitPHIEdges(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineLoopInfo *MLI);
+ MachineLoopInfo *MLI,
+ std::vector<SparseBitVector<>> *LiveInSets);
// These functions are temporary abstractions around LiveVariables and
// LiveIntervals, so they can go away when LiveVariables does.
@@ -151,16 +152,45 @@ bool PHIElimination::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
- // This pass takes the function out of SSA form.
- MRI->leaveSSA();
-
// Split critical edges to help the coalescer.
if (!DisableEdgeSplitting && (LV || LIS)) {
+ // A set of live-in regs for each MBB, used to update LV efficiently even in
+ // large functions.
+ std::vector<SparseBitVector<>> LiveInSets;
+ if (LV) {
+ LiveInSets.resize(MF.size());
+ for (unsigned Index = 0, e = MRI->getNumVirtRegs(); Index != e; ++Index) {
+ // Set the bit for this register for each MBB where it is
+ // live-through or live-in (killed).
+ unsigned VirtReg = Register::index2VirtReg(Index);
+ MachineInstr *DefMI = MRI->getVRegDef(VirtReg);
+ if (!DefMI)
+ continue;
+ LiveVariables::VarInfo &VI = LV->getVarInfo(VirtReg);
+ SparseBitVector<>::iterator AliveBlockItr = VI.AliveBlocks.begin();
+ SparseBitVector<>::iterator EndItr = VI.AliveBlocks.end();
+ while (AliveBlockItr != EndItr) {
+ unsigned BlockNum = *(AliveBlockItr++);
+ LiveInSets[BlockNum].set(Index);
+ }
+ // The register is live into an MBB in which it is killed but not
+ // defined. See comment for VarInfo in LiveVariables.h.
+ MachineBasicBlock *DefMBB = DefMI->getParent();
+ if (VI.Kills.size() > 1 ||
+ (!VI.Kills.empty() && VI.Kills.front()->getParent() != DefMBB))
+ for (auto *MI : VI.Kills)
+ LiveInSets[MI->getParent()->getNumber()].set(Index);
+ }
+ }
+
MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
for (auto &MBB : MF)
- Changed |= SplitPHIEdges(MF, MBB, MLI);
+ Changed |= SplitPHIEdges(MF, MBB, MLI, (LV ? &LiveInSets : nullptr));
}
+ // This pass takes the function out of SSA form.
+ MRI->leaveSSA();
+
// Populate VRegPHIUseCount
analyzePHINodes(MF);
@@ -561,7 +591,8 @@ void PHIElimination::analyzePHINodes(const MachineFunction& MF) {
bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
MachineBasicBlock &MBB,
- MachineLoopInfo *MLI) {
+ MachineLoopInfo *MLI,
+ std::vector<SparseBitVector<>> *LiveInSets) {
if (MBB.empty() || !MBB.front().isPHI() || MBB.isEHPad())
return false; // Quick exit for basic blocks without PHIs.
@@ -628,7 +659,7 @@ bool PHIElimination::SplitPHIEdges(MachineFunction &MF,
}
if (!ShouldSplit && !SplitAllCriticalEdges)
continue;
- if (!PreMBB->SplitCriticalEdge(&MBB, *this)) {
+ if (!PreMBB->SplitCriticalEdge(&MBB, *this, LiveInSets)) {
LLVM_DEBUG(dbgs() << "Failed to split critical edge.\n");
continue;
}
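
The LiveInSets computation added above inverts LiveVariables' per-register information (blocks a vreg is live through, plus kill blocks away from its definition) into one set of live-in registers per block, which SplitCriticalEdge can then consume. A simplified standalone sketch of that inversion (the VarInfo shape here is an invented stand-in, and the kill handling is reduced to "killed outside the defining block"):

    #include <iostream>
    #include <set>
    #include <vector>

    struct VarInfo {
      std::vector<int> AliveBlocks; // blocks the register is live through
      std::vector<int> KillBlocks;  // blocks where it is killed
      int DefBlock;                 // block containing the definition
    };

    int main() {
      const int NumBlocks = 4;
      // Register 0: defined in block 0, live through 1, killed in 2.
      // Register 1: defined and killed inside block 3.
      std::vector<VarInfo> Vars = {{{1}, {2}, 0}, {{}, {3}, 3}};

      std::vector<std::set<int>> LiveInSets(NumBlocks);
      for (int Reg = 0; Reg != (int)Vars.size(); ++Reg) {
        const VarInfo &VI = Vars[Reg];
        for (int B : VI.AliveBlocks)
          LiveInSets[B].insert(Reg); // live-through implies live-in
        for (int B : VI.KillBlocks)
          if (B != VI.DefBlock)
            LiveInSets[B].insert(Reg); // killed away from its def: live-in there
      }

      for (int B = 0; B != NumBlocks; ++B) {
        std::cout << "bb" << B << ":";
        for (int Reg : LiveInSets[B])
          std::cout << " %" << Reg;
        std::cout << "\n";
      }
    }
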
diff --git a/llvm/lib/CodeGen/PHIEliminationUtils.cpp b/llvm/lib/CodeGen/PHIEliminationUtils.cpp
index 3a2cdaf3bd3c..bae96eb84521 100644
--- a/llvm/lib/CodeGen/PHIEliminationUtils.cpp
+++ b/llvm/lib/CodeGen/PHIEliminationUtils.cpp
@@ -26,8 +26,9 @@ llvm::findPHICopyInsertPoint(MachineBasicBlock* MBB, MachineBasicBlock* SuccMBB,
// Usually, we just want to insert the copy before the first terminator
// instruction. However, for the edge going to a landing pad, we must insert
- // the copy before the call/invoke instruction.
- if (!SuccMBB->isEHPad())
+ // the copy before the call/invoke instruction. Similarly for an INLINEASM_BR
+ // going to an indirect target.
+ if (!SuccMBB->isEHPad() && !SuccMBB->isInlineAsmBrIndirectTarget())
return MBB->getFirstTerminator();
// Discover any defs/uses in this basic block.
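
findPHICopyInsertPoint above now treats inlineasm_br indirect-target successors like landing pads: the PHI copy must be placed before the call/INLINEASM_BR rather than merely before the first terminator. A toy sketch of just that dispatch (the real helper also scans the block's defs and uses, which is skipped here):

    #include <cstddef>
    #include <iostream>
    #include <vector>

    struct Inst {
      bool IsTerminator = false;
      bool IsCall = false;
    };

    // Copies normally go right before the first terminator, but if the
    // successor is an EH pad or an inlineasm_br indirect target, they must
    // land before the call/INLINEASM_BR instead.
    static size_t findCopyInsertPoint(const std::vector<Inst> &Block,
                                      bool SuccNeedsEarlyCopy) {
      if (!SuccNeedsEarlyCopy) {
        for (size_t I = 0; I != Block.size(); ++I)
          if (Block[I].IsTerminator)
            return I;
        return Block.size();
      }
      for (size_t I = 0; I != Block.size(); ++I)
        if (Block[I].IsCall)
          return I;
      return Block.size();
    }

    int main() {
      std::vector<Inst> Block = {{}, {false, true}, {true, false}};
      std::cout << findCopyInsertPoint(Block, false) << " "
                << findCopyInsertPoint(Block, true) << "\n"; // prints "2 1"
    }
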
diff --git a/llvm/lib/CodeGen/ParallelCG.cpp b/llvm/lib/CodeGen/ParallelCG.cpp
index 7dbd830666fb..c19ed1f8f71d 100644
--- a/llvm/lib/CodeGen/ParallelCG.cpp
+++ b/llvm/lib/CodeGen/ParallelCG.cpp
@@ -51,7 +51,7 @@ std::unique_ptr<Module> llvm::splitCodeGen(
// Create ThreadPool in nested scope so that threads will be joined
// on destruction.
{
- ThreadPool CodegenThreadPool(OSs.size());
+ ThreadPool CodegenThreadPool(hardware_concurrency(OSs.size()));
int ThreadCount = 0;
SplitModule(
diff --git a/llvm/lib/CodeGen/PatchableFunction.cpp b/llvm/lib/CodeGen/PatchableFunction.cpp
index 1d6069c50554..ca44b7a53982 100644
--- a/llvm/lib/CodeGen/PatchableFunction.cpp
+++ b/llvm/lib/CodeGen/PatchableFunction.cpp
@@ -57,9 +57,9 @@ static bool doesNotGeneratecode(const MachineInstr &MI) {
bool PatchableFunction::runOnMachineFunction(MachineFunction &MF) {
if (MF.getFunction().hasFnAttribute("patchable-function-entry")) {
MachineBasicBlock &FirstMBB = *MF.begin();
- MachineInstr &FirstMI = *FirstMBB.begin();
const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
- BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ // The initial .loc covers PATCHABLE_FUNCTION_ENTER.
+ BuildMI(FirstMBB, FirstMBB.begin(), DebugLoc(),
TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
return true;
}
diff --git a/llvm/lib/CodeGen/PeepholeOptimizer.cpp b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
index c9c279cf0ddf..4a66863ea803 100644
--- a/llvm/lib/CodeGen/PeepholeOptimizer.cpp
+++ b/llvm/lib/CodeGen/PeepholeOptimizer.cpp
@@ -457,12 +457,12 @@ INITIALIZE_PASS_END(PeepholeOptimizer, DEBUG_TYPE,
bool PeepholeOptimizer::
optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
SmallPtrSetImpl<MachineInstr*> &LocalMIs) {
- unsigned SrcReg, DstReg, SubIdx;
+ Register SrcReg, DstReg;
+ unsigned SubIdx;
if (!TII->isCoalescableExtInstr(MI, SrcReg, DstReg, SubIdx))
return false;
- if (Register::isPhysicalRegister(DstReg) ||
- Register::isPhysicalRegister(SrcReg))
+ if (DstReg.isPhysical() || SrcReg.isPhysical())
return false;
if (MRI->hasOneNonDBGUse(SrcReg))
@@ -607,15 +607,16 @@ optimizeExtInstr(MachineInstr &MI, MachineBasicBlock &MBB,
bool PeepholeOptimizer::optimizeCmpInstr(MachineInstr &MI) {
// If this instruction is a comparison against zero and isn't comparing a
// physical register, we can try to optimize it.
- unsigned SrcReg, SrcReg2;
+ Register SrcReg, SrcReg2;
int CmpMask, CmpValue;
if (!TII->analyzeCompare(MI, SrcReg, SrcReg2, CmpMask, CmpValue) ||
- Register::isPhysicalRegister(SrcReg) ||
- (SrcReg2 != 0 && Register::isPhysicalRegister(SrcReg2)))
+ SrcReg.isPhysical() || SrcReg2.isPhysical())
return false;
// Attempt to optimize the comparison instruction.
+ LLVM_DEBUG(dbgs() << "Attempting to optimize compare: " << MI);
if (TII->optimizeCompareInstr(MI, SrcReg, SrcReg2, CmpMask, CmpValue, MRI)) {
+ LLVM_DEBUG(dbgs() << " -> Successfully optimized compare!\n");
++NumCmps;
return true;
}
@@ -636,6 +637,7 @@ bool PeepholeOptimizer::optimizeSelect(MachineInstr &MI,
return false;
if (!TII->optimizeSelect(MI, LocalMIs))
return false;
+ LLVM_DEBUG(dbgs() << "Deleting select: " << MI);
MI.eraseFromParent();
++NumSelects;
return true;
@@ -663,8 +665,8 @@ bool PeepholeOptimizer::findNextSource(RegSubRegPair RegSubReg,
// So far we do not have any motivating example for doing that.
// Thus, instead of maintaining untested code, we will revisit that if
// that changes at some point.
- unsigned Reg = RegSubReg.Reg;
- if (Register::isPhysicalRegister(Reg))
+ Register Reg = RegSubReg.Reg;
+ if (Reg.isPhysical())
return false;
const TargetRegisterClass *DefRC = MRI->getRegClass(Reg);
@@ -1300,6 +1302,7 @@ bool PeepholeOptimizer::optimizeUncoalescableCopy(
}
// MI is now dead.
+ LLVM_DEBUG(dbgs() << "Deleting uncoalescable copy: " << MI);
MI.eraseFromParent();
++NumUncoalescableCopies;
return true;
@@ -1724,6 +1727,7 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
(foldRedundantCopy(*MI, CopySrcRegs, CopySrcMIs) ||
foldRedundantNAPhysCopy(*MI, NAPhysToVirtMIs))) {
LocalMIs.erase(MI);
+ LLVM_DEBUG(dbgs() << "Deleting redundant copy: " << *MI << "\n");
MI->eraseFromParent();
Changed = true;
continue;
@@ -1776,7 +1780,8 @@ bool PeepholeOptimizer::runOnMachineFunction(MachineFunction &MF) {
LocalMIs.erase(MI);
LocalMIs.erase(DefMI);
LocalMIs.insert(FoldMI);
- if (MI->isCall())
+ // Update the call site info.
+ if (MI->shouldUpdateCallSiteInfo())
MI->getMF()->moveCallSiteInfo(MI, FoldMI);
MI->eraseFromParent();
DefMI->eraseFromParent();
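The hunks above migrate raw 'unsigned' register numbers to llvm::Register, which carries the virtual/physical predicates as member functions. A minimal sketch of that API, with a placeholder helper name:

    #include "llvm/CodeGen/Register.h"

    // Reject the pair unless both registers are virtual -- the same shape of
    // check the peephole code performs with isPhysical() above.
    bool bothVirtual(llvm::Register SrcReg, llvm::Register DstReg) {
      if (SrcReg.isPhysical() || DstReg.isPhysical())
        return false;
      return SrcReg.isVirtual() && DstReg.isVirtual();
    }

A default-constructed Register is the null register, and isPhysical() on it is simply false, which is why the explicit 'SrcReg2 != 0' guard could be dropped above.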
diff --git a/llvm/lib/CodeGen/PostRASchedulerList.cpp b/llvm/lib/CodeGen/PostRASchedulerList.cpp
index d68959935cec..b85f00a61eac 100644
--- a/llvm/lib/CodeGen/PostRASchedulerList.cpp
+++ b/llvm/lib/CodeGen/PostRASchedulerList.cpp
@@ -17,11 +17,9 @@
//
//===----------------------------------------------------------------------===//
-#include "AggressiveAntiDepBreaker.h"
-#include "AntiDepBreaker.h"
-#include "CriticalAntiDepBreaker.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/CodeGen/AntiDepBreaker.h"
#include "llvm/CodeGen/LatencyPriorityQueue.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -220,11 +218,11 @@ SchedulePostRATDList::SchedulePostRATDList(
assert((AntiDepMode == TargetSubtargetInfo::ANTIDEP_NONE ||
MRI.tracksLiveness()) &&
"Live-ins must be accurate for anti-dependency breaking");
- AntiDepBreak =
- ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL) ?
- (AntiDepBreaker *)new AggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs) :
- ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_CRITICAL) ?
- (AntiDepBreaker *)new CriticalAntiDepBreaker(MF, RCI) : nullptr));
+ AntiDepBreak = ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_ALL)
+ ? createAggressiveAntiDepBreaker(MF, RCI, CriticalPathRCs)
+ : ((AntiDepMode == TargetSubtargetInfo::ANTIDEP_CRITICAL)
+ ? createCriticalAntiDepBreaker(MF, RCI)
+ : nullptr));
}
SchedulePostRATDList::~SchedulePostRATDList() {
diff --git a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
index 1ff4e7cbd8fb..1be9544848ec 100644
--- a/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
+++ b/llvm/lib/CodeGen/PreISelIntrinsicLowering.cpp
@@ -39,14 +39,14 @@ static bool lowerLoadRelative(Function &F) {
for (auto I = F.use_begin(), E = F.use_end(); I != E;) {
auto CI = dyn_cast<CallInst>(I->getUser());
++I;
- if (!CI || CI->getCalledValue() != &F)
+ if (!CI || CI->getCalledOperand() != &F)
continue;
IRBuilder<> B(CI);
Value *OffsetPtr =
B.CreateGEP(Int8Ty, CI->getArgOperand(0), CI->getArgOperand(1));
Value *OffsetPtrI32 = B.CreateBitCast(OffsetPtr, Int32PtrTy);
- Value *OffsetI32 = B.CreateAlignedLoad(Int32Ty, OffsetPtrI32, 4);
+ Value *OffsetI32 = B.CreateAlignedLoad(Int32Ty, OffsetPtrI32, Align(4));
Value *ResultPtr = B.CreateGEP(Int8Ty, CI->getArgOperand(0), OffsetI32);
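The load above now takes an explicit llvm::Align instead of a raw unsigned, and the call is matched through getCalledOperand(). A condensed sketch of the same IRBuilder sequence, assuming the usual IR headers (the helper name is illustrative):

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    #include "llvm/Support/Alignment.h"

    llvm::Value *loadRelativeOffset(llvm::CallInst *CI) {
      llvm::IRBuilder<> B(CI);
      llvm::Type *Int8Ty = B.getInt8Ty();
      llvm::Type *Int32Ty = B.getInt32Ty();
      llvm::Value *Ptr =
          B.CreateGEP(Int8Ty, CI->getArgOperand(0), CI->getArgOperand(1));
      llvm::Value *PtrI32 = B.CreateBitCast(Ptr, Int32Ty->getPointerTo());
      // Alignment is now a typed Align value rather than a bare '4'.
      return B.CreateAlignedLoad(Int32Ty, PtrI32, llvm::Align(4));
    }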
diff --git a/llvm/lib/CodeGen/PrologEpilogInserter.cpp b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
index 3909b5717281..a489f493d5ee 100644
--- a/llvm/lib/CodeGen/PrologEpilogInserter.cpp
+++ b/llvm/lib/CodeGen/PrologEpilogInserter.cpp
@@ -185,7 +185,7 @@ static void stashEntryDbgValues(MachineBasicBlock &MBB,
break;
if (!MI.isDebugValue() || !MI.getDebugVariable()->isParameter())
continue;
- if (MI.getOperand(0).isFI()) {
+ if (MI.getDebugOperand(0).isFI()) {
// We can only emit valid locations for frame indices after the frame
// setup, so do not stash away them.
FrameIndexValues.push_back(&MI);
@@ -237,7 +237,7 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
stashEntryDbgValues(*SaveBlock, EntryDbgValues);
// Handle CSR spilling and restoring, for targets that need it.
- if (MF.getTarget().usesPhysRegsForPEI())
+ if (MF.getTarget().usesPhysRegsForValues())
spillCalleeSavedRegs(MF);
// Allow the target machine to make final modifications to the function
@@ -259,6 +259,10 @@ bool PEI::runOnMachineFunction(MachineFunction &MF) {
for (auto &I : EntryDbgValues)
I.first->insert(I.first->begin(), I.second.begin(), I.second.end());
+ // Allow the target machine to make final modifications to the function
+ // before the frame layout is finalized.
+ TFI->processFunctionBeforeFrameIndicesReplaced(MF, RS);
+
// Replace all MO_FrameIndex operands with physical register references
// and actual offsets.
//
@@ -434,14 +438,12 @@ static void assignCalleeSavedSpillSlots(MachineFunction &F,
unsigned Size = RegInfo->getSpillSize(*RC);
if (FixedSlot == FixedSpillSlots + NumFixedSpillSlots) {
// Nope, just spill it anywhere convenient.
- unsigned Align = RegInfo->getSpillAlignment(*RC);
- unsigned StackAlign = TFI->getStackAlignment();
-
+ Align Alignment(RegInfo->getSpillAlignment(*RC));
// We may not be able to satisfy the desired alignment specification of
// the TargetRegisterClass if the stack alignment is smaller. Use the
// min.
- Align = std::min(Align, StackAlign);
- FrameIdx = MFI.CreateStackObject(Size, Align, true);
+ Alignment = std::min(Alignment, TFI->getStackAlign());
+ FrameIdx = MFI.CreateStackObject(Size, Alignment, true);
if ((unsigned)FrameIdx < MinCSFrameIndex) MinCSFrameIndex = FrameIdx;
if ((unsigned)FrameIdx > MaxCSFrameIndex) MaxCSFrameIndex = FrameIdx;
} else {
@@ -631,22 +633,21 @@ void PEI::spillCalleeSavedRegs(MachineFunction &MF) {
}
/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
-static inline void
-AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
- bool StackGrowsDown, int64_t &Offset,
- unsigned &MaxAlign, unsigned Skew) {
+static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ Align &MaxAlign, unsigned Skew) {
// If the stack grows down, add the object size to find the lowest address.
if (StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
+ Align Alignment = MFI.getObjectAlign(FrameIdx);
// If the alignment of this object is greater than that of the stack, then
// increase the stack alignment to match.
- MaxAlign = std::max(MaxAlign, Align);
+ MaxAlign = std::max(MaxAlign, Alignment);
// Adjust to alignment boundary.
- Offset = alignTo(Offset, Align, Skew);
+ Offset = alignTo(Offset, Alignment, Skew);
if (StackGrowsDown) {
LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
@@ -706,7 +707,7 @@ computeFreeStackSlots(MachineFrameInfo &MFI, bool StackGrowsDown,
/// Assign frame object to an unused portion of the stack in the fixed stack
/// object range. Return true if the allocation was successful.
static inline bool scavengeStackSlot(MachineFrameInfo &MFI, int FrameIdx,
- bool StackGrowsDown, unsigned MaxAlign,
+ bool StackGrowsDown, Align MaxAlign,
BitVector &StackBytesFree) {
if (MFI.isVariableSizedObjectIndex(FrameIdx))
return false;
@@ -718,7 +719,7 @@ static inline bool scavengeStackSlot(MachineFrameInfo &MFI, int FrameIdx,
return false;
}
- unsigned ObjAlign = MFI.getObjectAlignment(FrameIdx);
+ Align ObjAlign = MFI.getObjectAlign(FrameIdx);
if (ObjAlign > MaxAlign)
return false;
@@ -765,11 +766,11 @@ static inline bool scavengeStackSlot(MachineFrameInfo &MFI, int FrameIdx,
/// AssignProtectedObjSet - Helper function to assign large stack objects (i.e.,
/// those required to be close to the Stack Protector) to stack offsets.
-static void
-AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
- SmallSet<int, 16> &ProtectedObjs,
- MachineFrameInfo &MFI, bool StackGrowsDown,
- int64_t &Offset, unsigned &MaxAlign, unsigned Skew) {
+static void AssignProtectedObjSet(const StackObjSet &UnassignedObjs,
+ SmallSet<int, 16> &ProtectedObjs,
+ MachineFrameInfo &MFI, bool StackGrowsDown,
+ int64_t &Offset, Align &MaxAlign,
+ unsigned Skew) {
for (StackObjSet::const_iterator I = UnassignedObjs.begin(),
E = UnassignedObjs.end(); I != E; ++I) {
@@ -807,7 +808,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
for (unsigned i = 0, e = MFI.getObjectIndexEnd(); i != e; ++i)
if (!MFI.isDeadObjectIndex(i) &&
MFI.getStackID(i) == TargetStackID::Default)
- assert(MFI.getObjectAlignment(i) <= MFI.getMaxAlignment() &&
+ assert(MFI.getObjectAlign(i) <= MFI.getMaxAlign() &&
"MaxAlignment is invalid");
#endif
@@ -846,9 +847,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
// address of the object.
Offset += MFI.getObjectSize(i);
- unsigned Align = MFI.getObjectAlignment(i);
// Adjust to alignment boundary
- Offset = alignTo(Offset, Align, Skew);
+ Offset = alignTo(Offset, MFI.getObjectAlign(i), Skew);
LLVM_DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << -Offset << "]\n");
MFI.setObjectOffset(i, -Offset); // Set the computed offset
@@ -863,9 +863,8 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
if (MFI.isDeadObjectIndex(i))
continue;
- unsigned Align = MFI.getObjectAlignment(i);
// Adjust to alignment boundary
- Offset = alignTo(Offset, Align, Skew);
+ Offset = alignTo(Offset, MFI.getObjectAlign(i), Skew);
LLVM_DEBUG(dbgs() << "alloc FI(" << i << ") at SP[" << Offset << "]\n");
MFI.setObjectOffset(i, Offset);
@@ -876,7 +875,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
// FixedCSEnd is the stack offset to the end of the fixed and callee-save
// stack area.
int64_t FixedCSEnd = Offset;
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
// Make sure the special register scavenging spill slot is closest to the
// incoming stack pointer if a frame pointer is required and is closer
@@ -899,10 +898,10 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
// frame index registers. Functions which don't want/need this optimization
// will continue to use the existing code path.
if (MFI.getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI.getLocalFrameMaxAlign().value();
+ Align Alignment = MFI.getLocalFrameMaxAlign();
// Adjust to alignment boundary.
- Offset = alignTo(Offset, Align, Skew);
+ Offset = alignTo(Offset, Alignment, Skew);
LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
@@ -917,7 +916,7 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
// Allocate the local block
Offset += MFI.getLocalFrameSize();
- MaxAlign = std::max(Align, MaxAlign);
+ MaxAlign = std::max(Alignment, MaxAlign);
}
// Retrieve the Exception Handler registration node.
@@ -1068,12 +1067,12 @@ void PEI::calculateFrameObjectOffsets(MachineFunction &MF) {
// ensure that the callee's frame or the alloca data is suitably aligned;
// otherwise, for leaf functions, align to the TransientStackAlignment
// value.
- unsigned StackAlign;
+ Align StackAlign;
if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
(RegInfo->needsStackRealignment(MF) && MFI.getObjectIndexEnd() != 0))
- StackAlign = TFI.getStackAlignment();
+ StackAlign = TFI.getStackAlign();
else
- StackAlign = TFI.getTransientStackAlignment();
+ StackAlign = TFI.getTransientStackAlign();
// If the frame pointer is eliminated, all frame offsets will be relative to
// SP not FP. Align to MaxAlign so this works.
@@ -1206,7 +1205,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
if (MI.isDebugValue()) {
assert(i == 0 && "Frame indices can only appear as the first "
"operand of a DBG_VALUE machine instruction");
- unsigned Reg;
+ Register Reg;
unsigned FrameIdx = MI.getOperand(0).getIndex();
unsigned Size = MF.getFrameInfo().getObjectSize(FrameIdx);
@@ -1235,10 +1234,10 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
bool WithStackValue = true;
DIExpr = DIExpression::prependOpcodes(DIExpr, Ops, WithStackValue);
// Make the DBG_VALUE direct.
- MI.getOperand(1).ChangeToRegister(0, false);
+ MI.getDebugOffset().ChangeToRegister(0, false);
}
DIExpr = DIExpression::prepend(DIExpr, PrependFlags, Offset);
- MI.getOperand(3).setMetadata(DIExpr);
+ MI.getDebugExpressionOp().setMetadata(DIExpr);
continue;
}
@@ -1251,7 +1250,7 @@ void PEI::replaceFrameIndices(MachineBasicBlock *BB, MachineFunction &MF,
assert((!MI.isDebugValue() || i == 0) &&
"Frame indicies can only appear as the first operand of a "
"DBG_VALUE machine instruction");
- unsigned Reg;
+ Register Reg;
MachineOperand &Offset = MI.getOperand(i + 1);
int refOffset = TFI->getFrameIndexReferencePreferSP(
MF, MI.getOperand(i).getIndex(), Reg, /*IgnoreSPUpdates*/ false);
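The PrologEpilogInserter hunks above replace 'unsigned' alignments with the typed llvm::Align and use alignTo() for the rounding. A small sketch of that arithmetic, assuming llvm/Support/Alignment.h (function name and parameters are illustrative):

    #include "llvm/Support/Alignment.h"
    #include <algorithm>
    #include <cstdint>

    int64_t placeObject(int64_t Offset, uint64_t Size, llvm::Align ObjAlign,
                        llvm::Align StackAlign, llvm::Align &MaxAlign) {
      // Clamp the requested alignment to what the stack provides, as
      // assignCalleeSavedSpillSlots does above.
      llvm::Align A = std::min(ObjAlign, StackAlign);
      // Track the largest alignment seen so the final frame can honour it.
      MaxAlign = std::max(MaxAlign, A);
      // Advance past the object and round up to the alignment boundary.
      return llvm::alignTo(Offset + Size, A);
    }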
diff --git a/llvm/lib/Target/Hexagon/RDFGraph.cpp b/llvm/lib/CodeGen/RDFGraph.cpp
index 0cb35dc98819..437a6b030096 100644
--- a/llvm/lib/Target/Hexagon/RDFGraph.cpp
+++ b/llvm/lib/CodeGen/RDFGraph.cpp
@@ -8,8 +8,6 @@
//
// Target-independent, SSA-based data flow graph for register data flow (RDF).
//
-#include "RDFGraph.h"
-#include "RDFRegisters.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -20,6 +18,8 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -753,8 +753,10 @@ RegisterSet DataFlowGraph::getLandingPadLiveIns() const {
const TargetLowering &TLI = *MF.getSubtarget().getTargetLowering();
if (RegisterId R = TLI.getExceptionPointerRegister(PF))
LR.insert(RegisterRef(R));
- if (RegisterId R = TLI.getExceptionSelectorRegister(PF))
- LR.insert(RegisterRef(R));
+ if (!isFuncletEHPersonality(classifyEHPersonality(PF))) {
+ if (RegisterId R = TLI.getExceptionSelectorRegister(PF))
+ LR.insert(RegisterRef(R));
+ }
return LR;
}
diff --git a/llvm/lib/Target/Hexagon/RDFLiveness.cpp b/llvm/lib/CodeGen/RDFLiveness.cpp
index e2c007c9d01a..0bcd27f8ea45 100644
--- a/llvm/lib/Target/Hexagon/RDFLiveness.cpp
+++ b/llvm/lib/CodeGen/RDFLiveness.cpp
@@ -22,9 +22,6 @@
// and Embedded Architectures and Compilers", 8 (4),
// <10.1145/2086696.2086706>. <hal-00647369>
//
-#include "RDFLiveness.h"
-#include "RDFGraph.h"
-#include "RDFRegisters.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -33,6 +30,9 @@
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/llvm/lib/Target/Hexagon/RDFRegisters.cpp b/llvm/lib/CodeGen/RDFRegisters.cpp
index b5675784e34b..bd8661816e71 100644
--- a/llvm/lib/Target/Hexagon/RDFRegisters.cpp
+++ b/llvm/lib/CodeGen/RDFRegisters.cpp
@@ -6,11 +6,11 @@
//
//===----------------------------------------------------------------------===//
-#include "RDFRegisters.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/LaneBitmask.h"
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
index 3c1f9905afd0..5bd8b4b8e27f 100644
--- a/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
+++ b/llvm/lib/CodeGen/ReachingDefAnalysis.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallSet.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -20,10 +21,27 @@ char ReachingDefAnalysis::ID = 0;
INITIALIZE_PASS(ReachingDefAnalysis, DEBUG_TYPE, "ReachingDefAnalysis", false,
true)
-void ReachingDefAnalysis::enterBasicBlock(
- const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+static bool isValidReg(const MachineOperand &MO) {
+ return MO.isReg() && MO.getReg();
+}
- MachineBasicBlock *MBB = TraversedMBB.MBB;
+static bool isValidRegUse(const MachineOperand &MO) {
+ return isValidReg(MO) && MO.isUse();
+}
+
+static bool isValidRegUseOf(const MachineOperand &MO, int PhysReg) {
+ return isValidRegUse(MO) && MO.getReg() == PhysReg;
+}
+
+static bool isValidRegDef(const MachineOperand &MO) {
+ return isValidReg(MO) && MO.isDef();
+}
+
+static bool isValidRegDefOf(const MachineOperand &MO, int PhysReg) {
+ return isValidRegDef(MO) && MO.getReg() == PhysReg;
+}
+
+void ReachingDefAnalysis::enterBasicBlock(MachineBasicBlock *MBB) {
unsigned MBBNumber = MBB->getNumber();
assert(MBBNumber < MBBReachingDefs.size() &&
"Unexpected basic block number.");
@@ -44,8 +62,10 @@ void ReachingDefAnalysis::enterBasicBlock(
// Treat function live-ins as if they were defined just before the first
// instruction. Usually, function arguments are set up immediately
// before the call.
- LiveRegs[*Unit] = -1;
- MBBReachingDefs[MBBNumber][*Unit].push_back(LiveRegs[*Unit]);
+ if (LiveRegs[*Unit] != -1) {
+ LiveRegs[*Unit] = -1;
+ MBBReachingDefs[MBBNumber][*Unit].push_back(-1);
+ }
}
}
LLVM_DEBUG(dbgs() << printMBBReference(*MBB) << ": entry\n");
@@ -62,23 +82,20 @@ void ReachingDefAnalysis::enterBasicBlock(
if (Incoming.empty())
continue;
- for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit) {
- // Use the most recent predecessor def for each register.
+ // Find the most recent reaching definition from a predecessor.
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit)
LiveRegs[Unit] = std::max(LiveRegs[Unit], Incoming[Unit]);
- if ((LiveRegs[Unit] != ReachingDefDefaultVal))
- MBBReachingDefs[MBBNumber][Unit].push_back(LiveRegs[Unit]);
- }
}
- LLVM_DEBUG(dbgs() << printMBBReference(*MBB)
- << (!TraversedMBB.IsDone ? ": incomplete\n"
- : ": all preds known\n"));
+ // Insert the most recent reaching definition we found.
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit)
+ if (LiveRegs[Unit] != ReachingDefDefaultVal)
+ MBBReachingDefs[MBBNumber][Unit].push_back(LiveRegs[Unit]);
}
-void ReachingDefAnalysis::leaveBasicBlock(
- const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
+void ReachingDefAnalysis::leaveBasicBlock(MachineBasicBlock *MBB) {
assert(!LiveRegs.empty() && "Must enter basic block first.");
- unsigned MBBNumber = TraversedMBB.MBB->getNumber();
+ unsigned MBBNumber = MBB->getNumber();
assert(MBBNumber < MBBOutRegsInfos.size() &&
"Unexpected basic block number.");
// Save register clearances at end of MBB - used by enterBasicBlock().
@@ -89,7 +106,8 @@ void ReachingDefAnalysis::leaveBasicBlock(
// only cares about the clearance from the end of the block, so adjust
// everything to be relative to the end of the basic block.
for (int &OutLiveReg : MBBOutRegsInfos[MBBNumber])
- OutLiveReg -= CurInstr;
+ if (OutLiveReg != ReachingDefDefaultVal)
+ OutLiveReg -= CurInstr;
LiveRegs.clear();
}
@@ -99,79 +117,146 @@ void ReachingDefAnalysis::processDefs(MachineInstr *MI) {
unsigned MBBNumber = MI->getParent()->getNumber();
assert(MBBNumber < MBBReachingDefs.size() &&
"Unexpected basic block number.");
- const MCInstrDesc &MCID = MI->getDesc();
- for (unsigned i = 0,
- e = MI->isVariadic() ? MI->getNumOperands() : MCID.getNumDefs();
- i != e; ++i) {
- MachineOperand &MO = MI->getOperand(i);
- if (!MO.isReg() || !MO.getReg())
- continue;
- if (MO.isUse())
+
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegDef(MO))
continue;
for (MCRegUnitIterator Unit(MO.getReg(), TRI); Unit.isValid(); ++Unit) {
// This instruction explicitly defines the current reg unit.
- LLVM_DEBUG(dbgs() << printReg(MO.getReg(), TRI) << ":\t" << CurInstr
+ LLVM_DEBUG(dbgs() << printReg(*Unit, TRI) << ":\t" << CurInstr
<< '\t' << *MI);
// How many instructions since this reg unit was last written?
- LiveRegs[*Unit] = CurInstr;
- MBBReachingDefs[MBBNumber][*Unit].push_back(CurInstr);
+ if (LiveRegs[*Unit] != CurInstr) {
+ LiveRegs[*Unit] = CurInstr;
+ MBBReachingDefs[MBBNumber][*Unit].push_back(CurInstr);
+ }
}
}
InstIds[MI] = CurInstr;
++CurInstr;
}
+void ReachingDefAnalysis::reprocessBasicBlock(MachineBasicBlock *MBB) {
+ unsigned MBBNumber = MBB->getNumber();
+ assert(MBBNumber < MBBReachingDefs.size() &&
+ "Unexpected basic block number.");
+
+ // Count number of non-debug instructions for end of block adjustment.
+ int NumInsts = 0;
+ for (const MachineInstr &MI : *MBB)
+ if (!MI.isDebugInstr())
+ NumInsts++;
+
+ // When reprocessing a block, the only thing we need to do is check whether
+ // there is now a more recent incoming reaching definition from a predecessor.
+ for (MachineBasicBlock *pred : MBB->predecessors()) {
+ assert(unsigned(pred->getNumber()) < MBBOutRegsInfos.size() &&
+ "Should have pre-allocated MBBInfos for all MBBs");
+ const LiveRegsDefInfo &Incoming = MBBOutRegsInfos[pred->getNumber()];
+ // Incoming may be empty for dead predecessors.
+ if (Incoming.empty())
+ continue;
+
+ for (unsigned Unit = 0; Unit != NumRegUnits; ++Unit) {
+ int Def = Incoming[Unit];
+ if (Def == ReachingDefDefaultVal)
+ continue;
+
+ auto Start = MBBReachingDefs[MBBNumber][Unit].begin();
+ if (Start != MBBReachingDefs[MBBNumber][Unit].end() && *Start < 0) {
+ if (*Start >= Def)
+ continue;
+
+ // Update existing reaching def from predecessor to a more recent one.
+ *Start = Def;
+ } else {
+ // Insert new reaching def from predecessor.
+ MBBReachingDefs[MBBNumber][Unit].insert(Start, Def);
+ }
+
+ // Update reaching def at end of BB. Keep in mind that these are
+ // adjusted relative to the end of the basic block.
+ if (MBBOutRegsInfos[MBBNumber][Unit] < Def - NumInsts)
+ MBBOutRegsInfos[MBBNumber][Unit] = Def - NumInsts;
+ }
+ }
+}
+
void ReachingDefAnalysis::processBasicBlock(
const LoopTraversal::TraversedMBBInfo &TraversedMBB) {
- enterBasicBlock(TraversedMBB);
- for (MachineInstr &MI : *TraversedMBB.MBB) {
+ MachineBasicBlock *MBB = TraversedMBB.MBB;
+ LLVM_DEBUG(dbgs() << printMBBReference(*MBB)
+ << (!TraversedMBB.IsDone ? ": incomplete\n"
+ : ": all preds known\n"));
+
+ if (!TraversedMBB.PrimaryPass) {
+ // Reprocess MBB that is part of a loop.
+ reprocessBasicBlock(MBB);
+ return;
+ }
+
+ enterBasicBlock(MBB);
+ for (MachineInstr &MI : *MBB) {
if (!MI.isDebugInstr())
processDefs(&MI);
}
- leaveBasicBlock(TraversedMBB);
+ leaveBasicBlock(MBB);
}
bool ReachingDefAnalysis::runOnMachineFunction(MachineFunction &mf) {
MF = &mf;
TRI = MF->getSubtarget().getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "********** REACHING DEFINITION ANALYSIS **********\n");
+ init();
+ traverse();
+ return false;
+}
+void ReachingDefAnalysis::releaseMemory() {
+ // Clear the internal vectors.
+ MBBOutRegsInfos.clear();
+ MBBReachingDefs.clear();
+ InstIds.clear();
LiveRegs.clear();
- NumRegUnits = TRI->getNumRegUnits();
-
- MBBReachingDefs.resize(mf.getNumBlockIDs());
+}
- LLVM_DEBUG(dbgs() << "********** REACHING DEFINITION ANALYSIS **********\n");
+void ReachingDefAnalysis::reset() {
+ releaseMemory();
+ init();
+ traverse();
+}
+void ReachingDefAnalysis::init() {
+ NumRegUnits = TRI->getNumRegUnits();
+ MBBReachingDefs.resize(MF->getNumBlockIDs());
// Initialize the MBBOutRegsInfos
- MBBOutRegsInfos.resize(mf.getNumBlockIDs());
+ MBBOutRegsInfos.resize(MF->getNumBlockIDs());
+ LoopTraversal Traversal;
+ TraversedMBBOrder = Traversal.traverse(*MF);
+}
+void ReachingDefAnalysis::traverse() {
// Traverse the basic blocks.
- LoopTraversal Traversal;
- LoopTraversal::TraversalOrder TraversedMBBOrder = Traversal.traverse(mf);
- for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder) {
+ for (LoopTraversal::TraversedMBBInfo TraversedMBB : TraversedMBBOrder)
processBasicBlock(TraversedMBB);
- }
-
- // Sorting all reaching defs found for a ceartin reg unit in a given BB.
+#ifndef NDEBUG
+ // Make sure reaching defs are sorted and unique.
for (MBBDefsInfo &MBBDefs : MBBReachingDefs) {
- for (MBBRegUnitDefs &RegUnitDefs : MBBDefs)
- llvm::sort(RegUnitDefs);
+ for (MBBRegUnitDefs &RegUnitDefs : MBBDefs) {
+ int LastDef = ReachingDefDefaultVal;
+ for (int Def : RegUnitDefs) {
+ assert(Def > LastDef && "Defs must be sorted and unique");
+ LastDef = Def;
+ }
+ }
}
-
- return false;
-}
-
-void ReachingDefAnalysis::releaseMemory() {
- // Clear the internal vectors.
- MBBOutRegsInfos.clear();
- MBBReachingDefs.clear();
- InstIds.clear();
+#endif
}
-int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) {
+int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) const {
assert(InstIds.count(MI) && "Unexpected machine instuction.");
- int InstId = InstIds[MI];
+ int InstId = InstIds.lookup(MI);
int DefRes = ReachingDefDefaultVal;
unsigned MBBNumber = MI->getParent()->getNumber();
assert(MBBNumber < MBBReachingDefs.size() &&
@@ -188,12 +273,13 @@ int ReachingDefAnalysis::getReachingDef(MachineInstr *MI, int PhysReg) {
return LatestDef;
}
-MachineInstr* ReachingDefAnalysis::getReachingMIDef(MachineInstr *MI, int PhysReg) {
+MachineInstr* ReachingDefAnalysis::getReachingLocalMIDef(MachineInstr *MI,
+ int PhysReg) const {
return getInstFromId(MI->getParent(), getReachingDef(MI, PhysReg));
}
bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
- int PhysReg) {
+ int PhysReg) const {
MachineBasicBlock *ParentA = A->getParent();
MachineBasicBlock *ParentB = B->getParent();
if (ParentA != ParentB)
@@ -203,7 +289,7 @@ bool ReachingDefAnalysis::hasSameReachingDef(MachineInstr *A, MachineInstr *B,
}
MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
- int InstId) {
+ int InstId) const {
assert(static_cast<size_t>(MBB->getNumber()) < MBBReachingDefs.size() &&
"Unexpected basic block number.");
assert(InstId < static_cast<int>(MBB->size()) &&
@@ -213,45 +299,156 @@ MachineInstr *ReachingDefAnalysis::getInstFromId(MachineBasicBlock *MBB,
return nullptr;
for (auto &MI : *MBB) {
- if (InstIds.count(&MI) && InstIds[&MI] == InstId)
+ auto F = InstIds.find(&MI);
+ if (F != InstIds.end() && F->second == InstId)
return &MI;
}
+
return nullptr;
}
-int ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) {
+int
+ReachingDefAnalysis::getClearance(MachineInstr *MI, MCPhysReg PhysReg) const {
assert(InstIds.count(MI) && "Unexpected machine instuction.");
- return InstIds[MI] - getReachingDef(MI, PhysReg);
+ return InstIds.lookup(MI) - getReachingDef(MI, PhysReg);
+}
+
+bool
+ReachingDefAnalysis::hasLocalDefBefore(MachineInstr *MI, int PhysReg) const {
+ return getReachingDef(MI, PhysReg) >= 0;
}
void ReachingDefAnalysis::getReachingLocalUses(MachineInstr *Def, int PhysReg,
- SmallVectorImpl<MachineInstr*> &Uses) {
+ InstSet &Uses) const {
MachineBasicBlock *MBB = Def->getParent();
MachineBasicBlock::iterator MI = MachineBasicBlock::iterator(Def);
while (++MI != MBB->end()) {
+ if (MI->isDebugInstr())
+ continue;
+
// If/when we find a new reaching def, we know that there's no more uses
// of 'Def'.
- if (getReachingMIDef(&*MI, PhysReg) != Def)
+ if (getReachingLocalMIDef(&*MI, PhysReg) != Def)
return;
for (auto &MO : MI->operands()) {
- if (!MO.isReg() || !MO.isUse() || MO.getReg() != PhysReg)
+ if (!isValidRegUseOf(MO, PhysReg))
continue;
- Uses.push_back(&*MI);
+ Uses.insert(&*MI);
if (MO.isKill())
return;
}
}
}
-unsigned ReachingDefAnalysis::getNumUses(MachineInstr *Def, int PhysReg) {
- SmallVector<MachineInstr*, 4> Uses;
- getReachingLocalUses(Def, PhysReg, Uses);
- return Uses.size();
+bool
+ReachingDefAnalysis::getLiveInUses(MachineBasicBlock *MBB, int PhysReg,
+ InstSet &Uses) const {
+ for (auto &MI : *MBB) {
+ if (MI.isDebugInstr())
+ continue;
+ for (auto &MO : MI.operands()) {
+ if (!isValidRegUseOf(MO, PhysReg))
+ continue;
+ if (getReachingDef(&MI, PhysReg) >= 0)
+ return false;
+ Uses.insert(&MI);
+ }
+ }
+ return isReachingDefLiveOut(&MBB->back(), PhysReg);
+}
+
+void
+ReachingDefAnalysis::getGlobalUses(MachineInstr *MI, int PhysReg,
+ InstSet &Uses) const {
+ MachineBasicBlock *MBB = MI->getParent();
+
+ // Collect the uses that each def touches within the block.
+ getReachingLocalUses(MI, PhysReg, Uses);
+
+ // Handle live-out values.
+ if (auto *LiveOut = getLocalLiveOutMIDef(MI->getParent(), PhysReg)) {
+ if (LiveOut != MI)
+ return;
+
+ SmallVector<MachineBasicBlock*, 4> ToVisit;
+ ToVisit.insert(ToVisit.begin(), MBB->successors().begin(),
+ MBB->successors().end());
+ SmallPtrSet<MachineBasicBlock*, 4>Visited;
+ while (!ToVisit.empty()) {
+ MachineBasicBlock *MBB = ToVisit.back();
+ ToVisit.pop_back();
+ if (Visited.count(MBB) || !MBB->isLiveIn(PhysReg))
+ continue;
+ if (getLiveInUses(MBB, PhysReg, Uses))
+ ToVisit.insert(ToVisit.end(), MBB->successors().begin(),
+ MBB->successors().end());
+ Visited.insert(MBB);
+ }
+ }
+}
+
+void ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg,
+ InstSet &Defs) const {
+ SmallPtrSet<MachineBasicBlock*, 2> VisitedBBs;
+ getLiveOuts(MBB, PhysReg, Defs, VisitedBBs);
+}
+
+void
+ReachingDefAnalysis::getLiveOuts(MachineBasicBlock *MBB, int PhysReg,
+ InstSet &Defs, BlockSet &VisitedBBs) const {
+ if (VisitedBBs.count(MBB))
+ return;
+
+ VisitedBBs.insert(MBB);
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(*MBB);
+ if (!LiveRegs.contains(PhysReg))
+ return;
+
+ if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
+ Defs.insert(Def);
+ else
+ for (auto *Pred : MBB->predecessors())
+ getLiveOuts(Pred, PhysReg, Defs, VisitedBBs);
+}
+
+MachineInstr *ReachingDefAnalysis::getUniqueReachingMIDef(MachineInstr *MI,
+ int PhysReg) const {
+ // If there's a local def before MI, return it.
+ MachineInstr *LocalDef = getReachingLocalMIDef(MI, PhysReg);
+ if (LocalDef && InstIds.lookup(LocalDef) < InstIds.lookup(MI))
+ return LocalDef;
+
+ SmallPtrSet<MachineBasicBlock*, 4> VisitedBBs;
+ SmallPtrSet<MachineInstr*, 2> Incoming;
+ for (auto *Pred : MI->getParent()->predecessors())
+ getLiveOuts(Pred, PhysReg, Incoming, VisitedBBs);
+
+ // If we have a local def and an incoming instruction, then there's not a
+ // unique instruction def.
+ if (!Incoming.empty() && LocalDef)
+ return nullptr;
+ else if (Incoming.size() == 1)
+ return *Incoming.begin();
+ else
+ return LocalDef;
}
-bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) {
+MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
+ unsigned Idx) const {
+ assert(MI->getOperand(Idx).isReg() && "Expected register operand");
+ return getUniqueReachingMIDef(MI, MI->getOperand(Idx).getReg());
+}
+
+MachineInstr *ReachingDefAnalysis::getMIOperand(MachineInstr *MI,
+ MachineOperand &MO) const {
+ assert(MO.isReg() && "Expected register operand");
+ return getUniqueReachingMIDef(MI, MO.getReg());
+}
+
+bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) const {
MachineBasicBlock *MBB = MI->getParent();
LivePhysRegs LiveRegs(*TRI);
LiveRegs.addLiveOuts(*MBB);
@@ -265,12 +462,25 @@ bool ReachingDefAnalysis::isRegUsedAfter(MachineInstr *MI, int PhysReg) {
for (auto Last = MBB->rbegin(), End = MBB->rend(); Last != End; ++Last) {
LiveRegs.stepBackward(*Last);
if (LiveRegs.contains(PhysReg))
- return InstIds[&*Last] > InstIds[MI];
+ return InstIds.lookup(&*Last) > InstIds.lookup(MI);
}
return false;
}
-bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, int PhysReg) {
+bool ReachingDefAnalysis::isRegDefinedAfter(MachineInstr *MI,
+ int PhysReg) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ if (getReachingDef(MI, PhysReg) != getReachingDef(&MBB->back(), PhysReg))
+ return true;
+
+ if (auto *Def = getLocalLiveOutMIDef(MBB, PhysReg))
+ return Def == getReachingLocalMIDef(MI, PhysReg);
+
+ return false;
+}
+
+bool
+ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, int PhysReg) const {
MachineBasicBlock *MBB = MI->getParent();
LivePhysRegs LiveRegs(*TRI);
LiveRegs.addLiveOuts(*MBB);
@@ -284,14 +494,14 @@ bool ReachingDefAnalysis::isReachingDefLiveOut(MachineInstr *MI, int PhysReg) {
// Finally check that the last instruction doesn't redefine the register.
for (auto &MO : Last->operands())
- if (MO.isReg() && MO.isDef() && MO.getReg() == PhysReg)
+ if (isValidRegDefOf(MO, PhysReg))
return false;
return true;
}
MachineInstr* ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
- int PhysReg) {
+ int PhysReg) const {
LivePhysRegs LiveRegs(*TRI);
LiveRegs.addLiveOuts(*MBB);
if (!LiveRegs.contains(PhysReg))
@@ -300,33 +510,168 @@ MachineInstr* ReachingDefAnalysis::getLocalLiveOutMIDef(MachineBasicBlock *MBB,
MachineInstr *Last = &MBB->back();
int Def = getReachingDef(Last, PhysReg);
for (auto &MO : Last->operands())
- if (MO.isReg() && MO.isDef() && MO.getReg() == PhysReg)
+ if (isValidRegDefOf(MO, PhysReg))
return Last;
return Def < 0 ? nullptr : getInstFromId(MBB, Def);
}
-MachineInstr *ReachingDefAnalysis::getInstWithUseBefore(MachineInstr *MI,
- int PhysReg) {
- auto I = MachineBasicBlock::reverse_iterator(MI);
- auto E = MI->getParent()->rend();
- I++;
+static bool mayHaveSideEffects(MachineInstr &MI) {
+ return MI.mayLoadOrStore() || MI.mayRaiseFPException() ||
+ MI.hasUnmodeledSideEffects() || MI.isTerminator() ||
+ MI.isCall() || MI.isBarrier() || MI.isBranch() || MI.isReturn();
+}
+
+// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
+// not define a register that is used by any instructions after and including
+// 'To'. These instructions also must not redefine any of From's operands.
+template<typename Iterator>
+bool ReachingDefAnalysis::isSafeToMove(MachineInstr *From,
+ MachineInstr *To) const {
+ if (From->getParent() != To->getParent())
+ return false;
+
+ SmallSet<int, 2> Defs;
+ // First check that From would compute the same value if moved.
+ for (auto &MO : From->operands()) {
+ if (!isValidReg(MO))
+ continue;
+ if (MO.isDef())
+ Defs.insert(MO.getReg());
+ else if (!hasSameReachingDef(From, To, MO.getReg()))
+ return false;
+ }
- for ( ; I != E; I++)
+ // Now walk checking that the rest of the instructions will compute the same
+ // value and that we're not overwriting anything. Don't move the instruction
+ // past any memory, control-flow or other ambiguous instructions.
+ for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
+ if (mayHaveSideEffects(*I))
+ return false;
for (auto &MO : I->operands())
- if (MO.isReg() && MO.isUse() && MO.getReg() == PhysReg)
- return &*I;
+ if (MO.isReg() && MO.getReg() && Defs.count(MO.getReg()))
+ return false;
+ }
+ return true;
+}
- return nullptr;
+bool ReachingDefAnalysis::isSafeToMoveForwards(MachineInstr *From,
+ MachineInstr *To) const {
+ return isSafeToMove<MachineBasicBlock::reverse_iterator>(From, To);
+}
+
+bool ReachingDefAnalysis::isSafeToMoveBackwards(MachineInstr *From,
+ MachineInstr *To) const {
+ return isSafeToMove<MachineBasicBlock::iterator>(From, To);
+}
+
+bool ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI,
+ InstSet &ToRemove) const {
+ SmallPtrSet<MachineInstr*, 1> Ignore;
+ SmallPtrSet<MachineInstr*, 2> Visited;
+ return isSafeToRemove(MI, Visited, ToRemove, Ignore);
+}
+
+bool
+ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &ToRemove,
+ InstSet &Ignore) const {
+ SmallPtrSet<MachineInstr*, 2> Visited;
+ return isSafeToRemove(MI, Visited, ToRemove, Ignore);
+}
+
+bool
+ReachingDefAnalysis::isSafeToRemove(MachineInstr *MI, InstSet &Visited,
+ InstSet &ToRemove, InstSet &Ignore) const {
+ if (Visited.count(MI) || Ignore.count(MI))
+ return true;
+ else if (mayHaveSideEffects(*MI)) {
+ // Unless told to ignore the instruction, don't remove anything which has
+ // side effects.
+ return false;
+ }
+
+ Visited.insert(MI);
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegDef(MO))
+ continue;
+
+ SmallPtrSet<MachineInstr*, 4> Uses;
+ getGlobalUses(MI, MO.getReg(), Uses);
+
+ for (auto I : Uses) {
+ if (Ignore.count(I) || ToRemove.count(I))
+ continue;
+ if (!isSafeToRemove(I, Visited, ToRemove, Ignore))
+ return false;
+ }
+ }
+ ToRemove.insert(MI);
+ return true;
+}
+
+void ReachingDefAnalysis::collectKilledOperands(MachineInstr *MI,
+ InstSet &Dead) const {
+ Dead.insert(MI);
+ auto IsDead = [this, &Dead](MachineInstr *Def, int PhysReg) {
+ unsigned LiveDefs = 0;
+ for (auto &MO : Def->operands()) {
+ if (!isValidRegDef(MO))
+ continue;
+ if (!MO.isDead())
+ ++LiveDefs;
+ }
+
+ if (LiveDefs > 1)
+ return false;
+
+ SmallPtrSet<MachineInstr*, 4> Uses;
+ getGlobalUses(Def, PhysReg, Uses);
+ for (auto *Use : Uses)
+ if (!Dead.count(Use))
+ return false;
+ return true;
+ };
+
+ for (auto &MO : MI->operands()) {
+ if (!isValidRegUse(MO))
+ continue;
+ if (MachineInstr *Def = getMIOperand(MI, MO))
+ if (IsDead(Def, MO.getReg()))
+ collectKilledOperands(Def, Dead);
+ }
}
-void ReachingDefAnalysis::getAllInstWithUseBefore(MachineInstr *MI,
- int PhysReg, SmallVectorImpl<MachineInstr*> &Uses) {
- MachineInstr *Use = nullptr;
- MachineInstr *Pos = MI;
+bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI,
+ int PhysReg) const {
+ SmallPtrSet<MachineInstr*, 1> Ignore;
+ return isSafeToDefRegAt(MI, PhysReg, Ignore);
+}
- while ((Use = getInstWithUseBefore(Pos, PhysReg))) {
- Uses.push_back(Use);
- Pos = Use;
+bool ReachingDefAnalysis::isSafeToDefRegAt(MachineInstr *MI, int PhysReg,
+ InstSet &Ignore) const {
+ // Check for any uses of the register after MI.
+ if (isRegUsedAfter(MI, PhysReg)) {
+ if (auto *Def = getReachingLocalMIDef(MI, PhysReg)) {
+ SmallPtrSet<MachineInstr*, 2> Uses;
+ getReachingLocalUses(Def, PhysReg, Uses);
+ for (auto *Use : Uses)
+ if (!Ignore.count(Use))
+ return false;
+ } else
+ return false;
}
+
+ MachineBasicBlock *MBB = MI->getParent();
+ // Check for any defs after MI.
+ if (isRegDefinedAfter(MI, PhysReg)) {
+ auto I = MachineBasicBlock::iterator(MI);
+ for (auto E = MBB->end(); I != E; ++I) {
+ if (Ignore.count(&*I))
+ continue;
+ for (auto &MO : I->operands())
+ if (isValidRegDefOf(MO, PhysReg))
+ return false;
+ }
+ }
+ return true;
}
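isSafeToMove above is written once as a template over the iterator type and then instantiated with a forward and a reverse MachineBasicBlock iterator for the two scan directions. A standalone sketch of that dispatch pattern using plain STL iterators (names are illustrative):

    #include <iterator>

    // One body, two directions: the caller chooses the iterator type, so the
    // same loop can walk either down or up a container.
    template <typename Iterator, typename Pred>
    bool clearBetween(Iterator From, Iterator To, Pred Blocks) {
      for (Iterator I = std::next(From); I != To; ++I)
        if (Blocks(*I))
          return false;
      return true;
    }

Instantiating clearBetween with a pair of reverse iterators walks the same range upwards, which is how isSafeToMoveForwards and isSafeToMoveBackwards share one implementation above.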
diff --git a/llvm/lib/CodeGen/RegAllocBase.cpp b/llvm/lib/CodeGen/RegAllocBase.cpp
index 156daaa03bb5..d22826853672 100644
--- a/llvm/lib/CodeGen/RegAllocBase.cpp
+++ b/llvm/lib/CodeGen/RegAllocBase.cpp
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
#include "RegAllocBase.h"
-#include "Spiller.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/LiveInterval.h"
@@ -21,6 +20,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Spiller.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Pass.h"
@@ -107,7 +107,7 @@ void RegAllocBase::allocatePhysRegs() {
<< TRI->getRegClassName(MRI->getRegClass(VirtReg->reg))
<< ':' << *VirtReg << " w=" << VirtReg->weight << '\n');
- using VirtRegVec = SmallVector<unsigned, 4>;
+ using VirtRegVec = SmallVector<Register, 4>;
VirtRegVec SplitVRegs;
unsigned AvailablePhysReg = selectOrSplit(*VirtReg, SplitVRegs);
diff --git a/llvm/lib/CodeGen/RegAllocBase.h b/llvm/lib/CodeGen/RegAllocBase.h
index 6a7cc5ba4308..8e931eaae99a 100644
--- a/llvm/lib/CodeGen/RegAllocBase.h
+++ b/llvm/lib/CodeGen/RegAllocBase.h
@@ -101,8 +101,8 @@ protected:
// Each call must guarantee forward progess by returning an available PhysReg
// or new set of split live virtual registers. It is up to the splitter to
// converge quickly toward fully spilled live ranges.
- virtual unsigned selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<unsigned> &splitLVRs) = 0;
+ virtual Register selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &splitLVRs) = 0;
// Use this group name for NamedRegionTimer.
static const char TimerGroupName[];
diff --git a/llvm/lib/CodeGen/RegAllocBasic.cpp b/llvm/lib/CodeGen/RegAllocBasic.cpp
index 46f6946f7003..5009bcc0a397 100644
--- a/llvm/lib/CodeGen/RegAllocBasic.cpp
+++ b/llvm/lib/CodeGen/RegAllocBasic.cpp
@@ -14,7 +14,6 @@
#include "AllocationOrder.h"
#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
-#include "Spiller.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/CalcSpillWeights.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -28,9 +27,10 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
+#include "llvm/CodeGen/Spiller.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
-#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdlib>
@@ -100,8 +100,8 @@ public:
return LI;
}
- unsigned selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<unsigned> &SplitVRegs) override;
+ Register selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &SplitVRegs) override;
/// Perform register allocation.
bool runOnMachineFunction(MachineFunction &mf) override;
@@ -114,8 +114,8 @@ public:
// Helper for spilling all live virtual registers currently unified under preg
// that interfere with the most recently queried lvr. Return true if spilling
// was successful, and append any new spilled/split intervals to splitLVRs.
- bool spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<unsigned> &SplitVRegs);
+ bool spillInterferences(LiveInterval &VirtReg, Register PhysReg,
+ SmallVectorImpl<Register> &SplitVRegs);
static char ID;
};
@@ -201,8 +201,8 @@ void RABasic::releaseMemory() {
// Spill or split all live virtual registers currently unified under PhysReg
// that interfere with VirtReg. The newly spilled or split live intervals are
// returned by appending them to SplitVRegs.
-bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<unsigned> &SplitVRegs) {
+bool RABasic::spillInterferences(LiveInterval &VirtReg, Register PhysReg,
+ SmallVectorImpl<Register> &SplitVRegs) {
// Record each interference and determine if all are spillable before mutating
// either the union or live intervals.
SmallVector<LiveInterval*, 8> Intfs;
@@ -253,14 +253,14 @@ bool RABasic::spillInterferences(LiveInterval &VirtReg, unsigned PhysReg,
// |vregs| * |machineregs|. And since the number of interference tests is
// minimal, there is no value in caching them outside the scope of
// selectOrSplit().
-unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<unsigned> &SplitVRegs) {
+Register RABasic::selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &SplitVRegs) {
// Populate a list of physical register spill candidates.
- SmallVector<unsigned, 8> PhysRegSpillCands;
+ SmallVector<Register, 8> PhysRegSpillCands;
// Check for an available register in this class.
AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix);
- while (unsigned PhysReg = Order.next()) {
+ while (Register PhysReg = Order.next()) {
// Check for interference in PhysReg
switch (Matrix->checkInterference(VirtReg, PhysReg)) {
case LiveRegMatrix::IK_Free:
@@ -279,7 +279,7 @@ unsigned RABasic::selectOrSplit(LiveInterval &VirtReg,
}
// Try to spill another interfering reg with less spill weight.
- for (SmallVectorImpl<unsigned>::iterator PhysRegI = PhysRegSpillCands.begin(),
+ for (SmallVectorImpl<Register>::iterator PhysRegI = PhysRegSpillCands.begin(),
PhysRegE = PhysRegSpillCands.end(); PhysRegI != PhysRegE; ++PhysRegI) {
if (!spillInterferences(VirtReg, *PhysRegI, SplitVRegs))
continue;
diff --git a/llvm/lib/CodeGen/RegAllocFast.cpp b/llvm/lib/CodeGen/RegAllocFast.cpp
index 89b5bcebd61c..5396f9f3a143 100644
--- a/llvm/lib/CodeGen/RegAllocFast.cpp
+++ b/llvm/lib/CodeGen/RegAllocFast.cpp
@@ -106,13 +106,8 @@ namespace {
/// that it is alive across blocks.
BitVector MayLiveAcrossBlocks;
- /// State of a physical register.
- enum RegState {
- /// A disabled register is not available for allocation, but an alias may
- /// be in use. A register can only be moved out of the disabled state if
- /// all aliases are disabled.
- regDisabled,
-
+ /// State of a register unit.
+ enum RegUnitState {
/// A free register is not currently in use and can be allocated
/// immediately without checking aliases.
regFree,
@@ -126,8 +121,8 @@ namespace {
/// register. In that case, LiveVirtRegs contains the inverse mapping.
};
- /// Maps each physical register to a RegState enum or a virtual register.
- std::vector<unsigned> PhysRegState;
+ /// Maps each physical register to a RegUnitState enum or virtual register.
+ std::vector<unsigned> RegUnitStates;
SmallVector<Register, 16> VirtDead;
SmallVector<MachineInstr *, 32> Coalesced;
@@ -189,6 +184,10 @@ namespace {
bool isLastUseOfLocalReg(const MachineOperand &MO) const;
void addKillFlag(const LiveReg &LRI);
+#ifndef NDEBUG
+ bool verifyRegStateMapping(const LiveReg &LR) const;
+#endif
+
void killVirtReg(LiveReg &LR);
void killVirtReg(Register VirtReg);
void spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR);
@@ -196,7 +195,7 @@ namespace {
void usePhysReg(MachineOperand &MO);
void definePhysReg(MachineBasicBlock::iterator MI, MCPhysReg PhysReg,
- RegState NewState);
+ unsigned NewState);
unsigned calcSpillCost(MCPhysReg PhysReg) const;
void assignVirtToPhysReg(LiveReg &, MCPhysReg PhysReg);
@@ -229,7 +228,7 @@ namespace {
bool mayLiveOut(Register VirtReg);
bool mayLiveIn(Register VirtReg);
- void dumpState();
+ void dumpState() const;
};
} // end anonymous namespace
@@ -240,7 +239,8 @@ INITIALIZE_PASS(RegAllocFast, "regallocfast", "Fast Register Allocator", false,
false)
void RegAllocFast::setPhysRegState(MCPhysReg PhysReg, unsigned NewState) {
- PhysRegState[PhysReg] = NewState;
+ for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI)
+ RegUnitStates[*UI] = NewState;
}
/// This allocates space for the specified virtual register to be held on the
@@ -255,8 +255,8 @@ int RegAllocFast::getStackSpaceFor(Register VirtReg) {
// Allocate a new stack object for this spill location...
const TargetRegisterClass &RC = *MRI->getRegClass(VirtReg);
unsigned Size = TRI->getSpillSize(RC);
- unsigned Align = TRI->getSpillAlignment(RC);
- int FrameIdx = MFI->CreateSpillStackObject(Size, Align);
+ Align Alignment = TRI->getSpillAlign(RC);
+ int FrameIdx = MFI->CreateSpillStackObject(Size, Alignment);
// Assign the slot.
StackSlotForVirtReg[VirtReg] = FrameIdx;
@@ -384,12 +384,23 @@ void RegAllocFast::addKillFlag(const LiveReg &LR) {
}
}
+#ifndef NDEBUG
+bool RegAllocFast::verifyRegStateMapping(const LiveReg &LR) const {
+ for (MCRegUnitIterator UI(LR.PhysReg, TRI); UI.isValid(); ++UI) {
+ if (RegUnitStates[*UI] != LR.VirtReg)
+ return false;
+ }
+
+ return true;
+}
+#endif
+
/// Mark virtreg as no longer available.
void RegAllocFast::killVirtReg(LiveReg &LR) {
+ assert(verifyRegStateMapping(LR) && "Broken RegState mapping");
addKillFlag(LR);
- assert(PhysRegState[LR.PhysReg] == LR.VirtReg &&
- "Broken RegState mapping");
- setPhysRegState(LR.PhysReg, regFree);
+ MCPhysReg PhysReg = LR.PhysReg;
+ setPhysRegState(PhysReg, regFree);
LR.PhysReg = 0;
}
@@ -416,7 +427,9 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI,
/// Do the actual work of spilling.
void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) {
- assert(PhysRegState[LR.PhysReg] == LR.VirtReg && "Broken RegState mapping");
+ assert(verifyRegStateMapping(LR) && "Broken RegState mapping");
+
+ MCPhysReg PhysReg = LR.PhysReg;
if (LR.Dirty) {
// If this physreg is used by the instruction, we want to kill it on the
@@ -424,7 +437,7 @@ void RegAllocFast::spillVirtReg(MachineBasicBlock::iterator MI, LiveReg &LR) {
bool SpillKill = MachineBasicBlock::iterator(LR.LastUse) != MI;
LR.Dirty = false;
- spill(MI, LR.VirtReg, LR.PhysReg, SpillKill);
+ spill(MI, LR.VirtReg, PhysReg, SpillKill);
if (SpillKill)
LR.LastUse = nullptr; // Don't kill register again
@@ -460,53 +473,16 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
assert(PhysReg.isPhysical() && "Bad usePhysReg operand");
markRegUsedInInstr(PhysReg);
- switch (PhysRegState[PhysReg]) {
- case regDisabled:
- break;
- case regReserved:
- PhysRegState[PhysReg] = regFree;
- LLVM_FALLTHROUGH;
- case regFree:
- MO.setIsKill();
- return;
- default:
- // The physreg was allocated to a virtual register. That means the value we
- // wanted has been clobbered.
- llvm_unreachable("Instruction uses an allocated register");
- }
- // Maybe a superregister is reserved?
- for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
- MCPhysReg Alias = *AI;
- switch (PhysRegState[Alias]) {
- case regDisabled:
- break;
+ for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) {
+ switch (RegUnitStates[*UI]) {
case regReserved:
- // Either PhysReg is a subregister of Alias and we mark the
- // whole register as free, or PhysReg is the superregister of
- // Alias and we mark all the aliases as disabled before freeing
- // PhysReg.
- // In the latter case, since PhysReg was disabled, this means that
- // its value is defined only by physical sub-registers. This check
- // is performed by the assert of the default case in this loop.
- // Note: The value of the superregister may only be partial
- // defined, that is why regDisabled is a valid state for aliases.
- assert((TRI->isSuperRegister(PhysReg, Alias) ||
- TRI->isSuperRegister(Alias, PhysReg)) &&
- "Instruction is not using a subregister of a reserved register");
+ RegUnitStates[*UI] = regFree;
LLVM_FALLTHROUGH;
case regFree:
- if (TRI->isSuperRegister(PhysReg, Alias)) {
- // Leave the superregister in the working set.
- setPhysRegState(Alias, regFree);
- MO.getParent()->addRegisterKilled(Alias, TRI, true);
- return;
- }
- // Some other alias was in the working set - clear it.
- setPhysRegState(Alias, regDisabled);
break;
default:
- llvm_unreachable("Instruction uses an alias of an allocated register");
+ llvm_unreachable("Unexpected reg unit state");
}
}
@@ -519,38 +495,20 @@ void RegAllocFast::usePhysReg(MachineOperand &MO) {
/// similar to defineVirtReg except the physreg is reserved instead of
/// allocated.
void RegAllocFast::definePhysReg(MachineBasicBlock::iterator MI,
- MCPhysReg PhysReg, RegState NewState) {
- markRegUsedInInstr(PhysReg);
- switch (Register VirtReg = PhysRegState[PhysReg]) {
- case regDisabled:
- break;
- default:
- spillVirtReg(MI, VirtReg);
- LLVM_FALLTHROUGH;
- case regFree:
- case regReserved:
- setPhysRegState(PhysReg, NewState);
- return;
- }
-
- // This is a disabled register, disable all aliases.
- setPhysRegState(PhysReg, NewState);
- for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
- MCPhysReg Alias = *AI;
- switch (Register VirtReg = PhysRegState[Alias]) {
- case regDisabled:
- break;
+ MCPhysReg PhysReg, unsigned NewState) {
+ for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) {
+ switch (unsigned VirtReg = RegUnitStates[*UI]) {
default:
spillVirtReg(MI, VirtReg);
- LLVM_FALLTHROUGH;
+ break;
case regFree:
case regReserved:
- setPhysRegState(Alias, regDisabled);
- if (TRI->isSuperRegister(PhysReg, Alias))
- return;
break;
}
}
+
+ markRegUsedInInstr(PhysReg);
+ setPhysRegState(PhysReg, NewState);
}
/// Return the cost of spilling clearing out PhysReg and aliases so it is free
@@ -563,46 +521,24 @@ unsigned RegAllocFast::calcSpillCost(MCPhysReg PhysReg) const {
<< " is already used in instr.\n");
return spillImpossible;
}
- switch (Register VirtReg = PhysRegState[PhysReg]) {
- case regDisabled:
- break;
- case regFree:
- return 0;
- case regReserved:
- LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding "
- << printReg(PhysReg, TRI) << " is reserved already.\n");
- return spillImpossible;
- default: {
- LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
- assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
- "Missing VirtReg entry");
- return LRI->Dirty ? spillDirty : spillClean;
- }
- }
- // This is a disabled register, add up cost of aliases.
- LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is disabled.\n");
- unsigned Cost = 0;
- for (MCRegAliasIterator AI(PhysReg, TRI, false); AI.isValid(); ++AI) {
- MCPhysReg Alias = *AI;
- switch (Register VirtReg = PhysRegState[Alias]) {
- case regDisabled:
- break;
+ for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) {
+ switch (unsigned VirtReg = RegUnitStates[*UI]) {
case regFree:
- ++Cost;
break;
case regReserved:
+ LLVM_DEBUG(dbgs() << printReg(VirtReg, TRI) << " corresponding "
+ << printReg(PhysReg, TRI) << " is reserved already.\n");
return spillImpossible;
default: {
LiveRegMap::const_iterator LRI = findLiveVirtReg(VirtReg);
assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
"Missing VirtReg entry");
- Cost += LRI->Dirty ? spillDirty : spillClean;
- break;
+ return LRI->Dirty ? spillDirty : spillClean;
}
}
}
- return Cost;
+ return 0;
}
/// This method updates local state so that we know that PhysReg is the
@@ -909,9 +845,17 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
if (!Reg || !Reg.isPhysical())
continue;
markRegUsedInInstr(Reg);
- for (MCRegAliasIterator AI(Reg, TRI, true); AI.isValid(); ++AI) {
- if (ThroughRegs.count(PhysRegState[*AI]))
- definePhysReg(MI, *AI, regFree);
+
+ for (MCRegUnitIterator UI(Reg, TRI); UI.isValid(); ++UI) {
+ if (!ThroughRegs.count(RegUnitStates[*UI]))
+ continue;
+
+ // Need to spill any aliasing registers.
+ for (MCRegUnitRootIterator RI(*UI, TRI); RI.isValid(); ++RI) {
+ for (MCSuperRegIterator SI(*RI, TRI, true); SI.isValid(); ++SI) {
+ definePhysReg(MI, *SI, regFree);
+ }
+ }
}
}
@@ -975,37 +919,40 @@ void RegAllocFast::handleThroughOperands(MachineInstr &MI,
}
#ifndef NDEBUG
-void RegAllocFast::dumpState() {
- for (unsigned Reg = 1, E = TRI->getNumRegs(); Reg != E; ++Reg) {
- if (PhysRegState[Reg] == regDisabled) continue;
- dbgs() << " " << printReg(Reg, TRI);
- switch(PhysRegState[Reg]) {
+
+void RegAllocFast::dumpState() const {
+ for (unsigned Unit = 1, UnitE = TRI->getNumRegUnits(); Unit != UnitE;
+ ++Unit) {
+ switch (unsigned VirtReg = RegUnitStates[Unit]) {
case regFree:
break;
case regReserved:
- dbgs() << "*";
+ dbgs() << " " << printRegUnit(Unit, TRI) << "[P]";
break;
default: {
- dbgs() << '=' << printReg(PhysRegState[Reg]);
- LiveRegMap::iterator LRI = findLiveVirtReg(PhysRegState[Reg]);
- assert(LRI != LiveVirtRegs.end() && LRI->PhysReg &&
- "Missing VirtReg entry");
- if (LRI->Dirty)
- dbgs() << "*";
- assert(LRI->PhysReg == Reg && "Bad inverse map");
+ dbgs() << ' ' << printRegUnit(Unit, TRI) << '=' << printReg(VirtReg);
+ LiveRegMap::const_iterator I = findLiveVirtReg(VirtReg);
+ assert(I != LiveVirtRegs.end() && "have LiveVirtRegs entry");
+ if (I->Dirty)
+ dbgs() << "[D]";
+ assert(TRI->hasRegUnit(I->PhysReg, Unit) && "inverse mapping present");
break;
}
}
}
dbgs() << '\n';
// Check that LiveVirtRegs is the inverse.
- for (LiveRegMap::iterator i = LiveVirtRegs.begin(),
- e = LiveVirtRegs.end(); i != e; ++i) {
- if (!i->PhysReg)
- continue;
- assert(i->VirtReg.isVirtual() && "Bad map key");
- assert(Register::isPhysicalRegister(i->PhysReg) && "Bad map value");
- assert(PhysRegState[i->PhysReg] == i->VirtReg && "Bad inverse map");
+ for (const LiveReg &LR : LiveVirtRegs) {
+ Register VirtReg = LR.VirtReg;
+ assert(VirtReg.isVirtual() && "Bad map key");
+ MCPhysReg PhysReg = LR.PhysReg;
+ if (PhysReg != 0) {
+ assert(Register::isPhysicalRegister(PhysReg) &&
+ "mapped to physreg");
+ for (MCRegUnitIterator UI(PhysReg, TRI); UI.isValid(); ++UI) {
+ assert(RegUnitStates[*UI] == VirtReg && "inverse map valid");
+ }
+ }
}
}
#endif
@@ -1209,7 +1156,7 @@ void RegAllocFast::allocateInstruction(MachineInstr &MI) {
}
void RegAllocFast::handleDebugValue(MachineInstr &MI) {
- MachineOperand &MO = MI.getOperand(0);
+ MachineOperand &MO = MI.getDebugOperand(0);
// Ignore DBG_VALUEs that aren't based on virtual registers. These are
// mostly constants and frame indices.
@@ -1247,7 +1194,7 @@ void RegAllocFast::allocateBasicBlock(MachineBasicBlock &MBB) {
this->MBB = &MBB;
LLVM_DEBUG(dbgs() << "\nAllocating " << MBB);
- PhysRegState.assign(TRI->getNumRegs(), regDisabled);
+ RegUnitStates.assign(TRI->getNumRegUnits(), regFree);
assert(LiveVirtRegs.empty() && "Mapping not cleared from last block?");
MachineBasicBlock::iterator MII = MBB.begin();
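RegAllocFast above retires the regDisabled state by keying allocator state on register units rather than whole registers: overlapping physical registers share units, so aliasing falls out of the representation. A toy standalone model of that idea (plain containers, not the LLVM API):

    #include <cstdint>
    #include <vector>

    enum : uint32_t { regFree = 0, regReserved = 1 }; // larger values: a virtreg id

    // Illustrative mapping of a physical register to the units it covers,
    // e.g. a 64-bit register listing the units of both 32-bit halves.
    using UnitList = std::vector<unsigned>;

    void setPhysRegState(std::vector<uint32_t> &RegUnitStates,
                         const UnitList &Units, uint32_t NewState) {
      // Writing every unit keeps aliases consistent without a separate
      // regDisabled marker.
      for (unsigned U : Units)
        RegUnitStates[U] = NewState;
    }

    bool isFree(const std::vector<uint32_t> &RegUnitStates,
                const UnitList &Units) {
      for (unsigned U : Units)
        if (RegUnitStates[U] != regFree)
          return false; // some overlapping register owns this unit
      return true;
    }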
diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp
index 27de7fe45887..41cf00261265 100644
--- a/llvm/lib/CodeGen/RegAllocGreedy.cpp
+++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp
@@ -16,7 +16,6 @@
#include "LiveDebugVariables.h"
#include "RegAllocBase.h"
#include "SpillPlacement.h"
-#include "Spiller.h"
#include "SplitKit.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
@@ -53,6 +52,7 @@
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/RegisterClassInfo.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -124,12 +124,6 @@ static cl::opt<bool> EnableDeferredSpilling(
"variable because of other evicted variables."),
cl::init(false));
-static cl::opt<unsigned>
- HugeSizeForSplit("huge-size-for-split", cl::Hidden,
- cl::desc("A threshold of live range size which may cause "
- "high compile time cost in global splitting."),
- cl::init(5000));
-
// FIXME: Find a good default for this flag and remove the flag.
static cl::opt<unsigned>
CSRFirstTimeCost("regalloc-csr-first-time-cost",
@@ -423,7 +417,7 @@ public:
Spiller &spiller() override { return *SpillerInstance; }
void enqueue(LiveInterval *LI) override;
LiveInterval *dequeue() override;
- unsigned selectOrSplit(LiveInterval&, SmallVectorImpl<unsigned>&) override;
+ Register selectOrSplit(LiveInterval&, SmallVectorImpl<Register>&) override;
void aboutToRemoveInterval(LiveInterval &) override;
/// Perform register allocation.
@@ -437,7 +431,7 @@ public:
static char ID;
private:
- unsigned selectOrSplitImpl(LiveInterval &, SmallVectorImpl<unsigned> &,
+ Register selectOrSplitImpl(LiveInterval &, SmallVectorImpl<Register> &,
SmallVirtRegSet &, unsigned = 0);
bool LRE_CanEraseVirtReg(unsigned) override;
@@ -462,31 +456,30 @@ private:
bool calcCompactRegion(GlobalSplitCandidate&);
void splitAroundRegion(LiveRangeEdit&, ArrayRef<unsigned>);
void calcGapWeights(unsigned, SmallVectorImpl<float>&);
- unsigned canReassign(LiveInterval &VirtReg, unsigned PrevReg);
+ Register canReassign(LiveInterval &VirtReg, Register PrevReg);
bool shouldEvict(LiveInterval &A, bool, LiveInterval &B, bool);
- bool canEvictInterference(LiveInterval&, unsigned, bool, EvictionCost&,
+ bool canEvictInterference(LiveInterval&, Register, bool, EvictionCost&,
const SmallVirtRegSet&);
- bool canEvictInterferenceInRange(LiveInterval &VirtReg, unsigned PhysReg,
+ bool canEvictInterferenceInRange(LiveInterval &VirtReg, Register PhysReg,
SlotIndex Start, SlotIndex End,
EvictionCost &MaxCost);
unsigned getCheapestEvicteeWeight(const AllocationOrder &Order,
LiveInterval &VirtReg, SlotIndex Start,
SlotIndex End, float *BestEvictWeight);
- void evictInterference(LiveInterval&, unsigned,
- SmallVectorImpl<unsigned>&);
+ void evictInterference(LiveInterval&, Register,
+ SmallVectorImpl<Register>&);
bool mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
SmallLISet &RecoloringCandidates,
const SmallVirtRegSet &FixedRegisters);
- unsigned tryAssign(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&,
+ Register tryAssign(LiveInterval&, AllocationOrder&,
+ SmallVectorImpl<Register>&,
const SmallVirtRegSet&);
unsigned tryEvict(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&, unsigned,
+ SmallVectorImpl<Register>&, unsigned,
const SmallVirtRegSet&);
unsigned tryRegionSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&);
- unsigned isSplitBenefitWorthCost(LiveInterval &VirtReg);
+ SmallVectorImpl<Register>&);
/// Calculate cost of region splitting.
unsigned calculateRegionSplitCost(LiveInterval &VirtReg,
AllocationOrder &Order,
@@ -496,26 +489,26 @@ private:
/// Perform region splitting.
unsigned doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
bool HasCompact,
- SmallVectorImpl<unsigned> &NewVRegs);
+ SmallVectorImpl<Register> &NewVRegs);
/// Check other options before using a callee-saved register for the first
/// time.
unsigned tryAssignCSRFirstTime(LiveInterval &VirtReg, AllocationOrder &Order,
- unsigned PhysReg, unsigned &CostPerUseLimit,
- SmallVectorImpl<unsigned> &NewVRegs);
+ Register PhysReg, unsigned &CostPerUseLimit,
+ SmallVectorImpl<Register> &NewVRegs);
void initializeCSRCost();
unsigned tryBlockSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&);
+ SmallVectorImpl<Register>&);
unsigned tryInstructionSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&);
+ SmallVectorImpl<Register>&);
unsigned tryLocalSplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&);
+ SmallVectorImpl<Register>&);
unsigned trySplit(LiveInterval&, AllocationOrder&,
- SmallVectorImpl<unsigned>&,
+ SmallVectorImpl<Register>&,
const SmallVirtRegSet&);
unsigned tryLastChanceRecoloring(LiveInterval &, AllocationOrder &,
- SmallVectorImpl<unsigned> &,
+ SmallVectorImpl<Register> &,
SmallVirtRegSet &, unsigned);
- bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<unsigned> &,
+ bool tryRecoloringCandidates(PQueue &, SmallVectorImpl<Register> &,
SmallVirtRegSet &, unsigned);
void tryHintRecoloring(LiveInterval &);
void tryHintsRecoloring();
@@ -525,12 +518,12 @@ private:
/// The frequency of the copy.
BlockFrequency Freq;
/// The virtual register or physical register.
- unsigned Reg;
+ Register Reg;
/// Its currently assigned register.
/// In case of a physical register Reg == PhysReg.
- unsigned PhysReg;
+ MCRegister PhysReg;
- HintInfo(BlockFrequency Freq, unsigned Reg, unsigned PhysReg)
+ HintInfo(BlockFrequency Freq, Register Reg, MCRegister PhysReg)
: Freq(Freq), Reg(Reg), PhysReg(PhysReg) {}
};
using HintsInfo = SmallVector<HintInfo, 4>;
@@ -538,7 +531,7 @@ private:
BlockFrequency getBrokenHintFreq(const HintsInfo &, unsigned);
void collectHintInfo(unsigned, HintsInfo &);
- bool isUnusedCalleeSavedReg(unsigned PhysReg) const;
+ bool isUnusedCalleeSavedReg(MCRegister PhysReg) const;
/// Compute and report the number of spills and reloads for a loop.
void reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads,
@@ -759,12 +752,12 @@ LiveInterval *RAGreedy::dequeue(PQueue &CurQueue) {
//===----------------------------------------------------------------------===//
/// tryAssign - Try to assign VirtReg to an available register.
-unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
+Register RAGreedy::tryAssign(LiveInterval &VirtReg,
AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVectorImpl<Register> &NewVRegs,
const SmallVirtRegSet &FixedRegisters) {
Order.rewind();
- unsigned PhysReg;
+ Register PhysReg;
while ((PhysReg = Order.next()))
if (!Matrix->checkInterference(VirtReg, PhysReg))
break;
@@ -775,7 +768,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
// If we missed a simple hint, try to cheaply evict interference from the
// preferred register.
- if (unsigned Hint = MRI->getSimpleHint(VirtReg.reg))
+ if (Register Hint = MRI->getSimpleHint(VirtReg.reg))
if (Order.isHint(Hint)) {
LLVM_DEBUG(dbgs() << "missed hint " << printReg(Hint, TRI) << '\n');
EvictionCost MaxCost;
@@ -798,7 +791,7 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
LLVM_DEBUG(dbgs() << printReg(PhysReg, TRI) << " is available at cost "
<< Cost << '\n');
- unsigned CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost, FixedRegisters);
+ Register CheapReg = tryEvict(VirtReg, Order, NewVRegs, Cost, FixedRegisters);
return CheapReg ? CheapReg : PhysReg;
}
@@ -806,9 +799,9 @@ unsigned RAGreedy::tryAssign(LiveInterval &VirtReg,
// Interference eviction
//===----------------------------------------------------------------------===//
-unsigned RAGreedy::canReassign(LiveInterval &VirtReg, unsigned PrevReg) {
+Register RAGreedy::canReassign(LiveInterval &VirtReg, Register PrevReg) {
AllocationOrder Order(VirtReg.reg, *VRM, RegClassInfo, Matrix);
- unsigned PhysReg;
+ Register PhysReg;
while ((PhysReg = Order.next())) {
if (PhysReg == PrevReg)
continue;
@@ -869,7 +862,7 @@ bool RAGreedy::shouldEvict(LiveInterval &A, bool IsHint,
/// @param MaxCost Only look for cheaper candidates and update with new cost
/// when returning true.
/// @returns True when interference can be evicted cheaper than MaxCost.
-bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
+bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, Register PhysReg,
bool IsHint, EvictionCost &MaxCost,
const SmallVirtRegSet &FixedRegisters) {
// It is only possible to evict virtual register interference.
@@ -967,7 +960,7 @@ bool RAGreedy::canEvictInterference(LiveInterval &VirtReg, unsigned PhysReg,
/// when returning true.
/// \return True when interference can be evicted cheaper than MaxCost.
bool RAGreedy::canEvictInterferenceInRange(LiveInterval &VirtReg,
- unsigned PhysReg, SlotIndex Start,
+ Register PhysReg, SlotIndex Start,
SlotIndex End,
EvictionCost &MaxCost) {
EvictionCost Cost;
@@ -1045,8 +1038,8 @@ unsigned RAGreedy::getCheapestEvicteeWeight(const AllocationOrder &Order,
/// evictInterference - Evict any interfering registers that prevent VirtReg
/// from being assigned to PhysReg. This assumes that canEvictInterference
/// returned true.
-void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
- SmallVectorImpl<unsigned> &NewVRegs) {
+void RAGreedy::evictInterference(LiveInterval &VirtReg, Register PhysReg,
+ SmallVectorImpl<Register> &NewVRegs) {
// Make sure that VirtReg has a cascade number, and assign that cascade
// number to every evicted register. These live ranges can then only be
// evicted by a newer cascade, preventing infinite loops.
@@ -1091,9 +1084,9 @@ void RAGreedy::evictInterference(LiveInterval &VirtReg, unsigned PhysReg,
/// Returns true if the given \p PhysReg is a callee saved register and has not
/// been used for allocation yet.
-bool RAGreedy::isUnusedCalleeSavedReg(unsigned PhysReg) const {
- unsigned CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg);
- if (CSR == 0)
+bool RAGreedy::isUnusedCalleeSavedReg(MCRegister PhysReg) const {
+ MCRegister CSR = RegClassInfo.getLastCalleeSavedAlias(PhysReg);
+ if (!CSR)
return false;
return !Matrix->isPhysRegUsed(PhysReg);
@@ -1105,7 +1098,7 @@ bool RAGreedy::isUnusedCalleeSavedReg(unsigned PhysReg) const {
/// @return Physreg to assign VirtReg, or 0.
unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVectorImpl<Register> &NewVRegs,
unsigned CostPerUseLimit,
const SmallVirtRegSet &FixedRegisters) {
NamedRegionTimer T("evict", "Evict", TimerGroupName, TimerGroupDescription,
@@ -1142,7 +1135,7 @@ unsigned RAGreedy::tryEvict(LiveInterval &VirtReg,
}
Order.rewind();
- while (unsigned PhysReg = Order.next(OrderLimit)) {
+ while (MCRegister PhysReg = Order.next(OrderLimit)) {
if (TRI->getCostPerUse(PhysReg) >= CostPerUseLimit)
continue;
// The first use of a callee-saved register in a function has cost 1.
@@ -1815,20 +1808,9 @@ void RAGreedy::splitAroundRegion(LiveRangeEdit &LREdit,
MF->verify(this, "After splitting live range around region");
}
-// Global split has high compile time cost especially for large live range.
-// Return false for the case here where the potential benefit will never
-// worth the cost.
-unsigned RAGreedy::isSplitBenefitWorthCost(LiveInterval &VirtReg) {
- MachineInstr *MI = MRI->getUniqueVRegDef(VirtReg.reg);
- if (MI && TII->isTriviallyReMaterializable(*MI, AA) &&
- VirtReg.size() > HugeSizeForSplit)
- return false;
- return true;
-}
-
unsigned RAGreedy::tryRegionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs) {
- if (!isSplitBenefitWorthCost(VirtReg))
+ SmallVectorImpl<Register> &NewVRegs) {
+ if (!TRI->shouldRegionSplitForVirtReg(*MF, VirtReg))
return 0;
unsigned NumCands = 0;
BlockFrequency SpillCost = calcSpillCost();
@@ -1971,7 +1953,7 @@ unsigned RAGreedy::calculateRegionSplitCost(LiveInterval &VirtReg,
unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
bool HasCompact,
- SmallVectorImpl<unsigned> &NewVRegs) {
+ SmallVectorImpl<Register> &NewVRegs) {
SmallVector<unsigned, 8> UsedCands;
// Prepare split editor.
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
@@ -2017,9 +1999,9 @@ unsigned RAGreedy::doRegionSplit(LiveInterval &VirtReg, unsigned BestCand,
/// creates a lot of local live ranges that will be split by tryLocalSplit if
/// they don't allocate.
unsigned RAGreedy::tryBlockSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs) {
+ SmallVectorImpl<Register> &NewVRegs) {
assert(&SA->getParent() == &VirtReg && "Live range wasn't analyzed");
- unsigned Reg = VirtReg.reg;
+ Register Reg = VirtReg.reg;
bool SingleInstrs = RegClassInfo.isProperSubClass(MRI->getRegClass(Reg));
LiveRangeEdit LREdit(&VirtReg, NewVRegs, *MF, *LIS, VRM, this, &DeadRemats);
SE->reset(LREdit, SplitSpillMode);
@@ -2084,7 +2066,7 @@ static unsigned getNumAllocatableRegsForConstraints(
/// This is similar to spilling to a larger register class.
unsigned
RAGreedy::tryInstructionSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs) {
+ SmallVectorImpl<Register> &NewVRegs) {
const TargetRegisterClass *CurRC = MRI->getRegClass(VirtReg.reg);
// There is no point to this if there are no larger sub-classes.
if (!RegClassInfo.isProperSubClass(CurRC))
@@ -2227,7 +2209,7 @@ void RAGreedy::calcGapWeights(unsigned PhysReg,
/// basic block.
///
unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs) {
+ SmallVectorImpl<Register> &NewVRegs) {
// TODO: the function currently only handles a single UseBlock; it should be
// possible to generalize.
if (SA->getUseBlocks().size() != 1)
@@ -2458,7 +2440,7 @@ unsigned RAGreedy::tryLocalSplit(LiveInterval &VirtReg, AllocationOrder &Order,
/// assignable.
/// @return Physreg when VirtReg may be assigned and/or new NewVRegs.
unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
- SmallVectorImpl<unsigned>&NewVRegs,
+ SmallVectorImpl<Register> &NewVRegs,
const SmallVirtRegSet &FixedRegisters) {
// Ranges must be Split2 or less.
if (getStage(VirtReg) >= RS_Spill)
@@ -2469,7 +2451,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
NamedRegionTimer T("local_split", "Local Splitting", TimerGroupName,
TimerGroupDescription, TimePassesIsEnabled);
SA->analyze(&VirtReg);
- unsigned PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
+ Register PhysReg = tryLocalSplit(VirtReg, Order, NewVRegs);
if (PhysReg || !NewVRegs.empty())
return PhysReg;
return tryInstructionSplit(VirtReg, Order, NewVRegs);
@@ -2487,7 +2469,7 @@ unsigned RAGreedy::trySplit(LiveInterval &VirtReg, AllocationOrder &Order,
if (SA->didRepairRange()) {
// VirtReg has changed, so all cached queries are invalid.
Matrix->invalidateVirtRegs();
- if (unsigned PhysReg = tryAssign(VirtReg, Order, NewVRegs, FixedRegisters))
+ if (Register PhysReg = tryAssign(VirtReg, Order, NewVRegs, FixedRegisters))
return PhysReg;
}
@@ -2602,7 +2584,7 @@ RAGreedy::mayRecolorAllInterferences(unsigned PhysReg, LiveInterval &VirtReg,
/// exists.
unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
AllocationOrder &Order,
- SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
unsigned Depth) {
LLVM_DEBUG(dbgs() << "Try last chance recoloring for " << VirtReg << '\n');
@@ -2623,15 +2605,15 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
SmallLISet RecoloringCandidates;
// Record the original mapping virtual register to physical register in case
// the recoloring fails.
- DenseMap<unsigned, unsigned> VirtRegToPhysReg;
+ DenseMap<Register, Register> VirtRegToPhysReg;
// Mark VirtReg as fixed, i.e., it will not be recolored past this point in
// this recoloring "session".
assert(!FixedRegisters.count(VirtReg.reg));
FixedRegisters.insert(VirtReg.reg);
- SmallVector<unsigned, 4> CurrentNewVRegs;
+ SmallVector<Register, 4> CurrentNewVRegs;
Order.rewind();
- while (unsigned PhysReg = Order.next()) {
+ while (Register PhysReg = Order.next()) {
LLVM_DEBUG(dbgs() << "Try to assign: " << VirtReg << " to "
<< printReg(PhysReg, TRI) << '\n');
RecoloringCandidates.clear();
@@ -2662,7 +2644,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
for (SmallLISet::iterator It = RecoloringCandidates.begin(),
EndIt = RecoloringCandidates.end();
It != EndIt; ++It) {
- unsigned ItVirtReg = (*It)->reg;
+ Register ItVirtReg = (*It)->reg;
enqueue(RecoloringQueue, *It);
assert(VRM->hasPhys(ItVirtReg) &&
"Interferences are supposed to be with allocated variables");
@@ -2685,7 +2667,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
if (tryRecoloringCandidates(RecoloringQueue, CurrentNewVRegs,
FixedRegisters, Depth)) {
// Push the queued vregs into the main queue.
- for (unsigned NewVReg : CurrentNewVRegs)
+ for (Register NewVReg : CurrentNewVRegs)
NewVRegs.push_back(NewVReg);
// Do not mess with the global assignment process.
// I.e., VirtReg must be unassigned.
@@ -2704,7 +2686,7 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
// don't add it to NewVRegs because its physical register will be restored
// below. Other vregs in CurrentNewVRegs are created by calling
// selectOrSplit and should be added into NewVRegs.
- for (SmallVectorImpl<unsigned>::iterator Next = CurrentNewVRegs.begin(),
+ for (SmallVectorImpl<Register>::iterator Next = CurrentNewVRegs.begin(),
End = CurrentNewVRegs.end();
Next != End; ++Next) {
if (RecoloringCandidates.count(&LIS->getInterval(*Next)))
@@ -2715,10 +2697,10 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
for (SmallLISet::iterator It = RecoloringCandidates.begin(),
EndIt = RecoloringCandidates.end();
It != EndIt; ++It) {
- unsigned ItVirtReg = (*It)->reg;
+ Register ItVirtReg = (*It)->reg;
if (VRM->hasPhys(ItVirtReg))
Matrix->unassign(**It);
- unsigned ItPhysReg = VirtRegToPhysReg[ItVirtReg];
+ Register ItPhysReg = VirtRegToPhysReg[ItVirtReg];
Matrix->assign(**It, ItPhysReg);
}
}
@@ -2736,14 +2718,14 @@ unsigned RAGreedy::tryLastChanceRecoloring(LiveInterval &VirtReg,
/// \return true if all virtual registers in RecoloringQueue were successfully
/// recolored, false otherwise.
bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
- SmallVectorImpl<unsigned> &NewVRegs,
+ SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
unsigned Depth) {
while (!RecoloringQueue.empty()) {
LiveInterval *LI = dequeue(RecoloringQueue);
LLVM_DEBUG(dbgs() << "Try to recolor: " << *LI << '\n');
- unsigned PhysReg;
- PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters, Depth + 1);
+ Register PhysReg = selectOrSplitImpl(*LI, NewVRegs, FixedRegisters,
+ Depth + 1);
// When splitting happens, the live-range may actually be empty.
// In that case, it is okay to continue the recoloring even
// if we did not find an alternative color for it. Indeed,
@@ -2770,12 +2752,12 @@ bool RAGreedy::tryRecoloringCandidates(PQueue &RecoloringQueue,
// Main Entry Point
//===----------------------------------------------------------------------===//
-unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
- SmallVectorImpl<unsigned> &NewVRegs) {
+Register RAGreedy::selectOrSplit(LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &NewVRegs) {
CutOffInfo = CO_None;
LLVMContext &Ctx = MF->getFunction().getContext();
SmallVirtRegSet FixedRegisters;
- unsigned Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters);
+ Register Reg = selectOrSplitImpl(VirtReg, NewVRegs, FixedRegisters);
if (Reg == ~0U && (CutOffInfo != CO_None)) {
uint8_t CutOffEncountered = CutOffInfo & (CO_Depth | CO_Interf);
if (CutOffEncountered == CO_Depth)
@@ -2802,9 +2784,9 @@ unsigned RAGreedy::selectOrSplit(LiveInterval &VirtReg,
/// to use the CSR; otherwise return 0.
unsigned RAGreedy::tryAssignCSRFirstTime(LiveInterval &VirtReg,
AllocationOrder &Order,
- unsigned PhysReg,
+ Register PhysReg,
unsigned &CostPerUseLimit,
- SmallVectorImpl<unsigned> &NewVRegs) {
+ SmallVectorImpl<Register> &NewVRegs) {
if (getStage(VirtReg) == RS_Spill && VirtReg.isSpillable()) {
// We choose spill over using the CSR for the first time if the spill cost
// is lower than CSRCost.
@@ -3031,8 +3013,8 @@ void RAGreedy::tryHintsRecoloring() {
}
}
-unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
- SmallVectorImpl<unsigned> &NewVRegs,
+Register RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
+ SmallVectorImpl<Register> &NewVRegs,
SmallVirtRegSet &FixedRegisters,
unsigned Depth) {
unsigned CostPerUseLimit = ~0u;
@@ -3046,7 +3028,7 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
// register.
if (CSRCost.getFrequency() && isUnusedCalleeSavedReg(PhysReg) &&
NewVRegs.empty()) {
- unsigned CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg,
+ Register CSRReg = tryAssignCSRFirstTime(VirtReg, Order, PhysReg,
CostPerUseLimit, NewVRegs);
if (CSRReg || !NewVRegs.empty())
// Return now if we decide to use a CSR or create new vregs due to
@@ -3064,10 +3046,10 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
// queue. The RS_Split ranges already failed to do this, and they should not
// get a second chance until they have been split.
if (Stage != RS_Split)
- if (unsigned PhysReg =
+ if (Register PhysReg =
tryEvict(VirtReg, Order, NewVRegs, CostPerUseLimit,
FixedRegisters)) {
- unsigned Hint = MRI->getSimpleHint(VirtReg.reg);
+ Register Hint = MRI->getSimpleHint(VirtReg.reg);
// If VirtReg has a hint and that hint is broken record this
// virtual register as a recoloring candidate for broken hint.
// Indeed, since we evicted a variable in its neighborhood it is
@@ -3096,9 +3078,9 @@ unsigned RAGreedy::selectOrSplitImpl(LiveInterval &VirtReg,
if (Stage < RS_Spill) {
// Try splitting VirtReg or interferences.
unsigned NewVRegSizeBefore = NewVRegs.size();
- unsigned PhysReg = trySplit(VirtReg, Order, NewVRegs, FixedRegisters);
+ Register PhysReg = trySplit(VirtReg, Order, NewVRegs, FixedRegisters);
if (PhysReg || (NewVRegs.size() - NewVRegSizeBefore)) {
- // If VirtReg got split, the eviction info is no longre relevant.
+ // If VirtReg got split, the eviction info is no longer relevant.
LastEvicted.clearEvicteeInfo(VirtReg.reg);
return PhysReg;
}
@@ -3165,7 +3147,6 @@ void RAGreedy::reportNumberOfSplillsReloads(MachineLoop *L, unsigned &Reloads,
}
const MachineFrameInfo &MFI = MF->getFrameInfo();
- const TargetInstrInfo *TII = MF->getSubtarget().getInstrInfo();
int FI;
for (MachineBasicBlock *MBB : L->getBlocks())
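Most of the RegAllocGreedy churn above is the interface-level switch from plain unsigned register numbers to the Register/MCRegister wrappers. A hedged sketch of what such a wrapper buys, modeled loosely on llvm::Register (which tags virtual registers in the high bit) rather than reproducing it: the type can distinguish "no register", a physical register, and a virtual register, where a bare unsigned cannot.

// Loose model of llvm::Register, for illustration only.
#include <cassert>

class Reg {
  unsigned Id = 0;                              // 0 means "no register"
  static constexpr unsigned VirtBit = 1u << 31; // virtual registers use the MSB
public:
  Reg() = default;
  explicit Reg(unsigned Id) : Id(Id) {}
  static Reg makeVirtual(unsigned Index) { return Reg(Index | VirtBit); }
  bool isValid() const { return Id != 0; }
  bool isVirtual() const { return (Id & VirtBit) != 0; }
  bool isPhysical() const { return isValid() && !isVirtual(); }
  unsigned id() const { return Id; }
  explicit operator bool() const { return isValid(); }
};

int main() {
  Reg None;                     // "no register assigned"
  Reg EAX(1);                   // some physical register number
  Reg V0 = Reg::makeVirtual(0); // first virtual register
  assert(!None && EAX.isPhysical() && V0.isVirtual());
  return 0;
}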
diff --git a/llvm/lib/CodeGen/RegAllocPBQP.cpp b/llvm/lib/CodeGen/RegAllocPBQP.cpp
index 3c4a46b12f99..7590dbf1b977 100644
--- a/llvm/lib/CodeGen/RegAllocPBQP.cpp
+++ b/llvm/lib/CodeGen/RegAllocPBQP.cpp
@@ -30,7 +30,6 @@
#include "llvm/CodeGen/RegAllocPBQP.h"
#include "RegisterCoalescer.h"
-#include "Spiller.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/DenseMap.h"
@@ -58,6 +57,7 @@
#include "llvm/CodeGen/PBQPRAConstraint.h"
#include "llvm/CodeGen/RegAllocRegistry.h"
#include "llvm/CodeGen/SlotIndexes.h"
+#include "llvm/CodeGen/Spiller.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
@@ -166,7 +166,7 @@ private:
void initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM, Spiller &VRegSpiller);
/// Spill the given VReg.
- void spillVReg(unsigned VReg, SmallVectorImpl<unsigned> &NewIntervals,
+ void spillVReg(Register VReg, SmallVectorImpl<Register> &NewIntervals,
MachineFunction &MF, LiveIntervals &LIS, VirtRegMap &VRM,
Spiller &VRegSpiller);
@@ -637,7 +637,7 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
// Check for vregs that have no allowed registers. These should be
// pre-spilled and the new vregs added to the worklist.
if (VRegAllowed.empty()) {
- SmallVector<unsigned, 8> NewVRegs;
+ SmallVector<Register, 8> NewVRegs;
spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
Worklist.insert(Worklist.end(), NewVRegs.begin(), NewVRegs.end());
continue;
@@ -673,8 +673,8 @@ void RegAllocPBQP::initializeGraph(PBQPRAGraph &G, VirtRegMap &VRM,
}
}
-void RegAllocPBQP::spillVReg(unsigned VReg,
- SmallVectorImpl<unsigned> &NewIntervals,
+void RegAllocPBQP::spillVReg(Register VReg,
+ SmallVectorImpl<Register> &NewIntervals,
MachineFunction &MF, LiveIntervals &LIS,
VirtRegMap &VRM, Spiller &VRegSpiller) {
VRegsToAlloc.erase(VReg);
@@ -730,7 +730,7 @@ bool RegAllocPBQP::mapPBQPToRegAlloc(const PBQPRAGraph &G,
} else {
// Spill VReg. If this introduces new intervals we'll need another round
// of allocation.
- SmallVector<unsigned, 8> NewVRegs;
+ SmallVector<Register, 8> NewVRegs;
spillVReg(VReg, NewVRegs, MF, LIS, VRM, VRegSpiller);
AnotherRoundNeeded |= !NewVRegs.empty();
}
diff --git a/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp b/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
index 0205e6193741..0c3e8a89c920 100644
--- a/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
+++ b/llvm/lib/CodeGen/RegUsageInfoPropagate.cpp
@@ -26,7 +26,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/RegisterUsageInfo.h"
#include "llvm/IR/Module.h"
-#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -118,8 +118,8 @@ bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) {
continue;
LLVM_DEBUG(
dbgs()
- << "Call Instruction Before Register Usage Info Propagation : \n");
- LLVM_DEBUG(dbgs() << MI << "\n");
+ << "Call Instruction Before Register Usage Info Propagation : \n"
+ << MI << "\n");
auto UpdateRegMask = [&](const Function &F) {
const ArrayRef<uint32_t> RegMask = PRUI->getRegUsageInfo(F);
@@ -140,8 +140,9 @@ bool RegUsageInfoPropagation::runOnMachineFunction(MachineFunction &MF) {
}
LLVM_DEBUG(
- dbgs() << "Call Instruction After Register Usage Info Propagation : "
- << MI << '\n');
+ dbgs()
+ << "Call Instruction After Register Usage Info Propagation : \n"
+ << MI << '\n');
}
}
diff --git a/llvm/lib/CodeGen/RegisterCoalescer.cpp b/llvm/lib/CodeGen/RegisterCoalescer.cpp
index a3f75d82d0ec..17160a9f42cd 100644
--- a/llvm/lib/CodeGen/RegisterCoalescer.cpp
+++ b/llvm/lib/CodeGen/RegisterCoalescer.cpp
@@ -571,7 +571,7 @@ void RegisterCoalescer::getAnalysisUsage(AnalysisUsage &AU) const {
}
void RegisterCoalescer::eliminateDeadDefs() {
- SmallVector<unsigned, 8> NewRegs;
+ SmallVector<Register, 8> NewRegs;
LiveRangeEdit(nullptr, NewRegs, *MF, *LIS,
nullptr, this).eliminateDeadDefs(DeadDefs);
}
@@ -675,6 +675,12 @@ bool RegisterCoalescer::adjustCopiesBackFrom(const CoalescerPair &CP,
S.removeSegment(*SS, true);
continue;
}
+ // The subrange may have ended before FillerStart. If so, extend it.
+ if (!S.getVNInfoAt(FillerStart)) {
+ SlotIndex BBStart =
+ LIS->getMBBStartIdx(LIS->getMBBFromIndex(FillerStart));
+ S.extendInBlock(BBStart, FillerStart);
+ }
VNInfo *SubBValNo = S.getVNInfoAt(CopyIdx);
S.addSegment(LiveInterval::Segment(FillerStart, FillerEnd, SubBValNo));
VNInfo *SubValSNo = S.getVNInfoAt(AValNo->def.getPrevSlot());
@@ -1058,7 +1064,9 @@ bool RegisterCoalescer::removePartialRedundancy(const CoalescerPair &CP,
return false;
MachineBasicBlock &MBB = *CopyMI.getParent();
- if (MBB.isEHPad())
+ // If this block is the target of an invoke/inlineasm_br, moving the copy into
+ // the predecessor is trickier, and we don't handle it.
+ if (MBB.isEHPad() || MBB.isInlineAsmBrIndirectTarget())
return false;
if (MBB.pred_size() != 2)
@@ -1439,6 +1447,9 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
SlotIndex CurrIdx = LIS->getInstructionIndex(NewMI);
LaneBitmask DstMask = TRI->getSubRegIndexLaneMask(NewIdx);
bool UpdatedSubRanges = false;
+ SlotIndex DefIndex =
+ CurrIdx.getRegSlot(NewMI.getOperand(0).isEarlyClobber());
+ VNInfo::Allocator &Alloc = LIS->getVNInfoAllocator();
for (LiveInterval::SubRange &SR : DstInt.subranges()) {
if ((SR.LaneMask & DstMask).none()) {
LLVM_DEBUG(dbgs()
@@ -1449,6 +1460,14 @@ bool RegisterCoalescer::reMaterializeTrivialDef(const CoalescerPair &CP,
SR.removeValNo(RmValNo);
UpdatedSubRanges = true;
}
+ } else {
+ // We know that this lane is defined by this instruction,
+ // but at this point it may be empty because it is not used by
+ // anything. This happens when updateRegDefUses adds the missing
+ // lanes. Assign that lane a dead def so that the interferences
+ // are properly modeled.
+ if (SR.empty())
+ SR.createDeadDef(DefIndex, Alloc);
}
}
if (UpdatedSubRanges)
@@ -2412,7 +2431,7 @@ public:
/// Add foreign virtual registers to ShrinkRegs if their live range ended at
/// the erased instrs.
void eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
- SmallVectorImpl<unsigned> &ShrinkRegs,
+ SmallVectorImpl<Register> &ShrinkRegs,
LiveInterval *LI = nullptr);
/// Remove liverange defs at places where implicit defs will be removed.
@@ -2885,7 +2904,8 @@ bool JoinVals::resolveConflicts(JoinVals &Other) {
if (V.Resolution != CR_Unresolved)
continue;
LLVM_DEBUG(dbgs() << "\t\tconflict at " << printReg(Reg) << ':' << i << '@'
- << LR.getValNumInfo(i)->def << '\n');
+ << LR.getValNumInfo(i)->def
+ << ' ' << PrintLaneMask(LaneMask) << '\n');
if (SubRangeJoin)
return false;
@@ -3153,7 +3173,7 @@ void JoinVals::removeImplicitDefs() {
}
void JoinVals::eraseInstrs(SmallPtrSetImpl<MachineInstr*> &ErasedInstrs,
- SmallVectorImpl<unsigned> &ShrinkRegs,
+ SmallVectorImpl<Register> &ShrinkRegs,
LiveInterval *LI) {
for (unsigned i = 0, e = LR.getNumValNums(); i != e; ++i) {
// Get the def location before markUnused() below invalidates it.
@@ -3421,7 +3441,7 @@ bool RegisterCoalescer::joinVirtRegs(CoalescerPair &CP) {
// Erase COPY and IMPLICIT_DEF instructions. This may cause some external
// registers to require trimming.
- SmallVector<unsigned, 8> ShrinkRegs;
+ SmallVector<Register, 8> ShrinkRegs;
LHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs, &LHS);
RHSVals.eraseInstrs(ErasedInstrs, ShrinkRegs);
while (!ShrinkRegs.empty())
@@ -3470,7 +3490,7 @@ void RegisterCoalescer::buildVRegToDbgValueMap(MachineFunction &MF)
// vreg => DbgValueLoc map.
auto CloseNewDVRange = [this, &ToInsert](SlotIndex Slot) {
for (auto *X : ToInsert)
- DbgVRegToValues[X->getOperand(0).getReg()].push_back({Slot, X});
+ DbgVRegToValues[X->getDebugOperand(0).getReg()].push_back({Slot, X});
ToInsert.clear();
};
@@ -3482,8 +3502,8 @@ void RegisterCoalescer::buildVRegToDbgValueMap(MachineFunction &MF)
SlotIndex CurrentSlot = Slots.getMBBStartIdx(&MBB);
for (auto &MI : MBB) {
- if (MI.isDebugValue() && MI.getOperand(0).isReg() &&
- MI.getOperand(0).getReg().isVirtual()) {
+ if (MI.isDebugValue() && MI.getDebugOperand(0).isReg() &&
+ MI.getDebugOperand(0).getReg().isVirtual()) {
ToInsert.push_back(&MI);
} else if (!MI.isDebugInstr()) {
CurrentSlot = Slots.getInstructionIndex(MI);
@@ -3582,10 +3602,10 @@ void RegisterCoalescer::checkMergingChangesDbgValuesImpl(unsigned Reg,
// "Other" is live and there is a DBG_VALUE of Reg: test if we should
// set it undef.
if (DbgValueSetIt->first >= SegmentIt->start &&
- DbgValueSetIt->second->getOperand(0).getReg() != 0 &&
+ DbgValueSetIt->second->getDebugOperand(0).getReg() != 0 &&
ShouldUndef(DbgValueSetIt->first)) {
// Mark undef, erase record of this DBG_VALUE to avoid revisiting.
- DbgValueSetIt->second->getOperand(0).setReg(0);
+ DbgValueSetIt->second->setDebugValueUndef();
continue;
}
++DbgValueSetIt;
@@ -3853,6 +3873,23 @@ void RegisterCoalescer::releaseMemory() {
}
bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
+ LLVM_DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
+ << "********** Function: " << fn.getName() << '\n');
+
+ // Variables changed between a setjmp and a longjmp can have undefined values
+ // after the longjmp. This behaviour can be observed if such a variable is
+ // spilled, since longjmp won't restore the value in the spill slot.
+ // RegisterCoalescer should not run in functions with a setjmp, to avoid
+ // merging such undefined variables with predictable ones.
+ //
+ // TODO: Could specifically disable coalescing registers live across setjmp
+ // calls
+ if (fn.exposesReturnsTwice()) {
+ LLVM_DEBUG(
+ dbgs() << "* Skipped as it exposes funcions that returns twice.\n");
+ return false;
+ }
+
MF = &fn;
MRI = &fn.getRegInfo();
const TargetSubtargetInfo &STI = fn.getSubtarget();
@@ -3871,9 +3908,6 @@ bool RegisterCoalescer::runOnMachineFunction(MachineFunction &fn) {
// splitting optimization.
JoinSplitEdges = EnableJoinSplits;
- LLVM_DEBUG(dbgs() << "********** SIMPLE REGISTER COALESCING **********\n"
- << "********** Function: " << MF->getName() << '\n');
-
if (VerifyCoalescing)
MF->verify(this, "Before register coalescing");
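The early-out added above rests on setjmp/longjmp semantics: only volatile-qualified locals are guaranteed to keep the value they had when longjmp was called, so a variable the coalescer merges and later spills can come back stale after the jump. A small standalone illustration of that language rule (not of the coalescer itself):

// 'unsafe' is modified after setjmp and is not volatile, so its value after
// the longjmp is indeterminate; 'safe' is volatile and must read back as 1.
#include <csetjmp>
#include <cstdio>

static std::jmp_buf Env;

static void jumpBack() { std::longjmp(Env, 1); }

int main() {
  volatile int safe = 0;
  int unsafe = 0;          // may be stale after the longjmp
  if (setjmp(Env) == 0) {
    safe = 1;
    unsafe = 1;
    jumpBack();
  }
  std::printf("safe=%d unsafe=%d\n", safe, unsafe);
  return 0;
}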
diff --git a/llvm/lib/CodeGen/RegisterPressure.cpp b/llvm/lib/CodeGen/RegisterPressure.cpp
index bf192d1c530d..ecbc4ed63ef6 100644
--- a/llvm/lib/CodeGen/RegisterPressure.cpp
+++ b/llvm/lib/CodeGen/RegisterPressure.cpp
@@ -858,7 +858,7 @@ void RegPressureTracker::recedeSkipDebugValues() {
static_cast<RegionPressure&>(P).openTop(CurrPos);
// Find the previous instruction.
- CurrPos = skipDebugInstructionsBackward(std::prev(CurrPos), MBB->begin());
+ CurrPos = prev_nodbg(CurrPos, MBB->begin());
SlotIndex SlotIdx;
if (RequireIntervals && !CurrPos->isDebugInstr())
@@ -940,7 +940,7 @@ void RegPressureTracker::advance(const RegisterOperands &RegOpers) {
bumpDeadDefs(RegOpers.DeadDefs);
// Find the next instruction.
- CurrPos = skipDebugInstructionsForward(std::next(CurrPos), MBB->end());
+ CurrPos = next_nodbg(CurrPos, MBB->end());
}
void RegPressureTracker::advance() {
diff --git a/llvm/lib/CodeGen/RegisterScavenging.cpp b/llvm/lib/CodeGen/RegisterScavenging.cpp
index a5bea1463468..41b6de1441d7 100644
--- a/llvm/lib/CodeGen/RegisterScavenging.cpp
+++ b/llvm/lib/CodeGen/RegisterScavenging.cpp
@@ -466,7 +466,7 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
const MachineFunction &MF = *Before->getMF();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned NeedSize = TRI->getSpillSize(RC);
- unsigned NeedAlign = TRI->getSpillAlignment(RC);
+ Align NeedAlign = TRI->getSpillAlign(RC);
unsigned SI = Scavenged.size(), Diff = std::numeric_limits<unsigned>::max();
int FIB = MFI.getObjectIndexBegin(), FIE = MFI.getObjectIndexEnd();
@@ -478,7 +478,7 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
if (FI < FIB || FI >= FIE)
continue;
unsigned S = MFI.getObjectSize(FI);
- unsigned A = MFI.getObjectAlignment(FI);
+ Align A = MFI.getObjectAlign(FI);
if (NeedSize > S || NeedAlign > A)
continue;
// Avoid wasting slots with large size and/or large alignment. Pick one
@@ -487,7 +487,7 @@ RegScavenger::spill(Register Reg, const TargetRegisterClass &RC, int SPAdj,
// larger register is reserved before a slot for a smaller one. When
// trying to spill a smaller register, the large slot would be found
// first, thus making it impossible to spill the larger register later.
- unsigned D = (S-NeedSize) + (A-NeedAlign);
+ unsigned D = (S - NeedSize) + (A.value() - NeedAlign.value());
if (D < Diff) {
SI = I;
Diff = D;
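For context, the loop above picks an emergency spill slot: among frame objects that are large enough and sufficiently aligned, it keeps the one with the least combined size-plus-alignment waste, which is what (S - NeedSize) + (A.value() - NeedAlign.value()) measures now that both quantities are Aligns. A standalone sketch of that selection rule, using a toy Slot type instead of MachineFrameInfo:

// Among usable slots, prefer the least-wasteful fit, mirroring
// D = (S - NeedSize) + (A - NeedAlign) in RegScavenger::spill.
#include <cstdio>
#include <limits>
#include <vector>

struct Slot {
  unsigned Size;  // bytes
  unsigned Align; // bytes, power of two
};

static int pickSlot(const std::vector<Slot> &Slots, unsigned NeedSize,
                    unsigned NeedAlign) {
  int Best = -1;
  unsigned BestWaste = std::numeric_limits<unsigned>::max();
  for (size_t I = 0; I != Slots.size(); ++I) {
    const Slot &S = Slots[I];
    if (S.Size < NeedSize || S.Align < NeedAlign)
      continue;                                  // cannot hold the spill
    unsigned Waste = (S.Size - NeedSize) + (S.Align - NeedAlign);
    if (Waste < BestWaste) {                     // least-wasteful fit wins
      Best = static_cast<int>(I);
      BestWaste = Waste;
    }
  }
  return Best;                                   // -1: no usable slot
}

int main() {
  std::vector<Slot> Slots = {{16, 16}, {8, 8}, {4, 4}};
  std::printf("picked slot %d\n", pickSlot(Slots, 8, 8)); // prints 1
  return 0;
}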
diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp
index 8aa488e63913..55478c232dd7 100644
--- a/llvm/lib/CodeGen/SafeStack.cpp
+++ b/llvm/lib/CodeGen/SafeStack.cpp
@@ -14,10 +14,10 @@
//
//===----------------------------------------------------------------------===//
-#include "SafeStackColoring.h"
#include "SafeStackLayout.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -27,13 +27,13 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
@@ -96,6 +96,10 @@ static cl::opt<bool>
SafeStackUsePointerAddress("safestack-use-pointer-address",
cl::init(false), cl::Hidden);
+// Disabled by default due to PR32143.
+static cl::opt<bool> ClColoring("safe-stack-coloring",
+ cl::desc("enable safe stack coloring"),
+ cl::Hidden, cl::init(false));
namespace {
@@ -200,7 +204,7 @@ class SafeStack {
bool IsAccessSafe(Value *Addr, uint64_t Size, const Value *AllocaPtr,
uint64_t AllocaSize);
- bool ShouldInlinePointerAddress(CallSite &CS);
+ bool ShouldInlinePointerAddress(CallInst &CI);
void TryInlinePointerAddress();
public:
@@ -322,7 +326,7 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
case Instruction::Call:
case Instruction::Invoke: {
- ImmutableCallSite CS(I);
+ const CallBase &CS = *cast<CallBase>(I);
if (I->isLifetimeStartOrEnd())
continue;
@@ -344,8 +348,8 @@ bool SafeStack::IsSafeStackAlloca(const Value *AllocaPtr, uint64_t AllocaSize) {
// FIXME: a more precise solution would require an interprocedural
// analysis here, which would look at all uses of an argument inside
// the function being called.
- ImmutableCallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end();
- for (ImmutableCallSite::arg_iterator A = B; A != E; ++A)
+ auto B = CS.arg_begin(), E = CS.arg_end();
+ for (auto A = B; A != E; ++A)
if (A->get() == V)
if (!(CS.doesNotCapture(A - B) && (CS.doesNotAccessMemory(A - B) ||
CS.doesNotAccessMemory()))) {
@@ -493,9 +497,18 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
DIBuilder DIB(*F.getParent());
- StackColoring SSC(F, StaticAllocas);
- SSC.run();
- SSC.removeAllMarkers();
+ StackLifetime SSC(F, StaticAllocas, StackLifetime::LivenessType::May);
+ static const StackLifetime::LiveRange NoColoringRange(1, true);
+ if (ClColoring)
+ SSC.run();
+
+ for (auto *I : SSC.getMarkers()) {
+ auto *Op = dyn_cast<Instruction>(I->getOperand(1));
+ const_cast<IntrinsicInst *>(I)->eraseFromParent();
+ // Remove the operand bitcast, too, if it has no more uses left.
+ if (Op && Op->use_empty())
+ Op->eraseFromParent();
+ }
// Unsafe stack always grows down.
StackLayout SSL(StackAlignment);
@@ -529,7 +542,8 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
unsigned Align =
std::max((unsigned)DL.getPrefTypeAlignment(Ty), AI->getAlignment());
- SSL.addObject(AI, Size, Align, SSC.getLiveRange(AI));
+ SSL.addObject(AI, Size, Align,
+ ClColoring ? SSC.getLiveRange(AI) : NoColoringRange);
}
SSL.computeLayout();
@@ -576,8 +590,8 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
Arg->getName() + ".unsafe-byval");
// Replace alloc with the new location.
- replaceDbgDeclare(Arg, BasePointer, BasePointer->getNextNode(), DIB,
- DIExpression::ApplyOffset, -Offset);
+ replaceDbgDeclare(Arg, BasePointer, DIB, DIExpression::ApplyOffset,
+ -Offset);
Arg->replaceAllUsesWith(NewArg);
IRB.SetInsertPoint(cast<Instruction>(NewArg)->getNextNode());
IRB.CreateMemCpy(Off, Align, Arg, Arg->getParamAlign(), Size);
@@ -588,8 +602,7 @@ Value *SafeStack::moveStaticAllocasToUnsafeStack(
IRB.SetInsertPoint(AI);
unsigned Offset = SSL.getObjectOffset(AI);
- replaceDbgDeclareForAlloca(AI, BasePointer, DIB, DIExpression::ApplyOffset,
- -Offset);
+ replaceDbgDeclare(AI, BasePointer, DIB, DIExpression::ApplyOffset, -Offset);
replaceDbgValueForAlloca(AI, BasePointer, DIB, -Offset);
// Replace uses of the alloca with the new location.
@@ -676,7 +689,7 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
if (AI->hasName() && isa<Instruction>(NewAI))
NewAI->takeName(AI);
- replaceDbgDeclareForAlloca(AI, NewAI, DIB, DIExpression::ApplyOffset, 0);
+ replaceDbgDeclare(AI, NewAI, DIB, DIExpression::ApplyOffset, 0);
AI->replaceAllUsesWith(NewAI);
AI->eraseFromParent();
}
@@ -706,33 +719,34 @@ void SafeStack::moveDynamicAllocasToUnsafeStack(
}
}
-bool SafeStack::ShouldInlinePointerAddress(CallSite &CS) {
- Function *Callee = CS.getCalledFunction();
- if (CS.hasFnAttr(Attribute::AlwaysInline) && isInlineViable(*Callee))
+bool SafeStack::ShouldInlinePointerAddress(CallInst &CI) {
+ Function *Callee = CI.getCalledFunction();
+ if (CI.hasFnAttr(Attribute::AlwaysInline) &&
+ isInlineViable(*Callee).isSuccess())
return true;
if (Callee->isInterposable() || Callee->hasFnAttribute(Attribute::NoInline) ||
- CS.isNoInline())
+ CI.isNoInline())
return false;
return true;
}
void SafeStack::TryInlinePointerAddress() {
- if (!isa<CallInst>(UnsafeStackPtr))
+ auto *CI = dyn_cast<CallInst>(UnsafeStackPtr);
+ if (!CI)
return;
if(F.hasOptNone())
return;
- CallSite CS(UnsafeStackPtr);
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CI->getCalledFunction();
if (!Callee || Callee->isDeclaration())
return;
- if (!ShouldInlinePointerAddress(CS))
+ if (!ShouldInlinePointerAddress(*CI))
return;
InlineFunctionInfo IFI;
- InlineFunction(CS, IFI);
+ InlineFunction(*CI, IFI);
}
bool SafeStack::run() {
diff --git a/llvm/lib/CodeGen/SafeStackColoring.h b/llvm/lib/CodeGen/SafeStackColoring.h
deleted file mode 100644
index b696b1b6baed..000000000000
--- a/llvm/lib/CodeGen/SafeStackColoring.h
+++ /dev/null
@@ -1,165 +0,0 @@
-//===- SafeStackColoring.h - SafeStack frame coloring ----------*- C++ -*--===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
-#define LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
-
-#include "llvm/ADT/ArrayRef.h"
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/Support/raw_ostream.h"
-#include <cassert>
-#include <utility>
-
-namespace llvm {
-
-class BasicBlock;
-class Function;
-class Instruction;
-
-namespace safestack {
-
-/// Compute live ranges of allocas.
-/// Live ranges are represented as sets of "interesting" instructions, which are
-/// defined as instructions that may start or end an alloca's lifetime. These
-/// are:
-/// * lifetime.start and lifetime.end intrinsics
-/// * first instruction of any basic block
-/// Interesting instructions are numbered in the depth-first walk of the CFG,
-/// and in the program order inside each basic block.
-class StackColoring {
- /// A class representing liveness information for a single basic block.
- /// Each bit in the BitVector represents the liveness property
- /// for a different stack slot.
- struct BlockLifetimeInfo {
- /// Which slots BEGINs in each basic block.
- BitVector Begin;
-
- /// Which slots ENDs in each basic block.
- BitVector End;
-
- /// Which slots are marked as LIVE_IN, coming into each basic block.
- BitVector LiveIn;
-
- /// Which slots are marked as LIVE_OUT, coming out of each basic block.
- BitVector LiveOut;
- };
-
-public:
- /// This class represents a set of interesting instructions where an alloca is
- /// live.
- struct LiveRange {
- BitVector bv;
-
- void SetMaximum(int size) { bv.resize(size); }
- void AddRange(unsigned start, unsigned end) { bv.set(start, end); }
-
- bool Overlaps(const LiveRange &Other) const {
- return bv.anyCommon(Other.bv);
- }
-
- void Join(const LiveRange &Other) { bv |= Other.bv; }
- };
-
-private:
- Function &F;
-
- /// Maps active slots (per bit) for each basic block.
- using LivenessMap = DenseMap<BasicBlock *, BlockLifetimeInfo>;
- LivenessMap BlockLiveness;
-
- /// Number of interesting instructions.
- int NumInst = -1;
-
- /// Numeric ids for interesting instructions.
- DenseMap<Instruction *, unsigned> InstructionNumbering;
-
- /// A range [Start, End) of instruction ids for each basic block.
- /// Instructions inside each BB have monotonic and consecutive ids.
- DenseMap<const BasicBlock *, std::pair<unsigned, unsigned>> BlockInstRange;
-
- ArrayRef<AllocaInst *> Allocas;
- unsigned NumAllocas;
- DenseMap<AllocaInst *, unsigned> AllocaNumbering;
-
- /// LiveRange for allocas.
- SmallVector<LiveRange, 8> LiveRanges;
-
- /// The set of allocas that have at least one lifetime.start. All other
- /// allocas get LiveRange that corresponds to the entire function.
- BitVector InterestingAllocas;
- SmallVector<Instruction *, 8> Markers;
-
- struct Marker {
- unsigned AllocaNo;
- bool IsStart;
- };
-
- /// List of {InstNo, {AllocaNo, IsStart}} for each BB, ordered by InstNo.
- DenseMap<BasicBlock *, SmallVector<std::pair<unsigned, Marker>, 4>> BBMarkers;
-
- void dumpAllocas();
- void dumpBlockLiveness();
- void dumpLiveRanges();
-
- bool readMarker(Instruction *I, bool *IsStart);
- void collectMarkers();
- void calculateLocalLiveness();
- void calculateLiveIntervals();
-
-public:
- StackColoring(Function &F, ArrayRef<AllocaInst *> Allocas)
- : F(F), Allocas(Allocas), NumAllocas(Allocas.size()) {}
-
- void run();
- void removeAllMarkers();
-
- /// Returns a set of "interesting" instructions where the given alloca is
- /// live. Not all instructions in a function are interesting: we pick a set
- /// that is large enough for LiveRange::Overlaps to be correct.
- const LiveRange &getLiveRange(AllocaInst *AI);
-
- /// Returns a live range that represents an alloca that is live throughout the
- /// entire function.
- LiveRange getFullLiveRange() {
- assert(NumInst >= 0);
- LiveRange R;
- R.SetMaximum(NumInst);
- R.AddRange(0, NumInst);
- return R;
- }
-};
-
-static inline raw_ostream &operator<<(raw_ostream &OS, const BitVector &V) {
- OS << "{";
- int idx = V.find_first();
- bool first = true;
- while (idx >= 0) {
- if (!first) {
- OS << ", ";
- }
- first = false;
- OS << idx;
- idx = V.find_next(idx);
- }
- OS << "}";
- return OS;
-}
-
-static inline raw_ostream &operator<<(raw_ostream &OS,
- const StackColoring::LiveRange &R) {
- return OS << R.bv;
-}
-
-} // end namespace safestack
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_CODEGEN_SAFESTACKCOLORING_H
diff --git a/llvm/lib/CodeGen/SafeStackLayout.cpp b/llvm/lib/CodeGen/SafeStackLayout.cpp
index 09964866e4d3..c823454f825c 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.cpp
+++ b/llvm/lib/CodeGen/SafeStackLayout.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "SafeStackLayout.h"
-#include "SafeStackColoring.h"
+#include "llvm/Analysis/StackLifetime.h"
#include "llvm/IR/Value.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
@@ -39,7 +39,7 @@ LLVM_DUMP_METHOD void StackLayout::print(raw_ostream &OS) {
}
void StackLayout::addObject(const Value *V, unsigned Size, unsigned Alignment,
- const StackColoring::LiveRange &Range) {
+ const StackLifetime::LiveRange &Range) {
StackObjects.push_back({V, Size, Alignment, Range});
ObjectAlignments[V] = Alignment;
MaxAlignment = std::max(MaxAlignment, Alignment);
@@ -76,7 +76,7 @@ void StackLayout::layoutObject(StackObject &Obj) {
LLVM_DEBUG(dbgs() << " Does not intersect, skip.\n");
continue;
}
- if (Obj.Range.Overlaps(R.Range)) {
+ if (Obj.Range.overlaps(R.Range)) {
// Find the next appropriate location.
Start = AdjustStackOffset(R.End, Obj.Size, Obj.Alignment);
End = Start + Obj.Size;
@@ -96,7 +96,7 @@ void StackLayout::layoutObject(StackObject &Obj) {
if (Start > LastRegionEnd) {
LLVM_DEBUG(dbgs() << " Creating gap region: " << LastRegionEnd << " .. "
<< Start << "\n");
- Regions.emplace_back(LastRegionEnd, Start, StackColoring::LiveRange());
+ Regions.emplace_back(LastRegionEnd, Start, StackLifetime::LiveRange(0));
LastRegionEnd = Start;
}
LLVM_DEBUG(dbgs() << " Creating new region: " << LastRegionEnd << " .. "
@@ -125,7 +125,7 @@ void StackLayout::layoutObject(StackObject &Obj) {
// Update live ranges for all affected regions.
for (StackRegion &R : Regions) {
if (Start < R.End && End > R.Start)
- R.Range.Join(Obj.Range);
+ R.Range.join(Obj.Range);
if (End <= R.End)
break;
}
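The SafeStackLayout changes are mechanical renames to the StackLifetime API (Overlaps to overlaps, Join to join), but the packing rule they serve is unchanged: two allocas may share unsafe-stack space only if their live ranges never overlap. A toy sketch of that rule with plain bitsets, not the StackLifetime class:

// Live ranges as bitsets over instruction numbers; two stack objects may
// share a slot only when their ranges have no common bit.
#include <bitset>
#include <cassert>

using LiveRange = std::bitset<64>;

static bool canShareSlot(const LiveRange &A, const LiveRange &B) {
  return (A & B).none();          // i.e. LiveRange::overlaps() would be false
}

int main() {
  LiveRange A, B, C;
  A.set(0).set(1);                // alloca A live at instructions 0 and 1
  B.set(2).set(3);                // alloca B live at instructions 2 and 3
  C.set(1).set(2);                // alloca C overlaps both
  assert(canShareSlot(A, B) && !canShareSlot(A, C) && !canShareSlot(B, C));
  return 0;
}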
diff --git a/llvm/lib/CodeGen/SafeStackLayout.h b/llvm/lib/CodeGen/SafeStackLayout.h
index 349d9a8b595c..f0db1b42aa00 100644
--- a/llvm/lib/CodeGen/SafeStackLayout.h
+++ b/llvm/lib/CodeGen/SafeStackLayout.h
@@ -9,9 +9,9 @@
#ifndef LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
#define LLVM_LIB_CODEGEN_SAFESTACKLAYOUT_H
-#include "SafeStackColoring.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/StackLifetime.h"
namespace llvm {
@@ -27,10 +27,10 @@ class StackLayout {
struct StackRegion {
unsigned Start;
unsigned End;
- StackColoring::LiveRange Range;
+ StackLifetime::LiveRange Range;
StackRegion(unsigned Start, unsigned End,
- const StackColoring::LiveRange &Range)
+ const StackLifetime::LiveRange &Range)
: Start(Start), End(End), Range(Range) {}
};
@@ -40,7 +40,7 @@ class StackLayout {
struct StackObject {
const Value *Handle;
unsigned Size, Alignment;
- StackColoring::LiveRange Range;
+ StackLifetime::LiveRange Range;
};
SmallVector<StackObject, 8> StackObjects;
@@ -56,7 +56,7 @@ public:
/// Add an object to the stack frame. Value pointer is opaque and used as a
/// handle to retrieve the object's offset in the frame later.
void addObject(const Value *V, unsigned Size, unsigned Alignment,
- const StackColoring::LiveRange &Range);
+ const StackLifetime::LiveRange &Range);
/// Run the layout computation for all previously added objects.
void computeLayout();
diff --git a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
index ee72de67d875..c93b29617438 100644
--- a/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
+++ b/llvm/lib/CodeGen/ScalarizeMaskedMemIntrin.cpp
@@ -43,6 +43,7 @@ namespace {
class ScalarizeMaskedMemIntrin : public FunctionPass {
const TargetTransformInfo *TTI = nullptr;
+ const DataLayout *DL = nullptr;
public:
static char ID; // Pass identification, replacement for typeid
@@ -82,7 +83,7 @@ static bool isConstantIntVector(Value *Mask) {
if (!C)
return false;
- unsigned NumElts = Mask->getType()->getVectorNumElements();
+ unsigned NumElts = cast<FixedVectorType>(Mask->getType())->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *CElt = C->getAggregateElement(i);
if (!CElt || !isa<ConstantInt>(CElt))
@@ -130,8 +131,8 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
Value *Mask = CI->getArgOperand(2);
Value *Src0 = CI->getArgOperand(3);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = cast<VectorType>(CI->getType());
+ const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ VectorType *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
@@ -151,12 +152,13 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
}
// Adjust alignment for the scalar instruction.
- AlignVal = MinAlign(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
+ const Align AdjustedAlignVal =
+ commonAlignment(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
// Bitcast %addr from i8* to EltTy*
Type *NewPtrType =
EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
+ unsigned VectorWidth = cast<FixedVectorType>(VecType)->getNumElements();
// The result vector
Value *VResult = Src0;
@@ -166,7 +168,7 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
continue;
Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
- LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AlignVal);
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AdjustedAlignVal);
VResult = Builder.CreateInsertElement(VResult, Load, Idx);
}
CI->replaceAllUsesWith(VResult);
@@ -210,7 +212,7 @@ static void scalarizeMaskedLoad(CallInst *CI, bool &ModifiedDT) {
Builder.SetInsertPoint(InsertPt);
Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
- LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AlignVal);
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Gep, AdjustedAlignVal);
Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
// Create "else" block, fill it in the next iteration
@@ -268,8 +270,8 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
Value *Alignment = CI->getArgOperand(2);
Value *Mask = CI->getArgOperand(3);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- VectorType *VecType = cast<VectorType>(Src->getType());
+ const Align AlignVal = cast<ConstantInt>(Alignment)->getAlignValue();
+ auto *VecType = cast<VectorType>(Src->getType());
Type *EltTy = VecType->getElementType();
@@ -287,12 +289,13 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
}
// Adjust alignment for the scalar instruction.
- AlignVal = MinAlign(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
+ const Align AdjustedAlignVal =
+ commonAlignment(AlignVal, EltTy->getPrimitiveSizeInBits() / 8);
// Bitcast %addr from i8* to EltTy*
Type *NewPtrType =
EltTy->getPointerTo(Ptr->getType()->getPointerAddressSpace());
Value *FirstEltPtr = Builder.CreateBitCast(Ptr, NewPtrType);
- unsigned VectorWidth = VecType->getNumElements();
+ unsigned VectorWidth = cast<FixedVectorType>(VecType)->getNumElements();
if (isConstantIntVector(Mask)) {
for (unsigned Idx = 0; Idx < VectorWidth; ++Idx) {
@@ -300,7 +303,7 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
continue;
Value *OneElt = Builder.CreateExtractElement(Src, Idx);
Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ Builder.CreateAlignedStore(OneElt, Gep, AdjustedAlignVal);
}
CI->eraseFromParent();
return;
@@ -342,7 +345,7 @@ static void scalarizeMaskedStore(CallInst *CI, bool &ModifiedDT) {
Value *OneElt = Builder.CreateExtractElement(Src, Idx);
Value *Gep = Builder.CreateConstInBoundsGEP1_32(EltTy, FirstEltPtr, Idx);
- Builder.CreateAlignedStore(OneElt, Gep, AlignVal);
+ Builder.CreateAlignedStore(OneElt, Gep, AdjustedAlignVal);
// Create "else" block, fill it in the next iteration
BasicBlock *NewIfBlock =
@@ -393,14 +396,14 @@ static void scalarizeMaskedGather(CallInst *CI, bool &ModifiedDT) {
Value *Mask = CI->getArgOperand(2);
Value *Src0 = CI->getArgOperand(3);
- VectorType *VecType = cast<VectorType>(CI->getType());
+ auto *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
IRBuilder<> Builder(CI->getContext());
Instruction *InsertPt = CI;
BasicBlock *IfBlock = CI->getParent();
Builder.SetInsertPoint(InsertPt);
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
+ MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
@@ -517,11 +520,12 @@ static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) {
Value *Alignment = CI->getArgOperand(2);
Value *Mask = CI->getArgOperand(3);
- assert(isa<VectorType>(Src->getType()) &&
- "Unexpected data type in masked scatter intrinsic");
- assert(isa<VectorType>(Ptrs->getType()) &&
- isa<PointerType>(Ptrs->getType()->getVectorElementType()) &&
- "Vector of pointers is expected in masked scatter intrinsic");
+ auto *SrcFVTy = cast<FixedVectorType>(Src->getType());
+
+ assert(
+ isa<VectorType>(Ptrs->getType()) &&
+ isa<PointerType>(cast<VectorType>(Ptrs->getType())->getElementType()) &&
+ "Vector of pointers is expected in masked scatter intrinsic");
IRBuilder<> Builder(CI->getContext());
Instruction *InsertPt = CI;
@@ -529,8 +533,8 @@ static void scalarizeMaskedScatter(CallInst *CI, bool &ModifiedDT) {
Builder.SetInsertPoint(InsertPt);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
- unsigned AlignVal = cast<ConstantInt>(Alignment)->getZExtValue();
- unsigned VectorWidth = Src->getType()->getVectorNumElements();
+ MaybeAlign AlignVal = cast<ConstantInt>(Alignment)->getMaybeAlignValue();
+ unsigned VectorWidth = SrcFVTy->getNumElements();
// Shorten the way if the mask is a vector of constants.
if (isConstantIntVector(Mask)) {
@@ -601,7 +605,7 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
Value *Mask = CI->getArgOperand(1);
Value *PassThru = CI->getArgOperand(2);
- VectorType *VecType = cast<VectorType>(CI->getType());
+ auto *VecType = cast<FixedVectorType>(CI->getType());
Type *EltTy = VecType->getElementType();
@@ -624,8 +628,8 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
if (cast<Constant>(Mask)->getAggregateElement(Idx)->isNullValue())
continue;
Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
- LoadInst *Load =
- Builder.CreateAlignedLoad(EltTy, NewPtr, 1, "Load" + Twine(Idx));
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, NewPtr, Align(1),
+ "Load" + Twine(Idx));
VResult =
Builder.CreateInsertElement(VResult, Load, Idx, "Res" + Twine(Idx));
++MemIndex;
@@ -670,7 +674,7 @@ static void scalarizeMaskedExpandLoad(CallInst *CI, bool &ModifiedDT) {
"cond.load");
Builder.SetInsertPoint(InsertPt);
- LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, 1);
+ LoadInst *Load = Builder.CreateAlignedLoad(EltTy, Ptr, Align(1));
Value *NewVResult = Builder.CreateInsertElement(VResult, Load, Idx);
// Move the pointer if there are more blocks to come.
@@ -714,7 +718,7 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
Value *Ptr = CI->getArgOperand(1);
Value *Mask = CI->getArgOperand(2);
- VectorType *VecType = cast<VectorType>(Src->getType());
+ auto *VecType = cast<FixedVectorType>(Src->getType());
IRBuilder<> Builder(CI->getContext());
Instruction *InsertPt = CI;
@@ -723,7 +727,7 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
Builder.SetInsertPoint(InsertPt);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
- Type *EltTy = VecType->getVectorElementType();
+ Type *EltTy = VecType->getElementType();
unsigned VectorWidth = VecType->getNumElements();
@@ -736,7 +740,7 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
Value *OneElt =
Builder.CreateExtractElement(Src, Idx, "Elt" + Twine(Idx));
Value *NewPtr = Builder.CreateConstInBoundsGEP1_32(EltTy, Ptr, MemIndex);
- Builder.CreateAlignedStore(OneElt, NewPtr, 1);
+ Builder.CreateAlignedStore(OneElt, NewPtr, Align(1));
++MemIndex;
}
CI->eraseFromParent();
@@ -777,7 +781,7 @@ static void scalarizeMaskedCompressStore(CallInst *CI, bool &ModifiedDT) {
Builder.SetInsertPoint(InsertPt);
Value *OneElt = Builder.CreateExtractElement(Src, Idx);
- Builder.CreateAlignedStore(OneElt, Ptr, 1);
+ Builder.CreateAlignedStore(OneElt, Ptr, Align(1));
// Move the pointer if there are more blocks to come.
Value *NewPtr;
@@ -811,6 +815,7 @@ bool ScalarizeMaskedMemIntrin::runOnFunction(Function &F) {
bool EverMadeChange = false;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DL = &F.getParent()->getDataLayout();
bool MadeChange = true;
while (MadeChange) {
@@ -849,39 +854,46 @@ bool ScalarizeMaskedMemIntrin::optimizeCallInst(CallInst *CI,
bool &ModifiedDT) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
if (II) {
- unsigned Alignment;
switch (II->getIntrinsicID()) {
default:
break;
- case Intrinsic::masked_load: {
+ case Intrinsic::masked_load:
// Scalarize unsupported vector masked load
- Alignment = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- if (TTI->isLegalMaskedLoad(CI->getType(), MaybeAlign(Alignment)))
+ if (TTI->isLegalMaskedLoad(
+ CI->getType(),
+ cast<ConstantInt>(CI->getArgOperand(1))->getAlignValue()))
return false;
scalarizeMaskedLoad(CI, ModifiedDT);
return true;
- }
- case Intrinsic::masked_store: {
- Alignment = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
- if (TTI->isLegalMaskedStore(CI->getArgOperand(0)->getType(),
- MaybeAlign(Alignment)))
+ case Intrinsic::masked_store:
+ if (TTI->isLegalMaskedStore(
+ CI->getArgOperand(0)->getType(),
+ cast<ConstantInt>(CI->getArgOperand(2))->getAlignValue()))
return false;
scalarizeMaskedStore(CI, ModifiedDT);
return true;
- }
- case Intrinsic::masked_gather:
- Alignment = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- if (TTI->isLegalMaskedGather(CI->getType(), MaybeAlign(Alignment)))
+ case Intrinsic::masked_gather: {
+ unsigned AlignmentInt =
+ cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
+ Type *LoadTy = CI->getType();
+ Align Alignment =
+ DL->getValueOrABITypeAlignment(MaybeAlign(AlignmentInt), LoadTy);
+ if (TTI->isLegalMaskedGather(LoadTy, Alignment))
return false;
scalarizeMaskedGather(CI, ModifiedDT);
return true;
- case Intrinsic::masked_scatter:
- Alignment = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
- if (TTI->isLegalMaskedScatter(CI->getArgOperand(0)->getType(),
- MaybeAlign(Alignment)))
+ }
+ case Intrinsic::masked_scatter: {
+ unsigned AlignmentInt =
+ cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
+ Type *StoreTy = CI->getArgOperand(0)->getType();
+ Align Alignment =
+ DL->getValueOrABITypeAlignment(MaybeAlign(AlignmentInt), StoreTy);
+ if (TTI->isLegalMaskedScatter(StoreTy, Alignment))
return false;
scalarizeMaskedScatter(CI, ModifiedDT);
return true;
+ }
case Intrinsic::masked_expandload:
if (TTI->isLegalMaskedExpandLoad(CI->getType()))
return false;
diff --git a/llvm/lib/CodeGen/ScheduleDAG.cpp b/llvm/lib/CodeGen/ScheduleDAG.cpp
index dc3a11670a16..60f8eec1b9bc 100644
--- a/llvm/lib/CodeGen/ScheduleDAG.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAG.cpp
@@ -713,6 +713,14 @@ bool ScheduleDAGTopologicalSort::WillCreateCycle(SUnit *TargetSU, SUnit *SU) {
return false;
}
+void ScheduleDAGTopologicalSort::AddSUnitWithoutPredecessors(const SUnit *SU) {
+ assert(SU->NodeNum == Index2Node.size() && "Node cannot be added at the end");
+ assert(SU->NumPreds == 0 && "Can only add SU's with no predecessors");
+ Node2Index.push_back(Index2Node.size());
+ Index2Node.push_back(SU->NodeNum);
+ Visited.resize(Node2Index.size());
+}
+
bool ScheduleDAGTopologicalSort::IsReachable(const SUnit *SU,
const SUnit *TargetSU) {
FixOrder();
diff --git a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
index d11406cc330f..10da2d421797 100644
--- a/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGInstrs.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/SparseSet.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/LivePhysRegs.h"
@@ -269,13 +270,13 @@ void ScheduleDAGInstrs::addPhysRegDataDeps(SUnit *SU, unsigned OperIdx) {
if (!ImplicitPseudoDef && !ImplicitPseudoUse) {
Dep.setLatency(SchedModel.computeOperandLatency(SU->getInstr(), OperIdx,
RegUse, UseOp));
- ST.adjustSchedDependency(SU, UseSU, Dep);
+ ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
} else {
Dep.setLatency(0);
// FIXME: We could always let target to adjustSchedDependency(), and
// remove this condition, but that currently asserts in Hexagon BE.
if (SU->getInstr()->isBundle() || (RegUse && RegUse->isBundle()))
- ST.adjustSchedDependency(SU, UseSU, Dep);
+ ST.adjustSchedDependency(SU, OperIdx, UseSU, UseOp, Dep);
}
UseSU->addPred(Dep);
@@ -294,6 +295,8 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
if (MRI.isConstantPhysReg(Reg))
return;
+ const TargetSubtargetInfo &ST = MF.getSubtarget();
+
// Optionally add output and anti dependencies. For anti
// dependencies we use a latency of 0 because for a multi-issue
// target we want to allow the defining instruction to issue
@@ -311,14 +314,12 @@ void ScheduleDAGInstrs::addPhysRegDeps(SUnit *SU, unsigned OperIdx) {
if (DefSU != SU &&
(Kind != SDep::Output || !MO.isDead() ||
!DefSU->getInstr()->registerDefIsDead(*Alias))) {
- if (Kind == SDep::Anti)
- DefSU->addPred(SDep(SU, Kind, /*Reg=*/*Alias));
- else {
- SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ SDep Dep(SU, Kind, /*Reg=*/*Alias);
+ if (Kind != SDep::Anti)
Dep.setLatency(
SchedModel.computeOutputLatency(MI, OperIdx, DefSU->getInstr()));
- DefSU->addPred(Dep);
- }
+ ST.adjustSchedDependency(SU, OperIdx, DefSU, I->OpIdx, Dep);
+ DefSU->addPred(Dep);
}
}
}
@@ -444,7 +445,7 @@ void ScheduleDAGInstrs::addVRegDefDeps(SUnit *SU, unsigned OperIdx) {
SDep Dep(SU, SDep::Data, Reg);
Dep.setLatency(SchedModel.computeOperandLatency(MI, OperIdx, Use,
I->OperandIndex));
- ST.adjustSchedDependency(SU, UseSU, Dep);
+ ST.adjustSchedDependency(SU, OperIdx, UseSU, I->OperandIndex, Dep);
UseSU->addPred(Dep);
}
diff --git a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
index 8d04711f07c6..a113c30f851b 100644
--- a/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/ScheduleDAGPrinter.cpp
@@ -28,7 +28,7 @@ namespace llvm {
DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const ScheduleDAG *G) {
- return G->MF.getName();
+ return std::string(G->MF.getName());
}
static bool renderGraphFromBottomUp() {
diff --git a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
index a9fda56f2dac..6e05de888cc0 100644
--- a/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
+++ b/llvm/lib/CodeGen/ScoreboardHazardRecognizer.cpp
@@ -92,10 +92,11 @@ LLVM_DUMP_METHOD void ScoreboardHazardRecognizer::Scoreboard::dump() const {
last--;
for (unsigned i = 0; i <= last; i++) {
- unsigned FUs = (*this)[i];
+ InstrStage::FuncUnits FUs = (*this)[i];
dbgs() << "\t";
- for (int j = 31; j >= 0; j--)
- dbgs() << ((FUs & (1 << j)) ? '1' : '0');
+ for (int j = std::numeric_limits<InstrStage::FuncUnits>::digits - 1;
+ j >= 0; j--)
+ dbgs() << ((FUs & (1ULL << j)) ? '1' : '0');
dbgs() << '\n';
}
}
@@ -142,7 +143,7 @@ ScoreboardHazardRecognizer::getHazardType(SUnit *SU, int Stalls) {
break;
}
- unsigned freeUnits = IS->getUnits();
+ InstrStage::FuncUnits freeUnits = IS->getUnits();
switch (IS->getReservationKind()) {
case InstrStage::Required:
// Required FUs conflict with both reserved and required ones
@@ -193,7 +194,7 @@ void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
assert(((cycle + i) < RequiredScoreboard.getDepth()) &&
"Scoreboard depth exceeded!");
- unsigned freeUnits = IS->getUnits();
+ InstrStage::FuncUnits freeUnits = IS->getUnits();
switch (IS->getReservationKind()) {
case InstrStage::Required:
// Required FUs conflict with both reserved and required ones
@@ -206,7 +207,7 @@ void ScoreboardHazardRecognizer::EmitInstruction(SUnit *SU) {
}
// reduce to a single unit
- unsigned freeUnit = 0;
+ InstrStage::FuncUnits freeUnit = 0;
do {
freeUnit = freeUnits;
freeUnits = freeUnit & (freeUnit - 1);
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index e5bc08b9280a..f14b3dba4f31 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -30,6 +30,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -124,17 +125,29 @@ static cl::opt<unsigned> StoreMergeDependenceLimit(
cl::desc("Limit the number of times for the same StoreNode and RootNode "
"to bail out in store merging dependence check"));
+static cl::opt<bool> EnableReduceLoadOpStoreWidth(
+ "combiner-reduce-load-op-store-width", cl::Hidden, cl::init(true),
+ cl::desc("DAG cominber enable reducing the width of load/op/store "
+ "sequence"));
+
+static cl::opt<bool> EnableShrinkLoadReplaceStoreWithStore(
+ "combiner-shrink-load-replace-store-with-store", cl::Hidden, cl::init(true),
+ cl::desc("DAG cominber enable load/<replace bytes>/store with "
+ "a narrower store"));
+
namespace {
class DAGCombiner {
SelectionDAG &DAG;
const TargetLowering &TLI;
+ const SelectionDAGTargetInfo *STI;
CombineLevel Level;
CodeGenOpt::Level OptLevel;
bool LegalDAG = false;
bool LegalOperations = false;
bool LegalTypes = false;
bool ForCodeSize;
+ bool DisableGenericCombines;
/// Worklist of all of the nodes that need to be simplified.
///
@@ -222,9 +235,11 @@ namespace {
public:
DAGCombiner(SelectionDAG &D, AliasAnalysis *AA, CodeGenOpt::Level OL)
- : DAG(D), TLI(D.getTargetLoweringInfo()), Level(BeforeLegalizeTypes),
- OptLevel(OL), AA(AA) {
+ : DAG(D), TLI(D.getTargetLoweringInfo()),
+ STI(D.getSubtarget().getSelectionDAGInfo()),
+ Level(BeforeLegalizeTypes), OptLevel(OL), AA(AA) {
ForCodeSize = DAG.shouldOptForSize();
+ DisableGenericCombines = STI && STI->disableGenericCombines(OptLevel);
MaximumLegalStoreInBits = 0;
// We use the minimum store size here, since that's all we can guarantee
@@ -307,23 +322,34 @@ namespace {
}
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits) {
- EVT VT = Op.getValueType();
- unsigned NumElts = VT.isVector() ? VT.getVectorNumElements() : 1;
- APInt DemandedElts = APInt::getAllOnesValue(NumElts);
- return SimplifyDemandedBits(Op, DemandedBits, DemandedElts);
+ TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
+ KnownBits Known;
+ if (!TLI.SimplifyDemandedBits(Op, DemandedBits, Known, TLO, 0, false))
+ return false;
+
+ // Revisit the node.
+ AddToWorklist(Op.getNode());
+
+ CommitTargetLoweringOpt(TLO);
+ return true;
}
/// Check the specified vector node value to see if it can be simplified or
/// if things it uses can be simplified as it only uses some of the
/// elements. If so, return true.
bool SimplifyDemandedVectorElts(SDValue Op) {
+ // TODO: For now just pretend it cannot be simplified.
+ if (Op.getValueType().isScalableVector())
+ return false;
+
unsigned NumElts = Op.getValueType().getVectorNumElements();
APInt DemandedElts = APInt::getAllOnesValue(NumElts);
return SimplifyDemandedVectorElts(Op, DemandedElts);
}
bool SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
- const APInt &DemandedElts);
+ const APInt &DemandedElts,
+ bool AssumeSingleUse = false);
bool SimplifyDemandedVectorElts(SDValue Op, const APInt &DemandedElts,
bool AssumeSingleUse = false);
@@ -429,11 +455,13 @@ namespace {
SDValue visitZERO_EXTEND(SDNode *N);
SDValue visitANY_EXTEND(SDNode *N);
SDValue visitAssertExt(SDNode *N);
+ SDValue visitAssertAlign(SDNode *N);
SDValue visitSIGN_EXTEND_INREG(SDNode *N);
SDValue visitSIGN_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitZERO_EXTEND_VECTOR_INREG(SDNode *N);
SDValue visitTRUNCATE(SDNode *N);
SDValue visitBITCAST(SDNode *N);
+ SDValue visitFREEZE(SDNode *N);
SDValue visitBUILD_PAIR(SDNode *N);
SDValue visitFADD(SDNode *N);
SDValue visitFSUB(SDNode *N);
@@ -522,9 +550,8 @@ namespace {
SDValue rebuildSetCC(SDValue N);
bool isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
- SDValue &CC) const;
+ SDValue &CC, bool MatchStrict = false) const;
bool isOneUseSetCC(SDValue N) const;
- bool isCheaperToUseNegatedFPOps(SDValue X, SDValue Y);
SDValue SimplifyNodeWithTwoResults(SDNode *N, unsigned LoOp,
unsigned HiOp);
@@ -553,6 +580,10 @@ namespace {
SDValue InnerPos, SDValue InnerNeg,
unsigned PosOpcode, unsigned NegOpcode,
const SDLoc &DL);
+ SDValue MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos, SDValue Neg,
+ SDValue InnerPos, SDValue InnerNeg,
+ unsigned PosOpcode, unsigned NegOpcode,
+ const SDLoc &DL);
SDValue MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL);
SDValue MatchLoadCombine(SDNode *N);
SDValue MatchStoreCombine(StoreSDNode *N);
@@ -562,6 +593,7 @@ namespace {
SDValue TransformFPLoadStorePair(SDNode *N);
SDValue convertBuildVecZextToZext(SDNode *N);
SDValue reduceBuildVecExtToExtBuildVec(SDNode *N);
+ SDValue reduceBuildVecTruncToBitCast(SDNode *N);
SDValue reduceBuildVecToShuffle(SDNode *N);
SDValue createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask, SDValue VecIn1,
@@ -606,6 +638,19 @@ namespace {
: MemNode(N), OffsetFromBase(Offset) {}
};
+ // Classify the origin of a stored value.
+ enum class StoreSource { Unknown, Constant, Extract, Load };
+ StoreSource getStoreSource(SDValue StoreVal) {
+ if (isa<ConstantSDNode>(StoreVal) || isa<ConstantFPSDNode>(StoreVal))
+ return StoreSource::Constant;
+ if (StoreVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ StoreVal.getOpcode() == ISD::EXTRACT_SUBVECTOR)
+ return StoreSource::Extract;
+ if (isa<LoadSDNode>(StoreVal))
+ return StoreSource::Load;
+ return StoreSource::Unknown;
+ }
+
/// This is a helper function for visitMUL to check the profitability
/// of folding (mul (add x, c1), c2) -> (add (mul x, c2), c1*c2).
/// MulNode is the original multiply, AddNode is (add x, c1),
@@ -633,43 +678,66 @@ namespace {
/// can be combined into narrow loads.
bool BackwardsPropagateMask(SDNode *N);
- /// Helper function for MergeConsecutiveStores which merges the
- /// component store chains.
+ /// Helper function for mergeConsecutiveStores which merges the component
+ /// store chains.
SDValue getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
unsigned NumStores);
- /// This is a helper function for MergeConsecutiveStores. When the
- /// source elements of the consecutive stores are all constants or
- /// all extracted vector elements, try to merge them into one
- /// larger store introducing bitcasts if necessary. \return True
- /// if a merged store was created.
- bool MergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ /// This is a helper function for mergeConsecutiveStores. When the source
+ /// elements of the consecutive stores are all constants or all extracted
+ /// vector elements, try to merge them into one larger store introducing
+ /// bitcasts if necessary. \return True if a merged store was created.
+ bool mergeStoresOfConstantsOrVecElts(SmallVectorImpl<MemOpLink> &StoreNodes,
EVT MemVT, unsigned NumStores,
bool IsConstantSrc, bool UseVector,
bool UseTrunc);
- /// This is a helper function for MergeConsecutiveStores. Stores
- /// that potentially may be merged with St are placed in
- /// StoreNodes. RootNode is a chain predecessor to all store
- /// candidates.
+ /// This is a helper function for mergeConsecutiveStores. Stores that
+ /// potentially may be merged with St are placed in StoreNodes. RootNode is
+ /// a chain predecessor to all store candidates.
void getStoreMergeCandidates(StoreSDNode *St,
SmallVectorImpl<MemOpLink> &StoreNodes,
SDNode *&Root);
- /// Helper function for MergeConsecutiveStores. Checks if
- /// candidate stores have indirect dependency through their
- /// operands. RootNode is the predecessor to all stores calculated
- /// by getStoreMergeCandidates and is used to prune the dependency check.
- /// \return True if safe to merge.
+ /// Helper function for mergeConsecutiveStores. Checks if candidate stores
+ /// have indirect dependency through their operands. RootNode is the
+ /// predecessor to all stores calculated by getStoreMergeCandidates and is
+ /// used to prune the dependency check. \return True if safe to merge.
bool checkMergeStoreCandidatesForDependencies(
SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumStores,
SDNode *RootNode);
+ /// This is a helper function for mergeConsecutiveStores. Given a list of
+ /// store candidates, find the first N that are consecutive in memory.
+ /// Returns 0 if there are not at least 2 consecutive stores to try merging.
+ unsigned getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
+ int64_t ElementSizeBytes) const;
+
+ /// This is a helper function for mergeConsecutiveStores. It is used for
+ /// store chains that are composed entirely of constant values.
+ bool tryStoreMergeOfConstants(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores,
+ EVT MemVT, SDNode *Root, bool AllowVectors);
+
+ /// This is a helper function for mergeConsecutiveStores. It is used for
+ /// store chains that are composed entirely of extracted vector elements.
+ /// When extracting multiple vector elements, try to store them in one
+ /// vector store rather than a sequence of scalar stores.
+ bool tryStoreMergeOfExtracts(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores, EVT MemVT,
+ SDNode *Root);
+
+ /// This is a helper function for mergeConsecutiveStores. It is used for
+ /// store chains that are composed entirely of loaded values.
+ bool tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores, EVT MemVT,
+ SDNode *Root, bool AllowVectors,
+ bool IsNonTemporalStore, bool IsNonTemporalLoad);
+
/// Merge consecutive store operations into a wide store.
/// This optimization uses wide integers or vectors when possible.
- /// \return number of stores that were merged into a merged store (the
- /// affected nodes are stored as a prefix in \p StoreNodes).
- bool MergeConsecutiveStores(StoreSDNode *St);
+ /// \return true if stores were merged.
+ bool mergeConsecutiveStores(StoreSDNode *St);
/// Try to transform a truncation where C is a constant:
/// (trunc (and X, C)) -> (and (trunc X), (trunc C))
@@ -814,7 +882,7 @@ static void zeroExtendToMatch(APInt &LHS, APInt &RHS, unsigned Offset = 0) {
// the appropriate nodes based on the type of node we are checking. This
// simplifies life a bit for the callers.
bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
- SDValue &CC) const {
+ SDValue &CC, bool MatchStrict) const {
if (N.getOpcode() == ISD::SETCC) {
LHS = N.getOperand(0);
RHS = N.getOperand(1);
@@ -822,6 +890,15 @@ bool DAGCombiner::isSetCCEquivalent(SDValue N, SDValue &LHS, SDValue &RHS,
return true;
}
+ if (MatchStrict &&
+ (N.getOpcode() == ISD::STRICT_FSETCC ||
+ N.getOpcode() == ISD::STRICT_FSETCCS)) {
+ LHS = N.getOperand(1);
+ RHS = N.getOperand(2);
+ CC = N.getOperand(3);
+ return true;
+ }
+
if (N.getOpcode() != ISD::SELECT_CC ||
!TLI.isConstTrueVal(N.getOperand(2).getNode()) ||
!TLI.isConstFalseVal(N.getOperand(3).getNode()))
@@ -886,6 +963,13 @@ static bool isAnyConstantBuildVector(SDValue V, bool NoOpaques = false) {
ISD::isBuildVectorOfConstantFPSDNodes(V.getNode());
}
+// Determine if this is an indexed load with an opaque target constant index.
+static bool canSplitIdx(LoadSDNode *LD) {
+ return MaySplitLoadIndex &&
+ (LD->getOperand(2).getOpcode() != ISD::TargetConstant ||
+ !cast<ConstantSDNode>(LD->getOperand(2))->isOpaque());
+}
+
bool DAGCombiner::reassociationCanBreakAddressingModePattern(unsigned Opc,
const SDLoc &DL,
SDValue N0,
@@ -951,14 +1035,11 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
if (N0.getOpcode() != Opc)
return SDValue();
- // Don't reassociate reductions.
- if (N0->getFlags().hasVectorReduction())
- return SDValue();
-
- if (SDNode *C1 = DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
- if (SDNode *C2 = DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N0.getOperand(1))) {
+ if (DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
// Reassociate: (op (op x, c1), c2) -> (op x, (op c1, c2))
- if (SDValue OpNode = DAG.FoldConstantArithmetic(Opc, DL, VT, C1, C2))
+ if (SDValue OpNode =
+ DAG.FoldConstantArithmetic(Opc, DL, VT, {N0.getOperand(1), N1}))
return DAG.getNode(Opc, DL, VT, N0.getOperand(0), OpNode);
return SDValue();
}
@@ -978,9 +1059,6 @@ SDValue DAGCombiner::reassociateOpsCommutative(unsigned Opc, const SDLoc &DL,
SDValue DAGCombiner::reassociateOps(unsigned Opc, const SDLoc &DL, SDValue N0,
SDValue N1, SDNodeFlags Flags) {
assert(TLI.isCommutativeBinOp(Opc) && "Operation not commutative.");
- // Don't reassociate reductions.
- if (Flags.hasVectorReduction())
- return SDValue();
// Floating-point reassociation is not allowed without loose FP math.
if (N0.getValueType().isFloatingPoint() ||
@@ -1029,6 +1107,12 @@ SDValue DAGCombiner::CombineTo(SDNode *N, const SDValue *To, unsigned NumTo,
void DAGCombiner::
CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
+ // Replace the old value with the new one.
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
+ dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
+ dbgs() << '\n');
+
// Replace all uses. If any nodes become isomorphic to other nodes and
// are deleted, make sure to remove them from our worklist.
WorklistRemover DeadNodes(*this);
@@ -1047,21 +1131,17 @@ CommitTargetLoweringOpt(const TargetLowering::TargetLoweringOpt &TLO) {
/// Check the specified integer node value to see if it can be simplified or if
/// things it uses can be simplified by bit propagation. If so, return true.
bool DAGCombiner::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
- const APInt &DemandedElts) {
+ const APInt &DemandedElts,
+ bool AssumeSingleUse) {
TargetLowering::TargetLoweringOpt TLO(DAG, LegalTypes, LegalOperations);
KnownBits Known;
- if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO))
+ if (!TLI.SimplifyDemandedBits(Op, DemandedBits, DemandedElts, Known, TLO, 0,
+ AssumeSingleUse))
return false;
// Revisit the node.
AddToWorklist(Op.getNode());
- // Replace the old value with the new one.
- ++NodesCombined;
- LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
- dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
- dbgs() << '\n');
-
CommitTargetLoweringOpt(TLO);
return true;
}
@@ -1081,12 +1161,6 @@ bool DAGCombiner::SimplifyDemandedVectorElts(SDValue Op,
// Revisit the node.
AddToWorklist(Op.getNode());
- // Replace the old value with the new one.
- ++NodesCombined;
- LLVM_DEBUG(dbgs() << "\nReplacing.2 "; TLO.Old.getNode()->dump(&DAG);
- dbgs() << "\nWith: "; TLO.New.getNode()->dump(&DAG);
- dbgs() << '\n');
-
CommitTargetLoweringOpt(TLO);
return true;
}
@@ -1210,8 +1284,11 @@ SDValue DAGCombiner::PromoteIntBinOp(SDValue Op) {
SDValue RV =
DAG.getNode(ISD::TRUNCATE, DL, VT, DAG.getNode(Opc, DL, PVT, NN0, NN1));
- // We are always replacing N0/N1's use in N and only need
- // additional replacements if there are additional uses.
+ // We are always replacing N0/N1's use in N and only need additional
+ // replacements if there are additional uses.
+ // Note: We are checking uses of the *nodes* (SDNode) rather than values
+ // (SDValue) here because the node may reference multiple values
+ // (for example, the chain value of a load node).
Replace0 &= !N0->hasOneUse();
Replace1 &= (N0 != N1) && !N1->hasOneUse();
@@ -1561,6 +1638,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::ANY_EXTEND: return visitANY_EXTEND(N);
case ISD::AssertSext:
case ISD::AssertZext: return visitAssertExt(N);
+ case ISD::AssertAlign: return visitAssertAlign(N);
case ISD::SIGN_EXTEND_INREG: return visitSIGN_EXTEND_INREG(N);
case ISD::SIGN_EXTEND_VECTOR_INREG: return visitSIGN_EXTEND_VECTOR_INREG(N);
case ISD::ZERO_EXTEND_VECTOR_INREG: return visitZERO_EXTEND_VECTOR_INREG(N);
@@ -1610,6 +1688,7 @@ SDValue DAGCombiner::visit(SDNode *N) {
case ISD::LIFETIME_END: return visitLIFETIME_END(N);
case ISD::FP_TO_FP16: return visitFP_TO_FP16(N);
case ISD::FP16_TO_FP: return visitFP16_TO_FP(N);
+ case ISD::FREEZE: return visitFREEZE(N);
case ISD::VECREDUCE_FADD:
case ISD::VECREDUCE_FMUL:
case ISD::VECREDUCE_ADD:
@@ -1628,7 +1707,9 @@ SDValue DAGCombiner::visit(SDNode *N) {
}
SDValue DAGCombiner::combine(SDNode *N) {
- SDValue RV = visit(N);
+ SDValue RV;
+ if (!DisableGenericCombines)
+ RV = visit(N);
// If nothing happened, try a target-specific DAG combine.
if (!RV.getNode()) {
@@ -2046,12 +2127,11 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
// We need a constant operand for the add/sub, and the other operand is a
// logical shift right: add (srl), C or sub C, (srl).
- // TODO - support non-uniform vector amounts.
bool IsAdd = N->getOpcode() == ISD::ADD;
SDValue ConstantOp = IsAdd ? N->getOperand(1) : N->getOperand(0);
SDValue ShiftOp = IsAdd ? N->getOperand(0) : N->getOperand(1);
- ConstantSDNode *C = isConstOrConstSplat(ConstantOp);
- if (!C || ShiftOp.getOpcode() != ISD::SRL)
+ if (!DAG.isConstantIntBuildVectorOrConstantInt(ConstantOp) ||
+ ShiftOp.getOpcode() != ISD::SRL)
return SDValue();
// The shift must be of a 'not' value.
@@ -2072,8 +2152,11 @@ static SDValue foldAddSubOfSignBit(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
auto ShOpcode = IsAdd ? ISD::SRA : ISD::SRL;
SDValue NewShift = DAG.getNode(ShOpcode, DL, VT, Not.getOperand(0), ShAmt);
- APInt NewC = IsAdd ? C->getAPIntValue() + 1 : C->getAPIntValue() - 1;
- return DAG.getNode(ISD::ADD, DL, VT, NewShift, DAG.getConstant(NewC, DL, VT));
+ if (SDValue NewC =
+ DAG.FoldConstantArithmetic(IsAdd ? ISD::ADD : ISD::SUB, DL, VT,
+ {ConstantOp, DAG.getConstant(1, DL, VT)}))
+ return DAG.getNode(ISD::ADD, DL, VT, NewShift, NewC);
+ return SDValue();
}
/// Try to fold a node that behaves like an ADD (note that N isn't necessarily
@@ -2109,8 +2192,7 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::ADD, DL, VT, N1, N0);
// fold (add c1, c2) -> c1+c2
- return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N0.getNode(),
- N1.getNode());
+ return DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0, N1});
}
// fold (add x, 0) -> x
@@ -2121,8 +2203,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
// fold ((A-c1)+c2) -> (A+(c2-c1))
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true)) {
- SDValue Sub = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N1.getNode(),
- N0.getOperand(1).getNode());
+ SDValue Sub =
+ DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N1, N0.getOperand(1)});
assert(Sub && "Constant folding failed");
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Sub);
}
@@ -2130,8 +2212,8 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
// fold ((c1-A)+c2) -> (c1+c2)-A
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N0.getOperand(0), /* NoOpaque */ true)) {
- SDValue Add = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, N1.getNode(),
- N0.getOperand(0).getNode());
+ SDValue Add =
+ DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N1, N0.getOperand(0)});
assert(Add && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, Add, N0.getOperand(1));
}
@@ -2152,13 +2234,14 @@ SDValue DAGCombiner::visitADDLike(SDNode *N) {
}
}
- // Undo the add -> or combine to merge constant offsets from a frame index.
+ // Fold (add (or x, c0), c1) -> (add x, (c0 + c1)) if (or x, c0) is
+ // equivalent to (add x, c0).
if (N0.getOpcode() == ISD::OR &&
- isa<FrameIndexSDNode>(N0.getOperand(0)) &&
- isa<ConstantSDNode>(N0.getOperand(1)) &&
+ isConstantOrConstantVector(N0.getOperand(1), /* NoOpaque */ true) &&
DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1))) {
- SDValue Add0 = DAG.getNode(ISD::ADD, DL, VT, N1, N0.getOperand(1));
- return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
+ if (SDValue Add0 = DAG.FoldConstantArithmetic(ISD::ADD, DL, VT,
+ {N1, N0.getOperand(1)}))
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), Add0);
}
}
@@ -2317,6 +2400,23 @@ SDValue DAGCombiner::visitADD(SDNode *N) {
DAG.haveNoCommonBitsSet(N0, N1))
return DAG.getNode(ISD::OR, DL, VT, N0, N1);
+ // Fold (add (vscale * C0), (vscale * C1)) to (vscale * (C0 + C1)).
+ if (N0.getOpcode() == ISD::VSCALE && N1.getOpcode() == ISD::VSCALE) {
+ APInt C0 = N0->getConstantOperandAPInt(0);
+ APInt C1 = N1->getConstantOperandAPInt(0);
+ return DAG.getVScale(DL, VT, C0 + C1);
+ }
+
+ // fold a+vscale(c1)+vscale(c2) -> a+vscale(c1+c2)
+ if ((N0.getOpcode() == ISD::ADD) &&
+ (N0.getOperand(1).getOpcode() == ISD::VSCALE) &&
+ (N1.getOpcode() == ISD::VSCALE)) {
+ auto VS0 = N0.getOperand(1)->getConstantOperandAPInt(0);
+ auto VS1 = N1->getConstantOperandAPInt(0);
+ auto VS = DAG.getVScale(DL, VT, VS0 + VS1);
+ return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), VS);
+ }
+
return SDValue();
}
@@ -2347,8 +2447,7 @@ SDValue DAGCombiner::visitADDSAT(SDNode *N) {
if (!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(Opcode, DL, VT, N1, N0);
// fold (add_sat c1, c2) -> c3
- return DAG.FoldConstantArithmetic(Opcode, DL, VT, N0.getNode(),
- N1.getNode());
+ return DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1});
}
// fold (add_sat x, 0) -> x
@@ -2968,12 +3067,10 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
// FIXME: Refactor this and xor and other similar operations together.
if (N0 == N1)
return tryFoldToZero(DL, TLI, VT, DAG, LegalOperations);
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
- DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
- // fold (sub c1, c2) -> c1-c2
- return DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(),
- N1.getNode());
- }
+
+ // fold (sub c1, c2) -> c3
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N1}))
+ return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -3040,8 +3137,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (N0.getOpcode() == ISD::ADD &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
- SDValue NewC = DAG.FoldConstantArithmetic(
- ISD::SUB, DL, VT, N0.getOperand(1).getNode(), N1.getNode());
+ SDValue NewC =
+ DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(1), N1});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::ADD, DL, VT, N0.getOperand(0), NewC);
}
@@ -3051,8 +3148,7 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
SDValue N11 = N1.getOperand(1);
if (isConstantOrConstantVector(N0, /* NoOpaques */ true) &&
isConstantOrConstantVector(N11, /* NoOpaques */ true)) {
- SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, N0.getNode(),
- N11.getNode());
+ SDValue NewC = DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0, N11});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, NewC, N1.getOperand(0));
}
@@ -3062,8 +3158,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(1), /* NoOpaques */ true)) {
- SDValue NewC = DAG.FoldConstantArithmetic(
- ISD::ADD, DL, VT, N0.getOperand(1).getNode(), N1.getNode());
+ SDValue NewC =
+ DAG.FoldConstantArithmetic(ISD::ADD, DL, VT, {N0.getOperand(1), N1});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, N0.getOperand(0), NewC);
}
@@ -3072,8 +3168,8 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
if (N0.getOpcode() == ISD::SUB &&
isConstantOrConstantVector(N1, /* NoOpaques */ true) &&
isConstantOrConstantVector(N0.getOperand(0), /* NoOpaques */ true)) {
- SDValue NewC = DAG.FoldConstantArithmetic(
- ISD::SUB, DL, VT, N0.getOperand(0).getNode(), N1.getNode());
+ SDValue NewC =
+ DAG.FoldConstantArithmetic(ISD::SUB, DL, VT, {N0.getOperand(0), N1});
assert(NewC && "Constant folding failed");
return DAG.getNode(ISD::SUB, DL, VT, NewC, N0.getOperand(1));
}
@@ -3244,6 +3340,12 @@ SDValue DAGCombiner::visitSUB(SDNode *N) {
}
}
+ // canonicalize (sub X, (vscale * C)) to (add X, (vscale * -C))
+ if (N1.getOpcode() == ISD::VSCALE) {
+ APInt IntVal = N1.getConstantOperandAPInt(0);
+ return DAG.getNode(ISD::ADD, DL, VT, N0, DAG.getVScale(DL, VT, -IntVal));
+ }
+
// Prefer an add for more folding potential and possibly better codegen:
// sub N0, (lshr N10, width-1) --> add N0, (ashr N10, width-1)
if (!LegalOperations && N1.getOpcode() == ISD::SRL && N1.hasOneUse()) {
@@ -3294,12 +3396,9 @@ SDValue DAGCombiner::visitSUBSAT(SDNode *N) {
if (N0 == N1)
return DAG.getConstant(0, DL, VT);
- if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
- DAG.isConstantIntBuildVectorOrConstantInt(N1)) {
- // fold (sub_sat c1, c2) -> c3
- return DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, N0.getNode(),
- N1.getNode());
- }
+ // fold (sub_sat c1, c2) -> c3
+ if (SDValue C = DAG.FoldConstantArithmetic(N->getOpcode(), DL, VT, {N0, N1}))
+ return C;
// fold (sub_sat x, 0) -> x
if (isNullConstant(N1))
@@ -3435,30 +3534,20 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
if (N0.isUndef() || N1.isUndef())
return DAG.getConstant(0, SDLoc(N), VT);
- bool N0IsConst = false;
bool N1IsConst = false;
bool N1IsOpaqueConst = false;
- bool N0IsOpaqueConst = false;
- APInt ConstValue0, ConstValue1;
+ APInt ConstValue1;
+
// fold vector ops
if (VT.isVector()) {
if (SDValue FoldedVOp = SimplifyVBinOp(N))
return FoldedVOp;
- N0IsConst = ISD::isConstantSplatVector(N0.getNode(), ConstValue0);
N1IsConst = ISD::isConstantSplatVector(N1.getNode(), ConstValue1);
- assert((!N0IsConst ||
- ConstValue0.getBitWidth() == VT.getScalarSizeInBits()) &&
- "Splat APInt should be element width");
assert((!N1IsConst ||
ConstValue1.getBitWidth() == VT.getScalarSizeInBits()) &&
"Splat APInt should be element width");
} else {
- N0IsConst = isa<ConstantSDNode>(N0);
- if (N0IsConst) {
- ConstValue0 = cast<ConstantSDNode>(N0)->getAPIntValue();
- N0IsOpaqueConst = cast<ConstantSDNode>(N0)->isOpaque();
- }
N1IsConst = isa<ConstantSDNode>(N1);
if (N1IsConst) {
ConstValue1 = cast<ConstantSDNode>(N1)->getAPIntValue();
@@ -3467,17 +3556,18 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
}
// fold (mul c1, c2) -> c1*c2
- if (N0IsConst && N1IsConst && !N0IsOpaqueConst && !N1IsOpaqueConst)
- return DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT,
- N0.getNode(), N1.getNode());
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::MUL, SDLoc(N), VT, {N0, N1}))
+ return C;
// canonicalize constant to RHS (vector doesn't have to splat)
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::MUL, SDLoc(N), VT, N1, N0);
+
// fold (mul x, 0) -> 0
if (N1IsConst && ConstValue1.isNullValue())
return N1;
+
// fold (mul x, 1) -> x
if (N1IsConst && ConstValue1.isOneValue())
return N0;
@@ -3491,6 +3581,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
return DAG.getNode(ISD::SUB, DL, VT,
DAG.getConstant(0, DL, VT), N0);
}
+
// fold (mul x, (1 << c)) -> x << c
if (isConstantOrConstantVector(N1, /*NoOpaques*/ true) &&
DAG.isKnownToBeAPowerOfTwo(N1) &&
@@ -3501,6 +3592,7 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
SDValue Trunc = DAG.getZExtOrTrunc(LogBase2, DL, ShiftVT);
return DAG.getNode(ISD::SHL, DL, VT, N0, Trunc);
}
+
// fold (mul x, -(1 << c)) -> -(x << c) or (-x) << c
if (N1IsConst && !N1IsOpaqueConst && (-ConstValue1).isPowerOf2()) {
unsigned Log2Val = (-ConstValue1).logBase2();
@@ -3589,6 +3681,14 @@ SDValue DAGCombiner::visitMUL(SDNode *N) {
DAG.getNode(ISD::MUL, SDLoc(N1), VT,
N0.getOperand(1), N1));
+ // Fold (mul (vscale * C0), C1) to (vscale * (C0 * C1)).
+ if (N0.getOpcode() == ISD::VSCALE)
+ if (ConstantSDNode *NC1 = isConstOrConstSplat(N1)) {
+ APInt C0 = N0.getConstantOperandAPInt(0);
+ APInt C1 = NC1->getAPIntValue();
+ return DAG.getVScale(SDLoc(N), VT, C0 * C1);
+ }
+
// reassociate mul
if (SDValue RMUL = reassociateOps(ISD::MUL, SDLoc(N), N0, N1, N->getFlags()))
return RMUL;
@@ -3746,13 +3846,14 @@ SDValue DAGCombiner::visitSDIV(SDNode *N) {
SDLoc DL(N);
// fold (sdiv c1, c2) -> c1/c2
- ConstantSDNode *N0C = isConstOrConstSplat(N0);
ConstantSDNode *N1C = isConstOrConstSplat(N1);
- if (N0C && N1C && !N0C->isOpaque() && !N1C->isOpaque())
- return DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SDIV, DL, VT, {N0, N1}))
+ return C;
+
// fold (sdiv X, -1) -> 0-X
if (N1C && N1C->isAllOnesValue())
return DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), N0);
+
// fold (sdiv X, MIN_SIGNED) -> select(X == MIN_SIGNED, 1, 0)
if (N1C && N1C->getAPIntValue().isMinSignedValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
@@ -3890,12 +3991,10 @@ SDValue DAGCombiner::visitUDIV(SDNode *N) {
SDLoc DL(N);
// fold (udiv c1, c2) -> c1/c2
- ConstantSDNode *N0C = isConstOrConstSplat(N0);
ConstantSDNode *N1C = isConstOrConstSplat(N1);
- if (N0C && N1C)
- if (SDValue Folded = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT,
- N0C, N1C))
- return Folded;
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::UDIV, DL, VT, {N0, N1}))
+ return C;
+
// fold (udiv X, -1) -> select(X == -1, 1, 0)
if (N1C && N1C->getAPIntValue().isAllOnesValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
@@ -3988,11 +4087,10 @@ SDValue DAGCombiner::visitREM(SDNode *N) {
SDLoc DL(N);
// fold (rem c1, c2) -> c1%c2
- ConstantSDNode *N0C = isConstOrConstSplat(N0);
ConstantSDNode *N1C = isConstOrConstSplat(N1);
- if (N0C && N1C)
- if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, N0C, N1C))
- return Folded;
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N0, N1}))
+ return C;
+
// fold (urem X, -1) -> select(X == -1, 0, x)
if (!isSigned && N1C && N1C->getAPIntValue().isAllOnesValue())
return DAG.getSelect(DL, VT, DAG.getSetCC(DL, CCVT, N0, N1, ISD::SETEQ),
@@ -4088,7 +4186,7 @@ SDValue DAGCombiner::visitMULHS(SDNode *N) {
// If the type twice as wide is legal, transform the mulhs to a wider multiply
// plus a shift.
- if (VT.isSimple() && !VT.isVector()) {
+ if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
@@ -4144,7 +4242,7 @@ SDValue DAGCombiner::visitMULHU(SDNode *N) {
// If the type twice as wide is legal, transform the mulhu to a wider multiply
// plus a shift.
- if (VT.isSimple() && !VT.isVector()) {
+ if (!TLI.isMulhCheaperThanMulShift(VT) && VT.isSimple() && !VT.isVector()) {
MVT Simple = VT.getSimpleVT();
unsigned SimpleSize = Simple.getSizeInBits();
EVT NewVT = EVT::getIntegerVT(*DAG.getContext(), SimpleSize*2);
@@ -4317,6 +4415,7 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ unsigned Opcode = N->getOpcode();
// fold vector ops
if (VT.isVector())
@@ -4324,19 +4423,16 @@ SDValue DAGCombiner::visitIMINMAX(SDNode *N) {
return FoldedVOp;
// fold operation with constant operands.
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
- ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
- if (N0C && N1C)
- return DAG.FoldConstantArithmetic(N->getOpcode(), SDLoc(N), VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(Opcode, SDLoc(N), VT, {N0, N1}))
+ return C;
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
- !DAG.isConstantIntBuildVectorOrConstantInt(N1))
+ !DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(N->getOpcode(), SDLoc(N), VT, N1, N0);
// If sign bits are zero, flip between UMIN/UMAX and SMIN/SMAX.
// Only do this if the current op isn't legal and the flipped is.
- unsigned Opcode = N->getOpcode();
if (!TLI.isOperationLegal(Opcode, VT) &&
(N0.isUndef() || DAG.SignBitIsZero(N0)) &&
(N1.isUndef() || DAG.SignBitIsZero(N1))) {
@@ -4825,11 +4921,16 @@ bool DAGCombiner::isLegalNarrowLdSt(LSBaseSDNode *LDST,
return false;
// Ensure that this isn't going to produce an unsupported memory access.
- if (ShAmt &&
- !TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
- LDST->getAddressSpace(), ShAmt / 8,
- LDST->getMemOperand()->getFlags()))
- return false;
+ if (ShAmt) {
+ assert(ShAmt % 8 == 0 && "ShAmt is byte offset");
+ const unsigned ByteShAmt = ShAmt / 8;
+ const Align LDSTAlign = LDST->getAlign();
+ const Align NarrowAlign = commonAlignment(LDSTAlign, ByteShAmt);
+ if (!TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), MemVT,
+ LDST->getAddressSpace(), NarrowAlign,
+ LDST->getMemOperand()->getFlags()))
+ return false;
+ }
// It's not possible to generate a constant of extended or untyped type.
EVT PtrType = LDST->getBasePtr().getValueType();
@@ -5174,17 +5275,19 @@ SDValue DAGCombiner::visitAND(SDNode *N) {
}
// fold (and c1, c2) -> c1&c2
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
ConstantSDNode *N1C = isConstOrConstSplat(N1);
- if (N0C && N1C && !N1C->isOpaque())
- return DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::AND, SDLoc(N), VT, {N0, N1}))
+ return C;
+
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N1, N0);
+
// fold (and x, -1) -> x
if (isAllOnesConstant(N1))
return N0;
+
// if (and x, c) is known to be zero, return 0
unsigned BitWidth = VT.getScalarSizeInBits();
if (N1C && DAG.MaskedValueIsZero(SDValue(N, 0),
@@ -5654,6 +5757,48 @@ static bool isBSwapHWordPair(SDValue N, MutableArrayRef<SDNode *> Parts) {
return false;
}
+// Match this pattern:
+// (or (and (shl (A, 8)), 0xff00ff00), (and (srl (A, 8)), 0x00ff00ff))
+// And rewrite this to:
+// (rotr (bswap A), 16)
+static SDValue matchBSwapHWordOrAndAnd(const TargetLowering &TLI,
+ SelectionDAG &DAG, SDNode *N, SDValue N0,
+ SDValue N1, EVT VT, EVT ShiftAmountTy) {
+ assert(N->getOpcode() == ISD::OR && VT == MVT::i32 &&
+ "MatchBSwapHWordOrAndAnd: expecting i32");
+ if (!TLI.isOperationLegalOrCustom(ISD::ROTR, VT))
+ return SDValue();
+ if (N0.getOpcode() != ISD::AND || N1.getOpcode() != ISD::AND)
+ return SDValue();
+ // TODO: this is too restrictive; lifting this restriction requires more tests
+ if (!N0->hasOneUse() || !N1->hasOneUse())
+ return SDValue();
+ ConstantSDNode *Mask0 = isConstOrConstSplat(N0.getOperand(1));
+ ConstantSDNode *Mask1 = isConstOrConstSplat(N1.getOperand(1));
+ if (!Mask0 || !Mask1)
+ return SDValue();
+ if (Mask0->getAPIntValue() != 0xff00ff00 ||
+ Mask1->getAPIntValue() != 0x00ff00ff)
+ return SDValue();
+ SDValue Shift0 = N0.getOperand(0);
+ SDValue Shift1 = N1.getOperand(0);
+ if (Shift0.getOpcode() != ISD::SHL || Shift1.getOpcode() != ISD::SRL)
+ return SDValue();
+ ConstantSDNode *ShiftAmt0 = isConstOrConstSplat(Shift0.getOperand(1));
+ ConstantSDNode *ShiftAmt1 = isConstOrConstSplat(Shift1.getOperand(1));
+ if (!ShiftAmt0 || !ShiftAmt1)
+ return SDValue();
+ if (ShiftAmt0->getAPIntValue() != 8 || ShiftAmt1->getAPIntValue() != 8)
+ return SDValue();
+ if (Shift0.getOperand(0) != Shift1.getOperand(0))
+ return SDValue();
+
+ SDLoc DL(N);
+ SDValue BSwap = DAG.getNode(ISD::BSWAP, DL, VT, Shift0.getOperand(0));
+ SDValue ShAmt = DAG.getConstant(16, DL, ShiftAmountTy);
+ return DAG.getNode(ISD::ROTR, DL, VT, BSwap, ShAmt);
+}
+
/// Match a 32-bit packed halfword bswap. That is
/// ((x & 0x000000ff) << 8) |
/// ((x & 0x0000ff00) >> 8) |
@@ -5670,6 +5815,16 @@ SDValue DAGCombiner::MatchBSwapHWord(SDNode *N, SDValue N0, SDValue N1) {
if (!TLI.isOperationLegalOrCustom(ISD::BSWAP, VT))
return SDValue();
+ if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N0, N1, VT,
+ getShiftAmountTy(VT)))
+ return BSwap;
+
+ // Try again with commuted operands.
+ if (SDValue BSwap = matchBSwapHWordOrAndAnd(TLI, DAG, N, N1, N0, VT,
+ getShiftAmountTy(VT)))
+ return BSwap;
+
+
// Look for either
// (or (bswaphpair), (bswaphpair))
// (or (or (bswaphpair), (and)), (and))
@@ -5875,17 +6030,19 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
}
// fold (or c1, c2) -> c1|c2
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
- if (N0C && N1C && !N1C->isOpaque())
- return DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N), VT, {N0, N1}))
+ return C;
+
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::OR, SDLoc(N), VT, N1, N0);
+
// fold (or x, 0) -> x
if (isNullConstant(N1))
return N0;
+
// fold (or x, -1) -> -1
if (isAllOnesConstant(N1))
return N1;
@@ -5920,8 +6077,8 @@ SDValue DAGCombiner::visitOR(SDNode *N) {
};
if (N0.getOpcode() == ISD::AND && N0.getNode()->hasOneUse() &&
ISD::matchBinaryPredicate(N0.getOperand(1), N1, MatchIntersect, true)) {
- if (SDValue COR = DAG.FoldConstantArithmetic(
- ISD::OR, SDLoc(N1), VT, N1.getNode(), N0.getOperand(1).getNode())) {
+ if (SDValue COR = DAG.FoldConstantArithmetic(ISD::OR, SDLoc(N1), VT,
+ {N1, N0.getOperand(1)})) {
SDValue IOR = DAG.getNode(ISD::OR, SDLoc(N0), VT, N0.getOperand(0), N1);
AddToWorklist(IOR.getNode());
return DAG.getNode(ISD::AND, SDLoc(N), VT, COR, IOR);
@@ -6020,6 +6177,7 @@ static SDValue extractShiftForRotate(SelectionDAG &DAG, SDValue OppShift,
ConstantSDNode *OppShiftCst = isConstOrConstSplat(OppShift.getOperand(1));
// (add v v) -> (shl v 1)
+ // TODO: Should this be a general DAG canonicalization?
if (OppShift.getOpcode() == ISD::SRL && OppShiftCst &&
ExtractFrom.getOpcode() == ISD::ADD &&
ExtractFrom.getOperand(0) == ExtractFrom.getOperand(1) &&
@@ -6192,8 +6350,12 @@ static bool matchRotateSub(SDValue Pos, SDValue Neg, unsigned EltSize,
// EltSize & Mask == NegC & Mask
//
// (because "x & Mask" is a truncation and distributes through subtraction).
+ //
+ // We also need to account for a potential truncation of NegOp1 if the amount
+ // has already been legalized to a shift amount type.
APInt Width;
- if (Pos == NegOp1)
+ if ((Pos == NegOp1) ||
+ (NegOp1.getOpcode() == ISD::TRUNCATE && Pos == NegOp1.getOperand(0)))
Width = NegC->getAPIntValue();
// Check for cases where Pos has the form (add NegOp1, PosC) for some PosC.
@@ -6246,19 +6408,91 @@ SDValue DAGCombiner::MatchRotatePosNeg(SDValue Shifted, SDValue Pos,
return SDValue();
}
+// A subroutine of MatchRotate used once we have found an OR of two opposite
+// shifts of N0 + N1. If Neg == <operand size> - Pos then the OR reduces
+// to both (PosOpcode N0, N1, Pos) and (NegOpcode N0, N1, Neg), with the
+// former being preferred if supported. InnerPos and InnerNeg are Pos and
+// Neg with outer conversions stripped away.
+// TODO: Merge with MatchRotatePosNeg.
+SDValue DAGCombiner::MatchFunnelPosNeg(SDValue N0, SDValue N1, SDValue Pos,
+ SDValue Neg, SDValue InnerPos,
+ SDValue InnerNeg, unsigned PosOpcode,
+ unsigned NegOpcode, const SDLoc &DL) {
+ EVT VT = N0.getValueType();
+ unsigned EltBits = VT.getScalarSizeInBits();
+
+ // fold (or (shl x0, (*ext y)),
+ // (srl x1, (*ext (sub 32, y)))) ->
+ // (fshl x0, x1, y) or (fshr x0, x1, (sub 32, y))
+ //
+ // fold (or (shl x0, (*ext (sub 32, y))),
+ // (srl x1, (*ext y))) ->
+ // (fshr x0, x1, y) or (fshl x0, x1, (sub 32, y))
+ if (matchRotateSub(InnerPos, InnerNeg, EltBits, DAG)) {
+ bool HasPos = TLI.isOperationLegalOrCustom(PosOpcode, VT);
+ return DAG.getNode(HasPos ? PosOpcode : NegOpcode, DL, VT, N0, N1,
+ HasPos ? Pos : Neg);
+ }
+
+ // Matching the shift+xor cases, we can't easily use the xor'd shift amount
+ // so for now just use the PosOpcode case if it's legal.
+ // TODO: When can we use the NegOpcode case?
+ if (PosOpcode == ISD::FSHL && isPowerOf2_32(EltBits)) {
+ auto IsBinOpImm = [](SDValue Op, unsigned BinOpc, unsigned Imm) {
+ if (Op.getOpcode() != BinOpc)
+ return false;
+ ConstantSDNode *Cst = isConstOrConstSplat(Op.getOperand(1));
+ return Cst && (Cst->getAPIntValue() == Imm);
+ };
+
+ // fold (or (shl x0, y), (srl (srl x1, 1), (xor y, 31)))
+ // -> (fshl x0, x1, y)
+ if (IsBinOpImm(N1, ISD::SRL, 1) &&
+ IsBinOpImm(InnerNeg, ISD::XOR, EltBits - 1) &&
+ InnerPos == InnerNeg.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::FSHL, VT)) {
+ return DAG.getNode(ISD::FSHL, DL, VT, N0, N1.getOperand(0), Pos);
+ }
+
+ // fold (or (shl (shl x0, 1), (xor y, 31)), (srl x1, y))
+ // -> (fshr x0, x1, y)
+ if (IsBinOpImm(N0, ISD::SHL, 1) &&
+ IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
+ InnerNeg == InnerPos.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
+ return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
+ }
+
+ // fold (or (shl (add x0, x0), (xor y, 31)), (srl x1, y))
+ // -> (fshr x0, x1, y)
+ // TODO: Should add(x,x) -> shl(x,1) be a general DAG canonicalization?
+ if (N0.getOpcode() == ISD::ADD && N0.getOperand(0) == N0.getOperand(1) &&
+ IsBinOpImm(InnerPos, ISD::XOR, EltBits - 1) &&
+ InnerNeg == InnerPos.getOperand(0) &&
+ TLI.isOperationLegalOrCustom(ISD::FSHR, VT)) {
+ return DAG.getNode(ISD::FSHR, DL, VT, N0.getOperand(0), N1, Neg);
+ }
+ }
+
+ return SDValue();
+}
+
// MatchRotate - Handle an 'or' of two operands. If this is one of the many
// idioms for rotate, and if the target supports rotation instructions, generate
-// a rot[lr].
+// a rot[lr]. This also matches funnel shift patterns, similar to rotation but
+// with different shifted sources.
SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// Must be a legal type. Expanded 'n promoted things won't work with rotates.
EVT VT = LHS.getValueType();
if (!TLI.isTypeLegal(VT))
return SDValue();
- // The target must have at least one rotate flavor.
+ // The target must have at least one rotate/funnel flavor.
bool HasROTL = hasOperation(ISD::ROTL, VT);
bool HasROTR = hasOperation(ISD::ROTR, VT);
- if (!HasROTL && !HasROTR)
+ bool HasFSHL = hasOperation(ISD::FSHL, VT);
+ bool HasFSHR = hasOperation(ISD::FSHR, VT);
+ if (!HasROTL && !HasROTR && !HasFSHL && !HasFSHR)
return SDValue();
// Check for truncated rotate.
@@ -6308,12 +6542,13 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// At this point we've matched or extracted a shift op on each side.
- if (LHSShift.getOperand(0) != RHSShift.getOperand(0))
- return SDValue(); // Not shifting the same value.
-
if (LHSShift.getOpcode() == RHSShift.getOpcode())
return SDValue(); // Shifts must disagree.
+ bool IsRotate = LHSShift.getOperand(0) == RHSShift.getOperand(0);
+ if (!IsRotate && !(HasFSHL || HasFSHR))
+ return SDValue(); // Requires funnel shift support.
+
// Canonicalize shl to left side in a shl/srl pair.
if (RHSShift.getOpcode() == ISD::SHL) {
std::swap(LHS, RHS);
@@ -6329,13 +6564,21 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
// fold (or (shl x, C1), (srl x, C2)) -> (rotl x, C1)
// fold (or (shl x, C1), (srl x, C2)) -> (rotr x, C2)
+ // fold (or (shl x, C1), (srl y, C2)) -> (fshl x, y, C1)
+ // fold (or (shl x, C1), (srl y, C2)) -> (fshr x, y, C2)
+ // iff C1+C2 == EltSizeInBits
auto MatchRotateSum = [EltSizeInBits](ConstantSDNode *LHS,
ConstantSDNode *RHS) {
return (LHS->getAPIntValue() + RHS->getAPIntValue()) == EltSizeInBits;
};
if (ISD::matchBinaryPredicate(LHSShiftAmt, RHSShiftAmt, MatchRotateSum)) {
- SDValue Rot = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT,
- LHSShiftArg, HasROTL ? LHSShiftAmt : RHSShiftAmt);
+ SDValue Res;
+ if (IsRotate && (HasROTL || HasROTR))
+ Res = DAG.getNode(HasROTL ? ISD::ROTL : ISD::ROTR, DL, VT, LHSShiftArg,
+ HasROTL ? LHSShiftAmt : RHSShiftAmt);
+ else
+ Res = DAG.getNode(HasFSHL ? ISD::FSHL : ISD::FSHR, DL, VT, LHSShiftArg,
+ RHSShiftArg, HasFSHL ? LHSShiftAmt : RHSShiftAmt);
// If there is an AND of either shifted operand, apply it to the result.
if (LHSMask.getNode() || RHSMask.getNode()) {
@@ -6353,10 +6596,10 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
DAG.getNode(ISD::OR, DL, VT, RHSMask, LHSBits));
}
- Rot = DAG.getNode(ISD::AND, DL, VT, Rot, Mask);
+ Res = DAG.getNode(ISD::AND, DL, VT, Res, Mask);
}
- return Rot;
+ return Res;
}
// If there is a mask here, and we have a variable shift, we can't be sure
@@ -6379,13 +6622,29 @@ SDValue DAGCombiner::MatchRotate(SDValue LHS, SDValue RHS, const SDLoc &DL) {
RExtOp0 = RHSShiftAmt.getOperand(0);
}
- SDValue TryL = MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt,
- LExtOp0, RExtOp0, ISD::ROTL, ISD::ROTR, DL);
+ if (IsRotate && (HasROTL || HasROTR)) {
+ SDValue TryL =
+ MatchRotatePosNeg(LHSShiftArg, LHSShiftAmt, RHSShiftAmt, LExtOp0,
+ RExtOp0, ISD::ROTL, ISD::ROTR, DL);
+ if (TryL)
+ return TryL;
+
+ SDValue TryR =
+ MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt, RExtOp0,
+ LExtOp0, ISD::ROTR, ISD::ROTL, DL);
+ if (TryR)
+ return TryR;
+ }
+
+ SDValue TryL =
+ MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, LHSShiftAmt, RHSShiftAmt,
+ LExtOp0, RExtOp0, ISD::FSHL, ISD::FSHR, DL);
if (TryL)
return TryL;
- SDValue TryR = MatchRotatePosNeg(RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
- RExtOp0, LExtOp0, ISD::ROTR, ISD::ROTL, DL);
+ SDValue TryR =
+ MatchFunnelPosNeg(LHSShiftArg, RHSShiftArg, RHSShiftAmt, LHSShiftAmt,
+ RExtOp0, LExtOp0, ISD::FSHR, ISD::FSHL, DL);
if (TryR)
return TryR;
@@ -6610,9 +6869,9 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
if (LegalOperations && !TLI.isOperationLegal(ISD::STORE, VT))
return SDValue();
- // Check if all the bytes of the combined value we are looking at are stored
- // to the same base address. Collect bytes offsets from Base address into
- // ByteOffsets.
+ // Check if all the bytes of the combined value we are looking at are stored
+ // to the same base address. Collect bytes offsets from Base address into
+ // ByteOffsets.
SDValue CombinedValue;
SmallVector<int64_t, 8> ByteOffsets(Width, INT64_MAX);
int64_t FirstOffset = INT64_MAX;
@@ -6630,17 +6889,16 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
SDValue Value = Trunc.getOperand(0);
if (Value.getOpcode() == ISD::SRL ||
Value.getOpcode() == ISD::SRA) {
- ConstantSDNode *ShiftOffset =
- dyn_cast<ConstantSDNode>(Value.getOperand(1));
- // Trying to match the following pattern. The shift offset must be
+ auto *ShiftOffset = dyn_cast<ConstantSDNode>(Value.getOperand(1));
+ // Trying to match the following pattern. The shift offset must be
// a constant and a multiple of 8. It is the byte offset in "y".
- //
+ //
// x = srl y, offset
- // i8 z = trunc x
+ // i8 z = trunc x
// store z, ...
if (!ShiftOffset || (ShiftOffset->getSExtValue() % 8))
return SDValue();
-
+
Offset = ShiftOffset->getSExtValue()/8;
Value = Value.getOperand(0);
}
@@ -6685,7 +6943,7 @@ SDValue DAGCombiner::MatchStoreCombine(StoreSDNode *N) {
assert(FirstOffset != INT64_MAX && "First byte offset must be set");
assert(FirstStore && "First store must be set");
- // Check if the bytes of the combined value we are looking at match with
+ // Check if the bytes of the combined value we are looking at match with
// either big or little endian value store.
Optional<bool> IsBigEndian = isBigEndian(ByteOffsets, FirstOffset);
if (!IsBigEndian.hasValue())
@@ -7030,20 +7288,22 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
SDLoc DL(N);
if (N0.isUndef() && N1.isUndef())
return DAG.getConstant(0, DL, VT);
+
// fold (xor x, undef) -> undef
if (N0.isUndef())
return N0;
if (N1.isUndef())
return N1;
+
// fold (xor c1, c2) -> c1^c2
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
- ConstantSDNode *N1C = getAsNonOpaqueConstant(N1);
- if (N0C && N1C)
- return DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::XOR, DL, VT, {N0, N1}))
+ return C;
+
// canonicalize constant to RHS
if (DAG.isConstantIntBuildVectorOrConstantInt(N0) &&
!DAG.isConstantIntBuildVectorOrConstantInt(N1))
return DAG.getNode(ISD::XOR, DL, VT, N1, N0);
+
// fold (xor x, 0) -> x
if (isNullConstant(N1))
return N0;
@@ -7058,7 +7318,8 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
// fold !(x cc y) -> (x !cc y)
unsigned N0Opcode = N0.getOpcode();
SDValue LHS, RHS, CC;
- if (TLI.isConstTrueVal(N1.getNode()) && isSetCCEquivalent(N0, LHS, RHS, CC)) {
+ if (TLI.isConstTrueVal(N1.getNode()) &&
+ isSetCCEquivalent(N0, LHS, RHS, CC, /*MatchStrict*/true)) {
ISD::CondCode NotCC = ISD::getSetCCInverse(cast<CondCodeSDNode>(CC)->get(),
LHS.getValueType());
if (!LegalOperations ||
@@ -7071,6 +7332,21 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
case ISD::SELECT_CC:
return DAG.getSelectCC(SDLoc(N0), LHS, RHS, N0.getOperand(2),
N0.getOperand(3), NotCC);
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: {
+ if (N0.hasOneUse()) {
+ // FIXME Can we handle multiple uses? Could we token factor the chain
+ // results from the new/old setcc?
+ SDValue SetCC = DAG.getSetCC(SDLoc(N0), VT, LHS, RHS, NotCC,
+ N0.getOperand(0),
+ N0Opcode == ISD::STRICT_FSETCCS);
+ CombineTo(N, SetCC);
+ DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), SetCC.getValue(1));
+ recursivelyDeleteUnusedNodes(N0.getNode());
+ return SDValue(N, 0); // Return N so it doesn't get rechecked!
+ }
+ break;
+ }
}
}
}
@@ -7405,15 +7681,29 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
}
// fold (rot x, c) -> (rot x, c % BitSize)
- // TODO - support non-uniform vector amounts.
- if (ConstantSDNode *Cst = isConstOrConstSplat(N1)) {
- if (Cst->getAPIntValue().uge(Bitsize)) {
- uint64_t RotAmt = Cst->getAPIntValue().urem(Bitsize);
- return DAG.getNode(N->getOpcode(), dl, VT, N0,
- DAG.getConstant(RotAmt, dl, N1.getValueType()));
- }
+ bool OutOfRange = false;
+ auto MatchOutOfRange = [Bitsize, &OutOfRange](ConstantSDNode *C) {
+ OutOfRange |= C->getAPIntValue().uge(Bitsize);
+ return true;
+ };
+ if (ISD::matchUnaryPredicate(N1, MatchOutOfRange) && OutOfRange) {
+ EVT AmtVT = N1.getValueType();
+ SDValue Bits = DAG.getConstant(Bitsize, dl, AmtVT);
+ if (SDValue Amt =
+ DAG.FoldConstantArithmetic(ISD::UREM, dl, AmtVT, {N1, Bits}))
+ return DAG.getNode(N->getOpcode(), dl, VT, N0, Amt);
}
+ // rot i16 X, 8 --> bswap X
+ auto *RotAmtC = isConstOrConstSplat(N1);
+ if (RotAmtC && RotAmtC->getAPIntValue() == 8 &&
+ VT.getScalarSizeInBits() == 16 && hasOperation(ISD::BSWAP, VT))
+ return DAG.getNode(ISD::BSWAP, dl, VT, N0);
+
+ // Simplify the operands using demanded-bits information.
+ if (SimplifyDemandedBits(SDValue(N, 0)))
+ return SDValue(N, 0);
+
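
The two rotate folds added above (reducing an out-of-range amount modulo the bit width, and turning a 16-bit rotate by 8 into a byte swap) follow from basic bit arithmetic. A small hedged C++ check, separate from the DAG code; rotl16 and bswap16 are illustrative helpers, not LLVM APIs:

    #include <cassert>
    #include <cstdint>

    // Rotate-left of a 16-bit value; ISD::ROTL interprets the amount modulo
    // the bit width, which is exactly the (rot x, c) -> (rot x, c % BitSize)
    // fold.
    static uint16_t rotl16(uint16_t x, unsigned amt) {
      amt %= 16;
      if (amt == 0)
        return x;
      return static_cast<uint16_t>((x << amt) | (x >> (16 - amt)));
    }

    // Byte swap of a 16-bit value.
    static uint16_t bswap16(uint16_t x) {
      return static_cast<uint16_t>((x << 8) | (x >> 8));
    }

    int main() {
      for (uint16_t v : {uint16_t(0x1234), uint16_t(0xBEEF), uint16_t(0x00FF)}) {
        // "rot i16 X, 8 --> bswap X"
        assert(rotl16(v, 8) == bswap16(v));
        // An amount of 19 acts like 19 % 16 == 3.
        assert(rotl16(v, 19) == rotl16(v, 3));
      }
      return 0;
    }
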
// fold (rot* x, (trunc (and y, c))) -> (rot* x, (and (trunc y), (trunc c))).
if (N1.getOpcode() == ISD::TRUNCATE &&
N1.getOperand(0).getOpcode() == ISD::AND) {
@@ -7430,12 +7720,11 @@ SDValue DAGCombiner::visitRotate(SDNode *N) {
EVT ShiftVT = C1->getValueType(0);
bool SameSide = (N->getOpcode() == NextOp);
unsigned CombineOp = SameSide ? ISD::ADD : ISD::SUB;
- if (SDValue CombinedShift =
- DAG.FoldConstantArithmetic(CombineOp, dl, ShiftVT, C1, C2)) {
+ if (SDValue CombinedShift = DAG.FoldConstantArithmetic(
+ CombineOp, dl, ShiftVT, {N1, N0.getOperand(1)})) {
SDValue BitsizeC = DAG.getConstant(Bitsize, dl, ShiftVT);
SDValue CombinedShiftNorm = DAG.FoldConstantArithmetic(
- ISD::SREM, dl, ShiftVT, CombinedShift.getNode(),
- BitsizeC.getNode());
+ ISD::SREM, dl, ShiftVT, {CombinedShift, BitsizeC});
return DAG.getNode(N->getOpcode(), dl, VT, N0->getOperand(0),
CombinedShiftNorm);
}
@@ -7471,8 +7760,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (N01CV && N01CV->isConstant() && N00.getOpcode() == ISD::SETCC &&
TLI.getBooleanContents(N00.getOperand(0).getValueType()) ==
TargetLowering::ZeroOrNegativeOneBooleanContent) {
- if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT,
- N01CV, N1CV))
+ if (SDValue C =
+ DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N01, N1}))
return DAG.getNode(ISD::AND, SDLoc(N), VT, N00, C);
}
}
@@ -7482,10 +7771,8 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (shl c1, c2) -> c1<<c2
- // TODO - support non-uniform vector shift amounts.
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
- if (N0C && N1C && !N1C->isOpaque())
- return DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SHL, SDLoc(N), VT, {N0, N1}))
+ return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -7502,8 +7789,7 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0, NewOp1);
}
- // TODO - support non-uniform vector shift amounts.
- if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// fold (shl (shl x, c1), c2) -> 0 or (shl x, (add c1, c2))
@@ -7691,9 +7977,90 @@ SDValue DAGCombiner::visitSHL(SDNode *N) {
if (SDValue NewSHL = visitShiftByConstant(N))
return NewSHL;
+ // Fold (shl (vscale * C0), C1) to (vscale * (C0 << C1)).
+ if (N0.getOpcode() == ISD::VSCALE)
+ if (ConstantSDNode *NC1 = isConstOrConstSplat(N->getOperand(1))) {
+ auto DL = SDLoc(N);
+ APInt C0 = N0.getConstantOperandAPInt(0);
+ APInt C1 = NC1->getAPIntValue();
+ return DAG.getVScale(DL, VT, C0 << C1);
+ }
+
return SDValue();
}
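
The new vscale fold in visitSHL relies on a shift distributing over a constant multiple: (v * C0) << C1 == v * (C0 << C1) in modular integer arithmetic. A tiny standalone check, with vscale modeled as an arbitrary runtime value:

    #include <cassert>
    #include <cstdint>

    int main() {
      uint64_t vscale = 4; // stand-in for the runtime vscale value
      uint64_t C0 = 3, C1 = 5;
      // (shl (vscale * C0), C1) == (vscale * (C0 << C1))
      assert(((vscale * C0) << C1) == vscale * (C0 << C1));
      return 0;
    }
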
+// Transform a right shift of a multiply into a multiply-high.
+// Examples:
+// (srl (mul (zext i32:$a to i64), (zext i32:$b to i64)), 32) -> (mulhu $a, $b)
+// (sra (mul (sext i32:$a to i64), (sext i32:$b to i64)), 32) -> (mulhs $a, $b)
+static SDValue combineShiftToMULH(SDNode *N, SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+
+ // Check the shift amount. Proceed with the transformation if the shift
+ // amount is constant.
+ ConstantSDNode *ShiftAmtSrc = isConstOrConstSplat(N->getOperand(1));
+ if (!ShiftAmtSrc)
+ return SDValue();
+
+ SDLoc DL(N);
+
+ // The operation feeding into the shift must be a multiply.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::MUL)
+ return SDValue();
+
+ // Both operands must be equivalent extend nodes.
+ SDValue LeftOp = ShiftOperand.getOperand(0);
+ SDValue RightOp = ShiftOperand.getOperand(1);
+ bool IsSignExt = LeftOp.getOpcode() == ISD::SIGN_EXTEND;
+ bool IsZeroExt = LeftOp.getOpcode() == ISD::ZERO_EXTEND;
+
+ if ((!(IsSignExt || IsZeroExt)) || LeftOp.getOpcode() != RightOp.getOpcode())
+ return SDValue();
+
+ EVT WideVT1 = LeftOp.getValueType();
+ EVT WideVT2 = RightOp.getValueType();
+ (void)WideVT2;
+ // Both wide operand types come from the same multiply, so they must match.
+ assert((WideVT1 == WideVT2) &&
+ "Cannot have a multiply node with two different operand types.");
+
+ EVT NarrowVT = LeftOp.getOperand(0).getValueType();
+ // Check that the two extend nodes are the same type.
+ if (NarrowVT != RightOp.getOperand(0).getValueType())
+ return SDValue();
+
+ // Only transform into mulh if mulh for the narrow type is cheaper than
+ // a multiply followed by a shift. This should also check if mulh is
+ // legal for NarrowVT on the target.
+ if (!TLI.isMulhCheaperThanMulShift(NarrowVT))
+ return SDValue();
+
+ // Proceed with the transformation if the wide type is twice as large
+ // as the narrow type.
+ unsigned NarrowVTSize = NarrowVT.getScalarSizeInBits();
+ if (WideVT1.getScalarSizeInBits() != 2 * NarrowVTSize)
+ return SDValue();
+
+ // Check the shift amount with the narrow type size.
+ // Proceed with the transformation if the shift amount is the width
+ // of the narrow type.
+ unsigned ShiftAmt = ShiftAmtSrc->getZExtValue();
+ if (ShiftAmt != NarrowVTSize)
+ return SDValue();
+
+ // If the operation feeding into the MUL is a sign extend (sext),
+ // we use mulhs. Otherwise, zero extends (zext) use mulhu.
+ unsigned MulhOpcode = IsSignExt ? ISD::MULHS : ISD::MULHU;
+
+ SDValue Result = DAG.getNode(MulhOpcode, DL, NarrowVT, LeftOp.getOperand(0),
+ RightOp.getOperand(0));
+ return (N->getOpcode() == ISD::SRA ? DAG.getSExtOrTrunc(Result, DL, WideVT1)
+ : DAG.getZExtOrTrunc(Result, DL, WideVT1));
+}
+
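
The identity behind combineShiftToMULH can be checked on its own: taking the high half of a widened product is exactly what MULHU/MULHS compute on the narrow type. A minimal C++ sketch; mulhu32 and mulhs32 are illustrative helpers, and the signed variant assumes the usual arithmetic right shift of negative values:

    #include <cassert>
    #include <cstdint>

    // High 32 bits of the 64-bit product of zero-extended operands,
    // i.e. what ISD::MULHU yields for i32.
    static uint32_t mulhu32(uint32_t a, uint32_t b) {
      return static_cast<uint32_t>((static_cast<uint64_t>(a) * b) >> 32);
    }

    // Signed variant (ISD::MULHS for i32): sign-extend, multiply, keep the
    // high half via an arithmetic shift.
    static int32_t mulhs32(int32_t a, int32_t b) {
      return static_cast<int32_t>((static_cast<int64_t>(a) * b) >> 32);
    }

    int main() {
      // (srl (mul (zext a), (zext b)), 32) == (mulhu a, b)
      assert(mulhu32(0xFFFFFFFFu, 0xFFFFFFFFu) == 0xFFFFFFFEu);
      // (sra (mul (sext a), (sext b)), 32) == (mulhs a, b)
      assert(mulhs32(-2, 0x40000000) == -1);
      return 0;
    }
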
SDValue DAGCombiner::visitSRA(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -7717,10 +8084,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (sra c1, c2) -> (sra c1, c2)
- // TODO - support non-uniform vector shift amounts.
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
- if (N0C && N1C && !N1C->isOpaque())
- return DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRA, SDLoc(N), VT, {N0, N1}))
+ return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -7811,7 +8176,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
// We convert trunc/ext to opposing shifts in IR, but casts may be cheaper.
// sra (add (shl X, N1C), AddC), N1C -->
// sext (add (trunc X to (width - N1C)), AddC')
- if (!LegalTypes && N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
+ if (N0.getOpcode() == ISD::ADD && N0.hasOneUse() && N1C &&
N0.getOperand(0).getOpcode() == ISD::SHL &&
N0.getOperand(0).getOperand(1) == N1 && N0.getOperand(0).hasOneUse()) {
if (ConstantSDNode *AddC = isConstOrConstSplat(N0.getOperand(1))) {
@@ -7828,7 +8193,8 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
// implementation and/or target-specific overrides (because
// non-simple types likely require masking when legalized), but that
// restriction may conflict with other transforms.
- if (TruncVT.isSimple() && TLI.isTruncateFree(VT, TruncVT)) {
+ if (TruncVT.isSimple() && isTypeLegal(TruncVT) &&
+ TLI.isTruncateFree(VT, TruncVT)) {
SDLoc DL(N);
SDValue Trunc = DAG.getZExtOrTrunc(Shl.getOperand(0), DL, TruncVT);
SDValue ShiftC = DAG.getConstant(AddC->getAPIntValue().lshr(ShiftAmt).
@@ -7871,8 +8237,7 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
}
// Simplify, based on bits shifted out of the LHS.
- // TODO - support non-uniform vector shift amounts.
- if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
// If the sign bit is known to be zero, switch this to a SRL.
@@ -7883,6 +8248,11 @@ SDValue DAGCombiner::visitSRA(SDNode *N) {
if (SDValue NewSRA = visitShiftByConstant(N))
return NewSRA;
+ // Try to transform this shift into a multiply-high if
+ // it matches the appropriate pattern detected in combineShiftToMULH.
+ if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
+ return MULH;
+
return SDValue();
}
@@ -7903,10 +8273,8 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
ConstantSDNode *N1C = isConstOrConstSplat(N1);
// fold (srl c1, c2) -> c1 >>u c2
- // TODO - support non-uniform vector shift amounts.
- ConstantSDNode *N0C = getAsNonOpaqueConstant(N0);
- if (N0C && N1C && !N1C->isOpaque())
- return DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, N0C, N1C);
+ if (SDValue C = DAG.FoldConstantArithmetic(ISD::SRL, SDLoc(N), VT, {N0, N1}))
+ return C;
if (SDValue NewSel = foldBinOpIntoSelect(N))
return NewSel;
@@ -8070,8 +8438,7 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
// fold operands of srl based on knowledge that the low bits are not
// demanded.
- // TODO - support non-uniform vector shift amounts.
- if (N1C && SimplifyDemandedBits(SDValue(N, 0)))
+ if (SimplifyDemandedBits(SDValue(N, 0)))
return SDValue(N, 0);
if (N1C && !N1C->isOpaque())
@@ -8111,6 +8478,11 @@ SDValue DAGCombiner::visitSRL(SDNode *N) {
}
}
+ // Try to transform this shift into a multiply-high if
+ // it matches the appropriate pattern detected in combineShiftToMULH.
+ if (SDValue MULH = combineShiftToMULH(N, DAG, TLI))
+ return MULH;
+
return SDValue();
}
@@ -8160,6 +8532,45 @@ SDValue DAGCombiner::visitFunnelShift(SDNode *N) {
return DAG.getNode(ISD::SHL, SDLoc(N), VT, N0,
DAG.getConstant(IsFSHL ? ShAmt : BitWidth - ShAmt,
SDLoc(N), ShAmtTy));
+
+ // fold (fshl ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+ // fold (fshr ld1, ld0, c) -> (ld0[ofs]) iff ld0 and ld1 are consecutive.
+ // TODO - bigendian support once we have test coverage.
+ // TODO - can we merge this with CombineConsecutiveLoads/MatchLoadCombine?
+ // TODO - permit LHS EXTLOAD if extensions are shifted out.
+ if ((BitWidth % 8) == 0 && (ShAmt % 8) == 0 && !VT.isVector() &&
+ !DAG.getDataLayout().isBigEndian()) {
+ auto *LHS = dyn_cast<LoadSDNode>(N0);
+ auto *RHS = dyn_cast<LoadSDNode>(N1);
+ if (LHS && RHS && LHS->isSimple() && RHS->isSimple() &&
+ LHS->getAddressSpace() == RHS->getAddressSpace() &&
+ (LHS->hasOneUse() || RHS->hasOneUse()) && ISD::isNON_EXTLoad(RHS) &&
+ ISD::isNON_EXTLoad(LHS)) {
+ if (DAG.areNonVolatileConsecutiveLoads(LHS, RHS, BitWidth / 8, 1)) {
+ SDLoc DL(RHS);
+ uint64_t PtrOff =
+ IsFSHL ? (((BitWidth - ShAmt) % BitWidth) / 8) : (ShAmt / 8);
+ Align NewAlign = commonAlignment(RHS->getAlign(), PtrOff);
+ bool Fast = false;
+ if (TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), VT,
+ RHS->getAddressSpace(), NewAlign,
+ RHS->getMemOperand()->getFlags(), &Fast) &&
+ Fast) {
+ SDValue NewPtr =
+ DAG.getMemBasePlusOffset(RHS->getBasePtr(), PtrOff, DL);
+ AddToWorklist(NewPtr.getNode());
+ SDValue Load = DAG.getLoad(
+ VT, DL, RHS->getChain(), NewPtr,
+ RHS->getPointerInfo().getWithOffset(PtrOff), NewAlign,
+ RHS->getMemOperand()->getFlags(), RHS->getAAInfo());
+ // Replace the old load's chain with the new load's chain.
+ WorklistRemover DeadNodes(*this);
+ DAG.ReplaceAllUsesOfValueWith(N1.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+ }
+ }
}
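
The consecutive-load fold above can be sanity-checked with plain memory operations. A hedged, little-endian-only sketch (matching the !isBigEndian() guard); fshl32 is an illustrative model of ISD::FSHL on i32:

    #include <cassert>
    #include <cstdint>
    #include <cstring>

    // fshl(hi, lo, s) on i32 for s in (0, 32): the top 32 bits of the 64-bit
    // concatenation hi:lo shifted left by s.
    static uint32_t fshl32(uint32_t hi, uint32_t lo, unsigned s) {
      return (hi << s) | (lo >> (32 - s));
    }

    int main() {
      // Eight consecutive bytes; lo is loaded from buf+0 and hi from buf+4,
      // so the two i32 loads are consecutive in memory.
      const uint8_t buf[8] = {0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17};
      uint32_t lo, hi;
      std::memcpy(&lo, buf + 0, 4); // little-endian host assumed
      std::memcpy(&hi, buf + 4, 4);

      // fshl by 8 equals a single i32 load at byte offset (32 - 8) / 8 == 3,
      // which is the PtrOff computation in the fold.
      uint32_t viaShift = fshl32(hi, lo, 8);
      uint32_t viaLoad;
      std::memcpy(&viaLoad, buf + 3, 4);
      assert(viaShift == viaLoad);
      return 0;
    }
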
// fold fshr(undef_or_zero, N1, N2) -> lshr(N1, N2)
@@ -8609,7 +9020,7 @@ SDValue DAGCombiner::visitSELECT(SDNode *N) {
// Create the actual or node if we can generate good code for it.
if (!normalizeToSequence) {
SDValue Or = DAG.getNode(ISD::OR, DL, N0.getValueType(), N0, N2_0);
- return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
+ return DAG.getNode(ISD::SELECT, DL, N1.getValueType(), Or, N1,
N2_2, Flags);
}
// Otherwise see if we can optimize to a better pattern.
@@ -8825,6 +9236,8 @@ SDValue DAGCombiner::foldVSelectOfConstants(SDNode *N) {
SDValue N2Elt = N2.getOperand(i);
if (N1Elt.isUndef() || N2Elt.isUndef())
continue;
+ if (N1Elt.getValueType() != N2Elt.getValueType())
+ continue;
const APInt &C1 = cast<ConstantSDNode>(N1Elt)->getAPIntValue();
const APInt &C2 = cast<ConstantSDNode>(N2Elt)->getAPIntValue();
@@ -9395,8 +9808,7 @@ SDValue DAGCombiner::CombineZExtLogicopShiftLoad(SDNode *N) {
SDValue Shift = DAG.getNode(N1.getOpcode(), DL1, VT, ExtLoad,
N1.getOperand(1));
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- Mask = Mask.zext(VT.getSizeInBits());
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL0(N0);
SDValue And = DAG.getNode(N0.getOpcode(), DL0, VT, Shift,
DAG.getConstant(Mask, DL0, VT));
@@ -9702,8 +10114,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND(SDNode *N) {
LN00->getChain(), LN00->getBasePtr(),
LN00->getMemoryVT(),
LN00->getMemOperand());
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- Mask = Mask.sext(VT.getSizeInBits());
+ APInt Mask = N0.getConstantOperandAPInt(1).sext(VT.getSizeInBits());
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
ExtendSetCCUses(SetCCs, N0.getOperand(0), ExtLoad, ISD::SIGN_EXTEND);
@@ -9941,7 +10352,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (!LegalOperations || (TLI.isOperationLegal(ISD::AND, SrcVT) &&
TLI.isOperationLegal(ISD::ZERO_EXTEND, VT))) {
SDValue Op = N0.getOperand(0);
- Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+ Op = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
AddToWorklist(Op.getNode());
SDValue ZExtOrTrunc = DAG.getZExtOrTrunc(Op, SDLoc(N), VT);
// Transfer the debug info; the new node is equivalent to N0.
@@ -9953,7 +10364,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
if (!LegalOperations || TLI.isOperationLegal(ISD::AND, VT)) {
SDValue Op = DAG.getAnyExtOrTrunc(N0.getOperand(0), SDLoc(N), VT);
AddToWorklist(Op.getNode());
- SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT.getScalarType());
+ SDValue And = DAG.getZeroExtendInReg(Op, SDLoc(N), MinVT);
// We may safely transfer the debug info describing the truncate node over
// to the equivalent and operation.
DAG.transferDbgValues(N0, And);
@@ -9971,8 +10382,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
!TLI.isZExtFree(N0.getValueType(), VT))) {
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, SDLoc(X), VT);
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- Mask = Mask.zext(VT.getSizeInBits());
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL(N);
return DAG.getNode(ISD::AND, DL, VT,
X, DAG.getConstant(Mask, DL, VT));
@@ -10026,8 +10436,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
LN00->getChain(), LN00->getBasePtr(),
LN00->getMemoryVT(),
LN00->getMemOperand());
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- Mask = Mask.zext(VT.getSizeInBits());
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
SDLoc DL(N);
SDValue And = DAG.getNode(N0.getOpcode(), DL, VT,
ExtLoad, DAG.getConstant(Mask, DL, VT));
@@ -10080,23 +10489,22 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
// that the element size of the sext'd result matches the element size of
// the compare operands.
SDLoc DL(N);
- SDValue VecOnes = DAG.getConstant(1, DL, VT);
if (VT.getSizeInBits() == N00VT.getSizeInBits()) {
- // zext(setcc) -> (and (vsetcc), (1, 1, ...) for vectors.
+ // zext(setcc) -> zext_in_reg(vsetcc) for vectors.
SDValue VSetCC = DAG.getNode(ISD::SETCC, DL, VT, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
- return DAG.getNode(ISD::AND, DL, VT, VSetCC, VecOnes);
+ return DAG.getZeroExtendInReg(VSetCC, DL, N0.getValueType());
}
// If the desired elements are smaller or larger than the source
// elements we can use a matching integer vector type and then
- // truncate/sign extend.
+ // truncate/any extend followed by zext_in_reg.
EVT MatchingVectorType = N00VT.changeVectorElementTypeToInteger();
SDValue VsetCC =
DAG.getNode(ISD::SETCC, DL, MatchingVectorType, N0.getOperand(0),
N0.getOperand(1), N0.getOperand(2));
- return DAG.getNode(ISD::AND, DL, VT, DAG.getSExtOrTrunc(VsetCC, DL, VT),
- VecOnes);
+ return DAG.getZeroExtendInReg(DAG.getAnyExtOrTrunc(VsetCC, DL, VT), DL,
+ N0.getValueType());
}
// zext(setcc x,y,cc) -> select_cc x, y, 1, 0, cc
@@ -10127,7 +10535,7 @@ SDValue DAGCombiner::visitZERO_EXTEND(SDNode *N) {
SDLoc DL(N);
// Ensure that the shift amount is wide enough for the shifted value.
- if (VT.getSizeInBits() >= 256)
+ if (Log2_32_Ceil(VT.getSizeInBits()) > ShAmt.getValueSizeInBits())
ShAmt = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i32, ShAmt);
return DAG.getNode(N0.getOpcode(), DL, VT,
@@ -10187,8 +10595,7 @@ SDValue DAGCombiner::visitANY_EXTEND(SDNode *N) {
SDLoc DL(N);
SDValue X = N0.getOperand(0).getOperand(0);
X = DAG.getAnyExtOrTrunc(X, DL, VT);
- APInt Mask = cast<ConstantSDNode>(N0.getOperand(1))->getAPIntValue();
- Mask = Mask.zext(VT.getSizeInBits());
+ APInt Mask = N0.getConstantOperandAPInt(1).zext(VT.getSizeInBits());
return DAG.getNode(ISD::AND, DL, VT,
X, DAG.getConstant(Mask, DL, VT));
}
@@ -10348,6 +10755,45 @@ SDValue DAGCombiner::visitAssertExt(SDNode *N) {
return SDValue();
}
+SDValue DAGCombiner::visitAssertAlign(SDNode *N) {
+ SDLoc DL(N);
+
+ Align AL = cast<AssertAlignSDNode>(N)->getAlign();
+ SDValue N0 = N->getOperand(0);
+
+ // Fold (assertalign (assertalign x, AL0), AL1) ->
+ // (assertalign x, max(AL0, AL1))
+ if (auto *AAN = dyn_cast<AssertAlignSDNode>(N0))
+ return DAG.getAssertAlign(DL, N0.getOperand(0),
+ std::max(AL, AAN->getAlign()));
+
+ // In rare cases, there are trivial arithmetic ops in source operands. Sink
+ // this assert down to the source operands so that those arithmetic ops can
+ // be exposed to DAG combining.
+ switch (N0.getOpcode()) {
+ default:
+ break;
+ case ISD::ADD:
+ case ISD::SUB: {
+ unsigned AlignShift = Log2(AL);
+ SDValue LHS = N0.getOperand(0);
+ SDValue RHS = N0.getOperand(1);
+ unsigned LHSAlignShift = DAG.computeKnownBits(LHS).countMinTrailingZeros();
+ unsigned RHSAlignShift = DAG.computeKnownBits(RHS).countMinTrailingZeros();
+ if (LHSAlignShift >= AlignShift || RHSAlignShift >= AlignShift) {
+ if (LHSAlignShift < AlignShift)
+ LHS = DAG.getAssertAlign(DL, LHS, AL);
+ if (RHSAlignShift < AlignShift)
+ RHS = DAG.getAssertAlign(DL, RHS, AL);
+ return DAG.getNode(N0.getOpcode(), DL, N0.getValueType(), LHS, RHS);
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
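
For the ADD/SUB case in visitAssertAlign, the underlying arithmetic fact (my paraphrase, not stated in the patch) is that if the sum is asserted aligned and one operand is known aligned, the remaining operand must be aligned too, so the assertion can be sunk onto it:

    #include <cassert>
    #include <cstdint>

    int main() {
      const uint64_t Align = 16;
      uint64_t lhs = 0x1000; // operand with enough known trailing zero bits
      uint64_t sum = 0x1230; // value carrying the AssertAlign
      uint64_t rhs = sum - lhs;
      assert(sum % Align == 0 && lhs % Align == 0);
      // Hence rhs = sum - lhs is aligned as well, so asserting alignment on
      // the remaining operand is sound.
      assert(rhs % Align == 0);
      return 0;
    }
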
/// If the result of a wider load is shifted to right of N bits and then
/// truncated to a narrower type and where N is a multiple of number of bits of
/// the narrower type, transform it to a narrower load from address + N / num of
@@ -10428,9 +10874,8 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
}
// At this point, we must have a load or else we can't do the transform.
- if (!isa<LoadSDNode>(N0)) return SDValue();
-
- auto *LN0 = cast<LoadSDNode>(N0);
+ auto *LN0 = dyn_cast<LoadSDNode>(N0);
+ if (!LN0) return SDValue();
// Because a SRL must be assumed to *need* to zero-extend the high bits
// (as opposed to anyext the high bits), we can't combine the zextload
@@ -10449,8 +10894,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
SDNode *Mask = *(SRL->use_begin());
if (Mask->getOpcode() == ISD::AND &&
isa<ConstantSDNode>(Mask->getOperand(1))) {
- const APInt &ShiftMask =
- cast<ConstantSDNode>(Mask->getOperand(1))->getAPIntValue();
+ const APInt& ShiftMask = Mask->getConstantOperandAPInt(1);
if (ShiftMask.isMask()) {
EVT MaskedVT = EVT::getIntegerVT(*DAG.getContext(),
ShiftMask.countTrailingOnes());
@@ -10480,7 +10924,7 @@ SDValue DAGCombiner::ReduceLoadWidth(SDNode *N) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
// Reducing the width of a volatile load is illegal. For atomics, we may be
- // able to reduce the width provided we never widen again. (see D66309)
+ // able to reduce the width provided we never widen again. (see D66309)
if (!LN0->isSimple() ||
!isLegalNarrowLdSt(LN0, ExtType, ExtVT, ShAmt))
return SDValue();
@@ -10561,26 +11005,27 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N->getValueType(0);
- EVT EVT = cast<VTSDNode>(N1)->getVT();
+ EVT ExtVT = cast<VTSDNode>(N1)->getVT();
unsigned VTBits = VT.getScalarSizeInBits();
- unsigned EVTBits = EVT.getScalarSizeInBits();
+ unsigned ExtVTBits = ExtVT.getScalarSizeInBits();
+ // sext_in_reg(undef) = 0 because the top bits will all be the same.
if (N0.isUndef())
- return DAG.getUNDEF(VT);
+ return DAG.getConstant(0, SDLoc(N), VT);
// fold (sext_in_reg c1) -> c1
if (DAG.isConstantIntBuildVectorOrConstantInt(N0))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1);
// If the input is already sign extended, just drop the extension.
- if (DAG.ComputeNumSignBits(N0) >= VTBits-EVTBits+1)
+ if (DAG.ComputeNumSignBits(N0) >= (VTBits - ExtVTBits + 1))
return N0;
// fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2
if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG &&
- EVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
- N0.getOperand(0), N1);
+ ExtVT.bitsLT(cast<VTSDNode>(N0.getOperand(1))->getVT()))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0),
+ N1);
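
The nested sign-extension fold just above collapses to the narrowest width, which is easy to verify with an in-register model of sext_in_reg. A small sketch; sextInReg is an illustrative helper that assumes two's complement and arithmetic right shifts:

    #include <cassert>
    #include <cstdint>

    // sext_in_reg of the low 'bits' bits of x within a 32-bit value.
    static int32_t sextInReg(int32_t x, unsigned bits) {
      unsigned sh = 32 - bits;
      return static_cast<int32_t>(static_cast<uint32_t>(x) << sh) >> sh;
    }

    int main() {
      int32_t x = 0x1234ABCD;
      // (sext_in_reg (sext_in_reg x, i16), i8) == (sext_in_reg x, i8):
      // only the narrowest in-register extension survives.
      assert(sextInReg(sextInReg(x, 16), 8) == sextInReg(x, 8));
      return 0;
    }
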
// fold (sext_in_reg (sext x)) -> (sext x)
// fold (sext_in_reg (aext x)) -> (sext x)
@@ -10589,8 +11034,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
if (N0.getOpcode() == ISD::SIGN_EXTEND || N0.getOpcode() == ISD::ANY_EXTEND) {
SDValue N00 = N0.getOperand(0);
unsigned N00Bits = N00.getScalarValueSizeInBits();
- if ((N00Bits <= EVTBits ||
- (N00Bits - DAG.ComputeNumSignBits(N00)) < EVTBits) &&
+ if ((N00Bits <= ExtVTBits ||
+ (N00Bits - DAG.ComputeNumSignBits(N00)) < ExtVTBits) &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00);
}
@@ -10599,7 +11044,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
if ((N0.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ||
N0.getOpcode() == ISD::ZERO_EXTEND_VECTOR_INREG) &&
- N0.getOperand(0).getScalarValueSizeInBits() == EVTBits) {
+ N0.getOperand(0).getScalarValueSizeInBits() == ExtVTBits) {
if (!LegalOperations ||
TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))
return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT,
@@ -10610,14 +11055,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// iff we are extending the source sign bit.
if (N0.getOpcode() == ISD::ZERO_EXTEND) {
SDValue N00 = N0.getOperand(0);
- if (N00.getScalarValueSizeInBits() == EVTBits &&
+ if (N00.getScalarValueSizeInBits() == ExtVTBits &&
(!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT)))
return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00, N1);
}
// fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero.
- if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, EVTBits - 1)))
- return DAG.getZeroExtendInReg(N0, SDLoc(N), EVT.getScalarType());
+ if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1)))
+ return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT);
// fold operands of sext_in_reg based on knowledge that the top bits are not
// demanded.
@@ -10634,11 +11079,11 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// We already fold "(sext_in_reg (srl X, 25), i8) -> srl X, 25" above.
if (N0.getOpcode() == ISD::SRL) {
if (auto *ShAmt = dyn_cast<ConstantSDNode>(N0.getOperand(1)))
- if (ShAmt->getAPIntValue().ule(VTBits - EVTBits)) {
+ if (ShAmt->getAPIntValue().ule(VTBits - ExtVTBits)) {
// We can turn this into an SRA iff the input to the SRL is already sign
// extended enough.
unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0));
- if (((VTBits - EVTBits) - ShAmt->getZExtValue()) < InSignBits)
+ if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits)
return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0),
N0.getOperand(1));
}
@@ -10650,14 +11095,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// extends that the target does support.
if (ISD::isEXTLoad(N0.getNode()) &&
ISD::isUNINDEXEDLoad(N0.getNode()) &&
- EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple() &&
N0.hasOneUse()) ||
- TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), EVT,
+ LN0->getBasePtr(), ExtVT,
LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
@@ -10667,13 +11112,13 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
// fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use
if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) &&
N0.hasOneUse() &&
- EVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
+ ExtVT == cast<LoadSDNode>(N0)->getMemoryVT() &&
((!LegalOperations && cast<LoadSDNode>(N0)->isSimple()) &&
- TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, EVT))) {
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) {
LoadSDNode *LN0 = cast<LoadSDNode>(N0);
SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT,
LN0->getChain(),
- LN0->getBasePtr(), EVT,
+ LN0->getBasePtr(), ExtVT,
LN0->getMemOperand());
CombineTo(N, ExtLoad);
CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1));
@@ -10681,11 +11126,10 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) {
}
// Form (sext_inreg (bswap >> 16)) or (sext_inreg (rotl (bswap) 16))
- if (EVTBits <= 16 && N0.getOpcode() == ISD::OR) {
+ if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) {
if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0),
N0.getOperand(1), false))
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT,
- BSwap, N1);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1);
}
return SDValue();
@@ -10695,8 +11139,9 @@ SDValue DAGCombiner::visitSIGN_EXTEND_VECTOR_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ // sext_vector_inreg(undef) = 0 because the top bits will all be the same.
if (N0.isUndef())
- return DAG.getUNDEF(VT);
+ return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
@@ -10711,8 +11156,9 @@ SDValue DAGCombiner::visitZERO_EXTEND_VECTOR_INREG(SDNode *N) {
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
+ // zext_vector_inreg(undef) = 0 because the top bits will be zero.
if (N0.isUndef())
- return DAG.getUNDEF(VT);
+ return DAG.getConstant(0, SDLoc(N), VT);
if (SDValue Res = tryToFoldExtendOfConstant(N, TLI, DAG, LegalTypes))
return Res;
@@ -10788,13 +11234,12 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
SDValue EltNo = N0->getOperand(1);
if (isa<ConstantSDNode>(EltNo) && isTypeLegal(NVT)) {
int Elt = cast<ConstantSDNode>(EltNo)->getZExtValue();
- EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
int Index = isLE ? (Elt*SizeRatio) : (Elt*SizeRatio + (SizeRatio-1));
SDLoc DL(N);
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, TrTy,
DAG.getBitcast(NVT, N0.getOperand(0)),
- DAG.getConstant(Index, DL, IndexTy));
+ DAG.getVectorIdxConstant(Index, DL));
}
}
@@ -10832,7 +11277,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
// Attempt to pre-truncate BUILD_VECTOR sources.
if (N0.getOpcode() == ISD::BUILD_VECTOR && !LegalOperations &&
- TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType())) {
+ TLI.isTruncateFree(SrcVT.getScalarType(), VT.getScalarType()) &&
+ // Avoid creating illegal types if running after type legalizer.
+ (!LegalTypes || TLI.isTypeLegal(VT.getScalarType()))) {
SDLoc DL(N);
EVT SVT = VT.getScalarType();
SmallVector<SDValue, 8> TruncOps;
@@ -10961,10 +11408,9 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecSrcVT))) {
SDLoc SL(N);
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
unsigned Idx = isLE ? 0 : VecSrcVT.getVectorNumElements() - 1;
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, VT, VecSrc,
- DAG.getConstant(Idx, SL, IdxVT));
+ DAG.getVectorIdxConstant(Idx, SL));
}
}
@@ -11064,14 +11510,14 @@ SDValue DAGCombiner::CombineConsecutiveLoads(SDNode *N, EVT VT) {
unsigned LD1Bytes = LD1VT.getStoreSize();
if (ISD::isNON_EXTLoad(LD2) && LD2->hasOneUse() &&
DAG.areNonVolatileConsecutiveLoads(LD2, LD1, LD1Bytes, 1)) {
- unsigned Align = LD1->getAlignment();
- unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+ Align Alignment = LD1->getAlign();
+ Align NewAlign = DAG.getDataLayout().getABITypeAlign(
VT.getTypeForEVT(*DAG.getContext()));
- if (NewAlign <= Align &&
+ if (NewAlign <= Alignment &&
(!LegalOperations || TLI.isOperationLegal(ISD::LOAD, VT)))
return DAG.getLoad(VT, SDLoc(N), LD1->getChain(), LD1->getBasePtr(),
- LD1->getPointerInfo(), Align);
+ LD1->getPointerInfo(), Alignment);
}
return SDValue();
@@ -11389,6 +11835,20 @@ SDValue DAGCombiner::visitBUILD_PAIR(SDNode *N) {
return CombineConsecutiveLoads(N, VT);
}
+SDValue DAGCombiner::visitFREEZE(SDNode *N) {
+ SDValue N0 = N->getOperand(0);
+
+ // (freeze (freeze x)) -> (freeze x)
+ if (N0.getOpcode() == ISD::FREEZE)
+ return N0;
+
+ // If the input is a constant, return it.
+ if (isa<ConstantSDNode>(N0) || isa<ConstantFPSDNode>(N0))
+ return N0;
+
+ return SDValue();
+}
+
/// We know that BV is a build_vector node with Constant, ConstantFP or Undef
/// operands. DstEltVT indicates the destination element value type.
SDValue DAGCombiner::
@@ -11519,7 +11979,7 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
const TargetOptions &Options = DAG.getTarget().Options;
// Floating-point multiply-add with intermediate rounding.
- bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+ bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
@@ -11532,13 +11992,14 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
SDNodeFlags Flags = N->getFlags();
bool CanFuse = Options.UnsafeFPMath || isContractable(N);
+ bool CanReassociate =
+ Options.UnsafeFPMath || N->getFlags().hasAllowReassociation();
bool AllowFusionGlobally = (Options.AllowFPOpFusion == FPOpFusion::Fast ||
CanFuse || HasFMAD);
// If the addition is not contractable, do not combine.
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
- const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
@@ -11573,6 +12034,30 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
N1.getOperand(0), N1.getOperand(1), N0, Flags);
}
+ // fadd (fma A, B, (fmul C, D)), E --> fma A, B, (fma C, D, E)
+ // fadd E, (fma A, B, (fmul C, D)) --> fma A, B, (fma C, D, E)
+ // This requires reassociation because it changes the order of operations.
+ SDValue FMA, E;
+ if (CanReassociate && N0.getOpcode() == PreferredFusedOpcode &&
+ N0.getOperand(2).getOpcode() == ISD::FMUL && N0.hasOneUse() &&
+ N0.getOperand(2).hasOneUse()) {
+ FMA = N0;
+ E = N1;
+ } else if (CanReassociate && N1.getOpcode() == PreferredFusedOpcode &&
+ N1.getOperand(2).getOpcode() == ISD::FMUL && N1.hasOneUse() &&
+ N1.getOperand(2).hasOneUse()) {
+ FMA = N1;
+ E = N0;
+ }
+ if (FMA && E) {
+ SDValue A = FMA.getOperand(0);
+ SDValue B = FMA.getOperand(1);
+ SDValue C = FMA.getOperand(2).getOperand(0);
+ SDValue D = FMA.getOperand(2).getOperand(1);
+ SDValue CDE = DAG.getNode(PreferredFusedOpcode, SL, VT, C, D, E, Flags);
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, A, B, CDE, Flags);
+ }
+
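
To make the reassociation requirement concrete: with exact arithmetic the two shapes agree, but the rounding points move, so the results can differ. A hedged sketch using std::fma, with values chosen to expose the difference on IEEE doubles that have a real fused multiply-add:

    #include <cmath>
    #include <cstdio>

    int main() {
      double A = 1e-30, B = 1e-30, C = 1e8 + 1, D = 1e8 + 1, E = -1e16;

      // Original shape:  fadd (fma A, B, (fmul C, D)), E
      double before = std::fma(A, B, C * D) + E;
      // Rewritten shape: fma A, B, (fma C, D, E)
      double after = std::fma(A, B, std::fma(C, D, E));

      // Typically prints 200000000 vs 200000001: the separate fmul rounds away
      // a low bit that the re-associated form keeps, which is why the combine
      // is gated on reassociation being allowed.
      std::printf("before=%.17g after=%.17g\n", before, after);
      return 0;
    }
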
// Look through FP_EXTEND nodes to do more combining.
// fold (fadd (fpext (fmul x, y)), z) -> (fma (fpext x), (fpext y), z)
@@ -11606,33 +12091,6 @@ SDValue DAGCombiner::visitFADDForFMACombine(SDNode *N) {
// More folding opportunities when target permits.
if (Aggressive) {
- // fold (fadd (fma x, y, (fmul u, v)), z) -> (fma x, y (fma u, v, z))
- if (CanFuse &&
- N0.getOpcode() == PreferredFusedOpcode &&
- N0.getOperand(2).getOpcode() == ISD::FMUL &&
- N0->hasOneUse() && N0.getOperand(2)->hasOneUse()) {
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- N0.getOperand(0), N0.getOperand(1),
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- N0.getOperand(2).getOperand(0),
- N0.getOperand(2).getOperand(1),
- N1, Flags), Flags);
- }
-
- // fold (fadd x, (fma y, z, (fmul u, v)) -> (fma y, z (fma u, v, x))
- if (CanFuse &&
- N1->getOpcode() == PreferredFusedOpcode &&
- N1.getOperand(2).getOpcode() == ISD::FMUL &&
- N1->hasOneUse() && N1.getOperand(2)->hasOneUse()) {
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- N1.getOperand(0), N1.getOperand(1),
- DAG.getNode(PreferredFusedOpcode, SL, VT,
- N1.getOperand(2).getOperand(0),
- N1.getOperand(2).getOperand(1),
- N0, Flags), Flags);
- }
-
-
// fold (fadd (fma x, y, (fpext (fmul u, v))), z)
// -> (fma x, y, (fma (fpext u), (fpext v), z))
auto FoldFAddFMAFPExtFMul = [&] (
@@ -11736,7 +12194,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
const TargetOptions &Options = DAG.getTarget().Options;
// Floating-point multiply-add with intermediate rounding.
- bool HasFMAD = (LegalOperations && TLI.isFMADLegalForFAddFSub(DAG, N));
+ bool HasFMAD = (LegalOperations && TLI.isFMADLegal(DAG, N));
// Floating-point multiply-add without intermediate rounding.
bool HasFMA =
@@ -11756,13 +12214,13 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
if (!AllowFusionGlobally && !isContractable(N))
return SDValue();
- const SelectionDAGTargetInfo *STI = DAG.getSubtarget().getSelectionDAGInfo();
if (STI && STI->generateFMAsInMachineCombiner(OptLevel))
return SDValue();
// Always prefer FMAD to FMA for precision.
unsigned PreferredFusedOpcode = HasFMAD ? ISD::FMAD : ISD::FMA;
bool Aggressive = TLI.enableAggressiveFMAFusion(VT);
+ bool NoSignedZero = Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros();
// Is the node an FMUL and contractable either due to global flags or
// SDNodeFlags.
@@ -11773,19 +12231,43 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
};
// fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
- if (isContractableFMUL(N0) && (Aggressive || N0->hasOneUse())) {
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- N0.getOperand(0), N0.getOperand(1),
- DAG.getNode(ISD::FNEG, SL, VT, N1), Flags);
- }
+ auto tryToFoldXYSubZ = [&](SDValue XY, SDValue Z) {
+ if (isContractableFMUL(XY) && (Aggressive || XY->hasOneUse())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT, XY.getOperand(0),
+ XY.getOperand(1), DAG.getNode(ISD::FNEG, SL, VT, Z),
+ Flags);
+ }
+ return SDValue();
+ };
// fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
// Note: Commutes FSUB operands.
- if (isContractableFMUL(N1) && (Aggressive || N1->hasOneUse())) {
- return DAG.getNode(PreferredFusedOpcode, SL, VT,
- DAG.getNode(ISD::FNEG, SL, VT,
- N1.getOperand(0)),
- N1.getOperand(1), N0, Flags);
+ auto tryToFoldXSubYZ = [&](SDValue X, SDValue YZ) {
+ if (isContractableFMUL(YZ) && (Aggressive || YZ->hasOneUse())) {
+ return DAG.getNode(PreferredFusedOpcode, SL, VT,
+ DAG.getNode(ISD::FNEG, SL, VT, YZ.getOperand(0)),
+ YZ.getOperand(1), X, Flags);
+ }
+ return SDValue();
+ };
+
+ // If we have two choices trying to fold (fsub (fmul u, v), (fmul x, y)),
+ // prefer to fold the multiply with fewer uses.
+ if (isContractableFMUL(N0) && isContractableFMUL(N1) &&
+ (N0.getNode()->use_size() > N1.getNode()->use_size())) {
+ // fold (fsub (fmul a, b), (fmul c, d)) -> (fma (fneg c), d, (fmul a, b))
+ if (SDValue V = tryToFoldXSubYZ(N0, N1))
+ return V;
+ // fold (fsub (fmul a, b), (fmul c, d)) -> (fma a, b, (fneg (fmul c, d)))
+ if (SDValue V = tryToFoldXYSubZ(N0, N1))
+ return V;
+ } else {
+ // fold (fsub (fmul x, y), z) -> (fma x, y, (fneg z))
+ if (SDValue V = tryToFoldXYSubZ(N0, N1))
+ return V;
+ // fold (fsub x, (fmul y, z)) -> (fma (fneg y), z, x)
+ if (SDValue V = tryToFoldXSubYZ(N0, N1))
+ return V;
}
// fold (fsub (fneg (fmul, x, y)), z) -> (fma (fneg x), y, (fneg z))
@@ -11902,7 +12384,7 @@ SDValue DAGCombiner::visitFSUBForFMACombine(SDNode *N) {
// -> (fma (fneg y), z, (fma (fneg u), v, x))
if (CanFuse && N1.getOpcode() == PreferredFusedOpcode &&
isContractableFMUL(N1.getOperand(2)) &&
- N1->hasOneUse()) {
+ N1->hasOneUse() && NoSignedZero) {
SDValue N20 = N1.getOperand(2).getOperand(0);
SDValue N21 = N1.getOperand(2).getOperand(1);
return DAG.getNode(PreferredFusedOpcode, SL, VT,
@@ -12055,7 +12537,7 @@ SDValue DAGCombiner::visitFMULForFMADistributiveCombine(SDNode *N) {
// Floating-point multiply-add with intermediate rounding. This can result
// in a less precise result due to the changed rounding order.
bool HasFMAD = Options.UnsafeFPMath &&
- (LegalOperations && TLI.isOperationLegal(ISD::FMAD, VT));
+ (LegalOperations && TLI.isFMADLegal(DAG, N));
// No valid opcode, do not combine.
if (!HasFMAD && !HasFMA)
@@ -12132,6 +12614,9 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -12155,18 +12640,16 @@ SDValue DAGCombiner::visitFADD(SDNode *N) {
return NewSel;
// fold (fadd A, (fneg B)) -> (fsub A, B)
- if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize) == 2)
- return DAG.getNode(
- ISD::FSUB, DL, VT, N0,
- TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
+ if (SDValue NegN1 = TLI.getCheaperNegatedExpression(
+ N1, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FSUB, DL, VT, N0, NegN1, Flags);
// fold (fadd (fneg A), B) -> (fsub B, A)
- if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT)) &&
- TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize) == 2)
- return DAG.getNode(
- ISD::FSUB, DL, VT, N1,
- TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize), Flags);
+ if (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::FSUB, VT))
+ if (SDValue NegN0 = TLI.getCheaperNegatedExpression(
+ N0, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FSUB, DL, VT, N1, NegN0, Flags);
auto isFMulNegTwo = [](SDValue FMul) {
if (!FMul.hasOneUse() || FMul.getOpcode() != ISD::FMUL)
@@ -12311,6 +12794,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -12345,8 +12831,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
if (N0CFP && N0CFP->isZero()) {
if (N0CFP->isNegative() ||
(Options.NoSignedZerosFPMath || Flags.hasNoSignedZeros())) {
- if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize))
- return TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
+ if (SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
+ return NegN1;
if (!LegalOperations || TLI.isOperationLegal(ISD::FNEG, VT))
return DAG.getNode(ISD::FNEG, DL, VT, N1, Flags);
}
@@ -12364,10 +12851,9 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
}
// fold (fsub A, (fneg B)) -> (fadd A, B)
- if (TLI.isNegatibleForFree(N1, DAG, LegalOperations, ForCodeSize))
- return DAG.getNode(
- ISD::FADD, DL, VT, N0,
- TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);
+ if (SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FADD, DL, VT, N0, NegN1, Flags);
// FSUB -> FMA combines:
if (SDValue Fused = visitFSUBForFMACombine(N)) {
@@ -12378,21 +12864,6 @@ SDValue DAGCombiner::visitFSUB(SDNode *N) {
return SDValue();
}
-/// Return true if both inputs are at least as cheap in negated form and at
-/// least one input is strictly cheaper in negated form.
-bool DAGCombiner::isCheaperToUseNegatedFPOps(SDValue X, SDValue Y) {
- if (char LHSNeg =
- TLI.isNegatibleForFree(X, DAG, LegalOperations, ForCodeSize))
- if (char RHSNeg =
- TLI.isNegatibleForFree(Y, DAG, LegalOperations, ForCodeSize))
- // Both negated operands are at least as cheap as their counterparts.
- // Check to see if at least one is cheaper negated.
- if (LHSNeg == 2 || RHSNeg == 2)
- return true;
-
- return false;
-}
-
SDValue DAGCombiner::visitFMUL(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -12403,6 +12874,9 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
const TargetOptions &Options = DAG.getTarget().Options;
const SDNodeFlags Flags = N->getFlags();
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
// fold vector ops
if (VT.isVector()) {
// This just handles C1 * C2 for vectors. Other vector folds are below.
@@ -12464,13 +12938,18 @@ SDValue DAGCombiner::visitFMUL(SDNode *N) {
return DAG.getNode(ISD::FNEG, DL, VT, N0);
// -N0 * -N1 --> N0 * N1
- if (isCheaperToUseNegatedFPOps(N0, N1)) {
- SDValue NegN0 =
- TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
- SDValue NegN1 =
- TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
+ TargetLowering::NegatibleCost CostN0 =
+ TargetLowering::NegatibleCost::Expensive;
+ TargetLowering::NegatibleCost CostN1 =
+ TargetLowering::NegatibleCost::Expensive;
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
+ if (NegN0 && NegN1 &&
+ (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
+ CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FMUL, DL, VT, NegN0, NegN1, Flags);
- }
// fold (fmul X, (select (fcmp X > 0.0), -1.0, 1.0)) -> (fneg (fabs X))
// fold (fmul X, (select (fcmp X > 0.0), 1.0, -1.0)) -> (fabs X)
@@ -12549,13 +13028,18 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
}
// (-N0 * -N1) + N2 --> (N0 * N1) + N2
- if (isCheaperToUseNegatedFPOps(N0, N1)) {
- SDValue NegN0 =
- TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
- SDValue NegN1 =
- TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize);
+ TargetLowering::NegatibleCost CostN0 =
+ TargetLowering::NegatibleCost::Expensive;
+ TargetLowering::NegatibleCost CostN1 =
+ TargetLowering::NegatibleCost::Expensive;
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
+ if (NegN0 && NegN1 &&
+ (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
+ CostN1 == TargetLowering::NegatibleCost::Cheaper))
return DAG.getNode(ISD::FMA, DL, VT, NegN0, NegN1, N2, Flags);
- }
if (UnsafeFPMath) {
if (N0CFP && N0CFP->isZero())
@@ -12641,13 +13125,10 @@ SDValue DAGCombiner::visitFMA(SDNode *N) {
// fold ((fma (fneg X), Y, (fneg Z)) -> fneg (fma X, Y, Z))
// fold ((fma X, (fneg Y), (fneg Z)) -> fneg (fma X, Y, Z))
- if (!TLI.isFNegFree(VT) &&
- TLI.isNegatibleForFree(SDValue(N, 0), DAG, LegalOperations,
- ForCodeSize) == 2)
- return DAG.getNode(ISD::FNEG, DL, VT,
- TLI.getNegatedExpression(SDValue(N, 0), DAG,
- LegalOperations, ForCodeSize),
- Flags);
+ if (!TLI.isFNegFree(VT))
+ if (SDValue Neg = TLI.getCheaperNegatedExpression(
+ SDValue(N, 0), DAG, LegalOperations, ForCodeSize))
+ return DAG.getNode(ISD::FNEG, DL, VT, Neg, Flags);
return SDValue();
}
@@ -12664,7 +13145,7 @@ SDValue DAGCombiner::combineRepeatedFPDivisors(SDNode *N) {
// that only minsize should restrict this.
bool UnsafeMath = DAG.getTarget().Options.UnsafeFPMath;
const SDNodeFlags Flags = N->getFlags();
- if (!UnsafeMath && !Flags.hasAllowReciprocal())
+ if (LegalDAG || (!UnsafeMath && !Flags.hasAllowReciprocal()))
return SDValue();
// Skip if current node is a reciprocal/fneg-reciprocal.
@@ -12735,6 +13216,9 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
const TargetOptions &Options = DAG.getTarget().Options;
SDNodeFlags Flags = N->getFlags();
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
+
// fold vector ops
if (VT.isVector())
if (SDValue FoldedVOp = SimplifyVBinOp(N))
@@ -12794,37 +13278,62 @@ SDValue DAGCombiner::visitFDIV(SDNode *N) {
} else if (N1.getOpcode() == ISD::FMUL) {
// Look through an FMUL. Even though this won't remove the FDIV directly,
// it's still worthwhile to get rid of the FSQRT if possible.
- SDValue SqrtOp;
- SDValue OtherOp;
+ SDValue Sqrt, Y;
if (N1.getOperand(0).getOpcode() == ISD::FSQRT) {
- SqrtOp = N1.getOperand(0);
- OtherOp = N1.getOperand(1);
+ Sqrt = N1.getOperand(0);
+ Y = N1.getOperand(1);
} else if (N1.getOperand(1).getOpcode() == ISD::FSQRT) {
- SqrtOp = N1.getOperand(1);
- OtherOp = N1.getOperand(0);
+ Sqrt = N1.getOperand(1);
+ Y = N1.getOperand(0);
}
- if (SqrtOp.getNode()) {
+ if (Sqrt.getNode()) {
+ // If the other multiply operand is known positive, pull it into the
+ // sqrt. That will eliminate the division if we convert to an estimate:
+ // X / (fabs(A) * sqrt(Z)) --> X / sqrt(A*A*Z) --> X * rsqrt(A*A*Z)
+ // TODO: Also fold the case where A == Z (fabs is missing).
+ if (Flags.hasAllowReassociation() && N1.hasOneUse() &&
+ N1->getFlags().hasAllowReassociation() && Sqrt.hasOneUse() &&
+ Y.getOpcode() == ISD::FABS && Y.hasOneUse()) {
+ SDValue AA = DAG.getNode(ISD::FMUL, DL, VT, Y.getOperand(0),
+ Y.getOperand(0), Flags);
+ SDValue AAZ =
+ DAG.getNode(ISD::FMUL, DL, VT, AA, Sqrt.getOperand(0), Flags);
+ if (SDValue Rsqrt = buildRsqrtEstimate(AAZ, Flags))
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, Rsqrt, Flags);
+
+ // Estimate creation failed. Clean up speculatively created nodes.
+ recursivelyDeleteUnusedNodes(AAZ.getNode());
+ }
+
// We found a FSQRT, so try to make this fold:
- // x / (y * sqrt(z)) -> x * (rsqrt(z) / y)
- if (SDValue RV = buildRsqrtEstimate(SqrtOp.getOperand(0), Flags)) {
- RV = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, RV, OtherOp, Flags);
- AddToWorklist(RV.getNode());
- return DAG.getNode(ISD::FMUL, DL, VT, N0, RV, Flags);
+ // X / (Y * sqrt(Z)) -> X * (rsqrt(Z) / Y)
+ if (SDValue Rsqrt = buildRsqrtEstimate(Sqrt.getOperand(0), Flags)) {
+ SDValue Div = DAG.getNode(ISD::FDIV, SDLoc(N1), VT, Rsqrt, Y, Flags);
+ AddToWorklist(Div.getNode());
+ return DAG.getNode(ISD::FMUL, DL, VT, N0, Div, Flags);
}
}
}
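
The algebra behind the new fabs/sqrt fold, checked numerically and independently of the estimate machinery: for finite inputs, fabs(A) * sqrt(Z) equals sqrt(A*A*Z), so the whole denominator becomes a single rsqrt:

    #include <cassert>
    #include <cmath>

    int main() {
      double X = 7.5, A = -3.0, Z = 2.25;
      // X / (fabs(A) * sqrt(Z)) --> X * (1 / sqrt(A*A*Z))
      double lhs = X / (std::fabs(A) * std::sqrt(Z));
      double rhs = X * (1.0 / std::sqrt(A * A * Z));
      assert(std::fabs(lhs - rhs) < 1e-12);
      return 0;
    }
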
// Fold into a reciprocal estimate and multiply instead of a real divide.
- if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
- return RV;
+ if (Options.NoInfsFPMath || Flags.hasNoInfs())
+ if (SDValue RV = BuildDivEstimate(N0, N1, Flags))
+ return RV;
}
// (fdiv (fneg X), (fneg Y)) -> (fdiv X, Y)
- if (isCheaperToUseNegatedFPOps(N0, N1))
- return DAG.getNode(
- ISD::FDIV, SDLoc(N), VT,
- TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize),
- TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize), Flags);
+ TargetLowering::NegatibleCost CostN0 =
+ TargetLowering::NegatibleCost::Expensive;
+ TargetLowering::NegatibleCost CostN1 =
+ TargetLowering::NegatibleCost::Expensive;
+ SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize, CostN0);
+ SDValue NegN1 =
+ TLI.getNegatedExpression(N1, DAG, LegalOperations, ForCodeSize, CostN1);
+ if (NegN0 && NegN1 &&
+ (CostN0 == TargetLowering::NegatibleCost::Cheaper ||
+ CostN1 == TargetLowering::NegatibleCost::Cheaper))
+ return DAG.getNode(ISD::FDIV, SDLoc(N), VT, NegN0, NegN1, Flags);
return SDValue();
}
@@ -12835,6 +13344,10 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
ConstantFPSDNode *N0CFP = dyn_cast<ConstantFPSDNode>(N0);
ConstantFPSDNode *N1CFP = dyn_cast<ConstantFPSDNode>(N1);
EVT VT = N->getValueType(0);
+ SDNodeFlags Flags = N->getFlags();
+
+ if (SDValue R = DAG.simplifyFPBinop(N->getOpcode(), N0, N1, Flags))
+ return R;
// fold (frem c1, c2) -> fmod(c1,c2)
if (N0CFP && N1CFP)
@@ -12848,8 +13361,12 @@ SDValue DAGCombiner::visitFREM(SDNode *N) {
SDValue DAGCombiner::visitFSQRT(SDNode *N) {
SDNodeFlags Flags = N->getFlags();
- if (!DAG.getTarget().Options.UnsafeFPMath &&
- !Flags.hasApproximateFuncs())
+ const TargetOptions &Options = DAG.getTarget().Options;
+
+ // Require 'ninf' flag since sqrt(+Inf) = +Inf, but the estimation goes as:
+ // sqrt(+Inf) == rsqrt(+Inf) * +Inf = 0 * +Inf = NaN
+ if ((!Options.UnsafeFPMath && !Flags.hasApproximateFuncs()) ||
+ (!Options.NoInfsFPMath && !Flags.hasNoInfs()))
return SDValue();
SDValue N0 = N->getOperand(0);
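
The 'ninf' requirement in visitFSQRT comes from the arithmetic spelled out in the comment above; a two-line check of the offending case:

    #include <cassert>
    #include <cmath>

    int main() {
      double inf = INFINITY;
      assert(std::isinf(std::sqrt(inf)));   // exact result: sqrt(+Inf) = +Inf
      double rsqrt = 1.0 / std::sqrt(inf);  // the estimate path uses rsqrt = 0
      assert(std::isnan(rsqrt * inf));      // 0 * +Inf = NaN, hence 'ninf'
      return 0;
    }
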
@@ -13061,33 +13578,24 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) {
}
// The next optimizations are desirable only if SELECT_CC can be lowered.
- if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
- // fold (sint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
- if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
- !VT.isVector() &&
- (!LegalOperations ||
- TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
- SDLoc DL(N);
- SDValue Ops[] =
- { N0.getOperand(0), N0.getOperand(1),
- DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
- N0.getOperand(2) };
- return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
- }
+ // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0)
+ if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 &&
+ !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT),
+ DAG.getConstantFP(0.0, DL, VT));
+ }
- // fold (sint_to_fp (zext (setcc x, y, cc))) ->
- // (select_cc x, y, 1.0, 0.0,, cc)
- if (N0.getOpcode() == ISD::ZERO_EXTEND &&
- N0.getOperand(0).getOpcode() == ISD::SETCC &&!VT.isVector() &&
- (!LegalOperations ||
- TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
- SDLoc DL(N);
- SDValue Ops[] =
- { N0.getOperand(0).getOperand(0), N0.getOperand(0).getOperand(1),
- DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
- N0.getOperand(0).getOperand(2) };
- return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
- }
+ // fold (sint_to_fp (zext (setcc x, y, cc))) ->
+ // (select (setcc x, y, cc), 1.0, 0.0)
+ if (N0.getOpcode() == ISD::ZERO_EXTEND &&
+ N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ return DAG.getSelect(DL, VT, N0.getOperand(0),
+ DAG.getConstantFP(1.0, DL, VT),
+ DAG.getConstantFP(0.0, DL, VT));
}
if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
@@ -13121,19 +13629,12 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) {
return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0);
}
- // The next optimizations are desirable only if SELECT_CC can be lowered.
- if (TLI.isOperationLegalOrCustom(ISD::SELECT_CC, VT) || !LegalOperations) {
- // fold (uint_to_fp (setcc x, y, cc)) -> (select_cc x, y, -1.0, 0.0,, cc)
- if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
- (!LegalOperations ||
- TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
- SDLoc DL(N);
- SDValue Ops[] =
- { N0.getOperand(0), N0.getOperand(1),
- DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT),
- N0.getOperand(2) };
- return DAG.getNode(ISD::SELECT_CC, DL, VT, Ops);
- }
+ // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0)
+ if (N0.getOpcode() == ISD::SETCC && !VT.isVector() &&
+ (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) {
+ SDLoc DL(N);
+ return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT),
+ DAG.getConstantFP(0.0, DL, VT));
}
if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI))
@@ -13378,12 +13879,14 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) {
if (isConstantFPBuildVectorOrConstantFP(N0))
return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0);
- if (TLI.isNegatibleForFree(N0, DAG, LegalOperations, ForCodeSize))
- return TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize);
+ if (SDValue NegN0 =
+ TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize))
+ return NegN0;
- // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0 FIXME: This is
- // duplicated in isNegatibleForFree, but isNegatibleForFree doesn't know it
- // was called from a context with a nsz flag if the input fsub does not.
+ // -(X-Y) -> (Y-X) is unsafe because when X==Y, -0.0 != +0.0
+ // FIXME: This is duplicated in getNegatibleCost, but getNegatibleCost doesn't
+ // know it was called from a context with a nsz flag if the input fsub does
+ // not.
if (N0.getOpcode() == ISD::FSUB &&
(DAG.getTarget().Options.NoSignedZerosFPMath ||
N->getFlags().hasNoSignedZeros()) && N0.hasOneUse()) {
@@ -13539,8 +14042,12 @@ SDValue DAGCombiner::visitBRCOND(SDNode *N) {
}
if (N1.hasOneUse()) {
+ // rebuildSetCC calls visitXor which may change the Chain when there is a
+ // STRICT_FSETCC/STRICT_FSETCCS involved. Use a handle to track changes.
+ HandleSDNode ChainHandle(Chain);
if (SDValue NewN1 = rebuildSetCC(N1))
- return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other, Chain, NewN1, N2);
+ return DAG.getNode(ISD::BRCOND, SDLoc(N), MVT::Other,
+ ChainHandle.getValue(), NewN1, N2);
}
return SDValue();
@@ -13592,8 +14099,8 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {
}
}
- // Transform br(xor(x, y)) -> br(x != y)
- // Transform br(xor(xor(x,y), 1)) -> br (x == y)
+ // Transform (brcond (xor x, y)) -> (brcond (setcc x, y, ne))
+ // Transform (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
if (N.getOpcode() == ISD::XOR) {
// Because we may call this on a speculatively constructed
// SimplifiedSetCC Node, we need to simplify this node first.
@@ -13617,16 +14124,17 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {
if (N.getOpcode() != ISD::XOR)
return N;
- SDNode *TheXor = N.getNode();
-
- SDValue Op0 = TheXor->getOperand(0);
- SDValue Op1 = TheXor->getOperand(1);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
if (Op0.getOpcode() != ISD::SETCC && Op1.getOpcode() != ISD::SETCC) {
bool Equal = false;
- if (isOneConstant(Op0) && Op0.hasOneUse() &&
- Op0.getOpcode() == ISD::XOR) {
- TheXor = Op0.getNode();
+ // (brcond (xor (xor x, y), -1)) -> (brcond (setcc x, y, eq))
+ if (isBitwiseNot(N) && Op0.hasOneUse() && Op0.getOpcode() == ISD::XOR &&
+ Op0.getValueType() == MVT::i1) {
+ N = Op0;
+ Op0 = N->getOperand(0);
+ Op1 = N->getOperand(1);
Equal = true;
}
@@ -13634,7 +14142,7 @@ SDValue DAGCombiner::rebuildSetCC(SDValue N) {
if (LegalTypes)
SetCCVT = getSetCCResultType(SetCCVT);
// Replace the uses of XOR with SETCC
- return DAG.getSetCC(SDLoc(TheXor), SetCCVT, Op0, Op1,
+ return DAG.getSetCC(SDLoc(N), SetCCVT, Op0, Op1,
Equal ? ISD::SETEQ : ISD::SETNE);
}
}
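The xor rewrites above rest on the usual i1 identities: a branch on (xor x, y) is a branch on x != y, and a branch on its bitwise-not is a branch on x == y. A minimal standalone check of those identities (illustrative code only, not part of the patch):

#include <cassert>

int main() {
  for (int X = 0; X <= 1; ++X) {
    for (int Y = 0; Y <= 1; ++Y) {
      bool XorCond = (X ^ Y) & 1;     // brcond (xor x, y)
      bool NotXorCond = ~(X ^ Y) & 1; // brcond (xor (xor x, y), -1)
      assert(XorCond == (X != Y));    // -> setcc x, y, ne
      assert(NotXorCond == (X == Y)); // -> setcc x, y, eq
    }
  }
  return 0;
}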
@@ -13994,118 +14502,142 @@ bool DAGCombiner::CombineToPreIndexedLoadStore(SDNode *N) {
return true;
}
-/// Try to combine a load/store with a add/sub of the base pointer node into a
-/// post-indexed load/store. The transformation folded the add/subtract into the
-/// new indexed load/store effectively and all of its uses are redirected to the
-/// new load/store.
-bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
- if (Level < AfterLegalizeDAG)
+static bool shouldCombineToPostInc(SDNode *N, SDValue Ptr, SDNode *PtrUse,
+ SDValue &BasePtr, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (PtrUse == N ||
+ (PtrUse->getOpcode() != ISD::ADD && PtrUse->getOpcode() != ISD::SUB))
return false;
- bool IsLoad = true;
- bool IsMasked = false;
- SDValue Ptr;
- if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad, IsMasked,
- Ptr, TLI))
+ if (!TLI.getPostIndexedAddressParts(N, PtrUse, BasePtr, Offset, AM, DAG))
return false;
- if (Ptr.getNode()->hasOneUse())
+ // Don't create an indexed load / store with zero offset.
+ if (isNullConstant(Offset))
return false;
- for (SDNode *Op : Ptr.getNode()->uses()) {
- if (Op == N ||
- (Op->getOpcode() != ISD::ADD && Op->getOpcode() != ISD::SUB))
- continue;
+ if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
+ return false;
- SDValue BasePtr;
- SDValue Offset;
- ISD::MemIndexedMode AM = ISD::UNINDEXED;
- if (TLI.getPostIndexedAddressParts(N, Op, BasePtr, Offset, AM, DAG)) {
- // Don't create a indexed load / store with zero offset.
- if (isNullConstant(Offset))
- continue;
+ SmallPtrSet<const SDNode *, 32> Visited;
+ for (SDNode *Use : BasePtr.getNode()->uses()) {
+ if (Use == Ptr.getNode())
+ continue;
- // Try turning it into a post-indexed load / store except when
- // 1) All uses are load / store ops that use it as base ptr (and
- // it may be folded as addressing mmode).
- // 2) Op must be independent of N, i.e. Op is neither a predecessor
- // nor a successor of N. Otherwise, if Op is folded that would
- // create a cycle.
+ // Don't do the combine if there's a later user which could perform the index instead.
+ if (isa<MemSDNode>(Use)) {
+ bool IsLoad = true;
+ bool IsMasked = false;
+ SDValue OtherPtr;
+ if (getCombineLoadStoreParts(Use, ISD::POST_INC, ISD::POST_DEC, IsLoad,
+ IsMasked, OtherPtr, TLI)) {
+ SmallVector<const SDNode *, 2> Worklist;
+ Worklist.push_back(Use);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist))
+ return false;
+ }
+ }
- if (isa<FrameIndexSDNode>(BasePtr) || isa<RegisterSDNode>(BasePtr))
- continue;
+ // If all the uses are load / store addresses, then don't do the
+ // transformation.
+ if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
+ for (SDNode *UseUse : Use->uses())
+ if (canFoldInAddressingMode(Use, UseUse, DAG, TLI))
+ return false;
+ }
+ }
+ return true;
+}
- // Check for #1.
- bool TryNext = false;
- for (SDNode *Use : BasePtr.getNode()->uses()) {
- if (Use == Ptr.getNode())
- continue;
+static SDNode *getPostIndexedLoadStoreOp(SDNode *N, bool &IsLoad,
+ bool &IsMasked, SDValue &Ptr,
+ SDValue &BasePtr, SDValue &Offset,
+ ISD::MemIndexedMode &AM,
+ SelectionDAG &DAG,
+ const TargetLowering &TLI) {
+ if (!getCombineLoadStoreParts(N, ISD::POST_INC, ISD::POST_DEC, IsLoad,
+ IsMasked, Ptr, TLI) ||
+ Ptr.getNode()->hasOneUse())
+ return nullptr;
+
+ // Try turning it into a post-indexed load / store except when
+ // 1) All uses are load / store ops that use it as base ptr (and
+ // it may be folded as addressing mode).
+ // 2) Op must be independent of N, i.e. Op is neither a predecessor
+ // nor a successor of N. Otherwise, if Op is folded that would
+ // create a cycle.
+ for (SDNode *Op : Ptr->uses()) {
+ // Check for #1.
+ if (!shouldCombineToPostInc(N, Ptr, Op, BasePtr, Offset, AM, DAG, TLI))
+ continue;
- // If all the uses are load / store addresses, then don't do the
- // transformation.
- if (Use->getOpcode() == ISD::ADD || Use->getOpcode() == ISD::SUB) {
- bool RealUse = false;
- for (SDNode *UseUse : Use->uses()) {
- if (!canFoldInAddressingMode(Use, UseUse, DAG, TLI))
- RealUse = true;
- }
+ // Check for #2.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 8> Worklist;
+ // Ptr is predecessor to both N and Op.
+ Visited.insert(Ptr.getNode());
+ Worklist.push_back(N);
+ Worklist.push_back(Op);
+ if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
+ !SDNode::hasPredecessorHelper(Op, Visited, Worklist))
+ return Op;
+ }
+ return nullptr;
+}
- if (!RealUse) {
- TryNext = true;
- break;
- }
- }
- }
+/// Try to combine a load/store with an add/sub of the base pointer node into a
+/// post-indexed load/store. The transformation folds the add/subtract into the
+/// new indexed load/store, and all uses of the add/subtract are redirected to
+/// the new load/store.
+bool DAGCombiner::CombineToPostIndexedLoadStore(SDNode *N) {
+ if (Level < AfterLegalizeDAG)
+ return false;
- if (TryNext)
- continue;
+ bool IsLoad = true;
+ bool IsMasked = false;
+ SDValue Ptr;
+ SDValue BasePtr;
+ SDValue Offset;
+ ISD::MemIndexedMode AM = ISD::UNINDEXED;
+ SDNode *Op = getPostIndexedLoadStoreOp(N, IsLoad, IsMasked, Ptr, BasePtr,
+ Offset, AM, DAG, TLI);
+ if (!Op)
+ return false;
- // Check for #2.
- SmallPtrSet<const SDNode *, 32> Visited;
- SmallVector<const SDNode *, 8> Worklist;
- // Ptr is predecessor to both N and Op.
- Visited.insert(Ptr.getNode());
- Worklist.push_back(N);
- Worklist.push_back(Op);
- if (!SDNode::hasPredecessorHelper(N, Visited, Worklist) &&
- !SDNode::hasPredecessorHelper(Op, Visited, Worklist)) {
- SDValue Result;
- if (!IsMasked)
- Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
- Offset, AM)
- : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
+ SDValue Result;
+ if (!IsMasked)
+ Result = IsLoad ? DAG.getIndexedLoad(SDValue(N, 0), SDLoc(N), BasePtr,
+ Offset, AM)
+ : DAG.getIndexedStore(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM);
+ else
+ Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
+ BasePtr, Offset, AM)
+ : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
BasePtr, Offset, AM);
- else
- Result = IsLoad ? DAG.getIndexedMaskedLoad(SDValue(N, 0), SDLoc(N),
- BasePtr, Offset, AM)
- : DAG.getIndexedMaskedStore(SDValue(N, 0), SDLoc(N),
- BasePtr, Offset, AM);
- ++PostIndexedNodes;
- ++NodesCombined;
- LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
- dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
- dbgs() << '\n');
- WorklistRemover DeadNodes(*this);
- if (IsLoad) {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
- } else {
- DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
- }
-
- // Finally, since the node is now dead, remove it from the graph.
- deleteAndRecombine(N);
-
- // Replace the uses of Use with uses of the updated base value.
- DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
- Result.getValue(IsLoad ? 1 : 0));
- deleteAndRecombine(Op);
- return true;
- }
- }
+ ++PostIndexedNodes;
+ ++NodesCombined;
+ LLVM_DEBUG(dbgs() << "\nReplacing.5 "; N->dump(&DAG);
+ dbgs() << "\nWith: "; Result.getNode()->dump(&DAG);
+ dbgs() << '\n');
+ WorklistRemover DeadNodes(*this);
+ if (IsLoad) {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(0));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 1), Result.getValue(2));
+ } else {
+ DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Result.getValue(1));
}
- return false;
+ // Finally, since the node is now dead, remove it from the graph.
+ deleteAndRecombine(N);
+
+ // Replace the uses of Op with uses of the updated base value.
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Op, 0),
+ Result.getValue(IsLoad ? 1 : 0));
+ deleteAndRecombine(Op);
+ return true;
}
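For context on what the refactored code above forms: a post-indexed load/store accesses memory at the un-adjusted base address and advances the base pointer by the offset as a side effect, which is why the following add/sub of the pointer can be folded away. A standalone sketch of that semantics (postIncLoad is an illustrative helper, not an LLVM API):

#include <cassert>
#include <cstdint>

// Post-indexed load: read from the current base, then write back Base+Offset.
static uint32_t postIncLoad(const uint32_t *&Base, int64_t Offset) {
  uint32_t Val = *Base; // value comes from the un-adjusted address
  Base += Offset;       // writeback replaces the separate add/sub node
  return Val;
}

int main() {
  uint32_t Buf[4] = {10, 20, 30, 40};

  // Plain form: a load followed by a separate pointer add (the pattern
  // CombineToPostIndexedLoadStore looks for).
  const uint32_t *P = Buf;
  uint32_t A = *P;
  const uint32_t *PNext = P + 1;

  // Post-indexed form: one operation producing both results.
  const uint32_t *Q = Buf;
  uint32_t B = postIncLoad(Q, 1);

  assert(A == B && PNext == Q);
  return 0;
}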
/// Return the base-pointer arithmetic from an indexed \p LD.
@@ -14222,11 +14754,11 @@ SDValue DAGCombiner::ForwardStoreValueToDirectLoad(LoadSDNode *LD) {
auto ReplaceLd = [&](LoadSDNode *LD, SDValue Val, SDValue Chain) -> SDValue {
if (LD->isIndexed()) {
- bool IsSub = (LD->getAddressingMode() == ISD::PRE_DEC ||
- LD->getAddressingMode() == ISD::POST_DEC);
- unsigned Opc = IsSub ? ISD::SUB : ISD::ADD;
- SDValue Idx = DAG.getNode(Opc, SDLoc(LD), LD->getOperand(1).getValueType(),
- LD->getOperand(1), LD->getOperand(2));
+ // Cannot handle opaque target constants and we must respect the user's
+ // request not to split indexes from loads.
+ if (!canSplitIdx(LD))
+ return SDValue();
+ SDValue Idx = SplitIndexingFromLoad(LD);
SDValue Ops[] = {Val, Idx, Chain};
return CombineTo(LD, Ops, 3);
}
@@ -14322,14 +14854,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
// the indexing into an add/sub directly (that TargetConstant may not be
// valid for a different type of node, and we cannot convert an opaque
// target constant into a regular constant).
- bool HasOTCInc = LD->getOperand(2).getOpcode() == ISD::TargetConstant &&
- cast<ConstantSDNode>(LD->getOperand(2))->isOpaque();
+ bool CanSplitIdx = canSplitIdx(LD);
- if (!N->hasAnyUseOfValue(0) &&
- ((MaySplitLoadIndex && !HasOTCInc) || !N->hasAnyUseOfValue(1))) {
+ if (!N->hasAnyUseOfValue(0) && (CanSplitIdx || !N->hasAnyUseOfValue(1))) {
SDValue Undef = DAG.getUNDEF(N->getValueType(0));
SDValue Index;
- if (N->hasAnyUseOfValue(1) && MaySplitLoadIndex && !HasOTCInc) {
+ if (N->hasAnyUseOfValue(1) && CanSplitIdx) {
Index = SplitIndexingFromLoad(LD);
// Try to fold the base pointer arithmetic into subsequent loads and
// stores.
@@ -14356,11 +14886,12 @@ SDValue DAGCombiner::visitLOAD(SDNode *N) {
// Try to infer better alignment information than the load already has.
if (OptLevel != CodeGenOpt::None && LD->isUnindexed() && !LD->isAtomic()) {
- if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
- if (Align > LD->getAlignment() && LD->getSrcValueOffset() % Align == 0) {
+ if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
+ if (*Alignment > LD->getAlign() &&
+ isAligned(*Alignment, LD->getSrcValueOffset())) {
SDValue NewLoad = DAG.getExtLoad(
LD->getExtensionType(), SDLoc(N), LD->getValueType(0), Chain, Ptr,
- LD->getPointerInfo(), LD->getMemoryVT(), Align,
+ LD->getPointerInfo(), LD->getMemoryVT(), *Alignment,
LD->getMemOperand()->getFlags(), LD->getAAInfo());
// NewLoad will always be N as we are only refining the alignment
assert(NewLoad.getNode() == N);
@@ -14557,11 +15088,11 @@ struct LoadedSlice {
}
/// Get the alignment of the load used for this slice.
- unsigned getAlignment() const {
- unsigned Alignment = Origin->getAlignment();
+ Align getAlign() const {
+ Align Alignment = Origin->getAlign();
uint64_t Offset = getOffsetFromBase();
if (Offset != 0)
- Alignment = MinAlign(Alignment, Alignment + Offset);
+ Alignment = commonAlignment(Alignment, Alignment.value() + Offset);
return Alignment;
}
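The slice alignment above is the largest power of two dividing both the original load's alignment and the slice's offset; for power-of-two alignments that is what commonAlignment(Alignment, Alignment + Offset) evaluates to. A standalone sketch of the calculation (minAlignOfSlice is an illustrative helper, not LLVM's MinAlign/commonAlignment):

#include <cassert>
#include <cstdint>

static uint64_t minAlignOfSlice(uint64_t Alignment, uint64_t Offset) {
  if (Offset == 0)
    return Alignment;
  uint64_t Combined = Alignment | Offset;
  return Combined & (~Combined + 1); // lowest set bit = common power of two
}

int main() {
  assert(minAlignOfSlice(16, 0) == 16); // slice at the start keeps full alignment
  assert(minAlignOfSlice(16, 4) == 4);  // 4 bytes into a 16-byte aligned load
  assert(minAlignOfSlice(16, 6) == 2);  // offsets with low bits set lose more
  assert(minAlignOfSlice(8, 32) == 8);  // multiples of the alignment keep it
  return 0;
}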
@@ -14657,8 +15188,8 @@ struct LoadedSlice {
// Create the load for the slice.
SDValue LastInst =
DAG->getLoad(SliceType, SDLoc(Origin), Origin->getChain(), BaseAddr,
- Origin->getPointerInfo().getWithOffset(Offset),
- getAlignment(), Origin->getMemOperand()->getFlags());
+ Origin->getPointerInfo().getWithOffset(Offset), getAlign(),
+ Origin->getMemOperand()->getFlags());
// If the final type is not the same as the loaded type, this means that
// we have to pad with zero. Create a zero extend for that.
EVT FinalType = Inst->getValueType(0);
@@ -14699,10 +15230,10 @@ struct LoadedSlice {
// Check if it will be merged with the load.
// 1. Check the alignment constraint.
- unsigned RequiredAlignment = DAG->getDataLayout().getABITypeAlignment(
+ Align RequiredAlignment = DAG->getDataLayout().getABITypeAlign(
ResVT.getTypeForEVT(*DAG->getContext()));
- if (RequiredAlignment > getAlignment())
+ if (RequiredAlignment > getAlign())
return false;
// 2. Check that the load is a legal operation for that type.
@@ -14788,14 +15319,14 @@ static void adjustCostForPairing(SmallVectorImpl<LoadedSlice> &LoadedSlices,
continue;
// Check if the target supplies paired loads for this type.
- unsigned RequiredAlignment = 0;
+ Align RequiredAlignment;
if (!TLI.hasPairedLoad(LoadedType, RequiredAlignment)) {
// move to the next pair, this type is hopeless.
Second = nullptr;
continue;
}
// Check if we meet the alignment requirement.
- if (RequiredAlignment > First->getAlignment())
+ if (First->getAlign() < RequiredAlignment)
continue;
// Check that both loads are next to each other in memory.
@@ -14868,6 +15399,12 @@ bool DAGCombiner::SliceUpLoad(SDNode *N) {
!LD->getValueType(0).isInteger())
return false;
+ // The algorithm to split up a load of a scalable vector into individual
+ // elements currently requires knowing the length of the loaded type,
+ // so will need adjusting to work on scalable vectors.
+ if (LD->getValueType(0).isScalableVector())
+ return false;
+
// Keep track of already used bits to detect overlapping values.
// In that case, we will just abort the transformation.
APInt UsedBits(LD->getValueSizeInBits(0), 0);
@@ -15112,7 +15649,7 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
// Y is known to provide just those bytes. If so, we try to replace the
// load + replace + store sequence with a single (narrower) store, which makes
// the load dead.
- if (Opc == ISD::OR) {
+ if (Opc == ISD::OR && EnableShrinkLoadReplaceStoreWithStore) {
std::pair<unsigned, unsigned> MaskedLoad;
MaskedLoad = CheckForMaskedLoad(Value.getOperand(0), Ptr, Chain);
if (MaskedLoad.first)
@@ -15128,6 +15665,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
return NewST;
}
+ if (!EnableReduceLoadOpStoreWidth)
+ return SDValue();
+
if ((Opc != ISD::OR && Opc != ISD::XOR && Opc != ISD::AND) ||
Value.getOperand(1).getOpcode() != ISD::Constant)
return SDValue();
@@ -15181,9 +15721,9 @@ SDValue DAGCombiner::ReduceLoadOpStoreWidth(SDNode *N) {
if (DAG.getDataLayout().isBigEndian())
PtrOff = (BitWidth + 7 - NewBW) / 8 - PtrOff;
- unsigned NewAlign = MinAlign(LD->getAlignment(), PtrOff);
+ Align NewAlign = commonAlignment(LD->getAlign(), PtrOff);
Type *NewVTTy = NewVT.getTypeForEVT(*DAG.getContext());
- if (NewAlign < DAG.getDataLayout().getABITypeAlignment(NewVTTy))
+ if (NewAlign < DAG.getDataLayout().getABITypeAlign(NewVTTy))
return SDValue();
SDValue NewPtr = DAG.getMemBasePlusOffset(Ptr, PtrOff, SDLoc(LD));
@@ -15229,17 +15769,24 @@ SDValue DAGCombiner::TransformFPLoadStorePair(SDNode *N) {
ST->getPointerInfo().getAddrSpace() != 0)
return SDValue();
- EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ TypeSize VTSize = VT.getSizeInBits();
+
+ // We don't know the size of scalable types at compile time so we cannot
+ // create an integer of the equivalent size.
+ if (VTSize.isScalable())
+ return SDValue();
+
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VTSize.getFixedSize());
if (!TLI.isOperationLegal(ISD::LOAD, IntVT) ||
!TLI.isOperationLegal(ISD::STORE, IntVT) ||
!TLI.isDesirableToTransformToIntegerOp(ISD::LOAD, VT) ||
!TLI.isDesirableToTransformToIntegerOp(ISD::STORE, VT))
return SDValue();
- unsigned LDAlign = LD->getAlignment();
- unsigned STAlign = ST->getAlignment();
+ Align LDAlign = LD->getAlign();
+ Align STAlign = ST->getAlign();
Type *IntVTTy = IntVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlign = DAG.getDataLayout().getABITypeAlignment(IntVTTy);
+ Align ABIAlign = DAG.getDataLayout().getABITypeAlign(IntVTTy);
if (LDAlign < ABIAlign || STAlign < ABIAlign)
return SDValue();
@@ -15356,7 +15903,7 @@ SDValue DAGCombiner::getMergeStoreChains(SmallVectorImpl<MemOpLink> &StoreNodes,
return DAG.getTokenFactor(StoreDL, Chains);
}
-bool DAGCombiner::MergeStoresOfConstantsOrVecElts(
+bool DAGCombiner::mergeStoresOfConstantsOrVecElts(
SmallVectorImpl<MemOpLink> &StoreNodes, EVT MemVT, unsigned NumStores,
bool IsConstantSrc, bool UseVector, bool UseTrunc) {
// Make sure we have something to merge.
@@ -15530,14 +16077,12 @@ void DAGCombiner::getStoreMergeCandidates(
if (BasePtr.getBase().isUndef())
return;
- bool IsConstantSrc = isa<ConstantSDNode>(Val) || isa<ConstantFPSDNode>(Val);
- bool IsExtractVecSrc = (Val.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
- Val.getOpcode() == ISD::EXTRACT_SUBVECTOR);
- bool IsLoadSrc = isa<LoadSDNode>(Val);
+ StoreSource StoreSrc = getStoreSource(Val);
+ assert(StoreSrc != StoreSource::Unknown && "Expected known source for store");
BaseIndexOffset LBasePtr;
// Match on loadbaseptr if relevant.
EVT LoadVT;
- if (IsLoadSrc) {
+ if (StoreSrc == StoreSource::Load) {
auto *Ld = cast<LoadSDNode>(Val);
LBasePtr = BaseIndexOffset::match(Ld, DAG);
LoadVT = Ld->getMemoryVT();
@@ -15565,7 +16110,7 @@ void DAGCombiner::getStoreMergeCandidates(
// Allow merging constants of different types as integers.
bool NoTypeMatch = (MemVT.isInteger()) ? !MemVT.bitsEq(Other->getMemoryVT())
: Other->getMemoryVT() != MemVT;
- if (IsLoadSrc) {
+ if (StoreSrc == StoreSource::Load) {
if (NoTypeMatch)
return false;
// The Load's Base Ptr must also match
@@ -15589,13 +16134,13 @@ void DAGCombiner::getStoreMergeCandidates(
} else
return false;
}
- if (IsConstantSrc) {
+ if (StoreSrc == StoreSource::Constant) {
if (NoTypeMatch)
return false;
if (!(isa<ConstantSDNode>(OtherBC) || isa<ConstantFPSDNode>(OtherBC)))
return false;
}
- if (IsExtractVecSrc) {
+ if (StoreSrc == StoreSource::Extract) {
// Do not merge truncated stores here.
if (Other->isTruncatingStore())
return false;
@@ -15736,77 +16281,22 @@ bool DAGCombiner::checkMergeStoreCandidatesForDependencies(
return true;
}
-bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
- if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
- return false;
-
- EVT MemVT = St->getMemoryVT();
- int64_t ElementSizeBytes = MemVT.getStoreSize();
- unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
-
- if (MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
- return false;
-
- bool NoVectors = DAG.getMachineFunction().getFunction().hasFnAttribute(
- Attribute::NoImplicitFloat);
-
- // This function cannot currently deal with non-byte-sized memory sizes.
- if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
- return false;
-
- if (!MemVT.isSimple())
- return false;
-
- // Perform an early exit check. Do not bother looking at stored values that
- // are not constants, loads, or extracted vector elements.
- SDValue StoredVal = peekThroughBitcasts(St->getValue());
- bool IsLoadSrc = isa<LoadSDNode>(StoredVal);
- bool IsConstantSrc = isa<ConstantSDNode>(StoredVal) ||
- isa<ConstantFPSDNode>(StoredVal);
- bool IsExtractVecSrc = (StoredVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
- StoredVal.getOpcode() == ISD::EXTRACT_SUBVECTOR);
- bool IsNonTemporalStore = St->isNonTemporal();
- bool IsNonTemporalLoad =
- IsLoadSrc && cast<LoadSDNode>(StoredVal)->isNonTemporal();
-
- if (!IsConstantSrc && !IsLoadSrc && !IsExtractVecSrc)
- return false;
-
- SmallVector<MemOpLink, 8> StoreNodes;
- SDNode *RootNode;
- // Find potential store merge candidates by searching through chain sub-DAG
- getStoreMergeCandidates(St, StoreNodes, RootNode);
-
- // Check if there is anything to merge.
- if (StoreNodes.size() < 2)
- return false;
-
- // Sort the memory operands according to their distance from the
- // base pointer.
- llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
- return LHS.OffsetFromBase < RHS.OffsetFromBase;
- });
-
- // Store Merge attempts to merge the lowest stores. This generally
- // works out as if successful, as the remaining stores are checked
- // after the first collection of stores is merged. However, in the
- // case that a non-mergeable store is found first, e.g., {p[-2],
- // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
- // mergeable cases. To prevent this, we prune such stores from the
- // front of StoreNodes here.
-
- bool RV = false;
- while (StoreNodes.size() > 1) {
+unsigned
+DAGCombiner::getConsecutiveStores(SmallVectorImpl<MemOpLink> &StoreNodes,
+ int64_t ElementSizeBytes) const {
+ while (true) {
+ // Find the first store that is followed by another store exactly
+ // ElementSizeBytes later.
size_t StartIdx = 0;
while ((StartIdx + 1 < StoreNodes.size()) &&
StoreNodes[StartIdx].OffsetFromBase + ElementSizeBytes !=
- StoreNodes[StartIdx + 1].OffsetFromBase)
+ StoreNodes[StartIdx + 1].OffsetFromBase)
++StartIdx;
// Bail if we don't have enough candidates to merge.
if (StartIdx + 1 >= StoreNodes.size())
- return RV;
+ return 0;
+ // Trim stores that overlapped with the first store.
if (StartIdx)
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + StartIdx);
@@ -15822,302 +16312,345 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
break;
NumConsecutiveStores = i + 1;
}
+ if (NumConsecutiveStores > 1)
+ return NumConsecutiveStores;
- if (NumConsecutiveStores < 2) {
- StoreNodes.erase(StoreNodes.begin(),
- StoreNodes.begin() + NumConsecutiveStores);
- continue;
- }
-
- // The node with the lowest store address.
- LLVMContext &Context = *DAG.getContext();
- const DataLayout &DL = DAG.getDataLayout();
-
- // Store the constants into memory as one consecutive store.
- if (IsConstantSrc) {
- while (NumConsecutiveStores >= 2) {
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- unsigned LastLegalType = 1;
- unsigned LastLegalVectorType = 1;
- bool LastIntegerTrunc = false;
- bool NonZero = false;
- unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
- for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
- StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue StoredVal = ST->getValue();
- bool IsElementZero = false;
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
- IsElementZero = C->isNullValue();
- else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
- IsElementZero = C->getConstantFPValue()->isNullValue();
- if (IsElementZero) {
- if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
- FirstZeroAfterNonZero = i;
- }
- NonZero |= !IsElementZero;
-
- // Find a legal type for the constant store.
- unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
- EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- bool IsFast = false;
+ // There are no consecutive stores at the start of the list.
+ // Remove the first store and try again.
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 1);
+ }
+}
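getConsecutiveStores above looks for a leading run of candidates spaced exactly ElementSizeBytes apart in the offset-sorted list. A standalone sketch of that scan over bare offsets (countConsecutive and the offset vector are illustrative stand-ins for the MemOpLink bookkeeping):

#include <cassert>
#include <cstdint>
#include <vector>

static unsigned countConsecutive(const std::vector<int64_t> &Offsets,
                                 int64_t ElementSizeBytes) {
  if (Offsets.empty())
    return 0;
  unsigned Num = 1;
  for (size_t I = 1, E = Offsets.size(); I != E; ++I) {
    // Stop at the first gap or overlap relative to the run's start.
    if (Offsets[I] != Offsets[0] + (int64_t)I * ElementSizeBytes)
      break;
    Num = static_cast<unsigned>(I) + 1;
  }
  return Num;
}

int main() {
  // p[0], p[1], p[2], p[4] with 4-byte elements: only the first three merge.
  assert(countConsecutive({0, 4, 8, 16}, 4) == 3);
  // A gap right after the first store: no run starts here yet.
  assert(countConsecutive({0, 8, 12}, 4) == 1);
  return 0;
}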
- // Break early when size is too large to be legal.
- if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
- break;
+bool DAGCombiner::tryStoreMergeOfConstants(
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
+ EVT MemVT, SDNode *RootNode, bool AllowVectors) {
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ bool MadeChange = false;
+
+ // Store the constants into memory as one consecutive store.
+ while (NumConsecutiveStores >= 2) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned LastLegalType = 1;
+ unsigned LastLegalVectorType = 1;
+ bool LastIntegerTrunc = false;
+ bool NonZero = false;
+ unsigned FirstZeroAfterNonZero = NumConsecutiveStores;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *ST = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue StoredVal = ST->getValue();
+ bool IsElementZero = false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(StoredVal))
+ IsElementZero = C->isNullValue();
+ else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(StoredVal))
+ IsElementZero = C->getConstantFPValue()->isNullValue();
+ if (IsElementZero) {
+ if (NonZero && FirstZeroAfterNonZero == NumConsecutiveStores)
+ FirstZeroAfterNonZero = i;
+ }
+ NonZero |= !IsElementZero;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy,
- *FirstInChain->getMemOperand(), &IsFast) &&
- IsFast) {
- LastIntegerTrunc = false;
- LastLegalType = i + 1;
- // Or check whether a truncstore is legal.
- } else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValTy =
- TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
- if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy,
- *FirstInChain->getMemOperand(),
- &IsFast) &&
- IsFast) {
- LastIntegerTrunc = true;
- LastLegalType = i + 1;
- }
- }
+ // Find a legal type for the constant store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ EVT StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ bool IsFast = false;
- // We only use vectors if the constant is known to be zero or the
- // target allows it and the function is not marked with the
- // noimplicitfloat attribute.
- if ((!NonZero ||
- TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
- !NoVectors) {
- // Find a legal type for the vector store.
- unsigned Elts = (i + 1) * NumMemElts;
- EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
- if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
- TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
- TLI.allowsMemoryAccess(
- Context, DL, Ty, *FirstInChain->getMemOperand(), &IsFast) &&
- IsFast)
- LastLegalVectorType = i + 1;
- }
- }
+ // Break early when size is too large to be legal.
+ if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
- bool UseVector = (LastLegalVectorType > LastLegalType) && !NoVectors;
- unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
-
- // Check if we found a legal integer type that creates a meaningful
- // merge.
- if (NumElem < 2) {
- // We know that candidate stores are in order and of correct
- // shape. While there is no mergeable sequence from the
- // beginning one may start later in the sequence. The only
- // reason a merge of size N could have failed where another of
- // the same size would not have, is if the alignment has
- // improved or we've dropped a non-zero value. Drop as many
- // candidates as we can here.
- unsigned NumSkip = 1;
- while (
- (NumSkip < NumConsecutiveStores) &&
- (NumSkip < FirstZeroAfterNonZero) &&
- (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
- NumSkip++;
-
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
- NumConsecutiveStores -= NumSkip;
- continue;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast) {
+ LastIntegerTrunc = false;
+ LastLegalType = i + 1;
+ // Or check whether a truncstore is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValTy =
+ TLI.getTypeToTransformTo(Context, StoredVal.getValueType());
+ if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast) {
+ LastIntegerTrunc = true;
+ LastLegalType = i + 1;
}
+ }
- // Check that we can merge these candidates without causing a cycle.
- if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
- RootNode)) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
- NumConsecutiveStores -= NumElem;
- continue;
- }
+ // We only use vectors if the constant is known to be zero or the
+ // target allows it and the function is not marked with the
+ // noimplicitfloat attribute.
+ if ((!NonZero ||
+ TLI.storeOfVectorConstantIsCheap(MemVT, i + 1, FirstStoreAS)) &&
+ AllowVectors) {
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT Ty = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ if (TLI.isTypeLegal(Ty) && TLI.isTypeLegal(MemVT) &&
+ TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast)
+ LastLegalVectorType = i + 1;
+ }
+ }
- RV |= MergeStoresOfConstantsOrVecElts(StoreNodes, MemVT, NumElem, true,
- UseVector, LastIntegerTrunc);
+ bool UseVector = (LastLegalVectorType > LastLegalType) && AllowVectors;
+ unsigned NumElem = (UseVector) ? LastLegalVectorType : LastLegalType;
+
+ // Check if we found a legal integer type that creates a meaningful
+ // merge.
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment has
+ // improved or we've dropped a non-zero value. Drop as many
+ // candidates as we can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < NumConsecutiveStores) &&
+ (NumSkip < FirstZeroAfterNonZero) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
- // Remove merged stores for next iteration.
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
- NumConsecutiveStores -= NumElem;
- }
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
continue;
}
- // When extracting multiple vector elements, try to store them
- // in one vector store rather than a sequence of scalar stores.
- if (IsExtractVecSrc) {
- // Loop on Consecutive Stores on success.
- while (NumConsecutiveStores >= 2) {
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- unsigned NumStoresToMerge = 1;
- for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
- // Find a legal type for the vector store.
- unsigned Elts = (i + 1) * NumMemElts;
- EVT Ty =
- EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
- bool IsFast;
-
- // Break early when size is too large to be legal.
- if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
- break;
+ MadeChange |= mergeStoresOfConstantsOrVecElts(
+ StoreNodes, MemVT, NumElem, true, UseVector, LastIntegerTrunc);
- if (TLI.isTypeLegal(Ty) &&
- TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, Ty,
- *FirstInChain->getMemOperand(), &IsFast) &&
- IsFast)
- NumStoresToMerge = i + 1;
- }
+ // Remove merged stores for next iteration.
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ }
+ return MadeChange;
+}
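tryStoreMergeOfConstants above replaces a run of adjacent constant stores with a single store of a wider constant whenever a legal type allows it. A standalone sketch of the effect on memory (four byte stores folded into one 32-bit store; packing is done in memory order here, while the combine packs for the target's endianness and a legal wide type):

#include <cassert>
#include <cstdint>
#include <cstring>

static void storeBytes(uint8_t *P) {
  // Four adjacent single-byte stores of known constants.
  P[0] = 0x11; P[1] = 0x22; P[2] = 0x33; P[3] = 0x44;
}

static void storeMerged(uint8_t *P) {
  // Pack the same constants into one 32-bit value in memory order, then do a
  // single wide store.
  const uint8_t Bytes[4] = {0x11, 0x22, 0x33, 0x44};
  uint32_t Packed;
  std::memcpy(&Packed, Bytes, sizeof(Packed));
  std::memcpy(P, &Packed, sizeof(Packed));
}

int main() {
  uint8_t A[4], B[4];
  storeBytes(A);
  storeMerged(B);
  assert(std::memcmp(A, B, sizeof(A)) == 0); // identical bytes in memory
  return 0;
}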
- // Check if we found a legal integer type creating a meaningful
- // merge.
- if (NumStoresToMerge < 2) {
- // We know that candidate stores are in order and of correct
- // shape. While there is no mergeable sequence from the
- // beginning one may start later in the sequence. The only
- // reason a merge of size N could have failed where another of
- // the same size would not have, is if the alignment has
- // improved. Drop as many candidates as we can here.
- unsigned NumSkip = 1;
- while (
- (NumSkip < NumConsecutiveStores) &&
- (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
- NumSkip++;
-
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
- NumConsecutiveStores -= NumSkip;
- continue;
- }
+bool DAGCombiner::tryStoreMergeOfExtracts(
+ SmallVectorImpl<MemOpLink> &StoreNodes, unsigned NumConsecutiveStores,
+ EVT MemVT, SDNode *RootNode) {
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ bool MadeChange = false;
+
+ // Loop on Consecutive Stores on success.
+ while (NumConsecutiveStores >= 2) {
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ unsigned NumStoresToMerge = 1;
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT Ty = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(), Elts);
+ bool IsFast = false;
- // Check that we can merge these candidates without causing a cycle.
- if (!checkMergeStoreCandidatesForDependencies(
- StoreNodes, NumStoresToMerge, RootNode)) {
- StoreNodes.erase(StoreNodes.begin(),
- StoreNodes.begin() + NumStoresToMerge);
- NumConsecutiveStores -= NumStoresToMerge;
- continue;
- }
+ // Break early when size is too large to be legal.
+ if (Ty.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
- RV |= MergeStoresOfConstantsOrVecElts(
- StoreNodes, MemVT, NumStoresToMerge, false, true, false);
+ if (TLI.isTypeLegal(Ty) && TLI.canMergeStoresTo(FirstStoreAS, Ty, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, Ty,
+ *FirstInChain->getMemOperand(), &IsFast) &&
+ IsFast)
+ NumStoresToMerge = i + 1;
+ }
+
+ // Check if we found a legal vector type creating a meaningful
+ // merge.
+ if (NumStoresToMerge < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have, is if the alignment has
+ // improved. Drop as many candidates as we can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < NumConsecutiveStores) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
- StoreNodes.erase(StoreNodes.begin(),
- StoreNodes.begin() + NumStoresToMerge);
- NumConsecutiveStores -= NumStoresToMerge;
- }
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumStoresToMerge,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(),
+ StoreNodes.begin() + NumStoresToMerge);
+ NumConsecutiveStores -= NumStoresToMerge;
continue;
}
- // Below we handle the case of multiple consecutive stores that
- // come from multiple consecutive loads. We merge them into a single
- // wide load and a single wide store.
+ MadeChange |= mergeStoresOfConstantsOrVecElts(
+ StoreNodes, MemVT, NumStoresToMerge, false, true, false);
+
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumStoresToMerge);
+ NumConsecutiveStores -= NumStoresToMerge;
+ }
+ return MadeChange;
+}
+
+bool DAGCombiner::tryStoreMergeOfLoads(SmallVectorImpl<MemOpLink> &StoreNodes,
+ unsigned NumConsecutiveStores, EVT MemVT,
+ SDNode *RootNode, bool AllowVectors,
+ bool IsNonTemporalStore,
+ bool IsNonTemporalLoad) {
+ LLVMContext &Context = *DAG.getContext();
+ const DataLayout &DL = DAG.getDataLayout();
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ unsigned NumMemElts = MemVT.isVector() ? MemVT.getVectorNumElements() : 1;
+ bool MadeChange = false;
- // Look for load nodes which are used by the stored values.
- SmallVector<MemOpLink, 8> LoadNodes;
+ int64_t StartAddress = StoreNodes[0].OffsetFromBase;
- // Find acceptable loads. Loads need to have the same chain (token factor),
- // must not be zext, volatile, indexed, and they must be consecutive.
- BaseIndexOffset LdBasePtr;
+ // Look for load nodes which are used by the stored values.
+ SmallVector<MemOpLink, 8> LoadNodes;
- for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
- StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
- SDValue Val = peekThroughBitcasts(St->getValue());
- LoadSDNode *Ld = cast<LoadSDNode>(Val);
-
- BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
- // If this is not the first ptr that we check.
- int64_t LdOffset = 0;
- if (LdBasePtr.getBase().getNode()) {
- // The base ptr must be the same.
- if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
- break;
- } else {
- // Check that all other base pointers are the same as this one.
- LdBasePtr = LdPtr;
- }
+ // Find acceptable loads. Loads need to have the same chain (token factor),
+ // must not be zext, volatile, indexed, and they must be consecutive.
+ BaseIndexOffset LdBasePtr;
+
+ for (unsigned i = 0; i < NumConsecutiveStores; ++i) {
+ StoreSDNode *St = cast<StoreSDNode>(StoreNodes[i].MemNode);
+ SDValue Val = peekThroughBitcasts(St->getValue());
+ LoadSDNode *Ld = cast<LoadSDNode>(Val);
- // We found a potential memory operand to merge.
- LoadNodes.push_back(MemOpLink(Ld, LdOffset));
+ BaseIndexOffset LdPtr = BaseIndexOffset::match(Ld, DAG);
+ // If this is not the first ptr that we check.
+ int64_t LdOffset = 0;
+ if (LdBasePtr.getBase().getNode()) {
+ // The base ptr must be the same.
+ if (!LdBasePtr.equalBaseIndex(LdPtr, DAG, LdOffset))
+ break;
+ } else {
+ // Check that all other base pointers are the same as this one.
+ LdBasePtr = LdPtr;
}
- while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
+ // We found a potential memory operand to merge.
+ LoadNodes.push_back(MemOpLink(Ld, LdOffset));
+ }
+
+ while (NumConsecutiveStores >= 2 && LoadNodes.size() >= 2) {
+ Align RequiredAlignment;
+ bool NeedRotate = false;
+ if (LoadNodes.size() == 2) {
// If we have load/store pair instructions and we only have two values,
// don't bother merging.
- unsigned RequiredAlignment;
- if (LoadNodes.size() == 2 &&
- TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
- StoreNodes[0].MemNode->getAlignment() >= RequiredAlignment) {
+ if (TLI.hasPairedLoad(MemVT, RequiredAlignment) &&
+ StoreNodes[0].MemNode->getAlign() >= RequiredAlignment) {
StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + 2);
LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + 2);
break;
}
- LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
- unsigned FirstStoreAS = FirstInChain->getAddressSpace();
- unsigned FirstStoreAlign = FirstInChain->getAlignment();
- LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
- unsigned FirstLoadAlign = FirstLoad->getAlignment();
-
- // Scan the memory operations on the chain and find the first
- // non-consecutive load memory address. These variables hold the index in
- // the store node array.
-
- unsigned LastConsecutiveLoad = 1;
-
- // This variable refers to the size and not index in the array.
- unsigned LastLegalVectorType = 1;
- unsigned LastLegalIntegerType = 1;
- bool isDereferenceable = true;
- bool DoIntegerTruncate = false;
- StartAddress = LoadNodes[0].OffsetFromBase;
- SDValue FirstChain = FirstLoad->getChain();
- for (unsigned i = 1; i < LoadNodes.size(); ++i) {
- // All loads must share the same chain.
- if (LoadNodes[i].MemNode->getChain() != FirstChain)
- break;
+ // If the loads are reversed, see if we can rotate the halves into place.
+ int64_t Offset0 = LoadNodes[0].OffsetFromBase;
+ int64_t Offset1 = LoadNodes[1].OffsetFromBase;
+ EVT PairVT = EVT::getIntegerVT(Context, ElementSizeBytes * 8 * 2);
+ if (Offset0 - Offset1 == ElementSizeBytes &&
+ (hasOperation(ISD::ROTL, PairVT) ||
+ hasOperation(ISD::ROTR, PairVT))) {
+ std::swap(LoadNodes[0], LoadNodes[1]);
+ NeedRotate = true;
+ }
+ }
+ LSBaseSDNode *FirstInChain = StoreNodes[0].MemNode;
+ unsigned FirstStoreAS = FirstInChain->getAddressSpace();
+ unsigned FirstStoreAlign = FirstInChain->getAlignment();
+ LoadSDNode *FirstLoad = cast<LoadSDNode>(LoadNodes[0].MemNode);
- int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
- if (CurrAddress - StartAddress != (ElementSizeBytes * i))
- break;
- LastConsecutiveLoad = i;
+ // Scan the memory operations on the chain and find the first
+ // non-consecutive load memory address. These variables hold the index in
+ // the store node array.
+
+ unsigned LastConsecutiveLoad = 1;
+
+ // This variable refers to the size and not index in the array.
+ unsigned LastLegalVectorType = 1;
+ unsigned LastLegalIntegerType = 1;
+ bool isDereferenceable = true;
+ bool DoIntegerTruncate = false;
+ StartAddress = LoadNodes[0].OffsetFromBase;
+ SDValue LoadChain = FirstLoad->getChain();
+ for (unsigned i = 1; i < LoadNodes.size(); ++i) {
+ // All loads must share the same chain.
+ if (LoadNodes[i].MemNode->getChain() != LoadChain)
+ break;
- if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
- isDereferenceable = false;
+ int64_t CurrAddress = LoadNodes[i].OffsetFromBase;
+ if (CurrAddress - StartAddress != (ElementSizeBytes * i))
+ break;
+ LastConsecutiveLoad = i;
- // Find a legal type for the vector store.
- unsigned Elts = (i + 1) * NumMemElts;
- EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ if (isDereferenceable && !LoadNodes[i].MemNode->isDereferenceable())
+ isDereferenceable = false;
- // Break early when size is too large to be legal.
- if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
- break;
+ // Find a legal type for the vector store.
+ unsigned Elts = (i + 1) * NumMemElts;
+ EVT StoreTy = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
- bool IsFastSt, IsFastLd;
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy,
- *FirstInChain->getMemOperand(), &IsFastSt) &&
- IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy,
- *FirstLoad->getMemOperand(), &IsFastLd) &&
- IsFastLd) {
- LastLegalVectorType = i + 1;
- }
+ // Break early when size is too large to be legal.
+ if (StoreTy.getSizeInBits() > MaximumLegalStoreInBits)
+ break;
- // Find a legal type for the integer store.
- unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
- StoreTy = EVT::getIntegerVT(Context, SizeInBits);
- if (TLI.isTypeLegal(StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ bool IsFastSt = false;
+ bool IsFastLd = false;
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstLoad->getMemOperand(), &IsFastLd) &&
+ IsFastLd) {
+ LastLegalVectorType = i + 1;
+ }
+
+ // Find a legal type for the integer store.
+ unsigned SizeInBits = (i + 1) * ElementSizeBytes * 8;
+ StoreTy = EVT::getIntegerVT(Context, SizeInBits);
+ if (TLI.isTypeLegal(StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, StoreTy, DAG) &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstInChain->getMemOperand(), &IsFastSt) &&
+ IsFastSt &&
+ TLI.allowsMemoryAccess(Context, DL, StoreTy,
+ *FirstLoad->getMemOperand(), &IsFastLd) &&
+ IsFastLd) {
+ LastLegalIntegerType = i + 1;
+ DoIntegerTruncate = false;
+ // Or check whether a truncstore and extload is legal.
+ } else if (TLI.getTypeAction(Context, StoreTy) ==
+ TargetLowering::TypePromoteInteger) {
+ EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
+ if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
+ TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
+ TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy, StoreTy) &&
+ TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
TLI.allowsMemoryAccess(Context, DL, StoreTy,
*FirstInChain->getMemOperand(), &IsFastSt) &&
IsFastSt &&
@@ -16125,149 +16658,225 @@ bool DAGCombiner::MergeConsecutiveStores(StoreSDNode *St) {
*FirstLoad->getMemOperand(), &IsFastLd) &&
IsFastLd) {
LastLegalIntegerType = i + 1;
- DoIntegerTruncate = false;
- // Or check whether a truncstore and extload is legal.
- } else if (TLI.getTypeAction(Context, StoreTy) ==
- TargetLowering::TypePromoteInteger) {
- EVT LegalizedStoredValTy = TLI.getTypeToTransformTo(Context, StoreTy);
- if (TLI.isTruncStoreLegal(LegalizedStoredValTy, StoreTy) &&
- TLI.canMergeStoresTo(FirstStoreAS, LegalizedStoredValTy, DAG) &&
- TLI.isLoadExtLegal(ISD::ZEXTLOAD, LegalizedStoredValTy,
- StoreTy) &&
- TLI.isLoadExtLegal(ISD::SEXTLOAD, LegalizedStoredValTy,
- StoreTy) &&
- TLI.isLoadExtLegal(ISD::EXTLOAD, LegalizedStoredValTy, StoreTy) &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy,
- *FirstInChain->getMemOperand(),
- &IsFastSt) &&
- IsFastSt &&
- TLI.allowsMemoryAccess(Context, DL, StoreTy,
- *FirstLoad->getMemOperand(), &IsFastLd) &&
- IsFastLd) {
- LastLegalIntegerType = i + 1;
- DoIntegerTruncate = true;
- }
+ DoIntegerTruncate = true;
}
}
+ }
- // Only use vector types if the vector type is larger than the integer
- // type. If they are the same, use integers.
- bool UseVectorTy =
- LastLegalVectorType > LastLegalIntegerType && !NoVectors;
- unsigned LastLegalType =
- std::max(LastLegalVectorType, LastLegalIntegerType);
-
- // We add +1 here because the LastXXX variables refer to location while
- // the NumElem refers to array/index size.
- unsigned NumElem =
- std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
- NumElem = std::min(LastLegalType, NumElem);
-
- if (NumElem < 2) {
- // We know that candidate stores are in order and of correct
- // shape. While there is no mergeable sequence from the
- // beginning one may start later in the sequence. The only
- // reason a merge of size N could have failed where another of
- // the same size would not have is if the alignment or either
- // the load or store has improved. Drop as many candidates as we
- // can here.
- unsigned NumSkip = 1;
- while ((NumSkip < LoadNodes.size()) &&
- (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
- (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
- NumSkip++;
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
- LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
- NumConsecutiveStores -= NumSkip;
- continue;
- }
+ // Only use vector types if the vector type is larger than the integer
+ // type. If they are the same, use integers.
+ bool UseVectorTy =
+ LastLegalVectorType > LastLegalIntegerType && AllowVectors;
+ unsigned LastLegalType =
+ std::max(LastLegalVectorType, LastLegalIntegerType);
+
+ // We add +1 here because the LastXXX variables refer to location while
+ // the NumElem refers to array/index size.
+ unsigned NumElem = std::min(NumConsecutiveStores, LastConsecutiveLoad + 1);
+ NumElem = std::min(LastLegalType, NumElem);
+ unsigned FirstLoadAlign = FirstLoad->getAlignment();
+
+ if (NumElem < 2) {
+ // We know that candidate stores are in order and of correct
+ // shape. While there is no mergeable sequence from the
+ // beginning one may start later in the sequence. The only
+ // reason a merge of size N could have failed where another of
+ // the same size would not have is if the alignment or either
+ // the load or store has improved. Drop as many candidates as we
+ // can here.
+ unsigned NumSkip = 1;
+ while ((NumSkip < LoadNodes.size()) &&
+ (LoadNodes[NumSkip].MemNode->getAlignment() <= FirstLoadAlign) &&
+ (StoreNodes[NumSkip].MemNode->getAlignment() <= FirstStoreAlign))
+ NumSkip++;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumSkip);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumSkip);
+ NumConsecutiveStores -= NumSkip;
+ continue;
+ }
- // Check that we can merge these candidates without causing a cycle.
- if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
- RootNode)) {
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
- LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
- NumConsecutiveStores -= NumElem;
- continue;
- }
+ // Check that we can merge these candidates without causing a cycle.
+ if (!checkMergeStoreCandidatesForDependencies(StoreNodes, NumElem,
+ RootNode)) {
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ continue;
+ }
- // Find if it is better to use vectors or integers to load and store
- // to memory.
- EVT JointMemOpVT;
- if (UseVectorTy) {
- // Find a legal type for the vector store.
- unsigned Elts = NumElem * NumMemElts;
- JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
- } else {
- unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
- JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ // Find if it is better to use vectors or integers to load and store
+ // to memory.
+ EVT JointMemOpVT;
+ if (UseVectorTy) {
+ // Find a legal type for the vector store.
+ unsigned Elts = NumElem * NumMemElts;
+ JointMemOpVT = EVT::getVectorVT(Context, MemVT.getScalarType(), Elts);
+ } else {
+ unsigned SizeInBits = NumElem * ElementSizeBytes * 8;
+ JointMemOpVT = EVT::getIntegerVT(Context, SizeInBits);
+ }
+
+ SDLoc LoadDL(LoadNodes[0].MemNode);
+ SDLoc StoreDL(StoreNodes[0].MemNode);
+
+ // The merged loads are required to have the same incoming chain, so
+ // using the first's chain is acceptable.
+
+ SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
+ AddToWorklist(NewStoreChain.getNode());
+
+ MachineMemOperand::Flags LdMMOFlags =
+ isDereferenceable ? MachineMemOperand::MODereferenceable
+ : MachineMemOperand::MONone;
+ if (IsNonTemporalLoad)
+ LdMMOFlags |= MachineMemOperand::MONonTemporal;
+
+ MachineMemOperand::Flags StMMOFlags = IsNonTemporalStore
+ ? MachineMemOperand::MONonTemporal
+ : MachineMemOperand::MONone;
+
+ SDValue NewLoad, NewStore;
+ if (UseVectorTy || !DoIntegerTruncate) {
+ NewLoad = DAG.getLoad(
+ JointMemOpVT, LoadDL, FirstLoad->getChain(), FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), FirstLoadAlign, LdMMOFlags);
+ SDValue StoreOp = NewLoad;
+ if (NeedRotate) {
+ unsigned LoadWidth = ElementSizeBytes * 8 * 2;
+ assert(JointMemOpVT == EVT::getIntegerVT(Context, LoadWidth) &&
+ "Unexpected type for rotate-able load pair");
+ SDValue RotAmt =
+ DAG.getShiftAmountConstant(LoadWidth / 2, JointMemOpVT, LoadDL);
+ // Target can convert to the identical ROTR if it does not have ROTL.
+ StoreOp = DAG.getNode(ISD::ROTL, LoadDL, JointMemOpVT, NewLoad, RotAmt);
}
+ NewStore = DAG.getStore(
+ NewStoreChain, StoreDL, StoreOp, FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
+ } else { // This must be the truncstore/extload case
+ EVT ExtendedTy =
+ TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
+ NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
+ FirstLoad->getChain(), FirstLoad->getBasePtr(),
+ FirstLoad->getPointerInfo(), JointMemOpVT,
+ FirstLoadAlign, LdMMOFlags);
+ NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
+ FirstInChain->getBasePtr(),
+ FirstInChain->getPointerInfo(), JointMemOpVT,
+ FirstInChain->getAlignment(),
+ FirstInChain->getMemOperand()->getFlags());
+ }
+
+ // Transfer chain users from old loads to the new load.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
+ SDValue(NewLoad.getNode(), 1));
+ }
+
+ // Replace all stores with the new store. Recursively remove corresponding
+ // values if they are no longer used.
+ for (unsigned i = 0; i < NumElem; ++i) {
+ SDValue Val = StoreNodes[i].MemNode->getOperand(1);
+ CombineTo(StoreNodes[i].MemNode, NewStore);
+ if (Val.getNode()->use_empty())
+ recursivelyDeleteUnusedNodes(Val.getNode());
+ }
+
+ MadeChange = true;
+ StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
+ LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
+ NumConsecutiveStores -= NumElem;
+ }
+ return MadeChange;
+}
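The NeedRotate path in tryStoreMergeOfLoads above covers loads whose halves are stored in swapped order: one wide load rotated by half its width produces the same bytes as the two swapped narrow stores. A standalone sketch of that equivalence (rotl64 is an illustrative helper; it assumes 0 < Amt < 64):

#include <cassert>
#include <cstdint>
#include <cstring>

static uint64_t rotl64(uint64_t V, unsigned Amt) {
  return (V << Amt) | (V >> (64 - Amt));
}

int main() {
  uint32_t Src[2] = {0x01020304u, 0x0A0B0C0Du};

  // Narrow form: the two loaded halves are stored in swapped order.
  uint32_t DstNarrow[2] = {Src[1], Src[0]};

  // Wide form: one 64-bit load, a rotate by half the width, one 64-bit store.
  uint64_t Wide;
  std::memcpy(&Wide, Src, sizeof(Wide));
  uint64_t Rotated = rotl64(Wide, 32);
  uint32_t DstWide[2];
  std::memcpy(DstWide, &Rotated, sizeof(Rotated));

  assert(std::memcmp(DstNarrow, DstWide, sizeof(DstNarrow)) == 0);
  return 0;
}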
+
+bool DAGCombiner::mergeConsecutiveStores(StoreSDNode *St) {
+ if (OptLevel == CodeGenOpt::None || !EnableStoreMerging)
+ return false;
- SDLoc LoadDL(LoadNodes[0].MemNode);
- SDLoc StoreDL(StoreNodes[0].MemNode);
-
- // The merged loads are required to have the same incoming chain, so
- // using the first's chain is acceptable.
-
- SDValue NewStoreChain = getMergeStoreChains(StoreNodes, NumElem);
- AddToWorklist(NewStoreChain.getNode());
-
- MachineMemOperand::Flags LdMMOFlags =
- isDereferenceable ? MachineMemOperand::MODereferenceable
- : MachineMemOperand::MONone;
- if (IsNonTemporalLoad)
- LdMMOFlags |= MachineMemOperand::MONonTemporal;
-
- MachineMemOperand::Flags StMMOFlags =
- IsNonTemporalStore ? MachineMemOperand::MONonTemporal
- : MachineMemOperand::MONone;
-
- SDValue NewLoad, NewStore;
- if (UseVectorTy || !DoIntegerTruncate) {
- NewLoad =
- DAG.getLoad(JointMemOpVT, LoadDL, FirstLoad->getChain(),
- FirstLoad->getBasePtr(), FirstLoad->getPointerInfo(),
- FirstLoadAlign, LdMMOFlags);
- NewStore = DAG.getStore(
- NewStoreChain, StoreDL, NewLoad, FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(), FirstStoreAlign, StMMOFlags);
- } else { // This must be the truncstore/extload case
- EVT ExtendedTy =
- TLI.getTypeToTransformTo(*DAG.getContext(), JointMemOpVT);
- NewLoad = DAG.getExtLoad(ISD::EXTLOAD, LoadDL, ExtendedTy,
- FirstLoad->getChain(), FirstLoad->getBasePtr(),
- FirstLoad->getPointerInfo(), JointMemOpVT,
- FirstLoadAlign, LdMMOFlags);
- NewStore = DAG.getTruncStore(NewStoreChain, StoreDL, NewLoad,
- FirstInChain->getBasePtr(),
- FirstInChain->getPointerInfo(),
- JointMemOpVT, FirstInChain->getAlignment(),
- FirstInChain->getMemOperand()->getFlags());
- }
+ // TODO: Extend this function to merge stores of scalable vectors.
+ // (i.e. two <vscale x 8 x i8> stores can be merged to one <vscale x 16 x i8>
+ // store since we know <vscale x 16 x i8> is exactly twice as large as
+ // <vscale x 8 x i8>). Until then, bail out for scalable vectors.
+ EVT MemVT = St->getMemoryVT();
+ if (MemVT.isScalableVector())
+ return false;
+ if (!MemVT.isSimple() || MemVT.getSizeInBits() * 2 > MaximumLegalStoreInBits)
+ return false;
- // Transfer chain users from old loads to the new load.
- for (unsigned i = 0; i < NumElem; ++i) {
- LoadSDNode *Ld = cast<LoadSDNode>(LoadNodes[i].MemNode);
- DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1),
- SDValue(NewLoad.getNode(), 1));
- }
+ // This function cannot currently deal with non-byte-sized memory sizes.
+ int64_t ElementSizeBytes = MemVT.getStoreSize();
+ if (ElementSizeBytes * 8 != (int64_t)MemVT.getSizeInBits())
+ return false;
- // Replace the all stores with the new store. Recursively remove
- // corresponding value if its no longer used.
- for (unsigned i = 0; i < NumElem; ++i) {
- SDValue Val = StoreNodes[i].MemNode->getOperand(1);
- CombineTo(StoreNodes[i].MemNode, NewStore);
- if (Val.getNode()->use_empty())
- recursivelyDeleteUnusedNodes(Val.getNode());
- }
+ // Do not bother looking at stored values that are not constants, loads, or
+ // extracted vector elements.
+ SDValue StoredVal = peekThroughBitcasts(St->getValue());
+ const StoreSource StoreSrc = getStoreSource(StoredVal);
+ if (StoreSrc == StoreSource::Unknown)
+ return false;
- RV = true;
- StoreNodes.erase(StoreNodes.begin(), StoreNodes.begin() + NumElem);
- LoadNodes.erase(LoadNodes.begin(), LoadNodes.begin() + NumElem);
- NumConsecutiveStores -= NumElem;
+ SmallVector<MemOpLink, 8> StoreNodes;
+ SDNode *RootNode;
+ // Find potential store merge candidates by searching through chain sub-DAG
+ getStoreMergeCandidates(St, StoreNodes, RootNode);
+
+ // Check if there is anything to merge.
+ if (StoreNodes.size() < 2)
+ return false;
+
+ // Sort the memory operands according to their distance from the
+ // base pointer.
+ llvm::sort(StoreNodes, [](MemOpLink LHS, MemOpLink RHS) {
+ return LHS.OffsetFromBase < RHS.OffsetFromBase;
+ });
+
+ bool AllowVectors = !DAG.getMachineFunction().getFunction().hasFnAttribute(
+ Attribute::NoImplicitFloat);
+ bool IsNonTemporalStore = St->isNonTemporal();
+ bool IsNonTemporalLoad = StoreSrc == StoreSource::Load &&
+ cast<LoadSDNode>(StoredVal)->isNonTemporal();
+
+  // Store merging attempts to merge the lowest stores first. This generally
+  // works out well: if the merge succeeds, the remaining stores are checked
+  // again after the first collection of stores is merged. However, in the
+ // case that a non-mergeable store is found first, e.g., {p[-2],
+ // p[0], p[1], p[2], p[3]}, we would fail and miss the subsequent
+ // mergeable cases. To prevent this, we prune such stores from the
+ // front of StoreNodes here.
+ bool MadeChange = false;
+ while (StoreNodes.size() > 1) {
+ unsigned NumConsecutiveStores =
+ getConsecutiveStores(StoreNodes, ElementSizeBytes);
+ // There are no more stores in the list to examine.
+ if (NumConsecutiveStores == 0)
+ return MadeChange;
+
+ // We have at least 2 consecutive stores. Try to merge them.
+ assert(NumConsecutiveStores >= 2 && "Expected at least 2 stores");
+ switch (StoreSrc) {
+ case StoreSource::Constant:
+ MadeChange |= tryStoreMergeOfConstants(StoreNodes, NumConsecutiveStores,
+ MemVT, RootNode, AllowVectors);
+ break;
+
+ case StoreSource::Extract:
+ MadeChange |= tryStoreMergeOfExtracts(StoreNodes, NumConsecutiveStores,
+ MemVT, RootNode);
+ break;
+
+ case StoreSource::Load:
+ MadeChange |= tryStoreMergeOfLoads(StoreNodes, NumConsecutiveStores,
+ MemVT, RootNode, AllowVectors,
+ IsNonTemporalStore, IsNonTemporalLoad);
+ break;
+
+ default:
+ llvm_unreachable("Unhandled store source type");
}
}
- return RV;
+ return MadeChange;
}
SDValue DAGCombiner::replaceStoreChain(StoreSDNode *ST, SDValue BetterChain) {
@@ -16408,11 +17017,12 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// Try to infer better alignment information than the store already has.
if (OptLevel != CodeGenOpt::None && ST->isUnindexed() && !ST->isAtomic()) {
- if (unsigned Align = DAG.InferPtrAlignment(Ptr)) {
- if (Align > ST->getAlignment() && ST->getSrcValueOffset() % Align == 0) {
+ if (MaybeAlign Alignment = DAG.InferPtrAlign(Ptr)) {
+ if (*Alignment > ST->getAlign() &&
+ isAligned(*Alignment, ST->getSrcValueOffset())) {
SDValue NewStore =
DAG.getTruncStore(Chain, SDLoc(N), Value, Ptr, ST->getPointerInfo(),
- ST->getMemoryVT(), Align,
+ ST->getMemoryVT(), *Alignment,
ST->getMemOperand()->getFlags(), ST->getAAInfo());
// NewStore will always be N as we are only refining the alignment
assert(NewStore.getNode() == N);
@@ -16497,7 +17107,10 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
}
if (OptLevel != CodeGenOpt::None && ST1->hasOneUse() &&
- !ST1->getBasePtr().isUndef()) {
+ !ST1->getBasePtr().isUndef() &&
+        // BaseIndexOffset and the code below require knowing the size
+        // of a vector, so bail out if MemoryVT is scalable.
+ !ST1->getMemoryVT().isScalableVector()) {
const BaseIndexOffset STBase = BaseIndexOffset::match(ST, DAG);
const BaseIndexOffset ChainBase = BaseIndexOffset::match(ST1, DAG);
unsigned STBitSize = ST->getMemoryVT().getSizeInBits();
@@ -16510,33 +17123,6 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
CombineTo(ST1, ST1->getChain());
return SDValue();
}
-
- // If ST stores to a subset of preceding store's write set, we may be
- // able to fold ST's value into the preceding stored value. As we know
- // the other uses of ST1's chain are unconcerned with ST, this folding
- // will not affect those nodes.
- int64_t BitOffset;
- if (ChainBase.contains(DAG, ChainBitSize, STBase, STBitSize,
- BitOffset)) {
- SDValue ChainValue = ST1->getValue();
- if (auto *C1 = dyn_cast<ConstantSDNode>(ChainValue)) {
- if (auto *C = dyn_cast<ConstantSDNode>(Value)) {
- APInt Val = C1->getAPIntValue();
- APInt InsertVal = C->getAPIntValue().zextOrTrunc(STBitSize);
- // FIXME: Handle Big-endian mode.
- if (!DAG.getDataLayout().isBigEndian()) {
- Val.insertBits(InsertVal, BitOffset);
- SDValue NewSDVal =
- DAG.getConstant(Val, SDLoc(C), ChainValue.getValueType(),
- C1->isTargetOpcode(), C1->isOpaque());
- SDNode *NewST1 = DAG.UpdateNodeOperands(
- ST1, ST1->getChain(), NewSDVal, ST1->getOperand(2),
- ST1->getOperand(3));
- return CombineTo(ST, SDValue(NewST1, 0));
- }
- }
- }
- } // End ST subset of ST1 case.
}
}
}
@@ -16559,7 +17145,7 @@ SDValue DAGCombiner::visitSTORE(SDNode *N) {
// There can be multiple store sequences on the same chain.
// Keep trying to merge store sequences until we are unable to do so
// or until we merge the last store on the chain.
- bool Changed = MergeConsecutiveStores(ST);
+ bool Changed = mergeConsecutiveStores(ST);
if (!Changed) break;
// Return N as merge only uses CombineTo and no worklist clean
// up is necessary.
@@ -16835,6 +17421,10 @@ SDValue DAGCombiner::combineInsertEltToShuffle(SDNode *N, unsigned InsIndex) {
EVT SubVecVT = SubVec.getValueType();
EVT VT = DestVec.getValueType();
unsigned NumSrcElts = SubVecVT.getVectorNumElements();
+  // If the source only has a single vector element, the cost of creating the
+  // shuffle to add it to a vector is likely to exceed the cost of an
+  // insert_vector_elt.
+ if (NumSrcElts == 1)
+ return SDValue();
unsigned ExtendRatio = VT.getSizeInBits() / SubVecVT.getSizeInBits();
unsigned NumMaskVals = ExtendRatio * NumSrcElts;
@@ -16880,12 +17470,12 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
SDLoc DL(N);
EVT VT = InVec.getValueType();
- unsigned NumElts = VT.getVectorNumElements();
+ auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
// Insert into out-of-bounds element is undefined.
- if (auto *IndexC = dyn_cast<ConstantSDNode>(EltNo))
- if (IndexC->getZExtValue() >= VT.getVectorNumElements())
- return DAG.getUNDEF(VT);
+ if (IndexC && VT.isFixedLengthVector() &&
+ IndexC->getZExtValue() >= VT.getVectorNumElements())
+ return DAG.getUNDEF(VT);
// Remove redundant insertions:
// (insert_vector_elt x (extract_vector_elt x idx) idx) -> x
@@ -16893,17 +17483,25 @@ SDValue DAGCombiner::visitINSERT_VECTOR_ELT(SDNode *N) {
InVec == InVal.getOperand(0) && EltNo == InVal.getOperand(1))
return InVec;
- auto *IndexC = dyn_cast<ConstantSDNode>(EltNo);
if (!IndexC) {
// If this is variable insert to undef vector, it might be better to splat:
// inselt undef, InVal, EltNo --> build_vector < InVal, InVal, ... >
if (InVec.isUndef() && TLI.shouldSplatInsEltVarIndex(VT)) {
- SmallVector<SDValue, 8> Ops(NumElts, InVal);
- return DAG.getBuildVector(VT, DL, Ops);
+ if (VT.isScalableVector())
+ return DAG.getSplatVector(VT, DL, InVal);
+ else {
+ SmallVector<SDValue, 8> Ops(VT.getVectorNumElements(), InVal);
+ return DAG.getBuildVector(VT, DL, Ops);
+ }
}
return SDValue();
}
+ if (VT.isScalableVector())
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+
// We must know which element is being inserted for folds below here.
unsigned Elt = IndexC->getZExtValue();
if (SDValue Shuf = combineInsertEltToShuffle(N, Elt))
@@ -16968,11 +17566,12 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
EVT ResultVT = EVE->getValueType(0);
EVT VecEltVT = InVecVT.getVectorElementType();
- unsigned Align = OriginalLoad->getAlignment();
- unsigned NewAlign = DAG.getDataLayout().getABITypeAlignment(
+ Align Alignment = OriginalLoad->getAlign();
+ Align NewAlign = DAG.getDataLayout().getABITypeAlign(
VecEltVT.getTypeForEVT(*DAG.getContext()));
- if (NewAlign > Align || !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
+ if (NewAlign > Alignment ||
+ !TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT))
return SDValue();
ISD::LoadExtType ExtTy = ResultVT.bitsGT(VecEltVT) ?
@@ -16980,7 +17579,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
if (!TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT))
return SDValue();
- Align = NewAlign;
+ Alignment = NewAlign;
SDValue NewPtr = OriginalLoad->getBasePtr();
SDValue Offset;
@@ -17020,13 +17619,13 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT,
: ISD::EXTLOAD;
Load = DAG.getExtLoad(ExtType, SDLoc(EVE), ResultVT,
OriginalLoad->getChain(), NewPtr, MPI, VecEltVT,
- Align, OriginalLoad->getMemOperand()->getFlags(),
+ Alignment, OriginalLoad->getMemOperand()->getFlags(),
OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
} else {
- Load = DAG.getLoad(VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr,
- MPI, Align, OriginalLoad->getMemOperand()->getFlags(),
- OriginalLoad->getAAInfo());
+ Load = DAG.getLoad(
+ VecEltVT, SDLoc(EVE), OriginalLoad->getChain(), NewPtr, MPI, Alignment,
+ OriginalLoad->getMemOperand()->getFlags(), OriginalLoad->getAAInfo());
Chain = Load.getValue(1);
if (ResultVT.bitsLT(VecEltVT))
Load = DAG.getNode(ISD::TRUNCATE, SDLoc(EVE), ResultVT, Load);
@@ -17102,6 +17701,10 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// (vextract (scalar_to_vector val, 0) -> val
if (VecOp.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+    // Only the 0'th element of SCALAR_TO_VECTOR is defined.
+ if (DAG.isKnownNeverZero(Index))
+ return DAG.getUNDEF(ScalarVT);
+
// Check if the result type doesn't match the inserted element type. A
// SCALAR_TO_VECTOR may truncate the inserted element and the
// EXTRACT_VECTOR_ELT may widen the extracted vector.
@@ -17115,15 +17718,21 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// extract_vector_elt of out-of-bounds element -> UNDEF
auto *IndexC = dyn_cast<ConstantSDNode>(Index);
- unsigned NumElts = VecVT.getVectorNumElements();
- if (IndexC && IndexC->getAPIntValue().uge(NumElts))
+ if (IndexC && VecVT.isFixedLengthVector() &&
+ IndexC->getAPIntValue().uge(VecVT.getVectorNumElements()))
return DAG.getUNDEF(ScalarVT);
// extract_vector_elt (build_vector x, y), 1 -> y
- if (IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR &&
+ if (((IndexC && VecOp.getOpcode() == ISD::BUILD_VECTOR) ||
+ VecOp.getOpcode() == ISD::SPLAT_VECTOR) &&
TLI.isTypeLegal(VecVT) &&
(VecOp.hasOneUse() || TLI.aggressivelyPreferBuildVectorSources(VecVT))) {
- SDValue Elt = VecOp.getOperand(IndexC->getZExtValue());
+ assert((VecOp.getOpcode() != ISD::BUILD_VECTOR ||
+ VecVT.isFixedLengthVector()) &&
+ "BUILD_VECTOR used for scalable vectors");
+ unsigned IndexVal =
+ VecOp.getOpcode() == ISD::BUILD_VECTOR ? IndexC->getZExtValue() : 0;
+ SDValue Elt = VecOp.getOperand(IndexVal);
EVT InEltVT = Elt.getValueType();
// Sometimes build_vector's scalar input types do not match result type.
@@ -17134,6 +17743,15 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// converts.
}
+ if (VecVT.isScalableVector())
+ return SDValue();
+
+ // All the code from this point onwards assumes fixed width vectors, but it's
+ // possible that some of the combinations could be made to work for scalable
+ // vectors too.
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
+
// TODO: These transforms should not require the 'hasOneUse' restriction, but
// there are regressions on multiple targets without it. We can end up with a
// mess of scalar and vector code if we reduce only part of the DAG to scalar.
@@ -17157,7 +17775,6 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
"Extract element and scalar to vector can't change element type "
"from FP to integer.");
unsigned XBitWidth = X.getValueSizeInBits();
- unsigned VecEltBitWidth = VecVT.getScalarSizeInBits();
BCTruncElt = IsLE ? 0 : XBitWidth / VecEltBitWidth - 1;
// An extract element return value type can be wider than its vector
@@ -17215,9 +17832,8 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
// FIXME: Should really be just isOperationLegalOrCustom.
TLI.isOperationLegal(ISD::EXTRACT_VECTOR_ELT, VecVT) ||
TLI.isOperationExpand(ISD::VECTOR_SHUFFLE, VecVT)) {
- EVT IndexTy = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT, SVInVec,
- DAG.getConstant(OrigElt, DL, IndexTy));
+ DAG.getVectorIdxConstant(OrigElt, DL));
}
}
@@ -17241,6 +17857,14 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
AddToWorklist(N);
return SDValue(N, 0);
}
+ APInt DemandedBits = APInt::getAllOnesValue(VecEltBitWidth);
+ if (SimplifyDemandedBits(VecOp, DemandedBits, DemandedElts, true)) {
+ // We simplified the vector operand of this extract element. If this
+ // extract is not dead, visit it again so it is folded properly.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ AddToWorklist(N);
+ return SDValue(N, 0);
+ }
}
// Everything under here is trying to match an extract of a loaded value.
@@ -17326,6 +17950,30 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) {
Elt = (Idx < (int)NumElts) ? Idx : Idx - (int)NumElts;
Index = DAG.getConstant(Elt, DL, Index.getValueType());
}
+ } else if (VecOp.getOpcode() == ISD::CONCAT_VECTORS && !BCNumEltsChanged &&
+ VecVT.getVectorElementType() == ScalarVT &&
+ (!LegalTypes ||
+ TLI.isTypeLegal(
+ VecOp.getOperand(0).getValueType().getVectorElementType()))) {
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 0
+ // -> extract_vector_elt a, 0
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 1
+ // -> extract_vector_elt a, 1
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 2
+ // -> extract_vector_elt b, 0
+ // extract_vector_elt (concat_vectors v2i16:a, v2i16:b), 3
+ // -> extract_vector_elt b, 1
+ SDLoc SL(N);
+ EVT ConcatVT = VecOp.getOperand(0).getValueType();
+ unsigned ConcatNumElts = ConcatVT.getVectorNumElements();
+ SDValue NewIdx = DAG.getConstant(Elt % ConcatNumElts, SL,
+ Index.getValueType());
+
+ SDValue ConcatOp = VecOp.getOperand(Elt / ConcatNumElts);
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL,
+ ConcatVT.getVectorElementType(),
+ ConcatOp, NewIdx);
+ return DAG.getNode(ISD::BITCAST, SL, ScalarVT, Elt);
}
// Make sure we found a non-volatile load and the extractelement is
@@ -17407,6 +18055,11 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
if (!ValidTypes)
return SDValue();
+ // If we already have a splat buildvector, then don't fold it if it means
+ // introducing zeros.
+ if (!AllAnyExt && DAG.isSplatValue(SDValue(N, 0), /*AllowUndefs*/ true))
+ return SDValue();
+
bool isLE = DAG.getDataLayout().isLittleEndian();
unsigned ElemRatio = OutScalarTy.getSizeInBits()/SourceType.getSizeInBits();
assert(ElemRatio > 1 && "Invalid element size ratio");
@@ -17453,12 +18106,89 @@ SDValue DAGCombiner::reduceBuildVecExtToExtBuildVec(SDNode *N) {
return DAG.getBitcast(VT, BV);
}
+// Simplify (build_vec (trunc $1)
+// (trunc (srl $1 half-width))
+// (trunc (srl $1 (2 * half-width))) …)
+// to (bitcast $1)
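+//
+// For illustration (a hypothetical little-endian case):
+//   (v4i16 build_vector (trunc i64:$1),
+//                       (trunc (srl $1, 16)),
+//                       (trunc (srl $1, 32)),
+//                       (trunc (srl $1, 48)))
+//   --> (v4i16 bitcast $1)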
+SDValue DAGCombiner::reduceBuildVecTruncToBitCast(SDNode *N) {
+ assert(N->getOpcode() == ISD::BUILD_VECTOR && "Expected build vector");
+
+ // Only for little endian
+ if (!DAG.getDataLayout().isLittleEndian())
+ return SDValue();
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ EVT OutScalarTy = VT.getScalarType();
+ uint64_t ScalarTypeBitsize = OutScalarTy.getSizeInBits();
+
+  // Only handle power-of-two types to be sure that the bitcast works well
+ if (!isPowerOf2_64(ScalarTypeBitsize))
+ return SDValue();
+
+ unsigned NumInScalars = N->getNumOperands();
+
+ // Look through bitcasts
+ auto PeekThroughBitcast = [](SDValue Op) {
+ if (Op.getOpcode() == ISD::BITCAST)
+ return Op.getOperand(0);
+ return Op;
+ };
+
+ // The source value where all the parts are extracted.
+ SDValue Src;
+ for (unsigned i = 0; i != NumInScalars; ++i) {
+ SDValue In = PeekThroughBitcast(N->getOperand(i));
+ // Ignore undef inputs.
+ if (In.isUndef()) continue;
+
+ if (In.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ In = PeekThroughBitcast(In.getOperand(0));
+
+ if (In.getOpcode() != ISD::SRL) {
+      // For now, handle only build_vec without shuffling; handle shifts here
+      // in the future.
+ if (i != 0)
+ return SDValue();
+
+ Src = In;
+ } else {
+ // In is SRL
+ SDValue part = PeekThroughBitcast(In.getOperand(0));
+
+ if (!Src) {
+ Src = part;
+ } else if (Src != part) {
+ // Vector parts do not stem from the same variable
+ return SDValue();
+ }
+
+ SDValue ShiftAmtVal = In.getOperand(1);
+ if (!isa<ConstantSDNode>(ShiftAmtVal))
+ return SDValue();
+
+ uint64_t ShiftAmt = In.getNode()->getConstantOperandVal(1);
+
+ // The extracted value is not extracted at the right position
+ if (ShiftAmt != i * ScalarTypeBitsize)
+ return SDValue();
+ }
+ }
+
+ // Only cast if the size is the same
+ if (Src.getValueType().getSizeInBits() != VT.getSizeInBits())
+ return SDValue();
+
+ return DAG.getBitcast(VT, Src);
+}
+
SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
ArrayRef<int> VectorMask,
SDValue VecIn1, SDValue VecIn2,
unsigned LeftIdx, bool DidSplitVec) {
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue ZeroIdx = DAG.getConstant(0, DL, IdxTy);
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, DL);
EVT VT = N->getValueType(0);
EVT InVT1 = VecIn1.getValueType();
@@ -17492,7 +18222,7 @@ SDValue DAGCombiner::createBuildVecShuffle(const SDLoc &DL, SDNode *N,
// If we only have one input vector, and it's twice the size of the
// output, split it in two.
VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1,
- DAG.getConstant(NumElems, DL, IdxTy));
+ DAG.getVectorIdxConstant(NumElems, DL));
VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, VecIn1, ZeroIdx);
// Since we now have shorter input vectors, adjust the offset of the
// second vector's start.
@@ -17699,6 +18429,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
return SDValue();
SDValue ExtractedFromVec = Op.getOperand(0);
+ if (ExtractedFromVec.getValueType().isScalableVector())
+ return SDValue();
+
const APInt &ExtractIdx = Op.getConstantOperandAPInt(1);
if (ExtractIdx.uge(ExtractedFromVec.getValueType().getVectorNumElements()))
return SDValue();
@@ -17733,7 +18466,6 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
unsigned NearestPow2 = 0;
SDValue Vec = VecIn.back();
EVT InVT = Vec.getValueType();
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
SmallVector<unsigned, 8> IndexVec(NumElems, 0);
for (unsigned i = 0; i < NumElems; i++) {
@@ -17752,9 +18484,9 @@ SDValue DAGCombiner::reduceBuildVecToShuffle(SDNode *N) {
InVT.getVectorElementType(), SplitSize);
if (TLI.isTypeLegal(SplitVT)) {
SDValue VecIn2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
- DAG.getConstant(SplitSize, DL, IdxTy));
+ DAG.getVectorIdxConstant(SplitSize, DL));
SDValue VecIn1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SplitVT, Vec,
- DAG.getConstant(0, DL, IdxTy));
+ DAG.getVectorIdxConstant(0, DL));
VecIn.pop_back();
VecIn.push_back(VecIn1);
VecIn.push_back(VecIn2);
@@ -17986,6 +18718,9 @@ SDValue DAGCombiner::visitBUILD_VECTOR(SDNode *N) {
if (SDValue V = reduceBuildVecExtToExtBuildVec(N))
return V;
+ if (SDValue V = reduceBuildVecTruncToBitCast(N))
+ return V;
+
if (SDValue V = reduceBuildVecToShuffle(N))
return V;
@@ -18080,6 +18815,7 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
// What vector are we extracting the subvector from and at what index?
SDValue ExtVec = Op.getOperand(0);
+ int ExtIdx = Op.getConstantOperandVal(1);
// We want the EVT of the original extraction to correctly scale the
// extraction index.
@@ -18092,10 +18828,6 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
continue;
}
- if (!isa<ConstantSDNode>(Op.getOperand(1)))
- return SDValue();
- int ExtIdx = Op.getConstantOperandVal(1);
-
// Ensure that we are extracting a subvector from a vector the same
// size as the result.
if (ExtVT.getSizeInBits() != VT.getSizeInBits())
@@ -18129,6 +18861,69 @@ static SDValue combineConcatVectorOfExtracts(SDNode *N, SelectionDAG &DAG) {
DAG.getBitcast(VT, SV1), Mask, DAG);
}
+static SDValue combineConcatVectorOfCasts(SDNode *N, SelectionDAG &DAG) {
+ unsigned CastOpcode = N->getOperand(0).getOpcode();
+ switch (CastOpcode) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ // TODO: Allow more opcodes?
+ // case ISD::BITCAST:
+ // case ISD::TRUNCATE:
+ // case ISD::ZERO_EXTEND:
+ // case ISD::SIGN_EXTEND:
+ // case ISD::FP_EXTEND:
+ break;
+ default:
+ return SDValue();
+ }
+
+ EVT SrcVT = N->getOperand(0).getOperand(0).getValueType();
+ if (!SrcVT.isVector())
+ return SDValue();
+
+ // All operands of the concat must be the same kind of cast from the same
+ // source type.
+ SmallVector<SDValue, 4> SrcOps;
+ for (SDValue Op : N->ops()) {
+ if (Op.getOpcode() != CastOpcode || !Op.hasOneUse() ||
+ Op.getOperand(0).getValueType() != SrcVT)
+ return SDValue();
+ SrcOps.push_back(Op.getOperand(0));
+ }
+
+ // The wider cast must be supported by the target. This is unusual because
+ // the operation support type parameter depends on the opcode. In addition,
+ // check the other type in the cast to make sure this is really legal.
+ EVT VT = N->getValueType(0);
+ EVT SrcEltVT = SrcVT.getVectorElementType();
+ unsigned NumElts = SrcVT.getVectorElementCount().Min * N->getNumOperands();
+ EVT ConcatSrcVT = EVT::getVectorVT(*DAG.getContext(), SrcEltVT, NumElts);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ switch (CastOpcode) {
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP:
+ if (!TLI.isOperationLegalOrCustom(CastOpcode, ConcatSrcVT) ||
+ !TLI.isTypeLegal(VT))
+ return SDValue();
+ break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT:
+ if (!TLI.isOperationLegalOrCustom(CastOpcode, VT) ||
+ !TLI.isTypeLegal(ConcatSrcVT))
+ return SDValue();
+ break;
+ default:
+ llvm_unreachable("Unexpected cast opcode");
+ }
+
+ // concat (cast X), (cast Y)... -> cast (concat X, Y...)
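+  // For example (illustrative types): two v2f32 results of sint_to_fp from
+  // v2i32 sources become a single v4i32 -> v4f32 sint_to_fp:
+  //   concat (v2f32 sint_to_fp v2i32:X), (v2f32 sint_to_fp v2i32:Y)
+  //     -> v4f32 sint_to_fp (v4i32 concat X, Y)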
+ SDLoc DL(N);
+ SDValue NewConcat = DAG.getNode(ISD::CONCAT_VECTORS, DL, ConcatSrcVT, SrcOps);
+ return DAG.getNode(CastOpcode, DL, VT, NewConcat);
+}
+
SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
// If we only have one input vector, we don't need to do any concatenation.
if (N->getNumOperands() == 1)
@@ -18256,6 +19051,9 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
if (SDValue V = combineConcatVectorOfExtracts(N, DAG))
return V;
+ if (SDValue V = combineConcatVectorOfCasts(N, DAG))
+ return V;
+
// Type legalization of vectors and DAG canonicalization of SHUFFLE_VECTOR
// nodes often generate nop CONCAT_VECTOR nodes.
// Scan the CONCAT_VECTOR operands and look for a CONCAT operations that
@@ -18287,14 +19085,9 @@ SDValue DAGCombiner::visitCONCAT_VECTORS(SDNode *N) {
return SDValue();
}
- auto *CS = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- // The extract index must be constant.
- if (!CS)
- return SDValue();
-
// Check that we are reading from the identity index.
unsigned IdentityIndex = i * PartNumElem;
- if (CS->getAPIntValue() != IdentityIndex)
+ if (Op.getConstantOperandAPInt(1) != IdentityIndex)
return SDValue();
}
@@ -18377,6 +19170,15 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
if (!TLI.isBinOp(BOpcode) || BinOp.getNode()->getNumValues() != 1)
return SDValue();
+ // Exclude the fake form of fneg (fsub -0.0, x) because that is likely to be
+ // reduced to the unary fneg when it is visited, and we probably want to deal
+ // with fneg in a target-specific way.
+ if (BOpcode == ISD::FSUB) {
+ auto *C = isConstOrConstSplatFP(BinOp.getOperand(0), /*AllowUndefs*/ true);
+ if (C && C->getValueAPF().isNegZero())
+ return SDValue();
+ }
+
// The binop must be a vector type, so we can extract some fraction of it.
EVT WideBVT = BinOp.getValueType();
if (!WideBVT.isVector())
@@ -18412,12 +19214,11 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
// bitcasted.
unsigned ConcatOpNum = ExtractIndex / VT.getVectorNumElements();
unsigned ExtBOIdx = ConcatOpNum * NarrowBVT.getVectorNumElements();
- EVT ExtBOIdxVT = Extract->getOperand(1).getValueType();
if (TLI.isExtractSubvectorCheap(NarrowBVT, WideBVT, ExtBOIdx) &&
BinOp.hasOneUse() && Extract->getOperand(0)->hasOneUse()) {
// extract (binop B0, B1), N --> binop (extract B0, N), (extract B1, N)
SDLoc DL(Extract);
- SDValue NewExtIndex = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT);
+ SDValue NewExtIndex = DAG.getVectorIdxConstant(ExtBOIdx, DL);
SDValue X = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(0), NewExtIndex);
SDValue Y = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
@@ -18457,7 +19258,7 @@ static SDValue narrowExtractedVectorBinOp(SDNode *Extract, SelectionDAG &DAG) {
// extract (binop (concat X1, X2), Y), N --> binop XN, (extract Y, IndexC)
// extract (binop X, (concat Y1, Y2)), N --> binop (extract X, IndexC), YN
SDLoc DL(Extract);
- SDValue IndexC = DAG.getConstant(ExtBOIdx, DL, ExtBOIdxVT);
+ SDValue IndexC = DAG.getVectorIdxConstant(ExtBOIdx, DL);
SDValue X = SubVecL ? DAG.getBitcast(NarrowBVT, SubVecL)
: DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NarrowBVT,
BinOp.getOperand(0), IndexC);
@@ -18489,6 +19290,26 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
// Allow targets to opt-out.
EVT VT = Extract->getValueType(0);
+
+ // We can only create byte sized loads.
+ if (!VT.isByteSized())
+ return SDValue();
+
+ unsigned Index = ExtIdx->getZExtValue();
+ unsigned NumElts = VT.getVectorNumElements();
+
+ // If the index is a multiple of the extract element count, we can offset the
+ // address by the store size multiplied by the subvector index. Otherwise if
+ // the scalar type is byte sized, we can just use the index multiplied by
+ // the element size in bytes as the offset.
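+  // For example (illustrative): extracting v2i32 at index 4 from a loaded
+  // v8i32 gives Offset = (4 / 2) * 8 = 16 bytes, while extracting v3i8 at
+  // index 1 gives Offset = 1 * 1 = 1 byte.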
+ unsigned Offset;
+ if (Index % NumElts == 0)
+ Offset = (Index / NumElts) * VT.getStoreSize();
+ else if (VT.getScalarType().isByteSized())
+ Offset = Index * VT.getScalarType().getStoreSize();
+ else
+ return SDValue();
+
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (!TLI.shouldReduceLoadWidth(Ld, Ld->getExtensionType(), VT))
return SDValue();
@@ -18496,8 +19317,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
// The narrow load will be offset from the base address of the old load if
// we are extracting from something besides index 0 (little-endian).
SDLoc DL(Extract);
- SDValue BaseAddr = Ld->getOperand(1);
- unsigned Offset = ExtIdx->getZExtValue() * VT.getScalarType().getStoreSize();
+ SDValue BaseAddr = Ld->getBasePtr();
// TODO: Use "BaseIndexOffset" to make this more effective.
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
@@ -18512,6 +19332,7 @@ static SDValue narrowExtractedVectorLoad(SDNode *Extract, SelectionDAG &DAG) {
SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
EVT NVT = N->getValueType(0);
SDValue V = N->getOperand(0);
+ uint64_t ExtIdx = N->getConstantOperandVal(1);
// Extract from UNDEF is UNDEF.
if (V.isUndef())
@@ -18523,9 +19344,7 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// Combine an extract of an extract into a single extract_subvector.
// ext (ext X, C), 0 --> ext X, C
- SDValue Index = N->getOperand(1);
- if (isNullConstant(Index) && V.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- V.hasOneUse() && isa<ConstantSDNode>(V.getOperand(1))) {
+ if (ExtIdx == 0 && V.getOpcode() == ISD::EXTRACT_SUBVECTOR && V.hasOneUse()) {
if (TLI.isExtractSubvectorCheap(NVT, V.getOperand(0).getValueType(),
V.getConstantOperandVal(1)) &&
TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NVT)) {
@@ -18536,21 +19355,20 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// Try to move vector bitcast after extract_subv by scaling extraction index:
// extract_subv (bitcast X), Index --> bitcast (extract_subv X, Index')
- if (isa<ConstantSDNode>(Index) && V.getOpcode() == ISD::BITCAST &&
+ if (V.getOpcode() == ISD::BITCAST &&
V.getOperand(0).getValueType().isVector()) {
SDValue SrcOp = V.getOperand(0);
EVT SrcVT = SrcOp.getValueType();
- unsigned SrcNumElts = SrcVT.getVectorNumElements();
- unsigned DestNumElts = V.getValueType().getVectorNumElements();
+ unsigned SrcNumElts = SrcVT.getVectorMinNumElements();
+ unsigned DestNumElts = V.getValueType().getVectorMinNumElements();
if ((SrcNumElts % DestNumElts) == 0) {
unsigned SrcDestRatio = SrcNumElts / DestNumElts;
- unsigned NewExtNumElts = NVT.getVectorNumElements() * SrcDestRatio;
+ ElementCount NewExtEC = NVT.getVectorElementCount() * SrcDestRatio;
EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
- NewExtNumElts);
+ NewExtEC);
if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) * SrcDestRatio;
SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
+ SDValue NewIndex = DAG.getVectorIdxConstant(ExtIdx * SrcDestRatio, DL);
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
V.getOperand(0), NewIndex);
return DAG.getBitcast(NVT, NewExtract);
@@ -18558,34 +19376,43 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
}
if ((DestNumElts % SrcNumElts) == 0) {
unsigned DestSrcRatio = DestNumElts / SrcNumElts;
- if ((NVT.getVectorNumElements() % DestSrcRatio) == 0) {
- unsigned NewExtNumElts = NVT.getVectorNumElements() / DestSrcRatio;
- EVT NewExtVT = EVT::getVectorVT(*DAG.getContext(),
- SrcVT.getScalarType(), NewExtNumElts);
- if ((N->getConstantOperandVal(1) % DestSrcRatio) == 0 &&
- TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
- unsigned IndexValScaled = N->getConstantOperandVal(1) / DestSrcRatio;
+ if ((NVT.getVectorMinNumElements() % DestSrcRatio) == 0) {
+ ElementCount NewExtEC = NVT.getVectorElementCount() / DestSrcRatio;
+ EVT ScalarVT = SrcVT.getScalarType();
+ if ((ExtIdx % DestSrcRatio) == 0) {
SDLoc DL(N);
- SDValue NewIndex = DAG.getIntPtrConstant(IndexValScaled, DL);
- SDValue NewExtract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
- V.getOperand(0), NewIndex);
- return DAG.getBitcast(NVT, NewExtract);
+ unsigned IndexValScaled = ExtIdx / DestSrcRatio;
+ EVT NewExtVT =
+ EVT::getVectorVT(*DAG.getContext(), ScalarVT, NewExtEC);
+ if (TLI.isOperationLegalOrCustom(ISD::EXTRACT_SUBVECTOR, NewExtVT)) {
+ SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
+ SDValue NewExtract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewExtVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
+ if (NewExtEC == 1 &&
+ TLI.isOperationLegalOrCustom(ISD::EXTRACT_VECTOR_ELT, ScalarVT)) {
+ SDValue NewIndex = DAG.getVectorIdxConstant(IndexValScaled, DL);
+ SDValue NewExtract =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ScalarVT,
+ V.getOperand(0), NewIndex);
+ return DAG.getBitcast(NVT, NewExtract);
+ }
}
}
}
}
- if (V.getOpcode() == ISD::CONCAT_VECTORS && isa<ConstantSDNode>(Index)) {
+ if (V.getOpcode() == ISD::CONCAT_VECTORS) {
+ unsigned ExtNumElts = NVT.getVectorMinNumElements();
EVT ConcatSrcVT = V.getOperand(0).getValueType();
assert(ConcatSrcVT.getVectorElementType() == NVT.getVectorElementType() &&
"Concat and extract subvector do not change element type");
-
- unsigned ExtIdx = N->getConstantOperandVal(1);
- unsigned ExtNumElts = NVT.getVectorNumElements();
- assert(ExtIdx % ExtNumElts == 0 &&
+ assert((ExtIdx % ExtNumElts) == 0 &&
"Extract index is not a multiple of the input vector length.");
- unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorNumElements();
+ unsigned ConcatSrcNumElts = ConcatSrcVT.getVectorMinNumElements();
unsigned ConcatOpIdx = ExtIdx / ConcatSrcNumElts;
// If the concatenated source types match this extract, it's a direct
@@ -18599,15 +19426,14 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// concat operand. Example:
// v2i8 extract_subvec (v16i8 concat (v8i8 X), (v8i8 Y), 14 -->
// v2i8 extract_subvec v8i8 Y, 6
- if (ConcatSrcNumElts % ExtNumElts == 0) {
+ if (NVT.isFixedLengthVector() && ConcatSrcNumElts % ExtNumElts == 0) {
SDLoc DL(N);
unsigned NewExtIdx = ExtIdx - ConcatOpIdx * ConcatSrcNumElts;
assert(NewExtIdx + ExtNumElts <= ConcatSrcNumElts &&
"Trying to extract from >1 concat operand?");
assert(NewExtIdx % ExtNumElts == 0 &&
"Extract index is not a multiple of the input vector length.");
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue NewIndexC = DAG.getConstant(NewExtIdx, DL, IdxTy);
+ SDValue NewIndexC = DAG.getVectorIdxConstant(NewExtIdx, DL);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NVT,
V.getOperand(ConcatOpIdx), NewIndexC);
}
@@ -18617,37 +19443,33 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
// If the input is a build vector. Try to make a smaller build vector.
if (V.getOpcode() == ISD::BUILD_VECTOR) {
- if (auto *IdxC = dyn_cast<ConstantSDNode>(Index)) {
- EVT InVT = V.getValueType();
- unsigned ExtractSize = NVT.getSizeInBits();
- unsigned EltSize = InVT.getScalarSizeInBits();
- // Only do this if we won't split any elements.
- if (ExtractSize % EltSize == 0) {
- unsigned NumElems = ExtractSize / EltSize;
- EVT EltVT = InVT.getVectorElementType();
- EVT ExtractVT = NumElems == 1 ? EltVT
- : EVT::getVectorVT(*DAG.getContext(),
- EltVT, NumElems);
- if ((Level < AfterLegalizeDAG ||
- (NumElems == 1 ||
- TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
- (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
- unsigned IdxVal = IdxC->getZExtValue();
- IdxVal *= NVT.getScalarSizeInBits();
- IdxVal /= EltSize;
-
- if (NumElems == 1) {
- SDValue Src = V->getOperand(IdxVal);
- if (EltVT != Src.getValueType())
- Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
- return DAG.getBitcast(NVT, Src);
- }
-
- // Extract the pieces from the original build_vector.
- SDValue BuildVec = DAG.getBuildVector(
- ExtractVT, SDLoc(N), V->ops().slice(IdxVal, NumElems));
- return DAG.getBitcast(NVT, BuildVec);
+ EVT InVT = V.getValueType();
+ unsigned ExtractSize = NVT.getSizeInBits();
+ unsigned EltSize = InVT.getScalarSizeInBits();
+ // Only do this if we won't split any elements.
+ if (ExtractSize % EltSize == 0) {
+ unsigned NumElems = ExtractSize / EltSize;
+ EVT EltVT = InVT.getVectorElementType();
+ EVT ExtractVT =
+ NumElems == 1 ? EltVT
+ : EVT::getVectorVT(*DAG.getContext(), EltVT, NumElems);
+ if ((Level < AfterLegalizeDAG ||
+ (NumElems == 1 ||
+ TLI.isOperationLegal(ISD::BUILD_VECTOR, ExtractVT))) &&
+ (!LegalTypes || TLI.isTypeLegal(ExtractVT))) {
+ unsigned IdxVal = (ExtIdx * NVT.getScalarSizeInBits()) / EltSize;
+
+ if (NumElems == 1) {
+ SDValue Src = V->getOperand(IdxVal);
+ if (EltVT != Src.getValueType())
+ Src = DAG.getNode(ISD::TRUNCATE, SDLoc(N), InVT, Src);
+ return DAG.getBitcast(NVT, Src);
}
+
+ // Extract the pieces from the original build_vector.
+ SDValue BuildVec = DAG.getBuildVector(ExtractVT, SDLoc(N),
+ V->ops().slice(IdxVal, NumElems));
+ return DAG.getBitcast(NVT, BuildVec);
}
}
}
@@ -18659,23 +19481,19 @@ SDValue DAGCombiner::visitEXTRACT_SUBVECTOR(SDNode *N) {
if (!NVT.bitsEq(SmallVT))
return SDValue();
- // Only handle cases where both indexes are constants.
- auto *ExtIdx = dyn_cast<ConstantSDNode>(Index);
- auto *InsIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
- if (InsIdx && ExtIdx) {
- // Combine:
- // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
- // Into:
- // indices are equal or bit offsets are equal => V1
- // otherwise => (extract_subvec V1, ExtIdx)
- if (InsIdx->getZExtValue() * SmallVT.getScalarSizeInBits() ==
- ExtIdx->getZExtValue() * NVT.getScalarSizeInBits())
- return DAG.getBitcast(NVT, V.getOperand(1));
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
- DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
- Index);
- }
+ // Combine:
+ // (extract_subvec (insert_subvec V1, V2, InsIdx), ExtIdx)
+ // Into:
+ // indices are equal or bit offsets are equal => V1
+ // otherwise => (extract_subvec V1, ExtIdx)
+ uint64_t InsIdx = V.getConstantOperandVal(2);
+ if (InsIdx * SmallVT.getScalarSizeInBits() ==
+ ExtIdx * NVT.getScalarSizeInBits())
+ return DAG.getBitcast(NVT, V.getOperand(1));
+ return DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, SDLoc(N), NVT,
+ DAG.getBitcast(N->getOperand(0).getValueType(), V.getOperand(0)),
+ N->getOperand(1));
}
if (SDValue NarrowBOp = narrowExtractedVectorBinOp(N, DAG))
@@ -19064,6 +19882,57 @@ static SDValue combineShuffleOfSplatVal(ShuffleVectorSDNode *Shuf,
NewMask);
}
+/// Combine shuffle of shuffle of the form:
+/// shuf (shuf X, undef, InnerMask), undef, OuterMask --> splat X
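+///
+/// For example (illustrative masks):
+///   shuf (shuf X, undef, <1,0,3,2>), undef, <2,2,2,2>
+///     --> shuf X, undef, <3,3,3,3>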
+static SDValue formSplatFromShuffles(ShuffleVectorSDNode *OuterShuf,
+ SelectionDAG &DAG) {
+ if (!OuterShuf->getOperand(1).isUndef())
+ return SDValue();
+ auto *InnerShuf = dyn_cast<ShuffleVectorSDNode>(OuterShuf->getOperand(0));
+ if (!InnerShuf || !InnerShuf->getOperand(1).isUndef())
+ return SDValue();
+
+ ArrayRef<int> OuterMask = OuterShuf->getMask();
+ ArrayRef<int> InnerMask = InnerShuf->getMask();
+ unsigned NumElts = OuterMask.size();
+ assert(NumElts == InnerMask.size() && "Mask length mismatch");
+ SmallVector<int, 32> CombinedMask(NumElts, -1);
+ int SplatIndex = -1;
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // Undef lanes remain undef.
+ int OuterMaskElt = OuterMask[i];
+ if (OuterMaskElt == -1)
+ continue;
+
+ // Peek through the shuffle masks to get the underlying source element.
+ int InnerMaskElt = InnerMask[OuterMaskElt];
+ if (InnerMaskElt == -1)
+ continue;
+
+ // Initialize the splatted element.
+ if (SplatIndex == -1)
+ SplatIndex = InnerMaskElt;
+
+ // Non-matching index - this is not a splat.
+ if (SplatIndex != InnerMaskElt)
+ return SDValue();
+
+ CombinedMask[i] = InnerMaskElt;
+ }
+ assert((all_of(CombinedMask, [](int M) { return M == -1; }) ||
+ getSplatIndex(CombinedMask) != -1) &&
+ "Expected a splat mask");
+
+ // TODO: The transform may be a win even if the mask is not legal.
+ EVT VT = OuterShuf->getValueType(0);
+ assert(VT == InnerShuf->getValueType(0) && "Expected matching shuffle types");
+ if (!DAG.getTargetLoweringInfo().isShuffleMaskLegal(CombinedMask, VT))
+ return SDValue();
+
+ return DAG.getVectorShuffle(VT, SDLoc(OuterShuf), InnerShuf->getOperand(0),
+ InnerShuf->getOperand(1), CombinedMask);
+}
+
/// If the shuffle mask is taking exactly one element from the first vector
/// operand and passing through all other elements from the second vector
/// operand, return the index of the mask element that is choosing an element
@@ -19136,8 +20005,7 @@ static SDValue replaceShuffleOfInsert(ShuffleVectorSDNode *Shuf,
// element used. Therefore, our new insert element occurs at the shuffle's
// mask index value, not the insert's index value.
// shuffle (insertelt v1, x, C), v2, mask --> insertelt v2, x, C'
- SDValue NewInsIndex = DAG.getConstant(ShufOp0Index, SDLoc(Shuf),
- Op0.getOperand(2).getValueType());
+ SDValue NewInsIndex = DAG.getVectorIdxConstant(ShufOp0Index, SDLoc(Shuf));
return DAG.getNode(ISD::INSERT_VECTOR_ELT, SDLoc(Shuf), Op0.getValueType(),
Op1, Op0.getOperand(1), NewInsIndex);
}
@@ -19223,6 +20091,9 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (SDValue V = combineShuffleOfSplatVal(SVN, DAG))
return V;
+ if (SDValue V = formSplatFromShuffles(SVN, DAG))
+ return V;
+
// If it is a splat, check if the argument vector is another splat or a
// build_vector.
if (SVN->isSplat() && SVN->getSplatIndex() < (int)NumElts) {
@@ -19234,7 +20105,7 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
SDValue L = N0.getOperand(0), R = N0.getOperand(1);
SDLoc DL(N);
EVT EltVT = VT.getScalarType();
- SDValue Index = DAG.getIntPtrConstant(SplatIndex, DL);
+ SDValue Index = DAG.getVectorIdxConstant(SplatIndex, DL);
SDValue ExtL = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, L, Index);
SDValue ExtR = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, R, Index);
SDValue NewBO = DAG.getNode(N0.getOpcode(), DL, EltVT, ExtL, ExtR,
@@ -19354,16 +20225,6 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
if (N0.getOpcode() == ISD::BITCAST && N0.hasOneUse() &&
N1.isUndef() && Level < AfterLegalizeVectorOps &&
TLI.isTypeLegal(VT)) {
- auto ScaleShuffleMask = [](ArrayRef<int> Mask, int Scale) {
- if (Scale == 1)
- return SmallVector<int, 8>(Mask.begin(), Mask.end());
-
- SmallVector<int, 8> NewMask;
- for (int M : Mask)
- for (int s = 0; s != Scale; ++s)
- NewMask.push_back(M < 0 ? -1 : Scale * M + s);
- return NewMask;
- };
SDValue BC0 = peekThroughOneUseBitcasts(N0);
if (BC0.getOpcode() == ISD::VECTOR_SHUFFLE && BC0.hasOneUse()) {
@@ -19383,10 +20244,10 @@ SDValue DAGCombiner::visitVECTOR_SHUFFLE(SDNode *N) {
// Scale the shuffle masks to the smaller scalar type.
ShuffleVectorSDNode *InnerSVN = cast<ShuffleVectorSDNode>(BC0);
- SmallVector<int, 8> InnerMask =
- ScaleShuffleMask(InnerSVN->getMask(), InnerScale);
- SmallVector<int, 8> OuterMask =
- ScaleShuffleMask(SVN->getMask(), OuterScale);
+ SmallVector<int, 8> InnerMask;
+ SmallVector<int, 8> OuterMask;
+ narrowShuffleMaskElts(InnerScale, InnerSVN->getMask(), InnerMask);
+ narrowShuffleMaskElts(OuterScale, SVN->getMask(), OuterMask);
// Merge the shuffle masks.
SmallVector<int, 8> NewMask;
@@ -19547,7 +20408,9 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
// Replace a SCALAR_TO_VECTOR(EXTRACT_VECTOR_ELT(V,C0)) pattern
// with a VECTOR_SHUFFLE and possible truncate.
- if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ if (InVal.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ VT.isFixedLengthVector() &&
+ InVal->getOperand(0).getValueType().isFixedLengthVector()) {
SDValue InVec = InVal->getOperand(0);
SDValue EltNo = InVal->getOperand(1);
auto InVecT = InVec.getValueType();
@@ -19576,11 +20439,10 @@ SDValue DAGCombiner::visitSCALAR_TO_VECTOR(SDNode *N) {
return LegalShuffle;
// If not we must truncate the vector.
if (VT.getVectorNumElements() != InVecT.getVectorNumElements()) {
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue ZeroIdx = DAG.getConstant(0, SDLoc(N), IdxTy);
- EVT SubVT =
- EVT::getVectorVT(*DAG.getContext(), InVecT.getVectorElementType(),
- VT.getVectorNumElements());
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(N));
+ EVT SubVT = EVT::getVectorVT(*DAG.getContext(),
+ InVecT.getVectorElementType(),
+ VT.getVectorNumElements());
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), SubVT,
LegalShuffle, ZeroIdx);
}
@@ -19597,6 +20459,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
SDValue N2 = N->getOperand(2);
+ uint64_t InsIdx = N->getConstantOperandVal(2);
// If inserting an UNDEF, just return the original vector.
if (N1.isUndef())
@@ -19657,11 +20520,6 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), VT, N0,
N1.getOperand(1), N2);
- if (!isa<ConstantSDNode>(N2))
- return SDValue();
-
- uint64_t InsIdx = cast<ConstantSDNode>(N2)->getZExtValue();
-
// Push subvector bitcasts to the output, adjusting the index as we go.
// insert_subvector(bitcast(v), bitcast(s), c1)
// -> bitcast(insert_subvector(v, s, c2))
@@ -19676,19 +20534,18 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
EVT NewVT;
SDLoc DL(N);
SDValue NewIdx;
- MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
LLVMContext &Ctx = *DAG.getContext();
unsigned NumElts = VT.getVectorNumElements();
unsigned EltSizeInBits = VT.getScalarSizeInBits();
if ((EltSizeInBits % N1SrcSVT.getSizeInBits()) == 0) {
unsigned Scale = EltSizeInBits / N1SrcSVT.getSizeInBits();
NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts * Scale);
- NewIdx = DAG.getConstant(InsIdx * Scale, DL, IdxVT);
+ NewIdx = DAG.getVectorIdxConstant(InsIdx * Scale, DL);
} else if ((N1SrcSVT.getSizeInBits() % EltSizeInBits) == 0) {
unsigned Scale = N1SrcSVT.getSizeInBits() / EltSizeInBits;
if ((NumElts % Scale) == 0 && (InsIdx % Scale) == 0) {
NewVT = EVT::getVectorVT(Ctx, N1SrcSVT, NumElts / Scale);
- NewIdx = DAG.getConstant(InsIdx / Scale, DL, IdxVT);
+ NewIdx = DAG.getVectorIdxConstant(InsIdx / Scale, DL);
}
}
if (NewIdx && hasOperation(ISD::INSERT_SUBVECTOR, NewVT)) {
@@ -19704,8 +20561,7 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
// (insert_subvector (insert_subvector A, Idx0), Idx1)
// -> (insert_subvector (insert_subvector A, Idx1), Idx0)
if (N0.getOpcode() == ISD::INSERT_SUBVECTOR && N0.hasOneUse() &&
- N1.getValueType() == N0.getOperand(1).getValueType() &&
- isa<ConstantSDNode>(N0.getOperand(2))) {
+ N1.getValueType() == N0.getOperand(1).getValueType()) {
unsigned OtherIdx = N0.getConstantOperandVal(2);
if (InsIdx < OtherIdx) {
// Swap nodes.
@@ -19722,10 +20578,8 @@ SDValue DAGCombiner::visitINSERT_SUBVECTOR(SDNode *N) {
if (N0.getOpcode() == ISD::CONCAT_VECTORS && N0.hasOneUse() &&
N0.getOperand(0).getValueType() == N1.getValueType()) {
unsigned Factor = N1.getValueType().getVectorNumElements();
-
SmallVector<SDValue, 8> Ops(N0->op_begin(), N0->op_end());
- Ops[cast<ConstantSDNode>(N2)->getZExtValue() / Factor] = N1;
-
+ Ops[InsIdx / Factor] = N1;
return DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(N), VT, Ops);
}
@@ -19769,9 +20623,9 @@ SDValue DAGCombiner::visitVECREDUCE(SDNode *N) {
// VECREDUCE over 1-element vector is just an extract.
if (VT.getVectorNumElements() == 1) {
SDLoc dl(N);
- SDValue Res = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Res =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getVectorElementType(), N0,
+ DAG.getVectorIdxConstant(0, dl));
if (Res.getValueType() != N->getValueType(0))
Res = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Res);
return Res;
@@ -19904,10 +20758,9 @@ static SDValue scalarizeBinOpOfSplats(SDNode *N, SelectionDAG &DAG) {
return SDValue();
SDLoc DL(N);
- SDValue IndexC =
- DAG.getConstant(Index0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
- SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N0, IndexC);
- SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, N1, IndexC);
+ SDValue IndexC = DAG.getVectorIdxConstant(Index0, DL);
+ SDValue X = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src0, IndexC);
+ SDValue Y = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src1, IndexC);
SDValue ScalarBO = DAG.getNode(Opcode, DL, EltVT, X, Y, N->getFlags());
// If all lanes but 1 are undefined, no need to splat the scalar result.
@@ -19937,6 +20790,7 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
SDValue Ops[] = {LHS, RHS};
EVT VT = N->getValueType(0);
unsigned Opcode = N->getOpcode();
+ SDNodeFlags Flags = N->getFlags();
// See if we can constant fold the vector operation.
if (SDValue Fold = DAG.FoldConstantVectorArithmetic(
@@ -19960,10 +20814,37 @@ SDValue DAGCombiner::SimplifyVBinOp(SDNode *N) {
(LHS.hasOneUse() || RHS.hasOneUse() || LHS == RHS)) {
SDLoc DL(N);
SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS.getOperand(0),
- RHS.getOperand(0), N->getFlags());
+ RHS.getOperand(0), Flags);
SDValue UndefV = LHS.getOperand(1);
return DAG.getVectorShuffle(VT, DL, NewBinOp, UndefV, Shuf0->getMask());
}
+
+ // Try to sink a splat shuffle after a binop with a uniform constant.
+ // This is limited to cases where neither the shuffle nor the constant have
+ // undefined elements because that could be poison-unsafe or inhibit
+ // demanded elements analysis. It is further limited to not change a splat
+ // of an inserted scalar because that may be optimized better by
+ // load-folding or other target-specific behaviors.
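+  // For example (illustrative): add (shuf X, undef, <2,2,2,2>), (splat 7)
+  //   --> shuf (add X, splat 7), undef, <2,2,2,2>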
+ if (isConstOrConstSplat(RHS) && Shuf0 && is_splat(Shuf0->getMask()) &&
+ Shuf0->hasOneUse() && Shuf0->getOperand(1).isUndef() &&
+ Shuf0->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+ // binop (splat X), (splat C) --> splat (binop X, C)
+ SDLoc DL(N);
+ SDValue X = Shuf0->getOperand(0);
+ SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, X, RHS, Flags);
+ return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+ Shuf0->getMask());
+ }
+ if (isConstOrConstSplat(LHS) && Shuf1 && is_splat(Shuf1->getMask()) &&
+ Shuf1->hasOneUse() && Shuf1->getOperand(1).isUndef() &&
+ Shuf1->getOperand(0).getOpcode() != ISD::INSERT_VECTOR_ELT) {
+ // binop (splat C), (splat X) --> splat (binop C, X)
+ SDLoc DL(N);
+ SDValue X = Shuf1->getOperand(0);
+ SDValue NewBinOp = DAG.getNode(Opcode, DL, VT, LHS, X, Flags);
+ return DAG.getVectorShuffle(VT, DL, NewBinOp, DAG.getUNDEF(VT),
+ Shuf1->getMask());
+ }
}
// The following pattern is likely to emerge with vector reduction ops. Moving
@@ -20361,8 +21242,8 @@ SDValue DAGCombiner::convertSelectOfFPConstantsToLoadOffset(
// Create a ConstantArray of the two constants.
Constant *CA = ConstantArray::get(ArrayType::get(FPTy, 2), Elts);
SDValue CPIdx = DAG.getConstantPool(CA, TLI.getPointerTy(DAG.getDataLayout()),
- TD.getPrefTypeAlignment(FPTy));
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ TD.getPrefTypeAlign(FPTy));
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
// Get offsets to the 0 and 1 elements of the array, so we can select between
// them.
@@ -20797,7 +21678,10 @@ SDValue DAGCombiner::buildSqrtEstimateImpl(SDValue Op, SDNodeFlags Flags,
EVT CCVT = getSetCCResultType(VT);
ISD::NodeType SelOpcode = VT.isVector() ? ISD::VSELECT : ISD::SELECT;
DenormalMode DenormMode = DAG.getDenormalMode(VT);
- if (DenormMode == DenormalMode::IEEE) {
+ if (DenormMode.Input == DenormalMode::IEEE) {
+ // This is specifically a check for the handling of denormal inputs,
+ // not the result.
+
// fabs(X) < SmallestNormal ? 0.0 : Est
const fltSemantics &FltSem = DAG.EVTToAPFloatSemantics(VT);
APFloat SmallestNorm = APFloat::getSmallestNormalized(FltSem);
@@ -20849,9 +21733,11 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
: (LSN->getAddressingMode() == ISD::PRE_DEC)
? -1 * C->getSExtValue()
: 0;
+ uint64_t Size =
+ MemoryLocation::getSizeOrUnknown(LSN->getMemoryVT().getStoreSize());
return {LSN->isVolatile(), LSN->isAtomic(), LSN->getBasePtr(),
Offset /*base offset*/,
- Optional<int64_t>(LSN->getMemoryVT().getStoreSize()),
+ Optional<int64_t>(Size),
LSN->getMemOperand()};
}
if (const auto *LN = cast<LifetimeSDNode>(N))
@@ -20911,21 +21797,24 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
// If we know required SrcValue1 and SrcValue2 have relatively large
// alignment compared to the size and offset of the access, we may be able
// to prove they do not alias. This check is conservative for now to catch
- // cases created by splitting vector types.
+  // cases created by splitting vector types; it only works when the offsets are
+  // multiples of the size of the data.
int64_t SrcValOffset0 = MUC0.MMO->getOffset();
int64_t SrcValOffset1 = MUC1.MMO->getOffset();
- unsigned OrigAlignment0 = MUC0.MMO->getBaseAlignment();
- unsigned OrigAlignment1 = MUC1.MMO->getBaseAlignment();
+ Align OrigAlignment0 = MUC0.MMO->getBaseAlign();
+ Align OrigAlignment1 = MUC1.MMO->getBaseAlign();
+ auto &Size0 = MUC0.NumBytes;
+ auto &Size1 = MUC1.NumBytes;
if (OrigAlignment0 == OrigAlignment1 && SrcValOffset0 != SrcValOffset1 &&
- MUC0.NumBytes.hasValue() && MUC1.NumBytes.hasValue() &&
- *MUC0.NumBytes == *MUC1.NumBytes && OrigAlignment0 > *MUC0.NumBytes) {
- int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0;
- int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1;
+ Size0.hasValue() && Size1.hasValue() && *Size0 == *Size1 &&
+ OrigAlignment0 > *Size0 && SrcValOffset0 % *Size0 == 0 &&
+ SrcValOffset1 % *Size1 == 0) {
+ int64_t OffAlign0 = SrcValOffset0 % OrigAlignment0.value();
+ int64_t OffAlign1 = SrcValOffset1 % OrigAlignment1.value();
// There is no overlap between these relatively aligned accesses of
// similar size. Return no alias.
- if ((OffAlign0 + *MUC0.NumBytes) <= OffAlign1 ||
- (OffAlign1 + *MUC1.NumBytes) <= OffAlign0)
+ if ((OffAlign0 + *Size0) <= OffAlign1 || (OffAlign1 + *Size1) <= OffAlign0)
return false;
}
@@ -20938,11 +21827,12 @@ bool DAGCombiner::isAlias(SDNode *Op0, SDNode *Op1) const {
UseAA = false;
#endif
- if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue()) {
+ if (UseAA && AA && MUC0.MMO->getValue() && MUC1.MMO->getValue() &&
+ Size0.hasValue() && Size1.hasValue()) {
// Use alias analysis information.
int64_t MinOffset = std::min(SrcValOffset0, SrcValOffset1);
- int64_t Overlap0 = *MUC0.NumBytes + SrcValOffset0 - MinOffset;
- int64_t Overlap1 = *MUC1.NumBytes + SrcValOffset1 - MinOffset;
+ int64_t Overlap0 = *Size0 + SrcValOffset0 - MinOffset;
+ int64_t Overlap1 = *Size1 + SrcValOffset1 - MinOffset;
AliasResult AAResult = AA->alias(
MemoryLocation(MUC0.MMO->getValue(), Overlap0,
UseTBAA ? MUC0.MMO->getAAInfo() : AAMDNodes()),
@@ -21099,10 +21989,10 @@ bool operator!=(const UnitT &, const UnitT &) { return false; }
// redundant, as this function gets called when visiting every store
// node, so why not let the work be done on each store as it's visited?
//
-// I believe this is mainly important because MergeConsecutiveStores
+// I believe this is mainly important because mergeConsecutiveStores
// is unable to deal with merging stores of different sizes, so unless
// we improve the chains of all the potential candidates up-front
-// before running MergeConsecutiveStores, it might only see some of
+// before running mergeConsecutiveStores, it might only see some of
// the nodes that will eventually be candidates, and then not be able
// to go from a partially-merged state to the desired final
// fully-merged state.
@@ -21131,6 +22021,12 @@ bool DAGCombiner::parallelizeChainedStores(StoreSDNode *St) {
if (BasePtr.getBase().isUndef())
return false;
+ // BaseIndexOffset assumes that offsets are fixed-size, which
+ // is not valid for scalable vectors where the offsets are
+ // scaled by `vscale`, so bail out early.
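+  // (For example, a <vscale x 4 x i32> store covers 16 * vscale bytes, which
+  // is not a compile-time constant.)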
+ if (St->getMemoryVT().isScalableVector())
+ return false;
+
// Add ST's interval.
Intervals.insert(0, (St->getMemoryVT().getSizeInBits() + 7) / 8, Unit);
diff --git a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
index 2bec8613e79c..fc6c3a145f13 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FastISel.cpp
@@ -68,7 +68,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -156,7 +155,7 @@ bool FastISel::lowerArguments() {
for (Function::const_arg_iterator I = FuncInfo.Fn->arg_begin(),
E = FuncInfo.Fn->arg_end();
I != E; ++I) {
- DenseMap<const Value *, unsigned>::iterator VI = LocalValueMap.find(&*I);
+ DenseMap<const Value *, Register>::iterator VI = LocalValueMap.find(&*I);
assert(VI != LocalValueMap.end() && "Missed an argument?");
FuncInfo.ValueMap[&*I] = VI->second;
}
@@ -165,8 +164,8 @@ bool FastISel::lowerArguments() {
/// Return the defined register if this instruction defines exactly one
/// virtual register and uses no other virtual registers. Otherwise return 0.
-static unsigned findSinkableLocalRegDef(MachineInstr &MI) {
- unsigned RegDef = 0;
+static Register findSinkableLocalRegDef(MachineInstr &MI) {
+ Register RegDef;
for (const MachineOperand &MO : MI.operands()) {
if (!MO.isReg())
continue;
@@ -174,9 +173,9 @@ static unsigned findSinkableLocalRegDef(MachineInstr &MI) {
if (RegDef)
return 0;
RegDef = MO.getReg();
- } else if (Register::isVirtualRegister(MO.getReg())) {
+ } else if (MO.getReg().isVirtual()) {
// This is another use of a vreg. Don't try to sink it.
- return 0;
+ return Register();
}
}
return RegDef;
@@ -202,7 +201,7 @@ void FastISel::flushLocalValueMap() {
bool Store = true;
if (!LocalMI.isSafeToMove(nullptr, Store))
continue;
- unsigned DefReg = findSinkableLocalRegDef(LocalMI);
+ Register DefReg = findSinkableLocalRegDef(LocalMI);
if (DefReg == 0)
continue;
@@ -217,7 +216,7 @@ void FastISel::flushLocalValueMap() {
LastFlushPoint = FuncInfo.InsertPt;
}
-static bool isRegUsedByPhiNodes(unsigned DefReg,
+static bool isRegUsedByPhiNodes(Register DefReg,
FunctionLoweringInfo &FuncInfo) {
for (auto &P : FuncInfo.PHINodesToUpdate)
if (P.second == DefReg)
@@ -225,6 +224,21 @@ static bool isRegUsedByPhiNodes(unsigned DefReg,
return false;
}
+static bool isTerminatingEHLabel(MachineBasicBlock *MBB, MachineInstr &MI) {
+ // Ignore non-EH labels.
+ if (!MI.isEHLabel())
+ return false;
+
+ // Any EH label outside a landing pad must be for an invoke. Consider it a
+ // terminator.
+ if (!MBB->isEHPad())
+ return true;
+
+ // If this is a landingpad, the first non-phi instruction will be an EH_LABEL.
+ // Don't consider that label to be a terminator.
+ return MI.getIterator() != MBB->getFirstNonPHI();
+}
+
/// Build a map of instruction orders. Return the first terminator and its
/// order. Consider EH_LABEL instructions to be terminators as well, since local
/// values for phis after invokes must be materialized before the call.
@@ -233,7 +247,7 @@ void FastISel::InstOrderMap::initialize(
unsigned Order = 0;
for (MachineInstr &I : *MBB) {
if (!FirstTerminator &&
- (I.isTerminator() || (I.isEHLabel() && &I != &MBB->front()))) {
+ (I.isTerminator() || isTerminatingEHLabel(MBB, I))) {
FirstTerminator = &I;
FirstTerminatorOrder = Order;
}
@@ -246,7 +260,7 @@ void FastISel::InstOrderMap::initialize(
}
void FastISel::sinkLocalValueMaterialization(MachineInstr &LocalMI,
- unsigned DefReg,
+ Register DefReg,
InstOrderMap &OrderMap) {
// If this register is used by a register fixup, MRI will not contain all
// the uses until after register fixups, so don't attempt to sink or DCE
@@ -341,7 +355,7 @@ bool FastISel::hasTrivialKill(const Value *V) {
// Even if the value has only one use in the LLVM IR, it is possible that
// FastISel folds the use into another instruction, so that there is more
// than one use at the Machine Instruction level.
- unsigned Reg = lookUpRegForValue(V);
+ Register Reg = lookUpRegForValue(V);
if (Reg && !MRI.use_empty(Reg))
return false;
@@ -359,11 +373,11 @@ bool FastISel::hasTrivialKill(const Value *V) {
cast<Instruction>(*I->user_begin())->getParent() == I->getParent();
}
-unsigned FastISel::getRegForValue(const Value *V) {
+Register FastISel::getRegForValue(const Value *V) {
EVT RealVT = TLI.getValueType(DL, V->getType(), /*AllowUnknown=*/true);
// Don't handle non-simple values in FastISel.
if (!RealVT.isSimple())
- return 0;
+ return Register();
// Ignore illegal types. We must do this before looking up the value
// in ValueMap because Arguments are given virtual registers regardless
@@ -374,11 +388,11 @@ unsigned FastISel::getRegForValue(const Value *V) {
if (VT == MVT::i1 || VT == MVT::i8 || VT == MVT::i16)
VT = TLI.getTypeToTransformTo(V->getContext(), VT).getSimpleVT();
else
- return 0;
+ return Register();
}
// Look up the value to see if we already have a register for it.
- unsigned Reg = lookUpRegForValue(V);
+ Register Reg = lookUpRegForValue(V);
if (Reg)
return Reg;
@@ -400,8 +414,8 @@ unsigned FastISel::getRegForValue(const Value *V) {
return Reg;
}
-unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
- unsigned Reg = 0;
+Register FastISel::materializeConstant(const Value *V, MVT VT) {
+ Register Reg;
if (const auto *CI = dyn_cast<ConstantInt>(V)) {
if (CI->getValue().getActiveBits() <= 64)
Reg = fastEmit_i(VT, VT, ISD::Constant, CI->getZExtValue());
@@ -428,9 +442,9 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
bool isExact;
(void)Flt.convertToInteger(SIntVal, APFloat::rmTowardZero, &isExact);
if (isExact) {
- unsigned IntegerReg =
+ Register IntegerReg =
getRegForValue(ConstantInt::get(V->getContext(), SIntVal));
- if (IntegerReg != 0)
+ if (IntegerReg)
Reg = fastEmit_r(IntVT.getSimpleVT(), VT, ISD::SINT_TO_FP, IntegerReg,
/*Kill=*/false);
}
@@ -452,8 +466,8 @@ unsigned FastISel::materializeConstant(const Value *V, MVT VT) {
/// Helper for getRegForValue. This function is called when the value isn't
/// already available in a register and must be materialized with new
/// instructions.
-unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
- unsigned Reg = 0;
+Register FastISel::materializeRegForValue(const Value *V, MVT VT) {
+ Register Reg;
// Give the target-specific code a try first.
if (isa<Constant>(V))
Reg = fastMaterializeConstant(cast<Constant>(V));
@@ -472,25 +486,25 @@ unsigned FastISel::materializeRegForValue(const Value *V, MVT VT) {
return Reg;
}
-unsigned FastISel::lookUpRegForValue(const Value *V) {
+Register FastISel::lookUpRegForValue(const Value *V) {
// Look up the value to see if we already have a register for it. We
// cache values defined by Instructions across blocks, and other values
// only locally. This is because Instructions already have the SSA
// def-dominates-use requirement enforced.
- DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(V);
+ DenseMap<const Value *, Register>::iterator I = FuncInfo.ValueMap.find(V);
if (I != FuncInfo.ValueMap.end())
return I->second;
return LocalValueMap[V];
}
-void FastISel::updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
+void FastISel::updateValueMap(const Value *I, Register Reg, unsigned NumRegs) {
if (!isa<Instruction>(I)) {
LocalValueMap[I] = Reg;
return;
}
- unsigned &AssignedReg = FuncInfo.ValueMap[I];
- if (AssignedReg == 0)
+ Register &AssignedReg = FuncInfo.ValueMap[I];
+ if (!AssignedReg)
// Use the new register.
AssignedReg = Reg;
else if (Reg != AssignedReg) {
@@ -504,11 +518,11 @@ void FastISel::updateValueMap(const Value *I, unsigned Reg, unsigned NumRegs) {
}
}
-std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
- unsigned IdxN = getRegForValue(Idx);
- if (IdxN == 0)
+std::pair<Register, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
+ Register IdxN = getRegForValue(Idx);
+ if (!IdxN)
// Unhandled operand. Halt "fast" selection and bail.
- return std::pair<unsigned, bool>(0, false);
+ return std::pair<Register, bool>(Register(), false);
bool IdxNIsKill = hasTrivialKill(Idx);
@@ -524,7 +538,7 @@ std::pair<unsigned, bool> FastISel::getRegForGEPIndex(const Value *Idx) {
fastEmit_r(IdxVT.getSimpleVT(), PtrVT, ISD::TRUNCATE, IdxN, IdxNIsKill);
IdxNIsKill = true;
}
- return std::pair<unsigned, bool>(IdxN, IdxNIsKill);
+ return std::pair<Register, bool>(IdxN, IdxNIsKill);
}
void FastISel::recomputeInsertPt() {
@@ -605,12 +619,12 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
// we don't have anything that canonicalizes operand order.
if (const auto *CI = dyn_cast<ConstantInt>(I->getOperand(0)))
if (isa<Instruction>(I) && cast<Instruction>(I)->isCommutative()) {
- unsigned Op1 = getRegForValue(I->getOperand(1));
+ Register Op1 = getRegForValue(I->getOperand(1));
if (!Op1)
return false;
bool Op1IsKill = hasTrivialKill(I->getOperand(1));
- unsigned ResultReg =
+ Register ResultReg =
fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op1, Op1IsKill,
CI->getZExtValue(), VT.getSimpleVT());
if (!ResultReg)
@@ -621,7 +635,7 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
return true;
}
- unsigned Op0 = getRegForValue(I->getOperand(0));
+ Register Op0 = getRegForValue(I->getOperand(0));
if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
return false;
bool Op0IsKill = hasTrivialKill(I->getOperand(0));
@@ -644,7 +658,7 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
ISDOpcode = ISD::AND;
}
- unsigned ResultReg = fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
+ Register ResultReg = fastEmit_ri_(VT.getSimpleVT(), ISDOpcode, Op0,
Op0IsKill, Imm, VT.getSimpleVT());
if (!ResultReg)
return false;
@@ -654,13 +668,13 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
return true;
}
- unsigned Op1 = getRegForValue(I->getOperand(1));
+ Register Op1 = getRegForValue(I->getOperand(1));
if (!Op1) // Unhandled operand. Halt "fast" selection and bail.
return false;
bool Op1IsKill = hasTrivialKill(I->getOperand(1));
// Now we have both operands in registers. Emit the instruction.
- unsigned ResultReg = fastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
+ Register ResultReg = fastEmit_rr(VT.getSimpleVT(), VT.getSimpleVT(),
ISDOpcode, Op0, Op0IsKill, Op1, Op1IsKill);
if (!ResultReg)
// Target-specific code wasn't able to find a machine opcode for
@@ -673,7 +687,7 @@ bool FastISel::selectBinaryOp(const User *I, unsigned ISDOpcode) {
}
bool FastISel::selectGetElementPtr(const User *I) {
- unsigned N = getRegForValue(I->getOperand(0));
+ Register N = getRegForValue(I->getOperand(0));
if (!N) // Unhandled operand. Halt "fast" selection and bail.
return false;
bool NIsKill = hasTrivialKill(I->getOperand(0));
@@ -729,8 +743,8 @@ bool FastISel::selectGetElementPtr(const User *I) {
// N = N + Idx * ElementSize;
uint64_t ElementSize = DL.getTypeAllocSize(Ty);
- std::pair<unsigned, bool> Pair = getRegForGEPIndex(Idx);
- unsigned IdxN = Pair.first;
+ std::pair<Register, bool> Pair = getRegForGEPIndex(Idx);
+ Register IdxN = Pair.first;
bool IdxNIsKill = Pair.second;
if (!IdxN) // Unhandled operand. Halt "fast" selection and bail.
return false;
@@ -778,7 +792,7 @@ bool FastISel::addStackMapLiveVars(SmallVectorImpl<MachineOperand> &Ops,
else
return false;
} else {
- unsigned Reg = getRegForValue(Val);
+ Register Reg = getRegForValue(Val);
if (!Reg)
return false;
Ops.push_back(MachineOperand::CreateReg(Reg, /*isDef=*/false));
@@ -871,7 +885,6 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
Args.reserve(NumArgs);
// Populate the argument list.
- ImmutableCallSite CS(CI);
for (unsigned ArgI = ArgIdx, ArgE = ArgIdx + NumArgs; ArgI != ArgE; ++ArgI) {
Value *V = CI->getOperand(ArgI);
@@ -880,7 +893,7 @@ bool FastISel::lowerCallOperands(const CallInst *CI, unsigned ArgIdx,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, ArgI);
+ Entry.setAttributes(CI, ArgI);
Args.push_back(Entry);
}
@@ -987,7 +1000,7 @@ bool FastISel::selectPatchpoint(const CallInst *I) {
// place these in any free register.
if (IsAnyRegCC) {
for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i) {
- unsigned Reg = getRegForValue(I->getArgOperand(i));
+ Register Reg = getRegForValue(I->getArgOperand(i));
if (!Reg)
return false;
Ops.push_back(MachineOperand::CreateReg(Reg, /*isDef=*/false));
@@ -1104,10 +1117,8 @@ bool FastISel::lowerCallTo(const CallInst *CI, const char *SymName,
bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
unsigned NumArgs) {
- ImmutableCallSite CS(CI);
-
- FunctionType *FTy = CS.getFunctionType();
- Type *RetTy = CS.getType();
+ FunctionType *FTy = CI->getFunctionType();
+ Type *RetTy = CI->getType();
ArgListTy Args;
Args.reserve(NumArgs);
@@ -1122,13 +1133,13 @@ bool FastISel::lowerCallTo(const CallInst *CI, MCSymbol *Symbol,
ArgListEntry Entry;
Entry.Val = V;
Entry.Ty = V->getType();
- Entry.setAttributes(&CS, ArgI);
+ Entry.setAttributes(CI, ArgI);
Args.push_back(Entry);
}
- TLI.markLibCallAttributes(MF, CS.getCallingConv(), Args);
+ TLI.markLibCallAttributes(MF, CI->getCallingConv(), Args);
CallLoweringInfo CLI;
- CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), CS, NumArgs);
+ CLI.setCallee(RetTy, FTy, Symbol, std::move(Args), *CI, NumArgs);
return lowerCallTo(CLI);
}
@@ -1203,7 +1214,16 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
// the various CC lowering callbacks.
Flags.setByVal();
}
- if (Arg.IsByVal || Arg.IsInAlloca) {
+ if (Arg.IsPreallocated) {
+ Flags.setPreallocated();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // preallocated. This way we can know how many bytes we should've
+ // allocated and how many bytes a callee cleanup function will pop. If we
+ // port preallocated to more targets, we'll have to add custom
+ // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
+ if (Arg.IsByVal || Arg.IsInAlloca || Arg.IsPreallocated) {
PointerType *Ty = cast<PointerType>(Arg.Ty);
Type *ElementTy = Ty->getElementType();
unsigned FrameSize =
@@ -1211,17 +1231,17 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
// For ByVal, alignment should come from FE. BE will guess if this info
// is not there, but there are cases it cannot get right.
- unsigned FrameAlign = Arg.Alignment;
+ MaybeAlign FrameAlign = Arg.Alignment;
if (!FrameAlign)
- FrameAlign = TLI.getByValTypeAlignment(ElementTy, DL);
+ FrameAlign = Align(TLI.getByValTypeAlignment(ElementTy, DL));
Flags.setByValSize(FrameSize);
- Flags.setByValAlign(Align(FrameAlign));
+ Flags.setByValAlign(*FrameAlign);
}
if (Arg.IsNest)
Flags.setNest();
if (NeedsRegBlock)
Flags.setInConsecutiveRegs();
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(Arg.Ty)));
+ Flags.setOrigAlign(DL.getABITypeAlign(Arg.Ty));
CLI.OutVals.push_back(Arg.Val);
CLI.OutFlags.push_back(Flags);
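
The alignment handling in this hunk is part of the wider unsigned-to-Align migration: MaybeAlign turns the old magic value 0 ("no alignment recorded") into an explicit empty state. A small sketch of the pattern, assuming only llvm::Align and llvm::MaybeAlign from llvm/Support/Alignment.h (the helper name is invented for illustration):

#include "llvm/Support/Alignment.h"
using namespace llvm;

// Choose the frame alignment for a byval argument: prefer the alignment
// recorded on the argument, otherwise fall back to a type-based default
// computed by the caller (TLI.getByValTypeAlignment in the hunk above).
static Align byValFrameAlign(MaybeAlign Specified, Align TypeDefault) {
  if (Specified)
    return *Specified; // explicit alignment was provided
  return TypeDefault;  // nothing recorded; use the default
}
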
@@ -1234,29 +1254,26 @@ bool FastISel::lowerCallTo(CallLoweringInfo &CLI) {
assert(CLI.Call && "No call instruction specified.");
CLI.Call->setPhysRegsDeadExcept(CLI.InRegs, TRI);
- if (CLI.NumResultRegs && CLI.CS)
- updateValueMap(CLI.CS->getInstruction(), CLI.ResultReg, CLI.NumResultRegs);
+ if (CLI.NumResultRegs && CLI.CB)
+ updateValueMap(CLI.CB, CLI.ResultReg, CLI.NumResultRegs);
// Set labels for heapallocsite call.
- if (CLI.CS)
- if (MDNode *MD = CLI.CS->getInstruction()->getMetadata("heapallocsite"))
+ if (CLI.CB)
+ if (MDNode *MD = CLI.CB->getMetadata("heapallocsite"))
CLI.Call->setHeapAllocMarker(*MF, MD);
return true;
}
bool FastISel::lowerCall(const CallInst *CI) {
- ImmutableCallSite CS(CI);
-
- FunctionType *FuncTy = CS.getFunctionType();
- Type *RetTy = CS.getType();
+ FunctionType *FuncTy = CI->getFunctionType();
+ Type *RetTy = CI->getType();
ArgListTy Args;
ArgListEntry Entry;
- Args.reserve(CS.arg_size());
+ Args.reserve(CI->arg_size());
- for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
- i != e; ++i) {
+ for (auto i = CI->arg_begin(), e = CI->arg_end(); i != e; ++i) {
Value *V = *i;
// Skip empty types
@@ -1267,14 +1284,14 @@ bool FastISel::lowerCall(const CallInst *CI) {
Entry.Ty = V->getType();
// Skip the first return-type Attribute to get to params.
- Entry.setAttributes(&CS, i - CS.arg_begin());
+ Entry.setAttributes(CI, i - CI->arg_begin());
Args.push_back(Entry);
}
// Check if target-independent constraints permit a tail call here.
// Target-dependent constraints are checked within fastLowerCall.
bool IsTailCall = CI->isTailCall();
- if (IsTailCall && !isInTailCallPosition(CS, TM))
+ if (IsTailCall && !isInTailCallPosition(*CI, TM))
IsTailCall = false;
if (IsTailCall && MF->getFunction()
.getFnAttribute("disable-tail-calls")
@@ -1282,7 +1299,7 @@ bool FastISel::lowerCall(const CallInst *CI) {
IsTailCall = false;
CallLoweringInfo CLI;
- CLI.setCallee(RetTy, FuncTy, CI->getCalledValue(), std::move(Args), CS)
+ CLI.setCallee(RetTy, FuncTy, CI->getCalledOperand(), std::move(Args), *CI)
.setTailCall(IsTailCall);
return lowerCallTo(CLI);
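
The call-lowering hunks above are a mechanical ImmutableCallSite-to-CallBase migration, and the accessor renames are easiest to see collected in one place. A hedged sketch using only CallBase/CallInst methods that already appear in this patch (the helper itself is illustrative, not part of the change):

#include "llvm/IR/Instructions.h"
using namespace llvm;

// Old CallSite accessors and their CallBase/CallInst replacements.
static void summarizeCall(const CallInst *CI) {
  FunctionType *FTy = CI->getFunctionType(); // was CS.getFunctionType()
  Type *RetTy = CI->getType();               // was CS.getType()
  Value *Callee = CI->getCalledOperand();    // was CS.getCalledValue()
  unsigned NumArgs = CI->arg_size();         // was CS.arg_size()
  (void)FTy; (void)RetTy; (void)Callee; (void)NumArgs;
}
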
@@ -1292,7 +1309,7 @@ bool FastISel::selectCall(const User *I) {
const CallInst *Call = cast<CallInst>(I);
// Handle simple inline asms.
- if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledValue())) {
+ if (const InlineAsm *IA = dyn_cast<InlineAsm>(Call->getCalledOperand())) {
// If the inline asm has side effects, then make sure that no local value
// lives across by flushing the local value map.
if (IA->hasSideEffects())
@@ -1307,12 +1324,19 @@ bool FastISel::selectCall(const User *I) {
ExtraInfo |= InlineAsm::Extra_HasSideEffects;
if (IA->isAlignStack())
ExtraInfo |= InlineAsm::Extra_IsAlignStack;
+ if (Call->isConvergent())
+ ExtraInfo |= InlineAsm::Extra_IsConvergent;
ExtraInfo |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::INLINEASM))
- .addExternalSymbol(IA->getAsmString().c_str())
- .addImm(ExtraInfo);
+ MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::INLINEASM));
+ MIB.addExternalSymbol(IA->getAsmString().c_str());
+ MIB.addImm(ExtraInfo);
+
+ const MDNode *SrcLoc = Call->getMetadata("srcloc");
+ if (SrcLoc)
+ MIB.addMetadata(SrcLoc);
+
return true;
}
@@ -1350,13 +1374,15 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
const DbgDeclareInst *DI = cast<DbgDeclareInst>(II);
assert(DI->getVariable() && "Missing variable");
if (!FuncInfo.MF->getMMI().hasDebugInfo()) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
+ << " (!hasDebugInfo)\n");
return true;
}
const Value *Address = DI->getAddress();
if (!Address || isa<UndefValue>(Address)) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
+ << " (bad/undef address)\n");
return true;
}
@@ -1368,7 +1394,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
return true;
Optional<MachineOperand> Op;
- if (unsigned Reg = lookUpRegForValue(Address))
+ if (Register Reg = lookUpRegForValue(Address))
Op = MachineOperand::CreateReg(Reg, false);
// If we have a VLA that has a "use" in a metadata node that's then used
@@ -1393,15 +1419,14 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
"Expected inlined-at fields to agree");
// A dbg.declare describes the address of a source variable, so lower it
// into an indirect DBG_VALUE.
- auto *Expr = DI->getExpression();
- Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ false,
- *Op, DI->getVariable(), Expr);
+ TII.get(TargetOpcode::DBG_VALUE), /*IsIndirect*/ true,
+ *Op, DI->getVariable(), DI->getExpression());
} else {
// We can't yet handle anything else here because it would require
// generating code, thus altering codegen because of debug info.
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI
+ << " (no materialized reg for address)\n");
}
return true;
}
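
The dbg.declare change above swaps one encoding of an in-memory variable location for another: instead of a direct DBG_VALUE whose expression ends in DW_OP_deref, it emits an indirect DBG_VALUE with the unmodified expression. A hedged sketch of the two forms, using only calls that appear elsewhere in this patch; the parameters stand in for the values FastISel has in scope at this point:

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

static void emitDeclareLocation(MachineBasicBlock &MBB,
                                MachineBasicBlock::iterator InsertPt,
                                const DebugLoc &DL, const TargetInstrInfo &TII,
                                Register AddrReg, const DILocalVariable *Var,
                                const DIExpression *Expr) {
  // Form used before this patch: direct DBG_VALUE plus an explicit deref.
  const DIExpression *Deref =
      DIExpression::append(Expr, {dwarf::DW_OP_deref});
  BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::DBG_VALUE),
          /*IsIndirect=*/false, AddrReg, Var, Deref);

  // Form used after this patch: indirect DBG_VALUE, expression left alone.
  BuildMI(MBB, InsertPt, DL, TII.get(TargetOpcode::DBG_VALUE),
          /*IsIndirect=*/true, AddrReg, Var, Expr);
}
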
@@ -1412,38 +1437,37 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
const Value *V = DI->getValue();
assert(DI->getVariable()->isValidLocationForIntrinsic(DbgLoc) &&
"Expected inlined-at fields to agree");
- if (!V) {
+ if (!V || isa<UndefValue>(V)) {
// Currently the optimizer can produce this; insert an undef to
- // help debugging. Probably the optimizer should not do this.
+ // help debugging.
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, false, 0U,
DI->getVariable(), DI->getExpression());
} else if (const auto *CI = dyn_cast<ConstantInt>(V)) {
if (CI->getBitWidth() > 64)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addCImm(CI)
- .addReg(0U)
+ .addImm(0U)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
else
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addImm(CI->getZExtValue())
- .addReg(0U)
+ .addImm(0U)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
} else if (const auto *CF = dyn_cast<ConstantFP>(V)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II)
.addFPImm(CF)
- .addReg(0U)
+ .addImm(0U)
.addMetadata(DI->getVariable())
.addMetadata(DI->getExpression());
- } else if (unsigned Reg = lookUpRegForValue(V)) {
+ } else if (Register Reg = lookUpRegForValue(V)) {
// FIXME: This does not handle register-indirect values at offset 0.
bool IsIndirect = false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, IsIndirect, Reg,
DI->getVariable(), DI->getExpression());
} else {
- // We can't yet handle anything else here because it would require
- // generating code, thus altering codegen because of debug info.
+ // We don't know how to handle other cases, so drop the debug info.
LLVM_DEBUG(dbgs() << "Dropping debug info for " << *DI << "\n");
}
return true;
@@ -1469,7 +1493,7 @@ bool FastISel::selectIntrinsicCall(const IntrinsicInst *II) {
case Intrinsic::launder_invariant_group:
case Intrinsic::strip_invariant_group:
case Intrinsic::expect: {
- unsigned ResultReg = getRegForValue(II->getArgOperand(0));
+ Register ResultReg = getRegForValue(II->getArgOperand(0));
if (!ResultReg)
return false;
updateValueMap(II, ResultReg);
@@ -1507,14 +1531,14 @@ bool FastISel::selectCast(const User *I, unsigned Opcode) {
if (!TLI.isTypeLegal(SrcVT))
return false;
- unsigned InputReg = getRegForValue(I->getOperand(0));
+ Register InputReg = getRegForValue(I->getOperand(0));
if (!InputReg)
// Unhandled operand. Halt "fast" selection and bail.
return false;
bool InputRegIsKill = hasTrivialKill(I->getOperand(0));
- unsigned ResultReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
+ Register ResultReg = fastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(),
Opcode, InputReg, InputRegIsKill);
if (!ResultReg)
return false;
@@ -1526,7 +1550,7 @@ bool FastISel::selectCast(const User *I, unsigned Opcode) {
bool FastISel::selectBitCast(const User *I) {
// If the bitcast doesn't change the type, just use the operand value.
if (I->getType() == I->getOperand(0)->getType()) {
- unsigned Reg = getRegForValue(I->getOperand(0));
+ Register Reg = getRegForValue(I->getOperand(0));
if (!Reg)
return false;
updateValueMap(I, Reg);
@@ -1543,13 +1567,13 @@ bool FastISel::selectBitCast(const User *I) {
MVT SrcVT = SrcEVT.getSimpleVT();
MVT DstVT = DstEVT.getSimpleVT();
- unsigned Op0 = getRegForValue(I->getOperand(0));
+ Register Op0 = getRegForValue(I->getOperand(0));
if (!Op0) // Unhandled operand. Halt "fast" selection and bail.
return false;
bool Op0IsKill = hasTrivialKill(I->getOperand(0));
// First, try to perform the bitcast by inserting a reg-reg copy.
- unsigned ResultReg = 0;
+ Register ResultReg;
if (SrcVT == DstVT) {
const TargetRegisterClass *SrcClass = TLI.getRegClassFor(SrcVT);
const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
@@ -1572,6 +1596,27 @@ bool FastISel::selectBitCast(const User *I) {
return true;
}
+bool FastISel::selectFreeze(const User *I) {
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
+ // Unhandled operand.
+ return false;
+
+ EVT ETy = TLI.getValueType(DL, I->getOperand(0)->getType());
+ if (ETy == MVT::Other || !TLI.isTypeLegal(ETy))
+ // Unhandled type, bail out.
+ return false;
+
+ MVT Ty = ETy.getSimpleVT();
+ const TargetRegisterClass *TyRegClass = TLI.getRegClassFor(Ty);
+ Register ResultReg = createResultReg(TyRegClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
+
+ updateValueMap(I, ResultReg);
+ return true;
+}
+
// Remove local value instructions starting from the instruction after
// SavedLastLocalValue to the current function insert point.
void FastISel::removeDeadLocalValueCode(MachineInstr *SavedLastLocalValue)
@@ -1607,9 +1652,9 @@ bool FastISel::selectInstruction(const Instruction *I) {
}
// FastISel does not handle any operand bundles except OB_funclet.
- if (ImmutableCallSite CS = ImmutableCallSite(I))
- for (unsigned i = 0, e = CS.getNumOperandBundles(); i != e; ++i)
- if (CS.getOperandBundleAt(i).getTagID() != LLVMContext::OB_funclet)
+ if (auto *Call = dyn_cast<CallBase>(I))
+ for (unsigned i = 0, e = Call->getNumOperandBundles(); i != e; ++i)
+ if (Call->getOperandBundleAt(i).getTagID() != LLVMContext::OB_funclet)
return false;
DbgLoc = I->getDebugLoc();
@@ -1710,14 +1755,14 @@ void FastISel::finishCondBranch(const BasicBlock *BranchBB,
/// Emit an FNeg operation.
bool FastISel::selectFNeg(const User *I, const Value *In) {
- unsigned OpReg = getRegForValue(In);
+ Register OpReg = getRegForValue(In);
if (!OpReg)
return false;
bool OpRegIsKill = hasTrivialKill(In);
// If the target has ISD::FNEG, use it.
EVT VT = TLI.getValueType(DL, I->getType());
- unsigned ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG,
+ Register ResultReg = fastEmit_r(VT.getSimpleVT(), VT.getSimpleVT(), ISD::FNEG,
OpReg, OpRegIsKill);
if (ResultReg) {
updateValueMap(I, ResultReg);
@@ -1732,12 +1777,12 @@ bool FastISel::selectFNeg(const User *I, const Value *In) {
if (!TLI.isTypeLegal(IntVT))
return false;
- unsigned IntReg = fastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
+ Register IntReg = fastEmit_r(VT.getSimpleVT(), IntVT.getSimpleVT(),
ISD::BITCAST, OpReg, OpRegIsKill);
if (!IntReg)
return false;
- unsigned IntResultReg = fastEmit_ri_(
+ Register IntResultReg = fastEmit_ri_(
IntVT.getSimpleVT(), ISD::XOR, IntReg, /*IsKill=*/true,
UINT64_C(1) << (VT.getSizeInBits() - 1), IntVT.getSimpleVT());
if (!IntResultReg)
@@ -1771,7 +1816,7 @@ bool FastISel::selectExtractValue(const User *U) {
// Get the base result register.
unsigned ResultReg;
- DenseMap<const Value *, unsigned>::iterator I = FuncInfo.ValueMap.find(Op0);
+ DenseMap<const Value *, Register>::iterator I = FuncInfo.ValueMap.find(Op0);
if (I != FuncInfo.ValueMap.end())
ResultReg = I->second;
else if (isa<Instruction>(Op0))
@@ -1903,7 +1948,7 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
return selectCast(I, ISD::ZERO_EXTEND);
if (DstVT.bitsLT(SrcVT))
return selectCast(I, ISD::TRUNCATE);
- unsigned Reg = getRegForValue(I->getOperand(0));
+ Register Reg = getRegForValue(I->getOperand(0));
if (!Reg)
return false;
updateValueMap(I, Reg);
@@ -1913,6 +1958,9 @@ bool FastISel::selectOperator(const User *I, unsigned Opcode) {
case Instruction::ExtractValue:
return selectExtractValue(I);
+ case Instruction::Freeze:
+ return selectFreeze(I);
+
case Instruction::PHI:
llvm_unreachable("FastISel shouldn't visit PHI nodes!");
@@ -1975,7 +2023,7 @@ unsigned FastISel::fastEmit_ri(MVT, MVT, unsigned, unsigned /*Op0*/,
/// instruction with an immediate operand using fastEmit_ri.
/// If that fails, it materializes the immediate into a register and tries
/// fastEmit_rr instead.
-unsigned FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
+Register FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
bool Op0IsKill, uint64_t Imm, MVT ImmType) {
// If this is a multiply by a power of two, emit this as a shift left.
if (Opcode == ISD::MUL && isPowerOf2_64(Imm)) {
@@ -1994,10 +2042,10 @@ unsigned FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
return 0;
// First check if immediate type is legal. If not, we can't use the ri form.
- unsigned ResultReg = fastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
+ Register ResultReg = fastEmit_ri(VT, VT, Opcode, Op0, Op0IsKill, Imm);
if (ResultReg)
return ResultReg;
- unsigned MaterialReg = fastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
+ Register MaterialReg = fastEmit_i(ImmType, ImmType, ISD::Constant, Imm);
bool IsImmKill = true;
if (!MaterialReg) {
// This is a bit ugly/slow, but failing here means falling out of
@@ -2018,19 +2066,19 @@ unsigned FastISel::fastEmit_ri_(MVT VT, unsigned Opcode, unsigned Op0,
return fastEmit_rr(VT, VT, Opcode, Op0, Op0IsKill, MaterialReg, IsImmKill);
}
-unsigned FastISel::createResultReg(const TargetRegisterClass *RC) {
+Register FastISel::createResultReg(const TargetRegisterClass *RC) {
return MRI.createVirtualRegister(RC);
}
-unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
+Register FastISel::constrainOperandRegClass(const MCInstrDesc &II, Register Op,
unsigned OpNum) {
- if (Register::isVirtualRegister(Op)) {
+ if (Op.isVirtual()) {
const TargetRegisterClass *RegClass =
TII.getRegClass(II, OpNum, &TRI, *FuncInfo.MF);
if (!MRI.constrainRegClass(Op, RegClass)) {
// If it's not legal to COPY between the register classes, something
// has gone very wrong before we got here.
- unsigned NewOp = createResultReg(RegClass);
+ Register NewOp = createResultReg(RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), NewOp).addReg(Op);
return NewOp;
@@ -2039,21 +2087,21 @@ unsigned FastISel::constrainOperandRegClass(const MCInstrDesc &II, unsigned Op,
return Op;
}
-unsigned FastISel::fastEmitInst_(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_(unsigned MachineInstOpcode,
const TargetRegisterClass *RC) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg);
return ResultReg;
}
-unsigned FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
if (II.getNumDefs() >= 1)
@@ -2069,13 +2117,13 @@ unsigned FastISel::fastEmitInst_r(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1,
bool Op1IsKill) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
@@ -2093,14 +2141,14 @@ unsigned FastISel::fastEmitInst_rr(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1,
bool Op1IsKill, unsigned Op2,
bool Op2IsKill) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
@@ -2121,12 +2169,12 @@ unsigned FastISel::fastEmitInst_rrr(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, uint64_t Imm) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
if (II.getNumDefs() >= 1)
@@ -2143,13 +2191,13 @@ unsigned FastISel::fastEmitInst_ri(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, uint64_t Imm1,
uint64_t Imm2) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
if (II.getNumDefs() >= 1)
@@ -2168,12 +2216,12 @@ unsigned FastISel::fastEmitInst_rii(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_f(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_f(unsigned MachineInstOpcode,
const TargetRegisterClass *RC,
const ConstantFP *FPImm) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
if (II.getNumDefs() >= 1)
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, ResultReg)
@@ -2187,13 +2235,13 @@ unsigned FastISel::fastEmitInst_f(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, unsigned Op0,
bool Op0IsKill, unsigned Op1,
bool Op1IsKill, uint64_t Imm) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
@@ -2213,9 +2261,9 @@ unsigned FastISel::fastEmitInst_rri(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
+Register FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
const TargetRegisterClass *RC, uint64_t Imm) {
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
const MCInstrDesc &II = TII.get(MachineInstOpcode);
if (II.getNumDefs() >= 1)
@@ -2229,9 +2277,9 @@ unsigned FastISel::fastEmitInst_i(unsigned MachineInstOpcode,
return ResultReg;
}
-unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
+Register FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
bool Op0IsKill, uint32_t Idx) {
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(RetVT));
assert(Register::isVirtualRegister(Op0) &&
"Cannot yet extract from physregs");
const TargetRegisterClass *RC = MRI.getRegClass(Op0);
@@ -2243,7 +2291,7 @@ unsigned FastISel::fastEmitInst_extractsubreg(MVT RetVT, unsigned Op0,
/// Emit MachineInstrs to compute the value of Op with all but the least
/// significant bit set to zero.
-unsigned FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
+Register FastISel::fastEmitZExtFromI1(MVT VT, unsigned Op0, bool Op0IsKill) {
return fastEmit_ri(VT, VT, ISD::AND, Op0, Op0IsKill, 1);
}
@@ -2305,7 +2353,7 @@ bool FastISel::handlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
if (const auto *Inst = dyn_cast<Instruction>(PHIOp))
DbgLoc = Inst->getDebugLoc();
- unsigned Reg = getRegForValue(PHIOp);
+ Register Reg = getRegForValue(PHIOp);
if (!Reg) {
FuncInfo.PHINodesToUpdate.resize(FuncInfo.OrigNumPHINodesToUpdate);
return false;
@@ -2351,7 +2399,7 @@ bool FastISel::tryToFoldLoad(const LoadInst *LI, const Instruction *FoldInst) {
// Figure out which vreg this is going into. If there is no assigned vreg yet
// then there actually was no reference to it. Perhaps the load is referenced
// by a dead instruction.
- unsigned LoadReg = getRegForValue(LI);
+ Register LoadReg = getRegForValue(LI);
if (!LoadReg)
return false;
@@ -2394,18 +2442,18 @@ MachineMemOperand *
FastISel::createMachineMemOperandFor(const Instruction *I) const {
const Value *Ptr;
Type *ValTy;
- unsigned Alignment;
+ MaybeAlign Alignment;
MachineMemOperand::Flags Flags;
bool IsVolatile;
if (const auto *LI = dyn_cast<LoadInst>(I)) {
- Alignment = LI->getAlignment();
+ Alignment = LI->getAlign();
IsVolatile = LI->isVolatile();
Flags = MachineMemOperand::MOLoad;
Ptr = LI->getPointerOperand();
ValTy = LI->getType();
} else if (const auto *SI = dyn_cast<StoreInst>(I)) {
- Alignment = SI->getAlignment();
+ Alignment = SI->getAlign();
IsVolatile = SI->isVolatile();
Flags = MachineMemOperand::MOStore;
Ptr = SI->getPointerOperand();
@@ -2421,8 +2469,8 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const {
AAMDNodes AAInfo;
I->getAAMetadata(AAInfo);
- if (Alignment == 0) // Ensure that codegen never sees alignment 0.
- Alignment = DL.getABITypeAlignment(ValTy);
+ if (!Alignment) // Ensure that codegen never sees alignment 0.
+ Alignment = DL.getABITypeAlign(ValTy);
unsigned Size = DL.getTypeStoreSize(ValTy);
@@ -2436,7 +2484,7 @@ FastISel::createMachineMemOperandFor(const Instruction *I) const {
Flags |= MachineMemOperand::MOInvariant;
return FuncInfo.MF->getMachineMemOperand(MachinePointerInfo(Ptr), Flags, Size,
- Alignment, AAInfo, Ranges);
+ *Alignment, AAInfo, Ranges);
}
CmpInst::Predicate FastISel::optimizeCmpPredicate(const CmpInst *CI) const {
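
Most of the churn in the FastISel.cpp hunks above is the mechanical unsigned-to-Register migration. Register wraps the same integer, but it gives the "no register" state and the virtual/physical distinction names, which is what lets the `== 0` and `Register::isVirtualRegister(...)` idioms shrink to the shorter forms used throughout this patch. A short sketch of the resulting idioms (the classify function is illustrative only):

#include "llvm/CodeGen/Register.h"
using namespace llvm;

static const char *classify(Register Reg) {
  if (!Reg)              // replaces `Reg == 0`: no register assigned
    return "none";
  if (Reg.isVirtual())   // replaces Register::isVirtualRegister(Reg)
    return "virtual";
  if (Reg.isPhysical())  // replaces Register::isPhysicalRegister(Reg)
    return "physical";
  return "stack slot or other encoding";
}
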
diff --git a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
index fa33400cd4b3..5cf83cff3a90 100644
--- a/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/FunctionLoweringInfo.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/FunctionLoweringInfo.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
@@ -85,7 +86,6 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
TLI = MF->getSubtarget().getTargetLowering();
RegInfo = &MF->getRegInfo();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
- unsigned StackAlign = TFI->getStackAlignment();
DA = DAG->getDivergenceAnalysis();
// Check whether the function can return without sret-demotion.
@@ -130,19 +130,31 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
// Initialize the mapping of values to registers. This is only set up for
// instruction values that are used outside of the block that defines
// them.
+ const Align StackAlign = TFI->getStackAlign();
for (const BasicBlock &BB : *Fn) {
for (const Instruction &I : BB) {
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
Type *Ty = AI->getAllocatedType();
- unsigned Align =
- std::max((unsigned)MF->getDataLayout().getPrefTypeAlignment(Ty),
- AI->getAlignment());
+ Align TyPrefAlign = MF->getDataLayout().getPrefTypeAlign(Ty);
+ // The "specified" alignment is the alignment written on the alloca,
+ // or the preferred alignment of the type if none is specified.
+ //
+ // (Unspecified alignment on allocas will be going away soon.)
+ Align SpecifiedAlign = AI->getAlign();
+
+ // If the preferred alignment of the type is higher than the specified
+ // alignment of the alloca, promote the alignment, as long as it doesn't
+ // require realigning the stack.
+ //
+ // FIXME: Do we really want to second-guess the IR in isel?
+ Align Alignment =
+ std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);
// Static allocas can be folded into the initial stack frame
// adjustment. For targets that don't realign the stack, don't
// do this if there is an extra alignment requirement.
if (AI->isStaticAlloca() &&
- (TFI->isStackRealignable() || (Align <= StackAlign))) {
+ (TFI->isStackRealignable() || (Alignment <= StackAlign))) {
const ConstantInt *CUI = cast<ConstantInt>(AI->getArraySize());
uint64_t TySize =
MF->getDataLayout().getTypeAllocSize(Ty).getKnownMinSize();
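
The alloca handling above replaces an unconditional max() of the preferred and written alignments with a clamped form: the type's preferred alignment is only honoured up to the stack alignment, while an alignment written in the IR is always respected. The formula in isolation (names match the patch; the free function is just for illustration):

#include "llvm/Support/Alignment.h"
#include <algorithm>
using namespace llvm;

// max(min(TyPrefAlign, StackAlign), SpecifiedAlign):
//  - never promote past what the stack already guarantees for free,
//  - never demote below what the IR explicitly requested.
static Align allocaAlignment(Align TyPrefAlign, Align StackAlign,
                             Align SpecifiedAlign) {
  return std::max(std::min(TyPrefAlign, StackAlign), SpecifiedAlign);
}
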
@@ -154,15 +166,15 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
if (Iter != CatchObjects.end() && TLI->needsFixedCatchObjects()) {
FrameIndex = MF->getFrameInfo().CreateFixedObject(
TySize, 0, /*IsImmutable=*/false, /*isAliased=*/true);
- MF->getFrameInfo().setObjectAlignment(FrameIndex, Align);
+ MF->getFrameInfo().setObjectAlignment(FrameIndex, Alignment);
} else {
- FrameIndex =
- MF->getFrameInfo().CreateStackObject(TySize, Align, false, AI);
+ FrameIndex = MF->getFrameInfo().CreateStackObject(TySize, Alignment,
+ false, AI);
}
// Scalable vectors may need a special StackID to distinguish
// them from other (fixed size) stack objects.
- if (Ty->isVectorTy() && Ty->getVectorIsScalable())
+ if (isa<ScalableVectorType>(Ty))
MF->getFrameInfo().setStackID(FrameIndex,
TFI->getStackIDForScalableVectors());
@@ -176,21 +188,20 @@ void FunctionLoweringInfo::set(const Function &fn, MachineFunction &mf,
// FIXME: Overaligned static allocas should be grouped into
// a single dynamic allocation instead of using a separate
// stack allocation for each one.
- if (Align <= StackAlign)
- Align = 0;
// Inform the Frame Information that we have variable-sized objects.
- MF->getFrameInfo().CreateVariableSizedObject(Align ? Align : 1, AI);
+ MF->getFrameInfo().CreateVariableSizedObject(
+ Alignment <= StackAlign ? Align(1) : Alignment, AI);
}
}
// Look for inline asm that clobbers the SP register.
- if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- ImmutableCallSite CS(&I);
- if (isa<InlineAsm>(CS.getCalledValue())) {
+ if (auto *Call = dyn_cast<CallBase>(&I)) {
+ if (Call->isInlineAsm()) {
unsigned SP = TLI->getStackPointerRegisterToSaveRestore();
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
std::vector<TargetLowering::AsmOperandInfo> Ops =
- TLI->ParseConstraints(Fn->getParent()->getDataLayout(), TRI, CS);
+ TLI->ParseConstraints(Fn->getParent()->getDataLayout(), TRI,
+ *Call);
for (TargetLowering::AsmOperandInfo &Op : Ops) {
if (Op.Type == InlineAsm::isClobber) {
// Clobbers don't have SDValue operands, hence SDValue().
@@ -354,7 +365,7 @@ void FunctionLoweringInfo::clear() {
}
/// CreateReg - Allocate a single virtual register for the given type.
-unsigned FunctionLoweringInfo::CreateReg(MVT VT, bool isDivergent) {
+Register FunctionLoweringInfo::CreateReg(MVT VT, bool isDivergent) {
return RegInfo->createVirtualRegister(
MF->getSubtarget().getTargetLowering()->getRegClassFor(VT, isDivergent));
}
@@ -366,29 +377,29 @@ unsigned FunctionLoweringInfo::CreateReg(MVT VT, bool isDivergent) {
/// In the case that the given value has struct or array type, this function
/// will assign registers for each member or element.
///
-unsigned FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
+Register FunctionLoweringInfo::CreateRegs(Type *Ty, bool isDivergent) {
const TargetLowering *TLI = MF->getSubtarget().getTargetLowering();
SmallVector<EVT, 4> ValueVTs;
ComputeValueVTs(*TLI, MF->getDataLayout(), Ty, ValueVTs);
- unsigned FirstReg = 0;
+ Register FirstReg;
for (unsigned Value = 0, e = ValueVTs.size(); Value != e; ++Value) {
EVT ValueVT = ValueVTs[Value];
MVT RegisterVT = TLI->getRegisterType(Ty->getContext(), ValueVT);
unsigned NumRegs = TLI->getNumRegisters(Ty->getContext(), ValueVT);
for (unsigned i = 0; i != NumRegs; ++i) {
- unsigned R = CreateReg(RegisterVT, isDivergent);
+ Register R = CreateReg(RegisterVT, isDivergent);
if (!FirstReg) FirstReg = R;
}
}
return FirstReg;
}
-unsigned FunctionLoweringInfo::CreateRegs(const Value *V) {
- return CreateRegs(V->getType(), DA && !TLI->requiresUniformRegister(*MF, V) &&
- DA->isDivergent(V));
+Register FunctionLoweringInfo::CreateRegs(const Value *V) {
+ return CreateRegs(V->getType(), DA && DA->isDivergent(V) &&
+ !TLI->requiresUniformRegister(*MF, V));
}
/// GetLiveOutRegInfo - Gets LiveOutInfo for a register, returning NULL if the
@@ -397,7 +408,7 @@ unsigned FunctionLoweringInfo::CreateRegs(const Value *V) {
/// the larger bit width by zero extension. The bit width must be no smaller
/// than the LiveOutInfo's existing bit width.
const FunctionLoweringInfo::LiveOutInfo *
-FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) {
+FunctionLoweringInfo::GetLiveOutRegInfo(Register Reg, unsigned BitWidth) {
if (!LiveOutRegInfo.inBounds(Reg))
return nullptr;
@@ -407,7 +418,7 @@ FunctionLoweringInfo::GetLiveOutRegInfo(unsigned Reg, unsigned BitWidth) {
if (BitWidth > LOI->Known.getBitWidth()) {
LOI->NumSignBits = 1;
- LOI->Known = LOI->Known.zext(BitWidth, false /* => any extend */);
+ LOI->Known = LOI->Known.anyext(BitWidth);
}
return LOI;
@@ -431,7 +442,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
IntVT = TLI->getTypeToTransformTo(PN->getContext(), IntVT);
unsigned BitWidth = IntVT.getSizeInBits();
- unsigned DestReg = ValueMap[PN];
+ Register DestReg = ValueMap[PN];
if (!Register::isVirtualRegister(DestReg))
return;
LiveOutRegInfo.grow(DestReg);
@@ -452,7 +463,7 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
} else {
assert(ValueMap.count(V) && "V should have been placed in ValueMap when its "
"CopyToReg node was created.");
- unsigned SrcReg = ValueMap[V];
+ Register SrcReg = ValueMap[V];
if (!Register::isVirtualRegister(SrcReg)) {
DestLOI.IsValid = false;
return;
@@ -487,8 +498,8 @@ void FunctionLoweringInfo::ComputePHILiveOutRegInfo(const PHINode *PN) {
assert(ValueMap.count(V) && "V should have been placed in ValueMap when "
"its CopyToReg node was created.");
- unsigned SrcReg = ValueMap[V];
- if (!Register::isVirtualRegister(SrcReg)) {
+ Register SrcReg = ValueMap[V];
+ if (!SrcReg.isVirtual()) {
DestLOI.IsValid = false;
return;
}
@@ -522,11 +533,11 @@ int FunctionLoweringInfo::getArgumentFrameIndex(const Argument *A) {
return INT_MAX;
}
-unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
+Register FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
const Value *CPI, const TargetRegisterClass *RC) {
MachineRegisterInfo &MRI = MF->getRegInfo();
auto I = CatchPadExceptionPointers.insert({CPI, 0});
- unsigned &VReg = I.first->second;
+ Register &VReg = I.first->second;
if (I.second)
VReg = MRI.createVirtualRegister(RC);
assert(VReg && "null vreg in exception pointer table!");
@@ -534,7 +545,7 @@ unsigned FunctionLoweringInfo::getCatchPadExceptionPointerVReg(
}
const Value *
-FunctionLoweringInfo::getValueFromVirtualReg(unsigned Vreg) {
+FunctionLoweringInfo::getValueFromVirtualReg(Register Vreg) {
if (VirtReg2Value.empty()) {
SmallVector<EVT, 4> ValueVTs;
for (auto &P : ValueMap) {
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
index c613c2540628..0e4e99214aa2 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.cpp
@@ -19,6 +19,7 @@
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -28,6 +29,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "instr-emitter"
@@ -84,9 +86,9 @@ static unsigned countOperands(SDNode *Node, unsigned NumExpUses,
/// implicit physical register output.
void InstrEmitter::
EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
- unsigned SrcReg, DenseMap<SDValue, unsigned> &VRBaseMap) {
- unsigned VRBase = 0;
- if (Register::isVirtualRegister(SrcReg)) {
+ Register SrcReg, DenseMap<SDValue, Register> &VRBaseMap) {
+ Register VRBase;
+ if (SrcReg.isVirtual()) {
// Just use the input register directly!
SDValue Op(Node, ResNo);
if (IsClone)
@@ -113,8 +115,8 @@ EmitCopyFromReg(SDNode *Node, unsigned ResNo, bool IsClone, bool IsCloned,
if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node &&
User->getOperand(2).getResNo() == ResNo) {
- unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (Register::isVirtualRegister(DestReg)) {
+ Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (DestReg.isVirtual()) {
VRBase = DestReg;
Match = false;
} else if (DestReg != SrcReg)
@@ -190,16 +192,19 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
MachineInstrBuilder &MIB,
const MCInstrDesc &II,
bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+ DenseMap<SDValue, Register> &VRBaseMap) {
assert(Node->getMachineOpcode() != TargetOpcode::IMPLICIT_DEF &&
"IMPLICIT_DEF should have been handled as a special case elsewhere!");
unsigned NumResults = CountResults(Node);
- for (unsigned i = 0; i < II.getNumDefs(); ++i) {
+ bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
+ II.isVariadic() && II.variadicOpsAreDefs();
+ unsigned NumVRegs = HasVRegVariadicDefs ? NumResults : II.getNumDefs();
+ for (unsigned i = 0; i < NumVRegs; ++i) {
// If the specific node value is only used by a CopyToReg and the dest reg
// is a vreg in the same register class, use the CopyToReg'd destination
// register instead of creating a new vreg.
- unsigned VRBase = 0;
+ Register VRBase;
const TargetRegisterClass *RC =
TRI->getAllocatableClass(TII->getRegClass(II, i, TRI, *MF));
// Always let the value type influence the used register class. The
@@ -216,10 +221,10 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
RC = VTRC;
}
- if (II.OpInfo[i].isOptionalDef()) {
+ if (II.OpInfo != nullptr && II.OpInfo[i].isOptionalDef()) {
// Optional def must be a physical register.
VRBase = cast<RegisterSDNode>(Node->getOperand(i-NumResults))->getReg();
- assert(Register::isPhysicalRegister(VRBase));
+ assert(VRBase.isPhysical());
MIB.addReg(VRBase, RegState::Define);
}
@@ -263,8 +268,8 @@ void InstrEmitter::CreateVirtualRegisters(SDNode *Node,
/// getVR - Return the virtual register corresponding to the specified result
/// of the specified node.
-unsigned InstrEmitter::getVR(SDValue Op,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+Register InstrEmitter::getVR(SDValue Op,
+ DenseMap<SDValue, Register> &VRBaseMap) {
if (Op.isMachineOpcode() &&
Op.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
// Add an IMPLICIT_DEF instruction before every use.
@@ -278,7 +283,7 @@ unsigned InstrEmitter::getVR(SDValue Op,
return VReg;
}
- DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+ DenseMap<SDValue, Register>::iterator I = VRBaseMap.find(Op);
assert(I != VRBaseMap.end() && "Node emitted out of order - late");
return I->second;
}
@@ -292,13 +297,13 @@ InstrEmitter::AddRegisterOperand(MachineInstrBuilder &MIB,
SDValue Op,
unsigned IIOpNum,
const MCInstrDesc *II,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
bool IsDebug, bool IsClone, bool IsCloned) {
assert(Op.getValueType() != MVT::Other &&
Op.getValueType() != MVT::Glue &&
"Chain and glue operands should occur at end of operand list!");
// Get/emit the operand.
- unsigned VReg = getVR(Op, VRBaseMap);
+ Register VReg = getVR(Op, VRBaseMap);
const MCInstrDesc &MCID = MIB->getDesc();
bool isOptDef = IIOpNum < MCID.getNumOperands() &&
@@ -363,7 +368,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
SDValue Op,
unsigned IIOpNum,
const MCInstrDesc *II,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
bool IsDebug, bool IsClone, bool IsCloned) {
if (Op.isMachineOpcode()) {
AddRegisterOperand(MIB, Op, IIOpNum, II, VRBaseMap,
@@ -373,7 +378,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
} else if (ConstantFPSDNode *F = dyn_cast<ConstantFPSDNode>(Op)) {
MIB.addFPImm(F->getConstantFPValue());
} else if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(Op)) {
- unsigned VReg = R->getReg();
+ Register VReg = R->getReg();
MVT OpVT = Op.getSimpleValueType();
const TargetRegisterClass *IIRC =
II ? TRI->getAllocatableClass(TII->getRegClass(*II, IIOpNum, TRI, *MF))
@@ -409,23 +414,14 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
MIB.addJumpTableIndex(JT->getIndex(), JT->getTargetFlags());
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op)) {
int Offset = CP->getOffset();
- unsigned Align = CP->getAlignment();
- Type *Type = CP->getType();
- // MachineConstantPool wants an explicit alignment.
- if (Align == 0) {
- Align = MF->getDataLayout().getPrefTypeAlignment(Type);
- if (Align == 0) {
- // Alignment of vector types. FIXME!
- Align = MF->getDataLayout().getTypeAllocSize(Type);
- }
- }
+ Align Alignment = CP->getAlign();
unsigned Idx;
MachineConstantPool *MCP = MF->getConstantPool();
if (CP->isMachineConstantPoolEntry())
- Idx = MCP->getConstantPoolIndex(CP->getMachineCPVal(), Align);
+ Idx = MCP->getConstantPoolIndex(CP->getMachineCPVal(), Alignment);
else
- Idx = MCP->getConstantPoolIndex(CP->getConstVal(), Align);
+ Idx = MCP->getConstantPoolIndex(CP->getConstVal(), Alignment);
MIB.addConstantPoolIndex(Idx, Offset, CP->getTargetFlags());
} else if (ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op)) {
MIB.addExternalSymbol(ES->getSymbol(), ES->getTargetFlags());
@@ -446,7 +442,7 @@ void InstrEmitter::AddOperand(MachineInstrBuilder &MIB,
}
}
-unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
+Register InstrEmitter::ConstrainForSubReg(Register VReg, unsigned SubIdx,
MVT VT, bool isDivergent, const DebugLoc &DL) {
const TargetRegisterClass *VRC = MRI->getRegClass(VReg);
const TargetRegisterClass *RC = TRI->getSubClassWithSubReg(VRC, SubIdx);
@@ -473,9 +469,9 @@ unsigned InstrEmitter::ConstrainForSubReg(unsigned VReg, unsigned SubIdx,
/// EmitSubregNode - Generate machine code for subreg nodes.
///
void InstrEmitter::EmitSubregNode(SDNode *Node,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
bool IsClone, bool IsCloned) {
- unsigned VRBase = 0;
+ Register VRBase;
unsigned Opc = Node->getMachineOpcode();
// If the node is only used by a CopyToReg and the dest reg is a vreg, use
@@ -483,8 +479,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
for (SDNode *User : Node->uses()) {
if (User->getOpcode() == ISD::CopyToReg &&
User->getOperand(2).getNode() == Node) {
- unsigned DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
- if (Register::isVirtualRegister(DestReg)) {
+ Register DestReg = cast<RegisterSDNode>(User->getOperand(1))->getReg();
+ if (DestReg.isVirtual()) {
VRBase = DestReg;
break;
}
@@ -499,7 +495,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
const TargetRegisterClass *TRC =
TLI->getRegClassFor(Node->getSimpleValueType(0), Node->isDivergent());
- unsigned Reg;
+ Register Reg;
MachineInstr *DefMI;
RegisterSDNode *R = dyn_cast<RegisterSDNode>(Node->getOperand(0));
if (R && Register::isPhysicalRegister(R->getReg())) {
@@ -510,7 +506,8 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
DefMI = MRI->getVRegDef(Reg);
}
- unsigned SrcReg, DstReg, DefSubIdx;
+ Register SrcReg, DstReg;
+ unsigned DefSubIdx;
if (DefMI &&
TII->isCoalescableExtInstr(*DefMI, SrcReg, DstReg, DefSubIdx) &&
SubIdx == DefSubIdx &&
@@ -528,19 +525,19 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
// Reg may not support a SubIdx sub-register, and we may need to
// constrain its register class or issue a COPY to a compatible register
// class.
- if (Register::isVirtualRegister(Reg))
+ if (Reg.isVirtual())
Reg = ConstrainForSubReg(Reg, SubIdx,
Node->getOperand(0).getSimpleValueType(),
Node->isDivergent(), Node->getDebugLoc());
// Create the destreg if it is missing.
- if (VRBase == 0)
+ if (!VRBase)
VRBase = MRI->createVirtualRegister(TRC);
// Create the extract_subreg machine instruction.
MachineInstrBuilder CopyMI =
BuildMI(*MBB, InsertPos, Node->getDebugLoc(),
TII->get(TargetOpcode::COPY), VRBase);
- if (Register::isVirtualRegister(Reg))
+ if (Reg.isVirtual())
CopyMI.addReg(Reg, 0, SubIdx);
else
CopyMI.addReg(TRI->getSubReg(Reg, SubIdx));
@@ -606,7 +603,7 @@ void InstrEmitter::EmitSubregNode(SDNode *Node,
///
void
InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+ DenseMap<SDValue, Register> &VRBaseMap) {
unsigned VReg = getVR(Node->getOperand(0), VRBaseMap);
// Create the new VReg in the destination class and emit a copy.
@@ -626,7 +623,7 @@ InstrEmitter::EmitCopyToRegClassNode(SDNode *Node,
/// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
///
void InstrEmitter::EmitRegSequence(SDNode *Node,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
bool IsClone, bool IsCloned) {
unsigned DstRCIdx = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
const TargetRegisterClass *RC = TRI->getRegClass(DstRCIdx);
@@ -675,9 +672,9 @@ void InstrEmitter::EmitRegSequence(SDNode *Node,
///
MachineInstr *
InstrEmitter::EmitDbgValue(SDDbgValue *SD,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+ DenseMap<SDValue, Register> &VRBaseMap) {
MDNode *Var = SD->getVariable();
- const DIExpression *Expr = SD->getExpression();
+ MDNode *Expr = SD->getExpression();
DebugLoc DL = SD->getDebugLoc();
assert(cast<DILocalVariable>(Var)->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
@@ -701,11 +698,12 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
// EmitTargetCodeForFrameDebugValue is responsible for allocation.
auto FrameMI = BuildMI(*MF, DL, TII->get(TargetOpcode::DBG_VALUE))
.addFrameIndex(SD->getFrameIx());
-
if (SD->isIndirect())
- Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
-
- FrameMI.addReg(0);
+ // Push [fi + 0] onto the DIExpression stack.
+ FrameMI.addImm(0);
+ else
+ // Push fi onto the DIExpression stack.
+ FrameMI.addReg(0);
return FrameMI.addMetadata(Var).addMetadata(Expr);
}
// Otherwise, we're going to create an instruction here.
@@ -719,7 +717,7 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
// they happen and transfer the debug info, but trying to guarantee that
// in all cases would be very fragile; this is a safeguard for any
// that were missed.
- DenseMap<SDValue, unsigned>::iterator I = VRBaseMap.find(Op);
+ DenseMap<SDValue, Register>::iterator I = VRBaseMap.find(Op);
if (I==VRBaseMap.end())
MIB.addReg(0U); // undef
else
@@ -751,9 +749,9 @@ InstrEmitter::EmitDbgValue(SDDbgValue *SD,
// Indirect addressing is indicated by an Imm as the second parameter.
if (SD->isIndirect())
- Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
-
- MIB.addReg(0U, RegState::Debug);
+ MIB.addImm(0U);
+ else
+ MIB.addReg(0U, RegState::Debug);
MIB.addMetadata(Var);
MIB.addMetadata(Expr);
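
The two DBG_VALUE hunks above change the second operand from a dummy register to an immediate 0 when the location is indirect, rather than rewriting the DIExpression with DW_OP_deref. A hedged sketch of the resulting operand layout; the helper and its parameters are stand-ins for the values in scope here:

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

// DBG_VALUE operands: <location>, <offset or dummy reg>, <variable>, <expr>.
static void addDbgValueTrailingOps(MachineInstrBuilder &MIB, bool IsIndirect,
                                   const DILocalVariable *Var,
                                   const DIExpression *Expr) {
  if (IsIndirect)
    MIB.addImm(0U);                  // value lives in memory at [location + 0]
  else
    MIB.addReg(0U, RegState::Debug); // direct value; placeholder register
  MIB.addMetadata(Var);
  MIB.addMetadata(Expr);
}
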
@@ -780,7 +778,7 @@ InstrEmitter::EmitDbgLabel(SDDbgLabel *SD) {
///
void InstrEmitter::
EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+ DenseMap<SDValue, Register> &VRBaseMap) {
unsigned Opc = Node->getMachineOpcode();
// Handle subreg insert/extract specially
@@ -828,7 +826,10 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
unsigned NumImpUses = 0;
unsigned NodeOperands =
countOperands(Node, II.getNumOperands() - NumDefs, NumImpUses);
- bool HasPhysRegOuts = NumResults > NumDefs && II.getImplicitDefs()!=nullptr;
+ bool HasVRegVariadicDefs = !MF->getTarget().usesPhysRegsForValues() &&
+ II.isVariadic() && II.variadicOpsAreDefs();
+ bool HasPhysRegOuts = NumResults > NumDefs &&
+ II.getImplicitDefs() != nullptr && !HasVRegVariadicDefs;
#ifndef NDEBUG
unsigned NumMIOperands = NodeOperands + NumResults;
if (II.isVariadic())
@@ -978,7 +979,7 @@ EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
/// needed dependencies.
void InstrEmitter::
EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+ DenseMap<SDValue, Register> &VRBaseMap) {
switch (Node->getOpcode()) {
default:
#ifndef NDEBUG
@@ -991,7 +992,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
case ISD::TokenFactor: // fall thru
break;
case ISD::CopyToReg: {
- unsigned DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
+ Register DestReg = cast<RegisterSDNode>(Node->getOperand(1))->getReg();
SDValue SrcVal = Node->getOperand(2);
if (Register::isVirtualRegister(DestReg) && SrcVal.isMachineOpcode() &&
SrcVal.getMachineOpcode() == TargetOpcode::IMPLICIT_DEF) {
@@ -1001,7 +1002,7 @@ EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
TII->get(TargetOpcode::IMPLICIT_DEF), DestReg);
break;
}
- unsigned SrcReg;
+ Register SrcReg;
if (RegisterSDNode *R = dyn_cast<RegisterSDNode>(SrcVal))
SrcReg = R->getReg();
else
diff --git a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
index cfe99dd977b5..c3567eae9161 100644
--- a/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
+++ b/llvm/lib/CodeGen/SelectionDAG/InstrEmitter.h
@@ -17,13 +17,15 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
-#include "llvm/CodeGen/SelectionDAG.h"
+#include "llvm/CodeGen/SelectionDAGNodes.h"
namespace llvm {
class MachineInstrBuilder;
class MCInstrDesc;
+class SDDbgLabel;
class SDDbgValue;
+class TargetLowering;
class LLVM_LIBRARY_VISIBILITY InstrEmitter {
MachineFunction *MF;
@@ -39,19 +41,19 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
/// implicit physical register output.
void EmitCopyFromReg(SDNode *Node, unsigned ResNo,
bool IsClone, bool IsCloned,
- unsigned SrcReg,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ Register SrcReg,
+ DenseMap<SDValue, Register> &VRBaseMap);
void CreateVirtualRegisters(SDNode *Node,
MachineInstrBuilder &MIB,
const MCInstrDesc &II,
bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ DenseMap<SDValue, Register> &VRBaseMap);
/// getVR - Return the virtual register corresponding to the specified result
/// of the specified node.
- unsigned getVR(SDValue Op,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ Register getVR(SDValue Op,
+ DenseMap<SDValue, Register> &VRBaseMap);
/// AddRegisterOperand - Add the specified register as an operand to the
/// specified machine instr. Insert register copies if the register is
@@ -60,7 +62,7 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
SDValue Op,
unsigned IIOpNum,
const MCInstrDesc *II,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
bool IsDebug, bool IsClone, bool IsCloned);
/// AddOperand - Add the specified operand to the specified machine instr. II
@@ -71,18 +73,18 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
SDValue Op,
unsigned IIOpNum,
const MCInstrDesc *II,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
bool IsDebug, bool IsClone, bool IsCloned);
/// ConstrainForSubReg - Try to constrain VReg to a register class that
/// supports SubIdx sub-registers. Emit a copy if that isn't possible.
/// Return the virtual register to use.
- unsigned ConstrainForSubReg(unsigned VReg, unsigned SubIdx, MVT VT,
+ Register ConstrainForSubReg(Register VReg, unsigned SubIdx, MVT VT,
bool isDivergent, const DebugLoc &DL);
/// EmitSubregNode - Generate machine code for subreg nodes.
///
- void EmitSubregNode(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+ void EmitSubregNode(SDNode *Node, DenseMap<SDValue, Register> &VRBaseMap,
bool IsClone, bool IsCloned);
/// EmitCopyToRegClassNode - Generate machine code for COPY_TO_REGCLASS nodes.
@@ -90,11 +92,11 @@ class LLVM_LIBRARY_VISIBILITY InstrEmitter {
/// register is constrained to be in a particular register class.
///
void EmitCopyToRegClassNode(SDNode *Node,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ DenseMap<SDValue, Register> &VRBaseMap);
/// EmitRegSequence - Generate machine code for REG_SEQUENCE nodes.
///
- void EmitRegSequence(SDNode *Node, DenseMap<SDValue, unsigned> &VRBaseMap,
+ void EmitRegSequence(SDNode *Node, DenseMap<SDValue, Register> &VRBaseMap,
bool IsClone, bool IsCloned);
public:
/// CountResults - The results of target nodes have register or immediate
@@ -105,7 +107,7 @@ public:
/// EmitDbgValue - Generate machine instruction for a dbg_value node.
///
MachineInstr *EmitDbgValue(SDDbgValue *SD,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ DenseMap<SDValue, Register> &VRBaseMap);
/// Generate machine instruction for a dbg_label node.
MachineInstr *EmitDbgLabel(SDDbgLabel *SD);
@@ -113,7 +115,7 @@ public:
/// EmitNode - Generate machine code for a node and needed dependencies.
///
void EmitNode(SDNode *Node, bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap) {
+ DenseMap<SDValue, Register> &VRBaseMap) {
if (Node->isMachineOpcode())
EmitMachineNode(Node, IsClone, IsCloned, VRBaseMap);
else
@@ -132,9 +134,9 @@ public:
private:
void EmitMachineNode(SDNode *Node, bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ DenseMap<SDValue, Register> &VRBaseMap);
void EmitSpecialNode(SDNode *Node, bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap);
+ DenseMap<SDValue, Register> &VRBaseMap);
};
}
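Aside (illustrative sketch, not part of the patch): the header changes above are a mechanical DenseMap<SDValue, unsigned> to DenseMap<SDValue, Register> migration. Assuming the usual llvm::Register interface (implicit conversion to and from the raw unsigned id, plus the static isVirtualRegister helper already used in these hunks), the swap is drop-in:

    #include "llvm/CodeGen/Register.h"
    using namespace llvm;

    void demo(unsigned OldId) {
      Register R(OldId);                             // wrap the raw vreg/physreg id
      bool IsVirt = Register::isVirtualRegister(R);  // same static helper as above
      unsigned Back = R;                             // converts back implicitly
      (void)IsVirt; (void)Back;
    }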
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
index 80ac8b95e4ef..6a6004c158bb 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp
@@ -328,7 +328,7 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) {
SDValue CPIdx =
DAG.getConstantPool(LLVMC, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
if (Extend) {
SDValue Result = DAG.getExtLoad(
ISD::EXTLOAD, dl, OrigVT, DAG.getEntryNode(), CPIdx,
@@ -348,7 +348,7 @@ SDValue SelectionDAGLegalize::ExpandConstant(ConstantSDNode *CP) {
EVT VT = CP->getValueType(0);
SDValue CPIdx = DAG.getConstantPool(CP->getConstantIntValue(),
TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
SDValue Result = DAG.getLoad(
VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Alignment);
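Aside (hedged sketch, not part of the patch): the recurring getAlignment()-to-getAlign() edits in this file trade a raw unsigned byte count for the llvm::Align type. The two helpers that appear later in this diff, commonAlignment and Align::value, are enough to reproduce the old MinAlign-style arithmetic:

    #include "llvm/Support/Alignment.h"
    #include <cstdint>
    using namespace llvm;

    void demo() {
      Align A(16);                      // must be a power of two
      Align B = commonAlignment(A, 4);  // alignment still known at offset 4 -> Align(4)
      uint64_t Mask = -B.value();       // the mask form used by the stack-alloc hunk
      (void)Mask;
    }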
@@ -387,7 +387,9 @@ SDValue SelectionDAGLegalize::PerformInsertVectorEltInMemory(SDValue Vec,
SDValue StackPtr2 = TLI.getVectorElementPointer(DAG, StackPtr, VT, Tmp3);
// Store the scalar value.
- Ch = DAG.getTruncStore(Ch, dl, Tmp2, StackPtr2, MachinePointerInfo(), EltVT);
+ Ch = DAG.getTruncStore(
+ Ch, dl, Tmp2, StackPtr2,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
// Load the updated vector.
return DAG.getLoad(VT, dl, Ch, StackPtr, MachinePointerInfo::getFixedStack(
DAG.getMachineFunction(), SPFI));
@@ -434,7 +436,6 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
// We generally can't do this one for long doubles.
SDValue Chain = ST->getChain();
SDValue Ptr = ST->getBasePtr();
- unsigned Alignment = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDLoc dl(ST);
@@ -444,8 +445,8 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Con = DAG.getConstant(CFP->getValueAPF().
bitcastToAPInt().zextOrTrunc(32),
SDLoc(CFP), MVT::i32);
- return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(), Alignment,
- MMOFlags, AAInfo);
+ return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
}
if (CFP->getValueType(0) == MVT::f64) {
@@ -454,7 +455,7 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
SDValue Con = DAG.getConstant(CFP->getValueAPF().bitcastToAPInt().
zextOrTrunc(64), SDLoc(CFP), MVT::i64);
return DAG.getStore(Chain, dl, Con, Ptr, ST->getPointerInfo(),
- Alignment, MMOFlags, AAInfo);
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
}
if (TLI.isTypeLegal(MVT::i32) && !ST->isVolatile()) {
@@ -467,12 +468,12 @@ SDValue SelectionDAGLegalize::OptimizeFloatStore(StoreSDNode* ST) {
if (DAG.getDataLayout().isBigEndian())
std::swap(Lo, Hi);
- Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(), Alignment,
- MMOFlags, AAInfo);
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
Ptr = DAG.getMemBasePlusOffset(Ptr, 4, dl);
Hi = DAG.getStore(Chain, dl, Hi, Ptr,
ST->getPointerInfo().getWithOffset(4),
- MinAlign(Alignment, 4U), MMOFlags, AAInfo);
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -487,7 +488,6 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
SDValue Ptr = ST->getBasePtr();
SDLoc dl(Node);
- unsigned Alignment = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
@@ -528,9 +528,8 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
assert(NVT.getSizeInBits() == VT.getSizeInBits() &&
"Can only promote stores to same size type");
Value = DAG.getNode(ISD::BITCAST, dl, NVT, Value);
- SDValue Result =
- DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- Alignment, MMOFlags, AAInfo);
+ SDValue Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
ReplaceNode(SDValue(Node, 0), Result);
break;
}
@@ -553,7 +552,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
Value = DAG.getZeroExtendInReg(Value, dl, StVT);
SDValue Result =
DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), NVT,
- Alignment, MMOFlags, AAInfo);
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
ReplaceNode(SDValue(Node, 0), Result);
} else if (StWidth & (StWidth - 1)) {
// If not storing a power-of-2 number of bits, expand as two stores.
@@ -575,7 +574,7 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 X, TRUNCSTORE@+2:i8 (srl X, 16)
// Store the bottom RoundWidth bits.
Lo = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- RoundVT, Alignment, MMOFlags, AAInfo);
+ RoundVT, ST->getOriginalAlign(), MMOFlags, AAInfo);
// Store the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
@@ -584,10 +583,9 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
ISD::SRL, dl, Value.getValueType(), Value,
DAG.getConstant(RoundWidth, dl,
TLI.getShiftAmountTy(Value.getValueType(), DL)));
- Hi = DAG.getTruncStore(
- Chain, dl, Hi, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
- MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, ST->getOriginalAlign(), MMOFlags, AAInfo);
} else {
// Big endian - avoid unaligned stores.
// TRUNCSTORE:i24 X -> TRUNCSTORE:i16 (srl X, 8), TRUNCSTORE@+2:i8 X
@@ -596,18 +594,17 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
ISD::SRL, dl, Value.getValueType(), Value,
DAG.getConstant(ExtraWidth, dl,
TLI.getShiftAmountTy(Value.getValueType(), DL)));
- Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(),
- RoundVT, Alignment, MMOFlags, AAInfo);
+ Hi = DAG.getTruncStore(Chain, dl, Hi, Ptr, ST->getPointerInfo(), RoundVT,
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
// Store the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getNode(ISD::ADD, dl, Ptr.getValueType(), Ptr,
DAG.getConstant(IncrementSize, dl,
Ptr.getValueType()));
- Lo = DAG.getTruncStore(
- Chain, dl, Value, Ptr,
- ST->getPointerInfo().getWithOffset(IncrementSize), ExtraVT,
- MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ Lo = DAG.getTruncStore(Chain, dl, Value, Ptr,
+ ST->getPointerInfo().getWithOffset(IncrementSize),
+ ExtraVT, ST->getOriginalAlign(), MMOFlags, AAInfo);
}
// The order of the stores doesn't matter.
@@ -643,15 +640,16 @@ void SelectionDAGLegalize::LegalizeStoreOps(SDNode *Node) {
if (TLI.isTypeLegal(StVT)) {
Value = DAG.getNode(ISD::TRUNCATE, dl, StVT, Value);
Result = DAG.getStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- Alignment, MMOFlags, AAInfo);
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
} else {
// The in-memory type isn't legal. Truncate to the type it would promote
// to, and then do a truncstore.
Value = DAG.getNode(ISD::TRUNCATE, dl,
TLI.getTypeToTransformTo(*DAG.getContext(), StVT),
Value);
- Result = DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(),
- StVT, Alignment, MMOFlags, AAInfo);
+ Result =
+ DAG.getTruncStore(Chain, dl, Value, Ptr, ST->getPointerInfo(), StVT,
+ ST->getOriginalAlign(), MMOFlags, AAInfo);
}
ReplaceNode(SDValue(Node, 0), Result);
@@ -721,7 +719,6 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
LLVM_DEBUG(dbgs() << "Legalizing extending load operation\n");
EVT SrcVT = LD->getMemoryVT();
unsigned SrcWidth = SrcVT.getSizeInBits();
- unsigned Alignment = LD->getAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
@@ -748,9 +745,9 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
ISD::LoadExtType NewExtType =
ExtType == ISD::ZEXTLOAD ? ISD::ZEXTLOAD : ISD::EXTLOAD;
- SDValue Result =
- DAG.getExtLoad(NewExtType, dl, Node->getValueType(0), Chain, Ptr,
- LD->getPointerInfo(), NVT, Alignment, MMOFlags, AAInfo);
+ SDValue Result = DAG.getExtLoad(NewExtType, dl, Node->getValueType(0),
+ Chain, Ptr, LD->getPointerInfo(), NVT,
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
Ch = Result.getValue(1); // The chain.
@@ -788,16 +785,15 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
// EXTLOAD:i24 -> ZEXTLOAD:i16 | (shl EXTLOAD@+2:i8, 16)
// Load the bottom RoundWidth bits.
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
- LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
- AAInfo);
+ LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(),
+ MMOFlags, AAInfo);
// Load the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getMemBasePlusOffset(Ptr, IncrementSize, dl);
Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
- AAInfo);
+ ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of
// the other one.
@@ -817,16 +813,15 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
// EXTLOAD:i24 -> (shl EXTLOAD:i16, 8) | ZEXTLOAD@+2:i8
// Load the top RoundWidth bits.
Hi = DAG.getExtLoad(ExtType, dl, Node->getValueType(0), Chain, Ptr,
- LD->getPointerInfo(), RoundVT, Alignment, MMOFlags,
- AAInfo);
+ LD->getPointerInfo(), RoundVT, LD->getOriginalAlign(),
+ MMOFlags, AAInfo);
// Load the remaining ExtraWidth bits.
IncrementSize = RoundWidth / 8;
Ptr = DAG.getMemBasePlusOffset(Ptr, IncrementSize, dl);
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, Node->getValueType(0), Chain, Ptr,
LD->getPointerInfo().getWithOffset(IncrementSize),
- ExtraVT, MinAlign(Alignment, IncrementSize), MMOFlags,
- AAInfo);
+ ExtraVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of
// the other one.
@@ -933,7 +928,7 @@ void SelectionDAGLegalize::LegalizeLoadOps(SDNode *Node) {
Result.getValueType(),
Result, DAG.getValueType(SrcVT));
else
- ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT.getScalarType());
+ ValRes = DAG.getZeroExtendInReg(Result, dl, SrcVT);
Value = ValRes;
Chain = Result.getValue(1);
break;
@@ -1009,6 +1004,7 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
Action = TLI.getOperationAction(Node->getOpcode(),
Node->getOperand(0).getValueType());
break;
+ case ISD::STRICT_FP_TO_FP16:
case ISD::STRICT_SINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
case ISD::STRICT_LRINT:
@@ -1131,7 +1127,9 @@ void SelectionDAGLegalize::LegalizeOp(SDNode *Node) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
- case ISD::UDIVFIX: {
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
@@ -1383,19 +1381,26 @@ SDValue SelectionDAGLegalize::ExpandInsertToVectorThroughStack(SDValue Op) {
SDValue SubStackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
// Store the subvector.
- Ch = DAG.getStore(Ch, dl, Part, SubStackPtr, MachinePointerInfo());
+ Ch = DAG.getStore(
+ Ch, dl, Part, SubStackPtr,
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
// Finally, load the updated vector.
return DAG.getLoad(Op.getValueType(), dl, Ch, StackPtr, PtrInfo);
}
SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
+ assert((Node->getOpcode() == ISD::BUILD_VECTOR ||
+ Node->getOpcode() == ISD::CONCAT_VECTORS) &&
+ "Unexpected opcode!");
+
// We can't handle this case efficiently. Allocate a sufficiently
- // aligned object on the stack, store each element into it, then load
+ // aligned object on the stack, store each operand into it, then load
// the result as a vector.
// Create the stack frame object.
EVT VT = Node->getValueType(0);
- EVT EltVT = VT.getVectorElementType();
+ EVT MemVT = isa<BuildVectorSDNode>(Node) ? VT.getVectorElementType()
+ : Node->getOperand(0).getValueType();
SDLoc dl(Node);
SDValue FIPtr = DAG.CreateStackTemporary(VT);
int FI = cast<FrameIndexSDNode>(FIPtr.getNode())->getIndex();
@@ -1404,7 +1409,7 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
// Emit a store of each element to the stack slot.
SmallVector<SDValue, 8> Stores;
- unsigned TypeByteSize = EltVT.getSizeInBits() / 8;
+ unsigned TypeByteSize = MemVT.getSizeInBits() / 8;
assert(TypeByteSize > 0 && "Vector element type too small for stack store!");
// Store (in the right endianness) the elements to memory.
for (unsigned i = 0, e = Node->getNumOperands(); i != e; ++i) {
@@ -1413,16 +1418,15 @@ SDValue SelectionDAGLegalize::ExpandVectorBuildThroughStack(SDNode* Node) {
unsigned Offset = TypeByteSize*i;
- SDValue Idx = DAG.getConstant(Offset, dl, FIPtr.getValueType());
- Idx = DAG.getMemBasePlusOffset(FIPtr, Idx, dl);
+ SDValue Idx = DAG.getMemBasePlusOffset(FIPtr, Offset, dl);
// If the destination vector element type is narrower than the source
// element type, only store the bits necessary.
- if (EltVT.bitsLT(Node->getOperand(i).getValueType().getScalarType())) {
+ if (MemVT.bitsLT(Node->getOperand(i).getValueType()))
Stores.push_back(DAG.getTruncStore(DAG.getEntryNode(), dl,
Node->getOperand(i), Idx,
- PtrInfo.getWithOffset(Offset), EltVT));
- } else
+ PtrInfo.getWithOffset(Offset), MemVT));
+ else
Stores.push_back(DAG.getStore(DAG.getEntryNode(), dl, Node->getOperand(i),
Idx, PtrInfo.getWithOffset(Offset)));
}
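Aside (concrete example, not part of the patch): with the MemVT change above, CONCAT_VECTORS operands are stored whole rather than element by element. Concatenating two v2i32 operands into a v4i32 writes 8-byte chunks at offsets 0 and 8, while a BUILD_VECTOR of the same v4i32 still writes 4-byte elements at offsets 0, 4, 8 and 12. A plain-C++ picture of the stack-slot layout (names local to the sketch):

    #include <cstdint>
    #include <cstring>

    // Stack-temp layout when expanding concat(v2i32 A, v2i32 B) -> v4i32.
    void concat_v2i32(const uint32_t A[2], const uint32_t B[2], uint32_t Out[4]) {
      std::memcpy(Out + 0, A, 8);   // operand 0 at offset 0 (TypeByteSize == 8)
      std::memcpy(Out + 2, B, 8);   // operand 1 at offset 8
    }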
@@ -1600,13 +1604,17 @@ void SelectionDAGLegalize::ExpandDYNAMIC_STACKALLOC(SDNode* Node,
SDValue Size = Tmp2.getOperand(1);
SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
Chain = SP.getValue(1);
- unsigned Align = cast<ConstantSDNode>(Tmp3)->getZExtValue();
- unsigned StackAlign =
- DAG.getSubtarget().getFrameLowering()->getStackAlignment();
- Tmp1 = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Align > StackAlign)
+ Align Alignment = cast<ConstantSDNode>(Tmp3)->getAlignValue();
+ const TargetFrameLowering *TFL = DAG.getSubtarget().getFrameLowering();
+ unsigned Opc =
+ TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+ ISD::ADD : ISD::SUB;
+
+ Align StackAlign = TFL->getStackAlign();
+ Tmp1 = DAG.getNode(Opc, dl, VT, SP, Size); // Value
+ if (Alignment > StackAlign)
Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-Alignment.value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
Tmp2 = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
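Aside (arithmetic note, not part of the patch): masking with -Alignment.value() in the hunk above rounds the freshly computed stack pointer down to the requested alignment on a downward-growing stack. In scalar form:

    #include <cstdint>

    // Round SP down to a multiple of Alignment (Alignment must be a power of two).
    uint64_t align_down(uint64_t SP, uint64_t Alignment) {
      return SP & -Alignment;   // -Alignment == ~(Alignment - 1) for powers of two
    }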
@@ -1968,7 +1976,7 @@ SDValue SelectionDAGLegalize::ExpandBUILD_VECTOR(SDNode *Node) {
Constant *CP = ConstantVector::get(CV);
SDValue CPIdx =
DAG.getConstantPool(CP, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
return DAG.getLoad(
VT, dl, DAG.getEntryNode(), CPIdx,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -2360,36 +2368,34 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
// Get the stack frame index of a 8 byte buffer.
SDValue StackSlot = DAG.CreateStackTemporary(MVT::f64);
- // word offset constant for Hi/Lo address computation
- SDValue WordOff = DAG.getConstant(sizeof(int), dl,
- StackSlot.getValueType());
- // set up Hi and Lo (into buffer) address based on endian
- SDValue Hi = StackSlot;
- SDValue Lo = DAG.getNode(ISD::ADD, dl, StackSlot.getValueType(),
- StackSlot, WordOff);
- if (DAG.getDataLayout().isLittleEndian())
- std::swap(Hi, Lo);
-
+ SDValue Lo = Op0;
// if signed map to unsigned space
- SDValue Op0Mapped;
if (isSigned) {
- // constant used to invert sign bit (signed to unsigned mapping)
- SDValue SignBit = DAG.getConstant(0x80000000u, dl, MVT::i32);
- Op0Mapped = DAG.getNode(ISD::XOR, dl, MVT::i32, Op0, SignBit);
- } else {
- Op0Mapped = Op0;
+ // Invert sign bit (signed to unsigned mapping).
+ Lo = DAG.getNode(ISD::XOR, dl, MVT::i32, Lo,
+ DAG.getConstant(0x80000000u, dl, MVT::i32));
}
- // store the lo of the constructed double - based on integer input
- SDValue Store1 = DAG.getStore(DAG.getEntryNode(), dl, Op0Mapped, Lo,
+ // Initial hi portion of constructed double.
+ SDValue Hi = DAG.getConstant(0x43300000u, dl, MVT::i32);
+
+ // If this is a big endian target, swap the lo and high data.
+ if (DAG.getDataLayout().isBigEndian())
+ std::swap(Lo, Hi);
+
+ SDValue MemChain = DAG.getEntryNode();
+
+ // Store the lo of the constructed double.
+ SDValue Store1 = DAG.getStore(MemChain, dl, Lo, StackSlot,
MachinePointerInfo());
- // initial hi portion of constructed double
- SDValue InitialHi = DAG.getConstant(0x43300000u, dl, MVT::i32);
- // store the hi of the constructed double - biased exponent
+ // Store the hi of the constructed double.
+ SDValue HiPtr = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store2 =
- DAG.getStore(Store1, dl, InitialHi, Hi, MachinePointerInfo());
+ DAG.getStore(MemChain, dl, Hi, HiPtr, MachinePointerInfo());
+ MemChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Store1, Store2);
+
// load the constructed double
SDValue Load =
- DAG.getLoad(MVT::f64, dl, Store2, StackSlot, MachinePointerInfo());
+ DAG.getLoad(MVT::f64, dl, MemChain, StackSlot, MachinePointerInfo());
// FP constant to bias correct the final result
SDValue Bias = DAG.getConstantFP(isSigned ?
BitsToDouble(0x4330000080000000ULL) :
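Aside (worked example, not part of the patch): the store/store/load sequence above manufactures a double whose bit pattern is 0x43300000'xxxxxxxx, i.e. 2^52 + x, and the Bias constant then removes the 2^52 (plus 2^31 in the signed case, compensating for the earlier sign-bit XOR). A self-contained scalar sketch of the unsigned case:

    #include <cstdint>
    #include <cstring>

    // Unsigned i32 -> f64 via the 2^52 bias trick used above.
    double u32_to_f64(uint32_t X) {
      uint64_t Bits = (0x43300000ULL << 32) | X;  // exponent 0x433 => value 2^52 + X
      double D;
      std::memcpy(&D, &Bits, sizeof(D));          // reinterpret the bits as a double
      return D - 4503599627370496.0;              // subtract 2^52; X survives exactly
    }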
@@ -2417,10 +2423,65 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
}
return Result;
}
- assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
// Code below here assumes !isSigned without checking again.
- // FIXME: This can produce slightly incorrect results. See details in
- // FIXME: https://reviews.llvm.org/D69275
+ assert(!isSigned && "Legalize cannot Expand SINT_TO_FP for i64 yet");
+
+ // TODO: Generalize this for use with other types.
+ if ((SrcVT == MVT::i32 || SrcVT == MVT::i64) && DestVT == MVT::f32) {
+ LLVM_DEBUG(dbgs() << "Converting unsigned i32/i64 to f32\n");
+ // For unsigned conversions, convert them to signed conversions using the
+ // algorithm from the x86_64 __floatundisf in compiler_rt. That method
+ // should be valid for i32->f32 as well.
+
+ // TODO: This really should be implemented using a branch rather than a
+ // select. We happen to get lucky and machinesink does the right
+ // thing most of the time. This would be a good candidate for a
+ // pseudo-op, or, even better, for whole-function isel.
+ EVT SetCCVT = getSetCCResultType(SrcVT);
+
+ SDValue SignBitTest = DAG.getSetCC(
+ dl, SetCCVT, Op0, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
+
+ EVT ShiftVT = TLI.getShiftAmountTy(SrcVT, DAG.getDataLayout());
+ SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
+ SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Op0, ShiftConst);
+ SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
+ SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Op0, AndConst);
+ SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
+
+ SDValue Slow, Fast;
+ if (Node->isStrictFPOpcode()) {
+ // In strict mode, we must avoid spurious exceptions, and therefore
+ // must make sure to only emit a single STRICT_SINT_TO_FP.
+ SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Op0);
+ Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DestVT, MVT::Other },
+ { Node->getOperand(0), InCvt });
+ Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DestVT, MVT::Other },
+ { Fast.getValue(1), Fast, Fast });
+ Chain = Slow.getValue(1);
+ // The STRICT_SINT_TO_FP inherits the exception mode from the
+ // incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can
+ // never raise any exception.
+ SDNodeFlags Flags;
+ Flags.setNoFPExcept(Node->getFlags().hasNoFPExcept());
+ Fast->setFlags(Flags);
+ Flags.setNoFPExcept(true);
+ Slow->setFlags(Flags);
+ } else {
+ SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Or);
+ Slow = DAG.getNode(ISD::FADD, dl, DestVT, SignCvt, SignCvt);
+ Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DestVT, Op0);
+ }
+
+ return DAG.getSelect(dl, DestVT, SignBitTest, Slow, Fast);
+ }
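Aside (scalar illustration, not part of the patch): the Shr/And/Or sequence above is the compiler-rt __floatundisf halving trick the comment refers to. When the sign bit is set, the input is halved with its low bit folded back in as a sticky bit, converted along the signed path, then doubled; rounding is unaffected because a float keeps far fewer than 63 significant bits. Hedged C++ equivalent:

    #include <cstdint>

    // Unsigned 64-bit -> float using only a signed conversion.
    float u64_to_f32(uint64_t X) {
      if ((int64_t)X >= 0)                   // sign bit clear: convert directly
        return (float)(int64_t)X;
      uint64_t Halved = (X >> 1) | (X & 1);  // halve, keep the low bit as a sticky bit
      float F = (float)(int64_t)Halved;      // now non-negative, signed convert is fine
      return F + F;                          // double back; rounding is preserved
    }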
+
+ // The following optimization is valid only if every value in SrcVT (when
+ // treated as signed) is representable in DestVT. Check that the mantissa
+ // size of DestVT is >= the number of bits in SrcVT - 1.
+ assert(APFloat::semanticsPrecision(DAG.EVTToAPFloatSemantics(DestVT)) >=
+ SrcVT.getSizeInBits() - 1 &&
+ "Cannot perform lossless SINT_TO_FP!");
SDValue Tmp1;
if (Node->isStrictFPOpcode()) {
@@ -2454,9 +2515,9 @@ SDValue SelectionDAGLegalize::ExpandLegalINT_TO_FP(SDNode *Node,
SDValue CPIdx =
DAG.getConstantPool(FudgeFactor, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(CPIdx)->getAlign();
CPIdx = DAG.getNode(ISD::ADD, dl, CPIdx.getValueType(), CPIdx, CstOffset);
- Alignment = std::min(Alignment, 4u);
+ Alignment = commonAlignment(Alignment, 4);
SDValue FudgeInReg;
if (DestVT == MVT::f32)
FudgeInReg = DAG.getLoad(
@@ -2765,6 +2826,7 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
case ISD::FLT_ROUNDS_:
Results.push_back(DAG.getConstant(1, dl, Node->getValueType(0)));
+ Results.push_back(Node->getOperand(0));
break;
case ISD::EH_RETURN:
case ISD::EH_LABEL:
@@ -3090,14 +3152,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
}
unsigned Idx = Mask[i];
if (Idx < NumElems)
- Ops.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ Ops.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getVectorIdxConstant(Idx, dl)));
else
- Ops.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
- DAG.getConstant(Idx - NumElems, dl,
- TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ Ops.push_back(
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op1,
+ DAG.getVectorIdxConstant(Idx - NumElems, dl)));
}
Tmp1 = DAG.getBuildVector(VT, dl, Ops);
@@ -3219,6 +3279,21 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
DAG.getNode(ISD::FP_EXTEND, dl, Node->getValueType(0), Res));
}
break;
+ case ISD::STRICT_FP16_TO_FP:
+ if (Node->getValueType(0) != MVT::f32) {
+ // We can extend to types bigger than f32 in two steps without changing
+ // the result. Since "f16 -> f32" is much more commonly available, give
+ // CodeGen the option of emitting that before resorting to a libcall.
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP16_TO_FP, dl, {MVT::f32, MVT::Other},
+ {Node->getOperand(0), Node->getOperand(1)});
+ Res = DAG.getNode(ISD::STRICT_FP_EXTEND, dl,
+ {Node->getValueType(0), MVT::Other},
+ {Res.getValue(1), Res});
+ Results.push_back(Res);
+ Results.push_back(Res.getValue(1));
+ }
+ break;
case ISD::FP_TO_FP16:
LLVM_DEBUG(dbgs() << "Legalizing FP_TO_FP16\n");
if (!TLI.useSoftFloat() && TM.Options.UnsafeFPMath) {
@@ -3273,26 +3348,10 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
break;
}
case ISD::UREM:
- case ISD::SREM: {
- EVT VT = Node->getValueType(0);
- bool isSigned = Node->getOpcode() == ISD::SREM;
- unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
- unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
- Tmp2 = Node->getOperand(0);
- Tmp3 = Node->getOperand(1);
- if (TLI.isOperationLegalOrCustom(DivRemOpc, VT)) {
- SDVTList VTs = DAG.getVTList(VT, VT);
- Tmp1 = DAG.getNode(DivRemOpc, dl, VTs, Tmp2, Tmp3).getValue(1);
- Results.push_back(Tmp1);
- } else if (TLI.isOperationLegalOrCustom(DivOpc, VT)) {
- // X % Y -> X-X/Y*Y
- Tmp1 = DAG.getNode(DivOpc, dl, VT, Tmp2, Tmp3);
- Tmp1 = DAG.getNode(ISD::MUL, dl, VT, Tmp1, Tmp3);
- Tmp1 = DAG.getNode(ISD::SUB, dl, VT, Tmp2, Tmp1);
+ case ISD::SREM:
+ if (TLI.expandREM(Node, Tmp1, DAG))
Results.push_back(Tmp1);
- }
break;
- }
case ISD::UDIV:
case ISD::SDIV: {
bool isSigned = Node->getOpcode() == ISD::SDIV;
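Aside (not part of the patch): the open-coded fallback deleted in the UREM/SREM case just above, now supplied by TLI.expandREM, is the usual remainder-from-division identity. For reference, the scalar form, which C++ integer division (truncation toward zero) matches for both SREM and UREM:

    #include <cstdint>

    int64_t srem_via_div(int64_t X, int64_t Y) {
      return X - (X / Y) * Y;   // X % Y, assuming Y != 0 and no INT64_MIN / -1 overflow
    }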
@@ -3420,7 +3479,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
Results.push_back(TLI.expandFixedPointMul(Node, DAG));
break;
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
if (SDValue V = TLI.expandFixedPointDiv(Node->getOpcode(), SDLoc(Node),
Node->getOperand(0),
Node->getOperand(1),
@@ -3457,8 +3518,9 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SDValue Overflow = DAG.getSetCC(dl, SetCCType, Sum, LHS, CC);
// Add of the sum and the carry.
+ SDValue One = DAG.getConstant(1, dl, VT);
SDValue CarryExt =
- DAG.getZeroExtendInReg(DAG.getZExtOrTrunc(Carry, dl, VT), dl, MVT::i1);
+ DAG.getNode(ISD::AND, dl, VT, DAG.getZExtOrTrunc(Carry, dl, VT), One);
SDValue Sum2 = DAG.getNode(Op, dl, VT, Sum, CarryExt);
// Second check for overflow. If we are adding, we can only overflow if the
@@ -3780,12 +3842,12 @@ bool SelectionDAGLegalize::ExpandNode(SDNode *Node) {
SmallVector<SDValue, 8> Scalars;
for (unsigned Idx = 0; Idx < NumElem; Idx++) {
- SDValue Ex = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(0),
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue Sh = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(), Node->getOperand(1),
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Ex =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(),
+ Node->getOperand(0), DAG.getVectorIdxConstant(Idx, dl));
+ SDValue Sh =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT.getScalarType(),
+ Node->getOperand(1), DAG.getVectorIdxConstant(Idx, dl));
Scalars.push_back(DAG.getNode(Node->getOpcode(), dl,
VT.getScalarType(), Ex, Sh));
}
@@ -3867,7 +3929,6 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
SmallVector<SDValue, 8> Results;
SDLoc dl(Node);
// FIXME: Check flags on the node to see if we can use a finite call.
- bool CanUseFiniteLibCall = TM.Options.NoInfsFPMath && TM.Options.NoNaNsFPMath;
unsigned Opc = Node->getOpcode();
switch (Opc) {
case ISD::ATOMIC_FENCE: {
@@ -3976,68 +4037,28 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
break;
case ISD::FLOG:
case ISD::STRICT_FLOG:
- if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log_finite))
- ExpandFPLibCall(Node, RTLIB::LOG_FINITE_F32,
- RTLIB::LOG_FINITE_F64,
- RTLIB::LOG_FINITE_F80,
- RTLIB::LOG_FINITE_F128,
- RTLIB::LOG_FINITE_PPCF128, Results);
- else
- ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64,
- RTLIB::LOG_F80, RTLIB::LOG_F128,
- RTLIB::LOG_PPCF128, Results);
+ ExpandFPLibCall(Node, RTLIB::LOG_F32, RTLIB::LOG_F64, RTLIB::LOG_F80,
+ RTLIB::LOG_F128, RTLIB::LOG_PPCF128, Results);
break;
case ISD::FLOG2:
case ISD::STRICT_FLOG2:
- if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log2_finite))
- ExpandFPLibCall(Node, RTLIB::LOG2_FINITE_F32,
- RTLIB::LOG2_FINITE_F64,
- RTLIB::LOG2_FINITE_F80,
- RTLIB::LOG2_FINITE_F128,
- RTLIB::LOG2_FINITE_PPCF128, Results);
- else
- ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64,
- RTLIB::LOG2_F80, RTLIB::LOG2_F128,
- RTLIB::LOG2_PPCF128, Results);
+ ExpandFPLibCall(Node, RTLIB::LOG2_F32, RTLIB::LOG2_F64, RTLIB::LOG2_F80,
+ RTLIB::LOG2_F128, RTLIB::LOG2_PPCF128, Results);
break;
case ISD::FLOG10:
case ISD::STRICT_FLOG10:
- if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_log10_finite))
- ExpandFPLibCall(Node, RTLIB::LOG10_FINITE_F32,
- RTLIB::LOG10_FINITE_F64,
- RTLIB::LOG10_FINITE_F80,
- RTLIB::LOG10_FINITE_F128,
- RTLIB::LOG10_FINITE_PPCF128, Results);
- else
- ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64,
- RTLIB::LOG10_F80, RTLIB::LOG10_F128,
- RTLIB::LOG10_PPCF128, Results);
+ ExpandFPLibCall(Node, RTLIB::LOG10_F32, RTLIB::LOG10_F64, RTLIB::LOG10_F80,
+ RTLIB::LOG10_F128, RTLIB::LOG10_PPCF128, Results);
break;
case ISD::FEXP:
case ISD::STRICT_FEXP:
- if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_exp_finite))
- ExpandFPLibCall(Node, RTLIB::EXP_FINITE_F32,
- RTLIB::EXP_FINITE_F64,
- RTLIB::EXP_FINITE_F80,
- RTLIB::EXP_FINITE_F128,
- RTLIB::EXP_FINITE_PPCF128, Results);
- else
- ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64,
- RTLIB::EXP_F80, RTLIB::EXP_F128,
- RTLIB::EXP_PPCF128, Results);
+ ExpandFPLibCall(Node, RTLIB::EXP_F32, RTLIB::EXP_F64, RTLIB::EXP_F80,
+ RTLIB::EXP_F128, RTLIB::EXP_PPCF128, Results);
break;
case ISD::FEXP2:
case ISD::STRICT_FEXP2:
- if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_exp2_finite))
- ExpandFPLibCall(Node, RTLIB::EXP2_FINITE_F32,
- RTLIB::EXP2_FINITE_F64,
- RTLIB::EXP2_FINITE_F80,
- RTLIB::EXP2_FINITE_F128,
- RTLIB::EXP2_FINITE_PPCF128, Results);
- else
- ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64,
- RTLIB::EXP2_F80, RTLIB::EXP2_F128,
- RTLIB::EXP2_PPCF128, Results);
+ ExpandFPLibCall(Node, RTLIB::EXP2_F32, RTLIB::EXP2_F64, RTLIB::EXP2_F80,
+ RTLIB::EXP2_F128, RTLIB::EXP2_PPCF128, Results);
break;
case ISD::FTRUNC:
case ISD::STRICT_FTRUNC:
@@ -4079,6 +4100,14 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
RTLIB::ROUND_F128,
RTLIB::ROUND_PPCF128, Results);
break;
+ case ISD::FROUNDEVEN:
+ case ISD::STRICT_FROUNDEVEN:
+ ExpandFPLibCall(Node, RTLIB::ROUNDEVEN_F32,
+ RTLIB::ROUNDEVEN_F64,
+ RTLIB::ROUNDEVEN_F80,
+ RTLIB::ROUNDEVEN_F128,
+ RTLIB::ROUNDEVEN_PPCF128, Results);
+ break;
case ISD::FPOWI:
case ISD::STRICT_FPOWI: {
RTLIB::Libcall LC;
@@ -4107,16 +4136,8 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
}
case ISD::FPOW:
case ISD::STRICT_FPOW:
- if (CanUseFiniteLibCall && DAG.getLibInfo().has(LibFunc_pow_finite))
- ExpandFPLibCall(Node, RTLIB::POW_FINITE_F32,
- RTLIB::POW_FINITE_F64,
- RTLIB::POW_FINITE_F80,
- RTLIB::POW_FINITE_F128,
- RTLIB::POW_FINITE_PPCF128, Results);
- else
- ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64,
- RTLIB::POW_F80, RTLIB::POW_F128,
- RTLIB::POW_PPCF128, Results);
+ ExpandFPLibCall(Node, RTLIB::POW_F32, RTLIB::POW_F64, RTLIB::POW_F80,
+ RTLIB::POW_F128, RTLIB::POW_PPCF128, Results);
break;
case ISD::LROUND:
case ISD::STRICT_LROUND:
@@ -4181,6 +4202,17 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
Results.push_back(ExpandLibCall(RTLIB::FPEXT_F16_F32, Node, false));
}
break;
+ case ISD::STRICT_FP16_TO_FP: {
+ if (Node->getValueType(0) == MVT::f32) {
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp = TLI.makeLibCall(
+ DAG, RTLIB::FPEXT_F16_F32, MVT::f32, Node->getOperand(1), CallOptions,
+ SDLoc(Node), Node->getOperand(0));
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ }
+ break;
+ }
case ISD::FP_TO_FP16: {
RTLIB::Libcall LC =
RTLIB::getFPROUND(Node->getOperand(0).getValueType(), MVT::f16);
@@ -4188,6 +4220,19 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) {
Results.push_back(ExpandLibCall(LC, Node, false));
break;
}
+ case ISD::STRICT_FP_TO_FP16: {
+ RTLIB::Libcall LC =
+ RTLIB::getFPROUND(Node->getOperand(1).getValueType(), MVT::f16);
+ assert(LC != RTLIB::UNKNOWN_LIBCALL &&
+ "Unable to expand strict_fp_to_fp16");
+ TargetLowering::MakeLibCallOptions CallOptions;
+ std::pair<SDValue, SDValue> Tmp =
+ TLI.makeLibCall(DAG, LC, Node->getValueType(0), Node->getOperand(1),
+ CallOptions, SDLoc(Node), Node->getOperand(0));
+ Results.push_back(Tmp.first);
+ Results.push_back(Tmp.second);
+ break;
+ }
case ISD::FSUB:
case ISD::STRICT_FSUB:
ExpandFPLibCall(Node, RTLIB::SUB_F32, RTLIB::SUB_F64,
@@ -4289,8 +4334,13 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::CTLZ:
case ISD::CTLZ_ZERO_UNDEF:
case ISD::CTPOP:
- // Zero extend the argument.
- Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+ // Zero extend the argument unless it's cttz, then use any_extend.
+ if (Node->getOpcode() == ISD::CTTZ ||
+ Node->getOpcode() == ISD::CTTZ_ZERO_UNDEF)
+ Tmp1 = DAG.getNode(ISD::ANY_EXTEND, dl, NVT, Node->getOperand(0));
+ else
+ Tmp1 = DAG.getNode(ISD::ZERO_EXTEND, dl, NVT, Node->getOperand(0));
+
if (Node->getOpcode() == ISD::CTTZ) {
// The count is the same in the promoted type except if the original
// value was zero. This can be handled by setting the bit just off
@@ -4552,6 +4602,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) {
case ISD::FRINT:
case ISD::FNEARBYINT:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FTRUNC:
case ISD::FNEG:
case ISD::FSQRT:
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
index f191160dee4f..7e8ad28f9b14 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp
@@ -113,6 +113,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FRINT: R = SoftenFloatRes_FRINT(N); break;
case ISD::STRICT_FROUND:
case ISD::FROUND: R = SoftenFloatRes_FROUND(N); break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: R = SoftenFloatRes_FROUNDEVEN(N); break;
case ISD::STRICT_FSIN:
case ISD::FSIN: R = SoftenFloatRes_FSIN(N); break;
case ISD::STRICT_FSQRT:
@@ -125,6 +127,7 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) {
case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
case ISD::SELECT: R = SoftenFloatRes_SELECT(N); break;
case ISD::SELECT_CC: R = SoftenFloatRes_SELECT_CC(N); break;
+ case ISD::FREEZE: R = SoftenFloatRes_FREEZE(N); break;
case ISD::STRICT_SINT_TO_FP:
case ISD::STRICT_UINT_TO_FP:
case ISD::SINT_TO_FP:
@@ -184,6 +187,12 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_BITCAST(SDNode *N) {
return BitConvertToInteger(N->getOperand(0));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_FREEZE(SDNode *N) {
+ EVT Ty = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ return DAG.getNode(ISD::FREEZE, SDLoc(N), Ty,
+ GetSoftenedFloat(N->getOperand(0)));
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_MERGE_VALUES(SDNode *N,
unsigned ResNo) {
SDValue Op = DisintegrateMERGE_VALUES(N, ResNo);
@@ -609,6 +618,15 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FROUND(SDNode *N) {
RTLIB::ROUND_PPCF128));
}
+SDValue DAGTypeLegalizer::SoftenFloatRes_FROUNDEVEN(SDNode *N) {
+ return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUNDEVEN_F32,
+ RTLIB::ROUNDEVEN_F64,
+ RTLIB::ROUNDEVEN_F80,
+ RTLIB::ROUNDEVEN_F128,
+ RTLIB::ROUNDEVEN_PPCF128));
+}
+
SDValue DAGTypeLegalizer::SoftenFloatRes_FSIN(SDNode *N) {
return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
RTLIB::SIN_F32,
@@ -658,8 +676,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
if (L->getExtensionType() == ISD::NON_EXTLOAD) {
NewL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), NVT, dl,
L->getChain(), L->getBasePtr(), L->getOffset(),
- L->getPointerInfo(), NVT, L->getAlignment(), MMOFlags,
- L->getAAInfo());
+ L->getPointerInfo(), NVT, L->getOriginalAlign(),
+ MMOFlags, L->getAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -669,8 +687,8 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_LOAD(SDNode *N) {
// Do a non-extending load followed by FP_EXTEND.
NewL = DAG.getLoad(L->getAddressingMode(), ISD::NON_EXTLOAD, L->getMemoryVT(),
dl, L->getChain(), L->getBasePtr(), L->getOffset(),
- L->getPointerInfo(), L->getMemoryVT(), L->getAlignment(),
- MMOFlags, L->getAAInfo());
+ L->getPointerInfo(), L->getMemoryVT(),
+ L->getOriginalAlign(), MMOFlags, L->getAAInfo());
// Legalized the chain result - switch anything that used the old chain to
// use the new one.
ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
@@ -1166,10 +1184,13 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FPOW: ExpandFloatRes_FPOW(N, Lo, Hi); break;
case ISD::STRICT_FPOWI:
case ISD::FPOWI: ExpandFloatRes_FPOWI(N, Lo, Hi); break;
+ case ISD::FREEZE: ExpandFloatRes_FREEZE(N, Lo, Hi); break;
case ISD::STRICT_FRINT:
case ISD::FRINT: ExpandFloatRes_FRINT(N, Lo, Hi); break;
case ISD::STRICT_FROUND:
case ISD::FROUND: ExpandFloatRes_FROUND(N, Lo, Hi); break;
+ case ISD::STRICT_FROUNDEVEN:
+ case ISD::FROUNDEVEN: ExpandFloatRes_FROUNDEVEN(N, Lo, Hi); break;
case ISD::STRICT_FSIN:
case ISD::FSIN: ExpandFloatRes_FSIN(N, Lo, Hi); break;
case ISD::STRICT_FSQRT:
@@ -1459,6 +1480,17 @@ void DAGTypeLegalizer::ExpandFloatRes_FPOWI(SDNode *N,
RTLIB::POWI_PPCF128), Lo, Hi);
}
+void DAGTypeLegalizer::ExpandFloatRes_FREEZE(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ assert(N->getValueType(0) == MVT::ppcf128 &&
+ "Logic only correct for ppcf128!");
+
+ SDLoc dl(N);
+ GetExpandedFloat(N->getOperand(0), Lo, Hi);
+ Lo = DAG.getNode(ISD::FREEZE, dl, Lo.getValueType(), Lo);
+ Hi = DAG.getNode(ISD::FREEZE, dl, Hi.getValueType(), Hi);
+}
+
void DAGTypeLegalizer::ExpandFloatRes_FREM(SDNode *N,
SDValue &Lo, SDValue &Hi) {
ExpandFloatRes_Binary(N, GetFPLibCall(N->getValueType(0),
@@ -1485,6 +1517,16 @@ void DAGTypeLegalizer::ExpandFloatRes_FROUND(SDNode *N,
RTLIB::ROUND_PPCF128), Lo, Hi);
}
+void DAGTypeLegalizer::ExpandFloatRes_FROUNDEVEN(SDNode *N,
+ SDValue &Lo, SDValue &Hi) {
+ ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
+ RTLIB::ROUNDEVEN_F32,
+ RTLIB::ROUNDEVEN_F64,
+ RTLIB::ROUNDEVEN_F80,
+ RTLIB::ROUNDEVEN_F128,
+ RTLIB::ROUNDEVEN_PPCF128), Lo, Hi);
+}
+
void DAGTypeLegalizer::ExpandFloatRes_FSIN(SDNode *N,
SDValue &Lo, SDValue &Hi) {
ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0),
@@ -2117,6 +2159,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) {
case ISD::FNEG:
case ISD::FRINT:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
@@ -2328,12 +2371,10 @@ SDValue DAGTypeLegalizer::PromoteFloatRes_LOAD(SDNode *N) {
// Load the value as an integer value with the same number of bits.
EVT IVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
- SDValue newL = DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), IVT,
- SDLoc(N), L->getChain(), L->getBasePtr(),
- L->getOffset(), L->getPointerInfo(), IVT,
- L->getAlignment(),
- L->getMemOperand()->getFlags(),
- L->getAAInfo());
+ SDValue newL = DAG.getLoad(
+ L->getAddressingMode(), L->getExtensionType(), IVT, SDLoc(N),
+ L->getChain(), L->getBasePtr(), L->getOffset(), L->getPointerInfo(), IVT,
+ L->getOriginalAlign(), L->getMemOperand()->getFlags(), L->getAAInfo());
// Legalize the chain result by replacing uses of the old value chain with the
// new one
ReplaceValueWith(SDValue(N, 1), newL.getValue(1));
@@ -2412,3 +2453,421 @@ SDValue DAGTypeLegalizer::BitcastToInt_ATOMIC_SWAP(SDNode *N) {
}
+//===----------------------------------------------------------------------===//
+// Half Result Soft Promotion
+//===----------------------------------------------------------------------===//
+
+void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) {
+ LLVM_DEBUG(dbgs() << "Soft promote half result " << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n");
+ SDValue R = SDValue();
+
+ // See if the target wants to custom expand this node.
+ if (CustomLowerNode(N, N->getValueType(ResNo), true)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom expanded, done\n");
+ return;
+ }
+
+ switch (N->getOpcode()) {
+ default:
+#ifndef NDEBUG
+ dbgs() << "SoftPromoteHalfResult #" << ResNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+#endif
+ llvm_unreachable("Do not know how to soft promote this operator's result!");
+
+ case ISD::BITCAST: R = SoftPromoteHalfRes_BITCAST(N); break;
+ case ISD::ConstantFP: R = SoftPromoteHalfRes_ConstantFP(N); break;
+ case ISD::EXTRACT_VECTOR_ELT:
+ R = SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(N); break;
+ case ISD::FCOPYSIGN: R = SoftPromoteHalfRes_FCOPYSIGN(N); break;
+ case ISD::STRICT_FP_ROUND:
+ case ISD::FP_ROUND: R = SoftPromoteHalfRes_FP_ROUND(N); break;
+
+ // Unary FP Operations
+ case ISD::FABS:
+ case ISD::FCBRT:
+ case ISD::FCEIL:
+ case ISD::FCOS:
+ case ISD::FEXP:
+ case ISD::FEXP2:
+ case ISD::FFLOOR:
+ case ISD::FLOG:
+ case ISD::FLOG2:
+ case ISD::FLOG10:
+ case ISD::FNEARBYINT:
+ case ISD::FNEG:
+ case ISD::FREEZE:
+ case ISD::FRINT:
+ case ISD::FROUND:
+ case ISD::FROUNDEVEN:
+ case ISD::FSIN:
+ case ISD::FSQRT:
+ case ISD::FTRUNC:
+ case ISD::FCANONICALIZE: R = SoftPromoteHalfRes_UnaryOp(N); break;
+
+ // Binary FP Operations
+ case ISD::FADD:
+ case ISD::FDIV:
+ case ISD::FMAXIMUM:
+ case ISD::FMINIMUM:
+ case ISD::FMAXNUM:
+ case ISD::FMINNUM:
+ case ISD::FMUL:
+ case ISD::FPOW:
+ case ISD::FREM:
+ case ISD::FSUB: R = SoftPromoteHalfRes_BinOp(N); break;
+
+ case ISD::FMA: // FMA is the same as FMAD
+ case ISD::FMAD: R = SoftPromoteHalfRes_FMAD(N); break;
+
+ case ISD::FPOWI: R = SoftPromoteHalfRes_FPOWI(N); break;
+
+ case ISD::LOAD: R = SoftPromoteHalfRes_LOAD(N); break;
+ case ISD::SELECT: R = SoftPromoteHalfRes_SELECT(N); break;
+ case ISD::SELECT_CC: R = SoftPromoteHalfRes_SELECT_CC(N); break;
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: R = SoftPromoteHalfRes_XINT_TO_FP(N); break;
+ case ISD::UNDEF: R = SoftPromoteHalfRes_UNDEF(N); break;
+ case ISD::ATOMIC_SWAP: R = BitcastToInt_ATOMIC_SWAP(N); break;
+ }
+
+ if (R.getNode())
+ SetSoftPromotedHalf(SDValue(N, ResNo), R);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BITCAST(SDNode *N) {
+ return BitConvertToInteger(N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_ConstantFP(SDNode *N) {
+ ConstantFPSDNode *CN = cast<ConstantFPSDNode>(N);
+
+ // Get the (bit-cast) APInt of the APFloat and build an integer constant
+ return DAG.getConstant(CN->getValueAPF().bitcastToAPInt(), SDLoc(CN),
+ MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N) {
+ SDValue NewOp = BitConvertVectorToIntegerVector(N->getOperand(0));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N),
+ NewOp.getValueType().getVectorElementType(), NewOp,
+ N->getOperand(1));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FCOPYSIGN(SDNode *N) {
+ SDValue LHS = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue RHS = BitConvertToInteger(N->getOperand(1));
+ SDLoc dl(N);
+
+ EVT LVT = LHS.getValueType();
+ EVT RVT = RHS.getValueType();
+
+ unsigned LSize = LVT.getSizeInBits();
+ unsigned RSize = RVT.getSizeInBits();
+
+ // First get the sign bit of second operand.
+ SDValue SignBit = DAG.getNode(
+ ISD::SHL, dl, RVT, DAG.getConstant(1, dl, RVT),
+ DAG.getConstant(RSize - 1, dl,
+ TLI.getShiftAmountTy(RVT, DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::AND, dl, RVT, RHS, SignBit);
+
+ // Shift right, or any-extend and shift left, if the two operands have different sizes.
+ int SizeDiff = RVT.getSizeInBits() - LVT.getSizeInBits();
+ if (SizeDiff > 0) {
+ SignBit =
+ DAG.getNode(ISD::SRL, dl, RVT, SignBit,
+ DAG.getConstant(SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ SignBit = DAG.getNode(ISD::TRUNCATE, dl, LVT, SignBit);
+ } else if (SizeDiff < 0) {
+ SignBit = DAG.getNode(ISD::ANY_EXTEND, dl, LVT, SignBit);
+ SignBit =
+ DAG.getNode(ISD::SHL, dl, LVT, SignBit,
+ DAG.getConstant(-SizeDiff, dl,
+ TLI.getShiftAmountTy(SignBit.getValueType(),
+ DAG.getDataLayout())));
+ }
+
+ // Clear the sign bit of the first operand.
+ SDValue Mask = DAG.getNode(
+ ISD::SHL, dl, LVT, DAG.getConstant(1, dl, LVT),
+ DAG.getConstant(LSize - 1, dl,
+ TLI.getShiftAmountTy(LVT, DAG.getDataLayout())));
+ Mask = DAG.getNode(ISD::SUB, dl, LVT, Mask, DAG.getConstant(1, dl, LVT));
+ LHS = DAG.getNode(ISD::AND, dl, LVT, LHS, Mask);
+
+ // Or the value with the sign bit.
+ return DAG.getNode(ISD::OR, dl, LVT, LHS, SignBit);
+}
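Aside (worked example, not part of the patch): when both operands are already i16, the shift gymnastics above reduce to plain masking. Sketch:

    #include <cstdint>

    // copysign on raw IEEE-754 half bit patterns.
    uint16_t copysign_f16_bits(uint16_t Mag, uint16_t Sgn) {
      return (uint16_t)((Mag & 0x7FFF) | (Sgn & 0x8000));  // magnitude | sign
    }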
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FMAD(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+ Op2 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op2);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1, Op2);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FPOWI(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_FP_ROUND(SDNode *N) {
+ if (N->isStrictFPOpcode()) {
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP_TO_FP16, SDLoc(N), {MVT::i16, MVT::Other},
+ {N->getOperand(0), N->getOperand(1)});
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
+ }
+
+ return DAG.getNode(ISD::FP_TO_FP16, SDLoc(N), MVT::i16, N->getOperand(0));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_LOAD(SDNode *N) {
+ LoadSDNode *L = cast<LoadSDNode>(N);
+
+ // Load the value as an integer value with the same number of bits.
+ assert(L->getExtensionType() == ISD::NON_EXTLOAD && "Unexpected extension!");
+ SDValue NewL =
+ DAG.getLoad(L->getAddressingMode(), L->getExtensionType(), MVT::i16,
+ SDLoc(N), L->getChain(), L->getBasePtr(), L->getOffset(),
+ L->getPointerInfo(), MVT::i16, L->getOriginalAlign(),
+ L->getMemOperand()->getFlags(), L->getAAInfo());
+ // Legalize the chain result by replacing uses of the old value chain with the
+ // new one
+ ReplaceValueWith(SDValue(N, 1), NewL.getValue(1));
+ return NewL;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT(SDNode *N) {
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ return DAG.getSelect(SDLoc(N), Op1.getValueType(), N->getOperand(0), Op1,
+ Op2);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_SELECT_CC(SDNode *N) {
+ SDValue Op2 = GetSoftPromotedHalf(N->getOperand(2));
+ SDValue Op3 = GetSoftPromotedHalf(N->getOperand(3));
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), Op2.getValueType(),
+ N->getOperand(0), N->getOperand(1), Op2, Op3,
+ N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_XINT_TO_FP(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDLoc dl(N);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, N->getOperand(0));
+
+ // Round the value to the softened type.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UNDEF(SDNode *N) {
+ return DAG.getUNDEF(MVT::i16);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_UnaryOp(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op = GetSoftPromotedHalf(N->getOperand(0));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) {
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+ SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
+ SDLoc dl(N);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);
+
+ // Convert back to FP16 as an integer.
+ return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
+}
+
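Aside (conceptual sketch, not part of the patch): every SoftPromoteHalfRes_* above follows the same shape — the f16 value is carried around as an i16, widened to f32 with FP16_TO_FP, operated on, then narrowed back with FP_TO_FP16. A hedged scalar analogue; half_to_float and float_to_half are hypothetical stand-ins for whatever those nodes lower to (typically the compiler-rt half-conversion libcalls):

    #include <cstdint>

    // Hypothetical helpers standing in for ISD::FP16_TO_FP / ISD::FP_TO_FP16.
    float half_to_float(uint16_t Bits);
    uint16_t float_to_half(float F);

    // Soft-promoted f16 addition: the half value never exists as a hardware FP type.
    uint16_t soft_fadd_f16(uint16_t A, uint16_t B) {
      float Wide = half_to_float(A) + half_to_float(B);  // do the math in f32
      return float_to_half(Wide);                        // round back to half bits
    }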
+//===----------------------------------------------------------------------===//
+// Half Operand Soft Promotion
+//===----------------------------------------------------------------------===//
+
+bool DAGTypeLegalizer::SoftPromoteHalfOperand(SDNode *N, unsigned OpNo) {
+ LLVM_DEBUG(dbgs() << "Soft promote half operand " << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n");
+ SDValue Res = SDValue();
+
+ if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
+ LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
+ return false;
+ }
+
+ // Nodes that use a promotion-requiring floating point operand, but don't
+ // produce a soft promotion-requiring floating point result, need to be
+ // legalized to use the soft promoted float operand. Nodes that produce at
+ // least one soft promotion-requiring floating point result have their
+ // operands legalized as a part of PromoteFloatResult.
+ switch (N->getOpcode()) {
+ default:
+ #ifndef NDEBUG
+ dbgs() << "SoftPromoteHalfOperand Op #" << OpNo << ": ";
+ N->dump(&DAG); dbgs() << "\n";
+ #endif
+ llvm_unreachable("Do not know how to soft promote this operator's operand!");
+
+ case ISD::BITCAST: Res = SoftPromoteHalfOp_BITCAST(N); break;
+ case ISD::FCOPYSIGN: Res = SoftPromoteHalfOp_FCOPYSIGN(N, OpNo); break;
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: Res = SoftPromoteHalfOp_FP_TO_XINT(N); break;
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: Res = SoftPromoteHalfOp_FP_EXTEND(N); break;
+ case ISD::SELECT_CC: Res = SoftPromoteHalfOp_SELECT_CC(N, OpNo); break;
+ case ISD::SETCC: Res = SoftPromoteHalfOp_SETCC(N); break;
+ case ISD::STORE: Res = SoftPromoteHalfOp_STORE(N, OpNo); break;
+ }
+
+ if (!Res.getNode())
+ return false;
+
+ assert(Res.getNode() != N && "Expected a new node!");
+
+ assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ "Invalid operand expansion");
+
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return false;
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_BITCAST(SDNode *N) {
+ SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
+
+ return DAG.getNode(ISD::BITCAST, SDLoc(N), N->getValueType(0), Op0);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FCOPYSIGN(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 1 && "Only Operand 1 must need promotion here");
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op1.getValueType());
+
+ Op1 = GetSoftPromotedHalf(Op1);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), N->getOperand(0),
+ Op1);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_EXTEND(SDNode *N) {
+ bool IsStrict = N->isStrictFPOpcode();
+ SDValue Op = GetSoftPromotedHalf(N->getOperand(IsStrict ? 1 : 0));
+
+ if (IsStrict) {
+ SDValue Res =
+ DAG.getNode(ISD::STRICT_FP16_TO_FP, SDLoc(N),
+ {N->getValueType(0), MVT::Other}, {N->getOperand(0), Op});
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ ReplaceValueWith(SDValue(N, 0), Res);
+ return SDValue();
+ }
+
+ return DAG.getNode(ISD::FP16_TO_FP, SDLoc(N), N->getValueType(0), Op);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_FP_TO_XINT(SDNode *N) {
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+
+ Op = GetSoftPromotedHalf(Op);
+
+ SDValue Res = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op);
+
+ return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Res);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SELECT_CC(SDNode *N,
+ unsigned OpNo) {
+ assert(OpNo == 0 && "Can only soften the comparison values");
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+ Op0 = GetSoftPromotedHalf(Op0);
+ Op1 = GetSoftPromotedHalf(Op1);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ return DAG.getNode(ISD::SELECT_CC, SDLoc(N), N->getValueType(0), Op0, Op1,
+ N->getOperand(2), N->getOperand(3), N->getOperand(4));
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_SETCC(SDNode *N) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ ISD::CondCode CCCode = cast<CondCodeSDNode>(N->getOperand(2))->get();
+ SDLoc dl(N);
+
+ EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op0.getValueType());
+
+ Op0 = GetSoftPromotedHalf(Op0);
+ Op1 = GetSoftPromotedHalf(Op1);
+
+ // Promote to the larger FP type.
+ Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
+ Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);
+
+ return DAG.getSetCC(SDLoc(N), N->getValueType(0), Op0, Op1, CCCode);
+}
+
+SDValue DAGTypeLegalizer::SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo) {
+ assert(OpNo == 1 && "Can only soften the stored value!");
+ StoreSDNode *ST = cast<StoreSDNode>(N);
+ SDValue Val = ST->getValue();
+ SDLoc dl(N);
+
+ assert(!ST->isTruncatingStore() && "Unexpected truncating store.");
+ SDValue Promoted = GetSoftPromotedHalf(Val);
+ return DAG.getStore(ST->getChain(), dl, Promoted, ST->getBasePtr(),
+ ST->getMemOperand());
+}
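
Editorial note, not part of the patch: the operand handlers above are the mirror image of the SoftPromoteHalfRes_* handlers declared later in LegalizeTypes.h. In both directions an f16 value is carried as a bare i16, widened with ISD::FP16_TO_FP before it is used, and narrowed back with ISD::FP_TO_FP16 afterwards. A result-side binary-op handler would look roughly like this (a sketch, not quoted from the patch):

SDValue DAGTypeLegalizer::SoftPromoteHalfRes_BinOp(SDNode *N) {
  EVT OVT = N->getValueType(0);
  EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), OVT);
  SDValue Op0 = GetSoftPromotedHalf(N->getOperand(0));
  SDValue Op1 = GetSoftPromotedHalf(N->getOperand(1));
  SDLoc dl(N);

  // Widen both i16-carried halves to the transformed FP type (typically f32).
  Op0 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op0);
  Op1 = DAG.getNode(ISD::FP16_TO_FP, dl, NVT, Op1);

  SDValue Res = DAG.getNode(N->getOpcode(), dl, NVT, Op0, Op1);

  // Narrow back to an i16 carrying the f16 bit pattern.
  return DAG.getNode(ISD::FP_TO_FP16, dl, MVT::i16, Res);
}
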
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
index 0e46f8d68f83..74071f763dbf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp
@@ -91,6 +91,7 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::TRUNCATE: Res = PromoteIntRes_TRUNCATE(N); break;
case ISD::UNDEF: Res = PromoteIntRes_UNDEF(N); break;
case ISD::VAARG: Res = PromoteIntRes_VAARG(N); break;
+ case ISD::VSCALE: Res = PromoteIntRes_VSCALE(N); break;
case ISD::EXTRACT_SUBVECTOR:
Res = PromoteIntRes_EXTRACT_SUBVECTOR(N); break;
@@ -161,7 +162,9 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIXSAT: Res = PromoteIntRes_MULFIX(N); break;
case ISD::SDIVFIX:
- case ISD::UDIVFIX: Res = PromoteIntRes_DIVFIX(N); break;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: Res = PromoteIntRes_DIVFIX(N); break;
case ISD::ABS: Res = PromoteIntRes_ABS(N); break;
@@ -198,6 +201,10 @@ void DAGTypeLegalizer::PromoteIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::VECREDUCE_UMIN:
Res = PromoteIntRes_VECREDUCE(N);
break;
+
+ case ISD::FREEZE:
+ Res = PromoteIntRes_FREEZE(N);
+ break;
}
// If the result is null then the sub-method took care of registering it.
@@ -271,8 +278,24 @@ SDValue DAGTypeLegalizer::PromoteIntRes_AtomicCmpSwap(AtomicSDNode *N,
return Res.getValue(1);
}
- SDValue Op2 = GetPromotedInteger(N->getOperand(2));
+ // Op2 is used for the comparison and thus must be extended according to the
+ // target's atomic operations. Op3 is merely stored and so can be left alone.
+ SDValue Op2 = N->getOperand(2);
SDValue Op3 = GetPromotedInteger(N->getOperand(3));
+ switch (TLI.getExtendForAtomicCmpSwapArg()) {
+ case ISD::SIGN_EXTEND:
+ Op2 = SExtPromotedInteger(Op2);
+ break;
+ case ISD::ZERO_EXTEND:
+ Op2 = ZExtPromotedInteger(Op2);
+ break;
+ case ISD::ANY_EXTEND:
+ Op2 = GetPromotedInteger(Op2);
+ break;
+ default:
+ llvm_unreachable("Invalid atomic op extension");
+ }
+
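// Editorial aside, not part of the patch: a target opts into a particular
// extension here through the TargetLowering hook queried by the switch above.
// The target class name below is hypothetical; only the hook itself comes
// from the code.
//
//   ISD::NodeType MyTargetLowering::getExtendForAtomicCmpSwapArg() const {
//     // The hardware compares the full register, so the promoted comparison
//     // value must be sign-extended to match what the cmpxchg loop sees.
//     return ISD::SIGN_EXTEND;
//   }
//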
SDVTList VTs =
DAG.getVTList(Op2.getValueType(), N->getValueType(1), MVT::Other);
SDValue Res = DAG.getAtomicCmpSwap(
@@ -303,6 +326,9 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
case TargetLowering::TypeSoftenFloat:
// Promote the integer operand by hand.
return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftenedFloat(InOp));
+ case TargetLowering::TypeSoftPromoteHalf:
+ // Promote the integer operand by hand.
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, GetSoftPromotedHalf(InOp));
case TargetLowering::TypePromoteFloat: {
// Convert the promoted float by hand.
if (!NOutVT.isVector())
@@ -318,6 +344,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT,
BitConvertToInteger(GetScalarizedVector(InOp)));
break;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
case TargetLowering::TypeSplitVector: {
if (!NOutVT.isVector()) {
// For example, i32 = BITCAST v2i16 on alpha. Convert the split
@@ -370,9 +398,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_BITCAST(SDNode *N) {
OutVT.getVectorNumElements() * Scale);
if (isTypeLegal(WideOutVT)) {
InOp = DAG.getBitcast(WideOutVT, GetWidenedVector(InOp));
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OutVT, InOp,
- DAG.getConstant(0, dl, IdxTy));
+ DAG.getVectorIdxConstant(0, dl));
return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, InOp);
}
}
@@ -396,6 +423,12 @@ static EVT getShiftAmountTyForConstant(EVT VT, const TargetLowering &TLI,
return ShiftVT;
}
+SDValue DAGTypeLegalizer::PromoteIntRes_FREEZE(SDNode *N) {
+ SDValue V = GetPromotedInteger(N->getOperand(0));
+ return DAG.getNode(ISD::FREEZE, SDLoc(N),
+ V.getValueType(), V);
+}
+
SDValue DAGTypeLegalizer::PromoteIntRes_BSWAP(SDNode *N) {
SDValue Op = GetPromotedInteger(N->getOperand(0));
EVT OVT = N->getValueType(0);
@@ -558,7 +591,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_FLT_ROUNDS(SDNode *N) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
SDLoc dl(N);
- return DAG.getNode(N->getOpcode(), dl, NVT);
+ SDValue Res =
+ DAG.getNode(N->getOpcode(), dl, {NVT, MVT::Other}, N->getOperand(0));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Res.getValue(1));
+ return Res;
}
SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
@@ -578,8 +617,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_INT_EXTEND(SDNode *N) {
return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, NVT, Res,
DAG.getValueType(N->getOperand(0).getValueType()));
if (N->getOpcode() == ISD::ZERO_EXTEND)
- return DAG.getZeroExtendInReg(Res, dl,
- N->getOperand(0).getValueType().getScalarType());
+ return DAG.getZeroExtendInReg(Res, dl, N->getOperand(0).getValueType());
assert(N->getOpcode() == ISD::ANY_EXTEND && "Unknown integer extension!");
return Res;
}
@@ -781,22 +819,51 @@ SDValue DAGTypeLegalizer::PromoteIntRes_MULFIX(SDNode *N) {
N->getOperand(2));
}
+static SDValue SaturateWidenedDIVFIX(SDValue V, SDLoc &dl,
+ unsigned SatW, bool Signed,
+ const TargetLowering &TLI,
+ SelectionDAG &DAG) {
+ EVT VT = V.getValueType();
+ unsigned VTW = VT.getScalarSizeInBits();
+
+ if (!Signed) {
+ // Saturate to the unsigned maximum by getting the minimum of V and the
+ // maximum.
+ return DAG.getNode(ISD::UMIN, dl, VT, V,
+ DAG.getConstant(APInt::getLowBitsSet(VTW, SatW),
+ dl, VT));
+ }
+
+ // Saturate to the signed maximum (the low SatW - 1 bits) by taking the
+ // signed minimum of it and V.
+ V = DAG.getNode(ISD::SMIN, dl, VT, V,
+ DAG.getConstant(APInt::getLowBitsSet(VTW, SatW - 1),
+ dl, VT));
+ // Saturate to the signed minimum (the high VTW - SatW + 1 bits) by taking
+ // the signed maximum of it and V.
+ V = DAG.getNode(ISD::SMAX, dl, VT, V,
+ DAG.getConstant(APInt::getHighBitsSet(VTW, VTW - SatW + 1),
+ dl, VT));
+ return V;
+}
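
A worked check of the constants above (editorial; VTW and SatW are chosen purely for illustration):

// Clamping a value widened to VTW = 32 bits back into a SatW = 16-bit range:
unsigned VTW = 32, SatW = 16;
APInt UnsignedMax = APInt::getLowBitsSet(VTW, SatW);            // 0x0000FFFF
APInt SignedMax   = APInt::getLowBitsSet(VTW, SatW - 1);        // 0x00007FFF
APInt SignedMin   = APInt::getHighBitsSet(VTW, VTW - SatW + 1); // 0xFFFF8000
// A UMIN against UnsignedMax, or an SMIN/SMAX pair against SignedMax and
// SignedMin, pins the widened result to exactly the values representable in
// the original 16-bit type.
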
+
static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
- unsigned Scale, const TargetLowering &TLI,
- SelectionDAG &DAG) {
+ unsigned Scale, const TargetLowering &TLI,
+ SelectionDAG &DAG, unsigned SatW = 0) {
EVT VT = LHS.getValueType();
- bool Signed = N->getOpcode() == ISD::SDIVFIX;
+ unsigned VTSize = VT.getScalarSizeInBits();
+ bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+ N->getOpcode() == ISD::SDIVFIXSAT;
+ bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+ N->getOpcode() == ISD::UDIVFIXSAT;
SDLoc dl(N);
- // See if we can perform the division in this type without widening.
- if (SDValue V = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
- DAG))
- return V;
-
- // If that didn't work, double the type width and try again. That must work,
- // or something is wrong.
- EVT WideVT = EVT::getIntegerVT(*DAG.getContext(),
- VT.getScalarSizeInBits() * 2);
+ // Widen the types by a factor of two. The expansion is then guaranteed to
+ // succeed, since the widened LHS always has enough spare high bits to shift
+ // the scale into.
+ EVT WideVT = EVT::getIntegerVT(*DAG.getContext(), VTSize * 2);
+ if (VT.isVector())
+ WideVT = EVT::getVectorVT(*DAG.getContext(), WideVT,
+ VT.getVectorElementCount());
if (Signed) {
LHS = DAG.getSExtOrTrunc(LHS, dl, WideVT);
RHS = DAG.getSExtOrTrunc(RHS, dl, WideVT);
@@ -805,18 +872,28 @@ static SDValue earlyExpandDIVFIX(SDNode *N, SDValue LHS, SDValue RHS,
RHS = DAG.getZExtOrTrunc(RHS, dl, WideVT);
}
- // TODO: Saturation.
-
SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, LHS, RHS, Scale,
DAG);
assert(Res && "Expanding DIVFIX with wide type failed?");
+ if (Saturating) {
+ // If the caller has told us to saturate at something less, use that width
+ // instead of the type before doubling. However, it cannot be more than
+ // what we just widened!
+ assert(SatW <= VTSize &&
+ "Tried to saturate to more than the original type?");
+ Res = SaturateWidenedDIVFIX(Res, dl, SatW == 0 ? VTSize : SatW, Signed,
+ TLI, DAG);
+ }
return DAG.getZExtOrTrunc(Res, dl, VT);
}
SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
SDLoc dl(N);
SDValue Op1Promoted, Op2Promoted;
- bool Signed = N->getOpcode() == ISD::SDIVFIX;
+ bool Signed = N->getOpcode() == ISD::SDIVFIX ||
+ N->getOpcode() == ISD::SDIVFIXSAT;
+ bool Saturating = N->getOpcode() == ISD::SDIVFIXSAT ||
+ N->getOpcode() == ISD::UDIVFIXSAT;
if (Signed) {
Op1Promoted = SExtPromotedInteger(N->getOperand(0));
Op2Promoted = SExtPromotedInteger(N->getOperand(1));
@@ -827,23 +904,41 @@ SDValue DAGTypeLegalizer::PromoteIntRes_DIVFIX(SDNode *N) {
EVT PromotedType = Op1Promoted.getValueType();
unsigned Scale = N->getConstantOperandVal(2);
- SDValue Res;
// If the type is already legal and the operation is legal in that type, we
// should not early expand.
if (TLI.isTypeLegal(PromotedType)) {
TargetLowering::LegalizeAction Action =
TLI.getFixedPointOperationAction(N->getOpcode(), PromotedType, Scale);
- if (Action == TargetLowering::Legal || Action == TargetLowering::Custom)
- Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
- Op2Promoted, N->getOperand(2));
+ if (Action == TargetLowering::Legal || Action == TargetLowering::Custom) {
+ EVT ShiftTy = TLI.getShiftAmountTy(PromotedType, DAG.getDataLayout());
+ unsigned Diff = PromotedType.getScalarSizeInBits() -
+ N->getValueType(0).getScalarSizeInBits();
+ if (Saturating)
+ Op1Promoted = DAG.getNode(ISD::SHL, dl, PromotedType, Op1Promoted,
+ DAG.getConstant(Diff, dl, ShiftTy));
+ SDValue Res = DAG.getNode(N->getOpcode(), dl, PromotedType, Op1Promoted,
+ Op2Promoted, N->getOperand(2));
+ if (Saturating)
+ Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, dl, PromotedType, Res,
+ DAG.getConstant(Diff, dl, ShiftTy));
+ return Res;
+ }
}
- if (!Res)
- Res = earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG);
-
- // TODO: Saturation.
-
- return Res;
+ // See if we can perform the division in this type without expanding.
+ if (SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, Op1Promoted,
+ Op2Promoted, Scale, DAG)) {
+ if (Saturating)
+ Res = SaturateWidenedDIVFIX(Res, dl,
+ N->getValueType(0).getScalarSizeInBits(),
+ Signed, TLI, DAG);
+ return Res;
+ }
+ // If we cannot, expand it to twice the type width. If we are saturating, give
+ // it the original width as a saturating width so we don't need to emit
+ // two saturations.
+ return earlyExpandDIVFIX(N, Op1Promoted, Op2Promoted, Scale, TLI, DAG,
+ N->getValueType(0).getScalarSizeInBits());
}
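
To make the shift trick above concrete (the widths here are an editorial example, not from a test): for a saturating i8 divide promoted to i32, Diff is 24 and the handler emits

//   Res = SRA( sdivfixsat.i32( SHL(Op1Promoted, 24), Op2Promoted, Scale ), 24 )
//
// Scaling the numerator up by 2^24 means the i32 operation saturates at
// INT32_MAX / INT32_MIN exactly where the original i8 operation would have
// saturated at INT8_MAX / INT8_MIN: shifting the clamped result right again
// gives 0x7FFFFFFF >> 24 = 127 and 0x80000000 >> 24 = -128 (arithmetic
// shift; the unsigned case uses SRL), so no separate saturation is needed.
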
SDValue DAGTypeLegalizer::PromoteIntRes_SADDSUBO(SDNode *N, unsigned ResNo) {
@@ -1048,8 +1143,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_TRUNCATE(SDNode *N) {
SDValue WideExt = DAG.getNode(ISD::ZERO_EXTEND, dl, ExtVT, WideTrunc);
// Extract the low NVT subvector.
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue ZeroIdx = DAG.getConstant(0, dl, IdxTy);
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, dl);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, WideExt, ZeroIdx);
}
}
@@ -1076,7 +1170,7 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UADDSUBO(SDNode *N, unsigned ResNo) {
// Calculate the overflow flag: zero extend the arithmetic result from
// the original type.
- SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT.getScalarType());
+ SDValue Ofl = DAG.getZeroExtendInReg(Res, dl, OVT);
// Overflowed if and only if this is not equal to Res.
Ofl = DAG.getSetCC(dl, N->getValueType(1), Ofl, Res, ISD::SETNE);
@@ -1181,6 +1275,13 @@ SDValue DAGTypeLegalizer::PromoteIntRes_UNDEF(SDNode *N) {
N->getValueType(0)));
}
+SDValue DAGTypeLegalizer::PromoteIntRes_VSCALE(SDNode *N) {
+ EVT VT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
+
+ APInt MulImm = cast<ConstantSDNode>(N->getOperand(0))->getAPIntValue();
+ return DAG.getVScale(SDLoc(N), VT, MulImm.sextOrSelf(VT.getSizeInBits()));
+}
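
A small editorial illustration of the VSCALE promotion above (the types are assumed, not taken from a test):

//   before: t1: i8  = vscale Constant:i8<-4>
//   after : t1: i32 = vscale Constant:i32<-4>
//
// The multiplier is sign-extended to the promoted width (APInt::sextOrSelf)
// and the node is rebuilt with DAG.getVScale, so negative scalable offsets
// keep their value across promotion.
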
+
SDValue DAGTypeLegalizer::PromoteIntRes_VAARG(SDNode *N) {
SDValue Chain = N->getOperand(0); // Get the chain.
SDValue Ptr = N->getOperand(1); // Get the pointer.
@@ -1233,7 +1334,6 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
LLVM_DEBUG(dbgs() << "Promote integer operand: "; N->dump(&DAG);
dbgs() << "\n");
SDValue Res = SDValue();
-
if (CustomLowerNode(N, N->getOperand(OpNo).getValueType(), false)) {
LLVM_DEBUG(dbgs() << "Node has been custom lowered, done\n");
return false;
@@ -1307,7 +1407,9 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
- case ISD::UDIVFIX: Res = PromoteIntOp_FIX(N); break;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: Res = PromoteIntOp_FIX(N); break;
case ISD::FPOWI: Res = PromoteIntOp_FPOWI(N); break;
@@ -1330,10 +1432,17 @@ bool DAGTypeLegalizer::PromoteIntegerOperand(SDNode *N, unsigned OpNo) {
if (Res.getNode() == N)
return true;
- assert(Res.getValueType() == N->getValueType(0) && N->getNumValues() == 1 &&
+ const bool IsStrictFp = N->isStrictFPOpcode();
+ assert(Res.getValueType() == N->getValueType(0) &&
+ N->getNumValues() == (IsStrictFp ? 2 : 1) &&
"Invalid operand expansion");
+ LLVM_DEBUG(dbgs() << "Replacing: "; N->dump(&DAG); dbgs() << " with: ";
+ Res.dump());
ReplaceValueWith(SDValue(N, 0), Res);
+ if (IsStrictFp)
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res.getNode(), 1));
+
return false;
}
@@ -1614,7 +1723,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MLOAD(MaskedLoadSDNode *N,
SDValue Mask = PromoteTargetBoolean(N->getOperand(OpNo), DataVT);
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
NewOps[OpNo] = Mask;
- return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N)
+ return SDValue(Res, 0);
+
+ // The update triggered CSE; do our own replacement, since the caller can't.
+ ReplaceValueWith(SDValue(N, 0), SDValue(Res, 0));
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res, 1));
+ return SDValue();
}
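
The return-value dance above (repeated for PromoteIntOp_MGATHER below) exists because UpdateNodeOperands may hand back a pre-existing node when CSE kicks in; the caller's usual bookkeeping cannot run in that case, so the handler replaces both results of N itself and signals completion with an empty SDValue. The idiom in isolation (a sketch; NewOps is assumed to hold the already-promoted operands):

SDNode *Updated = DAG.UpdateNodeOperands(N, NewOps);
if (Updated == N)
  return SDValue(Updated, 0); // operands updated in place; caller finishes up

// CSE folded N into an existing node: replace the loaded value (result 0)
// and the chain (result 1) ourselves, and return nothing for the caller.
ReplaceValueWith(SDValue(N, 0), SDValue(Updated, 0));
ReplaceValueWith(SDValue(N, 1), SDValue(Updated, 1));
return SDValue();
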
SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
@@ -1635,7 +1751,14 @@ SDValue DAGTypeLegalizer::PromoteIntOp_MGATHER(MaskedGatherSDNode *N,
} else
NewOps[OpNo] = GetPromotedInteger(N->getOperand(OpNo));
- return SDValue(DAG.UpdateNodeOperands(N, NewOps), 0);
+ SDNode *Res = DAG.UpdateNodeOperands(N, NewOps);
+ if (Res == N)
+ return SDValue(Res, 0);
+
+ // The update triggered CSE; do our own replacement, since the caller can't.
+ ReplaceValueWith(SDValue(N, 0), SDValue(Res, 0));
+ ReplaceValueWith(SDValue(N, 1), SDValue(Res, 1));
+ return SDValue();
}
SDValue DAGTypeLegalizer::PromoteIntOp_MSCATTER(MaskedScatterSDNode *N,
@@ -1676,8 +1799,7 @@ SDValue DAGTypeLegalizer::PromoteIntOp_ZERO_EXTEND(SDNode *N) {
SDLoc dl(N);
SDValue Op = GetPromotedInteger(N->getOperand(0));
Op = DAG.getNode(ISD::ANY_EXTEND, dl, N->getValueType(0), Op);
- return DAG.getZeroExtendInReg(Op, dl,
- N->getOperand(0).getValueType().getScalarType());
+ return DAG.getZeroExtendInReg(Op, dl, N->getOperand(0).getValueType());
}
SDValue DAGTypeLegalizer::PromoteIntOp_ADDSUBCARRY(SDNode *N, unsigned OpNo) {
@@ -1786,6 +1908,7 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::SELECT: SplitRes_SELECT(N, Lo, Hi); break;
case ISD::SELECT_CC: SplitRes_SELECT_CC(N, Lo, Hi); break;
case ISD::UNDEF: SplitRes_UNDEF(N, Lo, Hi); break;
+ case ISD::FREEZE: SplitRes_FREEZE(N, Lo, Hi); break;
case ISD::BITCAST: ExpandRes_BITCAST(N, Lo, Hi); break;
case ISD::BUILD_PAIR: ExpandRes_BUILD_PAIR(N, Lo, Hi); break;
@@ -1908,7 +2031,9 @@ void DAGTypeLegalizer::ExpandIntegerResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIXSAT: ExpandIntRes_MULFIX(N, Lo, Hi); break;
case ISD::SDIVFIX:
- case ISD::UDIVFIX: ExpandIntRes_DIVFIX(N, Lo, Hi); break;
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: ExpandIntRes_DIVFIX(N, Lo, Hi); break;
case ISD::VECREDUCE_ADD:
case ISD::VECREDUCE_MUL:
@@ -2666,10 +2791,15 @@ void DAGTypeLegalizer::ExpandIntRes_FLT_ROUNDS(SDNode *N, SDValue &Lo,
unsigned NBitWidth = NVT.getSizeInBits();
EVT ShiftAmtTy = TLI.getShiftAmountTy(NVT, DAG.getDataLayout());
- Lo = DAG.getNode(ISD::FLT_ROUNDS_, dl, NVT);
+ Lo = DAG.getNode(ISD::FLT_ROUNDS_, dl, {NVT, MVT::Other}, N->getOperand(0));
+ SDValue Chain = Lo.getValue(1);
// The high part is the sign of Lo, as -1 is a valid value for FLT_ROUNDS
Hi = DAG.getNode(ISD::SRA, dl, NVT, Lo,
DAG.getConstant(NBitWidth - 1, dl, ShiftAmtTy));
+
+ // Legalize the chain result - switch anything that used the old chain to
+ // use the new one.
+ ReplaceValueWith(SDValue(N, 1), Chain);
}
void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
@@ -2683,6 +2813,12 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_SINT(SDNode *N, SDValue &Lo,
if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
Op = GetPromotedFloat(Op);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+ Op = GetSoftPromotedHalf(Op);
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ }
+
RTLIB::Libcall LC = RTLIB::getFPTOSINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-sint conversion!");
TargetLowering::MakeLibCallOptions CallOptions;
@@ -2706,6 +2842,12 @@ void DAGTypeLegalizer::ExpandIntRes_FP_TO_UINT(SDNode *N, SDValue &Lo,
if (getTypeAction(Op.getValueType()) == TargetLowering::TypePromoteFloat)
Op = GetPromotedFloat(Op);
+ if (getTypeAction(Op.getValueType()) == TargetLowering::TypeSoftPromoteHalf) {
+ EVT NFPVT = TLI.getTypeToTransformTo(*DAG.getContext(), Op.getValueType());
+ Op = GetSoftPromotedHalf(Op);
+ Op = DAG.getNode(ISD::FP16_TO_FP, dl, NFPVT, Op);
+ }
+
RTLIB::Libcall LC = RTLIB::getFPTOUINT(Op.getValueType(), VT);
assert(LC != RTLIB::UNKNOWN_LIBCALL && "Unexpected fp-to-uint conversion!");
TargetLowering::MakeLibCallOptions CallOptions;
@@ -2800,7 +2942,6 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
ISD::LoadExtType ExtType = N->getExtensionType();
- unsigned Alignment = N->getAlignment();
MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDLoc dl(N);
@@ -2811,7 +2952,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
EVT MemVT = N->getMemoryVT();
Lo = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(), MemVT,
- Alignment, MMOFlags, AAInfo);
+ N->getOriginalAlign(), MMOFlags, AAInfo);
// Remember the chain.
Ch = Lo.getValue(1);
@@ -2833,8 +2974,8 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
}
} else if (DAG.getDataLayout().isLittleEndian()) {
// Little-endian - low bits are at low addresses.
- Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
- AAInfo);
+ Lo = DAG.getLoad(NVT, dl, Ch, Ptr, N->getPointerInfo(),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
unsigned ExcessBits =
N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -2845,7 +2986,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
Ptr = DAG.getMemBasePlusOffset(Ptr, IncrementSize, dl);
Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
- MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ N->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -2863,7 +3004,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
Hi = DAG.getExtLoad(ExtType, dl, NVT, Ch, Ptr, N->getPointerInfo(),
EVT::getIntegerVT(*DAG.getContext(),
MemVT.getSizeInBits() - ExcessBits),
- Alignment, MMOFlags, AAInfo);
+ N->getOriginalAlign(), MMOFlags, AAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getMemBasePlusOffset(Ptr, IncrementSize, dl);
@@ -2871,7 +3012,7 @@ void DAGTypeLegalizer::ExpandIntRes_LOAD(LoadSDNode *N,
Lo = DAG.getExtLoad(ISD::ZEXTLOAD, dl, NVT, Ch, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
- MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ N->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -3226,8 +3367,15 @@ void DAGTypeLegalizer::ExpandIntRes_MULFIX(SDNode *N, SDValue &Lo,
void DAGTypeLegalizer::ExpandIntRes_DIVFIX(SDNode *N, SDValue &Lo,
SDValue &Hi) {
- SDValue Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
- N->getConstantOperandVal(2), TLI, DAG);
+ SDLoc dl(N);
+ // Try expanding in the existing type first.
+ SDValue Res = TLI.expandFixedPointDiv(N->getOpcode(), dl, N->getOperand(0),
+ N->getOperand(1),
+ N->getConstantOperandVal(2), DAG);
+
+ if (!Res)
+ Res = earlyExpandDIVFIX(N, N->getOperand(0), N->getOperand(1),
+ N->getConstantOperandVal(2), TLI, DAG);
SplitInteger(Res, Lo, Hi);
}
@@ -4071,7 +4219,6 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
- unsigned Alignment = N->getAlignment();
MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDLoc dl(N);
@@ -4082,15 +4229,16 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
if (N->getMemoryVT().bitsLE(NVT)) {
GetExpandedInteger(N->getValue(), Lo, Hi);
return DAG.getTruncStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
- N->getMemoryVT(), Alignment, MMOFlags, AAInfo);
+ N->getMemoryVT(), N->getOriginalAlign(), MMOFlags,
+ AAInfo);
}
if (DAG.getDataLayout().isLittleEndian()) {
// Little-endian - low bits are at low addresses.
GetExpandedInteger(N->getValue(), Lo, Hi);
- Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
- AAInfo);
+ Lo = DAG.getStore(Ch, dl, Lo, Ptr, N->getPointerInfo(),
+ N->getOriginalAlign(), MMOFlags, AAInfo);
unsigned ExcessBits =
N->getMemoryVT().getSizeInBits() - NVT.getSizeInBits();
@@ -4099,9 +4247,9 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
// Increment the pointer to the other half.
unsigned IncrementSize = NVT.getSizeInBits()/8;
Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
- Hi = DAG.getTruncStore(
- Ch, dl, Hi, Ptr, N->getPointerInfo().getWithOffset(IncrementSize), NEVT,
- MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr,
+ N->getPointerInfo().getWithOffset(IncrementSize),
+ NEVT, N->getOriginalAlign(), MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -4129,8 +4277,8 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
}
// Store both the high bits and maybe some of the low bits.
- Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT, Alignment,
- MMOFlags, AAInfo);
+ Hi = DAG.getTruncStore(Ch, dl, Hi, Ptr, N->getPointerInfo(), HiVT,
+ N->getOriginalAlign(), MMOFlags, AAInfo);
// Increment the pointer to the other half.
Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
@@ -4138,7 +4286,7 @@ SDValue DAGTypeLegalizer::ExpandIntOp_STORE(StoreSDNode *N, unsigned OpNo) {
Lo = DAG.getTruncStore(Ch, dl, Lo, Ptr,
N->getPointerInfo().getWithOffset(IncrementSize),
EVT::getIntegerVT(*DAG.getContext(), ExcessBits),
- MinAlign(Alignment, IncrementSize), MMOFlags, AAInfo);
+ N->getOriginalAlign(), MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -4186,18 +4334,43 @@ SDValue DAGTypeLegalizer::PromoteIntRes_EXTRACT_SUBVECTOR(SDNode *N) {
EVT OutVT = N->getValueType(0);
EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT);
assert(NOutVT.isVector() && "This type must be promoted to a vector type");
- unsigned OutNumElems = OutVT.getVectorNumElements();
EVT NOutVTElem = NOutVT.getVectorElementType();
SDLoc dl(N);
SDValue BaseIdx = N->getOperand(1);
+ // TODO: We may be able to use this for types other than scalable
+ // vectors and fix those tests that expect BUILD_VECTOR to be used
+ if (OutVT.isScalableVector()) {
+ SDValue InOp0 = N->getOperand(0);
+ EVT InVT = InOp0.getValueType();
+
+ // Promote the operands and see if this is handled by target lowering;
+ // otherwise, fall back to the BUILD_VECTOR approach below.
+ if (getTypeAction(InVT) == TargetLowering::TypePromoteInteger) {
+ // Collect the (promoted) operands
+ SDValue Ops[] = { GetPromotedInteger(InOp0), BaseIdx };
+
+ EVT PromEltVT = Ops[0].getValueType().getVectorElementType();
+ assert(PromEltVT.bitsLE(NOutVTElem) &&
+ "Promoted operand has an element type greater than result");
+
+ EVT ExtVT = NOutVT.changeVectorElementType(PromEltVT);
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), ExtVT, Ops);
+ return DAG.getNode(ISD::ANY_EXTEND, dl, NOutVT, Ext);
+ }
+ }
+
+ if (OutVT.isScalableVector())
+ report_fatal_error("Unable to promote scalable types using BUILD_VECTOR");
+
SDValue InOp0 = N->getOperand(0);
if (getTypeAction(InOp0.getValueType()) == TargetLowering::TypePromoteInteger)
InOp0 = GetPromotedInteger(N->getOperand(0));
EVT InVT = InOp0.getValueType();
+ unsigned OutNumElems = OutVT.getVectorNumElements();
SmallVector<SDValue, 8> Ops;
Ops.reserve(OutNumElems);
for (unsigned i = 0; i != OutNumElems; ++i) {
@@ -4319,9 +4492,8 @@ SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) {
"Unexpected number of elements");
for (unsigned j = 0; j < NumElem; ++j) {
- SDValue Ext = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Op,
- DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Op,
+ DAG.getVectorIdxConstant(j, dl));
Ops[i * NumElem + j] = DAG.getAnyExtOrTrunc(Ext, dl, OutElemTy);
}
}
@@ -4429,9 +4601,8 @@ SDValue DAGTypeLegalizer::PromoteIntOp_CONCAT_VECTORS(SDNode *N) {
for (unsigned i=0; i<NumElem; ++i) {
// Extract element from incoming vector
- SDValue Ex = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Incoming,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, SclrTy, Incoming,
+ DAG.getVectorIdxConstant(i, dl));
SDValue Tr = DAG.getNode(ISD::TRUNCATE, dl, RetSclrTy, Ex);
NewOps.push_back(Tr);
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
index 63ddb59fce68..ae087d3bbd8c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.cpp
@@ -124,6 +124,8 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
Mapped |= 128;
if (ResId && PromotedFloats.find(ResId) != PromotedFloats.end())
Mapped |= 256;
+ if (ResId && SoftPromotedHalfs.find(ResId) != SoftPromotedHalfs.end())
+ Mapped |= 512;
if (Node.getNodeId() != Processed) {
// Since we allow ReplacedValues to map deleted nodes, it may map nodes
@@ -168,12 +170,15 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
dbgs() << " WidenedVectors";
if (Mapped & 256)
dbgs() << " PromotedFloats";
+ if (Mapped & 512)
+ dbgs() << " SoftPromoteHalfs";
dbgs() << "\n";
llvm_unreachable(nullptr);
}
}
}
+#ifndef NDEBUG
// Checked that NewNodes are only used by other NewNodes.
for (unsigned i = 0, e = NewNodes.size(); i != e; ++i) {
SDNode *N = NewNodes[i];
@@ -181,6 +186,7 @@ void DAGTypeLegalizer::PerformExpensiveChecks() {
UI != UE; ++UI)
assert(UI->getNodeId() == NewNode && "NewNode used by non-NewNode!");
}
+#endif
}
/// This is the main entry point for the type legalizer. This does a top-down
@@ -239,6 +245,9 @@ bool DAGTypeLegalizer::run() {
case TargetLowering::TypeLegal:
LLVM_DEBUG(dbgs() << "Legal result type\n");
break;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error(
+ "Scalarization of scalable vectors is not supported.");
// The following calls must take care of *all* of the node's results,
// not just the illegal result they were passed (this includes results
// with a legal type). Results can be remapped using ReplaceValueWith,
@@ -276,6 +285,10 @@ bool DAGTypeLegalizer::run() {
PromoteFloatResult(N, i);
Changed = true;
goto NodeDone;
+ case TargetLowering::TypeSoftPromoteHalf:
+ SoftPromoteHalfResult(N, i);
+ Changed = true;
+ goto NodeDone;
}
}
@@ -297,6 +310,9 @@ ScanOperands:
case TargetLowering::TypeLegal:
LLVM_DEBUG(dbgs() << "Legal operand\n");
continue;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error(
+ "Scalarization of scalable vectors is not supported.");
// The following calls must either replace all of the node's results
// using ReplaceValueWith, and return "false"; or update the node's
// operands in place, and return "true".
@@ -332,6 +348,10 @@ ScanOperands:
NeedsReanalyzing = PromoteFloatOperand(N, i);
Changed = true;
break;
+ case TargetLowering::TypeSoftPromoteHalf:
+ NeedsReanalyzing = SoftPromoteHalfOperand(N, i);
+ Changed = true;
+ break;
}
break;
}
@@ -719,6 +739,16 @@ void DAGTypeLegalizer::SetPromotedFloat(SDValue Op, SDValue Result) {
OpIdEntry = getTableId(Result);
}
+void DAGTypeLegalizer::SetSoftPromotedHalf(SDValue Op, SDValue Result) {
+ assert(Result.getValueType() == MVT::i16 &&
+ "Invalid type for soft-promoted half");
+ AnalyzeNewValue(Result);
+
+ auto &OpIdEntry = SoftPromotedHalfs[getTableId(Op)];
+ assert((OpIdEntry == 0) && "Node is already promoted!");
+ OpIdEntry = getTableId(Result);
+}
+
void DAGTypeLegalizer::SetScalarizedVector(SDValue Op, SDValue Result) {
// Note that in some cases vector operation operands may be greater than
// the vector element type. For example BUILD_VECTOR of type <1 x i1> with
@@ -805,9 +835,9 @@ void DAGTypeLegalizer::GetSplitVector(SDValue Op, SDValue &Lo,
void DAGTypeLegalizer::SetSplitVector(SDValue Op, SDValue Lo,
SDValue Hi) {
assert(Lo.getValueType().getVectorElementType() ==
- Op.getValueType().getVectorElementType() &&
- 2*Lo.getValueType().getVectorNumElements() ==
- Op.getValueType().getVectorNumElements() &&
+ Op.getValueType().getVectorElementType() &&
+ Lo.getValueType().getVectorElementCount() * 2 ==
+ Op.getValueType().getVectorElementCount() &&
Hi.getValueType() == Lo.getValueType() &&
"Invalid type for split vector");
// Lo/Hi may have been newly allocated, if so, add nodeid's as relevant.
@@ -859,12 +889,19 @@ SDValue DAGTypeLegalizer::CreateStackStoreLoad(SDValue Op,
SDLoc dl(Op);
// Create the stack frame object. Make sure it is aligned for both
// the source and destination types.
- SDValue StackPtr = DAG.CreateStackTemporary(Op.getValueType(), DestVT);
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align DestAlign = DAG.getReducedAlign(DestVT, /*UseABI=*/false);
+ Align OpAlign = DAG.getReducedAlign(Op.getValueType(), /*UseABI=*/false);
+ Align Align = std::max(DestAlign, OpAlign);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(Op.getValueType().getStoreSize(), Align);
// Emit a store to the stack slot.
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr, MachinePointerInfo());
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Op, StackPtr,
+ MachinePointerInfo(), Align);
// Result is a load from the stack slot.
- return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo());
+ return DAG.getLoad(DestVT, dl, Store, StackPtr, MachinePointerInfo(), Align);
}
/// Replace the node's results with custom code provided by the target and
@@ -890,17 +927,6 @@ bool DAGTypeLegalizer::CustomLowerNode(SDNode *N, EVT VT, bool LegalizeResult) {
// The target didn't want to custom lower it after all.
return false;
- // When called from DAGTypeLegalizer::ExpandIntegerResult, we might need to
- // provide the same kind of custom splitting behavior.
- if (Results.size() == N->getNumValues() + 1 && LegalizeResult) {
- // We've legalized a return type by splitting it. If there is a chain,
- // replace that too.
- SetExpandedInteger(SDValue(N, 0), Results[0], Results[1]);
- if (N->getNumValues() > 1)
- ReplaceValueWith(SDValue(N, 1), Results[2]);
- return true;
- }
-
// Make everything that once used N's values now use those in Results instead.
assert(Results.size() == N->getNumValues() &&
"Custom lowering returned the wrong number of results!");
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
index faae14444d51..0fa6d653a836 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h
@@ -109,6 +109,10 @@ private:
/// supported precision, this map indicates what promoted value to use.
SmallDenseMap<TableId, TableId, 8> PromotedFloats;
+ /// For floating-point nodes that have a smaller precision than the smallest
+ /// supported precision, this map indicates the converted value to use.
+ SmallDenseMap<TableId, TableId, 8> SoftPromotedHalfs;
+
/// For float nodes that need to be expanded this map indicates which operands
/// are the expanded version of the input.
SmallDenseMap<TableId, std::pair<TableId, TableId>, 8> ExpandedFloats;
@@ -155,7 +159,9 @@ private:
const SDValue &getSDValue(TableId &Id) {
RemapId(Id);
assert(Id && "TableId should be non-zero");
- return IdToValueMap[Id];
+ auto I = IdToValueMap.find(Id);
+ assert(I != IdToValueMap.end() && "cannot find Id in map");
+ return I->second;
}
public:
@@ -172,24 +178,30 @@ public:
bool run();
void NoteDeletion(SDNode *Old, SDNode *New) {
+ assert(Old != New && "node replaced with self");
for (unsigned i = 0, e = Old->getNumValues(); i != e; ++i) {
TableId NewId = getTableId(SDValue(New, i));
TableId OldId = getTableId(SDValue(Old, i));
- if (OldId != NewId)
+ if (OldId != NewId) {
ReplacedValues[OldId] = NewId;
- // Delete Node from tables.
+ // Delete the node from the tables. We cannot do this when OldId == NewId,
+ // because ReplacedValues may still contain references to NewId.
+ IdToValueMap.erase(OldId);
+ PromotedIntegers.erase(OldId);
+ ExpandedIntegers.erase(OldId);
+ SoftenedFloats.erase(OldId);
+ PromotedFloats.erase(OldId);
+ SoftPromotedHalfs.erase(OldId);
+ ExpandedFloats.erase(OldId);
+ ScalarizedVectors.erase(OldId);
+ SplitVectors.erase(OldId);
+ WidenedVectors.erase(OldId);
+ }
+
ValueToIdMap.erase(SDValue(Old, i));
- IdToValueMap.erase(OldId);
- PromotedIntegers.erase(OldId);
- ExpandedIntegers.erase(OldId);
- SoftenedFloats.erase(OldId);
- PromotedFloats.erase(OldId);
- ExpandedFloats.erase(OldId);
- ScalarizedVectors.erase(OldId);
- SplitVectors.erase(OldId);
- WidenedVectors.erase(OldId);
}
}
@@ -260,7 +272,7 @@ private:
EVT OldVT = Op.getValueType();
SDLoc dl(Op);
Op = GetPromotedInteger(Op);
- return DAG.getZeroExtendInReg(Op, dl, OldVT.getScalarType());
+ return DAG.getZeroExtendInReg(Op, dl, OldVT);
}
// Get a promoted operand and sign or zero extend it to the final size
@@ -274,7 +286,7 @@ private:
if (TLI.isSExtCheaperThanZExt(OldVT, Op.getValueType()))
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), Op,
DAG.getValueType(OldVT));
- return DAG.getZeroExtendInReg(Op, DL, OldVT.getScalarType());
+ return DAG.getZeroExtendInReg(Op, DL, OldVT);
}
// Integer Result Promotion.
@@ -304,6 +316,7 @@ private:
SDValue PromoteIntRes_EXTRACT_VECTOR_ELT(SDNode *N);
SDValue PromoteIntRes_FP_TO_XINT(SDNode *N);
SDValue PromoteIntRes_FP_TO_FP16(SDNode *N);
+ SDValue PromoteIntRes_FREEZE(SDNode *N);
SDValue PromoteIntRes_INT_EXTEND(SDNode *N);
SDValue PromoteIntRes_LOAD(LoadSDNode *N);
SDValue PromoteIntRes_MLOAD(MaskedLoadSDNode *N);
@@ -326,6 +339,7 @@ private:
SDValue PromoteIntRes_ADDSUBCARRY(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_UNDEF(SDNode *N);
SDValue PromoteIntRes_VAARG(SDNode *N);
+ SDValue PromoteIntRes_VSCALE(SDNode *N);
SDValue PromoteIntRes_XMULO(SDNode *N, unsigned ResNo);
SDValue PromoteIntRes_ADDSUBSAT(SDNode *N);
SDValue PromoteIntRes_MULFIX(SDNode *N);
@@ -512,9 +526,11 @@ private:
SDValue SoftenFloatRes_FP_ROUND(SDNode *N);
SDValue SoftenFloatRes_FPOW(SDNode *N);
SDValue SoftenFloatRes_FPOWI(SDNode *N);
+ SDValue SoftenFloatRes_FREEZE(SDNode *N);
SDValue SoftenFloatRes_FREM(SDNode *N);
SDValue SoftenFloatRes_FRINT(SDNode *N);
SDValue SoftenFloatRes_FROUND(SDNode *N);
+ SDValue SoftenFloatRes_FROUNDEVEN(SDNode *N);
SDValue SoftenFloatRes_FSIN(SDNode *N);
SDValue SoftenFloatRes_FSQRT(SDNode *N);
SDValue SoftenFloatRes_FSUB(SDNode *N);
@@ -584,9 +600,11 @@ private:
void ExpandFloatRes_FP_EXTEND (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FPOW (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FPOWI (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FREM (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FRINT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FROUND (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void ExpandFloatRes_FROUNDEVEN(SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSIN (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSQRT (SDNode *N, SDValue &Lo, SDValue &Hi);
void ExpandFloatRes_FSUB (SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -651,6 +669,43 @@ private:
SDValue PromoteFloatOp_SETCC(SDNode *N, unsigned OpNo);
//===--------------------------------------------------------------------===//
+ // Half soft promotion support: LegalizeFloatTypes.cpp
+ //===--------------------------------------------------------------------===//
+
+ SDValue GetSoftPromotedHalf(SDValue Op) {
+ TableId &PromotedId = SoftPromotedHalfs[getTableId(Op)];
+ SDValue PromotedOp = getSDValue(PromotedId);
+ assert(PromotedOp.getNode() && "Operand wasn't promoted?");
+ return PromotedOp;
+ }
+ void SetSoftPromotedHalf(SDValue Op, SDValue Result);
+
+ void SoftPromoteHalfResult(SDNode *N, unsigned ResNo);
+ SDValue SoftPromoteHalfRes_BinOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_BITCAST(SDNode *N);
+ SDValue SoftPromoteHalfRes_ConstantFP(SDNode *N);
+ SDValue SoftPromoteHalfRes_EXTRACT_VECTOR_ELT(SDNode *N);
+ SDValue SoftPromoteHalfRes_FCOPYSIGN(SDNode *N);
+ SDValue SoftPromoteHalfRes_FMAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_FPOWI(SDNode *N);
+ SDValue SoftPromoteHalfRes_FP_ROUND(SDNode *N);
+ SDValue SoftPromoteHalfRes_LOAD(SDNode *N);
+ SDValue SoftPromoteHalfRes_SELECT(SDNode *N);
+ SDValue SoftPromoteHalfRes_SELECT_CC(SDNode *N);
+ SDValue SoftPromoteHalfRes_UnaryOp(SDNode *N);
+ SDValue SoftPromoteHalfRes_XINT_TO_FP(SDNode *N);
+ SDValue SoftPromoteHalfRes_UNDEF(SDNode *N);
+
+ bool SoftPromoteHalfOperand(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_BITCAST(SDNode *N);
+ SDValue SoftPromoteHalfOp_FCOPYSIGN(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_FP_EXTEND(SDNode *N);
+ SDValue SoftPromoteHalfOp_FP_TO_XINT(SDNode *N);
+ SDValue SoftPromoteHalfOp_SETCC(SDNode *N);
+ SDValue SoftPromoteHalfOp_SELECT_CC(SDNode *N, unsigned OpNo);
+ SDValue SoftPromoteHalfOp_STORE(SDNode *N, unsigned OpNo);
+
+ //===--------------------------------------------------------------------===//
// Scalarization Support: LegalizeVectorTypes.cpp
//===--------------------------------------------------------------------===//
@@ -721,6 +776,11 @@ private:
void GetSplitVector(SDValue Op, SDValue &Lo, SDValue &Hi);
void SetSplitVector(SDValue Op, SDValue Lo, SDValue Hi);
+ // Helper function for incrementing the pointer when splitting
+ // memory operations
+ void IncrementPointer(MemSDNode *N, EVT MemVT,
+ MachinePointerInfo &MPI, SDValue &Ptr);
+
// Vector Result Splitting: <128 x ty> -> 2 x <64 x ty>.
void SplitVectorResult(SDNode *N, unsigned ResNo);
void SplitVecRes_BinOp(SDNode *N, SDValue &Lo, SDValue &Hi);
@@ -918,6 +978,7 @@ private:
void SplitRes_SELECT (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_SELECT_CC (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitRes_UNDEF (SDNode *N, SDValue &Lo, SDValue &Hi);
+ void SplitRes_FREEZE (SDNode *N, SDValue &Lo, SDValue &Hi);
void SplitVSETCC(const SDNode *N);
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
index c45c62cabc05..9cd3b8f76d6c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp
@@ -50,6 +50,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
case TargetLowering::TypePromoteInteger:
break;
case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
llvm_unreachable("Bitcast of a promotion-needing float should never need"
"expansion");
case TargetLowering::TypeSoftenFloat:
@@ -82,6 +83,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
Lo = DAG.getNode(ISD::BITCAST, dl, NOutVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, NOutVT, Hi);
return;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
case TargetLowering::TypeWidenVector: {
assert(!(InVT.getVectorNumElements() & 1) && "Unsupported BITCAST");
InOp = GetWidenedVector(InOp);
@@ -119,9 +122,8 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
SmallVector<SDValue, 8> Vals;
for (unsigned i = 0; i < NumElems; ++i)
- Vals.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, ElemVT, CastInOp,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ Vals.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ElemVT,
+ CastInOp, DAG.getVectorIdxConstant(i, dl)));
// Build Lo, Hi pair by pairing extracted elements if needed.
unsigned Slot = 0;
@@ -154,9 +156,13 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Create the stack frame object. Make sure it is aligned for both
// the source and expanded destination types.
- unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(
- NOutVT.getTypeForEVT(*DAG.getContext()));
- SDValue StackPtr = DAG.CreateStackTemporary(InVT, Alignment);
+
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align InAlign = DAG.getReducedAlign(InVT, /*UseABI=*/false);
+ Align NOutAlign = DAG.getReducedAlign(NOutVT, /*UseABI=*/false);
+ Align Align = std::max(InAlign, NOutAlign);
+ SDValue StackPtr = DAG.CreateStackTemporary(InVT.getStoreSize(), Align);
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
@@ -165,7 +171,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, InOp, StackPtr, PtrInfo);
// Load the first half from the stack slot.
- Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo);
+ Lo = DAG.getLoad(NOutVT, dl, Store, StackPtr, PtrInfo, NOutAlign);
// Increment the pointer to the other half.
unsigned IncrementSize = NOutVT.getSizeInBits() / 8;
@@ -173,8 +179,7 @@ void DAGTypeLegalizer::ExpandRes_BITCAST(SDNode *N, SDValue &Lo, SDValue &Hi) {
// Load the second half from the stack slot.
Hi = DAG.getLoad(NOutVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize),
- MinAlign(Alignment, IncrementSize));
+ PtrInfo.getWithOffset(IncrementSize), NOutAlign);
// Handle endianness of the load.
if (TLI.hasBigEndianPartOrdering(OutVT, DAG.getDataLayout()))
@@ -251,21 +256,20 @@ void DAGTypeLegalizer::ExpandRes_NormalLoad(SDNode *N, SDValue &Lo,
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
- unsigned Alignment = LD->getAlignment();
AAMDNodes AAInfo = LD->getAAInfo();
assert(NVT.isByteSized() && "Expanded type not byte sized!");
- Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(), Alignment,
- LD->getMemOperand()->getFlags(), AAInfo);
+ Lo = DAG.getLoad(NVT, dl, Chain, Ptr, LD->getPointerInfo(),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(),
+ AAInfo);
// Increment the pointer to the other half.
unsigned IncrementSize = NVT.getSizeInBits() / 8;
Ptr = DAG.getMemBasePlusOffset(Ptr, IncrementSize, dl);
- Hi = DAG.getLoad(NVT, dl, Chain, Ptr,
- LD->getPointerInfo().getWithOffset(IncrementSize),
- MinAlign(Alignment, IncrementSize),
- LD->getMemOperand()->getFlags(), AAInfo);
+ Hi = DAG.getLoad(
+ NVT, dl, Chain, Ptr, LD->getPointerInfo().getWithOffset(IncrementSize),
+ LD->getOriginalAlign(), LD->getMemOperand()->getFlags(), AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -462,7 +466,6 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
EVT NVT = TLI.getTypeToTransformTo(*DAG.getContext(), ValueVT);
SDValue Chain = St->getChain();
SDValue Ptr = St->getBasePtr();
- unsigned Alignment = St->getAlignment();
AAMDNodes AAInfo = St->getAAInfo();
assert(NVT.isByteSized() && "Expanded type not byte sized!");
@@ -474,14 +477,14 @@ SDValue DAGTypeLegalizer::ExpandOp_NormalStore(SDNode *N, unsigned OpNo) {
if (TLI.hasBigEndianPartOrdering(ValueVT, DAG.getDataLayout()))
std::swap(Lo, Hi);
- Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(), Alignment,
- St->getMemOperand()->getFlags(), AAInfo);
+ Lo = DAG.getStore(Chain, dl, Lo, Ptr, St->getPointerInfo(),
+ St->getOriginalAlign(), St->getMemOperand()->getFlags(),
+ AAInfo);
Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
- Hi = DAG.getStore(Chain, dl, Hi, Ptr,
- St->getPointerInfo().getWithOffset(IncrementSize),
- MinAlign(Alignment, IncrementSize),
- St->getMemOperand()->getFlags(), AAInfo);
+ Hi = DAG.getStore(
+ Chain, dl, Hi, Ptr, St->getPointerInfo().getWithOffset(IncrementSize),
+ St->getOriginalAlign(), St->getMemOperand()->getFlags(), AAInfo);
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Lo, Hi);
}
@@ -558,3 +561,12 @@ void DAGTypeLegalizer::SplitRes_UNDEF(SDNode *N, SDValue &Lo, SDValue &Hi) {
Lo = DAG.getUNDEF(LoVT);
Hi = DAG.getUNDEF(HiVT);
}
+
+void DAGTypeLegalizer::SplitRes_FREEZE(SDNode *N, SDValue &Lo, SDValue &Hi) {
+ SDValue L, H;
+ SDLoc dl(N);
+ GetSplitOp(N->getOperand(0), L, H);
+
+ Lo = DAG.getNode(ISD::FREEZE, dl, L.getValueType(), L);
+ Hi = DAG.getNode(ISD::FREEZE, dl, H.getValueType(), H);
+}
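
An editorial example of the FREEZE splitting above: for an i64 FREEZE being expanded on a 32-bit target, each half is frozen independently and the pair is registered through the normal expanded-integer bookkeeping.

//   before: t2: i64 = freeze t1
//   after : lo: i32 = freeze t1.lo
//           hi: i32 = freeze t1.hi
//
// Freezing the halves separately is sound: each half independently becomes a
// fixed (arbitrary) value, and together they form one fixed i64, which is all
// freeze guarantees.
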
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
index 7d0b1ee6ae07..6409f924920d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp
@@ -142,9 +142,10 @@ class VectorLegalizer {
void ExpandUADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandSADDSUBO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void ExpandMULO(SDNode *Node, SmallVectorImpl<SDValue> &Results);
- SDValue ExpandFixedPointDiv(SDNode *Node);
+ void ExpandFixedPointDiv(SDNode *Node, SmallVectorImpl<SDValue> &Results);
SDValue ExpandStrictFPOp(SDNode *Node);
void ExpandStrictFPOp(SDNode *Node, SmallVectorImpl<SDValue> &Results);
+ void ExpandREM(SDNode *Node, SmallVectorImpl<SDValue> &Results);
void UnrollStrictFPOp(SDNode *Node, SmallVectorImpl<SDValue> &Results);
@@ -182,9 +183,7 @@ bool VectorLegalizer::Run() {
E = std::prev(DAG.allnodes_end()); I != std::next(E); ++I) {
// Check if the values of the nodes contain vectors. We don't need to check
// the operands because we are going to check their values at some point.
- for (SDNode::value_iterator J = I->value_begin(), E = I->value_end();
- J != E; ++J)
- HasVectors |= J->isVector();
+ HasVectors = llvm::any_of(I->values(), [](EVT T) { return T.isVector(); });
// If we found a vector node we can start the legalization.
if (HasVectors)
@@ -318,12 +317,10 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
}
}
- bool HasVectorValueOrOp = false;
- for (auto J = Node->value_begin(), E = Node->value_end(); J != E; ++J)
- HasVectorValueOrOp |= J->isVector();
- for (const SDValue &Oper : Node->op_values())
- HasVectorValueOrOp |= Oper.getValueType().isVector();
-
+ bool HasVectorValueOrOp =
+ llvm::any_of(Node->values(), [](EVT T) { return T.isVector(); }) ||
+ llvm::any_of(Node->op_values(),
+ [](SDValue O) { return O.getValueType().isVector(); });
if (!HasVectorValueOrOp)
return TranslateLegalizeResults(Op, Node);
@@ -339,7 +336,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
if (Action == TargetLowering::Legal)
Action = TargetLowering::Expand;
break;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
ValVT = Node->getValueType(0);
@@ -431,6 +428,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::FRINT:
case ISD::FNEARBYINT:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FFLOOR:
case ISD::FP_ROUND:
case ISD::FP_EXTEND:
@@ -463,7 +461,9 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
- case ISD::UDIVFIX: {
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT: {
unsigned Scale = Node->getConstantOperandVal(2);
Action = TLI.getFixedPointOperationAction(Node->getOpcode(),
Node->getValueType(0), Scale);
@@ -704,132 +704,7 @@ void VectorLegalizer::PromoteFP_TO_INT(SDNode *Node,
std::pair<SDValue, SDValue> VectorLegalizer::ExpandLoad(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
-
- EVT SrcVT = LD->getMemoryVT();
- EVT SrcEltVT = SrcVT.getScalarType();
- unsigned NumElem = SrcVT.getVectorNumElements();
-
- SDValue NewChain;
- SDValue Value;
- if (SrcVT.getVectorNumElements() > 1 && !SrcEltVT.isByteSized()) {
- SDLoc dl(N);
-
- SmallVector<SDValue, 8> Vals;
- SmallVector<SDValue, 8> LoadChains;
-
- EVT DstEltVT = LD->getValueType(0).getScalarType();
- SDValue Chain = LD->getChain();
- SDValue BasePTR = LD->getBasePtr();
- ISD::LoadExtType ExtType = LD->getExtensionType();
-
- // When elements in a vector is not byte-addressable, we cannot directly
- // load each element by advancing pointer, which could only address bytes.
- // Instead, we load all significant words, mask bits off, and concatenate
- // them to form each element. Finally, they are extended to destination
- // scalar type to build the destination vector.
- EVT WideVT = TLI.getPointerTy(DAG.getDataLayout());
-
- assert(WideVT.isRound() &&
- "Could not handle the sophisticated case when the widest integer is"
- " not power of 2.");
- assert(WideVT.bitsGE(SrcEltVT) &&
- "Type is not legalized?");
-
- unsigned WideBytes = WideVT.getStoreSize();
- unsigned Offset = 0;
- unsigned RemainingBytes = SrcVT.getStoreSize();
- SmallVector<SDValue, 8> LoadVals;
- while (RemainingBytes > 0) {
- SDValue ScalarLoad;
- unsigned LoadBytes = WideBytes;
-
- if (RemainingBytes >= LoadBytes) {
- ScalarLoad =
- DAG.getLoad(WideVT, dl, Chain, BasePTR,
- LD->getPointerInfo().getWithOffset(Offset),
- MinAlign(LD->getAlignment(), Offset),
- LD->getMemOperand()->getFlags(), LD->getAAInfo());
- } else {
- EVT LoadVT = WideVT;
- while (RemainingBytes < LoadBytes) {
- LoadBytes >>= 1; // Reduce the load size by half.
- LoadVT = EVT::getIntegerVT(*DAG.getContext(), LoadBytes << 3);
- }
- ScalarLoad =
- DAG.getExtLoad(ISD::EXTLOAD, dl, WideVT, Chain, BasePTR,
- LD->getPointerInfo().getWithOffset(Offset), LoadVT,
- MinAlign(LD->getAlignment(), Offset),
- LD->getMemOperand()->getFlags(), LD->getAAInfo());
- }
-
- RemainingBytes -= LoadBytes;
- Offset += LoadBytes;
-
- BasePTR = DAG.getObjectPtrOffset(dl, BasePTR, LoadBytes);
-
- LoadVals.push_back(ScalarLoad.getValue(0));
- LoadChains.push_back(ScalarLoad.getValue(1));
- }
-
- unsigned BitOffset = 0;
- unsigned WideIdx = 0;
- unsigned WideBits = WideVT.getSizeInBits();
-
- // Extract bits, pack and extend/trunc them into destination type.
- unsigned SrcEltBits = SrcEltVT.getSizeInBits();
- SDValue SrcEltBitMask = DAG.getConstant(
- APInt::getLowBitsSet(WideBits, SrcEltBits), dl, WideVT);
-
- for (unsigned Idx = 0; Idx != NumElem; ++Idx) {
- assert(BitOffset < WideBits && "Unexpected offset!");
-
- SDValue ShAmt = DAG.getConstant(
- BitOffset, dl, TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
- SDValue Lo = DAG.getNode(ISD::SRL, dl, WideVT, LoadVals[WideIdx], ShAmt);
-
- BitOffset += SrcEltBits;
- if (BitOffset >= WideBits) {
- WideIdx++;
- BitOffset -= WideBits;
- if (BitOffset > 0) {
- ShAmt = DAG.getConstant(
- SrcEltBits - BitOffset, dl,
- TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
- SDValue Hi =
- DAG.getNode(ISD::SHL, dl, WideVT, LoadVals[WideIdx], ShAmt);
- Lo = DAG.getNode(ISD::OR, dl, WideVT, Lo, Hi);
- }
- }
-
- Lo = DAG.getNode(ISD::AND, dl, WideVT, Lo, SrcEltBitMask);
-
- switch (ExtType) {
- default: llvm_unreachable("Unknown extended-load op!");
- case ISD::EXTLOAD:
- Lo = DAG.getAnyExtOrTrunc(Lo, dl, DstEltVT);
- break;
- case ISD::ZEXTLOAD:
- Lo = DAG.getZExtOrTrunc(Lo, dl, DstEltVT);
- break;
- case ISD::SEXTLOAD:
- ShAmt =
- DAG.getConstant(WideBits - SrcEltBits, dl,
- TLI.getShiftAmountTy(WideVT, DAG.getDataLayout()));
- Lo = DAG.getNode(ISD::SHL, dl, WideVT, Lo, ShAmt);
- Lo = DAG.getNode(ISD::SRA, dl, WideVT, Lo, ShAmt);
- Lo = DAG.getSExtOrTrunc(Lo, dl, DstEltVT);
- break;
- }
- Vals.push_back(Lo);
- }
-
- NewChain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, LoadChains);
- Value = DAG.getBuildVector(N->getValueType(0), dl, Vals);
- } else {
- std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
- }
-
- return std::make_pair(Value, NewChain);
+ return TLI.scalarizeVectorLoad(LD, DAG);
}
SDValue VectorLegalizer::ExpandStore(SDNode *N) {
@@ -968,9 +843,12 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
break;
case ISD::SDIVFIX:
case ISD::UDIVFIX:
- Results.push_back(ExpandFixedPointDiv(Node));
+ ExpandFixedPointDiv(Node, Results);
return;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+ case ISD::SDIVFIXSAT:
+ case ISD::UDIVFIXSAT:
+ break;
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
ExpandStrictFPOp(Node, Results);
@@ -990,6 +868,10 @@ void VectorLegalizer::Expand(SDNode *Node, SmallVectorImpl<SDValue> &Results) {
case ISD::VECREDUCE_FMIN:
Results.push_back(TLI.expandVecReduce(Node, DAG));
return;
+ case ISD::SREM:
+ case ISD::UREM:
+ ExpandREM(Node, Results);
+ return;
}
Results.push_back(DAG.UnrollVectorOp(Node));
@@ -1087,9 +969,8 @@ SDValue VectorLegalizer::ExpandANY_EXTEND_VECTOR_INREG(SDNode *Node) {
NumSrcElements = VT.getSizeInBits() / SrcVT.getScalarSizeInBits();
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
NumSrcElements);
- Src = DAG.getNode(
- ISD::INSERT_SUBVECTOR, DL, SrcVT, DAG.getUNDEF(SrcVT), Src,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcVT, DAG.getUNDEF(SrcVT),
+ Src, DAG.getVectorIdxConstant(0, DL));
}
// Build a base mask of undef shuffles.
@@ -1147,9 +1028,8 @@ SDValue VectorLegalizer::ExpandZERO_EXTEND_VECTOR_INREG(SDNode *Node) {
NumSrcElements = VT.getSizeInBits() / SrcVT.getScalarSizeInBits();
SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getScalarType(),
NumSrcElements);
- Src = DAG.getNode(
- ISD::INSERT_SUBVECTOR, DL, SrcVT, DAG.getUNDEF(SrcVT), Src,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Src = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, SrcVT, DAG.getUNDEF(SrcVT),
+ Src, DAG.getVectorIdxConstant(0, DL));
}
// Build up a zero vector to blend into this one.
@@ -1456,12 +1336,12 @@ void VectorLegalizer::ExpandMULO(SDNode *Node,
Results.push_back(Overflow);
}
-SDValue VectorLegalizer::ExpandFixedPointDiv(SDNode *Node) {
+void VectorLegalizer::ExpandFixedPointDiv(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
SDNode *N = Node;
if (SDValue Expanded = TLI.expandFixedPointDiv(N->getOpcode(), SDLoc(N),
N->getOperand(0), N->getOperand(1), N->getConstantOperandVal(2), DAG))
- return Expanded;
- return DAG.UnrollVectorOp(N);
+ Results.push_back(Expanded);
}
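
Note: ExpandFixedPointDiv now only records a result when TLI.expandFixedPointDiv succeeds; the unroll fallback is handled by the caller. As a reminder of what these nodes compute, here is a standalone C++ sketch of scaled fixed-point division (my reading of the SDIVFIX semantics; saturation for the *FIXSAT forms is deliberately not modelled):

    // Standalone sketch: with N fractional bits, the quotient keeps the same
    // scale by widening the dividend and shifting it left by N before the
    // integer divide.
    #include <cstdint>
    #include <cstdio>

    static int32_t SDivFix(int32_t A, int32_t B, unsigned Scale) {
      return static_cast<int32_t>((static_cast<int64_t>(A) << Scale) / B);
    }

    int main() {
      const unsigned Scale = 16;                  // Q16.16 operands
      int32_t A = 3 << Scale, B = 2 << Scale;     // 3.0 and 2.0
      std::printf("%.4f\n", SDivFix(A, B, Scale) / 65536.0);  // prints 1.5000
    }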
void VectorLegalizer::ExpandStrictFPOp(SDNode *Node,
@@ -1478,6 +1358,17 @@ void VectorLegalizer::ExpandStrictFPOp(SDNode *Node,
UnrollStrictFPOp(Node, Results);
}
+void VectorLegalizer::ExpandREM(SDNode *Node,
+ SmallVectorImpl<SDValue> &Results) {
+ assert((Node->getOpcode() == ISD::SREM || Node->getOpcode() == ISD::UREM) &&
+ "Expected REM node");
+
+ SDValue Result;
+ if (!TLI.expandREM(Node, Result, DAG))
+ Result = DAG.UnrollVectorOp(Node);
+ Results.push_back(Result);
+}
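
Note: the new ExpandREM first asks the target lowering to expand the vector remainder and only unrolls when that fails. A common scalar rewrite for a remainder is shown below as a standalone sketch; whether TLI.expandREM uses exactly this strategy is an assumption, the patch itself only guarantees "expand if possible, otherwise unroll":

    // Standalone sketch of the classic remainder-from-division rewrite that a
    // SREM/UREM can fall back on when no native remainder exists.
    #include <cstdio>

    int main() {
      int X = 23, Y = 7;
      int Rem = X - (X / Y) * Y;                 // rem rebuilt from div, mul and sub
      std::printf("%d %% %d = %d\n", X, Y, Rem); // prints 23 % 7 = 2
    }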
+
void VectorLegalizer::UnrollStrictFPOp(SDNode *Node,
SmallVectorImpl<SDValue> &Results) {
EVT VT = Node->getValueType(0);
@@ -1500,8 +1391,7 @@ void VectorLegalizer::UnrollStrictFPOp(SDNode *Node,
SmallVector<SDValue, 32> OpChains;
for (unsigned i = 0; i < NumElems; ++i) {
SmallVector<SDValue, 4> Opers;
- SDValue Idx = DAG.getConstant(i, dl,
- TLI.getVectorIdxTy(DAG.getDataLayout()));
+ SDValue Idx = DAG.getVectorIdxConstant(i, dl);
// The Chain is the first operand.
Opers.push_back(Chain);
@@ -1551,12 +1441,10 @@ SDValue VectorLegalizer::UnrollVSETCC(SDNode *Node) {
SDLoc dl(Node);
SmallVector<SDValue, 8> Ops(NumElems);
for (unsigned i = 0; i < NumElems; ++i) {
- SDValue LHSElem = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue RHSElem = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getVectorIdxConstant(i, dl));
Ops[i] = DAG.getNode(ISD::SETCC, dl,
TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(), TmpEltVT),
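
Note: UnrollVSETCC (the hunk above) extracts each lane with the new getVectorIdxConstant helper and compares the scalars one by one. The standalone model below shows the usual shape of the result, with each lane materialized as all-ones or all-zeros; it is an illustration only, not part of the patch:

    // Standalone C++ model of a per-lane vector compare.
    #include <array>
    #include <cstddef>
    #include <cstdint>
    #include <cstdio>

    int main() {
      std::array<int32_t, 4> LHS = {1, 5, -2, 7};
      std::array<int32_t, 4> RHS = {3, 5, -4, 2};
      std::array<int32_t, 4> Res;
      for (std::size_t i = 0; i != Res.size(); ++i)
        Res[i] = (LHS[i] > RHS[i]) ? -1 : 0;     // a SETGT lane, widened to the element type
      for (int32_t V : Res)
        std::printf("%d ", V);                   // prints: 0 0 -1 -1
      std::printf("\n");
    }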
diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
index 974914d00d05..414ba25ffd5f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp
@@ -20,10 +20,11 @@
//===----------------------------------------------------------------------===//
#include "LegalizeTypes.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Support/TypeSize.h"
+#include "llvm/Support/raw_ostream.h"
using namespace llvm;
#define DEBUG_TYPE "legalize-types"
@@ -88,11 +89,13 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
+ case ISD::FREEZE:
case ISD::FP_EXTEND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
@@ -147,7 +150,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
R = ScalarizeVecRes_TernaryOp(N);
break;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
R = ScalarizeVecRes_StrictFPOp(N);
@@ -166,7 +169,9 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
R = ScalarizeVecRes_FIX(N);
break;
}
@@ -187,8 +192,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_TernaryOp(SDNode *N) {
SDValue Op0 = GetScalarizedVector(N->getOperand(0));
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = GetScalarizedVector(N->getOperand(2));
- return DAG.getNode(N->getOpcode(), SDLoc(N),
- Op0.getValueType(), Op0, Op1, Op2);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
+ Op2, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_FIX(SDNode *N) {
@@ -196,7 +201,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_FIX(SDNode *N) {
SDValue Op1 = GetScalarizedVector(N->getOperand(1));
SDValue Op2 = N->getOperand(2);
return DAG.getNode(N->getOpcode(), SDLoc(N), Op0.getValueType(), Op0, Op1,
- Op2);
+ Op2, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
@@ -221,7 +226,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_StrictFPOp(SDNode *N) {
Opers[i] = Oper;
}
- SDValue Result = DAG.getNode(N->getOpcode(), dl, ValueVTs, Opers);
+ SDValue Result = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(ValueVTs),
+ Opers, N->getFlags());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
@@ -251,6 +257,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_OverflowOp(SDNode *N,
ResVT.getVectorElementType(), OvVT.getVectorElementType());
SDNode *ScalarNode = DAG.getNode(
N->getOpcode(), DL, ScalarVTs, ScalarLHS, ScalarRHS).getNode();
+ ScalarNode->setFlags(N->getFlags());
// Replace the other vector result not being explicitly scalarized here.
unsigned OtherNo = 1 - ResNo;
@@ -331,8 +338,7 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_LOAD(LoadSDNode *N) {
N->getValueType(0).getVectorElementType(), SDLoc(N), N->getChain(),
N->getBasePtr(), DAG.getUNDEF(N->getBasePtr().getValueType()),
N->getPointerInfo(), N->getMemoryVT().getVectorElementType(),
- N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
- N->getAAInfo());
+ N->getOriginalAlign(), N->getMemOperand()->getFlags(), N->getAAInfo());
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
@@ -357,11 +363,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_UnaryOp(SDNode *N) {
Op = GetScalarizedVector(Op);
} else {
EVT VT = OpVT.getVectorElementType();
- Op = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Op,
+ DAG.getVectorIdxConstant(0, DL));
}
- return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op);
+ return DAG.getNode(N->getOpcode(), SDLoc(N), DestVT, Op, N->getFlags());
}
SDValue DAGTypeLegalizer::ScalarizeVecRes_InregOp(SDNode *N) {
@@ -383,9 +388,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VecInregOp(SDNode *N) {
if (getTypeAction(OpVT) == TargetLowering::TypeScalarizeVector) {
Op = GetScalarizedVector(Op);
} else {
- Op = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, OpEltVT, Op,
+ DAG.getVectorIdxConstant(0, DL));
}
switch (N->getOpcode()) {
@@ -421,9 +425,8 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_VSELECT(SDNode *N) {
Cond = GetScalarizedVector(Cond);
} else {
EVT VT = OpVT.getVectorElementType();
- Cond = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Cond = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Cond,
+ DAG.getVectorIdxConstant(0, DL));
}
SDValue LHS = GetScalarizedVector(N->getOperand(1));
@@ -523,12 +526,10 @@ SDValue DAGTypeLegalizer::ScalarizeVecRes_SETCC(SDNode *N) {
RHS = GetScalarizedVector(RHS);
} else {
EVT VT = OpVT.getVectorElementType();
- LHS = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
- RHS = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ LHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, LHS,
+ DAG.getVectorIdxConstant(0, DL));
+ RHS = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, RHS,
+ DAG.getVectorIdxConstant(0, DL));
}
// Turn it into a scalar SETCC.
@@ -749,12 +750,12 @@ SDValue DAGTypeLegalizer::ScalarizeVecOp_STORE(StoreSDNode *N, unsigned OpNo){
return DAG.getTruncStore(
N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
- N->getMemoryVT().getVectorElementType(), N->getAlignment(),
+ N->getMemoryVT().getVectorElementType(), N->getOriginalAlign(),
N->getMemOperand()->getFlags(), N->getAAInfo());
return DAG.getStore(N->getChain(), dl, GetScalarizedVector(N->getOperand(1)),
N->getBasePtr(), N->getPointerInfo(),
- N->getOriginalAlignment(), N->getMemOperand()->getFlags(),
+ N->getOriginalAlign(), N->getMemOperand()->getFlags(),
N->getAAInfo());
}
@@ -881,12 +882,14 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FLOG2:
case ISD::FNEARBYINT:
case ISD::FNEG:
+ case ISD::FREEZE:
case ISD::FP_EXTEND:
case ISD::FP_ROUND:
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
case ISD::FRINT:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC:
@@ -942,7 +945,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SplitVecRes_TernaryOp(N, Lo, Hi);
break;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
SplitVecRes_StrictFPOp(N, Lo, Hi);
@@ -961,7 +964,9 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
case ISD::UMULFIX:
case ISD::UMULFIXSAT:
case ISD::SDIVFIX:
+ case ISD::SDIVFIXSAT:
case ISD::UDIVFIX:
+ case ISD::UDIVFIXSAT:
SplitVecRes_FIX(N, Lo, Hi);
break;
}
@@ -971,6 +976,25 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) {
SetSplitVector(SDValue(N, ResNo), Lo, Hi);
}
+void DAGTypeLegalizer::IncrementPointer(MemSDNode *N, EVT MemVT,
+ MachinePointerInfo &MPI,
+ SDValue &Ptr) {
+ SDLoc DL(N);
+ unsigned IncrementSize = MemVT.getSizeInBits().getKnownMinSize() / 8;
+
+ if (MemVT.isScalableVector()) {
+ SDValue BytesIncrement = DAG.getVScale(
+ DL, Ptr.getValueType(),
+ APInt(Ptr.getValueSizeInBits().getFixedSize(), IncrementSize));
+ MPI = MachinePointerInfo(N->getPointerInfo().getAddrSpace());
+ Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr, BytesIncrement);
+ } else {
+ MPI = N->getPointerInfo().getWithOffset(IncrementSize);
+ // Increment the pointer to the other half.
+ Ptr = DAG.getObjectPtrOffset(DL, Ptr, IncrementSize);
+ }
+}
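
Note: the new IncrementPointer helper advances the pointer past the low half. For fixed-width vectors the increment is the low memory type's size in bytes; for scalable vectors the same known-minimum size is multiplied by vscale at run time. A standalone model of the fixed-width branch:

    // Standalone model of the fixed-width branch: after the low half is
    // accessed, the pointer for the high half is advanced by the low half's
    // store size in bytes.
    #include <cstdint>
    #include <cstdio>

    int main() {
      unsigned LoMemBits = 4 * 32;                 // e.g. the low half of a v8i32 split
      unsigned IncrementSize = LoMemBits / 8;      // getSizeInBits() / 8, as in the patch
      std::uintptr_t Base = 0x1000;
      std::uintptr_t HiPtr = Base + IncrementSize; // the getObjectPtrOffset step
      std::printf("lo @ %#lx, hi @ %#lx (+%u bytes)\n",
                  (unsigned long)Base, (unsigned long)HiPtr, IncrementSize);
    }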
+
void DAGTypeLegalizer::SplitVecRes_BinOp(SDNode *N, SDValue &Lo,
SDValue &Hi) {
SDValue LHSLo, LHSHi;
@@ -995,10 +1019,10 @@ void DAGTypeLegalizer::SplitVecRes_TernaryOp(SDNode *N, SDValue &Lo,
GetSplitVector(N->getOperand(2), Op2Lo, Op2Hi);
SDLoc dl(N);
- Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(),
- Op0Lo, Op1Lo, Op2Lo);
- Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(),
- Op0Hi, Op1Hi, Op2Hi);
+ Lo = DAG.getNode(N->getOpcode(), dl, Op0Lo.getValueType(), Op0Lo, Op1Lo,
+ Op2Lo, N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, Op0Hi.getValueType(), Op0Hi, Op1Hi,
+ Op2Hi, N->getFlags());
}
void DAGTypeLegalizer::SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi) {
@@ -1010,8 +1034,10 @@ void DAGTypeLegalizer::SplitVecRes_FIX(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue Op2 = N->getOperand(2);
unsigned Opcode = N->getOpcode();
- Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2);
- Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2);
+ Lo = DAG.getNode(Opcode, dl, LHSLo.getValueType(), LHSLo, RHSLo, Op2,
+ N->getFlags());
+ Hi = DAG.getNode(Opcode, dl, LHSHi.getValueType(), LHSHi, RHSHi, Op2,
+ N->getFlags());
}
void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
@@ -1030,6 +1056,7 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
case TargetLowering::TypeLegal:
case TargetLowering::TypePromoteInteger:
case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypeScalarizeVector:
case TargetLowering::TypeWidenVector:
@@ -1055,6 +1082,8 @@ void DAGTypeLegalizer::SplitVecRes_BITCAST(SDNode *N, SDValue &Lo,
Lo = DAG.getNode(ISD::BITCAST, dl, LoVT, Lo);
Hi = DAG.getNode(ISD::BITCAST, dl, HiVT, Hi);
return;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
}
// In the general case, convert the input to an integer and split it by hand.
@@ -1116,9 +1145,9 @@ void DAGTypeLegalizer::SplitVecRes_EXTRACT_SUBVECTOR(SDNode *N, SDValue &Lo,
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, LoVT, Vec, Idx);
uint64_t IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
- Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
- DAG.getConstant(IdxVal + LoVT.getVectorNumElements(), dl,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, dl, HiVT, Vec,
+ DAG.getVectorIdxConstant(IdxVal + LoVT.getVectorNumElements(), dl));
}
void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
@@ -1137,40 +1166,45 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_SUBVECTOR(SDNode *N, SDValue &Lo,
// boundary between the halves, we can avoid spilling the vector, and insert
// into the lower half of the split vector directly.
// TODO: The IdxVal == 0 constraint is artificial, we could do this whenever
- // the index is constant and there is no boundary crossing. But those cases
- // don't seem to get hit in practice.
- if (ConstantSDNode *ConstIdx = dyn_cast<ConstantSDNode>(Idx)) {
- unsigned IdxVal = ConstIdx->getZExtValue();
- if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
- EVT LoVT, HiVT;
- std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
- Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
- return;
- }
+ // there is no boundary crossing. But those cases don't seem to get hit in
+ // practice.
+ unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ if ((IdxVal == 0) && (IdxVal + SubElems <= VecElems / 2)) {
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
+ Lo = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, LoVT, Lo, SubVec, Idx);
+ return;
}
// Spill the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
- SDValue Store =
- DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, MachinePointerInfo());
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
+ auto &MF = DAG.getMachineFunction();
+ auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
+
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Store the new subvector into the specified index.
SDValue SubVecPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
- Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
- Store = DAG.getStore(Store, dl, SubVec, SubVecPtr, MachinePointerInfo());
+ Store = DAG.getStore(Store, dl, SubVec, SubVecPtr,
+ MachinePointerInfo::getUnknownStack(MF));
// Load the Lo part from the stack slot.
- Lo =
- DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, MachinePointerInfo());
+ Lo = DAG.getLoad(Lo.getValueType(), dl, Store, StackPtr, PtrInfo,
+ SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = Lo.getValueSizeInBits() / 8;
StackPtr = DAG.getMemBasePlusOffset(StackPtr, IncrementSize, dl);
// Load the Hi part from the stack slot.
- Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr, MachinePointerInfo(),
- MinAlign(Alignment, IncrementSize));
+ Hi = DAG.getLoad(Hi.getValueType(), dl, Store, StackPtr,
+ PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
}
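
Note: the spill paths above now create the stack temporary with DAG.getReducedAlign(VecVT, /*UseABI=*/false), because an illegal vector may later be stored back in pieces and the slot can only rely on the smallest piece's alignment. Any access at a byte offset then claims only the alignment common to the slot and the offset. The sketch below is a from-scratch stand-in for that computation, not LLVM's commonAlignment itself:

    // Standalone model: the largest power of two dividing both the slot
    // alignment and the byte offset is the alignment the access may claim.
    #include <cstdint>
    #include <cstdio>

    static uint64_t CommonAlign(uint64_t SlotAlign, uint64_t Off) {
      uint64_t V = SlotAlign | Off;
      return V & (~V + 1);        // lowest set bit = greatest common power of two
    }

    int main() {
      uint64_t SmallestAlign = 8; // alignment of the smallest stored part
      std::printf("at +0:  %llu\n", (unsigned long long)CommonAlign(SmallestAlign, 0));   // 8
      std::printf("at +12: %llu\n", (unsigned long long)CommonAlign(SmallestAlign, 12));  // 4
    }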
void DAGTypeLegalizer::SplitVecRes_FPOWI(SDNode *N, SDValue &Lo,
@@ -1291,8 +1325,10 @@ void DAGTypeLegalizer::SplitVecRes_StrictFPOp(SDNode *N, SDValue &Lo,
EVT LoValueVTs[] = {LoVT, MVT::Other};
EVT HiValueVTs[] = {HiVT, MVT::Other};
- Lo = DAG.getNode(N->getOpcode(), dl, LoValueVTs, OpsLo);
- Hi = DAG.getNode(N->getOpcode(), dl, HiValueVTs, OpsHi);
+ Lo = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(LoValueVTs), OpsLo,
+ N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, DAG.getVTList(HiValueVTs), OpsHi,
+ N->getFlags());
// Build a factor node to remember that this Op is independent of the
// other one.
@@ -1332,10 +1368,8 @@ SDValue DAGTypeLegalizer::UnrollVectorOp_StrictFP(SDNode *N, unsigned ResNE) {
EVT OperandVT = Operand.getValueType();
if (OperandVT.isVector()) {
EVT OperandEltVT = OperandVT.getVectorElementType();
- Operands[j] =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(
- DAG.getDataLayout())));
+ Operands[j] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
+ Operand, DAG.getVectorIdxConstant(i, dl));
} else {
Operands[j] = Operand;
}
@@ -1384,6 +1418,8 @@ void DAGTypeLegalizer::SplitVecRes_OverflowOp(SDNode *N, unsigned ResNo,
SDVTList HiVTs = DAG.getVTList(HiResVT, HiOvVT);
SDNode *LoNode = DAG.getNode(Opcode, dl, LoVTs, LoLHS, LoRHS).getNode();
SDNode *HiNode = DAG.getNode(Opcode, dl, HiVTs, HiLHS, HiRHS).getNode();
+ LoNode->setFlags(N->getFlags());
+ HiNode->setFlags(N->getFlags());
Lo = SDValue(LoNode, ResNo);
Hi = SDValue(HiNode, ResNo);
@@ -1417,10 +1453,8 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
Lo = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl,
Lo.getValueType(), Lo, Elt, Idx);
else
- Hi =
- DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
- DAG.getConstant(IdxVal - LoNumElts, dl,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Hi = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, Hi.getValueType(), Hi, Elt,
+ DAG.getVectorIdxConstant(IdxVal - LoNumElts, dl));
return;
}
@@ -1442,36 +1476,38 @@ void DAGTypeLegalizer::SplitVecRes_INSERT_VECTOR_ELT(SDNode *N, SDValue &Lo,
}
// Spill the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Store the new element. This may be larger than the vector element type,
// so use a truncating store.
SDValue EltPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
- Type *VecType = VecVT.getTypeForEVT(*DAG.getContext());
- unsigned Alignment = DAG.getDataLayout().getPrefTypeAlignment(VecType);
- Store = DAG.getTruncStore(Store, dl, Elt, EltPtr,
- MachinePointerInfo::getUnknownStack(MF), EltVT);
+ Store = DAG.getTruncStore(
+ Store, dl, Elt, EltPtr, MachinePointerInfo::getUnknownStack(MF), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
EVT LoVT, HiVT;
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VecVT);
// Load the Lo part from the stack slot.
- Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo);
+ Lo = DAG.getLoad(LoVT, dl, Store, StackPtr, PtrInfo, SmallestAlign);
// Increment the pointer to the other part.
unsigned IncrementSize = LoVT.getSizeInBits() / 8;
- StackPtr = DAG.getNode(ISD::ADD, dl, StackPtr.getValueType(), StackPtr,
- DAG.getConstant(IncrementSize, dl,
- StackPtr.getValueType()));
+ StackPtr = DAG.getMemBasePlusOffset(StackPtr, IncrementSize, dl);
// Load the Hi part from the stack slot.
Hi = DAG.getLoad(HiVT, dl, Store, StackPtr,
- PtrInfo.getWithOffset(IncrementSize),
- MinAlign(Alignment, IncrementSize));
+ PtrInfo.getWithOffset(IncrementSize), SmallestAlign);
// If we adjusted the original type, we need to truncate the results.
std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(N->getValueType(0));
@@ -1502,21 +1538,29 @@ void DAGTypeLegalizer::SplitVecRes_LOAD(LoadSDNode *LD, SDValue &Lo,
SDValue Ptr = LD->getBasePtr();
SDValue Offset = DAG.getUNDEF(Ptr.getValueType());
EVT MemoryVT = LD->getMemoryVT();
- unsigned Alignment = LD->getOriginalAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
EVT LoMemVT, HiMemVT;
std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+ if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized()) {
+ SDValue Value, NewChain;
+ std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
+ std::tie(Lo, Hi) = DAG.SplitVector(Value, dl);
+ ReplaceValueWith(SDValue(LD, 1), NewChain);
+ return;
+ }
+
Lo = DAG.getLoad(ISD::UNINDEXED, ExtType, LoVT, dl, Ch, Ptr, Offset,
- LD->getPointerInfo(), LoMemVT, Alignment, MMOFlags, AAInfo);
+ LD->getPointerInfo(), LoMemVT, LD->getOriginalAlign(),
+ MMOFlags, AAInfo);
+
+ MachinePointerInfo MPI;
+ IncrementPointer(LD, LoMemVT, MPI, Ptr);
- unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
- Ptr = DAG.getObjectPtrOffset(dl, Ptr, IncrementSize);
- Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset,
- LD->getPointerInfo().getWithOffset(IncrementSize), HiMemVT,
- Alignment, MMOFlags, AAInfo);
+ Hi = DAG.getLoad(ISD::UNINDEXED, ExtType, HiVT, dl, Ch, Ptr, Offset, MPI,
+ HiMemVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1541,7 +1585,7 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
assert(Offset.isUndef() && "Unexpected indexed masked load offset");
SDValue Mask = MLD->getMask();
SDValue PassThru = MLD->getPassThru();
- unsigned Alignment = MLD->getOriginalAlignment();
+ Align Alignment = MLD->getOriginalAlign();
ISD::LoadExtType ExtType = MLD->getExtensionType();
// Split Mask operand
@@ -1557,7 +1601,9 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
EVT MemoryVT = MLD->getMemoryVT();
EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, LoVT, &HiIsEmpty);
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
@@ -1565,27 +1611,33 @@ void DAGTypeLegalizer::SplitVecRes_MLOAD(MaskedLoadSDNode *MLD,
else
std::tie(PassThruLo, PassThruHi) = DAG.SplitVector(PassThru, dl);
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MLD->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MLD->getAAInfo(), MLD->getRanges());
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MLD->getPointerInfo(), MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
+ Alignment, MLD->getAAInfo(), MLD->getRanges());
Lo = DAG.getMaskedLoad(LoVT, dl, Ch, Ptr, Offset, MaskLo, PassThruLo, LoMemVT,
MMO, MLD->getAddressingMode(), ExtType,
MLD->isExpandingLoad());
- Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
- MLD->isExpandingLoad());
- unsigned HiOffset = LoMemVT.getStoreSize();
-
- MMO = DAG.getMachineFunction().getMachineMemOperand(
- MLD->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOLoad,
- HiMemVT.getStoreSize(), Alignment, MLD->getAAInfo(),
- MLD->getRanges());
-
- Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi, HiMemVT,
- MMO, MLD->getAddressingMode(), ExtType,
- MLD->isExpandingLoad());
+ if (HiIsEmpty) {
+ // The hi masked load has zero storage size. We therefore simply set it to
+ // the low masked load and rely on subsequent removal from the chain.
+ Hi = Lo;
+ } else {
+ // Generate hi masked load.
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, dl, LoMemVT, DAG,
+ MLD->isExpandingLoad());
+ unsigned HiOffset = LoMemVT.getStoreSize();
+
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MLD->getPointerInfo().getWithOffset(HiOffset),
+ MachineMemOperand::MOLoad, HiMemVT.getStoreSize(), Alignment,
+ MLD->getAAInfo(), MLD->getRanges());
+
+ Hi = DAG.getMaskedLoad(HiVT, dl, Ch, Ptr, Offset, MaskHi, PassThruHi,
+ HiMemVT, MMO, MLD->getAddressingMode(), ExtType,
+ MLD->isExpandingLoad());
+ }
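
Note: GetDependentSplitDestVTs can report that the high half has no storage at all (HiIsEmpty), in which case the patch reuses Lo instead of emitting a zero-sized high masked load. A standalone sketch of just that decision; the real type splitting lives in GetDependentSplitDestVTs:

    // Standalone sketch: if the low half already covers everything that is
    // actually in memory, nothing is left for a high masked load.
    #include <cstdio>

    int main() {
      unsigned MemElts = 4;   // elements the memory type really holds
      unsigned LoElts  = 4;   // elements assigned to the low split
      unsigned HiElts  = MemElts > LoElts ? MemElts - LoElts : 0;
      std::printf(HiElts == 0 ? "HiIsEmpty: reuse Lo\n"
                              : "emit a second masked load for Hi\n");
    }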
// Build a factor node to remember that this load is independent of the
// other one.
@@ -1610,7 +1662,7 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
SDValue PassThru = MGT->getPassThru();
SDValue Index = MGT->getIndex();
SDValue Scale = MGT->getScale();
- unsigned Alignment = MGT->getOriginalAlignment();
+ Align Alignment = MGT->getOriginalAlign();
// Split Mask operand
SDValue MaskLo, MaskHi;
@@ -1623,11 +1675,6 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, dl);
}
- EVT MemoryVT = MGT->getMemoryVT();
- EVT LoMemVT, HiMemVT;
- // Split MemoryVT
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
SDValue PassThruLo, PassThruHi;
if (getTypeAction(PassThru.getValueType()) == TargetLowering::TypeSplitVector)
GetSplitVector(PassThru, PassThruLo, PassThruHi);
@@ -1640,10 +1687,10 @@ void DAGTypeLegalizer::SplitVecRes_MGATHER(MaskedGatherSDNode *MGT,
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MGT->getAAInfo(), MGT->getRanges());
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MGT->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
+ MGT->getRanges());
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl, OpsLo,
@@ -1708,11 +1755,13 @@ void DAGTypeLegalizer::SplitVecRes_UnaryOp(SDNode *N, SDValue &Lo,
std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, OpNo);
if (N->getOpcode() == ISD::FP_ROUND) {
- Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1));
- Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1));
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getOperand(1),
+ N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getOperand(1),
+ N->getFlags());
} else {
- Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo);
- Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi);
+ Lo = DAG.getNode(N->getOpcode(), dl, LoVT, Lo, N->getFlags());
+ Hi = DAG.getNode(N->getOpcode(), dl, HiVT, Hi, N->getFlags());
}
}
@@ -1737,8 +1786,7 @@ void DAGTypeLegalizer::SplitVecRes_ExtendOp(SDNode *N, SDValue &Lo,
// more effectively move in the right direction and prevent falling down
// to scalarization in many cases due to the input vector being split too
// far.
- unsigned NumElements = SrcVT.getVectorNumElements();
- if ((NumElements & 1) == 0 &&
+ if ((SrcVT.getVectorMinNumElements() & 1) == 0 &&
SrcVT.getSizeInBits() * 2 < DestVT.getSizeInBits()) {
LLVMContext &Ctx = *DAG.getContext();
EVT NewSrcVT = SrcVT.widenIntegerVectorElementType(Ctx);
@@ -1851,9 +1899,9 @@ void DAGTypeLegalizer::SplitVecRes_VECTOR_SHUFFLE(ShuffleVectorSDNode *N,
Idx -= Input * NewElts;
// Extract the vector element by hand.
- SVOps.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Inputs[Input],
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ SVOps.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT,
+ Inputs[Input],
+ DAG.getVectorIdxConstant(Idx, dl)));
}
// Construct the Lo/Hi output using a BUILD_VECTOR.
@@ -1882,11 +1930,11 @@ void DAGTypeLegalizer::SplitVecRes_VAARG(SDNode *N, SDValue &Lo, SDValue &Hi) {
SDValue SV = N->getOperand(2);
SDLoc dl(N);
- const unsigned Alignment = DAG.getDataLayout().getABITypeAlignment(
- NVT.getTypeForEVT(*DAG.getContext()));
+ const Align Alignment =
+ DAG.getDataLayout().getABITypeAlign(NVT.getTypeForEVT(*DAG.getContext()));
- Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, SV, Alignment);
- Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, SV, Alignment);
+ Lo = DAG.getVAArg(NVT, dl, Chain, Ptr, SV, Alignment.value());
+ Hi = DAG.getVAArg(NVT, dl, Lo.getValue(1), Ptr, SV, Alignment.value());
Chain = Hi.getValue(1);
// Modified the chain - switch anything that used the old chain to use
@@ -2160,8 +2208,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_SUBVECTOR(SDNode *N) {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Lo, Idx);
} else {
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVT, Hi,
- DAG.getConstant(IdxVal - LoElts, dl,
- Idx.getValueType()));
+ DAG.getVectorIdxConstant(IdxVal - LoElts, dl));
}
}
@@ -2200,11 +2247,16 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
}
// Store the vector to the stack.
- SDValue StackPtr = DAG.CreateStackTemporary(VecVT);
+ // In cases where the vector is illegal it will be broken down into parts
+ // and stored in parts - we should use the alignment for the smallest part.
+ Align SmallestAlign = DAG.getReducedAlign(VecVT, /*UseABI=*/false);
+ SDValue StackPtr =
+ DAG.CreateStackTemporary(VecVT.getStoreSize(), SmallestAlign);
auto &MF = DAG.getMachineFunction();
auto FrameIndex = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, FrameIndex);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo);
+ SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Vec, StackPtr, PtrInfo,
+ SmallestAlign);
// Load back the required element.
StackPtr = TLI.getVectorElementPointer(DAG, StackPtr, VecVT, Idx);
@@ -2219,7 +2271,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_EXTRACT_VECTOR_ELT(SDNode *N) {
return DAG.getExtLoad(
ISD::EXTLOAD, dl, N->getValueType(0), Store, StackPtr,
- MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT);
+ MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()), EltVT,
+ commonAlignment(SmallestAlign, EltVT.getSizeInBits() / 8));
}
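
Note: the hunk above keeps the spill-and-reload strategy for a variable-index extract but gives the slot the reduced alignment discussed earlier. A standalone model of the underlying address arithmetic, illustration only:

    // Standalone model of extracting a variable-index element via a stack
    // slot: spill the vector, compute base + Idx * sizeof(element), load.
    #include <cstdint>
    #include <cstdio>
    #include <cstring>

    int main() {
      int32_t Vec[8] = {10, 11, 12, 13, 14, 15, 16, 17};
      alignas(16) unsigned char Slot[sizeof(Vec)];
      std::memcpy(Slot, Vec, sizeof(Vec));           // "store the vector to the stack"
      unsigned Idx = 5;                              // not a compile-time constant in general
      int32_t Elt;
      std::memcpy(&Elt, Slot + Idx * sizeof(int32_t), sizeof(Elt));  // element load
      std::printf("elt[%u] = %d\n", Idx, Elt);       // prints elt[5] = 15
    }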
SDValue DAGTypeLegalizer::SplitVecOp_ExtVecInRegOp(SDNode *N) {
@@ -2244,7 +2297,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
SDValue Scale = MGT->getScale();
SDValue Mask = MGT->getMask();
SDValue PassThru = MGT->getPassThru();
- unsigned Alignment = MGT->getOriginalAlignment();
+ Align Alignment = MGT->getOriginalAlign();
SDValue MaskLo, MaskHi;
if (getTypeAction(Mask.getValueType()) == TargetLowering::TypeSplitVector)
@@ -2269,21 +2322,15 @@ SDValue DAGTypeLegalizer::SplitVecOp_MGATHER(MaskedGatherSDNode *MGT,
else
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, dl);
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
- MachineMemOperand::MOLoad, LoMemVT.getStoreSize(),
- Alignment, MGT->getAAInfo(), MGT->getRanges());
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MGT->getPointerInfo(), MachineMemOperand::MOLoad,
+ MemoryLocation::UnknownSize, Alignment, MGT->getAAInfo(),
+ MGT->getRanges());
SDValue OpsLo[] = {Ch, PassThruLo, MaskLo, Ptr, IndexLo, Scale};
SDValue Lo = DAG.getMaskedGather(DAG.getVTList(LoVT, MVT::Other), LoVT, dl,
OpsLo, MMO, MGT->getIndexType());
- MMO = DAG.getMachineFunction().
- getMachineMemOperand(MGT->getPointerInfo(),
- MachineMemOperand::MOLoad, HiMemVT.getStoreSize(),
- Alignment, MGT->getAAInfo(),
- MGT->getRanges());
-
SDValue OpsHi[] = {Ch, PassThruHi, MaskHi, Ptr, IndexHi, Scale};
SDValue Hi = DAG.getMaskedGather(DAG.getVTList(HiVT, MVT::Other), HiVT, dl,
OpsHi, MMO, MGT->getIndexType());
@@ -2312,13 +2359,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
assert(Offset.isUndef() && "Unexpected indexed masked store offset");
SDValue Mask = N->getMask();
SDValue Data = N->getValue();
- EVT MemoryVT = N->getMemoryVT();
- unsigned Alignment = N->getOriginalAlignment();
+ Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
-
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
// Split Data operand
@@ -2337,32 +2380,45 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSTORE(MaskedStoreSDNode *N,
std::tie(MaskLo, MaskHi) = DAG.SplitVector(Mask, DL);
}
- SDValue Lo, Hi;
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
- MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
- Alignment, N->getAAInfo(), N->getRanges());
+ EVT MemoryVT = N->getMemoryVT();
+ EVT LoMemVT, HiMemVT;
+ bool HiIsEmpty = false;
+ std::tie(LoMemVT, HiMemVT) =
+ DAG.GetDependentSplitDestVTs(MemoryVT, DataLo.getValueType(), &HiIsEmpty);
+
+ SDValue Lo, Hi, Res;
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo(), MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
+ Alignment, N->getAAInfo(), N->getRanges());
Lo = DAG.getMaskedStore(Ch, DL, DataLo, Ptr, Offset, MaskLo, LoMemVT, MMO,
N->getAddressingMode(), N->isTruncatingStore(),
N->isCompressingStore());
- Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
- N->isCompressingStore());
- unsigned HiOffset = LoMemVT.getStoreSize();
+ if (HiIsEmpty) {
+ // The hi masked store has zero storage size.
+ // Only the lo masked store is needed.
+ Res = Lo;
+ } else {
- MMO = DAG.getMachineFunction().getMachineMemOperand(
- N->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore,
- HiMemVT.getStoreSize(), Alignment, N->getAAInfo(),
- N->getRanges());
+ Ptr = TLI.IncrementMemoryAddress(Ptr, MaskLo, DL, LoMemVT, DAG,
+ N->isCompressingStore());
+ unsigned HiOffset = LoMemVT.getStoreSize();
- Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
- N->getAddressingMode(), N->isTruncatingStore(),
- N->isCompressingStore());
+ MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo().getWithOffset(HiOffset), MachineMemOperand::MOStore,
+ HiMemVT.getStoreSize(), Alignment, N->getAAInfo(), N->getRanges());
- // Build a factor node to remember that this store is independent of the
- // other one.
- return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+ Hi = DAG.getMaskedStore(Ch, DL, DataHi, Ptr, Offset, MaskHi, HiMemVT, MMO,
+ N->getAddressingMode(), N->isTruncatingStore(),
+ N->isCompressingStore());
+
+ // Build a factor node to remember that this store is independent of the
+ // other one.
+ Res = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
+ }
+
+ return Res;
}
SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
@@ -2373,13 +2429,10 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
SDValue Index = N->getIndex();
SDValue Scale = N->getScale();
SDValue Data = N->getValue();
- EVT MemoryVT = N->getMemoryVT();
- unsigned Alignment = N->getOriginalAlignment();
+ Align Alignment = N->getOriginalAlign();
SDLoc DL(N);
// Split all operands
- EVT LoMemVT, HiMemVT;
- std::tie(LoMemVT, HiMemVT) = DAG.GetSplitDestVTs(MemoryVT);
SDValue DataLo, DataHi;
if (getTypeAction(Data.getValueType()) == TargetLowering::TypeSplitVector)
@@ -2406,20 +2459,14 @@ SDValue DAGTypeLegalizer::SplitVecOp_MSCATTER(MaskedScatterSDNode *N,
std::tie(IndexLo, IndexHi) = DAG.SplitVector(Index, DL);
SDValue Lo;
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
- MachineMemOperand::MOStore, LoMemVT.getStoreSize(),
- Alignment, N->getAAInfo(), N->getRanges());
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ N->getPointerInfo(), MachineMemOperand::MOStore,
+ MemoryLocation::UnknownSize, Alignment, N->getAAInfo(), N->getRanges());
SDValue OpsLo[] = {Ch, DataLo, MaskLo, Ptr, IndexLo, Scale};
Lo = DAG.getMaskedScatter(DAG.getVTList(MVT::Other), DataLo.getValueType(),
DL, OpsLo, MMO, N->getIndexType());
- MMO = DAG.getMachineFunction().
- getMachineMemOperand(N->getPointerInfo(),
- MachineMemOperand::MOStore, HiMemVT.getStoreSize(),
- Alignment, N->getAAInfo(), N->getRanges());
-
// The order of the Scatter operation after split is well defined. The "Hi"
// part comes after the "Lo". So these two operations should be chained one
// after another.
@@ -2437,7 +2484,7 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
SDValue Ch = N->getChain();
SDValue Ptr = N->getBasePtr();
EVT MemoryVT = N->getMemoryVT();
- unsigned Alignment = N->getOriginalAlignment();
+ Align Alignment = N->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = N->getMemOperand()->getFlags();
AAMDNodes AAInfo = N->getAAInfo();
SDValue Lo, Hi;
@@ -2450,8 +2497,6 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
if (!LoMemVT.isByteSized() || !HiMemVT.isByteSized())
return TLI.scalarizeVectorStore(N, DAG);
- unsigned IncrementSize = LoMemVT.getSizeInBits()/8;
-
if (isTruncating)
Lo = DAG.getTruncStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), LoMemVT,
Alignment, MMOFlags, AAInfo);
@@ -2459,17 +2504,14 @@ SDValue DAGTypeLegalizer::SplitVecOp_STORE(StoreSDNode *N, unsigned OpNo) {
Lo = DAG.getStore(Ch, DL, Lo, Ptr, N->getPointerInfo(), Alignment, MMOFlags,
AAInfo);
- // Increment the pointer to the other half.
- Ptr = DAG.getObjectPtrOffset(DL, Ptr, IncrementSize);
+ MachinePointerInfo MPI;
+ IncrementPointer(N, LoMemVT, MPI, Ptr);
if (isTruncating)
- Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr,
- N->getPointerInfo().getWithOffset(IncrementSize),
+ Hi = DAG.getTruncStore(Ch, DL, Hi, Ptr, MPI,
HiMemVT, Alignment, MMOFlags, AAInfo);
else
- Hi = DAG.getStore(Ch, DL, Hi, Ptr,
- N->getPointerInfo().getWithOffset(IncrementSize),
- Alignment, MMOFlags, AAInfo);
+ Hi = DAG.getStore(Ch, DL, Hi, Ptr, MPI, Alignment, MMOFlags, AAInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Lo, Hi);
}
@@ -2487,9 +2529,8 @@ SDValue DAGTypeLegalizer::SplitVecOp_CONCAT_VECTORS(SDNode *N) {
for (const SDValue &Op : N->op_values()) {
for (unsigned i = 0, e = Op.getValueType().getVectorNumElements();
i != e; ++i) {
- Elts.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ Elts.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Op,
+ DAG.getVectorIdxConstant(i, DL)));
}
}
@@ -2565,9 +2606,9 @@ SDValue DAGTypeLegalizer::SplitVecOp_TruncateHelper(SDNode *N) {
SDValue Chain;
if (N->isStrictFPOpcode()) {
HalfLo = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
- {N->getOperand(0), HalfLo});
+ {N->getOperand(0), InLoVec});
HalfHi = DAG.getNode(N->getOpcode(), DL, {HalfVT, MVT::Other},
- {N->getOperand(0), HalfHi});
+ {N->getOperand(0), InHiVec});
// Legalize the chain result - switch anything that used the old chain to
// use the new one.
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, HalfLo.getValue(1),
@@ -2611,9 +2652,11 @@ SDValue DAGTypeLegalizer::SplitVecOp_VSETCC(SDNode *N) {
SDLoc DL(N);
GetSplitVector(N->getOperand(0), Lo0, Hi0);
GetSplitVector(N->getOperand(1), Lo1, Hi1);
- unsigned PartElements = Lo0.getValueType().getVectorNumElements();
- EVT PartResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, PartElements);
- EVT WideResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, 2*PartElements);
+ auto PartEltCnt = Lo0.getValueType().getVectorElementCount();
+
+ LLVMContext &Context = *DAG.getContext();
+ EVT PartResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt);
+ EVT WideResVT = EVT::getVectorVT(Context, MVT::i1, PartEltCnt*2);
LoRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Lo0, Lo1, N->getOperand(2));
HiRes = DAG.getNode(ISD::SETCC, DL, PartResVT, Hi0, Hi1, N->getOperand(2));
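
Note: SplitVecOp_VSETCC now builds the part and wide result types from an ElementCount rather than a plain unsigned, so doubling the count of a scalable half preserves the vscale factor. A deliberately simplified toy (not LLVM's ElementCount class) of why that matters:

    // Toy model: doubling the count must keep the "scalable" flag, so two
    // <vscale x 4 x i1> halves recombine as <vscale x 8 x i1>, not <8 x i1>.
    #include <cstdio>

    struct EltCount { unsigned Min; bool Scalable; };
    static EltCount operator*(EltCount C, unsigned N) { return {C.Min * N, C.Scalable}; }

    int main() {
      EltCount Part{4, /*Scalable=*/true};  // one SETCC half
      EltCount Wide = Part * 2;             // concatenation of both halves
      std::printf("wide result: <%s%u x i1>\n",
                  Wide.Scalable ? "vscale x " : "", Wide.Min);
    }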
@@ -2753,7 +2796,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
Res = WidenVecRes_BinaryWithExtraScalarOp(N);
break;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN:
#include "llvm/IR/ConstrainedOps.def"
Res = WidenVecRes_StrictFP(N);
@@ -2813,6 +2856,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::FNEARBYINT:
case ISD::FRINT:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FSIN:
case ISD::FSQRT:
case ISD::FTRUNC: {
@@ -2842,6 +2886,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) {
case ISD::CTTZ:
case ISD::CTTZ_ZERO_UNDEF:
case ISD::FNEG:
+ case ISD::FREEZE:
case ISD::FCANONICALIZE:
Res = WidenVecRes_Unary(N);
break;
@@ -2924,9 +2969,8 @@ static SDValue CollectOpsToWiden(SelectionDAG &DAG, const TargetLowering &TLI,
SDValue VecOp = DAG.getUNDEF(NextVT);
unsigned NumToInsert = ConcatEnd - Idx - 1;
for (unsigned i = 0, OpIdx = Idx+1; i < NumToInsert; i++, OpIdx++) {
- VecOp = DAG.getNode(
- ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp, ConcatOps[OpIdx],
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NextVT, VecOp,
+ ConcatOps[OpIdx], DAG.getVectorIdxConstant(i, dl));
}
ConcatOps[Idx+1] = VecOp;
ConcatEnd = Idx + 2;
@@ -3008,12 +3052,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
// }
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
- SDValue EOp1 = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue EOp2 = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp1 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp1,
+ DAG.getVectorIdxConstant(Idx, dl));
+ SDValue EOp2 = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, InOp2,
+ DAG.getVectorIdxConstant(Idx, dl));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, VT, EOp1, EOp2, Flags);
Idx += NumElts;
CurNumElts -= NumElts;
@@ -3025,12 +3067,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_BinaryCanTrap(SDNode *N) {
if (NumElts == 1) {
for (unsigned i = 0; i != CurNumElts; ++i, ++Idx) {
- SDValue EOp1 = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp1,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue EOp2 = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, InOp2,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
+ InOp1, DAG.getVectorIdxConstant(Idx, dl));
+ SDValue EOp2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT,
+ InOp2, DAG.getVectorIdxConstant(Idx, dl));
ConcatOps[ConcatEnd++] = DAG.getNode(Opcode, dl, WidenEltVT,
EOp1, EOp2, Flags);
}
@@ -3108,14 +3148,13 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
while (CurNumElts != 0) {
while (CurNumElts >= NumElts) {
SmallVector<SDValue, 4> EOps;
-
+
for (unsigned i = 0; i < NumOpers; ++i) {
SDValue Op = InOps[i];
-
- if (Op.getValueType().isVector())
- Op = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+
+ if (Op.getValueType().isVector())
+ Op = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Op,
+ DAG.getVectorIdxConstant(Idx, dl));
EOps.push_back(Op);
}
@@ -3140,10 +3179,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_StrictFP(SDNode *N) {
SDValue Op = InOps[i];
if (Op.getValueType().isVector())
- Op = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, Op,
- DAG.getConstant(Idx, dl,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Op = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, WidenEltVT, Op,
+ DAG.getVectorIdxConstant(Idx, dl));
EOps.push_back(Op);
}
@@ -3190,8 +3227,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
*DAG.getContext(), ResVT.getVectorElementType(),
WideOvVT.getVectorNumElements());
- SDValue Zero = DAG.getConstant(
- 0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
WideLHS = DAG.getNode(
ISD::INSERT_SUBVECTOR, DL, WideResVT, DAG.getUNDEF(WideResVT),
N->getOperand(0), Zero);
@@ -3210,8 +3246,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_OverflowOp(SDNode *N, unsigned ResNo) {
if (getTypeAction(OtherVT) == TargetLowering::TypeWidenVector) {
SetWidenedVector(SDValue(N, OtherNo), SDValue(WideNode, OtherNo));
} else {
- SDValue Zero = DAG.getConstant(
- 0, DL, TLI.getVectorIdxTy(DAG.getDataLayout()));
+ SDValue Zero = DAG.getVectorIdxConstant(0, DL);
SDValue OtherVal = DAG.getNode(
ISD::EXTRACT_SUBVECTOR, DL, OtherVT, SDValue(WideNode, OtherNo), Zero);
ReplaceValueWith(SDValue(N, OtherNo), OtherVal);
@@ -3274,9 +3309,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
}
if (InVTNumElts % WidenNumElts == 0) {
- SDValue InVal = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue InVal = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InWidenVT, InOp,
+ DAG.getVectorIdxConstant(0, DL));
// Extract the input and convert the shorten input vector.
if (N->getNumOperands() == 1)
return DAG.getNode(Opcode, DL, WidenVT, InVal);
@@ -3291,9 +3325,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert(SDNode *N) {
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
- SDValue Val = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+ DAG.getVectorIdxConstant(i, DL));
if (N->getNumOperands() == 1)
Ops[i] = DAG.getNode(Opcode, DL, EltVT, Val);
else
@@ -3310,7 +3343,6 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) {
EVT WidenVT = TLI.getTypeToTransformTo(*DAG.getContext(), N->getValueType(0));
unsigned WidenNumElts = WidenVT.getVectorNumElements();
- SmallVector<EVT, 2> WidenVTs = { WidenVT, MVT::Other };
EVT InVT = InOp.getValueType();
EVT InEltVT = InVT.getVectorElementType();
@@ -3321,16 +3353,15 @@ SDValue DAGTypeLegalizer::WidenVecRes_Convert_StrictFP(SDNode *N) {
// Otherwise unroll into some nasty scalar code and rebuild the vector.
EVT EltVT = WidenVT.getVectorElementType();
- SmallVector<EVT, 2> EltVTs = { EltVT, MVT::Other };
+ std::array<EVT, 2> EltVTs = {{EltVT, MVT::Other}};
SmallVector<SDValue, 16> Ops(WidenNumElts, DAG.getUNDEF(EltVT));
SmallVector<SDValue, 32> OpChains;
// Use the original element count so we don't do more scalar opts than
// necessary.
unsigned MinElts = N->getValueType(0).getVectorNumElements();
for (unsigned i=0; i < MinElts; ++i) {
- NewOps[1] = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InEltVT, InOp,
+ DAG.getVectorIdxConstant(i, DL));
Ops[i] = DAG.getNode(Opcode, DL, EltVTs, NewOps);
OpChains.push_back(Ops[i].getValue(1));
}
@@ -3370,7 +3401,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTEND_VECTOR_INREG(SDNode *N) {
SmallVector<SDValue, 16> Ops;
for (unsigned i = 0, e = std::min(InVTNumElts, WidenNumElts); i != e; ++i) {
SDValue Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, InSVT, InOp,
- DAG.getConstant(i, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ DAG.getVectorIdxConstant(i, DL));
switch (Opcode) {
case ISD::ANY_EXTEND_VECTOR_INREG:
Val = DAG.getNode(ISD::ANY_EXTEND, DL, WidenSVT, Val);
@@ -3463,6 +3494,8 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
switch (getTypeAction(InVT)) {
case TargetLowering::TypeLegal:
break;
+ case TargetLowering::TypeScalarizeScalableVector:
+ report_fatal_error("Scalarization of scalable vectors is not supported.");
case TargetLowering::TypePromoteInteger: {
// If the incoming type is a vector that is being promoted, then
// we know that the elements are arranged differently and that we
@@ -3492,6 +3525,7 @@ SDValue DAGTypeLegalizer::WidenVecRes_BITCAST(SDNode *N) {
}
case TargetLowering::TypeSoftenFloat:
case TargetLowering::TypePromoteFloat:
+ case TargetLowering::TypeSoftPromoteHalf:
case TargetLowering::TypeExpandInteger:
case TargetLowering::TypeExpandFloat:
case TargetLowering::TypeScalarizeVector:
@@ -3626,10 +3660,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_CONCAT_VECTORS(SDNode *N) {
SDValue InOp = N->getOperand(i);
if (InputWidened)
InOp = GetWidenedVector(InOp);
- for (unsigned j=0; j < NumInElts; ++j)
- Ops[Idx++] = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ for (unsigned j = 0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(j, dl));
}
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; Idx < WidenNumElts; ++Idx)
@@ -3666,11 +3699,9 @@ SDValue DAGTypeLegalizer::WidenVecRes_EXTRACT_SUBVECTOR(SDNode *N) {
EVT EltVT = VT.getVectorElementType();
unsigned NumElts = VT.getVectorNumElements();
unsigned i;
- for (i=0; i < NumElts; ++i)
- Ops[i] =
- DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(IdxVal + i, dl,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ for (i = 0; i < NumElts; ++i)
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(IdxVal + i, dl));
SDValue UndefVal = DAG.getUNDEF(EltVT);
for (; i < WidenNumElts; ++i)
@@ -3689,6 +3720,20 @@ SDValue DAGTypeLegalizer::WidenVecRes_LOAD(SDNode *N) {
LoadSDNode *LD = cast<LoadSDNode>(N);
ISD::LoadExtType ExtType = LD->getExtensionType();
+ // A vector must always be stored in memory as-is, i.e. without any padding
+ // between the elements, since various code depend on it, e.g. in the
+ // handling of a bitcast of a vector type to int, which may be done with a
+ // vector store followed by an integer load. A vector that does not have
+ // elements that are byte-sized must therefore be stored as an integer
+ // built out of the extracted vector elements.
+ if (!LD->getMemoryVT().isByteSized()) {
+ SDValue Value, NewChain;
+ std::tie(Value, NewChain) = TLI.scalarizeVectorLoad(LD, DAG);
+ ReplaceValueWith(SDValue(LD, 0), Value);
+ ReplaceValueWith(SDValue(LD, 1), NewChain);
+ return SDValue();
+ }
+
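
Note: the comment above explains why a load whose memory type is not byte-sized cannot simply be widened: vector elements are packed in memory with no padding, so the value has to be rebuilt from an integer load. A standalone model for a <4 x i1>, illustration only:

    // Standalone model of the byte-packing constraint: a <4 x i1> occupies
    // four bits with no per-element padding, so its elements are rebuilt
    // from an ordinary integer (here, byte) load.
    #include <cstdint>
    #include <cstdio>

    int main() {
      uint8_t Mem = 0b1011;              // <4 x i1> packed into the low bits of a byte
      for (unsigned i = 0; i != 4; ++i) {
        bool Elt = (Mem >> i) & 1;       // extract element i from the loaded integer
        std::printf("elt %u = %d\n", i, Elt);
      }
    }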
SDValue Result;
SmallVector<SDValue, 16> LdChain; // Chain for the series of load
if (ExtType != ISD::NON_EXTLOAD)
@@ -3877,8 +3922,7 @@ SDValue DAGTypeLegalizer::convertMask(SDValue InMask, EVT MaskVT,
// Adjust Mask to the right number of elements.
unsigned CurrMaskNumEls = Mask->getValueType(0).getVectorNumElements();
if (CurrMaskNumEls > ToMaskVT.getVectorNumElements()) {
- MVT IdxTy = TLI.getVectorIdxTy(DAG.getDataLayout());
- SDValue ZeroIdx = DAG.getConstant(0, SDLoc(Mask), IdxTy);
+ SDValue ZeroIdx = DAG.getVectorIdxConstant(0, SDLoc(Mask));
Mask = DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Mask), ToMaskVT, Mask,
ZeroIdx);
} else if (CurrMaskNumEls < ToMaskVT.getVectorNumElements()) {
@@ -4144,12 +4188,10 @@ SDValue DAGTypeLegalizer::WidenVecRes_STRICT_FSETCC(SDNode *N) {
SmallVector<SDValue, 8> Scalars(WidenNumElts, DAG.getUNDEF(EltVT));
SmallVector<SDValue, 8> Chains(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
- SDValue LHSElem = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue RHSElem = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getVectorIdxConstant(i, dl));
Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
{Chain, LHSElem, RHSElem, CC});
@@ -4288,13 +4330,12 @@ SDValue DAGTypeLegalizer::WidenVecOp_EXTEND(SDNode *N) {
assert(FixedVT.getVectorNumElements() != InVT.getVectorNumElements() &&
"We can't have the same type as we started with!");
if (FixedVT.getVectorNumElements() > InVT.getVectorNumElements())
- InOp = DAG.getNode(
- ISD::INSERT_SUBVECTOR, DL, FixedVT, DAG.getUNDEF(FixedVT), InOp,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ InOp = DAG.getNode(ISD::INSERT_SUBVECTOR, DL, FixedVT,
+ DAG.getUNDEF(FixedVT), InOp,
+ DAG.getVectorIdxConstant(0, DL));
else
- InOp = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ InOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, FixedVT, InOp,
+ DAG.getVectorIdxConstant(0, DL));
break;
}
}
@@ -4363,9 +4404,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
else
Res = DAG.getNode(Opcode, dl, WideVT, InOp);
}
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Res,
+ DAG.getVectorIdxConstant(0, dl));
}
EVT InEltVT = InVT.getVectorElementType();
@@ -4376,9 +4416,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
SmallVector<SDValue, 4> NewOps(N->op_begin(), N->op_end());
SmallVector<SDValue, 32> OpChains;
for (unsigned i=0; i < NumElts; ++i) {
- NewOps[1] = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ NewOps[1] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
+ DAG.getVectorIdxConstant(i, dl));
Ops[i] = DAG.getNode(Opcode, dl, { EltVT, MVT::Other }, NewOps);
OpChains.push_back(Ops[i].getValue(1));
}
@@ -4386,11 +4425,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_Convert(SDNode *N) {
ReplaceValueWith(SDValue(N, 1), NewChain);
} else {
for (unsigned i = 0; i < NumElts; ++i)
- Ops[i] = DAG.getNode(
- Opcode, dl, EltVT,
- DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, InEltVT, InOp,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout()))));
+ Ops[i] = DAG.getNode(Opcode, dl, EltVT,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, InEltVT,
+ InOp, DAG.getVectorIdxConstant(i, dl)));
}
return DAG.getBuildVector(VT, dl, Ops);
@@ -4411,9 +4448,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), VT, NewNumElts);
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
- return DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, VT, BitOp,
+ DAG.getVectorIdxConstant(0, dl));
}
}
@@ -4430,7 +4466,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_BITCAST(SDNode *N) {
if (TLI.isTypeLegal(NewVT)) {
SDValue BitOp = DAG.getNode(ISD::BITCAST, dl, NewVT, InOp);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, BitOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ DAG.getVectorIdxConstant(0, dl));
}
}
}
@@ -4470,10 +4506,9 @@ SDValue DAGTypeLegalizer::WidenVecOp_CONCAT_VECTORS(SDNode *N) {
TargetLowering::TypeWidenVector &&
"Unexpected type action");
InOp = GetWidenedVector(InOp);
- for (unsigned j=0; j < NumInElts; ++j)
- Ops[Idx++] = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(j, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ for (unsigned j = 0; j < NumInElts; ++j)
+ Ops[Idx++] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(j, dl));
}
return DAG.getBuildVector(VT, dl, Ops);
}
@@ -4630,9 +4665,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_SETCC(SDNode *N) {
EVT ResVT = EVT::getVectorVT(*DAG.getContext(),
SVT.getVectorElementType(),
VT.getVectorNumElements());
- SDValue CC = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue CC = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, ResVT, WideSETCC,
+ DAG.getVectorIdxConstant(0, dl));
EVT OpVT = N->getOperand(0).getValueType();
ISD::NodeType ExtendCode =
@@ -4657,12 +4691,10 @@ SDValue DAGTypeLegalizer::WidenVecOp_STRICT_FSETCC(SDNode *N) {
SmallVector<SDValue, 8> Chains(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
- SDValue LHSElem = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- SDValue RHSElem = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
- DAG.getConstant(i, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue LHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, LHS,
+ DAG.getVectorIdxConstant(i, dl));
+ SDValue RHSElem = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, TmpEltVT, RHS,
+ DAG.getVectorIdxConstant(i, dl));
Scalars[i] = DAG.getNode(N->getOpcode(), dl, {MVT::i1, MVT::Other},
{Chain, LHSElem, RHSElem, CC});
@@ -4716,11 +4748,11 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
break;
case ISD::VECREDUCE_FMAX:
NeutralElem = DAG.getConstantFP(
- std::numeric_limits<double>::infinity(), dl, ElemVT);
+ -std::numeric_limits<double>::infinity(), dl, ElemVT);
break;
case ISD::VECREDUCE_FMIN:
NeutralElem = DAG.getConstantFP(
- -std::numeric_limits<double>::infinity(), dl, ElemVT);
+ std::numeric_limits<double>::infinity(), dl, ElemVT);
break;
}
@@ -4729,7 +4761,7 @@ SDValue DAGTypeLegalizer::WidenVecOp_VECREDUCE(SDNode *N) {
unsigned WideElts = WideVT.getVectorNumElements();
for (unsigned Idx = OrigElts; Idx < WideElts; Idx++)
Op = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, WideVT, Op, NeutralElem,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ DAG.getVectorIdxConstant(Idx, dl));
return DAG.getNode(N->getOpcode(), dl, N->getValueType(0), Op, N->getFlags());
}
@@ -4748,9 +4780,8 @@ SDValue DAGTypeLegalizer::WidenVecOp_VSELECT(SDNode *N) {
SDValue Select = DAG.getNode(N->getOpcode(), DL, LeftIn.getValueType(), Cond,
LeftIn, RightIn);
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, Select,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Select,
+ DAG.getVectorIdxConstant(0, DL));
}
//===----------------------------------------------------------------------===//
@@ -4836,7 +4867,6 @@ static EVT FindMemType(SelectionDAG& DAG, const TargetLowering &TLI,
static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
SmallVectorImpl<SDValue> &LdOps,
unsigned Start, unsigned End) {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(LdOps[Start]);
EVT LdTy = LdOps[Start].getValueType();
unsigned Width = VecTy.getSizeInBits();
@@ -4856,9 +4886,8 @@ static SDValue BuildVectorFromScalar(SelectionDAG& DAG, EVT VecTy,
Idx = Idx * LdTy.getSizeInBits() / NewLdTy.getSizeInBits();
LdTy = NewLdTy;
}
- VecOp = DAG.getNode(
- ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
- DAG.getConstant(Idx++, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ VecOp = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, NewVecVT, VecOp, LdOps[i],
+ DAG.getVectorIdxConstant(Idx++, dl));
}
return DAG.getNode(ISD::BITCAST, dl, VecTy, VecOp);
}
@@ -4879,19 +4908,19 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
- unsigned Align = LD->getAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
int LdWidth = LdVT.getSizeInBits();
int WidthDiff = WidenWidth - LdWidth;
- unsigned LdAlign = (!LD->isSimple()) ? 0 : Align; // Allow wider loads.
+ // Allow wider loads.
+ unsigned LdAlign = (!LD->isSimple()) ? 0 : LD->getAlignment();
// Find the vector type that we can load from.
EVT NewVT = FindMemType(DAG, TLI, LdWidth, WidenVT, LdAlign, WidthDiff);
int NewVTWidth = NewVT.getSizeInBits();
SDValue LdOp = DAG.getLoad(NewVT, dl, Chain, BasePtr, LD->getPointerInfo(),
- Align, MMOFlags, AAInfo);
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(LdOp.getValue(1));
// Check if we can load the element with one instruction.
@@ -4934,7 +4963,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
NewVTWidth = NewVT.getSizeInBits();
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset),
- MinAlign(Align, Increment), MMOFlags, AAInfo);
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
if (L->getValueType(0).isVector() && NewVTWidth >= LdWidth) {
// Later code assumes the vector loads produced will be mergeable, so we
@@ -4952,7 +4981,7 @@ SDValue DAGTypeLegalizer::GenWidenVectorLoads(SmallVectorImpl<SDValue> &LdChain,
} else {
L = DAG.getLoad(NewVT, dl, Chain, BasePtr,
LD->getPointerInfo().getWithOffset(Offset),
- MinAlign(Align, Increment), MMOFlags, AAInfo);
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(L.getValue(1));
}
@@ -5029,7 +5058,6 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
// Load information
SDValue Chain = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
- unsigned Align = LD->getAlignment();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
@@ -5043,14 +5071,14 @@ DAGTypeLegalizer::GenWidenVectorExtLoads(SmallVectorImpl<SDValue> &LdChain,
unsigned Increment = LdEltVT.getSizeInBits() / 8;
Ops[0] =
DAG.getExtLoad(ExtType, dl, EltVT, Chain, BasePtr, LD->getPointerInfo(),
- LdEltVT, Align, MMOFlags, AAInfo);
+ LdEltVT, LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(Ops[0].getValue(1));
unsigned i = 0, Offset = Increment;
for (i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset);
Ops[i] = DAG.getExtLoad(ExtType, dl, EltVT, Chain, NewBasePtr,
LD->getPointerInfo().getWithOffset(Offset), LdEltVT,
- Align, MMOFlags, AAInfo);
+ LD->getOriginalAlign(), MMOFlags, AAInfo);
LdChain.push_back(Ops[i].getValue(1));
}
@@ -5069,7 +5097,6 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
// element type or scalar stores.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
- unsigned Align = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
@@ -5093,12 +5120,11 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
if (NewVT.isVector()) {
unsigned NumVTElts = NewVT.getVectorNumElements();
do {
- SDValue EOp = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NewVT, ValOp,
+ DAG.getVectorIdxConstant(Idx, dl));
StChain.push_back(DAG.getStore(
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
- MinAlign(Align, Offset), MMOFlags, AAInfo));
+ ST->getOriginalAlign(), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
Idx += NumVTElts;
@@ -5113,13 +5139,11 @@ void DAGTypeLegalizer::GenWidenVectorStores(SmallVectorImpl<SDValue> &StChain,
// Readjust index position based on new vector type.
Idx = Idx * ValEltWidth / NewVTWidth;
do {
- SDValue EOp = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
- DAG.getConstant(Idx++, dl,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, NewVT, VecOp,
+ DAG.getVectorIdxConstant(Idx++, dl));
StChain.push_back(DAG.getStore(
Chain, dl, EOp, BasePtr, ST->getPointerInfo().getWithOffset(Offset),
- MinAlign(Align, Offset), MMOFlags, AAInfo));
+ ST->getOriginalAlign(), MMOFlags, AAInfo));
StWidth -= NewVTWidth;
Offset += Increment;
BasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Increment);
@@ -5137,7 +5161,6 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
// and then store it. Instead, we extract each element and then store it.
SDValue Chain = ST->getChain();
SDValue BasePtr = ST->getBasePtr();
- unsigned Align = ST->getAlignment();
MachineMemOperand::Flags MMOFlags = ST->getMemOperand()->getFlags();
AAMDNodes AAInfo = ST->getAAInfo();
SDValue ValOp = GetWidenedVector(ST->getValue());
@@ -5157,21 +5180,19 @@ DAGTypeLegalizer::GenWidenVectorTruncStores(SmallVectorImpl<SDValue> &StChain,
EVT ValEltVT = ValVT.getVectorElementType();
unsigned Increment = ValEltVT.getSizeInBits() / 8;
unsigned NumElts = StVT.getVectorNumElements();
- SDValue EOp = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
- StChain.push_back(DAG.getTruncStore(Chain, dl, EOp, BasePtr,
- ST->getPointerInfo(), StEltVT, Align,
- MMOFlags, AAInfo));
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getVectorIdxConstant(0, dl));
+ StChain.push_back(
+ DAG.getTruncStore(Chain, dl, EOp, BasePtr, ST->getPointerInfo(), StEltVT,
+ ST->getOriginalAlign(), MMOFlags, AAInfo));
unsigned Offset = Increment;
for (unsigned i=1; i < NumElts; ++i, Offset += Increment) {
SDValue NewBasePtr = DAG.getObjectPtrOffset(dl, BasePtr, Offset);
- SDValue EOp = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue EOp = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ValEltVT, ValOp,
+ DAG.getVectorIdxConstant(0, dl));
StChain.push_back(DAG.getTruncStore(
Chain, dl, EOp, NewBasePtr, ST->getPointerInfo().getWithOffset(Offset),
- StEltVT, MinAlign(Align, Offset), MMOFlags, AAInfo));
+ StEltVT, ST->getOriginalAlign(), MMOFlags, AAInfo));
}
}
@@ -5206,9 +5227,8 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
}
if (WidenNumElts < InNumElts && InNumElts % WidenNumElts)
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
- DAG.getConstant(0, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, NVT, InOp,
+ DAG.getVectorIdxConstant(0, dl));
// Fall back to extract and build.
SmallVector<SDValue, 16> Ops(WidenNumElts);
@@ -5216,9 +5236,8 @@ SDValue DAGTypeLegalizer::ModifyToType(SDValue InOp, EVT NVT,
unsigned MinNumElts = std::min(WidenNumElts, InNumElts);
unsigned Idx;
for (Idx = 0; Idx < MinNumElts; ++Idx)
- Ops[Idx] = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
- DAG.getConstant(Idx, dl, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Ops[Idx] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, InOp,
+ DAG.getVectorIdxConstant(Idx, dl));
SDValue FillVal = FillWithZeroes ? DAG.getConstant(0, dl, EltVT) :
DAG.getUNDEF(EltVT);
diff --git a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
index 34660e3a48ec..55fe26eb64cd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ResourcePriorityQueue.cpp
@@ -19,9 +19,13 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/ResourcePriorityQueue.h"
+#include "llvm/CodeGen/DFAPacketizer.h"
#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
index 7ee44c808fcb..2902c96c7658 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGFast.cpp
@@ -761,7 +761,7 @@ void ScheduleDAGLinearize::Schedule() {
MachineBasicBlock*
ScheduleDAGLinearize::EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
InstrEmitter Emitter(BB, InsertPos);
- DenseMap<SDValue, unsigned> VRBaseMap;
+ DenseMap<SDValue, Register> VRBaseMap;
LLVM_DEBUG({ dbgs() << "\n*** Final schedule ***\n"; });
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
index ff806bdb822c..72e68a5045c6 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp
@@ -279,7 +279,7 @@ private:
SUnit *NewNode = newSUnit(N);
// Update the topological ordering.
if (NewNode->NodeNum >= NumSUnits)
- Topo.MarkDirty();
+ Topo.AddSUnitWithoutPredecessors(NewNode);
return NewNode;
}
@@ -289,7 +289,7 @@ private:
SUnit *NewNode = Clone(N);
// Update the topological ordering.
if (NewNode->NodeNum >= NumSUnits)
- Topo.MarkDirty();
+ Topo.AddSUnitWithoutPredecessors(NewNode);
return NewNode;
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
index 0e4d783e3505..ce20d506586f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.cpp
@@ -31,6 +31,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "pre-RA-sched"
@@ -198,10 +199,10 @@ static void RemoveUnusedGlue(SDNode *N, SelectionDAG *DAG) {
/// outputs to ensure they are scheduled together and in order. This
/// optimization may benefit some targets by improving cache locality.
void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
- SDNode *Chain = nullptr;
+ SDValue Chain;
unsigned NumOps = Node->getNumOperands();
if (Node->getOperand(NumOps-1).getValueType() == MVT::Other)
- Chain = Node->getOperand(NumOps-1).getNode();
+ Chain = Node->getOperand(NumOps-1);
if (!Chain)
return;
@@ -234,6 +235,9 @@ void ScheduleDAGSDNodes::ClusterNeighboringLoads(SDNode *Node) {
unsigned UseCount = 0;
for (SDNode::use_iterator I = Chain->use_begin(), E = Chain->use_end();
I != E && UseCount < 100; ++I, ++UseCount) {
+ if (I.getUse().getResNo() != Chain.getResNo())
+ continue;
+
SDNode *User = *I;
if (User == Node || !Visited.insert(User).second)
continue;
@@ -471,6 +475,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
for (unsigned i = 0, e = N->getNumOperands(); i != e; ++i) {
SDNode *OpN = N->getOperand(i).getNode();
+ unsigned DefIdx = N->getOperand(i).getResNo();
if (isPassiveNode(OpN)) continue; // Not scheduled.
SUnit *OpSU = &SUnits[OpN->getNodeId()];
assert(OpSU && "Node has no SUnit!");
@@ -505,7 +510,7 @@ void ScheduleDAGSDNodes::AddSchedEdges() {
Dep.setLatency(OpLatency);
if (!isChain && !UnitLatencies) {
computeOperandLatency(OpN, N, i, Dep);
- ST.adjustSchedDependency(OpSU, SU, Dep);
+ ST.adjustSchedDependency(OpSU, DefIdx, SU, i, Dep);
}
if (!SU->addPred(Dep) && !Dep.isCtrl() && OpSU->NumRegDefsLeft > 1) {
@@ -731,7 +736,7 @@ void ScheduleDAGSDNodes::VerifyScheduledSequence(bool isBottomUp) {
static void
ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
SmallVectorImpl<std::pair<unsigned, MachineInstr*> > &Orders,
- DenseMap<SDValue, unsigned> &VRBaseMap, unsigned Order) {
+ DenseMap<SDValue, Register> &VRBaseMap, unsigned Order) {
if (!N->getHasDebugValue())
return;
@@ -758,9 +763,9 @@ ProcessSDDbgValues(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
// instructions in the right order.
static void
ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
- DenseMap<SDValue, unsigned> &VRBaseMap,
+ DenseMap<SDValue, Register> &VRBaseMap,
SmallVectorImpl<std::pair<unsigned, MachineInstr *>> &Orders,
- SmallSet<unsigned, 8> &Seen, MachineInstr *NewInsn) {
+ SmallSet<Register, 8> &Seen, MachineInstr *NewInsn) {
unsigned Order = N->getIROrder();
if (!Order || Seen.count(Order)) {
// Process any valid SDDbgValues even if node does not have any order
@@ -784,17 +789,17 @@ ProcessSourceNode(SDNode *N, SelectionDAG *DAG, InstrEmitter &Emitter,
}
void ScheduleDAGSDNodes::
-EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
+EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, Register> &VRBaseMap,
MachineBasicBlock::iterator InsertPos) {
for (SUnit::const_pred_iterator I = SU->Preds.begin(), E = SU->Preds.end();
I != E; ++I) {
if (I->isCtrl()) continue; // ignore chain preds
if (I->getSUnit()->CopyDstRC) {
// Copy to physical register.
- DenseMap<SUnit*, unsigned>::iterator VRI = VRBaseMap.find(I->getSUnit());
+ DenseMap<SUnit*, Register>::iterator VRI = VRBaseMap.find(I->getSUnit());
assert(VRI != VRBaseMap.end() && "Node emitted out of order - late");
// Find the destination physical register.
- unsigned Reg = 0;
+ Register Reg;
for (SUnit::const_succ_iterator II = SU->Succs.begin(),
EE = SU->Succs.end(); II != EE; ++II) {
if (II->isCtrl()) continue; // ignore chain preds
@@ -826,17 +831,17 @@ EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
MachineBasicBlock *ScheduleDAGSDNodes::
EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
InstrEmitter Emitter(BB, InsertPos);
- DenseMap<SDValue, unsigned> VRBaseMap;
- DenseMap<SUnit*, unsigned> CopyVRBaseMap;
+ DenseMap<SDValue, Register> VRBaseMap;
+ DenseMap<SUnit*, Register> CopyVRBaseMap;
SmallVector<std::pair<unsigned, MachineInstr*>, 32> Orders;
- SmallSet<unsigned, 8> Seen;
+ SmallSet<Register, 8> Seen;
bool HasDbg = DAG->hasDebugValues();
// Emit a node, and determine where its first instruction is for debuginfo.
// Zero, one, or multiple instructions can be created when emitting a node.
auto EmitNode =
[&](SDNode *Node, bool IsClone, bool IsCloned,
- DenseMap<SDValue, unsigned> &VRBaseMap) -> MachineInstr * {
+ DenseMap<SDValue, Register> &VRBaseMap) -> MachineInstr * {
// Fetch instruction prior to this, or end() if nonexistent.
auto GetPrevInsn = [&](MachineBasicBlock::iterator I) {
if (I == BB->begin())
@@ -863,9 +868,14 @@ EmitSchedule(MachineBasicBlock::iterator &InsertPos) {
MI = &*std::next(Before);
}
- if (MI->isCall() && DAG->getTarget().Options.EnableDebugEntryValues)
+ if (MI->isCandidateForCallSiteEntry() &&
+ DAG->getTarget().Options.EmitCallSiteInfo)
MF.addCallArgsForwardingRegs(MI, DAG->getSDCallSiteInfo(Node));
+ if (DAG->getNoMergeSiteInfo(Node)) {
+ MI->setFlag(MachineInstr::MIFlag::NoMerge);
+ }
+
return MI;
};
diff --git a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
index 183ce4b0652d..8c28ce403c9b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
+++ b/llvm/lib/CodeGen/SelectionDAG/ScheduleDAGSDNodes.h
@@ -184,7 +184,7 @@ class InstrItineraryData;
void BuildSchedUnits();
void AddSchedEdges();
- void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, unsigned> &VRBaseMap,
+ void EmitPhysRegCopy(SUnit *SU, DenseMap<SUnit*, Register> &VRBaseMap,
MachineBasicBlock::iterator InsertPos);
};
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
index 313e07b5fdd6..592c09c10fb0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp
@@ -38,6 +38,7 @@
#include "llvm/CodeGen/SelectionDAGAddressAnalysis.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -543,7 +544,7 @@ static void AddNodeIDCustom(FoldingSetNodeID &ID, const SDNode *N) {
case ISD::ConstantPool:
case ISD::TargetConstantPool: {
const ConstantPoolSDNode *CP = cast<ConstantPoolSDNode>(N);
- ID.AddInteger(CP->getAlignment());
+ ID.AddInteger(CP->getAlign().value());
ID.AddInteger(CP->getOffset());
if (CP->isMachineConstantPoolEntry())
CP->getMachineCPVal()->addSelectionDAGCSEId(ID);
@@ -1000,12 +1001,12 @@ SDNode *SelectionDAG::FindModifiedNodeSlot(SDNode *N, ArrayRef<SDValue> Ops,
return Node;
}
-unsigned SelectionDAG::getEVTAlignment(EVT VT) const {
+Align SelectionDAG::getEVTAlign(EVT VT) const {
Type *Ty = VT == MVT::iPTR ?
PointerType::get(Type::getInt8Ty(*getContext()), 0) :
VT.getTypeForEVT(*getContext());
- return getDataLayout().getABITypeAlignment(Ty);
+ return getDataLayout().getABITypeAlign(Ty);
}
// EntryNode could meaningfully have debug info if we can find it...
@@ -1167,15 +1168,21 @@ SDValue SelectionDAG::getBoolExtOrTrunc(SDValue Op, const SDLoc &SL, EVT VT,
}
SDValue SelectionDAG::getZeroExtendInReg(SDValue Op, const SDLoc &DL, EVT VT) {
- assert(!VT.isVector() &&
- "getZeroExtendInReg should use the vector element type instead of "
- "the vector type!");
- if (Op.getValueType().getScalarType() == VT) return Op;
- unsigned BitWidth = Op.getScalarValueSizeInBits();
- APInt Imm = APInt::getLowBitsSet(BitWidth,
- VT.getSizeInBits());
- return getNode(ISD::AND, DL, Op.getValueType(), Op,
- getConstant(Imm, DL, Op.getValueType()));
+ EVT OpVT = Op.getValueType();
+ assert(VT.isInteger() && OpVT.isInteger() &&
+ "Cannot getZeroExtendInReg FP types");
+ assert(VT.isVector() == OpVT.isVector() &&
+ "getZeroExtendInReg type should be vector iff the operand "
+ "type is vector!");
+ assert((!VT.isVector() ||
+ VT.getVectorElementCount() == OpVT.getVectorElementCount()) &&
+ "Vector element counts must match in getZeroExtendInReg");
+ assert(VT.bitsLE(OpVT) && "Not extending!");
+ if (OpVT == VT)
+ return Op;
+ APInt Imm = APInt::getLowBitsSet(OpVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits());
+ return getNode(ISD::AND, DL, OpVT, Op, getConstant(Imm, DL, OpVT));
}
SDValue SelectionDAG::getPtrExtOrTrunc(SDValue Op, const SDLoc &DL, EVT VT) {
@@ -1332,10 +1339,16 @@ SDValue SelectionDAG::getIntPtrConstant(uint64_t Val, const SDLoc &DL,
SDValue SelectionDAG::getShiftAmountConstant(uint64_t Val, EVT VT,
const SDLoc &DL, bool LegalTypes) {
+ assert(VT.isInteger() && "Shift amount is not an integer type!");
EVT ShiftVT = TLI->getShiftAmountTy(VT, getDataLayout(), LegalTypes);
return getConstant(Val, DL, ShiftVT);
}
+SDValue SelectionDAG::getVectorIdxConstant(uint64_t Val, const SDLoc &DL,
+ bool isTarget) {
+ return getConstant(Val, DL, TLI->getVectorIdxTy(getDataLayout()), isTarget);
+}
+
SDValue SelectionDAG::getConstantFP(const APFloat &V, const SDLoc &DL, EVT VT,
bool isTarget) {
return getConstantFP(*ConstantFP::get(*getContext(), V), DL, VT, isTarget);
@@ -1381,7 +1394,7 @@ SDValue SelectionDAG::getConstantFP(double Val, const SDLoc &DL, EVT VT,
else if (EltVT == MVT::f64)
return getConstantFP(APFloat(Val), DL, VT, isTarget);
else if (EltVT == MVT::f80 || EltVT == MVT::f128 || EltVT == MVT::ppcf128 ||
- EltVT == MVT::f16) {
+ EltVT == MVT::f16 || EltVT == MVT::bf16) {
bool Ignored;
APFloat APF = APFloat(Val);
APF.convert(EVTToAPFloatSemantics(EltVT), APFloat::rmNearestTiesToEven,
@@ -1459,19 +1472,18 @@ SDValue SelectionDAG::getJumpTable(int JTI, EVT VT, bool isTarget,
}
SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
- unsigned Alignment, int Offset,
- bool isTarget,
- unsigned TargetFlags) {
+ MaybeAlign Alignment, int Offset,
+ bool isTarget, unsigned TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
- if (Alignment == 0)
+ if (!Alignment)
Alignment = shouldOptForSize()
- ? getDataLayout().getABITypeAlignment(C->getType())
- : getDataLayout().getPrefTypeAlignment(C->getType());
+ ? getDataLayout().getABITypeAlign(C->getType())
+ : getDataLayout().getPrefTypeAlign(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
- ID.AddInteger(Alignment);
+ ID.AddInteger(Alignment->value());
ID.AddInteger(Offset);
ID.AddPointer(C);
ID.AddInteger(TargetFlags);
@@ -1479,25 +1491,26 @@ SDValue SelectionDAG::getConstantPool(const Constant *C, EVT VT,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, *Alignment,
TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
- return SDValue(N, 0);
+ SDValue V = SDValue(N, 0);
+ NewSDValueDbgMsg(V, "Creating new constant pool: ", this);
+ return V;
}
SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
- unsigned Alignment, int Offset,
- bool isTarget,
- unsigned TargetFlags) {
+ MaybeAlign Alignment, int Offset,
+ bool isTarget, unsigned TargetFlags) {
assert((TargetFlags == 0 || isTarget) &&
"Cannot set target flags on target-independent globals");
- if (Alignment == 0)
- Alignment = getDataLayout().getPrefTypeAlignment(C->getType());
+ if (!Alignment)
+ Alignment = getDataLayout().getPrefTypeAlign(C->getType());
unsigned Opc = isTarget ? ISD::TargetConstantPool : ISD::ConstantPool;
FoldingSetNodeID ID;
AddNodeIDNode(ID, Opc, getVTList(VT), None);
- ID.AddInteger(Alignment);
+ ID.AddInteger(Alignment->value());
ID.AddInteger(Offset);
C->addSelectionDAGCSEId(ID);
ID.AddInteger(TargetFlags);
@@ -1505,7 +1518,7 @@ SDValue SelectionDAG::getConstantPool(MachineConstantPoolValue *C, EVT VT,
if (SDNode *E = FindNodeOrInsertPos(ID, IP))
return SDValue(E, 0);
- auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, Alignment,
+ auto *N = newSDNode<ConstantPoolSDNode>(isTarget, C, VT, Offset, *Alignment,
TargetFlags);
CSEMap.InsertNode(N, IP);
InsertNode(N);
@@ -1861,9 +1874,6 @@ SDValue SelectionDAG::getBlockAddress(const BlockAddress *BA, EVT VT,
}
SDValue SelectionDAG::getSrcValue(const Value *V) {
- assert((!V || V->getType()->isPointerTy()) &&
- "SrcValue is not a pointer?");
-
FoldingSetNodeID ID;
AddNodeIDNode(ID, ISD::SRCVALUE, getVTList(MVT::Other), None);
ID.AddPointer(V);
@@ -1921,6 +1931,10 @@ SDValue SelectionDAG::getAddrSpaceCast(const SDLoc &dl, EVT VT, SDValue Ptr,
return SDValue(N, 0);
}
+SDValue SelectionDAG::getFreeze(SDValue V) {
+ return getNode(ISD::FREEZE, SDLoc(V), V.getValueType(), V);
+}
+
/// getShiftAmountOperand - Return the specified value casted to
/// the target's desired shift amount type.
SDValue SelectionDAG::getShiftAmountOperand(EVT LHSTy, SDValue Op) {
@@ -1979,28 +1993,54 @@ SDValue SelectionDAG::expandVACopy(SDNode *Node) {
MachinePointerInfo(VD));
}
-SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
- MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
- unsigned ByteSize = VT.getStoreSize();
+Align SelectionDAG::getReducedAlign(EVT VT, bool UseABI) {
+ const DataLayout &DL = getDataLayout();
Type *Ty = VT.getTypeForEVT(*getContext());
- unsigned StackAlign =
- std::max((unsigned)getDataLayout().getPrefTypeAlignment(Ty), minAlign);
+ Align RedAlign = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+
+ if (TLI->isTypeLegal(VT) || !VT.isVector())
+ return RedAlign;
+
+ const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
+ const Align StackAlign = TFI->getStackAlign();
+
+ // See if we can choose a smaller ABI alignment in cases where it's an
+ // illegal vector type that will get broken down.
+ if (RedAlign > StackAlign) {
+ EVT IntermediateVT;
+ MVT RegisterVT;
+ unsigned NumIntermediates;
+ TLI->getVectorTypeBreakdown(*getContext(), VT, IntermediateVT,
+ NumIntermediates, RegisterVT);
+ Ty = IntermediateVT.getTypeForEVT(*getContext());
+ Align RedAlign2 = UseABI ? DL.getABITypeAlign(Ty) : DL.getPrefTypeAlign(Ty);
+ if (RedAlign2 < RedAlign)
+ RedAlign = RedAlign2;
+ }
+
+ return RedAlign;
+}
- int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
+SDValue SelectionDAG::CreateStackTemporary(TypeSize Bytes, Align Alignment) {
+ MachineFrameInfo &MFI = MF->getFrameInfo();
+ int FrameIdx = MFI.CreateStackObject(Bytes, Alignment, false);
return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
}
+SDValue SelectionDAG::CreateStackTemporary(EVT VT, unsigned minAlign) {
+ Type *Ty = VT.getTypeForEVT(*getContext());
+ Align StackAlign =
+ std::max(getDataLayout().getPrefTypeAlign(Ty), Align(minAlign));
+ return CreateStackTemporary(VT.getStoreSize(), StackAlign);
+}
+
SDValue SelectionDAG::CreateStackTemporary(EVT VT1, EVT VT2) {
- unsigned Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
+ TypeSize Bytes = std::max(VT1.getStoreSize(), VT2.getStoreSize());
Type *Ty1 = VT1.getTypeForEVT(*getContext());
Type *Ty2 = VT2.getTypeForEVT(*getContext());
const DataLayout &DL = getDataLayout();
- unsigned Align =
- std::max(DL.getPrefTypeAlignment(Ty1), DL.getPrefTypeAlignment(Ty2));
-
- MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(Bytes, Align, false);
- return getFrameIndex(FrameIdx, TLI->getFrameIndexTy(getDataLayout()));
+ Align Align = std::max(DL.getPrefTypeAlign(Ty1), DL.getPrefTypeAlign(Ty2));
+ return CreateStackTemporary(Bytes, Align);
}
SDValue SelectionDAG::FoldSetCC(EVT VT, SDValue N1, SDValue N2,
@@ -2179,21 +2219,16 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
const APInt &DemandedElts) {
switch (V.getOpcode()) {
default:
+ return TLI->SimplifyMultipleUseDemandedBits(V, DemandedBits, DemandedElts,
+ *this, 0);
break;
case ISD::Constant: {
- auto *CV = cast<ConstantSDNode>(V.getNode());
- assert(CV && "Const value should be ConstSDNode.");
- const APInt &CVal = CV->getAPIntValue();
+ const APInt &CVal = cast<ConstantSDNode>(V)->getAPIntValue();
APInt NewVal = CVal & DemandedBits;
if (NewVal != CVal)
return getConstant(NewVal, SDLoc(V), V.getValueType());
break;
}
- case ISD::OR:
- case ISD::XOR:
- case ISD::SIGN_EXTEND_INREG:
- return TLI->SimplifyMultipleUseDemandedBits(V, DemandedBits, DemandedElts,
- *this, 0);
case ISD::SRL:
// Only look at single-use SRLs.
if (!V.getNode()->hasOneUse())
@@ -2224,19 +2259,6 @@ SDValue SelectionDAG::GetDemandedBits(SDValue V, const APInt &DemandedBits,
}
break;
}
- case ISD::ANY_EXTEND: {
- SDValue Src = V.getOperand(0);
- unsigned SrcBitWidth = Src.getScalarValueSizeInBits();
- // Being conservative here - only peek through if we only demand bits in the
- // non-extended source (even though the extended bits are technically
- // undef).
- if (DemandedBits.getActiveBits() > SrcBitWidth)
- break;
- APInt SrcDemandedBits = DemandedBits.trunc(SrcBitWidth);
- if (SDValue DemandedSrc = GetDemandedBits(Src, SrcDemandedBits))
- return getNode(ISD::ANY_EXTEND, SDLoc(V), V.getValueType(), DemandedSrc);
- break;
- }
}
return SDValue();
}
@@ -2253,11 +2275,7 @@ bool SelectionDAG::SignBitIsZero(SDValue Op, unsigned Depth) const {
/// for bits that V cannot have.
bool SelectionDAG::MaskedValueIsZero(SDValue V, const APInt &Mask,
unsigned Depth) const {
- EVT VT = V.getValueType();
- APInt DemandedElts = VT.isVector()
- ? APInt::getAllOnesValue(VT.getVectorNumElements())
- : APInt(1, 1);
- return MaskedValueIsZero(V, Mask, DemandedElts, Depth);
+ return Mask.isSubsetOf(computeKnownBits(V, Depth).Zero);
}
/// MaskedValueIsZero - Return true if 'V & Mask' is known to be zero in
@@ -2276,15 +2294,42 @@ bool SelectionDAG::MaskedValueIsAllOnes(SDValue V, const APInt &Mask,
}
/// isSplatValue - Return true if the vector V has the same value
-/// across all DemandedElts.
+/// across all DemandedElts. For scalable vectors it does not make
+/// sense to specify which elements are demanded or undefined, therefore
+/// they are simply ignored.
bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
APInt &UndefElts) {
- if (!DemandedElts)
- return false; // No demanded elts, better to assume we don't know anything.
-
EVT VT = V.getValueType();
assert(VT.isVector() && "Vector type expected");
+ if (!VT.isScalableVector() && !DemandedElts)
+ return false; // No demanded elts, better to assume we don't know anything.
+
+ // Deal with some common cases here that work for both fixed and scalable
+ // vector types.
+ switch (V.getOpcode()) {
+ case ISD::SPLAT_VECTOR:
+ return true;
+ case ISD::ADD:
+ case ISD::SUB:
+ case ISD::AND: {
+ APInt UndefLHS, UndefRHS;
+ SDValue LHS = V.getOperand(0);
+ SDValue RHS = V.getOperand(1);
+ if (isSplatValue(LHS, DemandedElts, UndefLHS) &&
+ isSplatValue(RHS, DemandedElts, UndefRHS)) {
+ UndefElts = UndefLHS | UndefRHS;
+ return true;
+ }
+ break;
+ }
+ }
+
+ // We don't support other cases than those above for scalable vectors at
+ // the moment.
+ if (VT.isScalableVector())
+ return false;
+
unsigned NumElts = VT.getVectorNumElements();
assert(NumElts == DemandedElts.getBitWidth() && "Vector size mismatch");
UndefElts = APInt::getNullValue(NumElts);
@@ -2326,30 +2371,14 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
return true;
}
case ISD::EXTRACT_SUBVECTOR: {
+ // Offset the demanded elts by the subvector index.
SDValue Src = V.getOperand(0);
- ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(V.getOperand(1));
+ uint64_t Idx = V.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
- if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
- // Offset the demanded elts by the subvector index.
- uint64_t Idx = SubIdx->getZExtValue();
- APInt UndefSrcElts;
- APInt DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- if (isSplatValue(Src, DemandedSrc, UndefSrcElts)) {
- UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
- return true;
- }
- }
- break;
- }
- case ISD::ADD:
- case ISD::SUB:
- case ISD::AND: {
- APInt UndefLHS, UndefRHS;
- SDValue LHS = V.getOperand(0);
- SDValue RHS = V.getOperand(1);
- if (isSplatValue(LHS, DemandedElts, UndefLHS) &&
- isSplatValue(RHS, DemandedElts, UndefRHS)) {
- UndefElts = UndefLHS | UndefRHS;
+ APInt UndefSrcElts;
+ APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+ if (isSplatValue(Src, DemandedSrcElts, UndefSrcElts)) {
+ UndefElts = UndefSrcElts.extractBits(NumElts, Idx);
return true;
}
break;
@@ -2363,10 +2392,13 @@ bool SelectionDAG::isSplatValue(SDValue V, const APInt &DemandedElts,
bool SelectionDAG::isSplatValue(SDValue V, bool AllowUndefs) {
EVT VT = V.getValueType();
assert(VT.isVector() && "Vector type expected");
- unsigned NumElts = VT.getVectorNumElements();
APInt UndefElts;
- APInt DemandedElts = APInt::getAllOnesValue(NumElts);
+ APInt DemandedElts;
+
+ // For now we don't support this with scalable vectors.
+ if (!VT.isScalableVector())
+ DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
return isSplatValue(V, DemandedElts, UndefElts) &&
(AllowUndefs || !UndefElts);
}
@@ -2379,19 +2411,35 @@ SDValue SelectionDAG::getSplatSourceVector(SDValue V, int &SplatIdx) {
switch (Opcode) {
default: {
APInt UndefElts;
- APInt DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+ APInt DemandedElts;
+
+ if (!VT.isScalableVector())
+ DemandedElts = APInt::getAllOnesValue(VT.getVectorNumElements());
+
if (isSplatValue(V, DemandedElts, UndefElts)) {
- // Handle case where all demanded elements are UNDEF.
- if (DemandedElts.isSubsetOf(UndefElts)) {
+ if (VT.isScalableVector()) {
+ // DemandedElts and UndefElts are ignored for scalable vectors, since
+ // the only supported cases are SPLAT_VECTOR nodes.
SplatIdx = 0;
- return getUNDEF(VT);
+ } else {
+ // Handle case where all demanded elements are UNDEF.
+ if (DemandedElts.isSubsetOf(UndefElts)) {
+ SplatIdx = 0;
+ return getUNDEF(VT);
+ }
+ SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
}
- SplatIdx = (UndefElts & DemandedElts).countTrailingOnes();
return V;
}
break;
}
+ case ISD::SPLAT_VECTOR:
+ SplatIdx = 0;
+ return V;
case ISD::VECTOR_SHUFFLE: {
+ if (VT.isScalableVector())
+ return SDValue();
+
// Check if this is a shuffle node doing a splat.
// TODO - remove this and rely purely on SelectionDAG::isSplatValue,
// getTargetVShiftNode currently struggles without the splat source.
@@ -2413,14 +2461,16 @@ SDValue SelectionDAG::getSplatValue(SDValue V) {
if (SDValue SrcVector = getSplatSourceVector(V, SplatIdx))
return getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V),
SrcVector.getValueType().getScalarType(), SrcVector,
- getIntPtrConstant(SplatIdx, SDLoc(V)));
+ getVectorIdxConstant(SplatIdx, SDLoc(V)));
return SDValue();
}
-/// If a SHL/SRA/SRL node has a constant or splat constant shift amount that
-/// is less than the element bit-width of the shift node, return it.
-static const APInt *getValidShiftAmountConstant(SDValue V,
- const APInt &DemandedElts) {
+const APInt *
+SelectionDAG::getValidShiftAmountConstant(SDValue V,
+ const APInt &DemandedElts) const {
+ assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
+ V.getOpcode() == ISD::SRA) &&
+ "Unknown shift node");
unsigned BitWidth = V.getScalarValueSizeInBits();
if (ConstantSDNode *SA = isConstOrConstSplat(V.getOperand(1), DemandedElts)) {
// Shifting more than the bitwidth is not valid.
@@ -2431,10 +2481,13 @@ static const APInt *getValidShiftAmountConstant(SDValue V,
return nullptr;
}
-/// If a SHL/SRA/SRL node has constant vector shift amounts that are all less
-/// than the element bit-width of the shift node, return the minimum value.
-static const APInt *
-getValidMinimumShiftAmountConstant(SDValue V, const APInt &DemandedElts) {
+const APInt *SelectionDAG::getValidMinimumShiftAmountConstant(
+ SDValue V, const APInt &DemandedElts) const {
+ assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
+ V.getOpcode() == ISD::SRA) &&
+ "Unknown shift node");
+ if (const APInt *ValidAmt = getValidShiftAmountConstant(V, DemandedElts))
+ return ValidAmt;
unsigned BitWidth = V.getScalarValueSizeInBits();
auto *BV = dyn_cast<BuildVectorSDNode>(V.getOperand(1));
if (!BV)
@@ -2457,10 +2510,13 @@ getValidMinimumShiftAmountConstant(SDValue V, const APInt &DemandedElts) {
return MinShAmt;
}
-/// If a SHL/SRA/SRL node has constant vector shift amounts that are all less
-/// than the element bit-width of the shift node, return the maximum value.
-static const APInt *
-getValidMaximumShiftAmountConstant(SDValue V, const APInt &DemandedElts) {
+const APInt *SelectionDAG::getValidMaximumShiftAmountConstant(
+ SDValue V, const APInt &DemandedElts) const {
+ assert((V.getOpcode() == ISD::SHL || V.getOpcode() == ISD::SRL ||
+ V.getOpcode() == ISD::SRA) &&
+ "Unknown shift node");
+ if (const APInt *ValidAmt = getValidShiftAmountConstant(V, DemandedElts))
+ return ValidAmt;
unsigned BitWidth = V.getScalarValueSizeInBits();
auto *BV = dyn_cast<BuildVectorSDNode>(V.getOperand(1));
if (!BV)
@@ -2488,6 +2544,14 @@ getValidMaximumShiftAmountConstant(SDValue V, const APInt &DemandedElts) {
/// every vector element.
KnownBits SelectionDAG::computeKnownBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
+
+ // TODO: Until we have a plan for how to represent demanded elements for
+ // scalable vectors, we can just bail out for now.
+ if (Op.getValueType().isScalableVector()) {
+ unsigned BitWidth = Op.getScalarValueSizeInBits();
+ return KnownBits(BitWidth);
+ }
+
APInt DemandedElts = VT.isVector()
? APInt::getAllOnesValue(VT.getVectorNumElements())
: APInt(1, 1);
@@ -2503,6 +2567,11 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
KnownBits Known(BitWidth); // Don't know anything.
+ // TODO: Until we have a plan for how to represent demanded elements for
+ // scalable vectors, we can just bail out for now.
+ if (Op.getValueType().isScalableVector())
+ return Known;
+
if (auto *C = dyn_cast<ConstantSDNode>(Op)) {
// We know all of the bits for a constant!
Known.One = C->getAPIntValue();
@@ -2622,52 +2691,40 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
break;
}
case ISD::INSERT_SUBVECTOR: {
- // If we know the element index, demand any elements from the subvector and
- // the remainder from the src its inserted into, otherwise demand them all.
+ // Demand any elements from the subvector and the remainder from the src it's
+ // inserted into.
SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
- ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ uint64_t Idx = Op.getConstantOperandVal(2);
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
- if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
- Known.One.setAllBits();
- Known.Zero.setAllBits();
- uint64_t Idx = SubIdx->getZExtValue();
- APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
- if (!!DemandedSubElts) {
- Known = computeKnownBits(Sub, DemandedSubElts, Depth + 1);
- if (Known.isUnknown())
- break; // early-out.
- }
- APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
- APInt DemandedSrcElts = DemandedElts & ~SubMask;
- if (!!DemandedSrcElts) {
- Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
- }
- } else {
- Known = computeKnownBits(Sub, Depth + 1);
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ if (!!DemandedSubElts) {
+ Known = computeKnownBits(Sub, DemandedSubElts, Depth + 1);
if (Known.isUnknown())
break; // early-out.
- Known2 = computeKnownBits(Src, Depth + 1);
+ }
+ if (!!DemandedSrcElts) {
+ Known2 = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
Known.One &= Known2.One;
Known.Zero &= Known2.Zero;
}
break;
}
case ISD::EXTRACT_SUBVECTOR: {
- // If we know the element index, just demand that subvector elements,
- // otherwise demand them all.
+ // Offset the demanded elts by the subvector index.
SDValue Src = Op.getOperand(0);
- ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ // Bail until we can represent demanded elements for scalable vectors.
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
- APInt DemandedSrc = APInt::getAllOnesValue(NumSrcElts);
- if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
- // Offset the demanded elts by the subvector index.
- uint64_t Idx = SubIdx->getZExtValue();
- DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- }
- Known = computeKnownBits(Src, DemandedSrc, Depth + 1);
+ APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+ Known = computeKnownBits(Src, DemandedSrcElts, Depth + 1);
break;
}
case ISD::SCALAR_TO_VECTOR: {
@@ -2753,35 +2810,23 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
break;
}
case ISD::AND:
- // If either the LHS or the RHS are Zero, the result is zero.
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Output known-1 bits are only known if set in both the LHS & RHS.
- Known.One &= Known2.One;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- Known.Zero |= Known2.Zero;
+ Known &= Known2;
break;
case ISD::OR:
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
- case ISD::XOR: {
+ case ISD::XOR:
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt KnownZeroOut = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- Known.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
- Known.Zero = KnownZeroOut;
+ Known ^= Known2;
break;
- }
case ISD::MUL: {
Known = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
@@ -3075,12 +3120,12 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
EVT InVT = Op.getOperand(0).getValueType();
APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
- Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
+ Known = Known.zext(BitWidth);
break;
}
case ISD::ZERO_EXTEND: {
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
+ Known = Known.zext(BitWidth);
break;
}
case ISD::SIGN_EXTEND_VECTOR_INREG: {
@@ -3099,9 +3144,16 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known = Known.sext(BitWidth);
break;
}
+ case ISD::ANY_EXTEND_VECTOR_INREG: {
+ EVT InVT = Op.getOperand(0).getValueType();
+ APInt InDemandedElts = DemandedElts.zextOrSelf(InVT.getVectorNumElements());
+ Known = computeKnownBits(Op.getOperand(0), InDemandedElts, Depth + 1);
+ Known = Known.anyext(BitWidth);
+ break;
+ }
case ISD::ANY_EXTEND: {
- Known = computeKnownBits(Op.getOperand(0), Depth+1);
- Known = Known.zext(BitWidth, false /* ExtendedBitsAreKnownZero */);
+ Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ Known = Known.anyext(BitWidth);
break;
}
case ISD::TRUNCATE: {
@@ -3117,6 +3169,15 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
Known.One &= (~Known.Zero);
break;
}
+ case ISD::AssertAlign: {
+ unsigned LogOfAlign = Log2(cast<AssertAlignSDNode>(Op)->getAlign());
+ assert(LogOfAlign != 0);
+ // If a node is guaranteed to be aligned, set low zero bits accordingly as
+ // well as clearing one bits.
+ Known.Zero.setLowBits(LogOfAlign);
+ Known.One.clearLowBits(LogOfAlign);
+ break;
+ }
case ISD::FGETSIGN:
// All bits are zero except the low bit.
Known.Zero.setBitsFrom(1);
@@ -3134,6 +3195,9 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
LLVM_FALLTHROUGH;
case ISD::SUB:
case ISD::SUBC: {
+ assert(Op.getResNo() == 0 &&
+ "We only compute knownbits for the difference here.");
+
Known = computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
Known2 = computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known = KnownBits::computeForAddSub(/* Add */ false, /* NSW */ false,
@@ -3245,57 +3309,51 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
EVT VecVT = InVec.getValueType();
const unsigned EltBitWidth = VecVT.getScalarSizeInBits();
const unsigned NumSrcElts = VecVT.getVectorNumElements();
+
// If BitWidth > EltBitWidth the value is anyext:ed. So we do not know
// anything about the extended bits.
if (BitWidth > EltBitWidth)
Known = Known.trunc(EltBitWidth);
- ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
- if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts)) {
- // If we know the element index, just demand that vector element.
- unsigned Idx = ConstEltNo->getZExtValue();
- APInt DemandedElt = APInt::getOneBitSet(NumSrcElts, Idx);
- Known = computeKnownBits(InVec, DemandedElt, Depth + 1);
- } else {
- // Unknown element index, so ignore DemandedElts and demand them all.
- Known = computeKnownBits(InVec, Depth + 1);
- }
+
+ // If we know the element index, just demand that vector element, else for
+ // an unknown element index, ignore DemandedElts and demand them all.
+ APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
+ auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
+ DemandedSrcElts =
+ APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
+
+ Known = computeKnownBits(InVec, DemandedSrcElts, Depth + 1);
if (BitWidth > EltBitWidth)
- Known = Known.zext(BitWidth, false /* => any extend */);
+ Known = Known.anyext(BitWidth);
break;
}
case ISD::INSERT_VECTOR_ELT: {
+ // If we know the element index, split the demand between the
+ // source vector and the inserted element, otherwise assume we need
+ // the original demanded vector elements and the value.
SDValue InVec = Op.getOperand(0);
SDValue InVal = Op.getOperand(1);
SDValue EltNo = Op.getOperand(2);
-
- ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ bool DemandedVal = true;
+ APInt DemandedVecElts = DemandedElts;
+ auto *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
- // If we know the element index, split the demand between the
- // source vector and the inserted element.
- Known.Zero = Known.One = APInt::getAllOnesValue(BitWidth);
unsigned EltIdx = CEltNo->getZExtValue();
-
- // If we demand the inserted element then add its common known bits.
- if (DemandedElts[EltIdx]) {
- Known2 = computeKnownBits(InVal, Depth + 1);
- Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
- Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
- }
-
- // If we demand the source vector then add its common known bits, ensuring
- // that we don't demand the inserted element.
- APInt VectorElts = DemandedElts & ~(APInt::getOneBitSet(NumElts, EltIdx));
- if (!!VectorElts) {
- Known2 = computeKnownBits(InVec, VectorElts, Depth + 1);
- Known.One &= Known2.One;
- Known.Zero &= Known2.Zero;
- }
- } else {
- // Unknown element index, so ignore DemandedElts and demand them all.
- Known = computeKnownBits(InVec, Depth + 1);
+ DemandedVal = !!DemandedElts[EltIdx];
+ DemandedVecElts.clearBit(EltIdx);
+ }
+ Known.One.setAllBits();
+ Known.Zero.setAllBits();
+ if (DemandedVal) {
Known2 = computeKnownBits(InVal, Depth + 1);
- Known.One &= Known2.One.zextOrTrunc(Known.One.getBitWidth());
- Known.Zero &= Known2.Zero.zextOrTrunc(Known.Zero.getBitWidth());
+ Known.One &= Known2.One.zextOrTrunc(BitWidth);
+ Known.Zero &= Known2.Zero.zextOrTrunc(BitWidth);
+ }
+ if (!!DemandedVecElts) {
+ Known2 = computeKnownBits(InVec, DemandedVecElts, Depth + 1);
+ Known.One &= Known2.One;
+ Known.Zero &= Known2.Zero;
}
break;
}
@@ -3399,7 +3457,8 @@ KnownBits SelectionDAG::computeKnownBits(SDValue Op, const APInt &DemandedElts,
}
case ISD::FrameIndex:
case ISD::TargetFrameIndex:
- TLI->computeKnownBitsForFrameIndex(Op, Known, DemandedElts, *this, Depth);
+ TLI->computeKnownBitsForFrameIndex(cast<FrameIndexSDNode>(Op)->getIndex(),
+ Known, getMachineFunction());
break;
default:
@@ -3492,6 +3551,11 @@ bool SelectionDAG::isKnownToBeAPowerOfTwo(SDValue Val) const {
unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, unsigned Depth) const {
EVT VT = Op.getValueType();
+
+ // TODO: Assume we don't know anything for now.
+ if (VT.isScalableVector())
+ return 1;
+
APInt DemandedElts = VT.isVector()
? APInt::getAllOnesValue(VT.getVectorNumElements())
: APInt(1, 1);
@@ -3515,7 +3579,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
if (Depth >= MaxRecursionDepth)
return 1; // Limit search depth.
- if (!DemandedElts)
+ if (!DemandedElts || VT.isScalableVector())
return 1; // No demanded elts, better to assume we don't know anything.
unsigned Opcode = Op.getOpcode();
@@ -3535,7 +3599,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
continue;
SDValue SrcOp = Op.getOperand(i);
- Tmp2 = ComputeNumSignBits(Op.getOperand(i), Depth + 1);
+ Tmp2 = ComputeNumSignBits(SrcOp, Depth + 1);
// BUILD_VECTOR can implicitly truncate sources, we must handle this.
if (SrcOp.getValueSizeInBits() != VTBits) {
@@ -3646,23 +3710,17 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::SRA:
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
// SRA X, C -> adds C sign bits.
- if (const APInt *ShAmt = getValidShiftAmountConstant(Op, DemandedElts))
- Tmp = std::min<uint64_t>(Tmp + ShAmt->getZExtValue(), VTBits);
- else if (const APInt *ShAmt =
- getValidMinimumShiftAmountConstant(Op, DemandedElts))
+ if (const APInt *ShAmt =
+ getValidMinimumShiftAmountConstant(Op, DemandedElts))
Tmp = std::min<uint64_t>(Tmp + ShAmt->getZExtValue(), VTBits);
return Tmp;
case ISD::SHL:
- if (const APInt *ShAmt = getValidShiftAmountConstant(Op, DemandedElts)) {
+ if (const APInt *ShAmt =
+ getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
// shl destroys sign bits, ensure it doesn't shift out all sign bits.
Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (ShAmt->ult(Tmp))
return Tmp - ShAmt->getZExtValue();
- } else if (const APInt *ShAmt =
- getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
- Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
- if (ShAmt->ult(Tmp))
- return Tmp - ShAmt->getZExtValue();
}
break;
case ISD::AND:
@@ -3712,18 +3770,18 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
// Fallback - just get the minimum number of sign bits of the operands.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp == 1)
return 1; // Early out.
- Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp, Tmp2);
}
case ISD::UMIN:
case ISD::UMAX:
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
if (Tmp == 1)
return 1; // Early out.
- Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth + 1);
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
return std::min(Tmp, Tmp2);
case ISD::SADDO:
case ISD::UADDO:
@@ -3753,7 +3811,14 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
}
case ISD::ROTL:
case ISD::ROTR:
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+
+ // If we're rotating a 0/-1 value, then it stays a 0/-1 value.
+ if (Tmp == VTBits)
+ return VTBits;
+
+ if (ConstantSDNode *C =
+ isConstOrConstSplat(Op.getOperand(1), DemandedElts)) {
unsigned RotAmt = C->getAPIntValue().urem(VTBits);
// Handle rotate right by N like a rotate left by 32-N.
@@ -3762,7 +3827,6 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// If we aren't rotating out all of the known-in sign bits, return the
// number that are left. This handles rotl(sext(x), 1) for example.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
if (Tmp > (RotAmt + 1)) return (Tmp - RotAmt);
}
break;
@@ -3770,13 +3834,15 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
case ISD::ADDC:
// Add can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
- if (Tmp == 1) return 1; // Early out.
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1) return 1; // Early out.
// Special case decrementing a value (ADD X, -1):
- if (ConstantSDNode *CRHS = dyn_cast<ConstantSDNode>(Op.getOperand(1)))
+ if (ConstantSDNode *CRHS =
+ isConstOrConstSplat(Op.getOperand(1), DemandedElts))
if (CRHS->isAllOnesValue()) {
- KnownBits Known = computeKnownBits(Op.getOperand(0), Depth+1);
+ KnownBits Known =
+ computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
@@ -3789,18 +3855,19 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return Tmp;
}
- Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
- if (Tmp2 == 1) return 1;
- return std::min(Tmp, Tmp2)-1;
-
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (Tmp2 == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2) - 1;
case ISD::SUB:
- Tmp2 = ComputeNumSignBits(Op.getOperand(1), Depth+1);
- if (Tmp2 == 1) return 1;
+ Tmp2 = ComputeNumSignBits(Op.getOperand(1), DemandedElts, Depth + 1);
+ if (Tmp2 == 1) return 1; // Early out.
// Handle NEG.
- if (ConstantSDNode *CLHS = isConstOrConstSplat(Op.getOperand(0)))
+ if (ConstantSDNode *CLHS =
+ isConstOrConstSplat(Op.getOperand(0), DemandedElts))
if (CLHS->isNullValue()) {
- KnownBits Known = computeKnownBits(Op.getOperand(1), Depth+1);
+ KnownBits Known =
+ computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
// If the input is known to be 0 or 1, the output is 0/-1, which is all
// sign bits set.
if ((Known.Zero | 1).isAllOnesValue())
@@ -3816,9 +3883,9 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// Sub can have at most one carry bit. Thus we know that the output
// is, at worst, one more bit than the inputs.
- Tmp = ComputeNumSignBits(Op.getOperand(0), Depth+1);
- if (Tmp == 1) return 1; // Early out.
- return std::min(Tmp, Tmp2)-1;
+ Tmp = ComputeNumSignBits(Op.getOperand(0), DemandedElts, Depth + 1);
+ if (Tmp == 1) return 1; // Early out.
+ return std::min(Tmp, Tmp2) - 1;
case ISD::MUL: {
// The output of the Mul can be at most twice the valid bits in the inputs.
unsigned SignBitsOp0 = ComputeNumSignBits(Op.getOperand(0), Depth + 1);
@@ -3853,39 +3920,32 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return std::max(std::min(KnownSign - rIndex * BitWidth, BitWidth), 0);
}
case ISD::INSERT_VECTOR_ELT: {
+ // If we know the element index, split the demand between the
+ // source vector and the inserted element; otherwise assume we need
+ // the original demanded vector elements and the value.
SDValue InVec = Op.getOperand(0);
SDValue InVal = Op.getOperand(1);
SDValue EltNo = Op.getOperand(2);
-
- ConstantSDNode *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ bool DemandedVal = true;
+ APInt DemandedVecElts = DemandedElts;
+ auto *CEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (CEltNo && CEltNo->getAPIntValue().ult(NumElts)) {
- // If we know the element index, split the demand between the
- // source vector and the inserted element.
unsigned EltIdx = CEltNo->getZExtValue();
-
- // If we demand the inserted element then get its sign bits.
- Tmp = std::numeric_limits<unsigned>::max();
- if (DemandedElts[EltIdx]) {
- // TODO - handle implicit truncation of inserted elements.
- if (InVal.getScalarValueSizeInBits() != VTBits)
- break;
- Tmp = ComputeNumSignBits(InVal, Depth + 1);
- }
-
- // If we demand the source vector then get its sign bits, and determine
- // the minimum.
- APInt VectorElts = DemandedElts;
- VectorElts.clearBit(EltIdx);
- if (!!VectorElts) {
- Tmp2 = ComputeNumSignBits(InVec, VectorElts, Depth + 1);
- Tmp = std::min(Tmp, Tmp2);
- }
- } else {
- // Unknown element index, so ignore DemandedElts and demand them all.
- Tmp = ComputeNumSignBits(InVec, Depth + 1);
+ DemandedVal = !!DemandedElts[EltIdx];
+ DemandedVecElts.clearBit(EltIdx);
+ }
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (DemandedVal) {
+ // TODO - handle implicit truncation of inserted elements.
+ if (InVal.getScalarValueSizeInBits() != VTBits)
+ break;
Tmp2 = ComputeNumSignBits(InVal, Depth + 1);
Tmp = std::min(Tmp, Tmp2);
}
+ if (!!DemandedVecElts) {
+ Tmp2 = ComputeNumSignBits(InVec, DemandedVecElts, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
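The demand splitting above can be checked in isolation. A minimal standalone sketch, using plain uint64_t masks in place of APInt; the element counts and masks are invented for illustration:

#include <cassert>
#include <cstdint>

// Split a demanded-elements mask across insert_vector_elt operands: when the
// insert index is known, the inserted value is demanded iff its lane is set,
// and that lane is removed from the mask used to query the source vector.
static void splitInsertDemand(uint64_t DemandedElts, unsigned EltIdx,
                              bool &DemandedVal, uint64_t &DemandedVecElts) {
  DemandedVal = ((DemandedElts >> EltIdx) & 1) != 0;
  DemandedVecElts = DemandedElts & ~(uint64_t(1) << EltIdx);
}

int main() {
  bool DemandedVal;
  uint64_t DemandedVecElts;
  // Demanding lanes {0,2,3} of a v4 insert at index 2: the inserted value is
  // demanded, and the source vector is queried for lanes {0,3} only.
  splitInsertDemand(0xD, /*EltIdx=*/2, DemandedVal, DemandedVecElts);
  assert(DemandedVal && DemandedVecElts == 0x9);
  return 0;
}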
@@ -3906,7 +3966,7 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
// If we know the element index, just demand that vector element, else for
// an unknown element index, ignore DemandedElts and demand them all.
APInt DemandedSrcElts = APInt::getAllOnesValue(NumSrcElts);
- ConstantSDNode *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
+ auto *ConstEltNo = dyn_cast<ConstantSDNode>(EltNo);
if (ConstEltNo && ConstEltNo->getAPIntValue().ult(NumSrcElts))
DemandedSrcElts =
APInt::getOneBitSet(NumSrcElts, ConstEltNo->getZExtValue());
@@ -3914,18 +3974,15 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return ComputeNumSignBits(InVec, DemandedSrcElts, Depth + 1);
}
case ISD::EXTRACT_SUBVECTOR: {
- // If we know the element index, just demand that subvector elements,
- // otherwise demand them all.
+ // Offset the demanded elts by the subvector index.
SDValue Src = Op.getOperand(0);
- ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ // Bail until we can represent demanded elements for scalable vectors.
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
- APInt DemandedSrc = APInt::getAllOnesValue(NumSrcElts);
- if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
- // Offset the demanded elts by the subvector index.
- uint64_t Idx = SubIdx->getZExtValue();
- DemandedSrc = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- }
- return ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
+ APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+ return ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
}
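The same mask manipulation for the extract_subvector case, again sketched with plain integers instead of APInt; the widths are illustrative:

#include <cassert>
#include <cstdint>

// Map the demanded lanes of an extract_subvector onto the source vector by
// shifting the mask up by the (constant) subvector start index.
static uint64_t offsetDemandedElts(uint64_t DemandedElts, unsigned Idx) {
  return DemandedElts << Idx;
}

int main() {
  // Extracting a v4 at index 4 of a v8 source: demanding lanes {0,1} of the
  // extract corresponds to demanding lanes {4,5} of the source.
  assert(offsetDemandedElts(0x3, /*Idx=*/4) == 0x30);
  return 0;
}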
case ISD::CONCAT_VECTORS: {
// Determine the minimum number of sign bits across all demanded
@@ -3946,35 +4003,26 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return Tmp;
}
case ISD::INSERT_SUBVECTOR: {
- // If we know the element index, demand any elements from the subvector and
- // the remainder from the src its inserted into, otherwise demand them all.
+ // Demand any elements from the subvector and the remainder from the src it is
+ // inserted into.
SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
- auto *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(2));
+ uint64_t Idx = Op.getConstantOperandVal(2);
unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
- if (SubIdx && SubIdx->getAPIntValue().ule(NumElts - NumSubElts)) {
- Tmp = std::numeric_limits<unsigned>::max();
- uint64_t Idx = SubIdx->getZExtValue();
- APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
- if (!!DemandedSubElts) {
- Tmp = ComputeNumSignBits(Sub, DemandedSubElts, Depth + 1);
- if (Tmp == 1) return 1; // early-out
- }
- APInt SubMask = APInt::getBitsSet(NumElts, Idx, Idx + NumSubElts);
- APInt DemandedSrcElts = DemandedElts & ~SubMask;
- if (!!DemandedSrcElts) {
- Tmp2 = ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
- Tmp = std::min(Tmp, Tmp2);
- }
- assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
- return Tmp;
- }
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
- // Not able to determine the index so just assume worst case.
- Tmp = ComputeNumSignBits(Sub, Depth + 1);
- if (Tmp == 1) return 1; // early-out
- Tmp2 = ComputeNumSignBits(Src, Depth + 1);
- Tmp = std::min(Tmp, Tmp2);
+ Tmp = std::numeric_limits<unsigned>::max();
+ if (!!DemandedSubElts) {
+ Tmp = ComputeNumSignBits(Sub, DemandedSubElts, Depth + 1);
+ if (Tmp == 1)
+ return 1; // early-out
+ }
+ if (!!DemandedSrcElts) {
+ Tmp2 = ComputeNumSignBits(Src, DemandedSrcElts, Depth + 1);
+ Tmp = std::min(Tmp, Tmp2);
+ }
assert(Tmp <= VTBits && "Failed to determine minimum sign bits");
return Tmp;
}
@@ -4052,13 +4100,10 @@ unsigned SelectionDAG::ComputeNumSignBits(SDValue Op, const APInt &DemandedElts,
return FirstAnswer;
}
- // Okay, we know that the sign bit in Mask is set. Use CLZ to determine
+ // Okay, we know that the sign bit in Mask is set. Use CLO to determine
// the number of identical bits in the top of the input value.
- Mask = ~Mask;
Mask <<= Mask.getBitWidth()-VTBits;
- // Return # leading zeros. We use 'min' here in case Val was zero before
- // shifting. We don't want to return '64' as for an i32 "0".
- return std::max(FirstAnswer, std::min(VTBits, Mask.countLeadingZeros()));
+ return std::max(FirstAnswer, Mask.countLeadingOnes());
}
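A standalone check of the new counting scheme: once the sign bit of the known-ones mask is set, shifting the mask so the value's top bit reaches the top of the word and counting leading ones gives the number of identical top bits. Plain uint32_t stands in for APInt and the constants are illustrative:

#include <cassert>
#include <cstdint>

// Count the identical bits at the top of a VTBits-wide value whose known-one
// bits are Mask (the sign bit of Mask is assumed to be set).
static unsigned signBitCopies(uint32_t Mask, unsigned VTBits) {
  Mask <<= 32 - VTBits; // align the value's sign bit with bit 31
  unsigned Count = 0;   // countLeadingOnes
  while (Count < 32 && (Mask & 0x80000000u)) {
    ++Count;
    Mask <<= 1;
  }
  return Count;
}

int main() {
  // A 16-bit value whose top byte is known to be 0xFC has (at least) six
  // copies of the sign bit.
  assert(signBitCopies(0xFC00, /*VTBits=*/16) == 6);
  return 0;
}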
bool SelectionDAG::isBaseWithConstantOffset(SDValue Op) const {
@@ -4109,6 +4154,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const
case ISD::FFLOOR:
case ISD::FCEIL:
case ISD::FROUND:
+ case ISD::FROUNDEVEN:
case ISD::FRINT:
case ISD::FNEARBYINT: {
if (SNaN)
@@ -4249,6 +4295,8 @@ static SDValue FoldBUILD_VECTOR(const SDLoc &DL, EVT VT,
SelectionDAG &DAG) {
int NumOps = Ops.size();
assert(NumOps != 0 && "Can't build an empty vector!");
+ assert(!VT.isScalableVector() &&
+ "BUILD_VECTOR cannot be used with scalable types");
assert(VT.getVectorNumElements() == (unsigned)NumOps &&
"Incorrect element count in BUILD_VECTOR!");
@@ -4287,8 +4335,8 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
return Ops[0].getValueType() == Op.getValueType();
}) &&
"Concatenation of vectors with inconsistent value types!");
- assert((Ops.size() * Ops[0].getValueType().getVectorNumElements()) ==
- VT.getVectorNumElements() &&
+ assert((Ops[0].getValueType().getVectorElementCount() * Ops.size()) ==
+ VT.getVectorElementCount() &&
"Incorrect element count in vector concatenation!");
if (Ops.size() == 1)
@@ -4305,11 +4353,10 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
bool IsIdentity = true;
for (unsigned i = 0, e = Ops.size(); i != e; ++i) {
SDValue Op = Ops[i];
- unsigned IdentityIndex = i * Op.getValueType().getVectorNumElements();
+ unsigned IdentityIndex = i * Op.getValueType().getVectorMinNumElements();
if (Op.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
Op.getOperand(0).getValueType() != VT ||
(IdentitySrc && Op.getOperand(0) != IdentitySrc) ||
- !isa<ConstantSDNode>(Op.getOperand(1)) ||
Op.getConstantOperandVal(1) != IdentityIndex) {
IsIdentity = false;
break;
@@ -4323,6 +4370,11 @@ static SDValue foldCONCAT_VECTORS(const SDLoc &DL, EVT VT,
return IdentitySrc;
}
+ // The code below this point is only designed to work for fixed width
+ // vectors, so we bail out for now.
+ if (VT.isScalableVector())
+ return SDValue();
+
// A CONCAT_VECTOR with all UNDEF/BUILD_VECTOR operands can be
// simplified to one big BUILD_VECTOR.
// FIXME: Add support for SCALAR_TO_VECTOR as well.
@@ -4508,7 +4560,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// FIXME need to be more flexible about rounding mode.
(void)V.convert(APFloat::IEEEhalf(),
APFloat::rmNearestTiesToEven, &Ignored);
- return getConstant(V.bitcastToAPInt(), DL, VT);
+ return getConstant(V.bitcastToAPInt().getZExtValue(), DL, VT);
}
}
}
@@ -4553,6 +4605,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
unsigned OpOpcode = Operand.getNode()->getOpcode();
switch (Opcode) {
+ case ISD::FREEZE:
+ assert(VT == Operand.getValueType() && "Unexpected VT!");
+ break;
case ISD::TokenFactor:
case ISD::MERGE_VALUES:
case ISD::CONCAT_VECTORS:
@@ -4597,8 +4652,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
- VT.getVectorNumElements() ==
- Operand.getValueType().getVectorNumElements()) &&
+ VT.getVectorElementCount() ==
+ Operand.getValueType().getVectorElementCount()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid sext node, dst < src!");
@@ -4616,8 +4671,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
- VT.getVectorNumElements() ==
- Operand.getValueType().getVectorNumElements()) &&
+ VT.getVectorElementCount() ==
+ Operand.getValueType().getVectorElementCount()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid zext node, dst < src!");
@@ -4635,8 +4690,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop extension
assert((!VT.isVector() ||
- VT.getVectorNumElements() ==
- Operand.getValueType().getVectorNumElements()) &&
+ VT.getVectorElementCount() ==
+ Operand.getValueType().getVectorElementCount()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsLT(VT) &&
"Invalid anyext node, dst < src!");
@@ -4665,8 +4720,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"type is vector!");
if (Operand.getValueType() == VT) return Operand; // noop truncate
assert((!VT.isVector() ||
- VT.getVectorNumElements() ==
- Operand.getValueType().getVectorNumElements()) &&
+ VT.getVectorElementCount() ==
+ Operand.getValueType().getVectorElementCount()) &&
"Vector element count mismatch!");
assert(Operand.getValueType().bitsGT(VT) &&
"Invalid truncate node, src < dst!");
@@ -4753,6 +4808,9 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (OpOpcode == ISD::FNEG) // abs(-X) -> abs(X)
return getNode(ISD::FABS, DL, VT, Operand.getOperand(0));
break;
+ case ISD::VSCALE:
+ assert(VT == Operand.getValueType() && "Unexpected VT!");
+ break;
}
SDNode *N;
@@ -4824,17 +4882,6 @@ static llvm::Optional<APInt> FoldValue(unsigned Opcode, const APInt &C1,
return llvm::None;
}
-SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
- EVT VT, const ConstantSDNode *C1,
- const ConstantSDNode *C2) {
- if (C1->isOpaque() || C2->isOpaque())
- return SDValue();
- if (Optional<APInt> Folded =
- FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue()))
- return getConstant(Folded.getValue(), DL, VT);
- return SDValue();
-}
-
SDValue SelectionDAG::FoldSymbolOffset(unsigned Opcode, EVT VT,
const GlobalAddressSDNode *GA,
const SDNode *N2) {
@@ -4881,20 +4928,37 @@ bool SelectionDAG::isUndef(unsigned Opcode, ArrayRef<SDValue> Ops) {
}
SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
- EVT VT, SDNode *N1, SDNode *N2) {
+ EVT VT, ArrayRef<SDValue> Ops) {
// If the opcode is a target-specific ISD node, there's nothing we can
// do here and the operand rules may not line up with the below, so
// bail early.
if (Opcode >= ISD::BUILTIN_OP_END)
return SDValue();
- if (isUndef(Opcode, {SDValue(N1, 0), SDValue(N2, 0)}))
+ // For now, the array Ops should only contain two values.
+ // This enforcement will be removed once this function is merged with
+ // FoldConstantVectorArithmetic.
+ if (Ops.size() != 2)
+ return SDValue();
+
+ if (isUndef(Opcode, Ops))
return getUNDEF(VT);
+ SDNode *N1 = Ops[0].getNode();
+ SDNode *N2 = Ops[1].getNode();
+
// Handle the case of two scalars.
if (auto *C1 = dyn_cast<ConstantSDNode>(N1)) {
if (auto *C2 = dyn_cast<ConstantSDNode>(N2)) {
- SDValue Folded = FoldConstantArithmetic(Opcode, DL, VT, C1, C2);
+ if (C1->isOpaque() || C2->isOpaque())
+ return SDValue();
+
+ Optional<APInt> FoldAttempt =
+ FoldValue(Opcode, C1->getAPIntValue(), C2->getAPIntValue());
+ if (!FoldAttempt)
+ return SDValue();
+
+ SDValue Folded = getConstant(FoldAttempt.getValue(), DL, VT);
assert((!Folded || !VT.isVector()) &&
"Can't fold vectors ops with scalar operands");
return Folded;
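A sketch of how a call site adapts to the ArrayRef-based overload; the helper name is hypothetical, and DAG, Opcode, DL, VT, N1 and N2 are assumed to be in scope as they are in getNode() further down:

// Hypothetical call site: the two operands are now passed as SDValues in an
// initializer list instead of as a pair of ConstantSDNode pointers.
static SDValue tryConstantFold(SelectionDAG &DAG, unsigned Opcode,
                               const SDLoc &DL, EVT VT, SDValue N1,
                               SDValue N2) {
  if (SDValue Folded = DAG.FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
    return Folded;
  return SDValue(); // nothing folded; the caller builds the node as usual
}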
@@ -4908,8 +4972,14 @@ SDValue SelectionDAG::FoldConstantArithmetic(unsigned Opcode, const SDLoc &DL,
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(N2))
return FoldSymbolOffset(Opcode, VT, GA, N1);
- // For vectors, extract each constant element and fold them individually.
- // Either input may be an undef value.
+ // TODO: All the folds below are performed lane-by-lane and assume a fixed
+ // vector width; however, we should be able to do constant folds involving
+ // splat vector nodes too.
+ if (VT.isScalableVector())
+ return SDValue();
+
+ // For fixed width vectors, extract each constant element and fold them
+ // individually. Either input may be an undef value.
auto *BV1 = dyn_cast<BuildVectorSDNode>(N1);
if (!BV1 && !N1->isUndef())
return SDValue();
@@ -4985,6 +5055,13 @@ SDValue SelectionDAG::FoldConstantVectorArithmetic(unsigned Opcode,
if (!VT.isVector())
return SDValue();
+ // TODO: All the folds below are performed lane-by-lane and assume a fixed
+ // vector width; however, we should be able to do constant folds involving
+ // splat vector nodes too.
+ if (VT.isScalableVector())
+ return SDValue();
+
+ // From this point onwards all vectors are assumed to be fixed width.
unsigned NumElts = VT.getVectorNumElements();
auto IsScalarOrSameVectorSize = [&](const SDValue &Op) {
@@ -5107,8 +5184,13 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
}
switch (Opcode) {
- case ISD::FADD:
case ISD::FSUB:
+ // -0.0 - undef --> undef (consistent with "fneg undef")
+ if (N1CFP && N1CFP->getValueAPF().isNegZero() && N2.isUndef())
+ return getUNDEF(VT);
+ LLVM_FALLTHROUGH;
+
+ case ISD::FADD:
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM:
@@ -5122,6 +5204,34 @@ SDValue SelectionDAG::foldConstantFPMath(unsigned Opcode, const SDLoc &DL,
return SDValue();
}
+SDValue SelectionDAG::getAssertAlign(const SDLoc &DL, SDValue Val, Align A) {
+ assert(Val.getValueType().isInteger() && "Invalid AssertAlign!");
+
+ // There's no need to assert on a byte-aligned pointer. All pointers are at
+ // least byte aligned.
+ if (A == Align(1))
+ return Val;
+
+ FoldingSetNodeID ID;
+ AddNodeIDNode(ID, ISD::AssertAlign, getVTList(Val.getValueType()), {Val});
+ ID.AddInteger(A.value());
+
+ void *IP = nullptr;
+ if (SDNode *E = FindNodeOrInsertPos(ID, DL, IP))
+ return SDValue(E, 0);
+
+ auto *N = newSDNode<AssertAlignSDNode>(DL.getIROrder(), DL.getDebugLoc(),
+ Val.getValueType(), A);
+ createOperands(N, {Val});
+
+ CSEMap.InsertNode(N, IP);
+ InsertNode(N);
+
+ SDValue V(N, 0);
+ NewSDValueDbgMsg(V, "Creating new node: ", this);
+ return V;
+}
+
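A short usage sketch for the new builder; the wrapper name and the Align(16) value are invented, and DAG, DL and PtrVal are assumed to be an existing SelectionDAG, debug location and integer-typed pointer value:

// Hypothetical helper: attach an alignment assumption to an integer pointer
// value. As above, a byte-aligned assertion simply returns the input.
static SDValue annotateAlignment(SelectionDAG &DAG, const SDLoc &DL,
                                 SDValue PtrVal) {
  assert(PtrVal.getValueType().isInteger() && "AssertAlign expects integers");
  return DAG.getAssertAlign(DL, PtrVal, Align(16));
}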
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
SDValue N1, SDValue N2, const SDNodeFlags Flags) {
ConstantSDNode *N1C = dyn_cast<ConstantSDNode>(N1);
@@ -5186,11 +5296,20 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (N2C && N2C->isNullValue())
return N1;
break;
+ case ISD::MUL:
+ assert(VT.isInteger() && "This operator does not apply to FP types!");
+ assert(N1.getValueType() == N2.getValueType() &&
+ N1.getValueType() == VT && "Binary operator types must match!");
+ if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) {
+ APInt MulImm = cast<ConstantSDNode>(N1->getOperand(0))->getAPIntValue();
+ APInt N2CImm = N2C->getAPIntValue();
+ return getVScale(DL, VT, MulImm * N2CImm);
+ }
+ break;
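The immediate arithmetic behind the vscale folds (this one and the analogous shl fold later in this function) can be checked standalone; int64_t stands in for APInt and the constants are illustrative:

#include <cassert>
#include <cstdint>

// mul nsw (vscale * MulImm), C  -->  vscale * (MulImm * C)
static int64_t foldVScaleMul(int64_t MulImm, int64_t C) { return MulImm * C; }

// shl nsw (vscale * MulImm), S  -->  vscale * (MulImm << S)
static int64_t foldVScaleShl(int64_t MulImm, unsigned S) { return MulImm << S; }

int main() {
  // (vscale * 4) * 3  ==>  vscale * 12, for any runtime vscale value.
  assert(foldVScaleMul(4, 3) == 12);
  // (vscale * 4) << 1 ==>  vscale * 8.
  assert(foldVScaleShl(4, 1) == 8);
  return 0;
}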
case ISD::UDIV:
case ISD::UREM:
case ISD::MULHU:
case ISD::MULHS:
- case ISD::MUL:
case ISD::SDIV:
case ISD::SREM:
case ISD::SMIN:
@@ -5213,7 +5332,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
assert(VT.isFloatingPoint() && "This operator only applies to FP types!");
assert(N1.getValueType() == N2.getValueType() &&
N1.getValueType() == VT && "Binary operator types must match!");
- if (SDValue V = simplifyFPBinop(Opcode, N1, N2))
+ if (SDValue V = simplifyFPBinop(Opcode, N1, N2, Flags))
return V;
break;
case ISD::FCOPYSIGN: // N1 and result must match. N1/N2 need not match.
@@ -5223,6 +5342,12 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"Invalid FCOPYSIGN!");
break;
case ISD::SHL:
+ if (N2C && (N1.getOpcode() == ISD::VSCALE) && Flags.hasNoSignedWrap()) {
+ APInt MulImm = cast<ConstantSDNode>(N1->getOperand(0))->getAPIntValue();
+ APInt ShiftImm = N2C->getAPIntValue();
+ return getVScale(DL, VT, MulImm << ShiftImm);
+ }
+ LLVM_FALLTHROUGH;
case ISD::SRA:
case ISD::SRL:
if (SDValue V = simplifyShift(N1, N2))
@@ -5240,7 +5365,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// amounts. This catches things like trying to shift an i1024 value by an
// i8, which is easy to fall into in generic code that uses
// TLI.getShiftAmount().
- assert(N2.getValueSizeInBits() >= Log2_32_Ceil(N1.getValueSizeInBits()) &&
+ assert(N2.getValueType().getScalarSizeInBits().getFixedSize() >=
+ Log2_32_Ceil(VT.getScalarSizeInBits().getFixedSize()) &&
"Invalid use of small shift amount with oversized value!");
// Always fold shifts of i1 values so the code generator doesn't need to
@@ -5281,7 +5407,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"SIGN_EXTEND_INREG type should be vector iff the operand "
"type is vector!");
assert((!EVT.isVector() ||
- EVT.getVectorNumElements() == VT.getVectorNumElements()) &&
+ EVT.getVectorElementCount() == VT.getVectorElementCount()) &&
"Vector element counts must match in SIGN_EXTEND_INREG");
assert(EVT.bitsLE(VT) && "Not extending!");
if (EVT == VT) return N1; // Not actually extending
@@ -5323,27 +5449,36 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
if (N1.isUndef() || N2.isUndef())
return getUNDEF(VT);
- // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF
- if (N2C && N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
+ // EXTRACT_VECTOR_ELT of out-of-bounds element is an UNDEF for fixed length
+ // vectors. For scalable vectors we will provide appropriate support for
+ // dealing with arbitrary indices.
+ if (N2C && N1.getValueType().isFixedLengthVector() &&
+ N2C->getAPIntValue().uge(N1.getValueType().getVectorNumElements()))
return getUNDEF(VT);
// EXTRACT_VECTOR_ELT of CONCAT_VECTORS is often formed while lowering is
- // expanding copies of large vectors from registers.
- if (N2C &&
- N1.getOpcode() == ISD::CONCAT_VECTORS &&
- N1.getNumOperands() > 0) {
+ // expanding copies of large vectors from registers. This only works for
+ // fixed length vectors, since we need to know the exact number of
+ // elements.
+ if (N2C && N1.getOperand(0).getValueType().isFixedLengthVector() &&
+ N1.getOpcode() == ISD::CONCAT_VECTORS && N1.getNumOperands() > 0) {
unsigned Factor =
N1.getOperand(0).getValueType().getVectorNumElements();
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT,
N1.getOperand(N2C->getZExtValue() / Factor),
- getConstant(N2C->getZExtValue() % Factor, DL,
- N2.getValueType()));
+ getVectorIdxConstant(N2C->getZExtValue() % Factor, DL));
}
- // EXTRACT_VECTOR_ELT of BUILD_VECTOR is often formed while lowering is
- // expanding large vector constants.
- if (N2C && N1.getOpcode() == ISD::BUILD_VECTOR) {
- SDValue Elt = N1.getOperand(N2C->getZExtValue());
+ // EXTRACT_VECTOR_ELT of BUILD_VECTOR or SPLAT_VECTOR is often formed while
+ // lowering is expanding large vector constants.
+ if (N2C && (N1.getOpcode() == ISD::BUILD_VECTOR ||
+ N1.getOpcode() == ISD::SPLAT_VECTOR)) {
+ assert((N1.getOpcode() != ISD::BUILD_VECTOR ||
+ N1.getValueType().isFixedLengthVector()) &&
+ "BUILD_VECTOR used for scalable vectors");
+ unsigned Index =
+ N1.getOpcode() == ISD::BUILD_VECTOR ? N2C->getZExtValue() : 0;
+ SDValue Elt = N1.getOperand(Index);
if (VT != Elt.getValueType())
// If the vector element type is not legal, the BUILD_VECTOR operands
@@ -5377,8 +5512,14 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// EXTRACT_VECTOR_ELT of v1iX EXTRACT_SUBVECTOR could be formed
// when vector types are scalarized and v1iX is legal.
- // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx)
+ // vextract (v1iX extract_subvector(vNiX, Idx)) -> vextract(vNiX,Idx).
+ // Here we are completely ignoring the extract element index (N2),
+ // which is fine for fixed width vectors, since any index other than 0
+ // is undefined anyway. However, the index cannot be ignored for scalable
+ // vectors: in theory we could support it, but not without a profitability
+ // check.
if (N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getValueType().isFixedLengthVector() &&
N1.getValueType().getVectorNumElements() == 1) {
return getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, N1.getOperand(0),
N1.getOperand(1));
@@ -5406,50 +5547,48 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
}
break;
case ISD::EXTRACT_SUBVECTOR:
- if (VT.isSimple() && N1.getValueType().isSimple()) {
- assert(VT.isVector() && N1.getValueType().isVector() &&
- "Extract subvector VTs must be a vectors!");
- assert(VT.getVectorElementType() ==
- N1.getValueType().getVectorElementType() &&
- "Extract subvector VTs must have the same element type!");
- assert(VT.getSimpleVT() <= N1.getSimpleValueType() &&
- "Extract subvector must be from larger vector to smaller vector!");
-
- if (N2C) {
- assert((VT.getVectorNumElements() + N2C->getZExtValue()
- <= N1.getValueType().getVectorNumElements())
- && "Extract subvector overflow!");
- }
-
- // Trivial extraction.
- if (VT.getSimpleVT() == N1.getSimpleValueType())
- return N1;
-
- // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
- if (N1.isUndef())
- return getUNDEF(VT);
+ EVT N1VT = N1.getValueType();
+ assert(VT.isVector() && N1VT.isVector() &&
+ "Extract subvector VTs must be vectors!");
+ assert(VT.getVectorElementType() == N1VT.getVectorElementType() &&
+ "Extract subvector VTs must have the same element type!");
+ assert((VT.isFixedLengthVector() || N1VT.isScalableVector()) &&
+ "Cannot extract a scalable vector from a fixed length vector!");
+ assert((VT.isScalableVector() != N1VT.isScalableVector() ||
+ VT.getVectorMinNumElements() <= N1VT.getVectorMinNumElements()) &&
+ "Extract subvector must be from larger vector to smaller vector!");
+ assert(N2C && "Extract subvector index must be a constant");
+ assert((VT.isScalableVector() != N1VT.isScalableVector() ||
+ (VT.getVectorMinNumElements() + N2C->getZExtValue()) <=
+ N1VT.getVectorMinNumElements()) &&
+ "Extract subvector overflow!");
+
+ // Trivial extraction.
+ if (VT == N1VT)
+ return N1;
- // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
- // the concat have the same type as the extract.
- if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
- N1.getNumOperands() > 0 &&
- VT == N1.getOperand(0).getValueType()) {
- unsigned Factor = VT.getVectorNumElements();
- return N1.getOperand(N2C->getZExtValue() / Factor);
- }
+ // EXTRACT_SUBVECTOR of an UNDEF is an UNDEF.
+ if (N1.isUndef())
+ return getUNDEF(VT);
- // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
- // during shuffle legalization.
- if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
- VT == N1.getOperand(1).getValueType())
- return N1.getOperand(1);
+ // EXTRACT_SUBVECTOR of CONCAT_VECTOR can be simplified if the pieces of
+ // the concat have the same type as the extract.
+ if (N2C && N1.getOpcode() == ISD::CONCAT_VECTORS &&
+ N1.getNumOperands() > 0 && VT == N1.getOperand(0).getValueType()) {
+ unsigned Factor = VT.getVectorMinNumElements();
+ return N1.getOperand(N2C->getZExtValue() / Factor);
}
+
+ // EXTRACT_SUBVECTOR of INSERT_SUBVECTOR is often created
+ // during shuffle legalization.
+ if (N1.getOpcode() == ISD::INSERT_SUBVECTOR && N2 == N1.getOperand(2) &&
+ VT == N1.getOperand(1).getValueType())
+ return N1.getOperand(1);
break;
}
// Perform trivial constant folding.
- if (SDValue SV =
- FoldConstantArithmetic(Opcode, DL, VT, N1.getNode(), N2.getNode()))
+ if (SDValue SV = FoldConstantArithmetic(Opcode, DL, VT, {N1, N2}))
return SV;
if (SDValue V = foldConstantFPMath(Opcode, DL, VT, N1, N2))
@@ -5571,8 +5710,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
"SETCC operands must have the same type!");
assert(VT.isVector() == N1.getValueType().isVector() &&
"SETCC type should be vector iff the operand type is vector!");
- assert((!VT.isVector() ||
- VT.getVectorNumElements() == N1.getValueType().getVectorNumElements()) &&
+ assert((!VT.isVector() || VT.getVectorElementCount() ==
+ N1.getValueType().getVectorElementCount()) &&
"SETCC vector element counts must match!");
// Use FoldSetCC to simplify SETCC's.
if (SDValue V = FoldSetCC(VT, N1, N2, cast<CondCodeSDNode>(N3)->get(), DL))
@@ -5594,8 +5733,11 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
llvm_unreachable("should use getVectorShuffle constructor!");
case ISD::INSERT_VECTOR_ELT: {
ConstantSDNode *N3C = dyn_cast<ConstantSDNode>(N3);
- // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF
- if (N3C && N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
+ // INSERT_VECTOR_ELT into out-of-bounds element is an UNDEF, except
+ // for scalable vectors where we will generate appropriate code to
+ // deal with out-of-bounds cases correctly.
+ if (N3C && N1.getValueType().isFixedLengthVector() &&
+ N3C->getZExtValue() >= N1.getValueType().getVectorNumElements())
return getUNDEF(VT);
// Undefined index can be assumed out-of-bounds, so that's UNDEF too.
@@ -5612,33 +5754,34 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
// Inserting undef into undef is still undef.
if (N1.isUndef() && N2.isUndef())
return getUNDEF(VT);
- SDValue Index = N3;
- if (VT.isSimple() && N1.getValueType().isSimple()
- && N2.getValueType().isSimple()) {
- assert(VT.isVector() && N1.getValueType().isVector() &&
- N2.getValueType().isVector() &&
- "Insert subvector VTs must be a vectors");
- assert(VT == N1.getValueType() &&
- "Dest and insert subvector source types must match!");
- assert(N2.getSimpleValueType() <= N1.getSimpleValueType() &&
- "Insert subvector must be from smaller vector to larger vector!");
- if (isa<ConstantSDNode>(Index)) {
- assert((N2.getValueType().getVectorNumElements() +
- cast<ConstantSDNode>(Index)->getZExtValue()
- <= VT.getVectorNumElements())
- && "Insert subvector overflow!");
- }
- // Trivial insertion.
- if (VT.getSimpleVT() == N2.getSimpleValueType())
- return N2;
+ EVT N2VT = N2.getValueType();
+ assert(VT == N1.getValueType() &&
+ "Dest and insert subvector source types must match!");
+ assert(VT.isVector() && N2VT.isVector() &&
+ "Insert subvector VTs must be vectors!");
+ assert((VT.isScalableVector() || N2VT.isFixedLengthVector()) &&
+ "Cannot insert a scalable vector into a fixed length vector!");
+ assert((VT.isScalableVector() != N2VT.isScalableVector() ||
+ VT.getVectorMinNumElements() >= N2VT.getVectorMinNumElements()) &&
+ "Insert subvector must be from smaller vector to larger vector!");
+ assert(isa<ConstantSDNode>(N3) &&
+ "Insert subvector index must be constant");
+ assert((VT.isScalableVector() != N2VT.isScalableVector() ||
+ (N2VT.getVectorMinNumElements() +
+ cast<ConstantSDNode>(N3)->getZExtValue()) <=
+ VT.getVectorMinNumElements()) &&
+ "Insert subvector overflow!");
+
+ // Trivial insertion.
+ if (VT == N2VT)
+ return N2;
- // If this is an insert of an extracted vector into an undef vector, we
- // can just use the input to the extract.
- if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
- return N2.getOperand(0);
- }
+ // If this is an insert of an extracted vector into an undef vector, we
+ // can just use the input to the extract.
+ if (N1.isUndef() && N2.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N2.getOperand(1) == N3 && N2.getOperand(0).getValueType() == VT)
+ return N2.getOperand(0);
break;
}
case ISD::BITCAST:
@@ -5867,7 +6010,7 @@ static void chainLoadsAndStoresForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size, unsigned Alignment,
+ uint64_t Size, Align Alignment,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
@@ -5891,37 +6034,38 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
- unsigned SrcAlign = DAG.InferPtrAlignment(Src);
- if (Alignment > SrcAlign)
+ MaybeAlign SrcAlign = DAG.InferPtrAlign(Src);
+ if (!SrcAlign || Alignment > *SrcAlign)
SrcAlign = Alignment;
+ assert(SrcAlign && "SrcAlign must be set");
ConstantDataArraySlice Slice;
bool CopyFromConstant = isMemSrcFromConstant(Src, Slice);
bool isZeroConstant = CopyFromConstant && Slice.Array == nullptr;
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemcpy(OptSize);
-
+ const MemOp Op = isZeroConstant
+ ? MemOp::Set(Size, DstAlignCanChange, Alignment,
+ /*IsZeroMemset*/ true, isVol)
+ : MemOp::Copy(Size, DstAlignCanChange, Alignment,
+ *SrcAlign, isVol, CopyFromConstant);
if (!TLI.findOptimalMemOpLowering(
- MemOps, Limit, Size, (DstAlignCanChange ? 0 : Alignment),
- (isZeroConstant ? 0 : SrcAlign), /*IsMemset=*/false,
- /*ZeroMemset=*/false, /*MemcpyStrSrc=*/CopyFromConstant,
- /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(),
+ MemOps, Limit, Op, DstPtrInfo.getAddrSpace(),
SrcPtrInfo.getAddrSpace(), MF.getFunction().getAttributes()))
return SDValue();
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(C);
- unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
+ Align NewAlign = DL.getABITypeAlign(Ty);
// Don't promote to an alignment that would require dynamic stack
// realignment.
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
if (!TRI->needsStackRealignment(MF))
- while (NewAlign > Alignment &&
- DL.exceedsNaturalStackAlignment(Align(NewAlign)))
- NewAlign /= 2;
+ while (NewAlign > Alignment && DL.exceedsNaturalStackAlignment(NewAlign))
+ NewAlign = NewAlign / 2;
if (NewAlign > Alignment) {
// Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
+ if (MFI.getObjectAlign(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
Alignment = NewAlign;
}
@@ -5968,7 +6112,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
if (Value.getNode()) {
Store = DAG.getStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), Alignment, MMOFlags);
+ DstPtrInfo.getWithOffset(DstOff), Alignment.value(), MMOFlags);
OutChains.push_back(Store);
}
}
@@ -5991,12 +6135,13 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
Value = DAG.getExtLoad(ISD::EXTLOAD, dl, NVT, Chain,
DAG.getMemBasePlusOffset(Src, SrcOff, dl),
SrcPtrInfo.getWithOffset(SrcOff), VT,
- MinAlign(SrcAlign, SrcOff), SrcMMOFlags);
+ commonAlignment(*SrcAlign, SrcOff).value(),
+ SrcMMOFlags);
OutLoadChains.push_back(Value.getValue(1));
Store = DAG.getTruncStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), VT, Alignment, MMOFlags);
+ DstPtrInfo.getWithOffset(DstOff), VT, Alignment.value(), MMOFlags);
OutStoreChains.push_back(Store);
}
SrcOff += VTSize;
@@ -6052,7 +6197,7 @@ static SDValue getMemcpyLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size, unsigned Align,
+ uint64_t Size, Align Alignment,
bool isVol, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
@@ -6074,29 +6219,27 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Dst);
if (FI && !MFI.isFixedObjectIndex(FI->getIndex()))
DstAlignCanChange = true;
- unsigned SrcAlign = DAG.InferPtrAlignment(Src);
- if (Align > SrcAlign)
- SrcAlign = Align;
+ MaybeAlign SrcAlign = DAG.InferPtrAlign(Src);
+ if (!SrcAlign || Alignment > *SrcAlign)
+ SrcAlign = Alignment;
+ assert(SrcAlign && "SrcAlign must be set");
unsigned Limit = AlwaysInline ? ~0U : TLI.getMaxStoresPerMemmove(OptSize);
- // FIXME: `AllowOverlap` should really be `!isVol` but there is a bug in
- // findOptimalMemOpLowering. Meanwhile, setting it to `false` produces the
- // correct code.
- bool AllowOverlap = false;
if (!TLI.findOptimalMemOpLowering(
- MemOps, Limit, Size, (DstAlignCanChange ? 0 : Align), SrcAlign,
- /*IsMemset=*/false, /*ZeroMemset=*/false, /*MemcpyStrSrc=*/false,
- AllowOverlap, DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
+ MemOps, Limit,
+ MemOp::Copy(Size, DstAlignCanChange, Alignment, *SrcAlign,
+ /*IsVolatile*/ true),
+ DstPtrInfo.getAddrSpace(), SrcPtrInfo.getAddrSpace(),
MF.getFunction().getAttributes()))
return SDValue();
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(C);
- unsigned NewAlign = (unsigned)DL.getABITypeAlignment(Ty);
- if (NewAlign > Align) {
+ Align NewAlign = DL.getABITypeAlign(Ty);
+ if (NewAlign > Alignment) {
// Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
+ if (MFI.getObjectAlign(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
- Align = NewAlign;
+ Alignment = NewAlign;
}
}
@@ -6118,9 +6261,9 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
if (isDereferenceable)
SrcMMOFlags |= MachineMemOperand::MODereferenceable;
- Value =
- DAG.getLoad(VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
- SrcPtrInfo.getWithOffset(SrcOff), SrcAlign, SrcMMOFlags);
+ Value = DAG.getLoad(
+ VT, dl, Chain, DAG.getMemBasePlusOffset(Src, SrcOff, dl),
+ SrcPtrInfo.getWithOffset(SrcOff), SrcAlign->value(), SrcMMOFlags);
LoadValues.push_back(Value);
LoadChains.push_back(Value.getValue(1));
SrcOff += VTSize;
@@ -6132,9 +6275,9 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
unsigned VTSize = VT.getSizeInBits() / 8;
SDValue Store;
- Store = DAG.getStore(Chain, dl, LoadValues[i],
- DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), Align, MMOFlags);
+ Store = DAG.getStore(
+ Chain, dl, LoadValues[i], DAG.getMemBasePlusOffset(Dst, DstOff, dl),
+ DstPtrInfo.getWithOffset(DstOff), Alignment.value(), MMOFlags);
OutChains.push_back(Store);
DstOff += VTSize;
}
@@ -6151,7 +6294,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
/// \param Dst Pointer to destination memory location.
/// \param Src Value of byte to write into the memory.
/// \param Size Number of bytes to write.
-/// \param Align Alignment of the destination in bytes.
+/// \param Alignment Alignment of the destination in bytes.
/// \param isVol True if destination is volatile.
/// \param DstPtrInfo IR information on the memory pointer.
/// \returns New head in the control flow, if lowering was successful, empty
@@ -6162,7 +6305,7 @@ static SDValue getMemmoveLoadsAndStores(SelectionDAG &DAG, const SDLoc &dl,
/// memory size.
static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- uint64_t Size, unsigned Align, bool isVol,
+ uint64_t Size, Align Alignment, bool isVol,
MachinePointerInfo DstPtrInfo) {
// Turn a memset of undef to nop.
// FIXME: We need to honor volatile even if Src is undef.
@@ -6183,21 +6326,19 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
bool IsZeroVal =
isa<ConstantSDNode>(Src) && cast<ConstantSDNode>(Src)->isNullValue();
if (!TLI.findOptimalMemOpLowering(
- MemOps, TLI.getMaxStoresPerMemset(OptSize), Size,
- (DstAlignCanChange ? 0 : Align), 0, /*IsMemset=*/true,
- /*ZeroMemset=*/IsZeroVal, /*MemcpyStrSrc=*/false,
- /*AllowOverlap=*/!isVol, DstPtrInfo.getAddrSpace(), ~0u,
- MF.getFunction().getAttributes()))
+ MemOps, TLI.getMaxStoresPerMemset(OptSize),
+ MemOp::Set(Size, DstAlignCanChange, Alignment, IsZeroVal, isVol),
+ DstPtrInfo.getAddrSpace(), ~0u, MF.getFunction().getAttributes()))
return SDValue();
if (DstAlignCanChange) {
Type *Ty = MemOps[0].getTypeForEVT(*DAG.getContext());
- unsigned NewAlign = (unsigned)DAG.getDataLayout().getABITypeAlignment(Ty);
- if (NewAlign > Align) {
+ Align NewAlign = DAG.getDataLayout().getABITypeAlign(Ty);
+ if (NewAlign > Alignment) {
// Give the stack frame object a larger alignment if needed.
- if (MFI.getObjectAlignment(FI->getIndex()) < NewAlign)
+ if (MFI.getObjectAlign(FI->getIndex()) < NewAlign)
MFI.setObjectAlignment(FI->getIndex(), NewAlign);
- Align = NewAlign;
+ Alignment = NewAlign;
}
}
@@ -6235,7 +6376,7 @@ static SDValue getMemsetStores(SelectionDAG &DAG, const SDLoc &dl,
assert(Value.getValueType() == VT && "Value with wrong type.");
SDValue Store = DAG.getStore(
Chain, dl, Value, DAG.getMemBasePlusOffset(Dst, DstOff, dl),
- DstPtrInfo.getWithOffset(DstOff), Align,
+ DstPtrInfo.getWithOffset(DstOff), Alignment.value(),
isVol ? MachineMemOperand::MOVolatile : MachineMemOperand::MONone);
OutChains.push_back(Store);
DstOff += VT.getSizeInBits() / 8;
@@ -6256,12 +6397,10 @@ static void checkAddrSpaceIsValidForLibcall(const TargetLowering *TLI,
}
SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
- SDValue Src, SDValue Size, unsigned Align,
+ SDValue Src, SDValue Size, Align Alignment,
bool isVol, bool AlwaysInline, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
- assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
-
// Check to see if we should lower the memcpy to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -6270,9 +6409,9 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
if (ConstantSize->isNullValue())
return Chain;
- SDValue Result = getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
- ConstantSize->getZExtValue(),Align,
- isVol, false, DstPtrInfo, SrcPtrInfo);
+ SDValue Result = getMemcpyLoadsAndStores(
+ *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
+ isVol, false, DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
@@ -6281,7 +6420,7 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
// code. If the target chooses to do this, this is the next best.
if (TSI) {
SDValue Result = TSI->EmitTargetCodeForMemcpy(
- *this, dl, Chain, Dst, Src, Size, Align, isVol, AlwaysInline,
+ *this, dl, Chain, Dst, Src, Size, Alignment, isVol, AlwaysInline,
DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
@@ -6292,8 +6431,8 @@ SDValue SelectionDAG::getMemcpy(SDValue Chain, const SDLoc &dl, SDValue Dst,
if (AlwaysInline) {
assert(ConstantSize && "AlwaysInline requires a constant size!");
return getMemcpyLoadsAndStores(*this, dl, Chain, Dst, Src,
- ConstantSize->getZExtValue(), Align, isVol,
- true, DstPtrInfo, SrcPtrInfo);
+ ConstantSize->getZExtValue(), Alignment,
+ isVol, true, DstPtrInfo, SrcPtrInfo);
}
checkAddrSpaceIsValidForLibcall(TLI, DstPtrInfo.getAddrSpace());
@@ -6372,12 +6511,10 @@ SDValue SelectionDAG::getAtomicMemcpy(SDValue Chain, const SDLoc &dl,
}
SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
- SDValue Src, SDValue Size, unsigned Align,
+ SDValue Src, SDValue Size, Align Alignment,
bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) {
- assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
-
// Check to see if we should lower the memmove to loads and stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -6386,10 +6523,9 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
if (ConstantSize->isNullValue())
return Chain;
- SDValue Result =
- getMemmoveLoadsAndStores(*this, dl, Chain, Dst, Src,
- ConstantSize->getZExtValue(), Align, isVol,
- false, DstPtrInfo, SrcPtrInfo);
+ SDValue Result = getMemmoveLoadsAndStores(
+ *this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(), Alignment,
+ isVol, false, DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
@@ -6397,8 +6533,9 @@ SDValue SelectionDAG::getMemmove(SDValue Chain, const SDLoc &dl, SDValue Dst,
// Then check to see if we should lower the memmove with target-specific
// code. If the target chooses to do this, this is the next best.
if (TSI) {
- SDValue Result = TSI->EmitTargetCodeForMemmove(
- *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo, SrcPtrInfo);
+ SDValue Result =
+ TSI->EmitTargetCodeForMemmove(*this, dl, Chain, Dst, Src, Size,
+ Alignment, isVol, DstPtrInfo, SrcPtrInfo);
if (Result.getNode())
return Result;
}
@@ -6476,11 +6613,9 @@ SDValue SelectionDAG::getAtomicMemmove(SDValue Chain, const SDLoc &dl,
}
SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
- SDValue Src, SDValue Size, unsigned Align,
+ SDValue Src, SDValue Size, Align Alignment,
bool isVol, bool isTailCall,
MachinePointerInfo DstPtrInfo) {
- assert(Align && "The SDAG layer expects explicit alignment and reserves 0");
-
// Check to see if we should lower the memset to stores first.
// For cases within the target-specified limits, this is the best choice.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -6489,9 +6624,9 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
if (ConstantSize->isNullValue())
return Chain;
- SDValue Result =
- getMemsetStores(*this, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
- Align, isVol, DstPtrInfo);
+ SDValue Result = getMemsetStores(*this, dl, Chain, Dst, Src,
+ ConstantSize->getZExtValue(), Alignment,
+ isVol, DstPtrInfo);
if (Result.getNode())
return Result;
@@ -6501,7 +6636,7 @@ SDValue SelectionDAG::getMemset(SDValue Chain, const SDLoc &dl, SDValue Dst,
// code. If the target chooses to do this, this is the next best.
if (TSI) {
SDValue Result = TSI->EmitTargetCodeForMemset(
- *this, dl, Chain, Dst, Src, Size, Align, isVol, DstPtrInfo);
+ *this, dl, Chain, Dst, Src, Size, Alignment, isVol, DstPtrInfo);
if (Result.getNode())
return Result;
}
@@ -6662,11 +6797,8 @@ SDValue SelectionDAG::getMergeValues(ArrayRef<SDValue> Ops, const SDLoc &dl) {
SDValue SelectionDAG::getMemIntrinsicNode(
unsigned Opcode, const SDLoc &dl, SDVTList VTList, ArrayRef<SDValue> Ops,
- EVT MemVT, MachinePointerInfo PtrInfo, unsigned Align,
+ EVT MemVT, MachinePointerInfo PtrInfo, Align Alignment,
MachineMemOperand::Flags Flags, uint64_t Size, const AAMDNodes &AAInfo) {
- if (Align == 0) // Ensure that codegen never sees alignment 0
- Align = getEVTAlignment(MemVT);
-
if (!Size && MemVT.isScalableVector())
Size = MemoryLocation::UnknownSize;
else if (!Size)
@@ -6674,7 +6806,7 @@ SDValue SelectionDAG::getMemIntrinsicNode(
MachineFunction &MF = getMachineFunction();
MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, Flags, Size, Align, AAInfo);
+ MF.getMachineMemOperand(PtrInfo, Flags, Size, Alignment, AAInfo);
return getMemIntrinsicNode(Opcode, dl, VTList, Ops, MemVT, MMO);
}
@@ -6686,8 +6818,6 @@ SDValue SelectionDAG::getMemIntrinsicNode(unsigned Opcode, const SDLoc &dl,
assert((Opcode == ISD::INTRINSIC_VOID ||
Opcode == ISD::INTRINSIC_W_CHAIN ||
Opcode == ISD::PREFETCH ||
- Opcode == ISD::LIFETIME_START ||
- Opcode == ISD::LIFETIME_END ||
((int)Opcode <= std::numeric_limits<int>::max() &&
(int)Opcode >= ISD::FIRST_TARGET_MEMORY_OPCODE)) &&
"Opcode is not a memory-accessing opcode!");
@@ -6795,13 +6925,11 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, SDValue Offset,
MachinePointerInfo PtrInfo, EVT MemVT,
- unsigned Alignment,
+ Align Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo, const MDNode *Ranges) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = getEVTAlignment(MemVT);
MMOFlags |= MachineMemOperand::MOLoad;
assert((MMOFlags & MachineMemOperand::MOStore) == 0);
@@ -6810,9 +6938,10 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
if (PtrInfo.V.isNull())
PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr, Offset);
+ uint64_t Size = MemoryLocation::getSizeOrUnknown(MemVT.getStoreSize());
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MMOFlags, MemVT.getStoreSize(), Alignment, AAInfo, Ranges);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(PtrInfo, MMOFlags, Size,
+ Alignment, AAInfo, Ranges);
return getLoad(AM, ExtType, VT, dl, Chain, Ptr, Offset, MemVT, MMO);
}
@@ -6867,7 +6996,7 @@ SDValue SelectionDAG::getLoad(ISD::MemIndexedMode AM, ISD::LoadExtType ExtType,
SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue Ptr, MachinePointerInfo PtrInfo,
- unsigned Alignment,
+ MaybeAlign Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo, const MDNode *Ranges) {
SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -6885,7 +7014,7 @@ SDValue SelectionDAG::getLoad(EVT VT, const SDLoc &dl, SDValue Chain,
SDValue SelectionDAG::getExtLoad(ISD::LoadExtType ExtType, const SDLoc &dl,
EVT VT, SDValue Chain, SDValue Ptr,
MachinePointerInfo PtrInfo, EVT MemVT,
- unsigned Alignment,
+ MaybeAlign Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
SDValue Undef = getUNDEF(Ptr.getValueType());
@@ -6918,12 +7047,10 @@ SDValue SelectionDAG::getIndexedLoad(SDValue OrigLoad, const SDLoc &dl,
SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
- unsigned Alignment,
+ Align Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
assert(Chain.getValueType() == MVT::Other && "Invalid chain type");
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = getEVTAlignment(Val.getValueType());
MMOFlags |= MachineMemOperand::MOStore;
assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
@@ -6932,8 +7059,10 @@ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
PtrInfo = InferPointerInfo(PtrInfo, *this, Ptr);
MachineFunction &MF = getMachineFunction();
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MMOFlags, Val.getValueType().getStoreSize(), Alignment, AAInfo);
+ uint64_t Size =
+ MemoryLocation::getSizeOrUnknown(Val.getValueType().getStoreSize());
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MMOFlags, Size, Alignment, AAInfo);
return getStore(Chain, dl, Val, Ptr, MMO);
}
@@ -6969,13 +7098,11 @@ SDValue SelectionDAG::getStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue SelectionDAG::getTruncStore(SDValue Chain, const SDLoc &dl, SDValue Val,
SDValue Ptr, MachinePointerInfo PtrInfo,
- EVT SVT, unsigned Alignment,
+ EVT SVT, Align Alignment,
MachineMemOperand::Flags MMOFlags,
const AAMDNodes &AAInfo) {
assert(Chain.getValueType() == MVT::Other &&
"Invalid chain type");
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = getEVTAlignment(SVT);
MMOFlags |= MachineMemOperand::MOStore;
assert((MMOFlags & MachineMemOperand::MOLoad) == 0);
@@ -7288,9 +7415,24 @@ SDValue SelectionDAG::simplifyShift(SDValue X, SDValue Y) {
return SDValue();
}
-// TODO: Use fast-math-flags to enable more simplifications.
-SDValue SelectionDAG::simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y) {
+SDValue SelectionDAG::simplifyFPBinop(unsigned Opcode, SDValue X, SDValue Y,
+ SDNodeFlags Flags) {
+ // If this operation has 'nnan' or 'ninf' and at least one disallowed operand
+ // (an undef operand can be chosen to be NaN/Inf), then the result of this
+ // operation is poison. That result can be relaxed to undef.
+ ConstantFPSDNode *XC = isConstOrConstSplatFP(X, /* AllowUndefs */ true);
ConstantFPSDNode *YC = isConstOrConstSplatFP(Y, /* AllowUndefs */ true);
+ bool HasNan = (XC && XC->getValueAPF().isNaN()) ||
+ (YC && YC->getValueAPF().isNaN());
+ bool HasInf = (XC && XC->getValueAPF().isInfinity()) ||
+ (YC && YC->getValueAPF().isInfinity());
+
+ if (Flags.hasNoNaNs() && (HasNan || X.isUndef() || Y.isUndef()))
+ return getUNDEF(X.getValueType());
+
+ if (Flags.hasNoInfs() && (HasInf || X.isUndef() || Y.isUndef()))
+ return getUNDEF(X.getValueType());
+
if (!YC)
return SDValue();
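The decision logic of the new early-outs, mirrored as a standalone predicate over ordinary doubles; the real code inspects constant or splat-constant operands, and the helper and its inputs here are illustrative:

#include <cassert>
#include <cmath>

// With nnan (resp. ninf), a NaN (resp. infinity) constant or an undef operand
// lets the whole FP binop be folded to undef.
static bool foldsToUndef(bool NoNaNs, bool NoInfs, double X, double Y,
                         bool XIsUndef, bool YIsUndef) {
  bool HasNan = std::isnan(X) || std::isnan(Y);
  bool HasInf = std::isinf(X) || std::isinf(Y);
  if (NoNaNs && (HasNan || XIsUndef || YIsUndef))
    return true;
  return NoInfs && (HasInf || XIsUndef || YIsUndef);
}

int main() {
  // fadd nnan x, NaN --> undef
  assert(foldsToUndef(/*NoNaNs=*/true, false, 1.0, NAN, false, false));
  // fmul ninf x, undef --> undef
  assert(foldsToUndef(false, /*NoInfs=*/true, 1.0, 0.0, false, /*YIsUndef=*/true));
  // Without the flags, nothing is folded here.
  assert(!foldsToUndef(false, false, 1.0, NAN, false, false));
  return 0;
}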
@@ -7394,6 +7536,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, EVT VT,
createOperands(N, Ops);
}
+ N->setFlags(Flags);
InsertNode(N);
SDValue V(N, 0);
NewSDValueDbgMsg(V, "Creating new node: ", this);
@@ -7406,7 +7549,7 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL,
}
SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
- ArrayRef<SDValue> Ops) {
+ ArrayRef<SDValue> Ops, const SDNodeFlags Flags) {
if (VTList.NumVTs == 1)
return getNode(Opcode, DL, VTList.VTs[0], Ops);
@@ -7481,6 +7624,8 @@ SDValue SelectionDAG::getNode(unsigned Opcode, const SDLoc &DL, SDVTList VTList,
N = newSDNode<SDNode>(Opcode, DL.getIROrder(), DL.getDebugLoc(), VTList);
createOperands(N, Ops);
}
+
+ N->setFlags(Flags);
InsertNode(N);
SDValue V(N, 0);
NewSDValueDbgMsg(V, "Creating new node: ", this);
@@ -7919,7 +8064,7 @@ SDNode* SelectionDAG::mutateStrictFPToFP(SDNode *Node) {
switch (OrigOpc) {
default:
llvm_unreachable("mutateStrictFPToFP called with unexpected opcode!");
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN: NewOpc = ISD::DAGN; break;
#define CMP_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case ISD::STRICT_##DAGN: NewOpc = ISD::SETCC; break;
@@ -9196,9 +9341,8 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
if (!TLI->isExtractSubvectorCheap(SubVT, OpVT, 0))
return SDValue();
BinOp = (ISD::NodeType)CandidateBinOp;
- return getNode(
- ISD::EXTRACT_SUBVECTOR, SDLoc(Op), SubVT, Op,
- getConstant(0, SDLoc(Op), TLI->getVectorIdxTy(getDataLayout())));
+ return getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(Op), SubVT, Op,
+ getVectorIdxConstant(0, SDLoc(Op)));
};
// At each stage, we're looking for something that looks like:
@@ -9246,6 +9390,28 @@ SelectionDAG::matchBinOpReduction(SDNode *Extract, ISD::NodeType &BinOp,
PrevOp = Op;
}
+ // Handle subvector reductions, which tend to appear after the shuffle
+ // reduction stages.
+ while (Op.getOpcode() == CandidateBinOp) {
+ unsigned NumElts = Op.getValueType().getVectorNumElements();
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+ if (Op0.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op1.getOpcode() != ISD::EXTRACT_SUBVECTOR ||
+ Op0.getOperand(0) != Op1.getOperand(0))
+ break;
+ SDValue Src = Op0.getOperand(0);
+ unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
+ if (NumSrcElts != (2 * NumElts))
+ break;
+ if (!(Op0.getConstantOperandAPInt(1) == 0 &&
+ Op1.getConstantOperandAPInt(1) == NumElts) &&
+ !(Op1.getConstantOperandAPInt(1) == 0 &&
+ Op0.getConstantOperandAPInt(1) == NumElts))
+ break;
+ Op = Src;
+ }
+
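The structural check in the loop above, restated as a standalone predicate over element counts and extract indices; the names and values are illustrative:

#include <cassert>

// One peeled step: binop (extract_subvector X, 0), (extract_subvector X, N)
// where X has exactly 2 * N elements, with the two extracts in either order.
static bool isHalvingReductionStep(unsigned NumElts, unsigned NumSrcElts,
                                   unsigned Idx0, unsigned Idx1) {
  if (NumSrcElts != 2 * NumElts)
    return false;
  return (Idx0 == 0 && Idx1 == NumElts) || (Idx1 == 0 && Idx0 == NumElts);
}

int main() {
  // v4 halves of a v8 source, extracted at indices 0 and 4 (either order).
  assert(isHalvingReductionStep(4, 8, 0, 4));
  assert(isHalvingReductionStep(4, 8, 4, 0));
  // Overlapping halves do not match.
  assert(!isHalvingReductionStep(4, 8, 0, 2));
  return 0;
}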
BinOp = (ISD::NodeType)CandidateBinOp;
return Op;
}
@@ -9276,9 +9442,8 @@ SDValue SelectionDAG::UnrollVectorOp(SDNode *N, unsigned ResNE) {
if (OperandVT.isVector()) {
// A vector operand; extract a single element.
EVT OperandEltVT = OperandVT.getVectorElementType();
- Operands[j] =
- getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT, Operand,
- getConstant(i, dl, TLI->getVectorIdxTy(getDataLayout())));
+ Operands[j] = getNode(ISD::EXTRACT_VECTOR_ELT, dl, OperandEltVT,
+ Operand, getVectorIdxConstant(i, dl));
} else {
// A scalar operand; just use it as is.
Operands[j] = Operand;
@@ -9395,9 +9560,9 @@ bool SelectionDAG::areNonVolatileConsecutiveLoads(LoadSDNode *LD,
return false;
}
-/// InferPtrAlignment - Infer alignment of a load / store address. Return 0 if
-/// it cannot be inferred.
-unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
+/// InferPtrAlign - Infer alignment of a load / store address. Return None
+/// if it cannot be inferred.
+MaybeAlign SelectionDAG::InferPtrAlign(SDValue Ptr) const {
// If this is a GlobalAddress + cst, return the alignment.
const GlobalValue *GV = nullptr;
int64_t GVOffset = 0;
@@ -9406,9 +9571,8 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
KnownBits Known(PtrWidth);
llvm::computeKnownBits(GV, Known, getDataLayout());
unsigned AlignBits = Known.countMinTrailingZeros();
- unsigned Align = AlignBits ? 1 << std::min(31U, AlignBits) : 0;
- if (Align)
- return MinAlign(Align, GVOffset);
+ if (AlignBits)
+ return commonAlignment(Align(1ull << std::min(31U, AlignBits)), GVOffset);
}
// If this is a direct reference to a stack slot, use information about the
@@ -9426,12 +9590,10 @@ unsigned SelectionDAG::InferPtrAlignment(SDValue Ptr) const {
if (FrameIdx != INT_MIN) {
const MachineFrameInfo &MFI = getMachineFunction().getFrameInfo();
- unsigned FIInfoAlign = MinAlign(MFI.getObjectAlignment(FrameIdx),
- FrameOffset);
- return FIInfoAlign;
+ return commonAlignment(MFI.getObjectAlign(FrameIdx), FrameOffset);
}
- return 0;
+ return None;
}
/// GetSplitDestVTs - Compute the VTs needed for the low/hi parts of a type
@@ -9447,20 +9609,58 @@ std::pair<EVT, EVT> SelectionDAG::GetSplitDestVTs(const EVT &VT) const {
return std::make_pair(LoVT, HiVT);
}
+/// GetDependentSplitDestVTs - Compute the VTs needed for the low/hi parts of a
+/// type, dependent on an enveloping VT that has been split into two identical
+/// pieces. Sets the HiIsEmpty flag when hi type has zero storage size.
+std::pair<EVT, EVT>
+SelectionDAG::GetDependentSplitDestVTs(const EVT &VT, const EVT &EnvVT,
+ bool *HiIsEmpty) const {
+ EVT EltTp = VT.getVectorElementType();
+ bool IsScalable = VT.isScalableVector();
+ // Examples:
+ // custom VL=8 with enveloping VL=8/8 yields 8/0 (hi empty)
+ // custom VL=9 with enveloping VL=8/8 yields 8/1
+ // custom VL=10 with enveloping VL=8/8 yields 8/2
+ // etc.
+ unsigned VTNumElts = VT.getVectorNumElements();
+ unsigned EnvNumElts = EnvVT.getVectorNumElements();
+ EVT LoVT, HiVT;
+ if (VTNumElts > EnvNumElts) {
+ LoVT = EnvVT;
+ HiVT = EVT::getVectorVT(*getContext(), EltTp, VTNumElts - EnvNumElts,
+ IsScalable);
+ *HiIsEmpty = false;
+ } else {
+ // Flag that the hi type has zero storage size, but return split envelope type
+ // (this would be easier if vector types with zero elements were allowed).
+ LoVT = EVT::getVectorVT(*getContext(), EltTp, VTNumElts, IsScalable);
+ HiVT = EnvVT;
+ *HiIsEmpty = true;
+ }
+ return std::make_pair(LoVT, HiVT);
+}
+
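The element-count arithmetic of the new helper, reproduced standalone so the VL=8/9/10 examples in the comment can be checked; plain unsigned counts stand in for EVTs:

#include <cassert>
#include <utility>

// Split VTNumElts against one enveloping half of EnvNumElts elements. When
// everything fits in the lo part, hi is flagged empty but still reported with
// the envelope's width, matching GetDependentSplitDestVTs above.
static std::pair<unsigned, unsigned>
splitDependentElts(unsigned VTNumElts, unsigned EnvNumElts, bool &HiIsEmpty) {
  if (VTNumElts > EnvNumElts) {
    HiIsEmpty = false;
    return {EnvNumElts, VTNumElts - EnvNumElts};
  }
  HiIsEmpty = true;
  return {VTNumElts, EnvNumElts};
}

int main() {
  bool HiIsEmpty;
  // custom VL=9 with enveloping VL=8/8 yields 8/1.
  assert(splitDependentElts(9, 8, HiIsEmpty) == std::make_pair(8u, 1u) &&
         !HiIsEmpty);
  // custom VL=8 with enveloping VL=8/8: hi is empty (reported as the envelope).
  splitDependentElts(8, 8, HiIsEmpty);
  assert(HiIsEmpty);
  return 0;
}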
/// SplitVector - Split the vector with EXTRACT_SUBVECTOR and return the
/// low/high part.
std::pair<SDValue, SDValue>
SelectionDAG::SplitVector(const SDValue &N, const SDLoc &DL, const EVT &LoVT,
const EVT &HiVT) {
- assert(LoVT.getVectorNumElements() + HiVT.getVectorNumElements() <=
- N.getValueType().getVectorNumElements() &&
+ assert(LoVT.isScalableVector() == HiVT.isScalableVector() &&
+ LoVT.isScalableVector() == N.getValueType().isScalableVector() &&
+ "Splitting vector with an invalid mixture of fixed and scalable "
+ "vector types");
+ assert(LoVT.getVectorMinNumElements() + HiVT.getVectorMinNumElements() <=
+ N.getValueType().getVectorMinNumElements() &&
"More vector elements requested than available!");
SDValue Lo, Hi;
- Lo = getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
- getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
+ Lo =
+ getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N, getVectorIdxConstant(0, DL));
+ // For scalable vectors it is safe to use LoVT.getVectorMinNumElements()
+ // (rather than having to use ElementCount), because EXTRACT_SUBVECTOR scales
+ // IDX with the runtime scaling factor of the result vector type. For
+ // fixed-width result vectors, that runtime scaling factor is 1.
Hi = getNode(ISD::EXTRACT_SUBVECTOR, DL, HiVT, N,
- getConstant(LoVT.getVectorNumElements(), DL,
- TLI->getVectorIdxTy(getDataLayout())));
+ getVectorIdxConstant(LoVT.getVectorMinNumElements(), DL));
return std::make_pair(Lo, Hi);
}
@@ -9470,22 +9670,22 @@ SDValue SelectionDAG::WidenVector(const SDValue &N, const SDLoc &DL) {
EVT WideVT = EVT::getVectorVT(*getContext(), VT.getVectorElementType(),
NextPowerOf2(VT.getVectorNumElements()));
return getNode(ISD::INSERT_SUBVECTOR, DL, WideVT, getUNDEF(WideVT), N,
- getConstant(0, DL, TLI->getVectorIdxTy(getDataLayout())));
+ getVectorIdxConstant(0, DL));
}
void SelectionDAG::ExtractVectorElements(SDValue Op,
SmallVectorImpl<SDValue> &Args,
- unsigned Start, unsigned Count) {
+ unsigned Start, unsigned Count,
+ EVT EltVT) {
EVT VT = Op.getValueType();
if (Count == 0)
Count = VT.getVectorNumElements();
-
- EVT EltVT = VT.getVectorElementType();
- EVT IdxTy = TLI->getVectorIdxTy(getDataLayout());
+ if (EltVT == EVT())
+ EltVT = VT.getVectorElementType();
SDLoc SL(Op);
for (unsigned i = Start, e = Start + Count; i != e; ++i) {
- Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT,
- Op, getConstant(i, SL, IdxTy)));
+ Args.push_back(getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Op,
+ getVectorIdxConstant(i, SL)));
}
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
index 728d963a916f..1d596c89c911 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp
@@ -69,7 +69,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
@@ -136,6 +135,11 @@ using namespace SwitchCG;
/// some float libcalls (6, 8 or 12 bits).
static unsigned LimitFloatPrecision;
+static cl::opt<bool>
+ InsertAssertAlign("insert-assert-align", cl::init(true),
+ cl::desc("Insert the experimental `assertalign` node."),
+ cl::ReallyHidden);
+
static cl::opt<unsigned, true>
LimitFPPrecision("limit-float-precision",
cl::desc("Generate low-precision inline sequences "
@@ -206,12 +210,17 @@ static SDValue getCopyFromParts(SelectionDAG &DAG, const SDLoc &DL,
MVT PartVT, EVT ValueVT, const Value *V,
Optional<CallingConv::ID> CC = None,
Optional<ISD::NodeType> AssertOp = None) {
+ // Let the target assemble the parts if it wants to
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Val = TLI.joinRegisterPartsIntoValue(DAG, DL, Parts, NumParts,
+ PartVT, ValueVT, CC))
+ return Val;
+
if (ValueVT.isVector())
return getCopyFromPartsVector(DAG, DL, Parts, NumParts, PartVT, ValueVT, V,
CC);
assert(NumParts > 0 && "No parts to assemble!");
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue Val = Parts[0];
if (NumParts > 1) {
@@ -347,7 +356,7 @@ static void diagnosePossiblyInvalidConstraint(LLVMContext &Ctx, const Value *V,
const char *AsmError = ", possible invalid constraint for vector type";
if (const CallInst *CI = dyn_cast<CallInst>(I))
- if (isa<InlineAsm>(CI->getCalledValue()))
+ if (CI->isInlineAsm())
return Ctx.emitError(I, ErrMsg + AsmError);
return Ctx.emitError(I, ErrMsg);
@@ -415,10 +424,13 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// Build a vector with BUILD_VECTOR or CONCAT_VECTORS from the
// intermediate operands.
EVT BuiltVectorTy =
- EVT::getVectorVT(*DAG.getContext(), IntermediateVT.getScalarType(),
- (IntermediateVT.isVector()
- ? IntermediateVT.getVectorNumElements() * NumParts
- : NumIntermediates));
+ IntermediateVT.isVector()
+ ? EVT::getVectorVT(
+ *DAG.getContext(), IntermediateVT.getScalarType(),
+ IntermediateVT.getVectorElementCount() * NumParts)
+ : EVT::getVectorVT(*DAG.getContext(),
+ IntermediateVT.getScalarType(),
+ NumIntermediates);
Val = DAG.getNode(IntermediateVT.isVector() ? ISD::CONCAT_VECTORS
: ISD::BUILD_VECTOR,
DL, BuiltVectorTy, Ops);
@@ -436,18 +448,20 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// vector widening case (e.g. <2 x float> -> <4 x float>). Extract the
// elements we want.
if (PartEVT.getVectorElementType() == ValueVT.getVectorElementType()) {
- assert(PartEVT.getVectorNumElements() > ValueVT.getVectorNumElements() &&
+ assert((PartEVT.getVectorElementCount().Min >
+ ValueVT.getVectorElementCount().Min) &&
+ (PartEVT.getVectorElementCount().Scalable ==
+ ValueVT.getVectorElementCount().Scalable) &&
"Cannot narrow, it would be a lossy transformation");
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
}
// Vector/Vector bitcast.
if (ValueVT.getSizeInBits() == PartEVT.getSizeInBits())
return DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
- assert(PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements() &&
+ assert(PartEVT.getVectorElementCount() == ValueVT.getVectorElementCount() &&
"Cannot handle this kind of promotion");
// Promoted vector extract
return DAG.getAnyExtOrTrunc(Val, DL, ValueVT);
@@ -472,9 +486,8 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
EVT WiderVecType = EVT::getVectorVT(*DAG.getContext(),
ValueVT.getVectorElementType(), Elts);
Val = DAG.getBitcast(WiderVecType, Val);
- return DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ValueVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
}
diagnosePossiblyInvalidConstraint(
@@ -484,9 +497,14 @@ static SDValue getCopyFromPartsVector(SelectionDAG &DAG, const SDLoc &DL,
// Handle cases such as i8 -> <1 x i1>
EVT ValueSVT = ValueVT.getVectorElementType();
- if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT)
- Val = ValueVT.isFloatingPoint() ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
- : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+ if (ValueVT.getVectorNumElements() == 1 && ValueSVT != PartEVT) {
+ if (ValueSVT.getSizeInBits() == PartEVT.getSizeInBits())
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueSVT, Val);
+ else
+ Val = ValueVT.isFloatingPoint()
+ ? DAG.getFPExtendOrRound(Val, DL, ValueSVT)
+ : DAG.getAnyExtOrTrunc(Val, DL, ValueSVT);
+ }
return DAG.getBuildVector(ValueVT, DL, Val);
}
@@ -504,6 +522,11 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
const Value *V,
Optional<CallingConv::ID> CallConv = None,
ISD::NodeType ExtendKind = ISD::ANY_EXTEND) {
+ // Let the target split the parts if it wants to
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.splitValueIntoRegisterParts(DAG, DL, Val, Parts, NumParts, PartVT,
+ CallConv))
+ return;
EVT ValueVT = Val.getValueType();
// Handle the vector case separately.
@@ -633,7 +656,7 @@ static void getCopyToParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
static SDValue widenVectorToPartType(SelectionDAG &DAG,
SDValue Val, const SDLoc &DL, EVT PartVT) {
- if (!PartVT.isVector())
+ if (!PartVT.isFixedLengthVector())
return SDValue();
EVT ValueVT = Val.getValueType();
@@ -679,16 +702,16 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
Val = Widened;
} else if (PartVT.isVector() &&
PartEVT.getVectorElementType().bitsGE(
- ValueVT.getVectorElementType()) &&
- PartEVT.getVectorNumElements() == ValueVT.getVectorNumElements()) {
+ ValueVT.getVectorElementType()) &&
+ PartEVT.getVectorElementCount() ==
+ ValueVT.getVectorElementCount()) {
// Promoted vector extract
Val = DAG.getAnyExtOrTrunc(Val, DL, PartVT);
} else {
if (ValueVT.getVectorNumElements() == 1) {
- Val = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Val = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, PartVT, Val,
+ DAG.getVectorIdxConstant(0, DL));
} else {
assert(PartVT.getSizeInBits() > ValueVT.getSizeInBits() &&
"lossy conversion of vector to scalar type");
@@ -723,15 +746,18 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
NumParts = NumRegs; // Silence a compiler warning.
assert(RegisterVT == PartVT && "Part type doesn't match vector breakdown!");
- unsigned IntermediateNumElts = IntermediateVT.isVector() ?
- IntermediateVT.getVectorNumElements() : 1;
+ assert(IntermediateVT.isScalableVector() == ValueVT.isScalableVector() &&
+ "Mixing scalable and fixed vectors when copying in parts");
- // Convert the vector to the appropriate type if necessary.
- unsigned DestVectorNoElts = NumIntermediates * IntermediateNumElts;
+ ElementCount DestEltCnt;
+
+ if (IntermediateVT.isVector())
+ DestEltCnt = IntermediateVT.getVectorElementCount() * NumIntermediates;
+ else
+ DestEltCnt = ElementCount(NumIntermediates, false);
EVT BuiltVectorTy = EVT::getVectorVT(
- *DAG.getContext(), IntermediateVT.getScalarType(), DestVectorNoElts);
- MVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
+ *DAG.getContext(), IntermediateVT.getScalarType(), DestEltCnt);
if (ValueVT != BuiltVectorTy) {
if (SDValue Widened = widenVectorToPartType(DAG, Val, DL, BuiltVectorTy))
Val = Widened;
@@ -743,12 +769,15 @@ static void getCopyToPartsVector(SelectionDAG &DAG, const SDLoc &DL,
SmallVector<SDValue, 8> Ops(NumIntermediates);
for (unsigned i = 0; i != NumIntermediates; ++i) {
if (IntermediateVT.isVector()) {
- Ops[i] = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
- DAG.getConstant(i * IntermediateNumElts, DL, IdxVT));
+ // EXTRACT_SUBVECTOR scales the index by the runtime scaling factor of the
+ // result type, so using the minimum element count below is safe for
+ // scalable vectors - see the definition of EXTRACT_SUBVECTOR for details.
+ unsigned IntermediateNumElts = IntermediateVT.getVectorMinNumElements();
+ Ops[i] =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, IntermediateVT, Val,
+ DAG.getVectorIdxConstant(i * IntermediateNumElts, DL));
} else {
- Ops[i] = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
- DAG.getConstant(i, DL, IdxVT));
+ Ops[i] = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, IntermediateVT, Val,
+ DAG.getVectorIdxConstant(i, DL));
}
}
@@ -1112,32 +1141,26 @@ void SelectionDAGBuilder::visit(const Instruction &I) {
visit(I.getOpcode(), I);
if (auto *FPMO = dyn_cast<FPMathOperator>(&I)) {
- // Propagate the fast-math-flags of this IR instruction to the DAG node that
- // maps to this instruction.
- // TODO: We could handle all flags (nsw, etc) here.
- // TODO: If an IR instruction maps to >1 node, only the final node will have
- // flags set.
- if (SDNode *Node = getNodeForIRValue(&I)) {
- SDNodeFlags IncomingFlags;
- IncomingFlags.copyFMF(*FPMO);
- if (!Node->getFlags().isDefined())
- Node->setFlags(IncomingFlags);
- else
- Node->intersectFlagsWith(IncomingFlags);
- }
- }
- // Constrained FP intrinsics with fpexcept.ignore should also get
- // the NoFPExcept flag.
- if (auto *FPI = dyn_cast<ConstrainedFPIntrinsic>(&I))
- if (FPI->getExceptionBehavior() == fp::ExceptionBehavior::ebIgnore)
+ // ConstrainedFPIntrinsics handle their own FMF.
+ if (!isa<ConstrainedFPIntrinsic>(&I)) {
+ // Propagate the fast-math-flags of this IR instruction to the DAG node that
+ // maps to this instruction.
+ // TODO: We could handle all flags (nsw, etc) here.
+ // TODO: If an IR instruction maps to >1 node, only the final node will have
+ // flags set.
if (SDNode *Node = getNodeForIRValue(&I)) {
- SDNodeFlags Flags = Node->getFlags();
- Flags.setNoFPExcept(true);
- Node->setFlags(Flags);
+ SDNodeFlags IncomingFlags;
+ IncomingFlags.copyFMF(*FPMO);
+ if (!Node->getFlags().isDefined())
+ Node->setFlags(IncomingFlags);
+ else
+ Node->intersectFlagsWith(IncomingFlags);
}
+ }
+ }
if (!I.isTerminator() && !HasTailCall &&
- !isStatepoint(&I)) // statepoints handle their exports internally
+ !isa<GCStatepointInst>(I)) // statepoints handle their exports internally
CopyToExportRegsIfNeeded(&I);
CurInst = nullptr;
@@ -1399,11 +1422,11 @@ void SelectionDAGBuilder::resolveOrClearDbgInfo() {
/// getCopyFromRegs - If there was virtual register allocated for the value V
/// emit CopyFromReg of the specified type Ty. Return empty SDValue() otherwise.
SDValue SelectionDAGBuilder::getCopyFromRegs(const Value *V, Type *Ty) {
- DenseMap<const Value *, unsigned>::iterator It = FuncInfo.ValueMap.find(V);
+ DenseMap<const Value *, Register>::iterator It = FuncInfo.ValueMap.find(V);
SDValue Result;
if (It != FuncInfo.ValueMap.end()) {
- unsigned InReg = It->second;
+ Register InReg = It->second;
RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
DAG.getDataLayout(), InReg, Ty,
@@ -1437,12 +1460,6 @@ SDValue SelectionDAGBuilder::getValue(const Value *V) {
return Val;
}
-// Return true if SDValue exists for the given Value
-bool SelectionDAGBuilder::findValue(const Value *V) const {
- return (NodeMap.find(V) != NodeMap.end()) ||
- (FuncInfo.ValueMap.find(V) != FuncInfo.ValueMap.end());
-}
-
/// getNonRegisterValue - Return an SDValue for the given Value, but
/// don't look in FuncInfo.ValueMap for a virtual register.
SDValue SelectionDAGBuilder::getNonRegisterValue(const Value *V) {
@@ -1486,6 +1503,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
TLI.getPointerTy(DAG.getDataLayout(), AS));
}
+ if (match(C, m_VScale(DAG.getDataLayout())))
+ return DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1));
+
if (const ConstantFP *CFP = dyn_cast<ConstantFP>(C))
return DAG.getConstantFP(*CFP, getCurSDLoc(), VT);
@@ -1558,16 +1578,17 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
return DAG.getBlockAddress(BA, VT);
VectorType *VecTy = cast<VectorType>(V->getType());
- unsigned NumElements = VecTy->getNumElements();
// Now that we know the number and type of the elements, get that number of
// elements into the Ops array based on what kind of constant it is.
- SmallVector<SDValue, 16> Ops;
if (const ConstantVector *CV = dyn_cast<ConstantVector>(C)) {
+ SmallVector<SDValue, 16> Ops;
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
for (unsigned i = 0; i != NumElements; ++i)
Ops.push_back(getValue(CV->getOperand(i)));
- } else {
- assert(isa<ConstantAggregateZero>(C) && "Unknown vector constant!");
+
+ return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
+ } else if (isa<ConstantAggregateZero>(C)) {
EVT EltVT =
TLI.getValueType(DAG.getDataLayout(), VecTy->getElementType());
@@ -1576,11 +1597,16 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
Op = DAG.getConstantFP(0, getCurSDLoc(), EltVT);
else
Op = DAG.getConstant(0, getCurSDLoc(), EltVT);
- Ops.assign(NumElements, Op);
- }
- // Create a BUILD_VECTOR node.
- return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
+ if (isa<ScalableVectorType>(VecTy))
+ return NodeMap[V] = DAG.getSplatVector(VT, getCurSDLoc(), Op);
+ else {
+ SmallVector<SDValue, 16> Ops;
+ Ops.assign(cast<FixedVectorType>(VecTy)->getNumElements(), Op);
+ return NodeMap[V] = DAG.getBuildVector(VT, getCurSDLoc(), Ops);
+ }
+ }
+ llvm_unreachable("Unknown vector constant");
}
// If this is a static alloca, generate it as the frameindex instead of
@@ -1603,6 +1629,9 @@ SDValue SelectionDAGBuilder::getValueImpl(const Value *V) {
return RFV.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(), Chain, nullptr, V);
}
+ if (const MetadataAsValue *MD = dyn_cast<MetadataAsValue>(V)) {
+ return DAG.getMDNode(cast<MDNode>(MD->getMetadata()));
+ }
llvm_unreachable("Can't get register for value!");
}
@@ -1611,17 +1640,12 @@ void SelectionDAGBuilder::visitCatchPad(const CatchPadInst &I) {
bool IsMSVCCXX = Pers == EHPersonality::MSVC_CXX;
bool IsCoreCLR = Pers == EHPersonality::CoreCLR;
bool IsSEH = isAsynchronousEHPersonality(Pers);
- bool IsWasmCXX = Pers == EHPersonality::Wasm_CXX;
MachineBasicBlock *CatchPadMBB = FuncInfo.MBB;
if (!IsSEH)
CatchPadMBB->setIsEHScopeEntry();
// In MSVC C++ and CoreCLR, catchblocks are funclets and need prologues.
if (IsMSVCCXX || IsCoreCLR)
CatchPadMBB->setIsEHFuncletEntry();
- // Wasm does not need catchpads anymore
- if (!IsWasmCXX)
- DAG.setRoot(DAG.getNode(ISD::CATCHPAD, getCurSDLoc(), MVT::Other,
- getControlRoot()));
}
void SelectionDAGBuilder::visitCatchRet(const CatchReturnInst &I) {
@@ -1835,6 +1859,7 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
unsigned NumValues = ValueVTs.size();
SmallVector<SDValue, 4> Chains(NumValues);
+ Align BaseAlign = DL.getPrefTypeAlign(I.getOperand(0)->getType());
for (unsigned i = 0; i != NumValues; ++i) {
// An aggregate return value cannot wrap around the address space, so
// offsets to its parts don't wrap either.
@@ -1843,9 +1868,11 @@ void SelectionDAGBuilder::visitRet(const ReturnInst &I) {
SDValue Val = RetOp.getValue(RetOp.getResNo() + i);
if (MemVTs[i] != ValueVTs[i])
Val = DAG.getPtrExtOrTrunc(Val, getCurSDLoc(), MemVTs[i]);
- Chains[i] = DAG.getStore(Chain, getCurSDLoc(), Val,
+ Chains[i] = DAG.getStore(
+ Chain, getCurSDLoc(), Val,
// FIXME: better loc info would be nice.
- Ptr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()));
+ Ptr, MachinePointerInfo::getUnknownStack(DAG.getMachineFunction()),
+ commonAlignment(BaseAlign, Offsets[i]));
}
Chain = DAG.getNode(ISD::TokenFactor, getCurSDLoc(),
@@ -1964,7 +1991,7 @@ void SelectionDAGBuilder::CopyToExportRegsIfNeeded(const Value *V) {
if (V->getType()->isEmptyTy())
return;
- DenseMap<const Value *, unsigned>::iterator VMI = FuncInfo.ValueMap.find(V);
+ DenseMap<const Value *, Register>::iterator VMI = FuncInfo.ValueMap.find(V);
if (VMI != FuncInfo.ValueMap.end()) {
assert(!V->use_empty() && "Unused value assigned virtual registers!");
CopyValueToVirtualRegister(V, VMI->second);
@@ -2277,7 +2304,9 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
// If this is a series of conditions that are or'd or and'd together, emit
// this as a sequence of branches instead of setcc's with and/or operations.
- // As long as jumps are not expensive, this should improve performance.
+ // As long as jumps are not expensive (exceptions for multi-use logic ops,
+ // unpredictable branches, and vector extracts because those jumps are likely
+ // expensive for any target), this should improve performance.
// For example, instead of something like:
// cmp A, B
// C = seteq
@@ -2292,9 +2321,12 @@ void SelectionDAGBuilder::visitBr(const BranchInst &I) {
// jle foo
if (const BinaryOperator *BOp = dyn_cast<BinaryOperator>(CondVal)) {
Instruction::BinaryOps Opcode = BOp->getOpcode();
+ Value *Vec, *BOp0 = BOp->getOperand(0), *BOp1 = BOp->getOperand(1);
if (!DAG.getTargetLoweringInfo().isJumpExpensive() && BOp->hasOneUse() &&
!I.hasMetadata(LLVMContext::MD_unpredictable) &&
- (Opcode == Instruction::And || Opcode == Instruction::Or)) {
+ (Opcode == Instruction::And || Opcode == Instruction::Or) &&
+ !(match(BOp0, m_ExtractElt(m_Value(Vec), m_Value())) &&
+ match(BOp1, m_ExtractElt(m_Specific(Vec), m_Value())))) {
FindMergedConditions(BOp, Succ0MBB, Succ1MBB, BrMBB, BrMBB,
Opcode,
getEdgeProbability(BrMBB, Succ0MBB),
@@ -2516,7 +2548,7 @@ static SDValue getLoadStackGuard(SelectionDAG &DAG, const SDLoc &DL,
auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
MachineMemOperand::MODereferenceable;
MachineMemOperand *MemRef = MF.getMachineMemOperand(
- MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlignment(PtrTy));
+ MPInfo, Flags, PtrTy.getSizeInBits() / 8, DAG.getEVTAlign(PtrTy));
DAG.setNodeMemRefs(Node, {MemRef});
}
if (PtrTy != PtrMemTy)
@@ -2597,17 +2629,13 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD,
MachineMemOperand::MOVolatile);
}
- // Perform the comparison via a subtract/getsetcc.
- EVT VT = Guard.getValueType();
- SDValue Sub = DAG.getNode(ISD::SUB, dl, VT, Guard, GuardVal);
-
+ // Perform the comparison via a getsetcc.
SDValue Cmp = DAG.getSetCC(dl, TLI.getSetCCResultType(DAG.getDataLayout(),
*DAG.getContext(),
- Sub.getValueType()),
- Sub, DAG.getConstant(0, dl, VT), ISD::SETNE);
+ Guard.getValueType()),
+ Guard, GuardVal, ISD::SETNE);
- // If the sub is not 0, then we know the guard/stackslot do not equal, so
- // branch to failure MBB.
+ // If the guard/stackslot do not equal, branch to failure MBB.
SDValue BrCond = DAG.getNode(ISD::BRCOND, dl,
MVT::Other, GuardVal.getOperand(0),
Cmp, DAG.getBasicBlock(SPD.getFailureMBB()));
@@ -2640,6 +2668,11 @@ SelectionDAGBuilder::visitSPDescriptorFailure(StackProtectorDescriptor &SPD) {
// Passing 'true' for doesNotReturn above won't generate the trap for us.
if (TM.getTargetTriple().isPS4CPU())
Chain = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, Chain);
+ // WebAssembly needs an unreachable instruction after a non-returning call,
+ // because the function return type can be different from __stack_chk_fail's
+ // return type (void).
+ if (TM.getTargetTriple().isWasm())
+ Chain = DAG.getNode(ISD::TRAP, getCurSDLoc(), MVT::Other, Chain);
DAG.setRoot(Chain);
}
@@ -2778,14 +2811,16 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
// have to do anything here to lower funclet bundles.
assert(!I.hasOperandBundlesOtherThan({LLVMContext::OB_deopt,
+ LLVMContext::OB_gc_transition,
+ LLVMContext::OB_gc_live,
LLVMContext::OB_funclet,
LLVMContext::OB_cfguardtarget}) &&
"Cannot lower invokes with arbitrary operand bundles yet!");
- const Value *Callee(I.getCalledValue());
+ const Value *Callee(I.getCalledOperand());
const Function *Fn = dyn_cast<Function>(Callee);
if (isa<InlineAsm>(Callee))
- visitInlineAsm(&I);
+ visitInlineAsm(I);
else if (Fn && Fn->isIntrinsic()) {
switch (Fn->getIntrinsicID()) {
default:
@@ -2795,10 +2830,10 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
break;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
- visitPatchpoint(&I, EHPadBB);
+ visitPatchpoint(I, EHPadBB);
break;
case Intrinsic::experimental_gc_statepoint:
- LowerStatepoint(ImmutableStatepoint(&I), EHPadBB);
+ LowerStatepoint(cast<GCStatepointInst>(I), EHPadBB);
break;
case Intrinsic::wasm_rethrow_in_catch: {
// This is usually done in visitTargetIntrinsic, but this intrinsic is
@@ -2822,14 +2857,14 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) {
// with deopt state.
LowerCallSiteWithDeoptBundle(&I, getValue(Callee), EHPadBB);
} else {
- LowerCallTo(&I, getValue(Callee), false, EHPadBB);
+ LowerCallTo(I, getValue(Callee), false, EHPadBB);
}
// If the value of the invoke is used outside of its defining block, make it
// available as a virtual register.
// We already took care of the exported value for the statepoint instruction
// during call to the LowerStatepoint.
- if (!isStatepoint(I)) {
+ if (!isa<GCStatepointInst>(I)) {
CopyToExportRegsIfNeeded(&I);
}
@@ -2862,18 +2897,19 @@ void SelectionDAGBuilder::visitCallBr(const CallBrInst &I) {
{LLVMContext::OB_deopt, LLVMContext::OB_funclet}) &&
"Cannot lower callbrs with arbitrary operand bundles yet!");
- assert(isa<InlineAsm>(I.getCalledValue()) &&
- "Only know how to handle inlineasm callbr");
- visitInlineAsm(&I);
+ assert(I.isInlineAsm() && "Only know how to handle inlineasm callbr");
+ visitInlineAsm(I);
+ CopyToExportRegsIfNeeded(&I);
// Retrieve successors.
MachineBasicBlock *Return = FuncInfo.MBBMap[I.getDefaultDest()];
// Update successor info.
- addSuccessorWithProb(CallBrMBB, Return);
+ addSuccessorWithProb(CallBrMBB, Return, BranchProbability::getOne());
for (unsigned i = 0, e = I.getNumIndirectDests(); i < e; ++i) {
MachineBasicBlock *Target = FuncInfo.MBBMap[I.getIndirectDest(i)];
- addSuccessorWithProb(CallBrMBB, Target);
+ addSuccessorWithProb(CallBrMBB, Target, BranchProbability::getZero());
+ Target->setIsInlineAsmBrIndirectTarget();
}
CallBrMBB->normalizeSuccProbs();
@@ -3003,133 +3039,6 @@ void SelectionDAGBuilder::visitFSub(const User &I) {
visitBinary(I, ISD::FSUB);
}
-/// Checks if the given instruction performs a vector reduction, in which case
-/// we have the freedom to alter the elements in the result as long as the
-/// reduction of them stays unchanged.
-static bool isVectorReductionOp(const User *I) {
- const Instruction *Inst = dyn_cast<Instruction>(I);
- if (!Inst || !Inst->getType()->isVectorTy())
- return false;
-
- auto OpCode = Inst->getOpcode();
- switch (OpCode) {
- case Instruction::Add:
- case Instruction::Mul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- break;
- case Instruction::FAdd:
- case Instruction::FMul:
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (FPOp->getFastMathFlags().isFast())
- break;
- LLVM_FALLTHROUGH;
- default:
- return false;
- }
-
- unsigned ElemNum = Inst->getType()->getVectorNumElements();
- // Ensure the reduction size is a power of 2.
- if (!isPowerOf2_32(ElemNum))
- return false;
-
- unsigned ElemNumToReduce = ElemNum;
-
- // Do DFS search on the def-use chain from the given instruction. We only
- // allow four kinds of operations during the search until we reach the
- // instruction that extracts the first element from the vector:
- //
- // 1. The reduction operation of the same opcode as the given instruction.
- //
- // 2. PHI node.
- //
- // 3. ShuffleVector instruction together with a reduction operation that
- // does a partial reduction.
- //
- // 4. ExtractElement that extracts the first element from the vector, and we
- // stop searching the def-use chain here.
- //
- // 3 & 4 above perform a reduction on all elements of the vector. We push defs
- // from 1-3 to the stack to continue the DFS. The given instruction is not
- // a reduction operation if we meet any other instructions other than those
- // listed above.
-
- SmallVector<const User *, 16> UsersToVisit{Inst};
- SmallPtrSet<const User *, 16> Visited;
- bool ReduxExtracted = false;
-
- while (!UsersToVisit.empty()) {
- auto User = UsersToVisit.back();
- UsersToVisit.pop_back();
- if (!Visited.insert(User).second)
- continue;
-
- for (const auto *U : User->users()) {
- auto Inst = dyn_cast<Instruction>(U);
- if (!Inst)
- return false;
-
- if (Inst->getOpcode() == OpCode || isa<PHINode>(U)) {
- if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(Inst))
- if (!isa<PHINode>(FPOp) && !FPOp->getFastMathFlags().isFast())
- return false;
- UsersToVisit.push_back(U);
- } else if (const ShuffleVectorInst *ShufInst =
- dyn_cast<ShuffleVectorInst>(U)) {
- // Detect the following pattern: A ShuffleVector instruction together
- // with a reduction that do partial reduction on the first and second
- // ElemNumToReduce / 2 elements, and store the result in
- // ElemNumToReduce / 2 elements in another vector.
-
- unsigned ResultElements = ShufInst->getType()->getVectorNumElements();
- if (ResultElements < ElemNum)
- return false;
-
- if (ElemNumToReduce == 1)
- return false;
- if (!isa<UndefValue>(U->getOperand(1)))
- return false;
- for (unsigned i = 0; i < ElemNumToReduce / 2; ++i)
- if (ShufInst->getMaskValue(i) != int(i + ElemNumToReduce / 2))
- return false;
- for (unsigned i = ElemNumToReduce / 2; i < ElemNum; ++i)
- if (ShufInst->getMaskValue(i) != -1)
- return false;
-
- // There is only one user of this ShuffleVector instruction, which
- // must be a reduction operation.
- if (!U->hasOneUse())
- return false;
-
- auto U2 = dyn_cast<Instruction>(*U->user_begin());
- if (!U2 || U2->getOpcode() != OpCode)
- return false;
-
- // Check operands of the reduction operation.
- if ((U2->getOperand(0) == U->getOperand(0) && U2->getOperand(1) == U) ||
- (U2->getOperand(1) == U->getOperand(0) && U2->getOperand(0) == U)) {
- UsersToVisit.push_back(U2);
- ElemNumToReduce /= 2;
- } else
- return false;
- } else if (isa<ExtractElementInst>(U)) {
- // At this moment we should have reduced all elements in the vector.
- if (ElemNumToReduce != 1)
- return false;
-
- const ConstantInt *Val = dyn_cast<ConstantInt>(U->getOperand(1));
- if (!Val || !Val->isZero())
- return false;
-
- ReduxExtracted = true;
- } else
- return false;
- }
- }
- return ReduxExtracted;
-}
-
void SelectionDAGBuilder::visitUnary(const User &I, unsigned Opcode) {
SDNodeFlags Flags;
@@ -3148,17 +3057,6 @@ void SelectionDAGBuilder::visitBinary(const User &I, unsigned Opcode) {
if (auto *ExactOp = dyn_cast<PossiblyExactOperator>(&I)) {
Flags.setExact(ExactOp->isExact());
}
- if (isVectorReductionOp(&I)) {
- Flags.setVectorReduction(true);
- LLVM_DEBUG(dbgs() << "Detected a reduction operation:" << I << "\n");
-
- // If no flags are set we will propagate the incoming flags, if any flags
- // are set, we will intersect them with the incoming flag and so we need to
- // copy the FMF flags here.
- if (auto *FPOp = dyn_cast<FPMathOperator>(&I)) {
- Flags.copyFMF(*FPOp);
- }
- }
SDValue Op1 = getValue(I.getOperand(0));
SDValue Op2 = getValue(I.getOperand(1));
@@ -3296,9 +3194,9 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
SDValue Cond = getValue(I.getOperand(0));
SDValue LHSVal = getValue(I.getOperand(1));
SDValue RHSVal = getValue(I.getOperand(2));
- auto BaseOps = {Cond};
- ISD::NodeType OpCode = Cond.getValueType().isVector() ?
- ISD::VSELECT : ISD::SELECT;
+ SmallVector<SDValue, 1> BaseOps(1, Cond);
+ ISD::NodeType OpCode =
+ Cond.getValueType().isVector() ? ISD::VSELECT : ISD::SELECT;
bool IsUnaryAbs = false;
@@ -3381,13 +3279,13 @@ void SelectionDAGBuilder::visitSelect(const User &I) {
OpCode = Opc;
LHSVal = getValue(LHS);
RHSVal = getValue(RHS);
- BaseOps = {};
+ BaseOps.clear();
}
if (IsUnaryAbs) {
OpCode = Opc;
LHSVal = getValue(LHS);
- BaseOps = {};
+ BaseOps.clear();
}
}
@@ -3577,19 +3475,22 @@ void SelectionDAGBuilder::visitExtractElement(const User &I) {
void SelectionDAGBuilder::visitShuffleVector(const User &I) {
SDValue Src1 = getValue(I.getOperand(0));
SDValue Src2 = getValue(I.getOperand(1));
- Constant *MaskV = cast<Constant>(I.getOperand(2));
+ ArrayRef<int> Mask;
+ if (auto *SVI = dyn_cast<ShuffleVectorInst>(&I))
+ Mask = SVI->getShuffleMask();
+ else
+ Mask = cast<ConstantExpr>(I).getShuffleMask();
SDLoc DL = getCurSDLoc();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
EVT SrcVT = Src1.getValueType();
- unsigned SrcNumElts = SrcVT.getVectorNumElements();
- if (MaskV->isNullValue() && VT.isScalableVector()) {
+ if (all_of(Mask, [](int Elem) { return Elem == 0; }) &&
+ VT.isScalableVector()) {
// Canonical splat form of first element of first input vector.
- SDValue FirstElt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- SrcVT.getScalarType(), Src1,
- DAG.getConstant(0, DL,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ SDValue FirstElt =
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, SrcVT.getScalarType(), Src1,
+ DAG.getVectorIdxConstant(0, DL));
setValue(&I, DAG.getNode(ISD::SPLAT_VECTOR, DL, VT, FirstElt));
return;
}
@@ -3599,8 +3500,7 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
// for targets that support a SPLAT_VECTOR for non-scalable vector types.
assert(!VT.isScalableVector() && "Unsupported scalable vector shuffle");
- SmallVector<int, 8> Mask;
- ShuffleVectorInst::getShuffleMask(MaskV, Mask);
+ unsigned SrcNumElts = SrcVT.getVectorNumElements();
unsigned MaskNumElts = Mask.size();
if (SrcNumElts == MaskNumElts) {
@@ -3683,9 +3583,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
// If the concatenated vector was padded, extract a subvector with the
// correct number of elements.
if (MaskNumElts != PaddedMaskNumElts)
- Result = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
- DAG.getConstant(0, DL, TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Result = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Result,
+ DAG.getVectorIdxConstant(0, DL));
setValue(&I, Result);
return;
@@ -3729,10 +3628,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
if (StartIdx[Input] < 0)
Src = DAG.getUNDEF(VT);
else {
- Src = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
- DAG.getConstant(StartIdx[Input], DL,
- TLI.getVectorIdxTy(DAG.getDataLayout())));
+ Src = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Src,
+ DAG.getVectorIdxConstant(StartIdx[Input], DL));
}
}
@@ -3754,7 +3651,6 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
// replacing the shuffle with extract and build vector.
EVT EltVT = VT.getVectorElementType();
- EVT IdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
SmallVector<SDValue,8> Ops;
for (int Idx : Mask) {
SDValue Res;
@@ -3765,8 +3661,8 @@ void SelectionDAGBuilder::visitShuffleVector(const User &I) {
SDValue &Src = Idx < (int)SrcNumElts ? Src1 : Src2;
if (Idx >= (int)SrcNumElts) Idx -= SrcNumElts;
- Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL,
- EltVT, Src, DAG.getConstant(Idx, DL, IdxVT));
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Src,
+ DAG.getVectorIdxConstant(Idx, DL));
}
Ops.push_back(Res);
@@ -3882,13 +3778,18 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
// Normalize Vector GEP - all scalar operands should be converted to the
// splat vector.
- unsigned VectorWidth = I.getType()->isVectorTy() ?
- I.getType()->getVectorNumElements() : 0;
+ bool IsVectorGEP = I.getType()->isVectorTy();
+ ElementCount VectorElementCount =
+ IsVectorGEP ? cast<VectorType>(I.getType())->getElementCount()
+ : ElementCount(0, false);
- if (VectorWidth && !N.getValueType().isVector()) {
+ if (IsVectorGEP && !N.getValueType().isVector()) {
LLVMContext &Context = *DAG.getContext();
- EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorWidth);
- N = DAG.getSplatBuildVector(VT, dl, N);
+ EVT VT = EVT::getVectorVT(Context, N.getValueType(), VectorElementCount);
+ if (VectorElementCount.Scalable)
+ N = DAG.getSplatVector(VT, dl, N);
+ else
+ N = DAG.getSplatBuildVector(VT, dl, N);
}
for (gep_type_iterator GTI = gep_type_begin(&I), E = gep_type_end(&I);
@@ -3910,9 +3811,16 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
DAG.getConstant(Offset, dl, N.getValueType()), Flags);
}
} else {
+ // IdxSize is the width of the arithmetic according to IR semantics.
+ // In SelectionDAG, we may prefer to do arithmetic in a wider bitwidth
+ // (and fix up the result later).
unsigned IdxSize = DAG.getDataLayout().getIndexSizeInBits(AS);
MVT IdxTy = MVT::getIntegerVT(IdxSize);
- APInt ElementSize(IdxSize, DL->getTypeAllocSize(GTI.getIndexedType()));
+ TypeSize ElementSize = DL->getTypeAllocSize(GTI.getIndexedType());
+ // We intentionally mask away the high bits here; ElementSize may not
+ // fit in IdxTy.
+ APInt ElementMul(IdxSize, ElementSize.getKnownMinSize());
+ bool ElementScalable = ElementSize.isScalable();
// If this is a scalar constant or a splat vector of constants,
// handle it quickly.
@@ -3920,14 +3828,18 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
if (C && isa<VectorType>(C->getType()))
C = C->getSplatValue();
- if (const auto *CI = dyn_cast_or_null<ConstantInt>(C)) {
- if (CI->isZero())
- continue;
- APInt Offs = ElementSize * CI->getValue().sextOrTrunc(IdxSize);
+ const auto *CI = dyn_cast_or_null<ConstantInt>(C);
+ if (CI && CI->isZero())
+ continue;
+ if (CI && !ElementScalable) {
+ APInt Offs = ElementMul * CI->getValue().sextOrTrunc(IdxSize);
LLVMContext &Context = *DAG.getContext();
- SDValue OffsVal = VectorWidth ?
- DAG.getConstant(Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorWidth)) :
- DAG.getConstant(Offs, dl, IdxTy);
+ SDValue OffsVal;
+ if (IsVectorGEP)
+ OffsVal = DAG.getConstant(
+ Offs, dl, EVT::getVectorVT(Context, IdxTy, VectorElementCount));
+ else
+ OffsVal = DAG.getConstant(Offs, dl, IdxTy);
// In an inbounds GEP with an offset that is nonnegative even when
// interpreted as signed, assume there is no unsigned overflow.
@@ -3941,31 +3853,45 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) {
continue;
}
- // N = N + Idx * ElementSize;
+ // N = N + Idx * ElementMul;
SDValue IdxN = getValue(Idx);
- if (!IdxN.getValueType().isVector() && VectorWidth) {
- EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(), VectorWidth);
- IdxN = DAG.getSplatBuildVector(VT, dl, IdxN);
+ if (!IdxN.getValueType().isVector() && IsVectorGEP) {
+ EVT VT = EVT::getVectorVT(*Context, IdxN.getValueType(),
+ VectorElementCount);
+ if (VectorElementCount.Scalable)
+ IdxN = DAG.getSplatVector(VT, dl, IdxN);
+ else
+ IdxN = DAG.getSplatBuildVector(VT, dl, IdxN);
}
// If the index is smaller or larger than intptr_t, truncate or extend
// it.
IdxN = DAG.getSExtOrTrunc(IdxN, dl, N.getValueType());
- // If this is a multiply by a power of two, turn it into a shl
- // immediately. This is a very common case.
- if (ElementSize != 1) {
- if (ElementSize.isPowerOf2()) {
- unsigned Amt = ElementSize.logBase2();
- IdxN = DAG.getNode(ISD::SHL, dl,
- N.getValueType(), IdxN,
- DAG.getConstant(Amt, dl, IdxN.getValueType()));
- } else {
- SDValue Scale = DAG.getConstant(ElementSize.getZExtValue(), dl,
- IdxN.getValueType());
- IdxN = DAG.getNode(ISD::MUL, dl,
- N.getValueType(), IdxN, Scale);
+ if (ElementScalable) {
+ EVT VScaleTy = N.getValueType().getScalarType();
+ SDValue VScale = DAG.getNode(
+ ISD::VSCALE, dl, VScaleTy,
+ DAG.getConstant(ElementMul.getZExtValue(), dl, VScaleTy));
+ if (IsVectorGEP)
+ VScale = DAG.getSplatVector(N.getValueType(), dl, VScale);
+ IdxN = DAG.getNode(ISD::MUL, dl, N.getValueType(), IdxN, VScale);
+ } else {
+ // If this is a multiply by a power of two, turn it into a shl
+ // immediately. This is a very common case.
+ if (ElementMul != 1) {
+ if (ElementMul.isPowerOf2()) {
+ unsigned Amt = ElementMul.logBase2();
+ IdxN = DAG.getNode(ISD::SHL, dl,
+ N.getValueType(), IdxN,
+ DAG.getConstant(Amt, dl, IdxN.getValueType()));
+ } else {
+ SDValue Scale = DAG.getConstant(ElementMul.getZExtValue(), dl,
+ IdxN.getValueType());
+ IdxN = DAG.getNode(ISD::MUL, dl,
+ N.getValueType(), IdxN, Scale);
+ }
}
}
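A standalone sketch of the per-index offset arithmetic above: scalable element sizes multiply the index by vscale times the known minimum size, while fixed power-of-two sizes become shifts (gepScalarOffset and the explicit VScale parameter are illustrative, not part of the patch; __builtin_ctzll is a GCC/Clang builtin):

  #include <cassert>
  #include <cstdint>

  // Offset contributed by one GEP index. VScale stands in for the runtime
  // value materialised via ISD::VSCALE; it is unused for fixed sizes.
  static int64_t gepScalarOffset(int64_t Idx, uint64_t ElementMul,
                                 bool Scalable, uint64_t VScale) {
    if (Scalable)
      return Idx * int64_t(ElementMul * VScale);
    if (ElementMul != 0 && (ElementMul & (ElementMul - 1)) == 0) // pow2 -> shl
      return int64_t(uint64_t(Idx) << __builtin_ctzll(ElementMul));
    return Idx * int64_t(ElementMul); // general multiply
  }

  int main() {
    assert(gepScalarOffset(3, 8, /*Scalable=*/false, 1) == 24);  // shl Idx, 3
    assert(gepScalarOffset(3, 8, /*Scalable=*/true, 2) == 48);   // 3*(vscale*8)
    assert(gepScalarOffset(5, 12, /*Scalable=*/false, 1) == 60); // mul Idx, 12
    return 0;
  }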
@@ -3991,8 +3917,7 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
auto &DL = DAG.getDataLayout();
uint64_t TySize = DL.getTypeAllocSize(Ty);
- unsigned Align =
- std::max((unsigned)DL.getPrefTypeAlignment(Ty), I.getAlignment());
+ MaybeAlign Alignment = std::max(DL.getPrefTypeAlign(Ty), I.getAlign());
SDValue AllocSize = getValue(I.getArraySize());
@@ -4007,25 +3932,26 @@ void SelectionDAGBuilder::visitAlloca(const AllocaInst &I) {
// Handle alignment. If the requested alignment is less than or equal to
// the stack alignment, ignore it. If the size is greater than or equal to
// the stack alignment, we note this in the DYNAMIC_STACKALLOC node.
- unsigned StackAlign =
- DAG.getSubtarget().getFrameLowering()->getStackAlignment();
- if (Align <= StackAlign)
- Align = 0;
+ Align StackAlign = DAG.getSubtarget().getFrameLowering()->getStackAlign();
+ if (*Alignment <= StackAlign)
+ Alignment = None;
+ const uint64_t StackAlignMask = StackAlign.value() - 1U;
// Round the size of the allocation up to the stack alignment size
// by adding SA-1 to the size. This doesn't overflow because we're computing
// an address inside an alloca.
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
AllocSize = DAG.getNode(ISD::ADD, dl, AllocSize.getValueType(), AllocSize,
- DAG.getConstant(StackAlign - 1, dl, IntPtr), Flags);
+ DAG.getConstant(StackAlignMask, dl, IntPtr), Flags);
// Mask out the low bits for alignment purposes.
- AllocSize =
- DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize,
- DAG.getConstant(~(uint64_t)(StackAlign - 1), dl, IntPtr));
+ AllocSize = DAG.getNode(ISD::AND, dl, AllocSize.getValueType(), AllocSize,
+ DAG.getConstant(~StackAlignMask, dl, IntPtr));
- SDValue Ops[] = {getRoot(), AllocSize, DAG.getConstant(Align, dl, IntPtr)};
+ SDValue Ops[] = {
+ getRoot(), AllocSize,
+ DAG.getConstant(Alignment ? Alignment->value() : 0, dl, IntPtr)};
SDVTList VTs = DAG.getVTList(AllocSize.getValueType(), MVT::Other);
SDValue DSA = DAG.getNode(ISD::DYNAMIC_STACKALLOC, dl, VTs, Ops);
setValue(&I, DSA);
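The add-then-mask sequence above is the usual align-up idiom; a tiny standalone model (assumes StackAlign is a power of two, as Align guarantees):

  #include <cassert>
  #include <cstdint>

  // Round AllocSize up to the next multiple of StackAlign by adding
  // StackAlign-1 and clearing the low bits.
  static uint64_t roundUpToStackAlign(uint64_t AllocSize, uint64_t StackAlign) {
    const uint64_t Mask = StackAlign - 1;
    return (AllocSize + Mask) & ~Mask;
  }

  int main() {
    assert(roundUpToStackAlign(1, 16) == 16);
    assert(roundUpToStackAlign(16, 16) == 16);
    assert(roundUpToStackAlign(17, 16) == 32);
    return 0;
  }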
@@ -4057,13 +3983,7 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
SDValue Ptr = getValue(SV);
Type *Ty = I.getType();
-
- bool isVolatile = I.isVolatile();
- bool isNonTemporal = I.hasMetadata(LLVMContext::MD_nontemporal);
- bool isInvariant = I.hasMetadata(LLVMContext::MD_invariant_load);
- bool isDereferenceable =
- isDereferenceablePointer(SV, I.getType(), DAG.getDataLayout());
- unsigned Alignment = I.getAlignment();
+ Align Alignment = I.getAlign();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
@@ -4076,6 +3996,8 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
if (NumValues == 0)
return;
+ bool isVolatile = I.isVolatile();
+
SDValue Root;
bool ConstantMemory = false;
if (isVolatile)
@@ -4109,6 +4031,10 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
SmallVector<SDValue, 4> Values(NumValues);
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
EVT PtrVT = Ptr.getValueType();
+
+ MachineMemOperand::Flags MMOFlags
+ = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout());
+
unsigned ChainI = 0;
for (unsigned i = 0; i != NumValues; ++i, ++ChainI) {
// Serializing loads here may result in excessive register pressure, and
@@ -4128,16 +4054,6 @@ void SelectionDAGBuilder::visitLoad(const LoadInst &I) {
PtrVT, Ptr,
DAG.getConstant(Offsets[i], dl, PtrVT),
Flags);
- auto MMOFlags = MachineMemOperand::MONone;
- if (isVolatile)
- MMOFlags |= MachineMemOperand::MOVolatile;
- if (isNonTemporal)
- MMOFlags |= MachineMemOperand::MONonTemporal;
- if (isInvariant)
- MMOFlags |= MachineMemOperand::MOInvariant;
- if (isDereferenceable)
- MMOFlags |= MachineMemOperand::MODereferenceable;
- MMOFlags |= TLI.getMMOFlags(I);
SDValue L = DAG.getLoad(MemVTs[i], dl, Root, A,
MachinePointerInfo(SV, Offsets[i]), Alignment,
@@ -4260,16 +4176,11 @@ void SelectionDAGBuilder::visitStore(const StoreInst &I) {
SDValue Root = I.isVolatile() ? getRoot() : getMemoryRoot();
SmallVector<SDValue, 4> Chains(std::min(MaxParallelChains, NumValues));
SDLoc dl = getCurSDLoc();
- unsigned Alignment = I.getAlignment();
+ Align Alignment = I.getAlign();
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- auto MMOFlags = MachineMemOperand::MONone;
- if (I.isVolatile())
- MMOFlags |= MachineMemOperand::MOVolatile;
- if (I.hasMetadata(LLVMContext::MD_nontemporal))
- MMOFlags |= MachineMemOperand::MONonTemporal;
- MMOFlags |= TLI.getMMOFlags(I);
+ auto MMOFlags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
// An aggregate load cannot wrap around the address space, so offsets to its
// parts don't wrap either.
@@ -4304,25 +4215,25 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
bool IsCompressing) {
SDLoc sdl = getCurSDLoc();
- auto getMaskedStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
- unsigned& Alignment) {
+ auto getMaskedStoreOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
// llvm.masked.store.*(Src0, Ptr, alignment, Mask)
Src0 = I.getArgOperand(0);
Ptr = I.getArgOperand(1);
- Alignment = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ Alignment = cast<ConstantInt>(I.getArgOperand(2))->getMaybeAlignValue();
Mask = I.getArgOperand(3);
};
- auto getCompressingStoreOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
- unsigned& Alignment) {
+ auto getCompressingStoreOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
// llvm.masked.compressstore.*(Src0, Ptr, Mask)
Src0 = I.getArgOperand(0);
Ptr = I.getArgOperand(1);
Mask = I.getArgOperand(2);
- Alignment = 0;
+ Alignment = None;
};
Value *PtrOperand, *MaskOperand, *Src0Operand;
- unsigned Alignment;
+ MaybeAlign Alignment;
if (IsCompressing)
getCompressingStoreOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
else
@@ -4335,19 +4246,16 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
EVT VT = Src0.getValueType();
if (!Alignment)
- Alignment = DAG.getEVTAlignment(VT);
+ Alignment = DAG.getEVTAlign(VT);
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- MachineMemOperand *MMO =
- DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(PtrOperand),
- MachineMemOperand::MOStore,
- // TODO: Make MachineMemOperands aware of scalable
- // vectors.
- VT.getStoreSize().getKnownMinSize(),
- Alignment, AAInfo);
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOStore,
+ // TODO: Make MachineMemOperands aware of scalable
+ // vectors.
+ VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo);
SDValue StoreNode =
DAG.getMaskedStore(getMemoryRoot(), sdl, Src0, Ptr, Offset, Mask, VT, MMO,
ISD::UNINDEXED, false /* Truncating */, IsCompressing);
@@ -4370,78 +4278,51 @@ void SelectionDAGBuilder::visitMaskedStore(const CallInst &I,
// are looking for. If first operand of the GEP is a splat vector - we
// extract the splat value and use it as a uniform base.
// In all other cases the function returns 'false'.
-static bool getUniformBase(const Value *&Ptr, SDValue &Base, SDValue &Index,
+static bool getUniformBase(const Value *Ptr, SDValue &Base, SDValue &Index,
ISD::MemIndexType &IndexType, SDValue &Scale,
- SelectionDAGBuilder *SDB) {
+ SelectionDAGBuilder *SDB, const BasicBlock *CurBB) {
SelectionDAG& DAG = SDB->DAG;
- LLVMContext &Context = *DAG.getContext();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ const DataLayout &DL = DAG.getDataLayout();
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
- const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
- if (!GEP)
- return false;
-
- const Value *GEPPtr = GEP->getPointerOperand();
- if (!GEPPtr->getType()->isVectorTy())
- Ptr = GEPPtr;
- else if (!(Ptr = getSplatValue(GEPPtr)))
- return false;
-
- unsigned FinalIndex = GEP->getNumOperands() - 1;
- Value *IndexVal = GEP->getOperand(FinalIndex);
- gep_type_iterator GTI = gep_type_begin(*GEP);
- // Ensure all the other indices are 0.
- for (unsigned i = 1; i < FinalIndex; ++i, ++GTI) {
- auto *C = dyn_cast<Constant>(GEP->getOperand(i));
+ // Handle splat constant pointer.
+ if (auto *C = dyn_cast<Constant>(Ptr)) {
+ C = C->getSplatValue();
if (!C)
return false;
- if (isa<VectorType>(C->getType()))
- C = C->getSplatValue();
- auto *CI = dyn_cast_or_null<ConstantInt>(C);
- if (!CI || !CI->isZero())
- return false;
+
+ Base = SDB->getValue(C);
+
+ unsigned NumElts = cast<FixedVectorType>(Ptr->getType())->getNumElements();
+ EVT VT = EVT::getVectorVT(*DAG.getContext(), TLI.getPointerTy(DL), NumElts);
+ Index = DAG.getConstant(0, SDB->getCurSDLoc(), VT);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
+ return true;
}
- // The operands of the GEP may be defined in another basic block.
- // In this case we'll not find nodes for the operands.
- if (!SDB->findValue(Ptr))
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ if (!GEP || GEP->getParent() != CurBB)
return false;
- Constant *C = dyn_cast<Constant>(IndexVal);
- if (!C && !SDB->findValue(IndexVal))
+
+ if (GEP->getNumOperands() != 2)
return false;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- const DataLayout &DL = DAG.getDataLayout();
- StructType *STy = GTI.getStructTypeOrNull();
-
- if (STy) {
- const StructLayout *SL = DL.getStructLayout(STy);
- if (isa<VectorType>(C->getType())) {
- C = C->getSplatValue();
- // FIXME: If getSplatValue may return nullptr for a structure?
- // If not, the following check can be removed.
- if (!C)
- return false;
- }
- auto *CI = cast<ConstantInt>(C);
- Scale = DAG.getTargetConstant(1, SDB->getCurSDLoc(), TLI.getPointerTy(DL));
- Index = DAG.getConstant(SL->getElementOffset(CI->getZExtValue()),
- SDB->getCurSDLoc(), TLI.getPointerTy(DL));
- } else {
- Scale = DAG.getTargetConstant(
- DL.getTypeAllocSize(GEP->getResultElementType()),
- SDB->getCurSDLoc(), TLI.getPointerTy(DL));
- Index = SDB->getValue(IndexVal);
- }
- Base = SDB->getValue(Ptr);
- IndexType = ISD::SIGNED_SCALED;
+ const Value *BasePtr = GEP->getPointerOperand();
+ const Value *IndexVal = GEP->getOperand(GEP->getNumOperands() - 1);
- if (STy || !Index.getValueType().isVector()) {
- unsigned GEPWidth = GEP->getType()->getVectorNumElements();
- EVT VT = EVT::getVectorVT(Context, Index.getValueType(), GEPWidth);
- Index = DAG.getSplatBuildVector(VT, SDLoc(Index), Index);
- }
+ // Make sure the base is scalar and the index is a vector.
+ if (BasePtr->getType()->isVectorTy() || !IndexVal->getType()->isVectorTy())
+ return false;
+
+ Base = SDB->getValue(BasePtr);
+ Index = SDB->getValue(IndexVal);
+ IndexType = ISD::SIGNED_SCALED;
+ Scale = DAG.getTargetConstant(
+ DL.getTypeAllocSize(GEP->getResultElementType()),
+ SDB->getCurSDLoc(), TLI.getPointerTy(DL));
return true;
}
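The rewritten getUniformBase accepts exactly two shapes: a splat constant pointer, or a two-operand GEP defined in the current block with a scalar base and a vector index. A condensed model of that decision, with hypothetical booleans standing in for the IR queries made above (not LLVM API):

  // Each flag mirrors one of the checks performed in getUniformBase.
  struct PtrShape {
    bool IsSplatConstant;   // constant vector pointer with a splat value
    bool IsGEP;             // pointer is defined by a getelementptr
    bool GEPInCurrentBlock; // GEP->getParent() == CurBB
    unsigned GEPNumOperands;
    bool BaseIsVector;      // type of the GEP pointer operand
    bool IndexIsVector;     // type of the last GEP operand
  };

  static bool hasUniformBase(const PtrShape &P) {
    if (P.IsSplatConstant)
      return true; // base = splat value, index = zero vector
    return P.IsGEP && P.GEPInCurrentBlock && P.GEPNumOperands == 2 &&
           !P.BaseIsVector && P.IndexIsVector;
  }

Everything else falls back to the non-uniform path, where the base is a zero constant and the whole vector of pointers becomes the index.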
@@ -4453,9 +4334,9 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDValue Src0 = getValue(I.getArgOperand(0));
SDValue Mask = getValue(I.getArgOperand(3));
EVT VT = Src0.getValueType();
- unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(2)))->getZExtValue();
- if (!Alignment)
- Alignment = DAG.getEVTAlignment(VT);
+ Align Alignment = cast<ConstantInt>(I.getArgOperand(2))
+ ->getMaybeAlignValue()
+ .getValueOr(DAG.getEVTAlign(VT));
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
AAMDNodes AAInfo;
@@ -4465,18 +4346,15 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
SDValue Index;
ISD::MemIndexType IndexType;
SDValue Scale;
- const Value *BasePtr = Ptr;
- bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale,
- this);
-
- const Value *MemOpBasePtr = UniformBase ? BasePtr : nullptr;
- MachineMemOperand *MMO = DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(MemOpBasePtr),
- MachineMemOperand::MOStore,
- // TODO: Make MachineMemOperands aware of scalable
- // vectors.
- VT.getStoreSize().getKnownMinSize(),
- Alignment, AAInfo);
+ bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ I.getParent());
+
+ unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOStore,
+ // TODO: Make MachineMemOperands aware of scalable
+ // vectors.
+ MemoryLocation::UnknownSize, Alignment, AAInfo);
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
Index = getValue(Ptr);
@@ -4493,25 +4371,25 @@ void SelectionDAGBuilder::visitMaskedScatter(const CallInst &I) {
void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
SDLoc sdl = getCurSDLoc();
- auto getMaskedLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
- unsigned& Alignment) {
+ auto getMaskedLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
// @llvm.masked.load.*(Ptr, alignment, Mask, Src0)
Ptr = I.getArgOperand(0);
- Alignment = cast<ConstantInt>(I.getArgOperand(1))->getZExtValue();
+ Alignment = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
Mask = I.getArgOperand(2);
Src0 = I.getArgOperand(3);
};
- auto getExpandingLoadOps = [&](Value* &Ptr, Value* &Mask, Value* &Src0,
- unsigned& Alignment) {
+ auto getExpandingLoadOps = [&](Value *&Ptr, Value *&Mask, Value *&Src0,
+ MaybeAlign &Alignment) {
// @llvm.masked.expandload.*(Ptr, Mask, Src0)
Ptr = I.getArgOperand(0);
- Alignment = 0;
+ Alignment = None;
Mask = I.getArgOperand(1);
Src0 = I.getArgOperand(2);
};
Value *PtrOperand, *MaskOperand, *Src0Operand;
- unsigned Alignment;
+ MaybeAlign Alignment;
if (IsExpanding)
getExpandingLoadOps(PtrOperand, MaskOperand, Src0Operand, Alignment);
else
@@ -4524,7 +4402,7 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
EVT VT = Src0.getValueType();
if (!Alignment)
- Alignment = DAG.getEVTAlignment(VT);
+ Alignment = DAG.getEVTAlign(VT);
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
@@ -4542,14 +4420,11 @@ void SelectionDAGBuilder::visitMaskedLoad(const CallInst &I, bool IsExpanding) {
SDValue InChain = AddToChain ? DAG.getRoot() : DAG.getEntryNode();
- MachineMemOperand *MMO =
- DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(PtrOperand),
- MachineMemOperand::MOLoad,
- // TODO: Make MachineMemOperands aware of scalable
- // vectors.
- VT.getStoreSize().getKnownMinSize(),
- Alignment, AAInfo, Ranges);
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(PtrOperand), MachineMemOperand::MOLoad,
+ // TODO: Make MachineMemOperands aware of scalable
+ // vectors.
+ VT.getStoreSize().getKnownMinSize(), *Alignment, AAInfo, Ranges);
SDValue Load =
DAG.getMaskedLoad(VT, sdl, InChain, Ptr, Offset, Mask, Src0, VT, MMO,
@@ -4569,9 +4444,9 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
- unsigned Alignment = (cast<ConstantInt>(I.getArgOperand(1)))->getZExtValue();
- if (!Alignment)
- Alignment = DAG.getEVTAlignment(VT);
+ Align Alignment = cast<ConstantInt>(I.getArgOperand(1))
+ ->getMaybeAlignValue()
+ .getValueOr(DAG.getEVTAlign(VT));
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
@@ -4582,29 +4457,14 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDValue Index;
ISD::MemIndexType IndexType;
SDValue Scale;
- const Value *BasePtr = Ptr;
- bool UniformBase = getUniformBase(BasePtr, Base, Index, IndexType, Scale,
- this);
- bool ConstantMemory = false;
- if (UniformBase && AA &&
- AA->pointsToConstantMemory(
- MemoryLocation(BasePtr,
- LocationSize::precise(
- DAG.getDataLayout().getTypeStoreSize(I.getType())),
- AAInfo))) {
- // Do not serialize (non-volatile) loads of constant memory with anything.
- Root = DAG.getEntryNode();
- ConstantMemory = true;
- }
-
- MachineMemOperand *MMO =
- DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(UniformBase ? BasePtr : nullptr),
- MachineMemOperand::MOLoad,
- // TODO: Make MachineMemOperands aware of scalable
- // vectors.
- VT.getStoreSize().getKnownMinSize(),
- Alignment, AAInfo, Ranges);
+ bool UniformBase = getUniformBase(Ptr, Base, Index, IndexType, Scale, this,
+ I.getParent());
+ unsigned AS = Ptr->getType()->getScalarType()->getPointerAddressSpace();
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(AS), MachineMemOperand::MOLoad,
+ // TODO: Make MachineMemOperands aware of scalable
+ // vectors.
+ MemoryLocation::UnknownSize, Alignment, AAInfo, Ranges);
if (!UniformBase) {
Base = DAG.getConstant(0, sdl, TLI.getPointerTy(DAG.getDataLayout()));
@@ -4616,9 +4476,7 @@ void SelectionDAGBuilder::visitMaskedGather(const CallInst &I) {
SDValue Gather = DAG.getMaskedGather(DAG.getVTList(VT, MVT::Other), VT, sdl,
Ops, MMO, IndexType);
- SDValue OutChain = Gather.getValue(1);
- if (!ConstantMemory)
- PendingLoads.push_back(OutChain);
+ PendingLoads.push_back(Gather.getValue(1));
setValue(&I, Gather);
}
@@ -4633,19 +4491,14 @@ void SelectionDAGBuilder::visitAtomicCmpXchg(const AtomicCmpXchgInst &I) {
MVT MemVT = getValue(I.getCompareOperand()).getSimpleValueType();
SDVTList VTs = DAG.getVTList(MemVT, MVT::i1, MVT::Other);
- auto Alignment = DAG.getEVTAlignment(MemVT);
-
- auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- if (I.isVolatile())
- Flags |= MachineMemOperand::MOVolatile;
- Flags |= DAG.getTargetLoweringInfo().getMMOFlags(I);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
- Flags, MemVT.getStoreSize(), Alignment,
- AAMDNodes(), nullptr, SSID, SuccessOrdering,
- FailureOrdering);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, SuccessOrdering,
+ FailureOrdering);
SDValue L = DAG.getAtomicCmpSwap(ISD::ATOMIC_CMP_SWAP_WITH_SUCCESS,
dl, MemVT, VTs, InChain,
@@ -4684,18 +4537,13 @@ void SelectionDAGBuilder::visitAtomicRMW(const AtomicRMWInst &I) {
SDValue InChain = getRoot();
auto MemVT = getValue(I.getValOperand()).getSimpleValueType();
- auto Alignment = DAG.getEVTAlignment(MemVT);
-
- auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
- if (I.isVolatile())
- Flags |= MachineMemOperand::MOVolatile;
- Flags |= DAG.getTargetLoweringInfo().getMMOFlags(I);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ auto Flags = TLI.getAtomicMemOperandFlags(I, DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), Flags,
- MemVT.getStoreSize(), Alignment, AAMDNodes(),
- nullptr, SSID, Ordering);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ DAG.getEVTAlign(MemVT), AAMDNodes(), nullptr, SSID, Ordering);
SDValue L =
DAG.getAtomic(NT, dl, MemVT, InChain,
@@ -4735,24 +4583,11 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
I.getAlignment() < MemVT.getSizeInBits() / 8)
report_fatal_error("Cannot generate unaligned atomic load");
- auto Flags = MachineMemOperand::MOLoad;
- if (I.isVolatile())
- Flags |= MachineMemOperand::MOVolatile;
- if (I.hasMetadata(LLVMContext::MD_invariant_load))
- Flags |= MachineMemOperand::MOInvariant;
- if (isDereferenceablePointer(I.getPointerOperand(), I.getType(),
- DAG.getDataLayout()))
- Flags |= MachineMemOperand::MODereferenceable;
-
- Flags |= TLI.getMMOFlags(I);
-
- MachineMemOperand *MMO =
- DAG.getMachineFunction().
- getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()),
- Flags, MemVT.getStoreSize(),
- I.getAlignment() ? I.getAlignment() :
- DAG.getEVTAlignment(MemVT),
- AAMDNodes(), nullptr, SSID, Order);
+ auto Flags = TLI.getLoadMemOperandFlags(I, DAG.getDataLayout());
+
+ MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ I.getAlign(), AAMDNodes(), nullptr, SSID, Order);
InChain = TLI.prepareVolatileOrAtomicLoad(InChain, dl, DAG);
@@ -4773,7 +4608,7 @@ void SelectionDAGBuilder::visitAtomicLoad(const LoadInst &I) {
PendingLoads.push_back(OutChain);
return;
}
-
+
SDValue L = DAG.getAtomic(ISD::ATOMIC_LOAD, dl, MemVT, MemVT, InChain,
Ptr, MMO);
@@ -4800,16 +4635,12 @@ void SelectionDAGBuilder::visitAtomicStore(const StoreInst &I) {
if (I.getAlignment() < MemVT.getSizeInBits() / 8)
report_fatal_error("Cannot generate unaligned atomic store");
- auto Flags = MachineMemOperand::MOStore;
- if (I.isVolatile())
- Flags |= MachineMemOperand::MOVolatile;
- Flags |= TLI.getMMOFlags(I);
+ auto Flags = TLI.getStoreMemOperandFlags(I, DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo(I.getPointerOperand()), Flags,
- MemVT.getStoreSize(), I.getAlignment(), AAMDNodes(),
- nullptr, SSID, Ordering);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(I.getPointerOperand()), Flags, MemVT.getStoreSize(),
+ I.getAlign(), AAMDNodes(), nullptr, SSID, Ordering);
SDValue Val = getValue(I.getValueOperand());
if (Val.getValueType() != MemVT)
@@ -4899,10 +4730,10 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
// This is a target intrinsic that touches memory
AAMDNodes AAInfo;
I.getAAMetadata(AAInfo);
- Result = DAG.getMemIntrinsicNode(
- Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
- MachinePointerInfo(Info.ptrVal, Info.offset),
- Info.align ? Info.align->value() : 0, Info.flags, Info.size, AAInfo);
+ Result =
+ DAG.getMemIntrinsicNode(Info.opc, getCurSDLoc(), VTs, Ops, Info.memVT,
+ MachinePointerInfo(Info.ptrVal, Info.offset),
+ Info.align, Info.flags, Info.size, AAInfo);
} else if (!HasChain) {
Result = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, getCurSDLoc(), VTs, Ops);
} else if (!I.getType()->isVoidTy()) {
@@ -4926,6 +4757,15 @@ void SelectionDAGBuilder::visitTargetIntrinsic(const CallInst &I,
} else
Result = lowerRangeToAssertZExt(DAG, I, Result);
+ MaybeAlign Alignment = I.getRetAlign();
+ if (!Alignment)
+ Alignment = F->getAttributes().getRetAlignment();
+ // Insert `assertalign` node if there's an alignment.
+ if (InsertAssertAlign && Alignment) {
+ Result =
+ DAG.getAssertAlign(getCurSDLoc(), Result, Alignment.valueOrOne());
+ }
+
setValue(&I, Result);
}
}
@@ -5465,7 +5305,8 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
SDValue LHS, SDValue RHS, SDValue Scale,
SelectionDAG &DAG, const TargetLowering &TLI) {
EVT VT = LHS.getValueType();
- bool Signed = Opcode == ISD::SDIVFIX;
+ bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+ bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
LLVMContext &Ctx = *DAG.getContext();
// If the type is legal but the operation isn't, this node might survive all
@@ -5477,14 +5318,16 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
// by bumping the size by one bit. This will force it to Promote, enabling the
// early expansion and avoiding the need to expand later.
- // We don't have to do this if Scale is 0; that can always be expanded.
+ // We don't have to do this if Scale is 0; that can always be expanded, unless
+ // it's a saturating signed operation. Those can experience true integer
+ // division overflow, a case which we must avoid.
// FIXME: We wouldn't have to do this (or any of the early
// expansion/promotion) if it was possible to expand a libcall of an
// illegal type during operation legalization. But it's not, so things
// get a bit hacky.
unsigned ScaleInt = cast<ConstantSDNode>(Scale)->getZExtValue();
- if (ScaleInt > 0 &&
+ if ((ScaleInt > 0 || (Saturating && Signed)) &&
(TLI.isTypeLegal(VT) ||
(VT.isVector() && TLI.isTypeLegal(VT.getVectorElementType())))) {
TargetLowering::LegalizeAction Action = TLI.getFixedPointOperationAction(
@@ -5506,8 +5349,16 @@ static SDValue expandDivFix(unsigned Opcode, const SDLoc &DL,
LHS = DAG.getZExtOrTrunc(LHS, DL, PromVT);
RHS = DAG.getZExtOrTrunc(RHS, DL, PromVT);
}
- // TODO: Saturation.
+ EVT ShiftTy = TLI.getShiftAmountTy(PromVT, DAG.getDataLayout());
+ // For saturating operations, we need to shift up the LHS to get the
+ // proper saturation width, and then shift down again afterwards.
+ if (Saturating)
+ LHS = DAG.getNode(ISD::SHL, DL, PromVT, LHS,
+ DAG.getConstant(1, DL, ShiftTy));
SDValue Res = DAG.getNode(Opcode, DL, PromVT, LHS, RHS, Scale);
+ if (Saturating)
+ Res = DAG.getNode(Signed ? ISD::SRA : ISD::SRL, DL, PromVT, Res,
+ DAG.getConstant(1, DL, ShiftTy));
return DAG.getZExtOrTrunc(Res, DL, VT);
}
}
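As a rough scalar model of what the saturating fixed-point division above computes (illustrative only: this is not the DAG expansion, the helper name and fixed 32/64-bit widths are assumptions, and the intrinsic's exact rounding rules are ignored), the operation is (LHS << Scale) / RHS clamped to the result type's range:

#include <algorithm>
#include <cassert>
#include <cstdint>

// Illustrative scalar sketch of a signed saturating fixed-point divide.
// The 64-bit intermediate plays the role of the wider promoted type used
// in the expansion above.
static int32_t SDivFixSat32(int32_t LHS, int32_t RHS, unsigned Scale) {
  assert(RHS != 0 && Scale < 32 && "illustrative preconditions only");
  int64_t Q = (int64_t(LHS) * (int64_t(1) << Scale)) / RHS;
  Q = std::clamp<int64_t>(Q, INT32_MIN, INT32_MAX);
  return static_cast<int32_t>(Q);
}

Even with Scale == 0 the signed saturating form can still clamp: SDivFixSat32(INT32_MIN, -1, 0) yields INT32_MAX, which is the "true integer division overflow" case the comment above guards against by forcing early expansion.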
@@ -5622,6 +5473,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
MachineFunction &MF = DAG.getMachineFunction();
const TargetInstrInfo *TII = DAG.getSubtarget().getInstrInfo();
+ bool IsIndirect = false;
Optional<MachineOperand> Op;
// Some arguments' frame index is recorded during argument lowering.
int FI = FuncInfo.getArgumentFrameIndex(Arg);
@@ -5643,6 +5495,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
}
if (Reg) {
Op = MachineOperand::CreateReg(Reg, false);
+ IsIndirect = IsDbgDeclare;
}
}
@@ -5691,13 +5544,13 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
}
assert(!IsDbgDeclare && "DbgDeclare operand is not in memory?");
FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), false,
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsDbgDeclare,
RegAndSize.first, Variable, *FragmentExpr));
}
};
// Check if ValueMap has reg number.
- DenseMap<const Value *, unsigned>::const_iterator
+ DenseMap<const Value *, Register>::const_iterator
VMI = FuncInfo.ValueMap.find(V);
if (VMI != FuncInfo.ValueMap.end()) {
const auto &TLI = DAG.getTargetLoweringInfo();
@@ -5709,6 +5562,7 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
}
Op = MachineOperand::CreateReg(VMI->second, false);
+ IsIndirect = IsDbgDeclare;
} else if (ArgRegsAndSizes.size() > 1) {
// This was split due to the calling convention, and no virtual register
// mapping exists for the value.
@@ -5722,28 +5576,9 @@ bool SelectionDAGBuilder::EmitFuncArgumentDbgValue(
assert(Variable->isValidLocationForIntrinsic(DL) &&
"Expected inlined-at fields to agree");
-
- // If the argument arrives in a stack slot, then what the IR thought was a
- // normal Value is actually in memory, and we must add a deref to load it.
- if (Op->isFI()) {
- int FI = Op->getIndex();
- unsigned Size = DAG.getMachineFunction().getFrameInfo().getObjectSize(FI);
- if (Expr->isImplicit()) {
- SmallVector<uint64_t, 2> Ops = {dwarf::DW_OP_deref_size, Size};
- Expr = DIExpression::prependOpcodes(Expr, Ops);
- } else {
- Expr = DIExpression::prepend(Expr, DIExpression::DerefBefore);
- }
- }
-
- // If this location was specified with a dbg.declare, then it and its
- // expression calculate the address of the variable. Append a deref to
- // force it to be a memory location.
- if (IsDbgDeclare)
- Expr = DIExpression::append(Expr, {dwarf::DW_OP_deref});
-
+ IsIndirect = (Op->isReg()) ? IsIndirect : true;
FuncInfo.ArgDbgValues.push_back(
- BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), false,
+ BuildMI(MF, DL, TII->get(TargetOpcode::DBG_VALUE), IsIndirect,
*Op, Variable, Expr));
return true;
@@ -5787,6 +5622,10 @@ static unsigned FixedPointIntrinsicToOpcode(unsigned Intrinsic) {
return ISD::SDIVFIX;
case Intrinsic::udiv_fix:
return ISD::UDIVFIX;
+ case Intrinsic::sdiv_fix_sat:
+ return ISD::SDIVFIXSAT;
+ case Intrinsic::udiv_fix_sat:
+ return ISD::UDIVFIXSAT;
default:
llvm_unreachable("Unhandled fixed point intrinsic");
}
@@ -5798,7 +5637,24 @@ void SelectionDAGBuilder::lowerCallToExternalSymbol(const CallInst &I,
SDValue Callee = DAG.getExternalSymbol(
FunctionName,
DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
- LowerCallTo(&I, Callee, I.isTailCall());
+ LowerCallTo(I, Callee, I.isTailCall());
+}
+
+/// Given a @llvm.call.preallocated.setup, return the corresponding
+/// preallocated call.
+static const CallBase *FindPreallocatedCall(const Value *PreallocatedSetup) {
+ assert(cast<CallBase>(PreallocatedSetup)
+ ->getCalledFunction()
+ ->getIntrinsicID() == Intrinsic::call_preallocated_setup &&
+ "expected call_preallocated_setup Value");
+ for (auto *U : PreallocatedSetup->users()) {
+ auto *UseCall = cast<CallBase>(U);
+ const Function *Fn = UseCall->getCalledFunction();
+ if (!Fn || Fn->getIntrinsicID() != Intrinsic::call_preallocated_arg) {
+ return UseCall;
+ }
+ }
+ llvm_unreachable("expected corresponding call to preallocated setup/arg");
}
/// Lower the call to the specified intrinsic function.
@@ -5814,6 +5670,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// By default, turn this into a target intrinsic node.
visitTargetIntrinsic(I, Intrinsic);
return;
+ case Intrinsic::vscale: {
+ match(&I, m_VScale(DAG.getDataLayout()));
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ setValue(&I,
+ DAG.getVScale(getCurSDLoc(), VT, APInt(VT.getSizeInBits(), 1)));
+ return;
+ }
case Intrinsic::vastart: visitVAStart(I); return;
case Intrinsic::vaend: visitVAEnd(I); return;
case Intrinsic::vacopy: visitVACopy(I); return;
@@ -5835,6 +5698,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
TLI.getFrameIndexTy(DAG.getDataLayout()),
getValue(I.getArgOperand(0))));
return;
+ case Intrinsic::read_volatile_register:
case Intrinsic::read_register: {
Value *Reg = I.getArgOperand(0);
SDValue Chain = getRoot();
@@ -5863,16 +5727,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
// @llvm.memcpy defines 0 and 1 to both mean no alignment.
- unsigned DstAlign = std::max<unsigned>(MCI.getDestAlignment(), 1);
- unsigned SrcAlign = std::max<unsigned>(MCI.getSourceAlignment(), 1);
- unsigned Align = MinAlign(DstAlign, SrcAlign);
+ Align DstAlign = MCI.getDestAlign().valueOrOne();
+ Align SrcAlign = MCI.getSourceAlign().valueOrOne();
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
bool isVol = MCI.isVolatile();
- bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
// FIXME: Support passing different dest/src alignments to the memcpy DAG
// node.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Align, isVol,
- false, isTC,
+ SDValue MC = DAG.getMemcpy(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
+ /* AlwaysInline */ false, isTC,
+ MachinePointerInfo(I.getArgOperand(0)),
+ MachinePointerInfo(I.getArgOperand(1)));
+ updateDAGForMaybeTailCall(MC);
+ return;
+ }
+ case Intrinsic::memcpy_inline: {
+ const auto &MCI = cast<MemCpyInlineInst>(I);
+ SDValue Dst = getValue(I.getArgOperand(0));
+ SDValue Src = getValue(I.getArgOperand(1));
+ SDValue Size = getValue(I.getArgOperand(2));
+ assert(isa<ConstantSDNode>(Size) && "memcpy_inline needs constant size");
+ // @llvm.memcpy.inline defines 0 and 1 to both mean no alignment.
+ Align DstAlign = MCI.getDestAlign().valueOrOne();
+ Align SrcAlign = MCI.getSourceAlign().valueOrOne();
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
+ bool isVol = MCI.isVolatile();
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
+ // FIXME: Support passing different dest/src alignments to the memcpy DAG
+ // node.
+ SDValue MC = DAG.getMemcpy(getRoot(), sdl, Dst, Src, Size, Alignment, isVol,
+ /* AlwaysInline */ true, isTC,
MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
updateDAGForMaybeTailCall(MC);
@@ -5884,12 +5769,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
// @llvm.memset defines 0 and 1 to both mean no alignment.
- unsigned Align = std::max<unsigned>(MSI.getDestAlignment(), 1);
+ Align Alignment = MSI.getDestAlign().valueOrOne();
bool isVol = MSI.isVolatile();
- bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Align, isVol,
- isTC, MachinePointerInfo(I.getArgOperand(0)));
+ SDValue MS = DAG.getMemset(Root, sdl, Op1, Op2, Op3, Alignment, isVol, isTC,
+ MachinePointerInfo(I.getArgOperand(0)));
updateDAGForMaybeTailCall(MS);
return;
}
@@ -5899,15 +5784,15 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
// @llvm.memmove defines 0 and 1 to both mean no alignment.
- unsigned DstAlign = std::max<unsigned>(MMI.getDestAlignment(), 1);
- unsigned SrcAlign = std::max<unsigned>(MMI.getSourceAlignment(), 1);
- unsigned Align = MinAlign(DstAlign, SrcAlign);
+ Align DstAlign = MMI.getDestAlign().valueOrOne();
+ Align SrcAlign = MMI.getSourceAlign().valueOrOne();
+ Align Alignment = commonAlignment(DstAlign, SrcAlign);
bool isVol = MMI.isVolatile();
- bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
// FIXME: Support passing different dest/src alignments to the memmove DAG
// node.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Align, isVol,
+ SDValue MM = DAG.getMemmove(Root, sdl, Op1, Op2, Op3, Alignment, isVol,
isTC, MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
updateDAGForMaybeTailCall(MM);
@@ -5923,7 +5808,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
unsigned SrcAlign = MI.getSourceAlignment();
Type *LengthTy = MI.getLength()->getType();
unsigned ElemSz = MI.getElementSizeInBytes();
- bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
SDValue MC = DAG.getAtomicMemcpy(getRoot(), sdl, Dst, DstAlign, Src,
SrcAlign, Length, LengthTy, ElemSz, isTC,
MachinePointerInfo(MI.getRawDest()),
@@ -5941,7 +5826,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
unsigned SrcAlign = MI.getSourceAlignment();
Type *LengthTy = MI.getLength()->getType();
unsigned ElemSz = MI.getElementSizeInBytes();
- bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
SDValue MC = DAG.getAtomicMemmove(getRoot(), sdl, Dst, DstAlign, Src,
SrcAlign, Length, LengthTy, ElemSz, isTC,
MachinePointerInfo(MI.getRawDest()),
@@ -5958,13 +5843,37 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
unsigned DstAlign = MI.getDestAlignment();
Type *LengthTy = MI.getLength()->getType();
unsigned ElemSz = MI.getElementSizeInBytes();
- bool isTC = I.isTailCall() && isInTailCallPosition(&I, DAG.getTarget());
+ bool isTC = I.isTailCall() && isInTailCallPosition(I, DAG.getTarget());
SDValue MC = DAG.getAtomicMemset(getRoot(), sdl, Dst, DstAlign, Val, Length,
LengthTy, ElemSz, isTC,
MachinePointerInfo(MI.getRawDest()));
updateDAGForMaybeTailCall(MC);
return;
}
+ case Intrinsic::call_preallocated_setup: {
+ const CallBase *PreallocatedCall = FindPreallocatedCall(&I);
+ SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
+ SDValue Res = DAG.getNode(ISD::PREALLOCATED_SETUP, sdl, MVT::Other,
+ getRoot(), SrcValue);
+ setValue(&I, Res);
+ DAG.setRoot(Res);
+ return;
+ }
+ case Intrinsic::call_preallocated_arg: {
+ const CallBase *PreallocatedCall = FindPreallocatedCall(I.getOperand(0));
+ SDValue SrcValue = DAG.getSrcValue(PreallocatedCall);
+ SDValue Ops[3];
+ Ops[0] = getRoot();
+ Ops[1] = SrcValue;
+ Ops[2] = DAG.getTargetConstant(*cast<ConstantInt>(I.getArgOperand(1)), sdl,
+ MVT::i32); // arg index
+ SDValue Res = DAG.getNode(
+ ISD::PREALLOCATED_ARG, sdl,
+ DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Ops);
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
+ return;
+ }
case Intrinsic::dbg_addr:
case Intrinsic::dbg_declare: {
const auto &DI = cast<DbgVariableIntrinsic>(I);
@@ -5972,12 +5881,14 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DIExpression *Expression = DI.getExpression();
dropDanglingDebugInfo(Variable, Expression);
assert(Variable && "Missing variable");
-
+ LLVM_DEBUG(dbgs() << "SelectionDAG visiting debug intrinsic: " << DI
+ << "\n");
// Check if address has undef value.
const Value *Address = DI.getVariableLocation();
if (!Address || isa<UndefValue>(Address) ||
(Address->use_empty() && !isa<Argument>(Address))) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI
+ << " (bad/undef/unused-arg address)\n");
return;
}
@@ -6006,6 +5917,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDDbgValue *SDV = DAG.getFrameIndexDbgValue(
Variable, Expression, FI, /*IsIndirect*/ true, dl, SDNodeOrder);
DAG.AddDbgValue(SDV, getRoot().getNode(), isParameter);
+ } else {
+ LLVM_DEBUG(dbgs() << "Skipping " << DI
+ << " (variable info stashed in MF side table)\n");
}
return;
}
@@ -6040,7 +5954,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// virtual register info from the FuncInfo.ValueMap.
if (!EmitFuncArgumentDbgValue(Address, Variable, Expression, dl, true,
N)) {
- LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI << "\n");
+ LLVM_DEBUG(dbgs() << "Dropping debug info for " << DI
+ << " (could not emit func-arg dbg_value)\n");
}
}
return;
@@ -6192,6 +6107,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::rint:
case Intrinsic::nearbyint:
case Intrinsic::round:
+ case Intrinsic::roundeven:
case Intrinsic::canonicalize: {
unsigned Opcode;
switch (Intrinsic) {
@@ -6206,6 +6122,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::rint: Opcode = ISD::FRINT; break;
case Intrinsic::nearbyint: Opcode = ISD::FNEARBYINT; break;
case Intrinsic::round: Opcode = ISD::FROUND; break;
+ case Intrinsic::roundeven: Opcode = ISD::FROUNDEVEN; break;
case Intrinsic::canonicalize: Opcode = ISD::FCANONICALIZE; break;
}
@@ -6269,7 +6186,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
getValue(I.getArgOperand(1)),
getValue(I.getArgOperand(2))));
return;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(I));
@@ -6456,7 +6373,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::sdiv_fix:
- case Intrinsic::udiv_fix: {
+ case Intrinsic::udiv_fix:
+ case Intrinsic::sdiv_fix_sat:
+ case Intrinsic::udiv_fix_sat: {
SDValue Op1 = getValue(I.getArgOperand(0));
SDValue Op2 = getValue(I.getArgOperand(1));
SDValue Op3 = getValue(I.getArgOperand(2));
@@ -6466,9 +6385,8 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
}
case Intrinsic::stacksave: {
SDValue Op = getRoot();
- Res = DAG.getNode(
- ISD::STACKSAVE, sdl,
- DAG.getVTList(TLI.getPointerTy(DAG.getDataLayout()), MVT::Other), Op);
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ Res = DAG.getNode(ISD::STACKSAVE, sdl, DAG.getVTList(VT, MVT::Other), Op);
setValue(&I, Res);
DAG.setRoot(Res.getValue(1));
return;
@@ -6479,7 +6397,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
case Intrinsic::get_dynamic_area_offset: {
SDValue Op = getRoot();
- EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
+ EVT PtrTy = TLI.getFrameIndexTy(DAG.getDataLayout());
EVT ResTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
// Result type for @llvm.get.dynamic.area.offset should match PtrTy for
// target.
@@ -6493,13 +6411,13 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
}
case Intrinsic::stackguard: {
- EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
MachineFunction &MF = DAG.getMachineFunction();
const Module &M = *MF.getFunction().getParent();
SDValue Chain = getRoot();
if (TLI.useLoadStackGuardNode()) {
Res = getLoadStackGuard(DAG, sdl, Chain);
} else {
+ EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType());
const Value *Global = TLI.getSDagStackGuard(M);
unsigned Align = DL->getPrefTypeAlignment(Global->getType());
Res = DAG.getLoad(PtrTy, sdl, Chain, getValue(Global),
@@ -6516,7 +6434,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
// Emit code into the DAG to store the stack guard onto the stack.
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- EVT PtrTy = TLI.getPointerTy(DAG.getDataLayout());
SDValue Src, Chain = getRoot();
if (TLI.useLoadStackGuardNode())
@@ -6528,6 +6445,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
int FI = FuncInfo.StaticAllocaMap[Slot];
MFI.setStackProtectorIndex(FI);
+ EVT PtrTy = TLI.getFrameIndexTy(DAG.getDataLayout());
SDValue FIN = DAG.getFrameIndex(FI, PtrTy);
@@ -6606,7 +6524,9 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::gcwrite:
llvm_unreachable("GC failed to lower gcread/gcwrite intrinsics!");
case Intrinsic::flt_rounds:
- setValue(&I, DAG.getNode(ISD::FLT_ROUNDS_, sdl, MVT::i32));
+ Res = DAG.getNode(ISD::FLT_ROUNDS_, sdl, {MVT::i32, MVT::Other}, getRoot());
+ setValue(&I, Res);
+ DAG.setRoot(Res.getValue(1));
return;
case Intrinsic::expect:
@@ -6678,12 +6598,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
Ops[2] = getValue(I.getArgOperand(1));
Ops[3] = getValue(I.getArgOperand(2));
Ops[4] = getValue(I.getArgOperand(3));
- SDValue Result = DAG.getMemIntrinsicNode(ISD::PREFETCH, sdl,
- DAG.getVTList(MVT::Other), Ops,
- EVT::getIntegerVT(*Context, 8),
- MachinePointerInfo(I.getArgOperand(0)),
- 0, /* align */
- Flags);
+ SDValue Result = DAG.getMemIntrinsicNode(
+ ISD::PREFETCH, sdl, DAG.getVTList(MVT::Other), Ops,
+ EVT::getIntegerVT(*Context, 8), MachinePointerInfo(I.getArgOperand(0)),
+ /* align */ None, Flags);
// Chain the prefetch in parallel with any pending loads, to stay out of
// the way of later optimizations.
@@ -6750,10 +6668,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
return;
case Intrinsic::experimental_patchpoint_void:
case Intrinsic::experimental_patchpoint_i64:
- visitPatchpoint(&I);
+ visitPatchpoint(I);
return;
case Intrinsic::experimental_gc_statepoint:
- LowerStatepoint(ImmutableStatepoint(&I));
+ LowerStatepoint(cast<GCStatepointInst>(I));
return;
case Intrinsic::experimental_gc_result:
visitGCResult(cast<GCResultInst>(I));
@@ -6794,7 +6712,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
case Intrinsic::localrecover: {
// i8* @llvm.localrecover(i8* %fn, i8* %fp, i32 %idx)
MachineFunction &MF = DAG.getMachineFunction();
- MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout(), 0);
// Get the symbol that defines the frame offset.
auto *Fn = cast<Function>(I.getArgOperand(0)->stripPointerCasts());
@@ -6805,6 +6722,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
MF.getMMI().getContext().getOrCreateFrameAllocSymbol(
GlobalValue::dropLLVMManglingEscape(Fn->getName()), IdxVal);
+ Value *FP = I.getArgOperand(1);
+ SDValue FPVal = getValue(FP);
+ EVT PtrVT = FPVal.getValueType();
+
// Create a MCSymbol for the label to avoid any target lowering
// that would make this PC relative.
SDValue OffsetSym = DAG.getMCSymbol(FrameAllocSym, PtrVT);
@@ -6812,8 +6733,6 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
DAG.getNode(ISD::LOCAL_RECOVER, sdl, PtrVT, OffsetSym);
// Add the offset to the FP.
- Value *FP = I.getArgOperand(1);
- SDValue FPVal = getValue(FP);
SDValue Add = DAG.getMemBasePlusOffset(FPVal, OffsetVal, sdl);
setValue(&I, Add);
@@ -6996,11 +6915,42 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I,
SDValue Ptr = getValue(I.getOperand(0));
SDValue Const = getValue(I.getOperand(1));
- EVT DestVT =
- EVT(DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout()));
+ EVT PtrVT = Ptr.getValueType();
+ setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), PtrVT, Ptr,
+ DAG.getZExtOrTrunc(Const, getCurSDLoc(), PtrVT)));
+ return;
+ }
+ case Intrinsic::get_active_lane_mask: {
+ auto DL = getCurSDLoc();
+ SDValue Index = getValue(I.getOperand(0));
+ SDValue BTC = getValue(I.getOperand(1));
+ Type *ElementTy = I.getOperand(0)->getType();
+ EVT VT = TLI.getValueType(DAG.getDataLayout(), I.getType());
+ unsigned VecWidth = VT.getVectorNumElements();
+
+ SmallVector<SDValue, 16> OpsBTC;
+ SmallVector<SDValue, 16> OpsIndex;
+ SmallVector<SDValue, 16> OpsStepConstants;
+ for (unsigned i = 0; i < VecWidth; i++) {
+ OpsBTC.push_back(BTC);
+ OpsIndex.push_back(Index);
+ OpsStepConstants.push_back(DAG.getConstant(i, DL, MVT::getVT(ElementTy)));
+ }
- setValue(&I, DAG.getNode(ISD::AND, getCurSDLoc(), DestVT, Ptr,
- DAG.getZExtOrTrunc(Const, getCurSDLoc(), DestVT)));
+ EVT CCVT = MVT::i1;
+ CCVT = EVT::getVectorVT(I.getContext(), CCVT, VecWidth);
+
+ auto VecTy = MVT::getVT(FixedVectorType::get(ElementTy, VecWidth));
+ SDValue VectorIndex = DAG.getBuildVector(VecTy, DL, OpsIndex);
+ SDValue VectorStep = DAG.getBuildVector(VecTy, DL, OpsStepConstants);
+ SDValue VectorInduction = DAG.getNode(
+ ISD::UADDO, DL, DAG.getVTList(VecTy, CCVT), VectorIndex, VectorStep);
+ SDValue VectorBTC = DAG.getBuildVector(VecTy, DL, OpsBTC);
+ SDValue SetCC = DAG.getSetCC(DL, CCVT, VectorInduction.getValue(0),
+ VectorBTC, ISD::CondCode::SETULE);
+ setValue(&I, DAG.getNode(ISD::AND, DL, CCVT,
+ DAG.getNOT(DL, VectorInduction.getValue(1), CCVT),
+ SetCC));
return;
}
}
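For reference, a scalar sketch of the per-lane predicate this expansion builds (illustrative only; the helper and the fixed 64-bit element type are assumptions, not LLVM code): a lane of @llvm.get.active.lane.mask is active when Index plus the lane number neither wraps (the UADDO overflow bit, negated above) nor exceeds the backedge-taken count:

#include <cstdint>

// Scalar form of the vector predicate built above: active iff the lane's
// induction value does not wrap and is <= BTC.
static bool LaneActive(uint64_t Index, uint64_t BTC, unsigned Lane) {
  uint64_t Sum = Index + Lane;     // VectorIndex + VectorStep for this lane
  bool Overflow = Sum < Index;     // UADDO's overflow result
  return !Overflow && Sum <= BTC;  // AND(NOT(overflow), SETULE)
}

For example, with Index = 2, BTC = 3 and a 4-wide mask this produces <1,1,0,0>.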
@@ -7032,14 +6982,67 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
Opers.push_back(getValue(FPI.getArgOperand(1)));
}
+ auto pushOutChain = [this](SDValue Result, fp::ExceptionBehavior EB) {
+ assert(Result.getNode()->getNumValues() == 2);
+
+ // Push node to the appropriate list so that future instructions can be
+ // chained up correctly.
+ SDValue OutChain = Result.getValue(1);
+ switch (EB) {
+ case fp::ExceptionBehavior::ebIgnore:
+ // The only reason why ebIgnore nodes still need to be chained is that
+ // they might depend on the current rounding mode, and therefore must
+ // not be moved across instructions that may change that mode.
+ LLVM_FALLTHROUGH;
+ case fp::ExceptionBehavior::ebMayTrap:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks.
+ PendingConstrainedFP.push_back(OutChain);
+ break;
+ case fp::ExceptionBehavior::ebStrict:
+ // These must not be moved across calls or instructions that may change
+ // floating-point exception masks or read floating-point exception flags.
+ // In addition, they cannot be optimized out even if unused.
+ PendingConstrainedFPStrict.push_back(OutChain);
+ break;
+ }
+ };
+
+ SDVTList VTs = DAG.getVTList(ValueVTs);
+ fp::ExceptionBehavior EB = FPI.getExceptionBehavior().getValue();
+
+ SDNodeFlags Flags;
+ if (EB == fp::ExceptionBehavior::ebIgnore)
+ Flags.setNoFPExcept(true);
+
+ if (auto *FPOp = dyn_cast<FPMathOperator>(&FPI))
+ Flags.copyFMF(*FPOp);
+
unsigned Opcode;
switch (FPI.getIntrinsicID()) {
default: llvm_unreachable("Impossible intrinsic"); // Can't reach here.
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
case Intrinsic::INTRINSIC: \
Opcode = ISD::STRICT_##DAGN; \
break;
#include "llvm/IR/ConstrainedOps.def"
+ case Intrinsic::experimental_constrained_fmuladd: {
+ Opcode = ISD::STRICT_FMA;
+ // Break fmuladd into fmul and fadd.
+ if (TM.Options.AllowFPOpFusion == FPOpFusion::Strict ||
+ !TLI.isFMAFasterThanFMulAndFAdd(DAG.getMachineFunction(),
+ ValueVTs[0])) {
+ Opers.pop_back();
+ SDValue Mul = DAG.getNode(ISD::STRICT_FMUL, sdl, VTs, Opers, Flags);
+ pushOutChain(Mul, EB);
+ Opcode = ISD::STRICT_FADD;
+ Opers.clear();
+ Opers.push_back(Mul.getValue(1));
+ Opers.push_back(Mul.getValue(0));
+ Opers.push_back(getValue(FPI.getArgOperand(2)));
+ }
+ break;
+ }
}
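Where fusion is disallowed or an FMA is not profitable, the constrained fmuladd above is split into a separate strict multiply and add; numerically that differs from the fused form by one extra rounding. A standalone illustration of that difference (plain C++, unrelated to the strict exception chaining itself):

#include <cmath>
#include <cstdio>

int main() {
  // Fused (std::fma, one rounding) vs. unfused (two roundings), the same
  // trade-off the AllowFPOpFusion/isFMAFasterThanFMulAndFAdd check makes.
  // Build with contraction disabled (e.g. -ffp-contract=off) so the
  // unfused expression is not re-fused by the compiler.
  double a = 1.0 + 0x1p-52, b = 1.0 - 0x1p-52, c = -1.0;
  std::printf("fused:   %a\n", std::fma(a, b, c)); // -0x1p-104
  std::printf("unfused: %a\n", a * b + c);         // 0x0p+0
  return 0;
}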
// A few strict DAG nodes carry additional operands that are not
@@ -7058,32 +7061,8 @@ void SelectionDAGBuilder::visitConstrainedFPIntrinsic(
}
}
- SDVTList VTs = DAG.getVTList(ValueVTs);
- SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers);
-
- assert(Result.getNode()->getNumValues() == 2);
-
- // Push node to the appropriate list so that future instructions can be
- // chained up correctly.
- SDValue OutChain = Result.getValue(1);
- switch (FPI.getExceptionBehavior().getValue()) {
- case fp::ExceptionBehavior::ebIgnore:
- // The only reason why ebIgnore nodes still need to be chained is that
- // they might depend on the current rounding mode, and therefore must
- // not be moved across instruction that may change that mode.
- LLVM_FALLTHROUGH;
- case fp::ExceptionBehavior::ebMayTrap:
- // These must not be moved across calls or instructions that may change
- // floating-point exception masks.
- PendingConstrainedFP.push_back(OutChain);
- break;
- case fp::ExceptionBehavior::ebStrict:
- // These must not be moved across calls or instructions that may change
- // floating-point exception masks or read floating-point exception flags.
- // In addition, they cannot be optimized out even if unused.
- PendingConstrainedFPStrict.push_back(OutChain);
- break;
- }
+ SDValue Result = DAG.getNode(Opcode, sdl, VTs, Opers, Flags);
+ pushOutChain(Result, EB);
SDValue FPResult = Result.getValue(0);
setValue(&FPI, FPResult);
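The chaining rationale above (even fpexcept.ignore nodes depend on the dynamic rounding mode) can be seen with a small standalone program; this is not LLVM code, purely an illustration of why such nodes must not drift across anything that changes the mode:

#include <cfenv>
#include <cmath>
#include <cstdio>

int main() {
  // The same otherwise-pure operation yields different results under
  // different rounding modes; compilers may additionally need
  // FENV_ACCESS / -frounding-math to honour fesetround at all.
  std::fesetround(FE_UPWARD);
  std::printf("%.1f\n", std::nearbyint(2.25)); // 3.0
  std::fesetround(FE_DOWNWARD);
  std::printf("%.1f\n", std::nearbyint(2.25)); // 2.0
  return 0;
}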
@@ -7150,10 +7129,9 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
// There is a platform (e.g. wasm) that uses funclet style IR but does not
// actually use outlined funclets and their LSDA info style.
if (MF.hasEHFunclets() && isFuncletEHPersonality(Pers)) {
- assert(CLI.CS);
+ assert(CLI.CB);
WinEHFuncInfo *EHInfo = DAG.getMachineFunction().getWinEHFuncInfo();
- EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CS.getInstruction()),
- BeginLabel, EndLabel);
+ EHInfo->addIPToStateRange(cast<InvokeInst>(CLI.CB), BeginLabel, EndLabel);
} else if (!isScopedEHPersonality(Pers)) {
MF.addInvoke(FuncInfo.MBBMap[EHPadBB], BeginLabel, EndLabel);
}
@@ -7162,15 +7140,15 @@ SelectionDAGBuilder::lowerInvokable(TargetLowering::CallLoweringInfo &CLI,
return Result;
}
-void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
+void SelectionDAGBuilder::LowerCallTo(const CallBase &CB, SDValue Callee,
bool isTailCall,
const BasicBlock *EHPadBB) {
auto &DL = DAG.getDataLayout();
- FunctionType *FTy = CS.getFunctionType();
- Type *RetTy = CS.getType();
+ FunctionType *FTy = CB.getFunctionType();
+ Type *RetTy = CB.getType();
TargetLowering::ArgListTy Args;
- Args.reserve(CS.arg_size());
+ Args.reserve(CB.arg_size());
const Value *SwiftErrorVal = nullptr;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -7178,7 +7156,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
if (isTailCall) {
// Avoid emitting tail calls in functions with the disable-tail-calls
// attribute.
- auto *Caller = CS.getInstruction()->getParent()->getParent();
+ auto *Caller = CB.getParent()->getParent();
if (Caller->getFnAttribute("disable-tail-calls").getValueAsString() ==
"true")
isTailCall = false;
@@ -7191,10 +7169,9 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
isTailCall = false;
}
- for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
- i != e; ++i) {
+ for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I) {
TargetLowering::ArgListEntry Entry;
- const Value *V = *i;
+ const Value *V = *I;
// Skip empty types
if (V->getType()->isEmptyTy())
@@ -7203,16 +7180,16 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
SDValue ArgNode = getValue(V);
Entry.Node = ArgNode; Entry.Ty = V->getType();
- Entry.setAttributes(&CS, i - CS.arg_begin());
+ Entry.setAttributes(&CB, I - CB.arg_begin());
// Use swifterror virtual register as input to the call.
if (Entry.IsSwiftError && TLI.supportSwiftError()) {
SwiftErrorVal = V;
// We find the virtual register for the actual swifterror argument.
// Instead of using the Value, we use the virtual register instead.
- Entry.Node = DAG.getRegister(
- SwiftError.getOrCreateVRegUseAt(CS.getInstruction(), FuncInfo.MBB, V),
- EVT(TLI.getPointerTy(DL)));
+ Entry.Node =
+ DAG.getRegister(SwiftError.getOrCreateVRegUseAt(&CB, FuncInfo.MBB, V),
+ EVT(TLI.getPointerTy(DL)));
}
Args.push_back(Entry);
@@ -7225,7 +7202,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// If call site has a cfguardtarget operand bundle, create and add an
// additional ArgListEntry.
- if (auto Bundle = CS.getOperandBundle(LLVMContext::OB_cfguardtarget)) {
+ if (auto Bundle = CB.getOperandBundle(LLVMContext::OB_cfguardtarget)) {
TargetLowering::ArgListEntry Entry;
Value *V = Bundle->Inputs[0];
SDValue ArgNode = getValue(V);
@@ -7237,7 +7214,7 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
// Check if target-independent constraints permit a tail call here.
// Target-dependent constraints are checked within TLI->LowerCallTo.
- if (isTailCall && !isInTailCallPosition(CS, DAG.getTarget()))
+ if (isTailCall && !isInTailCallPosition(CB, DAG.getTarget()))
isTailCall = false;
// Disable tail calls if there is an swifterror argument. Targets have not
@@ -7248,15 +7225,16 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(getCurSDLoc())
.setChain(getRoot())
- .setCallee(RetTy, FTy, Callee, std::move(Args), CS)
+ .setCallee(RetTy, FTy, Callee, std::move(Args), CB)
.setTailCall(isTailCall)
- .setConvergent(CS.isConvergent());
+ .setConvergent(CB.isConvergent())
+ .setIsPreallocated(
+ CB.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
if (Result.first.getNode()) {
- const Instruction *Inst = CS.getInstruction();
- Result.first = lowerRangeToAssertZExt(DAG, *Inst, Result.first);
- setValue(Inst, Result.first);
+ Result.first = lowerRangeToAssertZExt(DAG, CB, Result.first);
+ setValue(&CB, Result.first);
}
// The last element of CLI.InVals has the SDValue for swifterror return.
@@ -7265,8 +7243,8 @@ void SelectionDAGBuilder::LowerCallTo(ImmutableCallSite CS, SDValue Callee,
if (SwiftErrorVal && TLI.supportSwiftError()) {
// Get the last element of InVals.
SDValue Src = CLI.InVals.back();
- Register VReg = SwiftError.getOrCreateVRegDefAt(
- CS.getInstruction(), FuncInfo.MBB, SwiftErrorVal);
+ Register VReg =
+ SwiftError.getOrCreateVRegDefAt(&CB, FuncInfo.MBB, SwiftErrorVal);
SDValue CopyNode = CLI.DAG.getCopyToReg(Result.second, CLI.DL, VReg, Src);
DAG.setRoot(CopyNode);
}
@@ -7281,7 +7259,7 @@ static SDValue getMemCmpLoad(const Value *PtrVal, MVT LoadVT,
Type *LoadTy =
Type::getIntNTy(PtrVal->getContext(), LoadVT.getScalarSizeInBits());
if (LoadVT.isVector())
- LoadTy = VectorType::get(LoadTy, LoadVT.getVectorNumElements());
+ LoadTy = FixedVectorType::get(LoadTy, LoadVT.getVectorNumElements());
LoadInput = ConstantExpr::getBitCast(const_cast<Constant *>(LoadInput),
PointerType::getUnqual(LoadTy));
@@ -7455,11 +7433,10 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
SDValue Src = getValue(I.getArgOperand(1));
SDValue Size = getValue(I.getArgOperand(2));
- unsigned DstAlign = DAG.InferPtrAlignment(Dst);
- unsigned SrcAlign = DAG.InferPtrAlignment(Src);
- unsigned Align = std::min(DstAlign, SrcAlign);
- if (Align == 0) // Alignment of one or both could not be inferred.
- Align = 1; // 0 and 1 both specify no alignment, but 0 is reserved.
+ Align DstAlign = DAG.InferPtrAlign(Dst).valueOrOne();
+ Align SrcAlign = DAG.InferPtrAlign(Src).valueOrOne();
+ // DAG::getMemcpy needs Alignment to be defined.
+ Align Alignment = std::min(DstAlign, SrcAlign);
bool isVol = false;
SDLoc sdl = getCurSDLoc();
@@ -7468,8 +7445,8 @@ bool SelectionDAGBuilder::visitMemPCpyCall(const CallInst &I) {
// because the return pointer needs to be adjusted by the size of
// the copied memory.
SDValue Root = isVol ? getRoot() : getMemoryRoot();
- SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Align, isVol,
- false, /*isTailCall=*/false,
+ SDValue MC = DAG.getMemcpy(Root, sdl, Dst, Src, Size, Alignment, isVol, false,
+ /*isTailCall=*/false,
MachinePointerInfo(I.getArgOperand(0)),
MachinePointerInfo(I.getArgOperand(1)));
assert(MC.getNode() != nullptr &&
@@ -7611,8 +7588,8 @@ bool SelectionDAGBuilder::visitBinaryFloatCall(const CallInst &I,
void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Handle inline assembly differently.
- if (isa<InlineAsm>(I.getCalledValue())) {
- visitInlineAsm(&I);
+ if (I.isInlineAsm()) {
+ visitInlineAsm(I);
return;
}
@@ -7778,12 +7755,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Deopt bundles are lowered in LowerCallSiteWithDeoptBundle, and we don't
// have to do anything here to lower funclet bundles.
// CFGuardTarget bundles are lowered in LowerCallTo.
- assert(!I.hasOperandBundlesOtherThan({LLVMContext::OB_deopt,
- LLVMContext::OB_funclet,
- LLVMContext::OB_cfguardtarget}) &&
+ assert(!I.hasOperandBundlesOtherThan(
+ {LLVMContext::OB_deopt, LLVMContext::OB_funclet,
+ LLVMContext::OB_cfguardtarget, LLVMContext::OB_preallocated}) &&
"Cannot lower calls with arbitrary operand bundles!");
- SDValue Callee = getValue(I.getCalledValue());
+ SDValue Callee = getValue(I.getCalledOperand());
if (I.countOperandBundlesOfType(LLVMContext::OB_deopt))
LowerCallSiteWithDeoptBundle(&I, Callee, nullptr);
@@ -7791,7 +7768,7 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) {
// Check if we can potentially perform a tail call. More detailed checking
// is done within LowerCallTo, after more information about the call is
// known.
- LowerCallTo(&I, Callee, I.isTailCall());
+ LowerCallTo(I, Callee, I.isTailCall());
}
namespace {
@@ -7834,7 +7811,7 @@ public:
if (!CallOperandVal) return MVT::Other;
if (isa<BasicBlock>(CallOperandVal))
- return TLI.getPointerTy(DL);
+ return TLI.getProgramPointerTy(DL);
llvm::Type *OpTy = CallOperandVal->getType();
@@ -7874,7 +7851,6 @@ public:
}
};
-using SDISelAsmOperandInfoVector = SmallVector<SDISelAsmOperandInfo, 16>;
} // end anonymous namespace
@@ -7936,9 +7912,9 @@ static SDValue getAddressForMemoryInput(SDValue Chain, const SDLoc &Location,
Type *Ty = OpVal->getType();
auto &DL = DAG.getDataLayout();
uint64_t TySize = DL.getTypeAllocSize(Ty);
- unsigned Align = DL.getPrefTypeAlignment(Ty);
MachineFunction &MF = DAG.getMachineFunction();
- int SSFI = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(
+ TySize, DL.getPrefTypeAlign(Ty), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, TLI.getFrameIndexTy(DL));
Chain = DAG.getTruncStore(Chain, Location, OpInfo.CallOperand, StackSlot,
MachinePointerInfo::getFixedStack(MF, SSFI),
@@ -8083,13 +8059,13 @@ class ExtraFlags {
unsigned Flags = 0;
public:
- explicit ExtraFlags(ImmutableCallSite CS) {
- const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+ explicit ExtraFlags(const CallBase &Call) {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
if (IA->hasSideEffects())
Flags |= InlineAsm::Extra_HasSideEffects;
if (IA->isAlignStack())
Flags |= InlineAsm::Extra_IsAlignStack;
- if (CS.isConvergent())
+ if (Call.isConvergent())
Flags |= InlineAsm::Extra_IsConvergent;
Flags |= IA->getDialect() * InlineAsm::Extra_AsmDialect;
}
@@ -8116,23 +8092,24 @@ public:
} // end anonymous namespace
/// visitInlineAsm - Handle a call to an InlineAsm object.
-void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
- const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+void SelectionDAGBuilder::visitInlineAsm(const CallBase &Call) {
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
/// ConstraintOperands - Information about all of the constraints.
- SDISelAsmOperandInfoVector ConstraintOperands;
+ SmallVector<SDISelAsmOperandInfo, 16> ConstraintOperands;
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
TargetLowering::AsmOperandInfoVector TargetConstraints = TLI.ParseConstraints(
- DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), CS);
+ DAG.getDataLayout(), DAG.getSubtarget().getRegisterInfo(), Call);
// First Pass: Calculate HasSideEffects and ExtraFlags (AlignStack,
// AsmDialect, MayLoad, MayStore).
bool HasSideEffect = IA->hasSideEffects();
- ExtraFlags ExtraInfo(CS);
+ ExtraFlags ExtraInfo(Call);
unsigned ArgNo = 0; // ArgNo - The argument of the CallInst.
unsigned ResNo = 0; // ResNo - The result number of the next output.
+ unsigned NumMatchingOps = 0;
for (auto &T : TargetConstraints) {
ConstraintOperands.push_back(SDISelAsmOperandInfo(T));
SDISelAsmOperandInfo &OpInfo = ConstraintOperands.back();
@@ -8140,14 +8117,17 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Compute the value type for each operand.
if (OpInfo.Type == InlineAsm::isInput ||
(OpInfo.Type == InlineAsm::isOutput && OpInfo.isIndirect)) {
- OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
// Process the call argument. BasicBlocks are labels, currently appearing
// only in asm's.
- const Instruction *I = CS.getInstruction();
- if (isa<CallBrInst>(I) &&
- (ArgNo - 1) >= (cast<CallBrInst>(I)->getNumArgOperands() -
- cast<CallBrInst>(I)->getNumIndirectDests())) {
+ if (isa<CallBrInst>(Call) &&
+ ArgNo - 1 >= (cast<CallBrInst>(&Call)->getNumArgOperands() -
+ cast<CallBrInst>(&Call)->getNumIndirectDests() -
+ NumMatchingOps) &&
+ (NumMatchingOps == 0 ||
+ ArgNo - 1 < (cast<CallBrInst>(&Call)->getNumArgOperands() -
+ NumMatchingOps))) {
const auto *BA = cast<BlockAddress>(OpInfo.CallOperandVal);
EVT VT = TLI.getValueType(DAG.getDataLayout(), BA->getType(), true);
OpInfo.CallOperand = DAG.getTargetBlockAddress(BA, VT);
@@ -8164,20 +8144,23 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
} else if (OpInfo.Type == InlineAsm::isOutput && !OpInfo.isIndirect) {
// The return value of the call is this value. As such, there is no
// corresponding argument.
- assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
- if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(Call.getType())) {
OpInfo.ConstraintVT = TLI.getSimpleValueType(
DAG.getDataLayout(), STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
OpInfo.ConstraintVT =
- TLI.getSimpleValueType(DAG.getDataLayout(), CS.getType());
+ TLI.getSimpleValueType(DAG.getDataLayout(), Call.getType());
}
++ResNo;
} else {
OpInfo.ConstraintVT = MVT::Other;
}
+ if (OpInfo.hasMatchingInput())
+ ++NumMatchingOps;
+
if (!HasSideEffect)
HasSideEffect = OpInfo.hasMemory(TLI);
@@ -8191,9 +8174,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
OpInfo.CallOperand && !isa<ConstantSDNode>(OpInfo.CallOperand))
// We've delayed emitting a diagnostic like the "n" constraint because
// inlining could cause an integer to show up.
- return emitInlineAsmError(
- CS, "constraint '" + Twine(T.ConstraintCode) + "' expects an "
- "integer constant expression");
+ return emitInlineAsmError(Call, "constraint '" + Twine(T.ConstraintCode) +
+ "' expects an integer constant "
+ "expression");
ExtraInfo.update(T);
}
@@ -8203,7 +8186,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// memory and is nonvolatile.
SDValue Flag, Chain = (HasSideEffect) ? getRoot() : DAG.getRoot();
- bool IsCallBr = isa<CallBrInst>(CS.getInstruction());
+ bool IsCallBr = isa<CallBrInst>(Call);
if (IsCallBr) {
// If this is a callbr we need to flush pending exports since inlineasm_br
// is a terminator. We need to do this before nodes are glued to
@@ -8253,12 +8236,12 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
std::vector<SDValue> AsmNodeOperands;
AsmNodeOperands.push_back(SDValue()); // reserve space for input chain
AsmNodeOperands.push_back(DAG.getTargetExternalSymbol(
- IA->getAsmString().c_str(), TLI.getPointerTy(DAG.getDataLayout())));
+ IA->getAsmString().c_str(), TLI.getProgramPointerTy(DAG.getDataLayout())));
// If we have a !srcloc metadata node associated with it, we want to attach
// this to the ultimately generated inline asm machineinstr. To do this, we
// pass in the third operand as this (potentially null) inline asm MDNode.
- const MDNode *SrcLoc = CS.getInstruction()->getMetadata("srcloc");
+ const MDNode *SrcLoc = Call.getMetadata("srcloc");
AsmNodeOperands.push_back(DAG.getMDNode(SrcLoc));
// Remember the HasSideEffect, AlignStack, AsmDialect, MayLoad and MayStore
@@ -8276,6 +8259,21 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
: OpInfo;
GetRegistersForValue(DAG, getCurSDLoc(), OpInfo, RefOpInfo);
+ auto DetectWriteToReservedRegister = [&]() {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
+ for (unsigned Reg : OpInfo.AssignedRegs.Regs) {
+ if (Register::isPhysicalRegister(Reg) &&
+ TRI.isInlineAsmReadOnlyReg(MF, Reg)) {
+ const char *RegName = TRI.getName(Reg);
+ emitInlineAsmError(Call, "write to reserved register '" +
+ Twine(RegName) + "'");
+ return true;
+ }
+ }
+ return false;
+ };
+
switch (OpInfo.Type) {
case InlineAsm::isOutput:
if (OpInfo.ConstraintType == TargetLowering::C_Memory) {
@@ -8296,11 +8294,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// C_Immediate/C_Other). Find a register that we can use.
if (OpInfo.AssignedRegs.Regs.empty()) {
emitInlineAsmError(
- CS, "couldn't allocate output register for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ Call, "couldn't allocate output register for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
+ if (DetectWriteToReservedRegister())
+ return;
+
// Add information to the INLINEASM node to know that this register is
// set.
OpInfo.AssignedRegs.AddInlineAsmOperands(
@@ -8325,9 +8326,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// Add (OpFlag&0xffff)>>3 registers to MatchedRegs.
if (OpInfo.isIndirect) {
// This happens on gcc/testsuite/gcc.dg/pr8788-1.c
- emitInlineAsmError(CS, "inline asm not supported yet:"
- " don't know how to handle tied "
- "indirect register inputs");
+ emitInlineAsmError(Call, "inline asm not supported yet: "
+ "don't know how to handle tied "
+ "indirect register inputs");
return;
}
@@ -8341,8 +8342,9 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
for (unsigned i = 0; i != NumRegs; ++i)
Regs.push_back(RegInfo.createVirtualRegister(RC));
} else {
- emitInlineAsmError(CS, "inline asm error: This value type register "
- "class is not natively supported!");
+ emitInlineAsmError(Call,
+ "inline asm error: This value type register "
+ "class is not natively supported!");
return;
}
@@ -8350,8 +8352,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
SDLoc dl = getCurSDLoc();
// Use the produced MatchedRegs object to copy the input value into the
// matched registers.
- MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
- CS.getInstruction());
+ MatchedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag, &Call);
MatchedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse,
true, OpInfo.getMatchedOperand(), dl,
DAG, AsmNodeOperands);
@@ -8385,13 +8386,14 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
if (Ops.empty()) {
if (OpInfo.ConstraintType == TargetLowering::C_Immediate)
if (isa<ConstantSDNode>(InOperandVal)) {
- emitInlineAsmError(CS, "value out of range for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(Call, "value out of range for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
- emitInlineAsmError(CS, "invalid operand for inline asm constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(Call,
+ "invalid operand for inline asm constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
@@ -8432,23 +8434,27 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
// TODO: Support this.
if (OpInfo.isIndirect) {
emitInlineAsmError(
- CS, "Don't know how to handle indirect register inputs yet "
- "for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ Call, "Don't know how to handle indirect register inputs yet "
+ "for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
// Copy the input into the appropriate registers.
if (OpInfo.AssignedRegs.Regs.empty()) {
- emitInlineAsmError(CS, "couldn't allocate input reg for constraint '" +
- Twine(OpInfo.ConstraintCode) + "'");
+ emitInlineAsmError(Call,
+ "couldn't allocate input reg for constraint '" +
+ Twine(OpInfo.ConstraintCode) + "'");
return;
}
+ if (DetectWriteToReservedRegister())
+ return;
+
SDLoc dl = getCurSDLoc();
- OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl,
- Chain, &Flag, CS.getInstruction());
+ OpInfo.AssignedRegs.getCopyToRegs(InOperandVal, DAG, dl, Chain, &Flag,
+ &Call);
OpInfo.AssignedRegs.AddInlineAsmOperands(InlineAsm::Kind_RegUse, false, 0,
dl, DAG, AsmNodeOperands);
@@ -8480,12 +8486,12 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
SmallVector<SDValue, 1> ResultValues;
SmallVector<SDValue, 8> OutChains;
- llvm::Type *CSResultType = CS.getType();
+ llvm::Type *CallResultType = Call.getType();
ArrayRef<Type *> ResultTypes;
- if (StructType *StructResult = dyn_cast<StructType>(CSResultType))
+ if (StructType *StructResult = dyn_cast<StructType>(CallResultType))
ResultTypes = StructResult->elements();
- else if (!CSResultType->isVoidTy())
- ResultTypes = makeArrayRef(CSResultType);
+ else if (!CallResultType->isVoidTy())
+ ResultTypes = makeArrayRef(CallResultType);
auto CurResultType = ResultTypes.begin();
auto handleRegAssign = [&](SDValue V) {
@@ -8529,8 +8535,8 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
switch (OpInfo.ConstraintType) {
case TargetLowering::C_Register:
case TargetLowering::C_RegisterClass:
- Val = OpInfo.AssignedRegs.getCopyFromRegs(
- DAG, FuncInfo, getCurSDLoc(), Chain, &Flag, CS.getInstruction());
+ Val = OpInfo.AssignedRegs.getCopyFromRegs(DAG, FuncInfo, getCurSDLoc(),
+ Chain, &Flag, &Call);
break;
case TargetLowering::C_Immediate:
case TargetLowering::C_Other:
@@ -8552,7 +8558,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
OutChains.push_back(Store);
} else {
// generate CopyFromRegs to associated registers.
- assert(!CS.getType()->isVoidTy() && "Bad inline asm!");
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
if (Val.getOpcode() == ISD::MERGE_VALUES) {
for (const SDValue &V : Val->op_values())
handleRegAssign(V);
@@ -8571,7 +8577,7 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
SDValue V = DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
DAG.getVTList(ResultVTs), ResultValues);
- setValue(CS.getInstruction(), V);
+ setValue(&Call, V);
}
// Collect store chains.
@@ -8583,15 +8589,15 @@ void SelectionDAGBuilder::visitInlineAsm(ImmutableCallSite CS) {
DAG.setRoot(Chain);
}
-void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
+void SelectionDAGBuilder::emitInlineAsmError(const CallBase &Call,
const Twine &Message) {
LLVMContext &Ctx = *DAG.getContext();
- Ctx.emitError(CS.getInstruction(), Message);
+ Ctx.emitError(&Call, Message);
// Make sure we leave the DAG in a valid state
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 1> ValueVTs;
- ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), Call.getType(), ValueVTs);
if (ValueVTs.empty())
return;
@@ -8600,7 +8606,7 @@ void SelectionDAGBuilder::emitInlineAsmError(ImmutableCallSite CS,
for (unsigned i = 0, e = ValueVTs.size(); i != e; ++i)
Ops.push_back(DAG.getUNDEF(ValueVTs[i]));
- setValue(CS.getInstruction(), DAG.getMergeValues(Ops, getCurSDLoc()));
+ setValue(&Call, DAG.getMergeValues(Ops, getCurSDLoc()));
}
void SelectionDAGBuilder::visitVAStart(const CallInst &I) {
@@ -8616,7 +8622,7 @@ void SelectionDAGBuilder::visitVAArg(const VAArgInst &I) {
SDValue V = DAG.getVAArg(
TLI.getMemValueType(DAG.getDataLayout(), I.getType()), getCurSDLoc(),
getRoot(), getValue(I.getOperand(0)), DAG.getSrcValue(I.getOperand(0)),
- DL.getABITypeAlignment(I.getType()));
+ DL.getABITypeAlign(I.getType()).value());
DAG.setRoot(V.getValue(1));
if (I.getType()->isPointerTy())
@@ -8711,7 +8717,9 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
.setChain(getRoot())
.setCallee(Call->getCallingConv(), ReturnTy, Callee, std::move(Args))
.setDiscardResult(Call->use_empty())
- .setIsPatchPoint(IsPatchPoint);
+ .setIsPatchPoint(IsPatchPoint)
+ .setIsPreallocated(
+ Call->countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0);
}
/// Add a stack map intrinsic call's live variable operands to a stackmap
@@ -8731,11 +8739,11 @@ void SelectionDAGBuilder::populateCallLoweringInfo(
/// assumption made by the llvm.gcroot intrinsic). If the alloca's location were
/// only available in a register, then the runtime would need to trap when
/// execution reaches the StackMap in order to read the alloca's location.
-static void addStackMapLiveVars(ImmutableCallSite CS, unsigned StartIdx,
+static void addStackMapLiveVars(const CallBase &Call, unsigned StartIdx,
const SDLoc &DL, SmallVectorImpl<SDValue> &Ops,
SelectionDAGBuilder &Builder) {
- for (unsigned i = StartIdx, e = CS.arg_size(); i != e; ++i) {
- SDValue OpVal = Builder.getValue(CS.getArgument(i));
+ for (unsigned i = StartIdx, e = Call.arg_size(); i != e; ++i) {
+ SDValue OpVal = Builder.getValue(Call.getArgOperand(i));
if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(OpVal)) {
Ops.push_back(
Builder.DAG.getTargetConstant(StackMaps::ConstantOp, DL, MVT::i64));
@@ -8761,7 +8769,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
SmallVector<SDValue, 32> Ops;
SDLoc DL = getCurSDLoc();
- Callee = getValue(CI.getCalledValue());
+ Callee = getValue(CI.getCalledOperand());
NullPtr = DAG.getIntPtrConstant(0, DL, true);
// The stackmap intrinsic only records the live variables (the arguments
@@ -8787,7 +8795,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
MVT::i32));
// Push live variables for the stack map.
- addStackMapLiveVars(&CI, 2, DL, Ops, *this);
+ addStackMapLiveVars(CI, 2, DL, Ops, *this);
// We are not pushing any register mask info here on the operands list,
// because the stackmap doesn't clobber anything.
@@ -8814,7 +8822,7 @@ void SelectionDAGBuilder::visitStackmap(const CallInst &CI) {
}
/// Lower llvm.experimental.patchpoint directly to its target opcode.
-void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
+void SelectionDAGBuilder::visitPatchpoint(const CallBase &CB,
const BasicBlock *EHPadBB) {
// void|i64 @llvm.experimental.patchpoint.void|i64(i64 <id>,
// i32 <numBytes>,
@@ -8823,11 +8831,11 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
// [Args...],
// [live variables...])
- CallingConv::ID CC = CS.getCallingConv();
+ CallingConv::ID CC = CB.getCallingConv();
bool IsAnyRegCC = CC == CallingConv::AnyReg;
- bool HasDef = !CS->getType()->isVoidTy();
+ bool HasDef = !CB.getType()->isVoidTy();
SDLoc dl = getCurSDLoc();
- SDValue Callee = getValue(CS->getOperand(PatchPointOpers::TargetPos));
+ SDValue Callee = getValue(CB.getArgOperand(PatchPointOpers::TargetPos));
// Handle immediate and symbolic callees.
if (auto* ConstCallee = dyn_cast<ConstantSDNode>(Callee))
@@ -8839,23 +8847,23 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
SymbolicCallee->getValueType(0));
// Get the real number of arguments participating in the call <numArgs>
- SDValue NArgVal = getValue(CS.getArgument(PatchPointOpers::NArgPos));
+ SDValue NArgVal = getValue(CB.getArgOperand(PatchPointOpers::NArgPos));
unsigned NumArgs = cast<ConstantSDNode>(NArgVal)->getZExtValue();
// Skip the four meta args: <id>, <numNopBytes>, <target>, <numArgs>
// Intrinsics include all meta-operands up to but not including CC.
unsigned NumMetaOpers = PatchPointOpers::CCPos;
- assert(CS.arg_size() >= NumMetaOpers + NumArgs &&
+ assert(CB.arg_size() >= NumMetaOpers + NumArgs &&
"Not enough arguments provided to the patchpoint intrinsic");
// For AnyRegCC the arguments are lowered later on manually.
unsigned NumCallArgs = IsAnyRegCC ? 0 : NumArgs;
Type *ReturnTy =
- IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CS->getType();
+ IsAnyRegCC ? Type::getVoidTy(*DAG.getContext()) : CB.getType();
TargetLowering::CallLoweringInfo CLI(DAG);
- populateCallLoweringInfo(CLI, cast<CallBase>(CS.getInstruction()),
- NumMetaOpers, NumCallArgs, Callee, ReturnTy, true);
+ populateCallLoweringInfo(CLI, &CB, NumMetaOpers, NumCallArgs, Callee,
+ ReturnTy, true);
std::pair<SDValue, SDValue> Result = lowerInvokable(CLI, EHPadBB);
SDNode *CallEnd = Result.second.getNode();
@@ -8873,10 +8881,10 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
SmallVector<SDValue, 8> Ops;
// Add the <id> and <numBytes> constants.
- SDValue IDVal = getValue(CS->getOperand(PatchPointOpers::IDPos));
+ SDValue IDVal = getValue(CB.getArgOperand(PatchPointOpers::IDPos));
Ops.push_back(DAG.getTargetConstant(
cast<ConstantSDNode>(IDVal)->getZExtValue(), dl, MVT::i64));
- SDValue NBytesVal = getValue(CS->getOperand(PatchPointOpers::NBytesPos));
+ SDValue NBytesVal = getValue(CB.getArgOperand(PatchPointOpers::NBytesPos));
Ops.push_back(DAG.getTargetConstant(
cast<ConstantSDNode>(NBytesVal)->getZExtValue(), dl,
MVT::i32));
@@ -8898,14 +8906,14 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
// place these in any free register.
if (IsAnyRegCC)
for (unsigned i = NumMetaOpers, e = NumMetaOpers + NumArgs; i != e; ++i)
- Ops.push_back(getValue(CS.getArgument(i)));
+ Ops.push_back(getValue(CB.getArgOperand(i)));
// Push the arguments from the call instruction up to the register mask.
SDNode::op_iterator e = HasGlue ? Call->op_end()-2 : Call->op_end()-1;
Ops.append(Call->op_begin() + 2, e);
// Push live variables for the stack map.
- addStackMapLiveVars(CS, NumMetaOpers + NumArgs, dl, Ops, *this);
+ addStackMapLiveVars(CB, NumMetaOpers + NumArgs, dl, Ops, *this);
// Push the register mask info.
if (HasGlue)
@@ -8926,7 +8934,7 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
// Create the return types based on the intrinsic definition
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SmallVector<EVT, 3> ValueVTs;
- ComputeValueVTs(TLI, DAG.getDataLayout(), CS->getType(), ValueVTs);
+ ComputeValueVTs(TLI, DAG.getDataLayout(), CB.getType(), ValueVTs);
assert(ValueVTs.size() == 1 && "Expected only one return value type.");
// There is always a chain and a glue type at the end
@@ -8943,9 +8951,9 @@ void SelectionDAGBuilder::visitPatchpoint(ImmutableCallSite CS,
// Update the NodeMap.
if (HasDef) {
if (IsAnyRegCC)
- setValue(CS.getInstruction(), SDValue(MN, 0));
+ setValue(&CB, SDValue(MN, 0));
else
- setValue(CS.getInstruction(), Result.first);
+ setValue(&CB, Result.first);
}
// Fixup the consumers of the intrinsic. The chain and glue may be used in the
@@ -9094,9 +9102,10 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// assert(!CS.hasInAllocaArgument() &&
// "sret demotion is incompatible with inalloca");
uint64_t TySize = DL.getTypeAllocSize(CLI.RetTy);
- unsigned Align = DL.getPrefTypeAlignment(CLI.RetTy);
+ Align Alignment = DL.getPrefTypeAlign(CLI.RetTy);
MachineFunction &MF = CLI.DAG.getMachineFunction();
- DemoteStackIdx = MF.getFrameInfo().CreateStackObject(TySize, Align, false);
+ DemoteStackIdx =
+ MF.getFrameInfo().CreateStackObject(TySize, Alignment, false);
Type *StackSlotPtrType = PointerType::get(CLI.RetTy,
DL.getAllocaAddrSpace());
@@ -9114,7 +9123,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Entry.IsSwiftSelf = false;
Entry.IsSwiftError = false;
Entry.IsCFGuardTarget = false;
- Entry.Alignment = Align;
+ Entry.Alignment = Alignment;
CLI.getArgs().insert(CLI.getArgs().begin(), Entry);
CLI.NumFixedArgs += 1;
CLI.RetTy = Type::getVoidTy(CLI.RetTy->getContext());
@@ -9230,6 +9239,15 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Flags.setCFGuardTarget();
if (Args[i].IsByVal)
Flags.setByVal();
+ if (Args[i].IsPreallocated) {
+ Flags.setPreallocated();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // preallocated. This way we can know how many bytes we should've
+ // allocated and how many bytes a callee cleanup function will pop. If
+ // we port preallocated to more targets, we'll have to add custom
+ // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
if (Args[i].IsInAlloca) {
Flags.setInAlloca();
// Set the byval flag for CCAssignFn callbacks that don't know about
@@ -9239,7 +9257,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
// in the various CC lowering callbacks.
Flags.setByVal();
}
- if (Args[i].IsByVal || Args[i].IsInAlloca) {
+ if (Args[i].IsByVal || Args[i].IsInAlloca || Args[i].IsPreallocated) {
PointerType *Ty = cast<PointerType>(Args[i].Ty);
Type *ElementTy = Ty->getElementType();
@@ -9248,12 +9266,12 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Flags.setByValSize(FrameSize);
// info is not there but there are cases it cannot get right.
- unsigned FrameAlign;
- if (Args[i].Alignment)
- FrameAlign = Args[i].Alignment;
+ Align FrameAlign;
+ if (auto MA = Args[i].Alignment)
+ FrameAlign = *MA;
else
- FrameAlign = getByValTypeAlignment(ElementTy, DL);
- Flags.setByValAlign(Align(FrameAlign));
+ FrameAlign = Align(getByValTypeAlignment(ElementTy, DL));
+ Flags.setByValAlign(FrameAlign);
}
if (Args[i].IsNest)
Flags.setNest();
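The FrameAlign logic in the hunk above now carries an llvm::MaybeAlign: use the explicit per-argument alignment when set, otherwise fall back to the target's byval type alignment. A minimal sketch of that resolution pattern, assuming only that llvm/Support/Alignment.h is available (illustrative, not part of the patch):

#include "llvm/Support/Alignment.h"

// Pick the explicitly requested alignment when present; otherwise use the
// caller-supplied default (e.g. the target's byval type alignment).
static llvm::Align resolveFrameAlign(llvm::MaybeAlign Requested,
                                     llvm::Align Default) {
  return Requested ? *Requested : Default;
}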
@@ -9298,8 +9316,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
Flags.setReturned();
}
- getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT,
- CLI.CS.getInstruction(), CLI.CallConv, ExtendKind);
+ getCopyToParts(CLI.DAG, CLI.DL, Op, &Parts[0], NumParts, PartVT, CLI.CB,
+ CLI.CallConv, ExtendKind);
for (unsigned j = 0; j != NumParts; ++j) {
// if it isn't first piece, alignment must be 1
@@ -9311,7 +9329,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
if (NumParts > 1 && j == 0)
MyFlags.Flags.setSplit();
else if (j != 0) {
- MyFlags.Flags.setOrigAlign(Align::None());
+ MyFlags.Flags.setOrigAlign(Align(1));
if (j == NumParts - 1)
MyFlags.Flags.setSplitEnd();
}
@@ -9376,6 +9394,8 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
SDNodeFlags Flags;
Flags.setNoUnsignedWrap(true);
+ MachineFunction &MF = CLI.DAG.getMachineFunction();
+ Align HiddenSRetAlign = MF.getFrameInfo().getObjectAlign(DemoteStackIdx);
for (unsigned i = 0; i < NumValues; ++i) {
SDValue Add = CLI.DAG.getNode(ISD::ADD, CLI.DL, PtrVT, DemoteStackSlot,
CLI.DAG.getConstant(Offsets[i], CLI.DL,
@@ -9384,7 +9404,7 @@ TargetLowering::LowerCallTo(TargetLowering::CallLoweringInfo &CLI) const {
RetTys[i], CLI.DL, CLI.Chain, Add,
MachinePointerInfo::getFixedStack(CLI.DAG.getMachineFunction(),
DemoteStackIdx, Offsets[i]),
- /* Alignment = */ 1);
+ HiddenSRetAlign);
ReturnValues[i] = L;
Chains[i] = L.getValue(1);
}
@@ -9551,7 +9571,7 @@ findArgumentCopyElisionCandidates(const DataLayout &DL,
// initializes the alloca. Don't elide copies from the same argument twice.
const Value *Val = SI->getValueOperand()->stripPointerCasts();
const auto *Arg = dyn_cast<Argument>(Val);
- if (!Arg || Arg->hasInAllocaAttr() || Arg->hasByValAttr() ||
+ if (!Arg || Arg->hasPassPointeeByValueAttr() ||
Arg->getType()->isEmptyTy() ||
DL.getTypeStoreSize(Arg->getType()) !=
DL.getTypeAllocSize(AI->getAllocatedType()) ||
@@ -9607,16 +9627,12 @@ static void tryToElideArgumentCopy(
"object size\n");
return;
}
- unsigned RequiredAlignment = AI->getAlignment();
- if (!RequiredAlignment) {
- RequiredAlignment = FuncInfo.MF->getDataLayout().getABITypeAlignment(
- AI->getAllocatedType());
- }
- if (MFI.getObjectAlignment(FixedIndex) < RequiredAlignment) {
+ Align RequiredAlignment = AI->getAlign();
+ if (MFI.getObjectAlign(FixedIndex) < RequiredAlignment) {
LLVM_DEBUG(dbgs() << " argument copy elision failed: alignment of alloca "
"greater than stack argument alignment ("
- << RequiredAlignment << " vs "
- << MFI.getObjectAlignment(FixedIndex) << ")\n");
+ << DebugStr(RequiredAlignment) << " vs "
+ << DebugStr(MFI.getObjectAlign(FixedIndex)) << ")\n");
return;
}
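The elision check above now compares two llvm::Align values directly; Align defines ordering on the underlying byte count, so the test reads the same as the old unsigned comparison. A small self-contained sketch of the same predicate (assumption: only that Alignment.h is on the include path; not part of the patch):

#include "llvm/Support/Alignment.h"

// A fixed stack slot can only back the alloca when it is at least as
// strictly aligned as the alloca requires.
static bool stackSlotSatisfiesAlloca(llvm::Align SlotAlign,
                                     llvm::Align RequiredAlign) {
  return SlotAlign >= RequiredAlign;
}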
@@ -9653,6 +9669,10 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
const DataLayout &DL = DAG.getDataLayout();
SmallVector<ISD::InputArg, 16> Ins;
+ // In Naked functions we aren't going to save any registers.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return;
+
if (!FuncInfo->CanLowerReturn) {
// Put in an sret pointer parameter before all the other parameters.
SmallVector<EVT, 1> ValueVTs;
@@ -9741,12 +9761,21 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
// in the various CC lowering callbacks.
Flags.setByVal();
}
+ if (Arg.hasAttribute(Attribute::Preallocated)) {
+ Flags.setPreallocated();
+ // Set the byval flag for CCAssignFn callbacks that don't know about
+ // preallocated. This way we can know how many bytes we should've
+ // allocated and how many bytes a callee cleanup function will pop. If
+ // we port preallocated to more targets, we'll have to add custom
+ // preallocated handling in the various CC lowering callbacks.
+ Flags.setByVal();
+ }
if (F.getCallingConv() == CallingConv::X86_INTR) {
// IA Interrupt passes frame (1st parameter) by value in the stack.
if (ArgNo == 0)
Flags.setByVal();
}
- if (Flags.isByVal() || Flags.isInAlloca()) {
+ if (Flags.isByVal() || Flags.isInAlloca() || Flags.isPreallocated()) {
Type *ElementTy = Arg.getParamByValType();
// For ByVal, size and alignment should be passed from FE. BE will
@@ -9786,7 +9815,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
MyFlags.Flags.setSplit();
// if it isn't first piece, alignment must be 1
else if (i > 0) {
- MyFlags.Flags.setOrigAlign(Align::None());
+ MyFlags.Flags.setOrigAlign(Align(1));
if (i == NumRegs - 1)
MyFlags.Flags.setSplitEnd();
}
@@ -9988,7 +10017,7 @@ void SelectionDAGISel::LowerArguments(const Function &F) {
}
// Finally, if the target has anything special to do, allow it to do so.
- EmitFunctionEntryCode();
+ emitFunctionEntryCode();
}
/// Handle PHI nodes in successor blocks. Emit code into the SelectionDAG to
@@ -10040,7 +10069,7 @@ SelectionDAGBuilder::HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB) {
}
Reg = RegOut;
} else {
- DenseMap<const Value *, unsigned>::iterator I =
+ DenseMap<const Value *, Register>::iterator I =
FuncInfo.ValueMap.find(PHIOp);
if (I != FuncInfo.ValueMap.end())
Reg = I->second;
@@ -10654,6 +10683,19 @@ void SelectionDAGBuilder::visitSwitch(const SwitchInst &SI) {
}
void SelectionDAGBuilder::visitFreeze(const FreezeInst &I) {
- SDValue N = getValue(I.getOperand(0));
- setValue(&I, N);
+ SmallVector<EVT, 4> ValueVTs;
+ ComputeValueVTs(DAG.getTargetLoweringInfo(), DAG.getDataLayout(), I.getType(),
+ ValueVTs);
+ unsigned NumValues = ValueVTs.size();
+ if (NumValues == 0) return;
+
+ SmallVector<SDValue, 4> Values(NumValues);
+ SDValue Op = getValue(I.getOperand(0));
+
+ for (unsigned i = 0; i != NumValues; ++i)
+ Values[i] = DAG.getNode(ISD::FREEZE, getCurSDLoc(), ValueVTs[i],
+ SDValue(Op.getNode(), Op.getResNo() + i));
+
+ setValue(&I, DAG.getNode(ISD::MERGE_VALUES, getCurSDLoc(),
+ DAG.getVTList(ValueVTs), Values));
}
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
index 18e0edf7fc04..f0b7fb0d5229 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.h
@@ -14,19 +14,16 @@
#define LLVM_LIB_CODEGEN_SELECTIONDAG_SELECTIONDAGBUILDER_H
#include "StatepointLowering.h"
-#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
#include "llvm/CodeGen/SwitchLoweringUtils.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Statepoint.h"
@@ -55,7 +52,6 @@ class CatchSwitchInst;
class CleanupPadInst;
class CleanupReturnInst;
class Constant;
-class ConstantInt;
class ConstrainedFPIntrinsic;
class DbgValueInst;
class DataLayout;
@@ -77,6 +73,7 @@ class PHINode;
class ResumeInst;
class ReturnInst;
class SDDbgValue;
+class SelectionDAG;
class StoreInst;
class SwiftErrorValueTracking;
class SwitchInst;
@@ -409,6 +406,8 @@ public:
SelectionDAGBuilder *SDB;
};
+ // Data related to deferred switch lowerings. Used to construct additional
+ // Basic Blocks in SelectionDAGISel::FinishBasicBlock.
std::unique_ptr<SDAGSwitchLowering> SL;
/// A StackProtectorDescriptor structure used to communicate stack protector
@@ -518,7 +517,6 @@ public:
void resolveOrClearDbgInfo();
SDValue getValue(const Value *V);
- bool findValue(const Value *V) const;
/// Return the SDNode for the specified IR value if it exists.
SDNode *getNodeForIRValue(const Value *V) {
@@ -557,7 +555,7 @@ public:
bool isExportableFromCurrentBlock(const Value *V, const BasicBlock *FromBB);
void CopyToExportRegsIfNeeded(const Value *V);
void ExportFromCurrentBlock(const Value *V);
- void LowerCallTo(ImmutableCallSite CS, SDValue Callee, bool IsTailCall,
+ void LowerCallTo(const CallBase &CB, SDValue Callee, bool IsTailCall,
const BasicBlock *EHPadBB = nullptr);
// Lower range metadata from 0 to N to assert zext to an integer of nearest
@@ -627,7 +625,7 @@ public:
// This function is responsible for the whole statepoint lowering process.
// It uniformly handles invoke and call statepoints.
- void LowerStatepoint(ImmutableStatepoint ISP,
+ void LowerStatepoint(const GCStatepointInst &I,
const BasicBlock *EHPadBB = nullptr);
void LowerCallSiteWithDeoptBundle(const CallBase *Call, SDValue Callee,
@@ -764,7 +762,7 @@ private:
void visitStoreToSwiftError(const StoreInst &I);
void visitFreeze(const FreezeInst &I);
- void visitInlineAsm(ImmutableCallSite CS);
+ void visitInlineAsm(const CallBase &Call);
void visitIntrinsicCall(const CallInst &I, unsigned Intrinsic);
void visitTargetIntrinsic(const CallInst &I, unsigned Intrinsic);
void visitConstrainedFPIntrinsic(const ConstrainedFPIntrinsic &FPI);
@@ -774,8 +772,7 @@ private:
void visitVAEnd(const CallInst &I);
void visitVACopy(const CallInst &I);
void visitStackmap(const CallInst &I);
- void visitPatchpoint(ImmutableCallSite CS,
- const BasicBlock *EHPadBB = nullptr);
+ void visitPatchpoint(const CallBase &CB, const BasicBlock *EHPadBB = nullptr);
// These two are implemented in StatepointLowering.cpp
void visitGCRelocate(const GCRelocateInst &Relocate);
@@ -795,7 +792,7 @@ private:
void HandlePHINodesInSuccessorBlocks(const BasicBlock *LLVMBB);
- void emitInlineAsmError(ImmutableCallSite CS, const Twine &Message);
+ void emitInlineAsmError(const CallBase &Call, const Twine &Message);
/// If V is an function argument then create corresponding DBG_VALUE machine
/// instruction for it now. At the end of instruction selection, they will be
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
index 6fd71393bf38..42e3016e65b8 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp
@@ -65,7 +65,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
if (G)
if (const TargetInstrInfo *TII = G->getSubtarget().getInstrInfo())
if (getMachineOpcode() < TII->getNumOpcodes())
- return TII->getName(getMachineOpcode());
+ return std::string(TII->getName(getMachineOpcode()));
return "<<Unknown Machine Node #" + utostr(getOpcode()) + ">>";
}
if (G) {
@@ -106,6 +106,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::TokenFactor: return "TokenFactor";
case ISD::AssertSext: return "AssertSext";
case ISD::AssertZext: return "AssertZext";
+ case ISD::AssertAlign: return "AssertAlign";
case ISD::BasicBlock: return "BasicBlock";
case ISD::VALUETYPE: return "ValueType";
@@ -170,6 +171,7 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::CopyToReg: return "CopyToReg";
case ISD::CopyFromReg: return "CopyFromReg";
case ISD::UNDEF: return "undef";
+ case ISD::VSCALE: return "vscale";
case ISD::MERGE_VALUES: return "merge_values";
case ISD::INLINEASM: return "inlineasm";
case ISD::INLINEASM_BR: return "inlineasm_br";
@@ -210,6 +212,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::STRICT_FNEARBYINT: return "strict_fnearbyint";
case ISD::FROUND: return "fround";
case ISD::STRICT_FROUND: return "strict_fround";
+ case ISD::FROUNDEVEN: return "froundeven";
+ case ISD::STRICT_FROUNDEVEN: return "strict_froundeven";
case ISD::FEXP: return "fexp";
case ISD::STRICT_FEXP: return "strict_fexp";
case ISD::FEXP2: return "fexp2";
@@ -313,7 +317,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::UMULFIXSAT: return "umulfixsat";
case ISD::SDIVFIX: return "sdivfix";
+ case ISD::SDIVFIXSAT: return "sdivfixsat";
case ISD::UDIVFIX: return "udivfix";
+ case ISD::UDIVFIXSAT: return "udivfixsat";
// Conversion operators.
case ISD::SIGN_EXTEND: return "sign_extend";
@@ -341,7 +347,9 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::BITCAST: return "bitcast";
case ISD::ADDRSPACECAST: return "addrspacecast";
case ISD::FP16_TO_FP: return "fp16_to_fp";
+ case ISD::STRICT_FP16_TO_FP: return "strict_fp16_to_fp";
case ISD::FP_TO_FP16: return "fp_to_fp16";
+ case ISD::STRICT_FP_TO_FP16: return "strict_fp_to_fp16";
case ISD::LROUND: return "lround";
case ISD::STRICT_LROUND: return "strict_lround";
case ISD::LLROUND: return "llround";
@@ -387,6 +395,11 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const {
case ISD::GC_TRANSITION_START: return "gc_transition.start";
case ISD::GC_TRANSITION_END: return "gc_transition.end";
case ISD::GET_DYNAMIC_AREA_OFFSET: return "get.dynamic.area.offset";
+ case ISD::FREEZE: return "freeze";
+ case ISD::PREALLOCATED_SETUP:
+ return "call_setup";
+ case ISD::PREALLOCATED_ARG:
+ return "call_alloc";
// Bit manipulation
case ISD::ABS: return "abs";
@@ -547,9 +560,6 @@ void SDNode::print_details(raw_ostream &OS, const SelectionDAG *G) const {
if (getFlags().hasAllowReassociation())
OS << " reassoc";
- if (getFlags().hasVectorReduction())
- OS << " vector-reduction";
-
if (getFlags().hasNoFPExcept())
OS << " nofpexcept";
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
index 6c57c72d47a7..1f0432196a2d 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp
@@ -215,6 +215,7 @@ namespace llvm {
OptLevelChanger(SelectionDAGISel &ISel,
CodeGenOpt::Level NewOptLevel) : IS(ISel) {
SavedOptLevel = IS.OptLevel;
+ SavedFastISel = IS.TM.Options.EnableFastISel;
if (NewOptLevel == SavedOptLevel)
return;
IS.OptLevel = NewOptLevel;
@@ -223,7 +224,6 @@ namespace llvm {
<< IS.MF->getFunction().getName() << "\n");
LLVM_DEBUG(dbgs() << "\tBefore: -O" << SavedOptLevel << " ; After: -O"
<< NewOptLevel << "\n");
- SavedFastISel = IS.TM.Options.EnableFastISel;
if (NewOptLevel == CodeGenOpt::None) {
IS.TM.setFastISel(IS.TM.getO0WantsFastISel());
LLVM_DEBUG(
@@ -337,7 +337,8 @@ void SelectionDAGISel::getAnalysisUsage(AnalysisUsage &AU) const {
if (UseMBPI && OptLevel != CodeGenOpt::None)
AU.addRequired<BranchProbabilityInfoWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
- LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
+ if (OptLevel != CodeGenOpt::None)
+ LazyBlockFrequencyInfoPass::getLazyBFIAnalysisUsage(AU);
MachineFunctionPass::getAnalysisUsage(AU);
}
@@ -441,9 +442,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
auto *LIWP = getAnalysisIfAvailable<LoopInfoWrapperPass>();
LoopInfo *LI = LIWP ? &LIWP->getLoopInfo() : nullptr;
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto *BFI = (PSI && PSI->hasProfileSummary()) ?
- &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI() :
- nullptr;
+ BlockFrequencyInfo *BFI = nullptr;
+ if (PSI && PSI->hasProfileSummary() && OptLevel != CodeGenOpt::None)
+ BFI = &getAnalysis<LazyBlockFrequencyInfoPass>().getBFI();
LLVM_DEBUG(dbgs() << "\n\n\n=== " << Fn.getName() << "\n");
@@ -513,15 +514,15 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
// registers. If we don't apply the reg fixups before, some registers may
// appear as unused and will be skipped, resulting in bad MI.
MachineRegisterInfo &MRI = MF->getRegInfo();
- for (DenseMap<unsigned, unsigned>::iterator I = FuncInfo->RegFixups.begin(),
+ for (DenseMap<Register, Register>::iterator I = FuncInfo->RegFixups.begin(),
E = FuncInfo->RegFixups.end();
I != E; ++I) {
- unsigned From = I->first;
- unsigned To = I->second;
+ Register From = I->first;
+ Register To = I->second;
// If To is also scheduled to be replaced, find what its ultimate
// replacement is.
while (true) {
- DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
+ DenseMap<Register, Register>::iterator J = FuncInfo->RegFixups.find(To);
if (J == E)
break;
To = J->second;
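The loop above chases each RegFixups entry to its ultimate replacement before applying it. A standalone illustration of that chain-following idiom (not part of the patch; plain unsigned keys stand in for Register):

#include "llvm/ADT/DenseMap.h"

// Follow the fixup chain Reg -> Fixups[Reg] -> ... until a register with no
// further scheduled replacement is reached, and return that final register.
static unsigned resolveFixup(const llvm::DenseMap<unsigned, unsigned> &Fixups,
                             unsigned Reg) {
  auto It = Fixups.find(Reg);
  while (It != Fixups.end()) {
    Reg = It->second;
    It = Fixups.find(Reg);
  }
  return Reg;
}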
@@ -622,7 +623,9 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
// Otherwise this is another use or second copy use.
CopyUseMI = nullptr; break;
}
- if (CopyUseMI) {
+ if (CopyUseMI &&
+ TRI.getRegSizeInBits(LDI->second, MRI) ==
+ TRI.getRegSizeInBits(CopyUseMI->getOperand(0).getReg(), MRI)) {
// Use MI's debug location, which describes where Variable was
// declared, rather than whatever is attached to CopyUseMI.
MachineInstr *NewMI =
@@ -658,36 +661,6 @@ bool SelectionDAGISel::runOnMachineFunction(MachineFunction &mf) {
// Determine if floating point is used for msvc
computeUsesMSVCFloatingPoint(TM.getTargetTriple(), Fn, MF->getMMI());
- // Replace forward-declared registers with the registers containing
- // the desired value.
- for (DenseMap<unsigned, unsigned>::iterator
- I = FuncInfo->RegFixups.begin(), E = FuncInfo->RegFixups.end();
- I != E; ++I) {
- unsigned From = I->first;
- unsigned To = I->second;
- // If To is also scheduled to be replaced, find what its ultimate
- // replacement is.
- while (true) {
- DenseMap<unsigned, unsigned>::iterator J = FuncInfo->RegFixups.find(To);
- if (J == E) break;
- To = J->second;
- }
- // Make sure the new register has a sufficiently constrained register class.
- if (Register::isVirtualRegister(From) && Register::isVirtualRegister(To))
- MRI.constrainRegClass(To, MRI.getRegClass(From));
- // Replace it.
-
-
- // Replacing one register with another won't touch the kill flags.
- // We need to conservatively clear the kill flags as a kill on the old
- // register might dominate existing uses of the new register.
- if (!MRI.use_empty(To))
- MRI.clearKillFlags(From);
- MRI.replaceRegWith(From, To);
- }
-
- TLI->finalizeLowering(*MF);
-
// Release function-specific state. SDB and CurDAG are already cleared
// at this point.
FuncInfo->clear();
@@ -1321,8 +1294,11 @@ static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) {
assert(DI->getVariable() && "Missing variable");
assert(DI->getDebugLoc() && "Missing location");
const Value *Address = DI->getAddress();
- if (!Address)
+ if (!Address) {
+ LLVM_DEBUG(dbgs() << "processDbgDeclares skipping " << *DI
+ << " (bad address)\n");
continue;
+ }
// Look through casts and constant offset GEPs. These mostly come from
// inalloca.
@@ -1347,6 +1323,8 @@ static void processDbgDeclares(FunctionLoweringInfo &FuncInfo) {
if (Offset.getBoolValue())
Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset,
Offset.getZExtValue());
+ LLVM_DEBUG(dbgs() << "processDbgDeclares: setVariableDbgInfo FI=" << FI
+ << ", " << *DI << "\n");
MF->setVariableDbgInfo(DI->getVariable(), Expr, FI, DI->getDebugLoc());
}
}
@@ -1513,8 +1491,8 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
// to keep track of gc-relocates for a particular gc-statepoint. This is
// done by SelectionDAGBuilder::LowerAsSTATEPOINT, called before
// visitGCRelocate.
- if (isa<CallInst>(Inst) && !isStatepoint(Inst) && !isGCRelocate(Inst) &&
- !isGCResult(Inst)) {
+ if (isa<CallInst>(Inst) && !isa<GCStatepointInst>(Inst) &&
+ !isa<GCRelocateInst>(Inst) && !isa<GCResultInst>(Inst)) {
OptimizationRemarkMissed R("sdagisel", "FastISelFailure",
Inst->getDebugLoc(), LLVMBB);
@@ -1532,7 +1510,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) {
if (!Inst->getType()->isVoidTy() && !Inst->getType()->isTokenTy() &&
!Inst->use_empty()) {
- unsigned &R = FuncInfo->ValueMap[Inst];
+ Register &R = FuncInfo->ValueMap[Inst];
if (!R)
R = FuncInfo->CreateRegs(Inst);
}
@@ -2234,14 +2212,14 @@ bool SelectionDAGISel::IsLegalToFold(SDValue N, SDNode *U, SDNode *Root,
return !findNonImmUse(Root, N.getNode(), U, IgnoreChains);
}
-void SelectionDAGISel::Select_INLINEASM(SDNode *N, bool Branch) {
+void SelectionDAGISel::Select_INLINEASM(SDNode *N) {
SDLoc DL(N);
std::vector<SDValue> Ops(N->op_begin(), N->op_end());
SelectInlineAsmMemoryOperands(Ops, DL);
const EVT VTs[] = {MVT::Other, MVT::Glue};
- SDValue New = CurDAG->getNode(Branch ? ISD::INLINEASM_BR : ISD::INLINEASM, DL, VTs, Ops);
+ SDValue New = CurDAG->getNode(N->getOpcode(), DL, VTs, Ops);
New->setNodeId(-1);
ReplaceUses(N, New.getNode());
CurDAG->RemoveDeadNode(N);
@@ -2285,6 +2263,14 @@ void SelectionDAGISel::Select_UNDEF(SDNode *N) {
CurDAG->SelectNodeTo(N, TargetOpcode::IMPLICIT_DEF, N->getValueType(0));
}
+void SelectionDAGISel::Select_FREEZE(SDNode *N) {
+ // TODO: We don't have a FREEZE pseudo-instruction at the MachineInstr level
+ // yet. If a FREEZE instruction is added later, the code below must be changed
+ // as well.
+ CurDAG->SelectNodeTo(N, TargetOpcode::COPY, N->getValueType(0),
+ N->getOperand(0));
+}
+
/// GetVBR - decode a vbr encoding whose top bit is set.
LLVM_ATTRIBUTE_ALWAYS_INLINE static inline uint64_t
GetVBR(uint64_t Val, const unsigned char *MatcherTable, unsigned &Idx) {
@@ -2804,13 +2790,13 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
return;
case ISD::AssertSext:
case ISD::AssertZext:
+ case ISD::AssertAlign:
ReplaceUses(SDValue(NodeToMatch, 0), NodeToMatch->getOperand(0));
CurDAG->RemoveDeadNode(NodeToMatch);
return;
case ISD::INLINEASM:
case ISD::INLINEASM_BR:
- Select_INLINEASM(NodeToMatch,
- NodeToMatch->getOpcode() == ISD::INLINEASM_BR);
+ Select_INLINEASM(NodeToMatch);
return;
case ISD::READ_REGISTER:
Select_READ_REGISTER(NodeToMatch);
@@ -2821,6 +2807,9 @@ void SelectionDAGISel::SelectCodeCommon(SDNode *NodeToMatch,
case ISD::UNDEF:
Select_UNDEF(NodeToMatch);
return;
+ case ISD::FREEZE:
+ Select_FREEZE(NodeToMatch);
+ return;
}
assert(!NodeToMatch->isMachineOpcode() && "Node already selected!");
@@ -3693,12 +3682,11 @@ bool SelectionDAGISel::isOrEquivalentToAdd(const SDNode *N) const {
// Detect when "or" is used to add an offset to a stack object.
if (auto *FN = dyn_cast<FrameIndexSDNode>(N->getOperand(0))) {
MachineFrameInfo &MFI = MF->getFrameInfo();
- unsigned A = MFI.getObjectAlignment(FN->getIndex());
- assert(isPowerOf2_32(A) && "Unexpected alignment");
+ Align A = MFI.getObjectAlign(FN->getIndex());
int32_t Off = C->getSExtValue();
// If the alleged offset fits in the zero bits guaranteed by
// the alignment, then this or is really an add.
- return (Off >= 0) && (((A - 1) & Off) == unsigned(Off));
+ return (Off >= 0) && (((A.value() - 1) & Off) == unsigned(Off));
}
return false;
}
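The rewritten check treats an OR as an ADD when the offset fits entirely in the low bits guaranteed zero by the frame object's alignment. A standalone worked example of that arithmetic (illustrative only; for instance an 8-byte-aligned object with offset 5 qualifies, offset 9 does not):

#include <cstdint>

// AlignBytes must be a power of two. Its low log2(AlignBytes) bits are known
// zero in the base address, so any non-negative offset that fits entirely in
// those bits can be merged with OR instead of ADD.
static bool orActsAsAdd(uint64_t AlignBytes, int32_t Off) {
  return Off >= 0 && ((AlignBytes - 1) & uint64_t(Off)) == uint64_t(Off);
}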
diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
index cdc09d59f6a4..059a6baf967a 100644
--- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp
@@ -70,7 +70,7 @@ namespace llvm {
}
static std::string getGraphName(const SelectionDAG *G) {
- return G->getMachineFunction().getName();
+ return std::string(G->getMachineFunction().getName());
}
static bool renderGraphFromBottomUp() {
@@ -164,6 +164,20 @@ void SelectionDAG::viewGraph() {
viewGraph("");
}
+/// Just dump a dot graph to a user-provided path with the given title.
+/// This doesn't open the dot viewer program and helps visualization
+/// when outside a debugging session.
+/// FileName expects an absolute path. If it is provided without any
+/// path separators, the file will be created in the current directory.
+/// An error will be emitted if the path is invalid.
+#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
+LLVM_DUMP_METHOD void SelectionDAG::dumpDotGraph(const Twine &FileName,
+ const Twine &Title) {
+ dumpDotGraphToFile(this, FileName, Title);
+}
+#endif
+
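A short usage note for the new hook (assumptions: a live SelectionDAG pointer named CurDAG, a build with assertions or LLVM_ENABLE_DUMP, and an illustrative file name): it can be called from code or from a debugger prompt to write the graph without launching a viewer.

#include "llvm/CodeGen/SelectionDAG.h"

// Write the current DAG to dag.main.dot in the working directory, e.g. while
// stopped in a debugger: (gdb) call CurDAG->dumpDotGraph("dag.main.dot", "isel")
static void dumpCurrentDAG(llvm::SelectionDAG *CurDAG) {
  CurDAG->dumpDotGraph("dag.main.dot", "isel dag");
}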
/// clearGraphAttrs - Clear all previously defined node graph attributes.
/// Intended to be used from a debugging tool (eg. gdb).
void SelectionDAG::clearGraphAttrs() {
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
index c628f379e415..2cb57c1d1ccc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
@@ -41,6 +42,7 @@
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
@@ -61,6 +63,10 @@ STATISTIC(NumOfStatepoints, "Number of statepoint nodes encountered");
STATISTIC(StatepointMaxSlotsRequired,
"Maximum number of stack slots required for a single statepoint");
+cl::opt<bool> UseRegistersForDeoptValues(
+ "use-registers-for-deopt-values", cl::Hidden, cl::init(false),
+ cl::desc("Allow using registers for non pointer deopt args"));
+
static void pushStackMapConstant(SmallVectorImpl<SDValue>& Ops,
SelectionDAGBuilder &Builder, uint64_t Value) {
SDLoc L = Builder.getCurSDLoc();
@@ -215,6 +221,28 @@ static Optional<int> findPreviousSpillSlot(const Value *Val,
return None;
}
+
+/// Return true if-and-only-if the given SDValue can be lowered as either a
+/// constant argument or a stack reference. The key point is that the value
+/// doesn't need to be spilled or tracked as a vreg use.
+static bool willLowerDirectly(SDValue Incoming) {
+ // We are making an unchecked assumption that the frame size <= 2^16 as that
+ // is the largest offset which can be encoded in the stackmap format.
+ if (isa<FrameIndexSDNode>(Incoming))
+ return true;
+
+ // The largest constant describable in the StackMap format is 64 bits.
+ // Potential optimization: constant values are sign extended by the consumer,
+ // and thus there are many constants of static type > 64 bits whose value
+ // happens to be sext(Con64) and could thus be lowered directly.
+ if (Incoming.getValueType().getSizeInBits() > 64)
+ return false;
+
+ return (isa<ConstantSDNode>(Incoming) || isa<ConstantFPSDNode>(Incoming) ||
+ Incoming.isUndef());
+}
+
+
/// Try to find existing copies of the incoming values in stack slots used for
/// statepoint spilling. If we can find a spill slot for the incoming value,
/// mark that slot as allocated, and reuse the same slot for this safepoint.
@@ -224,11 +252,10 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue,
SelectionDAGBuilder &Builder) {
SDValue Incoming = Builder.getValue(IncomingValue);
- if (isa<ConstantSDNode>(Incoming) || isa<FrameIndexSDNode>(Incoming)) {
- // We won't need to spill this, so no need to check for previously
- // allocated stack slots
+ // If we won't spill this, we don't need to check for previously allocated
+ // stack slots.
+ if (willLowerDirectly(Incoming))
return;
- }
SDValue OldLocation = Builder.StatepointLowering.getLocation(Incoming);
if (OldLocation.getNode())
@@ -268,45 +295,6 @@ static void reservePreviousStackSlotForValue(const Value *IncomingValue,
Builder.StatepointLowering.setLocation(Incoming, Loc);
}
-/// Remove any duplicate (as SDValues) from the derived pointer pairs. This
-/// is not required for correctness. It's purpose is to reduce the size of
-/// StackMap section. It has no effect on the number of spill slots required
-/// or the actual lowering.
-static void
-removeDuplicateGCPtrs(SmallVectorImpl<const Value *> &Bases,
- SmallVectorImpl<const Value *> &Ptrs,
- SmallVectorImpl<const GCRelocateInst *> &Relocs,
- SelectionDAGBuilder &Builder,
- FunctionLoweringInfo::StatepointSpillMap &SSM) {
- DenseMap<SDValue, const Value *> Seen;
-
- SmallVector<const Value *, 64> NewBases, NewPtrs;
- SmallVector<const GCRelocateInst *, 64> NewRelocs;
- for (size_t i = 0, e = Ptrs.size(); i < e; i++) {
- SDValue SD = Builder.getValue(Ptrs[i]);
- auto SeenIt = Seen.find(SD);
-
- if (SeenIt == Seen.end()) {
- // Only add non-duplicates
- NewBases.push_back(Bases[i]);
- NewPtrs.push_back(Ptrs[i]);
- NewRelocs.push_back(Relocs[i]);
- Seen[SD] = Ptrs[i];
- } else {
- // Duplicate pointer found, note in SSM and move on:
- SSM.DuplicateMap[Ptrs[i]] = SeenIt->second;
- }
- }
- assert(Bases.size() >= NewBases.size());
- assert(Ptrs.size() >= NewPtrs.size());
- assert(Relocs.size() >= NewRelocs.size());
- Bases = NewBases;
- Ptrs = NewPtrs;
- Relocs = NewRelocs;
- assert(Ptrs.size() == Bases.size());
- assert(Ptrs.size() == Relocs.size());
-}
-
/// Extract call from statepoint, lower it and return pointer to the
/// call node. Also update NodeMap so that getValue(statepoint) will
/// reference lowered call result
@@ -353,9 +341,9 @@ static MachineMemOperand* getMachineMemOperand(MachineFunction &MF,
auto MMOFlags = MachineMemOperand::MOStore |
MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
auto &MFI = MF.getFrameInfo();
- return MF.getMachineMemOperand(PtrInfo, MMOFlags,
+ return MF.getMachineMemOperand(PtrInfo, MMOFlags,
MFI.getObjectSize(FI.getIndex()),
- MFI.getObjectAlignment(FI.getIndex()));
+ MFI.getObjectAlign(FI.getIndex()));
}
/// Spill a value incoming to the statepoint. It might be either part of
@@ -393,10 +381,9 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
// slots with preferred alignments larger than frame alignment..
auto &MF = Builder.DAG.getMachineFunction();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index);
- auto *StoreMMO =
- MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- MFI.getObjectSize(Index),
- MFI.getObjectAlignment(Index));
+ auto *StoreMMO = MF.getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(Index),
+ MFI.getObjectAlign(Index));
Chain = Builder.DAG.getStore(Chain, Builder.getCurSDLoc(), Incoming, Loc,
StoreMMO);
@@ -412,59 +399,81 @@ spillIncomingStatepointValue(SDValue Incoming, SDValue Chain,
/// Lower a single value incoming to a statepoint node. This value can be
/// either a deopt value or a gc value, the handling is the same. We special
/// case constants and allocas, then fall back to spilling if required.
-static void lowerIncomingStatepointValue(SDValue Incoming, bool LiveInOnly,
- SmallVectorImpl<SDValue> &Ops,
- SmallVectorImpl<MachineMemOperand*> &MemRefs,
- SelectionDAGBuilder &Builder) {
- // Note: We know all of these spills are independent, but don't bother to
- // exploit that chain wise. DAGCombine will happily do so as needed, so
- // doing it here would be a small compile time win at most.
- SDValue Chain = Builder.getRoot();
-
- if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+static void
+lowerIncomingStatepointValue(SDValue Incoming, bool RequireSpillSlot,
+ SmallVectorImpl<SDValue> &Ops,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs,
+ SelectionDAGBuilder &Builder) {
+
+ if (willLowerDirectly(Incoming)) {
+ if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
+ // This handles allocas as arguments to the statepoint (this is only
+ // really meaningful for a deopt value. For GC, we'd be trying to
+ // relocate the address of the alloca itself?)
+ assert(Incoming.getValueType() == Builder.getFrameIndexTy() &&
+ "Incoming value is a frame index!");
+ Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
+ Builder.getFrameIndexTy()));
+
+ auto &MF = Builder.DAG.getMachineFunction();
+ auto *MMO = getMachineMemOperand(MF, *FI);
+ MemRefs.push_back(MMO);
+ return;
+ }
+
+ assert(Incoming.getValueType().getSizeInBits() <= 64);
+
+ if (Incoming.isUndef()) {
+ // Put in an easily recognized constant that's unlikely to be a valid
+ // value so that uses of undef by the consumer of the stackmap are
+ // easily recognized. This is legal since the compiler is always
+ // allowed to choose an arbitrary value for undef.
+ pushStackMapConstant(Ops, Builder, 0xFEFEFEFE);
+ return;
+ }
+
// If the original value was a constant, make sure it gets recorded as
// such in the stackmap. This is required so that the consumer can
// parse any internal format to the deopt state. It also handles null
- // pointers and other constant pointers in GC states. Note the constant
- // vectors do not appear to actually hit this path and that anything larger
- // than an i64 value (not type!) will fail asserts here.
- pushStackMapConstant(Ops, Builder, C->getSExtValue());
- } else if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(Incoming)) {
- // This handles allocas as arguments to the statepoint (this is only
- // really meaningful for a deopt value. For GC, we'd be trying to
- // relocate the address of the alloca itself?)
- assert(Incoming.getValueType() == Builder.getFrameIndexTy() &&
- "Incoming value is a frame index!");
- Ops.push_back(Builder.DAG.getTargetFrameIndex(FI->getIndex(),
- Builder.getFrameIndexTy()));
+ // pointers and other constant pointers in GC states.
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Incoming)) {
+ pushStackMapConstant(Ops, Builder, C->getSExtValue());
+ return;
+ } else if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Incoming)) {
+ pushStackMapConstant(Ops, Builder,
+ C->getValueAPF().bitcastToAPInt().getZExtValue());
+ return;
+ }
- auto &MF = Builder.DAG.getMachineFunction();
- auto *MMO = getMachineMemOperand(MF, *FI);
- MemRefs.push_back(MMO);
-
- } else if (LiveInOnly) {
+ llvm_unreachable("unhandled direct lowering case");
+ }
+
+
+
+ if (!RequireSpillSlot) {
// If this value is live in (not live-on-return, or live-through), we can
// treat it the same way patchpoint treats its "live in" values. We'll
// end up folding some of these into stack references, but they'll be
// handled by the register allocator. Note that we do not have the notion
// of a late use so these values might be placed in registers which are
- // clobbered by the call. This is fine for live-in.
+ // clobbered by the call. This is fine for live-in. For live-through values,
+ // a fix-up pass should be executed to force spilling of such registers.
Ops.push_back(Incoming);
} else {
- // Otherwise, locate a spill slot and explicitly spill it so it
- // can be found by the runtime later. We currently do not support
- // tracking values through callee saved registers to their eventual
- // spill location. This would be a useful optimization, but would
- // need to be optional since it requires a lot of complexity on the
- // runtime side which not all would support.
+ // Otherwise, locate a spill slot and explicitly spill it so it can be
+ // found by the runtime later. Note: We know all of these spills are
+ // independent, but don't bother to exploit that chain wise. DAGCombine
+ // will happily do so as needed, so doing it here would be a small compile
+ // time win at most.
+ SDValue Chain = Builder.getRoot();
auto Res = spillIncomingStatepointValue(Incoming, Chain, Builder);
Ops.push_back(std::get<0>(Res));
if (auto *MMO = std::get<2>(Res))
MemRefs.push_back(MMO);
Chain = std::get<1>(Res);
+ Builder.DAG.setRoot(Chain);
}
- Builder.DAG.setRoot(Chain);
}
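Summarizing the policy implemented above as a tiny decision helper (illustrative only; the enum and function names are invented for this sketch): a value that lowers directly needs no operand beyond a constant or frame index, a non-spill value is passed as a patchpoint-style live-in, and everything else gets an explicit spill slot recorded in the stackmap.

enum class StatepointValueLowering { Direct, LiveInRegister, SpillSlot };

// Mirrors the branch structure of lowerIncomingStatepointValue above.
static StatepointValueLowering classify(bool LowersDirectly,
                                        bool RequireSpillSlot) {
  if (LowersDirectly)
    return StatepointValueLowering::Direct;         // constant, undef, or frame index
  if (!RequireSpillSlot)
    return StatepointValueLowering::LiveInRegister; // register allocator decides
  return StatepointValueLowering::SpillSlot;        // spilled and recorded
}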
/// Lower deopt state and gc pointer arguments of the statepoint. The actual
@@ -522,8 +531,18 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
const bool LiveInDeopt =
SI.StatepointFlags & (uint64_t)StatepointFlags::DeoptLiveIn;
- auto isGCValue =[&](const Value *V) {
- return is_contained(SI.Ptrs, V) || is_contained(SI.Bases, V);
+ auto isGCValue = [&](const Value *V) {
+ auto *Ty = V->getType();
+ if (!Ty->isPtrOrPtrVectorTy())
+ return false;
+ if (auto *GFI = Builder.GFI)
+ if (auto IsManaged = GFI->getStrategy().isGCManagedPointer(Ty))
+ return *IsManaged;
+ return true; // conservative
+ };
+
+ auto requireSpillSlot = [&](const Value *V) {
+ return !(LiveInDeopt || UseRegistersForDeoptValues) || isGCValue(V);
};
// Before we actually start lowering (and allocating spill slots for values),
@@ -532,7 +551,7 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// doesn't change semantics at all. It is important for performance that we
// reserve slots for both deopt and gc values before lowering either.
for (const Value *V : SI.DeoptState) {
- if (!LiveInDeopt || isGCValue(V))
+ if (requireSpillSlot(V))
reservePreviousStackSlotForValue(V, Builder);
}
for (unsigned i = 0; i < SI.Bases.size(); ++i) {
@@ -559,8 +578,8 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
}
if (!Incoming.getNode())
Incoming = Builder.getValue(V);
- const bool LiveInValue = LiveInDeopt && !isGCValue(V);
- lowerIncomingStatepointValue(Incoming, LiveInValue, Ops, MemRefs, Builder);
+ lowerIncomingStatepointValue(Incoming, requireSpillSlot(V), Ops, MemRefs,
+ Builder);
}
// Finally, go ahead and lower all the gc arguments. There's no prefixed
@@ -570,12 +589,14 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// (base[0], ptr[0], base[1], ptr[1], ...)
for (unsigned i = 0; i < SI.Bases.size(); ++i) {
const Value *Base = SI.Bases[i];
- lowerIncomingStatepointValue(Builder.getValue(Base), /*LiveInOnly*/ false,
- Ops, MemRefs, Builder);
+ lowerIncomingStatepointValue(Builder.getValue(Base),
+ /*RequireSpillSlot*/ true, Ops, MemRefs,
+ Builder);
const Value *Ptr = SI.Ptrs[i];
- lowerIncomingStatepointValue(Builder.getValue(Ptr), /*LiveInOnly*/ false,
- Ops, MemRefs, Builder);
+ lowerIncomingStatepointValue(Builder.getValue(Ptr),
+ /*RequireSpillSlot*/ true, Ops, MemRefs,
+ Builder);
}
// If there are any explicit spill slots passed to the statepoint, record
@@ -610,7 +631,7 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
SDValue Loc = Builder.StatepointLowering.getLocation(SDV);
if (Loc.getNode()) {
- SpillMap.SlotMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
+ SpillMap[V] = cast<FrameIndexSDNode>(Loc)->getIndex();
} else {
// Record value as visited, but not spilled. This is the case for allocas
// and constants. For these values we can avoid emitting a spill load while
// visiting the corresponding gc_relocate.
@@ -618,7 +639,7 @@ lowerStatepointMetaArgs(SmallVectorImpl<SDValue> &Ops,
// Actually we do not need to record them in this map at all.
// We do this only to check that we are not relocating any unvisited
// value.
- SpillMap.SlotMap[V] = None;
+ SpillMap[V] = None;
// Default llvm mechanisms for exporting values which are used in
// different basic blocks does not work for gc relocates.
@@ -641,24 +662,15 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
NumOfStatepoints++;
// Clear state
StatepointLowering.startNewStatepoint(*this);
+ assert(SI.Bases.size() == SI.Ptrs.size() &&
+ SI.Ptrs.size() <= SI.GCRelocates.size());
#ifndef NDEBUG
- // We schedule gc relocates before removeDuplicateGCPtrs since we _will_
- // encounter the duplicate gc relocates we elide in removeDuplicateGCPtrs.
for (auto *Reloc : SI.GCRelocates)
if (Reloc->getParent() == SI.StatepointInstr->getParent())
StatepointLowering.scheduleRelocCall(*Reloc);
#endif
- // Remove any redundant llvm::Values which map to the same SDValue as another
- // input. Also has the effect of removing duplicates in the original
- // llvm::Value input list as well. This is a useful optimization for
- // reducing the size of the StackMap section. It has no other impact.
- removeDuplicateGCPtrs(SI.Bases, SI.Ptrs, SI.GCRelocates, *this,
- FuncInfo.StatepointSpillMaps[SI.StatepointInstr]);
- assert(SI.Bases.size() == SI.Ptrs.size() &&
- SI.Ptrs.size() == SI.GCRelocates.size());
-
// Lower statepoint vmstate and gcstate arguments
SmallVector<SDValue, 10> LoweredMetaArgs;
SmallVector<MachineMemOperand*, 16> MemRefs;
@@ -830,97 +842,109 @@ SDValue SelectionDAGBuilder::LowerAsSTATEPOINT(
}
void
-SelectionDAGBuilder::LowerStatepoint(ImmutableStatepoint ISP,
+SelectionDAGBuilder::LowerStatepoint(const GCStatepointInst &I,
const BasicBlock *EHPadBB /*= nullptr*/) {
- assert(ISP.getCall()->getCallingConv() != CallingConv::AnyReg &&
+ assert(I.getCallingConv() != CallingConv::AnyReg &&
"anyregcc is not supported on statepoints!");
#ifndef NDEBUG
- // If this is a malformed statepoint, report it early to simplify debugging.
- // This should catch any IR level mistake that's made when constructing or
- // transforming statepoints.
- ISP.verify();
-
// Check that the associated GCStrategy expects to encounter statepoints.
assert(GFI->getStrategy().useStatepoints() &&
"GCStrategy does not expect to encounter statepoints");
#endif
SDValue ActualCallee;
+ SDValue Callee = getValue(I.getActualCalledOperand());
- if (ISP.getNumPatchBytes() > 0) {
+ if (I.getNumPatchBytes() > 0) {
// If we've been asked to emit a nop sequence instead of a call instruction
// for this statepoint then don't lower the call target, but use a constant
- // `null` instead. Not lowering the call target lets statepoint clients get
- // away without providing a physical address for the symbolic call target at
- // link time.
-
- const auto &TLI = DAG.getTargetLoweringInfo();
- const auto &DL = DAG.getDataLayout();
-
- unsigned AS = ISP.getCalledValue()->getType()->getPointerAddressSpace();
- ActualCallee = DAG.getConstant(0, getCurSDLoc(), TLI.getPointerTy(DL, AS));
+ // `undef` instead. Not lowering the call target lets statepoint clients
+ // get away without providing a physical address for the symbolic call
+ // target at link time.
+ ActualCallee = DAG.getUNDEF(Callee.getValueType());
} else {
- ActualCallee = getValue(ISP.getCalledValue());
+ ActualCallee = Callee;
}
StatepointLoweringInfo SI(DAG);
- populateCallLoweringInfo(SI.CLI, ISP.getCall(),
- ImmutableStatepoint::CallArgsBeginPos,
- ISP.getNumCallArgs(), ActualCallee,
- ISP.getActualReturnType(), false /* IsPatchPoint */);
-
- for (const GCRelocateInst *Relocate : ISP.getRelocates()) {
+ populateCallLoweringInfo(SI.CLI, &I, GCStatepointInst::CallArgsBeginPos,
+ I.getNumCallArgs(), ActualCallee,
+ I.getActualReturnType(), false /* IsPatchPoint */);
+
+ // There may be duplication in the gc.relocate list, such as two copies of
+ // each relocation on the normal and exceptional paths of an invoke. We only
+ // need to spill once and record one copy in the stackmap, but we need to
+ // reload once per gc.relocate. (Dedupping gc.relocates is trickier and best
+ // handled as a CSE problem elsewhere.)
+ // TODO: There are a couple of major stackmap size optimizations we could do
+ // here if we wished.
+ // 1) If we've encountered a derived pair {B, D}, we don't need to actually
+ // record {B,B} if it's seen later.
+ // 2) Due to rematerialization, actual derived pointers are somewhat rare;
+ // given that, we could change the format to record base pointer relocations
+ // separately with half the space. This would require a format rev and a
+ // fairly major rework of the STATEPOINT node though.
+ SmallSet<SDValue, 8> Seen;
+ for (const GCRelocateInst *Relocate : I.getGCRelocates()) {
SI.GCRelocates.push_back(Relocate);
- SI.Bases.push_back(Relocate->getBasePtr());
- SI.Ptrs.push_back(Relocate->getDerivedPtr());
+
+ SDValue DerivedSD = getValue(Relocate->getDerivedPtr());
+ if (Seen.insert(DerivedSD).second) {
+ SI.Bases.push_back(Relocate->getBasePtr());
+ SI.Ptrs.push_back(Relocate->getDerivedPtr());
+ }
}
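The Seen set above relies on SmallSet::insert() returning a pair whose second member says whether the element was newly inserted. A standalone illustration of that dedup idiom (not part of the patch):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"

// Keep only the first occurrence of each value, preserving order.
static llvm::SmallVector<int, 8> uniqueInOrder(llvm::ArrayRef<int> In) {
  llvm::SmallSet<int, 8> Seen;
  llvm::SmallVector<int, 8> Out;
  for (int V : In)
    if (Seen.insert(V).second) // true only the first time V is seen
      Out.push_back(V);
  return Out;
}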
- SI.GCArgs = ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
- SI.StatepointInstr = ISP.getInstruction();
- SI.GCTransitionArgs =
- ArrayRef<const Use>(ISP.gc_args_begin(), ISP.gc_args_end());
- SI.ID = ISP.getID();
- SI.DeoptState = ArrayRef<const Use>(ISP.deopt_begin(), ISP.deopt_end());
- SI.StatepointFlags = ISP.getFlags();
- SI.NumPatchBytes = ISP.getNumPatchBytes();
+ SI.GCArgs = ArrayRef<const Use>(I.gc_args_begin(), I.gc_args_end());
+ SI.StatepointInstr = &I;
+ SI.ID = I.getID();
+
+ SI.DeoptState = ArrayRef<const Use>(I.deopt_begin(), I.deopt_end());
+ SI.GCTransitionArgs = ArrayRef<const Use>(I.gc_transition_args_begin(),
+ I.gc_transition_args_end());
+
+ SI.StatepointFlags = I.getFlags();
+ SI.NumPatchBytes = I.getNumPatchBytes();
SI.EHPadBB = EHPadBB;
SDValue ReturnValue = LowerAsSTATEPOINT(SI);
// Export the result value if needed
- const GCResultInst *GCResult = ISP.getGCResult();
- Type *RetTy = ISP.getActualReturnType();
- if (!RetTy->isVoidTy() && GCResult) {
- if (GCResult->getParent() != ISP.getCall()->getParent()) {
- // Result value will be used in a different basic block so we need to
- // export it now. Default exporting mechanism will not work here because
- // statepoint call has a different type than the actual call. It means
- // that by default llvm will create export register of the wrong type
- // (always i32 in our case). So instead we need to create export register
- // with correct type manually.
- // TODO: To eliminate this problem we can remove gc.result intrinsics
- // completely and make statepoint call to return a tuple.
- unsigned Reg = FuncInfo.CreateRegs(RetTy);
- RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
- DAG.getDataLayout(), Reg, RetTy,
- ISP.getCall()->getCallingConv());
- SDValue Chain = DAG.getEntryNode();
-
- RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
- PendingExports.push_back(Chain);
- FuncInfo.ValueMap[ISP.getInstruction()] = Reg;
- } else {
- // Result value will be used in a same basic block. Don't export it or
- // perform any explicit register copies.
- // We'll replace the actuall call node shortly. gc_result will grab
- // this value.
- setValue(ISP.getInstruction(), ReturnValue);
- }
- } else {
- // The token value is never used from here on, just generate a poison value
- setValue(ISP.getInstruction(), DAG.getIntPtrConstant(-1, getCurSDLoc()));
+ const GCResultInst *GCResult = I.getGCResult();
+ Type *RetTy = I.getActualReturnType();
+
+ if (RetTy->isVoidTy() || !GCResult) {
+ // The return value is not needed, just generate a poison value.
+ setValue(&I, DAG.getIntPtrConstant(-1, getCurSDLoc()));
+ return;
+ }
+
+ if (GCResult->getParent() == I.getParent()) {
+ // The result value will be used in the same basic block. Don't export it or
+ // perform any explicit register copies. The gc_result will simply grab
+ // this value.
+ setValue(&I, ReturnValue);
+ return;
}
+
+ // The result value will be used in a different basic block, so we need to
+ // export it now. The default exporting mechanism will not work here because
+ // the statepoint call has a different type than the actual call. That means
+ // llvm would by default create an export register of the wrong type (always
+ // i32 in our case), so instead we need to create an export register with the
+ // correct type manually.
+ // TODO: To eliminate this problem we can remove gc.result intrinsics
+ // completely and make statepoint call to return a tuple.
+ unsigned Reg = FuncInfo.CreateRegs(RetTy);
+ RegsForValue RFV(*DAG.getContext(), DAG.getTargetLoweringInfo(),
+ DAG.getDataLayout(), Reg, RetTy,
+ I.getCallingConv());
+ SDValue Chain = DAG.getEntryNode();
+
+ RFV.getCopyToRegs(ReturnValue, DAG, getCurSDLoc(), Chain, nullptr);
+ PendingExports.push_back(Chain);
+ FuncInfo.ValueMap[&I] = Reg;
}
void SelectionDAGBuilder::LowerCallSiteWithDeoptBundleImpl(
@@ -966,26 +990,23 @@ void SelectionDAGBuilder::LowerCallSiteWithDeoptBundle(
void SelectionDAGBuilder::visitGCResult(const GCResultInst &CI) {
// The result value of the gc_result is simply the result of the actual
// call. We've already emitted this, so just grab the value.
- const Instruction *I = CI.getStatepoint();
-
- if (I->getParent() != CI.getParent()) {
- // Statepoint is in different basic block so we should have stored call
- // result in a virtual register.
- // We can not use default getValue() functionality to copy value from this
- // register because statepoint and actual call return types can be
- // different, and getValue() will use CopyFromReg of the wrong type,
- // which is always i32 in our case.
- PointerType *CalleeType = cast<PointerType>(
- ImmutableStatepoint(I).getCalledValue()->getType());
- Type *RetTy =
- cast<FunctionType>(CalleeType->getElementType())->getReturnType();
- SDValue CopyFromReg = getCopyFromRegs(I, RetTy);
-
- assert(CopyFromReg.getNode());
- setValue(&CI, CopyFromReg);
- } else {
- setValue(&CI, getValue(I));
+ const GCStatepointInst *SI = CI.getStatepoint();
+
+ if (SI->getParent() == CI.getParent()) {
+ setValue(&CI, getValue(SI));
+ return;
}
+ // The statepoint is in a different basic block, so we should have stored the
+ // call result in a virtual register.
+ // We cannot use the default getValue() functionality to copy the value from
+ // this register because the statepoint and actual call return types can be
+ // different, and getValue() would use a CopyFromReg of the wrong type,
+ // which is always i32 in our case.
+ Type *RetTy = SI->getActualReturnType();
+ SDValue CopyFromReg = getCopyFromRegs(SI, RetTy);
+
+ assert(CopyFromReg.getNode());
+ setValue(&CI, CopyFromReg);
}
void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
@@ -1005,6 +1026,13 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
const Value *DerivedPtr = Relocate.getDerivedPtr();
SDValue SD = getValue(DerivedPtr);
+ if (SD.isUndef() && SD.getValueType().getSizeInBits() <= 64) {
+ // Lower relocate(undef) as an arbitrary constant. The current constant value
+ // is chosen such that it's unlikely to be a valid pointer.
+ setValue(&Relocate, DAG.getTargetConstant(0xFEFEFEFE, SDLoc(SD), MVT::i64));
+ return;
+ }
+
auto &SpillMap = FuncInfo.StatepointSpillMaps[Relocate.getStatepoint()];
auto SlotIt = SpillMap.find(DerivedPtr);
assert(SlotIt != SpillMap.end() && "Relocating not lowered gc value");
@@ -1020,26 +1048,27 @@ void SelectionDAGBuilder::visitGCRelocate(const GCRelocateInst &Relocate) {
unsigned Index = *DerivedPtrLocation;
SDValue SpillSlot = DAG.getTargetFrameIndex(Index, getFrameIndexTy());
- // Note: We know all of these reloads are independent, but don't bother to
- // exploit that chain wise. DAGCombine will happily do so as needed, so
- // doing it here would be a small compile time win at most.
- SDValue Chain = getRoot();
+ // All the reloads are independent and are reading memory only modified by
+ // statepoints (i.e. no other aliasing stores); informing SelectionDAG of
+ // this lets CSE kick in for free and allows reordering of instructions
+ // if possible. The lowering for statepoint sets the root, so this orders
+ // all reloads with either a) the statepoint node itself, or b)
+ // the entry of the current block for an invoke statepoint.
+ const SDValue Chain = DAG.getRoot(); // != Builder.getRoot()
auto &MF = DAG.getMachineFunction();
auto &MFI = MF.getFrameInfo();
auto PtrInfo = MachinePointerInfo::getFixedStack(MF, Index);
- auto *LoadMMO =
- MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
- MFI.getObjectSize(Index),
- MFI.getObjectAlignment(Index));
+ auto *LoadMMO = MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ MFI.getObjectSize(Index),
+ MFI.getObjectAlign(Index));
auto LoadVT = DAG.getTargetLoweringInfo().getValueType(DAG.getDataLayout(),
Relocate.getType());
SDValue SpillLoad = DAG.getLoad(LoadVT, getCurSDLoc(), Chain,
SpillSlot, LoadMMO);
-
- DAG.setRoot(SpillLoad.getValue(1));
+ PendingLoads.push_back(SpillLoad.getValue(1));
assert(SpillLoad.getNode());
setValue(&Relocate, SpillLoad);
diff --git a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
index 70507932681d..634ef87f3840 100644
--- a/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
+++ b/llvm/lib/CodeGen/SelectionDAG/StatepointLowering.h
@@ -15,11 +15,9 @@
#define LLVM_LIB_CODEGEN_SELECTIONDAG_STATEPOINTLOWERING_H
#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/SelectionDAGNodes.h"
-#include "llvm/CodeGen/ValueTypes.h"
#include <cassert>
namespace llvm {
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index 24ab65171a17..96df20039b15 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -83,7 +83,7 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
const CCValAssign &ArgLoc = ArgLocs[I];
if (!ArgLoc.isRegLoc())
continue;
- Register Reg = ArgLoc.getLocReg();
+ MCRegister Reg = ArgLoc.getLocReg();
// Only look at callee saved registers.
if (MachineOperand::clobbersPhysReg(CallerPreservedMask, Reg))
continue;
@@ -93,7 +93,7 @@ bool TargetLowering::parametersInCSRMatch(const MachineRegisterInfo &MRI,
SDValue Value = OutVals[I];
if (Value->getOpcode() != ISD::CopyFromReg)
return false;
- unsigned ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
+ MCRegister ArgReg = cast<RegisterSDNode>(Value->getOperand(1))->getReg();
if (MRI.getLiveInPhysReg(ArgReg) != Reg)
return false;
}
@@ -110,14 +110,18 @@ void TargetLoweringBase::ArgListEntry::setAttributes(const CallBase *Call,
IsSRet = Call->paramHasAttr(ArgIdx, Attribute::StructRet);
IsNest = Call->paramHasAttr(ArgIdx, Attribute::Nest);
IsByVal = Call->paramHasAttr(ArgIdx, Attribute::ByVal);
+ IsPreallocated = Call->paramHasAttr(ArgIdx, Attribute::Preallocated);
IsInAlloca = Call->paramHasAttr(ArgIdx, Attribute::InAlloca);
IsReturned = Call->paramHasAttr(ArgIdx, Attribute::Returned);
IsSwiftSelf = Call->paramHasAttr(ArgIdx, Attribute::SwiftSelf);
IsSwiftError = Call->paramHasAttr(ArgIdx, Attribute::SwiftError);
- Alignment = Call->getParamAlignment(ArgIdx);
+ Alignment = Call->getParamAlign(ArgIdx);
ByValType = nullptr;
- if (Call->paramHasAttr(ArgIdx, Attribute::ByVal))
+ if (IsByVal)
ByValType = Call->getParamByValType(ArgIdx);
+ PreallocatedType = nullptr;
+ if (IsPreallocated)
+ PreallocatedType = Call->getParamPreallocatedType(ArgIdx);
}
/// Generate a libcall taking the given operands as arguments and returning a
@@ -176,38 +180,24 @@ TargetLowering::makeLibCall(SelectionDAG &DAG, RTLIB::Libcall LC, EVT RetVT,
return LowerCallTo(CLI);
}
-bool
-TargetLowering::findOptimalMemOpLowering(std::vector<EVT> &MemOps,
- unsigned Limit, uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset,
- bool ZeroMemset,
- bool MemcpyStrSrc,
- bool AllowOverlap,
- unsigned DstAS, unsigned SrcAS,
- const AttributeList &FuncAttributes) const {
- // If 'SrcAlign' is zero, that means the memory operation does not need to
- // load the value, i.e. memset or memcpy from constant string. Otherwise,
- // it's the inferred alignment of the source. 'DstAlign', on the other hand,
- // is the specified alignment of the memory operation. If it is zero, that
- // means it's possible to change the alignment of the destination.
- // 'MemcpyStrSrc' indicates whether the memcpy source is constant so it does
- // not need to be loaded.
- if (!(SrcAlign == 0 || SrcAlign >= DstAlign))
+bool TargetLowering::findOptimalMemOpLowering(
+ std::vector<EVT> &MemOps, unsigned Limit, const MemOp &Op, unsigned DstAS,
+ unsigned SrcAS, const AttributeList &FuncAttributes) const {
+ if (Op.isMemcpyWithFixedDstAlign() && Op.getSrcAlign() < Op.getDstAlign())
return false;
- EVT VT = getOptimalMemOpType(Size, DstAlign, SrcAlign,
- IsMemset, ZeroMemset, MemcpyStrSrc,
- FuncAttributes);
+ EVT VT = getOptimalMemOpType(Op, FuncAttributes);
if (VT == MVT::Other) {
// Use the largest integer type whose alignment constraints are satisfied.
// We only need to check DstAlign here as SrcAlign is always greater or
// equal to DstAlign (or zero).
VT = MVT::i64;
- while (DstAlign && DstAlign < VT.getSizeInBits() / 8 &&
- !allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign))
- VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
+ if (Op.isFixedDstAlign())
+ while (
+ Op.getDstAlign() < (VT.getSizeInBits() / 8) &&
+ !allowsMisalignedMemoryAccesses(VT, DstAS, Op.getDstAlign().value()))
+ VT = (MVT::SimpleValueType)(VT.getSimpleVT().SimpleTy - 1);
assert(VT.isInteger());
// Find the largest legal integer type.
@@ -223,7 +213,8 @@ TargetLowering::findOptimalMemOpLowering(std::vector<EVT> &MemOps,
}
unsigned NumMemOps = 0;
- while (Size != 0) {
+ uint64_t Size = Op.size();
+ while (Size) {
unsigned VTSize = VT.getSizeInBits() / 8;
while (VTSize > Size) {
// For now, only use non-vector load / store's for the left-over pieces.
@@ -257,9 +248,10 @@ TargetLowering::findOptimalMemOpLowering(std::vector<EVT> &MemOps,
// If the new VT cannot cover all of the remaining bits, then consider
// issuing a (or a pair of) unaligned and overlapping load / store.
bool Fast;
- if (NumMemOps && AllowOverlap && NewVTSize < Size &&
- allowsMisalignedMemoryAccesses(VT, DstAS, DstAlign,
- MachineMemOperand::MONone, &Fast) &&
+ if (NumMemOps && Op.allowOverlap() && NewVTSize < Size &&
+ allowsMisalignedMemoryAccesses(
+ VT, DstAS, Op.isFixedDstAlign() ? Op.getDstAlign().value() : 0,
+ MachineMemOperand::MONone, &Fast) &&
Fast)
VTSize = Size;
else {
@@ -491,13 +483,15 @@ TargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
/// If the specified instruction has a constant integer operand and there are
/// bits set in that constant that are not demanded, then clear those bits and
/// return true.
-bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
// Do target-specific constant optimization.
- if (targetShrinkDemandedConstant(Op, Demanded, TLO))
+ if (targetShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
return TLO.New.getNode();
// FIXME: ISD::SELECT, ISD::SELECT_CC
@@ -513,12 +507,12 @@ bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
// If this is a 'not' op, don't touch it because that's a canonical form.
const APInt &C = Op1C->getAPIntValue();
- if (Opcode == ISD::XOR && Demanded.isSubsetOf(C))
+ if (Opcode == ISD::XOR && DemandedBits.isSubsetOf(C))
return false;
- if (!C.isSubsetOf(Demanded)) {
+ if (!C.isSubsetOf(DemandedBits)) {
EVT VT = Op.getValueType();
- SDValue NewC = TLO.DAG.getConstant(Demanded & C, DL, VT);
+ SDValue NewC = TLO.DAG.getConstant(DemandedBits & C, DL, VT);
SDValue NewOp = TLO.DAG.getNode(Opcode, DL, VT, Op.getOperand(0), NewC);
return TLO.CombineTo(Op, NewOp);
}
@@ -530,6 +524,16 @@ bool TargetLowering::ShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
return false;
}
+bool TargetLowering::ShrinkDemandedConstant(SDValue Op,
+ const APInt &DemandedBits,
+ TargetLoweringOpt &TLO) const {
+ EVT VT = Op.getValueType();
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO);
+}
+
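The wrapper above just defaults DemandedElts to an all-ones mask; the interesting work is the constant shrinking itself: if a constant operand of an AND/OR/XOR has bits set that no user of the result ever demands, those bits can be cleared, which frequently yields a cheaper immediate. A minimal standalone sketch of that idea in plain C++, outside the patch, with uint64_t standing in for APInt and the demanded mask supplied by the caller:

    #include <cassert>
    #include <cstdint>

    // Clear the bits of C that are not demanded; returns true if C changed.
    // On the demanded bits, (x op C) == (x op (C & Demanded)) for op in
    // {AND, OR, XOR}, so the replacement is safe.
    static bool shrinkDemandedConstant(uint64_t &C, uint64_t Demanded) {
      if ((C & ~Demanded) == 0)
        return false; // C is already a subset of the demanded bits.
      C &= Demanded;
      return true;
    }

    int main() {
      uint64_t C = 0xFF00FF00u;
      // Only the low 16 bits of the result are ever used downstream.
      bool Changed = shrinkDemandedConstant(C, 0xFFFFu);
      assert(Changed && C == 0xFF00u);
      return 0;
    }

The SelectionDAG version adds one guard this sketch omits: an all-ones XOR operand is left alone, because (xor X, -1) is the canonical form of 'not' and is better kept for combining and codegen.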
/// Convert x+y to (VT)((SmallVT)x+(SmallVT)y) if the casts are free.
/// This uses isZExtFree and ZERO_EXTEND for the widening cast, but it could be
/// generalized for targets with other types of implicit widening casts.
@@ -598,6 +602,16 @@ bool TargetLowering::SimplifyDemandedBits(SDValue Op, const APInt &DemandedBits,
unsigned Depth,
bool AssumeSingleUse) const {
EVT VT = Op.getValueType();
+
+ // TODO: We can probably do more work on calculating the known bits and
+ // simplifying the operations for scalable vectors, but for now we just
+ // bail out.
+ if (VT.isScalableVector()) {
+ // Pretend we don't know anything for now.
+ Known = KnownBits(DemandedBits.getBitWidth());
+ return false;
+ }
+
APInt DemandedElts = VT.isVector()
? APInt::getAllOnesValue(VT.getVectorNumElements())
: APInt(1, 1);
@@ -623,15 +637,18 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
return DAG.getUNDEF(Op.getValueType());
unsigned NumElts = DemandedElts.getBitWidth();
+ unsigned BitWidth = DemandedBits.getBitWidth();
KnownBits LHSKnown, RHSKnown;
switch (Op.getOpcode()) {
case ISD::BITCAST: {
SDValue Src = peekThroughBitcasts(Op.getOperand(0));
EVT SrcVT = Src.getValueType();
EVT DstVT = Op.getValueType();
+ if (SrcVT == DstVT)
+ return Src;
+
unsigned NumSrcEltBits = SrcVT.getScalarSizeInBits();
unsigned NumDstEltBits = DstVT.getScalarSizeInBits();
-
if (NumSrcEltBits == NumDstEltBits)
if (SDValue V = SimplifyMultipleUseDemandedBits(
Src, DemandedBits, DemandedElts, DAG, Depth + 1))
@@ -719,6 +736,21 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
return Op.getOperand(1);
break;
}
+ case ISD::SHL: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ if (const APInt *MaxSA =
+ DAG.getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = MaxSA->getZExtValue();
+ unsigned NumSignBits =
+ DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
+ return Op0;
+ }
+ break;
+ }
case ISD::SETCC: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
@@ -727,7 +759,7 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
// width as the setcc result, and (3) the result of a setcc conforms to 0 or
// -1, we may be able to bypass the setcc.
if (DemandedBits.isSignMask() &&
- Op0.getScalarValueSizeInBits() == DemandedBits.getBitWidth() &&
+ Op0.getScalarValueSizeInBits() == BitWidth &&
getBooleanContents(Op0.getValueType()) ==
BooleanContent::ZeroOrNegativeOneBooleanContent) {
// If we're testing X < 0, then this compare isn't needed - just use X!
@@ -742,9 +774,30 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
}
case ISD::SIGN_EXTEND_INREG: {
// If none of the extended bits are demanded, eliminate the sextinreg.
+ SDValue Op0 = Op.getOperand(0);
EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
- if (DemandedBits.getActiveBits() <= ExVT.getScalarSizeInBits())
- return Op.getOperand(0);
+ unsigned ExBits = ExVT.getScalarSizeInBits();
+ if (DemandedBits.getActiveBits() <= ExBits)
+ return Op0;
+ // If the input is already sign extended, just drop the extension.
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ if (NumSignBits >= (BitWidth - ExBits + 1))
+ return Op0;
+ break;
+ }
+ case ISD::ANY_EXTEND_VECTOR_INREG:
+ case ISD::SIGN_EXTEND_VECTOR_INREG:
+ case ISD::ZERO_EXTEND_VECTOR_INREG: {
+ // If we only want the lowest element and none of the extended bits, then we
+ // can return the bitcasted source vector.
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ EVT DstVT = Op.getValueType();
+ if (DemandedElts == 1 && DstVT.getSizeInBits() == SrcVT.getSizeInBits() &&
+ DAG.getDataLayout().isLittleEndian() &&
+ DemandedBits.getActiveBits() <= SrcVT.getScalarSizeInBits()) {
+ return DAG.getBitcast(DstVT, Src);
+ }
break;
}
case ISD::INSERT_VECTOR_ELT: {
@@ -757,6 +810,16 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
return Vec;
break;
}
+ case ISD::INSERT_SUBVECTOR: {
+ // If we don't demand the inserted subvector, return the base vector.
+ SDValue Vec = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ if (DemandedElts.extractBits(NumSubElts, Idx) == 0)
+ return Vec;
+ break;
+ }
case ISD::VECTOR_SHUFFLE: {
ArrayRef<int> ShuffleMask = cast<ShuffleVectorSDNode>(Op)->getMask();
@@ -790,6 +853,25 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
return SDValue();
}
+SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
+ SDValue Op, const APInt &DemandedBits, SelectionDAG &DAG,
+ unsigned Depth) const {
+ EVT VT = Op.getValueType();
+ APInt DemandedElts = VT.isVector()
+ ? APInt::getAllOnesValue(VT.getVectorNumElements())
+ : APInt(1, 1);
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
+ Depth);
+}
+
+SDValue TargetLowering::SimplifyMultipleUseDemandedVectorElts(
+ SDValue Op, const APInt &DemandedElts, SelectionDAG &DAG,
+ unsigned Depth) const {
+ APInt DemandedBits = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ return SimplifyMultipleUseDemandedBits(Op, DemandedBits, DemandedElts, DAG,
+ Depth);
+}
+
/// Look at Op. At this point, we know that only the OriginalDemandedBits of the
/// result of Op are ever used downstream. If we can use this information to
/// simplify Op, create a new simplified DAG node and return true, returning the
@@ -805,6 +887,15 @@ bool TargetLowering::SimplifyDemandedBits(
assert(Op.getScalarValueSizeInBits() == BitWidth &&
"Mask size mismatches value type size!");
+ // Don't know anything.
+ Known = KnownBits(BitWidth);
+
+ // TODO: We can probably do more work on calculating the known bits and
+ // simplifying the operations for scalable vectors, but for now we just
+ // bail out.
+ if (Op.getValueType().isScalableVector())
+ return false;
+
unsigned NumElts = OriginalDemandedElts.getBitWidth();
assert((!Op.getValueType().isVector() ||
NumElts == Op.getValueType().getVectorNumElements()) &&
@@ -815,9 +906,6 @@ bool TargetLowering::SimplifyDemandedBits(
SDLoc dl(Op);
auto &DL = TLO.DAG.getDataLayout();
- // Don't know anything.
- Known = KnownBits(BitWidth);
-
// Undef operand.
if (Op.isUndef())
return false;
@@ -850,7 +938,7 @@ bool TargetLowering::SimplifyDemandedBits(
return false;
}
- KnownBits Known2, KnownOut;
+ KnownBits Known2;
switch (Op.getOpcode()) {
case ISD::TargetConstant:
llvm_unreachable("Can't simplify this node");
@@ -864,7 +952,11 @@ bool TargetLowering::SimplifyDemandedBits(
APInt SrcDemandedBits = DemandedBits.zextOrSelf(SrcBitWidth);
if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcKnown, TLO, Depth + 1))
return true;
- Known = SrcKnown.zextOrTrunc(BitWidth, false);
+
+ // Upper elements are undef, so only get the knownbits if we just demand
+ // the bottom element.
+ if (DemandedElts == 1)
+ Known = SrcKnown.anyextOrTrunc(BitWidth);
break;
}
case ISD::BUILD_VECTOR:
@@ -877,6 +969,12 @@ bool TargetLowering::SimplifyDemandedBits(
if (getTargetConstantFromLoad(LD)) {
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
return false; // Don't fall through, will infinitely loop.
+ } else if (ISD::isZEXTLoad(Op.getNode()) && Op.getResNo() == 0) {
+ // If this is a ZEXTLoad and we are looking at the loaded value.
+ EVT MemVT = LD->getMemoryVT();
+ unsigned MemBits = MemVT.getScalarSizeInBits();
+ Known.Zero.setBitsFrom(MemBits);
+ return false; // Don't fall through, will infinitely loop.
}
break;
}
@@ -904,7 +1002,7 @@ bool TargetLowering::SimplifyDemandedBits(
if (SimplifyDemandedBits(Scl, DemandedSclBits, KnownScl, TLO, Depth + 1))
return true;
- Known = KnownScl.zextOrTrunc(BitWidth, false);
+ Known = KnownScl.anyextOrTrunc(BitWidth);
KnownBits KnownVec;
if (SimplifyDemandedBits(Vec, DemandedBits, DemandedVecElts, KnownVec, TLO,
@@ -919,57 +1017,75 @@ bool TargetLowering::SimplifyDemandedBits(
return false;
}
case ISD::INSERT_SUBVECTOR: {
- SDValue Base = Op.getOperand(0);
+ // Demand any elements from the subvector and the remainder from the src it's
+ // inserted into.
+ SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
- EVT SubVT = Sub.getValueType();
- unsigned NumSubElts = SubVT.getVectorNumElements();
-
- // If index isn't constant, assume we need the original demanded base
- // elements and ALL the inserted subvector elements.
- APInt BaseElts = DemandedElts;
- APInt SubElts = APInt::getAllOnesValue(NumSubElts);
- if (isa<ConstantSDNode>(Op.getOperand(2))) {
- const APInt &Idx = Op.getConstantOperandAPInt(2);
- if (Idx.ule(NumElts - NumSubElts)) {
- unsigned SubIdx = Idx.getZExtValue();
- SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
- BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
- }
- }
-
- KnownBits KnownSub, KnownBase;
- if (SimplifyDemandedBits(Sub, DemandedBits, SubElts, KnownSub, TLO,
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+
+ KnownBits KnownSub, KnownSrc;
+ if (SimplifyDemandedBits(Sub, DemandedBits, DemandedSubElts, KnownSub, TLO,
Depth + 1))
return true;
- if (SimplifyDemandedBits(Base, DemandedBits, BaseElts, KnownBase, TLO,
+ if (SimplifyDemandedBits(Src, DemandedBits, DemandedSrcElts, KnownSrc, TLO,
Depth + 1))
return true;
Known.Zero.setAllBits();
Known.One.setAllBits();
- if (!!SubElts) {
- Known.One &= KnownSub.One;
- Known.Zero &= KnownSub.Zero;
+ if (!!DemandedSubElts) {
+ Known.One &= KnownSub.One;
+ Known.Zero &= KnownSub.Zero;
}
- if (!!BaseElts) {
- Known.One &= KnownBase.One;
- Known.Zero &= KnownBase.Zero;
+ if (!!DemandedSrcElts) {
+ Known.One &= KnownSrc.One;
+ Known.Zero &= KnownSrc.Zero;
+ }
+
+ // Attempt to avoid multi-use src if we don't need anything from it.
+ if (!DemandedBits.isAllOnesValue() || !DemandedSubElts.isAllOnesValue() ||
+ !DemandedSrcElts.isAllOnesValue()) {
+ SDValue NewSub = SimplifyMultipleUseDemandedBits(
+ Sub, DemandedBits, DemandedSubElts, TLO.DAG, Depth + 1);
+ SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedBits, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (NewSub || NewSrc) {
+ NewSub = NewSub ? NewSub : Sub;
+ NewSrc = NewSrc ? NewSrc : Src;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc, NewSub,
+ Op.getOperand(2));
+ return TLO.CombineTo(Op, NewOp);
+ }
}
break;
}
case ISD::EXTRACT_SUBVECTOR: {
- // If index isn't constant, assume we need all the source vector elements.
+ // Offset the demanded elts by the subvector index.
SDValue Src = Op.getOperand(0);
- ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
- APInt SrcElts = APInt::getAllOnesValue(NumSrcElts);
- if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
- // Offset the demanded elts by the subvector index.
- uint64_t Idx = SubIdx->getZExtValue();
- SrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- }
- if (SimplifyDemandedBits(Src, DemandedBits, SrcElts, Known, TLO, Depth + 1))
+ APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+
+ if (SimplifyDemandedBits(Src, DemandedBits, DemandedSrcElts, Known, TLO,
+ Depth + 1))
return true;
+
+ // Attempt to avoid multi-use src if we don't need anything from it.
+ if (!DemandedBits.isAllOnesValue() || !DemandedSrcElts.isAllOnesValue()) {
+ SDValue DemandedSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedBits, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (DemandedSrc) {
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), dl, VT, DemandedSrc,
+ Op.getOperand(1));
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
break;
}
case ISD::CONCAT_VECTORS: {
@@ -1069,7 +1185,8 @@ bool TargetLowering::SimplifyDemandedBits(
// If any of the set bits in the RHS are known zero on the LHS, shrink
// the constant.
- if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & DemandedBits, TLO))
+ if (ShrinkDemandedConstant(Op, ~LHSKnown.Zero & DemandedBits,
+ DemandedElts, TLO))
return true;
// Bitwise-not (xor X, -1) is a special case: we don't usually shrink its
@@ -1117,16 +1234,14 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
// If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(Op, ~Known2.Zero & DemandedBits, TLO))
+ if (ShrinkDemandedConstant(Op, ~Known2.Zero & DemandedBits, DemandedElts,
+ TLO))
return true;
// If the operation can be done in a smaller type, do so.
if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
- // Output known-1 bits are only known if set in both the LHS & RHS.
- Known.One &= Known2.One;
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- Known.Zero |= Known2.Zero;
+ Known &= Known2;
break;
}
case ISD::OR: {
@@ -1163,16 +1278,13 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isSubsetOf(Known.One | Known2.Zero))
return TLO.CombineTo(Op, Op1);
// If the RHS is a constant, see if we can simplify it.
- if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
return true;
// If the operation can be done in a smaller type, do so.
if (ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
return true;
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
}
case ISD::XOR: {
@@ -1218,12 +1330,8 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isSubsetOf(Known.Zero | Known2.Zero))
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::OR, dl, VT, Op0, Op1));
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- KnownOut.Zero = (Known.Zero & Known2.Zero) | (Known.One & Known2.One);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- KnownOut.One = (Known.Zero & Known2.One) | (Known.One & Known2.Zero);
-
- if (ConstantSDNode *C = isConstOrConstSplat(Op1)) {
+ ConstantSDNode* C = isConstOrConstSplat(Op1, DemandedElts);
+ if (C) {
// If one side is a constant, and all of the known set bits on the other
// side are also set in the constant, turn this into an AND, as we know
// the bits will be cleared.
@@ -1238,19 +1346,20 @@ bool TargetLowering::SimplifyDemandedBits(
// If the RHS is a constant, see if we can change it. Don't alter a -1
// constant because that's a 'not' op, and that is better for combining
// and codegen.
- if (!C->isAllOnesValue()) {
- if (DemandedBits.isSubsetOf(C->getAPIntValue())) {
- // We're flipping all demanded bits. Flip the undemanded bits too.
- SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
- return TLO.CombineTo(Op, New);
- }
- // If we can't turn this into a 'not', try to shrink the constant.
- if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
- return true;
+ if (!C->isAllOnesValue() &&
+ DemandedBits.isSubsetOf(C->getAPIntValue())) {
+ // We're flipping all demanded bits. Flip the undemanded bits too.
+ SDValue New = TLO.DAG.getNOT(dl, Op0, VT);
+ return TLO.CombineTo(Op, New);
}
}
- Known = std::move(KnownOut);
+ // If we can't turn this into a 'not', try to shrink the constant.
+ if (!C || !C->isAllOnesValue())
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
+ return true;
+
+ Known ^= Known2;
break;
}
case ISD::SELECT:
@@ -1264,7 +1373,7 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
- if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
return true;
// Only known if known in both the LHS and RHS.
@@ -1282,7 +1391,7 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known2.hasConflict() && "Bits known to be one AND zero?");
// If the operands are constants, see if we can simplify them.
- if (ShrinkDemandedConstant(Op, DemandedBits, TLO))
+ if (ShrinkDemandedConstant(Op, DemandedBits, DemandedElts, TLO))
return true;
// Only known if known in both the LHS and RHS.
@@ -1320,12 +1429,10 @@ bool TargetLowering::SimplifyDemandedBits(
case ISD::SHL: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
+ EVT ShiftVT = Op1.getValueType();
- if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
- // If the shift count is an invalid immediate, don't do anything.
- if (SA->getAPIntValue().uge(BitWidth))
- break;
-
+ if (const APInt *SA =
+ TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
unsigned ShAmt = SA->getZExtValue();
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
@@ -1336,37 +1443,25 @@ bool TargetLowering::SimplifyDemandedBits(
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SRL) {
if (!DemandedBits.intersects(APInt::getLowBitsSet(BitWidth, ShAmt))) {
- if (ConstantSDNode *SA2 =
- isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
- if (SA2->getAPIntValue().ult(BitWidth)) {
- unsigned C1 = SA2->getZExtValue();
- unsigned Opc = ISD::SHL;
- int Diff = ShAmt - C1;
- if (Diff < 0) {
- Diff = -Diff;
- Opc = ISD::SRL;
- }
-
- SDValue NewSA = TLO.DAG.getConstant(Diff, dl, Op1.getValueType());
- return TLO.CombineTo(
- Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
+ if (const APInt *SA2 =
+ TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
+ unsigned C1 = SA2->getZExtValue();
+ unsigned Opc = ISD::SHL;
+ int Diff = ShAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SRL;
}
+ SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
}
}
}
- if (SimplifyDemandedBits(Op0, DemandedBits.lshr(ShAmt), DemandedElts,
- Known, TLO, Depth + 1))
- return true;
-
- // Try shrinking the operation as long as the shift amount will still be
- // in range.
- if ((ShAmt < DemandedBits.getActiveBits()) &&
- ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
- return true;
-
// Convert (shl (anyext x, c)) to (anyext (shl x, c)) if the high bits
// are not demanded. This will likely allow the anyext to be folded away.
+ // TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::ANY_EXTEND) {
SDValue InnerOp = Op0.getOperand(0);
EVT InnerVT = InnerOp.getValueType();
@@ -1382,22 +1477,24 @@ bool TargetLowering::SimplifyDemandedBits(
return TLO.CombineTo(
Op, TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT, NarrowShl));
}
+
// Repeat the SHL optimization above in cases where an extension
// intervenes: (shl (anyext (shr x, c1)), c2) to
// (shl (anyext x), c2-c1). This requires that the bottom c1 bits
// aren't demanded (as above) and that the shifted upper c1 bits of
// x aren't demanded.
+ // TODO - support non-uniform vector amounts.
if (Op0.hasOneUse() && InnerOp.getOpcode() == ISD::SRL &&
InnerOp.hasOneUse()) {
- if (ConstantSDNode *SA2 =
- isConstOrConstSplat(InnerOp.getOperand(1))) {
- unsigned InnerShAmt = SA2->getLimitedValue(InnerBits);
+ if (const APInt *SA2 =
+ TLO.DAG.getValidShiftAmountConstant(InnerOp, DemandedElts)) {
+ unsigned InnerShAmt = SA2->getZExtValue();
if (InnerShAmt < ShAmt && InnerShAmt < InnerBits &&
DemandedBits.getActiveBits() <=
(InnerBits - InnerShAmt + ShAmt) &&
DemandedBits.countTrailingZeros() >= ShAmt) {
- SDValue NewSA = TLO.DAG.getConstant(ShAmt - InnerShAmt, dl,
- Op1.getValueType());
+ SDValue NewSA =
+ TLO.DAG.getConstant(ShAmt - InnerShAmt, dl, ShiftVT);
SDValue NewExt = TLO.DAG.getNode(ISD::ANY_EXTEND, dl, VT,
InnerOp.getOperand(0));
return TLO.CombineTo(
@@ -1407,60 +1504,76 @@ bool TargetLowering::SimplifyDemandedBits(
}
}
+ APInt InDemandedMask = DemandedBits.lshr(ShAmt);
+ if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
+ Depth + 1))
+ return true;
+ assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero <<= ShAmt;
Known.One <<= ShAmt;
// low bits known zero.
Known.Zero.setLowBits(ShAmt);
+
+ // Try shrinking the operation as long as the shift amount will still be
+ // in range.
+ if ((ShAmt < DemandedBits.getActiveBits()) &&
+ ShrinkDemandedOp(Op, BitWidth, DemandedBits, TLO))
+ return true;
+ }
+
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ if (const APInt *MaxSA =
+ TLO.DAG.getValidMaximumShiftAmountConstant(Op, DemandedElts)) {
+ unsigned ShAmt = MaxSA->getZExtValue();
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= (UpperDemandedBits))
+ return TLO.CombineTo(Op, Op0);
}
break;
}
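A worked instance of the "only demanding sign bits" rule added to the SHL case above, as self-contained C++ (the concrete constants are illustrative only): if the shift source has NumSignBits sign bits and we shift left by ShAmt, the top (NumSignBits - ShAmt) bits of the result are still copies of the sign bit, so when no more than that many upper bits are demanded the unshifted source can be used directly.

    #include <cassert>
    #include <cstdint>

    int main() {
      // X has 20 sign bits: bits 31..12 are all ones.
      uint32_t X = 0xFFFFF123u;
      unsigned ShAmt = 4;
      uint32_t Shifted = X << ShAmt; // 0xFFFF1230
      // Only the top 8 bits are demanded. Since 20 - 4 >= 8, those bits are
      // still copies of the sign bit, so (X << 4) and X agree on all of them.
      assert((Shifted >> 24) == (X >> 24)); // both 0xFF
      return 0;
    }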
case ISD::SRL: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
+ EVT ShiftVT = Op1.getValueType();
- if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
- // If the shift count is an invalid immediate, don't do anything.
- if (SA->getAPIntValue().uge(BitWidth))
- break;
-
+ if (const APInt *SA =
+ TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
unsigned ShAmt = SA->getZExtValue();
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
- EVT ShiftVT = Op1.getValueType();
- APInt InDemandedMask = (DemandedBits << ShAmt);
-
- // If the shift is exact, then it does demand the low bits (and knows that
- // they are zero).
- if (Op->getFlags().hasExact())
- InDemandedMask.setLowBits(ShAmt);
-
// If this is ((X << C1) >>u ShAmt), see if we can simplify this into a
// single shift. We can do this if the top bits (which are shifted out)
// are never demanded.
// TODO - support non-uniform vector amounts.
if (Op0.getOpcode() == ISD::SHL) {
- if (ConstantSDNode *SA2 =
- isConstOrConstSplat(Op0.getOperand(1), DemandedElts)) {
- if (!DemandedBits.intersects(
- APInt::getHighBitsSet(BitWidth, ShAmt))) {
- if (SA2->getAPIntValue().ult(BitWidth)) {
- unsigned C1 = SA2->getZExtValue();
- unsigned Opc = ISD::SRL;
- int Diff = ShAmt - C1;
- if (Diff < 0) {
- Diff = -Diff;
- Opc = ISD::SHL;
- }
-
- SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
- return TLO.CombineTo(
- Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
+ if (!DemandedBits.intersects(APInt::getHighBitsSet(BitWidth, ShAmt))) {
+ if (const APInt *SA2 =
+ TLO.DAG.getValidShiftAmountConstant(Op0, DemandedElts)) {
+ unsigned C1 = SA2->getZExtValue();
+ unsigned Opc = ISD::SRL;
+ int Diff = ShAmt - C1;
+ if (Diff < 0) {
+ Diff = -Diff;
+ Opc = ISD::SHL;
}
+ SDValue NewSA = TLO.DAG.getConstant(Diff, dl, ShiftVT);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(Opc, dl, VT, Op0.getOperand(0), NewSA));
}
}
}
+ APInt InDemandedMask = (DemandedBits << ShAmt);
+
+ // If the shift is exact, then it does demand the low bits (and knows that
+ // they are zero).
+ if (Op->getFlags().hasExact())
+ InDemandedMask.setLowBits(ShAmt);
+
// Compute the new bits that are at the top now.
if (SimplifyDemandedBits(Op0, InDemandedMask, DemandedElts, Known, TLO,
Depth + 1))
@@ -1468,14 +1581,22 @@ bool TargetLowering::SimplifyDemandedBits(
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
Known.Zero.lshrInPlace(ShAmt);
Known.One.lshrInPlace(ShAmt);
-
- Known.Zero.setHighBits(ShAmt); // High bits known zero.
+ // High bits known zero.
+ Known.Zero.setHighBits(ShAmt);
}
break;
}
case ISD::SRA: {
SDValue Op0 = Op.getOperand(0);
SDValue Op1 = Op.getOperand(1);
+ EVT ShiftVT = Op1.getValueType();
+
+ // If we only want bits that already match the sign bit then we don't need
+ // to shift.
+ unsigned NumHiDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1) >=
+ NumHiDemandedBits)
+ return TLO.CombineTo(Op, Op0);
// If this is an arithmetic shift right and only the low-bit is set, we can
// always convert this into a logical shr, even if the shift amount is
@@ -1484,11 +1605,8 @@ bool TargetLowering::SimplifyDemandedBits(
if (DemandedBits.isOneValue())
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, Op1));
- if (ConstantSDNode *SA = isConstOrConstSplat(Op1, DemandedElts)) {
- // If the shift count is an invalid immediate, don't do anything.
- if (SA->getAPIntValue().uge(BitWidth))
- break;
-
+ if (const APInt *SA =
+ TLO.DAG.getValidShiftAmountConstant(Op, DemandedElts)) {
unsigned ShAmt = SA->getZExtValue();
if (ShAmt == 0)
return TLO.CombineTo(Op, Op0);
@@ -1525,14 +1643,23 @@ bool TargetLowering::SimplifyDemandedBits(
int Log2 = DemandedBits.exactLogBase2();
if (Log2 >= 0) {
// The bit must come from the sign.
- SDValue NewSA =
- TLO.DAG.getConstant(BitWidth - 1 - Log2, dl, Op1.getValueType());
+ SDValue NewSA = TLO.DAG.getConstant(BitWidth - 1 - Log2, dl, ShiftVT);
return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::SRL, dl, VT, Op0, NewSA));
}
if (Known.One[BitWidth - ShAmt - 1])
// New bits are known one.
Known.One.setHighBits(ShAmt);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!InDemandedMask.isAllOnesValue() || !DemandedElts.isAllOnesValue()) {
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op0, InDemandedMask, DemandedElts, TLO.DAG, Depth + 1);
+ if (DemandedOp0) {
+ SDValue NewOp = TLO.DAG.getNode(ISD::SRA, dl, VT, DemandedOp0, Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
}
break;
}
@@ -1573,6 +1700,32 @@ bool TargetLowering::SimplifyDemandedBits(
Known.One |= Known2.One;
Known.Zero |= Known2.Zero;
}
+
+ // For pow-2 bitwidths we only demand the low log2(BitWidth) bits of the amount.
+ if (isPowerOf2_32(BitWidth)) {
+ APInt DemandedAmtBits(Op2.getScalarValueSizeInBits(), BitWidth - 1);
+ if (SimplifyDemandedBits(Op2, DemandedAmtBits, DemandedElts,
+ Known2, TLO, Depth + 1))
+ return true;
+ }
+ break;
+ }
+ case ISD::ROTL:
+ case ISD::ROTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // If we're rotating a 0/-1 value, then it stays a 0/-1 value.
+ if (BitWidth == TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1))
+ return TLO.CombineTo(Op, Op0);
+
+ // For pow-2 bitwidths we only demand the low log2(BitWidth) bits of the amount.
+ if (isPowerOf2_32(BitWidth)) {
+ APInt DemandedAmtBits(Op1.getScalarValueSizeInBits(), BitWidth - 1);
+ if (SimplifyDemandedBits(Op1, DemandedAmtBits, DemandedElts, Known2, TLO,
+ Depth + 1))
+ return true;
+ }
break;
}
case ISD::BITREVERSE: {
@@ -1602,7 +1755,8 @@ bool TargetLowering::SimplifyDemandedBits(
// If we only care about the highest bit, don't bother shifting right.
if (DemandedBits.isSignMask()) {
- unsigned NumSignBits = TLO.DAG.ComputeNumSignBits(Op0);
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
bool AlreadySignExtended = NumSignBits >= BitWidth - ExVTBits + 1;
// However if the input is already sign extended we expect the sign
// extension to be dropped altogether later and do not simplify.
@@ -1639,8 +1793,7 @@ bool TargetLowering::SimplifyDemandedBits(
// If the input sign bit is known zero, convert this into a zero extension.
if (Known.Zero[ExVTBits - 1])
- return TLO.CombineTo(
- Op, TLO.DAG.getZeroExtendInReg(Op0, dl, ExVT.getScalarType()));
+ return TLO.CombineTo(Op, TLO.DAG.getZeroExtendInReg(Op0, dl, ExVT));
APInt Mask = APInt::getLowBitsSet(BitWidth, ExVTBits);
if (Known.One[ExVTBits - 1]) { // Input sign bit known set
@@ -1704,7 +1857,7 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
- Known = Known.zext(BitWidth, true /* ExtendedBitsAreKnownZero */);
+ Known = Known.zext(BitWidth);
break;
}
case ISD::SIGN_EXTEND:
@@ -1777,7 +1930,12 @@ bool TargetLowering::SimplifyDemandedBits(
return true;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
assert(Known.getBitWidth() == InBits && "Src width has changed?");
- Known = Known.zext(BitWidth, false /* => any extend */);
+ Known = Known.anyext(BitWidth);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, InDemandedBits, InDemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Op.getOpcode(), dl, VT, NewSrc));
break;
}
case ISD::TRUNCATE: {
@@ -1886,7 +2044,7 @@ bool TargetLowering::SimplifyDemandedBits(
Known = Known2;
if (BitWidth > EltBitWidth)
- Known = Known.zext(BitWidth, false /* => any extend */);
+ Known = Known.anyext(BitWidth);
break;
}
case ISD::BITCAST: {
@@ -2151,14 +2309,20 @@ bool TargetLowering::SimplifyDemandedVectorElts(
APInt &KnownZero, TargetLoweringOpt &TLO, unsigned Depth,
bool AssumeSingleUse) const {
EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
APInt DemandedElts = OriginalDemandedElts;
unsigned NumElts = DemandedElts.getBitWidth();
assert(VT.isVector() && "Expected vector op");
- assert(VT.getVectorNumElements() == NumElts &&
- "Mask size mismatches value type element count!");
KnownUndef = KnownZero = APInt::getNullValue(NumElts);
+ // TODO: For now we assume we know nothing about scalable vectors.
+ if (VT.isScalableVector())
+ return false;
+
+ assert(VT.getVectorNumElements() == NumElts &&
+ "Mask size mismatches value type element count!");
+
// Undef operand.
if (Op.isUndef()) {
KnownUndef.setAllBits();
@@ -2182,7 +2346,22 @@ bool TargetLowering::SimplifyDemandedVectorElts(
SDLoc DL(Op);
unsigned EltSizeInBits = VT.getScalarSizeInBits();
- switch (Op.getOpcode()) {
+ // Helper for demanding the specified elements and all the bits of both binary
+ // operands.
+ auto SimplifyDemandedVectorEltsBinOp = [&](SDValue Op0, SDValue Op1) {
+ SDValue NewOp0 = SimplifyMultipleUseDemandedVectorElts(Op0, DemandedElts,
+ TLO.DAG, Depth + 1);
+ SDValue NewOp1 = SimplifyMultipleUseDemandedVectorElts(Op1, DemandedElts,
+ TLO.DAG, Depth + 1);
+ if (NewOp0 || NewOp1) {
+ SDValue NewOp = TLO.DAG.getNode(
+ Opcode, SDLoc(Op), VT, NewOp0 ? NewOp0 : Op0, NewOp1 ? NewOp1 : Op1);
+ return TLO.CombineTo(Op, NewOp);
+ }
+ return false;
+ };
+
+ switch (Opcode) {
case ISD::SCALAR_TO_VECTOR: {
if (!DemandedElts[0]) {
KnownUndef.setAllBits();
@@ -2234,7 +2413,8 @@ bool TargetLowering::SimplifyDemandedVectorElts(
}
KnownBits Known;
- if (SimplifyDemandedBits(Src, SrcDemandedBits, Known, TLO, Depth + 1))
+ if (SimplifyDemandedBits(Src, SrcDemandedBits, SrcDemandedElts, Known,
+ TLO, Depth + 1))
return true;
}
@@ -2323,53 +2503,75 @@ bool TargetLowering::SimplifyDemandedVectorElts(
break;
}
case ISD::INSERT_SUBVECTOR: {
- if (!isa<ConstantSDNode>(Op.getOperand(2)))
- break;
- SDValue Base = Op.getOperand(0);
+ // Demand any elements from the subvector and the remainder from the src it's
+ // inserted into.
+ SDValue Src = Op.getOperand(0);
SDValue Sub = Op.getOperand(1);
- EVT SubVT = Sub.getValueType();
- unsigned NumSubElts = SubVT.getVectorNumElements();
- const APInt &Idx = Op.getConstantOperandAPInt(2);
- if (Idx.ugt(NumElts - NumSubElts))
- break;
- unsigned SubIdx = Idx.getZExtValue();
- APInt SubElts = DemandedElts.extractBits(NumSubElts, SubIdx);
+ uint64_t Idx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
+ APInt DemandedSubElts = DemandedElts.extractBits(NumSubElts, Idx);
+ APInt DemandedSrcElts = DemandedElts;
+ DemandedSrcElts.insertBits(APInt::getNullValue(NumSubElts), Idx);
+
APInt SubUndef, SubZero;
- if (SimplifyDemandedVectorElts(Sub, SubElts, SubUndef, SubZero, TLO,
+ if (SimplifyDemandedVectorElts(Sub, DemandedSubElts, SubUndef, SubZero, TLO,
Depth + 1))
return true;
- APInt BaseElts = DemandedElts;
- BaseElts.insertBits(APInt::getNullValue(NumSubElts), SubIdx);
-
- // If none of the base operand elements are demanded, replace it with undef.
- if (!BaseElts && !Base.isUndef())
- return TLO.CombineTo(Op,
- TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
- TLO.DAG.getUNDEF(VT),
- Op.getOperand(1),
- Op.getOperand(2)));
-
- if (SimplifyDemandedVectorElts(Base, BaseElts, KnownUndef, KnownZero, TLO,
- Depth + 1))
+
+ // If none of the src operand elements are demanded, replace it with undef.
+ if (!DemandedSrcElts && !Src.isUndef())
+ return TLO.CombineTo(Op, TLO.DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ TLO.DAG.getUNDEF(VT), Sub,
+ Op.getOperand(2)));
+
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, KnownUndef, KnownZero,
+ TLO, Depth + 1))
return true;
- KnownUndef.insertBits(SubUndef, SubIdx);
- KnownZero.insertBits(SubZero, SubIdx);
+ KnownUndef.insertBits(SubUndef, Idx);
+ KnownZero.insertBits(SubZero, Idx);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedSrcElts.isAllOnesValue() ||
+ !DemandedSubElts.isAllOnesValue()) {
+ SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedSrcElts, TLO.DAG, Depth + 1);
+ SDValue NewSub = SimplifyMultipleUseDemandedVectorElts(
+ Sub, DemandedSubElts, TLO.DAG, Depth + 1);
+ if (NewSrc || NewSub) {
+ NewSrc = NewSrc ? NewSrc : Src;
+ NewSub = NewSub ? NewSub : Sub;
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc,
+ NewSub, Op.getOperand(2));
+ return TLO.CombineTo(Op, NewOp);
+ }
+ }
break;
}
case ISD::EXTRACT_SUBVECTOR: {
+ // Offset the demanded elts by the subvector index.
SDValue Src = Op.getOperand(0);
- ConstantSDNode *SubIdx = dyn_cast<ConstantSDNode>(Op.getOperand(1));
+ if (Src.getValueType().isScalableVector())
+ break;
+ uint64_t Idx = Op.getConstantOperandVal(1);
unsigned NumSrcElts = Src.getValueType().getVectorNumElements();
- if (SubIdx && SubIdx->getAPIntValue().ule(NumSrcElts - NumElts)) {
- // Offset the demanded elts by the subvector index.
- uint64_t Idx = SubIdx->getZExtValue();
- APInt SrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
- APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
- Depth + 1))
- return true;
- KnownUndef = SrcUndef.extractBits(NumElts, Idx);
- KnownZero = SrcZero.extractBits(NumElts, Idx);
+ APInt DemandedSrcElts = DemandedElts.zextOrSelf(NumSrcElts).shl(Idx);
+
+ APInt SrcUndef, SrcZero;
+ if (SimplifyDemandedVectorElts(Src, DemandedSrcElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
+ return true;
+ KnownUndef = SrcUndef.extractBits(NumElts, Idx);
+ KnownZero = SrcZero.extractBits(NumElts, Idx);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ if (!DemandedElts.isAllOnesValue()) {
+ SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, DemandedSrcElts, TLO.DAG, Depth + 1);
+ if (NewSrc) {
+ SDValue NewOp = TLO.DAG.getNode(Op.getOpcode(), SDLoc(Op), VT, NewSrc,
+ Op.getOperand(1));
+ return TLO.CombineTo(Op, NewOp);
+ }
}
break;
}
@@ -2538,7 +2740,7 @@ bool TargetLowering::SimplifyDemandedVectorElts(
break;
}
- // TODO: There are more binop opcodes that could be handled here - MUL, MIN,
+ // TODO: There are more binop opcodes that could be handled here - MIN,
// MAX, saturated math, etc.
case ISD::OR:
case ISD::XOR:
@@ -2549,17 +2751,26 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::FMUL:
case ISD::FDIV:
case ISD::FREM: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
APInt UndefRHS, ZeroRHS;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
- ZeroRHS, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
+ Depth + 1))
return true;
APInt UndefLHS, ZeroLHS;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
- ZeroLHS, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+ Depth + 1))
return true;
KnownZero = ZeroLHS & ZeroRHS;
KnownUndef = getKnownUndefForVectorBinop(Op, TLO.DAG, UndefLHS, UndefRHS);
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ // TODO - use KnownUndef to relax the demandedelts?
+ if (!DemandedElts.isAllOnesValue())
+ if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+ return true;
break;
}
case ISD::SHL:
@@ -2567,27 +2778,39 @@ bool TargetLowering::SimplifyDemandedVectorElts(
case ISD::SRA:
case ISD::ROTL:
case ISD::ROTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
APInt UndefRHS, ZeroRHS;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, UndefRHS,
- ZeroRHS, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, UndefRHS, ZeroRHS, TLO,
+ Depth + 1))
return true;
APInt UndefLHS, ZeroLHS;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, UndefLHS,
- ZeroLHS, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, UndefLHS, ZeroLHS, TLO,
+ Depth + 1))
return true;
KnownZero = ZeroLHS;
KnownUndef = UndefLHS & UndefRHS; // TODO: use getKnownUndefForVectorBinop?
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ // TODO - use KnownUndef to relax the demandedelts?
+ if (!DemandedElts.isAllOnesValue())
+ if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+ return true;
break;
}
case ISD::MUL:
case ISD::AND: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
APInt SrcUndef, SrcZero;
- if (SimplifyDemandedVectorElts(Op.getOperand(1), DemandedElts, SrcUndef,
- SrcZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op1, DemandedElts, SrcUndef, SrcZero, TLO,
+ Depth + 1))
return true;
- if (SimplifyDemandedVectorElts(Op.getOperand(0), DemandedElts, KnownUndef,
- KnownZero, TLO, Depth + 1))
+ if (SimplifyDemandedVectorElts(Op0, DemandedElts, KnownUndef, KnownZero,
+ TLO, Depth + 1))
return true;
// If either side has a zero element, then the result element is zero, even
@@ -2597,6 +2820,12 @@ bool TargetLowering::SimplifyDemandedVectorElts(
KnownZero |= SrcZero;
KnownUndef &= SrcUndef;
KnownUndef &= ~KnownZero;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ // TODO - use KnownUndef to relax the demandedelts?
+ if (!DemandedElts.isAllOnesValue())
+ if (SimplifyDemandedVectorEltsBinOp(Op0, Op1))
+ return true;
break;
}
case ISD::TRUNCATE:
@@ -2661,17 +2890,16 @@ void TargetLowering::computeKnownBitsForTargetInstr(
Known.resetAll();
}
-void TargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
- KnownBits &Known,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth) const {
- assert(isa<FrameIndexSDNode>(Op) && "expected FrameIndex");
+void TargetLowering::computeKnownBitsForFrameIndex(
+ const int FrameIdx, KnownBits &Known, const MachineFunction &MF) const {
+ // The low bits are known zero if the pointer is aligned.
+ Known.Zero.setLowBits(Log2(MF.getFrameInfo().getObjectAlign(FrameIdx)));
+}
- if (unsigned Align = DAG.InferPtrAlignment(Op)) {
- // The low bits are known zero if the pointer is aligned.
- Known.Zero.setLowBits(Log2_32(Align));
- }
+Align TargetLowering::computeKnownAlignForTargetInstr(
+ GISelKnownBits &Analysis, Register R, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ return Align(1);
}
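The rewritten computeKnownBitsForFrameIndex relies on a single fact: an address aligned to 2^k has its low k bits equal to zero, so the known-zero mask of a frame index is just the log2 of the frame object's alignment. A standalone illustration in plain C++ (the alignment and address values are arbitrary examples, not from the patch):

    #include <cassert>
    #include <cstdint>

    // Number of low bits known to be zero for a pointer with the given
    // power-of-two alignment, i.e. log2 of the alignment.
    static unsigned knownZeroLowBits(uint64_t Align) {
      unsigned Log2 = 0;
      while ((1ull << (Log2 + 1)) <= Align)
        ++Log2;
      return Log2;
    }

    int main() {
      uint64_t Align = 16;        // a 16-byte-aligned frame object
      uint64_t Addr = 0x7ffd1230; // any address respecting that alignment
      assert(Addr % Align == 0);
      unsigned Zeros = knownZeroLowBits(Align);
      assert(Zeros == 4);
      assert((Addr & ((1ull << Zeros) - 1)) == 0); // the low bits really are zero
      return 0;
    }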
/// This method can be implemented by targets that want to expose additional
@@ -2689,6 +2917,12 @@ unsigned TargetLowering::ComputeNumSignBitsForTargetNode(SDValue Op,
return 1;
}
+unsigned TargetLowering::computeNumSignBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R, const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI, unsigned Depth) const {
+ return 1;
+}
+
bool TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
@@ -3788,33 +4022,18 @@ SDValue TargetLowering::SimplifySetCC(EVT VT, SDValue N0, SDValue N1,
// the comparison operands is infinity or negative infinity, convert the
// condition to a less-awkward <= or >=.
if (CFP->getValueAPF().isInfinity()) {
- if (CFP->getValueAPF().isNegative()) {
- if (Cond == ISD::SETOEQ &&
- isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLE);
- if (Cond == ISD::SETUEQ &&
- isCondCodeLegal(ISD::SETOLE, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULE);
- if (Cond == ISD::SETUNE &&
- isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGT);
- if (Cond == ISD::SETONE &&
- isCondCodeLegal(ISD::SETUGT, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGT);
- } else {
- if (Cond == ISD::SETOEQ &&
- isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOGE);
- if (Cond == ISD::SETUEQ &&
- isCondCodeLegal(ISD::SETOGE, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETUGE);
- if (Cond == ISD::SETUNE &&
- isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETULT);
- if (Cond == ISD::SETONE &&
- isCondCodeLegal(ISD::SETULT, N0.getSimpleValueType()))
- return DAG.getSetCC(dl, VT, N0, N1, ISD::SETOLT);
+ bool IsNegInf = CFP->getValueAPF().isNegative();
+ ISD::CondCode NewCond = ISD::SETCC_INVALID;
+ switch (Cond) {
+ case ISD::SETOEQ: NewCond = IsNegInf ? ISD::SETOLE : ISD::SETOGE; break;
+ case ISD::SETUEQ: NewCond = IsNegInf ? ISD::SETULE : ISD::SETUGE; break;
+ case ISD::SETUNE: NewCond = IsNegInf ? ISD::SETUGT : ISD::SETULT; break;
+ case ISD::SETONE: NewCond = IsNegInf ? ISD::SETOGT : ISD::SETOLT; break;
+ default: break;
}
+ if (NewCond != ISD::SETCC_INVALID &&
+ isCondCodeLegal(NewCond, N0.getSimpleValueType()))
+ return DAG.getSetCC(dl, VT, N0, N1, NewCond);
}
}
}
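The refactor above collapses eight nearly identical if-chains into one table keyed on the condition code and the sign of the infinity. A compact standalone sketch of the same mapping, using a local enum in place of ISD::CondCode (all names here are local to the sketch, not LLVM's):

    #include <cassert>

    enum Cond { OEQ, UEQ, UNE, ONE, OLE, OGE, ULE, UGE, UGT, ULT, OGT, OLT, INVALID };

    // For a compare against +/-infinity, rewrite (in)equality as an ordering
    // that is often cheaper to select, e.g. (x == -inf) becomes (x <= -inf).
    static Cond relaxInfCompare(Cond C, bool IsNegInf) {
      switch (C) {
      case OEQ: return IsNegInf ? OLE : OGE;
      case UEQ: return IsNegInf ? ULE : UGE;
      case UNE: return IsNegInf ? UGT : ULT;
      case ONE: return IsNegInf ? OGT : OLT;
      default:  return INVALID;
      }
    }

    int main() {
      assert(relaxInfCompare(OEQ, /*IsNegInf=*/true) == OLE);
      assert(relaxInfCompare(UNE, /*IsNegInf=*/false) == ULT);
      assert(relaxInfCompare(OLT, /*IsNegInf=*/true) == INVALID);
      return 0;
    }

As in the patched code, the rewritten condition is only used when the target reports it as legal for the operand type.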
@@ -4245,10 +4464,10 @@ unsigned TargetLowering::AsmOperandInfo::getMatchedOperand() const {
TargetLowering::AsmOperandInfoVector
TargetLowering::ParseConstraints(const DataLayout &DL,
const TargetRegisterInfo *TRI,
- ImmutableCallSite CS) const {
+ const CallBase &Call) const {
/// Information about all of the constraints.
AsmOperandInfoVector ConstraintOperands;
- const InlineAsm *IA = cast<InlineAsm>(CS.getCalledValue());
+ const InlineAsm *IA = cast<InlineAsm>(Call.getCalledOperand());
unsigned maCount = 0; // Largest number of multiple alternative constraints.
// Do a prepass over the constraints, canonicalizing them, and building up the
@@ -4271,25 +4490,24 @@ TargetLowering::ParseConstraints(const DataLayout &DL,
case InlineAsm::isOutput:
// Indirect outputs just consume an argument.
if (OpInfo.isIndirect) {
- OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
break;
}
// The return value of the call is this value. As such, there is no
// corresponding argument.
- assert(!CS.getType()->isVoidTy() &&
- "Bad inline asm!");
- if (StructType *STy = dyn_cast<StructType>(CS.getType())) {
+ assert(!Call.getType()->isVoidTy() && "Bad inline asm!");
+ if (StructType *STy = dyn_cast<StructType>(Call.getType())) {
OpInfo.ConstraintVT =
getSimpleValueType(DL, STy->getElementType(ResNo));
} else {
assert(ResNo == 0 && "Asm only has one result!");
- OpInfo.ConstraintVT = getSimpleValueType(DL, CS.getType());
+ OpInfo.ConstraintVT = getSimpleValueType(DL, Call.getType());
}
++ResNo;
break;
case InlineAsm::isInput:
- OpInfo.CallOperandVal = const_cast<Value *>(CS.getArgument(ArgNo++));
+ OpInfo.CallOperandVal = Call.getArgOperand(ArgNo++);
break;
case InlineAsm::isClobber:
// Nothing to do.
@@ -5479,251 +5697,221 @@ verifyReturnAddressArgumentIsConstant(SDValue Op, SelectionDAG &DAG) const {
return false;
}
-char TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations, bool ForCodeSize,
- unsigned Depth) const {
+SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOps, bool OptForSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const {
// fneg is removable even if it has multiple uses.
- if (Op.getOpcode() == ISD::FNEG)
- return 2;
+ if (Op.getOpcode() == ISD::FNEG) {
+ Cost = NegatibleCost::Cheaper;
+ return Op.getOperand(0);
+ }
- // Don't allow anything with multiple uses unless we know it is free.
- EVT VT = Op.getValueType();
+ // Don't recurse exponentially.
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ // Pre-increment recursion depth for use in recursive calls.
+ ++Depth;
const SDNodeFlags Flags = Op->getFlags();
const TargetOptions &Options = DAG.getTarget().Options;
- if (!Op.hasOneUse() && !(Op.getOpcode() == ISD::FP_EXTEND &&
- isFPExtFree(VT, Op.getOperand(0).getValueType())))
- return 0;
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
- // Don't recurse exponentially.
- if (Depth > SelectionDAG::MaxRecursionDepth)
- return 0;
+ // Don't allow anything with multiple uses unless we know it is free.
+ if (!Op.hasOneUse() && Opcode != ISD::ConstantFP) {
+ bool IsFreeExtend = Opcode == ISD::FP_EXTEND &&
+ isFPExtFree(VT, Op.getOperand(0).getValueType());
+ if (!IsFreeExtend)
+ return SDValue();
+ }
- switch (Op.getOpcode()) {
- case ISD::ConstantFP: {
- if (!LegalOperations)
- return 1;
+ SDLoc DL(Op);
+ switch (Opcode) {
+ case ISD::ConstantFP: {
// Don't invert constant FP values after legalization unless the target says
// the negated constant is legal.
- return isOperationLegal(ISD::ConstantFP, VT) ||
- isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT,
- ForCodeSize);
+ bool IsOpLegal =
+ isOperationLegal(ISD::ConstantFP, VT) ||
+ isFPImmLegal(neg(cast<ConstantFPSDNode>(Op)->getValueAPF()), VT,
+ OptForSize);
+
+ if (LegalOps && !IsOpLegal)
+ break;
+
+ APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
+ V.changeSign();
+ SDValue CFP = DAG.getConstantFP(V, DL, VT);
+
+ // If the negated floating-point constant already has uses, it is free to
+ // negate it even if it has multiple uses.
+ if (!Op.hasOneUse() && CFP.use_empty())
+ break;
+ Cost = NegatibleCost::Neutral;
+ return CFP;
}
case ISD::BUILD_VECTOR: {
// Only permit BUILD_VECTOR of constants.
if (llvm::any_of(Op->op_values(), [&](SDValue N) {
return !N.isUndef() && !isa<ConstantFPSDNode>(N);
}))
- return 0;
- if (!LegalOperations)
- return 1;
- if (isOperationLegal(ISD::ConstantFP, VT) &&
- isOperationLegal(ISD::BUILD_VECTOR, VT))
- return 1;
- return llvm::all_of(Op->op_values(), [&](SDValue N) {
- return N.isUndef() ||
- isFPImmLegal(neg(cast<ConstantFPSDNode>(N)->getValueAPF()), VT,
- ForCodeSize);
- });
+ break;
+
+ bool IsOpLegal =
+ (isOperationLegal(ISD::ConstantFP, VT) &&
+ isOperationLegal(ISD::BUILD_VECTOR, VT)) ||
+ llvm::all_of(Op->op_values(), [&](SDValue N) {
+ return N.isUndef() ||
+ isFPImmLegal(neg(cast<ConstantFPSDNode>(N)->getValueAPF()), VT,
+ OptForSize);
+ });
+
+ if (LegalOps && !IsOpLegal)
+ break;
+
+ SmallVector<SDValue, 4> Ops;
+ for (SDValue C : Op->op_values()) {
+ if (C.isUndef()) {
+ Ops.push_back(C);
+ continue;
+ }
+ APFloat V = cast<ConstantFPSDNode>(C)->getValueAPF();
+ V.changeSign();
+ Ops.push_back(DAG.getConstantFP(V, DL, C.getValueType()));
+ }
+ Cost = NegatibleCost::Neutral;
+ return DAG.getBuildVector(VT, DL, Ops);
}
- case ISD::FADD:
+ case ISD::FADD: {
if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
- return 0;
+ break;
// After operation legalization, it might not be legal to create new FSUBs.
- if (LegalOperations && !isOperationLegalOrCustom(ISD::FSUB, VT))
- return 0;
+ if (LegalOps && !isOperationLegalOrCustom(ISD::FSUB, VT))
+ break;
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
- // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
- if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
- ForCodeSize, Depth + 1))
- return V;
- // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
- return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- case ISD::FSUB:
+ // fold (fneg (fadd X, Y)) -> (fsub (fneg X), Y)
+ NegatibleCost CostX = NegatibleCost::Expensive;
+ SDValue NegX =
+ getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+ // fold (fneg (fadd X, Y)) -> (fsub (fneg Y), X)
+ NegatibleCost CostY = NegatibleCost::Expensive;
+ SDValue NegY =
+ getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
+
+ // Negate X if its cost is less than or equal to that of Y.
+ if (NegX && (CostX <= CostY)) {
+ Cost = CostX;
+ return DAG.getNode(ISD::FSUB, DL, VT, NegX, Y, Flags);
+ }
+
+ // Negate Y if it is not expensive.
+ if (NegY) {
+ Cost = CostY;
+ return DAG.getNode(ISD::FSUB, DL, VT, NegY, X, Flags);
+ }
+ break;
+ }
+ case ISD::FSUB: {
// We can't turn -(A-B) into B-A when we honor signed zeros.
if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
- return 0;
+ break;
- // fold (fneg (fsub A, B)) -> (fsub B, A)
- return 1;
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+ // fold (fneg (fsub 0, Y)) -> Y
+ if (ConstantFPSDNode *C = isConstOrConstSplatFP(X, /*AllowUndefs*/ true))
+ if (C->isZero()) {
+ Cost = NegatibleCost::Cheaper;
+ return Y;
+ }
+ // fold (fneg (fsub X, Y)) -> (fsub Y, X)
+ Cost = NegatibleCost::Neutral;
+ return DAG.getNode(ISD::FSUB, DL, VT, Y, X, Flags);
+ }
case ISD::FMUL:
- case ISD::FDIV:
- // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y) or (fmul X, (fneg Y))
- if (char V = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
- ForCodeSize, Depth + 1))
- return V;
+ case ISD::FDIV: {
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1);
+
+ // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
+ NegatibleCost CostX = NegatibleCost::Expensive;
+ SDValue NegX =
+ getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
+ // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
+ NegatibleCost CostY = NegatibleCost::Expensive;
+ SDValue NegY =
+ getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
+
+ // Negate X if its cost is less than or equal to that of Y.
+ if (NegX && (CostX <= CostY)) {
+ Cost = CostX;
+ return DAG.getNode(Opcode, DL, VT, NegX, Y, Flags);
+ }
// Ignore X * 2.0 because that is expected to be canonicalized to X + X.
if (auto *C = isConstOrConstSplatFP(Op.getOperand(1)))
if (C->isExactlyValue(2.0) && Op.getOpcode() == ISD::FMUL)
- return 0;
-
- return isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
+ break;
+ // Negate Y if it is not expensive.
+ if (NegY) {
+ Cost = CostY;
+ return DAG.getNode(Opcode, DL, VT, X, NegY, Flags);
+ }
+ break;
+ }
case ISD::FMA:
case ISD::FMAD: {
if (!Options.NoSignedZerosFPMath && !Flags.hasNoSignedZeros())
- return 0;
+ break;
+
+ SDValue X = Op.getOperand(0), Y = Op.getOperand(1), Z = Op.getOperand(2);
+ NegatibleCost CostZ = NegatibleCost::Expensive;
+ SDValue NegZ =
+ getNegatedExpression(Z, DAG, LegalOps, OptForSize, CostZ, Depth);
+ // Give up if we fail to negate Z.
+ if (!NegZ)
+ break;
// fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
+ NegatibleCost CostX = NegatibleCost::Expensive;
+ SDValue NegX =
+ getNegatedExpression(X, DAG, LegalOps, OptForSize, CostX, Depth);
// fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z))
- char V2 = isNegatibleForFree(Op.getOperand(2), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (!V2)
- return 0;
-
- // One of Op0/Op1 must be cheaply negatible, then select the cheapest.
- char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- char V01 = std::max(V0, V1);
- return V01 ? std::max(V01, V2) : 0;
- }
-
- case ISD::FP_EXTEND:
- case ISD::FP_ROUND:
- case ISD::FSIN:
- return isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- }
-
- return 0;
-}
-
-SDValue TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations,
- bool ForCodeSize,
- unsigned Depth) const {
- // fneg is removable even if it has multiple uses.
- if (Op.getOpcode() == ISD::FNEG)
- return Op.getOperand(0);
+ NegatibleCost CostY = NegatibleCost::Expensive;
+ SDValue NegY =
+ getNegatedExpression(Y, DAG, LegalOps, OptForSize, CostY, Depth);
- assert(Depth <= SelectionDAG::MaxRecursionDepth &&
- "getNegatedExpression doesn't match isNegatibleForFree");
- const SDNodeFlags Flags = Op->getFlags();
-
- switch (Op.getOpcode()) {
- case ISD::ConstantFP: {
- APFloat V = cast<ConstantFPSDNode>(Op)->getValueAPF();
- V.changeSign();
- return DAG.getConstantFP(V, SDLoc(Op), Op.getValueType());
- }
- case ISD::BUILD_VECTOR: {
- SmallVector<SDValue, 4> Ops;
- for (SDValue C : Op->op_values()) {
- if (C.isUndef()) {
- Ops.push_back(C);
- continue;
- }
- APFloat V = cast<ConstantFPSDNode>(C)->getValueAPF();
- V.changeSign();
- Ops.push_back(DAG.getConstantFP(V, SDLoc(Op), C.getValueType()));
+ // Negate X if its cost is less than or equal to that of Y.
+ if (NegX && (CostX <= CostY)) {
+ Cost = std::min(CostX, CostZ);
+ return DAG.getNode(Opcode, DL, VT, NegX, Y, NegZ, Flags);
}
- return DAG.getBuildVector(Op.getValueType(), SDLoc(Op), Ops);
- }
- case ISD::FADD:
- assert((DAG.getTarget().Options.NoSignedZerosFPMath ||
- Flags.hasNoSignedZeros()) &&
- "Expected NSZ fp-flag");
-
- // fold (fneg (fadd A, B)) -> (fsub (fneg A), B)
- if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize,
- Depth + 1))
- return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
- getNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(1), Flags);
- // fold (fneg (fadd A, B)) -> (fsub (fneg B), A)
- return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
- getNegatedExpression(Op.getOperand(1), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(0), Flags);
- case ISD::FSUB:
- // fold (fneg (fsub 0, B)) -> B
- if (ConstantFPSDNode *N0CFP =
- isConstOrConstSplatFP(Op.getOperand(0), /*AllowUndefs*/ true))
- if (N0CFP->isZero())
- return Op.getOperand(1);
-
- // fold (fneg (fsub A, B)) -> (fsub B, A)
- return DAG.getNode(ISD::FSUB, SDLoc(Op), Op.getValueType(),
- Op.getOperand(1), Op.getOperand(0), Flags);
-
- case ISD::FMUL:
- case ISD::FDIV:
- // fold (fneg (fmul X, Y)) -> (fmul (fneg X), Y)
- if (isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations, ForCodeSize,
- Depth + 1))
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- getNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(1), Flags);
-
- // fold (fneg (fmul X, Y)) -> (fmul X, (fneg Y))
- return DAG.getNode(
- Op.getOpcode(), SDLoc(Op), Op.getValueType(), Op.getOperand(0),
- getNegatedExpression(Op.getOperand(1), DAG, LegalOperations,
- ForCodeSize, Depth + 1),
- Flags);
- case ISD::FMA:
- case ISD::FMAD: {
- assert((DAG.getTarget().Options.NoSignedZerosFPMath ||
- Flags.hasNoSignedZeros()) &&
- "Expected NSZ fp-flag");
-
- SDValue Neg2 = getNegatedExpression(Op.getOperand(2), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
-
- char V0 = isNegatibleForFree(Op.getOperand(0), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- char V1 = isNegatibleForFree(Op.getOperand(1), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- // TODO: This is a hack. It is possible that costs have changed between now
- // and the initial calls to isNegatibleForFree(). That is because we
- // are rewriting the expression, and that may change the number of
- // uses (and therefore the cost) of values. If the negation costs are
- // equal, only negate this value if it is a constant. Otherwise, try
- // operand 1. A better fix would eliminate uses as a cost factor or
- // track the change in uses as we rewrite the expression.
- if (V0 > V1 || (V0 == V1 && isa<ConstantFPSDNode>(Op.getOperand(0)))) {
- // fold (fneg (fma X, Y, Z)) -> (fma (fneg X), Y, (fneg Z))
- SDValue Neg0 = getNegatedExpression(
- Op.getOperand(0), DAG, LegalOperations, ForCodeSize, Depth + 1);
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(), Neg0,
- Op.getOperand(1), Neg2, Flags);
+ // Negate Y if it is not expensive.
+ if (NegY) {
+ Cost = std::min(CostY, CostZ);
+ return DAG.getNode(Opcode, DL, VT, X, NegY, NegZ, Flags);
}
-
- // fold (fneg (fma X, Y, Z)) -> (fma X, (fneg Y), (fneg Z))
- SDValue Neg1 = getNegatedExpression(Op.getOperand(1), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- Op.getOperand(0), Neg1, Neg2, Flags);
+ break;
}
case ISD::FP_EXTEND:
case ISD::FSIN:
- return DAG.getNode(Op.getOpcode(), SDLoc(Op), Op.getValueType(),
- getNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1));
+ if (SDValue NegV = getNegatedExpression(Op.getOperand(0), DAG, LegalOps,
+ OptForSize, Cost, Depth))
+ return DAG.getNode(Opcode, DL, VT, NegV);
+ break;
case ISD::FP_ROUND:
- return DAG.getNode(ISD::FP_ROUND, SDLoc(Op), Op.getValueType(),
- getNegatedExpression(Op.getOperand(0), DAG,
- LegalOperations, ForCodeSize,
- Depth + 1),
- Op.getOperand(1));
+ if (SDValue NegV = getNegatedExpression(Op.getOperand(0), DAG, LegalOps,
+ OptForSize, Cost, Depth))
+ return DAG.getNode(ISD::FP_ROUND, DL, VT, NegV, Op.getOperand(1));
+ break;
}
- llvm_unreachable("Unknown code");
+ return SDValue();
}
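For reference, the rewritten getNegatedExpression chooses which operand to negate by comparing negation costs. Below is a minimal scalar sketch of that selection for the FADD case; it is not part of the diff, and the enum and doubles merely stand in for the real SDValue machinery:

enum class NegatibleCost { Cheaper, Neutral, Expensive };

// fneg(X + Y) == (-X) - Y == (-Y) - X; negate whichever operand is cheaper.
double negatedFAdd(double X, double Y, NegatibleCost CostX, NegatibleCost CostY) {
  if (CostX <= CostY)
    return -X - Y; // (fsub (fneg X), Y)
  return -Y - X;   // (fsub (fneg Y), X)
}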
//===----------------------------------------------------------------------===//
@@ -5929,6 +6117,14 @@ bool TargetLowering::expandMUL(SDNode *N, SDValue &Lo, SDValue &Hi, EVT HiLoVT,
return Ok;
}
+// Check that (every element of) Z is undef or not an exact multiple of BW.
+static bool isNonZeroModBitWidth(SDValue Z, unsigned BW) {
+ return ISD::matchUnaryPredicate(
+ Z,
+ [=](ConstantSDNode *C) { return !C || C->getAPIntValue().urem(BW) != 0; },
+ true);
+}
+
bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
SelectionDAG &DAG) const {
EVT VT = Node->getValueType(0);
@@ -5939,41 +6135,54 @@ bool TargetLowering::expandFunnelShift(SDNode *Node, SDValue &Result,
!isOperationLegalOrCustomOrPromote(ISD::OR, VT)))
return false;
- // fshl: (X << (Z % BW)) | (Y >> (BW - (Z % BW)))
- // fshr: (X << (BW - (Z % BW))) | (Y >> (Z % BW))
SDValue X = Node->getOperand(0);
SDValue Y = Node->getOperand(1);
SDValue Z = Node->getOperand(2);
- unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned BW = VT.getScalarSizeInBits();
bool IsFSHL = Node->getOpcode() == ISD::FSHL;
SDLoc DL(SDValue(Node, 0));
EVT ShVT = Z.getValueType();
- SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
- SDValue Zero = DAG.getConstant(0, DL, ShVT);
- SDValue ShAmt;
- if (isPowerOf2_32(EltSizeInBits)) {
- SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
- ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
- } else {
+ SDValue ShX, ShY;
+ SDValue ShAmt, InvShAmt;
+ if (isNonZeroModBitWidth(Z, BW)) {
+ // fshl: X << C | Y >> (BW - C)
+ // fshr: X << (BW - C) | Y >> C
+ // where C = Z % BW is not zero
+ SDValue BitWidthC = DAG.getConstant(BW, DL, ShVT);
ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
- }
-
- SDValue InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
- SDValue ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
- SDValue ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
- SDValue Or = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
-
- // If (Z % BW == 0), then the opposite direction shift is shift-by-bitwidth,
- // and that is undefined. We must compare and select to avoid UB.
- EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), ShVT);
+ InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, ShAmt);
+ ShX = DAG.getNode(ISD::SHL, DL, VT, X, IsFSHL ? ShAmt : InvShAmt);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, Y, IsFSHL ? InvShAmt : ShAmt);
+ } else {
+ // fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW))
+ // fshr: X << 1 << (BW - 1 - (Z % BW)) | Y >> (Z % BW)
+ SDValue Mask = DAG.getConstant(BW - 1, DL, ShVT);
+ if (isPowerOf2_32(BW)) {
+ // Z % BW -> Z & (BW - 1)
+ ShAmt = DAG.getNode(ISD::AND, DL, ShVT, Z, Mask);
+ // (BW - 1) - (Z % BW) -> ~Z & (BW - 1)
+ InvShAmt = DAG.getNode(ISD::AND, DL, ShVT, DAG.getNOT(DL, Z, ShVT), Mask);
+ } else {
+ SDValue BitWidthC = DAG.getConstant(BW, DL, ShVT);
+ ShAmt = DAG.getNode(ISD::UREM, DL, ShVT, Z, BitWidthC);
+ InvShAmt = DAG.getNode(ISD::SUB, DL, ShVT, Mask, ShAmt);
+ }
- // For fshl, 0-shift returns the 1st arg (X).
- // For fshr, 0-shift returns the 2nd arg (Y).
- SDValue IsZeroShift = DAG.getSetCC(DL, CCVT, ShAmt, Zero, ISD::SETEQ);
- Result = DAG.getSelect(DL, VT, IsZeroShift, IsFSHL ? X : Y, Or);
+ SDValue One = DAG.getConstant(1, DL, ShVT);
+ if (IsFSHL) {
+ ShX = DAG.getNode(ISD::SHL, DL, VT, X, ShAmt);
+ SDValue ShY1 = DAG.getNode(ISD::SRL, DL, VT, Y, One);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, ShY1, InvShAmt);
+ } else {
+ SDValue ShX1 = DAG.getNode(ISD::SHL, DL, VT, X, One);
+ ShX = DAG.getNode(ISD::SHL, DL, VT, ShX1, InvShAmt);
+ ShY = DAG.getNode(ISD::SRL, DL, VT, Y, ShAmt);
+ }
+ }
+ Result = DAG.getNode(ISD::OR, DL, VT, ShX, ShY);
return true;
}
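As a sanity check on the shift amounts chosen above, here is a hedged scalar model of the non-constant funnel-shift path (plain C++, 32-bit; it assumes nothing beyond the formulas in the comments):

#include <cstdint>

// fshl: X << (Z % BW) | Y >> 1 >> (BW - 1 - (Z % BW)). The extra ">> 1"
// keeps every shift amount strictly below BW, so Z % BW == 0 needs no select.
uint32_t fshl32(uint32_t X, uint32_t Y, uint32_t Z) {
  uint32_t ShAmt = Z & 31u;     // Z % BW when BW is a power of two
  uint32_t InvShAmt = ~Z & 31u; // (BW - 1) - (Z % BW)
  return (X << ShAmt) | ((Y >> 1) >> InvShAmt);
}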
@@ -5988,12 +6197,15 @@ bool TargetLowering::expandROT(SDNode *Node, SDValue &Result,
SDLoc DL(SDValue(Node, 0));
EVT ShVT = Op1.getValueType();
- SDValue BitWidthC = DAG.getConstant(EltSizeInBits, DL, ShVT);
+ SDValue Zero = DAG.getConstant(0, DL, ShVT);
- // If a rotate in the other direction is legal, use it.
+ assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 &&
+ "Expecting the type bitwidth to be a power of 2");
+
+ // If a rotate in the other direction is supported, use it.
unsigned RevRot = IsLeft ? ISD::ROTR : ISD::ROTL;
- if (isOperationLegal(RevRot, VT)) {
- SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1);
+ if (isOperationLegalOrCustom(RevRot, VT)) {
+ SDValue Sub = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1);
Result = DAG.getNode(RevRot, DL, VT, Op0, Sub);
return true;
}
@@ -6006,15 +6218,13 @@ bool TargetLowering::expandROT(SDNode *Node, SDValue &Result,
return false;
// Otherwise,
- // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and w-c, w-1)))
- // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and w-c, w-1)))
+ // (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and -c, w-1)))
+ // (rotr x, c) -> (or (srl x, (and c, w-1)), (shl x, (and -c, w-1)))
//
- assert(isPowerOf2_32(EltSizeInBits) && EltSizeInBits > 1 &&
- "Expecting the type bitwidth to be a power of 2");
unsigned ShOpc = IsLeft ? ISD::SHL : ISD::SRL;
unsigned HsOpc = IsLeft ? ISD::SRL : ISD::SHL;
SDValue BitWidthMinusOneC = DAG.getConstant(EltSizeInBits - 1, DL, ShVT);
- SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, BitWidthC, Op1);
+ SDValue NegOp1 = DAG.getNode(ISD::SUB, DL, ShVT, Zero, Op1);
SDValue And0 = DAG.getNode(ISD::AND, DL, ShVT, Op1, BitWidthMinusOneC);
SDValue And1 = DAG.getNode(ISD::AND, DL, ShVT, NegOp1, BitWidthMinusOneC);
Result = DAG.getNode(ISD::OR, DL, VT, DAG.getNode(ShOpc, DL, VT, Op0, And0),
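The rotate expansion relies on the same trick: reducing the negated amount modulo the bit width keeps both shifts in range even for a zero rotate. A minimal scalar sketch, assuming a 32-bit value and an unsigned amount:

#include <cstdint>

// (rotl x, c) -> (or (shl x, (and c, w-1)), (srl x, (and -c, w-1)))
uint32_t rotl32(uint32_t X, uint32_t C) {
  return (X << (C & 31u)) | (X >> (-C & 31u));
}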
@@ -6198,114 +6408,50 @@ bool TargetLowering::expandUINT_TO_FP(SDNode *Node, SDValue &Result,
EVT SrcVT = Src.getValueType();
EVT DstVT = Node->getValueType(0);
- if (SrcVT.getScalarType() != MVT::i64)
+ if (SrcVT.getScalarType() != MVT::i64 || DstVT.getScalarType() != MVT::f64)
+ return false;
+
+ // Only expand vector types if we have the appropriate vector bit operations.
+ if (SrcVT.isVector() && (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
+ !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
+ !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
+ !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
return false;
SDLoc dl(SDValue(Node, 0));
EVT ShiftVT = getShiftAmountTy(SrcVT, DAG.getDataLayout());
- if (DstVT.getScalarType() == MVT::f32) {
- // Only expand vector types if we have the appropriate vector bit
- // operations.
- if (SrcVT.isVector() &&
- (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
- !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
- !isOperationLegalOrCustom(ISD::SINT_TO_FP, SrcVT) ||
- !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
- !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
- return false;
-
- // For unsigned conversions, convert them to signed conversions using the
- // algorithm from the x86_64 __floatundisf in compiler_rt.
-
- // TODO: This really should be implemented using a branch rather than a
- // select. We happen to get lucky and machinesink does the right
- // thing most of the time. This would be a good candidate for a
- // pseudo-op, or, even better, for whole-function isel.
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), SrcVT);
-
- SDValue SignBitTest = DAG.getSetCC(
- dl, SetCCVT, Src, DAG.getConstant(0, dl, SrcVT), ISD::SETLT);
-
- SDValue ShiftConst = DAG.getConstant(1, dl, ShiftVT);
- SDValue Shr = DAG.getNode(ISD::SRL, dl, SrcVT, Src, ShiftConst);
- SDValue AndConst = DAG.getConstant(1, dl, SrcVT);
- SDValue And = DAG.getNode(ISD::AND, dl, SrcVT, Src, AndConst);
- SDValue Or = DAG.getNode(ISD::OR, dl, SrcVT, And, Shr);
-
- SDValue Slow, Fast;
- if (Node->isStrictFPOpcode()) {
- // In strict mode, we must avoid spurious exceptions, and therefore
- // must make sure to only emit a single STRICT_SINT_TO_FP.
- SDValue InCvt = DAG.getSelect(dl, SrcVT, SignBitTest, Or, Src);
- Fast = DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, { DstVT, MVT::Other },
- { Node->getOperand(0), InCvt });
- Slow = DAG.getNode(ISD::STRICT_FADD, dl, { DstVT, MVT::Other },
- { Fast.getValue(1), Fast, Fast });
- Chain = Slow.getValue(1);
- // The STRICT_SINT_TO_FP inherits the exception mode from the
- // incoming STRICT_UINT_TO_FP node; the STRICT_FADD node can
- // never raise any exception.
- SDNodeFlags Flags;
- Flags.setNoFPExcept(Node->getFlags().hasNoFPExcept());
- Fast->setFlags(Flags);
- Flags.setNoFPExcept(true);
- Slow->setFlags(Flags);
- } else {
- SDValue SignCvt = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Or);
- Slow = DAG.getNode(ISD::FADD, dl, DstVT, SignCvt, SignCvt);
- Fast = DAG.getNode(ISD::SINT_TO_FP, dl, DstVT, Src);
- }
-
- Result = DAG.getSelect(dl, DstVT, SignBitTest, Slow, Fast);
- return true;
- }
-
- if (DstVT.getScalarType() == MVT::f64) {
- // Only expand vector types if we have the appropriate vector bit
- // operations.
- if (SrcVT.isVector() &&
- (!isOperationLegalOrCustom(ISD::SRL, SrcVT) ||
- !isOperationLegalOrCustom(ISD::FADD, DstVT) ||
- !isOperationLegalOrCustom(ISD::FSUB, DstVT) ||
- !isOperationLegalOrCustomOrPromote(ISD::OR, SrcVT) ||
- !isOperationLegalOrCustomOrPromote(ISD::AND, SrcVT)))
- return false;
-
- // Implementation of unsigned i64 to f64 following the algorithm in
- // __floatundidf in compiler_rt. This implementation has the advantage
- // of performing rounding correctly, both in the default rounding mode
- // and in all alternate rounding modes.
- SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
- SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
- BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
- SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
- SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
- SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
-
- SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
- SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
- SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
- SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
- SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
- SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
- if (Node->isStrictFPOpcode()) {
- SDValue HiSub =
- DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other},
- {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52});
- Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other},
- {HiSub.getValue(1), LoFlt, HiSub});
- Chain = Result.getValue(1);
- } else {
- SDValue HiSub =
- DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
- Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
- }
- return true;
+ // Implementation of unsigned i64 to f64 following the algorithm in
+ // __floatundidf in compiler_rt. This implementation has the advantage
+ // of performing rounding correctly, both in the default rounding mode
+ // and in all alternate rounding modes.
+ SDValue TwoP52 = DAG.getConstant(UINT64_C(0x4330000000000000), dl, SrcVT);
+ SDValue TwoP84PlusTwoP52 = DAG.getConstantFP(
+ BitsToDouble(UINT64_C(0x4530000000100000)), dl, DstVT);
+ SDValue TwoP84 = DAG.getConstant(UINT64_C(0x4530000000000000), dl, SrcVT);
+ SDValue LoMask = DAG.getConstant(UINT64_C(0x00000000FFFFFFFF), dl, SrcVT);
+ SDValue HiShift = DAG.getConstant(32, dl, ShiftVT);
+
+ SDValue Lo = DAG.getNode(ISD::AND, dl, SrcVT, Src, LoMask);
+ SDValue Hi = DAG.getNode(ISD::SRL, dl, SrcVT, Src, HiShift);
+ SDValue LoOr = DAG.getNode(ISD::OR, dl, SrcVT, Lo, TwoP52);
+ SDValue HiOr = DAG.getNode(ISD::OR, dl, SrcVT, Hi, TwoP84);
+ SDValue LoFlt = DAG.getBitcast(DstVT, LoOr);
+ SDValue HiFlt = DAG.getBitcast(DstVT, HiOr);
+ if (Node->isStrictFPOpcode()) {
+ SDValue HiSub =
+ DAG.getNode(ISD::STRICT_FSUB, dl, {DstVT, MVT::Other},
+ {Node->getOperand(0), HiFlt, TwoP84PlusTwoP52});
+ Result = DAG.getNode(ISD::STRICT_FADD, dl, {DstVT, MVT::Other},
+ {HiSub.getValue(1), LoFlt, HiSub});
+ Chain = Result.getValue(1);
+ } else {
+ SDValue HiSub =
+ DAG.getNode(ISD::FSUB, dl, DstVT, HiFlt, TwoP84PlusTwoP52);
+ Result = DAG.getNode(ISD::FADD, dl, DstVT, LoFlt, HiSub);
}
-
- return false;
+ return true;
}
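A hedged scalar model of the __floatundidf-style expansion kept above (plain C++; the hex constants are the same 2^52 and 2^84 bit patterns built as DAG constants):

#include <cstdint>
#include <cstring>

double u64ToF64(uint64_t Src) {
  uint64_t LoBits = (Src & 0xFFFFFFFFu) | 0x4330000000000000ull; // 2^52 + lo32
  uint64_t HiBits = (Src >> 32) | 0x4530000000000000ull;         // 2^84 + hi32 * 2^32
  uint64_t BiasBits = 0x4530000000100000ull;                     // 2^84 + 2^52
  double Lo, Hi, Bias;
  std::memcpy(&Lo, &LoBits, sizeof(double));
  std::memcpy(&Hi, &HiBits, sizeof(double));
  std::memcpy(&Bias, &BiasBits, sizeof(double));
  // (Hi - Bias) is exact; the final add is the single correctly rounded step.
  return Lo + (Hi - Bias);
}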
SDValue TargetLowering::expandFMINNUM_FMAXNUM(SDNode *Node,
@@ -6564,12 +6710,61 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
SDValue Chain = LD->getChain();
SDValue BasePTR = LD->getBasePtr();
EVT SrcVT = LD->getMemoryVT();
+ EVT DstVT = LD->getValueType(0);
ISD::LoadExtType ExtType = LD->getExtensionType();
unsigned NumElem = SrcVT.getVectorNumElements();
EVT SrcEltVT = SrcVT.getScalarType();
- EVT DstEltVT = LD->getValueType(0).getScalarType();
+ EVT DstEltVT = DstVT.getScalarType();
+
+ // A vector must always be stored in memory as-is, i.e. without any padding
+ // between the elements, since various parts of the code depend on it, e.g. in the
+ // handling of a bitcast of a vector type to int, which may be done with a
+ // vector store followed by an integer load. A vector that does not have
+ // elements that are byte-sized must therefore be stored as an integer
+ // built out of the extracted vector elements.
+ if (!SrcEltVT.isByteSized()) {
+ unsigned NumLoadBits = SrcVT.getStoreSizeInBits();
+ EVT LoadVT = EVT::getIntegerVT(*DAG.getContext(), NumLoadBits);
+
+ unsigned NumSrcBits = SrcVT.getSizeInBits();
+ EVT SrcIntVT = EVT::getIntegerVT(*DAG.getContext(), NumSrcBits);
+
+ unsigned SrcEltBits = SrcEltVT.getSizeInBits();
+ SDValue SrcEltBitMask = DAG.getConstant(
+ APInt::getLowBitsSet(NumLoadBits, SrcEltBits), SL, LoadVT);
+
+ // Load the whole vector and avoid masking off the top bits as it makes
+ // the codegen worse.
+ SDValue Load =
+ DAG.getExtLoad(ISD::EXTLOAD, SL, LoadVT, Chain, BasePTR,
+ LD->getPointerInfo(), SrcIntVT, LD->getAlignment(),
+ LD->getMemOperand()->getFlags(), LD->getAAInfo());
+
+ SmallVector<SDValue, 8> Vals;
+ for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
+ unsigned ShiftIntoIdx =
+ (DAG.getDataLayout().isBigEndian() ? (NumElem - 1) - Idx : Idx);
+ SDValue ShiftAmount =
+ DAG.getShiftAmountConstant(ShiftIntoIdx * SrcEltVT.getSizeInBits(),
+ LoadVT, SL, /*LegalTypes=*/false);
+ SDValue ShiftedElt = DAG.getNode(ISD::SRL, SL, LoadVT, Load, ShiftAmount);
+ SDValue Elt =
+ DAG.getNode(ISD::AND, SL, LoadVT, ShiftedElt, SrcEltBitMask);
+ SDValue Scalar = DAG.getNode(ISD::TRUNCATE, SL, SrcEltVT, Elt);
+
+ if (ExtType != ISD::NON_EXTLOAD) {
+ unsigned ExtendOp = ISD::getExtForLoadExtType(false, ExtType);
+ Scalar = DAG.getNode(ExtendOp, SL, DstEltVT, Scalar);
+ }
+
+ Vals.push_back(Scalar);
+ }
+
+ SDValue Value = DAG.getBuildVector(DstVT, SL, Vals);
+ return std::make_pair(Value, Load.getValue(1));
+ }
unsigned Stride = SrcEltVT.getSizeInBits() / 8;
assert(SrcEltVT.isByteSized());
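For the new non-byte-sized element path, a hedged little-endian sketch of the shift-and-mask extraction (assumption: a packed <4 x i3> vector loaded as one 16-bit integer; the helper name is illustrative only):

#include <array>
#include <cstdint>

std::array<uint8_t, 4> unpack4xI3(uint16_t Load) {
  std::array<uint8_t, 4> Vals;
  for (unsigned Idx = 0; Idx < 4; ++Idx)
    Vals[Idx] = (Load >> (Idx * 3)) & 0x7u; // SRL by Idx * EltBits, then AND
  return Vals;
}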
@@ -6591,7 +6786,7 @@ TargetLowering::scalarizeVectorLoad(LoadSDNode *LD,
}
SDValue NewChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other, LoadChains);
- SDValue Value = DAG.getBuildVector(LD->getValueType(0), SL, Vals);
+ SDValue Value = DAG.getBuildVector(DstVT, SL, Vals);
return std::make_pair(Value, NewChain);
}
@@ -6612,7 +6807,6 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
// The type of data as saved in memory.
EVT MemSclVT = StVT.getScalarType();
- EVT IdxVT = getVectorIdxTy(DAG.getDataLayout());
unsigned NumElem = StVT.getVectorNumElements();
// A vector must always be stored in memory as-is, i.e. without any padding
@@ -6629,7 +6823,7 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
- DAG.getConstant(Idx, SL, IdxVT));
+ DAG.getVectorIdxConstant(Idx, SL));
SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SL, MemSclVT, Elt);
SDValue ExtElt = DAG.getNode(ISD::ZERO_EXTEND, SL, IntVT, Trunc);
unsigned ShiftIntoIdx =
@@ -6654,7 +6848,7 @@ SDValue TargetLowering::scalarizeVectorStore(StoreSDNode *ST,
SmallVector<SDValue, 8> Stores;
for (unsigned Idx = 0; Idx < NumElem; ++Idx) {
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, RegSclVT, Value,
- DAG.getConstant(Idx, SL, IdxVT));
+ DAG.getVectorIdxConstant(Idx, SL));
SDValue Ptr = DAG.getObjectPtrOffset(SL, BasePtr, Idx * Stride);
@@ -7313,12 +7507,13 @@ SDValue
TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
SDValue LHS, SDValue RHS,
unsigned Scale, SelectionDAG &DAG) const {
- assert((Opcode == ISD::SDIVFIX ||
- Opcode == ISD::UDIVFIX) &&
+ assert((Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT ||
+ Opcode == ISD::UDIVFIX || Opcode == ISD::UDIVFIXSAT) &&
"Expected a fixed point division opcode");
EVT VT = LHS.getValueType();
- bool Signed = Opcode == ISD::SDIVFIX;
+ bool Signed = Opcode == ISD::SDIVFIX || Opcode == ISD::SDIVFIXSAT;
+ bool Saturating = Opcode == ISD::SDIVFIXSAT || Opcode == ISD::UDIVFIXSAT;
EVT BoolVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
// If there is enough room in the type to upscale the LHS or downscale the
@@ -7330,7 +7525,15 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
: DAG.computeKnownBits(LHS).countMinLeadingZeros();
unsigned RHSTrail = DAG.computeKnownBits(RHS).countMinTrailingZeros();
- if (LHSLead + RHSTrail < Scale)
+ // For signed saturating operations, we need to be able to detect true integer
+ // division overflow; that is, when you have MIN / -EPS. However, that case
+ // is undefined behavior for the division, and if we emit divisions that
+ // could take such values they may cause undesired behavior (arithmetic
+ // exceptions on x86, for example).
+ // Avoid this by requiring an extra bit so that we never get this case.
+ // FIXME: This is a bit unfortunate as it means that for an 8-bit 7-scale
+ // signed saturating division, we need to emit a whopping 32-bit division.
+ if (LHSLead + RHSTrail < Scale + (unsigned)(Saturating && Signed))
return SDValue();
unsigned LHSShift = std::min(LHSLead, Scale);
@@ -7384,8 +7587,6 @@ TargetLowering::expandFixedPointDiv(unsigned Opcode, const SDLoc &dl,
Quot = DAG.getNode(ISD::UDIV, dl, VT,
LHS, RHS);
- // TODO: Saturation.
-
return Quot;
}
@@ -7659,3 +7860,26 @@ SDValue TargetLowering::expandVecReduce(SDNode *Node, SelectionDAG &DAG) const {
Res = DAG.getNode(ISD::ANY_EXTEND, dl, Node->getValueType(0), Res);
return Res;
}
+
+bool TargetLowering::expandREM(SDNode *Node, SDValue &Result,
+ SelectionDAG &DAG) const {
+ EVT VT = Node->getValueType(0);
+ SDLoc dl(Node);
+ bool isSigned = Node->getOpcode() == ISD::SREM;
+ unsigned DivOpc = isSigned ? ISD::SDIV : ISD::UDIV;
+ unsigned DivRemOpc = isSigned ? ISD::SDIVREM : ISD::UDIVREM;
+ SDValue Dividend = Node->getOperand(0);
+ SDValue Divisor = Node->getOperand(1);
+ if (isOperationLegalOrCustom(DivRemOpc, VT)) {
+ SDVTList VTs = DAG.getVTList(VT, VT);
+ Result = DAG.getNode(DivRemOpc, dl, VTs, Dividend, Divisor).getValue(1);
+ return true;
+ } else if (isOperationLegalOrCustom(DivOpc, VT)) {
+ // X % Y -> X-X/Y*Y
+ SDValue Divide = DAG.getNode(DivOpc, dl, VT, Dividend, Divisor);
+ SDValue Mul = DAG.getNode(ISD::MUL, dl, VT, Divide, Divisor);
+ Result = DAG.getNode(ISD::SUB, dl, VT, Dividend, Mul);
+ return true;
+ }
+ return false;
+}
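A hedged scalar sketch of the fallback identity used by expandREM when only the divide is legal (it assumes the division itself is defined for the inputs):

// X % Y == X - (X / Y) * Y
long long remViaDiv(long long X, long long Y) {
  long long Quot = X / Y; // ISD::SDIV
  return X - Quot * Y;    // SUB(X, MUL(Quot, Y))
}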
diff --git a/llvm/lib/CodeGen/ShrinkWrap.cpp b/llvm/lib/CodeGen/ShrinkWrap.cpp
index 85dd4f59fa13..ce43fb1fbd4b 100644
--- a/llvm/lib/CodeGen/ShrinkWrap.cpp
+++ b/llvm/lib/CodeGen/ShrinkWrap.cpp
@@ -494,17 +494,15 @@ bool ShrinkWrap::runOnMachineFunction(MachineFunction &MF) {
"EH Funclets are not supported yet.",
MBB.front().getDebugLoc(), &MBB);
- if (MBB.isEHPad()) {
- // Push the prologue and epilogue outside of
- // the region that may throw by making sure
- // that all the landing pads are at least at the
- // boundary of the save and restore points.
- // The problem with exceptions is that the throw
- // is not properly modeled and in particular, a
- // basic block can jump out from the middle.
+ if (MBB.isEHPad() || MBB.isInlineAsmBrIndirectTarget()) {
+ // Push the prologue and epilogue outside of the region that may throw (or
+ // jump out via inlineasm_br), by making sure that all the landing pads
+ // are at least at the boundary of the save and restore points. The
+ // problem is that a basic block can jump out from the middle in these
+ // cases, which we do not handle.
updateSaveRestorePoints(MBB, RS.get());
if (!ArePointsInteresting()) {
- LLVM_DEBUG(dbgs() << "EHPad prevents shrink-wrapping\n");
+ LLVM_DEBUG(dbgs() << "EHPad/inlineasm_br prevents shrink-wrapping\n");
return false;
}
continue;
diff --git a/llvm/lib/CodeGen/SjLjEHPrepare.cpp b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
index 4abf9ea41b65..0683058f177e 100644
--- a/llvm/lib/CodeGen/SjLjEHPrepare.cpp
+++ b/llvm/lib/CodeGen/SjLjEHPrepare.cpp
@@ -27,6 +27,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -37,6 +38,7 @@ STATISTIC(NumSpilled, "Number of registers live across unwind edges");
namespace {
class SjLjEHPrepare : public FunctionPass {
+ IntegerType *DataTy;
Type *doubleUnderDataTy;
Type *doubleUnderJBufTy;
Type *FunctionContextTy;
@@ -50,10 +52,12 @@ class SjLjEHPrepare : public FunctionPass {
Function *CallSiteFn;
Function *FuncCtxFn;
AllocaInst *FuncCtx;
+ const TargetMachine *TM;
public:
static char ID; // Pass identification, replacement for typeid
- explicit SjLjEHPrepare() : FunctionPass(ID) {}
+ explicit SjLjEHPrepare(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), TM(TM) {}
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -77,23 +81,28 @@ INITIALIZE_PASS(SjLjEHPrepare, DEBUG_TYPE, "Prepare SjLj exceptions",
false, false)
// Public Interface To the SjLjEHPrepare pass.
-FunctionPass *llvm::createSjLjEHPreparePass() { return new SjLjEHPrepare(); }
+FunctionPass *llvm::createSjLjEHPreparePass(const TargetMachine *TM) {
+ return new SjLjEHPrepare(TM);
+}
+
// doInitialization - Set up declarations and types needed to process
// exceptions.
bool SjLjEHPrepare::doInitialization(Module &M) {
// Build the function context structure.
// builtin_setjmp uses a five word jbuf
Type *VoidPtrTy = Type::getInt8PtrTy(M.getContext());
- Type *Int32Ty = Type::getInt32Ty(M.getContext());
- doubleUnderDataTy = ArrayType::get(Int32Ty, 4);
+ unsigned DataBits =
+ TM ? TM->getSjLjDataSize() : TargetMachine::DefaultSjLjDataSize;
+ DataTy = Type::getIntNTy(M.getContext(), DataBits);
+ doubleUnderDataTy = ArrayType::get(DataTy, 4);
doubleUnderJBufTy = ArrayType::get(VoidPtrTy, 5);
FunctionContextTy = StructType::get(VoidPtrTy, // __prev
- Int32Ty, // call_site
+ DataTy, // call_site
doubleUnderDataTy, // __data
VoidPtrTy, // __personality
VoidPtrTy, // __lsda
doubleUnderJBufTy // __jbuf
- );
+ );
return true;
}
@@ -112,8 +121,7 @@ void SjLjEHPrepare::insertCallSiteStore(Instruction *I, int Number) {
Builder.CreateGEP(FunctionContextTy, FuncCtx, Idxs, "call_site");
// Insert a store of the call-site number
- ConstantInt *CallSiteNoC =
- ConstantInt::get(Type::getInt32Ty(I->getContext()), Number);
+ ConstantInt *CallSiteNoC = ConstantInt::get(DataTy, Number);
Builder.CreateStore(CallSiteNoC, CallSite, true /*volatile*/);
}
@@ -128,7 +136,6 @@ static void MarkBlocksLiveIn(BasicBlock *BB,
for (BasicBlock *B : inverse_depth_first_ext(BB, Visited))
LiveBBs.insert(B);
-
}
/// substituteLPadValues - Substitute the values returned by the landingpad
@@ -190,16 +197,18 @@ Value *SjLjEHPrepare::setupFunctionContext(Function &F,
Builder.CreateConstGEP2_32(FunctionContextTy, FuncCtx, 0, 2, "__data");
// The exception values come back in context->__data[0].
- Type *Int32Ty = Type::getInt32Ty(F.getContext());
Value *ExceptionAddr = Builder.CreateConstGEP2_32(doubleUnderDataTy, FCData,
0, 0, "exception_gep");
- Value *ExnVal = Builder.CreateLoad(Int32Ty, ExceptionAddr, true, "exn_val");
+ Value *ExnVal = Builder.CreateLoad(DataTy, ExceptionAddr, true, "exn_val");
ExnVal = Builder.CreateIntToPtr(ExnVal, Builder.getInt8PtrTy());
Value *SelectorAddr = Builder.CreateConstGEP2_32(doubleUnderDataTy, FCData,
0, 1, "exn_selector_gep");
Value *SelVal =
- Builder.CreateLoad(Int32Ty, SelectorAddr, true, "exn_selector_val");
+ Builder.CreateLoad(DataTy, SelectorAddr, true, "exn_selector_val");
+
+ // SelVal must be Int32Ty, so trunc it
+ SelVal = Builder.CreateTrunc(SelVal, Type::getInt32Ty(F.getContext()));
substituteLPadValues(LPI, ExnVal, SelVal);
}
@@ -457,8 +466,7 @@ bool SjLjEHPrepare::setupEntryBlockAndCallSites(Function &F) {
}
Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp");
StackAddr->insertAfter(&I);
- Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true);
- StoreStackAddr->insertAfter(StackAddr);
+ new StoreInst(StackAddr, StackPtr, true, StackAddr->getNextNode());
}
}
diff --git a/llvm/lib/CodeGen/SlotIndexes.cpp b/llvm/lib/CodeGen/SlotIndexes.cpp
index 6664b58eccf8..d2bfdc663edb 100644
--- a/llvm/lib/CodeGen/SlotIndexes.cpp
+++ b/llvm/lib/CodeGen/SlotIndexes.cpp
@@ -112,9 +112,10 @@ bool SlotIndexes::runOnMachineFunction(MachineFunction &fn) {
return false;
}
-void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI) {
- assert(!MI.isBundledWithPred() &&
- "Use removeSingleMachineInstrFromMaps() instread");
+void SlotIndexes::removeMachineInstrFromMaps(MachineInstr &MI,
+ bool AllowBundled) {
+ assert((AllowBundled || !MI.isBundledWithPred()) &&
+ "Use removeSingleMachineInstrFromMaps() instead");
Mi2IndexMap::iterator mi2iItr = mi2iMap.find(&MI);
if (mi2iItr == mi2iMap.end())
return;
@@ -141,7 +142,7 @@ void SlotIndexes::removeSingleMachineInstrFromMaps(MachineInstr &MI) {
// instruction.
if (MI.isBundledWithSucc()) {
// Only the first instruction of a bundle should have an index assigned.
- assert(!MI.isBundledWithPred() && "Should have first bundle isntruction");
+ assert(!MI.isBundledWithPred() && "Should be first bundle instruction");
MachineBasicBlock::instr_iterator Next = std::next(MI.getIterator());
MachineInstr &NextMI = *Next;
diff --git a/llvm/lib/CodeGen/Spiller.h b/llvm/lib/CodeGen/Spiller.h
deleted file mode 100644
index 66dabf78f873..000000000000
--- a/llvm/lib/CodeGen/Spiller.h
+++ /dev/null
@@ -1,43 +0,0 @@
-//===- llvm/CodeGen/Spiller.h - Spiller -------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_CODEGEN_SPILLER_H
-#define LLVM_LIB_CODEGEN_SPILLER_H
-
-namespace llvm {
-
-class LiveRangeEdit;
-class MachineFunction;
-class MachineFunctionPass;
-class VirtRegMap;
-
- /// Spiller interface.
- ///
- /// Implementations are utility classes which insert spill or remat code on
- /// demand.
- class Spiller {
- virtual void anchor();
-
- public:
- virtual ~Spiller() = 0;
-
- /// spill - Spill the LRE.getParent() live interval.
- virtual void spill(LiveRangeEdit &LRE) = 0;
-
- virtual void postOptimization() {}
- };
-
- /// Create and return a spiller that will insert spill code directly instead
- /// of deferring though VirtRegMap.
- Spiller *createInlineSpiller(MachineFunctionPass &pass,
- MachineFunction &mf,
- VirtRegMap &vrm);
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_CODEGEN_SPILLER_H
diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp
index 0c1f1220c421..8dec620536a7 100644
--- a/llvm/lib/CodeGen/SplitKit.cpp
+++ b/llvm/lib/CodeGen/SplitKit.cpp
@@ -19,9 +19,10 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/LiveRangeEdit.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -79,10 +80,15 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
std::pair<SlotIndex, SlotIndex> &LIP = LastInsertPoint[Num];
SlotIndex MBBEnd = LIS.getMBBEndIdx(&MBB);
- SmallVector<const MachineBasicBlock *, 1> EHPadSuccessors;
- for (const MachineBasicBlock *SMBB : MBB.successors())
- if (SMBB->isEHPad())
- EHPadSuccessors.push_back(SMBB);
+ SmallVector<const MachineBasicBlock *, 1> ExceptionalSuccessors;
+ bool EHPadSuccessor = false;
+ for (const MachineBasicBlock *SMBB : MBB.successors()) {
+ if (SMBB->isEHPad()) {
+ ExceptionalSuccessors.push_back(SMBB);
+ EHPadSuccessor = true;
+ } else if (SMBB->isInlineAsmBrIndirectTarget())
+ ExceptionalSuccessors.push_back(SMBB);
+ }
// Compute insert points on the first call. The pair is independent of the
// current live interval.
@@ -93,15 +99,17 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
else
LIP.first = LIS.getInstructionIndex(*FirstTerm);
- // If there is a landing pad successor, also find the call instruction.
- if (EHPadSuccessors.empty())
+ // If there is a landing pad or inlineasm_br successor, also find the
+ // instruction. If there is no such instruction, we don't need to do
+ // anything special. We assume there cannot be multiple instructions that
+ // are Calls with EHPad successors or INLINEASM_BR in a block. Further, we
+ // assume that if there are any, they will be after any other call
+ // instructions in the block.
+ if (ExceptionalSuccessors.empty())
return LIP.first;
- // There may not be a call instruction (?) in which case we ignore LPad.
- LIP.second = LIP.first;
- for (MachineBasicBlock::const_iterator I = MBB.end(), E = MBB.begin();
- I != E;) {
- --I;
- if (I->isCall()) {
+ for (auto I = MBB.rbegin(), E = MBB.rend(); I != E; ++I) {
+ if ((EHPadSuccessor && I->isCall()) ||
+ I->getOpcode() == TargetOpcode::INLINEASM_BR) {
LIP.second = LIS.getInstructionIndex(*I);
break;
}
@@ -113,7 +121,7 @@ InsertPointAnalysis::computeLastInsertPoint(const LiveInterval &CurLI,
if (!LIP.second)
return LIP.first;
- if (none_of(EHPadSuccessors, [&](const MachineBasicBlock *EHPad) {
+ if (none_of(ExceptionalSuccessors, [&](const MachineBasicBlock *EHPad) {
return LIS.isLiveInToMBB(CurLI, EHPad);
}))
return LIP.first;
@@ -379,11 +387,11 @@ void SplitEditor::reset(LiveRangeEdit &LRE, ComplementSpillMode SM) {
RegAssign.clear();
Values.clear();
- // Reset the LiveRangeCalc instances needed for this spill mode.
- LRCalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ // Reset the LiveIntervalCalc instances needed for this spill mode.
+ LICalc[0].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
&LIS.getVNInfoAllocator());
if (SpillMode)
- LRCalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ LICalc[1].reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
&LIS.getVNInfoAllocator());
// We don't need an AliasAnalysis since we will only be performing
@@ -832,7 +840,7 @@ void SplitEditor::overlapIntv(SlotIndex Start, SlotIndex End) {
assert(LIS.getMBBFromIndex(Start) == LIS.getMBBFromIndex(End) &&
"Range cannot span basic blocks");
- // The complement interval will be extended as needed by LRCalc.extend().
+ // The complement interval will be extended as needed by LICalc.extend().
if (ParentVNI)
forceRecompute(0, *ParentVNI);
LLVM_DEBUG(dbgs() << " overlapIntv [" << Start << ';' << End << "):");
@@ -1118,7 +1126,7 @@ void SplitEditor::hoistCopies() {
}
/// transferValues - Transfer all possible values to the new live ranges.
-/// Values that were rematerialized are left alone, they need LRCalc.extend().
+/// Values that were rematerialized are left alone, they need LICalc.extend().
bool SplitEditor::transferValues() {
bool Skipped = false;
RegAssignMap::const_iterator AssignI = RegAssign.begin();
@@ -1166,7 +1174,7 @@ bool SplitEditor::transferValues() {
continue;
}
- LiveRangeCalc &LRC = getLRCalc(RegIdx);
+ LiveIntervalCalc &LIC = getLICalc(RegIdx);
// This value has multiple defs in RegIdx, but it wasn't rematerialized,
// so the live range is accurate. Add live-in blocks in [Start;End) to the
@@ -1182,7 +1190,7 @@ bool SplitEditor::transferValues() {
LLVM_DEBUG(dbgs() << ':' << VNI->id << "*" << printMBBReference(*MBB));
// MBB has its own def. Is it also live-out?
if (BlockEnd <= End)
- LRC.setLiveOutValue(&*MBB, VNI);
+ LIC.setLiveOutValue(&*MBB, VNI);
// Skip to the next block for live-in.
++MBB;
@@ -1200,16 +1208,16 @@ bool SplitEditor::transferValues() {
VNInfo *VNI = LI.extendInBlock(BlockStart, std::min(BlockEnd, End));
assert(VNI && "Missing def for complex mapped parent PHI");
if (End >= BlockEnd)
- LRC.setLiveOutValue(&*MBB, VNI); // Live-out as well.
+ LIC.setLiveOutValue(&*MBB, VNI); // Live-out as well.
} else {
// This block needs a live-in value. The last block covered may not
// be live-out.
if (End < BlockEnd)
- LRC.addLiveInBlock(LI, MDT[&*MBB], End);
+ LIC.addLiveInBlock(LI, MDT[&*MBB], End);
else {
// Live-through, and we don't know the value.
- LRC.addLiveInBlock(LI, MDT[&*MBB]);
- LRC.setLiveOutValue(&*MBB, nullptr);
+ LIC.addLiveInBlock(LI, MDT[&*MBB]);
+ LIC.setLiveOutValue(&*MBB, nullptr);
}
}
BlockStart = BlockEnd;
@@ -1220,9 +1228,9 @@ bool SplitEditor::transferValues() {
LLVM_DEBUG(dbgs() << '\n');
}
- LRCalc[0].calculateValues();
+ LICalc[0].calculateValues();
if (SpillMode)
- LRCalc[1].calculateValues();
+ LICalc[1].calculateValues();
return Skipped;
}
@@ -1238,7 +1246,7 @@ static bool removeDeadSegment(SlotIndex Def, LiveRange &LR) {
return true;
}
-void SplitEditor::extendPHIRange(MachineBasicBlock &B, LiveRangeCalc &LRC,
+void SplitEditor::extendPHIRange(MachineBasicBlock &B, LiveIntervalCalc &LIC,
LiveRange &LR, LaneBitmask LM,
ArrayRef<SlotIndex> Undefs) {
for (MachineBasicBlock *P : B.predecessors()) {
@@ -1252,7 +1260,7 @@ void SplitEditor::extendPHIRange(MachineBasicBlock &B, LiveRangeCalc &LRC,
LiveRange &PSR = !LM.all() ? getSubRangeForMask(LM, PLI)
: static_cast<LiveRange&>(PLI);
if (PSR.liveAt(LastUse))
- LRC.extend(LR, End, /*PhysReg=*/0, Undefs);
+ LIC.extend(LR, End, /*PhysReg=*/0, Undefs);
}
}
@@ -1270,14 +1278,14 @@ void SplitEditor::extendPHIKillRanges() {
unsigned RegIdx = RegAssign.lookup(V->def);
LiveInterval &LI = LIS.getInterval(Edit->get(RegIdx));
- LiveRangeCalc &LRC = getLRCalc(RegIdx);
+ LiveIntervalCalc &LIC = getLICalc(RegIdx);
MachineBasicBlock &B = *LIS.getMBBFromIndex(V->def);
if (!removeDeadSegment(V->def, LI))
- extendPHIRange(B, LRC, LI, LaneBitmask::getAll(), /*Undefs=*/{});
+ extendPHIRange(B, LIC, LI, LaneBitmask::getAll(), /*Undefs=*/{});
}
SmallVector<SlotIndex, 4> Undefs;
- LiveRangeCalc SubLRC;
+ LiveIntervalCalc SubLIC;
for (LiveInterval::SubRange &PS : ParentLI.subranges()) {
for (const VNInfo *V : PS.valnos) {
@@ -1290,11 +1298,11 @@ void SplitEditor::extendPHIKillRanges() {
continue;
MachineBasicBlock &B = *LIS.getMBBFromIndex(V->def);
- SubLRC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ SubLIC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
&LIS.getVNInfoAllocator());
Undefs.clear();
LI.computeSubRangeUndefs(Undefs, PS.LaneMask, MRI, *LIS.getSlotIndexes());
- extendPHIRange(B, SubLRC, S, PS.LaneMask, Undefs);
+ extendPHIRange(B, SubLIC, S, PS.LaneMask, Undefs);
}
}
}
@@ -1363,8 +1371,8 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
if (MO.isUse())
ExtPoints.push_back(ExtPoint(MO, RegIdx, Next));
} else {
- LiveRangeCalc &LRC = getLRCalc(RegIdx);
- LRC.extend(LI, Next, 0, ArrayRef<SlotIndex>());
+ LiveIntervalCalc &LIC = getLICalc(RegIdx);
+ LIC.extend(LI, Next, 0, ArrayRef<SlotIndex>());
}
}
@@ -1372,7 +1380,7 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
LiveInterval &LI = LIS.getInterval(Edit->get(EP.RegIdx));
assert(LI.hasSubRanges());
- LiveRangeCalc SubLRC;
+ LiveIntervalCalc SubLIC;
Register Reg = EP.MO.getReg(), Sub = EP.MO.getSubReg();
LaneBitmask LM = Sub != 0 ? TRI.getSubRegIndexLaneMask(Sub)
: MRI.getMaxLaneMaskForVReg(Reg);
@@ -1386,11 +1394,11 @@ void SplitEditor::rewriteAssigned(bool ExtendRanges) {
// %1 = COPY %0
if (S.empty())
continue;
- SubLRC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
+ SubLIC.reset(&VRM.getMachineFunction(), LIS.getSlotIndexes(), &MDT,
&LIS.getVNInfoAllocator());
SmallVector<SlotIndex, 4> Undefs;
LI.computeSubRangeUndefs(Undefs, S.LaneMask, MRI, *LIS.getSlotIndexes());
- SubLRC.extend(S, EP.Next, 0, Undefs);
+ SubLIC.extend(S, EP.Next, 0, Undefs);
}
}
diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h
index 78f0bbd24db5..3ab5f2585f34 100644
--- a/llvm/lib/CodeGen/SplitKit.h
+++ b/llvm/lib/CodeGen/SplitKit.h
@@ -23,8 +23,8 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/LiveInterval.h"
+#include "llvm/CodeGen/LiveIntervalCalc.h"
#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/LiveRangeCalc.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SlotIndexes.h"
@@ -34,6 +34,7 @@
namespace llvm {
+class AAResults;
class LiveIntervals;
class LiveRangeEdit;
class MachineBlockFrequencyInfo;
@@ -53,7 +54,7 @@ private:
/// Last legal insert point in each basic block in the current function.
/// The first entry is the first terminator, the second entry is the
/// last valid point to insert a split or spill for a variable that is
- /// live into a landing pad successor.
+ /// live into a landing pad or inlineasm_br successor.
SmallVector<std::pair<SlotIndex, SlotIndex>, 8> LastInsertPoint;
SlotIndex computeLastInsertPoint(const LiveInterval &CurLI,
@@ -256,7 +257,7 @@ public:
///
class LLVM_LIBRARY_VISIBILITY SplitEditor {
SplitAnalysis &SA;
- AliasAnalysis &AA;
+ AAResults &AA;
LiveIntervals &LIS;
VirtRegMap &VRM;
MachineRegisterInfo &MRI;
@@ -327,21 +328,21 @@ private:
/// its def. The full live range can be inferred exactly from the range
/// of RegIdx in RegAssign.
/// 3. (Null, true). As above, but the ranges in RegAssign are too large, and
- /// the live range must be recomputed using LiveRangeCalc::extend().
+ /// the live range must be recomputed using ::extend().
/// 4. (VNI, false) The value is mapped to a single new value.
/// The new value has no live ranges anywhere.
ValueMap Values;
- /// LRCalc - Cache for computing live ranges and SSA update. Each instance
+ /// LICalc - Cache for computing live ranges and SSA update. Each instance
/// can only handle non-overlapping live ranges, so use a separate
- /// LiveRangeCalc instance for the complement interval when in spill mode.
- LiveRangeCalc LRCalc[2];
+ /// LiveIntervalCalc instance for the complement interval when in spill mode.
+ LiveIntervalCalc LICalc[2];
- /// getLRCalc - Return the LRCalc to use for RegIdx. In spill mode, the
+ /// getLICalc - Return the LICalc to use for RegIdx. In spill mode, the
/// complement interval can overlap the other intervals, so it gets its own
- /// LRCalc instance. When not in spill mode, all intervals can share one.
- LiveRangeCalc &getLRCalc(unsigned RegIdx) {
- return LRCalc[SpillMode != SM_Partition && RegIdx != 0];
+ /// LICalc instance. When not in spill mode, all intervals can share one.
+ LiveIntervalCalc &getLICalc(unsigned RegIdx) {
+ return LICalc[SpillMode != SM_Partition && RegIdx != 0];
}
/// Find a subrange corresponding to the lane mask @p LM in the live
@@ -414,7 +415,7 @@ private:
/// all predecessor values that reach this def. If @p LR is a subrange,
/// the array @p Undefs is the set of all locations where it is undefined
/// via <def,read-undef> in other subranges for the same register.
- void extendPHIRange(MachineBasicBlock &B, LiveRangeCalc &LRC,
+ void extendPHIRange(MachineBasicBlock &B, LiveIntervalCalc &LIC,
LiveRange &LR, LaneBitmask LM,
ArrayRef<SlotIndex> Undefs);
@@ -442,7 +443,7 @@ private:
public:
/// Create a new SplitEditor for editing the LiveInterval analyzed by SA.
/// Newly created intervals will be appended to newIntervals.
- SplitEditor(SplitAnalysis &sa, AliasAnalysis &aa, LiveIntervals &lis,
+ SplitEditor(SplitAnalysis &sa, AAResults &aa, LiveIntervals &lis,
VirtRegMap &vrm, MachineDominatorTree &mdt,
MachineBlockFrequencyInfo &mbfi);
diff --git a/llvm/lib/CodeGen/StackColoring.cpp b/llvm/lib/CodeGen/StackColoring.cpp
index b6e81116286f..d720d93c306d 100644
--- a/llvm/lib/CodeGen/StackColoring.cpp
+++ b/llvm/lib/CodeGen/StackColoring.cpp
@@ -913,6 +913,11 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
assert(To && From && "Invalid allocation object");
Allocas[From] = To;
+ // If From is before To, it's possible that there is a use of From between
+ // them.
+ if (From->comesBefore(To))
+ const_cast<AllocaInst*>(To)->moveBefore(const_cast<AllocaInst*>(From));
+
// AA might be used later for instruction scheduling, and we need it to be
// able to deduce the correct aliasing relationships between pointers
// derived from the alloca being remapped and the target of that remapping.
@@ -960,6 +965,8 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
}
// Remap all instructions to the new stack slots.
+ std::vector<std::vector<MachineMemOperand *>> SSRefs(
+ MFI->getObjectIndexEnd());
for (MachineBasicBlock &BB : *MF)
for (MachineInstr &I : BB) {
// Skip lifetime markers. We'll remove them soon.
@@ -1025,6 +1032,16 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
SmallVector<MachineMemOperand *, 2> NewMMOs;
bool ReplaceMemOps = false;
for (MachineMemOperand *MMO : I.memoperands()) {
+ // Collect MachineMemOperands which reference
+ // FixedStackPseudoSourceValues with old frame indices.
+ if (const auto *FSV = dyn_cast_or_null<FixedStackPseudoSourceValue>(
+ MMO->getPseudoValue())) {
+ int FI = FSV->getFrameIndex();
+ auto To = SlotRemap.find(FI);
+ if (To != SlotRemap.end())
+ SSRefs[FI].push_back(MMO);
+ }
+
// If this memory location can be a slot remapped here,
// we remove AA information.
bool MayHaveConflictingAAMD = false;
@@ -1062,6 +1079,15 @@ void StackColoring::remapInstructions(DenseMap<int, int> &SlotRemap) {
I.setMemRefs(*MF, NewMMOs);
}
+ // Rewrite MachineMemOperands that reference old frame indices.
+ for (auto E : enumerate(SSRefs))
+ if (!E.value().empty()) {
+ const PseudoSourceValue *NewSV =
+ MF->getPSVManager().getFixedStack(SlotRemap.find(E.index())->second);
+ for (MachineMemOperand *Ref : E.value())
+ Ref->setValue(NewSV);
+ }
+
// Update the location of C++ catch objects for the MSVC personality routine.
if (WinEHFuncInfo *EHInfo = MF->getWinEHFuncInfo())
for (WinEHTryBlockMapEntry &TBME : EHInfo->TryBlockMap)
@@ -1269,8 +1295,8 @@ bool StackColoring::runOnMachineFunction(MachineFunction &Func) {
SortedSlots[J] = -1;
LLVM_DEBUG(dbgs() << "Merging #" << FirstSlot << " and slots #"
<< SecondSlot << " together.\n");
- unsigned MaxAlignment = std::max(MFI->getObjectAlignment(FirstSlot),
- MFI->getObjectAlignment(SecondSlot));
+ Align MaxAlignment = std::max(MFI->getObjectAlign(FirstSlot),
+ MFI->getObjectAlign(SecondSlot));
assert(MFI->getObjectSize(FirstSlot) >=
MFI->getObjectSize(SecondSlot) &&
diff --git a/llvm/lib/CodeGen/StackMaps.cpp b/llvm/lib/CodeGen/StackMaps.cpp
index e16587c44a55..1e060ecbeb43 100644
--- a/llvm/lib/CodeGen/StackMaps.cpp
+++ b/llvm/lib/CodeGen/StackMaps.cpp
@@ -300,7 +300,7 @@ void StackMaps::recordStackMapOpers(const MCSymbol &MILabel,
MachineInstr::const_mop_iterator MOE,
bool recordResult) {
MCContext &OutContext = AP.OutStreamer->getContext();
-
+
LocationVec Locations;
LiveOutVec LiveOuts;
@@ -413,19 +413,19 @@ void StackMaps::recordStatepoint(const MCSymbol &L, const MachineInstr &MI) {
/// uint32 : NumRecords
void StackMaps::emitStackmapHeader(MCStreamer &OS) {
// Header.
- OS.EmitIntValue(StackMapVersion, 1); // Version.
- OS.EmitIntValue(0, 1); // Reserved.
- OS.EmitIntValue(0, 2); // Reserved.
+ OS.emitIntValue(StackMapVersion, 1); // Version.
+ OS.emitIntValue(0, 1); // Reserved.
+ OS.emitInt16(0); // Reserved.
// Num functions.
LLVM_DEBUG(dbgs() << WSMP << "#functions = " << FnInfos.size() << '\n');
- OS.EmitIntValue(FnInfos.size(), 4);
+ OS.emitInt32(FnInfos.size());
// Num constants.
LLVM_DEBUG(dbgs() << WSMP << "#constants = " << ConstPool.size() << '\n');
- OS.EmitIntValue(ConstPool.size(), 4);
+ OS.emitInt32(ConstPool.size());
// Num callsites.
LLVM_DEBUG(dbgs() << WSMP << "#callsites = " << CSInfos.size() << '\n');
- OS.EmitIntValue(CSInfos.size(), 4);
+ OS.emitInt32(CSInfos.size());
}
/// Emit the function frame record for each function.
@@ -442,9 +442,9 @@ void StackMaps::emitFunctionFrameRecords(MCStreamer &OS) {
LLVM_DEBUG(dbgs() << WSMP << "function addr: " << FR.first
<< " frame size: " << FR.second.StackSize
<< " callsite count: " << FR.second.RecordCount << '\n');
- OS.EmitSymbolValue(FR.first, 8);
- OS.EmitIntValue(FR.second.StackSize, 8);
- OS.EmitIntValue(FR.second.RecordCount, 8);
+ OS.emitSymbolValue(FR.first, 8);
+ OS.emitIntValue(FR.second.StackSize, 8);
+ OS.emitIntValue(FR.second.RecordCount, 8);
}
}
@@ -456,7 +456,7 @@ void StackMaps::emitConstantPoolEntries(MCStreamer &OS) {
LLVM_DEBUG(dbgs() << WSMP << "constants:\n");
for (const auto &ConstEntry : ConstPool) {
LLVM_DEBUG(dbgs() << WSMP << ConstEntry.second << '\n');
- OS.EmitIntValue(ConstEntry.second, 8);
+ OS.emitIntValue(ConstEntry.second, 8);
}
}
@@ -501,46 +501,46 @@ void StackMaps::emitCallsiteEntries(MCStreamer &OS) {
// simple overflow checks, but we may eventually communicate other
// compilation errors this way.
if (CSLocs.size() > UINT16_MAX || LiveOuts.size() > UINT16_MAX) {
- OS.EmitIntValue(UINT64_MAX, 8); // Invalid ID.
- OS.EmitValue(CSI.CSOffsetExpr, 4);
- OS.EmitIntValue(0, 2); // Reserved.
- OS.EmitIntValue(0, 2); // 0 locations.
- OS.EmitIntValue(0, 2); // padding.
- OS.EmitIntValue(0, 2); // 0 live-out registers.
- OS.EmitIntValue(0, 4); // padding.
+ OS.emitIntValue(UINT64_MAX, 8); // Invalid ID.
+ OS.emitValue(CSI.CSOffsetExpr, 4);
+ OS.emitInt16(0); // Reserved.
+ OS.emitInt16(0); // 0 locations.
+ OS.emitInt16(0); // padding.
+ OS.emitInt16(0); // 0 live-out registers.
+ OS.emitInt32(0); // padding.
continue;
}
- OS.EmitIntValue(CSI.ID, 8);
- OS.EmitValue(CSI.CSOffsetExpr, 4);
+ OS.emitIntValue(CSI.ID, 8);
+ OS.emitValue(CSI.CSOffsetExpr, 4);
// Reserved for flags.
- OS.EmitIntValue(0, 2);
- OS.EmitIntValue(CSLocs.size(), 2);
+ OS.emitInt16(0);
+ OS.emitInt16(CSLocs.size());
for (const auto &Loc : CSLocs) {
- OS.EmitIntValue(Loc.Type, 1);
- OS.EmitIntValue(0, 1); // Reserved
- OS.EmitIntValue(Loc.Size, 2);
- OS.EmitIntValue(Loc.Reg, 2);
- OS.EmitIntValue(0, 2); // Reserved
- OS.EmitIntValue(Loc.Offset, 4);
+ OS.emitIntValue(Loc.Type, 1);
+ OS.emitIntValue(0, 1); // Reserved
+ OS.emitInt16(Loc.Size);
+ OS.emitInt16(Loc.Reg);
+ OS.emitInt16(0); // Reserved
+ OS.emitInt32(Loc.Offset);
}
// Emit alignment to 8 byte.
- OS.EmitValueToAlignment(8);
+ OS.emitValueToAlignment(8);
// Num live-out registers and padding to align to 4 byte.
- OS.EmitIntValue(0, 2);
- OS.EmitIntValue(LiveOuts.size(), 2);
+ OS.emitInt16(0);
+ OS.emitInt16(LiveOuts.size());
for (const auto &LO : LiveOuts) {
- OS.EmitIntValue(LO.DwarfRegNum, 2);
- OS.EmitIntValue(0, 1);
- OS.EmitIntValue(LO.Size, 1);
+ OS.emitInt16(LO.DwarfRegNum);
+ OS.emitIntValue(0, 1);
+ OS.emitIntValue(LO.Size, 1);
}
// Emit alignment to 8 byte.
- OS.EmitValueToAlignment(8);
+ OS.emitValueToAlignment(8);
}
}
@@ -564,7 +564,7 @@ void StackMaps::serializeToStackMapSection() {
OS.SwitchSection(StackMapSection);
// Emit a dummy symbol to force section inclusion.
- OS.EmitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_StackMaps")));
+ OS.emitLabel(OutContext.getOrCreateSymbol(Twine("__LLVM_StackMaps")));
// Serialize data.
LLVM_DEBUG(dbgs() << "********** Stack Map Output **********\n");
diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp
index 4e2189884bb1..a343791807e6 100644
--- a/llvm/lib/CodeGen/StackProtector.cpp
+++ b/llvm/lib/CodeGen/StackProtector.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
+#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
@@ -161,9 +162,16 @@ bool StackProtector::ContainsProtectableArray(Type *Ty, bool &IsLarge,
return NeedsProtector;
}
-bool StackProtector::HasAddressTaken(const Instruction *AI) {
+bool StackProtector::HasAddressTaken(const Instruction *AI,
+ uint64_t AllocSize) {
+ const DataLayout &DL = M->getDataLayout();
for (const User *U : AI->users()) {
const auto *I = cast<Instruction>(U);
+ // If this instruction accesses memory, make sure it doesn't access beyond
+ // the bounds of the allocated object.
+ Optional<MemoryLocation> MemLoc = MemoryLocation::getOrNone(I);
+ if (MemLoc.hasValue() && MemLoc->Size.getValue() > AllocSize)
+ return true;
switch (I->getOpcode()) {
case Instruction::Store:
if (AI == cast<StoreInst>(I)->getValueOperand())
@@ -189,11 +197,26 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
}
case Instruction::Invoke:
return true;
+ case Instruction::GetElementPtr: {
+ // If the GEP offset is out-of-bounds, or is non-constant and so has to be
+ // assumed to be potentially out-of-bounds, then any memory access that
+ // would use it could also be out-of-bounds, meaning stack protection is
+ // required.
+ const GetElementPtrInst *GEP = cast<GetElementPtrInst>(I);
+ unsigned TypeSize = DL.getIndexTypeSizeInBits(I->getType());
+ APInt Offset(TypeSize, 0);
+ APInt MaxOffset(TypeSize, AllocSize);
+ if (!GEP->accumulateConstantOffset(DL, Offset) || Offset.ugt(MaxOffset))
+ return true;
+ // Adjust AllocSize to be the space remaining after this offset.
+ if (HasAddressTaken(I, AllocSize - Offset.getLimitedValue()))
+ return true;
+ break;
+ }
case Instruction::BitCast:
- case Instruction::GetElementPtr:
case Instruction::Select:
case Instruction::AddrSpaceCast:
- if (HasAddressTaken(I))
+ if (HasAddressTaken(I, AllocSize))
return true;
break;
case Instruction::PHI: {
@@ -201,7 +224,7 @@ bool StackProtector::HasAddressTaken(const Instruction *AI) {
// they are only visited once.
const auto *PN = cast<PHINode>(I);
if (VisitedPHIs.insert(PN).second)
- if (HasAddressTaken(PN))
+ if (HasAddressTaken(PN, AllocSize))
return true;
break;
}
@@ -330,7 +353,8 @@ bool StackProtector::RequiresStackProtector() {
continue;
}
- if (Strong && HasAddressTaken(AI)) {
+ if (Strong && HasAddressTaken(AI, M->getDataLayout().getTypeAllocSize(
+ AI->getAllocatedType()))) {
++NumAddrTaken;
Layout.insert(std::make_pair(AI, MachineFrameInfo::SSPLK_AddrOf));
ORE.emit([&]() {
@@ -342,6 +366,9 @@ bool StackProtector::RequiresStackProtector() {
});
NeedsProtector = true;
}
+ // Clear any PHIs that we visited, to make sure we examine all uses of
+ // any subsequent allocas that we look at.
+ VisitedPHIs.clear();
}
}
}
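
A minimal sketch of the bounds rule the new GetElementPtr case applies, using plain integers in place of APInt and DataLayout (the names below are illustrative, not LLVM's):

#include <cstdint>
#include <optional>

// Returns true when an access through this GEP must be treated as unsafe:
// either the offset is not a compile-time constant (and so has to be assumed
// potentially out-of-bounds), or it points past the end of the allocation.
// Otherwise the walk continues with the space remaining after the offset.
bool gepOffsetNeedsProtection(std::optional<uint64_t> ConstOffset,
                              uint64_t AllocSize, uint64_t &RemainingSize) {
  if (!ConstOffset || *ConstOffset > AllocSize)
    return true;
  RemainingSize = AllocSize - *ConstOffset;
  return false;
}
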
diff --git a/llvm/lib/CodeGen/StackSlotColoring.cpp b/llvm/lib/CodeGen/StackSlotColoring.cpp
index 7ae758323280..3cc5d30ebad7 100644
--- a/llvm/lib/CodeGen/StackSlotColoring.cpp
+++ b/llvm/lib/CodeGen/StackSlotColoring.cpp
@@ -74,7 +74,7 @@ namespace {
SmallVector<SmallVector<MachineMemOperand *, 8>, 16> SSRefs;
// OrigAlignments - Alignments of stack objects before coloring.
- SmallVector<unsigned, 16> OrigAlignments;
+ SmallVector<Align, 16> OrigAlignments;
// OrigSizes - Sizes of stack objects before coloring.
SmallVector<unsigned, 16> OrigSizes;
@@ -227,7 +227,7 @@ void StackSlotColoring::InitializeSlots() {
continue;
SSIntervals.push_back(&li);
- OrigAlignments[FI] = MFI->getObjectAlignment(FI);
+ OrigAlignments[FI] = MFI->getObjectAlign(FI);
OrigSizes[FI] = MFI->getObjectSize(FI);
auto StackID = MFI->getStackID(FI);
@@ -309,9 +309,9 @@ int StackSlotColoring::ColorSlot(LiveInterval *li) {
// Change size and alignment of the allocated slot. If there are multiple
// objects sharing the same slot, then make sure the size and alignment
// are large enough for all.
- unsigned Align = OrigAlignments[FI];
- if (!Share || Align > MFI->getObjectAlignment(Color))
- MFI->setObjectAlignment(Color, Align);
+ Align Alignment = OrigAlignments[FI];
+ if (!Share || Alignment > MFI->getObjectAlign(Color))
+ MFI->setObjectAlignment(Color, Alignment);
int64_t Size = OrigSizes[FI];
if (!Share || Size > MFI->getObjectSize(Color))
MFI->setObjectSize(Color, Size);
diff --git a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
index c72a04276a4f..dd0b9d4c2e48 100644
--- a/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
+++ b/llvm/lib/CodeGen/SwiftErrorValueTracking.cpp
@@ -264,11 +264,10 @@ void SwiftErrorValueTracking::preassignVRegs(
// Iterator over instructions and assign vregs to swifterror defs and uses.
for (auto It = Begin; It != End; ++It) {
- ImmutableCallSite CS(&*It);
- if (CS) {
+ if (auto *CB = dyn_cast<CallBase>(&*It)) {
// A call-site with a swifterror argument is both use and def.
const Value *SwiftErrorAddr = nullptr;
- for (auto &Arg : CS.args()) {
+ for (auto &Arg : CB->args()) {
if (!Arg->isSwiftError())
continue;
// Use of swifterror.
diff --git a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
index c2cd8fa0324e..078c9691f8dc 100644
--- a/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
+++ b/llvm/lib/CodeGen/SwitchLoweringUtils.cpp
@@ -13,6 +13,7 @@
#include "llvm/CodeGen/MachineJumpTableInfo.h"
#include "llvm/CodeGen/SwitchLoweringUtils.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace SwitchCG;
diff --git a/llvm/lib/CodeGen/TailDuplication.cpp b/llvm/lib/CodeGen/TailDuplication.cpp
index 648bf48b7d17..20892a79d35f 100644
--- a/llvm/lib/CodeGen/TailDuplication.cpp
+++ b/llvm/lib/CodeGen/TailDuplication.cpp
@@ -31,6 +31,7 @@ namespace {
class TailDuplicateBase : public MachineFunctionPass {
TailDuplicator Duplicator;
+ std::unique_ptr<MBFIWrapper> MBFIW;
bool PreRegAlloc;
public:
TailDuplicateBase(char &PassID, bool PreRegAlloc)
@@ -88,7 +89,10 @@ bool TailDuplicateBase::runOnMachineFunction(MachineFunction &MF) {
auto *MBFI = (PSI && PSI->hasProfileSummary()) ?
&getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI() :
nullptr;
- Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI, PSI, /*LayoutMode=*/false);
+ if (MBFI)
+ MBFIW = std::make_unique<MBFIWrapper>(*MBFI);
+ Duplicator.initMF(MF, PreRegAlloc, MBPI, MBFI ? MBFIW.get() : nullptr, PSI,
+ /*LayoutMode=*/false);
bool MadeChange = false;
while (Duplicator.tailDuplicateBlocks())
diff --git a/llvm/lib/CodeGen/TailDuplicator.cpp b/llvm/lib/CodeGen/TailDuplicator.cpp
index cd1278fd4d8d..bd554189f12b 100644
--- a/llvm/lib/CodeGen/TailDuplicator.cpp
+++ b/llvm/lib/CodeGen/TailDuplicator.cpp
@@ -80,7 +80,7 @@ static cl::opt<unsigned> TailDupLimit("tail-dup-limit", cl::init(~0U),
void TailDuplicator::initMF(MachineFunction &MFin, bool PreRegAlloc,
const MachineBranchProbabilityInfo *MBPIin,
- const MachineBlockFrequencyInfo *MBFIin,
+ MBFIWrapper *MBFIin,
ProfileSummaryInfo *PSIin,
bool LayoutModeIn, unsigned TailDupSizeIn) {
MF = &MFin;
@@ -159,14 +159,16 @@ bool TailDuplicator::tailDuplicateAndUpdate(
bool IsSimple, MachineBasicBlock *MBB,
MachineBasicBlock *ForcedLayoutPred,
SmallVectorImpl<MachineBasicBlock*> *DuplicatedPreds,
- function_ref<void(MachineBasicBlock *)> *RemovalCallback) {
+ function_ref<void(MachineBasicBlock *)> *RemovalCallback,
+ SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
// Save the successors list.
SmallSetVector<MachineBasicBlock *, 8> Succs(MBB->succ_begin(),
MBB->succ_end());
SmallVector<MachineBasicBlock *, 8> TDBBs;
SmallVector<MachineInstr *, 16> Copies;
- if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred, TDBBs, Copies))
+ if (!tailDuplicate(IsSimple, MBB, ForcedLayoutPred,
+ TDBBs, Copies, CandidatePtr))
return false;
++NumTails;
@@ -204,11 +206,11 @@ bool TailDuplicator::tailDuplicateAndUpdate(
}
// Add the new vregs as available values.
- DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ DenseMap<Register, AvailableValsTy>::iterator LI =
SSAUpdateVals.find(VReg);
for (unsigned j = 0, ee = LI->second.size(); j != ee; ++j) {
MachineBasicBlock *SrcBB = LI->second[j].first;
- unsigned SrcReg = LI->second[j].second;
+ Register SrcReg = LI->second[j].second;
SSAUpdate.AddAvailableValue(SrcBB, SrcReg);
}
@@ -292,7 +294,7 @@ bool TailDuplicator::tailDuplicateBlocks() {
return MadeChange;
}
-static bool isDefLiveOut(unsigned Reg, MachineBasicBlock *BB,
+static bool isDefLiveOut(Register Reg, MachineBasicBlock *BB,
const MachineRegisterInfo *MRI) {
for (MachineInstr &UseMI : MRI->use_instructions(Reg)) {
if (UseMI.isDebugValue())
@@ -314,7 +316,7 @@ static unsigned getPHISrcRegOpIdx(MachineInstr *MI, MachineBasicBlock *SrcBB) {
// used to determine which registers are liveout while modifying the
// block (which is why we need to copy the information).
static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
- DenseSet<unsigned> *UsedByPhi) {
+ DenseSet<Register> *UsedByPhi) {
for (const auto &MI : BB) {
if (!MI.isPHI())
break;
@@ -326,9 +328,9 @@ static void getRegsUsedByPHIs(const MachineBasicBlock &BB,
}
/// Add a definition and source virtual registers pair for SSA update.
-void TailDuplicator::addSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
+void TailDuplicator::addSSAUpdateEntry(Register OrigReg, Register NewReg,
MachineBasicBlock *BB) {
- DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ DenseMap<Register, AvailableValsTy>::iterator LI =
SSAUpdateVals.find(OrigReg);
if (LI != SSAUpdateVals.end())
LI->second.push_back(std::make_pair(BB, NewReg));
@@ -344,9 +346,9 @@ void TailDuplicator::addSSAUpdateEntry(unsigned OrigReg, unsigned NewReg,
/// source register that's contributed by PredBB and update SSA update map.
void TailDuplicator::processPHI(
MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
- DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
- SmallVectorImpl<std::pair<unsigned, RegSubRegPair>> &Copies,
- const DenseSet<unsigned> &RegsUsedByPhi, bool Remove) {
+ DenseMap<Register, RegSubRegPair> &LocalVRMap,
+ SmallVectorImpl<std::pair<Register, RegSubRegPair>> &Copies,
+ const DenseSet<Register> &RegsUsedByPhi, bool Remove) {
Register DefReg = MI->getOperand(0).getReg();
unsigned SrcOpIdx = getPHISrcRegOpIdx(MI, PredBB);
assert(SrcOpIdx && "Unable to find matching PHI source?");
@@ -376,8 +378,8 @@ void TailDuplicator::processPHI(
/// the source operands due to earlier PHI translation.
void TailDuplicator::duplicateInstruction(
MachineInstr *MI, MachineBasicBlock *TailBB, MachineBasicBlock *PredBB,
- DenseMap<unsigned, RegSubRegPair> &LocalVRMap,
- const DenseSet<unsigned> &UsedByPhi) {
+ DenseMap<Register, RegSubRegPair> &LocalVRMap,
+ const DenseSet<Register> &UsedByPhi) {
// Allow duplication of CFI instructions.
if (MI->isCFIInstruction()) {
BuildMI(*PredBB, PredBB->end(), PredBB->findDebugLoc(PredBB->begin()),
@@ -502,7 +504,7 @@ void TailDuplicator::updateSuccessorsPHIs(
// If Idx is set, the operands at Idx and Idx+1 must be removed.
// We reuse the location to avoid expensive RemoveOperand calls.
- DenseMap<unsigned, AvailableValsTy>::iterator LI =
+ DenseMap<Register, AvailableValsTy>::iterator LI =
SSAUpdateVals.find(Reg);
if (LI != SSAUpdateVals.end()) {
// This register is defined in the tail block.
@@ -515,7 +517,7 @@ void TailDuplicator::updateSuccessorsPHIs(
if (!SrcBB->isSuccessor(SuccBB))
continue;
- unsigned SrcReg = LI->second[j].second;
+ Register SrcReg = LI->second[j].second;
if (Idx != 0) {
MI.getOperand(Idx).setReg(SrcReg);
MI.getOperand(Idx + 1).setMBB(SrcBB);
@@ -625,7 +627,9 @@ bool TailDuplicator::shouldTailDuplicate(bool IsSimple,
if (PreRegAlloc && MI.isCall())
return false;
- if (!MI.isPHI() && !MI.isMetaInstruction())
+ if (MI.isBundle())
+ InstrCount += MI.getBundleSize();
+ else if (!MI.isPHI() && !MI.isMetaInstruction())
InstrCount += 1;
if (InstrCount > MaxDuplicateCount)
@@ -704,7 +708,7 @@ bool TailDuplicator::canCompletelyDuplicateBB(MachineBasicBlock &BB) {
bool TailDuplicator::duplicateSimpleBB(
MachineBasicBlock *TailBB, SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- const DenseSet<unsigned> &UsedByPhi,
+ const DenseSet<Register> &UsedByPhi,
SmallVectorImpl<MachineInstr *> &Copies) {
SmallPtrSet<MachineBasicBlock *, 8> Succs(TailBB->succ_begin(),
TailBB->succ_end());
@@ -712,7 +716,7 @@ bool TailDuplicator::duplicateSimpleBB(
TailBB->pred_end());
bool Changed = false;
for (MachineBasicBlock *PredBB : Preds) {
- if (PredBB->hasEHPadSuccessor())
+ if (PredBB->hasEHPadSuccessor() || PredBB->mayHaveInlineAsmBr())
continue;
if (bothUsedInPHI(*PredBB, Succs))
@@ -802,13 +806,16 @@ bool TailDuplicator::canTailDuplicate(MachineBasicBlock *TailBB,
/// \p Copies A vector of copy instructions inserted. Used later to
/// walk all the inserted copies and remove redundant ones.
bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
- MachineBasicBlock *ForcedLayoutPred,
- SmallVectorImpl<MachineBasicBlock *> &TDBBs,
- SmallVectorImpl<MachineInstr *> &Copies) {
+ MachineBasicBlock *ForcedLayoutPred,
+ SmallVectorImpl<MachineBasicBlock *> &TDBBs,
+ SmallVectorImpl<MachineInstr *> &Copies,
+ SmallVectorImpl<MachineBasicBlock *> *CandidatePtr) {
LLVM_DEBUG(dbgs() << "\n*** Tail-duplicating " << printMBBReference(*TailBB)
<< '\n');
- DenseSet<unsigned> UsedByPhi;
+ bool ShouldUpdateTerminators = TailBB->canFallThrough();
+
+ DenseSet<Register> UsedByPhi;
getRegsUsedByPHIs(*TailBB, &UsedByPhi);
if (IsSimple)
@@ -818,8 +825,12 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
// block into them, if possible. Copying the list ahead of time also
// avoids trouble with the predecessor list reallocating.
bool Changed = false;
- SmallSetVector<MachineBasicBlock *, 8> Preds(TailBB->pred_begin(),
- TailBB->pred_end());
+ SmallSetVector<MachineBasicBlock *, 8> Preds;
+ if (CandidatePtr)
+ Preds.insert(CandidatePtr->begin(), CandidatePtr->end());
+ else
+ Preds.insert(TailBB->pred_begin(), TailBB->pred_end());
+
for (MachineBasicBlock *PredBB : Preds) {
assert(TailBB != PredBB &&
"Single-block loop should have been rejected earlier!");
@@ -828,13 +839,17 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
continue;
// Don't duplicate into a fall-through predecessor (at least for now).
- bool IsLayoutSuccessor = false;
- if (ForcedLayoutPred)
- IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
- else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
- IsLayoutSuccessor = true;
- if (IsLayoutSuccessor)
- continue;
+ // If profile is available, findDuplicateCandidates can choose a better
+ // fall-through predecessor.
+ if (!(MF->getFunction().hasProfileData() && LayoutMode)) {
+ bool IsLayoutSuccessor = false;
+ if (ForcedLayoutPred)
+ IsLayoutSuccessor = (ForcedLayoutPred == PredBB);
+ else if (PredBB->isLayoutSuccessor(TailBB) && PredBB->canFallThrough())
+ IsLayoutSuccessor = true;
+ if (IsLayoutSuccessor)
+ continue;
+ }
LLVM_DEBUG(dbgs() << "\nTail-duplicating into PredBB: " << *PredBB
<< "From Succ: " << *TailBB);
@@ -845,8 +860,8 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
TII->removeBranch(*PredBB);
// Clone the contents of TailBB into PredBB.
- DenseMap<unsigned, RegSubRegPair> LocalVRMap;
- SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
for (MachineBasicBlock::iterator I = TailBB->begin(), E = TailBB->end();
I != E; /* empty */) {
MachineInstr *MI = &*I;
@@ -872,6 +887,10 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
for (MachineBasicBlock *Succ : TailBB->successors())
PredBB->addSuccessor(Succ, MBPI->getEdgeProbability(TailBB, Succ));
+ // Update branches in pred to jump to tail's layout successor if needed.
+ if (ShouldUpdateTerminators)
+ PredBB->updateTerminator(TailBB->getNextNode());
+
Changed = true;
++NumTailDups;
}
@@ -901,8 +920,8 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
// duplicating the instructions in all cases.
TII->removeBranch(*PrevBB);
if (PreRegAlloc) {
- DenseMap<unsigned, RegSubRegPair> LocalVRMap;
- SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
MachineBasicBlock::iterator I = TailBB->begin();
// Process PHI instructions first.
while (I != TailBB->end() && I->isPHI()) {
@@ -930,6 +949,11 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
PrevBB->removeSuccessor(PrevBB->succ_begin());
assert(PrevBB->succ_empty());
PrevBB->transferSuccessors(TailBB);
+
+ // Update branches in PrevBB based on Tail's layout successor.
+ if (ShouldUpdateTerminators)
+ PrevBB->updateTerminator(TailBB->getNextNode());
+
TDBBs.push_back(PrevBB);
Changed = true;
}
@@ -964,8 +988,8 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
if (PredBB->succ_size() != 1)
continue;
- DenseMap<unsigned, RegSubRegPair> LocalVRMap;
- SmallVector<std::pair<unsigned, RegSubRegPair>, 4> CopyInfos;
+ DenseMap<Register, RegSubRegPair> LocalVRMap;
+ SmallVector<std::pair<Register, RegSubRegPair>, 4> CopyInfos;
MachineBasicBlock::iterator I = TailBB->begin();
// Process PHI instructions first.
while (I != TailBB->end() && I->isPHI()) {
@@ -983,7 +1007,7 @@ bool TailDuplicator::tailDuplicate(bool IsSimple, MachineBasicBlock *TailBB,
/// At the end of the block \p MBB generate COPY instructions between registers
/// described by \p CopyInfos. Append resulting instructions to \p Copies.
void TailDuplicator::appendCopies(MachineBasicBlock *MBB,
- SmallVectorImpl<std::pair<unsigned,RegSubRegPair>> &CopyInfos,
+ SmallVectorImpl<std::pair<Register, RegSubRegPair>> &CopyInfos,
SmallVectorImpl<MachineInstr*> &Copies) {
MachineBasicBlock::iterator Loc = MBB->getFirstTerminator();
const MCInstrDesc &CopyD = TII->get(TargetOpcode::COPY);
@@ -1002,6 +1026,13 @@ void TailDuplicator::removeDeadBlock(
assert(MBB->pred_empty() && "MBB must be dead!");
LLVM_DEBUG(dbgs() << "\nRemoving MBB: " << *MBB);
+ MachineFunction *MF = MBB->getParent();
+ // Update the call site info.
+ std::for_each(MBB->begin(), MBB->end(), [MF](const MachineInstr &MI) {
+ if (MI.shouldUpdateCallSiteInfo())
+ MF->eraseCallSiteInfo(&MI);
+ });
+
if (RemovalCallback)
(*RemovalCallback)(MBB);
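
A small stand-alone sketch of the size-accounting change in shouldTailDuplicate above: bundles now contribute their bundled instruction count, while PHIs and meta instructions stay free. The types here are hypothetical stand-ins, not MachineInstr:

#include <vector>

// Hypothetical stand-in for the queries made on each MachineInstr.
struct InstrSummary {
  bool IsBundle;
  unsigned BundleSize;  // number of instructions inside the bundle
  bool IsPHI;
  bool IsMeta;          // debug/meta instruction
};

unsigned duplicationCost(const std::vector<InstrSummary> &Block) {
  unsigned Count = 0;
  for (const InstrSummary &MI : Block) {
    if (MI.IsBundle)
      Count += MI.BundleSize;        // a bundle counts as all of its members
    else if (!MI.IsPHI && !MI.IsMeta)
      Count += 1;                    // ordinary instruction
  }
  return Count;
}
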
diff --git a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
index bc59be890c97..f8b482c04a58 100644
--- a/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
+++ b/llvm/lib/CodeGen/TargetFrameLoweringImpl.cpp
@@ -10,17 +10,17 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/ADT/BitVector.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstrTypes.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Target/TargetMachine.h"
@@ -42,7 +42,8 @@ bool TargetFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const
/// (in output arg FrameReg). This is the default implementation which
/// is overridden for some targets.
int TargetFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg) const {
+ int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
@@ -140,8 +141,8 @@ bool TargetFrameLowering::isSafeForNoCSROpt(const Function &F) {
return false;
// Function should not be optimized as tail call.
for (const User *U : F.users())
- if (auto CS = ImmutableCallSite(U))
- if (CS.isTailCall())
+ if (auto *CB = dyn_cast<CallBase>(U))
+ if (CB->isTailCall())
return false;
return true;
}
@@ -150,7 +151,13 @@ int TargetFrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
llvm_unreachable("getInitialCFAOffset() not implemented!");
}
-unsigned TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF)
- const {
+Register
+TargetFrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
llvm_unreachable("getInitialCFARegister() not implemented!");
}
+
+TargetFrameLowering::DwarfFrameBase
+TargetFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
+ const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
+ return DwarfFrameBase{DwarfFrameBase::Register, {RI->getFrameRegister(MF)}};
+}
diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp
index a98c627dab09..24f3f96d0b1d 100644
--- a/llvm/lib/CodeGen/TargetInstrInfo.cpp
+++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
@@ -104,14 +105,14 @@ unsigned TargetInstrInfo::getInlineAsmLength(
AtInsnStart = false;
}
- if (AtInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+ if (AtInsnStart && !isSpace(static_cast<unsigned char>(*Str))) {
unsigned AddLength = MaxInstLength;
if (strncmp(Str, ".space", 6) == 0) {
char *EStr;
int SpaceSize;
SpaceSize = strtol(Str + 6, &EStr, 10);
SpaceSize = SpaceSize < 0 ? 0 : SpaceSize;
- while (*EStr != '\n' && std::isspace(static_cast<unsigned char>(*EStr)))
+ while (*EStr != '\n' && isSpace(static_cast<unsigned char>(*EStr)))
++EStr;
if (*EStr == '\0' || *EStr == '\n' ||
isAsmComment(EStr, MAI)) // Successfully parsed .space argument
@@ -143,7 +144,7 @@ TargetInstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
// from the end of MBB.
while (Tail != MBB->end()) {
auto MI = Tail++;
- if (MI->isCall())
+ if (MI->shouldUpdateCallSiteInfo())
MBB->getParent()->eraseCallSiteInfo(&*MI);
MBB->erase(MI);
}
@@ -408,7 +409,7 @@ bool TargetInstrInfo::getStackSlotRange(const TargetRegisterClass *RC,
void TargetInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
MachineInstr *MI = MBB.getParent()->CloneMachineInstr(&Orig);
@@ -591,11 +592,15 @@ MachineInstr *TargetInstrInfo::foldMemoryOperand(MachineInstr &MI,
NewMI->mayLoad()) &&
"Folded a use to a non-load!");
assert(MFI.getObjectOffset(FI) != -1);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF, FI), Flags, MemSize,
- MFI.getObjectAlignment(FI));
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
+ Flags, MemSize, MFI.getObjectAlign(FI));
NewMI->addMemOperand(MF, MMO);
+ // The pass "x86 speculative load hardening" always attaches symbols to
+ // call instructions. We need to copy them from the old instruction.
+ NewMI->cloneInstrSymbols(MF, MI);
+
return NewMI;
}
@@ -699,10 +704,13 @@ bool TargetInstrInfo::hasReassociableSibling(const MachineInstr &Inst,
std::swap(MI1, MI2);
// 1. The previous instruction must be the same type as Inst.
- // 2. The previous instruction must have virtual register definitions for its
+ // 2. The previous instruction must also be associative/commutative (this can
+ // be different even for instructions with the same opcode if traits like
+ // fast-math-flags are included).
+ // 3. The previous instruction must have virtual register definitions for its
// operands in the same basic block as Inst.
- // 3. The previous instruction's result must only be used by Inst.
- return MI1->getOpcode() == AssocOpcode &&
+ // 4. The previous instruction's result must only be used by Inst.
+ return MI1->getOpcode() == AssocOpcode && isAssociativeAndCommutative(*MI1) &&
hasReassociableOperands(*MI1, MBB) &&
MRI.hasOneNonDBGUse(MI1->getOperand(0).getReg());
}
@@ -991,6 +999,10 @@ bool TargetInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
if (MI.isTerminator() || MI.isPosition())
return true;
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
+
// Don't attempt to schedule around any instruction that defines
// a stack-oriented pointer, as it's unlikely to be profitable. This
// saves compile time, because it doesn't require every single
@@ -1028,6 +1040,20 @@ CreateTargetPostRAHazardRecognizer(const InstrItineraryData *II,
return new ScoreboardHazardRecognizer(II, DAG, "post-RA-sched");
}
+// Default implementation of getMemOperandWithOffset.
+bool TargetInstrInfo::getMemOperandWithOffset(
+ const MachineInstr &MI, const MachineOperand *&BaseOp, int64_t &Offset,
+ bool &OffsetIsScalable, const TargetRegisterInfo *TRI) const {
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ unsigned Width;
+ if (!getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
+ Width, TRI) ||
+ BaseOps.size() != 1)
+ return false;
+ BaseOp = BaseOps.front();
+ return true;
+}
+
//===----------------------------------------------------------------------===//
// SelectionDAG latency interface.
//===----------------------------------------------------------------------===//
@@ -1125,6 +1151,7 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
DIExpression *Expr = DIExpression::get(MF->getFunction().getContext(), {});
int64_t Offset;
+ bool OffsetIsScalable;
// To simplify the sub-register handling, verify that we only need to
// consider physical registers.
@@ -1134,6 +1161,11 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
if (auto DestSrc = isCopyInstr(MI)) {
Register DestReg = DestSrc->Destination->getReg();
+ // If the copy destination is the forwarding reg, describe the forwarding
+ // reg using the copy source as the backup location. Example:
+ //
+ // x0 = MOV x7
+ // call callee(x0) ; x0 described as x7
if (Reg == DestReg)
return ParamLoadedValue(*DestSrc->Source, Expr);
@@ -1163,11 +1195,22 @@ TargetInstrInfo::describeLoadedValue(const MachineInstr &MI,
return None;
const MachineOperand *BaseOp;
- if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI))
+ if (!TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable,
+ TRI))
return None;
- assert(MI.getNumExplicitDefs() == 1 &&
- "Can currently only handle mem instructions with a single define");
+ // FIXME: Scalable offsets are not yet handled in the offset code below.
+ if (OffsetIsScalable)
+ return None;
+
+ // TODO: Can currently only handle mem instructions with a single define.
+ // An example from the x86 target:
+ // ...
+ // DIV64m $rsp, 1, $noreg, 24, $noreg, implicit-def dead $rax, implicit-def $rdx
+ // ...
+ //
+ if (MI.getNumExplicitDefs() != 1)
+ return None;
// TODO: In what way do we need to take Reg into consideration here?
@@ -1290,4 +1333,60 @@ bool TargetInstrInfo::getInsertSubregInputs(
return true;
}
+// Returns a MIRPrinter comment for this machine operand.
+std::string TargetInstrInfo::createMIROperandComment(
+ const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx,
+ const TargetRegisterInfo *TRI) const {
+
+ if (!MI.isInlineAsm())
+ return "";
+
+ std::string Flags;
+ raw_string_ostream OS(Flags);
+
+ if (OpIdx == InlineAsm::MIOp_ExtraInfo) {
+ // Print HasSideEffects, MayLoad, MayStore, IsAlignStack
+ unsigned ExtraInfo = Op.getImm();
+ bool First = true;
+ for (StringRef Info : InlineAsm::getExtraInfoNames(ExtraInfo)) {
+ if (!First)
+ OS << " ";
+ First = false;
+ OS << Info;
+ }
+
+ return OS.str();
+ }
+
+ int FlagIdx = MI.findInlineAsmFlagIdx(OpIdx);
+ if (FlagIdx < 0 || (unsigned)FlagIdx != OpIdx)
+ return "";
+
+ assert(Op.isImm() && "Expected flag operand to be an immediate");
+ // Pretty print the inline asm operand descriptor.
+ unsigned Flag = Op.getImm();
+ unsigned Kind = InlineAsm::getKind(Flag);
+ OS << InlineAsm::getKindName(Kind);
+
+ unsigned RCID = 0;
+ if (!InlineAsm::isImmKind(Flag) && !InlineAsm::isMemKind(Flag) &&
+ InlineAsm::hasRegClassConstraint(Flag, RCID)) {
+ if (TRI) {
+ OS << ':' << TRI->getRegClassName(TRI->getRegClass(RCID));
+ } else
+ OS << ":RC" << RCID;
+ }
+
+ if (InlineAsm::isMemKind(Flag)) {
+ unsigned MCID = InlineAsm::getMemoryConstraintID(Flag);
+ OS << ":" << InlineAsm::getMemConstraintName(MCID);
+ }
+
+ unsigned TiedTo = 0;
+ if (InlineAsm::isUseOperandTiedToDef(Flag, TiedTo))
+ OS << " tiedto:$" << TiedTo;
+
+ return OS.str();
+}
+
TargetInstrInfo::PipelinerLoopInfo::~PipelinerLoopInfo() {}
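
A rough sketch of the forwarding pattern behind the new getMemOperandWithOffset default: the single-base query succeeds only when the multi-base query reports exactly one base operand. Types and names below are simplified stand-ins, not the LLVM signatures:

#include <cstdint>
#include <functional>
#include <vector>

struct Operand {};  // stand-in for a base-register operand

using MultiBaseQuery =
    std::function<bool(std::vector<const Operand *> &Bases, int64_t &Offset)>;

// Forward the single-base query to the multi-base one, and accept the result
// only when exactly one base operand was reported.
bool getSingleBaseWithOffset(const MultiBaseQuery &QueryBases,
                             const Operand *&Base, int64_t &Offset) {
  std::vector<const Operand *> Bases;
  if (!QueryBases(Bases, Offset) || Bases.size() != 1)
    return false;  // unanalyzable, or more than one base register
  Base = Bases.front();
  return true;
}
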
diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp
index e5a7b70d82c8..2c94c2c62e5f 100644
--- a/llvm/lib/CodeGen/TargetLoweringBase.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp
@@ -17,6 +17,8 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/Analysis.h"
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -51,6 +53,7 @@
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -612,7 +615,7 @@ void TargetLoweringBase::initActions() {
std::end(TargetDAGCombineArray), 0);
for (MVT VT : MVT::fp_valuetypes()) {
- MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits());
+ MVT IntVT = MVT::getIntegerVT(VT.getSizeInBits().getFixedSize());
if (IntVT.isValid()) {
setOperationAction(ISD::ATOMIC_SWAP, VT, Promote);
AddPromotedToType(ISD::ATOMIC_SWAP, VT, IntVT);
@@ -659,7 +662,9 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::UMULFIX, VT, Expand);
setOperationAction(ISD::UMULFIXSAT, VT, Expand);
setOperationAction(ISD::SDIVFIX, VT, Expand);
+ setOperationAction(ISD::SDIVFIXSAT, VT, Expand);
setOperationAction(ISD::UDIVFIX, VT, Expand);
+ setOperationAction(ISD::UDIVFIXSAT, VT, Expand);
// Overflow operations default to expand
setOperationAction(ISD::SADDO, VT, Expand);
@@ -688,6 +693,7 @@ void TargetLoweringBase::initActions() {
// These library functions default to expand.
setOperationAction(ISD::FROUND, VT, Expand);
+ setOperationAction(ISD::FROUNDEVEN, VT, Expand);
setOperationAction(ISD::FPOWI, VT, Expand);
// These operations default to expand for vector types.
@@ -701,7 +707,7 @@ void TargetLoweringBase::initActions() {
}
// Constrained floating-point operations default to expand.
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define DAG_INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
setOperationAction(ISD::STRICT_##DAGN, VT, Expand);
#include "llvm/IR/ConstrainedOps.def"
@@ -753,6 +759,7 @@ void TargetLoweringBase::initActions() {
setOperationAction(ISD::FRINT, VT, Expand);
setOperationAction(ISD::FTRUNC, VT, Expand);
setOperationAction(ISD::FROUND, VT, Expand);
+ setOperationAction(ISD::FROUNDEVEN, VT, Expand);
setOperationAction(ISD::LROUND, VT, Expand);
setOperationAction(ISD::LLROUND, VT, Expand);
setOperationAction(ISD::LRINT, VT, Expand);
@@ -810,6 +817,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
LegalizeTypeAction LA = ValueTypeActions.getTypeAction(SVT);
assert((LA == TypeLegal || LA == TypeSoftenFloat ||
+ LA == TypeSoftPromoteHalf ||
(NVT.isVector() ||
ValueTypeActions.getTypeAction(NVT) != TypePromoteInteger)) &&
"Promote may not follow Expand or Promote");
@@ -817,7 +825,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
if (LA == TypeSplitVector)
return LegalizeKind(LA,
EVT::getVectorVT(Context, SVT.getVectorElementType(),
- SVT.getVectorNumElements() / 2));
+ SVT.getVectorElementCount() / 2));
if (LA == TypeScalarizeVector)
return LegalizeKind(LA, SVT.getVectorElementType());
return LegalizeKind(LA, NVT);
@@ -844,13 +852,16 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
}
// Handle vector types.
- unsigned NumElts = VT.getVectorNumElements();
+ ElementCount NumElts = VT.getVectorElementCount();
EVT EltVT = VT.getVectorElementType();
// Vectors with only one element are always scalarized.
if (NumElts == 1)
return LegalizeKind(TypeScalarizeVector, EltVT);
+ if (VT.getVectorElementCount() == ElementCount(1, true))
+ report_fatal_error("Cannot legalize this vector");
+
// Try to widen vector elements until the element type is a power of two and
// promote it to a legal type later on, for example:
// <3 x i8> -> <4 x i8> -> <4 x i32>
@@ -858,7 +869,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
// Vectors with a number of elements that is not a power of two are always
// widened, for example <3 x i8> -> <4 x i8>.
if (!VT.isPow2VectorType()) {
- NumElts = (unsigned)NextPowerOf2(NumElts);
+ NumElts = NumElts.NextPowerOf2();
EVT NVT = EVT::getVectorVT(Context, EltVT, NumElts);
return LegalizeKind(TypeWidenVector, NVT);
}
@@ -907,7 +918,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
// If there is no wider legal type, split the vector.
while (true) {
// Round up to the next power of 2.
- NumElts = (unsigned)NextPowerOf2(NumElts);
+ NumElts = NumElts.NextPowerOf2();
// If there is no simple vector type with this many elements then there
// cannot be a larger legal vector type. Note that this assumes that
@@ -930,7 +941,7 @@ TargetLoweringBase::getTypeConversion(LLVMContext &Context, EVT VT) const {
}
// Vectors with illegal element types are expanded.
- EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorNumElements() / 2);
+ EVT NVT = EVT::getVectorVT(Context, EltVT, VT.getVectorElementCount() / 2);
return LegalizeKind(TypeSplitVector, NVT);
}
@@ -939,42 +950,51 @@ static unsigned getVectorTypeBreakdownMVT(MVT VT, MVT &IntermediateVT,
MVT &RegisterVT,
TargetLoweringBase *TLI) {
// Figure out the right, legal destination reg to copy into.
- unsigned NumElts = VT.getVectorNumElements();
+ ElementCount EC = VT.getVectorElementCount();
MVT EltTy = VT.getVectorElementType();
unsigned NumVectorRegs = 1;
- // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we
- // could break down into LHS/RHS like LegalizeDAG does.
- if (!isPowerOf2_32(NumElts)) {
- NumVectorRegs = NumElts;
- NumElts = 1;
+ // Scalable vectors cannot be scalarized, so splitting or widening is
+ // required.
+ if (VT.isScalableVector() && !isPowerOf2_32(EC.Min))
+ llvm_unreachable(
+ "Splitting or widening of non-power-of-2 MVTs is not implemented.");
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now.
+ // Ideally we could break down into LHS/RHS like LegalizeDAG does.
+ if (!isPowerOf2_32(EC.Min)) {
+ // Split EC to unit size (scalable property is preserved).
+ NumVectorRegs = EC.Min;
+ EC = EC / NumVectorRegs;
}
- // Divide the input until we get to a supported size. This will always
- // end with a scalar if the target doesn't support vectors.
- while (NumElts > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, NumElts))) {
- NumElts >>= 1;
+ // Divide the input until we get to a supported size. This will
+ // always end up with an EC that represent a scalar or a scalable
+ // scalar.
+ while (EC.Min > 1 && !TLI->isTypeLegal(MVT::getVectorVT(EltTy, EC))) {
+ EC.Min >>= 1;
NumVectorRegs <<= 1;
}
NumIntermediates = NumVectorRegs;
- MVT NewVT = MVT::getVectorVT(EltTy, NumElts);
+ MVT NewVT = MVT::getVectorVT(EltTy, EC);
if (!TLI->isTypeLegal(NewVT))
NewVT = EltTy;
IntermediateVT = NewVT;
- unsigned NewVTSize = NewVT.getSizeInBits();
+ unsigned LaneSizeInBits = NewVT.getScalarSizeInBits().getFixedSize();
// Convert sizes such as i33 to i64.
- if (!isPowerOf2_32(NewVTSize))
- NewVTSize = NextPowerOf2(NewVTSize);
+ if (!isPowerOf2_32(LaneSizeInBits))
+ LaneSizeInBits = NextPowerOf2(LaneSizeInBits);
MVT DestVT = TLI->getRegisterType(NewVT);
RegisterVT = DestVT;
if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
- return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+ return NumVectorRegs *
+ (LaneSizeInBits / DestVT.getScalarSizeInBits().getFixedSize());
// Otherwise, promotion or legal types use the same number of registers as
// the vector decimated to the appropriate level.
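
A compact sketch of the element-count loop above, with a hypothetical {Min, Scalable} pair standing in for llvm::ElementCount and a caller-supplied legality predicate:

#include <cassert>

struct Count {
  unsigned Min;   // known minimum number of lanes
  bool Scalable;  // true for scalable vectors
};

// Mirror of the loop above: a non-power-of-2 count is first split into unit
// lanes, then the lane count is halved until the caller-supplied predicate
// accepts it. Returns the number of part registers; PartCount receives the
// element count of each part (the Scalable flag is preserved throughout).
unsigned breakDownVector(Count EC, bool (*IsLegal)(Count), Count &PartCount) {
  assert(EC.Min >= 1 && "expected at least one element");
  unsigned NumRegs = 1;
  if (EC.Min & (EC.Min - 1)) {  // not a power of two
    NumRegs = EC.Min;
    EC.Min = 1;
  }
  while (EC.Min > 1 && !IsLegal(EC)) {
    EC.Min >>= 1;
    NumRegs <<= 1;
  }
  PartCount = EC;
  return NumRegs;
}
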
@@ -1012,20 +1032,25 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
// all stack slots), but we need to handle the different type of stackmap
// operands and memory effects here.
- // MI changes inside this loop as we grow operands.
- for(unsigned OperIdx = 0; OperIdx != MI->getNumOperands(); ++OperIdx) {
- MachineOperand &MO = MI->getOperand(OperIdx);
- if (!MO.isFI())
+ if (!llvm::any_of(MI->operands(),
+ [](MachineOperand &Operand) { return Operand.isFI(); }))
+ return MBB;
+
+ MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), MI->getDesc());
+
+ // Inherit previous memory operands.
+ MIB.cloneMemRefs(*MI);
+
+ for (auto &MO : MI->operands()) {
+ if (!MO.isFI()) {
+ MIB.add(MO);
continue;
+ }
// foldMemoryOperand builds a new MI after replacing a single FI operand
// with the canonical set of five x86 addressing-mode operands.
int FI = MO.getIndex();
- MachineInstrBuilder MIB = BuildMI(MF, MI->getDebugLoc(), MI->getDesc());
- // Copy operands before the frame-index.
- for (unsigned i = 0; i < OperIdx; ++i)
- MIB.add(MI->getOperand(i));
// Add frame index operands recognized by stackmaps.cpp
if (MFI.isStatepointSpillSlotObjectIndex(FI)) {
// indirect-mem-ref tag, size, #FI, offset.
@@ -1035,21 +1060,16 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
assert(MI->getOpcode() == TargetOpcode::STATEPOINT && "sanity");
MIB.addImm(StackMaps::IndirectMemRefOp);
MIB.addImm(MFI.getObjectSize(FI));
- MIB.add(MI->getOperand(OperIdx));
+ MIB.add(MO);
MIB.addImm(0);
} else {
// direct-mem-ref tag, #FI, offset.
// Used by patchpoint, and direct alloca arguments to statepoints
MIB.addImm(StackMaps::DirectMemRefOp);
- MIB.add(MI->getOperand(OperIdx));
+ MIB.add(MO);
MIB.addImm(0);
}
- // Copy the operands after the frame index.
- for (unsigned i = OperIdx + 1; i != MI->getNumOperands(); ++i)
- MIB.add(MI->getOperand(i));
- // Inherit previous memory operands.
- MIB.cloneMemRefs(*MI);
assert(MIB->mayLoad() && "Folded a stackmap use to a non-load!");
// Add a new memory operand for this FI.
@@ -1061,16 +1081,12 @@ TargetLoweringBase::emitPatchPoint(MachineInstr &InitialMI,
auto Flags = MachineMemOperand::MOLoad;
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), Flags,
- MF.getDataLayout().getPointerSize(), MFI.getObjectAlignment(FI));
+ MF.getDataLayout().getPointerSize(), MFI.getObjectAlign(FI));
MIB->addMemOperand(MF, MMO);
}
-
- // Replace the instruction and update the operand index.
- MBB->insert(MachineBasicBlock::iterator(MI), MIB);
- OperIdx += (MIB->getNumOperands() - MI->getNumOperands()) - 1;
- MI->eraseFromParent();
- MI = MIB;
}
+ MBB->insert(MachineBasicBlock::iterator(MI), MIB);
+ MI->eraseFromParent();
return MBB;
}
@@ -1228,10 +1244,18 @@ void TargetLoweringBase::computeRegisterProperties(
// promote it to f32, because there are no f16 library calls (except for
// conversions).
if (!isTypeLegal(MVT::f16)) {
- NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
- RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
- TransformToType[MVT::f16] = MVT::f32;
- ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
+ // Allow targets to control how we legalize half.
+ if (softPromoteHalfType()) {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::i16];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::i16];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypeSoftPromoteHalf);
+ } else {
+ NumRegistersForVT[MVT::f16] = NumRegistersForVT[MVT::f32];
+ RegisterTypeForVT[MVT::f16] = RegisterTypeForVT[MVT::f32];
+ TransformToType[MVT::f16] = MVT::f32;
+ ValueTypeActions.setTypeAction(MVT::f16, TypePromoteFloat);
+ }
}
// Loop over all of the vector value types to see which need transformations.
@@ -1242,7 +1266,7 @@ void TargetLoweringBase::computeRegisterProperties(
continue;
MVT EltVT = VT.getVectorElementType();
- unsigned NElts = VT.getVectorNumElements();
+ ElementCount EC = VT.getVectorElementCount();
bool IsLegalWiderType = false;
bool IsScalable = VT.isScalableVector();
LegalizeTypeAction PreferredAction = getPreferredVectorAction(VT);
@@ -1259,8 +1283,7 @@ void TargetLoweringBase::computeRegisterProperties(
// Promote vectors of integers to vectors with the same number
// of elements, with a wider element type.
if (SVT.getScalarSizeInBits() > EltVT.getSizeInBits() &&
- SVT.getVectorNumElements() == NElts &&
- SVT.isScalableVector() == IsScalable && isTypeLegal(SVT)) {
+ SVT.getVectorElementCount() == EC && isTypeLegal(SVT)) {
TransformToType[i] = SVT;
RegisterTypeForVT[i] = SVT;
NumRegistersForVT[i] = 1;
@@ -1275,13 +1298,13 @@ void TargetLoweringBase::computeRegisterProperties(
}
case TypeWidenVector:
- if (isPowerOf2_32(NElts)) {
+ if (isPowerOf2_32(EC.Min)) {
// Try to widen the vector.
for (unsigned nVT = i + 1; nVT <= MVT::LAST_VECTOR_VALUETYPE; ++nVT) {
MVT SVT = (MVT::SimpleValueType) nVT;
- if (SVT.getVectorElementType() == EltVT
- && SVT.getVectorNumElements() > NElts
- && SVT.isScalableVector() == IsScalable && isTypeLegal(SVT)) {
+ if (SVT.getVectorElementType() == EltVT &&
+ SVT.isScalableVector() == IsScalable &&
+ SVT.getVectorElementCount().Min > EC.Min && isTypeLegal(SVT)) {
TransformToType[i] = SVT;
RegisterTypeForVT[i] = SVT;
NumRegistersForVT[i] = 1;
@@ -1325,10 +1348,12 @@ void TargetLoweringBase::computeRegisterProperties(
ValueTypeActions.setTypeAction(VT, TypeScalarizeVector);
else if (PreferredAction == TypeSplitVector)
ValueTypeActions.setTypeAction(VT, TypeSplitVector);
+ else if (EC.Min > 1)
+ ValueTypeActions.setTypeAction(VT, TypeSplitVector);
else
- // Set type action according to the number of elements.
- ValueTypeActions.setTypeAction(VT, NElts == 1 ? TypeScalarizeVector
- : TypeSplitVector);
+ ValueTypeActions.setTypeAction(VT, EC.Scalable
+ ? TypeScalarizeScalableVector
+ : TypeScalarizeVector);
} else {
TransformToType[i] = NVT;
ValueTypeActions.setTypeAction(VT, TypeWidenVector);
@@ -1376,7 +1401,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
EVT &IntermediateVT,
unsigned &NumIntermediates,
MVT &RegisterVT) const {
- unsigned NumElts = VT.getVectorNumElements();
+ ElementCount EltCnt = VT.getVectorElementCount();
// If there is a wider vector type with the same element type as this one,
// or a promoted vector type that has the same number of elements which
@@ -1384,7 +1409,7 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
// This handles things like <2 x float> -> <4 x float> and
// <4 x i1> -> <4 x i32>.
LegalizeTypeAction TA = getTypeAction(Context, VT);
- if (NumElts != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
+ if (EltCnt.Min != 1 && (TA == TypeWidenVector || TA == TypePromoteInteger)) {
EVT RegisterEVT = getTypeToTransformTo(Context, VT);
if (isTypeLegal(RegisterEVT)) {
IntermediateVT = RegisterEVT;
@@ -1399,38 +1424,64 @@ unsigned TargetLoweringBase::getVectorTypeBreakdown(LLVMContext &Context, EVT VT
unsigned NumVectorRegs = 1;
- // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally we
- // could break down into LHS/RHS like LegalizeDAG does.
- if (!isPowerOf2_32(NumElts)) {
- NumVectorRegs = NumElts;
- NumElts = 1;
+ // Scalable vectors cannot be scalarized, so handle the legalisation of the
+ // types as is done elsewhere in SelectionDAG.
+ if (VT.isScalableVector() && !isPowerOf2_32(EltCnt.Min)) {
+ LegalizeKind LK;
+ EVT PartVT = VT;
+ do {
+ // Iterate until we've found a legal (part) type to hold VT.
+ LK = getTypeConversion(Context, PartVT);
+ PartVT = LK.second;
+ } while (LK.first != TypeLegal);
+
+ NumIntermediates =
+ VT.getVectorElementCount().Min / PartVT.getVectorElementCount().Min;
+
+ // FIXME: This code needs to be extended to handle more complex vector
+ // breakdowns, like nxv7i64 -> nxv8i64 -> 4 x nxv2i64. Currently the only
+ // supported cases are vectors that are broken down into equal parts
+ // such as nxv6i64 -> 3 x nxv2i64.
+ assert(NumIntermediates * PartVT.getVectorElementCount().Min ==
+ VT.getVectorElementCount().Min &&
+ "Expected an integer multiple of PartVT");
+ IntermediateVT = PartVT;
+ RegisterVT = getRegisterType(Context, IntermediateVT);
+ return NumIntermediates;
+ }
+
+ // FIXME: We don't support non-power-of-2-sized vectors for now. Ideally
+ // we could break down into LHS/RHS like LegalizeDAG does.
+ if (!isPowerOf2_32(EltCnt.Min)) {
+ NumVectorRegs = EltCnt.Min;
+ EltCnt.Min = 1;
}
// Divide the input until we get to a supported size. This will always
// end with a scalar if the target doesn't support vectors.
- while (NumElts > 1 && !isTypeLegal(
- EVT::getVectorVT(Context, EltTy, NumElts))) {
- NumElts >>= 1;
+ while (EltCnt.Min > 1 &&
+ !isTypeLegal(EVT::getVectorVT(Context, EltTy, EltCnt))) {
+ EltCnt.Min >>= 1;
NumVectorRegs <<= 1;
}
NumIntermediates = NumVectorRegs;
- EVT NewVT = EVT::getVectorVT(Context, EltTy, NumElts);
+ EVT NewVT = EVT::getVectorVT(Context, EltTy, EltCnt);
if (!isTypeLegal(NewVT))
NewVT = EltTy;
IntermediateVT = NewVT;
MVT DestVT = getRegisterType(Context, NewVT);
RegisterVT = DestVT;
- unsigned NewVTSize = NewVT.getSizeInBits();
- // Convert sizes such as i33 to i64.
- if (!isPowerOf2_32(NewVTSize))
- NewVTSize = NextPowerOf2(NewVTSize);
-
- if (EVT(DestVT).bitsLT(NewVT)) // Value is expanded, e.g. i64 -> i16.
+ if (EVT(DestVT).bitsLT(NewVT)) { // Value is expanded, e.g. i64 -> i16.
+ TypeSize NewVTSize = NewVT.getSizeInBits();
+ // Convert sizes such as i33 to i64.
+ if (!isPowerOf2_32(NewVTSize.getKnownMinSize()))
+ NewVTSize = NewVTSize.NextPowerOf2();
return NumVectorRegs*(NewVTSize/DestVT.getSizeInBits());
+ }
// Otherwise, promotion or legal types use the same number of registers as
// the vector decimated to the appropriate level.
@@ -1517,19 +1568,19 @@ void llvm::GetReturnInfo(CallingConv::ID CC, Type *ReturnType,
/// alignment, not its logarithm.
unsigned TargetLoweringBase::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
- return DL.getABITypeAlignment(Ty);
+ return DL.getABITypeAlign(Ty).value();
}
bool TargetLoweringBase::allowsMemoryAccessForAlignment(
LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
- unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast) const {
+ Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const {
// Check if the specified alignment is sufficient based on the data layout.
// TODO: While using the data layout works in practice, a better solution
// would be to implement this check directly (make this a virtual function).
// For example, the ABI alignment may change based on software platform while
// this function should only be affected by hardware implementation.
Type *Ty = VT.getTypeForEVT(Context);
- if (Alignment >= DL.getABITypeAlignment(Ty)) {
+ if (Alignment >= DL.getABITypeAlign(Ty)) {
// Assume that an access that meets the ABI-specified alignment is fast.
if (Fast != nullptr)
*Fast = true;
@@ -1537,20 +1588,22 @@ bool TargetLoweringBase::allowsMemoryAccessForAlignment(
}
// This is a misaligned access.
- return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment, Flags, Fast);
+ return allowsMisalignedMemoryAccesses(VT, AddrSpace, Alignment.value(), Flags,
+ Fast);
}
bool TargetLoweringBase::allowsMemoryAccessForAlignment(
LLVMContext &Context, const DataLayout &DL, EVT VT,
const MachineMemOperand &MMO, bool *Fast) const {
return allowsMemoryAccessForAlignment(Context, DL, VT, MMO.getAddrSpace(),
- MMO.getAlignment(), MMO.getFlags(),
- Fast);
+ MMO.getAlign(), MMO.getFlags(), Fast);
}
-bool TargetLoweringBase::allowsMemoryAccess(
- LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
- unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast) const {
+bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
+ const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const {
return allowsMemoryAccessForAlignment(Context, DL, VT, AddrSpace, Alignment,
Flags, Fast);
}
@@ -1559,8 +1612,8 @@ bool TargetLoweringBase::allowsMemoryAccess(LLVMContext &Context,
const DataLayout &DL, EVT VT,
const MachineMemOperand &MMO,
bool *Fast) const {
- return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(),
- MMO.getAlignment(), MMO.getFlags(), Fast);
+ return allowsMemoryAccess(Context, DL, VT, MMO.getAddrSpace(), MMO.getAlign(),
+ MMO.getFlags(), Fast);
}
BranchProbability TargetLoweringBase::getPredictableBranchThreshold() const {
@@ -1644,7 +1697,7 @@ int TargetLoweringBase::InstructionOpcodeToISD(unsigned Opcode) const {
case ExtractValue: return ISD::MERGE_VALUES;
case InsertValue: return ISD::MERGE_VALUES;
case LandingPad: return 0;
- case Freeze: return 0;
+ case Freeze: return ISD::FREEZE;
}
llvm_unreachable("Unknown instruction type encountered!");
@@ -1818,6 +1871,10 @@ void TargetLoweringBase::setMaximumJumpTableSize(unsigned Val) {
MaximumJumpTableSize = Val;
}
+bool TargetLoweringBase::isJumpTableRelative() const {
+ return getTargetMachine().isPositionIndependent();
+}
+
//===----------------------------------------------------------------------===//
// Reciprocal Estimates
//===----------------------------------------------------------------------===//
@@ -2005,3 +2062,119 @@ int TargetLoweringBase::getDivRefinementSteps(EVT VT,
void TargetLoweringBase::finalizeLowering(MachineFunction &MF) const {
MF.getRegInfo().freezeReservedRegs(MF);
}
+
+MachineMemOperand::Flags
+TargetLoweringBase::getLoadMemOperandFlags(const LoadInst &LI,
+ const DataLayout &DL) const {
+ MachineMemOperand::Flags Flags = MachineMemOperand::MOLoad;
+ if (LI.isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+
+ if (LI.hasMetadata(LLVMContext::MD_nontemporal))
+ Flags |= MachineMemOperand::MONonTemporal;
+
+ if (LI.hasMetadata(LLVMContext::MD_invariant_load))
+ Flags |= MachineMemOperand::MOInvariant;
+
+ if (isDereferenceablePointer(LI.getPointerOperand(), LI.getType(), DL))
+ Flags |= MachineMemOperand::MODereferenceable;
+
+ Flags |= getTargetMMOFlags(LI);
+ return Flags;
+}
+
+MachineMemOperand::Flags
+TargetLoweringBase::getStoreMemOperandFlags(const StoreInst &SI,
+ const DataLayout &DL) const {
+ MachineMemOperand::Flags Flags = MachineMemOperand::MOStore;
+
+ if (SI.isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+
+ if (SI.hasMetadata(LLVMContext::MD_nontemporal))
+ Flags |= MachineMemOperand::MONonTemporal;
+
+ // FIXME: Not preserving dereferenceable
+ Flags |= getTargetMMOFlags(SI);
+ return Flags;
+}
+
+MachineMemOperand::Flags
+TargetLoweringBase::getAtomicMemOperandFlags(const Instruction &AI,
+ const DataLayout &DL) const {
+ auto Flags = MachineMemOperand::MOLoad | MachineMemOperand::MOStore;
+
+ if (const AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(&AI)) {
+ if (RMW->isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+ } else if (const AtomicCmpXchgInst *CmpX = dyn_cast<AtomicCmpXchgInst>(&AI)) {
+ if (CmpX->isVolatile())
+ Flags |= MachineMemOperand::MOVolatile;
+ } else
+ llvm_unreachable("not an atomic instruction");
+
+ // FIXME: Not preserving dereferenceable
+ Flags |= getTargetMMOFlags(AI);
+ return Flags;
+}
+
+//===----------------------------------------------------------------------===//
+// GlobalISel Hooks
+//===----------------------------------------------------------------------===//
+
+bool TargetLoweringBase::shouldLocalize(const MachineInstr &MI,
+ const TargetTransformInfo *TTI) const {
+ auto &MF = *MI.getMF();
+ auto &MRI = MF.getRegInfo();
+ // Assuming a spill and reload of a value has a cost of 1 instruction each,
+ // this helper function computes the maximum number of uses we should consider
+ // for remat. E.g. on arm64 global addresses take 2 insts to materialize. We
+ // break even in terms of code size when the original MI has 2 users vs
+ // choosing to potentially spill. Any more than 2 users and we have a net code
+ // size increase. This doesn't take into account register pressure though.
+ auto maxUses = [](unsigned RematCost) {
+ // A cost of 1 means remats are basically free.
+ if (RematCost == 1)
+ return UINT_MAX;
+ if (RematCost == 2)
+ return 2U;
+
+ // Remat is too expensive, only sink if there's one user.
+ if (RematCost > 2)
+ return 1U;
+ llvm_unreachable("Unexpected remat cost");
+ };
+
+ // Helper to walk through uses and terminate if we've reached a limit. Saves
+ // us spending time traversing uses if all we want to know is if it's >= min.
+ auto isUsesAtMost = [&](unsigned Reg, unsigned MaxUses) {
+ unsigned NumUses = 0;
+ auto UI = MRI.use_instr_nodbg_begin(Reg), UE = MRI.use_instr_nodbg_end();
+ for (; UI != UE && NumUses < MaxUses; ++UI) {
+ NumUses++;
+ }
+ // If we haven't reached the end yet then there are more than MaxUses users.
+ return UI == UE;
+ };
+
+ switch (MI.getOpcode()) {
+ default:
+ return false;
+ // Constant-like instructions should be close to their users.
+ // We don't want long live-ranges for them.
+ case TargetOpcode::G_CONSTANT:
+ case TargetOpcode::G_FCONSTANT:
+ case TargetOpcode::G_FRAME_INDEX:
+ case TargetOpcode::G_INTTOPTR:
+ return true;
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ unsigned RematCost = TTI->getGISelRematGlobalCost();
+ Register Reg = MI.getOperand(0).getReg();
+ unsigned MaxUses = maxUses(RematCost);
+ if (MaxUses == UINT_MAX)
+ return true; // Remats are "free" so always localize.
+ bool B = isUsesAtMost(Reg, MaxUses);
+ return B;
+ }
+ }
+}
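
A stand-alone restatement of the remat-cost heuristic in shouldLocalize above, assuming (as the comment in the hunk does) that a spill and a reload each cost one instruction; the function name is illustrative:

#include <climits>

// Maximum number of users for which localizing a G_GLOBAL_VALUE is still a
// code-size win, given the cost (in instructions) of rematerializing it.
unsigned maxUsesForLocalize(unsigned RematCost) {
  if (RematCost == 1)
    return UINT_MAX;  // remat is essentially free: always localize
  if (RematCost == 2)
    return 2;         // break even at two users (vs. one spill + one reload)
  return 1;           // expensive remat: only worth it for a single user
}
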
diff --git a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
index 8cb9814300d1..27bebe503ce6 100644
--- a/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
+++ b/llvm/lib/CodeGen/TargetLoweringObjectFileImpl.cpp
@@ -21,12 +21,16 @@
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/DiagnosticInfo.h"
+#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalObject.h"
@@ -52,8 +56,8 @@
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CodeGen.h"
-#include "llvm/Support/Format.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
#include <cassert>
@@ -84,6 +88,15 @@ static void GetObjCImageInfo(Module &M, unsigned &Version, unsigned &Flags,
} else if (Key == "Objective-C Image Info Section") {
Section = cast<MDString>(MFE.Val)->getString();
}
+ // Backend generates L_OBJC_IMAGE_INFO from Swift ABI version + major + minor +
+ // "Objective-C Garbage Collection".
+ else if (Key == "Swift ABI Version") {
+ Flags |= (mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue()) << 8;
+ } else if (Key == "Swift Major Version") {
+ Flags |= (mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue()) << 24;
+ } else if (Key == "Swift Minor Version") {
+ Flags |= (mdconst::extract<ConstantInt>(MFE.Val)->getZExtValue()) << 16;
+ }
}
}
@@ -97,6 +110,7 @@ void TargetLoweringObjectFileELF::Initialize(MCContext &Ctx,
TM = &TgtM;
CodeModel::Model CM = TgtM.getCodeModel();
+ InitializeELF(TgtM.Options.UseInitArray);
switch (TgtM.getTargetTriple().getArch()) {
case Triple::arm:
@@ -277,8 +291,8 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer,
if (cast<MDNode>(Operand)->getNumOperands() != 2)
report_fatal_error("invalid llvm.linker.options");
for (const auto &Option : cast<MDNode>(Operand)->operands()) {
- Streamer.EmitBytes(cast<MDString>(Option)->getString());
- Streamer.EmitIntValue(0, 1);
+ Streamer.emitBytes(cast<MDString>(Option)->getString());
+ Streamer.emitInt8(0);
}
}
}
@@ -290,9 +304,9 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer,
Streamer.SwitchSection(S);
for (const auto *Operand : DependentLibraries->operands()) {
- Streamer.EmitBytes(
+ Streamer.emitBytes(
cast<MDString>(cast<MDNode>(Operand)->getOperand(0))->getString());
- Streamer.EmitIntValue(0, 1);
+ Streamer.emitInt8(0);
}
}
@@ -304,9 +318,9 @@ void TargetLoweringObjectFileELF::emitModuleMetadata(MCStreamer &Streamer,
if (!Section.empty()) {
auto *S = C.getELFSection(Section, ELF::SHT_PROGBITS, ELF::SHF_ALLOC);
Streamer.SwitchSection(S);
- Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
- Streamer.EmitIntValue(Version, 4);
- Streamer.EmitIntValue(Flags, 4);
+ Streamer.emitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.emitInt32(Version);
+ Streamer.emitInt32(Flags);
Streamer.AddBlankLine();
}
@@ -370,20 +384,20 @@ void TargetLoweringObjectFileELF::emitPersonalityValue(
NameData += Sym->getName();
MCSymbolELF *Label =
cast<MCSymbolELF>(getContext().getOrCreateSymbol(NameData));
- Streamer.EmitSymbolAttribute(Label, MCSA_Hidden);
- Streamer.EmitSymbolAttribute(Label, MCSA_Weak);
+ Streamer.emitSymbolAttribute(Label, MCSA_Hidden);
+ Streamer.emitSymbolAttribute(Label, MCSA_Weak);
unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_WRITE | ELF::SHF_GROUP;
MCSection *Sec = getContext().getELFNamedSection(".data", Label->getName(),
ELF::SHT_PROGBITS, Flags, 0);
unsigned Size = DL.getPointerSize();
Streamer.SwitchSection(Sec);
- Streamer.EmitValueToAlignment(DL.getPointerABIAlignment(0).value());
- Streamer.EmitSymbolAttribute(Label, MCSA_ELF_TypeObject);
+ Streamer.emitValueToAlignment(DL.getPointerABIAlignment(0).value());
+ Streamer.emitSymbolAttribute(Label, MCSA_ELF_TypeObject);
const MCExpr *E = MCConstantExpr::create(Size, getContext());
Streamer.emitELFSize(Label, E);
- Streamer.EmitLabel(Label);
+ Streamer.emitLabel(Label);
- Streamer.EmitSymbolValue(Sym, Size);
+ Streamer.emitSymbolValue(Sym, Size);
}
const MCExpr *TargetLoweringObjectFileELF::getTTypeGlobalReference(
@@ -420,6 +434,8 @@ static SectionKind getELFKindForNamedSection(StringRef Name, SectionKind K) {
// .section .eh_frame,"a",@progbits
if (Name == getInstrProfSectionName(IPSK_covmap, Triple::ELF,
+ /*AddSegmentInfo=*/false) ||
+ Name == getInstrProfSectionName(IPSK_covfun, Triple::ELF,
/*AddSegmentInfo=*/false))
return SectionKind::getMetadata();
@@ -512,8 +528,8 @@ static const Comdat *getELFComdat(const GlobalValue *GV) {
return C;
}
-static const MCSymbolELF *getAssociatedSymbol(const GlobalObject *GO,
- const TargetMachine &TM) {
+static const MCSymbolELF *getLinkedToSymbol(const GlobalObject *GO,
+ const TargetMachine &TM) {
MDNode *MD = GO->getMetadata(LLVMContext::MD_associated);
if (!MD)
return nullptr;
@@ -554,6 +570,75 @@ static unsigned getEntrySizeForKind(SectionKind Kind) {
}
}
+/// Return the section prefix name used by the options FunctionSections and
+/// DataSections.
+static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
+ if (Kind.isText())
+ return ".text";
+ if (Kind.isReadOnly())
+ return ".rodata";
+ if (Kind.isBSS())
+ return ".bss";
+ if (Kind.isThreadData())
+ return ".tdata";
+ if (Kind.isThreadBSS())
+ return ".tbss";
+ if (Kind.isData())
+ return ".data";
+ if (Kind.isReadOnlyWithRel())
+ return ".data.rel.ro";
+ llvm_unreachable("Unknown section kind");
+}
+
+static SmallString<128>
+getELFSectionNameForGlobal(const GlobalObject *GO, SectionKind Kind,
+ Mangler &Mang, const TargetMachine &TM,
+ unsigned EntrySize, bool UniqueSectionName) {
+ SmallString<128> Name;
+ if (Kind.isMergeableCString()) {
+ // We also need alignment here.
+ // FIXME: this is getting the alignment of the character, not the
+ // alignment of the global!
+ Align Alignment = GO->getParent()->getDataLayout().getPreferredAlign(
+ cast<GlobalVariable>(GO));
+
+ std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
+ Name = SizeSpec + utostr(Alignment.value());
+ } else if (Kind.isMergeableConst()) {
+ Name = ".rodata.cst";
+ Name += utostr(EntrySize);
+ } else {
+ Name = getSectionPrefixForGlobal(Kind);
+ }
+
+ bool HasPrefix = false;
+ if (const auto *F = dyn_cast<Function>(GO)) {
+ if (Optional<StringRef> Prefix = F->getSectionPrefix()) {
+ Name += *Prefix;
+ HasPrefix = true;
+ }
+ }
+
+ if (UniqueSectionName) {
+ Name.push_back('.');
+ TM.getNameWithPrefix(Name, GO, Mang, /*MayAlwaysUsePrivate*/true);
+ } else if (HasPrefix)
+ Name.push_back('.');
+ return Name;
+}
+
+namespace {
+class LoweringDiagnosticInfo : public DiagnosticInfo {
+ const Twine &Msg;
+
+public:
+ LoweringDiagnosticInfo(const Twine &DiagMsg,
+ DiagnosticSeverity Severity = DS_Error)
+ : DiagnosticInfo(DK_Lowering, Severity), Msg(DiagMsg) {}
+ void print(DiagnosticPrinter &DP) const override { DP << Msg; }
+};
+}
+
MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
StringRef SectionName = GO->getSection();
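
For orientation, here is a minimal standalone sketch of the section names the helpers above produce. The enum and the example global names are illustrative stand-ins, not the LLVM SectionKind or Mangler types.

#include <string>

enum class SecKind { Text, ReadOnly, BSS, ThreadData, ThreadBSS, Data, ReadOnlyWithRel };

// Mirrors getSectionPrefixForGlobal above.
std::string sectionPrefixFor(SecKind K) {
  switch (K) {
  case SecKind::Text:            return ".text";
  case SecKind::ReadOnly:        return ".rodata";
  case SecKind::BSS:             return ".bss";
  case SecKind::ThreadData:      return ".tdata";
  case SecKind::ThreadBSS:       return ".tbss";
  case SecKind::Data:            return ".data";
  case SecKind::ReadOnlyWithRel: return ".data.rel.ro";
  }
  return {};
}

// With unique section names enabled, the mangled global name is appended,
// e.g. a function "foo" lands in ".text.foo"; mergeable strings instead get a
// fixed stem such as ".rodata.str1.4" (entry size 1, alignment 4).
std::string uniqueSectionName(SecKind K, const std::string &MangledName) {
  return sectionPrefixFor(K) + "." + MangledName;
}
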
@@ -589,42 +674,84 @@ MCSection *TargetLoweringObjectFileELF::getExplicitSectionGlobal(
Flags |= ELF::SHF_GROUP;
}
+ unsigned EntrySize = getEntrySizeForKind(Kind);
+
// A section can have at most one associated section. Put each global with
// MD_associated in a unique section.
unsigned UniqueID = MCContext::GenericSectionID;
- const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
- if (AssociatedSymbol) {
+ const MCSymbolELF *LinkedToSym = getLinkedToSymbol(GO, TM);
+ if (LinkedToSym) {
UniqueID = NextUniqueID++;
Flags |= ELF::SHF_LINK_ORDER;
+ } else {
+ if (getContext().getAsmInfo()->useIntegratedAssembler()) {
+ // Symbols must be placed into sections with compatible entry
+ // sizes. Generate unique sections for symbols that have not
+ // been assigned to compatible sections.
+ if (Flags & ELF::SHF_MERGE) {
+ auto maybeID = getContext().getELFUniqueIDForEntsize(SectionName, Flags,
+ EntrySize);
+ if (maybeID)
+ UniqueID = *maybeID;
+ else {
+ // If the user has specified the same section name as would be created
+ // implicitly for this symbol e.g. .rodata.str1.1, then we don't need
+ // to unique the section as the entry size for this symbol will be
+ // compatible with implicitly created sections.
+ SmallString<128> ImplicitSectionNameStem = getELFSectionNameForGlobal(
+ GO, Kind, getMangler(), TM, EntrySize, false);
+ if (!(getContext().isELFImplicitMergeableSectionNamePrefix(
+ SectionName) &&
+ SectionName.startswith(ImplicitSectionNameStem)))
+ UniqueID = NextUniqueID++;
+ }
+ } else {
+ // We need to unique the section if the user has explicitly
+ // assigned a non-mergeable symbol to a section name for
+ // a generic mergeable section.
+ if (getContext().isELFGenericMergeableSection(SectionName)) {
+ auto maybeID = getContext().getELFUniqueIDForEntsize(
+ SectionName, Flags, EntrySize);
+ UniqueID = maybeID ? *maybeID : NextUniqueID++;
+ }
+ }
+ } else {
+ // If two symbols with differing sizes end up in the same mergeable
+ // section, that section can be assigned an incorrect entry size. To avoid
+ // this we usually put symbols of the same size into distinct mergeable
+ // sections with the same name. Doing so relies on the ",unique ,"
+ // assembly feature. This feature is not available until binutils
+ // version 2.35 (https://sourceware.org/bugzilla/show_bug.cgi?id=25380).
+ Flags &= ~ELF::SHF_MERGE;
+ EntrySize = 0;
+ }
}
MCSectionELF *Section = getContext().getELFSection(
SectionName, getELFSectionType(SectionName, Kind), Flags,
- getEntrySizeForKind(Kind), Group, UniqueID, AssociatedSymbol);
+ EntrySize, Group, UniqueID, LinkedToSym);
// Make sure that we did not get some other section with incompatible sh_link.
// This should not be possible due to UniqueID code above.
- assert(Section->getAssociatedSymbol() == AssociatedSymbol &&
+ assert(Section->getLinkedToSymbol() == LinkedToSym &&
"Associated symbol mismatch between sections");
- return Section;
-}
-/// Return the section prefix name used by options FunctionsSections and
-/// DataSections.
-static StringRef getSectionPrefixForGlobal(SectionKind Kind) {
- if (Kind.isText())
- return ".text";
- if (Kind.isReadOnly())
- return ".rodata";
- if (Kind.isBSS())
- return ".bss";
- if (Kind.isThreadData())
- return ".tdata";
- if (Kind.isThreadBSS())
- return ".tbss";
- if (Kind.isData())
- return ".data";
- assert(Kind.isReadOnlyWithRel() && "Unknown section kind");
- return ".data.rel.ro";
+ if (!getContext().getAsmInfo()->useIntegratedAssembler()) {
+ // If we are not using the integrated assembler then this symbol might have
+ // been placed in an incompatible mergeable section. Emit an error if this
+ // is the case to avoid creating broken output.
+ if ((Section->getFlags() & ELF::SHF_MERGE) &&
+ (Section->getEntrySize() != getEntrySizeForKind(Kind)))
+ GO->getContext().diagnose(LoweringDiagnosticInfo(
+ "Symbol '" + GO->getName() + "' from module '" +
+ (GO->getParent() ? GO->getParent()->getSourceFileName() : "unknown") +
+ "' required a section with entry-size=" +
+ Twine(getEntrySizeForKind(Kind)) + " but was placed in section '" +
+ SectionName + "' with entry-size=" + Twine(Section->getEntrySize()) +
+ ": Explicit assignment by pragma or attribute of an incompatible "
+ "symbol to this section?"));
+ }
+
+ return Section;
}
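
A hedged sketch of the entry-size uniquing decision above: SHF_MERGE sections are effectively keyed by (name, flags, entry size), and a mismatch forces a fresh unique ID. The map-based lookup below is only an illustration of the idea; the real bookkeeping lives in MCContext.

#include <cstdint>
#include <map>
#include <string>
#include <tuple>

using SectionKey = std::tuple<std::string, uint64_t, unsigned>; // name, flags, entry size

// Returns the unique ID to use for a symbol that wants to live in the section
// described by Key: reuse an existing compatible section, otherwise split off
// a freshly numbered one.
unsigned uniqueIDForEntsize(std::map<SectionKey, unsigned> &Known,
                            unsigned &NextUniqueID, const SectionKey &Key) {
  auto It = Known.find(Key);
  if (It != Known.end())
    return It->second;
  return Known[Key] = NextUniqueID++;
}
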
static MCSectionELF *selectELFSectionForGlobal(
@@ -641,39 +768,19 @@ static MCSectionELF *selectELFSectionForGlobal(
// Get the section entry size based on the kind.
unsigned EntrySize = getEntrySizeForKind(Kind);
- SmallString<128> Name;
- if (Kind.isMergeableCString()) {
- // We also need alignment here.
- // FIXME: this is getting the alignment of the character, not the
- // alignment of the global!
- unsigned Align = GO->getParent()->getDataLayout().getPreferredAlignment(
- cast<GlobalVariable>(GO));
-
- std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
- Name = SizeSpec + utostr(Align);
- } else if (Kind.isMergeableConst()) {
- Name = ".rodata.cst";
- Name += utostr(EntrySize);
- } else {
- Name = getSectionPrefixForGlobal(Kind);
- }
-
- if (const auto *F = dyn_cast<Function>(GO)) {
- const auto &OptionalPrefix = F->getSectionPrefix();
- if (OptionalPrefix)
- Name += *OptionalPrefix;
- }
-
+ bool UniqueSectionName = false;
unsigned UniqueID = MCContext::GenericSectionID;
if (EmitUniqueSection) {
if (TM.getUniqueSectionNames()) {
- Name.push_back('.');
- TM.getNameWithPrefix(Name, GO, Mang, true /*MayAlwaysUsePrivate*/);
+ UniqueSectionName = true;
} else {
UniqueID = *NextUniqueID;
(*NextUniqueID)++;
}
}
+ SmallString<128> Name = getELFSectionNameForGlobal(
+ GO, Kind, Mang, TM, EntrySize, UniqueSectionName);
+
// Use 0 as the unique ID for execute-only text.
if (Kind.isExecuteOnly())
UniqueID = 0;
@@ -696,16 +803,16 @@ MCSection *TargetLoweringObjectFileELF::SelectSectionForGlobal(
}
EmitUniqueSection |= GO->hasComdat();
- const MCSymbolELF *AssociatedSymbol = getAssociatedSymbol(GO, TM);
- if (AssociatedSymbol) {
+ const MCSymbolELF *LinkedToSym = getLinkedToSymbol(GO, TM);
+ if (LinkedToSym) {
EmitUniqueSection = true;
Flags |= ELF::SHF_LINK_ORDER;
}
MCSectionELF *Section = selectELFSectionForGlobal(
getContext(), GO, Kind, getMangler(), TM, EmitUniqueSection, Flags,
- &NextUniqueID, AssociatedSymbol);
- assert(Section->getAssociatedSymbol() == AssociatedSymbol);
+ &NextUniqueID, LinkedToSym);
+ assert(Section->getLinkedToSymbol() == LinkedToSym);
return Section;
}
@@ -735,7 +842,7 @@ bool TargetLoweringObjectFileELF::shouldPutJumpTableInFunctionSection(
/// information, return a section that it should be placed in.
MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
if (Kind.isMergeableConst4() && MergeableConst4Section)
return MergeableConst4Section;
if (Kind.isMergeableConst8() && MergeableConst8Section)
@@ -751,6 +858,46 @@ MCSection *TargetLoweringObjectFileELF::getSectionForConstant(
return DataRelROSection;
}
+/// Returns a unique section for the given machine basic block.
+MCSection *TargetLoweringObjectFileELF::getSectionForMachineBasicBlock(
+ const Function &F, const MachineBasicBlock &MBB,
+ const TargetMachine &TM) const {
+ assert(MBB.isBeginSection() && "Basic block does not start a section!");
+ unsigned UniqueID = MCContext::GenericSectionID;
+
+ // For cold sections use the .text.unlikely prefix along with the parent
+ // function name. All cold blocks for the same function go to the same
+ // section. Similarly all exception blocks are grouped by symbol name
+ // under the .text.eh prefix. For regular sections, we either use a unique
+ // name, or a unique ID for the section.
+ SmallString<128> Name;
+ if (MBB.getSectionID() == MBBSectionID::ColdSectionID) {
+ Name += ".text.unlikely.";
+ Name += MBB.getParent()->getName();
+ } else if (MBB.getSectionID() == MBBSectionID::ExceptionSectionID) {
+ Name += ".text.eh.";
+ Name += MBB.getParent()->getName();
+ } else {
+ Name += MBB.getParent()->getSection()->getName();
+ if (TM.getUniqueBasicBlockSectionNames()) {
+ Name += ".";
+ Name += MBB.getSymbol()->getName();
+ } else {
+ UniqueID = NextUniqueID++;
+ }
+ }
+
+ unsigned Flags = ELF::SHF_ALLOC | ELF::SHF_EXECINSTR;
+ std::string GroupName = "";
+ if (F.hasComdat()) {
+ Flags |= ELF::SHF_GROUP;
+ GroupName = F.getComdat()->getName().str();
+ }
+ return getContext().getELFSection(Name, ELF::SHT_PROGBITS, Flags,
+ 0 /* Entry Size */, GroupName, UniqueID,
+ nullptr);
+}
+
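
The following sketch restates the naming scheme implemented above for basic-block sections; the parameter names and the example strings in the comments are hypothetical, not the MachineBasicBlock API.

#include <string>

std::string mbbSectionName(bool IsCold, bool IsException, bool UniqueNames,
                           const std::string &FuncName,
                           const std::string &FuncSectionName,
                           const std::string &BlockSymbolName) {
  if (IsCold)
    return ".text.unlikely." + FuncName;   // all cold blocks of the function share this
  if (IsException)
    return ".text.eh." + FuncName;         // likewise for exception-handling blocks
  std::string Name = FuncSectionName;      // e.g. ".text" or ".text.foo"
  if (UniqueNames)
    Name += "." + BlockSymbolName;         // otherwise the caller uses a numeric unique ID
  return Name;
}
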
static MCSectionELF *getStaticStructorSection(MCContext &Ctx, bool UseInitArray,
bool IsCtor, unsigned Priority,
const MCSymbol *KeySym) {
@@ -888,8 +1035,8 @@ void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer,
for (const auto *Option : LinkerOptions->operands()) {
SmallVector<std::string, 4> StrOptions;
for (const auto &Piece : cast<MDNode>(Option)->operands())
- StrOptions.push_back(cast<MDString>(Piece)->getString());
- Streamer.EmitLinkerOptions(StrOptions);
+ StrOptions.push_back(std::string(cast<MDString>(Piece)->getString()));
+ Streamer.emitLinkerOptions(StrOptions);
}
}
@@ -918,10 +1065,10 @@ void TargetLoweringObjectFileMachO::emitModuleMetadata(MCStreamer &Streamer,
MCSectionMachO *S = getContext().getMachOSection(
Segment, Section, TAA, StubSize, SectionKind::getData());
Streamer.SwitchSection(S);
- Streamer.EmitLabel(getContext().
+ Streamer.emitLabel(getContext().
getOrCreateSymbol(StringRef("L_OBJC_IMAGE_INFO")));
- Streamer.EmitIntValue(VersionVal, 4);
- Streamer.EmitIntValue(ImageInfoFlags, 4);
+ Streamer.emitInt32(VersionVal);
+ Streamer.emitInt32(ImageInfoFlags);
Streamer.AddBlankLine();
}
@@ -998,16 +1145,16 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal(
// FIXME: Alignment check should be handled by section classifier.
if (Kind.isMergeable1ByteCString() &&
- GO->getParent()->getDataLayout().getPreferredAlignment(
- cast<GlobalVariable>(GO)) < 32)
+ GO->getParent()->getDataLayout().getPreferredAlign(
+ cast<GlobalVariable>(GO)) < Align(32))
return CStringSection;
// Do not put 16-bit arrays in the UString section if they have an
// externally visible label, this runs into issues with certain linker
// versions.
if (Kind.isMergeable2ByteCString() && !GO->hasExternalLinkage() &&
- GO->getParent()->getDataLayout().getPreferredAlignment(
- cast<GlobalVariable>(GO)) < 32)
+ GO->getParent()->getDataLayout().getPreferredAlign(
+ cast<GlobalVariable>(GO)) < Align(32))
return UStringSection;
// With MachO only variables whose corresponding symbol starts with 'l' or
@@ -1047,7 +1194,7 @@ MCSection *TargetLoweringObjectFileMachO::SelectSectionForGlobal(
MCSection *TargetLoweringObjectFileMachO::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
// If this constant requires a relocation, we have to put it in the data
// segment, not in the text segment.
if (Kind.isData() || Kind.isReadOnlyWithRel())
@@ -1453,8 +1600,8 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer,
for (const auto &Piece : cast<MDNode>(Option)->operands()) {
// Lead with a space for consistency with our dllexport implementation.
std::string Directive(" ");
- Directive.append(cast<MDString>(Piece)->getString());
- Streamer.EmitBytes(Directive);
+ Directive.append(std::string(cast<MDString>(Piece)->getString()));
+ Streamer.emitBytes(Directive);
}
}
}
@@ -1472,9 +1619,9 @@ void TargetLoweringObjectFileCOFF::emitModuleMetadata(MCStreamer &Streamer,
Section, COFF::IMAGE_SCN_CNT_INITIALIZED_DATA | COFF::IMAGE_SCN_MEM_READ,
SectionKind::getReadOnly());
Streamer.SwitchSection(S);
- Streamer.EmitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
- Streamer.EmitIntValue(Version, 4);
- Streamer.EmitIntValue(Flags, 4);
+ Streamer.emitLabel(C.getOrCreateSymbol(StringRef("OBJC_IMAGE_INFO")));
+ Streamer.emitInt32(Version);
+ Streamer.emitInt32(Flags);
Streamer.AddBlankLine();
}
@@ -1599,7 +1746,7 @@ const MCExpr *TargetLoweringObjectFileCOFF::lowerRelativeReference(
static std::string APIntToHexString(const APInt &AI) {
unsigned Width = (AI.getBitWidth() / 8) * 2;
std::string HexString = AI.toString(16, /*Signed=*/false);
- transform(HexString.begin(), HexString.end(), HexString.begin(), tolower);
+ llvm::transform(HexString, HexString.begin(), tolower);
unsigned Size = HexString.size();
assert(Width >= Size && "hex string is too large!");
HexString.insert(HexString.begin(), Width - Size, '0');
@@ -1617,8 +1764,8 @@ static std::string scalarConstantToHexString(const Constant *C) {
return APIntToHexString(CI->getValue());
} else {
unsigned NumElements;
- if (isa<VectorType>(Ty))
- NumElements = Ty->getVectorNumElements();
+ if (auto *VTy = dyn_cast<VectorType>(Ty))
+ NumElements = cast<FixedVectorType>(VTy)->getNumElements();
else
NumElements = Ty->getArrayNumElements();
std::string HexString;
@@ -1630,7 +1777,7 @@ static std::string scalarConstantToHexString(const Constant *C) {
MCSection *TargetLoweringObjectFileCOFF::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
if (Kind.isMergeableConst() && C &&
getContext().getAsmInfo()->hasCOFFComdatConstants()) {
// This creates comdat sections with the given symbol name, but unless
@@ -1642,25 +1789,25 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForConstant(
COFF::IMAGE_SCN_LNK_COMDAT;
std::string COMDATSymName;
if (Kind.isMergeableConst4()) {
- if (Align <= 4) {
+ if (Alignment <= 4) {
COMDATSymName = "__real@" + scalarConstantToHexString(C);
- Align = 4;
+ Alignment = Align(4);
}
} else if (Kind.isMergeableConst8()) {
- if (Align <= 8) {
+ if (Alignment <= 8) {
COMDATSymName = "__real@" + scalarConstantToHexString(C);
- Align = 8;
+ Alignment = Align(8);
}
} else if (Kind.isMergeableConst16()) {
// FIXME: These may not be appropriate for non-x86 architectures.
- if (Align <= 16) {
+ if (Alignment <= 16) {
COMDATSymName = "__xmm@" + scalarConstantToHexString(C);
- Align = 16;
+ Alignment = Align(16);
}
} else if (Kind.isMergeableConst32()) {
- if (Align <= 32) {
+ if (Alignment <= 32) {
COMDATSymName = "__ymm@" + scalarConstantToHexString(C);
- Align = 32;
+ Alignment = Align(32);
}
}
@@ -1670,10 +1817,10 @@ MCSection *TargetLoweringObjectFileCOFF::getSectionForConstant(
COFF::IMAGE_COMDAT_SELECT_ANY);
}
- return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C, Align);
+ return TargetLoweringObjectFile::getSectionForConstant(DL, Kind, C,
+ Alignment);
}
-
//===----------------------------------------------------------------------===//
// Wasm
//===----------------------------------------------------------------------===//
@@ -1691,16 +1838,6 @@ static const Comdat *getWasmComdat(const GlobalValue *GV) {
return C;
}
-static SectionKind getWasmKindForNamedSection(StringRef Name, SectionKind K) {
- // If we're told we have function data, then use that.
- if (K.isText())
- return SectionKind::getText();
-
- // Otherwise, ignore whatever section type the generic impl detected and use
- // a plain data section.
- return SectionKind::getData();
-}
-
MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
// We don't support explict section names for functions in the wasm object
@@ -1711,7 +1848,13 @@ MCSection *TargetLoweringObjectFileWasm::getExplicitSectionGlobal(
StringRef Name = GO->getSection();
- Kind = getWasmKindForNamedSection(Name, Kind);
+ // Certain data sections we treat as named custom sections rather than
+ // segments within the data section.
+ // This could be avoided if all data segments (in the wasm sense) were
+ // represented as their own sections (in the llvm sense).
+ // TODO(sbc): https://github.com/WebAssembly/tool-conventions/issues/138
+ if (Name == ".llvmcmd" || Name == ".llvmbc")
+ Kind = SectionKind::getMetadata();
StringRef Group = "";
if (const Comdat *C = getWasmComdat(GO)) {
@@ -1827,11 +1970,61 @@ MCSection *TargetLoweringObjectFileWasm::getStaticDtorSection(
//===----------------------------------------------------------------------===//
// XCOFF
//===----------------------------------------------------------------------===//
+MCSymbol *
+TargetLoweringObjectFileXCOFF::getTargetSymbol(const GlobalValue *GV,
+ const TargetMachine &TM) const {
+ if (TM.getDataSections())
+ report_fatal_error("XCOFF unique data sections not yet implemented");
+
+ // We always use a qualname symbol for a GV that represents
+ // a declaration, a function descriptor, or a common symbol.
+ // It is inherently ambiguous when the GO represents the address of a
+ // function, as the GO could either represent a function descriptor or a
+ // function entry point. We choose to always return a function descriptor
+ // here.
+ if (const GlobalObject *GO = dyn_cast<GlobalObject>(GV)) {
+ if (GO->isDeclarationForLinker())
+ return cast<MCSectionXCOFF>(getSectionForExternalReference(GO, TM))
+ ->getQualNameSymbol();
+
+ SectionKind GOKind = getKindForGlobal(GO, TM);
+ if (GOKind.isText())
+ return cast<MCSectionXCOFF>(
+ getSectionForFunctionDescriptor(cast<Function>(GO), TM))
+ ->getQualNameSymbol();
+ if (GOKind.isCommon() || GOKind.isBSSLocal())
+ return cast<MCSectionXCOFF>(SectionForGlobal(GO, GOKind, TM))
+ ->getQualNameSymbol();
+ }
+
+ // For all other cases, fall back to getSymbol to return the unqualified name.
+ // This could change for a GV that is a GlobalVariable when we decide to
+ // support -fdata-sections since we could avoid having label symbols if the
+ // linkage name is applied to the csect symbol.
+ return nullptr;
+}
+
MCSection *TargetLoweringObjectFileXCOFF::getExplicitSectionGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
report_fatal_error("XCOFF explicit sections not yet implemented.");
}
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForExternalReference(
+ const GlobalObject *GO, const TargetMachine &TM) const {
+ assert(GO->isDeclarationForLinker() &&
+ "Tried to get ER section for a defined global.");
+
+ SmallString<128> Name;
+ getNameWithPrefix(Name, GO, TM);
+ XCOFF::StorageClass SC =
+ TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
+
+ // Externals go into a csect of type ER.
+ return getContext().getXCOFFSection(
+ Name, isa<Function>(GO) ? XCOFF::XMC_DS : XCOFF::XMC_UA, XCOFF::XTY_ER,
+ SC, SectionKind::getMetadata());
+}
+
MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal(
const GlobalObject *GO, SectionKind Kind, const TargetMachine &TM) const {
assert(!TM.getFunctionSections() && !TM.getDataSections() &&
@@ -1850,16 +2043,13 @@ MCSection *TargetLoweringObjectFileXCOFF::SelectSectionForGlobal(
}
if (Kind.isMergeableCString()) {
- if (!Kind.isMergeable1ByteCString())
- report_fatal_error("Unhandled multi-byte mergeable string kind.");
-
- unsigned Align = GO->getParent()->getDataLayout().getPreferredAlignment(
+ Align Alignment = GO->getParent()->getDataLayout().getPreferredAlign(
cast<GlobalVariable>(GO));
unsigned EntrySize = getEntrySizeForKind(Kind);
std::string SizeSpec = ".rodata.str" + utostr(EntrySize) + ".";
SmallString<128> Name;
- Name = SizeSpec + utostr(Align);
+ Name = SizeSpec + utostr(Alignment.value());
return getContext().getXCOFFSection(
Name, XCOFF::XMC_RO, XCOFF::XTY_SD,
@@ -1906,7 +2096,7 @@ bool TargetLoweringObjectFileXCOFF::shouldPutJumpTableInFunctionSection(
/// information, return a section that it should be placed in.
MCSection *TargetLoweringObjectFileXCOFF::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
// TODO: Enable emitting the constant pool to unique sections when we support it.
return ReadOnlySection;
}
@@ -1943,11 +2133,41 @@ XCOFF::StorageClass TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(
return XCOFF::C_HIDEXT;
case GlobalValue::ExternalLinkage:
case GlobalValue::CommonLinkage:
+ case GlobalValue::AvailableExternallyLinkage:
return XCOFF::C_EXT;
case GlobalValue::ExternalWeakLinkage:
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
return XCOFF::C_WEAKEXT;
- default:
+ case GlobalValue::AppendingLinkage:
report_fatal_error(
- "Unhandled linkage when mapping linkage to StorageClass.");
+ "There is no mapping that implements AppendingLinkage for XCOFF.");
}
+ llvm_unreachable("Unknown linkage type!");
+}
+
+MCSymbol *TargetLoweringObjectFileXCOFF::getFunctionEntryPointSymbol(
+ const Function *F, const TargetMachine &TM) const {
+ SmallString<128> NameStr;
+ NameStr.push_back('.');
+ getNameWithPrefix(NameStr, F, TM);
+ return getContext().getOrCreateSymbol(NameStr);
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForFunctionDescriptor(
+ const Function *F, const TargetMachine &TM) const {
+ SmallString<128> NameStr;
+ getNameWithPrefix(NameStr, F, TM);
+ return getContext().getXCOFFSection(NameStr, XCOFF::XMC_DS, XCOFF::XTY_SD,
+ getStorageClassForGlobal(F),
+ SectionKind::getData());
+}
+
+MCSection *TargetLoweringObjectFileXCOFF::getSectionForTOCEntry(
+ const MCSymbol *Sym) const {
+ return getContext().getXCOFFSection(
+ cast<MCSymbolXCOFF>(Sym)->getSymbolTableName(), XCOFF::XMC_TC,
+ XCOFF::XTY_SD, XCOFF::C_HIDEXT, SectionKind::getData());
}
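
As context for the helpers above: on XCOFF a function is represented both by a descriptor csect (plain linkage name) and an entry-point label (the same name with a leading dot), which is what getFunctionEntryPointSymbol builds. A tiny sketch of that convention, using a hypothetical function name in the comment:

#include <string>

// Descriptor symbol keeps the linkage name; the entry point prepends a dot,
// e.g. descriptor "foo" and entry point ".foo".
std::string functionDescriptorName(const std::string &Name) { return Name; }
std::string functionEntryPointName(const std::string &Name) { return "." + Name; }
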
diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
index d794a261ecb2..4866d4c171c0 100644
--- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -45,3 +45,9 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
bool TargetOptions::HonorSignDependentRoundingFPMath() const {
return !UnsafeFPMath && HonorSignDependentRoundingFPMathOption;
}
+
+/// NOTE: There are targets that still do not support the production of debug
+/// entry values.
+bool TargetOptions::ShouldEmitDebugEntryValues() const {
+ return SupportsDebugEntryValues || EnableDebugEntryValues;
+}
diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp
index 41cb511ad9b4..e0fdb0cefcb8 100644
--- a/llvm/lib/CodeGen/TargetPassConfig.cpp
+++ b/llvm/lib/CodeGen/TargetPassConfig.cpp
@@ -114,6 +114,12 @@ static cl::opt<cl::boolOrDefault>
VerifyMachineCode("verify-machineinstrs", cl::Hidden,
cl::desc("Verify generated machine code"),
cl::ZeroOrMore);
+static cl::opt<cl::boolOrDefault> DebugifyAndStripAll(
+ "debugify-and-strip-all-safe", cl::Hidden,
+ cl::desc(
+ "Debugify MIR before and Strip debug after "
+ "each pass except those known to be unsafe when debug info is present"),
+ cl::ZeroOrMore);
enum RunOutliner { AlwaysOutline, NeverOutline, TargetDefault };
// Enable or disable the MachineOutliner.
static cl::opt<RunOutliner> EnableMachineOutliner(
@@ -466,7 +472,7 @@ bool TargetPassConfig::hasLimitedCodeGenPipeline() {
}
std::string
-TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) const {
+TargetPassConfig::getLimitedCodeGenPipelineReason(const char *Separator) {
if (!hasLimitedCodeGenPipeline())
return std::string();
std::string Res;
@@ -530,17 +536,16 @@ void TargetPassConfig::addPass(Pass *P, bool verifyAfter, bool printAfter) {
if (StopBefore == PassID && StopBeforeCount++ == StopBeforeInstanceNum)
Stopped = true;
if (Started && !Stopped) {
+ if (AddingMachinePasses)
+ addMachinePrePasses();
std::string Banner;
// Construct banner message before PM->add() as that may delete the pass.
if (AddingMachinePasses && (printAfter || verifyAfter))
Banner = std::string("After ") + std::string(P->getPassName());
PM->add(P);
- if (AddingMachinePasses) {
- if (printAfter)
- addPrintPass(Banner);
- if (verifyAfter)
- addVerifyPass(Banner);
- }
+ if (AddingMachinePasses)
+ addMachinePostPasses(Banner, /*AllowPrint*/ printAfter,
+ /*AllowVerify*/ verifyAfter);
// Add the passes after the pass P if there is any.
for (auto IP : Impl->InsertedPasses) {
@@ -606,45 +611,71 @@ void TargetPassConfig::addVerifyPass(const std::string &Banner) {
PM->add(createMachineVerifierPass(Banner));
}
+void TargetPassConfig::addDebugifyPass() {
+ PM->add(createDebugifyMachineModulePass());
+}
+
+void TargetPassConfig::addStripDebugPass() {
+ PM->add(createStripDebugMachineModulePass(/*OnlyDebugified=*/true));
+}
+
+void TargetPassConfig::addMachinePrePasses(bool AllowDebugify) {
+ if (AllowDebugify && DebugifyAndStripAll == cl::BOU_TRUE && DebugifyIsSafe)
+ addDebugifyPass();
+}
+
+void TargetPassConfig::addMachinePostPasses(const std::string &Banner,
+ bool AllowPrint, bool AllowVerify,
+ bool AllowStrip) {
+ if (DebugifyAndStripAll == cl::BOU_TRUE && DebugifyIsSafe)
+ addStripDebugPass();
+ if (AllowPrint)
+ addPrintPass(Banner);
+ if (AllowVerify)
+ addVerifyPass(Banner);
+}
+
/// Add common target configurable passes that perform LLVM IR to IR transforms
/// following machine independent optimization.
void TargetPassConfig::addIRPasses() {
- switch (UseCFLAA) {
- case CFLAAType::Steensgaard:
- addPass(createCFLSteensAAWrapperPass());
- break;
- case CFLAAType::Andersen:
- addPass(createCFLAndersAAWrapperPass());
- break;
- case CFLAAType::Both:
- addPass(createCFLAndersAAWrapperPass());
- addPass(createCFLSteensAAWrapperPass());
- break;
- default:
- break;
- }
-
- // Basic AliasAnalysis support.
- // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
- // BasicAliasAnalysis wins if they disagree. This is intended to help
- // support "obvious" type-punning idioms.
- addPass(createTypeBasedAAWrapperPass());
- addPass(createScopedNoAliasAAWrapperPass());
- addPass(createBasicAAWrapperPass());
-
// Before running any passes, run the verifier to determine if the input
// coming from the front-end and/or optimizer is valid.
if (!DisableVerify)
addPass(createVerifierPass());
- // Run loop strength reduction before anything else.
- if (getOptLevel() != CodeGenOpt::None && !DisableLSR) {
- addPass(createLoopStrengthReducePass());
- if (PrintLSR)
- addPass(createPrintFunctionPass(dbgs(), "\n\n*** Code after LSR ***\n"));
- }
-
if (getOptLevel() != CodeGenOpt::None) {
+ switch (UseCFLAA) {
+ case CFLAAType::Steensgaard:
+ addPass(createCFLSteensAAWrapperPass());
+ break;
+ case CFLAAType::Andersen:
+ addPass(createCFLAndersAAWrapperPass());
+ break;
+ case CFLAAType::Both:
+ addPass(createCFLAndersAAWrapperPass());
+ addPass(createCFLSteensAAWrapperPass());
+ break;
+ default:
+ break;
+ }
+
+ // Basic AliasAnalysis support.
+ // Add TypeBasedAliasAnalysis before BasicAliasAnalysis so that
+ // BasicAliasAnalysis wins if they disagree. This is intended to help
+ // support "obvious" type-punning idioms.
+ addPass(createTypeBasedAAWrapperPass());
+ addPass(createScopedNoAliasAAWrapperPass());
+ addPass(createBasicAAWrapperPass());
+
+ // Run loop strength reduction before anything else.
+ if (!DisableLSR) {
+ addPass(createCanonicalizeFreezeInLoopsPass());
+ addPass(createLoopStrengthReducePass());
+ if (PrintLSR)
+ addPass(createPrintFunctionPass(dbgs(),
+ "\n\n*** Code after LSR ***\n"));
+ }
+
// The MergeICmpsPass tries to create memcmp calls by grouping sequences of
// loads and compares. ExpandMemCmpPass then tries to expand those calls
// into optimally-sized loads and compares. The transforms are enabled by a
@@ -695,18 +726,18 @@ void TargetPassConfig::addPassesToHandleExceptions() {
// removed from the parent invoke(s). This could happen when a landing
// pad is shared by multiple invokes and is also a target of a normal
// edge from elsewhere.
- addPass(createSjLjEHPreparePass());
+ addPass(createSjLjEHPreparePass(TM));
LLVM_FALLTHROUGH;
case ExceptionHandling::DwarfCFI:
case ExceptionHandling::ARM:
- addPass(createDwarfEHPass());
+ addPass(createDwarfEHPass(getOptLevel()));
break;
case ExceptionHandling::WinEH:
// We support using both GCC-style and MSVC-style exceptions on Windows, so
// add both preparation passes. Each pass will only actually run if it
// recognizes the personality function.
addPass(createWinEHPass());
- addPass(createDwarfEHPass());
+ addPass(createDwarfEHPass(getOptLevel()));
break;
case ExceptionHandling::Wasm:
// Wasm EH uses Windows EH instructions, but it does not need to demote PHIs
@@ -785,6 +816,19 @@ bool TargetPassConfig::addCoreISelPasses() {
TM->setGlobalISel(true);
}
+ // FIXME: Injecting into the DAGISel pipeline seems to cause issues with
+ // analyses needing to be re-run. This can result in being unable to
+ // schedule passes (particularly with 'Function Alias Analysis
+ // Results'). It's not entirely clear why but AFAICT this seems to be
+ // due to one FunctionPassManager not being able to use analyses from a
+ // previous one. As we're injecting a ModulePass we break the usual
+ // pass manager into two. GlobalISel with the fallback path disabled
+ // and -run-pass seem to be unaffected. The majority of GlobalISel
+ // testing uses -run-pass so this probably isn't too bad.
+ SaveAndRestore<bool> SavedDebugifyIsSafe(DebugifyIsSafe);
+ if (Selector != SelectorType::GlobalISel || !isGlobalISelAbortEnabled())
+ DebugifyIsSafe = false;
+
// Add instruction selector passes.
if (Selector == SelectorType::GlobalISel) {
SaveAndRestore<bool> SavedAddingMachinePasses(AddingMachinePasses, true);
@@ -892,7 +936,7 @@ void TargetPassConfig::addMachinePasses() {
} else {
// If the target requests it, assign local variables to stack slots relative
// to one another and simplify frame index references where possible.
- addPass(&LocalStackSlotAllocationID, false);
+ addPass(&LocalStackSlotAllocationID);
}
if (TM->Options.EnableIPRA)
@@ -901,6 +945,11 @@ void TargetPassConfig::addMachinePasses() {
// Run pre-ra passes.
addPreRegAlloc();
+ // Debugifying the register allocator passes seems to provoke some
+ // non-determinism that affects CodeGen, and there doesn't seem to be a point
+ // where it becomes safe again, so stop debugifying here.
+ DebugifyIsSafe = false;
+
// Run register allocation and passes that are tightly coupled with it,
// including phi elimination and scheduling.
if (getOptimizeRegAlloc())
@@ -911,6 +960,8 @@ void TargetPassConfig::addMachinePasses() {
// Run post-ra passes.
addPostRegAlloc();
+ addPass(&FixupStatepointCallerSavedID);
+
// Insert prolog/epilog code. Eliminate abstract frame index references...
if (getOptLevel() != CodeGenOpt::None) {
addPass(&PostRAMachineSinkingID);
@@ -956,6 +1007,12 @@ void TargetPassConfig::addMachinePasses() {
if (getOptLevel() != CodeGenOpt::None)
addBlockPlacement();
+ // Insert before XRay Instrumentation.
+ addPass(&FEntryInserterID);
+
+ addPass(&XRayInstrumentationID);
+ addPass(&PatchableFunctionID);
+
addPreEmitPass();
if (TM->Options.EnableIPRA)
@@ -963,17 +1020,13 @@ void TargetPassConfig::addMachinePasses() {
// clobbered registers, to be used to optimize call sites.
addPass(createRegUsageInfoCollector());
+ // FIXME: Some backends are incompatible with running the verifier after
+ // addPreEmitPass. Maybe only pass "false" here for those targets?
addPass(&FuncletLayoutID, false);
addPass(&StackMapLivenessID, false);
addPass(&LiveDebugValuesID, false);
- // Insert before XRay Instrumentation.
- addPass(&FEntryInserterID, false);
-
- addPass(&XRayInstrumentationID, false);
- addPass(&PatchableFunctionID, false);
-
if (TM->Options.EnableMachineOutliner && getOptLevel() != CodeGenOpt::None &&
EnableMachineOutliner != NeverOutline) {
bool RunOnAllFunctions = (EnableMachineOutliner == AlwaysOutline);
@@ -983,6 +1036,9 @@ void TargetPassConfig::addMachinePasses() {
addPass(createMachineOutlinerPass(RunOnAllFunctions));
}
+ if (TM->getBBSectionsType() != llvm::BasicBlockSection::None)
+ addPass(llvm::createBBSectionsPreparePass(TM->getBBSectionsFuncListBuf()));
+
// Add passes that directly emit MI after all other MI passes.
addPreEmitPass2();
@@ -996,15 +1052,15 @@ void TargetPassConfig::addMachineSSAOptimization() {
// Optimize PHIs before DCE: removing dead PHI cycles may make more
// instructions dead.
- addPass(&OptimizePHIsID, false);
+ addPass(&OptimizePHIsID);
// This pass merges large allocas. StackSlotColoring is a different pass
// which merges spill slots.
- addPass(&StackColoringID, false);
+ addPass(&StackColoringID);
// If the target requests it, assign local variables to stack slots relative
// to one another and simplify frame index references where possible.
- addPass(&LocalStackSlotAllocationID, false);
+ addPass(&LocalStackSlotAllocationID);
// With optimization, dead code should already be eliminated. However
// there is one known exception: lowered code for arguments that are only
@@ -1017,8 +1073,8 @@ void TargetPassConfig::addMachineSSAOptimization() {
// loop info, just like LICM and CSE below.
addILPOpts();
- addPass(&EarlyMachineLICMID, false);
- addPass(&MachineCSEID, false);
+ addPass(&EarlyMachineLICMID);
+ addPass(&MachineCSEID);
addPass(&MachineSinkingID);
@@ -1110,6 +1166,7 @@ bool TargetPassConfig::addRegAssignmentOptimized() {
// Finally rewrite virtual registers.
addPass(&VirtRegRewriterID);
+
// Perform stack slot coloring and post-ra machine LICM.
//
// FIXME: Re-enable coloring with register when it's capable of adding
diff --git a/llvm/lib/CodeGen/TargetRegisterInfo.cpp b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
index e5592c31098a..e2ef12d8ac77 100644
--- a/llvm/lib/CodeGen/TargetRegisterInfo.cpp
+++ b/llvm/lib/CodeGen/TargetRegisterInfo.cpp
@@ -13,19 +13,22 @@
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/LiveInterval.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/CodeGen/VirtRegMap.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/Function.h"
#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MachineValueType.h"
@@ -39,6 +42,12 @@
using namespace llvm;
+static cl::opt<unsigned>
+ HugeSizeForSplit("huge-size-for-split", cl::Hidden,
+ cl::desc("A threshold of live range size which may cause "
+ "high compile time cost in global splitting."),
+ cl::init(5000));
+
TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
regclass_iterator RCB, regclass_iterator RCE,
const char *const *SRINames,
@@ -55,8 +64,19 @@ TargetRegisterInfo::TargetRegisterInfo(const TargetRegisterInfoDesc *ID,
TargetRegisterInfo::~TargetRegisterInfo() = default;
-void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet, unsigned Reg)
- const {
+bool TargetRegisterInfo::shouldRegionSplitForVirtReg(
+ const MachineFunction &MF, const LiveInterval &VirtReg) const {
+ const TargetInstrInfo *TII = MF.getSubtarget().getInstrInfo();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineInstr *MI = MRI.getUniqueVRegDef(VirtReg.reg);
+ if (MI && TII->isTriviallyReMaterializable(*MI) &&
+ VirtReg.size() > HugeSizeForSplit)
+ return false;
+ return true;
+}
+
+void TargetRegisterInfo::markSuperRegs(BitVector &RegisterSet,
+ MCRegister Reg) const {
for (MCSuperRegIterator AI(Reg, this, true); AI.isValid(); ++AI)
RegisterSet.set(*AI);
}
@@ -150,7 +170,7 @@ Printable printVRegOrUnit(unsigned Unit, const TargetRegisterInfo *TRI) {
});
}
-Printable printRegClassOrBank(unsigned Reg, const MachineRegisterInfo &RegInfo,
+Printable printRegClassOrBank(Register Reg, const MachineRegisterInfo &RegInfo,
const TargetRegisterInfo *TRI) {
return Printable([Reg, &RegInfo, TRI](raw_ostream &OS) {
if (RegInfo.getRegClassOrNull(Reg))
@@ -187,7 +207,7 @@ TargetRegisterInfo::getAllocatableClass(const TargetRegisterClass *RC) const {
/// register of the given type, picking the most sub register class of
/// the right type that contains this physreg.
const TargetRegisterClass *
-TargetRegisterInfo::getMinimalPhysRegClass(unsigned reg, MVT VT) const {
+TargetRegisterInfo::getMinimalPhysRegClass(MCRegister reg, MVT VT) const {
assert(Register::isPhysicalRegister(reg) &&
"reg must be a physical register");
@@ -379,18 +399,15 @@ bool TargetRegisterInfo::shouldRewriteCopySrc(const TargetRegisterClass *DefRC,
}
// Compute target-independent register allocator hints to help eliminate copies.
-bool
-TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
- SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
- const LiveRegMatrix *Matrix) const {
+bool TargetRegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- const std::pair<unsigned, SmallVector<unsigned, 4>> &Hints_MRI =
+ const std::pair<Register, SmallVector<Register, 4>> &Hints_MRI =
MRI.getRegAllocationHints(VirtReg);
- SmallSet<unsigned, 32> HintedRegs;
+ SmallSet<Register, 32> HintedRegs;
// First hint may be a target hint.
bool Skip = (Hints_MRI.first != 0);
for (auto Reg : Hints_MRI.second) {
@@ -400,8 +417,8 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
}
// Target-independent hints are either a physical or a virtual register.
- unsigned Phys = Reg;
- if (VRM && Register::isVirtualRegister(Phys))
+ Register Phys = Reg;
+ if (VRM && Phys.isVirtual())
Phys = VRM->getPhys(Phys);
// Don't add the same reg twice (Hints_MRI may contain multiple virtual
@@ -409,7 +426,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
if (!HintedRegs.insert(Phys).second)
continue;
// Check that Phys is a valid hint in VirtReg's register class.
- if (!Register::isPhysicalRegister(Phys))
+ if (!Phys.isPhysical())
continue;
if (MRI.isReserved(Phys))
continue;
@@ -426,7 +443,7 @@ TargetRegisterInfo::getRegAllocationHints(unsigned VirtReg,
}
bool TargetRegisterInfo::isCalleeSavedPhysReg(
- unsigned PhysReg, const MachineFunction &MF) const {
+ MCRegister PhysReg, const MachineFunction &MF) const {
if (PhysReg == 0)
return false;
const uint32_t *callerPreservedRegs =
@@ -448,8 +465,8 @@ bool TargetRegisterInfo::needsStackRealignment(
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetFrameLowering *TFI = MF.getSubtarget().getFrameLowering();
const Function &F = MF.getFunction();
- unsigned StackAlign = TFI->getStackAlignment();
- bool requiresRealignment = ((MFI.getMaxAlignment() > StackAlign) ||
+ Align StackAlign = TFI->getStackAlign();
+ bool requiresRealignment = ((MFI.getMaxAlign() > StackAlign) ||
F.hasFnAttribute(Attribute::StackAlignment));
if (F.hasFnAttribute("stackrealign") || requiresRealignment) {
if (canRealignStack(MF))
@@ -469,10 +486,11 @@ bool TargetRegisterInfo::regmaskSubsetEqual(const uint32_t *mask0,
return true;
}
-unsigned TargetRegisterInfo::getRegSizeInBits(unsigned Reg,
- const MachineRegisterInfo &MRI) const {
+unsigned
+TargetRegisterInfo::getRegSizeInBits(Register Reg,
+ const MachineRegisterInfo &MRI) const {
const TargetRegisterClass *RC{};
- if (Register::isPhysicalRegister(Reg)) {
+ if (Reg.isPhysical()) {
// The size is not directly available for physical registers.
// Instead, we need to access a register class that contains Reg and
// get the size of that register class.
@@ -491,15 +509,15 @@ unsigned TargetRegisterInfo::getRegSizeInBits(unsigned Reg,
return getRegSizeInBits(*RC);
}
-unsigned
-TargetRegisterInfo::lookThruCopyLike(unsigned SrcReg,
+Register
+TargetRegisterInfo::lookThruCopyLike(Register SrcReg,
const MachineRegisterInfo *MRI) const {
while (true) {
const MachineInstr *MI = MRI->getVRegDef(SrcReg);
if (!MI->isCopyLike())
return SrcReg;
- unsigned CopySrcReg;
+ Register CopySrcReg;
if (MI->isCopy())
CopySrcReg = MI->getOperand(1).getReg();
else {
@@ -507,7 +525,7 @@ TargetRegisterInfo::lookThruCopyLike(unsigned SrcReg,
CopySrcReg = MI->getOperand(2).getReg();
}
- if (!Register::isVirtualRegister(CopySrcReg))
+ if (!CopySrcReg.isVirtual())
return CopySrcReg;
SrcReg = CopySrcReg;
@@ -516,7 +534,7 @@ TargetRegisterInfo::lookThruCopyLike(unsigned SrcReg,
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD
-void TargetRegisterInfo::dumpReg(unsigned Reg, unsigned SubRegIndex,
+void TargetRegisterInfo::dumpReg(Register Reg, unsigned SubRegIndex,
const TargetRegisterInfo *TRI) {
dbgs() << printReg(Reg, TRI, SubRegIndex) << "\n";
}
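
A standalone restatement of the shouldRegionSplitForVirtReg heuristic added above: skip the expensive region split when a value is trivially rematerializable and its live range exceeds the -huge-size-for-split threshold (5000 by default). The boolean and integer inputs here stand in for the TargetInstrInfo and LiveInterval queries; this is a sketch, not the hook itself.

// Hedged sketch only; the real hook consults TII->isTriviallyReMaterializable
// and LiveInterval::size().
bool shouldRegionSplit(bool IsTriviallyRematerializable, unsigned LiveRangeSize,
                       unsigned HugeSizeForSplit = 5000) {
  if (IsTriviallyRematerializable && LiveRangeSize > HugeSizeForSplit)
    return false; // rematerializing is cheaper than a costly global split
  return true;
}
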
diff --git a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
index 2b1ffab74b6f..de336abe607a 100644
--- a/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
+++ b/llvm/lib/CodeGen/TwoAddressInstructionPass.cpp
@@ -1238,21 +1238,18 @@ bool TwoAddressInstructionPass::tryInstructionCommute(MachineInstr *MI,
Dist)) {
MadeChange = true;
++NumCommuted;
- if (AggressiveCommute) {
+ if (AggressiveCommute)
++NumAggrCommuted;
- // There might be more than two commutable operands, update BaseOp and
- // continue scanning.
- // FIXME: This assumes that the new instruction's operands are in the
- // same positions and were simply swapped.
- BaseOpReg = OtherOpReg;
- BaseOpKilled = OtherOpKilled;
- // Resamples OpsNum in case the number of operands was reduced. This
- // happens with X86.
- OpsNum = MI->getDesc().getNumOperands();
- continue;
- }
- // If this was a commute based on kill, we won't do better continuing.
- return MadeChange;
+
+ // There might be more than two commutable operands, update BaseOp and
+ // continue scanning.
+ // FIXME: This assumes that the new instruction's operands are in the
+ // same positions and were simply swapped.
+ BaseOpReg = OtherOpReg;
+ BaseOpKilled = OtherOpKilled;
+ // Resamples OpsNum in case the number of operands was reduced. This
+ // happens with X86.
+ OpsNum = MI->getDesc().getNumOperands();
}
}
return MadeChange;
@@ -1422,7 +1419,7 @@ tryInstructionTransform(MachineBasicBlock::iterator &mi,
LV->addVirtualRegisterKilled(Reg, *NewMIs[1]);
}
- SmallVector<unsigned, 4> OrigRegs;
+ SmallVector<Register, 4> OrigRegs;
if (LIS) {
for (const MachineOperand &MO : MI.operands()) {
if (MO.isReg())
@@ -1690,6 +1687,10 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &Func) {
// This pass takes the function out of SSA form.
MRI->leaveSSA();
+ // This pass will rewrite the tied-def to meet the RegConstraint.
+ MF->getProperties()
+ .set(MachineFunctionProperties::Property::TiedOpsRewritten);
+
TiedOperandMap TiedOperands;
for (MachineFunction::iterator MBBI = MF->begin(), MBBE = MF->end();
MBBI != MBBE; ++MBBI) {
@@ -1805,7 +1806,7 @@ eliminateRegSequence(MachineBasicBlock::iterator &MBBI) {
llvm_unreachable(nullptr);
}
- SmallVector<unsigned, 4> OrigRegs;
+ SmallVector<Register, 4> OrigRegs;
if (LIS) {
OrigRegs.push_back(MI.getOperand(0).getReg());
for (unsigned i = 1, e = MI.getNumOperands(); i < e; i += 2)
diff --git a/llvm/lib/CodeGen/TypePromotion.cpp b/llvm/lib/CodeGen/TypePromotion.cpp
index 4522484222f5..807babdcaf25 100644
--- a/llvm/lib/CodeGen/TypePromotion.cpp
+++ b/llvm/lib/CodeGen/TypePromotion.cpp
@@ -40,6 +40,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
#define DEBUG_TYPE "type-promotion"
#define PASS_NAME "Type Promotion"
@@ -847,8 +848,7 @@ bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) {
// Iterate through, and add to, a tree of operands and users in the use-def.
while (!WorkList.empty()) {
- Value *V = WorkList.back();
- WorkList.pop_back();
+ Value *V = WorkList.pop_back_val();
if (CurrentVisited.count(V))
continue;
@@ -917,7 +917,7 @@ bool TypePromotion::TryToPromote(Value *V, unsigned PromotedWidth) {
++ToPromote;
}
- // DAG optimisations should be able to handle these cases better, especially
+ // DAG optimizations should be able to handle these cases better, especially
// for function arguments.
if (ToPromote < 2 || (Blocks.size() == 1 && (NonFreeArgs > SafeWrap.size())))
return false;
@@ -941,6 +941,9 @@ bool TypePromotion::runOnFunction(Function &F) {
if (!TPC)
return false;
+ AllVisited.clear();
+ SafeToPromote.clear();
+ SafeWrap.clear();
bool MadeChange = false;
const DataLayout &DL = F.getParent()->getDataLayout();
const TargetMachine &TM = TPC->getTM<TargetMachine>();
@@ -998,6 +1001,10 @@ bool TypePromotion::runOnFunction(Function &F) {
if (MadeChange)
LLVM_DEBUG(dbgs() << "After TypePromotion: " << F << "\n");
+ AllVisited.clear();
+ SafeToPromote.clear();
+ SafeWrap.clear();
+
return MadeChange;
}
diff --git a/llvm/lib/CodeGen/UnreachableBlockElim.cpp b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
index b770e1d94488..f5dc589a98cb 100644
--- a/llvm/lib/CodeGen/UnreachableBlockElim.cpp
+++ b/llvm/lib/CodeGen/UnreachableBlockElim.cpp
@@ -81,7 +81,7 @@ namespace {
class UnreachableMachineBlockElim : public MachineFunctionPass {
bool runOnMachineFunction(MachineFunction &F) override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
- MachineModuleInfo *MMI;
+
public:
static char ID; // Pass identification, replacement for typeid
UnreachableMachineBlockElim() : MachineFunctionPass(ID) {}
@@ -104,8 +104,6 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
df_iterator_default_set<MachineBasicBlock*> Reachable;
bool ModifiedPHI = false;
- auto *MMIWP = getAnalysisIfAvailable<MachineModuleInfoWrapperPass>();
- MMI = MMIWP ? &MMIWP->getMMI() : nullptr;
MachineDominatorTree *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
MachineLoopInfo *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
@@ -151,7 +149,7 @@ bool UnreachableMachineBlockElim::runOnMachineFunction(MachineFunction &F) {
for (unsigned i = 0, e = DeadBlocks.size(); i != e; ++i) {
// Remove any call site information for calls in the block.
for (auto &I : DeadBlocks[i]->instrs())
- if (I.isCall(MachineInstr::IgnoreBundle))
+ if (I.shouldUpdateCallSiteInfo())
DeadBlocks[i]->getParent()->eraseCallSiteInfo(&I);
DeadBlocks[i]->eraseFromParent();
diff --git a/llvm/lib/CodeGen/ValueTypes.cpp b/llvm/lib/CodeGen/ValueTypes.cpp
index 41cbdf035558..66bcdd9b2c4a 100644
--- a/llvm/lib/CodeGen/ValueTypes.cpp
+++ b/llvm/lib/CodeGen/ValueTypes.cpp
@@ -22,7 +22,13 @@ EVT EVT::changeExtendedTypeToInteger() const {
EVT EVT::changeExtendedVectorElementTypeToInteger() const {
LLVMContext &Context = LLVMTy->getContext();
EVT IntTy = getIntegerVT(Context, getScalarSizeInBits());
- return getVectorVT(Context, IntTy, getVectorNumElements());
+ return getVectorVT(Context, IntTy, getVectorNumElements(),
+ isScalableVector());
+}
+
+EVT EVT::changeExtendedVectorElementType(EVT EltVT) const {
+ LLVMContext &Context = LLVMTy->getContext();
+ return getVectorVT(Context, EltVT, getVectorElementCount());
}
EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) {
@@ -32,10 +38,19 @@ EVT EVT::getExtendedIntegerVT(LLVMContext &Context, unsigned BitWidth) {
return VT;
}
-EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT,
- unsigned NumElements) {
+EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, unsigned NumElements,
+ bool IsScalable) {
+ EVT ResultVT;
+ ResultVT.LLVMTy =
+ VectorType::get(VT.getTypeForEVT(Context), NumElements, IsScalable);
+ assert(ResultVT.isExtended() && "Type is not extended!");
+ return ResultVT;
+}
+
+EVT EVT::getExtendedVectorVT(LLVMContext &Context, EVT VT, ElementCount EC) {
EVT ResultVT;
- ResultVT.LLVMTy = VectorType::get(VT.getTypeForEVT(Context), NumElements);
+ ResultVT.LLVMTy =
+ VectorType::get(VT.getTypeForEVT(Context), {EC.Min, EC.Scalable});
assert(ResultVT.isExtended() && "Type is not extended!");
return ResultVT;
}
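
The ElementCount plumbing above distinguishes fixed-width from scalable vectors. A minimal stand-in to illustrate the Min/Scalable pair and the resulting type strings; the struct below is illustrative, not llvm::ElementCount.

#include <string>

struct ElemCount {
  unsigned Min;   // minimum number of lanes
  bool Scalable;  // true => actual lane count is Min scaled by the runtime vscale
};

// Mirrors the naming used by EVT::getEVTString(): "v4i32" for a fixed 4 x i32
// vector, "nxv4i32" for a scalable one.
std::string vectorTypeName(ElemCount EC, const std::string &EltName) {
  return (EC.Scalable ? "nxv" : "v") + std::to_string(EC.Min) + EltName;
}
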
@@ -92,6 +107,14 @@ bool EVT::isExtended2048BitVector() const {
return isExtendedVector() && getExtendedSizeInBits() == 2048;
}
+bool EVT::isExtendedFixedLengthVector() const {
+ return isExtendedVector() && isa<FixedVectorType>(LLVMTy);
+}
+
+bool EVT::isExtendedScalableVector() const {
+ return isExtendedVector() && isa<ScalableVectorType>(LLVMTy);
+}
+
EVT EVT::getExtendedVectorElementType() const {
assert(isExtended() && "Type is not extended!");
return EVT::getEVT(cast<VectorType>(LLVMTy)->getElementType());
@@ -99,7 +122,19 @@ EVT EVT::getExtendedVectorElementType() const {
unsigned EVT::getExtendedVectorNumElements() const {
assert(isExtended() && "Type is not extended!");
- return cast<VectorType>(LLVMTy)->getNumElements();
+ ElementCount EC = cast<VectorType>(LLVMTy)->getElementCount();
+ if (EC.Scalable) {
+ WithColor::warning()
+ << "The code that requested the fixed number of elements has made the "
+ "assumption that this vector is not scalable. This assumption was "
+ "not correct, and this may lead to broken code\n";
+ }
+ return EC.Min;
+}
+
+ElementCount EVT::getExtendedVectorElementCount() const {
+ assert(isExtended() && "Type is not extended!");
+ return cast<VectorType>(LLVMTy)->getElementCount();
}
TypeSize EVT::getExtendedSizeInBits() const {
@@ -116,13 +151,15 @@ std::string EVT::getEVTString() const {
switch (V.SimpleTy) {
default:
if (isVector())
- return (isScalableVector() ? "nxv" : "v") + utostr(getVectorNumElements())
+ return (isScalableVector() ? "nxv" : "v")
+ + utostr(getVectorElementCount().Min)
+ getVectorElementType().getEVTString();
if (isInteger())
return "i" + utostr(getSizeInBits());
if (isFloatingPoint())
return "f" + utostr(getSizeInBits());
llvm_unreachable("Invalid EVT!");
+ case MVT::bf16: return "bf16";
case MVT::ppcf128: return "ppcf128";
case MVT::isVoid: return "isVoid";
case MVT::Other: return "ch";
@@ -150,170 +187,285 @@ Type *EVT::getTypeForEVT(LLVMContext &Context) const {
case MVT::i64: return Type::getInt64Ty(Context);
case MVT::i128: return IntegerType::get(Context, 128);
case MVT::f16: return Type::getHalfTy(Context);
+ case MVT::bf16: return Type::getBFloatTy(Context);
case MVT::f32: return Type::getFloatTy(Context);
case MVT::f64: return Type::getDoubleTy(Context);
case MVT::f80: return Type::getX86_FP80Ty(Context);
case MVT::f128: return Type::getFP128Ty(Context);
case MVT::ppcf128: return Type::getPPC_FP128Ty(Context);
case MVT::x86mmx: return Type::getX86_MMXTy(Context);
- case MVT::v1i1: return VectorType::get(Type::getInt1Ty(Context), 1);
- case MVT::v2i1: return VectorType::get(Type::getInt1Ty(Context), 2);
- case MVT::v4i1: return VectorType::get(Type::getInt1Ty(Context), 4);
- case MVT::v8i1: return VectorType::get(Type::getInt1Ty(Context), 8);
- case MVT::v16i1: return VectorType::get(Type::getInt1Ty(Context), 16);
- case MVT::v32i1: return VectorType::get(Type::getInt1Ty(Context), 32);
- case MVT::v64i1: return VectorType::get(Type::getInt1Ty(Context), 64);
- case MVT::v128i1: return VectorType::get(Type::getInt1Ty(Context), 128);
- case MVT::v256i1: return VectorType::get(Type::getInt1Ty(Context), 256);
- case MVT::v512i1: return VectorType::get(Type::getInt1Ty(Context), 512);
- case MVT::v1024i1: return VectorType::get(Type::getInt1Ty(Context), 1024);
- case MVT::v1i8: return VectorType::get(Type::getInt8Ty(Context), 1);
- case MVT::v2i8: return VectorType::get(Type::getInt8Ty(Context), 2);
- case MVT::v4i8: return VectorType::get(Type::getInt8Ty(Context), 4);
- case MVT::v8i8: return VectorType::get(Type::getInt8Ty(Context), 8);
- case MVT::v16i8: return VectorType::get(Type::getInt8Ty(Context), 16);
- case MVT::v32i8: return VectorType::get(Type::getInt8Ty(Context), 32);
- case MVT::v64i8: return VectorType::get(Type::getInt8Ty(Context), 64);
- case MVT::v128i8: return VectorType::get(Type::getInt8Ty(Context), 128);
- case MVT::v256i8: return VectorType::get(Type::getInt8Ty(Context), 256);
- case MVT::v1i16: return VectorType::get(Type::getInt16Ty(Context), 1);
- case MVT::v2i16: return VectorType::get(Type::getInt16Ty(Context), 2);
- case MVT::v3i16: return VectorType::get(Type::getInt16Ty(Context), 3);
- case MVT::v4i16: return VectorType::get(Type::getInt16Ty(Context), 4);
- case MVT::v8i16: return VectorType::get(Type::getInt16Ty(Context), 8);
- case MVT::v16i16: return VectorType::get(Type::getInt16Ty(Context), 16);
- case MVT::v32i16: return VectorType::get(Type::getInt16Ty(Context), 32);
- case MVT::v64i16: return VectorType::get(Type::getInt16Ty(Context), 64);
- case MVT::v128i16: return VectorType::get(Type::getInt16Ty(Context), 128);
- case MVT::v1i32: return VectorType::get(Type::getInt32Ty(Context), 1);
- case MVT::v2i32: return VectorType::get(Type::getInt32Ty(Context), 2);
- case MVT::v3i32: return VectorType::get(Type::getInt32Ty(Context), 3);
- case MVT::v4i32: return VectorType::get(Type::getInt32Ty(Context), 4);
- case MVT::v5i32: return VectorType::get(Type::getInt32Ty(Context), 5);
- case MVT::v8i32: return VectorType::get(Type::getInt32Ty(Context), 8);
- case MVT::v16i32: return VectorType::get(Type::getInt32Ty(Context), 16);
- case MVT::v32i32: return VectorType::get(Type::getInt32Ty(Context), 32);
- case MVT::v64i32: return VectorType::get(Type::getInt32Ty(Context), 64);
- case MVT::v128i32: return VectorType::get(Type::getInt32Ty(Context), 128);
- case MVT::v256i32: return VectorType::get(Type::getInt32Ty(Context), 256);
- case MVT::v512i32: return VectorType::get(Type::getInt32Ty(Context), 512);
- case MVT::v1024i32:return VectorType::get(Type::getInt32Ty(Context), 1024);
- case MVT::v2048i32:return VectorType::get(Type::getInt32Ty(Context), 2048);
- case MVT::v1i64: return VectorType::get(Type::getInt64Ty(Context), 1);
- case MVT::v2i64: return VectorType::get(Type::getInt64Ty(Context), 2);
- case MVT::v4i64: return VectorType::get(Type::getInt64Ty(Context), 4);
- case MVT::v8i64: return VectorType::get(Type::getInt64Ty(Context), 8);
- case MVT::v16i64: return VectorType::get(Type::getInt64Ty(Context), 16);
- case MVT::v32i64: return VectorType::get(Type::getInt64Ty(Context), 32);
- case MVT::v1i128: return VectorType::get(Type::getInt128Ty(Context), 1);
- case MVT::v2f16: return VectorType::get(Type::getHalfTy(Context), 2);
- case MVT::v3f16: return VectorType::get(Type::getHalfTy(Context), 3);
- case MVT::v4f16: return VectorType::get(Type::getHalfTy(Context), 4);
- case MVT::v8f16: return VectorType::get(Type::getHalfTy(Context), 8);
- case MVT::v16f16: return VectorType::get(Type::getHalfTy(Context), 16);
- case MVT::v32f16: return VectorType::get(Type::getHalfTy(Context), 32);
- case MVT::v1f32: return VectorType::get(Type::getFloatTy(Context), 1);
- case MVT::v2f32: return VectorType::get(Type::getFloatTy(Context), 2);
- case MVT::v3f32: return VectorType::get(Type::getFloatTy(Context), 3);
- case MVT::v4f32: return VectorType::get(Type::getFloatTy(Context), 4);
- case MVT::v5f32: return VectorType::get(Type::getFloatTy(Context), 5);
- case MVT::v8f32: return VectorType::get(Type::getFloatTy(Context), 8);
- case MVT::v16f32: return VectorType::get(Type::getFloatTy(Context), 16);
- case MVT::v32f32: return VectorType::get(Type::getFloatTy(Context), 32);
- case MVT::v64f32: return VectorType::get(Type::getFloatTy(Context), 64);
- case MVT::v128f32: return VectorType::get(Type::getFloatTy(Context), 128);
- case MVT::v256f32: return VectorType::get(Type::getFloatTy(Context), 256);
- case MVT::v512f32: return VectorType::get(Type::getFloatTy(Context), 512);
- case MVT::v1024f32:return VectorType::get(Type::getFloatTy(Context), 1024);
- case MVT::v2048f32:return VectorType::get(Type::getFloatTy(Context), 2048);
- case MVT::v1f64: return VectorType::get(Type::getDoubleTy(Context), 1);
- case MVT::v2f64: return VectorType::get(Type::getDoubleTy(Context), 2);
- case MVT::v4f64: return VectorType::get(Type::getDoubleTy(Context), 4);
- case MVT::v8f64: return VectorType::get(Type::getDoubleTy(Context), 8);
- case MVT::nxv1i1:
- return VectorType::get(Type::getInt1Ty(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2i1:
- return VectorType::get(Type::getInt1Ty(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4i1:
- return VectorType::get(Type::getInt1Ty(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8i1:
- return VectorType::get(Type::getInt1Ty(Context), 8, /*Scalable=*/ true);
- case MVT::nxv16i1:
- return VectorType::get(Type::getInt1Ty(Context), 16, /*Scalable=*/ true);
- case MVT::nxv32i1:
- return VectorType::get(Type::getInt1Ty(Context), 32, /*Scalable=*/ true);
- case MVT::nxv1i8:
- return VectorType::get(Type::getInt8Ty(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2i8:
- return VectorType::get(Type::getInt8Ty(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4i8:
- return VectorType::get(Type::getInt8Ty(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8i8:
- return VectorType::get(Type::getInt8Ty(Context), 8, /*Scalable=*/ true);
- case MVT::nxv16i8:
- return VectorType::get(Type::getInt8Ty(Context), 16, /*Scalable=*/ true);
- case MVT::nxv32i8:
- return VectorType::get(Type::getInt8Ty(Context), 32, /*Scalable=*/ true);
- case MVT::nxv1i16:
- return VectorType::get(Type::getInt16Ty(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2i16:
- return VectorType::get(Type::getInt16Ty(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4i16:
- return VectorType::get(Type::getInt16Ty(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8i16:
- return VectorType::get(Type::getInt16Ty(Context), 8, /*Scalable=*/ true);
+ case MVT::v1i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 1);
+ case MVT::v2i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 2);
+ case MVT::v4i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 4);
+ case MVT::v8i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 8);
+ case MVT::v16i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 16);
+ case MVT::v32i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 32);
+ case MVT::v64i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 64);
+ case MVT::v128i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 128);
+ case MVT::v256i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 256);
+ case MVT::v512i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 512);
+ case MVT::v1024i1:
+ return FixedVectorType::get(Type::getInt1Ty(Context), 1024);
+ case MVT::v1i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 1);
+ case MVT::v2i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 2);
+ case MVT::v4i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 4);
+ case MVT::v8i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 8);
+ case MVT::v16i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 16);
+ case MVT::v32i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 32);
+ case MVT::v64i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 64);
+ case MVT::v128i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 128);
+ case MVT::v256i8:
+ return FixedVectorType::get(Type::getInt8Ty(Context), 256);
+ case MVT::v1i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 1);
+ case MVT::v2i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 2);
+ case MVT::v3i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 3);
+ case MVT::v4i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 4);
+ case MVT::v8i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 8);
+ case MVT::v16i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 16);
+ case MVT::v32i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 32);
+ case MVT::v64i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 64);
+ case MVT::v128i16:
+ return FixedVectorType::get(Type::getInt16Ty(Context), 128);
+ case MVT::v1i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 1);
+ case MVT::v2i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 2);
+ case MVT::v3i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 3);
+ case MVT::v4i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+ case MVT::v5i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 5);
+ case MVT::v8i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 8);
+ case MVT::v16i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 16);
+ case MVT::v32i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 32);
+ case MVT::v64i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 64);
+ case MVT::v128i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 128);
+ case MVT::v256i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 256);
+ case MVT::v512i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 512);
+ case MVT::v1024i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 1024);
+ case MVT::v2048i32:
+ return FixedVectorType::get(Type::getInt32Ty(Context), 2048);
+ case MVT::v1i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 1);
+ case MVT::v2i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 2);
+ case MVT::v4i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 4);
+ case MVT::v8i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 8);
+ case MVT::v16i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 16);
+ case MVT::v32i64:
+ return FixedVectorType::get(Type::getInt64Ty(Context), 32);
+ case MVT::v1i128:
+ return FixedVectorType::get(Type::getInt128Ty(Context), 1);
+ case MVT::v2f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 2);
+ case MVT::v3f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 3);
+ case MVT::v4f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 4);
+ case MVT::v8f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 8);
+ case MVT::v16f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 16);
+ case MVT::v32f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 32);
+ case MVT::v64f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 64);
+ case MVT::v128f16:
+ return FixedVectorType::get(Type::getHalfTy(Context), 128);
+ case MVT::v2bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 2);
+ case MVT::v3bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 3);
+ case MVT::v4bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 4);
+ case MVT::v8bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 8);
+ case MVT::v16bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 16);
+ case MVT::v32bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 32);
+ case MVT::v64bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 64);
+ case MVT::v128bf16:
+ return FixedVectorType::get(Type::getBFloatTy(Context), 128);
+ case MVT::v1f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 1);
+ case MVT::v2f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 2);
+ case MVT::v3f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 3);
+ case MVT::v4f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 4);
+ case MVT::v5f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 5);
+ case MVT::v8f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 8);
+ case MVT::v16f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 16);
+ case MVT::v32f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 32);
+ case MVT::v64f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 64);
+ case MVT::v128f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 128);
+ case MVT::v256f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 256);
+ case MVT::v512f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 512);
+ case MVT::v1024f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 1024);
+ case MVT::v2048f32:
+ return FixedVectorType::get(Type::getFloatTy(Context), 2048);
+ case MVT::v1f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 1);
+ case MVT::v2f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 2);
+ case MVT::v4f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 4);
+ case MVT::v8f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 8);
+ case MVT::v16f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 16);
+ case MVT::v32f64:
+ return FixedVectorType::get(Type::getDoubleTy(Context), 32);
+ case MVT::nxv1i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 1);
+ case MVT::nxv2i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 2);
+ case MVT::nxv4i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 4);
+ case MVT::nxv8i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 8);
+ case MVT::nxv16i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 16);
+ case MVT::nxv32i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 32);
+ case MVT::nxv64i1:
+ return ScalableVectorType::get(Type::getInt1Ty(Context), 64);
+ case MVT::nxv1i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 1);
+ case MVT::nxv2i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 2);
+ case MVT::nxv4i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 4);
+ case MVT::nxv8i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 8);
+ case MVT::nxv16i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 16);
+ case MVT::nxv32i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 32);
+ case MVT::nxv64i8:
+ return ScalableVectorType::get(Type::getInt8Ty(Context), 64);
+ case MVT::nxv1i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 1);
+ case MVT::nxv2i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 2);
+ case MVT::nxv4i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 4);
+ case MVT::nxv8i16:
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 8);
case MVT::nxv16i16:
- return VectorType::get(Type::getInt16Ty(Context), 16, /*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 16);
case MVT::nxv32i16:
- return VectorType::get(Type::getInt16Ty(Context), 32, /*Scalable=*/ true);
- case MVT::nxv1i32:
- return VectorType::get(Type::getInt32Ty(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2i32:
- return VectorType::get(Type::getInt32Ty(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4i32:
- return VectorType::get(Type::getInt32Ty(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8i32:
- return VectorType::get(Type::getInt32Ty(Context), 8, /*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getInt16Ty(Context), 32);
+ case MVT::nxv1i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 1);
+ case MVT::nxv2i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 2);
+ case MVT::nxv4i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 4);
+ case MVT::nxv8i32:
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 8);
case MVT::nxv16i32:
- return VectorType::get(Type::getInt32Ty(Context), 16,/*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 16);
case MVT::nxv32i32:
- return VectorType::get(Type::getInt32Ty(Context), 32,/*Scalable=*/ true);
- case MVT::nxv1i64:
- return VectorType::get(Type::getInt64Ty(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2i64:
- return VectorType::get(Type::getInt64Ty(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4i64:
- return VectorType::get(Type::getInt64Ty(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8i64:
- return VectorType::get(Type::getInt64Ty(Context), 8, /*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getInt32Ty(Context), 32);
+ case MVT::nxv1i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 1);
+ case MVT::nxv2i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 2);
+ case MVT::nxv4i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 4);
+ case MVT::nxv8i64:
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 8);
case MVT::nxv16i64:
- return VectorType::get(Type::getInt64Ty(Context), 16, /*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 16);
case MVT::nxv32i64:
- return VectorType::get(Type::getInt64Ty(Context), 32, /*Scalable=*/ true);
- case MVT::nxv2f16:
- return VectorType::get(Type::getHalfTy(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4f16:
- return VectorType::get(Type::getHalfTy(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8f16:
- return VectorType::get(Type::getHalfTy(Context), 8, /*Scalable=*/ true);
- case MVT::nxv1f32:
- return VectorType::get(Type::getFloatTy(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2f32:
- return VectorType::get(Type::getFloatTy(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4f32:
- return VectorType::get(Type::getFloatTy(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8f32:
- return VectorType::get(Type::getFloatTy(Context), 8, /*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getInt64Ty(Context), 32);
+ case MVT::nxv1f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 1);
+ case MVT::nxv2f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 2);
+ case MVT::nxv4f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 4);
+ case MVT::nxv8f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 8);
+ case MVT::nxv16f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 16);
+ case MVT::nxv32f16:
+ return ScalableVectorType::get(Type::getHalfTy(Context), 32);
+ case MVT::nxv2bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 2);
+ case MVT::nxv4bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 4);
+ case MVT::nxv8bf16:
+ return ScalableVectorType::get(Type::getBFloatTy(Context), 8);
+ case MVT::nxv1f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 1);
+ case MVT::nxv2f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 2);
+ case MVT::nxv4f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 4);
+ case MVT::nxv8f32:
+ return ScalableVectorType::get(Type::getFloatTy(Context), 8);
case MVT::nxv16f32:
- return VectorType::get(Type::getFloatTy(Context), 16, /*Scalable=*/ true);
- case MVT::nxv1f64:
- return VectorType::get(Type::getDoubleTy(Context), 1, /*Scalable=*/ true);
- case MVT::nxv2f64:
- return VectorType::get(Type::getDoubleTy(Context), 2, /*Scalable=*/ true);
- case MVT::nxv4f64:
- return VectorType::get(Type::getDoubleTy(Context), 4, /*Scalable=*/ true);
- case MVT::nxv8f64:
- return VectorType::get(Type::getDoubleTy(Context), 8, /*Scalable=*/ true);
+ return ScalableVectorType::get(Type::getFloatTy(Context), 16);
+ case MVT::nxv1f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 1);
+ case MVT::nxv2f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 2);
+ case MVT::nxv4f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 4);
+ case MVT::nxv8f64:
+ return ScalableVectorType::get(Type::getDoubleTy(Context), 8);
case MVT::Metadata: return Type::getMetadataTy(Context);
}
}
@@ -331,6 +483,7 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
case Type::IntegerTyID:
return getIntegerVT(cast<IntegerType>(Ty)->getBitWidth());
case Type::HalfTyID: return MVT(MVT::f16);
+ case Type::BFloatTyID: return MVT(MVT::bf16);
case Type::FloatTyID: return MVT(MVT::f32);
case Type::DoubleTyID: return MVT(MVT::f64);
case Type::X86_FP80TyID: return MVT(MVT::f80);
@@ -338,7 +491,8 @@ MVT MVT::getVT(Type *Ty, bool HandleUnknown){
case Type::FP128TyID: return MVT(MVT::f128);
case Type::PPC_FP128TyID: return MVT(MVT::ppcf128);
case Type::PointerTyID: return MVT(MVT::iPTR);
- case Type::VectorTyID: {
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
VectorType *VTy = cast<VectorType>(Ty);
return getVectorVT(
getVT(VTy->getElementType(), /*HandleUnknown=*/ false),
@@ -356,7 +510,8 @@ EVT EVT::getEVT(Type *Ty, bool HandleUnknown){
return MVT::getVT(Ty, HandleUnknown);
case Type::IntegerTyID:
return getIntegerVT(Ty->getContext(), cast<IntegerType>(Ty)->getBitWidth());
- case Type::VectorTyID: {
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
VectorType *VTy = cast<VectorType>(Ty);
return getVectorVT(Ty->getContext(),
getEVT(VTy->getElementType(), /*HandleUnknown=*/ false),
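The ValueTypes.cpp hunks above replace the old VectorType::get(..., /*Scalable=*/true) calls with the dedicated FixedVectorType and ScalableVectorType classes, so the scalable/fixed distinction is carried by the type itself rather than a boolean flag. A minimal sketch of that dispatch, assuming the ElementCount fields (Min, Scalable) used earlier in the patch; getVectorTypeFor is an illustrative helper, not part of the change:

    #include "llvm/IR/DerivedTypes.h"  // FixedVectorType, ScalableVectorType
    #include "llvm/Support/TypeSize.h" // ElementCount

    using namespace llvm;

    // Illustrative helper: pick the IR vector type for an element type and an
    // ElementCount, mirroring how getTypeForEVT now distinguishes the two cases.
    static VectorType *getVectorTypeFor(Type *EltTy, ElementCount EC) {
      if (EC.Scalable)                              // <vscale x N x ty>
        return ScalableVectorType::get(EltTy, EC.Min);
      return FixedVectorType::get(EltTy, EC.Min);   // <N x ty>
    }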
diff --git a/llvm/lib/CodeGen/VirtRegMap.cpp b/llvm/lib/CodeGen/VirtRegMap.cpp
index 5312e2eea96b..2c83f13b651b 100644
--- a/llvm/lib/CodeGen/VirtRegMap.cpp
+++ b/llvm/lib/CodeGen/VirtRegMap.cpp
@@ -92,8 +92,8 @@ void VirtRegMap::assignVirt2Phys(Register virtReg, MCPhysReg physReg) {
unsigned VirtRegMap::createSpillSlot(const TargetRegisterClass *RC) {
unsigned Size = TRI->getSpillSize(*RC);
- unsigned Align = TRI->getSpillAlignment(*RC);
- int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Align);
+ Align Alignment = TRI->getSpillAlign(*RC);
+ int SS = MF->getFrameInfo().CreateSpillStackObject(Size, Alignment);
++NumSpillSlots;
return SS;
}
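The VirtRegMap.cpp hunk is part of the wider migration from raw unsigned alignments to the llvm::Align wrapper: TRI->getSpillAlign returns an Align, which CreateSpillStackObject now accepts directly. A small sketch of why the wrapper is preferable, assuming the Support/Alignment.h API (roundedSpillSize is an illustrative name):

    #include <cstdint>
    #include "llvm/Support/Alignment.h"

    // Align validates at construction that the value is a power of two, so a
    // bogus alignment fails fast instead of silently flowing into frame layout.
    uint64_t roundedSpillSize(uint64_t Size) {
      llvm::Align A(16);             // 16-byte spill slot alignment
      return llvm::alignTo(Size, A); // e.g. alignTo(30, A) == 32
    }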
diff --git a/llvm/lib/CodeGen/WasmEHPrepare.cpp b/llvm/lib/CodeGen/WasmEHPrepare.cpp
index 1582f12ad580..44f4fe2ff9b1 100644
--- a/llvm/lib/CodeGen/WasmEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WasmEHPrepare.cpp
@@ -77,9 +77,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/BreadthFirstIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -118,14 +120,17 @@ class WasmEHPrepare : public FunctionPass {
bool prepareEHPads(Function &F);
bool prepareThrows(Function &F);
- void prepareEHPad(BasicBlock *BB, bool NeedLSDA, unsigned Index = 0);
+ bool IsEHPadFunctionsSetUp = false;
+ void setupEHPadFunctions(Function &F);
+ void prepareEHPad(BasicBlock *BB, bool NeedPersonality, bool NeedLSDA = false,
+ unsigned Index = 0);
void prepareTerminateCleanupPad(BasicBlock *BB);
public:
static char ID; // Pass identification, replacement for typeid
WasmEHPrepare() : FunctionPass(ID) {}
-
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
bool doInitialization(Module &M) override;
bool runOnFunction(Function &F) override;
@@ -136,11 +141,18 @@ public:
} // end anonymous namespace
char WasmEHPrepare::ID = 0;
-INITIALIZE_PASS(WasmEHPrepare, DEBUG_TYPE, "Prepare WebAssembly exceptions",
- false, false)
+INITIALIZE_PASS_BEGIN(WasmEHPrepare, DEBUG_TYPE,
+ "Prepare WebAssembly exceptions", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(WasmEHPrepare, DEBUG_TYPE, "Prepare WebAssembly exceptions",
+ false, false)
FunctionPass *llvm::createWasmEHPass() { return new WasmEHPrepare(); }
+void WasmEHPrepare::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTreeWrapperPass>();
+}
+
bool WasmEHPrepare::doInitialization(Module &M) {
IRBuilder<> IRB(M.getContext());
LPadContextTy = StructType::get(IRB.getInt32Ty(), // lpad_index
@@ -153,18 +165,19 @@ bool WasmEHPrepare::doInitialization(Module &M) {
// Erase the specified BBs if the BB does not have any remaining predecessors,
// and also all its dead children.
template <typename Container>
-static void eraseDeadBBsAndChildren(const Container &BBs) {
+static void eraseDeadBBsAndChildren(const Container &BBs, DomTreeUpdater *DTU) {
SmallVector<BasicBlock *, 8> WL(BBs.begin(), BBs.end());
while (!WL.empty()) {
auto *BB = WL.pop_back_val();
if (pred_begin(BB) != pred_end(BB))
continue;
WL.append(succ_begin(BB), succ_end(BB));
- DeleteDeadBlock(BB);
+ DeleteDeadBlock(BB, DTU);
}
}
bool WasmEHPrepare::runOnFunction(Function &F) {
+ IsEHPadFunctionsSetUp = false;
bool Changed = false;
Changed |= prepareThrows(F);
Changed |= prepareEHPads(F);
@@ -172,6 +185,9 @@ bool WasmEHPrepare::runOnFunction(Function &F) {
}
bool WasmEHPrepare::prepareThrows(Function &F) {
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ DomTreeUpdater DTU(&DT, /*PostDominatorTree*/ nullptr,
+ DomTreeUpdater::UpdateStrategy::Eager);
Module &M = *F.getParent();
IRBuilder<> IRB(F.getContext());
bool Changed = false;
@@ -194,30 +210,102 @@ bool WasmEHPrepare::prepareThrows(Function &F) {
InstList.erase(std::next(BasicBlock::iterator(ThrowI)), InstList.end());
IRB.SetInsertPoint(BB);
IRB.CreateUnreachable();
- eraseDeadBBsAndChildren(Succs);
+ eraseDeadBBsAndChildren(Succs, &DTU);
}
return Changed;
}
bool WasmEHPrepare::prepareEHPads(Function &F) {
- Module &M = *F.getParent();
- IRBuilder<> IRB(F.getContext());
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ bool Changed = false;
- SmallVector<BasicBlock *, 16> CatchPads;
- SmallVector<BasicBlock *, 16> CleanupPads;
- for (BasicBlock &BB : F) {
- if (!BB.isEHPad())
+ // There are two things to decide: whether we need a personality function call
+ // and whether we need a `wasm.lsda()` call and its store.
+ //
+ // For the personality function call, catchpads with `catch (...)` and
+ // cleanuppads don't need it, because exceptions are always caught. Others all
+ // need it.
+ //
+ // For `wasm.lsda()` and its store, in order to minimize the number of them,
+ // we need a way to figure out whether we have encountered a `wasm.lsda()`
+ // call in any of the EH pads that dominate the current EH pad. To figure
+ // that out, we now visit EH pads in BFS order in the dominator tree so that
+ // we visit parent BBs before visiting their child BBs in the domtree.
+ //
+ // We keep a set named `ExecutedLSDA`, which basically means "Do we have a
+ // `wasm.lsda()` call either in the current EH pad or in any of its parent EH
+ // pads in the dominator tree?". This prevents scanning the domtree up to the
+ // root every time we examine an EH pad; in the worst case, each EH pad only
+ // needs to check its immediate parent EH pad.
+ //
+ // - If any of its parent EH pads in the domtree has `wasm.lsda()`, this means
+ //   we don't need `wasm.lsda()` in the current EH pad. We also insert the
+ //   current EH pad into the `ExecutedLSDA` set.
+ // - If none of its parent EH pads has `wasm.lsda()`,
+ // - If the current EH pad is a `catch (...)` or a cleanuppad, done.
+ // - If the current EH pad is neither a `catch (...)` nor a cleanuppad,
+ // add `wasm.lsda()` and the store in the current EH pad, and add the
+ //     current EH pad to the `ExecutedLSDA` set.
+ //
+ // TODO Can we not store LSDA address in user function but make libcxxabi
+ // compute it?
+ DenseSet<Value *> ExecutedLSDA;
+ unsigned Index = 0;
+ for (auto DomNode : breadth_first(&DT)) {
+ auto *BB = DomNode->getBlock();
+ auto *Pad = BB->getFirstNonPHI();
+ if (!Pad || (!isa<CatchPadInst>(Pad) && !isa<CleanupPadInst>(Pad)))
continue;
- auto *Pad = BB.getFirstNonPHI();
- if (isa<CatchPadInst>(Pad))
- CatchPads.push_back(&BB);
- else if (isa<CleanupPadInst>(Pad))
- CleanupPads.push_back(&BB);
+ Changed = true;
+
+ Value *ParentPad = nullptr;
+ if (CatchPadInst *CPI = dyn_cast<CatchPadInst>(Pad)) {
+ ParentPad = CPI->getCatchSwitch()->getParentPad();
+ if (ExecutedLSDA.count(ParentPad)) {
+ ExecutedLSDA.insert(CPI);
+ // We insert its associated catchswitch too, because
+ // FuncletPadInst::getParentPad() returns a CatchSwitchInst if the child
+ // FuncletPadInst is a CleanupPadInst.
+ ExecutedLSDA.insert(CPI->getCatchSwitch());
+ }
+ } else { // CleanupPadInst
+ ParentPad = cast<CleanupPadInst>(Pad)->getParentPad();
+ if (ExecutedLSDA.count(ParentPad))
+ ExecutedLSDA.insert(Pad);
+ }
+
+ if (CatchPadInst *CPI = dyn_cast<CatchPadInst>(Pad)) {
+ if (CPI->getNumArgOperands() == 1 &&
+ cast<Constant>(CPI->getArgOperand(0))->isNullValue())
+ // In case of a single catch (...), we need neither personality call nor
+ // wasm.lsda() call
+ prepareEHPad(BB, false);
+ else {
+ if (ExecutedLSDA.count(CPI))
+ // catch (type), but one of its parents already has a wasm.lsda() call
+ prepareEHPad(BB, true, false, Index++);
+ else {
+ // catch (type), and none of its parents has a wasm.lsda() call. We have
+ // to add the call in this EH pad and record this EH pad in
+ // ExecutedLSDA.
+ ExecutedLSDA.insert(CPI);
+ ExecutedLSDA.insert(CPI->getCatchSwitch());
+ prepareEHPad(BB, true, true, Index++);
+ }
+ }
+ } else if (isa<CleanupPadInst>(Pad)) {
+ // Cleanup pads need neither personality call nor wasm.lsda() call
+ prepareEHPad(BB, false);
+ }
}
- if (CatchPads.empty() && CleanupPads.empty())
- return false;
+ return Changed;
+}
+
+void WasmEHPrepare::setupEHPadFunctions(Function &F) {
+ Module &M = *F.getParent();
+ IRBuilder<> IRB(F.getContext());
assert(F.hasPersonalityFn() && "Personality function not found");
// __wasm_lpad_context global variable
@@ -252,29 +340,16 @@ bool WasmEHPrepare::prepareEHPads(Function &F) {
"_Unwind_CallPersonality", IRB.getInt32Ty(), IRB.getInt8PtrTy());
if (Function *F = dyn_cast<Function>(CallPersonalityF.getCallee()))
F->setDoesNotThrow();
-
- unsigned Index = 0;
- for (auto *BB : CatchPads) {
- auto *CPI = cast<CatchPadInst>(BB->getFirstNonPHI());
- // In case of a single catch (...), we don't need to emit LSDA
- if (CPI->getNumArgOperands() == 1 &&
- cast<Constant>(CPI->getArgOperand(0))->isNullValue())
- prepareEHPad(BB, false);
- else
- prepareEHPad(BB, true, Index++);
- }
-
- // Cleanup pads don't need LSDA.
- for (auto *BB : CleanupPads)
- prepareEHPad(BB, false);
-
- return true;
}
-// Prepare an EH pad for Wasm EH handling. If NeedLSDA is false, Index is
+// Prepare an EH pad for Wasm EH handling. If NeedPersonality is false, Index is
// ignored.
-void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedLSDA,
- unsigned Index) {
+void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedPersonality,
+ bool NeedLSDA, unsigned Index) {
+ if (!IsEHPadFunctionsSetUp) {
+ IsEHPadFunctionsSetUp = true;
+ setupEHPadFunctions(*BB->getParent());
+ }
assert(BB->isEHPad() && "BB is not an EHPad!");
IRBuilder<> IRB(BB->getContext());
IRB.SetInsertPoint(&*BB->getFirstInsertionPt());
@@ -283,9 +358,9 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedLSDA,
Instruction *GetExnCI = nullptr, *GetSelectorCI = nullptr;
for (auto &U : FPI->uses()) {
if (auto *CI = dyn_cast<CallInst>(U.getUser())) {
- if (CI->getCalledValue() == GetExnF)
+ if (CI->getCalledOperand() == GetExnF)
GetExnCI = CI;
- if (CI->getCalledValue() == GetSelectorF)
+ if (CI->getCalledOperand() == GetSelectorF)
GetSelectorCI = CI;
}
}
@@ -304,7 +379,7 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedLSDA,
// In case it is a catchpad with single catch (...) or a cleanuppad, we don't
// need to call personality function because we don't need a selector.
- if (!NeedLSDA) {
+ if (!NeedPersonality) {
if (GetSelectorCI) {
assert(GetSelectorCI->use_empty() &&
"wasm.get.ehselector() still has uses!");
@@ -322,14 +397,8 @@ void WasmEHPrepare::prepareEHPad(BasicBlock *BB, bool NeedLSDA,
// Pseudocode: __wasm_lpad_context.lpad_index = index;
IRB.CreateStore(IRB.getInt32(Index), LPadIndexField);
- // Store LSDA address only if this catchpad belongs to a top-level
- // catchswitch. If there is another catchpad that dominates this pad, we don't
- // need to store LSDA address again, because they are the same throughout the
- // function and have been already stored before.
- // TODO Can we not store LSDA address in user function but make libcxxabi
- // compute it?
auto *CPI = cast<CatchPadInst>(FPI);
- if (isa<ConstantTokenNone>(CPI->getCatchSwitch()->getParentPad()))
+ if (NeedLSDA)
// Pseudocode: __wasm_lpad_context.lsda = wasm.lsda();
IRB.CreateStore(IRB.CreateCall(LSDAF), LSDAField);
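The prepareEHPads rewrite above depends on visiting EH pads in breadth-first order over the dominator tree, so a pad's parents are always processed before the pad itself and the ExecutedLSDA lookup only ever has to consult the immediate parent. A minimal sketch of that traversal pattern using the breadth_first adapter the patch includes; visitInDomTreeBFS is an illustrative name and its body is not the pass's logic:

    #include "llvm/ADT/BreadthFirstIterator.h"
    #include "llvm/IR/Dominators.h"
    #include "llvm/IR/Function.h"

    using namespace llvm;

    // Visit blocks so that a dominator-tree parent is always seen before its
    // children; a fact computed for the parent can then be reused by each child.
    static void visitInDomTreeBFS(Function &F) {
      DominatorTree DT(F);
      for (auto *DomNode : breadth_first(&DT)) {
        BasicBlock *BB = DomNode->getBlock();
        (void)BB; // per-block work would go here
      }
    }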
diff --git a/llvm/lib/CodeGen/WinEHPrepare.cpp b/llvm/lib/CodeGen/WinEHPrepare.cpp
index 87958a738c67..5a25234ba850 100644
--- a/llvm/lib/CodeGen/WinEHPrepare.cpp
+++ b/llvm/lib/CodeGen/WinEHPrepare.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -234,6 +235,9 @@ static const BasicBlock *getEHPadFromPredecessor(const BasicBlock *BB,
return CleanupPad->getParent();
}
+// Starting from an EH pad, walk backward through the control-flow graph
+// to produce two primary outputs:
+// FuncInfo.EHPadStateMap[] and FuncInfo.CxxUnwindMap[]
static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
const Instruction *FirstNonPHI,
int ParentState) {
@@ -260,6 +264,16 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
// catchpads are separate funclets in C++ EH due to the way rethrow works.
int TryHigh = CatchLow - 1;
+
+ // MSVC FrameHandler3/4 on x64 & ARM64 expect catch handlers in $tryMap$ to
+ // be stored in pre-order (outer first, inner next), not post-order.
+ // Add the entry to the map here; fix up CatchHigh after children are processed.
+ const Module *Mod = BB->getParent()->getParent();
+ bool IsPreOrder = Triple(Mod->getTargetTriple()).isArch64Bit();
+ if (IsPreOrder)
+ addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchLow, Handlers);
+ unsigned TBMEIdx = FuncInfo.TryBlockMap.size() - 1;
+
for (const auto *CatchPad : Handlers) {
FuncInfo.FuncletBaseStateMap[CatchPad] = CatchLow;
for (const User *U : CatchPad->users()) {
@@ -280,7 +294,12 @@ static void calculateCXXStateNumbers(WinEHFuncInfo &FuncInfo,
}
}
int CatchHigh = FuncInfo.getLastStateNumber();
- addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers);
+ // Now that the child catches are processed, update CatchHigh
+ if (IsPreOrder)
+ FuncInfo.TryBlockMap[TBMEIdx].CatchHigh = CatchHigh;
+ else // PostOrder
+ addTryBlockMapEntry(FuncInfo, TryLow, TryHigh, CatchHigh, Handlers);
+
LLVM_DEBUG(dbgs() << "TryLow[" << BB->getName() << "]: " << TryLow << '\n');
LLVM_DEBUG(dbgs() << "TryHigh[" << BB->getName() << "]: " << TryHigh
<< '\n');
@@ -336,6 +355,9 @@ static int addSEHFinally(WinEHFuncInfo &FuncInfo, int ParentState,
return FuncInfo.SEHUnwindMap.size() - 1;
}
+// Starting from an EH pad, walk backward through the control-flow graph
+// to produce two primary outputs:
+// FuncInfo.EHPadStateMap[] and FuncInfo.SEHUnwindMap[]
static void calculateSEHStateNumbers(WinEHFuncInfo &FuncInfo,
const Instruction *FirstNonPHI,
int ParentState) {
@@ -942,12 +964,12 @@ void WinEHPrepare::removeImplausibleInstructions(Function &F) {
for (BasicBlock *BB : BlocksInFunclet) {
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!CS)
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
continue;
Value *FuncletBundleOperand = nullptr;
- if (auto BU = CS.getOperandBundle(LLVMContext::OB_funclet))
+ if (auto BU = CB->getOperandBundle(LLVMContext::OB_funclet))
FuncletBundleOperand = BU->Inputs.front();
if (FuncletBundleOperand == FuncletPad)
@@ -955,13 +977,13 @@ void WinEHPrepare::removeImplausibleInstructions(Function &F) {
// Skip call sites which are nounwind intrinsics or inline asm.
auto *CalledFn =
- dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
- if (CalledFn && ((CalledFn->isIntrinsic() && CS.doesNotThrow()) ||
- CS.isInlineAsm()))
+ dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && ((CalledFn->isIntrinsic() && CB->doesNotThrow()) ||
+ CB->isInlineAsm()))
continue;
// This call site was not part of this funclet, remove it.
- if (CS.isInvoke()) {
+ if (isa<InvokeInst>(CB)) {
// Remove the unwind edge if it was an invoke.
removeUnwindEdge(BB);
// Get a pointer to the new call.
@@ -1050,10 +1072,10 @@ bool WinEHPrepare::prepareExplicitEH(Function &F) {
DemoteCatchSwitchPHIOnlyOpt);
if (!DisableCleanups) {
- LLVM_DEBUG(verifyFunction(F));
+ assert(!verifyFunction(F, &dbgs()));
removeImplausibleInstructions(F);
- LLVM_DEBUG(verifyFunction(F));
+ assert(!verifyFunction(F, &dbgs()));
cleanupPreparedFunclets(F);
}
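The WinEHPrepare hunks switch 64-bit targets to pre-order $tryMap$ entries: the entry is appended before the child catch handlers are numbered, its index is remembered, and CatchHigh is patched once the last child state is known. A schematic sketch of that append-then-patch pattern; TryBlockMapEntry and addTryBlockPreOrder here are simplified stand-ins, not the real WinEHFuncInfo types:

    #include <cstddef>
    #include <functional>
    #include <vector>

    // Simplified stand-in for a try-block map entry.
    struct TryBlockMapEntry { int TryLow, TryHigh, CatchHigh; };

    // Append the entry in pre-order, recurse into the children, then patch
    // CatchHigh once the children's state numbers are known.
    int addTryBlockPreOrder(std::vector<TryBlockMapEntry> &Map, int TryLow,
                            int TryHigh,
                            const std::function<int()> &NumberChildren) {
      Map.push_back({TryLow, TryHigh, /*CatchHigh=*/-1});
      const std::size_t Idx = Map.size() - 1; // remember the slot before recursing
      int CatchHigh = NumberChildren();       // assigns states to child handlers
      Map[Idx].CatchHigh = CatchHigh;         // fix up now that children are done
      return CatchHigh;
    }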
diff --git a/llvm/lib/CodeGen/XRayInstrumentation.cpp b/llvm/lib/CodeGen/XRayInstrumentation.cpp
index 4847a0c3e842..ab9c0e81ebdc 100644
--- a/llvm/lib/CodeGen/XRayInstrumentation.cpp
+++ b/llvm/lib/CodeGen/XRayInstrumentation.cpp
@@ -111,7 +111,7 @@ void XRayInstrumentation::replaceRetWithPatchableRet(
for (auto &MO : T.operands())
MIB.add(MO);
Terminators.push_back(&T);
- if (T.isCall())
+ if (T.shouldUpdateCallSiteInfo())
MF.eraseCallSiteInfo(&T);
}
}
@@ -148,40 +148,51 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
bool AlwaysInstrument = !InstrAttr.hasAttribute(Attribute::None) &&
InstrAttr.isStringAttribute() &&
InstrAttr.getValueAsString() == "xray-always";
- Attribute Attr = F.getFnAttribute("xray-instruction-threshold");
- unsigned XRayThreshold = 0;
+ auto ThresholdAttr = F.getFnAttribute("xray-instruction-threshold");
+ auto IgnoreLoopsAttr = F.getFnAttribute("xray-ignore-loops");
+ unsigned int XRayThreshold = 0;
if (!AlwaysInstrument) {
- if (Attr.hasAttribute(Attribute::None) || !Attr.isStringAttribute())
+ if (ThresholdAttr.hasAttribute(Attribute::None) ||
+ !ThresholdAttr.isStringAttribute())
return false; // XRay threshold attribute not found.
- if (Attr.getValueAsString().getAsInteger(10, XRayThreshold))
+ if (ThresholdAttr.getValueAsString().getAsInteger(10, XRayThreshold))
return false; // Invalid value for threshold.
+ bool IgnoreLoops = !IgnoreLoopsAttr.hasAttribute(Attribute::None);
+
// Count the number of MachineInstr`s in MachineFunction
int64_t MICount = 0;
for (const auto &MBB : MF)
MICount += MBB.size();
- // Get MachineDominatorTree or compute it on the fly if it's unavailable
- auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
- MachineDominatorTree ComputedMDT;
- if (!MDT) {
- ComputedMDT.getBase().recalculate(MF);
- MDT = &ComputedMDT;
- }
+ bool TooFewInstrs = MICount < XRayThreshold;
- // Get MachineLoopInfo or compute it on the fly if it's unavailable
- auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
- MachineLoopInfo ComputedMLI;
- if (!MLI) {
- ComputedMLI.getBase().analyze(MDT->getBase());
- MLI = &ComputedMLI;
- }
+ if (!IgnoreLoops) {
+ // Get MachineDominatorTree or compute it on the fly if it's unavailable
+ auto *MDT = getAnalysisIfAvailable<MachineDominatorTree>();
+ MachineDominatorTree ComputedMDT;
+ if (!MDT) {
+ ComputedMDT.getBase().recalculate(MF);
+ MDT = &ComputedMDT;
+ }
- // Check if we have a loop.
- // FIXME: Maybe make this smarter, and see whether the loops are dependent
- // on inputs or side-effects?
- if (MLI->empty() && MICount < XRayThreshold)
- return false; // Function is too small and has no loops.
+ // Get MachineLoopInfo or compute it on the fly if it's unavailable
+ auto *MLI = getAnalysisIfAvailable<MachineLoopInfo>();
+ MachineLoopInfo ComputedMLI;
+ if (!MLI) {
+ ComputedMLI.getBase().analyze(MDT->getBase());
+ MLI = &ComputedMLI;
+ }
+
+ // Check if we have a loop.
+ // FIXME: Maybe make this smarter, and see whether the loops are dependent
+ // on inputs or side-effects?
+ if (MLI->empty() && TooFewInstrs)
+ return false; // Function is too small and has no loops.
+ } else if (TooFewInstrs) {
+ // Function is too small
+ return false;
+ }
}
// We look for the first non-empty MachineBasicBlock, so that we can insert
@@ -201,43 +212,47 @@ bool XRayInstrumentation::runOnMachineFunction(MachineFunction &MF) {
return false;
}
- // First, insert an PATCHABLE_FUNCTION_ENTER as the first instruction of the
- // MachineFunction.
- BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
- TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
-
- switch (MF.getTarget().getTargetTriple().getArch()) {
- case Triple::ArchType::arm:
- case Triple::ArchType::thumb:
- case Triple::ArchType::aarch64:
- case Triple::ArchType::mips:
- case Triple::ArchType::mipsel:
- case Triple::ArchType::mips64:
- case Triple::ArchType::mips64el: {
- // For the architectures which don't have a single return instruction
- InstrumentationOptions op;
- op.HandleTailcall = false;
- op.HandleAllReturns = true;
- prependRetWithPatchableExit(MF, TII, op);
- break;
- }
- case Triple::ArchType::ppc64le: {
- // PPC has conditional returns. Turn them into branch and plain returns.
- InstrumentationOptions op;
- op.HandleTailcall = false;
- op.HandleAllReturns = true;
- replaceRetWithPatchableRet(MF, TII, op);
- break;
- }
- default: {
- // For the architectures that have a single return instruction (such as
- // RETQ on x86_64).
- InstrumentationOptions op;
- op.HandleTailcall = true;
- op.HandleAllReturns = false;
- replaceRetWithPatchableRet(MF, TII, op);
- break;
+ if (!F.hasFnAttribute("xray-skip-entry")) {
+ // First, insert a PATCHABLE_FUNCTION_ENTER as the first instruction of the
+ // MachineFunction.
+ BuildMI(FirstMBB, FirstMI, FirstMI.getDebugLoc(),
+ TII->get(TargetOpcode::PATCHABLE_FUNCTION_ENTER));
}
+
+ if (!F.hasFnAttribute("xray-skip-exit")) {
+ switch (MF.getTarget().getTargetTriple().getArch()) {
+ case Triple::ArchType::arm:
+ case Triple::ArchType::thumb:
+ case Triple::ArchType::aarch64:
+ case Triple::ArchType::mips:
+ case Triple::ArchType::mipsel:
+ case Triple::ArchType::mips64:
+ case Triple::ArchType::mips64el: {
+ // For the architectures which don't have a single return instruction
+ InstrumentationOptions op;
+ op.HandleTailcall = false;
+ op.HandleAllReturns = true;
+ prependRetWithPatchableExit(MF, TII, op);
+ break;
+ }
+ case Triple::ArchType::ppc64le: {
+ // PPC has conditional returns. Turn them into branch and plain returns.
+ InstrumentationOptions op;
+ op.HandleTailcall = false;
+ op.HandleAllReturns = true;
+ replaceRetWithPatchableRet(MF, TII, op);
+ break;
+ }
+ default: {
+ // For the architectures that have a single return instruction (such as
+ // RETQ on x86_64).
+ InstrumentationOptions op;
+ op.HandleTailcall = true;
+ op.HandleAllReturns = false;
+ replaceRetWithPatchableRet(MF, TII, op);
+ break;
+ }
+ }
}
return true;
}
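The XRayInstrumentation changes above are driven by three string function attributes: "xray-ignore-loops" suppresses the dominator/loop analysis, while "xray-skip-entry" and "xray-skip-exit" drop the entry and exit sleds respectively. A short sketch, assuming LLVM's Function::addFnAttr API, of how a frontend might set them (disableXRayEntryAndLoopCheck is an illustrative name):

    #include "llvm/IR/Function.h"

    // Mark a function so the XRay pass skips the entry sled and does not build
    // dominator/loop info just to decide whether to instrument it.
    void disableXRayEntryAndLoopCheck(llvm::Function &F) {
      F.addFnAttr("xray-skip-entry");
      F.addFnAttr("xray-ignore-loops");
      F.addFnAttr("xray-instruction-threshold", "200");
    }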
diff --git a/llvm/lib/DWARFLinker/DWARFLinker.cpp b/llvm/lib/DWARFLinker/DWARFLinker.cpp
index 65b2a1bdf1fc..12b19e77a422 100644
--- a/llvm/lib/DWARFLinker/DWARFLinker.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinker.cpp
@@ -7,11 +7,2586 @@
//===----------------------------------------------------------------------===//
#include "llvm/DWARFLinker/DWARFLinker.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/NonRelocatableStringpool.h"
+#include "llvm/DWARFLinker/DWARFLinkerDeclContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugLine.h"
+#include "llvm/DebugInfo/DWARF/DWARFDebugRangeList.h"
+#include "llvm/DebugInfo/DWARF/DWARFDie.h"
+#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
+#include "llvm/DebugInfo/DWARF/DWARFSection.h"
+#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FormatVariadic.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/Path.h"
+#include "llvm/Support/ThreadPool.h"
+#include <vector>
namespace llvm {
+/// Hold the input and output of the debug info size in bytes.
+struct DebugInfoSize {
+ uint64_t Input;
+ uint64_t Output;
+};
+
+/// Compute the total size of the debug info.
+static uint64_t getDebugInfoSize(DWARFContext &Dwarf) {
+ uint64_t Size = 0;
+ for (auto &Unit : Dwarf.compile_units()) {
+ Size += Unit->getLength();
+ }
+ return Size;
+}
+
+/// Similar to DWARFUnitSection::getUnitForOffset(), but returning our
+/// CompileUnit object instead.
+static CompileUnit *getUnitForOffset(const UnitListTy &Units, uint64_t Offset) {
+ auto CU = std::upper_bound(
+ Units.begin(), Units.end(), Offset,
+ [](uint64_t LHS, const std::unique_ptr<CompileUnit> &RHS) {
+ return LHS < RHS->getOrigUnit().getNextUnitOffset();
+ });
+ return CU != Units.end() ? CU->get() : nullptr;
+}
+
+/// Resolve the DIE attribute reference that has been extracted in \p RefValue.
+/// The resulting DIE might be in another CompileUnit which is stored into \p
+/// RefCU. \returns null if resolving fails for any reason.
+DWARFDie DWARFLinker::resolveDIEReference(const DwarfFile &File,
+ const UnitListTy &Units,
+ const DWARFFormValue &RefValue,
+ const DWARFDie &DIE,
+ CompileUnit *&RefCU) {
+ assert(RefValue.isFormClass(DWARFFormValue::FC_Reference));
+ uint64_t RefOffset = *RefValue.getAsReference();
+ if ((RefCU = getUnitForOffset(Units, RefOffset)))
+ if (const auto RefDie = RefCU->getOrigUnit().getDIEForOffset(RefOffset)) {
+ // In a file with broken references, an attribute might point to a NULL
+ // DIE.
+ if (!RefDie.isNULL())
+ return RefDie;
+ }
+
+ reportWarning("could not find referenced DIE", File, &DIE);
+ return DWARFDie();
+}
+
+/// \returns whether the passed \a Attr type might contain a DIE reference
+/// suitable for ODR uniquing.
+static bool isODRAttribute(uint16_t Attr) {
+ switch (Attr) {
+ default:
+ return false;
+ case dwarf::DW_AT_type:
+ case dwarf::DW_AT_containing_type:
+ case dwarf::DW_AT_specification:
+ case dwarf::DW_AT_abstract_origin:
+ case dwarf::DW_AT_import:
+ return true;
+ }
+ llvm_unreachable("Improper attribute.");
+}
+
+static bool isTypeTag(uint16_t Tag) {
+ switch (Tag) {
+ case dwarf::DW_TAG_array_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_enumeration_type:
+ case dwarf::DW_TAG_pointer_type:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_string_type:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_subroutine_type:
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_ptr_to_member_type:
+ case dwarf::DW_TAG_set_type:
+ case dwarf::DW_TAG_subrange_type:
+ case dwarf::DW_TAG_base_type:
+ case dwarf::DW_TAG_const_type:
+ case dwarf::DW_TAG_constant:
+ case dwarf::DW_TAG_file_type:
+ case dwarf::DW_TAG_namelist:
+ case dwarf::DW_TAG_packed_type:
+ case dwarf::DW_TAG_volatile_type:
+ case dwarf::DW_TAG_restrict_type:
+ case dwarf::DW_TAG_atomic_type:
+ case dwarf::DW_TAG_interface_type:
+ case dwarf::DW_TAG_unspecified_type:
+ case dwarf::DW_TAG_shared_type:
+ return true;
+ default:
+ break;
+ }
+ return false;
+}
+
AddressesMap::~AddressesMap() {}
DwarfEmitter::~DwarfEmitter() {}
+static Optional<StringRef> StripTemplateParameters(StringRef Name) {
+ // We are looking for template parameters to strip from Name. e.g.
+ //
+ // operator<<B>
+ //
+ // We look for > at the end but if it does not contain any < then we
+ // have something like operator>>. We check for the operator<=> case.
+ if (!Name.endswith(">") || Name.count("<") == 0 || Name.endswith("<=>"))
+ return {};
+
+ // How many < until we have the start of the template parameters.
+ size_t NumLeftAnglesToSkip = 1;
+
+ // If we have operator<=> then we need to skip its < as well.
+ NumLeftAnglesToSkip += Name.count("<=>");
+
+ size_t RightAngleCount = Name.count('>');
+ size_t LeftAngleCount = Name.count('<');
+
+ // If we have more < than >, we have operator< or operator<<, and we need
+ // to account for their < as well.
+ if (LeftAngleCount > RightAngleCount)
+ NumLeftAnglesToSkip += LeftAngleCount - RightAngleCount;
+
+ size_t StartOfTemplate = 0;
+ while (NumLeftAnglesToSkip--)
+ StartOfTemplate = Name.find('<', StartOfTemplate) + 1;
+
+ return Name.substr(0, StartOfTemplate - 1);
+}
+
+bool DWARFLinker::DIECloner::getDIENames(const DWARFDie &Die,
+ AttributesInfo &Info,
+ OffsetsStringPool &StringPool,
+ bool StripTemplate) {
+ // This function will be called on DIEs having low_pcs and
+ // ranges. As getting the name might be more expensive, filter out
+ // blocks directly.
+ if (Die.getTag() == dwarf::DW_TAG_lexical_block)
+ return false;
+
+ if (!Info.MangledName)
+ if (const char *MangledName = Die.getLinkageName())
+ Info.MangledName = StringPool.getEntry(MangledName);
+
+ if (!Info.Name)
+ if (const char *Name = Die.getShortName())
+ Info.Name = StringPool.getEntry(Name);
+
+ if (!Info.MangledName)
+ Info.MangledName = Info.Name;
+
+ if (StripTemplate && Info.Name && Info.MangledName != Info.Name) {
+ StringRef Name = Info.Name.getString();
+ if (Optional<StringRef> StrippedName = StripTemplateParameters(Name))
+ Info.NameWithoutTemplate = StringPool.getEntry(*StrippedName);
+ }
+
+ return Info.Name || Info.MangledName;
+}
+
+/// Resolve the relative path to a build artifact referenced by DWARF by
+/// applying DW_AT_comp_dir.
+static void resolveRelativeObjectPath(SmallVectorImpl<char> &Buf, DWARFDie CU) {
+ sys::path::append(Buf, dwarf::toString(CU.find(dwarf::DW_AT_comp_dir), ""));
+}
+
+/// Collect references to parseable Swift interfaces in imported
+/// DW_TAG_module blocks.
+static void analyzeImportedModule(
+ const DWARFDie &DIE, CompileUnit &CU,
+ swiftInterfacesMap *ParseableSwiftInterfaces,
+ std::function<void(const Twine &, const DWARFDie &)> ReportWarning) {
+ if (CU.getLanguage() != dwarf::DW_LANG_Swift)
+ return;
+
+ if (!ParseableSwiftInterfaces)
+ return;
+
+ StringRef Path = dwarf::toStringRef(DIE.find(dwarf::DW_AT_LLVM_include_path));
+ if (!Path.endswith(".swiftinterface"))
+ return;
+ // Don't track interfaces that are part of the SDK.
+ StringRef SysRoot = dwarf::toStringRef(DIE.find(dwarf::DW_AT_LLVM_sysroot));
+ if (SysRoot.empty())
+ SysRoot = CU.getSysRoot();
+ if (!SysRoot.empty() && Path.startswith(SysRoot))
+ return;
+ if (Optional<DWARFFormValue> Val = DIE.find(dwarf::DW_AT_name))
+ if (Optional<const char *> Name = Val->getAsCString()) {
+ auto &Entry = (*ParseableSwiftInterfaces)[*Name];
+ // The prepend path is applied later when copying.
+ DWARFDie CUDie = CU.getOrigUnit().getUnitDIE();
+ SmallString<128> ResolvedPath;
+ if (sys::path::is_relative(Path))
+ resolveRelativeObjectPath(ResolvedPath, CUDie);
+ sys::path::append(ResolvedPath, Path);
+ if (!Entry.empty() && Entry != ResolvedPath)
+ ReportWarning(
+ Twine("Conflicting parseable interfaces for Swift Module ") +
+ *Name + ": " + Entry + " and " + Path,
+ DIE);
+ Entry = std::string(ResolvedPath.str());
+ }
+}
+
+/// Recursive helper to build the global DeclContext information and
+/// gather the child->parent relationships in the original compile unit.
+///
+/// \return true when this DIE and all of its children are only
+/// forward declarations to types defined in external clang modules
+/// (i.e., forward declarations that are children of a DW_TAG_module).
+static bool analyzeContextInfo(
+ const DWARFDie &DIE, unsigned ParentIdx, CompileUnit &CU,
+ DeclContext *CurrentDeclContext, UniquingStringPool &StringPool,
+ DeclContextTree &Contexts, uint64_t ModulesEndOffset,
+ swiftInterfacesMap *ParseableSwiftInterfaces,
+ std::function<void(const Twine &, const DWARFDie &)> ReportWarning,
+ bool InImportedModule = false) {
+ unsigned MyIdx = CU.getOrigUnit().getDIEIndex(DIE);
+ CompileUnit::DIEInfo &Info = CU.getInfo(MyIdx);
+
+ // Clang imposes an ODR on modules(!) regardless of the language:
+ // "The module-id should consist of only a single identifier,
+ // which provides the name of the module being defined. Each
+ // module shall have a single definition."
+ //
+ // This does not extend to the types inside the modules:
+ // "[I]n C, this implies that if two structs are defined in
+ // different submodules with the same name, those two types are
+ // distinct types (but may be compatible types if their
+ // definitions match)."
+ //
+ // We treat non-C++ modules like namespaces for this reason.
+ if (DIE.getTag() == dwarf::DW_TAG_module && ParentIdx == 0 &&
+ dwarf::toString(DIE.find(dwarf::DW_AT_name), "") !=
+ CU.getClangModuleName()) {
+ InImportedModule = true;
+ analyzeImportedModule(DIE, CU, ParseableSwiftInterfaces, ReportWarning);
+ }
+
+ Info.ParentIdx = ParentIdx;
+ bool InClangModule = CU.isClangModule() || InImportedModule;
+ if (CU.hasODR() || InClangModule) {
+ if (CurrentDeclContext) {
+ auto PtrInvalidPair = Contexts.getChildDeclContext(
+ *CurrentDeclContext, DIE, CU, StringPool, InClangModule);
+ CurrentDeclContext = PtrInvalidPair.getPointer();
+ Info.Ctxt =
+ PtrInvalidPair.getInt() ? nullptr : PtrInvalidPair.getPointer();
+ if (Info.Ctxt)
+ Info.Ctxt->setDefinedInClangModule(InClangModule);
+ } else
+ Info.Ctxt = CurrentDeclContext = nullptr;
+ }
+
+ Info.Prune = InImportedModule;
+ if (DIE.hasChildren())
+ for (auto Child : DIE.children())
+ Info.Prune &= analyzeContextInfo(Child, MyIdx, CU, CurrentDeclContext,
+ StringPool, Contexts, ModulesEndOffset,
+ ParseableSwiftInterfaces, ReportWarning,
+ InImportedModule);
+
+ // Prune this DIE if it is either a forward declaration inside a
+ // DW_TAG_module or a DW_TAG_module that contains nothing but
+ // forward declarations.
+ Info.Prune &= (DIE.getTag() == dwarf::DW_TAG_module) ||
+ (isTypeTag(DIE.getTag()) &&
+ dwarf::toUnsigned(DIE.find(dwarf::DW_AT_declaration), 0));
+
+ // Only prune forward declarations inside a DW_TAG_module for which a
+ // definition exists elsewhere.
+ if (ModulesEndOffset == 0)
+ Info.Prune &= Info.Ctxt && Info.Ctxt->getCanonicalDIEOffset();
+ else
+ Info.Prune &= Info.Ctxt && Info.Ctxt->getCanonicalDIEOffset() > 0 &&
+ Info.Ctxt->getCanonicalDIEOffset() <= ModulesEndOffset;
+
+ return Info.Prune;
+}
+
+static bool dieNeedsChildrenToBeMeaningful(uint32_t Tag) {
+ switch (Tag) {
+ default:
+ return false;
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_common_block:
+ case dwarf::DW_TAG_lexical_block:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_subprogram:
+ case dwarf::DW_TAG_subroutine_type:
+ case dwarf::DW_TAG_union_type:
+ return true;
+ }
+ llvm_unreachable("Invalid Tag");
+}
+
+void DWARFLinker::cleanupAuxiliarryData(LinkContext &Context) {
+ Context.clear();
+
+ for (auto I = DIEBlocks.begin(), E = DIEBlocks.end(); I != E; ++I)
+ (*I)->~DIEBlock();
+ for (auto I = DIELocs.begin(), E = DIELocs.end(); I != E; ++I)
+ (*I)->~DIELoc();
+
+ DIEBlocks.clear();
+ DIELocs.clear();
+ DIEAlloc.Reset();
+}
+
+/// Get the starting and ending (exclusive) offset for the
+/// attribute with index \p Idx described by \p Abbrev. \p Offset is
+/// supposed to point to the position of the first attribute described
+/// by \p Abbrev.
+/// \return [StartOffset, EndOffset) as a pair.
+static std::pair<uint64_t, uint64_t>
+getAttributeOffsets(const DWARFAbbreviationDeclaration *Abbrev, unsigned Idx,
+ uint64_t Offset, const DWARFUnit &Unit) {
+ DataExtractor Data = Unit.getDebugInfoExtractor();
+
+ for (unsigned I = 0; I < Idx; ++I)
+ DWARFFormValue::skipValue(Abbrev->getFormByIndex(I), Data, &Offset,
+ Unit.getFormParams());
+
+ uint64_t End = Offset;
+ DWARFFormValue::skipValue(Abbrev->getFormByIndex(Idx), Data, &End,
+ Unit.getFormParams());
+
+ return std::make_pair(Offset, End);
+}
+
+/// Check if a variable describing DIE should be kept.
+/// \returns updated TraversalFlags.
+unsigned DWARFLinker::shouldKeepVariableDIE(AddressesMap &RelocMgr,
+ const DWARFDie &DIE,
+ CompileUnit &Unit,
+ CompileUnit::DIEInfo &MyInfo,
+ unsigned Flags) {
+ const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
+
+ // Global variables with constant value can always be kept.
+ if (!(Flags & TF_InFunctionScope) &&
+ Abbrev->findAttributeIndex(dwarf::DW_AT_const_value)) {
+ MyInfo.InDebugMap = true;
+ return Flags | TF_Keep;
+ }
+
+ Optional<uint32_t> LocationIdx =
+ Abbrev->findAttributeIndex(dwarf::DW_AT_location);
+ if (!LocationIdx)
+ return Flags;
+
+ uint64_t Offset = DIE.getOffset() + getULEB128Size(Abbrev->getCode());
+ const DWARFUnit &OrigUnit = Unit.getOrigUnit();
+ uint64_t LocationOffset, LocationEndOffset;
+ std::tie(LocationOffset, LocationEndOffset) =
+ getAttributeOffsets(Abbrev, *LocationIdx, Offset, OrigUnit);
+
+ // See if there is a relocation to a valid debug map entry inside
+ // this variable's location. The order is important here. We want to
+ // always check if the variable has a valid relocation, so that the
+ // DIEInfo is filled. However, we don't want a static variable in a
+ // function to force us to keep the enclosing function.
+ if (!RelocMgr.hasValidRelocationAt(LocationOffset, LocationEndOffset,
+ MyInfo) ||
+ (Flags & TF_InFunctionScope))
+ return Flags;
+
+ if (Options.Verbose) {
+ outs() << "Keeping variable DIE:";
+ DIDumpOptions DumpOpts;
+ DumpOpts.ChildRecurseDepth = 0;
+ DumpOpts.Verbose = Options.Verbose;
+ DIE.dump(outs(), 8 /* Indent */, DumpOpts);
+ }
+
+ return Flags | TF_Keep;
+}
+
+/// Check if a function describing DIE should be kept.
+/// \returns updated TraversalFlags.
+unsigned DWARFLinker::shouldKeepSubprogramDIE(
+ AddressesMap &RelocMgr, RangesTy &Ranges, const DWARFDie &DIE,
+ const DwarfFile &File, CompileUnit &Unit, CompileUnit::DIEInfo &MyInfo,
+ unsigned Flags) {
+ const auto *Abbrev = DIE.getAbbreviationDeclarationPtr();
+
+ Flags |= TF_InFunctionScope;
+
+ Optional<uint32_t> LowPcIdx = Abbrev->findAttributeIndex(dwarf::DW_AT_low_pc);
+ if (!LowPcIdx)
+ return Flags;
+
+ uint64_t Offset = DIE.getOffset() + getULEB128Size(Abbrev->getCode());
+ DWARFUnit &OrigUnit = Unit.getOrigUnit();
+ uint64_t LowPcOffset, LowPcEndOffset;
+ std::tie(LowPcOffset, LowPcEndOffset) =
+ getAttributeOffsets(Abbrev, *LowPcIdx, Offset, OrigUnit);
+
+ auto LowPc = dwarf::toAddress(DIE.find(dwarf::DW_AT_low_pc));
+ assert(LowPc.hasValue() && "low_pc attribute is not an address.");
+ if (!LowPc ||
+ !RelocMgr.hasValidRelocationAt(LowPcOffset, LowPcEndOffset, MyInfo))
+ return Flags;
+
+ if (Options.Verbose) {
+ outs() << "Keeping subprogram DIE:";
+ DIDumpOptions DumpOpts;
+ DumpOpts.ChildRecurseDepth = 0;
+ DumpOpts.Verbose = Options.Verbose;
+ DIE.dump(outs(), 8 /* Indent */, DumpOpts);
+ }
+
+ if (DIE.getTag() == dwarf::DW_TAG_label) {
+ if (Unit.hasLabelAt(*LowPc))
+ return Flags;
+ // FIXME: dsymutil-classic compat. dsymutil-classic doesn't consider labels
+ // that don't fall into the CU's aranges. This is wrong IMO. Debug info
+ // generation bugs aside, this is really wrong in the case of labels, where
+ // a label marking the end of a function will have a PC == CU's high_pc.
+ if (dwarf::toAddress(OrigUnit.getUnitDIE().find(dwarf::DW_AT_high_pc))
+ .getValueOr(UINT64_MAX) <= LowPc)
+ return Flags;
+ Unit.addLabelLowPc(*LowPc, MyInfo.AddrAdjust);
+ return Flags | TF_Keep;
+ }
+
+ Flags |= TF_Keep;
+
+ Optional<uint64_t> HighPc = DIE.getHighPC(*LowPc);
+ if (!HighPc) {
+ reportWarning("Function without high_pc. Range will be discarded.\n", File,
+ &DIE);
+ return Flags;
+ }
+
+ // Replace the debug map range with a more accurate one.
+ Ranges[*LowPc] = ObjFileAddressRange(*HighPc, MyInfo.AddrAdjust);
+ Unit.addFunctionRange(*LowPc, *HighPc, MyInfo.AddrAdjust);
+ return Flags;
+}
+
+/// Check if a DIE should be kept.
+/// \returns updated TraversalFlags.
+unsigned DWARFLinker::shouldKeepDIE(AddressesMap &RelocMgr, RangesTy &Ranges,
+ const DWARFDie &DIE, const DwarfFile &File,
+ CompileUnit &Unit,
+ CompileUnit::DIEInfo &MyInfo,
+ unsigned Flags) {
+ switch (DIE.getTag()) {
+ case dwarf::DW_TAG_constant:
+ case dwarf::DW_TAG_variable:
+ return shouldKeepVariableDIE(RelocMgr, DIE, Unit, MyInfo, Flags);
+ case dwarf::DW_TAG_subprogram:
+ case dwarf::DW_TAG_label:
+ return shouldKeepSubprogramDIE(RelocMgr, Ranges, DIE, File, Unit, MyInfo,
+ Flags);
+ case dwarf::DW_TAG_base_type:
+ // DWARF Expressions may reference basic types, but scanning them
+ // is expensive. Basic types are tiny, so just keep all of them.
+ case dwarf::DW_TAG_imported_module:
+ case dwarf::DW_TAG_imported_declaration:
+ case dwarf::DW_TAG_imported_unit:
+ // We always want to keep these.
+ return Flags | TF_Keep;
+ default:
+ break;
+ }
+
+ return Flags;
+}
+
+/// Helper that updates the completeness of the current DIE based on the
+/// completeness of one of its children. It depends on the incompleteness of
+/// the children already being computed.
+static void updateChildIncompleteness(const DWARFDie &Die, CompileUnit &CU,
+ CompileUnit::DIEInfo &ChildInfo) {
+ switch (Die.getTag()) {
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_class_type:
+ break;
+ default:
+ return;
+ }
+
+ unsigned Idx = CU.getOrigUnit().getDIEIndex(Die);
+ CompileUnit::DIEInfo &MyInfo = CU.getInfo(Idx);
+
+ if (ChildInfo.Incomplete || ChildInfo.Prune)
+ MyInfo.Incomplete = true;
+}
+
+/// Helper that updates the completeness of the current DIE based on the
+/// completeness of the DIEs it references. It depends on the incompleteness of
+/// the referenced DIE already being computed.
+static void updateRefIncompleteness(const DWARFDie &Die, CompileUnit &CU,
+ CompileUnit::DIEInfo &RefInfo) {
+ switch (Die.getTag()) {
+ case dwarf::DW_TAG_typedef:
+ case dwarf::DW_TAG_member:
+ case dwarf::DW_TAG_reference_type:
+ case dwarf::DW_TAG_ptr_to_member_type:
+ case dwarf::DW_TAG_pointer_type:
+ break;
+ default:
+ return;
+ }
+
+ unsigned Idx = CU.getOrigUnit().getDIEIndex(Die);
+ CompileUnit::DIEInfo &MyInfo = CU.getInfo(Idx);
+
+ if (MyInfo.Incomplete)
+ return;
+
+ if (RefInfo.Incomplete)
+ MyInfo.Incomplete = true;
+}
+
+/// Look at the children of the given DIE and decide whether they should be
+/// kept.
+void DWARFLinker::lookForChildDIEsToKeep(
+ const DWARFDie &Die, CompileUnit &CU, unsigned Flags,
+ SmallVectorImpl<WorklistItem> &Worklist) {
+ // The TF_ParentWalk flag tells us that we are currently walking up the
+ // parent chain of a required DIE, and we don't want to mark all the children
+ // of the parents as kept (consider for example a DW_TAG_namespace node in
+ // the parent chain). There are however a set of DIE types for which we want
+ // to ignore that directive and still walk their children.
+ if (dieNeedsChildrenToBeMeaningful(Die.getTag()))
+ Flags &= ~DWARFLinker::TF_ParentWalk;
+
+ // We're finished if this DIE has no children or we're walking the parent
+ // chain.
+ if (!Die.hasChildren() || (Flags & DWARFLinker::TF_ParentWalk))
+ return;
+
+ // Add children in reverse order to the worklist to effectively process them
+ // in order.
+ for (auto Child : reverse(Die.children())) {
+ // Add a worklist item before every child to calculate incompleteness right
+ // after the current child is processed.
+ unsigned Idx = CU.getOrigUnit().getDIEIndex(Child);
+ CompileUnit::DIEInfo &ChildInfo = CU.getInfo(Idx);
+ Worklist.emplace_back(Die, CU, WorklistItemType::UpdateChildIncompleteness,
+ &ChildInfo);
+ Worklist.emplace_back(Child, CU, Flags);
+ }
+}
+
+/// Look at DIEs referenced by the given DIE and decide whether they should be
+ /// kept. All DIEs referenced through attributes should be kept.
+void DWARFLinker::lookForRefDIEsToKeep(
+ const DWARFDie &Die, CompileUnit &CU, unsigned Flags,
+ const UnitListTy &Units, const DwarfFile &File,
+ SmallVectorImpl<WorklistItem> &Worklist) {
+ bool UseOdr = (Flags & DWARFLinker::TF_DependencyWalk)
+ ? (Flags & DWARFLinker::TF_ODR)
+ : CU.hasODR();
+ DWARFUnit &Unit = CU.getOrigUnit();
+ DWARFDataExtractor Data = Unit.getDebugInfoExtractor();
+ const auto *Abbrev = Die.getAbbreviationDeclarationPtr();
+ uint64_t Offset = Die.getOffset() + getULEB128Size(Abbrev->getCode());
+
+ SmallVector<std::pair<DWARFDie, CompileUnit &>, 4> ReferencedDIEs;
+ for (const auto &AttrSpec : Abbrev->attributes()) {
+ DWARFFormValue Val(AttrSpec.Form);
+ if (!Val.isFormClass(DWARFFormValue::FC_Reference) ||
+ AttrSpec.Attr == dwarf::DW_AT_sibling) {
+ DWARFFormValue::skipValue(AttrSpec.Form, Data, &Offset,
+ Unit.getFormParams());
+ continue;
+ }
+
+ Val.extractValue(Data, &Offset, Unit.getFormParams(), &Unit);
+ CompileUnit *ReferencedCU;
+ if (auto RefDie =
+ resolveDIEReference(File, Units, Val, Die, ReferencedCU)) {
+ uint32_t RefIdx = ReferencedCU->getOrigUnit().getDIEIndex(RefDie);
+ CompileUnit::DIEInfo &Info = ReferencedCU->getInfo(RefIdx);
+ bool IsModuleRef = Info.Ctxt && Info.Ctxt->getCanonicalDIEOffset() &&
+ Info.Ctxt->isDefinedInClangModule();
+ // If the referenced DIE has a DeclContext that has already been
+ // emitted, then do not keep the one in this CU. We'll link to
+ // the canonical DIE in cloneDieReferenceAttribute.
+ //
+ // FIXME: compatibility with dsymutil-classic. UseODR shouldn't
+ // be necessary and could be advantageously replaced by
+ // ReferencedCU->hasODR() && CU.hasODR().
+ //
+ // FIXME: compatibility with dsymutil-classic. There is no
+ // reason not to unique ref_addr references.
+ if (AttrSpec.Form != dwarf::DW_FORM_ref_addr && (UseOdr || IsModuleRef) &&
+ Info.Ctxt &&
+ Info.Ctxt != ReferencedCU->getInfo(Info.ParentIdx).Ctxt &&
+ Info.Ctxt->getCanonicalDIEOffset() && isODRAttribute(AttrSpec.Attr))
+ continue;
+
+ // Keep a module forward declaration if there is no definition.
+ if (!(isODRAttribute(AttrSpec.Attr) && Info.Ctxt &&
+ Info.Ctxt->getCanonicalDIEOffset()))
+ Info.Prune = false;
+ ReferencedDIEs.emplace_back(RefDie, *ReferencedCU);
+ }
+ }
+
+ unsigned ODRFlag = UseOdr ? DWARFLinker::TF_ODR : 0;
+
+ // Add referenced DIEs in reverse order to the worklist to effectively
+ // process them in order.
+ for (auto &P : reverse(ReferencedDIEs)) {
+ // Add a worklist item before every referenced DIE to update incompleteness
+ // right after that DIE is processed.
+ uint32_t RefIdx = P.second.getOrigUnit().getDIEIndex(P.first);
+ CompileUnit::DIEInfo &Info = P.second.getInfo(RefIdx);
+ Worklist.emplace_back(Die, CU, WorklistItemType::UpdateRefIncompleteness,
+ &Info);
+ Worklist.emplace_back(P.first, P.second,
+ DWARFLinker::TF_Keep |
+ DWARFLinker::TF_DependencyWalk | ODRFlag);
+ }
+}
+
+ /// Look at the parents of the given DIE and decide whether they should be kept.
+void DWARFLinker::lookForParentDIEsToKeep(
+ unsigned AncestorIdx, CompileUnit &CU, unsigned Flags,
+ SmallVectorImpl<WorklistItem> &Worklist) {
+ // Stop if we encounter an ancestor that's already marked as kept.
+ if (CU.getInfo(AncestorIdx).Keep)
+ return;
+
+ DWARFUnit &Unit = CU.getOrigUnit();
+ DWARFDie ParentDIE = Unit.getDIEAtIndex(AncestorIdx);
+ Worklist.emplace_back(CU.getInfo(AncestorIdx).ParentIdx, CU, Flags);
+ Worklist.emplace_back(ParentDIE, CU, Flags);
+}
+
+/// Recursively walk the \p DIE tree and look for DIEs to keep. Store that
+/// information in \p CU's DIEInfo.
+///
+/// This function is the entry point of the DIE selection algorithm. It is
+ /// expected to walk the DIE tree in file order and (through the mediation of
+ /// its helpers) call hasValidRelocationAt() on each DIE that might be a 'root
+ /// DIE' (see the DWARFLinker class comment).
+///
+/// While walking the dependencies of root DIEs, this function is also called,
+/// but during these dependency walks the file order is not respected. The
+/// TF_DependencyWalk flag tells us which kind of traversal we are currently
+/// doing.
+///
+/// The recursive algorithm is implemented iteratively as a work list because
+/// very deep recursion could exhaust the stack for large projects. The work
+/// list acts as a scheduler for different types of work that need to be
+/// performed.
+///
+/// The recursive nature of the algorithm is simulated by running the "main"
+/// algorithm (LookForDIEsToKeep) followed by either looking at more DIEs
+/// (LookForChildDIEsToKeep, LookForRefDIEsToKeep, LookForParentDIEsToKeep) or
+/// fixing up a computed property (UpdateChildIncompleteness,
+/// UpdateRefIncompleteness).
+///
+ /// Whether a DIE is incomplete is recorded in \p CU's DIEInfo rather than returned.
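+ ///
+ /// For example, when a newly kept DIE is popped from the worklist, the loop
+ /// below pushes a LookForChildDIEsToKeep item, then a LookForRefDIEsToKeep
+ /// item, then a parent-walk item; because the worklist is LIFO, the ancestors
+ /// are visited first, then the referenced DIEs, and finally the children.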
+void DWARFLinker::lookForDIEsToKeep(AddressesMap &AddressesMap,
+ RangesTy &Ranges, const UnitListTy &Units,
+ const DWARFDie &Die, const DwarfFile &File,
+ CompileUnit &Cu, unsigned Flags) {
+ // LIFO work list.
+ SmallVector<WorklistItem, 4> Worklist;
+ Worklist.emplace_back(Die, Cu, Flags);
+
+ while (!Worklist.empty()) {
+ WorklistItem Current = Worklist.back();
+ Worklist.pop_back();
+
+ // Look at the worklist type to decide what kind of work to perform.
+ switch (Current.Type) {
+ case WorklistItemType::UpdateChildIncompleteness:
+ updateChildIncompleteness(Current.Die, Current.CU, *Current.OtherInfo);
+ continue;
+ case WorklistItemType::UpdateRefIncompleteness:
+ updateRefIncompleteness(Current.Die, Current.CU, *Current.OtherInfo);
+ continue;
+ case WorklistItemType::LookForChildDIEsToKeep:
+ lookForChildDIEsToKeep(Current.Die, Current.CU, Current.Flags, Worklist);
+ continue;
+ case WorklistItemType::LookForRefDIEsToKeep:
+ lookForRefDIEsToKeep(Current.Die, Current.CU, Current.Flags, Units, File,
+ Worklist);
+ continue;
+ case WorklistItemType::LookForParentDIEsToKeep:
+ lookForParentDIEsToKeep(Current.AncestorIdx, Current.CU, Current.Flags,
+ Worklist);
+ continue;
+ case WorklistItemType::LookForDIEsToKeep:
+ break;
+ }
+
+ unsigned Idx = Current.CU.getOrigUnit().getDIEIndex(Current.Die);
+ CompileUnit::DIEInfo &MyInfo = Current.CU.getInfo(Idx);
+
+ if (MyInfo.Prune)
+ continue;
+
+ // If the Keep flag is set, we are marking a required DIE's dependencies.
+ // If our target is already marked as kept, we're all set.
+ bool AlreadyKept = MyInfo.Keep;
+ if ((Current.Flags & TF_DependencyWalk) && AlreadyKept)
+ continue;
+
+ // We must not call shouldKeepDIE during a dependency walk, because it would
+ // screw up the relocation finding logic.
+ if (!(Current.Flags & TF_DependencyWalk))
+ Current.Flags = shouldKeepDIE(AddressesMap, Ranges, Current.Die, File,
+ Current.CU, MyInfo, Current.Flags);
+
+ // Finish by looking for child DIEs. Because of the LIFO worklist we need
+ // to schedule that work before any subsequent items are added to the
+ // worklist.
+ Worklist.emplace_back(Current.Die, Current.CU, Current.Flags,
+ WorklistItemType::LookForChildDIEsToKeep);
+
+ if (AlreadyKept || !(Current.Flags & TF_Keep))
+ continue;
+
+ // If it is a newly kept DIE mark it as well as all its dependencies as
+ // kept.
+ MyInfo.Keep = true;
+
+ // We're looking for incomplete types.
+ MyInfo.Incomplete =
+ Current.Die.getTag() != dwarf::DW_TAG_subprogram &&
+ Current.Die.getTag() != dwarf::DW_TAG_member &&
+ dwarf::toUnsigned(Current.Die.find(dwarf::DW_AT_declaration), 0);
+
+ // After looking at the parent chain, look for referenced DIEs. Because of
+ // the LIFO worklist we need to schedule that work before any subsequent
+ // items are added to the worklist.
+ Worklist.emplace_back(Current.Die, Current.CU, Current.Flags,
+ WorklistItemType::LookForRefDIEsToKeep);
+
+ bool UseOdr = (Current.Flags & TF_DependencyWalk) ? (Current.Flags & TF_ODR)
+ : Current.CU.hasODR();
+ unsigned ODRFlag = UseOdr ? TF_ODR : 0;
+ unsigned ParFlags = TF_ParentWalk | TF_Keep | TF_DependencyWalk | ODRFlag;
+
+ // Now schedule the parent walk.
+ Worklist.emplace_back(MyInfo.ParentIdx, Current.CU, ParFlags);
+ }
+}
+
+/// Assign an abbreviation number to \p Abbrev.
+///
+/// Our DIEs get freed after every DebugMapObject has been processed,
+/// thus the FoldingSet we use to unique DIEAbbrevs cannot refer to
+ /// the instances held by the DIEs. When we encounter an abbreviation
+/// that we don't know, we create a permanent copy of it.
+void DWARFLinker::assignAbbrev(DIEAbbrev &Abbrev) {
+ // Check the set for priors.
+ FoldingSetNodeID ID;
+ Abbrev.Profile(ID);
+ void *InsertToken;
+ DIEAbbrev *InSet = AbbreviationsSet.FindNodeOrInsertPos(ID, InsertToken);
+
+ // If a matching abbreviation already exists, reuse its number.
+ if (InSet) {
+ // Assign existing abbreviation number.
+ Abbrev.setNumber(InSet->getNumber());
+ } else {
+ // Add to abbreviation list.
+ Abbreviations.push_back(
+ std::make_unique<DIEAbbrev>(Abbrev.getTag(), Abbrev.hasChildren()));
+ for (const auto &Attr : Abbrev.getData())
+ Abbreviations.back()->AddAttribute(Attr.getAttribute(), Attr.getForm());
+ AbbreviationsSet.InsertNode(Abbreviations.back().get(), InsertToken);
+ // Assign the unique abbreviation number.
+ Abbrev.setNumber(Abbreviations.size());
+ Abbreviations.back()->setNumber(Abbreviations.size());
+ }
+}
+
+unsigned DWARFLinker::DIECloner::cloneStringAttribute(
+ DIE &Die, AttributeSpec AttrSpec, const DWARFFormValue &Val,
+ const DWARFUnit &U, OffsetsStringPool &StringPool, AttributesInfo &Info) {
+ // Switch everything to out of line strings.
+ const char *String = *Val.getAsCString();
+ auto StringEntry = StringPool.getEntry(String);
+
+ // Update attributes info.
+ if (AttrSpec.Attr == dwarf::DW_AT_name)
+ Info.Name = StringEntry;
+ else if (AttrSpec.Attr == dwarf::DW_AT_MIPS_linkage_name ||
+ AttrSpec.Attr == dwarf::DW_AT_linkage_name)
+ Info.MangledName = StringEntry;
+
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr), dwarf::DW_FORM_strp,
+ DIEInteger(StringEntry.getOffset()));
+
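+ // DW_FORM_strp is a 4-byte offset into .debug_str in 32-bit DWARF, which is
+ // what the fixed size returned below accounts for.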
+ return 4;
+}
+
+unsigned DWARFLinker::DIECloner::cloneDieReferenceAttribute(
+ DIE &Die, const DWARFDie &InputDIE, AttributeSpec AttrSpec,
+ unsigned AttrSize, const DWARFFormValue &Val, const DwarfFile &File,
+ CompileUnit &Unit) {
+ const DWARFUnit &U = Unit.getOrigUnit();
+ uint64_t Ref = *Val.getAsReference();
+
+ DIE *NewRefDie = nullptr;
+ CompileUnit *RefUnit = nullptr;
+ DeclContext *Ctxt = nullptr;
+
+ DWARFDie RefDie =
+ Linker.resolveDIEReference(File, CompileUnits, Val, InputDIE, RefUnit);
+
+ // If the referenced DIE is not found, drop the attribute.
+ if (!RefDie || AttrSpec.Attr == dwarf::DW_AT_sibling)
+ return 0;
+
+ unsigned Idx = RefUnit->getOrigUnit().getDIEIndex(RefDie);
+ CompileUnit::DIEInfo &RefInfo = RefUnit->getInfo(Idx);
+
+ // If we already have emitted an equivalent DeclContext, just point
+ // at it.
+ if (isODRAttribute(AttrSpec.Attr)) {
+ Ctxt = RefInfo.Ctxt;
+ if (Ctxt && Ctxt->getCanonicalDIEOffset()) {
+ DIEInteger Attr(Ctxt->getCanonicalDIEOffset());
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::DW_FORM_ref_addr, Attr);
+ return U.getRefAddrByteSize();
+ }
+ }
+
+ if (!RefInfo.Clone) {
+ assert(Ref > InputDIE.getOffset());
+ // We haven't cloned this DIE yet. Just create an empty one and
+ // store it. It'll get really cloned when we process it.
+ RefInfo.Clone = DIE::get(DIEAlloc, dwarf::Tag(RefDie.getTag()));
+ }
+ NewRefDie = RefInfo.Clone;
+
+ if (AttrSpec.Form == dwarf::DW_FORM_ref_addr ||
+ (Unit.hasODR() && isODRAttribute(AttrSpec.Attr))) {
+ // We cannot currently rely on a DIEEntry to emit ref_addr
+ // references, because the implementation calls back to DwarfDebug
+ // to find the unit offset. (We don't have a DwarfDebug)
+ // FIXME: we should be able to design DIEEntry reliance on
+ // DwarfDebug away.
+ uint64_t Attr;
+ if (Ref < InputDIE.getOffset()) {
+ // We must have already cloned that DIE.
+ uint32_t NewRefOffset =
+ RefUnit->getStartOffset() + NewRefDie->getOffset();
+ Attr = NewRefOffset;
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::DW_FORM_ref_addr, DIEInteger(Attr));
+ } else {
+ // A forward reference. Note and fixup later.
+ Attr = 0xBADDEF;
+ Unit.noteForwardReference(
+ NewRefDie, RefUnit, Ctxt,
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::DW_FORM_ref_addr, DIEInteger(Attr)));
+ }
+ return U.getRefAddrByteSize();
+ }
+
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::Form(AttrSpec.Form), DIEEntry(*NewRefDie));
+
+ return AttrSize;
+}
+
+void DWARFLinker::DIECloner::cloneExpression(
+ DataExtractor &Data, DWARFExpression Expression, const DwarfFile &File,
+ CompileUnit &Unit, SmallVectorImpl<uint8_t> &OutputBuffer) {
+ using Encoding = DWARFExpression::Operation::Encoding;
+
+ uint64_t OpOffset = 0;
+ for (auto &Op : Expression) {
+ auto Description = Op.getDescription();
+ // DW_OP_const_type is variable-length and has 3
+ // operands. DWARFExpression thus far only supports 2.
+ auto Op0 = Description.Op[0];
+ auto Op1 = Description.Op[1];
+ if ((Op0 == Encoding::BaseTypeRef && Op1 != Encoding::SizeNA) ||
+ (Op1 == Encoding::BaseTypeRef && Op0 != Encoding::Size1))
+ Linker.reportWarning("Unsupported DW_OP encoding.", File);
+
+ if ((Op0 == Encoding::BaseTypeRef && Op1 == Encoding::SizeNA) ||
+ (Op1 == Encoding::BaseTypeRef && Op0 == Encoding::Size1)) {
+ // This code assumes that the other non-typeref operand fits into 1 byte.
+ assert(OpOffset < Op.getEndOffset());
+ uint32_t ULEBsize = Op.getEndOffset() - OpOffset - 1;
+ assert(ULEBsize <= 16);
+
+ // Copy over the operation.
+ OutputBuffer.push_back(Op.getCode());
+ uint64_t RefOffset;
+ if (Op1 == Encoding::SizeNA) {
+ RefOffset = Op.getRawOperand(0);
+ } else {
+ OutputBuffer.push_back(Op.getRawOperand(0));
+ RefOffset = Op.getRawOperand(1);
+ }
+ uint32_t Offset = 0;
+ // Look up the base type. For DW_OP_convert, the operand may be 0 to
+ // instead indicate the generic type. The same holds for
+ // DW_OP_reinterpret, which is currently not supported.
+ if (RefOffset > 0 || Op.getCode() != dwarf::DW_OP_convert) {
+ auto RefDie = Unit.getOrigUnit().getDIEForOffset(RefOffset);
+ uint32_t RefIdx = Unit.getOrigUnit().getDIEIndex(RefDie);
+ CompileUnit::DIEInfo &Info = Unit.getInfo(RefIdx);
+ if (DIE *Clone = Info.Clone)
+ Offset = Clone->getOffset();
+ else
+ Linker.reportWarning(
+ "base type ref doesn't point to DW_TAG_base_type.", File);
+ }
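+ // Re-encode the offset as a ULEB128 padded to the original operand size so
+ // that the overall expression length is preserved.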
+ uint8_t ULEB[16];
+ unsigned RealSize = encodeULEB128(Offset, ULEB, ULEBsize);
+ if (RealSize > ULEBsize) {
+ // Emit the generic type as a fallback.
+ RealSize = encodeULEB128(0, ULEB, ULEBsize);
+ Linker.reportWarning("base type ref doesn't fit.", File);
+ }
+ assert(RealSize == ULEBsize && "padding failed");
+ ArrayRef<uint8_t> ULEBbytes(ULEB, ULEBsize);
+ OutputBuffer.append(ULEBbytes.begin(), ULEBbytes.end());
+ } else {
+ // Copy over everything else unmodified.
+ StringRef Bytes = Data.getData().slice(OpOffset, Op.getEndOffset());
+ OutputBuffer.append(Bytes.begin(), Bytes.end());
+ }
+ OpOffset = Op.getEndOffset();
+ }
+}
+
+unsigned DWARFLinker::DIECloner::cloneBlockAttribute(
+ DIE &Die, const DwarfFile &File, CompileUnit &Unit, AttributeSpec AttrSpec,
+ const DWARFFormValue &Val, unsigned AttrSize, bool IsLittleEndian) {
+ DIEValueList *Attr;
+ DIEValue Value;
+ DIELoc *Loc = nullptr;
+ DIEBlock *Block = nullptr;
+ if (AttrSpec.Form == dwarf::DW_FORM_exprloc) {
+ Loc = new (DIEAlloc) DIELoc;
+ Linker.DIELocs.push_back(Loc);
+ } else {
+ Block = new (DIEAlloc) DIEBlock;
+ Linker.DIEBlocks.push_back(Block);
+ }
+ Attr = Loc ? static_cast<DIEValueList *>(Loc)
+ : static_cast<DIEValueList *>(Block);
+
+ if (Loc)
+ Value = DIEValue(dwarf::Attribute(AttrSpec.Attr),
+ dwarf::Form(AttrSpec.Form), Loc);
+ else
+ Value = DIEValue(dwarf::Attribute(AttrSpec.Attr),
+ dwarf::Form(AttrSpec.Form), Block);
+
+ // If the block is a DWARF Expression, clone it into the temporary
+ // buffer using cloneExpression(), otherwise copy the data directly.
+ SmallVector<uint8_t, 32> Buffer;
+ ArrayRef<uint8_t> Bytes = *Val.getAsBlock();
+ if (DWARFAttribute::mayHaveLocationDescription(AttrSpec.Attr) &&
+ (Val.isFormClass(DWARFFormValue::FC_Block) ||
+ Val.isFormClass(DWARFFormValue::FC_Exprloc))) {
+ DWARFUnit &OrigUnit = Unit.getOrigUnit();
+ DataExtractor Data(StringRef((const char *)Bytes.data(), Bytes.size()),
+ IsLittleEndian, OrigUnit.getAddressByteSize());
+ DWARFExpression Expr(Data, OrigUnit.getAddressByteSize(),
+ OrigUnit.getFormParams().Format);
+ cloneExpression(Data, Expr, File, Unit, Buffer);
+ Bytes = Buffer;
+ }
+ for (auto Byte : Bytes)
+ Attr->addValue(DIEAlloc, static_cast<dwarf::Attribute>(0),
+ dwarf::DW_FORM_data1, DIEInteger(Byte));
+
+ // FIXME: If DIEBlock and DIELoc just reused the Size field of
+ // the DIE class, this "if" could be replaced by
+ // Attr->setSize(Bytes.size()).
+ if (Loc)
+ Loc->setSize(Bytes.size());
+ else
+ Block->setSize(Bytes.size());
+
+ Die.addValue(DIEAlloc, Value);
+ return AttrSize;
+}
+
+unsigned DWARFLinker::DIECloner::cloneAddressAttribute(
+ DIE &Die, AttributeSpec AttrSpec, const DWARFFormValue &Val,
+ const CompileUnit &Unit, AttributesInfo &Info) {
+ uint64_t Addr = *Val.getAsAddress();
+
+ if (LLVM_UNLIKELY(Linker.Options.Update)) {
+ if (AttrSpec.Attr == dwarf::DW_AT_low_pc)
+ Info.HasLowPc = true;
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::Form(AttrSpec.Form), DIEInteger(Addr));
+ return Unit.getOrigUnit().getAddressByteSize();
+ }
+
+ if (AttrSpec.Attr == dwarf::DW_AT_low_pc) {
+ if (Die.getTag() == dwarf::DW_TAG_inlined_subroutine ||
+ Die.getTag() == dwarf::DW_TAG_lexical_block)
+ // The low_pc of a block or inline subroutine might get
+ // relocated because it happens to match the low_pc of the
+ // enclosing subprogram. To prevent issues with that, always use
+ // the low_pc from the input DIE if relocations have been applied.
+ Addr = (Info.OrigLowPc != std::numeric_limits<uint64_t>::max()
+ ? Info.OrigLowPc
+ : Addr) +
+ Info.PCOffset;
+ else if (Die.getTag() == dwarf::DW_TAG_compile_unit) {
+ Addr = Unit.getLowPc();
+ if (Addr == std::numeric_limits<uint64_t>::max())
+ return 0;
+ }
+ Info.HasLowPc = true;
+ } else if (AttrSpec.Attr == dwarf::DW_AT_high_pc) {
+ if (Die.getTag() == dwarf::DW_TAG_compile_unit) {
+ if (uint64_t HighPc = Unit.getHighPc())
+ Addr = HighPc;
+ else
+ return 0;
+ } else
+ // If we have a high_pc recorded for the input DIE, use
+ // it. Otherwise (when no relocations were applied) just use the
+ // one we just decoded.
+ Addr = (Info.OrigHighPc ? Info.OrigHighPc : Addr) + Info.PCOffset;
+ } else if (AttrSpec.Attr == dwarf::DW_AT_call_return_pc) {
+ // Relocate a return PC address within a call site entry.
+ if (Die.getTag() == dwarf::DW_TAG_call_site)
+ Addr = (Info.OrigCallReturnPc ? Info.OrigCallReturnPc : Addr) +
+ Info.PCOffset;
+ } else if (AttrSpec.Attr == dwarf::DW_AT_call_pc) {
+ // Relocate the address of a branch instruction within a call site entry.
+ if (Die.getTag() == dwarf::DW_TAG_call_site)
+ Addr = (Info.OrigCallPc ? Info.OrigCallPc : Addr) + Info.PCOffset;
+ }
+
+ Die.addValue(DIEAlloc, static_cast<dwarf::Attribute>(AttrSpec.Attr),
+ static_cast<dwarf::Form>(AttrSpec.Form), DIEInteger(Addr));
+ return Unit.getOrigUnit().getAddressByteSize();
+}
+
+unsigned DWARFLinker::DIECloner::cloneScalarAttribute(
+ DIE &Die, const DWARFDie &InputDIE, const DwarfFile &File,
+ CompileUnit &Unit, AttributeSpec AttrSpec, const DWARFFormValue &Val,
+ unsigned AttrSize, AttributesInfo &Info) {
+ uint64_t Value;
+
+ if (LLVM_UNLIKELY(Linker.Options.Update)) {
+ if (auto OptionalValue = Val.getAsUnsignedConstant())
+ Value = *OptionalValue;
+ else if (auto OptionalValue = Val.getAsSignedConstant())
+ Value = *OptionalValue;
+ else if (auto OptionalValue = Val.getAsSectionOffset())
+ Value = *OptionalValue;
+ else {
+ Linker.reportWarning(
+ "Unsupported scalar attribute form. Dropping attribute.", File,
+ &InputDIE);
+ return 0;
+ }
+ if (AttrSpec.Attr == dwarf::DW_AT_declaration && Value)
+ Info.IsDeclaration = true;
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::Form(AttrSpec.Form), DIEInteger(Value));
+ return AttrSize;
+ }
+
+ if (AttrSpec.Attr == dwarf::DW_AT_high_pc &&
+ Die.getTag() == dwarf::DW_TAG_compile_unit) {
+ if (Unit.getLowPc() == -1ULL)
+ return 0;
+ // In DWARF >= 4, high_pc is a size, not an address.
+ Value = Unit.getHighPc() - Unit.getLowPc();
+ } else if (AttrSpec.Form == dwarf::DW_FORM_sec_offset)
+ Value = *Val.getAsSectionOffset();
+ else if (AttrSpec.Form == dwarf::DW_FORM_sdata)
+ Value = *Val.getAsSignedConstant();
+ else if (auto OptionalValue = Val.getAsUnsignedConstant())
+ Value = *OptionalValue;
+ else {
+ Linker.reportWarning(
+ "Unsupported scalar attribute form. Dropping attribute.", File,
+ &InputDIE);
+ return 0;
+ }
+ PatchLocation Patch =
+ Die.addValue(DIEAlloc, dwarf::Attribute(AttrSpec.Attr),
+ dwarf::Form(AttrSpec.Form), DIEInteger(Value));
+ if (AttrSpec.Attr == dwarf::DW_AT_ranges) {
+ Unit.noteRangeAttribute(Die, Patch);
+ Info.HasRanges = true;
+ }
+
+ // A more generic way to check for location attributes would be
+ // nice, but it's very unlikely that any other attribute needs a
+ // location list.
+ // FIXME: use DWARFAttribute::mayHaveLocationDescription().
+ else if (AttrSpec.Attr == dwarf::DW_AT_location ||
+ AttrSpec.Attr == dwarf::DW_AT_frame_base) {
+ Unit.noteLocationAttribute(Patch, Info.PCOffset);
+ } else if (AttrSpec.Attr == dwarf::DW_AT_declaration && Value)
+ Info.IsDeclaration = true;
+
+ return AttrSize;
+}
+
+/// Clone \p InputDIE's attribute described by \p AttrSpec with
+/// value \p Val, and add it to \p Die.
+/// \returns the size of the cloned attribute.
+unsigned DWARFLinker::DIECloner::cloneAttribute(
+ DIE &Die, const DWARFDie &InputDIE, const DwarfFile &File,
+ CompileUnit &Unit, OffsetsStringPool &StringPool, const DWARFFormValue &Val,
+ const AttributeSpec AttrSpec, unsigned AttrSize, AttributesInfo &Info,
+ bool IsLittleEndian) {
+ const DWARFUnit &U = Unit.getOrigUnit();
+
+ switch (AttrSpec.Form) {
+ case dwarf::DW_FORM_strp:
+ case dwarf::DW_FORM_string:
+ return cloneStringAttribute(Die, AttrSpec, Val, U, StringPool, Info);
+ case dwarf::DW_FORM_ref_addr:
+ case dwarf::DW_FORM_ref1:
+ case dwarf::DW_FORM_ref2:
+ case dwarf::DW_FORM_ref4:
+ case dwarf::DW_FORM_ref8:
+ return cloneDieReferenceAttribute(Die, InputDIE, AttrSpec, AttrSize, Val,
+ File, Unit);
+ case dwarf::DW_FORM_block:
+ case dwarf::DW_FORM_block1:
+ case dwarf::DW_FORM_block2:
+ case dwarf::DW_FORM_block4:
+ case dwarf::DW_FORM_exprloc:
+ return cloneBlockAttribute(Die, File, Unit, AttrSpec, Val, AttrSize,
+ IsLittleEndian);
+ case dwarf::DW_FORM_addr:
+ return cloneAddressAttribute(Die, AttrSpec, Val, Unit, Info);
+ case dwarf::DW_FORM_data1:
+ case dwarf::DW_FORM_data2:
+ case dwarf::DW_FORM_data4:
+ case dwarf::DW_FORM_data8:
+ case dwarf::DW_FORM_udata:
+ case dwarf::DW_FORM_sdata:
+ case dwarf::DW_FORM_sec_offset:
+ case dwarf::DW_FORM_flag:
+ case dwarf::DW_FORM_flag_present:
+ return cloneScalarAttribute(Die, InputDIE, File, Unit, AttrSpec, Val,
+ AttrSize, Info);
+ default:
+ Linker.reportWarning(
+ "Unsupported attribute form in cloneAttribute. Dropping.", File,
+ &InputDIE);
+ }
+
+ return 0;
+}
+
+static bool isObjCSelector(StringRef Name) {
+ return Name.size() > 2 && (Name[0] == '-' || Name[0] == '+') &&
+ (Name[1] == '[');
+}
+
+void DWARFLinker::DIECloner::addObjCAccelerator(CompileUnit &Unit,
+ const DIE *Die,
+ DwarfStringPoolEntryRef Name,
+ OffsetsStringPool &StringPool,
+ bool SkipPubSection) {
+ assert(isObjCSelector(Name.getString()) && "not an objc selector");
+ // Objective C method or class function.
+ // "- [Class(Category) selector :withArg ...]"
+ StringRef ClassNameStart(Name.getString().drop_front(2));
+ size_t FirstSpace = ClassNameStart.find(' ');
+ if (FirstSpace == StringRef::npos)
+ return;
+
+ StringRef SelectorStart(ClassNameStart.data() + FirstSpace + 1);
+ if (!SelectorStart.size())
+ return;
+
+ StringRef Selector(SelectorStart.data(), SelectorStart.size() - 1);
+ Unit.addNameAccelerator(Die, StringPool.getEntry(Selector), SkipPubSection);
+
+ // Add an entry for the class name that points to this
+ // method/class function.
+ StringRef ClassName(ClassNameStart.data(), FirstSpace);
+ Unit.addObjCAccelerator(Die, StringPool.getEntry(ClassName), SkipPubSection);
+
+ if (ClassName[ClassName.size() - 1] == ')') {
+ size_t OpenParens = ClassName.find('(');
+ if (OpenParens != StringRef::npos) {
+ StringRef ClassNameNoCategory(ClassName.data(), OpenParens);
+ Unit.addObjCAccelerator(Die, StringPool.getEntry(ClassNameNoCategory),
+ SkipPubSection);
+
+ std::string MethodNameNoCategory(Name.getString().data(), OpenParens + 2);
+ // FIXME: The missing space here may be a bug, but
+ // dsymutil-classic also does it this way.
+ MethodNameNoCategory.append(std::string(SelectorStart));
+ Unit.addNameAccelerator(Die, StringPool.getEntry(MethodNameNoCategory),
+ SkipPubSection);
+ }
+ }
+}
+
+static bool
+shouldSkipAttribute(DWARFAbbreviationDeclaration::AttributeSpec AttrSpec,
+ uint16_t Tag, bool InDebugMap, bool SkipPC,
+ bool InFunctionScope) {
+ switch (AttrSpec.Attr) {
+ default:
+ return false;
+ case dwarf::DW_AT_low_pc:
+ case dwarf::DW_AT_high_pc:
+ case dwarf::DW_AT_ranges:
+ return SkipPC;
+ case dwarf::DW_AT_location:
+ case dwarf::DW_AT_frame_base:
+ // FIXME: for some reason dsymutil-classic keeps the location attributes
+ // when they are of block type (i.e. not location lists). This is totally
+ // wrong for globals where we will keep a wrong address. It is mostly
+ // harmless for locals, but there is no point in keeping these anyway when
+ // the function wasn't linked.
+ return (SkipPC || (!InFunctionScope && Tag == dwarf::DW_TAG_variable &&
+ !InDebugMap)) &&
+ !DWARFFormValue(AttrSpec.Form).isFormClass(DWARFFormValue::FC_Block);
+ }
+}
+
+DIE *DWARFLinker::DIECloner::cloneDIE(const DWARFDie &InputDIE,
+ const DwarfFile &File, CompileUnit &Unit,
+ OffsetsStringPool &StringPool,
+ int64_t PCOffset, uint32_t OutOffset,
+ unsigned Flags, bool IsLittleEndian,
+ DIE *Die) {
+ DWARFUnit &U = Unit.getOrigUnit();
+ unsigned Idx = U.getDIEIndex(InputDIE);
+ CompileUnit::DIEInfo &Info = Unit.getInfo(Idx);
+
+ // Should the DIE appear in the output?
+ if (!Unit.getInfo(Idx).Keep)
+ return nullptr;
+
+ uint64_t Offset = InputDIE.getOffset();
+ assert(!(Die && Info.Clone) && "Can't supply a DIE and a cloned DIE");
+ if (!Die) {
+ // The DIE might have been already created by a forward reference
+ // (see cloneDieReferenceAttribute()).
+ if (!Info.Clone)
+ Info.Clone = DIE::get(DIEAlloc, dwarf::Tag(InputDIE.getTag()));
+ Die = Info.Clone;
+ }
+
+ assert(Die->getTag() == InputDIE.getTag());
+ Die->setOffset(OutOffset);
+ if ((Unit.hasODR() || Unit.isClangModule()) && !Info.Incomplete &&
+ Die->getTag() != dwarf::DW_TAG_namespace && Info.Ctxt &&
+ Info.Ctxt != Unit.getInfo(Info.ParentIdx).Ctxt &&
+ !Info.Ctxt->getCanonicalDIEOffset()) {
+ // We are about to emit a DIE that is the root of its own valid
+ // DeclContext tree. Make the current offset the canonical offset
+ // for this context.
+ Info.Ctxt->setCanonicalDIEOffset(OutOffset + Unit.getStartOffset());
+ }
+
+ // Extract and clone every attribute.
+ DWARFDataExtractor Data = U.getDebugInfoExtractor();
+ // Point to the next DIE (generally there is always at least a NULL
+ // entry after the current one). If this is a lone
+ // DW_TAG_compile_unit without any children, point to the next unit.
+ uint64_t NextOffset = (Idx + 1 < U.getNumDIEs())
+ ? U.getDIEAtIndex(Idx + 1).getOffset()
+ : U.getNextUnitOffset();
+ AttributesInfo AttrInfo;
+
+ // We could copy the data only if we need to apply a relocation to it. After
+ // testing, it seems there is no performance downside to doing the copy
+ // unconditionally, and it makes the code simpler.
+ SmallString<40> DIECopy(Data.getData().substr(Offset, NextOffset - Offset));
+ Data =
+ DWARFDataExtractor(DIECopy, Data.isLittleEndian(), Data.getAddressSize());
+
+ // Modify the copy with relocated addresses.
+ if (ObjFile.Addresses->areRelocationsResolved() &&
+ ObjFile.Addresses->applyValidRelocs(DIECopy, Offset,
+ Data.isLittleEndian())) {
+ // If we applied relocations, we store the value of high_pc that was
+ // potentially stored in the input DIE. If high_pc is an address
+ // (Dwarf version == 2), then it might have been relocated to a
+ // totally unrelated value (because the end address in the object
+ // file might be start address of another function which got moved
+ // independently by the linker). The computation of the actual
+ // high_pc value is done in cloneAddressAttribute().
+ AttrInfo.OrigHighPc =
+ dwarf::toAddress(InputDIE.find(dwarf::DW_AT_high_pc), 0);
+ // Also store the low_pc. It might get relocated in an
+ // inlined subroutine that happens to be at the beginning of its
+ // inlining function.
+ AttrInfo.OrigLowPc = dwarf::toAddress(InputDIE.find(dwarf::DW_AT_low_pc),
+ std::numeric_limits<uint64_t>::max());
+ AttrInfo.OrigCallReturnPc =
+ dwarf::toAddress(InputDIE.find(dwarf::DW_AT_call_return_pc), 0);
+ AttrInfo.OrigCallPc =
+ dwarf::toAddress(InputDIE.find(dwarf::DW_AT_call_pc), 0);
+ }
+
+ // Reset the Offset to 0 as we will be working on the local copy of
+ // the data.
+ Offset = 0;
+
+ const auto *Abbrev = InputDIE.getAbbreviationDeclarationPtr();
+ Offset += getULEB128Size(Abbrev->getCode());
+
+ // We are entering a subprogram. Get and propagate the PCOffset.
+ if (Die->getTag() == dwarf::DW_TAG_subprogram)
+ PCOffset = Info.AddrAdjust;
+ AttrInfo.PCOffset = PCOffset;
+
+ if (Abbrev->getTag() == dwarf::DW_TAG_subprogram) {
+ Flags |= TF_InFunctionScope;
+ if (!Info.InDebugMap && LLVM_LIKELY(!Update))
+ Flags |= TF_SkipPC;
+ }
+
+ bool Copied = false;
+ for (const auto &AttrSpec : Abbrev->attributes()) {
+ if (LLVM_LIKELY(!Update) &&
+ shouldSkipAttribute(AttrSpec, Die->getTag(), Info.InDebugMap,
+ Flags & TF_SkipPC, Flags & TF_InFunctionScope)) {
+ DWARFFormValue::skipValue(AttrSpec.Form, Data, &Offset,
+ U.getFormParams());
+ // FIXME: dsymutil-classic keeps the old abbreviation around
+ // even if it's not used. We can remove this (and the copyAbbrev
+ // helper) as soon as bit-for-bit compatibility is not a goal anymore.
+ if (!Copied) {
+ copyAbbrev(*InputDIE.getAbbreviationDeclarationPtr(), Unit.hasODR());
+ Copied = true;
+ }
+ continue;
+ }
+
+ DWARFFormValue Val(AttrSpec.Form);
+ uint64_t AttrSize = Offset;
+ Val.extractValue(Data, &Offset, U.getFormParams(), &U);
+ AttrSize = Offset - AttrSize;
+
+ OutOffset += cloneAttribute(*Die, InputDIE, File, Unit, StringPool, Val,
+ AttrSpec, AttrSize, AttrInfo, IsLittleEndian);
+ }
+
+ // Look for accelerator entries.
+ uint16_t Tag = InputDIE.getTag();
+ // FIXME: This is slightly wrong. An inlined_subroutine without a
+ // low_pc, but with AT_ranges might be interesting to get into the
+ // accelerator tables too. For now stick with dsymutil's behavior.
+ if ((Info.InDebugMap || AttrInfo.HasLowPc || AttrInfo.HasRanges) &&
+ Tag != dwarf::DW_TAG_compile_unit &&
+ getDIENames(InputDIE, AttrInfo, StringPool,
+ Tag != dwarf::DW_TAG_inlined_subroutine)) {
+ if (AttrInfo.MangledName && AttrInfo.MangledName != AttrInfo.Name)
+ Unit.addNameAccelerator(Die, AttrInfo.MangledName,
+ Tag == dwarf::DW_TAG_inlined_subroutine);
+ if (AttrInfo.Name) {
+ if (AttrInfo.NameWithoutTemplate)
+ Unit.addNameAccelerator(Die, AttrInfo.NameWithoutTemplate,
+ /* SkipPubSection */ true);
+ Unit.addNameAccelerator(Die, AttrInfo.Name,
+ Tag == dwarf::DW_TAG_inlined_subroutine);
+ }
+ if (AttrInfo.Name && isObjCSelector(AttrInfo.Name.getString()))
+ addObjCAccelerator(Unit, Die, AttrInfo.Name, StringPool,
+ /* SkipPubSection */ true);
+
+ } else if (Tag == dwarf::DW_TAG_namespace) {
+ if (!AttrInfo.Name)
+ AttrInfo.Name = StringPool.getEntry("(anonymous namespace)");
+ Unit.addNamespaceAccelerator(Die, AttrInfo.Name);
+ } else if (isTypeTag(Tag) && !AttrInfo.IsDeclaration &&
+ getDIENames(InputDIE, AttrInfo, StringPool) && AttrInfo.Name &&
+ AttrInfo.Name.getString()[0]) {
+ uint32_t Hash = hashFullyQualifiedName(InputDIE, Unit, File);
+ uint64_t RuntimeLang =
+ dwarf::toUnsigned(InputDIE.find(dwarf::DW_AT_APPLE_runtime_class))
+ .getValueOr(0);
+ bool ObjCClassIsImplementation =
+ (RuntimeLang == dwarf::DW_LANG_ObjC ||
+ RuntimeLang == dwarf::DW_LANG_ObjC_plus_plus) &&
+ dwarf::toUnsigned(InputDIE.find(dwarf::DW_AT_APPLE_objc_complete_type))
+ .getValueOr(0);
+ Unit.addTypeAccelerator(Die, AttrInfo.Name, ObjCClassIsImplementation,
+ Hash);
+ }
+
+ // Determine whether there are any children that we want to keep.
+ bool HasChildren = false;
+ for (auto Child : InputDIE.children()) {
+ unsigned Idx = U.getDIEIndex(Child);
+ if (Unit.getInfo(Idx).Keep) {
+ HasChildren = true;
+ break;
+ }
+ }
+
+ DIEAbbrev NewAbbrev = Die->generateAbbrev();
+ if (HasChildren)
+ NewAbbrev.setChildrenFlag(dwarf::DW_CHILDREN_yes);
+ // Assign a permanent abbrev number.
+ Linker.assignAbbrev(NewAbbrev);
+ Die->setAbbrevNumber(NewAbbrev.getNumber());
+
+ // Add the size of the abbreviation number to the output offset.
+ OutOffset += getULEB128Size(Die->getAbbrevNumber());
+
+ if (!HasChildren) {
+ // Update our size.
+ Die->setSize(OutOffset - Die->getOffset());
+ return Die;
+ }
+
+ // Recursively clone children.
+ for (auto Child : InputDIE.children()) {
+ if (DIE *Clone = cloneDIE(Child, File, Unit, StringPool, PCOffset,
+ OutOffset, Flags, IsLittleEndian)) {
+ Die->addChild(Clone);
+ OutOffset = Clone->getOffset() + Clone->getSize();
+ }
+ }
+
+ // Account for the end of children marker.
+ OutOffset += sizeof(int8_t);
+ // Update our size.
+ Die->setSize(OutOffset - Die->getOffset());
+ return Die;
+}
+
+/// Patch the input object file relevant debug_ranges entries
+/// and emit them in the output file. Update the relevant attributes
+/// to point at the new entries.
+void DWARFLinker::patchRangesForUnit(const CompileUnit &Unit,
+ DWARFContext &OrigDwarf,
+ const DwarfFile &File) const {
+ DWARFDebugRangeList RangeList;
+ const auto &FunctionRanges = Unit.getFunctionRanges();
+ unsigned AddressSize = Unit.getOrigUnit().getAddressByteSize();
+ DWARFDataExtractor RangeExtractor(OrigDwarf.getDWARFObj(),
+ OrigDwarf.getDWARFObj().getRangesSection(),
+ OrigDwarf.isLittleEndian(), AddressSize);
+ auto InvalidRange = FunctionRanges.end(), CurrRange = InvalidRange;
+ DWARFUnit &OrigUnit = Unit.getOrigUnit();
+ auto OrigUnitDie = OrigUnit.getUnitDIE(false);
+ uint64_t OrigLowPc =
+ dwarf::toAddress(OrigUnitDie.find(dwarf::DW_AT_low_pc), -1ULL);
+ // Range addresses are based on the unit's low_pc. Compute the
+ // offset we need to apply to adapt them to the new unit's low_pc.
+ int64_t UnitPcOffset = 0;
+ if (OrigLowPc != -1ULL)
+ UnitPcOffset = int64_t(OrigLowPc) - Unit.getLowPc();
+
+ for (const auto &RangeAttribute : Unit.getRangesAttributes()) {
+ uint64_t Offset = RangeAttribute.get();
+ RangeAttribute.set(TheDwarfEmitter->getRangesSectionSize());
+ if (Error E = RangeList.extract(RangeExtractor, &Offset)) {
+ llvm::consumeError(std::move(E));
+ reportWarning("invalid range list ignored.", File);
+ RangeList.clear();
+ }
+ const auto &Entries = RangeList.getEntries();
+ if (!Entries.empty()) {
+ const DWARFDebugRangeList::RangeListEntry &First = Entries.front();
+
+ if (CurrRange == InvalidRange ||
+ First.StartAddress + OrigLowPc < CurrRange.start() ||
+ First.StartAddress + OrigLowPc >= CurrRange.stop()) {
+ CurrRange = FunctionRanges.find(First.StartAddress + OrigLowPc);
+ if (CurrRange == InvalidRange ||
+ CurrRange.start() > First.StartAddress + OrigLowPc) {
+ reportWarning("no mapping for range.", File);
+ continue;
+ }
+ }
+ }
+
+ TheDwarfEmitter->emitRangesEntries(UnitPcOffset, OrigLowPc, CurrRange,
+ Entries, AddressSize);
+ }
+}
+
+/// Generate the debug_aranges entries for \p Unit and if the
+/// unit has a DW_AT_ranges attribute, also emit the debug_ranges
+/// contribution for this attribute.
+/// FIXME: this could actually be done right in patchRangesForUnit,
+/// but for the sake of initial bit-for-bit compatibility with legacy
+/// dsymutil, we have to do it in a delayed pass.
+void DWARFLinker::generateUnitRanges(CompileUnit &Unit) const {
+ auto Attr = Unit.getUnitRangesAttribute();
+ if (Attr)
+ Attr->set(TheDwarfEmitter->getRangesSectionSize());
+ TheDwarfEmitter->emitUnitRangesEntries(Unit, static_cast<bool>(Attr));
+}
+
+/// Insert the new line info sequence \p Seq into the current
+/// set of already linked line info \p Rows.
+static void insertLineSequence(std::vector<DWARFDebugLine::Row> &Seq,
+ std::vector<DWARFDebugLine::Row> &Rows) {
+ if (Seq.empty())
+ return;
+
+ if (!Rows.empty() && Rows.back().Address < Seq.front().Address) {
+ Rows.insert(Rows.end(), Seq.begin(), Seq.end());
+ Seq.clear();
+ return;
+ }
+
+ object::SectionedAddress Front = Seq.front().Address;
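+ // Find the insertion point: the first already-linked row whose address is
+ // not below the new sequence's start address.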
+ auto InsertPoint = partition_point(
+ Rows, [=](const DWARFDebugLine::Row &O) { return O.Address < Front; });
+
+ // FIXME: this only removes the unneeded end_sequence if the
+ // sequences have been inserted in order. Using a global sort as
+ // described in patchLineTableForUnit() and delaying the end_sequence
+ // elimination to emitLineTableForUnit() would let us get rid of all of them.
+ if (InsertPoint != Rows.end() && InsertPoint->Address == Front &&
+ InsertPoint->EndSequence) {
+ *InsertPoint = Seq.front();
+ Rows.insert(InsertPoint + 1, Seq.begin() + 1, Seq.end());
+ } else {
+ Rows.insert(InsertPoint, Seq.begin(), Seq.end());
+ }
+
+ Seq.clear();
+}
+
+static void patchStmtList(DIE &Die, DIEInteger Offset) {
+ for (auto &V : Die.values())
+ if (V.getAttribute() == dwarf::DW_AT_stmt_list) {
+ V = DIEValue(V.getAttribute(), V.getForm(), Offset);
+ return;
+ }
+
+ llvm_unreachable("Didn't find DW_AT_stmt_list in cloned DIE!");
+}
+
+ /// Extract the line table for \p Unit from \p OrigDwarf, and
+ /// recreate a relocated version of it for the address ranges that
+ /// are present in the binary.
+void DWARFLinker::patchLineTableForUnit(CompileUnit &Unit,
+ DWARFContext &OrigDwarf,
+ const DwarfFile &File) {
+ DWARFDie CUDie = Unit.getOrigUnit().getUnitDIE();
+ auto StmtList = dwarf::toSectionOffset(CUDie.find(dwarf::DW_AT_stmt_list));
+ if (!StmtList)
+ return;
+
+ // Update the cloned DW_AT_stmt_list with the correct debug_line offset.
+ if (auto *OutputDIE = Unit.getOutputUnitDIE())
+ patchStmtList(*OutputDIE,
+ DIEInteger(TheDwarfEmitter->getLineSectionSize()));
+
+ RangesTy &Ranges = File.Addresses->getValidAddressRanges();
+
+ // Parse the original line info for the unit.
+ DWARFDebugLine::LineTable LineTable;
+ uint64_t StmtOffset = *StmtList;
+ DWARFDataExtractor LineExtractor(
+ OrigDwarf.getDWARFObj(), OrigDwarf.getDWARFObj().getLineSection(),
+ OrigDwarf.isLittleEndian(), Unit.getOrigUnit().getAddressByteSize());
+ if (needToTranslateStrings())
+ return TheDwarfEmitter->translateLineTable(LineExtractor, StmtOffset);
+
+ if (Error Err =
+ LineTable.parse(LineExtractor, &StmtOffset, OrigDwarf,
+ &Unit.getOrigUnit(), OrigDwarf.getWarningHandler()))
+ OrigDwarf.getWarningHandler()(std::move(Err));
+
+ // This vector is the output line table.
+ std::vector<DWARFDebugLine::Row> NewRows;
+ NewRows.reserve(LineTable.Rows.size());
+
+ // Current sequence of rows being extracted, before being inserted
+ // in NewRows.
+ std::vector<DWARFDebugLine::Row> Seq;
+ const auto &FunctionRanges = Unit.getFunctionRanges();
+ auto InvalidRange = FunctionRanges.end(), CurrRange = InvalidRange;
+
+ // FIXME: This logic is meant to generate exactly the same output as
+ // Darwin's classic dsymutil. There is a nicer way to implement this:
+ // put all the relocated line info in NewRows and sort NewRows before
+ // passing it to emitLineTableForUnit. This should be correct, as the
+ // sequences for a function should stay together in the sorted output.
+ // There are a few corner cases that look suspicious though, and that
+ // required implementing the logic this way. Revisit this once initial
+ // validation is finished.
+
+ // Iterate over the object file line info and extract the sequences
+ // that correspond to linked functions.
+ for (auto &Row : LineTable.Rows) {
+ // Check whether we stepped out of the range. The range is
+ // half-open, but we accept the end address of the range if
+ // it is marked as end_sequence in the input (because in that
+ // case, the relocation offset is accurate and that entry won't
+ // serve as the start of another function).
+ if (CurrRange == InvalidRange || Row.Address.Address < CurrRange.start() ||
+ Row.Address.Address > CurrRange.stop() ||
+ (Row.Address.Address == CurrRange.stop() && !Row.EndSequence)) {
+ // We just stepped out of a known range. Insert an end_sequence
+ // corresponding to the end of the range.
+ uint64_t StopAddress = CurrRange != InvalidRange
+ ? CurrRange.stop() + CurrRange.value()
+ : -1ULL;
+ CurrRange = FunctionRanges.find(Row.Address.Address);
+ bool CurrRangeValid =
+ CurrRange != InvalidRange && CurrRange.start() <= Row.Address.Address;
+ if (!CurrRangeValid) {
+ CurrRange = InvalidRange;
+ if (StopAddress != -1ULL) {
+ // Try harder by looking in the Address ranges map.
+ // There are corner cases where this finds a
+ // valid entry. It's unclear if this is right or wrong, but
+ // for now do as dsymutil-classic does.
+ // FIXME: Understand exactly what cases this addresses and
+ // potentially remove it along with the Ranges map.
+ auto Range = Ranges.lower_bound(Row.Address.Address);
+ if (Range != Ranges.begin() && Range != Ranges.end())
+ --Range;
+
+ if (Range != Ranges.end() && Range->first <= Row.Address.Address &&
+ Range->second.HighPC >= Row.Address.Address) {
+ StopAddress = Row.Address.Address + Range->second.Offset;
+ }
+ }
+ }
+ if (StopAddress != -1ULL && !Seq.empty()) {
+ // Insert an end_sequence row with the computed end address, but
+ // the same line as the previous one.
+ auto NextLine = Seq.back();
+ NextLine.Address.Address = StopAddress;
+ NextLine.EndSequence = 1;
+ NextLine.PrologueEnd = 0;
+ NextLine.BasicBlock = 0;
+ NextLine.EpilogueBegin = 0;
+ Seq.push_back(NextLine);
+ insertLineSequence(Seq, NewRows);
+ }
+
+ if (!CurrRangeValid)
+ continue;
+ }
+
+ // Ignore empty sequences.
+ if (Row.EndSequence && Seq.empty())
+ continue;
+
+ // Relocate row address and add it to the current sequence.
+ Row.Address.Address += CurrRange.value();
+ Seq.emplace_back(Row);
+
+ if (Row.EndSequence)
+ insertLineSequence(Seq, NewRows);
+ }
+
+ // Finished extracting, now emit the line tables.
+ // FIXME: LLVM hard-codes its prologue values. We just copy the
+ // prologue over and that works because we act as both producer and
+ // consumer. It would be nicer to have a real configurable line
+ // table emitter.
+ if (LineTable.Prologue.getVersion() < 2 ||
+ LineTable.Prologue.getVersion() > 5 ||
+ LineTable.Prologue.DefaultIsStmt != DWARF2_LINE_DEFAULT_IS_STMT ||
+ LineTable.Prologue.OpcodeBase > 13)
+ reportWarning("line table parameters mismatch. Cannot emit.", File);
+ else {
+ uint32_t PrologueEnd = *StmtList + 10 + LineTable.Prologue.PrologueLength;
+ // DWARF v5 has an extra 2 bytes of information before the header_length
+ // field.
+ if (LineTable.Prologue.getVersion() == 5)
+ PrologueEnd += 2;
+ StringRef LineData = OrigDwarf.getDWARFObj().getLineSection().Data;
+ MCDwarfLineTableParams Params;
+ Params.DWARF2LineOpcodeBase = LineTable.Prologue.OpcodeBase;
+ Params.DWARF2LineBase = LineTable.Prologue.LineBase;
+ Params.DWARF2LineRange = LineTable.Prologue.LineRange;
+ TheDwarfEmitter->emitLineTableForUnit(
+ Params, LineData.slice(*StmtList + 4, PrologueEnd),
+ LineTable.Prologue.MinInstLength, NewRows,
+ Unit.getOrigUnit().getAddressByteSize());
+ }
+}
+
+void DWARFLinker::emitAcceleratorEntriesForUnit(CompileUnit &Unit) {
+ switch (Options.TheAccelTableKind) {
+ case AccelTableKind::Apple:
+ emitAppleAcceleratorEntriesForUnit(Unit);
+ break;
+ case AccelTableKind::Dwarf:
+ emitDwarfAcceleratorEntriesForUnit(Unit);
+ break;
+ case AccelTableKind::Default:
+ llvm_unreachable("The default must be updated to a concrete value.");
+ break;
+ }
+}
+
+void DWARFLinker::emitAppleAcceleratorEntriesForUnit(CompileUnit &Unit) {
+ // Add namespaces.
+ for (const auto &Namespace : Unit.getNamespaces())
+ AppleNamespaces.addName(Namespace.Name,
+ Namespace.Die->getOffset() + Unit.getStartOffset());
+
+ // Add names.
+ TheDwarfEmitter->emitPubNamesForUnit(Unit);
+ for (const auto &Pubname : Unit.getPubnames())
+ AppleNames.addName(Pubname.Name,
+ Pubname.Die->getOffset() + Unit.getStartOffset());
+
+ // Add types.
+ TheDwarfEmitter->emitPubTypesForUnit(Unit);
+ for (const auto &Pubtype : Unit.getPubtypes())
+ AppleTypes.addName(
+ Pubtype.Name, Pubtype.Die->getOffset() + Unit.getStartOffset(),
+ Pubtype.Die->getTag(),
+ Pubtype.ObjcClassImplementation ? dwarf::DW_FLAG_type_implementation
+ : 0,
+ Pubtype.QualifiedNameHash);
+
+ // Add ObjC names.
+ for (const auto &ObjC : Unit.getObjC())
+ AppleObjc.addName(ObjC.Name, ObjC.Die->getOffset() + Unit.getStartOffset());
+}
+
+void DWARFLinker::emitDwarfAcceleratorEntriesForUnit(CompileUnit &Unit) {
+ for (const auto &Namespace : Unit.getNamespaces())
+ DebugNames.addName(Namespace.Name, Namespace.Die->getOffset(),
+ Namespace.Die->getTag(), Unit.getUniqueID());
+ for (const auto &Pubname : Unit.getPubnames())
+ DebugNames.addName(Pubname.Name, Pubname.Die->getOffset(),
+ Pubname.Die->getTag(), Unit.getUniqueID());
+ for (const auto &Pubtype : Unit.getPubtypes())
+ DebugNames.addName(Pubtype.Name, Pubtype.Die->getOffset(),
+ Pubtype.Die->getTag(), Unit.getUniqueID());
+}
+
+/// Read the frame info stored in the object, and emit the
+/// patched frame descriptions for the resulting file.
+///
+/// This is actually pretty easy as the data of the CIEs and FDEs can
+/// be considered as black boxes and moved as is. The only thing to do
+/// is to patch the addresses in the headers.
+void DWARFLinker::patchFrameInfoForObject(const DwarfFile &File,
+ RangesTy &Ranges,
+ DWARFContext &OrigDwarf,
+ unsigned AddrSize) {
+ StringRef FrameData = OrigDwarf.getDWARFObj().getFrameSection().Data;
+ if (FrameData.empty())
+ return;
+
+ DataExtractor Data(FrameData, OrigDwarf.isLittleEndian(), 0);
+ uint64_t InputOffset = 0;
+
+ // Store the data of the CIEs defined in this object, keyed by their
+ // offsets.
+ DenseMap<uint64_t, StringRef> LocalCIES;
+
+ while (Data.isValidOffset(InputOffset)) {
+ uint64_t EntryOffset = InputOffset;
+ uint32_t InitialLength = Data.getU32(&InputOffset);
+ if (InitialLength == 0xFFFFFFFF)
+ return reportWarning("DWARF64 is not supported", File);
+
+ uint32_t CIEId = Data.getU32(&InputOffset);
+ if (CIEId == 0xFFFFFFFF) {
+ // This is a CIE, store it.
+ StringRef CIEData = FrameData.substr(EntryOffset, InitialLength + 4);
+ LocalCIES[EntryOffset] = CIEData;
+ // The -4 is to account for the CIEId we just read.
+ InputOffset += InitialLength - 4;
+ continue;
+ }
+
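+ // Not a CIE, so this is an FDE; the field after the CIE pointer is
+ // initial_location, the address of the first instruction the FDE covers.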
+ uint32_t Loc = Data.getUnsigned(&InputOffset, AddrSize);
+
+ // Some compilers seem to emit frame info that doesn't start at
+ // the function entry point, thus we can't just look up the address
+ // in the debug map. Use the AddressInfo's range map to see if the FDE
+ // describes something that we can relocate.
+ auto Range = Ranges.upper_bound(Loc);
+ if (Range != Ranges.begin())
+ --Range;
+ if (Range == Ranges.end() || Range->first > Loc ||
+ Range->second.HighPC <= Loc) {
+ // The +4 is to account for the size of the InitialLength field itself.
+ InputOffset = EntryOffset + InitialLength + 4;
+ continue;
+ }
+
+ // This is an FDE, and we have a mapping.
+ // Have we already emitted a corresponding CIE?
+ StringRef CIEData = LocalCIES[CIEId];
+ if (CIEData.empty())
+ return reportWarning("Inconsistent debug_frame content. Dropping.", File);
+
+ // Check whether we already emitted a CIE that corresponds to the
+ // referenced one (the CIE data is the key of that lookup).
+ auto IteratorInserted = EmittedCIEs.insert(
+ std::make_pair(CIEData, TheDwarfEmitter->getFrameSectionSize()));
+ // If there is no CIE yet for this ID, emit it.
+ if (IteratorInserted.second ||
+ // FIXME: dsymutil-classic only caches the last used CIE for
+ // reuse. Mimic that behavior for now. Just removing that
+ // second half of the condition and the LastCIEOffset variable
+ // makes the code DTRT.
+ LastCIEOffset != IteratorInserted.first->getValue()) {
+ LastCIEOffset = TheDwarfEmitter->getFrameSectionSize();
+ IteratorInserted.first->getValue() = LastCIEOffset;
+ TheDwarfEmitter->emitCIE(CIEData);
+ }
+
+ // Emit the FDE with updated address and CIE pointer.
+ // (4 + AddrSize) is the size of the CIEId + initial_location
+ // fields that will get reconstructed by emitFDE().
+ unsigned FDERemainingBytes = InitialLength - (4 + AddrSize);
+ TheDwarfEmitter->emitFDE(IteratorInserted.first->getValue(), AddrSize,
+ Loc + Range->second.Offset,
+ FrameData.substr(InputOffset, FDERemainingBytes));
+ InputOffset += FDERemainingBytes;
+ }
+}
+
+void DWARFLinker::DIECloner::copyAbbrev(
+ const DWARFAbbreviationDeclaration &Abbrev, bool HasODR) {
+ DIEAbbrev Copy(dwarf::Tag(Abbrev.getTag()),
+ dwarf::Form(Abbrev.hasChildren()));
+
+ for (const auto &Attr : Abbrev.attributes()) {
+ uint16_t Form = Attr.Form;
+ if (HasODR && isODRAttribute(Attr.Attr))
+ Form = dwarf::DW_FORM_ref_addr;
+ Copy.AddAttribute(dwarf::Attribute(Attr.Attr), dwarf::Form(Form));
+ }
+
+ Linker.assignAbbrev(Copy);
+}
+
+uint32_t DWARFLinker::DIECloner::hashFullyQualifiedName(DWARFDie DIE,
+ CompileUnit &U,
+ const DwarfFile &File,
+ int ChildRecurseDepth) {
+ const char *Name = nullptr;
+ DWARFUnit *OrigUnit = &U.getOrigUnit();
+ CompileUnit *CU = &U;
+ Optional<DWARFFormValue> Ref;
+
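+ // Follow DW_AT_specification / DW_AT_abstract_origin references to find the
+ // DIE that actually carries the name.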
+ while (1) {
+ if (const char *CurrentName = DIE.getName(DINameKind::ShortName))
+ Name = CurrentName;
+
+ if (!(Ref = DIE.find(dwarf::DW_AT_specification)) &&
+ !(Ref = DIE.find(dwarf::DW_AT_abstract_origin)))
+ break;
+
+ if (!Ref->isFormClass(DWARFFormValue::FC_Reference))
+ break;
+
+ CompileUnit *RefCU;
+ if (auto RefDIE =
+ Linker.resolveDIEReference(File, CompileUnits, *Ref, DIE, RefCU)) {
+ CU = RefCU;
+ OrigUnit = &RefCU->getOrigUnit();
+ DIE = RefDIE;
+ }
+ }
+
+ unsigned Idx = OrigUnit->getDIEIndex(DIE);
+ if (!Name && DIE.getTag() == dwarf::DW_TAG_namespace)
+ Name = "(anonymous namespace)";
+
+ if (CU->getInfo(Idx).ParentIdx == 0 ||
+ // FIXME: dsymutil-classic compatibility. Ignore modules.
+ CU->getOrigUnit().getDIEAtIndex(CU->getInfo(Idx).ParentIdx).getTag() ==
+ dwarf::DW_TAG_module)
+ return djbHash(Name ? Name : "", djbHash(ChildRecurseDepth ? "" : "::"));
+
+ DWARFDie Die = OrigUnit->getDIEAtIndex(CU->getInfo(Idx).ParentIdx);
+ return djbHash(
+ (Name ? Name : ""),
+ djbHash((Name ? "::" : ""),
+ hashFullyQualifiedName(Die, *CU, File, ++ChildRecurseDepth)));
+}
+
+static uint64_t getDwoId(const DWARFDie &CUDie, const DWARFUnit &Unit) {
+ auto DwoId = dwarf::toUnsigned(
+ CUDie.find({dwarf::DW_AT_dwo_id, dwarf::DW_AT_GNU_dwo_id}));
+ if (DwoId)
+ return *DwoId;
+ return 0;
+}
+
+static std::string remapPath(StringRef Path,
+ const objectPrefixMap &ObjectPrefixMap) {
+ if (ObjectPrefixMap.empty())
+ return Path.str();
+
+ SmallString<256> p = Path;
+ for (const auto &Entry : ObjectPrefixMap)
+ if (llvm::sys::path::replace_path_prefix(p, Entry.first, Entry.second))
+ break;
+ return p.str().str();
+}
+
+bool DWARFLinker::registerModuleReference(
+ DWARFDie CUDie, const DWARFUnit &Unit, const DwarfFile &File,
+ OffsetsStringPool &StringPool, UniquingStringPool &UniquingStringPool,
+ DeclContextTree &ODRContexts, uint64_t ModulesEndOffset, unsigned &UnitID,
+ bool IsLittleEndian, unsigned Indent, bool Quiet) {
+ std::string PCMfile = dwarf::toString(
+ CUDie.find({dwarf::DW_AT_dwo_name, dwarf::DW_AT_GNU_dwo_name}), "");
+ if (PCMfile.empty())
+ return false;
+ if (Options.ObjectPrefixMap)
+ PCMfile = remapPath(PCMfile, *Options.ObjectPrefixMap);
+
+ // Clang module DWARF skeleton CUs abuse this for the path to the module.
+ uint64_t DwoId = getDwoId(CUDie, Unit);
+
+ std::string Name = dwarf::toString(CUDie.find(dwarf::DW_AT_name), "");
+ if (Name.empty()) {
+ if (!Quiet)
+ reportWarning("Anonymous module skeleton CU for " + PCMfile, File);
+ return true;
+ }
+
+ if (!Quiet && Options.Verbose) {
+ outs().indent(Indent);
+ outs() << "Found clang module reference " << PCMfile;
+ }
+
+ auto Cached = ClangModules.find(PCMfile);
+ if (Cached != ClangModules.end()) {
+ // FIXME: Until PR27449 (https://llvm.org/bugs/show_bug.cgi?id=27449) is
+ // fixed in clang, only warn about DWO_id mismatches in verbose mode.
+ // ASTFileSignatures will change randomly when a module is rebuilt.
+ if (!Quiet && Options.Verbose && (Cached->second != DwoId))
+ reportWarning(Twine("hash mismatch: this object file was built against a "
+ "different version of the module ") +
+ PCMfile,
+ File);
+ if (!Quiet && Options.Verbose)
+ outs() << " [cached].\n";
+ return true;
+ }
+ if (!Quiet && Options.Verbose)
+ outs() << " ...\n";
+
+ // Cyclic dependencies are disallowed by Clang, but we still
+ // shouldn't run into an infinite loop, so mark it as processed now.
+ ClangModules.insert({PCMfile, DwoId});
+
+ if (Error E =
+ loadClangModule(CUDie, PCMfile, Name, DwoId, File, StringPool,
+ UniquingStringPool, ODRContexts, ModulesEndOffset,
+ UnitID, IsLittleEndian, Indent + 2, Quiet)) {
+ consumeError(std::move(E));
+ return false;
+ }
+ return true;
+}
+
+Error DWARFLinker::loadClangModule(
+ DWARFDie CUDie, StringRef Filename, StringRef ModuleName, uint64_t DwoId,
+ const DwarfFile &File, OffsetsStringPool &StringPool,
+ UniquingStringPool &UniquingStringPool, DeclContextTree &ODRContexts,
+ uint64_t ModulesEndOffset, unsigned &UnitID, bool IsLittleEndian,
+ unsigned Indent, bool Quiet) {
+ /// Using a SmallString<0> because loadClangModule() is recursive.
+ SmallString<0> Path(Options.PrependPath);
+ if (sys::path::is_relative(Filename))
+ resolveRelativeObjectPath(Path, CUDie);
+ sys::path::append(Path, Filename);
+ // Don't use the cached binary holder because we have no thread-safety
+ // guarantee and the lifetime is limited.
+
+ if (Options.ObjFileLoader == nullptr)
+ return Error::success();
+
+ auto ErrOrObj = Options.ObjFileLoader(File.FileName, Path);
+ if (!ErrOrObj)
+ return Error::success();
+
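+ // A Clang module is expected to contain exactly one compile unit; Unit is
+ // filled in below once that unit is found.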
+ std::unique_ptr<CompileUnit> Unit;
+
+ for (const auto &CU : ErrOrObj->Dwarf->compile_units()) {
+ updateDwarfVersion(CU->getVersion());
+ // Recursively get all modules imported by this one.
+ auto CUDie = CU->getUnitDIE(false);
+ if (!CUDie)
+ continue;
+ if (!registerModuleReference(
+ CUDie, *CU, File, StringPool, UniquingStringPool, ODRContexts,
+ ModulesEndOffset, UnitID, IsLittleEndian, Indent, Quiet)) {
+ if (Unit) {
+ std::string Err =
+ (Filename +
+ ": Clang modules are expected to have exactly 1 compile unit.\n")
+ .str();
+ reportError(Err, File);
+ return make_error<StringError>(Err, inconvertibleErrorCode());
+ }
+ // FIXME: Until PR27449 (https://llvm.org/bugs/show_bug.cgi?id=27449) is
+ // fixed in clang, only warn about DWO_id mismatches in verbose mode.
+ // ASTFileSignatures will change randomly when a module is rebuilt.
+ uint64_t PCMDwoId = getDwoId(CUDie, *CU);
+ if (PCMDwoId != DwoId) {
+ if (!Quiet && Options.Verbose)
+ reportWarning(
+ Twine("hash mismatch: this object file was built against a "
+ "different version of the module ") +
+ Filename,
+ File);
+ // Update the cache entry with the DwoId of the module loaded from disk.
+ ClangModules[Filename] = PCMDwoId;
+ }
+
+ // Add this module.
+ Unit = std::make_unique<CompileUnit>(*CU, UnitID++, !Options.NoODR,
+ ModuleName);
+ Unit->setHasInterestingContent();
+ analyzeContextInfo(CUDie, 0, *Unit, &ODRContexts.getRoot(),
+ UniquingStringPool, ODRContexts, ModulesEndOffset,
+ Options.ParseableSwiftInterfaces,
+ [&](const Twine &Warning, const DWARFDie &DIE) {
+ reportWarning(Warning, File, &DIE);
+ });
+ // Keep everything.
+ Unit->markEverythingAsKept();
+ }
+ }
+ if (!Unit->getOrigUnit().getUnitDIE().hasChildren())
+ return Error::success();
+ if (!Quiet && Options.Verbose) {
+ outs().indent(Indent);
+ outs() << "cloning .debug_info from " << Filename << "\n";
+ }
+
+ UnitListTy CompileUnits;
+ CompileUnits.push_back(std::move(Unit));
+ assert(TheDwarfEmitter);
+ DIECloner(*this, TheDwarfEmitter, *ErrOrObj, DIEAlloc, CompileUnits,
+ Options.Update)
+ .cloneAllCompileUnits(*(ErrOrObj->Dwarf), File, StringPool,
+ IsLittleEndian);
+ return Error::success();
+}
+
+uint64_t DWARFLinker::DIECloner::cloneAllCompileUnits(
+ DWARFContext &DwarfContext, const DwarfFile &File,
+ OffsetsStringPool &StringPool, bool IsLittleEndian) {
+ uint64_t OutputDebugInfoSize =
+ Linker.Options.NoOutput ? 0 : Emitter->getDebugInfoSectionSize();
+ const uint64_t StartOutputDebugInfoSize = OutputDebugInfoSize;
+
+ for (auto &CurrentUnit : CompileUnits) {
+ auto InputDIE = CurrentUnit->getOrigUnit().getUnitDIE();
+ CurrentUnit->setStartOffset(OutputDebugInfoSize);
+ if (!InputDIE) {
+ OutputDebugInfoSize = CurrentUnit->computeNextUnitOffset();
+ continue;
+ }
+ if (CurrentUnit->getInfo(0).Keep) {
+ // Clone the InputDIE into the output DIE of our compile unit, since it
+ // already has a DIE inside of it.
+ CurrentUnit->createOutputDIE();
+ cloneDIE(InputDIE, File, *CurrentUnit, StringPool, 0 /* PC offset */,
+ 11 /* Unit Header size */, 0, IsLittleEndian,
+ CurrentUnit->getOutputUnitDIE());
+ }
+
+ OutputDebugInfoSize = CurrentUnit->computeNextUnitOffset();
+
+ if (!Linker.Options.NoOutput) {
+ assert(Emitter);
+
+ if (LLVM_LIKELY(!Linker.Options.Update) ||
+ Linker.needToTranslateStrings())
+ Linker.patchLineTableForUnit(*CurrentUnit, DwarfContext, File);
+
+ Linker.emitAcceleratorEntriesForUnit(*CurrentUnit);
+
+ if (LLVM_UNLIKELY(Linker.Options.Update))
+ continue;
+
+ Linker.patchRangesForUnit(*CurrentUnit, DwarfContext, File);
+ auto ProcessExpr = [&](StringRef Bytes,
+ SmallVectorImpl<uint8_t> &Buffer) {
+ DWARFUnit &OrigUnit = CurrentUnit->getOrigUnit();
+ DataExtractor Data(Bytes, IsLittleEndian,
+ OrigUnit.getAddressByteSize());
+ cloneExpression(Data,
+ DWARFExpression(Data, OrigUnit.getAddressByteSize(),
+ OrigUnit.getFormParams().Format),
+ File, *CurrentUnit, Buffer);
+ };
+ Emitter->emitLocationsForUnit(*CurrentUnit, DwarfContext, ProcessExpr);
+ }
+ }
+
+ if (!Linker.Options.NoOutput) {
+ assert(Emitter);
+ // Emit all the compile unit's debug information.
+ for (auto &CurrentUnit : CompileUnits) {
+ if (LLVM_LIKELY(!Linker.Options.Update))
+ Linker.generateUnitRanges(*CurrentUnit);
+
+ CurrentUnit->fixupForwardReferences();
+
+ if (!CurrentUnit->getOutputUnitDIE())
+ continue;
+
+ assert(Emitter->getDebugInfoSectionSize() ==
+ CurrentUnit->getStartOffset());
+ Emitter->emitCompileUnitHeader(*CurrentUnit);
+ Emitter->emitDIE(*CurrentUnit->getOutputUnitDIE());
+ assert(Emitter->getDebugInfoSectionSize() ==
+ CurrentUnit->computeNextUnitOffset());
+ }
+ }
+
+ return OutputDebugInfoSize - StartOutputDebugInfoSize;
+}
+
+void DWARFLinker::updateAccelKind(DWARFContext &Dwarf) {
+ if (Options.TheAccelTableKind != AccelTableKind::Default)
+ return;
+
+ auto &DwarfObj = Dwarf.getDWARFObj();
+
+ if (!AtLeastOneDwarfAccelTable &&
+ (!DwarfObj.getAppleNamesSection().Data.empty() ||
+ !DwarfObj.getAppleTypesSection().Data.empty() ||
+ !DwarfObj.getAppleNamespacesSection().Data.empty() ||
+ !DwarfObj.getAppleObjCSection().Data.empty())) {
+ AtLeastOneAppleAccelTable = true;
+ }
+
+ if (!AtLeastOneDwarfAccelTable && !DwarfObj.getNamesSection().Data.empty()) {
+ AtLeastOneDwarfAccelTable = true;
+ }
+}
+
+bool DWARFLinker::emitPaperTrailWarnings(const DwarfFile &File,
+ OffsetsStringPool &StringPool) {
+
+ if (File.Warnings.empty())
+ return false;
+
+ DIE *CUDie = DIE::get(DIEAlloc, dwarf::DW_TAG_compile_unit);
+ CUDie->setOffset(11);
+ StringRef Producer;
+ StringRef WarningHeader;
+
+ switch (DwarfLinkerClientID) {
+ case DwarfLinkerClient::Dsymutil:
+ Producer = StringPool.internString("dsymutil");
+ WarningHeader = "dsymutil_warning";
+ break;
+
+ default:
+ Producer = StringPool.internString("dwarfopt");
+ WarningHeader = "dwarfopt_warning";
+ break;
+ }
+
+ StringRef FileName = StringPool.internString(File.FileName);
+ CUDie->addValue(DIEAlloc, dwarf::DW_AT_producer, dwarf::DW_FORM_strp,
+ DIEInteger(StringPool.getStringOffset(Producer)));
+ DIEBlock *String = new (DIEAlloc) DIEBlock();
+ DIEBlocks.push_back(String);
+ for (auto &C : FileName)
+ String->addValue(DIEAlloc, dwarf::Attribute(0), dwarf::DW_FORM_data1,
+ DIEInteger(C));
+ String->addValue(DIEAlloc, dwarf::Attribute(0), dwarf::DW_FORM_data1,
+ DIEInteger(0));
+
+ CUDie->addValue(DIEAlloc, dwarf::DW_AT_name, dwarf::DW_FORM_string, String);
+ for (const auto &Warning : File.Warnings) {
+ DIE &ConstDie = CUDie->addChild(DIE::get(DIEAlloc, dwarf::DW_TAG_constant));
+ ConstDie.addValue(DIEAlloc, dwarf::DW_AT_name, dwarf::DW_FORM_strp,
+ DIEInteger(StringPool.getStringOffset(WarningHeader)));
+ ConstDie.addValue(DIEAlloc, dwarf::DW_AT_artificial, dwarf::DW_FORM_flag,
+ DIEInteger(1));
+ ConstDie.addValue(DIEAlloc, dwarf::DW_AT_const_value, dwarf::DW_FORM_strp,
+ DIEInteger(StringPool.getStringOffset(Warning)));
+ }
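+ // Size of the CU DIE data: producer strp (4) plus the inline file name
+ // string (length + NUL), one (strp + flag + strp) triple per warning, and
+ // the end-of-children marker; abbreviation codes are added below.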
+ unsigned Size = 4 /* FORM_strp */ + FileName.size() + 1 +
+ File.Warnings.size() * (4 + 1 + 4) + 1 /* End of children */;
+ DIEAbbrev Abbrev = CUDie->generateAbbrev();
+ assignAbbrev(Abbrev);
+ CUDie->setAbbrevNumber(Abbrev.getNumber());
+ Size += getULEB128Size(Abbrev.getNumber());
+ // Abbreviation ordering needed for classic compatibility.
+ for (auto &Child : CUDie->children()) {
+ Abbrev = Child.generateAbbrev();
+ assignAbbrev(Abbrev);
+ Child.setAbbrevNumber(Abbrev.getNumber());
+ Size += getULEB128Size(Abbrev.getNumber());
+ }
+ CUDie->setSize(Size);
+ TheDwarfEmitter->emitPaperTrailWarningsDie(*CUDie);
+
+ return true;
+}
+
+void DWARFLinker::copyInvariantDebugSection(DWARFContext &Dwarf) {
+ if (!needToTranslateStrings())
+ TheDwarfEmitter->emitSectionContents(
+ Dwarf.getDWARFObj().getLineSection().Data, "debug_line");
+ TheDwarfEmitter->emitSectionContents(Dwarf.getDWARFObj().getLocSection().Data,
+ "debug_loc");
+ TheDwarfEmitter->emitSectionContents(
+ Dwarf.getDWARFObj().getRangesSection().Data, "debug_ranges");
+ TheDwarfEmitter->emitSectionContents(
+ Dwarf.getDWARFObj().getFrameSection().Data, "debug_frame");
+ TheDwarfEmitter->emitSectionContents(Dwarf.getDWARFObj().getArangesSection(),
+ "debug_aranges");
+}
+
+void DWARFLinker::addObjectFile(DwarfFile &File) {
+ ObjectContexts.emplace_back(LinkContext(File));
+
+ if (ObjectContexts.back().File.Dwarf)
+ updateAccelKind(*ObjectContexts.back().File.Dwarf);
+}
+
+bool DWARFLinker::link() {
+ assert(Options.NoOutput || TheDwarfEmitter);
+
+ // A unique ID that identifies each compile unit.
+ unsigned UnitID = 0;
+
+ // First populate the data structure we need for each iteration of the
+ // parallel loop.
+ unsigned NumObjects = ObjectContexts.size();
+
+ // This Dwarf string pool is only used for uniquing. It should never be used
+ // for offsets, as it is not thread-safe or predictable.
+ UniquingStringPool UniquingStringPool(nullptr, true);
+
+ // This Dwarf string pool is used for emission. It must be used serially,
+ // as the order of calls to getStringOffset matters for reproducibility.
+ OffsetsStringPool OffsetsStringPool(StringsTranslator, true);
+
+ // ODR contexts used for uniquing type declarations across the link.
+ DeclContextTree ODRContexts;
+
+ // If we haven't decided on an accelerator table kind yet, we base ourselves
+ // on the DWARF we have seen so far. At this point we haven't pulled in debug
+ // information from modules yet, so it is technically possible that they
+ // would affect the decision. However, as they're built with the same
+ // compiler and flags, it is safe to assume that they will follow the
+ // decision made here.
+ if (Options.TheAccelTableKind == AccelTableKind::Default) {
+ if (AtLeastOneDwarfAccelTable && !AtLeastOneAppleAccelTable)
+ Options.TheAccelTableKind = AccelTableKind::Dwarf;
+ else
+ Options.TheAccelTableKind = AccelTableKind::Apple;
+ }
+
+ for (LinkContext &OptContext : ObjectContexts) {
+ if (Options.Verbose) {
+ if (DwarfLinkerClientID == DwarfLinkerClient::Dsymutil)
+ outs() << "DEBUG MAP OBJECT: " << OptContext.File.FileName << "\n";
+ else
+ outs() << "OBJECT FILE: " << OptContext.File.FileName << "\n";
+ }
+
+ if (emitPaperTrailWarnings(OptContext.File, OffsetsStringPool))
+ continue;
+
+ if (!OptContext.File.Dwarf)
+ continue;
+ // Look for relocations that correspond to address map entries.
+
+ // There used to be a findValidRelocations() step here; we probably still
+ // need to gather that information at this point.
+ if (LLVM_LIKELY(!Options.Update) &&
+ !OptContext.File.Addresses->hasValidRelocs()) {
+ if (Options.Verbose)
+ outs() << "No valid relocations found. Skipping.\n";
+
+ // Set "Skip" flag as a signal to other loops that we should not
+ // process this iteration.
+ OptContext.Skip = true;
+ continue;
+ }
+
+ // Setup access to the debug info.
+ if (!OptContext.File.Dwarf)
+ continue;
+
+ // In a first phase, just read in the debug info and load all clang modules.
+ OptContext.CompileUnits.reserve(
+ OptContext.File.Dwarf->getNumCompileUnits());
+
+ for (const auto &CU : OptContext.File.Dwarf->compile_units()) {
+ updateDwarfVersion(CU->getVersion());
+ auto CUDie = CU->getUnitDIE(false);
+ if (Options.Verbose) {
+ outs() << "Input compilation unit:";
+ DIDumpOptions DumpOpts;
+ DumpOpts.ChildRecurseDepth = 0;
+ DumpOpts.Verbose = Options.Verbose;
+ CUDie.dump(outs(), 0, DumpOpts);
+ }
+ if (CUDie && !LLVM_UNLIKELY(Options.Update))
+ registerModuleReference(CUDie, *CU, OptContext.File, OffsetsStringPool,
+ UniquingStringPool, ODRContexts, 0, UnitID,
+ OptContext.File.Dwarf->isLittleEndian());
+ }
+ }
+
+ // If we haven't seen any CUs, pick an arbitrary valid Dwarf version anyway.
+ if (MaxDwarfVersion == 0)
+ MaxDwarfVersion = 3;
+
+ // At this point we know how much data we have emitted. We use this value to
+ // compare canonical DIE offsets in analyzeContextInfo to see if a definition
+ // is already emitted, without being affected by canonical DIE offsets set
+ // later. This prevents non-determinism when analyze and clone execute
+ // concurrently, as clone sets the canonical DIE offset and analyze reads it.
+ const uint64_t ModulesEndOffset =
+ Options.NoOutput ? 0 : TheDwarfEmitter->getDebugInfoSectionSize();
+
+ // These variables manage the list of processed object files.
+ // The mutex and condition variable are to ensure that this is thread safe.
+ std::mutex ProcessedFilesMutex;
+ std::condition_variable ProcessedFilesConditionVariable;
+ BitVector ProcessedFiles(NumObjects, false);
+
+ // Analyzing the context info is particularly expensive so it is executed in
+ // parallel with emitting the previous compile unit.
+ auto AnalyzeLambda = [&](size_t I) {
+ auto &Context = ObjectContexts[I];
+
+ if (Context.Skip || !Context.File.Dwarf)
+ return;
+
+ for (const auto &CU : Context.File.Dwarf->compile_units()) {
+ updateDwarfVersion(CU->getVersion());
+ // The !registerModuleReference() condition effectively skips
+ // over fully resolved skeleton units. This second pass of
+ // registerModuleReferences doesn't do any new work, but it
+ // will collect top-level errors, which are suppressed. Module
+ // warnings were already displayed in the first iteration.
+ bool Quiet = true;
+ auto CUDie = CU->getUnitDIE(false);
+ if (!CUDie || LLVM_UNLIKELY(Options.Update) ||
+ !registerModuleReference(CUDie, *CU, Context.File, OffsetsStringPool,
+ UniquingStringPool, ODRContexts,
+ ModulesEndOffset, UnitID, Quiet)) {
+ Context.CompileUnits.push_back(std::make_unique<CompileUnit>(
+ *CU, UnitID++, !Options.NoODR && !Options.Update, ""));
+ }
+ }
+
+ // Now build the DIE parent links that we will use during the next phase.
+ for (auto &CurrentUnit : Context.CompileUnits) {
+ auto CUDie = CurrentUnit->getOrigUnit().getUnitDIE();
+ if (!CUDie)
+ continue;
+ analyzeContextInfo(CurrentUnit->getOrigUnit().getUnitDIE(), 0,
+ *CurrentUnit, &ODRContexts.getRoot(),
+ UniquingStringPool, ODRContexts, ModulesEndOffset,
+ Options.ParseableSwiftInterfaces,
+ [&](const Twine &Warning, const DWARFDie &DIE) {
+ reportWarning(Warning, Context.File, &DIE);
+ });
+ }
+ };
+
+ // For each object file map how many bytes were emitted.
+ StringMap<DebugInfoSize> SizeByObject;
+
+ // And then the remaining work in serial again.
+ // Note, although this loop runs in serial, it can run in parallel with
+ // the analyzeContextInfo loop as long as it only processes files that
+ // analyzeContextInfo has already finished with.
+ auto CloneLambda = [&](size_t I) {
+ auto &OptContext = ObjectContexts[I];
+ if (OptContext.Skip || !OptContext.File.Dwarf)
+ return;
+
+ // Then mark all the DIEs that need to be present in the generated output
+ // and collect some information about them.
+ // Note that this loop cannot be merged with the previous one because
+ // cross-CU references require the ParentIdx to be set up for every CU in
+ // the object file before calling this.
+ if (LLVM_UNLIKELY(Options.Update)) {
+ for (auto &CurrentUnit : OptContext.CompileUnits)
+ CurrentUnit->markEverythingAsKept();
+ copyInvariantDebugSection(*OptContext.File.Dwarf);
+ } else {
+ for (auto &CurrentUnit : OptContext.CompileUnits)
+ lookForDIEsToKeep(*OptContext.File.Addresses,
+ OptContext.File.Addresses->getValidAddressRanges(),
+ OptContext.CompileUnits,
+ CurrentUnit->getOrigUnit().getUnitDIE(),
+ OptContext.File, *CurrentUnit, 0);
+ }
+
+ // The calls to applyValidRelocs inside cloneDIE will walk the reloc
+ // array again (in the same way findValidRelocsInDebugInfo() did). We
+ // need to reset the NextValidReloc index to the beginning.
+ if (OptContext.File.Addresses->hasValidRelocs() ||
+ LLVM_UNLIKELY(Options.Update)) {
+ SizeByObject[OptContext.File.FileName].Input =
+ getDebugInfoSize(*OptContext.File.Dwarf);
+ SizeByObject[OptContext.File.FileName].Output =
+ DIECloner(*this, TheDwarfEmitter, OptContext.File, DIEAlloc,
+ OptContext.CompileUnits, Options.Update)
+ .cloneAllCompileUnits(*OptContext.File.Dwarf, OptContext.File,
+ OffsetsStringPool,
+ OptContext.File.Dwarf->isLittleEndian());
+ }
+ if (!Options.NoOutput && !OptContext.CompileUnits.empty() &&
+ LLVM_LIKELY(!Options.Update))
+ patchFrameInfoForObject(
+ OptContext.File, OptContext.File.Addresses->getValidAddressRanges(),
+ *OptContext.File.Dwarf,
+ OptContext.CompileUnits[0]->getOrigUnit().getAddressByteSize());
+
+ // Clean-up before starting working on the next object.
+ cleanupAuxiliarryData(OptContext);
+ };
+
+ auto EmitLambda = [&]() {
+ // Emit everything that's global.
+ if (!Options.NoOutput) {
+ TheDwarfEmitter->emitAbbrevs(Abbreviations, MaxDwarfVersion);
+ TheDwarfEmitter->emitStrings(OffsetsStringPool);
+ switch (Options.TheAccelTableKind) {
+ case AccelTableKind::Apple:
+ TheDwarfEmitter->emitAppleNames(AppleNames);
+ TheDwarfEmitter->emitAppleNamespaces(AppleNamespaces);
+ TheDwarfEmitter->emitAppleTypes(AppleTypes);
+ TheDwarfEmitter->emitAppleObjc(AppleObjc);
+ break;
+ case AccelTableKind::Dwarf:
+ TheDwarfEmitter->emitDebugNames(DebugNames);
+ break;
+ case AccelTableKind::Default:
+ llvm_unreachable("Default should have already been resolved.");
+ break;
+ }
+ }
+ };
+
+ auto AnalyzeAll = [&]() {
+ for (unsigned I = 0, E = NumObjects; I != E; ++I) {
+ AnalyzeLambda(I);
+
+ std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
+ ProcessedFiles.set(I);
+ ProcessedFilesConditionVariable.notify_one();
+ }
+ };
+
+ auto CloneAll = [&]() {
+ for (unsigned I = 0, E = NumObjects; I != E; ++I) {
+ {
+ std::unique_lock<std::mutex> LockGuard(ProcessedFilesMutex);
+ if (!ProcessedFiles[I]) {
+ ProcessedFilesConditionVariable.wait(
+ LockGuard, [&]() { return ProcessedFiles[I]; });
+ }
+ }
+
+ CloneLambda(I);
+ }
+ EmitLambda();
+ };
+
+ // To limit memory usage in the single threaded case, analyze and clone are
+ // run sequentially so the OptContext is freed after processing each object
+ // in endDebugObject.
+ if (Options.Threads == 1) {
+ for (unsigned I = 0, E = NumObjects; I != E; ++I) {
+ AnalyzeLambda(I);
+ CloneLambda(I);
+ }
+ EmitLambda();
+ } else {
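+ // Pipeline the analysis and cloning: CloneAll waits on ProcessedFiles, so
+ // it never processes an object before AnalyzeLambda has finished with it.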
+ ThreadPool Pool(hardware_concurrency(2));
+ Pool.async(AnalyzeAll);
+ Pool.async(CloneAll);
+ Pool.wait();
+ }
+
+ if (Options.Statistics) {
+ // Create a vector sorted in descending order by output size.
+ std::vector<std::pair<StringRef, DebugInfoSize>> Sorted;
+ for (auto &E : SizeByObject)
+ Sorted.emplace_back(E.first(), E.second);
+ llvm::sort(Sorted.begin(), Sorted.end(), [](auto &LHS, auto &RHS) {
+ return LHS.second.Output > RHS.second.Output;
+ });
+
+ auto ComputePercentange = [](int64_t Input, int64_t Output) -> float {
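+ // Relative change computed against the mean of the input and output
+ // sizes, which keeps the result symmetric and bounded by +/-200%.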
+ const float Difference = Output - Input;
+ const float Sum = Input + Output;
+ if (Sum == 0)
+ return 0;
+ return (Difference / (Sum / 2));
+ };
+
+ int64_t InputTotal = 0;
+ int64_t OutputTotal = 0;
+ const char *FormatStr = "{0,-45} {1,10}b {2,10}b {3,8:P}\n";
+
+ // Print header.
+ outs() << ".debug_info section size (in bytes)\n";
+ outs() << "----------------------------------------------------------------"
+ "---------------\n";
+ outs() << "Filename Object "
+ " dSYM Change\n";
+ outs() << "----------------------------------------------------------------"
+ "---------------\n";
+
+ // Print body.
+ for (auto &E : Sorted) {
+ InputTotal += E.second.Input;
+ OutputTotal += E.second.Output;
+ llvm::outs() << formatv(
+ FormatStr, sys::path::filename(E.first).take_back(45), E.second.Input,
+ E.second.Output, ComputePercentange(E.second.Input, E.second.Output));
+ }
+ // Print total and footer.
+ outs() << "----------------------------------------------------------------"
+ "---------------\n";
+ llvm::outs() << formatv(FormatStr, "Total", InputTotal, OutputTotal,
+ ComputePercentange(InputTotal, OutputTotal));
+ outs() << "----------------------------------------------------------------"
+ "---------------\n\n";
+ }
+
+ return true;
+}
+
} // namespace llvm
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
index e4de01676dca..f59a9023c690 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerCompileUnit.cpp
@@ -29,6 +29,14 @@ uint16_t CompileUnit::getLanguage() {
return Language;
}
+StringRef CompileUnit::getSysRoot() {
+ if (SysRoot.empty()) {
+ DWARFDie CU = getOrigUnit().getUnitDIE();
+ SysRoot = dwarf::toStringRef(CU.find(dwarf::DW_AT_LLVM_sysroot)).str();
+ }
+ return SysRoot;
+}
+
void CompileUnit::markEverythingAsKept() {
unsigned Idx = 0;
diff --git a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
index 077fd4494241..c9a5da6676b3 100644
--- a/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
+++ b/llvm/lib/DWARFLinker/DWARFLinkerDeclContext.cpp
@@ -80,8 +80,12 @@ PointerIntPair<DeclContext *, 1> DeclContextTree::getChildDeclContext(
break;
}
- const char *Name = DIE.getName(DINameKind::LinkageName);
- const char *ShortName = DIE.getName(DINameKind::ShortName);
+ const char *Name = DIE.getLinkageName();
+ const char *ShortName = DIE.getShortName();
+
+ if (!Name)
+ Name = ShortName;
+
StringRef NameRef;
StringRef ShortNameRef;
StringRef FileRef;
diff --git a/llvm/lib/DWARFLinker/DWARFStreamer.cpp b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
new file mode 100644
index 000000000000..e900335f24b3
--- /dev/null
+++ b/llvm/lib/DWARFLinker/DWARFStreamer.cpp
@@ -0,0 +1,774 @@
+//===- DwarfStreamer.cpp --------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DWARFLinker/DWARFStreamer.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/NonRelocatableStringpool.h"
+#include "llvm/DWARFLinker/DWARFLinkerCompileUnit.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetOptions.h"
+
+namespace llvm {
+
+bool DwarfStreamer::init(Triple TheTriple) {
+ std::string ErrorStr;
+ std::string TripleName;
+ StringRef Context = "dwarf streamer init";
+
+ // Get the target.
+ const Target *TheTarget =
+ TargetRegistry::lookupTarget(TripleName, TheTriple, ErrorStr);
+ if (!TheTarget)
+ return error(ErrorStr, Context), false;
+ TripleName = TheTriple.getTriple();
+
+ // Create all the MC Objects.
+ MRI.reset(TheTarget->createMCRegInfo(TripleName));
+ if (!MRI)
+ return error(Twine("no register info for target ") + TripleName, Context),
+ false;
+
+ MCTargetOptions MCOptions = mc::InitMCTargetOptionsFromFlags();
+ MAI.reset(TheTarget->createMCAsmInfo(*MRI, TripleName, MCOptions));
+ if (!MAI)
+ return error("no asm info for target " + TripleName, Context), false;
+
+ MOFI.reset(new MCObjectFileInfo);
+ MC.reset(new MCContext(MAI.get(), MRI.get(), MOFI.get()));
+ MOFI->InitMCObjectFileInfo(TheTriple, /*PIC*/ false, *MC);
+
+ MSTI.reset(TheTarget->createMCSubtargetInfo(TripleName, "", ""));
+ if (!MSTI)
+ return error("no subtarget info for target " + TripleName, Context), false;
+
+ MAB = TheTarget->createMCAsmBackend(*MSTI, *MRI, MCOptions);
+ if (!MAB)
+ return error("no asm backend for target " + TripleName, Context), false;
+
+ MII.reset(TheTarget->createMCInstrInfo());
+ if (!MII)
+ return error("no instr info info for target " + TripleName, Context), false;
+
+ MCE = TheTarget->createMCCodeEmitter(*MII, *MRI, *MC);
+ if (!MCE)
+ return error("no code emitter for target " + TripleName, Context), false;
+
+ switch (OutFileType) {
+ case OutputFileType::Assembly: {
+ MIP = TheTarget->createMCInstPrinter(TheTriple, MAI->getAssemblerDialect(),
+ *MAI, *MII, *MRI);
+ MS = TheTarget->createAsmStreamer(
+ *MC, std::make_unique<formatted_raw_ostream>(OutFile), true, true, MIP,
+ std::unique_ptr<MCCodeEmitter>(MCE), std::unique_ptr<MCAsmBackend>(MAB),
+ true);
+ break;
+ }
+ case OutputFileType::Object: {
+ MS = TheTarget->createMCObjectStreamer(
+ TheTriple, *MC, std::unique_ptr<MCAsmBackend>(MAB),
+ MAB->createObjectWriter(OutFile), std::unique_ptr<MCCodeEmitter>(MCE),
+ *MSTI, MCOptions.MCRelaxAll, MCOptions.MCIncrementalLinkerCompatible,
+ /*DWARFMustBeAtTheEnd*/ false);
+ break;
+ }
+ }
+
+ if (!MS)
+ return error("no object streamer for target " + TripleName, Context), false;
+
+ // Finally create the AsmPrinter we'll use to emit the DIEs.
+ TM.reset(TheTarget->createTargetMachine(TripleName, "", "", TargetOptions(),
+ None));
+ if (!TM)
+ return error("no target machine for target " + TripleName, Context), false;
+
+ Asm.reset(TheTarget->createAsmPrinter(*TM, std::unique_ptr<MCStreamer>(MS)));
+ if (!Asm)
+ return error("no asm printer for target " + TripleName, Context), false;
+
+ RangesSectionSize = 0;
+ LocSectionSize = 0;
+ LineSectionSize = 0;
+ FrameSectionSize = 0;
+ DebugInfoSectionSize = 0;
+
+ return true;
+}
+
+void DwarfStreamer::finish() { MS->Finish(); }
+
+void DwarfStreamer::switchToDebugInfoSection(unsigned DwarfVersion) {
+ MS->SwitchSection(MOFI->getDwarfInfoSection());
+ MC->setDwarfVersion(DwarfVersion);
+}
+
+/// Emit the compilation unit header for \p Unit in the debug_info section.
+///
+/// A Dwarf section header is encoded as:
+/// uint32_t Unit length (omitting this field)
+/// uint16_t Version
+/// uint32_t Abbreviation table offset
+/// uint8_t Address size
+///
+/// Leading to a total of 11 bytes.
+void DwarfStreamer::emitCompileUnitHeader(CompileUnit &Unit) {
+ unsigned Version = Unit.getOrigUnit().getVersion();
+ switchToDebugInfoSection(Version);
+
+ /// The start of the unit within its section.
+ Unit.setLabelBegin(Asm->createTempSymbol("cu_begin"));
+ Asm->OutStreamer->emitLabel(Unit.getLabelBegin());
+
+ // Emit size of content not including length itself. The size has already
+ // been computed in CompileUnit::computeOffsets(). Subtract 4 from that size
+ // to account for the length field.
+ Asm->emitInt32(Unit.getNextUnitOffset() - Unit.getStartOffset() - 4);
+ Asm->emitInt16(Version);
+
+ // We share one abbreviations table across all units so it's always at the
+ // start of the section.
+ Asm->emitInt32(0);
+ Asm->emitInt8(Unit.getOrigUnit().getAddressByteSize());
+ DebugInfoSectionSize += 11;
+
+ // Remember this CU.
+ EmittedUnits.push_back({Unit.getUniqueID(), Unit.getLabelBegin()});
+}
+
+/// Emit the \p Abbrevs array as the shared abbreviation table
+/// for the linked Dwarf file.
+void DwarfStreamer::emitAbbrevs(
+ const std::vector<std::unique_ptr<DIEAbbrev>> &Abbrevs,
+ unsigned DwarfVersion) {
+ MS->SwitchSection(MOFI->getDwarfAbbrevSection());
+ MC->setDwarfVersion(DwarfVersion);
+ Asm->emitDwarfAbbrevs(Abbrevs);
+}
+
+/// Recursively emit the DIE tree rooted at \p Die.
+void DwarfStreamer::emitDIE(DIE &Die) {
+ MS->SwitchSection(MOFI->getDwarfInfoSection());
+ Asm->emitDwarfDIE(Die);
+ DebugInfoSectionSize += Die.getSize();
+}
+
+/// Emit the contents of section \p SecName, provided in \p SecData.
+void DwarfStreamer::emitSectionContents(StringRef SecData, StringRef SecName) {
+ MCSection *Section =
+ StringSwitch<MCSection *>(SecName)
+ .Case("debug_line", MC->getObjectFileInfo()->getDwarfLineSection())
+ .Case("debug_loc", MC->getObjectFileInfo()->getDwarfLocSection())
+ .Case("debug_ranges",
+ MC->getObjectFileInfo()->getDwarfRangesSection())
+ .Case("debug_frame", MC->getObjectFileInfo()->getDwarfFrameSection())
+ .Case("debug_aranges",
+ MC->getObjectFileInfo()->getDwarfARangesSection())
+ .Default(nullptr);
+
+ if (Section) {
+ MS->SwitchSection(Section);
+
+ MS->emitBytes(SecData);
+ }
+}
+
+/// Emit DIE containing warnings.
+void DwarfStreamer::emitPaperTrailWarningsDie(DIE &Die) {
+ switchToDebugInfoSection(/* Version */ 2);
+ auto &Asm = getAsmPrinter();
+ Asm.emitInt32(11 + Die.getSize() - 4);
+ Asm.emitInt16(2);
+ Asm.emitInt32(0);
+ Asm.emitInt8(MOFI->getTargetTriple().isArch64Bit() ? 8 : 4);
+ DebugInfoSectionSize += 11;
+ emitDIE(Die);
+}
+
+/// Emit the debug_str section stored in \p Pool.
+void DwarfStreamer::emitStrings(const NonRelocatableStringpool &Pool) {
+ Asm->OutStreamer->SwitchSection(MOFI->getDwarfStrSection());
+ std::vector<DwarfStringPoolEntryRef> Entries = Pool.getEntriesForEmission();
+ for (auto Entry : Entries) {
+ // Emit the string itself.
+ Asm->OutStreamer->emitBytes(Entry.getString());
+ // Emit a null terminator.
+ Asm->emitInt8(0);
+ }
+}
+
+void DwarfStreamer::emitDebugNames(
+ AccelTable<DWARF5AccelTableStaticData> &Table) {
+ if (EmittedUnits.empty())
+ return;
+
+ // Build up data structures needed to emit this section.
+ std::vector<MCSymbol *> CompUnits;
+ DenseMap<unsigned, size_t> UniqueIdToCuMap;
+ unsigned Id = 0;
+ for (auto &CU : EmittedUnits) {
+ CompUnits.push_back(CU.LabelBegin);
+ // We might be omitting CUs, so we need to remap them.
+ UniqueIdToCuMap[CU.ID] = Id++;
+ }
+
+ Asm->OutStreamer->SwitchSection(MOFI->getDwarfDebugNamesSection());
+ emitDWARF5AccelTable(
+ Asm.get(), Table, CompUnits,
+ [&UniqueIdToCuMap](const DWARF5AccelTableStaticData &Entry) {
+ return UniqueIdToCuMap[Entry.getCUIndex()];
+ });
+}
+
+void DwarfStreamer::emitAppleNamespaces(
+ AccelTable<AppleAccelTableStaticOffsetData> &Table) {
+ Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelNamespaceSection());
+ auto *SectionBegin = Asm->createTempSymbol("namespac_begin");
+ Asm->OutStreamer->emitLabel(SectionBegin);
+ emitAppleAccelTable(Asm.get(), Table, "namespac", SectionBegin);
+}
+
+void DwarfStreamer::emitAppleNames(
+ AccelTable<AppleAccelTableStaticOffsetData> &Table) {
+ Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelNamesSection());
+ auto *SectionBegin = Asm->createTempSymbol("names_begin");
+ Asm->OutStreamer->emitLabel(SectionBegin);
+ emitAppleAccelTable(Asm.get(), Table, "names", SectionBegin);
+}
+
+void DwarfStreamer::emitAppleObjc(
+ AccelTable<AppleAccelTableStaticOffsetData> &Table) {
+ Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelObjCSection());
+ auto *SectionBegin = Asm->createTempSymbol("objc_begin");
+ Asm->OutStreamer->emitLabel(SectionBegin);
+ emitAppleAccelTable(Asm.get(), Table, "objc", SectionBegin);
+}
+
+void DwarfStreamer::emitAppleTypes(
+ AccelTable<AppleAccelTableStaticTypeData> &Table) {
+ Asm->OutStreamer->SwitchSection(MOFI->getDwarfAccelTypesSection());
+ auto *SectionBegin = Asm->createTempSymbol("types_begin");
+ Asm->OutStreamer->emitLabel(SectionBegin);
+ emitAppleAccelTable(Asm.get(), Table, "types", SectionBegin);
+}
+
+/// Emit the swift_ast section stored in \p Buffer.
+void DwarfStreamer::emitSwiftAST(StringRef Buffer) {
+ MCSection *SwiftASTSection = MOFI->getDwarfSwiftASTSection();
+ SwiftASTSection->setAlignment(Align(32));
+ MS->SwitchSection(SwiftASTSection);
+ MS->emitBytes(Buffer);
+}
+
+/// Emit the debug_range section contents for \p FuncRange by
+/// translating the original \p Entries. The debug_range section
+/// format is totally trivial, consisting just of pairs of
+/// address-sized values describing the ranges.
+void DwarfStreamer::emitRangesEntries(
+ int64_t UnitPcOffset, uint64_t OrigLowPc,
+ const FunctionIntervals::const_iterator &FuncRange,
+ const std::vector<DWARFDebugRangeList::RangeListEntry> &Entries,
+ unsigned AddressSize) {
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfRangesSection());
+
+ // Offset each range by the right amount.
+ int64_t PcOffset = Entries.empty() ? 0 : FuncRange.value() + UnitPcOffset;
+ for (const auto &Range : Entries) {
+ if (Range.isBaseAddressSelectionEntry(AddressSize)) {
+ warn("unsupported base address selection operation",
+ "emitting debug_ranges");
+ break;
+ }
+ // Do not emit empty ranges.
+ if (Range.StartAddress == Range.EndAddress)
+ continue;
+
+ // All range entries should lie in the function range.
+ if (!(Range.StartAddress + OrigLowPc >= FuncRange.start() &&
+ Range.EndAddress + OrigLowPc <= FuncRange.stop()))
+ warn("inconsistent range data.", "emitting debug_ranges");
+ MS->emitIntValue(Range.StartAddress + PcOffset, AddressSize);
+ MS->emitIntValue(Range.EndAddress + PcOffset, AddressSize);
+ RangesSectionSize += 2 * AddressSize;
+ }
+
+ // Add the terminator entry.
+ MS->emitIntValue(0, AddressSize);
+ MS->emitIntValue(0, AddressSize);
+ RangesSectionSize += 2 * AddressSize;
+}
+
+/// Emit the debug_aranges contribution of a unit and,
+/// if \p DoDebugRanges is true, the debug_range contents for a
+/// compile_unit-level DW_AT_ranges attribute (which are basically the
+/// same thing with a different base address).
+/// Just aggregate all the ranges gathered inside that unit.
+void DwarfStreamer::emitUnitRangesEntries(CompileUnit &Unit,
+ bool DoDebugRanges) {
+ unsigned AddressSize = Unit.getOrigUnit().getAddressByteSize();
+ // Gather the ranges in a vector, so that we can simplify them. The
+ // IntervalMap will have coalesced the non-linked ranges, but here
+ // we want to coalesce the linked addresses.
+ std::vector<std::pair<uint64_t, uint64_t>> Ranges;
+ const auto &FunctionRanges = Unit.getFunctionRanges();
+ for (auto Range = FunctionRanges.begin(), End = FunctionRanges.end();
+ Range != End; ++Range)
+ Ranges.push_back(std::make_pair(Range.start() + Range.value(),
+ Range.stop() + Range.value()));
+
+ // The object addresses were sorted, but again, the linked
+ // addresses might end up in a different order.
+ llvm::sort(Ranges);
+
+ if (!Ranges.empty()) {
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfARangesSection());
+
+ MCSymbol *BeginLabel = Asm->createTempSymbol("Barange");
+ MCSymbol *EndLabel = Asm->createTempSymbol("Earange");
+
+ unsigned HeaderSize =
+ sizeof(int32_t) + // Size of contents (w/o this field)
+ sizeof(int16_t) + // DWARF ARange version number
+ sizeof(int32_t) + // Offset of CU in the .debug_info section
+ sizeof(int8_t) + // Pointer Size (in bytes)
+ sizeof(int8_t); // Segment Size (in bytes)
+
+ unsigned TupleSize = AddressSize * 2;
+ unsigned Padding = offsetToAlignment(HeaderSize, Align(TupleSize));
+
+ Asm->emitLabelDifference(EndLabel, BeginLabel, 4); // Arange length
+ Asm->OutStreamer->emitLabel(BeginLabel);
+ Asm->emitInt16(dwarf::DW_ARANGES_VERSION); // Version number
+ Asm->emitInt32(Unit.getStartOffset()); // Corresponding unit's offset
+ Asm->emitInt8(AddressSize); // Address size
+ Asm->emitInt8(0); // Segment size
+
+ Asm->OutStreamer->emitFill(Padding, 0x0);
+
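+ // Emit the address/length tuples, coalescing adjacent ranges whose end
+ // matches the start of the next one.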
+ for (auto Range = Ranges.begin(), End = Ranges.end(); Range != End;
+ ++Range) {
+ uint64_t RangeStart = Range->first;
+ MS->emitIntValue(RangeStart, AddressSize);
+ while ((Range + 1) != End && Range->second == (Range + 1)->first)
+ ++Range;
+ MS->emitIntValue(Range->second - RangeStart, AddressSize);
+ }
+
+ // Emit terminator
+ Asm->OutStreamer->emitIntValue(0, AddressSize);
+ Asm->OutStreamer->emitIntValue(0, AddressSize);
+ Asm->OutStreamer->emitLabel(EndLabel);
+ }
+
+ if (!DoDebugRanges)
+ return;
+
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfRangesSection());
+ // Offset each range by the right amount.
+ int64_t PcOffset = -Unit.getLowPc();
+ // Emit coalesced ranges.
+ for (auto Range = Ranges.begin(), End = Ranges.end(); Range != End; ++Range) {
+ MS->emitIntValue(Range->first + PcOffset, AddressSize);
+ while (Range + 1 != End && Range->second == (Range + 1)->first)
+ ++Range;
+ MS->emitIntValue(Range->second + PcOffset, AddressSize);
+ RangesSectionSize += 2 * AddressSize;
+ }
+
+ // Add the terminator entry.
+ MS->emitIntValue(0, AddressSize);
+ MS->emitIntValue(0, AddressSize);
+ RangesSectionSize += 2 * AddressSize;
+}
+
+/// Emit location lists for \p Unit and update attributes to point to the new
+/// entries.
+void DwarfStreamer::emitLocationsForUnit(
+ const CompileUnit &Unit, DWARFContext &Dwarf,
+ std::function<void(StringRef, SmallVectorImpl<uint8_t> &)> ProcessExpr) {
+ const auto &Attributes = Unit.getLocationAttributes();
+
+ if (Attributes.empty())
+ return;
+
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfLocSection());
+
+ unsigned AddressSize = Unit.getOrigUnit().getAddressByteSize();
+ uint64_t BaseAddressMarker = (AddressSize == 8)
+ ? std::numeric_limits<uint64_t>::max()
+ : std::numeric_limits<uint32_t>::max();
+ const DWARFSection &InputSec = Dwarf.getDWARFObj().getLocSection();
+ DataExtractor Data(InputSec.Data, Dwarf.isLittleEndian(), AddressSize);
+ DWARFUnit &OrigUnit = Unit.getOrigUnit();
+ auto OrigUnitDie = OrigUnit.getUnitDIE(false);
+ int64_t UnitPcOffset = 0;
+ if (auto OrigLowPc = dwarf::toAddress(OrigUnitDie.find(dwarf::DW_AT_low_pc)))
+ UnitPcOffset = int64_t(*OrigLowPc) - Unit.getLowPc();
+
+ SmallVector<uint8_t, 32> Buffer;
+ for (const auto &Attr : Attributes) {
+ uint64_t Offset = Attr.first.get();
+ Attr.first.set(LocSectionSize);
+ // This is the quantity to add to the old location address to get
+ // the correct address for the new one.
+ int64_t LocPcOffset = Attr.second + UnitPcOffset;
+ while (Data.isValidOffset(Offset)) {
+ uint64_t Low = Data.getUnsigned(&Offset, AddressSize);
+ uint64_t High = Data.getUnsigned(&Offset, AddressSize);
+ LocSectionSize += 2 * AddressSize;
+ // End of list entry.
+ if (Low == 0 && High == 0) {
+ Asm->OutStreamer->emitIntValue(0, AddressSize);
+ Asm->OutStreamer->emitIntValue(0, AddressSize);
+ break;
+ }
+ // Base address selection entry.
+ if (Low == BaseAddressMarker) {
+ Asm->OutStreamer->emitIntValue(BaseAddressMarker, AddressSize);
+ Asm->OutStreamer->emitIntValue(High + Attr.second, AddressSize);
+ LocPcOffset = 0;
+ continue;
+ }
+ // Location list entry.
+ Asm->OutStreamer->emitIntValue(Low + LocPcOffset, AddressSize);
+ Asm->OutStreamer->emitIntValue(High + LocPcOffset, AddressSize);
+ uint64_t Length = Data.getU16(&Offset);
+ Asm->OutStreamer->emitIntValue(Length, 2);
+ // Copy the bytes into the buffer, process them, then emit them.
+ Buffer.reserve(Length);
+ Buffer.resize(0);
+ StringRef Input = InputSec.Data.substr(Offset, Length);
+ ProcessExpr(Input, Buffer);
+ Asm->OutStreamer->emitBytes(
+ StringRef((const char *)Buffer.data(), Length));
+ Offset += Length;
+ LocSectionSize += Length + 2;
+ }
+ }
+}
+
+void DwarfStreamer::emitLineTableForUnit(MCDwarfLineTableParams Params,
+ StringRef PrologueBytes,
+ unsigned MinInstLength,
+ std::vector<DWARFDebugLine::Row> &Rows,
+ unsigned PointerSize) {
+ // Switch to the section where the table will be emitted into.
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfLineSection());
+ MCSymbol *LineStartSym = MC->createTempSymbol();
+ MCSymbol *LineEndSym = MC->createTempSymbol();
+
+ // The first 4 bytes are the total length of the information for this
+ // compilation unit (not including these 4 bytes for the length).
+ Asm->emitLabelDifference(LineEndSym, LineStartSym, 4);
+ Asm->OutStreamer->emitLabel(LineStartSym);
+ // Copy Prologue.
+ MS->emitBytes(PrologueBytes);
+ LineSectionSize += PrologueBytes.size() + 4;
+
+ SmallString<128> EncodingBuffer;
+ raw_svector_ostream EncodingOS(EncodingBuffer);
+
+ if (Rows.empty()) {
+ // We only have the dummy entry; dsymutil emits an entry with a 0
+ // address in that case.
+ MCDwarfLineAddr::Encode(*MC, Params, std::numeric_limits<int64_t>::max(), 0,
+ EncodingOS);
+ MS->emitBytes(EncodingOS.str());
+ LineSectionSize += EncodingBuffer.size();
+ MS->emitLabel(LineEndSym);
+ return;
+ }
+
+ // Line table state machine fields
+ unsigned FileNum = 1;
+ unsigned LastLine = 1;
+ unsigned Column = 0;
+ unsigned IsStatement = 1;
+ unsigned Isa = 0;
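+ // -1 is a sentinel meaning "no address emitted yet"; it forces a
+ // DW_LNE_set_address at the start of each sequence.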
+ uint64_t Address = -1ULL;
+
+ unsigned RowsSinceLastSequence = 0;
+
+ for (unsigned Idx = 0; Idx < Rows.size(); ++Idx) {
+ auto &Row = Rows[Idx];
+
+ int64_t AddressDelta;
+ if (Address == -1ULL) {
+ MS->emitIntValue(dwarf::DW_LNS_extended_op, 1);
+ MS->emitULEB128IntValue(PointerSize + 1);
+ MS->emitIntValue(dwarf::DW_LNE_set_address, 1);
+ MS->emitIntValue(Row.Address.Address, PointerSize);
+ LineSectionSize += 2 + PointerSize + getULEB128Size(PointerSize + 1);
+ AddressDelta = 0;
+ } else {
+ AddressDelta = (Row.Address.Address - Address) / MinInstLength;
+ }
+
+ // FIXME: code copied and transformed from MCDwarf.cpp::EmitDwarfLineTable.
+ // We should find a way to share this code, but the current compatibility
+ // requirement with classic dsymutil makes it hard. Revisit that once this
+ // requirement is dropped.
+
+ if (FileNum != Row.File) {
+ FileNum = Row.File;
+ MS->emitIntValue(dwarf::DW_LNS_set_file, 1);
+ MS->emitULEB128IntValue(FileNum);
+ LineSectionSize += 1 + getULEB128Size(FileNum);
+ }
+ if (Column != Row.Column) {
+ Column = Row.Column;
+ MS->emitIntValue(dwarf::DW_LNS_set_column, 1);
+ MS->emitULEB128IntValue(Column);
+ LineSectionSize += 1 + getULEB128Size(Column);
+ }
+
+ // FIXME: We should handle the discriminator here, but dsymutil doesn't
+ // consider it, thus ignore it for now.
+
+ if (Isa != Row.Isa) {
+ Isa = Row.Isa;
+ MS->emitIntValue(dwarf::DW_LNS_set_isa, 1);
+ MS->emitULEB128IntValue(Isa);
+ LineSectionSize += 1 + getULEB128Size(Isa);
+ }
+ if (IsStatement != Row.IsStmt) {
+ IsStatement = Row.IsStmt;
+ MS->emitIntValue(dwarf::DW_LNS_negate_stmt, 1);
+ LineSectionSize += 1;
+ }
+ if (Row.BasicBlock) {
+ MS->emitIntValue(dwarf::DW_LNS_set_basic_block, 1);
+ LineSectionSize += 1;
+ }
+
+ if (Row.PrologueEnd) {
+ MS->emitIntValue(dwarf::DW_LNS_set_prologue_end, 1);
+ LineSectionSize += 1;
+ }
+
+ if (Row.EpilogueBegin) {
+ MS->emitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
+ LineSectionSize += 1;
+ }
+
+ int64_t LineDelta = int64_t(Row.Line) - LastLine;
+ if (!Row.EndSequence) {
+ MCDwarfLineAddr::Encode(*MC, Params, LineDelta, AddressDelta, EncodingOS);
+ MS->emitBytes(EncodingOS.str());
+ LineSectionSize += EncodingBuffer.size();
+ EncodingBuffer.resize(0);
+ Address = Row.Address.Address;
+ LastLine = Row.Line;
+ RowsSinceLastSequence++;
+ } else {
+ if (LineDelta) {
+ MS->emitIntValue(dwarf::DW_LNS_advance_line, 1);
+ MS->emitSLEB128IntValue(LineDelta);
+ LineSectionSize += 1 + getSLEB128Size(LineDelta);
+ }
+ if (AddressDelta) {
+ MS->emitIntValue(dwarf::DW_LNS_advance_pc, 1);
+ MS->emitULEB128IntValue(AddressDelta);
+ LineSectionSize += 1 + getULEB128Size(AddressDelta);
+ }
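+ // An INT64_MAX line delta is the MCDwarfLineAddr::Encode convention for
+ // emitting a DW_LNE_end_sequence.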
+ MCDwarfLineAddr::Encode(*MC, Params, std::numeric_limits<int64_t>::max(),
+ 0, EncodingOS);
+ MS->emitBytes(EncodingOS.str());
+ LineSectionSize += EncodingBuffer.size();
+ EncodingBuffer.resize(0);
+ Address = -1ULL;
+ LastLine = FileNum = IsStatement = 1;
+ RowsSinceLastSequence = Column = Isa = 0;
+ }
+ }
+
+ if (RowsSinceLastSequence) {
+ MCDwarfLineAddr::Encode(*MC, Params, std::numeric_limits<int64_t>::max(), 0,
+ EncodingOS);
+ MS->emitBytes(EncodingOS.str());
+ LineSectionSize += EncodingBuffer.size();
+ EncodingBuffer.resize(0);
+ }
+
+ MS->emitLabel(LineEndSym);
+}
+
+/// Copy the debug_line over to the updated binary while unobfuscating the file
+/// names and directories.
+void DwarfStreamer::translateLineTable(DataExtractor Data, uint64_t Offset) {
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfLineSection());
+ StringRef Contents = Data.getData();
+
+ // We have to deconstruct the line table header, because it contains two
+ // length fields that will need to be updated when we change the length of
+ // the files and directories in there.
+ unsigned UnitLength = Data.getU32(&Offset);
+ uint64_t UnitEnd = Offset + UnitLength;
+ MCSymbol *BeginLabel = MC->createTempSymbol();
+ MCSymbol *EndLabel = MC->createTempSymbol();
+ unsigned Version = Data.getU16(&Offset);
+
+ if (Version > 5) {
+ warn("Unsupported line table version: dropping contents and not "
+ "unobfsucating line table.");
+ return;
+ }
+
+ Asm->emitLabelDifference(EndLabel, BeginLabel, 4);
+ Asm->OutStreamer->emitLabel(BeginLabel);
+ Asm->emitInt16(Version);
+ LineSectionSize += 6;
+
+ MCSymbol *HeaderBeginLabel = MC->createTempSymbol();
+ MCSymbol *HeaderEndLabel = MC->createTempSymbol();
+ Asm->emitLabelDifference(HeaderEndLabel, HeaderBeginLabel, 4);
+ Asm->OutStreamer->emitLabel(HeaderBeginLabel);
+ Offset += 4;
+ LineSectionSize += 4;
+
+ uint64_t AfterHeaderLengthOffset = Offset;
+ // Skip to the directories.
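+ // For DWARF v2-v4 the skipped fields are min_inst_length (plus
+ // maximum_operations_per_instruction for v4 and later), default_is_stmt,
+ // line_base and line_range.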
+ Offset += (Version >= 4) ? 5 : 4;
+ unsigned OpcodeBase = Data.getU8(&Offset);
+ Offset += OpcodeBase - 1;
+ Asm->OutStreamer->emitBytes(Contents.slice(AfterHeaderLengthOffset, Offset));
+ LineSectionSize += Offset - AfterHeaderLengthOffset;
+
+ // Offset points to the first directory.
+ while (const char *Dir = Data.getCStr(&Offset)) {
+ if (Dir[0] == 0)
+ break;
+
+ StringRef Translated = Translator(Dir);
+ Asm->OutStreamer->emitBytes(Translated);
+ Asm->emitInt8(0);
+ LineSectionSize += Translated.size() + 1;
+ }
+ Asm->emitInt8(0);
+ LineSectionSize += 1;
+
+ while (const char *File = Data.getCStr(&Offset)) {
+ if (File[0] == 0)
+ break;
+
+ StringRef Translated = Translator(File);
+ Asm->OutStreamer->emitBytes(Translated);
+ Asm->emitInt8(0);
+ LineSectionSize += Translated.size() + 1;
+
+ uint64_t OffsetBeforeLEBs = Offset;
+ Asm->emitULEB128(Data.getULEB128(&Offset));
+ Asm->emitULEB128(Data.getULEB128(&Offset));
+ Asm->emitULEB128(Data.getULEB128(&Offset));
+ LineSectionSize += Offset - OffsetBeforeLEBs;
+ }
+ Asm->emitInt8(0);
+ LineSectionSize += 1;
+
+ Asm->OutStreamer->emitLabel(HeaderEndLabel);
+
+ // Copy the actual line table program over.
+ Asm->OutStreamer->emitBytes(Contents.slice(Offset, UnitEnd));
+ LineSectionSize += UnitEnd - Offset;
+
+ Asm->OutStreamer->emitLabel(EndLabel);
+ Offset = UnitEnd;
+}
+
+/// Emit the pubnames or pubtypes section contribution for \p
+/// Unit into \p Sec. The data is provided in \p Names.
+void DwarfStreamer::emitPubSectionForUnit(
+ MCSection *Sec, StringRef SecName, const CompileUnit &Unit,
+ const std::vector<CompileUnit::AccelInfo> &Names) {
+ if (Names.empty())
+ return;
+
+ // Start the dwarf pubnames section.
+ Asm->OutStreamer->SwitchSection(Sec);
+ MCSymbol *BeginLabel = Asm->createTempSymbol("pub" + SecName + "_begin");
+ MCSymbol *EndLabel = Asm->createTempSymbol("pub" + SecName + "_end");
+
+ bool HeaderEmitted = false;
+ // Emit the pubnames for this compilation unit.
+ for (const auto &Name : Names) {
+ if (Name.SkipPubSection)
+ continue;
+
+ if (!HeaderEmitted) {
+ // Emit the header.
+ Asm->emitLabelDifference(EndLabel, BeginLabel, 4); // Length
+ Asm->OutStreamer->emitLabel(BeginLabel);
+ Asm->emitInt16(dwarf::DW_PUBNAMES_VERSION); // Version
+ Asm->emitInt32(Unit.getStartOffset()); // Unit offset
+ Asm->emitInt32(Unit.getNextUnitOffset() - Unit.getStartOffset()); // Size
+ HeaderEmitted = true;
+ }
+ Asm->emitInt32(Name.Die->getOffset());
+
+ // Emit the string itself.
+ Asm->OutStreamer->emitBytes(Name.Name.getString());
+ // Emit a null terminator.
+ Asm->emitInt8(0);
+ }
+
+ if (!HeaderEmitted)
+ return;
+ Asm->emitInt32(0); // End marker.
+ Asm->OutStreamer->emitLabel(EndLabel);
+}
+
+/// Emit .debug_pubnames for \p Unit.
+void DwarfStreamer::emitPubNamesForUnit(const CompileUnit &Unit) {
+ if (Minimize)
+ return;
+ emitPubSectionForUnit(MC->getObjectFileInfo()->getDwarfPubNamesSection(),
+ "names", Unit, Unit.getPubnames());
+}
+
+/// Emit .debug_pubtypes for \p Unit.
+void DwarfStreamer::emitPubTypesForUnit(const CompileUnit &Unit) {
+ if (Minimize)
+ return;
+ emitPubSectionForUnit(MC->getObjectFileInfo()->getDwarfPubTypesSection(),
+ "types", Unit, Unit.getPubtypes());
+}
+
+/// Emit a CIE into the debug_frame section.
+void DwarfStreamer::emitCIE(StringRef CIEBytes) {
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfFrameSection());
+
+ MS->emitBytes(CIEBytes);
+ FrameSectionSize += CIEBytes.size();
+}
+
+/// Emit a FDE into the debug_frame section. \p FDEBytes
+/// contains the FDE data without the length, CIE offset and address
+/// which will be replaced with the parameter values.
+void DwarfStreamer::emitFDE(uint32_t CIEOffset, uint32_t AddrSize,
+ uint32_t Address, StringRef FDEBytes) {
+ MS->SwitchSection(MC->getObjectFileInfo()->getDwarfFrameSection());
+
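+ // The length field covers the CIE pointer (4 bytes), the initial_location
+ // (AddrSize bytes) and the remaining FDE bytes, but not itself.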
+ MS->emitIntValue(FDEBytes.size() + 4 + AddrSize, 4);
+ MS->emitIntValue(CIEOffset, 4);
+ MS->emitIntValue(Address, AddrSize);
+ MS->emitBytes(FDEBytes);
+ FrameSectionSize += FDEBytes.size() + 8 + AddrSize;
+}
+
+} // namespace llvm
diff --git a/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp b/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp
index 86a6f9eebfa2..4d8b15530b9e 100644
--- a/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp
+++ b/llvm/lib/DebugInfo/CodeView/AppendingTypeTableBuilder.cpp
@@ -74,12 +74,17 @@ ArrayRef<ArrayRef<uint8_t>> AppendingTypeTableBuilder::records() const {
void AppendingTypeTableBuilder::reset() { SeenRecords.clear(); }
+static ArrayRef<uint8_t> stabilize(BumpPtrAllocator &RecordStorage,
+ ArrayRef<uint8_t> Record) {
+ uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
+ memcpy(Stable, Record.data(), Record.size());
+ return ArrayRef<uint8_t>(Stable, Record.size());
+}
+
TypeIndex
AppendingTypeTableBuilder::insertRecordBytes(ArrayRef<uint8_t> &Record) {
TypeIndex NewTI = nextTypeIndex();
- uint8_t *Stable = RecordStorage.Allocate<uint8_t>(Record.size());
- memcpy(Stable, Record.data(), Record.size());
- Record = ArrayRef<uint8_t>(Stable, Record.size());
+ Record = stabilize(RecordStorage, Record);
SeenRecords.push_back(Record);
return NewTI;
}
@@ -93,3 +98,15 @@ AppendingTypeTableBuilder::insertRecord(ContinuationRecordBuilder &Builder) {
TI = insertRecordBytes(C.RecordData);
return TI;
}
+
+bool AppendingTypeTableBuilder::replaceType(TypeIndex &Index, CVType Data,
+ bool Stabilize) {
+ assert(Index.toArrayIndex() < SeenRecords.size() &&
+ "This function cannot be used to insert records!");
+
+ ArrayRef<uint8_t> Record = Data.data();
+ if (Stabilize)
+ Record = stabilize(RecordStorage, Record);
+ SeenRecords[Index.toArrayIndex()] = Record;
+ return true;
+}
diff --git a/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp b/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
index 36a384baa13d..49761b9dce88 100644
--- a/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
+++ b/llvm/lib/DebugInfo/CodeView/CodeViewRecordIO.cpp
@@ -46,7 +46,7 @@ Error CodeViewRecordIO::endRecord() {
while (PaddingBytes > 0) {
char Pad = static_cast<uint8_t>(LF_PAD0 + PaddingBytes);
StringRef BytesSR = StringRef(&Pad, sizeof(Pad));
- Streamer->EmitBytes(BytesSR);
+ Streamer->emitBytes(BytesSR);
--PaddingBytes;
}
resetStreamedLen();
@@ -101,7 +101,7 @@ Error CodeViewRecordIO::mapByteVectorTail(ArrayRef<uint8_t> &Bytes,
const Twine &Comment) {
if (isStreaming()) {
emitComment(Comment);
- Streamer->EmitBinaryData(toStringRef(Bytes));
+ Streamer->emitBinaryData(toStringRef(Bytes));
incrStreamedLen(Bytes.size());
} else if (isWriting()) {
if (auto EC = Writer->writeBytes(Bytes))
@@ -131,7 +131,7 @@ Error CodeViewRecordIO::mapInteger(TypeIndex &TypeInd, const Twine &Comment) {
emitComment(Comment + ": " + TypeNameStr);
else
emitComment(Comment);
- Streamer->EmitIntValue(TypeInd.getIndex(), sizeof(TypeInd.getIndex()));
+ Streamer->emitIntValue(TypeInd.getIndex(), sizeof(TypeInd.getIndex()));
incrStreamedLen(sizeof(TypeInd.getIndex()));
} else if (isWriting()) {
if (auto EC = Writer->writeInteger(TypeInd.getIndex()))
@@ -205,7 +205,7 @@ Error CodeViewRecordIO::mapStringZ(StringRef &Value, const Twine &Comment) {
if (isStreaming()) {
auto NullTerminatedString = StringRef(Value.data(), Value.size() + 1);
emitComment(Comment);
- Streamer->EmitBytes(NullTerminatedString);
+ Streamer->emitBytes(NullTerminatedString);
incrStreamedLen(NullTerminatedString.size());
} else if (isWriting()) {
// Truncate if we attempt to write too much.
@@ -226,7 +226,7 @@ Error CodeViewRecordIO::mapGuid(GUID &Guid, const Twine &Comment) {
StringRef GuidSR =
StringRef((reinterpret_cast<const char *>(&Guid)), GuidSize);
emitComment(Comment);
- Streamer->EmitBytes(GuidSR);
+ Streamer->emitBytes(GuidSR);
incrStreamedLen(GuidSize);
return Error::success();
}
@@ -275,24 +275,24 @@ void CodeViewRecordIO::emitEncodedSignedInteger(const int64_t &Value,
const Twine &Comment) {
assert(Value < 0 && "Encoded integer is not signed!");
if (Value >= std::numeric_limits<int8_t>::min()) {
- Streamer->EmitIntValue(LF_CHAR, 2);
+ Streamer->emitIntValue(LF_CHAR, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 1);
+ Streamer->emitIntValue(Value, 1);
incrStreamedLen(3);
} else if (Value >= std::numeric_limits<int16_t>::min()) {
- Streamer->EmitIntValue(LF_SHORT, 2);
+ Streamer->emitIntValue(LF_SHORT, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 2);
+ Streamer->emitIntValue(Value, 2);
incrStreamedLen(4);
} else if (Value >= std::numeric_limits<int32_t>::min()) {
- Streamer->EmitIntValue(LF_LONG, 2);
+ Streamer->emitIntValue(LF_LONG, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 4);
+ Streamer->emitIntValue(Value, 4);
incrStreamedLen(6);
} else {
- Streamer->EmitIntValue(LF_QUADWORD, 2);
+ Streamer->emitIntValue(LF_QUADWORD, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 4);
+ Streamer->emitIntValue(Value, 4);
incrStreamedLen(6);
}
}
@@ -301,22 +301,22 @@ void CodeViewRecordIO::emitEncodedUnsignedInteger(const uint64_t &Value,
const Twine &Comment) {
if (Value < LF_NUMERIC) {
emitComment(Comment);
- Streamer->EmitIntValue(Value, 2);
+ Streamer->emitIntValue(Value, 2);
incrStreamedLen(2);
} else if (Value <= std::numeric_limits<uint16_t>::max()) {
- Streamer->EmitIntValue(LF_USHORT, 2);
+ Streamer->emitIntValue(LF_USHORT, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 2);
+ Streamer->emitIntValue(Value, 2);
incrStreamedLen(4);
} else if (Value <= std::numeric_limits<uint32_t>::max()) {
- Streamer->EmitIntValue(LF_ULONG, 2);
+ Streamer->emitIntValue(LF_ULONG, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 4);
+ Streamer->emitIntValue(Value, 4);
incrStreamedLen(6);
} else {
- Streamer->EmitIntValue(LF_UQUADWORD, 2);
+ Streamer->emitIntValue(LF_UQUADWORD, 2);
emitComment(Comment);
- Streamer->EmitIntValue(Value, 8);
+ Streamer->emitIntValue(Value, 8);
incrStreamedLen(6);
}
}
diff --git a/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp b/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
index 0f704f286ee9..3c8a30101450 100644
--- a/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
+++ b/llvm/lib/DebugInfo/CodeView/DebugSubsectionRecord.cpp
@@ -23,13 +23,11 @@ using namespace llvm::codeview;
DebugSubsectionRecord::DebugSubsectionRecord() = default;
DebugSubsectionRecord::DebugSubsectionRecord(DebugSubsectionKind Kind,
- BinaryStreamRef Data,
- CodeViewContainer Container)
- : Container(Container), Kind(Kind), Data(Data) {}
+ BinaryStreamRef Data)
+ : Kind(Kind), Data(Data) {}
Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream,
- DebugSubsectionRecord &Info,
- CodeViewContainer Container) {
+ DebugSubsectionRecord &Info) {
const DebugSubsectionHeader *Header;
BinaryStreamReader Reader(Stream);
if (auto EC = Reader.readObject(Header))
@@ -39,7 +37,6 @@ Error DebugSubsectionRecord::initialize(BinaryStreamRef Stream,
static_cast<DebugSubsectionKind>(uint32_t(Header->Kind));
if (auto EC = Reader.readStreamRef(Info.Data, Header->Length))
return EC;
- Info.Container = Container;
Info.Kind = Kind;
return Error::success();
}
@@ -53,14 +50,14 @@ DebugSubsectionKind DebugSubsectionRecord::kind() const { return Kind; }
BinaryStreamRef DebugSubsectionRecord::getRecordData() const { return Data; }
DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder(
- std::shared_ptr<DebugSubsection> Subsection, CodeViewContainer Container)
- : Subsection(std::move(Subsection)), Container(Container) {}
+ std::shared_ptr<DebugSubsection> Subsection)
+ : Subsection(std::move(Subsection)) {}
DebugSubsectionRecordBuilder::DebugSubsectionRecordBuilder(
- const DebugSubsectionRecord &Contents, CodeViewContainer Container)
- : Contents(Contents), Container(Container) {}
+ const DebugSubsectionRecord &Contents)
+ : Contents(Contents) {}
-uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() {
+uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() const {
uint32_t DataSize = Subsection ? Subsection->calculateSerializedSize()
: Contents.getRecordData().getLength();
// The length of the entire subsection is always padded to 4 bytes,
@@ -68,7 +65,8 @@ uint32_t DebugSubsectionRecordBuilder::calculateSerializedLength() {
return sizeof(DebugSubsectionHeader) + alignTo(DataSize, 4);
}
-Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer) const {
+Error DebugSubsectionRecordBuilder::commit(BinaryStreamWriter &Writer,
+ CodeViewContainer Container) const {
assert(Writer.getOffset() % alignOf(Container) == 0 &&
"Debug Subsection not properly aligned");
diff --git a/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp b/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
index a7ad1d045f04..7cd9ca7498f5 100644
--- a/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
+++ b/llvm/lib/DebugInfo/CodeView/GlobalTypeTableBuilder.cpp
@@ -84,6 +84,13 @@ void GlobalTypeTableBuilder::reset() {
SeenRecords.clear();
}
+static inline ArrayRef<uint8_t> stabilize(BumpPtrAllocator &Alloc,
+ ArrayRef<uint8_t> Data) {
+ uint8_t *Stable = Alloc.Allocate<uint8_t>(Data.size());
+ memcpy(Stable, Data.data(), Data.size());
+ return makeArrayRef(Stable, Data.size());
+}
+
TypeIndex GlobalTypeTableBuilder::insertRecordBytes(ArrayRef<uint8_t> Record) {
GloballyHashedType GHT =
GloballyHashedType::hashType(Record, SeenHashes, SeenHashes);
@@ -104,3 +111,30 @@ GlobalTypeTableBuilder::insertRecord(ContinuationRecordBuilder &Builder) {
TI = insertRecordBytes(C.RecordData);
return TI;
}
+
+bool GlobalTypeTableBuilder::replaceType(TypeIndex &Index, CVType Data,
+ bool Stabilize) {
+ assert(Index.toArrayIndex() < SeenRecords.size() &&
+ "This function cannot be used to insert records!");
+
+ ArrayRef<uint8_t> Record = Data.data();
+ assert(Record.size() < UINT32_MAX && "Record too big");
+ assert(Record.size() % 4 == 0 &&
+ "The type record size is not a multiple of 4 bytes which will cause "
+ "misalignment in the output TPI stream!");
+
+ GloballyHashedType Hash =
+ GloballyHashedType::hashType(Record, SeenHashes, SeenHashes);
+ auto Result = HashedRecords.try_emplace(Hash, Index.toArrayIndex());
+ if (!Result.second) {
+ Index = Result.first->second;
+ return false; // The record is already there, at a different location
+ }
+
+ if (Stabilize)
+ Record = stabilize(RecordStorage, Record);
+
+ SeenRecords[Index.toArrayIndex()] = Record;
+ SeenHashes[Index.toArrayIndex()] = Hash;
+ return true;
+}
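A hedged sketch of how the new replaceType() hook above might be called; the wrapper name is an assumption and only the signature shown in this diff is relied upon. Per the implementation, the call returns false and redirects Index when an identical record already lives elsewhere in the table, and Stabilize=true copies the record bytes into the builder's own storage.

#include "llvm/DebugInfo/CodeView/GlobalTypeTableBuilder.h"
using namespace llvm;
using namespace llvm::codeview;

// Swap the record previously stored at Index for Replacement.
static bool replaceExisting(GlobalTypeTableBuilder &Builder, TypeIndex &Index,
                            CVType Replacement) {
  // Stabilize so the table owns a copy of Replacement's bytes.
  return Builder.replaceType(Index, Replacement, /*Stabilize=*/true);
}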
diff --git a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
index dc1253b7a39f..06b20ba33eec 100644
--- a/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
+++ b/llvm/lib/DebugInfo/CodeView/LazyRandomTypeCollection.cpp
@@ -277,3 +277,8 @@ void LazyRandomTypeCollection::visitRange(TypeIndex Begin, uint32_t BeginOffset,
++RI;
}
}
+
+bool LazyRandomTypeCollection::replaceType(TypeIndex &Index, CVType Data,
+ bool Stabilize) {
+ llvm_unreachable("Method cannot be called");
+}
diff --git a/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp b/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
index 4d7cd468f3ee..13ce3ae82c26 100644
--- a/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
+++ b/llvm/lib/DebugInfo/CodeView/MergingTypeTableBuilder.cpp
@@ -90,7 +90,9 @@ static inline ArrayRef<uint8_t> stabilize(BumpPtrAllocator &Alloc,
TypeIndex MergingTypeTableBuilder::insertRecordAs(hash_code Hash,
ArrayRef<uint8_t> &Record) {
assert(Record.size() < UINT32_MAX && "Record too big");
- assert(Record.size() % 4 == 0 && "Record is not aligned to 4 bytes!");
+ assert(Record.size() % 4 == 0 &&
+ "The type record size is not a multiple of 4 bytes which will cause "
+ "misalignment in the output TPI stream!");
LocallyHashedType WeakHash{Hash, Record};
auto Result = HashedRecords.try_emplace(WeakHash, nextTypeIndex());
@@ -121,3 +123,30 @@ MergingTypeTableBuilder::insertRecord(ContinuationRecordBuilder &Builder) {
TI = insertRecordBytes(C.RecordData);
return TI;
}
+
+bool MergingTypeTableBuilder::replaceType(TypeIndex &Index, CVType Data,
+ bool Stabilize) {
+ assert(Index.toArrayIndex() < SeenRecords.size() &&
+ "This function cannot be used to insert records!");
+
+ ArrayRef<uint8_t> Record = Data.data();
+ assert(Record.size() < UINT32_MAX && "Record too big");
+ assert(Record.size() % 4 == 0 &&
+ "The type record size is not a multiple of 4 bytes which will cause "
+ "misalignment in the output TPI stream!");
+
+ LocallyHashedType WeakHash{hash_value(Record), Record};
+ auto Result = HashedRecords.try_emplace(WeakHash, Index.toArrayIndex());
+ if (!Result.second) {
+ Index = Result.first->second;
+ return false; // The record is already there, at a different location
+ }
+
+ if (Stabilize) {
+ Record = stabilize(RecordStorage, Record);
+ Result.first->first.RecordData = Record;
+ }
+
+ SeenRecords[Index.toArrayIndex()] = Record;
+ return true;
+}
diff --git a/llvm/lib/DebugInfo/CodeView/RecordName.cpp b/llvm/lib/DebugInfo/CodeView/RecordName.cpp
index cfaad1581159..47b5498181b7 100644
--- a/llvm/lib/DebugInfo/CodeView/RecordName.cpp
+++ b/llvm/lib/DebugInfo/CodeView/RecordName.cpp
@@ -253,7 +253,7 @@ std::string llvm::codeview::computeTypeName(TypeCollection &Types,
consumeError(std::move(EC));
return "<unknown UDT>";
}
- return Computer.name();
+ return std::string(Computer.name());
}
static int getSymbolNameOffset(CVSymbol Sym) {
diff --git a/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp b/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp
index 654c40a7470d..ac3b30175956 100644
--- a/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp
+++ b/llvm/lib/DebugInfo/CodeView/SimpleTypeSerializer.cpp
@@ -1,4 +1,15 @@
+//===- SimpleTypeSerializer.cpp -----------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
#include "llvm/DebugInfo/CodeView/SimpleTypeSerializer.h"
+#include "llvm/DebugInfo/CodeView/TypeRecord.h"
+#include "llvm/DebugInfo/CodeView/TypeRecordMapping.h"
+#include "llvm/Support/BinaryStreamWriter.h"
using namespace llvm;
using namespace llvm::codeview;
diff --git a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
index 1aded589e565..bb71c86a0609 100644
--- a/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
+++ b/llvm/lib/DebugInfo/CodeView/TypeRecordMapping.cpp
@@ -99,12 +99,12 @@ static std::string getMemberAttributes(CodeViewRecordIO &IO,
MethodOptions Options) {
if (!IO.isStreaming())
return "";
- std::string AccessSpecifier =
- getEnumName(IO, uint8_t(Access), makeArrayRef(getMemberAccessNames()));
+ std::string AccessSpecifier = std::string(
+ getEnumName(IO, uint8_t(Access), makeArrayRef(getMemberAccessNames())));
std::string MemberAttrs(AccessSpecifier);
if (Kind != MethodKind::Vanilla) {
- std::string MethodKind =
- getEnumName(IO, unsigned(Kind), makeArrayRef(getMemberKindNames()));
+ std::string MethodKind = std::string(
+ getEnumName(IO, unsigned(Kind), makeArrayRef(getMemberKindNames())));
MemberAttrs += ", " + MethodKind;
}
if (Options != MethodOptions::None) {
@@ -201,8 +201,8 @@ Error TypeRecordMapping::visitTypeBegin(CVType &CVR) {
if (IO.isStreaming()) {
auto RecordKind = CVR.kind();
uint16_t RecordLen = CVR.length() - 2;
- std::string RecordKindName =
- getEnumName(IO, unsigned(RecordKind), makeArrayRef(LeafTypeNames));
+ std::string RecordKindName = std::string(
+ getEnumName(IO, unsigned(RecordKind), makeArrayRef(LeafTypeNames)));
error(IO.mapInteger(RecordLen, "Record length"));
error(IO.mapEnum(RecordKind, "Record kind: " + RecordKindName));
}
@@ -241,7 +241,7 @@ Error TypeRecordMapping::visitMemberBegin(CVMemberRecord &Record) {
MemberKind = Record.Kind;
if (IO.isStreaming()) {
- std::string MemberKindName = getLeafTypeName(Record.Kind);
+ std::string MemberKindName = std::string(getLeafTypeName(Record.Kind));
MemberKindName +=
" ( " +
(getEnumName(IO, unsigned(Record.Kind), makeArrayRef(LeafTypeNames)))
@@ -277,8 +277,8 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, ModifierRecord &Record) {
Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
ProcedureRecord &Record) {
- std::string CallingConvName = getEnumName(
- IO, uint8_t(Record.CallConv), makeArrayRef(getCallingConventions()));
+ std::string CallingConvName = std::string(getEnumName(
+ IO, uint8_t(Record.CallConv), makeArrayRef(getCallingConventions())));
std::string FuncOptionNames =
getFlagNames(IO, static_cast<uint16_t>(Record.Options),
makeArrayRef(getFunctionOptionEnum()));
@@ -293,8 +293,8 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
MemberFunctionRecord &Record) {
- std::string CallingConvName = getEnumName(
- IO, uint8_t(Record.CallConv), makeArrayRef(getCallingConventions()));
+ std::string CallingConvName = std::string(getEnumName(
+ IO, uint8_t(Record.CallConv), makeArrayRef(getCallingConventions())));
std::string FuncOptionNames =
getFlagNames(IO, static_cast<uint16_t>(Record.Options),
makeArrayRef(getFunctionOptionEnum()));
@@ -337,12 +337,13 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, PointerRecord &Record) {
SmallString<128> Attr("Attrs: ");
if (IO.isStreaming()) {
- std::string PtrType = getEnumName(IO, unsigned(Record.getPointerKind()),
- makeArrayRef(getPtrKindNames()));
+ std::string PtrType =
+ std::string(getEnumName(IO, unsigned(Record.getPointerKind()),
+ makeArrayRef(getPtrKindNames())));
Attr += "[ Type: " + PtrType;
- std::string PtrMode = getEnumName(IO, unsigned(Record.getMode()),
- makeArrayRef(getPtrModeNames()));
+ std::string PtrMode = std::string(getEnumName(
+ IO, unsigned(Record.getMode()), makeArrayRef(getPtrModeNames())));
Attr += ", Mode: " + PtrMode;
auto PtrSizeOf = Record.getSize();
@@ -374,8 +375,8 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR, PointerRecord &Record) {
MemberPointerInfo &M = *Record.MemberInfo;
error(IO.mapInteger(M.ContainingType, "ClassType"));
- std::string PtrMemberGetRepresentation = getEnumName(
- IO, uint16_t(M.Representation), makeArrayRef(getPtrMemberRepNames()));
+ std::string PtrMemberGetRepresentation = std::string(getEnumName(
+ IO, uint16_t(M.Representation), makeArrayRef(getPtrMemberRepNames())));
error(IO.mapEnum(M.Representation,
"Representation: " + PtrMemberGetRepresentation));
}
@@ -581,8 +582,8 @@ Error TypeRecordMapping::visitKnownRecord(CVType &CVR,
}
Error TypeRecordMapping::visitKnownRecord(CVType &CVR, LabelRecord &Record) {
- std::string ModeName =
- getEnumName(IO, uint16_t(Record.Mode), makeArrayRef(getLabelTypeEnum()));
+ std::string ModeName = std::string(
+ getEnumName(IO, uint16_t(Record.Mode), makeArrayRef(getLabelTypeEnum())));
error(IO.mapEnum(Record.Mode, "Mode: " + ModeName));
return Error::success();
}
diff --git a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
index f9fca74a2199..8c4b640bcd19 100644
--- a/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
+++ b/llvm/lib/DebugInfo/CodeView/TypeStreamMerger.cpp
@@ -360,16 +360,18 @@ Error TypeStreamMerger::remapType(const CVType &Type) {
[this, Type](MutableArrayRef<uint8_t> Storage) -> ArrayRef<uint8_t> {
return remapIndices(Type, Storage);
};
+ unsigned AlignedSize = alignTo(Type.RecordData.size(), 4);
+
if (LLVM_LIKELY(UseGlobalHashes)) {
GlobalTypeTableBuilder &Dest =
isIdRecord(Type.kind()) ? *DestGlobalIdStream : *DestGlobalTypeStream;
GloballyHashedType H = GlobalHashes[CurIndex.toArrayIndex()];
- DestIdx = Dest.insertRecordAs(H, Type.RecordData.size(), DoSerialize);
+ DestIdx = Dest.insertRecordAs(H, AlignedSize, DoSerialize);
} else {
MergingTypeTableBuilder &Dest =
isIdRecord(Type.kind()) ? *DestIdStream : *DestTypeStream;
- RemapStorage.resize(Type.RecordData.size());
+ RemapStorage.resize(AlignedSize);
ArrayRef<uint8_t> Result = DoSerialize(RemapStorage);
if (!Result.empty())
DestIdx = Dest.insertRecordBytes(Result);
@@ -386,9 +388,14 @@ Error TypeStreamMerger::remapType(const CVType &Type) {
ArrayRef<uint8_t>
TypeStreamMerger::remapIndices(const CVType &OriginalType,
MutableArrayRef<uint8_t> Storage) {
+ unsigned Align = OriginalType.RecordData.size() & 3;
+ assert(Storage.size() == alignTo(OriginalType.RecordData.size(), 4) &&
+ "The storage buffer size is not a multiple of 4 bytes which will "
+ "cause misalignment in the output TPI stream!");
+
SmallVector<TiReference, 4> Refs;
discoverTypeIndices(OriginalType.RecordData, Refs);
- if (Refs.empty())
+ if (Refs.empty() && Align == 0)
return OriginalType.RecordData;
::memcpy(Storage.data(), OriginalType.RecordData.data(),
@@ -408,6 +415,16 @@ TypeStreamMerger::remapIndices(const CVType &OriginalType,
return {};
}
}
+
+ if (Align > 0) {
+ RecordPrefix *StorageHeader =
+ reinterpret_cast<RecordPrefix *>(Storage.data());
+ StorageHeader->RecordLen += 4 - Align;
+
+ DestContent = Storage.data() + OriginalType.RecordData.size();
+ for (; Align < 4; ++Align)
+ *DestContent++ = LF_PAD4 - Align;
+ }
return Storage;
}
diff --git a/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp b/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp
index e13068b5b1eb..e517e8846d69 100644
--- a/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp
+++ b/llvm/lib/DebugInfo/CodeView/TypeTableCollection.cpp
@@ -58,3 +58,8 @@ bool TypeTableCollection::contains(TypeIndex Index) {
uint32_t TypeTableCollection::size() { return Records.size(); }
uint32_t TypeTableCollection::capacity() { return Records.size(); }
+
+bool TypeTableCollection::replaceType(TypeIndex &Index, CVType Data,
+ bool Stabilize) {
+ llvm_unreachable("Method cannot be called");
+}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
index abbea3a868c8..ee1ff5460b9b 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAbbreviationDeclaration.cpp
@@ -150,6 +150,8 @@ DWARFAbbreviationDeclaration::findAttributeIndex(dwarf::Attribute Attr) const {
Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue(
const uint64_t DIEOffset, const dwarf::Attribute Attr,
const DWARFUnit &U) const {
+ // Check if this abbreviation has this attribute without needing to skip
+ // any data so we can return quickly if it doesn't.
Optional<uint32_t> MatchAttrIndex = findAttributeIndex(Attr);
if (!MatchAttrIndex)
return None;
@@ -159,26 +161,24 @@ Optional<DWARFFormValue> DWARFAbbreviationDeclaration::getAttributeValue(
// Add the byte size of ULEB that for the abbrev Code so we can start
// skipping the attribute data.
uint64_t Offset = DIEOffset + CodeByteSize;
- uint32_t AttrIndex = 0;
- for (const auto &Spec : AttributeSpecs) {
- if (*MatchAttrIndex == AttrIndex) {
- // We have arrived at the attribute to extract, extract if from Offset.
- if (Spec.isImplicitConst())
- return DWARFFormValue::createFromSValue(Spec.Form,
- Spec.getImplicitConstValue());
-
- DWARFFormValue FormValue(Spec.Form);
- if (FormValue.extractValue(DebugInfoData, &Offset, U.getFormParams(), &U))
- return FormValue;
- }
- // March Offset along until we get to the attribute we want.
- if (auto FixedSize = Spec.getByteSize(U))
+ for (uint32_t CurAttrIdx = 0; CurAttrIdx != *MatchAttrIndex; ++CurAttrIdx)
+ // March Offset along until we get to the attribute we want.
+ if (auto FixedSize = AttributeSpecs[CurAttrIdx].getByteSize(U))
Offset += *FixedSize;
else
- DWARFFormValue::skipValue(Spec.Form, DebugInfoData, &Offset,
- U.getFormParams());
- ++AttrIndex;
- }
+ DWARFFormValue::skipValue(AttributeSpecs[CurAttrIdx].Form, DebugInfoData,
+ &Offset, U.getFormParams());
+
+ // We have arrived at the attribute to extract, extract it from Offset.
+ const AttributeSpec &Spec = AttributeSpecs[*MatchAttrIndex];
+ if (Spec.isImplicitConst())
+ return DWARFFormValue::createFromSValue(Spec.Form,
+ Spec.getImplicitConstValue());
+
+ DWARFFormValue FormValue(Spec.Form);
+ if (FormValue.extractValue(DebugInfoData, &Offset, U.getFormParams(), &U))
+ return FormValue;
+
return None;
}
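For context, a hedged example of the call pattern this rewritten lookup serves; the wrapper is illustrative and uses only the getAttributeValue() signature visible above plus DWARFFormValue::getAsUnsignedConstant().

#include "llvm/DebugInfo/DWARF/DWARFAbbreviationDeclaration.h"
#include "llvm/DebugInfo/DWARF/DWARFFormValue.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
using namespace llvm;

// Read one attribute of a DIE (located at DIEOffset in U) as an unsigned value.
static Optional<uint64_t>
readUnsignedAttr(const DWARFAbbreviationDeclaration &Abbrev, uint64_t DIEOffset,
                 dwarf::Attribute Attr, const DWARFUnit &U) {
  if (Optional<DWARFFormValue> Val =
          Abbrev.getAttributeValue(DIEOffset, Attr, U))
    return Val->getAsUnsignedConstant();
  return None;
}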
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
index 575edba51ee8..28d35b609c24 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFAcceleratorTable.cpp
@@ -365,8 +365,8 @@ AppleAcceleratorTable::equal_range(StringRef Key) const {
void DWARFDebugNames::Header::dump(ScopedPrinter &W) const {
DictScope HeaderScope(W, "Header");
W.printHex("Length", UnitLength);
+ W.printString("Format", dwarf::FormatString(Format));
W.printNumber("Version", Version);
- W.printHex("Padding", Padding);
W.printNumber("CU count", CompUnitCount);
W.printNumber("Local TU count", LocalTypeUnitCount);
W.printNumber("Foreign TU count", ForeignTypeUnitCount);
@@ -378,30 +378,36 @@ void DWARFDebugNames::Header::dump(ScopedPrinter &W) const {
Error DWARFDebugNames::Header::extract(const DWARFDataExtractor &AS,
uint64_t *Offset) {
- // Check that we can read the fixed-size part.
- if (!AS.isValidOffset(*Offset + sizeof(HeaderPOD) - 1))
+ auto HeaderError = [Offset = *Offset](Error E) {
return createStringError(errc::illegal_byte_sequence,
- "Section too small: cannot read header.");
-
- UnitLength = AS.getU32(Offset);
- Version = AS.getU16(Offset);
- Padding = AS.getU16(Offset);
- CompUnitCount = AS.getU32(Offset);
- LocalTypeUnitCount = AS.getU32(Offset);
- ForeignTypeUnitCount = AS.getU32(Offset);
- BucketCount = AS.getU32(Offset);
- NameCount = AS.getU32(Offset);
- AbbrevTableSize = AS.getU32(Offset);
- AugmentationStringSize = alignTo(AS.getU32(Offset), 4);
-
- if (!AS.isValidOffsetForDataOfSize(*Offset, AugmentationStringSize))
- return createStringError(
- errc::illegal_byte_sequence,
- "Section too small: cannot read header augmentation.");
+ "parsing .debug_names header at 0x%" PRIx64 ": %s",
+ Offset, toString(std::move(E)).c_str());
+ };
+
+ DataExtractor::Cursor C(*Offset);
+ std::tie(UnitLength, Format) = AS.getInitialLength(C);
+
+ Version = AS.getU16(C);
+ AS.skip(C, 2); // padding
+ CompUnitCount = AS.getU32(C);
+ LocalTypeUnitCount = AS.getU32(C);
+ ForeignTypeUnitCount = AS.getU32(C);
+ BucketCount = AS.getU32(C);
+ NameCount = AS.getU32(C);
+ AbbrevTableSize = AS.getU32(C);
+ AugmentationStringSize = alignTo(AS.getU32(C), 4);
+
+ if (!C)
+ return HeaderError(C.takeError());
+
+ if (!AS.isValidOffsetForDataOfSize(C.tell(), AugmentationStringSize))
+ return HeaderError(createStringError(errc::illegal_byte_sequence,
+ "cannot read header augmentation"));
AugmentationString.resize(AugmentationStringSize);
- AS.getU8(Offset, reinterpret_cast<uint8_t *>(AugmentationString.data()),
+ AS.getU8(C, reinterpret_cast<uint8_t *>(AugmentationString.data()),
AugmentationStringSize);
- return Error::success();
+ *Offset = C.tell();
+ return C.takeError();
}
void DWARFDebugNames::Abbrev::dump(ScopedPrinter &W) const {
@@ -486,9 +492,10 @@ Error DWARFDebugNames::NameIndex::extract() {
if (Error E = Hdr.extract(AS, &Offset))
return E;
+ const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
CUsBase = Offset;
- Offset += Hdr.CompUnitCount * 4;
- Offset += Hdr.LocalTypeUnitCount * 4;
+ Offset += Hdr.CompUnitCount * SectionOffsetSize;
+ Offset += Hdr.LocalTypeUnitCount * SectionOffsetSize;
Offset += Hdr.ForeignTypeUnitCount * 8;
BucketsBase = Offset;
Offset += Hdr.BucketCount * 4;
@@ -496,9 +503,9 @@ Error DWARFDebugNames::NameIndex::extract() {
if (Hdr.BucketCount > 0)
Offset += Hdr.NameCount * 4;
StringOffsetsBase = Offset;
- Offset += Hdr.NameCount * 4;
+ Offset += Hdr.NameCount * SectionOffsetSize;
EntryOffsetsBase = Offset;
- Offset += Hdr.NameCount * 4;
+ Offset += Hdr.NameCount * SectionOffsetSize;
if (!AS.isValidOffsetForDataOfSize(Offset, Hdr.AbbrevTableSize))
return createStringError(errc::illegal_byte_sequence,
@@ -579,20 +586,24 @@ std::error_code DWARFDebugNames::SentinelError::convertToErrorCode() const {
uint64_t DWARFDebugNames::NameIndex::getCUOffset(uint32_t CU) const {
assert(CU < Hdr.CompUnitCount);
- uint64_t Offset = CUsBase + 4 * CU;
- return Section.AccelSection.getRelocatedValue(4, &Offset);
+ const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
+ uint64_t Offset = CUsBase + SectionOffsetSize * CU;
+ return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset);
}
uint64_t DWARFDebugNames::NameIndex::getLocalTUOffset(uint32_t TU) const {
assert(TU < Hdr.LocalTypeUnitCount);
- uint64_t Offset = CUsBase + 4 * (Hdr.CompUnitCount + TU);
- return Section.AccelSection.getRelocatedValue(4, &Offset);
+ const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
+ uint64_t Offset = CUsBase + SectionOffsetSize * (Hdr.CompUnitCount + TU);
+ return Section.AccelSection.getRelocatedValue(SectionOffsetSize, &Offset);
}
uint64_t DWARFDebugNames::NameIndex::getForeignTUSignature(uint32_t TU) const {
assert(TU < Hdr.ForeignTypeUnitCount);
+ const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
uint64_t Offset =
- CUsBase + 4 * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU;
+ CUsBase +
+ SectionOffsetSize * (Hdr.CompUnitCount + Hdr.LocalTypeUnitCount) + 8 * TU;
return Section.AccelSection.getU64(&Offset);
}
@@ -613,7 +624,7 @@ DWARFDebugNames::NameIndex::getEntry(uint64_t *Offset) const {
Entry E(*this, *AbbrevIt);
- dwarf::FormParams FormParams = {Hdr.Version, 0, dwarf::DwarfFormat::DWARF32};
+ dwarf::FormParams FormParams = {Hdr.Version, 0, Hdr.Format};
for (auto &Value : E.Values) {
if (!Value.extractValue(AS, Offset, FormParams))
return createStringError(errc::io_error,
@@ -625,12 +636,16 @@ DWARFDebugNames::NameIndex::getEntry(uint64_t *Offset) const {
DWARFDebugNames::NameTableEntry
DWARFDebugNames::NameIndex::getNameTableEntry(uint32_t Index) const {
assert(0 < Index && Index <= Hdr.NameCount);
- uint64_t StringOffsetOffset = StringOffsetsBase + 4 * (Index - 1);
- uint64_t EntryOffsetOffset = EntryOffsetsBase + 4 * (Index - 1);
+ const unsigned SectionOffsetSize = dwarf::getDwarfOffsetByteSize(Hdr.Format);
+ uint64_t StringOffsetOffset =
+ StringOffsetsBase + SectionOffsetSize * (Index - 1);
+ uint64_t EntryOffsetOffset =
+ EntryOffsetsBase + SectionOffsetSize * (Index - 1);
const DWARFDataExtractor &AS = Section.AccelSection;
- uint64_t StringOffset = AS.getRelocatedValue(4, &StringOffsetOffset);
- uint64_t EntryOffset = AS.getU32(&EntryOffsetOffset);
+ uint64_t StringOffset =
+ AS.getRelocatedValue(SectionOffsetSize, &StringOffsetOffset);
+ uint64_t EntryOffset = AS.getUnsigned(&EntryOffsetOffset, SectionOffsetSize);
EntryOffset += EntriesBase;
return {Section.StringSection, Index, StringOffset, EntryOffset};
}
@@ -859,13 +874,14 @@ void DWARFDebugNames::ValueIterator::next() {
DWARFDebugNames::ValueIterator::ValueIterator(const DWARFDebugNames &AccelTable,
StringRef Key)
- : CurrentIndex(AccelTable.NameIndices.begin()), IsLocal(false), Key(Key) {
+ : CurrentIndex(AccelTable.NameIndices.begin()), IsLocal(false),
+ Key(std::string(Key)) {
searchFromStartOfCurrentIndex();
}
DWARFDebugNames::ValueIterator::ValueIterator(
const DWARFDebugNames::NameIndex &NI, StringRef Key)
- : CurrentIndex(&NI), IsLocal(true), Key(Key) {
+ : CurrentIndex(&NI), IsLocal(true), Key(std::string(Key)) {
if (!findInCurrentIndex())
setEnd();
}
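The common thread in the .debug_names changes above is that per-entry offsets now scale with the unit's DWARF format (foreign TU signatures stay 8 bytes either way). A small illustrative restatement of that arithmetic, not LLVM code, for the string-offsets array, mirroring getNameTableEntry():

#include "llvm/BinaryFormat/Dwarf.h"
#include <cstdint>

// Byte position of the string-offset slot for the 1-based name Index:
// 4-byte slots in DWARF32, 8-byte slots in DWARF64.
static uint64_t stringOffsetSlot(uint64_t StringOffsetsBase, uint32_t Index,
                                 llvm::dwarf::DwarfFormat Format) {
  const unsigned SlotSize = llvm::dwarf::getDwarfOffsetByteSize(Format);
  return StringOffsetsBase + SlotSize * (Index - 1);
}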
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
index f59e49268288..9bd134105c9b 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFCompileUnit.cpp
@@ -15,16 +15,18 @@
using namespace llvm;
void DWARFCompileUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(getFormat());
OS << format("0x%08" PRIx64, getOffset()) << ": Compile Unit:"
- << " length = " << format("0x%08" PRIx64, getLength())
- << " version = " << format("0x%04x", getVersion());
+ << " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
+ << ", format = " << dwarf::FormatString(getFormat())
+ << ", version = " << format("0x%04x", getVersion());
if (getVersion() >= 5)
- OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
- OS << " abbr_offset = "
+ OS << ", unit_type = " << dwarf::UnitTypeString(getUnitType());
+ OS << ", abbr_offset = "
<< format("0x%04" PRIx64, getAbbreviations()->getOffset())
- << " addr_size = " << format("0x%02x", getAddressByteSize());
+ << ", addr_size = " << format("0x%02x", getAddressByteSize());
if (getVersion() >= 5 && getUnitType() != dwarf::DW_UT_compile)
- OS << " DWO_id = " << format("0x%016" PRIx64, *getDWOId());
+ OS << ", DWO_id = " << format("0x%016" PRIx64, *getDWOId());
OS << " (next unit at " << format("0x%08" PRIx64, getNextUnitOffset())
<< ")\n";
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
index aaa6d5250f23..bf6219497770 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFContext.cpp
@@ -45,7 +45,6 @@
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/TargetRegistry.h"
-#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
#include <cstdint>
@@ -66,8 +65,12 @@ using FileLineInfoKind = DILineInfoSpecifier::FileLineInfoKind;
using FunctionNameKind = DILineInfoSpecifier::FunctionNameKind;
DWARFContext::DWARFContext(std::unique_ptr<const DWARFObject> DObj,
- std::string DWPName)
- : DIContext(CK_DWARF), DWPName(std::move(DWPName)), DObj(std::move(DObj)) {}
+ std::string DWPName,
+ std::function<void(Error)> RecoverableErrorHandler,
+ std::function<void(Error)> WarningHandler)
+ : DIContext(CK_DWARF), DWPName(std::move(DWPName)),
+ RecoverableErrorHandler(RecoverableErrorHandler),
+ WarningHandler(WarningHandler), DObj(std::move(DObj)) {}
DWARFContext::~DWARFContext() = default;
@@ -130,10 +133,21 @@ collectContributionData(DWARFContext::unit_iterator_range Units) {
return Contributions;
}
-static void dumpDWARFv5StringOffsetsSection(
- raw_ostream &OS, StringRef SectionName, const DWARFObject &Obj,
- const DWARFSection &StringOffsetsSection, StringRef StringSection,
- DWARFContext::unit_iterator_range Units, bool LittleEndian) {
+// Dump a DWARF string offsets section. This may be a DWARF v5 formatted
+// string offsets section, where each compile or type unit contributes a
+// number of entries (string offsets), with each contribution preceded by
+// a header containing size and version number. Alternatively, it may be a
+// monolithic series of string offsets, as generated by the pre-DWARF v5
+// implementation of split DWARF; however, in that case we still need to
+// collect contributions of units because the size of the offsets (4 or 8
+// bytes) depends on the format of the referencing unit (DWARF32 or DWARF64).
+static void dumpStringOffsetsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
+ StringRef SectionName,
+ const DWARFObject &Obj,
+ const DWARFSection &StringOffsetsSection,
+ StringRef StringSection,
+ DWARFContext::unit_iterator_range Units,
+ bool LittleEndian) {
auto Contributions = collectContributionData(Units);
DWARFDataExtractor StrOffsetExt(Obj, StringOffsetsSection, LittleEndian, 0);
DataExtractor StrData(StringSection, LittleEndian, 0);
@@ -148,6 +162,7 @@ static void dumpDWARFv5StringOffsetsSection(
}
dwarf::DwarfFormat Format = Contribution->getFormat();
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(Format);
uint16_t Version = Contribution->getVersion();
uint64_t ContributionHeader = Contribution->Base;
// In DWARF v5 there is a contribution header that immediately precedes
@@ -159,10 +174,10 @@ static void dumpDWARFv5StringOffsetsSection(
// Detect overlapping contributions.
if (Offset > ContributionHeader) {
- WithColor::error()
- << "overlapping contributions to string offsets table in section ."
- << SectionName << ".\n";
- return;
+ DumpOpts.RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "overlapping contributions to string offsets table in section .%s.",
+ SectionName.data()));
}
// Report a gap in the table.
if (Offset < ContributionHeader) {
@@ -175,7 +190,7 @@ static void dumpDWARFv5StringOffsetsSection(
// version field and the padding, a total of 4 bytes). Add them back in
// for reporting.
OS << "Contribution size = " << (Contribution->Size + (Version < 5 ? 0 : 4))
- << ", Format = " << (Format == DWARF32 ? "DWARF32" : "DWARF64")
+ << ", Format = " << dwarf::FormatString(Format)
<< ", Version = " << Version << "\n";
Offset = Contribution->Base;
@@ -184,7 +199,7 @@ static void dumpDWARFv5StringOffsetsSection(
OS << format("0x%8.8" PRIx64 ": ", Offset);
uint64_t StringOffset =
StrOffsetExt.getRelocatedValue(EntrySize, &Offset);
- OS << format("%8.8" PRIx64 " ", StringOffset);
+ OS << format("%0*" PRIx64 " ", OffsetDumpWidth, StringOffset);
const char *S = StrData.getCStr(&StringOffset);
if (S)
OS << format("\"%s\"", S);
@@ -198,47 +213,6 @@ static void dumpDWARFv5StringOffsetsSection(
}
}
-// Dump a DWARF string offsets section. This may be a DWARF v5 formatted
-// string offsets section, where each compile or type unit contributes a
-// number of entries (string offsets), with each contribution preceded by
-// a header containing size and version number. Alternatively, it may be a
-// monolithic series of string offsets, as generated by the pre-DWARF v5
-// implementation of split DWARF.
-static void dumpStringOffsetsSection(raw_ostream &OS, StringRef SectionName,
- const DWARFObject &Obj,
- const DWARFSection &StringOffsetsSection,
- StringRef StringSection,
- DWARFContext::unit_iterator_range Units,
- bool LittleEndian, unsigned MaxVersion) {
- // If we have at least one (compile or type) unit with DWARF v5 or greater,
- // we assume that the section is formatted like a DWARF v5 string offsets
- // section.
- if (MaxVersion >= 5)
- dumpDWARFv5StringOffsetsSection(OS, SectionName, Obj, StringOffsetsSection,
- StringSection, Units, LittleEndian);
- else {
- DataExtractor strOffsetExt(StringOffsetsSection.Data, LittleEndian, 0);
- uint64_t offset = 0;
- uint64_t size = StringOffsetsSection.Data.size();
- // Ensure that size is a multiple of the size of an entry.
- if (size & ((uint64_t)(sizeof(uint32_t) - 1))) {
- OS << "error: size of ." << SectionName << " is not a multiple of "
- << sizeof(uint32_t) << ".\n";
- size &= -(uint64_t)sizeof(uint32_t);
- }
- DataExtractor StrData(StringSection, LittleEndian, 0);
- while (offset < size) {
- OS << format("0x%8.8" PRIx64 ": ", offset);
- uint64_t StringOffset = strOffsetExt.getU32(&offset);
- OS << format("%8.8" PRIx64 " ", StringOffset);
- const char *S = StrData.getCStr(&StringOffset);
- if (S)
- OS << format("\"%s\"", S);
- OS << "\n";
- }
- }
-}
-
// Dump the .debug_addr section.
static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData,
DIDumpOptions DumpOpts, uint16_t Version,
@@ -248,16 +222,17 @@ static void dumpAddrSection(raw_ostream &OS, DWARFDataExtractor &AddrData,
DWARFDebugAddrTable AddrTable;
uint64_t TableOffset = Offset;
if (Error Err = AddrTable.extract(AddrData, &Offset, Version, AddrSize,
- DWARFContext::dumpWarning)) {
- WithColor::error() << toString(std::move(Err)) << '\n';
+ DumpOpts.WarningHandler)) {
+ DumpOpts.RecoverableErrorHandler(std::move(Err));
// Keep going after an error, if we can, assuming that the length field
// could be read. If it couldn't, stop reading the section.
- if (!AddrTable.hasValidLength())
- break;
- Offset = TableOffset + AddrTable.getLength();
- } else {
- AddrTable.dump(OS, DumpOpts);
+ if (auto TableLength = AddrTable.getFullLength()) {
+ Offset = TableOffset + *TableLength;
+ continue;
+ }
+ break;
}
+ AddrTable.dump(OS, DumpOpts);
}
}
@@ -272,7 +247,7 @@ static void dumpRnglistsSection(
llvm::DWARFDebugRnglistTable Rnglists;
uint64_t TableOffset = Offset;
if (Error Err = Rnglists.extract(rnglistData, &Offset)) {
- WithColor::error() << toString(std::move(Err)) << '\n';
+ DumpOpts.RecoverableErrorHandler(std::move(Err));
uint64_t Length = Rnglists.length();
// Keep going after an error, if we can, assuming that the length field
// could be read. If it couldn't, stop reading the section.
@@ -285,6 +260,48 @@ static void dumpRnglistsSection(
}
}
+std::unique_ptr<DWARFDebugMacro>
+DWARFContext::parseMacroOrMacinfo(MacroSecType SectionType) {
+ auto Macro = std::make_unique<DWARFDebugMacro>();
+ auto ParseAndDump = [&](DWARFDataExtractor &Data, bool IsMacro) {
+ if (Error Err = IsMacro ? Macro->parseMacro(SectionType == MacroSection
+ ? compile_units()
+ : dwo_compile_units(),
+ SectionType == MacroSection
+ ? getStringExtractor()
+ : getStringDWOExtractor(),
+ Data)
+ : Macro->parseMacinfo(Data)) {
+ RecoverableErrorHandler(std::move(Err));
+ Macro = nullptr;
+ }
+ };
+ switch (SectionType) {
+ case MacinfoSection: {
+ DWARFDataExtractor Data(DObj->getMacinfoSection(), isLittleEndian(), 0);
+ ParseAndDump(Data, /*IsMacro=*/false);
+ break;
+ }
+ case MacinfoDwoSection: {
+ DWARFDataExtractor Data(DObj->getMacinfoDWOSection(), isLittleEndian(), 0);
+ ParseAndDump(Data, /*IsMacro=*/false);
+ break;
+ }
+ case MacroSection: {
+ DWARFDataExtractor Data(*DObj, DObj->getMacroSection(), isLittleEndian(),
+ 0);
+ ParseAndDump(Data, /*IsMacro=*/true);
+ break;
+ }
+ case MacroDwoSection: {
+ DWARFDataExtractor Data(DObj->getMacroDWOSection(), isLittleEndian(), 0);
+ ParseAndDump(Data, /*IsMacro=*/true);
+ break;
+ }
+ }
+ return Macro;
+}
+
static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
DWARFDataExtractor Data,
const MCRegisterInfo *MRI,
@@ -295,7 +312,7 @@ static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
while (Data.isValidOffset(Offset)) {
DWARFListTableHeader Header(".debug_loclists", "locations");
if (Error E = Header.extract(Data, &Offset)) {
- WithColor::error() << toString(std::move(E)) << '\n';
+ DumpOpts.RecoverableErrorHandler(std::move(E));
return;
}
@@ -319,10 +336,16 @@ static void dumpLoclistsSection(raw_ostream &OS, DIDumpOptions DumpOpts,
}
}
+static void dumpPubTableSection(raw_ostream &OS, DIDumpOptions DumpOpts,
+ DWARFDataExtractor Data, bool GnuStyle) {
+ DWARFDebugPubTable Table;
+ Table.extract(Data, GnuStyle, DumpOpts.RecoverableErrorHandler);
+ Table.dump(OS);
+}
+
void DWARFContext::dump(
raw_ostream &OS, DIDumpOptions DumpOpts,
std::array<Optional<uint64_t>, DIDT_ID_Count> DumpOffsets) {
-
uint64_t DumpType = DumpOpts.DumpType;
StringRef Extension = sys::path::extension(DObj->getFileName());
@@ -430,31 +453,61 @@ void DWARFContext::dump(
}
}
- if (const auto *Off = shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
- DObj->getFrameSection().Data))
- getDebugFrame()->dump(OS, getRegisterInfo(), *Off);
+ if (const Optional<uint64_t> *Off =
+ shouldDump(Explicit, ".debug_frame", DIDT_ID_DebugFrame,
+ DObj->getFrameSection().Data)) {
+ if (Expected<const DWARFDebugFrame *> DF = getDebugFrame())
+ (*DF)->dump(OS, getRegisterInfo(), *Off);
+ else
+ RecoverableErrorHandler(DF.takeError());
+ }
- if (const auto *Off = shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame,
- DObj->getEHFrameSection().Data))
- getEHFrame()->dump(OS, getRegisterInfo(), *Off);
+ if (const Optional<uint64_t> *Off =
+ shouldDump(Explicit, ".eh_frame", DIDT_ID_DebugFrame,
+ DObj->getEHFrameSection().Data)) {
+ if (Expected<const DWARFDebugFrame *> DF = getEHFrame())
+ (*DF)->dump(OS, getRegisterInfo(), *Off);
+ else
+ RecoverableErrorHandler(DF.takeError());
+ }
- if (DumpType & DIDT_DebugMacro) {
- if (Explicit || !getDebugMacro()->empty()) {
- OS << "\n.debug_macinfo contents:\n";
- getDebugMacro()->dump(OS);
- } else if (ExplicitDWO || !getDebugMacroDWO()->empty()) {
- OS << "\n.debug_macinfo.dwo contents:\n";
- getDebugMacroDWO()->dump(OS);
- }
+ if (shouldDump(Explicit, ".debug_macro", DIDT_ID_DebugMacro,
+ DObj->getMacroSection().Data)) {
+ if (auto Macro = getDebugMacro())
+ Macro->dump(OS);
+ }
+
+ if (shouldDump(Explicit, ".debug_macro.dwo", DIDT_ID_DebugMacro,
+ DObj->getMacroDWOSection())) {
+ if (auto MacroDWO = getDebugMacroDWO())
+ MacroDWO->dump(OS);
+ }
+
+ if (shouldDump(Explicit, ".debug_macinfo", DIDT_ID_DebugMacro,
+ DObj->getMacinfoSection())) {
+ if (auto Macinfo = getDebugMacinfo())
+ Macinfo->dump(OS);
+ }
+
+ if (shouldDump(Explicit, ".debug_macinfo.dwo", DIDT_ID_DebugMacro,
+ DObj->getMacinfoDWOSection())) {
+ if (auto MacinfoDWO = getDebugMacinfoDWO())
+ MacinfoDWO->dump(OS);
}
if (shouldDump(Explicit, ".debug_aranges", DIDT_ID_DebugAranges,
DObj->getArangesSection())) {
uint64_t offset = 0;
- DataExtractor arangesData(DObj->getArangesSection(), isLittleEndian(), 0);
+ DWARFDataExtractor arangesData(DObj->getArangesSection(), isLittleEndian(),
+ 0);
DWARFDebugArangeSet set;
- while (set.extract(arangesData, &offset))
+ while (arangesData.isValidOffset(offset)) {
+ if (Error E = set.extract(arangesData, &offset)) {
+ RecoverableErrorHandler(std::move(E));
+ break;
+ }
set.dump(OS);
+ }
}
auto DumpLineSection = [&](DWARFDebugLine::SectionParser Parser,
@@ -462,18 +515,13 @@ void DWARFContext::dump(
Optional<uint64_t> DumpOffset) {
while (!Parser.done()) {
if (DumpOffset && Parser.getOffset() != *DumpOffset) {
- Parser.skip(dumpWarning);
+ Parser.skip(DumpOpts.WarningHandler, DumpOpts.WarningHandler);
continue;
}
OS << "debug_line[" << format("0x%8.8" PRIx64, Parser.getOffset())
<< "]\n";
- if (DumpOpts.Verbose) {
- Parser.parseNext(dumpWarning, dumpWarning, &OS);
- } else {
- DWARFDebugLine::LineTable LineTable =
- Parser.parseNext(dumpWarning, dumpWarning);
- LineTable.dump(OS, DumpOpts);
- }
+ Parser.parseNext(DumpOpts.WarningHandler, DumpOpts.WarningHandler, &OS,
+ DumpOpts.Verbose);
}
};
@@ -555,7 +603,7 @@ void DWARFContext::dump(
DWARFDebugRangeList rangeList;
while (rangesData.isValidOffset(offset)) {
if (Error E = rangeList.extract(rangesData, &offset)) {
- WithColor::error() << toString(std::move(E)) << '\n';
+ DumpOpts.RecoverableErrorHandler(std::move(E));
break;
}
rangeList.dump(OS);
@@ -585,39 +633,44 @@ void DWARFContext::dump(
}
if (shouldDump(Explicit, ".debug_pubnames", DIDT_ID_DebugPubnames,
- DObj->getPubnamesSection().Data))
- DWARFDebugPubTable(*DObj, DObj->getPubnamesSection(), isLittleEndian(), false)
- .dump(OS);
+ DObj->getPubnamesSection().Data)) {
+ DWARFDataExtractor PubTableData(*DObj, DObj->getPubnamesSection(),
+ isLittleEndian(), 0);
+ dumpPubTableSection(OS, DumpOpts, PubTableData, /*GnuStyle=*/false);
+ }
if (shouldDump(Explicit, ".debug_pubtypes", DIDT_ID_DebugPubtypes,
- DObj->getPubtypesSection().Data))
- DWARFDebugPubTable(*DObj, DObj->getPubtypesSection(), isLittleEndian(), false)
- .dump(OS);
+ DObj->getPubtypesSection().Data)) {
+ DWARFDataExtractor PubTableData(*DObj, DObj->getPubtypesSection(),
+ isLittleEndian(), 0);
+ dumpPubTableSection(OS, DumpOpts, PubTableData, /*GnuStyle=*/false);
+ }
if (shouldDump(Explicit, ".debug_gnu_pubnames", DIDT_ID_DebugGnuPubnames,
- DObj->getGnuPubnamesSection().Data))
- DWARFDebugPubTable(*DObj, DObj->getGnuPubnamesSection(), isLittleEndian(),
- true /* GnuStyle */)
- .dump(OS);
+ DObj->getGnuPubnamesSection().Data)) {
+ DWARFDataExtractor PubTableData(*DObj, DObj->getGnuPubnamesSection(),
+ isLittleEndian(), 0);
+ dumpPubTableSection(OS, DumpOpts, PubTableData, /*GnuStyle=*/true);
+ }
if (shouldDump(Explicit, ".debug_gnu_pubtypes", DIDT_ID_DebugGnuPubtypes,
- DObj->getGnuPubtypesSection().Data))
- DWARFDebugPubTable(*DObj, DObj->getGnuPubtypesSection(), isLittleEndian(),
- true /* GnuStyle */)
- .dump(OS);
+ DObj->getGnuPubtypesSection().Data)) {
+ DWARFDataExtractor PubTableData(*DObj, DObj->getGnuPubtypesSection(),
+ isLittleEndian(), 0);
+ dumpPubTableSection(OS, DumpOpts, PubTableData, /*GnuStyle=*/true);
+ }
if (shouldDump(Explicit, ".debug_str_offsets", DIDT_ID_DebugStrOffsets,
DObj->getStrOffsetsSection().Data))
- dumpStringOffsetsSection(OS, "debug_str_offsets", *DObj,
- DObj->getStrOffsetsSection(),
- DObj->getStrSection(), normal_units(),
- isLittleEndian(), getMaxVersion());
+ dumpStringOffsetsSection(
+ OS, DumpOpts, "debug_str_offsets", *DObj, DObj->getStrOffsetsSection(),
+ DObj->getStrSection(), normal_units(), isLittleEndian());
if (shouldDump(ExplicitDWO, ".debug_str_offsets.dwo", DIDT_ID_DebugStrOffsets,
DObj->getStrOffsetsDWOSection().Data))
- dumpStringOffsetsSection(OS, "debug_str_offsets.dwo", *DObj,
+ dumpStringOffsetsSection(OS, DumpOpts, "debug_str_offsets.dwo", *DObj,
DObj->getStrOffsetsDWOSection(),
DObj->getStrDWOSection(), dwo_units(),
- isLittleEndian(), getMaxDWOVersion());
+ isLittleEndian());
if (shouldDump(Explicit, ".gdb_index", DIDT_ID_GdbIndex,
DObj->getGdbIndexSection())) {
@@ -711,7 +764,7 @@ const DWARFUnitIndex &DWARFContext::getTUIndex() {
DataExtractor TUIndexData(DObj->getTUIndexSection(), isLittleEndian(), 0);
- TUIndex = std::make_unique<DWARFUnitIndex>(DW_SECT_TYPES);
+ TUIndex = std::make_unique<DWARFUnitIndex>(DW_SECT_EXT_TYPES);
TUIndex->parse(TUIndexData);
return *TUIndex;
}
@@ -770,7 +823,7 @@ const DWARFDebugAranges *DWARFContext::getDebugAranges() {
return Aranges.get();
}
-const DWARFDebugFrame *DWARFContext::getDebugFrame() {
+Expected<const DWARFDebugFrame *> DWARFContext::getDebugFrame() {
if (DebugFrame)
return DebugFrame.get();
@@ -785,41 +838,50 @@ const DWARFDebugFrame *DWARFContext::getDebugFrame() {
// http://lists.dwarfstd.org/htdig.cgi/dwarf-discuss-dwarfstd.org/2011-December/001173.html
DWARFDataExtractor debugFrameData(*DObj, DObj->getFrameSection(),
isLittleEndian(), DObj->getAddressSize());
- DebugFrame.reset(new DWARFDebugFrame(getArch(), false /* IsEH */));
- DebugFrame->parse(debugFrameData);
+ auto DF = std::make_unique<DWARFDebugFrame>(getArch(), /*IsEH=*/false);
+ if (Error E = DF->parse(debugFrameData))
+ return std::move(E);
+
+ DebugFrame.swap(DF);
return DebugFrame.get();
}
-const DWARFDebugFrame *DWARFContext::getEHFrame() {
+Expected<const DWARFDebugFrame *> DWARFContext::getEHFrame() {
if (EHFrame)
return EHFrame.get();
DWARFDataExtractor debugFrameData(*DObj, DObj->getEHFrameSection(),
isLittleEndian(), DObj->getAddressSize());
- DebugFrame.reset(new DWARFDebugFrame(getArch(), true /* IsEH */));
- DebugFrame->parse(debugFrameData);
+
+ auto DF = std::make_unique<DWARFDebugFrame>(getArch(), /*IsEH=*/true);
+ if (Error E = DF->parse(debugFrameData))
+ return std::move(E);
+ DebugFrame.swap(DF);
return DebugFrame.get();
}
-const DWARFDebugMacro *DWARFContext::getDebugMacroDWO() {
- if (MacroDWO)
- return MacroDWO.get();
+const DWARFDebugMacro *DWARFContext::getDebugMacro() {
+ if (!Macro)
+ Macro = parseMacroOrMacinfo(MacroSection);
+ return Macro.get();
+}
- DataExtractor MacinfoDWOData(DObj->getMacinfoDWOSection(), isLittleEndian(),
- 0);
- MacroDWO.reset(new DWARFDebugMacro());
- MacroDWO->parse(MacinfoDWOData);
+const DWARFDebugMacro *DWARFContext::getDebugMacroDWO() {
+ if (!MacroDWO)
+ MacroDWO = parseMacroOrMacinfo(MacroDwoSection);
return MacroDWO.get();
}
-const DWARFDebugMacro *DWARFContext::getDebugMacro() {
- if (Macro)
- return Macro.get();
+const DWARFDebugMacro *DWARFContext::getDebugMacinfo() {
+ if (!Macinfo)
+ Macinfo = parseMacroOrMacinfo(MacinfoSection);
+ return Macinfo.get();
+}
- DataExtractor MacinfoData(DObj->getMacinfoSection(), isLittleEndian(), 0);
- Macro.reset(new DWARFDebugMacro());
- Macro->parse(MacinfoData);
- return Macro.get();
+const DWARFDebugMacro *DWARFContext::getDebugMacinfoDWO() {
+ if (!MacinfoDWO)
+ MacinfoDWO = parseMacroOrMacinfo(MacinfoDwoSection);
+ return MacinfoDWO.get();
}
template <typename T>
@@ -865,16 +927,16 @@ const AppleAcceleratorTable &DWARFContext::getAppleObjC() {
const DWARFDebugLine::LineTable *
DWARFContext::getLineTableForUnit(DWARFUnit *U) {
Expected<const DWARFDebugLine::LineTable *> ExpectedLineTable =
- getLineTableForUnit(U, dumpWarning);
+ getLineTableForUnit(U, WarningHandler);
if (!ExpectedLineTable) {
- dumpWarning(ExpectedLineTable.takeError());
+ WarningHandler(ExpectedLineTable.takeError());
return nullptr;
}
return *ExpectedLineTable;
}
Expected<const DWARFDebugLine::LineTable *> DWARFContext::getLineTableForUnit(
- DWARFUnit *U, function_ref<void(Error)> RecoverableErrorCallback) {
+ DWARFUnit *U, function_ref<void(Error)> RecoverableErrorHandler) {
if (!Line)
Line.reset(new DWARFDebugLine);
@@ -899,7 +961,7 @@ Expected<const DWARFDebugLine::LineTable *> DWARFContext::getLineTableForUnit(
DWARFDataExtractor lineData(*DObj, U->getLineSection(), isLittleEndian(),
U->getAddressByteSize());
return Line->getOrParseLineTable(lineData, stmtOffset, *this, U,
- RecoverableErrorCallback);
+ RecoverableErrorHandler);
}
void DWARFContext::parseNormalUnits() {
@@ -910,7 +972,7 @@ void DWARFContext::parseNormalUnits() {
});
NormalUnits.finishedInfoUnits();
DObj->forEachTypesSections([&](const DWARFSection &S) {
- NormalUnits.addUnitsForSection(*this, S, DW_SECT_TYPES);
+ NormalUnits.addUnitsForSection(*this, S, DW_SECT_EXT_TYPES);
});
}
@@ -922,7 +984,7 @@ void DWARFContext::parseDWOUnits(bool Lazy) {
});
DWOUnits.finishedInfoUnits();
DObj->forEachTypesDWOSections([&](const DWARFSection &S) {
- DWOUnits.addUnitsForDWOSection(*this, S, DW_SECT_TYPES, Lazy);
+ DWOUnits.addUnitsForDWOSection(*this, S, DW_SECT_EXT_TYPES, Lazy);
});
}
@@ -1418,11 +1480,6 @@ static bool isRelocScattered(const object::ObjectFile &Obj,
return MachObj->isRelocationScattered(RelocInfo);
}
-ErrorPolicy DWARFContext::defaultErrorHandler(Error E) {
- WithColor::error() << toString(std::move(E)) << '\n';
- return ErrorPolicy::Continue;
-}
-
namespace {
struct DWARFSectionMap final : public DWARFSection {
RelocAddrMap Relocs;
@@ -1467,6 +1524,7 @@ class DWARFObjInMemory final : public DWARFObject {
DWARFSectionMap PubtypesSection;
DWARFSectionMap GnuPubnamesSection;
DWARFSectionMap GnuPubtypesSection;
+ DWARFSectionMap MacroSection;
DWARFSectionMap *mapNameToDWARFSection(StringRef Name) {
return StringSwitch<DWARFSectionMap *>(Name)
@@ -1494,6 +1552,7 @@ class DWARFObjInMemory final : public DWARFObject {
.Case("apple_namespaces", &AppleNamespacesSection)
.Case("apple_namespac", &AppleNamespacesSection)
.Case("apple_objc", &AppleObjCSection)
+ .Case("debug_macro", &MacroSection)
.Default(nullptr);
}
@@ -1502,6 +1561,7 @@ class DWARFObjInMemory final : public DWARFObject {
StringRef StrSection;
StringRef MacinfoSection;
StringRef MacinfoDWOSection;
+ StringRef MacroDWOSection;
StringRef AbbrevDWOSection;
StringRef StrDWOSection;
StringRef CUIndexSection;
@@ -1522,6 +1582,7 @@ class DWARFObjInMemory final : public DWARFObject {
.Case("debug_str", &StrSection)
.Case("debug_macinfo", &MacinfoSection)
.Case("debug_macinfo.dwo", &MacinfoDWOSection)
+ .Case("debug_macro.dwo", &MacroDWOSection)
.Case("debug_abbrev.dwo", &AbbrevDWOSection)
.Case("debug_str.dwo", &StrDWOSection)
.Case("debug_cu_index", &CUIndexSection)
@@ -1574,7 +1635,7 @@ public:
}
}
DWARFObjInMemory(const object::ObjectFile &Obj, const LoadedObjectInfo *L,
- function_ref<ErrorPolicy(Error)> HandleError)
+ function_ref<void(Error)> HandleError, function_ref<void(Error)> HandleWarning )
: IsLittleEndian(Obj.isLittleEndian()),
AddressSize(Obj.getBytesInAddress()), FileName(Obj.getFileName()),
Obj(&Obj) {
@@ -1601,10 +1662,8 @@ public:
StringRef Data;
Expected<section_iterator> SecOrErr = Section.getRelocatedSection();
if (!SecOrErr) {
- ErrorPolicy EP = HandleError(createError(
- "failed to get relocated section: ", SecOrErr.takeError()));
- if (EP == ErrorPolicy::Halt)
- return;
+ HandleError(createError("failed to get relocated section: ",
+ SecOrErr.takeError()));
continue;
}
@@ -1622,10 +1681,8 @@ public:
}
if (auto Err = maybeDecompress(Section, Name, Data)) {
- ErrorPolicy EP = HandleError(createError(
- "failed to decompress '" + Name + "', ", std::move(Err)));
- if (EP == ErrorPolicy::Halt)
- return;
+ HandleError(createError("failed to decompress '" + Name + "', ",
+ std::move(Err)));
continue;
}
@@ -1726,8 +1783,7 @@ public:
Expected<SymInfo> SymInfoOrErr =
getSymbolInfo(Obj, Reloc, L, AddrCache);
if (!SymInfoOrErr) {
- if (HandleError(SymInfoOrErr.takeError()) == ErrorPolicy::Halt)
- return;
+ HandleError(SymInfoOrErr.takeError());
continue;
}
@@ -1747,10 +1803,8 @@ public:
if (!I.second) {
RelocAddrEntry &entry = I.first->getSecond();
if (entry.Reloc2) {
- ErrorPolicy EP = HandleError(createError(
+ HandleError(createError(
"At most two relocations per offset are supported"));
- if (EP == ErrorPolicy::Halt)
- return;
}
entry.Reloc2 = Reloc;
entry.SymbolValue2 = SymInfoOrErr->Address;
@@ -1758,11 +1812,10 @@ public:
} else {
SmallString<32> Type;
Reloc.getTypeName(Type);
- ErrorPolicy EP = HandleError(
+ // FIXME: Support more relocations & change this to an error
+ HandleWarning(
createError("failed to compute relocation: " + Type + ", ",
errorCodeToError(object_error::parse_failed)));
- if (EP == ErrorPolicy::Halt)
- return;
}
}
}
@@ -1847,6 +1900,8 @@ public:
const DWARFSection &getRnglistsSection() const override {
return RnglistsSection;
}
+ const DWARFSection &getMacroSection() const override { return MacroSection; }
+ StringRef getMacroDWOSection() const override { return MacroDWOSection; }
StringRef getMacinfoSection() const override { return MacinfoSection; }
StringRef getMacinfoDWOSection() const override { return MacinfoDWOSection; }
const DWARFSection &getPubnamesSection() const override { return PubnamesSection; }
@@ -1890,18 +1945,25 @@ public:
std::unique_ptr<DWARFContext>
DWARFContext::create(const object::ObjectFile &Obj, const LoadedObjectInfo *L,
- function_ref<ErrorPolicy(Error)> HandleError,
- std::string DWPName) {
- auto DObj = std::make_unique<DWARFObjInMemory>(Obj, L, HandleError);
- return std::make_unique<DWARFContext>(std::move(DObj), std::move(DWPName));
+ std::string DWPName,
+ std::function<void(Error)> RecoverableErrorHandler,
+ std::function<void(Error)> WarningHandler) {
+ auto DObj =
+ std::make_unique<DWARFObjInMemory>(Obj, L, RecoverableErrorHandler, WarningHandler);
+ return std::make_unique<DWARFContext>(std::move(DObj), std::move(DWPName),
+ RecoverableErrorHandler,
+ WarningHandler);
}
std::unique_ptr<DWARFContext>
DWARFContext::create(const StringMap<std::unique_ptr<MemoryBuffer>> &Sections,
- uint8_t AddrSize, bool isLittleEndian) {
+ uint8_t AddrSize, bool isLittleEndian,
+ std::function<void(Error)> RecoverableErrorHandler,
+ std::function<void(Error)> WarningHandler) {
auto DObj =
std::make_unique<DWARFObjInMemory>(Sections, AddrSize, isLittleEndian);
- return std::make_unique<DWARFContext>(std::move(DObj), "");
+ return std::make_unique<DWARFContext>(
+ std::move(DObj), "", RecoverableErrorHandler, WarningHandler);
}
Error DWARFContext::loadRegisterInfo(const object::ObjectFile &Obj) {
@@ -1924,19 +1986,9 @@ Error DWARFContext::loadRegisterInfo(const object::ObjectFile &Obj) {
uint8_t DWARFContext::getCUAddrSize() {
// In theory, different compile units may have different address byte
// sizes, but for simplicity we just use the address byte size of the
- // last compile unit. In practice the address size field is repeated across
+ // first compile unit. In practice the address size field is repeated across
// various DWARF headers (at least in version 5) to make it easier to dump
// them independently, not to enable varying the address size.
- uint8_t Addr = 0;
- for (const auto &CU : compile_units()) {
- Addr = CU->getAddressByteSize();
- break;
- }
- return Addr;
-}
-
-void DWARFContext::dumpWarning(Error Warning) {
- handleAllErrors(std::move(Warning), [](ErrorInfoBase &Info) {
- WithColor::warning() << Info.message() << '\n';
- });
+ unit_iterator_range CUs = compile_units();
+ return CUs.empty() ? 0 : (*CUs.begin())->getAddressByteSize();
}
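A minimal sketch of constructing a context with the handler-based create() overload introduced above, in place of the removed ErrorPolicy callback; the WithColor-based handlers echo the old default behaviour and are an assumption, not part of the patch.

#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/WithColor.h"
using namespace llvm;

static std::unique_ptr<DWARFContext>
makeContext(const object::ObjectFile &Obj) {
  auto ReportError = [](Error E) {
    WithColor::error() << toString(std::move(E)) << '\n';
  };
  auto ReportWarning = [](Error W) {
    WithColor::warning() << toString(std::move(W)) << '\n';
  };
  return DWARFContext::create(Obj, /*L=*/nullptr, /*DWPName=*/"",
                              ReportError, ReportWarning);
}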
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
index 53e676bc7031..886fe1dff976 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDataExtractor.cpp
@@ -7,11 +7,42 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
-#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
using namespace llvm;
+std::pair<uint64_t, dwarf::DwarfFormat>
+DWARFDataExtractor::getInitialLength(uint64_t *Off, Error *Err) const {
+ ErrorAsOutParameter ErrAsOut(Err);
+ if (Err && *Err)
+ return {0, dwarf::DWARF32};
+
+ Cursor C(*Off);
+ uint64_t Length = getRelocatedValue(C, 4);
+ dwarf::DwarfFormat Format = dwarf::DWARF32;
+ if (Length == dwarf::DW_LENGTH_DWARF64) {
+ Length = getRelocatedValue(C, 8);
+ Format = dwarf::DWARF64;
+ } else if (Length >= dwarf::DW_LENGTH_lo_reserved) {
+ cantFail(C.takeError());
+ if (Err)
+ *Err = createStringError(
+ errc::invalid_argument,
+ "unsupported reserved unit length of value 0x%8.8" PRIx64, Length);
+ return {0, dwarf::DWARF32};
+ }
+
+ if (C) {
+ *Off = C.tell();
+ return {Length, Format};
+ }
+ if (Err)
+ *Err = C.takeError();
+ else
+ consumeError(C.takeError());
+ return {0, dwarf::DWARF32};
+}
+
uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint64_t *Off,
uint64_t *SecNdx,
Error *Err) const {
@@ -19,9 +50,11 @@ uint64_t DWARFDataExtractor::getRelocatedValue(uint32_t Size, uint64_t *Off,
*SecNdx = object::SectionedAddress::UndefSection;
if (!Section)
return getUnsigned(Off, Size, Err);
+
+ ErrorAsOutParameter ErrAsOut(Err);
Optional<RelocAddrEntry> E = Obj->find(*Section, *Off);
uint64_t A = getUnsigned(Off, Size, Err);
- if (!E)
+ if (!E || (Err && *Err))
return A;
if (SecNdx)
*SecNdx = E->SectionIndex;
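A hedged usage sketch for the new getInitialLength() helper above: it consumes a DWARF initial-length field, reports DWARF32 vs DWARF64, and funnels reserved-length failures through the optional Error out-parameter. The wrapper and its Expected return type are illustrative.

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/Support/Error.h"
using namespace llvm;

static Expected<std::pair<uint64_t, dwarf::DwarfFormat>>
readInitialLength(const DWARFDataExtractor &Data, uint64_t &Offset) {
  Error Err = Error::success();
  auto LengthAndFormat = Data.getInitialLength(&Offset, &Err);
  if (Err)
    return std::move(Err);
  return LengthAndFormat; // {unit length, detected format}
}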
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
index f71543799e28..dcf2aefeb39f 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAddr.cpp
@@ -12,144 +12,144 @@
using namespace llvm;
-void DWARFDebugAddrTable::clear() {
- HeaderData = {};
+Error DWARFDebugAddrTable::extractAddresses(const DWARFDataExtractor &Data,
+ uint64_t *OffsetPtr,
+ uint64_t EndOffset) {
+ assert(EndOffset >= *OffsetPtr);
+ uint64_t DataSize = EndOffset - *OffsetPtr;
+ assert(Data.isValidOffsetForDataOfSize(*OffsetPtr, DataSize));
+ if (AddrSize != 4 && AddrSize != 8)
+ return createStringError(errc::not_supported,
+ "address table at offset 0x%" PRIx64
+ " has unsupported address size %" PRIu8
+ " (4 and 8 are supported)",
+ Offset, AddrSize);
+ if (DataSize % AddrSize != 0) {
+ invalidateLength();
+ return createStringError(errc::invalid_argument,
+ "address table at offset 0x%" PRIx64
+ " contains data of size 0x%" PRIx64
+ " which is not a multiple of addr size %" PRIu8,
+ Offset, DataSize, AddrSize);
+ }
Addrs.clear();
- invalidateLength();
+ size_t Count = DataSize / AddrSize;
+ Addrs.reserve(Count);
+ while (Count--)
+ Addrs.push_back(Data.getRelocatedValue(AddrSize, OffsetPtr));
+ return Error::success();
}
-Error DWARFDebugAddrTable::extract(DWARFDataExtractor Data,
- uint64_t *OffsetPtr,
- uint16_t Version,
- uint8_t AddrSize,
- std::function<void(Error)> WarnCallback) {
- clear();
- HeaderOffset = *OffsetPtr;
- // Read and verify the length field.
- if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t)))
+Error DWARFDebugAddrTable::extractV5(const DWARFDataExtractor &Data,
+ uint64_t *OffsetPtr, uint8_t CUAddrSize,
+ std::function<void(Error)> WarnCallback) {
+ Offset = *OffsetPtr;
+ llvm::Error Err = Error::success();
+ std::tie(Length, Format) = Data.getInitialLength(OffsetPtr, &Err);
+ if (Err) {
+ invalidateLength();
return createStringError(errc::invalid_argument,
- "section is not large enough to contain a "
- ".debug_addr table length at offset 0x%"
- PRIx64, *OffsetPtr);
- uint16_t UnitVersion;
- if (Version == 0) {
- WarnCallback(createStringError(errc::invalid_argument,
- "DWARF version is not defined in CU,"
- " assuming version 5"));
- UnitVersion = 5;
- } else {
- UnitVersion = Version;
+ "parsing address table at offset 0x%" PRIx64
+ ": %s",
+ Offset, toString(std::move(Err)).c_str());
+ }
+
+ if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, Length)) {
+ uint64_t DiagnosticLength = Length;
+ invalidateLength();
+ return createStringError(
+ errc::invalid_argument,
+ "section is not large enough to contain an address table "
+ "at offset 0x%" PRIx64 " with a unit_length value of 0x%" PRIx64,
+ Offset, DiagnosticLength);
}
- // TODO: Add support for DWARF64.
- Format = dwarf::DwarfFormat::DWARF32;
- if (UnitVersion >= 5) {
- HeaderData.Length = Data.getU32(OffsetPtr);
- if (HeaderData.Length == dwarf::DW_LENGTH_DWARF64) {
- invalidateLength();
- return createStringError(errc::not_supported,
- "DWARF64 is not supported in .debug_addr at offset 0x%" PRIx64,
- HeaderOffset);
- }
- if (HeaderData.Length + sizeof(uint32_t) < sizeof(Header)) {
- uint32_t TmpLength = getLength();
- invalidateLength();
- return createStringError(errc::invalid_argument,
- ".debug_addr table at offset 0x%" PRIx64
- " has too small length (0x%" PRIx32
- ") to contain a complete header",
- HeaderOffset, TmpLength);
- }
- uint64_t End = HeaderOffset + getLength();
- if (!Data.isValidOffsetForDataOfSize(HeaderOffset, End - HeaderOffset)) {
- uint32_t TmpLength = getLength();
- invalidateLength();
- return createStringError(errc::invalid_argument,
- "section is not large enough to contain a .debug_addr table "
- "of length 0x%" PRIx32 " at offset 0x%" PRIx64,
- TmpLength, HeaderOffset);
- }
-
- HeaderData.Version = Data.getU16(OffsetPtr);
- HeaderData.AddrSize = Data.getU8(OffsetPtr);
- HeaderData.SegSize = Data.getU8(OffsetPtr);
- DataSize = getDataSize();
- } else {
- HeaderData.Version = UnitVersion;
- HeaderData.AddrSize = AddrSize;
- // TODO: Support for non-zero SegSize.
- HeaderData.SegSize = 0;
- DataSize = Data.size();
+ uint64_t EndOffset = *OffsetPtr + Length;
+ // Ensure that we can read the remaining header fields.
+ if (Length < 4) {
+ uint64_t DiagnosticLength = Length;
+ invalidateLength();
+ return createStringError(
+ errc::invalid_argument,
+ "address table at offset 0x%" PRIx64
+ " has a unit_length value of 0x%" PRIx64
+ ", which is too small to contain a complete header",
+ Offset, DiagnosticLength);
}
- // Perform basic validation of the remaining header fields.
+ Version = Data.getU16(OffsetPtr);
+ AddrSize = Data.getU8(OffsetPtr);
+ SegSize = Data.getU8(OffsetPtr);
- // We support DWARF version 5 for now as well as pre-DWARF5
- // implementations of .debug_addr table, which doesn't contain a header
- // and consists only of a series of addresses.
- if (HeaderData.Version > 5) {
- return createStringError(errc::not_supported, "version %" PRIu16
- " of .debug_addr section at offset 0x%" PRIx64 " is not supported",
- HeaderData.Version, HeaderOffset);
- }
- // FIXME: For now we just treat version mismatch as an error,
- // however the correct way to associate a .debug_addr table
- // with a .debug_info table is to look at the DW_AT_addr_base
- // attribute in the info table.
- if (HeaderData.Version != UnitVersion)
- return createStringError(errc::invalid_argument,
- ".debug_addr table at offset 0x%" PRIx64
- " has version %" PRIu16
- " which is different from the version suggested"
- " by the DWARF unit header: %" PRIu16,
- HeaderOffset, HeaderData.Version, UnitVersion);
- if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
+ // Perform a basic validation of the header fields.
+ if (Version != 5)
return createStringError(errc::not_supported,
- ".debug_addr table at offset 0x%" PRIx64
- " has unsupported address size %" PRIu8,
- HeaderOffset, HeaderData.AddrSize);
- if (HeaderData.AddrSize != AddrSize && AddrSize != 0)
- return createStringError(errc::invalid_argument,
- ".debug_addr table at offset 0x%" PRIx64
- " has address size %" PRIu8
- " which is different from CU address size %" PRIu8,
- HeaderOffset, HeaderData.AddrSize, AddrSize);
-
+ "address table at offset 0x%" PRIx64
+ " has unsupported version %" PRIu16,
+ Offset, Version);
// TODO: add support for non-zero segment selector size.
- if (HeaderData.SegSize != 0)
+ if (SegSize != 0)
return createStringError(errc::not_supported,
- ".debug_addr table at offset 0x%" PRIx64
- " has unsupported segment selector size %" PRIu8,
- HeaderOffset, HeaderData.SegSize);
- if (DataSize % HeaderData.AddrSize != 0) {
- invalidateLength();
- return createStringError(errc::invalid_argument,
- ".debug_addr table at offset 0x%" PRIx64
- " contains data of size %" PRIu32
- " which is not a multiple of addr size %" PRIu8,
- HeaderOffset, DataSize, HeaderData.AddrSize);
+ "address table at offset 0x%" PRIx64
+ " has unsupported segment selector size %" PRIu8,
+ Offset, SegSize);
+
+ if (Error Err = extractAddresses(Data, OffsetPtr, EndOffset))
+ return Err;
+ if (CUAddrSize && AddrSize != CUAddrSize) {
+ WarnCallback(createStringError(
+ errc::invalid_argument,
+ "address table at offset 0x%" PRIx64 " has address size %" PRIu8
+ " which is different from CU address size %" PRIu8,
+ Offset, AddrSize, CUAddrSize));
}
- Data.setAddressSize(HeaderData.AddrSize);
- uint32_t AddrCount = DataSize / HeaderData.AddrSize;
- for (uint32_t I = 0; I < AddrCount; ++I)
- if (HeaderData.AddrSize == 4)
- Addrs.push_back(Data.getU32(OffsetPtr));
- else
- Addrs.push_back(Data.getU64(OffsetPtr));
return Error::success();
}
+Error DWARFDebugAddrTable::extractPreStandard(const DWARFDataExtractor &Data,
+ uint64_t *OffsetPtr,
+ uint16_t CUVersion,
+ uint8_t CUAddrSize) {
+ assert(CUVersion > 0 && CUVersion < 5);
+
+ Offset = *OffsetPtr;
+ Length = 0;
+ Version = CUVersion;
+ AddrSize = CUAddrSize;
+ SegSize = 0;
+
+ return extractAddresses(Data, OffsetPtr, Data.size());
+}
+
+Error DWARFDebugAddrTable::extract(const DWARFDataExtractor &Data,
+ uint64_t *OffsetPtr,
+ uint16_t CUVersion,
+ uint8_t CUAddrSize,
+ std::function<void(Error)> WarnCallback) {
+ if (CUVersion > 0 && CUVersion < 5)
+ return extractPreStandard(Data, OffsetPtr, CUVersion, CUAddrSize);
+ if (CUVersion == 0)
+ WarnCallback(createStringError(errc::invalid_argument,
+ "DWARF version is not defined in CU,"
+ " assuming version 5"));
+ return extractV5(Data, OffsetPtr, CUAddrSize, WarnCallback);
+}
+
void DWARFDebugAddrTable::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
if (DumpOpts.Verbose)
- OS << format("0x%8.8" PRIx32 ": ", HeaderOffset);
- OS << format("Addr Section: length = 0x%8.8" PRIx32
- ", version = 0x%4.4" PRIx16 ", "
- "addr_size = 0x%2.2" PRIx8 ", seg_size = 0x%2.2" PRIx8 "\n",
- HeaderData.Length, HeaderData.Version, HeaderData.AddrSize,
- HeaderData.SegSize);
+ OS << format("0x%8.8" PRIx64 ": ", Offset);
+ if (Length) {
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(Format);
+ OS << "Address table header: "
+ << format("length = 0x%0*" PRIx64, OffsetDumpWidth, Length)
+ << ", format = " << dwarf::FormatString(Format)
+ << format(", version = 0x%4.4" PRIx16, Version)
+ << format(", addr_size = 0x%2.2" PRIx8, AddrSize)
+ << format(", seg_size = 0x%2.2" PRIx8, SegSize) << "\n";
+ }
if (Addrs.size() > 0) {
- const char *AddrFmt = (HeaderData.AddrSize == 4) ? "0x%8.8" PRIx64 "\n"
- : "0x%16.16" PRIx64 "\n";
+ const char *AddrFmt =
+ (AddrSize == 4) ? "0x%8.8" PRIx64 "\n" : "0x%16.16" PRIx64 "\n";
OS << "Addrs: [\n";
for (uint64_t Addr : Addrs)
OS << format(AddrFmt, Addr);
@@ -162,21 +162,13 @@ Expected<uint64_t> DWARFDebugAddrTable::getAddrEntry(uint32_t Index) const {
return Addrs[Index];
return createStringError(errc::invalid_argument,
"Index %" PRIu32 " is out of range of the "
- ".debug_addr table at offset 0x%" PRIx64,
- Index, HeaderOffset);
+ "address table at offset 0x%" PRIx64,
+ Index, Offset);
}
-uint32_t DWARFDebugAddrTable::getLength() const {
- if (HeaderData.Length == 0)
- return 0;
- // TODO: DWARF64 support.
- return HeaderData.Length + sizeof(uint32_t);
+Optional<uint64_t> DWARFDebugAddrTable::getFullLength() const {
+ if (Length == 0)
+ return None;
+ return Length + dwarf::getUnitLengthFieldByteSize(Format);
}
-uint32_t DWARFDebugAddrTable::getDataSize() const {
- if (DataSize != 0)
- return DataSize;
- if (getLength() == 0)
- return 0;
- return getLength() - getHeaderSize();
-}
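For reference, an illustrative summary (not part of this patch) of the DWARF v5 .debug_addr header that extractV5 parses above, plus a hypothetical call into the new entry point; the CU version/address size values, the Data/Offset variables, and WarnCallback are assumptions for this sketch:

  // 32-bit DWARF layout handled by extractV5():
  //   unit_length           : 4 bytes (its value excludes the field itself)
  //   version               : 2 bytes, must be 5
  //   address_size          : 1 byte, must be 4 or 8
  //   segment_selector_size : 1 byte, must be 0
  //   addresses             : (unit_length - 4) / address_size entries
  // e.g. unit_length = 0x14 with address_size = 8 leaves 0x10 bytes of data,
  // so extractAddresses() reads (0x14 - 4) / 8 = 2 relocated addresses.
  DWARFDebugAddrTable Table;
  if (llvm::Error E = Table.extract(Data, &Offset, /*CUVersion=*/5,
                                    /*CUAddrSize=*/8, WarnCallback))
    return E;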
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
index 200b2d52a02b..608fc0388af0 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugArangeSet.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <cassert>
@@ -29,80 +31,141 @@ void DWARFDebugArangeSet::clear() {
ArangeDescriptors.clear();
}
-bool
-DWARFDebugArangeSet::extract(DataExtractor data, uint64_t *offset_ptr) {
- if (data.isValidOffset(*offset_ptr)) {
- ArangeDescriptors.clear();
- Offset = *offset_ptr;
-
- // 7.20 Address Range Table
- //
- // Each set of entries in the table of address ranges contained in
- // the .debug_aranges section begins with a header consisting of: a
- // 4-byte length containing the length of the set of entries for this
- // compilation unit, not including the length field itself; a 2-byte
- // version identifier containing the value 2 for DWARF Version 2; a
- // 4-byte offset into the.debug_infosection; a 1-byte unsigned integer
- // containing the size in bytes of an address (or the offset portion of
- // an address for segmented addressing) on the target system; and a
- // 1-byte unsigned integer containing the size in bytes of a segment
- // descriptor on the target system. This header is followed by a series
- // of tuples. Each tuple consists of an address and a length, each in
- // the size appropriate for an address on the target architecture.
- HeaderData.Length = data.getU32(offset_ptr);
- HeaderData.Version = data.getU16(offset_ptr);
- HeaderData.CuOffset = data.getU32(offset_ptr);
- HeaderData.AddrSize = data.getU8(offset_ptr);
- HeaderData.SegSize = data.getU8(offset_ptr);
-
- // Perform basic validation of the header fields.
- if (!data.isValidOffsetForDataOfSize(Offset, HeaderData.Length) ||
- (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)) {
- clear();
- return false;
- }
-
- // The first tuple following the header in each set begins at an offset
- // that is a multiple of the size of a single tuple (that is, twice the
- // size of an address). The header is padded, if necessary, to the
- // appropriate boundary.
- const uint32_t header_size = *offset_ptr - Offset;
- const uint32_t tuple_size = HeaderData.AddrSize * 2;
- uint32_t first_tuple_offset = 0;
- while (first_tuple_offset < header_size)
- first_tuple_offset += tuple_size;
-
- *offset_ptr = Offset + first_tuple_offset;
-
- Descriptor arangeDescriptor;
-
- static_assert(sizeof(arangeDescriptor.Address) ==
- sizeof(arangeDescriptor.Length),
- "Different datatypes for addresses and sizes!");
- assert(sizeof(arangeDescriptor.Address) >= HeaderData.AddrSize);
-
- while (data.isValidOffset(*offset_ptr)) {
- arangeDescriptor.Address = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
- arangeDescriptor.Length = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+Error DWARFDebugArangeSet::extract(DWARFDataExtractor data,
+ uint64_t *offset_ptr) {
+ assert(data.isValidOffset(*offset_ptr));
+ ArangeDescriptors.clear();
+ Offset = *offset_ptr;
+
+ // 7.21 Address Range Table (extract)
+ // Each set of entries in the table of address ranges contained in
+ // the .debug_aranges section begins with a header containing:
+ // 1. unit_length (initial length)
+ // A 4-byte (32-bit DWARF) or 12-byte (64-bit DWARF) length containing
+ // the length of the set of entries for this compilation unit,
+ // not including the length field itself.
+ // 2. version (uhalf)
+ // The value in this field is 2.
+ // 3. debug_info_offset (section offset)
+ // A 4-byte (32-bit DWARF) or 8-byte (64-bit DWARF) offset into the
+ // .debug_info section of the compilation unit header.
+ // 4. address_size (ubyte)
+ // 5. segment_selector_size (ubyte)
+ // This header is followed by a series of tuples. Each tuple consists of
+ // a segment, an address and a length. The segment selector size is given by
+ // the segment_selector_size field of the header; the address and length
+ // size are each given by the address_size field of the header. Each set of
+ // tuples is terminated by a 0 for the segment, a 0 for the address and 0
+ // for the length. If the segment_selector_size field in the header is zero,
+ // the segment selectors are omitted from all tuples, including
+ // the terminating tuple.
+
+ Error Err = Error::success();
+ std::tie(HeaderData.Length, HeaderData.Format) =
+ data.getInitialLength(offset_ptr, &Err);
+ HeaderData.Version = data.getU16(offset_ptr, &Err);
+ HeaderData.CuOffset = data.getUnsigned(
+ offset_ptr, dwarf::getDwarfOffsetByteSize(HeaderData.Format), &Err);
+ HeaderData.AddrSize = data.getU8(offset_ptr, &Err);
+ HeaderData.SegSize = data.getU8(offset_ptr, &Err);
+ if (Err) {
+ return createStringError(errc::invalid_argument,
+ "parsing address ranges table at offset 0x%" PRIx64
+ ": %s",
+ Offset, toString(std::move(Err)).c_str());
+ }
+ // Perform basic validation of the header fields.
+ uint64_t full_length =
+ dwarf::getUnitLengthFieldByteSize(HeaderData.Format) + HeaderData.Length;
+ if (!data.isValidOffsetForDataOfSize(Offset, full_length))
+ return createStringError(errc::invalid_argument,
+ "the length of address range table at offset "
+ "0x%" PRIx64 " exceeds section size",
+ Offset);
+ if (HeaderData.AddrSize != 4 && HeaderData.AddrSize != 8)
+ return createStringError(errc::invalid_argument,
+ "address range table at offset 0x%" PRIx64
+ " has unsupported address size: %d "
+ "(4 and 8 supported)",
+ Offset, HeaderData.AddrSize);
+ if (HeaderData.SegSize != 0)
+ return createStringError(errc::not_supported,
+ "non-zero segment selector size in address range "
+ "table at offset 0x%" PRIx64 " is not supported",
+ Offset);
+
+ // The first tuple following the header in each set begins at an offset that
+ // is a multiple of the size of a single tuple (that is, twice the size of
+ // an address because we do not support non-zero segment selector sizes).
+ // Therefore, the full length should also be a multiple of the tuple size.
+ const uint32_t tuple_size = HeaderData.AddrSize * 2;
+ if (full_length % tuple_size != 0)
+ return createStringError(
+ errc::invalid_argument,
+ "address range table at offset 0x%" PRIx64
+ " has length that is not a multiple of the tuple size",
+ Offset);
+
+ // The header is padded, if necessary, to the appropriate boundary.
+ const uint32_t header_size = *offset_ptr - Offset;
+ uint32_t first_tuple_offset = 0;
+ while (first_tuple_offset < header_size)
+ first_tuple_offset += tuple_size;
+
+ // There should be space for at least one tuple.
+ if (full_length <= first_tuple_offset)
+ return createStringError(
+ errc::invalid_argument,
+ "address range table at offset 0x%" PRIx64
+ " has an insufficient length to contain any entries",
+ Offset);
+
+ *offset_ptr = Offset + first_tuple_offset;
+
+ Descriptor arangeDescriptor;
+
+ static_assert(sizeof(arangeDescriptor.Address) ==
+ sizeof(arangeDescriptor.Length),
+ "Different datatypes for addresses and sizes!");
+ assert(sizeof(arangeDescriptor.Address) >= HeaderData.AddrSize);
+
+ uint64_t end_offset = Offset + full_length;
+ while (*offset_ptr < end_offset) {
+ arangeDescriptor.Address = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+ arangeDescriptor.Length = data.getUnsigned(offset_ptr, HeaderData.AddrSize);
+
+ if (arangeDescriptor.Length == 0) {
// Each set of tuples is terminated by a 0 for the address and 0
// for the length.
- if (arangeDescriptor.Address || arangeDescriptor.Length)
- ArangeDescriptors.push_back(arangeDescriptor);
- else
- break; // We are done if we get a zero address and length
+ if (arangeDescriptor.Address == 0 && *offset_ptr == end_offset)
+ return ErrorSuccess();
+ return createStringError(
+ errc::invalid_argument,
+ "address range table at offset 0x%" PRIx64
+ " has an invalid tuple (length = 0) at offset 0x%" PRIx64,
+ Offset, *offset_ptr - tuple_size);
}
- return !ArangeDescriptors.empty();
+ ArangeDescriptors.push_back(arangeDescriptor);
}
- return false;
+
+ return createStringError(errc::invalid_argument,
+ "address range table at offset 0x%" PRIx64
+ " is not terminated by null entry",
+ Offset);
}
void DWARFDebugArangeSet::dump(raw_ostream &OS) const {
- OS << format("Address Range Header: length = 0x%8.8x, version = 0x%4.4x, ",
- HeaderData.Length, HeaderData.Version)
- << format("cu_offset = 0x%8.8x, addr_size = 0x%2.2x, seg_size = 0x%2.2x\n",
- HeaderData.CuOffset, HeaderData.AddrSize, HeaderData.SegSize);
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(HeaderData.Format);
+ OS << "Address Range Header: "
+ << format("length = 0x%0*" PRIx64 ", ", OffsetDumpWidth, HeaderData.Length)
+ << "format = " << dwarf::FormatString(HeaderData.Format) << ", "
+ << format("version = 0x%4.4x, ", HeaderData.Version)
+ << format("cu_offset = 0x%0*" PRIx64 ", ", OffsetDumpWidth,
+ HeaderData.CuOffset)
+ << format("addr_size = 0x%2.2x, ", HeaderData.AddrSize)
+ << format("seg_size = 0x%2.2x\n", HeaderData.SegSize);
for (const auto &Desc : ArangeDescriptors) {
Desc.dump(OS, HeaderData.AddrSize);
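For reference, a worked example (not part of this patch) of the header-padding arithmetic in extract above, assuming 32-bit DWARF and an 8-byte address size:

  // header_size        = 4 (length) + 2 (version) + 4 (cu_offset)
  //                      + 1 (addr_size) + 1 (seg_size)         = 12 bytes
  // tuple_size         = 2 * addr_size                          = 16 bytes
  // first_tuple_offset = header_size rounded up to a tuple_size
  //                      multiple                               = 16
  // Four padding bytes therefore follow the header, and tuples are read until
  // the terminating (0, 0) pair, which must land exactly at Offset +
  // full_length for the set to be accepted.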
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
index fa157e868851..e8ed63075055 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugAranges.cpp
@@ -11,7 +11,6 @@
#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/DWARF/DWARFDebugArangeSet.h"
#include "llvm/Support/DataExtractor.h"
-#include "llvm/Support/WithColor.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -20,13 +19,19 @@
using namespace llvm;
-void DWARFDebugAranges::extract(DataExtractor DebugArangesData) {
+void DWARFDebugAranges::extract(
+ DWARFDataExtractor DebugArangesData,
+ function_ref<void(Error)> RecoverableErrorHandler) {
if (!DebugArangesData.isValidOffset(0))
return;
uint64_t Offset = 0;
DWARFDebugArangeSet Set;
- while (Set.extract(DebugArangesData, &Offset)) {
+ while (DebugArangesData.isValidOffset(Offset)) {
+ if (Error E = Set.extract(DebugArangesData, &Offset)) {
+ RecoverableErrorHandler(std::move(E));
+ return;
+ }
uint64_t CUOffset = Set.getCompileUnitDIEOffset();
for (const auto &Desc : Set.descriptors()) {
uint64_t LowPC = Desc.Address;
@@ -43,9 +48,9 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
return;
// Extract aranges from .debug_aranges section.
- DataExtractor ArangesData(CTX->getDWARFObj().getArangesSection(),
- CTX->isLittleEndian(), 0);
- extract(ArangesData);
+ DWARFDataExtractor ArangesData(CTX->getDWARFObj().getArangesSection(),
+ CTX->isLittleEndian(), 0);
+ extract(ArangesData, CTX->getRecoverableErrorHandler());
// Generate aranges from DIEs: even if .debug_aranges section is present,
// it may describe only a small subset of compilation units, so we need to
@@ -55,7 +60,7 @@ void DWARFDebugAranges::generate(DWARFContext *CTX) {
if (ParsedCUOffsets.insert(CUOffset).second) {
Expected<DWARFAddressRangesVector> CURanges = CU->collectAddressRanges();
if (!CURanges)
- WithColor::error() << toString(CURanges.takeError()) << '\n';
+ CTX->getRecoverableErrorHandler()(CURanges.takeError());
else
for (const auto &R : *CURanges)
appendRange(CUOffset, R.LowPC, R.HighPC);
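A minimal usage sketch (not part of this patch) of the new DWARFDebugAranges::extract signature; the ArangesData extractor and the handler body are illustrative assumptions rather than the context's real recoverable error handler:

  DWARFDebugAranges Aranges;
  Aranges.extract(ArangesData, [](llvm::Error E) {
    // Report the recoverable error; extract() stops at the first bad set.
    llvm::logAllUnhandledErrors(std::move(E), llvm::errs(), "warning: ");
  });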
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
index 81b00f65741b..0a1b75592290 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugFrame.cpp
@@ -36,123 +36,130 @@ const uint8_t DWARF_CFI_PRIMARY_OPERAND_MASK = 0x3f;
Error CFIProgram::parse(DWARFDataExtractor Data, uint64_t *Offset,
uint64_t EndOffset) {
- while (*Offset < EndOffset) {
- uint8_t Opcode = Data.getRelocatedValue(1, Offset);
- // Some instructions have a primary opcode encoded in the top bits.
- uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK;
+ DataExtractor::Cursor C(*Offset);
+ while (C && C.tell() < EndOffset) {
+ uint8_t Opcode = Data.getRelocatedValue(C, 1);
+ if (!C)
+ break;
- if (Primary) {
+ // Some instructions have a primary opcode encoded in the top bits.
+ if (uint8_t Primary = Opcode & DWARF_CFI_PRIMARY_OPCODE_MASK) {
// If it's a primary opcode, the first operand is encoded in the bottom
// bits of the opcode itself.
uint64_t Op1 = Opcode & DWARF_CFI_PRIMARY_OPERAND_MASK;
switch (Primary) {
- default:
- return createStringError(errc::illegal_byte_sequence,
- "Invalid primary CFI opcode 0x%" PRIx8,
- Primary);
case DW_CFA_advance_loc:
case DW_CFA_restore:
addInstruction(Primary, Op1);
break;
case DW_CFA_offset:
- addInstruction(Primary, Op1, Data.getULEB128(Offset));
+ addInstruction(Primary, Op1, Data.getULEB128(C));
break;
- }
- } else {
- // Extended opcode - its value is Opcode itself.
- switch (Opcode) {
default:
- return createStringError(errc::illegal_byte_sequence,
- "Invalid extended CFI opcode 0x%" PRIx8,
- Opcode);
- case DW_CFA_nop:
- case DW_CFA_remember_state:
- case DW_CFA_restore_state:
- case DW_CFA_GNU_window_save:
- // No operands
- addInstruction(Opcode);
- break;
- case DW_CFA_set_loc:
- // Operands: Address
- addInstruction(Opcode, Data.getRelocatedAddress(Offset));
- break;
- case DW_CFA_advance_loc1:
- // Operands: 1-byte delta
- addInstruction(Opcode, Data.getRelocatedValue(1, Offset));
- break;
- case DW_CFA_advance_loc2:
- // Operands: 2-byte delta
- addInstruction(Opcode, Data.getRelocatedValue(2, Offset));
- break;
- case DW_CFA_advance_loc4:
- // Operands: 4-byte delta
- addInstruction(Opcode, Data.getRelocatedValue(4, Offset));
- break;
- case DW_CFA_restore_extended:
- case DW_CFA_undefined:
- case DW_CFA_same_value:
- case DW_CFA_def_cfa_register:
- case DW_CFA_def_cfa_offset:
- case DW_CFA_GNU_args_size:
- // Operands: ULEB128
- addInstruction(Opcode, Data.getULEB128(Offset));
- break;
- case DW_CFA_def_cfa_offset_sf:
- // Operands: SLEB128
- addInstruction(Opcode, Data.getSLEB128(Offset));
- break;
- case DW_CFA_offset_extended:
- case DW_CFA_register:
- case DW_CFA_def_cfa:
- case DW_CFA_val_offset: {
- // Operands: ULEB128, ULEB128
- // Note: We can not embed getULEB128 directly into function
- // argument list. getULEB128 changes Offset and order of evaluation
- // for arguments is unspecified.
- auto op1 = Data.getULEB128(Offset);
- auto op2 = Data.getULEB128(Offset);
- addInstruction(Opcode, op1, op2);
- break;
- }
- case DW_CFA_offset_extended_sf:
- case DW_CFA_def_cfa_sf:
- case DW_CFA_val_offset_sf: {
- // Operands: ULEB128, SLEB128
- // Note: see comment for the previous case
- auto op1 = Data.getULEB128(Offset);
- auto op2 = (uint64_t)Data.getSLEB128(Offset);
- addInstruction(Opcode, op1, op2);
- break;
- }
- case DW_CFA_def_cfa_expression: {
- uint32_t ExprLength = Data.getULEB128(Offset);
- addInstruction(Opcode, 0);
- DataExtractor Extractor(
- Data.getData().slice(*Offset, *Offset + ExprLength),
- Data.isLittleEndian(), Data.getAddressSize());
- Instructions.back().Expression = DWARFExpression(
- Extractor, Data.getAddressSize(), dwarf::DWARF_VERSION);
- *Offset += ExprLength;
- break;
- }
- case DW_CFA_expression:
- case DW_CFA_val_expression: {
- auto RegNum = Data.getULEB128(Offset);
- auto BlockLength = Data.getULEB128(Offset);
- addInstruction(Opcode, RegNum, 0);
- DataExtractor Extractor(
- Data.getData().slice(*Offset, *Offset + BlockLength),
- Data.isLittleEndian(), Data.getAddressSize());
- Instructions.back().Expression = DWARFExpression(
- Extractor, Data.getAddressSize(), dwarf::DWARF_VERSION);
- *Offset += BlockLength;
- break;
- }
+ llvm_unreachable("invalid primary CFI opcode");
}
+ continue;
+ }
+
+ // Extended opcode - its value is Opcode itself.
+ switch (Opcode) {
+ default:
+ return createStringError(errc::illegal_byte_sequence,
+ "invalid extended CFI opcode 0x%" PRIx8, Opcode);
+ case DW_CFA_nop:
+ case DW_CFA_remember_state:
+ case DW_CFA_restore_state:
+ case DW_CFA_GNU_window_save:
+ // No operands
+ addInstruction(Opcode);
+ break;
+ case DW_CFA_set_loc:
+ // Operands: Address
+ addInstruction(Opcode, Data.getRelocatedAddress(C));
+ break;
+ case DW_CFA_advance_loc1:
+ // Operands: 1-byte delta
+ addInstruction(Opcode, Data.getRelocatedValue(C, 1));
+ break;
+ case DW_CFA_advance_loc2:
+ // Operands: 2-byte delta
+ addInstruction(Opcode, Data.getRelocatedValue(C, 2));
+ break;
+ case DW_CFA_advance_loc4:
+ // Operands: 4-byte delta
+ addInstruction(Opcode, Data.getRelocatedValue(C, 4));
+ break;
+ case DW_CFA_restore_extended:
+ case DW_CFA_undefined:
+ case DW_CFA_same_value:
+ case DW_CFA_def_cfa_register:
+ case DW_CFA_def_cfa_offset:
+ case DW_CFA_GNU_args_size:
+ // Operands: ULEB128
+ addInstruction(Opcode, Data.getULEB128(C));
+ break;
+ case DW_CFA_def_cfa_offset_sf:
+ // Operands: SLEB128
+ addInstruction(Opcode, Data.getSLEB128(C));
+ break;
+ case DW_CFA_offset_extended:
+ case DW_CFA_register:
+ case DW_CFA_def_cfa:
+ case DW_CFA_val_offset: {
+ // Operands: ULEB128, ULEB128
+ // Note: We can not embed getULEB128 directly into function
+ // argument list. getULEB128 changes Offset and order of evaluation
+ // for arguments is unspecified.
+ uint64_t op1 = Data.getULEB128(C);
+ uint64_t op2 = Data.getULEB128(C);
+ addInstruction(Opcode, op1, op2);
+ break;
+ }
+ case DW_CFA_offset_extended_sf:
+ case DW_CFA_def_cfa_sf:
+ case DW_CFA_val_offset_sf: {
+ // Operands: ULEB128, SLEB128
+ // Note: see comment for the previous case
+ uint64_t op1 = Data.getULEB128(C);
+ uint64_t op2 = (uint64_t)Data.getSLEB128(C);
+ addInstruction(Opcode, op1, op2);
+ break;
+ }
+ case DW_CFA_def_cfa_expression: {
+ uint64_t ExprLength = Data.getULEB128(C);
+ addInstruction(Opcode, 0);
+ StringRef Expression = Data.getBytes(C, ExprLength);
+
+ DataExtractor Extractor(Expression, Data.isLittleEndian(),
+ Data.getAddressSize());
+ // Note. We do not pass the DWARF format to DWARFExpression, because
+ // DW_OP_call_ref, the only operation which depends on the format, is
+ // prohibited in call frame instructions, see sec. 6.4.2 in DWARFv5.
+ Instructions.back().Expression =
+ DWARFExpression(Extractor, Data.getAddressSize());
+ break;
+ }
+ case DW_CFA_expression:
+ case DW_CFA_val_expression: {
+ uint64_t RegNum = Data.getULEB128(C);
+ addInstruction(Opcode, RegNum, 0);
+
+ uint64_t BlockLength = Data.getULEB128(C);
+ StringRef Expression = Data.getBytes(C, BlockLength);
+ DataExtractor Extractor(Expression, Data.isLittleEndian(),
+ Data.getAddressSize());
+ // Note. We do not pass the DWARF format to DWARFExpression, because
+ // DW_OP_call_ref, the only operation which depends on the format, is
+ // prohibited in call frame instructions, see sec. 6.4.2 in DWARFv5.
+ Instructions.back().Expression =
+ DWARFExpression(Extractor, Data.getAddressSize());
+ break;
+ }
}
}
- return Error::success();
+ *Offset = C.tell();
+ return C.takeError();
}
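// Illustrative note, not part of this patch: with the DataExtractor::Cursor
// pattern used above, individual reads need not be checked one by one. A
// minimal sketch of the idiom (the extractor contents are hypothetical):
//   llvm::DataExtractor::Cursor C(0);
//   uint64_t A = Data.getULEB128(C); // may fail; the cursor records it
//   uint64_t B = Data.getULEB128(C); // becomes a no-op once C has failed
//   if (!C)
//     return C.takeError();          // single point of error handling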
namespace {
@@ -285,12 +292,33 @@ void CFIProgram::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH,
}
}
+// Returns the CIE identifier to be used by the requested format.
+// CIE ids for .debug_frame sections are defined in Section 7.24 of DWARFv5.
+// For CIE ID in .eh_frame sections see
+// https://refspecs.linuxfoundation.org/LSB_5.0.0/LSB-Core-generic/LSB-Core-generic/ehframechpt.html
+constexpr uint64_t getCIEId(bool IsDWARF64, bool IsEH) {
+ if (IsEH)
+ return 0;
+ if (IsDWARF64)
+ return DW64_CIE_ID;
+ return DW_CIE_ID;
+}
+
void CIE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
- OS << format("%08x %08x %08x CIE", (uint32_t)Offset, (uint32_t)Length,
- DW_CIE_ID)
- << "\n";
- OS << format(" Version: %d\n", Version);
- OS << " Augmentation: \"" << Augmentation << "\"\n";
+ // A CIE with a zero length is a terminator entry in the .eh_frame section.
+ if (IsEH && Length == 0) {
+ OS << format("%08" PRIx64, Offset) << " ZERO terminator\n";
+ return;
+ }
+
+ OS << format("%08" PRIx64, Offset)
+ << format(" %0*" PRIx64, IsDWARF64 ? 16 : 8, Length)
+ << format(" %0*" PRIx64, IsDWARF64 && !IsEH ? 16 : 8,
+ getCIEId(IsDWARF64, IsEH))
+ << " CIE\n"
+ << " Format: " << FormatString(IsDWARF64) << "\n"
+ << format(" Version: %d\n", Version)
+ << " Augmentation: \"" << Augmentation << "\"\n";
if (Version >= 4) {
OS << format(" Address size: %u\n", (uint32_t)AddressSize);
OS << format(" Segment desc size: %u\n",
@@ -313,11 +341,17 @@ void CIE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
}
void FDE::dump(raw_ostream &OS, const MCRegisterInfo *MRI, bool IsEH) const {
- OS << format("%08x %08x %08x FDE ", (uint32_t)Offset, (uint32_t)Length,
- (int32_t)LinkedCIEOffset);
- OS << format("cie=%08x pc=%08x...%08x\n", (int32_t)LinkedCIEOffset,
- (uint32_t)InitialLocation,
- (uint32_t)InitialLocation + (uint32_t)AddressRange);
+ OS << format("%08" PRIx64, Offset)
+ << format(" %0*" PRIx64, IsDWARF64 ? 16 : 8, Length)
+ << format(" %0*" PRIx64, IsDWARF64 && !IsEH ? 16 : 8, CIEPointer)
+ << " FDE cie=";
+ if (LinkedCIE)
+ OS << format("%08" PRIx64, LinkedCIE->getOffset());
+ else
+ OS << "<invalid offset>";
+ OS << format(" pc=%08" PRIx64 "...%08" PRIx64 "\n", InitialLocation,
+ InitialLocation + AddressRange);
+ OS << " Format: " << FormatString(IsDWARF64) << "\n";
if (LSDAAddress)
OS << format(" LSDA Address: %016" PRIx64 "\n", *LSDAAddress);
CFIs.dump(OS, MRI, IsEH);
@@ -340,36 +374,28 @@ static void LLVM_ATTRIBUTE_UNUSED dumpDataAux(DataExtractor Data,
errs() << "\n";
}
-// This is a workaround for old compilers which do not allow
-// noreturn attribute usage in lambdas. Once the support for those
-// compilers are phased out, we can remove this and return back to
-// a ReportError lambda: [StartOffset](const char *ErrorMsg).
-static void LLVM_ATTRIBUTE_NORETURN ReportError(uint64_t StartOffset,
- const char *ErrorMsg) {
- std::string Str;
- raw_string_ostream OS(Str);
- OS << format(ErrorMsg, StartOffset);
- OS.flush();
- report_fatal_error(Str);
-}
-
-void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
+Error DWARFDebugFrame::parse(DWARFDataExtractor Data) {
uint64_t Offset = 0;
DenseMap<uint64_t, CIE *> CIEs;
while (Data.isValidOffset(Offset)) {
uint64_t StartOffset = Offset;
- bool IsDWARF64 = false;
- uint64_t Length = Data.getRelocatedValue(4, &Offset);
- uint64_t Id;
+ uint64_t Length;
+ DwarfFormat Format;
+ std::tie(Length, Format) = Data.getInitialLength(&Offset);
+ bool IsDWARF64 = Format == DWARF64;
- if (Length == dwarf::DW_LENGTH_DWARF64) {
- // DWARF-64 is distinguished by the first 32 bits of the initial length
- // field being 0xffffffff. Then, the next 64 bits are the actual entry
- // length.
- IsDWARF64 = true;
- Length = Data.getRelocatedValue(8, &Offset);
+ // If the Length is 0, then this CIE is a terminator. We add it because some
+ // dumper tools might need it to print something special for such entries
+ // (e.g. llvm-objdump --dwarf=frames prints "ZERO terminator").
+ if (Length == 0) {
+ auto Cie = std::make_unique<CIE>(
+ IsDWARF64, StartOffset, 0, 0, SmallString<8>(), 0, 0, 0, 0, 0,
+ SmallString<8>(), 0, 0, None, None, Arch);
+ CIEs[StartOffset] = Cie.get();
+ Entries.push_back(std::move(Cie));
+ break;
}
// At this point, Offset points to the next field after Length.
@@ -380,14 +406,21 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
uint64_t EndStructureOffset = Offset + Length;
// The Id field's size depends on the DWARF format
- Id = Data.getUnsigned(&Offset, (IsDWARF64 && !IsEH) ? 8 : 4);
- bool IsCIE =
- ((IsDWARF64 && Id == DW64_CIE_ID) || Id == DW_CIE_ID || (IsEH && !Id));
+ Error Err = Error::success();
+ uint64_t Id = Data.getRelocatedValue((IsDWARF64 && !IsEH) ? 8 : 4, &Offset,
+ /*SectionIndex=*/nullptr, &Err);
+ if (Err)
+ return Err;
- if (IsCIE) {
+ if (Id == getCIEId(IsDWARF64, IsEH)) {
uint8_t Version = Data.getU8(&Offset);
const char *Augmentation = Data.getCStr(&Offset);
StringRef AugmentationString(Augmentation ? Augmentation : "");
+ // TODO: we should provide a way to report a warning and continue dumping.
+ if (IsEH && Version != 1)
+ return createStringError(errc::not_supported,
+ "unsupported CIE version: %" PRIu8, Version);
+
uint8_t AddressSize = Version < 4 ? Data.getAddressSize() :
Data.getU8(&Offset);
Data.setAddressSize(AddressSize);
@@ -411,61 +444,66 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
// Walk the augmentation string to get all the augmentation data.
for (unsigned i = 0, e = AugmentationString.size(); i != e; ++i) {
switch (AugmentationString[i]) {
- default:
- ReportError(
- StartOffset,
- "Unknown augmentation character in entry at %" PRIx64);
- case 'L':
- LSDAPointerEncoding = Data.getU8(&Offset);
- break;
- case 'P': {
- if (Personality)
- ReportError(StartOffset,
- "Duplicate personality in entry at %" PRIx64);
- PersonalityEncoding = Data.getU8(&Offset);
- Personality = Data.getEncodedPointer(
- &Offset, *PersonalityEncoding,
- EHFrameAddress ? EHFrameAddress + Offset : 0);
- break;
- }
- case 'R':
- FDEPointerEncoding = Data.getU8(&Offset);
- break;
- case 'S':
- // Current frame is a signal trampoline.
- break;
- case 'z':
- if (i)
- ReportError(StartOffset,
- "'z' must be the first character at %" PRIx64);
- // Parse the augmentation length first. We only parse it if
- // the string contains a 'z'.
- AugmentationLength = Data.getULEB128(&Offset);
- StartAugmentationOffset = Offset;
- EndAugmentationOffset = Offset + *AugmentationLength;
- break;
- case 'B':
- // B-Key is used for signing functions associated with this
- // augmentation string
- break;
+ default:
+ return createStringError(
+ errc::invalid_argument,
+ "unknown augmentation character in entry at 0x%" PRIx64,
+ StartOffset);
+ case 'L':
+ LSDAPointerEncoding = Data.getU8(&Offset);
+ break;
+ case 'P': {
+ if (Personality)
+ return createStringError(
+ errc::invalid_argument,
+ "duplicate personality in entry at 0x%" PRIx64, StartOffset);
+ PersonalityEncoding = Data.getU8(&Offset);
+ Personality = Data.getEncodedPointer(
+ &Offset, *PersonalityEncoding,
+ EHFrameAddress ? EHFrameAddress + Offset : 0);
+ break;
+ }
+ case 'R':
+ FDEPointerEncoding = Data.getU8(&Offset);
+ break;
+ case 'S':
+ // Current frame is a signal trampoline.
+ break;
+ case 'z':
+ if (i)
+ return createStringError(
+ errc::invalid_argument,
+ "'z' must be the first character at 0x%" PRIx64, StartOffset);
+ // Parse the augmentation length first. We only parse it if
+ // the string contains a 'z'.
+ AugmentationLength = Data.getULEB128(&Offset);
+ StartAugmentationOffset = Offset;
+ EndAugmentationOffset = Offset + *AugmentationLength;
+ break;
+ case 'B':
+ // B-Key is used for signing functions associated with this
+ // augmentation string
+ break;
}
}
if (AugmentationLength.hasValue()) {
if (Offset != EndAugmentationOffset)
- ReportError(StartOffset,
- "Parsing augmentation data at %" PRIx64 " failed");
-
+ return createStringError(errc::invalid_argument,
+ "parsing augmentation data at 0x%" PRIx64
+ " failed",
+ StartOffset);
AugmentationData = Data.getData().slice(StartAugmentationOffset,
EndAugmentationOffset);
}
}
auto Cie = std::make_unique<CIE>(
- StartOffset, Length, Version, AugmentationString, AddressSize,
- SegmentDescriptorSize, CodeAlignmentFactor, DataAlignmentFactor,
- ReturnAddressRegister, AugmentationData, FDEPointerEncoding,
- LSDAPointerEncoding, Personality, PersonalityEncoding, Arch);
+ IsDWARF64, StartOffset, Length, Version, AugmentationString,
+ AddressSize, SegmentDescriptorSize, CodeAlignmentFactor,
+ DataAlignmentFactor, ReturnAddressRegister, AugmentationData,
+ FDEPointerEncoding, LSDAPointerEncoding, Personality,
+ PersonalityEncoding, Arch);
CIEs[StartOffset] = Cie.get();
Entries.emplace_back(std::move(Cie));
} else {
@@ -479,9 +517,10 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
if (IsEH) {
// The address size is encoded in the CIE we reference.
if (!Cie)
- ReportError(StartOffset, "Parsing FDE data at %" PRIx64
- " failed due to missing CIE");
-
+ return createStringError(errc::invalid_argument,
+ "parsing FDE data at 0x%" PRIx64
+ " failed due to missing CIE",
+ StartOffset);
if (auto Val = Data.getEncodedPointer(
&Offset, Cie->getFDEPointerEncoding(),
EHFrameAddress ? EHFrameAddress + Offset : 0)) {
@@ -507,28 +546,32 @@ void DWARFDebugFrame::parse(DWARFDataExtractor Data) {
}
if (Offset != EndAugmentationOffset)
- ReportError(StartOffset,
- "Parsing augmentation data at %" PRIx64 " failed");
+ return createStringError(errc::invalid_argument,
+ "parsing augmentation data at 0x%" PRIx64
+ " failed",
+ StartOffset);
}
} else {
InitialLocation = Data.getRelocatedAddress(&Offset);
AddressRange = Data.getRelocatedAddress(&Offset);
}
- Entries.emplace_back(new FDE(StartOffset, Length, CIEPointer,
- InitialLocation, AddressRange,
- Cie, LSDAAddress, Arch));
+ Entries.emplace_back(new FDE(IsDWARF64, StartOffset, Length, CIEPointer,
+ InitialLocation, AddressRange, Cie,
+ LSDAAddress, Arch));
}
if (Error E =
- Entries.back()->cfis().parse(Data, &Offset, EndStructureOffset)) {
- report_fatal_error(toString(std::move(E)));
- }
+ Entries.back()->cfis().parse(Data, &Offset, EndStructureOffset))
+ return E;
if (Offset != EndStructureOffset)
- ReportError(StartOffset,
- "Parsing entry instructions at %" PRIx64 " failed");
+ return createStringError(
+ errc::invalid_argument,
+ "parsing entry instructions at 0x%" PRIx64 " failed", StartOffset);
}
+
+ return Error::success();
}
FrameEntry *DWARFDebugFrame::getEntryAtOffset(uint64_t Offset) const {
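For reference, the identifiers selected by getCIEId above (a summary of existing behavior, not new code from this patch):

  // .eh_frame                   -> 0
  // .debug_frame, 32-bit DWARF  -> DW_CIE_ID   (0xffffffff)
  // .debug_frame, 64-bit DWARF  -> DW64_CIE_ID (0xffffffffffffffff)
  // parse() compares the Id value it reads after the initial length against
  // this constant to decide whether an entry is a CIE or an FDE.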
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
index 11adb1e47640..3ca21e97888c 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLine.cpp
@@ -16,6 +16,7 @@
#include "llvm/DebugInfo/DWARF/DWARFRelocMap.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Format.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -41,6 +42,10 @@ using ContentDescriptors = SmallVector<ContentDescriptor, 4>;
} // end anonymous namespace
+static bool versionIsSupported(uint16_t Version) {
+ return Version >= 2 && Version <= 5;
+}
+
void DWARFDebugLine::ContentTypeTracker::trackContentType(
dwarf::LineNumberEntryFormat ContentType) {
switch (ContentType) {
@@ -99,13 +104,21 @@ void DWARFDebugLine::Prologue::clear() {
void DWARFDebugLine::Prologue::dump(raw_ostream &OS,
DIDumpOptions DumpOptions) const {
+ if (!totalLengthIsValid())
+ return;
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(FormParams.Format);
OS << "Line table prologue:\n"
- << format(" total_length: 0x%8.8" PRIx64 "\n", TotalLength)
+ << format(" total_length: 0x%0*" PRIx64 "\n", OffsetDumpWidth,
+ TotalLength)
+ << " format: " << dwarf::FormatString(FormParams.Format) << "\n"
<< format(" version: %u\n", getVersion());
+ if (!versionIsSupported(getVersion()))
+ return;
if (getVersion() >= 5)
OS << format(" address_size: %u\n", getAddressSize())
<< format(" seg_select_size: %u\n", SegSelectorSize);
- OS << format(" prologue_length: 0x%8.8" PRIx64 "\n", PrologueLength)
+ OS << format(" prologue_length: 0x%0*" PRIx64 "\n", OffsetDumpWidth,
+ PrologueLength)
<< format(" min_inst_length: %u\n", MinInstLength)
<< format(getVersion() >= 4 ? "max_ops_per_inst: %u\n" : "", MaxOpsPerInst)
<< format(" default_is_stmt: %u\n", DefaultIsStmt)
@@ -114,8 +127,9 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS,
<< format(" opcode_base: %u\n", OpcodeBase);
for (uint32_t I = 0; I != StandardOpcodeLengths.size(); ++I)
- OS << format("standard_opcode_lengths[%s] = %u\n",
- LNStandardString(I + 1).data(), StandardOpcodeLengths[I]);
+ OS << formatv("standard_opcode_lengths[{0}] = {1}\n",
+ static_cast<dwarf::LineNumberOps>(I + 1),
+ StandardOpcodeLengths[I]);
if (!IncludeDirectories.empty()) {
// DWARF v5 starts directory indexes at 0.
@@ -153,14 +167,21 @@ void DWARFDebugLine::Prologue::dump(raw_ostream &OS,
}
// Parse v2-v4 directory and file tables.
-static void
+static Error
parseV2DirFileTables(const DWARFDataExtractor &DebugLineData,
- uint64_t *OffsetPtr, uint64_t EndPrologueOffset,
+ uint64_t *OffsetPtr,
DWARFDebugLine::ContentTypeTracker &ContentTypes,
std::vector<DWARFFormValue> &IncludeDirectories,
std::vector<DWARFDebugLine::FileNameEntry> &FileNames) {
- while (*OffsetPtr < EndPrologueOffset) {
- StringRef S = DebugLineData.getCStrRef(OffsetPtr);
+ while (true) {
+ Error Err = Error::success();
+ StringRef S = DebugLineData.getCStrRef(OffsetPtr, &Err);
+ if (Err) {
+ consumeError(std::move(Err));
+ return createStringError(errc::invalid_argument,
+ "include directories table was not null "
+ "terminated before the end of the prologue");
+ }
if (S.empty())
break;
DWARFFormValue Dir =
@@ -168,21 +189,33 @@ parseV2DirFileTables(const DWARFDataExtractor &DebugLineData,
IncludeDirectories.push_back(Dir);
}
- while (*OffsetPtr < EndPrologueOffset) {
- StringRef Name = DebugLineData.getCStrRef(OffsetPtr);
- if (Name.empty())
+ ContentTypes.HasModTime = true;
+ ContentTypes.HasLength = true;
+
+ while (true) {
+ Error Err = Error::success();
+ StringRef Name = DebugLineData.getCStrRef(OffsetPtr, &Err);
+ if (!Err && Name.empty())
break;
+
DWARFDebugLine::FileNameEntry FileEntry;
FileEntry.Name =
DWARFFormValue::createFromPValue(dwarf::DW_FORM_string, Name.data());
- FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
- FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
- FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
+ FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr, &Err);
+ FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr, &Err);
+ FileEntry.Length = DebugLineData.getULEB128(OffsetPtr, &Err);
+
+ if (Err) {
+ consumeError(std::move(Err));
+ return createStringError(
+ errc::invalid_argument,
+ "file names table was not null terminated before "
+ "the end of the prologue");
+ }
FileNames.push_back(FileEntry);
}
- ContentTypes.HasModTime = true;
- ContentTypes.HasLength = true;
+ return Error::success();
}
// Parse v5 directory/file entry content descriptions.
@@ -191,14 +224,15 @@ parseV2DirFileTables(const DWARFDataExtractor &DebugLineData,
static llvm::Expected<ContentDescriptors>
parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr,
DWARFDebugLine::ContentTypeTracker *ContentTypes) {
+ Error Err = Error::success();
ContentDescriptors Descriptors;
- int FormatCount = DebugLineData.getU8(OffsetPtr);
+ int FormatCount = DebugLineData.getU8(OffsetPtr, &Err);
bool HasPath = false;
- for (int I = 0; I != FormatCount; ++I) {
+ for (int I = 0; I != FormatCount && !Err; ++I) {
ContentDescriptor Descriptor;
Descriptor.Type =
- dwarf::LineNumberEntryFormat(DebugLineData.getULEB128(OffsetPtr));
- Descriptor.Form = dwarf::Form(DebugLineData.getULEB128(OffsetPtr));
+ dwarf::LineNumberEntryFormat(DebugLineData.getULEB128(OffsetPtr, &Err));
+ Descriptor.Form = dwarf::Form(DebugLineData.getULEB128(OffsetPtr, &Err));
if (Descriptor.Type == dwarf::DW_LNCT_path)
HasPath = true;
if (ContentTypes)
@@ -206,6 +240,11 @@ parseV5EntryFormat(const DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr,
Descriptors.push_back(Descriptor);
}
+ if (Err)
+ return createStringError(errc::invalid_argument,
+ "failed to parse entry content descriptors: %s",
+ toString(std::move(Err)).c_str());
+
if (!HasPath)
return createStringError(errc::invalid_argument,
"failed to parse entry content descriptions"
@@ -227,8 +266,8 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
return DirDescriptors.takeError();
// Get the directory entries, according to the format described above.
- int DirEntryCount = DebugLineData.getU8(OffsetPtr);
- for (int I = 0; I != DirEntryCount; ++I) {
+ uint64_t DirEntryCount = DebugLineData.getULEB128(OffsetPtr);
+ for (uint64_t I = 0; I != DirEntryCount; ++I) {
for (auto Descriptor : *DirDescriptors) {
DWARFFormValue Value(Descriptor.Form);
switch (Descriptor.Type) {
@@ -236,14 +275,14 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, &Ctx, U))
return createStringError(errc::invalid_argument,
"failed to parse directory entry because "
- "extracting the form value failed.");
+ "extracting the form value failed");
IncludeDirectories.push_back(Value);
break;
default:
if (!Value.skipValue(DebugLineData, OffsetPtr, FormParams))
return createStringError(errc::invalid_argument,
"failed to parse directory entry because "
- "skipping the form value failed.");
+ "skipping the form value failed");
}
}
}
@@ -255,15 +294,15 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
return FileDescriptors.takeError();
// Get the file entries, according to the format described above.
- int FileEntryCount = DebugLineData.getU8(OffsetPtr);
- for (int I = 0; I != FileEntryCount; ++I) {
+ uint64_t FileEntryCount = DebugLineData.getULEB128(OffsetPtr);
+ for (uint64_t I = 0; I != FileEntryCount; ++I) {
DWARFDebugLine::FileNameEntry FileEntry;
for (auto Descriptor : *FileDescriptors) {
DWARFFormValue Value(Descriptor.Form);
if (!Value.extractValue(DebugLineData, OffsetPtr, FormParams, &Ctx, U))
return createStringError(errc::invalid_argument,
"failed to parse file entry because "
- "extracting the form value failed.");
+ "extracting the form value failed");
switch (Descriptor.Type) {
case DW_LNCT_path:
FileEntry.Name = Value;
@@ -297,78 +336,114 @@ parseV5DirFileTables(const DWARFDataExtractor &DebugLineData,
return Error::success();
}
-Error DWARFDebugLine::Prologue::parse(const DWARFDataExtractor &DebugLineData,
- uint64_t *OffsetPtr,
- const DWARFContext &Ctx,
- const DWARFUnit *U) {
+uint64_t DWARFDebugLine::Prologue::getLength() const {
+ uint64_t Length = PrologueLength + sizeofTotalLength() +
+ sizeof(getVersion()) + sizeofPrologueLength();
+ if (getVersion() >= 5)
+ Length += 2; // Address + Segment selector sizes.
+ return Length;
+}
+
+Error DWARFDebugLine::Prologue::parse(
+ DWARFDataExtractor DebugLineData, uint64_t *OffsetPtr,
+ function_ref<void(Error)> RecoverableErrorHandler, const DWARFContext &Ctx,
+ const DWARFUnit *U) {
const uint64_t PrologueOffset = *OffsetPtr;
clear();
- TotalLength = DebugLineData.getRelocatedValue(4, OffsetPtr);
- if (TotalLength == dwarf::DW_LENGTH_DWARF64) {
- FormParams.Format = dwarf::DWARF64;
- TotalLength = DebugLineData.getU64(OffsetPtr);
- } else if (TotalLength >= dwarf::DW_LENGTH_lo_reserved) {
- return createStringError(errc::invalid_argument,
+ DataExtractor::Cursor Cursor(*OffsetPtr);
+ std::tie(TotalLength, FormParams.Format) =
+ DebugLineData.getInitialLength(Cursor);
+
+ DebugLineData =
+ DWARFDataExtractor(DebugLineData, Cursor.tell() + TotalLength);
+ FormParams.Version = DebugLineData.getU16(Cursor);
+ if (Cursor && !versionIsSupported(getVersion())) {
+ // Treat this error as unrecoverable - we cannot be sure what any of
+ // the data represents including the length field, so cannot skip it or make
+ // any reasonable assumptions.
+ *OffsetPtr = Cursor.tell();
+ return createStringError(
+ errc::not_supported,
"parsing line table prologue at offset 0x%8.8" PRIx64
- " unsupported reserved unit length found of value 0x%8.8" PRIx64,
- PrologueOffset, TotalLength);
+ ": unsupported version %" PRIu16,
+ PrologueOffset, getVersion());
}
- FormParams.Version = DebugLineData.getU16(OffsetPtr);
- if (getVersion() < 2)
- return createStringError(errc::not_supported,
- "parsing line table prologue at offset 0x%8.8" PRIx64
- " found unsupported version 0x%2.2" PRIx16,
- PrologueOffset, getVersion());
if (getVersion() >= 5) {
- FormParams.AddrSize = DebugLineData.getU8(OffsetPtr);
- assert((DebugLineData.getAddressSize() == 0 ||
+ FormParams.AddrSize = DebugLineData.getU8(Cursor);
+ assert((!Cursor || DebugLineData.getAddressSize() == 0 ||
DebugLineData.getAddressSize() == getAddressSize()) &&
"Line table header and data extractor disagree");
- SegSelectorSize = DebugLineData.getU8(OffsetPtr);
+ SegSelectorSize = DebugLineData.getU8(Cursor);
}
PrologueLength =
- DebugLineData.getRelocatedValue(sizeofPrologueLength(), OffsetPtr);
- const uint64_t EndPrologueOffset = PrologueLength + *OffsetPtr;
- MinInstLength = DebugLineData.getU8(OffsetPtr);
+ DebugLineData.getRelocatedValue(Cursor, sizeofPrologueLength());
+ const uint64_t EndPrologueOffset = PrologueLength + Cursor.tell();
+ DebugLineData = DWARFDataExtractor(DebugLineData, EndPrologueOffset);
+ MinInstLength = DebugLineData.getU8(Cursor);
if (getVersion() >= 4)
- MaxOpsPerInst = DebugLineData.getU8(OffsetPtr);
- DefaultIsStmt = DebugLineData.getU8(OffsetPtr);
- LineBase = DebugLineData.getU8(OffsetPtr);
- LineRange = DebugLineData.getU8(OffsetPtr);
- OpcodeBase = DebugLineData.getU8(OffsetPtr);
-
- StandardOpcodeLengths.reserve(OpcodeBase - 1);
- for (uint32_t I = 1; I < OpcodeBase; ++I) {
- uint8_t OpLen = DebugLineData.getU8(OffsetPtr);
- StandardOpcodeLengths.push_back(OpLen);
+ MaxOpsPerInst = DebugLineData.getU8(Cursor);
+ DefaultIsStmt = DebugLineData.getU8(Cursor);
+ LineBase = DebugLineData.getU8(Cursor);
+ LineRange = DebugLineData.getU8(Cursor);
+ OpcodeBase = DebugLineData.getU8(Cursor);
+
+ if (Cursor && OpcodeBase == 0) {
+ // If the opcode base is 0, we cannot read the standard opcode lengths (of
+ // which there are supposed to be one fewer than the opcode base). Assume
+ // there are no standard opcodes and continue parsing.
+ RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "parsing line table prologue at offset 0x%8.8" PRIx64
+ " found opcode base of 0. Assuming no standard opcodes",
+ PrologueOffset));
+ } else if (Cursor) {
+ StandardOpcodeLengths.reserve(OpcodeBase - 1);
+ for (uint32_t I = 1; I < OpcodeBase; ++I) {
+ uint8_t OpLen = DebugLineData.getU8(Cursor);
+ StandardOpcodeLengths.push_back(OpLen);
+ }
}
- if (getVersion() >= 5) {
- if (Error E =
- parseV5DirFileTables(DebugLineData, OffsetPtr, FormParams, Ctx, U,
- ContentTypes, IncludeDirectories, FileNames)) {
- return joinErrors(
- createStringError(
- errc::invalid_argument,
- "parsing line table prologue at 0x%8.8" PRIx64
- " found an invalid directory or file table description at"
- " 0x%8.8" PRIx64,
- PrologueOffset, *OffsetPtr),
- std::move(E));
- }
- } else
- parseV2DirFileTables(DebugLineData, OffsetPtr, EndPrologueOffset,
- ContentTypes, IncludeDirectories, FileNames);
+ *OffsetPtr = Cursor.tell();
+ // A corrupt file name or directory table does not prevent interpretation of
+ // the main line program, so check the cursor state now so that its errors can
+ // be handled separately.
+ if (!Cursor)
+ return createStringError(
+ errc::invalid_argument,
+ "parsing line table prologue at offset 0x%8.8" PRIx64 ": %s",
+ PrologueOffset, toString(Cursor.takeError()).c_str());
+
+ Error E =
+ getVersion() >= 5
+ ? parseV5DirFileTables(DebugLineData, OffsetPtr, FormParams, Ctx, U,
+ ContentTypes, IncludeDirectories, FileNames)
+ : parseV2DirFileTables(DebugLineData, OffsetPtr, ContentTypes,
+ IncludeDirectories, FileNames);
+ if (E) {
+ RecoverableErrorHandler(joinErrors(
+ createStringError(
+ errc::invalid_argument,
+ "parsing line table prologue at 0x%8.8" PRIx64
+ " found an invalid directory or file table description at"
+ " 0x%8.8" PRIx64,
+ PrologueOffset, *OffsetPtr),
+ std::move(E)));
+ return Error::success();
+ }
- if (*OffsetPtr != EndPrologueOffset)
- return createStringError(errc::invalid_argument,
- "parsing line table prologue at 0x%8.8" PRIx64
- " should have ended at 0x%8.8" PRIx64
- " but it ended at 0x%8.8" PRIx64,
- PrologueOffset, EndPrologueOffset, *OffsetPtr);
+ assert(*OffsetPtr <= EndPrologueOffset);
+ if (*OffsetPtr != EndPrologueOffset) {
+ RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "unknown data in line table prologue at offset 0x%8.8" PRIx64
+ ": parsing ended (at offset 0x%8.8" PRIx64
+ ") before reaching the prologue end at offset 0x%8.8" PRIx64,
+ PrologueOffset, *OffsetPtr, EndPrologueOffset));
+ }
return Error::success();
}
@@ -396,10 +471,12 @@ void DWARFDebugLine::Row::reset(bool DefaultIsStmt) {
EpilogueBegin = false;
}
-void DWARFDebugLine::Row::dumpTableHeader(raw_ostream &OS) {
- OS << "Address Line Column File ISA Discriminator Flags\n"
- << "------------------ ------ ------ ------ --- ------------- "
- "-------------\n";
+void DWARFDebugLine::Row::dumpTableHeader(raw_ostream &OS, unsigned Indent) {
+ OS.indent(Indent)
+ << "Address Line Column File ISA Discriminator Flags\n";
+ OS.indent(Indent)
+ << "------------------ ------ ------ ------ --- ------------- "
+ "-------------\n";
}
void DWARFDebugLine::Row::dump(raw_ostream &OS) const {
@@ -430,7 +507,7 @@ void DWARFDebugLine::LineTable::dump(raw_ostream &OS,
if (!Rows.empty()) {
OS << '\n';
- Row::dumpTableHeader(OS);
+ Row::dumpTableHeader(OS, 0);
for (const Row &R : Rows) {
R.dump(OS);
}
@@ -447,8 +524,10 @@ void DWARFDebugLine::LineTable::clear() {
Sequences.clear();
}
-DWARFDebugLine::ParsingState::ParsingState(struct LineTable *LT)
- : LineTable(LT) {
+DWARFDebugLine::ParsingState::ParsingState(
+ struct LineTable *LT, uint64_t TableOffset,
+ function_ref<void(Error)> ErrorHandler)
+ : LineTable(LT), LineTableOffset(TableOffset), ErrorHandler(ErrorHandler) {
resetRowAndSequence();
}
@@ -488,7 +567,7 @@ DWARFDebugLine::getLineTable(uint64_t Offset) const {
Expected<const DWARFDebugLine::LineTable *> DWARFDebugLine::getOrParseLineTable(
DWARFDataExtractor &DebugLineData, uint64_t Offset, const DWARFContext &Ctx,
- const DWARFUnit *U, function_ref<void(Error)> RecoverableErrorCallback) {
+ const DWARFUnit *U, function_ref<void(Error)> RecoverableErrorHandler) {
if (!DebugLineData.isValidOffset(Offset))
return createStringError(errc::invalid_argument, "offset 0x%8.8" PRIx64
" is not a valid debug line section offset",
@@ -499,32 +578,163 @@ Expected<const DWARFDebugLine::LineTable *> DWARFDebugLine::getOrParseLineTable(
LineTable *LT = &Pos.first->second;
if (Pos.second) {
if (Error Err =
- LT->parse(DebugLineData, &Offset, Ctx, U, RecoverableErrorCallback))
+ LT->parse(DebugLineData, &Offset, Ctx, U, RecoverableErrorHandler))
return std::move(Err);
return LT;
}
return LT;
}
+static StringRef getOpcodeName(uint8_t Opcode, uint8_t OpcodeBase) {
+ assert(Opcode != 0);
+ if (Opcode < OpcodeBase)
+ return LNStandardString(Opcode);
+ return "special";
+}
+
+uint64_t DWARFDebugLine::ParsingState::advanceAddr(uint64_t OperationAdvance,
+ uint8_t Opcode,
+ uint64_t OpcodeOffset) {
+ StringRef OpcodeName = getOpcodeName(Opcode, LineTable->Prologue.OpcodeBase);
+ // For versions less than 4, the MaxOpsPerInst member is set to 0, as the
+ // maximum_operations_per_instruction field wasn't introduced until DWARFv4.
+ // Don't warn about bad values in this situation.
+ if (ReportAdvanceAddrProblem && LineTable->Prologue.getVersion() >= 4 &&
+ LineTable->Prologue.MaxOpsPerInst != 1)
+ ErrorHandler(createStringError(
+ errc::not_supported,
+ "line table program at offset 0x%8.8" PRIx64
+ " contains a %s opcode at offset 0x%8.8" PRIx64
+ ", but the prologue maximum_operations_per_instruction value is %" PRId8
+ ", which is unsupported. Assuming a value of 1 instead",
+ LineTableOffset, OpcodeName.data(), OpcodeOffset,
+ LineTable->Prologue.MaxOpsPerInst));
+ if (ReportAdvanceAddrProblem && LineTable->Prologue.MinInstLength == 0)
+ ErrorHandler(
+ createStringError(errc::invalid_argument,
+ "line table program at offset 0x%8.8" PRIx64
+ " contains a %s opcode at offset 0x%8.8" PRIx64
+ ", but the prologue minimum_instruction_length value "
+ "is 0, which prevents any address advancing",
+ LineTableOffset, OpcodeName.data(), OpcodeOffset));
+ ReportAdvanceAddrProblem = false;
+ uint64_t AddrOffset = OperationAdvance * LineTable->Prologue.MinInstLength;
+ Row.Address.Address += AddrOffset;
+ return AddrOffset;
+}
+
+DWARFDebugLine::ParsingState::AddrAndAdjustedOpcode
+DWARFDebugLine::ParsingState::advanceAddrForOpcode(uint8_t Opcode,
+ uint64_t OpcodeOffset) {
+ assert(Opcode == DW_LNS_const_add_pc ||
+ Opcode >= LineTable->Prologue.OpcodeBase);
+ if (ReportBadLineRange && LineTable->Prologue.LineRange == 0) {
+ StringRef OpcodeName =
+ getOpcodeName(Opcode, LineTable->Prologue.OpcodeBase);
+ ErrorHandler(
+ createStringError(errc::not_supported,
+ "line table program at offset 0x%8.8" PRIx64
+ " contains a %s opcode at offset 0x%8.8" PRIx64
+ ", but the prologue line_range value is 0. The "
+ "address and line will not be adjusted",
+ LineTableOffset, OpcodeName.data(), OpcodeOffset));
+ ReportBadLineRange = false;
+ }
+
+ uint8_t OpcodeValue = Opcode;
+ if (Opcode == DW_LNS_const_add_pc)
+ OpcodeValue = 255;
+ uint8_t AdjustedOpcode = OpcodeValue - LineTable->Prologue.OpcodeBase;
+ uint64_t OperationAdvance =
+ LineTable->Prologue.LineRange != 0
+ ? AdjustedOpcode / LineTable->Prologue.LineRange
+ : 0;
+ uint64_t AddrOffset = advanceAddr(OperationAdvance, Opcode, OpcodeOffset);
+ return {AddrOffset, AdjustedOpcode};
+}
+
+DWARFDebugLine::ParsingState::AddrAndLineDelta
+DWARFDebugLine::ParsingState::handleSpecialOpcode(uint8_t Opcode,
+ uint64_t OpcodeOffset) {
+ // A special opcode value is chosen based on the amount that needs
+ // to be added to the line and address registers. The maximum line
+ // increment for a special opcode is the value of the line_base
+ // field in the header, plus the value of the line_range field,
+ // minus 1 (line base + line range - 1). If the desired line
+ // increment is greater than the maximum line increment, a standard
+ // opcode must be used instead of a special opcode. The "address
+ // advance" is calculated by dividing the desired address increment
+ // by the minimum_instruction_length field from the header. The
+ // special opcode is then calculated using the following formula:
+ //
+ // opcode = (desired line increment - line_base) +
+ // (line_range * address advance) + opcode_base
+ //
+ // If the resulting opcode is greater than 255, a standard opcode
+ // must be used instead.
+ //
+ // To decode a special opcode, subtract the opcode_base from the
+ // opcode itself to give the adjusted opcode. The amount to
+ // increment the address register is the result of the adjusted
+ // opcode divided by the line_range multiplied by the
+ // minimum_instruction_length field from the header. That is:
+ //
+ // address increment = (adjusted opcode / line_range) *
+ // minimum_instruction_length
+ //
+ // The amount to increment the line register is the line_base plus
+ // the result of the adjusted opcode modulo the line_range. That is:
+ //
+ // line increment = line_base + (adjusted opcode % line_range)
+
+ DWARFDebugLine::ParsingState::AddrAndAdjustedOpcode AddrAdvanceResult =
+ advanceAddrForOpcode(Opcode, OpcodeOffset);
+ int32_t LineOffset = 0;
+ if (LineTable->Prologue.LineRange != 0)
+ LineOffset =
+ LineTable->Prologue.LineBase +
+ (AddrAdvanceResult.AdjustedOpcode % LineTable->Prologue.LineRange);
+ Row.Line += LineOffset;
+ return {AddrAdvanceResult.AddrDelta, LineOffset};
+}
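// Editorial sketch (not part of this patch): the arithmetic performed by
// handleSpecialOpcode() above. With the common prologue values
// line_base = -5, line_range = 14, opcode_base = 13, min_inst_length = 1,
// the single special-opcode byte 0x4b (75) decodes as:
//
//   adjusted opcode   = 75 - 13        = 62
//   operation advance = 62 / 14        = 4   -> address delta = 4 * 1 = 4
//   line delta        = -5 + (62 % 14) = 1
//
// A hypothetical standalone helper recomputing the same deltas:
static std::pair<uint64_t, int32_t>
decodeSpecialOpcode(uint8_t Opcode, uint8_t OpcodeBase, int8_t LineBase,
                    uint8_t LineRange, uint8_t MinInstLength) {
  uint8_t Adjusted = Opcode - OpcodeBase;
  uint64_t AddrDelta = uint64_t(Adjusted / LineRange) * MinInstLength;
  int32_t LineDelta = LineBase + (Adjusted % LineRange);
  return {AddrDelta, LineDelta};
}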
+
+/// Parse a ULEB128 using the specified \p Cursor. \returns the parsed value on
+/// success, or None if \p Cursor is in a failing state.
+template <typename T>
+static Optional<T> parseULEB128(DWARFDataExtractor &Data,
+ DataExtractor::Cursor &Cursor) {
+ T Value = Data.getULEB128(Cursor);
+ if (Cursor)
+ return Value;
+ return None;
+}
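// Editorial sketch (not part of this patch): the DataExtractor::Cursor
// behaviour that parseULEB128() above relies on. Reads only advance while the
// Cursor is in the success state; once a read fails, later reads are no-ops
// and the pending Error must be taken (or consumed) exactly once.
static void cursorExample(const DWARFDataExtractor &Data,
                          function_ref<void(Error)> Handler) {
  DataExtractor::Cursor C(/*Offset=*/0);
  uint64_t First = Data.getULEB128(C);  // valid only if C stays in success
  uint64_t Second = Data.getULEB128(C); // no-op if the first read failed
  if (C)
    (void)(First + Second); // both values are meaningful here
  else
    Handler(C.takeError()); // report and clear the failure state
}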
+
Error DWARFDebugLine::LineTable::parse(
DWARFDataExtractor &DebugLineData, uint64_t *OffsetPtr,
const DWARFContext &Ctx, const DWARFUnit *U,
- function_ref<void(Error)> RecoverableErrorCallback, raw_ostream *OS) {
+ function_ref<void(Error)> RecoverableErrorHandler, raw_ostream *OS,
+ bool Verbose) {
+ assert((OS || !Verbose) && "cannot have verbose output without stream");
const uint64_t DebugLineOffset = *OffsetPtr;
clear();
- Error PrologueErr = Prologue.parse(DebugLineData, OffsetPtr, Ctx, U);
+ Error PrologueErr =
+ Prologue.parse(DebugLineData, OffsetPtr, RecoverableErrorHandler, Ctx, U);
if (OS) {
- // The presence of OS signals verbose dumping.
DIDumpOptions DumpOptions;
- DumpOptions.Verbose = true;
+ DumpOptions.Verbose = Verbose;
Prologue.dump(*OS, DumpOptions);
}
- if (PrologueErr)
+ if (PrologueErr) {
+ // Ensure there is a blank line after the prologue to clearly delineate it
+ // from later dumps.
+ if (OS)
+ *OS << "\n";
return PrologueErr;
+ }
uint64_t ProgramLength = Prologue.TotalLength + Prologue.sizeofTotalLength();
if (!DebugLineData.isValidOffsetForDataOfSize(DebugLineOffset,
@@ -532,7 +742,7 @@ Error DWARFDebugLine::LineTable::parse(
assert(DebugLineData.size() > DebugLineOffset &&
"prologue parsing should handle invalid offset");
uint64_t BytesRemaining = DebugLineData.size() - DebugLineOffset;
- RecoverableErrorCallback(
+ RecoverableErrorHandler(
createStringError(errc::invalid_argument,
"line table program with offset 0x%8.8" PRIx64
" has length 0x%8.8" PRIx64 " but only 0x%8.8" PRIx64
@@ -542,41 +752,62 @@ Error DWARFDebugLine::LineTable::parse(
ProgramLength = BytesRemaining;
}
+ // Create a DataExtractor which can only see the data up to the end of the
+ // table, to prevent reading past the end.
const uint64_t EndOffset = DebugLineOffset + ProgramLength;
+ DWARFDataExtractor TableData(DebugLineData, EndOffset);
// See if we should tell the data extractor the address size.
- if (DebugLineData.getAddressSize() == 0)
- DebugLineData.setAddressSize(Prologue.getAddressSize());
+ if (TableData.getAddressSize() == 0)
+ TableData.setAddressSize(Prologue.getAddressSize());
else
assert(Prologue.getAddressSize() == 0 ||
- Prologue.getAddressSize() == DebugLineData.getAddressSize());
+ Prologue.getAddressSize() == TableData.getAddressSize());
- ParsingState State(this);
+ ParsingState State(this, DebugLineOffset, RecoverableErrorHandler);
+ *OffsetPtr = DebugLineOffset + Prologue.getLength();
+ if (OS && *OffsetPtr < EndOffset) {
+ *OS << '\n';
+ Row::dumpTableHeader(*OS, /*Indent=*/Verbose ? 12 : 0);
+ }
while (*OffsetPtr < EndOffset) {
- if (OS)
+ DataExtractor::Cursor Cursor(*OffsetPtr);
+
+ if (Verbose)
*OS << format("0x%08.08" PRIx64 ": ", *OffsetPtr);
- uint8_t Opcode = DebugLineData.getU8(OffsetPtr);
+ uint64_t OpcodeOffset = *OffsetPtr;
+ uint8_t Opcode = TableData.getU8(Cursor);
+ size_t RowCount = Rows.size();
- if (OS)
+ if (Cursor && Verbose)
*OS << format("%02.02" PRIx8 " ", Opcode);
if (Opcode == 0) {
// Extended Opcodes always start with a zero opcode followed by
// a uleb128 length so you can skip ones you don't know about
- uint64_t Len = DebugLineData.getULEB128(OffsetPtr);
- uint64_t ExtOffset = *OffsetPtr;
+ uint64_t Len = TableData.getULEB128(Cursor);
+ uint64_t ExtOffset = Cursor.tell();
// Tolerate zero-length; assume length is correct and soldier on.
if (Len == 0) {
- if (OS)
+ if (Cursor && Verbose)
*OS << "Badly formed extended line op (length 0)\n";
+ if (!Cursor) {
+ if (Verbose)
+ *OS << "\n";
+ RecoverableErrorHandler(Cursor.takeError());
+ }
+ *OffsetPtr = Cursor.tell();
continue;
}
- uint8_t SubOpcode = DebugLineData.getU8(OffsetPtr);
- if (OS)
+ uint8_t SubOpcode = TableData.getU8(Cursor);
+ // OperandOffset will be the same as ExtOffset if it was not possible to
+ // read the SubOpcode.
+ uint64_t OperandOffset = Cursor.tell();
+ if (Verbose)
*OS << LNExtendedString(SubOpcode);
switch (SubOpcode) {
case DW_LNE_end_sequence:
@@ -588,11 +819,15 @@ Error DWARFDebugLine::LineTable::parse(
// address is that of the byte after the last target machine instruction
// of the sequence.
State.Row.EndSequence = true;
- if (OS) {
+ // No need to check that the Cursor is valid here, since it must be valid
+ // to reach this code path; if it were invalid, the default case would be
+ // followed.
+ if (Verbose) {
*OS << "\n";
OS->indent(12);
- State.Row.dump(*OS);
}
+ if (OS)
+ State.Row.dump(*OS);
State.appendRowToMatrix();
State.resetRowAndSequence();
break;
@@ -608,25 +843,39 @@ Error DWARFDebugLine::LineTable::parse(
// Make sure the extractor knows the address size. If not, infer it
// from the size of the operand.
{
- uint8_t ExtractorAddressSize = DebugLineData.getAddressSize();
- if (ExtractorAddressSize != Len - 1 && ExtractorAddressSize != 0)
- RecoverableErrorCallback(createStringError(
+ uint8_t ExtractorAddressSize = TableData.getAddressSize();
+ uint64_t OpcodeAddressSize = Len - 1;
+ if (ExtractorAddressSize != OpcodeAddressSize &&
+ ExtractorAddressSize != 0)
+ RecoverableErrorHandler(createStringError(
errc::invalid_argument,
"mismatching address size at offset 0x%8.8" PRIx64
" expected 0x%2.2" PRIx8 " found 0x%2.2" PRIx64,
ExtOffset, ExtractorAddressSize, Len - 1));
// Assume that the line table is correct and temporarily override the
- // address size.
- DebugLineData.setAddressSize(Len - 1);
- State.Row.Address.Address = DebugLineData.getRelocatedAddress(
- OffsetPtr, &State.Row.Address.SectionIndex);
-
- // Restore the address size if the extractor already had it.
- if (ExtractorAddressSize != 0)
- DebugLineData.setAddressSize(ExtractorAddressSize);
+ // address size. If the size is unsupported, give up trying to read
+ // the address and continue to the next opcode.
+ if (OpcodeAddressSize != 1 && OpcodeAddressSize != 2 &&
+ OpcodeAddressSize != 4 && OpcodeAddressSize != 8) {
+ RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "address size 0x%2.2" PRIx64
+ " of DW_LNE_set_address opcode at offset 0x%8.8" PRIx64
+ " is unsupported",
+ OpcodeAddressSize, ExtOffset));
+ TableData.skip(Cursor, OpcodeAddressSize);
+ } else {
+ TableData.setAddressSize(OpcodeAddressSize);
+ State.Row.Address.Address = TableData.getRelocatedAddress(
+ Cursor, &State.Row.Address.SectionIndex);
+
+ // Restore the address size if the extractor already had it.
+ if (ExtractorAddressSize != 0)
+ TableData.setAddressSize(ExtractorAddressSize);
+ }
- if (OS)
+ if (Cursor && Verbose)
*OS << format(" (0x%16.16" PRIx64 ")", State.Row.Address.Address);
}
break;
@@ -654,14 +903,14 @@ Error DWARFDebugLine::LineTable::parse(
// the file register of the state machine.
{
FileNameEntry FileEntry;
- const char *Name = DebugLineData.getCStr(OffsetPtr);
+ const char *Name = TableData.getCStr(Cursor);
FileEntry.Name =
DWARFFormValue::createFromPValue(dwarf::DW_FORM_string, Name);
- FileEntry.DirIdx = DebugLineData.getULEB128(OffsetPtr);
- FileEntry.ModTime = DebugLineData.getULEB128(OffsetPtr);
- FileEntry.Length = DebugLineData.getULEB128(OffsetPtr);
+ FileEntry.DirIdx = TableData.getULEB128(Cursor);
+ FileEntry.ModTime = TableData.getULEB128(Cursor);
+ FileEntry.Length = TableData.getULEB128(Cursor);
Prologue.FileNames.push_back(FileEntry);
- if (OS)
+ if (Cursor && Verbose)
*OS << " (" << Name << ", dir=" << FileEntry.DirIdx << ", mod_time="
<< format("(0x%16.16" PRIx64 ")", FileEntry.ModTime)
<< ", length=" << FileEntry.Length << ")";
@@ -669,41 +918,63 @@ Error DWARFDebugLine::LineTable::parse(
break;
case DW_LNE_set_discriminator:
- State.Row.Discriminator = DebugLineData.getULEB128(OffsetPtr);
- if (OS)
+ State.Row.Discriminator = TableData.getULEB128(Cursor);
+ if (Cursor && Verbose)
*OS << " (" << State.Row.Discriminator << ")";
break;
default:
- if (OS)
+ if (Cursor && Verbose)
*OS << format("Unrecognized extended op 0x%02.02" PRIx8, SubOpcode)
<< format(" length %" PRIx64, Len);
// Len doesn't include the zero opcode byte or the length itself, but
// it does include the sub_opcode, so we have to adjust for that.
- (*OffsetPtr) += Len - 1;
+ TableData.skip(Cursor, Len - 1);
break;
}
- // Make sure the stated and parsed lengths are the same.
- // Otherwise we have an unparseable line-number program.
- if (*OffsetPtr - ExtOffset != Len)
- return createStringError(errc::illegal_byte_sequence,
- "unexpected line op length at offset 0x%8.8" PRIx64
- " expected 0x%2.2" PRIx64 " found 0x%2.2" PRIx64,
- ExtOffset, Len, *OffsetPtr - ExtOffset);
+ // Make sure the length as recorded in the table and the standard length
+ // for the opcode match. If they don't, continue from the end as claimed
+ // by the table. Similarly, continue from the claimed end in the event of
+ // a parsing error.
+ uint64_t End = ExtOffset + Len;
+ if (Cursor && Cursor.tell() != End)
+ RecoverableErrorHandler(createStringError(
+ errc::illegal_byte_sequence,
+ "unexpected line op length at offset 0x%8.8" PRIx64
+ " expected 0x%2.2" PRIx64 " found 0x%2.2" PRIx64,
+ ExtOffset, Len, Cursor.tell() - ExtOffset));
+ if (!Cursor && Verbose) {
+ DWARFDataExtractor::Cursor ByteCursor(OperandOffset);
+ uint8_t Byte = TableData.getU8(ByteCursor);
+ if (ByteCursor) {
+ *OS << " (<parsing error>";
+ do {
+ *OS << format(" %2.2" PRIx8, Byte);
+ Byte = TableData.getU8(ByteCursor);
+ } while (ByteCursor);
+ *OS << ")";
+ }
+
+ // The only parse failure in this case should be if the end was reached.
+ // In that case, throw away the error, as the main Cursor's error will
+ // be sufficient.
+ consumeError(ByteCursor.takeError());
+ }
+ *OffsetPtr = End;
} else if (Opcode < Prologue.OpcodeBase) {
- if (OS)
+ if (Verbose)
*OS << LNStandardString(Opcode);
switch (Opcode) {
// Standard Opcodes
case DW_LNS_copy:
// Takes no arguments. Append a row to the matrix using the
// current values of the state-machine registers.
- if (OS) {
+ if (Verbose) {
*OS << "\n";
OS->indent(12);
- State.Row.dump(*OS);
- *OS << "\n";
}
+ if (OS)
+ State.Row.dump(*OS);
State.appendRowToMatrix();
break;
@@ -711,11 +982,11 @@ Error DWARFDebugLine::LineTable::parse(
// Takes a single unsigned LEB128 operand, multiplies it by the
// min_inst_length field of the prologue, and adds the
// result to the address register of the state machine.
- {
+ if (Optional<uint64_t> Operand =
+ parseULEB128<uint64_t>(TableData, Cursor)) {
uint64_t AddrOffset =
- DebugLineData.getULEB128(OffsetPtr) * Prologue.MinInstLength;
- State.Row.Address.Address += AddrOffset;
- if (OS)
+ State.advanceAddr(*Operand, Opcode, OpcodeOffset);
+ if (Verbose)
*OS << " (" << AddrOffset << ")";
}
break;
@@ -723,25 +994,36 @@ Error DWARFDebugLine::LineTable::parse(
case DW_LNS_advance_line:
// Takes a single signed LEB128 operand and adds that value to
// the line register of the state machine.
- State.Row.Line += DebugLineData.getSLEB128(OffsetPtr);
- if (OS)
- *OS << " (" << State.Row.Line << ")";
+ {
+ int64_t LineDelta = TableData.getSLEB128(Cursor);
+ if (Cursor) {
+ State.Row.Line += LineDelta;
+ if (Verbose)
+ *OS << " (" << State.Row.Line << ")";
+ }
+ }
break;
case DW_LNS_set_file:
// Takes a single unsigned LEB128 operand and stores it in the file
// register of the state machine.
- State.Row.File = DebugLineData.getULEB128(OffsetPtr);
- if (OS)
- *OS << " (" << State.Row.File << ")";
+ if (Optional<uint16_t> File =
+ parseULEB128<uint16_t>(TableData, Cursor)) {
+ State.Row.File = *File;
+ if (Verbose)
+ *OS << " (" << State.Row.File << ")";
+ }
break;
case DW_LNS_set_column:
// Takes a single unsigned LEB128 operand and stores it in the
// column register of the state machine.
- State.Row.Column = DebugLineData.getULEB128(OffsetPtr);
- if (OS)
- *OS << " (" << State.Row.Column << ")";
+ if (Optional<uint16_t> Column =
+ parseULEB128<uint16_t>(TableData, Cursor)) {
+ State.Row.Column = *Column;
+ if (Verbose)
+ *OS << " (" << State.Row.Column << ")";
+ }
break;
case DW_LNS_negate_stmt:
@@ -769,13 +1051,10 @@ Error DWARFDebugLine::LineTable::parse(
// than twice that range will it need to use both DW_LNS_advance_pc
// and a special opcode, requiring three or more bytes.
{
- uint8_t AdjustOpcode = 255 - Prologue.OpcodeBase;
uint64_t AddrOffset =
- (AdjustOpcode / Prologue.LineRange) * Prologue.MinInstLength;
- State.Row.Address.Address += AddrOffset;
- if (OS)
- *OS
- << format(" (0x%16.16" PRIx64 ")", AddrOffset);
+ State.advanceAddrForOpcode(Opcode, OpcodeOffset).AddrDelta;
+ if (Verbose)
+ *OS << format(" (0x%16.16" PRIx64 ")", AddrOffset);
}
break;
@@ -790,11 +1069,13 @@ Error DWARFDebugLine::LineTable::parse(
// requires the use of DW_LNS_advance_pc. Such assemblers, however,
// can use DW_LNS_fixed_advance_pc instead, sacrificing compression.
{
- uint16_t PCOffset = DebugLineData.getRelocatedValue(2, OffsetPtr);
- State.Row.Address.Address += PCOffset;
- if (OS)
- *OS
- << format(" (0x%4.4" PRIx16 ")", PCOffset);
+ uint16_t PCOffset =
+ TableData.getRelocatedValue(Cursor, 2);
+ if (Cursor) {
+ State.Row.Address.Address += PCOffset;
+ if (Verbose)
+ *OS << format(" (0x%4.4" PRIx16 ")", PCOffset);
+ }
}
break;
@@ -812,10 +1093,12 @@ Error DWARFDebugLine::LineTable::parse(
case DW_LNS_set_isa:
// Takes a single unsigned LEB128 operand and stores it in the
- // column register of the state machine.
- State.Row.Isa = DebugLineData.getULEB128(OffsetPtr);
- if (OS)
- *OS << " (" << (uint64_t)State.Row.Isa << ")";
+ // ISA register of the state machine.
+ if (Optional<uint8_t> Isa = parseULEB128<uint8_t>(TableData, Cursor)) {
+ State.Row.Isa = *Isa;
+ if (Verbose)
+ *OS << " (" << (uint64_t)State.Row.Isa << ")";
+ }
break;
default:
@@ -824,73 +1107,72 @@ Error DWARFDebugLine::LineTable::parse(
// as a multiple of LEB128 operands for each opcode.
{
assert(Opcode - 1U < Prologue.StandardOpcodeLengths.size());
+ if (Verbose)
+ *OS << "Unrecognized standard opcode";
uint8_t OpcodeLength = Prologue.StandardOpcodeLengths[Opcode - 1];
+ std::vector<uint64_t> Operands;
for (uint8_t I = 0; I < OpcodeLength; ++I) {
- uint64_t Value = DebugLineData.getULEB128(OffsetPtr);
- if (OS)
- *OS << format("Skipping ULEB128 value: 0x%16.16" PRIx64 ")\n",
- Value);
+ if (Optional<uint64_t> Value =
+ parseULEB128<uint64_t>(TableData, Cursor))
+ Operands.push_back(*Value);
+ else
+ break;
+ }
+ if (Verbose && !Operands.empty()) {
+ *OS << " (operands: ";
+ bool First = true;
+ for (uint64_t Value : Operands) {
+ if (!First)
+ *OS << ", ";
+ First = false;
+ *OS << format("0x%16.16" PRIx64, Value);
+ }
+ if (Verbose)
+ *OS << ')';
}
}
break;
}
+
+ *OffsetPtr = Cursor.tell();
} else {
- // Special Opcodes
-
- // A special opcode value is chosen based on the amount that needs
- // to be added to the line and address registers. The maximum line
- // increment for a special opcode is the value of the line_base
- // field in the header, plus the value of the line_range field,
- // minus 1 (line base + line range - 1). If the desired line
- // increment is greater than the maximum line increment, a standard
- // opcode must be used instead of a special opcode. The "address
- // advance" is calculated by dividing the desired address increment
- // by the minimum_instruction_length field from the header. The
- // special opcode is then calculated using the following formula:
- //
- // opcode = (desired line increment - line_base) +
- // (line_range * address advance) + opcode_base
- //
- // If the resulting opcode is greater than 255, a standard opcode
- // must be used instead.
- //
- // To decode a special opcode, subtract the opcode_base from the
- // opcode itself to give the adjusted opcode. The amount to
- // increment the address register is the result of the adjusted
- // opcode divided by the line_range multiplied by the
- // minimum_instruction_length field from the header. That is:
- //
- // address increment = (adjusted opcode / line_range) *
- // minimum_instruction_length
- //
- // The amount to increment the line register is the line_base plus
- // the result of the adjusted opcode modulo the line_range. That is:
- //
- // line increment = line_base + (adjusted opcode % line_range)
-
- uint8_t AdjustOpcode = Opcode - Prologue.OpcodeBase;
- uint64_t AddrOffset =
- (AdjustOpcode / Prologue.LineRange) * Prologue.MinInstLength;
- int32_t LineOffset =
- Prologue.LineBase + (AdjustOpcode % Prologue.LineRange);
- State.Row.Line += LineOffset;
- State.Row.Address.Address += AddrOffset;
-
- if (OS) {
- *OS << "address += " << AddrOffset << ", line += " << LineOffset
+ // Special Opcodes.
+ ParsingState::AddrAndLineDelta Delta =
+ State.handleSpecialOpcode(Opcode, OpcodeOffset);
+
+ if (Verbose) {
+ *OS << "address += " << Delta.Address << ", line += " << Delta.Line
<< "\n";
OS->indent(12);
- State.Row.dump(*OS);
}
+ if (OS)
+ State.Row.dump(*OS);
State.appendRowToMatrix();
+ *OffsetPtr = Cursor.tell();
}
- if(OS)
+
+ // When a row is added to the matrix, it is also dumped, which includes a
+ // new line already, so don't add an extra one.
+ if (Verbose && Rows.size() == RowCount)
*OS << "\n";
+
+ // Most parse failures other than when parsing extended opcodes are due to
+ // failures to read ULEBs. Bail out of parsing, since we don't know where to
+ // continue reading from as there is no stated length for such byte
+ // sequences. Print the final trailing new line if needed before doing so.
+ if (!Cursor && Opcode != 0) {
+ if (Verbose)
+ *OS << "\n";
+ return Cursor.takeError();
+ }
+
+ if (!Cursor)
+ RecoverableErrorHandler(Cursor.takeError());
}
if (!State.Sequence.Empty)
- RecoverableErrorCallback(createStringError(
+ RecoverableErrorHandler(createStringError(
errc::illegal_byte_sequence,
"last sequence in debug line table at offset 0x%8.8" PRIx64
" is not terminated",
@@ -907,6 +1189,11 @@ Error DWARFDebugLine::LineTable::parse(
// rudimentary sequences for address ranges [0x0, 0xsomething).
}
+ // Terminate the table with a final blank line to clearly delineate it from
+ // later dumps.
+ if (OS)
+ *OS << "\n";
+
return Error::success();
}
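// Editorial sketch (not part of this patch): a possible call site for the
// renamed recoverable handler parameter. DebugLine, Data, Ctx, U and Offset
// are hypothetical objects assumed to be set up by the caller.
static void parseOneTableExample(DWARFDebugLine &DebugLine,
                                 DWARFDataExtractor &Data, uint64_t Offset,
                                 const DWARFContext &Ctx, const DWARFUnit *U) {
  Expected<const DWARFDebugLine::LineTable *> LT =
      DebugLine.getOrParseLineTable(Data, Offset, Ctx, U, [](Error E) {
        // Recoverable problems (bad lengths, unusual opcodes) land here and
        // parsing of the rest of the table continues.
        errs() << toString(std::move(E)) << '\n';
      });
  if (!LT)
    // Unrecoverable problems are returned through the Expected instead.
    errs() << toString(LT.takeError()) << '\n';
}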
@@ -1054,9 +1341,13 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex(
if (!Name)
return false;
StringRef FileName = *Name;
- if (Kind != FileLineInfoKind::AbsoluteFilePath ||
+ if (Kind == FileLineInfoKind::RawValue ||
isPathAbsoluteOnWindowsOrPosix(FileName)) {
- Result = FileName;
+ Result = std::string(FileName);
+ return true;
+ }
+ if (Kind == FileLineInfoKind::BaseNameOnly) {
+ Result = std::string(llvm::sys::path::filename(FileName));
return true;
}
@@ -1064,23 +1355,31 @@ bool DWARFDebugLine::Prologue::getFileNameByIndex(
StringRef IncludeDir;
// Be defensive about the contents of Entry.
if (getVersion() >= 5) {
- if (Entry.DirIdx < IncludeDirectories.size())
+ // DirIdx 0 is the compilation directory, so don't include it for
+ // relative names.
+ if ((Entry.DirIdx != 0 || Kind != FileLineInfoKind::RelativeFilePath) &&
+ Entry.DirIdx < IncludeDirectories.size())
IncludeDir = IncludeDirectories[Entry.DirIdx].getAsCString().getValue();
} else {
if (0 < Entry.DirIdx && Entry.DirIdx <= IncludeDirectories.size())
IncludeDir =
IncludeDirectories[Entry.DirIdx - 1].getAsCString().getValue();
-
- // We may still need to append compilation directory of compile unit.
- // We know that FileName is not absolute, the only way to have an
- // absolute path at this point would be if IncludeDir is absolute.
- if (!CompDir.empty() && !isPathAbsoluteOnWindowsOrPosix(IncludeDir))
- sys::path::append(FilePath, Style, CompDir);
}
+ // For absolute paths only, include the compilation directory of the
+ // compile unit.
+ // We know that FileName is not absolute, the only way to have an absolute
+ // path at this point would be if IncludeDir is absolute.
+ if (Kind == FileLineInfoKind::AbsoluteFilePath && !CompDir.empty() &&
+ !isPathAbsoluteOnWindowsOrPosix(IncludeDir))
+ sys::path::append(FilePath, Style, CompDir);
+
+ assert((Kind == FileLineInfoKind::AbsoluteFilePath ||
+ Kind == FileLineInfoKind::RelativeFilePath) &&
+ "invalid FileLineInfo Kind");
+
// sys::path::append skips empty strings.
sys::path::append(FilePath, Style, IncludeDir, FileName);
- Result = FilePath.str();
+ Result = std::string(FilePath.str());
return true;
}
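// Editorial sketch (not part of this patch): the effect of each
// FileLineInfoKind handled above, for a hypothetical pre-v5 entry with file
// name "a.c", include directory "dir" and compilation directory "/comp"
// (POSIX-style separators assumed):
//
//   RawValue         -> "a.c"           (stored name returned untouched)
//   BaseNameOnly     -> "a.c"           (filename component only)
//   RelativeFilePath -> "dir/a.c"       (no compilation directory prepended)
//   AbsoluteFilePath -> "/comp/dir/a.c" (compilation directory prepended)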
@@ -1131,34 +1430,36 @@ DWARFDebugLine::SectionParser::SectionParser(DWARFDataExtractor &Data,
}
bool DWARFDebugLine::Prologue::totalLengthIsValid() const {
- return TotalLength == dwarf::DW_LENGTH_DWARF64 ||
- TotalLength < dwarf::DW_LENGTH_lo_reserved;
+ return TotalLength != 0u;
}
DWARFDebugLine::LineTable DWARFDebugLine::SectionParser::parseNext(
- function_ref<void(Error)> RecoverableErrorCallback,
- function_ref<void(Error)> UnrecoverableErrorCallback, raw_ostream *OS) {
+ function_ref<void(Error)> RecoverableErrorHandler,
+ function_ref<void(Error)> UnrecoverableErrorHandler, raw_ostream *OS,
+ bool Verbose) {
assert(DebugLineData.isValidOffset(Offset) &&
"parsing should have terminated");
DWARFUnit *U = prepareToParse(Offset);
uint64_t OldOffset = Offset;
LineTable LT;
if (Error Err = LT.parse(DebugLineData, &Offset, Context, U,
- RecoverableErrorCallback, OS))
- UnrecoverableErrorCallback(std::move(Err));
+ RecoverableErrorHandler, OS, Verbose))
+ UnrecoverableErrorHandler(std::move(Err));
moveToNextTable(OldOffset, LT.Prologue);
return LT;
}
void DWARFDebugLine::SectionParser::skip(
- function_ref<void(Error)> ErrorCallback) {
+ function_ref<void(Error)> RecoverableErrorHandler,
+ function_ref<void(Error)> UnrecoverableErrorHandler) {
assert(DebugLineData.isValidOffset(Offset) &&
"parsing should have terminated");
DWARFUnit *U = prepareToParse(Offset);
uint64_t OldOffset = Offset;
LineTable LT;
- if (Error Err = LT.Prologue.parse(DebugLineData, &Offset, Context, U))
- ErrorCallback(std::move(Err));
+ if (Error Err = LT.Prologue.parse(DebugLineData, &Offset,
+ RecoverableErrorHandler, Context, U))
+ UnrecoverableErrorHandler(std::move(Err));
moveToNextTable(OldOffset, LT.Prologue);
}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
index 0c5f9a9c54ec..f38126364401 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugLoc.cpp
@@ -106,16 +106,15 @@ DWARFLocationInterpreter::Interpret(const DWARFLocationEntry &E) {
}
}
-// When directly dumping the .debug_loc without a compile unit, we have to guess
-// at the DWARF version. This only affects DW_OP_call_ref, which is a rare
-// expression that LLVM doesn't produce. Guessing the wrong version means we
-// won't be able to pretty print expressions in DWARF2 binaries produced by
-// non-LLVM tools.
static void dumpExpression(raw_ostream &OS, ArrayRef<uint8_t> Data,
bool IsLittleEndian, unsigned AddressSize,
const MCRegisterInfo *MRI, DWARFUnit *U) {
- DWARFDataExtractor Extractor(toStringRef(Data), IsLittleEndian, AddressSize);
- DWARFExpression(Extractor, dwarf::DWARF_VERSION, AddressSize).print(OS, MRI, U);
+ DWARFDataExtractor Extractor(Data, IsLittleEndian, AddressSize);
+ // Note. We do not pass any format to DWARFExpression, even if the
+ // corresponding unit is known. For now, there is only one operation,
+ // DW_OP_call_ref, which depends on the format; it is rarely used, and
+ // is unexpected in location tables.
+ DWARFExpression(Extractor, AddressSize).print(OS, MRI, U);
}
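// Editorial sketch (not part of this patch): because no DwarfFormat is passed
// here, Operation::extract() returns false for operations whose operand size
// depends on the format (the SizeRefAddr case, i.e. DW_OP_call_ref), so such
// an expression is reported as a decoding error rather than decoded with a
// guessed DWARF32/DWARF64 offset size.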
bool DWARFLocationTable::dumpLocationList(uint64_t *Offset, raw_ostream &OS,
@@ -161,9 +160,7 @@ bool DWARFLocationTable::dumpLocationList(uint64_t *Offset, raw_ostream &OS,
return true;
});
if (E) {
- OS << "\n";
- OS.indent(Indent);
- OS << "error: " << toString(std::move(E));
+ DumpOpts.RecoverableErrorHandler(std::move(E));
return false;
}
return true;
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
index 8cb259ebc622..f920d69cc43f 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugMacro.cpp
@@ -8,6 +8,8 @@
#include "llvm/DebugInfo/DWARF/DWARFDebugMacro.h"
#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/Support/WithColor.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
@@ -15,10 +17,32 @@
using namespace llvm;
using namespace dwarf;
+DwarfFormat DWARFDebugMacro::MacroHeader::getDwarfFormat() const {
+ return Flags & MACRO_OFFSET_SIZE ? DWARF64 : DWARF32;
+}
+
+uint8_t DWARFDebugMacro::MacroHeader::getOffsetByteSize() const {
+ return getDwarfOffsetByteSize(getDwarfFormat());
+}
+
+void DWARFDebugMacro::MacroHeader::dumpMacroHeader(raw_ostream &OS) const {
+ // FIXME: Add support for dumping opcode_operands_table
+ OS << format("macro header: version = 0x%04" PRIx16, Version)
+ << format(", flags = 0x%02" PRIx8, Flags)
+ << ", format = " << FormatString(getDwarfFormat());
+ if (Flags & MACRO_DEBUG_LINE_OFFSET)
+ OS << format(", debug_line_offset = 0x%0*" PRIx64, 2 * getOffsetByteSize(),
+ DebugLineOffset);
+ OS << "\n";
+}
+
void DWARFDebugMacro::dump(raw_ostream &OS) const {
unsigned IndLevel = 0;
for (const auto &Macros : MacroLists) {
- for (const Entry &E : Macros) {
+ OS << format("0x%08" PRIx64 ":\n", Macros.Offset);
+ if (Macros.Header.Version >= 5)
+ Macros.Header.dumpMacroHeader(OS);
+ for (const Entry &E : Macros.Macros) {
// There should not be DW_MACINFO_end_file when IndLevel is zero. However,
// this check handles the case of a corrupted ".debug_macinfo" section.
if (IndLevel > 0)
@@ -27,22 +51,40 @@ void DWARFDebugMacro::dump(raw_ostream &OS) const {
for (unsigned I = 0; I < IndLevel; I++)
OS << " ";
IndLevel += (E.Type == DW_MACINFO_start_file);
-
- WithColor(OS, HighlightColor::Macro).get() << MacinfoString(E.Type);
+ // Based on which version we are handling, choose the appropriate macro forms.
+ if (Macros.Header.Version >= 5)
+ WithColor(OS, HighlightColor::Macro).get() << MacroString(E.Type);
+ else
+ WithColor(OS, HighlightColor::Macro).get() << MacinfoString(E.Type);
switch (E.Type) {
default:
- // Got a corrupted ".debug_macinfo" section (invalid macinfo type).
+ // Got a corrupted ".debug_macinfo/.debug_macro" section (invalid
+ // macinfo type).
break;
- case DW_MACINFO_define:
- case DW_MACINFO_undef:
+ // debug_macro and debug_macinfo share some common encodings.
+ // DW_MACRO_define == DW_MACINFO_define
+ // DW_MACRO_undef == DW_MACINFO_undef
+ // DW_MACRO_start_file == DW_MACINFO_start_file
+ // DW_MACRO_end_file == DW_MACINFO_end_file
+ // For readability/uniformity we are using DW_MACRO_*.
+ case DW_MACRO_define:
+ case DW_MACRO_undef:
+ case DW_MACRO_define_strp:
+ case DW_MACRO_undef_strp:
+ case DW_MACRO_define_strx:
+ case DW_MACRO_undef_strx:
OS << " - lineno: " << E.Line;
OS << " macro: " << E.MacroStr;
break;
- case DW_MACINFO_start_file:
+ case DW_MACRO_start_file:
OS << " - lineno: " << E.Line;
OS << " filenum: " << E.File;
break;
- case DW_MACINFO_end_file:
+ case DW_MACRO_import:
+ OS << format(" - import offset: 0x%0*" PRIx64,
+ 2 * Macros.Header.getOffsetByteSize(), E.ImportOffset);
+ break;
+ case DW_MACRO_end_file:
break;
case DW_MACINFO_vendor_ext:
OS << " - constant: " << E.ExtConstant;
@@ -51,26 +93,46 @@ void DWARFDebugMacro::dump(raw_ostream &OS) const {
}
OS << "\n";
}
- OS << "\n";
}
}
-void DWARFDebugMacro::parse(DataExtractor data) {
+Error DWARFDebugMacro::parseImpl(
+ Optional<DWARFUnitVector::iterator_range> Units,
+ Optional<DataExtractor> StringExtractor, DWARFDataExtractor Data,
+ bool IsMacro) {
uint64_t Offset = 0;
MacroList *M = nullptr;
- while (data.isValidOffset(Offset)) {
+ using MacroToUnitsMap = DenseMap<uint64_t, DWARFUnit *>;
+ MacroToUnitsMap MacroToUnits;
+ if (IsMacro && Data.isValidOffset(Offset)) {
+ // Keep a mapping from Macro contribution to CUs, this will
+ // be needed while retrieving macro from DW_MACRO_define_strx form.
+ for (const auto &U : Units.getValue())
+ if (auto CUDIE = U->getUnitDIE())
+ // Skip units which do not contribute to the macro section.
+ if (auto MacroOffset = toSectionOffset(CUDIE.find(DW_AT_macros)))
+ MacroToUnits.try_emplace(*MacroOffset, U.get());
+ }
+ while (Data.isValidOffset(Offset)) {
if (!M) {
MacroLists.emplace_back();
M = &MacroLists.back();
+ M->Offset = Offset;
+ if (IsMacro) {
+ auto Err = M->Header.parseMacroHeader(Data, &Offset);
+ if (Err)
+ return Err;
+ }
}
// A macro list entry consists of:
- M->emplace_back();
- Entry &E = M->back();
+ M->Macros.emplace_back();
+ Entry &E = M->Macros.back();
// 1. Macinfo type
- E.Type = data.getULEB128(&Offset);
+ E.Type = Data.getULEB128(&Offset);
if (E.Type == 0) {
- // Reached end of a ".debug_macinfo" section contribution.
+ // Reached end of a ".debug_macinfo/.debug_macro" section contribution.
+ M = nullptr;
continue;
}
@@ -79,28 +141,99 @@ void DWARFDebugMacro::parse(DataExtractor data) {
// Got a corrupted ".debug_macinfo" section (invalid macinfo type).
// Push the corrupted entry to the list and halt parsing.
E.Type = DW_MACINFO_invalid;
- return;
- case DW_MACINFO_define:
- case DW_MACINFO_undef:
+ return Error::success();
+ // debug_macro and debug_macinfo share some common encodings.
+ // DW_MACRO_define == DW_MACINFO_define
+ // DW_MACRO_undef == DW_MACINFO_undef
+ // DW_MACRO_start_file == DW_MACINFO_start_file
+ // DW_MACRO_end_file == DW_MACINFO_end_file
+ // For readability/uniformity we are using DW_MACRO_*.
+ case DW_MACRO_define:
+ case DW_MACRO_undef:
+ // 2. Source line
+ E.Line = Data.getULEB128(&Offset);
+ // 3. Macro string
+ E.MacroStr = Data.getCStr(&Offset);
+ break;
+ case DW_MACRO_define_strp:
+ case DW_MACRO_undef_strp: {
+ if (!IsMacro) {
+ // DW_MACRO_define_strp is a new form introduced in DWARFv5; it is
+ // not supported in debug_macinfo[.dwo] sections. Treat it as an
+ // invalid entry, push it, and halt parsing.
+ E.Type = DW_MACINFO_invalid;
+ return Error::success();
+ }
+ uint64_t StrOffset = 0;
// 2. Source line
- E.Line = data.getULEB128(&Offset);
+ E.Line = Data.getULEB128(&Offset);
// 3. Macro string
- E.MacroStr = data.getCStr(&Offset);
+ StrOffset =
+ Data.getRelocatedValue(M->Header.getOffsetByteSize(), &Offset);
+ assert(StringExtractor && "String Extractor not found");
+ E.MacroStr = StringExtractor->getCStr(&StrOffset);
+ break;
+ }
+ case DW_MACRO_define_strx:
+ case DW_MACRO_undef_strx: {
+ if (!IsMacro) {
+ // DW_MACRO_define_strx is a new form introduced in DWARFv5; it is
+ // not supported in debug_macinfo[.dwo] sections. Treat it as an
+ // invalid entry, push it, and halt parsing.
+ E.Type = DW_MACINFO_invalid;
+ return Error::success();
+ }
+ E.Line = Data.getULEB128(&Offset);
+ auto MacroContributionOffset = MacroToUnits.find(M->Offset);
+ if (MacroContributionOffset == MacroToUnits.end())
+ return createStringError(errc::invalid_argument,
+ "Macro contribution of the unit not found");
+ Optional<uint64_t> StrOffset =
+ MacroContributionOffset->second->getStringOffsetSectionItem(
+ Data.getULEB128(&Offset));
+ if (!StrOffset)
+ return createStringError(
+ errc::invalid_argument,
+ "String offsets contribution of the unit not found");
+ E.MacroStr =
+ MacroContributionOffset->second->getStringExtractor().getCStr(
+ &*StrOffset);
break;
- case DW_MACINFO_start_file:
+ }
+ case DW_MACRO_start_file:
// 2. Source line
- E.Line = data.getULEB128(&Offset);
+ E.Line = Data.getULEB128(&Offset);
// 3. Source file id
- E.File = data.getULEB128(&Offset);
+ E.File = Data.getULEB128(&Offset);
+ break;
+ case DW_MACRO_end_file:
break;
- case DW_MACINFO_end_file:
+ case DW_MACRO_import:
+ E.ImportOffset =
+ Data.getRelocatedValue(M->Header.getOffsetByteSize(), &Offset);
break;
case DW_MACINFO_vendor_ext:
// 2. Vendor extension constant
- E.ExtConstant = data.getULEB128(&Offset);
+ E.ExtConstant = Data.getULEB128(&Offset);
// 3. Vendor extension string
- E.ExtStr = data.getCStr(&Offset);
+ E.ExtStr = Data.getCStr(&Offset);
break;
}
}
+ return Error::success();
+}
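// Editorial sketch (not part of this patch): the lookup chain used above for
// DW_MACRO_define_strx / DW_MACRO_undef_strx, assuming a unit whose
// DW_AT_macros attribute points at this contribution:
//
//   ULEB operand (string index)
//     -> DWARFUnit::getStringOffsetSectionItem(index)  // .debug_str_offsets
//     -> offset into the string section
//     -> getStringExtractor().getCStr(&offset)         // the macro string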
+
+Error DWARFDebugMacro::MacroHeader::parseMacroHeader(DWARFDataExtractor Data,
+ uint64_t *Offset) {
+ Version = Data.getU16(Offset);
+ uint8_t FlagData = Data.getU8(Offset);
+
+ // FIXME: Add support for parsing opcode_operands_table
+ if (FlagData & MACRO_OPCODE_OPERANDS_TABLE)
+ return createStringError(errc::not_supported,
+ "opcode_operands_table is not supported");
+ Flags = FlagData;
+ if (Flags & MACRO_DEBUG_LINE_OFFSET)
+ DebugLineOffset = Data.getUnsigned(Offset, getOffsetByteSize());
+ return Error::success();
}
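// Editorial sketch (not part of this patch): the byte layout consumed by
// parseMacroHeader() above, at the start of a DWARFv5 .debug_macro
// contribution:
//
//   version           : 2 bytes (expected to be 5)
//   flags             : 1 byte
//     bit 0 MACRO_OFFSET_SIZE           -> DWARF64 offsets when set
//     bit 1 MACRO_DEBUG_LINE_OFFSET     -> a debug_line_offset field follows
//     bit 2 MACRO_OPCODE_OPERANDS_TABLE -> rejected above (not yet supported)
//   debug_line_offset : 4 or 8 bytes, present only when bit 1 is set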
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
index ab71b239cb67..5031acdb54ef 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDebugPubTable.cpp
@@ -11,6 +11,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>
@@ -18,44 +19,92 @@
using namespace llvm;
using namespace dwarf;
-DWARFDebugPubTable::DWARFDebugPubTable(const DWARFObject &Obj,
- const DWARFSection &Sec,
- bool LittleEndian, bool GnuStyle)
- : GnuStyle(GnuStyle) {
- DWARFDataExtractor PubNames(Obj, Sec, LittleEndian, 0);
+void DWARFDebugPubTable::extract(
+ DWARFDataExtractor Data, bool GnuStyle,
+ function_ref<void(Error)> RecoverableErrorHandler) {
+ this->GnuStyle = GnuStyle;
+ Sets.clear();
uint64_t Offset = 0;
- while (PubNames.isValidOffset(Offset)) {
+ while (Data.isValidOffset(Offset)) {
+ uint64_t SetOffset = Offset;
Sets.push_back({});
- Set &SetData = Sets.back();
+ Set &NewSet = Sets.back();
- SetData.Length = PubNames.getU32(&Offset);
- SetData.Version = PubNames.getU16(&Offset);
- SetData.Offset = PubNames.getRelocatedValue(4, &Offset);
- SetData.Size = PubNames.getU32(&Offset);
+ DataExtractor::Cursor C(Offset);
+ std::tie(NewSet.Length, NewSet.Format) = Data.getInitialLength(C);
+ if (!C) {
+ // Drop the newly added set because it does not contain anything useful
+ // to dump.
+ Sets.pop_back();
+ RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "name lookup table at offset 0x%" PRIx64 " parsing failed: %s",
+ SetOffset, toString(C.takeError()).c_str()));
+ return;
+ }
+
+ Offset = C.tell() + NewSet.Length;
+ DWARFDataExtractor SetData(Data, Offset);
+ const unsigned OffsetSize = dwarf::getDwarfOffsetByteSize(NewSet.Format);
- while (Offset < Sec.Data.size()) {
- uint32_t DieRef = PubNames.getU32(&Offset);
+ NewSet.Version = SetData.getU16(C);
+ NewSet.Offset = SetData.getRelocatedValue(C, OffsetSize);
+ NewSet.Size = SetData.getUnsigned(C, OffsetSize);
+
+ if (!C) {
+ // Preserve the newly added set because at least some fields of the header
+ // are read and can be dumped.
+ RecoverableErrorHandler(
+ createStringError(errc::invalid_argument,
+ "name lookup table at offset 0x%" PRIx64
+ " does not have a complete header: %s",
+ SetOffset, toString(C.takeError()).c_str()));
+ continue;
+ }
+
+ while (C) {
+ uint64_t DieRef = SetData.getUnsigned(C, OffsetSize);
if (DieRef == 0)
break;
- uint8_t IndexEntryValue = GnuStyle ? PubNames.getU8(&Offset) : 0;
- StringRef Name = PubNames.getCStrRef(&Offset);
- SetData.Entries.push_back(
- {DieRef, PubIndexEntryDescriptor(IndexEntryValue), Name});
+ uint8_t IndexEntryValue = GnuStyle ? SetData.getU8(C) : 0;
+ StringRef Name = SetData.getCStrRef(C);
+ if (C)
+ NewSet.Entries.push_back(
+ {DieRef, PubIndexEntryDescriptor(IndexEntryValue), Name});
+ }
+
+ if (!C) {
+ RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "name lookup table at offset 0x%" PRIx64 " parsing failed: %s",
+ SetOffset, toString(C.takeError()).c_str()));
+ continue;
}
+ if (C.tell() != Offset)
+ RecoverableErrorHandler(createStringError(
+ errc::invalid_argument,
+ "name lookup table at offset 0x%" PRIx64
+ " has a terminator at offset 0x%" PRIx64
+ " before the expected end at 0x%" PRIx64,
+ SetOffset, C.tell() - OffsetSize, Offset - OffsetSize));
}
}
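// Editorial sketch (not part of this patch): the truncation idiom used above,
// and also by the line-table parser. Constructing a DWARFDataExtractor from
// an existing one with a length equal to the absolute end offset of the
// current set yields a view that cannot read past that end:
//
//   DWARFDataExtractor SetData(Data, /*Length=*/Offset);
//
// so a malformed length cannot make the parser consume bytes belonging to the
// next name lookup set; reads past the end fail through the Cursor instead.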
void DWARFDebugPubTable::dump(raw_ostream &OS) const {
for (const Set &S : Sets) {
- OS << "length = " << format("0x%08x", S.Length);
- OS << " version = " << format("0x%04x", S.Version);
- OS << " unit_offset = " << format("0x%08" PRIx64, S.Offset);
- OS << " unit_size = " << format("0x%08x", S.Size) << '\n';
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(S.Format);
+ OS << "length = " << format("0x%0*" PRIx64, OffsetDumpWidth, S.Length);
+ OS << ", format = " << dwarf::FormatString(S.Format);
+ OS << ", version = " << format("0x%04x", S.Version);
+ OS << ", unit_offset = "
+ << format("0x%0*" PRIx64, OffsetDumpWidth, S.Offset);
+ OS << ", unit_size = " << format("0x%0*" PRIx64, OffsetDumpWidth, S.Size)
+ << '\n';
OS << (GnuStyle ? "Offset Linkage Kind Name\n"
: "Offset Name\n");
for (const Entry &E : S.Entries) {
- OS << format("0x%8.8" PRIx64 " ", E.SecOffset);
+ OS << format("0x%0*" PRIx64 " ", OffsetDumpWidth, E.SecOffset);
if (GnuStyle) {
StringRef EntryLinkage =
GDBIndexEntryLinkageString(E.Descriptor.Linkage);
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
index c1dc3b68c6ab..81a6b5dcd5e7 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFDie.cpp
@@ -79,7 +79,7 @@ static void dumpLocation(raw_ostream &OS, DWARFFormValue &FormValue,
ArrayRef<uint8_t> Expr = *FormValue.getAsBlock();
DataExtractor Data(StringRef((const char *)Expr.data(), Expr.size()),
Ctx.isLittleEndian(), 0);
- DWARFExpression(Data, U->getVersion(), U->getAddressByteSize())
+ DWARFExpression(Data, U->getAddressByteSize(), U->getFormParams().Format)
.print(OS, MRI, U);
return;
}
@@ -317,8 +317,9 @@ static void dumpAttribute(raw_ostream &OS, const DWARFDie &Die,
dumpRanges(Obj, OS, RangesOrError.get(), U->getAddressByteSize(),
sizeof(BaseIndent) + Indent + 4, DumpOpts);
else
- WithColor::error() << "decoding address ranges: "
- << toString(RangesOrError.takeError()) << '\n';
+ DumpOpts.RecoverableErrorHandler(createStringError(
+ errc::invalid_argument, "decoding address ranges: %s",
+ toString(RangesOrError.takeError()).c_str()));
}
OS << ")\n";
@@ -356,7 +357,7 @@ DWARFDie::find(ArrayRef<dwarf::Attribute> Attrs) const {
Optional<DWARFFormValue>
DWARFDie::findRecursively(ArrayRef<dwarf::Attribute> Attrs) const {
- std::vector<DWARFDie> Worklist;
+ SmallVector<DWARFDie, 3> Worklist;
Worklist.push_back(*this);
// Keep track if DIEs already seen to prevent infinite recursion.
@@ -531,14 +532,26 @@ const char *DWARFDie::getName(DINameKind Kind) const {
return nullptr;
// Try to get mangled name only if it was asked for.
if (Kind == DINameKind::LinkageName) {
- if (auto Name = dwarf::toString(
- findRecursively({DW_AT_MIPS_linkage_name, DW_AT_linkage_name}),
- nullptr))
+ if (auto Name = getLinkageName())
return Name;
}
- if (auto Name = dwarf::toString(findRecursively(DW_AT_name), nullptr))
- return Name;
- return nullptr;
+ return getShortName();
+}
+
+const char *DWARFDie::getShortName() const {
+ if (!isValid())
+ return nullptr;
+
+ return dwarf::toString(findRecursively(dwarf::DW_AT_name), nullptr);
+}
+
+const char *DWARFDie::getLinkageName() const {
+ if (!isValid())
+ return nullptr;
+
+ return dwarf::toString(findRecursively({dwarf::DW_AT_MIPS_linkage_name,
+ dwarf::DW_AT_linkage_name}),
+ nullptr);
}
uint64_t DWARFDie::getDeclLine() const {
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
index 7d817d8a9925..de5e11e084f4 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFExpression.cpp
@@ -8,7 +8,6 @@
#include "llvm/DebugInfo/DWARF/DWARFExpression.h"
#include "llvm/DebugInfo/DWARF/DWARFUnit.h"
-#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Format.h"
#include <cassert>
@@ -94,7 +93,7 @@ static DescVector getDescriptions() {
Desc(Op::Dwarf3, Op::SizeLEB, Op::SizeBlock);
Descriptions[DW_OP_stack_value] = Desc(Op::Dwarf3);
Descriptions[DW_OP_WASM_location] =
- Desc(Op::Dwarf4, Op::SizeLEB, Op::SignedSizeLEB);
+ Desc(Op::Dwarf4, Op::SizeLEB, Op::WasmLocationArg);
Descriptions[DW_OP_GNU_push_tls_address] = Desc(Op::Dwarf3);
Descriptions[DW_OP_addrx] = Desc(Op::Dwarf4, Op::SizeLEB);
Descriptions[DW_OP_GNU_addr_index] = Desc(Op::Dwarf4, Op::SizeLEB);
@@ -103,6 +102,8 @@ static DescVector getDescriptions() {
Descriptions[DW_OP_convert] = Desc(Op::Dwarf5, Op::BaseTypeRef);
Descriptions[DW_OP_entry_value] = Desc(Op::Dwarf5, Op::SizeLEB);
+ Descriptions[DW_OP_regval_type] =
+ Desc(Op::Dwarf5, Op::SizeLEB, Op::BaseTypeRef);
return Descriptions;
}
@@ -116,19 +117,15 @@ static DWARFExpression::Operation::Description getOpDesc(unsigned OpCode) {
return Descriptions[OpCode];
}
-static uint8_t getRefAddrSize(uint8_t AddrSize, uint16_t Version) {
- return (Version == 2) ? AddrSize : 4;
-}
-
-bool DWARFExpression::Operation::extract(DataExtractor Data, uint16_t Version,
- uint8_t AddressSize, uint64_t Offset) {
+bool DWARFExpression::Operation::extract(DataExtractor Data,
+ uint8_t AddressSize, uint64_t Offset,
+ Optional<DwarfFormat> Format) {
+ EndOffset = Offset;
Opcode = Data.getU8(&Offset);
Desc = getOpDesc(Opcode);
- if (Desc.Version == Operation::DwarfNA) {
- EndOffset = Offset;
+ if (Desc.Version == Operation::DwarfNA)
return false;
- }
for (unsigned Operand = 0; Operand < 2; ++Operand) {
unsigned Size = Desc.Op[Operand];
@@ -157,24 +154,13 @@ bool DWARFExpression::Operation::extract(DataExtractor Data, uint16_t Version,
Operands[Operand] = Data.getU64(&Offset);
break;
case Operation::SizeAddr:
- if (AddressSize == 8) {
- Operands[Operand] = Data.getU64(&Offset);
- } else if (AddressSize == 4) {
- Operands[Operand] = Data.getU32(&Offset);
- } else {
- assert(AddressSize == 2);
- Operands[Operand] = Data.getU16(&Offset);
- }
+ Operands[Operand] = Data.getUnsigned(&Offset, AddressSize);
break;
case Operation::SizeRefAddr:
- if (getRefAddrSize(AddressSize, Version) == 8) {
- Operands[Operand] = Data.getU64(&Offset);
- } else if (getRefAddrSize(AddressSize, Version) == 4) {
- Operands[Operand] = Data.getU32(&Offset);
- } else {
- assert(getRefAddrSize(AddressSize, Version) == 2);
- Operands[Operand] = Data.getU16(&Offset);
- }
+ if (!Format)
+ return false;
+ Operands[Operand] =
+ Data.getUnsigned(&Offset, dwarf::getDwarfOffsetByteSize(*Format));
break;
case Operation::SizeLEB:
if (Signed)
@@ -185,6 +171,19 @@ bool DWARFExpression::Operation::extract(DataExtractor Data, uint16_t Version,
case Operation::BaseTypeRef:
Operands[Operand] = Data.getULEB128(&Offset);
break;
+ case Operation::WasmLocationArg:
+ assert(Operand == 1);
+ switch (Operands[0]) {
+ case 0: case 1: case 2:
+ Operands[Operand] = Data.getULEB128(&Offset);
+ break;
+ case 3: // global as uint32
+ Operands[Operand] = Data.getU32(&Offset);
+ break;
+ default:
+ return false; // Unknown Wasm location
+ }
+ break;
case Operation::SizeBlock:
// We need a size, so this cannot be the first operand
if (Operand == 0)
@@ -204,7 +203,21 @@ bool DWARFExpression::Operation::extract(DataExtractor Data, uint16_t Version,
return true;
}
-static bool prettyPrintRegisterOp(raw_ostream &OS, uint8_t Opcode,
+static void prettyPrintBaseTypeRef(DWARFUnit *U, raw_ostream &OS,
+ uint64_t Operands[2], unsigned Operand) {
+ assert(Operand < 2 && "operand out of bounds");
+ auto Die = U->getDIEForOffset(U->getOffset() + Operands[Operand]);
+ if (Die && Die.getTag() == dwarf::DW_TAG_base_type) {
+ OS << format(" (0x%08" PRIx64 ")", U->getOffset() + Operands[Operand]);
+ if (auto Name = Die.find(dwarf::DW_AT_name))
+ OS << " \"" << Name->getAsCString() << "\"";
+ } else {
+ OS << format(" <invalid base_type ref: 0x%" PRIx64 ">",
+ Operands[Operand]);
+ }
+}
+
+static bool prettyPrintRegisterOp(DWARFUnit *U, raw_ostream &OS, uint8_t Opcode,
uint64_t Operands[2],
const MCRegisterInfo *MRI, bool isEH) {
if (!MRI)
@@ -213,7 +226,8 @@ static bool prettyPrintRegisterOp(raw_ostream &OS, uint8_t Opcode,
uint64_t DwarfRegNum;
unsigned OpNum = 0;
- if (Opcode == DW_OP_bregx || Opcode == DW_OP_regx)
+ if (Opcode == DW_OP_bregx || Opcode == DW_OP_regx ||
+ Opcode == DW_OP_regval_type)
DwarfRegNum = Operands[OpNum++];
else if (Opcode >= DW_OP_breg0 && Opcode < DW_OP_bregx)
DwarfRegNum = Opcode - DW_OP_breg0;
@@ -227,6 +241,9 @@ static bool prettyPrintRegisterOp(raw_ostream &OS, uint8_t Opcode,
OS << format(" %s%+" PRId64, RegName, Operands[OpNum]);
else
OS << ' ' << RegName;
+
+ if (Opcode == DW_OP_regval_type)
+ prettyPrintBaseTypeRef(U, OS, Operands, 1);
return true;
}
}
@@ -250,8 +267,9 @@ bool DWARFExpression::Operation::print(raw_ostream &OS,
if ((Opcode >= DW_OP_breg0 && Opcode <= DW_OP_breg31) ||
(Opcode >= DW_OP_reg0 && Opcode <= DW_OP_reg31) ||
- Opcode == DW_OP_bregx || Opcode == DW_OP_regx)
- if (prettyPrintRegisterOp(OS, Opcode, Operands, RegInfo, isEH))
+ Opcode == DW_OP_bregx || Opcode == DW_OP_regx ||
+ Opcode == DW_OP_regval_type)
+ if (prettyPrintRegisterOp(U, OS, Opcode, Operands, RegInfo, isEH))
return true;
for (unsigned Operand = 0; Operand < 2; ++Operand) {
@@ -262,14 +280,21 @@ bool DWARFExpression::Operation::print(raw_ostream &OS,
break;
if (Size == Operation::BaseTypeRef && U) {
- auto Die = U->getDIEForOffset(U->getOffset() + Operands[Operand]);
- if (Die && Die.getTag() == dwarf::DW_TAG_base_type) {
- OS << format(" (0x%08" PRIx64 ")", U->getOffset() + Operands[Operand]);
- if (auto Name = Die.find(dwarf::DW_AT_name))
- OS << " \"" << Name->getAsCString() << "\"";
- } else {
- OS << format(" <invalid base_type ref: 0x%" PRIx64 ">",
- Operands[Operand]);
+ // For DW_OP_convert the operand may be 0 to indicate that conversion to
+ // the generic type should be done. The same holds for DW_OP_reinterpret,
+ // which is currently not supported.
+ if (Opcode == DW_OP_convert && Operands[Operand] == 0)
+ OS << " 0x0";
+ else
+ prettyPrintBaseTypeRef(U, OS, Operands, Operand);
+ } else if (Size == Operation::WasmLocationArg) {
+ assert(Operand == 1);
+ switch (Operands[0]) {
+ case 0: case 1: case 2:
+ case 3: // global as uint32
+ OS << format(" 0x%" PRIx64, Operands[Operand]);
+ break;
+ default: assert(false);
}
} else if (Size == Operation::SizeBlock) {
uint64_t Offset = Operands[Operand];
@@ -324,6 +349,12 @@ bool DWARFExpression::Operation::verify(DWARFUnit *U) {
break;
if (Size == Operation::BaseTypeRef) {
+ // For DW_OP_convert the operand may be 0 to indicate that conversion to
+ // the generic type should be done, so don't look up a base type in that
+ // case. The same holds for DW_OP_reinterpret, which is currently not
+ // supported.
+ if (Opcode == DW_OP_convert && Operands[Operand] == 0)
+ continue;
auto Die = U->getDIEForOffset(U->getOffset() + Operands[Operand]);
if (!Die || Die.getTag() != dwarf::DW_TAG_base_type) {
Error = true;
@@ -343,4 +374,126 @@ bool DWARFExpression::verify(DWARFUnit *U) {
return true;
}
+/// A user-facing string representation of a DWARF expression. This might be an
+/// Address expression, in which case it will be implicitly dereferenced, or a
+/// Value expression.
+struct PrintedExpr {
+ enum ExprKind {
+ Address,
+ Value,
+ };
+ ExprKind Kind;
+ SmallString<16> String;
+
+ PrintedExpr(ExprKind K = Address) : Kind(K) {}
+};
+
+static bool printCompactDWARFExpr(raw_ostream &OS, DWARFExpression::iterator I,
+ const DWARFExpression::iterator E,
+ const MCRegisterInfo &MRI) {
+ SmallVector<PrintedExpr, 4> Stack;
+
+ while (I != E) {
+ DWARFExpression::Operation &Op = *I;
+ uint8_t Opcode = Op.getCode();
+ switch (Opcode) {
+ case dwarf::DW_OP_regx: {
+ // DW_OP_regx: A register, with the register num given as an operand.
+ // Printed as the plain register name.
+ uint64_t DwarfRegNum = Op.getRawOperand(0);
+ Optional<unsigned> LLVMRegNum = MRI.getLLVMRegNum(DwarfRegNum, false);
+ if (!LLVMRegNum) {
+ OS << "<unknown register " << DwarfRegNum << ">";
+ return false;
+ }
+ raw_svector_ostream S(Stack.emplace_back(PrintedExpr::Value).String);
+ S << MRI.getName(*LLVMRegNum);
+ break;
+ }
+ case dwarf::DW_OP_bregx: {
+ int DwarfRegNum = Op.getRawOperand(0);
+ int64_t Offset = Op.getRawOperand(1);
+ Optional<unsigned> LLVMRegNum = MRI.getLLVMRegNum(DwarfRegNum, false);
+ if (!LLVMRegNum) {
+ OS << "<unknown register " << DwarfRegNum << ">";
+ return false;
+ }
+ raw_svector_ostream S(Stack.emplace_back().String);
+ S << MRI.getName(*LLVMRegNum);
+ if (Offset)
+ S << format("%+" PRId64, Offset);
+ break;
+ }
+ case dwarf::DW_OP_entry_value:
+ case dwarf::DW_OP_GNU_entry_value: {
+ // DW_OP_entry_value contains a sub-expression which must be rendered
+ // separately.
+ uint64_t SubExprLength = Op.getRawOperand(0);
+ DWARFExpression::iterator SubExprEnd = I.skipBytes(SubExprLength);
+ ++I;
+ raw_svector_ostream S(Stack.emplace_back().String);
+ S << "entry(";
+ printCompactDWARFExpr(S, I, SubExprEnd, MRI);
+ S << ")";
+ I = SubExprEnd;
+ continue;
+ }
+ case dwarf::DW_OP_stack_value: {
+ // The top stack entry should be treated as the actual value of the
+ // variable, rather than the address of the variable in memory.
+ assert(!Stack.empty());
+ Stack.back().Kind = PrintedExpr::Value;
+ break;
+ }
+ default:
+ if (Opcode >= dwarf::DW_OP_reg0 && Opcode <= dwarf::DW_OP_reg31) {
+ // DW_OP_reg<N>: A register, with the register num implied by the
+ // opcode. Printed as the plain register name.
+ uint64_t DwarfRegNum = Opcode - dwarf::DW_OP_reg0;
+ Optional<unsigned> LLVMRegNum = MRI.getLLVMRegNum(DwarfRegNum, false);
+ if (!LLVMRegNum) {
+ OS << "<unknown register " << DwarfRegNum << ">";
+ return false;
+ }
+ raw_svector_ostream S(Stack.emplace_back(PrintedExpr::Value).String);
+ S << MRI.getName(*LLVMRegNum);
+ } else if (Opcode >= dwarf::DW_OP_breg0 &&
+ Opcode <= dwarf::DW_OP_breg31) {
+ int DwarfRegNum = Opcode - dwarf::DW_OP_breg0;
+ int64_t Offset = Op.getRawOperand(0);
+ Optional<unsigned> LLVMRegNum = MRI.getLLVMRegNum(DwarfRegNum, false);
+ if (!LLVMRegNum) {
+ OS << "<unknown register " << DwarfRegNum << ">";
+ return false;
+ }
+ raw_svector_ostream S(Stack.emplace_back().String);
+ S << MRI.getName(*LLVMRegNum);
+ if (Offset)
+ S << format("%+" PRId64, Offset);
+ } else {
+ // If we hit an unknown operand, we don't know its effect on the stack,
+ // so bail out on the whole expression.
+ OS << "<unknown op " << dwarf::OperationEncodingString(Opcode) << " ("
+ << (int)Opcode << ")>";
+ return false;
+ }
+ break;
+ }
+ ++I;
+ }
+
+ assert(Stack.size() == 1 && "expected one value on stack");
+
+ if (Stack.front().Kind == PrintedExpr::Address)
+ OS << "[" << Stack.front().String << "]";
+ else
+ OS << Stack.front().String;
+
+ return true;
+}
+
+bool DWARFExpression::printCompact(raw_ostream &OS, const MCRegisterInfo &MRI) {
+ return printCompactDWARFExpr(OS, begin(), end(), MRI);
+}
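// Editorial sketch (not part of this patch): a possible use of the new
// compact printer. Expr and MRI are hypothetical objects owned by the caller.
// An address-valued result is wrapped in brackets (e.g. "[RSP+8]"), while a
// value-valued one (registers, DW_OP_stack_value) prints bare (e.g. "RDI",
// "entry(RDI)").
static void printCompactExample(DWARFExpression &Expr,
                                const MCRegisterInfo &MRI) {
  std::string S;
  raw_string_ostream OS(S);
  if (Expr.printCompact(OS, MRI))
    errs() << OS.str() << '\n';
}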
+
} // namespace llvm
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
index e97ae81345b8..a7da5acc380b 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFFormValue.cpp
@@ -241,11 +241,13 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
Ctx = &CU->getContext();
C = Ctx;
U = CU;
+ Format = FP.Format;
bool Indirect = false;
bool IsBlock = false;
Value.data = nullptr;
// Read the value for the form into value and follow and DW_FORM_indirect
// instances we run into
+ Error Err = Error::success();
do {
Indirect = false;
switch (Form) {
@@ -253,24 +255,25 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
case DW_FORM_ref_addr: {
uint16_t Size =
(Form == DW_FORM_addr) ? FP.AddrSize : FP.getRefAddrByteSize();
- Value.uval = Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex);
+ Value.uval =
+ Data.getRelocatedValue(Size, OffsetPtr, &Value.SectionIndex, &Err);
break;
}
case DW_FORM_exprloc:
case DW_FORM_block:
- Value.uval = Data.getULEB128(OffsetPtr);
+ Value.uval = Data.getULEB128(OffsetPtr, &Err);
IsBlock = true;
break;
case DW_FORM_block1:
- Value.uval = Data.getU8(OffsetPtr);
+ Value.uval = Data.getU8(OffsetPtr, &Err);
IsBlock = true;
break;
case DW_FORM_block2:
- Value.uval = Data.getU16(OffsetPtr);
+ Value.uval = Data.getU16(OffsetPtr, &Err);
IsBlock = true;
break;
case DW_FORM_block4:
- Value.uval = Data.getU32(OffsetPtr);
+ Value.uval = Data.getU32(OffsetPtr, &Err);
IsBlock = true;
break;
case DW_FORM_data1:
@@ -278,28 +281,28 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
case DW_FORM_flag:
case DW_FORM_strx1:
case DW_FORM_addrx1:
- Value.uval = Data.getU8(OffsetPtr);
+ Value.uval = Data.getU8(OffsetPtr, &Err);
break;
case DW_FORM_data2:
case DW_FORM_ref2:
case DW_FORM_strx2:
case DW_FORM_addrx2:
- Value.uval = Data.getU16(OffsetPtr);
+ Value.uval = Data.getU16(OffsetPtr, &Err);
break;
case DW_FORM_strx3:
- Value.uval = Data.getU24(OffsetPtr);
+ Value.uval = Data.getU24(OffsetPtr, &Err);
break;
case DW_FORM_data4:
case DW_FORM_ref4:
case DW_FORM_ref_sup4:
case DW_FORM_strx4:
case DW_FORM_addrx4:
- Value.uval = Data.getRelocatedValue(4, OffsetPtr);
+ Value.uval = Data.getRelocatedValue(4, OffsetPtr, nullptr, &Err);
break;
case DW_FORM_data8:
case DW_FORM_ref8:
case DW_FORM_ref_sup8:
- Value.uval = Data.getRelocatedValue(8, OffsetPtr);
+ Value.uval = Data.getRelocatedValue(8, OffsetPtr, nullptr, &Err);
break;
case DW_FORM_data16:
// Treat this like a 16-byte block.
@@ -307,19 +310,23 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
IsBlock = true;
break;
case DW_FORM_sdata:
- Value.sval = Data.getSLEB128(OffsetPtr);
+ Value.sval = Data.getSLEB128(OffsetPtr, &Err);
break;
case DW_FORM_udata:
case DW_FORM_ref_udata:
case DW_FORM_rnglistx:
case DW_FORM_loclistx:
- Value.uval = Data.getULEB128(OffsetPtr);
+ case DW_FORM_GNU_addr_index:
+ case DW_FORM_GNU_str_index:
+ case DW_FORM_addrx:
+ case DW_FORM_strx:
+ Value.uval = Data.getULEB128(OffsetPtr, &Err);
break;
case DW_FORM_string:
- Value.cstr = Data.getCStr(OffsetPtr);
+ Value.cstr = Data.getCStr(OffsetPtr, &Err);
break;
case DW_FORM_indirect:
- Form = static_cast<dwarf::Form>(Data.getULEB128(OffsetPtr));
+ Form = static_cast<dwarf::Form>(Data.getULEB128(OffsetPtr, &Err));
Indirect = true;
break;
case DW_FORM_strp:
@@ -328,39 +335,27 @@ bool DWARFFormValue::extractValue(const DWARFDataExtractor &Data,
case DW_FORM_GNU_strp_alt:
case DW_FORM_line_strp:
case DW_FORM_strp_sup: {
- Value.uval =
- Data.getRelocatedValue(FP.getDwarfOffsetByteSize(), OffsetPtr);
+ Value.uval = Data.getRelocatedValue(FP.getDwarfOffsetByteSize(),
+ OffsetPtr, nullptr, &Err);
break;
}
case DW_FORM_flag_present:
Value.uval = 1;
break;
case DW_FORM_ref_sig8:
- Value.uval = Data.getU64(OffsetPtr);
- break;
- case DW_FORM_GNU_addr_index:
- case DW_FORM_GNU_str_index:
- case DW_FORM_addrx:
- case DW_FORM_strx:
- Value.uval = Data.getULEB128(OffsetPtr);
+ Value.uval = Data.getU64(OffsetPtr, &Err);
break;
default:
// DWARFFormValue::skipValue() will have caught this and caused all
      // DWARF DIEs to fail to be parsed, so this code is not reachable.
llvm_unreachable("unsupported form");
}
- } while (Indirect);
+ } while (Indirect && !Err);
- if (IsBlock) {
- StringRef Str = Data.getData().substr(*OffsetPtr, Value.uval);
- Value.data = nullptr;
- if (!Str.empty()) {
- Value.data = Str.bytes_begin();
- *OffsetPtr += Value.uval;
- }
- }
+ if (IsBlock)
+ Value.data = Data.getBytes(OffsetPtr, Value.uval, &Err).bytes_begin();
- return true;
+ return !errorToBool(std::move(Err));
}
void DWARFFormValue::dumpSectionedAddress(raw_ostream &OS,
@@ -392,6 +387,7 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
raw_ostream &AddrOS = DumpOpts.ShowAddresses
? WithColor(OS, HighlightColor::Address).get()
: nulls();
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(Format);
switch (Form) {
case DW_FORM_addr:
dumpSectionedAddress(AddrOS, DumpOpts, {Value.uval, Value.SectionIndex});
@@ -487,12 +483,13 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
break;
case DW_FORM_strp:
if (DumpOpts.Verbose)
- OS << format(" .debug_str[0x%8.8x] = ", (uint32_t)UValue);
+ OS << format(" .debug_str[0x%0*" PRIx64 "] = ", OffsetDumpWidth, UValue);
dumpString(OS);
break;
case DW_FORM_line_strp:
if (DumpOpts.Verbose)
- OS << format(" .debug_line_str[0x%8.8x] = ", (uint32_t)UValue);
+ OS << format(" .debug_line_str[0x%0*" PRIx64 "] = ", OffsetDumpWidth,
+ UValue);
dumpString(OS);
break;
case DW_FORM_strx:
@@ -556,9 +553,8 @@ void DWARFFormValue::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
OS << format("indexed (0x%x) loclist = ", (uint32_t)UValue);
break;
- // Should be formatted to 64-bit for DWARF64.
case DW_FORM_sec_offset:
- AddrOS << format("0x%08x", (uint32_t)UValue);
+ AddrOS << format("0x%0*" PRIx64, OffsetDumpWidth, UValue);
break;
default:
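The extractValue() rewrite above threads a single Error through every accessor and only inspects it once at the end. A minimal sketch of that cumulative-error pattern with DataExtractor, using a made-up two-field layout (a u16 version followed by a ULEB128 count) purely for illustration:

#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"

// Returns false if any read failed. Once Err is set, the remaining accessors
// become no-ops returning 0, so there is no need to check after each call.
static bool readToyHeader(llvm::DataExtractor Data, uint64_t &Offset,
                          uint16_t &Version, uint64_t &Count) {
  llvm::Error Err = llvm::Error::success();
  Version = Data.getU16(&Offset, &Err);
  Count = Data.getULEB128(&Offset, &Err);
  return !llvm::errorToBool(std::move(Err));
}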
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp b/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
index 269ea9f79a6e..2124a49bef60 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFListTable.cpp
@@ -18,34 +18,24 @@ using namespace llvm;
Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
uint64_t *OffsetPtr) {
HeaderOffset = *OffsetPtr;
- // Read and verify the length field.
- if (!Data.isValidOffsetForDataOfSize(*OffsetPtr, sizeof(uint32_t)))
- return createStringError(errc::invalid_argument,
- "section is not large enough to contain a "
- "%s table length at offset 0x%" PRIx64,
- SectionName.data(), *OffsetPtr);
- Format = dwarf::DwarfFormat::DWARF32;
- uint8_t OffsetByteSize = 4;
- HeaderData.Length = Data.getRelocatedValue(4, OffsetPtr);
- if (HeaderData.Length == dwarf::DW_LENGTH_DWARF64) {
- Format = dwarf::DwarfFormat::DWARF64;
- OffsetByteSize = 8;
- HeaderData.Length = Data.getU64(OffsetPtr);
- } else if (HeaderData.Length >= dwarf::DW_LENGTH_lo_reserved) {
- return createStringError(errc::invalid_argument,
- "%s table at offset 0x%" PRIx64
- " has unsupported reserved unit length of value 0x%8.8" PRIx64,
- SectionName.data(), HeaderOffset, HeaderData.Length);
- }
+ Error Err = Error::success();
+
+ std::tie(HeaderData.Length, Format) = Data.getInitialLength(OffsetPtr, &Err);
+ if (Err)
+ return createStringError(
+ errc::invalid_argument, "parsing %s table at offset 0x%" PRIx64 ": %s",
+ SectionName.data(), HeaderOffset, toString(std::move(Err)).c_str());
+
+ uint8_t OffsetByteSize = Format == dwarf::DWARF64 ? 8 : 4;
uint64_t FullLength =
HeaderData.Length + dwarf::getUnitLengthFieldByteSize(Format);
- assert(FullLength == length());
if (FullLength < getHeaderSize(Format))
return createStringError(errc::invalid_argument,
"%s table at offset 0x%" PRIx64
" has too small length (0x%" PRIx64
") to contain a complete header",
SectionName.data(), HeaderOffset, FullLength);
+ assert(FullLength == length() && "Inconsistent calculation of length.");
uint64_t End = HeaderOffset + FullLength;
if (!Data.isValidOffsetForDataOfSize(HeaderOffset, FullLength))
return createStringError(errc::invalid_argument,
@@ -89,20 +79,22 @@ Error DWARFListTableHeader::extract(DWARFDataExtractor Data,
void DWARFListTableHeader::dump(raw_ostream &OS, DIDumpOptions DumpOpts) const {
if (DumpOpts.Verbose)
OS << format("0x%8.8" PRIx64 ": ", HeaderOffset);
- OS << format(
- "%s list header: length = 0x%8.8" PRIx64 ", version = 0x%4.4" PRIx16 ", "
- "addr_size = 0x%2.2" PRIx8 ", seg_size = 0x%2.2" PRIx8
- ", offset_entry_count = "
- "0x%8.8" PRIx32 "\n",
- ListTypeString.data(), HeaderData.Length, HeaderData.Version,
- HeaderData.AddrSize, HeaderData.SegSize, HeaderData.OffsetEntryCount);
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(Format);
+ OS << format("%s list header: length = 0x%0*" PRIx64, ListTypeString.data(),
+ OffsetDumpWidth, HeaderData.Length)
+ << ", format = " << dwarf::FormatString(Format)
+ << format(", version = 0x%4.4" PRIx16 ", addr_size = 0x%2.2" PRIx8
+ ", seg_size = 0x%2.2" PRIx8
+ ", offset_entry_count = 0x%8.8" PRIx32 "\n",
+ HeaderData.Version, HeaderData.AddrSize, HeaderData.SegSize,
+ HeaderData.OffsetEntryCount);
if (HeaderData.OffsetEntryCount > 0) {
OS << "offsets: [";
for (const auto &Off : Offsets) {
- OS << format("\n0x%8.8" PRIx64, Off);
+ OS << format("\n0x%0*" PRIx64, OffsetDumpWidth, Off);
if (DumpOpts.Verbose)
- OS << format(" => 0x%8.8" PRIx64,
+ OS << format(" => 0x%08" PRIx64,
Off + HeaderOffset + getHeaderSize(Format));
}
OS << "\n]\n";
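The extract() hunk above delegates the DWARF32/DWARF64 escape handling that the removed code open-coded to getInitialLength(). A small sketch of the caller-side pattern, matching how this patch uses the helper:

#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/DebugInfo/DWARF/DWARFDataExtractor.h"
#include "llvm/Support/Error.h"
#include <tuple>

// Reads the initial length field and reports which offset width the rest of
// the header uses: 4 bytes for DWARF32, 8 bytes for DWARF64.
static bool readInitialLength(llvm::DWARFDataExtractor Data, uint64_t &Offset,
                              uint64_t &Length, uint8_t &OffsetByteSize) {
  llvm::Error Err = llvm::Error::success();
  llvm::dwarf::DwarfFormat Format = llvm::dwarf::DWARF32;
  std::tie(Length, Format) = Data.getInitialLength(&Offset, &Err);
  OffsetByteSize = Format == llvm::dwarf::DWARF64 ? 8 : 4;
  return !llvm::errorToBool(std::move(Err));
}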
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
index bb81090ba25c..c219f34bbc31 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFTypeUnit.cpp
@@ -20,25 +20,28 @@ using namespace llvm;
void DWARFTypeUnit::dump(raw_ostream &OS, DIDumpOptions DumpOpts) {
DWARFDie TD = getDIEForOffset(getTypeOffset() + getOffset());
const char *Name = TD.getName(DINameKind::ShortName);
+ int OffsetDumpWidth = 2 * dwarf::getDwarfOffsetByteSize(getFormat());
if (DumpOpts.SummarizeTypes) {
OS << "name = '" << Name << "'"
- << " type_signature = " << format("0x%016" PRIx64, getTypeHash())
- << " length = " << format("0x%08" PRIx64, getLength()) << '\n';
+ << ", type_signature = " << format("0x%016" PRIx64, getTypeHash())
+ << ", length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
+ << '\n';
return;
}
OS << format("0x%08" PRIx64, getOffset()) << ": Type Unit:"
- << " length = " << format("0x%08" PRIx64, getLength())
- << " version = " << format("0x%04x", getVersion());
+ << " length = " << format("0x%0*" PRIx64, OffsetDumpWidth, getLength())
+ << ", format = " << dwarf::FormatString(getFormat())
+ << ", version = " << format("0x%04x", getVersion());
if (getVersion() >= 5)
- OS << " unit_type = " << dwarf::UnitTypeString(getUnitType());
- OS << " abbr_offset = "
+ OS << ", unit_type = " << dwarf::UnitTypeString(getUnitType());
+ OS << ", abbr_offset = "
<< format("0x%04" PRIx64, getAbbreviations()->getOffset())
- << " addr_size = " << format("0x%02x", getAddressByteSize())
- << " name = '" << Name << "'"
- << " type_signature = " << format("0x%016" PRIx64, getTypeHash())
- << " type_offset = " << format("0x%04" PRIx64, getTypeOffset())
+ << ", addr_size = " << format("0x%02x", getAddressByteSize())
+ << ", name = '" << Name << "'"
+ << ", type_signature = " << format("0x%016" PRIx64, getTypeHash())
+ << ", type_offset = " << format("0x%04" PRIx64, getTypeOffset())
<< " (next unit at " << format("0x%08" PRIx64, getNextUnitOffset())
<< ")\n";
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
index 7bb019466161..a6d44f04e468 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnit.cpp
@@ -21,7 +21,6 @@
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Path.h"
-#include "llvm/Support/WithColor.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -74,12 +73,15 @@ void DWARFUnitVector::addUnitsImpl(
DWARFDataExtractor Data(Obj, InfoSection, LE, 0);
if (!Data.isValidOffset(Offset))
return nullptr;
- const DWARFUnitIndex *Index = nullptr;
- if (IsDWO)
- Index = &getDWARFUnitIndex(Context, SectionKind);
DWARFUnitHeader Header;
- if (!Header.extract(Context, Data, &Offset, SectionKind, Index,
- IndexEntry))
+ if (!Header.extract(Context, Data, &Offset, SectionKind))
+ return nullptr;
+ if (!IndexEntry && IsDWO) {
+ const DWARFUnitIndex &Index = getDWARFUnitIndex(
+ Context, Header.isTypeUnit() ? DW_SECT_EXT_TYPES : DW_SECT_INFO);
+ IndexEntry = Index.getFromOffset(Header.getOffset());
+ }
+ if (IndexEntry && !Header.applyIndexEntry(IndexEntry))
return nullptr;
std::unique_ptr<DWARFUnit> U;
if (Header.isTypeUnit())
@@ -140,7 +142,7 @@ DWARFUnit *DWARFUnitVector::getUnitForOffset(uint64_t Offset) const {
DWARFUnit *
DWARFUnitVector::getUnitForIndexEntry(const DWARFUnitIndex::Entry &E) {
- const auto *CUOff = E.getOffset(DW_SECT_INFO);
+ const auto *CUOff = E.getContribution(DW_SECT_INFO);
if (!CUOff)
return nullptr;
@@ -182,20 +184,17 @@ DWARFUnit::DWARFUnit(DWARFContext &DC, const DWARFSection &Section,
if (IsDWO) {
// If we are reading a package file, we need to adjust the location list
// data based on the index entries.
- StringRef Data = LocSection->Data;
+ StringRef Data = Header.getVersion() >= 5
+ ? Context.getDWARFObj().getLoclistsDWOSection().Data
+ : LocSection->Data;
if (auto *IndexEntry = Header.getIndexEntry())
- if (const auto *C = IndexEntry->getOffset(DW_SECT_LOC))
+ if (const auto *C = IndexEntry->getContribution(
+ Header.getVersion() >= 5 ? DW_SECT_LOCLISTS : DW_SECT_EXT_LOC))
Data = Data.substr(C->Offset, C->Length);
- DWARFDataExtractor DWARFData =
- Header.getVersion() >= 5
- ? DWARFDataExtractor(Context.getDWARFObj(),
- Context.getDWARFObj().getLoclistsDWOSection(),
- isLittleEndian, getAddressByteSize())
- : DWARFDataExtractor(Data, isLittleEndian, getAddressByteSize());
+ DWARFDataExtractor DWARFData(Data, isLittleEndian, getAddressByteSize());
LocTable =
std::make_unique<DWARFDebugLoclists>(DWARFData, Header.getVersion());
-
} else if (Header.getVersion() >= 5) {
LocTable = std::make_unique<DWARFDebugLoclists>(
DWARFDataExtractor(Context.getDWARFObj(),
@@ -255,20 +254,12 @@ Optional<uint64_t> DWARFUnit::getStringOffsetSectionItem(uint32_t Index) const {
bool DWARFUnitHeader::extract(DWARFContext &Context,
const DWARFDataExtractor &debug_info,
uint64_t *offset_ptr,
- DWARFSectionKind SectionKind,
- const DWARFUnitIndex *Index,
- const DWARFUnitIndex::Entry *Entry) {
+ DWARFSectionKind SectionKind) {
Offset = *offset_ptr;
Error Err = Error::success();
- IndexEntry = Entry;
- if (!IndexEntry && Index)
- IndexEntry = Index->getFromOffset(*offset_ptr);
- Length = debug_info.getRelocatedValue(4, offset_ptr, nullptr, &Err);
- FormParams.Format = DWARF32;
- if (Length == dwarf::DW_LENGTH_DWARF64) {
- Length = debug_info.getU64(offset_ptr, &Err);
- FormParams.Format = DWARF64;
- }
+ IndexEntry = nullptr;
+ std::tie(Length, FormParams.Format) =
+ debug_info.getInitialLength(offset_ptr, &Err);
FormParams.Version = debug_info.getU16(offset_ptr, &Err);
if (FormParams.Version >= 5) {
UnitType = debug_info.getU8(offset_ptr, &Err);
@@ -281,22 +272,11 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
FormParams.AddrSize = debug_info.getU8(offset_ptr, &Err);
// Fake a unit type based on the section type. This isn't perfect,
// but distinguishing compile and type units is generally enough.
- if (SectionKind == DW_SECT_TYPES)
+ if (SectionKind == DW_SECT_EXT_TYPES)
UnitType = DW_UT_type;
else
UnitType = DW_UT_compile;
}
- if (IndexEntry) {
- if (AbbrOffset)
- return false;
- auto *UnitContrib = IndexEntry->getOffset();
- if (!UnitContrib || UnitContrib->Length != (Length + 4))
- return false;
- auto *AbbrEntry = IndexEntry->getOffset(DW_SECT_ABBREV);
- if (!AbbrEntry)
- return false;
- AbbrOffset = AbbrEntry->Offset;
- }
if (isTypeUnit()) {
TypeHash = debug_info.getU64(offset_ptr, &Err);
TypeOffset = debug_info.getUnsigned(
@@ -320,7 +300,7 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
TypeOffset < getLength() + getUnitLengthFieldByteSize();
bool LengthOK = debug_info.isValidOffset(getNextUnitOffset() - 1);
bool VersionOK = DWARFContext::isSupportedVersion(getVersion());
- bool AddrSizeOK = getAddressByteSize() == 4 || getAddressByteSize() == 8;
+ bool AddrSizeOK = DWARFContext::isAddressSizeSupported(getAddressByteSize());
if (!LengthOK || !VersionOK || !AddrSizeOK || !TypeOffsetOK)
return false;
@@ -330,6 +310,23 @@ bool DWARFUnitHeader::extract(DWARFContext &Context,
return true;
}
+bool DWARFUnitHeader::applyIndexEntry(const DWARFUnitIndex::Entry *Entry) {
+ assert(Entry);
+ assert(!IndexEntry);
+ IndexEntry = Entry;
+ if (AbbrOffset)
+ return false;
+ auto *UnitContrib = IndexEntry->getContribution();
+ if (!UnitContrib ||
+ UnitContrib->Length != (getLength() + getUnitLengthFieldByteSize()))
+ return false;
+ auto *AbbrEntry = IndexEntry->getContribution(DW_SECT_ABBREV);
+ if (!AbbrEntry)
+ return false;
+ AbbrOffset = AbbrEntry->Offset;
+ return true;
+}
+
// Parse the rangelist table header, including the optional array of offsets
// following it (DWARF v5 and later).
template<typename ListTableType>
@@ -426,15 +423,17 @@ void DWARFUnit::extractDIEsToVector(
// should always terminate at or before the start of the next compilation
// unit header).
if (DIEOffset > NextCUOffset)
- WithColor::warning() << format("DWARF compile unit extends beyond its "
- "bounds cu 0x%8.8" PRIx64 " "
- "at 0x%8.8" PRIx64 "\n",
- getOffset(), DIEOffset);
+ Context.getWarningHandler()(
+ createStringError(errc::invalid_argument,
+ "DWARF compile unit extends beyond its "
+ "bounds cu 0x%8.8" PRIx64 " "
+ "at 0x%8.8" PRIx64 "\n",
+ getOffset(), DIEOffset));
}
void DWARFUnit::extractDIEsIfNeeded(bool CUDieOnly) {
if (Error e = tryExtractDIEsIfNeeded(CUDieOnly))
- WithColor::error() << toString(std::move(e));
+ Context.getRecoverableErrorHandler()(std::move(e));
}
Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) {
@@ -492,9 +491,17 @@ Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) {
// DWARF v5 uses the .debug_rnglists and .debug_rnglists.dwo sections to
// describe address ranges.
if (getVersion() >= 5) {
- if (IsDWO)
- setRangesSection(&Context.getDWARFObj().getRnglistsDWOSection(), 0);
- else
+ // In case of DWP, the base offset from the index has to be added.
+ uint64_t ContributionBaseOffset = 0;
+ if (IsDWO) {
+ if (auto *IndexEntry = Header.getIndexEntry())
+ if (auto *Contrib = IndexEntry->getContribution(DW_SECT_RNGLISTS))
+ ContributionBaseOffset = Contrib->Offset;
+ setRangesSection(
+ &Context.getDWARFObj().getRnglistsDWOSection(),
+ ContributionBaseOffset +
+ DWARFListTableHeader::getHeaderSize(Header.getFormat()));
+ } else
setRangesSection(&Context.getDWARFObj().getRnglistsSection(),
toSectionOffset(UnitDie.find(DW_AT_rnglists_base), 0));
if (RangeSection->Data.size()) {
@@ -514,19 +521,26 @@ Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) {
// In a split dwarf unit, there is no DW_AT_rnglists_base attribute.
// Adjust RangeSectionBase to point past the table header.
if (IsDWO && RngListTable)
- RangeSectionBase = RngListTable->getHeaderSize();
+ RangeSectionBase =
+ ContributionBaseOffset + RngListTable->getHeaderSize();
}
// In a split dwarf unit, there is no DW_AT_loclists_base attribute.
// Setting LocSectionBase to point past the table header.
- if (IsDWO)
- setLocSection(&Context.getDWARFObj().getLoclistsDWOSection(),
+ if (IsDWO) {
+ auto &DWOSection = Context.getDWARFObj().getLoclistsDWOSection();
+ if (DWOSection.Data.empty())
+ return Error::success();
+ setLocSection(&DWOSection,
DWARFListTableHeader::getHeaderSize(Header.getFormat()));
- else
+ } else if (auto X = UnitDie.find(DW_AT_loclists_base)) {
setLocSection(&Context.getDWARFObj().getLoclistsSection(),
- toSectionOffset(UnitDie.find(DW_AT_loclists_base), 0));
+ toSectionOffset(X, 0));
+ } else {
+ return Error::success();
+ }
- if (LocSection->Data.size()) {
+ if (LocSection) {
if (IsDWO)
LoclistTableHeader.emplace(".debug_loclists.dwo", "locations");
else
@@ -542,6 +556,9 @@ Error DWARFUnit::tryExtractDIEsIfNeeded(bool CUDieOnly) {
" list table with base = 0x%" PRIx64 "\n",
Offset);
Offset -= HeaderSize;
+ if (auto *IndexEntry = Header.getIndexEntry())
+ if (const auto *Contrib = IndexEntry->getContribution(DW_SECT_LOCLISTS))
+ Offset += Contrib->Offset;
if (Error E = LoclistTableHeader->extract(Data, &Offset))
return createStringError(errc::invalid_argument,
"parsing a loclist table: " +
@@ -596,9 +613,10 @@ bool DWARFUnit::parseDWO() {
RangesDA, RangeSectionBase, Header.getFormat()))
DWO->RngListTable = TableOrError.get();
else
- WithColor::error() << "parsing a range list table: "
- << toString(TableOrError.takeError())
- << '\n';
+ Context.getRecoverableErrorHandler()(createStringError(
+ errc::invalid_argument, "parsing a range list table: %s",
+ toString(TableOrError.takeError()).c_str()));
+
if (DWO->RngListTable)
DWO->RangeSectionBase = DWO->RngListTable->getHeaderSize();
} else {
@@ -759,7 +777,7 @@ const DWARFUnitIndex &llvm::getDWARFUnitIndex(DWARFContext &Context,
DWARFSectionKind Kind) {
if (Kind == DW_SECT_INFO)
return Context.getCUIndex();
- assert(Kind == DW_SECT_TYPES);
+ assert(Kind == DW_SECT_EXT_TYPES);
return Context.getTUIndex();
}
@@ -944,18 +962,12 @@ parseDWARFStringOffsetsTableHeader(DWARFDataExtractor &DA,
Expected<Optional<StrOffsetsContributionDescriptor>>
DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA) {
- uint64_t Offset;
- if (IsDWO) {
- Offset = 0;
- if (DA.getData().data() == nullptr)
- return None;
- } else {
- auto OptOffset = toSectionOffset(getUnitDIE().find(DW_AT_str_offsets_base));
- if (!OptOffset)
- return None;
- Offset = *OptOffset;
- }
- auto DescOrError = parseDWARFStringOffsetsTableHeader(DA, Header.getFormat(), Offset);
+ assert(!IsDWO);
+ auto OptOffset = toSectionOffset(getUnitDIE().find(DW_AT_str_offsets_base));
+ if (!OptOffset)
+ return None;
+ auto DescOrError =
+ parseDWARFStringOffsetsTableHeader(DA, Header.getFormat(), *OptOffset);
if (!DescOrError)
return DescOrError.takeError();
return *DescOrError;
@@ -963,10 +975,11 @@ DWARFUnit::determineStringOffsetsTableContribution(DWARFDataExtractor &DA) {
Expected<Optional<StrOffsetsContributionDescriptor>>
DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor & DA) {
+ assert(IsDWO);
uint64_t Offset = 0;
auto IndexEntry = Header.getIndexEntry();
const auto *C =
- IndexEntry ? IndexEntry->getOffset(DW_SECT_STR_OFFSETS) : nullptr;
+ IndexEntry ? IndexEntry->getContribution(DW_SECT_STR_OFFSETS) : nullptr;
if (C)
Offset = C->Offset;
if (getVersion() >= 5) {
@@ -983,11 +996,10 @@ DWARFUnit::determineStringOffsetsTableContributionDWO(DWARFDataExtractor & DA) {
// index table (in a package file). In a .dwo file it is simply
// the length of the string offsets section.
if (!IndexEntry)
- return {
- Optional<StrOffsetsContributionDescriptor>(
- {0, StringOffsetSection.Data.size(), 4, DWARF32})};
+ return {Optional<StrOffsetsContributionDescriptor>(
+ {0, StringOffsetSection.Data.size(), 4, Header.getFormat()})};
if (C)
return {Optional<StrOffsetsContributionDescriptor>(
- {C->Offset, C->Length, 4, DWARF32})};
+ {C->Offset, C->Length, 4, Header.getFormat()})};
return None;
}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp b/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
index f29c1e6cc5c7..3d4cecce27db 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFUnitIndex.cpp
@@ -17,19 +17,102 @@
using namespace llvm;
+namespace {
+
+enum class DWARFSectionKindV2 {
+ DW_SECT_INFO = 1,
+ DW_SECT_TYPES = 2,
+ DW_SECT_ABBREV = 3,
+ DW_SECT_LINE = 4,
+ DW_SECT_LOC = 5,
+ DW_SECT_STR_OFFSETS = 6,
+ DW_SECT_MACINFO = 7,
+ DW_SECT_MACRO = 8,
+};
+
+} // namespace
+
+// Return true if the section identifier is defined in the DWARFv5 standard.
+constexpr bool isKnownV5SectionID(uint32_t ID) {
+ return ID >= DW_SECT_INFO && ID <= DW_SECT_RNGLISTS &&
+ ID != DW_SECT_EXT_TYPES;
+}
+
+uint32_t llvm::serializeSectionKind(DWARFSectionKind Kind,
+ unsigned IndexVersion) {
+ if (IndexVersion == 5) {
+ assert(isKnownV5SectionID(Kind));
+ return static_cast<uint32_t>(Kind);
+ }
+ assert(IndexVersion == 2);
+ switch (Kind) {
+#define CASE(S,T) \
+ case DW_SECT_##S: \
+ return static_cast<uint32_t>(DWARFSectionKindV2::DW_SECT_##T)
+ CASE(INFO, INFO);
+ CASE(EXT_TYPES, TYPES);
+ CASE(ABBREV, ABBREV);
+ CASE(LINE, LINE);
+ CASE(EXT_LOC, LOC);
+ CASE(STR_OFFSETS, STR_OFFSETS);
+ CASE(EXT_MACINFO, MACINFO);
+ CASE(MACRO, MACRO);
+#undef CASE
+ default:
+ // All other section kinds have no corresponding values in v2 indexes.
+ llvm_unreachable("Invalid DWARFSectionKind");
+ }
+}
+
+DWARFSectionKind llvm::deserializeSectionKind(uint32_t Value,
+ unsigned IndexVersion) {
+ if (IndexVersion == 5)
+ return isKnownV5SectionID(Value)
+ ? static_cast<DWARFSectionKind>(Value)
+ : DW_SECT_EXT_unknown;
+ assert(IndexVersion == 2);
+ switch (static_cast<DWARFSectionKindV2>(Value)) {
+#define CASE(S,T) \
+ case DWARFSectionKindV2::DW_SECT_##S: \
+ return DW_SECT_##T
+ CASE(INFO, INFO);
+ CASE(TYPES, EXT_TYPES);
+ CASE(ABBREV, ABBREV);
+ CASE(LINE, LINE);
+ CASE(LOC, EXT_LOC);
+ CASE(STR_OFFSETS, STR_OFFSETS);
+ CASE(MACINFO, EXT_MACINFO);
+ CASE(MACRO, MACRO);
+#undef CASE
+ }
+ return DW_SECT_EXT_unknown;
+}
+
bool DWARFUnitIndex::Header::parse(DataExtractor IndexData,
uint64_t *OffsetPtr) {
+ const uint64_t BeginOffset = *OffsetPtr;
if (!IndexData.isValidOffsetForDataOfSize(*OffsetPtr, 16))
return false;
+ // GCC Debug Fission defines the version as an unsigned 32-bit field
+  // with a value of 2; see https://gcc.gnu.org/wiki/DebugFissionDWP.
+  // DWARFv5 defines the same space as a uhalf version field with a value of 5
+  // followed by 2 bytes of padding; see Section 7.3.5.3.
Version = IndexData.getU32(OffsetPtr);
+ if (Version != 2) {
+ *OffsetPtr = BeginOffset;
+ Version = IndexData.getU16(OffsetPtr);
+ if (Version != 5)
+ return false;
+ *OffsetPtr += 2; // Skip padding.
+ }
NumColumns = IndexData.getU32(OffsetPtr);
NumUnits = IndexData.getU32(OffsetPtr);
NumBuckets = IndexData.getU32(OffsetPtr);
- return Version <= 2;
+ return true;
}
void DWARFUnitIndex::Header::dump(raw_ostream &OS) const {
- OS << format("version = %u slots = %u\n\n", Version, NumBuckets);
+ OS << format("version = %u, units = %u, slots = %u\n\n", Version, NumUnits, NumBuckets);
}
bool DWARFUnitIndex::parse(DataExtractor IndexData) {
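As the new comment in Header::parse explains, the v2 and v5 index headers disagree on their very first field, so the parser probes: read a u32 and, if it is not 2, rewind and reread the same bytes as a u16 version plus two bytes of padding. A standalone sketch of just that probe; the buffer is assumed to be supplied by the caller:

#include "llvm/Support/DataExtractor.h"

// Returns 2 for a pre-standard (GCC Debug Fission) index, 5 for a DWARFv5
// index, or 0 if the buffer starts with neither layout.
static unsigned probeIndexVersion(llvm::DataExtractor IndexData) {
  uint64_t Offset = 0;
  if (IndexData.getU32(&Offset) == 2) // v2: the version is a full u32
    return 2;
  Offset = 0;                         // rewind and reinterpret as v5
  uint16_t Version = IndexData.getU16(&Offset);
  Offset += 2;                        // skip the 2-byte padding
  return Version == 5 ? 5 : 0;
}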
@@ -49,6 +132,10 @@ bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) {
if (!Header.parse(IndexData, &Offset))
return false;
+ // Fix InfoColumnKind: in DWARFv5, type units are in .debug_info.dwo.
+ if (Header.Version == 5)
+ InfoColumnKind = DW_SECT_INFO;
+
if (!IndexData.isValidOffsetForDataOfSize(
Offset, Header.NumBuckets * (8 + 4) +
(2 * Header.NumUnits + 1) * 4 * Header.NumColumns))
@@ -58,6 +145,7 @@ bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) {
auto Contribs =
std::make_unique<Entry::SectionContribution *[]>(Header.NumUnits);
ColumnKinds = std::make_unique<DWARFSectionKind[]>(Header.NumColumns);
+ RawSectionIds = std::make_unique<uint32_t[]>(Header.NumColumns);
// Read Hash Table of Signatures
for (unsigned i = 0; i != Header.NumBuckets; ++i)
@@ -76,7 +164,8 @@ bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) {
// Read the Column Headers
for (unsigned i = 0; i != Header.NumColumns; ++i) {
- ColumnKinds[i] = static_cast<DWARFSectionKind>(IndexData.getU32(&Offset));
+ RawSectionIds[i] = IndexData.getU32(&Offset);
+ ColumnKinds[i] = deserializeSectionKind(RawSectionIds[i], Header.Version);
if (ColumnKinds[i] == InfoColumnKind) {
if (InfoColumn != -1)
return false;
@@ -105,20 +194,21 @@ bool DWARFUnitIndex::parseImpl(DataExtractor IndexData) {
}
StringRef DWARFUnitIndex::getColumnHeader(DWARFSectionKind DS) {
-#define CASE(DS) \
- case DW_SECT_##DS: \
- return #DS;
switch (DS) {
- CASE(INFO);
- CASE(TYPES);
- CASE(ABBREV);
- CASE(LINE);
- CASE(LOC);
- CASE(STR_OFFSETS);
- CASE(MACINFO);
- CASE(MACRO);
+#define HANDLE_DW_SECT(ID, NAME) \
+ case DW_SECT_##NAME: \
+ return #NAME;
+#include "llvm/BinaryFormat/Dwarf.def"
+ case DW_SECT_EXT_TYPES:
+ return "TYPES";
+ case DW_SECT_EXT_LOC:
+ return "LOC";
+ case DW_SECT_EXT_MACINFO:
+ return "MACINFO";
+ case DW_SECT_EXT_unknown:
+ return StringRef();
}
- llvm_unreachable("unknown DWARFSectionKind");
+ llvm_unreachable("Unknown DWARFSectionKind");
}
void DWARFUnitIndex::dump(raw_ostream &OS) const {
@@ -127,8 +217,14 @@ void DWARFUnitIndex::dump(raw_ostream &OS) const {
Header.dump(OS);
OS << "Index Signature ";
- for (unsigned i = 0; i != Header.NumColumns; ++i)
- OS << ' ' << left_justify(getColumnHeader(ColumnKinds[i]), 24);
+ for (unsigned i = 0; i != Header.NumColumns; ++i) {
+ DWARFSectionKind Kind = ColumnKinds[i];
+ StringRef Name = getColumnHeader(Kind);
+ if (!Name.empty())
+ OS << ' ' << left_justify(Name, 24);
+ else
+ OS << format(" Unknown: %-15" PRIu32, RawSectionIds[i]);
+ }
OS << "\n----- ------------------";
for (unsigned i = 0; i != Header.NumColumns; ++i)
OS << " ------------------------";
@@ -148,7 +244,7 @@ void DWARFUnitIndex::dump(raw_ostream &OS) const {
}
const DWARFUnitIndex::Entry::SectionContribution *
-DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const {
+DWARFUnitIndex::Entry::getContribution(DWARFSectionKind Sec) const {
uint32_t i = 0;
for (; i != Index->Header.NumColumns; ++i)
if (Index->ColumnKinds[i] == Sec)
@@ -157,7 +253,7 @@ DWARFUnitIndex::Entry::getOffset(DWARFSectionKind Sec) const {
}
const DWARFUnitIndex::Entry::SectionContribution *
-DWARFUnitIndex::Entry::getOffset() const {
+DWARFUnitIndex::Entry::getContribution() const {
return &Contributions[Index->InfoColumn];
}
diff --git a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
index 1fd6c1d7d282..3a83317a73a3 100644
--- a/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
+++ b/llvm/lib/DebugInfo/DWARF/DWARFVerifier.cpp
@@ -26,24 +26,26 @@ using namespace llvm;
using namespace dwarf;
using namespace object;
-DWARFVerifier::DieRangeInfo::address_range_iterator
+Optional<DWARFAddressRange>
DWARFVerifier::DieRangeInfo::insert(const DWARFAddressRange &R) {
auto Begin = Ranges.begin();
auto End = Ranges.end();
auto Pos = std::lower_bound(Begin, End, R);
if (Pos != End) {
- if (Pos->intersects(R))
- return std::move(Pos);
- if (Pos != Begin) {
- auto Iter = Pos - 1;
- if (Iter->intersects(R))
- return std::move(Iter);
- }
+ DWARFAddressRange Range(*Pos);
+ if (Pos->merge(R))
+ return Range;
+ }
+ if (Pos != Begin) {
+ auto Iter = Pos - 1;
+ DWARFAddressRange Range(*Iter);
+ if (Iter->merge(R))
+ return Range;
}
Ranges.insert(Pos, R);
- return Ranges.end();
+ return None;
}
DWARFVerifier::DieRangeInfo::die_range_info_iterator
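DieRangeInfo::insert() now reports overlap by value instead of by iterator: the returned Optional holds a copy of the pre-existing range that the new one collided with (the two are merged in place), and None means the range was disjoint and simply inserted. A hedged caller-side sketch with a hypothetical counting helper:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Optional.h"
#include "llvm/DebugInfo/DWARF/DWARFVerifier.h"

// Hypothetical helper: counts how many incoming ranges overlap something
// already recorded in RI, mirroring how verifyDieRanges() consumes the new
// return value.
static unsigned countOverlaps(llvm::DWARFVerifier::DieRangeInfo &RI,
                              llvm::ArrayRef<llvm::DWARFAddressRange> Ranges) {
  unsigned Overlaps = 0;
  for (const llvm::DWARFAddressRange &R : Ranges)
    if (llvm::Optional<llvm::DWARFAddressRange> Prev = RI.insert(R))
      ++Overlaps; // *Prev is the old range that R was merged into
  return Overlaps;
}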
@@ -112,11 +114,9 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
bool ValidAbbrevOffset = true;
uint64_t OffsetStart = *Offset;
- Length = DebugInfoData.getU32(Offset);
- if (Length == dwarf::DW_LENGTH_DWARF64) {
- Length = DebugInfoData.getU64(Offset);
- isUnitDWARF64 = true;
- }
+ DwarfFormat Format;
+ std::tie(Length, Format) = DebugInfoData.getInitialLength(Offset);
+ isUnitDWARF64 = Format == DWARF64;
Version = DebugInfoData.getU16(Offset);
if (Version >= 5) {
@@ -135,7 +135,7 @@ bool DWARFVerifier::verifyUnitHeader(const DWARFDataExtractor DebugInfoData,
ValidLength = DebugInfoData.isValidOffset(OffsetStart + Length + 3);
ValidVersion = DWARFContext::isSupportedVersion(Version);
- ValidAddrSize = AddrSize == 4 || AddrSize == 8;
+ ValidAddrSize = DWARFContext::isAddressSizeSupported(AddrSize);
if (!ValidLength || !ValidVersion || !ValidAddrSize || !ValidAbbrevOffset ||
!ValidType) {
Success = false;
@@ -307,7 +307,7 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
Unit = TypeUnitVector.addUnit(std::make_unique<DWARFTypeUnit>(
DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(),
&DObj.getLocSection(), DObj.getStrSection(),
- DObj.getStrOffsetsSection(), &DObj.getAppleObjCSection(),
+ DObj.getStrOffsetsSection(), &DObj.getAddrSection(),
DObj.getLineSection(), DCtx.isLittleEndian(), false,
TypeUnitVector));
break;
@@ -321,7 +321,7 @@ unsigned DWARFVerifier::verifyUnitSection(const DWARFSection &S,
Unit = CompileUnitVector.addUnit(std::make_unique<DWARFCompileUnit>(
DCtx, S, Header, DCtx.getDebugAbbrev(), &DObj.getRangesSection(),
&DObj.getLocSection(), DObj.getStrSection(),
- DObj.getStrOffsetsSection(), &DObj.getAppleObjCSection(),
+ DObj.getStrOffsetsSection(), &DObj.getAddrSection(),
DObj.getLineSection(), DCtx.isLittleEndian(), false,
CompileUnitVector));
break;
@@ -354,7 +354,7 @@ bool DWARFVerifier::handleDebugInfo() {
OS << "Verifying .debug_types Unit Header Chain...\n";
DObj.forEachTypesSections([&](const DWARFSection &S) {
- NumErrors += verifyUnitSection(S, DW_SECT_TYPES);
+ NumErrors += verifyUnitSection(S, DW_SECT_EXT_TYPES);
});
return NumErrors == 0;
}
@@ -399,22 +399,30 @@ unsigned DWARFVerifier::verifyDieRanges(const DWARFDie &Die,
// processing an object file.
if (!IsObjectFile || IsMachOObject || Die.getTag() != DW_TAG_compile_unit) {
+ bool DumpDieAfterError = false;
for (auto Range : Ranges) {
if (!Range.valid()) {
++NumErrors;
error() << "Invalid address range " << Range << "\n";
+ DumpDieAfterError = true;
continue;
}
- // Verify that ranges don't intersect.
- const auto IntersectingRange = RI.insert(Range);
- if (IntersectingRange != RI.Ranges.end()) {
+ // Verify that ranges don't intersect and also build up the DieRangeInfo
+ // address ranges. Don't break out of the loop below early, or we will
+ // think this DIE doesn't have all of the address ranges it is supposed
+ // to have. Compile units often have DW_AT_ranges that can contain one or
+ // more dead stripped address ranges which tend to all be at the same
+ // address: 0 or -1.
+ if (auto PrevRange = RI.insert(Range)) {
++NumErrors;
- error() << "DIE has overlapping address ranges: " << Range << " and "
- << *IntersectingRange << "\n";
- break;
+ error() << "DIE has overlapping ranges in DW_AT_ranges attribute: "
+ << *PrevRange << " and " << Range << '\n';
+ DumpDieAfterError = true;
}
}
+ if (DumpDieAfterError)
+ dump(Die, 2) << '\n';
}
// Verify that children don't intersect.
@@ -459,8 +467,15 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
case DW_AT_ranges:
// Make sure the offset in the DW_AT_ranges attribute is valid.
if (auto SectionOffset = AttrValue.Value.getAsSectionOffset()) {
- if (*SectionOffset >= DObj.getRangesSection().Data.size())
- ReportError("DW_AT_ranges offset is beyond .debug_ranges bounds:");
+ unsigned DwarfVersion = Die.getDwarfUnit()->getVersion();
+ const DWARFSection &RangeSection = DwarfVersion < 5
+ ? DObj.getRangesSection()
+ : DObj.getRnglistsSection();
+ if (*SectionOffset >= RangeSection.Data.size())
+ ReportError(
+ "DW_AT_ranges offset is beyond " +
+ StringRef(DwarfVersion < 5 ? ".debug_ranges" : ".debug_rnglists") +
+ " bounds: " + llvm::formatv("{0:x8}", *SectionOffset));
break;
}
ReportError("DIE has invalid DW_AT_ranges encoding:");
@@ -481,8 +496,8 @@ unsigned DWARFVerifier::verifyDebugInfoAttribute(const DWARFDie &Die,
DWARFUnit *U = Die.getDwarfUnit();
for (const auto &Entry : *Loc) {
DataExtractor Data(toStringRef(Entry.Expr), DCtx.isLittleEndian(), 0);
- DWARFExpression Expression(Data, U->getVersion(),
- U->getAddressByteSize());
+ DWARFExpression Expression(Data, U->getAddressByteSize(),
+ U->getFormParams().Format);
bool Error = any_of(Expression, [](DWARFExpression::Operation &Op) {
return Op.isError();
});
@@ -758,7 +773,7 @@ void DWARFVerifier::verifyDebugLineRows() {
<< "] row[" << RowIndex
<< "] decreases in address from previous row:\n";
- DWARFDebugLine::Row::dumpTableHeader(OS);
+ DWARFDebugLine::Row::dumpTableHeader(OS, 0);
if (RowIndex > 0)
LineTable->Rows[RowIndex - 1].dump(OS);
Row.dump(OS);
@@ -776,7 +791,7 @@ void DWARFVerifier::verifyDebugLineRows() {
<< " (valid values are [" << (isDWARF5 ? "0," : "1,")
<< LineTable->Prologue.FileNames.size()
<< (isDWARF5 ? ")" : "]") << "):\n";
- DWARFDebugLine::Row::dumpTableHeader(OS);
+ DWARFDebugLine::Row::dumpTableHeader(OS, 0);
Row.dump(OS);
OS << '\n';
}
@@ -1290,7 +1305,8 @@ static bool isVariableIndexable(const DWARFDie &Die, DWARFContext &DCtx) {
for (const auto &Entry : *Loc) {
DataExtractor Data(toStringRef(Entry.Expr), DCtx.isLittleEndian(),
U->getAddressByteSize());
- DWARFExpression Expression(Data, U->getVersion(), U->getAddressByteSize());
+ DWARFExpression Expression(Data, U->getAddressByteSize(),
+ U->getFormParams().Format);
bool IsInteresting = any_of(Expression, [](DWARFExpression::Operation &Op) {
return !Op.isError() && (Op.getCode() == DW_OP_addr ||
Op.getCode() == DW_OP_form_tls_address ||
@@ -1330,9 +1346,7 @@ unsigned DWARFVerifier::verifyNameIndexCompleteness(
// "The name index must contain an entry for each debugging information entry
// that defines a named subprogram, label, variable, type, or namespace,
// subject to ..."
- // Instead whitelisting all TAGs representing a "type" or a "subprogram", to
- // make sure we catch any missing items, we instead blacklist all TAGs that we
- // know shouldn't be indexed.
+ // Explicitly exclude all TAGs that we know shouldn't be indexed.
switch (Die.getTag()) {
// Compile units and modules have names but shouldn't be indexed.
case DW_TAG_compile_unit:
diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
new file mode 100644
index 000000000000..1e527ab3916e
--- /dev/null
+++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp
@@ -0,0 +1,572 @@
+//===- DwarfTransformer.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <thread>
+#include <unordered_set>
+
+#include "llvm/DebugInfo/DIContext.h"
+#include "llvm/DebugInfo/DWARF/DWARFContext.h"
+#include "llvm/Support/Error.h"
+#include "llvm/Support/ThreadPool.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/DebugInfo/GSYM/DwarfTransformer.h"
+#include "llvm/DebugInfo/GSYM/FunctionInfo.h"
+#include "llvm/DebugInfo/GSYM/GsymCreator.h"
+#include "llvm/DebugInfo/GSYM/GsymReader.h"
+#include "llvm/DebugInfo/GSYM/InlineInfo.h"
+
+using namespace llvm;
+using namespace gsym;
+
+struct llvm::gsym::CUInfo {
+ const DWARFDebugLine::LineTable *LineTable;
+ const char *CompDir;
+ std::vector<uint32_t> FileCache;
+ uint64_t Language = 0;
+ uint8_t AddrSize = 0;
+
+ CUInfo(DWARFContext &DICtx, DWARFCompileUnit *CU) {
+ LineTable = DICtx.getLineTableForUnit(CU);
+ CompDir = CU->getCompilationDir();
+ FileCache.clear();
+ if (LineTable)
+ FileCache.assign(LineTable->Prologue.FileNames.size() + 1, UINT32_MAX);
+ DWARFDie Die = CU->getUnitDIE();
+ Language = dwarf::toUnsigned(Die.find(dwarf::DW_AT_language), 0);
+ AddrSize = CU->getAddressByteSize();
+ }
+
+ /// Return true if Addr is the highest address for a given compile unit. The
+  /// highest address is encoded as -1, or all ones in the address. These high
+ /// addresses are used by some linkers to indicate that a function has been
+ /// dead stripped or didn't end up in the linked executable.
+ bool isHighestAddress(uint64_t Addr) const {
+ if (AddrSize == 4)
+ return Addr == UINT32_MAX;
+ else if (AddrSize == 8)
+ return Addr == UINT64_MAX;
+ return false;
+ }
+
+ /// Convert a DWARF compile unit file index into a GSYM global file index.
+ ///
+ /// Each compile unit in DWARF has its own file table in the line table
+ /// prologue. GSYM has a single large file table that applies to all files
+ /// from all of the info in a GSYM file. This function converts between the
+  /// two and caches any DWARF CU file index that has already been converted so
+ /// the first client that asks for a compile unit file index will end up
+ /// doing the conversion, and subsequent clients will get the cached GSYM
+ /// index.
+ uint32_t DWARFToGSYMFileIndex(GsymCreator &Gsym, uint32_t DwarfFileIdx) {
+ if (!LineTable)
+ return 0;
+ assert(DwarfFileIdx < FileCache.size());
+ uint32_t &GsymFileIdx = FileCache[DwarfFileIdx];
+ if (GsymFileIdx != UINT32_MAX)
+ return GsymFileIdx;
+ std::string File;
+ if (LineTable->getFileNameByIndex(
+ DwarfFileIdx, CompDir,
+ DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, File))
+ GsymFileIdx = Gsym.insertFile(File);
+ else
+ GsymFileIdx = 0;
+ return GsymFileIdx;
+ }
+};
+
+
+static DWARFDie GetParentDeclContextDIE(DWARFDie &Die) {
+ if (DWARFDie SpecDie =
+ Die.getAttributeValueAsReferencedDie(dwarf::DW_AT_specification)) {
+ if (DWARFDie SpecParent = GetParentDeclContextDIE(SpecDie))
+ return SpecParent;
+ }
+ if (DWARFDie AbstDie =
+ Die.getAttributeValueAsReferencedDie(dwarf::DW_AT_abstract_origin)) {
+ if (DWARFDie AbstParent = GetParentDeclContextDIE(AbstDie))
+ return AbstParent;
+ }
+
+  // We never want to follow the parent of an inlined subroutine - that would
+  // give us information about where the function is inlined, not what
+  // function is inlined.
+ if (Die.getTag() == dwarf::DW_TAG_inlined_subroutine)
+ return DWARFDie();
+
+ DWARFDie ParentDie = Die.getParent();
+ if (!ParentDie)
+ return DWARFDie();
+
+ switch (ParentDie.getTag()) {
+ case dwarf::DW_TAG_namespace:
+ case dwarf::DW_TAG_structure_type:
+ case dwarf::DW_TAG_union_type:
+ case dwarf::DW_TAG_class_type:
+ case dwarf::DW_TAG_subprogram:
+ return ParentDie; // Found parent decl context DIE
+ case dwarf::DW_TAG_lexical_block:
+ return GetParentDeclContextDIE(ParentDie);
+ default:
+ break;
+ }
+
+ return DWARFDie();
+}
+
+/// Get the GsymCreator string table offset for the qualified name for the
+/// DIE passed in. This function will avoid making copies of any strings in
+/// the GsymCreator when possible. We don't need to copy a string when the
+/// string comes from our .debug_str section or is an inlined string in the
+/// .debug_info. If we create a qualified name string in this function by
+/// combining multiple strings in the DWARF string table or info, we will make
+/// a copy of the string when we add it to the string table.
+static Optional<uint32_t> getQualifiedNameIndex(DWARFDie &Die,
+ uint64_t Language,
+ GsymCreator &Gsym) {
+ // If the dwarf has mangled name, use mangled name
+ if (auto LinkageName =
+ dwarf::toString(Die.findRecursively({dwarf::DW_AT_MIPS_linkage_name,
+ dwarf::DW_AT_linkage_name}),
+ nullptr))
+ return Gsym.insertString(LinkageName, /* Copy */ false);
+
+ StringRef ShortName(Die.getName(DINameKind::ShortName));
+ if (ShortName.empty())
+ return llvm::None;
+
+ // For C++ and ObjC, prepend names of all parent declaration contexts
+ if (!(Language == dwarf::DW_LANG_C_plus_plus ||
+ Language == dwarf::DW_LANG_C_plus_plus_03 ||
+ Language == dwarf::DW_LANG_C_plus_plus_11 ||
+ Language == dwarf::DW_LANG_C_plus_plus_14 ||
+ Language == dwarf::DW_LANG_ObjC_plus_plus ||
+ // This should not be needed for C, but we see C++ code marked as C
+        // in some binaries. This shouldn't hurt, so let's do it for C as well
+ Language == dwarf::DW_LANG_C))
+ return Gsym.insertString(ShortName, /* Copy */ false);
+
+ // Some GCC optimizations create functions with names ending with .isra.<num>
+  // or .part.<num>, and those names are just DW_AT_name, not DW_AT_linkage_name.
+  // If it looks like this is the case, don't add any prefix.
+ if (ShortName.startswith("_Z") &&
+ (ShortName.contains(".isra.") || ShortName.contains(".part.")))
+ return Gsym.insertString(ShortName, /* Copy */ false);
+
+ DWARFDie ParentDeclCtxDie = GetParentDeclContextDIE(Die);
+ if (ParentDeclCtxDie) {
+ std::string Name = ShortName.str();
+ while (ParentDeclCtxDie) {
+ StringRef ParentName(ParentDeclCtxDie.getName(DINameKind::ShortName));
+ if (!ParentName.empty()) {
+ // "lambda" names are wrapped in < >. Replace with { }
+ // to be consistent with demangled names and not to confuse with
+ // templates
+ if (ParentName.front() == '<' && ParentName.back() == '>')
+ Name = "{" + ParentName.substr(1, ParentName.size() - 2).str() + "}" +
+ "::" + Name;
+ else
+ Name = ParentName.str() + "::" + Name;
+ }
+ ParentDeclCtxDie = GetParentDeclContextDIE(ParentDeclCtxDie);
+ }
+ // Copy the name since we created a new name in a std::string.
+ return Gsym.insertString(Name, /* Copy */ true);
+ }
+ // Don't copy the name since it exists in the DWARF object file.
+ return Gsym.insertString(ShortName, /* Copy */ false);
+}
+
+static bool hasInlineInfo(DWARFDie Die, uint32_t Depth) {
+ bool CheckChildren = true;
+ switch (Die.getTag()) {
+ case dwarf::DW_TAG_subprogram:
+ // Don't look into functions within functions.
+ CheckChildren = Depth == 0;
+ break;
+ case dwarf::DW_TAG_inlined_subroutine:
+ return true;
+ default:
+ break;
+ }
+ if (!CheckChildren)
+ return false;
+ for (DWARFDie ChildDie : Die.children()) {
+ if (hasInlineInfo(ChildDie, Depth + 1))
+ return true;
+ }
+ return false;
+}
+
+static void parseInlineInfo(GsymCreator &Gsym, CUInfo &CUI, DWARFDie Die,
+ uint32_t Depth, FunctionInfo &FI,
+ InlineInfo &parent) {
+ if (!hasInlineInfo(Die, Depth))
+ return;
+
+ dwarf::Tag Tag = Die.getTag();
+ if (Tag == dwarf::DW_TAG_inlined_subroutine) {
+ // create new InlineInfo and append to parent.children
+ InlineInfo II;
+ DWARFAddressRange FuncRange =
+ DWARFAddressRange(FI.startAddress(), FI.endAddress());
+ Expected<DWARFAddressRangesVector> RangesOrError = Die.getAddressRanges();
+ if (RangesOrError) {
+ for (const DWARFAddressRange &Range : RangesOrError.get()) {
+ // Check that the inlined function is within the range of the function
+ // info, it might not be in case of split functions
+ if (FuncRange.LowPC <= Range.LowPC && Range.HighPC <= FuncRange.HighPC)
+ II.Ranges.insert(AddressRange(Range.LowPC, Range.HighPC));
+ }
+ }
+ if (II.Ranges.empty())
+ return;
+
+ if (auto NameIndex = getQualifiedNameIndex(Die, CUI.Language, Gsym))
+ II.Name = *NameIndex;
+ II.CallFile = CUI.DWARFToGSYMFileIndex(
+ Gsym, dwarf::toUnsigned(Die.find(dwarf::DW_AT_call_file), 0));
+ II.CallLine = dwarf::toUnsigned(Die.find(dwarf::DW_AT_call_line), 0);
+ // parse all children and append to parent
+ for (DWARFDie ChildDie : Die.children())
+ parseInlineInfo(Gsym, CUI, ChildDie, Depth + 1, FI, II);
+ parent.Children.emplace_back(std::move(II));
+ return;
+ }
+ if (Tag == dwarf::DW_TAG_subprogram || Tag == dwarf::DW_TAG_lexical_block) {
+ // skip this Die and just recurse down
+ for (DWARFDie ChildDie : Die.children())
+ parseInlineInfo(Gsym, CUI, ChildDie, Depth + 1, FI, parent);
+ }
+}
+
+static void convertFunctionLineTable(raw_ostream &Log, CUInfo &CUI,
+ DWARFDie Die, GsymCreator &Gsym,
+ FunctionInfo &FI) {
+ std::vector<uint32_t> RowVector;
+ const uint64_t StartAddress = FI.startAddress();
+ const uint64_t EndAddress = FI.endAddress();
+ const uint64_t RangeSize = EndAddress - StartAddress;
+ const object::SectionedAddress SecAddress{
+ StartAddress, object::SectionedAddress::UndefSection};
+
+
+ if (!CUI.LineTable->lookupAddressRange(SecAddress, RangeSize, RowVector)) {
+ // If we have a DW_TAG_subprogram but no line entries, fall back to using
+    // the DW_AT_decl_file and DW_AT_decl_line if we have both attributes.
+ if (auto FileIdx =
+ dwarf::toUnsigned(Die.findRecursively({dwarf::DW_AT_decl_file}))) {
+ if (auto Line =
+ dwarf::toUnsigned(Die.findRecursively({dwarf::DW_AT_decl_line}))) {
+ LineEntry LE(StartAddress, CUI.DWARFToGSYMFileIndex(Gsym, *FileIdx),
+ *Line);
+ FI.OptLineTable = LineTable();
+ FI.OptLineTable->push(LE);
+ // LE.Addr = EndAddress;
+ // FI.OptLineTable->push(LE);
+ }
+ }
+ return;
+ }
+
+ FI.OptLineTable = LineTable();
+ DWARFDebugLine::Row PrevRow;
+ for (uint32_t RowIndex : RowVector) {
+ // Take file number and line/column from the row.
+ const DWARFDebugLine::Row &Row = CUI.LineTable->Rows[RowIndex];
+ const uint32_t FileIdx = CUI.DWARFToGSYMFileIndex(Gsym, Row.File);
+ uint64_t RowAddress = Row.Address.Address;
+ // Watch out for a RowAddress that is in the middle of a line table entry
+ // in the DWARF. If we pass an address in between two line table entries
+ // we will get a RowIndex for the previous valid line table row which won't
+ // be contained in our function. This is usually a bug in the DWARF due to
+ // linker problems or LTO or other DWARF re-linking so it is worth emitting
+ // an error, but not worth stopping the creation of the GSYM.
+ if (!FI.Range.contains(RowAddress)) {
+ if (RowAddress < FI.Range.Start) {
+ Log << "error: DIE has a start address whose LowPC is between the "
+ "line table Row[" << RowIndex << "] with address "
+ << HEX64(RowAddress) << " and the next one.\n";
+ Die.dump(Log, 0, DIDumpOptions::getForSingleDIE());
+ RowAddress = FI.Range.Start;
+ } else {
+ continue;
+ }
+ }
+
+ LineEntry LE(RowAddress, FileIdx, Row.Line);
+ if (RowIndex != RowVector[0] && Row.Address < PrevRow.Address) {
+ // We have seen full duplicate line tables for functions in some
+      // DWARF files. Watch for those here by checking that the last
+ // row was the function's end address (HighPC) and that the
+ // current line table entry's address is the same as the first
+ // line entry we already have in our "function_info.Lines". If
+ // so break out after printing a warning.
+ auto FirstLE = FI.OptLineTable->first();
+ if (FirstLE && *FirstLE == LE) {
+ Log << "warning: duplicate line table detected for DIE:\n";
+ Die.dump(Log, 0, DIDumpOptions::getForSingleDIE());
+ } else {
+ // Print out (ignore if os == nulls as this is expensive)
+ Log << "error: line table has addresses that do not "
+ << "monotonically increase:\n";
+ for (uint32_t RowIndex2 : RowVector) {
+ CUI.LineTable->Rows[RowIndex2].dump(Log);
+ }
+ Die.dump(Log, 0, DIDumpOptions::getForSingleDIE());
+ }
+ break;
+ }
+
+ // Skip multiple line entries for the same file and line.
+ auto LastLE = FI.OptLineTable->last();
+ if (LastLE && LastLE->File == FileIdx && LastLE->Line == Row.Line)
+ continue;
+ // Only push a row if it isn't an end sequence. End sequence markers are
+ // included for the last address in a function or the last contiguous
+ // address in a sequence.
+ if (Row.EndSequence) {
+ // End sequence means that the next line entry could have a lower address
+      // than the previous entries. So we clear the previous row so we don't
+      // trigger the line table error about addresses that do not monotonically
+ // increase.
+ PrevRow = DWARFDebugLine::Row();
+ } else {
+ FI.OptLineTable->push(LE);
+ PrevRow = Row;
+ }
+ }
+  // If no line table rows were added, clear the line table so we don't encode
+  // one in the GSYM file.
+ if (FI.OptLineTable->empty())
+ FI.OptLineTable = llvm::None;
+}
+
+void DwarfTransformer::handleDie(raw_ostream &OS, CUInfo &CUI, DWARFDie Die) {
+ switch (Die.getTag()) {
+ case dwarf::DW_TAG_subprogram: {
+ Expected<DWARFAddressRangesVector> RangesOrError = Die.getAddressRanges();
+ if (!RangesOrError) {
+ consumeError(RangesOrError.takeError());
+ break;
+ }
+ const DWARFAddressRangesVector &Ranges = RangesOrError.get();
+ if (Ranges.empty())
+ break;
+ auto NameIndex = getQualifiedNameIndex(Die, CUI.Language, Gsym);
+ if (!NameIndex) {
+ OS << "error: function at " << HEX64(Die.getOffset())
+ << " has no name\n ";
+ Die.dump(OS, 0, DIDumpOptions::getForSingleDIE());
+ break;
+ }
+
+ // Create a function_info for each range
+ for (const DWARFAddressRange &Range : Ranges) {
+ // The low PC must be less than the high PC. Many linkers don't remove
+ // DWARF for functions that don't get linked into the final executable.
+ // If both the high and low pc have relocations, linkers will often set
+ // the address values for both to the same value to indicate the function
+      // has been removed. Other linkers have been known to set one or both
+ // PC values to a UINT32_MAX for 4 byte addresses and UINT64_MAX for 8
+ // byte addresses to indicate the function isn't valid. The check below
+ // tries to watch for these cases and abort if it runs into them.
+ if (Range.LowPC >= Range.HighPC || CUI.isHighestAddress(Range.LowPC))
+ break;
+
+ // Many linkers can't remove DWARF and might set the LowPC to zero. Since
+ // high PC can be an offset from the low PC in more recent DWARF versions
+      // we need to watch for a zeroed low PC, which we do using
+ // ValidTextRanges below.
+ if (!Gsym.IsValidTextAddress(Range.LowPC)) {
+ // We expect zero and -1 to be invalid addresses in DWARF depending
+ // on the linker of the DWARF. This indicates a function was stripped
+ // and the debug info wasn't able to be stripped from the DWARF. If
+ // the LowPC isn't zero or -1, then we should emit an error.
+ if (Range.LowPC != 0) {
+ // Unexpected invalid address, emit an error
+ Log << "warning: DIE has an address range whose start address is "
+ "not in any executable sections (" <<
+ *Gsym.GetValidTextRanges() << ") and will not be processed:\n";
+ Die.dump(Log, 0, DIDumpOptions::getForSingleDIE());
+ }
+ break;
+ }
+
+ FunctionInfo FI;
+ FI.setStartAddress(Range.LowPC);
+ FI.setEndAddress(Range.HighPC);
+ FI.Name = *NameIndex;
+ if (CUI.LineTable) {
+ convertFunctionLineTable(OS, CUI, Die, Gsym, FI);
+ }
+ if (hasInlineInfo(Die, 0)) {
+ FI.Inline = InlineInfo();
+ FI.Inline->Name = *NameIndex;
+ FI.Inline->Ranges.insert(FI.Range);
+ parseInlineInfo(Gsym, CUI, Die, 0, FI, *FI.Inline);
+ }
+ Gsym.addFunctionInfo(std::move(FI));
+ }
+ } break;
+ default:
+ break;
+ }
+ for (DWARFDie ChildDie : Die.children())
+ handleDie(OS, CUI, ChildDie);
+}
+
+Error DwarfTransformer::convert(uint32_t NumThreads) {
+ size_t NumBefore = Gsym.getNumFunctionInfos();
+ if (NumThreads == 1) {
+ // Parse all DWARF data from this thread, use the same string/file table
+ // for everything
+ for (const auto &CU : DICtx.compile_units()) {
+ DWARFDie Die = CU->getUnitDIE(false);
+ CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
+ handleDie(Log, CUI, Die);
+ }
+ } else {
+ // LLVM Dwarf parser is not thread-safe and we need to parse all DWARF up
+ // front before we start accessing any DIEs since there might be
+ // cross compile unit references in the DWARF. If we don't do this we can
+ // end up crashing.
+
+ // We need to call getAbbreviations sequentially first so that getUnitDIE()
+ // only works with its local data.
+ for (const auto &CU : DICtx.compile_units())
+ CU->getAbbreviations();
+
+ // Now parse all DIEs in case we have cross compile unit references in a
+ // thread pool.
+ ThreadPool pool(hardware_concurrency(NumThreads));
+ for (const auto &CU : DICtx.compile_units())
+ pool.async([&CU]() { CU->getUnitDIE(false /*CUDieOnly*/); });
+ pool.wait();
+
+ // Now convert all DWARF to GSYM in a thread pool.
+ std::mutex LogMutex;
+ for (const auto &CU : DICtx.compile_units()) {
+ DWARFDie Die = CU->getUnitDIE(false /*CUDieOnly*/);
+ if (Die) {
+ CUInfo CUI(DICtx, dyn_cast<DWARFCompileUnit>(CU.get()));
+ pool.async([this, CUI, &LogMutex, Die]() mutable {
+ std::string ThreadLogStorage;
+ raw_string_ostream ThreadOS(ThreadLogStorage);
+ handleDie(ThreadOS, CUI, Die);
+ ThreadOS.flush();
+ if (!ThreadLogStorage.empty()) {
+ // Print ThreadLogStorage lines into an actual stream under a lock
+ std::lock_guard<std::mutex> guard(LogMutex);
+ Log << ThreadLogStorage;
+ }
+ });
+ }
+ }
+ pool.wait();
+ }
+ size_t FunctionsAddedCount = Gsym.getNumFunctionInfos() - NumBefore;
+ Log << "Loaded " << FunctionsAddedCount << " functions from DWARF.\n";
+ return Error::success();
+}
+
+llvm::Error DwarfTransformer::verify(StringRef GsymPath) {
+ Log << "Verifying GSYM file \"" << GsymPath << "\":\n";
+
+ auto Gsym = GsymReader::openFile(GsymPath);
+ if (!Gsym)
+ return Gsym.takeError();
+
+ auto NumAddrs = Gsym->getNumAddresses();
+ DILineInfoSpecifier DLIS(
+ DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath,
+ DILineInfoSpecifier::FunctionNameKind::LinkageName);
+ std::string gsymFilename;
+ for (uint32_t I = 0; I < NumAddrs; ++I) {
+ auto FuncAddr = Gsym->getAddress(I);
+ if (!FuncAddr)
+ return createStringError(std::errc::invalid_argument,
+ "failed to extract address[%i]", I);
+
+ auto FI = Gsym->getFunctionInfo(*FuncAddr);
+ if (!FI)
+ return createStringError(std::errc::invalid_argument,
+ "failed to extract function info for address 0x%"
+ PRIu64, *FuncAddr);
+
+ for (auto Addr = *FuncAddr; Addr < *FuncAddr + FI->size(); ++Addr) {
+ const object::SectionedAddress SectAddr{
+ Addr, object::SectionedAddress::UndefSection};
+ auto LR = Gsym->lookup(Addr);
+ if (!LR)
+ return LR.takeError();
+
+ auto DwarfInlineInfos =
+ DICtx.getInliningInfoForAddress(SectAddr, DLIS);
+ uint32_t NumDwarfInlineInfos = DwarfInlineInfos.getNumberOfFrames();
+ if (NumDwarfInlineInfos == 0) {
+ DwarfInlineInfos.addFrame(
+ DICtx.getLineInfoForAddress(SectAddr, DLIS));
+ }
+
+ // Check for 1 entry that has no file and line info
+ if (NumDwarfInlineInfos == 1 &&
+ DwarfInlineInfos.getFrame(0).FileName == "<invalid>") {
+ DwarfInlineInfos = DIInliningInfo();
+ NumDwarfInlineInfos = 0;
+ }
+ if (NumDwarfInlineInfos > 0 &&
+ NumDwarfInlineInfos != LR->Locations.size()) {
+ Log << "error: address " << HEX64(Addr) << " has "
+ << NumDwarfInlineInfos << " DWARF inline frames and GSYM has "
+ << LR->Locations.size() << "\n";
+ Log << " " << NumDwarfInlineInfos << " DWARF frames:\n";
+ for (size_t Idx = 0; Idx < NumDwarfInlineInfos; ++Idx) {
+ const auto dii = DwarfInlineInfos.getFrame(Idx);
+ Log << " [" << Idx << "]: " << dii.FunctionName << " @ "
+ << dii.FileName << ':' << dii.Line << '\n';
+ }
+ Log << " " << LR->Locations.size() << " GSYM frames:\n";
+ for (size_t Idx = 0, count = LR->Locations.size();
+ Idx < count; ++Idx) {
+ const auto &gii = LR->Locations[Idx];
+ Log << " [" << Idx << "]: " << gii.Name << " @ " << gii.Dir
+ << '/' << gii.Base << ':' << gii.Line << '\n';
+ }
+ DwarfInlineInfos = DICtx.getInliningInfoForAddress(SectAddr, DLIS);
+ Gsym->dump(Log, *FI);
+ continue;
+ }
+
+ for (size_t Idx = 0, count = LR->Locations.size(); Idx < count;
+ ++Idx) {
+ const auto &gii = LR->Locations[Idx];
+ if (Idx < NumDwarfInlineInfos) {
+ const auto dii = DwarfInlineInfos.getFrame(Idx);
+ gsymFilename = LR->getSourceFile(Idx);
+ // Verify function name
+ if (dii.FunctionName.find(gii.Name.str()) != 0)
+ Log << "error: address " << HEX64(Addr) << " DWARF function \""
+ << dii.FunctionName.c_str()
+ << "\" doesn't match GSYM function \"" << gii.Name << "\"\n";
+ // Verify source file path
+ if (dii.FileName != gsymFilename)
+ Log << "error: address " << HEX64(Addr) << " DWARF path \""
+ << dii.FileName.c_str() << "\" doesn't match GSYM path \""
+ << gsymFilename.c_str() << "\"\n";
+ // Verify source file line
+ if (dii.Line != gii.Line)
+ Log << "error: address " << HEX64(Addr) << " DWARF line "
+ << dii.Line << " != GSYM line " << gii.Line << "\n";
+ }
+ }
+ }
+ }
+ return Error::success();
+}
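A hedged sketch of driving the new transformer end to end. The DwarfTransformer constructor is declared in the header rather than in this file, so the (context, creator, log) parameter order below is an assumption made for the example:

#include "llvm/DebugInfo/DWARF/DWARFContext.h"
#include "llvm/DebugInfo/GSYM/DwarfTransformer.h"
#include "llvm/DebugInfo/GSYM/GsymCreator.h"
#include "llvm/Support/raw_ostream.h"

// Converts every compile unit in DICtx into FunctionInfos inside Gsym; the
// caller still has to finalize() and encode() the GsymCreator afterwards.
static llvm::Error convertDwarfToGsym(llvm::DWARFContext &DICtx,
                                      llvm::gsym::GsymCreator &Gsym,
                                      llvm::raw_ostream &Log,
                                      unsigned NumThreads) {
  // Assumed constructor shape; check DwarfTransformer.h for the real one.
  llvm::gsym::DwarfTransformer DT(DICtx, Gsym, Log);
  return DT.convert(NumThreads);
}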
diff --git a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
index 6731a8b27443..cef1b9498c5c 100644
--- a/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/FunctionInfo.cpp
@@ -25,8 +25,11 @@ enum InfoType : uint32_t {
};
raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const FunctionInfo &FI) {
- OS << '[' << HEX64(FI.Range.Start) << '-' << HEX64(FI.Range.End) << "): "
- << "Name=" << HEX32(FI.Name) << '\n' << FI.OptLineTable << FI.Inline;
+ OS << FI.Range << ": " << "Name=" << HEX32(FI.Name) << '\n';
+ if (FI.OptLineTable)
+ OS << FI.OptLineTable << '\n';
+ if (FI.Inline)
+ OS << FI.Inline << '\n';
return OS;
}
@@ -167,7 +170,7 @@ llvm::Expected<LookupResult> FunctionInfo::lookup(DataExtractor &Data,
// This function will be called with the result of a binary search of the
// address table; we must still make sure the address does not fall into a
// gap between functions or after the last function.
- if (Addr >= LR.FuncRange.End)
+ if (LR.FuncRange.size() > 0 && !LR.FuncRange.contains(Addr))
return createStringError(std::errc::io_error,
"address 0x%" PRIx64 " is not in GSYM", Addr);
@@ -220,6 +223,7 @@ llvm::Expected<LookupResult> FunctionInfo::lookup(DataExtractor &Data,
// location as best we can and return.
SourceLocation SrcLoc;
SrcLoc.Name = LR.FuncName;
+ SrcLoc.Offset = Addr - FuncAddr;
LR.Locations.push_back(SrcLoc);
return LR;
}
@@ -232,6 +236,7 @@ llvm::Expected<LookupResult> FunctionInfo::lookup(DataExtractor &Data,
SourceLocation SrcLoc;
SrcLoc.Name = LR.FuncName;
+ SrcLoc.Offset = Addr - FuncAddr;
SrcLoc.Dir = GR.getString(LineEntryFile->Dir);
SrcLoc.Base = GR.getString(LineEntryFile->Base);
SrcLoc.Line = LineEntry->Line;
diff --git a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
index f371426f2010..7d9b72c6283d 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymCreator.cpp
@@ -29,7 +29,13 @@ uint32_t GsymCreator::insertFile(StringRef Path,
llvm::sys::path::Style Style) {
llvm::StringRef directory = llvm::sys::path::parent_path(Path, Style);
llvm::StringRef filename = llvm::sys::path::filename(Path, Style);
- FileEntry FE(insertString(directory), insertString(filename));
+ // We must insert the strings first, then call the FileEntry constructor.
+ // If we inlined the insertString() calls into the constructor arguments,
+ // their evaluation order would be unspecified, so the order of the strings
+ // in the string table would not be deterministic.
+ const uint32_t Dir = insertString(directory);
+ const uint32_t Base = insertString(filename);
+ FileEntry FE(Dir, Base);
std::lock_guard<std::recursive_mutex> Guard(Mutex);
const auto NextIndex = Files.size();
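The comment in the hunk above guards against a classic C++ pitfall: the evaluation order of arguments in a call or constructor is unspecified, so two insertString() calls written inside one argument list could run in either order and hand out different string-table indices on different compilers. A tiny standalone illustration (hypothetical nextId() helper, not from this patch):

  #include <cstdint>
  #include <utility>

  static uint32_t Counter = 0;
  static uint32_t nextId() { return Counter++; } // call has a side effect

  std::pair<uint32_t, uint32_t> makePairUnspecified() {
    // Unspecified which call runs first: the result may be {0,1} or {1,0}.
    return std::pair<uint32_t, uint32_t>(nextId(), nextId());
  }

  std::pair<uint32_t, uint32_t> makePairDeterministic() {
    // Naming the intermediate results pins the order down.
    const uint32_t First = nextId();
    const uint32_t Second = nextId();
    return std::pair<uint32_t, uint32_t>(First, Second);
  }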
@@ -62,7 +68,8 @@ llvm::Error GsymCreator::encode(FileWriter &O) const {
if (Funcs.size() > UINT32_MAX)
return createStringError(std::errc::invalid_argument,
"too many FunctionInfos");
- const uint64_t MinAddr = Funcs.front().startAddress();
+
+ const uint64_t MinAddr = BaseAddress ? *BaseAddress : Funcs.front().startAddress();
const uint64_t MaxAddr = Funcs.back().startAddress();
const uint64_t AddrDelta = MaxAddr - MinAddr;
Header Hdr;
@@ -73,7 +80,7 @@ llvm::Error GsymCreator::encode(FileWriter &O) const {
Hdr.BaseAddress = MinAddr;
Hdr.NumAddresses = static_cast<uint32_t>(Funcs.size());
Hdr.StrtabOffset = 0; // We will fix this up later.
- Hdr.StrtabOffset = 0; // We will fix this up later.
+ Hdr.StrtabSize = 0; // We will fix this up later.
memset(Hdr.UUID, 0, sizeof(Hdr.UUID));
if (UUID.size() > sizeof(Hdr.UUID))
return createStringError(std::errc::invalid_argument,
@@ -203,9 +210,8 @@ llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
// that have debug info are last in the sort.
if (*Prev == *Curr) {
// FunctionInfo entries match exactly (range, lines, inlines)
- OS << "warning: duplicate function info entries, removing "
- "duplicate:\n"
- << *Curr << '\n';
+ OS << "warning: duplicate function info entries for range: "
+ << Curr->Range << '\n';
Curr = Funcs.erase(Prev);
} else {
if (!Prev->hasRichInfo() && Curr->hasRichInfo()) {
@@ -239,20 +245,43 @@ llvm::Error GsymCreator::finalize(llvm::raw_ostream &OS) {
Prev = Curr++;
}
+ // If our last function info entry doesn't have a size and we have valid
+ // text ranges, set the size of that last entry, since a lookup for any
+ // address at or beyond it would otherwise match it. Fixing up this size
+ // keeps lookups from always returning a zero-sized last symbol for
+ // addresses that are actually past the end of the text.
+ if (!Funcs.empty() && Funcs.back().Range.size() == 0 && ValidTextRanges) {
+ if (auto Range = ValidTextRanges->getRangeThatContains(
+ Funcs.back().Range.Start)) {
+ Funcs.back().Range.End = Range->End;
+ }
+ }
OS << "Pruned " << NumBefore - Funcs.size() << " functions, ended with "
<< Funcs.size() << " total\n";
return Error::success();
}
-uint32_t GsymCreator::insertString(StringRef S) {
- std::lock_guard<std::recursive_mutex> Guard(Mutex);
+uint32_t GsymCreator::insertString(StringRef S, bool Copy) {
if (S.empty())
return 0;
+ std::lock_guard<std::recursive_mutex> Guard(Mutex);
+ if (Copy) {
+ // Provide backing storage for the string if requested, since
+ // StringTableBuilder stores StringRefs rather than copies. Any string
+ // that comes from a section in an object file is already backed by that
+ // file and doesn't need to be copied, but any string created on the fly
+ // does. Skipping the copy in the common case keeps GsymCreator fast when
+ // parsing DWARF and other object files.
+ CachedHashStringRef CHStr(S);
+ if (!StrTab.contains(CHStr))
+ S = StringStorage.insert(S).first->getKey();
+ }
return StrTab.add(S);
}
void GsymCreator::addFunctionInfo(FunctionInfo &&FI) {
std::lock_guard<std::recursive_mutex> Guard(Mutex);
+ Ranges.insert(FI.Range);
Funcs.emplace_back(FI);
}
@@ -273,3 +302,19 @@ void GsymCreator::forEachFunctionInfo(
break;
}
}
+
+size_t GsymCreator::getNumFunctionInfos() const {
+ std::lock_guard<std::recursive_mutex> Guard(Mutex);
+ return Funcs.size();
+}
+
+bool GsymCreator::IsValidTextAddress(uint64_t Addr) const {
+ if (ValidTextRanges)
+ return ValidTextRanges->contains(Addr);
+ return true; // No valid text ranges have been set, so accept all addresses.
+}
+
+bool GsymCreator::hasFunctionInfoForAddress(uint64_t Addr) const {
+ std::lock_guard<std::recursive_mutex> Guard(Mutex);
+ return Ranges.contains(Addr);
+}
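The Copy parameter added to insertString() exists because StringTableBuilder keeps StringRef views rather than owning the bytes. A short sketch of the hazard the StringStorage set guards against (hypothetical RefTable type, not from this patch):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/ADT/StringSet.h"
  #include <string>
  #include <vector>

  struct RefTable {
    llvm::StringSet<> Storage;         // owns copied bytes
    std::vector<llvm::StringRef> Refs; // what a builder of references keeps

    void addUnsafe(const std::string &Temp) {
      // Dangles as soon as the caller's string goes away.
      Refs.push_back(Temp);
    }
    void addSafe(llvm::StringRef S) {
      // Copy into owned storage first; the StringRef stays valid.
      Refs.push_back(Storage.insert(S).first->getKey());
    }
  };

Strings that already live in a mapped object file don't need this copy, which is why the symbol-table path further down passes Copy = false.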
diff --git a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
index b4f3f2052ae7..2ad18bf63d5d 100644
--- a/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
+++ b/llvm/lib/DebugInfo/GSYM/GsymReader.cpp
@@ -225,20 +225,33 @@ Optional<uint64_t> GsymReader::getAddressInfoOffset(size_t Index) const {
Expected<uint64_t>
GsymReader::getAddressIndex(const uint64_t Addr) const {
- if (Addr < Hdr->BaseAddress)
- return createStringError(std::errc::invalid_argument,
- "address 0x%" PRIx64 " not in GSYM", Addr);
- const uint64_t AddrOffset = Addr - Hdr->BaseAddress;
- switch (Hdr->AddrOffSize) {
- case 1: return getAddressOffsetIndex<uint8_t>(AddrOffset);
- case 2: return getAddressOffsetIndex<uint16_t>(AddrOffset);
- case 4: return getAddressOffsetIndex<uint32_t>(AddrOffset);
- case 8: return getAddressOffsetIndex<uint64_t>(AddrOffset);
- default: break;
+ if (Addr >= Hdr->BaseAddress) {
+ const uint64_t AddrOffset = Addr - Hdr->BaseAddress;
+ Optional<uint64_t> AddrOffsetIndex;
+ switch (Hdr->AddrOffSize) {
+ case 1:
+ AddrOffsetIndex = getAddressOffsetIndex<uint8_t>(AddrOffset);
+ break;
+ case 2:
+ AddrOffsetIndex = getAddressOffsetIndex<uint16_t>(AddrOffset);
+ break;
+ case 4:
+ AddrOffsetIndex = getAddressOffsetIndex<uint32_t>(AddrOffset);
+ break;
+ case 8:
+ AddrOffsetIndex = getAddressOffsetIndex<uint64_t>(AddrOffset);
+ break;
+ default:
+ return createStringError(std::errc::invalid_argument,
+ "unsupported address offset size %u",
+ Hdr->AddrOffSize);
+ }
+ if (AddrOffsetIndex)
+ return *AddrOffsetIndex;
}
return createStringError(std::errc::invalid_argument,
- "unsupported address offset size %u",
- Hdr->AddrOffSize);
+ "address 0x%" PRIx64 " is not in GSYM", Addr);
}
llvm::Expected<FunctionInfo> GsymReader::getFunctionInfo(uint64_t Addr) const {
@@ -255,7 +268,7 @@ llvm::Expected<FunctionInfo> GsymReader::getFunctionInfo(uint64_t Addr) const {
if (ExpectedFI->Range.contains(Addr) || ExpectedFI->Range.size() == 0)
return ExpectedFI;
return createStringError(std::errc::invalid_argument,
- "address 0x%" PRIx64 " not in GSYM", Addr);
+ "address 0x%" PRIx64 " is not in GSYM", Addr);
}
}
return createStringError(std::errc::invalid_argument,
@@ -277,3 +290,117 @@ llvm::Expected<LookupResult> GsymReader::lookup(uint64_t Addr) const {
"failed to extract address[%" PRIu64 "]",
*AddressIndex);
}
+
+void GsymReader::dump(raw_ostream &OS) {
+ const auto &Header = getHeader();
+ // Dump the GSYM header.
+ OS << Header << "\n";
+ // Dump the address table.
+ OS << "Address Table:\n";
+ OS << "INDEX OFFSET";
+
+ switch (Hdr->AddrOffSize) {
+ case 1: OS << "8 "; break;
+ case 2: OS << "16"; break;
+ case 4: OS << "32"; break;
+ case 8: OS << "64"; break;
+ default: OS << "??"; break;
+ }
+ OS << " (ADDRESS)\n";
+ OS << "====== =============================== \n";
+ for (uint32_t I = 0; I < Header.NumAddresses; ++I) {
+ OS << format("[%4u] ", I);
+ switch (Hdr->AddrOffSize) {
+ case 1: OS << HEX8(getAddrOffsets<uint8_t>()[I]); break;
+ case 2: OS << HEX16(getAddrOffsets<uint16_t>()[I]); break;
+ case 4: OS << HEX32(getAddrOffsets<uint32_t>()[I]); break;
+ case 8: OS << HEX64(getAddrOffsets<uint64_t>()[I]); break;
+ default: break;
+ }
+ OS << " (" << HEX64(*getAddress(I)) << ")\n";
+ }
+ // Dump the address info offsets table.
+ OS << "\nAddress Info Offsets:\n";
+ OS << "INDEX Offset\n";
+ OS << "====== ==========\n";
+ for (uint32_t I = 0; I < Header.NumAddresses; ++I)
+ OS << format("[%4u] ", I) << HEX32(AddrInfoOffsets[I]) << "\n";
+ // Dump the file table.
+ OS << "\nFiles:\n";
+ OS << "INDEX DIRECTORY BASENAME PATH\n";
+ OS << "====== ========== ========== ==============================\n";
+ for (uint32_t I = 0; I < Files.size(); ++I) {
+ OS << format("[%4u] ", I) << HEX32(Files[I].Dir) << ' '
+ << HEX32(Files[I].Base) << ' ';
+ dump(OS, getFile(I));
+ OS << "\n";
+ }
+ OS << "\n" << StrTab << "\n";
+
+ for (uint32_t I = 0; I < Header.NumAddresses; ++I) {
+ OS << "FunctionInfo @ " << HEX32(AddrInfoOffsets[I]) << ": ";
+ if (auto FI = getFunctionInfo(*getAddress(I)))
+ dump(OS, *FI);
+ else
+ logAllUnhandledErrors(FI.takeError(), OS, "FunctionInfo:");
+ }
+}
+
+void GsymReader::dump(raw_ostream &OS, const FunctionInfo &FI) {
+ OS << FI.Range << " \"" << getString(FI.Name) << "\"\n";
+ if (FI.OptLineTable)
+ dump(OS, *FI.OptLineTable);
+ if (FI.Inline)
+ dump(OS, *FI.Inline);
+}
+
+void GsymReader::dump(raw_ostream &OS, const LineTable &LT) {
+ OS << "LineTable:\n";
+ for (auto &LE: LT) {
+ OS << " " << HEX64(LE.Addr) << ' ';
+ if (LE.File)
+ dump(OS, getFile(LE.File));
+ OS << ':' << LE.Line << '\n';
+ }
+}
+
+void GsymReader::dump(raw_ostream &OS, const InlineInfo &II, uint32_t Indent) {
+ if (Indent == 0)
+ OS << "InlineInfo:\n";
+ else
+ OS.indent(Indent);
+ OS << II.Ranges << ' ' << getString(II.Name);
+ if (II.CallFile != 0) {
+ if (auto File = getFile(II.CallFile)) {
+ OS << " called from ";
+ dump(OS, File);
+ OS << ':' << II.CallLine;
+ }
+ }
+ OS << '\n';
+ for (const auto &ChildII: II.Children)
+ dump(OS, ChildII, Indent + 2);
+}
+
+void GsymReader::dump(raw_ostream &OS, Optional<FileEntry> FE) {
+ if (FE) {
+ // If we have the file from index 0, don't print anything.
+ if (FE->Dir == 0 && FE->Base == 0)
+ return;
+ StringRef Dir = getString(FE->Dir);
+ StringRef Base = getString(FE->Base);
+ if (!Dir.empty()) {
+ OS << Dir;
+ if (Dir.contains('\\') && !Dir.contains('/'))
+ OS << '\\';
+ else
+ OS << '/';
+ }
+ if (!Base.empty()) {
+ OS << Base;
+ }
+ if (!Dir.empty() || !Base.empty())
+ return;
+ }
+ OS << "<invalid-file>";
+}
diff --git a/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp b/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp
index 1b8c974fdcd2..21679b1b78aa 100644
--- a/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp
+++ b/llvm/lib/DebugInfo/GSYM/InlineInfo.cpp
@@ -142,13 +142,17 @@ static bool lookup(const GsymReader &GR, DataExtractor &Data, uint64_t &Offset,
return false;
}
- SourceLocation SrcLoc;
- SrcLoc.Name = SrcLocs.back().Name;
- SrcLoc.Dir = GR.getString(CallFile->Dir);
- SrcLoc.Base = GR.getString(CallFile->Base);
- SrcLoc.Line = Inline.CallLine;
- SrcLocs.back().Name = GR.getString(Inline.Name);
- SrcLocs.push_back(SrcLoc);
+ if (CallFile->Dir || CallFile->Base) {
+ SourceLocation SrcLoc;
+ SrcLoc.Name = SrcLocs.back().Name;
+ SrcLoc.Offset = SrcLocs.back().Offset;
+ SrcLoc.Dir = GR.getString(CallFile->Dir);
+ SrcLoc.Base = GR.getString(CallFile->Base);
+ SrcLoc.Line = Inline.CallLine;
+ SrcLocs.back().Name = GR.getString(Inline.Name);
+ SrcLocs.back().Offset = Addr - Inline.Ranges[0].Start;
+ SrcLocs.push_back(SrcLoc);
+ }
return true;
}
diff --git a/llvm/lib/DebugInfo/GSYM/LookupResult.cpp b/llvm/lib/DebugInfo/GSYM/LookupResult.cpp
index c54b166b2887..8a624226b1d3 100644
--- a/llvm/lib/DebugInfo/GSYM/LookupResult.cpp
+++ b/llvm/lib/DebugInfo/GSYM/LookupResult.cpp
@@ -21,7 +21,7 @@ std::string LookupResult::getSourceFile(uint32_t Index) const {
if (Index < Locations.size()) {
if (!Locations[Index].Dir.empty()) {
if (Locations[Index].Base.empty()) {
- Fullpath = Locations[Index].Dir;
+ Fullpath = std::string(Locations[Index].Dir);
} else {
llvm::SmallString<64> Storage;
llvm::sys::path::append(Storage, Locations[Index].Dir,
@@ -29,25 +29,30 @@ std::string LookupResult::getSourceFile(uint32_t Index) const {
Fullpath.assign(Storage.begin(), Storage.end());
}
} else if (!Locations[Index].Base.empty())
- Fullpath = Locations[Index].Base;
+ Fullpath = std::string(Locations[Index].Base);
}
return Fullpath;
}
raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const SourceLocation &SL) {
- OS << SL.Name << " @ ";
- if (!SL.Dir.empty()) {
- OS << SL.Dir;
- if (SL.Dir.contains('\\') and not SL.Dir.contains('/'))
- OS << '\\';
+ OS << SL.Name;
+ if (SL.Offset > 0)
+ OS << " + " << SL.Offset;
+ if (SL.Dir.size() || SL.Base.size()) {
+ OS << " @ ";
+ if (!SL.Dir.empty()) {
+ OS << SL.Dir;
+ if (SL.Dir.contains('\\') && !SL.Dir.contains('/'))
+ OS << '\\';
+ else
+ OS << '/';
+ }
+ if (SL.Base.empty())
+ OS << "<invalid-file>";
else
- OS << '/';
+ OS << SL.Base;
+ OS << ':' << SL.Line;
}
- if (SL.Base.empty())
- OS << "<invalid-file>";
- else
- OS << SL.Base;
- OS << ':' << SL.Line;
return OS;
}
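With the new Offset field, the operator above renders a location as "name + offset @ dir/base:line" and omits the "@ ..." part entirely when no file is known. A small hedged example of driving it directly (all field values made up):

  #include "llvm/DebugInfo/GSYM/LookupResult.h"
  #include "llvm/Support/raw_ostream.h"

  static void printExampleLocation(llvm::raw_ostream &OS) {
    llvm::gsym::SourceLocation Loc;
    Loc.Name = "main";   // function name
    Loc.Offset = 12;     // byte offset of the address within the function
    Loc.Dir = "/tmp";    // directory component of the source path
    Loc.Base = "main.c"; // filename component
    Loc.Line = 40;
    OS << Loc << '\n';   // prints: main + 12 @ /tmp/main.c:40
  }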
diff --git a/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp b/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp
new file mode 100644
index 000000000000..ad35aefe7774
--- /dev/null
+++ b/llvm/lib/DebugInfo/GSYM/ObjectFileTransformer.cpp
@@ -0,0 +1,116 @@
+//===- ObjectFileTransformer.cpp --------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include <unordered_set>
+
+#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/DataExtractor.h"
+#include "llvm/Support/raw_ostream.h"
+
+#include "llvm/DebugInfo/GSYM/ObjectFileTransformer.h"
+#include "llvm/DebugInfo/GSYM/GsymCreator.h"
+
+using namespace llvm;
+using namespace gsym;
+
+constexpr uint32_t NT_GNU_BUILD_ID_TAG = 0x03;
+
+static std::vector<uint8_t> getUUID(const object::ObjectFile &Obj) {
+ // Extract the UUID from the object file
+ std::vector<uint8_t> UUID;
+ if (auto *MachO = dyn_cast<object::MachOObjectFile>(&Obj)) {
+ const ArrayRef<uint8_t> MachUUID = MachO->getUuid();
+ if (!MachUUID.empty())
+ UUID.assign(MachUUID.data(), MachUUID.data() + MachUUID.size());
+ } else if (isa<object::ELFObjectFileBase>(&Obj)) {
+ const StringRef GNUBuildID(".note.gnu.build-id");
+ for (const object::SectionRef &Sect : Obj.sections()) {
+ Expected<StringRef> SectNameOrErr = Sect.getName();
+ if (!SectNameOrErr) {
+ consumeError(SectNameOrErr.takeError());
+ continue;
+ }
+ StringRef SectName(*SectNameOrErr);
+ if (SectName != GNUBuildID)
+ continue;
+ StringRef BuildIDData;
+ Expected<StringRef> E = Sect.getContents();
+ if (E)
+ BuildIDData = *E;
+ else {
+ consumeError(E.takeError());
+ continue;
+ }
+ DataExtractor Decoder(BuildIDData, Obj.makeTriple().isLittleEndian(), 8);
+ uint64_t Offset = 0;
+ const uint32_t NameSize = Decoder.getU32(&Offset);
+ const uint32_t PayloadSize = Decoder.getU32(&Offset);
+ const uint32_t PayloadType = Decoder.getU32(&Offset);
+ StringRef Name(Decoder.getFixedLengthString(&Offset, NameSize));
+ if (Name == "GNU" && PayloadType == NT_GNU_BUILD_ID_TAG) {
+ Offset = alignTo(Offset, 4);
+ StringRef UUIDBytes(Decoder.getBytes(&Offset, PayloadSize));
+ if (!UUIDBytes.empty()) {
+ auto Ptr = reinterpret_cast<const uint8_t *>(UUIDBytes.data());
+ UUID.assign(Ptr, Ptr + UUIDBytes.size());
+ }
+ }
+ }
+ }
+ return UUID;
+}
+
+llvm::Error ObjectFileTransformer::convert(const object::ObjectFile &Obj,
+ raw_ostream &Log,
+ GsymCreator &Gsym) {
+ using namespace llvm::object;
+
+ const bool IsMachO = isa<MachOObjectFile>(&Obj);
+ const bool IsELF = isa<ELFObjectFileBase>(&Obj);
+
+ // Read build ID.
+ Gsym.setUUID(getUUID(Obj));
+
+ // Parse the symbol table.
+ size_t NumBefore = Gsym.getNumFunctionInfos();
+ for (const object::SymbolRef &Sym : Obj.symbols()) {
+ Expected<SymbolRef::Type> SymType = Sym.getType();
+ if (!SymType) {
+ consumeError(SymType.takeError());
+ continue;
+ }
+ Expected<uint64_t> AddrOrErr = Sym.getValue();
+ if (!AddrOrErr)
+ // TODO: Test this error.
+ return AddrOrErr.takeError();
+
+ if (SymType.get() != SymbolRef::Type::ST_Function ||
+ !Gsym.IsValidTextAddress(*AddrOrErr) ||
+ Gsym.hasFunctionInfoForAddress(*AddrOrErr))
+ continue;
+ // Function sizes aren't available in Mach-O symbol tables, so use 0 there.
+ constexpr bool NoCopy = false;
+ const uint64_t size = IsELF ? ELFSymbolRef(Sym).getSize() : 0;
+ Expected<StringRef> Name = Sym.getName();
+ if (!Name) {
+ logAllUnhandledErrors(Name.takeError(), Log, "ObjectFileTransformer: ");
+ continue;
+ }
+ // For Mach-O files, remove the leading '_' character from symbol names
+ // if there is one.
+ if (IsMachO)
+ Name->consume_front("_");
+ Gsym.addFunctionInfo(
+ FunctionInfo(*AddrOrErr, size, Gsym.insertString(*Name, NoCopy)));
+ }
+ size_t FunctionsAddedCount = Gsym.getNumFunctionInfos() - NumBefore;
+ Log << "Loaded " << FunctionsAddedCount << " functions from symbol table.\n";
+ return Error::success();
+}
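Two notes on the code above. The getUUID() helper decodes a standard ELF note: a 4-byte name size, a 4-byte descriptor size, and a 4-byte type, followed by the 4-byte-aligned name ("GNU") and then the build-id bytes. And convert() is typically followed by GsymCreator::finalize() before encoding; a hedged usage sketch (Obj is assumed to be an object file loaded elsewhere):

  #include "llvm/DebugInfo/GSYM/GsymCreator.h"
  #include "llvm/DebugInfo/GSYM/ObjectFileTransformer.h"
  #include "llvm/Object/ObjectFile.h"
  #include "llvm/Support/raw_ostream.h"

  static llvm::Error symtabToGsym(const llvm::object::ObjectFile &Obj,
                                  llvm::raw_ostream &Log,
                                  llvm::gsym::GsymCreator &Gsym) {
    // Pull the UUID and all ST_Function symbols into the creator.
    if (llvm::Error Err =
            llvm::gsym::ObjectFileTransformer::convert(Obj, Log, Gsym))
      return Err;
    // Sort, deduplicate, and fix up sizes before encoding.
    return Gsym.finalize(Log);
  }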
diff --git a/llvm/lib/DebugInfo/GSYM/Range.cpp b/llvm/lib/DebugInfo/GSYM/Range.cpp
index f78101e49bf8..044ddb8ba1ba 100644
--- a/llvm/lib/DebugInfo/GSYM/Range.cpp
+++ b/llvm/lib/DebugInfo/GSYM/Range.cpp
@@ -53,6 +53,16 @@ bool AddressRanges::contains(AddressRange Range) const {
return Range.End <= It[-1].End;
}
+Optional<AddressRange>
+AddressRanges::getRangeThatContains(uint64_t Addr) const {
+ auto It = std::partition_point(
+ Ranges.begin(), Ranges.end(),
+ [=](const AddressRange &R) { return R.Start <= Addr; });
+ if (It != Ranges.begin() && Addr < It[-1].End)
+ return It[-1];
+ return llvm::None;
+}
+
raw_ostream &llvm::gsym::operator<<(raw_ostream &OS, const AddressRange &R) {
return OS << '[' << HEX64(R.Start) << " - " << HEX64(R.End) << ")";
}
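getRangeThatContains() relies on Ranges being sorted by start address and non-overlapping: std::partition_point finds the first range whose Start is greater than Addr, and the only possible match is the range just before it. The same idiom on plain pairs, as a standalone sketch:

  #include <algorithm>
  #include <cstdint>
  #include <optional>
  #include <utility>
  #include <vector>

  using Range = std::pair<uint64_t, uint64_t>; // [Start, End)

  // Ranges must be sorted by Start and must not overlap.
  static std::optional<Range>
  rangeContaining(const std::vector<Range> &Ranges, uint64_t Addr) {
    auto It = std::partition_point(
        Ranges.begin(), Ranges.end(),
        [=](const Range &R) { return R.first <= Addr; });
    if (It != Ranges.begin() && Addr < It[-1].second)
      return It[-1];
    return std::nullopt;
  }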
diff --git a/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp b/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
index 64ffa776bbd6..2729e3236965 100644
--- a/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
+++ b/llvm/lib/DebugInfo/PDB/DIA/DIASession.cpp
@@ -189,8 +189,8 @@ DIASession::getSymbolById(SymIndexId SymbolId) const {
return PDBSymbol::create(*this, std::move(RawSymbol));
}
-std::unique_ptr<PDBSymbol>
-DIASession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
+std::unique_ptr<PDBSymbol> DIASession::findSymbolByAddress(uint64_t Address,
+ PDB_SymType Type) {
enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
CComPtr<IDiaSymbol> Symbol;
@@ -207,7 +207,7 @@ DIASession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
}
std::unique_ptr<PDBSymbol> DIASession::findSymbolByRVA(uint32_t RVA,
- PDB_SymType Type) const {
+ PDB_SymType Type) {
enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
CComPtr<IDiaSymbol> Symbol;
@@ -220,7 +220,7 @@ std::unique_ptr<PDBSymbol> DIASession::findSymbolByRVA(uint32_t RVA,
std::unique_ptr<PDBSymbol>
DIASession::findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
- PDB_SymType Type) const {
+ PDB_SymType Type) {
enum SymTagEnum EnumVal = static_cast<enum SymTagEnum>(Type);
CComPtr<IDiaSymbol> Symbol;
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
index 419734771ccd..73801ea1dd1b 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiModuleDescriptorBuilder.cpp
@@ -39,7 +39,7 @@ static uint32_t calculateDiSymbolStreamSize(uint32_t SymbolByteSize,
DbiModuleDescriptorBuilder::DbiModuleDescriptorBuilder(StringRef ModuleName,
uint32_t ModIndex,
msf::MSFBuilder &Msf)
- : MSF(Msf), ModuleName(ModuleName) {
+ : MSF(Msf), ModuleName(std::string(ModuleName)) {
::memset(&Layout, 0, sizeof(Layout));
Layout.Mod = ModIndex;
}
@@ -51,7 +51,7 @@ uint16_t DbiModuleDescriptorBuilder::getStreamIndex() const {
}
void DbiModuleDescriptorBuilder::setObjFileName(StringRef Name) {
- ObjFileName = Name;
+ ObjFileName = std::string(Name);
}
void DbiModuleDescriptorBuilder::setPdbFilePathNI(uint32_t NI) {
@@ -83,14 +83,13 @@ void DbiModuleDescriptorBuilder::addSymbolsInBulk(
}
void DbiModuleDescriptorBuilder::addSourceFile(StringRef Path) {
- SourceFiles.push_back(Path);
+ SourceFiles.push_back(std::string(Path));
}
uint32_t DbiModuleDescriptorBuilder::calculateC13DebugInfoSize() const {
uint32_t Result = 0;
for (const auto &Builder : C13Builders) {
- assert(Builder && "Empty C13 Fragment Builder!");
- Result += Builder->calculateSerializedLength();
+ Result += Builder.calculateSerializedLength();
}
return Result;
}
@@ -163,8 +162,7 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
"Invalid debug section alignment!");
// TODO: Write C11 Line data
for (const auto &Builder : C13Builders) {
- assert(Builder && "Empty C13 Fragment Builder!");
- if (auto EC = Builder->commit(SymbolWriter))
+ if (auto EC = Builder.commit(SymbolWriter, CodeViewContainer::Pdb))
return EC;
}
@@ -180,12 +178,10 @@ Error DbiModuleDescriptorBuilder::commit(BinaryStreamWriter &ModiWriter,
void DbiModuleDescriptorBuilder::addDebugSubsection(
std::shared_ptr<DebugSubsection> Subsection) {
assert(Subsection);
- C13Builders.push_back(std::make_unique<DebugSubsectionRecordBuilder>(
- std::move(Subsection), CodeViewContainer::Pdb));
+ C13Builders.push_back(DebugSubsectionRecordBuilder(std::move(Subsection)));
}
void DbiModuleDescriptorBuilder::addDebugSubsection(
const DebugSubsectionRecord &SubsectionContents) {
- C13Builders.push_back(std::make_unique<DebugSubsectionRecordBuilder>(
- SubsectionContents, CodeViewContainer::Pdb));
+ C13Builders.push_back(DebugSubsectionRecordBuilder(SubsectionContents));
}
diff --git a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
index 0e00c2f7ff98..627aef7506fd 100644
--- a/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/DbiStreamBuilder.cpp
@@ -58,10 +58,6 @@ void DbiStreamBuilder::setMachineType(COFF::MachineTypes M) {
MachineType = static_cast<pdb::PDB_Machine>(static_cast<unsigned>(M));
}
-void DbiStreamBuilder::setSectionMap(ArrayRef<SecMapEntry> SecMap) {
- SectionMap = SecMap;
-}
-
void DbiStreamBuilder::setGlobalsStreamIndex(uint32_t Index) {
GlobalsStreamIndex = Index;
}
@@ -348,19 +344,18 @@ static uint16_t toSecMapFlags(uint32_t Flags) {
return Ret;
}
-// A utility function to create a Section Map for a given list of COFF sections.
+// Populate the Section Map from COFF section headers.
//
// A Section Map seems to be a copy of the COFF section list in another format.
// I don't know why a PDB file contains both a COFF section header and
// a Section Map, but it seems it must be present in a PDB.
-std::vector<SecMapEntry> DbiStreamBuilder::createSectionMap(
+void DbiStreamBuilder::createSectionMap(
ArrayRef<llvm::object::coff_section> SecHdrs) {
- std::vector<SecMapEntry> Ret;
int Idx = 0;
auto Add = [&]() -> SecMapEntry & {
- Ret.emplace_back();
- auto &Entry = Ret.back();
+ SectionMap.emplace_back();
+ auto &Entry = SectionMap.back();
memset(&Entry, 0, sizeof(Entry));
Entry.Frame = Idx + 1;
@@ -384,8 +379,6 @@ std::vector<SecMapEntry> DbiStreamBuilder::createSectionMap(
Entry.Flags = static_cast<uint16_t>(OMFSegDescFlags::AddressIs32Bit) |
static_cast<uint16_t>(OMFSegDescFlags::IsAbsoluteAddress);
Entry.SecByteLength = UINT32_MAX;
-
- return Ret;
}
Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
@@ -417,7 +410,7 @@ Error DbiStreamBuilder::commit(const msf::MSFLayout &Layout,
SecMapHeader SMHeader = {Size, Size};
if (auto EC = Writer.writeObject(SMHeader))
return EC;
- if (auto EC = Writer.writeArray(SectionMap))
+ if (auto EC = Writer.writeArray(makeArrayRef(SectionMap)))
return EC;
}
diff --git a/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp b/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp
index f5125393695b..37192ba36a04 100644
--- a/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/EnumTables.cpp
@@ -34,4 +34,4 @@ ArrayRef<EnumEntry<uint16_t>> getOMFSegMapDescFlagNames() {
return makeArrayRef(OMFSegMapDescFlagNames);
}
}
-} \ No newline at end of file
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
index 432f1e9b24d3..4e58489f1401 100644
--- a/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/GSIStreamBuilder.cpp
@@ -5,10 +5,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+//
+// The data structures defined in this file are based on the reference
+// implementation which is available at
+// https://github.com/Microsoft/microsoft-pdb/blob/master/PDB/dbi/gsi.cpp
+//
+//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/Native/GSIStreamBuilder.h"
-
-#include "llvm/ADT/DenseSet.h"
#include "llvm/DebugInfo/CodeView/RecordName.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
@@ -20,6 +24,7 @@
#include "llvm/DebugInfo/PDB/Native/Hash.h"
#include "llvm/Support/BinaryItemStream.h"
#include "llvm/Support/BinaryStreamWriter.h"
+#include "llvm/Support/Parallel.h"
#include "llvm/Support/xxhash.h"
#include <algorithm>
#include <vector>
@@ -29,53 +34,91 @@ using namespace llvm::msf;
using namespace llvm::pdb;
using namespace llvm::codeview;
+// Helper class for building the public and global PDB hash table buckets.
struct llvm::pdb::GSIHashStreamBuilder {
- struct SymbolDenseMapInfo {
- static inline CVSymbol getEmptyKey() {
- static CVSymbol Empty;
- return Empty;
- }
- static inline CVSymbol getTombstoneKey() {
- static CVSymbol Tombstone(
- DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
- return Tombstone;
- }
- static unsigned getHashValue(const CVSymbol &Val) {
- return xxHash64(Val.RecordData);
- }
- static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
- return LHS.RecordData == RHS.RecordData;
- }
- };
+ // Sum of the size of all public or global records.
+ uint32_t RecordByteSize = 0;
- std::vector<CVSymbol> Records;
- uint32_t StreamIndex;
- llvm::DenseSet<CVSymbol, SymbolDenseMapInfo> SymbolHashes;
std::vector<PSHashRecord> HashRecords;
+
+ // The hash bitmap has `ceil((IPHR_HASH + 1) / 32)` words in it. The
+ // reference implementation builds a hash table with IPHR_HASH buckets in it.
+ // The last bucket is used to link together free hash table cells in a linked
+ // list, but it is always empty in the compressed, on-disk format. However,
+ // the bitmap must have a bit for it.
std::array<support::ulittle32_t, (IPHR_HASH + 32) / 32> HashBitmap;
+
std::vector<support::ulittle32_t> HashBuckets;
uint32_t calculateSerializedLength() const;
- uint32_t calculateRecordByteSize() const;
Error commit(BinaryStreamWriter &Writer);
- void finalizeBuckets(uint32_t RecordZeroOffset);
- template <typename T> void addSymbol(const T &Symbol, MSFBuilder &Msf) {
- T Copy(Symbol);
- addSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
- CodeViewContainer::Pdb));
- }
- void addSymbol(const CVSymbol &Symbol) {
- if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
- auto Iter = SymbolHashes.insert(Symbol);
- if (!Iter.second)
- return;
- }
+ void finalizePublicBuckets();
+ void finalizeGlobalBuckets(uint32_t RecordZeroOffset);
+
+ // Assign public and global symbol records into hash table buckets.
+ // Modifies the list of records to store the bucket index, but does not
+ // change the order.
+ void finalizeBuckets(uint32_t RecordZeroOffset,
+ MutableArrayRef<BulkPublic> Globals);
+};
- Records.push_back(Symbol);
+// DenseMapInfo implementation for deduplicating symbol records.
+struct llvm::pdb::SymbolDenseMapInfo {
+ static inline CVSymbol getEmptyKey() {
+ static CVSymbol Empty;
+ return Empty;
+ }
+ static inline CVSymbol getTombstoneKey() {
+ static CVSymbol Tombstone(
+ DenseMapInfo<ArrayRef<uint8_t>>::getTombstoneKey());
+ return Tombstone;
+ }
+ static unsigned getHashValue(const CVSymbol &Val) {
+ return xxHash64(Val.RecordData);
+ }
+ static bool isEqual(const CVSymbol &LHS, const CVSymbol &RHS) {
+ return LHS.RecordData == RHS.RecordData;
}
};
+namespace {
+LLVM_PACKED_START
+struct PublicSym32Layout {
+ RecordPrefix Prefix;
+ PublicSym32Header Pub;
+ // char Name[];
+};
+LLVM_PACKED_END
+} // namespace
+
+// Calculate how much memory this public needs when serialized.
+static uint32_t sizeOfPublic(const BulkPublic &Pub) {
+ uint32_t NameLen = Pub.NameLen;
+ NameLen = std::min(NameLen,
+ uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
+ return alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
+}
+
+static CVSymbol serializePublic(uint8_t *Mem, const BulkPublic &Pub) {
+ // Assume the caller has allocated sizeOfPublic bytes.
+ uint32_t NameLen = std::min(
+ Pub.NameLen, uint32_t(MaxRecordLength - sizeof(PublicSym32Layout) - 1));
+ size_t Size = alignTo(sizeof(PublicSym32Layout) + NameLen + 1, 4);
+ assert(Size == sizeOfPublic(Pub));
+ auto *FixedMem = reinterpret_cast<PublicSym32Layout *>(Mem);
+ FixedMem->Prefix.RecordKind = static_cast<uint16_t>(codeview::S_PUB32);
+ FixedMem->Prefix.RecordLen = static_cast<uint16_t>(Size - 2);
+ FixedMem->Pub.Flags = Pub.Flags;
+ FixedMem->Pub.Offset = Pub.Offset;
+ FixedMem->Pub.Segment = Pub.Segment;
+ char *NameMem = reinterpret_cast<char *>(FixedMem + 1);
+ memcpy(NameMem, Pub.Name, NameLen);
+ // Zero the null terminator and remaining bytes.
+ memset(&NameMem[NameLen], 0, Size - sizeof(PublicSym32Layout) - NameLen);
+ return CVSymbol(makeArrayRef(reinterpret_cast<uint8_t *>(Mem), Size));
+}
+
uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
uint32_t Size = sizeof(GSIHashHeader);
Size += HashRecords.size() * sizeof(PSHashRecord);
@@ -84,13 +127,6 @@ uint32_t GSIHashStreamBuilder::calculateSerializedLength() const {
return Size;
}
-uint32_t GSIHashStreamBuilder::calculateRecordByteSize() const {
- uint32_t Size = 0;
- for (const auto &Sym : Records)
- Size += Sym.length();
- return Size;
-}
-
Error GSIHashStreamBuilder::commit(BinaryStreamWriter &Writer) {
GSIHashHeader Header;
Header.VerSignature = GSIHashHeader::HdrSignature;
@@ -115,70 +151,134 @@ static bool isAsciiString(StringRef S) {
}
// See `caseInsensitiveComparePchPchCchCch` in gsi.cpp
-static bool gsiRecordLess(StringRef S1, StringRef S2) {
+static int gsiRecordCmp(StringRef S1, StringRef S2) {
size_t LS = S1.size();
size_t RS = S2.size();
// Shorter strings always compare less than longer strings.
if (LS != RS)
- return LS < RS;
+ return LS - RS;
// If either string contains non-ASCII characters, memcmp them.
if (LLVM_UNLIKELY(!isAsciiString(S1) || !isAsciiString(S2)))
- return memcmp(S1.data(), S2.data(), LS) < 0;
+ return memcmp(S1.data(), S2.data(), LS);
// Both strings are ASCII; perform a case-insensitive comparison.
- return S1.compare_lower(S2.data()) < 0;
+ return S1.compare_lower(S2.data());
+}
+
+void GSIStreamBuilder::finalizePublicBuckets() {
+ PSH->finalizeBuckets(0, Publics);
}
-void GSIHashStreamBuilder::finalizeBuckets(uint32_t RecordZeroOffset) {
- std::array<std::vector<std::pair<StringRef, PSHashRecord>>, IPHR_HASH + 1>
- TmpBuckets;
+void GSIStreamBuilder::finalizeGlobalBuckets(uint32_t RecordZeroOffset) {
+ // Build up a list of globals to be bucketed. Use the BulkPublic data
+ // structure for this purpose, even though these are global records, not
+ // public records. Most of the same fields are required:
+ // - Name
+ // - NameLen
+ // - SymOffset
+ // - BucketIdx
+ // The dead fields are Offset, Segment, and Flags.
+ std::vector<BulkPublic> Records;
+ Records.resize(Globals.size());
uint32_t SymOffset = RecordZeroOffset;
- for (const CVSymbol &Sym : Records) {
- PSHashRecord HR;
- // Add one when writing symbol offsets to disk. See GSI1::fixSymRecs.
- HR.Off = SymOffset + 1;
- HR.CRef = 1; // Always use a refcount of 1.
-
- // Hash the name to figure out which bucket this goes into.
- StringRef Name = getSymbolName(Sym);
- size_t BucketIdx = hashStringV1(Name) % IPHR_HASH;
- TmpBuckets[BucketIdx].push_back(std::make_pair(Name, HR));
- SymOffset += Sym.length();
+ for (size_t I = 0, E = Globals.size(); I < E; ++I) {
+ StringRef Name = getSymbolName(Globals[I]);
+ Records[I].Name = Name.data();
+ Records[I].NameLen = Name.size();
+ Records[I].SymOffset = SymOffset;
+ SymOffset += Globals[I].length();
+ }
+
+ GSH->finalizeBuckets(RecordZeroOffset, Records);
+}
+
+void GSIHashStreamBuilder::finalizeBuckets(
+ uint32_t RecordZeroOffset, MutableArrayRef<BulkPublic> Records) {
+ // Hash every name in parallel.
+ parallelForEachN(0, Records.size(), [&](size_t I) {
+ Records[I].setBucketIdx(hashStringV1(Records[I].Name) % IPHR_HASH);
+ });
+
+ // Count up the size of each bucket. Then, use an exclusive prefix sum to
+ // calculate the bucket start offsets. This is C++17 std::exclusive_scan, but
+ // we can't use it yet.
+ uint32_t BucketStarts[IPHR_HASH] = {0};
+ for (const BulkPublic &P : Records)
+ ++BucketStarts[P.BucketIdx];
+ uint32_t Sum = 0;
+ for (uint32_t &B : BucketStarts) {
+ uint32_t Size = B;
+ B = Sum;
+ Sum += Size;
+ }
+
+ // Place globals into the hash table in bucket order. When placing a global,
+ // update the bucket start. Every hash table slot should be filled. Always use
+ // a refcount of one for now.
+ HashRecords.resize(Records.size());
+ uint32_t BucketCursors[IPHR_HASH];
+ memcpy(BucketCursors, BucketStarts, sizeof(BucketCursors));
+ for (int I = 0, E = Records.size(); I < E; ++I) {
+ uint32_t HashIdx = BucketCursors[Records[I].BucketIdx]++;
+ HashRecords[HashIdx].Off = I;
+ HashRecords[HashIdx].CRef = 1;
}
- // Compute the three tables: the hash records in bucket and chain order, the
- // bucket presence bitmap, and the bucket chain start offsets.
- HashRecords.reserve(Records.size());
- for (ulittle32_t &Word : HashBitmap)
- Word = 0;
- for (size_t BucketIdx = 0; BucketIdx < IPHR_HASH + 1; ++BucketIdx) {
- auto &Bucket = TmpBuckets[BucketIdx];
- if (Bucket.empty())
- continue;
- HashBitmap[BucketIdx / 32] |= 1U << (BucketIdx % 32);
-
- // Calculate what the offset of the first hash record in the chain would
- // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
- // each record would be 12 bytes. See HROffsetCalc in gsi.h.
- const int SizeOfHROffsetCalc = 12;
- ulittle32_t ChainStartOff =
- ulittle32_t(HashRecords.size() * SizeOfHROffsetCalc);
- HashBuckets.push_back(ChainStartOff);
-
- // Sort each bucket by memcmp of the symbol's name. It's important that
- // we use the same sorting algorithm as is used by the reference
- // implementation to ensure that the search for a record within a bucket
- // can properly early-out when it detects the record won't be found. The
- // algorithm used here corredsponds to the function
- // caseInsensitiveComparePchPchCchCch in the reference implementation.
- llvm::sort(Bucket, [](const std::pair<StringRef, PSHashRecord> &Left,
- const std::pair<StringRef, PSHashRecord> &Right) {
- return gsiRecordLess(Left.first, Right.first);
- });
-
- for (const auto &Entry : Bucket)
- HashRecords.push_back(Entry.second);
+ // Within the buckets, sort each bucket by memcmp of the symbol's name. It's
+ // important that we use the same sorting algorithm as is used by the
+ // reference implementation to ensure that the search for a record within a
+ // bucket can properly early-out when it detects the record won't be found.
+ // The algorithm used here corresponds to the function
+ // caseInsensitiveComparePchPchCchCch in the reference implementation.
+ parallelForEachN(0, IPHR_HASH, [&](size_t I) {
+ auto B = HashRecords.begin() + BucketStarts[I];
+ auto E = HashRecords.begin() + BucketCursors[I];
+ if (B == E)
+ return;
+ auto BucketCmp = [Records](const PSHashRecord &LHash,
+ const PSHashRecord &RHash) {
+ const BulkPublic &L = Records[uint32_t(LHash.Off)];
+ const BulkPublic &R = Records[uint32_t(RHash.Off)];
+ assert(L.BucketIdx == R.BucketIdx);
+ int Cmp = gsiRecordCmp(L.getName(), R.getName());
+ if (Cmp != 0)
+ return Cmp < 0;
+ // This comparison is necessary to make the sorting stable in the presence
+ // of two static globals with the same name. The easiest way to observe
+ // this is with S_LDATA32 records.
+ return L.SymOffset < R.SymOffset;
+ };
+ llvm::sort(B, E, BucketCmp);
+
+ // After we are done sorting, replace the global indices with the stream
+ // offsets of each global. Add one when writing symbol offsets to disk.
+ // See GSI1::fixSymRecs.
+ for (PSHashRecord &HRec : make_range(B, E))
+ HRec.Off = Records[uint32_t(HRec.Off)].SymOffset + 1;
+ });
+
+ // For each non-empty bucket, push the bucket start offset into HashBuckets
+ // and set a bit in the hash bitmap.
+ for (uint32_t I = 0; I < HashBitmap.size(); ++I) {
+ uint32_t Word = 0;
+ for (uint32_t J = 0; J < 32; ++J) {
+ // Skip empty buckets.
+ uint32_t BucketIdx = I * 32 + J;
+ if (BucketIdx >= IPHR_HASH ||
+ BucketStarts[BucketIdx] == BucketCursors[BucketIdx])
+ continue;
+ Word |= (1U << J);
+
+ // Calculate what the offset of the first hash record in the chain would
+ // be if it were inflated to contain 32-bit pointers. On a 32-bit system,
+ // each record would be 12 bytes. See HROffsetCalc in gsi.h.
+ const int SizeOfHROffsetCalc = 12;
+ ulittle32_t ChainStartOff =
+ ulittle32_t(BucketStarts[BucketIdx] * SizeOfHROffsetCalc);
+ HashBuckets.push_back(ChainStartOff);
+ }
+ HashBitmap[I] = Word;
}
}
@@ -192,7 +292,7 @@ uint32_t GSIStreamBuilder::calculatePublicsHashStreamSize() const {
uint32_t Size = 0;
Size += sizeof(PublicsStreamHeader);
Size += PSH->calculateSerializedLength();
- Size += PSH->Records.size() * sizeof(uint32_t); // AddrMap
+ Size += Publics.size() * sizeof(uint32_t); // AddrMap
// FIXME: Add thunk map and section offsets for incremental linking.
return Size;
@@ -204,103 +304,90 @@ uint32_t GSIStreamBuilder::calculateGlobalsHashStreamSize() const {
Error GSIStreamBuilder::finalizeMsfLayout() {
// First we write public symbol records, then we write global symbol records.
- uint32_t PSHZero = 0;
- uint32_t GSHZero = PSH->calculateRecordByteSize();
-
- PSH->finalizeBuckets(PSHZero);
- GSH->finalizeBuckets(GSHZero);
+ finalizePublicBuckets();
+ finalizeGlobalBuckets(PSH->RecordByteSize);
Expected<uint32_t> Idx = Msf.addStream(calculateGlobalsHashStreamSize());
if (!Idx)
return Idx.takeError();
- GSH->StreamIndex = *Idx;
+ GlobalsStreamIndex = *Idx;
+
Idx = Msf.addStream(calculatePublicsHashStreamSize());
if (!Idx)
return Idx.takeError();
- PSH->StreamIndex = *Idx;
+ PublicsStreamIndex = *Idx;
- uint32_t RecordBytes =
- GSH->calculateRecordByteSize() + PSH->calculateRecordByteSize();
+ uint32_t RecordBytes = PSH->RecordByteSize + GSH->RecordByteSize;
Idx = Msf.addStream(RecordBytes);
if (!Idx)
return Idx.takeError();
- RecordStreamIdx = *Idx;
+ RecordStreamIndex = *Idx;
return Error::success();
}
-static bool comparePubSymByAddrAndName(
- const std::pair<const CVSymbol *, const PublicSym32 *> &LS,
- const std::pair<const CVSymbol *, const PublicSym32 *> &RS) {
- if (LS.second->Segment != RS.second->Segment)
- return LS.second->Segment < RS.second->Segment;
- if (LS.second->Offset != RS.second->Offset)
- return LS.second->Offset < RS.second->Offset;
+void GSIStreamBuilder::addPublicSymbols(std::vector<BulkPublic> &&PublicsIn) {
+ assert(Publics.empty() && PSH->RecordByteSize == 0 &&
+ "publics can only be added once");
+ Publics = std::move(PublicsIn);
- return LS.second->Name < RS.second->Name;
-}
-
-/// Compute the address map. The address map is an array of symbol offsets
-/// sorted so that it can be binary searched by address.
-static std::vector<ulittle32_t> computeAddrMap(ArrayRef<CVSymbol> Records) {
- // Make a vector of pointers to the symbols so we can sort it by address.
- // Also gather the symbol offsets while we're at it.
-
- std::vector<PublicSym32> DeserializedPublics;
- std::vector<std::pair<const CVSymbol *, const PublicSym32 *>> PublicsByAddr;
- std::vector<uint32_t> SymOffsets;
- DeserializedPublics.reserve(Records.size());
- PublicsByAddr.reserve(Records.size());
- SymOffsets.reserve(Records.size());
+ // Sort the symbols by name. PDBs contain lots of symbols, so use parallelism.
+ parallelSort(Publics, [](const BulkPublic &L, const BulkPublic &R) {
+ return L.getName() < R.getName();
+ });
+ // Assign offsets and calculate the length of the public symbol records.
uint32_t SymOffset = 0;
- for (const CVSymbol &Sym : Records) {
- assert(Sym.kind() == SymbolKind::S_PUB32);
- DeserializedPublics.push_back(
- cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(Sym)));
- PublicsByAddr.emplace_back(&Sym, &DeserializedPublics.back());
- SymOffsets.push_back(SymOffset);
- SymOffset += Sym.length();
- }
- llvm::stable_sort(PublicsByAddr, comparePubSymByAddrAndName);
-
- // Fill in the symbol offsets in the appropriate order.
- std::vector<ulittle32_t> AddrMap;
- AddrMap.reserve(Records.size());
- for (auto &Sym : PublicsByAddr) {
- ptrdiff_t Idx = std::distance(Records.data(), Sym.first);
- assert(Idx >= 0 && size_t(Idx) < Records.size());
- AddrMap.push_back(ulittle32_t(SymOffsets[Idx]));
+ for (BulkPublic &Pub : Publics) {
+ Pub.SymOffset = SymOffset;
+ SymOffset += sizeOfPublic(Pub);
}
- return AddrMap;
-}
-uint32_t GSIStreamBuilder::getPublicsStreamIndex() const {
- return PSH->StreamIndex;
+ // Remember the length of the public stream records.
+ PSH->RecordByteSize = SymOffset;
}
-uint32_t GSIStreamBuilder::getGlobalsStreamIndex() const {
- return GSH->StreamIndex;
+void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
+ serializeAndAddGlobal(Sym);
}
-void GSIStreamBuilder::addPublicSymbol(const PublicSym32 &Pub) {
- PSH->addSymbol(Pub, Msf);
+void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
+ serializeAndAddGlobal(Sym);
}
-void GSIStreamBuilder::addGlobalSymbol(const ProcRefSym &Sym) {
- GSH->addSymbol(Sym, Msf);
+void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
+ serializeAndAddGlobal(Sym);
}
-void GSIStreamBuilder::addGlobalSymbol(const DataSym &Sym) {
- GSH->addSymbol(Sym, Msf);
+template <typename T>
+void GSIStreamBuilder::serializeAndAddGlobal(const T &Symbol) {
+ T Copy(Symbol);
+ addGlobalSymbol(SymbolSerializer::writeOneSymbol(Copy, Msf.getAllocator(),
+ CodeViewContainer::Pdb));
}
-void GSIStreamBuilder::addGlobalSymbol(const ConstantSym &Sym) {
- GSH->addSymbol(Sym, Msf);
+void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Symbol) {
+ // Ignore duplicate typedefs and constants.
+ if (Symbol.kind() == S_UDT || Symbol.kind() == S_CONSTANT) {
+ auto Iter = GlobalsSeen.insert(Symbol);
+ if (!Iter.second)
+ return;
+ }
+ GSH->RecordByteSize += Symbol.length();
+ Globals.push_back(Symbol);
}
-void GSIStreamBuilder::addGlobalSymbol(const codeview::CVSymbol &Sym) {
- GSH->addSymbol(Sym);
+// Serialize each public and write it.
+static Error writePublics(BinaryStreamWriter &Writer,
+ ArrayRef<BulkPublic> Publics) {
+ std::vector<uint8_t> Storage;
+ for (const BulkPublic &Pub : Publics) {
+ Storage.resize(sizeOfPublic(Pub));
+ serializePublic(Storage.data(), Pub);
+ if (Error E = Writer.writeBytes(Storage))
+ return E;
+ }
+ return Error::success();
}
static Error writeRecords(BinaryStreamWriter &Writer,
@@ -318,14 +405,42 @@ Error GSIStreamBuilder::commitSymbolRecordStream(
// Write public symbol records first, followed by global symbol records. This
// must match the order that we assume in finalizeMsfLayout when computing
// PSHZero and GSHZero.
- if (auto EC = writeRecords(Writer, PSH->Records))
+ if (auto EC = writePublics(Writer, Publics))
return EC;
- if (auto EC = writeRecords(Writer, GSH->Records))
+ if (auto EC = writeRecords(Writer, Globals))
return EC;
return Error::success();
}
+static std::vector<support::ulittle32_t>
+computeAddrMap(ArrayRef<BulkPublic> Publics) {
+ // Build a parallel vector of indices into the Publics vector, and sort it by
+ // address.
+ std::vector<ulittle32_t> PubAddrMap;
+ PubAddrMap.reserve(Publics.size());
+ for (int I = 0, E = Publics.size(); I < E; ++I)
+ PubAddrMap.push_back(ulittle32_t(I));
+
+ auto AddrCmp = [Publics](const ulittle32_t &LIdx, const ulittle32_t &RIdx) {
+ const BulkPublic &L = Publics[LIdx];
+ const BulkPublic &R = Publics[RIdx];
+ if (L.Segment != R.Segment)
+ return L.Segment < R.Segment;
+ if (L.Offset != R.Offset)
+ return L.Offset < R.Offset;
+ // parallelSort is unstable, so we have to do name comparison to ensure
+ // that two names for the same location come out in a deterministic order.
+ return L.getName() < R.getName();
+ };
+ parallelSort(PubAddrMap, AddrCmp);
+
+ // Rewrite the public symbol indices into symbol offsets.
+ for (ulittle32_t &Entry : PubAddrMap)
+ Entry = Publics[Entry].SymOffset;
+ return PubAddrMap;
+}
+
Error GSIStreamBuilder::commitPublicsHashStream(
WritableBinaryStreamRef Stream) {
BinaryStreamWriter Writer(Stream);
@@ -333,7 +448,7 @@ Error GSIStreamBuilder::commitPublicsHashStream(
// FIXME: Fill these in. They are for incremental linking.
Header.SymHash = PSH->calculateSerializedLength();
- Header.AddrMap = PSH->Records.size() * 4;
+ Header.AddrMap = Publics.size() * 4;
Header.NumThunks = 0;
Header.SizeOfThunk = 0;
Header.ISectThunkTable = 0;
@@ -346,8 +461,9 @@ Error GSIStreamBuilder::commitPublicsHashStream(
if (auto EC = PSH->commit(Writer))
return EC;
- std::vector<ulittle32_t> AddrMap = computeAddrMap(PSH->Records);
- if (auto EC = Writer.writeArray(makeArrayRef(AddrMap)))
+ std::vector<support::ulittle32_t> PubAddrMap = computeAddrMap(Publics);
+ assert(PubAddrMap.size() == Publics.size());
+ if (auto EC = Writer.writeArray(makeArrayRef(PubAddrMap)))
return EC;
return Error::success();
@@ -366,7 +482,7 @@ Error GSIStreamBuilder::commit(const msf::MSFLayout &Layout,
auto PS = WritableMappedBlockStream::createIndexedStream(
Layout, Buffer, getPublicsStreamIndex(), Msf.getAllocator());
auto PRS = WritableMappedBlockStream::createIndexedStream(
- Layout, Buffer, getRecordStreamIdx(), Msf.getAllocator());
+ Layout, Buffer, getRecordStreamIndex(), Msf.getAllocator());
if (auto EC = commitSymbolRecordStream(*PRS))
return EC;
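finalizeBuckets() above is essentially a counting sort: hash every record in parallel, count how many land in each of the IPHR_HASH buckets, turn the counts into start offsets with an exclusive prefix sum, then place records by bumping a per-bucket cursor. A reduced sequential sketch of that placement pattern (generic indices, not the PDB types):

  #include <cstdint>
  #include <vector>

  // BucketIdx[i] is the bucket of item i and must be < NumBuckets. On return,
  // Order holds the item indices grouped by bucket (stable within a bucket),
  // and the result gives the first slot owned by each bucket.
  static std::vector<uint32_t> bucketize(const std::vector<uint32_t> &BucketIdx,
                                         size_t NumBuckets,
                                         std::vector<uint32_t> &Order) {
    std::vector<uint32_t> Starts(NumBuckets, 0);
    for (uint32_t B : BucketIdx)
      ++Starts[B];               // counts per bucket
    uint32_t Sum = 0;
    for (uint32_t &S : Starts) { // exclusive prefix sum
      uint32_t Count = S;
      S = Sum;
      Sum += Count;
    }
    Order.assign(BucketIdx.size(), 0);
    std::vector<uint32_t> Cursor = Starts;
    for (uint32_t I = 0; I < BucketIdx.size(); ++I)
      Order[Cursor[BucketIdx[I]]++] = I;
    return Starts;
  }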
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
index 39ae84acba20..7717f062eac1 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeCompilandSymbol.cpp
@@ -49,11 +49,11 @@ SymIndexId NativeCompilandSymbol::getLexicalParentId() const { return 0; }
// this potential confusion.
std::string NativeCompilandSymbol::getLibraryName() const {
- return Module.getObjFileName();
+ return std::string(Module.getObjFileName());
}
std::string NativeCompilandSymbol::getName() const {
- return Module.getModuleName();
+ return std::string(Module.getModuleName());
}
} // namespace pdb
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
index 2f6a5bc3d574..7a258acbd7c0 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumInjectedSources.cpp
@@ -48,19 +48,19 @@ public:
std::string getFileName() const override {
StringRef Ret = cantFail(Strings.getStringForID(Entry.FileNI),
"InjectedSourceStream should have rejected this");
- return Ret;
+ return std::string(Ret);
}
std::string getObjectFileName() const override {
StringRef Ret = cantFail(Strings.getStringForID(Entry.ObjNI),
"InjectedSourceStream should have rejected this");
- return Ret;
+ return std::string(Ret);
}
std::string getVirtualFileName() const override {
StringRef Ret = cantFail(Strings.getStringForID(Entry.VFileNI),
"InjectedSourceStream should have rejected this");
- return Ret;
+ return std::string(Ret);
}
uint32_t getCompression() const override { return Entry.Compression; }
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp
new file mode 100644
index 000000000000..1e4b07646335
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeEnumLineNumbers.cpp
@@ -0,0 +1,42 @@
+//==- NativeEnumLineNumbers.cpp - Native Type Enumerator impl ----*- C++ -*-==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
+
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
+#include "llvm/DebugInfo/CodeView/DebugSubsectionRecord.h"
+#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
+#include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+NativeEnumLineNumbers::NativeEnumLineNumbers(
+ std::vector<NativeLineNumber> LineNums)
+ : Lines(std::move(LineNums)), Index(0) {}
+
+uint32_t NativeEnumLineNumbers::getChildCount() const {
+ return static_cast<uint32_t>(Lines.size());
+}
+
+std::unique_ptr<IPDBLineNumber>
+NativeEnumLineNumbers::getChildAtIndex(uint32_t N) const {
+ if (N >= getChildCount())
+ return nullptr;
+ return std::make_unique<NativeLineNumber>(Lines[N]);
+}
+
+std::unique_ptr<IPDBLineNumber> NativeEnumLineNumbers::getNext() {
+ return getChildAtIndex(Index++);
+}
+
+void NativeEnumLineNumbers::reset() { Index = 0; }
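Like the other IPDBEnumChildren implementations, this enumerator is consumed either by index or by calling getNext() until it returns null. A hedged usage sketch (the enumerator is assumed to come from one of the session's line-number queries):

  #include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
  #include "llvm/DebugInfo/PDB/IPDBLineNumber.h"
  #include "llvm/Support/raw_ostream.h"
  #include <memory>

  static void printLineNumbers(
      llvm::pdb::IPDBEnumChildren<llvm::pdb::IPDBLineNumber> &Enum,
      llvm::raw_ostream &OS) {
    Enum.reset();
    while (std::unique_ptr<llvm::pdb::IPDBLineNumber> LN = Enum.getNext())
      OS << "line " << LN->getLineNumber() << " at RVA "
         << LN->getRelativeVirtualAddress() << '\n';
  }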
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
index 3f393409129b..895f8943157a 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeExeSymbol.cpp
@@ -73,7 +73,7 @@ uint32_t NativeExeSymbol::getAge() const {
}
std::string NativeExeSymbol::getSymbolsFileName() const {
- return Session.getPDBFile().getFilePath();
+ return std::string(Session.getPDBFile().getFilePath());
}
codeview::GUID NativeExeSymbol::getGuid() const {
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp
new file mode 100644
index 000000000000..2537daa7493c
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeFunctionSymbol.cpp
@@ -0,0 +1,57 @@
+//===- NativeFunctionSymbol.cpp - info about function symbols----*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h"
+
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+NativeFunctionSymbol::NativeFunctionSymbol(NativeSession &Session,
+ SymIndexId Id,
+ const codeview::ProcSym &Sym)
+ : NativeRawSymbol(Session, PDB_SymType::Data, Id), Sym(Sym) {}
+
+NativeFunctionSymbol::~NativeFunctionSymbol() {}
+
+void NativeFunctionSymbol::dump(raw_ostream &OS, int Indent,
+ PdbSymbolIdField ShowIdFields,
+ PdbSymbolIdField RecurseIdFields) const {
+ NativeRawSymbol::dump(OS, Indent, ShowIdFields, RecurseIdFields);
+ dumpSymbolField(OS, "name", getName(), Indent);
+ dumpSymbolField(OS, "length", getLength(), Indent);
+ dumpSymbolField(OS, "offset", getAddressOffset(), Indent);
+ dumpSymbolField(OS, "section", getAddressSection(), Indent);
+}
+
+uint32_t NativeFunctionSymbol::getAddressOffset() const {
+ return Sym.CodeOffset;
+}
+
+uint32_t NativeFunctionSymbol::getAddressSection() const { return Sym.Segment; }
+std::string NativeFunctionSymbol::getName() const {
+ return std::string(Sym.Name);
+}
+
+PDB_SymType NativeFunctionSymbol::getSymTag() const {
+ return PDB_SymType::Function;
+}
+
+uint64_t NativeFunctionSymbol::getLength() const { return Sym.CodeSize; }
+
+uint32_t NativeFunctionSymbol::getRelativeVirtualAddress() const {
+ return Session.getRVAFromSectOffset(Sym.Segment, Sym.CodeOffset);
+}
+
+uint64_t NativeFunctionSymbol::getVirtualAddress() const {
+ return Session.getVAFromSectOffset(Sym.Segment, Sym.CodeOffset);
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
new file mode 100644
index 000000000000..2535e09baf62
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeLineNumber.cpp
@@ -0,0 +1,50 @@
+//===- NativeLineNumber.cpp - Native line number implementation -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeLineNumber.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+NativeLineNumber::NativeLineNumber(const NativeSession &Session,
+ const codeview::LineInfo Line,
+ uint32_t ColumnNumber, uint32_t Section,
+ uint32_t Offset, uint32_t Length,
+ uint32_t SrcFileId)
+ : Session(Session), Line(Line), ColumnNumber(ColumnNumber),
+ Section(Section), Offset(Offset), Length(Length), SrcFileId(SrcFileId) {}
+
+uint32_t NativeLineNumber::getLineNumber() const { return Line.getStartLine(); }
+
+uint32_t NativeLineNumber::getLineNumberEnd() const {
+ return Line.getEndLine();
+}
+
+uint32_t NativeLineNumber::getColumnNumber() const { return ColumnNumber; }
+
+uint32_t NativeLineNumber::getColumnNumberEnd() const { return 0; }
+
+uint32_t NativeLineNumber::getAddressSection() const { return Section; }
+
+uint32_t NativeLineNumber::getAddressOffset() const { return Offset; }
+
+uint32_t NativeLineNumber::getRelativeVirtualAddress() const {
+ return Session.getRVAFromSectOffset(Section, Offset);
+}
+
+uint64_t NativeLineNumber::getVirtualAddress() const {
+ return Session.getVAFromSectOffset(Section, Offset);
+}
+
+uint32_t NativeLineNumber::getLength() const { return Length; }
+
+uint32_t NativeLineNumber::getSourceFileId() const { return SrcFileId; }
+
+uint32_t NativeLineNumber::getCompilandId() const { return 0; }
+
+bool NativeLineNumber::isStatement() const { return Line.isStatement(); }
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp b/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
new file mode 100644
index 000000000000..7086af7e67a2
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/NativePublicSymbol.cpp
@@ -0,0 +1,52 @@
+//===- NativePublicSymbol.cpp - info about public symbols -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativePublicSymbol.h"
+
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/DebugInfo/PDB/Native/NativeTypeBuiltin.h"
+#include "llvm/DebugInfo/PDB/Native/NativeTypeEnum.h"
+
+using namespace llvm;
+using namespace llvm::codeview;
+using namespace llvm::pdb;
+
+NativePublicSymbol::NativePublicSymbol(NativeSession &Session, SymIndexId Id,
+ const codeview::PublicSym32 &Sym)
+ : NativeRawSymbol(Session, PDB_SymType::Data, Id), Sym(Sym) {}
+
+NativePublicSymbol::~NativePublicSymbol() {}
+
+void NativePublicSymbol::dump(raw_ostream &OS, int Indent,
+ PdbSymbolIdField ShowIdFields,
+ PdbSymbolIdField RecurseIdFields) const {
+ NativeRawSymbol::dump(OS, Indent, ShowIdFields, RecurseIdFields);
+ dumpSymbolField(OS, "name", getName(), Indent);
+ dumpSymbolField(OS, "offset", getAddressOffset(), Indent);
+ dumpSymbolField(OS, "section", getAddressSection(), Indent);
+}
+
+uint32_t NativePublicSymbol::getAddressOffset() const { return Sym.Offset; }
+
+uint32_t NativePublicSymbol::getAddressSection() const { return Sym.Segment; }
+
+std::string NativePublicSymbol::getName() const {
+ return std::string(Sym.Name);
+}
+
+PDB_SymType NativePublicSymbol::getSymTag() const {
+ return PDB_SymType::PublicSymbol;
+}
+
+uint32_t NativePublicSymbol::getRelativeVirtualAddress() const {
+ return Session.getRVAFromSectOffset(Sym.Segment, Sym.Offset);
+}
+
+uint64_t NativePublicSymbol::getVirtualAddress() const {
+ return Session.getVAFromSectOffset(Sym.Segment, Sym.Offset);
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
index b45a5881dcb5..ac8449df44ff 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeSession.cpp
@@ -12,6 +12,7 @@
#include "llvm/DebugInfo/CodeView/TypeIndex.h"
#include "llvm/DebugInfo/PDB/IPDBEnumChildren.h"
#include "llvm/DebugInfo/PDB/IPDBSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
#include "llvm/DebugInfo/PDB/Native/NativeEnumInjectedSources.h"
#include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
@@ -25,11 +26,14 @@
#include "llvm/DebugInfo/PDB/PDBSymbolCompiland.h"
#include "llvm/DebugInfo/PDB/PDBSymbolExe.h"
#include "llvm/DebugInfo/PDB/PDBSymbolTypeEnum.h"
+#include "llvm/Object/COFF.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/BinaryByteStream.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Path.h"
#include <algorithm>
#include <cassert>
@@ -75,14 +79,125 @@ Error NativeSession::createFromPdb(std::unique_ptr<MemoryBuffer> Buffer,
return Error::success();
}
-Error NativeSession::createFromExe(StringRef Path,
+static Expected<std::unique_ptr<PDBFile>>
+loadPdbFile(StringRef PdbPath, std::unique_ptr<BumpPtrAllocator> &Allocator) {
+ ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
+ MemoryBuffer::getFile(PdbPath, /*FileSize=*/-1,
+ /*RequiresNullTerminator=*/false);
+ if (!ErrorOrBuffer)
+ return make_error<RawError>(ErrorOrBuffer.getError());
+ std::unique_ptr<llvm::MemoryBuffer> Buffer = std::move(*ErrorOrBuffer);
+
+ PdbPath = Buffer->getBufferIdentifier();
+ file_magic Magic;
+ auto EC = identify_magic(PdbPath, Magic);
+ if (EC || Magic != file_magic::pdb)
+ return make_error<RawError>(EC);
+
+ auto Stream = std::make_unique<MemoryBufferByteStream>(std::move(Buffer),
+ llvm::support::little);
+
+ auto File = std::make_unique<PDBFile>(PdbPath, std::move(Stream), *Allocator);
+ if (auto EC = File->parseFileHeaders())
+ return std::move(EC);
+
+ if (auto EC = File->parseStreamData())
+ return std::move(EC);
+
+ return std::move(File);
+}
+
+Error NativeSession::createFromPdbPath(StringRef PdbPath,
+ std::unique_ptr<IPDBSession> &Session) {
+ auto Allocator = std::make_unique<BumpPtrAllocator>();
+ auto PdbFile = loadPdbFile(PdbPath, Allocator);
+ if (!PdbFile)
+ return PdbFile.takeError();
+
+ Session = std::make_unique<NativeSession>(std::move(PdbFile.get()),
+ std::move(Allocator));
+ return Error::success();
+}
+
+static Expected<std::string> getPdbPathFromExe(StringRef ExePath) {
+ Expected<object::OwningBinary<object::Binary>> BinaryFile =
+ object::createBinary(ExePath);
+ if (!BinaryFile)
+ return BinaryFile.takeError();
+
+ const object::COFFObjectFile *ObjFile =
+ dyn_cast<object::COFFObjectFile>(BinaryFile->getBinary());
+ if (!ObjFile)
+ return make_error<RawError>(raw_error_code::invalid_format);
+
+ StringRef PdbPath;
+ const llvm::codeview::DebugInfo *PdbInfo = nullptr;
+ if (Error E = ObjFile->getDebugPDBInfo(PdbInfo, PdbPath))
+ return std::move(E);
+
+ return std::string(PdbPath);
+}
+
+Error NativeSession::createFromExe(StringRef ExePath,
std::unique_ptr<IPDBSession> &Session) {
- return make_error<RawError>(raw_error_code::feature_unsupported);
+ Expected<std::string> PdbPath = getPdbPathFromExe(ExePath);
+ if (!PdbPath)
+ return PdbPath.takeError();
+
+ file_magic Magic;
+ auto EC = identify_magic(PdbPath.get(), Magic);
+ if (EC || Magic != file_magic::pdb)
+ return make_error<RawError>(EC);
+
+ auto Allocator = std::make_unique<BumpPtrAllocator>();
+ auto File = loadPdbFile(PdbPath.get(), Allocator);
+ if (!File)
+ return File.takeError();
+
+ Session = std::make_unique<NativeSession>(std::move(File.get()),
+ std::move(Allocator));
+
+ return Error::success();
}
-uint64_t NativeSession::getLoadAddress() const { return 0; }
+Expected<std::string>
+NativeSession::searchForPdb(const PdbSearchOptions &Opts) {
+ Expected<std::string> PathOrErr = getPdbPathFromExe(Opts.ExePath);
+ if (!PathOrErr)
+ return PathOrErr.takeError();
+ StringRef PathFromExe = PathOrErr.get();
+ sys::path::Style Style = PathFromExe.startswith("/")
+ ? sys::path::Style::posix
+ : sys::path::Style::windows;
+ StringRef PdbName = sys::path::filename(PathFromExe, Style);
+
+ // Check if pdb exists in the executable directory.
+ SmallString<128> PdbPath = StringRef(Opts.ExePath);
+ sys::path::remove_filename(PdbPath);
+ sys::path::append(PdbPath, PdbName);
-bool NativeSession::setLoadAddress(uint64_t Address) { return false; }
+ auto Allocator = std::make_unique<BumpPtrAllocator>();
+
+ if (auto File = loadPdbFile(PdbPath, Allocator))
+ return std::string(PdbPath);
+ else
+ consumeError(File.takeError());
+
+  // Check the path that was embedded in the executable.
+  if (auto File = loadPdbFile(PathFromExe, Allocator))
+    return std::string(PathFromExe);
+  else
+    return File.takeError();
+}
+
+uint64_t NativeSession::getLoadAddress() const { return LoadAddress; }
+
+bool NativeSession::setLoadAddress(uint64_t Address) {
+ LoadAddress = Address;
+ return true;
+}
std::unique_ptr<PDBSymbolExe> NativeSession::getGlobalScope() {
return PDBSymbol::createAs<PDBSymbolExe>(*this, getNativeGlobalScope());
@@ -95,28 +210,52 @@ NativeSession::getSymbolById(SymIndexId SymbolId) const {
bool NativeSession::addressForVA(uint64_t VA, uint32_t &Section,
uint32_t &Offset) const {
- return false;
+ uint32_t RVA = VA - getLoadAddress();
+ return addressForRVA(RVA, Section, Offset);
}
-bool NativeSession::addressForRVA(uint32_t VA, uint32_t &Section,
+bool NativeSession::addressForRVA(uint32_t RVA, uint32_t &Section,
uint32_t &Offset) const {
- return false;
+ Section = 0;
+ Offset = 0;
+
+ auto Dbi = Pdb->getPDBDbiStream();
+ if (!Dbi)
+ return false;
+
+ if ((int32_t)RVA < 0)
+ return true;
+
+ Offset = RVA;
+ for (; Section < Dbi->getSectionHeaders().size(); ++Section) {
+ auto &Sec = Dbi->getSectionHeaders()[Section];
+ if (RVA < Sec.VirtualAddress)
+ return true;
+ Offset = RVA - Sec.VirtualAddress;
+ }
+ return true;
}
std::unique_ptr<PDBSymbol>
-NativeSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) const {
- return nullptr;
+NativeSession::findSymbolByAddress(uint64_t Address, PDB_SymType Type) {
+ uint32_t Section;
+ uint32_t Offset;
+ addressForVA(Address, Section, Offset);
+ return findSymbolBySectOffset(Section, Offset, Type);
}
-std::unique_ptr<PDBSymbol>
-NativeSession::findSymbolByRVA(uint32_t RVA, PDB_SymType Type) const {
- return nullptr;
+std::unique_ptr<PDBSymbol> NativeSession::findSymbolByRVA(uint32_t RVA,
+ PDB_SymType Type) {
+ uint32_t Section;
+ uint32_t Offset;
+ addressForRVA(RVA, Section, Offset);
+ return findSymbolBySectOffset(Section, Offset, Type);
}
std::unique_ptr<PDBSymbol>
NativeSession::findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
- PDB_SymType Type) const {
- return nullptr;
+ PDB_SymType Type) {
+ return Cache.findSymbolBySectOffset(Sect, Offset, Type);
}
std::unique_ptr<IPDBEnumLineNumbers>
@@ -128,18 +267,19 @@ NativeSession::findLineNumbers(const PDBSymbolCompiland &Compiland,
std::unique_ptr<IPDBEnumLineNumbers>
NativeSession::findLineNumbersByAddress(uint64_t Address,
uint32_t Length) const {
- return nullptr;
+ return Cache.findLineNumbersByVA(Address, Length);
}
std::unique_ptr<IPDBEnumLineNumbers>
NativeSession::findLineNumbersByRVA(uint32_t RVA, uint32_t Length) const {
- return nullptr;
+ return findLineNumbersByAddress(getLoadAddress() + RVA, Length);
}
std::unique_ptr<IPDBEnumLineNumbers>
NativeSession::findLineNumbersBySectOffset(uint32_t Section, uint32_t Offset,
uint32_t Length) const {
- return nullptr;
+ uint64_t VA = getVAFromSectOffset(Section, Offset);
+ return findLineNumbersByAddress(VA, Length);
}
std::unique_ptr<IPDBEnumSourceFiles>
@@ -179,7 +319,7 @@ std::unique_ptr<IPDBEnumSourceFiles> NativeSession::getSourceFilesForCompiland(
std::unique_ptr<IPDBSourceFile>
NativeSession::getSourceFileById(uint32_t FileId) const {
- return nullptr;
+ return Cache.getSourceFileById(FileId);
}
std::unique_ptr<IPDBEnumDataStreams> NativeSession::getDebugStreams() const {
@@ -225,3 +365,24 @@ NativeExeSymbol &NativeSession::getNativeGlobalScope() const {
return Cache.getNativeSymbolById<NativeExeSymbol>(ExeSymbol);
}
+
+uint32_t NativeSession::getRVAFromSectOffset(uint32_t Section,
+ uint32_t Offset) const {
+ if (Section <= 0)
+ return 0;
+
+ auto Dbi = getDbiStreamPtr(*Pdb);
+ if (!Dbi)
+ return 0;
+
+ uint32_t MaxSection = Dbi->getSectionHeaders().size();
+ if (Section > MaxSection + 1)
+ Section = MaxSection + 1;
+ auto &Sec = Dbi->getSectionHeaders()[Section - 1];
+ return Sec.VirtualAddress + Offset;
+}
+
+uint64_t NativeSession::getVAFromSectOffset(uint32_t Section,
+ uint32_t Offset) const {
+ return LoadAddress + getRVAFromSectOffset(Section, Offset);
+}
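
The two helpers above are plain COFF section arithmetic; a standalone sketch of the same mapping (SectionHeader is a stand-in for the DBI stream's section header entries, and unlike the real code, which clamps an out-of-range section index, this sketch simply returns 0):

#include <cstdint>
#include <vector>

// Stand-in for the records returned by DbiStream::getSectionHeaders().
struct SectionHeader {
  uint32_t VirtualAddress; // section start, relative to the image base
};

// RVA = start of the 1-based section plus the offset into it; 0 means
// "no address" (section index 0 is not a valid section).
uint32_t rvaFromSectOffset(const std::vector<SectionHeader> &Sections,
                           uint32_t Section, uint32_t Offset) {
  if (Section == 0 || Section > Sections.size())
    return 0;
  return Sections[Section - 1].VirtualAddress + Offset;
}

// VA adds the load address registered through setLoadAddress().
uint64_t vaFromSectOffset(const std::vector<SectionHeader> &Sections,
                          uint64_t LoadAddress, uint32_t Section,
                          uint32_t Offset) {
  return LoadAddress + rvaFromSectOffset(Sections, Section, Offset);
}
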
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp
new file mode 100644
index 000000000000..6473207e058a
--- /dev/null
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeSourceFile.cpp
@@ -0,0 +1,47 @@
+//===- NativeSourceFile.cpp - Native source file implementation -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/DebugInfo/PDB/Native/NativeSourceFile.h"
+#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
+
+using namespace llvm;
+using namespace llvm::pdb;
+
+NativeSourceFile::NativeSourceFile(NativeSession &Session, uint32_t FileId,
+ const codeview::FileChecksumEntry &Checksum)
+ : Session(Session), FileId(FileId), Checksum(Checksum) {}
+
+std::string NativeSourceFile::getFileName() const {
+ auto ST = Session.getPDBFile().getStringTable();
+ if (!ST) {
+ consumeError(ST.takeError());
+ return "";
+ }
+ auto FileName = ST->getStringTable().getString(Checksum.FileNameOffset);
+ if (!FileName) {
+ consumeError(FileName.takeError());
+ return "";
+ }
+
+ return std::string(FileName.get());
+}
+
+uint32_t NativeSourceFile::getUniqueId() const { return FileId; }
+
+std::string NativeSourceFile::getChecksum() const {
+ return toStringRef(Checksum.Checksum).str();
+}
+
+PDB_Checksum NativeSourceFile::getChecksumType() const {
+ return static_cast<PDB_Checksum>(Checksum.Kind);
+}
+
+std::unique_ptr<IPDBEnumChildren<PDBSymbolCompiland>>
+NativeSourceFile::getCompilands() const {
+ return nullptr;
+}
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
index 704c1254afbf..e5f1dcaf801e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeSymbolEnumerator.cpp
@@ -51,7 +51,9 @@ SymIndexId NativeSymbolEnumerator::getClassParentId() const {
SymIndexId NativeSymbolEnumerator::getLexicalParentId() const { return 0; }
-std::string NativeSymbolEnumerator::getName() const { return Record.Name; }
+std::string NativeSymbolEnumerator::getName() const {
+ return std::string(Record.Name);
+}
SymIndexId NativeSymbolEnumerator::getTypeId() const {
return Parent.getTypeId();
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp
index 80d455ad66e9..63ac9fae0e87 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeArray.cpp
@@ -63,4 +63,4 @@ SymIndexId NativeTypeArray::getTypeId() const {
Record.getElementType());
}
-uint64_t NativeTypeArray::getLength() const { return Record.Size; } \ No newline at end of file
+uint64_t NativeTypeArray::getLength() const { return Record.Size; }
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp
index 26ccb7daece0..aaec3a5e7c60 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeEnum.cpp
@@ -305,7 +305,7 @@ std::string NativeTypeEnum::getName() const {
if (UnmodifiedType)
return UnmodifiedType->getName();
- return Record->getName();
+ return std::string(Record->getName());
}
bool NativeTypeEnum::isNested() const {
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp
index 60b373282267..72964a9e0d4d 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeTypedef.cpp
@@ -20,7 +20,9 @@ void NativeTypeTypedef::dump(raw_ostream &OS, int Indent,
PdbSymbolIdField::Type, ShowIdFields, RecurseIdFields);
}
-std::string NativeTypeTypedef::getName() const { return Record.Name; }
+std::string NativeTypeTypedef::getName() const {
+ return std::string(Record.Name);
+}
SymIndexId NativeTypeTypedef::getTypeId() const {
return Session.getSymbolCache().findSymbolByTypeIndex(Record.Type);
diff --git a/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp b/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp
index be67846c0b24..b0be7f76e86e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/NativeTypeUDT.cpp
@@ -74,7 +74,7 @@ std::string NativeTypeUDT::getName() const {
if (UnmodifiedType)
return UnmodifiedType->getName();
- return Tag->getName();
+ return std::string(Tag->getName());
}
SymIndexId NativeTypeUDT::getLexicalParentId() const { return 0; }
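
This and several neighbouring hunks wrap StringRef values in std::string(...) so the copy no longer relies on StringRef's implicit conversion to std::string; a minimal illustration of the idiom:

#include "llvm/ADT/StringRef.h"
#include <string>

// Illustrative helper, not part of the patch: the copy from StringRef to
// std::string is now spelled out explicitly.
std::string copyName(llvm::StringRef Name) {
  return std::string(Name); // previously just: return Name;
}
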
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
index 9ac226b89139..cde645236851 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFile.cpp
@@ -41,7 +41,8 @@ typedef FixedStreamArray<support::ulittle32_t> ulittle_array;
PDBFile::PDBFile(StringRef Path, std::unique_ptr<BinaryStream> PdbFileBuffer,
BumpPtrAllocator &Allocator)
- : FilePath(Path), Allocator(Allocator), Buffer(std::move(PdbFileBuffer)) {}
+ : FilePath(std::string(Path)), Allocator(Allocator),
+ Buffer(std::move(PdbFileBuffer)) {}
PDBFile::~PDBFile() = default;
diff --git a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
index aa3288724390..deb0f201a71e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/PDBFileBuilder.cpp
@@ -7,9 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/DebugInfo/PDB/Native/PDBFileBuilder.h"
-
#include "llvm/ADT/BitVector.h"
-
#include "llvm/DebugInfo/MSF/MSFBuilder.h"
#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
#include "llvm/DebugInfo/PDB/Native/DbiStreamBuilder.h"
@@ -23,6 +21,7 @@
#include "llvm/Support/BinaryStream.h"
#include "llvm/Support/BinaryStreamWriter.h"
#include "llvm/Support/CRC.h"
+#include "llvm/Support/Chrono.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/xxhash.h"
@@ -95,7 +94,7 @@ Error PDBFileBuilder::addNamedStream(StringRef Name, StringRef Data) {
if (!ExpectedIndex)
return ExpectedIndex.takeError();
assert(NamedStreamData.count(*ExpectedIndex) == 0);
- NamedStreamData[*ExpectedIndex] = Data;
+ NamedStreamData[*ExpectedIndex] = std::string(Data);
return Error::success();
}
@@ -144,7 +143,7 @@ Error PDBFileBuilder::finalizeMsfLayout() {
if (Dbi) {
Dbi->setPublicsStreamIndex(Gsi->getPublicsStreamIndex());
Dbi->setGlobalsStreamIndex(Gsi->getGlobalsStreamIndex());
- Dbi->setSymbolRecordStreamIndex(Gsi->getRecordStreamIdx());
+ Dbi->setSymbolRecordStreamIndex(Gsi->getRecordStreamIndex());
}
}
if (Tpi) {
diff --git a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
index 5cdd628312fe..9f15907b519e 100644
--- a/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/SymbolCache.cpp
@@ -1,13 +1,18 @@
#include "llvm/DebugInfo/PDB/Native/SymbolCache.h"
+#include "llvm/DebugInfo/CodeView/DebugLinesSubsection.h"
#include "llvm/DebugInfo/CodeView/SymbolDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeDeserializer.h"
#include "llvm/DebugInfo/CodeView/TypeRecordHelpers.h"
#include "llvm/DebugInfo/PDB/Native/DbiStream.h"
#include "llvm/DebugInfo/PDB/Native/GlobalsStream.h"
+#include "llvm/DebugInfo/PDB/Native/ISectionContribVisitor.h"
#include "llvm/DebugInfo/PDB/Native/NativeCompilandSymbol.h"
#include "llvm/DebugInfo/PDB/Native/NativeEnumGlobals.h"
+#include "llvm/DebugInfo/PDB/Native/NativeEnumLineNumbers.h"
#include "llvm/DebugInfo/PDB/Native/NativeEnumTypes.h"
+#include "llvm/DebugInfo/PDB/Native/NativeFunctionSymbol.h"
+#include "llvm/DebugInfo/PDB/Native/NativePublicSymbol.h"
#include "llvm/DebugInfo/PDB/Native/NativeRawSymbol.h"
#include "llvm/DebugInfo/PDB/Native/NativeSession.h"
#include "llvm/DebugInfo/PDB/Native/NativeTypeArray.h"
@@ -19,6 +24,7 @@
#include "llvm/DebugInfo/PDB/Native/NativeTypeUDT.h"
#include "llvm/DebugInfo/PDB/Native/NativeTypeVTShape.h"
#include "llvm/DebugInfo/PDB/Native/PDBFile.h"
+#include "llvm/DebugInfo/PDB/Native/PublicsStream.h"
#include "llvm/DebugInfo/PDB/Native/SymbolStream.h"
#include "llvm/DebugInfo/PDB/Native/TpiStream.h"
#include "llvm/DebugInfo/PDB/PDBSymbol.h"
@@ -62,9 +68,10 @@ static const struct BuiltinTypeEntry {
};
SymbolCache::SymbolCache(NativeSession &Session, DbiStream *Dbi)
- : Session(Session), Dbi(Dbi) {
+ : Session(Session), Dbi(Dbi), AddrToModuleIndex(IMapAllocator) {
// Id 0 is reserved for the invalid symbol.
Cache.push_back(nullptr);
+ SourceFiles.push_back(nullptr);
if (Dbi)
Compilands.resize(Dbi->modules().getModuleCount());
@@ -281,6 +288,312 @@ SymIndexId SymbolCache::getOrCreateGlobalSymbolByOffset(uint32_t Offset) {
return Id;
}
+Expected<ModuleDebugStreamRef>
+SymbolCache::getModuleDebugStream(uint32_t Index) const {
+ assert(Dbi && "Dbi stream not present");
+
+ DbiModuleDescriptor Modi = Dbi->modules().getModuleDescriptor(Index);
+
+ uint16_t ModiStream = Modi.getModuleStreamIndex();
+ if (ModiStream == kInvalidStreamIndex)
+ return make_error<RawError>("Module stream not present");
+
+ std::unique_ptr<msf::MappedBlockStream> ModStreamData =
+ Session.getPDBFile().createIndexedStream(ModiStream);
+
+ ModuleDebugStreamRef ModS(Modi, std::move(ModStreamData));
+ if (auto EC = ModS.reload())
+ return std::move(EC);
+
+ return std::move(ModS);
+}
+
+std::unique_ptr<PDBSymbol>
+SymbolCache::findSymbolBySectOffset(uint32_t Sect, uint32_t Offset,
+ PDB_SymType Type) {
+ if (AddrToModuleIndex.empty())
+ parseSectionContribs();
+
+ switch (Type) {
+ case PDB_SymType::Function:
+ return findFunctionSymbolBySectOffset(Sect, Offset);
+ case PDB_SymType::PublicSymbol:
+ return findPublicSymbolBySectOffset(Sect, Offset);
+ case PDB_SymType::None: {
+ // FIXME: Implement for PDB_SymType::Data.
+ if (auto Sym = findFunctionSymbolBySectOffset(Sect, Offset))
+ return Sym;
+ return nullptr;
+ }
+ default:
+ return nullptr;
+ }
+}
+
+std::unique_ptr<PDBSymbol>
+SymbolCache::findFunctionSymbolBySectOffset(uint32_t Sect, uint32_t Offset) {
+ auto Iter = AddressToFunctionSymId.find({Sect, Offset});
+ if (Iter != AddressToFunctionSymId.end())
+ return getSymbolById(Iter->second);
+
+ if (!Dbi)
+ return nullptr;
+
+ auto Modi = getModuleIndexForAddr(Session.getVAFromSectOffset(Sect, Offset));
+ if (!Modi)
+ return nullptr;
+
+ auto ExpectedModS = getModuleDebugStream(*Modi);
+ if (!ExpectedModS) {
+ consumeError(ExpectedModS.takeError());
+ return nullptr;
+ }
+ CVSymbolArray Syms = ExpectedModS->getSymbolArray();
+
+ // Search for the symbol in this module.
+ for (auto I = Syms.begin(), E = Syms.end(); I != E; ++I) {
+ if (I->kind() != S_LPROC32 && I->kind() != S_GPROC32)
+ continue;
+ auto PS = cantFail(SymbolDeserializer::deserializeAs<ProcSym>(*I));
+ if (Sect == PS.Segment && Offset >= PS.CodeOffset &&
+ Offset < PS.CodeOffset + PS.CodeSize) {
+ SymIndexId Id = createSymbol<NativeFunctionSymbol>(PS);
+ AddressToFunctionSymId.insert({{Sect, Offset}, Id});
+ return getSymbolById(Id);
+ }
+
+ // Jump to the end of this ProcSym.
+ I = Syms.at(PS.End);
+ }
+ return nullptr;
+}
+
+std::unique_ptr<PDBSymbol>
+SymbolCache::findPublicSymbolBySectOffset(uint32_t Sect, uint32_t Offset) {
+ auto Iter = AddressToPublicSymId.find({Sect, Offset});
+ if (Iter != AddressToPublicSymId.end())
+ return getSymbolById(Iter->second);
+
+ auto Publics = Session.getPDBFile().getPDBPublicsStream();
+ if (!Publics)
+ return nullptr;
+
+ auto ExpectedSyms = Session.getPDBFile().getPDBSymbolStream();
+ if (!ExpectedSyms)
+ return nullptr;
+ BinaryStreamRef SymStream =
+ ExpectedSyms->getSymbolArray().getUnderlyingStream();
+
+  // Use binary search to find the first public symbol with an address greater
+  // than the given one, then step back to the last symbol at or before it.
+ auto AddrMap = Publics->getAddressMap();
+ auto First = AddrMap.begin();
+ auto It = AddrMap.begin();
+ size_t Count = AddrMap.size();
+ size_t Half;
+ while (Count > 0) {
+ It = First;
+ Half = Count / 2;
+ It += Half;
+ Expected<CVSymbol> Sym = readSymbolFromStream(SymStream, *It);
+ if (!Sym) {
+ consumeError(Sym.takeError());
+ return nullptr;
+ }
+
+ auto PS =
+ cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(Sym.get()));
+ if (PS.Segment < Sect || (PS.Segment == Sect && PS.Offset <= Offset)) {
+ First = ++It;
+ Count -= Half + 1;
+ } else
+ Count = Half;
+ }
+ if (It == AddrMap.begin())
+ return nullptr;
+ --It;
+
+ Expected<CVSymbol> Sym = readSymbolFromStream(SymStream, *It);
+ if (!Sym) {
+ consumeError(Sym.takeError());
+ return nullptr;
+ }
+ auto PS = cantFail(SymbolDeserializer::deserializeAs<PublicSym32>(Sym.get()));
+ SymIndexId Id = createSymbol<NativePublicSymbol>(PS);
+ AddressToPublicSymId.insert({{Sect, Offset}, Id});
+ return getSymbolById(Id);
+}
+
+std::vector<SymbolCache::LineTableEntry>
+SymbolCache::findLineTable(uint16_t Modi) const {
+ // Check if this module has already been added.
+ auto LineTableIter = LineTable.find(Modi);
+ if (LineTableIter != LineTable.end())
+ return LineTableIter->second;
+
+ std::vector<LineTableEntry> &ModuleLineTable = LineTable[Modi];
+
+ // If there is an error or there are no lines, just return the
+ // empty vector.
+ Expected<ModuleDebugStreamRef> ExpectedModS = getModuleDebugStream(Modi);
+ if (!ExpectedModS) {
+ consumeError(ExpectedModS.takeError());
+ return ModuleLineTable;
+ }
+
+ std::vector<std::vector<LineTableEntry>> EntryList;
+ for (const auto &SS : ExpectedModS->getSubsectionsArray()) {
+ if (SS.kind() != DebugSubsectionKind::Lines)
+ continue;
+
+ DebugLinesSubsectionRef Lines;
+ BinaryStreamReader Reader(SS.getRecordData());
+ if (auto EC = Lines.initialize(Reader)) {
+ consumeError(std::move(EC));
+ continue;
+ }
+
+ uint32_t RelocSegment = Lines.header()->RelocSegment;
+ uint32_t RelocOffset = Lines.header()->RelocOffset;
+ for (const LineColumnEntry &Group : Lines) {
+ if (Group.LineNumbers.empty())
+ continue;
+
+ std::vector<LineTableEntry> Entries;
+
+ // If there are column numbers, then they should be in a parallel stream
+ // to the line numbers.
+ auto ColIt = Group.Columns.begin();
+ auto ColsEnd = Group.Columns.end();
+
+ for (const LineNumberEntry &LN : Group.LineNumbers) {
+ uint64_t VA =
+ Session.getVAFromSectOffset(RelocSegment, RelocOffset + LN.Offset);
+ LineInfo Line(LN.Flags);
+ uint32_t ColNum = 0;
+
+ if (Lines.hasColumnInfo() && ColIt != ColsEnd) {
+ ColNum = ColIt->StartColumn;
+ ++ColIt;
+ }
+ Entries.push_back({VA, Line, ColNum, Group.NameIndex, false});
+ }
+
+ // Add a terminal entry line to mark the end of this subsection.
+ uint64_t VA = Session.getVAFromSectOffset(
+ RelocSegment, RelocOffset + Lines.header()->CodeSize);
+ LineInfo LastLine(Group.LineNumbers.back().Flags);
+ uint32_t ColNum =
+ (Lines.hasColumnInfo()) ? Group.Columns.back().StartColumn : 0;
+ Entries.push_back({VA, LastLine, ColNum, Group.NameIndex, true});
+
+ EntryList.push_back(Entries);
+ }
+ }
+
+ // Sort EntryList, and add flattened contents to the line table.
+ std::sort(EntryList.begin(), EntryList.end(),
+ [](const std::vector<LineTableEntry> &LHS,
+ const std::vector<LineTableEntry> &RHS) {
+ return LHS[0].Addr < RHS[0].Addr;
+ });
+ for (size_t I = 0; I < EntryList.size(); ++I)
+ ModuleLineTable.insert(ModuleLineTable.end(), EntryList[I].begin(),
+ EntryList[I].end());
+
+ return ModuleLineTable;
+}
+
+std::unique_ptr<IPDBEnumLineNumbers>
+SymbolCache::findLineNumbersByVA(uint64_t VA, uint32_t Length) const {
+ Optional<uint16_t> MaybeModi = getModuleIndexForAddr(VA);
+ if (!MaybeModi)
+ return nullptr;
+ uint16_t Modi = *MaybeModi;
+
+ std::vector<LineTableEntry> Lines = findLineTable(Modi);
+ if (Lines.empty())
+ return nullptr;
+
+  // Find the first line in the line table whose address is not less than the
+  // one we are searching for, skipping terminal entries at that exact address.
+ auto LineIter = llvm::partition_point(Lines, [&](const LineTableEntry &E) {
+ return (E.Addr < VA || (E.Addr == VA && E.IsTerminalEntry));
+ });
+
+ // Try to back up if we've gone too far.
+ if (LineIter == Lines.end() || LineIter->Addr > VA) {
+ if (LineIter == Lines.begin() || std::prev(LineIter)->IsTerminalEntry)
+ return nullptr;
+ --LineIter;
+ }
+
+ Expected<ModuleDebugStreamRef> ExpectedModS = getModuleDebugStream(Modi);
+ if (!ExpectedModS) {
+ consumeError(ExpectedModS.takeError());
+ return nullptr;
+ }
+ Expected<DebugChecksumsSubsectionRef> ExpectedChecksums =
+ ExpectedModS->findChecksumsSubsection();
+ if (!ExpectedChecksums) {
+ consumeError(ExpectedChecksums.takeError());
+ return nullptr;
+ }
+
+ // Populate a vector of NativeLineNumbers that have addresses in the given
+ // address range.
+ Optional<uint16_t> EndModi = getModuleIndexForAddr(VA + Length);
+ if (!EndModi)
+ return nullptr;
+ std::vector<NativeLineNumber> LineNumbers;
+ while (Modi <= *EndModi) {
+ // If we reached the end of the current module, increment Modi and get the
+ // new line table and checksums array.
+ if (LineIter == Lines.end()) {
+ ++Modi;
+
+ ExpectedModS = getModuleDebugStream(Modi);
+ if (!ExpectedModS) {
+ consumeError(ExpectedModS.takeError());
+ break;
+ }
+ ExpectedChecksums = ExpectedModS->findChecksumsSubsection();
+ if (!ExpectedChecksums) {
+ consumeError(ExpectedChecksums.takeError());
+ break;
+ }
+
+ Lines = findLineTable(Modi);
+ LineIter = Lines.begin();
+
+ if (Lines.empty())
+ continue;
+ }
+
+ if (LineIter->IsTerminalEntry) {
+ ++LineIter;
+ continue;
+ }
+
+ // If the line is still within the address range, create a NativeLineNumber
+ // and add to the list.
+ if (LineIter->Addr > VA + Length)
+ break;
+
+ uint32_t LineSect, LineOff;
+ Session.addressForVA(LineIter->Addr, LineSect, LineOff);
+ uint32_t LineLength = std::next(LineIter)->Addr - LineIter->Addr;
+ auto ChecksumIter =
+ ExpectedChecksums->getArray().at(LineIter->FileNameIndex);
+ uint32_t SrcFileId = getOrCreateSourceFile(*ChecksumIter);
+ NativeLineNumber LineNum(Session, LineIter->Line, LineIter->ColumnNumber,
+ LineSect, LineOff, LineLength, SrcFileId);
+ LineNumbers.push_back(LineNum);
+ ++LineIter;
+ }
+ return std::make_unique<NativeEnumLineNumbers>(std::move(LineNumbers));
+}
+
std::unique_ptr<PDBSymbolCompiland>
SymbolCache::getOrCreateCompiland(uint32_t Index) {
if (!Dbi)
@@ -297,3 +610,65 @@ SymbolCache::getOrCreateCompiland(uint32_t Index) {
return Session.getConcreteSymbolById<PDBSymbolCompiland>(Compilands[Index]);
}
+
+std::unique_ptr<IPDBSourceFile>
+SymbolCache::getSourceFileById(SymIndexId FileId) const {
+ assert(FileId < SourceFiles.size());
+
+ // Id 0 is reserved.
+ if (FileId == 0)
+ return nullptr;
+
+ return std::unique_ptr<NativeSourceFile>(
+ new NativeSourceFile(*SourceFiles[FileId].get()));
+}
+
+SymIndexId
+SymbolCache::getOrCreateSourceFile(const FileChecksumEntry &Checksums) const {
+ auto Iter = FileNameOffsetToId.find(Checksums.FileNameOffset);
+ if (Iter != FileNameOffsetToId.end())
+ return Iter->second;
+
+ SymIndexId Id = SourceFiles.size();
+ auto SrcFile = std::make_unique<NativeSourceFile>(Session, Id, Checksums);
+ SourceFiles.push_back(std::move(SrcFile));
+ FileNameOffsetToId[Checksums.FileNameOffset] = Id;
+ return Id;
+}
+
+void SymbolCache::parseSectionContribs() {
+ if (!Dbi)
+ return;
+
+ class Visitor : public ISectionContribVisitor {
+ NativeSession &Session;
+ IMap &AddrMap;
+
+ public:
+ Visitor(NativeSession &Session, IMap &AddrMap)
+ : Session(Session), AddrMap(AddrMap) {}
+ void visit(const SectionContrib &C) override {
+ if (C.Size == 0)
+ return;
+
+ uint64_t VA = Session.getVAFromSectOffset(C.ISect, C.Off);
+ uint64_t End = VA + C.Size;
+
+ // Ignore overlapping sections based on the assumption that a valid
+ // PDB file should not have overlaps.
+ if (!AddrMap.overlaps(VA, End))
+ AddrMap.insert(VA, End, C.Imod);
+ }
+ void visit(const SectionContrib2 &C) override { visit(C.Base); }
+ };
+
+ Visitor V(Session, AddrToModuleIndex);
+ Dbi->visitSectionContributions(V);
+}
+
+Optional<uint16_t> SymbolCache::getModuleIndexForAddr(uint64_t Addr) const {
+ auto Iter = AddrToModuleIndex.find(Addr);
+ if (Iter == AddrToModuleIndex.end())
+ return None;
+ return Iter.value();
+}
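
The hand-written loop in findPublicSymbolBySectOffset is a classic upper-bound search followed by a one-step back-up; the same idea with the standard library, over a simplified key type (the real code additionally deserializes each candidate from the symbol stream, since the address map only stores record offsets):

#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

using SegOffset = std::pair<uint32_t, uint32_t>; // (segment, offset)

// Returns the index of the last public symbol whose address is <= Addr, or
// -1 if every symbol starts after Addr. Addrs must be sorted ascending,
// which is what the publics address map guarantees.
int findCoveringPublic(const std::vector<SegOffset> &Addrs, SegOffset Addr) {
  auto It = std::upper_bound(Addrs.begin(), Addrs.end(), Addr);
  if (It == Addrs.begin())
    return -1; // Addr precedes every public symbol.
  return static_cast<int>(It - Addrs.begin()) - 1;
}
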
diff --git a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
index 4f10f8524a9b..51a1f0a544e3 100644
--- a/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
+++ b/llvm/lib/DebugInfo/PDB/Native/TpiStreamBuilder.cpp
@@ -44,6 +44,9 @@ void TpiStreamBuilder::setVersionHeader(PdbRaw_TpiVer Version) {
void TpiStreamBuilder::addTypeRecord(ArrayRef<uint8_t> Record,
Optional<uint32_t> Hash) {
// If we just crossed an 8KB threshold, add a type index offset.
+ assert(((Record.size() & 3) == 0) &&
+ "The type record's size is not a multiple of 4 bytes which will "
+ "cause misalignment in the output TPI stream!");
size_t NewSize = TypeRecordBytes + Record.size();
constexpr size_t EightKB = 8 * 1024;
if (NewSize / EightKB > TypeRecordBytes / EightKB || TypeRecords.empty()) {
@@ -153,8 +156,11 @@ Error TpiStreamBuilder::commit(const msf::MSFLayout &Layout,
return EC;
for (auto Rec : TypeRecords) {
- assert(!Rec.empty()); // An empty record will not write anything, but it
- // would shift all offsets from here on.
+ assert(!Rec.empty() && "Attempting to write an empty type record shifts "
+ "all offsets in the TPI stream!");
+ assert(((Rec.size() & 3) == 0) &&
+ "The type record's size is not a multiple of 4 bytes which will "
+ "cause misalignment in the output TPI stream!");
if (auto EC = Writer.writeBytes(Rec))
return EC;
}
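
The added assertions enforce that every serialized type record handed to the TPI stream is a multiple of 4 bytes long; a small sketch of the align-up step a producer would perform beforehand, assuming the usual CodeView LF_PAD convention for the filler bytes:

#include <cstdint>
#include <vector>

// Pads a serialized CodeView type record so its size becomes a multiple of
// 4, which is what the new TpiStreamBuilder assertions require. The pad
// bytes follow the LF_PAD convention: 0xF0 + (number of bytes remaining).
void padTypeRecordTo4(std::vector<uint8_t> &Record) {
  while (Record.size() % 4 != 0)
    Record.push_back(static_cast<uint8_t>(0xF0 + (4 - Record.size() % 4)));
}
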
diff --git a/llvm/lib/DebugInfo/PDB/PDB.cpp b/llvm/lib/DebugInfo/PDB/PDB.cpp
index e7b968cb7bea..e5b7731f6f4a 100644
--- a/llvm/lib/DebugInfo/PDB/PDB.cpp
+++ b/llvm/lib/DebugInfo/PDB/PDB.cpp
@@ -23,15 +23,8 @@ using namespace llvm::pdb;
Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
- if (Type == PDB_ReaderType::Native) {
- ErrorOr<std::unique_ptr<MemoryBuffer>> ErrorOrBuffer =
- MemoryBuffer::getFileOrSTDIN(Path, /*FileSize=*/-1,
- /*RequiresNullTerminator=*/false);
- if (!ErrorOrBuffer)
- return errorCodeToError(ErrorOrBuffer.getError());
-
- return NativeSession::createFromPdb(std::move(*ErrorOrBuffer), Session);
- }
+ if (Type == PDB_ReaderType::Native)
+ return NativeSession::createFromPdbPath(Path, Session);
#if LLVM_ENABLE_DIA_SDK
return DIASession::createFromPdb(Path, Session);
@@ -43,8 +36,12 @@ Error llvm::pdb::loadDataForPDB(PDB_ReaderType Type, StringRef Path,
Error llvm::pdb::loadDataForEXE(PDB_ReaderType Type, StringRef Path,
std::unique_ptr<IPDBSession> &Session) {
// Create the correct concrete instance type based on the value of Type.
- if (Type == PDB_ReaderType::Native)
- return NativeSession::createFromExe(Path, Session);
+ if (Type == PDB_ReaderType::Native) {
+ Expected<std::string> PdbPath = NativeSession::searchForPdb({Path});
+ if (!PdbPath)
+ return PdbPath.takeError();
+ return NativeSession::createFromPdbPath(PdbPath.get(), Session);
+ }
#if LLVM_ENABLE_DIA_SDK
return DIASession::createFromExe(Path, Session);
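
A hedged end-to-end sketch of the native path this change enables; the executable path and image base are placeholders, and only the error plumbing needed for the example is shown:

#include "llvm/ADT/StringRef.h"
#include "llvm/DebugInfo/PDB/IPDBSession.h"
#include "llvm/DebugInfo/PDB/PDB.h"
#include "llvm/Support/Error.h"
#include <memory>

using namespace llvm;
using namespace llvm::pdb;

// Locate the PDB next to the executable (or at its embedded path) and parse
// it with NativeSession instead of the DIA SDK.
Error openNativeSessionForExe(StringRef ExePath, uint64_t ImageBase,
                              std::unique_ptr<IPDBSession> &Session) {
  if (Error E = loadDataForEXE(PDB_ReaderType::Native, ExePath, Session))
    return E;
  // setLoadAddress() is now honored by the native session, so VA-based
  // queries such as findSymbolByAddress() work against ImageBase.
  Session->setLoadAddress(ImageBase);
  return Error::success();
}
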
diff --git a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
index 2f3a2500c293..10352237763c 100644
--- a/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/DIPrinter.cpp
@@ -73,12 +73,12 @@ void DIPrinter::print(const DILineInfo &Info, bool Inlined) {
std::string Filename = Info.FileName;
if (Filename == DILineInfo::BadString)
Filename = DILineInfo::Addr2LineBadString;
- else if (Basenames)
- Filename = llvm::sys::path::filename(Filename);
if (!Verbose) {
OS << Filename << ":" << Info.Line;
if (Style == OutputStyle::LLVM)
OS << ":" << Info.Column;
+ else if (Style == OutputStyle::GNU && Info.Discriminator != 0)
+ OS << " (discriminator " << Info.Discriminator << ")";
OS << "\n";
printContext(Filename, Info.Line);
return;
diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
index b4d49d9ff958..84524195fa8a 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.cpp
@@ -35,13 +35,7 @@ using namespace llvm;
using namespace object;
using namespace symbolize;
-static DILineInfoSpecifier
-getDILineInfoSpecifier(FunctionNameKind FNKind) {
- return DILineInfoSpecifier(
- DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath, FNKind);
-}
-
-ErrorOr<std::unique_ptr<SymbolizableObjectFile>>
+Expected<std::unique_ptr<SymbolizableObjectFile>>
SymbolizableObjectFile::create(const object::ObjectFile *Obj,
std::unique_ptr<DIContext> DICtx,
bool UntagAddresses) {
@@ -56,12 +50,12 @@ SymbolizableObjectFile::create(const object::ObjectFile *Obj,
for (section_iterator Section : Obj->sections()) {
Expected<StringRef> NameOrErr = Section->getName();
if (!NameOrErr)
- return errorToErrorCode(NameOrErr.takeError());
+ return NameOrErr.takeError();
if (*NameOrErr == ".opd") {
Expected<StringRef> E = Section->getContents();
if (!E)
- return errorToErrorCode(E.takeError());
+ return E.takeError();
OpdExtractor.reset(new DataExtractor(*E, Obj->isLittleEndian(),
Obj->getBytesInAddress()));
OpdAddress = Section->getAddress();
@@ -72,14 +66,16 @@ SymbolizableObjectFile::create(const object::ObjectFile *Obj,
std::vector<std::pair<SymbolRef, uint64_t>> Symbols =
computeSymbolSizes(*Obj);
for (auto &P : Symbols)
- res->addSymbol(P.first, P.second, OpdExtractor.get(), OpdAddress);
+ if (Error E =
+ res->addSymbol(P.first, P.second, OpdExtractor.get(), OpdAddress))
+ return std::move(E);
// If this is a COFF object and we didn't find any symbols, try the export
// table.
if (Symbols.empty()) {
if (auto *CoffObj = dyn_cast<COFFObjectFile>(Obj))
- if (auto EC = res->addCoffExportSymbols(CoffObj))
- return EC;
+ if (Error E = res->addCoffExportSymbols(CoffObj))
+ return std::move(E);
}
std::vector<std::pair<SymbolDesc, StringRef>> &Fs = res->Functions,
@@ -123,7 +119,7 @@ struct OffsetNamePair {
} // end anonymous namespace
-std::error_code SymbolizableObjectFile::addCoffExportSymbols(
+Error SymbolizableObjectFile::addCoffExportSymbols(
const COFFObjectFile *CoffObj) {
// Get all export names and offsets.
std::vector<OffsetNamePair> ExportSyms;
@@ -137,7 +133,7 @@ std::error_code SymbolizableObjectFile::addCoffExportSymbols(
ExportSyms.push_back(OffsetNamePair{Offset, Name});
}
if (ExportSyms.empty())
- return std::error_code();
+ return Error::success();
// Sort by ascending offset.
array_pod_sort(ExportSyms.begin(), ExportSyms.end());
@@ -154,27 +150,27 @@ std::error_code SymbolizableObjectFile::addCoffExportSymbols(
SymbolDesc SD = {SymbolStart, SymbolSize};
Functions.emplace_back(SD, Export.Name);
}
- return std::error_code();
+ return Error::success();
}
-std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
- uint64_t SymbolSize,
- DataExtractor *OpdExtractor,
- uint64_t OpdAddress) {
+Error SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
+ uint64_t SymbolSize,
+ DataExtractor *OpdExtractor,
+ uint64_t OpdAddress) {
// Avoid adding symbols from an unknown/undefined section.
const ObjectFile *Obj = Symbol.getObject();
Expected<section_iterator> Sec = Symbol.getSection();
if (!Sec || (Obj && Obj->section_end() == *Sec))
- return std::error_code();
+ return Error::success();
Expected<SymbolRef::Type> SymbolTypeOrErr = Symbol.getType();
if (!SymbolTypeOrErr)
- return errorToErrorCode(SymbolTypeOrErr.takeError());
+ return SymbolTypeOrErr.takeError();
SymbolRef::Type SymbolType = *SymbolTypeOrErr;
if (SymbolType != SymbolRef::ST_Function && SymbolType != SymbolRef::ST_Data)
- return std::error_code();
+ return Error::success();
Expected<uint64_t> SymbolAddressOrErr = Symbol.getAddress();
if (!SymbolAddressOrErr)
- return errorToErrorCode(SymbolAddressOrErr.takeError());
+ return SymbolAddressOrErr.takeError();
uint64_t SymbolAddress = *SymbolAddressOrErr;
if (UntagAddresses) {
// For kernel addresses, bits 56-63 need to be set, so we sign extend bit 55
@@ -194,7 +190,7 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
}
Expected<StringRef> SymbolNameOrErr = Symbol.getName();
if (!SymbolNameOrErr)
- return errorToErrorCode(SymbolNameOrErr.takeError());
+ return SymbolNameOrErr.takeError();
StringRef SymbolName = *SymbolNameOrErr;
// Mach-O symbol table names have leading underscore, skip it.
if (Module->isMachO() && !SymbolName.empty() && SymbolName[0] == '_')
@@ -204,7 +200,7 @@ std::error_code SymbolizableObjectFile::addSymbol(const SymbolRef &Symbol,
auto &M = SymbolType == SymbolRef::ST_Function ? Functions : Objects;
SymbolDesc SD = { SymbolAddress, SymbolSize };
M.emplace_back(SD, SymbolName);
- return std::error_code();
+ return Error::success();
}
// Return true if this is a 32-bit x86 PE COFF module.
@@ -251,16 +247,16 @@ bool SymbolizableObjectFile::shouldOverrideWithSymbolTable(
DILineInfo
SymbolizableObjectFile::symbolizeCode(object::SectionedAddress ModuleOffset,
- FunctionNameKind FNKind,
+ DILineInfoSpecifier LineInfoSpecifier,
bool UseSymbolTable) const {
if (ModuleOffset.SectionIndex == object::SectionedAddress::UndefSection)
ModuleOffset.SectionIndex =
getModuleSectionIndexForAddress(ModuleOffset.Address);
- DILineInfo LineInfo = DebugInfoContext->getLineInfoForAddress(
- ModuleOffset, getDILineInfoSpecifier(FNKind));
+ DILineInfo LineInfo =
+ DebugInfoContext->getLineInfoForAddress(ModuleOffset, LineInfoSpecifier);
// Override function name from symbol table if necessary.
- if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) {
+ if (shouldOverrideWithSymbolTable(LineInfoSpecifier.FNKind, UseSymbolTable)) {
std::string FunctionName;
uint64_t Start, Size;
if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset.Address,
@@ -272,20 +268,20 @@ SymbolizableObjectFile::symbolizeCode(object::SectionedAddress ModuleOffset,
}
DIInliningInfo SymbolizableObjectFile::symbolizeInlinedCode(
- object::SectionedAddress ModuleOffset, FunctionNameKind FNKind,
- bool UseSymbolTable) const {
+ object::SectionedAddress ModuleOffset,
+ DILineInfoSpecifier LineInfoSpecifier, bool UseSymbolTable) const {
if (ModuleOffset.SectionIndex == object::SectionedAddress::UndefSection)
ModuleOffset.SectionIndex =
getModuleSectionIndexForAddress(ModuleOffset.Address);
DIInliningInfo InlinedContext = DebugInfoContext->getInliningInfoForAddress(
- ModuleOffset, getDILineInfoSpecifier(FNKind));
+ ModuleOffset, LineInfoSpecifier);
// Make sure there is at least one frame in context.
if (InlinedContext.getNumberOfFrames() == 0)
InlinedContext.addFrame(DILineInfo());
// Override the function name in lower frame with name from symbol table.
- if (shouldOverrideWithSymbolTable(FNKind, UseSymbolTable)) {
+ if (shouldOverrideWithSymbolTable(LineInfoSpecifier.FNKind, UseSymbolTable)) {
std::string FunctionName;
uint64_t Start, Size;
if (getNameFromSymbolTable(SymbolRef::ST_Function, ModuleOffset.Address,
diff --git a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
index b5b9793a44d9..0ba304ee4c61 100644
--- a/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
+++ b/llvm/lib/DebugInfo/Symbolize/SymbolizableObjectFile.h
@@ -30,15 +30,15 @@ namespace symbolize {
class SymbolizableObjectFile : public SymbolizableModule {
public:
- static ErrorOr<std::unique_ptr<SymbolizableObjectFile>>
+ static Expected<std::unique_ptr<SymbolizableObjectFile>>
create(const object::ObjectFile *Obj, std::unique_ptr<DIContext> DICtx,
bool UntagAddresses);
DILineInfo symbolizeCode(object::SectionedAddress ModuleOffset,
- FunctionNameKind FNKind,
+ DILineInfoSpecifier LineInfoSpecifier,
bool UseSymbolTable) const override;
DIInliningInfo symbolizeInlinedCode(object::SectionedAddress ModuleOffset,
- FunctionNameKind FNKind,
+ DILineInfoSpecifier LineInfoSpecifier,
bool UseSymbolTable) const override;
DIGlobal symbolizeData(object::SectionedAddress ModuleOffset) const override;
std::vector<DILocal>
@@ -60,11 +60,10 @@ private:
uint64_t &Size) const;
// For big-endian PowerPC64 ELF, OpdAddress is the address of the .opd
// (function descriptor) section and OpdExtractor refers to its contents.
- std::error_code addSymbol(const object::SymbolRef &Symbol,
- uint64_t SymbolSize,
- DataExtractor *OpdExtractor = nullptr,
- uint64_t OpdAddress = 0);
- std::error_code addCoffExportSymbols(const object::COFFObjectFile *CoffObj);
+ Error addSymbol(const object::SymbolRef &Symbol, uint64_t SymbolSize,
+ DataExtractor *OpdExtractor = nullptr,
+ uint64_t OpdAddress = 0);
+ Error addCoffExportSymbols(const object::COFFObjectFile *CoffObj);
/// Search for the first occurrence of the specified Address in ObjectFile.
uint64_t getModuleSectionIndexForAddress(uint64_t Address) const;
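
The header changes above swap ErrorOr/std::error_code for Expected/Error; a generic sketch of that migration pattern, with illustrative names (readValue, Widget) that are not part of the patch:

#include "llvm/Support/Error.h"
#include <memory>

using namespace llvm;

struct Widget { int Value = 0; };

// Failures from callees are now forwarded with takeError() rather than
// flattened through errorToErrorCode().
static Expected<int> readValue(bool Ok) {
  if (!Ok)
    return createStringError(inconvertibleErrorCode(), "no value");
  return 42;
}

Expected<std::unique_ptr<Widget>> makeWidget(bool Ok) {
  auto W = std::make_unique<Widget>();
  Expected<int> V = readValue(Ok);
  if (!V)
    return V.takeError();
  W->Value = *V;
  return std::move(W);
}
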
diff --git a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
index 35e3ead6317b..1d767a2b0d88 100644
--- a/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
+++ b/llvm/lib/DebugInfo/Symbolize/Symbolize.cpp
@@ -51,8 +51,9 @@ LLVMSymbolizer::symbolizeCodeCommon(SymbolizableModule *Info,
if (Opts.RelativeAddresses)
ModuleOffset.Address += Info->getModulePreferredBase();
- DILineInfo LineInfo = Info->symbolizeCode(ModuleOffset, Opts.PrintFunctions,
- Opts.UseSymbolTable);
+ DILineInfo LineInfo = Info->symbolizeCode(
+ ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
+ Opts.UseSymbolTable);
if (Opts.Demangle)
LineInfo.FunctionName = DemangleName(LineInfo.FunctionName, Info);
return LineInfo;
@@ -66,8 +67,7 @@ LLVMSymbolizer::symbolizeCode(const ObjectFile &Obj,
if (I != Modules.end())
return symbolizeCodeCommon(I->second.get(), ModuleOffset);
- std::unique_ptr<DIContext> Context =
- DWARFContext::create(Obj, nullptr, DWARFContext::defaultErrorHandler);
+ std::unique_ptr<DIContext> Context = DWARFContext::create(Obj);
Expected<SymbolizableModule *> InfoOrErr =
createModuleInfo(&Obj, std::move(Context), ModuleName);
if (!InfoOrErr)
@@ -104,7 +104,8 @@ LLVMSymbolizer::symbolizeInlinedCode(const std::string &ModuleName,
ModuleOffset.Address += Info->getModulePreferredBase();
DIInliningInfo InlinedContext = Info->symbolizeInlinedCode(
- ModuleOffset, Opts.PrintFunctions, Opts.UseSymbolTable);
+ ModuleOffset, DILineInfoSpecifier(Opts.PathStyle, Opts.PrintFunctions),
+ Opts.UseSymbolTable);
if (Opts.Demangle) {
for (int i = 0, n = InlinedContext.getNumberOfFrames(); i < n; i++) {
auto *Frame = InlinedContext.getMutableFrame(i);
@@ -184,7 +185,7 @@ std::string getDarwinDWARFResourceForPath(
}
sys::path::append(ResourceName, "Contents", "Resources", "DWARF");
sys::path::append(ResourceName, Basename);
- return ResourceName.str();
+ return std::string(ResourceName.str());
}
bool checkFileCRC(StringRef Path, uint32_t CRCHash) {
@@ -205,14 +206,14 @@ bool findDebugBinary(const std::string &OrigPath,
// Try relative/path/to/original_binary/debuglink_name
llvm::sys::path::append(DebugPath, DebuglinkName);
if (checkFileCRC(DebugPath, CRCHash)) {
- Result = DebugPath.str();
+ Result = std::string(DebugPath.str());
return true;
}
// Try relative/path/to/original_binary/.debug/debuglink_name
DebugPath = OrigDir;
llvm::sys::path::append(DebugPath, ".debug", DebuglinkName);
if (checkFileCRC(DebugPath, CRCHash)) {
- Result = DebugPath.str();
+ Result = std::string(DebugPath.str());
return true;
}
// Make the path absolute so that lookups will go to
@@ -234,7 +235,7 @@ bool findDebugBinary(const std::string &OrigPath,
llvm::sys::path::append(DebugPath, llvm::sys::path::relative_path(OrigDir),
DebuglinkName);
if (checkFileCRC(DebugPath, CRCHash)) {
- Result = DebugPath.str();
+ Result = std::string(DebugPath.str());
return true;
}
return false;
@@ -300,6 +301,7 @@ Optional<ArrayRef<uint8_t>> getBuildID(const ELFFile<ELFT> *Obj) {
for (auto N : Obj->notes(P, Err))
if (N.getType() == ELF::NT_GNU_BUILD_ID && N.getName() == ELF::ELF_NOTE_GNU)
return N.getDesc();
+ consumeError(std::move(Err));
}
return {};
}
@@ -341,7 +343,7 @@ bool findDebugBinary(const std::vector<std::string> &DebugFileDirectory,
#endif
);
if (llvm::sys::fs::exists(Path)) {
- Result = Path.str();
+ Result = std::string(Path.str());
return true;
}
} else {
@@ -349,7 +351,7 @@ bool findDebugBinary(const std::vector<std::string> &DebugFileDirectory,
// Try <debug-file-directory>/.build-id/../...
SmallString<128> Path = getDebugPath(Directory);
if (llvm::sys::fs::exists(Path)) {
- Result = Path.str();
+ Result = std::string(Path.str());
return true;
}
}
@@ -365,9 +367,11 @@ ObjectFile *LLVMSymbolizer::lookUpDsymFile(const std::string &ExePath,
// resource directory.
std::vector<std::string> DsymPaths;
StringRef Filename = sys::path::filename(ExePath);
- DsymPaths.push_back(getDarwinDWARFResourceForPath(ExePath, Filename));
+ DsymPaths.push_back(
+ getDarwinDWARFResourceForPath(ExePath, std::string(Filename)));
for (const auto &Path : Opts.DsymHints) {
- DsymPaths.push_back(getDarwinDWARFResourceForPath(Path, Filename));
+ DsymPaths.push_back(
+ getDarwinDWARFResourceForPath(Path, std::string(Filename)));
}
for (const auto &Path : DsymPaths) {
auto DbgObjOrErr = getOrCreateObject(Path, ArchName);
@@ -508,11 +512,11 @@ LLVMSymbolizer::createModuleInfo(const ObjectFile *Obj,
std::unique_ptr<SymbolizableModule> SymMod;
if (InfoOrErr)
SymMod = std::move(*InfoOrErr);
- auto InsertResult =
- Modules.insert(std::make_pair(ModuleName, std::move(SymMod)));
+ auto InsertResult = Modules.insert(
+ std::make_pair(std::string(ModuleName), std::move(SymMod)));
assert(InsertResult.second);
- if (std::error_code EC = InfoOrErr.getError())
- return errorCodeToError(EC);
+ if (!InfoOrErr)
+ return InfoOrErr.takeError();
return InsertResult.first->second.get();
}
@@ -551,8 +555,11 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
if (!EC && DebugInfo != nullptr && !PDBFileName.empty()) {
using namespace pdb;
std::unique_ptr<IPDBSession> Session;
- if (auto Err = loadDataForEXE(PDB_ReaderType::DIA,
- Objects.first->getFileName(), Session)) {
+ PDB_ReaderType ReaderType = Opts.UseNativePDBReader
+ ? PDB_ReaderType::Native
+ : PDB_ReaderType::DIA;
+ if (auto Err = loadDataForEXE(ReaderType, Objects.first->getFileName(),
+ Session)) {
Modules.emplace(ModuleName, std::unique_ptr<SymbolizableModule>());
// Return along the PDB filename to provide more context
return createFileError(PDBFileName, std::move(Err));
@@ -561,9 +568,7 @@ LLVMSymbolizer::getOrCreateModuleInfo(const std::string &ModuleName) {
}
}
if (!Context)
- Context =
- DWARFContext::create(*Objects.second, nullptr,
- DWARFContext::defaultErrorHandler, Opts.DWPName);
+ Context = DWARFContext::create(*Objects.second, nullptr, Opts.DWPName);
return createModuleInfo(Objects.first, std::move(Context), ModuleName);
}
@@ -619,7 +624,7 @@ LLVMSymbolizer::DemangleName(const std::string &Name,
// Only do MSVC C++ demangling on symbols starting with '?'.
int status = 0;
char *DemangledName = microsoftDemangle(
- Name.c_str(), nullptr, nullptr, &status,
+ Name.c_str(), nullptr, nullptr, nullptr, &status,
MSDemangleFlags(MSDF_NoAccessSpecifier | MSDF_NoCallingConvention |
MSDF_NoMemberType | MSDF_NoReturnType));
if (status != 0)
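
A hedged sketch of driving the updated symbolizer; PathStyle and UseNativePDBReader are the option fields this patch threads through, and the module name and offset below are placeholders:

#include "llvm/DebugInfo/DIContext.h"
#include "llvm/DebugInfo/Symbolize/Symbolize.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;
using namespace llvm::symbolize;

void symbolizeOneAddress() {
  LLVMSymbolizer::Options Opts;
  Opts.PrintFunctions = DILineInfoSpecifier::FunctionNameKind::LinkageName;
  Opts.PathStyle = DILineInfoSpecifier::FileLineInfoKind::AbsoluteFilePath;
  Opts.UseNativePDBReader = true; // prefer the native reader over DIA

  LLVMSymbolizer Symbolizer(Opts);
  object::SectionedAddress Addr;
  Addr.Address = 0x1000; // placeholder module offset

  if (Expected<DILineInfo> Info = Symbolizer.symbolizeCode("a.out", Addr))
    outs() << Info->FunctionName << " at " << Info->FileName << ":"
           << Info->Line << "\n";
  else
    logAllUnhandledErrors(Info.takeError(), errs(), "symbolize: ");
}
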
diff --git a/llvm/lib/Demangle/Demangle.cpp b/llvm/lib/Demangle/Demangle.cpp
index 5f921537b9bd..71dafa0b2e43 100644
--- a/llvm/lib/Demangle/Demangle.cpp
+++ b/llvm/lib/Demangle/Demangle.cpp
@@ -24,8 +24,8 @@ std::string llvm::demangle(const std::string &MangledName) {
if (isItaniumEncoding(MangledName))
Demangled = itaniumDemangle(MangledName.c_str(), nullptr, nullptr, nullptr);
else
- Demangled =
- microsoftDemangle(MangledName.c_str(), nullptr, nullptr, nullptr);
+ Demangled = microsoftDemangle(MangledName.c_str(), nullptr, nullptr,
+ nullptr, nullptr);
if (!Demangled)
return MangledName;
diff --git a/llvm/lib/Demangle/ItaniumDemangle.cpp b/llvm/lib/Demangle/ItaniumDemangle.cpp
index e112d5c5ec77..fad9b6b7b63b 100644
--- a/llvm/lib/Demangle/ItaniumDemangle.cpp
+++ b/llvm/lib/Demangle/ItaniumDemangle.cpp
@@ -107,13 +107,11 @@ struct DumpVisitor {
// Overload used when T is exactly 'bool', not merely convertible to 'bool'.
void print(bool B) { printStr(B ? "true" : "false"); }
- template <class T>
- typename std::enable_if<std::is_unsigned<T>::value>::type print(T N) {
+ template <class T> std::enable_if_t<std::is_unsigned<T>::value> print(T N) {
fprintf(stderr, "%llu", (unsigned long long)N);
}
- template <class T>
- typename std::enable_if<std::is_signed<T>::value>::type print(T N) {
+ template <class T> std::enable_if_t<std::is_signed<T>::value> print(T N) {
fprintf(stderr, "%lld", (long long)N);
}
diff --git a/llvm/lib/Demangle/MicrosoftDemangle.cpp b/llvm/lib/Demangle/MicrosoftDemangle.cpp
index c681d6e25b87..16074314a84d 100644
--- a/llvm/lib/Demangle/MicrosoftDemangle.cpp
+++ b/llvm/lib/Demangle/MicrosoftDemangle.cpp
@@ -2334,14 +2334,16 @@ void Demangler::dumpBackReferences() {
std::printf("\n");
}
-char *llvm::microsoftDemangle(const char *MangledName, char *Buf, size_t *N,
+char *llvm::microsoftDemangle(const char *MangledName, size_t *NMangled,
+ char *Buf, size_t *N,
int *Status, MSDemangleFlags Flags) {
- int InternalStatus = demangle_success;
Demangler D;
OutputStream S;
StringView Name{MangledName};
SymbolNode *AST = D.parse(Name);
+ if (!D.Error && NMangled)
+ *NMangled = Name.begin() - MangledName;
if (Flags & MSDF_DumpBackrefs)
D.dumpBackReferences();
@@ -2356,6 +2358,7 @@ char *llvm::microsoftDemangle(const char *MangledName, char *Buf, size_t *N,
if (Flags & MSDF_NoMemberType)
OF = OutputFlags(OF | OF_NoMemberType);
+ int InternalStatus = demangle_success;
if (D.Error)
InternalStatus = demangle_invalid_mangled_name;
else if (!initializeOutputStream(Buf, N, S, 1024))
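
Callers pass the extra out-parameter right after the mangled name; a short usage sketch against the new signature (the mangled string is only an example):

#include "llvm/Demangle/Demangle.h"
#include <cstdio>
#include <cstdlib>

void demangleOne() {
  const char *Mangled = "?func@@YAHXZ"; // example MSVC-mangled name
  size_t NMangled = 0;                  // new: how many input chars were consumed
  int Status = 0;
  char *Demangled = llvm::microsoftDemangle(Mangled, &NMangled, /*Buf=*/nullptr,
                                            /*N=*/nullptr, &Status,
                                            llvm::MSDF_None);
  if (Status == llvm::demangle_success) {
    std::printf("%s (consumed %zu of input)\n", Demangled, NMangled);
    std::free(Demangled); // malloc'ed because Buf was nullptr
  }
}
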
diff --git a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
index ee7a7cb60bc9..d8bd671c6661 100644
--- a/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
+++ b/llvm/lib/ExecutionEngine/ExecutionEngine.cpp
@@ -9,6 +9,8 @@
// This file defines the common interface used by the various execution engine
// subclasses.
//
+// FIXME: This file needs to be updated to support scalable vectors
+//
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/ExecutionEngine.h"
@@ -108,7 +110,7 @@ public:
Type *ElTy = GV->getValueType();
size_t GVSize = (size_t)TD.getTypeAllocSize(ElTy);
void *RawMemory = ::operator new(
- alignTo(sizeof(GVMemoryBlock), TD.getPreferredAlignment(GV)) + GVSize);
+ alignTo(sizeof(GVMemoryBlock), TD.getPreferredAlign(GV)) + GVSize);
new(RawMemory) GVMemoryBlock(GV);
return static_cast<char*>(RawMemory) + sizeof(GVMemoryBlock);
}
@@ -200,7 +202,7 @@ std::string ExecutionEngine::getMangledName(const GlobalValue *GV) {
: GV->getParent()->getDataLayout();
Mangler::getNameWithPrefix(FullName, GV->getName(), DL);
- return FullName.str();
+ return std::string(FullName.str());
}
void ExecutionEngine::addGlobalMapping(const GlobalValue *GV, void *Addr) {
@@ -223,7 +225,7 @@ void ExecutionEngine::addGlobalMapping(StringRef Name, uint64_t Addr) {
std::string &V = EEState.getGlobalAddressReverseMap()[CurVal];
assert((!V.empty() || !Name.empty()) &&
"GlobalMapping already established!");
- V = Name;
+ V = std::string(Name);
}
}
@@ -269,7 +271,7 @@ uint64_t ExecutionEngine::updateGlobalMapping(StringRef Name, uint64_t Addr) {
std::string &V = EEState.getGlobalAddressReverseMap()[CurVal];
assert((!V.empty() || !Name.empty()) &&
"GlobalMapping already established!");
- V = Name;
+ V = std::string(Name);
}
return OldVal;
}
@@ -307,8 +309,8 @@ const GlobalValue *ExecutionEngine::getGlobalValueAtAddress(void *Addr) {
E = EEState.getGlobalAddressMap().end(); I != E; ++I) {
StringRef Name = I->first();
uint64_t Addr = I->second;
- EEState.getGlobalAddressReverseMap().insert(std::make_pair(
- Addr, Name));
+ EEState.getGlobalAddressReverseMap().insert(
+ std::make_pair(Addr, std::string(Name)));
}
}
@@ -582,7 +584,7 @@ void *ExecutionEngine::getPointerToGlobal(const GlobalValue *GV) {
// Global variable might have been added since interpreter started.
if (GlobalVariable *GVar =
const_cast<GlobalVariable *>(dyn_cast<GlobalVariable>(GV)))
- EmitGlobalVariable(GVar);
+ emitGlobalVariable(GVar);
else
llvm_unreachable("Global hasn't had an address allocated yet!");
@@ -624,17 +626,20 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
}
}
break;
- case Type::VectorTyID:
- // if the whole vector is 'undef' just reserve memory for the value.
- auto* VTy = cast<VectorType>(C->getType());
- Type *ElemTy = VTy->getElementType();
- unsigned int elemNum = VTy->getNumElements();
- Result.AggregateVal.resize(elemNum);
- if (ElemTy->isIntegerTy())
- for (unsigned int i = 0; i < elemNum; ++i)
- Result.AggregateVal[i].IntVal =
- APInt(ElemTy->getPrimitiveSizeInBits(), 0);
- break;
+ case Type::ScalableVectorTyID:
+ report_fatal_error(
+ "Scalable vector support not yet implemented in ExecutionEngine");
+ case Type::FixedVectorTyID:
+ // if the whole vector is 'undef' just reserve memory for the value.
+ auto *VTy = cast<FixedVectorType>(C->getType());
+ Type *ElemTy = VTy->getElementType();
+ unsigned int elemNum = VTy->getNumElements();
+ Result.AggregateVal.resize(elemNum);
+ if (ElemTy->isIntegerTy())
+ for (unsigned int i = 0; i < elemNum; ++i)
+ Result.AggregateVal[i].IntVal =
+ APInt(ElemTy->getPrimitiveSizeInBits(), 0);
+ break;
}
return Result;
}
@@ -914,7 +919,10 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
else
llvm_unreachable("Unknown constant pointer type!");
break;
- case Type::VectorTyID: {
+ case Type::ScalableVectorTyID:
+ report_fatal_error(
+ "Scalable vector support not yet implemented in ExecutionEngine");
+ case Type::FixedVectorTyID: {
unsigned elemNum;
Type* ElemTy;
const ConstantDataVector *CDV = dyn_cast<ConstantDataVector>(C);
@@ -925,9 +933,9 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
elemNum = CDV->getNumElements();
ElemTy = CDV->getElementType();
} else if (CV || CAZ) {
- auto* VTy = cast<VectorType>(C->getType());
- elemNum = VTy->getNumElements();
- ElemTy = VTy->getElementType();
+ auto *VTy = cast<FixedVectorType>(C->getType());
+ elemNum = VTy->getNumElements();
+ ElemTy = VTy->getElementType();
} else {
llvm_unreachable("Unknown constant vector type!");
}
@@ -1006,8 +1014,7 @@ GenericValue ExecutionEngine::getConstantValue(const Constant *C) {
break;
}
llvm_unreachable("Unknown constant pointer type!");
- }
- break;
+ } break;
default:
SmallString<256> Msg;
@@ -1046,7 +1053,8 @@ void ExecutionEngine::StoreValueToMemory(const GenericValue &Val,
*((PointerTy*)Ptr) = Val.PointerVal;
break;
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
for (unsigned i = 0; i < Val.AggregateVal.size(); ++i) {
if (cast<VectorType>(Ty)->getElementType()->isDoubleTy())
*(((double*)Ptr)+i) = Val.AggregateVal[i].DoubleVal;
@@ -1096,8 +1104,11 @@ void ExecutionEngine::LoadValueFromMemory(GenericValue &Result,
Result.IntVal = APInt(80, y);
break;
}
- case Type::VectorTyID: {
- auto *VT = cast<VectorType>(Ty);
+ case Type::ScalableVectorTyID:
+ report_fatal_error(
+ "Scalable vector support not yet implemented in ExecutionEngine");
+ case Type::FixedVectorTyID: {
+ auto *VT = cast<FixedVectorType>(Ty);
Type *ElemT = VT->getElementType();
const unsigned numElems = VT->getNumElements();
if (ElemT->isFloatTy()) {
@@ -1200,8 +1211,8 @@ void ExecutionEngine::emitGlobals() {
GV.hasAppendingLinkage() || !GV.hasName())
continue;// Ignore external globals and globals with internal linkage.
- const GlobalValue *&GVEntry =
- LinkedGlobalsMap[std::make_pair(GV.getName(), GV.getType())];
+ const GlobalValue *&GVEntry = LinkedGlobalsMap[std::make_pair(
+ std::string(GV.getName()), GV.getType())];
// If this is the first time we've seen this global, it is the canonical
// version.
@@ -1228,8 +1239,8 @@ void ExecutionEngine::emitGlobals() {
for (const auto &GV : M.globals()) {
// In the multi-module case, see what this global maps to.
if (!LinkedGlobalsMap.empty()) {
- if (const GlobalValue *GVEntry =
- LinkedGlobalsMap[std::make_pair(GV.getName(), GV.getType())]) {
+ if (const GlobalValue *GVEntry = LinkedGlobalsMap[std::make_pair(
+ std::string(GV.getName()), GV.getType())]) {
// If something else is the canonical global, ignore this one.
if (GVEntry != &GV) {
NonCanonicalGlobals.push_back(&GV);
@@ -1243,8 +1254,8 @@ void ExecutionEngine::emitGlobals() {
} else {
// External variable reference. Try to use the dynamic loader to
// get a pointer to it.
- if (void *SymAddr =
- sys::DynamicLibrary::SearchForAddressOfSymbol(GV.getName()))
+ if (void *SymAddr = sys::DynamicLibrary::SearchForAddressOfSymbol(
+ std::string(GV.getName())))
addGlobalMapping(&GV, SymAddr);
else {
report_fatal_error("Could not resolve external global address: "
@@ -1258,8 +1269,8 @@ void ExecutionEngine::emitGlobals() {
if (!NonCanonicalGlobals.empty()) {
for (unsigned i = 0, e = NonCanonicalGlobals.size(); i != e; ++i) {
const GlobalValue *GV = NonCanonicalGlobals[i];
- const GlobalValue *CGV =
- LinkedGlobalsMap[std::make_pair(GV->getName(), GV->getType())];
+ const GlobalValue *CGV = LinkedGlobalsMap[std::make_pair(
+ std::string(GV->getName()), GV->getType())];
void *Ptr = getPointerToGlobalIfAvailable(CGV);
assert(Ptr && "Canonical global wasn't codegen'd!");
addGlobalMapping(GV, Ptr);
@@ -1271,12 +1282,12 @@ void ExecutionEngine::emitGlobals() {
for (const auto &GV : M.globals()) {
if (!GV.isDeclaration()) {
if (!LinkedGlobalsMap.empty()) {
- if (const GlobalValue *GVEntry =
- LinkedGlobalsMap[std::make_pair(GV.getName(), GV.getType())])
+ if (const GlobalValue *GVEntry = LinkedGlobalsMap[std::make_pair(
+ std::string(GV.getName()), GV.getType())])
if (GVEntry != &GV) // Not the canonical variable.
continue;
}
- EmitGlobalVariable(&GV);
+ emitGlobalVariable(&GV);
}
}
}
@@ -1285,7 +1296,7 @@ void ExecutionEngine::emitGlobals() {
// EmitGlobalVariable - This method emits the specified global variable to the
// address specified in GlobalAddresses, or allocates new memory if it's not
// already in the map.
-void ExecutionEngine::EmitGlobalVariable(const GlobalVariable *GV) {
+void ExecutionEngine::emitGlobalVariable(const GlobalVariable *GV) {
void *GA = getPointerToGlobalIfAvailable(GV);
if (!GA) {
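
The ExecutionEngine.cpp hunks above all follow one pattern: the old single Type::VectorTyID case is split into Type::FixedVectorTyID (handled as before, via FixedVectorType) and Type::ScalableVectorTyID (rejected, since the element count is not known statically). A minimal standalone sketch of that dispatch, assuming only the LLVM IR type APIs already used in the patch; it is an illustration, not code from the change itself:

#include "llvm/IR/DerivedTypes.h"
#include "llvm/Support/ErrorHandling.h"

// Returns the number of lanes the interpreter can materialize for Ty.
static unsigned interpretableElementCount(llvm::Type *Ty) {
  switch (Ty->getTypeID()) {
  case llvm::Type::FixedVectorTyID:
    // Fixed-width vectors expose a compile-time element count.
    return llvm::cast<llvm::FixedVectorType>(Ty)->getNumElements();
  case llvm::Type::ScalableVectorTyID:
    // Scalable vectors have no fixed count, so bail out as the patch does.
    llvm::report_fatal_error(
        "Scalable vector support not yet implemented in ExecutionEngine");
  default:
    return 1; // Non-vector types are treated as a single value.
  }
}
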
diff --git a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
index ff1e8050c7e7..addec6871fa1 100644
--- a/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
+++ b/llvm/lib/ExecutionEngine/ExecutionEngineBindings.cpp
@@ -308,6 +308,18 @@ uint64_t LLVMGetFunctionAddress(LLVMExecutionEngineRef EE, const char *Name) {
return unwrap(EE)->getFunctionAddress(Name);
}
+LLVMBool LLVMExecutionEngineGetErrMsg(LLVMExecutionEngineRef EE,
+ char **OutError) {
+ assert(OutError && "OutError must be non-null");
+ auto *ExecEngine = unwrap(EE);
+ if (ExecEngine->hasError()) {
+ *OutError = strdup(ExecEngine->getErrorMessage().c_str());
+ ExecEngine->clearErrorMessage();
+ return true;
+ }
+ return false;
+}
+
/*===-- Operations on memory managers -------------------------------------===*/
namespace {
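
For context, this is how a client might consume the new C API entry point; a hypothetical sketch, assuming the declaration lands in llvm-c/ExecutionEngine.h and that the returned string (strdup'd above) is released with the usual LLVMDisposeMessage helper:

#include "llvm-c/Core.h"
#include "llvm-c/ExecutionEngine.h"
#include <stdio.h>

// Print and clear any error the execution engine has recorded.
static void drainEngineError(LLVMExecutionEngineRef EE) {
  char *ErrMsg = NULL;
  if (LLVMExecutionEngineGetErrMsg(EE, &ErrMsg)) {
    fprintf(stderr, "execution engine error: %s\n", ErrMsg);
    LLVMDisposeMessage(ErrMsg); // the message is heap-allocated for the caller
  }
}
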
diff --git a/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp b/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
index 51f31d3d5d8f..62e1ea6e0f0a 100644
--- a/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
+++ b/llvm/lib/ExecutionEngine/Interpreter/Execution.cpp
@@ -169,13 +169,14 @@ static void executeFRemInst(GenericValue &Dest, GenericValue Src1,
Dest.IntVal = APInt(1,Src1.IntVal.OP(Src2.IntVal)); \
break;
-#define IMPLEMENT_VECTOR_INTEGER_ICMP(OP, TY) \
- case Type::VectorTyID: { \
- assert(Src1.AggregateVal.size() == Src2.AggregateVal.size()); \
- Dest.AggregateVal.resize( Src1.AggregateVal.size() ); \
- for( uint32_t _i=0;_i<Src1.AggregateVal.size();_i++) \
- Dest.AggregateVal[_i].IntVal = APInt(1, \
- Src1.AggregateVal[_i].IntVal.OP(Src2.AggregateVal[_i].IntVal));\
+#define IMPLEMENT_VECTOR_INTEGER_ICMP(OP, TY) \
+ case Type::FixedVectorTyID: \
+ case Type::ScalableVectorTyID: { \
+ assert(Src1.AggregateVal.size() == Src2.AggregateVal.size()); \
+ Dest.AggregateVal.resize(Src1.AggregateVal.size()); \
+ for (uint32_t _i = 0; _i < Src1.AggregateVal.size(); _i++) \
+ Dest.AggregateVal[_i].IntVal = APInt( \
+ 1, Src1.AggregateVal[_i].IntVal.OP(Src2.AggregateVal[_i].IntVal)); \
} break;
// Handle pointers specially because they must be compared with only as much
@@ -367,12 +368,13 @@ void Interpreter::visitICmpInst(ICmpInst &I) {
Src1.AggregateVal[_i].TY##Val OP Src2.AggregateVal[_i].TY##Val);\
break;
-#define IMPLEMENT_VECTOR_FCMP(OP) \
- case Type::VectorTyID: \
- if (cast<VectorType>(Ty)->getElementType()->isFloatTy()) { \
- IMPLEMENT_VECTOR_FCMP_T(OP, Float); \
- } else { \
- IMPLEMENT_VECTOR_FCMP_T(OP, Double); \
+#define IMPLEMENT_VECTOR_FCMP(OP) \
+ case Type::FixedVectorTyID: \
+ case Type::ScalableVectorTyID: \
+ if (cast<VectorType>(Ty)->getElementType()->isFloatTy()) { \
+ IMPLEMENT_VECTOR_FCMP_T(OP, Float); \
+ } else { \
+ IMPLEMENT_VECTOR_FCMP_T(OP, Double); \
}
static GenericValue executeFCMP_OEQ(GenericValue Src1, GenericValue Src2,
@@ -902,13 +904,13 @@ void Interpreter::popStackAndReturnValueToCaller(Type *RetTy,
// If we have a previous stack frame, and we have a previous call,
// fill in the return value...
ExecutionContext &CallingSF = ECStack.back();
- if (Instruction *I = CallingSF.Caller.getInstruction()) {
+ if (CallingSF.Caller) {
// Save result...
- if (!CallingSF.Caller.getType()->isVoidTy())
- SetValue(I, Result, CallingSF);
- if (InvokeInst *II = dyn_cast<InvokeInst> (I))
+ if (!CallingSF.Caller->getType()->isVoidTy())
+ SetValue(CallingSF.Caller, Result, CallingSF);
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CallingSF.Caller))
SwitchToNewBasicBlock (II->getNormalDest (), CallingSF);
- CallingSF.Caller = CallSite(); // We returned from the call...
+ CallingSF.Caller = nullptr; // We returned from the call...
}
}
}
@@ -1113,64 +1115,59 @@ void Interpreter::visitStoreInst(StoreInst &I) {
// Miscellaneous Instruction Implementations
//===----------------------------------------------------------------------===//
-void Interpreter::visitCallSite(CallSite CS) {
+void Interpreter::visitVAStartInst(VAStartInst &I) {
ExecutionContext &SF = ECStack.back();
+ GenericValue ArgIndex;
+ ArgIndex.UIntPairVal.first = ECStack.size() - 1;
+ ArgIndex.UIntPairVal.second = 0;
+ SetValue(&I, ArgIndex, SF);
+}
- // Check to see if this is an intrinsic function call...
- Function *F = CS.getCalledFunction();
- if (F && F->isDeclaration())
- switch (F->getIntrinsicID()) {
- case Intrinsic::not_intrinsic:
- break;
- case Intrinsic::vastart: { // va_start
- GenericValue ArgIndex;
- ArgIndex.UIntPairVal.first = ECStack.size() - 1;
- ArgIndex.UIntPairVal.second = 0;
- SetValue(CS.getInstruction(), ArgIndex, SF);
- return;
- }
- case Intrinsic::vaend: // va_end is a noop for the interpreter
- return;
- case Intrinsic::vacopy: // va_copy: dest = src
- SetValue(CS.getInstruction(), getOperandValue(*CS.arg_begin(), SF), SF);
- return;
- default:
- // If it is an unknown intrinsic function, use the intrinsic lowering
- // class to transform it into hopefully tasty LLVM code.
- //
- BasicBlock::iterator me(CS.getInstruction());
- BasicBlock *Parent = CS.getInstruction()->getParent();
- bool atBegin(Parent->begin() == me);
- if (!atBegin)
- --me;
- IL->LowerIntrinsicCall(cast<CallInst>(CS.getInstruction()));
-
- // Restore the CurInst pointer to the first instruction newly inserted, if
- // any.
- if (atBegin) {
- SF.CurInst = Parent->begin();
- } else {
- SF.CurInst = me;
- ++SF.CurInst;
- }
- return;
- }
+void Interpreter::visitVAEndInst(VAEndInst &I) {
+ // va_end is a noop for the interpreter
+}
+
+void Interpreter::visitVACopyInst(VACopyInst &I) {
+ ExecutionContext &SF = ECStack.back();
+ SetValue(&I, getOperandValue(*I.arg_begin(), SF), SF);
+}
+
+void Interpreter::visitIntrinsicInst(IntrinsicInst &I) {
+ ExecutionContext &SF = ECStack.back();
+
+ // If it is an unknown intrinsic function, use the intrinsic lowering
+ // class to transform it into hopefully tasty LLVM code.
+ //
+ BasicBlock::iterator Me(&I);
+ BasicBlock *Parent = I.getParent();
+ bool atBegin(Parent->begin() == Me);
+ if (!atBegin)
+ --Me;
+ IL->LowerIntrinsicCall(&I);
+
+ // Restore the CurInst pointer to the first instruction newly inserted, if
+ // any.
+ if (atBegin) {
+ SF.CurInst = Parent->begin();
+ } else {
+ SF.CurInst = Me;
+ ++SF.CurInst;
+ }
+}
+void Interpreter::visitCallBase(CallBase &I) {
+ ExecutionContext &SF = ECStack.back();
- SF.Caller = CS;
+ SF.Caller = &I;
std::vector<GenericValue> ArgVals;
- const unsigned NumArgs = SF.Caller.arg_size();
+ const unsigned NumArgs = SF.Caller->arg_size();
ArgVals.reserve(NumArgs);
- uint16_t pNum = 1;
- for (CallSite::arg_iterator i = SF.Caller.arg_begin(),
- e = SF.Caller.arg_end(); i != e; ++i, ++pNum) {
- Value *V = *i;
+ for (Value *V : SF.Caller->args())
ArgVals.push_back(getOperandValue(V, SF));
- }
// To handle indirect calls, we must get the pointer value from the argument
// and treat it as a function pointer.
- GenericValue SRC = getOperandValue(SF.Caller.getCalledValue(), SF);
+ GenericValue SRC = getOperandValue(SF.Caller->getCalledOperand(), SF);
callFunction((Function*)GVTOP(SRC), ArgVals);
}
@@ -1332,7 +1329,7 @@ GenericValue Interpreter::executeFPTruncInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcVal->getType())) {
assert(SrcVal->getType()->getScalarType()->isDoubleTy() &&
DstTy->getScalarType()->isFloatTy() &&
"Invalid FPTrunc instruction");
@@ -1355,7 +1352,7 @@ GenericValue Interpreter::executeFPExtInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcVal->getType())) {
assert(SrcVal->getType()->getScalarType()->isFloatTy() &&
DstTy->getScalarType()->isDoubleTy() && "Invalid FPExt instruction");
@@ -1378,7 +1375,7 @@ GenericValue Interpreter::executeFPToUIInst(Value *SrcVal, Type *DstTy,
Type *SrcTy = SrcVal->getType();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if (SrcTy->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcTy)) {
Type *DstVecTy = DstTy->getScalarType();
Type *SrcVecTy = SrcTy->getScalarType();
uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
@@ -1416,7 +1413,7 @@ GenericValue Interpreter::executeFPToSIInst(Value *SrcVal, Type *DstTy,
Type *SrcTy = SrcVal->getType();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if (SrcTy->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcTy)) {
Type *DstVecTy = DstTy->getScalarType();
Type *SrcVecTy = SrcTy->getScalarType();
uint32_t DBitWidth = cast<IntegerType>(DstVecTy)->getBitWidth();
@@ -1452,7 +1449,7 @@ GenericValue Interpreter::executeUIToFPInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcVal->getType())) {
Type *DstVecTy = DstTy->getScalarType();
unsigned size = Src.AggregateVal.size();
// the sizes of src and dst vectors must be equal
@@ -1484,7 +1481,7 @@ GenericValue Interpreter::executeSIToFPInst(Value *SrcVal, Type *DstTy,
ExecutionContext &SF) {
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if (SrcVal->getType()->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcVal->getType())) {
Type *DstVecTy = DstTy->getScalarType();
unsigned size = Src.AggregateVal.size();
// the sizes of src and dst vectors must be equal
@@ -1545,8 +1542,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
Type *SrcTy = SrcVal->getType();
GenericValue Dest, Src = getOperandValue(SrcVal, SF);
- if ((SrcTy->getTypeID() == Type::VectorTyID) ||
- (DstTy->getTypeID() == Type::VectorTyID)) {
+ if (isa<VectorType>(SrcTy) || isa<VectorType>(DstTy)) {
// vector src bitcast to vector dst or vector src bitcast to scalar dst or
// scalar src bitcast to vector dst
bool isLittleEndian = getDataLayout().isLittleEndian();
@@ -1558,7 +1554,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
unsigned SrcNum;
unsigned DstNum;
- if (SrcTy->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(SrcTy)) {
SrcElemTy = SrcTy->getScalarType();
SrcBitSize = SrcTy->getScalarSizeInBits();
SrcNum = Src.AggregateVal.size();
@@ -1571,7 +1567,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
SrcVec.AggregateVal.push_back(Src);
}
- if (DstTy->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(DstTy)) {
DstElemTy = DstTy->getScalarType();
DstBitSize = DstTy->getScalarSizeInBits();
DstNum = (SrcNum * SrcBitSize) / DstBitSize;
@@ -1644,7 +1640,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
}
// convert result from integer to specified type
- if (DstTy->getTypeID() == Type::VectorTyID) {
+ if (isa<VectorType>(DstTy)) {
if (DstElemTy->isDoubleTy()) {
Dest.AggregateVal.resize(DstNum);
for (unsigned i = 0; i < DstNum; i++)
@@ -1667,8 +1663,7 @@ GenericValue Interpreter::executeBitCastInst(Value *SrcVal, Type *DstTy,
Dest.IntVal = TempDst.AggregateVal[0].IntVal;
}
}
- } else { // if ((SrcTy->getTypeID() == Type::VectorTyID) ||
- // (DstTy->getTypeID() == Type::VectorTyID))
+ } else { // if (isa<VectorType>(SrcTy) || isa<VectorType>(DstTy))
// scalar src bitcast to scalar dst
if (DstTy->isPointerTy()) {
@@ -1868,7 +1863,6 @@ void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
GenericValue Src1 = getOperandValue(I.getOperand(0), SF);
GenericValue Src2 = getOperandValue(I.getOperand(1), SF);
- GenericValue Src3 = getOperandValue(I.getOperand(2), SF);
GenericValue Dest;
// There is no need to check types of src1 and src2, because the compiled
@@ -1878,7 +1872,7 @@ void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
Type *TyContained = Ty->getElementType();
unsigned src1Size = (unsigned)Src1.AggregateVal.size();
unsigned src2Size = (unsigned)Src2.AggregateVal.size();
- unsigned src3Size = (unsigned)Src3.AggregateVal.size();
+ unsigned src3Size = I.getShuffleMask().size();
Dest.AggregateVal.resize(src3Size);
@@ -1888,7 +1882,7 @@ void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
break;
case Type::IntegerTyID:
for( unsigned i=0; i<src3Size; i++) {
- unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+ unsigned j = std::max(0, I.getMaskValue(i));
if(j < src1Size)
Dest.AggregateVal[i].IntVal = Src1.AggregateVal[j].IntVal;
else if(j < src1Size + src2Size)
@@ -1904,7 +1898,7 @@ void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
break;
case Type::FloatTyID:
for( unsigned i=0; i<src3Size; i++) {
- unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+ unsigned j = std::max(0, I.getMaskValue(i));
if(j < src1Size)
Dest.AggregateVal[i].FloatVal = Src1.AggregateVal[j].FloatVal;
else if(j < src1Size + src2Size)
@@ -1915,7 +1909,7 @@ void Interpreter::visitShuffleVectorInst(ShuffleVectorInst &I){
break;
case Type::DoubleTyID:
for( unsigned i=0; i<src3Size; i++) {
- unsigned j = Src3.AggregateVal[i].IntVal.getZExtValue();
+ unsigned j = std::max(0, I.getMaskValue(i));
if(j < src1Size)
Dest.AggregateVal[i].DoubleVal = Src1.AggregateVal[j].DoubleVal;
else if(j < src1Size + src2Size)
@@ -1960,7 +1954,8 @@ void Interpreter::visitExtractValueInst(ExtractValueInst &I) {
break;
case Type::ArrayTyID:
case Type::StructTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
Dest.AggregateVal = pSrc->AggregateVal;
break;
case Type::PointerTyID:
@@ -2007,7 +2002,8 @@ void Interpreter::visitInsertValueInst(InsertValueInst &I) {
break;
case Type::ArrayTyID:
case Type::StructTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
pDest->AggregateVal = Src2.AggregateVal;
break;
case Type::PointerTyID:
@@ -2121,8 +2117,8 @@ GenericValue Interpreter::getOperandValue(Value *V, ExecutionContext &SF) {
// callFunction - Execute the specified function...
//
void Interpreter::callFunction(Function *F, ArrayRef<GenericValue> ArgVals) {
- assert((ECStack.empty() || !ECStack.back().Caller.getInstruction() ||
- ECStack.back().Caller.arg_size() == ArgVals.size()) &&
+ assert((ECStack.empty() || !ECStack.back().Caller ||
+ ECStack.back().Caller->arg_size() == ArgVals.size()) &&
"Incorrect number of arguments passed into function call!");
// Make a new stack frame... and fill it in.
ECStack.emplace_back();
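
One detail worth noting from the shufflevector hunks above: the mask is no longer read from a third vector operand but from ShuffleVectorInst's mask accessors, and getMaskValue() reports undef lanes as -1, which the interpreter clamps to lane 0. A small illustrative sketch of that idiom, using the same accessors the patch relies on:

#include "llvm/IR/Instructions.h"

// Map a result lane to the source lane it reads from; undef lanes read lane 0.
static unsigned pickSourceLane(const llvm::ShuffleVectorInst &SVI,
                               unsigned Lane) {
  int MaskVal = SVI.getMaskValue(Lane); // -1 encodes an undef lane
  return MaskVal < 0 ? 0u : static_cast<unsigned>(MaskVal);
}
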
diff --git a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
index 71b7f893d712..cb1b35d62388 100644
--- a/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
+++ b/llvm/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp
@@ -274,7 +274,7 @@ GenericValue Interpreter::callExternalFunction(Function *F,
RawFunc RawFn;
if (RF == RawFunctions->end()) {
RawFn = (RawFunc)(intptr_t)
- sys::DynamicLibrary::SearchForAddressOfSymbol(F->getName());
+ sys::DynamicLibrary::SearchForAddressOfSymbol(std::string(F->getName()));
if (!RawFn)
RawFn = (RawFunc)(intptr_t)getPointerToGlobalIfAvailable(F);
if (RawFn != 0)
diff --git a/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h b/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
index e72d778317d6..fd7fa21df196 100644
--- a/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
+++ b/llvm/lib/ExecutionEngine/Interpreter/Interpreter.h
@@ -15,7 +15,6 @@
#include "llvm/ExecutionEngine/ExecutionEngine.h"
#include "llvm/ExecutionEngine/GenericValue.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstVisitor.h"
@@ -61,8 +60,8 @@ struct ExecutionContext {
Function *CurFunction;// The currently executing function
BasicBlock *CurBB; // The currently executing BB
BasicBlock::iterator CurInst; // The next instruction to execute
- CallSite Caller; // Holds the call that called subframes.
- // NULL if main func or debugger invoked fn
+ CallBase *Caller; // Holds the call that called subframes.
+ // NULL if main func or debugger invoked fn
std::map<Value *, GenericValue> Values; // LLVM values used in this invocation
std::vector<GenericValue> VarArgs; // Values passed through an ellipsis
AllocaHolder Allocas; // Track memory allocated by alloca
@@ -149,10 +148,11 @@ public:
void visitBitCastInst(BitCastInst &I);
void visitSelectInst(SelectInst &I);
-
- void visitCallSite(CallSite CS);
- void visitCallInst(CallInst &I) { visitCallSite (CallSite (&I)); }
- void visitInvokeInst(InvokeInst &I) { visitCallSite (CallSite (&I)); }
+ void visitVAStartInst(VAStartInst &I);
+ void visitVAEndInst(VAEndInst &I);
+ void visitVACopyInst(VACopyInst &I);
+ void visitIntrinsicInst(IntrinsicInst &I);
+ void visitCallBase(CallBase &I);
void visitUnreachableInst(UnreachableInst &I);
void visitShl(BinaryOperator &I);
diff --git a/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h b/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h
index b47a798c7603..82258a35a675 100644
--- a/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/BasicGOTAndStubsBuilder.h
@@ -15,6 +15,8 @@
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#define DEBUG_TYPE "jitlink"
+
namespace llvm {
namespace jitlink {
@@ -27,12 +29,25 @@ public:
// the newly added ones, so just copy the existing blocks out.
std::vector<Block *> Blocks(G.blocks().begin(), G.blocks().end());
+ LLVM_DEBUG(dbgs() << "Creating GOT entries and stubs:\n");
+
for (auto *B : Blocks)
for (auto &E : B->edges())
- if (impl().isGOTEdge(E))
+ if (impl().isGOTEdge(E)) {
+ LLVM_DEBUG({
+ dbgs() << " Updating GOT edge ";
+ printEdge(dbgs(), *B, E, "<target GOT>");
+ dbgs() << "\n";
+ });
impl().fixGOTEdge(E, getGOTEntrySymbol(E.getTarget()));
- else if (impl().isExternalBranchEdge(E))
+ } else if (impl().isExternalBranchEdge(E)) {
+ LLVM_DEBUG({
+ dbgs() << " Updating external branch edge ";
+ printEdge(dbgs(), *B, E, "<target PC-rel>");
+ dbgs() << "\n";
+ });
impl().fixExternalBranchEdge(E, getStubSymbol(E.getTarget()));
+ }
}
protected:
@@ -44,11 +59,17 @@ protected:
// Build the entry if it doesn't exist.
if (GOTEntryI == GOTEntries.end()) {
auto &GOTEntry = impl().createGOTEntry(Target);
+ LLVM_DEBUG({
+ dbgs() << " Created GOT entry for " << Target.getName() << ": "
+ << GOTEntry << "\n";
+ });
GOTEntryI =
GOTEntries.insert(std::make_pair(Target.getName(), &GOTEntry)).first;
}
assert(GOTEntryI != GOTEntries.end() && "Could not get GOT entry symbol");
+ LLVM_DEBUG(
+ { dbgs() << " Using GOT entry " << *GOTEntryI->second << "\n"; });
return *GOTEntryI->second;
}
@@ -59,10 +80,15 @@ protected:
if (StubI == Stubs.end()) {
auto &StubSymbol = impl().createStub(Target);
+ LLVM_DEBUG({
+ dbgs() << " Created stub for " << Target.getName() << ": "
+ << StubSymbol << "\n";
+ });
StubI = Stubs.insert(std::make_pair(Target.getName(), &StubSymbol)).first;
}
assert(StubI != Stubs.end() && "Count not get stub symbol");
+ LLVM_DEBUG({ dbgs() << " Using stub " << *StubI->second << "\n"; });
return *StubI->second;
}
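
The LLVM_DEBUG additions above follow the standard tracing pattern: the translation unit defines DEBUG_TYPE, and the output only appears in asserts-enabled builds when the tool is run with -debug-only=jitlink. A minimal sketch of that pattern, assuming nothing beyond the usual Support headers:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

#define DEBUG_TYPE "jitlink"

// Emits trace output only under -debug-only=jitlink in asserts builds.
static void traceStubCreated(llvm::StringRef TargetName) {
  LLVM_DEBUG(llvm::dbgs() << "Created stub for " << TargetName << "\n");
}
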
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
new file mode 100644
index 000000000000..6160583b13fe
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF.cpp
@@ -0,0 +1,51 @@
+//===-------------- ELF.cpp - JIT linker function for ELF -------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// ELF jit-link function.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/ELF.h"
+
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
+#include "llvm/Support/Endian.h"
+#include "llvm/Support/Format.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include <cstring>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "jitlink"
+
+namespace llvm {
+namespace jitlink {
+
+void jitLink_ELF(std::unique_ptr<JITLinkContext> Ctx) {
+
+ // We don't want to do full ELF validation here; we just verify it looks ELF-ish.
+ // Probably should parse into an ELF header when we support more than x86 :)
+
+ StringRef Data = Ctx->getObjectBuffer().getBuffer();
+ if (Data.size() < llvm::ELF::EI_MAG3 + 1) {
+ Ctx->notifyFailed(make_error<JITLinkError>("Truncated ELF buffer"));
+ return;
+ }
+
+ if (!memcmp(Data.data(), llvm::ELF::ElfMagic, strlen(llvm::ELF::ElfMagic))) {
+ if (Data.data()[llvm::ELF::EI_CLASS] == ELF::ELFCLASS64) {
+ return jitLink_ELF_x86_64(std::move(Ctx));
+ }
+ }
+
+ Ctx->notifyFailed(make_error<JITLinkError>("ELF magic not valid"));
+}
+
+} // end namespace jitlink
+} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
new file mode 100644
index 000000000000..505f03590b6b
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp
@@ -0,0 +1,463 @@
+//===---- ELF_x86_64.cpp -JIT linker implementation for ELF/x86-64 ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// ELF/x86-64 jit-link implementation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/JITLink/ELF_x86_64.h"
+#include "JITLinkGeneric.h"
+#include "llvm/ExecutionEngine/JITLink/JITLink.h"
+#include "llvm/Object/ELFObjectFile.h"
+
+#define DEBUG_TYPE "jitlink"
+
+using namespace llvm;
+using namespace llvm::jitlink;
+
+static const char *CommonSectionName = "__common";
+
+namespace llvm {
+namespace jitlink {
+
+// This should become a template, as ELFFile is one, so a lot of this could
+// become generic
+class ELFLinkGraphBuilder_x86_64 {
+
+private:
+ Section *CommonSection = nullptr;
+ // TODO hack to get this working
+ // Find a better way
+ using SymbolTable = object::ELFFile<object::ELF64LE>::Elf_Shdr;
+ // For now we just assume
+ std::map<int32_t, Symbol *> JITSymbolTable;
+
+ Section &getCommonSection() {
+ if (!CommonSection) {
+ auto Prot = static_cast<sys::Memory::ProtectionFlags>(
+ sys::Memory::MF_READ | sys::Memory::MF_WRITE);
+ CommonSection = &G->createSection(CommonSectionName, Prot);
+ }
+ return *CommonSection;
+ }
+
+ static Expected<ELF_x86_64_Edges::ELFX86RelocationKind>
+ getRelocationKind(const uint32_t Type) {
+ switch (Type) {
+ case ELF::R_X86_64_PC32:
+ return ELF_x86_64_Edges::ELFX86RelocationKind::PCRel32;
+ }
+ return make_error<JITLinkError>("Unsupported x86-64 relocation:" +
+ formatv("{0:d}", Type));
+ }
+
+ std::unique_ptr<LinkGraph> G;
+ // This could be a template
+ const object::ELFFile<object::ELF64LE> &Obj;
+ object::ELFFile<object::ELF64LE>::Elf_Shdr_Range sections;
+ SymbolTable SymTab;
+
+ bool isRelocatable() { return Obj.getHeader()->e_type == llvm::ELF::ET_REL; }
+
+ support::endianness
+ getEndianness(const object::ELFFile<object::ELF64LE> &Obj) {
+ return Obj.isLE() ? support::little : support::big;
+ }
+
+ // This could also just become part of a template
+ unsigned getPointerSize(const object::ELFFile<object::ELF64LE> &Obj) {
+ return Obj.getHeader()->getFileClass() == ELF::ELFCLASS64 ? 8 : 4;
+ }
+
+ // We don't technically need this right now,
+ // but for now we are going to keep it as it helps with debugging things.
+
+ Error createNormalizedSymbols() {
+ LLVM_DEBUG(dbgs() << "Creating normalized symbols...\n");
+
+ for (auto SecRef : sections) {
+ if (SecRef.sh_type != ELF::SHT_SYMTAB &&
+ SecRef.sh_type != ELF::SHT_DYNSYM)
+ continue;
+
+ auto Symbols = Obj.symbols(&SecRef);
+ // TODO: Currently I use this function to test things.
+ // I also want to leave it to see if it's common between MachO and ELF,
+ // so for now I just want to continue even if there is an error.
+ if (errorToBool(Symbols.takeError()))
+ continue;
+
+ auto StrTabSec = Obj.getSection(SecRef.sh_link);
+ if (!StrTabSec)
+ return StrTabSec.takeError();
+ auto StringTable = Obj.getStringTable(*StrTabSec);
+ if (!StringTable)
+ return StringTable.takeError();
+
+ for (auto SymRef : *Symbols) {
+ Optional<StringRef> Name;
+ uint64_t Size = 0;
+
+ // FIXME: Read size.
+ (void)Size;
+
+ if (auto NameOrErr = SymRef.getName(*StringTable))
+ Name = *NameOrErr;
+ else
+ return NameOrErr.takeError();
+
+ LLVM_DEBUG({
+ dbgs() << " ";
+ if (!Name)
+ dbgs() << "<anonymous symbol>";
+ else
+ dbgs() << *Name;
+ dbgs() << ": value = " << formatv("{0:x16}", SymRef.getValue())
+ << ", type = " << formatv("{0:x2}", SymRef.getType())
+ << ", binding = " << SymRef.getBinding()
+ << ", size =" << Size;
+ dbgs() << "\n";
+ });
+ }
+ }
+ return Error::success();
+ }
+
+ Error createNormalizedSections() {
+ LLVM_DEBUG(dbgs() << "Creating normalized sections...\n");
+ for (auto &SecRef : sections) {
+ auto Name = Obj.getSectionName(&SecRef);
+ if (!Name)
+ return Name.takeError();
+ sys::Memory::ProtectionFlags Prot;
+ if (SecRef.sh_flags & ELF::SHF_EXECINSTR) {
+ Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
+ sys::Memory::MF_EXEC);
+ } else {
+ Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
+ sys::Memory::MF_WRITE);
+ }
+ uint64_t Address = SecRef.sh_addr;
+ uint64_t Size = SecRef.sh_size;
+ uint64_t Flags = SecRef.sh_flags;
+ uint64_t Alignment = SecRef.sh_addralign;
+ const char *Data = nullptr;
+ // TODO: figure out what it is that has 0 size no name and address
+ // 0000-0000
+ if (Size == 0)
+ continue;
+
+ // FIXME: Use flags.
+ (void)Flags;
+
+ LLVM_DEBUG({
+ dbgs() << " " << *Name << ": " << formatv("{0:x16}", Address) << " -- "
+ << formatv("{0:x16}", Address + Size) << ", align: " << Alignment
+ << " Flags:" << Flags << "\n";
+ });
+
+ if (SecRef.sh_type != ELF::SHT_NOBITS) {
+ // .sections() already checks that the data is not beyond the end of
+ // file
+ auto contents = Obj.getSectionContentsAsArray<char>(&SecRef);
+ if (!contents)
+ return contents.takeError();
+
+ Data = contents->data();
+ // TODO protection flags.
+ // for now everything is
+ auto &section = G->createSection(*Name, Prot);
+ // Do this here because we have it, but move it into graphify later
+ G->createContentBlock(section, StringRef(Data, Size), Address,
+ Alignment, 0);
+ if (SecRef.sh_type == ELF::SHT_SYMTAB)
+ // TODO: Dynamic?
+ SymTab = SecRef;
+ }
+ }
+
+ return Error::success();
+ }
+
+ Error addRelocations() {
+ LLVM_DEBUG(dbgs() << "Adding relocations\n");
+ // TODO: a pattern is forming of iterating over some sections but only
+ // handling the ones we are interested in; that concept should be abstracted somewhere.
+ for (auto &SecRef : sections) {
+ if (SecRef.sh_type != ELF::SHT_RELA && SecRef.sh_type != ELF::SHT_REL)
+ continue;
+ // TODO can the elf obj file do this for me?
+ if (SecRef.sh_type == ELF::SHT_REL)
+ return make_error<llvm::StringError>("Shouldn't have REL in x64",
+ llvm::inconvertibleErrorCode());
+
+ auto RelSectName = Obj.getSectionName(&SecRef);
+ if (!RelSectName)
+ return RelSectName.takeError();
+ // Deal with .eh_frame later
+ if (*RelSectName == StringRef(".rela.eh_frame"))
+ continue;
+
+ auto UpdateSection = Obj.getSection(SecRef.sh_info);
+ if (!UpdateSection)
+ return UpdateSection.takeError();
+
+ auto UpdateSectionName = Obj.getSectionName(*UpdateSection);
+ if (!UpdateSectionName)
+ return UpdateSectionName.takeError();
+
+ auto JITSection = G->findSectionByName(*UpdateSectionName);
+ if (!JITSection)
+ return make_error<llvm::StringError>(
+ "Refencing a a section that wasn't added to graph" +
+ *UpdateSectionName,
+ llvm::inconvertibleErrorCode());
+
+ auto Relocations = Obj.relas(&SecRef);
+ if (!Relocations)
+ return Relocations.takeError();
+
+ for (const auto &Rela : *Relocations) {
+ auto Type = Rela.getType(false);
+
+ LLVM_DEBUG({
+ dbgs() << "Relocation Type: " << Type << "\n"
+ << "Name: " << Obj.getRelocationTypeName(Type) << "\n";
+ });
+
+ auto Symbol = Obj.getRelocationSymbol(&Rela, &SymTab);
+ if (!Symbol)
+ return Symbol.takeError();
+
+ auto BlockToFix = *(JITSection->blocks().begin());
+ auto TargetSymbol = JITSymbolTable[(*Symbol)->st_shndx];
+ uint64_t Addend = Rela.r_addend;
+ JITTargetAddress FixupAddress =
+ (*UpdateSection)->sh_addr + Rela.r_offset;
+
+ LLVM_DEBUG({
+ dbgs() << "Processing relocation at "
+ << format("0x%016" PRIx64, FixupAddress) << "\n";
+ });
+ auto Kind = getRelocationKind(Type);
+ if (!Kind)
+ return Kind.takeError();
+
+ LLVM_DEBUG({
+ Edge GE(*Kind, FixupAddress - BlockToFix->getAddress(), *TargetSymbol,
+ Addend);
+ // TODO a mapping of KIND => type then call getRelocationTypeName4
+ printEdge(dbgs(), *BlockToFix, GE, StringRef(""));
+ dbgs() << "\n";
+ });
+ BlockToFix->addEdge(*Kind, FixupAddress - BlockToFix->getAddress(),
+ *TargetSymbol, Addend);
+ }
+ }
+ return Error::success();
+ }
+
+ Error graphifyRegularSymbols() {
+
+ // TODO: ELF supports beyond SHN_LORESERVE,
+ // need to perf test how a vector vs map handles those cases
+
+ std::vector<std::vector<object::ELFFile<object::ELF64LE>::Elf_Shdr_Range *>>
+ SecIndexToSymbols;
+
+ LLVM_DEBUG(dbgs() << "Creating graph symbols...\n");
+
+ for (auto SecRef : sections) {
+
+ if (SecRef.sh_type != ELF::SHT_SYMTAB &&
+ SecRef.sh_type != ELF::SHT_DYNSYM)
+ continue;
+ auto Symbols = Obj.symbols(&SecRef);
+ if (!Symbols)
+ return Symbols.takeError();
+
+ auto StrTabSec = Obj.getSection(SecRef.sh_link);
+ if (!StrTabSec)
+ return StrTabSec.takeError();
+ auto StringTable = Obj.getStringTable(*StrTabSec);
+ if (!StringTable)
+ return StringTable.takeError();
+ auto Name = Obj.getSectionName(&SecRef);
+ if (!Name)
+ return Name.takeError();
+ auto Section = G->findSectionByName(*Name);
+ if (!Section)
+ return make_error<llvm::StringError>("Could not find a section",
+ llvm::inconvertibleErrorCode());
+ // we only have one for now
+ auto blocks = Section->blocks();
+ if (blocks.empty())
+ return make_error<llvm::StringError>("Section has no block",
+ llvm::inconvertibleErrorCode());
+
+ for (auto SymRef : *Symbols) {
+ auto Type = SymRef.getType();
+ if (Type == ELF::STT_NOTYPE || Type == ELF::STT_FILE)
+ continue;
+ // these should do it for now
+ // if(Type != ELF::STT_NOTYPE &&
+ // Type != ELF::STT_OBJECT &&
+ // Type != ELF::STT_FUNC &&
+ // Type != ELF::STT_SECTION &&
+ // Type != ELF::STT_COMMON) {
+ // continue;
+ // }
+ std::pair<Linkage, Scope> bindings;
+ auto Name = SymRef.getName(*StringTable);
+ // I am not sure if this is going to hold as an invariant. Revisit.
+ if (!Name)
+ return Name.takeError();
+ // TODO: weak and hidden
+ if (SymRef.isExternal())
+ bindings = {Linkage::Strong, Scope::Default};
+ else
+ bindings = {Linkage::Strong, Scope::Local};
+
+ if (SymRef.isDefined() &&
+ (Type == ELF::STT_FUNC || Type == ELF::STT_OBJECT)) {
+
+ auto DefinedSection = Obj.getSection(SymRef.st_shndx);
+ if (!DefinedSection)
+ return DefinedSection.takeError();
+ auto sectName = Obj.getSectionName(*DefinedSection);
+ if (!sectName)
+ return Name.takeError();
+
+ auto JitSection = G->findSectionByName(*sectName);
+ if (!JitSection)
+ return make_error<llvm::StringError>(
+ "Could not find a section", llvm::inconvertibleErrorCode());
+ auto bs = JitSection->blocks();
+ if (bs.empty())
+ return make_error<llvm::StringError>(
+ "Section has no block", llvm::inconvertibleErrorCode());
+
+ auto B = *bs.begin();
+ LLVM_DEBUG({ dbgs() << " " << *Name << ": "; });
+
+ auto &S = G->addDefinedSymbol(
+ *B, SymRef.getValue(), *Name, SymRef.st_size, bindings.first,
+ bindings.second, SymRef.getType() == ELF::STT_FUNC, false);
+ JITSymbolTable[SymRef.st_shndx] = &S;
+ }
+ // TODO: The following has to be implemented.
+ // Leaving it commented out to save time for future patches.
+ /*
+ G->addAbsoluteSymbol(*Name, SymRef.getValue(), SymRef.st_size,
+ Linkage::Strong, Scope::Default, false);
+
+ if(SymRef.isCommon()) {
+ G->addCommonSymbol(*Name, Scope::Default, getCommonSection(), 0, 0,
+ SymRef.getValue(), false);
+ }
+
+
+ //G->addExternalSymbol(*Name, SymRef.st_size, Linkage::Strong);
+ */
+ }
+ }
+ return Error::success();
+ }
+
+public:
+ ELFLinkGraphBuilder_x86_64(std::string filename,
+ const object::ELFFile<object::ELF64LE> &Obj)
+ : G(std::make_unique<LinkGraph>(filename, getPointerSize(Obj),
+ getEndianness(Obj))),
+ Obj(Obj) {}
+
+ Expected<std::unique_ptr<LinkGraph>> buildGraph() {
+ // Sanity check: we only operate on relocatable objects.
+ if (!isRelocatable())
+ return make_error<JITLinkError>("Object is not a relocatable ELF");
+
+ auto Secs = Obj.sections();
+
+ if (!Secs) {
+ return Secs.takeError();
+ }
+ sections = *Secs;
+
+ if (auto Err = createNormalizedSections())
+ return std::move(Err);
+
+ if (auto Err = createNormalizedSymbols())
+ return std::move(Err);
+
+ if (auto Err = graphifyRegularSymbols())
+ return std::move(Err);
+
+ if (auto Err = addRelocations())
+ return std::move(Err);
+
+ return std::move(G);
+ }
+};
+
+class ELFJITLinker_x86_64 : public JITLinker<ELFJITLinker_x86_64> {
+ friend class JITLinker<ELFJITLinker_x86_64>;
+
+public:
+ ELFJITLinker_x86_64(std::unique_ptr<JITLinkContext> Ctx,
+ PassConfiguration PassConfig)
+ : JITLinker(std::move(Ctx), std::move(PassConfig)) {}
+
+private:
+ StringRef getEdgeKindName(Edge::Kind R) const override { return StringRef(); }
+
+ Expected<std::unique_ptr<LinkGraph>>
+ buildGraph(MemoryBufferRef ObjBuffer) override {
+ auto ELFObj = object::ObjectFile::createELFObjectFile(ObjBuffer);
+ if (!ELFObj)
+ return ELFObj.takeError();
+
+ auto &ELFObjFile = cast<object::ELFObjectFile<object::ELF64LE>>(**ELFObj);
+ std::string fileName(ELFObj->get()->getFileName());
+ return ELFLinkGraphBuilder_x86_64(std::move(fileName),
+ *ELFObjFile.getELFFile())
+ .buildGraph();
+ }
+
+ Error applyFixup(Block &B, const Edge &E, char *BlockWorkingMem) const {
+ using namespace ELF_x86_64_Edges;
+ char *FixupPtr = BlockWorkingMem + E.getOffset();
+ JITTargetAddress FixupAddress = B.getAddress() + E.getOffset();
+ switch (E.getKind()) {
+
+ case ELFX86RelocationKind::PCRel32:
+ int64_t Value = E.getTarget().getAddress() + E.getAddend() - FixupAddress;
+ // verify
+ *(support::little32_t *)FixupPtr = Value;
+ break;
+ }
+ return Error::success();
+ }
+};
+
+void jitLink_ELF_x86_64(std::unique_ptr<JITLinkContext> Ctx) {
+ PassConfiguration Config;
+ Triple TT("x86_64-linux");
+ // Construct a JITLinker and run the link function.
+ // Add a mark-live pass.
+ if (auto MarkLive = Ctx->getMarkLivePass(TT))
+ Config.PrePrunePasses.push_back(std::move(MarkLive));
+ else
+ Config.PrePrunePasses.push_back(markAllSymbolsLive);
+
+ if (auto Err = Ctx->modifyPassConfig(TT, Config))
+ return Ctx->notifyFailed(std::move(Err));
+
+ ELFJITLinker_x86_64::link(std::move(Ctx), std::move(Config));
+}
+} // end namespace jitlink
+} // end namespace llvm
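
applyFixup above implements the standard R_X86_64_PC32 rule: the stored 32-bit value is S + A - P (target address plus addend minus the address of the fixup itself). A tiny worked sketch with made-up numbers, only to illustrate the arithmetic:

#include <cstdint>

// S + A - P for a PC-relative 32-bit relocation.
static int64_t pcrel32Value(uint64_t TargetAddr, int64_t Addend,
                            uint64_t FixupAddr) {
  return static_cast<int64_t>(TargetAddr) + Addend -
         static_cast<int64_t>(FixupAddr);
}

// Example: target 0x2000, addend -4, fixup at 0x1000 => 0xFFC gets stored.
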
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
index 6c924f889577..5105ec495148 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLink.cpp
@@ -10,6 +10,7 @@
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "llvm/BinaryFormat/Magic.h"
+#include "llvm/ExecutionEngine/JITLink/ELF.h"
#include "llvm/ExecutionEngine/JITLink/MachO.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/ManagedStatic.h"
@@ -180,18 +181,14 @@ Block &LinkGraph::splitBlock(Block &B, size_t SplitIndex,
// Copy edges to NewBlock (recording their iterators so that we can remove
// them from B), and update of Edges remaining on B.
std::vector<Block::edge_iterator> EdgesToRemove;
- for (auto I = B.edges().begin(), E = B.edges().end(); I != E; ++I) {
+ for (auto I = B.edges().begin(); I != B.edges().end();) {
if (I->getOffset() < SplitIndex) {
NewBlock.addEdge(*I);
- EdgesToRemove.push_back(I);
- } else
+ I = B.removeEdge(I);
+ } else {
I->setOffset(I->getOffset() - SplitIndex);
- }
-
- // Remove edges that were transfered to NewBlock from B.
- while (!EdgesToRemove.empty()) {
- B.removeEdge(EdgesToRemove.back());
- EdgesToRemove.pop_back();
+ ++I;
+ }
}
}
@@ -304,6 +301,8 @@ void jitLink(std::unique_ptr<JITLinkContext> Ctx) {
switch (Magic) {
case file_magic::macho_object:
return jitLink_MachO(std::move(Ctx));
+ case file_magic::elf_relocatable:
+ return jitLink_ELF(std::move(Ctx));
default:
Ctx->notifyFailed(make_error<JITLinkError>("Unsupported file format"));
};
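
The splitBlock change above swaps the deferred-removal vector for the usual erase-while-iterating idiom: removeEdge(I) hands back the iterator following the removed edge, so each trip through the loop advances either by removal or by ++I, never both. A generic sketch of the same idiom over a std::list, purely for illustration:

#include <list>

// Remove every element matching ShouldRemove without invalidating the loop iterator.
template <typename Pred>
static void removeMatching(std::list<int> &Values, Pred ShouldRemove) {
  for (auto I = Values.begin(); I != Values.end();) {
    if (ShouldRemove(*I))
      I = Values.erase(I); // erase returns the iterator after the removed item
    else
      ++I;
  }
}
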
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
index 7b594fd2c0ea..1d76a49939dc 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.cpp
@@ -24,6 +24,8 @@ JITLinkerBase::~JITLinkerBase() {}
void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
+ LLVM_DEBUG({ dbgs() << "Building jitlink graph for new input...\n"; });
+
// Build the link graph.
if (auto GraphOrErr = buildGraph(Ctx->getObjectBuffer()))
G = std::move(*GraphOrErr);
@@ -31,6 +33,10 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
return Ctx->notifyFailed(GraphOrErr.takeError());
assert(G && "Graph should have been created by buildGraph above");
+ LLVM_DEBUG({
+ dbgs() << "Starting link phase 1 for graph " << G->getName() << "\n";
+ });
+
// Prune and optimize the graph.
if (auto Err = runPasses(Passes.PrePrunePasses))
return Ctx->notifyFailed(std::move(Err));
@@ -59,10 +65,17 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
return Ctx->notifyFailed(std::move(Err));
// Notify client that the defined symbols have been assigned addresses.
+ LLVM_DEBUG(
+ { dbgs() << "Resolving symbols defined in " << G->getName() << "\n"; });
Ctx->notifyResolved(*G);
auto ExternalSymbols = getExternalSymbolNames();
+ LLVM_DEBUG({
+ dbgs() << "Issuing lookup for external symbols for " << G->getName()
+ << " (may trigger materialization/linking of other graphs)...\n";
+ });
+
// We're about to hand off ownership of ourself to the continuation. Grab a
// pointer to the context so that we can call it to initiate the lookup.
//
@@ -87,6 +100,11 @@ void JITLinkerBase::linkPhase1(std::unique_ptr<JITLinkerBase> Self) {
void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
Expected<AsyncLookupResult> LR,
SegmentLayoutMap Layout) {
+
+ LLVM_DEBUG({
+ dbgs() << "Starting link phase 2 for graph " << G->getName() << "\n";
+ });
+
// If the lookup failed, bail out.
if (!LR)
return deallocateAndBailOut(LR.takeError());
@@ -94,13 +112,25 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
// Assign addresses to external addressables.
applyLookupResult(*LR);
+ // Copy block content to working memory.
+ copyBlockContentToWorkingMemory(Layout, *Alloc);
+
+ LLVM_DEBUG({
+ dbgs() << "Link graph \"" << G->getName()
+ << "\" before post-allocation passes:\n";
+ dumpGraph(dbgs());
+ });
+
+ if (auto Err = runPasses(Passes.PostAllocationPasses))
+ return deallocateAndBailOut(std::move(Err));
+
LLVM_DEBUG({
dbgs() << "Link graph \"" << G->getName() << "\" before copy-and-fixup:\n";
dumpGraph(dbgs());
});
- // Copy block content to working memory and fix up.
- if (auto Err = copyAndFixUpBlocks(Layout, *Alloc))
+ // Fix up block content.
+ if (auto Err = fixUpBlocks(*G))
return deallocateAndBailOut(std::move(Err));
LLVM_DEBUG({
@@ -122,9 +152,16 @@ void JITLinkerBase::linkPhase2(std::unique_ptr<JITLinkerBase> Self,
}
void JITLinkerBase::linkPhase3(std::unique_ptr<JITLinkerBase> Self, Error Err) {
+
+ LLVM_DEBUG({
+ dbgs() << "Starting link phase 3 for graph " << G->getName() << "\n";
+ });
+
if (Err)
return deallocateAndBailOut(std::move(Err));
Ctx->notifyFinalized(std::move(Alloc));
+
+ LLVM_DEBUG({ dbgs() << "Link of graph " << G->getName() << " complete\n"; });
}
Error JITLinkerBase::runPasses(LinkGraphPassList &Passes) {
@@ -165,7 +202,7 @@ JITLinkerBase::SegmentLayoutMap JITLinkerBase::layOutBlocks() {
}
LLVM_DEBUG({
- dbgs() << "Segment ordering:\n";
+ dbgs() << "Computed segment ordering:\n";
for (auto &KV : Layout) {
dbgs() << " Segment "
<< static_cast<sys::Memory::ProtectionFlags>(KV.first) << ":\n";
@@ -230,11 +267,12 @@ Error JITLinkerBase::allocateSegments(const SegmentLayoutMap &Layout) {
return AllocOrErr.takeError();
LLVM_DEBUG({
- dbgs() << "JIT linker got working memory:\n";
+ dbgs() << "JIT linker got memory (working -> target):\n";
for (auto &KV : Layout) {
auto Prot = static_cast<sys::Memory::ProtectionFlags>(KV.first);
dbgs() << " " << Prot << ": "
- << (const void *)Alloc->getWorkingMemory(Prot).data() << "\n";
+ << (const void *)Alloc->getWorkingMemory(Prot).data() << " -> "
+ << formatv("{0:x16}", Alloc->getTargetMemory(Prot)) << "\n";
}
});
@@ -302,6 +340,77 @@ void JITLinkerBase::applyLookupResult(AsyncLookupResult Result) {
"All strong external symbols should have been resolved by now");
}
+void JITLinkerBase::copyBlockContentToWorkingMemory(
+ const SegmentLayoutMap &Layout, JITLinkMemoryManager::Allocation &Alloc) {
+
+ LLVM_DEBUG(dbgs() << "Copying block content:\n");
+ for (auto &KV : Layout) {
+ auto &Prot = KV.first;
+ auto &SegLayout = KV.second;
+
+ auto SegMem =
+ Alloc.getWorkingMemory(static_cast<sys::Memory::ProtectionFlags>(Prot));
+ char *LastBlockEnd = SegMem.data();
+ char *BlockDataPtr = LastBlockEnd;
+
+ LLVM_DEBUG({
+ dbgs() << " Processing segment "
+ << static_cast<sys::Memory::ProtectionFlags>(Prot) << " [ "
+ << (const void *)SegMem.data() << " .. "
+ << (const void *)((char *)SegMem.data() + SegMem.size())
+ << " ]\n Processing content sections:\n";
+ });
+
+ for (auto *B : SegLayout.ContentBlocks) {
+ LLVM_DEBUG(dbgs() << " " << *B << ":\n");
+
+ // Pad to alignment/alignment-offset.
+ BlockDataPtr = alignToBlock(BlockDataPtr, *B);
+
+ LLVM_DEBUG({
+ dbgs() << " Bumped block pointer to " << (const void *)BlockDataPtr
+ << " to meet block alignment " << B->getAlignment()
+ << " and alignment offset " << B->getAlignmentOffset() << "\n";
+ });
+
+ // Zero pad up to alignment.
+ LLVM_DEBUG({
+ if (LastBlockEnd != BlockDataPtr)
+ dbgs() << " Zero padding from " << (const void *)LastBlockEnd
+ << " to " << (const void *)BlockDataPtr << "\n";
+ });
+
+ while (LastBlockEnd != BlockDataPtr)
+ *LastBlockEnd++ = 0;
+
+ // Copy initial block content.
+ LLVM_DEBUG({
+ dbgs() << " Copying block " << *B << " content, "
+ << B->getContent().size() << " bytes, from "
+ << (const void *)B->getContent().data() << " to "
+ << (const void *)BlockDataPtr << "\n";
+ });
+ memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size());
+
+ // Point the block's content to the fixed up buffer.
+ B->setContent(StringRef(BlockDataPtr, B->getContent().size()));
+
+ // Update block end pointer.
+ LastBlockEnd = BlockDataPtr + B->getContent().size();
+ BlockDataPtr = LastBlockEnd;
+ }
+
+ // Zero pad the rest of the segment.
+ LLVM_DEBUG({
+ dbgs() << " Zero padding end of segment from "
+ << (const void *)LastBlockEnd << " to "
+ << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n";
+ });
+ while (LastBlockEnd != SegMem.data() + SegMem.size())
+ *LastBlockEnd++ = 0;
+ }
+}
+
void JITLinkerBase::deallocateAndBailOut(Error Err) {
assert(Err && "Should not be bailing out on success value");
assert(Alloc && "can not call deallocateAndBailOut before allocation");
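
The new copyBlockContentToWorkingMemory pass leans on the alignToBlock helper shown in the next file: Delta = (AlignmentOffset - Addr) % Alignment bumps the write pointer so that Addr % Alignment ends up equal to the block's alignment offset. A worked sketch with made-up numbers, assuming a power-of-two alignment as JITLink blocks use:

#include <cstdint>

static uint64_t alignExample() {
  uint64_t Addr = 0x1001, Alignment = 16, AlignmentOffset = 4;
  // Unsigned wrap-around is harmless here because Alignment divides 2^64.
  uint64_t Delta = (AlignmentOffset - Addr) % Alignment; // == 3
  return Addr + Delta; // == 0x1004, and 0x1004 % 16 == 4 as required
}
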
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
index d5687b7afc96..87e5e8bbc98d 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkGeneric.h
@@ -80,13 +80,13 @@ protected:
// For debug dumping of the link graph.
virtual StringRef getEdgeKindName(Edge::Kind K) const = 0;
- // Alight a JITTargetAddress to conform with block alignment requirements.
+ // Align a JITTargetAddress to conform with block alignment requirements.
static JITTargetAddress alignToBlock(JITTargetAddress Addr, Block &B) {
uint64_t Delta = (B.getAlignmentOffset() - Addr) % B.getAlignment();
return Addr + Delta;
}
- // Alight a pointer to conform with block alignment requirements.
+ // Align a pointer to conform with block alignment requirements.
static char *alignToBlock(char *P, Block &B) {
uint64_t PAddr = static_cast<uint64_t>(reinterpret_cast<uintptr_t>(P));
uint64_t Delta = (B.getAlignmentOffset() - PAddr) % B.getAlignment();
@@ -100,14 +100,14 @@ private:
// Copy block contents and apply relocations.
// Implemented in JITLinker.
- virtual Error
- copyAndFixUpBlocks(const SegmentLayoutMap &Layout,
- JITLinkMemoryManager::Allocation &Alloc) const = 0;
+ virtual Error fixUpBlocks(LinkGraph &G) const = 0;
SegmentLayoutMap layOutBlocks();
Error allocateSegments(const SegmentLayoutMap &Layout);
JITLinkContext::LookupMap getExternalSymbolNames() const;
void applyLookupResult(AsyncLookupResult LR);
+ void copyBlockContentToWorkingMemory(const SegmentLayoutMap &Layout,
+ JITLinkMemoryManager::Allocation &Alloc);
void deallocateAndBailOut(Error Err);
void dumpGraph(raw_ostream &OS);
@@ -144,88 +144,25 @@ private:
return static_cast<const LinkerImpl &>(*this);
}
- Error
- copyAndFixUpBlocks(const SegmentLayoutMap &Layout,
- JITLinkMemoryManager::Allocation &Alloc) const override {
- LLVM_DEBUG(dbgs() << "Copying and fixing up blocks:\n");
- for (auto &KV : Layout) {
- auto &Prot = KV.first;
- auto &SegLayout = KV.second;
-
- auto SegMem = Alloc.getWorkingMemory(
- static_cast<sys::Memory::ProtectionFlags>(Prot));
- char *LastBlockEnd = SegMem.data();
- char *BlockDataPtr = LastBlockEnd;
-
- LLVM_DEBUG({
- dbgs() << " Processing segment "
- << static_cast<sys::Memory::ProtectionFlags>(Prot) << " [ "
- << (const void *)SegMem.data() << " .. "
- << (const void *)((char *)SegMem.data() + SegMem.size())
- << " ]\n Processing content sections:\n";
- });
-
- for (auto *B : SegLayout.ContentBlocks) {
- LLVM_DEBUG(dbgs() << " " << *B << ":\n");
-
- // Pad to alignment/alignment-offset.
- BlockDataPtr = alignToBlock(BlockDataPtr, *B);
-
- LLVM_DEBUG({
- dbgs() << " Bumped block pointer to "
- << (const void *)BlockDataPtr << " to meet block alignment "
- << B->getAlignment() << " and alignment offset "
- << B->getAlignmentOffset() << "\n";
- });
-
- // Zero pad up to alignment.
- LLVM_DEBUG({
- if (LastBlockEnd != BlockDataPtr)
- dbgs() << " Zero padding from " << (const void *)LastBlockEnd
- << " to " << (const void *)BlockDataPtr << "\n";
- });
-
- while (LastBlockEnd != BlockDataPtr)
- *LastBlockEnd++ = 0;
-
- // Copy initial block content.
- LLVM_DEBUG({
- dbgs() << " Copying block " << *B << " content, "
- << B->getContent().size() << " bytes, from "
- << (const void *)B->getContent().data() << " to "
- << (const void *)BlockDataPtr << "\n";
- });
- memcpy(BlockDataPtr, B->getContent().data(), B->getContent().size());
-
- // Copy Block data and apply fixups.
- LLVM_DEBUG(dbgs() << " Applying fixups.\n");
- for (auto &E : B->edges()) {
-
- // Skip non-relocation edges.
- if (!E.isRelocation())
- continue;
-
- // Dispatch to LinkerImpl for fixup.
- if (auto Err = impl().applyFixup(*B, E, BlockDataPtr))
- return Err;
- }
-
- // Point the block's content to the fixed up buffer.
- B->setContent(StringRef(BlockDataPtr, B->getContent().size()));
-
- // Update block end pointer.
- LastBlockEnd = BlockDataPtr + B->getContent().size();
- BlockDataPtr = LastBlockEnd;
- }
+ Error fixUpBlocks(LinkGraph &G) const override {
+ LLVM_DEBUG(dbgs() << "Fixing up blocks:\n");
+
+ for (auto *B : G.blocks()) {
+ LLVM_DEBUG(dbgs() << " " << *B << ":\n");
+
+ // Copy Block data and apply fixups.
+ LLVM_DEBUG(dbgs() << " Applying fixups.\n");
+ for (auto &E : B->edges()) {
- // Zero pad the rest of the segment.
- LLVM_DEBUG({
- dbgs() << " Zero padding end of segment from "
- << (const void *)LastBlockEnd << " to "
- << (const void *)((char *)SegMem.data() + SegMem.size()) << "\n";
- });
- while (LastBlockEnd != SegMem.data() + SegMem.size())
- *LastBlockEnd++ = 0;
+ // Skip non-relocation edges.
+ if (!E.isRelocation())
+ continue;
+
+ // Dispatch to LinkerImpl for fixup.
+ auto *BlockData = const_cast<char *>(B->getContent().data());
+ if (auto Err = impl().applyFixup(*B, E, BlockData))
+ return Err;
+ }
}
return Error::success();
diff --git a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
index 9e0d207e8bdb..68ec9d79af9b 100644
--- a/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/JITLinkMemoryManager.cpp
@@ -32,7 +32,7 @@ InProcessMemoryManager::allocate(const SegmentsRequestMap &Request) {
}
JITTargetAddress getTargetMemory(ProtectionFlags Seg) override {
assert(SegBlocks.count(Seg) && "No allocation for segment");
- return reinterpret_cast<JITTargetAddress>(SegBlocks[Seg].base());
+ return pointerToJITTargetAddress(SegBlocks[Seg].base());
}
void finalizeAsync(FinalizeContinuation OnFinalize) override {
OnFinalize(applyProtections());
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
index 58bc0f56e155..b3e45868ab22 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO.cpp
@@ -16,9 +16,9 @@
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/ExecutionEngine/JITLink/MachO_arm64.h"
#include "llvm/ExecutionEngine/JITLink/MachO_x86_64.h"
-#include "llvm/Support/Endian.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SwapByteOrder.h"
using namespace llvm;
@@ -34,7 +34,9 @@ void jitLink_MachO(std::unique_ptr<JITLinkContext> Ctx) {
StringRef Data = Ctx->getObjectBuffer().getBuffer();
if (Data.size() < 4) {
- Ctx->notifyFailed(make_error<JITLinkError>("Truncated MachO buffer"));
+ StringRef BufferName = Ctx->getObjectBuffer().getBufferIdentifier();
+ Ctx->notifyFailed(make_error<JITLinkError>("Truncated MachO buffer \"" +
+ BufferName + "\""));
return;
}
@@ -51,20 +53,26 @@ void jitLink_MachO(std::unique_ptr<JITLinkContext> Ctx) {
make_error<JITLinkError>("MachO 32-bit platforms not supported"));
return;
} else if (Magic == MachO::MH_MAGIC_64 || Magic == MachO::MH_CIGAM_64) {
- MachO::mach_header_64 Header;
- memcpy(&Header, Data.data(), sizeof(MachO::mach_header_64));
+ if (Data.size() < sizeof(MachO::mach_header_64)) {
+ StringRef BufferName = Ctx->getObjectBuffer().getBufferIdentifier();
+ Ctx->notifyFailed(make_error<JITLinkError>("Truncated MachO buffer \"" +
+ BufferName + "\""));
+ return;
+ }
+
+ // Read the CPU type from the header.
+ uint32_t CPUType;
+ memcpy(&CPUType, Data.data() + 4, sizeof(uint32_t));
if (Magic == MachO::MH_CIGAM_64)
- swapStruct(Header);
+ CPUType = ByteSwap_32(CPUType);
LLVM_DEBUG({
- dbgs() << "jitLink_MachO: cputype = "
- << format("0x%08" PRIx32, Header.cputype)
- << ", cpusubtype = " << format("0x%08" PRIx32, Header.cpusubtype)
+ dbgs() << "jitLink_MachO: cputype = " << format("0x%08" PRIx32, CPUType)
<< "\n";
});
- switch (Header.cputype) {
+ switch (CPUType) {
case MachO::CPU_TYPE_ARM64:
return jitLink_MachO_arm64(std::move(Ctx));
case MachO::CPU_TYPE_X86_64:
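
Condensed restatement of what the MachO.cpp hunk above now does, for illustration only: instead of copying and swapping the whole mach_header_64, it reads just the 32-bit cputype field at offset 4 and byte-swaps it when the magic indicates the opposite endianness.

#include "llvm/Support/SwapByteOrder.h"
#include <cstdint>
#include <cstring>

// Read the cputype field at offset 4, honoring a byte-swapped magic.
static uint32_t readCPUType(const char *HeaderBytes, bool MagicIsSwapped) {
  uint32_t CPUType;
  std::memcpy(&CPUType, HeaderBytes + 4, sizeof(uint32_t));
  return MagicIsSwapped ? llvm::ByteSwap_32(CPUType) : CPUType;
}
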
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
index 701f108a9a21..fa3f403b717c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.cpp
@@ -47,8 +47,8 @@ Expected<std::unique_ptr<LinkGraph>> MachOLinkGraphBuilder::buildGraph() {
MachOLinkGraphBuilder::MachOLinkGraphBuilder(const object::MachOObjectFile &Obj)
: Obj(Obj),
- G(std::make_unique<LinkGraph>(Obj.getFileName(), getPointerSize(Obj),
- getEndianness(Obj))) {}
+ G(std::make_unique<LinkGraph>(std::string(Obj.getFileName()),
+ getPointerSize(Obj), getEndianness(Obj))) {}
void MachOLinkGraphBuilder::addCustomSectionParser(
StringRef SectionName, SectionParserFunction Parser) {
@@ -64,12 +64,14 @@ Linkage MachOLinkGraphBuilder::getLinkage(uint16_t Desc) {
}
Scope MachOLinkGraphBuilder::getScope(StringRef Name, uint8_t Type) {
- if (Name.startswith("l"))
- return Scope::Local;
if (Type & MachO::N_PEXT)
return Scope::Hidden;
- if (Type & MachO::N_EXT)
- return Scope::Default;
+ if (Type & MachO::N_EXT) {
+ if (Name.startswith("l"))
+ return Scope::Hidden;
+ else
+ return Scope::Default;
+ }
return Scope::Local;
}
@@ -77,6 +79,11 @@ bool MachOLinkGraphBuilder::isAltEntry(const NormalizedSymbol &NSym) {
return NSym.Desc & MachO::N_ALT_ENTRY;
}
+bool MachOLinkGraphBuilder::isDebugSection(const NormalizedSection &NSec) {
+ return (NSec.Flags & MachO::S_ATTR_DEBUG &&
+ strcmp(NSec.SegName, "__DWARF") == 0);
+}
+
unsigned
MachOLinkGraphBuilder::getPointerSize(const object::MachOObjectFile &Obj) {
return Obj.is64Bit() ? 8 : 4;
@@ -116,6 +123,11 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
const MachO::section_64 &Sec64 =
Obj.getSection64(SecRef.getRawDataRefImpl());
+ memcpy(&NSec.SectName, &Sec64.sectname, 16);
+ NSec.SectName[16] = '\0';
+ memcpy(&NSec.SegName, Sec64.segname, 16);
+ NSec.SegName[16] = '\0';
+
NSec.Address = Sec64.addr;
NSec.Size = Sec64.size;
NSec.Alignment = 1ULL << Sec64.align;
@@ -123,6 +135,12 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
DataOffset = Sec64.offset;
} else {
const MachO::section &Sec32 = Obj.getSection(SecRef.getRawDataRefImpl());
+
+ memcpy(&NSec.SectName, &Sec32.sectname, 16);
+ NSec.SectName[16] = '\0';
+ memcpy(&NSec.SegName, Sec32.segname, 16);
+ NSec.SegName[16] = '\0';
+
NSec.Address = Sec32.addr;
NSec.Size = Sec32.size;
NSec.Alignment = 1ULL << Sec32.align;
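The two hunks above copy the raw 16-byte segment and section name fields out of the 64-bit and 32-bit MachO section headers. Those fields are fixed-width and not guaranteed to carry a terminating NUL, which is why NSec.SectName and NSec.SegName are sized 17 and explicitly terminated. A tiny sketch of that convention in isolation (plain C++, no JITLink types):

#include <cstring>

// Copies a fixed 16-byte, possibly unterminated MachO name field into a
// caller-provided 17-byte buffer, forcing NUL termination so the name can be
// used with ordinary C string routines afterwards.
void copyMachOName(char (&Dst)[17], const char (&Src)[16]) {
  std::memcpy(Dst, Src, 16);
  Dst[16] = '\0';
}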
@@ -162,7 +180,14 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
Prot = static_cast<sys::Memory::ProtectionFlags>(sys::Memory::MF_READ |
sys::Memory::MF_WRITE);
- NSec.GraphSection = &G->createSection(*Name, Prot);
+ if (!isDebugSection(NSec))
+ NSec.GraphSection = &G->createSection(*Name, Prot);
+ else
+ LLVM_DEBUG({
+ dbgs() << " " << *Name
+ << " is a debug section: No graph section will be created.\n";
+ });
+
IndexToSection.insert(std::make_pair(SecIndex, std::move(NSec)));
}
@@ -189,12 +214,12 @@ Error MachOLinkGraphBuilder::createNormalizedSections() {
auto &Next = *Sections[I + 1];
if (Next.Address < Cur.Address + Cur.Size)
return make_error<JITLinkError>(
- "Address range for section " + Cur.GraphSection->getName() +
- formatv(" [ {0:x16} -- {1:x16} ] ", Cur.Address,
- Cur.Address + Cur.Size) +
- "overlaps " +
- formatv(" [ {0:x16} -- {1:x16} ] ", Next.Address,
- Next.Address + Next.Size));
+ "Address range for section " +
+ formatv("\"{0}/{1}\" [ {2:x16} -- {3:x16} ] ", Cur.SegName,
+ Cur.SectName, Cur.Address, Cur.Address + Cur.Size) +
+ "overlaps section \"" + Next.SegName + "/" + Next.SectName + "\"" +
+ formatv("\"{0}/{1}\" [ {2:x16} -- {3:x16} ] ", Next.SegName,
+ Next.SectName, Next.Address, Next.Address + Next.Size));
}
return Error::success();
@@ -260,21 +285,28 @@ Error MachOLinkGraphBuilder::createNormalizedSymbols() {
});
// If this symbol has a section, sanity check that the addresses line up.
- NormalizedSection *NSec = nullptr;
if (Sect != 0) {
- if (auto NSecOrErr = findSectionByIndex(Sect - 1))
- NSec = &*NSecOrErr;
- else
- return NSecOrErr.takeError();
+ auto NSec = findSectionByIndex(Sect - 1);
+ if (!NSec)
+ return NSec.takeError();
if (Value < NSec->Address || Value > NSec->Address + NSec->Size)
return make_error<JITLinkError>("Symbol address does not fall within "
"section");
+
+ if (!NSec->GraphSection) {
+ LLVM_DEBUG({
+ dbgs() << " Skipping: Symbol is in section " << NSec->SegName << "/"
+ << NSec->SectName
+ << " which has no associated graph section.\n";
+ });
+ continue;
+ }
}
IndexToSymbol[SymbolIndex] =
&createNormalizedSymbol(*Name, Value, Type, Sect, Desc,
- getLinkage(Type), getScope(*Name, Type));
+ getLinkage(Desc), getScope(*Name, Type));
}
return Error::success();
@@ -362,6 +394,14 @@ Error MachOLinkGraphBuilder::graphifyRegularSymbols() {
auto SecIndex = KV.first;
auto &NSec = KV.second;
+ if (!NSec.GraphSection) {
+ LLVM_DEBUG({
+ dbgs() << " " << NSec.SegName << "/" << NSec.SectName
+ << " has no graph section. Skipping.\n";
+ });
+ continue;
+ }
+
// Skip sections with custom parsers.
if (CustomSectionParserFunctions.count(NSec.GraphSection->getName())) {
LLVM_DEBUG({
@@ -524,6 +564,10 @@ Error MachOLinkGraphBuilder::graphifySectionsWithCustomParsers() {
for (auto &KV : IndexToSection) {
auto &NSec = KV.second;
+ // Skip non-graph sections.
+ if (!NSec.GraphSection)
+ continue;
+
auto HI = CustomSectionParserFunctions.find(NSec.GraphSection->getName());
if (HI != CustomSectionParserFunctions.end()) {
auto &Parse = HI->second;
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
index 91b1d5a22387..dd3bcf27494c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
+++ b/llvm/lib/ExecutionEngine/JITLink/MachOLinkGraphBuilder.h
@@ -13,6 +13,8 @@
#ifndef LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H
#define LIB_EXECUTIONENGINE_JITLINK_MACHOLINKGRAPHBUILDER_H
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ExecutionEngine/JITLink/JITLink.h"
#include "EHFrameSupportImpl.h"
@@ -58,6 +60,8 @@ protected:
Symbol *GraphSymbol = nullptr;
};
+ // Normalized section representation. Section and segment names are guaranteed
+ // to be null-terminated, hence the extra bytes on SegName and SectName.
class NormalizedSection {
friend class MachOLinkGraphBuilder;
@@ -65,12 +69,14 @@ protected:
NormalizedSection() = default;
public:
- Section *GraphSection = nullptr;
+ char SectName[17];
+ char SegName[17];
uint64_t Address = 0;
uint64_t Size = 0;
uint64_t Alignment = 0;
uint32_t Flags = 0;
const char *Data = nullptr;
+ Section *GraphSection = nullptr;
};
using SectionParserFunction = std::function<Error(NormalizedSection &S)>;
@@ -110,7 +116,7 @@ protected:
auto I = IndexToSection.find(Index);
if (I == IndexToSection.end())
return make_error<JITLinkError>("No section recorded for index " +
- formatv("{0:u}", Index));
+ formatv("{0:d}", Index));
return I->second;
}
@@ -123,7 +129,7 @@ protected:
auto *Sym = IndexToSymbol[Index];
if (!Sym)
return make_error<JITLinkError>("No symbol at index " +
- formatv("{0:u}", Index));
+ formatv("{0:d}", Index));
return *Sym;
}
@@ -151,6 +157,22 @@ protected:
static Scope getScope(StringRef Name, uint8_t Type);
static bool isAltEntry(const NormalizedSymbol &NSym);
+ static bool isDebugSection(const NormalizedSection &NSec);
+
+ MachO::relocation_info
+ getRelocationInfo(const object::relocation_iterator RelItr) {
+ MachO::any_relocation_info ARI =
+ getObject().getRelocation(RelItr->getRawDataRefImpl());
+ MachO::relocation_info RI;
+ RI.r_address = ARI.r_word0;
+ RI.r_symbolnum = ARI.r_word1 & 0xffffff;
+ RI.r_pcrel = (ARI.r_word1 >> 24) & 1;
+ RI.r_length = (ARI.r_word1 >> 25) & 3;
+ RI.r_extern = (ARI.r_word1 >> 27) & 1;
+ RI.r_type = (ARI.r_word1 >> 28);
+ return RI;
+ }
+
private:
static unsigned getPointerSize(const object::MachOObjectFile &Obj);
static support::endianness getEndianness(const object::MachOObjectFile &Obj);
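The getRelocationInfo helper added above decodes r_word1 field-by-field instead of memcpy'ing into MachO::relocation_info, which is what the per-target files previously did and now delete. A self-contained sketch of the same bit layout, with local stand-in structs rather than the llvm::MachO ones; the 24/1/2/1/4 split mirrors the shifts and masks in the hunk above:

#include <cstdint>

// Stand-in for MachO::any_relocation_info: two raw 32-bit words.
struct PackedReloc {
  uint32_t r_word0;
  uint32_t r_word1;
};

// Stand-in for MachO::relocation_info.
struct UnpackedReloc {
  int32_t r_address;
  uint32_t r_symbolnum : 24, r_pcrel : 1, r_length : 2, r_extern : 1,
      r_type : 4;
};

// Mirrors the unpacking above: the low 24 bits of r_word1 hold the symbol
// index, followed by the pcrel (1 bit), length (2), extern (1) and type (4)
// fields.
UnpackedReloc unpack(const PackedReloc &ARI) {
  UnpackedReloc RI;
  RI.r_address = static_cast<int32_t>(ARI.r_word0);
  RI.r_symbolnum = ARI.r_word1 & 0xffffff;
  RI.r_pcrel = (ARI.r_word1 >> 24) & 1;
  RI.r_length = (ARI.r_word1 >> 25) & 3;
  RI.r_extern = (ARI.r_word1 >> 27) & 1;
  RI.r_type = ARI.r_word1 >> 28;
  return RI;
}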
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
index 944767449ce2..463845a5b8cb 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_arm64.cpp
@@ -92,15 +92,6 @@ private:
", length=" + formatv("{0:d}", RI.r_length));
}
- MachO::relocation_info
- getRelocationInfo(const object::relocation_iterator RelItr) {
- MachO::any_relocation_info ARI =
- getObject().getRelocation(RelItr->getRawDataRefImpl());
- MachO::relocation_info RI;
- memcpy(&RI, &ARI, sizeof(MachO::relocation_info));
- return RI;
- }
-
using PairRelocInfo =
std::tuple<MachOARM64RelocationKind, Symbol *, uint64_t>;
@@ -194,6 +185,28 @@ private:
JITTargetAddress SectionAddress = S.getAddress();
+ // Skip relocations for virtual sections.
+ if (S.isVirtual()) {
+ if (S.relocation_begin() != S.relocation_end())
+ return make_error<JITLinkError>("Virtual section contains "
+ "relocations");
+ continue;
+ }
+
+ // Skip relocations for debug symbols.
+ {
+ auto &NSec =
+ getSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
+ if (!NSec.GraphSection) {
+ LLVM_DEBUG({
+ dbgs() << "Skipping relocations for MachO section " << NSec.SegName
+ << "/" << NSec.SectName
+ << " which has no associated graph section\n";
+ });
+ continue;
+ }
+ }
+
for (auto RelItr = S.relocation_begin(), RelEnd = S.relocation_end();
RelItr != RelEnd; ++RelItr) {
@@ -560,7 +573,8 @@ private:
*(ulittle32_t *)FixupPtr = Value;
break;
}
- case Pointer64: {
+ case Pointer64:
+ case Pointer64Anon: {
uint64_t Value = E.getTarget().getAddress() + E.getAddend();
*(ulittle64_t *)FixupPtr = Value;
break;
diff --git a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
index 69ec72aae292..a91bc3b6033c 100644
--- a/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
+++ b/llvm/lib/ExecutionEngine/JITLink/MachO_x86_64.cpp
@@ -95,15 +95,6 @@ private:
", length=" + formatv("{0:d}", RI.r_length));
}
- MachO::relocation_info
- getRelocationInfo(const object::relocation_iterator RelItr) {
- MachO::any_relocation_info ARI =
- getObject().getRelocation(RelItr->getRawDataRefImpl());
- MachO::relocation_info RI;
- memcpy(&RI, &ARI, sizeof(MachO::relocation_info));
- return RI;
- }
-
using PairRelocInfo = std::tuple<MachOX86RelocationKind, Symbol *, uint64_t>;
// Parses paired SUBTRACTOR/UNSIGNED relocations and, on success,
@@ -196,6 +187,7 @@ private:
JITTargetAddress SectionAddress = S.getAddress();
+ // Skip relocations for virtual sections.
if (S.isVirtual()) {
if (S.relocation_begin() != S.relocation_end())
return make_error<JITLinkError>("Virtual section contains "
@@ -203,6 +195,21 @@ private:
continue;
}
+ // Skip relocations for debug symbols.
+ {
+ auto &NSec =
+ getSectionByIndex(Obj.getSectionIndex(S.getRawDataRefImpl()));
+ if (!NSec.GraphSection) {
+ LLVM_DEBUG({
+ dbgs() << "Skipping relocations for MachO section " << NSec.SegName
+ << "/" << NSec.SectName
+ << " which has no associated graph section\n";
+ });
+ continue;
+ }
+ }
+
+ // Add relocations for section.
for (auto RelItr = S.relocation_begin(), RelEnd = S.relocation_end();
RelItr != RelEnd; ++RelItr) {
@@ -350,6 +357,9 @@ private:
class MachO_x86_64_GOTAndStubsBuilder
: public BasicGOTAndStubsBuilder<MachO_x86_64_GOTAndStubsBuilder> {
public:
+ static const uint8_t NullGOTEntryContent[8];
+ static const uint8_t StubContent[6];
+
MachO_x86_64_GOTAndStubsBuilder(LinkGraph &G)
: BasicGOTAndStubsBuilder<MachO_x86_64_GOTAndStubsBuilder>(G) {}
@@ -367,7 +377,13 @@ public:
void fixGOTEdge(Edge &E, Symbol &GOTEntry) {
assert((E.getKind() == PCRel32GOT || E.getKind() == PCRel32GOTLoad) &&
"Not a GOT edge?");
- E.setKind(PCRel32);
+ // If this is a PCRel32GOT then change it to an ordinary PCRel32. If it is
+ // a PCRel32GOTLoad then leave it as-is for now. We will use the kind to
+ // check for GOT optimization opportunities in the
+ // optimizeMachO_x86_64_GOTAndStubs pass below.
+ if (E.getKind() == PCRel32GOT)
+ E.setKind(PCRel32);
+
E.setTarget(GOTEntry);
// Leave the edge addend as-is.
}
@@ -388,6 +404,11 @@ public:
void fixExternalBranchEdge(Edge &E, Symbol &Stub) {
assert(E.getKind() == Branch32 && "Not a Branch32 edge?");
assert(E.getAddend() == 0 && "Branch32 edge has non-zero addend?");
+
+ // Set the edge kind to Branch32ToStub. We will use this to check for stub
+ // optimization opportunities in the optimizeMachO_x86_64_GOTAndStubs pass
+ // below.
+ E.setKind(Branch32ToStub);
E.setTarget(Stub);
}
@@ -417,8 +438,6 @@ private:
sizeof(StubContent));
}
- static const uint8_t NullGOTEntryContent[8];
- static const uint8_t StubContent[6];
Section *GOTSection = nullptr;
Section *StubsSection = nullptr;
};
@@ -429,6 +448,89 @@ const uint8_t MachO_x86_64_GOTAndStubsBuilder::StubContent[6] = {
0xFF, 0x25, 0x00, 0x00, 0x00, 0x00};
} // namespace
+static Error optimizeMachO_x86_64_GOTAndStubs(LinkGraph &G) {
+ LLVM_DEBUG(dbgs() << "Optimizing GOT entries and stubs:\n");
+
+ for (auto *B : G.blocks())
+ for (auto &E : B->edges())
+ if (E.getKind() == PCRel32GOTLoad) {
+ assert(E.getOffset() >= 3 && "GOT edge occurs too early in block");
+
+ // Switch the edge kind to PCRel32: Whether we change the edge target
+ // or not this will be the desired kind.
+ E.setKind(PCRel32);
+
+ // Optimize GOT references.
+ auto &GOTBlock = E.getTarget().getBlock();
+ assert(GOTBlock.getSize() == G.getPointerSize() &&
+ "GOT entry block should be pointer sized");
+ assert(GOTBlock.edges_size() == 1 &&
+ "GOT entry should only have one outgoing edge");
+
+ auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
+ JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
+ JITTargetAddress TargetAddr = GOTTarget.getAddress();
+
+ // Check that this is a recognized MOV instruction.
+ // FIXME: Can we assume this?
+ constexpr uint8_t MOVQRIPRel[] = {0x48, 0x8b};
+ if (strncmp(B->getContent().data() + E.getOffset() - 3,
+ reinterpret_cast<const char *>(MOVQRIPRel), 2) != 0)
+ continue;
+
+ int64_t Displacement = TargetAddr - EdgeAddr + 4;
+ if (Displacement >= std::numeric_limits<int32_t>::min() &&
+ Displacement <= std::numeric_limits<int32_t>::max()) {
+ E.setTarget(GOTTarget);
+ auto *BlockData = reinterpret_cast<uint8_t *>(
+ const_cast<char *>(B->getContent().data()));
+ BlockData[E.getOffset() - 2] = 0x8d;
+ LLVM_DEBUG({
+ dbgs() << " Replaced GOT load wih LEA:\n ";
+ printEdge(dbgs(), *B, E,
+ getMachOX86RelocationKindName(E.getKind()));
+ dbgs() << "\n";
+ });
+ }
+ } else if (E.getKind() == Branch32ToStub) {
+
+ // Switch the edge kind to PCRel32: Whether we change the edge target
+ // or not this will be the desired kind.
+ E.setKind(Branch32);
+
+ auto &StubBlock = E.getTarget().getBlock();
+ assert(StubBlock.getSize() ==
+ sizeof(MachO_x86_64_GOTAndStubsBuilder::StubContent) &&
+ "Stub block should be stub sized");
+ assert(StubBlock.edges_size() == 1 &&
+ "Stub block should only have one outgoing edge");
+
+ auto &GOTBlock = StubBlock.edges().begin()->getTarget().getBlock();
+ assert(GOTBlock.getSize() == G.getPointerSize() &&
+ "GOT block should be pointer sized");
+ assert(GOTBlock.edges_size() == 1 &&
+ "GOT block should only have one outgoing edge");
+
+ auto &GOTTarget = GOTBlock.edges().begin()->getTarget();
+ JITTargetAddress EdgeAddr = B->getAddress() + E.getOffset();
+ JITTargetAddress TargetAddr = GOTTarget.getAddress();
+
+ int64_t Displacement = TargetAddr - EdgeAddr + 4;
+ if (Displacement >= std::numeric_limits<int32_t>::min() &&
+ Displacement <= std::numeric_limits<int32_t>::max()) {
+ E.setTarget(GOTTarget);
+ LLVM_DEBUG({
+ dbgs() << " Replaced stub branch with direct branch:\n ";
+ printEdge(dbgs(), *B, E,
+ getMachOX86RelocationKindName(E.getKind()));
+ dbgs() << "\n";
+ });
+ }
+ }
+
+ return Error::success();
+}
+
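Both branches of optimizeMachO_x86_64_GOTAndStubs above come down to the same test: once the GOT entry or stub is bypassed, does the PC-relative displacement to the real target still fit a signed 32-bit immediate? A minimal sketch of that range check and of the one-byte MOV-to-LEA rewrite, on plain integers and bytes rather than JITLink Blocks and Edges (the 0x48 0x8b / 0x8d opcodes are the standard x86-64 encodings the pass looks for):

#include <cstdint>
#include <limits>

// True if a 64-bit displacement can be encoded as a rel32 immediate.
bool fitsInRel32(int64_t Displacement) {
  return Displacement >= std::numeric_limits<int32_t>::min() &&
         Displacement <= std::numeric_limits<int32_t>::max();
}

// Rewrites "movq GOTENTRY(%rip), %reg" (0x48 0x8b ...) into
// "leaq target(%rip), %reg" (0x48 0x8d ...) in place. InstrStart points at the
// REX prefix, i.e. three bytes before the 32-bit immediate that the PCRel32
// fixup will later patch.
bool tryRewriteGOTLoadToLEA(uint8_t *InstrStart, int64_t Displacement) {
  if (InstrStart[0] != 0x48 || InstrStart[1] != 0x8b)
    return false; // not the MOV form the pass recognizes
  if (!fitsInRel32(Displacement))
    return false; // target out of rel32 range: keep loading through the GOT
  InstrStart[1] = 0x8d; // MOV -> LEA; ModRM byte and immediate stay as-is
  return true;
}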
namespace llvm {
namespace jitlink {
@@ -570,6 +672,9 @@ void jitLink_MachO_x86_64(std::unique_ptr<JITLinkContext> Ctx) {
MachO_x86_64_GOTAndStubsBuilder(G).run();
return Error::success();
});
+
+ // Add GOT/Stubs optimizer pass.
+ Config.PostAllocationPasses.push_back(optimizeMachO_x86_64_GOTAndStubs);
}
if (auto Err = Ctx->modifyPassConfig(TT, Config))
@@ -583,6 +688,8 @@ StringRef getMachOX86RelocationKindName(Edge::Kind R) {
switch (R) {
case Branch32:
return "Branch32";
+ case Branch32ToStub:
+ return "Branch32ToStub";
case Pointer32:
return "Pointer32";
case Pointer64:
diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
index 94741f5f01d5..144329aa8bea 100644
--- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
+++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp
@@ -11,6 +11,7 @@
#include "llvm/ExecutionEngine/GenericValue.h"
#include "llvm/ExecutionEngine/JITEventListener.h"
#include "llvm/ExecutionEngine/MCJIT.h"
+#include "llvm/ExecutionEngine/ObjectCache.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -23,6 +24,7 @@
#include "llvm/Support/DynamicLibrary.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SmallVectorMemoryBuffer.h"
#include <mutex>
using namespace llvm;
@@ -239,6 +241,10 @@ void MCJIT::finalizeLoadedModules() {
// Resolve any outstanding relocations.
Dyld.resolveRelocations();
+ // Check for Dyld error.
+ if (Dyld.hasError())
+ ErrMsg = Dyld.getErrorString().str();
+
OwnedModules.markAllLoadedModulesAsFinalized();
// Register EH frame data for any module we own which has been loaded
@@ -609,7 +615,7 @@ GenericValue MCJIT::runFunction(Function *F, ArrayRef<GenericValue> ArgValues) {
void *MCJIT::getPointerToNamedFunction(StringRef Name, bool AbortOnFailure) {
if (!isSymbolSearchingDisabled()) {
- if (auto Sym = Resolver.findSymbol(Name)) {
+ if (auto Sym = Resolver.findSymbol(std::string(Name))) {
if (auto AddrOrErr = Sym.getAddress())
return reinterpret_cast<void*>(
static_cast<uintptr_t>(*AddrOrErr));
@@ -619,7 +625,7 @@ void *MCJIT::getPointerToNamedFunction(StringRef Name, bool AbortOnFailure) {
/// If a LazyFunctionCreator is installed, use it to get/create the function.
if (LazyFunctionCreator)
- if (void *RP = LazyFunctionCreator(Name))
+ if (void *RP = LazyFunctionCreator(std::string(Name)))
return RP;
if (AbortOnFailure) {
diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
index 77097fc0d17e..83b64b5171c0 100644
--- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
+++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.h
@@ -12,14 +12,13 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ExecutionEngine/ExecutionEngine.h"
-#include "llvm/ExecutionEngine/ObjectCache.h"
#include "llvm/ExecutionEngine/RTDyldMemoryManager.h"
#include "llvm/ExecutionEngine/RuntimeDyld.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/SmallVectorMemoryBuffer.h"
namespace llvm {
class MCJIT;
+class Module;
+class ObjectCache;
// This is a helper class that the MCJIT execution engine uses for linking
// functions across modules that it owns. It aggregates the memory manager
diff --git a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
index f26835ff8a08..9e38dc36faae 100644
--- a/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/CompileOnDemandLayer.cpp
@@ -7,8 +7,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/CompileOnDemandLayer.h"
+
+#include "llvm/ADT/Hashing.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
+#include "llvm/Support/FormatVariadic.h"
using namespace llvm;
using namespace llvm::orc;
@@ -35,7 +39,7 @@ static ThreadSafeModule extractSubModule(ThreadSafeModule &TSM,
Constant *Aliasee = A.getAliasee();
assert(A.hasName() && "Anonymous alias?");
assert(Aliasee->hasName() && "Anonymous aliasee");
- std::string AliasName = A.getName();
+ std::string AliasName = std::string(A.getName());
if (isa<Function>(Aliasee)) {
auto *F = cloneFunctionDecl(*A.getParent(), *cast<Function>(Aliasee));
@@ -67,17 +71,19 @@ namespace orc {
class PartitioningIRMaterializationUnit : public IRMaterializationUnit {
public:
- PartitioningIRMaterializationUnit(ExecutionSession &ES, ThreadSafeModule TSM,
- VModuleKey K, CompileOnDemandLayer &Parent)
- : IRMaterializationUnit(ES, std::move(TSM), std::move(K)),
+ PartitioningIRMaterializationUnit(ExecutionSession &ES,
+ const IRSymbolMapper::ManglingOptions &MO,
+ ThreadSafeModule TSM, VModuleKey K,
+ CompileOnDemandLayer &Parent)
+ : IRMaterializationUnit(ES, MO, std::move(TSM), std::move(K)),
Parent(Parent) {}
PartitioningIRMaterializationUnit(
- ThreadSafeModule TSM, SymbolFlagsMap SymbolFlags,
- SymbolNameToDefinitionMap SymbolToDefinition,
+ ThreadSafeModule TSM, VModuleKey K, SymbolFlagsMap SymbolFlags,
+ SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition,
CompileOnDemandLayer &Parent)
: IRMaterializationUnit(std::move(TSM), std::move(K),
- std::move(SymbolFlags),
+ std::move(SymbolFlags), std::move(InitSymbol),
std::move(SymbolToDefinition)),
Parent(Parent) {}
@@ -111,7 +117,8 @@ CompileOnDemandLayer::compileWholeModule(GlobalValueSet Requested) {
CompileOnDemandLayer::CompileOnDemandLayer(
ExecutionSession &ES, IRLayer &BaseLayer, LazyCallThroughManager &LCTMgr,
IndirectStubsManagerBuilder BuildIndirectStubsManager)
- : IRLayer(ES), BaseLayer(BaseLayer), LCTMgr(LCTMgr),
+ : IRLayer(ES, BaseLayer.getManglingOptions()), BaseLayer(BaseLayer),
+ LCTMgr(LCTMgr),
BuildIndirectStubsManager(std::move(BuildIndirectStubsManager)) {}
void CompileOnDemandLayer::setPartitionFunction(PartitionFunction Partition) {
@@ -136,36 +143,34 @@ void CompileOnDemandLayer::emit(MaterializationResponsibility R,
TSM.withModuleDo([&](Module &M) {
// First, do some cleanup on the module:
cleanUpModule(M);
-
- MangleAndInterner Mangle(ES, M.getDataLayout());
- for (auto &GV : M.global_values()) {
- if (GV.isDeclaration() || GV.hasLocalLinkage() ||
- GV.hasAppendingLinkage())
- continue;
-
- auto Name = Mangle(GV.getName());
- auto Flags = JITSymbolFlags::fromGlobalValue(GV);
- if (Flags.isCallable())
- Callables[Name] = SymbolAliasMapEntry(Name, Flags);
- else
- NonCallables[Name] = SymbolAliasMapEntry(Name, Flags);
- }
});
+ for (auto &KV : R.getSymbols()) {
+ auto &Name = KV.first;
+ auto &Flags = KV.second;
+ if (Flags.isCallable())
+ Callables[Name] = SymbolAliasMapEntry(Name, Flags);
+ else
+ NonCallables[Name] = SymbolAliasMapEntry(Name, Flags);
+ }
+
// Create a partitioning materialization unit and lodge it with the
// implementation dylib.
if (auto Err = PDR.getImplDylib().define(
std::make_unique<PartitioningIRMaterializationUnit>(
- ES, std::move(TSM), R.getVModuleKey(), *this))) {
+ ES, *getManglingOptions(), std::move(TSM), R.getVModuleKey(),
+ *this))) {
ES.reportError(std::move(Err));
R.failMaterialization();
return;
}
- R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables),
- JITDylibLookupFlags::MatchAllSymbols));
- R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(),
- std::move(Callables), AliaseeImpls));
+ if (!NonCallables.empty())
+ R.replace(reexports(PDR.getImplDylib(), std::move(NonCallables),
+ JITDylibLookupFlags::MatchAllSymbols));
+ if (!Callables.empty())
+ R.replace(lazyReexports(LCTMgr, PDR.getISManager(), PDR.getImplDylib(),
+ std::move(Callables), AliaseeImpls));
}
CompileOnDemandLayer::PerDylibResources &
@@ -173,21 +178,22 @@ CompileOnDemandLayer::getPerDylibResources(JITDylib &TargetD) {
auto I = DylibResources.find(&TargetD);
if (I == DylibResources.end()) {
auto &ImplD =
- getExecutionSession().createJITDylib(TargetD.getName() + ".impl");
- TargetD.withSearchOrderDo(
- [&](const JITDylibSearchOrder &TargetSearchOrder) {
- auto NewSearchOrder = TargetSearchOrder;
- assert(
- !NewSearchOrder.empty() &&
- NewSearchOrder.front().first == &TargetD &&
- NewSearchOrder.front().second ==
- JITDylibLookupFlags::MatchAllSymbols &&
- "TargetD must be at the front of its own search order and match "
- "non-exported symbol");
- NewSearchOrder.insert(std::next(NewSearchOrder.begin()),
- {&ImplD, JITDylibLookupFlags::MatchAllSymbols});
- ImplD.setSearchOrder(std::move(NewSearchOrder), false);
- });
+ getExecutionSession().createBareJITDylib(TargetD.getName() + ".impl");
+ JITDylibSearchOrder NewLinkOrder;
+ TargetD.withLinkOrderDo([&](const JITDylibSearchOrder &TargetLinkOrder) {
+ NewLinkOrder = TargetLinkOrder;
+ });
+
+ assert(!NewLinkOrder.empty() && NewLinkOrder.front().first == &TargetD &&
+ NewLinkOrder.front().second ==
+ JITDylibLookupFlags::MatchAllSymbols &&
+ "TargetD must be at the front of its own search order and match "
+ "non-exported symbol");
+ NewLinkOrder.insert(std::next(NewLinkOrder.begin()),
+ {&ImplD, JITDylibLookupFlags::MatchAllSymbols});
+ ImplD.setLinkOrder(NewLinkOrder, false);
+ TargetD.setLinkOrder(std::move(NewLinkOrder), false);
+
PerDylibResources PDR(ImplD, BuildIndirectStubsManager());
I = DylibResources.insert(std::make_pair(&TargetD, std::move(PDR))).first;
}
@@ -252,8 +258,15 @@ void CompileOnDemandLayer::emitPartition(
auto &ES = getExecutionSession();
GlobalValueSet RequestedGVs;
for (auto &Name : R.getRequestedSymbols()) {
- assert(Defs.count(Name) && "No definition for symbol");
- RequestedGVs.insert(Defs[Name]);
+ if (Name == R.getInitializerSymbol())
+ TSM.withModuleDo([&](Module &M) {
+ for (auto &GV : getStaticInitGVs(M))
+ RequestedGVs.insert(&GV);
+ });
+ else {
+ assert(Defs.count(Name) && "No definition for symbol");
+ RequestedGVs.insert(Defs[Name]);
+ }
}
/// Perform partitioning with the context lock held, since the partition
@@ -273,7 +286,8 @@ void CompileOnDemandLayer::emitPartition(
// If the partition is empty, return the whole module to the symbol table.
if (GVsToExtract->empty()) {
R.replace(std::make_unique<PartitioningIRMaterializationUnit>(
- std::move(TSM), R.getSymbols(), std::move(Defs), *this));
+ std::move(TSM), R.getVModuleKey(), R.getSymbols(),
+ R.getInitializerSymbol(), std::move(Defs), *this));
return;
}
@@ -284,29 +298,52 @@ void CompileOnDemandLayer::emitPartition(
//
// FIXME: We apply this promotion once per partitioning. It's safe, but
// overkill.
-
auto ExtractedTSM =
TSM.withModuleDo([&](Module &M) -> Expected<ThreadSafeModule> {
auto PromotedGlobals = PromoteSymbols(M);
if (!PromotedGlobals.empty()) {
+
MangleAndInterner Mangle(ES, M.getDataLayout());
SymbolFlagsMap SymbolFlags;
- for (auto &GV : PromotedGlobals)
- SymbolFlags[Mangle(GV->getName())] =
- JITSymbolFlags::fromGlobalValue(*GV);
+ IRSymbolMapper::add(ES, *getManglingOptions(),
+ PromotedGlobals, SymbolFlags);
+
if (auto Err = R.defineMaterializing(SymbolFlags))
return std::move(Err);
}
expandPartition(*GVsToExtract);
+ // Submodule name is given by hashing the names of the globals.
+ std::string SubModuleName;
+ {
+ std::vector<const GlobalValue*> HashGVs;
+ HashGVs.reserve(GVsToExtract->size());
+ for (auto *GV : *GVsToExtract)
+ HashGVs.push_back(GV);
+ llvm::sort(HashGVs, [](const GlobalValue *LHS, const GlobalValue *RHS) {
+ return LHS->getName() < RHS->getName();
+ });
+ hash_code HC(0);
+ for (auto *GV : HashGVs) {
+ assert(GV->hasName() && "All GVs to extract should be named by now");
+ auto GVName = GV->getName();
+ HC = hash_combine(HC, hash_combine_range(GVName.begin(), GVName.end()));
+ }
+ raw_string_ostream(SubModuleName)
+ << ".submodule."
+ << formatv(sizeof(size_t) == 8 ? "{0:x16}" : "{0:x8}",
+ static_cast<size_t>(HC))
+ << ".ll";
+ }
+
// Extract the requested partition (plus any necessary aliases) and
// put the rest back into the impl dylib.
auto ShouldExtract = [&](const GlobalValue &GV) -> bool {
return GVsToExtract->count(&GV);
};
- return extractSubModule(TSM, ".submodule", ShouldExtract);
+ return extractSubModule(TSM, SubModuleName, ShouldExtract);
});
if (!ExtractedTSM) {
@@ -316,7 +353,7 @@ void CompileOnDemandLayer::emitPartition(
}
R.replace(std::make_unique<PartitioningIRMaterializationUnit>(
- ES, std::move(TSM), R.getVModuleKey(), *this));
+ ES, *getManglingOptions(), std::move(TSM), R.getVModuleKey(), *this));
BaseLayer.emit(std::move(R), std::move(*ExtractedTSM));
}
diff --git a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
index f5671d90420a..f8efed15edea 100644
--- a/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/CompileUtils.cpp
@@ -24,11 +24,20 @@
namespace llvm {
namespace orc {
+IRSymbolMapper::ManglingOptions
+irManglingOptionsFromTargetOptions(const TargetOptions &Opts) {
+ IRSymbolMapper::ManglingOptions MO;
+
+ MO.EmulatedTLS = Opts.EmulatedTLS;
+
+ return MO;
+}
+
/// Compile a Module to an ObjectFile.
-SimpleCompiler::CompileResult SimpleCompiler::operator()(Module &M) {
+Expected<SimpleCompiler::CompileResult> SimpleCompiler::operator()(Module &M) {
CompileResult CachedObject = tryToLoadFromObjectCache(M);
if (CachedObject)
- return CachedObject;
+ return std::move(CachedObject);
SmallVector<char, 0> ObjBufferSV;
@@ -38,7 +47,8 @@ SimpleCompiler::CompileResult SimpleCompiler::operator()(Module &M) {
legacy::PassManager PM;
MCContext *Ctx;
if (TM.addPassesToEmitMC(PM, Ctx, ObjStream))
- llvm_unreachable("Target does not support MC emission.");
+ return make_error<StringError>("Target does not support MC emission",
+ inconvertibleErrorCode());
PM.run(M);
}
@@ -47,14 +57,11 @@ SimpleCompiler::CompileResult SimpleCompiler::operator()(Module &M) {
auto Obj = object::ObjectFile::createObjectFile(ObjBuffer->getMemBufferRef());
- if (Obj) {
- notifyObjectCompiled(M, *ObjBuffer);
- return std::move(ObjBuffer);
- }
+ if (!Obj)
+ return Obj.takeError();
- // TODO: Actually report errors helpfully.
- consumeError(Obj.takeError());
- return nullptr;
+ notifyObjectCompiled(M, *ObjBuffer);
+ return std::move(ObjBuffer);
}
SimpleCompiler::CompileResult
@@ -73,9 +80,11 @@ void SimpleCompiler::notifyObjectCompiled(const Module &M,
ConcurrentIRCompiler::ConcurrentIRCompiler(JITTargetMachineBuilder JTMB,
ObjectCache *ObjCache)
- : JTMB(std::move(JTMB)), ObjCache(ObjCache) {}
+ : IRCompiler(irManglingOptionsFromTargetOptions(JTMB.getOptions())),
+ JTMB(std::move(JTMB)), ObjCache(ObjCache) {}
-std::unique_ptr<MemoryBuffer> ConcurrentIRCompiler::operator()(Module &M) {
+Expected<std::unique_ptr<MemoryBuffer>>
+ConcurrentIRCompiler::operator()(Module &M) {
auto TM = cantFail(JTMB.createTargetMachine());
SimpleCompiler C(*TM, ObjCache);
return C(M);
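With SimpleCompiler::operator() and ConcurrentIRCompiler::operator() now returning Expected values, callers have to consume the error path explicitly instead of relying on the old llvm_unreachable / consumeError behaviour. A hedged sketch of that calling pattern; compile() here is a local stand-in for invoking one of those compilers, not an LLVM API:

#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include <memory>
#include <utility>

// Local stand-in for a fallible compile step that yields an object buffer.
static llvm::Expected<std::unique_ptr<llvm::MemoryBuffer>> compile(bool Succeed) {
  if (!Succeed)
    return llvm::make_error<llvm::StringError>(
        "Target does not support MC emission", llvm::inconvertibleErrorCode());
  return llvm::MemoryBuffer::getMemBuffer("<fake object bytes>");
}

// Callers now branch on the Expected and propagate the error upward.
static llvm::Error useCompiler(bool Succeed) {
  auto ObjOrErr = compile(Succeed);
  if (!ObjOrErr)
    return ObjOrErr.takeError();
  std::unique_ptr<llvm::MemoryBuffer> Obj = std::move(*ObjOrErr);
  // ... hand Obj to the object layer as before ...
  (void)Obj;
  return llvm::Error::success();
}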
diff --git a/llvm/lib/ExecutionEngine/Orc/Core.cpp b/llvm/lib/ExecutionEngine/Orc/Core.cpp
index 63ef889dae46..bad13cfebbc6 100644
--- a/llvm/lib/ExecutionEngine/Orc/Core.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Core.cpp
@@ -10,302 +10,30 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
#include "llvm/ExecutionEngine/Orc/OrcError.h"
-#include "llvm/IR/Mangler.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/Format.h"
+#include <condition_variable>
#if LLVM_ENABLE_THREADS
#include <future>
#endif
#define DEBUG_TYPE "orc"
-using namespace llvm;
-
-namespace {
-
-#ifndef NDEBUG
-
-cl::opt<bool> PrintHidden("debug-orc-print-hidden", cl::init(true),
- cl::desc("debug print hidden symbols defined by "
- "materialization units"),
- cl::Hidden);
-
-cl::opt<bool> PrintCallable("debug-orc-print-callable", cl::init(true),
- cl::desc("debug print callable symbols defined by "
- "materialization units"),
- cl::Hidden);
-
-cl::opt<bool> PrintData("debug-orc-print-data", cl::init(true),
- cl::desc("debug print data symbols defined by "
- "materialization units"),
- cl::Hidden);
-
-#endif // NDEBUG
-
-// SetPrinter predicate that prints every element.
-template <typename T> struct PrintAll {
- bool operator()(const T &E) { return true; }
-};
-
-bool anyPrintSymbolOptionSet() {
-#ifndef NDEBUG
- return PrintHidden || PrintCallable || PrintData;
-#else
- return false;
-#endif // NDEBUG
-}
-
-bool flagsMatchCLOpts(const JITSymbolFlags &Flags) {
-#ifndef NDEBUG
- // Bail out early if this is a hidden symbol and we're not printing hiddens.
- if (!PrintHidden && !Flags.isExported())
- return false;
-
- // Return true if this is callable and we're printing callables.
- if (PrintCallable && Flags.isCallable())
- return true;
-
- // Return true if this is data and we're printing data.
- if (PrintData && !Flags.isCallable())
- return true;
-
- // otherwise return false.
- return false;
-#else
- return false;
-#endif // NDEBUG
-}
-
-// Prints a sequence of items, filtered by an user-supplied predicate.
-template <typename Sequence,
- typename Pred = PrintAll<typename Sequence::value_type>>
-class SequencePrinter {
-public:
- SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq,
- Pred ShouldPrint = Pred())
- : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq),
- ShouldPrint(std::move(ShouldPrint)) {}
-
- void printTo(llvm::raw_ostream &OS) const {
- bool PrintComma = false;
- OS << OpenSeq;
- for (auto &E : S) {
- if (ShouldPrint(E)) {
- if (PrintComma)
- OS << ',';
- OS << ' ' << E;
- PrintComma = true;
- }
- }
- OS << ' ' << CloseSeq;
- }
-
-private:
- const Sequence &S;
- char OpenSeq;
- char CloseSeq;
- mutable Pred ShouldPrint;
-};
-
-template <typename Sequence, typename Pred>
-SequencePrinter<Sequence, Pred> printSequence(const Sequence &S, char OpenSeq,
- char CloseSeq, Pred P = Pred()) {
- return SequencePrinter<Sequence, Pred>(S, OpenSeq, CloseSeq, std::move(P));
-}
-
-// Render a SequencePrinter by delegating to its printTo method.
-template <typename Sequence, typename Pred>
-llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
- const SequencePrinter<Sequence, Pred> &Printer) {
- Printer.printTo(OS);
- return OS;
-}
-
-struct PrintSymbolFlagsMapElemsMatchingCLOpts {
- bool operator()(const orc::SymbolFlagsMap::value_type &KV) {
- return flagsMatchCLOpts(KV.second);
- }
-};
-
-struct PrintSymbolMapElemsMatchingCLOpts {
- bool operator()(const orc::SymbolMap::value_type &KV) {
- return flagsMatchCLOpts(KV.second.getFlags());
- }
-};
-
-} // end anonymous namespace
-
namespace llvm {
namespace orc {
char FailedToMaterialize::ID = 0;
char SymbolsNotFound::ID = 0;
char SymbolsCouldNotBeRemoved::ID = 0;
+char MissingSymbolDefinitions::ID = 0;
+char UnexpectedSymbolDefinitions::ID = 0;
RegisterDependenciesFunction NoDependenciesToRegister =
RegisterDependenciesFunction();
void MaterializationUnit::anchor() {}
-raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) {
- return OS << *Sym;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) {
- return OS << printSequence(Symbols, '{', '}', PrintAll<SymbolStringPtr>());
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) {
- return OS << printSequence(Symbols, '[', ']', PrintAll<SymbolStringPtr>());
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) {
- if (Flags.hasError())
- OS << "[*ERROR*]";
- if (Flags.isCallable())
- OS << "[Callable]";
- else
- OS << "[Data]";
- if (Flags.isWeak())
- OS << "[Weak]";
- else if (Flags.isCommon())
- OS << "[Common]";
-
- if (!Flags.isExported())
- OS << "[Hidden]";
-
- return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) {
- return OS << format("0x%016" PRIx64, Sym.getAddress()) << " "
- << Sym.getFlags();
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) {
- return OS << "(\"" << KV.first << "\", " << KV.second << ")";
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) {
- return OS << "(\"" << KV.first << "\": " << KV.second << ")";
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) {
- return OS << printSequence(SymbolFlags, '{', '}',
- PrintSymbolFlagsMapElemsMatchingCLOpts());
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) {
- return OS << printSequence(Symbols, '{', '}',
- PrintSymbolMapElemsMatchingCLOpts());
-}
-
-raw_ostream &operator<<(raw_ostream &OS,
- const SymbolDependenceMap::value_type &KV) {
- return OS << "(" << KV.first << ", " << KV.second << ")";
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) {
- return OS << printSequence(Deps, '{', '}',
- PrintAll<SymbolDependenceMap::value_type>());
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) {
- OS << "MU@" << &MU << " (\"" << MU.getName() << "\"";
- if (anyPrintSymbolOptionSet())
- OS << ", " << MU.getSymbols();
- return OS << ")";
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K) {
- switch (K) {
- case LookupKind::Static:
- return OS << "Static";
- case LookupKind::DLSym:
- return OS << "DLSym";
- }
- llvm_unreachable("Invalid lookup kind");
-}
-
-raw_ostream &operator<<(raw_ostream &OS,
- const JITDylibLookupFlags &JDLookupFlags) {
- switch (JDLookupFlags) {
- case JITDylibLookupFlags::MatchExportedSymbolsOnly:
- return OS << "MatchExportedSymbolsOnly";
- case JITDylibLookupFlags::MatchAllSymbols:
- return OS << "MatchAllSymbols";
- }
- llvm_unreachable("Invalid JITDylib lookup flags");
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) {
- switch (LookupFlags) {
- case SymbolLookupFlags::RequiredSymbol:
- return OS << "RequiredSymbol";
- case SymbolLookupFlags::WeaklyReferencedSymbol:
- return OS << "WeaklyReferencedSymbol";
- }
- llvm_unreachable("Invalid symbol lookup flags");
-}
-
-raw_ostream &operator<<(raw_ostream &OS,
- const SymbolLookupSet::value_type &KV) {
- return OS << "(" << KV.first << ", " << KV.second << ")";
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) {
- return OS << printSequence(LookupSet, '{', '}',
- PrintAll<SymbolLookupSet::value_type>());
-}
-
-raw_ostream &operator<<(raw_ostream &OS,
- const JITDylibSearchOrder &SearchOrder) {
- OS << "[";
- if (!SearchOrder.empty()) {
- assert(SearchOrder.front().first &&
- "JITDylibList entries must not be null");
- OS << " (\"" << SearchOrder.front().first->getName() << "\", "
- << SearchOrder.begin()->second << ")";
- for (auto &KV :
- make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) {
- assert(KV.first && "JITDylibList entries must not be null");
- OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")";
- }
- }
- OS << " ]";
- return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) {
- OS << "{";
- for (auto &KV : Aliases)
- OS << " " << *KV.first << ": " << KV.second.Aliasee << " "
- << KV.second.AliasFlags;
- OS << " }";
- return OS;
-}
-
-raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) {
- switch (S) {
- case SymbolState::Invalid:
- return OS << "Invalid";
- case SymbolState::NeverSearched:
- return OS << "Never-Searched";
- case SymbolState::Materializing:
- return OS << "Materializing";
- case SymbolState::Resolved:
- return OS << "Resolved";
- case SymbolState::Emitted:
- return OS << "Emitted";
- case SymbolState::Ready:
- return OS << "Ready";
- }
- llvm_unreachable("Invalid state");
-}
-
FailedToMaterialize::FailedToMaterialize(
std::shared_ptr<SymbolDependenceMap> Symbols)
: Symbols(std::move(Symbols)) {
@@ -352,6 +80,24 @@ void SymbolsCouldNotBeRemoved::log(raw_ostream &OS) const {
OS << "Symbols could not be removed: " << Symbols;
}
+std::error_code MissingSymbolDefinitions::convertToErrorCode() const {
+ return orcError(OrcErrorCode::MissingSymbolDefinitions);
+}
+
+void MissingSymbolDefinitions::log(raw_ostream &OS) const {
+ OS << "Missing definitions in module " << ModuleName
+ << ": " << Symbols;
+}
+
+std::error_code UnexpectedSymbolDefinitions::convertToErrorCode() const {
+ return orcError(OrcErrorCode::UnexpectedSymbolDefinitions);
+}
+
+void UnexpectedSymbolDefinitions::log(raw_ostream &OS) const {
+ OS << "Unexpected definitions in module " << ModuleName
+ << ": " << Symbols;
+}
+
AsynchronousSymbolQuery::AsynchronousSymbolQuery(
const SymbolLookupSet &Symbols, SymbolState RequiredState,
SymbolsResolvedCallback NotifyComplete)
@@ -372,7 +118,13 @@ void AsynchronousSymbolQuery::notifySymbolMetRequiredState(
assert(I != ResolvedSymbols.end() &&
"Resolving symbol outside the requested set");
assert(I->second.getAddress() == 0 && "Redundantly resolving symbol Name");
- I->second = std::move(Sym);
+
+ // If this is a materialization-side-effects-only symbol then drop it,
+ // otherwise update its map entry with its resolved address.
+ if (Sym.getFlags().hasMaterializationSideEffectsOnly())
+ ResolvedSymbols.erase(I);
+ else
+ I->second = std::move(Sym);
--OutstandingSymbolsCount;
}
@@ -413,6 +165,14 @@ void AsynchronousSymbolQuery::removeQueryDependence(
QueryRegistrations.erase(QRI);
}
+void AsynchronousSymbolQuery::dropSymbol(const SymbolStringPtr &Name) {
+ auto I = ResolvedSymbols.find(Name);
+ assert(I != ResolvedSymbols.end() &&
+ "Redundant removal of weakly-referenced symbol");
+ ResolvedSymbols.erase(I);
+ --OutstandingSymbolsCount;
+}
+
void AsynchronousSymbolQuery::detach() {
ResolvedSymbols.clear();
OutstandingSymbolsCount = 0;
@@ -421,24 +181,18 @@ void AsynchronousSymbolQuery::detach() {
QueryRegistrations.clear();
}
-MaterializationResponsibility::MaterializationResponsibility(
- JITDylib &JD, SymbolFlagsMap SymbolFlags, VModuleKey K)
- : JD(JD), SymbolFlags(std::move(SymbolFlags)), K(std::move(K)) {
- assert(!this->SymbolFlags.empty() && "Materializing nothing?");
-}
-
MaterializationResponsibility::~MaterializationResponsibility() {
assert(SymbolFlags.empty() &&
"All symbols should have been explicitly materialized or failed");
}
SymbolNameSet MaterializationResponsibility::getRequestedSymbols() const {
- return JD.getRequestedSymbols(SymbolFlags);
+ return JD->getRequestedSymbols(SymbolFlags);
}
Error MaterializationResponsibility::notifyResolved(const SymbolMap &Symbols) {
LLVM_DEBUG({
- dbgs() << "In " << JD.getName() << " resolving " << Symbols << "\n";
+ dbgs() << "In " << JD->getName() << " resolving " << Symbols << "\n";
});
#ifndef NDEBUG
for (auto &KV : Symbols) {
@@ -446,21 +200,23 @@ Error MaterializationResponsibility::notifyResolved(const SymbolMap &Symbols) {
auto I = SymbolFlags.find(KV.first);
assert(I != SymbolFlags.end() &&
"Resolving symbol outside this responsibility set");
+ assert(!I->second.hasMaterializationSideEffectsOnly() &&
+ "Can't resolve materialization-side-effects-only symbol");
assert((KV.second.getFlags() & ~WeakFlags) == (I->second & ~WeakFlags) &&
"Resolving symbol with incorrect flags");
}
#endif
- return JD.resolve(Symbols);
+ return JD->resolve(Symbols);
}
Error MaterializationResponsibility::notifyEmitted() {
LLVM_DEBUG({
- dbgs() << "In " << JD.getName() << " emitting " << SymbolFlags << "\n";
+ dbgs() << "In " << JD->getName() << " emitting " << SymbolFlags << "\n";
});
- if (auto Err = JD.emit(SymbolFlags))
+ if (auto Err = JD->emit(SymbolFlags))
return Err;
SymbolFlags.clear();
@@ -468,44 +224,59 @@ Error MaterializationResponsibility::notifyEmitted() {
}
Error MaterializationResponsibility::defineMaterializing(
- const SymbolFlagsMap &NewSymbolFlags) {
- // Add the given symbols to this responsibility object.
- // It's ok if we hit a duplicate here: In that case the new version will be
- // discarded, and the JITDylib::defineMaterializing method will return a
- // duplicate symbol error.
- for (auto &KV : NewSymbolFlags)
- SymbolFlags.insert(KV);
+ SymbolFlagsMap NewSymbolFlags) {
- return JD.defineMaterializing(NewSymbolFlags);
+ LLVM_DEBUG({
+ dbgs() << "In " << JD->getName() << " defining materializing symbols "
+ << NewSymbolFlags << "\n";
+ });
+ if (auto AcceptedDefs = JD->defineMaterializing(std::move(NewSymbolFlags))) {
+ // Add all newly accepted symbols to this responsibility object.
+ for (auto &KV : *AcceptedDefs)
+ SymbolFlags.insert(KV);
+ return Error::success();
+ } else
+ return AcceptedDefs.takeError();
}
void MaterializationResponsibility::failMaterialization() {
LLVM_DEBUG({
- dbgs() << "In " << JD.getName() << " failing materialization for "
+ dbgs() << "In " << JD->getName() << " failing materialization for "
<< SymbolFlags << "\n";
});
JITDylib::FailedSymbolsWorklist Worklist;
for (auto &KV : SymbolFlags)
- Worklist.push_back(std::make_pair(&JD, KV.first));
+ Worklist.push_back(std::make_pair(JD.get(), KV.first));
SymbolFlags.clear();
- JD.notifyFailed(std::move(Worklist));
+ JD->notifyFailed(std::move(Worklist));
}
void MaterializationResponsibility::replace(
std::unique_ptr<MaterializationUnit> MU) {
- for (auto &KV : MU->getSymbols())
+
+ // If the replacement MU is empty then return.
+ if (MU->getSymbols().empty())
+ return;
+
+ for (auto &KV : MU->getSymbols()) {
+ assert(SymbolFlags.count(KV.first) &&
+ "Replacing definition outside this responsibility set");
SymbolFlags.erase(KV.first);
+ }
- LLVM_DEBUG(JD.getExecutionSession().runSessionLocked([&]() {
- dbgs() << "In " << JD.getName() << " replacing symbols with " << *MU
+ if (MU->getInitializerSymbol() == InitSymbol)
+ InitSymbol = nullptr;
+
+ LLVM_DEBUG(JD->getExecutionSession().runSessionLocked([&]() {
+ dbgs() << "In " << JD->getName() << " replacing symbols with " << *MU
<< "\n";
}););
- JD.replace(std::move(MU));
+ JD->replace(std::move(MU));
}
MaterializationResponsibility
@@ -515,6 +286,7 @@ MaterializationResponsibility::delegate(const SymbolNameSet &Symbols,
if (NewKey == VModuleKey())
NewKey = K;
+ SymbolStringPtr DelegatedInitSymbol;
SymbolFlagsMap DelegatedFlags;
for (auto &Name : Symbols) {
@@ -524,29 +296,41 @@ MaterializationResponsibility::delegate(const SymbolNameSet &Symbols,
"instance");
DelegatedFlags[Name] = std::move(I->second);
+ if (Name == InitSymbol)
+ std::swap(InitSymbol, DelegatedInitSymbol);
+
SymbolFlags.erase(I);
}
return MaterializationResponsibility(JD, std::move(DelegatedFlags),
+ std::move(DelegatedInitSymbol),
std::move(NewKey));
}
void MaterializationResponsibility::addDependencies(
const SymbolStringPtr &Name, const SymbolDependenceMap &Dependencies) {
+ LLVM_DEBUG({
+ dbgs() << "Adding dependencies for " << Name << ": " << Dependencies
+ << "\n";
+ });
assert(SymbolFlags.count(Name) &&
"Symbol not covered by this MaterializationResponsibility instance");
- JD.addDependencies(Name, Dependencies);
+ JD->addDependencies(Name, Dependencies);
}
void MaterializationResponsibility::addDependenciesForAll(
const SymbolDependenceMap &Dependencies) {
+ LLVM_DEBUG({
+ dbgs() << "Adding dependencies for all symbols in " << SymbolFlags << ": "
+ << Dependencies << "\n";
+ });
for (auto &KV : SymbolFlags)
- JD.addDependencies(KV.first, Dependencies);
+ JD->addDependencies(KV.first, Dependencies);
}
AbsoluteSymbolsMaterializationUnit::AbsoluteSymbolsMaterializationUnit(
SymbolMap Symbols, VModuleKey K)
- : MaterializationUnit(extractFlags(Symbols), std::move(K)),
+ : MaterializationUnit(extractFlags(Symbols), nullptr, std::move(K)),
Symbols(std::move(Symbols)) {}
StringRef AbsoluteSymbolsMaterializationUnit::getName() const {
@@ -577,7 +361,7 @@ AbsoluteSymbolsMaterializationUnit::extractFlags(const SymbolMap &Symbols) {
ReExportsMaterializationUnit::ReExportsMaterializationUnit(
JITDylib *SourceJD, JITDylibLookupFlags SourceJDLookupFlags,
SymbolAliasMap Aliases, VModuleKey K)
- : MaterializationUnit(extractFlags(Aliases), std::move(K)),
+ : MaterializationUnit(extractFlags(Aliases), nullptr, std::move(K)),
SourceJD(SourceJD), SourceJDLookupFlags(SourceJDLookupFlags),
Aliases(std::move(Aliases)) {}
@@ -630,11 +414,13 @@ void ReExportsMaterializationUnit::materialize(
SymbolAliasMap Aliases;
};
- // Build a list of queries to issue. In each round we build the largest set of
- // aliases that we can resolve without encountering a chain definition of the
- // form Foo -> Bar, Bar -> Baz. Such a form would deadlock as the query would
- // be waitin on a symbol that it itself had to resolve. Usually this will just
- // involve one round and a single query.
+ // Build a list of queries to issue. In each round we build a query for the
+ // largest set of aliases that we can resolve without encountering a chain of
+ // aliases (e.g. Foo -> Bar, Bar -> Baz). Such a chain would deadlock as the
+ // query would be waiting on a symbol that it itself had to resolve. Creating
+ // a new query for each link in such a chain eliminates the possibility of
+ // deadlock. In practice chains are likely to be rare, and this algorithm will
+ // usually result in a single query to issue.
std::vector<std::pair<SymbolLookupSet, std::shared_ptr<OnResolveInfo>>>
QueryInfos;
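The comment above describes batching alias resolution into rounds so that no query ends up waiting on an aliasee that the same query is responsible for producing. A toy sketch of that batching over plain strings (the real code works on SymbolStringPtrs, carries alias flags, and only needs to break chains when source and target JITDylib coincide):

#include <map>
#include <string>
#include <vector>

using AliasMap = std::map<std::string, std::string>; // alias -> aliasee

// Splits Aliases into rounds; within a round no aliasee is itself still a
// pending alias, so a round's lookup can never wait on that round's own
// results. For {Foo->Bar, Bar->Baz, Baz->printf} this yields {Baz->printf},
// then {Bar->Baz}, then {Foo->Bar}.
std::vector<AliasMap> planQueryRounds(AliasMap Aliases) {
  std::vector<AliasMap> Rounds;
  while (!Aliases.empty()) {
    AliasMap Round;
    for (auto &KV : Aliases)
      if (!Aliases.count(KV.second)) // aliasee is not a pending alias
        Round.insert(KV);
    if (Round.empty())
      break; // genuine cycle (e.g. A -> A): give up rather than spin
    for (auto &KV : Round)
      Aliases.erase(KV.first);
    Rounds.push_back(std::move(Round));
  }
  return Rounds;
}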
@@ -651,7 +437,10 @@ void ReExportsMaterializationUnit::materialize(
continue;
ResponsibilitySymbols.insert(KV.first);
- QuerySymbols.add(KV.second.Aliasee);
+ QuerySymbols.add(KV.second.Aliasee,
+ KV.second.AliasFlags.hasMaterializationSideEffectsOnly()
+ ? SymbolLookupFlags::WeaklyReferencedSymbol
+ : SymbolLookupFlags::RequiredSymbol);
QueryAliases[KV.first] = std::move(KV.second);
}
@@ -700,8 +489,13 @@ void ReExportsMaterializationUnit::materialize(
if (Result) {
SymbolMap ResolutionMap;
for (auto &KV : QueryInfo->Aliases) {
- assert(Result->count(KV.second.Aliasee) &&
+ assert((KV.second.AliasFlags.hasMaterializationSideEffectsOnly() ||
+ Result->count(KV.second.Aliasee)) &&
"Result map missing entry?");
+ // Don't try to resolve materialization-side-effects-only symbols.
+ if (KV.second.AliasFlags.hasMaterializationSideEffectsOnly())
+ continue;
+
ResolutionMap[KV.first] = JITEvaluatedSymbol(
(*Result)[KV.second.Aliasee].getAddress(), KV.second.AliasFlags);
}
@@ -809,31 +603,52 @@ void JITDylib::removeGenerator(DefinitionGenerator &G) {
});
}
-Error JITDylib::defineMaterializing(const SymbolFlagsMap &SymbolFlags) {
- return ES.runSessionLocked([&]() -> Error {
+Expected<SymbolFlagsMap>
+JITDylib::defineMaterializing(SymbolFlagsMap SymbolFlags) {
+
+ return ES.runSessionLocked([&]() -> Expected<SymbolFlagsMap> {
std::vector<SymbolTable::iterator> AddedSyms;
+ std::vector<SymbolFlagsMap::iterator> RejectedWeakDefs;
- for (auto &KV : SymbolFlags) {
- SymbolTable::iterator EntryItr;
- bool Added;
+ for (auto SFItr = SymbolFlags.begin(), SFEnd = SymbolFlags.end();
+ SFItr != SFEnd; ++SFItr) {
- std::tie(EntryItr, Added) =
- Symbols.insert(std::make_pair(KV.first, SymbolTableEntry(KV.second)));
+ auto &Name = SFItr->first;
+ auto &Flags = SFItr->second;
- if (Added) {
- AddedSyms.push_back(EntryItr);
- EntryItr->second.setState(SymbolState::Materializing);
- } else {
- // Remove any symbols already added.
- for (auto &SI : AddedSyms)
- Symbols.erase(SI);
+ auto EntryItr = Symbols.find(Name);
- // FIXME: Return all duplicates.
- return make_error<DuplicateDefinition>(*KV.first);
- }
+ // If the entry already exists...
+ if (EntryItr != Symbols.end()) {
+
+ // If this is a strong definition then error out.
+ if (!Flags.isWeak()) {
+ // Remove any symbols already added.
+ for (auto &SI : AddedSyms)
+ Symbols.erase(SI);
+
+ // FIXME: Return all duplicates.
+ return make_error<DuplicateDefinition>(std::string(*Name));
+ }
+
+ // Otherwise just make a note to discard this symbol after the loop.
+ RejectedWeakDefs.push_back(SFItr);
+ continue;
+ } else
+ EntryItr =
+ Symbols.insert(std::make_pair(Name, SymbolTableEntry(Flags))).first;
+
+ AddedSyms.push_back(EntryItr);
+ EntryItr->second.setState(SymbolState::Materializing);
}
- return Error::success();
+ // Remove any rejected weak definitions from the SymbolFlags map.
+ while (!RejectedWeakDefs.empty()) {
+ SymbolFlags.erase(RejectedWeakDefs.back());
+ RejectedWeakDefs.pop_back();
+ }
+
+ return SymbolFlags;
});
}
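The reworked defineMaterializing above now filters the incoming flags map itself: unseen names are added in the Materializing state, duplicate weak definitions are dropped from the returned map, and a duplicate strong definition rolls back this call's insertions and reports an error. A simplified sketch of that acceptance policy over plain strings (no session lock, no symbol states, no JITDylib):

#include <map>
#include <optional>
#include <set>
#include <string>
#include <vector>

struct SymFlags {
  bool IsWeak = false;
};
using FlagsMap = std::map<std::string, SymFlags>;

// Returns the accepted subset of NewDefs, or std::nullopt on a clash with an
// existing strong definition. SymbolTable is only left modified on success.
std::optional<FlagsMap> acceptMaterializingDefs(std::set<std::string> &SymbolTable,
                                                FlagsMap NewDefs) {
  std::vector<std::string> Added;
  for (auto I = NewDefs.begin(); I != NewDefs.end();) {
    if (SymbolTable.count(I->first)) {
      if (!I->second.IsWeak) {
        for (auto &Name : Added) // roll back this call's insertions
          SymbolTable.erase(Name);
        return std::nullopt;     // duplicate strong definition
      }
      I = NewDefs.erase(I);      // rejected weak def: drop it from the result
      continue;
    }
    SymbolTable.insert(I->first); // new name: would enter Materializing state
    Added.push_back(I->first);
    ++I;
  }
  return NewDefs; // what the caller is now responsible for materializing
}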
@@ -847,8 +662,8 @@ void JITDylib::replace(std::unique_ptr<MaterializationUnit> MU) {
for (auto &KV : MU->getSymbols()) {
auto SymI = Symbols.find(KV.first);
assert(SymI != Symbols.end() && "Replacing unknown symbol");
- assert(SymI->second.isInMaterializationPhase() &&
- "Can not call replace on a symbol that is not materializing");
+ assert(SymI->second.getState() == SymbolState::Materializing &&
+ "Can not replace a symbol that ha is not materializing");
assert(!SymI->second.hasMaterializerAttached() &&
"Symbol should not have materializer attached already");
assert(UnmaterializedInfos.count(KV.first) == 0 &&
@@ -878,14 +693,21 @@ void JITDylib::replace(std::unique_ptr<MaterializationUnit> MU) {
"Unexpected materializer entry in map");
SymI->second.setAddress(SymI->second.getAddress());
SymI->second.setMaterializerAttached(true);
- UnmaterializedInfos[KV.first] = UMI;
+
+ auto &UMIEntry = UnmaterializedInfos[KV.first];
+ assert((!UMIEntry || !UMIEntry->MU) &&
+ "Replacing symbol with materializer still attached");
+ UMIEntry = UMI;
}
return nullptr;
});
- if (MustRunMU)
- ES.dispatchMaterialization(*this, std::move(MustRunMU));
+ if (MustRunMU) {
+ auto MR =
+ MustRunMU->createMaterializationResponsibility(shared_from_this());
+ ES.dispatchMaterialization(std::move(MustRunMU), std::move(MR));
+ }
}
SymbolNameSet
@@ -895,7 +717,9 @@ JITDylib::getRequestedSymbols(const SymbolFlagsMap &SymbolFlags) const {
for (auto &KV : SymbolFlags) {
assert(Symbols.count(KV.first) && "JITDylib does not cover this symbol?");
- assert(Symbols.find(KV.first)->second.isInMaterializationPhase() &&
+ assert(Symbols.find(KV.first)->second.getState() !=
+ SymbolState::NeverSearched &&
+ Symbols.find(KV.first)->second.getState() != SymbolState::Ready &&
"getRequestedSymbols can only be called for symbols that have "
"started materializing");
auto I = MaterializingInfos.find(KV.first);
@@ -913,9 +737,14 @@ JITDylib::getRequestedSymbols(const SymbolFlagsMap &SymbolFlags) const {
void JITDylib::addDependencies(const SymbolStringPtr &Name,
const SymbolDependenceMap &Dependencies) {
assert(Symbols.count(Name) && "Name not in symbol table");
- assert(Symbols[Name].isInMaterializationPhase() &&
+ assert(Symbols[Name].getState() < SymbolState::Emitted &&
"Can not add dependencies for a symbol that is not materializing");
+ LLVM_DEBUG({
+ dbgs() << "In " << getName() << " adding dependencies for "
+ << *Name << ": " << Dependencies << "\n";
+ });
+
// If Name is already in an error state then just bail out.
if (Symbols[Name].getFlags().hasError())
return;
@@ -938,16 +767,18 @@ void JITDylib::addDependencies(const SymbolStringPtr &Name,
// Check the sym entry for the dependency.
auto OtherSymI = OtherJITDylib.Symbols.find(OtherSymbol);
-#ifndef NDEBUG
// Assert that this symbol exists and has not reached the ready state
// already.
assert(OtherSymI != OtherJITDylib.Symbols.end() &&
- (OtherSymI->second.getState() != SymbolState::Ready &&
- "Dependency on emitted/ready symbol"));
-#endif
+ "Dependency on unknown symbol");
auto &OtherSymEntry = OtherSymI->second;
+ // If the other symbol is already in the Ready state then there's no
+ // dependency to add.
+ if (OtherSymEntry.getState() == SymbolState::Ready)
+ continue;
+
// If the dependency is in an error state then note this and continue,
// we will move this symbol to the error state below.
if (OtherSymEntry.getFlags().hasError()) {
@@ -957,8 +788,6 @@ void JITDylib::addDependencies(const SymbolStringPtr &Name,
// If the dependency was not in the error state then add it to
// our list of dependencies.
- assert(OtherJITDylib.MaterializingInfos.count(OtherSymbol) &&
- "No MaterializingInfo for dependency");
auto &OtherMI = OtherJITDylib.MaterializingInfos[OtherSymbol];
if (OtherSymEntry.getState() == SymbolState::Emitted)
@@ -1039,7 +868,11 @@ Error JITDylib::resolve(const SymbolMap &Resolved) {
SymI->second.setFlags(ResolvedFlags);
SymI->second.setState(SymbolState::Resolved);
- auto &MI = MaterializingInfos[Name];
+ auto MII = MaterializingInfos.find(Name);
+ if (MII == MaterializingInfos.end())
+ continue;
+
+ auto &MI = MII->second;
for (auto &Q : MI.takeQueriesMeeting(SymbolState::Resolved)) {
Q->notifySymbolMetRequiredState(Name, ResolvedSym);
Q->removeQueryDependence(*this, Name);
@@ -1071,6 +904,7 @@ Error JITDylib::resolve(const SymbolMap &Resolved) {
Error JITDylib::emit(const SymbolFlagsMap &Emitted) {
AsynchronousSymbolQuerySet CompletedQueries;
SymbolNameSet SymbolsInErrorState;
+ DenseMap<JITDylib *, SymbolNameVector> ReadySymbols;
ES.runSessionLocked([&, this]() {
std::vector<SymbolTable::iterator> Worklist;
@@ -1101,13 +935,21 @@ Error JITDylib::emit(const SymbolFlagsMap &Emitted) {
auto &SymEntry = SymI->second;
// Move symbol to the emitted state.
- assert(SymEntry.getState() == SymbolState::Resolved &&
+ assert(((SymEntry.getFlags().hasMaterializationSideEffectsOnly() &&
+ SymEntry.getState() == SymbolState::Materializing) ||
+ SymEntry.getState() == SymbolState::Resolved) &&
"Emitting from state other than Resolved");
SymEntry.setState(SymbolState::Emitted);
auto MII = MaterializingInfos.find(Name);
- assert(MII != MaterializingInfos.end() &&
- "Missing MaterializingInfo entry");
+
+ // If this symbol has no MaterializingInfo then it's trivially ready.
+ // Update its state and continue.
+ if (MII == MaterializingInfos.end()) {
+ SymEntry.setState(SymbolState::Ready);
+ continue;
+ }
+
auto &MI = MII->second;
// For each dependant, transfer this node's emitted dependencies to
@@ -1115,6 +957,7 @@ Error JITDylib::emit(const SymbolFlagsMap &Emitted) {
// dependencies) then notify any pending queries.
for (auto &KV : MI.Dependants) {
auto &DependantJD = *KV.first;
+ auto &DependantJDReadySymbols = ReadySymbols[&DependantJD];
for (auto &DependantName : KV.second) {
auto DependantMII =
DependantJD.MaterializingInfos.find(DependantName);
@@ -1154,6 +997,7 @@ Error JITDylib::emit(const SymbolFlagsMap &Emitted) {
// Since this dependant is now ready, we erase its MaterializingInfo
// and update its materializing state.
DependantSymEntry.setState(SymbolState::Ready);
+ DependantJDReadySymbols.push_back(DependantName);
for (auto &Q : DependantMI.takeQueriesMeeting(SymbolState::Ready)) {
Q->notifySymbolMetRequiredState(
@@ -1162,22 +1006,21 @@ Error JITDylib::emit(const SymbolFlagsMap &Emitted) {
CompletedQueries.insert(Q);
Q->removeQueryDependence(DependantJD, DependantName);
}
-
- DependantJD.MaterializingInfos.erase(DependantMII);
}
}
}
+ auto &ThisJDReadySymbols = ReadySymbols[this];
MI.Dependants.clear();
if (MI.UnemittedDependencies.empty()) {
SymI->second.setState(SymbolState::Ready);
+ ThisJDReadySymbols.push_back(Name);
for (auto &Q : MI.takeQueriesMeeting(SymbolState::Ready)) {
Q->notifySymbolMetRequiredState(Name, SymI->second.getSymbol());
if (Q->isComplete())
CompletedQueries.insert(Q);
Q->removeQueryDependence(*this, Name);
}
- MaterializingInfos.erase(MII);
}
}
});
@@ -1317,30 +1160,29 @@ void JITDylib::notifyFailed(FailedSymbolsWorklist Worklist) {
Q->handleFailed(make_error<FailedToMaterialize>(FailedSymbolsMap));
}
-void JITDylib::setSearchOrder(JITDylibSearchOrder NewSearchOrder,
- bool SearchThisJITDylibFirst) {
+void JITDylib::setLinkOrder(JITDylibSearchOrder NewLinkOrder,
+ bool LinkAgainstThisJITDylibFirst) {
ES.runSessionLocked([&]() {
- if (SearchThisJITDylibFirst) {
- SearchOrder.clear();
- if (NewSearchOrder.empty() || NewSearchOrder.front().first != this)
- SearchOrder.push_back(
+ if (LinkAgainstThisJITDylibFirst) {
+ LinkOrder.clear();
+ if (NewLinkOrder.empty() || NewLinkOrder.front().first != this)
+ LinkOrder.push_back(
std::make_pair(this, JITDylibLookupFlags::MatchAllSymbols));
- SearchOrder.insert(SearchOrder.end(), NewSearchOrder.begin(),
- NewSearchOrder.end());
+ LinkOrder.insert(LinkOrder.end(), NewLinkOrder.begin(),
+ NewLinkOrder.end());
} else
- SearchOrder = std::move(NewSearchOrder);
+ LinkOrder = std::move(NewLinkOrder);
});
}
-void JITDylib::addToSearchOrder(JITDylib &JD,
- JITDylibLookupFlags JDLookupFlags) {
- ES.runSessionLocked([&]() { SearchOrder.push_back({&JD, JDLookupFlags}); });
+void JITDylib::addToLinkOrder(JITDylib &JD, JITDylibLookupFlags JDLookupFlags) {
+ ES.runSessionLocked([&]() { LinkOrder.push_back({&JD, JDLookupFlags}); });
}
-void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
- JITDylibLookupFlags JDLookupFlags) {
+void JITDylib::replaceInLinkOrder(JITDylib &OldJD, JITDylib &NewJD,
+ JITDylibLookupFlags JDLookupFlags) {
ES.runSessionLocked([&]() {
- for (auto &KV : SearchOrder)
+ for (auto &KV : LinkOrder)
if (KV.first == &OldJD) {
KV = {&NewJD, JDLookupFlags};
break;
@@ -1348,14 +1190,14 @@ void JITDylib::replaceInSearchOrder(JITDylib &OldJD, JITDylib &NewJD,
});
}
-void JITDylib::removeFromSearchOrder(JITDylib &JD) {
+void JITDylib::removeFromLinkOrder(JITDylib &JD) {
ES.runSessionLocked([&]() {
- auto I = std::find_if(SearchOrder.begin(), SearchOrder.end(),
+ auto I = std::find_if(LinkOrder.begin(), LinkOrder.end(),
[&](const JITDylibSearchOrder::value_type &KV) {
return KV.first == &JD;
});
- if (I != SearchOrder.end())
- SearchOrder.erase(I);
+ if (I != LinkOrder.end())
+ LinkOrder.erase(I);
});
}
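As an editorial aside: the renamed link-order API above is used directly on JITDylibs. A minimal sketch, assuming an ExecutionSession that already owns three JITDylibs; all names here are hypothetical.

#include "llvm/ExecutionEngine/Orc/Core.h"
using namespace llvm::orc;

void configureLinkOrder(JITDylib &MainJD, JITDylib &UtilsJD, JITDylib &ExtraJD) {
  // Link MainJD against itself first, then against UtilsJD's exports.
  MainJD.setLinkOrder(
      {{&UtilsJD, JITDylibLookupFlags::MatchExportedSymbolsOnly}},
      /*LinkAgainstThisJITDylibFirst=*/true);

  // Append another dylib to the link order, then drop it again.
  MainJD.addToLinkOrder(ExtraJD, JITDylibLookupFlags::MatchAllSymbols);
  MainJD.removeFromLinkOrder(ExtraJD);
}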
@@ -1377,7 +1219,8 @@ Error JITDylib::remove(const SymbolNameSet &Names) {
}
// Note symbol materializing.
- if (I->second.isInMaterializationPhase()) {
+ if (I->second.getState() != SymbolState::NeverSearched &&
+ I->second.getState() != SymbolState::Ready) {
Materializing.insert(Name);
continue;
}
@@ -1498,6 +1341,12 @@ Error JITDylib::lodgeQueryImpl(MaterializationUnitList &MUs,
if (SymI == Symbols.end())
return false;
+      // If we match against a materialization-side-effects-only symbol then
+      // make sure it is weakly referenced. Otherwise bail out with an error.
+ if (SymI->second.getFlags().hasMaterializationSideEffectsOnly() &&
+ SymLookupFlags != SymbolLookupFlags::WeaklyReferencedSymbol)
+ return make_error<SymbolsNotFound>(SymbolNameVector({Name}));
+
// If this is a non exported symbol and we're matching exported symbols
// only then skip this symbol without removal.
if (!SymI->second.getFlags().isExported() &&
@@ -1545,7 +1394,8 @@ Error JITDylib::lodgeQueryImpl(MaterializationUnitList &MUs,
// Add the query to the PendingQueries list and continue, deleting the
// element.
- assert(SymI->second.isInMaterializationPhase() &&
+ assert(SymI->second.getState() != SymbolState::NeverSearched &&
+ SymI->second.getState() != SymbolState::Ready &&
"By this line the symbol should be materializing");
auto &MI = MaterializingInfos[Name];
MI.addQuery(Q);
@@ -1601,8 +1451,11 @@ JITDylib::legacyLookup(std::shared_ptr<AsynchronousSymbolQuery> Q,
// Add MUs to the OutstandingMUs list.
{
std::lock_guard<std::recursive_mutex> Lock(ES.OutstandingMUsMutex);
- for (auto &MU : MUs)
- ES.OutstandingMUs.push_back(make_pair(this, std::move(MU)));
+ auto ThisJD = shared_from_this();
+ for (auto &MU : MUs) {
+ auto MR = MU->createMaterializationResponsibility(ThisJD);
+ ES.OutstandingMUs.push_back(make_pair(std::move(MU), std::move(MR)));
+ }
}
ES.runOutstandingMUs();
@@ -1665,7 +1518,8 @@ bool JITDylib::lookupImpl(
}
// Add the query to the PendingQueries list.
- assert(SymI->second.isInMaterializationPhase() &&
+ assert(SymI->second.getState() != SymbolState::NeverSearched &&
+ SymI->second.getState() != SymbolState::Ready &&
"By this line the symbol should be materializing");
auto &MI = MaterializingInfos[Name];
MI.addQuery(Q);
@@ -1680,7 +1534,7 @@ void JITDylib::dump(raw_ostream &OS) {
ES.runSessionLocked([&, this]() {
OS << "JITDylib \"" << JITDylibName << "\" (ES: "
<< format("0x%016" PRIx64, reinterpret_cast<uintptr_t>(&ES)) << "):\n"
- << "Search order: " << SearchOrder << "\n"
+ << "Link order: " << LinkOrder << "\n"
<< "Symbol table:\n";
for (auto &KV : Symbols) {
@@ -1691,14 +1545,14 @@ void JITDylib::dump(raw_ostream &OS) {
else
OS << "<not resolved> ";
- OS << KV.second.getState();
+ OS << KV.second.getFlags() << " " << KV.second.getState();
if (KV.second.hasMaterializerAttached()) {
OS << " (Materializer ";
auto I = UnmaterializedInfos.find(KV.first);
assert(I != UnmaterializedInfos.end() &&
"Lazy symbol should have UnmaterializedInfo");
- OS << I->second->MU.get() << ")\n";
+ OS << I->second->MU.get() << ", " << I->second->MU->getName() << ")\n";
} else
OS << "\n";
}
@@ -1761,10 +1615,13 @@ JITDylib::MaterializingInfo::takeQueriesMeeting(SymbolState RequiredState) {
JITDylib::JITDylib(ExecutionSession &ES, std::string Name)
: ES(ES), JITDylibName(std::move(Name)) {
- SearchOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols});
+ LinkOrder.push_back({this, JITDylibLookupFlags::MatchAllSymbols});
}
Error JITDylib::defineImpl(MaterializationUnit &MU) {
+
+ LLVM_DEBUG({ dbgs() << " " << MU.getSymbols() << "\n"; });
+
SymbolNameSet Duplicates;
std::vector<SymbolStringPtr> ExistingDefsOverridden;
std::vector<SymbolStringPtr> MUDefsOverridden;
@@ -1789,14 +1646,26 @@ Error JITDylib::defineImpl(MaterializationUnit &MU) {
}
// If there were any duplicate definitions then bail out.
- if (!Duplicates.empty())
- return make_error<DuplicateDefinition>(**Duplicates.begin());
+ if (!Duplicates.empty()) {
+ LLVM_DEBUG(
+ { dbgs() << " Error: Duplicate symbols " << Duplicates << "\n"; });
+ return make_error<DuplicateDefinition>(std::string(**Duplicates.begin()));
+ }
// Discard any overridden defs in this MU.
+ LLVM_DEBUG({
+ if (!MUDefsOverridden.empty())
+ dbgs() << " Defs in this MU overridden: " << MUDefsOverridden << "\n";
+ });
for (auto &S : MUDefsOverridden)
MU.doDiscard(*this, S);
// Discard existing overridden defs.
+ LLVM_DEBUG({
+ if (!ExistingDefsOverridden.empty())
+ dbgs() << " Existing defs overridden by this MU: " << MUDefsOverridden
+ << "\n";
+ });
for (auto &S : ExistingDefsOverridden) {
auto UMII = UnmaterializedInfos.find(S);
@@ -1852,6 +1721,57 @@ void JITDylib::transferEmittedNodeDependencies(
}
}
+Platform::~Platform() {}
+
+Expected<DenseMap<JITDylib *, SymbolMap>> Platform::lookupInitSymbols(
+ ExecutionSession &ES,
+ const DenseMap<JITDylib *, SymbolLookupSet> &InitSyms) {
+
+ DenseMap<JITDylib *, SymbolMap> CompoundResult;
+ Error CompoundErr = Error::success();
+ std::mutex LookupMutex;
+ std::condition_variable CV;
+ uint64_t Count = InitSyms.size();
+
+ LLVM_DEBUG({
+ dbgs() << "Issuing init-symbol lookup:\n";
+ for (auto &KV : InitSyms)
+ dbgs() << " " << KV.first->getName() << ": " << KV.second << "\n";
+ });
+
+ for (auto &KV : InitSyms) {
+ auto *JD = KV.first;
+ auto Names = std::move(KV.second);
+ ES.lookup(
+ LookupKind::Static,
+ JITDylibSearchOrder({{JD, JITDylibLookupFlags::MatchAllSymbols}}),
+ std::move(Names), SymbolState::Ready,
+ [&, JD](Expected<SymbolMap> Result) {
+ {
+ std::lock_guard<std::mutex> Lock(LookupMutex);
+ --Count;
+ if (Result) {
+ assert(!CompoundResult.count(JD) &&
+ "Duplicate JITDylib in lookup?");
+ CompoundResult[JD] = std::move(*Result);
+ } else
+ CompoundErr =
+ joinErrors(std::move(CompoundErr), Result.takeError());
+ }
+ CV.notify_one();
+ },
+ NoDependenciesToRegister);
+ }
+
+ std::unique_lock<std::mutex> Lock(LookupMutex);
+ CV.wait(Lock, [&] { return Count == 0 || CompoundErr; });
+
+ if (CompoundErr)
+ return std::move(CompoundErr);
+
+ return std::move(CompoundResult);
+}
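A minimal sketch of calling this new helper from platform code, assuming an ExecutionSession ES and a JITDylib JD; the init-symbol name is hypothetical (real platforms collect these names as modules are added).

#include "llvm/ExecutionEngine/Orc/Core.h"
using namespace llvm;
using namespace llvm::orc;

Error runRegisteredInits(ExecutionSession &ES, JITDylib &JD) {
  DenseMap<JITDylib *, SymbolLookupSet> InitSyms;
  InitSyms[&JD].add(ES.intern("__orc_init_func.example_module"),
                    SymbolLookupFlags::WeaklyReferencedSymbol);

  // Blocks until every requested symbol is Ready (or an error is reported).
  auto InitAddrs = Platform::lookupInitSymbols(ES, InitSyms);
  if (!InitAddrs)
    return InitAddrs.takeError();

  // Each found init symbol maps to its JITEvaluatedSymbol; call them in turn.
  for (auto &KV : (*InitAddrs)[&JD])
    jitTargetAddressToFunction<void (*)()>(KV.second.getAddress())();
  return Error::success();
}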
+
ExecutionSession::ExecutionSession(std::shared_ptr<SymbolStringPool> SSP)
: SSP(SSP ? std::move(SSP) : std::make_shared<SymbolStringPool>()) {
}
@@ -1865,15 +1785,23 @@ JITDylib *ExecutionSession::getJITDylibByName(StringRef Name) {
});
}
-JITDylib &ExecutionSession::createJITDylib(std::string Name) {
+JITDylib &ExecutionSession::createBareJITDylib(std::string Name) {
assert(!getJITDylibByName(Name) && "JITDylib with that name already exists");
return runSessionLocked([&, this]() -> JITDylib & {
JDs.push_back(
- std::unique_ptr<JITDylib>(new JITDylib(*this, std::move(Name))));
+ std::shared_ptr<JITDylib>(new JITDylib(*this, std::move(Name))));
return *JDs.back();
});
}
+Expected<JITDylib &> ExecutionSession::createJITDylib(std::string Name) {
+ auto &JD = createBareJITDylib(Name);
+ if (P)
+ if (auto Err = P->setupJITDylib(JD))
+ return std::move(Err);
+ return JD;
+}
+
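A minimal sketch of the resulting API split, assuming an existing ExecutionSession ES; the dylib names are hypothetical.

#include "llvm/ExecutionEngine/Orc/Core.h"
using namespace llvm;
using namespace llvm::orc;

Error makeDylibs(ExecutionSession &ES) {
  // createJITDylib may now fail, because it runs Platform::setupJITDylib
  // when a platform is installed on the session.
  auto AppJD = ES.createJITDylib("libApp");
  if (!AppJD)
    return AppJD.takeError();

  // createBareJITDylib preserves the old behaviour: no platform setup.
  ES.createBareJITDylib("libScratch");
  return Error::success();
}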
void ExecutionSession::legacyFailQuery(AsynchronousSymbolQuery &Q, Error Err) {
assert(!!Err && "Error should be in failure state");
@@ -2050,9 +1978,13 @@ void ExecutionSession::lookup(
{
std::lock_guard<std::recursive_mutex> Lock(OutstandingMUsMutex);
- for (auto &KV : CollectedMUsMap)
- for (auto &MU : KV.second)
- OutstandingMUs.push_back(std::make_pair(KV.first, std::move(MU)));
+ for (auto &KV : CollectedMUsMap) {
+ auto JD = KV.first->shared_from_this();
+ for (auto &MU : KV.second) {
+ auto MR = MU->createMaterializationResponsibility(JD);
+ OutstandingMUs.push_back(std::make_pair(std::move(MU), std::move(MR)));
+ }
+ }
}
runOutstandingMUs();
@@ -2114,11 +2046,11 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
Expected<JITEvaluatedSymbol>
ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
- SymbolStringPtr Name) {
+ SymbolStringPtr Name, SymbolState RequiredState) {
SymbolLookupSet Names({Name});
if (auto ResultMap = lookup(SearchOrder, std::move(Names), LookupKind::Static,
- SymbolState::Ready, NoDependenciesToRegister)) {
+ RequiredState, NoDependenciesToRegister)) {
assert(ResultMap->size() == 1 && "Unexpected number of results");
assert(ResultMap->count(Name) && "Missing result for symbol");
return std::move(ResultMap->begin()->second);
@@ -2127,14 +2059,15 @@ ExecutionSession::lookup(const JITDylibSearchOrder &SearchOrder,
}
Expected<JITEvaluatedSymbol>
-ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder,
- SymbolStringPtr Name) {
- return lookup(makeJITDylibSearchOrder(SearchOrder), Name);
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, SymbolStringPtr Name,
+ SymbolState RequiredState) {
+ return lookup(makeJITDylibSearchOrder(SearchOrder), Name, RequiredState);
}
Expected<JITEvaluatedSymbol>
-ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, StringRef Name) {
- return lookup(SearchOrder, intern(Name));
+ExecutionSession::lookup(ArrayRef<JITDylib *> SearchOrder, StringRef Name,
+ SymbolState RequiredState) {
+ return lookup(SearchOrder, intern(Name), RequiredState);
}
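A minimal sketch of the new RequiredState parameter, assuming an ExecutionSession ES and a JITDylib JD; "main" is a hypothetical symbol name.

#include "llvm/ExecutionEngine/Orc/Core.h"
using namespace llvm;
using namespace llvm::orc;

Expected<JITTargetAddress> lookupResolvedAddr(ExecutionSession &ES,
                                              JITDylib &JD) {
  // Waiting only for Resolved returns as soon as the address is known,
  // before the symbol's dependencies have all been emitted.
  auto Sym = ES.lookup({&JD}, "main", SymbolState::Resolved);
  if (!Sym)
    return Sym.takeError();
  return Sym->getAddress();
}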
void ExecutionSession::dump(raw_ostream &OS) {
@@ -2146,36 +2079,33 @@ void ExecutionSession::dump(raw_ostream &OS) {
void ExecutionSession::runOutstandingMUs() {
while (1) {
- std::pair<JITDylib *, std::unique_ptr<MaterializationUnit>> JITDylibAndMU;
+ Optional<std::pair<std::unique_ptr<MaterializationUnit>,
+ MaterializationResponsibility>>
+ JMU;
{
std::lock_guard<std::recursive_mutex> Lock(OutstandingMUsMutex);
if (!OutstandingMUs.empty()) {
- JITDylibAndMU = std::move(OutstandingMUs.back());
+ JMU.emplace(std::move(OutstandingMUs.back()));
OutstandingMUs.pop_back();
}
}
- if (JITDylibAndMU.first) {
- assert(JITDylibAndMU.second && "JITDylib, but no MU?");
- dispatchMaterialization(*JITDylibAndMU.first,
- std::move(JITDylibAndMU.second));
- } else
+ if (!JMU)
break;
+
+ assert(JMU->first && "No MU?");
+ dispatchMaterialization(std::move(JMU->first), std::move(JMU->second));
}
}
-MangleAndInterner::MangleAndInterner(ExecutionSession &ES, const DataLayout &DL)
- : ES(ES), DL(DL) {}
-
-SymbolStringPtr MangleAndInterner::operator()(StringRef Name) {
- std::string MangledName;
- {
- raw_string_ostream MangledNameStream(MangledName);
- Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
- }
- return ES.intern(MangledName);
+#ifndef NDEBUG
+void ExecutionSession::dumpDispatchInfo(JITDylib &JD, MaterializationUnit &MU) {
+ runSessionLocked([&]() {
+ dbgs() << "Dispatching " << MU << " for " << JD.getName() << "\n";
+ });
}
+#endif // NDEBUG
} // End namespace orc.
} // End namespace llvm.
diff --git a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp
index c9e87ff737fc..6247158919fa 100644
--- a/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/DebugUtils.cpp
@@ -7,16 +7,297 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
+
+#include "llvm/ExecutionEngine/Orc/Core.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/Format.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"
#define DEBUG_TYPE "orc"
+using namespace llvm;
+
+namespace {
+
+#ifndef NDEBUG
+
+cl::opt<bool> PrintHidden("debug-orc-print-hidden", cl::init(true),
+ cl::desc("debug print hidden symbols defined by "
+ "materialization units"),
+ cl::Hidden);
+
+cl::opt<bool> PrintCallable("debug-orc-print-callable", cl::init(true),
+ cl::desc("debug print callable symbols defined by "
+ "materialization units"),
+ cl::Hidden);
+
+cl::opt<bool> PrintData("debug-orc-print-data", cl::init(true),
+ cl::desc("debug print data symbols defined by "
+ "materialization units"),
+ cl::Hidden);
+
+#endif // NDEBUG
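These options only exist in assertions-enabled builds; a minimal sketch of a JIT host forwarding them, together with -debug-only=orc, through the standard command-line machinery. The argv values are just an example.

#include "llvm/Support/CommandLine.h"

void enableOrcSymbolDumping() {
  const char *Args[] = {"jit-host", "-debug-only=orc",
                        "-debug-orc-print-hidden=false"};
  llvm::cl::ParseCommandLineOptions(3, Args);
}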
+
+// SequencePrinter predicate that prints every element.
+template <typename T> struct PrintAll {
+ bool operator()(const T &E) { return true; }
+};
+
+bool anyPrintSymbolOptionSet() {
+#ifndef NDEBUG
+ return PrintHidden || PrintCallable || PrintData;
+#else
+ return false;
+#endif // NDEBUG
+}
+
+bool flagsMatchCLOpts(const JITSymbolFlags &Flags) {
+#ifndef NDEBUG
+ // Bail out early if this is a hidden symbol and we're not printing hiddens.
+ if (!PrintHidden && !Flags.isExported())
+ return false;
+
+ // Return true if this is callable and we're printing callables.
+ if (PrintCallable && Flags.isCallable())
+ return true;
+
+ // Return true if this is data and we're printing data.
+ if (PrintData && !Flags.isCallable())
+ return true;
+
+  // Otherwise, return false.
+ return false;
+#else
+ return false;
+#endif // NDEBUG
+}
+
+// Prints a sequence of items, filtered by a user-supplied predicate.
+template <typename Sequence,
+ typename Pred = PrintAll<typename Sequence::value_type>>
+class SequencePrinter {
+public:
+ SequencePrinter(const Sequence &S, char OpenSeq, char CloseSeq,
+ Pred ShouldPrint = Pred())
+ : S(S), OpenSeq(OpenSeq), CloseSeq(CloseSeq),
+ ShouldPrint(std::move(ShouldPrint)) {}
+
+ void printTo(llvm::raw_ostream &OS) const {
+ bool PrintComma = false;
+ OS << OpenSeq;
+ for (auto &E : S) {
+ if (ShouldPrint(E)) {
+ if (PrintComma)
+ OS << ',';
+ OS << ' ' << E;
+ PrintComma = true;
+ }
+ }
+ OS << ' ' << CloseSeq;
+ }
+
+private:
+ const Sequence &S;
+ char OpenSeq;
+ char CloseSeq;
+ mutable Pred ShouldPrint;
+};
+
+template <typename Sequence, typename Pred>
+SequencePrinter<Sequence, Pred> printSequence(const Sequence &S, char OpenSeq,
+ char CloseSeq, Pred P = Pred()) {
+ return SequencePrinter<Sequence, Pred>(S, OpenSeq, CloseSeq, std::move(P));
+}
+
+// Render a SequencePrinter by delegating to its printTo method.
+template <typename Sequence, typename Pred>
+llvm::raw_ostream &operator<<(llvm::raw_ostream &OS,
+ const SequencePrinter<Sequence, Pred> &Printer) {
+ Printer.printTo(OS);
+ return OS;
+}
+
+struct PrintSymbolFlagsMapElemsMatchingCLOpts {
+ bool operator()(const orc::SymbolFlagsMap::value_type &KV) {
+ return flagsMatchCLOpts(KV.second);
+ }
+};
+
+struct PrintSymbolMapElemsMatchingCLOpts {
+ bool operator()(const orc::SymbolMap::value_type &KV) {
+ return flagsMatchCLOpts(KV.second.getFlags());
+ }
+};
+
+} // end anonymous namespace
+
namespace llvm {
namespace orc {
+raw_ostream &operator<<(raw_ostream &OS, const SymbolStringPtr &Sym) {
+ return OS << *Sym;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolNameSet &Symbols) {
+ return OS << printSequence(Symbols, '{', '}', PrintAll<SymbolStringPtr>());
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolNameVector &Symbols) {
+ return OS << printSequence(Symbols, '[', ']', PrintAll<SymbolStringPtr>());
+}
+
+raw_ostream &operator<<(raw_ostream &OS, ArrayRef<SymbolStringPtr> Symbols) {
+ return OS << printSequence(Symbols, '[', ']', PrintAll<SymbolStringPtr>());
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const JITSymbolFlags &Flags) {
+ if (Flags.hasError())
+ OS << "[*ERROR*]";
+ if (Flags.isCallable())
+ OS << "[Callable]";
+ else
+ OS << "[Data]";
+ if (Flags.isWeak())
+ OS << "[Weak]";
+ else if (Flags.isCommon())
+ OS << "[Common]";
+
+ if (!Flags.isExported())
+ OS << "[Hidden]";
+
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const JITEvaluatedSymbol &Sym) {
+ return OS << format("0x%016" PRIx64, Sym.getAddress()) << " "
+ << Sym.getFlags();
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap::value_type &KV) {
+ return OS << "(\"" << KV.first << "\", " << KV.second << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap::value_type &KV) {
+ return OS << "(\"" << KV.first << "\": " << KV.second << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolFlagsMap &SymbolFlags) {
+ return OS << printSequence(SymbolFlags, '{', '}',
+ PrintSymbolFlagsMapElemsMatchingCLOpts());
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolMap &Symbols) {
+ return OS << printSequence(Symbols, '{', '}',
+ PrintSymbolMapElemsMatchingCLOpts());
+}
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const SymbolDependenceMap::value_type &KV) {
+ return OS << "(" << KV.first->getName() << ", " << KV.second << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolDependenceMap &Deps) {
+ return OS << printSequence(Deps, '{', '}',
+ PrintAll<SymbolDependenceMap::value_type>());
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const MaterializationUnit &MU) {
+ OS << "MU@" << &MU << " (\"" << MU.getName() << "\"";
+ if (anyPrintSymbolOptionSet())
+ OS << ", " << MU.getSymbols();
+ return OS << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const LookupKind &K) {
+ switch (K) {
+ case LookupKind::Static:
+ return OS << "Static";
+ case LookupKind::DLSym:
+ return OS << "DLSym";
+ }
+ llvm_unreachable("Invalid lookup kind");
+}
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const JITDylibLookupFlags &JDLookupFlags) {
+ switch (JDLookupFlags) {
+ case JITDylibLookupFlags::MatchExportedSymbolsOnly:
+ return OS << "MatchExportedSymbolsOnly";
+ case JITDylibLookupFlags::MatchAllSymbols:
+ return OS << "MatchAllSymbols";
+ }
+ llvm_unreachable("Invalid JITDylib lookup flags");
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupFlags &LookupFlags) {
+ switch (LookupFlags) {
+ case SymbolLookupFlags::RequiredSymbol:
+ return OS << "RequiredSymbol";
+ case SymbolLookupFlags::WeaklyReferencedSymbol:
+ return OS << "WeaklyReferencedSymbol";
+ }
+ llvm_unreachable("Invalid symbol lookup flags");
+}
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const SymbolLookupSet::value_type &KV) {
+ return OS << "(" << KV.first << ", " << KV.second << ")";
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolLookupSet &LookupSet) {
+ return OS << printSequence(LookupSet, '{', '}',
+ PrintAll<SymbolLookupSet::value_type>());
+}
+
+raw_ostream &operator<<(raw_ostream &OS,
+ const JITDylibSearchOrder &SearchOrder) {
+ OS << "[";
+ if (!SearchOrder.empty()) {
+ assert(SearchOrder.front().first &&
+ "JITDylibList entries must not be null");
+ OS << " (\"" << SearchOrder.front().first->getName() << "\", "
+ << SearchOrder.begin()->second << ")";
+ for (auto &KV :
+ make_range(std::next(SearchOrder.begin(), 1), SearchOrder.end())) {
+ assert(KV.first && "JITDylibList entries must not be null");
+ OS << ", (\"" << KV.first->getName() << "\", " << KV.second << ")";
+ }
+ }
+ OS << " ]";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolAliasMap &Aliases) {
+ OS << "{";
+ for (auto &KV : Aliases)
+ OS << " " << *KV.first << ": " << KV.second.Aliasee << " "
+ << KV.second.AliasFlags;
+ OS << " }";
+ return OS;
+}
+
+raw_ostream &operator<<(raw_ostream &OS, const SymbolState &S) {
+ switch (S) {
+ case SymbolState::Invalid:
+ return OS << "Invalid";
+ case SymbolState::NeverSearched:
+ return OS << "Never-Searched";
+ case SymbolState::Materializing:
+ return OS << "Materializing";
+ case SymbolState::Resolved:
+ return OS << "Resolved";
+ case SymbolState::Emitted:
+ return OS << "Emitted";
+ case SymbolState::Ready:
+ return OS << "Ready";
+ }
+ llvm_unreachable("Invalid state");
+}
+
DumpObjects::DumpObjects(std::string DumpDir, std::string IdentifierOverride)
: DumpDir(std::move(DumpDir)),
IdentifierOverride(std::move(IdentifierOverride)) {
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index 3d97fe9eeab1..4d255cd66c1b 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -13,6 +13,8 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Module.h"
+#include "llvm/Object/MachOUniversal.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Target/TargetMachine.h"
@@ -113,6 +115,26 @@ iterator_range<CtorDtorIterator> getDestructors(const Module &M) {
CtorDtorIterator(DtorsList, true));
}
+bool StaticInitGVIterator::isStaticInitGlobal(GlobalValue &GV) {
+ if (GV.isDeclaration())
+ return false;
+
+ if (GV.hasName() && (GV.getName() == "llvm.global_ctors" ||
+ GV.getName() == "llvm.global_dtors"))
+ return true;
+
+ if (ObjFmt == Triple::MachO) {
+    // FIXME: These section checks are too strict: we should match the first
+    // and second words split by the comma.
+ if (GV.hasSection() &&
+ (GV.getSection().startswith("__DATA,__objc_classlist") ||
+ GV.getSection().startswith("__DATA,__objc_selrefs")))
+ return true;
+ }
+
+ return false;
+}
+
void CtorDtorRunner::add(iterator_range<CtorDtorIterator> CtorDtors) {
if (CtorDtors.empty())
return;
@@ -198,6 +220,30 @@ Error LocalCXXRuntimeOverrides::enable(JITDylib &JD,
return JD.define(absoluteSymbols(std::move(RuntimeInterposes)));
}
+void ItaniumCXAAtExitSupport::registerAtExit(void (*F)(void *), void *Ctx,
+ void *DSOHandle) {
+ std::lock_guard<std::mutex> Lock(AtExitsMutex);
+ AtExitRecords[DSOHandle].push_back({F, Ctx});
+}
+
+void ItaniumCXAAtExitSupport::runAtExits(void *DSOHandle) {
+ std::vector<AtExitRecord> AtExitsToRun;
+
+ {
+ std::lock_guard<std::mutex> Lock(AtExitsMutex);
+ auto I = AtExitRecords.find(DSOHandle);
+ if (I != AtExitRecords.end()) {
+ AtExitsToRun = std::move(I->second);
+ AtExitRecords.erase(I);
+ }
+ }
+
+ while (!AtExitsToRun.empty()) {
+ AtExitsToRun.back().F(AtExitsToRun.back().Ctx);
+ AtExitsToRun.pop_back();
+ }
+}
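A minimal sketch of direct use (in practice the JIT'd __cxa_atexit wrapper calls registerAtExit and a platform calls runAtExits on teardown); the context and handle values here are placeholders.

#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
using namespace llvm::orc;

void atExitDemo() {
  ItaniumCXAAtExitSupport AtExits;
  static int Token = 0;      // stands in for a per-dylib __dso_handle
  AtExits.registerAtExit([](void *Ctx) { *static_cast<int *>(Ctx) = 1; },
                         &Token, /*DSOHandle=*/&Token);
  AtExits.runAtExits(&Token); // runs the callbacks LIFO, then forgets them
}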
+
DynamicLibrarySearchGenerator::DynamicLibrarySearchGenerator(
sys::DynamicLibrary Dylib, char GlobalPrefix, SymbolPredicate Allow)
: Dylib(std::move(Dylib)), Allow(std::move(Allow)),
@@ -259,6 +305,51 @@ StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName) {
}
Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
+StaticLibraryDefinitionGenerator::Load(ObjectLayer &L, const char *FileName,
+ const Triple &TT) {
+ auto B = object::createBinary(FileName);
+ if (!B)
+ return B.takeError();
+
+ // If this is a regular archive then create an instance from it.
+ if (isa<object::Archive>(B->getBinary()))
+ return Create(L, std::move(B->takeBinary().second));
+
+ // If this is a universal binary then search for a slice matching the given
+ // Triple.
+  if (auto *UB = dyn_cast<object::MachOUniversalBinary>(B->getBinary())) {
+ for (const auto &Obj : UB->objects()) {
+ auto ObjTT = Obj.getTriple();
+ if (ObjTT.getArch() == TT.getArch() &&
+ ObjTT.getSubArch() == TT.getSubArch() &&
+ ObjTT.getVendor() == TT.getVendor()) {
+ // We found a match. Create an instance from a buffer covering this
+ // slice.
+ auto SliceBuffer = MemoryBuffer::getFileSlice(FileName, Obj.getSize(),
+ Obj.getOffset());
+ if (!SliceBuffer)
+ return make_error<StringError>(
+ Twine("Could not create buffer for ") + TT.str() + " slice of " +
+ FileName + ": [ " + formatv("{0:x}", Obj.getOffset()) +
+ " .. " + formatv("{0:x}", Obj.getOffset() + Obj.getSize()) +
+ ": " + SliceBuffer.getError().message(),
+ SliceBuffer.getError());
+ return Create(L, std::move(*SliceBuffer));
+ }
+ }
+
+ return make_error<StringError>(Twine("Universal binary ") + FileName +
+ " does not contain a slice for " +
+ TT.str(),
+ inconvertibleErrorCode());
+ }
+
+ return make_error<StringError>(Twine("Unrecognized file type for ") +
+ FileName,
+ inconvertibleErrorCode());
+}
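A minimal sketch of the new overload, assuming an ObjectLayer ObjLayer, a JITDylib JD, and a target triple TT; the archive path is hypothetical. On Darwin this accepts universal (fat) archives and picks the slice matching TT.

#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
using namespace llvm;
using namespace llvm::orc;

Error addArchive(ObjectLayer &ObjLayer, JITDylib &JD, const Triple &TT) {
  auto Gen = StaticLibraryDefinitionGenerator::Load(
      ObjLayer, "/path/to/libExample.a", TT);
  if (!Gen)
    return Gen.takeError();
  JD.addGenerator(std::move(*Gen)); // archive members now materialize on demand
  return Error::success();
}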
+
+Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
StaticLibraryDefinitionGenerator::Create(
ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer) {
Error Err = Error::success();
@@ -305,8 +396,8 @@ Error StaticLibraryDefinitionGenerator::tryToGenerate(
MemoryBufferRef ChildBufferRef(ChildBufferInfo.first,
ChildBufferInfo.second);
- if (auto Err =
- L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef), VModuleKey()))
+ if (auto Err = L.add(JD, MemoryBuffer::getMemBuffer(ChildBufferRef, false),
+ VModuleKey()))
return Err;
}
diff --git a/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
index d311f34179c7..023940dc8298 100644
--- a/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IRCompileLayer.cpp
@@ -11,9 +11,14 @@
namespace llvm {
namespace orc {
+IRCompileLayer::IRCompiler::~IRCompiler() {}
+
IRCompileLayer::IRCompileLayer(ExecutionSession &ES, ObjectLayer &BaseLayer,
- CompileFunction Compile)
- : IRLayer(ES), BaseLayer(BaseLayer), Compile(std::move(Compile)) {}
+ std::unique_ptr<IRCompiler> Compile)
+ : IRLayer(ES, ManglingOpts), BaseLayer(BaseLayer),
+ Compile(std::move(Compile)) {
+ ManglingOpts = &this->Compile->getManglingOptions();
+}
void IRCompileLayer::setNotifyCompiled(NotifyCompiledFunction NotifyCompiled) {
std::lock_guard<std::mutex> Lock(IRLayerMutex);
@@ -24,7 +29,7 @@ void IRCompileLayer::emit(MaterializationResponsibility R,
ThreadSafeModule TSM) {
assert(TSM && "Module must not be null");
- if (auto Obj = TSM.withModuleDo(Compile)) {
+ if (auto Obj = TSM.withModuleDo(*Compile)) {
{
std::lock_guard<std::mutex> Lock(IRLayerMutex);
if (NotifyCompiled)
diff --git a/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp b/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
index 845ecc71eb87..511248f83b25 100644
--- a/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IRTransformLayer.cpp
@@ -12,10 +12,10 @@
namespace llvm {
namespace orc {
-IRTransformLayer::IRTransformLayer(ExecutionSession &ES,
- IRLayer &BaseLayer,
- TransformFunction Transform)
- : IRLayer(ES), BaseLayer(BaseLayer), Transform(std::move(Transform)) {}
+IRTransformLayer::IRTransformLayer(ExecutionSession &ES, IRLayer &BaseLayer,
+ TransformFunction Transform)
+ : IRLayer(ES, BaseLayer.getManglingOptions()), BaseLayer(BaseLayer),
+ Transform(std::move(Transform)) {}
void IRTransformLayer::emit(MaterializationResponsibility R,
ThreadSafeModule TSM) {
diff --git a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
index 1ac9a58aeaef..031b1afefc9d 100644
--- a/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/IndirectionUtils.cpp
@@ -10,7 +10,6 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/Format.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -28,7 +27,7 @@ public:
CompileCallbackMaterializationUnit(SymbolStringPtr Name,
CompileFunction Compile, VModuleKey K)
: MaterializationUnit(SymbolFlagsMap({{Name, JITSymbolFlags::Exported}}),
- std::move(K)),
+ nullptr, std::move(K)),
Name(std::move(Name)), Compile(std::move(Compile)) {}
StringRef getName() const override { return "<Compile Callbacks>"; }
diff --git a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
index 114e81e41771..8cf66c9e759a 100644
--- a/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/JITTargetMachineBuilder.cpp
@@ -10,6 +10,7 @@
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace orc {
@@ -33,7 +34,7 @@ Expected<JITTargetMachineBuilder> JITTargetMachineBuilder::detectHost() {
for (auto &Feature : FeatureMap)
TMBuilder.getFeatures().AddFeature(Feature.first(), Feature.second);
- TMBuilder.setCPU(llvm::sys::getHostCPUName());
+ TMBuilder.setCPU(std::string(llvm::sys::getHostCPUName()));
return TMBuilder;
}
@@ -63,5 +64,78 @@ JITTargetMachineBuilder &JITTargetMachineBuilder::addFeatures(
return *this;
}
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const JITTargetMachineBuilder &JTMB) {
+ OS << "{ Triple = \"" << JTMB.TT.str() << "\", CPU = \"" << JTMB.CPU
+ << "\", Options = <not-printable>, Relocation Model = ";
+
+ if (JTMB.RM) {
+ switch (*JTMB.RM) {
+ case Reloc::Static:
+ OS << "Static";
+ break;
+ case Reloc::PIC_:
+ OS << "PIC_";
+ break;
+ case Reloc::DynamicNoPIC:
+ OS << "DynamicNoPIC";
+ break;
+ case Reloc::ROPI:
+ OS << "ROPI";
+ break;
+ case Reloc::RWPI:
+ OS << "RWPI";
+ break;
+ case Reloc::ROPI_RWPI:
+ OS << "ROPI_RWPI";
+ break;
+ }
+ } else
+ OS << "unspecified";
+
+ OS << ", Code Model = ";
+
+ if (JTMB.CM) {
+ switch (*JTMB.CM) {
+ case CodeModel::Tiny:
+ OS << "Tiny";
+ break;
+ case CodeModel::Small:
+ OS << "Small";
+ break;
+ case CodeModel::Kernel:
+ OS << "Kernel";
+ break;
+ case CodeModel::Medium:
+ OS << "Medium";
+ break;
+ case CodeModel::Large:
+ OS << "Large";
+ break;
+ }
+ } else
+ OS << "unspecified";
+
+ OS << ", Optimization Level = ";
+ switch (JTMB.OptLevel) {
+ case CodeGenOpt::None:
+ OS << "None";
+ break;
+ case CodeGenOpt::Less:
+ OS << "Less";
+ break;
+ case CodeGenOpt::Default:
+ OS << "Default";
+ break;
+ case CodeGenOpt::Aggressive:
+ OS << "Aggressive";
+ break;
+ }
+
+ OS << " }";
+ return OS;
+}
+#endif // NDEBUG
+
} // End namespace orc.
} // End namespace llvm.
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 54473ab46423..713a48fbf3eb 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -7,30 +7,965 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+#include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
#include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
+#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
#include "llvm/ExecutionEngine/Orc/ObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/Orc/OrcError.h"
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
#include "llvm/ExecutionEngine/SectionMemoryManager.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/DynamicLibrary.h"
+
+#include <map>
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace {
+
+/// Adds helper function decls and wrapper functions that call the helper with
+/// some additional prefix arguments.
+///
+/// E.g. For wrapper "foo" with type i8(i8, i64), helper "bar", and prefix
+/// args i32 4 and i16 12345, this function will add:
+///
+/// declare i8 @bar(i32, i16, i8, i64)
+///
+/// define i8 @foo(i8, i64) {
+/// entry:
+/// %2 = call i8 @bar(i32 4, i16 12345, i8 %0, i64 %1)
+/// ret i8 %2
+/// }
+///
+Function *addHelperAndWrapper(Module &M, StringRef WrapperName,
+ FunctionType *WrapperFnType,
+ GlobalValue::VisibilityTypes WrapperVisibility,
+ StringRef HelperName,
+ ArrayRef<Value *> HelperPrefixArgs) {
+ std::vector<Type *> HelperArgTypes;
+ for (auto *Arg : HelperPrefixArgs)
+ HelperArgTypes.push_back(Arg->getType());
+ for (auto *T : WrapperFnType->params())
+ HelperArgTypes.push_back(T);
+ auto *HelperFnType =
+ FunctionType::get(WrapperFnType->getReturnType(), HelperArgTypes, false);
+ auto *HelperFn = Function::Create(HelperFnType, GlobalValue::ExternalLinkage,
+ HelperName, M);
+
+ auto *WrapperFn = Function::Create(
+ WrapperFnType, GlobalValue::ExternalLinkage, WrapperName, M);
+ WrapperFn->setVisibility(WrapperVisibility);
+
+ auto *EntryBlock = BasicBlock::Create(M.getContext(), "entry", WrapperFn);
+ IRBuilder<> IB(EntryBlock);
+
+ std::vector<Value *> HelperArgs;
+ for (auto *Arg : HelperPrefixArgs)
+ HelperArgs.push_back(Arg);
+ for (auto &Arg : WrapperFn->args())
+ HelperArgs.push_back(&Arg);
+ auto *HelperResult = IB.CreateCall(HelperFn, HelperArgs);
+ if (HelperFn->getReturnType()->isVoidTy())
+ IB.CreateRetVoid();
+ else
+ IB.CreateRet(HelperResult);
+
+ return WrapperFn;
+}
+
+class GenericLLVMIRPlatformSupport;
+
+/// orc::Platform component of Generic LLVM IR Platform support.
+/// Just forwards calls to the GenericLLVMIRPlatformSupport class below.
+class GenericLLVMIRPlatform : public Platform {
+public:
+ GenericLLVMIRPlatform(GenericLLVMIRPlatformSupport &S) : S(S) {}
+ Error setupJITDylib(JITDylib &JD) override;
+ Error notifyAdding(JITDylib &JD, const MaterializationUnit &MU) override;
+ Error notifyRemoving(JITDylib &JD, VModuleKey K) override {
+ // Noop -- Nothing to do (yet).
+ return Error::success();
+ }
+
+private:
+ GenericLLVMIRPlatformSupport &S;
+};
+
+/// This transform parses llvm.global_ctors to produce a single initialization
+/// function for the module, records the function, then deletes
+/// llvm.global_ctors.
+class GlobalCtorDtorScraper {
+public:
+
+ GlobalCtorDtorScraper(GenericLLVMIRPlatformSupport &PS,
+ StringRef InitFunctionPrefix)
+ : PS(PS), InitFunctionPrefix(InitFunctionPrefix) {}
+ Expected<ThreadSafeModule> operator()(ThreadSafeModule TSM,
+ MaterializationResponsibility &R);
+
+private:
+ GenericLLVMIRPlatformSupport &PS;
+ StringRef InitFunctionPrefix;
+};
+
+/// Generic IR Platform Support
+///
+/// Scrapes llvm.global_ctors and llvm.global_dtors and replaces them with
+/// specially named 'init' and 'deinit' functions. Injects definitions /
+/// interposes for some runtime API, including __cxa_atexit, dlopen, and
+/// dlclose.
+class GenericLLVMIRPlatformSupport : public LLJIT::PlatformSupport {
+public:
+ GenericLLVMIRPlatformSupport(LLJIT &J)
+ : J(J), InitFunctionPrefix(J.mangle("__orc_init_func.")) {
+
+ getExecutionSession().setPlatform(
+ std::make_unique<GenericLLVMIRPlatform>(*this));
+
+ setInitTransform(J, GlobalCtorDtorScraper(*this, InitFunctionPrefix));
+
+ SymbolMap StdInterposes;
+
+ StdInterposes[J.mangleAndIntern("__lljit.platform_support_instance")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(this),
+ JITSymbolFlags::Exported);
+ StdInterposes[J.mangleAndIntern("__lljit.cxa_atexit_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(registerAtExitHelper),
+ JITSymbolFlags());
+
+ cantFail(
+ J.getMainJITDylib().define(absoluteSymbols(std::move(StdInterposes))));
+ cantFail(setupJITDylib(J.getMainJITDylib()));
+ cantFail(J.addIRModule(J.getMainJITDylib(), createPlatformRuntimeModule()));
+ }
+
+ ExecutionSession &getExecutionSession() { return J.getExecutionSession(); }
+
+ /// Adds a module that defines the __dso_handle global.
+ Error setupJITDylib(JITDylib &JD) {
+
+ // Add per-jitdylib standard interposes.
+ SymbolMap PerJDInterposes;
+ PerJDInterposes[J.mangleAndIntern("__lljit.run_atexits_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(runAtExitsHelper),
+ JITSymbolFlags());
+ cantFail(JD.define(absoluteSymbols(std::move(PerJDInterposes))));
+
+ auto Ctx = std::make_unique<LLVMContext>();
+ auto M = std::make_unique<Module>("__standard_lib", *Ctx);
+ M->setDataLayout(J.getDataLayout());
+
+ auto *Int64Ty = Type::getInt64Ty(*Ctx);
+ auto *DSOHandle = new GlobalVariable(
+ *M, Int64Ty, true, GlobalValue::ExternalLinkage,
+ ConstantInt::get(Int64Ty, reinterpret_cast<uintptr_t>(&JD)),
+ "__dso_handle");
+ DSOHandle->setVisibility(GlobalValue::DefaultVisibility);
+ DSOHandle->setInitializer(
+ ConstantInt::get(Int64Ty, pointerToJITTargetAddress(&JD)));
+
+ auto *GenericIRPlatformSupportTy =
+ StructType::create(*Ctx, "lljit.GenericLLJITIRPlatformSupport");
+
+ auto *PlatformInstanceDecl = new GlobalVariable(
+ *M, GenericIRPlatformSupportTy, true, GlobalValue::ExternalLinkage,
+ nullptr, "__lljit.platform_support_instance");
+
+ auto *VoidTy = Type::getVoidTy(*Ctx);
+ addHelperAndWrapper(
+ *M, "__lljit_run_atexits", FunctionType::get(VoidTy, {}, false),
+ GlobalValue::HiddenVisibility, "__lljit.run_atexits_helper",
+ {PlatformInstanceDecl, DSOHandle});
+
+ return J.addIRModule(JD, ThreadSafeModule(std::move(M), std::move(Ctx)));
+ }
+
+ Error notifyAdding(JITDylib &JD, const MaterializationUnit &MU) {
+ if (auto &InitSym = MU.getInitializerSymbol())
+ InitSymbols[&JD].add(InitSym, SymbolLookupFlags::WeaklyReferencedSymbol);
+ else {
+ // If there's no identified init symbol attached, but there is a symbol
+ // with the GenericIRPlatform::InitFunctionPrefix, then treat that as
+ // an init function. Add the symbol to both the InitSymbols map (which
+ // will trigger a lookup to materialize the module) and the InitFunctions
+ // map (which holds the names of the symbols to execute).
+ for (auto &KV : MU.getSymbols())
+ if ((*KV.first).startswith(InitFunctionPrefix)) {
+ InitSymbols[&JD].add(KV.first,
+ SymbolLookupFlags::WeaklyReferencedSymbol);
+ InitFunctions[&JD].add(KV.first);
+ }
+ }
+ return Error::success();
+ }
+
+ Error initialize(JITDylib &JD) override {
+ LLVM_DEBUG({
+ dbgs() << "GenericLLVMIRPlatformSupport getting initializers to run\n";
+ });
+ if (auto Initializers = getInitializers(JD)) {
+ LLVM_DEBUG(
+ { dbgs() << "GenericLLVMIRPlatformSupport running initializers\n"; });
+ for (auto InitFnAddr : *Initializers) {
+ LLVM_DEBUG({
+ dbgs() << " Running init " << formatv("{0:x16}", InitFnAddr)
+ << "...\n";
+ });
+ auto *InitFn = jitTargetAddressToFunction<void (*)()>(InitFnAddr);
+ InitFn();
+ }
+ } else
+ return Initializers.takeError();
+ return Error::success();
+ }
+
+ Error deinitialize(JITDylib &JD) override {
+ LLVM_DEBUG({
+ dbgs() << "GenericLLVMIRPlatformSupport getting deinitializers to run\n";
+ });
+ if (auto Deinitializers = getDeinitializers(JD)) {
+ LLVM_DEBUG({
+ dbgs() << "GenericLLVMIRPlatformSupport running deinitializers\n";
+ });
+ for (auto DeinitFnAddr : *Deinitializers) {
+ LLVM_DEBUG({
+ dbgs() << " Running init " << formatv("{0:x16}", DeinitFnAddr)
+ << "...\n";
+ });
+ auto *DeinitFn = jitTargetAddressToFunction<void (*)()>(DeinitFnAddr);
+ DeinitFn();
+ }
+ } else
+ return Deinitializers.takeError();
+
+ return Error::success();
+ }
+
+ void registerInitFunc(JITDylib &JD, SymbolStringPtr InitName) {
+ getExecutionSession().runSessionLocked([&]() {
+ InitFunctions[&JD].add(InitName);
+ });
+ }
+
+private:
+
+ Expected<std::vector<JITTargetAddress>> getInitializers(JITDylib &JD) {
+ if (auto Err = issueInitLookups(JD))
+ return std::move(Err);
+
+ DenseMap<JITDylib *, SymbolLookupSet> LookupSymbols;
+ std::vector<JITDylib *> DFSLinkOrder;
+
+ getExecutionSession().runSessionLocked([&]() {
+ DFSLinkOrder = getDFSLinkOrder(JD);
+
+ for (auto *NextJD : DFSLinkOrder) {
+ auto IFItr = InitFunctions.find(NextJD);
+ if (IFItr != InitFunctions.end()) {
+ LookupSymbols[NextJD] = std::move(IFItr->second);
+ InitFunctions.erase(IFItr);
+ }
+ }
+ });
+
+ LLVM_DEBUG({
+ dbgs() << "JITDylib init order is [ ";
+ for (auto *JD : llvm::reverse(DFSLinkOrder))
+ dbgs() << "\"" << JD->getName() << "\" ";
+ dbgs() << "]\n";
+ dbgs() << "Looking up init functions:\n";
+ for (auto &KV : LookupSymbols)
+ dbgs() << " \"" << KV.first->getName() << "\": " << KV.second << "\n";
+ });
+
+ auto &ES = getExecutionSession();
+ auto LookupResult = Platform::lookupInitSymbols(ES, LookupSymbols);
+
+ if (!LookupResult)
+ return LookupResult.takeError();
+
+ std::vector<JITTargetAddress> Initializers;
+ while (!DFSLinkOrder.empty()) {
+ auto &NextJD = *DFSLinkOrder.back();
+ DFSLinkOrder.pop_back();
+ auto InitsItr = LookupResult->find(&NextJD);
+ if (InitsItr == LookupResult->end())
+ continue;
+ for (auto &KV : InitsItr->second)
+ Initializers.push_back(KV.second.getAddress());
+ }
+
+ return Initializers;
+ }
+
+ Expected<std::vector<JITTargetAddress>> getDeinitializers(JITDylib &JD) {
+ auto &ES = getExecutionSession();
+
+ auto LLJITRunAtExits = J.mangleAndIntern("__lljit_run_atexits");
+
+ DenseMap<JITDylib *, SymbolLookupSet> LookupSymbols;
+ std::vector<JITDylib *> DFSLinkOrder;
+
+ ES.runSessionLocked([&]() {
+ DFSLinkOrder = getDFSLinkOrder(JD);
+
+ for (auto *NextJD : DFSLinkOrder) {
+ auto &JDLookupSymbols = LookupSymbols[NextJD];
+ auto DIFItr = DeInitFunctions.find(NextJD);
+ if (DIFItr != DeInitFunctions.end()) {
+ LookupSymbols[NextJD] = std::move(DIFItr->second);
+ DeInitFunctions.erase(DIFItr);
+ }
+ JDLookupSymbols.add(LLJITRunAtExits,
+ SymbolLookupFlags::WeaklyReferencedSymbol);
+ }
+ });
+
+ LLVM_DEBUG({
+ dbgs() << "JITDylib deinit order is [ ";
+ for (auto *JD : DFSLinkOrder)
+ dbgs() << "\"" << JD->getName() << "\" ";
+ dbgs() << "]\n";
+ dbgs() << "Looking up deinit functions:\n";
+ for (auto &KV : LookupSymbols)
+ dbgs() << " \"" << KV.first->getName() << "\": " << KV.second << "\n";
+ });
+
+ auto LookupResult = Platform::lookupInitSymbols(ES, LookupSymbols);
+
+ if (!LookupResult)
+ return LookupResult.takeError();
+
+ std::vector<JITTargetAddress> DeInitializers;
+ for (auto *NextJD : DFSLinkOrder) {
+ auto DeInitsItr = LookupResult->find(NextJD);
+ assert(DeInitsItr != LookupResult->end() &&
+ "Every JD should have at least __lljit_run_atexits");
+
+ auto RunAtExitsItr = DeInitsItr->second.find(LLJITRunAtExits);
+ if (RunAtExitsItr != DeInitsItr->second.end())
+ DeInitializers.push_back(RunAtExitsItr->second.getAddress());
+
+ for (auto &KV : DeInitsItr->second)
+ if (KV.first != LLJITRunAtExits)
+ DeInitializers.push_back(KV.second.getAddress());
+ }
+
+ return DeInitializers;
+ }
+
+ // Returns a DFS traversal order of the JITDylibs reachable (via
+ // links-against edges) from JD, starting with JD itself.
+ static std::vector<JITDylib *> getDFSLinkOrder(JITDylib &JD) {
+ std::vector<JITDylib *> DFSLinkOrder;
+ std::vector<JITDylib *> WorkStack({&JD});
+ DenseSet<JITDylib *> Visited;
+
+ while (!WorkStack.empty()) {
+ auto &NextJD = *WorkStack.back();
+ WorkStack.pop_back();
+ if (Visited.count(&NextJD))
+ continue;
+ Visited.insert(&NextJD);
+ DFSLinkOrder.push_back(&NextJD);
+ NextJD.withLinkOrderDo([&](const JITDylibSearchOrder &LinkOrder) {
+ for (auto &KV : LinkOrder)
+ WorkStack.push_back(KV.first);
+ });
+ }
+
+ return DFSLinkOrder;
+ }
+
+ /// Issue lookups for all init symbols required to initialize JD (and any
+ /// JITDylibs that it depends on).
+ Error issueInitLookups(JITDylib &JD) {
+ DenseMap<JITDylib *, SymbolLookupSet> RequiredInitSymbols;
+ std::vector<JITDylib *> DFSLinkOrder;
+
+ getExecutionSession().runSessionLocked([&]() {
+ DFSLinkOrder = getDFSLinkOrder(JD);
+
+ for (auto *NextJD : DFSLinkOrder) {
+ auto ISItr = InitSymbols.find(NextJD);
+ if (ISItr != InitSymbols.end()) {
+ RequiredInitSymbols[NextJD] = std::move(ISItr->second);
+ InitSymbols.erase(ISItr);
+ }
+ }
+ });
+
+ return Platform::lookupInitSymbols(getExecutionSession(),
+ RequiredInitSymbols)
+ .takeError();
+ }
+
+ static void registerAtExitHelper(void *Self, void (*F)(void *), void *Ctx,
+ void *DSOHandle) {
+ LLVM_DEBUG({
+ dbgs() << "Registering atexit function " << (void *)F << " for JD "
+ << (*static_cast<JITDylib **>(DSOHandle))->getName() << "\n";
+ });
+ static_cast<GenericLLVMIRPlatformSupport *>(Self)->AtExitMgr.registerAtExit(
+ F, Ctx, DSOHandle);
+ }
+
+ static void runAtExitsHelper(void *Self, void *DSOHandle) {
+ LLVM_DEBUG({
+ dbgs() << "Running atexit functions for JD "
+ << (*static_cast<JITDylib **>(DSOHandle))->getName() << "\n";
+ });
+ static_cast<GenericLLVMIRPlatformSupport *>(Self)->AtExitMgr.runAtExits(
+ DSOHandle);
+ }
+
+ // Constructs an LLVM IR module containing platform runtime globals,
+ // functions, and interposes.
+ ThreadSafeModule createPlatformRuntimeModule() {
+ auto Ctx = std::make_unique<LLVMContext>();
+ auto M = std::make_unique<Module>("__standard_lib", *Ctx);
+ M->setDataLayout(J.getDataLayout());
+
+ auto *GenericIRPlatformSupportTy =
+ StructType::create(*Ctx, "lljit.GenericLLJITIRPlatformSupport");
+
+ auto *PlatformInstanceDecl = new GlobalVariable(
+ *M, GenericIRPlatformSupportTy, true, GlobalValue::ExternalLinkage,
+ nullptr, "__lljit.platform_support_instance");
+
+ auto *Int8Ty = Type::getInt8Ty(*Ctx);
+ auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT);
+ auto *VoidTy = Type::getVoidTy(*Ctx);
+ auto *BytePtrTy = PointerType::getUnqual(Int8Ty);
+ auto *AtExitCallbackTy = FunctionType::get(VoidTy, {BytePtrTy}, false);
+ auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy);
+
+ addHelperAndWrapper(
+ *M, "__cxa_atexit",
+ FunctionType::get(IntTy, {AtExitCallbackPtrTy, BytePtrTy, BytePtrTy},
+ false),
+ GlobalValue::DefaultVisibility, "__lljit.cxa_atexit_helper",
+ {PlatformInstanceDecl});
+
+ return ThreadSafeModule(std::move(M), std::move(Ctx));
+ }
+
+ LLJIT &J;
+ std::string InitFunctionPrefix;
+ DenseMap<JITDylib *, SymbolLookupSet> InitSymbols;
+ DenseMap<JITDylib *, SymbolLookupSet> InitFunctions;
+ DenseMap<JITDylib *, SymbolLookupSet> DeInitFunctions;
+ ItaniumCXAAtExitSupport AtExitMgr;
+};
+
+Error GenericLLVMIRPlatform::setupJITDylib(JITDylib &JD) {
+ return S.setupJITDylib(JD);
+}
+
+Error GenericLLVMIRPlatform::notifyAdding(JITDylib &JD,
+ const MaterializationUnit &MU) {
+ return S.notifyAdding(JD, MU);
+}
+
+Expected<ThreadSafeModule>
+GlobalCtorDtorScraper::operator()(ThreadSafeModule TSM,
+ MaterializationResponsibility &R) {
+ auto Err = TSM.withModuleDo([&](Module &M) -> Error {
+ auto &Ctx = M.getContext();
+ auto *GlobalCtors = M.getNamedGlobal("llvm.global_ctors");
+
+ // If there's no llvm.global_ctors or it's just a decl then skip.
+ if (!GlobalCtors || GlobalCtors->isDeclaration())
+ return Error::success();
+
+ std::string InitFunctionName;
+ raw_string_ostream(InitFunctionName)
+ << InitFunctionPrefix << M.getModuleIdentifier();
+
+ MangleAndInterner Mangle(PS.getExecutionSession(), M.getDataLayout());
+ auto InternedName = Mangle(InitFunctionName);
+ if (auto Err =
+ R.defineMaterializing({{InternedName, JITSymbolFlags::Callable}}))
+ return Err;
+
+ auto *InitFunc =
+ Function::Create(FunctionType::get(Type::getVoidTy(Ctx), {}, false),
+ GlobalValue::ExternalLinkage, InitFunctionName, &M);
+ InitFunc->setVisibility(GlobalValue::HiddenVisibility);
+ std::vector<std::pair<Function *, unsigned>> Inits;
+ for (auto E : getConstructors(M))
+ Inits.push_back(std::make_pair(E.Func, E.Priority));
+ llvm::sort(Inits, [](const std::pair<Function *, unsigned> &LHS,
+ const std::pair<Function *, unsigned> &RHS) {
+      return LHS.second < RHS.second;
+ });
+ auto *EntryBlock = BasicBlock::Create(Ctx, "entry", InitFunc);
+ IRBuilder<> IB(EntryBlock);
+ for (auto &KV : Inits)
+ IB.CreateCall(KV.first);
+ IB.CreateRetVoid();
+
+ PS.registerInitFunc(R.getTargetJITDylib(), InternedName);
+ GlobalCtors->eraseFromParent();
+ return Error::success();
+ });
+
+ if (Err)
+ return std::move(Err);
+
+ return std::move(TSM);
+}
+
+class MachOPlatformSupport : public LLJIT::PlatformSupport {
+public:
+ using DLOpenType = void *(*)(const char *Name, int Mode);
+ using DLCloseType = int (*)(void *Handle);
+ using DLSymType = void *(*)(void *Handle, const char *Name);
+ using DLErrorType = const char *(*)();
+
+ struct DlFcnValues {
+ Optional<void *> RTLDDefault;
+ DLOpenType dlopen = nullptr;
+ DLCloseType dlclose = nullptr;
+ DLSymType dlsym = nullptr;
+ DLErrorType dlerror = nullptr;
+ };
+
+ static Expected<std::unique_ptr<MachOPlatformSupport>>
+ Create(LLJIT &J, JITDylib &PlatformJITDylib) {
+
+ // Make process symbols visible.
+ {
+ std::string ErrMsg;
+ auto Lib = sys::DynamicLibrary::getPermanentLibrary(nullptr, &ErrMsg);
+ if (!Lib.isValid())
+ return make_error<StringError>(std::move(ErrMsg),
+ inconvertibleErrorCode());
+ }
+
+ DlFcnValues DlFcn;
+
+ // Add support for RTLDDefault on known platforms.
+#ifdef __APPLE__
+ DlFcn.RTLDDefault = reinterpret_cast<void *>(-2);
+#endif // __APPLE__
+
+ if (auto Err = hookUpFunction(DlFcn.dlopen, "dlopen"))
+ return std::move(Err);
+ if (auto Err = hookUpFunction(DlFcn.dlclose, "dlclose"))
+ return std::move(Err);
+ if (auto Err = hookUpFunction(DlFcn.dlsym, "dlsym"))
+ return std::move(Err);
+ if (auto Err = hookUpFunction(DlFcn.dlerror, "dlerror"))
+ return std::move(Err);
+
+ std::unique_ptr<MachOPlatformSupport> MP(
+ new MachOPlatformSupport(J, PlatformJITDylib, DlFcn));
+ return std::move(MP);
+ }
+
+ Error initialize(JITDylib &JD) override {
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatformSupport initializing \"" << JD.getName()
+ << "\"\n";
+ });
+
+ auto InitSeq = MP.getInitializerSequence(JD);
+ if (!InitSeq)
+ return InitSeq.takeError();
+
+ // If ObjC is not enabled but there are JIT'd ObjC inits then return
+ // an error.
+ if (!objCRegistrationEnabled())
+ for (auto &KV : *InitSeq) {
+ if (!KV.second.getObjCSelRefsSections().empty() ||
+ !KV.second.getObjCClassListSections().empty())
+ return make_error<StringError>("JITDylib " + KV.first->getName() +
+ " contains objc metadata but objc"
+ " is not enabled",
+ inconvertibleErrorCode());
+ }
+
+ // Run the initializers.
+ for (auto &KV : *InitSeq) {
+ if (objCRegistrationEnabled()) {
+ KV.second.registerObjCSelectors();
+ if (auto Err = KV.second.registerObjCClasses()) {
+ // FIXME: Roll back registrations on error?
+ return Err;
+ }
+ }
+ KV.second.runModInits();
+ }
+
+ return Error::success();
+ }
+
+ Error deinitialize(JITDylib &JD) override {
+ auto &ES = J.getExecutionSession();
+ if (auto DeinitSeq = MP.getDeinitializerSequence(JD)) {
+ for (auto &KV : *DeinitSeq) {
+ auto DSOHandleName = ES.intern("___dso_handle");
+
+ // FIXME: Run DeInits here.
+ auto Result = ES.lookup(
+ {{KV.first, JITDylibLookupFlags::MatchAllSymbols}},
+ SymbolLookupSet(DSOHandleName,
+ SymbolLookupFlags::WeaklyReferencedSymbol));
+ if (!Result)
+ return Result.takeError();
+ if (Result->empty())
+ continue;
+ assert(Result->count(DSOHandleName) &&
+ "Result does not contain __dso_handle");
+ auto *DSOHandle = jitTargetAddressToPointer<void *>(
+ Result->begin()->second.getAddress());
+ AtExitMgr.runAtExits(DSOHandle);
+ }
+ } else
+ return DeinitSeq.takeError();
+ return Error::success();
+ }
+
+private:
+ template <typename FunctionPtrTy>
+ static Error hookUpFunction(FunctionPtrTy &Fn, const char *Name) {
+ if (auto *FnAddr = sys::DynamicLibrary::SearchForAddressOfSymbol(Name)) {
+      Fn = reinterpret_cast<FunctionPtrTy>(FnAddr);
+ return Error::success();
+ }
+
+    return make_error<StringError>((Twine("Cannot enable MachO JIT Platform: "
+ "missing function: ") +
+ Name)
+ .str(),
+ inconvertibleErrorCode());
+ }
+
+ MachOPlatformSupport(LLJIT &J, JITDylib &PlatformJITDylib, DlFcnValues DlFcn)
+ : J(J), MP(setupPlatform(J)), DlFcn(std::move(DlFcn)) {
+
+ SymbolMap HelperSymbols;
+
+ // platform and atexit helpers.
+ HelperSymbols[J.mangleAndIntern("__lljit.platform_support_instance")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(this), JITSymbolFlags());
+ HelperSymbols[J.mangleAndIntern("__lljit.cxa_atexit_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(registerAtExitHelper),
+ JITSymbolFlags());
+ HelperSymbols[J.mangleAndIntern("__lljit.run_atexits_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(runAtExitsHelper),
+ JITSymbolFlags());
+
+ // dlfcn helpers.
+ HelperSymbols[J.mangleAndIntern("__lljit.dlopen_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(dlopenHelper),
+ JITSymbolFlags());
+ HelperSymbols[J.mangleAndIntern("__lljit.dlclose_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(dlcloseHelper),
+ JITSymbolFlags());
+ HelperSymbols[J.mangleAndIntern("__lljit.dlsym_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(dlsymHelper),
+ JITSymbolFlags());
+ HelperSymbols[J.mangleAndIntern("__lljit.dlerror_helper")] =
+ JITEvaluatedSymbol(pointerToJITTargetAddress(dlerrorHelper),
+ JITSymbolFlags());
+
+ cantFail(
+ PlatformJITDylib.define(absoluteSymbols(std::move(HelperSymbols))));
+ cantFail(MP.setupJITDylib(J.getMainJITDylib()));
+ cantFail(J.addIRModule(PlatformJITDylib, createPlatformRuntimeModule()));
+ }
+
+ static MachOPlatform &setupPlatform(LLJIT &J) {
+ auto Tmp = std::make_unique<MachOPlatform>(
+ J.getExecutionSession(),
+ static_cast<ObjectLinkingLayer &>(J.getObjLinkingLayer()),
+ createStandardSymbolsObject(J));
+ auto &MP = *Tmp;
+ J.getExecutionSession().setPlatform(std::move(Tmp));
+ return MP;
+ }
+
+ static std::unique_ptr<MemoryBuffer> createStandardSymbolsObject(LLJIT &J) {
+ LLVMContext Ctx;
+ Module M("__standard_symbols", Ctx);
+ M.setDataLayout(J.getDataLayout());
+
+ auto *Int64Ty = Type::getInt64Ty(Ctx);
+
+ auto *DSOHandle =
+ new GlobalVariable(M, Int64Ty, true, GlobalValue::ExternalLinkage,
+ ConstantInt::get(Int64Ty, 0), "__dso_handle");
+ DSOHandle->setVisibility(GlobalValue::DefaultVisibility);
+
+ return cantFail(J.getIRCompileLayer().getCompiler()(M));
+ }
+
+ ThreadSafeModule createPlatformRuntimeModule() {
+ auto Ctx = std::make_unique<LLVMContext>();
+ auto M = std::make_unique<Module>("__standard_lib", *Ctx);
+ M->setDataLayout(J.getDataLayout());
+
+ auto *MachOPlatformSupportTy =
+ StructType::create(*Ctx, "lljit.MachOPlatformSupport");
+
+ auto *PlatformInstanceDecl = new GlobalVariable(
+ *M, MachOPlatformSupportTy, true, GlobalValue::ExternalLinkage, nullptr,
+ "__lljit.platform_support_instance");
+
+ auto *Int8Ty = Type::getInt8Ty(*Ctx);
+ auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT);
+ auto *VoidTy = Type::getVoidTy(*Ctx);
+ auto *BytePtrTy = PointerType::getUnqual(Int8Ty);
+ auto *AtExitCallbackTy = FunctionType::get(VoidTy, {BytePtrTy}, false);
+ auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy);
+
+ addHelperAndWrapper(
+ *M, "__cxa_atexit",
+ FunctionType::get(IntTy, {AtExitCallbackPtrTy, BytePtrTy, BytePtrTy},
+ false),
+ GlobalValue::DefaultVisibility, "__lljit.cxa_atexit_helper",
+ {PlatformInstanceDecl});
+
+ addHelperAndWrapper(*M, "dlopen",
+ FunctionType::get(BytePtrTy, {BytePtrTy, IntTy}, false),
+ GlobalValue::DefaultVisibility, "__lljit.dlopen_helper",
+ {PlatformInstanceDecl});
+
+ addHelperAndWrapper(*M, "dlclose",
+ FunctionType::get(IntTy, {BytePtrTy}, false),
+ GlobalValue::DefaultVisibility,
+ "__lljit.dlclose_helper", {PlatformInstanceDecl});
+
+ addHelperAndWrapper(
+ *M, "dlsym",
+ FunctionType::get(BytePtrTy, {BytePtrTy, BytePtrTy}, false),
+ GlobalValue::DefaultVisibility, "__lljit.dlsym_helper",
+ {PlatformInstanceDecl});
+
+ addHelperAndWrapper(*M, "dlerror", FunctionType::get(BytePtrTy, {}, false),
+ GlobalValue::DefaultVisibility,
+ "__lljit.dlerror_helper", {PlatformInstanceDecl});
+
+ return ThreadSafeModule(std::move(M), std::move(Ctx));
+ }
+
+ static void registerAtExitHelper(void *Self, void (*F)(void *), void *Ctx,
+ void *DSOHandle) {
+ static_cast<MachOPlatformSupport *>(Self)->AtExitMgr.registerAtExit(
+ F, Ctx, DSOHandle);
+ }
+
+ static void runAtExitsHelper(void *Self, void *DSOHandle) {
+ static_cast<MachOPlatformSupport *>(Self)->AtExitMgr.runAtExits(DSOHandle);
+ }
+
+ void *jit_dlopen(const char *Path, int Mode) {
+ JITDylib *JDToOpen = nullptr;
+ // FIXME: Do the right thing with Mode flags.
+ {
+ std::lock_guard<std::mutex> Lock(PlatformSupportMutex);
+
+ // Clear any existing error messages.
+ dlErrorMsgs.erase(std::this_thread::get_id());
+
+ if (auto *JD = J.getExecutionSession().getJITDylibByName(Path)) {
+ auto I = JDRefCounts.find(JD);
+ if (I != JDRefCounts.end()) {
+ ++I->second;
+ return JD;
+ }
+
+ JDRefCounts[JD] = 1;
+ JDToOpen = JD;
+ }
+ }
+
+ if (JDToOpen) {
+ if (auto Err = initialize(*JDToOpen)) {
+ recordError(std::move(Err));
+ return 0;
+ }
+ }
+
+ // Fall through to dlopen if no JITDylib found for Path.
+ return DlFcn.dlopen(Path, Mode);
+ }
+
+ static void *dlopenHelper(void *Self, const char *Path, int Mode) {
+ return static_cast<MachOPlatformSupport *>(Self)->jit_dlopen(Path, Mode);
+ }
+
+ int jit_dlclose(void *Handle) {
+ JITDylib *JDToClose = nullptr;
+
+ {
+ std::lock_guard<std::mutex> Lock(PlatformSupportMutex);
+
+ // Clear any existing error messages.
+ dlErrorMsgs.erase(std::this_thread::get_id());
+
+ auto I = JDRefCounts.find(Handle);
+ if (I != JDRefCounts.end()) {
+ --I->second;
+ if (I->second == 0) {
+ JDRefCounts.erase(I);
+ JDToClose = static_cast<JITDylib *>(Handle);
+ } else
+ return 0;
+ }
+ }
+
+ if (JDToClose) {
+ if (auto Err = deinitialize(*JDToClose)) {
+ recordError(std::move(Err));
+ return -1;
+ }
+ return 0;
+ }
+
+    // Fall through to dlclose if no JITDylib found for Handle.
+ return DlFcn.dlclose(Handle);
+ }
+
+ static int dlcloseHelper(void *Self, void *Handle) {
+ return static_cast<MachOPlatformSupport *>(Self)->jit_dlclose(Handle);
+ }
+
+ void *jit_dlsym(void *Handle, const char *Name) {
+ JITDylibSearchOrder JITSymSearchOrder;
+
+ // FIXME: RTLD_NEXT, RTLD_SELF not supported.
+ {
+ std::lock_guard<std::mutex> Lock(PlatformSupportMutex);
+
+ // Clear any existing error messages.
+ dlErrorMsgs.erase(std::this_thread::get_id());
+
+ if (JDRefCounts.count(Handle)) {
+ JITSymSearchOrder.push_back(
+ {static_cast<JITDylib *>(Handle),
+ JITDylibLookupFlags::MatchExportedSymbolsOnly});
+ } else if (Handle == DlFcn.RTLDDefault) {
+ for (auto &KV : JDRefCounts)
+ JITSymSearchOrder.push_back(
+ {static_cast<JITDylib *>(KV.first),
+ JITDylibLookupFlags::MatchExportedSymbolsOnly});
+ }
+ }
+
+ if (!JITSymSearchOrder.empty()) {
+ auto MangledName = J.mangleAndIntern(Name);
+ SymbolLookupSet Syms(MangledName,
+ SymbolLookupFlags::WeaklyReferencedSymbol);
+ if (auto Result = J.getExecutionSession().lookup(JITSymSearchOrder, Syms,
+ LookupKind::DLSym)) {
+ auto I = Result->find(MangledName);
+ if (I != Result->end())
+ return jitTargetAddressToPointer<void *>(I->second.getAddress());
+ } else {
+ recordError(Result.takeError());
+ return 0;
+ }
+ }
+
+ // Fall through to dlsym.
+ return DlFcn.dlsym(Handle, Name);
+ }
+
+ static void *dlsymHelper(void *Self, void *Handle, const char *Name) {
+ return static_cast<MachOPlatformSupport *>(Self)->jit_dlsym(Handle, Name);
+ }
+
+ const char *jit_dlerror() {
+ {
+ std::lock_guard<std::mutex> Lock(PlatformSupportMutex);
+ auto I = dlErrorMsgs.find(std::this_thread::get_id());
+ if (I != dlErrorMsgs.end())
+ return I->second->c_str();
+ }
+ return DlFcn.dlerror();
+ }
+
+ static const char *dlerrorHelper(void *Self) {
+ return static_cast<MachOPlatformSupport *>(Self)->jit_dlerror();
+ }
+
+ void recordError(Error Err) {
+ std::lock_guard<std::mutex> Lock(PlatformSupportMutex);
+ dlErrorMsgs[std::this_thread::get_id()] =
+ std::make_unique<std::string>(toString(std::move(Err)));
+ }
+
+ std::mutex PlatformSupportMutex;
+ LLJIT &J;
+ MachOPlatform &MP;
+ DlFcnValues DlFcn;
+ ItaniumCXAAtExitSupport AtExitMgr;
+ DenseMap<void *, unsigned> JDRefCounts;
+ std::map<std::thread::id, std::unique_ptr<std::string>> dlErrorMsgs;
+};
+
+} // end anonymous namespace
namespace llvm {
namespace orc {
+void LLJIT::PlatformSupport::setInitTransform(
+ LLJIT &J, IRTransformLayer::TransformFunction T) {
+ J.InitHelperTransformLayer->setTransform(std::move(T));
+}
+
+LLJIT::PlatformSupport::~PlatformSupport() {}
+
Error LLJITBuilderState::prepareForConstruction() {
+  LLVM_DEBUG(dbgs() << "Preparing to create LLJIT instance...\n");
+
if (!JTMB) {
+ LLVM_DEBUG({
+ dbgs() << " No explicitly set JITTargetMachineBuilder. "
+ "Detecting host...\n";
+ });
if (auto JTMBOrErr = JITTargetMachineBuilder::detectHost())
JTMB = std::move(*JTMBOrErr);
else
return JTMBOrErr.takeError();
}
+ LLVM_DEBUG({
+ dbgs() << " JITTargetMachineBuilder is " << JTMB << "\n"
+ << " Pre-constructed ExecutionSession: " << (ES ? "Yes" : "No")
+ << "\n"
+ << " DataLayout: ";
+ if (DL)
+ dbgs() << DL->getStringRepresentation() << "\n";
+ else
+ dbgs() << "None (will be created by JITTargetMachineBuilder)\n";
+
+ dbgs() << " Custom object-linking-layer creator: "
+ << (CreateObjectLinkingLayer ? "Yes" : "No") << "\n"
+ << " Custom compile-function creator: "
+ << (CreateCompileFunction ? "Yes" : "No") << "\n"
+ << " Custom platform-setup function: "
+ << (SetUpPlatform ? "Yes" : "No") << "\n"
+ << " Number of compile threads: " << NumCompileThreads;
+ if (!NumCompileThreads)
+ dbgs() << " (code will be compiled on the execution thread)\n";
+ else
+ dbgs() << "\n";
+ });
+
// If the client didn't configure any linker options then auto-configure the
// JIT linker.
- if (!CreateObjectLinkingLayer && JTMB->getCodeModel() == None &&
- JTMB->getRelocationModel() == None) {
-
+ if (!CreateObjectLinkingLayer) {
auto &TT = JTMB->getTargetTriple();
if (TT.isOSBinFormatMachO() &&
(TT.getArch() == Triple::aarch64 || TT.getArch() == Triple::x86_64)) {
@@ -40,8 +975,11 @@ Error LLJITBuilderState::prepareForConstruction() {
CreateObjectLinkingLayer =
[](ExecutionSession &ES,
const Triple &) -> std::unique_ptr<ObjectLayer> {
- return std::make_unique<ObjectLinkingLayer>(
+ auto ObjLinkingLayer = std::make_unique<ObjectLinkingLayer>(
ES, std::make_unique<jitlink::InProcessMemoryManager>());
+ ObjLinkingLayer->addPlugin(std::make_unique<EHFrameRegistrationPlugin>(
+ jitlink::InProcessEHFrameRegistrar::getInstance()));
+ return std::move(ObjLinkingLayer);
};
}
}
@@ -54,12 +992,6 @@ LLJIT::~LLJIT() {
CompileThreads->wait();
}
-Error LLJIT::defineAbsolute(StringRef Name, JITEvaluatedSymbol Sym) {
- auto InternedName = ES->intern(Name);
- SymbolMap Symbols({{InternedName, Sym}});
- return Main.define(absoluteSymbols(std::move(Symbols)));
-}
-
Error LLJIT::addIRModule(JITDylib &JD, ThreadSafeModule TSM) {
assert(TSM && "Can not add null module");
@@ -67,7 +999,8 @@ Error LLJIT::addIRModule(JITDylib &JD, ThreadSafeModule TSM) {
TSM.withModuleDo([&](Module &M) { return applyDataLayout(M); }))
return Err;
- return CompileLayer->add(JD, std::move(TSM), ES->allocateVModule());
+ return InitHelperTransformLayer->add(JD, std::move(TSM),
+ ES->allocateVModule());
}
Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
@@ -77,10 +1010,9 @@ Error LLJIT::addObjectFile(JITDylib &JD, std::unique_ptr<MemoryBuffer> Obj) {
}
Expected<JITEvaluatedSymbol> LLJIT::lookupLinkerMangled(JITDylib &JD,
- StringRef Name) {
+ SymbolStringPtr Name) {
return ES->lookup(
- makeJITDylibSearchOrder(&JD, JITDylibLookupFlags::MatchAllSymbols),
- ES->intern(Name));
+ makeJITDylibSearchOrder(&JD, JITDylibLookupFlags::MatchAllSymbols), Name);
}
std::unique_ptr<ObjectLayer>
@@ -96,8 +1028,10 @@ LLJIT::createObjectLinkingLayer(LLJITBuilderState &S, ExecutionSession &ES) {
auto ObjLinkingLayer =
std::make_unique<RTDyldObjectLinkingLayer>(ES, std::move(GetMemMgr));
- if (S.JTMB->getTargetTriple().isOSBinFormatCOFF())
+ if (S.JTMB->getTargetTriple().isOSBinFormatCOFF()) {
ObjLinkingLayer->setOverrideObjectFlagsWithResponsibilityFlags(true);
+ ObjLinkingLayer->setAutoClaimResponsibilityForObjectSymbols(true);
+ }
// FIXME: Explicit conversion to std::unique_ptr<ObjectLayer> added to silence
// errors from some GCC / libstdc++ bots. Remove this conversion (i.e.
@@ -105,7 +1039,7 @@ LLJIT::createObjectLinkingLayer(LLJITBuilderState &S, ExecutionSession &ES) {
return std::unique_ptr<ObjectLayer>(std::move(ObjLinkingLayer));
}
-Expected<IRCompileLayer::CompileFunction>
+Expected<std::unique_ptr<IRCompileLayer::IRCompiler>>
LLJIT::createCompileFunction(LLJITBuilderState &S,
JITTargetMachineBuilder JTMB) {
@@ -116,25 +1050,33 @@ LLJIT::createCompileFunction(LLJITBuilderState &S,
// Otherwise default to creating a SimpleCompiler, or ConcurrentIRCompiler,
// depending on the number of threads requested.
if (S.NumCompileThreads > 0)
- return ConcurrentIRCompiler(std::move(JTMB));
+ return std::make_unique<ConcurrentIRCompiler>(std::move(JTMB));
auto TM = JTMB.createTargetMachine();
if (!TM)
return TM.takeError();
- return TMOwningSimpleCompiler(std::move(*TM));
+ return std::make_unique<TMOwningSimpleCompiler>(std::move(*TM));
}
LLJIT::LLJIT(LLJITBuilderState &S, Error &Err)
- : ES(S.ES ? std::move(S.ES) : std::make_unique<ExecutionSession>()),
- Main(this->ES->createJITDylib("<main>")), DL(""),
+ : ES(S.ES ? std::move(S.ES) : std::make_unique<ExecutionSession>()), Main(),
+ DL(""), TT(S.JTMB->getTargetTriple()),
ObjLinkingLayer(createObjectLinkingLayer(S, *ES)),
- ObjTransformLayer(*this->ES, *ObjLinkingLayer), CtorRunner(Main),
- DtorRunner(Main) {
+ ObjTransformLayer(*this->ES, *ObjLinkingLayer) {
ErrorAsOutParameter _(&Err);
- if (auto DLOrErr = S.JTMB->getDefaultDataLayoutForTarget())
+ if (auto MainOrErr = this->ES->createJITDylib("main"))
+ Main = &*MainOrErr;
+ else {
+ Err = MainOrErr.takeError();
+ return;
+ }
+
+ if (S.DL)
+ DL = std::move(*S.DL);
+ else if (auto DLOrErr = S.JTMB->getDefaultDataLayoutForTarget())
DL = std::move(*DLOrErr);
else {
Err = DLOrErr.takeError();
@@ -149,22 +1091,36 @@ LLJIT::LLJIT(LLJITBuilderState &S, Error &Err)
}
CompileLayer = std::make_unique<IRCompileLayer>(
*ES, ObjTransformLayer, std::move(*CompileFunction));
+ TransformLayer = std::make_unique<IRTransformLayer>(*ES, *CompileLayer);
+ InitHelperTransformLayer =
+ std::make_unique<IRTransformLayer>(*ES, *TransformLayer);
}
if (S.NumCompileThreads > 0) {
- CompileLayer->setCloneToNewContextOnEmit(true);
- CompileThreads = std::make_unique<ThreadPool>(S.NumCompileThreads);
+ InitHelperTransformLayer->setCloneToNewContextOnEmit(true);
+ CompileThreads =
+ std::make_unique<ThreadPool>(hardware_concurrency(S.NumCompileThreads));
ES->setDispatchMaterialization(
- [this](JITDylib &JD, std::unique_ptr<MaterializationUnit> MU) {
- // FIXME: Switch to move capture once we have c++14.
+ [this](std::unique_ptr<MaterializationUnit> MU,
+ MaterializationResponsibility MR) {
+ // FIXME: Switch to move capture once ThreadPool uses unique_function.
auto SharedMU = std::shared_ptr<MaterializationUnit>(std::move(MU));
- auto Work = [SharedMU, &JD]() { SharedMU->doMaterialize(JD); };
+ auto SharedMR =
+ std::make_shared<MaterializationResponsibility>(std::move(MR));
+ auto Work = [SharedMU, SharedMR]() mutable {
+ SharedMU->materialize(std::move(*SharedMR));
+ };
CompileThreads->async(std::move(Work));
});
}
+
+ if (S.SetUpPlatform)
+ Err = S.SetUpPlatform(*this);
+ else
+ setUpGenericLLVMIRPlatform(*this);
}
-std::string LLJIT::mangle(StringRef UnmangledName) {
+std::string LLJIT::mangle(StringRef UnmangledName) const {
std::string MangledName;
{
raw_string_ostream MangledNameStream(MangledName);
@@ -179,15 +1135,27 @@ Error LLJIT::applyDataLayout(Module &M) {
if (M.getDataLayout() != DL)
return make_error<StringError>(
- "Added modules have incompatible data layouts",
+ "Added modules have incompatible data layouts: " +
+ M.getDataLayout().getStringRepresentation() + " (module) vs " +
+ DL.getStringRepresentation() + " (jit)",
inconvertibleErrorCode());
return Error::success();
}
-void LLJIT::recordCtorDtors(Module &M) {
- CtorRunner.add(getConstructors(M));
- DtorRunner.add(getDestructors(M));
+void setUpGenericLLVMIRPlatform(LLJIT &J) {
+ LLVM_DEBUG(
+ { dbgs() << "Setting up GenericLLVMIRPlatform support for LLJIT\n"; });
+ J.setPlatformSupport(std::make_unique<GenericLLVMIRPlatformSupport>(J));
+}
+
+Error setUpMachOPlatform(LLJIT &J) {
+ LLVM_DEBUG({ dbgs() << "Setting up MachOPlatform support for LLJIT\n"; });
+ auto MP = MachOPlatformSupport::Create(J, J.getMainJITDylib());
+ if (!MP)
+ return MP.takeError();
+ J.setPlatformSupport(std::move(*MP));
+ return Error::success();
}
Error LLLazyJITBuilderState::prepareForConstruction() {
@@ -200,13 +1168,8 @@ Error LLLazyJITBuilderState::prepareForConstruction() {
Error LLLazyJIT::addLazyIRModule(JITDylib &JD, ThreadSafeModule TSM) {
assert(TSM && "Can not add null module");
- if (auto Err = TSM.withModuleDo([&](Module &M) -> Error {
- if (auto Err = applyDataLayout(M))
- return Err;
-
- recordCtorDtors(M);
- return Error::success();
- }))
+ if (auto Err = TSM.withModuleDo(
+ [&](Module &M) -> Error { return applyDataLayout(M); }))
return Err;
return CODLayer->add(JD, std::move(TSM), ES->allocateVModule());
@@ -249,12 +1212,9 @@ LLLazyJIT::LLLazyJIT(LLLazyJITBuilderState &S, Error &Err) : LLJIT(S, Err) {
return;
}
- // Create the transform layer.
- TransformLayer = std::make_unique<IRTransformLayer>(*ES, *CompileLayer);
-
// Create the COD layer.
CODLayer = std::make_unique<CompileOnDemandLayer>(
- *ES, *TransformLayer, *LCTMgr, std::move(ISMBuilder));
+ *ES, *InitHelperTransformLayer, *LCTMgr, std::move(ISMBuilder));
if (S.NumCompileThreads > 0)
CODLayer->setCloneToNewContextOnEmit(true);
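For context, a minimal usage sketch (not part of the patch) of the platform hook added above: the SetUpPlatform callback is what selects between the MachO-specific support and the generic LLVM IR platform. The builder setter name setPlatformSetUp and the helper createMachOLLJIT are assumptions for illustration and do not appear in this diff.

  // Build an LLJIT instance that uses the MachOPlatform support introduced in
  // this patch; omitting the setter falls back to setUpGenericLLVMIRPlatform.
  #include "llvm/ExecutionEngine/Orc/LLJIT.h"

  llvm::Expected<std::unique_ptr<llvm::orc::LLJIT>> createMachOLLJIT() {
    using namespace llvm::orc;
    return LLJITBuilder()
        .setPlatformSetUp(setUpMachOPlatform) // assumed setter name
        .create();
  }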
diff --git a/llvm/lib/ExecutionEngine/Orc/Layer.cpp b/llvm/lib/ExecutionEngine/Orc/Layer.cpp
index 580e2682ec8c..61e7ab5ae68b 100644
--- a/llvm/lib/ExecutionEngine/Orc/Layer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Layer.cpp
@@ -7,6 +7,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/Layer.h"
+
+#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
+#include "llvm/ExecutionEngine/Orc/ExecutionUtils.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Debug.h"
@@ -15,28 +20,79 @@
namespace llvm {
namespace orc {
-IRLayer::IRLayer(ExecutionSession &ES) : ES(ES) {}
IRLayer::~IRLayer() {}
Error IRLayer::add(JITDylib &JD, ThreadSafeModule TSM, VModuleKey K) {
return JD.define(std::make_unique<BasicIRLayerMaterializationUnit>(
- *this, std::move(K), std::move(TSM)));
+ *this, *getManglingOptions(), std::move(TSM), std::move(K)));
}
-IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
- ThreadSafeModule TSM, VModuleKey K)
- : MaterializationUnit(SymbolFlagsMap(), std::move(K)), TSM(std::move(TSM)) {
+IRMaterializationUnit::IRMaterializationUnit(
+ ExecutionSession &ES, const IRSymbolMapper::ManglingOptions &MO,
+ ThreadSafeModule TSM, VModuleKey K)
+ : MaterializationUnit(SymbolFlagsMap(), nullptr, std::move(K)),
+ TSM(std::move(TSM)) {
assert(this->TSM && "Module must not be null");
MangleAndInterner Mangle(ES, this->TSM.getModuleUnlocked()->getDataLayout());
this->TSM.withModuleDo([&](Module &M) {
for (auto &G : M.global_values()) {
- if (G.hasName() && !G.isDeclaration() && !G.hasLocalLinkage() &&
- !G.hasAvailableExternallyLinkage() && !G.hasAppendingLinkage()) {
- auto MangledName = Mangle(G.getName());
- SymbolFlags[MangledName] = JITSymbolFlags::fromGlobalValue(G);
- SymbolToDefinition[MangledName] = &G;
+ // Skip globals that don't generate symbols.
+
+ if (!G.hasName() || G.isDeclaration() || G.hasLocalLinkage() ||
+ G.hasAvailableExternallyLinkage() || G.hasAppendingLinkage())
+ continue;
+
+      // Thread locals generate different symbols depending on whether or not
+ // emulated TLS is enabled.
+ if (G.isThreadLocal() && MO.EmulatedTLS) {
+ auto &GV = cast<GlobalVariable>(G);
+
+ auto Flags = JITSymbolFlags::fromGlobalValue(GV);
+
+ auto EmuTLSV = Mangle(("__emutls_v." + GV.getName()).str());
+ SymbolFlags[EmuTLSV] = Flags;
+ SymbolToDefinition[EmuTLSV] = &GV;
+
+ // If this GV has a non-zero initializer we'll need to emit an
+        // __emutls_t symbol too.
+ if (GV.hasInitializer()) {
+ const auto *InitVal = GV.getInitializer();
+
+ // Skip zero-initializers.
+ if (isa<ConstantAggregateZero>(InitVal))
+ continue;
+ const auto *InitIntValue = dyn_cast<ConstantInt>(InitVal);
+ if (InitIntValue && InitIntValue->isZero())
+ continue;
+
+ auto EmuTLST = Mangle(("__emutls_t." + GV.getName()).str());
+ SymbolFlags[EmuTLST] = Flags;
+ }
+ continue;
+ }
+
+ // Otherwise we just need a normal linker mangling.
+ auto MangledName = Mangle(G.getName());
+ SymbolFlags[MangledName] = JITSymbolFlags::fromGlobalValue(G);
+ SymbolToDefinition[MangledName] = &G;
+ }
+
+ // If we need an init symbol for this module then create one.
+ if (!llvm::empty(getStaticInitGVs(M))) {
+ size_t Counter = 0;
+
+ while (true) {
+ std::string InitSymbolName;
+ raw_string_ostream(InitSymbolName)
+ << "$." << M.getModuleIdentifier() << ".__inits." << Counter++;
+ InitSymbol = ES.intern(InitSymbolName);
+ if (SymbolFlags.count(InitSymbol))
+ continue;
+ SymbolFlags[InitSymbol] =
+ JITSymbolFlags::MaterializationSideEffectsOnly;
+ break;
}
}
});
@@ -44,8 +100,9 @@ IRMaterializationUnit::IRMaterializationUnit(ExecutionSession &ES,
IRMaterializationUnit::IRMaterializationUnit(
ThreadSafeModule TSM, VModuleKey K, SymbolFlagsMap SymbolFlags,
- SymbolNameToDefinitionMap SymbolToDefinition)
- : MaterializationUnit(std::move(SymbolFlags), std::move(K)),
+ SymbolStringPtr InitSymbol, SymbolNameToDefinitionMap SymbolToDefinition)
+ : MaterializationUnit(std::move(SymbolFlags), std::move(InitSymbol),
+ std::move(K)),
TSM(std::move(TSM)), SymbolToDefinition(std::move(SymbolToDefinition)) {}
StringRef IRMaterializationUnit::getName() const {
@@ -72,8 +129,9 @@ void IRMaterializationUnit::discard(const JITDylib &JD,
}
BasicIRLayerMaterializationUnit::BasicIRLayerMaterializationUnit(
- IRLayer &L, VModuleKey K, ThreadSafeModule TSM)
- : IRMaterializationUnit(L.getExecutionSession(), std::move(TSM),
+ IRLayer &L, const IRSymbolMapper::ManglingOptions &MO, ThreadSafeModule TSM,
+ VModuleKey K)
+ : IRMaterializationUnit(L.getExecutionSession(), MO, std::move(TSM),
std::move(K)),
L(L), K(std::move(K)) {}
@@ -117,22 +175,26 @@ Error ObjectLayer::add(JITDylib &JD, std::unique_ptr<MemoryBuffer> O,
Expected<std::unique_ptr<BasicObjectLayerMaterializationUnit>>
BasicObjectLayerMaterializationUnit::Create(ObjectLayer &L, VModuleKey K,
std::unique_ptr<MemoryBuffer> O) {
- auto SymbolFlags =
- getObjectSymbolFlags(L.getExecutionSession(), O->getMemBufferRef());
+ auto ObjSymInfo =
+ getObjectSymbolInfo(L.getExecutionSession(), O->getMemBufferRef());
+
+ if (!ObjSymInfo)
+ return ObjSymInfo.takeError();
- if (!SymbolFlags)
- return SymbolFlags.takeError();
+ auto &SymbolFlags = ObjSymInfo->first;
+ auto &InitSymbol = ObjSymInfo->second;
return std::unique_ptr<BasicObjectLayerMaterializationUnit>(
- new BasicObjectLayerMaterializationUnit(L, K, std::move(O),
- std::move(*SymbolFlags)));
+ new BasicObjectLayerMaterializationUnit(
+ L, K, std::move(O), std::move(SymbolFlags), std::move(InitSymbol)));
}
BasicObjectLayerMaterializationUnit::BasicObjectLayerMaterializationUnit(
ObjectLayer &L, VModuleKey K, std::unique_ptr<MemoryBuffer> O,
- SymbolFlagsMap SymbolFlags)
- : MaterializationUnit(std::move(SymbolFlags), std::move(K)), L(L),
- O(std::move(O)) {}
+ SymbolFlagsMap SymbolFlags, SymbolStringPtr InitSymbol)
+ : MaterializationUnit(std::move(SymbolFlags), std::move(InitSymbol),
+ std::move(K)),
+ L(L), O(std::move(O)) {}
StringRef BasicObjectLayerMaterializationUnit::getName() const {
if (O)
@@ -147,38 +209,8 @@ void BasicObjectLayerMaterializationUnit::materialize(
void BasicObjectLayerMaterializationUnit::discard(const JITDylib &JD,
const SymbolStringPtr &Name) {
- // FIXME: Support object file level discard. This could be done by building a
- // filter to pass to the object layer along with the object itself.
-}
-
-Expected<SymbolFlagsMap> getObjectSymbolFlags(ExecutionSession &ES,
- MemoryBufferRef ObjBuffer) {
- auto Obj = object::ObjectFile::createObjectFile(ObjBuffer);
-
- if (!Obj)
- return Obj.takeError();
-
- SymbolFlagsMap SymbolFlags;
- for (auto &Sym : (*Obj)->symbols()) {
- // Skip symbols not defined in this object file.
- if (Sym.getFlags() & object::BasicSymbolRef::SF_Undefined)
- continue;
-
- // Skip symbols that are not global.
- if (!(Sym.getFlags() & object::BasicSymbolRef::SF_Global))
- continue;
-
- auto Name = Sym.getName();
- if (!Name)
- return Name.takeError();
- auto InternedName = ES.intern(*Name);
- auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym);
- if (!SymFlags)
- return SymFlags.takeError();
- SymbolFlags[InternedName] = std::move(*SymFlags);
- }
-
- return SymbolFlags;
+  // This is a no-op for object files: having removed 'Name' from SymbolFlags,
+  // the symbol will be dead-stripped by the JIT linker.
}
} // End namespace orc.
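For context, a sketch of the init-symbol contract introduced above: a MaterializationUnit whose module or object carries static initializers now advertises a synthetic "$.<id>.__inits.N" symbol with MaterializationSideEffectsOnly flags. The helper below is hypothetical and only illustrates how a platform-style client might query it.

  #include "llvm/ExecutionEngine/Orc/Core.h"
  #include "llvm/Support/raw_ostream.h"

  // Hypothetical helper: report whether a MaterializationUnit exposes an init
  // symbol. The symbol is never assigned an address; it only gates running
  // the unit's initializers.
  static void noteInitSymbol(const llvm::orc::MaterializationUnit &MU) {
    if (const auto &InitSym = MU.getInitializerSymbol())
      llvm::errs() << MU.getName() << " has init symbol " << *InitSym << "\n";
    else
      llvm::errs() << MU.getName() << " has no static initializers\n";
  }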
diff --git a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp
index aab490feb8ea..153f6b80784f 100644
--- a/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LazyReexports.cpp
@@ -16,70 +16,86 @@
namespace llvm {
namespace orc {
-void LazyCallThroughManager::NotifyResolvedFunction::anchor() {}
-
LazyCallThroughManager::LazyCallThroughManager(
- ExecutionSession &ES, JITTargetAddress ErrorHandlerAddr,
- std::unique_ptr<TrampolinePool> TP)
- : ES(ES), ErrorHandlerAddr(ErrorHandlerAddr), TP(std::move(TP)) {}
+ ExecutionSession &ES, JITTargetAddress ErrorHandlerAddr, TrampolinePool *TP)
+ : ES(ES), ErrorHandlerAddr(ErrorHandlerAddr), TP(TP) {}
Expected<JITTargetAddress> LazyCallThroughManager::getCallThroughTrampoline(
JITDylib &SourceJD, SymbolStringPtr SymbolName,
- std::shared_ptr<NotifyResolvedFunction> NotifyResolved) {
+ NotifyResolvedFunction NotifyResolved) {
+ assert(TP && "TrampolinePool not set");
+
std::lock_guard<std::mutex> Lock(LCTMMutex);
auto Trampoline = TP->getTrampoline();
if (!Trampoline)
return Trampoline.takeError();
- Reexports[*Trampoline] = std::make_pair(&SourceJD, std::move(SymbolName));
+ Reexports[*Trampoline] = ReexportsEntry{&SourceJD, std::move(SymbolName)};
Notifiers[*Trampoline] = std::move(NotifyResolved);
return *Trampoline;
}
-JITTargetAddress
-LazyCallThroughManager::callThroughToSymbol(JITTargetAddress TrampolineAddr) {
- JITDylib *SourceJD = nullptr;
- SymbolStringPtr SymbolName;
-
- {
- std::lock_guard<std::mutex> Lock(LCTMMutex);
- auto I = Reexports.find(TrampolineAddr);
- if (I == Reexports.end())
- return ErrorHandlerAddr;
- SourceJD = I->second.first;
- SymbolName = I->second.second;
- }
-
- auto LookupResult = ES.lookup(
- makeJITDylibSearchOrder(SourceJD, JITDylibLookupFlags::MatchAllSymbols),
- SymbolName);
-
- if (!LookupResult) {
- ES.reportError(LookupResult.takeError());
- return ErrorHandlerAddr;
- }
+JITTargetAddress LazyCallThroughManager::reportCallThroughError(Error Err) {
+ ES.reportError(std::move(Err));
+ return ErrorHandlerAddr;
+}
- auto ResolvedAddr = LookupResult->getAddress();
+Expected<LazyCallThroughManager::ReexportsEntry>
+LazyCallThroughManager::findReexport(JITTargetAddress TrampolineAddr) {
+ std::lock_guard<std::mutex> Lock(LCTMMutex);
+ auto I = Reexports.find(TrampolineAddr);
+ if (I == Reexports.end())
+ return createStringError(inconvertibleErrorCode(),
+ "Missing reexport for trampoline address %p",
+ TrampolineAddr);
+ return I->second;
+}
- std::shared_ptr<NotifyResolvedFunction> NotifyResolved = nullptr;
+Error LazyCallThroughManager::notifyResolved(JITTargetAddress TrampolineAddr,
+ JITTargetAddress ResolvedAddr) {
+ NotifyResolvedFunction NotifyResolved;
{
std::lock_guard<std::mutex> Lock(LCTMMutex);
auto I = Notifiers.find(TrampolineAddr);
if (I != Notifiers.end()) {
- NotifyResolved = I->second;
+ NotifyResolved = std::move(I->second);
Notifiers.erase(I);
}
}
- if (NotifyResolved) {
- if (auto Err = (*NotifyResolved)(*SourceJD, SymbolName, ResolvedAddr)) {
- ES.reportError(std::move(Err));
- return ErrorHandlerAddr;
- }
- }
+ return NotifyResolved ? NotifyResolved(ResolvedAddr) : Error::success();
+}
- return ResolvedAddr;
+void LazyCallThroughManager::resolveTrampolineLandingAddress(
+ JITTargetAddress TrampolineAddr,
+ NotifyLandingResolvedFunction NotifyLandingResolved) {
+
+ auto Entry = findReexport(TrampolineAddr);
+ if (!Entry)
+ return NotifyLandingResolved(reportCallThroughError(Entry.takeError()));
+
+ ES.lookup(
+ LookupKind::Static,
+ makeJITDylibSearchOrder(Entry->SourceJD,
+ JITDylibLookupFlags::MatchAllSymbols),
+ SymbolLookupSet({Entry->SymbolName}), SymbolState::Ready,
+ [this, TrampolineAddr, SymbolName = Entry->SymbolName,
+ NotifyLandingResolved = std::move(NotifyLandingResolved)](
+ Expected<SymbolMap> Result) mutable {
+ if (Result) {
+ assert(Result->size() == 1 && "Unexpected result size");
+ assert(Result->count(SymbolName) && "Unexpected result value");
+ JITTargetAddress LandingAddr = (*Result)[SymbolName].getAddress();
+
+ if (auto Err = notifyResolved(TrampolineAddr, LandingAddr))
+ NotifyLandingResolved(reportCallThroughError(std::move(Err)));
+ else
+ NotifyLandingResolved(LandingAddr);
+ } else
+ NotifyLandingResolved(reportCallThroughError(Result.takeError()));
+ },
+ NoDependenciesToRegister);
}
Expected<std::unique_ptr<LazyCallThroughManager>>
@@ -125,15 +141,9 @@ LazyReexportsMaterializationUnit::LazyReexportsMaterializationUnit(
LazyCallThroughManager &LCTManager, IndirectStubsManager &ISManager,
JITDylib &SourceJD, SymbolAliasMap CallableAliases, ImplSymbolMap *SrcJDLoc,
VModuleKey K)
- : MaterializationUnit(extractFlags(CallableAliases), std::move(K)),
+ : MaterializationUnit(extractFlags(CallableAliases), nullptr, std::move(K)),
LCTManager(LCTManager), ISManager(ISManager), SourceJD(SourceJD),
- CallableAliases(std::move(CallableAliases)),
- NotifyResolved(LazyCallThroughManager::createNotifyResolvedFunction(
- [&ISManager](JITDylib &JD, const SymbolStringPtr &SymbolName,
- JITTargetAddress ResolvedAddr) {
- return ISManager.updatePointer(*SymbolName, ResolvedAddr);
- })),
- AliaseeTable(SrcJDLoc) {}
+ CallableAliases(std::move(CallableAliases)), AliaseeTable(SrcJDLoc) {}
StringRef LazyReexportsMaterializationUnit::getName() const {
return "<Lazy Reexports>";
@@ -159,7 +169,11 @@ void LazyReexportsMaterializationUnit::materialize(
for (auto &Alias : RequestedAliases) {
auto CallThroughTrampoline = LCTManager.getCallThroughTrampoline(
- SourceJD, Alias.second.Aliasee, NotifyResolved);
+ SourceJD, Alias.second.Aliasee,
+ [&ISManager = this->ISManager,
+ StubSym = Alias.first](JITTargetAddress ResolvedAddr) -> Error {
+ return ISManager.updatePointer(*StubSym, ResolvedAddr);
+ });
if (!CallThroughTrampoline) {
SourceJD.getExecutionSession().reportError(
diff --git a/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
new file mode 100644
index 000000000000..15c3aa79a2a8
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/MachOPlatform.cpp
@@ -0,0 +1,506 @@
+//===------ MachOPlatform.cpp - Utilities for executing MachO in Orc ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
+
+#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/ExecutionEngine/Orc/DebugUtils.h"
+#include "llvm/Support/BinaryByteStream.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace {
+
+struct objc_class;
+struct objc_image_info;
+struct objc_object;
+struct objc_selector;
+
+using Class = objc_class *;
+using id = objc_object *;
+using SEL = objc_selector *;
+
+using ObjCMsgSendTy = id (*)(id, SEL, ...);
+using ObjCReadClassPairTy = Class (*)(Class, const objc_image_info *);
+using SelRegisterNameTy = SEL (*)(const char *);
+
+enum class ObjCRegistrationAPI { Uninitialized, Unavailable, Initialized };
+
+ObjCRegistrationAPI ObjCRegistrationAPIState =
+ ObjCRegistrationAPI::Uninitialized;
+ObjCMsgSendTy objc_msgSend = nullptr;
+ObjCReadClassPairTy objc_readClassPair = nullptr;
+SelRegisterNameTy sel_registerName = nullptr;
+
+} // end anonymous namespace
+
+namespace llvm {
+namespace orc {
+
+template <typename FnTy>
+static Error setUpObjCRegAPIFunc(FnTy &Target, sys::DynamicLibrary &LibObjC,
+ const char *Name) {
+ if (void *Addr = LibObjC.getAddressOfSymbol(Name))
+ Target = reinterpret_cast<FnTy>(Addr);
+ else
+ return make_error<StringError>(
+ (Twine("Could not find address for ") + Name).str(),
+ inconvertibleErrorCode());
+ return Error::success();
+}
+
+Error enableObjCRegistration(const char *PathToLibObjC) {
+ // If we've already tried to initialize then just bail out.
+ if (ObjCRegistrationAPIState != ObjCRegistrationAPI::Uninitialized)
+ return Error::success();
+
+ ObjCRegistrationAPIState = ObjCRegistrationAPI::Unavailable;
+
+ std::string ErrMsg;
+ auto LibObjC =
+ sys::DynamicLibrary::getPermanentLibrary(PathToLibObjC, &ErrMsg);
+
+ if (!LibObjC.isValid())
+ return make_error<StringError>(std::move(ErrMsg), inconvertibleErrorCode());
+
+ if (auto Err = setUpObjCRegAPIFunc(objc_msgSend, LibObjC, "objc_msgSend"))
+ return Err;
+ if (auto Err = setUpObjCRegAPIFunc(objc_readClassPair, LibObjC,
+ "objc_readClassPair"))
+ return Err;
+ if (auto Err =
+ setUpObjCRegAPIFunc(sel_registerName, LibObjC, "sel_registerName"))
+ return Err;
+
+ ObjCRegistrationAPIState = ObjCRegistrationAPI::Initialized;
+ return Error::success();
+}
+
+bool objCRegistrationEnabled() {
+ return ObjCRegistrationAPIState == ObjCRegistrationAPI::Initialized;
+}
+
+void MachOJITDylibInitializers::runModInits() const {
+ for (const auto &ModInit : ModInitSections) {
+ for (uint64_t I = 0; I != ModInit.NumPtrs; ++I) {
+ auto *InitializerAddr = jitTargetAddressToPointer<uintptr_t *>(
+ ModInit.Address + (I * sizeof(uintptr_t)));
+ auto *Initializer =
+ jitTargetAddressToFunction<void (*)()>(*InitializerAddr);
+ Initializer();
+ }
+ }
+}
+
+void MachOJITDylibInitializers::registerObjCSelectors() const {
+ assert(objCRegistrationEnabled() && "ObjC registration not enabled.");
+
+ for (const auto &ObjCSelRefs : ObjCSelRefsSections) {
+ for (uint64_t I = 0; I != ObjCSelRefs.NumPtrs; ++I) {
+ auto SelEntryAddr = ObjCSelRefs.Address + (I * sizeof(uintptr_t));
+ const auto *SelName =
+ *jitTargetAddressToPointer<const char **>(SelEntryAddr);
+ auto Sel = sel_registerName(SelName);
+ *jitTargetAddressToPointer<SEL *>(SelEntryAddr) = Sel;
+ }
+ }
+}
+
+Error MachOJITDylibInitializers::registerObjCClasses() const {
+ assert(objCRegistrationEnabled() && "ObjC registration not enabled.");
+
+ struct ObjCClassCompiled {
+ void *Metaclass;
+ void *Parent;
+ void *Cache1;
+ void *Cache2;
+ void *Data;
+ };
+
+ auto *ImageInfo =
+ jitTargetAddressToPointer<const objc_image_info *>(ObjCImageInfoAddr);
+ auto ClassSelector = sel_registerName("class");
+
+ for (const auto &ObjCClassList : ObjCClassListSections) {
+ for (uint64_t I = 0; I != ObjCClassList.NumPtrs; ++I) {
+ auto ClassPtrAddr = ObjCClassList.Address + (I * sizeof(uintptr_t));
+ auto Cls = *jitTargetAddressToPointer<Class *>(ClassPtrAddr);
+ auto *ClassCompiled =
+ *jitTargetAddressToPointer<ObjCClassCompiled **>(ClassPtrAddr);
+ objc_msgSend(reinterpret_cast<id>(ClassCompiled->Parent), ClassSelector);
+ auto Registered = objc_readClassPair(Cls, ImageInfo);
+
+ // FIXME: Improve diagnostic by reporting the failed class's name.
+ if (Registered != Cls)
+ return make_error<StringError>("Unable to register Objective-C class",
+ inconvertibleErrorCode());
+ }
+ }
+ return Error::success();
+}
+
+MachOPlatform::MachOPlatform(
+ ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer,
+ std::unique_ptr<MemoryBuffer> StandardSymbolsObject)
+ : ES(ES), ObjLinkingLayer(ObjLinkingLayer),
+ StandardSymbolsObject(std::move(StandardSymbolsObject)) {
+ ObjLinkingLayer.addPlugin(std::make_unique<InitScraperPlugin>(*this));
+}
+
+Error MachOPlatform::setupJITDylib(JITDylib &JD) {
+ auto ObjBuffer = MemoryBuffer::getMemBuffer(
+ StandardSymbolsObject->getMemBufferRef(), false);
+ return ObjLinkingLayer.add(JD, std::move(ObjBuffer));
+}
+
+Error MachOPlatform::notifyAdding(JITDylib &JD, const MaterializationUnit &MU) {
+ const auto &InitSym = MU.getInitializerSymbol();
+ if (!InitSym)
+ return Error::success();
+
+ RegisteredInitSymbols[&JD].add(InitSym,
+ SymbolLookupFlags::WeaklyReferencedSymbol);
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatform: Registered init symbol " << *InitSym << " for MU "
+ << MU.getName() << "\n";
+ });
+ return Error::success();
+}
+
+Error MachOPlatform::notifyRemoving(JITDylib &JD, VModuleKey K) {
+ llvm_unreachable("Not supported yet");
+}
+
+Expected<MachOPlatform::InitializerSequence>
+MachOPlatform::getInitializerSequence(JITDylib &JD) {
+
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatform: Building initializer sequence for "
+ << JD.getName() << "\n";
+ });
+
+ std::vector<JITDylib *> DFSLinkOrder;
+
+ while (true) {
+
+ DenseMap<JITDylib *, SymbolLookupSet> NewInitSymbols;
+
+ ES.runSessionLocked([&]() {
+ DFSLinkOrder = getDFSLinkOrder(JD);
+
+ for (auto *InitJD : DFSLinkOrder) {
+ auto RISItr = RegisteredInitSymbols.find(InitJD);
+ if (RISItr != RegisteredInitSymbols.end()) {
+ NewInitSymbols[InitJD] = std::move(RISItr->second);
+ RegisteredInitSymbols.erase(RISItr);
+ }
+ }
+ });
+
+ if (NewInitSymbols.empty())
+ break;
+
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatform: Issuing lookups for new init symbols: "
+ "(lookup may require multiple rounds)\n";
+ for (auto &KV : NewInitSymbols)
+ dbgs() << " \"" << KV.first->getName() << "\": " << KV.second << "\n";
+ });
+
+ // Outside the lock, issue the lookup.
+ if (auto R = lookupInitSymbols(JD.getExecutionSession(), NewInitSymbols))
+ ; // Nothing to do in the success case.
+ else
+ return R.takeError();
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatform: Init symbol lookup complete, building init "
+ "sequence\n";
+ });
+
+ // Lock again to collect the initializers.
+ InitializerSequence FullInitSeq;
+ {
+ std::lock_guard<std::mutex> Lock(InitSeqsMutex);
+ for (auto *InitJD : reverse(DFSLinkOrder)) {
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatform: Appending inits for \"" << InitJD->getName()
+ << "\" to sequence\n";
+ });
+ auto ISItr = InitSeqs.find(InitJD);
+ if (ISItr != InitSeqs.end()) {
+ FullInitSeq.emplace_back(InitJD, std::move(ISItr->second));
+ InitSeqs.erase(ISItr);
+ }
+ }
+ }
+
+ return FullInitSeq;
+}
+
+Expected<MachOPlatform::DeinitializerSequence>
+MachOPlatform::getDeinitializerSequence(JITDylib &JD) {
+ std::vector<JITDylib *> DFSLinkOrder = getDFSLinkOrder(JD);
+
+ DeinitializerSequence FullDeinitSeq;
+ {
+ std::lock_guard<std::mutex> Lock(InitSeqsMutex);
+ for (auto *DeinitJD : DFSLinkOrder) {
+ FullDeinitSeq.emplace_back(DeinitJD, MachOJITDylibDeinitializers());
+ }
+ }
+
+ return FullDeinitSeq;
+}
+
+std::vector<JITDylib *> MachOPlatform::getDFSLinkOrder(JITDylib &JD) {
+ std::vector<JITDylib *> Result, WorkStack({&JD});
+ DenseSet<JITDylib *> Visited;
+
+ while (!WorkStack.empty()) {
+ auto *NextJD = WorkStack.back();
+ WorkStack.pop_back();
+ if (Visited.count(NextJD))
+ continue;
+ Visited.insert(NextJD);
+ Result.push_back(NextJD);
+ NextJD->withLinkOrderDo([&](const JITDylibSearchOrder &LO) {
+ for (auto &KV : LO)
+ WorkStack.push_back(KV.first);
+ });
+ }
+
+ return Result;
+}
+
+void MachOPlatform::registerInitInfo(
+ JITDylib &JD, JITTargetAddress ObjCImageInfoAddr,
+ MachOJITDylibInitializers::SectionExtent ModInits,
+ MachOJITDylibInitializers::SectionExtent ObjCSelRefs,
+ MachOJITDylibInitializers::SectionExtent ObjCClassList) {
+ std::lock_guard<std::mutex> Lock(InitSeqsMutex);
+
+ auto &InitSeq = InitSeqs[&JD];
+
+ InitSeq.setObjCImageInfoAddr(ObjCImageInfoAddr);
+
+ if (ModInits.Address)
+ InitSeq.addModInitsSection(std::move(ModInits));
+
+ if (ObjCSelRefs.Address)
+ InitSeq.addObjCSelRefsSection(std::move(ObjCSelRefs));
+
+ if (ObjCClassList.Address)
+ InitSeq.addObjCClassListSection(std::move(ObjCClassList));
+}
+
+static Expected<MachOJITDylibInitializers::SectionExtent>
+getSectionExtent(jitlink::LinkGraph &G, StringRef SectionName) {
+ auto *Sec = G.findSectionByName(SectionName);
+ if (!Sec)
+ return MachOJITDylibInitializers::SectionExtent();
+ jitlink::SectionRange R(*Sec);
+ if (R.getSize() % G.getPointerSize() != 0)
+ return make_error<StringError>(SectionName + " section size is not a "
+ "multiple of the pointer size",
+ inconvertibleErrorCode());
+ return MachOJITDylibInitializers::SectionExtent(
+ R.getStart(), R.getSize() / G.getPointerSize());
+}
+
+void MachOPlatform::InitScraperPlugin::modifyPassConfig(
+ MaterializationResponsibility &MR, const Triple &TT,
+ jitlink::PassConfiguration &Config) {
+
+ Config.PrePrunePasses.push_back([this, &MR](jitlink::LinkGraph &G) -> Error {
+ JITLinkSymbolVector InitSectionSymbols;
+ preserveInitSectionIfPresent(InitSectionSymbols, G, "__mod_init_func");
+ preserveInitSectionIfPresent(InitSectionSymbols, G, "__objc_selrefs");
+ preserveInitSectionIfPresent(InitSectionSymbols, G, "__objc_classlist");
+
+ if (!InitSymbolDeps.empty()) {
+ std::lock_guard<std::mutex> Lock(InitScraperMutex);
+ InitSymbolDeps[&MR] = std::move(InitSectionSymbols);
+ }
+
+ if (auto Err = processObjCImageInfo(G, MR))
+ return Err;
+
+ return Error::success();
+ });
+
+ Config.PostFixupPasses.push_back([this, &JD = MR.getTargetJITDylib()](
+ jitlink::LinkGraph &G) -> Error {
+ MachOJITDylibInitializers::SectionExtent ModInits, ObjCSelRefs,
+ ObjCClassList;
+
+ JITTargetAddress ObjCImageInfoAddr = 0;
+ if (auto *ObjCImageInfoSec = G.findSectionByName("__objc_image_info")) {
+ if (auto Addr = jitlink::SectionRange(*ObjCImageInfoSec).getStart()) {
+ ObjCImageInfoAddr = Addr;
+        LLVM_DEBUG({
+          dbgs() << "Recorded __objc_imageinfo @ " << formatv("{0:x16}", Addr)
+                 << "\n";
+        });
+ }
+ }
+
+ // Record __mod_init_func.
+ if (auto ModInitsOrErr = getSectionExtent(G, "__mod_init_func"))
+ ModInits = std::move(*ModInitsOrErr);
+ else
+ return ModInitsOrErr.takeError();
+
+ // Record __objc_selrefs.
+ if (auto ObjCSelRefsOrErr = getSectionExtent(G, "__objc_selrefs"))
+ ObjCSelRefs = std::move(*ObjCSelRefsOrErr);
+ else
+ return ObjCSelRefsOrErr.takeError();
+
+ // Record __objc_classlist.
+ if (auto ObjCClassListOrErr = getSectionExtent(G, "__objc_classlist"))
+ ObjCClassList = std::move(*ObjCClassListOrErr);
+ else
+ return ObjCClassListOrErr.takeError();
+
+ // Dump the scraped inits.
+ LLVM_DEBUG({
+ dbgs() << "MachOPlatform: Scraped " << G.getName() << " init sections:\n";
+ dbgs() << " __objc_selrefs: ";
+ if (ObjCSelRefs.NumPtrs)
+ dbgs() << ObjCSelRefs.NumPtrs << " pointer(s) at "
+ << formatv("{0:x16}", ObjCSelRefs.Address) << "\n";
+ else
+ dbgs() << "none\n";
+
+ dbgs() << " __objc_classlist: ";
+ if (ObjCClassList.NumPtrs)
+ dbgs() << ObjCClassList.NumPtrs << " pointer(s) at "
+ << formatv("{0:x16}", ObjCClassList.Address) << "\n";
+ else
+ dbgs() << "none\n";
+
+ dbgs() << " __mod_init_func: ";
+ if (ModInits.NumPtrs)
+ dbgs() << ModInits.NumPtrs << " pointer(s) at "
+ << formatv("{0:x16}", ModInits.Address) << "\n";
+ else
+ dbgs() << "none\n";
+ });
+
+ MP.registerInitInfo(JD, ObjCImageInfoAddr, std::move(ModInits),
+ std::move(ObjCSelRefs), std::move(ObjCClassList));
+
+ return Error::success();
+ });
+}
+
+ObjectLinkingLayer::Plugin::LocalDependenciesMap
+MachOPlatform::InitScraperPlugin::getSyntheticSymbolLocalDependencies(
+ MaterializationResponsibility &MR) {
+ std::lock_guard<std::mutex> Lock(InitScraperMutex);
+ auto I = InitSymbolDeps.find(&MR);
+ if (I != InitSymbolDeps.end()) {
+ LocalDependenciesMap Result;
+ Result[MR.getInitializerSymbol()] = std::move(I->second);
+ InitSymbolDeps.erase(&MR);
+ return Result;
+ }
+ return LocalDependenciesMap();
+}
+
+void MachOPlatform::InitScraperPlugin::preserveInitSectionIfPresent(
+ JITLinkSymbolVector &Symbols, jitlink::LinkGraph &G,
+ StringRef SectionName) {
+ if (auto *Sec = G.findSectionByName(SectionName)) {
+ auto SecBlocks = Sec->blocks();
+ if (!llvm::empty(SecBlocks))
+ Symbols.push_back(
+ &G.addAnonymousSymbol(**SecBlocks.begin(), 0, 0, false, true));
+ }
+}
+
+Error MachOPlatform::InitScraperPlugin::processObjCImageInfo(
+ jitlink::LinkGraph &G, MaterializationResponsibility &MR) {
+
+  // If there's an ObjC image info section then either
+ // (1) It's the first __objc_imageinfo we've seen in this JITDylib. In
+ // this case we name and record it.
+ // OR
+ // (2) We already have a recorded __objc_imageinfo for this JITDylib,
+ // in which case we just verify it.
+ auto *ObjCImageInfo = G.findSectionByName("__objc_imageinfo");
+ if (!ObjCImageInfo)
+ return Error::success();
+
+ auto ObjCImageInfoBlocks = ObjCImageInfo->blocks();
+
+ // Check that the section is not empty if present.
+ if (llvm::empty(ObjCImageInfoBlocks))
+ return make_error<StringError>("Empty __objc_imageinfo section in " +
+ G.getName(),
+ inconvertibleErrorCode());
+
+ // Check that there's only one block in the section.
+ if (std::next(ObjCImageInfoBlocks.begin()) != ObjCImageInfoBlocks.end())
+ return make_error<StringError>("Multiple blocks in __objc_imageinfo "
+ "section in " +
+ G.getName(),
+ inconvertibleErrorCode());
+
+ // Check that the __objc_imageinfo section is unreferenced.
+ // FIXME: We could optimize this check if Symbols had a ref-count.
+ for (auto &Sec : G.sections()) {
+ if (&Sec != ObjCImageInfo)
+ for (auto *B : Sec.blocks())
+ for (auto &E : B->edges())
+ if (E.getTarget().isDefined() &&
+ &E.getTarget().getBlock().getSection() == ObjCImageInfo)
+ return make_error<StringError>("__objc_imageinfo is referenced "
+ "within file " +
+ G.getName(),
+ inconvertibleErrorCode());
+ }
+
+ auto &ObjCImageInfoBlock = **ObjCImageInfoBlocks.begin();
+ auto *ObjCImageInfoData = ObjCImageInfoBlock.getContent().data();
+ auto Version = support::endian::read32(ObjCImageInfoData, G.getEndianness());
+ auto Flags =
+ support::endian::read32(ObjCImageInfoData + 4, G.getEndianness());
+
+ // Lock the mutex while we verify / update the ObjCImageInfos map.
+ std::lock_guard<std::mutex> Lock(InitScraperMutex);
+
+ auto ObjCImageInfoItr = ObjCImageInfos.find(&MR.getTargetJITDylib());
+ if (ObjCImageInfoItr != ObjCImageInfos.end()) {
+ // We've already registered an __objc_imageinfo section. Verify the
+ // content of this new section matches, then delete it.
+ if (ObjCImageInfoItr->second.first != Version)
+ return make_error<StringError>(
+ "ObjC version in " + G.getName() +
+ " does not match first registered version",
+ inconvertibleErrorCode());
+ if (ObjCImageInfoItr->second.second != Flags)
+ return make_error<StringError>("ObjC flags in " + G.getName() +
+ " do not match first registered flags",
+ inconvertibleErrorCode());
+
+ // __objc_imageinfo is valid. Delete the block.
+ for (auto *S : ObjCImageInfo->symbols())
+ G.removeDefinedSymbol(*S);
+ G.removeBlock(ObjCImageInfoBlock);
+ } else {
+ // We haven't registered an __objc_imageinfo section yet. Register and
+ // move on. The section should already be marked no-dead-strip.
+ ObjCImageInfos[&MR.getTargetJITDylib()] = std::make_pair(Version, Flags);
+ }
+
+ return Error::success();
+}
+
+} // End namespace orc.
+} // End namespace llvm.
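For context, enableObjCRegistration above resolves the libobjc entry points from a client-supplied dylib path; below is a sketch of the expected call pattern. The path shown is an assumption for a stock macOS install, and registration simply stays disabled if the lookup fails.

  #include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
  #include "llvm/Support/Error.h"
  #include "llvm/Support/raw_ostream.h"

  // Opt in to Objective-C class/selector registration before any MachO
  // initializers run; objCRegistrationEnabled() reports whether it took
  // effect.
  void tryEnableObjCRegistration() {
    if (auto Err =
            llvm::orc::enableObjCRegistration("/usr/lib/libobjc.A.dylib"))
      llvm::logAllUnhandledErrors(std::move(Err), llvm::errs(),
                                  "ObjC registration unavailable: ");
  }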
diff --git a/llvm/lib/ExecutionEngine/Orc/Mangling.cpp b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
new file mode 100644
index 000000000000..606304741cf7
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/Mangling.cpp
@@ -0,0 +1,160 @@
+//===----------- Mangling.cpp -- Name Mangling Utilities for ORC ----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ExecutionEngine/Orc/Mangling.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Mangler.h"
+#include "llvm/Object/MachO.h"
+#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "orc"
+
+namespace llvm {
+namespace orc {
+
+MangleAndInterner::MangleAndInterner(ExecutionSession &ES, const DataLayout &DL)
+ : ES(ES), DL(DL) {}
+
+SymbolStringPtr MangleAndInterner::operator()(StringRef Name) {
+ std::string MangledName;
+ {
+ raw_string_ostream MangledNameStream(MangledName);
+ Mangler::getNameWithPrefix(MangledNameStream, Name, DL);
+ }
+ return ES.intern(MangledName);
+}
+
+void IRSymbolMapper::add(ExecutionSession &ES, const ManglingOptions &MO,
+ ArrayRef<GlobalValue *> GVs,
+ SymbolFlagsMap &SymbolFlags,
+ SymbolNameToDefinitionMap *SymbolToDefinition) {
+ if (GVs.empty())
+ return;
+
+ MangleAndInterner Mangle(ES, GVs[0]->getParent()->getDataLayout());
+ for (auto *G : GVs) {
+ assert(G && "GVs cannot contain null elements");
+ if (!G->hasName() || G->isDeclaration() || G->hasLocalLinkage() ||
+ G->hasAvailableExternallyLinkage() || G->hasAppendingLinkage())
+ continue;
+
+ if (G->isThreadLocal() && MO.EmulatedTLS) {
+ auto *GV = cast<GlobalVariable>(G);
+
+ auto Flags = JITSymbolFlags::fromGlobalValue(*GV);
+
+ auto EmuTLSV = Mangle(("__emutls_v." + GV->getName()).str());
+ SymbolFlags[EmuTLSV] = Flags;
+ if (SymbolToDefinition)
+ (*SymbolToDefinition)[EmuTLSV] = GV;
+
+ // If this GV has a non-zero initializer we'll need to emit an
+      // __emutls_t symbol too.
+ if (GV->hasInitializer()) {
+ const auto *InitVal = GV->getInitializer();
+
+ // Skip zero-initializers.
+ if (isa<ConstantAggregateZero>(InitVal))
+ continue;
+ const auto *InitIntValue = dyn_cast<ConstantInt>(InitVal);
+ if (InitIntValue && InitIntValue->isZero())
+ continue;
+
+ auto EmuTLST = Mangle(("__emutls_t." + GV->getName()).str());
+ SymbolFlags[EmuTLST] = Flags;
+ if (SymbolToDefinition)
+ (*SymbolToDefinition)[EmuTLST] = GV;
+ }
+ continue;
+ }
+
+ // Otherwise we just need a normal linker mangling.
+ auto MangledName = Mangle(G->getName());
+ SymbolFlags[MangledName] = JITSymbolFlags::fromGlobalValue(*G);
+ if (SymbolToDefinition)
+ (*SymbolToDefinition)[MangledName] = G;
+ }
+}
+
+Expected<std::pair<SymbolFlagsMap, SymbolStringPtr>>
+getObjectSymbolInfo(ExecutionSession &ES, MemoryBufferRef ObjBuffer) {
+ auto Obj = object::ObjectFile::createObjectFile(ObjBuffer);
+
+ if (!Obj)
+ return Obj.takeError();
+
+ bool IsMachO = isa<object::MachOObjectFile>(Obj->get());
+
+ SymbolFlagsMap SymbolFlags;
+ for (auto &Sym : (*Obj)->symbols()) {
+ Expected<uint32_t> SymFlagsOrErr = Sym.getFlags();
+ if (!SymFlagsOrErr)
+ // TODO: Test this error.
+ return SymFlagsOrErr.takeError();
+
+ // Skip symbols not defined in this object file.
+ if (*SymFlagsOrErr & object::BasicSymbolRef::SF_Undefined)
+ continue;
+
+ // Skip symbols that are not global.
+ if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global))
+ continue;
+
+    // Skip symbols that have type ST_File.
+ if (auto SymType = Sym.getType()) {
+ if (*SymType == object::SymbolRef::ST_File)
+ continue;
+ } else
+ return SymType.takeError();
+
+ auto Name = Sym.getName();
+ if (!Name)
+ return Name.takeError();
+ auto InternedName = ES.intern(*Name);
+ auto SymFlags = JITSymbolFlags::fromObjectSymbol(Sym);
+ if (!SymFlags)
+ return SymFlags.takeError();
+
+ // Strip the 'exported' flag from MachO linker-private symbols.
+ if (IsMachO && Name->startswith("l"))
+ *SymFlags &= ~JITSymbolFlags::Exported;
+
+ SymbolFlags[InternedName] = std::move(*SymFlags);
+ }
+
+ SymbolStringPtr InitSymbol;
+
+ if (IsMachO) {
+ auto &MachOObj = cast<object::MachOObjectFile>(*Obj->get());
+ for (auto &Sec : MachOObj.sections()) {
+ auto SecType = MachOObj.getSectionType(Sec);
+ if ((SecType & MachO::SECTION_TYPE) == MachO::S_MOD_INIT_FUNC_POINTERS) {
+ size_t Counter = 0;
+ while (true) {
+ std::string InitSymString;
+ raw_string_ostream(InitSymString)
+ << "$." << ObjBuffer.getBufferIdentifier() << ".__inits."
+ << Counter++;
+ InitSymbol = ES.intern(InitSymString);
+ if (SymbolFlags.count(InitSymbol))
+ continue;
+ SymbolFlags[InitSymbol] =
+ JITSymbolFlags::MaterializationSideEffectsOnly;
+ break;
+ }
+ break;
+ }
+ }
+ }
+
+ return std::make_pair(std::move(SymbolFlags), std::move(InitSymbol));
+}
+
+} // End namespace orc.
+} // End namespace llvm.
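For context, getObjectSymbolInfo above now pairs the symbol-flags map with an optional MachO init symbol; below is a sketch of consuming that pair. The function name inspectObject and its reporting are illustrative only.

  #include "llvm/ExecutionEngine/Orc/Mangling.h"
  #include "llvm/Support/raw_ostream.h"

  // Scan an in-memory object and report whether it carries MachO initializers.
  llvm::Error inspectObject(llvm::orc::ExecutionSession &ES,
                            llvm::MemoryBufferRef ObjBuffer) {
    auto ObjSymInfo = llvm::orc::getObjectSymbolInfo(ES, ObjBuffer);
    if (!ObjSymInfo)
      return ObjSymInfo.takeError();
    auto &SymbolFlags = ObjSymInfo->first;
    auto &InitSymbol = ObjSymInfo->second; // null if no __mod_init_func section
    llvm::errs() << "defined symbols: " << SymbolFlags.size()
                 << ", init symbol: " << (InitSymbol ? "yes" : "no") << "\n";
    return llvm::Error::success();
  }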
diff --git a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
index 2572b7f4878d..02066b458dfc 100644
--- a/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ObjectLinkingLayer.cpp
@@ -50,9 +50,9 @@ public:
void lookup(const LookupMap &Symbols,
std::unique_ptr<JITLinkAsyncLookupContinuation> LC) override {
- JITDylibSearchOrder SearchOrder;
- MR.getTargetJITDylib().withSearchOrderDo(
- [&](const JITDylibSearchOrder &O) { SearchOrder = O; });
+ JITDylibSearchOrder LinkOrder;
+ MR.getTargetJITDylib().withLinkOrderDo(
+ [&](const JITDylibSearchOrder &LO) { LinkOrder = LO; });
auto &ES = Layer.getExecutionSession();
@@ -84,7 +84,13 @@ public:
}
};
- ES.lookup(LookupKind::Static, SearchOrder, std::move(LookupSet),
+ for (auto &KV : InternalNamedSymbolDeps) {
+ SymbolDependenceMap InternalDeps;
+ InternalDeps[&MR.getTargetJITDylib()] = std::move(KV.second);
+ MR.addDependencies(KV.first, InternalDeps);
+ }
+
+ ES.lookup(LookupKind::Static, LinkOrder, std::move(LookupSet),
SymbolState::Resolved, std::move(OnResolve),
[this](const SymbolDependenceMap &Deps) {
registerDependencies(Deps);
@@ -138,6 +144,56 @@ public:
if (!ExtraSymbolsToClaim.empty())
if (auto Err = MR.defineMaterializing(ExtraSymbolsToClaim))
return notifyFailed(std::move(Err));
+
+ {
+
+ // Check that InternedResult matches up with MR.getSymbols().
+ // This guards against faulty transformations / compilers / object caches.
+
+ // First check that there aren't any missing symbols.
+ size_t NumMaterializationSideEffectsOnlySymbols = 0;
+ SymbolNameVector ExtraSymbols;
+ SymbolNameVector MissingSymbols;
+ for (auto &KV : MR.getSymbols()) {
+
+        // If this is a materialization-side-effects-only symbol then bump
+        // the counter and make sure it's *not* defined; otherwise make
+ // sure that it is defined.
+ if (KV.second.hasMaterializationSideEffectsOnly()) {
+ ++NumMaterializationSideEffectsOnlySymbols;
+ if (InternedResult.count(KV.first))
+ ExtraSymbols.push_back(KV.first);
+ continue;
+ } else if (!InternedResult.count(KV.first))
+ MissingSymbols.push_back(KV.first);
+ }
+
+ // If there were missing symbols then report the error.
+ if (!MissingSymbols.empty()) {
+ ES.reportError(make_error<MissingSymbolDefinitions>(
+ G.getName(), std::move(MissingSymbols)));
+ MR.failMaterialization();
+ return;
+ }
+
+ // If there are more definitions than expected, add them to the
+ // ExtraSymbols vector.
+ if (InternedResult.size() >
+ MR.getSymbols().size() - NumMaterializationSideEffectsOnlySymbols) {
+ for (auto &KV : InternedResult)
+ if (!MR.getSymbols().count(KV.first))
+ ExtraSymbols.push_back(KV.first);
+ }
+
+ // If there were extra definitions then report the error.
+ if (!ExtraSymbols.empty()) {
+ ES.reportError(make_error<UnexpectedSymbolDefinitions>(
+ G.getName(), std::move(ExtraSymbols)));
+ MR.failMaterialization();
+ return;
+ }
+ }
+
if (auto Err = MR.notifyResolved(InternedResult)) {
Layer.getExecutionSession().reportError(std::move(Err));
MR.failMaterialization();
@@ -168,16 +224,22 @@ public:
// link graph to build the symbol dependence graph.
Config.PrePrunePasses.push_back(
[this](LinkGraph &G) { return externalizeWeakAndCommonSymbols(G); });
- Config.PostPrunePasses.push_back(
- [this](LinkGraph &G) { return computeNamedSymbolDependencies(G); });
Layer.modifyPassConfig(MR, TT, Config);
+ Config.PostPrunePasses.push_back(
+ [this](LinkGraph &G) { return computeNamedSymbolDependencies(G); });
+
return Error::success();
}
private:
- using AnonToNamedDependenciesMap = DenseMap<const Symbol *, SymbolNameSet>;
+ struct LocalSymbolNamedDependencies {
+ SymbolNameSet Internal, External;
+ };
+
+ using LocalSymbolNamedDependenciesMap =
+ DenseMap<const Symbol *, LocalSymbolNamedDependencies>;
Error externalizeWeakAndCommonSymbols(LinkGraph &G) {
auto &ES = Layer.getExecutionSession();
@@ -206,30 +268,69 @@ private:
Error computeNamedSymbolDependencies(LinkGraph &G) {
auto &ES = MR.getTargetJITDylib().getExecutionSession();
- auto AnonDeps = computeAnonDeps(G);
+ auto LocalDeps = computeLocalDeps(G);
+ // Compute dependencies for symbols defined in the JITLink graph.
for (auto *Sym : G.defined_symbols()) {
- // Skip anonymous and non-global atoms: we do not need dependencies for
- // these.
+ // Skip local symbols: we do not track dependencies for these.
if (Sym->getScope() == Scope::Local)
continue;
+ assert(Sym->hasName() &&
+ "Defined non-local jitlink::Symbol should have a name");
- auto SymName = ES.intern(Sym->getName());
- SymbolNameSet &SymDeps = NamedSymbolDeps[SymName];
+ SymbolNameSet ExternalSymDeps, InternalSymDeps;
+ // Find internal and external named symbol dependencies.
for (auto &E : Sym->getBlock().edges()) {
auto &TargetSym = E.getTarget();
- if (TargetSym.getScope() != Scope::Local)
- SymDeps.insert(ES.intern(TargetSym.getName()));
- else {
+ if (TargetSym.getScope() != Scope::Local) {
+ if (TargetSym.isExternal())
+ ExternalSymDeps.insert(ES.intern(TargetSym.getName()));
+ else if (&TargetSym != Sym)
+ InternalSymDeps.insert(ES.intern(TargetSym.getName()));
+ } else {
assert(TargetSym.isDefined() &&
- "Anonymous/local symbols must be defined");
- auto I = AnonDeps.find(&TargetSym);
- if (I != AnonDeps.end())
- for (auto &S : I->second)
- SymDeps.insert(S);
+ "local symbols must be defined");
+ auto I = LocalDeps.find(&TargetSym);
+ if (I != LocalDeps.end()) {
+ for (auto &S : I->second.External)
+ ExternalSymDeps.insert(S);
+ for (auto &S : I->second.Internal)
+ InternalSymDeps.insert(S);
+ }
+ }
+ }
+
+ if (ExternalSymDeps.empty() && InternalSymDeps.empty())
+ continue;
+
+ auto SymName = ES.intern(Sym->getName());
+ if (!ExternalSymDeps.empty())
+ ExternalNamedSymbolDeps[SymName] = std::move(ExternalSymDeps);
+ if (!InternalSymDeps.empty())
+ InternalNamedSymbolDeps[SymName] = std::move(InternalSymDeps);
+ }
+
+ for (auto &P : Layer.Plugins) {
+ auto SyntheticLocalDeps = P->getSyntheticSymbolLocalDependencies(MR);
+ if (SyntheticLocalDeps.empty())
+ continue;
+
+ for (auto &KV : SyntheticLocalDeps) {
+ auto &Name = KV.first;
+ auto &LocalDepsForName = KV.second;
+ for (auto *Local : LocalDepsForName) {
+ assert(Local->getScope() == Scope::Local &&
+ "Dependence on non-local symbol");
+ auto LocalNamedDepsItr = LocalDeps.find(Local);
+ if (LocalNamedDepsItr == LocalDeps.end())
+ continue;
+ for (auto &S : LocalNamedDepsItr->second.Internal)
+ InternalNamedSymbolDeps[Name].insert(S);
+ for (auto &S : LocalNamedDepsItr->second.External)
+ ExternalNamedSymbolDeps[Name].insert(S);
}
}
}
@@ -237,68 +338,85 @@ private:
return Error::success();
}
- AnonToNamedDependenciesMap computeAnonDeps(LinkGraph &G) {
+ LocalSymbolNamedDependenciesMap computeLocalDeps(LinkGraph &G) {
+ DenseMap<jitlink::Symbol *, DenseSet<jitlink::Symbol *>> DepMap;
- auto &ES = MR.getTargetJITDylib().getExecutionSession();
- AnonToNamedDependenciesMap DepMap;
-
- // For all anonymous symbols:
+ // For all local symbols:
// (1) Add their named dependencies.
// (2) Add them to the worklist for further iteration if they have any
- // depend on any other anonymous symbols.
+    //     dependencies on any other local symbols.
struct WorklistEntry {
- WorklistEntry(Symbol *Sym, DenseSet<Symbol *> SymAnonDeps)
- : Sym(Sym), SymAnonDeps(std::move(SymAnonDeps)) {}
+ WorklistEntry(Symbol *Sym, DenseSet<Symbol *> LocalDeps)
+ : Sym(Sym), LocalDeps(std::move(LocalDeps)) {}
Symbol *Sym = nullptr;
- DenseSet<Symbol *> SymAnonDeps;
+ DenseSet<Symbol *> LocalDeps;
};
std::vector<WorklistEntry> Worklist;
for (auto *Sym : G.defined_symbols())
- if (!Sym->hasName()) {
+ if (Sym->getScope() == Scope::Local) {
auto &SymNamedDeps = DepMap[Sym];
- DenseSet<Symbol *> SymAnonDeps;
+ DenseSet<Symbol *> LocalDeps;
for (auto &E : Sym->getBlock().edges()) {
auto &TargetSym = E.getTarget();
- if (TargetSym.hasName())
- SymNamedDeps.insert(ES.intern(TargetSym.getName()));
+ if (TargetSym.getScope() != Scope::Local)
+ SymNamedDeps.insert(&TargetSym);
else {
assert(TargetSym.isDefined() &&
- "Anonymous symbols must be defined");
- SymAnonDeps.insert(&TargetSym);
+ "local symbols must be defined");
+ LocalDeps.insert(&TargetSym);
}
}
- if (!SymAnonDeps.empty())
- Worklist.push_back(WorklistEntry(Sym, std::move(SymAnonDeps)));
+ if (!LocalDeps.empty())
+ Worklist.push_back(WorklistEntry(Sym, std::move(LocalDeps)));
}
- // Loop over all anonymous symbols with anonymous dependencies, propagating
- // their respective *named* dependencies. Iterate until we hit a stable
+ // Loop over all local symbols with local dependencies, propagating
+ // their respective non-local dependencies. Iterate until we hit a stable
// state.
bool Changed;
do {
Changed = false;
for (auto &WLEntry : Worklist) {
auto *Sym = WLEntry.Sym;
- auto &SymNamedDeps = DepMap[Sym];
- auto &SymAnonDeps = WLEntry.SymAnonDeps;
+ auto &NamedDeps = DepMap[Sym];
+ auto &LocalDeps = WLEntry.LocalDeps;
- for (auto *TargetSym : SymAnonDeps) {
+ for (auto *TargetSym : LocalDeps) {
auto I = DepMap.find(TargetSym);
if (I != DepMap.end())
for (const auto &S : I->second)
- Changed |= SymNamedDeps.insert(S).second;
+ Changed |= NamedDeps.insert(S).second;
}
}
} while (Changed);
- return DepMap;
+ // Intern the results to produce a mapping of jitlink::Symbol* to internal
+ // and external symbol names.
+ auto &ES = Layer.getExecutionSession();
+ LocalSymbolNamedDependenciesMap Result;
+ for (auto &KV : DepMap) {
+ auto *Local = KV.first;
+ assert(Local->getScope() == Scope::Local &&
+ "DepMap keys should all be local symbols");
+ auto &LocalNamedDeps = Result[Local];
+ for (auto *Named : KV.second) {
+ assert(Named->getScope() != Scope::Local &&
+ "DepMap values should all be non-local symbol sets");
+ if (Named->isExternal())
+ LocalNamedDeps.External.insert(ES.intern(Named->getName()));
+ else
+ LocalNamedDeps.Internal.insert(ES.intern(Named->getName()));
+ }
+ }
+
+ return Result;
}
void registerDependencies(const SymbolDependenceMap &QueryDeps) {
- for (auto &NamedDepsEntry : NamedSymbolDeps) {
+ for (auto &NamedDepsEntry : ExternalNamedSymbolDeps) {
auto &Name = NamedDepsEntry.first;
auto &NameDeps = NamedDepsEntry.second;
SymbolDependenceMap SymbolDeps;
@@ -323,7 +441,8 @@ private:
ObjectLinkingLayer &Layer;
MaterializationResponsibility MR;
std::unique_ptr<MemoryBuffer> ObjBuffer;
- DenseMap<SymbolStringPtr, SymbolNameSet> NamedSymbolDeps;
+ DenseMap<SymbolStringPtr, SymbolNameSet> ExternalNamedSymbolDeps;
+ DenseMap<SymbolStringPtr, SymbolNameSet> InternalNamedSymbolDeps;
};
ObjectLinkingLayer::Plugin::~Plugin() {}
@@ -426,18 +545,21 @@ EHFrameRegistrationPlugin::EHFrameRegistrationPlugin(
void EHFrameRegistrationPlugin::modifyPassConfig(
MaterializationResponsibility &MR, const Triple &TT,
PassConfiguration &PassConfig) {
- assert(!InProcessLinks.count(&MR) && "Link for MR already being tracked?");
- PassConfig.PostFixupPasses.push_back(
- createEHFrameRecorderPass(TT, [this, &MR](JITTargetAddress Addr,
- size_t Size) {
- if (Addr)
- InProcessLinks[&MR] = { Addr, Size };
+ PassConfig.PostFixupPasses.push_back(createEHFrameRecorderPass(
+ TT, [this, &MR](JITTargetAddress Addr, size_t Size) {
+ if (Addr) {
+ std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
+ assert(!InProcessLinks.count(&MR) &&
+ "Link for MR already being tracked?");
+ InProcessLinks[&MR] = {Addr, Size};
+ }
}));
}
Error EHFrameRegistrationPlugin::notifyEmitted(
MaterializationResponsibility &MR) {
+ std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
auto EHFrameRangeItr = InProcessLinks.find(&MR);
if (EHFrameRangeItr == InProcessLinks.end())
@@ -457,6 +579,8 @@ Error EHFrameRegistrationPlugin::notifyEmitted(
}
Error EHFrameRegistrationPlugin::notifyRemovingModule(VModuleKey K) {
+ std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
+
auto EHFrameRangeItr = TrackedEHFrameRanges.find(K);
if (EHFrameRangeItr == TrackedEHFrameRanges.end())
return Error::success();
@@ -470,6 +594,7 @@ Error EHFrameRegistrationPlugin::notifyRemovingModule(VModuleKey K) {
}
Error EHFrameRegistrationPlugin::notifyRemovingAllModules() {
+ std::lock_guard<std::mutex> Lock(EHFramePluginMutex);
std::vector<EHFrameRange> EHFrameRanges =
std::move(UntrackedEHFrameRanges);
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp
index 8ed23de419d1..18b3c5e12b1c 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/OrcABISupport.cpp
@@ -7,13 +7,46 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/OrcABISupport.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/Process.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "orc"
+
+using namespace llvm;
+
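+// Returns true if the stub block and pointer block neither overlap nor sit
+// further apart than ORCABI::StubToPointerMaxDisplacement, i.e. every stub
+// can reach its corresponding pointer slot with the ABI's addressing mode.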
+template <typename ORCABI>
+bool stubAndPointerRangesOk(JITTargetAddress StubBlockAddr,
+ JITTargetAddress PointerBlockAddr,
+ unsigned NumStubs) {
+ constexpr unsigned MaxDisp = ORCABI::StubToPointerMaxDisplacement;
+ JITTargetAddress FirstStub = StubBlockAddr;
+ JITTargetAddress LastStub = FirstStub + ((NumStubs - 1) * ORCABI::StubSize);
+ JITTargetAddress FirstPointer = PointerBlockAddr;
+ JITTargetAddress LastPointer =
+ FirstPointer + ((NumStubs - 1) * ORCABI::StubSize);
+
+ if (FirstStub < FirstPointer) {
+ if (LastStub >= FirstPointer)
+ return false; // Ranges overlap.
+ return (FirstPointer - FirstStub <= MaxDisp) &&
+ (LastPointer - LastStub <= MaxDisp); // Displacements in range.
+ }
+
+ if (LastPointer >= FirstStub)
+ return false; // Ranges overlap.
+
+ return (FirstStub - FirstPointer <= MaxDisp) &&
+ (LastStub - LastPointer <= MaxDisp);
+}
namespace llvm {
namespace orc {
-void OrcAArch64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
- void *CallbackMgr) {
+void OrcAArch64::writeResolverCode(char *ResolverWorkingMem,
+ JITTargetAddress ResolverTargetAddress,
+ JITTargetAddress ReentryFnAddr,
+ JITTargetAddress ReentryCtxAddr) {
const uint32_t ResolverCode[] = {
// resolver_entry:
@@ -48,7 +81,7 @@ void OrcAArch64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
0xadbf17e4, // 0x070: stp q4, q5, [sp, #-32]!
0xadbf0fe2, // 0x074: stp q2, q3, [sp, #-32]!
0xadbf07e0, // 0x078: stp q0, q1, [sp, #-32]!
- 0x580004e0, // 0x07c: ldr x0, Lcallbackmgr
+ 0x580004e0, // 0x07c: ldr x0, Lreentry_ctx_ptr
0xaa1e03e1, // 0x080: mov x1, x30
0xd1003021, // 0x084: sub x1, x1, #12
0x58000442, // 0x088: ldr x2, Lreentry_fn_ptr
@@ -87,43 +120,47 @@ void OrcAArch64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
0xd65f0220, // 0x10c: ret x17
0x01234567, // 0x110: Lreentry_fn_ptr:
0xdeadbeef, // 0x114: .quad 0
- 0x98765432, // 0x118: Lcallbackmgr:
+ 0x98765432, // 0x118: Lreentry_ctx_ptr:
0xcafef00d // 0x11c: .quad 0
};
const unsigned ReentryFnAddrOffset = 0x110;
- const unsigned CallbackMgrAddrOffset = 0x118;
+ const unsigned ReentryCtxAddrOffset = 0x118;
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
- sizeof(CallbackMgr));
+ memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnAddr,
+ sizeof(uint64_t));
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxAddr,
+ sizeof(uint64_t));
}
-void OrcAArch64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
+void OrcAArch64::writeTrampolines(char *TrampolineBlockWorkingMem,
+ JITTargetAddress TrampolineBlockTargetAddress,
+ JITTargetAddress ResolverAddr,
unsigned NumTrampolines) {
unsigned OffsetToPtr = alignTo(NumTrampolines * TrampolineSize, 8);
- memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void *));
+ memcpy(TrampolineBlockWorkingMem + OffsetToPtr, &ResolverAddr,
+ sizeof(uint64_t));
// OffsetToPtr is actually the offset from the PC for the 2nd instruction, so
// subtract 32-bits.
OffsetToPtr -= 4;
- uint32_t *Trampolines = reinterpret_cast<uint32_t *>(TrampolineMem);
+ uint32_t *Trampolines =
+ reinterpret_cast<uint32_t *>(TrampolineBlockWorkingMem);
for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize) {
Trampolines[3 * I + 0] = 0xaa1e03f1; // mov x17, x30
Trampolines[3 * I + 1] = 0x58000010 | (OffsetToPtr << 3); // adr x16, Lptr
Trampolines[3 * I + 2] = 0xd63f0200; // blr x16
}
-
}
-Error OrcAArch64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
- unsigned MinStubs,
- void *InitialPtrVal) {
+void OrcAArch64::writeIndirectStubsBlock(
+ char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+ JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) {
// Stub format is:
//
// .section __orc_stubs
@@ -144,68 +181,41 @@ Error OrcAArch64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
//
// ...
- const unsigned StubSize = IndirectStubsInfo::StubSize;
-
- // Emit at least MinStubs, rounded up to fill the pages allocated.
- static const unsigned PageSize = sys::Process::getPageSizeEstimate();
- unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
- unsigned NumStubs = (NumPages * PageSize) / StubSize;
-
- // Allocate memory for stubs and pointers in one call.
- std::error_code EC;
- auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- 2 * NumPages * PageSize, nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
-
- if (EC)
- return errorCodeToError(EC);
-
- // Create separate MemoryBlocks representing the stubs and pointers.
- sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
- sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
- NumPages * PageSize,
- NumPages * PageSize);
-
- // Populate the stubs page stubs and mark it executable.
- uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlock.base());
- uint64_t PtrOffsetField = static_cast<uint64_t>(NumPages * PageSize)
- << 3;
+ static_assert(StubSize == PointerSize,
+ "Pointer and stub size must match for algorithm below");
+ assert(stubAndPointerRangesOk<OrcAArch64>(
+ StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) &&
+ "PointersBlock is out of range");
+ uint64_t PtrDisplacement =
+ PointersBlockTargetAddress - StubsBlockTargetAddress;
+ uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlockWorkingMem);
+ uint64_t PtrOffsetField = PtrDisplacement << 3;
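+ // Each stub is "ldr x16, Lptr; br x16" packed into one 64-bit word
+ // (0x58000010 in the low half, 0xd61f0200 in the high half). The ldr-literal
+ // imm19 field starts at bit 5 and is scaled by 4, so the (loop-invariant)
+ // pointer displacement is shifted left by 3 to land in the immediate field.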
for (unsigned I = 0; I < NumStubs; ++I)
Stub[I] = 0xd61f020058000010 | PtrOffsetField;
-
- if (auto EC = sys::Memory::protectMappedMemory(
- StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
- return errorCodeToError(EC);
-
- // Initialize all pointers to point at FailureAddress.
- void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
- for (unsigned I = 0; I < NumStubs; ++I)
- Ptr[I] = InitialPtrVal;
-
- StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
-
- return Error::success();
}
-void OrcX86_64_Base::writeTrampolines(uint8_t *TrampolineMem,
- void *ResolverAddr,
- unsigned NumTrampolines) {
+void OrcX86_64_Base::writeTrampolines(
+ char *TrampolineBlockWorkingMem,
+ JITTargetAddress TrampolineBlockTargetAddress,
+ JITTargetAddress ResolverAddr, unsigned NumTrampolines) {
unsigned OffsetToPtr = NumTrampolines * TrampolineSize;
- memcpy(TrampolineMem + OffsetToPtr, &ResolverAddr, sizeof(void *));
+ memcpy(TrampolineBlockWorkingMem + OffsetToPtr, &ResolverAddr,
+ sizeof(uint64_t));
- uint64_t *Trampolines = reinterpret_cast<uint64_t *>(TrampolineMem);
+ uint64_t *Trampolines =
+ reinterpret_cast<uint64_t *>(TrampolineBlockWorkingMem);
uint64_t CallIndirPCRel = 0xf1c40000000015ff;
for (unsigned I = 0; I < NumTrampolines; ++I, OffsetToPtr -= TrampolineSize)
Trampolines[I] = CallIndirPCRel | ((OffsetToPtr - 6) << 16);
}
-Error OrcX86_64_Base::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
- unsigned MinStubs,
- void *InitialPtrVal) {
+void OrcX86_64_Base::writeIndirectStubsBlock(
+ char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+ JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) {
// Stub format is:
//
// .section __orc_stubs
@@ -226,52 +236,28 @@ Error OrcX86_64_Base::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
//
// ...
- const unsigned StubSize = IndirectStubsInfo::StubSize;
-
- // Emit at least MinStubs, rounded up to fill the pages allocated.
- static const unsigned PageSize = sys::Process::getPageSizeEstimate();
- unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
- unsigned NumStubs = (NumPages * PageSize) / StubSize;
-
- // Allocate memory for stubs and pointers in one call.
- std::error_code EC;
- auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- 2 * NumPages * PageSize, nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
-
- if (EC)
- return errorCodeToError(EC);
-
- // Create separate MemoryBlocks representing the stubs and pointers.
- sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
- sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
- NumPages * PageSize,
- NumPages * PageSize);
-
// Populate the stubs page stubs and mark it executable.
- uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlock.base());
- uint64_t PtrOffsetField = static_cast<uint64_t>(NumPages * PageSize - 6)
- << 16;
+ static_assert(StubSize == PointerSize,
+ "Pointer and stub size must match for algorithm below");
+ assert(stubAndPointerRangesOk<OrcX86_64_Base>(
+ StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) &&
+ "PointersBlock is out of range");
+ uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlockWorkingMem);
+ uint64_t PtrOffsetField =
+ (PointersBlockTargetAddress - StubsBlockTargetAddress - 6) << 16;
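+ // Each stub is a 6-byte "jmpq *ptr(%rip)" (ff 25 <disp32>) padded to 8
+ // bytes; since stubs and pointer slots advance in lockstep, the RIP-relative
+ // displacement (pointer block start - stub block start - 6) is the same for
+ // every stub.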
for (unsigned I = 0; I < NumStubs; ++I)
Stub[I] = 0xF1C40000000025ff | PtrOffsetField;
-
- if (auto EC = sys::Memory::protectMappedMemory(
- StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
- return errorCodeToError(EC);
-
- // Initialize all pointers to point at FailureAddress.
- void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
- for (unsigned I = 0; I < NumStubs; ++I)
- Ptr[I] = InitialPtrVal;
-
- StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
-
- return Error::success();
}
-void OrcX86_64_SysV::writeResolverCode(uint8_t *ResolverMem,
- JITReentryFn ReentryFn,
- void *CallbackMgr) {
+void OrcX86_64_SysV::writeResolverCode(char *ResolverWorkingMem,
+ JITTargetAddress ResolverTargetAddress,
+ JITTargetAddress ReentryFnAddr,
+ JITTargetAddress ReentryCtxAddr) {
+
+ LLVM_DEBUG({
+ dbgs() << "Writing resolver code to "
+ << formatv("{0:x16}", ResolverTargetAddress) << "\n";
+ });
const uint8_t ResolverCode[] = {
// resolver_entry:
@@ -295,7 +281,7 @@ void OrcX86_64_SysV::writeResolverCode(uint8_t *ResolverMem,
0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp)
0x48, 0xbf, // 0x26: movabsq <CBMgr>, %rdi
- // 0x28: Callback manager addr.
+ // 0x28: JIT re-entry ctx addr.
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x48, 0x8b, 0x75, 0x08, // 0x30: movq 8(%rbp), %rsi
@@ -325,23 +311,26 @@ void OrcX86_64_SysV::writeResolverCode(uint8_t *ResolverMem,
0x58, // 0x69: popq %rax
0x5d, // 0x6a: popq %rbp
0xc3, // 0x6b: retq
- };
+ };
const unsigned ReentryFnAddrOffset = 0x3a;
- const unsigned CallbackMgrAddrOffset = 0x28;
+ const unsigned ReentryCtxAddrOffset = 0x28;
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
- sizeof(CallbackMgr));
+ memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnAddr,
+ sizeof(uint64_t));
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxAddr,
+ sizeof(uint64_t));
}
-void OrcX86_64_Win32::writeResolverCode(uint8_t *ResolverMem,
- JITReentryFn ReentryFn,
- void *CallbackMgr) {
+void OrcX86_64_Win32::writeResolverCode(char *ResolverWorkingMem,
+ JITTargetAddress ResolverTargetAddress,
+ JITTargetAddress ReentryFnAddr,
+ JITTargetAddress ReentryCtxAddr) {
- // resolverCode is similar to OrcX86_64 with differences specific to windows x64 calling convention:
- // arguments go into rcx, rdx and come in reverse order, shadow space allocation on stack
+ // resolverCode is similar to OrcX86_64 with differences specific to the
+ // Windows x64 calling convention: arguments go in rcx and rdx (in reverse
+ // order), and shadow space is allocated on the stack.
const uint8_t ResolverCode[] = {
// resolver_entry:
0x55, // 0x00: pushq %rbp
@@ -364,7 +353,7 @@ void OrcX86_64_Win32::writeResolverCode(uint8_t *ResolverMem,
0x48, 0x0f, 0xae, 0x04, 0x24, // 0x21: fxsave64 (%rsp)
0x48, 0xb9, // 0x26: movabsq <CBMgr>, %rcx
- // 0x28: Callback manager addr.
+ // 0x28: JIT re-entry ctx addr.
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
0x48, 0x8B, 0x55, 0x08, // 0x30: mov rdx, [rbp+0x8]
@@ -402,18 +391,23 @@ void OrcX86_64_Win32::writeResolverCode(uint8_t *ResolverMem,
0xc3, // 0x73: retq
};
-
const unsigned ReentryFnAddrOffset = 0x3a;
- const unsigned CallbackMgrAddrOffset = 0x28;
+ const unsigned ReentryCtxAddrOffset = 0x28;
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
- sizeof(CallbackMgr));
+ memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnAddr,
+ sizeof(uint64_t));
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxAddr,
+ sizeof(uint64_t));
}
-void OrcI386::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
- void *CallbackMgr) {
+void OrcI386::writeResolverCode(char *ResolverWorkingMem,
+ JITTargetAddress ResolverTargetAddress,
+ JITTargetAddress ReentryFnAddr,
+ JITTargetAddress ReentryCtxAddr) {
+
+ assert((ReentryFnAddr >> 32) == 0 && "ReentryFnAddr out of range");
+ assert((ReentryCtxAddr >> 32) == 0 && "ReentryCtxAddr out of range");
const uint8_t ResolverCode[] = {
// resolver_entry:
@@ -451,29 +445,37 @@ void OrcI386::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
};
const unsigned ReentryFnAddrOffset = 0x2a;
- const unsigned CallbackMgrAddrOffset = 0x25;
+ const unsigned ReentryCtxAddrOffset = 0x25;
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryFn, sizeof(ReentryFn));
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallbackMgr,
- sizeof(CallbackMgr));
+ memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnAddr,
+ sizeof(uint32_t));
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxAddr,
+ sizeof(uint32_t));
}
-void OrcI386::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
+void OrcI386::writeTrampolines(char *TrampolineWorkingMem,
+ JITTargetAddress TrampolineBlockTargetAddress,
+ JITTargetAddress ResolverAddr,
unsigned NumTrampolines) {
+ assert((ResolverAddr >> 32) == 0 && "ResolverAddr out of range");
uint64_t CallRelImm = 0xF1C4C400000000e8;
- uint64_t Resolver = reinterpret_cast<uint64_t>(ResolverAddr);
- uint64_t ResolverRel =
- Resolver - reinterpret_cast<uint64_t>(TrampolineMem) - 5;
+ uint64_t ResolverRel = ResolverAddr - TrampolineBlockTargetAddress - 5;
- uint64_t *Trampolines = reinterpret_cast<uint64_t *>(TrampolineMem);
+ uint64_t *Trampolines = reinterpret_cast<uint64_t *>(TrampolineWorkingMem);
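+ // Each trampoline is "call rel32" (e8 <rel32>) padded to 8 bytes; rel32 is
+ // measured from the end of the 5-byte call and shrinks by TrampolineSize
+ // for each successive trampoline.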
for (unsigned I = 0; I < NumTrampolines; ++I, ResolverRel -= TrampolineSize)
Trampolines[I] = CallRelImm | (ResolverRel << 8);
}
-Error OrcI386::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
- unsigned MinStubs, void *InitialPtrVal) {
+void OrcI386::writeIndirectStubsBlock(
+ char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+ JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) {
+ assert((StubsBlockTargetAddress >> 32) == 0 &&
+ "StubsBlockTargetAddress is out of range");
+ assert((PointersBlockTargetAddress >> 32) == 0 &&
+ "PointersBlockTargetAddress is out of range");
+
// Stub format is:
//
// .section __orc_stubs
@@ -494,51 +496,21 @@ Error OrcI386::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
//
// ...
- const unsigned StubSize = IndirectStubsInfo::StubSize;
+ assert(stubAndPointerRangesOk<OrcI386>(
+ StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) &&
+ "PointersBlock is out of range");
- // Emit at least MinStubs, rounded up to fill the pages allocated.
- static const unsigned PageSize = sys::Process::getPageSizeEstimate();
- unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
- unsigned NumStubs = (NumPages * PageSize) / StubSize;
-
- // Allocate memory for stubs and pointers in one call.
- std::error_code EC;
- auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- 2 * NumPages * PageSize, nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
-
- if (EC)
- return errorCodeToError(EC);
-
- // Create separate MemoryBlocks representing the stubs and pointers.
- sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
- sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
- NumPages * PageSize,
- NumPages * PageSize);
-
- // Populate the stubs page stubs and mark it executable.
- uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlock.base());
- uint64_t PtrAddr = reinterpret_cast<uint64_t>(PtrsBlock.base());
+ uint64_t *Stub = reinterpret_cast<uint64_t *>(StubsBlockWorkingMem);
+ uint64_t PtrAddr = PointersBlockTargetAddress;
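+ // Each stub is "jmpl *ptr" (ff 25 <abs32>) padded to 8 bytes; the 32-bit
+ // absolute address of the pointer slot goes into bytes 2-5, hence the << 16.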
for (unsigned I = 0; I < NumStubs; ++I, PtrAddr += 4)
Stub[I] = 0xF1C40000000025ff | (PtrAddr << 16);
-
- if (auto EC = sys::Memory::protectMappedMemory(
- StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
- return errorCodeToError(EC);
-
- // Initialize all pointers to point at FailureAddress.
- void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
- for (unsigned I = 0; I < NumStubs; ++I)
- Ptr[I] = InitialPtrVal;
-
- StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
-
- return Error::success();
}
-void OrcMips32_Base::writeResolverCode(uint8_t *ResolverMem,
- JITReentryFn ReentryFn,
- void *CallbackMgr, bool isBigEndian) {
+void OrcMips32_Base::writeResolverCode(char *ResolverWorkingMem,
+ JITTargetAddress ResolverTargetAddress,
+ JITTargetAddress ReentryFnAddr,
+ JITTargetAddress ReentryCtxAddr,
+ bool isBigEndian) {
const uint32_t ResolverCode[] = {
// resolver_entry:
@@ -570,9 +542,9 @@ void OrcMips32_Base::writeResolverCode(uint8_t *ResolverMem,
0xafbe0060, // 0x64: sw $fp,96($sp)
0xafbf0064, // 0x68: sw $ra,100($sp)
- // Callback manager addr.
- 0x00000000, // 0x6c: lui $a0,callbackmgr
- 0x00000000, // 0x70: addiu $a0,$a0,callbackmgr
+ // JIT re-entry ctx addr.
+ 0x00000000, // 0x6c: lui $a0,ctx
+ 0x00000000, // 0x70: addiu $a0,$a0,ctx
0x03e02825, // 0x74: move $a1, $ra
0x24a5ffec, // 0x78: addiu $a1,$a1,-20
@@ -614,50 +586,63 @@ void OrcMips32_Base::writeResolverCode(uint8_t *ResolverMem,
};
const unsigned ReentryFnAddrOffset = 0x7c; // JIT re-entry fn addr lui
- const unsigned CallbackMgrAddrOffset = 0x6c; // Callback manager addr lui
+ const unsigned ReentryCtxAddrOffset = 0x6c; // JIT re-entry context addr lui
const unsigned Offsett = 0xf8;
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
+ memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
// Depending on endian return value will be in v0 or v1.
uint32_t MoveVxT9 = isBigEndian ? 0x0060c825 : 0x0040c825;
- memcpy(ResolverMem + Offsett, &MoveVxT9, sizeof(MoveVxT9));
-
- uint64_t CallMgrAddr = reinterpret_cast<uint64_t>(CallbackMgr);
- uint32_t CallMgrLUi = 0x3c040000 | (((CallMgrAddr + 0x8000) >> 16) & 0xFFFF);
- uint32_t CallMgrADDiu = 0x24840000 | ((CallMgrAddr) & 0xFFFF);
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallMgrLUi, sizeof(CallMgrLUi));
- memcpy(ResolverMem + CallbackMgrAddrOffset + 4, &CallMgrADDiu,
- sizeof(CallMgrADDiu));
-
- uint64_t ReentryAddr = reinterpret_cast<uint64_t>(ReentryFn);
- uint32_t ReentryLUi = 0x3c190000 | (((ReentryAddr + 0x8000) >> 16) & 0xFFFF);
- uint32_t ReentryADDiu = 0x27390000 | ((ReentryAddr) & 0xFFFF);
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryLUi, sizeof(ReentryLUi));
- memcpy(ResolverMem + ReentryFnAddrOffset + 4, &ReentryADDiu,
- sizeof(ReentryADDiu));
+ memcpy(ResolverWorkingMem + Offsett, &MoveVxT9, sizeof(MoveVxT9));
+
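+ // %hi/%lo materialization: addiu sign-extends its 16-bit immediate, so the
+ // upper half below is computed as ((Addr + 0x8000) >> 16) to pre-compensate
+ // when bit 15 of the low half is set.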
+ uint32_t ReentryCtxLUi =
+ 0x3c040000 | (((ReentryCtxAddr + 0x8000) >> 16) & 0xFFFF);
+ uint32_t ReentryCtxADDiu = 0x24840000 | ((ReentryCtxAddr)&0xFFFF);
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxLUi,
+ sizeof(ReentryCtxLUi));
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset + 4, &ReentryCtxADDiu,
+ sizeof(ReentryCtxADDiu));
+
+ uint32_t ReentryFnLUi =
+ 0x3c190000 | (((ReentryFnAddr + 0x8000) >> 16) & 0xFFFF);
+ uint32_t ReentryFnADDiu = 0x27390000 | ((ReentryFnAddr)&0xFFFF);
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnLUi,
+ sizeof(ReentryFnLUi));
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset + 4, &ReentryFnADDiu,
+ sizeof(ReentryFnADDiu));
}
-void OrcMips32_Base::writeTrampolines(uint8_t *TrampolineMem,
- void *ResolverAddr,
- unsigned NumTrampolines) {
+void OrcMips32_Base::writeTrampolines(
+ char *TrampolineBlockWorkingMem,
+ JITTargetAddress TrampolineBlockTargetAddress,
+ JITTargetAddress ResolverAddr, unsigned NumTrampolines) {
- uint32_t *Trampolines = reinterpret_cast<uint32_t *>(TrampolineMem);
- uint64_t ResolveAddr = reinterpret_cast<uint64_t>(ResolverAddr);
- uint32_t RHiAddr = ((ResolveAddr + 0x8000) >> 16);
+ assert((ResolverAddr >> 32) == 0 && "ResolverAddr out of range");
+
+ uint32_t *Trampolines =
+ reinterpret_cast<uint32_t *>(TrampolineBlockWorkingMem);
+ uint32_t RHiAddr = ((ResolverAddr + 0x8000) >> 16);
for (unsigned I = 0; I < NumTrampolines; ++I) {
- Trampolines[5 * I + 0] = 0x03e0c025; // move $t8,$ra
- Trampolines[5 * I + 1] = 0x3c190000 | (RHiAddr & 0xFFFF); // lui $t9,resolveAddr
- Trampolines[5 * I + 2] = 0x27390000 | (ResolveAddr & 0xFFFF); // addiu $t9,$t9,resolveAddr
- Trampolines[5 * I + 3] = 0x0320f809; // jalr $t9
- Trampolines[5 * I + 4] = 0x00000000; // nop
+ // move $t8,$ra
+ // lui $t9,ResolverAddr
+ // addiu $t9,$t9,ResolverAddr
+ // jalr $t9
+ // nop
+ Trampolines[5 * I + 0] = 0x03e0c025;
+ Trampolines[5 * I + 1] = 0x3c190000 | (RHiAddr & 0xFFFF);
+ Trampolines[5 * I + 2] = 0x27390000 | (ResolverAddr & 0xFFFF);
+ Trampolines[5 * I + 3] = 0x0320f809;
+ Trampolines[5 * I + 4] = 0x00000000;
}
}
-Error OrcMips32_Base::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
- unsigned MinStubs,
- void *InitialPtrVal) {
+void OrcMips32_Base::writeIndirectStubsBlock(
+ char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+ JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) {
+ assert((StubsBlockTargetAddress >> 32) == 0 &&
+ "InitialPtrVal is out of range");
+
// Stub format is:
//
// .section __orc_stubs
@@ -678,33 +663,15 @@ Error OrcMips32_Base::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
// ptr2:
// .word 0x0
//
- // ...
-
- const unsigned StubSize = IndirectStubsInfo::StubSize;
-
- // Emit at least MinStubs, rounded up to fill the pages allocated.
- static const unsigned PageSize = sys::Process::getPageSizeEstimate();
- unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
- unsigned NumStubs = (NumPages * PageSize) / StubSize;
+ // ...
- // Allocate memory for stubs and pointers in one call.
- std::error_code EC;
- auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- 2 * NumPages * PageSize, nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
-
- if (EC)
- return errorCodeToError(EC);
-
- // Create separate MemoryBlocks representing the stubs and pointers.
- sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
- sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
- NumPages * PageSize,
- NumPages * PageSize);
+ assert(stubAndPointerRangesOk<OrcAArch64>(
+ StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) &&
+ "PointersBlock is out of range");
// Populate the stubs page stubs and mark it executable.
- uint32_t *Stub = reinterpret_cast<uint32_t *>(StubsBlock.base());
- uint64_t PtrAddr = reinterpret_cast<uint64_t>(Stub) + NumPages * PageSize;
+ uint32_t *Stub = reinterpret_cast<uint32_t *>(StubsBlockWorkingMem);
+ uint64_t PtrAddr = PointersBlockTargetAddress;
for (unsigned I = 0; I < NumStubs; ++I) {
uint32_t HiAddr = ((PtrAddr + 0x8000) >> 16);
@@ -714,26 +681,15 @@ Error OrcMips32_Base::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
Stub[4 * I + 3] = 0x00000000; // nop
PtrAddr += 4;
}
-
- if (auto EC = sys::Memory::protectMappedMemory(
- StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
- return errorCodeToError(EC);
-
- // Initialize all pointers to point at FailureAddress.
- void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
- for (unsigned I = 0; I < NumStubs; ++I)
- Ptr[I] = InitialPtrVal;
-
- StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
-
- return Error::success();
}
-void OrcMips64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
- void *CallbackMgr) {
+void OrcMips64::writeResolverCode(char *ResolverWorkingMem,
+ JITTargetAddress ResolverTargetAddress,
+ JITTargetAddress ReentryFnAddr,
+ JITTargetAddress ReentryCtxAddr) {
const uint32_t ResolverCode[] = {
- //resolver_entry:
+ //resolver_entry:
0x67bdff30, // 0x00: daddiu $sp,$sp,-208
0xffa20000, // 0x04: sd v0,0(sp)
0xffa30008, // 0x08: sd v1,8(sp)
@@ -762,13 +718,13 @@ void OrcMips64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
0xffbe00c0, // 0x64: sd fp,192(sp)
0xffbf00c8, // 0x68: sd ra,200(sp)
- // Callback manager addr.
- 0x00000000, // 0x6c: lui $a0,heighest(callbackmgr)
- 0x00000000, // 0x70: daddiu $a0,$a0,heigher(callbackmgr)
+ // JIT re-entry ctx addr.
+ 0x00000000, // 0x6c: lui $a0,highest(ctx)
+ 0x00000000, // 0x70: daddiu $a0,$a0,higher(ctx)
0x00000000, // 0x74: dsll $a0,$a0,16
- 0x00000000, // 0x78: daddiu $a0,$a0,hi(callbackmgr)
+ 0x00000000, // 0x78: daddiu $a0,$a0,hi(ctx)
0x00000000, // 0x7c: dsll $a0,$a0,16
- 0x00000000, // 0x80: daddiu $a0,$a0,lo(callbackmgr)
+ 0x00000000, // 0x80: daddiu $a0,$a0,lo(ctx)
0x03e02825, // 0x84: move $a1, $ra
0x64a5ffdc, // 0x88: daddiu $a1,$a1,-36
@@ -814,73 +770,73 @@ void OrcMips64::writeResolverCode(uint8_t *ResolverMem, JITReentryFn ReentryFn,
};
const unsigned ReentryFnAddrOffset = 0x8c; // JIT re-entry fn addr lui
- const unsigned CallbackMgrAddrOffset = 0x6c; // Callback manager addr lui
-
- memcpy(ResolverMem, ResolverCode, sizeof(ResolverCode));
-
- uint64_t CallMgrAddr = reinterpret_cast<uint64_t>(CallbackMgr);
-
- uint32_t CallMgrLUi =
- 0x3c040000 | (((CallMgrAddr + 0x800080008000) >> 48) & 0xFFFF);
- uint32_t CallMgrDADDiu =
- 0x64840000 | (((CallMgrAddr + 0x80008000) >> 32) & 0xFFFF);
- uint32_t CallMgrDSLL = 0x00042438;
- uint32_t CallMgrDADDiu2 =
- 0x64840000 | ((((CallMgrAddr + 0x8000) >> 16) & 0xFFFF));
- uint32_t CallMgrDSLL2 = 0x00042438;
- uint32_t CallMgrDADDiu3 = 0x64840000 | ((CallMgrAddr)&0xFFFF);
-
- memcpy(ResolverMem + CallbackMgrAddrOffset, &CallMgrLUi, sizeof(CallMgrLUi));
- memcpy(ResolverMem + (CallbackMgrAddrOffset + 4), &CallMgrDADDiu,
- sizeof(CallMgrDADDiu));
- memcpy(ResolverMem + (CallbackMgrAddrOffset + 8), &CallMgrDSLL,
- sizeof(CallMgrDSLL));
- memcpy(ResolverMem + (CallbackMgrAddrOffset + 12), &CallMgrDADDiu2,
- sizeof(CallMgrDADDiu2));
- memcpy(ResolverMem + (CallbackMgrAddrOffset + 16), &CallMgrDSLL2,
- sizeof(CallMgrDSLL2));
- memcpy(ResolverMem + (CallbackMgrAddrOffset + 20), &CallMgrDADDiu3,
- sizeof(CallMgrDADDiu3));
-
- uint64_t ReentryAddr = reinterpret_cast<uint64_t>(ReentryFn);
-
- uint32_t ReentryLUi =
- 0x3c190000 | (((ReentryAddr + 0x800080008000) >> 48) & 0xFFFF);
-
- uint32_t ReentryDADDiu =
- 0x67390000 | (((ReentryAddr + 0x80008000) >> 32) & 0xFFFF);
-
- uint32_t ReentryDSLL = 0x0019cc38;
-
- uint32_t ReentryDADDiu2 =
- 0x67390000 | (((ReentryAddr + 0x8000) >> 16) & 0xFFFF);
-
- uint32_t ReentryDSLL2 = 0x0019cc38;
-
- uint32_t ReentryDADDiu3 = 0x67390000 | ((ReentryAddr)&0xFFFF);
-
- memcpy(ResolverMem + ReentryFnAddrOffset, &ReentryLUi, sizeof(ReentryLUi));
- memcpy(ResolverMem + (ReentryFnAddrOffset + 4), &ReentryDADDiu,
- sizeof(ReentryDADDiu));
- memcpy(ResolverMem + (ReentryFnAddrOffset + 8), &ReentryDSLL,
- sizeof(ReentryDSLL));
- memcpy(ResolverMem + (ReentryFnAddrOffset + 12), &ReentryDADDiu2,
- sizeof(ReentryDADDiu2));
- memcpy(ResolverMem + (ReentryFnAddrOffset + 16), &ReentryDSLL2,
- sizeof(ReentryDSLL2));
- memcpy(ResolverMem + (ReentryFnAddrOffset + 20), &ReentryDADDiu3,
- sizeof(ReentryDADDiu3));
+ const unsigned ReentryCtxAddrOffset = 0x6c; // JIT re-entry ctx addr lui
+
+ memcpy(ResolverWorkingMem, ResolverCode, sizeof(ResolverCode));
+
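+ // Materialize the 64-bit context and re-entry function addresses with
+ // lui/daddiu/dsll chains; each daddiu sign-extends its 16-bit immediate, so
+ // the higher chunks are biased by the 0x8000... constants to pre-compensate.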
+ uint32_t ReentryCtxLUi =
+ 0x3c040000 | (((ReentryCtxAddr + 0x800080008000) >> 48) & 0xFFFF);
+ uint32_t ReentryCtxDADDiu =
+ 0x64840000 | (((ReentryCtxAddr + 0x80008000) >> 32) & 0xFFFF);
+ uint32_t ReentryCtxDSLL = 0x00042438;
+ uint32_t ReentryCtxDADDiu2 =
+ 0x64840000 | ((((ReentryCtxAddr + 0x8000) >> 16) & 0xFFFF));
+ uint32_t ReentryCtxDSLL2 = 0x00042438;
+ uint32_t ReentryCtxDADDiu3 = 0x64840000 | ((ReentryCtxAddr)&0xFFFF);
+
+ memcpy(ResolverWorkingMem + ReentryCtxAddrOffset, &ReentryCtxLUi,
+ sizeof(ReentryCtxLUi));
+ memcpy(ResolverWorkingMem + (ReentryCtxAddrOffset + 4), &ReentryCtxDADDiu,
+ sizeof(ReentryCtxDADDiu));
+ memcpy(ResolverWorkingMem + (ReentryCtxAddrOffset + 8), &ReentryCtxDSLL,
+ sizeof(ReentryCtxDSLL));
+ memcpy(ResolverWorkingMem + (ReentryCtxAddrOffset + 12), &ReentryCtxDADDiu2,
+ sizeof(ReentryCtxDADDiu2));
+ memcpy(ResolverWorkingMem + (ReentryCtxAddrOffset + 16), &ReentryCtxDSLL2,
+ sizeof(ReentryCtxDSLL2));
+ memcpy(ResolverWorkingMem + (ReentryCtxAddrOffset + 20), &ReentryCtxDADDiu3,
+ sizeof(ReentryCtxDADDiu3));
+
+ uint32_t ReentryFnLUi =
+ 0x3c190000 | (((ReentryFnAddr + 0x800080008000) >> 48) & 0xFFFF);
+
+ uint32_t ReentryFnDADDiu =
+ 0x67390000 | (((ReentryFnAddr + 0x80008000) >> 32) & 0xFFFF);
+
+ uint32_t ReentryFnDSLL = 0x0019cc38;
+
+ uint32_t ReentryFnDADDiu2 =
+ 0x67390000 | (((ReentryFnAddr + 0x8000) >> 16) & 0xFFFF);
+
+ uint32_t ReentryFnDSLL2 = 0x0019cc38;
+
+ uint32_t ReentryFnDADDiu3 = 0x67390000 | ((ReentryFnAddr)&0xFFFF);
+
+ memcpy(ResolverWorkingMem + ReentryFnAddrOffset, &ReentryFnLUi,
+ sizeof(ReentryFnLUi));
+ memcpy(ResolverWorkingMem + (ReentryFnAddrOffset + 4), &ReentryFnDADDiu,
+ sizeof(ReentryFnDADDiu));
+ memcpy(ResolverWorkingMem + (ReentryFnAddrOffset + 8), &ReentryFnDSLL,
+ sizeof(ReentryFnDSLL));
+ memcpy(ResolverWorkingMem + (ReentryFnAddrOffset + 12), &ReentryFnDADDiu2,
+ sizeof(ReentryFnDADDiu2));
+ memcpy(ResolverWorkingMem + (ReentryFnAddrOffset + 16), &ReentryFnDSLL2,
+ sizeof(ReentryFnDSLL2));
+ memcpy(ResolverWorkingMem + (ReentryFnAddrOffset + 20), &ReentryFnDADDiu3,
+ sizeof(ReentryFnDADDiu3));
}
-void OrcMips64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
+void OrcMips64::writeTrampolines(char *TrampolineBlockWorkingMem,
+ JITTargetAddress TrampolineBlockTargetAddress,
+ JITTargetAddress ResolverAddr,
unsigned NumTrampolines) {
- uint32_t *Trampolines = reinterpret_cast<uint32_t *>(TrampolineMem);
- uint64_t ResolveAddr = reinterpret_cast<uint64_t>(ResolverAddr);
+ uint32_t *Trampolines =
+ reinterpret_cast<uint32_t *>(TrampolineBlockWorkingMem);
- uint64_t HeighestAddr = ((ResolveAddr + 0x800080008000) >> 48);
- uint64_t HeigherAddr = ((ResolveAddr + 0x80008000) >> 32);
- uint64_t HiAddr = ((ResolveAddr + 0x8000) >> 16);
+ uint64_t HeighestAddr = ((ResolverAddr + 0x800080008000) >> 48);
+ uint64_t HeigherAddr = ((ResolverAddr + 0x80008000) >> 32);
+ uint64_t HiAddr = ((ResolverAddr + 0x8000) >> 16);
for (unsigned I = 0; I < NumTrampolines; ++I) {
Trampolines[10 * I + 0] = 0x03e0c025; // move $t8,$ra
@@ -889,16 +845,17 @@ void OrcMips64::writeTrampolines(uint8_t *TrampolineMem, void *ResolverAddr,
Trampolines[10 * I + 3] = 0x0019cc38; // dsll $t9,$t9,16
Trampolines[10 * I + 4] = 0x67390000 | (HiAddr & 0xFFFF); // daddiu $t9,$t9,%hi(ptr)
Trampolines[10 * I + 5] = 0x0019cc38; // dsll $t9,$t9,16
- Trampolines[10 * I + 6] = 0x67390000 | (ResolveAddr & 0xFFFF); // daddiu $t9,$t9,%lo(ptr)
+ Trampolines[10 * I + 6] =
+ 0x67390000 | (ResolverAddr & 0xFFFF); // daddiu $t9,$t9,%lo(ptr)
Trampolines[10 * I + 7] = 0x0320f809; // jalr $t9
Trampolines[10 * I + 8] = 0x00000000; // nop
Trampolines[10 * I + 9] = 0x00000000; // nop
}
}
-Error OrcMips64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
- unsigned MinStubs,
- void *InitialPtrVal) {
+void OrcMips64::writeIndirectStubsBlock(
+ char *StubsBlockWorkingMem, JITTargetAddress StubsBlockTargetAddress,
+ JITTargetAddress PointersBlockTargetAddress, unsigned NumStubs) {
// Stub format is:
//
// .section __orc_stubs
@@ -926,31 +883,14 @@ Error OrcMips64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
// .dword 0x0
//
// ...
- const unsigned StubSize = IndirectStubsInfo::StubSize;
-
- // Emit at least MinStubs, rounded up to fill the pages allocated.
- static const unsigned PageSize = sys::Process::getPageSizeEstimate();
- unsigned NumPages = ((MinStubs * StubSize) + (PageSize - 1)) / PageSize;
- unsigned NumStubs = (NumPages * PageSize) / StubSize;
- // Allocate memory for stubs and pointers in one call.
- std::error_code EC;
- auto StubsMem = sys::OwningMemoryBlock(sys::Memory::allocateMappedMemory(
- 2 * NumPages * PageSize, nullptr,
- sys::Memory::MF_READ | sys::Memory::MF_WRITE, EC));
-
- if (EC)
- return errorCodeToError(EC);
-
- // Create separate MemoryBlocks representing the stubs and pointers.
- sys::MemoryBlock StubsBlock(StubsMem.base(), NumPages * PageSize);
- sys::MemoryBlock PtrsBlock(static_cast<char *>(StubsMem.base()) +
- NumPages * PageSize,
- NumPages * PageSize);
+ assert(stubAndPointerRangesOk<OrcAArch64>(
+ StubsBlockTargetAddress, PointersBlockTargetAddress, NumStubs) &&
+ "PointersBlock is out of range");
// Populate the stubs page stubs and mark it executable.
- uint32_t *Stub = reinterpret_cast<uint32_t *>(StubsBlock.base());
- uint64_t PtrAddr = reinterpret_cast<uint64_t>(PtrsBlock.base());
+ uint32_t *Stub = reinterpret_cast<uint32_t *>(StubsBlockWorkingMem);
+ uint64_t PtrAddr = PointersBlockTargetAddress;
for (unsigned I = 0; I < NumStubs; ++I, PtrAddr += 8) {
uint64_t HeighestAddr = ((PtrAddr + 0x800080008000) >> 48);
@@ -965,19 +905,6 @@ Error OrcMips64::emitIndirectStubsBlock(IndirectStubsInfo &StubsInfo,
Stub[8 * I + 6] = 0x03200008; // jr $t9
Stub[8 * I + 7] = 0x00000000; // nop
}
-
- if (auto EC = sys::Memory::protectMappedMemory(
- StubsBlock, sys::Memory::MF_READ | sys::Memory::MF_EXEC))
- return errorCodeToError(EC);
-
- // Initialize all pointers to point at FailureAddress.
- void **Ptr = reinterpret_cast<void **>(PtrsBlock.base());
- for (unsigned I = 0; I < NumStubs; ++I)
- Ptr[I] = InitialPtrVal;
-
- StubsInfo = IndirectStubsInfo(NumStubs, std::move(StubsMem));
-
- return Error::success();
}
} // End namespace orc.
} // End namespace llvm.
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h b/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
index e0af3df9d010..87bb4398765d 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
+++ b/llvm/lib/ExecutionEngine/Orc/OrcCBindingsStack.h
@@ -133,7 +133,7 @@ private:
orc::SymbolNameSet Result;
for (auto &S : Symbols) {
- if (auto Sym = findSymbol(*S)) {
+ if (auto Sym = findSymbol(std::string(*S))) {
if (!Sym.getFlags().isStrong())
Result.insert(S);
} else if (auto Err = Sym.takeError()) {
@@ -151,7 +151,7 @@ private:
orc::SymbolNameSet UnresolvedSymbols;
for (auto &S : Symbols) {
- if (auto Sym = findSymbol(*S)) {
+ if (auto Sym = findSymbol(std::string(*S))) {
if (auto Addr = Sym.getAddress()) {
Query->notifySymbolMetRequiredState(
S, JITEvaluatedSymbol(*Addr, Sym.getFlags()));
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h b/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
index 169dc8f1d02b..139572bd6977 100644
--- a/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
+++ b/llvm/lib/ExecutionEngine/Orc/OrcMCJITReplacement.h
@@ -154,7 +154,8 @@ class OrcMCJITReplacement : public ExecutionEngine {
M.reportError(std::move(Err));
return SymbolNameSet();
} else {
- if (auto Sym2 = M.ClientResolver->findSymbolInLogicalDylib(*S)) {
+ if (auto Sym2 =
+ M.ClientResolver->findSymbolInLogicalDylib(std::string(*S))) {
if (!Sym2.getFlags().isStrong())
Result.insert(S);
} else if (auto Err = Sym2.takeError()) {
@@ -187,7 +188,7 @@ class OrcMCJITReplacement : public ExecutionEngine {
M.ES.legacyFailQuery(*Query, std::move(Err));
return SymbolNameSet();
} else {
- if (auto Sym2 = M.ClientResolver->findSymbol(*S)) {
+ if (auto Sym2 = M.ClientResolver->findSymbol(std::string(*S))) {
if (auto Addr = Sym2.getAddress()) {
Query->notifySymbolMetRequiredState(
S, JITEvaluatedSymbol(*Addr, Sym2.getFlags()));
@@ -378,9 +379,9 @@ public:
private:
JITSymbol findMangledSymbol(StringRef Name) {
- if (auto Sym = LazyEmitLayer.findSymbol(Name, false))
+ if (auto Sym = LazyEmitLayer.findSymbol(std::string(Name), false))
return Sym;
- if (auto Sym = ClientResolver->findSymbol(Name))
+ if (auto Sym = ClientResolver->findSymbol(std::string(Name)))
return Sym;
if (auto Sym = scanArchives(Name))
return Sym;
diff --git a/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
new file mode 100644
index 000000000000..5933c2e666d1
--- /dev/null
+++ b/llvm/lib/ExecutionEngine/Orc/OrcV2CBindings.cpp
@@ -0,0 +1,254 @@
+//===--------------- OrcV2CBindings.cpp - C bindings OrcV2 APIs -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm-c/Orc.h"
+#include "llvm-c/TargetMachine.h"
+
+#include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
+#include "llvm/ExecutionEngine/Orc/LLJIT.h"
+
+using namespace llvm;
+using namespace llvm::orc;
+
+namespace llvm {
+namespace orc {
+
+class OrcV2CAPIHelper {
+public:
+ using PoolEntry = SymbolStringPtr::PoolEntry;
+ using PoolEntryPtr = SymbolStringPtr::PoolEntryPtr;
+
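+ // Detach the pool entry from the SymbolStringPtr without releasing the
+ // reference, so ownership can be handed across the C API boundary.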
+ static PoolEntryPtr releaseSymbolStringPtr(SymbolStringPtr S) {
+ PoolEntryPtr Result = nullptr;
+ std::swap(Result, S.S);
+ return Result;
+ }
+
+ static PoolEntryPtr getRawPoolEntryPtr(const SymbolStringPtr &S) {
+ return S.S;
+ }
+
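+ // Re-attach the entry to a temporary SymbolStringPtr whose destructor drops
+ // the reference that was retained when the entry crossed into C code.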
+ static void releasePoolEntry(PoolEntryPtr P) {
+ SymbolStringPtr S;
+ S.S = P;
+ }
+};
+
+} // end namespace orc
+} // end namespace llvm
+
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(ExecutionSession, LLVMOrcExecutionSessionRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(OrcV2CAPIHelper::PoolEntry,
+ LLVMOrcSymbolStringPoolEntryRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(JITDylib, LLVMOrcJITDylibRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(JITDylib::DefinitionGenerator,
+ LLVMOrcJITDylibDefinitionGeneratorRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(ThreadSafeContext,
+ LLVMOrcThreadSafeContextRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(ThreadSafeModule, LLVMOrcThreadSafeModuleRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(JITTargetMachineBuilder,
+ LLVMOrcJITTargetMachineBuilderRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(LLJITBuilder, LLVMOrcLLJITBuilderRef)
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(LLJIT, LLVMOrcLLJITRef)
+
+DEFINE_SIMPLE_CONVERSION_FUNCTIONS(TargetMachine, LLVMTargetMachineRef)
+
+LLVMOrcSymbolStringPoolEntryRef
+LLVMOrcExecutionSessionIntern(LLVMOrcExecutionSessionRef ES, const char *Name) {
+ return wrap(
+ OrcV2CAPIHelper::releaseSymbolStringPtr(unwrap(ES)->intern(Name)));
+}
+
+void LLVMOrcReleaseSymbolStringPoolEntry(LLVMOrcSymbolStringPoolEntryRef S) {
+ OrcV2CAPIHelper::releasePoolEntry(unwrap(S));
+}
+
+void LLVMOrcDisposeJITDylibDefinitionGenerator(
+ LLVMOrcJITDylibDefinitionGeneratorRef DG) {
+ delete unwrap(DG);
+}
+
+void LLVMOrcJITDylibAddGenerator(LLVMOrcJITDylibRef JD,
+ LLVMOrcJITDylibDefinitionGeneratorRef DG) {
+ unwrap(JD)->addGenerator(
+ std::unique_ptr<JITDylib::DefinitionGenerator>(unwrap(DG)));
+}
+
+LLVMErrorRef LLVMOrcCreateDynamicLibrarySearchGeneratorForProcess(
+ LLVMOrcJITDylibDefinitionGeneratorRef *Result, char GlobalPrefix,
+ LLVMOrcSymbolPredicate Filter, void *FilterCtx) {
+ assert(Result && "Result can not be null");
+ assert((Filter || !FilterCtx) &&
+ "if Filter is null then FilterCtx must also be null");
+
+ DynamicLibrarySearchGenerator::SymbolPredicate Pred;
+ if (Filter)
+ Pred = [=](const SymbolStringPtr &Name) -> bool {
+ return Filter(wrap(OrcV2CAPIHelper::getRawPoolEntryPtr(Name)), FilterCtx);
+ };
+
+ auto ProcessSymsGenerator =
+ DynamicLibrarySearchGenerator::GetForCurrentProcess(GlobalPrefix, Pred);
+
+ if (!ProcessSymsGenerator) {
+ *Result = 0;
+ return wrap(ProcessSymsGenerator.takeError());
+ }
+
+ *Result = wrap(ProcessSymsGenerator->release());
+ return LLVMErrorSuccess;
+}
+
+LLVMOrcThreadSafeContextRef LLVMOrcCreateNewThreadSafeContext(void) {
+ return wrap(new ThreadSafeContext(std::make_unique<LLVMContext>()));
+}
+
+LLVMContextRef
+LLVMOrcThreadSafeContextGetContext(LLVMOrcThreadSafeContextRef TSCtx) {
+ return wrap(unwrap(TSCtx)->getContext());
+}
+
+void LLVMOrcDisposeThreadSafeContext(LLVMOrcThreadSafeContextRef TSCtx) {
+ delete unwrap(TSCtx);
+}
+
+LLVMOrcThreadSafeModuleRef
+LLVMOrcCreateNewThreadSafeModule(LLVMModuleRef M,
+ LLVMOrcThreadSafeContextRef TSCtx) {
+ return wrap(
+ new ThreadSafeModule(std::unique_ptr<Module>(unwrap(M)), *unwrap(TSCtx)));
+}
+
+void LLVMOrcDisposeThreadSafeModule(LLVMOrcThreadSafeModuleRef TSM) {
+ delete unwrap(TSM);
+}
+
+LLVMErrorRef LLVMOrcJITTargetMachineBuilderDetectHost(
+ LLVMOrcJITTargetMachineBuilderRef *Result) {
+ assert(Result && "Result can not be null");
+
+ auto JTMB = JITTargetMachineBuilder::detectHost();
+ if (!JTMB) {
+ *Result = 0;
+ return wrap(JTMB.takeError());
+ }
+
+ *Result = wrap(new JITTargetMachineBuilder(std::move(*JTMB)));
+ return LLVMErrorSuccess;
+}
+
+LLVMOrcJITTargetMachineBuilderRef
+LLVMOrcJITTargetMachineBuilderFromTargetMachine(LLVMTargetMachineRef TM) {
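+ // Note: this takes ownership of TM; it is disposed once its settings have
+ // been copied into the new JITTargetMachineBuilder.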
+ auto *TemplateTM = unwrap(TM);
+
+ auto JTMB =
+ std::make_unique<JITTargetMachineBuilder>(TemplateTM->getTargetTriple());
+
+ (*JTMB)
+ .setCPU(TemplateTM->getTargetCPU().str())
+ .setRelocationModel(TemplateTM->getRelocationModel())
+ .setCodeModel(TemplateTM->getCodeModel())
+ .setCodeGenOptLevel(TemplateTM->getOptLevel())
+ .setFeatures(TemplateTM->getTargetFeatureString())
+ .setOptions(TemplateTM->Options);
+
+ LLVMDisposeTargetMachine(TM);
+
+ return wrap(JTMB.release());
+}
+
+void LLVMOrcDisposeJITTargetMachineBuilder(
+ LLVMOrcJITTargetMachineBuilderRef JTMB) {
+ delete unwrap(JTMB);
+}
+
+LLVMOrcLLJITBuilderRef LLVMOrcCreateLLJITBuilder(void) {
+ return wrap(new LLJITBuilder());
+}
+
+void LLVMOrcDisposeLLJITBuilder(LLVMOrcLLJITBuilderRef Builder) {
+ delete unwrap(Builder);
+}
+
+void LLVMOrcLLJITBuilderSetJITTargetMachineBuilder(
+ LLVMOrcLLJITBuilderRef Builder, LLVMOrcJITTargetMachineBuilderRef JTMB) {
+ unwrap(Builder)->setJITTargetMachineBuilder(*unwrap(JTMB));
+}
+
+LLVMErrorRef LLVMOrcCreateLLJIT(LLVMOrcLLJITRef *Result,
+ LLVMOrcLLJITBuilderRef Builder) {
+ assert(Result && "Result can not be null");
+
+ if (!Builder)
+ Builder = LLVMOrcCreateLLJITBuilder();
+
+ auto J = unwrap(Builder)->create();
+ LLVMOrcDisposeLLJITBuilder(Builder);
+
+ if (!J) {
+ *Result = 0;
+ return wrap(J.takeError());
+ }
+
+ *Result = wrap(J->release());
+ return LLVMErrorSuccess;
+}
+
+LLVMErrorRef LLVMOrcDisposeLLJIT(LLVMOrcLLJITRef J) {
+ delete unwrap(J);
+ return LLVMErrorSuccess;
+}
+
+LLVMOrcExecutionSessionRef LLVMOrcLLJITGetExecutionSession(LLVMOrcLLJITRef J) {
+ return wrap(&unwrap(J)->getExecutionSession());
+}
+
+LLVMOrcJITDylibRef LLVMOrcLLJITGetMainJITDylib(LLVMOrcLLJITRef J) {
+ return wrap(&unwrap(J)->getMainJITDylib());
+}
+
+const char *LLVMOrcLLJITGetTripleString(LLVMOrcLLJITRef J) {
+ return unwrap(J)->getTargetTriple().str().c_str();
+}
+
+char LLVMOrcLLJITGetGlobalPrefix(LLVMOrcLLJITRef J) {
+ return unwrap(J)->getDataLayout().getGlobalPrefix();
+}
+
+LLVMOrcSymbolStringPoolEntryRef
+LLVMOrcLLJITMangleAndIntern(LLVMOrcLLJITRef J, const char *UnmangledName) {
+ return wrap(OrcV2CAPIHelper::releaseSymbolStringPtr(
+ unwrap(J)->mangleAndIntern(UnmangledName)));
+}
+
+LLVMErrorRef LLVMOrcLLJITAddObjectFile(LLVMOrcLLJITRef J, LLVMOrcJITDylibRef JD,
+ LLVMMemoryBufferRef ObjBuffer) {
+ return wrap(unwrap(J)->addObjectFile(
+ *unwrap(JD), std::unique_ptr<MemoryBuffer>(unwrap(ObjBuffer))));
+}
+
+LLVMErrorRef LLVMOrcLLJITAddLLVMIRModule(LLVMOrcLLJITRef J,
+ LLVMOrcJITDylibRef JD,
+ LLVMOrcThreadSafeModuleRef TSM) {
+ return wrap(unwrap(J)->addIRModule(*unwrap(JD), std::move(*unwrap(TSM))));
+}
+
+LLVMErrorRef LLVMOrcLLJITLookup(LLVMOrcLLJITRef J,
+ LLVMOrcJITTargetAddress *Result,
+ const char *Name) {
+ assert(Result && "Result can not be null");
+
+ auto Sym = unwrap(J)->lookup(Name);
+ if (!Sym) {
+ *Result = 0;
+ return wrap(Sym.takeError());
+ }
+
+ *Result = Sym->getAddress();
+ return LLVMErrorSuccess;
+}
diff --git a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
index a92264c0be14..21925726072e 100644
--- a/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ExecutionEngine/Orc/RTDyldObjectLinkingLayer.h"
+#include "llvm/Object/COFF.h"
namespace {
@@ -46,10 +47,10 @@ public:
MR.addDependenciesForAll(Deps);
};
- JITDylibSearchOrder SearchOrder;
- MR.getTargetJITDylib().withSearchOrderDo(
- [&](const JITDylibSearchOrder &JDs) { SearchOrder = JDs; });
- ES.lookup(LookupKind::Static, SearchOrder, InternedSymbols,
+ JITDylibSearchOrder LinkOrder;
+ MR.getTargetJITDylib().withLinkOrderDo(
+ [&](const JITDylibSearchOrder &LO) { LinkOrder = LO; });
+ ES.lookup(LookupKind::Static, LinkOrder, InternedSymbols,
SymbolState::Resolved, std::move(OnResolvedWithUnwrap),
RegisterDependencies);
}
@@ -80,8 +81,12 @@ RTDyldObjectLinkingLayer::RTDyldObjectLinkingLayer(
RTDyldObjectLinkingLayer::~RTDyldObjectLinkingLayer() {
std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
- for (auto &MemMgr : MemMgrs)
+ for (auto &MemMgr : MemMgrs) {
+ for (auto *L : EventListeners)
+ L->notifyFreeingObject(
+ static_cast<uint64_t>(reinterpret_cast<uintptr_t>(MemMgr.get())));
MemMgr->deregisterEHFrames();
+ }
}
void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
@@ -96,13 +101,7 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
auto &ES = getExecutionSession();
- // Create a MemoryBufferRef backed MemoryBuffer (i.e. shallow) copy of the
- // the underlying buffer to pass into RuntimeDyld. This allows us to hold
- // ownership of the real underlying buffer and return it to the user once
- // the object has been emitted.
- auto ObjBuffer = MemoryBuffer::getMemBuffer(O->getMemBufferRef(), false);
-
- auto Obj = object::ObjectFile::createObjectFile(*ObjBuffer);
+ auto Obj = object::ObjectFile::createObjectFile(*O);
if (!Obj) {
getExecutionSession().reportError(Obj.takeError());
@@ -115,7 +114,27 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
auto InternalSymbols = std::make_shared<std::set<StringRef>>();
{
for (auto &Sym : (*Obj)->symbols()) {
- if (!(Sym.getFlags() & object::BasicSymbolRef::SF_Global)) {
+
+ // Skip file symbols.
+ if (auto SymType = Sym.getType()) {
+ if (*SymType == object::SymbolRef::ST_File)
+ continue;
+ } else {
+ ES.reportError(SymType.takeError());
+ R.failMaterialization();
+ return;
+ }
+
+ Expected<uint32_t> SymFlagsOrErr = Sym.getFlags();
+ if (!SymFlagsOrErr) {
+ // TODO: Test this error.
+ ES.reportError(SymFlagsOrErr.takeError());
+ R.failMaterialization();
+ return;
+ }
+
+ // Don't include symbols that aren't global.
+ if (!(*SymFlagsOrErr & object::BasicSymbolRef::SF_Global)) {
if (auto SymName = Sym.getName())
InternalSymbols->insert(*SymName);
else {
@@ -141,25 +160,79 @@ void RTDyldObjectLinkingLayer::emit(MaterializationResponsibility R,
JITDylibSearchOrderResolver Resolver(*SharedR);
jitLinkForORC(
- **Obj, std::move(O), *MemMgr, Resolver, ProcessAllSections,
- [this, K, SharedR, &Obj, InternalSymbols](
+ object::OwningBinary<object::ObjectFile>(std::move(*Obj), std::move(O)),
+ *MemMgr, Resolver, ProcessAllSections,
+ [this, K, SharedR, MemMgr, InternalSymbols](
+ const object::ObjectFile &Obj,
std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo,
std::map<StringRef, JITEvaluatedSymbol> ResolvedSymbols) {
- return onObjLoad(K, *SharedR, **Obj, std::move(LoadedObjInfo),
+ return onObjLoad(K, *SharedR, Obj, MemMgr, std::move(LoadedObjInfo),
ResolvedSymbols, *InternalSymbols);
},
- [this, K, SharedR, O = std::move(O)](Error Err) mutable {
- onObjEmit(K, std::move(O), *SharedR, std::move(Err));
+ [this, K, SharedR, MemMgr](object::OwningBinary<object::ObjectFile> Obj,
+ Error Err) mutable {
+ onObjEmit(K, *SharedR, std::move(Obj), MemMgr, std::move(Err));
});
}
+void RTDyldObjectLinkingLayer::registerJITEventListener(JITEventListener &L) {
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+ assert(llvm::none_of(EventListeners,
+ [&](JITEventListener *O) { return O == &L; }) &&
+ "Listener has already been registered");
+ EventListeners.push_back(&L);
+}
+
+void RTDyldObjectLinkingLayer::unregisterJITEventListener(JITEventListener &L) {
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+ auto I = llvm::find(EventListeners, &L);
+ assert(I != EventListeners.end() && "Listener not registered");
+ EventListeners.erase(I);
+}
+
Error RTDyldObjectLinkingLayer::onObjLoad(
- VModuleKey K, MaterializationResponsibility &R, object::ObjectFile &Obj,
+ VModuleKey K, MaterializationResponsibility &R,
+ const object::ObjectFile &Obj, RuntimeDyld::MemoryManager *MemMgr,
std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObjInfo,
std::map<StringRef, JITEvaluatedSymbol> Resolved,
std::set<StringRef> &InternalSymbols) {
SymbolFlagsMap ExtraSymbolsToClaim;
SymbolMap Symbols;
+
+ // Hack to support COFF constant pool comdats introduced during compilation:
+ // (See http://llvm.org/PR40074)
+ if (auto *COFFObj = dyn_cast<object::COFFObjectFile>(&Obj)) {
+ auto &ES = getExecutionSession();
+
+ // For all resolved symbols that are not already in the responsibility set:
+ // check whether the symbol is in a comdat section and if so mark it as
+ // weak.
+ for (auto &Sym : COFFObj->symbols()) {
+ // getFlags() on COFF symbols can't fail.
+ uint32_t SymFlags = cantFail(Sym.getFlags());
+ if (SymFlags & object::BasicSymbolRef::SF_Undefined)
+ continue;
+ auto Name = Sym.getName();
+ if (!Name)
+ return Name.takeError();
+ auto I = Resolved.find(*Name);
+
+ // Skip unresolved symbols, internal symbols, and symbols that are
+ // already in the responsibility set.
+ if (I == Resolved.end() || InternalSymbols.count(*Name) ||
+ R.getSymbols().count(ES.intern(*Name)))
+ continue;
+ auto Sec = Sym.getSection();
+ if (!Sec)
+ return Sec.takeError();
+ if (*Sec == COFFObj->section_end())
+ continue;
+ auto &COFFSec = *COFFObj->getCOFFSection(**Sec);
+ if (COFFSec.Characteristics & COFF::IMAGE_SCN_LNK_COMDAT)
+ I->second.setFlags(I->second.getFlags() | JITSymbolFlags::Weak);
+ }
+ }
+
for (auto &KV : Resolved) {
// Scan the symbols and add them to the Symbols map for resolution.
@@ -184,10 +257,17 @@ Error RTDyldObjectLinkingLayer::onObjLoad(
Symbols[InternedName] = JITEvaluatedSymbol(KV.second.getAddress(), Flags);
}
- if (!ExtraSymbolsToClaim.empty())
+ if (!ExtraSymbolsToClaim.empty()) {
if (auto Err = R.defineMaterializing(ExtraSymbolsToClaim))
return Err;
+ // If we claimed responsibility for any weak symbols but were rejected then
+ // we need to remove them from the resolved set.
+ for (auto &KV : ExtraSymbolsToClaim)
+ if (KV.second.isWeak() && !R.getSymbols().count(KV.first))
+ Symbols.erase(KV.first);
+ }
+
if (auto Err = R.notifyResolved(Symbols)) {
R.failMaterialization();
return Err;
@@ -196,12 +276,17 @@ Error RTDyldObjectLinkingLayer::onObjLoad(
if (NotifyLoaded)
NotifyLoaded(K, Obj, *LoadedObjInfo);
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+ assert(!LoadedObjInfos.count(MemMgr) && "Duplicate loaded info for MemMgr");
+ LoadedObjInfos[MemMgr] = std::move(LoadedObjInfo);
+
return Error::success();
}
void RTDyldObjectLinkingLayer::onObjEmit(
- VModuleKey K, std::unique_ptr<MemoryBuffer> ObjBuffer,
- MaterializationResponsibility &R, Error Err) {
+ VModuleKey K, MaterializationResponsibility &R,
+ object::OwningBinary<object::ObjectFile> O,
+ RuntimeDyld::MemoryManager *MemMgr, Error Err) {
if (Err) {
getExecutionSession().reportError(std::move(Err));
R.failMaterialization();
@@ -214,6 +299,22 @@ void RTDyldObjectLinkingLayer::onObjEmit(
return;
}
+ std::unique_ptr<object::ObjectFile> Obj;
+ std::unique_ptr<MemoryBuffer> ObjBuffer;
+ std::tie(Obj, ObjBuffer) = O.takeBinary();
+
+ // Run EventListener notifyLoaded callbacks.
+ {
+ std::lock_guard<std::mutex> Lock(RTDyldLayerMutex);
+ auto LOIItr = LoadedObjInfos.find(MemMgr);
+ assert(LOIItr != LoadedObjInfos.end() && "LoadedObjInfo missing");
+ for (auto *L : EventListeners)
+ L->notifyObjectLoaded(
+ static_cast<uint64_t>(reinterpret_cast<uintptr_t>(MemMgr)), *Obj,
+ *LOIItr->second);
+ LoadedObjInfos.erase(MemMgr);
+ }
+
if (NotifyEmitted)
NotifyEmitted(K, std::move(ObjBuffer));
}
diff --git a/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp b/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
index f22acf50419d..7240c1ed0ce9 100644
--- a/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/SpeculateAnalyses.cpp
@@ -209,7 +209,7 @@ void SequenceBBQuery::traverseToExitBlock(const BasicBlock *AtBB,
VisitedBlocks.insert(std::make_pair(AtBB, BlockHint));
}
- succ_const_iterator PIt = succ_begin(AtBB), EIt = succ_end(AtBB);
+ const_succ_iterator PIt = succ_begin(AtBB), EIt = succ_end(AtBB);
if (PIt == EIt) // No succs.
return;
diff --git a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp
index f29201c147a1..0530b1a97b67 100644
--- a/llvm/lib/ExecutionEngine/Orc/Speculation.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/Speculation.cpp
@@ -96,7 +96,7 @@ void IRSpeculationLayer::emit(MaterializationResponsibility R,
M, LoadValueTy, false, GlobalValue::LinkageTypes::InternalLinkage,
ConstantInt::get(LoadValueTy, 0),
"__orc_speculate.guard.for." + Fn.getName());
- SpeculatorGuard->setAlignment(Align::None());
+ SpeculatorGuard->setAlignment(Align(1));
SpeculatorGuard->setUnnamedAddr(GlobalValue::UnnamedAddr::Local);
BasicBlock &ProgramEntry = Fn.getEntryBlock();
diff --git a/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp b/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp
index 5eab246d4b48..cc99e154fbec 100644
--- a/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp
+++ b/llvm/lib/ExecutionEngine/OrcError/OrcError.cpp
@@ -61,6 +61,10 @@ public:
"(Use StringError to get error message)";
case OrcErrorCode::UnknownResourceHandle:
return "Unknown resource handle";
+ case OrcErrorCode::MissingSymbolDefinitions:
+ return "MissingSymbolsDefinitions";
+ case OrcErrorCode::UnexpectedSymbolDefinitions:
+ return "UnexpectedSymbolDefinitions";
}
llvm_unreachable("Unhandled error code");
}
diff --git a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
index cc196df3b2fa..d4c715cc59f6 100644
--- a/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
+++ b/llvm/lib/ExecutionEngine/PerfJITEvents/PerfJITEventListener.cpp
@@ -34,9 +34,8 @@
#include <mutex>
#include <sys/mman.h> // mmap()
-#include <sys/types.h> // getpid()
#include <time.h> // clock_gettime(), time(), localtime_r() */
-#include <unistd.h> // for getpid(), read(), close()
+#include <unistd.h> // for read(), close()
using namespace llvm;
using namespace llvm::object;
@@ -81,7 +80,7 @@ private:
void NotifyDebug(uint64_t CodeAddr, DILineInfoTable Lines);
// cache lookups
- pid_t Pid;
+ sys::Process::Pid Pid;
// base directory for output data
std::string JitPath;
@@ -177,7 +176,8 @@ static inline uint64_t perf_get_timestamp(void) {
return timespec_to_ns(&ts);
}
-PerfJITEventListener::PerfJITEventListener() : Pid(::getpid()) {
+PerfJITEventListener::PerfJITEventListener()
+ : Pid(sys::Process::getProcessId()) {
// check if clock-source is supported
if (!perf_get_timestamp()) {
errs() << "kernel does not support CLOCK_MONOTONIC\n";
@@ -328,7 +328,7 @@ bool PerfJITEventListener::InitDebuggingDir() {
return false;
}
- JitPath = UniqueDebugDir.str();
+ JitPath = std::string(UniqueDebugDir.str());
return true;
}
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
index 4e2d0f422f39..0f6f9efe1102 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/JITSymbol.cpp
@@ -14,11 +14,14 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/Object/ObjectFile.h"
using namespace llvm;
JITSymbolFlags llvm::JITSymbolFlags::fromGlobalValue(const GlobalValue &GV) {
+ assert(GV.hasName() && "Can't get flags for anonymous symbol");
+
JITSymbolFlags Flags = JITSymbolFlags::None;
if (GV.hasWeakLinkage() || GV.hasLinkOnceLinkage())
Flags |= JITSymbolFlags::Weak;
@@ -33,17 +36,48 @@ JITSymbolFlags llvm::JITSymbolFlags::fromGlobalValue(const GlobalValue &GV) {
isa<Function>(cast<GlobalAlias>(GV).getAliasee()))
Flags |= JITSymbolFlags::Callable;
+ // Check for a linker-private-global-prefix on the symbol name, in which
+ // case it must be marked as non-exported.
+ if (auto *M = GV.getParent()) {
+ const auto &DL = M->getDataLayout();
+ StringRef LPGP = DL.getLinkerPrivateGlobalPrefix();
+ if (!LPGP.empty() && GV.getName().front() == '\01' &&
+ GV.getName().substr(1).startswith(LPGP))
+ Flags &= ~JITSymbolFlags::Exported;
+ }
+
+ return Flags;
+}
+
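+/// Derive JITSymbolFlags from a summary-index entry. This mirrors
+/// fromGlobalValue above for the linkage-derived and callability flags.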
+JITSymbolFlags llvm::JITSymbolFlags::fromSummary(GlobalValueSummary *S) {
+ JITSymbolFlags Flags = JITSymbolFlags::None;
+ auto L = S->linkage();
+ if (GlobalValue::isWeakLinkage(L) || GlobalValue::isLinkOnceLinkage(L))
+ Flags |= JITSymbolFlags::Weak;
+ if (GlobalValue::isCommonLinkage(L))
+ Flags |= JITSymbolFlags::Common;
+ if (GlobalValue::isExternalLinkage(L) || GlobalValue::isExternalWeakLinkage(L))
+ Flags |= JITSymbolFlags::Exported;
+
+ if (isa<FunctionSummary>(S))
+ Flags |= JITSymbolFlags::Callable;
+
return Flags;
}
Expected<JITSymbolFlags>
llvm::JITSymbolFlags::fromObjectSymbol(const object::SymbolRef &Symbol) {
+ Expected<uint32_t> SymbolFlagsOrErr = Symbol.getFlags();
+ if (!SymbolFlagsOrErr)
+ // TODO: Test this error.
+ return SymbolFlagsOrErr.takeError();
+
JITSymbolFlags Flags = JITSymbolFlags::None;
- if (Symbol.getFlags() & object::BasicSymbolRef::SF_Weak)
+ if (*SymbolFlagsOrErr & object::BasicSymbolRef::SF_Weak)
Flags |= JITSymbolFlags::Weak;
- if (Symbol.getFlags() & object::BasicSymbolRef::SF_Common)
+ if (*SymbolFlagsOrErr & object::BasicSymbolRef::SF_Common)
Flags |= JITSymbolFlags::Common;
- if (Symbol.getFlags() & object::BasicSymbolRef::SF_Exported)
+ if (*SymbolFlagsOrErr & object::BasicSymbolRef::SF_Exported)
Flags |= JITSymbolFlags::Exported;
auto SymbolType = Symbol.getType();
@@ -58,8 +92,12 @@ llvm::JITSymbolFlags::fromObjectSymbol(const object::SymbolRef &Symbol) {
ARMJITSymbolFlags
llvm::ARMJITSymbolFlags::fromObjectSymbol(const object::SymbolRef &Symbol) {
+ Expected<uint32_t> SymbolFlagsOrErr = Symbol.getFlags();
+ if (!SymbolFlagsOrErr)
+ // TODO: Actually report errors helpfully.
+ report_fatal_error(SymbolFlagsOrErr.takeError());
ARMJITSymbolFlags Flags;
- if (Symbol.getFlags() & object::BasicSymbolRef::SF_Thumb)
+ if (*SymbolFlagsOrErr & object::BasicSymbolRef::SF_Thumb)
Flags |= ARMJITSymbolFlags::Thumb;
return Flags;
}
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
index 2df71a5e5e74..7e9b0690ccea 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyld.cpp
@@ -214,8 +214,12 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
{
JITSymbolResolver::LookupSet Symbols;
for (auto &Sym : Obj.symbols()) {
- uint32_t Flags = Sym.getFlags();
- if ((Flags & SymbolRef::SF_Common) || (Flags & SymbolRef::SF_Weak)) {
+ Expected<uint32_t> FlagsOrErr = Sym.getFlags();
+ if (!FlagsOrErr)
+ // TODO: Test this error.
+ return FlagsOrErr.takeError();
+ if ((*FlagsOrErr & SymbolRef::SF_Common) ||
+ (*FlagsOrErr & SymbolRef::SF_Weak)) {
// Get symbol name.
if (auto NameOrErr = Sym.getName())
Symbols.insert(*NameOrErr);
@@ -234,10 +238,13 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
LLVM_DEBUG(dbgs() << "Parse symbols:\n");
for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end(); I != E;
++I) {
- uint32_t Flags = I->getFlags();
+ Expected<uint32_t> FlagsOrErr = I->getFlags();
+ if (!FlagsOrErr)
+ // TODO: Test this error.
+ return FlagsOrErr.takeError();
// Skip undefined symbols.
- if (Flags & SymbolRef::SF_Undefined)
+ if (*FlagsOrErr & SymbolRef::SF_Undefined)
continue;
// Get the symbol type.
@@ -287,7 +294,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
}
}
- if (Flags & SymbolRef::SF_Absolute &&
+ if (*FlagsOrErr & SymbolRef::SF_Absolute &&
SymType != object::SymbolRef::ST_File) {
uint64_t Addr = 0;
if (auto AddrOrErr = I->getAddress())
@@ -300,7 +307,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
LLVM_DEBUG(dbgs() << "\tType: " << SymType << " (absolute) Name: " << Name
<< " SID: " << SectionID
<< " Offset: " << format("%p", (uintptr_t)Addr)
- << " flags: " << Flags << "\n");
+ << " flags: " << *FlagsOrErr << "\n");
GlobalSymbolTable[Name] = SymbolTableEntry(SectionID, Addr, *JITSymFlags);
} else if (SymType == object::SymbolRef::ST_Function ||
SymType == object::SymbolRef::ST_Data ||
@@ -332,7 +339,7 @@ RuntimeDyldImpl::loadObjectImpl(const object::ObjectFile &Obj) {
LLVM_DEBUG(dbgs() << "\tType: " << SymType << " Name: " << Name
<< " SID: " << SectionID
<< " Offset: " << format("%p", (uintptr_t)SectOffset)
- << " flags: " << Flags << "\n");
+ << " flags: " << *FlagsOrErr << "\n");
GlobalSymbolTable[Name] =
SymbolTableEntry(SectionID, SectOffset, *JITSymFlags);
}
@@ -592,8 +599,11 @@ Error RuntimeDyldImpl::computeTotalAllocSize(const ObjectFile &Obj,
uint32_t CommonAlign = 1;
for (symbol_iterator I = Obj.symbol_begin(), E = Obj.symbol_end(); I != E;
++I) {
- uint32_t Flags = I->getFlags();
- if (Flags & SymbolRef::SF_Common) {
+ Expected<uint32_t> FlagsOrErr = I->getFlags();
+ if (!FlagsOrErr)
+ // TODO: Test this error.
+ return FlagsOrErr.takeError();
+ if (*FlagsOrErr & SymbolRef::SF_Common) {
// Add the common symbols to a list. We'll allocate them all below.
uint64_t Size = I->getCommonSize();
uint32_t Align = I->getAlignment();
@@ -1190,16 +1200,16 @@ Error RuntimeDyldImpl::resolveExternalSymbols() {
void RuntimeDyldImpl::finalizeAsync(
std::unique_ptr<RuntimeDyldImpl> This,
- unique_function<void(Error)> OnEmitted,
- std::unique_ptr<MemoryBuffer> UnderlyingBuffer) {
+ unique_function<void(object::OwningBinary<object::ObjectFile>, Error)>
+ OnEmitted,
+ object::OwningBinary<object::ObjectFile> O) {
auto SharedThis = std::shared_ptr<RuntimeDyldImpl>(std::move(This));
auto PostResolveContinuation =
- [SharedThis, OnEmitted = std::move(OnEmitted),
- UnderlyingBuffer = std::move(UnderlyingBuffer)](
+ [SharedThis, OnEmitted = std::move(OnEmitted), O = std::move(O)](
Expected<JITSymbolResolver::LookupResult> Result) mutable {
if (!Result) {
- OnEmitted(Result.takeError());
+ OnEmitted(std::move(O), Result.takeError());
return;
}
@@ -1213,10 +1223,11 @@ void RuntimeDyldImpl::finalizeAsync(
SharedThis->registerEHFrames();
std::string ErrMsg;
if (SharedThis->MemMgr.finalizeMemory(&ErrMsg))
- OnEmitted(make_error<StringError>(std::move(ErrMsg),
+ OnEmitted(std::move(O),
+ make_error<StringError>(std::move(ErrMsg),
inconvertibleErrorCode()));
else
- OnEmitted(Error::success());
+ OnEmitted(std::move(O), Error::success());
};
JITSymbolResolver::LookupSet Symbols;
@@ -1403,32 +1414,35 @@ void RuntimeDyld::deregisterEHFrames() {
// FIXME: Kill this with fire once we have a new JIT linker: this is only here
// so that we can re-use RuntimeDyld's implementation without twisting the
// interface any further for ORC's purposes.
-void jitLinkForORC(object::ObjectFile &Obj,
- std::unique_ptr<MemoryBuffer> UnderlyingBuffer,
- RuntimeDyld::MemoryManager &MemMgr,
- JITSymbolResolver &Resolver, bool ProcessAllSections,
- unique_function<Error(
- std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObj,
- std::map<StringRef, JITEvaluatedSymbol>)>
- OnLoaded,
- unique_function<void(Error)> OnEmitted) {
+void jitLinkForORC(
+ object::OwningBinary<object::ObjectFile> O,
+ RuntimeDyld::MemoryManager &MemMgr, JITSymbolResolver &Resolver,
+ bool ProcessAllSections,
+ unique_function<
+ Error(const object::ObjectFile &Obj,
+ std::unique_ptr<RuntimeDyld::LoadedObjectInfo> LoadedObj,
+ std::map<StringRef, JITEvaluatedSymbol>)>
+ OnLoaded,
+ unique_function<void(object::OwningBinary<object::ObjectFile>, Error)>
+ OnEmitted) {
RuntimeDyld RTDyld(MemMgr, Resolver);
RTDyld.setProcessAllSections(ProcessAllSections);
- auto Info = RTDyld.loadObject(Obj);
+ auto Info = RTDyld.loadObject(*O.getBinary());
if (RTDyld.hasError()) {
- OnEmitted(make_error<StringError>(RTDyld.getErrorString(),
- inconvertibleErrorCode()));
+ OnEmitted(std::move(O), make_error<StringError>(RTDyld.getErrorString(),
+ inconvertibleErrorCode()));
return;
}
- if (auto Err = OnLoaded(std::move(Info), RTDyld.getSymbolTable()))
- OnEmitted(std::move(Err));
+ if (auto Err =
+ OnLoaded(*O.getBinary(), std::move(Info), RTDyld.getSymbolTable()))
+ OnEmitted(std::move(O), std::move(Err));
RuntimeDyldImpl::finalizeAsync(std::move(RTDyld.Dyld), std::move(OnEmitted),
- std::move(UnderlyingBuffer));
+ std::move(O));
}
} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
index 6e3cd7cd2cfc..1d8f1ac8ac8a 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Object/ObjectFile.h"
+#include "llvm/Support/FormatVariadic.h"
using namespace llvm;
using namespace llvm::object;
@@ -75,7 +76,42 @@ RuntimeDyldCOFF::loadObject(const object::ObjectFile &O) {
uint64_t RuntimeDyldCOFF::getSymbolOffset(const SymbolRef &Sym) {
// The value in a relocatable COFF object is the offset.
- return Sym.getValue();
+ return cantFail(Sym.getValue());
+}
+
+uint64_t RuntimeDyldCOFF::getDLLImportOffset(unsigned SectionID, StubMap &Stubs,
+ StringRef Name,
+ bool SetSectionIDMinus1) {
+ LLVM_DEBUG(dbgs() << "Getting DLLImport entry for " << Name << "... ");
+ assert(Name.startswith(getImportSymbolPrefix()) && "Not a DLLImport symbol?");
+ RelocationValueRef Reloc;
+ Reloc.SymbolName = Name.data();
+ auto I = Stubs.find(Reloc);
+ if (I != Stubs.end()) {
+ LLVM_DEBUG(dbgs() << format("{0:x8}", I->second) << "\n");
+ return I->second;
+ }
+
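+ // No existing stub for this import: reserve a pointer-sized, pointer-aligned
+ // slot in the section's stub area and record a relocation against the real
+ // (un-prefixed) symbol that will fill the slot with its address.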
+ assert(SectionID < Sections.size() && "SectionID out of range");
+ auto &Sec = Sections[SectionID];
+ auto EntryOffset = alignTo(Sec.getStubOffset(), PointerSize);
+ Sec.advanceStubOffset(EntryOffset + PointerSize - Sec.getStubOffset());
+ Stubs[Reloc] = EntryOffset;
+
+ RelocationEntry RE(SectionID, EntryOffset, PointerReloc, 0, false,
+ Log2_64(PointerSize));
+ // Hack to tell I386/Thumb resolveRelocation that this isn't section relative.
+ if (SetSectionIDMinus1)
+ RE.Sections.SectionA = -1;
+ addRelocationForSymbol(RE, Name.drop_front(getImportSymbolPrefix().size()));
+
+ LLVM_DEBUG({
+ dbgs() << "Creating entry at "
+ << formatv("{0:x16} + {1:x8} ( {2:x16} )", Sec.getLoadAddress(),
+ EntryOffset, Sec.getLoadAddress() + EntryOffset)
+ << "\n";
+ });
+ return EntryOffset;
}
bool RuntimeDyldCOFF::isCompatibleFile(const object::ObjectFile &Obj) const {
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h
index 4efd18a2e6c5..41ee06c15448 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldCOFF.h
@@ -36,9 +36,22 @@ public:
protected:
RuntimeDyldCOFF(RuntimeDyld::MemoryManager &MemMgr,
- JITSymbolResolver &Resolver)
- : RuntimeDyldImpl(MemMgr, Resolver) {}
+ JITSymbolResolver &Resolver, unsigned PointerSize,
+ uint32_t PointerReloc)
+ : RuntimeDyldImpl(MemMgr, Resolver), PointerSize(PointerSize),
+ PointerReloc(PointerReloc) {
+ assert((PointerSize == 4 || PointerSize == 8) && "Unexpected pointer size");
+ }
+
uint64_t getSymbolOffset(const SymbolRef &Sym);
+ uint64_t getDLLImportOffset(unsigned SectionID, StubMap &Stubs,
+ StringRef Name, bool SetSectionIDMinus1 = false);
+
+ static constexpr StringRef getImportSymbolPrefix() { return "__imp_"; }
+
+private:
+ unsigned PointerSize;
+ uint32_t PointerReloc;
};
} // end namespace llvm
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
index 2ac0586ff324..e5e512672daa 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldChecker.cpp
@@ -9,6 +9,7 @@
#include "llvm/ExecutionEngine/RuntimeDyldChecker.h"
#include "RuntimeDyldCheckerImpl.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCInst.h"
@@ -704,10 +705,11 @@ bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix,
bool DidAllTestsPass = true;
unsigned NumRules = 0;
+ std::string CheckExpr;
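+ // Accumulates a rule's text across lines; a trailing '\' continues the
+ // expression onto the next line.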
const char *LineStart = MemBuf->getBufferStart();
// Eat whitespace.
- while (LineStart != MemBuf->getBufferEnd() && std::isspace(*LineStart))
+ while (LineStart != MemBuf->getBufferEnd() && isSpace(*LineStart))
++LineStart;
while (LineStart != MemBuf->getBufferEnd() && *LineStart != '\0') {
@@ -717,14 +719,23 @@ bool RuntimeDyldCheckerImpl::checkAllRulesInBuffer(StringRef RulePrefix,
++LineEnd;
StringRef Line(LineStart, LineEnd - LineStart);
- if (Line.startswith(RulePrefix)) {
- DidAllTestsPass &= check(Line.substr(RulePrefix.size()));
- ++NumRules;
+ if (Line.startswith(RulePrefix))
+ CheckExpr += Line.substr(RulePrefix.size()).str();
+
+ // If there's a check expr string...
+ if (!CheckExpr.empty()) {
+ // ... and it's complete, run it; otherwise remove the trailing '\'.
+ if (CheckExpr.back() != '\\') {
+ DidAllTestsPass &= check(CheckExpr);
+ CheckExpr.clear();
+ ++NumRules;
+ } else
+ CheckExpr.pop_back();
}
// Eat whitespace.
LineStart = LineEnd;
- while (LineStart != MemBuf->getBufferEnd() && std::isspace(*LineStart))
+ while (LineStart != MemBuf->getBufferEnd() && isSpace(*LineStart))
++LineStart;
}
return DidAllTestsPass && (NumRules != 0);
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
index 440ab4174a56..7c39ddc8b1da 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.cpp
@@ -399,6 +399,13 @@ void RuntimeDyldELF::resolveAArch64Relocation(const SectionEntry &Section,
case ELF::R_AARCH64_ABS64:
write(isBE, TargetPtr, Value + Addend);
break;
+ case ELF::R_AARCH64_PLT32: {
+ uint64_t Result = Value + Addend - FinalAddress;
+ assert(static_cast<int64_t>(Result) >= INT32_MIN &&
+ static_cast<int64_t>(Result) <= INT32_MAX);
+ write(isBE, TargetPtr, static_cast<uint32_t>(Result));
+ break;
+ }
case ELF::R_AARCH64_PREL32: {
uint64_t Result = Value + Addend - FinalAddress;
assert(static_cast<int64_t>(Result) >= INT32_MIN &&
@@ -554,7 +561,7 @@ void RuntimeDyldELF::setMipsABI(const ObjectFile &Obj) {
IsMipsO32ABI = AbiVariant & ELF::EF_MIPS_ABI_O32;
IsMipsN32ABI = AbiVariant & ELF::EF_MIPS_ABI2;
}
- IsMipsN64ABI = Obj.getFileFormatName().equals("ELF64-mips");
+ IsMipsN64ABI = Obj.getFileFormatName().equals("elf64-mips");
}
// Return the .TOC. section and offset.
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
index ef0784e2273b..31892b7466e6 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldELF.h
@@ -88,12 +88,13 @@ class RuntimeDyldELF : public RuntimeDyldImpl {
void setMipsABI(const ObjectFile &Obj) override;
- Error findPPC64TOCSection(const ELFObjectFileBase &Obj,
+ Error findPPC64TOCSection(const object::ELFObjectFileBase &Obj,
ObjSectionToIDMap &LocalSections,
RelocationValueRef &Rel);
- Error findOPDEntrySection(const ELFObjectFileBase &Obj,
+ Error findOPDEntrySection(const object::ELFObjectFileBase &Obj,
ObjSectionToIDMap &LocalSections,
RelocationValueRef &Rel);
+
protected:
size_t getGOTEntrySize() override;
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
index cec7b92b8c48..a9346536fd09 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/RuntimeDyldImpl.h
@@ -26,6 +26,7 @@
#include "llvm/Support/Host.h"
#include "llvm/Support/Mutex.h"
#include "llvm/Support/SwapByteOrder.h"
+#include <deque>
#include <map>
#include <system_error>
#include <unordered_map>
@@ -35,8 +36,6 @@ using namespace llvm::object;
namespace llvm {
-class Twine;
-
#define UNIMPLEMENTED_RELOC(RelType) \
case RelType: \
return make_error<RuntimeDyldError>("Unimplemented relocation: " #RelType)
@@ -74,7 +73,7 @@ class SectionEntry {
public:
SectionEntry(StringRef name, uint8_t *address, size_t size,
size_t allocationSize, uintptr_t objAddress)
- : Name(name), Address(address), Size(size),
+ : Name(std::string(name)), Address(address), Size(size),
LoadAddress(reinterpret_cast<uintptr_t>(address)), StubOffset(size),
AllocationSize(allocationSize), ObjAddress(objAddress) {
// AllocationSize is used only in asserts, prevent an "unused private field"
@@ -190,13 +189,11 @@ public:
class RelocationValueRef {
public:
- unsigned SectionID;
- uint64_t Offset;
- int64_t Addend;
- const char *SymbolName;
+ unsigned SectionID = 0;
+ uint64_t Offset = 0;
+ int64_t Addend = 0;
+ const char *SymbolName = nullptr;
bool IsStubThumb = false;
- RelocationValueRef() : SectionID(0), Offset(0), Addend(0),
- SymbolName(nullptr) {}
inline bool operator==(const RelocationValueRef &Other) const {
return SectionID == Other.SectionID && Offset == Other.Offset &&
@@ -251,7 +248,9 @@ protected:
// A list of all sections emitted by the dynamic linker. These sections are
// referenced in the code by means of their index in this list - SectionID.
- typedef SmallVector<SectionEntry, 64> SectionList;
+ // Because references may be kept while the list grows, use a container that
+ // guarantees reference stability.
+ typedef std::deque<SectionEntry> SectionList;
SectionList Sections;
typedef unsigned SID; // Type for SectionIDs
@@ -319,32 +318,18 @@ protected:
std::string ErrorStr;
void writeInt16BE(uint8_t *Addr, uint16_t Value) {
- if (IsTargetLittleEndian)
- sys::swapByteOrder(Value);
- *Addr = (Value >> 8) & 0xFF;
- *(Addr + 1) = Value & 0xFF;
+ llvm::support::endian::write<uint16_t, llvm::support::unaligned>(
+ Addr, Value, IsTargetLittleEndian ? support::little : support::big);
}
void writeInt32BE(uint8_t *Addr, uint32_t Value) {
- if (IsTargetLittleEndian)
- sys::swapByteOrder(Value);
- *Addr = (Value >> 24) & 0xFF;
- *(Addr + 1) = (Value >> 16) & 0xFF;
- *(Addr + 2) = (Value >> 8) & 0xFF;
- *(Addr + 3) = Value & 0xFF;
+ llvm::support::endian::write<uint32_t, llvm::support::unaligned>(
+ Addr, Value, IsTargetLittleEndian ? support::little : support::big);
}
void writeInt64BE(uint8_t *Addr, uint64_t Value) {
- if (IsTargetLittleEndian)
- sys::swapByteOrder(Value);
- *Addr = (Value >> 56) & 0xFF;
- *(Addr + 1) = (Value >> 48) & 0xFF;
- *(Addr + 2) = (Value >> 40) & 0xFF;
- *(Addr + 3) = (Value >> 32) & 0xFF;
- *(Addr + 4) = (Value >> 24) & 0xFF;
- *(Addr + 5) = (Value >> 16) & 0xFF;
- *(Addr + 6) = (Value >> 8) & 0xFF;
- *(Addr + 7) = Value & 0xFF;
+ llvm::support::endian::write<uint64_t, llvm::support::unaligned>(
+ Addr, Value, IsTargetLittleEndian ? support::little : support::big);
}
virtual void setMipsABI(const ObjectFile &Obj) {
@@ -548,9 +533,11 @@ public:
void resolveLocalRelocations();
- static void finalizeAsync(std::unique_ptr<RuntimeDyldImpl> This,
- unique_function<void(Error)> OnEmitted,
- std::unique_ptr<MemoryBuffer> UnderlyingBuffer);
+ static void finalizeAsync(
+ std::unique_ptr<RuntimeDyldImpl> This,
+ unique_function<void(object::OwningBinary<object::ObjectFile>, Error)>
+ OnEmitted,
+ object::OwningBinary<object::ObjectFile> O);
void reassignSectionAddress(unsigned SectionID, uint64_t Addr);
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFAArch64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFAArch64.h
index a94f54f50ac4..14510e56b35a 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFAArch64.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFAArch64.h
@@ -89,7 +89,8 @@ private:
public:
RuntimeDyldCOFFAArch64(RuntimeDyld::MemoryManager &MM,
JITSymbolResolver &Resolver)
- : RuntimeDyldCOFF(MM, Resolver), ImageBase(0) {}
+ : RuntimeDyldCOFF(MM, Resolver, 8, COFF::IMAGE_REL_ARM64_ADDR64),
+ ImageBase(0) {}
unsigned getStubAlignment() override { return 8; }
@@ -161,7 +162,7 @@ public:
uint64_t Offset = RelI->getOffset();
// If there is no section, this must be an external reference.
- const bool IsExtern = Section == Obj.section_end();
+ bool IsExtern = Section == Obj.section_end();
// Determine the Addend used to adjust the relocation value.
uint64_t Addend = 0;
@@ -169,6 +170,24 @@ public:
uintptr_t ObjTarget = AddendSection.getObjAddress() + Offset;
uint8_t *Displacement = (uint8_t *)ObjTarget;
+ unsigned TargetSectionID = -1;
+ uint64_t TargetOffset = -1;
+
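+ // __imp_ prefixed names refer to DLL import table entries: materialize a
+ // local pointer stub for them and point the relocation at that stub rather
+ // than treating the symbol as an external reference.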
+ if (TargetName.startswith(getImportSymbolPrefix())) {
+ TargetSectionID = SectionID;
+ TargetOffset = getDLLImportOffset(SectionID, Stubs, TargetName);
+ TargetName = StringRef();
+ IsExtern = false;
+ } else if (!IsExtern) {
+ if (auto TargetSectionIDOrErr = findOrEmitSection(
+ Obj, *Section, Section->isText(), ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
+
+ TargetOffset = getSymbolOffset(*Symbol);
+ }
+
switch (RelType) {
case COFF::IMAGE_REL_ARM64_ADDR32:
case COFF::IMAGE_REL_ARM64_ADDR32NB:
@@ -224,18 +243,10 @@ public:
<< TargetName << " Addend " << Addend << "\n");
#endif
- unsigned TargetSectionID = -1;
if (IsExtern) {
RelocationEntry RE(SectionID, Offset, RelType, Addend);
addRelocationForSymbol(RE, TargetName);
} else {
- if (auto TargetSectionIDOrErr = findOrEmitSection(
- Obj, *Section, Section->isText(), ObjSectionToID)) {
- TargetSectionID = *TargetSectionIDOrErr;
- } else
- return TargetSectionIDOrErr.takeError();
-
- uint64_t TargetOffset = getSymbolOffset(*Symbol);
RelocationEntry RE(SectionID, Offset, RelType, TargetOffset + Addend);
addRelocationForSection(RE, TargetSectionID);
}
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
index 40910bea0c36..03c38260bece 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFI386.h
@@ -25,7 +25,7 @@ class RuntimeDyldCOFFI386 : public RuntimeDyldCOFF {
public:
RuntimeDyldCOFFI386(RuntimeDyld::MemoryManager &MM,
JITSymbolResolver &Resolver)
- : RuntimeDyldCOFF(MM, Resolver) {}
+ : RuntimeDyldCOFF(MM, Resolver, 4, COFF::IMAGE_REL_I386_DIR32) {}
unsigned getMaxStubSize() const override {
return 8; // 2-byte jmp instruction + 32-bit relative address + 2 byte pad
@@ -53,10 +53,28 @@ public:
if (!SectionOrErr)
return SectionOrErr.takeError();
auto Section = *SectionOrErr;
+ bool IsExtern = Section == Obj.section_end();
uint64_t RelType = RelI->getType();
uint64_t Offset = RelI->getOffset();
+ unsigned TargetSectionID = -1;
+ uint64_t TargetOffset = -1;
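+ // DLL import (__imp_) symbols are likewise redirected at a local pointer
+ // stub; the trailing 'true' tells resolveRelocation (via SectionA == -1)
+ // that the stub address is not section relative.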
+ if (TargetName.startswith(getImportSymbolPrefix())) {
+ TargetSectionID = SectionID;
+ TargetOffset = getDLLImportOffset(SectionID, Stubs, TargetName, true);
+ TargetName = StringRef();
+ IsExtern = false;
+ } else if (!IsExtern) {
+ if (auto TargetSectionIDOrErr = findOrEmitSection(
+ Obj, *Section, Section->isText(), ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
+ if (RelType != COFF::IMAGE_REL_I386_SECTION)
+ TargetOffset = getSymbolOffset(*Symbol);
+ }
+
// Determine the Addend used to adjust the relocation value.
uint64_t Addend = 0;
SectionEntry &AddendSection = Sections[SectionID];
@@ -83,16 +101,10 @@ public:
<< " RelType: " << RelTypeName << " TargetName: "
<< TargetName << " Addend " << Addend << "\n");
- unsigned TargetSectionID = -1;
- if (Section == Obj.section_end()) {
+ if (IsExtern) {
RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0);
addRelocationForSymbol(RE, TargetName);
} else {
- if (auto TargetSectionIDOrErr =
- findOrEmitSection(Obj, *Section, Section->isText(), ObjSectionToID))
- TargetSectionID = *TargetSectionIDOrErr;
- else
- return TargetSectionIDOrErr.takeError();
switch (RelType) {
case COFF::IMAGE_REL_I386_ABSOLUTE:
@@ -103,7 +115,7 @@ public:
case COFF::IMAGE_REL_I386_REL32: {
RelocationEntry RE =
RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
- getSymbolOffset(*Symbol), 0, 0, false, 0);
+ TargetOffset, 0, 0, false, 0);
addRelocationForSection(RE, TargetSectionID);
break;
}
@@ -114,15 +126,14 @@ public:
break;
}
case COFF::IMAGE_REL_I386_SECREL: {
- RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType,
- getSymbolOffset(*Symbol) + Addend);
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType, TargetOffset + Addend);
addRelocationForSection(RE, TargetSectionID);
break;
}
default:
llvm_unreachable("unsupported relocation type");
}
-
}
return ++RelI;
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
index bb2e9626e0b0..721f2b14829a 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFThumb.h
@@ -48,7 +48,7 @@ class RuntimeDyldCOFFThumb : public RuntimeDyldCOFF {
public:
RuntimeDyldCOFFThumb(RuntimeDyld::MemoryManager &MM,
JITSymbolResolver &Resolver)
- : RuntimeDyldCOFF(MM, Resolver) {}
+ : RuntimeDyldCOFF(MM, Resolver, 4, COFF::IMAGE_REL_ARM_ADDR32) {}
unsigned getMaxStubSize() const override {
return 16; // 8-byte load instructions, 4-byte jump, 4-byte padding
@@ -103,16 +103,29 @@ public:
<< " RelType: " << RelTypeName << " TargetName: "
<< TargetName << " Addend " << Addend << "\n");
+ bool IsExtern = Section == Obj.section_end();
unsigned TargetSectionID = -1;
- if (Section == Obj.section_end()) {
- RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0);
- addRelocationForSymbol(RE, TargetName);
- } else {
+ uint64_t TargetOffset = -1;
+
+ if (TargetName.startswith(getImportSymbolPrefix())) {
+ TargetSectionID = SectionID;
+ TargetOffset = getDLLImportOffset(SectionID, Stubs, TargetName, true);
+ TargetName = StringRef();
+ IsExtern = false;
+ } else if (!IsExtern) {
if (auto TargetSectionIDOrErr =
findOrEmitSection(Obj, *Section, Section->isText(), ObjSectionToID))
TargetSectionID = *TargetSectionIDOrErr;
else
return TargetSectionIDOrErr.takeError();
+ if (RelType != COFF::IMAGE_REL_ARM_SECTION)
+ TargetOffset = getSymbolOffset(*Symbol);
+ }
+
+ if (IsExtern) {
+ RelocationEntry RE(SectionID, Offset, RelType, 0, -1, 0, 0, 0, false, 0);
+ addRelocationForSymbol(RE, TargetName);
+ } else {
// We need to find out if the relocation is relative to a thumb function
// so that we include the ISA selection bit when resolving the relocation
@@ -124,16 +137,16 @@ public:
// This relocation is ignored.
break;
case COFF::IMAGE_REL_ARM_ADDR32: {
- RelocationEntry RE = RelocationEntry(
- SectionID, Offset, RelType, Addend, TargetSectionID,
- getSymbolOffset(*Symbol), 0, 0, false, 0, IsTargetThumbFunc);
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
+ TargetOffset, 0, 0, false, 0, IsTargetThumbFunc);
addRelocationForSection(RE, TargetSectionID);
break;
}
case COFF::IMAGE_REL_ARM_ADDR32NB: {
RelocationEntry RE =
RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
- getSymbolOffset(*Symbol), 0, 0, false, 0);
+ TargetOffset, 0, 0, false, 0);
addRelocationForSection(RE, TargetSectionID);
break;
}
@@ -144,24 +157,23 @@ public:
break;
}
case COFF::IMAGE_REL_ARM_SECREL: {
- RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType,
- getSymbolOffset(*Symbol) + Addend);
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType, TargetOffset + Addend);
addRelocationForSection(RE, TargetSectionID);
break;
}
case COFF::IMAGE_REL_ARM_MOV32T: {
- RelocationEntry RE = RelocationEntry(
- SectionID, Offset, RelType, Addend, TargetSectionID,
- getSymbolOffset(*Symbol), 0, 0, false, 0, IsTargetThumbFunc);
+ RelocationEntry RE =
+ RelocationEntry(SectionID, Offset, RelType, Addend, TargetSectionID,
+ TargetOffset, 0, 0, false, 0, IsTargetThumbFunc);
addRelocationForSection(RE, TargetSectionID);
break;
}
case COFF::IMAGE_REL_ARM_BRANCH20T:
case COFF::IMAGE_REL_ARM_BRANCH24T:
case COFF::IMAGE_REL_ARM_BLX23T: {
- RelocationEntry RE =
- RelocationEntry(SectionID, Offset, RelType,
- getSymbolOffset(*Symbol) + Addend, true, 0);
+ RelocationEntry RE = RelocationEntry(SectionID, Offset, RelType,
+ TargetOffset + Addend, true, 0);
addRelocationForSection(RE, TargetSectionID);
break;
}
@@ -256,7 +268,6 @@ public:
EncodeImmediate(&Target[0],
(static_cast<uint32_t>(Result) >> 00) | ISASelectionBit);
EncodeImmediate(&Target[4], static_cast<uint32_t>(Result) >> 16);
-
break;
}
case COFF::IMAGE_REL_ARM_BRANCH20T: {
diff --git a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
index dc4af08583de..ebe3ca33d308 100644
--- a/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
+++ b/llvm/lib/ExecutionEngine/RuntimeDyld/Targets/RuntimeDyldCOFFX86_64.h
@@ -56,7 +56,8 @@ private:
public:
RuntimeDyldCOFFX86_64(RuntimeDyld::MemoryManager &MM,
JITSymbolResolver &Resolver)
- : RuntimeDyldCOFF(MM, Resolver), ImageBase(0) {}
+ : RuntimeDyldCOFF(MM, Resolver, 8, COFF::IMAGE_REL_AMD64_ADDR64),
+ ImageBase(0) {}
unsigned getStubAlignment() override { return 1; }
@@ -202,7 +203,7 @@ public:
return SectionOrError.takeError();
object::section_iterator SecI = *SectionOrError;
// If there is no section, this must be an external reference.
- const bool IsExtern = SecI == Obj.section_end();
+ bool IsExtern = SecI == Obj.section_end();
// Determine the Addend used to adjust the relocation value.
uint64_t RelType = RelI->getType();
@@ -214,7 +215,25 @@ public:
Expected<StringRef> TargetNameOrErr = Symbol->getName();
if (!TargetNameOrErr)
return TargetNameOrErr.takeError();
+
StringRef TargetName = *TargetNameOrErr;
+ unsigned TargetSectionID = 0;
+ uint64_t TargetOffset = 0;
+
+ if (TargetName.startswith(getImportSymbolPrefix())) {
+ assert(IsExtern && "DLLImport not marked extern?");
+ TargetSectionID = SectionID;
+ TargetOffset = getDLLImportOffset(SectionID, Stubs, TargetName);
+ TargetName = StringRef();
+ IsExtern = false;
+ } else if (!IsExtern) {
+ if (auto TargetSectionIDOrErr =
+ findOrEmitSection(Obj, *SecI, SecI->isText(), ObjSectionToID))
+ TargetSectionID = *TargetSectionIDOrErr;
+ else
+ return TargetSectionIDOrErr.takeError();
+ TargetOffset = getSymbolOffset(*Symbol);
+ }
switch (RelType) {
@@ -253,14 +272,6 @@ public:
RelocationEntry RE(SectionID, Offset, RelType, Addend);
addRelocationForSymbol(RE, TargetName);
} else {
- bool IsCode = SecI->isText();
- unsigned TargetSectionID;
- if (auto TargetSectionIDOrErr =
- findOrEmitSection(Obj, *SecI, IsCode, ObjSectionToID))
- TargetSectionID = *TargetSectionIDOrErr;
- else
- return TargetSectionIDOrErr.takeError();
- uint64_t TargetOffset = getSymbolOffset(*Symbol);
RelocationEntry RE(SectionID, Offset, RelType, TargetOffset + Addend);
addRelocationForSection(RE, TargetSectionID);
}
diff --git a/llvm/lib/Frontend/OpenMP/OMPConstants.cpp b/llvm/lib/Frontend/OpenMP/OMPConstants.cpp
deleted file mode 100644
index ec0733903e99..000000000000
--- a/llvm/lib/Frontend/OpenMP/OMPConstants.cpp
+++ /dev/null
@@ -1,87 +0,0 @@
-//===- OMPConstants.cpp - Helpers related to OpenMP code generation ---===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Frontend/OpenMP/OMPConstants.h"
-
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
-
-using namespace llvm;
-using namespace omp;
-using namespace types;
-
-Directive llvm::omp::getOpenMPDirectiveKind(StringRef Str) {
- return llvm::StringSwitch<Directive>(Str)
-#define OMP_DIRECTIVE(Enum, Str) .Case(Str, Enum)
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
- .Default(OMPD_unknown);
-}
-
-StringRef llvm::omp::getOpenMPDirectiveName(Directive Kind) {
- switch (Kind) {
-#define OMP_DIRECTIVE(Enum, Str) \
- case Enum: \
- return Str;
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
- }
- llvm_unreachable("Invalid OpenMP directive kind");
-}
-
-/// Declarations for LLVM-IR types (simple, function and structure) are
-/// generated below. Their names are defined and used in OpenMPKinds.def. Here
-/// we provide the declarations, the initializeTypes function will provide the
-/// values.
-///
-///{
-
-#define OMP_TYPE(VarName, InitValue) Type *llvm::omp::types::VarName = nullptr;
-#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
- FunctionType *llvm::omp::types::VarName = nullptr; \
- PointerType *llvm::omp::types::VarName##Ptr = nullptr;
-#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
- StructType *llvm::omp::types::VarName = nullptr; \
- PointerType *llvm::omp::types::VarName##Ptr = nullptr;
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
-
-///}
-
-void llvm::omp::types::initializeTypes(Module &M) {
- if (Void)
- return;
-
- LLVMContext &Ctx = M.getContext();
- // Create all simple and struct types exposed by the runtime and remember
- // the llvm::PointerTypes of them for easy access later.
- StructType *T;
-#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
-#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
- VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
- VarName##Ptr = PointerType::getUnqual(VarName);
-#define OMP_STRUCT_TYPE(VarName, StructName, ...) \
- T = M.getTypeByName(StructName); \
- if (!T) \
- T = StructType::create(Ctx, {__VA_ARGS__}, StructName); \
- VarName = T; \
- VarName##Ptr = PointerType::getUnqual(T);
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
-}
-
-void llvm::omp::types::uninitializeTypes() {
-#define OMP_TYPE(VarName, InitValue) VarName = nullptr;
-#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
- VarName = nullptr; \
- VarName##Ptr = nullptr;
-#define OMP_STRUCT_TYPE(VarName, StrName, ...) \
- VarName = nullptr; \
- VarName##Ptr = nullptr;
-#include "llvm/Frontend/OpenMP/OMPKinds.def"
-}
diff --git a/llvm/lib/Frontend/OpenMP/OMPContext.cpp b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
new file mode 100644
index 000000000000..c44e858ab5ed
--- /dev/null
+++ b/llvm/lib/Frontend/OpenMP/OMPContext.cpp
@@ -0,0 +1,527 @@
+//===- OMPContext.cpp ------ Collection of helpers for OpenMP contexts ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file implements helper functions and classes to deal with OpenMP
+/// contexts as used by `[begin/end] declare variant` and `metadirective`.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Frontend/OpenMP/OMPContext.h"
+#include "llvm/ADT/SetOperations.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+
+#define DEBUG_TYPE "openmp-ir-builder"
+
+using namespace llvm;
+using namespace omp;
+
+OMPContext::OMPContext(bool IsDeviceCompilation, Triple TargetTriple) {
+ // Add the appropriate device kind trait based on the triple and the
+ // IsDeviceCompilation flag.
+ ActiveTraits.set(unsigned(IsDeviceCompilation
+ ? TraitProperty::device_kind_nohost
+ : TraitProperty::device_kind_host));
+ switch (TargetTriple.getArch()) {
+ case Triple::arm:
+ case Triple::armeb:
+ case Triple::aarch64:
+ case Triple::aarch64_be:
+ case Triple::aarch64_32:
+ case Triple::mips:
+ case Triple::mipsel:
+ case Triple::mips64:
+ case Triple::mips64el:
+ case Triple::ppc:
+ case Triple::ppc64:
+ case Triple::ppc64le:
+ case Triple::x86:
+ case Triple::x86_64:
+ ActiveTraits.set(unsigned(TraitProperty::device_kind_cpu));
+ break;
+ case Triple::amdgcn:
+ case Triple::nvptx:
+ case Triple::nvptx64:
+ ActiveTraits.set(unsigned(TraitProperty::device_kind_gpu));
+ break;
+ default:
+ break;
+ }
+
+ // Add the appropriate device architecture trait based on the triple.
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ if (TraitSelector::TraitSelectorEnum == TraitSelector::device_arch) \
+ if (TargetTriple.getArch() == TargetTriple.getArchTypeForLLVMName(Str)) \
+ ActiveTraits.set(unsigned(TraitProperty::Enum));
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+
+ // TODO: What exactly do we want to see as device ISA trait?
+ // The discussion on the list did not seem to have come to an agreed
+ // upon solution.
+
+ // LLVM is the "OpenMP vendor" but we could also interpret vendor as the
+ // target vendor.
+ ActiveTraits.set(unsigned(TraitProperty::implementation_vendor_llvm));
+
+ // The user condition true is accepted but not false.
+ ActiveTraits.set(unsigned(TraitProperty::user_condition_true));
+
+ // This is for sure some device.
+ ActiveTraits.set(unsigned(TraitProperty::device_kind_any));
+
+ LLVM_DEBUG({
+ dbgs() << "[" << DEBUG_TYPE
+ << "] New OpenMP context with the following properties:\n";
+ for (unsigned Bit : ActiveTraits.set_bits()) {
+ TraitProperty Property = TraitProperty(Bit);
+ dbgs() << "\t " << getOpenMPContextTraitPropertyFullName(Property)
+ << "\n";
+ }
+ });
+}
+
+/// Return true if \p C0 is a subset of \p C1. Note that both arrays are
+/// expected to be sorted.
+template <typename T> static bool isSubset(ArrayRef<T> C0, ArrayRef<T> C1) {
+#ifdef EXPENSIVE_CHECKS
+ assert(llvm::is_sorted(C0) && llvm::is_sorted(C1) &&
+ "Expected sorted arrays!");
+#endif
+ if (C0.size() > C1.size())
+ return false;
+ auto It0 = C0.begin(), End0 = C0.end();
+ auto It1 = C1.begin(), End1 = C1.end();
+ while (It0 != End0) {
+ if (It1 == End1)
+ return false;
+ if (*It0 == *It1) {
+ ++It0;
+ ++It1;
+ continue;
+ }
+ ++It0;
+ }
+ return true;
+}
+
+/// Return true if \p C0 is a strict subset of \p C1. Note that both arrays are
+/// expected to be sorted.
+template <typename T>
+static bool isStrictSubset(ArrayRef<T> C0, ArrayRef<T> C1) {
+ if (C0.size() >= C1.size())
+ return false;
+ return isSubset<T>(C0, C1);
+}
+
+static bool isStrictSubset(const VariantMatchInfo &VMI0,
+ const VariantMatchInfo &VMI1) {
+ // If the required traits are a strict subset and the ordered vectors of
+ // construct traits are a subset as well, we say it is a strict subset. Note
+ // that the latter relation is not required to be strict.
+ if (VMI0.RequiredTraits.count() >= VMI1.RequiredTraits.count())
+ return false;
+ for (unsigned Bit : VMI0.RequiredTraits.set_bits())
+ if (!VMI1.RequiredTraits.test(Bit))
+ return false;
+ if (!isSubset<TraitProperty>(VMI0.ConstructTraits, VMI1.ConstructTraits))
+ return false;
+ return true;
+}
+
+static int isVariantApplicableInContextHelper(
+ const VariantMatchInfo &VMI, const OMPContext &Ctx,
+ SmallVectorImpl<unsigned> *ConstructMatches, bool DeviceSetOnly) {
+
+ // The match kind determines if we need to match all traits, any of the
+ // traits, or none of the traits for it to be an applicable context.
+ enum MatchKind { MK_ALL, MK_ANY, MK_NONE };
+
+ MatchKind MK = MK_ALL;
+ // Determine the match kind the user wants; "all" is the default and is
+ // provided to the user only for completeness.
+ if (VMI.RequiredTraits.test(
+ unsigned(TraitProperty::implementation_extension_match_any)))
+ MK = MK_ANY;
+ if (VMI.RequiredTraits.test(
+ unsigned(TraitProperty::implementation_extension_match_none)))
+ MK = MK_NONE;
+
+ // Helper to deal with a single property that was (not) found in the OpenMP
+ // context based on the match kind selected by the user via
+ // `implementation={extensions(match_[all,any,none])}'
+ auto HandleTrait = [MK](TraitProperty Property,
+ bool WasFound) -> Optional<bool> /* Result */ {
+ // For kind "any" a single match is enough but we ignore non-matched
+ // properties.
+ if (MK == MK_ANY) {
+ if (WasFound)
+ return true;
+ return None;
+ }
+
+ // In "all" or "none" mode we accept a matching or non-matching property
+ // respectively and move on. We are not done yet!
+ if ((WasFound && MK == MK_ALL) || (!WasFound && MK == MK_NONE))
+ return None;
+
+ // We missed a property, provide some debug output and indicate failure.
+ LLVM_DEBUG({
+ if (MK == MK_ALL)
+ dbgs() << "[" << DEBUG_TYPE << "] Property "
+ << getOpenMPContextTraitPropertyName(Property)
+ << " was not in the OpenMP context but match kind is all.\n";
+ if (MK == MK_NONE)
+ dbgs() << "[" << DEBUG_TYPE << "] Property "
+ << getOpenMPContextTraitPropertyName(Property)
+ << " was in the OpenMP context but match kind is none.\n";
+ });
+ return false;
+ };
+
+ for (unsigned Bit : VMI.RequiredTraits.set_bits()) {
+ TraitProperty Property = TraitProperty(Bit);
+ if (DeviceSetOnly &&
+ getOpenMPContextTraitSetForProperty(Property) != TraitSet::device)
+ continue;
+
+ // So far all extensions are handled elsewhere; we skip them here as they
+ // are not part of the OpenMP context.
+ if (getOpenMPContextTraitSelectorForProperty(Property) ==
+ TraitSelector::implementation_extension)
+ continue;
+
+ bool IsActiveTrait = Ctx.ActiveTraits.test(unsigned(Property));
+ Optional<bool> Result = HandleTrait(Property, IsActiveTrait);
+ if (Result.hasValue())
+ return Result.getValue();
+ }
+
+ if (!DeviceSetOnly) {
+ // We could use isSubset here but we also want to record the match
+ // locations.
+ unsigned ConstructIdx = 0, NoConstructTraits = Ctx.ConstructTraits.size();
+ for (TraitProperty Property : VMI.ConstructTraits) {
+ assert(getOpenMPContextTraitSetForProperty(Property) ==
+ TraitSet::construct &&
+ "Variant context is ill-formed!");
+
+ // Verify the nesting.
+ bool FoundInOrder = false;
+ while (!FoundInOrder && ConstructIdx != NoConstructTraits)
+ FoundInOrder = (Ctx.ConstructTraits[ConstructIdx++] == Property);
+ if (ConstructMatches)
+ ConstructMatches->push_back(ConstructIdx - 1);
+
+ Optional<bool> Result = HandleTrait(Property, FoundInOrder);
+ if (Result.hasValue())
+ return Result.getValue();
+
+ if (!FoundInOrder) {
+ LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] Construct property "
+ << getOpenMPContextTraitPropertyName(Property)
+ << " was not nested properly.\n");
+ return false;
+ }
+
+ // TODO: Verify SIMD
+ }
+
+ assert(isSubset<TraitProperty>(VMI.ConstructTraits, Ctx.ConstructTraits) &&
+ "Broken invariant!");
+ }
+
+ if (MK == MK_ANY) {
+ LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE
+ << "] None of the properties was in the OpenMP context "
+ "but match kind is any.\n");
+ return false;
+ }
+
+ return true;
+}
+
+bool llvm::omp::isVariantApplicableInContext(const VariantMatchInfo &VMI,
+ const OMPContext &Ctx,
+ bool DeviceSetOnly) {
+ return isVariantApplicableInContextHelper(
+ VMI, Ctx, /* ConstructMatches */ nullptr, DeviceSetOnly);
+}
+
+static APInt getVariantMatchScore(const VariantMatchInfo &VMI,
+ const OMPContext &Ctx,
+ SmallVectorImpl<unsigned> &ConstructMatches) {
+ APInt Score(64, 1);
+
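+ // Each matched construct trait at context position p (1-based) contributes
+ // 2^(p-1); the device kind, arch, and isa selectors contribute the next
+ // powers of two above the variant's construct trait count. User-provided
+ // score values are added directly.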
+ unsigned NoConstructTraits = VMI.ConstructTraits.size();
+ for (unsigned Bit : VMI.RequiredTraits.set_bits()) {
+ TraitProperty Property = TraitProperty(Bit);
+ // If there is a user score attached, use it.
+ if (VMI.ScoreMap.count(Property)) {
+ const APInt &UserScore = VMI.ScoreMap.lookup(Property);
+ assert(UserScore.uge(0) && "Expect non-negative user scores!");
+ Score += UserScore.getZExtValue();
+ continue;
+ }
+
+ switch (getOpenMPContextTraitSetForProperty(Property)) {
+ case TraitSet::construct:
+ // We handle the construct traits later via the VMI.ConstructTraits
+ // container.
+ continue;
+ case TraitSet::implementation:
+ // No effect on the score (implementation defined).
+ continue;
+ case TraitSet::user:
+ // No effect on the score.
+ continue;
+ case TraitSet::device:
+ // Handled separately below.
+ break;
+ case TraitSet::invalid:
+ llvm_unreachable("Unknown trait set is not to be used!");
+ }
+
+ // device={kind(any)} is "as if" no kind selector was specified.
+ if (Property == TraitProperty::device_kind_any)
+ continue;
+
+ switch (getOpenMPContextTraitSelectorForProperty(Property)) {
+ case TraitSelector::device_kind:
+ Score += (1ULL << (NoConstructTraits + 0));
+ continue;
+ case TraitSelector::device_arch:
+ Score += (1ULL << (NoConstructTraits + 1));
+ continue;
+ case TraitSelector::device_isa:
+ Score += (1ULL << (NoConstructTraits + 2));
+ continue;
+ default:
+ continue;
+ }
+ }
+
+ unsigned ConstructIdx = 0;
+ assert(NoConstructTraits == ConstructMatches.size() &&
+ "Mismatch in the construct traits!");
+ for (TraitProperty Property : VMI.ConstructTraits) {
+ assert(getOpenMPContextTraitSetForProperty(Property) ==
+ TraitSet::construct &&
+ "Ill-formed variant match info!");
+ (void)Property;
+ // ConstructMatches is the position p - 1 and we need 2^(p-1).
+ Score += (1ULL << ConstructMatches[ConstructIdx++]);
+ }
+
+ LLVM_DEBUG(dbgs() << "[" << DEBUG_TYPE << "] Variant has a score of " << Score
+ << "\n");
+ return Score;
+}
+
+int llvm::omp::getBestVariantMatchForContext(
+ const SmallVectorImpl<VariantMatchInfo> &VMIs, const OMPContext &Ctx) {
+
+ APInt BestScore(64, 0);
+ int BestVMIIdx = -1;
+ const VariantMatchInfo *BestVMI = nullptr;
+
+ for (unsigned u = 0, e = VMIs.size(); u < e; ++u) {
+ const VariantMatchInfo &VMI = VMIs[u];
+
+ SmallVector<unsigned, 8> ConstructMatches;
+ // If the variant is not applicable, it's not the best.
+ if (!isVariantApplicableInContextHelper(VMI, Ctx, &ConstructMatches,
+ /* DeviceSetOnly */ false))
+ continue;
+ // Check if it's clearly not the best.
+ APInt Score = getVariantMatchScore(VMI, Ctx, ConstructMatches);
+ if (Score.ult(BestScore))
+ continue;
+ // Equal scores need subset checks.
+ if (Score.eq(BestScore)) {
+ // Strict subsets are never best.
+ if (isStrictSubset(VMI, *BestVMI))
+ continue;
+ // Same score and the current best is not a strict subset of this one, so
+ // we keep it.
+ if (!isStrictSubset(*BestVMI, VMI))
+ continue;
+ }
+ // New best found.
+ BestVMI = &VMI;
+ BestVMIIdx = u;
+ BestScore = Score;
+ }
+
+ return BestVMIIdx;
+}
+
+TraitSet llvm::omp::getOpenMPContextTraitSetKind(StringRef S) {
+ return StringSwitch<TraitSet>(S)
+#define OMP_TRAIT_SET(Enum, Str) .Case(Str, TraitSet::Enum)
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ .Default(TraitSet::invalid);
+}
+
+TraitSet
+llvm::omp::getOpenMPContextTraitSetForSelector(TraitSelector Selector) {
+ switch (Selector) {
+#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \
+ case TraitSelector::Enum: \
+ return TraitSet::TraitSetEnum;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait selector!");
+}
+TraitSet
+llvm::omp::getOpenMPContextTraitSetForProperty(TraitProperty Property) {
+ switch (Property) {
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ case TraitProperty::Enum: \
+ return TraitSet::TraitSetEnum;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait set!");
+}
+StringRef llvm::omp::getOpenMPContextTraitSetName(TraitSet Kind) {
+ switch (Kind) {
+#define OMP_TRAIT_SET(Enum, Str) \
+ case TraitSet::Enum: \
+ return Str;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait set!");
+}
+
+TraitSelector llvm::omp::getOpenMPContextTraitSelectorKind(StringRef S) {
+ return StringSwitch<TraitSelector>(S)
+#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \
+ .Case(Str, TraitSelector::Enum)
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ .Default(TraitSelector::invalid);
+}
+TraitSelector
+llvm::omp::getOpenMPContextTraitSelectorForProperty(TraitProperty Property) {
+ switch (Property) {
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ case TraitProperty::Enum: \
+ return TraitSelector::TraitSelectorEnum;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait set!");
+}
+StringRef llvm::omp::getOpenMPContextTraitSelectorName(TraitSelector Kind) {
+ switch (Kind) {
+#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \
+ case TraitSelector::Enum: \
+ return Str;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait selector!");
+}
+
+TraitProperty llvm::omp::getOpenMPContextTraitPropertyKind(TraitSet Set,
+ StringRef S) {
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ if (Set == TraitSet::TraitSetEnum && Str == S) \
+ return TraitProperty::Enum;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ return TraitProperty::invalid;
+}
+TraitProperty
+llvm::omp::getOpenMPContextTraitPropertyForSelector(TraitSelector Selector) {
+ return StringSwitch<TraitProperty>(
+ getOpenMPContextTraitSelectorName(Selector))
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ .Case(Str, Selector == TraitSelector::TraitSelectorEnum \
+ ? TraitProperty::Enum \
+ : TraitProperty::invalid)
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ .Default(TraitProperty::invalid);
+}
+StringRef llvm::omp::getOpenMPContextTraitPropertyName(TraitProperty Kind) {
+ switch (Kind) {
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ case TraitProperty::Enum: \
+ return Str;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait property!");
+}
+StringRef llvm::omp::getOpenMPContextTraitPropertyFullName(TraitProperty Kind) {
+ switch (Kind) {
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ case TraitProperty::Enum: \
+ return "(" #TraitSetEnum "," #TraitSelectorEnum "," Str ")";
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait property!");
+}
+
+bool llvm::omp::isValidTraitSelectorForTraitSet(TraitSelector Selector,
+ TraitSet Set,
+ bool &AllowsTraitScore,
+ bool &RequiresProperty) {
+ AllowsTraitScore = Set != TraitSet::construct && Set != TraitSet::device;
+ switch (Selector) {
+#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \
+ case TraitSelector::Enum: \
+ RequiresProperty = ReqProp; \
+ return Set == TraitSet::TraitSetEnum;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait selector!");
+}
+
+bool llvm::omp::isValidTraitPropertyForTraitSetAndSelector(
+ TraitProperty Property, TraitSelector Selector, TraitSet Set) {
+ switch (Property) {
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ case TraitProperty::Enum: \
+ return Set == TraitSet::TraitSetEnum && \
+ Selector == TraitSelector::TraitSelectorEnum;
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+ llvm_unreachable("Unknown trait property!");
+}
+
+std::string llvm::omp::listOpenMPContextTraitSets() {
+ std::string S;
+#define OMP_TRAIT_SET(Enum, Str) \
+ if (StringRef(Str) != "invalid") \
+ S.append("'").append(Str).append("'").append(" ");
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ S.pop_back();
+ return S;
+}
+
+std::string llvm::omp::listOpenMPContextTraitSelectors(TraitSet Set) {
+ std::string S;
+#define OMP_TRAIT_SELECTOR(Enum, TraitSetEnum, Str, ReqProp) \
+ if (TraitSet::TraitSetEnum == Set && StringRef(Str) != "Invalid") \
+ S.append("'").append(Str).append("'").append(" ");
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ S.pop_back();
+ return S;
+}
+
+std::string
+llvm::omp::listOpenMPContextTraitProperties(TraitSet Set,
+ TraitSelector Selector) {
+ std::string S;
+#define OMP_TRAIT_PROPERTY(Enum, TraitSetEnum, TraitSelectorEnum, Str) \
+ if (TraitSet::TraitSetEnum == Set && \
+ TraitSelector::TraitSelectorEnum == Selector && \
+ StringRef(Str) != "invalid") \
+ S.append("'").append(Str).append("'").append(" ");
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ if (S.empty())
+ return "<none>";
+ S.pop_back();
+ return S;
+}
diff --git a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
index 739c2998baa8..9468a3aa3c8d 100644
--- a/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
+++ b/llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp
@@ -18,8 +18,8 @@
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DebugInfo.h"
-#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Error.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -31,7 +31,6 @@
using namespace llvm;
using namespace omp;
-using namespace types;
static cl::opt<bool>
OptimisticAttributes("openmp-ir-builder-optimistic-attributes", cl::Hidden,
@@ -59,13 +58,17 @@ void OpenMPIRBuilder::addAttributes(omp::RuntimeFunction FnID, Function &Fn) {
}
}
-Function *OpenMPIRBuilder::getOrCreateRuntimeFunction(RuntimeFunction FnID) {
+FunctionCallee
+OpenMPIRBuilder::getOrCreateRuntimeFunction(Module &M, RuntimeFunction FnID) {
+ FunctionType *FnTy = nullptr;
Function *Fn = nullptr;
// Try to find the declaration in the module first.
switch (FnID) {
#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
case Enum: \
+ FnTy = FunctionType::get(ReturnType, ArrayRef<Type *>{__VA_ARGS__}, \
+ IsVarArg); \
Fn = M.getFunction(Str); \
break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
@@ -74,25 +77,113 @@ Function *OpenMPIRBuilder::getOrCreateRuntimeFunction(RuntimeFunction FnID) {
if (!Fn) {
// Create a new declaration if we need one.
switch (FnID) {
-#define OMP_RTL(Enum, Str, IsVarArg, ReturnType, ...) \
+#define OMP_RTL(Enum, Str, ...) \
case Enum: \
- Fn = Function::Create(FunctionType::get(ReturnType, \
- ArrayRef<Type *>{__VA_ARGS__}, \
- IsVarArg), \
- GlobalValue::ExternalLinkage, Str, M); \
+ Fn = Function::Create(FnTy, GlobalValue::ExternalLinkage, Str, M); \
break;
#include "llvm/Frontend/OpenMP/OMPKinds.def"
}
+ // Add callback metadata if the runtime function takes a callback function.
+ if (FnID == OMPRTL___kmpc_fork_call || FnID == OMPRTL___kmpc_fork_teams) {
+ if (!Fn->hasMetadata(LLVMContext::MD_callback)) {
+ LLVMContext &Ctx = Fn->getContext();
+ MDBuilder MDB(Ctx);
+ // Annotate the callback behavior of the runtime function:
+ // - The callback callee is argument number 2 (microtask).
+ // - The first two arguments of the callback callee are unknown (-1).
+ // - All variadic arguments to the runtime function are passed to the
+ // callback callee.
+ Fn->addMetadata(
+ LLVMContext::MD_callback,
+ *MDNode::get(Ctx, {MDB.createCallbackEncoding(
+ 2, {-1, -1}, /* VarArgsArePassed */ true)}));
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Created OpenMP runtime function " << Fn->getName()
+ << " with type " << *Fn->getFunctionType() << "\n");
addAttributes(FnID, *Fn);
+
+ } else {
+ LLVM_DEBUG(dbgs() << "Found OpenMP runtime function " << Fn->getName()
+ << " with type " << *Fn->getFunctionType() << "\n");
}
assert(Fn && "Failed to create OpenMP runtime function");
+
+ // Cast the function to the expected type if necessary
+ Constant *C = ConstantExpr::getBitCast(Fn, FnTy->getPointerTo());
+ return {FnTy, C};
+}
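// For illustration, the createCallbackEncoding(2, {-1, -1}, true) call above
// yields !callback metadata of roughly the following shape on the declaration
// (a sketch; the exact ident/microtask types come from OMPKinds.def):
//
//   declare !callback !0 void @__kmpc_fork_call(%struct.ident_t*, i32,
//                                               void (i32*, i32*, ...)*, ...)
//   !0 = !{!1}
//   !1 = !{i64 2, i64 -1, i64 -1, i1 true}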
+
+Function *OpenMPIRBuilder::getOrCreateRuntimeFunctionPtr(RuntimeFunction FnID) {
+ FunctionCallee RTLFn = getOrCreateRuntimeFunction(M, FnID);
+ auto *Fn = dyn_cast<llvm::Function>(RTLFn.getCallee());
+ assert(Fn && "Failed to create OpenMP runtime function pointer");
return Fn;
}
void OpenMPIRBuilder::initialize() { initializeTypes(M); }
+void OpenMPIRBuilder::finalize() {
+ SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
+ SmallVector<BasicBlock *, 32> Blocks;
+ for (OutlineInfo &OI : OutlineInfos) {
+ ParallelRegionBlockSet.clear();
+ Blocks.clear();
+ OI.collectBlocks(ParallelRegionBlockSet, Blocks);
+
+ Function *OuterFn = OI.EntryBB->getParent();
+ CodeExtractorAnalysisCache CEAC(*OuterFn);
+ CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
+ /* AggregateArgs */ false,
+ /* BlockFrequencyInfo */ nullptr,
+ /* BranchProbabilityInfo */ nullptr,
+ /* AssumptionCache */ nullptr,
+ /* AllowVarArgs */ true,
+ /* AllowAlloca */ true,
+ /* Suffix */ ".omp_par");
+
+ LLVM_DEBUG(dbgs() << "Before outlining: " << *OuterFn << "\n");
+ LLVM_DEBUG(dbgs() << "Entry " << OI.EntryBB->getName()
+ << " Exit: " << OI.ExitBB->getName() << "\n");
+ assert(Extractor.isEligible() &&
+ "Expected OpenMP outlining to be possible!");
+
+ Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
+
+ LLVM_DEBUG(dbgs() << "After outlining: " << *OuterFn << "\n");
+ LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
+ assert(OutlinedFn->getReturnType()->isVoidTy() &&
+ "OpenMP outlined functions should not return a value!");
+
+ // For compatibility with the clang CG we move the outlined function after the
+ // one with the parallel region.
+ OutlinedFn->removeFromParent();
+ M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
+
+ // Remove the artificial entry introduced by the extractor right away; we
+ // made our own entry block after all.
+ {
+ BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
+ assert(ArtificialEntry.getUniqueSuccessor() == OI.EntryBB);
+ assert(OI.EntryBB->getUniquePredecessor() == &ArtificialEntry);
+ OI.EntryBB->moveBefore(&ArtificialEntry);
+ ArtificialEntry.eraseFromParent();
+ }
+ assert(&OutlinedFn->getEntryBlock() == OI.EntryBB);
+ assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
+
+ // Run a user callback, e.g. to add attributes.
+ if (OI.PostOutlineCB)
+ OI.PostOutlineCB(*OutlinedFn);
+ }
+
+ // Allow finalize to be called multiple times.
+ OutlineInfos.clear();
+}
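// A minimal sketch of the intended driver sequence around finalize(),
// assuming an existing Module `M` and directive-specific Create* calls in
// between:
//
//   OpenMPIRBuilder OMPBuilder(M);
//   OMPBuilder.initialize();   // materialize runtime types from OMPKinds.def
//   // ... CreateParallel(...), CreateMaster(...), etc. ...
//   OMPBuilder.finalize();     // outline all deferred parallel regions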
+
Value *OpenMPIRBuilder::getOrCreateIdent(Constant *SrcLocStr,
IdentFlag LocFlags) {
// Enable "C-mode".
@@ -165,7 +256,7 @@ OpenMPIRBuilder::getOrCreateSrcLocStr(const LocationDescription &Loc) {
Value *OpenMPIRBuilder::getOrCreateThreadID(Value *Ident) {
return Builder.CreateCall(
- getOrCreateRuntimeFunction(OMPRTL___kmpc_global_thread_num), Ident,
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num), Ident,
"omp_global_thread_num");
}
@@ -212,10 +303,11 @@ OpenMPIRBuilder::emitBarrierImpl(const LocationDescription &Loc, Directive Kind,
bool UseCancelBarrier =
!ForceSimpleCall && isLastFinalizationInfoCancellable(OMPD_parallel);
- Value *Result = Builder.CreateCall(
- getOrCreateRuntimeFunction(UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
- : OMPRTL___kmpc_barrier),
- Args);
+ Value *Result =
+ Builder.CreateCall(getOrCreateRuntimeFunctionPtr(
+ UseCancelBarrier ? OMPRTL___kmpc_cancel_barrier
+ : OMPRTL___kmpc_barrier),
+ Args);
if (UseCancelBarrier && CheckCancelFlag)
emitCancelationCheckImpl(Result, OMPD_parallel);
@@ -253,7 +345,7 @@ OpenMPIRBuilder::CreateCancel(const LocationDescription &Loc,
Value *Ident = getOrCreateIdent(SrcLocStr);
Value *Args[] = {Ident, getOrCreateThreadID(Ident), CancelKind};
Value *Result = Builder.CreateCall(
- getOrCreateRuntimeFunction(OMPRTL___kmpc_cancel), Args);
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_cancel), Args);
// The actual cancel logic is shared with others, e.g., cancel_barriers.
emitCancelationCheckImpl(Result, CanceledDirective);
@@ -318,7 +410,7 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel(
Ident, ThreadID,
Builder.CreateIntCast(NumThreads, Int32, /*isSigned*/ false)};
Builder.CreateCall(
- getOrCreateRuntimeFunction(OMPRTL___kmpc_push_num_threads), Args);
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_num_threads), Args);
}
if (ProcBind != OMP_PROC_BIND_default) {
@@ -326,8 +418,8 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel(
Value *Args[] = {
Ident, ThreadID,
ConstantInt::get(Int32, unsigned(ProcBind), /*isSigned=*/true)};
- Builder.CreateCall(getOrCreateRuntimeFunction(OMPRTL___kmpc_push_proc_bind),
- Args);
+ Builder.CreateCall(
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_push_proc_bind), Args);
}
BasicBlock *InsertBB = Builder.GetInsertBlock();
@@ -415,32 +507,135 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel(
// PRegionExitBB <- A common exit to simplify block collection.
//
- LLVM_DEBUG(dbgs() << "Before body codegen: " << *UI->getFunction() << "\n");
+ LLVM_DEBUG(dbgs() << "Before body codegen: " << *OuterFn << "\n");
// Let the caller create the body.
assert(BodyGenCB && "Expected body generation callback!");
InsertPointTy CodeGenIP(PRegBodyBB, PRegBodyBB->begin());
BodyGenCB(AllocaIP, CodeGenIP, *PRegPreFiniBB);
- LLVM_DEBUG(dbgs() << "After body codegen: " << *UI->getFunction() << "\n");
+ LLVM_DEBUG(dbgs() << "After body codegen: " << *OuterFn << "\n");
+
+ FunctionCallee RTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_fork_call);
+ if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
+ if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
+ llvm::LLVMContext &Ctx = F->getContext();
+ MDBuilder MDB(Ctx);
+ // Annotate the callback behavior of the __kmpc_fork_call:
+ // - The callback callee is argument number 2 (microtask).
+ // - The first two arguments of the callback callee are unknown (-1).
+ // - All variadic arguments to the __kmpc_fork_call are passed to the
+ // callback callee.
+ F->addMetadata(
+ llvm::LLVMContext::MD_callback,
+ *llvm::MDNode::get(
+ Ctx, {MDB.createCallbackEncoding(2, {-1, -1},
+ /* VarArgsArePassed */ true)}));
+ }
+ }
+
+ OutlineInfo OI;
+ OI.PostOutlineCB = [=](Function &OutlinedFn) {
+ // Add some known attributes.
+ OutlinedFn.addParamAttr(0, Attribute::NoAlias);
+ OutlinedFn.addParamAttr(1, Attribute::NoAlias);
+ OutlinedFn.addFnAttr(Attribute::NoUnwind);
+ OutlinedFn.addFnAttr(Attribute::NoRecurse);
+
+ assert(OutlinedFn.arg_size() >= 2 &&
+ "Expected at least tid and bounded tid as arguments");
+ unsigned NumCapturedVars =
+ OutlinedFn.arg_size() - /* tid & bounded tid */ 2;
+
+ CallInst *CI = cast<CallInst>(OutlinedFn.user_back());
+ CI->getParent()->setName("omp_parallel");
+ Builder.SetInsertPoint(CI);
+
+ // Build call __kmpc_fork_call(Ident, n, microtask, var1, .., varn);
+ Value *ForkCallArgs[] = {
+ Ident, Builder.getInt32(NumCapturedVars),
+ Builder.CreateBitCast(&OutlinedFn, ParallelTaskPtr)};
+
+ SmallVector<Value *, 16> RealArgs;
+ RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
+ RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
+
+ Builder.CreateCall(RTLFn, RealArgs);
+
+ LLVM_DEBUG(dbgs() << "With fork_call placed: "
+ << *Builder.GetInsertBlock()->getParent() << "\n");
+
+ InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());
+
+ // Initialize the local TID stack location with the argument value.
+ Builder.SetInsertPoint(PrivTID);
+ Function::arg_iterator OutlinedAI = OutlinedFn.arg_begin();
+ Builder.CreateStore(Builder.CreateLoad(OutlinedAI), PrivTIDAddr);
+
+ // If no "if" clause was present we do not need the call created during
+ // outlining, otherwise we reuse it in the serialized parallel region.
+ if (!ElseTI) {
+ CI->eraseFromParent();
+ } else {
+
+ // If an "if" clause was present we are now generating the serialized
+ // version into the "else" branch.
+ Builder.SetInsertPoint(ElseTI);
+
+ // Build calls __kmpc_serialized_parallel(&Ident, GTid);
+ Value *SerializedParallelCallArgs[] = {Ident, ThreadID};
+ Builder.CreateCall(
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_serialized_parallel),
+ SerializedParallelCallArgs);
+
+ // OutlinedFn(&GTid, &zero, CapturedStruct);
+ CI->removeFromParent();
+ Builder.Insert(CI);
+
+ // __kmpc_end_serialized_parallel(&Ident, GTid);
+ Value *EndArgs[] = {Ident, ThreadID};
+ Builder.CreateCall(
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_serialized_parallel),
+ EndArgs);
+
+ LLVM_DEBUG(dbgs() << "With serialized parallel region: "
+ << *Builder.GetInsertBlock()->getParent() << "\n");
+ }
+
+ for (Instruction *I : ToBeDeleted)
+ I->eraseFromParent();
+ };
+
+ // Adjust the finalization stack, verify the adjustment, and call the
+ // finalize function one last time to finalize values between the pre-fini
+ // block and the exit block if we left the parallel region "the normal way".
+ auto FiniInfo = FinalizationStack.pop_back_val();
+ (void)FiniInfo;
+ assert(FiniInfo.DK == OMPD_parallel &&
+ "Unexpected finalization stack state!");
+
+ Instruction *PRegPreFiniTI = PRegPreFiniBB->getTerminator();
+
+ InsertPointTy PreFiniIP(PRegPreFiniBB, PRegPreFiniTI->getIterator());
+ FiniCB(PreFiniIP);
+
+ OI.EntryBB = PRegEntryBB;
+ OI.ExitBB = PRegExitBB;
SmallPtrSet<BasicBlock *, 32> ParallelRegionBlockSet;
- SmallVector<BasicBlock *, 32> ParallelRegionBlocks, Worklist;
- ParallelRegionBlockSet.insert(PRegEntryBB);
- ParallelRegionBlockSet.insert(PRegExitBB);
+ SmallVector<BasicBlock *, 32> Blocks;
+ OI.collectBlocks(ParallelRegionBlockSet, Blocks);
- // Collect all blocks in-between PRegEntryBB and PRegExitBB.
- Worklist.push_back(PRegEntryBB);
- while (!Worklist.empty()) {
- BasicBlock *BB = Worklist.pop_back_val();
- ParallelRegionBlocks.push_back(BB);
- for (BasicBlock *SuccBB : successors(BB))
- if (ParallelRegionBlockSet.insert(SuccBB).second)
- Worklist.push_back(SuccBB);
- }
+ // Ensure a single exit node for the outlined region by creating one.
+ // We might have multiple incoming edges to the exit now due to finalizations,
+ // e.g., cancel calls that cause the control flow to leave the region.
+ BasicBlock *PRegOutlinedExitBB = PRegExitBB;
+ PRegExitBB = SplitBlock(PRegExitBB, &*PRegExitBB->getFirstInsertionPt());
+ PRegOutlinedExitBB->setName("omp.par.outlined.exit");
+ Blocks.push_back(PRegOutlinedExitBB);
CodeExtractorAnalysisCache CEAC(*OuterFn);
- CodeExtractor Extractor(ParallelRegionBlocks, /* DominatorTree */ nullptr,
+ CodeExtractor Extractor(Blocks, /* DominatorTree */ nullptr,
/* AggregateArgs */ false,
/* BlockFrequencyInfo */ nullptr,
/* BranchProbabilityInfo */ nullptr,
@@ -455,10 +650,10 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel(
Extractor.findAllocas(CEAC, SinkingCands, HoistingCands, CommonExit);
Extractor.findInputsOutputs(Inputs, Outputs, SinkingCands);
- LLVM_DEBUG(dbgs() << "Before privatization: " << *UI->getFunction() << "\n");
+ LLVM_DEBUG(dbgs() << "Before privatization: " << *OuterFn << "\n");
FunctionCallee TIDRTLFn =
- getOrCreateRuntimeFunction(OMPRTL___kmpc_global_thread_num);
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_global_thread_num);
auto PrivHelper = [&](Value &V) {
if (&V == TIDAddr || &V == ZeroAddr)
@@ -491,142 +686,443 @@ IRBuilder<>::InsertPoint OpenMPIRBuilder::CreateParallel(
LLVM_DEBUG(dbgs() << "Captured input: " << *Input << "\n");
PrivHelper(*Input);
}
- for (Value *Output : Outputs) {
- LLVM_DEBUG(dbgs() << "Captured output: " << *Output << "\n");
- PrivHelper(*Output);
- }
+ assert(Outputs.empty() &&
+ "OpenMP outlining should not produce live-out values!");
- LLVM_DEBUG(dbgs() << "After privatization: " << *UI->getFunction() << "\n");
+ LLVM_DEBUG(dbgs() << "After privatization: " << *OuterFn << "\n");
LLVM_DEBUG({
- for (auto *BB : ParallelRegionBlocks)
+ for (auto *BB : Blocks)
dbgs() << " PBR: " << BB->getName() << "\n";
});
- // Add some known attributes to the outlined function.
- Function *OutlinedFn = Extractor.extractCodeRegion(CEAC);
- OutlinedFn->addParamAttr(0, Attribute::NoAlias);
- OutlinedFn->addParamAttr(1, Attribute::NoAlias);
- OutlinedFn->addFnAttr(Attribute::NoUnwind);
- OutlinedFn->addFnAttr(Attribute::NoRecurse);
-
- LLVM_DEBUG(dbgs() << "After outlining: " << *UI->getFunction() << "\n");
- LLVM_DEBUG(dbgs() << " Outlined function: " << *OutlinedFn << "\n");
-
- // For compability with the clang CG we move the outlined function after the
- // one with the parallel region.
- OutlinedFn->removeFromParent();
- M.getFunctionList().insertAfter(OuterFn->getIterator(), OutlinedFn);
-
- // Remove the artificial entry introduced by the extractor right away, we
- // made our own entry block after all.
- {
- BasicBlock &ArtificialEntry = OutlinedFn->getEntryBlock();
- assert(ArtificialEntry.getUniqueSuccessor() == PRegEntryBB);
- assert(PRegEntryBB->getUniquePredecessor() == &ArtificialEntry);
- PRegEntryBB->moveBefore(&ArtificialEntry);
- ArtificialEntry.eraseFromParent();
- }
- LLVM_DEBUG(dbgs() << "PP Outlined function: " << *OutlinedFn << "\n");
- assert(&OutlinedFn->getEntryBlock() == PRegEntryBB);
+ // Register the outlined info.
+ addOutlineInfo(std::move(OI));
+
+ InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
+ UI->eraseFromParent();
- assert(OutlinedFn && OutlinedFn->getNumUses() == 1);
- assert(OutlinedFn->arg_size() >= 2 &&
- "Expected at least tid and bounded tid as arguments");
- unsigned NumCapturedVars = OutlinedFn->arg_size() - /* tid & bounded tid */ 2;
+ return AfterIP;
+}
- CallInst *CI = cast<CallInst>(OutlinedFn->user_back());
- CI->getParent()->setName("omp_parallel");
- Builder.SetInsertPoint(CI);
+void OpenMPIRBuilder::emitFlush(const LocationDescription &Loc) {
+ // Build call void __kmpc_flush(ident_t *loc)
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Args[] = {getOrCreateIdent(SrcLocStr)};
- // Build call __kmpc_fork_call(Ident, n, microtask, var1, .., varn);
- Value *ForkCallArgs[] = {Ident, Builder.getInt32(NumCapturedVars),
- Builder.CreateBitCast(OutlinedFn, ParallelTaskPtr)};
+ Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_flush), Args);
+}
- SmallVector<Value *, 16> RealArgs;
- RealArgs.append(std::begin(ForkCallArgs), std::end(ForkCallArgs));
- RealArgs.append(CI->arg_begin() + /* tid & bound tid */ 2, CI->arg_end());
+void OpenMPIRBuilder::CreateFlush(const LocationDescription &Loc) {
+ if (!updateToLocation(Loc))
+ return;
+ emitFlush(Loc);
+}
- FunctionCallee RTLFn = getOrCreateRuntimeFunction(OMPRTL___kmpc_fork_call);
- if (auto *F = dyn_cast<llvm::Function>(RTLFn.getCallee())) {
- if (!F->hasMetadata(llvm::LLVMContext::MD_callback)) {
- llvm::LLVMContext &Ctx = F->getContext();
- MDBuilder MDB(Ctx);
- // Annotate the callback behavior of the __kmpc_fork_call:
- // - The callback callee is argument number 2 (microtask).
- // - The first two arguments of the callback callee are unknown (-1).
- // - All variadic arguments to the __kmpc_fork_call are passed to the
- // callback callee.
- F->addMetadata(
- llvm::LLVMContext::MD_callback,
- *llvm::MDNode::get(Ctx, {MDB.createCallbackEncoding(
- 2, {-1, -1},
- /* VarArgsArePassed */ true)}));
+void OpenMPIRBuilder::emitTaskwaitImpl(const LocationDescription &Loc) {
+ // Build call kmp_int32 __kmpc_omp_taskwait(ident_t *loc, kmp_int32
+ // global_tid);
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *Args[] = {Ident, getOrCreateThreadID(Ident)};
+
+ // Ignore return result until untied tasks are supported.
+ Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskwait),
+ Args);
+}
+
+void OpenMPIRBuilder::CreateTaskwait(const LocationDescription &Loc) {
+ if (!updateToLocation(Loc))
+ return;
+ emitTaskwaitImpl(Loc);
+}
+
+void OpenMPIRBuilder::emitTaskyieldImpl(const LocationDescription &Loc) {
+ // Build call __kmpc_omp_taskyield(loc, thread_id, 0);
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Constant *I32Null = ConstantInt::getNullValue(Int32);
+ Value *Args[] = {Ident, getOrCreateThreadID(Ident), I32Null};
+
+ Builder.CreateCall(getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_omp_taskyield),
+ Args);
+}
+
+void OpenMPIRBuilder::CreateTaskyield(const LocationDescription &Loc) {
+ if (!updateToLocation(Loc))
+ return;
+ emitTaskyieldImpl(Loc);
+}
+
+OpenMPIRBuilder::InsertPointTy
+OpenMPIRBuilder::CreateMaster(const LocationDescription &Loc,
+ BodyGenCallbackTy BodyGenCB,
+ FinalizeCallbackTy FiniCB) {
+
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ Directive OMPD = Directive::OMPD_master;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Value *Args[] = {Ident, ThreadId};
+
+ Function *EntryRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_master);
+ Instruction *EntryCall = Builder.CreateCall(EntryRTLFn, Args);
+
+ Function *ExitRTLFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_master);
+ Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
+
+ return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
+ /*Conditional*/ true, /*hasFinalize*/ true);
+}
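// A minimal sketch of driving CreateMaster from a frontend, assuming `Loc`
// describes the current source location, `OMPBuilder` is this builder, and
// `Builder` is the IRBuilder it wraps; the callbacks only illustrate the
// expected shapes.
//
//   auto BodyGenCB = [&](OpenMPIRBuilder::InsertPointTy AllocaIP,
//                        OpenMPIRBuilder::InsertPointTy CodeGenIP,
//                        BasicBlock &FiniBB) {
//     Builder.restoreIP(CodeGenIP);
//     // ... emit the master region body ...
//   };
//   auto FiniCB = [&](OpenMPIRBuilder::InsertPointTy IP) {
//     // ... emit cleanups needed when leaving the region ...
//   };
//   Builder.restoreIP(OMPBuilder.CreateMaster(Loc, BodyGenCB, FiniCB));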
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::CreateCritical(
+ const LocationDescription &Loc, BodyGenCallbackTy BodyGenCB,
+ FinalizeCallbackTy FiniCB, StringRef CriticalName, Value *HintInst) {
+
+ if (!updateToLocation(Loc))
+ return Loc.IP;
+
+ Directive OMPD = Directive::OMPD_critical;
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Value *LockVar = getOMPCriticalRegionLock(CriticalName);
+ Value *Args[] = {Ident, ThreadId, LockVar};
+
+ SmallVector<llvm::Value *, 4> EnterArgs(std::begin(Args), std::end(Args));
+ Function *RTFn = nullptr;
+ if (HintInst) {
+ // Add Hint to entry Args and create call
+ EnterArgs.push_back(HintInst);
+ RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical_with_hint);
+ } else {
+ RTFn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_critical);
+ }
+ Instruction *EntryCall = Builder.CreateCall(RTFn, EnterArgs);
+
+ Function *ExitRTLFn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_end_critical);
+ Instruction *ExitCall = Builder.CreateCall(ExitRTLFn, Args);
+
+ return EmitOMPInlinedRegion(OMPD, EntryCall, ExitCall, BodyGenCB, FiniCB,
+ /*Conditional*/ false, /*hasFinalize*/ true);
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::EmitOMPInlinedRegion(
+ Directive OMPD, Instruction *EntryCall, Instruction *ExitCall,
+ BodyGenCallbackTy BodyGenCB, FinalizeCallbackTy FiniCB, bool Conditional,
+ bool HasFinalize) {
+
+ if (HasFinalize)
+ FinalizationStack.push_back({FiniCB, OMPD, /*IsCancellable*/ false});
+
+ // Create inlined region's entry and body blocks, in preparation
+ // for conditional creation
+ BasicBlock *EntryBB = Builder.GetInsertBlock();
+ Instruction *SplitPos = EntryBB->getTerminator();
+ if (!isa_and_nonnull<BranchInst>(SplitPos))
+ SplitPos = new UnreachableInst(Builder.getContext(), EntryBB);
+ BasicBlock *ExitBB = EntryBB->splitBasicBlock(SplitPos, "omp_region.end");
+ BasicBlock *FiniBB =
+ EntryBB->splitBasicBlock(EntryBB->getTerminator(), "omp_region.finalize");
+
+ Builder.SetInsertPoint(EntryBB->getTerminator());
+ emitCommonDirectiveEntry(OMPD, EntryCall, ExitBB, Conditional);
+
+ // generate body
+ BodyGenCB(/* AllocaIP */ InsertPointTy(),
+ /* CodeGenIP */ Builder.saveIP(), *FiniBB);
+
+ // If we didn't emit a branch to FiniBB during body generation, it means
+ // FiniBB is unreachable (e.g. while(1);). Stop generating all the
+ // unreachable blocks, and remove anything we are not going to use.
+ auto SkipEmittingRegion = FiniBB->hasNPredecessors(0);
+ if (SkipEmittingRegion) {
+ FiniBB->eraseFromParent();
+ ExitCall->eraseFromParent();
+ // Discard finalization if we have it.
+ if (HasFinalize) {
+ assert(!FinalizationStack.empty() &&
+ "Unexpected finalization stack state!");
+ FinalizationStack.pop_back();
}
+ } else {
+ // emit exit call and do any needed finalization.
+ auto FinIP = InsertPointTy(FiniBB, FiniBB->getFirstInsertionPt());
+ assert(FiniBB->getTerminator()->getNumSuccessors() == 1 &&
+ FiniBB->getTerminator()->getSuccessor(0) == ExitBB &&
+ "Unexpected control flow graph state!!");
+ emitCommonDirectiveExit(OMPD, FinIP, ExitCall, HasFinalize);
+ assert(FiniBB->getUniquePredecessor()->getUniqueSuccessor() == FiniBB &&
+ "Unexpected Control Flow State!");
+ MergeBlockIntoPredecessor(FiniBB);
}
- Builder.CreateCall(RTLFn, RealArgs);
+ // If we are skipping the region of a non-conditional directive, remove the
+ // exit block and clear the builder's insertion point.
+ assert(SplitPos->getParent() == ExitBB &&
+ "Unexpected Insertion point location!");
+ if (!Conditional && SkipEmittingRegion) {
+ ExitBB->eraseFromParent();
+ Builder.ClearInsertionPoint();
+ } else {
+ auto merged = MergeBlockIntoPredecessor(ExitBB);
+ BasicBlock *ExitPredBB = SplitPos->getParent();
+ auto InsertBB = merged ? ExitPredBB : ExitBB;
+ if (!isa_and_nonnull<BranchInst>(SplitPos))
+ SplitPos->eraseFromParent();
+ Builder.SetInsertPoint(InsertBB);
+ }
- LLVM_DEBUG(dbgs() << "With fork_call placed: "
- << *Builder.GetInsertBlock()->getParent() << "\n");
+ return Builder.saveIP();
+}
- InsertPointTy AfterIP(UI->getParent(), UI->getParent()->end());
- InsertPointTy ExitIP(PRegExitBB, PRegExitBB->end());
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveEntry(
+ Directive OMPD, Value *EntryCall, BasicBlock *ExitBB, bool Conditional) {
+
+ // If there is nothing to do, return the current insertion point.
+ if (!Conditional)
+ return Builder.saveIP();
+
+ BasicBlock *EntryBB = Builder.GetInsertBlock();
+ Value *CallBool = Builder.CreateIsNotNull(EntryCall);
+ auto *ThenBB = BasicBlock::Create(M.getContext(), "omp_region.body");
+ auto *UI = new UnreachableInst(Builder.getContext(), ThenBB);
+
+ // Emit thenBB and set the Builder's insertion point there for
+ // body generation next. Place the block after the current block.
+ Function *CurFn = EntryBB->getParent();
+ CurFn->getBasicBlockList().insertAfter(EntryBB->getIterator(), ThenBB);
+
+ // Move Entry branch to end of ThenBB, and replace with conditional
+ // branch (If-stmt)
+ Instruction *EntryBBTI = EntryBB->getTerminator();
+ Builder.CreateCondBr(CallBool, ThenBB, ExitBB);
+ EntryBBTI->removeFromParent();
+ Builder.SetInsertPoint(UI);
+ Builder.Insert(EntryBBTI);
UI->eraseFromParent();
+ Builder.SetInsertPoint(ThenBB->getTerminator());
+
+ // return an insertion point to ExitBB.
+ return IRBuilder<>::InsertPoint(ExitBB, ExitBB->getFirstInsertionPt());
+}
+
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::emitCommonDirectiveExit(
+ omp::Directive OMPD, InsertPointTy FinIP, Instruction *ExitCall,
+ bool HasFinalize) {
+
+ Builder.restoreIP(FinIP);
+
+ // If there is finalization to do, emit it before the exit call
+ if (HasFinalize) {
+ assert(!FinalizationStack.empty() &&
+ "Unexpected finalization stack state!");
+
+ FinalizationInfo Fi = FinalizationStack.pop_back_val();
+ assert(Fi.DK == OMPD && "Unexpected Directive for Finalization call!");
+
+ Fi.FiniCB(FinIP);
+
+ BasicBlock *FiniBB = FinIP.getBlock();
+ Instruction *FiniBBTI = FiniBB->getTerminator();
- // Initialize the local TID stack location with the argument value.
- Builder.SetInsertPoint(PrivTID);
- Function::arg_iterator OutlinedAI = OutlinedFn->arg_begin();
- Builder.CreateStore(Builder.CreateLoad(OutlinedAI), PrivTIDAddr);
+ // Set the Builder insertion point for the call creation.
+ Builder.SetInsertPoint(FiniBBTI);
+ }
+
+ // Place the exit call as the last instruction before the finalization block terminator.
+ ExitCall->removeFromParent();
+ Builder.Insert(ExitCall);
+
+ return IRBuilder<>::InsertPoint(ExitCall->getParent(),
+ ExitCall->getIterator());
+}
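// For a conditional directive such as 'master', the entry/exit helpers above
// shape the region roughly as follows (an IR sketch, before block merging):
//
//   %res  = call i32 @__kmpc_master(%struct.ident_t* %loc, i32 %tid)
//   %cond = icmp ne i32 %res, 0
//   br i1 %cond, label %omp_region.body, label %omp_region.end
//   ; ... omp_region.body and omp_region.finalize ...
//   call void @__kmpc_end_master(%struct.ident_t* %loc, i32 %tid)
//   br label %omp_region.end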
- // If no "if" clause was present we do not need the call created during
- // outlining, otherwise we reuse it in the serialized parallel region.
- if (!ElseTI) {
- CI->eraseFromParent();
+OpenMPIRBuilder::InsertPointTy OpenMPIRBuilder::CreateCopyinClauseBlocks(
+ InsertPointTy IP, Value *MasterAddr, Value *PrivateAddr,
+ llvm::IntegerType *IntPtrTy, bool BranchtoEnd) {
+ if (!IP.isSet())
+ return IP;
+
+ IRBuilder<>::InsertPointGuard IPG(Builder);
+
+ // Creates the following CFG structure:
+ // OMP_Entry : (MasterAddr != PrivateAddr)?
+ // F T
+ // | \
+ // | copyin.not.master
+ // | /
+ // v /
+ // copyin.not.master.end
+ // |
+ // v
+ // OMP.Entry.Next
+
+ BasicBlock *OMP_Entry = IP.getBlock();
+ Function *CurFn = OMP_Entry->getParent();
+ BasicBlock *CopyBegin =
+ BasicBlock::Create(M.getContext(), "copyin.not.master", CurFn);
+ BasicBlock *CopyEnd = nullptr;
+
+ // If the entry block is terminated, split it to preserve the branch to the
+ // following basic block (i.e. OMP.Entry.Next); otherwise, leave everything
+ // as is.
+ if (isa_and_nonnull<BranchInst>(OMP_Entry->getTerminator())) {
+ CopyEnd = OMP_Entry->splitBasicBlock(OMP_Entry->getTerminator(),
+ "copyin.not.master.end");
+ OMP_Entry->getTerminator()->eraseFromParent();
} else {
+ CopyEnd =
+ BasicBlock::Create(M.getContext(), "copyin.not.master.end", CurFn);
+ }
- // If an "if" clause was present we are now generating the serialized
- // version into the "else" branch.
- Builder.SetInsertPoint(ElseTI);
+ Builder.SetInsertPoint(OMP_Entry);
+ Value *MasterPtr = Builder.CreatePtrToInt(MasterAddr, IntPtrTy);
+ Value *PrivatePtr = Builder.CreatePtrToInt(PrivateAddr, IntPtrTy);
+ Value *cmp = Builder.CreateICmpNE(MasterPtr, PrivatePtr);
+ Builder.CreateCondBr(cmp, CopyBegin, CopyEnd);
- // Build calls __kmpc_serialized_parallel(&Ident, GTid);
- Value *SerializedParallelCallArgs[] = {Ident, ThreadID};
- Builder.CreateCall(
- getOrCreateRuntimeFunction(OMPRTL___kmpc_serialized_parallel),
- SerializedParallelCallArgs);
+ Builder.SetInsertPoint(CopyBegin);
+ if (BranchtoEnd)
+ Builder.SetInsertPoint(Builder.CreateBr(CopyEnd));
- // OutlinedFn(&GTid, &zero, CapturedStruct);
- CI->removeFromParent();
- Builder.Insert(CI);
+ return Builder.saveIP();
+}
- // __kmpc_end_serialized_parallel(&Ident, GTid);
- Value *EndArgs[] = {Ident, ThreadID};
- Builder.CreateCall(
- getOrCreateRuntimeFunction(OMPRTL___kmpc_end_serialized_parallel),
- EndArgs);
+CallInst *OpenMPIRBuilder::CreateOMPAlloc(const LocationDescription &Loc,
+ Value *Size, Value *Allocator,
+ std::string Name) {
+ IRBuilder<>::InsertPointGuard IPG(Builder);
+ Builder.restoreIP(Loc.IP);
- LLVM_DEBUG(dbgs() << "With serialized parallel region: "
- << *Builder.GetInsertBlock()->getParent() << "\n");
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Value *Args[] = {ThreadId, Size, Allocator};
+
+ Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_alloc);
+
+ return Builder.CreateCall(Fn, Args, Name);
+}
+
+CallInst *OpenMPIRBuilder::CreateOMPFree(const LocationDescription &Loc,
+ Value *Addr, Value *Allocator,
+ std::string Name) {
+ IRBuilder<>::InsertPointGuard IPG(Builder);
+ Builder.restoreIP(Loc.IP);
+
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Value *Args[] = {ThreadId, Addr, Allocator};
+ Function *Fn = getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_free);
+ return Builder.CreateCall(Fn, Args, Name);
+}
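// A minimal sketch of lowering an omp_alloc/omp_free pair through these two
// helpers, assuming `Loc` is the current location, `Size` is the byte count,
// and `Allocator` is an omp_allocator_handle_t value:
//
//   CallInst *Ptr = OMPBuilder.CreateOMPAlloc(Loc, Size, Allocator, "omp.mem");
//   // ... use Ptr ...
//   OMPBuilder.CreateOMPFree(Loc, Ptr, Allocator, "");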
+
+CallInst *OpenMPIRBuilder::CreateCachedThreadPrivate(
+ const LocationDescription &Loc, llvm::Value *Pointer,
+ llvm::ConstantInt *Size, const llvm::Twine &Name) {
+ IRBuilder<>::InsertPointGuard IPG(Builder);
+ Builder.restoreIP(Loc.IP);
+
+ Constant *SrcLocStr = getOrCreateSrcLocStr(Loc);
+ Value *Ident = getOrCreateIdent(SrcLocStr);
+ Value *ThreadId = getOrCreateThreadID(Ident);
+ Constant *ThreadPrivateCache =
+ getOrCreateOMPInternalVariable(Int8PtrPtr, Name);
+ llvm::Value *Args[] = {Ident, ThreadId, Pointer, Size, ThreadPrivateCache};
+
+ Function *Fn =
+ getOrCreateRuntimeFunctionPtr(OMPRTL___kmpc_threadprivate_cached);
+
+ return Builder.CreateCall(Fn, Args);
+}
+
+std::string OpenMPIRBuilder::getNameWithSeparators(ArrayRef<StringRef> Parts,
+ StringRef FirstSeparator,
+ StringRef Separator) {
+ SmallString<128> Buffer;
+ llvm::raw_svector_ostream OS(Buffer);
+ StringRef Sep = FirstSeparator;
+ for (StringRef Part : Parts) {
+ OS << Sep << Part;
+ Sep = Separator;
}
+ return OS.str().str();
+}
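// Worked example: the first part is prefixed with FirstSeparator and every
// following part with Separator, so
//   getNameWithSeparators({"gomp_critical_user_x", "var"}, ".", ".")
// returns ".gomp_critical_user_x.var", matching the critical-region lock name
// built below.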
- // Adjust the finalization stack, verify the adjustment, and call the
- // finalize function a last time to finalize values between the pre-fini block
- // and the exit block if we left the parallel "the normal way".
- auto FiniInfo = FinalizationStack.pop_back_val();
- (void)FiniInfo;
- assert(FiniInfo.DK == OMPD_parallel &&
- "Unexpected finalization stack state!");
+Constant *OpenMPIRBuilder::getOrCreateOMPInternalVariable(
+ llvm::Type *Ty, const llvm::Twine &Name, unsigned AddressSpace) {
+ // TODO: Replace the twine arg with stringref to get rid of the conversion
+ // logic. However, this is taken from the current implementation in clang as
+ // is. Since this method is used in many places exclusively for OMP internal
+ // use, we will keep it as is temporarily until we move all users to the
+ // builder and then, if possible, fix it everywhere in one go.
+ SmallString<256> Buffer;
+ llvm::raw_svector_ostream Out(Buffer);
+ Out << Name;
+ StringRef RuntimeName = Out.str();
+ auto &Elem = *InternalVars.try_emplace(RuntimeName, nullptr).first;
+ if (Elem.second) {
+ assert(Elem.second->getType()->getPointerElementType() == Ty &&
+ "OMP internal variable has different type than requested");
+ } else {
+ // TODO: investigate the appropriate linkage type used for the global
+ // variable for possibly changing that to internal or private, or maybe
+ // create different versions of the function for different OMP internal
+ // variables.
+ Elem.second = new llvm::GlobalVariable(
+ M, Ty, /*IsConstant*/ false, llvm::GlobalValue::CommonLinkage,
+ llvm::Constant::getNullValue(Ty), Elem.first(),
+ /*InsertBefore=*/nullptr, llvm::GlobalValue::NotThreadLocal,
+ AddressSpace);
+ }
- Instruction *PreFiniTI = PRegPreFiniBB->getTerminator();
- assert(PreFiniTI->getNumSuccessors() == 1 &&
- PreFiniTI->getSuccessor(0)->size() == 1 &&
- isa<ReturnInst>(PreFiniTI->getSuccessor(0)->getTerminator()) &&
- "Unexpected CFG structure!");
+ return Elem.second;
+}
- InsertPointTy PreFiniIP(PRegPreFiniBB, PreFiniTI->getIterator());
- FiniCB(PreFiniIP);
+Value *OpenMPIRBuilder::getOMPCriticalRegionLock(StringRef CriticalName) {
+ std::string Prefix = Twine("gomp_critical_user_", CriticalName).str();
+ std::string Name = getNameWithSeparators({Prefix, "var"}, ".", ".");
+ return getOrCreateOMPInternalVariable(KmpCriticalNameTy, Name);
+}
- for (Instruction *I : ToBeDeleted)
- I->eraseFromParent();
+// Create all simple and struct types exposed by the runtime and remember
+// their llvm::PointerTypes for easy access later.
+void OpenMPIRBuilder::initializeTypes(Module &M) {
+ LLVMContext &Ctx = M.getContext();
+ StructType *T;
+#define OMP_TYPE(VarName, InitValue) VarName = InitValue;
+#define OMP_ARRAY_TYPE(VarName, ElemTy, ArraySize) \
+ VarName##Ty = ArrayType::get(ElemTy, ArraySize); \
+ VarName##PtrTy = PointerType::getUnqual(VarName##Ty);
+#define OMP_FUNCTION_TYPE(VarName, IsVarArg, ReturnType, ...) \
+ VarName = FunctionType::get(ReturnType, {__VA_ARGS__}, IsVarArg); \
+ VarName##Ptr = PointerType::getUnqual(VarName);
+#define OMP_STRUCT_TYPE(VarName, StructName, ...) \
+ T = M.getTypeByName(StructName); \
+ if (!T) \
+ T = StructType::create(Ctx, {__VA_ARGS__}, StructName); \
+ VarName = T; \
+ VarName##Ptr = PointerType::getUnqual(T);
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+}
- return AfterIP;
+void OpenMPIRBuilder::OutlineInfo::collectBlocks(
+ SmallPtrSetImpl<BasicBlock *> &BlockSet,
+ SmallVectorImpl<BasicBlock *> &BlockVector) {
+ SmallVector<BasicBlock *, 32> Worklist;
+ BlockSet.insert(EntryBB);
+ BlockSet.insert(ExitBB);
+
+ Worklist.push_back(EntryBB);
+ while (!Worklist.empty()) {
+ BasicBlock *BB = Worklist.pop_back_val();
+ BlockVector.push_back(BB);
+ for (BasicBlock *SuccBB : successors(BB))
+ if (BlockSet.insert(SuccBB).second)
+ Worklist.push_back(SuccBB);
+ }
}
diff --git a/llvm/lib/FuzzMutate/FuzzerCLI.cpp b/llvm/lib/FuzzMutate/FuzzerCLI.cpp
index f2368ea7f26b..be0d5bfcab46 100644
--- a/llvm/lib/FuzzMutate/FuzzerCLI.cpp
+++ b/llvm/lib/FuzzMutate/FuzzerCLI.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/FuzzMutate/FuzzerCLI.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
@@ -36,7 +37,7 @@ void llvm::parseFuzzerCLOpts(int ArgC, char *ArgV[]) {
}
void llvm::handleExecNameEncodedBEOpts(StringRef ExecName) {
- std::vector<std::string> Args{ExecName};
+ std::vector<std::string> Args{std::string(ExecName)};
auto NameAndArgs = ExecName.split("--");
if (NameAndArgs.second.empty())
@@ -73,7 +74,7 @@ void llvm::handleExecNameEncodedBEOpts(StringRef ExecName) {
void llvm::handleExecNameEncodedOptimizerOpts(StringRef ExecName) {
// TODO: Refactor parts common with the 'handleExecNameEncodedBEOpts'
- std::vector<std::string> Args{ExecName};
+ std::vector<std::string> Args{std::string(ExecName)};
auto NameAndArgs = ExecName.split("--");
if (NameAndArgs.second.empty())
@@ -110,7 +111,7 @@ void llvm::handleExecNameEncodedOptimizerOpts(StringRef ExecName) {
} else if (Opt == "indvars") {
Args.push_back("-passes=indvars");
} else if (Opt == "strength_reduce") {
- Args.push_back("-passes=strength-reduce");
+ Args.push_back("-passes=loop-reduce");
} else if (Opt == "irce") {
Args.push_back("-passes=irce");
diff --git a/llvm/lib/FuzzMutate/Operations.cpp b/llvm/lib/FuzzMutate/Operations.cpp
index cf55d09caf7e..a37fd5454dd4 100644
--- a/llvm/lib/FuzzMutate/Operations.cpp
+++ b/llvm/lib/FuzzMutate/Operations.cpp
@@ -244,20 +244,24 @@ static SourcePred matchScalarInAggregate() {
static SourcePred validInsertValueIndex() {
auto Pred = [](ArrayRef<Value *> Cur, const Value *V) {
- auto *CTy = cast<CompositeType>(Cur[0]->getType());
if (auto *CI = dyn_cast<ConstantInt>(V))
- if (CI->getBitWidth() == 32 &&
- CTy->getTypeAtIndex(CI->getZExtValue()) == Cur[1]->getType())
- return true;
+ if (CI->getBitWidth() == 32) {
+ Type *Indexed = ExtractValueInst::getIndexedType(Cur[0]->getType(),
+ CI->getZExtValue());
+ return Indexed == Cur[1]->getType();
+ }
return false;
};
auto Make = [](ArrayRef<Value *> Cur, ArrayRef<Type *> Ts) {
std::vector<Constant *> Result;
auto *Int32Ty = Type::getInt32Ty(Cur[0]->getContext());
- auto *CTy = cast<CompositeType>(Cur[0]->getType());
- for (int I = 0, E = getAggregateNumElements(CTy); I < E; ++I)
- if (CTy->getTypeAtIndex(I) == Cur[1]->getType())
+ auto *BaseTy = Cur[0]->getType();
+ int I = 0;
+ while (Type *Indexed = ExtractValueInst::getIndexedType(BaseTy, I)) {
+ if (Indexed == Cur[1]->getType())
Result.push_back(ConstantInt::get(Int32Ty, I));
+ ++I;
+ }
return Result;
};
return {Pred, Make};
@@ -298,12 +302,12 @@ static SourcePred validShuffleVectorIndex() {
return ShuffleVectorInst::isValidOperands(Cur[0], Cur[1], V);
};
auto Make = [](ArrayRef<Value *> Cur, ArrayRef<Type *> Ts) {
- auto *FirstTy = cast<VectorType>(Cur[0]->getType());
+ auto *FirstTy = cast<FixedVectorType>(Cur[0]->getType());
auto *Int32Ty = Type::getInt32Ty(Cur[0]->getContext());
 // TODO: It's straightforward to make up reasonable values, but listing them
// exhaustively would be insane. Come up with a couple of sensible ones.
- return std::vector<Constant *>{
- UndefValue::get(VectorType::get(Int32Ty, FirstTy->getNumElements()))};
+ return std::vector<Constant *>{UndefValue::get(
+ FixedVectorType::get(Int32Ty, FirstTy->getNumElements()))};
};
return {Pred, Make};
}
diff --git a/llvm/lib/IR/AbstractCallSite.cpp b/llvm/lib/IR/AbstractCallSite.cpp
index 19b35665c3fa..6504e566ba4b 100644
--- a/llvm/lib/IR/AbstractCallSite.cpp
+++ b/llvm/lib/IR/AbstractCallSite.cpp
@@ -14,9 +14,8 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/IR/AbstractCallSite.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/Support/Debug.h"
using namespace llvm;
@@ -33,9 +32,9 @@ STATISTIC(NumInvalidAbstractCallSitesUnknownCallee,
STATISTIC(NumInvalidAbstractCallSitesNoCallback,
"Number of invalid abstract call sites created (no callback)");
-void AbstractCallSite::getCallbackUses(ImmutableCallSite ICS,
- SmallVectorImpl<const Use *> &CBUses) {
- const Function *Callee = ICS.getCalledFunction();
+void AbstractCallSite::getCallbackUses(
+ const CallBase &CB, SmallVectorImpl<const Use *> &CallbackUses) {
+ const Function *Callee = CB.getCalledFunction();
if (!Callee)
return;
@@ -48,56 +47,58 @@ void AbstractCallSite::getCallbackUses(ImmutableCallSite ICS,
auto *CBCalleeIdxAsCM = cast<ConstantAsMetadata>(OpMD->getOperand(0));
uint64_t CBCalleeIdx =
cast<ConstantInt>(CBCalleeIdxAsCM->getValue())->getZExtValue();
- CBUses.push_back(ICS.arg_begin() + CBCalleeIdx);
+ if (CBCalleeIdx < CB.arg_size())
+ CallbackUses.push_back(CB.arg_begin() + CBCalleeIdx);
}
}
/// Create an abstract call site from a use.
-AbstractCallSite::AbstractCallSite(const Use *U) : CS(U->getUser()) {
+AbstractCallSite::AbstractCallSite(const Use *U)
+ : CB(dyn_cast<CallBase>(U->getUser())) {
// First handle unknown users.
- if (!CS) {
+ if (!CB) {
// If the use is actually in a constant cast expression which itself
// has only one use, we look through the constant cast expression.
// This happens by updating the use @p U to the use of the constant
- // cast expression and afterwards re-initializing CS accordingly.
+ // cast expression and afterwards re-initializing CB accordingly.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U->getUser()))
- if (CE->getNumUses() == 1 && CE->isCast()) {
+ if (CE->hasOneUse() && CE->isCast()) {
U = &*CE->use_begin();
- CS = CallSite(U->getUser());
+ CB = dyn_cast<CallBase>(U->getUser());
}
- if (!CS) {
+ if (!CB) {
NumInvalidAbstractCallSitesUnknownUse++;
return;
}
}
// Then handle direct or indirect calls. Thus, if U is the callee of the
- // call site CS it is not a callback and we are done.
- if (CS.isCallee(U)) {
+ // call site CB it is not a callback and we are done.
+ if (CB->isCallee(U)) {
NumDirectAbstractCallSites++;
return;
}
// If we cannot identify the broker function we cannot create a callback and
// invalidate the abstract call site.
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CB->getCalledFunction();
if (!Callee) {
NumInvalidAbstractCallSitesUnknownCallee++;
- CS = CallSite();
+ CB = nullptr;
return;
}
MDNode *CallbackMD = Callee->getMetadata(LLVMContext::MD_callback);
if (!CallbackMD) {
NumInvalidAbstractCallSitesNoCallback++;
- CS = CallSite();
+ CB = nullptr;
return;
}
- unsigned UseIdx = CS.getArgumentNo(U);
+ unsigned UseIdx = CB->getArgOperandNo(U);
MDNode *CallbackEncMD = nullptr;
for (const MDOperand &Op : CallbackMD->operands()) {
MDNode *OpMD = cast<MDNode>(Op.get());
@@ -112,7 +113,7 @@ AbstractCallSite::AbstractCallSite(const Use *U) : CS(U->getUser()) {
if (!CallbackEncMD) {
NumInvalidAbstractCallSitesNoCallback++;
- CS = CallSite();
+ CB = nullptr;
return;
}
@@ -120,7 +121,7 @@ AbstractCallSite::AbstractCallSite(const Use *U) : CS(U->getUser()) {
assert(CallbackEncMD->getNumOperands() >= 2 && "Incomplete !callback metadata");
- unsigned NumCallOperands = CS.getNumArgOperands();
+ unsigned NumCallOperands = CB->getNumArgOperands();
// Skip the var-arg flag at the end when reading the metadata.
for (unsigned u = 0, e = CallbackEncMD->getNumOperands() - 1; u < e; u++) {
Metadata *OpAsM = CallbackEncMD->getOperand(u).get();
diff --git a/llvm/lib/IR/AsmWriter.cpp b/llvm/lib/IR/AsmWriter.cpp
index acf0e4afef27..fd08310316b3 100644
--- a/llvm/lib/IR/AsmWriter.cpp
+++ b/llvm/lib/IR/AsmWriter.cpp
@@ -228,9 +228,9 @@ static void predictValueUseListOrderImpl(const Value *V, const Function *F,
return LU->getOperandNo() > RU->getOperandNo();
});
- if (std::is_sorted(
- List.begin(), List.end(),
- [](const Entry &L, const Entry &R) { return L.second < R.second; }))
+ if (llvm::is_sorted(List, [](const Entry &L, const Entry &R) {
+ return L.second < R.second;
+ }))
// Order is already correct.
return;
@@ -462,6 +462,33 @@ static void PrintLLVMName(raw_ostream &OS, const Value *V) {
isa<GlobalValue>(V) ? GlobalPrefix : LocalPrefix);
}
+static void PrintShuffleMask(raw_ostream &Out, Type *Ty, ArrayRef<int> Mask) {
+ Out << ", <";
+ if (isa<ScalableVectorType>(Ty))
+ Out << "vscale x ";
+ Out << Mask.size() << " x i32> ";
+ bool FirstElt = true;
+ if (all_of(Mask, [](int Elt) { return Elt == 0; })) {
+ Out << "zeroinitializer";
+ } else if (all_of(Mask, [](int Elt) { return Elt == UndefMaskElem; })) {
+ Out << "undef";
+ } else {
+ Out << "<";
+ for (int Elt : Mask) {
+ if (FirstElt)
+ FirstElt = false;
+ else
+ Out << ", ";
+ Out << "i32 ";
+ if (Elt == UndefMaskElem)
+ Out << "undef";
+ else
+ Out << Elt;
+ }
+ Out << ">";
+ }
+}
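// For illustration, the shapes this helper prints (a sketch):
//   all-zero mask <0, 0, 0, 0>   ->  ", <4 x i32> zeroinitializer"
//   all-undef mask               ->  ", <4 x i32> undef"
//   mixed mask <0, undef, 2, 3>  ->  ", <4 x i32> <i32 0, i32 undef, i32 2, i32 3>"
// with "vscale x " inserted before the element count for scalable vectors.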
+
namespace {
class TypePrinting {
@@ -561,6 +588,7 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
switch (Ty->getTypeID()) {
case Type::VoidTyID: OS << "void"; return;
case Type::HalfTyID: OS << "half"; return;
+ case Type::BFloatTyID: OS << "bfloat"; return;
case Type::FloatTyID: OS << "float"; return;
case Type::DoubleTyID: OS << "double"; return;
case Type::X86_FP80TyID: OS << "x86_fp80"; return;
@@ -623,12 +651,14 @@ void TypePrinting::print(Type *Ty, raw_ostream &OS) {
OS << ']';
return;
}
- case Type::VectorTyID: {
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
VectorType *PTy = cast<VectorType>(Ty);
+ ElementCount EC = PTy->getElementCount();
OS << "<";
- if (PTy->isScalable())
+ if (EC.Scalable)
OS << "vscale x ";
- OS << PTy->getNumElements() << " x ";
+ OS << EC.Min << " x ";
print(PTy->getElementType(), OS);
OS << '>';
return;
@@ -783,7 +813,7 @@ public:
/// These functions do the actual initialization.
inline void initializeIfNeeded();
- void initializeIndexIfNeeded();
+ int initializeIndexIfNeeded();
// Implementation Details
private:
@@ -806,7 +836,8 @@ private:
/// Add all of the module level global variables (and their initializers)
/// and function declarations, but not the contents of those functions.
void processModule();
- void processIndex();
+ // Returns number of allocated slots
+ int processIndex();
/// Add all of the functions arguments, basic blocks, and instructions.
void processFunction();
@@ -920,11 +951,12 @@ inline void SlotTracker::initializeIfNeeded() {
processFunction();
}
-void SlotTracker::initializeIndexIfNeeded() {
+int SlotTracker::initializeIndexIfNeeded() {
if (!TheIndex)
- return;
- processIndex();
+ return 0;
+ int NumSlots = processIndex();
TheIndex = nullptr; ///< Prevent re-processing next time we're called.
+ return NumSlots;
}
// Iterate through all the global variables, functions, and global
@@ -1019,7 +1051,7 @@ void SlotTracker::processFunction() {
}
// Iterate through all the GUID in the index and create slots for them.
-void SlotTracker::processIndex() {
+int SlotTracker::processIndex() {
ST_DEBUG("begin processIndex!\n");
assert(TheIndex);
@@ -1038,17 +1070,17 @@ void SlotTracker::processIndex() {
for (auto &GlobalList : *TheIndex)
CreateGUIDSlot(GlobalList.first);
+ for (auto &TId : TheIndex->typeIdCompatibleVtableMap())
+ CreateGUIDSlot(GlobalValue::getGUID(TId.first));
+
// Start numbering the TypeIds after the GUIDs.
TypeIdNext = GUIDNext;
-
for (auto TidIter = TheIndex->typeIds().begin();
TidIter != TheIndex->typeIds().end(); TidIter++)
CreateTypeIdSlot(TidIter->second.first);
- for (auto &TId : TheIndex->typeIdCompatibleVtableMap())
- CreateGUIDSlot(GlobalValue::getGUID(TId.first));
-
ST_DEBUG("end processIndex!\n");
+ return TypeIdNext;
}
void SlotTracker::processGlobalObjectMetadata(const GlobalObject &GO) {
@@ -1348,7 +1380,7 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
return;
}
- // Either half, or some form of long double.
+ // Either half, bfloat or some form of long double.
// These appear as a magic letter identifying the type, then a
// fixed number of hex digits.
Out << "0x";
@@ -1376,6 +1408,10 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
Out << 'H';
Out << format_hex_no_prefix(API.getZExtValue(), 4,
/*Upper=*/true);
+ } else if (&APF.getSemantics() == &APFloat::BFloat()) {
+ Out << 'R';
+ Out << format_hex_no_prefix(API.getZExtValue(), 4,
+ /*Upper=*/true);
} else
llvm_unreachable("Unsupported floating point type");
return;
@@ -1475,13 +1511,14 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
}
if (isa<ConstantVector>(CV) || isa<ConstantDataVector>(CV)) {
- Type *ETy = CV->getType()->getVectorElementType();
+ auto *CVVTy = cast<VectorType>(CV->getType());
+ Type *ETy = CVVTy->getElementType();
Out << '<';
TypePrinter.print(ETy, Out);
Out << ' ';
WriteAsOperandInternal(Out, CV->getAggregateElement(0U), &TypePrinter,
Machine, Context);
- for (unsigned i = 1, e = CV->getType()->getVectorNumElements(); i != e;++i){
+ for (unsigned i = 1, e = CVVTy->getNumElements(); i != e; ++i) {
Out << ", ";
TypePrinter.print(ETy, Out);
Out << ' ';
@@ -1545,6 +1582,9 @@ static void WriteConstantInternal(raw_ostream &Out, const Constant *CV,
TypePrinter.print(CE->getType(), Out);
}
+ if (CE->getOpcode() == Instruction::ShuffleVector)
+ PrintShuffleMask(Out, CE->getType(), CE->getShuffleMask());
+
Out << ')';
return;
}
@@ -1614,6 +1654,8 @@ struct MDFieldPrinter {
bool ShouldSkipNull = true);
template <class IntTy>
void printInt(StringRef Name, IntTy Int, bool ShouldSkipZero = true);
+ void printAPInt(StringRef Name, const APInt &Int, bool IsUnsigned,
+ bool ShouldSkipZero);
void printBool(StringRef Name, bool Value, Optional<bool> Default = None);
void printDIFlags(StringRef Name, DINode::DIFlags Flags);
void printDISPFlags(StringRef Name, DISubprogram::DISPFlags Flags);
@@ -1689,6 +1731,15 @@ void MDFieldPrinter::printInt(StringRef Name, IntTy Int, bool ShouldSkipZero) {
Out << FS << Name << ": " << Int;
}
+void MDFieldPrinter::printAPInt(StringRef Name, const APInt &Int,
+ bool IsUnsigned, bool ShouldSkipZero) {
+ if (ShouldSkipZero && Int.isNullValue())
+ return;
+
+ Out << FS << Name << ": ";
+ Int.print(Out, !IsUnsigned);
+}
+
void MDFieldPrinter::printBool(StringRef Name, bool Value,
Optional<bool> Default) {
if (Default && Value == *Default)
@@ -1807,9 +1858,34 @@ static void writeDISubrange(raw_ostream &Out, const DISubrange *N,
if (auto *CE = N->getCount().dyn_cast<ConstantInt*>())
Printer.printInt("count", CE->getSExtValue(), /* ShouldSkipZero */ false);
else
- Printer.printMetadata("count", N->getCount().dyn_cast<DIVariable*>(),
- /*ShouldSkipNull */ false);
- Printer.printInt("lowerBound", N->getLowerBound());
+ Printer.printMetadata("count", N->getCount().dyn_cast<DIVariable *>(),
+ /*ShouldSkipNull */ true);
+
+ // A lowerBound of constant 0 should not be skipped, since it is different
+ // from an unspecified lower bound (= nullptr).
+ auto *LBound = N->getRawLowerBound();
+ if (auto *LE = dyn_cast_or_null<ConstantAsMetadata>(LBound)) {
+ auto *LV = cast<ConstantInt>(LE->getValue());
+ Printer.printInt("lowerBound", LV->getSExtValue(),
+ /* ShouldSkipZero */ false);
+ } else
+ Printer.printMetadata("lowerBound", LBound, /*ShouldSkipNull */ true);
+
+ auto *UBound = N->getRawUpperBound();
+ if (auto *UE = dyn_cast_or_null<ConstantAsMetadata>(UBound)) {
+ auto *UV = cast<ConstantInt>(UE->getValue());
+ Printer.printInt("upperBound", UV->getSExtValue(),
+ /* ShouldSkipZero */ false);
+ } else
+ Printer.printMetadata("upperBound", UBound, /*ShouldSkipNull */ true);
+
+ auto *Stride = N->getRawStride();
+ if (auto *SE = dyn_cast_or_null<ConstantAsMetadata>(Stride)) {
+ auto *SV = cast<ConstantInt>(SE->getValue());
+ Printer.printInt("stride", SV->getSExtValue(), /* ShouldSkipZero */ false);
+ } else
+ Printer.printMetadata("stride", Stride, /*ShouldSkipNull */ true);
+
Out << ")";
}
@@ -1818,13 +1894,10 @@ static void writeDIEnumerator(raw_ostream &Out, const DIEnumerator *N,
Out << "!DIEnumerator(";
MDFieldPrinter Printer(Out);
Printer.printString("name", N->getName(), /* ShouldSkipEmpty */ false);
- if (N->isUnsigned()) {
- auto Value = static_cast<uint64_t>(N->getValue());
- Printer.printInt("value", Value, /* ShouldSkipZero */ false);
+ Printer.printAPInt("value", N->getValue(), N->isUnsigned(),
+ /*ShouldSkipZero=*/false);
+ if (N->isUnsigned())
Printer.printBool("isUnsigned", true);
- } else {
- Printer.printInt("value", N->getValue(), /* ShouldSkipZero */ false);
- }
Out << ")";
}
@@ -1888,6 +1961,7 @@ static void writeDICompositeType(raw_ostream &Out, const DICompositeType *N,
Printer.printMetadata("templateParams", N->getRawTemplateParams());
Printer.printString("identifier", N->getIdentifier());
Printer.printMetadata("discriminator", N->getRawDiscriminator());
+ Printer.printMetadata("dataLocation", N->getRawDataLocation());
Out << ")";
}
@@ -1945,6 +2019,8 @@ static void writeDICompileUnit(raw_ostream &Out, const DICompileUnit *N,
false);
Printer.printNameTableKind("nameTableKind", N->getNameTableKind());
Printer.printBool("rangesBaseAddress", N->getRangesBaseAddress(), false);
+ Printer.printString("sysroot", N->getSysRoot());
+ Printer.printString("sdk", N->getSDK());
Out << ")";
}
@@ -2057,7 +2133,9 @@ static void writeDIModule(raw_ostream &Out, const DIModule *N,
Printer.printString("name", N->getName());
Printer.printString("configMacros", N->getConfigurationMacros());
Printer.printString("includePath", N->getIncludePath());
- Printer.printString("sysroot", N->getSysRoot());
+ Printer.printString("apinotes", N->getAPINotesFile());
+ Printer.printMetadata("file", N->getRawFile());
+ Printer.printInt("line", N->getLineNo());
Out << ")";
}
@@ -2071,6 +2149,7 @@ static void writeDITemplateTypeParameter(raw_ostream &Out,
MDFieldPrinter Printer(Out, TypePrinter, Machine, Context);
Printer.printString("name", N->getName());
Printer.printMetadata("type", N->getRawType(), /* ShouldSkipNull */ false);
+ Printer.printBool("defaulted", N->isDefault(), /* Default= */ false);
Out << ")";
}
@@ -2085,6 +2164,7 @@ static void writeDITemplateValueParameter(raw_ostream &Out,
Printer.printTag(N);
Printer.printString("name", N->getName());
Printer.printMetadata("type", N->getRawType());
+ Printer.printBool("defaulted", N->isDefault(), /* Default= */ false);
Printer.printMetadata("value", N->getValue(), /* ShouldSkipNull */ false);
Out << ")";
}
@@ -2430,10 +2510,10 @@ public:
void printTypeIdInfo(const FunctionSummary::TypeIdInfo &TIDInfo);
void printVFuncId(const FunctionSummary::VFuncId VFId);
void
- printNonConstVCalls(const std::vector<FunctionSummary::VFuncId> VCallList,
+ printNonConstVCalls(const std::vector<FunctionSummary::VFuncId> &VCallList,
const char *Tag);
void
- printConstVCalls(const std::vector<FunctionSummary::ConstVCall> VCallList,
+ printConstVCalls(const std::vector<FunctionSummary::ConstVCall> &VCallList,
const char *Tag);
private:
@@ -2651,8 +2731,10 @@ void AssemblyWriter::printModule(const Module *M) {
printUseLists(nullptr);
// Output all of the functions.
- for (const Function &F : *M)
+ for (const Function &F : *M) {
+ Out << '\n';
printFunction(&F);
+ }
assert(UseListOrders.empty() && "All use-lists should have been consumed");
// Output all attribute groups.
@@ -2676,21 +2758,22 @@ void AssemblyWriter::printModule(const Module *M) {
void AssemblyWriter::printModuleSummaryIndex() {
assert(TheIndex);
- Machine.initializeIndexIfNeeded();
+ int NumSlots = Machine.initializeIndexIfNeeded();
Out << "\n";
// Print module path entries. To print in order, add paths to a vector
// indexed by module slot.
std::vector<std::pair<std::string, ModuleHash>> moduleVec;
- std::string RegularLTOModuleName = "[Regular LTO]";
+ std::string RegularLTOModuleName =
+ ModuleSummaryIndex::getRegularLTOModuleName();
moduleVec.resize(TheIndex->modulePaths().size());
for (auto &ModPath : TheIndex->modulePaths())
moduleVec[Machine.getModulePathSlot(ModPath.first())] = std::make_pair(
// A module id of -1 is a special entry for a regular LTO module created
// during the thin link.
ModPath.second.first == -1u ? RegularLTOModuleName
- : (std::string)ModPath.first(),
+ : (std::string)std::string(ModPath.first()),
ModPath.second.second);
unsigned i = 0;
@@ -2737,6 +2820,15 @@ void AssemblyWriter::printModuleSummaryIndex() {
printTypeIdCompatibleVtableSummary(TId.second);
Out << ") ; guid = " << GUID << "\n";
}
+
+ // Don't emit flags when it's not really needed (value is zero by default).
+ if (TheIndex->getFlags()) {
+ Out << "^" << NumSlots << " = flags: " << TheIndex->getFlags() << "\n";
+ ++NumSlots;
+ }
+
+ Out << "^" << NumSlots << " = blockcount: " << TheIndex->getBlockCount()
+ << "\n";
}
static const char *
@@ -2769,6 +2861,8 @@ static const char *getWholeProgDevirtResByArgKindName(
static const char *getTTResKindName(TypeTestResolution::Kind K) {
switch (K) {
+ case TypeTestResolution::Unknown:
+ return "unknown";
case TypeTestResolution::Unsat:
return "unsat";
case TypeTestResolution::ByteArray:
@@ -2900,10 +2994,15 @@ void AssemblyWriter::printAliasSummary(const AliasSummary *AS) {
}
void AssemblyWriter::printGlobalVarSummary(const GlobalVarSummary *GS) {
+ auto VTableFuncs = GS->vTableFuncs();
Out << ", varFlags: (readonly: " << GS->VarFlags.MaybeReadOnly << ", "
- << "writeonly: " << GS->VarFlags.MaybeWriteOnly << ")";
+ << "writeonly: " << GS->VarFlags.MaybeWriteOnly << ", "
+ << "constant: " << GS->VarFlags.Constant;
+ if (!VTableFuncs.empty())
+ Out << ", "
+ << "vcall_visibility: " << GS->VarFlags.VCallVisibility;
+ Out << ")";
- auto VTableFuncs = GS->vTableFuncs();
if (!VTableFuncs.empty()) {
Out << ", vTableFuncs: (";
FieldSeparator FS;
@@ -2986,6 +3085,36 @@ void AssemblyWriter::printFunctionSummary(const FunctionSummary *FS) {
if (const auto *TIdInfo = FS->getTypeIdInfo())
printTypeIdInfo(*TIdInfo);
+
+ auto PrintRange = [&](const ConstantRange &Range) {
+ Out << "[" << Range.getLower() << ", " << Range.getSignedMax() << "]";
+ };
+
+ if (!FS->paramAccesses().empty()) {
+ Out << ", params: (";
+ FieldSeparator IFS;
+ for (auto &PS : FS->paramAccesses()) {
+ Out << IFS;
+ Out << "(param: " << PS.ParamNo;
+ Out << ", offset: ";
+ PrintRange(PS.Use);
+ if (!PS.Calls.empty()) {
+ Out << ", calls: (";
+ FieldSeparator IFS;
+ for (auto &Call : PS.Calls) {
+ Out << IFS;
+ Out << "(callee: ^" << Machine.getGUIDSlot(Call.Callee);
+ Out << ", param: " << Call.ParamNo;
+ Out << ", offset: ";
+ PrintRange(Call.Offsets);
+ Out << ")";
+ }
+ Out << ")";
+ }
+ Out << ")";
+ }
+ Out << ")";
+ }
}
void AssemblyWriter::printTypeIdInfo(
@@ -3057,7 +3186,7 @@ void AssemblyWriter::printVFuncId(const FunctionSummary::VFuncId VFId) {
}
void AssemblyWriter::printNonConstVCalls(
- const std::vector<FunctionSummary::VFuncId> VCallList, const char *Tag) {
+ const std::vector<FunctionSummary::VFuncId> &VCallList, const char *Tag) {
Out << Tag << ": (";
FieldSeparator FS;
for (auto &VFuncId : VCallList) {
@@ -3068,7 +3197,8 @@ void AssemblyWriter::printNonConstVCalls(
}
void AssemblyWriter::printConstVCalls(
- const std::vector<FunctionSummary::ConstVCall> VCallList, const char *Tag) {
+ const std::vector<FunctionSummary::ConstVCall> &VCallList,
+ const char *Tag) {
Out << Tag << ": (";
FieldSeparator FS;
for (auto &ConstVCall : VCallList) {
@@ -3200,11 +3330,7 @@ static void PrintVisibility(GlobalValue::VisibilityTypes Vis,
static void PrintDSOLocation(const GlobalValue &GV,
formatted_raw_ostream &Out) {
- // GVs with local linkage or non default visibility are implicitly dso_local,
- // so we don't print it.
- bool Implicit = GV.hasLocalLinkage() ||
- (!GV.hasExternalWeakLinkage() && !GV.hasDefaultVisibility());
- if (GV.isDSOLocal() && !Implicit)
+ if (GV.isDSOLocal() && !GV.isImplicitDSOLocal())
Out << "dso_local ";
}
@@ -3850,7 +3976,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
PrintCallingConv(CI->getCallingConv(), Out);
}
- Operand = CI->getCalledValue();
+ Operand = CI->getCalledOperand();
FunctionType *FTy = CI->getFunctionType();
Type *RetTy = FTy->getReturnType();
const AttributeList &PAL = CI->getAttributes();
@@ -3889,7 +4015,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
writeOperandBundles(CI);
} else if (const InvokeInst *II = dyn_cast<InvokeInst>(&I)) {
- Operand = II->getCalledValue();
+ Operand = II->getCalledOperand();
FunctionType *FTy = II->getFunctionType();
Type *RetTy = FTy->getReturnType();
const AttributeList &PAL = II->getAttributes();
@@ -3932,7 +4058,7 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
Out << " unwind ";
writeOperand(II->getUnwindDest(), true);
} else if (const CallBrInst *CBI = dyn_cast<CallBrInst>(&I)) {
- Operand = CBI->getCalledValue();
+ Operand = CBI->getCalledOperand();
FunctionType *FTy = CBI->getFunctionType();
Type *RetTy = FTy->getReturnType();
const AttributeList &PAL = CBI->getAttributes();
@@ -4079,6 +4205,8 @@ void AssemblyWriter::printInstruction(const Instruction &I) {
RMWI->getSyncScopeID());
} else if (const FenceInst *FI = dyn_cast<FenceInst>(&I)) {
writeAtomic(FI->getContext(), FI->getOrdering(), FI->getSyncScopeID());
+ } else if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(&I)) {
+ PrintShuffleMask(Out, SVI->getType(), SVI->getShuffleMask());
}
// Print Metadata info.
@@ -4140,9 +4268,16 @@ void AssemblyWriter::writeAttribute(const Attribute &Attr, bool InAttrGroup) {
return;
}
- assert(Attr.hasAttribute(Attribute::ByVal) && "unexpected type attr");
+ assert((Attr.hasAttribute(Attribute::ByVal) ||
+ Attr.hasAttribute(Attribute::Preallocated)) &&
+ "unexpected type attr");
+
+ if (Attr.hasAttribute(Attribute::ByVal)) {
+ Out << "byval";
+ } else {
+ Out << "preallocated";
+ }
- Out << "byval";
if (Type *Ty = Attr.getValueAsType()) {
Out << '(';
TypePrinter.print(Ty, Out);
@@ -4228,6 +4363,17 @@ void Function::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
W.printFunction(this);
}
+void BasicBlock::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
+ bool ShouldPreserveUseListOrder,
+ bool IsForDebug) const {
+ SlotTracker SlotTable(this->getModule());
+ formatted_raw_ostream OS(ROS);
+ AssemblyWriter W(OS, SlotTable, this->getModule(), AAW,
+ IsForDebug,
+ ShouldPreserveUseListOrder);
+ W.printBasicBlock(this);
+}
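A minimal usage sketch for the new overload, assuming an existing in-memory llvm::Function; the arguments are passed explicitly to match the definition above.

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Function.h"
    #include "llvm/Support/raw_ostream.h"

    // Print just the entry block instead of the whole enclosing function.
    void dumpEntryBlock(const llvm::Function &F) {
      const llvm::BasicBlock &BB = F.getEntryBlock();
      BB.print(llvm::errs(), /*AAW=*/nullptr,
               /*ShouldPreserveUseListOrder=*/false, /*IsForDebug=*/false);
    }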
+
void Module::print(raw_ostream &ROS, AssemblyAnnotationWriter *AAW,
bool ShouldPreserveUseListOrder, bool IsForDebug) const {
SlotTracker SlotTable(this);
diff --git a/llvm/lib/IR/AttributeImpl.h b/llvm/lib/IR/AttributeImpl.h
index 15e488bbb13b..5c334348cde3 100644
--- a/llvm/lib/IR/AttributeImpl.h
+++ b/llvm/lib/IR/AttributeImpl.h
@@ -16,6 +16,7 @@
#define LLVM_LIB_IR_ATTRIBUTEIMPL_H
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
@@ -53,8 +54,6 @@ public:
AttributeImpl(const AttributeImpl &) = delete;
AttributeImpl &operator=(const AttributeImpl &) = delete;
- virtual ~AttributeImpl();
-
bool isEnumAttribute() const { return KindID == EnumAttrEntry; }
bool isIntAttribute() const { return KindID == IntAttrEntry; }
bool isStringAttribute() const { return KindID == StringAttrEntry; }
@@ -103,6 +102,9 @@ public:
}
};
+static_assert(std::is_trivially_destructible<AttributeImpl>::value,
+ "AttributeImpl should be trivially destructible");
+
//===----------------------------------------------------------------------===//
/// \class
/// A set of classes that contain the value of the
@@ -111,8 +113,6 @@ public:
/// attribute enties, which are for target-dependent attributes.
class EnumAttributeImpl : public AttributeImpl {
- virtual void anchor();
-
Attribute::AttrKind Kind;
protected:
@@ -129,38 +129,53 @@ public:
class IntAttributeImpl : public EnumAttributeImpl {
uint64_t Val;
- void anchor() override;
-
public:
IntAttributeImpl(Attribute::AttrKind Kind, uint64_t Val)
: EnumAttributeImpl(IntAttrEntry, Kind), Val(Val) {
- assert((Kind == Attribute::Alignment || Kind == Attribute::StackAlignment ||
- Kind == Attribute::Dereferenceable ||
- Kind == Attribute::DereferenceableOrNull ||
- Kind == Attribute::AllocSize) &&
+ assert(Attribute::doesAttrKindHaveArgument(Kind) &&
"Wrong kind for int attribute!");
}
uint64_t getValue() const { return Val; }
};
-class StringAttributeImpl : public AttributeImpl {
- virtual void anchor();
+class StringAttributeImpl final
+ : public AttributeImpl,
+ private TrailingObjects<StringAttributeImpl, char> {
+ friend TrailingObjects;
- std::string Kind;
- std::string Val;
+ unsigned KindSize;
+ unsigned ValSize;
+ size_t numTrailingObjects(OverloadToken<char>) const {
+ return KindSize + 1 + ValSize + 1;
+ }
public:
StringAttributeImpl(StringRef Kind, StringRef Val = StringRef())
- : AttributeImpl(StringAttrEntry), Kind(Kind), Val(Val) {}
+ : AttributeImpl(StringAttrEntry), KindSize(Kind.size()),
+ ValSize(Val.size()) {
+ char *TrailingString = getTrailingObjects<char>();
+ // Some users rely on zero-termination.
+ llvm::copy(Kind, TrailingString);
+ TrailingString[KindSize] = '\0';
+ llvm::copy(Val, &TrailingString[KindSize + 1]);
+ TrailingString[KindSize + 1 + ValSize] = '\0';
+ }
- StringRef getStringKind() const { return Kind; }
- StringRef getStringValue() const { return Val; }
+ StringRef getStringKind() const {
+ return StringRef(getTrailingObjects<char>(), KindSize);
+ }
+ StringRef getStringValue() const {
+ return StringRef(getTrailingObjects<char>() + KindSize + 1, ValSize);
+ }
+
+ static size_t totalSizeToAlloc(StringRef Kind, StringRef Val) {
+ return TrailingObjects::totalSizeToAlloc<char>(Kind.size() + 1 +
+ Val.size() + 1);
+ }
};
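A standalone sketch of the size math behind totalSizeToAlloc above: the kind and value strings are laid out back-to-back after the object, each followed by a terminating NUL (the example strings are arbitrary).

    #include <cstddef>
    #include <string_view>

    // Mirrors numTrailingObjects()/totalSizeToAlloc(): KindSize + 1 + ValSize + 1
    // extra bytes are co-allocated behind the StringAttributeImpl object.
    constexpr std::size_t trailingBytes(std::string_view Kind, std::string_view Val) {
      return Kind.size() + 1 + Val.size() + 1;
    }
    static_assert(trailingBytes("frame-pointer", "all") == 18,
                  "13 kind chars + NUL + 3 value chars + NUL");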
class TypeAttributeImpl : public EnumAttributeImpl {
- void anchor() override;
-
Type *Ty;
public:
@@ -170,6 +185,22 @@ public:
Type *getTypeValue() const { return Ty; }
};
+class AttributeBitSet {
+ /// Bitset with a bit for each available attribute Attribute::AttrKind.
+ uint8_t AvailableAttrs[12] = {};
+ static_assert(Attribute::EndAttrKinds <= sizeof(AvailableAttrs) * CHAR_BIT,
+ "Too many attributes");
+
+public:
+ bool hasAttribute(Attribute::AttrKind Kind) const {
+ return AvailableAttrs[Kind / 8] & (1 << (Kind % 8));
+ }
+
+ void addAttribute(Attribute::AttrKind Kind) {
+ AvailableAttrs[Kind / 8] |= 1 << (Kind % 8);
+ }
+};
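A standalone sketch of the bit indexing used by AttributeBitSet; the kind value 70 is an arbitrary placeholder, not a real Attribute::AttrKind number.

    #include <cstdint>

    // Same packing as AttributeBitSet: eight attribute kinds per byte.
    struct TinyBitSet {
      uint8_t Bytes[12] = {};
      void set(unsigned Kind) { Bytes[Kind / 8] |= 1 << (Kind % 8); }
      bool test(unsigned Kind) const { return Bytes[Kind / 8] & (1 << (Kind % 8)); }
    };

    int main() {
      TinyBitSet S;
      S.set(70);                  // lands in byte 8, bit 6
      return S.test(70) ? 0 : 1;  // exits 0: the bit is set
    }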
+
//===----------------------------------------------------------------------===//
/// \class
/// This class represents a group of attributes that apply to one
@@ -180,11 +211,16 @@ class AttributeSetNode final
friend TrailingObjects;
unsigned NumAttrs; ///< Number of attributes in this node.
- /// Bitset with a bit for each available attribute Attribute::AttrKind.
- uint8_t AvailableAttrs[12] = {};
+ AttributeBitSet AvailableAttrs; ///< Available enum attributes.
+
+ DenseMap<StringRef, Attribute> StringAttrs;
AttributeSetNode(ArrayRef<Attribute> Attrs);
+ static AttributeSetNode *getSorted(LLVMContext &C,
+ ArrayRef<Attribute> SortedAttrs);
+ Optional<Attribute> findEnumAttribute(Attribute::AttrKind Kind) const;
+
public:
// AttributesSetNode is uniqued, these should not be available.
AttributeSetNode(const AttributeSetNode &) = delete;
@@ -200,7 +236,7 @@ public:
unsigned getNumAttributes() const { return NumAttrs; }
bool hasAttribute(Attribute::AttrKind Kind) const {
- return AvailableAttrs[Kind / 8] & ((uint64_t)1) << (Kind % 8);
+ return AvailableAttrs.hasAttribute(Kind);
}
bool hasAttribute(StringRef Kind) const;
bool hasAttributes() const { return NumAttrs != 0; }
@@ -215,6 +251,7 @@ public:
std::pair<unsigned, Optional<unsigned>> getAllocSizeArgs() const;
std::string getAsString(bool InAttrGrp) const;
Type *getByValType() const;
+ Type *getPreallocatedType() const;
using iterator = const Attribute *;
@@ -231,8 +268,6 @@ public:
}
};
-using IndexAttrPair = std::pair<unsigned, AttributeSet>;
-
//===----------------------------------------------------------------------===//
/// \class
/// This class represents a set of attributes that apply to the function,
@@ -244,32 +279,34 @@ class AttributeListImpl final
friend TrailingObjects;
private:
- LLVMContext &Context;
unsigned NumAttrSets; ///< Number of entries in this set.
- /// Bitset with a bit for each available attribute Attribute::AttrKind.
- uint8_t AvailableFunctionAttrs[12] = {};
+ /// Available enum function attributes.
+ AttributeBitSet AvailableFunctionAttrs;
+ /// Union of enum attributes available at any index.
+ AttributeBitSet AvailableSomewhereAttrs;
// Helper fn for TrailingObjects class.
size_t numTrailingObjects(OverloadToken<AttributeSet>) { return NumAttrSets; }
public:
- AttributeListImpl(LLVMContext &C, ArrayRef<AttributeSet> Sets);
+ AttributeListImpl(ArrayRef<AttributeSet> Sets);
// AttributesSetImpt is uniqued, these should not be available.
AttributeListImpl(const AttributeListImpl &) = delete;
AttributeListImpl &operator=(const AttributeListImpl &) = delete;
- void operator delete(void *p) { ::operator delete(p); }
-
- /// Get the context that created this AttributeListImpl.
- LLVMContext &getContext() { return Context; }
-
/// Return true if the AttributeSet or the FunctionIndex has an
/// enum attribute of the given kind.
bool hasFnAttribute(Attribute::AttrKind Kind) const {
- return AvailableFunctionAttrs[Kind / 8] & ((uint64_t)1) << (Kind % 8);
+ return AvailableFunctionAttrs.hasAttribute(Kind);
}
+ /// Return true if the specified attribute is set for at least one
+ /// parameter or for the return value. If Index is not nullptr, the index
+ /// of a parameter with the specified attribute is provided.
+ bool hasAttrSomewhere(Attribute::AttrKind Kind,
+ unsigned *Index = nullptr) const;
+
using iterator = const AttributeSet *;
iterator begin() const { return getTrailingObjects<AttributeSet>(); }
@@ -281,6 +318,9 @@ public:
void dump() const;
};
+static_assert(std::is_trivially_destructible<AttributeListImpl>::value,
+ "AttributeListImpl should be trivially destructible");
+
} // end namespace llvm
#endif // LLVM_LIB_IR_ATTRIBUTEIMPL_H
diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp
index 5ca99c981739..f67d96a854f4 100644
--- a/llvm/lib/IR/Attributes.cpp
+++ b/llvm/lib/IR/Attributes.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Function.h"
@@ -91,9 +92,9 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
// If we didn't find any existing attributes of the same shape then create a
// new one and insert it.
if (!Val)
- PA = new EnumAttributeImpl(Kind);
+ PA = new (pImpl->Alloc) EnumAttributeImpl(Kind);
else
- PA = new IntAttributeImpl(Kind, Val);
+ PA = new (pImpl->Alloc) IntAttributeImpl(Kind, Val);
pImpl->AttrsSet.InsertNode(PA, InsertPoint);
}
@@ -113,7 +114,10 @@ Attribute Attribute::get(LLVMContext &Context, StringRef Kind, StringRef Val) {
if (!PA) {
// If we didn't find any existing attributes of the same shape then create a
// new one and insert it.
- PA = new StringAttributeImpl(Kind, Val);
+ void *Mem =
+ pImpl->Alloc.Allocate(StringAttributeImpl::totalSizeToAlloc(Kind, Val),
+ alignof(StringAttributeImpl));
+ PA = new (Mem) StringAttributeImpl(Kind, Val);
pImpl->AttrsSet.InsertNode(PA, InsertPoint);
}
@@ -134,7 +138,7 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
if (!PA) {
// If we didn't find any existing attributes of the same shape then create a
// new one and insert it.
- PA = new TypeAttributeImpl(Kind, Ty);
+ PA = new (pImpl->Alloc) TypeAttributeImpl(Kind, Ty);
pImpl->AttrsSet.InsertNode(PA, InsertPoint);
}
@@ -143,7 +147,7 @@ Attribute Attribute::get(LLVMContext &Context, Attribute::AttrKind Kind,
}
Attribute Attribute::getWithAlignment(LLVMContext &Context, Align A) {
- assert(A <= 0x40000000 && "Alignment too large.");
+ assert(A <= llvm::Value::MaximumAlignment && "Alignment too large.");
return get(Context, Alignment, A.value());
}
@@ -168,6 +172,10 @@ Attribute Attribute::getWithByValType(LLVMContext &Context, Type *Ty) {
return get(Context, ByVal, Ty);
}
+Attribute Attribute::getWithPreallocatedType(LLVMContext &Context, Type *Ty) {
+ return get(Context, Preallocated, Ty);
+}
+
Attribute
Attribute::getWithAllocSizeArgs(LLVMContext &Context, unsigned ElemSizeArg,
const Optional<unsigned> &NumElemsArg) {
@@ -176,6 +184,45 @@ Attribute::getWithAllocSizeArgs(LLVMContext &Context, unsigned ElemSizeArg,
return get(Context, AllocSize, packAllocSizeArgs(ElemSizeArg, NumElemsArg));
}
+Attribute::AttrKind Attribute::getAttrKindFromName(StringRef AttrName) {
+ return StringSwitch<Attribute::AttrKind>(AttrName)
+#define GET_ATTR_NAMES
+#define ATTRIBUTE_ENUM(ENUM_NAME, DISPLAY_NAME) \
+ .Case(#DISPLAY_NAME, Attribute::ENUM_NAME)
+#include "llvm/IR/Attributes.inc"
+ .Default(Attribute::None);
+}
+
+StringRef Attribute::getNameFromAttrKind(Attribute::AttrKind AttrKind) {
+ switch (AttrKind) {
+#define GET_ATTR_NAMES
+#define ATTRIBUTE_ENUM(ENUM_NAME, DISPLAY_NAME) \
+ case Attribute::ENUM_NAME: \
+ return #DISPLAY_NAME;
+#include "llvm/IR/Attributes.inc"
+ case Attribute::None:
+ return "none";
+ default:
+ llvm_unreachable("invalid Kind");
+ }
+}
+
+bool Attribute::doesAttrKindHaveArgument(Attribute::AttrKind AttrKind) {
+ return AttrKind == Attribute::Alignment ||
+ AttrKind == Attribute::StackAlignment ||
+ AttrKind == Attribute::Dereferenceable ||
+ AttrKind == Attribute::AllocSize ||
+ AttrKind == Attribute::DereferenceableOrNull;
+}
+
+bool Attribute::isExistingAttribute(StringRef Name) {
+ return StringSwitch<bool>(Name)
+#define GET_ATTR_NAMES
+#define ATTRIBUTE_ALL(ENUM_NAME, DISPLAY_NAME) .Case(#DISPLAY_NAME, true)
+#include "llvm/IR/Attributes.inc"
+ .Default(false);
+}
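A hedged usage sketch for the new name/kind helpers; the expected results follow the ATTRIBUTE_* tables generated into Attributes.inc.

    #include "llvm/IR/Attributes.h"
    #include <cassert>

    void attributeNameRoundTrip() {
      using llvm::Attribute;
      Attribute::AttrKind K = Attribute::getAttrKindFromName("noinline");
      assert(K == Attribute::NoInline);
      assert(Attribute::getNameFromAttrKind(K) == "noinline");
      assert(Attribute::isExistingAttribute("noinline"));
      // noinline carries no integer payload, unlike align or dereferenceable.
      assert(!Attribute::doesAttrKindHaveArgument(K));
      (void)K; // silence -Wunused in NDEBUG builds
    }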
+
//===----------------------------------------------------------------------===//
// Attribute Accessor Methods
//===----------------------------------------------------------------------===//
@@ -328,6 +375,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "noinline";
if (hasAttribute(Attribute::NonLazyBind))
return "nonlazybind";
+ if (hasAttribute(Attribute::NoMerge))
+ return "nomerge";
if (hasAttribute(Attribute::NonNull))
return "nonnull";
if (hasAttribute(Attribute::NoRedZone))
@@ -336,6 +385,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "noreturn";
if (hasAttribute(Attribute::NoSync))
return "nosync";
+ if (hasAttribute(Attribute::NullPointerIsValid))
+ return "null_pointer_is_valid";
if (hasAttribute(Attribute::WillReturn))
return "willreturn";
if (hasAttribute(Attribute::NoCfCheck))
@@ -392,6 +443,8 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return "cold";
if (hasAttribute(Attribute::ImmArg))
return "immarg";
+ if (hasAttribute(Attribute::NoUndef))
+ return "noundef";
if (hasAttribute(Attribute::ByVal)) {
std::string Result;
@@ -406,6 +459,17 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
return Result;
}
+ if (hasAttribute(Attribute::Preallocated)) {
+ std::string Result;
+ Result += "preallocated";
+ raw_string_ostream OS(Result);
+ Result += '(';
+ getValueAsType()->print(OS, false, true);
+ OS.flush();
+ Result += ')';
+ return Result;
+ }
+
// FIXME: These should be output like this:
//
// align=4
@@ -464,19 +528,19 @@ std::string Attribute::getAsString(bool InAttrGrp) const {
//
if (isStringAttribute()) {
std::string Result;
- Result += (Twine('"') + getKindAsString() + Twine('"')).str();
-
- std::string AttrVal = pImpl->getValueAsString();
- if (AttrVal.empty()) return Result;
-
- // Since some attribute strings contain special characters that cannot be
- // printable, those have to be escaped to make the attribute value printable
- // as is. e.g. "\01__gnu_mcount_nc"
{
raw_string_ostream OS(Result);
- OS << "=\"";
- printEscapedString(AttrVal, OS);
- OS << "\"";
+ OS << '"' << getKindAsString() << '"';
+
+ // Since some attribute strings contain special characters that cannot be
+ // printable, those have to be escaped to make the attribute value
+ // printable as is. e.g. "\01__gnu_mcount_nc"
+ const auto &AttrVal = pImpl->getValueAsString();
+ if (!AttrVal.empty()) {
+ OS << "=\"";
+ printEscapedString(AttrVal, OS);
+ OS << "\"";
+ }
}
return Result;
}
@@ -491,21 +555,14 @@ bool Attribute::operator<(Attribute A) const {
return *pImpl < *A.pImpl;
}
+void Attribute::Profile(FoldingSetNodeID &ID) const {
+ ID.AddPointer(pImpl);
+}
+
//===----------------------------------------------------------------------===//
// AttributeImpl Definition
//===----------------------------------------------------------------------===//
-// Pin the vtables to this file.
-AttributeImpl::~AttributeImpl() = default;
-
-void EnumAttributeImpl::anchor() {}
-
-void IntAttributeImpl::anchor() {}
-
-void StringAttributeImpl::anchor() {}
-
-void TypeAttributeImpl::anchor() {}
-
bool AttributeImpl::hasAttribute(Attribute::AttrKind A) const {
if (isStringAttribute()) return false;
return getKindAsEnum() == A;
@@ -542,6 +599,8 @@ Type *AttributeImpl::getValueAsType() const {
}
bool AttributeImpl::operator<(const AttributeImpl &AI) const {
+ if (this == &AI)
+ return false;
// This sorts the attributes with Attribute::AttrKinds coming first (sorted
// relative to their enum value) and then strings.
if (isEnumAttribute()) {
@@ -687,6 +746,10 @@ Type *AttributeSet::getByValType() const {
return SetNode ? SetNode->getByValType() : nullptr;
}
+Type *AttributeSet::getPreallocatedType() const {
+ return SetNode ? SetNode->getPreallocatedType() : nullptr;
+}
+
std::pair<unsigned, Optional<unsigned>> AttributeSet::getAllocSizeArgs() const {
return SetNode ? SetNode->getAllocSizeArgs()
: std::pair<unsigned, Optional<unsigned>>(0, 0);
@@ -721,30 +784,31 @@ AttributeSetNode::AttributeSetNode(ArrayRef<Attribute> Attrs)
// There's memory after the node where we can store the entries in.
llvm::copy(Attrs, getTrailingObjects<Attribute>());
- static_assert(Attribute::EndAttrKinds <=
- sizeof(AvailableAttrs) * CHAR_BIT,
- "Too many attributes");
-
for (const auto &I : *this) {
- if (!I.isStringAttribute()) {
- Attribute::AttrKind Kind = I.getKindAsEnum();
- AvailableAttrs[Kind / 8] |= 1ULL << (Kind % 8);
- }
+ if (I.isStringAttribute())
+ StringAttrs.insert({ I.getKindAsString(), I });
+ else
+ AvailableAttrs.addAttribute(I.getKindAsEnum());
}
}
AttributeSetNode *AttributeSetNode::get(LLVMContext &C,
ArrayRef<Attribute> Attrs) {
- if (Attrs.empty())
+ SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
+ llvm::sort(SortedAttrs);
+ return getSorted(C, SortedAttrs);
+}
+
+AttributeSetNode *AttributeSetNode::getSorted(LLVMContext &C,
+ ArrayRef<Attribute> SortedAttrs) {
+ if (SortedAttrs.empty())
return nullptr;
- // Otherwise, build a key to look up the existing attributes.
+ // Build a key to look up the existing attributes.
LLVMContextImpl *pImpl = C.pImpl;
FoldingSetNodeID ID;
- SmallVector<Attribute, 8> SortedAttrs(Attrs.begin(), Attrs.end());
- llvm::sort(SortedAttrs);
-
+ assert(llvm::is_sorted(SortedAttrs) && "Expected sorted attributes!");
for (const auto &Attr : SortedAttrs)
Attr.Profile(ID);
@@ -778,6 +842,9 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
case Attribute::ByVal:
Attr = Attribute::getWithByValType(C, B.getByValType());
break;
+ case Attribute::Preallocated:
+ Attr = Attribute::getWithPreallocatedType(C, B.getPreallocatedType());
+ break;
case Attribute::Alignment:
assert(B.getAlignment() && "Alignment must be set");
Attr = Attribute::getWithAlignment(C, *B.getAlignment());
@@ -809,72 +876,81 @@ AttributeSetNode *AttributeSetNode::get(LLVMContext &C, const AttrBuilder &B) {
for (const auto &TDA : B.td_attrs())
Attrs.emplace_back(Attribute::get(C, TDA.first, TDA.second));
- return get(C, Attrs);
+ return getSorted(C, Attrs);
}
bool AttributeSetNode::hasAttribute(StringRef Kind) const {
- for (const auto &I : *this)
- if (I.hasAttribute(Kind))
- return true;
- return false;
+ return StringAttrs.count(Kind);
+}
+
+Optional<Attribute>
+AttributeSetNode::findEnumAttribute(Attribute::AttrKind Kind) const {
+ // Do a quick presence check.
+ if (!hasAttribute(Kind))
+ return None;
+
+ // Attributes in a set are sorted by enum value, followed by string
+ // attributes. Binary search the one we want.
+ const Attribute *I =
+ std::lower_bound(begin(), end() - StringAttrs.size(), Kind,
+ [](Attribute A, Attribute::AttrKind Kind) {
+ return A.getKindAsEnum() < Kind;
+ });
+ assert(I != end() && I->hasAttribute(Kind) && "Presence check failed?");
+ return *I;
}
Attribute AttributeSetNode::getAttribute(Attribute::AttrKind Kind) const {
- if (hasAttribute(Kind)) {
- for (const auto &I : *this)
- if (I.hasAttribute(Kind))
- return I;
- }
+ if (auto A = findEnumAttribute(Kind))
+ return *A;
return {};
}
Attribute AttributeSetNode::getAttribute(StringRef Kind) const {
- for (const auto &I : *this)
- if (I.hasAttribute(Kind))
- return I;
- return {};
+ return StringAttrs.lookup(Kind);
}
MaybeAlign AttributeSetNode::getAlignment() const {
- for (const auto &I : *this)
- if (I.hasAttribute(Attribute::Alignment))
- return I.getAlignment();
+ if (auto A = findEnumAttribute(Attribute::Alignment))
+ return A->getAlignment();
return None;
}
MaybeAlign AttributeSetNode::getStackAlignment() const {
- for (const auto &I : *this)
- if (I.hasAttribute(Attribute::StackAlignment))
- return I.getStackAlignment();
+ if (auto A = findEnumAttribute(Attribute::StackAlignment))
+ return A->getStackAlignment();
return None;
}
Type *AttributeSetNode::getByValType() const {
+ if (auto A = findEnumAttribute(Attribute::ByVal))
+ return A->getValueAsType();
+ return 0;
+}
+
+Type *AttributeSetNode::getPreallocatedType() const {
for (const auto &I : *this)
- if (I.hasAttribute(Attribute::ByVal))
+ if (I.hasAttribute(Attribute::Preallocated))
return I.getValueAsType();
return 0;
}
uint64_t AttributeSetNode::getDereferenceableBytes() const {
- for (const auto &I : *this)
- if (I.hasAttribute(Attribute::Dereferenceable))
- return I.getDereferenceableBytes();
+ if (auto A = findEnumAttribute(Attribute::Dereferenceable))
+ return A->getDereferenceableBytes();
return 0;
}
uint64_t AttributeSetNode::getDereferenceableOrNullBytes() const {
- for (const auto &I : *this)
- if (I.hasAttribute(Attribute::DereferenceableOrNull))
- return I.getDereferenceableOrNullBytes();
+ if (auto A = findEnumAttribute(Attribute::DereferenceableOrNull))
+ return A->getDereferenceableOrNullBytes();
return 0;
}
std::pair<unsigned, Optional<unsigned>>
AttributeSetNode::getAllocSizeArgs() const {
- for (const auto &I : *this)
- if (I.hasAttribute(Attribute::AllocSize))
- return I.getAllocSizeArgs();
+ if (auto A = findEnumAttribute(Attribute::AllocSize))
+ return A->getAllocSizeArgs();
return std::make_pair(0, 0);
}
@@ -893,33 +969,30 @@ std::string AttributeSetNode::getAsString(bool InAttrGrp) const {
//===----------------------------------------------------------------------===//
/// Map from AttributeList index to the internal array index. Adding one happens
-/// to work, but it relies on unsigned integer wrapping. MSVC warns about
-/// unsigned wrapping in constexpr functions, so write out the conditional. LLVM
-/// folds it to add anyway.
+/// to work, because -1 wraps around to 0.
static constexpr unsigned attrIdxToArrayIdx(unsigned Index) {
- return Index == AttributeList::FunctionIndex ? 0 : Index + 1;
+ return Index + 1;
}
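A standalone restatement of the wrap-around this mapping relies on, assuming the usual AttributeList index constants (ReturnIndex = 0, FunctionIndex = ~0U, FirstArgIndex = 1); it mirrors the static_assert on FunctionIndex further down.

    constexpr unsigned ReturnIndex = 0U;    // stand-in for AttributeList::ReturnIndex
    constexpr unsigned FunctionIndex = ~0U; // stand-in for AttributeList::FunctionIndex
    constexpr unsigned FirstArgIndex = 1U;  // stand-in for AttributeList::FirstArgIndex

    constexpr unsigned toArrayIdx(unsigned Index) { return Index + 1; }

    static_assert(toArrayIdx(FunctionIndex) == 0, "function attrs live in slot 0");
    static_assert(toArrayIdx(ReturnIndex) == 1, "return attrs live in slot 1");
    static_assert(toArrayIdx(FirstArgIndex + 0) == 2, "parameter 0 lives in slot 2");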
-AttributeListImpl::AttributeListImpl(LLVMContext &C,
- ArrayRef<AttributeSet> Sets)
- : Context(C), NumAttrSets(Sets.size()) {
+AttributeListImpl::AttributeListImpl(ArrayRef<AttributeSet> Sets)
+ : NumAttrSets(Sets.size()) {
assert(!Sets.empty() && "pointless AttributeListImpl");
// There's memory after the node where we can store the entries in.
llvm::copy(Sets, getTrailingObjects<AttributeSet>());
- // Initialize AvailableFunctionAttrs summary bitset.
- static_assert(Attribute::EndAttrKinds <=
- sizeof(AvailableFunctionAttrs) * CHAR_BIT,
- "Too many attributes");
+ // Initialize AvailableFunctionAttrs and AvailableSomewhereAttrs
+ // summary bitsets.
static_assert(attrIdxToArrayIdx(AttributeList::FunctionIndex) == 0U,
"function should be stored in slot 0");
- for (const auto &I : Sets[0]) {
- if (!I.isStringAttribute()) {
- Attribute::AttrKind Kind = I.getKindAsEnum();
- AvailableFunctionAttrs[Kind / 8] |= 1ULL << (Kind % 8);
- }
- }
+ for (const auto &I : Sets[0])
+ if (!I.isStringAttribute())
+ AvailableFunctionAttrs.addAttribute(I.getKindAsEnum());
+
+ for (const auto &Set : Sets)
+ for (const auto &I : Set)
+ if (!I.isStringAttribute())
+ AvailableSomewhereAttrs.addAttribute(I.getKindAsEnum());
}
void AttributeListImpl::Profile(FoldingSetNodeID &ID) const {
@@ -932,6 +1005,24 @@ void AttributeListImpl::Profile(FoldingSetNodeID &ID,
ID.AddPointer(Set.SetNode);
}
+bool AttributeListImpl::hasAttrSomewhere(Attribute::AttrKind Kind,
+ unsigned *Index) const {
+ if (!AvailableSomewhereAttrs.hasAttribute(Kind))
+ return false;
+
+ if (Index) {
+ for (unsigned I = 0, E = NumAttrSets; I != E; ++I) {
+ if (begin()[I].hasAttribute(Kind)) {
+ *Index = I - 1;
+ break;
+ }
+ }
+ }
+
+ return true;
+}
+
+
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void AttributeListImpl::dump() const {
AttributeList(const_cast<AttributeListImpl *>(this)).dump();
@@ -958,9 +1049,10 @@ AttributeList AttributeList::getImpl(LLVMContext &C,
// create a new one and insert it.
if (!PA) {
// Coallocate entries after the AttributeListImpl itself.
- void *Mem = ::operator new(
- AttributeListImpl::totalSizeToAlloc<AttributeSet>(AttrSets.size()));
- PA = new (Mem) AttributeListImpl(C, AttrSets);
+ void *Mem = pImpl->Alloc.Allocate(
+ AttributeListImpl::totalSizeToAlloc<AttributeSet>(AttrSets.size()),
+ alignof(AttributeListImpl));
+ PA = new (Mem) AttributeListImpl(AttrSets);
pImpl->AttrsLists.InsertNode(PA, InsertPoint);
}
@@ -975,11 +1067,12 @@ AttributeList::get(LLVMContext &C,
if (Attrs.empty())
return {};
- assert(std::is_sorted(Attrs.begin(), Attrs.end(),
- [](const std::pair<unsigned, Attribute> &LHS,
- const std::pair<unsigned, Attribute> &RHS) {
- return LHS.first < RHS.first;
- }) && "Misordered Attributes list!");
+ assert(llvm::is_sorted(Attrs,
+ [](const std::pair<unsigned, Attribute> &LHS,
+ const std::pair<unsigned, Attribute> &RHS) {
+ return LHS.first < RHS.first;
+ }) &&
+ "Misordered Attributes list!");
assert(llvm::none_of(Attrs,
[](const std::pair<unsigned, Attribute> &Pair) {
return Pair.second.hasAttribute(Attribute::None);
@@ -1011,11 +1104,11 @@ AttributeList::get(LLVMContext &C,
if (Attrs.empty())
return {};
- assert(std::is_sorted(Attrs.begin(), Attrs.end(),
- [](const std::pair<unsigned, AttributeSet> &LHS,
- const std::pair<unsigned, AttributeSet> &RHS) {
- return LHS.first < RHS.first;
- }) &&
+ assert(llvm::is_sorted(Attrs,
+ [](const std::pair<unsigned, AttributeSet> &LHS,
+ const std::pair<unsigned, AttributeSet> &RHS) {
+ return LHS.first < RHS.first;
+ }) &&
"Misordered Attributes list!");
assert(llvm::none_of(Attrs,
[](const std::pair<unsigned, AttributeSet> &Pair) {
@@ -1096,6 +1189,17 @@ AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
}
AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
+ ArrayRef<Attribute::AttrKind> Kinds,
+ ArrayRef<uint64_t> Values) {
+ assert(Kinds.size() == Values.size() && "Mismatched attribute values.");
+ SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
+ auto VI = Values.begin();
+ for (const auto K : Kinds)
+ Attrs.emplace_back(Index, Attribute::get(C, K, *VI++));
+ return get(C, Attrs);
+}
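A minimal sketch of the new Kinds/Values overload with invented values; each kind is paired positionally with its integer payload, here yielding align 16 and dereferenceable(32) on the return value.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"

    llvm::AttributeList makeReturnAttrs(llvm::LLVMContext &C) {
      using namespace llvm;
      return AttributeList::get(C, AttributeList::ReturnIndex,
                                {Attribute::Alignment, Attribute::Dereferenceable},
                                {16, 32});
    }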
+
+AttributeList AttributeList::get(LLVMContext &C, unsigned Index,
ArrayRef<StringRef> Kinds) {
SmallVector<std::pair<unsigned, Attribute>, 8> Attrs;
for (const auto &K : Kinds)
@@ -1184,7 +1288,7 @@ AttributeList AttributeList::addAttributes(LLVMContext &C, unsigned Index,
AttributeList AttributeList::addParamAttribute(LLVMContext &C,
ArrayRef<unsigned> ArgNos,
Attribute A) const {
- assert(std::is_sorted(ArgNos.begin(), ArgNos.end()));
+ assert(llvm::is_sorted(ArgNos));
SmallVector<AttributeSet, 4> AttrSets(this->begin(), this->end());
unsigned MaxIndex = attrIdxToArrayIdx(ArgNos.back() + FirstArgIndex);
@@ -1284,8 +1388,6 @@ AttributeList::addAllocSizeAttr(LLVMContext &C, unsigned Index,
// AttributeList Accessor Methods
//===----------------------------------------------------------------------===//
-LLVMContext &AttributeList::getContext() const { return pImpl->getContext(); }
-
AttributeSet AttributeList::getParamAttributes(unsigned ArgNo) const {
return getAttributes(ArgNo + FirstArgIndex);
}
@@ -1326,17 +1428,7 @@ bool AttributeList::hasParamAttribute(unsigned ArgNo,
bool AttributeList::hasAttrSomewhere(Attribute::AttrKind Attr,
unsigned *Index) const {
- if (!pImpl) return false;
-
- for (unsigned I = index_begin(), E = index_end(); I != E; ++I) {
- if (hasAttribute(I, Attr)) {
- if (Index)
- *Index = I;
- return true;
- }
- }
-
- return false;
+ return pImpl && pImpl->hasAttrSomewhere(Attr, Index);
}
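A small sketch of the new fast path through the public wrapper; the index reported back uses the usual AttributeList numbering (FirstArgIndex + argument number), matching the I - 1 conversion in the implementation.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"

    bool nestIsOnParamZero(llvm::LLVMContext &C) {
      using namespace llvm;
      // Put nest on parameter 0, then ask whether it appears anywhere.
      AttributeList AL = AttributeList::get(C, AttributeList::FirstArgIndex + 0,
                                            {Attribute::Nest});
      unsigned Index = 0;
      bool Found = AL.hasAttrSomewhere(Attribute::Nest, &Index);
      return Found && Index == AttributeList::FirstArgIndex;
    }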
Attribute AttributeList::getAttribute(unsigned Index,
@@ -1360,6 +1452,10 @@ Type *AttributeList::getParamByValType(unsigned Index) const {
return getAttributes(Index+FirstArgIndex).getByValType();
}
+Type *AttributeList::getParamPreallocatedType(unsigned Index) const {
+ return getAttributes(Index + FirstArgIndex).getPreallocatedType();
+}
+
MaybeAlign AttributeList::getStackAlignment(unsigned Index) const {
return getAttributes(Index).getStackAlignment();
}
@@ -1441,12 +1537,12 @@ void AttrBuilder::clear() {
DerefBytes = DerefOrNullBytes = 0;
AllocSizeArgs = 0;
ByValType = nullptr;
+ PreallocatedType = nullptr;
}
AttrBuilder &AttrBuilder::addAttribute(Attribute::AttrKind Val) {
assert((unsigned)Val < Attribute::EndAttrKinds && "Attribute out of range!");
- assert(Val != Attribute::Alignment && Val != Attribute::StackAlignment &&
- Val != Attribute::Dereferenceable && Val != Attribute::AllocSize &&
+ assert(!Attribute::doesAttrKindHaveArgument(Val) &&
"Adding integer attribute without adding a value!");
Attrs[Val] = true;
return *this;
@@ -1467,6 +1563,8 @@ AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) {
StackAlignment = Attr.getStackAlignment();
else if (Kind == Attribute::ByVal)
ByValType = Attr.getValueAsType();
+ else if (Kind == Attribute::Preallocated)
+ PreallocatedType = Attr.getValueAsType();
else if (Kind == Attribute::Dereferenceable)
DerefBytes = Attr.getDereferenceableBytes();
else if (Kind == Attribute::DereferenceableOrNull)
@@ -1477,7 +1575,7 @@ AttrBuilder &AttrBuilder::addAttribute(Attribute Attr) {
}
AttrBuilder &AttrBuilder::addAttribute(StringRef A, StringRef V) {
- TargetDepAttrs[A] = V;
+ TargetDepAttrs[std::string(A)] = std::string(V);
return *this;
}
@@ -1491,6 +1589,8 @@ AttrBuilder &AttrBuilder::removeAttribute(Attribute::AttrKind Val) {
StackAlignment.reset();
else if (Val == Attribute::ByVal)
ByValType = nullptr;
+ else if (Val == Attribute::Preallocated)
+ PreallocatedType = nullptr;
else if (Val == Attribute::Dereferenceable)
DerefBytes = 0;
else if (Val == Attribute::DereferenceableOrNull)
@@ -1521,7 +1621,7 @@ AttrBuilder &AttrBuilder::addAlignmentAttr(MaybeAlign Align) {
if (!Align)
return *this;
- assert(*Align <= 0x40000000 && "Alignment too large.");
+ assert(*Align <= llvm::Value::MaximumAlignment && "Alignment too large.");
Attrs[Attribute::Alignment] = true;
Alignment = Align;
@@ -1579,6 +1679,12 @@ AttrBuilder &AttrBuilder::addByValAttr(Type *Ty) {
return *this;
}
+AttrBuilder &AttrBuilder::addPreallocatedAttr(Type *Ty) {
+ Attrs[Attribute::Preallocated] = true;
+ PreallocatedType = Ty;
+ return *this;
+}
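A short sketch of the new builder entry point, assuming AttrBuilder can still be default-constructed as it is elsewhere in this file.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Type.h"

    llvm::AttributeSet makePreallocatedParam(llvm::LLVMContext &C) {
      using namespace llvm;
      AttrBuilder B;
      // Records Attribute::Preallocated plus the pointee type, so the printer
      // above can emit "preallocated(i32)".
      B.addPreallocatedAttr(Type::getInt32Ty(C));
      return AttributeSet::get(C, B);
    }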
+
AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
// FIXME: What if both have alignments, but they don't match?!
if (!Alignment)
@@ -1599,9 +1705,12 @@ AttrBuilder &AttrBuilder::merge(const AttrBuilder &B) {
if (!ByValType)
ByValType = B.ByValType;
+ if (!PreallocatedType)
+ PreallocatedType = B.PreallocatedType;
+
Attrs |= B.Attrs;
- for (auto I : B.td_attrs())
+ for (const auto &I : B.td_attrs())
TargetDepAttrs[I.first] = I.second;
return *this;
@@ -1627,9 +1736,12 @@ AttrBuilder &AttrBuilder::remove(const AttrBuilder &B) {
if (B.ByValType)
ByValType = nullptr;
+ if (B.PreallocatedType)
+ PreallocatedType = nullptr;
+
Attrs &= ~B.Attrs;
- for (auto I : B.td_attrs())
+ for (const auto &I : B.td_attrs())
TargetDepAttrs.erase(I.first);
return *this;
@@ -1686,7 +1798,8 @@ bool AttrBuilder::operator==(const AttrBuilder &B) {
return false;
return Alignment == B.Alignment && StackAlignment == B.StackAlignment &&
- DerefBytes == B.DerefBytes && ByValType == B.ByValType;
+ DerefBytes == B.DerefBytes && ByValType == B.ByValType &&
+ PreallocatedType == B.PreallocatedType;
}
//===----------------------------------------------------------------------===//
@@ -1704,17 +1817,18 @@ AttrBuilder AttributeFuncs::typeIncompatible(Type *Ty) {
if (!Ty->isPointerTy())
// Attributes that only apply to pointers.
- Incompatible.addAttribute(Attribute::ByVal)
- .addAttribute(Attribute::Nest)
- .addAttribute(Attribute::NoAlias)
- .addAttribute(Attribute::NoCapture)
- .addAttribute(Attribute::NonNull)
- .addDereferenceableAttr(1) // the int here is ignored
- .addDereferenceableOrNullAttr(1) // the int here is ignored
- .addAttribute(Attribute::ReadNone)
- .addAttribute(Attribute::ReadOnly)
- .addAttribute(Attribute::StructRet)
- .addAttribute(Attribute::InAlloca);
+ Incompatible.addAttribute(Attribute::Nest)
+ .addAttribute(Attribute::NoAlias)
+ .addAttribute(Attribute::NoCapture)
+ .addAttribute(Attribute::NonNull)
+ .addDereferenceableAttr(1) // the int here is ignored
+ .addDereferenceableOrNullAttr(1) // the int here is ignored
+ .addAttribute(Attribute::ReadNone)
+ .addAttribute(Attribute::ReadOnly)
+ .addAttribute(Attribute::StructRet)
+ .addAttribute(Attribute::InAlloca)
+ .addPreallocatedAttr(Ty)
+ .addByValAttr(Ty);
return Incompatible;
}
@@ -1837,17 +1951,58 @@ adjustMinLegalVectorWidth(Function &Caller, const Function &Callee) {
}
}
-/// If the inlined function has "null-pointer-is-valid=true" attribute,
+/// If the inlined function has null_pointer_is_valid attribute,
/// set this attribute in the caller post inlining.
static void
adjustNullPointerValidAttr(Function &Caller, const Function &Callee) {
if (Callee.nullPointerIsDefined() && !Caller.nullPointerIsDefined()) {
- Caller.addFnAttr(Callee.getFnAttribute("null-pointer-is-valid"));
+ Caller.addFnAttr(Attribute::NullPointerIsValid);
}
}
+struct EnumAttr {
+ static bool isSet(const Function &Fn,
+ Attribute::AttrKind Kind) {
+ return Fn.hasFnAttribute(Kind);
+ }
+
+ static void set(Function &Fn,
+ Attribute::AttrKind Kind, bool Val) {
+ if (Val)
+ Fn.addFnAttr(Kind);
+ else
+ Fn.removeFnAttr(Kind);
+ }
+};
+
+struct StrBoolAttr {
+ static bool isSet(const Function &Fn,
+ StringRef Kind) {
+ auto A = Fn.getFnAttribute(Kind);
+ return A.getValueAsString().equals("true");
+ }
+
+ static void set(Function &Fn,
+ StringRef Kind, bool Val) {
+ Fn.addFnAttr(Kind, Val ? "true" : "false");
+ }
+};
+
+#define GET_ATTR_NAMES
+#define ATTRIBUTE_ENUM(ENUM_NAME, DISPLAY_NAME) \
+ struct ENUM_NAME##Attr : EnumAttr { \
+ static enum Attribute::AttrKind getKind() { \
+ return llvm::Attribute::ENUM_NAME; \
+ } \
+ };
+#define ATTRIBUTE_STRBOOL(ENUM_NAME, DISPLAY_NAME) \
+ struct ENUM_NAME##Attr : StrBoolAttr { \
+ static StringRef getKind() { return #DISPLAY_NAME; } \
+ };
+#include "llvm/IR/Attributes.inc"
+
#define GET_ATTR_COMPAT_FUNC
-#include "AttributesCompatFunc.inc"
+#include "llvm/IR/Attributes.inc"
bool AttributeFuncs::areInlineCompatible(const Function &Caller,
const Function &Callee) {
diff --git a/llvm/lib/IR/AttributesCompatFunc.td b/llvm/lib/IR/AttributesCompatFunc.td
deleted file mode 100644
index 7c85b3da9ab6..000000000000
--- a/llvm/lib/IR/AttributesCompatFunc.td
+++ /dev/null
@@ -1 +0,0 @@
-include "llvm/IR/Attributes.td"
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 6e2beeb839b6..1e8fdb506619 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -21,6 +21,7 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/IntrinsicsARM.h"
@@ -42,7 +43,7 @@ static bool UpgradePTESTIntrinsic(Function* F, Intrinsic::ID IID,
// Check whether this is an old version of the function, which received
// v4f32 arguments.
Type *Arg0Type = F->getFunctionType()->getParamType(0);
- if (Arg0Type != VectorType::get(Type::getFloatTy(F->getContext()), 4))
+ if (Arg0Type != FixedVectorType::get(Type::getFloatTy(F->getContext()), 4))
return false;
// Yes, it's old, replace it with new version.
@@ -99,7 +100,6 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("fma4.vfmadd.s") || // Added in 7.0
Name.startswith("fma.vfmadd.") || // Added in 7.0
Name.startswith("fma.vfmsub.") || // Added in 7.0
- Name.startswith("fma.vfmaddsub.") || // Added in 7.0
Name.startswith("fma.vfmsubadd.") || // Added in 7.0
Name.startswith("fma.vfnmadd.") || // Added in 7.0
Name.startswith("fma.vfnmsub.") || // Added in 7.0
@@ -205,6 +205,8 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx512.mask.cvtqq2pd.") || // Added in 7.0 updated 9.0
Name.startswith("avx512.mask.cvtuqq2pd.") || // Added in 7.0 updated 9.0
Name.startswith("avx512.mask.cvtdq2ps.") || // Added in 7.0 updated 9.0
+ Name == "avx512.mask.vcvtph2ps.128" || // Added in 11.0
+ Name == "avx512.mask.vcvtph2ps.256" || // Added in 11.0
Name == "avx512.mask.cvtqq2ps.256" || // Added in 9.0
Name == "avx512.mask.cvtqq2ps.512" || // Added in 9.0
Name == "avx512.mask.cvtuqq2ps.256" || // Added in 9.0
@@ -317,6 +319,7 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name == "avx.cvtdq2.pd.256" || // Added in 3.9
Name == "avx.cvtdq2.ps.256" || // Added in 7.0
Name == "avx.cvt.ps2.pd.256" || // Added in 3.9
+ Name.startswith("vcvtph2ps.") || // Added in 11.0
Name.startswith("avx.vinsertf128.") || // Added in 3.7
Name == "avx2.vinserti128" || // Added in 3.7
Name.startswith("avx512.mask.insert") || // Added in 4.0
@@ -372,8 +375,14 @@ static bool ShouldUpgradeX86Intrinsic(Function *F, StringRef Name) {
Name.startswith("avx2.pblendd.") || // Added in 3.7
Name.startswith("avx.vbroadcastf128") || // Added in 4.0
Name == "avx2.vbroadcasti128" || // Added in 3.7
- Name.startswith("avx512.mask.broadcastf") || // Added in 6.0
- Name.startswith("avx512.mask.broadcasti") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcastf32x4.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcastf64x2.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcastf32x8.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcastf64x4.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcasti32x4.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcasti64x2.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcasti32x8.") || // Added in 6.0
+ Name.startswith("avx512.mask.broadcasti64x4.") || // Added in 6.0
Name == "xop.vpcmov" || // Added in 3.8
Name == "xop.vpcmov.256" || // Added in 5.0
Name.startswith("avx512.mask.move.s") || // Added in 4.0
@@ -891,11 +900,11 @@ GlobalVariable *llvm::UpgradeGlobalVariable(GlobalVariable *GV) {
// to byte shuffles.
static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder,
Value *Op, unsigned Shift) {
- Type *ResultTy = Op->getType();
- unsigned NumElts = ResultTy->getVectorNumElements() * 8;
+ auto *ResultTy = cast<VectorType>(Op->getType());
+ unsigned NumElts = ResultTy->getNumElements() * 8;
// Bitcast from a 64-bit element type to a byte element type.
- Type *VecTy = VectorType::get(Builder.getInt8Ty(), NumElts);
+ Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), NumElts);
Op = Builder.CreateBitCast(Op, VecTy, "cast");
// We'll be shuffling in zeroes.
@@ -904,7 +913,7 @@ static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder,
// If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
// we'll just return the zero vector.
if (Shift < 16) {
- uint32_t Idxs[64];
+ int Idxs[64];
// 256/512-bit version is split into 2/4 16-byte lanes.
for (unsigned l = 0; l != NumElts; l += 16)
for (unsigned i = 0; i != 16; ++i) {
@@ -925,11 +934,11 @@ static Value *UpgradeX86PSLLDQIntrinsics(IRBuilder<> &Builder,
// to byte shuffles.
static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op,
unsigned Shift) {
- Type *ResultTy = Op->getType();
- unsigned NumElts = ResultTy->getVectorNumElements() * 8;
+ auto *ResultTy = cast<VectorType>(Op->getType());
+ unsigned NumElts = ResultTy->getNumElements() * 8;
// Bitcast from a 64-bit element type to a byte element type.
- Type *VecTy = VectorType::get(Builder.getInt8Ty(), NumElts);
+ Type *VecTy = FixedVectorType::get(Builder.getInt8Ty(), NumElts);
Op = Builder.CreateBitCast(Op, VecTy, "cast");
// We'll be shuffling in zeroes.
@@ -938,7 +947,7 @@ static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op,
// If shift is less than 16, emit a shuffle to move the bytes. Otherwise,
// we'll just return the zero vector.
if (Shift < 16) {
- uint32_t Idxs[64];
+ int Idxs[64];
// 256/512-bit version is split into 2/4 16-byte lanes.
for (unsigned l = 0; l != NumElts; l += 16)
for (unsigned i = 0; i != 16; ++i) {
@@ -957,14 +966,14 @@ static Value *UpgradeX86PSRLDQIntrinsics(IRBuilder<> &Builder, Value *Op,
static Value *getX86MaskVec(IRBuilder<> &Builder, Value *Mask,
unsigned NumElts) {
- llvm::VectorType *MaskTy = llvm::VectorType::get(Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
+ llvm::VectorType *MaskTy = FixedVectorType::get(
+ Builder.getInt1Ty(), cast<IntegerType>(Mask->getType())->getBitWidth());
Mask = Builder.CreateBitCast(Mask, MaskTy);
// If we have less than 8 elements, then the starting mask was an i8 and
// we need to extract down to the right number of elements.
if (NumElts < 8) {
- uint32_t Indices[4];
+ int Indices[4];
for (unsigned i = 0; i != NumElts; ++i)
Indices[i] = i;
Mask = Builder.CreateShuffleVector(Mask, Mask,
@@ -982,7 +991,8 @@ static Value *EmitX86Select(IRBuilder<> &Builder, Value *Mask,
if (C->isAllOnesValue())
return Op0;
- Mask = getX86MaskVec(Builder, Mask, Op0->getType()->getVectorNumElements());
+ Mask = getX86MaskVec(Builder, Mask,
+ cast<VectorType>(Op0->getType())->getNumElements());
return Builder.CreateSelect(Mask, Op0, Op1);
}
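The same two replacements recur throughout the rest of this file; a condensed sketch of the new pattern (the helper names here are invented):

    #include "llvm/IR/DerivedTypes.h"
    #include "llvm/IR/LLVMContext.h"
    #include "llvm/IR/Value.h"
    #include "llvm/Support/Casting.h"

    // Element counts are read off an explicit VectorType cast instead of going
    // through the old Type vector helpers, and fixed-width vector types are
    // created with FixedVectorType::get().
    unsigned numElements(llvm::Value *V) {
      return llvm::cast<llvm::VectorType>(V->getType())->getNumElements();
    }

    llvm::VectorType *maskType(llvm::LLVMContext &C, unsigned NumElts) {
      return llvm::FixedVectorType::get(llvm::Type::getInt1Ty(C), NumElts);
    }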
@@ -993,9 +1003,8 @@ static Value *EmitX86ScalarSelect(IRBuilder<> &Builder, Value *Mask,
if (C->isAllOnesValue())
return Op0;
- llvm::VectorType *MaskTy =
- llvm::VectorType::get(Builder.getInt1Ty(),
- Mask->getType()->getIntegerBitWidth());
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(),
+ Mask->getType()->getIntegerBitWidth());
Mask = Builder.CreateBitCast(Mask, MaskTy);
Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
return Builder.CreateSelect(Mask, Op0, Op1);
@@ -1010,7 +1019,7 @@ static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0,
bool IsVALIGN) {
unsigned ShiftVal = cast<llvm::ConstantInt>(Shift)->getZExtValue();
- unsigned NumElts = Op0->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Op0->getType())->getNumElements();
assert((IsVALIGN || NumElts % 16 == 0) && "Illegal NumElts for PALIGNR!");
assert((!IsVALIGN || NumElts <= 16) && "NumElts too large for VALIGN!");
assert(isPowerOf2_32(NumElts) && "NumElts not a power of 2!");
@@ -1032,7 +1041,7 @@ static Value *UpgradeX86ALIGNIntrinsics(IRBuilder<> &Builder, Value *Op0,
Op0 = llvm::Constant::getNullValue(Op0->getType());
}
- uint32_t Indices[64];
+ int Indices[64];
// 256-bit palignr operates on 128-bit lanes so we need to handle that
for (unsigned l = 0; l < NumElts; l += 16) {
for (unsigned i = 0; i != 16; ++i) {
@@ -1141,7 +1150,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallInst &CI,
// Funnel shifts amounts are treated as modulo and types are all power-of-2 so
// we only care about the lowest log2 bits anyway.
if (Amt->getType() != Ty) {
- unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Ty)->getNumElements();
Amt = Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
Amt = Builder.CreateVectorSplat(NumElts, Amt);
}
@@ -1211,7 +1220,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallInst &CI,
// Funnel shifts amounts are treated as modulo and types are all power-of-2 so
// we only care about the lowest log2 bits anyway.
if (Amt->getType() != Ty) {
- unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Ty)->getNumElements();
Amt = Builder.CreateIntCast(Amt, Ty->getScalarType(), false);
Amt = Builder.CreateVectorSplat(NumElts, Amt);
}
@@ -1237,18 +1246,20 @@ static Value *UpgradeMaskedStore(IRBuilder<> &Builder,
// Cast the pointer to the right type.
Ptr = Builder.CreateBitCast(Ptr,
llvm::PointerType::getUnqual(Data->getType()));
- unsigned Align =
- Aligned ? cast<VectorType>(Data->getType())->getBitWidth() / 8 : 1;
+ const Align Alignment =
+ Aligned
+ ? Align(Data->getType()->getPrimitiveSizeInBits().getFixedSize() / 8)
+ : Align(1);
// If the mask is all ones just emit a regular store.
if (const auto *C = dyn_cast<Constant>(Mask))
if (C->isAllOnesValue())
- return Builder.CreateAlignedStore(Data, Ptr, Align);
+ return Builder.CreateAlignedStore(Data, Ptr, Alignment);
// Convert the mask from an integer type to a vector of i1.
- unsigned NumElts = Data->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Data->getType())->getNumElements();
Mask = getX86MaskVec(Builder, Mask, NumElts);
- return Builder.CreateMaskedStore(Data, Ptr, Align, Mask);
+ return Builder.CreateMaskedStore(Data, Ptr, Alignment, Mask);
}
static Value *UpgradeMaskedLoad(IRBuilder<> &Builder,
@@ -1257,18 +1268,21 @@ static Value *UpgradeMaskedLoad(IRBuilder<> &Builder,
Type *ValTy = Passthru->getType();
// Cast the pointer to the right type.
Ptr = Builder.CreateBitCast(Ptr, llvm::PointerType::getUnqual(ValTy));
- unsigned Align =
- Aligned ? cast<VectorType>(Passthru->getType())->getBitWidth() / 8 : 1;
+ const Align Alignment =
+ Aligned
+ ? Align(Passthru->getType()->getPrimitiveSizeInBits().getFixedSize() /
+ 8)
+ : Align(1);
// If the mask is all ones just emit a regular store.
if (const auto *C = dyn_cast<Constant>(Mask))
if (C->isAllOnesValue())
- return Builder.CreateAlignedLoad(ValTy, Ptr, Align);
+ return Builder.CreateAlignedLoad(ValTy, Ptr, Alignment);
// Convert the mask from an integer type to a vector of i1.
- unsigned NumElts = Passthru->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Passthru->getType())->getNumElements();
Mask = getX86MaskVec(Builder, Mask, NumElts);
- return Builder.CreateMaskedLoad(Ptr, Align, Mask, Passthru);
+ return Builder.CreateMaskedLoad(Ptr, Alignment, Mask, Passthru);
}
static Value *upgradeAbs(IRBuilder<> &Builder, CallInst &CI) {
@@ -1330,7 +1344,7 @@ static Value *upgradePMULDQ(IRBuilder<> &Builder, CallInst &CI, bool IsSigned) {
// Applying mask on vector of i1's and make sure result is at least 8 bits wide.
static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec,
Value *Mask) {
- unsigned NumElts = Vec->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Vec->getType())->getNumElements();
if (Mask) {
const auto *C = dyn_cast<Constant>(Mask);
if (!C || !C->isAllOnesValue())
@@ -1338,7 +1352,7 @@ static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec,
}
if (NumElts < 8) {
- uint32_t Indices[8];
+ int Indices[8];
for (unsigned i = 0; i != NumElts; ++i)
Indices[i] = i;
for (unsigned i = NumElts; i != 8; ++i)
@@ -1353,13 +1367,15 @@ static Value *ApplyX86MaskOn1BitsVec(IRBuilder<> &Builder, Value *Vec,
static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallInst &CI,
unsigned CC, bool Signed) {
Value *Op0 = CI.getArgOperand(0);
- unsigned NumElts = Op0->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(Op0->getType())->getNumElements();
Value *Cmp;
if (CC == 3) {
- Cmp = Constant::getNullValue(llvm::VectorType::get(Builder.getInt1Ty(), NumElts));
+ Cmp = Constant::getNullValue(
+ FixedVectorType::get(Builder.getInt1Ty(), NumElts));
} else if (CC == 7) {
- Cmp = Constant::getAllOnesValue(llvm::VectorType::get(Builder.getInt1Ty(), NumElts));
+ Cmp = Constant::getAllOnesValue(
+ FixedVectorType::get(Builder.getInt1Ty(), NumElts));
} else {
ICmpInst::Predicate Pred;
switch (CC) {
@@ -1406,7 +1422,7 @@ static Value* upgradeMaskedMove(IRBuilder<> &Builder, CallInst &CI) {
static Value* UpgradeMaskToInt(IRBuilder<> &Builder, CallInst &CI) {
Value* Op = CI.getArgOperand(0);
Type* ReturnOp = CI.getType();
- unsigned NumElts = CI.getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(CI.getType())->getNumElements();
Value *Mask = getX86MaskVec(Builder, Op, NumElts);
return Builder.CreateSExt(Mask, ReturnOp, "vpmovm2");
}
@@ -1705,7 +1721,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Extract =
Builder.CreateExtractElement(Arg1, (uint64_t)0, "extractelement");
- StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, 1);
+ StoreInst *SI = Builder.CreateAlignedStore(Extract, Addr, Align(1));
SI->setMetadata(M->getMDKindID("nontemporal"), Node);
// Remove intrinsic.
@@ -1728,9 +1744,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *BC = Builder.CreateBitCast(Arg0,
PointerType::getUnqual(Arg1->getType()),
"cast");
- VectorType *VTy = cast<VectorType>(Arg1->getType());
- StoreInst *SI = Builder.CreateAlignedStore(Arg1, BC,
- VTy->getBitWidth() / 8);
+ StoreInst *SI = Builder.CreateAlignedStore(
+ Arg1, BC,
+ Align(Arg1->getType()->getPrimitiveSizeInBits().getFixedSize() / 8));
SI->setMetadata(M->getMDKindID("nontemporal"), Node);
// Remove intrinsic.
@@ -1742,13 +1758,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Arg0 = CI->getArgOperand(0);
Value *Arg1 = CI->getArgOperand(1);
- Type *NewVecTy = VectorType::get(Type::getInt64Ty(C), 2);
+ auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2);
Value *BC0 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
Value *Elt = Builder.CreateExtractElement(BC0, (uint64_t)0);
Value *BC = Builder.CreateBitCast(Arg0,
PointerType::getUnqual(Elt->getType()),
"cast");
- Builder.CreateAlignedStore(Elt, BC, 1);
+ Builder.CreateAlignedStore(Elt, BC, Align(1));
// Remove intrinsic.
CI->eraseFromParent();
@@ -1764,7 +1780,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Arg0 = Builder.CreateBitCast(Arg0,
PointerType::getUnqual(Arg1->getType()),
"cast");
- Builder.CreateAlignedStore(Arg1, Arg0, 1);
+ Builder.CreateAlignedStore(Arg1, Arg0, Align(1));
// Remove intrinsic.
CI->eraseFromParent();
@@ -1856,7 +1872,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = ApplyX86MaskOn1BitsVec(Builder, Rep, Mask);
} else if (IsX86 && (Name.startswith("avx512.mask.pbroadcast"))){
unsigned NumElts =
- CI->getArgOperand(1)->getType()->getVectorNumElements();
+ cast<VectorType>(CI->getArgOperand(1)->getType())->getNumElements();
Rep = Builder.CreateVectorSplat(NumElts, CI->getArgOperand(0));
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
@@ -1864,7 +1880,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
unsigned NumElts = CI->getType()->getScalarSizeInBits();
Value *LHS = getX86MaskVec(Builder, CI->getArgOperand(0), NumElts);
Value *RHS = getX86MaskVec(Builder, CI->getArgOperand(1), NumElts);
- uint32_t Indices[64];
+ int Indices[64];
for (unsigned i = 0; i != NumElts; ++i)
Indices[i] = i;
@@ -2074,16 +2090,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name == "sse2.cvtsi2sd" ||
Name == "sse.cvtsi642ss" ||
Name == "sse2.cvtsi642sd")) {
- Rep = Builder.CreateSIToFP(CI->getArgOperand(1),
- CI->getType()->getVectorElementType());
+ Rep = Builder.CreateSIToFP(
+ CI->getArgOperand(1),
+ cast<VectorType>(CI->getType())->getElementType());
Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
} else if (IsX86 && Name == "avx512.cvtusi2sd") {
- Rep = Builder.CreateUIToFP(CI->getArgOperand(1),
- CI->getType()->getVectorElementType());
+ Rep = Builder.CreateUIToFP(
+ CI->getArgOperand(1),
+ cast<VectorType>(CI->getType())->getElementType());
Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
} else if (IsX86 && Name == "sse2.cvtss2sd") {
Rep = Builder.CreateExtractElement(CI->getArgOperand(1), (uint64_t)0);
- Rep = Builder.CreateFPExt(Rep, CI->getType()->getVectorElementType());
+ Rep = Builder.CreateFPExt(
+ Rep, cast<VectorType>(CI->getType())->getElementType());
Rep = Builder.CreateInsertElement(CI->getArgOperand(0), Rep, (uint64_t)0);
} else if (IsX86 && (Name == "sse2.cvtdq2pd" ||
Name == "sse2.cvtdq2ps" ||
@@ -2103,18 +2122,17 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name == "avx.cvt.ps2.pd.256" ||
Name == "avx512.mask.cvtps2pd.128" ||
Name == "avx512.mask.cvtps2pd.256")) {
- Type *DstTy = CI->getType();
+ auto *DstTy = cast<VectorType>(CI->getType());
Rep = CI->getArgOperand(0);
- Type *SrcTy = Rep->getType();
+ auto *SrcTy = cast<VectorType>(Rep->getType());
- unsigned NumDstElts = DstTy->getVectorNumElements();
- if (NumDstElts < SrcTy->getVectorNumElements()) {
+ unsigned NumDstElts = DstTy->getNumElements();
+ if (NumDstElts < SrcTy->getNumElements()) {
assert(NumDstElts == 2 && "Unexpected vector size");
- uint32_t ShuffleMask[2] = { 0, 1 };
- Rep = Builder.CreateShuffleVector(Rep, Rep, ShuffleMask);
+ Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef<int>{0, 1});
}
- bool IsPS2PD = SrcTy->getVectorElementType()->isFloatTy();
+ bool IsPS2PD = SrcTy->getElementType()->isFloatTy();
bool IsUnsigned = (StringRef::npos != Name.find("cvtu"));
if (IsPS2PD)
Rep = Builder.CreateFPExt(Rep, DstTy, "cvtps2pd");
@@ -2134,6 +2152,22 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
if (CI->getNumArgOperands() >= 3)
Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
CI->getArgOperand(1));
+ } else if (IsX86 && (Name.startswith("avx512.mask.vcvtph2ps.") ||
+ Name.startswith("vcvtph2ps."))) {
+ auto *DstTy = cast<VectorType>(CI->getType());
+ Rep = CI->getArgOperand(0);
+ auto *SrcTy = cast<VectorType>(Rep->getType());
+ unsigned NumDstElts = DstTy->getNumElements();
+ if (NumDstElts != SrcTy->getNumElements()) {
+ assert(NumDstElts == 4 && "Unexpected vector size");
+ Rep = Builder.CreateShuffleVector(Rep, Rep, ArrayRef<int>{0, 1, 2, 3});
+ }
+ Rep = Builder.CreateBitCast(
+ Rep, FixedVectorType::get(Type::getHalfTy(C), NumDstElts));
+ Rep = Builder.CreateFPExt(Rep, DstTy, "cvtph2ps");
+ if (CI->getNumArgOperands() >= 3)
+ Rep = EmitX86Select(Builder, CI->getArgOperand(2), Rep,
+ CI->getArgOperand(1));
} else if (IsX86 && (Name.startswith("avx512.mask.loadu."))) {
Rep = UpgradeMaskedLoad(Builder, CI->getArgOperand(0),
CI->getArgOperand(1), CI->getArgOperand(2),
@@ -2143,30 +2177,30 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->getArgOperand(1),CI->getArgOperand(2),
/*Aligned*/true);
} else if (IsX86 && Name.startswith("avx512.mask.expand.load.")) {
- Type *ResultTy = CI->getType();
- Type *PtrTy = ResultTy->getVectorElementType();
+ auto *ResultTy = cast<VectorType>(CI->getType());
+ Type *PtrTy = ResultTy->getElementType();
// Cast the pointer to element type.
Value *Ptr = Builder.CreateBitCast(CI->getOperand(0),
llvm::PointerType::getUnqual(PtrTy));
Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2),
- ResultTy->getVectorNumElements());
+ ResultTy->getNumElements());
Function *ELd = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::masked_expandload,
ResultTy);
Rep = Builder.CreateCall(ELd, { Ptr, MaskVec, CI->getOperand(1) });
} else if (IsX86 && Name.startswith("avx512.mask.compress.store.")) {
- Type *ResultTy = CI->getArgOperand(1)->getType();
- Type *PtrTy = ResultTy->getVectorElementType();
+ auto *ResultTy = cast<VectorType>(CI->getArgOperand(1)->getType());
+ Type *PtrTy = ResultTy->getElementType();
// Cast the pointer to element type.
Value *Ptr = Builder.CreateBitCast(CI->getOperand(0),
llvm::PointerType::getUnqual(PtrTy));
Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2),
- ResultTy->getVectorNumElements());
+ ResultTy->getNumElements());
Function *CSt = Intrinsic::getDeclaration(F->getParent(),
Intrinsic::masked_compressstore,
@@ -2174,10 +2208,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Rep = Builder.CreateCall(CSt, { CI->getArgOperand(1), Ptr, MaskVec });
} else if (IsX86 && (Name.startswith("avx512.mask.compress.") ||
Name.startswith("avx512.mask.expand."))) {
- Type *ResultTy = CI->getType();
+ auto *ResultTy = cast<VectorType>(CI->getType());
Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2),
- ResultTy->getVectorNumElements());
+ ResultTy->getNumElements());
bool IsCompress = Name[12] == 'c';
Intrinsic::ID IID = IsCompress ? Intrinsic::x86_avx512_mask_compress
@@ -2254,9 +2288,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
} else if (IsX86 && (Name.startswith("avx.vbroadcast.s") ||
Name.startswith("avx512.vbroadcast.s"))) {
// Replace broadcasts with a series of insertelements.
- Type *VecTy = CI->getType();
- Type *EltTy = VecTy->getVectorElementType();
- unsigned EltNum = VecTy->getVectorNumElements();
+ auto *VecTy = cast<VectorType>(CI->getType());
+ Type *EltTy = VecTy->getElementType();
+ unsigned EltNum = VecTy->getNumElements();
Value *Cast = Builder.CreateBitCast(CI->getArgOperand(0),
EltTy->getPointerTo());
Value *Load = Builder.CreateLoad(EltTy, Cast);
@@ -2276,7 +2310,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
unsigned NumDstElts = DstTy->getNumElements();
// Extract a subvector of the first NumDstElts lanes and sign/zero extend.
- SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
+ SmallVector<int, 8> ShuffleMask(NumDstElts);
for (unsigned i = 0; i != NumDstElts; ++i)
ShuffleMask[i] = i;
@@ -2301,18 +2335,19 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
} else if (IsX86 && (Name.startswith("avx.vbroadcastf128") ||
Name == "avx2.vbroadcasti128")) {
// Replace vbroadcastf128/vbroadcasti128 with a vector load+shuffle.
- Type *EltTy = CI->getType()->getVectorElementType();
+ Type *EltTy = cast<VectorType>(CI->getType())->getElementType();
unsigned NumSrcElts = 128 / EltTy->getPrimitiveSizeInBits();
- Type *VT = VectorType::get(EltTy, NumSrcElts);
+ auto *VT = FixedVectorType::get(EltTy, NumSrcElts);
Value *Op = Builder.CreatePointerCast(CI->getArgOperand(0),
PointerType::getUnqual(VT));
- Value *Load = Builder.CreateAlignedLoad(VT, Op, 1);
+ Value *Load = Builder.CreateAlignedLoad(VT, Op, Align(1));
if (NumSrcElts == 2)
- Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
- { 0, 1, 0, 1 });
+ Rep = Builder.CreateShuffleVector(
+ Load, UndefValue::get(Load->getType()), ArrayRef<int>{0, 1, 0, 1});
else
- Rep = Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
- { 0, 1, 2, 3, 0, 1, 2, 3 });
+ Rep =
+ Builder.CreateShuffleVector(Load, UndefValue::get(Load->getType()),
+ ArrayRef<int>{0, 1, 2, 3, 0, 1, 2, 3});
} else if (IsX86 && (Name.startswith("avx512.mask.shuf.i") ||
Name.startswith("avx512.mask.shuf.f"))) {
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
@@ -2321,7 +2356,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
unsigned NumElementsInLane = 128 / VT->getScalarSizeInBits();
unsigned ControlBitsMask = NumLanes - 1;
unsigned NumControlBits = NumLanes / 2;
- SmallVector<uint32_t, 8> ShuffleMask(0);
+ SmallVector<int, 8> ShuffleMask(0);
for (unsigned l = 0; l != NumLanes; ++l) {
unsigned LaneMask = (Imm >> (l * NumControlBits)) & ControlBitsMask;
@@ -2338,10 +2373,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
}else if (IsX86 && (Name.startswith("avx512.mask.broadcastf") ||
Name.startswith("avx512.mask.broadcasti"))) {
unsigned NumSrcElts =
- CI->getArgOperand(0)->getType()->getVectorNumElements();
- unsigned NumDstElts = CI->getType()->getVectorNumElements();
+ cast<VectorType>(CI->getArgOperand(0)->getType())->getNumElements();
+ unsigned NumDstElts = cast<VectorType>(CI->getType())->getNumElements();
- SmallVector<uint32_t, 8> ShuffleMask(NumDstElts);
+ SmallVector<int, 8> ShuffleMask(NumDstElts);
for (unsigned i = 0; i != NumDstElts; ++i)
ShuffleMask[i] = i % NumSrcElts;
@@ -2356,8 +2391,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.broadcast.s"))) {
// Replace vp?broadcasts with a vector shuffle.
Value *Op = CI->getArgOperand(0);
- unsigned NumElts = CI->getType()->getVectorNumElements();
- Type *MaskTy = VectorType::get(Type::getInt32Ty(C), NumElts);
+ ElementCount EC = cast<VectorType>(CI->getType())->getElementCount();
+ Type *MaskTy = VectorType::get(Type::getInt32Ty(C), EC);
Rep = Builder.CreateShuffleVector(Op, UndefValue::get(Op->getType()),
Constant::getNullValue(MaskTy));
@@ -2431,7 +2466,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
VectorType *VecTy = cast<VectorType>(CI->getType());
unsigned NumElts = VecTy->getNumElements();
- SmallVector<uint32_t, 16> Idxs(NumElts);
+ SmallVector<int, 16> Idxs(NumElts);
for (unsigned i = 0; i != NumElts; ++i)
Idxs[i] = ((Imm >> (i%8)) & 1) ? i + NumElts : i;
@@ -2442,8 +2477,8 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
- unsigned DstNumElts = CI->getType()->getVectorNumElements();
- unsigned SrcNumElts = Op1->getType()->getVectorNumElements();
+ unsigned DstNumElts = cast<VectorType>(CI->getType())->getNumElements();
+ unsigned SrcNumElts = cast<VectorType>(Op1->getType())->getNumElements();
unsigned Scale = DstNumElts / SrcNumElts;
// Mask off the high bits of the immediate value; hardware ignores those.
@@ -2451,7 +2486,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Extend the second operand into a vector the size of the destination.
Value *UndefV = UndefValue::get(Op1->getType());
- SmallVector<uint32_t, 8> Idxs(DstNumElts);
+ SmallVector<int, 8> Idxs(DstNumElts);
for (unsigned i = 0; i != SrcNumElts; ++i)
Idxs[i] = i;
for (unsigned i = SrcNumElts; i != DstNumElts; ++i)
@@ -2486,15 +2521,15 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.vextract"))) {
Value *Op0 = CI->getArgOperand(0);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- unsigned DstNumElts = CI->getType()->getVectorNumElements();
- unsigned SrcNumElts = Op0->getType()->getVectorNumElements();
+ unsigned DstNumElts = cast<VectorType>(CI->getType())->getNumElements();
+ unsigned SrcNumElts = cast<VectorType>(Op0->getType())->getNumElements();
unsigned Scale = SrcNumElts / DstNumElts;
// Mask off the high bits of the immediate value; hardware ignores those.
Imm = Imm % Scale;
// Get indexes for the subvector of the input vector.
- SmallVector<uint32_t, 8> Idxs(DstNumElts);
+ SmallVector<int, 8> Idxs(DstNumElts);
for (unsigned i = 0; i != DstNumElts; ++i) {
Idxs[i] = i + (Imm * DstNumElts);
}
@@ -2513,7 +2548,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
VectorType *VecTy = cast<VectorType>(CI->getType());
unsigned NumElts = VecTy->getNumElements();
- SmallVector<uint32_t, 8> Idxs(NumElts);
+ SmallVector<int, 8> Idxs(NumElts);
for (unsigned i = 0; i != NumElts; ++i)
Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3);
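
The index formula in the hunk above unpacks four 2-bit lane selectors from the shuffle immediate. The following snippet is only an illustration (not part of the patch): it runs the same formula on a made-up 4-element case with Imm = 0x1B, which reverses the lanes.

    #include <cstdio>

    int main() {
      unsigned Imm = 0x1B;   // made-up immediate: 2-bit selectors 3,2,1,0
      unsigned NumElts = 4;  // made-up element count for a 128-bit vector
      int Idxs[4];
      // Same computation as the upgrade loop above.
      for (unsigned i = 0; i != NumElts; ++i)
        Idxs[i] = (i & ~0x3) + ((Imm >> (2 * (i & 0x3))) & 3);
      for (unsigned i = 0; i != NumElts; ++i)
        std::printf("%d ", Idxs[i]);  // prints: 3 2 1 0
      std::printf("\n");
      return 0;
    }
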
@@ -2534,9 +2569,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
uint8_t Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
- unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(CI->getType())->getNumElements();
unsigned HalfSize = NumElts / 2;
- SmallVector<uint32_t, 8> ShuffleMask(NumElts);
+ SmallVector<int, 8> ShuffleMask(NumElts);
// Determine which operand(s) are actually in use for this instruction.
Value *V0 = (Imm & 0x02) ? CI->getArgOperand(1) : CI->getArgOperand(0);
@@ -2570,7 +2605,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
unsigned IdxSize = 64 / VecTy->getScalarSizeInBits();
unsigned IdxMask = ((1 << IdxSize) - 1);
- SmallVector<uint32_t, 8> Idxs(NumElts);
+ SmallVector<int, 8> Idxs(NumElts);
// Lookup the bits for this element, wrapping around the immediate every
// 8-bits. Elements are grouped into sets of 2 or 4 elements so we need
// to offset by the first index of each group.
@@ -2586,9 +2621,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.pshufl.w."))) {
Value *Op0 = CI->getArgOperand(0);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(CI->getType())->getNumElements();
- SmallVector<uint32_t, 16> Idxs(NumElts);
+ SmallVector<int, 16> Idxs(NumElts);
for (unsigned l = 0; l != NumElts; l += 8) {
for (unsigned i = 0; i != 4; ++i)
Idxs[i + l] = ((Imm >> (2 * i)) & 0x3) + l;
@@ -2605,9 +2640,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.pshufh.w."))) {
Value *Op0 = CI->getArgOperand(0);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(1))->getZExtValue();
- unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(CI->getType())->getNumElements();
- SmallVector<uint32_t, 16> Idxs(NumElts);
+ SmallVector<int, 16> Idxs(NumElts);
for (unsigned l = 0; l != NumElts; l += 8) {
for (unsigned i = 0; i != 4; ++i)
Idxs[i + l] = i + l;
@@ -2624,12 +2659,12 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
unsigned Imm = cast<ConstantInt>(CI->getArgOperand(2))->getZExtValue();
- unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(CI->getType())->getNumElements();
unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
unsigned HalfLaneElts = NumLaneElts / 2;
- SmallVector<uint32_t, 16> Idxs(NumElts);
+ SmallVector<int, 16> Idxs(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
// Base index is the starting element of the lane.
Idxs[i] = i - (i % NumLaneElts);
@@ -2649,14 +2684,14 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.movshdup") ||
Name.startswith("avx512.mask.movsldup"))) {
Value *Op0 = CI->getArgOperand(0);
- unsigned NumElts = CI->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(CI->getType())->getNumElements();
unsigned NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
unsigned Offset = 0;
if (Name.startswith("avx512.mask.movshdup."))
Offset = 1;
- SmallVector<uint32_t, 16> Idxs(NumElts);
+ SmallVector<int, 16> Idxs(NumElts);
for (unsigned l = 0; l != NumElts; l += NumLaneElts)
for (unsigned i = 0; i != NumLaneElts; i += 2) {
Idxs[i + l + 0] = i + l + Offset;
@@ -2671,10 +2706,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.unpckl."))) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
- int NumElts = CI->getType()->getVectorNumElements();
+ int NumElts = cast<VectorType>(CI->getType())->getNumElements();
int NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
- SmallVector<uint32_t, 64> Idxs(NumElts);
+ SmallVector<int, 64> Idxs(NumElts);
for (int l = 0; l != NumElts; l += NumLaneElts)
for (int i = 0; i != NumLaneElts; ++i)
Idxs[i + l] = l + (i / 2) + NumElts * (i % 2);
@@ -2687,10 +2722,10 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Name.startswith("avx512.mask.unpckh."))) {
Value *Op0 = CI->getArgOperand(0);
Value *Op1 = CI->getArgOperand(1);
- int NumElts = CI->getType()->getVectorNumElements();
+ int NumElts = cast<VectorType>(CI->getType())->getNumElements();
int NumLaneElts = 128/CI->getType()->getScalarSizeInBits();
- SmallVector<uint32_t, 64> Idxs(NumElts);
+ SmallVector<int, 64> Idxs(NumElts);
for (int l = 0; l != NumElts; l += NumLaneElts)
for (int i = 0; i != NumLaneElts; ++i)
Idxs[i + l] = (NumLaneElts / 2) + l + (i / 2) + NumElts * (i % 2);
@@ -3047,12 +3082,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
C, ConstantAsMetadata::get(ConstantInt::get(Type::getInt32Ty(C), 1)));
Value *Ptr = CI->getArgOperand(0);
- VectorType *VTy = cast<VectorType>(CI->getType());
// Convert the type of the pointer to a pointer to the stored type.
- Value *BC =
- Builder.CreateBitCast(Ptr, PointerType::getUnqual(VTy), "cast");
- LoadInst *LI = Builder.CreateAlignedLoad(VTy, BC, VTy->getBitWidth() / 8);
+ Value *BC = Builder.CreateBitCast(
+ Ptr, PointerType::getUnqual(CI->getType()), "cast");
+ LoadInst *LI = Builder.CreateAlignedLoad(
+ CI->getType(), BC,
+ Align(CI->getType()->getPrimitiveSizeInBits().getFixedSize() / 8));
LI->setMetadata(M->getMDKindID("nontemporal"), Node);
Rep = LI;
} else if (IsX86 && (Name.startswith("fma.vfmadd.") ||
@@ -3209,28 +3245,26 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
CI->getArgOperand(0);
Rep = EmitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru);
- } else if (IsX86 && (Name.startswith("fma.vfmaddsub.p") ||
- Name.startswith("fma.vfmsubadd.p"))) {
- bool IsSubAdd = Name[7] == 's';
- int NumElts = CI->getType()->getVectorNumElements();
+ } else if (IsX86 && Name.startswith("fma.vfmsubadd.p")) {
+ unsigned VecWidth = CI->getType()->getPrimitiveSizeInBits();
+ unsigned EltWidth = CI->getType()->getScalarSizeInBits();
+ Intrinsic::ID IID;
+ if (VecWidth == 128 && EltWidth == 32)
+ IID = Intrinsic::x86_fma_vfmaddsub_ps;
+ else if (VecWidth == 256 && EltWidth == 32)
+ IID = Intrinsic::x86_fma_vfmaddsub_ps_256;
+ else if (VecWidth == 128 && EltWidth == 64)
+ IID = Intrinsic::x86_fma_vfmaddsub_pd;
+ else if (VecWidth == 256 && EltWidth == 64)
+ IID = Intrinsic::x86_fma_vfmaddsub_pd_256;
+ else
+ llvm_unreachable("Unexpected intrinsic");
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2) };
-
- Function *FMA = Intrinsic::getDeclaration(CI->getModule(), Intrinsic::fma,
- Ops[0]->getType());
- Value *Odd = Builder.CreateCall(FMA, Ops);
Ops[2] = Builder.CreateFNeg(Ops[2]);
- Value *Even = Builder.CreateCall(FMA, Ops);
-
- if (IsSubAdd)
- std::swap(Even, Odd);
-
- SmallVector<uint32_t, 32> Idxs(NumElts);
- for (int i = 0; i != NumElts; ++i)
- Idxs[i] = i + (i % 2) * NumElts;
-
- Rep = Builder.CreateShuffleVector(Even, Odd, Idxs);
+ Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
+ Ops);
} else if (IsX86 && (Name.startswith("avx512.mask.vfmaddsub.p") ||
Name.startswith("avx512.mask3.vfmaddsub.p") ||
Name.startswith("avx512.maskz.vfmaddsub.p") ||
@@ -3240,9 +3274,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Drop the "avx512.mask." to make it easier.
Name = Name.drop_front(IsMask3 || IsMaskZ ? 13 : 12);
bool IsSubAdd = Name[3] == 's';
- if (CI->getNumArgOperands() == 5 &&
- (!isa<ConstantInt>(CI->getArgOperand(4)) ||
- cast<ConstantInt>(CI->getArgOperand(4))->getZExtValue() != 4)) {
+ if (CI->getNumArgOperands() == 5) {
Intrinsic::ID IID;
// Check the character before ".512" in string.
if (Name[Name.size()-5] == 's')
@@ -3256,10 +3288,9 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
Ops[2] = Builder.CreateFNeg(Ops[2]);
Rep = Builder.CreateCall(Intrinsic::getDeclaration(F->getParent(), IID),
- {CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), CI->getArgOperand(4)});
+ Ops);
} else {
- int NumElts = CI->getType()->getVectorNumElements();
+ int NumElts = cast<VectorType>(CI->getType())->getNumElements();
Value *Ops[] = { CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2) };
@@ -3273,7 +3304,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
if (IsSubAdd)
std::swap(Even, Odd);
- SmallVector<uint32_t, 32> Idxs(NumElts);
+ SmallVector<int, 32> Idxs(NumElts);
for (int i = 0; i != NumElts; ++i)
Idxs[i] = i + (i % 2) * NumElts;
@@ -3434,7 +3465,7 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Cast the pointer to the right type.
Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(3),
llvm::PointerType::getUnqual(Data->getType()));
- Builder.CreateAlignedStore(Data, Ptr, 1);
+ Builder.CreateAlignedStore(Data, Ptr, Align(1));
// Replace the original call result with the first result of the new call.
Value *CF = Builder.CreateExtractValue(NewCall, 0);
@@ -3629,13 +3660,13 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// So, the only thing required is a bitcast for both arguments.
// First, check the arguments have the old type.
Value *Arg0 = CI->getArgOperand(0);
- if (Arg0->getType() != VectorType::get(Type::getFloatTy(C), 4))
+ if (Arg0->getType() != FixedVectorType::get(Type::getFloatTy(C), 4))
return;
// Old intrinsic, add bitcasts
Value *Arg1 = CI->getArgOperand(1);
- Type *NewVecTy = VectorType::get(Type::getInt64Ty(C), 2);
+ auto *NewVecTy = FixedVectorType::get(Type::getInt64Ty(C), 2);
Value *BC0 = Builder.CreateBitCast(Arg0, NewVecTy, "cast");
Value *BC1 = Builder.CreateBitCast(Arg1, NewVecTy, "cast");
@@ -3656,11 +3687,11 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
// Cast the pointer to the right type.
Value *Ptr = Builder.CreateBitCast(CI->getArgOperand(0),
llvm::PointerType::getUnqual(Data->getType()));
- Builder.CreateAlignedStore(Data, Ptr, 1);
+ Builder.CreateAlignedStore(Data, Ptr, Align(1));
// Replace the original call result with the first result of the new call.
Value *TSC = Builder.CreateExtractValue(NewCall, 0);
- std::string Name = CI->getName();
+ std::string Name = std::string(CI->getName());
if (!Name.empty()) {
CI->setName(Name + ".old");
NewCall->setName(Name);
@@ -3726,16 +3757,16 @@ void llvm::UpgradeIntrinsicCall(CallInst *CI, Function *NewFn) {
auto *MemCI = cast<MemIntrinsic>(NewCall);
// All mem intrinsics support dest alignment.
const ConstantInt *Align = cast<ConstantInt>(CI->getArgOperand(3));
- MemCI->setDestAlignment(Align->getZExtValue());
+ MemCI->setDestAlignment(Align->getMaybeAlignValue());
// Memcpy/Memmove also support source alignment.
if (auto *MTI = dyn_cast<MemTransferInst>(MemCI))
- MTI->setSourceAlignment(Align->getZExtValue());
+ MTI->setSourceAlignment(Align->getMaybeAlignValue());
break;
}
}
assert(NewCall && "Should have either set this variable or returned through "
"the default case");
- std::string Name = CI->getName();
+ std::string Name = std::string(CI->getName());
if (!Name.empty()) {
CI->setName(Name + ".old");
NewCall->setName(Name);
@@ -4005,6 +4036,12 @@ bool llvm::UpgradeModuleFlags(Module &M) {
return false;
bool HasObjCFlag = false, HasClassProperties = false, Changed = false;
+ bool HasSwiftVersionFlag = false;
+ uint8_t SwiftMajorVersion, SwiftMinorVersion;
+ uint32_t SwiftABIVersion;
+ auto Int8Ty = Type::getInt8Ty(M.getContext());
+ auto Int32Ty = Type::getInt32Ty(M.getContext());
+
for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
MDNode *Op = ModFlags->getOperand(I);
if (Op->getNumOperands() != 3)
@@ -4050,6 +4087,31 @@ bool llvm::UpgradeModuleFlags(Module &M) {
}
}
}
+
+  // IRUpgrader turns an i32 "Objective-C Garbage Collection" flag into an i8
+  // value. If the higher bits are set, it also adds module flags for Swift info.
+ if (ID->getString() == "Objective-C Garbage Collection") {
+ auto Md = dyn_cast<ConstantAsMetadata>(Op->getOperand(2));
+ if (Md) {
+ assert(Md->getValue() && "Expected non-empty metadata");
+ auto Type = Md->getValue()->getType();
+ if (Type == Int8Ty)
+ continue;
+ unsigned Val = Md->getValue()->getUniqueInteger().getZExtValue();
+ if ((Val & 0xff) != Val) {
+ HasSwiftVersionFlag = true;
+ SwiftABIVersion = (Val & 0xff00) >> 8;
+ SwiftMajorVersion = (Val & 0xff000000) >> 24;
+ SwiftMinorVersion = (Val & 0xff0000) >> 16;
+ }
+ Metadata *Ops[3] = {
+ ConstantAsMetadata::get(ConstantInt::get(Int32Ty,Module::Error)),
+ Op->getOperand(1),
+ ConstantAsMetadata::get(ConstantInt::get(Int8Ty,Val & 0xff))};
+ ModFlags->setOperand(I, MDNode::get(M.getContext(), Ops));
+ Changed = true;
+ }
+ }
}
// "Objective-C Class Properties" is recently added for Objective-C. We
@@ -4063,6 +4125,16 @@ bool llvm::UpgradeModuleFlags(Module &M) {
Changed = true;
}
+ if (HasSwiftVersionFlag) {
+ M.addModuleFlag(Module::Error, "Swift ABI Version",
+ SwiftABIVersion);
+ M.addModuleFlag(Module::Error, "Swift Major Version",
+ ConstantInt::get(Int8Ty, SwiftMajorVersion));
+ M.addModuleFlag(Module::Error, "Swift Minor Version",
+ ConstantInt::get(Int8Ty, SwiftMinorVersion));
+ Changed = true;
+ }
+
return Changed;
}
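
The two hunks above split a packed i32 "Objective-C Garbage Collection" value into an i8 GC flag plus separate Swift version flags. A rough sketch of the assumed bit layout, with a made-up value and not part of the patch:

    #include <cstdint>
    #include <cstdio>

    int main() {
      uint32_t Val = 0x05010702;                       // e.g. Swift 5.1, ABI 7, GC setting 2
      uint8_t GCSetting = Val & 0xff;                  // kept as the new i8 flag value
      uint32_t SwiftABIVersion = (Val & 0xff00) >> 8;  // "Swift ABI Version"
      uint8_t SwiftMajor = (Val & 0xff000000) >> 24;   // "Swift Major Version"
      uint8_t SwiftMinor = (Val & 0xff0000) >> 16;     // "Swift Minor Version"
      std::printf("gc=%u abi=%u swift=%u.%u\n", (unsigned)GCSetting,
                  SwiftABIVersion, (unsigned)SwiftMajor, (unsigned)SwiftMinor);
      return 0;
    }
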
@@ -4077,7 +4149,7 @@ void llvm::UpgradeSectionAttributes(Module &M) {
for (auto Component : Components)
OS << ',' << Component.trim();
- return OS.str().substr(1);
+ return std::string(OS.str().substr(1));
};
for (auto &GV : M.globals()) {
@@ -4095,6 +4167,43 @@ void llvm::UpgradeSectionAttributes(Module &M) {
}
}
+namespace {
+// Prior to LLVM 10.0, the strictfp attribute could be used on individual
+// callsites within a function that did not also have the strictfp attribute.
+// Since 10.0, if strict FP semantics are needed within a function, the
+// function must have the strictfp attribute and all calls within the function
+// must also have the strictfp attribute. This latter restriction is
+// necessary to prevent unwanted libcall simplification when a function is
+// being cloned (such as for inlining).
+//
+// The "dangling" strictfp attribute usage was only used to prevent constant
+// folding and other libcall simplification. The nobuiltin attribute on the
+// callsite has the same effect.
+struct StrictFPUpgradeVisitor : public InstVisitor<StrictFPUpgradeVisitor> {
+ StrictFPUpgradeVisitor() {}
+
+ void visitCallBase(CallBase &Call) {
+ if (!Call.isStrictFP())
+ return;
+ if (isa<ConstrainedFPIntrinsic>(&Call))
+ return;
+ // If we get here, the caller doesn't have the strictfp attribute
+ // but this callsite does. Replace the strictfp attribute with nobuiltin.
+ Call.removeAttribute(AttributeList::FunctionIndex, Attribute::StrictFP);
+ Call.addAttribute(AttributeList::FunctionIndex, Attribute::NoBuiltin);
+ }
+};
+} // namespace
+
+void llvm::UpgradeFunctionAttributes(Function &F) {
+ // If a function definition doesn't have the strictfp attribute,
+ // convert any callsite strictfp attributes to nobuiltin.
+ if (!F.isDeclaration() && !F.hasFnAttribute(Attribute::StrictFP)) {
+ StrictFPUpgradeVisitor SFPV;
+ SFPV.visit(F);
+ }
+}
+
static bool isOldLoopArgument(Metadata *MD) {
auto *T = dyn_cast_or_null<MDTuple>(MD);
if (!T)
@@ -4163,19 +4272,19 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
// If X86, and the datalayout matches the expected format, add pointer size
// address spaces to the datalayout.
if (!Triple(TT).isX86() || DL.contains(AddrSpaces))
- return DL;
+ return std::string(DL);
SmallVector<StringRef, 4> Groups;
Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)");
if (!R.match(DL, &Groups))
- return DL;
+ return std::string(DL);
SmallString<1024> Buf;
std::string Res = (Groups[1] + AddrSpaces + Groups[3]).toStringRef(Buf).str();
return Res;
}
-void llvm::UpgradeFramePointerAttributes(AttrBuilder &B) {
+void llvm::UpgradeAttributes(AttrBuilder &B) {
StringRef FramePointer;
if (B.contains("no-frame-pointer-elim")) {
// The value can be "true" or "false".
@@ -4190,7 +4299,17 @@ void llvm::UpgradeFramePointerAttributes(AttrBuilder &B) {
FramePointer = "non-leaf";
B.removeAttribute("no-frame-pointer-elim-non-leaf");
}
-
if (!FramePointer.empty())
B.addAttribute("frame-pointer", FramePointer);
+
+ if (B.contains("null-pointer-is-valid")) {
+ // The value can be "true" or "false".
+ bool NullPointerIsValid = false;
+ for (const auto &I : B.td_attrs())
+ if (I.first == "null-pointer-is-valid")
+ NullPointerIsValid = I.second == "true";
+ B.removeAttribute("null-pointer-is-valid");
+ if (NullPointerIsValid)
+ B.addAttribute(Attribute::NullPointerIsValid);
+ }
}
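
For the StrictFPUpgradeVisitor hunk further up: UpgradeFunctionAttributes is the new per-function entry point. A minimal sketch of how a reader of old bitcode might drive it, assuming the matching declaration lives in llvm/IR/AutoUpgrade.h (the header change is not shown in this diff):

    #include "llvm/IR/AutoUpgrade.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/Module.h"

    static void upgradeStrictFPCallSites(llvm::Module &M) {
      for (llvm::Function &F : M)
        // For definitions that lack the strictfp attribute, any strictfp call
        // sites inside F (other than constrained intrinsics) are rewritten to
        // carry nobuiltin instead.
        llvm::UpgradeFunctionAttributes(F);
    }
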
diff --git a/llvm/lib/IR/BasicBlock.cpp b/llvm/lib/IR/BasicBlock.cpp
index bdee6990f932..64f1d3f3100c 100644
--- a/llvm/lib/IR/BasicBlock.cpp
+++ b/llvm/lib/IR/BasicBlock.cpp
@@ -33,6 +33,10 @@ LLVMContext &BasicBlock::getContext() const {
return getType()->getContext();
}
+template <> void llvm::invalidateParentIListOrdering(BasicBlock *BB) {
+ BB->invalidateOrders();
+}
+
// Explicit instantiation of SymbolTableListTraits since some of the methods
// are not in the public header file...
template class llvm::SymbolTableListTraits<Instruction>;
@@ -61,6 +65,8 @@ void BasicBlock::insertInto(Function *NewParent, BasicBlock *InsertBefore) {
}
BasicBlock::~BasicBlock() {
+ validateInstrOrdering();
+
// If the address of the block is taken and it is being deleted (e.g. because
// it is dead), this means that there is either a dangling constant expr
// hanging off the block, or an undefined use of the block (source code
@@ -193,6 +199,18 @@ const CallInst *BasicBlock::getTerminatingDeoptimizeCall() const {
return nullptr;
}
+const CallInst *BasicBlock::getPostdominatingDeoptimizeCall() const {
+ const BasicBlock* BB = this;
+ SmallPtrSet<const BasicBlock *, 8> Visited;
+ Visited.insert(BB);
+ while (auto *Succ = BB->getUniqueSuccessor()) {
+ if (!Visited.insert(Succ).second)
+ return nullptr;
+ BB = Succ;
+ }
+ return BB->getTerminatingDeoptimizeCall();
+}
+
const Instruction* BasicBlock::getFirstNonPHI() const {
for (const Instruction &I : *this)
if (!isa<PHINode>(I))
@@ -273,7 +291,7 @@ bool BasicBlock::hasNPredecessorsOrMore(unsigned N) const {
}
const BasicBlock *BasicBlock::getSingleSuccessor() const {
- succ_const_iterator SI = succ_begin(this), E = succ_end(this);
+ const_succ_iterator SI = succ_begin(this), E = succ_end(this);
if (SI == E) return nullptr; // no successors
const BasicBlock *TheSucc = *SI;
++SI;
@@ -281,7 +299,7 @@ const BasicBlock *BasicBlock::getSingleSuccessor() const {
}
const BasicBlock *BasicBlock::getUniqueSuccessor() const {
- succ_const_iterator SI = succ_begin(this), E = succ_end(this);
+ const_succ_iterator SI = succ_begin(this), E = succ_end(this);
if (SI == E) return nullptr; // No successors
const BasicBlock *SuccBB = *SI;
++SI;
@@ -299,78 +317,38 @@ iterator_range<BasicBlock::phi_iterator> BasicBlock::phis() {
return make_range<phi_iterator>(P, nullptr);
}
-/// This method is used to notify a BasicBlock that the
-/// specified Predecessor of the block is no longer able to reach it. This is
-/// actually not used to update the Predecessor list, but is actually used to
-/// update the PHI nodes that reside in the block. Note that this should be
-/// called while the predecessor still refers to this block.
+/// Update PHI nodes in this BasicBlock before removal of predecessor \p Pred.
+/// Note that this function does not actually remove the predecessor.
///
+/// If \p KeepOneInputPHIs is true then don't remove PHIs that are left with
+/// zero or one incoming values, and don't simplify PHIs with all incoming
+/// values the same.
void BasicBlock::removePredecessor(BasicBlock *Pred,
bool KeepOneInputPHIs) {
- assert((hasNUsesOrMore(16)||// Reduce cost of this assertion for complex CFGs.
+ // Use hasNUsesOrMore to bound the cost of this assertion for complex CFGs.
+ assert((hasNUsesOrMore(16) ||
find(pred_begin(this), pred_end(this), Pred) != pred_end(this)) &&
- "removePredecessor: BB is not a predecessor!");
-
- if (InstList.empty()) return;
- PHINode *APN = dyn_cast<PHINode>(&front());
- if (!APN) return; // Quick exit.
-
- // If there are exactly two predecessors, then we want to nuke the PHI nodes
- // altogether. However, we cannot do this, if this in this case:
- //
- // Loop:
- // %x = phi [X, Loop]
- // %x2 = add %x, 1 ;; This would become %x2 = add %x2, 1
- // br Loop ;; %x2 does not dominate all uses
- //
- // This is because the PHI node input is actually taken from the predecessor
- // basic block. The only case this can happen is with a self loop, so we
- // check for this case explicitly now.
- //
- unsigned max_idx = APN->getNumIncomingValues();
- assert(max_idx != 0 && "PHI Node in block with 0 predecessors!?!?!");
- if (max_idx == 2) {
- BasicBlock *Other = APN->getIncomingBlock(APN->getIncomingBlock(0) == Pred);
-
- // Disable PHI elimination!
- if (this == Other) max_idx = 3;
- }
+ "Pred is not a predecessor!");
- // <= Two predecessors BEFORE I remove one?
- if (max_idx <= 2 && !KeepOneInputPHIs) {
- // Yup, loop through and nuke the PHI nodes
- while (PHINode *PN = dyn_cast<PHINode>(&front())) {
- // Remove the predecessor first.
- PN->removeIncomingValue(Pred, !KeepOneInputPHIs);
-
- // If the PHI _HAD_ two uses, replace PHI node with its now *single* value
- if (max_idx == 2) {
- if (PN->getIncomingValue(0) != PN)
- PN->replaceAllUsesWith(PN->getIncomingValue(0));
- else
- // We are left with an infinite loop with no entries: kill the PHI.
- PN->replaceAllUsesWith(UndefValue::get(PN->getType()));
- getInstList().pop_front(); // Remove the PHI node
- }
-
- // If the PHI node already only had one entry, it got deleted by
- // removeIncomingValue.
- }
- } else {
- // Okay, now we know that we need to remove predecessor #pred_idx from all
- // PHI nodes. Iterate over each PHI node fixing them up
- PHINode *PN;
- for (iterator II = begin(); (PN = dyn_cast<PHINode>(II)); ) {
- ++II;
- PN->removeIncomingValue(Pred, false);
- // If all incoming values to the Phi are the same, we can replace the Phi
- // with that value.
- Value* PNV = nullptr;
- if (!KeepOneInputPHIs && (PNV = PN->hasConstantValue()))
- if (PNV != PN) {
+ // Return early if there are no PHI nodes to update.
+ if (!isa<PHINode>(begin()))
+ return;
+ unsigned NumPreds = cast<PHINode>(front()).getNumIncomingValues();
+
+ // Update all PHI nodes.
+ for (iterator II = begin(); isa<PHINode>(II);) {
+ PHINode *PN = cast<PHINode>(II++);
+ PN->removeIncomingValue(Pred, !KeepOneInputPHIs);
+ if (!KeepOneInputPHIs) {
+ // If we have a single predecessor, removeIncomingValue erased the PHI
+ // node itself.
+ if (NumPreds > 1) {
+ if (Value *PNV = PN->hasConstantValue()) {
+ // Replace the PHI node with its constant value.
PN->replaceAllUsesWith(PNV);
PN->eraseFromParent();
}
+ }
}
}
}
@@ -494,3 +472,29 @@ BasicBlock::iterator llvm::skipDebugIntrinsics(BasicBlock::iterator It) {
++It;
return It;
}
+
+void BasicBlock::renumberInstructions() {
+ unsigned Order = 0;
+ for (Instruction &I : *this)
+ I.Order = Order++;
+
+  // Set the bit to indicate that the instruction order is valid and cached.
+ BasicBlockBits Bits = getBasicBlockBits();
+ Bits.InstrOrderValid = true;
+ setBasicBlockBits(Bits);
+}
+
+#ifndef NDEBUG
+/// In asserts builds, this checks the numbering. In non-asserts builds, it
+/// is defined as a no-op inline function in BasicBlock.h.
+void BasicBlock::validateInstrOrdering() const {
+ if (!isInstrOrderValid())
+ return;
+ const Instruction *Prev = nullptr;
+ for (const Instruction &I : *this) {
+ assert((!Prev || Prev->comesBefore(&I)) &&
+ "cached instruction ordering is incorrect");
+ Prev = &I;
+ }
+}
+#endif
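
The renumberInstructions/validateInstrOrdering pair above backs the cached per-block ordering that Instruction::comesBefore relies on. A small usage sketch, illustrative only, with the block and instruction names assumed:

    #include "llvm/IR/BasicBlock.h"
    #include "llvm/IR/Instruction.h"
    #include <cassert>

    static bool comesEarlierInBlock(const llvm::Instruction *A,
                                    const llvm::Instruction *B) {
      assert(A->getParent() == B->getParent() && "expects a single block");
      // The first query after a modification renumbers the whole block; later
      // queries reuse the cached Order values until the block changes again.
      return A->comesBefore(B);
    }
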
diff --git a/llvm/lib/IR/ConstantFold.cpp b/llvm/lib/IR/ConstantFold.cpp
index 6e24f03c4cfd..f02246cda7fc 100644
--- a/llvm/lib/IR/ConstantFold.cpp
+++ b/llvm/lib/IR/ConstantFold.cpp
@@ -47,14 +47,24 @@ static Constant *BitCastConstantVector(Constant *CV, VectorType *DstTy) {
if (CV->isAllOnesValue()) return Constant::getAllOnesValue(DstTy);
if (CV->isNullValue()) return Constant::getNullValue(DstTy);
+  // Do not iterate on scalable vector. The number of elements is unknown at
+  // compile-time.
+ if (isa<ScalableVectorType>(DstTy))
+ return nullptr;
+
// If this cast changes element count then we can't handle it here:
// doing so requires endianness information. This should be handled by
// Analysis/ConstantFolding.cpp
- unsigned NumElts = DstTy->getNumElements();
- if (NumElts != CV->getType()->getVectorNumElements())
+ unsigned NumElts = cast<FixedVectorType>(DstTy)->getNumElements();
+ if (NumElts != cast<FixedVectorType>(CV->getType())->getNumElements())
return nullptr;
Type *DstEltTy = DstTy->getElementType();
+ // Fast path for splatted constants.
+ if (Constant *Splat = CV->getSplatValue()) {
+ return ConstantVector::getSplat(DstTy->getElementCount(),
+ ConstantExpr::getBitCast(Splat, DstEltTy));
+ }
SmallVector<Constant*, 16> Result;
Type *Ty = IntegerType::get(CV->getContext(), 32);
@@ -114,18 +124,9 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
Constant::getNullValue(Type::getInt32Ty(DPTy->getContext()));
IdxList.push_back(Zero);
Type *ElTy = PTy->getElementType();
- while (ElTy != DPTy->getElementType()) {
- if (StructType *STy = dyn_cast<StructType>(ElTy)) {
- if (STy->getNumElements() == 0) break;
- ElTy = STy->getElementType(0);
- IdxList.push_back(Zero);
- } else if (SequentialType *STy =
- dyn_cast<SequentialType>(ElTy)) {
- ElTy = STy->getElementType();
- IdxList.push_back(Zero);
- } else {
- break;
- }
+ while (ElTy && ElTy != DPTy->getElementType()) {
+ ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, (uint64_t)0);
+ IdxList.push_back(Zero);
}
if (ElTy == DPTy->getElementType())
@@ -138,7 +139,8 @@ static Constant *FoldBitCast(Constant *V, Type *DestTy) {
// and dest type have the same size (otherwise its an illegal cast).
if (VectorType *DestPTy = dyn_cast<VectorType>(DestTy)) {
if (VectorType *SrcTy = dyn_cast<VectorType>(V->getType())) {
- assert(DestPTy->getBitWidth() == SrcTy->getBitWidth() &&
+ assert(DestPTy->getPrimitiveSizeInBits() ==
+ SrcTy->getPrimitiveSizeInBits() &&
"Not cast between same sized vectors!");
SrcTy = nullptr;
// First, check for null. Undef is already handled.
@@ -571,12 +573,21 @@ Constant *llvm::ConstantFoldCastInstruction(unsigned opc, Constant *V,
// count may be mismatched; don't attempt to handle that here.
if ((isa<ConstantVector>(V) || isa<ConstantDataVector>(V)) &&
DestTy->isVectorTy() &&
- DestTy->getVectorNumElements() == V->getType()->getVectorNumElements()) {
- SmallVector<Constant*, 16> res;
+ cast<FixedVectorType>(DestTy)->getNumElements() ==
+ cast<FixedVectorType>(V->getType())->getNumElements()) {
VectorType *DestVecTy = cast<VectorType>(DestTy);
Type *DstEltTy = DestVecTy->getElementType();
+ // Fast path for splatted constants.
+ if (Constant *Splat = V->getSplatValue()) {
+ return ConstantVector::getSplat(
+ cast<VectorType>(DestTy)->getElementCount(),
+ ConstantExpr::getCast(opc, Splat, DstEltTy));
+ }
+ SmallVector<Constant *, 16> res;
Type *Ty = IntegerType::get(V->getContext(), 32);
- for (unsigned i = 0, e = V->getType()->getVectorNumElements(); i != e; ++i) {
+ for (unsigned i = 0,
+ e = cast<FixedVectorType>(V->getType())->getNumElements();
+ i != e; ++i) {
Constant *C =
ConstantExpr::getExtractElement(V, ConstantInt::get(Ty, i));
res.push_back(ConstantExpr::getCast(opc, C, DstEltTy));
@@ -738,9 +749,10 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
// If the condition is a vector constant, fold the result elementwise.
if (ConstantVector *CondV = dyn_cast<ConstantVector>(Cond)) {
+ auto *V1VTy = CondV->getType();
SmallVector<Constant*, 16> Result;
Type *Ty = IntegerType::get(CondV->getContext(), 32);
- for (unsigned i = 0, e = V1->getType()->getVectorNumElements(); i != e;++i){
+ for (unsigned i = 0, e = V1VTy->getNumElements(); i != e; ++i) {
Constant *V;
Constant *V1Element = ConstantExpr::getExtractElement(V1,
ConstantInt::get(Ty, i));
@@ -759,7 +771,7 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
}
// If we were able to build the vector, return it.
- if (Result.size() == V1->getType()->getVectorNumElements())
+ if (Result.size() == V1VTy->getNumElements())
return ConstantVector::get(Result);
}
@@ -767,10 +779,30 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
if (isa<UndefValue>(V1)) return V1;
return V2;
}
- if (isa<UndefValue>(V1)) return V2;
- if (isa<UndefValue>(V2)) return V1;
+
if (V1 == V2) return V1;
+ // If the true or false value is undef, we can fold to the other value as
+ // long as the other value isn't poison.
+ auto NotPoison = [](Constant *C) {
+ // TODO: We can analyze ConstExpr by opcode to determine if there is any
+ // possibility of poison.
+ if (isa<ConstantExpr>(C))
+ return false;
+
+ if (isa<ConstantInt>(C) || isa<GlobalVariable>(C) || isa<ConstantFP>(C) ||
+ isa<ConstantPointerNull>(C) || isa<Function>(C))
+ return true;
+
+ if (C->getType()->isVectorTy())
+ return !C->containsUndefElement() && !C->containsConstantExpression();
+
+ // TODO: Recursively analyze aggregates or other constants.
+ return false;
+ };
+ if (isa<UndefValue>(V1) && NotPoison(V2)) return V2;
+ if (isa<UndefValue>(V2) && NotPoison(V1)) return V1;
+
if (ConstantExpr *TrueVal = dyn_cast<ConstantExpr>(V1)) {
if (TrueVal->getOpcode() == Instruction::Select)
if (TrueVal->getOperand(0) == Cond)
@@ -787,18 +819,22 @@ Constant *llvm::ConstantFoldSelectInstruction(Constant *Cond,
Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val,
Constant *Idx) {
+ auto *ValVTy = cast<VectorType>(Val->getType());
+
// extractelt undef, C -> undef
// extractelt C, undef -> undef
if (isa<UndefValue>(Val) || isa<UndefValue>(Idx))
- return UndefValue::get(Val->getType()->getVectorElementType());
+ return UndefValue::get(ValVTy->getElementType());
auto *CIdx = dyn_cast<ConstantInt>(Idx);
if (!CIdx)
return nullptr;
- // ee({w,x,y,z}, wrong_value) -> undef
- if (CIdx->uge(Val->getType()->getVectorNumElements()))
- return UndefValue::get(Val->getType()->getVectorElementType());
+ if (auto *ValFVTy = dyn_cast<FixedVectorType>(Val->getType())) {
+ // ee({w,x,y,z}, wrong_value) -> undef
+ if (CIdx->uge(ValFVTy->getNumElements()))
+ return UndefValue::get(ValFVTy->getElementType());
+ }
// ee (gep (ptr, idx0, ...), idx) -> gep (ee (ptr, idx), ee (idx0, idx), ...)
if (auto *CE = dyn_cast<ConstantExpr>(Val)) {
@@ -810,17 +846,26 @@ Constant *llvm::ConstantFoldExtractElementInstruction(Constant *Val,
if (Op->getType()->isVectorTy()) {
Constant *ScalarOp = ConstantExpr::getExtractElement(Op, Idx);
if (!ScalarOp)
- return nullptr;
+ return nullptr;
Ops.push_back(ScalarOp);
} else
Ops.push_back(Op);
}
- return CE->getWithOperands(Ops, CE->getType()->getVectorElementType(),
- false,
+ return CE->getWithOperands(Ops, ValVTy->getElementType(), false,
Ops[0]->getType()->getPointerElementType());
}
}
+ // CAZ of type ScalableVectorType and n < CAZ->getMinNumElements() =>
+ // extractelt CAZ, n -> 0
+ if (auto *ValSVTy = dyn_cast<ScalableVectorType>(Val->getType())) {
+ if (!CIdx->uge(ValSVTy->getMinNumElements())) {
+ if (auto *CAZ = dyn_cast<ConstantAggregateZero>(Val))
+ return CAZ->getElementValue(CIdx->getZExtValue());
+ }
+ return nullptr;
+ }
+
return Val->getAggregateElement(CIdx);
}
@@ -835,11 +880,12 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
// Do not iterate on scalable vector. The num of elements is unknown at
// compile-time.
- VectorType *ValTy = cast<VectorType>(Val->getType());
- if (ValTy->isScalable())
+ if (isa<ScalableVectorType>(Val->getType()))
return nullptr;
- unsigned NumElts = Val->getType()->getVectorNumElements();
+ auto *ValTy = cast<FixedVectorType>(Val->getType());
+
+ unsigned NumElts = ValTy->getNumElements();
if (CIdx->uge(NumElts))
return UndefValue::get(Val->getType());
@@ -860,31 +906,38 @@ Constant *llvm::ConstantFoldInsertElementInstruction(Constant *Val,
return ConstantVector::get(Result);
}
-Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1,
- Constant *V2,
- Constant *Mask) {
- unsigned MaskNumElts = Mask->getType()->getVectorNumElements();
- Type *EltTy = V1->getType()->getVectorElementType();
+Constant *llvm::ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
+ ArrayRef<int> Mask) {
+ auto *V1VTy = cast<VectorType>(V1->getType());
+ unsigned MaskNumElts = Mask.size();
+ ElementCount MaskEltCount = {MaskNumElts, isa<ScalableVectorType>(V1VTy)};
+ Type *EltTy = V1VTy->getElementType();
// Undefined shuffle mask -> undefined value.
- if (isa<UndefValue>(Mask))
- return UndefValue::get(VectorType::get(EltTy, MaskNumElts));
-
- // Don't break the bitcode reader hack.
- if (isa<ConstantExpr>(Mask)) return nullptr;
+ if (all_of(Mask, [](int Elt) { return Elt == UndefMaskElem; })) {
+ return UndefValue::get(FixedVectorType::get(EltTy, MaskNumElts));
+ }
+  // If the mask is all zeros, this is a splat; no need to go through all
+  // elements.
+ if (all_of(Mask, [](int Elt) { return Elt == 0; }) &&
+ !MaskEltCount.Scalable) {
+ Type *Ty = IntegerType::get(V1->getContext(), 32);
+ Constant *Elt =
+ ConstantExpr::getExtractElement(V1, ConstantInt::get(Ty, 0));
+ return ConstantVector::getSplat(MaskEltCount, Elt);
+ }
// Do not iterate on scalable vector. The num of elements is unknown at
// compile-time.
- VectorType *ValTy = cast<VectorType>(V1->getType());
- if (ValTy->isScalable())
+ if (isa<ScalableVectorType>(V1VTy))
return nullptr;
- unsigned SrcNumElts = V1->getType()->getVectorNumElements();
+ unsigned SrcNumElts = V1VTy->getElementCount().Min;
// Loop over the shuffle mask, evaluating each element.
SmallVector<Constant*, 32> Result;
for (unsigned i = 0; i != MaskNumElts; ++i) {
- int Elt = ShuffleVectorInst::getMaskValue(Mask, i);
+ int Elt = Mask[i];
if (Elt == -1) {
Result.push_back(UndefValue::get(EltTy));
continue;
@@ -930,7 +983,7 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
NumElts = ST->getNumElements();
else
- NumElts = cast<SequentialType>(Agg->getType())->getNumElements();
+ NumElts = cast<ArrayType>(Agg->getType())->getNumElements();
SmallVector<Constant*, 32> Result;
for (unsigned i = 0; i != NumElts; ++i) {
@@ -945,18 +998,19 @@ Constant *llvm::ConstantFoldInsertValueInstruction(Constant *Agg,
if (StructType *ST = dyn_cast<StructType>(Agg->getType()))
return ConstantStruct::get(ST, Result);
- if (ArrayType *AT = dyn_cast<ArrayType>(Agg->getType()))
- return ConstantArray::get(AT, Result);
- return ConstantVector::get(Result);
+ return ConstantArray::get(cast<ArrayType>(Agg->getType()), Result);
}
Constant *llvm::ConstantFoldUnaryInstruction(unsigned Opcode, Constant *C) {
assert(Instruction::isUnaryOp(Opcode) && "Non-unary instruction detected");
- // Handle scalar UndefValue. Vectors are always evaluated per element.
- bool HasScalarUndef = !C->getType()->isVectorTy() && isa<UndefValue>(C);
+ // Handle scalar UndefValue and scalable vector UndefValue. Fixed-length
+ // vectors are always evaluated per element.
+ bool IsScalableVector = isa<ScalableVectorType>(C->getType());
+ bool HasScalarUndefOrScalableVectorUndef =
+ (!C->getType()->isVectorTy() || IsScalableVector) && isa<UndefValue>(C);
- if (HasScalarUndef) {
+ if (HasScalarUndefOrScalableVectorUndef) {
switch (static_cast<Instruction::UnaryOps>(Opcode)) {
case Instruction::FNeg:
return C; // -undef -> undef
@@ -966,7 +1020,7 @@ Constant *llvm::ConstantFoldUnaryInstruction(unsigned Opcode, Constant *C) {
}
// Constant should not be UndefValue, unless these are vector constants.
- assert(!HasScalarUndef && "Unexpected UndefValue");
+ assert(!HasScalarUndefOrScalableVectorUndef && "Unexpected UndefValue");
// We only have FP UnaryOps right now.
assert(!isa<ConstantInt>(C) && "Unexpected Integer UnaryOp");
@@ -978,10 +1032,17 @@ Constant *llvm::ConstantFoldUnaryInstruction(unsigned Opcode, Constant *C) {
case Instruction::FNeg:
return ConstantFP::get(C->getContext(), neg(CV));
}
- } else if (VectorType *VTy = dyn_cast<VectorType>(C->getType())) {
- // Fold each element and create a vector constant from those constants.
- SmallVector<Constant*, 16> Result;
+ } else if (auto *VTy = dyn_cast<FixedVectorType>(C->getType())) {
+
Type *Ty = IntegerType::get(VTy->getContext(), 32);
+ // Fast path for splatted constants.
+ if (Constant *Splat = C->getSplatValue()) {
+ Constant *Elt = ConstantExpr::get(Opcode, Splat);
+ return ConstantVector::getSplat(VTy->getElementCount(), Elt);
+ }
+
+ // Fold each element and create a vector constant from those constants.
+ SmallVector<Constant *, 16> Result;
for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
Constant *ExtractIdx = ConstantInt::get(Ty, i);
Constant *Elt = ConstantExpr::getExtractElement(C, ExtractIdx);
@@ -1013,10 +1074,13 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
return C1;
}
- // Handle scalar UndefValue. Vectors are always evaluated per element.
- bool HasScalarUndef = !C1->getType()->isVectorTy() &&
- (isa<UndefValue>(C1) || isa<UndefValue>(C2));
- if (HasScalarUndef) {
+ // Handle scalar UndefValue and scalable vector UndefValue. Fixed-length
+ // vectors are always evaluated per element.
+ bool IsScalableVector = isa<ScalableVectorType>(C1->getType());
+ bool HasScalarUndefOrScalableVectorUndef =
+ (!C1->getType()->isVectorTy() || IsScalableVector) &&
+ (isa<UndefValue>(C1) || isa<UndefValue>(C2));
+ if (HasScalarUndefOrScalableVectorUndef) {
switch (static_cast<Instruction::BinaryOps>(Opcode)) {
case Instruction::Xor:
if (isa<UndefValue>(C1) && isa<UndefValue>(C2))
@@ -1097,8 +1161,12 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
return C1;
// undef << X -> 0
return Constant::getNullValue(C1->getType());
- case Instruction::FAdd:
case Instruction::FSub:
+ // -0.0 - undef --> undef (consistent with "fneg undef")
+ if (match(C1, m_NegZeroFP()) && isa<UndefValue>(C2))
+ return C2;
+ LLVM_FALLTHROUGH;
+ case Instruction::FAdd:
case Instruction::FMul:
case Instruction::FDiv:
case Instruction::FRem:
@@ -1119,7 +1187,7 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
}
// Neither constant should be UndefValue, unless these are vector constants.
- assert(!HasScalarUndef && "Unexpected UndefValue");
+ assert((!HasScalarUndefOrScalableVectorUndef) && "Unexpected UndefValue");
// Handle simplifications when the RHS is a constant int.
if (ConstantInt *CI2 = dyn_cast<ConstantInt>(C2)) {
@@ -1173,7 +1241,8 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
MaybeAlign GVAlign;
if (Module *TheModule = GV->getParent()) {
- GVAlign = GV->getPointerAlignment(TheModule->getDataLayout());
+ const DataLayout &DL = TheModule->getDataLayout();
+ GVAlign = GV->getPointerAlignment(DL);
// If the function alignment is not specified then assume that it
// is 4.
@@ -1184,14 +1253,14 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
// increased code size (see https://reviews.llvm.org/D55115)
// FIXME: This code should be deleted once existing targets have
// appropriate defaults
- if (!GVAlign && isa<Function>(GV))
+ if (isa<Function>(GV) && !DL.getFunctionPtrAlign())
GVAlign = Align(4);
} else if (isa<Function>(GV)) {
// Without a datalayout we have to assume the worst case: that the
// function pointer isn't aligned at all.
GVAlign = llvm::None;
- } else {
- GVAlign = MaybeAlign(GV->getAlignment());
+ } else if (isa<GlobalVariable>(GV)) {
+ GVAlign = cast<GlobalVariable>(GV)->getAlign();
}
if (GVAlign && *GVAlign > 1) {
@@ -1329,7 +1398,23 @@ Constant *llvm::ConstantFoldBinaryInstruction(unsigned Opcode, Constant *C1,
return ConstantFP::get(C1->getContext(), C3V);
}
}
- } else if (VectorType *VTy = dyn_cast<VectorType>(C1->getType())) {
+ } else if (IsScalableVector) {
+ // Do not iterate on scalable vector. The number of elements is unknown at
+ // compile-time.
+ // FIXME: this branch can potentially be removed
+ return nullptr;
+ } else if (auto *VTy = dyn_cast<FixedVectorType>(C1->getType())) {
+ // Fast path for splatted constants.
+ if (Constant *C2Splat = C2->getSplatValue()) {
+ if (Instruction::isIntDivRem(Opcode) && C2Splat->isNullValue())
+ return UndefValue::get(VTy);
+ if (Constant *C1Splat = C1->getSplatValue()) {
+ return ConstantVector::getSplat(
+ VTy->getElementCount(),
+ ConstantExpr::get(Opcode, C1Splat, C2Splat));
+ }
+ }
+
// Fold each element and create a vector constant from those constants.
SmallVector<Constant*, 16> Result;
Type *Ty = IntegerType::get(VTy->getContext(), 32);
@@ -1812,7 +1897,7 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
Type *ResultTy;
if (VectorType *VT = dyn_cast<VectorType>(C1->getType()))
ResultTy = VectorType::get(Type::getInt1Ty(C1->getContext()),
- VT->getNumElements());
+ VT->getElementCount());
else
ResultTy = Type::getInt1Ty(C1->getContext());
@@ -1942,13 +2027,26 @@ Constant *llvm::ConstantFoldCompareInstruction(unsigned short pred,
return ConstantInt::get(ResultTy, R==APFloat::cmpGreaterThan ||
R==APFloat::cmpEqual);
}
- } else if (C1->getType()->isVectorTy()) {
+ } else if (auto *C1VTy = dyn_cast<VectorType>(C1->getType())) {
+
+ // Do not iterate on scalable vector. The number of elements is unknown at
+ // compile-time.
+ if (isa<ScalableVectorType>(C1VTy))
+ return nullptr;
+
+ // Fast path for splatted constants.
+ if (Constant *C1Splat = C1->getSplatValue())
+ if (Constant *C2Splat = C2->getSplatValue())
+ return ConstantVector::getSplat(
+ C1VTy->getElementCount(),
+ ConstantExpr::getCompare(pred, C1Splat, C2Splat));
+
// If we can constant fold the comparison of each element, constant fold
// the whole vector comparison.
SmallVector<Constant*, 4> ResElts;
Type *Ty = IntegerType::get(C1->getContext(), 32);
// Compare the elements, producing an i1 result or constant expr.
- for (unsigned i = 0, e = C1->getType()->getVectorNumElements(); i != e;++i){
+ for (unsigned i = 0, e = C1VTy->getElementCount().Min; i != e; ++i) {
Constant *C1E =
ConstantExpr::getExtractElement(C1, ConstantInt::get(Ty, i));
Constant *C2E =
@@ -2202,7 +2300,7 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
if (Idxs.size() == 1 && (Idx0->isNullValue() || isa<UndefValue>(Idx0)))
return GEPTy->isVectorTy() && !C->getType()->isVectorTy()
? ConstantVector::getSplat(
- cast<VectorType>(GEPTy)->getNumElements(), C)
+ cast<VectorType>(GEPTy)->getElementCount(), C)
: C;
if (C->isNullValue()) {
@@ -2221,13 +2319,16 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
Type *OrigGEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
Type *GEPTy = PointerType::get(Ty, PtrTy->getAddressSpace());
if (VectorType *VT = dyn_cast<VectorType>(C->getType()))
- GEPTy = VectorType::get(OrigGEPTy, VT->getNumElements());
+ GEPTy = VectorType::get(OrigGEPTy, VT->getElementCount());
    // The GEP returns a vector of pointers when one or more of
// its arguments is a vector.
for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
if (auto *VT = dyn_cast<VectorType>(Idxs[i]->getType())) {
- GEPTy = VectorType::get(OrigGEPTy, VT->getNumElements());
+ assert((!isa<VectorType>(GEPTy) || isa<ScalableVectorType>(GEPTy) ==
+ isa<ScalableVectorType>(VT)) &&
+ "Mismatched GEPTy vector types");
+ GEPTy = VectorType::get(OrigGEPTy, VT->getElementCount());
break;
}
}
@@ -2357,10 +2458,11 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
SmallVector<Constant *, 8> NewIdxs;
Type *Ty = PointeeTy;
Type *Prev = C->getType();
+ auto GEPIter = gep_type_begin(PointeeTy, Idxs);
bool Unknown =
!isa<ConstantInt>(Idxs[0]) && !isa<ConstantDataVector>(Idxs[0]);
for (unsigned i = 1, e = Idxs.size(); i != e;
- Prev = Ty, Ty = cast<CompositeType>(Ty)->getTypeAtIndex(Idxs[i]), ++i) {
+ Prev = Ty, Ty = (++GEPIter).getIndexedType(), ++i) {
if (!isa<ConstantInt>(Idxs[i]) && !isa<ConstantDataVector>(Idxs[i])) {
// We don't know if it's in range or not.
Unknown = true;
@@ -2379,12 +2481,12 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
// The verify makes sure that GEPs into a struct are in range.
continue;
}
- auto *STy = cast<SequentialType>(Ty);
- if (isa<VectorType>(STy)) {
+ if (isa<VectorType>(Ty)) {
      // There can be awkward padding after a non-power-of-two vector.
Unknown = true;
continue;
}
+ auto *STy = cast<ArrayType>(Ty);
if (ConstantInt *CI = dyn_cast<ConstantInt>(Idxs[i])) {
if (isIndexInRangeOfArrayType(STy->getNumElements(), CI))
// It's in range, skip to the next index.
@@ -2433,18 +2535,19 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
if (!IsCurrIdxVector && IsPrevIdxVector)
CurrIdx = ConstantDataVector::getSplat(
- PrevIdx->getType()->getVectorNumElements(), CurrIdx);
+ cast<FixedVectorType>(PrevIdx->getType())->getNumElements(), CurrIdx);
if (!IsPrevIdxVector && IsCurrIdxVector)
PrevIdx = ConstantDataVector::getSplat(
- CurrIdx->getType()->getVectorNumElements(), PrevIdx);
+ cast<FixedVectorType>(CurrIdx->getType())->getNumElements(), PrevIdx);
Constant *Factor =
ConstantInt::get(CurrIdx->getType()->getScalarType(), NumElements);
if (UseVector)
Factor = ConstantDataVector::getSplat(
- IsPrevIdxVector ? PrevIdx->getType()->getVectorNumElements()
- : CurrIdx->getType()->getVectorNumElements(),
+ IsPrevIdxVector
+ ? cast<FixedVectorType>(PrevIdx->getType())->getNumElements()
+ : cast<FixedVectorType>(CurrIdx->getType())->getNumElements(),
Factor);
NewIdxs[i] = ConstantExpr::getSRem(CurrIdx, Factor);
@@ -2460,10 +2563,11 @@ Constant *llvm::ConstantFoldGetElementPtr(Type *PointeeTy, Constant *C,
// overflow trouble.
Type *ExtendedTy = Type::getIntNTy(Div->getContext(), CommonExtendedWidth);
if (UseVector)
- ExtendedTy = VectorType::get(
- ExtendedTy, IsPrevIdxVector
- ? PrevIdx->getType()->getVectorNumElements()
- : CurrIdx->getType()->getVectorNumElements());
+ ExtendedTy = FixedVectorType::get(
+ ExtendedTy,
+ IsPrevIdxVector
+ ? cast<FixedVectorType>(PrevIdx->getType())->getNumElements()
+ : cast<FixedVectorType>(CurrIdx->getType())->getNumElements());
if (!PrevIdx->getType()->isIntOrIntVectorTy(CommonExtendedWidth))
PrevIdx = ConstantExpr::getSExt(PrevIdx, ExtendedTy);
diff --git a/llvm/lib/IR/ConstantFold.h b/llvm/lib/IR/ConstantFold.h
index 9ad6e14e9e40..0cdd5cf3cbce 100644
--- a/llvm/lib/IR/ConstantFold.h
+++ b/llvm/lib/IR/ConstantFold.h
@@ -38,7 +38,7 @@ template <typename T> class ArrayRef;
Constant *ConstantFoldInsertElementInstruction(Constant *Val, Constant *Elt,
Constant *Idx);
Constant *ConstantFoldShuffleVectorInstruction(Constant *V1, Constant *V2,
- Constant *Mask);
+ ArrayRef<int> Mask);
Constant *ConstantFoldExtractValueInstruction(Constant *Agg,
ArrayRef<unsigned> Idxs);
Constant *ConstantFoldInsertValueInstruction(Constant *Agg, Constant *Val,
diff --git a/llvm/lib/IR/ConstantRange.cpp b/llvm/lib/IR/ConstantRange.cpp
index 3d25cb5bfbdf..eabaaa203927 100644
--- a/llvm/lib/IR/ConstantRange.cpp
+++ b/llvm/lib/IR/ConstantRange.cpp
@@ -802,6 +802,8 @@ ConstantRange ConstantRange::binaryOp(Instruction::BinaryOps BinOp,
return binaryAnd(Other);
case Instruction::Or:
return binaryOr(Other);
+ case Instruction::Xor:
+ return binaryXor(Other);
// Note: floating point operations applied to abstract ranges are just
// ideal integer operations with a lossy representation
case Instruction::FAdd:
@@ -1194,6 +1196,10 @@ ConstantRange::binaryAnd(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return getEmpty();
+ // Use APInt's implementation of AND for single element ranges.
+ if (isSingleElement() && Other.isSingleElement())
+ return {*getSingleElement() & *Other.getSingleElement()};
+
// TODO: replace this with something less conservative
APInt umin = APIntOps::umin(Other.getUnsignedMax(), getUnsignedMax());
@@ -1205,12 +1211,28 @@ ConstantRange::binaryOr(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
return getEmpty();
+ // Use APInt's implementation of OR for single element ranges.
+ if (isSingleElement() && Other.isSingleElement())
+ return {*getSingleElement() | *Other.getSingleElement()};
+
// TODO: replace this with something less conservative
APInt umax = APIntOps::umax(getUnsignedMin(), Other.getUnsignedMin());
return getNonEmpty(std::move(umax), APInt::getNullValue(getBitWidth()));
}
+ConstantRange ConstantRange::binaryXor(const ConstantRange &Other) const {
+ if (isEmptySet() || Other.isEmptySet())
+ return getEmpty();
+
+ // Use APInt's implementation of XOR for single element ranges.
+ if (isSingleElement() && Other.isSingleElement())
+ return {*getSingleElement() ^ *Other.getSingleElement()};
+
+ // TODO: replace this with something less conservative
+ return getFull();
+}
+
ConstantRange
ConstantRange::shl(const ConstantRange &Other) const {
if (isEmptySet() || Other.isEmptySet())
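
For the binaryXor hunk above: single-element ranges fold exactly through APInt, and anything else currently falls back to the full set. A small sketch with made-up values, assuming the matching declaration was added to llvm/IR/ConstantRange.h (not shown here):

    #include "llvm/ADT/APInt.h"
    #include "llvm/IR/ConstantRange.h"

    static void binaryXorExamples() {
      using llvm::APInt;
      using llvm::ConstantRange;
      ConstantRange A(APInt(8, 0x0f)), B(APInt(8, 0xf0));
      ConstantRange Exact = A.binaryXor(B);           // single elements: exactly {0xff}
      ConstantRange Wide(APInt(8, 1), APInt(8, 10));  // the range [1, 10)
      ConstantRange Conservative = Wide.binaryXor(B); // falls back to the full set
      (void)Exact;
      (void)Conservative;
    }
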
diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp
index 054375aab6c3..cbbcca20ea51 100644
--- a/llvm/lib/IR/Constants.cpp
+++ b/llvm/lib/IR/Constants.cpp
@@ -160,8 +160,8 @@ bool Constant::isNotOneValue() const {
return !CFP->getValueAPF().bitcastToAPInt().isOneValue();
// Check that vectors don't contain 1
- if (this->getType()->isVectorTy()) {
- unsigned NumElts = this->getType()->getVectorNumElements();
+ if (auto *VTy = dyn_cast<VectorType>(this->getType())) {
+ unsigned NumElts = VTy->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = this->getAggregateElement(i);
if (!Elt || !Elt->isNotOneValue())
@@ -210,8 +210,8 @@ bool Constant::isNotMinSignedValue() const {
return !CFP->getValueAPF().bitcastToAPInt().isMinSignedValue();
// Check that vectors don't contain INT_MIN
- if (this->getType()->isVectorTy()) {
- unsigned NumElts = this->getType()->getVectorNumElements();
+ if (auto *VTy = dyn_cast<VectorType>(this->getType())) {
+ unsigned NumElts = VTy->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = this->getAggregateElement(i);
if (!Elt || !Elt->isNotMinSignedValue())
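
The hunks in this file all apply the same mechanical pattern; a small standalone sketch of it follows (helper name hypothetical): query the element count through dyn_cast<VectorType> instead of the Type::getVectorNumElements() shortcut this series is phasing out.

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static unsigned vectorNumEltsOrZero(Type *Ty) {
  if (auto *VTy = dyn_cast<VectorType>(Ty))
    return VTy->getNumElements(); // element count of the vector type
  return 0;                       // not a vector
}
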
@@ -227,9 +227,10 @@ bool Constant::isNotMinSignedValue() const {
bool Constant::isFiniteNonZeroFP() const {
if (auto *CFP = dyn_cast<ConstantFP>(this))
return CFP->getValueAPF().isFiniteNonZero();
- if (!getType()->isVectorTy())
+ auto *VTy = dyn_cast<VectorType>(getType());
+ if (!VTy)
return false;
- for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
if (!CFP || !CFP->getValueAPF().isFiniteNonZero())
return false;
@@ -240,9 +241,10 @@ bool Constant::isFiniteNonZeroFP() const {
bool Constant::isNormalFP() const {
if (auto *CFP = dyn_cast<ConstantFP>(this))
return CFP->getValueAPF().isNormal();
- if (!getType()->isVectorTy())
+ auto *VTy = dyn_cast<FixedVectorType>(getType());
+ if (!VTy)
return false;
- for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
if (!CFP || !CFP->getValueAPF().isNormal())
return false;
@@ -253,9 +255,10 @@ bool Constant::isNormalFP() const {
bool Constant::hasExactInverseFP() const {
if (auto *CFP = dyn_cast<ConstantFP>(this))
return CFP->getValueAPF().getExactInverse(nullptr);
- if (!getType()->isVectorTy())
+ auto *VTy = dyn_cast<FixedVectorType>(getType());
+ if (!VTy)
return false;
- for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
if (!CFP || !CFP->getValueAPF().getExactInverse(nullptr))
return false;
@@ -266,9 +269,10 @@ bool Constant::hasExactInverseFP() const {
bool Constant::isNaN() const {
if (auto *CFP = dyn_cast<ConstantFP>(this))
return CFP->isNaN();
- if (!getType()->isVectorTy())
+ auto *VTy = dyn_cast<FixedVectorType>(getType());
+ if (!VTy)
return false;
- for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i) {
auto *CFP = dyn_cast_or_null<ConstantFP>(this->getAggregateElement(i));
if (!CFP || !CFP->isNaN())
return false;
@@ -282,34 +286,40 @@ bool Constant::isElementWiseEqual(Value *Y) const {
return true;
// The input value must be a vector constant with the same type.
- Type *Ty = getType();
- if (!isa<Constant>(Y) || !Ty->isVectorTy() || Ty != Y->getType())
+ auto *VTy = dyn_cast<VectorType>(getType());
+ if (!isa<Constant>(Y) || !VTy || VTy != Y->getType())
+ return false;
+
+ // TODO: Compare pointer constants?
+ if (!(VTy->getElementType()->isIntegerTy() ||
+ VTy->getElementType()->isFloatingPointTy()))
return false;
// They may still be identical element-wise (if they have `undef`s).
- // FIXME: This crashes on FP vector constants.
- return match(ConstantExpr::getICmp(ICmpInst::Predicate::ICMP_EQ,
- const_cast<Constant *>(this),
- cast<Constant>(Y)),
- m_One());
+ // Bitcast to integer to allow exact bitwise comparison for all types.
+ Type *IntTy = VectorType::getInteger(VTy);
+ Constant *C0 = ConstantExpr::getBitCast(const_cast<Constant *>(this), IntTy);
+ Constant *C1 = ConstantExpr::getBitCast(cast<Constant>(Y), IntTy);
+ Constant *CmpEq = ConstantExpr::getICmp(ICmpInst::ICMP_EQ, C0, C1);
+ return isa<UndefValue>(CmpEq) || match(CmpEq, m_One());
}
bool Constant::containsUndefElement() const {
- if (!getType()->isVectorTy())
- return false;
- for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i)
- if (isa<UndefValue>(getAggregateElement(i)))
- return true;
+ if (auto *VTy = dyn_cast<VectorType>(getType())) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
+ if (isa<UndefValue>(getAggregateElement(i)))
+ return true;
+ }
return false;
}
bool Constant::containsConstantExpression() const {
- if (!getType()->isVectorTy())
- return false;
- for (unsigned i = 0, e = getType()->getVectorNumElements(); i != e; ++i)
- if (isa<ConstantExpr>(getAggregateElement(i)))
- return true;
+ if (auto *VTy = dyn_cast<VectorType>(getType())) {
+ for (unsigned i = 0, e = VTy->getNumElements(); i != e; ++i)
+ if (isa<ConstantExpr>(getAggregateElement(i)))
+ return true;
+ }
return false;
}
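
A minimal sketch of the comparison strategy the rewritten isElementWiseEqual() uses, with a hypothetical helper name: bitcast both vector constants to an integer vector of the same width and compare with icmp eq, so floating-point payloads are compared bit for bit instead of hitting the old FP-icmp crash noted in the removed FIXME.

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

static Constant *bitwiseElementEq(Constant *A, Constant *B) {
  auto *VTy = cast<VectorType>(A->getType()); // assumes A and B share a vector type
  Type *IntTy = VectorType::getInteger(VTy);
  Constant *IA = ConstantExpr::getBitCast(A, IntTy);
  Constant *IB = ConstantExpr::getBitCast(B, IntTy);
  return ConstantExpr::getICmp(CmpInst::ICMP_EQ, IA, IB); // <N x i1> result
}
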
@@ -322,6 +332,9 @@ Constant *Constant::getNullValue(Type *Ty) {
case Type::HalfTyID:
return ConstantFP::get(Ty->getContext(),
APFloat::getZero(APFloat::IEEEhalf()));
+ case Type::BFloatTyID:
+ return ConstantFP::get(Ty->getContext(),
+ APFloat::getZero(APFloat::BFloat()));
case Type::FloatTyID:
return ConstantFP::get(Ty->getContext(),
APFloat::getZero(APFloat::IEEEsingle()));
@@ -342,7 +355,8 @@ Constant *Constant::getNullValue(Type *Ty) {
return ConstantPointerNull::get(cast<PointerType>(Ty));
case Type::StructTyID:
case Type::ArrayTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
return ConstantAggregateZero::get(Ty);
case Type::TokenTyID:
return ConstantTokenNone::get(Ty->getContext());
@@ -364,7 +378,7 @@ Constant *Constant::getIntegerValue(Type *Ty, const APInt &V) {
// Broadcast a scalar to a vector, if necessary.
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- C = ConstantVector::getSplat(VTy->getNumElements(), C);
+ C = ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -375,13 +389,13 @@ Constant *Constant::getAllOnesValue(Type *Ty) {
APInt::getAllOnesValue(ITy->getBitWidth()));
if (Ty->isFloatingPointTy()) {
- APFloat FL = APFloat::getAllOnesValue(Ty->getPrimitiveSizeInBits(),
- !Ty->isPPC_FP128Ty());
+ APFloat FL = APFloat::getAllOnesValue(Ty->getFltSemantics(),
+ Ty->getPrimitiveSizeInBits());
return ConstantFP::get(Ty->getContext(), FL);
}
VectorType *VTy = cast<VectorType>(Ty);
- return ConstantVector::getSplat(VTy->getNumElements(),
+ return ConstantVector::getSplat(VTy->getElementCount(),
getAllOnesValue(VTy->getElementType()));
}
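
A minimal usage sketch of the new call shape (function name hypothetical): splats are requested with an ElementCount, which at this revision is a {Min, Scalable} pair, so the same API serves fixed-width and scalable vectors.

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static Constant *allOnesI32x4(LLVMContext &Ctx) {
  Constant *AllOnes = Constant::getAllOnesValue(Type::getInt32Ty(Ctx));
  // <4 x i32> splat of -1; {4, true} would request <vscale x 4 x i32> instead.
  return ConstantVector::getSplat({4, /*Scalable=*/false}, AllOnes);
}
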
@@ -449,7 +463,74 @@ void Constant::destroyConstant() {
}
// Value has no outstanding references it is safe to delete it now...
- delete this;
+ deleteConstant(this);
+}
+
+void llvm::deleteConstant(Constant *C) {
+ switch (C->getValueID()) {
+ case Constant::ConstantIntVal:
+ delete static_cast<ConstantInt *>(C);
+ break;
+ case Constant::ConstantFPVal:
+ delete static_cast<ConstantFP *>(C);
+ break;
+ case Constant::ConstantAggregateZeroVal:
+ delete static_cast<ConstantAggregateZero *>(C);
+ break;
+ case Constant::ConstantArrayVal:
+ delete static_cast<ConstantArray *>(C);
+ break;
+ case Constant::ConstantStructVal:
+ delete static_cast<ConstantStruct *>(C);
+ break;
+ case Constant::ConstantVectorVal:
+ delete static_cast<ConstantVector *>(C);
+ break;
+ case Constant::ConstantPointerNullVal:
+ delete static_cast<ConstantPointerNull *>(C);
+ break;
+ case Constant::ConstantDataArrayVal:
+ delete static_cast<ConstantDataArray *>(C);
+ break;
+ case Constant::ConstantDataVectorVal:
+ delete static_cast<ConstantDataVector *>(C);
+ break;
+ case Constant::ConstantTokenNoneVal:
+ delete static_cast<ConstantTokenNone *>(C);
+ break;
+ case Constant::BlockAddressVal:
+ delete static_cast<BlockAddress *>(C);
+ break;
+ case Constant::UndefValueVal:
+ delete static_cast<UndefValue *>(C);
+ break;
+ case Constant::ConstantExprVal:
+ if (isa<UnaryConstantExpr>(C))
+ delete static_cast<UnaryConstantExpr *>(C);
+ else if (isa<BinaryConstantExpr>(C))
+ delete static_cast<BinaryConstantExpr *>(C);
+ else if (isa<SelectConstantExpr>(C))
+ delete static_cast<SelectConstantExpr *>(C);
+ else if (isa<ExtractElementConstantExpr>(C))
+ delete static_cast<ExtractElementConstantExpr *>(C);
+ else if (isa<InsertElementConstantExpr>(C))
+ delete static_cast<InsertElementConstantExpr *>(C);
+ else if (isa<ShuffleVectorConstantExpr>(C))
+ delete static_cast<ShuffleVectorConstantExpr *>(C);
+ else if (isa<ExtractValueConstantExpr>(C))
+ delete static_cast<ExtractValueConstantExpr *>(C);
+ else if (isa<InsertValueConstantExpr>(C))
+ delete static_cast<InsertValueConstantExpr *>(C);
+ else if (isa<GetElementPtrConstantExpr>(C))
+ delete static_cast<GetElementPtrConstantExpr *>(C);
+ else if (isa<CompareConstantExpr>(C))
+ delete static_cast<CompareConstantExpr *>(C);
+ else
+ llvm_unreachable("Unexpected constant expr");
+ break;
+ default:
+ llvm_unreachable("Unexpected constant");
+ }
}
static bool canTrapImpl(const Constant *C,
@@ -633,10 +714,11 @@ Constant *Constant::replaceUndefsWith(Constant *C, Constant *Replacement) {
}
// Don't know how to deal with this constant.
- if (!Ty->isVectorTy())
+ auto *VTy = dyn_cast<FixedVectorType>(Ty);
+ if (!VTy)
return C;
- unsigned NumElts = Ty->getVectorNumElements();
+ unsigned NumElts = VTy->getNumElements();
SmallVector<Constant *, 32> NewC(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
Constant *EltC = C->getAggregateElement(i);
@@ -675,7 +757,7 @@ Constant *ConstantInt::getTrue(Type *Ty) {
assert(Ty->isIntOrIntVectorTy(1) && "Type not i1 or vector of i1.");
ConstantInt *TrueC = ConstantInt::getTrue(Ty->getContext());
if (auto *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), TrueC);
+ return ConstantVector::getSplat(VTy->getElementCount(), TrueC);
return TrueC;
}
@@ -683,7 +765,7 @@ Constant *ConstantInt::getFalse(Type *Ty) {
assert(Ty->isIntOrIntVectorTy(1) && "Type not i1 or vector of i1.");
ConstantInt *FalseC = ConstantInt::getFalse(Ty->getContext());
if (auto *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), FalseC);
+ return ConstantVector::getSplat(VTy->getElementCount(), FalseC);
return FalseC;
}
@@ -706,7 +788,7 @@ Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) {
// For vectors, broadcast the value.
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -730,7 +812,7 @@ Constant *ConstantInt::get(Type *Ty, const APInt& V) {
// For vectors, broadcast the value.
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -751,6 +833,8 @@ void ConstantInt::destroyConstantImpl() {
static const fltSemantics *TypeToFloatSemantics(Type *Ty) {
if (Ty->isHalfTy())
return &APFloat::IEEEhalf();
+ if (Ty->isBFloatTy())
+ return &APFloat::BFloat();
if (Ty->isFloatTy())
return &APFloat::IEEEsingle();
if (Ty->isDoubleTy())
@@ -775,7 +859,7 @@ Constant *ConstantFP::get(Type *Ty, double V) {
// For vectors, broadcast the value.
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -787,7 +871,7 @@ Constant *ConstantFP::get(Type *Ty, const APFloat &V) {
// For vectors, broadcast the value.
if (auto *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -800,7 +884,7 @@ Constant *ConstantFP::get(Type *Ty, StringRef Str) {
// For vectors, broadcast the value.
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -811,7 +895,7 @@ Constant *ConstantFP::getNaN(Type *Ty, bool Negative, uint64_t Payload) {
Constant *C = get(Ty->getContext(), NaN);
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -820,10 +904,10 @@ Constant *ConstantFP::getQNaN(Type *Ty, bool Negative, APInt *Payload) {
const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType());
APFloat NaN = APFloat::getQNaN(Semantics, Negative, Payload);
Constant *C = get(Ty->getContext(), NaN);
-
+
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
-
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
+
return C;
}
@@ -831,10 +915,10 @@ Constant *ConstantFP::getSNaN(Type *Ty, bool Negative, APInt *Payload) {
const fltSemantics &Semantics = *TypeToFloatSemantics(Ty->getScalarType());
APFloat NaN = APFloat::getSNaN(Semantics, Negative, Payload);
Constant *C = get(Ty->getContext(), NaN);
-
+
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
-
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
+
return C;
}
@@ -844,7 +928,7 @@ Constant *ConstantFP::getNegativeZero(Type *Ty) {
Constant *C = get(Ty->getContext(), NegZero);
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -868,6 +952,8 @@ ConstantFP* ConstantFP::get(LLVMContext &Context, const APFloat& V) {
Type *Ty;
if (&V.getSemantics() == &APFloat::IEEEhalf())
Ty = Type::getHalfTy(Context);
+ else if (&V.getSemantics() == &APFloat::BFloat())
+ Ty = Type::getBFloatTy(Context);
else if (&V.getSemantics() == &APFloat::IEEEsingle())
Ty = Type::getFloatTy(Context);
else if (&V.getSemantics() == &APFloat::IEEEdouble())
@@ -892,7 +978,7 @@ Constant *ConstantFP::getInfinity(Type *Ty, bool Negative) {
Constant *C = get(Ty->getContext(), APFloat::getInf(Semantics, Negative));
if (VectorType *VTy = dyn_cast<VectorType>(Ty))
- return ConstantVector::getSplat(VTy->getNumElements(), C);
+ return ConstantVector::getSplat(VTy->getElementCount(), C);
return C;
}
@@ -917,7 +1003,9 @@ void ConstantFP::destroyConstantImpl() {
//===----------------------------------------------------------------------===//
Constant *ConstantAggregateZero::getSequentialElement() const {
- return Constant::getNullValue(getType()->getSequentialElementType());
+ if (auto *AT = dyn_cast<ArrayType>(getType()))
+ return Constant::getNullValue(AT->getElementType());
+ return Constant::getNullValue(cast<VectorType>(getType())->getElementType());
}
Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const {
@@ -925,13 +1013,13 @@ Constant *ConstantAggregateZero::getStructElement(unsigned Elt) const {
}
Constant *ConstantAggregateZero::getElementValue(Constant *C) const {
- if (isa<SequentialType>(getType()))
+ if (isa<ArrayType>(getType()) || isa<VectorType>(getType()))
return getSequentialElement();
return getStructElement(cast<ConstantInt>(C)->getZExtValue());
}
Constant *ConstantAggregateZero::getElementValue(unsigned Idx) const {
- if (isa<SequentialType>(getType()))
+ if (isa<ArrayType>(getType()) || isa<VectorType>(getType()))
return getSequentialElement();
return getStructElement(Idx);
}
@@ -950,7 +1038,9 @@ unsigned ConstantAggregateZero::getNumElements() const {
//===----------------------------------------------------------------------===//
UndefValue *UndefValue::getSequentialElement() const {
- return UndefValue::get(getType()->getSequentialElementType());
+ if (ArrayType *ATy = dyn_cast<ArrayType>(getType()))
+ return UndefValue::get(ATy->getElementType());
+ return UndefValue::get(cast<VectorType>(getType())->getElementType());
}
UndefValue *UndefValue::getStructElement(unsigned Elt) const {
@@ -958,21 +1048,23 @@ UndefValue *UndefValue::getStructElement(unsigned Elt) const {
}
UndefValue *UndefValue::getElementValue(Constant *C) const {
- if (isa<SequentialType>(getType()))
+ if (isa<ArrayType>(getType()) || isa<VectorType>(getType()))
return getSequentialElement();
return getStructElement(cast<ConstantInt>(C)->getZExtValue());
}
UndefValue *UndefValue::getElementValue(unsigned Idx) const {
- if (isa<SequentialType>(getType()))
+ if (isa<ArrayType>(getType()) || isa<VectorType>(getType()))
return getSequentialElement();
return getStructElement(Idx);
}
unsigned UndefValue::getNumElements() const {
Type *Ty = getType();
- if (auto *ST = dyn_cast<SequentialType>(Ty))
- return ST->getNumElements();
+ if (auto *AT = dyn_cast<ArrayType>(Ty))
+ return AT->getNumElements();
+ if (auto *VT = dyn_cast<VectorType>(Ty))
+ return VT->getNumElements();
return Ty->getStructNumElements();
}
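
The ArrayType/VectorType pairs above replace what used to be a single SequentialType query; a minimal sketch of the replacement pattern (helper name hypothetical):

#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static Type *sequentialElementType(Type *Ty) {
  if (auto *ATy = dyn_cast<ArrayType>(Ty))
    return ATy->getElementType();
  if (auto *VTy = dyn_cast<VectorType>(Ty))
    return VTy->getElementType();
  return nullptr; // structs and scalars have no single element type
}
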
@@ -1011,7 +1103,7 @@ static Constant *getFPSequenceIfElementsMatch(ArrayRef<Constant *> V) {
Elts.push_back(CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
else
return nullptr;
- return SequentialTy::getFP(V[0]->getContext(), Elts);
+ return SequentialTy::getFP(V[0]->getType(), Elts);
}
template <typename SequenceTy>
@@ -1030,7 +1122,7 @@ static Constant *getSequenceIfElementsMatch(Constant *C,
else if (CI->getType()->isIntegerTy(64))
return getIntSequenceIfElementsMatch<SequenceTy, uint64_t>(V);
} else if (ConstantFP *CFP = dyn_cast<ConstantFP>(C)) {
- if (CFP->getType()->isHalfTy())
+ if (CFP->getType()->isHalfTy() || CFP->getType()->isBFloatTy())
return getFPSequenceIfElementsMatch<SequenceTy, uint16_t>(V);
else if (CFP->getType()->isFloatTy())
return getFPSequenceIfElementsMatch<SequenceTy, uint32_t>(V);
@@ -1041,19 +1133,20 @@ static Constant *getSequenceIfElementsMatch(Constant *C,
return nullptr;
}
-ConstantAggregate::ConstantAggregate(CompositeType *T, ValueTy VT,
+ConstantAggregate::ConstantAggregate(Type *T, ValueTy VT,
ArrayRef<Constant *> V)
: Constant(T, VT, OperandTraits<ConstantAggregate>::op_end(this) - V.size(),
V.size()) {
llvm::copy(V, op_begin());
// Check that types match, unless this is an opaque struct.
- if (auto *ST = dyn_cast<StructType>(T))
+ if (auto *ST = dyn_cast<StructType>(T)) {
if (ST->isOpaque())
return;
- for (unsigned I = 0, E = V.size(); I != E; ++I)
- assert(V[I]->getType() == T->getTypeAtIndex(I) &&
- "Initializer for composite element doesn't match!");
+ for (unsigned I = 0, E = V.size(); I != E; ++I)
+ assert(V[I]->getType() == ST->getTypeAtIndex(I) &&
+ "Initializer for struct element doesn't match!");
+ }
}
ConstantArray::ConstantArray(ArrayType *T, ArrayRef<Constant *> V)
@@ -1161,13 +1254,13 @@ ConstantVector::ConstantVector(VectorType *T, ArrayRef<Constant *> V)
Constant *ConstantVector::get(ArrayRef<Constant*> V) {
if (Constant *C = getImpl(V))
return C;
- VectorType *Ty = VectorType::get(V.front()->getType(), V.size());
+ auto *Ty = FixedVectorType::get(V.front()->getType(), V.size());
return Ty->getContext().pImpl->VectorConstants.getOrCreate(Ty, V);
}
Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
assert(!V.empty() && "Vectors can't be empty");
- VectorType *T = VectorType::get(V.front()->getType(), V.size());
+ auto *T = FixedVectorType::get(V.front()->getType(), V.size());
// If this is an all-undef or all-zero vector, return a
// ConstantAggregateZero or UndefValue.
@@ -1198,15 +1291,34 @@ Constant *ConstantVector::getImpl(ArrayRef<Constant*> V) {
return nullptr;
}
-Constant *ConstantVector::getSplat(unsigned NumElts, Constant *V) {
- // If this splat is compatible with ConstantDataVector, use it instead of
- // ConstantVector.
- if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
- ConstantDataSequential::isElementTypeCompatible(V->getType()))
- return ConstantDataVector::getSplat(NumElts, V);
+Constant *ConstantVector::getSplat(ElementCount EC, Constant *V) {
+ if (!EC.Scalable) {
+ // If this splat is compatible with ConstantDataVector, use it instead of
+ // ConstantVector.
+ if ((isa<ConstantFP>(V) || isa<ConstantInt>(V)) &&
+ ConstantDataSequential::isElementTypeCompatible(V->getType()))
+ return ConstantDataVector::getSplat(EC.Min, V);
+
+ SmallVector<Constant *, 32> Elts(EC.Min, V);
+ return get(Elts);
+ }
+
+ Type *VTy = VectorType::get(V->getType(), EC);
+
+ if (V->isNullValue())
+ return ConstantAggregateZero::get(VTy);
+ else if (isa<UndefValue>(V))
+ return UndefValue::get(VTy);
- SmallVector<Constant*, 32> Elts(NumElts, V);
- return get(Elts);
+ Type *I32Ty = Type::getInt32Ty(VTy->getContext());
+
+ // Move scalar into vector.
+ Constant *UndefV = UndefValue::get(VTy);
+ V = ConstantExpr::getInsertElement(UndefV, V, ConstantInt::get(I32Ty, 0));
+ // Build shuffle mask to perform the splat.
+ SmallVector<int, 8> Zeros(EC.Min, 0);
+ // Splat.
+ return ConstantExpr::getShuffleVector(V, UndefV, Zeros);
}
ConstantTokenNone *ConstantTokenNone::get(LLVMContext &Context) {
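
A sketch of what the scalable branch of getSplat() builds, written as a standalone helper with a hypothetical <vscale x 4 x T> result type: insert the scalar into lane 0 of undef, then broadcast it with an all-zero shuffle mask.

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
using namespace llvm;

static Constant *scalableSplatOf(Constant *V) {
  Type *VTy = VectorType::get(V->getType(), {4, /*Scalable=*/true});
  Constant *Undef = UndefValue::get(VTy);
  Constant *Lane0 = ConstantInt::get(Type::getInt32Ty(V->getContext()), 0);
  Constant *Ins = ConstantExpr::getInsertElement(Undef, V, Lane0);
  SmallVector<int, 4> ZeroMask(4, 0); // broadcast lane 0 into every element
  return ConstantExpr::getShuffleVector(Ins, Undef, ZeroMask);
}
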
@@ -1271,6 +1383,14 @@ unsigned ConstantExpr::getPredicate() const {
return cast<CompareConstantExpr>(this)->predicate;
}
+ArrayRef<int> ConstantExpr::getShuffleMask() const {
+ return cast<ShuffleVectorConstantExpr>(this)->ShuffleMask;
+}
+
+Constant *ConstantExpr::getShuffleMaskForBitcode() const {
+ return cast<ShuffleVectorConstantExpr>(this)->ShuffleMaskForBitcode;
+}
+
Constant *
ConstantExpr::getWithOperandReplaced(unsigned OpNo, Constant *Op) const {
assert(Op->getType() == getOperand(OpNo)->getType() &&
@@ -1322,7 +1442,7 @@ Constant *ConstantExpr::getWithOperands(ArrayRef<Constant *> Ops, Type *Ty,
case Instruction::ExtractValue:
return ConstantExpr::getExtractValue(Ops[0], getIndices(), OnlyIfReducedTy);
case Instruction::ShuffleVector:
- return ConstantExpr::getShuffleVector(Ops[0], Ops[1], Ops[2],
+ return ConstantExpr::getShuffleVector(Ops[0], Ops[1], getShuffleMask(),
OnlyIfReducedTy);
case Instruction::GetElementPtr: {
auto *GEPO = cast<GEPOperator>(this);
@@ -1375,6 +1495,12 @@ bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
Val2.convert(APFloat::IEEEhalf(), APFloat::rmNearestTiesToEven, &losesInfo);
return !losesInfo;
}
+ case Type::BFloatTyID: {
+ if (&Val2.getSemantics() == &APFloat::BFloat())
+ return true;
+ Val2.convert(APFloat::BFloat(), APFloat::rmNearestTiesToEven, &losesInfo);
+ return !losesInfo;
+ }
case Type::FloatTyID: {
if (&Val2.getSemantics() == &APFloat::IEEEsingle())
return true;
@@ -1383,6 +1509,7 @@ bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
}
case Type::DoubleTyID: {
if (&Val2.getSemantics() == &APFloat::IEEEhalf() ||
+ &Val2.getSemantics() == &APFloat::BFloat() ||
&Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble())
return true;
@@ -1391,16 +1518,19 @@ bool ConstantFP::isValueValidForType(Type *Ty, const APFloat& Val) {
}
case Type::X86_FP80TyID:
return &Val2.getSemantics() == &APFloat::IEEEhalf() ||
+ &Val2.getSemantics() == &APFloat::BFloat() ||
&Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble() ||
&Val2.getSemantics() == &APFloat::x87DoubleExtended();
case Type::FP128TyID:
return &Val2.getSemantics() == &APFloat::IEEEhalf() ||
+ &Val2.getSemantics() == &APFloat::BFloat() ||
&Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble() ||
&Val2.getSemantics() == &APFloat::IEEEquad();
case Type::PPC_FP128TyID:
return &Val2.getSemantics() == &APFloat::IEEEhalf() ||
+ &Val2.getSemantics() == &APFloat::BFloat() ||
&Val2.getSemantics() == &APFloat::IEEEsingle() ||
&Val2.getSemantics() == &APFloat::IEEEdouble() ||
&Val2.getSemantics() == &APFloat::PPCDoubleDouble();
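
A minimal sketch of the new bfloat plumbing (helper name hypothetical): an APFloat carrying BFloat semantics maps back to the IR bfloat type through ConstantFP::get().

#include "llvm/ADT/APFloat.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
using namespace llvm;

static ConstantFP *bfloatOne(LLVMContext &Ctx) {
  APFloat One(APFloat::BFloat(), "1.0");
  return ConstantFP::get(Ctx, One); // result type is bfloat
}
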
@@ -1450,11 +1580,32 @@ void ConstantVector::destroyConstantImpl() {
Constant *Constant::getSplatValue(bool AllowUndefs) const {
assert(this->getType()->isVectorTy() && "Only valid for vectors!");
if (isa<ConstantAggregateZero>(this))
- return getNullValue(this->getType()->getVectorElementType());
+ return getNullValue(cast<VectorType>(getType())->getElementType());
if (const ConstantDataVector *CV = dyn_cast<ConstantDataVector>(this))
return CV->getSplatValue();
if (const ConstantVector *CV = dyn_cast<ConstantVector>(this))
return CV->getSplatValue(AllowUndefs);
+
+ // Check if this is a constant expression splat of the form returned by
+ // ConstantVector::getSplat()
+ const auto *Shuf = dyn_cast<ConstantExpr>(this);
+ if (Shuf && Shuf->getOpcode() == Instruction::ShuffleVector &&
+ isa<UndefValue>(Shuf->getOperand(1))) {
+
+ const auto *IElt = dyn_cast<ConstantExpr>(Shuf->getOperand(0));
+ if (IElt && IElt->getOpcode() == Instruction::InsertElement &&
+ isa<UndefValue>(IElt->getOperand(0))) {
+
+ ArrayRef<int> Mask = Shuf->getShuffleMask();
+ Constant *SplatVal = IElt->getOperand(1);
+ ConstantInt *Index = dyn_cast<ConstantInt>(IElt->getOperand(2));
+
+ if (Index && Index->getValue() == 0 &&
+ std::all_of(Mask.begin(), Mask.end(), [](int I) { return I == 0; }))
+ return SplatVal;
+ }
+ }
+
return nullptr;
}
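
A small round-trip sketch (helper name hypothetical): a scalable splat produced by ConstantVector::getSplat() is now recognized again by getSplatValue(), assuming the scalar is neither a null value nor undef (those fold to zeroinitializer/undef before the shuffle pattern is built).

#include "llvm/IR/Constants.h"
using namespace llvm;

static bool splatRoundTrips(Constant *Scalar) {
  Constant *Splat = ConstantVector::getSplat({2, /*Scalable=*/true}, Scalar);
  return Splat->getSplatValue() == Scalar; // matches the insert/shuffle pattern
}
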
@@ -1735,8 +1886,8 @@ Constant *ConstantExpr::getFPCast(Constant *C, Type *Ty) {
Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isIntOrIntVectorTy() && "Trunc operand must be integer");
@@ -1749,8 +1900,8 @@ Constant *ConstantExpr::getTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getSExt(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isIntOrIntVectorTy() && "SExt operand must be integral");
@@ -1763,8 +1914,8 @@ Constant *ConstantExpr::getSExt(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getZExt(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isIntOrIntVectorTy() && "ZEXt operand must be integral");
@@ -1777,8 +1928,8 @@ Constant *ConstantExpr::getZExt(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
@@ -1789,8 +1940,8 @@ Constant *ConstantExpr::getFPTrunc(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isFPOrFPVectorTy() && Ty->isFPOrFPVectorTy() &&
@@ -1801,8 +1952,8 @@ Constant *ConstantExpr::getFPExtend(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() &&
@@ -1812,8 +1963,8 @@ Constant *ConstantExpr::getUIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isIntOrIntVectorTy() && Ty->isFPOrFPVectorTy() &&
@@ -1823,8 +1974,8 @@ Constant *ConstantExpr::getSIToFP(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() &&
@@ -1834,8 +1985,8 @@ Constant *ConstantExpr::getFPToUI(Constant *C, Type *Ty, bool OnlyIfReduced) {
Constant *ConstantExpr::getFPToSI(Constant *C, Type *Ty, bool OnlyIfReduced) {
#ifndef NDEBUG
- bool fromVec = C->getType()->getTypeID() == Type::VectorTyID;
- bool toVec = Ty->getTypeID() == Type::VectorTyID;
+ bool fromVec = isa<VectorType>(C->getType());
+ bool toVec = isa<VectorType>(Ty);
#endif
assert((fromVec == toVec) && "Cannot convert from scalar to/from vector");
assert(C->getType()->isFPOrFPVectorTy() && Ty->isIntOrIntVectorTy() &&
@@ -1851,7 +2002,8 @@ Constant *ConstantExpr::getPtrToInt(Constant *C, Type *DstTy,
"PtrToInt destination must be integer or integer vector");
assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
if (isa<VectorType>(C->getType()))
- assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&&
+ assert(cast<VectorType>(C->getType())->getNumElements() ==
+ cast<VectorType>(DstTy)->getNumElements() &&
"Invalid cast between a different number of vector elements");
return getFoldedCast(Instruction::PtrToInt, C, DstTy, OnlyIfReduced);
}
@@ -1864,7 +2016,8 @@ Constant *ConstantExpr::getIntToPtr(Constant *C, Type *DstTy,
"IntToPtr destination must be a pointer or pointer vector");
assert(isa<VectorType>(C->getType()) == isa<VectorType>(DstTy));
if (isa<VectorType>(C->getType()))
- assert(C->getType()->getVectorNumElements()==DstTy->getVectorNumElements()&&
+ assert(cast<VectorType>(C->getType())->getNumElements() ==
+ cast<VectorType>(DstTy)->getNumElements() &&
"Invalid cast between a different number of vector elements");
return getFoldedCast(Instruction::IntToPtr, C, DstTy, OnlyIfReduced);
}
@@ -1895,14 +2048,14 @@ Constant *ConstantExpr::getAddrSpaceCast(Constant *C, Type *DstTy,
Type *MidTy = PointerType::get(DstElemTy, SrcScalarTy->getAddressSpace());
if (VectorType *VT = dyn_cast<VectorType>(DstTy)) {
// Handle vectors of pointers.
- MidTy = VectorType::get(MidTy, VT->getNumElements());
+ MidTy = FixedVectorType::get(MidTy, VT->getNumElements());
}
C = getBitCast(C, MidTy);
}
return getFoldedCast(Instruction::AddrSpaceCast, C, DstTy, OnlyIfReduced);
}
-Constant *ConstantExpr::get(unsigned Opcode, Constant *C, unsigned Flags,
+Constant *ConstantExpr::get(unsigned Opcode, Constant *C, unsigned Flags,
Type *OnlyIfReducedTy) {
// Check the operands for consistency first.
assert(Instruction::isUnaryOp(Opcode) &&
@@ -2092,15 +2245,16 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
unsigned AS = C->getType()->getPointerAddressSpace();
Type *ReqTy = DestTy->getPointerTo(AS);
- unsigned NumVecElts = 0;
- if (C->getType()->isVectorTy())
- NumVecElts = C->getType()->getVectorNumElements();
- else for (auto Idx : Idxs)
- if (Idx->getType()->isVectorTy())
- NumVecElts = Idx->getType()->getVectorNumElements();
+ ElementCount EltCount = {0, false};
+ if (VectorType *VecTy = dyn_cast<VectorType>(C->getType()))
+ EltCount = VecTy->getElementCount();
+ else
+ for (auto Idx : Idxs)
+ if (VectorType *VecTy = dyn_cast<VectorType>(Idx->getType()))
+ EltCount = VecTy->getElementCount();
- if (NumVecElts)
- ReqTy = VectorType::get(ReqTy, NumVecElts);
+ if (EltCount.Min != 0)
+ ReqTy = VectorType::get(ReqTy, EltCount);
if (OnlyIfReducedTy == ReqTy)
return nullptr;
@@ -2109,14 +2263,20 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
std::vector<Constant*> ArgVec;
ArgVec.reserve(1 + Idxs.size());
ArgVec.push_back(C);
- for (unsigned i = 0, e = Idxs.size(); i != e; ++i) {
- assert((!Idxs[i]->getType()->isVectorTy() ||
- Idxs[i]->getType()->getVectorNumElements() == NumVecElts) &&
- "getelementptr index type missmatch");
-
- Constant *Idx = cast<Constant>(Idxs[i]);
- if (NumVecElts && !Idxs[i]->getType()->isVectorTy())
- Idx = ConstantVector::getSplat(NumVecElts, Idx);
+ auto GTI = gep_type_begin(Ty, Idxs), GTE = gep_type_end(Ty, Idxs);
+ for (; GTI != GTE; ++GTI) {
+ auto *Idx = cast<Constant>(GTI.getOperand());
+ assert(
+ (!isa<VectorType>(Idx->getType()) ||
+ cast<VectorType>(Idx->getType())->getElementCount() == EltCount) &&
+          "getelementptr index type mismatch");
+
+ if (GTI.isStruct() && Idx->getType()->isVectorTy()) {
+ Idx = Idx->getSplatValue();
+ } else if (GTI.isSequential() && EltCount.Min != 0 &&
+ !Idx->getType()->isVectorTy()) {
+ Idx = ConstantVector::getSplat(EltCount, Idx);
+ }
ArgVec.push_back(Idx);
}
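
A sketch of the behaviour the rewritten index loop gives a caller, with hypothetical names; it assumes PtrVec is a vector-of-pointers constant and ElemTy is its pointee type. A scalar index into a sequential type is splatted to the GEP's element count, while struct indices stay scalar.

#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static Constant *gepOverPointerVector(Constant *PtrVec, Type *ElemTy) {
  LLVMContext &Ctx = PtrVec->getContext();
  Constant *ScalarIdx = ConstantInt::get(Type::getInt64Ty(Ctx), 1);
  // getGetElementPtr() splats ScalarIdx to PtrVec's element count internally.
  return ConstantExpr::getGetElementPtr(ElemTy, PtrVec, ScalarIdx);
}
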
@@ -2124,7 +2284,7 @@ Constant *ConstantExpr::getGetElementPtr(Type *Ty, Constant *C,
if (InRangeIndex && *InRangeIndex < 63)
SubClassOptionalData |= (*InRangeIndex + 1) << 1;
const ConstantExprKeyType Key(Instruction::GetElementPtr, ArgVec, 0,
- SubClassOptionalData, None, Ty);
+ SubClassOptionalData, None, None, Ty);
LLVMContextImpl *pImpl = C->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(ReqTy, Key);
@@ -2149,7 +2309,7 @@ Constant *ConstantExpr::getICmp(unsigned short pred, Constant *LHS,
Type *ResultTy = Type::getInt1Ty(LHS->getContext());
if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
- ResultTy = VectorType::get(ResultTy, VT->getNumElements());
+ ResultTy = VectorType::get(ResultTy, VT->getElementCount());
LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(ResultTy, Key);
@@ -2174,7 +2334,7 @@ Constant *ConstantExpr::getFCmp(unsigned short pred, Constant *LHS,
Type *ResultTy = Type::getInt1Ty(LHS->getContext());
if (VectorType *VT = dyn_cast<VectorType>(LHS->getType()))
- ResultTy = VectorType::get(ResultTy, VT->getNumElements());
+ ResultTy = VectorType::get(ResultTy, VT->getElementCount());
LLVMContextImpl *pImpl = LHS->getType()->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(ResultTy, Key);
@@ -2190,7 +2350,7 @@ Constant *ConstantExpr::getExtractElement(Constant *Val, Constant *Idx,
if (Constant *FC = ConstantFoldExtractElementInstruction(Val, Idx))
return FC; // Fold a few common cases.
- Type *ReqTy = Val->getType()->getVectorElementType();
+ Type *ReqTy = cast<VectorType>(Val->getType())->getElementType();
if (OnlyIfReducedTy == ReqTy)
return nullptr;
@@ -2206,7 +2366,7 @@ Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt,
Constant *Idx, Type *OnlyIfReducedTy) {
assert(Val->getType()->isVectorTy() &&
"Tried to create insertelement operation on non-vector type!");
- assert(Elt->getType() == Val->getType()->getVectorElementType() &&
+ assert(Elt->getType() == cast<VectorType>(Val->getType())->getElementType() &&
"Insertelement types must match!");
assert(Idx->getType()->isIntegerTy() &&
"Insertelement index must be i32 type!");
@@ -2226,23 +2386,26 @@ Constant *ConstantExpr::getInsertElement(Constant *Val, Constant *Elt,
}
Constant *ConstantExpr::getShuffleVector(Constant *V1, Constant *V2,
- Constant *Mask, Type *OnlyIfReducedTy) {
+ ArrayRef<int> Mask,
+ Type *OnlyIfReducedTy) {
assert(ShuffleVectorInst::isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector constant expr operands!");
if (Constant *FC = ConstantFoldShuffleVectorInstruction(V1, V2, Mask))
return FC; // Fold a few common cases.
- ElementCount NElts = Mask->getType()->getVectorElementCount();
- Type *EltTy = V1->getType()->getVectorElementType();
- Type *ShufTy = VectorType::get(EltTy, NElts);
+ unsigned NElts = Mask.size();
+ auto V1VTy = cast<VectorType>(V1->getType());
+ Type *EltTy = V1VTy->getElementType();
+ bool TypeIsScalable = isa<ScalableVectorType>(V1VTy);
+ Type *ShufTy = VectorType::get(EltTy, NElts, TypeIsScalable);
if (OnlyIfReducedTy == ShufTy)
return nullptr;
// Look up the constant in the table first to ensure uniqueness
- Constant *ArgVec[] = { V1, V2, Mask };
- const ConstantExprKeyType Key(Instruction::ShuffleVector, ArgVec);
+ Constant *ArgVec[] = {V1, V2};
+ ConstantExprKeyType Key(Instruction::ShuffleVector, ArgVec, 0, 0, None, Mask);
LLVMContextImpl *pImpl = ShufTy->getContext().pImpl;
return pImpl->ExprConstants.getOrCreate(ShufTy, Key);
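
A minimal sketch of the updated interface (helper name hypothetical): callers hand the mask in as plain integers rather than as a constant vector of i32.

#include "llvm/IR/Constants.h"
using namespace llvm;

static Constant *reverseFourLanes(Constant *V) {
  int Mask[] = {3, 2, 1, 0}; // assumes V is a 4-element vector constant
  Constant *Undef = UndefValue::get(V->getType());
  return ConstantExpr::getShuffleVector(V, Undef, Mask);
}
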
@@ -2499,7 +2662,9 @@ Type *GetElementPtrConstantExpr::getResultElementType() const {
// ConstantData* implementations
Type *ConstantDataSequential::getElementType() const {
- return getType()->getElementType();
+ if (ArrayType *ATy = dyn_cast<ArrayType>(getType()))
+ return ATy->getElementType();
+ return cast<VectorType>(getType())->getElementType();
}
StringRef ConstantDataSequential::getRawDataValues() const {
@@ -2507,7 +2672,8 @@ StringRef ConstantDataSequential::getRawDataValues() const {
}
bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) {
- if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy()) return true;
+ if (Ty->isHalfTy() || Ty->isBFloatTy() || Ty->isFloatTy() || Ty->isDoubleTy())
+ return true;
if (auto *IT = dyn_cast<IntegerType>(Ty)) {
switch (IT->getBitWidth()) {
case 8:
@@ -2524,7 +2690,7 @@ bool ConstantDataSequential::isElementTypeCompatible(Type *Ty) {
unsigned ConstantDataSequential::getNumElements() const {
if (ArrayType *AT = dyn_cast<ArrayType>(getType()))
return AT->getNumElements();
- return getType()->getVectorNumElements();
+ return cast<VectorType>(getType())->getNumElements();
}
@@ -2552,7 +2718,12 @@ static bool isAllZeros(StringRef Arr) {
/// the correct element type. We take the bytes in as a StringRef because
/// we *want* an underlying "char*" to avoid TBAA type punning violations.
Constant *ConstantDataSequential::getImpl(StringRef Elements, Type *Ty) {
- assert(isElementTypeCompatible(Ty->getSequentialElementType()));
+#ifndef NDEBUG
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Ty))
+ assert(isElementTypeCompatible(ATy->getElementType()));
+ else
+ assert(isElementTypeCompatible(cast<VectorType>(Ty)->getElementType()));
+#endif
// If the elements are all zero or there are no elements, return a CAZ, which
// is more dense and canonical.
if (isAllZeros(Elements))
@@ -2620,26 +2791,29 @@ void ConstantDataSequential::destroyConstantImpl() {
Next = nullptr;
}
-/// getFP() constructors - Return a constant with array type with an element
-/// count and element type of float with precision matching the number of
-/// bits in the ArrayRef passed in. (i.e. half for 16bits, float for 32bits,
-/// double for 64bits) Note that this can return a ConstantAggregateZero
-/// object.
-Constant *ConstantDataArray::getFP(LLVMContext &Context,
- ArrayRef<uint16_t> Elts) {
- Type *Ty = ArrayType::get(Type::getHalfTy(Context), Elts.size());
+/// getFP() constructors - Return a constant of array type whose element type
+/// is taken from argument `ElementType', and whose element count is taken
+/// from argument `Elts'. The bit width of `ElementType' must match the bit
+/// width of the integer type held in the passed-in ArrayRef (i.e. half or
+/// bfloat for 16 bits, float for 32 bits, double for 64 bits). Note that
+/// this can return a ConstantAggregateZero object.
+Constant *ConstantDataArray::getFP(Type *ElementType, ArrayRef<uint16_t> Elts) {
+ assert((ElementType->isHalfTy() || ElementType->isBFloatTy()) &&
+ "Element type is not a 16-bit float type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
-Constant *ConstantDataArray::getFP(LLVMContext &Context,
- ArrayRef<uint32_t> Elts) {
- Type *Ty = ArrayType::get(Type::getFloatTy(Context), Elts.size());
+Constant *ConstantDataArray::getFP(Type *ElementType, ArrayRef<uint32_t> Elts) {
+ assert(ElementType->isFloatTy() && "Element type is not a 32-bit float type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
-Constant *ConstantDataArray::getFP(LLVMContext &Context,
- ArrayRef<uint64_t> Elts) {
- Type *Ty = ArrayType::get(Type::getDoubleTy(Context), Elts.size());
+Constant *ConstantDataArray::getFP(Type *ElementType, ArrayRef<uint64_t> Elts) {
+ assert(ElementType->isDoubleTy() &&
+ "Element type is not a 64-bit float type");
+ Type *Ty = ArrayType::get(ElementType, Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
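
A usage sketch of the new signature (helper name hypothetical): the caller now names the 16-bit float type explicitly, so the same raw payload can build either a half or a bfloat array; the two bit patterns below encode 1.0 and 2.0 in bfloat.

#include <cstdint>
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
using namespace llvm;

static Constant *bfloatPair(LLVMContext &Ctx) {
  uint16_t Bits[] = {0x3F80, 0x4000}; // bfloat 1.0 and 2.0
  return ConstantDataArray::getFP(Type::getBFloatTy(Ctx), Bits);
}
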
@@ -2661,56 +2835,62 @@ Constant *ConstantDataArray::getString(LLVMContext &Context,
/// count and element type matching the ArrayRef passed in. Note that this
/// can return a ConstantAggregateZero object.
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint8_t> Elts){
- Type *Ty = VectorType::get(Type::getInt8Ty(Context), Elts.size());
+ auto *Ty = FixedVectorType::get(Type::getInt8Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 1), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint16_t> Elts){
- Type *Ty = VectorType::get(Type::getInt16Ty(Context), Elts.size());
+ auto *Ty = FixedVectorType::get(Type::getInt16Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint32_t> Elts){
- Type *Ty = VectorType::get(Type::getInt32Ty(Context), Elts.size());
+ auto *Ty = FixedVectorType::get(Type::getInt32Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<uint64_t> Elts){
- Type *Ty = VectorType::get(Type::getInt64Ty(Context), Elts.size());
+ auto *Ty = FixedVectorType::get(Type::getInt64Ty(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<float> Elts) {
- Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
+ auto *Ty = FixedVectorType::get(Type::getFloatTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
Constant *ConstantDataVector::get(LLVMContext &Context, ArrayRef<double> Elts) {
- Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
+ auto *Ty = FixedVectorType::get(Type::getDoubleTy(Context), Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
-/// getFP() constructors - Return a constant with vector type with an element
-/// count and element type of float with the precision matching the number of
-/// bits in the ArrayRef passed in. (i.e. half for 16bits, float for 32bits,
-/// double for 64bits) Note that this can return a ConstantAggregateZero
-/// object.
-Constant *ConstantDataVector::getFP(LLVMContext &Context,
+/// getFP() constructors - Return a constant of vector type whose element type
+/// is taken from argument `ElementType', and whose element count is taken
+/// from argument `Elts'. The bit width of `ElementType' must match the bit
+/// width of the integer type held in the passed-in ArrayRef (i.e. half or
+/// bfloat for 16 bits, float for 32 bits, double for 64 bits). Note that
+/// this can return a ConstantAggregateZero object.
+Constant *ConstantDataVector::getFP(Type *ElementType,
ArrayRef<uint16_t> Elts) {
- Type *Ty = VectorType::get(Type::getHalfTy(Context), Elts.size());
+ assert((ElementType->isHalfTy() || ElementType->isBFloatTy()) &&
+ "Element type is not a 16-bit float type");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 2), Ty);
}
-Constant *ConstantDataVector::getFP(LLVMContext &Context,
+Constant *ConstantDataVector::getFP(Type *ElementType,
ArrayRef<uint32_t> Elts) {
- Type *Ty = VectorType::get(Type::getFloatTy(Context), Elts.size());
+ assert(ElementType->isFloatTy() && "Element type is not a 32-bit float type");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 4), Ty);
}
-Constant *ConstantDataVector::getFP(LLVMContext &Context,
+Constant *ConstantDataVector::getFP(Type *ElementType,
ArrayRef<uint64_t> Elts) {
- Type *Ty = VectorType::get(Type::getDoubleTy(Context), Elts.size());
+ assert(ElementType->isDoubleTy() &&
+ "Element type is not a 64-bit float type");
+ auto *Ty = FixedVectorType::get(ElementType, Elts.size());
const char *Data = reinterpret_cast<const char *>(Elts.data());
return getImpl(StringRef(Data, Elts.size() * 8), Ty);
}
@@ -2740,20 +2920,25 @@ Constant *ConstantDataVector::getSplat(unsigned NumElts, Constant *V) {
if (CFP->getType()->isHalfTy()) {
SmallVector<uint16_t, 16> Elts(
NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
- return getFP(V->getContext(), Elts);
+ return getFP(V->getType(), Elts);
+ }
+ if (CFP->getType()->isBFloatTy()) {
+ SmallVector<uint16_t, 16> Elts(
+ NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
+ return getFP(V->getType(), Elts);
}
if (CFP->getType()->isFloatTy()) {
SmallVector<uint32_t, 16> Elts(
NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
- return getFP(V->getContext(), Elts);
+ return getFP(V->getType(), Elts);
}
if (CFP->getType()->isDoubleTy()) {
SmallVector<uint64_t, 16> Elts(
NumElts, CFP->getValueAPF().bitcastToAPInt().getLimitedValue());
- return getFP(V->getContext(), Elts);
+ return getFP(V->getType(), Elts);
}
}
- return ConstantVector::getSplat(NumElts, V);
+ return ConstantVector::getSplat({NumElts, false}, V);
}
@@ -2815,6 +3000,10 @@ APFloat ConstantDataSequential::getElementAsAPFloat(unsigned Elt) const {
auto EltVal = *reinterpret_cast<const uint16_t *>(EltPtr);
return APFloat(APFloat::IEEEhalf(), APInt(16, EltVal));
}
+ case Type::BFloatTyID: {
+ auto EltVal = *reinterpret_cast<const uint16_t *>(EltPtr);
+ return APFloat(APFloat::BFloat(), APInt(16, EltVal));
+ }
case Type::FloatTyID: {
auto EltVal = *reinterpret_cast<const uint32_t *>(EltPtr);
return APFloat(APFloat::IEEEsingle(), APInt(32, EltVal));
@@ -2839,8 +3028,8 @@ double ConstantDataSequential::getElementAsDouble(unsigned Elt) const {
}
Constant *ConstantDataSequential::getElementAsConstant(unsigned Elt) const {
- if (getElementType()->isHalfTy() || getElementType()->isFloatTy() ||
- getElementType()->isDoubleTy())
+ if (getElementType()->isHalfTy() || getElementType()->isBFloatTy() ||
+ getElementType()->isFloatTy() || getElementType()->isDoubleTy())
return ConstantFP::get(getContext(), getElementAsAPFloat(Elt));
return ConstantInt::get(getElementType(), getElementAsInteger(Elt));
@@ -2863,7 +3052,7 @@ bool ConstantDataSequential::isCString() const {
return Str.drop_back().find(0) == StringRef::npos;
}
-bool ConstantDataVector::isSplat() const {
+bool ConstantDataVector::isSplatData() const {
const char *Base = getRawDataValues().data();
// Compare elements 1+ to the 0'th element.
@@ -2875,6 +3064,14 @@ bool ConstantDataVector::isSplat() const {
return true;
}
+bool ConstantDataVector::isSplat() const {
+ if (!IsSplatSet) {
+ IsSplatSet = true;
+ IsSplat = isSplatData();
+ }
+ return IsSplat;
+}
+
Constant *ConstantDataVector::getSplatValue() const {
// If they're all the same, return the 0th one as a representative.
return isSplat() ? getElementAsConstant(0) : nullptr;
@@ -3081,7 +3278,7 @@ Instruction *ConstantExpr::getAsInstruction() const {
case Instruction::ExtractValue:
return ExtractValueInst::Create(Ops[0], getIndices());
case Instruction::ShuffleVector:
- return new ShuffleVectorInst(Ops[0], Ops[1], Ops[2]);
+ return new ShuffleVectorInst(Ops[0], Ops[1], getShuffleMask());
case Instruction::GetElementPtr: {
const auto *GO = cast<GEPOperator>(this);
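
A minimal sketch of the instruction-side counterpart that getAsInstruction() now relies on (helper name hypothetical): ShuffleVectorInst likewise accepts the integer mask directly.

#include "llvm/IR/Instructions.h"
using namespace llvm;

static Instruction *broadcastLaneZero(Value *A, Value *B) {
  int ZeroMask[] = {0, 0, 0, 0}; // assumes a four-lane result is wanted
  return new ShuffleVectorInst(A, B, ZeroMask);
}
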
diff --git a/llvm/lib/IR/ConstantsContext.h b/llvm/lib/IR/ConstantsContext.h
index f5e2481f3903..95c5ab6d0ee4 100644
--- a/llvm/lib/IR/ConstantsContext.h
+++ b/llvm/lib/IR/ConstantsContext.h
@@ -26,6 +26,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/OperandTraits.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
@@ -42,7 +43,7 @@ namespace llvm {
/// UnaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement unary constant exprs.
-class UnaryConstantExpr : public ConstantExpr {
+class UnaryConstantExpr final : public ConstantExpr {
public:
UnaryConstantExpr(unsigned Opcode, Constant *C, Type *Ty)
: ConstantExpr(Ty, Opcode, &Op<0>(), 1) {
@@ -55,11 +56,19 @@ public:
}
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+ static bool classof(const ConstantExpr *CE) {
+ return Instruction::isCast(CE->getOpcode()) ||
+ Instruction::isUnaryOp(CE->getOpcode());
+ }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
+ }
};
/// BinaryConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement binary constant exprs.
-class BinaryConstantExpr : public ConstantExpr {
+class BinaryConstantExpr final : public ConstantExpr {
public:
BinaryConstantExpr(unsigned Opcode, Constant *C1, Constant *C2,
unsigned Flags)
@@ -76,11 +85,18 @@ public:
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+ static bool classof(const ConstantExpr *CE) {
+ return Instruction::isBinaryOp(CE->getOpcode());
+ }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
+ }
};
/// SelectConstantExpr - This class is private to Constants.cpp, and is used
/// behind the scenes to implement select constant exprs.
-class SelectConstantExpr : public ConstantExpr {
+class SelectConstantExpr final : public ConstantExpr {
public:
SelectConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(C2->getType(), Instruction::Select, &Op<0>(), 3) {
@@ -96,12 +112,19 @@ public:
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+ static bool classof(const ConstantExpr *CE) {
+ return CE->getOpcode() == Instruction::Select;
+ }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
+ }
};
/// ExtractElementConstantExpr - This class is private to
/// Constants.cpp, and is used behind the scenes to implement
/// extractelement constant exprs.
-class ExtractElementConstantExpr : public ConstantExpr {
+class ExtractElementConstantExpr final : public ConstantExpr {
public:
ExtractElementConstantExpr(Constant *C1, Constant *C2)
: ConstantExpr(cast<VectorType>(C1->getType())->getElementType(),
@@ -117,12 +140,19 @@ public:
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+ static bool classof(const ConstantExpr *CE) {
+ return CE->getOpcode() == Instruction::ExtractElement;
+ }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
+ }
};
/// InsertElementConstantExpr - This class is private to
/// Constants.cpp, and is used behind the scenes to implement
/// insertelement constant exprs.
-class InsertElementConstantExpr : public ConstantExpr {
+class InsertElementConstantExpr final : public ConstantExpr {
public:
InsertElementConstantExpr(Constant *C1, Constant *C2, Constant *C3)
: ConstantExpr(C1->getType(), Instruction::InsertElement,
@@ -139,37 +169,54 @@ public:
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+ static bool classof(const ConstantExpr *CE) {
+ return CE->getOpcode() == Instruction::InsertElement;
+ }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
+ }
};
/// ShuffleVectorConstantExpr - This class is private to
/// Constants.cpp, and is used behind the scenes to implement
/// shufflevector constant exprs.
-class ShuffleVectorConstantExpr : public ConstantExpr {
+class ShuffleVectorConstantExpr final : public ConstantExpr {
public:
- ShuffleVectorConstantExpr(Constant *C1, Constant *C2, Constant *C3)
- : ConstantExpr(VectorType::get(
- cast<VectorType>(C1->getType())->getElementType(),
- cast<VectorType>(C3->getType())->getElementCount()),
- Instruction::ShuffleVector,
- &Op<0>(), 3) {
+ ShuffleVectorConstantExpr(Constant *C1, Constant *C2, ArrayRef<int> Mask)
+ : ConstantExpr(VectorType::get(
+ cast<VectorType>(C1->getType())->getElementType(),
+ Mask.size(), isa<ScalableVectorType>(C1->getType())),
+ Instruction::ShuffleVector, &Op<0>(), 2) {
+ assert(ShuffleVectorInst::isValidOperands(C1, C2, Mask) &&
+ "Invalid shuffle vector instruction operands!");
Op<0>() = C1;
Op<1>() = C2;
- Op<2>() = C3;
+ ShuffleMask.assign(Mask.begin(), Mask.end());
+ ShuffleMaskForBitcode =
+ ShuffleVectorInst::convertShuffleMaskForBitcode(Mask, getType());
}
- // allocate space for exactly three operands
- void *operator new(size_t s) {
- return User::operator new(s, 3);
- }
+ SmallVector<int, 4> ShuffleMask;
+ Constant *ShuffleMaskForBitcode;
+
+ void *operator new(size_t s) { return User::operator new(s, 2); }
/// Transparently provide more efficient getOperand methods.
DECLARE_TRANSPARENT_OPERAND_ACCESSORS(Value);
+
+ static bool classof(const ConstantExpr *CE) {
+ return CE->getOpcode() == Instruction::ShuffleVector;
+ }
+ static bool classof(const Value *V) {
+ return isa<ConstantExpr>(V) && classof(cast<ConstantExpr>(V));
+ }
};
/// ExtractValueConstantExpr - This class is private to
/// Constants.cpp, and is used behind the scenes to implement
/// extractvalue constant exprs.
-class ExtractValueConstantExpr : public ConstantExpr {
+class ExtractValueConstantExpr final : public ConstantExpr {
public:
ExtractValueConstantExpr(Constant *Agg, ArrayRef<unsigned> IdxList,
Type *DestTy)
@@ -200,7 +247,7 @@ public:
/// InsertValueConstantExpr - This class is private to
/// Constants.cpp, and is used behind the scenes to implement
/// insertvalue constant exprs.
-class InsertValueConstantExpr : public ConstantExpr {
+class InsertValueConstantExpr final : public ConstantExpr {
public:
InsertValueConstantExpr(Constant *Agg, Constant *Val,
ArrayRef<unsigned> IdxList, Type *DestTy)
@@ -231,7 +278,7 @@ public:
/// GetElementPtrConstantExpr - This class is private to Constants.cpp, and is
/// used behind the scenes to implement getelementpr constant exprs.
-class GetElementPtrConstantExpr : public ConstantExpr {
+class GetElementPtrConstantExpr final : public ConstantExpr {
Type *SrcElementTy;
Type *ResElementTy;
@@ -265,7 +312,7 @@ public:
// CompareConstantExpr - This class is private to Constants.cpp, and is used
// behind the scenes to implement ICmp and FCmp constant expressions. This is
// needed in order to store the predicate value for these instructions.
-class CompareConstantExpr : public ConstantExpr {
+class CompareConstantExpr final : public ConstantExpr {
public:
unsigned short predicate;
CompareConstantExpr(Type *ty, Instruction::OtherOps opc,
@@ -319,7 +366,7 @@ DEFINE_TRANSPARENT_OPERAND_ACCESSORS(InsertElementConstantExpr, Value)
template <>
struct OperandTraits<ShuffleVectorConstantExpr>
- : public FixedNumOperandTraits<ShuffleVectorConstantExpr, 3> {};
+ : public FixedNumOperandTraits<ShuffleVectorConstantExpr, 2> {};
DEFINE_TRANSPARENT_OPERAND_ACCESSORS(ShuffleVectorConstantExpr, Value)
template <>
@@ -454,42 +501,64 @@ struct InlineAsmKeyType {
InlineAsm *create(TypeClass *Ty) const {
assert(PointerType::getUnqual(FTy) == Ty);
- return new InlineAsm(FTy, AsmString, Constraints, HasSideEffects,
- IsAlignStack, AsmDialect);
+ return new InlineAsm(FTy, std::string(AsmString), std::string(Constraints),
+ HasSideEffects, IsAlignStack, AsmDialect);
}
};
struct ConstantExprKeyType {
+private:
uint8_t Opcode;
uint8_t SubclassOptionalData;
uint16_t SubclassData;
ArrayRef<Constant *> Ops;
ArrayRef<unsigned> Indexes;
+ ArrayRef<int> ShuffleMask;
Type *ExplicitTy;
+ static ArrayRef<int> getShuffleMaskIfValid(const ConstantExpr *CE) {
+ if (CE->getOpcode() == Instruction::ShuffleVector)
+ return CE->getShuffleMask();
+ return None;
+ }
+
+ static ArrayRef<unsigned> getIndicesIfValid(const ConstantExpr *CE) {
+ if (CE->hasIndices())
+ return CE->getIndices();
+ return None;
+ }
+
+ static Type *getSourceElementTypeIfValid(const ConstantExpr *CE) {
+ if (auto *GEPCE = dyn_cast<GetElementPtrConstantExpr>(CE))
+ return GEPCE->getSourceElementType();
+ return nullptr;
+ }
+
+public:
ConstantExprKeyType(unsigned Opcode, ArrayRef<Constant *> Ops,
unsigned short SubclassData = 0,
unsigned short SubclassOptionalData = 0,
ArrayRef<unsigned> Indexes = None,
+ ArrayRef<int> ShuffleMask = None,
Type *ExplicitTy = nullptr)
: Opcode(Opcode), SubclassOptionalData(SubclassOptionalData),
SubclassData(SubclassData), Ops(Ops), Indexes(Indexes),
- ExplicitTy(ExplicitTy) {}
+ ShuffleMask(ShuffleMask), ExplicitTy(ExplicitTy) {}
ConstantExprKeyType(ArrayRef<Constant *> Operands, const ConstantExpr *CE)
: Opcode(CE->getOpcode()),
SubclassOptionalData(CE->getRawSubclassOptionalData()),
SubclassData(CE->isCompare() ? CE->getPredicate() : 0), Ops(Operands),
- Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()),
- ExplicitTy(nullptr) {}
+ Indexes(getIndicesIfValid(CE)), ShuffleMask(getShuffleMaskIfValid(CE)),
+ ExplicitTy(getSourceElementTypeIfValid(CE)) {}
ConstantExprKeyType(const ConstantExpr *CE,
SmallVectorImpl<Constant *> &Storage)
: Opcode(CE->getOpcode()),
SubclassOptionalData(CE->getRawSubclassOptionalData()),
SubclassData(CE->isCompare() ? CE->getPredicate() : 0),
- Indexes(CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()),
- ExplicitTy(nullptr) {
+ Indexes(getIndicesIfValid(CE)), ShuffleMask(getShuffleMaskIfValid(CE)),
+ ExplicitTy(getSourceElementTypeIfValid(CE)) {
assert(Storage.empty() && "Expected empty storage");
for (unsigned I = 0, E = CE->getNumOperands(); I != E; ++I)
Storage.push_back(CE->getOperand(I));
@@ -499,7 +568,8 @@ struct ConstantExprKeyType {
bool operator==(const ConstantExprKeyType &X) const {
return Opcode == X.Opcode && SubclassData == X.SubclassData &&
SubclassOptionalData == X.SubclassOptionalData && Ops == X.Ops &&
- Indexes == X.Indexes;
+ Indexes == X.Indexes && ShuffleMask == X.ShuffleMask &&
+ ExplicitTy == X.ExplicitTy;
}
bool operator==(const ConstantExpr *CE) const {
@@ -514,15 +584,21 @@ struct ConstantExprKeyType {
for (unsigned I = 0, E = Ops.size(); I != E; ++I)
if (Ops[I] != CE->getOperand(I))
return false;
- if (Indexes != (CE->hasIndices() ? CE->getIndices() : ArrayRef<unsigned>()))
+ if (Indexes != getIndicesIfValid(CE))
+ return false;
+ if (ShuffleMask != getShuffleMaskIfValid(CE))
+ return false;
+ if (ExplicitTy != getSourceElementTypeIfValid(CE))
return false;
return true;
}
unsigned getHash() const {
- return hash_combine(Opcode, SubclassOptionalData, SubclassData,
- hash_combine_range(Ops.begin(), Ops.end()),
- hash_combine_range(Indexes.begin(), Indexes.end()));
+ return hash_combine(
+ Opcode, SubclassOptionalData, SubclassData,
+ hash_combine_range(Ops.begin(), Ops.end()),
+ hash_combine_range(Indexes.begin(), Indexes.end()),
+ hash_combine_range(ShuffleMask.begin(), ShuffleMask.end()), ExplicitTy);
}
using TypeClass = ConstantInfo<ConstantExpr>::TypeClass;
@@ -546,17 +622,14 @@ struct ConstantExprKeyType {
case Instruction::InsertElement:
return new InsertElementConstantExpr(Ops[0], Ops[1], Ops[2]);
case Instruction::ShuffleVector:
- return new ShuffleVectorConstantExpr(Ops[0], Ops[1], Ops[2]);
+ return new ShuffleVectorConstantExpr(Ops[0], Ops[1], ShuffleMask);
case Instruction::InsertValue:
return new InsertValueConstantExpr(Ops[0], Ops[1], Indexes, Ty);
case Instruction::ExtractValue:
return new ExtractValueConstantExpr(Ops[0], Indexes, Ty);
case Instruction::GetElementPtr:
- return GetElementPtrConstantExpr::Create(
- ExplicitTy ? ExplicitTy
- : cast<PointerType>(Ops[0]->getType()->getScalarType())
- ->getElementType(),
- Ops[0], Ops.slice(1), Ty, SubclassOptionalData);
+ return GetElementPtrConstantExpr::Create(ExplicitTy, Ops[0], Ops.slice(1),
+ Ty, SubclassOptionalData);
case Instruction::ICmp:
return new CompareConstantExpr(Ty, Instruction::ICmp, SubclassData,
Ops[0], Ops[1]);
@@ -567,6 +640,10 @@ struct ConstantExprKeyType {
}
};
+// Free memory for a given constant. Assumes the constant has already been
+// removed from all relevant maps.
+void deleteConstant(Constant *C);
+
template <class ConstantClass> class ConstantUniqueMap {
public:
using ValType = typename ConstantInfo<ConstantClass>::ValType;
@@ -630,7 +707,7 @@ public:
void freeConstants() {
for (auto &I : Map)
- delete I; // Asserts that use_empty().
+ deleteConstant(I);
}
private:
@@ -703,6 +780,11 @@ public:
}
};
+template <> inline void ConstantUniqueMap<InlineAsm>::freeConstants() {
+ for (auto &I : Map)
+ delete I;
+}
+
} // end namespace llvm
#endif // LLVM_LIB_IR_CONSTANTSCONTEXT_H
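
The caller-visible counterpart of the ShuffleVectorConstantExpr change is that shufflevector constant expressions are now requested with a plain integer mask instead of a third constant-vector operand. A minimal sketch, assuming two <4 x i32> vector constants; the helper name is illustrative, not part of the patch:

#include "llvm/IR/Constants.h"
using namespace llvm;

// Reverse the lanes of a pair of <4 x i32> constants. The mask is ordinary
// ints now (use -1 for an undefined lane), so no mask constant has to be
// materialized first; the result has Mask.size() elements.
Constant *reverseLanes(Constant *V1, Constant *V2) {
  int Mask[4] = {3, 2, 1, 0};
  return ConstantExpr::getShuffleVector(V1, V2, Mask);
}
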
diff --git a/llvm/lib/IR/Core.cpp b/llvm/lib/IR/Core.cpp
index 04e34a90a9bc..6f3bbc80d4fd 100644
--- a/llvm/lib/IR/Core.cpp
+++ b/llvm/lib/IR/Core.cpp
@@ -127,11 +127,8 @@ unsigned LLVMGetMDKindID(const char *Name, unsigned SLen) {
return LLVMGetMDKindIDInContext(LLVMGetGlobalContext(), Name, SLen);
}
-#define GET_ATTR_KIND_FROM_NAME
-#include "AttributesCompatFunc.inc"
-
unsigned LLVMGetEnumAttributeKindForName(const char *Name, size_t SLen) {
- return getAttrKindFromName(StringRef(Name, SLen));
+ return Attribute::getAttrKindFromName(StringRef(Name, SLen));
}
unsigned LLVMGetLastEnumAttributeKind(void) {
@@ -480,6 +477,8 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
return LLVMVoidTypeKind;
case Type::HalfTyID:
return LLVMHalfTypeKind;
+ case Type::BFloatTyID:
+ return LLVMBFloatTypeKind;
case Type::FloatTyID:
return LLVMFloatTypeKind;
case Type::DoubleTyID:
@@ -504,12 +503,14 @@ LLVMTypeKind LLVMGetTypeKind(LLVMTypeRef Ty) {
return LLVMArrayTypeKind;
case Type::PointerTyID:
return LLVMPointerTypeKind;
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
return LLVMVectorTypeKind;
case Type::X86_MMXTyID:
return LLVMX86_MMXTypeKind;
case Type::TokenTyID:
return LLVMTokenTypeKind;
+ case Type::ScalableVectorTyID:
+ return LLVMScalableVectorTypeKind;
}
llvm_unreachable("Unhandled TypeID.");
}
@@ -596,6 +597,9 @@ unsigned LLVMGetIntTypeWidth(LLVMTypeRef IntegerTy) {
LLVMTypeRef LLVMHalfTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getHalfTy(*unwrap(C));
}
+LLVMTypeRef LLVMBFloatTypeInContext(LLVMContextRef C) {
+ return (LLVMTypeRef) Type::getBFloatTy(*unwrap(C));
+}
LLVMTypeRef LLVMFloatTypeInContext(LLVMContextRef C) {
return (LLVMTypeRef) Type::getFloatTy(*unwrap(C));
}
@@ -618,6 +622,9 @@ LLVMTypeRef LLVMX86MMXTypeInContext(LLVMContextRef C) {
LLVMTypeRef LLVMHalfType(void) {
return LLVMHalfTypeInContext(LLVMGetGlobalContext());
}
+LLVMTypeRef LLVMBFloatType(void) {
+ return LLVMBFloatTypeInContext(LLVMGetGlobalContext());
+}
LLVMTypeRef LLVMFloatType(void) {
return LLVMFloatTypeInContext(LLVMGetGlobalContext());
}
@@ -749,14 +756,16 @@ LLVMTypeRef LLVMPointerType(LLVMTypeRef ElementType, unsigned AddressSpace) {
}
LLVMTypeRef LLVMVectorType(LLVMTypeRef ElementType, unsigned ElementCount) {
- return wrap(VectorType::get(unwrap(ElementType), ElementCount));
+ return wrap(FixedVectorType::get(unwrap(ElementType), ElementCount));
}
LLVMTypeRef LLVMGetElementType(LLVMTypeRef WrappedTy) {
auto *Ty = unwrap<Type>(WrappedTy);
if (auto *PTy = dyn_cast<PointerType>(Ty))
return wrap(PTy->getElementType());
- return wrap(cast<SequentialType>(Ty)->getElementType());
+ if (auto *ATy = dyn_cast<ArrayType>(Ty))
+ return wrap(ATy->getElementType());
+ return wrap(cast<VectorType>(Ty)->getElementType());
}
unsigned LLVMGetNumContainedTypes(LLVMTypeRef Tp) {
@@ -1784,9 +1793,11 @@ LLVMValueRef LLVMConstInsertElement(LLVMValueRef VectorConstant,
LLVMValueRef LLVMConstShuffleVector(LLVMValueRef VectorAConstant,
LLVMValueRef VectorBConstant,
LLVMValueRef MaskConstant) {
+ SmallVector<int, 16> IntMask;
+ ShuffleVectorInst::getShuffleMask(unwrap<Constant>(MaskConstant), IntMask);
return wrap(ConstantExpr::getShuffleVector(unwrap<Constant>(VectorAConstant),
unwrap<Constant>(VectorBConstant),
- unwrap<Constant>(MaskConstant)));
+ IntMask));
}
LLVMValueRef LLVMConstExtractValue(LLVMValueRef AggConstant, unsigned *IdxList,
@@ -1992,7 +2003,7 @@ LLVMTypeRef LLVMGlobalGetValueType(LLVMValueRef Global) {
unsigned LLVMGetAlignment(LLVMValueRef V) {
Value *P = unwrap<Value>(V);
- if (GlobalValue *GV = dyn_cast<GlobalValue>(P))
+ if (GlobalObject *GV = dyn_cast<GlobalObject>(P))
return GV->getAlignment();
if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
return AI->getAlignment();
@@ -2002,7 +2013,7 @@ unsigned LLVMGetAlignment(LLVMValueRef V) {
return SI->getAlignment();
llvm_unreachable(
- "only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment");
+ "only GlobalObject, AllocaInst, LoadInst and StoreInst have alignment");
}
void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
@@ -2010,11 +2021,11 @@ void LLVMSetAlignment(LLVMValueRef V, unsigned Bytes) {
if (GlobalObject *GV = dyn_cast<GlobalObject>(P))
GV->setAlignment(MaybeAlign(Bytes));
else if (AllocaInst *AI = dyn_cast<AllocaInst>(P))
- AI->setAlignment(MaybeAlign(Bytes));
+ AI->setAlignment(Align(Bytes));
else if (LoadInst *LI = dyn_cast<LoadInst>(P))
- LI->setAlignment(MaybeAlign(Bytes));
+ LI->setAlignment(Align(Bytes));
else if (StoreInst *SI = dyn_cast<StoreInst>(P))
- SI->setAlignment(MaybeAlign(Bytes));
+ SI->setAlignment(Align(Bytes));
else
llvm_unreachable(
"only GlobalValue, AllocaInst, LoadInst and StoreInst have alignment");
@@ -2837,7 +2848,7 @@ void LLVMRemoveCallSiteStringAttribute(LLVMValueRef C, LLVMAttributeIndex Idx,
}
LLVMValueRef LLVMGetCalledValue(LLVMValueRef Instr) {
- return wrap(unwrap<CallBase>(Instr)->getCalledValue());
+ return wrap(unwrap<CallBase>(Instr)->getCalledOperand());
}
LLVMTypeRef LLVMGetCalledFunctionType(LLVMValueRef Instr) {
@@ -3439,14 +3450,14 @@ LLVMValueRef LLVMBuildArrayMalloc(LLVMBuilderRef B, LLVMTypeRef Ty,
return wrap(unwrap(B)->Insert(Malloc, Twine(Name)));
}
-LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr,
+LLVMValueRef LLVMBuildMemSet(LLVMBuilderRef B, LLVMValueRef Ptr,
LLVMValueRef Val, LLVMValueRef Len,
unsigned Align) {
return wrap(unwrap(B)->CreateMemSet(unwrap(Ptr), unwrap(Val), unwrap(Len),
MaybeAlign(Align)));
}
-LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B,
+LLVMValueRef LLVMBuildMemCpy(LLVMBuilderRef B,
LLVMValueRef Dst, unsigned DstAlign,
LLVMValueRef Src, unsigned SrcAlign,
LLVMValueRef Size) {
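
Behind the LLVMSetAlignment hunk is a split in the C++ alignment setters that C API callers do not see but C++ callers must respect. A rough sketch of the assumed contract (function name illustrative):

#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// Globals keep MaybeAlign, so zero still means "no explicit alignment";
// the alloca/load/store setters now take a concrete Align, whose
// constructor asserts on zero.
void applyAlignment(GlobalObject &GO, AllocaInst &AI, unsigned Bytes) {
  GO.setAlignment(MaybeAlign(Bytes));
  if (Bytes)
    AI.setAlignment(Align(Bytes));
}
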
diff --git a/llvm/lib/IR/DIBuilder.cpp b/llvm/lib/IR/DIBuilder.cpp
index c89f404e4296..45cbbb3a6037 100644
--- a/llvm/lib/IR/DIBuilder.cpp
+++ b/llvm/lib/IR/DIBuilder.cpp
@@ -140,7 +140,8 @@ DICompileUnit *DIBuilder::createCompileUnit(
StringRef Flags, unsigned RunTimeVer, StringRef SplitName,
DICompileUnit::DebugEmissionKind Kind, uint64_t DWOId,
bool SplitDebugInlining, bool DebugInfoForProfiling,
- DICompileUnit::DebugNameTableKind NameTableKind, bool RangesBaseAddress) {
+ DICompileUnit::DebugNameTableKind NameTableKind, bool RangesBaseAddress,
+ StringRef SysRoot, StringRef SDK) {
assert(((Lang <= dwarf::DW_LANG_Fortran08 && Lang >= dwarf::DW_LANG_C89) ||
(Lang <= dwarf::DW_LANG_hi_user && Lang >= dwarf::DW_LANG_lo_user)) &&
@@ -151,7 +152,7 @@ DICompileUnit *DIBuilder::createCompileUnit(
VMContext, Lang, File, Producer, isOptimized, Flags, RunTimeVer,
SplitName, Kind, nullptr, nullptr, nullptr, nullptr, nullptr, DWOId,
SplitDebugInlining, DebugInfoForProfiling, NameTableKind,
- RangesBaseAddress);
+ RangesBaseAddress, SysRoot, SDK);
// Create a named metadata so that it is easier to find cu in a module.
NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.dbg.cu");
@@ -245,7 +246,8 @@ DIMacroFile *DIBuilder::createTempMacroFile(DIMacroFile *Parent,
DIEnumerator *DIBuilder::createEnumerator(StringRef Name, int64_t Val,
bool IsUnsigned) {
assert(!Name.empty() && "Unable to create enumerator without name");
- return DIEnumerator::get(VMContext, Val, IsUnsigned, Name);
+ return DIEnumerator::get(VMContext, APInt(64, Val, !IsUnsigned), IsUnsigned,
+ Name);
}
DIBasicType *DIBuilder::createUnspecifiedType(StringRef Name) {
@@ -405,25 +407,26 @@ DIBuilder::createObjCProperty(StringRef Name, DIFile *File, unsigned LineNumber,
DITemplateTypeParameter *
DIBuilder::createTemplateTypeParameter(DIScope *Context, StringRef Name,
- DIType *Ty) {
+ DIType *Ty, bool isDefault) {
assert((!Context || isa<DICompileUnit>(Context)) && "Expected compile unit");
- return DITemplateTypeParameter::get(VMContext, Name, Ty);
+ return DITemplateTypeParameter::get(VMContext, Name, Ty, isDefault);
}
static DITemplateValueParameter *
createTemplateValueParameterHelper(LLVMContext &VMContext, unsigned Tag,
DIScope *Context, StringRef Name, DIType *Ty,
- Metadata *MD) {
+ bool IsDefault, Metadata *MD) {
assert((!Context || isa<DICompileUnit>(Context)) && "Expected compile unit");
- return DITemplateValueParameter::get(VMContext, Tag, Name, Ty, MD);
+ return DITemplateValueParameter::get(VMContext, Tag, Name, Ty, IsDefault, MD);
}
DITemplateValueParameter *
DIBuilder::createTemplateValueParameter(DIScope *Context, StringRef Name,
- DIType *Ty, Constant *Val) {
+ DIType *Ty, bool isDefault,
+ Constant *Val) {
return createTemplateValueParameterHelper(
VMContext, dwarf::DW_TAG_template_value_parameter, Context, Name, Ty,
- getConstantOrNull(Val));
+ isDefault, getConstantOrNull(Val));
}
DITemplateValueParameter *
@@ -431,7 +434,7 @@ DIBuilder::createTemplateTemplateParameter(DIScope *Context, StringRef Name,
DIType *Ty, StringRef Val) {
return createTemplateValueParameterHelper(
VMContext, dwarf::DW_TAG_GNU_template_template_param, Context, Name, Ty,
- MDString::get(VMContext, Val));
+ false, MDString::get(VMContext, Val));
}
DITemplateValueParameter *
@@ -439,7 +442,7 @@ DIBuilder::createTemplateParameterPack(DIScope *Context, StringRef Name,
DIType *Ty, DINodeArray Val) {
return createTemplateValueParameterHelper(
VMContext, dwarf::DW_TAG_GNU_template_parameter_pack, Context, Name, Ty,
- Val.get());
+ false, Val.get());
}
DICompositeType *DIBuilder::createClassType(
@@ -622,11 +625,22 @@ DITypeRefArray DIBuilder::getOrCreateTypeArray(ArrayRef<Metadata *> Elements) {
}
DISubrange *DIBuilder::getOrCreateSubrange(int64_t Lo, int64_t Count) {
- return DISubrange::get(VMContext, Count, Lo);
+ auto *LB = ConstantAsMetadata::get(
+ ConstantInt::getSigned(Type::getInt64Ty(VMContext), Lo));
+ auto *CountNode = ConstantAsMetadata::get(
+ ConstantInt::getSigned(Type::getInt64Ty(VMContext), Count));
+ return DISubrange::get(VMContext, CountNode, LB, nullptr, nullptr);
}
DISubrange *DIBuilder::getOrCreateSubrange(int64_t Lo, Metadata *CountNode) {
- return DISubrange::get(VMContext, CountNode, Lo);
+ auto *LB = ConstantAsMetadata::get(
+ ConstantInt::getSigned(Type::getInt64Ty(VMContext), Lo));
+ return DISubrange::get(VMContext, CountNode, LB, nullptr, nullptr);
+}
+
+DISubrange *DIBuilder::getOrCreateSubrange(Metadata *CountNode, Metadata *LB,
+ Metadata *UB, Metadata *Stride) {
+ return DISubrange::get(VMContext, CountNode, LB, UB, Stride);
}
static void checkGlobalVariableScope(DIScope *Context) {
@@ -829,10 +843,10 @@ DINamespace *DIBuilder::createNameSpace(DIScope *Scope, StringRef Name,
DIModule *DIBuilder::createModule(DIScope *Scope, StringRef Name,
StringRef ConfigurationMacros,
- StringRef IncludePath,
- StringRef SysRoot) {
- return DIModule::get(VMContext, getNonCompileUnitScope(Scope), Name,
- ConfigurationMacros, IncludePath, SysRoot);
+ StringRef IncludePath, StringRef APINotesFile,
+ DIFile *File, unsigned LineNo) {
+ return DIModule::get(VMContext, File, getNonCompileUnitScope(Scope), Name,
+ ConfigurationMacros, IncludePath, APINotesFile, LineNo);
}
DILexicalBlockFile *DIBuilder::createLexicalBlockFile(DIScope *Scope,
@@ -895,18 +909,15 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(Value *V,
return insertDbgValueIntrinsic(V, VarInfo, Expr, DL, InsertAtEnd, nullptr);
}
-/// Return an IRBuilder for inserting dbg.declare and dbg.value intrinsics. This
-/// abstracts over the various ways to specify an insert position.
-static IRBuilder<> getIRBForDbgInsertion(const DILocation *DL,
- BasicBlock *InsertBB,
- Instruction *InsertBefore) {
- IRBuilder<> B(DL->getContext());
+/// Initialize IRBuilder for inserting dbg.declare and dbg.value intrinsics.
+/// This abstracts over the various ways to specify an insert position.
+static void initIRBuilder(IRBuilder<> &Builder, const DILocation *DL,
+ BasicBlock *InsertBB, Instruction *InsertBefore) {
if (InsertBefore)
- B.SetInsertPoint(InsertBefore);
+ Builder.SetInsertPoint(InsertBefore);
else if (InsertBB)
- B.SetInsertPoint(InsertBB);
- B.SetCurrentDebugLocation(DL);
- return B;
+ Builder.SetInsertPoint(InsertBB);
+ Builder.SetCurrentDebugLocation(DL);
}
static Value *getDbgIntrinsicValueImpl(LLVMContext &VMContext, Value *V) {
@@ -936,7 +947,8 @@ Instruction *DIBuilder::insertDeclare(Value *Storage, DILocalVariable *VarInfo,
MetadataAsValue::get(VMContext, VarInfo),
MetadataAsValue::get(VMContext, Expr)};
- IRBuilder<> B = getIRBForDbgInsertion(DL, InsertBB, InsertBefore);
+ IRBuilder<> B(DL->getContext());
+ initIRBuilder(B, DL, InsertBB, InsertBefore);
return B.CreateCall(DeclareFn, Args);
}
@@ -958,7 +970,8 @@ Instruction *DIBuilder::insertDbgValueIntrinsic(
MetadataAsValue::get(VMContext, VarInfo),
MetadataAsValue::get(VMContext, Expr)};
- IRBuilder<> B = getIRBForDbgInsertion(DL, InsertBB, InsertBefore);
+ IRBuilder<> B(DL->getContext());
+ initIRBuilder(B, DL, InsertBB, InsertBefore);
return B.CreateCall(ValueFn, Args);
}
@@ -976,7 +989,8 @@ Instruction *DIBuilder::insertLabel(
trackIfUnresolved(LabelInfo);
Value *Args[] = {MetadataAsValue::get(VMContext, LabelInfo)};
- IRBuilder<> B = getIRBForDbgInsertion(DL, InsertBB, InsertBefore);
+ IRBuilder<> B(DL->getContext());
+ initIRBuilder(B, DL, InsertBB, InsertBefore);
return B.CreateCall(LabelFn, Args);
}
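
The new four-operand getOrCreateSubrange overload is what lets a frontend describe assumed-shape array dimensions. A hedged sketch, assuming a null count with explicit bounds is acceptable (the DISubrange accessors elsewhere in this patch tolerate missing operands); names are illustrative:

#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DebugInfoMetadata.h"
using namespace llvm;

// Describe a Fortran-style dimension: count is omitted and the extent is
// carried by the bounds; the upper bound may be a DIVariable (or a
// DIExpression) for runtime-sized arrays.
DISubrange *makeFortranDim(DIBuilder &DIB, LLVMContext &Ctx,
                           DILocalVariable *UpperBound) {
  auto *LowerBound = ConstantAsMetadata::get(
      ConstantInt::getSigned(Type::getInt64Ty(Ctx), 1));
  return DIB.getOrCreateSubrange(/*Count=*/nullptr, LowerBound, UpperBound,
                                 /*Stride=*/nullptr);
}
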
diff --git a/llvm/lib/IR/DataLayout.cpp b/llvm/lib/IR/DataLayout.cpp
index 94e0740663cc..c44737c5bfc2 100644
--- a/llvm/lib/IR/DataLayout.cpp
+++ b/llvm/lib/IR/DataLayout.cpp
@@ -52,7 +52,7 @@ StructLayout::StructLayout(StructType *ST, const DataLayout &DL) {
// Loop over each of the elements, placing them in memory.
for (unsigned i = 0, e = NumElements; i != e; ++i) {
Type *Ty = ST->getElementType(i);
- const Align TyAlign(ST->isPacked() ? 1 : DL.getABITypeAlignment(Ty));
+ const Align TyAlign = ST->isPacked() ? Align(1) : DL.getABITypeAlign(Ty);
// Add padding if necessary to align the data element properly.
if (!isAligned(TyAlign, StructSize)) {
@@ -153,6 +153,8 @@ const char *DataLayout::getManglingComponent(const Triple &T) {
return "-m:o";
if (T.isOSWindows() && T.isOSBinFormatCOFF())
return T.getArch() == Triple::x86 ? "-m:x" : "-m:w";
+ if (T.isOSBinFormatXCOFF())
+ return "-m:a";
return "-m:e";
}
@@ -162,7 +164,7 @@ static const LayoutAlignElem DefaultAlignments[] = {
{INTEGER_ALIGN, 16, Align(2), Align(2)}, // i16
{INTEGER_ALIGN, 32, Align(4), Align(4)}, // i32
{INTEGER_ALIGN, 64, Align(4), Align(8)}, // i64
- {FLOAT_ALIGN, 16, Align(2), Align(2)}, // half
+ {FLOAT_ALIGN, 16, Align(2), Align(2)}, // half, bfloat
{FLOAT_ALIGN, 32, Align(4), Align(4)}, // float
{FLOAT_ALIGN, 64, Align(8), Align(8)}, // double
{FLOAT_ALIGN, 128, Align(16), Align(16)}, // ppcf128, quad, ...
@@ -229,7 +231,7 @@ static unsigned getAddrSpace(StringRef R) {
}
void DataLayout::parseSpecifier(StringRef Desc) {
- StringRepresentation = Desc;
+ StringRepresentation = std::string(Desc);
while (!Desc.empty()) {
// Split at '-'.
std::pair<StringRef, StringRef> Split = split(Desc, '-');
@@ -260,8 +262,8 @@ void DataLayout::parseSpecifier(StringRef Desc) {
switch (Specifier) {
case 's':
- // Ignored for backward compatibility.
- // FIXME: remove this on LLVM 4.0.
+ // Deprecated, but ignoring here to preserve loading older textual llvm
+ // ASM file
break;
case 'E':
BigEndian = true;
@@ -444,6 +446,9 @@ void DataLayout::parseSpecifier(StringRef Desc) {
case 'x':
ManglingMode = MM_WinCOFFX86;
break;
+ case 'a':
+ ManglingMode = MM_XCOFF;
+ break;
}
break;
default:
@@ -559,7 +564,10 @@ Align DataLayout::getAlignmentInfo(AlignTypeEnum AlignType, uint32_t BitWidth,
// with what clang and llvm-gcc do.
unsigned Alignment =
getTypeAllocSize(cast<VectorType>(Ty)->getElementType());
- Alignment *= cast<VectorType>(Ty)->getNumElements();
+ // We're only calculating a natural alignment, so it doesn't have to be
+ // based on the full size for scalable vectors. Using the minimum element
+ // count should be enough here.
+ Alignment *= cast<VectorType>(Ty)->getElementCount().Min;
Alignment = PowerOf2Ceil(Alignment);
return Align(Alignment);
}
@@ -718,7 +726,7 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
case Type::StructTyID: {
// Packed structure types always have an ABI alignment of one.
if (cast<StructType>(Ty)->isPacked() && abi_or_pref)
- return Align::None();
+ return Align(1);
// Get the layout annotation... which is lazily created on demand.
const StructLayout *Layout = getStructLayout(cast<StructType>(Ty));
@@ -729,6 +737,7 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
AlignType = INTEGER_ALIGN;
break;
case Type::HalfTyID:
+ case Type::BFloatTyID:
case Type::FloatTyID:
case Type::DoubleTyID:
// PPC_FP128TyID and FP128TyID have different data contents, but the
@@ -739,7 +748,8 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
AlignType = FLOAT_ALIGN;
break;
case Type::X86_MMXTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
AlignType = VECTOR_ALIGN;
break;
default:
@@ -752,8 +762,13 @@ Align DataLayout::getAlignment(Type *Ty, bool abi_or_pref) const {
abi_or_pref, Ty);
}
+/// TODO: Remove this function once the transition to Align is over.
unsigned DataLayout::getABITypeAlignment(Type *Ty) const {
- return getAlignment(Ty, true).value();
+ return getABITypeAlign(Ty).value();
+}
+
+Align DataLayout::getABITypeAlign(Type *Ty) const {
+ return getAlignment(Ty, true);
}
/// getABIIntegerTypeAlignment - Return the minimum ABI-required alignment for
@@ -762,8 +777,13 @@ Align DataLayout::getABIIntegerTypeAlignment(unsigned BitWidth) const {
return getAlignmentInfo(INTEGER_ALIGN, BitWidth, true, nullptr);
}
+/// TODO: Remove this function once the transition to Align is over.
unsigned DataLayout::getPrefTypeAlignment(Type *Ty) const {
- return getAlignment(Ty, false).value();
+ return getPrefTypeAlign(Ty).value();
+}
+
+Align DataLayout::getPrefTypeAlign(Type *Ty) const {
+ return getAlignment(Ty, false);
}
IntegerType *DataLayout::getIntPtrType(LLVMContext &C,
@@ -777,7 +797,7 @@ Type *DataLayout::getIntPtrType(Type *Ty) const {
unsigned NumBits = getPointerTypeSizeInBits(Ty);
IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
- return VectorType::get(IntTy, VecTy->getNumElements());
+ return VectorType::get(IntTy, VecTy);
return IntTy;
}
@@ -799,7 +819,7 @@ Type *DataLayout::getIndexType(Type *Ty) const {
unsigned NumBits = getIndexTypeSizeInBits(Ty);
IntegerType *IntTy = IntegerType::get(Ty->getContext(), NumBits);
if (VectorType *VecTy = dyn_cast<VectorType>(Ty))
- return VectorType::get(IntTy, VecTy->getNumElements());
+ return VectorType::get(IntTy, VecTy);
return IntTy;
}
@@ -831,15 +851,14 @@ int64_t DataLayout::getIndexedOffsetInType(Type *ElemTy,
return Result;
}
-/// getPreferredAlignment - Return the preferred alignment of the specified
-/// global. This includes an explicitly requested alignment (if the global
-/// has one).
-unsigned DataLayout::getPreferredAlignment(const GlobalVariable *GV) const {
- unsigned GVAlignment = GV->getAlignment();
+/// getPreferredAlign - Return the preferred alignment of the specified global.
+/// This includes an explicitly requested alignment (if the global has one).
+Align DataLayout::getPreferredAlign(const GlobalVariable *GV) const {
+ MaybeAlign GVAlignment = GV->getAlign();
// If a section is specified, always precisely honor explicit alignment,
// so we don't insert padding into a section we don't control.
if (GVAlignment && GV->hasSection())
- return GVAlignment;
+ return *GVAlignment;
// If no explicit alignment is specified, compute the alignment based on
// the IR type. If an alignment is specified, increase it to match the ABI
@@ -848,30 +867,24 @@ unsigned DataLayout::getPreferredAlignment(const GlobalVariable *GV) const {
// FIXME: Not sure it makes sense to use the alignment of the type if
// there's already an explicit alignment specification.
Type *ElemType = GV->getValueType();
- unsigned Alignment = getPrefTypeAlignment(ElemType);
- if (GVAlignment >= Alignment) {
- Alignment = GVAlignment;
- } else if (GVAlignment != 0) {
- Alignment = std::max(GVAlignment, getABITypeAlignment(ElemType));
+ Align Alignment = getPrefTypeAlign(ElemType);
+ if (GVAlignment) {
+ if (*GVAlignment >= Alignment)
+ Alignment = *GVAlignment;
+ else
+ Alignment = std::max(*GVAlignment, getABITypeAlign(ElemType));
}
// If no explicit alignment is specified, and the global is large, increase
// the alignment to 16.
// FIXME: Why 16, specifically?
- if (GV->hasInitializer() && GVAlignment == 0) {
- if (Alignment < 16) {
+ if (GV->hasInitializer() && !GVAlignment) {
+ if (Alignment < Align(16)) {
// If the global is not external, see if it is large. If so, give it a
// larger alignment.
if (getTypeSizeInBits(ElemType) > 128)
- Alignment = 16; // 16-byte alignment.
+ Alignment = Align(16); // 16-byte alignment.
}
}
return Alignment;
}
-
-/// getPreferredAlignmentLog - Return the preferred alignment of the
-/// specified global, returned in log form. This includes an explicitly
-/// requested alignment (if the global has one).
-unsigned DataLayout::getPreferredAlignmentLog(const GlobalVariable *GV) const {
- return Log2_32(getPreferredAlignment(GV));
-}
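
The intended migration, as assumed here, is for callers to switch to the Align-returning accessors and keep raw integers only at the edges. A small sketch for a fixed-size type:

#include "llvm/IR/DataLayout.h"
#include "llvm/Support/Alignment.h"
using namespace llvm;

// A typed Align can never be zero, so callers drop the old "0 means
// unknown" convention of the unsigned interface; alignTo accepts Align
// directly.
uint64_t paddedAllocSize(const DataLayout &DL, Type *Ty) {
  Align A = DL.getABITypeAlign(Ty);
  return alignTo(DL.getTypeAllocSize(Ty), A);
}
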
diff --git a/llvm/lib/IR/DebugInfo.cpp b/llvm/lib/IR/DebugInfo.cpp
index fe8311923109..190b220dc9aa 100644
--- a/llvm/lib/IR/DebugInfo.cpp
+++ b/llvm/lib/IR/DebugInfo.cpp
@@ -105,10 +105,8 @@ void DebugInfoFinder::processCompileUnit(DICompileUnit *CU) {
void DebugInfoFinder::processInstruction(const Module &M,
const Instruction &I) {
- if (auto *DDI = dyn_cast<DbgDeclareInst>(&I))
- processDeclare(M, DDI);
- else if (auto *DVI = dyn_cast<DbgValueInst>(&I))
- processValue(M, DVI);
+ if (auto *DVI = dyn_cast<DbgVariableIntrinsic>(&I))
+ processVariable(M, *DVI);
if (auto DbgLoc = I.getDebugLoc())
processLocation(M, DbgLoc.get());
@@ -194,24 +192,9 @@ void DebugInfoFinder::processSubprogram(DISubprogram *SP) {
}
}
-void DebugInfoFinder::processDeclare(const Module &M,
- const DbgDeclareInst *DDI) {
- auto *N = dyn_cast<MDNode>(DDI->getVariable());
- if (!N)
- return;
-
- auto *DV = dyn_cast<DILocalVariable>(N);
- if (!DV)
- return;
-
- if (!NodesSeen.insert(DV).second)
- return;
- processScope(DV->getScope());
- processType(DV->getType());
-}
-
-void DebugInfoFinder::processValue(const Module &M, const DbgValueInst *DVI) {
- auto *N = dyn_cast<MDNode>(DVI->getVariable());
+void DebugInfoFinder::processVariable(const Module &M,
+ const DbgVariableIntrinsic &DVI) {
+ auto *N = dyn_cast<MDNode>(DVI.getVariable());
if (!N)
return;
@@ -278,6 +261,41 @@ bool DebugInfoFinder::addScope(DIScope *Scope) {
return true;
}
+static MDNode *updateLoopMetadataDebugLocationsImpl(
+ MDNode *OrigLoopID,
+ function_ref<DILocation *(const DILocation &)> Updater) {
+ assert(OrigLoopID && OrigLoopID->getNumOperands() > 0 &&
+ "Loop ID needs at least one operand");
+ assert(OrigLoopID && OrigLoopID->getOperand(0).get() == OrigLoopID &&
+ "Loop ID should refer to itself");
+
+ // Save space for the self-referential LoopID.
+ SmallVector<Metadata *, 4> MDs = {nullptr};
+
+ for (unsigned i = 1; i < OrigLoopID->getNumOperands(); ++i) {
+ Metadata *MD = OrigLoopID->getOperand(i);
+ if (DILocation *DL = dyn_cast<DILocation>(MD)) {
+ if (DILocation *NewDL = Updater(*DL))
+ MDs.push_back(NewDL);
+ } else
+ MDs.push_back(MD);
+ }
+
+ MDNode *NewLoopID = MDNode::getDistinct(OrigLoopID->getContext(), MDs);
+ // Insert the self-referential LoopID.
+ NewLoopID->replaceOperandWith(0, NewLoopID);
+ return NewLoopID;
+}
+
+void llvm::updateLoopMetadataDebugLocations(
+ Instruction &I, function_ref<DILocation *(const DILocation &)> Updater) {
+ MDNode *OrigLoopID = I.getMetadata(LLVMContext::MD_loop);
+ if (!OrigLoopID)
+ return;
+ MDNode *NewLoopID = updateLoopMetadataDebugLocationsImpl(OrigLoopID, Updater);
+ I.setMetadata(LLVMContext::MD_loop, NewLoopID);
+}
+
static MDNode *stripDebugLocFromLoopID(MDNode *N) {
assert(!N->operands().empty() && "Missing self reference?");
@@ -294,20 +312,10 @@ static MDNode *stripDebugLocFromLoopID(MDNode *N) {
}))
return nullptr;
- SmallVector<Metadata *, 4> Args;
- // Reserve operand 0 for loop id self reference.
- auto TempNode = MDNode::getTemporary(N->getContext(), None);
- Args.push_back(TempNode.get());
- // Add all non-debug location operands back.
- for (auto Op = N->op_begin() + 1; Op != N->op_end(); Op++) {
- if (!isa<DILocation>(*Op))
- Args.push_back(*Op);
- }
-
- // Set the first operand to itself.
- MDNode *LoopID = MDNode::get(N->getContext(), Args);
- LoopID->replaceOperandWith(0, LoopID);
- return LoopID;
+ auto dropDebugLoc = [](const DILocation &) -> DILocation * {
+ return nullptr;
+ };
+ return updateLoopMetadataDebugLocationsImpl(N, dropDebugLoc);
}
bool llvm::stripDebugInfo(Function &F) {
@@ -489,7 +497,7 @@ private:
RetainedTypes, GlobalVariables, ImportedEntities, CU->getMacros(),
CU->getDWOId(), CU->getSplitDebugInlining(),
CU->getDebugInfoForProfiling(), CU->getNameTableKind(),
- CU->getRangesBaseAddress());
+ CU->getRangesBaseAddress(), CU->getSysRoot(), CU->getSDK());
}
DILocation *getReplacementMDLocation(DILocation *MLD) {
@@ -598,7 +606,9 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
Changed = true;
}
};
+ RemoveUses("llvm.dbg.addr");
RemoveUses("llvm.dbg.declare");
+ RemoveUses("llvm.dbg.label");
RemoveUses("llvm.dbg.value");
// Delete non-CU debug info named metadata nodes.
@@ -637,7 +647,7 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
}
for (auto &BB : F) {
for (auto &I : BB) {
- auto remapDebugLoc = [&](DebugLoc DL) -> DebugLoc {
+ auto remapDebugLoc = [&](const DebugLoc &DL) -> DebugLoc {
auto *Scope = DL.getScope();
MDNode *InlinedAt = DL.getInlinedAt();
Scope = remap(Scope);
@@ -648,15 +658,10 @@ bool llvm::stripNonLineTableDebugInfo(Module &M) {
if (I.getDebugLoc() != DebugLoc())
I.setDebugLoc(remapDebugLoc(I.getDebugLoc()));
- // Remap DILocations in untyped MDNodes (e.g., llvm.loop).
- SmallVector<std::pair<unsigned, MDNode *>, 2> MDs;
- I.getAllMetadata(MDs);
- for (auto Attachment : MDs)
- if (auto *T = dyn_cast_or_null<MDTuple>(Attachment.second))
- for (unsigned N = 0; N < T->getNumOperands(); ++N)
- if (auto *Loc = dyn_cast_or_null<DILocation>(T->getOperand(N)))
- if (Loc != DebugLoc())
- T->replaceOperandWith(N, remapDebugLoc(Loc));
+ // Remap DILocations in llvm.loop attachments.
+ updateLoopMetadataDebugLocations(I, [&](const DILocation &Loc) {
+ return remapDebugLoc(&Loc).get();
+ });
}
}
}
@@ -757,16 +762,18 @@ LLVMMetadataRef LLVMDIBuilderCreateCompileUnit(
LLVMBool isOptimized, const char *Flags, size_t FlagsLen,
unsigned RuntimeVer, const char *SplitName, size_t SplitNameLen,
LLVMDWARFEmissionKind Kind, unsigned DWOId, LLVMBool SplitDebugInlining,
- LLVMBool DebugInfoForProfiling) {
+ LLVMBool DebugInfoForProfiling, const char *SysRoot, size_t SysRootLen,
+ const char *SDK, size_t SDKLen) {
auto File = unwrapDI<DIFile>(FileRef);
return wrap(unwrap(Builder)->createCompileUnit(
- map_from_llvmDWARFsourcelanguage(Lang), File,
- StringRef(Producer, ProducerLen), isOptimized,
- StringRef(Flags, FlagsLen), RuntimeVer,
- StringRef(SplitName, SplitNameLen),
- static_cast<DICompileUnit::DebugEmissionKind>(Kind), DWOId,
- SplitDebugInlining, DebugInfoForProfiling));
+ map_from_llvmDWARFsourcelanguage(Lang), File,
+ StringRef(Producer, ProducerLen), isOptimized, StringRef(Flags, FlagsLen),
+ RuntimeVer, StringRef(SplitName, SplitNameLen),
+ static_cast<DICompileUnit::DebugEmissionKind>(Kind), DWOId,
+ SplitDebugInlining, DebugInfoForProfiling,
+ DICompileUnit::DebugNameTableKind::Default, false,
+ StringRef(SysRoot, SysRootLen), StringRef(SDK, SDKLen)));
}
LLVMMetadataRef
@@ -782,12 +789,12 @@ LLVMDIBuilderCreateModule(LLVMDIBuilderRef Builder, LLVMMetadataRef ParentScope,
const char *Name, size_t NameLen,
const char *ConfigMacros, size_t ConfigMacrosLen,
const char *IncludePath, size_t IncludePathLen,
- const char *SysRoot, size_t SysRootLen) {
+ const char *APINotesFile, size_t APINotesFileLen) {
return wrap(unwrap(Builder)->createModule(
unwrapDI<DIScope>(ParentScope), StringRef(Name, NameLen),
StringRef(ConfigMacros, ConfigMacrosLen),
StringRef(IncludePath, IncludePathLen),
- StringRef(SysRoot, SysRootLen)));
+ StringRef(APINotesFile, APINotesFileLen)));
}
LLVMMetadataRef LLVMDIBuilderCreateNameSpace(LLVMDIBuilderRef Builder,
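
The new updateLoopMetadataDebugLocations hook generalizes the stripping logic above: a callback rewrites each DILocation in an instruction's llvm.loop attachment, and returning nullptr drops that location while keeping the non-location operands. A short usage sketch (assuming the declaration lives in DebugInfo.h alongside the definition):

#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Discard all debug locations from I's llvm.loop metadata, preserving the
// remaining loop hints, as stripDebugLocFromLoopID does.
void dropLoopLocations(Instruction &I) {
  updateLoopMetadataDebugLocations(
      I, [](const DILocation &) -> DILocation * { return nullptr; });
}
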
diff --git a/llvm/lib/IR/DebugInfoMetadata.cpp b/llvm/lib/IR/DebugInfoMetadata.cpp
index d3ecd9b0e03d..110d94116f10 100644
--- a/llvm/lib/IR/DebugInfoMetadata.cpp
+++ b/llvm/lib/IR/DebugInfoMetadata.cpp
@@ -75,6 +75,21 @@ DILocation *DILocation::getImpl(LLVMContext &Context, unsigned Line,
Storage, Context.pImpl->DILocations);
}
+const
+DILocation *DILocation::getMergedLocations(ArrayRef<const DILocation *> Locs) {
+ if (Locs.empty())
+ return nullptr;
+ if (Locs.size() == 1)
+ return Locs[0];
+ auto *Merged = Locs[0];
+ for (auto I = std::next(Locs.begin()), E = Locs.end(); I != E; ++I) {
+ Merged = getMergedLocation(Merged, *I);
+ if (Merged == nullptr)
+ break;
+ }
+ return Merged;
+}
+
const DILocation *DILocation::getMergedLocation(const DILocation *LocA,
const DILocation *LocB) {
if (!LocA || !LocB)
@@ -321,18 +336,106 @@ DISubrange *DISubrange::getImpl(LLVMContext &Context, int64_t Count, int64_t Lo,
StorageType Storage, bool ShouldCreate) {
auto *CountNode = ConstantAsMetadata::get(
ConstantInt::getSigned(Type::getInt64Ty(Context), Count));
- return getImpl(Context, CountNode, Lo, Storage, ShouldCreate);
+ auto *LB = ConstantAsMetadata::get(
+ ConstantInt::getSigned(Type::getInt64Ty(Context), Lo));
+ return getImpl(Context, CountNode, LB, nullptr, nullptr, Storage,
+ ShouldCreate);
}
DISubrange *DISubrange::getImpl(LLVMContext &Context, Metadata *CountNode,
int64_t Lo, StorageType Storage,
bool ShouldCreate) {
- DEFINE_GETIMPL_LOOKUP(DISubrange, (CountNode, Lo));
- Metadata *Ops[] = { CountNode };
- DEFINE_GETIMPL_STORE(DISubrange, (CountNode, Lo), Ops);
+ auto *LB = ConstantAsMetadata::get(
+ ConstantInt::getSigned(Type::getInt64Ty(Context), Lo));
+ return getImpl(Context, CountNode, LB, nullptr, nullptr, Storage,
+ ShouldCreate);
+}
+
+DISubrange *DISubrange::getImpl(LLVMContext &Context, Metadata *CountNode,
+ Metadata *LB, Metadata *UB, Metadata *Stride,
+ StorageType Storage, bool ShouldCreate) {
+ DEFINE_GETIMPL_LOOKUP(DISubrange, (CountNode, LB, UB, Stride));
+ Metadata *Ops[] = {CountNode, LB, UB, Stride};
+ DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DISubrange, Ops);
+}
+
+DISubrange::CountType DISubrange::getCount() const {
+ if (!getRawCountNode())
+ return CountType();
+
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(getRawCountNode()))
+ return CountType(cast<ConstantInt>(MD->getValue()));
+
+ if (auto *DV = dyn_cast<DIVariable>(getRawCountNode()))
+ return CountType(DV);
+
+ return CountType();
+}
+
+DISubrange::BoundType DISubrange::getLowerBound() const {
+ Metadata *LB = getRawLowerBound();
+ if (!LB)
+ return BoundType();
+
+ assert((isa<ConstantAsMetadata>(LB) || isa<DIVariable>(LB) ||
+ isa<DIExpression>(LB)) &&
+ "LowerBound must be signed constant or DIVariable or DIExpression");
+
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(LB))
+ return BoundType(cast<ConstantInt>(MD->getValue()));
+
+ if (auto *MD = dyn_cast<DIVariable>(LB))
+ return BoundType(MD);
+
+ if (auto *MD = dyn_cast<DIExpression>(LB))
+ return BoundType(MD);
+
+ return BoundType();
+}
+
+DISubrange::BoundType DISubrange::getUpperBound() const {
+ Metadata *UB = getRawUpperBound();
+ if (!UB)
+ return BoundType();
+
+ assert((isa<ConstantAsMetadata>(UB) || isa<DIVariable>(UB) ||
+ isa<DIExpression>(UB)) &&
+ "UpperBound must be signed constant or DIVariable or DIExpression");
+
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(UB))
+ return BoundType(cast<ConstantInt>(MD->getValue()));
+
+ if (auto *MD = dyn_cast<DIVariable>(UB))
+ return BoundType(MD);
+
+ if (auto *MD = dyn_cast<DIExpression>(UB))
+ return BoundType(MD);
+
+ return BoundType();
}
-DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, int64_t Value,
+DISubrange::BoundType DISubrange::getStride() const {
+ Metadata *ST = getRawStride();
+ if (!ST)
+ return BoundType();
+
+ assert((isa<ConstantAsMetadata>(ST) || isa<DIVariable>(ST) ||
+ isa<DIExpression>(ST)) &&
+ "Stride must be signed constant or DIVariable or DIExpression");
+
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(ST))
+ return BoundType(cast<ConstantInt>(MD->getValue()));
+
+ if (auto *MD = dyn_cast<DIVariable>(ST))
+ return BoundType(MD);
+
+ if (auto *MD = dyn_cast<DIExpression>(ST))
+ return BoundType(MD);
+
+ return BoundType();
+}
+
+DIEnumerator *DIEnumerator::getImpl(LLVMContext &Context, const APInt &Value,
bool IsUnsigned, MDString *Name,
StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
@@ -390,17 +493,18 @@ DICompositeType *DICompositeType::getImpl(
uint32_t AlignInBits, uint64_t OffsetInBits, DIFlags Flags,
Metadata *Elements, unsigned RuntimeLang, Metadata *VTableHolder,
Metadata *TemplateParams, MDString *Identifier, Metadata *Discriminator,
- StorageType Storage, bool ShouldCreate) {
+ Metadata *DataLocation, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
// Keep this in sync with buildODRType.
- DEFINE_GETIMPL_LOOKUP(
- DICompositeType, (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
- AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, Identifier, Discriminator));
- Metadata *Ops[] = {File, Scope, Name, BaseType,
- Elements, VTableHolder, TemplateParams, Identifier,
- Discriminator};
+ DEFINE_GETIMPL_LOOKUP(DICompositeType,
+ (Tag, Name, File, Line, Scope, BaseType, SizeInBits,
+ AlignInBits, OffsetInBits, Flags, Elements,
+ RuntimeLang, VTableHolder, TemplateParams, Identifier,
+ Discriminator, DataLocation));
+ Metadata *Ops[] = {File, Scope, Name, BaseType,
+ Elements, VTableHolder, TemplateParams, Identifier,
+ Discriminator, DataLocation};
DEFINE_GETIMPL_STORE(DICompositeType, (Tag, Line, RuntimeLang, SizeInBits,
AlignInBits, OffsetInBits, Flags),
Ops);
@@ -411,7 +515,8 @@ DICompositeType *DICompositeType::buildODRType(
Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType,
uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
- Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator) {
+ Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator,
+ Metadata *DataLocation) {
assert(!Identifier.getString().empty() && "Expected valid identifier");
if (!Context.isODRUniquingDebugTypes())
return nullptr;
@@ -420,7 +525,8 @@ DICompositeType *DICompositeType::buildODRType(
return CT = DICompositeType::getDistinct(
Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang,
- VTableHolder, TemplateParams, &Identifier, Discriminator);
+ VTableHolder, TemplateParams, &Identifier, Discriminator,
+ DataLocation);
// Only mutate CT if it's a forward declaration and the new operands aren't.
assert(CT->getRawIdentifier() == &Identifier && "Wrong ODR identifier?");
@@ -430,9 +536,9 @@ DICompositeType *DICompositeType::buildODRType(
// Mutate CT in place. Keep this in sync with getImpl.
CT->mutate(Tag, Line, RuntimeLang, SizeInBits, AlignInBits, OffsetInBits,
Flags);
- Metadata *Ops[] = {File, Scope, Name, BaseType,
- Elements, VTableHolder, TemplateParams, &Identifier,
- Discriminator};
+ Metadata *Ops[] = {File, Scope, Name, BaseType,
+ Elements, VTableHolder, TemplateParams, &Identifier,
+ Discriminator, DataLocation};
assert((std::end(Ops) - std::begin(Ops)) == (int)CT->getNumOperands() &&
"Mismatched number of operands");
for (unsigned I = 0, E = CT->getNumOperands(); I != E; ++I)
@@ -446,7 +552,8 @@ DICompositeType *DICompositeType::getODRType(
Metadata *File, unsigned Line, Metadata *Scope, Metadata *BaseType,
uint64_t SizeInBits, uint32_t AlignInBits, uint64_t OffsetInBits,
DIFlags Flags, Metadata *Elements, unsigned RuntimeLang,
- Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator) {
+ Metadata *VTableHolder, Metadata *TemplateParams, Metadata *Discriminator,
+ Metadata *DataLocation) {
assert(!Identifier.getString().empty() && "Expected valid identifier");
if (!Context.isODRUniquingDebugTypes())
return nullptr;
@@ -455,7 +562,7 @@ DICompositeType *DICompositeType::getODRType(
CT = DICompositeType::getDistinct(
Context, Tag, Name, File, Line, Scope, BaseType, SizeInBits,
AlignInBits, OffsetInBits, Flags, Elements, RuntimeLang, VTableHolder,
- TemplateParams, &Identifier, Discriminator);
+ TemplateParams, &Identifier, Discriminator, DataLocation);
return CT;
}
@@ -479,8 +586,9 @@ DISubroutineType *DISubroutineType::getImpl(LLVMContext &Context, DIFlags Flags,
// FIXME: Implement this string-enum correspondence with a .def file and macros,
// so that the association is explicit rather than implied.
static const char *ChecksumKindName[DIFile::CSK_Last] = {
- "CSK_MD5",
- "CSK_SHA1"
+ "CSK_MD5",
+ "CSK_SHA1",
+ "CSK_SHA256",
};
StringRef DIFile::getChecksumKindAsString(ChecksumKind CSKind) {
@@ -495,6 +603,7 @@ Optional<DIFile::ChecksumKind> DIFile::getChecksumKind(StringRef CSKindStr) {
return StringSwitch<Optional<DIFile::ChecksumKind>>(CSKindStr)
.Case("CSK_MD5", DIFile::CSK_MD5)
.Case("CSK_SHA1", DIFile::CSK_SHA1)
+ .Case("CSK_SHA256", DIFile::CSK_SHA256)
.Default(None);
}
@@ -520,17 +629,24 @@ DICompileUnit *DICompileUnit::getImpl(
unsigned EmissionKind, Metadata *EnumTypes, Metadata *RetainedTypes,
Metadata *GlobalVariables, Metadata *ImportedEntities, Metadata *Macros,
uint64_t DWOId, bool SplitDebugInlining, bool DebugInfoForProfiling,
- unsigned NameTableKind, bool RangesBaseAddress, StorageType Storage,
- bool ShouldCreate) {
+ unsigned NameTableKind, bool RangesBaseAddress, MDString *SysRoot,
+ MDString *SDK, StorageType Storage, bool ShouldCreate) {
assert(Storage != Uniqued && "Cannot unique DICompileUnit");
assert(isCanonical(Producer) && "Expected canonical MDString");
assert(isCanonical(Flags) && "Expected canonical MDString");
assert(isCanonical(SplitDebugFilename) && "Expected canonical MDString");
- Metadata *Ops[] = {
- File, Producer, Flags, SplitDebugFilename,
- EnumTypes, RetainedTypes, GlobalVariables, ImportedEntities,
- Macros};
+ Metadata *Ops[] = {File,
+ Producer,
+ Flags,
+ SplitDebugFilename,
+ EnumTypes,
+ RetainedTypes,
+ GlobalVariables,
+ ImportedEntities,
+ Macros,
+ SysRoot,
+ SDK};
return storeImpl(new (array_lengthof(Ops)) DICompileUnit(
Context, Storage, SourceLanguage, IsOptimized,
RuntimeVersion, EmissionKind, DWOId, SplitDebugInlining,
@@ -660,12 +776,7 @@ DISubprogram *DISubprogram::getImpl(
bool DISubprogram::describes(const Function *F) const {
assert(F && "Invalid function");
- if (F->getSubprogram() == this)
- return true;
- StringRef Name = getLinkageName();
- if (Name.empty())
- Name = getName();
- return F->getName() == Name;
+ return F->getSubprogram() == this;
}
DILexicalBlock *DILexicalBlock::getImpl(LLVMContext &Context, Metadata *Scope,
@@ -713,35 +824,38 @@ DICommonBlock *DICommonBlock::getImpl(LLVMContext &Context, Metadata *Scope,
DEFINE_GETIMPL_STORE(DICommonBlock, (LineNo), Ops);
}
-DIModule *DIModule::getImpl(LLVMContext &Context, Metadata *Scope,
- MDString *Name, MDString *ConfigurationMacros,
- MDString *IncludePath, MDString *SysRoot,
- StorageType Storage, bool ShouldCreate) {
+DIModule *DIModule::getImpl(LLVMContext &Context, Metadata *File,
+ Metadata *Scope, MDString *Name,
+ MDString *ConfigurationMacros,
+ MDString *IncludePath, MDString *APINotesFile,
+ unsigned LineNo, StorageType Storage,
+ bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(
- DIModule, (Scope, Name, ConfigurationMacros, IncludePath, SysRoot));
- Metadata *Ops[] = {Scope, Name, ConfigurationMacros, IncludePath, SysRoot};
- DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DIModule, Ops);
+ DEFINE_GETIMPL_LOOKUP(DIModule, (File, Scope, Name, ConfigurationMacros,
+ IncludePath, APINotesFile, LineNo));
+ Metadata *Ops[] = {File, Scope, Name, ConfigurationMacros,
+ IncludePath, APINotesFile};
+ DEFINE_GETIMPL_STORE(DIModule, (LineNo), Ops);
}
-DITemplateTypeParameter *DITemplateTypeParameter::getImpl(LLVMContext &Context,
- MDString *Name,
- Metadata *Type,
- StorageType Storage,
- bool ShouldCreate) {
+DITemplateTypeParameter *
+DITemplateTypeParameter::getImpl(LLVMContext &Context, MDString *Name,
+ Metadata *Type, bool isDefault,
+ StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DITemplateTypeParameter, (Name, Type));
+ DEFINE_GETIMPL_LOOKUP(DITemplateTypeParameter, (Name, Type, isDefault));
Metadata *Ops[] = {Name, Type};
- DEFINE_GETIMPL_STORE_NO_CONSTRUCTOR_ARGS(DITemplateTypeParameter, Ops);
+ DEFINE_GETIMPL_STORE(DITemplateTypeParameter, (isDefault), Ops);
}
DITemplateValueParameter *DITemplateValueParameter::getImpl(
LLVMContext &Context, unsigned Tag, MDString *Name, Metadata *Type,
- Metadata *Value, StorageType Storage, bool ShouldCreate) {
+ bool isDefault, Metadata *Value, StorageType Storage, bool ShouldCreate) {
assert(isCanonical(Name) && "Expected canonical MDString");
- DEFINE_GETIMPL_LOOKUP(DITemplateValueParameter, (Tag, Name, Type, Value));
+ DEFINE_GETIMPL_LOOKUP(DITemplateValueParameter,
+ (Tag, Name, Type, isDefault, Value));
Metadata *Ops[] = {Name, Type, Value};
- DEFINE_GETIMPL_STORE(DITemplateValueParameter, (Tag), Ops);
+ DEFINE_GETIMPL_STORE(DITemplateValueParameter, (Tag, isDefault), Ops);
}
DIGlobalVariable *
@@ -925,6 +1039,7 @@ bool DIExpression::isValid() const {
case dwarf::DW_OP_dup:
case dwarf::DW_OP_regx:
case dwarf::DW_OP_bregx:
+ case dwarf::DW_OP_push_object_address:
break;
}
}
@@ -1107,7 +1222,9 @@ DIExpression *DIExpression::append(const DIExpression *Expr,
}
NewOps.append(Ops.begin(), Ops.end());
- return DIExpression::get(Expr->getContext(), NewOps);
+ auto *result = DIExpression::get(Expr->getContext(), NewOps);
+ assert(result->isValid() && "concatenated expression is not valid");
+ return result;
}
DIExpression *DIExpression::appendToStack(const DIExpression *Expr,
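
DILocation::getMergedLocations is a convenience over the pairwise getMergedLocation when several instructions are folded into one; it degrades to nullptr as soon as two locations cannot be merged. A hedged sketch of typical use (helper name illustrative):

#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Give the combined instruction a debug location that conservatively
// represents all of its sources.
void mergeDebugLocs(Instruction &New, ArrayRef<Instruction *> Sources) {
  SmallVector<const DILocation *, 4> Locs;
  for (Instruction *I : Sources)
    Locs.push_back(I->getDebugLoc());
  New.setDebugLoc(DILocation::getMergedLocations(Locs));
}
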
diff --git a/llvm/lib/IR/DebugLoc.cpp b/llvm/lib/IR/DebugLoc.cpp
index 14d1396f1543..e945cbcba782 100644
--- a/llvm/lib/IR/DebugLoc.cpp
+++ b/llvm/lib/IR/DebugLoc.cpp
@@ -79,7 +79,7 @@ DebugLoc DebugLoc::get(unsigned Line, unsigned Col, const MDNode *Scope,
const_cast<MDNode *>(InlinedAt), ImplicitCode);
}
-DebugLoc DebugLoc::appendInlinedAt(DebugLoc DL, DILocation *InlinedAt,
+DebugLoc DebugLoc::appendInlinedAt(const DebugLoc &DL, DILocation *InlinedAt,
LLVMContext &Ctx,
DenseMap<const MDNode *, MDNode *> &Cache,
bool ReplaceLast) {
diff --git a/llvm/lib/IR/DiagnosticInfo.cpp b/llvm/lib/IR/DiagnosticInfo.cpp
index 99d5aec3f043..6528c723fbfa 100644
--- a/llvm/lib/IR/DiagnosticInfo.cpp
+++ b/llvm/lib/IR/DiagnosticInfo.cpp
@@ -119,7 +119,7 @@ DiagnosticLocation::DiagnosticLocation(const DebugLoc &DL) {
DiagnosticLocation::DiagnosticLocation(const DISubprogram *SP) {
if (!SP)
return;
-
+
File = SP->getFile();
Line = SP->getScopeLine();
Column = 0;
@@ -132,7 +132,7 @@ StringRef DiagnosticLocation::getRelativePath() const {
std::string DiagnosticLocation::getAbsolutePath() const {
StringRef Name = File->getFilename();
if (sys::path::is_absolute(Name))
- return Name;
+ return std::string(Name);
SmallString<128> Path;
sys::path::append(Path, File->getDirectory(), Name);
@@ -160,8 +160,9 @@ const std::string DiagnosticInfoWithLocationBase::getLocationStr() const {
return (Filename + ":" + Twine(Line) + ":" + Twine(Column)).str();
}
-DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V)
- : Key(Key) {
+DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
+ const Value *V)
+ : Key(std::string(Key)) {
if (auto *F = dyn_cast<Function>(V)) {
if (DISubprogram *SP = F->getSubprogram())
Loc = SP;
@@ -172,7 +173,7 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V
// Only include names that correspond to user variables. FIXME: We should use
// debug info if available to get the name of the user variable.
if (isa<llvm::Argument>(V) || isa<GlobalValue>(V))
- Val = GlobalValue::dropLLVMManglingEscape(V->getName());
+ Val = std::string(GlobalValue::dropLLVMManglingEscape(V->getName()));
else if (isa<Constant>(V)) {
raw_string_ostream OS(Val);
V->printAsOperand(OS, /*PrintType=*/false);
@@ -181,39 +182,39 @@ DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Value *V
}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, const Type *T)
- : Key(Key) {
+ : Key(std::string(Key)) {
raw_string_ostream OS(Val);
OS << *T;
}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, StringRef S)
- : Key(Key), Val(S.str()) {}
+ : Key(std::string(Key)), Val(S.str()) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, int N)
- : Key(Key), Val(itostr(N)) {}
+ : Key(std::string(Key)), Val(itostr(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, float N)
- : Key(Key), Val(llvm::to_string(N)) {}
+ : Key(std::string(Key)), Val(llvm::to_string(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, long N)
- : Key(Key), Val(itostr(N)) {}
+ : Key(std::string(Key)), Val(itostr(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, long long N)
- : Key(Key), Val(itostr(N)) {}
+ : Key(std::string(Key)), Val(itostr(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, unsigned N)
- : Key(Key), Val(utostr(N)) {}
+ : Key(std::string(Key)), Val(utostr(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
unsigned long N)
- : Key(Key), Val(utostr(N)) {}
+ : Key(std::string(Key)), Val(utostr(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key,
unsigned long long N)
- : Key(Key), Val(utostr(N)) {}
+ : Key(std::string(Key)), Val(utostr(N)) {}
DiagnosticInfoOptimizationBase::Argument::Argument(StringRef Key, DebugLoc Loc)
- : Key(Key), Loc(Loc) {
+ : Key(std::string(Key)), Loc(Loc) {
if (Loc) {
Val = (Loc->getFilename() + ":" + Twine(Loc.getLine()) + ":" +
Twine(Loc.getCol())).str();
@@ -243,11 +244,8 @@ OptimizationRemark::OptimizationRemark(const char *PassName,
RemarkName, *Inst->getParent()->getParent(),
Inst->getDebugLoc(), Inst->getParent()) {}
-// Helper to allow for an assert before attempting to return an invalid
-// reference.
-static const BasicBlock &getFirstFunctionBlock(const Function *Func) {
- assert(!Func->empty() && "Function does not have a body");
- return Func->front();
+static const BasicBlock *getFirstFunctionBlock(const Function *Func) {
+ return Func->empty() ? nullptr : &Func->front();
}
OptimizationRemark::OptimizationRemark(const char *PassName,
@@ -255,7 +253,7 @@ OptimizationRemark::OptimizationRemark(const char *PassName,
const Function *Func)
: DiagnosticInfoIROptimization(DK_OptimizationRemark, DS_Remark, PassName,
RemarkName, *Func, Func->getSubprogram(),
- &getFirstFunctionBlock(Func)) {}
+ getFirstFunctionBlock(Func)) {}
bool OptimizationRemark::isEnabled() const {
const Function &Fn = getFunction();
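
The many Key(std::string(Key)) edits follow one pattern: with StringRef's implicit conversion to std::string assumed to be going away, every member initializer or return that used to rely on it now spells the copy out. A tiny sketch of the pattern (the struct is illustrative):

#include "llvm/ADT/StringRef.h"
#include <string>

struct RemarkArg {
  std::string Key;
  // Previously ": Key(K)" compiled via the implicit conversion.
  explicit RemarkArg(llvm::StringRef K) : Key(std::string(K)) {}
};
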
diff --git a/llvm/lib/IR/Dominators.cpp b/llvm/lib/IR/Dominators.cpp
index 03dc4da273a3..bb1cc347dcb1 100644
--- a/llvm/lib/IR/Dominators.cpp
+++ b/llvm/lib/IR/Dominators.cpp
@@ -134,18 +134,13 @@ bool DominatorTree::dominates(const Instruction *Def,
// dominates every instruction in UseBB.
// A PHI is dominated only if the instruction dominates every possible use in
// the UseBB.
- if (isa<InvokeInst>(Def) || isa<PHINode>(User))
+ if (isa<InvokeInst>(Def) || isa<CallBrInst>(Def) || isa<PHINode>(User))
return dominates(Def, UseBB);
if (DefBB != UseBB)
return dominates(DefBB, UseBB);
- // Loop through the basic block until we find Def or User.
- BasicBlock::const_iterator I = DefBB->begin();
- for (; &*I != Def && &*I != User; ++I)
- /*empty*/;
-
- return &*I == Def;
+ return Def->comesBefore(User);
}
// true if Def would dominate a use in any instruction in UseBB.
@@ -173,6 +168,13 @@ bool DominatorTree::dominates(const Instruction *Def,
return dominates(E, UseBB);
}
+ // Callbr results are similarly only usable in the default destination.
+ if (const auto *CBI = dyn_cast<CallBrInst>(Def)) {
+ BasicBlock *NormalDest = CBI->getDefaultDest();
+ BasicBlockEdge E(DefBB, NormalDest);
+ return dominates(E, UseBB);
+ }
+
return dominates(DefBB, UseBB);
}
@@ -278,6 +280,13 @@ bool DominatorTree::dominates(const Instruction *Def, const Use &U) const {
return dominates(E, U);
}
+ // Callbr results are similarly only usable in the default destination.
+ if (const auto *CBI = dyn_cast<CallBrInst>(Def)) {
+ BasicBlock *NormalDest = CBI->getDefaultDest();
+ BasicBlockEdge E(DefBB, NormalDest);
+ return dominates(E, U);
+ }
+
// If the def and use are in different blocks, do a simple CFG dominator
// tree query.
if (DefBB != UseBB)
@@ -289,12 +298,7 @@ bool DominatorTree::dominates(const Instruction *Def, const Use &U) const {
if (isa<PHINode>(UserInst))
return true;
- // Otherwise, just loop through the basic block until we find Def or User.
- BasicBlock::const_iterator I = DefBB->begin();
- for (; &*I != Def && &*I != UserInst; ++I)
- /*empty*/;
-
- return &*I != UserInst;
+ return Def->comesBefore(UserInst);
}
bool DominatorTree::isReachableFromEntry(const Use &U) const {
@@ -312,6 +316,14 @@ bool DominatorTree::isReachableFromEntry(const Use &U) const {
return isReachableFromEntry(I->getParent());
}
+// Edge BBE1 dominates edge BBE2 if they match or BBE1 dominates start of BBE2.
+bool DominatorTree::dominates(const BasicBlockEdge &BBE1,
+ const BasicBlockEdge &BBE2) const {
+ if (BBE1.getStart() == BBE2.getStart() && BBE1.getEnd() == BBE2.getEnd())
+ return true;
+ return dominates(BBE1, BBE2.getStart());
+}
+
//===----------------------------------------------------------------------===//
// DominatorTreeAnalysis and related pass implementations
//===----------------------------------------------------------------------===//
@@ -381,4 +393,3 @@ void DominatorTreeWrapperPass::verifyAnalysis() const {
void DominatorTreeWrapperPass::print(raw_ostream &OS, const Module *) const {
DT.print(OS);
}
-
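
The intra-block ordering query that replaces the linear scans is Instruction::comesBefore, which relies on lazily maintained per-block instruction numbering, so repeated queries in one block no longer cost a walk of the block each time. A minimal sketch (helper name illustrative):

#include <cassert>
#include "llvm/IR/Instruction.h"
using namespace llvm;

// True if Def appears strictly before Use within their shared basic block.
bool definedBeforeUse(const Instruction *Def, const Instruction *Use) {
  assert(Def->getParent() == Use->getParent() && "same-block query only");
  return Def->comesBefore(Use);
}
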
diff --git a/llvm/lib/IR/FPEnv.cpp b/llvm/lib/IR/FPEnv.cpp
index 008852658232..516c702acec7 100644
--- a/llvm/lib/IR/FPEnv.cpp
+++ b/llvm/lib/IR/FPEnv.cpp
@@ -12,41 +12,47 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/FPEnv.h"
+#include "llvm/ADT/StringSwitch.h"
namespace llvm {
-Optional<fp::RoundingMode> StrToRoundingMode(StringRef RoundingArg) {
+Optional<RoundingMode> StrToRoundingMode(StringRef RoundingArg) {
// For dynamic rounding mode, we use round to nearest but we will set the
// 'exact' SDNodeFlag so that the value will not be rounded.
- return StringSwitch<Optional<fp::RoundingMode>>(RoundingArg)
- .Case("round.dynamic", fp::rmDynamic)
- .Case("round.tonearest", fp::rmToNearest)
- .Case("round.downward", fp::rmDownward)
- .Case("round.upward", fp::rmUpward)
- .Case("round.towardzero", fp::rmTowardZero)
+ return StringSwitch<Optional<RoundingMode>>(RoundingArg)
+ .Case("round.dynamic", RoundingMode::Dynamic)
+ .Case("round.tonearest", RoundingMode::NearestTiesToEven)
+ .Case("round.tonearestaway", RoundingMode::NearestTiesToAway)
+ .Case("round.downward", RoundingMode::TowardNegative)
+ .Case("round.upward", RoundingMode::TowardPositive)
+ .Case("round.towardzero", RoundingMode::TowardZero)
.Default(None);
}
-Optional<StringRef> RoundingModeToStr(fp::RoundingMode UseRounding) {
+Optional<StringRef> RoundingModeToStr(RoundingMode UseRounding) {
Optional<StringRef> RoundingStr = None;
switch (UseRounding) {
- case fp::rmDynamic:
+ case RoundingMode::Dynamic:
RoundingStr = "round.dynamic";
break;
- case fp::rmToNearest:
+ case RoundingMode::NearestTiesToEven:
RoundingStr = "round.tonearest";
break;
- case fp::rmDownward:
+ case RoundingMode::NearestTiesToAway:
+ RoundingStr = "round.tonearestaway";
+ break;
+ case RoundingMode::TowardNegative:
RoundingStr = "round.downward";
break;
- case fp::rmUpward:
+ case RoundingMode::TowardPositive:
RoundingStr = "round.upward";
break;
- case fp::rmTowardZero:
+ case RoundingMode::TowardZero:
RoundingStr = "round.towardzero";
break;
+ default:
+ break;
}
return RoundingStr;
}
@@ -74,5 +80,4 @@ Optional<StringRef> ExceptionBehaviorToStr(fp::ExceptionBehavior UseExcept) {
}
return ExceptStr;
}
-
-}
\ No newline at end of file
+} // namespace llvm
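A round-trip sketch (illustrative only, assuming the FPEnv.h declarations shown above) of the renamed rounding-mode mapping, including the new "round.tonearestaway" spelling:

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/FPEnv.h"
using namespace llvm;

// Maps a constrained-FP metadata string to a RoundingMode and back, and
// checks the spelling survives, e.g. roundTrips("round.tonearestaway").
static bool roundTrips(StringRef Name) {
  Optional<RoundingMode> RM = StrToRoundingMode(Name);
  if (!RM)
    return false;
  Optional<StringRef> Back = RoundingModeToStr(*RM);
  return Back && *Back == Name;
}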
diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp
index 54612250b0d6..10d535e3ab11 100644
--- a/llvm/lib/IR/Function.cpp
+++ b/llvm/lib/IR/Function.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/IR/AbstractCallSite.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
@@ -114,11 +115,39 @@ bool Argument::hasInAllocaAttr() const {
return hasAttribute(Attribute::InAlloca);
}
-bool Argument::hasByValOrInAllocaAttr() const {
+bool Argument::hasPreallocatedAttr() const {
+ if (!getType()->isPointerTy())
+ return false;
+ return hasAttribute(Attribute::Preallocated);
+}
+
+bool Argument::hasPassPointeeByValueAttr() const {
if (!getType()->isPointerTy()) return false;
AttributeList Attrs = getParent()->getAttributes();
return Attrs.hasParamAttribute(getArgNo(), Attribute::ByVal) ||
- Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca);
+ Attrs.hasParamAttribute(getArgNo(), Attribute::InAlloca) ||
+ Attrs.hasParamAttribute(getArgNo(), Attribute::Preallocated);
+}
+
+uint64_t Argument::getPassPointeeByValueCopySize(const DataLayout &DL) const {
+ AttributeSet ParamAttrs
+ = getParent()->getAttributes().getParamAttributes(getArgNo());
+
+ // FIXME: All the type carrying attributes are mutually exclusive, so there
+ // should be a single query to get the stored type that handles any of them.
+ if (Type *ByValTy = ParamAttrs.getByValType())
+ return DL.getTypeAllocSize(ByValTy);
+ if (Type *PreAllocTy = ParamAttrs.getPreallocatedType())
+ return DL.getTypeAllocSize(PreAllocTy);
+
+ // FIXME: inalloca always depends on pointee element type. It's also possible
+ // for byval to miss it.
+ if (ParamAttrs.hasAttribute(Attribute::InAlloca) ||
+ ParamAttrs.hasAttribute(Attribute::ByVal) ||
+ ParamAttrs.hasAttribute(Attribute::Preallocated))
+ return DL.getTypeAllocSize(cast<PointerType>(getType())->getElementType());
+
+ return 0;
}
unsigned Argument::getParamAlignment() const {
@@ -320,6 +349,18 @@ static MutableArrayRef<Argument> makeArgArray(Argument *Args, size_t Count) {
return MutableArrayRef<Argument>(Args, Count);
}
+bool Function::isConstrainedFPIntrinsic() const {
+ switch (getIntrinsicID()) {
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC:
+#include "llvm/IR/ConstrainedOps.def"
+ return true;
+#undef INSTRUCTION
+ default:
+ return false;
+ }
+}
+
void Function::clearArguments() {
for (Argument &A : makeArgArray(Arguments, NumArgs)) {
A.setName("");
@@ -632,16 +673,17 @@ static std::string getMangledTypeStr(Type* Ty) {
// Ensure nested function types are distinguishable.
Result += "f";
} else if (VectorType* VTy = dyn_cast<VectorType>(Ty)) {
- if (VTy->isScalable())
+ ElementCount EC = VTy->getElementCount();
+ if (EC.Scalable)
Result += "nx";
- Result += "v" + utostr(VTy->getVectorNumElements()) +
- getMangledTypeStr(VTy->getVectorElementType());
+ Result += "v" + utostr(EC.Min) + getMangledTypeStr(VTy->getElementType());
} else if (Ty) {
switch (Ty->getTypeID()) {
default: llvm_unreachable("Unhandled type");
case Type::VoidTyID: Result += "isVoid"; break;
case Type::MetadataTyID: Result += "Metadata"; break;
case Type::HalfTyID: Result += "f16"; break;
+ case Type::BFloatTyID: Result += "bf16"; break;
case Type::FloatTyID: Result += "f32"; break;
case Type::DoubleTyID: Result += "f64"; break;
case Type::X86_FP80TyID: Result += "f80"; break;
@@ -726,13 +768,18 @@ enum IIT_Info {
IIT_SCALABLE_VEC = 43,
IIT_SUBDIVIDE2_ARG = 44,
IIT_SUBDIVIDE4_ARG = 45,
- IIT_VEC_OF_BITCASTS_TO_INT = 46
+ IIT_VEC_OF_BITCASTS_TO_INT = 46,
+ IIT_V128 = 47,
+ IIT_BF16 = 48
};
static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
+ IIT_Info LastInfo,
SmallVectorImpl<Intrinsic::IITDescriptor> &OutputTable) {
using namespace Intrinsic;
+ bool IsScalableVector = (LastInfo == IIT_SCALABLE_VEC);
+
IIT_Info Info = IIT_Info(Infos[NextElt++]);
unsigned StructElts = 2;
@@ -755,6 +802,9 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
case IIT_F16:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Half, 0));
return;
+ case IIT_BF16:
+ OutputTable.push_back(IITDescriptor::get(IITDescriptor::BFloat, 0));
+ return;
case IIT_F32:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Float, 0));
return;
@@ -783,49 +833,53 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Integer, 128));
return;
case IIT_V1:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(1, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V2:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 2));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(2, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V4:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 4));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(4, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V8:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 8));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(8, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V16:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 16));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(16, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V32:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 32));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(32, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V64:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 64));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(64, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
+ return;
+ case IIT_V128:
+ OutputTable.push_back(IITDescriptor::getVector(128, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V512:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 512));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(512, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_V1024:
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::Vector, 1024));
- DecodeIITType(NextElt, Infos, OutputTable);
+ OutputTable.push_back(IITDescriptor::getVector(1024, IsScalableVector));
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_PTR:
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer, 0));
- DecodeIITType(NextElt, Infos, OutputTable);
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
case IIT_ANYPTR: { // [ANYPTR addrspace, subtype]
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Pointer,
Infos[NextElt++]));
- DecodeIITType(NextElt, Infos, OutputTable);
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
}
case IIT_ARG: {
@@ -888,7 +942,7 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
OutputTable.push_back(IITDescriptor::get(IITDescriptor::Struct,StructElts));
for (unsigned i = 0; i != StructElts; ++i)
- DecodeIITType(NextElt, Infos, OutputTable);
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
}
case IIT_SUBDIVIDE2_ARG: {
@@ -910,9 +964,7 @@ static void DecodeIITType(unsigned &NextElt, ArrayRef<unsigned char> Infos,
return;
}
case IIT_SCALABLE_VEC: {
- OutputTable.push_back(IITDescriptor::get(IITDescriptor::ScalableVecArgument,
- 0));
- DecodeIITType(NextElt, Infos, OutputTable);
+ DecodeIITType(NextElt, Infos, Info, OutputTable);
return;
}
case IIT_VEC_OF_BITCASTS_TO_INT: {
@@ -957,9 +1009,9 @@ void Intrinsic::getIntrinsicInfoTableEntries(ID id,
}
// Okay, decode the table into the output vector of IITDescriptors.
- DecodeIITType(NextElt, IITEntries, T);
+ DecodeIITType(NextElt, IITEntries, IIT_Done, T);
while (NextElt != IITEntries.size() && IITEntries[NextElt] != 0)
- DecodeIITType(NextElt, IITEntries, T);
+ DecodeIITType(NextElt, IITEntries, IIT_Done, T);
}
static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
@@ -976,6 +1028,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::Token: return Type::getTokenTy(Context);
case IITDescriptor::Metadata: return Type::getMetadataTy(Context);
case IITDescriptor::Half: return Type::getHalfTy(Context);
+ case IITDescriptor::BFloat: return Type::getBFloatTy(Context);
case IITDescriptor::Float: return Type::getFloatTy(Context);
case IITDescriptor::Double: return Type::getDoubleTy(Context);
case IITDescriptor::Quad: return Type::getFP128Ty(Context);
@@ -983,7 +1036,8 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::Integer:
return IntegerType::get(Context, D.Integer_Width);
case IITDescriptor::Vector:
- return VectorType::get(DecodeFixedType(Infos, Tys, Context),D.Vector_Width);
+ return VectorType::get(DecodeFixedType(Infos, Tys, Context),
+ D.Vector_Width);
case IITDescriptor::Pointer:
return PointerType::get(DecodeFixedType(Infos, Tys, Context),
D.Pointer_AddressSpace);
@@ -1038,7 +1092,7 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
VectorType *VTy = dyn_cast<VectorType>(Ty);
if (!VTy)
llvm_unreachable("Expected an argument of Vector Type");
- Type *EltTy = VTy->getVectorElementType();
+ Type *EltTy = VTy->getElementType();
return PointerType::getUnqual(EltTy);
}
case IITDescriptor::VecElementArgument: {
@@ -1056,11 +1110,6 @@ static Type *DecodeFixedType(ArrayRef<Intrinsic::IITDescriptor> &Infos,
case IITDescriptor::VecOfAnyPtrsToElt:
// Return the overloaded type (which determines the pointers address space)
return Tys[D.getOverloadArgNumber()];
- case IITDescriptor::ScalableVecArgument: {
- Type *Ty = DecodeFixedType(Infos, Tys, Context);
- return VectorType::get(Ty->getVectorElementType(),
- { Ty->getVectorNumElements(), true });
- }
}
llvm_unreachable("unhandled");
}
@@ -1158,13 +1207,14 @@ static bool matchIntrinsicType(
case IITDescriptor::Token: return !Ty->isTokenTy();
case IITDescriptor::Metadata: return !Ty->isMetadataTy();
case IITDescriptor::Half: return !Ty->isHalfTy();
+ case IITDescriptor::BFloat: return !Ty->isBFloatTy();
case IITDescriptor::Float: return !Ty->isFloatTy();
case IITDescriptor::Double: return !Ty->isDoubleTy();
case IITDescriptor::Quad: return !Ty->isFP128Ty();
case IITDescriptor::Integer: return !Ty->isIntegerTy(D.Integer_Width);
case IITDescriptor::Vector: {
VectorType *VT = dyn_cast<VectorType>(Ty);
- return !VT || VT->getNumElements() != D.Vector_Width ||
+ return !VT || VT->getElementCount() != D.Vector_Width ||
matchIntrinsicType(VT->getElementType(), Infos, ArgTys,
DeferredChecks, IsDeferredCheck);
}
@@ -1264,7 +1314,7 @@ static bool matchIntrinsicType(
if (ReferenceType->getElementCount() !=
ThisArgType->getElementCount())
return true;
- EltTy = ThisArgType->getVectorElementType();
+ EltTy = ThisArgType->getElementType();
}
return matchIntrinsicType(EltTy, Infos, ArgTys, DeferredChecks,
IsDeferredCheck);
@@ -1309,15 +1359,13 @@ static bool matchIntrinsicType(
VectorType *ReferenceType = dyn_cast<VectorType>(ArgTys[RefArgNumber]);
VectorType *ThisArgVecTy = dyn_cast<VectorType>(Ty);
if (!ThisArgVecTy || !ReferenceType ||
- (ReferenceType->getVectorNumElements() !=
- ThisArgVecTy->getVectorNumElements()))
+ (ReferenceType->getNumElements() != ThisArgVecTy->getNumElements()))
return true;
PointerType *ThisArgEltTy =
- dyn_cast<PointerType>(ThisArgVecTy->getVectorElementType());
+ dyn_cast<PointerType>(ThisArgVecTy->getElementType());
if (!ThisArgEltTy)
return true;
- return ThisArgEltTy->getElementType() !=
- ReferenceType->getVectorElementType();
+ return ThisArgEltTy->getElementType() != ReferenceType->getElementType();
}
case IITDescriptor::VecElementArgument: {
if (D.getArgumentNumber() >= ArgTys.size())
@@ -1339,13 +1387,6 @@ static bool matchIntrinsicType(
}
return true;
}
- case IITDescriptor::ScalableVecArgument: {
- VectorType *VTy = dyn_cast<VectorType>(Ty);
- if (!VTy || !VTy->isScalable())
- return true;
- return matchIntrinsicType(VTy, Infos, ArgTys, DeferredChecks,
- IsDeferredCheck);
- }
case IITDescriptor::VecOfBitcastsToInt: {
if (D.getArgumentNumber() >= ArgTys.size())
return IsDeferredCheck || DeferCheck(Ty);
@@ -1405,42 +1446,60 @@ Intrinsic::matchIntrinsicVarArg(bool isVarArg,
return true;
}
-Optional<Function*> Intrinsic::remangleIntrinsicFunction(Function *F) {
+bool Intrinsic::getIntrinsicSignature(Function *F,
+ SmallVectorImpl<Type *> &ArgTys) {
Intrinsic::ID ID = F->getIntrinsicID();
if (!ID)
- return None;
+ return false;
- FunctionType *FTy = F->getFunctionType();
- // Accumulate an array of overloaded types for the given intrinsic
- SmallVector<Type *, 4> ArgTys;
- {
- SmallVector<Intrinsic::IITDescriptor, 8> Table;
- getIntrinsicInfoTableEntries(ID, Table);
- ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
-
- if (Intrinsic::matchIntrinsicSignature(FTy, TableRef, ArgTys))
- return None;
- if (Intrinsic::matchIntrinsicVarArg(FTy->isVarArg(), TableRef))
- return None;
+ SmallVector<Intrinsic::IITDescriptor, 8> Table;
+ getIntrinsicInfoTableEntries(ID, Table);
+ ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
+
+ if (Intrinsic::matchIntrinsicSignature(F->getFunctionType(), TableRef,
+ ArgTys) !=
+ Intrinsic::MatchIntrinsicTypesResult::MatchIntrinsicTypes_Match) {
+ return false;
}
+ if (Intrinsic::matchIntrinsicVarArg(F->getFunctionType()->isVarArg(),
+ TableRef))
+ return false;
+ return true;
+}
+
+Optional<Function *> Intrinsic::remangleIntrinsicFunction(Function *F) {
+ SmallVector<Type *, 4> ArgTys;
+ if (!getIntrinsicSignature(F, ArgTys))
+ return None;
+ Intrinsic::ID ID = F->getIntrinsicID();
StringRef Name = F->getName();
if (Name == Intrinsic::getName(ID, ArgTys))
return None;
auto NewDecl = Intrinsic::getDeclaration(F->getParent(), ID, ArgTys);
NewDecl->setCallingConv(F->getCallingConv());
- assert(NewDecl->getFunctionType() == FTy && "Shouldn't change the signature");
+ assert(NewDecl->getFunctionType() == F->getFunctionType() &&
+ "Shouldn't change the signature");
return NewDecl;
}
/// hasAddressTaken - returns true if there are any uses of this function
-/// other than direct calls or invokes to it.
-bool Function::hasAddressTaken(const User* *PutOffender) const {
+/// other than direct calls or invokes to it. Optionally ignores callback
+/// uses.
+bool Function::hasAddressTaken(const User **PutOffender,
+ bool IgnoreCallbackUses) const {
for (const Use &U : uses()) {
const User *FU = U.getUser();
if (isa<BlockAddress>(FU))
continue;
+
+ if (IgnoreCallbackUses) {
+ AbstractCallSite ACS(&U);
+ if (ACS && ACS.isCallbackCall())
+ continue;
+ }
+
const auto *Call = dyn_cast<CallBase>(FU);
if (!Call) {
if (PutOffender)
@@ -1620,9 +1679,7 @@ Optional<StringRef> Function::getSectionPrefix() const {
}
bool Function::nullPointerIsDefined() const {
- return getFnAttribute("null-pointer-is-valid")
- .getValueAsString()
- .equals("true");
+ return hasFnAttribute(Attribute::NullPointerIsValid);
}
bool llvm::NullPointerIsDefined(const Function *F, unsigned AS) {
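A short sketch (an assumption, not from the patch) of the callback-aware address-taken query added above; the helper name is illustrative:

#include "llvm/IR/Function.h"
using namespace llvm;

// True if the only indirect uses of F are callback uses (e.g. through
// !callback metadata), i.e. every other use is a direct call or invoke.
static bool addressOnlyEscapesViaCallbacks(const Function &F) {
  return F.hasAddressTaken(/*PutOffender=*/nullptr,
                           /*IgnoreCallbackUses=*/false) &&
         !F.hasAddressTaken(/*PutOffender=*/nullptr,
                            /*IgnoreCallbackUses=*/true);
}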
diff --git a/llvm/lib/IR/Globals.cpp b/llvm/lib/IR/Globals.cpp
index 46a9696b2944..dd8e62164de1 100644
--- a/llvm/lib/IR/Globals.cpp
+++ b/llvm/lib/IR/Globals.cpp
@@ -65,6 +65,7 @@ Value *GlobalValue::handleOperandChangeImpl(Value *From, Value *To) {
void GlobalValue::copyAttributesFrom(const GlobalValue *Src) {
setVisibility(Src->getVisibility());
setUnnamedAddr(Src->getUnnamedAddr());
+ setThreadLocalMode(Src->getThreadLocalMode());
setDLLStorageClass(Src->getDLLStorageClass());
setDSOLocal(Src->isDSOLocal());
setPartition(Src->getPartition());
@@ -94,18 +95,17 @@ void GlobalValue::eraseFromParent() {
llvm_unreachable("not a global");
}
-unsigned GlobalValue::getAlignment() const {
- if (auto *GA = dyn_cast<GlobalAlias>(this)) {
- // In general we cannot compute this at the IR level, but we try.
- if (const GlobalObject *GO = GA->getBaseObject())
- return GO->getAlignment();
+bool GlobalValue::isInterposable() const {
+ if (isInterposableLinkage(getLinkage()))
+ return true;
+ return getParent() && getParent()->getSemanticInterposition() &&
+ !isDSOLocal();
+}
- // FIXME: we should also be able to handle:
- // Alias = Global + Offset
- // Alias = Absolute
- return 0;
- }
- return cast<GlobalObject>(this)->getAlignment();
+bool GlobalValue::canBenefitFromLocalAlias() const {
+ // See AsmPrinter::getSymbolPreferLocal().
+ return GlobalObject::isExternalLinkage(getLinkage()) && !isDeclaration() &&
+ !isa<GlobalIFunc>(this) && !hasComdat();
}
unsigned GlobalValue::getAddressSpace() const {
@@ -113,12 +113,8 @@ unsigned GlobalValue::getAddressSpace() const {
return PtrTy->getAddressSpace();
}
-void GlobalObject::setAlignment(unsigned Align) {
- setAlignment(MaybeAlign(Align));
-}
-
void GlobalObject::setAlignment(MaybeAlign Align) {
- assert((!Align || Align <= MaximumAlignment) &&
+ assert((!Align || *Align <= MaximumAlignment) &&
"Alignment is greater than MaximumAlignment!");
unsigned AlignmentData = encode(Align);
unsigned OldData = getGlobalValueSubClassData();
@@ -143,7 +139,7 @@ std::string GlobalValue::getGlobalIdentifier(StringRef Name,
if (Name[0] == '\1')
Name = Name.substr(1);
- std::string NewName = Name;
+ std::string NewName = std::string(Name);
if (llvm::GlobalValue::isLocalLinkage(Linkage)) {
// For local symbols, prepend the main file name to distinguish them.
// Do not include the full path in the file name since there's no guarantee
@@ -242,7 +238,7 @@ bool GlobalValue::isDeclaration() const {
return false;
}
-bool GlobalValue::canIncreaseAlignment() const {
+bool GlobalObject::canIncreaseAlignment() const {
// Firstly, can only increase the alignment of a global if it
// is a strong definition.
if (!isStrongDefinitionForLinker())
@@ -410,7 +406,6 @@ void GlobalVariable::setInitializer(Constant *InitVal) {
/// from the GlobalVariable Src to this one.
void GlobalVariable::copyAttributesFrom(const GlobalVariable *Src) {
GlobalObject::copyAttributesFrom(Src);
- setThreadLocalMode(Src->getThreadLocalMode());
setExternallyInitialized(Src->isExternallyInitialized());
setAttributes(Src->getAttributes());
}
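A sketch (assumption only) combining two of the predicates touched above; the policy shown is illustrative, not what AsmPrinter does verbatim:

#include "llvm/IR/GlobalValue.h"
using namespace llvm;

// Prefer referring to GV through a local alias when that is profitable and
// the definition cannot be interposed away at link time.
static bool preferLocalReference(const GlobalValue &GV) {
  return GV.canBenefitFromLocalAlias() && !GV.isInterposable();
}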
diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp
index 30b558a655cb..b87dfe1c8df6 100644
--- a/llvm/lib/IR/IRBuilder.cpp
+++ b/llvm/lib/IR/IRBuilder.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
@@ -49,7 +50,7 @@ GlobalVariable *IRBuilderBase::CreateGlobalString(StringRef Str,
nullptr, GlobalVariable::NotThreadLocal,
AddressSpace);
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(Align::None());
+ GV->setAlignment(Align(1));
return GV;
}
@@ -64,38 +65,20 @@ Value *IRBuilderBase::getCastedInt8PtrValue(Value *Ptr) {
return Ptr;
// Otherwise, we need to insert a bitcast.
- PT = getInt8PtrTy(PT->getAddressSpace());
- BitCastInst *BCI = new BitCastInst(Ptr, PT, "");
- BB->getInstList().insert(InsertPt, BCI);
- SetInstDebugLocation(BCI);
- return BCI;
+ return CreateBitCast(Ptr, getInt8PtrTy(PT->getAddressSpace()));
}
static CallInst *createCallHelper(Function *Callee, ArrayRef<Value *> Ops,
IRBuilderBase *Builder,
const Twine &Name = "",
- Instruction *FMFSource = nullptr) {
- CallInst *CI = CallInst::Create(Callee, Ops, Name);
+ Instruction *FMFSource = nullptr,
+ ArrayRef<OperandBundleDef> OpBundles = {}) {
+ CallInst *CI = Builder->CreateCall(Callee, Ops, OpBundles, Name);
if (FMFSource)
CI->copyFastMathFlags(FMFSource);
- Builder->GetInsertBlock()->getInstList().insert(Builder->GetInsertPoint(),CI);
- Builder->SetInstDebugLocation(CI);
return CI;
}
-static InvokeInst *createInvokeHelper(Function *Invokee, BasicBlock *NormalDest,
- BasicBlock *UnwindDest,
- ArrayRef<Value *> Ops,
- IRBuilderBase *Builder,
- const Twine &Name = "") {
- InvokeInst *II =
- InvokeInst::Create(Invokee, NormalDest, UnwindDest, Ops, Name);
- Builder->GetInsertBlock()->getInstList().insert(Builder->GetInsertPoint(),
- II);
- Builder->SetInstDebugLocation(II);
- return II;
-}
-
CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size,
MaybeAlign Align, bool isVolatile,
MDNode *TBAATag, MDNode *ScopeTag,
@@ -152,16 +135,6 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet(
return CI;
}
-CallInst *IRBuilderBase::CreateMemCpy(Value *Dst, unsigned DstAlign, Value *Src,
- unsigned SrcAlign, Value *Size,
- bool isVolatile, MDNode *TBAATag,
- MDNode *TBAAStructTag, MDNode *ScopeTag,
- MDNode *NoAliasTag) {
- return CreateMemCpy(Dst, MaybeAlign(DstAlign), Src, MaybeAlign(SrcAlign),
- Size, isVolatile, TBAATag, TBAAStructTag, ScopeTag,
- NoAliasTag);
-}
-
CallInst *IRBuilderBase::CreateMemCpy(Value *Dst, MaybeAlign DstAlign,
Value *Src, MaybeAlign SrcAlign,
Value *Size, bool isVolatile,
@@ -200,8 +173,32 @@ CallInst *IRBuilderBase::CreateMemCpy(Value *Dst, MaybeAlign DstAlign,
return CI;
}
+CallInst *IRBuilderBase::CreateMemCpyInline(Value *Dst, MaybeAlign DstAlign,
+ Value *Src, MaybeAlign SrcAlign,
+ Value *Size) {
+ Dst = getCastedInt8PtrValue(Dst);
+ Src = getCastedInt8PtrValue(Src);
+ Value *IsVolatile = getInt1(false);
+
+ Value *Ops[] = {Dst, Src, Size, IsVolatile};
+ Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
+ Function *F = BB->getParent();
+ Module *M = F->getParent();
+ Function *TheFn = Intrinsic::getDeclaration(M, Intrinsic::memcpy_inline, Tys);
+
+ CallInst *CI = createCallHelper(TheFn, Ops, this);
+
+ auto *MCI = cast<MemCpyInlineInst>(CI);
+ if (DstAlign)
+ MCI->setDestAlignment(*DstAlign);
+ if (SrcAlign)
+ MCI->setSourceAlignment(*SrcAlign);
+
+ return CI;
+}
+
CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
- Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size,
+ Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size,
uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag,
MDNode *ScopeTag, MDNode *NoAliasTag) {
assert(DstAlign >= ElementSize &&
@@ -276,7 +273,7 @@ CallInst *IRBuilderBase::CreateMemMove(Value *Dst, MaybeAlign DstAlign,
}
CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove(
- Value *Dst, unsigned DstAlign, Value *Src, unsigned SrcAlign, Value *Size,
+ Value *Dst, Align DstAlign, Value *Src, Align SrcAlign, Value *Size,
uint32_t ElementSize, MDNode *TBAATag, MDNode *TBAAStructTag,
MDNode *ScopeTag, MDNode *NoAliasTag) {
assert(DstAlign >= ElementSize &&
@@ -295,10 +292,8 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove(
CallInst *CI = createCallHelper(TheFn, Ops, this);
// Set the alignment of the pointer args.
- CI->addParamAttr(
- 0, Attribute::getWithAlignment(CI->getContext(), Align(DstAlign)));
- CI->addParamAttr(
- 1, Attribute::getWithAlignment(CI->getContext(), Align(SrcAlign)));
+ CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign));
+ CI->addParamAttr(1, Attribute::getWithAlignment(CI->getContext(), SrcAlign));
// Set the TBAA info if present.
if (TBAATag)
@@ -455,25 +450,27 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) {
return createCallHelper(TheFn, Ops, this);
}
-CallInst *IRBuilderBase::CreateAssumption(Value *Cond) {
+CallInst *
+IRBuilderBase::CreateAssumption(Value *Cond,
+ ArrayRef<OperandBundleDef> OpBundles) {
assert(Cond->getType() == getInt1Ty() &&
"an assumption condition must be of type i1");
Value *Ops[] = { Cond };
Module *M = BB->getParent()->getParent();
Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
- return createCallHelper(FnAssume, Ops, this);
+ return createCallHelper(FnAssume, Ops, this, "", nullptr, OpBundles);
}
/// Create a call to a Masked Load intrinsic.
-/// \p Ptr - base pointer for the load
-/// \p Align - alignment of the source location
-/// \p Mask - vector of booleans which indicates what vector lanes should
-/// be accessed in memory
-/// \p PassThru - pass-through value that is used to fill the masked-off lanes
-/// of the result
-/// \p Name - name of the result variable
-CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
+/// \p Ptr - base pointer for the load
+/// \p Alignment - alignment of the source location
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
+/// \p PassThru - pass-through value that is used to fill the masked-off lanes
+/// of the result
+/// \p Name - name of the result variable
+CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, Align Alignment,
Value *Mask, Value *PassThru,
const Twine &Name) {
auto *PtrTy = cast<PointerType>(Ptr->getType());
@@ -483,25 +480,25 @@ CallInst *IRBuilderBase::CreateMaskedLoad(Value *Ptr, unsigned Align,
if (!PassThru)
PassThru = UndefValue::get(DataTy);
Type *OverloadedTypes[] = { DataTy, PtrTy };
- Value *Ops[] = { Ptr, getInt32(Align), Mask, PassThru};
+ Value *Ops[] = {Ptr, getInt32(Alignment.value()), Mask, PassThru};
return CreateMaskedIntrinsic(Intrinsic::masked_load, Ops,
OverloadedTypes, Name);
}
/// Create a call to a Masked Store intrinsic.
-/// \p Val - data to be stored,
-/// \p Ptr - base pointer for the store
-/// \p Align - alignment of the destination location
-/// \p Mask - vector of booleans which indicates what vector lanes should
-/// be accessed in memory
+/// \p Val - data to be stored,
+/// \p Ptr - base pointer for the store
+/// \p Alignment - alignment of the destination location
+/// \p Mask - vector of booleans which indicates what vector lanes should
+/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedStore(Value *Val, Value *Ptr,
- unsigned Align, Value *Mask) {
+ Align Alignment, Value *Mask) {
auto *PtrTy = cast<PointerType>(Ptr->getType());
Type *DataTy = PtrTy->getElementType();
assert(DataTy->isVectorTy() && "Ptr should point to a vector");
assert(Mask && "Mask should not be all-ones (null)");
Type *OverloadedTypes[] = { DataTy, PtrTy };
- Value *Ops[] = { Val, Ptr, getInt32(Align), Mask };
+ Value *Ops[] = {Val, Ptr, getInt32(Alignment.value()), Mask};
return CreateMaskedIntrinsic(Intrinsic::masked_store, Ops, OverloadedTypes);
}
@@ -525,23 +522,23 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
/// \p PassThru - pass-through value that is used to fill the masked-off lanes
/// of the result
/// \p Name - name of the result variable
-CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
- Value *Mask, Value *PassThru,
- const Twine& Name) {
+CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, Align Alignment,
+ Value *Mask, Value *PassThru,
+ const Twine &Name) {
auto PtrsTy = cast<VectorType>(Ptrs->getType());
auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
- unsigned NumElts = PtrsTy->getVectorNumElements();
- Type *DataTy = VectorType::get(PtrTy->getElementType(), NumElts);
+ unsigned NumElts = PtrsTy->getNumElements();
+ auto *DataTy = FixedVectorType::get(PtrTy->getElementType(), NumElts);
if (!Mask)
- Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
- NumElts));
+ Mask = Constant::getAllOnesValue(
+ FixedVectorType::get(Type::getInt1Ty(Context), NumElts));
if (!PassThru)
PassThru = UndefValue::get(DataTy);
Type *OverloadedTypes[] = {DataTy, PtrsTy};
- Value * Ops[] = {Ptrs, getInt32(Align), Mask, PassThru};
+ Value *Ops[] = {Ptrs, getInt32(Alignment.value()), Mask, PassThru};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
@@ -557,36 +554,34 @@ CallInst *IRBuilderBase::CreateMaskedGather(Value *Ptrs, unsigned Align,
/// \p Mask - vector of booleans which indicates what vector lanes should
/// be accessed in memory
CallInst *IRBuilderBase::CreateMaskedScatter(Value *Data, Value *Ptrs,
- unsigned Align, Value *Mask) {
+ Align Alignment, Value *Mask) {
auto PtrsTy = cast<VectorType>(Ptrs->getType());
auto DataTy = cast<VectorType>(Data->getType());
- unsigned NumElts = PtrsTy->getVectorNumElements();
+ unsigned NumElts = PtrsTy->getNumElements();
#ifndef NDEBUG
auto PtrTy = cast<PointerType>(PtrsTy->getElementType());
- assert(NumElts == DataTy->getVectorNumElements() &&
+ assert(NumElts == DataTy->getNumElements() &&
PtrTy->getElementType() == DataTy->getElementType() &&
"Incompatible pointer and data types");
#endif
if (!Mask)
- Mask = Constant::getAllOnesValue(VectorType::get(Type::getInt1Ty(Context),
- NumElts));
+ Mask = Constant::getAllOnesValue(
+ FixedVectorType::get(Type::getInt1Ty(Context), NumElts));
Type *OverloadedTypes[] = {DataTy, PtrsTy};
- Value * Ops[] = {Data, Ptrs, getInt32(Align), Mask};
+ Value *Ops[] = {Data, Ptrs, getInt32(Alignment.value()), Mask};
// We specify only one type when we create this intrinsic. Types of other
// arguments are derived from this type.
return CreateMaskedIntrinsic(Intrinsic::masked_scatter, Ops, OverloadedTypes);
}
-template <typename T0, typename T1, typename T2, typename T3>
+template <typename T0>
static std::vector<Value *>
getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
- Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs,
- ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs,
- ArrayRef<T3> GCArgs) {
+ Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs) {
std::vector<Value *> Args;
Args.push_back(B.getInt64(ID));
Args.push_back(B.getInt32(NumPatchBytes));
@@ -594,20 +589,45 @@ getStatepointArgs(IRBuilderBase &B, uint64_t ID, uint32_t NumPatchBytes,
Args.push_back(B.getInt32(CallArgs.size()));
Args.push_back(B.getInt32(Flags));
Args.insert(Args.end(), CallArgs.begin(), CallArgs.end());
- Args.push_back(B.getInt32(TransitionArgs.size()));
- Args.insert(Args.end(), TransitionArgs.begin(), TransitionArgs.end());
- Args.push_back(B.getInt32(DeoptArgs.size()));
- Args.insert(Args.end(), DeoptArgs.begin(), DeoptArgs.end());
- Args.insert(Args.end(), GCArgs.begin(), GCArgs.end());
-
+ // GC Transition and Deopt args are now always handled via operand bundle.
+ // They will be removed from the signature of gc.statepoint shortly.
+ Args.push_back(B.getInt32(0));
+ Args.push_back(B.getInt32(0));
+ // GC args are now encoded in the gc-live operand bundle
return Args;
}
+template<typename T1, typename T2, typename T3>
+static std::vector<OperandBundleDef>
+getStatepointBundles(Optional<ArrayRef<T1>> TransitionArgs,
+ Optional<ArrayRef<T2>> DeoptArgs,
+ ArrayRef<T3> GCArgs) {
+ std::vector<OperandBundleDef> Rval;
+ if (DeoptArgs) {
+ SmallVector<Value*, 16> DeoptValues;
+ DeoptValues.insert(DeoptValues.end(), DeoptArgs->begin(), DeoptArgs->end());
+ Rval.emplace_back("deopt", DeoptValues);
+ }
+ if (TransitionArgs) {
+ SmallVector<Value*, 16> TransitionValues;
+ TransitionValues.insert(TransitionValues.end(),
+ TransitionArgs->begin(), TransitionArgs->end());
+ Rval.emplace_back("gc-transition", TransitionValues);
+ }
+ if (GCArgs.size()) {
+ SmallVector<Value*, 16> LiveValues;
+ LiveValues.insert(LiveValues.end(), GCArgs.begin(), GCArgs.end());
+ Rval.emplace_back("gc-live", LiveValues);
+ }
+ return Rval;
+}
+
template <typename T0, typename T1, typename T2, typename T3>
static CallInst *CreateGCStatepointCallCommon(
IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes,
Value *ActualCallee, uint32_t Flags, ArrayRef<T0> CallArgs,
- ArrayRef<T1> TransitionArgs, ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs,
+ Optional<ArrayRef<T1>> TransitionArgs,
+ Optional<ArrayRef<T2>> DeoptArgs, ArrayRef<T3> GCArgs,
const Twine &Name) {
// Extract out the type of the callee.
auto *FuncPtrType = cast<PointerType>(ActualCallee->getType());
@@ -623,13 +643,17 @@ static CallInst *CreateGCStatepointCallCommon(
std::vector<Value *> Args =
getStatepointArgs(*Builder, ID, NumPatchBytes, ActualCallee, Flags,
- CallArgs, TransitionArgs, DeoptArgs, GCArgs);
- return createCallHelper(FnStatepoint, Args, Builder, Name);
+ CallArgs);
+
+ return Builder->CreateCall(FnStatepoint, Args,
+ getStatepointBundles(TransitionArgs, DeoptArgs,
+ GCArgs),
+ Name);
}
CallInst *IRBuilderBase::CreateGCStatepointCall(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee,
- ArrayRef<Value *> CallArgs, ArrayRef<Value *> DeoptArgs,
+ ArrayRef<Value *> CallArgs, Optional<ArrayRef<Value *>> DeoptArgs,
ArrayRef<Value *> GCArgs, const Twine &Name) {
return CreateGCStatepointCallCommon<Value *, Value *, Value *, Value *>(
this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None),
@@ -638,8 +662,9 @@ CallInst *IRBuilderBase::CreateGCStatepointCall(
CallInst *IRBuilderBase::CreateGCStatepointCall(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee, uint32_t Flags,
- ArrayRef<Use> CallArgs, ArrayRef<Use> TransitionArgs,
- ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
+ ArrayRef<Use> CallArgs, Optional<ArrayRef<Use>> TransitionArgs,
+ Optional<ArrayRef<Use>> DeoptArgs, ArrayRef<Value *> GCArgs,
+ const Twine &Name) {
return CreateGCStatepointCallCommon<Use, Use, Use, Value *>(
this, ID, NumPatchBytes, ActualCallee, Flags, CallArgs, TransitionArgs,
DeoptArgs, GCArgs, Name);
@@ -647,7 +672,7 @@ CallInst *IRBuilderBase::CreateGCStatepointCall(
CallInst *IRBuilderBase::CreateGCStatepointCall(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualCallee,
- ArrayRef<Use> CallArgs, ArrayRef<Value *> DeoptArgs,
+ ArrayRef<Use> CallArgs, Optional<ArrayRef<Value *>> DeoptArgs,
ArrayRef<Value *> GCArgs, const Twine &Name) {
return CreateGCStatepointCallCommon<Use, Value *, Value *, Value *>(
this, ID, NumPatchBytes, ActualCallee, uint32_t(StatepointFlags::None),
@@ -658,8 +683,9 @@ template <typename T0, typename T1, typename T2, typename T3>
static InvokeInst *CreateGCStatepointInvokeCommon(
IRBuilderBase *Builder, uint64_t ID, uint32_t NumPatchBytes,
Value *ActualInvokee, BasicBlock *NormalDest, BasicBlock *UnwindDest,
- uint32_t Flags, ArrayRef<T0> InvokeArgs, ArrayRef<T1> TransitionArgs,
- ArrayRef<T2> DeoptArgs, ArrayRef<T3> GCArgs, const Twine &Name) {
+ uint32_t Flags, ArrayRef<T0> InvokeArgs,
+ Optional<ArrayRef<T1>> TransitionArgs, Optional<ArrayRef<T2>> DeoptArgs,
+ ArrayRef<T3> GCArgs, const Twine &Name) {
// Extract out the type of the callee.
auto *FuncPtrType = cast<PointerType>(ActualInvokee->getType());
assert(isa<FunctionType>(FuncPtrType->getElementType()) &&
@@ -672,15 +698,18 @@ static InvokeInst *CreateGCStatepointInvokeCommon(
std::vector<Value *> Args =
getStatepointArgs(*Builder, ID, NumPatchBytes, ActualInvokee, Flags,
- InvokeArgs, TransitionArgs, DeoptArgs, GCArgs);
- return createInvokeHelper(FnStatepoint, NormalDest, UnwindDest, Args, Builder,
- Name);
+ InvokeArgs);
+
+ return Builder->CreateInvoke(FnStatepoint, NormalDest, UnwindDest, Args,
+ getStatepointBundles(TransitionArgs, DeoptArgs,
+ GCArgs),
+ Name);
}
InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
BasicBlock *NormalDest, BasicBlock *UnwindDest,
- ArrayRef<Value *> InvokeArgs, ArrayRef<Value *> DeoptArgs,
+ ArrayRef<Value *> InvokeArgs, Optional<ArrayRef<Value *>> DeoptArgs,
ArrayRef<Value *> GCArgs, const Twine &Name) {
return CreateGCStatepointInvokeCommon<Value *, Value *, Value *, Value *>(
this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest,
@@ -691,8 +720,8 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
BasicBlock *NormalDest, BasicBlock *UnwindDest, uint32_t Flags,
- ArrayRef<Use> InvokeArgs, ArrayRef<Use> TransitionArgs,
- ArrayRef<Use> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
+ ArrayRef<Use> InvokeArgs, Optional<ArrayRef<Use>> TransitionArgs,
+ Optional<ArrayRef<Use>> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
return CreateGCStatepointInvokeCommon<Use, Use, Use, Value *>(
this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest, Flags,
InvokeArgs, TransitionArgs, DeoptArgs, GCArgs, Name);
@@ -701,7 +730,7 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
uint64_t ID, uint32_t NumPatchBytes, Value *ActualInvokee,
BasicBlock *NormalDest, BasicBlock *UnwindDest, ArrayRef<Use> InvokeArgs,
- ArrayRef<Value *> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
+ Optional<ArrayRef<Value *>> DeoptArgs, ArrayRef<Value *> GCArgs, const Twine &Name) {
return CreateGCStatepointInvokeCommon<Use, Value *, Value *, Value *>(
this, ID, NumPatchBytes, ActualInvokee, NormalDest, UnwindDest,
uint32_t(StatepointFlags::None), InvokeArgs, None, DeoptArgs, GCArgs,
@@ -762,3 +791,360 @@ CallInst *IRBuilderBase::CreateIntrinsic(Intrinsic::ID ID,
Function *Fn = Intrinsic::getDeclaration(M, ID, Types);
return createCallHelper(Fn, Args, this, Name, FMFSource);
}
+
+CallInst *IRBuilderBase::CreateConstrainedFPBinOp(
+ Intrinsic::ID ID, Value *L, Value *R, Instruction *FMFSource,
+ const Twine &Name, MDNode *FPMathTag,
+ Optional<RoundingMode> Rounding,
+ Optional<fp::ExceptionBehavior> Except) {
+ Value *RoundingV = getConstrainedFPRounding(Rounding);
+ Value *ExceptV = getConstrainedFPExcept(Except);
+
+ FastMathFlags UseFMF = FMF;
+ if (FMFSource)
+ UseFMF = FMFSource->getFastMathFlags();
+
+ CallInst *C = CreateIntrinsic(ID, {L->getType()},
+ {L, R, RoundingV, ExceptV}, nullptr, Name);
+ setConstrainedFPCallAttr(C);
+ setFPAttrs(C, FPMathTag, UseFMF);
+ return C;
+}
+
+Value *IRBuilderBase::CreateNAryOp(unsigned Opc, ArrayRef<Value *> Ops,
+ const Twine &Name, MDNode *FPMathTag) {
+ if (Instruction::isBinaryOp(Opc)) {
+ assert(Ops.size() == 2 && "Invalid number of operands!");
+ return CreateBinOp(static_cast<Instruction::BinaryOps>(Opc),
+ Ops[0], Ops[1], Name, FPMathTag);
+ }
+ if (Instruction::isUnaryOp(Opc)) {
+ assert(Ops.size() == 1 && "Invalid number of operands!");
+ return CreateUnOp(static_cast<Instruction::UnaryOps>(Opc),
+ Ops[0], Name, FPMathTag);
+ }
+ llvm_unreachable("Unexpected opcode!");
+}
+
+CallInst *IRBuilderBase::CreateConstrainedFPCast(
+ Intrinsic::ID ID, Value *V, Type *DestTy,
+ Instruction *FMFSource, const Twine &Name, MDNode *FPMathTag,
+ Optional<RoundingMode> Rounding,
+ Optional<fp::ExceptionBehavior> Except) {
+ Value *ExceptV = getConstrainedFPExcept(Except);
+
+ FastMathFlags UseFMF = FMF;
+ if (FMFSource)
+ UseFMF = FMFSource->getFastMathFlags();
+
+ CallInst *C;
+ bool HasRoundingMD = false;
+ switch (ID) {
+ default:
+ break;
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC: \
+ HasRoundingMD = ROUND_MODE; \
+ break;
+#include "llvm/IR/ConstrainedOps.def"
+ }
+ if (HasRoundingMD) {
+ Value *RoundingV = getConstrainedFPRounding(Rounding);
+ C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, RoundingV, ExceptV},
+ nullptr, Name);
+ } else
+ C = CreateIntrinsic(ID, {DestTy, V->getType()}, {V, ExceptV}, nullptr,
+ Name);
+
+ setConstrainedFPCallAttr(C);
+
+ if (isa<FPMathOperator>(C))
+ setFPAttrs(C, FPMathTag, UseFMF);
+ return C;
+}
+
+Value *IRBuilderBase::CreateFCmpHelper(
+ CmpInst::Predicate P, Value *LHS, Value *RHS, const Twine &Name,
+ MDNode *FPMathTag, bool IsSignaling) {
+ if (IsFPConstrained) {
+ auto ID = IsSignaling ? Intrinsic::experimental_constrained_fcmps
+ : Intrinsic::experimental_constrained_fcmp;
+ return CreateConstrainedFPCmp(ID, P, LHS, RHS, Name);
+ }
+
+ if (auto *LC = dyn_cast<Constant>(LHS))
+ if (auto *RC = dyn_cast<Constant>(RHS))
+ return Insert(Folder.CreateFCmp(P, LC, RC), Name);
+ return Insert(setFPAttrs(new FCmpInst(P, LHS, RHS), FPMathTag, FMF), Name);
+}
+
+CallInst *IRBuilderBase::CreateConstrainedFPCmp(
+ Intrinsic::ID ID, CmpInst::Predicate P, Value *L, Value *R,
+ const Twine &Name, Optional<fp::ExceptionBehavior> Except) {
+ Value *PredicateV = getConstrainedFPPredicate(P);
+ Value *ExceptV = getConstrainedFPExcept(Except);
+
+ CallInst *C = CreateIntrinsic(ID, {L->getType()},
+ {L, R, PredicateV, ExceptV}, nullptr, Name);
+ setConstrainedFPCallAttr(C);
+ return C;
+}
+
+CallInst *IRBuilderBase::CreateConstrainedFPCall(
+ Function *Callee, ArrayRef<Value *> Args, const Twine &Name,
+ Optional<RoundingMode> Rounding,
+ Optional<fp::ExceptionBehavior> Except) {
+ llvm::SmallVector<Value *, 6> UseArgs;
+
+ for (auto *OneArg : Args)
+ UseArgs.push_back(OneArg);
+ bool HasRoundingMD = false;
+ switch (Callee->getIntrinsicID()) {
+ default:
+ break;
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC: \
+ HasRoundingMD = ROUND_MODE; \
+ break;
+#include "llvm/IR/ConstrainedOps.def"
+ }
+ if (HasRoundingMD)
+ UseArgs.push_back(getConstrainedFPRounding(Rounding));
+ UseArgs.push_back(getConstrainedFPExcept(Except));
+
+ CallInst *C = CreateCall(Callee, UseArgs, Name);
+ setConstrainedFPCallAttr(C);
+ return C;
+}
+
+Value *IRBuilderBase::CreateSelect(Value *C, Value *True, Value *False,
+ const Twine &Name, Instruction *MDFrom) {
+ if (auto *CC = dyn_cast<Constant>(C))
+ if (auto *TC = dyn_cast<Constant>(True))
+ if (auto *FC = dyn_cast<Constant>(False))
+ return Insert(Folder.CreateSelect(CC, TC, FC), Name);
+
+ SelectInst *Sel = SelectInst::Create(C, True, False);
+ if (MDFrom) {
+ MDNode *Prof = MDFrom->getMetadata(LLVMContext::MD_prof);
+ MDNode *Unpred = MDFrom->getMetadata(LLVMContext::MD_unpredictable);
+ Sel = addBranchMetadata(Sel, Prof, Unpred);
+ }
+ if (isa<FPMathOperator>(Sel))
+ setFPAttrs(Sel, nullptr /* MDNode* */, FMF);
+ return Insert(Sel, Name);
+}
+
+Value *IRBuilderBase::CreatePtrDiff(Value *LHS, Value *RHS,
+ const Twine &Name) {
+ assert(LHS->getType() == RHS->getType() &&
+ "Pointer subtraction operand types must match!");
+ auto *ArgType = cast<PointerType>(LHS->getType());
+ Value *LHS_int = CreatePtrToInt(LHS, Type::getInt64Ty(Context));
+ Value *RHS_int = CreatePtrToInt(RHS, Type::getInt64Ty(Context));
+ Value *Difference = CreateSub(LHS_int, RHS_int);
+ return CreateExactSDiv(Difference,
+ ConstantExpr::getSizeOf(ArgType->getElementType()),
+ Name);
+}
+
+Value *IRBuilderBase::CreateLaunderInvariantGroup(Value *Ptr) {
+ assert(isa<PointerType>(Ptr->getType()) &&
+ "launder.invariant.group only applies to pointers.");
+ // FIXME: we could potentially avoid casts to/from i8*.
+ auto *PtrType = Ptr->getType();
+ auto *Int8PtrTy = getInt8PtrTy(PtrType->getPointerAddressSpace());
+ if (PtrType != Int8PtrTy)
+ Ptr = CreateBitCast(Ptr, Int8PtrTy);
+ Module *M = BB->getParent()->getParent();
+ Function *FnLaunderInvariantGroup = Intrinsic::getDeclaration(
+ M, Intrinsic::launder_invariant_group, {Int8PtrTy});
+
+ assert(FnLaunderInvariantGroup->getReturnType() == Int8PtrTy &&
+ FnLaunderInvariantGroup->getFunctionType()->getParamType(0) ==
+ Int8PtrTy &&
+ "LaunderInvariantGroup should take and return the same type");
+
+ CallInst *Fn = CreateCall(FnLaunderInvariantGroup, {Ptr});
+
+ if (PtrType != Int8PtrTy)
+ return CreateBitCast(Fn, PtrType);
+ return Fn;
+}
+
+Value *IRBuilderBase::CreateStripInvariantGroup(Value *Ptr) {
+ assert(isa<PointerType>(Ptr->getType()) &&
+ "strip.invariant.group only applies to pointers.");
+
+ // FIXME: we could potentially avoid casts to/from i8*.
+ auto *PtrType = Ptr->getType();
+ auto *Int8PtrTy = getInt8PtrTy(PtrType->getPointerAddressSpace());
+ if (PtrType != Int8PtrTy)
+ Ptr = CreateBitCast(Ptr, Int8PtrTy);
+ Module *M = BB->getParent()->getParent();
+ Function *FnStripInvariantGroup = Intrinsic::getDeclaration(
+ M, Intrinsic::strip_invariant_group, {Int8PtrTy});
+
+ assert(FnStripInvariantGroup->getReturnType() == Int8PtrTy &&
+ FnStripInvariantGroup->getFunctionType()->getParamType(0) ==
+ Int8PtrTy &&
+ "StripInvariantGroup should take and return the same type");
+
+ CallInst *Fn = CreateCall(FnStripInvariantGroup, {Ptr});
+
+ if (PtrType != Int8PtrTy)
+ return CreateBitCast(Fn, PtrType);
+ return Fn;
+}
+
+Value *IRBuilderBase::CreateVectorSplat(unsigned NumElts, Value *V,
+ const Twine &Name) {
+ assert(NumElts > 0 && "Cannot splat to an empty vector!");
+
+ // First insert it into an undef vector so we can shuffle it.
+ Type *I32Ty = getInt32Ty();
+ Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), NumElts));
+ V = CreateInsertElement(Undef, V, ConstantInt::get(I32Ty, 0),
+ Name + ".splatinsert");
+
+ // Shuffle the value across the desired number of elements.
+ Value *Zeros =
+ ConstantAggregateZero::get(FixedVectorType::get(I32Ty, NumElts));
+ return CreateShuffleVector(V, Undef, Zeros, Name + ".splat");
+}
+
+Value *IRBuilderBase::CreateExtractInteger(
+ const DataLayout &DL, Value *From, IntegerType *ExtractedTy,
+ uint64_t Offset, const Twine &Name) {
+ auto *IntTy = cast<IntegerType>(From->getType());
+ assert(DL.getTypeStoreSize(ExtractedTy) + Offset <=
+ DL.getTypeStoreSize(IntTy) &&
+ "Element extends past full value");
+ uint64_t ShAmt = 8 * Offset;
+ Value *V = From;
+ if (DL.isBigEndian())
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy) -
+ DL.getTypeStoreSize(ExtractedTy) - Offset);
+ if (ShAmt) {
+ V = CreateLShr(V, ShAmt, Name + ".shift");
+ }
+ assert(ExtractedTy->getBitWidth() <= IntTy->getBitWidth() &&
+ "Cannot extract to a larger integer!");
+ if (ExtractedTy != IntTy) {
+ V = CreateTrunc(V, ExtractedTy, Name + ".trunc");
+ }
+ return V;
+}
+
+Value *IRBuilderBase::CreatePreserveArrayAccessIndex(
+ Type *ElTy, Value *Base, unsigned Dimension, unsigned LastIndex,
+ MDNode *DbgInfo) {
+ assert(isa<PointerType>(Base->getType()) &&
+ "Invalid Base ptr type for preserve.array.access.index.");
+ auto *BaseType = Base->getType();
+
+ Value *LastIndexV = getInt32(LastIndex);
+ Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ SmallVector<Value *, 4> IdxList;
+ for (unsigned I = 0; I < Dimension; ++I)
+ IdxList.push_back(Zero);
+ IdxList.push_back(LastIndexV);
+
+ Type *ResultType =
+ GetElementPtrInst::getGEPReturnType(ElTy, Base, IdxList);
+
+ Module *M = BB->getParent()->getParent();
+ Function *FnPreserveArrayAccessIndex = Intrinsic::getDeclaration(
+ M, Intrinsic::preserve_array_access_index, {ResultType, BaseType});
+
+ Value *DimV = getInt32(Dimension);
+ CallInst *Fn =
+ CreateCall(FnPreserveArrayAccessIndex, {Base, DimV, LastIndexV});
+ if (DbgInfo)
+ Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
+
+ return Fn;
+}
+
+Value *IRBuilderBase::CreatePreserveUnionAccessIndex(
+ Value *Base, unsigned FieldIndex, MDNode *DbgInfo) {
+ assert(isa<PointerType>(Base->getType()) &&
+ "Invalid Base ptr type for preserve.union.access.index.");
+ auto *BaseType = Base->getType();
+
+ Module *M = BB->getParent()->getParent();
+ Function *FnPreserveUnionAccessIndex = Intrinsic::getDeclaration(
+ M, Intrinsic::preserve_union_access_index, {BaseType, BaseType});
+
+ Value *DIIndex = getInt32(FieldIndex);
+ CallInst *Fn =
+ CreateCall(FnPreserveUnionAccessIndex, {Base, DIIndex});
+ if (DbgInfo)
+ Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
+
+ return Fn;
+}
+
+Value *IRBuilderBase::CreatePreserveStructAccessIndex(
+ Type *ElTy, Value *Base, unsigned Index, unsigned FieldIndex,
+ MDNode *DbgInfo) {
+ assert(isa<PointerType>(Base->getType()) &&
+ "Invalid Base ptr type for preserve.struct.access.index.");
+ auto *BaseType = Base->getType();
+
+ Value *GEPIndex = getInt32(Index);
+ Constant *Zero = ConstantInt::get(Type::getInt32Ty(Context), 0);
+ Type *ResultType =
+ GetElementPtrInst::getGEPReturnType(ElTy, Base, {Zero, GEPIndex});
+
+ Module *M = BB->getParent()->getParent();
+ Function *FnPreserveStructAccessIndex = Intrinsic::getDeclaration(
+ M, Intrinsic::preserve_struct_access_index, {ResultType, BaseType});
+
+ Value *DIIndex = getInt32(FieldIndex);
+ CallInst *Fn = CreateCall(FnPreserveStructAccessIndex,
+ {Base, GEPIndex, DIIndex});
+ if (DbgInfo)
+ Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
+
+ return Fn;
+}
+
+CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL,
+ Value *PtrValue,
+ Value *AlignValue,
+ Value *OffsetValue) {
+ SmallVector<Value *, 4> Vals({PtrValue, AlignValue});
+ if (OffsetValue)
+ Vals.push_back(OffsetValue);
+ OperandBundleDefT<Value *> AlignOpB("align", Vals);
+ return CreateAssumption(ConstantInt::getTrue(getContext()), {AlignOpB});
+}
+
+CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL,
+ Value *PtrValue,
+ unsigned Alignment,
+ Value *OffsetValue) {
+ assert(isa<PointerType>(PtrValue->getType()) &&
+ "trying to create an alignment assumption on a non-pointer?");
+ assert(Alignment != 0 && "Invalid Alignment");
+ auto *PtrTy = cast<PointerType>(PtrValue->getType());
+ Type *IntPtrTy = getIntPtrTy(DL, PtrTy->getAddressSpace());
+ Value *AlignValue = ConstantInt::get(IntPtrTy, Alignment);
+ return CreateAlignmentAssumptionHelper(DL, PtrValue, AlignValue, OffsetValue);
+}
+
+CallInst *IRBuilderBase::CreateAlignmentAssumption(const DataLayout &DL,
+ Value *PtrValue,
+ Value *Alignment,
+ Value *OffsetValue) {
+ assert(isa<PointerType>(PtrValue->getType()) &&
+ "trying to create an alignment assumption on a non-pointer?");
+ return CreateAlignmentAssumptionHelper(DL, PtrValue, Alignment, OffsetValue);
+}
+
+IRBuilderDefaultInserter::~IRBuilderDefaultInserter() {}
+IRBuilderCallbackInserter::~IRBuilderCallbackInserter() {}
+IRBuilderFolder::~IRBuilderFolder() {}
+void ConstantFolder::anchor() {}
+void NoFolder::anchor() {}
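A usage sketch (an assumption, not part of the patch) of the bundle-based alignment assumption added above; the names and the constant alignment are illustrative:

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

// Emits roughly: call void @llvm.assume(i1 true) [ "align"(<Ptr>, i64 16) ]
// rather than the old ptrtoint/and/icmp expansion of alignment assumptions.
static CallInst *emitAlignAssumption(IRBuilder<> &B, const DataLayout &DL,
                                     Value *Ptr) {
  return B.CreateAlignmentAssumption(DL, Ptr, /*Alignment=*/16);
}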
diff --git a/llvm/lib/IR/InlineAsm.cpp b/llvm/lib/IR/InlineAsm.cpp
index fd732f9eda8b..ee30b92522d0 100644
--- a/llvm/lib/IR/InlineAsm.cpp
+++ b/llvm/lib/IR/InlineAsm.cpp
@@ -136,14 +136,14 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
// Find the end of the register name.
StringRef::iterator ConstraintEnd = std::find(I+1, E, '}');
if (ConstraintEnd == E) return true; // "{foo"
- pCodes->push_back(StringRef(I, ConstraintEnd+1 - I));
+ pCodes->push_back(std::string(StringRef(I, ConstraintEnd + 1 - I)));
I = ConstraintEnd+1;
} else if (isdigit(static_cast<unsigned char>(*I))) { // Matching Constraint
// Maximal munch numbers.
StringRef::iterator NumStart = I;
while (I != E && isdigit(static_cast<unsigned char>(*I)))
++I;
- pCodes->push_back(StringRef(NumStart, I - NumStart));
+ pCodes->push_back(std::string(StringRef(NumStart, I - NumStart)));
unsigned N = atoi(pCodes->back().c_str());
// Check that this is a valid matching constraint!
if (N >= ConstraintsSoFar.size() || ConstraintsSoFar[N].Type != isOutput||
@@ -179,7 +179,7 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
} else if (*I == '^') {
// Multi-letter constraint
// FIXME: For now assuming these are 2-character constraints.
- pCodes->push_back(StringRef(I+1, 2));
+ pCodes->push_back(std::string(StringRef(I + 1, 2)));
I += 3;
} else if (*I == '@') {
// Multi-letter constraint
@@ -189,11 +189,11 @@ bool InlineAsm::ConstraintInfo::Parse(StringRef Str,
int N = C - '0';
assert(N > 0 && "Found a zero letter constraint!");
++I;
- pCodes->push_back(StringRef(I, N));
+ pCodes->push_back(std::string(StringRef(I, N)));
I += N;
} else {
// Single letter constraint.
- pCodes->push_back(StringRef(I, 1));
+ pCodes->push_back(std::string(StringRef(I, 1)));
++I;
}
}
diff --git a/llvm/lib/IR/Instruction.cpp b/llvm/lib/IR/Instruction.cpp
index 7da169712896..bfbd801cb7a7 100644
--- a/llvm/lib/IR/Instruction.cpp
+++ b/llvm/lib/IR/Instruction.cpp
@@ -43,6 +43,19 @@ Instruction::Instruction(Type *ty, unsigned it, Use *Ops, unsigned NumOps,
Instruction::~Instruction() {
assert(!Parent && "Instruction still linked in the program!");
+
+ // Replace any extant metadata uses of this instruction with undef to
+ // preserve debug info accuracy. Some alternatives include:
+ // - Treat Instruction like any other Value, and point its extant metadata
+ // uses to an empty ValueAsMetadata node. This makes extant dbg.value uses
+ // trivially dead (i.e. fair game for deletion in many passes), leading to
+ // stale dbg.values being in effect for too long.
+ // - Call salvageDebugInfoOrMarkUndef. Not needed to make instruction removal
+ // correct. OTOH results in wasted work in some common cases (e.g. when all
+ // instructions in a BasicBlock are deleted).
+ if (isUsedByMetadata())
+ ValueAsMetadata::handleRAUW(this, UndefValue::get(getType()));
+
if (hasMetadataHashEntry())
clearMetadataHashEntries();
}
@@ -97,6 +110,15 @@ void Instruction::moveBefore(BasicBlock &BB,
BB.getInstList().splice(I, getParent()->getInstList(), getIterator());
}
+bool Instruction::comesBefore(const Instruction *Other) const {
+ assert(Parent && Other->Parent &&
+ "instructions without BB parents have no order");
+ assert(Parent == Other->Parent && "cross-BB instruction order comparison");
+ if (!Parent->isInstrOrderValid())
+ Parent->renumberInstructions();
+ return Order < Other->Order;
+}
+
void Instruction::setHasNoUnsignedWrap(bool b) {
cast<OverflowingBinaryOperator>(this)->setHasNoUnsignedWrap(b);
}
@@ -176,6 +198,11 @@ void Instruction::setHasAllowReciprocal(bool B) {
cast<FPMathOperator>(this)->setHasAllowReciprocal(B);
}
+void Instruction::setHasAllowContract(bool B) {
+ assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
+ cast<FPMathOperator>(this)->setHasAllowContract(B);
+}
+
void Instruction::setHasApproxFunc(bool B) {
assert(isa<FPMathOperator>(this) && "setting fast-math flag on invalid op");
cast<FPMathOperator>(this)->setHasApproxFunc(B);
@@ -434,6 +461,9 @@ static bool haveSameSpecialState(const Instruction *I1, const Instruction *I2,
RMWI->isVolatile() == cast<AtomicRMWInst>(I2)->isVolatile() &&
RMWI->getOrdering() == cast<AtomicRMWInst>(I2)->getOrdering() &&
RMWI->getSyncScopeID() == cast<AtomicRMWInst>(I2)->getSyncScopeID();
+ if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I1))
+ return SVI->getShuffleMask() ==
+ cast<ShuffleVectorInst>(I2)->getShuffleMask();
return true;
}
@@ -744,12 +774,3 @@ Instruction *Instruction::clone() const {
New->copyMetadata(*this);
return New;
}
-
-void Instruction::setProfWeight(uint64_t W) {
- assert(isa<CallBase>(this) &&
- "Can only set weights for call like instructions");
- SmallVector<uint32_t, 1> Weights;
- Weights.push_back(W);
- MDBuilder MDB(getContext());
- setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
-}
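A small sketch (assumption, not from the patch) of the new intra-block ordering query defined above; the helper name is illustrative:

#include "llvm/IR/Instruction.h"
#include <cassert>
using namespace llvm;

// Answers "does Def come before Use in their shared block?" from a lazily
// renumbered per-block order instead of a linear scan of the block.
static bool defPrecedesUse(const Instruction *Def, const Instruction *Use) {
  assert(Def->getParent() == Use->getParent() && "same-block query only");
  return Def->comesBefore(Use);
}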
diff --git a/llvm/lib/IR/Instructions.cpp b/llvm/lib/IR/Instructions.cpp
index c264277fa53c..2f17a0d73af4 100644
--- a/llvm/lib/IR/Instructions.cpp
+++ b/llvm/lib/IR/Instructions.cpp
@@ -18,7 +18,6 @@
#include "llvm/ADT/Twine.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -54,7 +53,7 @@ Optional<uint64_t>
AllocaInst::getAllocationSizeInBits(const DataLayout &DL) const {
uint64_t Size = DL.getTypeAllocSizeInBits(getAllocatedType());
if (isArrayAllocation()) {
- auto C = dyn_cast<ConstantInt>(getArraySize());
+ auto *C = dyn_cast<ConstantInt>(getArraySize());
if (!C)
return None;
Size *= C->getZExtValue();
@@ -63,14 +62,6 @@ AllocaInst::getAllocationSizeInBits(const DataLayout &DL) const {
}
//===----------------------------------------------------------------------===//
-// CallSite Class
-//===----------------------------------------------------------------------===//
-
-User::op_iterator CallSite::getCallee() const {
- return cast<CallBase>(getInstruction())->op_end() - 1;
-}
-
-//===----------------------------------------------------------------------===//
// SelectInst Class
//===----------------------------------------------------------------------===//
@@ -90,7 +81,7 @@ const char *SelectInst::areInvalidOperands(Value *Op0, Value *Op1, Value *Op2) {
VectorType *ET = dyn_cast<VectorType>(Op1->getType());
if (!ET)
return "selected values for vector select must be vectors";
- if (ET->getNumElements() != VT->getNumElements())
+ if (ET->getElementCount() != VT->getElementCount())
return "vector select requires selected vectors to have "
"the same vector length as select condition";
} else if (Op0->getType() != Type::getInt1Ty(Op0->getContext())) {
@@ -256,6 +247,20 @@ void LandingPadInst::addClause(Constant *Val) {
// CallBase Implementation
//===----------------------------------------------------------------------===//
+CallBase *CallBase::Create(CallBase *CB, ArrayRef<OperandBundleDef> Bundles,
+ Instruction *InsertPt) {
+ switch (CB->getOpcode()) {
+ case Instruction::Call:
+ return CallInst::Create(cast<CallInst>(CB), Bundles, InsertPt);
+ case Instruction::Invoke:
+ return InvokeInst::Create(cast<InvokeInst>(CB), Bundles, InsertPt);
+ case Instruction::CallBr:
+ return CallBrInst::Create(cast<CallBrInst>(CB), Bundles, InsertPt);
+ default:
+ llvm_unreachable("Unknown CallBase sub-class!");
+ }
+}
+
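// Illustrative usage sketch (hypothetical helper, not part of this change):
// CallBase::Create lets a transform re-create any call-like instruction
// (call, invoke or callbr) with a different set of operand bundles without
// switching over the concrete subclass itself.
static CallBase *replaceBundles(CallBase *CB,
                                ArrayRef<OperandBundleDef> NewBundles) {
  // Create the replacement immediately before the original call site.
  CallBase *NewCB = CallBase::Create(CB, NewBundles, /*InsertPt=*/CB);
  NewCB->takeName(CB);
  CB->replaceAllUsesWith(NewCB);
  CB->eraseFromParent();
  return NewCB;
}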
Function *CallBase::getCaller() { return getParent()->getParent(); }
unsigned CallBase::getNumSubclassExtraOperandsDynamic() const {
@@ -264,13 +269,10 @@ unsigned CallBase::getNumSubclassExtraOperandsDynamic() const {
}
bool CallBase::isIndirectCall() const {
- const Value *V = getCalledValue();
+ const Value *V = getCalledOperand();
if (isa<Function>(V) || isa<Constant>(V))
return false;
- if (const CallInst *CI = dyn_cast<CallInst>(this))
- if (CI->isInlineAsm())
- return false;
- return true;
+ return !isInlineAsm();
}
/// Tests if this call site must be tail call optimized. Only a CallInst can
@@ -342,16 +344,22 @@ bool CallBase::paramHasAttr(unsigned ArgNo, Attribute::AttrKind Kind) const {
bool CallBase::hasFnAttrOnCalledFunction(Attribute::AttrKind Kind) const {
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, Kind);
+ return F->getAttributes().hasFnAttribute(Kind);
return false;
}
bool CallBase::hasFnAttrOnCalledFunction(StringRef Kind) const {
if (const Function *F = getCalledFunction())
- return F->getAttributes().hasAttribute(AttributeList::FunctionIndex, Kind);
+ return F->getAttributes().hasFnAttribute(Kind);
return false;
}
+void CallBase::getOperandBundlesAsDefs(
+ SmallVectorImpl<OperandBundleDef> &Defs) const {
+ for (unsigned i = 0, e = getNumOperandBundles(); i != e; ++i)
+ Defs.emplace_back(getOperandBundleAt(i));
+}
+
CallBase::op_iterator
CallBase::populateBundleOperandInfos(ArrayRef<OperandBundleDef> Bundles,
const unsigned BeginIndex) {
@@ -378,6 +386,53 @@ CallBase::populateBundleOperandInfos(ArrayRef<OperandBundleDef> Bundles,
return It;
}
+CallBase::BundleOpInfo &CallBase::getBundleOpInfoForOperand(unsigned OpIdx) {
+  // When there aren't many bundles, we do a simple linear search.
+  // Otherwise, fall back to a binary search that uses the fact that bundles
+  // usually have a similar number of arguments to converge faster.
+ if (bundle_op_info_end() - bundle_op_info_begin() < 8) {
+ for (auto &BOI : bundle_op_infos())
+ if (BOI.Begin <= OpIdx && OpIdx < BOI.End)
+ return BOI;
+
+ llvm_unreachable("Did not find operand bundle for operand!");
+ }
+
+ assert(OpIdx >= arg_size() && "the Idx is not in the operand bundles");
+ assert(bundle_op_info_end() - bundle_op_info_begin() > 0 &&
+ OpIdx < std::prev(bundle_op_info_end())->End &&
+ "The Idx isn't in the operand bundle");
+
+  // We need a fractional value below; to avoid using floating point, we use
+  // an integral value multiplied by this constant.
+ constexpr unsigned NumberScaling = 1024;
+
+ bundle_op_iterator Begin = bundle_op_info_begin();
+ bundle_op_iterator End = bundle_op_info_end();
+ bundle_op_iterator Current;
+
+ while (Begin != End) {
+ unsigned ScaledOperandPerBundle =
+ NumberScaling * (std::prev(End)->End - Begin->Begin) / (End - Begin);
+ Current = Begin + (((OpIdx - Begin->Begin) * NumberScaling) /
+ ScaledOperandPerBundle);
+ if (Current >= End)
+ Current = std::prev(End);
+ assert(Current < End && Current >= Begin &&
+ "the operand bundle doesn't cover every value in the range");
+ if (OpIdx >= Current->Begin && OpIdx < Current->End)
+ break;
+ if (OpIdx >= Current->End)
+ Begin = Current + 1;
+ else
+ End = Current;
+ }
+
+ assert(OpIdx >= Current->Begin && OpIdx < Current->End &&
+ "the operand bundle doesn't cover every value in the range");
+ return *Current;
+}
+
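// Worked example for the scaled search above (illustrative numbers): suppose
// three bundles cover the operand ranges [4,6), [6,9) and [9,12). Then
// ScaledOperandPerBundle = 1024 * (12 - 4) / 3 = 2730, and a query for
// OpIdx == 7 starts at Begin + (7 - 4) * 1024 / 2730 == Begin + 1, i.e.
// directly at the bundle [6,9) that contains it, so the loop exits on the
// first iteration.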
//===----------------------------------------------------------------------===//
// CallInst Implementation
//===----------------------------------------------------------------------===//
@@ -450,7 +505,7 @@ CallInst *CallInst::Create(CallInst *CI, ArrayRef<OperandBundleDef> OpB,
Instruction *InsertPt) {
std::vector<Value *> Args(CI->arg_begin(), CI->arg_end());
- auto *NewCI = CallInst::Create(CI->getFunctionType(), CI->getCalledValue(),
+ auto *NewCI = CallInst::Create(CI->getFunctionType(), CI->getCalledOperand(),
Args, OpB, CI->getName(), InsertPt);
NewCI->setTailCallKind(CI->getTailCallKind());
NewCI->setCallingConv(CI->getCallingConv());
@@ -761,9 +816,9 @@ InvokeInst *InvokeInst::Create(InvokeInst *II, ArrayRef<OperandBundleDef> OpB,
Instruction *InsertPt) {
std::vector<Value *> Args(II->arg_begin(), II->arg_end());
- auto *NewII = InvokeInst::Create(II->getFunctionType(), II->getCalledValue(),
- II->getNormalDest(), II->getUnwindDest(),
- Args, OpB, II->getName(), InsertPt);
+ auto *NewII = InvokeInst::Create(
+ II->getFunctionType(), II->getCalledOperand(), II->getNormalDest(),
+ II->getUnwindDest(), Args, OpB, II->getName(), InsertPt);
NewII->setCallingConv(II->getCallingConv());
NewII->SubclassOptionalData = II->SubclassOptionalData;
NewII->setAttributes(II->getAttributes());
@@ -844,11 +899,9 @@ CallBrInst *CallBrInst::Create(CallBrInst *CBI, ArrayRef<OperandBundleDef> OpB,
Instruction *InsertPt) {
std::vector<Value *> Args(CBI->arg_begin(), CBI->arg_end());
- auto *NewCBI = CallBrInst::Create(CBI->getFunctionType(),
- CBI->getCalledValue(),
- CBI->getDefaultDest(),
- CBI->getIndirectDests(),
- Args, OpB, CBI->getName(), InsertPt);
+ auto *NewCBI = CallBrInst::Create(
+ CBI->getFunctionType(), CBI->getCalledOperand(), CBI->getDefaultDest(),
+ CBI->getIndirectDests(), Args, OpB, CBI->getName(), InsertPt);
NewCBI->setCallingConv(CBI->getCallingConv());
NewCBI->SubclassOptionalData = CBI->SubclassOptionalData;
NewCBI->setAttributes(CBI->getAttributes());
@@ -921,7 +974,8 @@ CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI)
OperandTraits<CleanupReturnInst>::op_end(this) -
CRI.getNumOperands(),
CRI.getNumOperands()) {
- setInstructionSubclassData(CRI.getSubclassDataFromInstruction());
+ setSubclassData<Instruction::OpaqueField>(
+ CRI.getSubclassData<Instruction::OpaqueField>());
Op<0>() = CRI.Op<0>();
if (CRI.hasUnwindDest())
Op<1>() = CRI.Op<1>();
@@ -929,7 +983,7 @@ CleanupReturnInst::CleanupReturnInst(const CleanupReturnInst &CRI)
void CleanupReturnInst::init(Value *CleanupPad, BasicBlock *UnwindBB) {
if (UnwindBB)
- setInstructionSubclassData(getSubclassDataFromInstruction() | 1);
+ setSubclassData<UnwindDestField>(true);
Op<0>() = CleanupPad;
if (UnwindBB)
@@ -1033,7 +1087,7 @@ void CatchSwitchInst::init(Value *ParentPad, BasicBlock *UnwindDest,
Op<0>() = ParentPad;
if (UnwindDest) {
- setInstructionSubclassData(getSubclassDataFromInstruction() | 1);
+ setSubclassData<UnwindDestField>(true);
setUnwindDest(UnwindDest);
}
}
@@ -1207,6 +1261,19 @@ static Value *getAISize(LLVMContext &Context, Value *Amt) {
return Amt;
}
+static Align computeAllocaDefaultAlign(Type *Ty, BasicBlock *BB) {
+ assert(BB && "Insertion BB cannot be null when alignment not provided!");
+ assert(BB->getParent() &&
+ "BB must be in a Function when alignment not provided!");
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ return DL.getPrefTypeAlign(Ty);
+}
+
+static Align computeAllocaDefaultAlign(Type *Ty, Instruction *I) {
+ assert(I && "Insertion position cannot be null when alignment not provided!");
+ return computeAllocaDefaultAlign(Ty, I->getParent());
+}
+
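// Illustrative default (hypothetical data layout): in a module whose data
// layout specifies "i64:64", an `alloca i64` created without an explicit
// alignment now receives `align 8` from DL.getPrefTypeAlign(), instead of
// carrying an unset MaybeAlign.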
AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
Instruction *InsertBefore)
: AllocaInst(Ty, AddrSpace, /*ArraySize=*/nullptr, Name, InsertBefore) {}
@@ -1217,27 +1284,29 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, const Twine &Name,
AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
const Twine &Name, Instruction *InsertBefore)
- : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/None, Name, InsertBefore) {
-}
+ : AllocaInst(Ty, AddrSpace, ArraySize,
+ computeAllocaDefaultAlign(Ty, InsertBefore), Name,
+ InsertBefore) {}
AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
const Twine &Name, BasicBlock *InsertAtEnd)
- : AllocaInst(Ty, AddrSpace, ArraySize, /*Align=*/None, Name, InsertAtEnd) {}
+ : AllocaInst(Ty, AddrSpace, ArraySize,
+ computeAllocaDefaultAlign(Ty, InsertAtEnd), Name,
+ InsertAtEnd) {}
AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
- MaybeAlign Align, const Twine &Name,
+ Align Align, const Twine &Name,
Instruction *InsertBefore)
: UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
getAISize(Ty->getContext(), ArraySize), InsertBefore),
AllocatedType(Ty) {
- setAlignment(MaybeAlign(Align));
+ setAlignment(Align);
assert(!Ty->isVoidTy() && "Cannot allocate void!");
setName(Name);
}
AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
- MaybeAlign Align, const Twine &Name,
- BasicBlock *InsertAtEnd)
+ Align Align, const Twine &Name, BasicBlock *InsertAtEnd)
: UnaryInstruction(PointerType::get(Ty, AddrSpace), Alloca,
getAISize(Ty->getContext(), ArraySize), InsertAtEnd),
AllocatedType(Ty) {
@@ -1246,17 +1315,6 @@ AllocaInst::AllocaInst(Type *Ty, unsigned AddrSpace, Value *ArraySize,
setName(Name);
}
-void AllocaInst::setAlignment(MaybeAlign Align) {
- assert((!Align || *Align <= MaximumAlignment) &&
- "Alignment is greater than MaximumAlignment!");
- setInstructionSubclassData((getSubclassDataFromInstruction() & ~31) |
- encode(Align));
- if (Align)
- assert(getAlignment() == Align->value() &&
- "Alignment representation error!");
- else
- assert(getAlignment() == 0 && "Alignment representation error!");
-}
bool AllocaInst::isArrayAllocation() const {
if (ConstantInt *CI = dyn_cast<ConstantInt>(getOperand(0)))
@@ -1287,6 +1345,19 @@ void LoadInst::AssertOK() {
"Alignment required for atomic load");
}
+static Align computeLoadStoreDefaultAlign(Type *Ty, BasicBlock *BB) {
+ assert(BB && "Insertion BB cannot be null when alignment not provided!");
+ assert(BB->getParent() &&
+ "BB must be in a Function when alignment not provided!");
+ const DataLayout &DL = BB->getModule()->getDataLayout();
+ return DL.getABITypeAlign(Ty);
+}
+
+static Align computeLoadStoreDefaultAlign(Type *Ty, Instruction *I) {
+ assert(I && "Insertion position cannot be null when alignment not provided!");
+ return computeLoadStoreDefaultAlign(Ty, I->getParent());
+}
+
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name,
Instruction *InsertBef)
: LoadInst(Ty, Ptr, Name, /*isVolatile=*/false, InsertBef) {}
@@ -1297,36 +1368,38 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name,
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
Instruction *InsertBef)
- : LoadInst(Ty, Ptr, Name, isVolatile, /*Align=*/None, InsertBef) {}
+ : LoadInst(Ty, Ptr, Name, isVolatile,
+ computeLoadStoreDefaultAlign(Ty, InsertBef), InsertBef) {}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
BasicBlock *InsertAE)
- : LoadInst(Ty, Ptr, Name, isVolatile, /*Align=*/None, InsertAE) {}
+ : LoadInst(Ty, Ptr, Name, isVolatile,
+ computeLoadStoreDefaultAlign(Ty, InsertAE), InsertAE) {}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
- MaybeAlign Align, Instruction *InsertBef)
+ Align Align, Instruction *InsertBef)
: LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic,
SyncScope::System, InsertBef) {}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
- MaybeAlign Align, BasicBlock *InsertAE)
+ Align Align, BasicBlock *InsertAE)
: LoadInst(Ty, Ptr, Name, isVolatile, Align, AtomicOrdering::NotAtomic,
SyncScope::System, InsertAE) {}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
- MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID,
+ Align Align, AtomicOrdering Order, SyncScope::ID SSID,
Instruction *InsertBef)
: UnaryInstruction(Ty, Load, Ptr, InsertBef) {
assert(Ty == cast<PointerType>(Ptr->getType())->getElementType());
setVolatile(isVolatile);
- setAlignment(MaybeAlign(Align));
+ setAlignment(Align);
setAtomic(Order, SSID);
AssertOK();
setName(Name);
}
LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
- MaybeAlign Align, AtomicOrdering Order, SyncScope::ID SSID,
+ Align Align, AtomicOrdering Order, SyncScope::ID SSID,
BasicBlock *InsertAE)
: UnaryInstruction(Ty, Load, Ptr, InsertAE) {
assert(Ty == cast<PointerType>(Ptr->getType())->getElementType());
@@ -1337,14 +1410,6 @@ LoadInst::LoadInst(Type *Ty, Value *Ptr, const Twine &Name, bool isVolatile,
setName(Name);
}
-void LoadInst::setAlignment(MaybeAlign Align) {
- assert((!Align || *Align <= MaximumAlignment) &&
- "Alignment is greater than MaximumAlignment!");
- setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) |
- (encode(Align) << 1));
- assert(getAlign() == Align && "Alignment representation error!");
-}
-
//===----------------------------------------------------------------------===//
// StoreInst Implementation
//===----------------------------------------------------------------------===//
@@ -1368,23 +1433,27 @@ StoreInst::StoreInst(Value *val, Value *addr, BasicBlock *InsertAtEnd)
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
Instruction *InsertBefore)
- : StoreInst(val, addr, isVolatile, /*Align=*/None, InsertBefore) {}
+ : StoreInst(val, addr, isVolatile,
+ computeLoadStoreDefaultAlign(val->getType(), InsertBefore),
+ InsertBefore) {}
StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile,
BasicBlock *InsertAtEnd)
- : StoreInst(val, addr, isVolatile, /*Align=*/None, InsertAtEnd) {}
+ : StoreInst(val, addr, isVolatile,
+ computeLoadStoreDefaultAlign(val->getType(), InsertAtEnd),
+ InsertAtEnd) {}
-StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align,
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
Instruction *InsertBefore)
: StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic,
SyncScope::System, InsertBefore) {}
-StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align,
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
BasicBlock *InsertAtEnd)
: StoreInst(val, addr, isVolatile, Align, AtomicOrdering::NotAtomic,
SyncScope::System, InsertAtEnd) {}
-StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align,
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
AtomicOrdering Order, SyncScope::ID SSID,
Instruction *InsertBefore)
: Instruction(Type::getVoidTy(val->getContext()), Store,
@@ -1398,7 +1467,7 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align,
AssertOK();
}
-StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align,
+StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, Align Align,
AtomicOrdering Order, SyncScope::ID SSID,
BasicBlock *InsertAtEnd)
: Instruction(Type::getVoidTy(val->getContext()), Store,
@@ -1412,20 +1481,13 @@ StoreInst::StoreInst(Value *val, Value *addr, bool isVolatile, MaybeAlign Align,
AssertOK();
}
-void StoreInst::setAlignment(MaybeAlign Alignment) {
- assert((!Alignment || *Alignment <= MaximumAlignment) &&
- "Alignment is greater than MaximumAlignment!");
- setInstructionSubclassData((getSubclassDataFromInstruction() & ~(31 << 1)) |
- (encode(Alignment) << 1));
- assert(getAlign() == Alignment && "Alignment representation error!");
-}
//===----------------------------------------------------------------------===//
// AtomicCmpXchgInst Implementation
//===----------------------------------------------------------------------===//
void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
- AtomicOrdering SuccessOrdering,
+ Align Alignment, AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
SyncScope::ID SSID) {
Op<0>() = Ptr;
@@ -1434,6 +1496,7 @@ void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
setSuccessOrdering(SuccessOrdering);
setFailureOrdering(FailureOrdering);
setSyncScopeID(SSID);
+ setAlignment(Alignment);
assert(getOperand(0) && getOperand(1) && getOperand(2) &&
"All operands must be non-null!");
@@ -1458,6 +1521,7 @@ void AtomicCmpXchgInst::Init(Value *Ptr, Value *Cmp, Value *NewVal,
}
AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
+ Align Alignment,
AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
SyncScope::ID SSID,
@@ -1466,10 +1530,11 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertBefore) {
- Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SSID);
+ Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID);
}
AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
+ Align Alignment,
AtomicOrdering SuccessOrdering,
AtomicOrdering FailureOrdering,
SyncScope::ID SSID,
@@ -1478,7 +1543,7 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
StructType::get(Cmp->getType(), Type::getInt1Ty(Cmp->getContext())),
AtomicCmpXchg, OperandTraits<AtomicCmpXchgInst>::op_begin(this),
OperandTraits<AtomicCmpXchgInst>::operands(this), InsertAtEnd) {
- Init(Ptr, Cmp, NewVal, SuccessOrdering, FailureOrdering, SSID);
+ Init(Ptr, Cmp, NewVal, Alignment, SuccessOrdering, FailureOrdering, SSID);
}
//===----------------------------------------------------------------------===//
@@ -1486,13 +1551,14 @@ AtomicCmpXchgInst::AtomicCmpXchgInst(Value *Ptr, Value *Cmp, Value *NewVal,
//===----------------------------------------------------------------------===//
void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
- AtomicOrdering Ordering,
+ Align Alignment, AtomicOrdering Ordering,
SyncScope::ID SSID) {
Op<0>() = Ptr;
Op<1>() = Val;
setOperation(Operation);
setOrdering(Ordering);
setSyncScopeID(SSID);
+ setAlignment(Alignment);
assert(getOperand(0) && getOperand(1) &&
"All operands must be non-null!");
@@ -1506,25 +1572,21 @@ void AtomicRMWInst::Init(BinOp Operation, Value *Ptr, Value *Val,
}
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
- AtomicOrdering Ordering,
- SyncScope::ID SSID,
- Instruction *InsertBefore)
- : Instruction(Val->getType(), AtomicRMW,
- OperandTraits<AtomicRMWInst>::op_begin(this),
- OperandTraits<AtomicRMWInst>::operands(this),
- InsertBefore) {
- Init(Operation, Ptr, Val, Ordering, SSID);
+ Align Alignment, AtomicOrdering Ordering,
+ SyncScope::ID SSID, Instruction *InsertBefore)
+ : Instruction(Val->getType(), AtomicRMW,
+ OperandTraits<AtomicRMWInst>::op_begin(this),
+ OperandTraits<AtomicRMWInst>::operands(this), InsertBefore) {
+ Init(Operation, Ptr, Val, Alignment, Ordering, SSID);
}
AtomicRMWInst::AtomicRMWInst(BinOp Operation, Value *Ptr, Value *Val,
- AtomicOrdering Ordering,
- SyncScope::ID SSID,
- BasicBlock *InsertAtEnd)
- : Instruction(Val->getType(), AtomicRMW,
- OperandTraits<AtomicRMWInst>::op_begin(this),
- OperandTraits<AtomicRMWInst>::operands(this),
- InsertAtEnd) {
- Init(Operation, Ptr, Val, Ordering, SSID);
+ Align Alignment, AtomicOrdering Ordering,
+ SyncScope::ID SSID, BasicBlock *InsertAtEnd)
+ : Instruction(Val->getType(), AtomicRMW,
+ OperandTraits<AtomicRMWInst>::op_begin(this),
+ OperandTraits<AtomicRMWInst>::operands(this), InsertAtEnd) {
+ Init(Operation, Ptr, Val, Alignment, Ordering, SSID);
}
StringRef AtomicRMWInst::getOperationName(BinOp Op) {
@@ -1606,35 +1668,44 @@ GetElementPtrInst::GetElementPtrInst(const GetElementPtrInst &GEPI)
SubclassOptionalData = GEPI.SubclassOptionalData;
}
-/// getIndexedType - Returns the type of the element that would be accessed with
-/// a gep instruction with the specified parameters.
-///
-/// The Idxs pointer should point to a continuous piece of memory containing the
-/// indices, either as Value* or uint64_t.
-///
-/// A null type is returned if the indices are invalid for the specified
-/// pointer type.
-///
-template <typename IndexTy>
-static Type *getIndexedTypeInternal(Type *Agg, ArrayRef<IndexTy> IdxList) {
- // Handle the special case of the empty set index set, which is always valid.
- if (IdxList.empty())
- return Agg;
-
- // If there is at least one index, the top level type must be sized, otherwise
- // it cannot be 'stepped over'.
- if (!Agg->isSized())
+Type *GetElementPtrInst::getTypeAtIndex(Type *Ty, Value *Idx) {
+ if (auto *Struct = dyn_cast<StructType>(Ty)) {
+ if (!Struct->indexValid(Idx))
+ return nullptr;
+ return Struct->getTypeAtIndex(Idx);
+ }
+ if (!Idx->getType()->isIntOrIntVectorTy())
return nullptr;
+ if (auto *Array = dyn_cast<ArrayType>(Ty))
+ return Array->getElementType();
+ if (auto *Vector = dyn_cast<VectorType>(Ty))
+ return Vector->getElementType();
+ return nullptr;
+}
+
+Type *GetElementPtrInst::getTypeAtIndex(Type *Ty, uint64_t Idx) {
+ if (auto *Struct = dyn_cast<StructType>(Ty)) {
+ if (Idx >= Struct->getNumElements())
+ return nullptr;
+ return Struct->getElementType(Idx);
+ }
+ if (auto *Array = dyn_cast<ArrayType>(Ty))
+ return Array->getElementType();
+ if (auto *Vector = dyn_cast<VectorType>(Ty))
+ return Vector->getElementType();
+ return nullptr;
+}
- unsigned CurIdx = 1;
- for (; CurIdx != IdxList.size(); ++CurIdx) {
- CompositeType *CT = dyn_cast<CompositeType>(Agg);
- if (!CT || CT->isPointerTy()) return nullptr;
- IndexTy Index = IdxList[CurIdx];
- if (!CT->indexValid(Index)) return nullptr;
- Agg = CT->getTypeAtIndex(Index);
+template <typename IndexTy>
+static Type *getIndexedTypeInternal(Type *Ty, ArrayRef<IndexTy> IdxList) {
+ if (IdxList.empty())
+ return Ty;
+ for (IndexTy V : IdxList.slice(1)) {
+ Ty = GetElementPtrInst::getTypeAtIndex(Ty, V);
+ if (!Ty)
+ return Ty;
}
- return CurIdx == IdxList.size() ? Agg : nullptr;
+ return Ty;
}
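// Worked example (illustrative types): for %T = type { i32, [4 x float] } and
// the index list {0, 1, 2}, the leading index only steps over the pointer
// operand and is skipped by slice(1); index 1 selects the [4 x float] member
// and index 2 selects its float element, so getIndexedType(%T, {0, 1, 2})
// returns float.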
Type *GetElementPtrInst::getIndexedType(Type *Ty, ArrayRef<Value *> IdxList) {
@@ -1781,66 +1852,120 @@ bool InsertElementInst::isValidOperands(const Value *Vec, const Value *Elt,
ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
const Twine &Name,
Instruction *InsertBefore)
-: Instruction(VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
- cast<VectorType>(Mask->getType())->getElementCount()),
- ShuffleVector,
- OperandTraits<ShuffleVectorInst>::op_begin(this),
- OperandTraits<ShuffleVectorInst>::operands(this),
- InsertBefore) {
+ : Instruction(
+ VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+ cast<VectorType>(Mask->getType())->getElementCount()),
+ ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
assert(isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector instruction operands!");
+
Op<0>() = V1;
Op<1>() = V2;
- Op<2>() = Mask;
+ SmallVector<int, 16> MaskArr;
+ getShuffleMask(cast<Constant>(Mask), MaskArr);
+ setShuffleMask(MaskArr);
setName(Name);
}
ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, Value *Mask,
+ const Twine &Name, BasicBlock *InsertAtEnd)
+ : Instruction(
+ VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+ cast<VectorType>(Mask->getType())->getElementCount()),
+ ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this), InsertAtEnd) {
+ assert(isValidOperands(V1, V2, Mask) &&
+ "Invalid shuffle vector instruction operands!");
+
+ Op<0>() = V1;
+ Op<1>() = V2;
+ SmallVector<int, 16> MaskArr;
+ getShuffleMask(cast<Constant>(Mask), MaskArr);
+ setShuffleMask(MaskArr);
+ setName(Name);
+}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef<int> Mask,
const Twine &Name,
- BasicBlock *InsertAtEnd)
-: Instruction(VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
- cast<VectorType>(Mask->getType())->getElementCount()),
- ShuffleVector,
- OperandTraits<ShuffleVectorInst>::op_begin(this),
- OperandTraits<ShuffleVectorInst>::operands(this),
- InsertAtEnd) {
+ Instruction *InsertBefore)
+ : Instruction(
+ VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+ Mask.size(), isa<ScalableVectorType>(V1->getType())),
+ ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this), InsertBefore) {
+ assert(isValidOperands(V1, V2, Mask) &&
+ "Invalid shuffle vector instruction operands!");
+ Op<0>() = V1;
+ Op<1>() = V2;
+ setShuffleMask(Mask);
+ setName(Name);
+}
+
+ShuffleVectorInst::ShuffleVectorInst(Value *V1, Value *V2, ArrayRef<int> Mask,
+ const Twine &Name, BasicBlock *InsertAtEnd)
+ : Instruction(
+ VectorType::get(cast<VectorType>(V1->getType())->getElementType(),
+ Mask.size(), isa<ScalableVectorType>(V1->getType())),
+ ShuffleVector, OperandTraits<ShuffleVectorInst>::op_begin(this),
+ OperandTraits<ShuffleVectorInst>::operands(this), InsertAtEnd) {
assert(isValidOperands(V1, V2, Mask) &&
"Invalid shuffle vector instruction operands!");
Op<0>() = V1;
Op<1>() = V2;
- Op<2>() = Mask;
+ setShuffleMask(Mask);
setName(Name);
}
void ShuffleVectorInst::commute() {
- int NumOpElts = Op<0>()->getType()->getVectorNumElements();
- int NumMaskElts = getMask()->getType()->getVectorNumElements();
- SmallVector<Constant*, 16> NewMask(NumMaskElts);
- Type *Int32Ty = Type::getInt32Ty(getContext());
+ int NumOpElts = cast<VectorType>(Op<0>()->getType())->getNumElements();
+ int NumMaskElts = ShuffleMask.size();
+ SmallVector<int, 16> NewMask(NumMaskElts);
for (int i = 0; i != NumMaskElts; ++i) {
int MaskElt = getMaskValue(i);
- if (MaskElt == -1) {
- NewMask[i] = UndefValue::get(Int32Ty);
+ if (MaskElt == UndefMaskElem) {
+ NewMask[i] = UndefMaskElem;
continue;
}
assert(MaskElt >= 0 && MaskElt < 2 * NumOpElts && "Out-of-range mask");
MaskElt = (MaskElt < NumOpElts) ? MaskElt + NumOpElts : MaskElt - NumOpElts;
- NewMask[i] = ConstantInt::get(Int32Ty, MaskElt);
+ NewMask[i] = MaskElt;
}
- Op<2>() = ConstantVector::get(NewMask);
+ setShuffleMask(NewMask);
Op<0>().swap(Op<1>());
}
bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
+ ArrayRef<int> Mask) {
+ // V1 and V2 must be vectors of the same type.
+ if (!isa<VectorType>(V1->getType()) || V1->getType() != V2->getType())
+ return false;
+
+ // Make sure the mask elements make sense.
+ int V1Size = cast<VectorType>(V1->getType())->getElementCount().Min;
+ for (int Elem : Mask)
+ if (Elem != UndefMaskElem && Elem >= V1Size * 2)
+ return false;
+
+ if (isa<ScalableVectorType>(V1->getType()))
+ if ((Mask[0] != 0 && Mask[0] != UndefMaskElem) || !is_splat(Mask))
+ return false;
+
+ return true;
+}
+
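// Illustrative constraint (hypothetical operands): for scalable inputs such
// as <vscale x 4 x i32>, only splat masks are expressible, so an all-zero
// mask (splat of element 0) or an all-UndefMaskElem mask is accepted, while
// e.g. {0, 1, 2, 3} is rejected by the check above.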
+bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
const Value *Mask) {
// V1 and V2 must be vectors of the same type.
if (!V1->getType()->isVectorTy() || V1->getType() != V2->getType())
return false;
- // Mask must be vector of i32.
+  // Mask must be a vector of i32, and must be the same kind of vector as the
+  // input vectors.
auto *MaskTy = dyn_cast<VectorType>(Mask->getType());
- if (!MaskTy || !MaskTy->getElementType()->isIntegerTy(32))
+ if (!MaskTy || !MaskTy->getElementType()->isIntegerTy(32) ||
+ isa<ScalableVectorType>(MaskTy) != isa<ScalableVectorType>(V1->getType()))
return false;
// Check to see if Mask is valid.
@@ -1868,31 +1993,17 @@ bool ShuffleVectorInst::isValidOperands(const Value *V1, const Value *V2,
return true;
}
- // The bitcode reader can create a place holder for a forward reference
- // used as the shuffle mask. When this occurs, the shuffle mask will
- // fall into this case and fail. To avoid this error, do this bit of
- // ugliness to allow such a mask pass.
- if (const auto *CE = dyn_cast<ConstantExpr>(Mask))
- if (CE->getOpcode() == Instruction::UserOp1)
- return true;
-
return false;
}
-int ShuffleVectorInst::getMaskValue(const Constant *Mask, unsigned i) {
- assert(i < Mask->getType()->getVectorNumElements() && "Index out of range");
- if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask))
- return CDS->getElementAsInteger(i);
- Constant *C = Mask->getAggregateElement(i);
- if (isa<UndefValue>(C))
- return -1;
- return cast<ConstantInt>(C)->getZExtValue();
-}
-
void ShuffleVectorInst::getShuffleMask(const Constant *Mask,
SmallVectorImpl<int> &Result) {
- unsigned NumElts = Mask->getType()->getVectorNumElements();
-
+ unsigned NumElts = cast<VectorType>(Mask->getType())->getElementCount().Min;
+ if (isa<ConstantAggregateZero>(Mask)) {
+ Result.resize(NumElts, 0);
+ return;
+ }
+ Result.reserve(NumElts);
if (auto *CDS = dyn_cast<ConstantDataSequential>(Mask)) {
for (unsigned i = 0; i != NumElts; ++i)
Result.push_back(CDS->getElementAsInteger(i));
@@ -1905,6 +2016,30 @@ void ShuffleVectorInst::getShuffleMask(const Constant *Mask,
}
}
+void ShuffleVectorInst::setShuffleMask(ArrayRef<int> Mask) {
+ ShuffleMask.assign(Mask.begin(), Mask.end());
+ ShuffleMaskForBitcode = convertShuffleMaskForBitcode(Mask, getType());
+}
+Constant *ShuffleVectorInst::convertShuffleMaskForBitcode(ArrayRef<int> Mask,
+ Type *ResultTy) {
+ Type *Int32Ty = Type::getInt32Ty(ResultTy->getContext());
+ if (isa<ScalableVectorType>(ResultTy)) {
+ assert(is_splat(Mask) && "Unexpected shuffle");
+ Type *VecTy = VectorType::get(Int32Ty, Mask.size(), true);
+ if (Mask[0] == 0)
+ return Constant::getNullValue(VecTy);
+ return UndefValue::get(VecTy);
+ }
+ SmallVector<Constant *, 16> MaskConst;
+ for (int Elem : Mask) {
+ if (Elem == UndefMaskElem)
+ MaskConst.push_back(UndefValue::get(Int32Ty));
+ else
+ MaskConst.push_back(ConstantInt::get(Int32Ty, Elem));
+ }
+ return ConstantVector::get(MaskConst);
+}
+
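// Illustrative conversion (hypothetical values): the integer mask
// {0, 2, UndefMaskElem} becomes the bitcode constant
// <3 x i32> <i32 0, i32 2, i32 undef>; for a scalable result type only the
// all-zero splat (zeroinitializer) and the all-undef splat (undef) are
// representable, matching the splat-only restriction on scalable shuffles.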
static bool isSingleSourceMaskImpl(ArrayRef<int> Mask, int NumOpElts) {
assert(!Mask.empty() && "Shuffle mask must contain elements");
bool UsesLHS = false;
@@ -1919,8 +2054,8 @@ static bool isSingleSourceMaskImpl(ArrayRef<int> Mask, int NumOpElts) {
if (UsesLHS && UsesRHS)
return false;
}
- assert((UsesLHS ^ UsesRHS) && "Should have selected from exactly 1 source");
- return true;
+  // Allow for degenerate case: completely undef mask means neither source is
+  // used.
+ return UsesLHS || UsesRHS;
}
bool ShuffleVectorInst::isSingleSourceMask(ArrayRef<int> Mask) {
@@ -2048,13 +2183,15 @@ bool ShuffleVectorInst::isExtractSubvectorMask(ArrayRef<int> Mask,
}
bool ShuffleVectorInst::isIdentityWithPadding() const {
- int NumOpElts = Op<0>()->getType()->getVectorNumElements();
- int NumMaskElts = getType()->getVectorNumElements();
+ if (isa<UndefValue>(Op<2>()))
+ return false;
+ int NumOpElts = cast<VectorType>(Op<0>()->getType())->getNumElements();
+ int NumMaskElts = cast<VectorType>(getType())->getNumElements();
if (NumMaskElts <= NumOpElts)
return false;
// The first part of the mask must choose elements from exactly 1 source op.
- SmallVector<int, 16> Mask = getShuffleMask();
+ ArrayRef<int> Mask = getShuffleMask();
if (!isIdentityMaskImpl(Mask, NumOpElts))
return false;
@@ -2067,8 +2204,16 @@ bool ShuffleVectorInst::isIdentityWithPadding() const {
}
bool ShuffleVectorInst::isIdentityWithExtract() const {
- int NumOpElts = Op<0>()->getType()->getVectorNumElements();
- int NumMaskElts = getType()->getVectorNumElements();
+ if (isa<UndefValue>(Op<2>()))
+ return false;
+
+ // FIXME: Not currently possible to express a shuffle mask for a scalable
+ // vector for this case
+ if (isa<ScalableVectorType>(getType()))
+ return false;
+
+ int NumOpElts = cast<FixedVectorType>(Op<0>()->getType())->getNumElements();
+ int NumMaskElts = cast<FixedVectorType>(getType())->getNumElements();
if (NumMaskElts >= NumOpElts)
return false;
@@ -2077,11 +2222,12 @@ bool ShuffleVectorInst::isIdentityWithExtract() const {
bool ShuffleVectorInst::isConcat() const {
// Vector concatenation is differentiated from identity with padding.
- if (isa<UndefValue>(Op<0>()) || isa<UndefValue>(Op<1>()))
+ if (isa<UndefValue>(Op<0>()) || isa<UndefValue>(Op<1>()) ||
+ isa<UndefValue>(Op<2>()))
return false;
- int NumOpElts = Op<0>()->getType()->getVectorNumElements();
- int NumMaskElts = getType()->getVectorNumElements();
+ int NumOpElts = cast<VectorType>(Op<0>()->getType())->getNumElements();
+ int NumMaskElts = getType()->getNumElements();
if (NumMaskElts != NumOpElts * 2)
return false;
@@ -2163,15 +2309,15 @@ Type *ExtractValueInst::getIndexedType(Type *Agg,
if (ArrayType *AT = dyn_cast<ArrayType>(Agg)) {
if (Index >= AT->getNumElements())
return nullptr;
+ Agg = AT->getElementType();
} else if (StructType *ST = dyn_cast<StructType>(Agg)) {
if (Index >= ST->getNumElements())
return nullptr;
+ Agg = ST->getElementType(Index);
} else {
// Not a valid type to index into.
return nullptr;
}
-
- Agg = cast<CompositeType>(Agg)->getTypeAtIndex(Index);
}
return const_cast<Type*>(Agg);
}
@@ -2383,20 +2529,6 @@ BinaryOperator *BinaryOperator::CreateNUWNeg(Value *Op, const Twine &Name,
return BinaryOperator::CreateNUWSub(zero, Op, Name, InsertAtEnd);
}
-BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const Twine &Name,
- Instruction *InsertBefore) {
- Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
- return new BinaryOperator(Instruction::FSub, zero, Op,
- Op->getType(), Name, InsertBefore);
-}
-
-BinaryOperator *BinaryOperator::CreateFNeg(Value *Op, const Twine &Name,
- BasicBlock *InsertAtEnd) {
- Value *zero = ConstantFP::getZeroValueForNegation(Op->getType());
- return new BinaryOperator(Instruction::FSub, zero, Op,
- Op->getType(), Name, InsertAtEnd);
-}
-
BinaryOperator *BinaryOperator::CreateNot(Value *Op, const Twine &Name,
Instruction *InsertBefore) {
Constant *C = Constant::getAllOnesValue(Op->getType());
@@ -2836,7 +2968,8 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
"Invalid cast");
assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast");
assert((!Ty->isVectorTy() ||
- Ty->getVectorNumElements() == S->getType()->getVectorNumElements()) &&
+ cast<VectorType>(Ty)->getNumElements() ==
+ cast<VectorType>(S->getType())->getNumElements()) &&
"Invalid cast");
if (Ty->isIntOrIntVectorTy())
@@ -2854,7 +2987,8 @@ CastInst *CastInst::CreatePointerCast(Value *S, Type *Ty,
"Invalid cast");
assert(Ty->isVectorTy() == S->getType()->isVectorTy() && "Invalid cast");
assert((!Ty->isVectorTy() ||
- Ty->getVectorNumElements() == S->getType()->getVectorNumElements()) &&
+ cast<VectorType>(Ty)->getNumElements() ==
+ cast<VectorType>(S->getType())->getNumElements()) &&
"Invalid cast");
if (Ty->isIntOrIntVectorTy())
@@ -3185,57 +3319,54 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
SrcTy->isAggregateType() || DstTy->isAggregateType())
return false;
- // Get the size of the types in bits, we'll need this later
- unsigned SrcBitSize = SrcTy->getScalarSizeInBits();
- unsigned DstBitSize = DstTy->getScalarSizeInBits();
+ // Get the size of the types in bits, and whether we are dealing
+ // with vector types, we'll need this later.
+ bool SrcIsVec = isa<VectorType>(SrcTy);
+ bool DstIsVec = isa<VectorType>(DstTy);
+ unsigned SrcScalarBitSize = SrcTy->getScalarSizeInBits();
+ unsigned DstScalarBitSize = DstTy->getScalarSizeInBits();
// If these are vector types, get the lengths of the vectors (using zero for
// scalar types means that checking that vector lengths match also checks that
// scalars are not being converted to vectors or vectors to scalars).
- unsigned SrcLength = SrcTy->isVectorTy() ?
- cast<VectorType>(SrcTy)->getNumElements() : 0;
- unsigned DstLength = DstTy->isVectorTy() ?
- cast<VectorType>(DstTy)->getNumElements() : 0;
+ ElementCount SrcEC = SrcIsVec ? cast<VectorType>(SrcTy)->getElementCount()
+ : ElementCount(0, false);
+ ElementCount DstEC = DstIsVec ? cast<VectorType>(DstTy)->getElementCount()
+ : ElementCount(0, false);
// Switch on the opcode provided
switch (op) {
default: return false; // This is an input error
case Instruction::Trunc:
return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
- SrcLength == DstLength && SrcBitSize > DstBitSize;
+ SrcEC == DstEC && SrcScalarBitSize > DstScalarBitSize;
case Instruction::ZExt:
return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
- SrcLength == DstLength && SrcBitSize < DstBitSize;
+ SrcEC == DstEC && SrcScalarBitSize < DstScalarBitSize;
case Instruction::SExt:
return SrcTy->isIntOrIntVectorTy() && DstTy->isIntOrIntVectorTy() &&
- SrcLength == DstLength && SrcBitSize < DstBitSize;
+ SrcEC == DstEC && SrcScalarBitSize < DstScalarBitSize;
case Instruction::FPTrunc:
return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
- SrcLength == DstLength && SrcBitSize > DstBitSize;
+ SrcEC == DstEC && SrcScalarBitSize > DstScalarBitSize;
case Instruction::FPExt:
return SrcTy->isFPOrFPVectorTy() && DstTy->isFPOrFPVectorTy() &&
- SrcLength == DstLength && SrcBitSize < DstBitSize;
+ SrcEC == DstEC && SrcScalarBitSize < DstScalarBitSize;
case Instruction::UIToFP:
case Instruction::SIToFP:
return SrcTy->isIntOrIntVectorTy() && DstTy->isFPOrFPVectorTy() &&
- SrcLength == DstLength;
+ SrcEC == DstEC;
case Instruction::FPToUI:
case Instruction::FPToSI:
return SrcTy->isFPOrFPVectorTy() && DstTy->isIntOrIntVectorTy() &&
- SrcLength == DstLength;
+ SrcEC == DstEC;
case Instruction::PtrToInt:
- if (isa<VectorType>(SrcTy) != isa<VectorType>(DstTy))
+ if (SrcEC != DstEC)
return false;
- if (VectorType *VT = dyn_cast<VectorType>(SrcTy))
- if (VT->getNumElements() != cast<VectorType>(DstTy)->getNumElements())
- return false;
return SrcTy->isPtrOrPtrVectorTy() && DstTy->isIntOrIntVectorTy();
case Instruction::IntToPtr:
- if (isa<VectorType>(SrcTy) != isa<VectorType>(DstTy))
+ if (SrcEC != DstEC)
return false;
- if (VectorType *VT = dyn_cast<VectorType>(SrcTy))
- if (VT->getNumElements() != cast<VectorType>(DstTy)->getNumElements())
- return false;
return SrcTy->isIntOrIntVectorTy() && DstTy->isPtrOrPtrVectorTy();
case Instruction::BitCast: {
PointerType *SrcPtrTy = dyn_cast<PointerType>(SrcTy->getScalarType());
@@ -3256,14 +3387,12 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
return false;
// A vector of pointers must have the same number of elements.
- VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy);
- VectorType *DstVecTy = dyn_cast<VectorType>(DstTy);
- if (SrcVecTy && DstVecTy)
- return (SrcVecTy->getNumElements() == DstVecTy->getNumElements());
- if (SrcVecTy)
- return SrcVecTy->getNumElements() == 1;
- if (DstVecTy)
- return DstVecTy->getNumElements() == 1;
+ if (SrcIsVec && DstIsVec)
+ return SrcEC == DstEC;
+ if (SrcIsVec)
+ return SrcEC == ElementCount(1, false);
+ if (DstIsVec)
+ return DstEC == ElementCount(1, false);
return true;
}
@@ -3279,14 +3408,7 @@ CastInst::castIsValid(Instruction::CastOps op, Value *S, Type *DstTy) {
if (SrcPtrTy->getAddressSpace() == DstPtrTy->getAddressSpace())
return false;
- if (VectorType *SrcVecTy = dyn_cast<VectorType>(SrcTy)) {
- if (VectorType *DstVecTy = dyn_cast<VectorType>(DstTy))
- return (SrcVecTy->getNumElements() == DstVecTy->getNumElements());
-
- return false;
- }
-
- return true;
+ return SrcEC == DstEC;
}
}
}
@@ -4137,7 +4259,7 @@ InsertValueInst *InsertValueInst::cloneImpl() const {
AllocaInst *AllocaInst::cloneImpl() const {
AllocaInst *Result =
new AllocaInst(getAllocatedType(), getType()->getAddressSpace(),
- (Value *)getOperand(0), MaybeAlign(getAlignment()));
+ getOperand(0), getAlign());
Result->setUsedWithInAlloca(isUsedWithInAlloca());
Result->setSwiftError(isSwiftError());
return Result;
@@ -4145,21 +4267,18 @@ AllocaInst *AllocaInst::cloneImpl() const {
LoadInst *LoadInst::cloneImpl() const {
return new LoadInst(getType(), getOperand(0), Twine(), isVolatile(),
- MaybeAlign(getAlignment()), getOrdering(),
- getSyncScopeID());
+ getAlign(), getOrdering(), getSyncScopeID());
}
StoreInst *StoreInst::cloneImpl() const {
- return new StoreInst(getOperand(0), getOperand(1), isVolatile(),
- MaybeAlign(getAlignment()), getOrdering(),
- getSyncScopeID());
+ return new StoreInst(getOperand(0), getOperand(1), isVolatile(), getAlign(),
+ getOrdering(), getSyncScopeID());
}
AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
- AtomicCmpXchgInst *Result =
- new AtomicCmpXchgInst(getOperand(0), getOperand(1), getOperand(2),
- getSuccessOrdering(), getFailureOrdering(),
- getSyncScopeID());
+ AtomicCmpXchgInst *Result = new AtomicCmpXchgInst(
+ getOperand(0), getOperand(1), getOperand(2), getAlign(),
+ getSuccessOrdering(), getFailureOrdering(), getSyncScopeID());
Result->setVolatile(isVolatile());
Result->setWeak(isWeak());
return Result;
@@ -4167,8 +4286,8 @@ AtomicCmpXchgInst *AtomicCmpXchgInst::cloneImpl() const {
AtomicRMWInst *AtomicRMWInst::cloneImpl() const {
AtomicRMWInst *Result =
- new AtomicRMWInst(getOperation(), getOperand(0), getOperand(1),
- getOrdering(), getSyncScopeID());
+ new AtomicRMWInst(getOperation(), getOperand(0), getOperand(1),
+ getAlign(), getOrdering(), getSyncScopeID());
Result->setVolatile(isVolatile());
return Result;
}
@@ -4254,7 +4373,7 @@ InsertElementInst *InsertElementInst::cloneImpl() const {
}
ShuffleVectorInst *ShuffleVectorInst::cloneImpl() const {
- return new ShuffleVectorInst(getOperand(0), getOperand(1), getOperand(2));
+ return new ShuffleVectorInst(getOperand(0), getOperand(1), getShuffleMask());
}
PHINode *PHINode::cloneImpl() const { return new PHINode(*this); }
diff --git a/llvm/lib/IR/IntrinsicInst.cpp b/llvm/lib/IR/IntrinsicInst.cpp
index b23742b83c12..c4e06cd979ed 100644
--- a/llvm/lib/IR/IntrinsicInst.cpp
+++ b/llvm/lib/IR/IntrinsicInst.cpp
@@ -21,13 +21,15 @@
//===----------------------------------------------------------------------===//
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/Operator.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/IR/PatternMatch.h"
+
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -102,7 +104,7 @@ Value *InstrProfIncrementInst::getStep() const {
return ConstantInt::get(Type::getInt64Ty(Context), 1);
}
-Optional<fp::RoundingMode> ConstrainedFPIntrinsic::getRoundingMode() const {
+Optional<RoundingMode> ConstrainedFPIntrinsic::getRoundingMode() const {
unsigned NumOperands = getNumArgOperands();
Metadata *MD =
cast<MetadataAsValue>(getArgOperand(NumOperands - 2))->getMetadata();
@@ -121,55 +123,53 @@ ConstrainedFPIntrinsic::getExceptionBehavior() const {
return StrToExceptionBehavior(cast<MDString>(MD)->getString());
}
-FCmpInst::Predicate
-ConstrainedFPCmpIntrinsic::getPredicate() const {
- Metadata *MD =
- cast<MetadataAsValue>(getArgOperand(2))->getMetadata();
+FCmpInst::Predicate ConstrainedFPCmpIntrinsic::getPredicate() const {
+ Metadata *MD = cast<MetadataAsValue>(getArgOperand(2))->getMetadata();
if (!MD || !isa<MDString>(MD))
return FCmpInst::BAD_FCMP_PREDICATE;
return StringSwitch<FCmpInst::Predicate>(cast<MDString>(MD)->getString())
- .Case("oeq", FCmpInst::FCMP_OEQ)
- .Case("ogt", FCmpInst::FCMP_OGT)
- .Case("oge", FCmpInst::FCMP_OGE)
- .Case("olt", FCmpInst::FCMP_OLT)
- .Case("ole", FCmpInst::FCMP_OLE)
- .Case("one", FCmpInst::FCMP_ONE)
- .Case("ord", FCmpInst::FCMP_ORD)
- .Case("uno", FCmpInst::FCMP_UNO)
- .Case("ueq", FCmpInst::FCMP_UEQ)
- .Case("ugt", FCmpInst::FCMP_UGT)
- .Case("uge", FCmpInst::FCMP_UGE)
- .Case("ult", FCmpInst::FCMP_ULT)
- .Case("ule", FCmpInst::FCMP_ULE)
- .Case("une", FCmpInst::FCMP_UNE)
- .Default(FCmpInst::BAD_FCMP_PREDICATE);
+ .Case("oeq", FCmpInst::FCMP_OEQ)
+ .Case("ogt", FCmpInst::FCMP_OGT)
+ .Case("oge", FCmpInst::FCMP_OGE)
+ .Case("olt", FCmpInst::FCMP_OLT)
+ .Case("ole", FCmpInst::FCMP_OLE)
+ .Case("one", FCmpInst::FCMP_ONE)
+ .Case("ord", FCmpInst::FCMP_ORD)
+ .Case("uno", FCmpInst::FCMP_UNO)
+ .Case("ueq", FCmpInst::FCMP_UEQ)
+ .Case("ugt", FCmpInst::FCMP_UGT)
+ .Case("uge", FCmpInst::FCMP_UGE)
+ .Case("ult", FCmpInst::FCMP_ULT)
+ .Case("ule", FCmpInst::FCMP_ULE)
+ .Case("une", FCmpInst::FCMP_UNE)
+ .Default(FCmpInst::BAD_FCMP_PREDICATE);
}
bool ConstrainedFPIntrinsic::isUnaryOp() const {
switch (getIntrinsicID()) {
- default:
- return false;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
- case Intrinsic::INTRINSIC: \
- return NARG == 1;
+ default:
+ return false;
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC: \
+ return NARG == 1;
#include "llvm/IR/ConstrainedOps.def"
}
}
bool ConstrainedFPIntrinsic::isTernaryOp() const {
switch (getIntrinsicID()) {
- default:
- return false;
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
- case Intrinsic::INTRINSIC: \
- return NARG == 3;
+ default:
+ return false;
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
+ case Intrinsic::INTRINSIC: \
+ return NARG == 3;
#include "llvm/IR/ConstrainedOps.def"
}
}
bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) {
switch (I->getIntrinsicID()) {
-#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC, DAGN) \
+#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
return true;
@@ -178,36 +178,165 @@ bool ConstrainedFPIntrinsic::classof(const IntrinsicInst *I) {
}
}
+ElementCount VPIntrinsic::getStaticVectorLength() const {
+ auto GetVectorLengthOfType = [](const Type *T) -> ElementCount {
+ auto VT = cast<VectorType>(T);
+ auto ElemCount = VT->getElementCount();
+ return ElemCount;
+ };
+
+ auto VPMask = getMaskParam();
+ return GetVectorLengthOfType(VPMask->getType());
+}
+
+Value *VPIntrinsic::getMaskParam() const {
+ auto maskPos = GetMaskParamPos(getIntrinsicID());
+ if (maskPos)
+ return getArgOperand(maskPos.getValue());
+ return nullptr;
+}
+
+Value *VPIntrinsic::getVectorLengthParam() const {
+ auto vlenPos = GetVectorLengthParamPos(getIntrinsicID());
+ if (vlenPos)
+ return getArgOperand(vlenPos.getValue());
+ return nullptr;
+}
+
+Optional<int> VPIntrinsic::GetMaskParamPos(Intrinsic::ID IntrinsicID) {
+ switch (IntrinsicID) {
+ default:
+ return None;
+
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \
+ case Intrinsic::VPID: \
+ return MASKPOS;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+Optional<int> VPIntrinsic::GetVectorLengthParamPos(Intrinsic::ID IntrinsicID) {
+ switch (IntrinsicID) {
+ default:
+ return None;
+
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \
+ case Intrinsic::VPID: \
+ return VLENPOS;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+bool VPIntrinsic::IsVPIntrinsic(Intrinsic::ID ID) {
+ switch (ID) {
+ default:
+ return false;
+
+#define REGISTER_VP_INTRINSIC(VPID, MASKPOS, VLENPOS) \
+ case Intrinsic::VPID: \
+ break;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+ return true;
+}
+
+// Equivalent non-predicated opcode
+unsigned VPIntrinsic::GetFunctionalOpcodeForVP(Intrinsic::ID ID) {
+ switch (ID) {
+ default:
+ return Instruction::Call;
+
+#define HANDLE_VP_TO_OC(VPID, OC) \
+ case Intrinsic::VPID: \
+ return Instruction::OC;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+Intrinsic::ID VPIntrinsic::GetForOpcode(unsigned OC) {
+ switch (OC) {
+ default:
+ return Intrinsic::not_intrinsic;
+
+#define HANDLE_VP_TO_OC(VPID, OC) \
+ case Instruction::OC: \
+ return Intrinsic::VPID;
+#include "llvm/IR/VPIntrinsics.def"
+ }
+}
+
+bool VPIntrinsic::canIgnoreVectorLengthParam() const {
+ using namespace PatternMatch;
+
+ ElementCount EC = getStaticVectorLength();
+
+  // No vlen param: no lanes are masked off by it.
+ auto *VLParam = getVectorLengthParam();
+ if (!VLParam)
+ return true;
+
+  // Note that the VP intrinsic causes undefined behavior if the Explicit
+  // Vector Length parameter is strictly greater than the number of vector
+  // elements of the operation. This function returns true when it can prove
+  // statically from the IR that the EVL masks off no lanes.
+
+  // Check whether the EVL operand equals "vscale * EC.Min".
+ if (EC.Scalable) {
+    // Retrieve the DataLayout from the enclosing module.
+ auto ParMod = this->getModule();
+ if (!ParMod)
+ return false;
+ const auto &DL = ParMod->getDataLayout();
+
+ // Compare vscale patterns
+ uint64_t VScaleFactor;
+ if (match(VLParam, m_c_Mul(m_ConstantInt(VScaleFactor), m_VScale(DL))))
+ return VScaleFactor >= EC.Min;
+ return (EC.Min == 1) && match(VLParam, m_VScale(DL));
+ }
+
+  // Standard (fixed-width) SIMD operation.
+ auto VLConst = dyn_cast<ConstantInt>(VLParam);
+ if (!VLConst)
+ return false;
+
+ uint64_t VLNum = VLConst->getZExtValue();
+ if (VLNum >= EC.Min)
+ return true;
+
+ return false;
+}
+
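// Illustrative IR (hypothetical example): for a fixed-width operation
//   %r = call <8 x i32> @llvm.vp.add.v8i32(<8 x i32> %a, <8 x i32> %b,
//                                          <8 x i1> %m, i32 8)
// the EVL equals the static vector length, so no lanes are masked off by it
// and canIgnoreVectorLengthParam() returns true; with `i32 4` it returns
// false. For a <vscale x 2 x i32> operation the same holds when the EVL is
// `mul (vscale, 2)`, or just `vscale` when the minimum element count is 1.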
Instruction::BinaryOps BinaryOpIntrinsic::getBinaryOp() const {
switch (getIntrinsicID()) {
- case Intrinsic::uadd_with_overflow:
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::uadd_sat:
- case Intrinsic::sadd_sat:
- return Instruction::Add;
- case Intrinsic::usub_with_overflow:
- case Intrinsic::ssub_with_overflow:
- case Intrinsic::usub_sat:
- case Intrinsic::ssub_sat:
- return Instruction::Sub;
- case Intrinsic::umul_with_overflow:
- case Intrinsic::smul_with_overflow:
- return Instruction::Mul;
- default:
- llvm_unreachable("Invalid intrinsic");
+ case Intrinsic::uadd_with_overflow:
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::sadd_sat:
+ return Instruction::Add;
+ case Intrinsic::usub_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::usub_sat:
+ case Intrinsic::ssub_sat:
+ return Instruction::Sub;
+ case Intrinsic::umul_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ return Instruction::Mul;
+ default:
+ llvm_unreachable("Invalid intrinsic");
}
}
bool BinaryOpIntrinsic::isSigned() const {
switch (getIntrinsicID()) {
- case Intrinsic::sadd_with_overflow:
- case Intrinsic::ssub_with_overflow:
- case Intrinsic::smul_with_overflow:
- case Intrinsic::sadd_sat:
- case Intrinsic::ssub_sat:
- return true;
- default:
- return false;
+ case Intrinsic::sadd_with_overflow:
+ case Intrinsic::ssub_with_overflow:
+ case Intrinsic::smul_with_overflow:
+ case Intrinsic::sadd_sat:
+ case Intrinsic::ssub_sat:
+ return true;
+ default:
+ return false;
}
}
diff --git a/llvm/lib/IR/LLVMContext.cpp b/llvm/lib/IR/LLVMContext.cpp
index cb13b27aa50f..7ebca5274369 100644
--- a/llvm/lib/IR/LLVMContext.cpp
+++ b/llvm/lib/IR/LLVMContext.cpp
@@ -19,9 +19,10 @@
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
-#include "llvm/IR/RemarkStreamer.h"
+#include "llvm/Remarks/RemarkStreamer.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
@@ -67,6 +68,16 @@ LLVMContext::LLVMContext() : pImpl(new LLVMContextImpl(*this)) {
"cfguardtarget operand bundle id drifted!");
(void)CFGuardTargetEntry;
+ auto *PreallocatedEntry = pImpl->getOrInsertBundleTag("preallocated");
+ assert(PreallocatedEntry->second == LLVMContext::OB_preallocated &&
+ "preallocated operand bundle id drifted!");
+ (void)PreallocatedEntry;
+
+ auto *GCLiveEntry = pImpl->getOrInsertBundleTag("gc-live");
+ assert(GCLiveEntry->second == LLVMContext::OB_gc_live &&
+ "gc-transition operand bundle id drifted!");
+ (void)GCLiveEntry;
+
SyncScope::ID SingleThreadSSID =
pImpl->getOrInsertSyncScopeID("singlethread");
assert(SingleThreadSSID == SyncScope::SingleThread &&
@@ -142,15 +153,26 @@ uint64_t LLVMContext::getDiagnosticsHotnessThreshold() const {
return pImpl->DiagnosticsHotnessThreshold;
}
-RemarkStreamer *LLVMContext::getRemarkStreamer() {
- return pImpl->RemarkDiagStreamer.get();
+remarks::RemarkStreamer *LLVMContext::getMainRemarkStreamer() {
+ return pImpl->MainRemarkStreamer.get();
}
-const RemarkStreamer *LLVMContext::getRemarkStreamer() const {
- return const_cast<LLVMContext *>(this)->getRemarkStreamer();
+const remarks::RemarkStreamer *LLVMContext::getMainRemarkStreamer() const {
+ return const_cast<LLVMContext *>(this)->getMainRemarkStreamer();
}
-void LLVMContext::setRemarkStreamer(
- std::unique_ptr<RemarkStreamer> RemarkStreamer) {
- pImpl->RemarkDiagStreamer = std::move(RemarkStreamer);
+void LLVMContext::setMainRemarkStreamer(
+ std::unique_ptr<remarks::RemarkStreamer> RemarkStreamer) {
+ pImpl->MainRemarkStreamer = std::move(RemarkStreamer);
+}
+
+LLVMRemarkStreamer *LLVMContext::getLLVMRemarkStreamer() {
+ return pImpl->LLVMRS.get();
+}
+const LLVMRemarkStreamer *LLVMContext::getLLVMRemarkStreamer() const {
+ return const_cast<LLVMContext *>(this)->getLLVMRemarkStreamer();
+}
+void LLVMContext::setLLVMRemarkStreamer(
+ std::unique_ptr<LLVMRemarkStreamer> RemarkStreamer) {
+ pImpl->LLVMRS = std::move(RemarkStreamer);
}
DiagnosticHandler::DiagnosticHandlerTy
@@ -214,7 +236,7 @@ LLVMContext::getDiagnosticMessagePrefix(DiagnosticSeverity Severity) {
void LLVMContext::diagnose(const DiagnosticInfo &DI) {
if (auto *OptDiagBase = dyn_cast<DiagnosticInfoOptimizationBase>(&DI))
- if (RemarkStreamer *RS = getRemarkStreamer())
+ if (LLVMRemarkStreamer *RS = getLLVMRemarkStreamer())
RS->emit(*OptDiagBase);
// If there is a report handler, use it.
@@ -265,6 +287,11 @@ void LLVMContext::getOperandBundleTags(SmallVectorImpl<StringRef> &Tags) const {
pImpl->getOperandBundleTags(Tags);
}
+StringMapEntry<uint32_t> *
+LLVMContext::getOrInsertBundleTag(StringRef TagName) const {
+ return pImpl->getOrInsertBundleTag(TagName);
+}
+
uint32_t LLVMContext::getOperandBundleTagID(StringRef Tag) const {
return pImpl->getOperandBundleTagID(Tag);
}
diff --git a/llvm/lib/IR/LLVMContextImpl.cpp b/llvm/lib/IR/LLVMContextImpl.cpp
index 5f9782714170..f197b3e67d30 100644
--- a/llvm/lib/IR/LLVMContextImpl.cpp
+++ b/llvm/lib/IR/LLVMContextImpl.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "LLVMContextImpl.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/OptBisect.h"
#include "llvm/IR/Type.h"
@@ -25,6 +26,7 @@ LLVMContextImpl::LLVMContextImpl(LLVMContext &C)
VoidTy(C, Type::VoidTyID),
LabelTy(C, Type::LabelTyID),
HalfTy(C, Type::HalfTyID),
+ BFloatTy(C, Type::BFloatTyID),
FloatTy(C, Type::FloatTyID),
DoubleTy(C, Type::DoubleTyID),
MetadataTy(C, Type::MetadataTyID),
@@ -103,21 +105,6 @@ LLVMContextImpl::~LLVMContextImpl() {
delete CDSConstant.second;
CDSConstants.clear();
- // Destroy attributes.
- for (FoldingSetIterator<AttributeImpl> I = AttrsSet.begin(),
- E = AttrsSet.end(); I != E; ) {
- FoldingSetIterator<AttributeImpl> Elem = I++;
- delete &*Elem;
- }
-
- // Destroy attribute lists.
- for (FoldingSetIterator<AttributeListImpl> I = AttrsLists.begin(),
- E = AttrsLists.end();
- I != E;) {
- FoldingSetIterator<AttributeListImpl> Elem = I++;
- delete &*Elem;
- }
-
// Destroy attribute node lists.
for (FoldingSetIterator<AttributeSetNode> I = AttrsSetNodes.begin(),
E = AttrsSetNodes.end(); I != E; ) {
@@ -142,18 +129,19 @@ LLVMContextImpl::~LLVMContextImpl() {
}
void LLVMContextImpl::dropTriviallyDeadConstantArrays() {
- bool Changed;
- do {
- Changed = false;
-
- for (auto I = ArrayConstants.begin(), E = ArrayConstants.end(); I != E;) {
- auto *C = *I++;
- if (C->use_empty()) {
- Changed = true;
- C->destroyConstant();
+ SmallSetVector<ConstantArray *, 4> WorkList(ArrayConstants.begin(),
+ ArrayConstants.end());
+
+ while (!WorkList.empty()) {
+ ConstantArray *C = WorkList.pop_back_val();
+ if (C->use_empty()) {
+ for (const Use &Op : C->operands()) {
+ if (auto *COp = dyn_cast<ConstantArray>(Op))
+ WorkList.insert(COp);
}
+ C->destroyConstant();
}
- } while (Changed);
+ }
}
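// Illustrative behaviour (hypothetical constants): destroying a dead outer
// constant such as
//   [2 x [2 x i32]] [[2 x i32] [i32 0, i32 1], [2 x i32] [i32 2, i32 3]]
// drops the last use of its inner [2 x i32] operands; those are pushed onto
// the worklist and destroyed in the same pass, rather than being found only
// by the repeated full scans of the previous fixed-point loop.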
void Module::dropTriviallyDeadConstantArrays() {
diff --git a/llvm/lib/IR/LLVMContextImpl.h b/llvm/lib/IR/LLVMContextImpl.h
index 6f5d5752b38d..1c7d8746d242 100644
--- a/llvm/lib/IR/LLVMContextImpl.h
+++ b/llvm/lib/IR/LLVMContextImpl.h
@@ -29,14 +29,13 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/RemarkStreamer.h"
#include "llvm/IR/TrackingMDRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
@@ -53,8 +52,7 @@
namespace llvm {
-class ConstantFP;
-class ConstantInt;
+class StringRef;
class Type;
class Value;
class ValueHandleBase;
@@ -325,49 +323,66 @@ template <> struct MDNodeKeyImpl<GenericDINode> : MDNodeOpsKey {
template <> struct MDNodeKeyImpl<DISubrange> {
Metadata *CountNode;
- int64_t LowerBound;
-
- MDNodeKeyImpl(Metadata *CountNode, int64_t LowerBound)
- : CountNode(CountNode), LowerBound(LowerBound) {}
+ Metadata *LowerBound;
+ Metadata *UpperBound;
+ Metadata *Stride;
+
+ MDNodeKeyImpl(Metadata *CountNode, Metadata *LowerBound, Metadata *UpperBound,
+ Metadata *Stride)
+ : CountNode(CountNode), LowerBound(LowerBound), UpperBound(UpperBound),
+ Stride(Stride) {}
MDNodeKeyImpl(const DISubrange *N)
- : CountNode(N->getRawCountNode()),
- LowerBound(N->getLowerBound()) {}
+ : CountNode(N->getRawCountNode()), LowerBound(N->getRawLowerBound()),
+ UpperBound(N->getRawUpperBound()), Stride(N->getRawStride()) {}
bool isKeyOf(const DISubrange *RHS) const {
- if (LowerBound != RHS->getLowerBound())
- return false;
-
- if (auto *RHSCount = RHS->getCount().dyn_cast<ConstantInt*>())
- if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
- if (RHSCount->getSExtValue() ==
- cast<ConstantInt>(MD->getValue())->getSExtValue())
+ auto BoundsEqual = [=](Metadata *Node1, Metadata *Node2) -> bool {
+ if (Node1 == Node2)
+ return true;
+
+ ConstantAsMetadata *MD1 = dyn_cast_or_null<ConstantAsMetadata>(Node1);
+ ConstantAsMetadata *MD2 = dyn_cast_or_null<ConstantAsMetadata>(Node2);
+ if (MD1 && MD2) {
+ ConstantInt *CV1 = cast<ConstantInt>(MD1->getValue());
+ ConstantInt *CV2 = cast<ConstantInt>(MD2->getValue());
+ if (CV1->getSExtValue() == CV2->getSExtValue())
return true;
+ }
+ return false;
+ };
- return CountNode == RHS->getRawCountNode();
+ return BoundsEqual(CountNode, RHS->getRawCountNode()) &&
+ BoundsEqual(LowerBound, RHS->getRawLowerBound()) &&
+ BoundsEqual(UpperBound, RHS->getRawUpperBound()) &&
+ BoundsEqual(Stride, RHS->getRawStride());
}
unsigned getHashValue() const {
- if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
- return hash_combine(cast<ConstantInt>(MD->getValue())->getSExtValue(),
- LowerBound);
- return hash_combine(CountNode, LowerBound);
+ if (CountNode)
+ if (auto *MD = dyn_cast<ConstantAsMetadata>(CountNode))
+ return hash_combine(cast<ConstantInt>(MD->getValue())->getSExtValue(),
+ LowerBound, UpperBound, Stride);
+ return hash_combine(CountNode, LowerBound, UpperBound, Stride);
}
};
template <> struct MDNodeKeyImpl<DIEnumerator> {
- int64_t Value;
+ APInt Value;
MDString *Name;
bool IsUnsigned;
- MDNodeKeyImpl(int64_t Value, bool IsUnsigned, MDString *Name)
+ MDNodeKeyImpl(APInt Value, bool IsUnsigned, MDString *Name)
: Value(Value), Name(Name), IsUnsigned(IsUnsigned) {}
+ MDNodeKeyImpl(int64_t Value, bool IsUnsigned, MDString *Name)
+ : Value(APInt(64, Value, !IsUnsigned)), Name(Name),
+ IsUnsigned(IsUnsigned) {}
MDNodeKeyImpl(const DIEnumerator *N)
: Value(N->getValue()), Name(N->getRawName()),
IsUnsigned(N->isUnsigned()) {}
bool isKeyOf(const DIEnumerator *RHS) const {
- return Value == RHS->getValue() && IsUnsigned == RHS->isUnsigned() &&
- Name == RHS->getRawName();
+ return APInt::isSameValue(Value, RHS->getValue()) &&
+ IsUnsigned == RHS->isUnsigned() && Name == RHS->getRawName();
}
unsigned getHashValue() const { return hash_combine(Value, Name); }
@@ -509,19 +524,21 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
Metadata *TemplateParams;
MDString *Identifier;
Metadata *Discriminator;
+ Metadata *DataLocation;
MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *File, unsigned Line,
Metadata *Scope, Metadata *BaseType, uint64_t SizeInBits,
uint32_t AlignInBits, uint64_t OffsetInBits, unsigned Flags,
Metadata *Elements, unsigned RuntimeLang,
Metadata *VTableHolder, Metadata *TemplateParams,
- MDString *Identifier, Metadata *Discriminator)
+ MDString *Identifier, Metadata *Discriminator,
+ Metadata *DataLocation)
: Tag(Tag), Name(Name), File(File), Line(Line), Scope(Scope),
BaseType(BaseType), SizeInBits(SizeInBits), OffsetInBits(OffsetInBits),
AlignInBits(AlignInBits), Flags(Flags), Elements(Elements),
RuntimeLang(RuntimeLang), VTableHolder(VTableHolder),
TemplateParams(TemplateParams), Identifier(Identifier),
- Discriminator(Discriminator) {}
+ Discriminator(Discriminator), DataLocation(DataLocation) {}
MDNodeKeyImpl(const DICompositeType *N)
: Tag(N->getTag()), Name(N->getRawName()), File(N->getRawFile()),
Line(N->getLine()), Scope(N->getRawScope()),
@@ -531,7 +548,8 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
RuntimeLang(N->getRuntimeLang()), VTableHolder(N->getRawVTableHolder()),
TemplateParams(N->getRawTemplateParams()),
Identifier(N->getRawIdentifier()),
- Discriminator(N->getRawDiscriminator()) {}
+ Discriminator(N->getRawDiscriminator()),
+ DataLocation(N->getRawDataLocation()) {}
bool isKeyOf(const DICompositeType *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
@@ -545,7 +563,8 @@ template <> struct MDNodeKeyImpl<DICompositeType> {
VTableHolder == RHS->getRawVTableHolder() &&
TemplateParams == RHS->getRawTemplateParams() &&
Identifier == RHS->getRawIdentifier() &&
- Discriminator == RHS->getRawDiscriminator();
+ Discriminator == RHS->getRawDiscriminator() &&
+ DataLocation == RHS->getRawDataLocation();
}
unsigned getHashValue() const {
@@ -815,67 +834,81 @@ template <> struct MDNodeKeyImpl<DICommonBlock> {
};
template <> struct MDNodeKeyImpl<DIModule> {
+ Metadata *File;
Metadata *Scope;
MDString *Name;
MDString *ConfigurationMacros;
MDString *IncludePath;
- MDString *SysRoot;
+ MDString *APINotesFile;
+ unsigned LineNo;
- MDNodeKeyImpl(Metadata *Scope, MDString *Name, MDString *ConfigurationMacros,
- MDString *IncludePath, MDString *SysRoot)
- : Scope(Scope), Name(Name), ConfigurationMacros(ConfigurationMacros),
- IncludePath(IncludePath), SysRoot(SysRoot) {}
+ MDNodeKeyImpl(Metadata *File, Metadata *Scope, MDString *Name,
+ MDString *ConfigurationMacros, MDString *IncludePath,
+ MDString *APINotesFile, unsigned LineNo)
+ : File(File), Scope(Scope), Name(Name),
+ ConfigurationMacros(ConfigurationMacros), IncludePath(IncludePath),
+ APINotesFile(APINotesFile), LineNo(LineNo) {}
MDNodeKeyImpl(const DIModule *N)
- : Scope(N->getRawScope()), Name(N->getRawName()),
+ : File(N->getRawFile()), Scope(N->getRawScope()), Name(N->getRawName()),
ConfigurationMacros(N->getRawConfigurationMacros()),
- IncludePath(N->getRawIncludePath()), SysRoot(N->getRawSysRoot()) {}
+ IncludePath(N->getRawIncludePath()),
+ APINotesFile(N->getRawAPINotesFile()), LineNo(N->getLineNo()) {}
bool isKeyOf(const DIModule *RHS) const {
return Scope == RHS->getRawScope() && Name == RHS->getRawName() &&
ConfigurationMacros == RHS->getRawConfigurationMacros() &&
IncludePath == RHS->getRawIncludePath() &&
- SysRoot == RHS->getRawSysRoot();
+ APINotesFile == RHS->getRawAPINotesFile() &&
+ File == RHS->getRawFile() && LineNo == RHS->getLineNo();
}
unsigned getHashValue() const {
- return hash_combine(Scope, Name,
- ConfigurationMacros, IncludePath, SysRoot);
+ return hash_combine(Scope, Name, ConfigurationMacros, IncludePath);
}
};
template <> struct MDNodeKeyImpl<DITemplateTypeParameter> {
MDString *Name;
Metadata *Type;
+ bool IsDefault;
- MDNodeKeyImpl(MDString *Name, Metadata *Type) : Name(Name), Type(Type) {}
+ MDNodeKeyImpl(MDString *Name, Metadata *Type, bool IsDefault)
+ : Name(Name), Type(Type), IsDefault(IsDefault) {}
MDNodeKeyImpl(const DITemplateTypeParameter *N)
- : Name(N->getRawName()), Type(N->getRawType()) {}
+ : Name(N->getRawName()), Type(N->getRawType()),
+ IsDefault(N->isDefault()) {}
bool isKeyOf(const DITemplateTypeParameter *RHS) const {
- return Name == RHS->getRawName() && Type == RHS->getRawType();
+ return Name == RHS->getRawName() && Type == RHS->getRawType() &&
+ IsDefault == RHS->isDefault();
}
- unsigned getHashValue() const { return hash_combine(Name, Type); }
+ unsigned getHashValue() const { return hash_combine(Name, Type, IsDefault); }
};
template <> struct MDNodeKeyImpl<DITemplateValueParameter> {
unsigned Tag;
MDString *Name;
Metadata *Type;
+ bool IsDefault;
Metadata *Value;
- MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *Type, Metadata *Value)
- : Tag(Tag), Name(Name), Type(Type), Value(Value) {}
+ MDNodeKeyImpl(unsigned Tag, MDString *Name, Metadata *Type, bool IsDefault,
+ Metadata *Value)
+ : Tag(Tag), Name(Name), Type(Type), IsDefault(IsDefault), Value(Value) {}
MDNodeKeyImpl(const DITemplateValueParameter *N)
: Tag(N->getTag()), Name(N->getRawName()), Type(N->getRawType()),
- Value(N->getValue()) {}
+ IsDefault(N->isDefault()), Value(N->getValue()) {}
bool isKeyOf(const DITemplateValueParameter *RHS) const {
return Tag == RHS->getTag() && Name == RHS->getRawName() &&
- Type == RHS->getRawType() && Value == RHS->getValue();
+ Type == RHS->getRawType() && IsDefault == RHS->isDefault() &&
+ Value == RHS->getValue();
}
- unsigned getHashValue() const { return hash_combine(Tag, Name, Type, Value); }
+ unsigned getHashValue() const {
+ return hash_combine(Tag, Name, Type, IsDefault, Value);
+ }
};
template <> struct MDNodeKeyImpl<DIGlobalVariable> {
@@ -1248,11 +1281,17 @@ public:
LLVMContext::InlineAsmDiagHandlerTy InlineAsmDiagHandler = nullptr;
void *InlineAsmDiagContext = nullptr;
+ /// The main remark streamer used by all the other streamers (e.g. IR, MIR,
+ /// frontends, etc.). This should only be used by the specific streamers, and
+ /// never directly.
+ std::unique_ptr<remarks::RemarkStreamer> MainRemarkStreamer;
+
std::unique_ptr<DiagnosticHandler> DiagHandler;
bool RespectDiagnosticFilters = false;
bool DiagnosticsHotnessRequested = false;
uint64_t DiagnosticsHotnessThreshold = 0;
- std::unique_ptr<RemarkStreamer> RemarkDiagStreamer;
+ /// The specialized remark streamer used by LLVM's OptimizationRemarkEmitter.
+ std::unique_ptr<LLVMRemarkStreamer> LLVMRS;
LLVMContext::YieldCallbackTy YieldCallback = nullptr;
void *YieldOpaqueHandle = nullptr;
@@ -1317,7 +1356,8 @@ public:
std::unique_ptr<ConstantTokenNone> TheNoneToken;
// Basic type instances.
- Type VoidTy, LabelTy, HalfTy, FloatTy, DoubleTy, MetadataTy, TokenTy;
+ Type VoidTy, LabelTy, HalfTy, BFloatTy, FloatTy, DoubleTy, MetadataTy,
+ TokenTy;
Type X86_FP80Ty, FP128Ty, PPC_FP128Ty, X86_MMXTy;
IntegerType Int1Ty, Int8Ty, Int16Ty, Int32Ty, Int64Ty, Int128Ty;
@@ -1364,9 +1404,6 @@ public:
/// instructions in different blocks at the same location.
DenseMap<std::pair<const char *, unsigned>, unsigned> DiscriminatorTable;
- int getOrAddScopeRecordIdxEntry(MDNode *N, int ExistingIdx);
- int getOrAddScopeInlinedAtIdxEntry(MDNode *Scope, MDNode *IA,int ExistingIdx);
-
/// A set of interned tags for operand bundles. The StringMap maps
/// bundle tags to their IDs.
///
diff --git a/llvm/lib/IR/RemarkStreamer.cpp b/llvm/lib/IR/LLVMRemarkStreamer.cpp
index cdbcc4f456c5..96001ab42c38 100644
--- a/llvm/lib/IR/RemarkStreamer.cpp
+++ b/llvm/lib/IR/LLVMRemarkStreamer.cpp
@@ -1,4 +1,4 @@
-//===- llvm/IR/RemarkStreamer.cpp - Remark Streamer -*- C++ -------------*-===//
+//===- llvm/IR/LLVMRemarkStreamer.cpp - Remark Streamer -*- C++ ---------*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -6,45 +6,19 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains the implementation of the remark outputting as part of
-// LLVMContext.
+// This file contains the implementation of the conversion between IR
+// Diagnostics and serializable remarks::Remark objects.
//
//===----------------------------------------------------------------------===//
-#include "llvm/IR/RemarkStreamer.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
-#include "llvm/Remarks/BitstreamRemarkSerializer.h"
-#include "llvm/Remarks/RemarkFormat.h"
-#include "llvm/Remarks/RemarkSerializer.h"
-#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/FileSystem.h"
using namespace llvm;
-static cl::opt<cl::boolOrDefault> EnableRemarksSection(
- "remarks-section",
- cl::desc(
- "Emit a section containing remark diagnostics metadata. By default, "
- "this is enabled for the following formats: yaml-strtab, bitstream."),
- cl::init(cl::BOU_UNSET), cl::Hidden);
-
-RemarkStreamer::RemarkStreamer(
- std::unique_ptr<remarks::RemarkSerializer> RemarkSerializer,
- Optional<StringRef> FilenameIn)
- : PassFilter(), RemarkSerializer(std::move(RemarkSerializer)),
- Filename(FilenameIn ? Optional<std::string>(FilenameIn->str()) : None) {}
-
-Error RemarkStreamer::setFilter(StringRef Filter) {
- Regex R = Regex(Filter);
- std::string RegexError;
- if (!R.isValid(RegexError))
- return createStringError(std::make_error_code(std::errc::invalid_argument),
- RegexError.data());
- PassFilter = std::move(R);
- return Error::success();
-}
-
/// DiagnosticKind -> remarks::Type
static remarks::Type toRemarkType(enum DiagnosticKind Kind) {
switch (Kind) {
@@ -81,7 +55,7 @@ toRemarkLocation(const DiagnosticLocation &DL) {
/// LLVM Diagnostic -> Remark
remarks::Remark
-RemarkStreamer::toRemark(const DiagnosticInfoOptimizationBase &Diag) {
+LLVMRemarkStreamer::toRemark(const DiagnosticInfoOptimizationBase &Diag) const {
remarks::Remark R; // The result.
R.RemarkType = toRemarkType(static_cast<DiagnosticKind>(Diag.getKind()));
R.PassName = Diag.getPassName();
@@ -101,51 +75,24 @@ RemarkStreamer::toRemark(const DiagnosticInfoOptimizationBase &Diag) {
return R;
}
-void RemarkStreamer::emit(const DiagnosticInfoOptimizationBase &Diag) {
- if (Optional<Regex> &Filter = PassFilter)
- if (!Filter->match(Diag.getPassName()))
+void LLVMRemarkStreamer::emit(const DiagnosticInfoOptimizationBase &Diag) {
+ if (!RS.matchesFilter(Diag.getPassName()))
return;
// First, convert the diagnostic to a remark.
remarks::Remark R = toRemark(Diag);
// Then, emit the remark through the serializer.
- RemarkSerializer->emit(R);
+ RS.getSerializer().emit(R);
}
-bool RemarkStreamer::needsSection() const {
- if (EnableRemarksSection == cl::BOU_TRUE)
- return true;
-
- if (EnableRemarksSection == cl::BOU_FALSE)
- return false;
+char LLVMRemarkSetupFileError::ID = 0;
+char LLVMRemarkSetupPatternError::ID = 0;
+char LLVMRemarkSetupFormatError::ID = 0;
- assert(EnableRemarksSection == cl::BOU_UNSET);
-
- // We only need a section if we're in separate mode.
- if (RemarkSerializer->Mode != remarks::SerializerMode::Separate)
- return false;
-
- // Only some formats need a section:
- // * bitstream
- // * yaml-strtab
- switch (RemarkSerializer->SerializerFormat) {
- case remarks::Format::YAMLStrTab:
- case remarks::Format::Bitstream:
- return true;
- default:
- return false;
- }
-}
-
-char RemarkSetupFileError::ID = 0;
-char RemarkSetupPatternError::ID = 0;
-char RemarkSetupFormatError::ID = 0;
-
-Expected<std::unique_ptr<ToolOutputFile>>
-llvm::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
- StringRef RemarksPasses, StringRef RemarksFormat,
- bool RemarksWithHotness,
- unsigned RemarksHotnessThreshold) {
+Expected<std::unique_ptr<ToolOutputFile>> llvm::setupLLVMOptimizationRemarks(
+ LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses,
+ StringRef RemarksFormat, bool RemarksWithHotness,
+ unsigned RemarksHotnessThreshold) {
if (RemarksWithHotness)
Context.setDiagnosticsHotnessRequested(true);
@@ -157,7 +104,7 @@ llvm::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
Expected<remarks::Format> Format = remarks::parseFormat(RemarksFormat);
if (Error E = Format.takeError())
- return make_error<RemarkSetupFormatError>(std::move(E));
+ return make_error<LLVMRemarkSetupFormatError>(std::move(E));
std::error_code EC;
auto Flags = *Format == remarks::Format::YAML ? sys::fs::OF_Text
@@ -167,29 +114,34 @@ llvm::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
// We don't use llvm::FileError here because some diagnostics want the file
// name separately.
if (EC)
- return make_error<RemarkSetupFileError>(errorCodeToError(EC));
+ return make_error<LLVMRemarkSetupFileError>(errorCodeToError(EC));
Expected<std::unique_ptr<remarks::RemarkSerializer>> RemarkSerializer =
remarks::createRemarkSerializer(
*Format, remarks::SerializerMode::Separate, RemarksFile->os());
if (Error E = RemarkSerializer.takeError())
- return make_error<RemarkSetupFormatError>(std::move(E));
+ return make_error<LLVMRemarkSetupFormatError>(std::move(E));
- Context.setRemarkStreamer(std::make_unique<RemarkStreamer>(
+ // Create the main remark streamer.
+ Context.setMainRemarkStreamer(std::make_unique<remarks::RemarkStreamer>(
std::move(*RemarkSerializer), RemarksFilename));
+ // Create LLVM's optimization remarks streamer.
+ Context.setLLVMRemarkStreamer(
+ std::make_unique<LLVMRemarkStreamer>(*Context.getMainRemarkStreamer()));
+
if (!RemarksPasses.empty())
- if (Error E = Context.getRemarkStreamer()->setFilter(RemarksPasses))
- return make_error<RemarkSetupPatternError>(std::move(E));
+ if (Error E = Context.getMainRemarkStreamer()->setFilter(RemarksPasses))
+ return make_error<LLVMRemarkSetupPatternError>(std::move(E));
return std::move(RemarksFile);
}
-Error llvm::setupOptimizationRemarks(LLVMContext &Context, raw_ostream &OS,
- StringRef RemarksPasses,
- StringRef RemarksFormat,
- bool RemarksWithHotness,
- unsigned RemarksHotnessThreshold) {
+Error llvm::setupLLVMOptimizationRemarks(LLVMContext &Context, raw_ostream &OS,
+ StringRef RemarksPasses,
+ StringRef RemarksFormat,
+ bool RemarksWithHotness,
+ unsigned RemarksHotnessThreshold) {
if (RemarksWithHotness)
Context.setDiagnosticsHotnessRequested(true);
@@ -198,20 +150,25 @@ Error llvm::setupOptimizationRemarks(LLVMContext &Context, raw_ostream &OS,
Expected<remarks::Format> Format = remarks::parseFormat(RemarksFormat);
if (Error E = Format.takeError())
- return make_error<RemarkSetupFormatError>(std::move(E));
+ return make_error<LLVMRemarkSetupFormatError>(std::move(E));
Expected<std::unique_ptr<remarks::RemarkSerializer>> RemarkSerializer =
remarks::createRemarkSerializer(*Format,
remarks::SerializerMode::Separate, OS);
if (Error E = RemarkSerializer.takeError())
- return make_error<RemarkSetupFormatError>(std::move(E));
+ return make_error<LLVMRemarkSetupFormatError>(std::move(E));
+
+ // Create the main remark streamer.
+ Context.setMainRemarkStreamer(
+ std::make_unique<remarks::RemarkStreamer>(std::move(*RemarkSerializer)));
- Context.setRemarkStreamer(
- std::make_unique<RemarkStreamer>(std::move(*RemarkSerializer)));
+ // Create LLVM's optimization remarks streamer.
+ Context.setLLVMRemarkStreamer(
+ std::make_unique<LLVMRemarkStreamer>(*Context.getMainRemarkStreamer()));
if (!RemarksPasses.empty())
- if (Error E = Context.getRemarkStreamer()->setFilter(RemarksPasses))
- return make_error<RemarkSetupPatternError>(std::move(E));
+ if (Error E = Context.getMainRemarkStreamer()->setFilter(RemarksPasses))
+ return make_error<LLVMRemarkSetupPatternError>(std::move(E));
return Error::success();
}
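As a rough usage sketch of the renamed entry point (based on the signatures in this patch; the include paths, file name, and pass filter are illustrative):

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ToolOutputFile.h"

using namespace llvm;

// Enable serialized optimization remarks on a context, roughly as a tool
// driver would ("out.opt.yaml" and the "inline" filter are placeholders).
static Error enableRemarks(LLVMContext &Ctx) {
  Expected<std::unique_ptr<ToolOutputFile>> FileOrErr =
      setupLLVMOptimizationRemarks(Ctx, /*RemarksFilename=*/"out.opt.yaml",
                                   /*RemarksPasses=*/"inline",
                                   /*RemarksFormat=*/"yaml",
                                   /*RemarksWithHotness=*/false,
                                   /*RemarksHotnessThreshold=*/0);
  if (!FileOrErr)
    return FileOrErr.takeError();
  (*FileOrErr)->keep(); // keep the remarks file once setup succeeded
  return Error::success();
}

After this call the context carries two objects: the generic remarks::RemarkStreamer that owns the serializer and pass filter, and the LLVMRemarkStreamer that converts IR diagnostics into remarks and feeds them to it.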
diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp
index 90239bb76298..74869fa62c66 100644
--- a/llvm/lib/IR/LegacyPassManager.cpp
+++ b/llvm/lib/IR/LegacyPassManager.cpp
@@ -33,7 +33,6 @@
#include <algorithm>
#include <unordered_set>
using namespace llvm;
-using namespace llvm::legacy;
// See PassManagers.h for Pass Manager infrastructure overview.
@@ -132,7 +131,8 @@ bool llvm::forcePrintModuleIR() { return PrintModuleScope; }
bool llvm::isFunctionInPrintList(StringRef FunctionName) {
static std::unordered_set<std::string> PrintFuncNames(PrintFuncsList.begin(),
PrintFuncsList.end());
- return PrintFuncNames.empty() || PrintFuncNames.count(FunctionName);
+ return PrintFuncNames.empty() ||
+ PrintFuncNames.count(std::string(FunctionName));
}
/// isPassDebuggingExecutionsOrMore - Return true if -debug-pass=Executions
/// or higher is specified.
@@ -239,7 +239,7 @@ void PMDataManager::emitInstrCountChangedRemark(
// Helper lambda that emits a remark when the size of a function has changed.
auto EmitFunctionSizeChangedRemark = [&FunctionToInstrCount, &F, &BB,
- &PassName](const std::string &Fname) {
+ &PassName](StringRef Fname) {
unsigned FnCountBefore, FnCountAfter;
std::pair<unsigned, unsigned> &Change = FunctionToInstrCount[Fname];
std::tie(FnCountBefore, FnCountAfter) = Change;
@@ -386,8 +386,68 @@ public:
void FunctionPassManagerImpl::anchor() {}
char FunctionPassManagerImpl::ID = 0;
-} // End of legacy namespace
-} // End of llvm namespace
+
+//===----------------------------------------------------------------------===//
+// FunctionPassManagerImpl implementation
+//
+bool FunctionPassManagerImpl::doInitialization(Module &M) {
+ bool Changed = false;
+
+ dumpArguments();
+ dumpPasses();
+
+ for (ImmutablePass *ImPass : getImmutablePasses())
+ Changed |= ImPass->doInitialization(M);
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ Changed |= getContainedManager(Index)->doInitialization(M);
+
+ return Changed;
+}
+
+bool FunctionPassManagerImpl::doFinalization(Module &M) {
+ bool Changed = false;
+
+ for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
+ Changed |= getContainedManager(Index)->doFinalization(M);
+
+ for (ImmutablePass *ImPass : getImmutablePasses())
+ Changed |= ImPass->doFinalization(M);
+
+ return Changed;
+}
+
+void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
+ if (!wasRun)
+ return;
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+ FPPassManager *FPPM = getContainedManager(Index);
+ for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
+ FPPM->getContainedPass(Index)->releaseMemory();
+ }
+ }
+ wasRun = false;
+}
+
+// Execute all the passes managed by this top level manager.
+// Return true if any function is modified by a pass.
+bool FunctionPassManagerImpl::run(Function &F) {
+ bool Changed = false;
+
+ initializeAllAnalysisInfo();
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+ Changed |= getContainedManager(Index)->runOnFunction(F);
+ F.getContext().yield();
+ }
+
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
+ getContainedManager(Index)->cleanup();
+
+ wasRun = true;
+ return Changed;
+}
+} // namespace legacy
+} // namespace llvm
namespace {
//===----------------------------------------------------------------------===//
@@ -405,7 +465,7 @@ public:
// Delete on the fly managers.
~MPPassManager() override {
for (auto &OnTheFlyManager : OnTheFlyManagers) {
- FunctionPassManagerImpl *FPP = OnTheFlyManager.second;
+ legacy::FunctionPassManagerImpl *FPP = OnTheFlyManager.second;
delete FPP;
}
}
@@ -436,7 +496,8 @@ public:
/// Return function pass corresponding to PassInfo PI, that is
/// required by module pass MP. Instantiate analysis pass, by using
/// its runOnFunction() for function F.
- Pass* getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F) override;
+ std::tuple<Pass *, bool> getOnTheFlyPass(Pass *MP, AnalysisID PI,
+ Function &F) override;
StringRef getPassName() const override { return "Module Pass Manager"; }
@@ -449,7 +510,7 @@ public:
for (unsigned Index = 0; Index < getNumContainedPasses(); ++Index) {
ModulePass *MP = getContainedPass(Index);
MP->dumpPassStructure(Offset + 1);
- MapVector<Pass *, FunctionPassManagerImpl *>::const_iterator I =
+ MapVector<Pass *, legacy::FunctionPassManagerImpl *>::const_iterator I =
OnTheFlyManagers.find(MP);
if (I != OnTheFlyManagers.end())
I->second->dumpPassStructure(Offset + 2);
@@ -469,7 +530,7 @@ public:
private:
/// Collection of on the fly FPPassManagers. These managers manage
/// function passes that are required by module passes.
- MapVector<Pass *, FunctionPassManagerImpl *> OnTheFlyManagers;
+ MapVector<Pass *, legacy::FunctionPassManagerImpl *> OnTheFlyManagers;
};
char MPPassManager::ID = 0;
@@ -532,8 +593,35 @@ public:
void PassManagerImpl::anchor() {}
char PassManagerImpl::ID = 0;
-} // End of legacy namespace
-} // End of llvm namespace
+
+//===----------------------------------------------------------------------===//
+// PassManagerImpl implementation
+
+//
+/// run - Execute all of the passes scheduled for execution. Keep track of
+/// whether any of the passes modifies the module, and if so, return true.
+bool PassManagerImpl::run(Module &M) {
+ bool Changed = false;
+
+ dumpArguments();
+ dumpPasses();
+
+ for (ImmutablePass *ImPass : getImmutablePasses())
+ Changed |= ImPass->doInitialization(M);
+
+ initializeAllAnalysisInfo();
+ for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
+ Changed |= getContainedManager(Index)->runOnModule(M);
+ M.getContext().yield();
+ }
+
+ for (ImmutablePass *ImPass : getImmutablePasses())
+ Changed |= ImPass->doFinalization(M);
+
+ return Changed;
+}
+} // namespace legacy
+} // namespace llvm
//===----------------------------------------------------------------------===//
// PMTopLevelManager implementation
@@ -1289,7 +1377,8 @@ void PMDataManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
llvm_unreachable("Unable to schedule pass");
}
-Pass *PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI, Function &F) {
+std::tuple<Pass *, bool> PMDataManager::getOnTheFlyPass(Pass *P, AnalysisID PI,
+ Function &F) {
llvm_unreachable("Unable to find on the fly pass");
}
@@ -1306,17 +1395,20 @@ Pass *AnalysisResolver::getAnalysisIfAvailable(AnalysisID ID, bool dir) const {
return PM.findAnalysisPass(ID, dir);
}
-Pass *AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI,
- Function &F) {
+std::tuple<Pass *, bool>
+AnalysisResolver::findImplPass(Pass *P, AnalysisID AnalysisPI, Function &F) {
return PM.getOnTheFlyPass(P, AnalysisPI, F);
}
+namespace llvm {
+namespace legacy {
+
//===----------------------------------------------------------------------===//
// FunctionPassManager implementation
/// Create new Function pass manager
FunctionPassManager::FunctionPassManager(Module *m) : M(m) {
- FPM = new FunctionPassManagerImpl();
+ FPM = new legacy::FunctionPassManagerImpl();
// FPM is the top level manager.
FPM->setTopLevelManager(FPM);
@@ -1355,36 +1447,8 @@ bool FunctionPassManager::doInitialization() {
bool FunctionPassManager::doFinalization() {
return FPM->doFinalization(*M);
}
-
-//===----------------------------------------------------------------------===//
-// FunctionPassManagerImpl implementation
-//
-bool FunctionPassManagerImpl::doInitialization(Module &M) {
- bool Changed = false;
-
- dumpArguments();
- dumpPasses();
-
- for (ImmutablePass *ImPass : getImmutablePasses())
- Changed |= ImPass->doInitialization(M);
-
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
- Changed |= getContainedManager(Index)->doInitialization(M);
-
- return Changed;
-}
-
-bool FunctionPassManagerImpl::doFinalization(Module &M) {
- bool Changed = false;
-
- for (int Index = getNumContainedManagers() - 1; Index >= 0; --Index)
- Changed |= getContainedManager(Index)->doFinalization(M);
-
- for (ImmutablePass *ImPass : getImmutablePasses())
- Changed |= ImPass->doFinalization(M);
-
- return Changed;
-}
+} // namespace legacy
+} // namespace llvm
/// cleanup - After running all passes, clean up pass manager cache.
void FPPassManager::cleanup() {
@@ -1396,35 +1460,6 @@ void FPPassManager::cleanup() {
}
}
-void FunctionPassManagerImpl::releaseMemoryOnTheFly() {
- if (!wasRun)
- return;
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
- FPPassManager *FPPM = getContainedManager(Index);
- for (unsigned Index = 0; Index < FPPM->getNumContainedPasses(); ++Index) {
- FPPM->getContainedPass(Index)->releaseMemory();
- }
- }
- wasRun = false;
-}
-
-// Execute all the passes managed by this top level manager.
-// Return true if any function is modified by a pass.
-bool FunctionPassManagerImpl::run(Function &F) {
- bool Changed = false;
-
- initializeAllAnalysisInfo();
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
- Changed |= getContainedManager(Index)->runOnFunction(F);
- F.getContext().yield();
- }
-
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index)
- getContainedManager(Index)->cleanup();
-
- wasRun = true;
- return Changed;
-}
//===----------------------------------------------------------------------===//
// FPPassManager implementation
@@ -1440,6 +1475,74 @@ void FPPassManager::dumpPassStructure(unsigned Offset) {
}
}
+#ifdef EXPENSIVE_CHECKS
+namespace {
+namespace details {
+
+// Basic hashing mechanism to detect structural change to the IR, used to verify
+// pass return status consistency with actual change. Loosely copied from
+// llvm/lib/Transforms/Utils/FunctionComparator.cpp
+
+class StructuralHash {
+ uint64_t Hash = 0x6acaa36bef8325c5ULL;
+
+ void update(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
+
+public:
+ StructuralHash() = default;
+
+ void update(Function &F) {
+ if (F.empty())
+ return;
+
+ update(F.isVarArg());
+ update(F.arg_size());
+
+ SmallVector<const BasicBlock *, 8> BBs;
+ SmallPtrSet<const BasicBlock *, 16> VisitedBBs;
+
+ BBs.push_back(&F.getEntryBlock());
+ VisitedBBs.insert(BBs[0]);
+ while (!BBs.empty()) {
+ const BasicBlock *BB = BBs.pop_back_val();
+ update(45798); // Block header
+ for (auto &Inst : *BB)
+ update(Inst.getOpcode());
+
+ const Instruction *Term = BB->getTerminator();
+ for (unsigned i = 0, e = Term->getNumSuccessors(); i != e; ++i) {
+ if (!VisitedBBs.insert(Term->getSuccessor(i)).second)
+ continue;
+ BBs.push_back(Term->getSuccessor(i));
+ }
+ }
+ }
+
+ void update(Module &M) {
+ for (Function &F : M)
+ update(F);
+ }
+
+ uint64_t getHash() const { return Hash; }
+};
+
+} // namespace details
+
+uint64_t StructuralHash(Function &F) {
+ details::StructuralHash H;
+ H.update(F);
+ return H.getHash();
+}
+
+uint64_t StructuralHash(Module &M) {
+ details::StructuralHash H;
+ H.update(M);
+ return H.getHash();
+}
+
+} // end anonymous namespace
+
+#endif
/// Execute all of the passes scheduled for execution by invoking
/// runOnFunction method. Keep track of whether any of the passes modifies
@@ -1478,7 +1581,16 @@ bool FPPassManager::runOnFunction(Function &F) {
{
PassManagerPrettyStackEntry X(FP, F);
TimeRegion PassTimer(getPassTimer(FP));
+#ifdef EXPENSIVE_CHECKS
+ uint64_t RefHash = StructuralHash(F);
+#endif
LocalChanged |= FP->runOnFunction(F);
+
+#ifdef EXPENSIVE_CHECKS
+ assert((LocalChanged || (RefHash == StructuralHash(F))) &&
+ "Pass modifies its input and doesn't report it.");
+#endif
+
if (EmitICRemark) {
unsigned NewSize = F.getInstructionCount();
@@ -1551,7 +1663,7 @@ MPPassManager::runOnModule(Module &M) {
// Initialize on-the-fly passes
for (auto &OnTheFlyManager : OnTheFlyManagers) {
- FunctionPassManagerImpl *FPP = OnTheFlyManager.second;
+ legacy::FunctionPassManagerImpl *FPP = OnTheFlyManager.second;
Changed |= FPP->doInitialization(M);
}
@@ -1579,7 +1691,17 @@ MPPassManager::runOnModule(Module &M) {
PassManagerPrettyStackEntry X(MP, M);
TimeRegion PassTimer(getPassTimer(MP));
+#ifdef EXPENSIVE_CHECKS
+ uint64_t RefHash = StructuralHash(M);
+#endif
+
LocalChanged |= MP->runOnModule(M);
+
+#ifdef EXPENSIVE_CHECKS
+ assert((LocalChanged || (RefHash == StructuralHash(M))) &&
+ "Pass modifies its input and doesn't report it.");
+#endif
+
if (EmitICRemark) {
// Update the size of the module.
unsigned ModuleCount = M.getInstructionCount();
@@ -1612,7 +1734,7 @@ MPPassManager::runOnModule(Module &M) {
// Finalize on-the-fly passes
for (auto &OnTheFlyManager : OnTheFlyManagers) {
- FunctionPassManagerImpl *FPP = OnTheFlyManager.second;
+ legacy::FunctionPassManagerImpl *FPP = OnTheFlyManager.second;
// We don't know when is the last time an on-the-fly pass is run,
// so we need to releaseMemory / finalize here
FPP->releaseMemoryOnTheFly();
@@ -1633,9 +1755,9 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
RequiredPass->getPotentialPassManagerType()) &&
"Unable to handle Pass that requires lower level Analysis pass");
- FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
+ legacy::FunctionPassManagerImpl *FPP = OnTheFlyManagers[P];
if (!FPP) {
- FPP = new FunctionPassManagerImpl();
+ FPP = new legacy::FunctionPassManagerImpl();
// FPP is the top level manager.
FPP->setTopLevelManager(FPP);
@@ -1664,42 +1786,19 @@ void MPPassManager::addLowerLevelRequiredPass(Pass *P, Pass *RequiredPass) {
/// Return function pass corresponding to PassInfo PI, that is
/// required by module pass MP. Instantiate analysis pass, by using
/// its runOnFunction() for function F.
-Pass* MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI, Function &F){
- FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
+std::tuple<Pass *, bool> MPPassManager::getOnTheFlyPass(Pass *MP, AnalysisID PI,
+ Function &F) {
+ legacy::FunctionPassManagerImpl *FPP = OnTheFlyManagers[MP];
assert(FPP && "Unable to find on the fly pass");
FPP->releaseMemoryOnTheFly();
- FPP->run(F);
- return ((PMTopLevelManager*)FPP)->findAnalysisPass(PI);
+ bool Changed = FPP->run(F);
+ return std::make_tuple(((PMTopLevelManager *)FPP)->findAnalysisPass(PI),
+ Changed);
}
-
-//===----------------------------------------------------------------------===//
-// PassManagerImpl implementation
-
-//
-/// run - Execute all of the passes scheduled for execution. Keep track of
-/// whether any of the passes modifies the module, and if so, return true.
-bool PassManagerImpl::run(Module &M) {
- bool Changed = false;
-
- dumpArguments();
- dumpPasses();
-
- for (ImmutablePass *ImPass : getImmutablePasses())
- Changed |= ImPass->doInitialization(M);
-
- initializeAllAnalysisInfo();
- for (unsigned Index = 0; Index < getNumContainedManagers(); ++Index) {
- Changed |= getContainedManager(Index)->runOnModule(M);
- M.getContext().yield();
- }
-
- for (ImmutablePass *ImPass : getImmutablePasses())
- Changed |= ImPass->doFinalization(M);
-
- return Changed;
-}
+namespace llvm {
+namespace legacy {
//===----------------------------------------------------------------------===//
// PassManager implementation
@@ -1724,6 +1823,8 @@ void PassManager::add(Pass *P) {
bool PassManager::run(Module &M) {
return PM->run(M);
}
+} // namespace legacy
+} // namespace llvm
//===----------------------------------------------------------------------===//
// PMStack implementation
@@ -1814,4 +1915,4 @@ void FunctionPass::assignPassManager(PMStack &PMS,
PM->add(this);
}
-PassManagerBase::~PassManagerBase() {}
+legacy::PassManagerBase::~PassManagerBase() {}
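The EXPENSIVE_CHECKS helper added above only has to notice that a pass touched the IR while claiming it did not, so it hashes coarse structure (arity, reachable block layout, instruction opcodes) rather than full semantics. A toy version of the same chaining idea, with a simple stand-in mixing step instead of hashing::detail::hash_16_bytes:

#include <cstdint>
#include <vector>

// Toy structural hash in the spirit of the EXPENSIVE_CHECKS helper: chain
// coarse features (a per-block marker plus instruction opcodes) into a
// single value.
class ToyStructuralHash {
  uint64_t Hash = 0x6acaa36bef8325c5ULL;
  void update(uint64_t V) { Hash = (Hash ^ V) * 0x100000001b3ULL; }

public:
  void updateBlock(const std::vector<unsigned> &Opcodes) {
    update(45798); // block header marker, mirroring the patch
    for (unsigned Op : Opcodes)
      update(Op);
  }
  uint64_t get() const { return Hash; }
};

A pass that reports no change must leave such a value identical before and after it runs, which is what the new asserts in FPPassManager::runOnFunction and MPPassManager::runOnModule enforce.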
diff --git a/llvm/lib/IR/MDBuilder.cpp b/llvm/lib/IR/MDBuilder.cpp
index 7bdb85ace522..40d70f43132d 100644
--- a/llvm/lib/IR/MDBuilder.cpp
+++ b/llvm/lib/IR/MDBuilder.cpp
@@ -68,7 +68,7 @@ MDNode *MDBuilder::createFunctionEntryCount(
Ops.push_back(createConstant(ConstantInt::get(Int64Ty, Count)));
if (Imports) {
SmallVector<GlobalValue::GUID, 2> OrderID(Imports->begin(), Imports->end());
- llvm::stable_sort(OrderID);
+ llvm::sort(OrderID);
for (auto ID : OrderID)
Ops.push_back(createConstant(ConstantInt::get(Int64Ty, ID)));
}
diff --git a/llvm/lib/IR/Mangler.cpp b/llvm/lib/IR/Mangler.cpp
index d73f748b0584..0d66e321c396 100644
--- a/llvm/lib/IR/Mangler.cpp
+++ b/llvm/lib/IR/Mangler.cpp
@@ -94,15 +94,18 @@ static void addByteCountSuffix(raw_ostream &OS, const Function *F,
const DataLayout &DL) {
// Calculate arguments size total.
unsigned ArgWords = 0;
+
+ const unsigned PtrSize = DL.getPointerSize();
+
for (Function::const_arg_iterator AI = F->arg_begin(), AE = F->arg_end();
AI != AE; ++AI) {
- Type *Ty = AI->getType();
// 'Dereference' type in case of byval or inalloca parameter attribute.
- if (AI->hasByValOrInAllocaAttr())
- Ty = cast<PointerType>(Ty)->getElementType();
+ uint64_t AllocSize = AI->hasPassPointeeByValueAttr() ?
+ AI->getPassPointeeByValueCopySize(DL) :
+ DL.getTypeAllocSize(AI->getType());
+
// Size should be aligned to pointer size.
- unsigned PtrSize = DL.getPointerSize();
- ArgWords += alignTo(DL.getTypeAllocSize(Ty), PtrSize);
+ ArgWords += alignTo(AllocSize, PtrSize);
}
OS << '@' << ArgWords;
diff --git a/llvm/lib/IR/Metadata.cpp b/llvm/lib/IR/Metadata.cpp
index de092894d30c..ce89009e86eb 100644
--- a/llvm/lib/IR/Metadata.cpp
+++ b/llvm/lib/IR/Metadata.cpp
@@ -914,7 +914,7 @@ MDNode *MDNode::intersect(MDNode *A, MDNode *B) {
SmallSetVector<Metadata *, 4> MDs(A->op_begin(), A->op_end());
SmallPtrSet<Metadata *, 4> BSet(B->op_begin(), B->op_end());
- MDs.remove_if([&](Metadata *MD) { return !is_contained(BSet, MD); });
+ MDs.remove_if([&](Metadata *MD) { return !BSet.count(MD); });
// FIXME: This preserves long-standing behaviour, but is it really the right
// behaviour? Or was that an unintended side-effect of node uniquing?
@@ -934,7 +934,7 @@ MDNode *MDNode::getMostGenericFPMath(MDNode *A, MDNode *B) {
APFloat AVal = mdconst::extract<ConstantFP>(A->getOperand(0))->getValueAPF();
APFloat BVal = mdconst::extract<ConstantFP>(B->getOperand(0))->getValueAPF();
- if (AVal.compare(BVal) == APFloat::cmpLessThan)
+ if (AVal < BVal)
return A;
return B;
}
@@ -1500,7 +1500,10 @@ void GlobalObject::addTypeMetadata(unsigned Offset, Metadata *TypeID) {
TypeID}));
}
-void GlobalObject::addVCallVisibilityMetadata(VCallVisibility Visibility) {
+void GlobalObject::setVCallVisibilityMetadata(VCallVisibility Visibility) {
+ // Remove any existing vcall visibility metadata first in case we are
+ // updating.
+ eraseMetadata(LLVMContext::MD_vcall_visibility);
addMetadata(LLVMContext::MD_vcall_visibility,
*MDNode::get(getContext(),
{ConstantAsMetadata::get(ConstantInt::get(
diff --git a/llvm/lib/IR/Module.cpp b/llvm/lib/IR/Module.cpp
index 271ae126d722..3ea181a9b48d 100644
--- a/llvm/lib/IR/Module.cpp
+++ b/llvm/lib/IR/Module.cpp
@@ -33,6 +33,7 @@
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/IR/ModuleSummaryIndex.h"
#include "llvm/IR/SymbolTableListTraits.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/TypeFinder.h"
@@ -71,9 +72,9 @@ template class llvm::SymbolTableListTraits<GlobalIFunc>;
//
Module::Module(StringRef MID, LLVMContext &C)
- : Context(C), Materializer(), ModuleID(MID), SourceFileName(MID), DL("") {
- ValSymTab = new ValueSymbolTable();
- NamedMDSymTab = new StringMap<NamedMDNode *>();
+ : Context(C), ValSymTab(std::make_unique<ValueSymbolTable>()),
+ Materializer(), ModuleID(std::string(MID)),
+ SourceFileName(std::string(MID)), DL("") {
Context.addModule(this);
}
@@ -84,13 +85,11 @@ Module::~Module() {
FunctionList.clear();
AliasList.clear();
IFuncList.clear();
- NamedMDList.clear();
- delete ValSymTab;
- delete static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab);
}
-std::unique_ptr<RandomNumberGenerator> Module::createRNG(const Pass* P) const {
- SmallString<32> Salt(P->getPassName());
+std::unique_ptr<RandomNumberGenerator>
+Module::createRNG(const StringRef Name) const {
+ SmallString<32> Salt(Name);
// This RNG is guaranteed to produce the same random stream only
// when the Module ID and thus the input filename is the same. This
@@ -104,7 +103,8 @@ std::unique_ptr<RandomNumberGenerator> Module::createRNG(const Pass* P) const {
// store salt metadata from the Module constructor.
Salt += sys::path::filename(getModuleIdentifier());
- return std::unique_ptr<RandomNumberGenerator>(new RandomNumberGenerator(Salt));
+ return std::unique_ptr<RandomNumberGenerator>(
+ new RandomNumberGenerator(Salt));
}
/// getNamedValue - Return the first global value in the module with
@@ -250,15 +250,14 @@ GlobalIFunc *Module::getNamedIFunc(StringRef Name) const {
NamedMDNode *Module::getNamedMetadata(const Twine &Name) const {
SmallString<256> NameData;
StringRef NameRef = Name.toStringRef(NameData);
- return static_cast<StringMap<NamedMDNode*> *>(NamedMDSymTab)->lookup(NameRef);
+ return NamedMDSymTab.lookup(NameRef);
}
/// getOrInsertNamedMetadata - Return the first named MDNode in the module
/// with the specified name. This method returns a new NamedMDNode if a
/// NamedMDNode with the specified name is not found.
NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) {
- NamedMDNode *&NMD =
- (*static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab))[Name];
+ NamedMDNode *&NMD = NamedMDSymTab[Name];
if (!NMD) {
NMD = new NamedMDNode(Name);
NMD->setParent(this);
@@ -270,7 +269,7 @@ NamedMDNode *Module::getOrInsertNamedMetadata(StringRef Name) {
/// eraseNamedMetadata - Remove the given NamedMDNode from this module and
/// delete it.
void Module::eraseNamedMetadata(NamedMDNode *NMD) {
- static_cast<StringMap<NamedMDNode *> *>(NamedMDSymTab)->erase(NMD->getName());
+ NamedMDSymTab.erase(NMD->getName());
NamedMDList.erase(NMD->getIterator());
}
@@ -285,6 +284,20 @@ bool Module::isValidModFlagBehavior(Metadata *MD, ModFlagBehavior &MFB) {
return false;
}
+bool Module::isValidModuleFlag(const MDNode &ModFlag, ModFlagBehavior &MFB,
+ MDString *&Key, Metadata *&Val) {
+ if (ModFlag.getNumOperands() < 3)
+ return false;
+ if (!isValidModFlagBehavior(ModFlag.getOperand(0), MFB))
+ return false;
+ MDString *K = dyn_cast_or_null<MDString>(ModFlag.getOperand(1));
+ if (!K)
+ return false;
+ Key = K;
+ Val = ModFlag.getOperand(2);
+ return true;
+}
+
/// getModuleFlagsMetadata - Returns the module flags in the provided vector.
void Module::
getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
@@ -293,13 +306,11 @@ getModuleFlagsMetadata(SmallVectorImpl<ModuleFlagEntry> &Flags) const {
for (const MDNode *Flag : ModFlags->operands()) {
ModFlagBehavior MFB;
- if (Flag->getNumOperands() >= 3 &&
- isValidModFlagBehavior(Flag->getOperand(0), MFB) &&
- dyn_cast_or_null<MDString>(Flag->getOperand(1))) {
+ MDString *Key = nullptr;
+ Metadata *Val = nullptr;
+ if (isValidModuleFlag(*Flag, MFB, Key, Val)) {
// Check the operands of the MDNode before accessing the operands.
// The verifier will actually catch these failures.
- MDString *Key = cast<MDString>(Flag->getOperand(1));
- Metadata *Val = Flag->getOperand(2);
Flags.push_back(ModuleFlagEntry(MFB, Key, Val));
}
}
@@ -360,6 +371,23 @@ void Module::addModuleFlag(MDNode *Node) {
getOrInsertModuleFlagsMetadata()->addOperand(Node);
}
+void Module::setModuleFlag(ModFlagBehavior Behavior, StringRef Key,
+ Metadata *Val) {
+ NamedMDNode *ModFlags = getOrInsertModuleFlagsMetadata();
+ // Replace the flag if it already exists.
+ for (unsigned I = 0, E = ModFlags->getNumOperands(); I != E; ++I) {
+ MDNode *Flag = ModFlags->getOperand(I);
+ ModFlagBehavior MFB;
+ MDString *K = nullptr;
+ Metadata *V = nullptr;
+ if (isValidModuleFlag(*Flag, MFB, K, V) && K->getString() == Key) {
+ Flag->replaceOperandWith(2, Val);
+ return;
+ }
+ }
+ addModuleFlag(Behavior, Key, Val);
+}
+
void Module::setDataLayout(StringRef Desc) {
DL.reset(Desc);
}
@@ -549,9 +577,9 @@ void Module::setCodeModel(CodeModel::Model CL) {
void Module::setProfileSummary(Metadata *M, ProfileSummary::Kind Kind) {
if (Kind == ProfileSummary::PSK_CSInstr)
- addModuleFlag(ModFlagBehavior::Error, "CSProfileSummary", M);
+ setModuleFlag(ModFlagBehavior::Error, "CSProfileSummary", M);
else
- addModuleFlag(ModFlagBehavior::Error, "ProfileSummary", M);
+ setModuleFlag(ModFlagBehavior::Error, "ProfileSummary", M);
}
Metadata *Module::getProfileSummary(bool IsCS) {
@@ -559,6 +587,27 @@ Metadata *Module::getProfileSummary(bool IsCS) {
: getModuleFlag("ProfileSummary"));
}
+bool Module::getSemanticInterposition() const {
+ Metadata *MF = getModuleFlag("SemanticInterposition");
+
+ auto *Val = cast_or_null<ConstantAsMetadata>(MF);
+ if (!Val)
+ return false;
+
+ return cast<ConstantInt>(Val->getValue())->getZExtValue();
+}
+
+void Module::setSemanticInterposition(bool SI) {
+ addModuleFlag(ModFlagBehavior::Error, "SemanticInterposition", SI);
+}
+
+bool Module::noSemanticInterposition() const {
+ // Conservatively require an explicit zero value for now.
+ Metadata *MF = getModuleFlag("SemanticInterposition");
+ auto *Val = cast_or_null<ConstantAsMetadata>(MF);
+ return Val && cast<ConstantInt>(Val->getValue())->getZExtValue() == 0;
+}
+
void Module::setOwnedMemoryBuffer(std::unique_ptr<MemoryBuffer> MB) {
OwnedMemoryBuffer = std::move(MB);
}
@@ -625,3 +674,23 @@ GlobalVariable *llvm::collectUsedGlobalVariables(
}
return GV;
}
+
+void Module::setPartialSampleProfileRatio(const ModuleSummaryIndex &Index) {
+ if (auto *SummaryMD = getProfileSummary(/*IsCS*/ false)) {
+ std::unique_ptr<ProfileSummary> ProfileSummary(
+ ProfileSummary::getFromMD(SummaryMD));
+ if (ProfileSummary) {
+ if (ProfileSummary->getKind() != ProfileSummary::PSK_Sample ||
+ !ProfileSummary->isPartialProfile())
+ return;
+ uint64_t BlockCount = Index.getBlockCount();
+ uint32_t NumCounts = ProfileSummary->getNumCounts();
+ if (!NumCounts)
+ return;
+ double Ratio = (double)BlockCount / NumCounts;
+ ProfileSummary->setPartialProfileRatio(Ratio);
+ setProfileSummary(ProfileSummary->getMD(getContext()),
+ ProfileSummary::PSK_Sample);
+ }
+ }
+}
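setModuleFlag differs from addModuleFlag in that it overwrites the value of an existing flag instead of appending a second entry, which is what allows setProfileSummary to be called again after setPartialSampleProfileRatio rewrites the summary. A small hedged sketch with a made-up flag name:

#include "llvm/IR/Constants.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// "toy.count" is a hypothetical flag name used only for illustration. Calling
// this twice updates the existing entry instead of appending a conflicting
// one under ModFlagBehavior::Error.
void setToyCountFlag(Module &M, uint64_t Count) {
  M.setModuleFlag(Module::ModFlagBehavior::Error, "toy.count",
                  ConstantAsMetadata::get(ConstantInt::get(
                      Type::getInt64Ty(M.getContext()), Count)));
}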
diff --git a/llvm/lib/IR/ModuleSummaryIndex.cpp b/llvm/lib/IR/ModuleSummaryIndex.cpp
index 180f96269a13..91612eafada7 100644
--- a/llvm/lib/IR/ModuleSummaryIndex.cpp
+++ b/llvm/lib/IR/ModuleSummaryIndex.cpp
@@ -31,6 +31,12 @@ static cl::opt<bool> PropagateAttrs("propagate-attrs", cl::init(true),
cl::Hidden,
cl::desc("Propagate attributes in index"));
+static cl::opt<bool> ImportConstantsWithRefs(
+ "import-constants-with-refs", cl::init(true), cl::Hidden,
+ cl::desc("Import constant global variables with references"));
+
+constexpr uint32_t FunctionSummary::ParamAccess::RangeWidth;
+
FunctionSummary FunctionSummary::ExternalNode =
FunctionSummary::makeDummyFunctionSummary({});
@@ -68,6 +74,52 @@ std::pair<unsigned, unsigned> FunctionSummary::specialRefCounts() const {
constexpr uint64_t ModuleSummaryIndex::BitcodeSummaryVersion;
+uint64_t ModuleSummaryIndex::getFlags() const {
+ uint64_t Flags = 0;
+ if (withGlobalValueDeadStripping())
+ Flags |= 0x1;
+ if (skipModuleByDistributedBackend())
+ Flags |= 0x2;
+ if (hasSyntheticEntryCounts())
+ Flags |= 0x4;
+ if (enableSplitLTOUnit())
+ Flags |= 0x8;
+ if (partiallySplitLTOUnits())
+ Flags |= 0x10;
+ if (withAttributePropagation())
+ Flags |= 0x20;
+ return Flags;
+}
+
+void ModuleSummaryIndex::setFlags(uint64_t Flags) {
+ assert(Flags <= 0x3f && "Unexpected bits in flag");
+ // 1 bit: WithGlobalValueDeadStripping flag.
+ // Set on combined index only.
+ if (Flags & 0x1)
+ setWithGlobalValueDeadStripping();
+ // 1 bit: SkipModuleByDistributedBackend flag.
+ // Set on combined index only.
+ if (Flags & 0x2)
+ setSkipModuleByDistributedBackend();
+ // 1 bit: HasSyntheticEntryCounts flag.
+ // Set on combined index only.
+ if (Flags & 0x4)
+ setHasSyntheticEntryCounts();
+ // 1 bit: DisableSplitLTOUnit flag.
+ // Set on per module indexes. It is up to the client to validate
+ // the consistency of this flag across modules being linked.
+ if (Flags & 0x8)
+ setEnableSplitLTOUnit();
+ // 1 bit: PartiallySplitLTOUnits flag.
+ // Set on combined index only.
+ if (Flags & 0x10)
+ setPartiallySplitLTOUnits();
+ // 1 bit: WithAttributePropagation flag.
+ // Set on combined index only.
+ if (Flags & 0x20)
+ setWithAttributePropagation();
+}
+
// Collect for the given module the list of function it defines
// (GUID -> Summary).
void ModuleSummaryIndex::collectDefinedFunctionsForModule(
@@ -221,7 +273,8 @@ bool ModuleSummaryIndex::canImportGlobalVar(GlobalValueSummary *S,
// c) Link error (external declaration with internal definition).
// However we do not promote objects referenced by writeonly GV
// initializer by means of converting it to 'zeroinitializer'
- return !isReadOnly(GVS) && !isWriteOnly(GVS) && GVS->refs().size();
+ return !(ImportConstantsWithRefs && GVS->isConstant()) &&
+ !isReadOnly(GVS) && !isWriteOnly(GVS) && GVS->refs().size();
};
auto *GVS = cast<GlobalVarSummary>(S->getBaseObject());
@@ -249,7 +302,7 @@ void ModuleSummaryIndex::dumpSCCs(raw_ostream &O) {
if (V.getSummaryList().size())
F = cast<FunctionSummary>(V.getSummaryList().front().get());
O << " " << (F == nullptr ? "External" : "") << " " << utostr(V.getGUID())
- << (I.hasLoop() ? " (has loop)" : "") << "\n";
+ << (I.hasCycle() ? " (has cycle)" : "") << "\n";
}
O << "}\n";
}
@@ -405,6 +458,12 @@ static bool hasWriteOnlyFlag(const GlobalValueSummary *S) {
return false;
}
+static bool hasConstantFlag(const GlobalValueSummary *S) {
+ if (auto *GVS = dyn_cast<GlobalVarSummary>(S))
+ return GVS->isConstant();
+ return false;
+}
+
void ModuleSummaryIndex::exportToDot(
raw_ostream &OS,
const DenseSet<GlobalValue::GUID> &GUIDPreservedSymbols) const {
@@ -482,6 +541,8 @@ void ModuleSummaryIndex::exportToDot(
A.addComment("immutable");
if (Flags.Live && hasWriteOnlyFlag(SummaryIt.second))
A.addComment("writeOnly");
+ if (Flags.Live && hasConstantFlag(SummaryIt.second))
+ A.addComment("constant");
}
if (Flags.DSOLocal)
A.addComment("dsoLocal");
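The new getFlags/setFlags pair packs six index-wide booleans into the low bits of a word; a rough decoding helper following the bit assignments above:

#include <cstdint>
#include <cstdio>

// Decode the summary-index flag word using the bit layout from this patch.
void printIndexFlags(uint64_t Flags) {
  std::printf("WithGlobalValueDeadStripping:   %d\n", int(Flags & 0x1));
  std::printf("SkipModuleByDistributedBackend: %d\n", int((Flags >> 1) & 1));
  std::printf("HasSyntheticEntryCounts:        %d\n", int((Flags >> 2) & 1));
  std::printf("EnableSplitLTOUnit:             %d\n", int((Flags >> 3) & 1));
  std::printf("PartiallySplitLTOUnits:         %d\n", int((Flags >> 4) & 1));
  std::printf("WithAttributePropagation:       %d\n", int((Flags >> 5) & 1));
}

setFlags asserts Flags <= 0x3f, so any bit outside the six defined ones is rejected.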
diff --git a/llvm/lib/IR/Operator.cpp b/llvm/lib/IR/Operator.cpp
index 8ba68674d50e..0f70fc37dee2 100644
--- a/llvm/lib/IR/Operator.cpp
+++ b/llvm/lib/IR/Operator.cpp
@@ -31,33 +31,107 @@ Type *GEPOperator::getResultElementType() const {
return cast<GetElementPtrConstantExpr>(this)->getResultElementType();
}
-bool GEPOperator::accumulateConstantOffset(const DataLayout &DL,
- APInt &Offset) const {
- assert(Offset.getBitWidth() ==
- DL.getIndexSizeInBits(getPointerAddressSpace()) &&
- "The offset bit width does not match DL specification.");
+Align GEPOperator::getMaxPreservedAlignment(const DataLayout &DL) const {
+ /// Compute the worst possible offset for every level of the GEP and
+ /// accumulate the minimum alignment into Result.
+ Align Result = Align(llvm::Value::MaximumAlignment);
for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this);
GTI != GTE; ++GTI) {
+ int64_t Offset = 1;
ConstantInt *OpC = dyn_cast<ConstantInt>(GTI.getOperand());
- if (!OpC)
- return false;
- if (OpC->isZero())
- continue;
- // Handle a struct index, which adds its field offset to the pointer.
if (StructType *STy = GTI.getStructTypeOrNull()) {
- unsigned ElementIdx = OpC->getZExtValue();
const StructLayout *SL = DL.getStructLayout(STy);
- Offset += APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx));
- continue;
+ Offset = SL->getElementOffset(OpC->getZExtValue());
+ } else {
+ assert(GTI.isSequential() && "should be sequential");
+ /// If the index isn't known, we take 1 because it is the index that will
+ /// give the worst alignment of the offset.
+ int64_t ElemCount = 1;
+ if (OpC)
+ ElemCount = OpC->getZExtValue();
+ Offset = DL.getTypeAllocSize(GTI.getIndexedType()) * ElemCount;
}
+ Result = Align(MinAlign(Offset, Result.value()));
+ }
+ return Result;
+}
+bool GEPOperator::accumulateConstantOffset(
+ const DataLayout &DL, APInt &Offset,
+ function_ref<bool(Value &, APInt &)> ExternalAnalysis) const {
+ assert(Offset.getBitWidth() ==
+ DL.getIndexSizeInBits(getPointerAddressSpace()) &&
+ "The offset bit width does not match DL specification.");
+
+ bool UsedExternalAnalysis = false;
+ auto AccumulateOffset = [&](APInt Index, uint64_t Size) -> bool {
+ Index = Index.sextOrTrunc(Offset.getBitWidth());
+ APInt IndexedSize = APInt(Offset.getBitWidth(), Size);
// For array or vector indices, scale the index by the size of the type.
- APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
- Offset += Index * APInt(Offset.getBitWidth(),
- DL.getTypeAllocSize(GTI.getIndexedType()));
+ if (!UsedExternalAnalysis) {
+ Offset += Index * IndexedSize;
+ } else {
+ // External Analysis can return a result higher/lower than the value
+ // represents. We need to detect overflow/underflow.
+ bool Overflow = false;
+ APInt OffsetPlus = Index.smul_ov(IndexedSize, Overflow);
+ if (Overflow)
+ return false;
+ Offset = Offset.sadd_ov(OffsetPlus, Overflow);
+ if (Overflow)
+ return false;
+ }
+ return true;
+ };
+
+ for (gep_type_iterator GTI = gep_type_begin(this), GTE = gep_type_end(this);
+ GTI != GTE; ++GTI) {
+ // Scalable vectors are multiplied by a runtime constant.
+ bool ScalableType = false;
+ if (isa<ScalableVectorType>(GTI.getIndexedType()))
+ ScalableType = true;
+
+ Value *V = GTI.getOperand();
+ StructType *STy = GTI.getStructTypeOrNull();
+ // Handle ConstantInt if possible.
+ if (auto ConstOffset = dyn_cast<ConstantInt>(V)) {
+ if (ConstOffset->isZero())
+ continue;
+ // If the type is scalable and the constant is not zero (vscale * n * 0 =
+ // 0), bail out.
+ if (ScalableType)
+ return false;
+ // Handle a struct index, which adds its field offset to the pointer.
+ if (STy) {
+ unsigned ElementIdx = ConstOffset->getZExtValue();
+ const StructLayout *SL = DL.getStructLayout(STy);
+ // Element offset is in bytes.
+ if (!AccumulateOffset(
+ APInt(Offset.getBitWidth(), SL->getElementOffset(ElementIdx)),
+ 1))
+ return false;
+ continue;
+ }
+ if (!AccumulateOffset(ConstOffset->getValue(),
+ DL.getTypeAllocSize(GTI.getIndexedType())))
+ return false;
+ continue;
+ }
+
+ // The operand is not constant, check if an external analysis was provided.
+ // External analysis is not applicable to a struct type.
+ if (!ExternalAnalysis || STy || ScalableType)
+ return false;
+ APInt AnalysisIndex;
+ if (!ExternalAnalysis(*V, AnalysisIndex))
+ return false;
+ UsedExternalAnalysis = true;
+ if (!AccumulateOffset(AnalysisIndex,
+ DL.getTypeAllocSize(GTI.getIndexedType())))
+ return false;
}
return true;
}
-}
+} // namespace llvm
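A hedged sketch of a caller using the new ExternalAnalysis hook: non-constant GEP indices are resolved through the callback (here a made-up lookup table standing in for a real analysis), and accumulateConstantOffset then switches to the overflow-checked accumulation shown above:

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Operator.h"
#include "llvm/IR/Value.h"

using namespace llvm;

// KnownIndices is a hypothetical table of values whose index is known to the
// caller; any index not in the table makes the whole computation fail.
bool getOffsetWithHints(const GEPOperator &GEP, const DataLayout &DL,
                        const DenseMap<const Value *, APInt> &KnownIndices,
                        APInt &Offset) {
  Offset = APInt(DL.getIndexSizeInBits(GEP.getPointerAddressSpace()), 0);
  return GEP.accumulateConstantOffset(DL, Offset,
                                      [&](Value &V, APInt &Index) {
                                        auto It = KnownIndices.find(&V);
                                        if (It == KnownIndices.end())
                                          return false;
                                        Index = It->second;
                                        return true;
                                      });
}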
diff --git a/llvm/lib/IR/Pass.cpp b/llvm/lib/IR/Pass.cpp
index dbdbbf4cf35e..a815da2bdc51 100644
--- a/llvm/lib/IR/Pass.cpp
+++ b/llvm/lib/IR/Pass.cpp
@@ -14,8 +14,6 @@
#include "llvm/Pass.h"
#include "llvm/Config/llvm-config.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/LLVMContext.h"
@@ -24,7 +22,6 @@
#include "llvm/IR/OptBisect.h"
#include "llvm/PassInfo.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/IR/PassManager.cpp b/llvm/lib/IR/PassManager.cpp
index cde9b873795e..624827ff8cd9 100644
--- a/llvm/lib/IR/PassManager.cpp
+++ b/llvm/lib/IR/PassManager.cpp
@@ -9,6 +9,7 @@
#include "llvm/IR/PassManager.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PassManagerImpl.h"
using namespace llvm;
@@ -88,7 +89,7 @@ bool FunctionAnalysisManagerModuleProxy::Result::invalidate(
// Return false to indicate that this result is still a valid proxy.
return false;
}
-}
+} // namespace llvm
AnalysisSetKey CFGAnalyses::SetKey;
diff --git a/llvm/lib/IR/PassRegistry.cpp b/llvm/lib/IR/PassRegistry.cpp
index 92c188b11898..0572c4fe5237 100644
--- a/llvm/lib/IR/PassRegistry.cpp
+++ b/llvm/lib/IR/PassRegistry.cpp
@@ -13,8 +13,8 @@
#include "llvm/PassRegistry.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/Pass.h"
#include "llvm/PassInfo.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/ManagedStatic.h"
#include <cassert>
#include <memory>
diff --git a/llvm/lib/IR/PassTimingInfo.cpp b/llvm/lib/IR/PassTimingInfo.cpp
index 9cc44ea05fee..25275e5733ac 100644
--- a/llvm/lib/IR/PassTimingInfo.cpp
+++ b/llvm/lib/IR/PassTimingInfo.cpp
@@ -168,17 +168,18 @@ void reportAndResetTimings(raw_ostream *OutStream) {
/// Returns the timer for the specified pass invocation of \p PassID.
/// Each time it creates a new timer.
Timer &TimePassesHandler::getPassTimer(StringRef PassID) {
- // Bump counts for each request of the timer.
- unsigned Count = nextPassID(PassID);
+ // Take a vector of Timers created for this \p PassID and append
+ // one more timer to it.
+ TimerVector &Timers = TimingData[PassID];
+ unsigned Count = Timers.size() + 1;
- // Unconditionally appending description with a pass-invocation number.
std::string FullDesc = formatv("{0} #{1}", PassID, Count).str();
- PassInvocationID UID{PassID, Count};
Timer *T = new Timer(PassID, FullDesc, TG);
- auto Pair = TimingData.try_emplace(UID, T);
- assert(Pair.second && "should always create a new timer");
- return *(Pair.first->second.get());
+ Timers.emplace_back(T);
+ assert(Count == Timers.size() && "sanity check");
+
+ return *T;
}
TimePassesHandler::TimePassesHandler(bool Enabled)
@@ -198,17 +199,23 @@ LLVM_DUMP_METHOD void TimePassesHandler::dump() const {
dbgs() << "Dumping timers for " << getTypeName<TimePassesHandler>()
<< ":\n\tRunning:\n";
for (auto &I : TimingData) {
- const Timer *MyTimer = I.second.get();
- if (!MyTimer || MyTimer->isRunning())
- dbgs() << "\tTimer " << MyTimer << " for pass " << I.first.first << "("
- << I.first.second << ")\n";
+ StringRef PassID = I.getKey();
+ const TimerVector& MyTimers = I.getValue();
+ for (unsigned idx = 0; idx < MyTimers.size(); idx++) {
+ const Timer* MyTimer = MyTimers[idx].get();
+ if (MyTimer && MyTimer->isRunning())
+ dbgs() << "\tTimer " << MyTimer << " for pass " << PassID << "(" << idx << ")\n";
+ }
}
dbgs() << "\tTriggered:\n";
for (auto &I : TimingData) {
- const Timer *MyTimer = I.second.get();
- if (!MyTimer || (MyTimer->hasTriggered() && !MyTimer->isRunning()))
- dbgs() << "\tTimer " << MyTimer << " for pass " << I.first.first << "("
- << I.first.second << ")\n";
+ StringRef PassID = I.getKey();
+ const TimerVector& MyTimers = I.getValue();
+ for (unsigned idx = 0; idx < MyTimers.size(); idx++) {
+ const Timer* MyTimer = MyTimers[idx].get();
+ if (MyTimer && MyTimer->hasTriggered() && !MyTimer->isRunning())
+ dbgs() << "\tTimer " << MyTimer << " for pass " << PassID << "(" << idx << ")\n";
+ }
}
}
diff --git a/llvm/lib/IR/ProfileSummary.cpp b/llvm/lib/IR/ProfileSummary.cpp
index 11d95ac19be6..ac6bcd9fe3af 100644
--- a/llvm/lib/IR/ProfileSummary.cpp
+++ b/llvm/lib/IR/ProfileSummary.cpp
@@ -18,6 +18,7 @@
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Format.h"
using namespace llvm;
@@ -31,6 +32,14 @@ static Metadata *getKeyValMD(LLVMContext &Context, const char *Key,
return MDTuple::get(Context, Ops);
}
+static Metadata *getKeyFPValMD(LLVMContext &Context, const char *Key,
+ double Val) {
+ Type *DoubleTy = Type::getDoubleTy(Context);
+ Metadata *Ops[2] = {MDString::get(Context, Key),
+ ConstantAsMetadata::get(ConstantFP::get(DoubleTy, Val))};
+ return MDTuple::get(Context, Ops);
+}
+
// Return an MDTuple with two elements. The first element is a string Key and
// the second is a string Value.
static Metadata *getKeyValMD(LLVMContext &Context, const char *Key,
@@ -65,35 +74,63 @@ Metadata *ProfileSummary::getDetailedSummaryMD(LLVMContext &Context) {
// "ProfileFormat" and a string representing the format ("InstrProf" or
// "SampleProfile"). The rest of the elements of the outer MDTuple are specific
// to the kind of profile summary as returned by getFormatSpecificMD.
-Metadata *ProfileSummary::getMD(LLVMContext &Context) {
+// IsPartialProfile is an optional field and \p AddPartialField will decide
+// whether to add a field for it.
+// PartialProfileRatio is an optional field and \p AddPartialProfileRatioField
+// will decide whether to add a field for it.
+Metadata *ProfileSummary::getMD(LLVMContext &Context, bool AddPartialField,
+ bool AddPartialProfileRatioField) {
const char *KindStr[3] = {"InstrProf", "CSInstrProf", "SampleProfile"};
- Metadata *Components[] = {
- getKeyValMD(Context, "ProfileFormat", KindStr[PSK]),
- getKeyValMD(Context, "TotalCount", getTotalCount()),
- getKeyValMD(Context, "MaxCount", getMaxCount()),
- getKeyValMD(Context, "MaxInternalCount", getMaxInternalCount()),
- getKeyValMD(Context, "MaxFunctionCount", getMaxFunctionCount()),
- getKeyValMD(Context, "NumCounts", getNumCounts()),
- getKeyValMD(Context, "NumFunctions", getNumFunctions()),
- getDetailedSummaryMD(Context),
- };
+ SmallVector<Metadata *, 16> Components;
+ Components.push_back(getKeyValMD(Context, "ProfileFormat", KindStr[PSK]));
+ Components.push_back(getKeyValMD(Context, "TotalCount", getTotalCount()));
+ Components.push_back(getKeyValMD(Context, "MaxCount", getMaxCount()));
+ Components.push_back(
+ getKeyValMD(Context, "MaxInternalCount", getMaxInternalCount()));
+ Components.push_back(
+ getKeyValMD(Context, "MaxFunctionCount", getMaxFunctionCount()));
+ Components.push_back(getKeyValMD(Context, "NumCounts", getNumCounts()));
+ Components.push_back(getKeyValMD(Context, "NumFunctions", getNumFunctions()));
+ if (AddPartialField)
+ Components.push_back(
+ getKeyValMD(Context, "IsPartialProfile", isPartialProfile()));
+ if (AddPartialProfileRatioField)
+ Components.push_back(getKeyFPValMD(Context, "PartialProfileRatio",
+ getPartialProfileRatio()));
+ Components.push_back(getDetailedSummaryMD(Context));
return MDTuple::get(Context, Components);
}
-// Parse an MDTuple representing (Key, Val) pair.
-static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val) {
+// Get the value metadata for the input MD/Key.
+static ConstantAsMetadata *getValMD(MDTuple *MD, const char *Key) {
if (!MD)
- return false;
+ return nullptr;
if (MD->getNumOperands() != 2)
- return false;
+ return nullptr;
MDString *KeyMD = dyn_cast<MDString>(MD->getOperand(0));
ConstantAsMetadata *ValMD = dyn_cast<ConstantAsMetadata>(MD->getOperand(1));
if (!KeyMD || !ValMD)
- return false;
+ return nullptr;
if (!KeyMD->getString().equals(Key))
- return false;
- Val = cast<ConstantInt>(ValMD->getValue())->getZExtValue();
- return true;
+ return nullptr;
+ return ValMD;
+}
+
+// Parse an MDTuple representing (Key, Val) pair.
+static bool getVal(MDTuple *MD, const char *Key, uint64_t &Val) {
+ if (auto *ValMD = getValMD(MD, Key)) {
+ Val = cast<ConstantInt>(ValMD->getValue())->getZExtValue();
+ return true;
+ }
+ return false;
+}
+
+static bool getVal(MDTuple *MD, const char *Key, double &Val) {
+ if (auto *ValMD = getValMD(MD, Key)) {
+ Val = cast<ConstantFP>(ValMD->getValue())->getValueAPF().convertToDouble();
+ return true;
+ }
+ return false;
}
// Check if an MDTuple represents a (Key, Val) pair.
@@ -139,12 +176,29 @@ static bool getSummaryFromMD(MDTuple *MD, SummaryEntryVector &Summary) {
return true;
}
+// Get the value of an optional field. Increment 'Idx' if it was present.
+// Return true if we can move on to the next field.
+template <typename ValueType>
+static bool getOptionalVal(MDTuple *Tuple, unsigned &Idx, const char *Key,
+ ValueType &Value) {
+ if (getVal(dyn_cast<MDTuple>(Tuple->getOperand(Idx)), Key, Value)) {
+ Idx++;
+ // Make sure that, when the key is present, we don't step past the end of
+ // the Tuple operand array. Since the (non-optional) DetailedSummary always
+ // comes last, the next entry in the tuple operand array must exist.
+ return Idx < Tuple->getNumOperands();
+ }
+ // It was absent, keep going.
+ return true;
+}
+
ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) {
MDTuple *Tuple = dyn_cast_or_null<MDTuple>(MD);
- if (!Tuple || Tuple->getNumOperands() != 8)
+ if (!Tuple || Tuple->getNumOperands() < 8 || Tuple->getNumOperands() > 10)
return nullptr;
- auto &FormatMD = Tuple->getOperand(0);
+ unsigned I = 0;
+ auto &FormatMD = Tuple->getOperand(I++);
ProfileSummary::Kind SummaryKind;
if (isKeyValuePair(dyn_cast_or_null<MDTuple>(FormatMD), "ProfileFormat",
"SampleProfile"))
@@ -160,27 +214,55 @@ ProfileSummary *ProfileSummary::getFromMD(Metadata *MD) {
uint64_t NumCounts, TotalCount, NumFunctions, MaxFunctionCount, MaxCount,
MaxInternalCount;
- if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(1)), "TotalCount",
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(I++)), "TotalCount",
TotalCount))
return nullptr;
- if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(2)), "MaxCount", MaxCount))
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(I++)), "MaxCount", MaxCount))
return nullptr;
- if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(3)), "MaxInternalCount",
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(I++)), "MaxInternalCount",
MaxInternalCount))
return nullptr;
- if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(4)), "MaxFunctionCount",
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(I++)), "MaxFunctionCount",
MaxFunctionCount))
return nullptr;
- if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(5)), "NumCounts", NumCounts))
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(I++)), "NumCounts",
+ NumCounts))
return nullptr;
- if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(6)), "NumFunctions",
+ if (!getVal(dyn_cast<MDTuple>(Tuple->getOperand(I++)), "NumFunctions",
NumFunctions))
return nullptr;
+ // Optional fields; initialize them since they may be absent from the MD.
+ uint64_t IsPartialProfile = 0;
+ if (!getOptionalVal(Tuple, I, "IsPartialProfile", IsPartialProfile))
+ return nullptr;
+ double PartialProfileRatio = 0;
+ if (!getOptionalVal(Tuple, I, "PartialProfileRatio", PartialProfileRatio))
+ return nullptr;
+
SummaryEntryVector Summary;
- if (!getSummaryFromMD(dyn_cast<MDTuple>(Tuple->getOperand(7)), Summary))
+ if (!getSummaryFromMD(dyn_cast<MDTuple>(Tuple->getOperand(I++)), Summary))
return nullptr;
return new ProfileSummary(SummaryKind, std::move(Summary), TotalCount,
MaxCount, MaxInternalCount, MaxFunctionCount,
- NumCounts, NumFunctions);
+ NumCounts, NumFunctions, IsPartialProfile,
+ PartialProfileRatio);
+}
+
+void ProfileSummary::printSummary(raw_ostream &OS) {
+ OS << "Total functions: " << NumFunctions << "\n";
+ OS << "Maximum function count: " << MaxFunctionCount << "\n";
+ OS << "Maximum block count: " << MaxCount << "\n";
+ OS << "Total number of blocks: " << NumCounts << "\n";
+ OS << "Total count: " << TotalCount << "\n";
+}
+
+void ProfileSummary::printDetailedSummary(raw_ostream &OS) {
+ OS << "Detailed summary:\n";
+ for (auto Entry : DetailedSummary) {
+ OS << Entry.NumCounts << " blocks with count >= " << Entry.MinCount
+ << " account for "
+ << format("%0.6g", (float)Entry.Cutoff / Scale * 100)
+ << " percentage of the total counts.\n";
+ }
}
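
(Not part of the patch: a minimal sketch of how the two new optional summary fields might round-trip through metadata, assuming the getMD/getFromMD signatures shown above and the ten-argument ProfileSummary constructor that getFromMD itself calls.)

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/ProfileSummary.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;

int main() {
  LLVMContext Ctx;
  // An empty detailed summary keeps the example short; real summaries carry
  // cutoff/count entries here.
  SummaryEntryVector Detailed;
  ProfileSummary PS(ProfileSummary::PSK_Instr, std::move(Detailed),
                    /*TotalCount=*/1000, /*MaxCount=*/100,
                    /*MaxInternalCount=*/50, /*MaxFunctionCount=*/100,
                    /*NumCounts=*/10, /*NumFunctions=*/2,
                    /*Partial=*/true, /*PartialProfileRatio=*/0.5);
  // Emit both optional fields explicitly.
  Metadata *MD = PS.getMD(Ctx, /*AddPartialField=*/true,
                          /*AddPartialProfileRatioField=*/true);
  // Parse it back; getFromMD accepts 8 to 10 operands, so older summary
  // metadata without the optional fields still parses.
  if (ProfileSummary *Parsed = ProfileSummary::getFromMD(MD)) {
    Parsed->printSummary(errs());
    delete Parsed;
  }
  return 0;
}

Because getOptionalVal treats a missing key as success, readers of old 8-operand summaries and new 9/10-operand summaries share the same parsing path.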
diff --git a/llvm/lib/IR/SafepointIRVerifier.cpp b/llvm/lib/IR/SafepointIRVerifier.cpp
index f9578394a827..6bf7caa50a12 100644
--- a/llvm/lib/IR/SafepointIRVerifier.cpp
+++ b/llvm/lib/IR/SafepointIRVerifier.cpp
@@ -45,6 +45,7 @@
#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
@@ -207,7 +208,7 @@ PreservedAnalyses SafepointIRVerifierPass::run(Function &F,
Verify(F, DT, CD);
return PreservedAnalyses::all();
}
-}
+} // namespace llvm
namespace {
@@ -782,7 +783,7 @@ void GCPtrTracker::transferBlock(const BasicBlock *BB, BasicBlockState &BBS,
void GCPtrTracker::transferInstruction(const Instruction &I, bool &Cleared,
AvailableValueSet &Available) {
- if (isStatepoint(I)) {
+ if (isa<GCStatepointInst>(I)) {
Cleared = true;
Available.clear();
} else if (containsGCPtrType(I.getType()))
diff --git a/llvm/lib/IR/Statepoint.cpp b/llvm/lib/IR/Statepoint.cpp
index fce89b42e9bf..bbfbbe489bae 100644
--- a/llvm/lib/IR/Statepoint.cpp
+++ b/llvm/lib/IR/Statepoint.cpp
@@ -17,40 +17,6 @@
using namespace llvm;
-bool llvm::isStatepoint(const CallBase *Call) {
- if (auto *F = Call->getCalledFunction())
- return F->getIntrinsicID() == Intrinsic::experimental_gc_statepoint;
- return false;
-}
-
-bool llvm::isStatepoint(const Value *V) {
- if (auto *Call = dyn_cast<CallBase>(V))
- return isStatepoint(Call);
- return false;
-}
-
-bool llvm::isStatepoint(const Value &V) {
- return isStatepoint(&V);
-}
-
-bool llvm::isGCRelocate(const CallBase *Call) {
- return isa<GCRelocateInst>(Call);
-}
-
-bool llvm::isGCRelocate(const Value *V) {
- if (auto *Call = dyn_cast<CallBase>(V))
- return isGCRelocate(Call);
- return false;
-}
-
-bool llvm::isGCResult(const CallBase *Call) { return isa<GCResultInst>(Call); }
-
-bool llvm::isGCResult(const Value *V) {
- if (auto *Call = dyn_cast<CallBase>(V))
- return isGCResult(Call);
- return false;
-}
-
bool llvm::isStatepointDirectiveAttr(Attribute Attr) {
return Attr.hasAttribute("statepoint-id") ||
Attr.hasAttribute("statepoint-num-patch-bytes");
diff --git a/llvm/lib/IR/SymbolTableListTraitsImpl.h b/llvm/lib/IR/SymbolTableListTraitsImpl.h
index f399c823d6fb..4283744bd058 100644
--- a/llvm/lib/IR/SymbolTableListTraitsImpl.h
+++ b/llvm/lib/IR/SymbolTableListTraitsImpl.h
@@ -20,6 +20,11 @@
namespace llvm {
+/// Notify basic blocks when an instruction is inserted.
+template <typename ParentClass>
+inline void invalidateParentIListOrdering(ParentClass *Parent) {}
+template <> void invalidateParentIListOrdering(BasicBlock *BB);
+
/// setSymTabObject - This is called when (f.e.) the parent of a basic block
/// changes. This requires us to remove all the instruction symtab entries from
/// the current function and reinsert them into the new function.
@@ -64,6 +69,7 @@ void SymbolTableListTraits<ValueSubClass>::addNodeToList(ValueSubClass *V) {
assert(!V->getParent() && "Value already in a container!!");
ItemParentClass *Owner = getListOwner();
V->setParent(Owner);
+ invalidateParentIListOrdering(Owner);
if (V->hasName())
if (ValueSymbolTable *ST = getSymTab(Owner))
ST->reinsertValue(V);
@@ -81,8 +87,13 @@ void SymbolTableListTraits<ValueSubClass>::removeNodeFromList(
template <typename ValueSubClass>
void SymbolTableListTraits<ValueSubClass>::transferNodesFromList(
SymbolTableListTraits &L2, iterator first, iterator last) {
- // We only have to do work here if transferring instructions between BBs
- ItemParentClass *NewIP = getListOwner(), *OldIP = L2.getListOwner();
+ // Transferring nodes, even within the same BB, invalidates the ordering. The
+ // list that we removed the nodes from still has a valid ordering.
+ ItemParentClass *NewIP = getListOwner();
+ invalidateParentIListOrdering(NewIP);
+
+ // Nothing else needs to be done if we're reordering nodes within the same list.
+ ItemParentClass *OldIP = L2.getListOwner();
if (NewIP == OldIP)
return;
diff --git a/llvm/lib/IR/Type.cpp b/llvm/lib/IR/Type.cpp
index 3eab5042b542..d869a6e07cca 100644
--- a/llvm/lib/IR/Type.cpp
+++ b/llvm/lib/IR/Type.cpp
@@ -40,6 +40,7 @@ Type *Type::getPrimitiveType(LLVMContext &C, TypeID IDNumber) {
switch (IDNumber) {
case VoidTyID : return getVoidTy(C);
case HalfTyID : return getHalfTy(C);
+ case BFloatTyID : return getBFloatTy(C);
case FloatTyID : return getFloatTy(C);
case DoubleTyID : return getDoubleTy(C);
case X86_FP80TyID : return getX86_FP80Ty(C);
@@ -68,20 +69,17 @@ bool Type::canLosslesslyBitCastTo(Type *Ty) const {
return false;
// Vector -> Vector conversions are always lossless if the two vector types
- // have the same size, otherwise not. Also, 64-bit vector types can be
- // converted to x86mmx.
- if (auto *thisPTy = dyn_cast<VectorType>(this)) {
- if (auto *thatPTy = dyn_cast<VectorType>(Ty))
- return thisPTy->getBitWidth() == thatPTy->getBitWidth();
- if (Ty->getTypeID() == Type::X86_MMXTyID &&
- thisPTy->getBitWidth() == 64)
- return true;
- }
+ // have the same size, otherwise not.
+ if (isa<VectorType>(this) && isa<VectorType>(Ty))
+ return getPrimitiveSizeInBits() == Ty->getPrimitiveSizeInBits();
- if (this->getTypeID() == Type::X86_MMXTyID)
- if (auto *thatPTy = dyn_cast<VectorType>(Ty))
- if (thatPTy->getBitWidth() == 64)
- return true;
+ // 64-bit fixed width vector types can be losslessly converted to x86mmx.
+ if (((isa<FixedVectorType>(this)) && Ty->isX86_MMXTy()) &&
+ getPrimitiveSizeInBits().getFixedSize() == 64)
+ return true;
+ if ((isX86_MMXTy() && isa<FixedVectorType>(Ty)) &&
+ Ty->getPrimitiveSizeInBits().getFixedSize() == 64)
+ return true;
// At this point we have only various mismatches of the first class types
// remaining and ptr->ptr. Just select the lossless conversions. Everything
@@ -115,6 +113,7 @@ bool Type::isEmptyTy() const {
TypeSize Type::getPrimitiveSizeInBits() const {
switch (getTypeID()) {
case Type::HalfTyID: return TypeSize::Fixed(16);
+ case Type::BFloatTyID: return TypeSize::Fixed(16);
case Type::FloatTyID: return TypeSize::Fixed(32);
case Type::DoubleTyID: return TypeSize::Fixed(64);
case Type::X86_FP80TyID: return TypeSize::Fixed(80);
@@ -123,16 +122,21 @@ TypeSize Type::getPrimitiveSizeInBits() const {
case Type::X86_MMXTyID: return TypeSize::Fixed(64);
case Type::IntegerTyID:
return TypeSize::Fixed(cast<IntegerType>(this)->getBitWidth());
- case Type::VectorTyID: {
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
const VectorType *VTy = cast<VectorType>(this);
- return TypeSize(VTy->getBitWidth(), VTy->isScalable());
+ ElementCount EC = VTy->getElementCount();
+ TypeSize ETS = VTy->getElementType()->getPrimitiveSizeInBits();
+ assert(!ETS.isScalable() && "Vector type should have fixed-width elements");
+ return {ETS.getFixedSize() * EC.Min, EC.Scalable};
}
default: return TypeSize::Fixed(0);
}
}
unsigned Type::getScalarSizeInBits() const {
- return getScalarType()->getPrimitiveSizeInBits();
+ // It is safe to assume that the scalar types have a fixed size.
+ return getScalarType()->getPrimitiveSizeInBits().getFixedSize();
}
int Type::getFPMantissaWidth() const {
@@ -140,6 +144,7 @@ int Type::getFPMantissaWidth() const {
return VTy->getElementType()->getFPMantissaWidth();
assert(isFloatingPointTy() && "Not a floating point type!");
if (getTypeID() == HalfTyID) return 11;
+ if (getTypeID() == BFloatTyID) return 8;
if (getTypeID() == FloatTyID) return 24;
if (getTypeID() == DoubleTyID) return 53;
if (getTypeID() == X86_FP80TyID) return 64;
@@ -165,6 +170,7 @@ bool Type::isSizedDerivedType(SmallPtrSetImpl<Type*> *Visited) const {
Type *Type::getVoidTy(LLVMContext &C) { return &C.pImpl->VoidTy; }
Type *Type::getLabelTy(LLVMContext &C) { return &C.pImpl->LabelTy; }
Type *Type::getHalfTy(LLVMContext &C) { return &C.pImpl->HalfTy; }
+Type *Type::getBFloatTy(LLVMContext &C) { return &C.pImpl->BFloatTy; }
Type *Type::getFloatTy(LLVMContext &C) { return &C.pImpl->FloatTy; }
Type *Type::getDoubleTy(LLVMContext &C) { return &C.pImpl->DoubleTy; }
Type *Type::getMetadataTy(LLVMContext &C) { return &C.pImpl->MetadataTy; }
@@ -189,6 +195,10 @@ PointerType *Type::getHalfPtrTy(LLVMContext &C, unsigned AS) {
return getHalfTy(C)->getPointerTo(AS);
}
+PointerType *Type::getBFloatPtrTy(LLVMContext &C, unsigned AS) {
+ return getBFloatTy(C)->getPointerTo(AS);
+}
+
PointerType *Type::getFloatPtrTy(LLVMContext &C, unsigned AS) {
return getFloatTy(C)->getPointerTo(AS);
}
@@ -509,11 +519,9 @@ StringRef StructType::getName() const {
}
bool StructType::isValidElementType(Type *ElemTy) {
- if (auto *VTy = dyn_cast<VectorType>(ElemTy))
- return !VTy->isScalable();
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
- !ElemTy->isTokenTy();
+ !ElemTy->isTokenTy() && !isa<ScalableVectorType>(ElemTy);
}
bool StructType::isLayoutIdentical(StructType *Other) const {
@@ -529,52 +537,24 @@ StructType *Module::getTypeByName(StringRef Name) const {
return getContext().pImpl->NamedStructTypes.lookup(Name);
}
-//===----------------------------------------------------------------------===//
-// CompositeType Implementation
-//===----------------------------------------------------------------------===//
-
-Type *CompositeType::getTypeAtIndex(const Value *V) const {
- if (auto *STy = dyn_cast<StructType>(this)) {
- unsigned Idx =
- (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue();
- assert(indexValid(Idx) && "Invalid structure index!");
- return STy->getElementType(Idx);
- }
-
- return cast<SequentialType>(this)->getElementType();
+Type *StructType::getTypeAtIndex(const Value *V) const {
+ unsigned Idx = (unsigned)cast<Constant>(V)->getUniqueInteger().getZExtValue();
+ assert(indexValid(Idx) && "Invalid structure index!");
+ return getElementType(Idx);
}
-Type *CompositeType::getTypeAtIndex(unsigned Idx) const{
- if (auto *STy = dyn_cast<StructType>(this)) {
- assert(indexValid(Idx) && "Invalid structure index!");
- return STy->getElementType(Idx);
- }
-
- return cast<SequentialType>(this)->getElementType();
-}
-
-bool CompositeType::indexValid(const Value *V) const {
- if (auto *STy = dyn_cast<StructType>(this)) {
- // Structure indexes require (vectors of) 32-bit integer constants. In the
- // vector case all of the indices must be equal.
- if (!V->getType()->isIntOrIntVectorTy(32))
- return false;
- const Constant *C = dyn_cast<Constant>(V);
- if (C && V->getType()->isVectorTy())
- C = C->getSplatValue();
- const ConstantInt *CU = dyn_cast_or_null<ConstantInt>(C);
- return CU && CU->getZExtValue() < STy->getNumElements();
- }
-
- // Sequential types can be indexed by any integer.
- return V->getType()->isIntOrIntVectorTy();
-}
-
-bool CompositeType::indexValid(unsigned Idx) const {
- if (auto *STy = dyn_cast<StructType>(this))
- return Idx < STy->getNumElements();
- // Sequential types can be indexed by any integer.
- return true;
+bool StructType::indexValid(const Value *V) const {
+ // Structure indexes require (vectors of) 32-bit integer constants. In the
+ // vector case all of the indices must be equal.
+ if (!V->getType()->isIntOrIntVectorTy(32))
+ return false;
+ if (isa<ScalableVectorType>(V->getType()))
+ return false;
+ const Constant *C = dyn_cast<Constant>(V);
+ if (C && V->getType()->isVectorTy())
+ C = C->getSplatValue();
+ const ConstantInt *CU = dyn_cast_or_null<ConstantInt>(C);
+ return CU && CU->getZExtValue() < getNumElements();
}
//===----------------------------------------------------------------------===//
@@ -582,7 +562,11 @@ bool CompositeType::indexValid(unsigned Idx) const {
//===----------------------------------------------------------------------===//
ArrayType::ArrayType(Type *ElType, uint64_t NumEl)
- : SequentialType(ArrayTyID, ElType, NumEl) {}
+ : Type(ElType->getContext(), ArrayTyID), ContainedType(ElType),
+ NumElements(NumEl) {
+ ContainedTys = &ContainedType;
+ NumContainedTys = 1;
+}
ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) {
assert(isValidElementType(ElementType) && "Invalid type for array element!");
@@ -597,37 +581,75 @@ ArrayType *ArrayType::get(Type *ElementType, uint64_t NumElements) {
}
bool ArrayType::isValidElementType(Type *ElemTy) {
- if (auto *VTy = dyn_cast<VectorType>(ElemTy))
- return !VTy->isScalable();
return !ElemTy->isVoidTy() && !ElemTy->isLabelTy() &&
!ElemTy->isMetadataTy() && !ElemTy->isFunctionTy() &&
- !ElemTy->isTokenTy();
+ !ElemTy->isTokenTy() && !isa<ScalableVectorType>(ElemTy);
}
//===----------------------------------------------------------------------===//
// VectorType Implementation
//===----------------------------------------------------------------------===//
-VectorType::VectorType(Type *ElType, ElementCount EC)
- : SequentialType(VectorTyID, ElType, EC.Min), Scalable(EC.Scalable) {}
+VectorType::VectorType(Type *ElType, unsigned EQ, Type::TypeID TID)
+ : Type(ElType->getContext(), TID), ContainedType(ElType),
+ ElementQuantity(EQ) {
+ ContainedTys = &ContainedType;
+ NumContainedTys = 1;
+}
VectorType *VectorType::get(Type *ElementType, ElementCount EC) {
- assert(EC.Min > 0 && "#Elements of a VectorType must be greater than 0");
+ if (EC.Scalable)
+ return ScalableVectorType::get(ElementType, EC.Min);
+ else
+ return FixedVectorType::get(ElementType, EC.Min);
+}
+
+bool VectorType::isValidElementType(Type *ElemTy) {
+ return ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
+ ElemTy->isPointerTy();
+}
+
+//===----------------------------------------------------------------------===//
+// FixedVectorType Implementation
+//===----------------------------------------------------------------------===//
+
+FixedVectorType *FixedVectorType::get(Type *ElementType, unsigned NumElts) {
+ assert(NumElts > 0 && "#Elements of a VectorType must be greater than 0");
assert(isValidElementType(ElementType) && "Element type of a VectorType must "
"be an integer, floating point, or "
"pointer type.");
+ ElementCount EC(NumElts, false);
+
LLVMContextImpl *pImpl = ElementType->getContext().pImpl;
- VectorType *&Entry = ElementType->getContext().pImpl
- ->VectorTypes[std::make_pair(ElementType, EC)];
+ VectorType *&Entry = ElementType->getContext()
+ .pImpl->VectorTypes[std::make_pair(ElementType, EC)];
+
if (!Entry)
- Entry = new (pImpl->Alloc) VectorType(ElementType, EC);
- return Entry;
+ Entry = new (pImpl->Alloc) FixedVectorType(ElementType, NumElts);
+ return cast<FixedVectorType>(Entry);
}
-bool VectorType::isValidElementType(Type *ElemTy) {
- return ElemTy->isIntegerTy() || ElemTy->isFloatingPointTy() ||
- ElemTy->isPointerTy();
+//===----------------------------------------------------------------------===//
+// ScalableVectorType Implementation
+//===----------------------------------------------------------------------===//
+
+ScalableVectorType *ScalableVectorType::get(Type *ElementType,
+ unsigned MinNumElts) {
+ assert(MinNumElts > 0 && "#Elements of a VectorType must be greater than 0");
+ assert(isValidElementType(ElementType) && "Element type of a VectorType must "
+ "be an integer, floating point, or "
+ "pointer type.");
+
+ ElementCount EC(MinNumElts, true);
+
+ LLVMContextImpl *pImpl = ElementType->getContext().pImpl;
+ VectorType *&Entry = ElementType->getContext()
+ .pImpl->VectorTypes[std::make_pair(ElementType, EC)];
+
+ if (!Entry)
+ Entry = new (pImpl->Alloc) ScalableVectorType(ElementType, MinNumElts);
+ return cast<ScalableVectorType>(Entry);
}
//===----------------------------------------------------------------------===//
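
(Not part of the patch: a short sketch of the resulting vector-type API, assuming the FixedVectorType/ScalableVectorType factories added above and the two-argument ElementCount constructor this hunk already uses.)

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/Support/TypeSize.h"
using namespace llvm;

void makeVectorTypes(LLVMContext &Ctx) {
  Type *F32 = Type::getFloatTy(Ctx);

  // <4 x float>: a concrete element count.
  FixedVectorType *Fixed = FixedVectorType::get(F32, 4);

  // <vscale x 4 x float>: a minimum element count, scaled at run time.
  ScalableVectorType *Scalable = ScalableVectorType::get(F32, 4);

  // The generic factory now just dispatches on ElementCount::Scalable.
  VectorType *Dispatched =
      VectorType::get(F32, ElementCount(4, /*Scalable=*/true));
  (void)Fixed;
  (void)Scalable;
  (void)Dispatched;
}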
diff --git a/llvm/lib/IR/Use.cpp b/llvm/lib/IR/Use.cpp
index 18c61757ee84..dc0716b85372 100644
--- a/llvm/lib/IR/Use.cpp
+++ b/llvm/lib/IR/Use.cpp
@@ -37,52 +37,10 @@ void Use::swap(Use &RHS) {
}
}
-User *Use::getUser() const {
- const Use *End = getImpliedUser();
- const UserRef *ref = reinterpret_cast<const UserRef *>(End);
- return ref->getInt() ? ref->getPointer()
- : reinterpret_cast<User *>(const_cast<Use *>(End));
-}
-
unsigned Use::getOperandNo() const {
return this - getUser()->op_begin();
}
-// Sets up the waymarking algorithm's tags for a series of Uses. See the
-// algorithm details here:
-//
-// http://www.llvm.org/docs/ProgrammersManual.html#the-waymarking-algorithm
-//
-Use *Use::initTags(Use *const Start, Use *Stop) {
- ptrdiff_t Done = 0;
- while (Done < 20) {
- if (Start == Stop--)
- return Start;
- static const PrevPtrTag tags[20] = {
- fullStopTag, oneDigitTag, stopTag, oneDigitTag, oneDigitTag,
- stopTag, zeroDigitTag, oneDigitTag, oneDigitTag, stopTag,
- zeroDigitTag, oneDigitTag, zeroDigitTag, oneDigitTag, stopTag,
- oneDigitTag, oneDigitTag, oneDigitTag, oneDigitTag, stopTag};
- new (Stop) Use(tags[Done++]);
- }
-
- ptrdiff_t Count = Done;
- while (Start != Stop) {
- --Stop;
- if (!Count) {
- new (Stop) Use(stopTag);
- ++Done;
- Count = Done;
- } else {
- new (Stop) Use(PrevPtrTag(Count & 1));
- Count >>= 1;
- ++Done;
- }
- }
-
- return Start;
-}
-
void Use::zap(Use *Start, const Use *Stop, bool del) {
while (Start != Stop)
(--Stop)->~Use();
@@ -90,37 +48,4 @@ void Use::zap(Use *Start, const Use *Stop, bool del) {
::operator delete(Start);
}
-const Use *Use::getImpliedUser() const {
- const Use *Current = this;
-
- while (true) {
- unsigned Tag = (Current++)->Prev.getInt();
- switch (Tag) {
- case zeroDigitTag:
- case oneDigitTag:
- continue;
-
- case stopTag: {
- ++Current;
- ptrdiff_t Offset = 1;
- while (true) {
- unsigned Tag = Current->Prev.getInt();
- switch (Tag) {
- case zeroDigitTag:
- case oneDigitTag:
- ++Current;
- Offset = (Offset << 1) + Tag;
- continue;
- default:
- return Current + Offset;
- }
- }
- }
-
- case fullStopTag:
- return Current;
- }
- }
-}
-
-} // End llvm namespace
+} // namespace llvm
diff --git a/llvm/lib/IR/User.cpp b/llvm/lib/IR/User.cpp
index 4a3eba9e8cf7..7da592f40127 100644
--- a/llvm/lib/IR/User.cpp
+++ b/llvm/lib/IR/User.cpp
@@ -9,6 +9,7 @@
#include "llvm/IR/User.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/IntrinsicInst.h"
namespace llvm {
class BasicBlock;
@@ -39,20 +40,18 @@ void User::replaceUsesOfWith(Value *From, Value *To) {
void User::allocHungoffUses(unsigned N, bool IsPhi) {
assert(HasHungOffUses && "alloc must have hung off uses");
- static_assert(alignof(Use) >= alignof(Use::UserRef),
- "Alignment is insufficient for 'hung-off-uses' pieces");
- static_assert(alignof(Use::UserRef) >= alignof(BasicBlock *),
+ static_assert(alignof(Use) >= alignof(BasicBlock *),
"Alignment is insufficient for 'hung-off-uses' pieces");
- // Allocate the array of Uses, followed by a pointer (with bottom bit set) to
- // the User.
- size_t size = N * sizeof(Use) + sizeof(Use::UserRef);
+ // Allocate the array of Uses
+ size_t size = N * sizeof(Use);
if (IsPhi)
size += N * sizeof(BasicBlock *);
Use *Begin = static_cast<Use*>(::operator new(size));
Use *End = Begin + N;
- (void) new(End) Use::UserRef(const_cast<User*>(this), 1);
- setOperandList(Use::initTags(Begin, End));
+ setOperandList(Begin);
+ for (; Begin != End; Begin++)
+ new (Begin) Use(this);
}
void User::growHungoffUses(unsigned NewNumUses, bool IsPhi) {
@@ -73,10 +72,8 @@ void User::growHungoffUses(unsigned NewNumUses, bool IsPhi) {
// If this is a Phi, then we need to copy the BB pointers too.
if (IsPhi) {
- auto *OldPtr =
- reinterpret_cast<char *>(OldOps + OldNumUses) + sizeof(Use::UserRef);
- auto *NewPtr =
- reinterpret_cast<char *>(NewOps + NewNumUses) + sizeof(Use::UserRef);
+ auto *OldPtr = reinterpret_cast<char *>(OldOps + OldNumUses);
+ auto *NewPtr = reinterpret_cast<char *>(NewOps + NewNumUses);
std::copy(OldPtr, OldPtr + (OldNumUses * sizeof(BasicBlock *)), NewPtr);
}
Use::zap(OldOps, OldOps + OldNumUses, true);
@@ -105,6 +102,12 @@ MutableArrayRef<uint8_t> User::getDescriptor() {
reinterpret_cast<uint8_t *>(DI) - DI->SizeInBytes, DI->SizeInBytes);
}
+bool User::isDroppable() const {
+ if (const auto *Intr = dyn_cast<IntrinsicInst>(this))
+ return Intr->getIntrinsicID() == Intrinsic::assume;
+ return false;
+}
+
//===----------------------------------------------------------------------===//
// User operator new Implementations
//===----------------------------------------------------------------------===//
@@ -128,7 +131,8 @@ void *User::allocateFixedOperandUser(size_t Size, unsigned Us,
Obj->NumUserOperands = Us;
Obj->HasHungOffUses = false;
Obj->HasDescriptor = DescBytes != 0;
- Use::initTags(Start, End);
+ for (; Start != End; Start++)
+ new (Start) Use(Obj);
if (DescBytes != 0) {
auto *DescInfo = reinterpret_cast<DescriptorInfo *>(Storage + DescBytes);
@@ -191,4 +195,4 @@ LLVM_NO_SANITIZE_MEMORY_ATTRIBUTE void User::operator delete(void *Usr) {
}
}
-} // End llvm namespace
+} // namespace llvm
diff --git a/llvm/lib/IR/Value.cpp b/llvm/lib/IR/Value.cpp
index cf9d08f6fc02..efb8d53e8964 100644
--- a/llvm/lib/IR/Value.cpp
+++ b/llvm/lib/IR/Value.cpp
@@ -83,13 +83,17 @@ Value::~Value() {
// reference and something is wrong. This code is here to print out where
// the value is still being referenced.
//
- if (!use_empty()) {
+ // Note that use_empty() cannot be called here, as it eventually downcasts
+ // 'this' to GlobalValue (derived class of Value), but GlobalValue has already
+ // been destructed, so accessing it is UB.
+ //
+ if (!materialized_use_empty()) {
dbgs() << "While deleting: " << *VTy << " %" << getName() << "\n";
for (auto *U : users())
dbgs() << "Use still stuck around after Def is destroyed:" << *U << "\n";
}
#endif
- assert(use_empty() && "Uses remain when a value is destroyed!");
+ assert(materialized_use_empty() && "Uses remain when a value is destroyed!");
// If this value is named, destroy the name. This should not be in a symtab
// at this point.
@@ -107,6 +111,10 @@ void Value::deleteValue() {
static_cast<DerivedUser *>(this)->DeleteValue( \
static_cast<DerivedUser *>(this)); \
break;
+#define HANDLE_CONSTANT(Name) \
+ case Value::Name##Val: \
+ llvm_unreachable("constants should be destroyed with destroyConstant"); \
+ break;
#define HANDLE_INSTRUCTION(Name) /* nothing */
#include "llvm/IR/Value.def"
@@ -124,8 +132,10 @@ void Value::deleteValue() {
void Value::destroyValueName() {
ValueName *Name = getValueName();
- if (Name)
- Name->Destroy();
+ if (Name) {
+ MallocAllocator Allocator;
+ Name->Destroy(Allocator);
+ }
setValueName(nullptr);
}
@@ -137,6 +147,51 @@ bool Value::hasNUsesOrMore(unsigned N) const {
return hasNItemsOrMore(use_begin(), use_end(), N);
}
+static bool isUnDroppableUser(const User *U) { return !U->isDroppable(); }
+
+Use *Value::getSingleUndroppableUse() {
+ Use *Result = nullptr;
+ for (Use &U : uses()) {
+ if (!U.getUser()->isDroppable()) {
+ if (Result)
+ return nullptr;
+ Result = &U;
+ }
+ }
+ return Result;
+}
+
+bool Value::hasNUndroppableUses(unsigned int N) const {
+ return hasNItems(user_begin(), user_end(), N, isUnDroppableUser);
+}
+
+bool Value::hasNUndroppableUsesOrMore(unsigned int N) const {
+ return hasNItemsOrMore(user_begin(), user_end(), N, isUnDroppableUser);
+}
+
+void Value::dropDroppableUses(
+ llvm::function_ref<bool(const Use *)> ShouldDrop) {
+ SmallVector<Use *, 8> ToBeEdited;
+ for (Use &U : uses())
+ if (U.getUser()->isDroppable() && ShouldDrop(&U))
+ ToBeEdited.push_back(&U);
+ for (Use *U : ToBeEdited) {
+ U->removeFromList();
+ if (auto *Assume = dyn_cast<IntrinsicInst>(U->getUser())) {
+ assert(Assume->getIntrinsicID() == Intrinsic::assume);
+ unsigned OpNo = U->getOperandNo();
+ if (OpNo == 0)
+ Assume->setOperand(0, ConstantInt::getTrue(Assume->getContext()));
+ else {
+ Assume->setOperand(OpNo, UndefValue::get(U->get()->getType()));
+ CallInst::BundleOpInfo &BOI = Assume->getBundleOpInfoForOperand(OpNo);
+ BOI.Tag = getContext().pImpl->getOrInsertBundleTag("ignore");
+ }
+ } else
+ llvm_unreachable("unkown droppable use");
+ }
+}
+
bool Value::isUsedInBasicBlock(const BasicBlock *BB) const {
// This can be computed either by scanning the instructions in BB, or by
// scanning the use list of this Value. Both lists can be very long, but
@@ -263,7 +318,8 @@ void Value::setNameImpl(const Twine &NewName) {
destroyValueName();
// Create the new name.
- setValueName(ValueName::Create(NameRef));
+ MallocAllocator Allocator;
+ setValueName(ValueName::Create(NameRef, Allocator));
getValueName()->setValue(this);
return;
}
@@ -463,8 +519,12 @@ enum PointerStripKind {
PSK_InBounds
};
+template <PointerStripKind StripKind> static void NoopCallback(const Value *) {}
+
template <PointerStripKind StripKind>
-static const Value *stripPointerCastsAndOffsets(const Value *V) {
+static const Value *stripPointerCastsAndOffsets(
+ const Value *V,
+ function_ref<void(const Value *)> Func = NoopCallback<StripKind>) {
if (!V->getType()->isPointerTy())
return V;
@@ -474,6 +534,7 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) {
Visited.insert(V);
do {
+ Func(V);
if (auto *GEP = dyn_cast<GEPOperator>(V)) {
switch (StripKind) {
case PSK_ZeroIndices:
@@ -495,6 +556,8 @@ static const Value *stripPointerCastsAndOffsets(const Value *V) {
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast) {
V = cast<Operator>(V)->getOperand(0);
+ if (!V->getType()->isPointerTy())
+ return V;
} else if (StripKind != PSK_ZeroIndicesSameRepresentation &&
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
// TODO: If we know an address space cast will not change the
@@ -547,9 +610,9 @@ const Value *Value::stripPointerCastsAndInvariantGroups() const {
return stripPointerCastsAndOffsets<PSK_ZeroIndicesAndInvariantGroups>(this);
}
-const Value *
-Value::stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset,
- bool AllowNonInbounds) const {
+const Value *Value::stripAndAccumulateConstantOffsets(
+ const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
+ function_ref<bool(Value &, APInt &)> ExternalAnalysis) const {
if (!getType()->isPtrOrPtrVectorTy())
return this;
@@ -575,7 +638,7 @@ Value::stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset,
// of GEP's pointer type rather than the size of the original
// pointer type.
APInt GEPOffset(DL.getIndexTypeSizeInBits(V->getType()), 0);
- if (!GEP->accumulateConstantOffset(DL, GEPOffset))
+ if (!GEP->accumulateConstantOffset(DL, GEPOffset, ExternalAnalysis))
return V;
// Stop traversal if the pointer offset wouldn't fit in the bit-width
@@ -584,7 +647,20 @@ Value::stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset,
if (GEPOffset.getMinSignedBits() > BitWidth)
return V;
- Offset += GEPOffset.sextOrTrunc(BitWidth);
+ // External analysis can return a result higher/lower than the value it
+ // represents. We need to detect overflow/underflow.
+ APInt GEPOffsetST = GEPOffset.sextOrTrunc(BitWidth);
+ if (!ExternalAnalysis) {
+ Offset += GEPOffsetST;
+ } else {
+ bool Overflow = false;
+ APInt OldOffset = Offset;
+ Offset = Offset.sadd_ov(GEPOffsetST, Overflow);
+ if (Overflow) {
+ Offset = OldOffset;
+ return V;
+ }
+ }
V = GEP->getPointerOperand();
} else if (Operator::getOpcode(V) == Instruction::BitCast ||
Operator::getOpcode(V) == Instruction::AddrSpaceCast) {
@@ -602,8 +678,9 @@ Value::stripAndAccumulateConstantOffsets(const DataLayout &DL, APInt &Offset,
return V;
}
-const Value *Value::stripInBoundsOffsets() const {
- return stripPointerCastsAndOffsets<PSK_InBounds>(this);
+const Value *
+Value::stripInBoundsOffsets(function_ref<void(const Value *)> Func) const {
+ return stripPointerCastsAndOffsets<PSK_InBounds>(this, Func);
}
uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
@@ -617,7 +694,7 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
if (DerefBytes == 0 && (A->hasByValAttr() || A->hasStructRetAttr())) {
Type *PT = cast<PointerType>(A->getType())->getElementType();
if (PT->isSized())
- DerefBytes = DL.getTypeStoreSize(PT);
+ DerefBytes = DL.getTypeStoreSize(PT).getKnownMinSize();
}
if (DerefBytes == 0) {
DerefBytes = A->getDereferenceableOrNullBytes();
@@ -658,30 +735,31 @@ uint64_t Value::getPointerDereferenceableBytes(const DataLayout &DL,
}
} else if (auto *AI = dyn_cast<AllocaInst>(this)) {
if (!AI->isArrayAllocation()) {
- DerefBytes = DL.getTypeStoreSize(AI->getAllocatedType());
+ DerefBytes =
+ DL.getTypeStoreSize(AI->getAllocatedType()).getKnownMinSize();
CanBeNull = false;
}
} else if (auto *GV = dyn_cast<GlobalVariable>(this)) {
if (GV->getValueType()->isSized() && !GV->hasExternalWeakLinkage()) {
// TODO: Don't outright reject hasExternalWeakLinkage but set the
// CanBeNull flag.
- DerefBytes = DL.getTypeStoreSize(GV->getValueType());
+ DerefBytes = DL.getTypeStoreSize(GV->getValueType()).getFixedSize();
CanBeNull = false;
}
}
return DerefBytes;
}
-MaybeAlign Value::getPointerAlignment(const DataLayout &DL) const {
+Align Value::getPointerAlignment(const DataLayout &DL) const {
assert(getType()->isPointerTy() && "must be pointer");
if (auto *GO = dyn_cast<GlobalObject>(this)) {
if (isa<Function>(GO)) {
- const MaybeAlign FunctionPtrAlign = DL.getFunctionPtrAlign();
+ Align FunctionPtrAlign = DL.getFunctionPtrAlign().valueOrOne();
switch (DL.getFunctionPtrAlignType()) {
case DataLayout::FunctionPtrAlignType::Independent:
return FunctionPtrAlign;
case DataLayout::FunctionPtrAlignType::MultipleOfFunctionAlign:
- return std::max(FunctionPtrAlign, MaybeAlign(GO->getAlignment()));
+ return std::max(FunctionPtrAlign, GO->getAlign().valueOrOne());
}
llvm_unreachable("Unhandled FunctionPtrAlignType");
}
@@ -694,43 +772,47 @@ MaybeAlign Value::getPointerAlignment(const DataLayout &DL) const {
// it the preferred alignment. Otherwise, we have to assume that it
// may only have the minimum ABI alignment.
if (GVar->isStrongDefinitionForLinker())
- return MaybeAlign(DL.getPreferredAlignment(GVar));
+ return DL.getPreferredAlign(GVar);
else
- return Align(DL.getABITypeAlignment(ObjectType));
+ return DL.getABITypeAlign(ObjectType);
}
}
}
- return Alignment;
+ return Alignment.valueOrOne();
} else if (const Argument *A = dyn_cast<Argument>(this)) {
- const MaybeAlign Alignment(A->getParamAlignment());
+ const MaybeAlign Alignment = A->getParamAlign();
if (!Alignment && A->hasStructRetAttr()) {
// An sret parameter has at least the ABI alignment of the return type.
Type *EltTy = cast<PointerType>(A->getType())->getElementType();
if (EltTy->isSized())
- return Align(DL.getABITypeAlignment(EltTy));
+ return DL.getABITypeAlign(EltTy);
}
- return Alignment;
+ return Alignment.valueOrOne();
} else if (const AllocaInst *AI = dyn_cast<AllocaInst>(this)) {
- const MaybeAlign Alignment(AI->getAlignment());
- if (!Alignment) {
- Type *AllocatedType = AI->getAllocatedType();
- if (AllocatedType->isSized())
- return MaybeAlign(DL.getPrefTypeAlignment(AllocatedType));
- }
- return Alignment;
+ return AI->getAlign();
} else if (const auto *Call = dyn_cast<CallBase>(this)) {
- const MaybeAlign Alignment(Call->getRetAlignment());
+ MaybeAlign Alignment = Call->getRetAlign();
if (!Alignment && Call->getCalledFunction())
- return MaybeAlign(
- Call->getCalledFunction()->getAttributes().getRetAlignment());
- return Alignment;
+ Alignment = Call->getCalledFunction()->getAttributes().getRetAlignment();
+ return Alignment.valueOrOne();
} else if (const LoadInst *LI = dyn_cast<LoadInst>(this)) {
if (MDNode *MD = LI->getMetadata(LLVMContext::MD_align)) {
ConstantInt *CI = mdconst::extract<ConstantInt>(MD->getOperand(0));
- return MaybeAlign(CI->getLimitedValue());
+ return Align(CI->getLimitedValue());
+ }
+ } else if (auto *CstPtr = dyn_cast<Constant>(this)) {
+ if (auto *CstInt = dyn_cast_or_null<ConstantInt>(ConstantExpr::getPtrToInt(
+ const_cast<Constant *>(CstPtr), DL.getIntPtrType(getType()),
+ /*OnlyIfReduced=*/true))) {
+ size_t TrailingZeros = CstInt->getValue().countTrailingZeros();
+ // While the actual alignment may be large, elsewhere we have
+ // an arbitrary upper alignment limit, so let's clamp to it.
+ return Align(TrailingZeros < Value::MaxAlignmentExponent
+ ? uint64_t(1) << TrailingZeros
+ : Value::MaximumAlignment);
}
}
- return llvm::None;
+ return Align(1);
}
const Value *Value::DoPHITranslation(const BasicBlock *CurBB,
@@ -754,12 +836,12 @@ void Value::reverseUseList() {
while (Current) {
Use *Next = Current->Next;
Current->Next = Head;
- Head->setPrev(&Current->Next);
+ Head->Prev = &Current->Next;
Head = Current;
Current = Next;
}
UseList = Head;
- Head->setPrev(&UseList);
+ Head->Prev = &UseList;
}
bool Value::isSwiftError() const {
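
(Not part of the patch: a minimal sketch of the new droppable-use queries on Value, assuming llvm.assume is the only droppable user, as User::isDroppable above defines.)

#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
using namespace llvm;

// True when V has exactly one user besides llvm.assume calls.
static bool hasSingleRealUser(const Value &V) {
  return V.hasNUndroppableUses(1);
}

// Returns the unique non-assume use of V, or null if there is none or more
// than one.
static Use *findSingleRealUse(Value &V) {
  return V.getSingleUndroppableUse();
}

// Rewrites every assume that references V so it no longer does, leaving all
// other users untouched.
static void dropAllAssumeUses(Value &V) {
  V.dropDroppableUses([](const Use *) { return true; });
}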
diff --git a/llvm/lib/IR/ValueSymbolTable.cpp b/llvm/lib/IR/ValueSymbolTable.cpp
index 417ec045071d..b49842315f36 100644
--- a/llvm/lib/IR/ValueSymbolTable.cpp
+++ b/llvm/lib/IR/ValueSymbolTable.cpp
@@ -31,7 +31,7 @@ using namespace llvm;
// Class destructor
ValueSymbolTable::~ValueSymbolTable() {
-#ifndef NDEBUG // Only do this in -g mode...
+#ifndef NDEBUG // Only do this in -g mode...
for (const auto &VI : vmap)
dbgs() << "Value still in symbol table! Type = '"
<< *VI.getValue()->getType() << "' Name = '" << VI.getKeyData()
@@ -69,7 +69,7 @@ ValueName *ValueSymbolTable::makeUniqueName(Value *V,
// Insert a value into the symbol table with the specified name...
//
-void ValueSymbolTable::reinsertValue(Value* V) {
+void ValueSymbolTable::reinsertValue(Value *V) {
assert(V->hasName() && "Can't insert nameless Value into symbol table");
// Try inserting the name, assuming it won't conflict.
@@ -83,7 +83,8 @@ void ValueSymbolTable::reinsertValue(Value* V) {
SmallString<256> UniqueName(V->getName().begin(), V->getName().end());
// The name is already in use, just free it so we can allocate a new name.
- V->getValueName()->Destroy();
+ MallocAllocator Allocator;
+ V->getValueName()->Destroy(Allocator);
ValueName *VN = makeUniqueName(V, UniqueName);
V->setValueName(VN);
@@ -116,11 +117,11 @@ ValueName *ValueSymbolTable::createValueName(StringRef Name, Value *V) {
// dump - print out the symbol table
//
LLVM_DUMP_METHOD void ValueSymbolTable::dump() const {
- //dbgs() << "ValueSymbolTable:\n";
+ // dbgs() << "ValueSymbolTable:\n";
for (const auto &I : *this) {
- //dbgs() << " '" << I->getKeyData() << "' = ";
+ // dbgs() << " '" << I->getKeyData() << "' = ";
I.getValue()->dump();
- //dbgs() << "\n";
+ // dbgs() << "\n";
}
}
#endif
diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp
index d15b70d71b47..6df1072925f9 100644
--- a/llvm/lib/IR/Verifier.cpp
+++ b/llvm/lib/IR/Verifier.cpp
@@ -397,6 +397,9 @@ public:
}
private:
+ /// Whether a metadata node is allowed to be, or contain, a DILocation.
+ enum class AreDebugLocsAllowed { No, Yes };
+
// Verification methods...
void visitGlobalValue(const GlobalValue &GV);
void visitGlobalVariable(const GlobalVariable &GV);
@@ -405,7 +408,7 @@ private:
void visitAliaseeSubExpr(SmallPtrSetImpl<const GlobalAlias *> &Visited,
const GlobalAlias &A, const Constant &C);
void visitNamedMDNode(const NamedMDNode &NMD);
- void visitMDNode(const MDNode &MD);
+ void visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs);
void visitMetadataAsValue(const MetadataAsValue &MD, Function *F);
void visitValueAsMetadata(const ValueAsMetadata &MD, Function *F);
void visitComdat(const Comdat &C);
@@ -567,8 +570,9 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
Assert(!GV.isDeclaration() || GV.hasValidDeclarationLinkage(),
"Global is external, but doesn't have external or weak linkage!", &GV);
- Assert(GV.getAlignment() <= Value::MaximumAlignment,
- "huge alignment values are unsupported", &GV);
+ if (const GlobalObject *GO = dyn_cast<GlobalObject>(&GV))
+ Assert(GO->getAlignment() <= Value::MaximumAlignment,
+ "huge alignment values are unsupported", GO);
Assert(!GV.hasAppendingLinkage() || isa<GlobalVariable>(GV),
"Only global variables can have appending linkage!", &GV);
@@ -590,15 +594,12 @@ void Verifier::visitGlobalValue(const GlobalValue &GV) {
"Global is marked as dllimport, but not external", &GV);
}
- if (GV.hasLocalLinkage())
+ if (GV.isImplicitDSOLocal())
Assert(GV.isDSOLocal(),
- "GlobalValue with private or internal linkage must be dso_local!",
+ "GlobalValue with local linkage or non-default "
+ "visibility must be dso_local!",
&GV);
- if (!GV.hasDefaultVisibility() && !GV.hasExternalWeakLinkage())
- Assert(GV.isDSOLocal(),
- "GlobalValue with non default visibility must be dso_local!", &GV);
-
forEachUser(&GV, GlobalValueVisited, [&](const Value *V) -> bool {
if (const Instruction *I = dyn_cast<Instruction>(V)) {
if (!I->getParent() || !I->getParent()->getParent())
@@ -701,8 +702,8 @@ void Verifier::visitGlobalVariable(const GlobalVariable &GV) {
// the runtime size. If the global is a struct or an array containing
// scalable vectors, that will be caught by the isValidElementType methods
// in StructType or ArrayType instead.
- if (auto *VTy = dyn_cast<VectorType>(GV.getValueType()))
- Assert(!VTy->isScalable(), "Globals cannot contain scalable vectors", &GV);
+ Assert(!isa<ScalableVectorType>(GV.getValueType()),
+ "Globals cannot contain scalable vectors", &GV);
if (!GV.hasInitializer()) {
visitGlobalValue(GV);
@@ -783,11 +784,11 @@ void Verifier::visitNamedMDNode(const NamedMDNode &NMD) {
if (!MD)
continue;
- visitMDNode(*MD);
+ visitMDNode(*MD, AreDebugLocsAllowed::Yes);
}
}
-void Verifier::visitMDNode(const MDNode &MD) {
+void Verifier::visitMDNode(const MDNode &MD, AreDebugLocsAllowed AllowLocs) {
// Only visit each node once. Metadata can be mutually recursive, so this
// avoids infinite recursion here, as well as being an optimization.
if (!MDNodes.insert(&MD).second)
@@ -810,8 +811,10 @@ void Verifier::visitMDNode(const MDNode &MD) {
continue;
Assert(!isa<LocalAsMetadata>(Op), "Invalid operand for global metadata!",
&MD, Op);
+ AssertDI(!isa<DILocation>(Op) || AllowLocs == AreDebugLocsAllowed::Yes,
+ "DILocation not allowed within this metadata node", &MD, Op);
if (auto *N = dyn_cast<MDNode>(Op)) {
- visitMDNode(*N);
+ visitMDNode(*N, AllowLocs);
continue;
}
if (auto *V = dyn_cast<ValueAsMetadata>(Op)) {
@@ -854,7 +857,7 @@ void Verifier::visitValueAsMetadata(const ValueAsMetadata &MD, Function *F) {
void Verifier::visitMetadataAsValue(const MetadataAsValue &MDV, Function *F) {
Metadata *MD = MDV.getMetadata();
if (auto *N = dyn_cast<MDNode>(MD)) {
- visitMDNode(*N);
+ visitMDNode(*N, AreDebugLocsAllowed::No);
return;
}
@@ -891,12 +894,30 @@ void Verifier::visitDIScope(const DIScope &N) {
void Verifier::visitDISubrange(const DISubrange &N) {
AssertDI(N.getTag() == dwarf::DW_TAG_subrange_type, "invalid tag", &N);
+ AssertDI(N.getRawCountNode() || N.getRawUpperBound(),
+ "Subrange must contain count or upperBound", &N);
+ AssertDI(!N.getRawCountNode() || !N.getRawUpperBound(),
+ "Subrange can have any one of count or upperBound", &N);
+ AssertDI(!N.getRawCountNode() || N.getCount(),
+ "Count must either be a signed constant or a DIVariable", &N);
auto Count = N.getCount();
- AssertDI(Count, "Count must either be a signed constant or a DIVariable",
- &N);
- AssertDI(!Count.is<ConstantInt*>() ||
- Count.get<ConstantInt*>()->getSExtValue() >= -1,
+ AssertDI(!Count || !Count.is<ConstantInt *>() ||
+ Count.get<ConstantInt *>()->getSExtValue() >= -1,
"invalid subrange count", &N);
+ auto *LBound = N.getRawLowerBound();
+ AssertDI(!LBound || isa<ConstantAsMetadata>(LBound) ||
+ isa<DIVariable>(LBound) || isa<DIExpression>(LBound),
+ "LowerBound must be signed constant or DIVariable or DIExpression",
+ &N);
+ auto *UBound = N.getRawUpperBound();
+ AssertDI(!UBound || isa<ConstantAsMetadata>(UBound) ||
+ isa<DIVariable>(UBound) || isa<DIExpression>(UBound),
+ "UpperBound must be signed constant or DIVariable or DIExpression",
+ &N);
+ auto *Stride = N.getRawStride();
+ AssertDI(!Stride || isa<ConstantAsMetadata>(Stride) ||
+ isa<DIVariable>(Stride) || isa<DIExpression>(Stride),
+ "Stride must be signed constant or DIVariable or DIExpression", &N);
}
void Verifier::visitDIEnumerator(const DIEnumerator &N) {
@@ -1009,6 +1030,11 @@ void Verifier::visitDICompositeType(const DICompositeType &N) {
AssertDI(isa<DIDerivedType>(D) && N.getTag() == dwarf::DW_TAG_variant_part,
"discriminator can only appear on variant part");
}
+
+ if (N.getRawDataLocation()) {
+ AssertDI(N.getTag() == dwarf::DW_TAG_array_type,
+ "dataLocation can only appear in array type");
+ }
}
void Verifier::visitDISubroutineType(const DISubroutineType &N) {
@@ -1037,6 +1063,9 @@ void Verifier::visitDIFile(const DIFile &N) {
case DIFile::CSK_SHA1:
Size = 40;
break;
+ case DIFile::CSK_SHA256:
+ Size = 64;
+ break;
}
AssertDI(Checksum->Value.size() == Size, "invalid checksum length", &N);
AssertDI(Checksum->Value.find_if_not(llvm::isHexDigit) == StringRef::npos,
@@ -1250,7 +1279,9 @@ void Verifier::visitDIGlobalVariable(const DIGlobalVariable &N) {
AssertDI(N.getTag() == dwarf::DW_TAG_variable, "invalid tag", &N);
AssertDI(isType(N.getRawType()), "invalid type ref", &N, N.getRawType());
- AssertDI(N.getType(), "missing global variable type", &N);
+ // Assert only if the global variable is not an extern
+ if (N.isDefinition())
+ AssertDI(N.getType(), "missing global variable type", &N);
if (auto *Member = N.getRawStaticDataMemberDeclaration()) {
AssertDI(isa<DIDerivedType>(Member),
"invalid static data member declaration", &N, Member);
@@ -1476,6 +1507,13 @@ Verifier::visitModuleFlag(const MDNode *Op,
"'Linker Options' named metadata no longer supported");
}
+ if (ID->getString() == "SemanticInterposition") {
+ ConstantInt *Value =
+ mdconst::dyn_extract_or_null<ConstantInt>(Op->getOperand(2));
+ Assert(Value,
+ "SemanticInterposition metadata requires constant integer argument");
+ }
+
if (ID->getString() == "CG Profile") {
for (const MDOperand &MDO : cast<MDNode>(Op->getOperand(2))->operands())
visitModuleFlagCGProfileEntry(MDO);
@@ -1502,6 +1540,7 @@ void Verifier::visitModuleFlagCGProfileEntry(const MDOperand &MDO) {
/// Return true if this attribute kind only applies to functions.
static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
switch (Kind) {
+ case Attribute::NoMerge:
case Attribute::NoReturn:
case Attribute::NoSync:
case Attribute::WillReturn:
@@ -1545,6 +1584,7 @@ static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
case Attribute::SpeculativeLoadHardening:
case Attribute::Speculatable:
case Attribute::StrictFP:
+ case Attribute::NullPointerIsValid:
return true;
default:
break;
@@ -1556,7 +1596,8 @@ static bool isFuncOnlyAttr(Attribute::AttrKind Kind) {
/// arguments.
static bool isFuncOrArgAttr(Attribute::AttrKind Kind) {
return Kind == Attribute::ReadOnly || Kind == Attribute::WriteOnly ||
- Kind == Attribute::ReadNone || Kind == Attribute::NoFree;
+ Kind == Attribute::ReadNone || Kind == Attribute::NoFree ||
+ Kind == Attribute::Preallocated;
}
void Verifier::verifyAttributeTypes(AttributeSet Attrs, bool IsFunction,
@@ -1565,6 +1606,13 @@ void Verifier::verifyAttributeTypes(AttributeSet Attrs, bool IsFunction,
if (A.isStringAttribute())
continue;
+ if (A.isIntAttribute() !=
+ Attribute::doesAttrKindHaveArgument(A.getKindAsEnum())) {
+ CheckFailed("Attribute '" + A.getAsString() + "' should have an Argument",
+ V);
+ return;
+ }
+
if (isFuncOnlyAttr(A.getKindAsEnum())) {
if (!IsFunction) {
CheckFailed("Attribute '" + A.getAsString() +
@@ -1600,11 +1648,13 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
unsigned AttrCount = 0;
AttrCount += Attrs.hasAttribute(Attribute::ByVal);
AttrCount += Attrs.hasAttribute(Attribute::InAlloca);
+ AttrCount += Attrs.hasAttribute(Attribute::Preallocated);
AttrCount += Attrs.hasAttribute(Attribute::StructRet) ||
Attrs.hasAttribute(Attribute::InReg);
AttrCount += Attrs.hasAttribute(Attribute::Nest);
- Assert(AttrCount <= 1, "Attributes 'byval', 'inalloca', 'inreg', 'nest', "
- "and 'sret' are incompatible!",
+ Assert(AttrCount <= 1,
+ "Attributes 'byval', 'inalloca', 'preallocated', 'inreg', 'nest', "
+ "and 'sret' are incompatible!",
V);
Assert(!(Attrs.hasAttribute(Attribute::InAlloca) &&
@@ -1654,6 +1704,12 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
"Attribute 'byval' type does not match parameter!", V);
}
+ if (Attrs.hasAttribute(Attribute::Preallocated)) {
+ Assert(Attrs.getPreallocatedType() ==
+ cast<PointerType>(Ty)->getElementType(),
+ "Attribute 'preallocated' type does not match parameter!", V);
+ }
+
AttrBuilder IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty);
Assert(!AttrBuilder(Attrs).overlaps(IncompatibleAttrs),
"Wrong types for attribute: " +
@@ -1664,8 +1720,10 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty,
SmallPtrSet<Type*, 4> Visited;
if (!PTy->getElementType()->isSized(&Visited)) {
Assert(!Attrs.hasAttribute(Attribute::ByVal) &&
- !Attrs.hasAttribute(Attribute::InAlloca),
- "Attributes 'byval' and 'inalloca' do not support unsized types!",
+ !Attrs.hasAttribute(Attribute::InAlloca) &&
+ !Attrs.hasAttribute(Attribute::Preallocated),
+ "Attributes 'byval', 'inalloca', and 'preallocated' do not "
+ "support unsized types!",
V);
}
if (!isa<PointerType>(PTy->getElementType()))
@@ -1706,9 +1764,11 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
!RetAttrs.hasAttribute(Attribute::NoFree) &&
!RetAttrs.hasAttribute(Attribute::Returned) &&
!RetAttrs.hasAttribute(Attribute::InAlloca) &&
+ !RetAttrs.hasAttribute(Attribute::Preallocated) &&
!RetAttrs.hasAttribute(Attribute::SwiftSelf) &&
!RetAttrs.hasAttribute(Attribute::SwiftError)),
- "Attributes 'byval', 'inalloca', 'nest', 'sret', 'nocapture', 'nofree'"
+ "Attributes 'byval', 'inalloca', 'preallocated', 'nest', 'sret', "
+ "'nocapture', 'nofree', "
"'returned', 'swiftself', and 'swifterror' do not apply to return "
"values!",
V);
@@ -1852,16 +1912,25 @@ void Verifier::verifyFunctionAttrs(FunctionType *FT, AttributeList Attrs,
CheckFailed("invalid value for 'frame-pointer' attribute: " + FP, V);
}
+ if (Attrs.hasFnAttribute("patchable-function-prefix")) {
+ StringRef S = Attrs
+ .getAttribute(AttributeList::FunctionIndex,
+ "patchable-function-prefix")
+ .getValueAsString();
+ unsigned N;
+ if (S.getAsInteger(10, N))
+ CheckFailed(
+ "\"patchable-function-prefix\" takes an unsigned integer: " + S, V);
+ }
if (Attrs.hasFnAttribute("patchable-function-entry")) {
- StringRef S0 = Attrs
- .getAttribute(AttributeList::FunctionIndex,
- "patchable-function-entry")
- .getValueAsString();
- StringRef S = S0;
+ StringRef S = Attrs
+ .getAttribute(AttributeList::FunctionIndex,
+ "patchable-function-entry")
+ .getValueAsString();
unsigned N;
if (S.getAsInteger(10, N))
CheckFailed(
- "\"patchable-function-entry\" takes an unsigned integer: " + S0, V);
+ "\"patchable-function-entry\" takes an unsigned integer: " + S, V);
}
}
@@ -2037,6 +2106,13 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
"gc.statepoint number of transition arguments must be positive", Call);
const int EndTransitionArgsInx = EndCallArgsInx + 1 + NumTransitionArgs;
+ // We're migrating away from inline operands to operand bundles, enforce
+ // the either/or property during transition.
+ if (Call.getOperandBundle(LLVMContext::OB_gc_transition)) {
+ Assert(NumTransitionArgs == 0,
+ "can't use both deopt operands and deopt bundle on a statepoint");
+ }
+
const Value *NumDeoptArgsV = Call.getArgOperand(EndTransitionArgsInx + 1);
Assert(isa<ConstantInt>(NumDeoptArgsV),
"gc.statepoint number of deoptimization arguments "
@@ -2048,6 +2124,13 @@ void Verifier::verifyStatepoint(const CallBase &Call) {
"must be positive",
Call);
+ // We're migrating away from inline operands to operand bundles, enforce
+ // the either/or property during transition.
+ if (Call.getOperandBundle(LLVMContext::OB_deopt)) {
+ Assert(NumDeoptArgs == 0,
+ "can't use both deopt operands and deopt bundle on a statepoint");
+ }
+
const int ExpectedNumArgs =
7 + NumCallArgs + NumTransitionArgs + NumDeoptArgs;
Assert(ExpectedNumArgs <= (int)Call.arg_size(),
@@ -2277,7 +2360,7 @@ void Verifier::visitFunction(const Function &F) {
"function declaration may not have a !prof attachment", &F);
// Verify the metadata itself.
- visitMDNode(*I.second);
+ visitMDNode(*I.second, AreDebugLocsAllowed::Yes);
}
Assert(!F.hasPersonalityFn(),
"Function declaration shouldn't have a personality routine", &F);
@@ -2301,6 +2384,7 @@ void Verifier::visitFunction(const Function &F) {
// Visit metadata attachments.
for (const auto &I : MDs) {
// Verify that the attachment is legal.
+ auto AllowLocs = AreDebugLocsAllowed::No;
switch (I.first) {
default:
break;
@@ -2315,6 +2399,7 @@ void Verifier::visitFunction(const Function &F) {
AssertDI(!AttachedTo || AttachedTo == &F,
"DISubprogram attached to more than one function", SP, &F);
AttachedTo = &F;
+ AllowLocs = AreDebugLocsAllowed::Yes;
break;
}
case LLVMContext::MD_prof:
@@ -2325,7 +2410,7 @@ void Verifier::visitFunction(const Function &F) {
}
// Verify the metadata itself.
- visitMDNode(*I.second);
+ visitMDNode(*I.second, AllowLocs);
}
}
@@ -2344,8 +2429,7 @@ void Verifier::visitFunction(const Function &F) {
if (!HasDebugInfo)
return;
- // Check that all !dbg attachments lead to back to N (or, at least, another
- // subprogram that describes the same function).
+ // Check that all !dbg attachments lead back to N.
//
// FIXME: Check this incrementally while visiting !dbg attachments.
// FIXME: Only check when N is the canonical subprogram for F.
@@ -2363,18 +2447,20 @@ void Verifier::visitFunction(const Function &F) {
AssertDI(Parent && isa<DILocalScope>(Parent),
"DILocation's scope must be a DILocalScope", N, &F, &I, DL,
Parent);
+
DILocalScope *Scope = DL->getInlinedAtScope();
- if (Scope && !Seen.insert(Scope).second)
+ Assert(Scope, "Failed to find DILocalScope", DL);
+
+ if (!Seen.insert(Scope).second)
return;
- DISubprogram *SP = Scope ? Scope->getSubprogram() : nullptr;
+ DISubprogram *SP = Scope->getSubprogram();
// Scope and SP could be the same MDNode and we don't want to skip
// validation in that case
if (SP && ((Scope != SP) && !Seen.insert(SP).second))
return;
- // FIXME: Once N is canonical, check "SP == &N".
AssertDI(SP->describes(&F),
"!dbg attachment points at wrong subprogram for function", N, &F,
&I, DL, Scope, SP);
@@ -2513,8 +2599,6 @@ void Verifier::visitIndirectBrInst(IndirectBrInst &BI) {
void Verifier::visitCallBrInst(CallBrInst &CBI) {
Assert(CBI.isInlineAsm(), "Callbr is currently only used for asm-goto!",
&CBI);
- Assert(CBI.getType()->isVoidTy(), "Callbr return value is not supported!",
- &CBI);
for (unsigned i = 0, e = CBI.getNumSuccessors(); i != e; ++i)
Assert(CBI.getSuccessor(i)->getType()->isLabelTy(),
"Callbr successors must all have pointer type!", &CBI);
@@ -2532,8 +2616,7 @@ void Verifier::visitCallBrInst(CallBrInst &CBI) {
if (auto *BA = dyn_cast<BlockAddress>(V))
ArgBBs.insert(BA->getBasicBlock());
for (BasicBlock *BB : CBI.getIndirectDests())
- Assert(ArgBBs.find(BB) != ArgBBs.end(),
- "Indirect label missing from arglist.", &CBI);
+ Assert(ArgBBs.count(BB), "Indirect label missing from arglist.", &CBI);
}
visitTerminator(CBI);
@@ -2661,8 +2744,8 @@ void Verifier::visitUIToFPInst(UIToFPInst &I) {
&I);
if (SrcVec && DstVec)
- Assert(cast<VectorType>(SrcTy)->getNumElements() ==
- cast<VectorType>(DestTy)->getNumElements(),
+ Assert(cast<VectorType>(SrcTy)->getElementCount() ==
+ cast<VectorType>(DestTy)->getElementCount(),
"UIToFP source and dest vector length mismatch", &I);
visitInstruction(I);
@@ -2684,8 +2767,8 @@ void Verifier::visitSIToFPInst(SIToFPInst &I) {
&I);
if (SrcVec && DstVec)
- Assert(cast<VectorType>(SrcTy)->getNumElements() ==
- cast<VectorType>(DestTy)->getNumElements(),
+ Assert(cast<VectorType>(SrcTy)->getElementCount() ==
+ cast<VectorType>(DestTy)->getElementCount(),
"SIToFP source and dest vector length mismatch", &I);
visitInstruction(I);
@@ -2707,8 +2790,8 @@ void Verifier::visitFPToUIInst(FPToUIInst &I) {
"FPToUI result must be integer or integer vector", &I);
if (SrcVec && DstVec)
- Assert(cast<VectorType>(SrcTy)->getNumElements() ==
- cast<VectorType>(DestTy)->getNumElements(),
+ Assert(cast<VectorType>(SrcTy)->getElementCount() ==
+ cast<VectorType>(DestTy)->getElementCount(),
"FPToUI source and dest vector length mismatch", &I);
visitInstruction(I);
@@ -2730,8 +2813,8 @@ void Verifier::visitFPToSIInst(FPToSIInst &I) {
"FPToSI result must be integer or integer vector", &I);
if (SrcVec && DstVec)
- Assert(cast<VectorType>(SrcTy)->getNumElements() ==
- cast<VectorType>(DestTy)->getNumElements(),
+ Assert(cast<VectorType>(SrcTy)->getElementCount() ==
+ cast<VectorType>(DestTy)->getElementCount(),
"FPToSI source and dest vector length mismatch", &I);
visitInstruction(I);
@@ -2753,9 +2836,9 @@ void Verifier::visitPtrToIntInst(PtrToIntInst &I) {
&I);
if (SrcTy->isVectorTy()) {
- VectorType *VSrc = cast<VectorType>(SrcTy);
- VectorType *VDest = cast<VectorType>(DestTy);
- Assert(VSrc->getNumElements() == VDest->getNumElements(),
+ auto *VSrc = cast<VectorType>(SrcTy);
+ auto *VDest = cast<VectorType>(DestTy);
+ Assert(VSrc->getElementCount() == VDest->getElementCount(),
"PtrToInt Vector width mismatch", &I);
}
@@ -2778,9 +2861,9 @@ void Verifier::visitIntToPtrInst(IntToPtrInst &I) {
Assert(SrcTy->isVectorTy() == DestTy->isVectorTy(), "IntToPtr type mismatch",
&I);
if (SrcTy->isVectorTy()) {
- VectorType *VSrc = cast<VectorType>(SrcTy);
- VectorType *VDest = cast<VectorType>(DestTy);
- Assert(VSrc->getNumElements() == VDest->getNumElements(),
+ auto *VSrc = cast<VectorType>(SrcTy);
+ auto *VDest = cast<VectorType>(DestTy);
+ Assert(VSrc->getElementCount() == VDest->getElementCount(),
"IntToPtr Vector width mismatch", &I);
}
visitInstruction(I);
@@ -2803,8 +2886,9 @@ void Verifier::visitAddrSpaceCastInst(AddrSpaceCastInst &I) {
&I);
Assert(SrcTy->getPointerAddressSpace() != DestTy->getPointerAddressSpace(),
"AddrSpaceCast must be between different address spaces", &I);
- if (SrcTy->isVectorTy())
- Assert(SrcTy->getVectorNumElements() == DestTy->getVectorNumElements(),
+ if (auto *SrcVTy = dyn_cast<VectorType>(SrcTy))
+ Assert(SrcVTy->getNumElements() ==
+ cast<VectorType>(DestTy)->getNumElements(),
"AddrSpaceCast vector pointer number of elements mismatch", &I);
visitInstruction(I);
}
@@ -2836,9 +2920,9 @@ void Verifier::visitPHINode(PHINode &PN) {
}
void Verifier::visitCallBase(CallBase &Call) {
- Assert(Call.getCalledValue()->getType()->isPointerTy(),
+ Assert(Call.getCalledOperand()->getType()->isPointerTy(),
"Called function must be a pointer!", Call);
- PointerType *FPTy = cast<PointerType>(Call.getCalledValue()->getType());
+ PointerType *FPTy = cast<PointerType>(Call.getCalledOperand()->getType());
Assert(FPTy->getElementType()->isFunctionTy(),
"Called function is not pointer to function type!", Call);
@@ -2871,16 +2955,23 @@ void Verifier::visitCallBase(CallBase &Call) {
bool IsIntrinsic = Call.getCalledFunction() &&
Call.getCalledFunction()->getName().startswith("llvm.");
- Function *Callee
- = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts());
+ Function *Callee =
+ dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
- if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::Speculatable)) {
+ if (Attrs.hasFnAttribute(Attribute::Speculatable)) {
// Don't allow speculatable on call sites, unless the underlying function
// declaration is also speculatable.
Assert(Callee && Callee->isSpeculatable(),
"speculatable attribute may not apply to call sites", Call);
}
+ if (Attrs.hasFnAttribute(Attribute::Preallocated)) {
+ Assert(Call.getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::call_preallocated_arg,
+ "preallocated as a call site attribute can only be on "
+ "llvm.call.preallocated.arg");
+ }
+
// Verify call attributes.
verifyFunctionAttrs(FTy, Attrs, &Call, IsIntrinsic);
@@ -2927,6 +3018,17 @@ void Verifier::visitCallBase(CallBase &Call) {
Assert(isa<ConstantInt>(ArgVal) || isa<ConstantFP>(ArgVal),
"immarg operand has non-immediate parameter", ArgVal, Call);
}
+
+ if (Call.paramHasAttr(i, Attribute::Preallocated)) {
+ Value *ArgVal = Call.getArgOperand(i);
+ bool hasOB =
+ Call.countOperandBundlesOfType(LLVMContext::OB_preallocated) != 0;
+ bool isMustTail = Call.isMustTailCall();
+ Assert(hasOB != isMustTail,
+ "preallocated operand either requires a preallocated bundle or "
+ "the call to be musttail (but not both)",
+ ArgVal, Call);
+ }
}
if (FTy->isVarArg()) {
@@ -2997,9 +3099,11 @@ void Verifier::visitCallBase(CallBase &Call) {
visitIntrinsicCall(ID, Call);
// Verify that a callsite has at most one "deopt", at most one "funclet", at
- // most one "gc-transition", and at most one "cfguardtarget" operand bundle.
+ // most one "gc-transition", at most one "cfguardtarget",
+ // and at most one "preallocated" operand bundle.
bool FoundDeoptBundle = false, FoundFuncletBundle = false,
- FoundGCTransitionBundle = false, FoundCFGuardTargetBundle = false;
+ FoundGCTransitionBundle = false, FoundCFGuardTargetBundle = false,
+ FoundPreallocatedBundle = false, FoundGCLiveBundle = false;
for (unsigned i = 0, e = Call.getNumOperandBundles(); i < e; ++i) {
OperandBundleUse BU = Call.getOperandBundleAt(i);
uint32_t Tag = BU.getTagID();
@@ -3024,6 +3128,22 @@ void Verifier::visitCallBase(CallBase &Call) {
FoundCFGuardTargetBundle = true;
Assert(BU.Inputs.size() == 1,
"Expected exactly one cfguardtarget bundle operand", Call);
+ } else if (Tag == LLVMContext::OB_preallocated) {
+ Assert(!FoundPreallocatedBundle, "Multiple preallocated operand bundles",
+ Call);
+ FoundPreallocatedBundle = true;
+ Assert(BU.Inputs.size() == 1,
+ "Expected exactly one preallocated bundle operand", Call);
+ auto Input = dyn_cast<IntrinsicInst>(BU.Inputs.front());
+ Assert(Input &&
+ Input->getIntrinsicID() == Intrinsic::call_preallocated_setup,
+ "\"preallocated\" argument must be a token from "
+ "llvm.call.preallocated.setup",
+ Call);
+ } else if (Tag == LLVMContext::OB_gc_live) {
+ Assert(!FoundGCLiveBundle, "Multiple gc-live operand bundles",
+ Call);
+ FoundGCLiveBundle = true;
}
}
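
As a rough illustration of the bundle rule enforced above, a standalone helper performing the same "preallocated" check on one call site could look like the sketch below (assumes the LLVM C++ API as of this patch; the helper itself is hypothetical):

// Illustrative only: mirrors the verifier's "preallocated" operand bundle
// rule -- exactly one input, and it must be the token produced by
// llvm.call.preallocated.setup.
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/LLVMContext.h"

static bool hasWellFormedPreallocatedBundle(const llvm::CallBase &CB) {
  auto Bundle = CB.getOperandBundle(llvm::LLVMContext::OB_preallocated);
  if (!Bundle || Bundle->Inputs.size() != 1)
    return false; // exactly one token operand is required
  auto *Setup = llvm::dyn_cast<llvm::IntrinsicInst>(Bundle->Inputs.front());
  return Setup && Setup->getIntrinsicID() ==
                      llvm::Intrinsic::call_preallocated_setup;
}
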
@@ -3054,15 +3174,17 @@ static bool isTypeCongruent(Type *L, Type *R) {
static AttrBuilder getParameterABIAttributes(int I, AttributeList Attrs) {
static const Attribute::AttrKind ABIAttrs[] = {
- Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
- Attribute::InReg, Attribute::Returned, Attribute::SwiftSelf,
- Attribute::SwiftError};
+ Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
+ Attribute::InReg, Attribute::SwiftSelf, Attribute::SwiftError,
+ Attribute::Preallocated};
AttrBuilder Copy;
for (auto AK : ABIAttrs) {
if (Attrs.hasParamAttribute(I, AK))
Copy.addAttribute(AK);
}
- if (Attrs.hasParamAttribute(I, Attribute::Alignment))
+ // `align` is ABI-affecting only in combination with `byval`.
+ if (Attrs.hasParamAttribute(I, Attribute::Alignment) &&
+ Attrs.hasParamAttribute(I, Attribute::ByVal))
Copy.addAlignmentAttr(Attrs.getParamAlignment(I));
return Copy;
}
@@ -3096,7 +3218,7 @@ void Verifier::verifyMustTailCall(CallInst &CI) {
"cannot guarantee tail call due to mismatched calling conv", &CI);
// - All ABI-impacting function attributes, such as sret, byval, inreg,
- // returned, and inalloca, must match.
+ // returned, preallocated, and inalloca, must match.
AttributeList CallerAttrs = F->getAttributes();
AttributeList CalleeAttrs = CI.getAttributes();
for (int I = 0, E = CallerTy->getNumParams(); I != E; ++I) {
@@ -3154,7 +3276,7 @@ void Verifier::visitInvokeInst(InvokeInst &II) {
/// visitUnaryOperator - Check the argument to the unary operator.
///
void Verifier::visitUnaryOperator(UnaryOperator &U) {
- Assert(U.getType() == U.getOperand(0)->getType(),
+ Assert(U.getType() == U.getOperand(0)->getType(),
"Unary operators must have same type for"
"operands and result!",
&U);
@@ -3286,7 +3408,7 @@ void Verifier::visitInsertElementInst(InsertElementInst &IE) {
void Verifier::visitShuffleVectorInst(ShuffleVectorInst &SV) {
Assert(ShuffleVectorInst::isValidOperands(SV.getOperand(0), SV.getOperand(1),
- SV.getOperand(2)),
+ SV.getShuffleMask()),
"Invalid shufflevector operands!", &SV);
visitInstruction(SV);
}
@@ -3310,16 +3432,18 @@ void Verifier::visitGetElementPtrInst(GetElementPtrInst &GEP) {
GEP.getResultElementType() == ElTy,
"GEP is not of right type for indices!", &GEP, ElTy);
- if (GEP.getType()->isVectorTy()) {
+ if (auto *GEPVTy = dyn_cast<VectorType>(GEP.getType())) {
// Additional checks for vector GEPs.
- unsigned GEPWidth = GEP.getType()->getVectorNumElements();
+ ElementCount GEPWidth = GEPVTy->getElementCount();
if (GEP.getPointerOperandType()->isVectorTy())
- Assert(GEPWidth == GEP.getPointerOperandType()->getVectorNumElements(),
- "Vector GEP result width doesn't match operand's", &GEP);
+ Assert(
+ GEPWidth ==
+ cast<VectorType>(GEP.getPointerOperandType())->getElementCount(),
+ "Vector GEP result width doesn't match operand's", &GEP);
for (Value *Idx : Idxs) {
Type *IndexTy = Idx->getType();
- if (IndexTy->isVectorTy()) {
- unsigned IndexWidth = IndexTy->getVectorNumElements();
+ if (auto *IndexVTy = dyn_cast<VectorType>(IndexTy)) {
+ ElementCount IndexWidth = IndexVTy->getElementCount();
Assert(IndexWidth == GEPWidth, "Invalid GEP index vector width", &GEP);
}
Assert(IndexTy->isIntOrIntVectorTy(),
@@ -4050,23 +4174,28 @@ void Verifier::visitProfMetadata(Instruction &I, MDNode *MD) {
// Check consistency of !prof branch_weights metadata.
if (ProfName.equals("branch_weights")) {
- unsigned ExpectedNumOperands = 0;
- if (BranchInst *BI = dyn_cast<BranchInst>(&I))
- ExpectedNumOperands = BI->getNumSuccessors();
- else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I))
- ExpectedNumOperands = SI->getNumSuccessors();
- else if (isa<CallInst>(&I) || isa<InvokeInst>(&I))
- ExpectedNumOperands = 1;
- else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(&I))
- ExpectedNumOperands = IBI->getNumDestinations();
- else if (isa<SelectInst>(&I))
- ExpectedNumOperands = 2;
- else
- CheckFailed("!prof branch_weights are not allowed for this instruction",
- MD);
+ if (isa<InvokeInst>(&I)) {
+ Assert(MD->getNumOperands() == 2 || MD->getNumOperands() == 3,
+ "Wrong number of InvokeInst branch_weights operands", MD);
+ } else {
+ unsigned ExpectedNumOperands = 0;
+ if (BranchInst *BI = dyn_cast<BranchInst>(&I))
+ ExpectedNumOperands = BI->getNumSuccessors();
+ else if (SwitchInst *SI = dyn_cast<SwitchInst>(&I))
+ ExpectedNumOperands = SI->getNumSuccessors();
+ else if (isa<CallInst>(&I))
+ ExpectedNumOperands = 1;
+ else if (IndirectBrInst *IBI = dyn_cast<IndirectBrInst>(&I))
+ ExpectedNumOperands = IBI->getNumDestinations();
+ else if (isa<SelectInst>(&I))
+ ExpectedNumOperands = 2;
+ else
+ CheckFailed("!prof branch_weights are not allowed for this instruction",
+ MD);
- Assert(MD->getNumOperands() == 1 + ExpectedNumOperands,
- "Wrong number of operands", MD);
+ Assert(MD->getNumOperands() == 1 + ExpectedNumOperands,
+ "Wrong number of operands", MD);
+ }
for (unsigned i = 1; i < MD->getNumOperands(); ++i) {
auto &MDO = MD->getOperand(i);
Assert(MDO, "second operand should not be null", MD);
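
For reference, the branch_weights metadata validated here is normally produced with MDBuilder; a hedged sketch follows (the helper name is invented for illustration):

// Illustrative only: a conditional branch carries 1 + <number of successors>
// operands ("branch_weights" plus one weight per successor), which is what
// the operand count check above enforces.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

void annotateBranchWeights(llvm::BranchInst &BI, uint32_t Taken,
                           uint32_t NotTaken) {
  llvm::MDBuilder MDB(BI.getContext());
  BI.setMetadata(llvm::LLVMContext::MD_prof,
                 MDB.createBranchWeights(Taken, NotTaken));
}
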
@@ -4238,7 +4367,7 @@ void Verifier::visitInstruction(Instruction &I) {
if (MDNode *N = I.getDebugLoc().getAsMDNode()) {
AssertDI(isa<DILocation>(N), "invalid !dbg metadata attachment", &I, N);
- visitMDNode(*N);
+ visitMDNode(*N, AreDebugLocsAllowed::Yes);
}
if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&I)) {
@@ -4246,6 +4375,17 @@ void Verifier::visitInstruction(Instruction &I) {
verifyNotEntryValue(*DII);
}
+ SmallVector<std::pair<unsigned, MDNode *>, 4> MDs;
+ I.getAllMetadata(MDs);
+ for (auto Attachment : MDs) {
+ unsigned Kind = Attachment.first;
+ auto AllowLocs =
+ (Kind == LLVMContext::MD_dbg || Kind == LLVMContext::MD_loop)
+ ? AreDebugLocsAllowed::Yes
+ : AreDebugLocsAllowed::No;
+ visitMDNode(*Attachment.second, AllowLocs);
+ }
+
InstsInThisBlock.insert(&I);
}
@@ -4304,6 +4444,41 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
switch (ID) {
default:
break;
+ case Intrinsic::assume: {
+ for (auto &Elem : Call.bundle_op_infos()) {
+ Assert(Elem.Tag->getKey() == "ignore" ||
+ Attribute::isExistingAttribute(Elem.Tag->getKey()),
+ "tags must be valid attribute names");
+ Attribute::AttrKind Kind =
+ Attribute::getAttrKindFromName(Elem.Tag->getKey());
+ unsigned ArgCount = Elem.End - Elem.Begin;
+ if (Kind == Attribute::Alignment) {
+ Assert(ArgCount <= 3 && ArgCount >= 2,
+ "alignment assumptions should have 2 or 3 arguments");
+ Assert(Call.getOperand(Elem.Begin)->getType()->isPointerTy(),
+ "first argument should be a pointer");
+ Assert(Call.getOperand(Elem.Begin + 1)->getType()->isIntegerTy(),
+ "second argument should be an integer");
+ if (ArgCount == 3)
+ Assert(Call.getOperand(Elem.Begin + 2)->getType()->isIntegerTy(),
+ "third argument should be an integer if present");
+ return;
+ }
+ Assert(ArgCount <= 2, "too many arguments");
+ if (Kind == Attribute::None)
+ break;
+ if (Attribute::doesAttrKindHaveArgument(Kind)) {
+ Assert(ArgCount == 2, "this attribute should have 2 arguments");
+ Assert(isa<ConstantInt>(Call.getOperand(Elem.Begin + 1)),
+ "the second argument should be a constant integral value");
+ } else if (isFuncOnlyAttr(Kind)) {
+ Assert(ArgCount == 0, "this attribute has no argument");
+ } else if (!isFuncOrArgAttr(Kind)) {
+ Assert(ArgCount == 1, "this attribute should have one argument");
+ }
+ }
+ break;
+ }
case Intrinsic::coro_id: {
auto *InfoArg = Call.getArgOperand(3)->stripPointerCasts();
if (isa<ConstantPointerNull>(InfoArg))
@@ -4318,7 +4493,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"an array");
break;
}
-#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC, DAGN) \
+#define INSTRUCTION(NAME, NARGS, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC:
#include "llvm/IR/ConstrainedOps.def"
visitConstrainedFPIntrinsic(cast<ConstrainedFPIntrinsic>(Call));
@@ -4338,6 +4513,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
visitDbgLabelIntrinsic("label", cast<DbgLabelInst>(Call));
break;
case Intrinsic::memcpy:
+ case Intrinsic::memcpy_inline:
case Intrinsic::memmove:
case Intrinsic::memset: {
const auto *MI = cast<MemIntrinsic>(&Call);
@@ -4368,15 +4544,6 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"must be a power of 2",
Call);
- if (auto *LengthCI = dyn_cast<ConstantInt>(AMI->getLength())) {
- uint64_t Length = LengthCI->getZExtValue();
- uint64_t ElementSize = AMI->getElementSizeInBytes();
- Assert((Length % ElementSize) == 0,
- "constant length must be a multiple of the element size in the "
- "element-wise atomic memory intrinsic",
- Call);
- }
-
auto IsValidAlignment = [&](uint64_t Alignment) {
return isPowerOf2_64(Alignment) && ElementSizeVal.ule(Alignment);
};
@@ -4390,6 +4557,85 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
}
break;
}
+ case Intrinsic::call_preallocated_setup: {
+ auto *NumArgs = dyn_cast<ConstantInt>(Call.getArgOperand(0));
+ Assert(NumArgs != nullptr,
+ "llvm.call.preallocated.setup argument must be a constant");
+ bool FoundCall = false;
+ for (User *U : Call.users()) {
+ auto *UseCall = dyn_cast<CallBase>(U);
+ Assert(UseCall != nullptr,
+ "Uses of llvm.call.preallocated.setup must be calls");
+ const Function *Fn = UseCall->getCalledFunction();
+ if (Fn && Fn->getIntrinsicID() == Intrinsic::call_preallocated_arg) {
+ auto *AllocArgIndex = dyn_cast<ConstantInt>(UseCall->getArgOperand(1));
+ Assert(AllocArgIndex != nullptr,
+ "llvm.call.preallocated.alloc arg index must be a constant");
+ auto AllocArgIndexInt = AllocArgIndex->getValue();
+ Assert(AllocArgIndexInt.sge(0) &&
+ AllocArgIndexInt.slt(NumArgs->getValue()),
+ "llvm.call.preallocated.alloc arg index must be between 0 and "
+ "corresponding "
+ "llvm.call.preallocated.setup's argument count");
+ } else if (Fn && Fn->getIntrinsicID() ==
+ Intrinsic::call_preallocated_teardown) {
+ // nothing to do
+ } else {
+ Assert(!FoundCall, "Can have at most one call corresponding to a "
+ "llvm.call.preallocated.setup");
+ FoundCall = true;
+ size_t NumPreallocatedArgs = 0;
+ for (unsigned i = 0; i < UseCall->getNumArgOperands(); i++) {
+ if (UseCall->paramHasAttr(i, Attribute::Preallocated)) {
+ ++NumPreallocatedArgs;
+ }
+ }
+ Assert(NumPreallocatedArgs != 0,
+ "cannot use preallocated intrinsics on a call without "
+ "preallocated arguments");
+ Assert(NumArgs->equalsInt(NumPreallocatedArgs),
+ "llvm.call.preallocated.setup arg size must be equal to number "
+ "of preallocated arguments "
+ "at call site",
+ Call, *UseCall);
+ // getOperandBundle() cannot be called if more than one of the operand
+ // bundle exists. There is already a check elsewhere for this, so skip
+ // here if we see more than one.
+ if (UseCall->countOperandBundlesOfType(LLVMContext::OB_preallocated) >
+ 1) {
+ return;
+ }
+ auto PreallocatedBundle =
+ UseCall->getOperandBundle(LLVMContext::OB_preallocated);
+ Assert(PreallocatedBundle,
+ "Use of llvm.call.preallocated.setup outside intrinsics "
+ "must be in \"preallocated\" operand bundle");
+ Assert(PreallocatedBundle->Inputs.front().get() == &Call,
+ "preallocated bundle must have token from corresponding "
+ "llvm.call.preallocated.setup");
+ }
+ }
+ break;
+ }
+ case Intrinsic::call_preallocated_arg: {
+ auto *Token = dyn_cast<CallBase>(Call.getArgOperand(0));
+ Assert(Token && Token->getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::call_preallocated_setup,
+ "llvm.call.preallocated.arg token argument must be a "
+ "llvm.call.preallocated.setup");
+ Assert(Call.hasFnAttr(Attribute::Preallocated),
+ "llvm.call.preallocated.arg must be called with a \"preallocated\" "
+ "call site attribute");
+ break;
+ }
+ case Intrinsic::call_preallocated_teardown: {
+ auto *Token = dyn_cast<CallBase>(Call.getArgOperand(0));
+ Assert(Token && Token->getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::call_preallocated_setup,
+ "llvm.call.preallocated.teardown token argument must be a "
+ "llvm.call.preallocated.setup");
+ break;
+ }
case Intrinsic::gcroot:
case Intrinsic::gcwrite:
case Intrinsic::gcread:
@@ -4506,20 +4752,20 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
LandingPad->getParent());
Assert(InvokeBB->getTerminator(), "safepoint block should be well formed",
InvokeBB);
- Assert(isStatepoint(InvokeBB->getTerminator()),
+ Assert(isa<GCStatepointInst>(InvokeBB->getTerminator()),
"gc relocate should be linked to a statepoint", InvokeBB);
} else {
// In all other cases relocate should be tied to the statepoint directly.
// This covers relocates on a normal return path of invoke statepoint and
// relocates of a call statepoint.
auto Token = Call.getArgOperand(0);
- Assert(isa<Instruction>(Token) && isStatepoint(cast<Instruction>(Token)),
+ Assert(isa<GCStatepointInst>(Token),
"gc relocate is incorrectly tied to the statepoint", Call, Token);
}
// Verify rest of the relocate arguments.
const CallBase &StatepointCall =
- *cast<CallBase>(cast<GCRelocateInst>(Call).getStatepoint());
+ *cast<GCRelocateInst>(Call).getStatepoint();
// Both the base and derived must be piped through the safepoint.
Value *Base = Call.getArgOperand(1);
@@ -4530,47 +4776,55 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Assert(isa<ConstantInt>(Derived),
"gc.relocate operand #3 must be integer offset", Call);
- const int BaseIndex = cast<ConstantInt>(Base)->getZExtValue();
- const int DerivedIndex = cast<ConstantInt>(Derived)->getZExtValue();
+ const uint64_t BaseIndex = cast<ConstantInt>(Base)->getZExtValue();
+ const uint64_t DerivedIndex = cast<ConstantInt>(Derived)->getZExtValue();
+
// Check the bounds
- Assert(0 <= BaseIndex && BaseIndex < (int)StatepointCall.arg_size(),
- "gc.relocate: statepoint base index out of bounds", Call);
- Assert(0 <= DerivedIndex && DerivedIndex < (int)StatepointCall.arg_size(),
- "gc.relocate: statepoint derived index out of bounds", Call);
-
- // Check that BaseIndex and DerivedIndex fall within the 'gc parameters'
- // section of the statepoint's argument.
- Assert(StatepointCall.arg_size() > 0,
- "gc.statepoint: insufficient arguments");
- Assert(isa<ConstantInt>(StatepointCall.getArgOperand(3)),
- "gc.statement: number of call arguments must be constant integer");
- const unsigned NumCallArgs =
+ if (auto Opt = StatepointCall.getOperandBundle(LLVMContext::OB_gc_live)) {
+ Assert(BaseIndex < Opt->Inputs.size(),
+ "gc.relocate: statepoint base index out of bounds", Call);
+ Assert(DerivedIndex < Opt->Inputs.size(),
+ "gc.relocate: statepoint derived index out of bounds", Call);
+ } else {
+ Assert(BaseIndex < StatepointCall.arg_size(),
+ "gc.relocate: statepoint base index out of bounds", Call);
+ Assert(DerivedIndex < StatepointCall.arg_size(),
+ "gc.relocate: statepoint derived index out of bounds", Call);
+
+ // Check that BaseIndex and DerivedIndex fall within the 'gc parameters'
+ // section of the statepoint's argument.
+ Assert(StatepointCall.arg_size() > 0,
+ "gc.statepoint: insufficient arguments");
+ Assert(isa<ConstantInt>(StatepointCall.getArgOperand(3)),
+ "gc.statement: number of call arguments must be constant integer");
+ const uint64_t NumCallArgs =
cast<ConstantInt>(StatepointCall.getArgOperand(3))->getZExtValue();
- Assert(StatepointCall.arg_size() > NumCallArgs + 5,
- "gc.statepoint: mismatch in number of call arguments");
- Assert(isa<ConstantInt>(StatepointCall.getArgOperand(NumCallArgs + 5)),
- "gc.statepoint: number of transition arguments must be "
- "a constant integer");
- const int NumTransitionArgs =
- cast<ConstantInt>(StatepointCall.getArgOperand(NumCallArgs + 5))
- ->getZExtValue();
- const int DeoptArgsStart = 4 + NumCallArgs + 1 + NumTransitionArgs + 1;
- Assert(isa<ConstantInt>(StatepointCall.getArgOperand(DeoptArgsStart)),
- "gc.statepoint: number of deoptimization arguments must be "
- "a constant integer");
- const int NumDeoptArgs =
- cast<ConstantInt>(StatepointCall.getArgOperand(DeoptArgsStart))
- ->getZExtValue();
- const int GCParamArgsStart = DeoptArgsStart + 1 + NumDeoptArgs;
- const int GCParamArgsEnd = StatepointCall.arg_size();
- Assert(GCParamArgsStart <= BaseIndex && BaseIndex < GCParamArgsEnd,
- "gc.relocate: statepoint base index doesn't fall within the "
- "'gc parameters' section of the statepoint call",
- Call);
- Assert(GCParamArgsStart <= DerivedIndex && DerivedIndex < GCParamArgsEnd,
- "gc.relocate: statepoint derived index doesn't fall within the "
- "'gc parameters' section of the statepoint call",
- Call);
+ Assert(StatepointCall.arg_size() > NumCallArgs + 5,
+ "gc.statepoint: mismatch in number of call arguments");
+ Assert(isa<ConstantInt>(StatepointCall.getArgOperand(NumCallArgs + 5)),
+ "gc.statepoint: number of transition arguments must be "
+ "a constant integer");
+ const uint64_t NumTransitionArgs =
+ cast<ConstantInt>(StatepointCall.getArgOperand(NumCallArgs + 5))
+ ->getZExtValue();
+ const uint64_t DeoptArgsStart = 4 + NumCallArgs + 1 + NumTransitionArgs + 1;
+ Assert(isa<ConstantInt>(StatepointCall.getArgOperand(DeoptArgsStart)),
+ "gc.statepoint: number of deoptimization arguments must be "
+ "a constant integer");
+ const uint64_t NumDeoptArgs =
+ cast<ConstantInt>(StatepointCall.getArgOperand(DeoptArgsStart))
+ ->getZExtValue();
+ const uint64_t GCParamArgsStart = DeoptArgsStart + 1 + NumDeoptArgs;
+ const uint64_t GCParamArgsEnd = StatepointCall.arg_size();
+ Assert(GCParamArgsStart <= BaseIndex && BaseIndex < GCParamArgsEnd,
+ "gc.relocate: statepoint base index doesn't fall within the "
+ "'gc parameters' section of the statepoint call",
+ Call);
+ Assert(GCParamArgsStart <= DerivedIndex && DerivedIndex < GCParamArgsEnd,
+ "gc.relocate: statepoint derived index doesn't fall within the "
+ "'gc parameters' section of the statepoint call",
+ Call);
+ }
// Relocated value must be either a pointer type or vector-of-pointer type,
// but gc_relocate does not need to return the same pointer type as the
@@ -4598,6 +4852,14 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"eh.exceptionpointer argument must be a catchpad", Call);
break;
}
+ case Intrinsic::get_active_lane_mask: {
+ Assert(Call.getType()->isVectorTy(), "get_active_lane_mask: must return a "
+ "vector", Call);
+ auto *ElemTy = Call.getType()->getScalarType();
+ Assert(ElemTy->isIntegerTy(1), "get_active_lane_mask: element type is not "
+ "i1", Call);
+ break;
+ }
case Intrinsic::masked_load: {
Assert(Call.getType()->isVectorTy(), "masked_load: must return a vector",
Call);
@@ -4617,8 +4879,8 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"masked_load: return must match pointer type", Call);
Assert(PassThru->getType() == DataTy,
"masked_load: pass through and data type must match", Call);
- Assert(Mask->getType()->getVectorNumElements() ==
- DataTy->getVectorNumElements(),
+ Assert(cast<VectorType>(Mask->getType())->getElementCount() ==
+ cast<VectorType>(DataTy)->getElementCount(),
"masked_load: vector mask must be same length as data", Call);
break;
}
@@ -4636,12 +4898,27 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
Type *DataTy = cast<PointerType>(Ptr->getType())->getElementType();
Assert(DataTy == Val->getType(),
"masked_store: storee must match pointer type", Call);
- Assert(Mask->getType()->getVectorNumElements() ==
- DataTy->getVectorNumElements(),
+ Assert(cast<VectorType>(Mask->getType())->getElementCount() ==
+ cast<VectorType>(DataTy)->getElementCount(),
"masked_store: vector mask must be same length as data", Call);
break;
}
+ case Intrinsic::masked_gather: {
+ const APInt &Alignment =
+ cast<ConstantInt>(Call.getArgOperand(1))->getValue();
+ Assert(Alignment.isNullValue() || Alignment.isPowerOf2(),
+ "masked_gather: alignment must be 0 or a power of 2", Call);
+ break;
+ }
+ case Intrinsic::masked_scatter: {
+ const APInt &Alignment =
+ cast<ConstantInt>(Call.getArgOperand(2))->getValue();
+ Assert(Alignment.isNullValue() || Alignment.isPowerOf2(),
+ "masked_scatter: alignment must be 0 or a power of 2", Call);
+ break;
+ }
+
case Intrinsic::experimental_guard: {
Assert(isa<CallInst>(Call), "experimental_guard cannot be invoked", Call);
Assert(Call.countOperandBundlesOfType(LLVMContext::OB_deopt) == 1,
@@ -4691,7 +4968,9 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
case Intrinsic::umul_fix:
case Intrinsic::umul_fix_sat:
case Intrinsic::sdiv_fix:
- case Intrinsic::udiv_fix: {
+ case Intrinsic::sdiv_fix_sat:
+ case Intrinsic::udiv_fix:
+ case Intrinsic::udiv_fix_sat: {
Value *Op1 = Call.getArgOperand(0);
Value *Op2 = Call.getArgOperand(1);
Assert(Op1->getType()->isIntOrIntVectorTy(),
@@ -4706,7 +4985,7 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"third argument of [us][mul|div]_fix[_sat] must fit within 32 bits");
if (ID == Intrinsic::smul_fix || ID == Intrinsic::smul_fix_sat ||
- ID == Intrinsic::sdiv_fix) {
+ ID == Intrinsic::sdiv_fix || ID == Intrinsic::sdiv_fix_sat) {
Assert(
Op3->getZExtValue() < Op1->getType()->getScalarSizeInBits(),
"the scale of s[mul|div]_fix[_sat] must be less than the width of "
@@ -4728,6 +5007,85 @@ void Verifier::visitIntrinsicCall(Intrinsic::ID ID, CallBase &Call) {
"Intrinsic does not support vectors", &Call);
break;
}
+ case Intrinsic::bswap: {
+ Type *Ty = Call.getType();
+ unsigned Size = Ty->getScalarSizeInBits();
+ Assert(Size % 16 == 0, "bswap must be an even number of bytes", &Call);
+ break;
+ }
+ case Intrinsic::matrix_multiply:
+ case Intrinsic::matrix_transpose:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store: {
+ Function *IF = Call.getCalledFunction();
+ ConstantInt *Stride = nullptr;
+ ConstantInt *NumRows;
+ ConstantInt *NumColumns;
+ VectorType *ResultTy;
+ Type *Op0ElemTy = nullptr;
+ Type *Op1ElemTy = nullptr;
+ switch (ID) {
+ case Intrinsic::matrix_multiply:
+ NumRows = cast<ConstantInt>(Call.getArgOperand(2));
+ NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
+ ResultTy = cast<VectorType>(Call.getType());
+ Op0ElemTy =
+ cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType();
+ Op1ElemTy =
+ cast<VectorType>(Call.getArgOperand(1)->getType())->getElementType();
+ break;
+ case Intrinsic::matrix_transpose:
+ NumRows = cast<ConstantInt>(Call.getArgOperand(1));
+ NumColumns = cast<ConstantInt>(Call.getArgOperand(2));
+ ResultTy = cast<VectorType>(Call.getType());
+ Op0ElemTy =
+ cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType();
+ break;
+ case Intrinsic::matrix_column_major_load:
+ Stride = dyn_cast<ConstantInt>(Call.getArgOperand(1));
+ NumRows = cast<ConstantInt>(Call.getArgOperand(3));
+ NumColumns = cast<ConstantInt>(Call.getArgOperand(4));
+ ResultTy = cast<VectorType>(Call.getType());
+ Op0ElemTy =
+ cast<PointerType>(Call.getArgOperand(0)->getType())->getElementType();
+ break;
+ case Intrinsic::matrix_column_major_store:
+ Stride = dyn_cast<ConstantInt>(Call.getArgOperand(2));
+ NumRows = cast<ConstantInt>(Call.getArgOperand(4));
+ NumColumns = cast<ConstantInt>(Call.getArgOperand(5));
+ ResultTy = cast<VectorType>(Call.getArgOperand(0)->getType());
+ Op0ElemTy =
+ cast<VectorType>(Call.getArgOperand(0)->getType())->getElementType();
+ Op1ElemTy =
+ cast<PointerType>(Call.getArgOperand(1)->getType())->getElementType();
+ break;
+ default:
+ llvm_unreachable("unexpected intrinsic");
+ }
+
+ Assert(ResultTy->getElementType()->isIntegerTy() ||
+ ResultTy->getElementType()->isFloatingPointTy(),
+ "Result type must be an integer or floating-point type!", IF);
+
+ Assert(ResultTy->getElementType() == Op0ElemTy,
+ "Vector element type mismatch of the result and first operand "
+ "vector!", IF);
+
+ if (Op1ElemTy)
+ Assert(ResultTy->getElementType() == Op1ElemTy,
+ "Vector element type mismatch of the result and second operand "
+ "vector!", IF);
+
+ Assert(ResultTy->getNumElements() ==
+ NumRows->getZExtValue() * NumColumns->getZExtValue(),
+ "Result of a matrix operation does not fit in the returned vector!");
+
+ if (Stride)
+ Assert(Stride->getZExtValue() >= NumRows->getZExtValue(),
+ "Stride must be greater or equal than the number of rows!", IF);
+
+ break;
+ }
};
}
@@ -4754,7 +5112,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
unsigned NumOperands;
bool HasRoundingMD;
switch (FPI.getIntrinsicID()) {
-#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC, DAGN) \
+#define INSTRUCTION(NAME, NARG, ROUND_MODE, INTRINSIC) \
case Intrinsic::INTRINSIC: \
NumOperands = NARG; \
HasRoundingMD = ROUND_MODE; \
@@ -4777,7 +5135,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
Type *ResultTy = FPI.getType();
Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(),
"Intrinsic does not support vectors", &FPI);
- }
+ }
break;
case Intrinsic::experimental_constrained_lround:
@@ -4787,7 +5145,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
Assert(!ValTy->isVectorTy() && !ResultTy->isVectorTy(),
"Intrinsic does not support vectors", &FPI);
break;
- }
+ }
case Intrinsic::experimental_constrained_fcmp:
case Intrinsic::experimental_constrained_fcmps: {
@@ -4798,7 +5156,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
}
case Intrinsic::experimental_constrained_fptosi:
- case Intrinsic::experimental_constrained_fptoui: {
+ case Intrinsic::experimental_constrained_fptoui: {
Value *Operand = FPI.getArgOperand(0);
uint64_t NumSrcElem = 0;
Assert(Operand->getType()->isFPOrFPVectorTy(),
@@ -4870,7 +5228,7 @@ void Verifier::visitConstrainedFPIntrinsic(ConstrainedFPIntrinsic &FPI) {
"Intrinsic first argument's type must be smaller than result type",
&FPI);
}
- }
+ }
break;
default:
@@ -5136,7 +5494,7 @@ struct VerifierLegacyPass : public FunctionPass {
bool runOnFunction(Function &F) override {
if (!V->verify(F) && FatalErrors) {
- errs() << "in function " << F.getName() << '\n';
+ errs() << "in function " << F.getName() << '\n';
report_fatal_error("Broken function found, compilation aborted!");
}
return false;
diff --git a/llvm/lib/IRReader/IRReader.cpp b/llvm/lib/IRReader/IRReader.cpp
index 7ca6c2fca52a..cb33e40be61b 100644
--- a/llvm/lib/IRReader/IRReader.cpp
+++ b/llvm/lib/IRReader/IRReader.cpp
@@ -67,15 +67,14 @@ std::unique_ptr<Module> llvm::getLazyIRFileModule(StringRef Filename,
std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
LLVMContext &Context,
- bool UpgradeDebugInfo,
- StringRef DataLayoutString) {
+ DataLayoutCallbackTy DataLayoutCallback) {
NamedRegionTimer T(TimeIRParsingName, TimeIRParsingDescription,
TimeIRParsingGroupName, TimeIRParsingGroupDescription,
TimePassesIsEnabled);
if (isBitcode((const unsigned char *)Buffer.getBufferStart(),
(const unsigned char *)Buffer.getBufferEnd())) {
Expected<std::unique_ptr<Module>> ModuleOrErr =
- parseBitcodeFile(Buffer, Context);
+ parseBitcodeFile(Buffer, Context, DataLayoutCallback);
if (Error E = ModuleOrErr.takeError()) {
handleAllErrors(std::move(E), [&](ErrorInfoBase &EIB) {
Err = SMDiagnostic(Buffer.getBufferIdentifier(), SourceMgr::DK_Error,
@@ -83,19 +82,15 @@ std::unique_ptr<Module> llvm::parseIR(MemoryBufferRef Buffer, SMDiagnostic &Err,
});
return nullptr;
}
- if (!DataLayoutString.empty())
- ModuleOrErr.get()->setDataLayout(DataLayoutString);
return std::move(ModuleOrErr.get());
}
- return parseAssembly(Buffer, Err, Context, nullptr, UpgradeDebugInfo,
- DataLayoutString);
+ return parseAssembly(Buffer, Err, Context, nullptr, DataLayoutCallback);
}
-std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
- LLVMContext &Context,
- bool UpgradeDebugInfo,
- StringRef DataLayoutString) {
+std::unique_ptr<Module>
+llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err, LLVMContext &Context,
+ DataLayoutCallbackTy DataLayoutCallback) {
ErrorOr<std::unique_ptr<MemoryBuffer>> FileOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
if (std::error_code EC = FileOrErr.getError()) {
@@ -105,7 +100,7 @@ std::unique_ptr<Module> llvm::parseIRFile(StringRef Filename, SMDiagnostic &Err,
}
return parseIR(FileOrErr.get()->getMemBufferRef(), Err, Context,
- UpgradeDebugInfo, DataLayoutString);
+ DataLayoutCallback);
}
//===----------------------------------------------------------------------===//
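
A hedged usage sketch for the new parseIRFile signature is shown below. It assumes the LLVM 11-era DataLayoutCallbackTy (a function from the module's target triple to Optional<std::string>); the loader function itself is hypothetical:

// Illustrative only: keep whatever data layout the IR file specifies by
// returning None from the callback; return a layout string to override it.
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/Support/SourceMgr.h"
#include <memory>
#include <string>

std::unique_ptr<llvm::Module> loadModule(llvm::StringRef Path,
                                         llvm::LLVMContext &Ctx,
                                         llvm::SMDiagnostic &Err) {
  auto DataLayoutCallback =
      [](llvm::StringRef /*TargetTriple*/) -> llvm::Optional<std::string> {
    return llvm::None;
  };
  return llvm::parseIRFile(Path, Err, Ctx, DataLayoutCallback);
}
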
diff --git a/llvm/lib/LTO/Caching.cpp b/llvm/lib/LTO/Caching.cpp
index 12dcd182de2d..46cac3fb1830 100644
--- a/llvm/lib/LTO/Caching.cpp
+++ b/llvm/lib/LTO/Caching.cpp
@@ -144,7 +144,7 @@ Expected<NativeObjectCache> lto::localCache(StringRef CacheDirectoryPath,
// This CacheStream will move the temporary file into the cache when done.
return std::make_unique<CacheStream>(
std::make_unique<raw_fd_ostream>(Temp->FD, /* ShouldClose */ false),
- AddBuffer, std::move(*Temp), EntryPath.str(), Task);
+ AddBuffer, std::move(*Temp), std::string(EntryPath.str()), Task);
};
};
}
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 297b11de17a9..6e1e3998e490 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -12,6 +12,8 @@
#include "llvm/LTO/LTO.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
@@ -21,10 +23,10 @@
#include "llvm/IR/AutoUpgrade.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Metadata.h"
-#include "llvm/IR/RemarkStreamer.h"
#include "llvm/LTO/LTOBackend.h"
#include "llvm/LTO/SummaryBasedOptimizations.h"
#include "llvm/Linker/IRMover.h"
@@ -39,6 +41,7 @@
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
+#include "llvm/Support/TimeProfiler.h"
#include "llvm/Support/VCSRevision.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -96,22 +99,12 @@ void llvm::computeLTOCacheKey(
};
auto AddUnsigned = [&](unsigned I) {
uint8_t Data[4];
- Data[0] = I;
- Data[1] = I >> 8;
- Data[2] = I >> 16;
- Data[3] = I >> 24;
+ support::endian::write32le(Data, I);
Hasher.update(ArrayRef<uint8_t>{Data, 4});
};
auto AddUint64 = [&](uint64_t I) {
uint8_t Data[8];
- Data[0] = I;
- Data[1] = I >> 8;
- Data[2] = I >> 16;
- Data[3] = I >> 24;
- Data[4] = I >> 32;
- Data[5] = I >> 40;
- Data[6] = I >> 48;
- Data[7] = I >> 56;
+ support::endian::write64le(Data, I);
Hasher.update(ArrayRef<uint8_t>{Data, 8});
};
AddString(Conf.CPU);
@@ -147,8 +140,17 @@ void llvm::computeLTOCacheKey(
// Include the hash for the current module
auto ModHash = Index.getModuleHash(ModuleID);
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
+
+ std::vector<uint64_t> ExportsGUID;
+ ExportsGUID.reserve(ExportList.size());
for (const auto &VI : ExportList) {
auto GUID = VI.getGUID();
+ ExportsGUID.push_back(GUID);
+ }
+
+ // Sort the export list elements GUIDs.
+ llvm::sort(ExportsGUID);
+ for (uint64_t GUID : ExportsGUID) {
// The export list can impact the internalization, be conservative here
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&GUID, sizeof(GUID)));
}
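
The pattern adopted here — collect, sort, then hash — makes the cache key independent of the export list's iteration order. A minimal standalone sketch of the same idea (function name hypothetical):

// Illustrative only: hash values in a deterministic order regardless of the
// iteration order of the container they came from.
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Support/SHA1.h"
#include <cstdint>
#include <vector>

void hashGUIDsDeterministically(llvm::SHA1 &Hasher,
                                std::vector<uint64_t> GUIDs) {
  llvm::sort(GUIDs); // sorted order is stable across runs and containers
  for (uint64_t GUID : GUIDs)
    Hasher.update(llvm::ArrayRef<uint8_t>(
        reinterpret_cast<const uint8_t *>(&GUID), sizeof(GUID)));
}
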
@@ -156,12 +158,23 @@ void llvm::computeLTOCacheKey(
// Include the hash for every module we import functions from. The set of
// imported symbols for each module may affect code generation and is
// sensitive to link order, so include that as well.
- for (auto &Entry : ImportList) {
- auto ModHash = Index.getModuleHash(Entry.first());
+ using ImportMapIteratorTy = FunctionImporter::ImportMapTy::const_iterator;
+ std::vector<ImportMapIteratorTy> ImportModulesVector;
+ ImportModulesVector.reserve(ImportList.size());
+
+ for (ImportMapIteratorTy It = ImportList.begin(); It != ImportList.end();
+ ++It) {
+ ImportModulesVector.push_back(It);
+ }
+ llvm::sort(ImportModulesVector,
+ [](const ImportMapIteratorTy &Lhs, const ImportMapIteratorTy &Rhs)
+ -> bool { return Lhs->getKey() < Rhs->getKey(); });
+ for (const ImportMapIteratorTy &EntryIt : ImportModulesVector) {
+ auto ModHash = Index.getModuleHash(EntryIt->first());
Hasher.update(ArrayRef<uint8_t>((uint8_t *)&ModHash[0], sizeof(ModHash)));
- AddUint64(Entry.second.size());
- for (auto &Fn : Entry.second)
+ AddUint64(EntryIt->second.size());
+ for (auto &Fn : EntryIt->second)
AddUint64(Fn);
}
@@ -513,7 +526,7 @@ void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
assert(!GlobalRes.Prevailing &&
"Multiple prevailing defs are not allowed");
GlobalRes.Prevailing = true;
- GlobalRes.IRName = Sym.getIRName();
+ GlobalRes.IRName = std::string(Sym.getIRName());
} else if (!GlobalRes.Prevailing && GlobalRes.IRName.empty()) {
// Sometimes it can be two copies of symbol in a module and prevailing
// symbol can have no IR name. That might happen if symbol is defined in
@@ -521,7 +534,7 @@ void LTO::addModuleToGlobalRes(ArrayRef<InputFile::Symbol> Syms,
// the same symbol we want to use IR name of the prevailing symbol.
// Otherwise, if we haven't seen a prevailing symbol, set the name so that
// we can later use it to check if there is any prevailing copy in IR.
- GlobalRes.IRName = Sym.getIRName();
+ GlobalRes.IRName = std::string(Sym.getIRName());
}
// Set the partition to external if we know it is re-defined by the linker
@@ -611,6 +624,7 @@ Error LTO::addModule(InputFile &Input, unsigned ModI,
if (LTOInfo->IsThinLTO)
return addThinLTO(BM, ModSyms, ResI, ResE);
+ RegularLTO.EmptyCombinedModule = false;
Expected<RegularLTOState::AddedModule> ModOrErr =
addRegularLTO(BM, ModSyms, ResI, ResE);
if (!ModOrErr)
@@ -762,10 +776,11 @@ LTO::addRegularLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
if (Sym.isCommon()) {
// FIXME: We should figure out what to do about commons defined by asm.
// For now they aren't reported correctly by ModuleSymbolTable.
- auto &CommonRes = RegularLTO.Commons[Sym.getIRName()];
+ auto &CommonRes = RegularLTO.Commons[std::string(Sym.getIRName())];
CommonRes.Size = std::max(CommonRes.Size, Sym.getCommonSize());
- CommonRes.Align =
- std::max(CommonRes.Align, MaybeAlign(Sym.getCommonAlignment()));
+ MaybeAlign SymAlign(Sym.getCommonAlignment());
+ if (SymAlign)
+ CommonRes.Align = max(*SymAlign, CommonRes.Align);
CommonRes.Prevailing |= Res.Prevailing;
}
@@ -781,8 +796,15 @@ Error LTO::linkRegularLTO(RegularLTOState::AddedModule Mod,
bool LivenessFromIndex) {
std::vector<GlobalValue *> Keep;
for (GlobalValue *GV : Mod.Keep) {
- if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID()))
+ if (LivenessFromIndex && !ThinLTO.CombinedIndex.isGUIDLive(GV->getGUID())) {
+ if (Function *F = dyn_cast<Function>(GV)) {
+ OptimizationRemarkEmitter ORE(F);
+ ORE.emit(OptimizationRemark(DEBUG_TYPE, "deadfunction", F)
+ << ore::NV("Function", F)
+ << " not added to the combined module ");
+ }
continue;
+ }
if (!GV->hasAvailableExternallyLinkage()) {
Keep.push_back(GV);
@@ -849,12 +871,28 @@ Error LTO::addThinLTO(BitcodeModule BM, ArrayRef<InputFile::Symbol> Syms,
"Expected at most one ThinLTO module per bitcode file",
inconvertibleErrorCode());
+ if (!Conf.ThinLTOModulesToCompile.empty()) {
+ if (!ThinLTO.ModulesToCompile)
+ ThinLTO.ModulesToCompile = ModuleMapType();
+ // This is fuzzy name matching: only modules whose names contain one of the
+ // specified switch values will be compiled.
+ for (const std::string &Name : Conf.ThinLTOModulesToCompile) {
+ if (BM.getModuleIdentifier().contains(Name)) {
+ ThinLTO.ModulesToCompile->insert({BM.getModuleIdentifier(), BM});
+ llvm::errs() << "[ThinLTO] Selecting " << BM.getModuleIdentifier()
+ << " to compile\n";
+ }
+ }
+ }
+
return Error::success();
}
unsigned LTO::getMaxTasks() const {
CalledGetMaxTasks = true;
- return RegularLTO.ParallelCodeGenParallelismLevel + ThinLTO.ModuleMap.size();
+ auto ModuleCount = ThinLTO.ModulesToCompile ? ThinLTO.ModulesToCompile->size()
+ : ThinLTO.ModuleMap.size();
+ return RegularLTO.ParallelCodeGenParallelismLevel + ModuleCount;
}
// If only some of the modules were split, we cannot correctly handle
@@ -931,17 +969,6 @@ Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
return StatsFileOrErr.takeError();
std::unique_ptr<ToolOutputFile> StatsFile = std::move(StatsFileOrErr.get());
- // Finalize linking of regular LTO modules containing summaries now that
- // we have computed liveness information.
- for (auto &M : RegularLTO.ModsWithSummaries)
- if (Error Err = linkRegularLTO(std::move(M),
- /*LivenessFromIndex=*/true))
- return Err;
-
- // Ensure we don't have inconsistently split LTO units with type tests.
- if (Error Err = checkPartiallySplit())
- return Err;
-
Error Result = runRegularLTO(AddStream);
if (!Result)
Result = runThinLTO(AddStream, Cache, GUIDPreservedSymbols);
@@ -953,6 +980,27 @@ Error LTO::run(AddStreamFn AddStream, NativeObjectCache Cache) {
}
Error LTO::runRegularLTO(AddStreamFn AddStream) {
+ // Setup optimization remarks.
+ auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks(
+ RegularLTO.CombinedModule->getContext(), Conf.RemarksFilename,
+ Conf.RemarksPasses, Conf.RemarksFormat, Conf.RemarksWithHotness);
+ if (!DiagFileOrErr)
+ return DiagFileOrErr.takeError();
+
+ // Finalize linking of regular LTO modules containing summaries now that
+ // we have computed liveness information.
+ for (auto &M : RegularLTO.ModsWithSummaries)
+ if (Error Err = linkRegularLTO(std::move(M),
+ /*LivenessFromIndex=*/true))
+ return Err;
+
+ // Ensure we don't have inconsistently split LTO units with type tests.
+ // FIXME: this checks both LTO and ThinLTO. It happens to work because we
+ // take this path in both cases, but eventually this should be split in two
+ // and the ThinLTO checks moved into `runThinLTO`.
+ if (Error Err = checkPartiallySplit())
+ return Err;
+
// Make sure commons have the right size/alignment: we kept the largest from
// all the prevailing when adding the inputs, and we apply it here.
const DataLayout &DL = RegularLTO.CombinedModule->getDataLayout();
@@ -982,6 +1030,11 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
}
}
+ // If allowed, upgrade public vcall visibility metadata to linkage unit
+ // visibility before whole program devirtualization in the optimizer.
+ updateVCallVisibilityInModule(*RegularLTO.CombinedModule,
+ Conf.HasWholeProgramVisibility);
+
if (Conf.PreOptModuleHook &&
!Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule))
return Error::success();
@@ -1012,8 +1065,15 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
!Conf.PostInternalizeModuleHook(0, *RegularLTO.CombinedModule))
return Error::success();
}
- return backend(Conf, AddStream, RegularLTO.ParallelCodeGenParallelismLevel,
- std::move(RegularLTO.CombinedModule), ThinLTO.CombinedIndex);
+
+ if (!RegularLTO.EmptyCombinedModule || Conf.AlwaysEmitRegularLTOObj) {
+ if (Error Err = backend(
+ Conf, AddStream, RegularLTO.ParallelCodeGenParallelismLevel,
+ std::move(RegularLTO.CombinedModule), ThinLTO.CombinedIndex))
+ return Err;
+ }
+
+ return finalizeOptimizationRemarks(std::move(*DiagFileOrErr));
}
static const char *libcallRoutineNames[] = {
@@ -1063,12 +1123,12 @@ class InProcessThinBackend : public ThinBackendProc {
public:
InProcessThinBackend(
const Config &Conf, ModuleSummaryIndex &CombinedIndex,
- unsigned ThinLTOParallelismLevel,
+ ThreadPoolStrategy ThinLTOParallelism,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache)
: ThinBackendProc(Conf, CombinedIndex, ModuleToDefinedGVSummaries),
- BackendThreadPool(ThinLTOParallelismLevel),
- AddStream(std::move(AddStream)), Cache(std::move(Cache)) {
+ BackendThreadPool(ThinLTOParallelism), AddStream(std::move(AddStream)),
+ Cache(std::move(Cache)) {
for (auto &Name : CombinedIndex.cfiFunctionDefs())
CfiFunctionDefs.insert(
GlobalValue::getGUID(GlobalValue::dropLLVMManglingEscape(Name)));
@@ -1133,6 +1193,9 @@ public:
&ResolvedODR,
const GVSummaryMapTy &DefinedGlobals,
MapVector<StringRef, BitcodeModule> &ModuleMap) {
+ if (LLVM_ENABLE_THREADS && Conf.TimeTraceEnabled)
+ timeTraceProfilerInitialize(Conf.TimeTraceGranularity,
+ "thin backend");
Error E = runThinLTOBackendThread(
AddStream, Cache, Task, BM, CombinedIndex, ImportList, ExportList,
ResolvedODR, DefinedGlobals, ModuleMap);
@@ -1143,6 +1206,8 @@ public:
else
Err = std::move(E);
}
+ if (LLVM_ENABLE_THREADS && Conf.TimeTraceEnabled)
+ timeTraceProfilerFinishThread();
},
BM, std::ref(CombinedIndex), std::ref(ImportList), std::ref(ExportList),
std::ref(ResolvedODR), std::ref(DefinedGlobals), std::ref(ModuleMap));
@@ -1159,13 +1224,13 @@ public:
};
} // end anonymous namespace
-ThinBackend lto::createInProcessThinBackend(unsigned ParallelismLevel) {
+ThinBackend lto::createInProcessThinBackend(ThreadPoolStrategy Parallelism) {
return [=](const Config &Conf, ModuleSummaryIndex &CombinedIndex,
const StringMap<GVSummaryMapTy> &ModuleToDefinedGVSummaries,
AddStreamFn AddStream, NativeObjectCache Cache) {
return std::make_unique<InProcessThinBackend>(
- Conf, CombinedIndex, ParallelismLevel, ModuleToDefinedGVSummaries,
- AddStream, Cache);
+ Conf, CombinedIndex, Parallelism, ModuleToDefinedGVSummaries, AddStream,
+ Cache);
};
}
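
With this change the in-process backend takes a ThreadPoolStrategy rather than a raw thread count; a hedged caller-side sketch (helper name invented) of the updated construction:

// Illustrative only: heavyweight_hardware_concurrency(N) builds a strategy
// for N threads; passing 0 lets LLVM pick a count based on physical cores.
#include "llvm/LTO/LTO.h"
#include "llvm/Support/Threading.h"

llvm::lto::ThinBackend makeInProcessBackend(unsigned Threads) {
  return llvm::lto::createInProcessThinBackend(
      llvm::heavyweight_hardware_concurrency(Threads));
}
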
@@ -1186,7 +1251,7 @@ std::string lto::getThinLTOOutputFile(const std::string &Path,
llvm::errs() << "warning: could not create directory '" << ParentPath
<< "': " << EC.message() << '\n';
}
- return NewPath.str();
+ return std::string(NewPath.str());
}
namespace {
@@ -1215,7 +1280,7 @@ public:
MapVector<StringRef, BitcodeModule> &ModuleMap) override {
StringRef ModulePath = BM.getModuleIdentifier();
std::string NewModulePath =
- getThinLTOOutputFile(ModulePath, OldPrefix, NewPrefix);
+ getThinLTOOutputFile(std::string(ModulePath), OldPrefix, NewPrefix);
if (LinkedObjectsFile)
*LinkedObjectsFile << NewModulePath << '\n';
@@ -1239,7 +1304,7 @@ public:
}
if (OnWrite)
- OnWrite(ModulePath);
+ OnWrite(std::string(ModulePath));
return Error::success();
}
@@ -1264,6 +1329,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
if (ThinLTO.ModuleMap.empty())
return Error::success();
+ if (ThinLTO.ModulesToCompile && ThinLTO.ModulesToCompile->empty()) {
+ llvm::errs() << "warning: [ThinLTO] No module compiled\n";
+ return Error::success();
+ }
+
if (Conf.CombinedIndexHook &&
!Conf.CombinedIndexHook(ThinLTO.CombinedIndex, GUIDPreservedSymbols))
return Error::success();
@@ -1299,6 +1369,11 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
std::set<GlobalValue::GUID> ExportedGUIDs;
+ // If allowed, upgrade public vcall visibility to linkage unit visibility in
+ // the summaries before whole program devirtualization below.
+ updateVCallVisibilityInIndex(ThinLTO.CombinedIndex,
+ Conf.HasWholeProgramVisibility);
+
// Perform index-based WPD. This will return immediately if there are
// no index entries in the typeIdMetadata map (e.g. if we are instead
// performing IR-based WPD in hybrid regular/thin LTO mode).
@@ -1359,14 +1434,19 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
thinLTOResolvePrevailingInIndex(ThinLTO.CombinedIndex, isPrevailing,
recordNewLinkage, GUIDPreservedSymbols);
+ generateParamAccessSummary(ThinLTO.CombinedIndex);
+
std::unique_ptr<ThinBackendProc> BackendProc =
ThinLTO.Backend(Conf, ThinLTO.CombinedIndex, ModuleToDefinedGVSummaries,
AddStream, Cache);
+ auto &ModuleMap =
+ ThinLTO.ModulesToCompile ? *ThinLTO.ModulesToCompile : ThinLTO.ModuleMap;
+
// Tasks 0 through ParallelCodeGenParallelismLevel-1 are reserved for combined
// module and parallel code generation partitions.
unsigned Task = RegularLTO.ParallelCodeGenParallelismLevel;
- for (auto &Mod : ThinLTO.ModuleMap) {
+ for (auto &Mod : ModuleMap) {
if (Error E = BackendProc->start(Task, Mod.second, ImportLists[Mod.first],
ExportLists[Mod.first],
ResolvedODR[Mod.first], ThinLTO.ModuleMap))
@@ -1377,11 +1457,10 @@ Error LTO::runThinLTO(AddStreamFn AddStream, NativeObjectCache Cache,
return BackendProc->wait();
}
-Expected<std::unique_ptr<ToolOutputFile>>
-lto::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
- StringRef RemarksPasses, StringRef RemarksFormat,
- bool RemarksWithHotness, int Count) {
- std::string Filename = RemarksFilename;
+Expected<std::unique_ptr<ToolOutputFile>> lto::setupLLVMOptimizationRemarks(
+ LLVMContext &Context, StringRef RemarksFilename, StringRef RemarksPasses,
+ StringRef RemarksFormat, bool RemarksWithHotness, int Count) {
+ std::string Filename = std::string(RemarksFilename);
// For ThinLTO, file.opt.<format> becomes
// file.opt.<format>.thin.<num>.<format>.
if (!Filename.empty() && Count != -1)
@@ -1389,7 +1468,7 @@ lto::setupOptimizationRemarks(LLVMContext &Context, StringRef RemarksFilename,
(Twine(Filename) + ".thin." + llvm::utostr(Count) + "." + RemarksFormat)
.str();
- auto ResultOrErr = llvm::setupOptimizationRemarks(
+ auto ResultOrErr = llvm::setupLLVMOptimizationRemarks(
Context, Filename, RemarksPasses, RemarksFormat, RemarksWithHotness);
if (Error E = ResultOrErr.takeError())
return std::move(E);
diff --git a/llvm/lib/LTO/LTOBackend.cpp b/llvm/lib/LTO/LTOBackend.cpp
index dcde7277b820..0c395f9bbf28 100644
--- a/llvm/lib/LTO/LTOBackend.cpp
+++ b/llvm/lib/LTO/LTOBackend.cpp
@@ -16,18 +16,20 @@
#include "llvm/LTO/LTOBackend.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CGSCCPassManager.h"
+#include "llvm/Analysis/ModuleSummaryAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Bitcode/BitcodeReader.h"
#include "llvm/Bitcode/BitcodeWriter.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/PassManager.h"
-#include "llvm/IR/RemarkStreamer.h"
#include "llvm/IR/Verifier.h"
#include "llvm/LTO/LTO.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Object/ModuleSymbolTable.h"
#include "llvm/Passes/PassBuilder.h"
+#include "llvm/Passes/PassPlugin.h"
#include "llvm/Passes/StandardInstrumentations.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
@@ -61,8 +63,10 @@ Error Config::addSaveTemps(std::string OutputFileName,
std::error_code EC;
ResolutionFile = std::make_unique<raw_fd_ostream>(
OutputFileName + "resolution.txt", EC, sys::fs::OpenFlags::OF_Text);
- if (EC)
+ if (EC) {
+ ResolutionFile.reset();
return errorCodeToError(EC);
+ }
auto setHook = [&](std::string PathSuffix, ModuleHookFn &Hook) {
// Keep track of the hook provided by the linker, which also needs to run.
@@ -125,6 +129,29 @@ Error Config::addSaveTemps(std::string OutputFileName,
return Error::success();
}
+#define HANDLE_EXTENSION(Ext) \
+ llvm::PassPluginLibraryInfo get##Ext##PluginInfo();
+#include "llvm/Support/Extension.def"
+
+static void RegisterPassPlugins(ArrayRef<std::string> PassPlugins,
+ PassBuilder &PB) {
+#define HANDLE_EXTENSION(Ext) \
+ get##Ext##PluginInfo().RegisterPassBuilderCallbacks(PB);
+#include "llvm/Support/Extension.def"
+
+ // Load requested pass plugins and let them register pass builder callbacks
+ for (auto &PluginFN : PassPlugins) {
+ auto PassPlugin = PassPlugin::Load(PluginFN);
+ if (!PassPlugin) {
+ errs() << "Failed to load passes from '" << PluginFN
+ << "'. Request ignored.\n";
+ continue;
+ }
+
+ PassPlugin->registerPassBuilderCallbacks(PB);
+ }
+}
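For context, a plugin loaded by RegisterPassPlugins above is a shared library exposing the standard new-pass-manager entry point llvmGetPassPluginInfo. The sketch below assumes that entry point; the plugin name, the pipeline name "no-op-plugin-pass", and NoOpModulePass are made up for illustration.

#include "llvm/Config/llvm-config.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Passes/PassBuilder.h"
#include "llvm/Passes/PassPlugin.h"
#include "llvm/Support/Compiler.h"

namespace {
// A do-nothing module pass, registered under a custom pipeline name.
struct NoOpModulePass : llvm::PassInfoMixin<NoOpModulePass> {
  llvm::PreservedAnalyses run(llvm::Module &, llvm::ModuleAnalysisManager &) {
    return llvm::PreservedAnalyses::all();
  }
};
} // namespace

// Entry point looked up by PassPlugin::Load; the callback it returns is what
// the code above forwards to the PassBuilder via registerPassBuilderCallbacks.
extern "C" LLVM_ATTRIBUTE_WEAK llvm::PassPluginLibraryInfo
llvmGetPassPluginInfo() {
  return {LLVM_PLUGIN_API_VERSION, "NoOpPlugin", LLVM_VERSION_STRING,
          [](llvm::PassBuilder &PB) {
            PB.registerPipelineParsingCallback(
                [](llvm::StringRef Name, llvm::ModulePassManager &MPM,
                   llvm::ArrayRef<llvm::PassBuilder::PipelineElement>) {
                  if (Name == "no-op-plugin-pass") {
                    MPM.addPass(NoOpModulePass());
                    return true;
                  }
                  return false;
                });
          }};
}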
+
namespace {
std::unique_ptr<TargetMachine>
@@ -179,6 +206,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
if (auto Err = PB.parseAAPipeline(AA, "default"))
report_fatal_error("Error parsing default AA pipeline");
+ RegisterPassPlugins(Conf.PassPlugins, PB);
+
LoopAnalysisManager LAM(Conf.DebugPassManager);
FunctionAnalysisManager FAM(Conf.DebugPassManager);
CGSCCAnalysisManager CGAM(Conf.DebugPassManager);
@@ -203,16 +232,16 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
default:
llvm_unreachable("Invalid optimization level");
case 0:
- OL = PassBuilder::O0;
+ OL = PassBuilder::OptimizationLevel::O0;
break;
case 1:
- OL = PassBuilder::O1;
+ OL = PassBuilder::OptimizationLevel::O1;
break;
case 2:
- OL = PassBuilder::O2;
+ OL = PassBuilder::OptimizationLevel::O2;
break;
case 3:
- OL = PassBuilder::O3;
+ OL = PassBuilder::OptimizationLevel::O3;
break;
}
@@ -226,8 +255,8 @@ static void runNewPMPasses(const Config &Conf, Module &Mod, TargetMachine *TM,
// FIXME (davide): verify the output.
}
-static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
- std::string PipelineDesc,
+static void runNewPMCustomPasses(const Config &Conf, Module &Mod,
+ TargetMachine *TM, std::string PipelineDesc,
std::string AAPipelineDesc,
bool DisableVerify) {
PassBuilder PB(TM);
@@ -239,6 +268,8 @@ static void runNewPMCustomPasses(Module &Mod, TargetMachine *TM,
report_fatal_error("unable to parse AA pipeline description '" +
AAPipelineDesc + "': " + toString(std::move(Err)));
+ RegisterPassPlugins(Conf.PassPlugins, PB);
+
LoopAnalysisManager LAM;
FunctionAnalysisManager FAM;
CGSCCAnalysisManager CGAM;
@@ -305,7 +336,7 @@ bool opt(const Config &Conf, TargetMachine *TM, unsigned Task, Module &Mod,
const ModuleSummaryIndex *ImportSummary) {
// FIXME: Plumb the combined index into the new pass manager.
if (!Conf.OptPipeline.empty())
- runNewPMCustomPasses(Mod, TM, Conf.OptPipeline, Conf.AAPipeline,
+ runNewPMCustomPasses(Conf, Mod, TM, Conf.OptPipeline, Conf.AAPipeline,
Conf.DisableVerify);
else if (Conf.UseNewPM)
runNewPMPasses(Conf, Mod, TM, Conf.OptLevel, IsThinLTO, ExportSummary,
@@ -333,7 +364,8 @@ static void EmitBitcodeSection(Module &M, const Config &Conf) {
}
void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
- unsigned Task, Module &Mod) {
+ unsigned Task, Module &Mod,
+ const ModuleSummaryIndex &CombinedIndex) {
if (Conf.PreCodeGenModuleHook && !Conf.PreCodeGenModuleHook(Task, Mod))
return;
@@ -349,7 +381,7 @@ void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
DwoFile = Conf.DwoDir;
sys::path::append(DwoFile, std::to_string(Task) + ".dwo");
- TM->Options.MCOptions.SplitDwarfFile = DwoFile.str().str();
+ TM->Options.MCOptions.SplitDwarfFile = std::string(DwoFile);
} else
TM->Options.MCOptions.SplitDwarfFile = Conf.SplitDwarfFile;
@@ -362,6 +394,8 @@ void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
auto Stream = AddStream(Task);
legacy::PassManager CodeGenPasses;
+ CodeGenPasses.add(
+ createImmutableModuleSummaryIndexWrapperPass(&CombinedIndex));
if (TM->addPassesToEmitFile(CodeGenPasses, *Stream->OS,
DwoOut ? &DwoOut->os() : nullptr,
Conf.CGFileType))
@@ -374,8 +408,10 @@ void codegen(const Config &Conf, TargetMachine *TM, AddStreamFn AddStream,
void splitCodeGen(const Config &C, TargetMachine *TM, AddStreamFn AddStream,
unsigned ParallelCodeGenParallelismLevel,
- std::unique_ptr<Module> Mod) {
- ThreadPool CodegenThreadPool(ParallelCodeGenParallelismLevel);
+ std::unique_ptr<Module> Mod,
+ const ModuleSummaryIndex &CombinedIndex) {
+ ThreadPool CodegenThreadPool(
+ heavyweight_hardware_concurrency(ParallelCodeGenParallelismLevel));
unsigned ThreadCount = 0;
const Target *T = &TM->getTarget();
@@ -406,7 +442,8 @@ void splitCodeGen(const Config &C, TargetMachine *TM, AddStreamFn AddStream,
std::unique_ptr<TargetMachine> TM =
createTargetMachine(C, T, *MPartInCtx);
- codegen(C, TM.get(), AddStream, ThreadId, *MPartInCtx);
+ codegen(C, TM.get(), AddStream, ThreadId, *MPartInCtx,
+ CombinedIndex);
},
// Pass BC using std::move to ensure that it gets moved rather than
// copied into the thread's context.
@@ -434,8 +471,8 @@ Expected<const Target *> initAndLookupTarget(const Config &C, Module &Mod) {
}
}
-static Error
-finalizeOptimizationRemarks(std::unique_ptr<ToolOutputFile> DiagOutputFile) {
+Error lto::finalizeOptimizationRemarks(
+ std::unique_ptr<ToolOutputFile> DiagOutputFile) {
// Make sure we flush the diagnostic remarks file in case the linker doesn't
// call the global destructors before exiting.
if (!DiagOutputFile)
@@ -455,27 +492,19 @@ Error lto::backend(const Config &C, AddStreamFn AddStream,
std::unique_ptr<TargetMachine> TM = createTargetMachine(C, *TOrErr, *Mod);
- // Setup optimization remarks.
- auto DiagFileOrErr = lto::setupOptimizationRemarks(
- Mod->getContext(), C.RemarksFilename, C.RemarksPasses, C.RemarksFormat,
- C.RemarksWithHotness);
- if (!DiagFileOrErr)
- return DiagFileOrErr.takeError();
- auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
-
if (!C.CodeGenOnly) {
if (!opt(C, TM.get(), 0, *Mod, /*IsThinLTO=*/false,
/*ExportSummary=*/&CombinedIndex, /*ImportSummary=*/nullptr))
- return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+ return Error::success();
}
if (ParallelCodeGenParallelismLevel == 1) {
- codegen(C, TM.get(), AddStream, 0, *Mod);
+ codegen(C, TM.get(), AddStream, 0, *Mod, CombinedIndex);
} else {
splitCodeGen(C, TM.get(), AddStream, ParallelCodeGenParallelismLevel,
- std::move(Mod));
+ std::move(Mod), CombinedIndex);
}
- return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
+ return Error::success();
}
static void dropDeadSymbols(Module &Mod, const GVSummaryMapTy &DefinedGlobals,
@@ -511,22 +540,32 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
std::unique_ptr<TargetMachine> TM = createTargetMachine(Conf, *TOrErr, Mod);
// Setup optimization remarks.
- auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks(
Mod.getContext(), Conf.RemarksFilename, Conf.RemarksPasses,
Conf.RemarksFormat, Conf.RemarksWithHotness, Task);
if (!DiagFileOrErr)
return DiagFileOrErr.takeError();
auto DiagnosticOutputFile = std::move(*DiagFileOrErr);
+ // Set the partial sample profile ratio in the profile summary module flag of
+ // the module, if applicable.
+ Mod.setPartialSampleProfileRatio(CombinedIndex);
+
if (Conf.CodeGenOnly) {
- codegen(Conf, TM.get(), AddStream, Task, Mod);
+ codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(Task, Mod))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
- renameModuleForThinLTO(Mod, CombinedIndex);
+ // When linking an ELF shared object, dso_local should be dropped. We
+ // conservatively do this for -fpic.
+ bool ClearDSOLocalOnDeclarations =
+ TM->getTargetTriple().isOSBinFormatELF() &&
+ TM->getRelocationModel() != Reloc::Static &&
+ Mod.getPIELevel() == PIELevel::Default;
+ renameModuleForThinLTO(Mod, CombinedIndex, ClearDSOLocalOnDeclarations);
dropDeadSymbols(Mod, DefinedGlobals, CombinedIndex);
@@ -552,7 +591,8 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
/*IsImporting*/ true);
};
- FunctionImporter Importer(CombinedIndex, ModuleLoader);
+ FunctionImporter Importer(CombinedIndex, ModuleLoader,
+ ClearDSOLocalOnDeclarations);
if (Error Err = Importer.importFunctions(Mod, ImportList).takeError())
return Err;
@@ -563,6 +603,6 @@ Error lto::thinBackend(const Config &Conf, unsigned Task, AddStreamFn AddStream,
/*ExportSummary=*/nullptr, /*ImportSummary=*/&CombinedIndex))
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
- codegen(Conf, TM.get(), AddStream, Task, Mod);
+ codegen(Conf, TM.get(), AddStream, Task, Mod, CombinedIndex);
return finalizeOptimizationRemarks(std::move(DiagnosticOutputFile));
}
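The dso_local-clearing condition added to lto::thinBackend above (and repeated in ThinLTOCodeGenerator.cpp further down) can be read as a small predicate. This is only a hedged restatement for clarity; the helper name shouldClearDSOLocalOnDeclarations is not part of the patch.

#include "llvm/ADT/Triple.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Target/TargetMachine.h"

// True when targeting ELF with a non-static relocation model and no PIE level
// set, i.e. the conservative "-fpic shared object" case above in which
// dso_local must be dropped from declarations.
static bool shouldClearDSOLocalOnDeclarations(const llvm::TargetMachine &TM,
                                              const llvm::Module &M) {
  return TM.getTargetTriple().isOSBinFormatELF() &&
         TM.getRelocationModel() != llvm::Reloc::Static &&
         M.getPIELevel() == llvm::PIELevel::Default;
}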
diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp
index 5fef14230a9b..25ab1404b4e1 100644
--- a/llvm/lib/LTO/LTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/LTOCodeGenerator.cpp
@@ -29,11 +29,11 @@
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassTimingInfo.h"
-#include "llvm/IR/RemarkStreamer.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
#include "llvm/LTO/LTO.h"
@@ -57,6 +57,7 @@
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
+#include "llvm/Transforms/IPO/WholeProgramDevirt.h"
#include "llvm/Transforms/ObjCARC.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <system_error>
@@ -133,10 +134,12 @@ void LTOCodeGenerator::initializeLTOPasses() {
initializeSimpleInlinerPass(R);
initializePruneEHPass(R);
initializeGlobalDCELegacyPassPass(R);
+ initializeOpenMPOptLegacyPassPass(R);
initializeArgPromotionPass(R);
initializeJumpThreadingPass(R);
initializeSROALegacyPassPass(R);
initializeAttributorLegacyPassPass(R);
+ initializeAttributorCGSCCLegacyPassPass(R);
initializePostOrderFunctionAttrsLegacyPassPass(R);
initializeReversePostOrderFunctionAttrsLegacyPassPass(R);
initializeGlobalsAAWrapperPassPass(R);
@@ -526,8 +529,8 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
return false;
auto DiagFileOrErr =
- lto::setupOptimizationRemarks(Context, RemarksFilename, RemarksPasses,
- RemarksFormat, RemarksWithHotness);
+ lto::setupLLVMOptimizationRemarks(Context, RemarksFilename, RemarksPasses,
+ RemarksFormat, RemarksWithHotness);
if (!DiagFileOrErr) {
errs() << "Error: " << toString(DiagFileOrErr.takeError()) << "\n";
report_fatal_error("Can't get an output file for the remarks");
@@ -542,6 +545,13 @@ bool LTOCodeGenerator::optimize(bool DisableVerify, bool DisableInline,
}
StatsFile = std::move(StatsFileOrErr.get());
+ // Currently there is no support for enabling whole program visibility via a
+ // linker option in the old LTO API, but this call allows it to be specified
+ // via the internal option. Must be done before WPD is invoked via the optimizer
+ // pipeline run below.
+ updateVCallVisibilityInModule(*MergedModule,
+ /* WholeProgramVisibilityEnabledInLTO */ false);
+
// We always run the verifier once on the merged module, the `DisableVerify`
// parameter only applies to subsequent verify.
verifyMergedModuleOnce();
@@ -622,9 +632,9 @@ bool LTOCodeGenerator::compileOptimized(ArrayRef<raw_pwrite_stream *> Out) {
return true;
}
-void LTOCodeGenerator::setCodeGenDebugOptions(ArrayRef<const char *> Options) {
+void LTOCodeGenerator::setCodeGenDebugOptions(ArrayRef<StringRef> Options) {
for (StringRef Option : Options)
- CodegenOptions.push_back(Option);
+ CodegenOptions.push_back(Option.str());
}
void LTOCodeGenerator::parseCodeGenDebugOptions() {
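Callers of the legacy LTO C++ API adapt to the new ArrayRef<StringRef> signature of setCodeGenDebugOptions roughly as sketched below; the function name and option strings are examples only.

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/LTO/legacy/LTOCodeGenerator.h"

void passDebugOptions(llvm::LTOCodeGenerator &CG) {
  // A C array of StringRef converts implicitly to ArrayRef<StringRef>; the
  // strings are copied into std::string internally (see the diff above).
  llvm::StringRef Options[] = {"-debug-pass=Structure", "-time-passes"};
  CG.setCodeGenDebugOptions(Options);
}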
diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp
index 587b332e7064..ebe779aea62e 100644
--- a/llvm/lib/LTO/LTOModule.cpp
+++ b/llvm/lib/LTO/LTOModule.cpp
@@ -28,6 +28,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Object/IRObjectFile.h"
+#include "llvm/Object/MachO.h"
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Host.h"
@@ -413,9 +414,8 @@ void LTOModule::addDefinedFunctionSymbol(StringRef Name, const Function *F) {
void LTOModule::addDefinedSymbol(StringRef Name, const GlobalValue *def,
bool isFunction) {
- // set alignment part log2() can have rounding errors
- uint32_t align = def->getAlignment();
- uint32_t attr = align ? countTrailingZeros(align) : 0;
+ const GlobalObject *go = dyn_cast<GlobalObject>(def);
+ uint32_t attr = go ? Log2(go->getAlign().valueOrOne()) : 0;
// set permissions part
if (isFunction) {
@@ -676,3 +676,11 @@ const char *LTOModule::getDependentLibrary(lto::InputFile *input, size_t index,
*size = S.size();
return S.data();
}
+
+Expected<uint32_t> LTOModule::getMachOCPUType() const {
+ return MachO::getCPUType(Triple(Mod->getTargetTriple()));
+}
+
+Expected<uint32_t> LTOModule::getMachOCPUSubType() const {
+ return MachO::getCPUSubType(Triple(Mod->getTargetTriple()));
+}
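The new Mach-O CPU type accessors return Expected<uint32_t>, so callers must handle the error case explicitly. A minimal usage sketch, with an invented function name:

#include "llvm/LTO/legacy/LTOModule.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <cstdint>

void printMachOCPU(const llvm::LTOModule &M) {
  llvm::Expected<uint32_t> TypeOrErr = M.getMachOCPUType();
  if (!TypeOrErr) {
    // Unsupported triples produce an error rather than a bogus value.
    llvm::logAllUnhandledErrors(TypeOrErr.takeError(), llvm::errs(),
                                "cannot determine Mach-O CPU type: ");
    return;
  }
  llvm::outs() << "cputype: " << *TypeOrErr << "\n";
}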
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
index a9e27832917c..d0a1e1889c61 100644
--- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
+++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp
@@ -27,10 +27,10 @@
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DiagnosticPrinter.h"
#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/LLVMRemarkStreamer.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/IR/PassTimingInfo.h"
-#include "llvm/IR/RemarkStreamer.h"
#include "llvm/IR/Verifier.h"
#include "llvm/IRReader/IRReader.h"
#include "llvm/LTO/LTO.h"
@@ -48,7 +48,6 @@
#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/ToolOutputFile.h"
-#include "llvm/Support/VCSRevision.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionImport.h"
@@ -81,8 +80,10 @@ extern cl::opt<std::string> RemarksFormat;
namespace {
-static cl::opt<int>
- ThreadCount("threads", cl::init(llvm::heavyweight_hardware_concurrency()));
+// Default to using all available threads in the system, but using only one
+// thread per core, as indicated by the usage of
+// heavyweight_hardware_concurrency() below.
+static cl::opt<int> ThreadCount("threads", cl::init(0));
// Simple helper to save temporary files for debug.
static void saveTempBitcode(const Module &TheModule, StringRef TempDir,
@@ -151,8 +152,9 @@ generateModuleMap(std::vector<std::unique_ptr<lto::InputFile>> &Modules) {
return ModuleMap;
}
-static void promoteModule(Module &TheModule, const ModuleSummaryIndex &Index) {
- if (renameModuleForThinLTO(TheModule, Index))
+static void promoteModule(Module &TheModule, const ModuleSummaryIndex &Index,
+ bool ClearDSOLocalOnDeclarations) {
+ if (renameModuleForThinLTO(TheModule, Index, ClearDSOLocalOnDeclarations))
report_fatal_error("renameModuleForThinLTO failed");
}
@@ -204,15 +206,16 @@ static std::unique_ptr<Module> loadModuleFromInput(lto::InputFile *Input,
static void
crossImportIntoModule(Module &TheModule, const ModuleSummaryIndex &Index,
- StringMap<lto::InputFile*> &ModuleMap,
- const FunctionImporter::ImportMapTy &ImportList) {
+ StringMap<lto::InputFile *> &ModuleMap,
+ const FunctionImporter::ImportMapTy &ImportList,
+ bool ClearDSOLocalOnDeclarations) {
auto Loader = [&](StringRef Identifier) {
auto &Input = ModuleMap[Identifier];
return loadModuleFromInput(Input, TheModule.getContext(),
/*Lazy=*/true, /*IsImporting*/ true);
};
- FunctionImporter Importer(Index, Loader);
+ FunctionImporter Importer(Index, Loader, ClearDSOLocalOnDeclarations);
Expected<bool> Result = Importer.importFunctions(TheModule, ImportList);
if (!Result) {
handleAllErrors(Result.takeError(), [&](ErrorInfoBase &EIB) {
@@ -410,8 +413,15 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
// "Benchmark"-like optimization: single-source case
bool SingleModule = (ModuleMap.size() == 1);
+ // When linking an ELF shared object, dso_local should be dropped. We
+ // conservatively do this for -fpic.
+ bool ClearDSOLocalOnDeclarations =
+ TM.getTargetTriple().isOSBinFormatELF() &&
+ TM.getRelocationModel() != Reloc::Static &&
+ TheModule.getPIELevel() == PIELevel::Default;
+
if (!SingleModule) {
- promoteModule(TheModule, Index);
+ promoteModule(TheModule, Index, ClearDSOLocalOnDeclarations);
// Apply summary-based prevailing-symbol resolution decisions.
thinLTOResolvePrevailingInModule(TheModule, DefinedGlobals);
@@ -431,7 +441,8 @@ ProcessThinLTOModule(Module &TheModule, ModuleSummaryIndex &Index,
saveTempBitcode(TheModule, SaveTempsDir, count, ".2.internalized.bc");
if (!SingleModule) {
- crossImportIntoModule(TheModule, Index, ModuleMap, ImportList);
+ crossImportIntoModule(TheModule, Index, ModuleMap, ImportList,
+ ClearDSOLocalOnDeclarations);
// Save temps: after cross-module import.
saveTempBitcode(TheModule, SaveTempsDir, count, ".3.imported.bc");
@@ -672,7 +683,8 @@ void ThinLTOCodeGenerator::promote(Module &TheModule, ModuleSummaryIndex &Index,
Index, IsExported(ExportLists, GUIDPreservedSymbols),
IsPrevailing(PrevailingCopy));
- promoteModule(TheModule, Index);
+ // FIXME Set ClearDSOLocalOnDeclarations.
+ promoteModule(TheModule, Index, /*ClearDSOLocalOnDeclarations=*/false);
}
/**
@@ -704,7 +716,9 @@ void ThinLTOCodeGenerator::crossModuleImport(Module &TheModule,
ExportLists);
auto &ImportList = ImportLists[TheModule.getModuleIdentifier()];
- crossImportIntoModule(TheModule, Index, ModuleMap, ImportList);
+ // FIXME Set ClearDSOLocalOnDeclarations.
+ crossImportIntoModule(TheModule, Index, ModuleMap, ImportList,
+ /*ClearDSOLocalOnDeclarations=*/false);
}
/**
@@ -831,7 +845,8 @@ void ThinLTOCodeGenerator::internalize(Module &TheModule,
Index, IsExported(ExportLists, GUIDPreservedSymbols),
IsPrevailing(PrevailingCopy));
- promoteModule(TheModule, Index);
+ // FIXME Set ClearDSOLocalOnDeclarations.
+ promoteModule(TheModule, Index, /*ClearDSOLocalOnDeclarations=*/false);
// Internalization
thinLTOResolvePrevailingInModule(
@@ -871,15 +886,15 @@ ThinLTOCodeGenerator::writeGeneratedObject(int count, StringRef CacheEntryPath,
// Cache is enabled, hard-link the entry (or copy if hard-link fails).
auto Err = sys::fs::create_hard_link(CacheEntryPath, OutputPath);
if (!Err)
- return OutputPath.str();
+ return std::string(OutputPath.str());
// Hard linking failed, try to copy.
Err = sys::fs::copy_file(CacheEntryPath, OutputPath);
if (!Err)
- return OutputPath.str();
+ return std::string(OutputPath.str());
// Copy failed (could be because the CacheEntry was removed from the cache
// in the meantime by another process), fall back and try to write down the
// buffer to the output.
- errs() << "error: can't link or copy from cached entry '" << CacheEntryPath
+ errs() << "remark: can't link or copy from cached entry '" << CacheEntryPath
<< "' to '" << OutputPath << "'\n";
}
// No cache entry, just write out the buffer.
@@ -888,7 +903,7 @@ ThinLTOCodeGenerator::writeGeneratedObject(int count, StringRef CacheEntryPath,
if (Err)
report_fatal_error("Can't open output '" + OutputPath + "'\n");
OS << OutputBuffer.getBuffer();
- return OutputPath.str();
+ return std::string(OutputPath.str());
}
// Main entry point for the ThinLTO processing
@@ -970,6 +985,12 @@ void ThinLTOCodeGenerator::run() {
// Synthesize entry counts for functions in the combined index.
computeSyntheticCounts(*Index);
+ // Currently there is no support for enabling whole program visibility via a
+ // linker option in the old LTO API, but this call allows it to be specified
+ // via the internal option. Must be done before WPD below.
+ updateVCallVisibilityInIndex(*Index,
+ /* WholeProgramVisibilityEnabledInLTO */ false);
+
// Perform index-based WPD. This will return immediately if there are
// no index entries in the typeIdMetadata map (e.g. if we are instead
// performing IR-based WPD in hybrid regular/thin LTO mode).
@@ -1037,7 +1058,7 @@ void ThinLTOCodeGenerator::run() {
// Parallel optimizer + codegen
{
- ThreadPool Pool(ThreadCount);
+ ThreadPool Pool(heavyweight_hardware_concurrency(ThreadCount));
for (auto IndexCount : ModulesOrdering) {
auto &Mod = Modules[IndexCount];
Pool.async([&](int count) {
@@ -1074,7 +1095,7 @@ void ThinLTOCodeGenerator::run() {
LLVMContext Context;
Context.setDiscardValueNames(LTODiscardValueNames);
Context.enableDebugTypeODRUniquing();
- auto DiagFileOrErr = lto::setupOptimizationRemarks(
+ auto DiagFileOrErr = lto::setupLLVMOptimizationRemarks(
Context, RemarksFilename, RemarksPasses, RemarksFormat,
RemarksWithHotness, count);
if (!DiagFileOrErr) {
@@ -1113,7 +1134,7 @@ void ThinLTOCodeGenerator::run() {
auto ReloadedBufferOrErr = CacheEntry.tryLoadingBuffer();
if (auto EC = ReloadedBufferOrErr.getError()) {
// On error, keep the preexisting buffer and print a diagnostic.
- errs() << "error: can't reload cached file '" << CacheEntryPath
+ errs() << "remark: can't reload cached file '" << CacheEntryPath
<< "': " << EC.message() << "\n";
} else {
OutputBuffer = std::move(*ReloadedBufferOrErr);
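The thread-pool changes above all follow the same pattern: the raw integer count is wrapped in heavyweight_hardware_concurrency(), where 0 means one thread per physical core. A small sketch with an illustrative function name:

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"

void runParallelBackends(unsigned ThreadCount) {
  // ThreadCount == 0 lets heavyweight_hardware_concurrency() pick one thread
  // per physical core; any other value caps the pool at that many threads.
  llvm::ThreadPool Pool(llvm::heavyweight_hardware_concurrency(ThreadCount));
  for (int I = 0; I < 8; ++I)
    Pool.async([I] { /* per-module optimize + codegen would run here */ });
  Pool.wait();
}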
diff --git a/llvm/lib/LTO/UpdateCompilerUsed.cpp b/llvm/lib/LTO/UpdateCompilerUsed.cpp
index 6434f902088d..040e1106523c 100644
--- a/llvm/lib/LTO/UpdateCompilerUsed.cpp
+++ b/llvm/lib/LTO/UpdateCompilerUsed.cpp
@@ -17,6 +17,7 @@
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Mangler.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
diff --git a/llvm/lib/LineEditor/LineEditor.cpp b/llvm/lib/LineEditor/LineEditor.cpp
index 57e62bd64871..1aa3476eb357 100644
--- a/llvm/lib/LineEditor/LineEditor.cpp
+++ b/llvm/lib/LineEditor/LineEditor.cpp
@@ -24,7 +24,7 @@ std::string LineEditor::getDefaultHistoryPath(StringRef ProgName) {
SmallString<32> Path;
if (sys::path::home_directory(Path)) {
sys::path::append(Path, "." + ProgName + "-history");
- return Path.str();
+ return std::string(Path.str());
}
return std::string();
}
@@ -197,7 +197,7 @@ unsigned char ElCompletionFn(EditLine *EL, int ch) {
LineEditor::LineEditor(StringRef ProgName, StringRef HistoryPath, FILE *In,
FILE *Out, FILE *Err)
- : Prompt((ProgName + "> ").str()), HistoryPath(HistoryPath),
+ : Prompt((ProgName + "> ").str()), HistoryPath(std::string(HistoryPath)),
Data(new InternalData) {
if (HistoryPath.empty())
this->HistoryPath = getDefaultHistoryPath(ProgName);
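The scattered std::string(...) wrappers in this patch (here, in the LTO files above, and elsewhere) make the StringRef/SmallString-to-std::string conversion explicit, which appears to anticipate the implicit conversion being removed. A tiny sketch of the pattern; the function name is illustrative.

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include <string>

std::string toOwned(llvm::StringRef S, const llvm::SmallString<32> &Path) {
  std::string A = std::string(S);          // instead of `std::string A = S;`
  std::string B = std::string(Path.str()); // SmallString -> StringRef -> owned copy
  return A + B;
}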
diff --git a/llvm/lib/Linker/IRMover.cpp b/llvm/lib/Linker/IRMover.cpp
index e13656ed1c10..055689b16e8f 100644
--- a/llvm/lib/Linker/IRMover.cpp
+++ b/llvm/lib/Linker/IRMover.cpp
@@ -173,9 +173,11 @@ bool TypeMapTy::areTypesIsomorphic(Type *DstTy, Type *SrcTy) {
if (DSTy->isLiteral() != SSTy->isLiteral() ||
DSTy->isPacked() != SSTy->isPacked())
return false;
- } else if (auto *DSeqTy = dyn_cast<SequentialType>(DstTy)) {
- if (DSeqTy->getNumElements() !=
- cast<SequentialType>(SrcTy)->getNumElements())
+ } else if (auto *DArrTy = dyn_cast<ArrayType>(DstTy)) {
+ if (DArrTy->getNumElements() != cast<ArrayType>(SrcTy)->getNumElements())
+ return false;
+ } else if (auto *DVecTy = dyn_cast<VectorType>(DstTy)) {
+ if (DVecTy->getElementCount() != cast<VectorType>(SrcTy)->getElementCount())
return false;
}
@@ -303,9 +305,11 @@ Type *TypeMapTy::get(Type *Ty, SmallPtrSet<StructType *, 8> &Visited) {
case Type::ArrayTyID:
return *Entry = ArrayType::get(ElementTypes[0],
cast<ArrayType>(Ty)->getNumElements());
- case Type::VectorTyID:
- return *Entry = VectorType::get(ElementTypes[0],
- cast<VectorType>(Ty)->getNumElements());
+ case Type::ScalableVectorTyID:
+ // FIXME: handle scalable vectors
+ case Type::FixedVectorTyID:
+ return *Entry = FixedVectorType::get(
+ ElementTypes[0], cast<FixedVectorType>(Ty)->getNumElements());
case Type::PointerTyID:
return *Entry = PointerType::get(ElementTypes[0],
cast<PointerType>(Ty)->getAddressSpace());
@@ -1277,11 +1281,17 @@ Error IRLinker::linkModuleFlagsMetadata() {
}
// Diagnose inconsistent merge behavior types.
- if (SrcBehaviorValue != DstBehaviorValue)
- return stringErr("linking module flags '" + ID->getString() +
- "': IDs have conflicting behaviors in '" +
- SrcM->getModuleIdentifier() + "' and '" +
- DstM.getModuleIdentifier() + "'");
+ if (SrcBehaviorValue != DstBehaviorValue) {
+ bool MaxAndWarn = (SrcBehaviorValue == Module::Max &&
+ DstBehaviorValue == Module::Warning) ||
+ (DstBehaviorValue == Module::Max &&
+ SrcBehaviorValue == Module::Warning);
+ if (!MaxAndWarn)
+ return stringErr("linking module flags '" + ID->getString() +
+ "': IDs have conflicting behaviors in '" +
+ SrcM->getModuleIdentifier() + "' and '" +
+ DstM.getModuleIdentifier() + "'");
+ }
auto replaceDstValue = [&](MDNode *New) {
Metadata *FlagOps[] = {DstOp->getOperand(0), ID, New};
@@ -1290,6 +1300,40 @@ Error IRLinker::linkModuleFlagsMetadata() {
Flags[ID].first = Flag;
};
+ // Emit a warning if the values differ and either source or destination
+ // request Warning behavior.
+ if ((DstBehaviorValue == Module::Warning ||
+ SrcBehaviorValue == Module::Warning) &&
+ SrcOp->getOperand(2) != DstOp->getOperand(2)) {
+ std::string Str;
+ raw_string_ostream(Str)
+ << "linking module flags '" << ID->getString()
+ << "': IDs have conflicting values ('" << *SrcOp->getOperand(2)
+ << "' from " << SrcM->getModuleIdentifier() << " with '"
+ << *DstOp->getOperand(2) << "' from " << DstM.getModuleIdentifier()
+ << ')';
+ emitWarning(Str);
+ }
+
+ // Choose the maximum if either source or destination request Max behavior.
+ if (DstBehaviorValue == Module::Max || SrcBehaviorValue == Module::Max) {
+ ConstantInt *DstValue =
+ mdconst::extract<ConstantInt>(DstOp->getOperand(2));
+ ConstantInt *SrcValue =
+ mdconst::extract<ConstantInt>(SrcOp->getOperand(2));
+
+ // The resulting flag should have a Max behavior, and contain the maximum
+ // value from between the source and destination values.
+ Metadata *FlagOps[] = {
+ (DstBehaviorValue != Module::Max ? SrcOp : DstOp)->getOperand(0), ID,
+ (SrcValue->getZExtValue() > DstValue->getZExtValue() ? SrcOp : DstOp)
+ ->getOperand(2)};
+ MDNode *Flag = MDNode::get(DstM.getContext(), FlagOps);
+ DstModFlags->setOperand(DstIndex, Flag);
+ Flags[ID].first = Flag;
+ continue;
+ }
+
// Perform the merge for standard behavior types.
switch (SrcBehaviorValue) {
case Module::Require:
@@ -1305,26 +1349,9 @@ Error IRLinker::linkModuleFlagsMetadata() {
continue;
}
case Module::Warning: {
- // Emit a warning if the values differ.
- if (SrcOp->getOperand(2) != DstOp->getOperand(2)) {
- std::string str;
- raw_string_ostream(str)
- << "linking module flags '" << ID->getString()
- << "': IDs have conflicting values ('" << *SrcOp->getOperand(2)
- << "' from " << SrcM->getModuleIdentifier() << " with '"
- << *DstOp->getOperand(2) << "' from " << DstM.getModuleIdentifier()
- << ')';
- emitWarning(str);
- }
- continue;
+ break;
}
case Module::Max: {
- ConstantInt *DstValue =
- mdconst::extract<ConstantInt>(DstOp->getOperand(2));
- ConstantInt *SrcValue =
- mdconst::extract<ConstantInt>(SrcOp->getOperand(2));
- if (SrcValue->getZExtValue() > DstValue->getZExtValue())
- overrideDstValue();
break;
}
case Module::Append: {
@@ -1350,6 +1377,7 @@ Error IRLinker::linkModuleFlagsMetadata() {
break;
}
}
+
}
// Check all of the requirements.
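The IRMover change above relaxes the conflicting-behavior error for the Max/Warning combination: the merged flag keeps Max behavior and the larger value, and a warning is emitted when the values differ. A hedged end-to-end sketch; the flag name "SomeFlag" and the function name are made up.

#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Linker/Linker.h"
#include <memory>

bool mergeMaxAndWarningFlags() {
  llvm::LLVMContext Ctx;
  auto Dst = std::make_unique<llvm::Module>("dst", Ctx);
  auto Src = std::make_unique<llvm::Module>("src", Ctx);
  Dst->addModuleFlag(llvm::Module::Max, "SomeFlag", 1);
  Src->addModuleFlag(llvm::Module::Warning, "SomeFlag", 2);

  // Before this patch the mixed behaviors were a hard error; with it, Dst ends
  // up with {Max, "SomeFlag", 2} and a warning is reported via the context's
  // diagnostic handler because the values differ.
  llvm::Linker L(*Dst);
  return L.linkInModule(std::move(Src)); // returns true on error
}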
diff --git a/llvm/lib/MC/ConstantPools.cpp b/llvm/lib/MC/ConstantPools.cpp
index 8cba6b3281a5..d4199025ad77 100644
--- a/llvm/lib/MC/ConstantPools.cpp
+++ b/llvm/lib/MC/ConstantPools.cpp
@@ -26,13 +26,13 @@ using namespace llvm;
void ConstantPool::emitEntries(MCStreamer &Streamer) {
if (Entries.empty())
return;
- Streamer.EmitDataRegion(MCDR_DataRegion);
+ Streamer.emitDataRegion(MCDR_DataRegion);
for (const ConstantPoolEntry &Entry : Entries) {
- Streamer.EmitCodeAlignment(Entry.Size); // align naturally
- Streamer.EmitLabel(Entry.Label);
- Streamer.EmitValue(Entry.Value, Entry.Size, Entry.Loc);
+ Streamer.emitCodeAlignment(Entry.Size); // align naturally
+ Streamer.emitLabel(Entry.Label);
+ Streamer.emitValue(Entry.Value, Entry.Size, Entry.Loc);
}
- Streamer.EmitDataRegion(MCDR_DataRegionEnd);
+ Streamer.emitDataRegion(MCDR_DataRegionEnd);
Entries.clear();
}
diff --git a/llvm/lib/MC/ELFObjectWriter.cpp b/llvm/lib/MC/ELFObjectWriter.cpp
index 6b4b45eb8eff..1ca9d0fe1e18 100644
--- a/llvm/lib/MC/ELFObjectWriter.cpp
+++ b/llvm/lib/MC/ELFObjectWriter.cpp
@@ -40,7 +40,7 @@
#include "llvm/Support/Allocator.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compression.h"
-#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
@@ -73,7 +73,7 @@ class ELFObjectWriter;
struct ELFWriter;
bool isDwoSection(const MCSectionELF &Sec) {
- return Sec.getSectionName().endswith(".dwo");
+ return Sec.getName().endswith(".dwo");
}
class SymbolTableWriter {
@@ -343,7 +343,7 @@ void ELFWriter::align(unsigned Alignment) {
unsigned ELFWriter::addToSectionTable(const MCSectionELF *Sec) {
SectionTable.push_back(Sec);
- StrTabBuilder.add(Sec->getSectionName());
+ StrTabBuilder.add(Sec->getName());
return SectionTable.size();
}
@@ -640,7 +640,7 @@ void ELFWriter::computeSymbolTable(
continue;
if (Symbol.isTemporary() && Symbol.isUndefined()) {
- Ctx.reportError(SMLoc(), "Undefined temporary symbol");
+ Ctx.reportError(SMLoc(), "Undefined temporary symbol " + Symbol.getName());
continue;
}
@@ -784,7 +784,7 @@ MCSectionELF *ELFWriter::createRelocationSection(MCContext &Ctx,
if (OWriter.Relocations[&Sec].empty())
return nullptr;
- const StringRef SectionName = Sec.getSectionName();
+ const StringRef SectionName = Sec.getName();
std::string RelaSectionName = hasRelocationAddend() ? ".rela" : ".rel";
RelaSectionName += SectionName;
@@ -843,7 +843,7 @@ bool ELFWriter::maybeWriteCompression(
void ELFWriter::writeSectionData(const MCAssembler &Asm, MCSection &Sec,
const MCAsmLayout &Layout) {
MCSectionELF &Section = static_cast<MCSectionELF &>(Sec);
- StringRef SectionName = Section.getSectionName();
+ StringRef SectionName = Section.getName();
auto &MC = Asm.getContext();
const auto &MAI = MC.getAsmInfo();
@@ -1001,7 +1001,7 @@ void ELFWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
case ELF::SHT_RELA: {
sh_link = SymbolTableIndex;
assert(sh_link && ".symtab not found");
- const MCSection *InfoSection = Section.getAssociatedSection();
+ const MCSection *InfoSection = Section.getLinkedToSection();
sh_info = SectionIndexMap.lookup(cast<MCSectionELF>(InfoSection));
break;
}
@@ -1024,12 +1024,12 @@ void ELFWriter::writeSection(const SectionIndexMapTy &SectionIndexMap,
}
if (Section.getFlags() & ELF::SHF_LINK_ORDER) {
- const MCSymbol *Sym = Section.getAssociatedSymbol();
+ const MCSymbol *Sym = Section.getLinkedToSymbol();
const MCSectionELF *Sec = cast<MCSectionELF>(&Sym->getSection());
sh_link = SectionIndexMap.lookup(Sec);
}
- WriteSecHdrEntry(StrTabBuilder.getOffset(Section.getSectionName()),
+ WriteSecHdrEntry(StrTabBuilder.getOffset(Section.getName()),
Section.getType(), Section.getFlags(), 0, Offset, Size,
sh_link, sh_info, Section.getAlignment(),
Section.getEntrySize());
@@ -1180,7 +1180,7 @@ uint64_t ELFWriter::writeObject(MCAssembler &Asm, const MCAsmLayout &Layout) {
uint64_t SecStart = W.OS.tell();
writeRelocations(Asm,
- cast<MCSectionELF>(*RelSection->getAssociatedSection()));
+ cast<MCSectionELF>(*RelSection->getLinkedToSection()));
uint64_t SecEnd = W.OS.tell();
SectionOffsets[RelSection] = std::make_pair(SecStart, SecEnd);
diff --git a/llvm/lib/MC/MCAsmInfo.cpp b/llvm/lib/MC/MCAsmInfo.cpp
index 420dbaa80ae9..9767ee6c1133 100644
--- a/llvm/lib/MC/MCAsmInfo.cpp
+++ b/llvm/lib/MC/MCAsmInfo.cpp
@@ -64,7 +64,7 @@ MCAsmInfo::MCAsmInfo() {
// - Generic_GCC toolchains enable the integrated assembler on a per
// architecture basis.
// - The target subclasses for AArch64, ARM, and X86 handle these cases
- UseIntegratedAssembler = false;
+ UseIntegratedAssembler = true;
PreserveAsmComments = true;
}
@@ -95,7 +95,7 @@ MCAsmInfo::getExprForFDESymbol(const MCSymbol *Sym,
MCContext &Context = Streamer.getContext();
const MCExpr *Res = MCSymbolRefExpr::create(Sym, Context);
MCSymbol *PCSym = Context.createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context);
return MCBinaryExpr::createSub(Res, PC, Context);
}
diff --git a/llvm/lib/MC/MCAsmInfoCOFF.cpp b/llvm/lib/MC/MCAsmInfoCOFF.cpp
index 9f19d163f57b..0b8781c61eb8 100644
--- a/llvm/lib/MC/MCAsmInfoCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoCOFF.cpp
@@ -26,7 +26,7 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
HasDotTypeDotSizeDirective = false;
HasSingleParameterDotFile = true;
WeakRefDirective = "\t.weak\t";
- HasLinkOnceDirective = true;
+ AvoidWeakIfComdat = true;
// Doesn't support visibility:
HiddenVisibilityAttr = HiddenDeclarationVisibilityAttr = MCSA_Invalid;
@@ -36,8 +36,6 @@ MCAsmInfoCOFF::MCAsmInfoCOFF() {
SupportsDebugInformation = true;
NeedsDwarfSectionOffsetDirective = true;
- UseIntegratedAssembler = true;
-
// At least MSVC inline-asm does AShr.
UseLogicalShr = false;
diff --git a/llvm/lib/MC/MCAsmInfoDarwin.cpp b/llvm/lib/MC/MCAsmInfoDarwin.cpp
index 62bc5b8c9418..2137b81c799f 100644
--- a/llvm/lib/MC/MCAsmInfoDarwin.cpp
+++ b/llvm/lib/MC/MCAsmInfoDarwin.cpp
@@ -29,11 +29,10 @@ bool MCAsmInfoDarwin::isSectionAtomizableBySymbols(
if (SMO.getType() == MachO::S_CSTRING_LITERALS)
return false;
- if (SMO.getSegmentName() == "__DATA" && SMO.getSectionName() == "__cfstring")
+ if (SMO.getSegmentName() == "__DATA" && SMO.getName() == "__cfstring")
return false;
- if (SMO.getSegmentName() == "__DATA" &&
- SMO.getSectionName() == "__objc_classrefs")
+ if (SMO.getSegmentName() == "__DATA" && SMO.getName() == "__objc_classrefs")
return false;
switch (SMO.getType()) {
@@ -91,7 +90,5 @@ MCAsmInfoDarwin::MCAsmInfoDarwin() {
HasAltEntry = true;
DwarfUsesRelocationsAcrossSections = false;
-
- UseIntegratedAssembler = true;
SetDirectiveSuppressesReloc = true;
}
diff --git a/llvm/lib/MC/MCAsmInfoXCOFF.cpp b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
index c51cdff59fa0..b5c5bb3ace8e 100644
--- a/llvm/lib/MC/MCAsmInfoXCOFF.cpp
+++ b/llvm/lib/MC/MCAsmInfoXCOFF.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCAsmInfoXCOFF.h"
+#include "llvm/ADT/StringExtras.h"
using namespace llvm;
@@ -14,16 +15,26 @@ void MCAsmInfoXCOFF::anchor() {}
MCAsmInfoXCOFF::MCAsmInfoXCOFF() {
IsLittleEndian = false;
- HasDotTypeDotSizeDirective = false;
- COMMDirectiveAlignmentIsInBytes = false;
- LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+ HasVisibilityOnlyWithLinkage = true;
+ PrivateGlobalPrefix = "L..";
+ PrivateLabelPrefix = "L..";
+ SupportsQuotedNames = false;
UseDotAlignForAlignment = true;
+ ZeroDirective = "\t.space\t";
+ ZeroDirectiveSupportsNonZeroValue = false;
AsciiDirective = nullptr; // not supported
AscizDirective = nullptr; // not supported
+
+ // Use .vbyte for data definition to avoid directives that apply an implicit
+ // alignment.
+ Data16bitsDirective = "\t.vbyte\t2, ";
+ Data32bitsDirective = "\t.vbyte\t4, ";
+
+ COMMDirectiveAlignmentIsInBytes = false;
+ LCOMMDirectiveAlignmentType = LCOMM::Log2Alignment;
+ HasDotTypeDotSizeDirective = false;
+ UseIntegratedAssembler = false;
NeedsFunctionDescriptors = true;
- HasDotLGloblDirective = true;
- Data64bitsDirective = "\t.llong\t";
- SupportsQuotedNames = false;
}
bool MCAsmInfoXCOFF::isAcceptableChar(char C) const {
@@ -32,5 +43,8 @@ bool MCAsmInfoXCOFF::isAcceptableChar(char C) const {
if (C == '[' || C == ']')
return true;
- return MCAsmInfo::isAcceptableChar(C);
+ // For AIX assembler, symbols may consist of numeric digits,
+ // underscores, periods, uppercase or lowercase letters, or
+ // any combination of these.
+ return isAlnum(C) || C == '_' || C == '.';
}
diff --git a/llvm/lib/MC/MCAsmStreamer.cpp b/llvm/lib/MC/MCAsmStreamer.cpp
index 5d369503995b..6a8572e57922 100644
--- a/llvm/lib/MC/MCAsmStreamer.cpp
+++ b/llvm/lib/MC/MCAsmStreamer.cpp
@@ -28,6 +28,7 @@
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Format.h"
#include "llvm/Support/FormattedStream.h"
@@ -58,8 +59,8 @@ class MCAsmStreamer final : public MCStreamer {
unsigned UseDwarfDirectory : 1;
void EmitRegisterName(int64_t Register);
- void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
- void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
+ void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
+ void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
public:
MCAsmStreamer(MCContext &Context, std::unique_ptr<formatted_raw_ostream> os,
@@ -79,6 +80,8 @@ public:
InstPrinter->setCommentStream(CommentStream);
if (Assembler->getBackendPtr())
setAllowAutoPadding(Assembler->getBackend().allowAutoPadding());
+
+ Context.setUseNamesOnTempLabels(true);
}
MCAssembler &getAssembler() { return *Assembler; }
@@ -95,7 +98,7 @@ public:
EmitCommentsAndEOL();
}
- void EmitSyntaxDirective() override;
+ void emitSyntaxDirective() override;
void EmitCommentsAndEOL();
@@ -135,28 +138,28 @@ public:
/// @name MCStreamer Interface
/// @{
- void ChangeSection(MCSection *Section, const MCExpr *Subsection) override;
+ void changeSection(MCSection *Section, const MCExpr *Subsection) override;
void emitELFSymverDirective(StringRef AliasName,
const MCSymbol *Aliasee) override;
- void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override;
- void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+ void emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override;
+ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
- void EmitLinkerOptions(ArrayRef<std::string> Options) override;
- void EmitDataRegion(MCDataRegionType Kind) override;
- void EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor,
+ void emitAssemblerFlag(MCAssemblerFlag Flag) override;
+ void emitLinkerOptions(ArrayRef<std::string> Options) override;
+ void emitDataRegion(MCDataRegionType Kind) override;
+ void emitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor,
unsigned Update, VersionTuple SDKVersion) override;
- void EmitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor,
+ void emitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor,
unsigned Update, VersionTuple SDKVersion) override;
- void EmitThumbFunc(MCSymbol *Func) override;
+ void emitThumbFunc(MCSymbol *Func) override;
- void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
- void EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
- bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
+ void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+ void emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) override;
+ bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
- void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
+ void emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
void BeginCOFFSymbolDef(const MCSymbol *Symbol) override;
void EmitCOFFSymbolStorageClass(int StorageClass) override;
void EmitCOFFSymbolType(int Type) override;
@@ -166,11 +169,17 @@ public:
void EmitCOFFSectionIndex(MCSymbol const *Symbol) override;
void EmitCOFFSecRel32(MCSymbol const *Symbol, uint64_t Offset) override;
void EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) override;
- void EmitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size,
+ void emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size,
MCSymbol *CsectSym,
unsigned ByteAlign) override;
+ void emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol,
+ MCSymbolAttr Linkage,
+ MCSymbolAttr Visibility) override;
+ void emitXCOFFRenameDirective(const MCSymbol *Name,
+ StringRef Rename) override;
+
void emitELFSize(MCSymbol *Symbol, const MCExpr *Value) override;
- void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
/// Emit a local common (.lcomm) symbol.
@@ -178,37 +187,38 @@ public:
/// @param Symbol - The common symbol to emit.
/// @param Size - The size of the common symbol.
/// @param ByteAlignment - The alignment of the common symbol in bytes.
- void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
- void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+ void emitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
uint64_t Size = 0, unsigned ByteAlignment = 0,
SMLoc Loc = SMLoc()) override;
- void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+ void emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
- void EmitBinaryData(StringRef Data) override;
+ void emitBinaryData(StringRef Data) override;
- void EmitBytes(StringRef Data) override;
+ void emitBytes(StringRef Data) override;
- void EmitValueImpl(const MCExpr *Value, unsigned Size,
+ void emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc = SMLoc()) override;
- void EmitIntValue(uint64_t Value, unsigned Size) override;
- void EmitIntValueInHex(uint64_t Value, unsigned Size) override;
+ void emitIntValue(uint64_t Value, unsigned Size) override;
+ void emitIntValueInHex(uint64_t Value, unsigned Size) override;
+ void emitIntValueInHexWithPadding(uint64_t Value, unsigned Size) override;
- void EmitULEB128Value(const MCExpr *Value) override;
+ void emitULEB128Value(const MCExpr *Value) override;
- void EmitSLEB128Value(const MCExpr *Value) override;
+ void emitSLEB128Value(const MCExpr *Value) override;
- void EmitDTPRel32Value(const MCExpr *Value) override;
- void EmitDTPRel64Value(const MCExpr *Value) override;
- void EmitTPRel32Value(const MCExpr *Value) override;
- void EmitTPRel64Value(const MCExpr *Value) override;
+ void emitDTPRel32Value(const MCExpr *Value) override;
+ void emitDTPRel64Value(const MCExpr *Value) override;
+ void emitTPRel32Value(const MCExpr *Value) override;
+ void emitTPRel64Value(const MCExpr *Value) override;
- void EmitGPRel64Value(const MCExpr *Value) override;
+ void emitGPRel64Value(const MCExpr *Value) override;
- void EmitGPRel32Value(const MCExpr *Value) override;
+ void emitGPRel32Value(const MCExpr *Value) override;
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc = SMLoc()) override;
@@ -216,18 +226,18 @@ public:
void emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
SMLoc Loc = SMLoc()) override;
- void EmitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
+ void emitValueToAlignment(unsigned ByteAlignment, int64_t Value = 0,
unsigned ValueSize = 1,
unsigned MaxBytesToEmit = 0) override;
- void EmitCodeAlignment(unsigned ByteAlignment,
+ void emitCodeAlignment(unsigned ByteAlignment,
unsigned MaxBytesToEmit = 0) override;
void emitValueToOffset(const MCExpr *Offset,
unsigned char Value,
SMLoc Loc) override;
- void EmitFileDirective(StringRef Filename) override;
+ void emitFileDirective(StringRef Filename) override;
Expected<unsigned> tryEmitDwarfFileDirective(unsigned FileNo,
StringRef Directory,
StringRef Filename,
@@ -238,9 +248,9 @@ public:
Optional<MD5::MD5Result> Checksum,
Optional<StringRef> Source,
unsigned CUID = 0) override;
- void EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
- unsigned Column, unsigned Flags,
- unsigned Isa, unsigned Discriminator,
+ void emitDwarfLocDirective(unsigned FileNo, unsigned Line, unsigned Column,
+ unsigned Flags, unsigned Isa,
+ unsigned Discriminator,
StringRef FileName) override;
MCSymbol *getDwarfLineTableSymbol(unsigned CUID) override;
@@ -251,12 +261,12 @@ public:
bool EmitCVInlineSiteIdDirective(unsigned FunctionId, unsigned IAFunc,
unsigned IAFile, unsigned IALine,
unsigned IACol, SMLoc Loc) override;
- void EmitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line,
+ void emitCVLocDirective(unsigned FunctionId, unsigned FileNo, unsigned Line,
unsigned Column, bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) override;
- void EmitCVLinetableDirective(unsigned FunctionId, const MCSymbol *FnStart,
+ void emitCVLinetableDirective(unsigned FunctionId, const MCSymbol *FnStart,
const MCSymbol *FnEnd) override;
- void EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
+ void emitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
unsigned SourceFileId,
unsigned SourceLineNum,
const MCSymbol *FnStartSym,
@@ -265,50 +275,50 @@ public:
void PrintCVDefRangePrefix(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges);
- void EmitCVDefRangeDirective(
+ void emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeRegisterRelHeader DRHdr) override;
- void EmitCVDefRangeDirective(
+ void emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeSubfieldRegisterHeader DRHdr) override;
- void EmitCVDefRangeDirective(
+ void emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeRegisterHeader DRHdr) override;
- void EmitCVDefRangeDirective(
+ void emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeFramePointerRelHeader DRHdr) override;
- void EmitCVStringTableDirective() override;
- void EmitCVFileChecksumsDirective() override;
- void EmitCVFileChecksumOffsetDirective(unsigned FileNo) override;
+ void emitCVStringTableDirective() override;
+ void emitCVFileChecksumsDirective() override;
+ void emitCVFileChecksumOffsetDirective(unsigned FileNo) override;
void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc L) override;
- void EmitIdent(StringRef IdentString) override;
- void EmitCFIBKeyFrame() override;
- void EmitCFISections(bool EH, bool Debug) override;
- void EmitCFIDefCfa(int64_t Register, int64_t Offset) override;
- void EmitCFIDefCfaOffset(int64_t Offset) override;
- void EmitCFIDefCfaRegister(int64_t Register) override;
- void EmitCFIOffset(int64_t Register, int64_t Offset) override;
- void EmitCFIPersonality(const MCSymbol *Sym, unsigned Encoding) override;
- void EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) override;
- void EmitCFIRememberState() override;
- void EmitCFIRestoreState() override;
- void EmitCFIRestore(int64_t Register) override;
- void EmitCFISameValue(int64_t Register) override;
- void EmitCFIRelOffset(int64_t Register, int64_t Offset) override;
- void EmitCFIAdjustCfaOffset(int64_t Adjustment) override;
- void EmitCFIEscape(StringRef Values) override;
- void EmitCFIGnuArgsSize(int64_t Size) override;
- void EmitCFISignalFrame() override;
- void EmitCFIUndefined(int64_t Register) override;
- void EmitCFIRegister(int64_t Register1, int64_t Register2) override;
- void EmitCFIWindowSave() override;
- void EmitCFINegateRAState() override;
- void EmitCFIReturnColumn(int64_t Register) override;
+ void emitIdent(StringRef IdentString) override;
+ void emitCFIBKeyFrame() override;
+ void emitCFISections(bool EH, bool Debug) override;
+ void emitCFIDefCfa(int64_t Register, int64_t Offset) override;
+ void emitCFIDefCfaOffset(int64_t Offset) override;
+ void emitCFIDefCfaRegister(int64_t Register) override;
+ void emitCFIOffset(int64_t Register, int64_t Offset) override;
+ void emitCFIPersonality(const MCSymbol *Sym, unsigned Encoding) override;
+ void emitCFILsda(const MCSymbol *Sym, unsigned Encoding) override;
+ void emitCFIRememberState() override;
+ void emitCFIRestoreState() override;
+ void emitCFIRestore(int64_t Register) override;
+ void emitCFISameValue(int64_t Register) override;
+ void emitCFIRelOffset(int64_t Register, int64_t Offset) override;
+ void emitCFIAdjustCfaOffset(int64_t Adjustment) override;
+ void emitCFIEscape(StringRef Values) override;
+ void emitCFIGnuArgsSize(int64_t Size) override;
+ void emitCFISignalFrame() override;
+ void emitCFIUndefined(int64_t Register) override;
+ void emitCFIRegister(int64_t Register1, int64_t Register2) override;
+ void emitCFIWindowSave() override;
+ void emitCFINegateRAState() override;
+ void emitCFIReturnColumn(int64_t Register) override;
void EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) override;
void EmitWinCFIEndProc(SMLoc Loc) override;
@@ -333,25 +343,25 @@ public:
void emitCGProfileEntry(const MCSymbolRefExpr *From,
const MCSymbolRefExpr *To, uint64_t Count) override;
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
- void EmitBundleAlignMode(unsigned AlignPow2) override;
- void EmitBundleLock(bool AlignToEnd) override;
- void EmitBundleUnlock() override;
+ void emitBundleAlignMode(unsigned AlignPow2) override;
+ void emitBundleLock(bool AlignToEnd) override;
+ void emitBundleUnlock() override;
- bool EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc Loc,
- const MCSubtargetInfo &STI) override;
+ Optional<std::pair<bool, std::string>>
+ emitRelocDirective(const MCExpr &Offset, StringRef Name, const MCExpr *Expr,
+ SMLoc Loc, const MCSubtargetInfo &STI) override;
- void EmitAddrsig() override;
- void EmitAddrsigSym(const MCSymbol *Sym) override;
+ void emitAddrsig() override;
+ void emitAddrsigSym(const MCSymbol *Sym) override;
/// If this file is backed by an assembly streamer, this dumps the specified
/// string in the output .s file. This capability is indicated by the
/// hasRawTextSupport() predicate.
- void EmitRawTextImpl(StringRef String) override;
+ void emitRawTextImpl(StringRef String) override;
- void FinishImpl() override;
+ void finishImpl() override;
};
} // end anonymous namespace.
@@ -443,7 +453,7 @@ void MCAsmStreamer::emitExplicitComments() {
ExplicitCommentToEmit.clear();
}
-void MCAsmStreamer::ChangeSection(MCSection *Section,
+void MCAsmStreamer::changeSection(MCSection *Section,
const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
if (MCTargetStreamer *TS = getTargetStreamer()) {
@@ -463,8 +473,8 @@ void MCAsmStreamer::emitELFSymverDirective(StringRef AliasName,
EmitEOL();
}
-void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
- MCStreamer::EmitLabel(Symbol, Loc);
+void MCAsmStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::emitLabel(Symbol, Loc);
Symbol->print(OS, MAI);
OS << MAI->getLabelSuffix();
@@ -472,7 +482,7 @@ void MCAsmStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
EmitEOL();
}
-void MCAsmStreamer::EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {
+void MCAsmStreamer::emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {
StringRef str = MCLOHIdToName(Kind);
#ifndef NDEBUG
@@ -492,7 +502,7 @@ void MCAsmStreamer::EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) {
EmitEOL();
}
-void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+void MCAsmStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
switch (Flag) {
case MCAF_SyntaxUnified: OS << "\t.syntax unified"; break;
case MCAF_SubsectionsViaSymbols: OS << ".subsections_via_symbols"; break;
@@ -503,7 +513,7 @@ void MCAsmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
EmitEOL();
}
-void MCAsmStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
+void MCAsmStreamer::emitLinkerOptions(ArrayRef<std::string> Options) {
assert(!Options.empty() && "At least one option is required!");
OS << "\t.linker_option \"" << Options[0] << '"';
for (ArrayRef<std::string>::iterator it = Options.begin() + 1,
@@ -513,7 +523,7 @@ void MCAsmStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
EmitEOL();
}
-void MCAsmStreamer::EmitDataRegion(MCDataRegionType Kind) {
+void MCAsmStreamer::emitDataRegion(MCDataRegionType Kind) {
if (!MAI->doesSupportDataRegionDirectives())
return;
switch (Kind) {
@@ -549,7 +559,7 @@ static void EmitSDKVersionSuffix(raw_ostream &OS,
}
}
-void MCAsmStreamer::EmitVersionMin(MCVersionMinType Type, unsigned Major,
+void MCAsmStreamer::emitVersionMin(MCVersionMinType Type, unsigned Major,
unsigned Minor, unsigned Update,
VersionTuple SDKVersion) {
OS << '\t' << getVersionMinDirective(Type) << ' ' << Major << ", " << Minor;
@@ -574,7 +584,7 @@ static const char *getPlatformName(MachO::PlatformType Type) {
llvm_unreachable("Invalid Mach-O platform type");
}
-void MCAsmStreamer::EmitBuildVersion(unsigned Platform, unsigned Major,
+void MCAsmStreamer::emitBuildVersion(unsigned Platform, unsigned Major,
unsigned Minor, unsigned Update,
VersionTuple SDKVersion) {
const char *PlatformName = getPlatformName((MachO::PlatformType)Platform);
@@ -585,7 +595,7 @@ void MCAsmStreamer::EmitBuildVersion(unsigned Platform, unsigned Major,
EmitEOL();
}
-void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
+void MCAsmStreamer::emitThumbFunc(MCSymbol *Func) {
// This needs to emit to a temporary string to get properly quoted
// MCSymbols when they have spaces in them.
OS << "\t.thumb_func";
@@ -597,7 +607,7 @@ void MCAsmStreamer::EmitThumbFunc(MCSymbol *Func) {
EmitEOL();
}
-void MCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+void MCAsmStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
// Do not emit a .set on inlined target assignments.
bool EmitSet = true;
if (auto *E = dyn_cast<MCTargetExpr>(Value))
@@ -612,10 +622,10 @@ void MCAsmStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
EmitEOL();
}
- MCStreamer::EmitAssignment(Symbol, Value);
+ MCStreamer::emitAssignment(Symbol, Value);
}
-void MCAsmStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
+void MCAsmStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
OS << ".weakref ";
Alias->print(OS, MAI);
OS << ", ";
@@ -623,7 +633,7 @@ void MCAsmStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
EmitEOL();
}
-bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool MCAsmStreamer::emitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) {
switch (Attribute) {
case MCSA_Invalid: llvm_unreachable("Invalid symbol attribute");
@@ -672,6 +682,9 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
break;
case MCSA_Protected: OS << "\t.protected\t"; break;
case MCSA_Reference: OS << "\t.reference\t"; break;
+ case MCSA_Extern:
+ OS << "\t.extern\t";
+ break;
case MCSA_Weak: OS << MAI->getWeakDirective(); break;
case MCSA_WeakDefinition:
OS << "\t.weak_definition\t";
@@ -690,14 +703,14 @@ bool MCAsmStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
return true;
}
-void MCAsmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+void MCAsmStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
OS << ".desc" << ' ';
Symbol->print(OS, MAI);
OS << ',' << DescValue;
EmitEOL();
}
-void MCAsmStreamer::EmitSyntaxDirective() {
+void MCAsmStreamer::emitSyntaxDirective() {
if (MAI->getAssemblerDialect() == 1) {
OS << "\t.intel_syntax noprefix";
EmitEOL();
@@ -768,7 +781,7 @@ void MCAsmStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {
// We need an XCOFF-specific version of this directive as the AIX syntax
// requires a QualName argument identifying the csect name and storage mapping
// class to appear before the alignment if we are specifying it.
-void MCAsmStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
+void MCAsmStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
uint64_t Size,
MCSymbol *CsectSym,
unsigned ByteAlignment) {
@@ -785,6 +798,65 @@ void MCAsmStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
EmitEOL();
}
+void MCAsmStreamer::emitXCOFFSymbolLinkageWithVisibility(
+ MCSymbol *Symbol, MCSymbolAttr Linkage, MCSymbolAttr Visibility) {
+ // Print the symbol's rename (used when the original name contains invalid
+ // characters), if there is one.
+ if (cast<MCSymbolXCOFF>(Symbol)->hasRename())
+ emitXCOFFRenameDirective(Symbol,
+ cast<MCSymbolXCOFF>(Symbol)->getSymbolTableName());
+
+ switch (Linkage) {
+ case MCSA_Global:
+ OS << MAI->getGlobalDirective();
+ break;
+ case MCSA_Weak:
+ OS << MAI->getWeakDirective();
+ break;
+ case MCSA_Extern:
+ OS << "\t.extern\t";
+ break;
+ case MCSA_LGlobal:
+ OS << "\t.lglobl\t";
+ break;
+ default:
+ report_fatal_error("unhandled linkage type");
+ }
+
+ Symbol->print(OS, MAI);
+
+ switch (Visibility) {
+ case MCSA_Invalid:
+ // Nothing to do.
+ break;
+ case MCSA_Hidden:
+ OS << ",hidden";
+ break;
+ case MCSA_Protected:
+ OS << ",protected";
+ break;
+ default:
+ report_fatal_error("unexpected value for Visibility type");
+ }
+ EmitEOL();
+}
+
+void MCAsmStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
+ StringRef Rename) {
+ OS << "\t.rename\t";
+ Name->print(OS, MAI);
+ const char DQ = '"';
+ OS << ',' << DQ;
+ for (char C : Rename) {
+ // To escape a double quote character, the character should be doubled.
+ if (C == DQ)
+ OS << DQ;
+ OS << C;
+ }
+ OS << DQ;
+ EmitEOL();
+}
+
void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
assert(MAI->hasDotTypeDotSizeDirective());
OS << "\t.size\t";
@@ -794,8 +866,14 @@ void MCAsmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
EmitEOL();
}
-void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCAsmStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
+  // Print the symbol's rename (needed when the original name contains invalid
+  // character(s)) if there is one.
+ MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(Symbol);
+ if (XSym && XSym->hasRename())
+ emitXCOFFRenameDirective(XSym, XSym->getSymbolTableName());
+
OS << "\t.comm\t";
Symbol->print(OS, MAI);
OS << ',' << Size;
@@ -809,7 +887,7 @@ void MCAsmStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
EmitEOL();
}
-void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCAsmStreamer::emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlign) {
OS << "\t.lcomm\t";
Symbol->print(OS, MAI);
@@ -831,7 +909,7 @@ void MCAsmStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
EmitEOL();
}
-void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void MCAsmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
if (Symbol)
@@ -845,7 +923,7 @@ void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
// This is a mach-o specific directive.
const MCSectionMachO *MOSection = ((const MCSectionMachO*)Section);
- OS << MOSection->getSegmentName() << "," << MOSection->getSectionName();
+ OS << MOSection->getSegmentName() << "," << MOSection->getName();
if (Symbol) {
OS << ',';
@@ -860,7 +938,7 @@ void MCAsmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
// .tbss sym, size, align
// This assumes the symbol has already been mangled from the original,
// e.g. _a.
-void MCAsmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+void MCAsmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
AssignFragment(Symbol, &Section->getDummyFragment());
@@ -917,7 +995,7 @@ static void PrintQuotedString(StringRef Data, raw_ostream &OS) {
OS << '"';
}
-void MCAsmStreamer::EmitBytes(StringRef Data) {
+void MCAsmStreamer::emitBytes(StringRef Data) {
assert(getCurrentSectionOnly() &&
"Cannot emit contents before setting section!");
if (Data.empty()) return;
@@ -951,7 +1029,7 @@ void MCAsmStreamer::EmitBytes(StringRef Data) {
EmitEOL();
}
-void MCAsmStreamer::EmitBinaryData(StringRef Data) {
+void MCAsmStreamer::emitBinaryData(StringRef Data) {
// This is binary data. Print it in a grid of hex bytes for readability.
const size_t Cols = 4;
for (size_t I = 0, EI = alignTo(Data.size(), Cols); I < EI; I += Cols) {
@@ -965,15 +1043,20 @@ void MCAsmStreamer::EmitBinaryData(StringRef Data) {
}
}
-void MCAsmStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
- EmitValue(MCConstantExpr::create(Value, getContext()), Size);
+void MCAsmStreamer::emitIntValue(uint64_t Value, unsigned Size) {
+ emitValue(MCConstantExpr::create(Value, getContext()), Size);
+}
+
+void MCAsmStreamer::emitIntValueInHex(uint64_t Value, unsigned Size) {
+ emitValue(MCConstantExpr::create(Value, getContext(), true), Size);
}
-void MCAsmStreamer::EmitIntValueInHex(uint64_t Value, unsigned Size) {
- EmitValue(MCConstantExpr::create(Value, getContext(), true), Size);
+void MCAsmStreamer::emitIntValueInHexWithPadding(uint64_t Value,
+ unsigned Size) {
+ emitValue(MCConstantExpr::create(Value, getContext(), true, Size), Size);
}
-void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+void MCAsmStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
assert(Size <= 8 && "Invalid size");
assert(getCurrentSectionOnly() &&
@@ -1015,7 +1098,7 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
std::numeric_limits<unsigned long long>::digits) &&
"undefined behavior");
ValueToEmit &= ~0ULL >> Shift;
- EmitIntValue(ValueToEmit, EmissionSize);
+ emitIntValue(ValueToEmit, EmissionSize);
Emitted += EmissionSize;
}
return;
@@ -1031,10 +1114,10 @@ void MCAsmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
}
}
-void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
+void MCAsmStreamer::emitULEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->evaluateAsAbsolute(IntValue)) {
- EmitULEB128IntValue(IntValue);
+ emitULEB128IntValue(IntValue);
return;
}
OS << "\t.uleb128 ";
@@ -1042,10 +1125,10 @@ void MCAsmStreamer::EmitULEB128Value(const MCExpr *Value) {
EmitEOL();
}
-void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
+void MCAsmStreamer::emitSLEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->evaluateAsAbsolute(IntValue)) {
- EmitSLEB128IntValue(IntValue);
+ emitSLEB128IntValue(IntValue);
return;
}
OS << "\t.sleb128 ";
@@ -1053,42 +1136,42 @@ void MCAsmStreamer::EmitSLEB128Value(const MCExpr *Value) {
EmitEOL();
}
-void MCAsmStreamer::EmitDTPRel64Value(const MCExpr *Value) {
+void MCAsmStreamer::emitDTPRel64Value(const MCExpr *Value) {
assert(MAI->getDTPRel64Directive() != nullptr);
OS << MAI->getDTPRel64Directive();
Value->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitDTPRel32Value(const MCExpr *Value) {
+void MCAsmStreamer::emitDTPRel32Value(const MCExpr *Value) {
assert(MAI->getDTPRel32Directive() != nullptr);
OS << MAI->getDTPRel32Directive();
Value->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitTPRel64Value(const MCExpr *Value) {
+void MCAsmStreamer::emitTPRel64Value(const MCExpr *Value) {
assert(MAI->getTPRel64Directive() != nullptr);
OS << MAI->getTPRel64Directive();
Value->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitTPRel32Value(const MCExpr *Value) {
+void MCAsmStreamer::emitTPRel32Value(const MCExpr *Value) {
assert(MAI->getTPRel32Directive() != nullptr);
OS << MAI->getTPRel32Directive();
Value->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitGPRel64Value(const MCExpr *Value) {
+void MCAsmStreamer::emitGPRel64Value(const MCExpr *Value) {
assert(MAI->getGPRel64Directive() != nullptr);
OS << MAI->getGPRel64Directive();
Value->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
+void MCAsmStreamer::emitGPRel32Value(const MCExpr *Value) {
assert(MAI->getGPRel32Directive() != nullptr);
OS << MAI->getGPRel32Directive();
Value->print(OS, MAI);
@@ -1098,16 +1181,27 @@ void MCAsmStreamer::EmitGPRel32Value(const MCExpr *Value) {
void MCAsmStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc) {
int64_t IntNumBytes;
- if (NumBytes.evaluateAsAbsolute(IntNumBytes) && IntNumBytes == 0)
+ const bool IsAbsolute = NumBytes.evaluateAsAbsolute(IntNumBytes);
+ if (IsAbsolute && IntNumBytes == 0)
return;
if (const char *ZeroDirective = MAI->getZeroDirective()) {
- // FIXME: Emit location directives
- OS << ZeroDirective;
- NumBytes.print(OS, MAI);
- if (FillValue != 0)
- OS << ',' << (int)FillValue;
- EmitEOL();
+ if (MAI->doesZeroDirectiveSupportNonZeroValue() || FillValue == 0) {
+ // FIXME: Emit location directives
+ OS << ZeroDirective;
+ NumBytes.print(OS, MAI);
+ if (FillValue != 0)
+ OS << ',' << (int)FillValue;
+ EmitEOL();
+ } else {
+ if (!IsAbsolute)
+ report_fatal_error(
+ "Cannot emit non-absolute expression lengths of fill.");
+ for (int i = 0; i < IntNumBytes; ++i) {
+ OS << MAI->getData8bitsDirective() << (int)FillValue;
+ EmitEOL();
+ }
+ }
return;
}
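
A minimal standalone sketch of the fallback the emitFill hunk above introduces, assuming illustrative ".zero"/".byte" directive spellings and plain stdio in place of the streamer; emitFillSketch is a hypothetical name used only for illustration (the real code consults MCAsmInfo for the directive strings):

#include <cstdint>
#include <cstdio>

// Sketch only: mirrors the branch structure of the emitFill change above,
// not the real streamer API.
static void emitFillSketch(uint64_t NumBytes, uint8_t FillValue,
                           bool ZeroDirectiveSupportsNonZeroValue) {
  if (NumBytes == 0)
    return;
  if (ZeroDirectiveSupportsNonZeroValue || FillValue == 0) {
    std::printf("\t.zero\t%llu", (unsigned long long)NumBytes);
    if (FillValue != 0)
      std::printf(",%d", (int)FillValue);
    std::printf("\n");
    return;
  }
  // The zero directive cannot express a non-zero fill value, so emit one
  // 8-bit data directive per byte instead.
  for (uint64_t I = 0; I < NumBytes; ++I)
    std::printf("\t.byte\t%d\n", (int)FillValue);
}
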
@@ -1124,7 +1218,7 @@ void MCAsmStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
EmitEOL();
}
-void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+void MCAsmStreamer::emitValueToAlignment(unsigned ByteAlignment, int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {
if (MAI->useDotAlignForAlignment()) {
@@ -1186,10 +1280,10 @@ void MCAsmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
EmitEOL();
}
-void MCAsmStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+void MCAsmStreamer::emitCodeAlignment(unsigned ByteAlignment,
unsigned MaxBytesToEmit) {
// Emit with a text fill value.
- EmitValueToAlignment(ByteAlignment, MAI->getTextAlignFillValue(),
+ emitValueToAlignment(ByteAlignment, MAI->getTextAlignFillValue(),
1, MaxBytesToEmit);
}
@@ -1203,7 +1297,7 @@ void MCAsmStreamer::emitValueToOffset(const MCExpr *Offset,
EmitEOL();
}
-void MCAsmStreamer::EmitFileDirective(StringRef Filename) {
+void MCAsmStreamer::emitFileDirective(StringRef Filename) {
assert(MAI->hasSingleParameterDotFile());
OS << "\t.file\t";
PrintQuotedString(Filename, OS);
@@ -1267,7 +1361,7 @@ Expected<unsigned> MCAsmStreamer::tryEmitDwarfFileDirective(
if (MCTargetStreamer *TS = getTargetStreamer())
TS->emitDwarfFileDirective(OS1.str());
else
- EmitRawText(OS1.str());
+ emitRawText(OS1.str());
return FileNo;
}
@@ -1293,13 +1387,12 @@ void MCAsmStreamer::emitDwarfFile0Directive(StringRef Directory,
if (MCTargetStreamer *TS = getTargetStreamer())
TS->emitDwarfFileDirective(OS1.str());
else
- EmitRawText(OS1.str());
+ emitRawText(OS1.str());
}
-void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+void MCAsmStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
- unsigned Isa,
- unsigned Discriminator,
+ unsigned Isa, unsigned Discriminator,
StringRef FileName) {
OS << "\t.loc\t" << FileNo << " " << Line << " " << Column;
if (MAI->supportsExtendedDwarfLocDirective()) {
@@ -1332,8 +1425,8 @@ void MCAsmStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
<< Line << ':' << Column;
}
EmitEOL();
- this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
- Isa, Discriminator, FileName);
+ this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
+ Discriminator, FileName);
}
MCSymbol *MCAsmStreamer::getDwarfLineTableSymbol(unsigned CUID) {
@@ -1381,7 +1474,7 @@ bool MCAsmStreamer::EmitCVInlineSiteIdDirective(unsigned FunctionId,
IALine, IACol, Loc);
}
-void MCAsmStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
+void MCAsmStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
unsigned Line, unsigned Column,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) {
@@ -1405,7 +1498,7 @@ void MCAsmStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
EmitEOL();
}
-void MCAsmStreamer::EmitCVLinetableDirective(unsigned FunctionId,
+void MCAsmStreamer::emitCVLinetableDirective(unsigned FunctionId,
const MCSymbol *FnStart,
const MCSymbol *FnEnd) {
OS << "\t.cv_linetable\t" << FunctionId << ", ";
@@ -1413,10 +1506,10 @@ void MCAsmStreamer::EmitCVLinetableDirective(unsigned FunctionId,
OS << ", ";
FnEnd->print(OS, MAI);
EmitEOL();
- this->MCStreamer::EmitCVLinetableDirective(FunctionId, FnStart, FnEnd);
+ this->MCStreamer::emitCVLinetableDirective(FunctionId, FnStart, FnEnd);
}
-void MCAsmStreamer::EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
+void MCAsmStreamer::emitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
unsigned SourceFileId,
unsigned SourceLineNum,
const MCSymbol *FnStartSym,
@@ -1427,7 +1520,7 @@ void MCAsmStreamer::EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
OS << ' ';
FnEndSym->print(OS, MAI);
EmitEOL();
- this->MCStreamer::EmitCVInlineLinetableDirective(
+ this->MCStreamer::emitCVInlineLinetableDirective(
PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym);
}
@@ -1442,7 +1535,7 @@ void MCAsmStreamer::PrintCVDefRangePrefix(
}
}
-void MCAsmStreamer::EmitCVDefRangeDirective(
+void MCAsmStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeRegisterRelHeader DRHdr) {
PrintCVDefRangePrefix(Ranges);
@@ -1452,7 +1545,7 @@ void MCAsmStreamer::EmitCVDefRangeDirective(
EmitEOL();
}
-void MCAsmStreamer::EmitCVDefRangeDirective(
+void MCAsmStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeSubfieldRegisterHeader DRHdr) {
PrintCVDefRangePrefix(Ranges);
@@ -1461,7 +1554,7 @@ void MCAsmStreamer::EmitCVDefRangeDirective(
EmitEOL();
}
-void MCAsmStreamer::EmitCVDefRangeDirective(
+void MCAsmStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeRegisterHeader DRHdr) {
PrintCVDefRangePrefix(Ranges);
@@ -1470,7 +1563,7 @@ void MCAsmStreamer::EmitCVDefRangeDirective(
EmitEOL();
}
-void MCAsmStreamer::EmitCVDefRangeDirective(
+void MCAsmStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeFramePointerRelHeader DRHdr) {
PrintCVDefRangePrefix(Ranges);
@@ -1479,17 +1572,17 @@ void MCAsmStreamer::EmitCVDefRangeDirective(
EmitEOL();
}
-void MCAsmStreamer::EmitCVStringTableDirective() {
+void MCAsmStreamer::emitCVStringTableDirective() {
OS << "\t.cv_stringtable";
EmitEOL();
}
-void MCAsmStreamer::EmitCVFileChecksumsDirective() {
+void MCAsmStreamer::emitCVFileChecksumsDirective() {
OS << "\t.cv_filechecksums";
EmitEOL();
}
-void MCAsmStreamer::EmitCVFileChecksumOffsetDirective(unsigned FileNo) {
+void MCAsmStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
OS << "\t.cv_filechecksumoffset\t" << FileNo;
EmitEOL();
}
@@ -1500,15 +1593,15 @@ void MCAsmStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc L) {
EmitEOL();
}
-void MCAsmStreamer::EmitIdent(StringRef IdentString) {
+void MCAsmStreamer::emitIdent(StringRef IdentString) {
assert(MAI->hasIdentDirective() && ".ident directive not supported");
OS << "\t.ident\t";
PrintQuotedString(IdentString, OS);
EmitEOL();
}
-void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
- MCStreamer::EmitCFISections(EH, Debug);
+void MCAsmStreamer::emitCFISections(bool EH, bool Debug) {
+ MCStreamer::emitCFISections(EH, Debug);
OS << "\t.cfi_sections ";
if (EH) {
OS << ".eh_frame";
@@ -1521,15 +1614,15 @@ void MCAsmStreamer::EmitCFISections(bool EH, bool Debug) {
EmitEOL();
}
-void MCAsmStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+void MCAsmStreamer::emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
OS << "\t.cfi_startproc";
if (Frame.IsSimple)
OS << " simple";
EmitEOL();
}
-void MCAsmStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
- MCStreamer::EmitCFIEndProcImpl(Frame);
+void MCAsmStreamer::emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+ MCStreamer::emitCFIEndProcImpl(Frame);
OS << "\t.cfi_endproc";
EmitEOL();
}
@@ -1548,16 +1641,16 @@ void MCAsmStreamer::EmitRegisterName(int64_t Register) {
OS << Register;
}
-void MCAsmStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
- MCStreamer::EmitCFIDefCfa(Register, Offset);
+void MCAsmStreamer::emitCFIDefCfa(int64_t Register, int64_t Offset) {
+ MCStreamer::emitCFIDefCfa(Register, Offset);
OS << "\t.cfi_def_cfa ";
EmitRegisterName(Register);
OS << ", " << Offset;
EmitEOL();
}
-void MCAsmStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
- MCStreamer::EmitCFIDefCfaOffset(Offset);
+void MCAsmStreamer::emitCFIDefCfaOffset(int64_t Offset) {
+ MCStreamer::emitCFIDefCfaOffset(Offset);
OS << "\t.cfi_def_cfa_offset " << Offset;
EmitEOL();
}
@@ -1572,14 +1665,14 @@ static void PrintCFIEscape(llvm::formatted_raw_ostream &OS, StringRef Values) {
}
}
-void MCAsmStreamer::EmitCFIEscape(StringRef Values) {
- MCStreamer::EmitCFIEscape(Values);
+void MCAsmStreamer::emitCFIEscape(StringRef Values) {
+ MCStreamer::emitCFIEscape(Values);
PrintCFIEscape(OS, Values);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIGnuArgsSize(int64_t Size) {
- MCStreamer::EmitCFIGnuArgsSize(Size);
+void MCAsmStreamer::emitCFIGnuArgsSize(int64_t Size) {
+ MCStreamer::emitCFIGnuArgsSize(Size);
uint8_t Buffer[16] = { dwarf::DW_CFA_GNU_args_size };
unsigned Len = encodeULEB128(Size, Buffer + 1) + 1;
@@ -1588,114 +1681,119 @@ void MCAsmStreamer::EmitCFIGnuArgsSize(int64_t Size) {
EmitEOL();
}
-void MCAsmStreamer::EmitCFIDefCfaRegister(int64_t Register) {
- MCStreamer::EmitCFIDefCfaRegister(Register);
+void MCAsmStreamer::emitCFIDefCfaRegister(int64_t Register) {
+ MCStreamer::emitCFIDefCfaRegister(Register);
OS << "\t.cfi_def_cfa_register ";
EmitRegisterName(Register);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
- this->MCStreamer::EmitCFIOffset(Register, Offset);
+void MCAsmStreamer::emitCFIOffset(int64_t Register, int64_t Offset) {
+ this->MCStreamer::emitCFIOffset(Register, Offset);
OS << "\t.cfi_offset ";
EmitRegisterName(Register);
OS << ", " << Offset;
EmitEOL();
}
-void MCAsmStreamer::EmitCFIPersonality(const MCSymbol *Sym,
+void MCAsmStreamer::emitCFIPersonality(const MCSymbol *Sym,
unsigned Encoding) {
- MCStreamer::EmitCFIPersonality(Sym, Encoding);
+ MCStreamer::emitCFIPersonality(Sym, Encoding);
OS << "\t.cfi_personality " << Encoding << ", ";
Sym->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
- MCStreamer::EmitCFILsda(Sym, Encoding);
+void MCAsmStreamer::emitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
+ MCStreamer::emitCFILsda(Sym, Encoding);
OS << "\t.cfi_lsda " << Encoding << ", ";
Sym->print(OS, MAI);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIRememberState() {
- MCStreamer::EmitCFIRememberState();
+void MCAsmStreamer::emitCFIRememberState() {
+ MCStreamer::emitCFIRememberState();
OS << "\t.cfi_remember_state";
EmitEOL();
}
-void MCAsmStreamer::EmitCFIRestoreState() {
- MCStreamer::EmitCFIRestoreState();
+void MCAsmStreamer::emitCFIRestoreState() {
+ MCStreamer::emitCFIRestoreState();
OS << "\t.cfi_restore_state";
EmitEOL();
}
-void MCAsmStreamer::EmitCFIRestore(int64_t Register) {
- MCStreamer::EmitCFIRestore(Register);
+void MCAsmStreamer::emitCFIRestore(int64_t Register) {
+ MCStreamer::emitCFIRestore(Register);
OS << "\t.cfi_restore ";
EmitRegisterName(Register);
EmitEOL();
}
-void MCAsmStreamer::EmitCFISameValue(int64_t Register) {
- MCStreamer::EmitCFISameValue(Register);
+void MCAsmStreamer::emitCFISameValue(int64_t Register) {
+ MCStreamer::emitCFISameValue(Register);
OS << "\t.cfi_same_value ";
EmitRegisterName(Register);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
- MCStreamer::EmitCFIRelOffset(Register, Offset);
+void MCAsmStreamer::emitCFIRelOffset(int64_t Register, int64_t Offset) {
+ MCStreamer::emitCFIRelOffset(Register, Offset);
OS << "\t.cfi_rel_offset ";
EmitRegisterName(Register);
OS << ", " << Offset;
EmitEOL();
}
-void MCAsmStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
- MCStreamer::EmitCFIAdjustCfaOffset(Adjustment);
+void MCAsmStreamer::emitCFIAdjustCfaOffset(int64_t Adjustment) {
+ MCStreamer::emitCFIAdjustCfaOffset(Adjustment);
OS << "\t.cfi_adjust_cfa_offset " << Adjustment;
EmitEOL();
}
-void MCAsmStreamer::EmitCFISignalFrame() {
- MCStreamer::EmitCFISignalFrame();
+void MCAsmStreamer::emitCFISignalFrame() {
+ MCStreamer::emitCFISignalFrame();
OS << "\t.cfi_signal_frame";
EmitEOL();
}
-void MCAsmStreamer::EmitCFIUndefined(int64_t Register) {
- MCStreamer::EmitCFIUndefined(Register);
- OS << "\t.cfi_undefined " << Register;
+void MCAsmStreamer::emitCFIUndefined(int64_t Register) {
+ MCStreamer::emitCFIUndefined(Register);
+ OS << "\t.cfi_undefined ";
+ EmitRegisterName(Register);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
- MCStreamer::EmitCFIRegister(Register1, Register2);
- OS << "\t.cfi_register " << Register1 << ", " << Register2;
+void MCAsmStreamer::emitCFIRegister(int64_t Register1, int64_t Register2) {
+ MCStreamer::emitCFIRegister(Register1, Register2);
+ OS << "\t.cfi_register ";
+ EmitRegisterName(Register1);
+ OS << ", ";
+ EmitRegisterName(Register2);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIWindowSave() {
- MCStreamer::EmitCFIWindowSave();
+void MCAsmStreamer::emitCFIWindowSave() {
+ MCStreamer::emitCFIWindowSave();
OS << "\t.cfi_window_save";
EmitEOL();
}
-void MCAsmStreamer::EmitCFINegateRAState() {
- MCStreamer::EmitCFINegateRAState();
+void MCAsmStreamer::emitCFINegateRAState() {
+ MCStreamer::emitCFINegateRAState();
OS << "\t.cfi_negate_ra_state";
EmitEOL();
}
-void MCAsmStreamer::EmitCFIReturnColumn(int64_t Register) {
- MCStreamer::EmitCFIReturnColumn(Register);
- OS << "\t.cfi_return_column " << Register;
+void MCAsmStreamer::emitCFIReturnColumn(int64_t Register) {
+ MCStreamer::emitCFIReturnColumn(Register);
+ OS << "\t.cfi_return_column ";
+ EmitRegisterName(Register);
EmitEOL();
}
-void MCAsmStreamer::EmitCFIBKeyFrame() {
- MCStreamer::EmitCFIBKeyFrame();
+void MCAsmStreamer::emitCFIBKeyFrame() {
+ MCStreamer::emitCFIBKeyFrame();
OS << "\t.cfi_b_key_frame";
EmitEOL();
}
@@ -1931,7 +2029,7 @@ void MCAsmStreamer::AddEncodingComment(const MCInst &Inst,
}
}
-void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
+void MCAsmStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
assert(getCurrentSectionOnly() &&
"Cannot emit contents before setting section!");
@@ -1957,26 +2055,27 @@ void MCAsmStreamer::EmitInstruction(const MCInst &Inst,
EmitEOL();
}
-void MCAsmStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
+void MCAsmStreamer::emitBundleAlignMode(unsigned AlignPow2) {
OS << "\t.bundle_align_mode " << AlignPow2;
EmitEOL();
}
-void MCAsmStreamer::EmitBundleLock(bool AlignToEnd) {
+void MCAsmStreamer::emitBundleLock(bool AlignToEnd) {
OS << "\t.bundle_lock";
if (AlignToEnd)
OS << " align_to_end";
EmitEOL();
}
-void MCAsmStreamer::EmitBundleUnlock() {
+void MCAsmStreamer::emitBundleUnlock() {
OS << "\t.bundle_unlock";
EmitEOL();
}
-bool MCAsmStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc,
- const MCSubtargetInfo &STI) {
+Optional<std::pair<bool, std::string>>
+MCAsmStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
+ const MCExpr *Expr, SMLoc,
+ const MCSubtargetInfo &STI) {
OS << "\t.reloc ";
Offset.print(OS, MAI);
OS << ", " << Name;
@@ -1985,15 +2084,15 @@ bool MCAsmStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
Expr->print(OS, MAI);
}
EmitEOL();
- return false;
+ return None;
}
-void MCAsmStreamer::EmitAddrsig() {
+void MCAsmStreamer::emitAddrsig() {
OS << "\t.addrsig";
EmitEOL();
}
-void MCAsmStreamer::EmitAddrsigSym(const MCSymbol *Sym) {
+void MCAsmStreamer::emitAddrsigSym(const MCSymbol *Sym) {
OS << "\t.addrsig_sym ";
Sym->print(OS, MAI);
EmitEOL();
@@ -2002,14 +2101,14 @@ void MCAsmStreamer::EmitAddrsigSym(const MCSymbol *Sym) {
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
-void MCAsmStreamer::EmitRawTextImpl(StringRef String) {
+void MCAsmStreamer::emitRawTextImpl(StringRef String) {
if (!String.empty() && String.back() == '\n')
String = String.substr(0, String.size()-1);
OS << String;
EmitEOL();
}
-void MCAsmStreamer::FinishImpl() {
+void MCAsmStreamer::finishImpl() {
// If we are generating dwarf for assembly source files dump out the sections.
if (getContext().getGenDwarfForAssembly())
MCGenDwarfInfo::Emit(this);
@@ -2022,7 +2121,7 @@ void MCAsmStreamer::FinishImpl() {
assert(Tables.size() == 1 && "asm output only supports one line table");
if (auto *Label = Tables.begin()->second.getLabel()) {
SwitchSection(getContext().getObjectFileInfo()->getDwarfLineSection());
- EmitLabel(Label);
+ emitLabel(Label);
}
}
}
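
For the .rename support added above, the quoting rule is simply that a literal double quote inside the quoted operand is written twice. A small standalone sketch of just that rule, using a hypothetical helper name and plain std::string rather than the streamer:

#include <string>

static std::string quoteRenameOperand(const std::string &Rename) {
  std::string Out;
  Out.push_back('"');
  for (char C : Rename) {
    if (C == '"')
      Out.push_back('"'); // escape a double quote by doubling it
    Out.push_back(C);
  }
  Out.push_back('"');
  return Out; // e.g. a"b becomes "a""b"
}
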
diff --git a/llvm/lib/MC/MCAssembler.cpp b/llvm/lib/MC/MCAssembler.cpp
index b30137aafb8d..3ca8714b7817 100644
--- a/llvm/lib/MC/MCAssembler.cpp
+++ b/llvm/lib/MC/MCAssembler.cpp
@@ -33,6 +33,7 @@
#include "llvm/Support/Alignment.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
@@ -217,6 +218,14 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
}
assert(getBackendPtr() && "Expected assembler backend");
+ bool IsTarget = getBackendPtr()->getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsTarget;
+
+ if (IsTarget)
+ return getBackend().evaluateTargetFixup(*this, Layout, Fixup, DF, Target,
+ Value, WasForced);
+
+ unsigned FixupFlags = getBackendPtr()->getFixupKindInfo(Fixup.getKind()).Flags;
bool IsPCRel = getBackendPtr()->getFixupKindInfo(Fixup.getKind()).Flags &
MCFixupKindInfo::FKF_IsPCRel;
@@ -232,8 +241,9 @@ bool MCAssembler::evaluateFixup(const MCAsmLayout &Layout,
if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined()) {
IsResolved = false;
} else if (auto *Writer = getWriterPtr()) {
- IsResolved = Writer->isSymbolRefDifferenceFullyResolvedImpl(
- *this, SA, *DF, false, true);
+ IsResolved = (FixupFlags & MCFixupKindInfo::FKF_Constant) ||
+ Writer->isSymbolRefDifferenceFullyResolvedImpl(
+ *this, SA, *DF, false, true);
}
}
} else {
@@ -387,6 +397,9 @@ void MCAsmLayout::layoutFragment(MCFragment *F) {
assert((!Prev || isFragmentValid(Prev)) &&
"Attempt to compute fragment before its predecessor!");
+ assert(!F->IsBeingLaidOut && "Already being laid out!");
+ F->IsBeingLaidOut = true;
+
++stats::FragmentLayouts;
// Compute fragment offset and size.
@@ -394,6 +407,7 @@ void MCAsmLayout::layoutFragment(MCFragment *F) {
F->Offset = Prev->Offset + getAssembler().computeFragmentSize(*this, *Prev);
else
F->Offset = 0;
+ F->IsBeingLaidOut = false;
LastValidFragment[F->getParent()] = F;
// If bundling is enabled and this fragment has instructions in it, it has to
@@ -674,14 +688,16 @@ void MCAssembler::writeSectionData(raw_ostream &OS, const MCSection *Sec,
// directives to fill the contents of virtual sections.
const MCDataFragment &DF = cast<MCDataFragment>(F);
if (DF.fixup_begin() != DF.fixup_end())
- report_fatal_error("cannot have fixups in virtual section!");
+ getContext().reportError(SMLoc(), Sec->getVirtualSectionKind() +
+ " section '" + Sec->getName() +
+ "' cannot have fixups");
for (unsigned i = 0, e = DF.getContents().size(); i != e; ++i)
if (DF.getContents()[i]) {
- if (auto *ELFSec = dyn_cast<const MCSectionELF>(Sec))
- report_fatal_error("non-zero initializer found in section '" +
- ELFSec->getSectionName() + "'");
- else
- report_fatal_error("non-zero initializer found in virtual section");
+ getContext().reportError(SMLoc(),
+ Sec->getVirtualSectionKind() +
+ " section '" + Sec->getName() +
+ "' cannot have non-zero initializers");
+ break;
}
break;
}
@@ -776,9 +792,15 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
}
// Layout until everything fits.
- while (layoutOnce(Layout))
+ while (layoutOnce(Layout)) {
if (getContext().hadError())
return;
+ // Size of fragments in one section can depend on the size of fragments in
+ // another. If any fragment has changed size, we have to re-layout (and
+ // as a result possibly further relax) all.
+ for (MCSection &Sec : *this)
+ Layout.invalidateFragmentsFrom(&*Sec.begin());
+ }
DEBUG_WITH_TYPE("mc-dump", {
errs() << "assembler backend - post-relaxation\n--\n";
@@ -798,48 +820,57 @@ void MCAssembler::layout(MCAsmLayout &Layout) {
// Evaluate and apply the fixups, generating relocation entries as necessary.
for (MCSection &Sec : *this) {
for (MCFragment &Frag : Sec) {
- // Data and relaxable fragments both have fixups. So only process
- // those here.
- // FIXME: Is there a better way to do this? MCEncodedFragmentWithFixups
- // being templated makes this tricky.
- if (isa<MCEncodedFragment>(&Frag) &&
- isa<MCCompactEncodedInstFragment>(&Frag))
- continue;
- if (!isa<MCEncodedFragment>(&Frag) && !isa<MCCVDefRangeFragment>(&Frag) &&
- !isa<MCAlignFragment>(&Frag))
- continue;
ArrayRef<MCFixup> Fixups;
MutableArrayRef<char> Contents;
const MCSubtargetInfo *STI = nullptr;
- if (auto *FragWithFixups = dyn_cast<MCDataFragment>(&Frag)) {
- Fixups = FragWithFixups->getFixups();
- Contents = FragWithFixups->getContents();
- STI = FragWithFixups->getSubtargetInfo();
- assert(!FragWithFixups->hasInstructions() || STI != nullptr);
- } else if (auto *FragWithFixups = dyn_cast<MCRelaxableFragment>(&Frag)) {
- Fixups = FragWithFixups->getFixups();
- Contents = FragWithFixups->getContents();
- STI = FragWithFixups->getSubtargetInfo();
- assert(!FragWithFixups->hasInstructions() || STI != nullptr);
- } else if (auto *FragWithFixups = dyn_cast<MCCVDefRangeFragment>(&Frag)) {
- Fixups = FragWithFixups->getFixups();
- Contents = FragWithFixups->getContents();
- } else if (auto *FragWithFixups = dyn_cast<MCDwarfLineAddrFragment>(&Frag)) {
- Fixups = FragWithFixups->getFixups();
- Contents = FragWithFixups->getContents();
- } else if (auto *AF = dyn_cast<MCAlignFragment>(&Frag)) {
+
+ // Process MCAlignFragment and MCEncodedFragmentWithFixups here.
+ switch (Frag.getKind()) {
+ default:
+ continue;
+ case MCFragment::FT_Align: {
+ MCAlignFragment &AF = cast<MCAlignFragment>(Frag);
// Insert fixup type for code alignment if the target define
// shouldInsertFixupForCodeAlign target hook.
- if (Sec.UseCodeAlign() && AF->hasEmitNops()) {
- getBackend().shouldInsertFixupForCodeAlign(*this, Layout, *AF);
- }
+ if (Sec.UseCodeAlign() && AF.hasEmitNops())
+ getBackend().shouldInsertFixupForCodeAlign(*this, Layout, AF);
continue;
- } else if (auto *FragWithFixups =
- dyn_cast<MCDwarfCallFrameFragment>(&Frag)) {
- Fixups = FragWithFixups->getFixups();
- Contents = FragWithFixups->getContents();
- } else
- llvm_unreachable("Unknown fragment with fixups!");
+ }
+ case MCFragment::FT_Data: {
+ MCDataFragment &DF = cast<MCDataFragment>(Frag);
+ Fixups = DF.getFixups();
+ Contents = DF.getContents();
+ STI = DF.getSubtargetInfo();
+ assert(!DF.hasInstructions() || STI != nullptr);
+ break;
+ }
+ case MCFragment::FT_Relaxable: {
+ MCRelaxableFragment &RF = cast<MCRelaxableFragment>(Frag);
+ Fixups = RF.getFixups();
+ Contents = RF.getContents();
+ STI = RF.getSubtargetInfo();
+ assert(!RF.hasInstructions() || STI != nullptr);
+ break;
+ }
+ case MCFragment::FT_CVDefRange: {
+ MCCVDefRangeFragment &CF = cast<MCCVDefRangeFragment>(Frag);
+ Fixups = CF.getFixups();
+ Contents = CF.getContents();
+ break;
+ }
+ case MCFragment::FT_Dwarf: {
+ MCDwarfLineAddrFragment &DF = cast<MCDwarfLineAddrFragment>(Frag);
+ Fixups = DF.getFixups();
+ Contents = DF.getContents();
+ break;
+ }
+ case MCFragment::FT_DwarfFrame: {
+ MCDwarfCallFrameFragment &DF = cast<MCDwarfCallFrameFragment>(Frag);
+ Fixups = DF.getFixups();
+ Contents = DF.getContents();
+ break;
+ }
+ }
for (const MCFixup &Fixup : Fixups) {
uint64_t FixedValue;
bool IsResolved;
@@ -908,8 +939,8 @@ bool MCAssembler::relaxInstruction(MCAsmLayout &Layout,
// Relax the fragment.
- MCInst Relaxed;
- getBackend().relaxInstruction(F.getInst(), *F.getSubtargetInfo(), Relaxed);
+ MCInst Relaxed = F.getInst();
+ getBackend().relaxInstruction(Relaxed, *F.getSubtargetInfo());
// Encode the new instruction.
//
@@ -987,27 +1018,22 @@ static bool needPadding(uint64_t StartAddr, uint64_t Size,
bool MCAssembler::relaxBoundaryAlign(MCAsmLayout &Layout,
MCBoundaryAlignFragment &BF) {
- // The MCBoundaryAlignFragment that doesn't emit NOP should not be relaxed.
- if (!BF.canEmitNops())
+  // A BoundaryAlignFragment that doesn't need to align any fragment should
+  // not be relaxed.
+ if (!BF.getLastFragment())
return false;
- uint64_t AlignedOffset = Layout.getFragmentOffset(BF.getNextNode());
+ uint64_t AlignedOffset = Layout.getFragmentOffset(&BF);
uint64_t AlignedSize = 0;
- const MCFragment *F = BF.getNextNode();
- // If the branch is unfused, it is emitted into one fragment, otherwise it is
- // emitted into two fragments at most, the next MCBoundaryAlignFragment(if
- // exists) also marks the end of the branch.
- for (auto i = 0, N = BF.isFused() ? 2 : 1;
- i != N && !isa<MCBoundaryAlignFragment>(F); ++i, F = F->getNextNode()) {
+ for (const MCFragment *F = BF.getLastFragment(); F != &BF;
+ F = F->getPrevNode())
AlignedSize += computeFragmentSize(Layout, *F);
- }
- uint64_t OldSize = BF.getSize();
- AlignedOffset -= OldSize;
+
Align BoundaryAlignment = BF.getAlignment();
uint64_t NewSize = needPadding(AlignedOffset, AlignedSize, BoundaryAlignment)
? offsetToAlignment(AlignedOffset, BoundaryAlignment)
: 0U;
- if (NewSize == OldSize)
+ if (NewSize == BF.getSize())
return false;
BF.setSize(NewSize);
Layout.invalidateFragmentsFrom(&BF);
@@ -1099,6 +1125,30 @@ bool MCAssembler::relaxCVDefRange(MCAsmLayout &Layout,
return OldSize != F.getContents().size();
}
+bool MCAssembler::relaxFragment(MCAsmLayout &Layout, MCFragment &F) {
+ switch(F.getKind()) {
+ default:
+ return false;
+ case MCFragment::FT_Relaxable:
+ assert(!getRelaxAll() &&
+ "Did not expect a MCRelaxableFragment in RelaxAll mode");
+ return relaxInstruction(Layout, cast<MCRelaxableFragment>(F));
+ case MCFragment::FT_Dwarf:
+ return relaxDwarfLineAddr(Layout, cast<MCDwarfLineAddrFragment>(F));
+ case MCFragment::FT_DwarfFrame:
+ return relaxDwarfCallFrameFragment(Layout,
+ cast<MCDwarfCallFrameFragment>(F));
+ case MCFragment::FT_LEB:
+ return relaxLEB(Layout, cast<MCLEBFragment>(F));
+ case MCFragment::FT_BoundaryAlign:
+ return relaxBoundaryAlign(Layout, cast<MCBoundaryAlignFragment>(F));
+ case MCFragment::FT_CVInlineLines:
+ return relaxCVInlineLineTable(Layout, cast<MCCVInlineLineTableFragment>(F));
+ case MCFragment::FT_CVDefRange:
+ return relaxCVDefRange(Layout, cast<MCCVDefRangeFragment>(F));
+ }
+}
+
bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) {
// Holds the first fragment which needed relaxing during this layout. It will
// remain NULL if none were relaxed.
@@ -1107,43 +1157,11 @@ bool MCAssembler::layoutSectionOnce(MCAsmLayout &Layout, MCSection &Sec) {
MCFragment *FirstRelaxedFragment = nullptr;
// Attempt to relax all the fragments in the section.
- for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+ for (MCFragment &Frag : Sec) {
// Check if this is a fragment that needs relaxation.
- bool RelaxedFrag = false;
- switch(I->getKind()) {
- default:
- break;
- case MCFragment::FT_Relaxable:
- assert(!getRelaxAll() &&
- "Did not expect a MCRelaxableFragment in RelaxAll mode");
- RelaxedFrag = relaxInstruction(Layout, *cast<MCRelaxableFragment>(I));
- break;
- case MCFragment::FT_Dwarf:
- RelaxedFrag = relaxDwarfLineAddr(Layout,
- *cast<MCDwarfLineAddrFragment>(I));
- break;
- case MCFragment::FT_DwarfFrame:
- RelaxedFrag =
- relaxDwarfCallFrameFragment(Layout,
- *cast<MCDwarfCallFrameFragment>(I));
- break;
- case MCFragment::FT_LEB:
- RelaxedFrag = relaxLEB(Layout, *cast<MCLEBFragment>(I));
- break;
- case MCFragment::FT_BoundaryAlign:
- RelaxedFrag =
- relaxBoundaryAlign(Layout, *cast<MCBoundaryAlignFragment>(I));
- break;
- case MCFragment::FT_CVInlineLines:
- RelaxedFrag =
- relaxCVInlineLineTable(Layout, *cast<MCCVInlineLineTableFragment>(I));
- break;
- case MCFragment::FT_CVDefRange:
- RelaxedFrag = relaxCVDefRange(Layout, *cast<MCCVDefRangeFragment>(I));
- break;
- }
+ bool RelaxedFrag = relaxFragment(Layout, Frag);
if (RelaxedFrag && !FirstRelaxedFragment)
- FirstRelaxedFragment = &*I;
+ FirstRelaxedFragment = &Frag;
}
if (FirstRelaxedFragment) {
Layout.invalidateFragmentsFrom(FirstRelaxedFragment);
@@ -1156,8 +1174,7 @@ bool MCAssembler::layoutOnce(MCAsmLayout &Layout) {
++stats::RelaxationSteps;
bool WasRelaxed = false;
- for (iterator it = begin(), ie = end(); it != ie; ++it) {
- MCSection &Sec = *it;
+ for (MCSection &Sec : *this) {
while (layoutSectionOnce(Layout, Sec))
WasRelaxed = true;
}
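
The new layout loop above re-invalidates every section whenever any fragment changed size, because fragment sizes in one section can affect decisions in another (for example boundary-align padding). A schematic of that fixed-point iteration with simplified stand-in types; the real code operates on MCSection, MCFragment and MCAsmLayout:

#include <cstdint>
#include <vector>

struct Fragment { uint64_t Size = 0, RequiredSize = 0; };
struct Section  { std::vector<Fragment> Fragments; };

// One relaxation pass over a section; returns true if any fragment changed.
static bool relaxSectionOnce(Section &Sec) {
  bool Changed = false;
  for (Fragment &F : Sec.Fragments)
    if (F.Size != F.RequiredSize) {
      F.Size = F.RequiredSize;
      Changed = true;
    }
  return Changed;
}

// Keep relaxing until no section changes. In the real assembler, each outer
// iteration also invalidates the cached fragment offsets of every section
// before laying out again.
static void layoutUntilFixedPoint(std::vector<Section> &Sections) {
  bool AnyRelaxed = true;
  while (AnyRelaxed) {
    AnyRelaxed = false;
    for (Section &Sec : Sections)
      while (relaxSectionOnce(Sec))
        AnyRelaxed = true;
  }
}
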
diff --git a/llvm/lib/MC/MCCodeView.cpp b/llvm/lib/MC/MCCodeView.cpp
index 1a71b542bd06..7849196432b8 100644
--- a/llvm/lib/MC/MCCodeView.cpp
+++ b/llvm/lib/MC/MCCodeView.cpp
@@ -172,9 +172,9 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
MCSymbol *StringBegin = Ctx.createTempSymbol("strtab_begin", false),
*StringEnd = Ctx.createTempSymbol("strtab_end", false);
- OS.EmitIntValue(unsigned(DebugSubsectionKind::StringTable), 4);
+ OS.emitInt32(uint32_t(DebugSubsectionKind::StringTable));
OS.emitAbsoluteSymbolDiff(StringEnd, StringBegin, 4);
- OS.EmitLabel(StringBegin);
+ OS.emitLabel(StringBegin);
// Put the string table data fragment here, if we haven't already put it
// somewhere else. If somebody wants two string tables in their .s file, one
@@ -184,9 +184,9 @@ void CodeViewContext::emitStringTable(MCObjectStreamer &OS) {
InsertedStrTabFragment = true;
}
- OS.EmitValueToAlignment(4, 0);
+ OS.emitValueToAlignment(4, 0);
- OS.EmitLabel(StringEnd);
+ OS.emitLabel(StringEnd);
}
void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
@@ -199,9 +199,9 @@ void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
MCSymbol *FileBegin = Ctx.createTempSymbol("filechecksums_begin", false),
*FileEnd = Ctx.createTempSymbol("filechecksums_end", false);
- OS.EmitIntValue(unsigned(DebugSubsectionKind::FileChecksums), 4);
+ OS.emitInt32(uint32_t(DebugSubsectionKind::FileChecksums));
OS.emitAbsoluteSymbolDiff(FileEnd, FileBegin, 4);
- OS.EmitLabel(FileBegin);
+ OS.emitLabel(FileBegin);
unsigned CurrentOffset = 0;
@@ -209,7 +209,7 @@ void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
// user-provided file number. Each entry may be a variable number of bytes
// determined by the checksum kind and size.
for (auto File : Files) {
- OS.EmitAssignment(File.ChecksumTableOffset,
+ OS.emitAssignment(File.ChecksumTableOffset,
MCConstantExpr::create(CurrentOffset, Ctx));
CurrentOffset += 4; // String table offset.
if (!File.ChecksumKind) {
@@ -221,21 +221,21 @@ void CodeViewContext::emitFileChecksums(MCObjectStreamer &OS) {
CurrentOffset = alignTo(CurrentOffset, 4);
}
- OS.EmitIntValue(File.StringTableOffset, 4);
+ OS.emitInt32(File.StringTableOffset);
if (!File.ChecksumKind) {
// There is no checksum. Therefore zero the next two fields and align
// back to 4 bytes.
- OS.EmitIntValue(0, 4);
+ OS.emitInt32(0);
continue;
}
- OS.EmitIntValue(static_cast<uint8_t>(File.Checksum.size()), 1);
- OS.EmitIntValue(File.ChecksumKind, 1);
- OS.EmitBytes(toStringRef(File.Checksum));
- OS.EmitValueToAlignment(4);
+ OS.emitInt8(static_cast<uint8_t>(File.Checksum.size()));
+ OS.emitInt8(File.ChecksumKind);
+ OS.emitBytes(toStringRef(File.Checksum));
+ OS.emitValueToAlignment(4);
}
- OS.EmitLabel(FileEnd);
+ OS.emitLabel(FileEnd);
ChecksumOffsetsAssigned = true;
}
@@ -252,14 +252,14 @@ void CodeViewContext::emitFileChecksumOffset(MCObjectStreamer &OS,
Files.resize(Idx + 1);
if (ChecksumOffsetsAssigned) {
- OS.EmitSymbolValue(Files[Idx].ChecksumTableOffset, 4);
+ OS.emitSymbolValue(Files[Idx].ChecksumTableOffset, 4);
return;
}
const MCSymbolRefExpr *SRE =
MCSymbolRefExpr::create(Files[Idx].ChecksumTableOffset, OS.getContext());
- OS.EmitValueImpl(SRE, 4);
+ OS.emitValueImpl(SRE, 4);
}
void CodeViewContext::addLineEntry(const MCCVLoc &LineEntry) {
@@ -331,9 +331,9 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
MCSymbol *LineBegin = Ctx.createTempSymbol("linetable_begin", false),
*LineEnd = Ctx.createTempSymbol("linetable_end", false);
- OS.EmitIntValue(unsigned(DebugSubsectionKind::Lines), 4);
+ OS.emitInt32(uint32_t(DebugSubsectionKind::Lines));
OS.emitAbsoluteSymbolDiff(LineEnd, LineBegin, 4);
- OS.EmitLabel(LineBegin);
+ OS.emitLabel(LineBegin);
OS.EmitCOFFSecRel32(FuncBegin, /*Offset=*/0);
OS.EmitCOFFSectionIndex(FuncBegin);
@@ -342,7 +342,7 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
bool HaveColumns = any_of(Locs, [](const MCCVLoc &LineEntry) {
return LineEntry.getColumn() != 0;
});
- OS.EmitIntValue(HaveColumns ? int(LF_HaveColumns) : 0, 2);
+ OS.emitInt16(HaveColumns ? int(LF_HaveColumns) : 0);
OS.emitAbsoluteSymbolDiff(FuncEnd, FuncBegin, 4);
for (auto I = Locs.begin(), E = Locs.end(); I != E;) {
@@ -358,30 +358,30 @@ void CodeViewContext::emitLineTableForFunction(MCObjectStreamer &OS,
Twine(getStringTableFragment()
->getContents()[Files[CurFileNum - 1].StringTableOffset]) +
"' begins");
- OS.EmitCVFileChecksumOffsetDirective(CurFileNum);
- OS.EmitIntValue(EntryCount, 4);
+ OS.emitCVFileChecksumOffsetDirective(CurFileNum);
+ OS.emitInt32(EntryCount);
uint32_t SegmentSize = 12;
SegmentSize += 8 * EntryCount;
if (HaveColumns)
SegmentSize += 4 * EntryCount;
- OS.EmitIntValue(SegmentSize, 4);
+ OS.emitInt32(SegmentSize);
for (auto J = I; J != FileSegEnd; ++J) {
OS.emitAbsoluteSymbolDiff(J->getLabel(), FuncBegin, 4);
unsigned LineData = J->getLine();
if (J->isStmt())
LineData |= LineInfo::StatementFlag;
- OS.EmitIntValue(LineData, 4);
+ OS.emitInt32(LineData);
}
if (HaveColumns) {
for (auto J = I; J != FileSegEnd; ++J) {
- OS.EmitIntValue(J->getColumn(), 2);
- OS.EmitIntValue(0, 2);
+ OS.emitInt16(J->getColumn());
+ OS.emitInt16(0);
}
}
I = FileSegEnd;
}
- OS.EmitLabel(LineEnd);
+ OS.emitLabel(LineEnd);
}
static bool compressAnnotation(uint32_t Data, SmallVectorImpl<char> &Buffer) {
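
The checksum and line-table emitters above all share one framing pattern for a CodeView debug subsection: a 4-byte kind, a 4-byte length expressed as an end-minus-begin label difference, the payload, then padding to 4 bytes. A schematic with a hypothetical minimal emitter interface (names are illustrative; the real code drives MCObjectStreamer with unique temp MCSymbols):

#include <cstdint>
#include <string>

// Hypothetical interface standing in for MCObjectStreamer / MCSymbol.
struct Emitter {
  virtual void emitInt32(uint32_t V) = 0;
  virtual void emitLabelDiff32(const std::string &End,
                               const std::string &Begin) = 0;
  virtual void emitLabel(const std::string &Name) = 0;
  virtual void alignTo4() = 0;
  virtual ~Emitter() = default;
};

// Frame one subsection: kind, length (end - begin), begin label, payload,
// padding to 4 bytes, end label.
template <typename PayloadFn>
void emitSubsection(Emitter &E, uint32_t Kind, PayloadFn Payload) {
  E.emitInt32(Kind);
  E.emitLabelDiff32("subsection_end", "subsection_begin");
  E.emitLabel("subsection_begin");
  Payload(E);
  E.alignTo4();
  E.emitLabel("subsection_end");
}
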
diff --git a/llvm/lib/MC/MCContext.cpp b/llvm/lib/MC/MCContext.cpp
index a6417113fd38..a0f9212f3b14 100644
--- a/llvm/lib/MC/MCContext.cpp
+++ b/llvm/lib/MC/MCContext.cpp
@@ -68,8 +68,8 @@ MCContext::MCContext(const MCAsmInfo *mai, const MCRegisterInfo *mri,
SecureLogFile = AsSecureLogFileName;
if (SrcMgr && SrcMgr->getNumBuffers())
- MainFileName =
- SrcMgr->getMemoryBuffer(SrcMgr->getMainFileID())->getBufferIdentifier();
+ MainFileName = std::string(SrcMgr->getMemoryBuffer(SrcMgr->getMainFileID())
+ ->getBufferIdentifier());
}
MCContext::~MCContext() {
@@ -114,6 +114,9 @@ void MCContext::reset() {
WasmUniquingMap.clear();
XCOFFUniquingMap.clear();
+ ELFEntrySizeMap.clear();
+ ELFSeenGenericMergeableSections.clear();
+
NextID.clear();
AllowTemporaryLabels = true;
DwarfLocSeen = false;
@@ -158,6 +161,16 @@ MCSymbol *MCContext::getOrCreateLSDASymbol(StringRef FuncName) {
MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
bool IsTemporary) {
+ static_assert(std::is_trivially_destructible<MCSymbolCOFF>(),
+ "MCSymbol classes must be trivially destructible");
+ static_assert(std::is_trivially_destructible<MCSymbolELF>(),
+ "MCSymbol classes must be trivially destructible");
+ static_assert(std::is_trivially_destructible<MCSymbolMachO>(),
+ "MCSymbol classes must be trivially destructible");
+ static_assert(std::is_trivially_destructible<MCSymbolWasm>(),
+ "MCSymbol classes must be trivially destructible");
+ static_assert(std::is_trivially_destructible<MCSymbolXCOFF>(),
+ "MCSymbol classes must be trivially destructible");
if (MOFI) {
switch (MOFI->getObjectFileType()) {
case MCObjectFileInfo::IsCOFF:
@@ -169,7 +182,7 @@ MCSymbol *MCContext::createSymbolImpl(const StringMapEntry<bool> *Name,
case MCObjectFileInfo::IsWasm:
return new (Name, *this) MCSymbolWasm(Name, IsTemporary);
case MCObjectFileInfo::IsXCOFF:
- return new (Name, *this) MCSymbolXCOFF(Name, IsTemporary);
+ return createXCOFFSymbolImpl(Name, IsTemporary);
}
}
return new (Name, *this) MCSymbol(MCSymbol::SymbolKindUnset, Name,
@@ -272,13 +285,68 @@ void MCContext::setSymbolValue(MCStreamer &Streamer,
StringRef Sym,
uint64_t Val) {
auto Symbol = getOrCreateSymbol(Sym);
- Streamer.EmitAssignment(Symbol, MCConstantExpr::create(Val, *this));
+ Streamer.emitAssignment(Symbol, MCConstantExpr::create(Val, *this));
}
void MCContext::registerInlineAsmLabel(MCSymbol *Sym) {
InlineAsmUsedLabelNames[Sym->getName()] = Sym;
}
+MCSymbolXCOFF *
+MCContext::createXCOFFSymbolImpl(const StringMapEntry<bool> *Name,
+ bool IsTemporary) {
+ if (!Name)
+ return new (nullptr, *this) MCSymbolXCOFF(nullptr, IsTemporary);
+
+ StringRef OriginalName = Name->first();
+ if (OriginalName.startswith("._Renamed..") ||
+ OriginalName.startswith("_Renamed.."))
+ reportError(SMLoc(), "invalid symbol name from source");
+
+ if (MAI->isValidUnquotedName(OriginalName))
+ return new (Name, *this) MCSymbolXCOFF(Name, IsTemporary);
+
+  // Now we have a name that contains character(s) invalid in an XCOFF symbol.
+  // Replace them with something valid, but save the original name so that it
+  // can still be used in the symbol table.
+ SmallString<128> InvalidName(OriginalName);
+
+  // If it's an entry point symbol, keep the '.' in front to follow the
+  // convention. Otherwise, add the "_Renamed.." prefix to signal that this
+  // is a renamed symbol.
+ const bool IsEntryPoint = !InvalidName.empty() && InvalidName[0] == '.';
+ SmallString<128> ValidName =
+ StringRef(IsEntryPoint ? "._Renamed.." : "_Renamed..");
+
+  // Append the hex value of '_' and of each invalid character to "_Renamed..";
+  // at the same time, replace the invalid characters with '_'.
+ for (size_t I = 0; I < InvalidName.size(); ++I) {
+ if (!MAI->isAcceptableChar(InvalidName[I]) || InvalidName[I] == '_') {
+ raw_svector_ostream(ValidName).write_hex(InvalidName[I]);
+ InvalidName[I] = '_';
+ }
+ }
+
+ // Skip entry point symbol's '.' as we already have a '.' in front of
+ // "_Renamed".
+ if (IsEntryPoint)
+ ValidName.append(InvalidName.substr(1, InvalidName.size() - 1));
+ else
+ ValidName.append(InvalidName);
+
+ auto NameEntry = UsedNames.insert(std::make_pair(ValidName, true));
+ assert((NameEntry.second || !NameEntry.first->second) &&
+ "This name is used somewhere else.");
+ // Mark the name as used for a non-section symbol.
+ NameEntry.first->second = true;
+ // Have the MCSymbol object itself refer to the copy of the string
+ // that is embedded in the UsedNames entry.
+ MCSymbolXCOFF *XSym = new (&*NameEntry.first, *this)
+ MCSymbolXCOFF(&*NameEntry.first, IsTemporary);
+ XSym->setSymbolTableName(MCSymbolXCOFF::getUnqualifiedName(OriginalName));
+ return XSym;
+}
+
//===----------------------------------------------------------------------===//
// Section Management
//===----------------------------------------------------------------------===//
@@ -292,23 +360,25 @@ MCSectionMachO *MCContext::getMachOSection(StringRef Segment, StringRef Section,
// diagnosed by the client as an error.
// Form the name to look up.
- SmallString<64> Name;
- Name += Segment;
- Name.push_back(',');
- Name += Section;
+ assert(Section.size() <= 16 && "section name is too long");
+ assert(!memchr(Section.data(), '\0', Section.size()) &&
+ "section name cannot contain NUL");
// Do the lookup, if we have a hit, return it.
- MCSectionMachO *&Entry = MachOUniquingMap[Name];
- if (Entry)
- return Entry;
+ auto R = MachOUniquingMap.try_emplace((Segment + Twine(',') + Section).str());
+ if (!R.second)
+ return R.first->second;
MCSymbol *Begin = nullptr;
if (BeginSymName)
Begin = createTempSymbol(BeginSymName, false);
// Otherwise, return a new section.
- return Entry = new (MachOAllocator.Allocate()) MCSectionMachO(
- Segment, Section, TypeAndAttributes, Reserved2, Kind, Begin);
+ StringRef Name = R.first->first();
+ R.first->second = new (MachOAllocator.Allocate())
+ MCSectionMachO(Segment, Name.substr(Name.size() - Section.size()),
+ TypeAndAttributes, Reserved2, Kind, Begin);
+ return R.first->second;
}
void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
@@ -316,12 +386,14 @@ void MCContext::renameELFSection(MCSectionELF *Section, StringRef Name) {
if (const MCSymbol *Group = Section->getGroup())
GroupName = Group->getName();
+ // This function is only used by .debug*, which should not have the
+ // SHF_LINK_ORDER flag.
unsigned UniqueID = Section->getUniqueID();
ELFUniquingMap.erase(
- ELFSectionKey{Section->getSectionName(), GroupName, UniqueID});
- auto I = ELFUniquingMap.insert(std::make_pair(
- ELFSectionKey{Name, GroupName, UniqueID},
- Section))
+ ELFSectionKey{Section->getName(), GroupName, "", UniqueID});
+ auto I = ELFUniquingMap
+ .insert(std::make_pair(
+ ELFSectionKey{Name, GroupName, "", UniqueID}, Section))
.first;
StringRef CachedName = I->first.SectionName;
const_cast<MCSectionELF *>(Section)->setSectionName(CachedName);
@@ -332,7 +404,7 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
unsigned EntrySize,
const MCSymbolELF *Group,
unsigned UniqueID,
- const MCSymbolELF *Associated) {
+ const MCSymbolELF *LinkedToSym) {
MCSymbolELF *R;
MCSymbol *&Sym = Symbols[Section];
// A section symbol can not redefine regular symbols. There may be multiple
@@ -352,7 +424,7 @@ MCSectionELF *MCContext::createELFSectionImpl(StringRef Section, unsigned Type,
R->setType(ELF::STT_SECTION);
auto *Ret = new (ELFAllocator.Allocate()) MCSectionELF(
- Section, Type, Flags, K, EntrySize, Group, UniqueID, R, Associated);
+ Section, Type, Flags, K, EntrySize, Group, UniqueID, R, LinkedToSym);
auto *F = new MCDataFragment();
Ret->getFragmentList().insert(Ret->begin(), F);
@@ -386,26 +458,29 @@ MCSectionELF *MCContext::getELFNamedSection(const Twine &Prefix,
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const Twine &Group, unsigned UniqueID,
- const MCSymbolELF *Associated) {
+ const MCSymbolELF *LinkedToSym) {
MCSymbolELF *GroupSym = nullptr;
if (!Group.isTriviallyEmpty() && !Group.str().empty())
GroupSym = cast<MCSymbolELF>(getOrCreateSymbol(Group));
return getELFSection(Section, Type, Flags, EntrySize, GroupSym, UniqueID,
- Associated);
+ LinkedToSym);
}
MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
unsigned Flags, unsigned EntrySize,
const MCSymbolELF *GroupSym,
unsigned UniqueID,
- const MCSymbolELF *Associated) {
+ const MCSymbolELF *LinkedToSym) {
StringRef Group = "";
if (GroupSym)
Group = GroupSym->getName();
+ assert(!(LinkedToSym && LinkedToSym->getName().empty()));
// Do the lookup, if we have a hit, return it.
- auto IterBool = ELFUniquingMap.insert(
- std::make_pair(ELFSectionKey{Section.str(), Group, UniqueID}, nullptr));
+ auto IterBool = ELFUniquingMap.insert(std::make_pair(
+ ELFSectionKey{Section.str(), Group,
+ LinkedToSym ? LinkedToSym->getName() : "", UniqueID},
+ nullptr));
auto &Entry = *IterBool.first;
if (!IterBool.second)
return Entry.second;
@@ -420,16 +495,55 @@ MCSectionELF *MCContext::getELFSection(const Twine &Section, unsigned Type,
else
Kind = SectionKind::getReadOnly();
- MCSectionELF *Result = createELFSectionImpl(
- CachedName, Type, Flags, Kind, EntrySize, GroupSym, UniqueID, Associated);
+ MCSectionELF *Result =
+ createELFSectionImpl(CachedName, Type, Flags, Kind, EntrySize, GroupSym,
+ UniqueID, LinkedToSym);
Entry.second = Result;
+
+ recordELFMergeableSectionInfo(Result->getName(), Result->getFlags(),
+ Result->getUniqueID(), Result->getEntrySize());
+
return Result;
}
MCSectionELF *MCContext::createELFGroupSection(const MCSymbolELF *Group) {
return createELFSectionImpl(".group", ELF::SHT_GROUP, 0,
- SectionKind::getReadOnly(), 4, Group, ~0,
- nullptr);
+ SectionKind::getReadOnly(), 4, Group,
+ MCSection::NonUniqueID, nullptr);
+}
+
+void MCContext::recordELFMergeableSectionInfo(StringRef SectionName,
+ unsigned Flags, unsigned UniqueID,
+ unsigned EntrySize) {
+ bool IsMergeable = Flags & ELF::SHF_MERGE;
+ if (IsMergeable && (UniqueID == GenericSectionID))
+ ELFSeenGenericMergeableSections.insert(SectionName);
+
+ // For mergeable sections or non-mergeable sections with a generic mergeable
+ // section name we enter their Unique ID into the ELFEntrySizeMap so that
+ // compatible globals can be assigned to the same section.
+ if (IsMergeable || isELFGenericMergeableSection(SectionName)) {
+ ELFEntrySizeMap.insert(std::make_pair(
+ ELFEntrySizeKey{SectionName, Flags, EntrySize}, UniqueID));
+ }
+}
+
+bool MCContext::isELFImplicitMergeableSectionNamePrefix(StringRef SectionName) {
+ return SectionName.startswith(".rodata.str") ||
+ SectionName.startswith(".rodata.cst");
+}
+
+bool MCContext::isELFGenericMergeableSection(StringRef SectionName) {
+ return isELFImplicitMergeableSectionNamePrefix(SectionName) ||
+ ELFSeenGenericMergeableSections.count(SectionName);
+}
+
+Optional<unsigned> MCContext::getELFUniqueIDForEntsize(StringRef SectionName,
+ unsigned Flags,
+ unsigned EntrySize) {
+ auto I = ELFEntrySizeMap.find(
+ MCContext::ELFEntrySizeKey{SectionName, Flags, EntrySize});
+ return (I != ELFEntrySizeMap.end()) ? Optional<unsigned>(I->second) : None;
}
MCSectionCOFF *MCContext::getCOFFSection(StringRef Section,
@@ -484,13 +598,13 @@ MCSectionCOFF *MCContext::getAssociativeCOFFSection(MCSectionCOFF *Sec,
unsigned Characteristics = Sec->getCharacteristics();
if (KeySym) {
Characteristics |= COFF::IMAGE_SCN_LNK_COMDAT;
- return getCOFFSection(Sec->getSectionName(), Characteristics,
- Sec->getKind(), KeySym->getName(),
+ return getCOFFSection(Sec->getName(), Characteristics, Sec->getKind(),
+ KeySym->getName(),
COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE, UniqueID);
}
- return getCOFFSection(Sec->getSectionName(), Characteristics, Sec->getKind(),
- "", 0, UniqueID);
+ return getCOFFSection(Sec->getName(), Characteristics, Sec->getKind(), "", 0,
+ UniqueID);
}
MCSectionWasm *MCContext::getWasmSection(const Twine &Section, SectionKind K,
@@ -551,15 +665,18 @@ MCSectionXCOFF *MCContext::getXCOFFSection(StringRef Section,
// Otherwise, return a new section.
StringRef CachedName = Entry.first.SectionName;
- MCSymbol *QualName = getOrCreateSymbol(
- CachedName + "[" + XCOFF::getMappingClassString(SMC) + "]");
+ MCSymbolXCOFF *QualName = cast<MCSymbolXCOFF>(getOrCreateSymbol(
+ CachedName + "[" + XCOFF::getMappingClassString(SMC) + "]"));
MCSymbol *Begin = nullptr;
if (BeginSymName)
Begin = createTempSymbol(BeginSymName, false);
- MCSectionXCOFF *Result = new (XCOFFAllocator.Allocate()) MCSectionXCOFF(
- CachedName, SMC, Type, SC, Kind, cast<MCSymbolXCOFF>(QualName), Begin);
+ // QualName->getUnqualifiedName() and CachedName are the same except when
+ // CachedName contains invalid character(s) such as '$' for an XCOFF symbol.
+ MCSectionXCOFF *Result = new (XCOFFAllocator.Allocate())
+ MCSectionXCOFF(QualName->getUnqualifiedName(), SMC, Type, SC, Kind,
+ QualName, Begin, CachedName);
Entry.second = Result;
auto *F = new MCDataFragment();
@@ -583,17 +700,21 @@ void MCContext::addDebugPrefixMapEntry(const std::string &From,
void MCContext::RemapDebugPaths() {
const auto &DebugPrefixMap = this->DebugPrefixMap;
+ if (DebugPrefixMap.empty())
+ return;
+
const auto RemapDebugPath = [&DebugPrefixMap](std::string &Path) {
- for (const auto &Entry : DebugPrefixMap)
- if (StringRef(Path).startswith(Entry.first)) {
- std::string RemappedPath =
- (Twine(Entry.second) + Path.substr(Entry.first.size())).str();
- Path.swap(RemappedPath);
+ SmallString<256> P(Path);
+ for (const auto &Entry : DebugPrefixMap) {
+ if (llvm::sys::path::replace_path_prefix(P, Entry.first, Entry.second)) {
+ Path = P.str().str();
+ break;
}
+ }
};
// Remap compilation directory.
- std::string CompDir = CompilationDir.str();
+ std::string CompDir = std::string(CompilationDir.str());
RemapDebugPath(CompDir);
CompilationDir = CompDir;
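
A compact standalone illustration of the renaming scheme described in createXCOFFSymbolImpl above. isAcceptableXCOFFChar and renameForXCOFF are hypothetical stand-ins; the real code uses MCAsmInfo::isAcceptableChar and write_hex (which omits leading zeros), whereas this sketch always writes two hex digits:

#include <cctype>
#include <string>

static bool isAcceptableXCOFFChar(char C) {
  return std::isalnum(static_cast<unsigned char>(C)) || C == '_' || C == '.';
}

static std::string renameForXCOFF(const std::string &Original) {
  const bool IsEntryPoint = !Original.empty() && Original[0] == '.';
  std::string Valid = IsEntryPoint ? "._Renamed.." : "_Renamed..";
  std::string Body = Original;
  static const char Hex[] = "0123456789abcdef";
  for (char &C : Body) {
    if (!isAcceptableXCOFFChar(C) || C == '_') {
      unsigned char U = static_cast<unsigned char>(C);
      Valid.push_back(Hex[U >> 4]); // record the original byte in hex...
      Valid.push_back(Hex[U & 0xF]);
      C = '_';                      // ...and replace it with '_'
    }
  }
  // Skip the leading '.' of an entry point; the prefix already supplies one.
  Valid.append(IsEntryPoint ? Body.substr(1) : Body);
  return Valid;
}
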
diff --git a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
index 373916fbed78..a58e8f6d9bcc 100644
--- a/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
+++ b/llvm/lib/MC/MCDisassembler/MCDisassembler.cpp
@@ -16,12 +16,11 @@ using namespace llvm;
MCDisassembler::~MCDisassembler() = default;
-MCDisassembler::DecodeStatus
-MCDisassembler::onSymbolStart(StringRef Name, uint64_t &Size,
+Optional<MCDisassembler::DecodeStatus>
+MCDisassembler::onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const {
- Size = 0;
- return MCDisassembler::Success;
+ return None;
}
bool MCDisassembler::tryAddingSymbolicOperand(MCInst &Inst, int64_t Value,
@@ -43,3 +42,56 @@ void MCDisassembler::tryAddingPcLoadReferenceComment(int64_t Value,
void MCDisassembler::setSymbolizer(std::unique_ptr<MCSymbolizer> Symzer) {
Symbolizer = std::move(Symzer);
}
+
+#define SMC_PCASE(A, P) \
+ case XCOFF::XMC_##A: \
+ return P;
+
+static uint8_t getSMCPriority(XCOFF::StorageMappingClass SMC) {
+ switch (SMC) {
+ SMC_PCASE(PR, 1)
+ SMC_PCASE(RO, 1)
+ SMC_PCASE(DB, 1)
+ SMC_PCASE(GL, 1)
+ SMC_PCASE(XO, 1)
+ SMC_PCASE(SV, 1)
+ SMC_PCASE(SV64, 1)
+ SMC_PCASE(SV3264, 1)
+ SMC_PCASE(TI, 1)
+ SMC_PCASE(TB, 1)
+ SMC_PCASE(RW, 1)
+ SMC_PCASE(TC0, 0)
+ SMC_PCASE(TC, 1)
+ SMC_PCASE(TD, 1)
+ SMC_PCASE(DS, 1)
+ SMC_PCASE(UA, 1)
+ SMC_PCASE(BS, 1)
+ SMC_PCASE(UC, 1)
+ SMC_PCASE(TL, 1)
+ SMC_PCASE(UL, 1)
+ SMC_PCASE(TE, 1)
+#undef SMC_PCASE
+ }
+ return 0;
+}
+
+/// Comparison used to sort symbols that share the same address. Symbols in
+/// the same section are sorted in ascending priority order, so llvm-objdump
+/// -D can display the highest-priority symbol when several symbols share an
+/// address.
+bool XCOFFSymbolInfo::operator<(const XCOFFSymbolInfo &SymInfo) const {
+ // Label symbols have higher priority than non-label symbols.
+ if (IsLabel != SymInfo.IsLabel)
+ return SymInfo.IsLabel;
+
+ // Symbols with a StorageMappingClass have higher priority than those without.
+ if (StorageMappingClass.hasValue() != SymInfo.StorageMappingClass.hasValue())
+ return SymInfo.StorageMappingClass.hasValue();
+
+ if (StorageMappingClass.hasValue()) {
+ return getSMCPriority(StorageMappingClass.getValue()) <
+ getSMCPriority(SymInfo.StorageMappingClass.getValue());
+ }
+
+ return false;
+}
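For illustration, here is a minimal, self-contained sketch of the same tie-breaking scheme, using a simplified stand-in type rather than the real llvm XCOFFSymbolInfo. With a strict-weak-ordering comparator like the one above, the symbol chosen for display is simply the maximum element among the symbols at one address:

#include <algorithm>
#include <optional>
#include <vector>

// Simplified stand-in for illustration only; the fields mirror the patch.
struct ToySymbolInfo {
  std::optional<int> SMCPriority; // priority derived from the storage mapping class
  bool IsLabel = false;

  bool operator<(const ToySymbolInfo &RHS) const {
    if (IsLabel != RHS.IsLabel)
      return RHS.IsLabel; // label symbols outrank non-label symbols
    if (SMCPriority.has_value() != RHS.SMCPriority.has_value())
      return RHS.SMCPriority.has_value(); // having an SMC outranks not having one
    if (SMCPriority)
      return *SMCPriority < *RHS.SMCPriority; // higher SMC priority wins
    return false;
  }
};

// Given all symbols at one address, pick the one to display.
ToySymbolInfo pickDisplaySymbol(const std::vector<ToySymbolInfo> &Syms) {
  return *std::max_element(Syms.begin(), Syms.end());
}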
diff --git a/llvm/lib/MC/MCDwarf.cpp b/llvm/lib/MC/MCDwarf.cpp
index b4b3c9956cc2..7f72d062b7ac 100644
--- a/llvm/lib/MC/MCDwarf.cpp
+++ b/llvm/lib/MC/MCDwarf.cpp
@@ -45,6 +45,29 @@
using namespace llvm;
+MCSymbol *mcdwarf::emitListsTableHeaderStart(MCStreamer &S) {
+ MCSymbol *Start =
+ S.getContext().createTempSymbol("debug_list_header_start", true, true);
+ MCSymbol *End =
+ S.getContext().createTempSymbol("debug_list_header_end", true, true);
+ auto DwarfFormat = S.getContext().getDwarfFormat();
+ if (DwarfFormat == dwarf::DWARF64) {
+ S.AddComment("DWARF64 mark");
+ S.emitInt32(dwarf::DW_LENGTH_DWARF64);
+ }
+ S.AddComment("Length");
+ S.emitAbsoluteSymbolDiff(End, Start,
+ dwarf::getDwarfOffsetByteSize(DwarfFormat));
+ S.emitLabel(Start);
+ S.AddComment("Version");
+ S.emitInt16(S.getContext().getDwarfVersion());
+ S.AddComment("Address size");
+ S.emitInt8(S.getContext().getAsmInfo()->getCodePointerSize());
+ S.AddComment("Segment selector size");
+ S.emitInt8(0);
+ return End;
+}
+
/// Manage the .debug_line_str section contents, if we use it.
class llvm::MCDwarfLineStr {
MCSymbol *LineStrLabel = nullptr;
@@ -91,7 +114,7 @@ void MCDwarfLineEntry::Make(MCObjectStreamer *MCOS, MCSection *Section) {
// Create a symbol in the current section for use in the line entry.
MCSymbol *LineSym = MCOS->getContext().createTempSymbol();
// Set the value of the symbol to use for the MCDwarfLineEntry.
- MCOS->EmitLabel(LineSym);
+ MCOS->emitLabel(LineSym);
// Get the current .loc info saved in the context.
const MCDwarfLoc &DwarfLoc = MCOS->getContext().getCurrentDwarfLoc();
@@ -112,21 +135,16 @@ void MCDwarfLineEntry::Make(MCObjectStreamer *MCOS, MCSection *Section) {
//
// This helper routine returns an expression of End - Start - IntVal.
//
-static inline const MCExpr *MakeStartMinusEndExpr(const MCStreamer &MCOS,
+static inline const MCExpr *makeEndMinusStartExpr(MCContext &Ctx,
const MCSymbol &Start,
const MCSymbol &End,
int IntVal) {
MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
- const MCExpr *Res =
- MCSymbolRefExpr::create(&End, Variant, MCOS.getContext());
- const MCExpr *RHS =
- MCSymbolRefExpr::create(&Start, Variant, MCOS.getContext());
- const MCExpr *Res1 =
- MCBinaryExpr::create(MCBinaryExpr::Sub, Res, RHS, MCOS.getContext());
- const MCExpr *Res2 =
- MCConstantExpr::create(IntVal, MCOS.getContext());
- const MCExpr *Res3 =
- MCBinaryExpr::create(MCBinaryExpr::Sub, Res1, Res2, MCOS.getContext());
+ const MCExpr *Res = MCSymbolRefExpr::create(&End, Variant, Ctx);
+ const MCExpr *RHS = MCSymbolRefExpr::create(&Start, Variant, Ctx);
+ const MCExpr *Res1 = MCBinaryExpr::create(MCBinaryExpr::Sub, Res, RHS, Ctx);
+ const MCExpr *Res2 = MCConstantExpr::create(IntVal, Ctx);
+ const MCExpr *Res3 = MCBinaryExpr::create(MCBinaryExpr::Sub, Res1, Res2, Ctx);
return Res3;
}
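To make the IntVal parameter concrete: callers below pass the size of the unit-length field, so the emitted length covers only the bytes that follow that field. A rough sketch of the arithmetic, assuming the standard DWARF sizes (a 4-byte length for DWARF32; a 4-byte 0xffffffff escape plus an 8-byte length for DWARF64):

#include <cstdint>

// Illustrative constants; they match what dwarf::getUnitLengthFieldByteSize and
// dwarf::getDwarfOffsetByteSize are expected to return for each format.
constexpr unsigned unitLengthFieldBytes(bool IsDwarf64) {
  return IsDwarf64 ? 4 + 8 : 4; // escape word + 64-bit length, or 32-bit length
}
constexpr unsigned offsetBytes(bool IsDwarf64) {
  return IsDwarf64 ? 8 : 4; // width of the length/offset field actually emitted
}

// Value written into the length field for a contribution spanning [Start, End):
// everything after the unit-length field itself.
constexpr uint64_t unitLengthValue(uint64_t Start, uint64_t End, bool IsDwarf64) {
  return (End - Start) - unitLengthFieldBytes(IsDwarf64);
}

static_assert(unitLengthValue(0, 100, /*IsDwarf64=*/false) == 96,
              "length excludes the 4-byte field");
static_assert(unitLengthValue(0, 100, /*IsDwarf64=*/true) == 88,
              "length excludes the 12-byte escape plus length field");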
@@ -146,9 +164,9 @@ makeStartPlusIntExpr(MCContext &Ctx, const MCSymbol &Start, int IntVal) {
// This emits the Dwarf line table for the specified section from the entries
// in the LineSection.
//
-static inline void
-EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
- const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
+static inline void emitDwarfLineTable(
+ MCObjectStreamer *MCOS, MCSection *Section,
+ const MCLineSection::MCDwarfLineEntryCollection &LineEntries) {
unsigned FileNum = 1;
unsigned LastLine = 1;
unsigned Column = 0;
@@ -163,38 +181,38 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
if (FileNum != LineEntry.getFileNum()) {
FileNum = LineEntry.getFileNum();
- MCOS->EmitIntValue(dwarf::DW_LNS_set_file, 1);
- MCOS->EmitULEB128IntValue(FileNum);
+ MCOS->emitInt8(dwarf::DW_LNS_set_file);
+ MCOS->emitULEB128IntValue(FileNum);
}
if (Column != LineEntry.getColumn()) {
Column = LineEntry.getColumn();
- MCOS->EmitIntValue(dwarf::DW_LNS_set_column, 1);
- MCOS->EmitULEB128IntValue(Column);
+ MCOS->emitInt8(dwarf::DW_LNS_set_column);
+ MCOS->emitULEB128IntValue(Column);
}
if (Discriminator != LineEntry.getDiscriminator() &&
MCOS->getContext().getDwarfVersion() >= 4) {
Discriminator = LineEntry.getDiscriminator();
unsigned Size = getULEB128Size(Discriminator);
- MCOS->EmitIntValue(dwarf::DW_LNS_extended_op, 1);
- MCOS->EmitULEB128IntValue(Size + 1);
- MCOS->EmitIntValue(dwarf::DW_LNE_set_discriminator, 1);
- MCOS->EmitULEB128IntValue(Discriminator);
+ MCOS->emitInt8(dwarf::DW_LNS_extended_op);
+ MCOS->emitULEB128IntValue(Size + 1);
+ MCOS->emitInt8(dwarf::DW_LNE_set_discriminator);
+ MCOS->emitULEB128IntValue(Discriminator);
}
if (Isa != LineEntry.getIsa()) {
Isa = LineEntry.getIsa();
- MCOS->EmitIntValue(dwarf::DW_LNS_set_isa, 1);
- MCOS->EmitULEB128IntValue(Isa);
+ MCOS->emitInt8(dwarf::DW_LNS_set_isa);
+ MCOS->emitULEB128IntValue(Isa);
}
if ((LineEntry.getFlags() ^ Flags) & DWARF2_FLAG_IS_STMT) {
Flags = LineEntry.getFlags();
- MCOS->EmitIntValue(dwarf::DW_LNS_negate_stmt, 1);
+ MCOS->emitInt8(dwarf::DW_LNS_negate_stmt);
}
if (LineEntry.getFlags() & DWARF2_FLAG_BASIC_BLOCK)
- MCOS->EmitIntValue(dwarf::DW_LNS_set_basic_block, 1);
+ MCOS->emitInt8(dwarf::DW_LNS_set_basic_block);
if (LineEntry.getFlags() & DWARF2_FLAG_PROLOGUE_END)
- MCOS->EmitIntValue(dwarf::DW_LNS_set_prologue_end, 1);
+ MCOS->emitInt8(dwarf::DW_LNS_set_prologue_end);
if (LineEntry.getFlags() & DWARF2_FLAG_EPILOGUE_BEGIN)
- MCOS->EmitIntValue(dwarf::DW_LNS_set_epilogue_begin, 1);
+ MCOS->emitInt8(dwarf::DW_LNS_set_epilogue_begin);
MCSymbol *Label = LineEntry.getLabel();
@@ -202,7 +220,7 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
// line numbers and the increment of the address from the previous Label
// and the current Label.
const MCAsmInfo *asmInfo = MCOS->getContext().getAsmInfo();
- MCOS->EmitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
+ MCOS->emitDwarfAdvanceLineAddr(LineDelta, LastLabel, Label,
asmInfo->getCodePointerSize());
Discriminator = 0;
@@ -222,7 +240,7 @@ EmitDwarfLineTable(MCObjectStreamer *MCOS, MCSection *Section,
MCOS->SwitchSection(Ctx.getObjectFileInfo()->getDwarfLineSection());
const MCAsmInfo *AsmInfo = Ctx.getAsmInfo();
- MCOS->EmitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd,
+ MCOS->emitDwarfAdvanceLineAddr(INT64_MAX, LastLabel, SectionEnd,
AsmInfo->getCodePointerSize());
}
@@ -263,7 +281,7 @@ void MCDwarfDwoLineTable::Emit(MCStreamer &MCOS, MCDwarfLineTableParams Params,
return;
Optional<MCDwarfLineStr> NoLineStr(None);
MCOS.SwitchSection(Section);
- MCOS.EmitLabel(Header.Emit(&MCOS, Params, None, NoLineStr).second);
+ MCOS.emitLabel(Header.Emit(&MCOS, Params, None, NoLineStr).second);
}
std::pair<MCSymbol *, MCSymbol *>
@@ -298,13 +316,13 @@ static const MCExpr *forceExpAbs(MCStreamer &OS, const MCExpr* Expr) {
return Expr;
MCSymbol *ABS = Context.createTempSymbol();
- OS.EmitAssignment(ABS, Expr);
+ OS.emitAssignment(ABS, Expr);
return MCSymbolRefExpr::create(ABS, Context);
}
static void emitAbsValue(MCStreamer &OS, const MCExpr *Value, unsigned Size) {
const MCExpr *ABS = forceExpAbs(OS, Value);
- OS.EmitValue(ABS, Size);
+ OS.emitValue(ABS, Size);
}
void MCDwarfLineStr::emitSection(MCStreamer *MCOS) {
@@ -316,37 +334,38 @@ void MCDwarfLineStr::emitSection(MCStreamer *MCOS) {
SmallString<0> Data;
Data.resize(LineStrings.getSize());
LineStrings.write((uint8_t *)Data.data());
- MCOS->EmitBinaryData(Data.str());
+ MCOS->emitBinaryData(Data.str());
}
void MCDwarfLineStr::emitRef(MCStreamer *MCOS, StringRef Path) {
- int RefSize = 4; // FIXME: Support DWARF-64
+ int RefSize =
+ dwarf::getDwarfOffsetByteSize(MCOS->getContext().getDwarfFormat());
size_t Offset = LineStrings.add(Path);
if (UseRelocs) {
MCContext &Ctx = MCOS->getContext();
- MCOS->EmitValue(makeStartPlusIntExpr(Ctx, *LineStrLabel, Offset), RefSize);
+ MCOS->emitValue(makeStartPlusIntExpr(Ctx, *LineStrLabel, Offset), RefSize);
} else
- MCOS->EmitIntValue(Offset, RefSize);
+ MCOS->emitIntValue(Offset, RefSize);
}
void MCDwarfLineTableHeader::emitV2FileDirTables(MCStreamer *MCOS) const {
// First the directory table.
for (auto &Dir : MCDwarfDirs) {
- MCOS->EmitBytes(Dir); // The DirectoryName, and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ MCOS->emitBytes(Dir); // The DirectoryName, and...
+ MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
}
- MCOS->EmitIntValue(0, 1); // Terminate the directory list.
+ MCOS->emitInt8(0); // Terminate the directory list.
// Second the file table.
for (unsigned i = 1; i < MCDwarfFiles.size(); i++) {
assert(!MCDwarfFiles[i].Name.empty());
- MCOS->EmitBytes(MCDwarfFiles[i].Name); // FileName and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
- MCOS->EmitULEB128IntValue(MCDwarfFiles[i].DirIndex); // Directory number.
- MCOS->EmitIntValue(0, 1); // Last modification timestamp (always 0).
- MCOS->EmitIntValue(0, 1); // File size (always 0).
+ MCOS->emitBytes(MCDwarfFiles[i].Name); // FileName and...
+ MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
+ MCOS->emitULEB128IntValue(MCDwarfFiles[i].DirIndex); // Directory number.
+ MCOS->emitInt8(0); // Last modification timestamp (always 0).
+ MCOS->emitInt8(0); // File size (always 0).
}
- MCOS->EmitIntValue(0, 1); // Terminate the file list.
+ MCOS->emitInt8(0); // Terminate the file list.
}
static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
@@ -356,13 +375,13 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
if (LineStr)
LineStr->emitRef(MCOS, DwarfFile.Name);
else {
- MCOS->EmitBytes(DwarfFile.Name); // FileName and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ MCOS->emitBytes(DwarfFile.Name); // FileName and...
+ MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
}
- MCOS->EmitULEB128IntValue(DwarfFile.DirIndex); // Directory number.
+ MCOS->emitULEB128IntValue(DwarfFile.DirIndex); // Directory number.
if (EmitMD5) {
const MD5::MD5Result &Cksum = *DwarfFile.Checksum;
- MCOS->EmitBinaryData(
+ MCOS->emitBinaryData(
StringRef(reinterpret_cast<const char *>(Cksum.Bytes.data()),
Cksum.Bytes.size()));
}
@@ -370,9 +389,9 @@ static void emitOneV5FileEntry(MCStreamer *MCOS, const MCDwarfFile &DwarfFile,
if (LineStr)
LineStr->emitRef(MCOS, DwarfFile.Source.getValueOr(StringRef()));
else {
- MCOS->EmitBytes(
+ MCOS->emitBytes(
DwarfFile.Source.getValueOr(StringRef())); // Source and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
}
}
}
@@ -382,11 +401,11 @@ void MCDwarfLineTableHeader::emitV5FileDirTables(
// The directory format, which is just a list of the directory paths. In a
// non-split object, these are references to .debug_line_str; in a split
// object, they are inline strings.
- MCOS->EmitIntValue(1, 1);
- MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_path);
- MCOS->EmitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
+ MCOS->emitInt8(1);
+ MCOS->emitULEB128IntValue(dwarf::DW_LNCT_path);
+ MCOS->emitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
: dwarf::DW_FORM_string);
- MCOS->EmitULEB128IntValue(MCDwarfDirs.size() + 1);
+ MCOS->emitULEB128IntValue(MCDwarfDirs.size() + 1);
// Try not to emit an empty compilation directory.
const StringRef CompDir = CompilationDir.empty()
? MCOS->getContext().getCompilationDir()
@@ -398,11 +417,11 @@ void MCDwarfLineTableHeader::emitV5FileDirTables(
LineStr->emitRef(MCOS, Dir);
} else {
// The list of directory paths. Compilation directory comes first.
- MCOS->EmitBytes(CompDir);
- MCOS->EmitBytes(StringRef("\0", 1));
+ MCOS->emitBytes(CompDir);
+ MCOS->emitBytes(StringRef("\0", 1));
for (const auto &Dir : MCDwarfDirs) {
- MCOS->EmitBytes(Dir); // The DirectoryName, and...
- MCOS->EmitBytes(StringRef("\0", 1)); // its null terminator.
+ MCOS->emitBytes(Dir); // The DirectoryName, and...
+ MCOS->emitBytes(StringRef("\0", 1)); // its null terminator.
}
}
@@ -414,26 +433,26 @@ void MCDwarfLineTableHeader::emitV5FileDirTables(
Entries += 1;
if (HasSource)
Entries += 1;
- MCOS->EmitIntValue(Entries, 1);
- MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_path);
- MCOS->EmitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
+ MCOS->emitInt8(Entries);
+ MCOS->emitULEB128IntValue(dwarf::DW_LNCT_path);
+ MCOS->emitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
: dwarf::DW_FORM_string);
- MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_directory_index);
- MCOS->EmitULEB128IntValue(dwarf::DW_FORM_udata);
+ MCOS->emitULEB128IntValue(dwarf::DW_LNCT_directory_index);
+ MCOS->emitULEB128IntValue(dwarf::DW_FORM_udata);
if (HasAllMD5) {
- MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_MD5);
- MCOS->EmitULEB128IntValue(dwarf::DW_FORM_data16);
+ MCOS->emitULEB128IntValue(dwarf::DW_LNCT_MD5);
+ MCOS->emitULEB128IntValue(dwarf::DW_FORM_data16);
}
if (HasSource) {
- MCOS->EmitULEB128IntValue(dwarf::DW_LNCT_LLVM_source);
- MCOS->EmitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
+ MCOS->emitULEB128IntValue(dwarf::DW_LNCT_LLVM_source);
+ MCOS->emitULEB128IntValue(LineStr ? dwarf::DW_FORM_line_strp
: dwarf::DW_FORM_string);
}
// Then the counted list of files. The root file is file #0, then emit the
// files as provided by .file directives.
// MCDwarfFiles has an unused element [0] so use size() not size()+1.
// But sometimes MCDwarfFiles is empty, in which case we still emit one file.
- MCOS->EmitULEB128IntValue(MCDwarfFiles.empty() ? 1 : MCDwarfFiles.size());
+ MCOS->emitULEB128IntValue(MCDwarfFiles.empty() ? 1 : MCDwarfFiles.size());
// To accommodate assembler source written for DWARF v4 but trying to emit
// v5: If we didn't see a root file explicitly, replicate file #1.
assert((!RootFile.Name.empty() || MCDwarfFiles.size() >= 1) &&
@@ -455,56 +474,66 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
if (!LineStartSym)
LineStartSym = context.createTempSymbol();
// Set the value of the symbol, as we are at the start of the line table.
- MCOS->EmitLabel(LineStartSym);
+ MCOS->emitLabel(LineStartSym);
// Create a symbol for the end of the section (to be set when we get there).
MCSymbol *LineEndSym = context.createTempSymbol();
- // The first 4 bytes is the total length of the information for this
- // compilation unit (not including these 4 bytes for the length).
+ unsigned UnitLengthBytes =
+ dwarf::getUnitLengthFieldByteSize(context.getDwarfFormat());
+ unsigned OffsetSize = dwarf::getDwarfOffsetByteSize(context.getDwarfFormat());
+
+ if (context.getDwarfFormat() == dwarf::DWARF64)
+ // Emit DWARF64 mark.
+ MCOS->emitInt32(dwarf::DW_LENGTH_DWARF64);
+
+ // The length field does not include itself and, in case of the 64-bit DWARF
+ // format, the DWARF64 mark.
emitAbsValue(*MCOS,
- MakeStartMinusEndExpr(*MCOS, *LineStartSym, *LineEndSym, 4), 4);
+ makeEndMinusStartExpr(context, *LineStartSym, *LineEndSym,
+ UnitLengthBytes),
+ OffsetSize);
// Next 2 bytes is the Version.
unsigned LineTableVersion = context.getDwarfVersion();
- MCOS->EmitIntValue(LineTableVersion, 2);
+ MCOS->emitInt16(LineTableVersion);
// Keep track of the bytes between the very start and where the header length
// comes out.
- unsigned PreHeaderLengthBytes = 4 + 2;
+ unsigned PreHeaderLengthBytes = UnitLengthBytes + 2;
// In v5, we get address info next.
if (LineTableVersion >= 5) {
- MCOS->EmitIntValue(context.getAsmInfo()->getCodePointerSize(), 1);
- MCOS->EmitIntValue(0, 1); // Segment selector; same as EmitGenDwarfAranges.
+ MCOS->emitInt8(context.getAsmInfo()->getCodePointerSize());
+ MCOS->emitInt8(0); // Segment selector; same as EmitGenDwarfAranges.
PreHeaderLengthBytes += 2;
}
// Create a symbol for the end of the prologue (to be set when we get there).
MCSymbol *ProEndSym = context.createTempSymbol(); // Lprologue_end
- // Length of the prologue, is the next 4 bytes. This is actually the length
- // from after the length word, to the end of the prologue.
+ // The length of the prologue is the next 4 bytes (8 bytes for DWARF64). This
+ // is actually the length from after the length word to the end of the prologue.
emitAbsValue(*MCOS,
- MakeStartMinusEndExpr(*MCOS, *LineStartSym, *ProEndSym,
- (PreHeaderLengthBytes + 4)),
- 4);
+ makeEndMinusStartExpr(context, *LineStartSym, *ProEndSym,
+ (PreHeaderLengthBytes + OffsetSize)),
+ OffsetSize);
// Parameters of the state machine, are next.
- MCOS->EmitIntValue(context.getAsmInfo()->getMinInstAlignment(), 1);
+ MCOS->emitInt8(context.getAsmInfo()->getMinInstAlignment());
// maximum_operations_per_instruction
// For non-VLIW architectures this field is always 1.
// FIXME: VLIW architectures need to update this field accordingly.
if (LineTableVersion >= 4)
- MCOS->EmitIntValue(1, 1);
- MCOS->EmitIntValue(DWARF2_LINE_DEFAULT_IS_STMT, 1);
- MCOS->EmitIntValue(Params.DWARF2LineBase, 1);
- MCOS->EmitIntValue(Params.DWARF2LineRange, 1);
- MCOS->EmitIntValue(StandardOpcodeLengths.size() + 1, 1);
+ MCOS->emitInt8(1);
+ MCOS->emitInt8(DWARF2_LINE_DEFAULT_IS_STMT);
+ MCOS->emitInt8(Params.DWARF2LineBase);
+ MCOS->emitInt8(Params.DWARF2LineRange);
+ MCOS->emitInt8(StandardOpcodeLengths.size() + 1);
// Standard opcode lengths
for (char Length : StandardOpcodeLengths)
- MCOS->EmitIntValue(Length, 1);
+ MCOS->emitInt8(Length);
// Put out the directory and file tables. The formats vary depending on
// the version.
@@ -515,7 +544,7 @@ MCDwarfLineTableHeader::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
// This is the end of the prologue, so set the value of the symbol at the
// end of the prologue (that was used in a previous expression).
- MCOS->EmitLabel(ProEndSym);
+ MCOS->emitLabel(ProEndSym);
return std::make_pair(LineStartSym, LineEndSym);
}
@@ -527,11 +556,11 @@ void MCDwarfLineTable::EmitCU(MCObjectStreamer *MCOS,
// Put out the line tables.
for (const auto &LineSec : MCLineSections.getMCLineEntries())
- EmitDwarfLineTable(MCOS, LineSec.first, LineSec.second);
+ emitDwarfLineTable(MCOS, LineSec.first, LineSec.second);
// This is the end of the section, so set the value of the symbol at the end
// of this section (that was used in a previous expression).
- MCOS->EmitLabel(LineEndSym);
+ MCOS->emitLabel(LineEndSym);
}
Expected<unsigned> MCDwarfLineTable::tryGetFile(StringRef &Directory,
@@ -620,7 +649,7 @@ MCDwarfLineTableHeader::tryGetFile(StringRef &Directory,
} else {
DirIndex = llvm::find(MCDwarfDirs, Directory) - MCDwarfDirs.begin();
if (DirIndex >= MCDwarfDirs.size())
- MCDwarfDirs.push_back(Directory);
+ MCDwarfDirs.push_back(std::string(Directory));
// The DirIndex is one based, as DirIndex of 0 is used for FileNames with
// no directories. MCDwarfDirs[] is unlike MCDwarfFiles[] in that the
// directory names are stored at MCDwarfDirs[DirIndex-1] where FileNames
@@ -628,7 +657,7 @@ MCDwarfLineTableHeader::tryGetFile(StringRef &Directory,
DirIndex++;
}
- File.Name = FileName;
+ File.Name = std::string(FileName);
File.DirIndex = DirIndex;
File.Checksum = Checksum;
trackMD5Usage(Checksum.hasValue());
@@ -647,7 +676,7 @@ void MCDwarfLineAddr::Emit(MCStreamer *MCOS, MCDwarfLineTableParams Params,
SmallString<256> Tmp;
raw_svector_ostream OS(Tmp);
MCDwarfLineAddr::Encode(Context, Params, LineDelta, AddrDelta, OS);
- MCOS->EmitBytes(OS.str());
+ MCOS->emitBytes(OS.str());
}
/// Given a special op, return the address skip amount (in units of
@@ -790,8 +819,8 @@ bool MCDwarfLineAddr::FixedEncode(MCContext &Context,
// Utility function to write a tuple for .debug_abbrev.
static void EmitAbbrev(MCStreamer *MCOS, uint64_t Name, uint64_t Form) {
- MCOS->EmitULEB128IntValue(Name);
- MCOS->EmitULEB128IntValue(Form);
+ MCOS->emitULEB128IntValue(Name);
+ MCOS->emitULEB128IntValue(Form);
}
// When generating dwarf for assembly source files this emits
@@ -801,17 +830,18 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection());
// DW_TAG_compile_unit DIE abbrev (1).
- MCOS->EmitULEB128IntValue(1);
- MCOS->EmitULEB128IntValue(dwarf::DW_TAG_compile_unit);
- MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
- EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, context.getDwarfVersion() >= 4
- ? dwarf::DW_FORM_sec_offset
- : dwarf::DW_FORM_data4);
+ MCOS->emitULEB128IntValue(1);
+ MCOS->emitULEB128IntValue(dwarf::DW_TAG_compile_unit);
+ MCOS->emitInt8(dwarf::DW_CHILDREN_yes);
+ dwarf::Form SecOffsetForm =
+ context.getDwarfVersion() >= 4
+ ? dwarf::DW_FORM_sec_offset
+ : (context.getDwarfFormat() == dwarf::DWARF64 ? dwarf::DW_FORM_data8
+ : dwarf::DW_FORM_data4);
+ EmitAbbrev(MCOS, dwarf::DW_AT_stmt_list, SecOffsetForm);
if (context.getGenDwarfSectionSyms().size() > 1 &&
context.getDwarfVersion() >= 3) {
- EmitAbbrev(MCOS, dwarf::DW_AT_ranges, context.getDwarfVersion() >= 4
- ? dwarf::DW_FORM_sec_offset
- : dwarf::DW_FORM_data4);
+ EmitAbbrev(MCOS, dwarf::DW_AT_ranges, SecOffsetForm);
} else {
EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
EmitAbbrev(MCOS, dwarf::DW_AT_high_pc, dwarf::DW_FORM_addr);
@@ -827,24 +857,17 @@ static void EmitGenDwarfAbbrev(MCStreamer *MCOS) {
EmitAbbrev(MCOS, 0, 0);
// DW_TAG_label DIE abbrev (2).
- MCOS->EmitULEB128IntValue(2);
- MCOS->EmitULEB128IntValue(dwarf::DW_TAG_label);
- MCOS->EmitIntValue(dwarf::DW_CHILDREN_yes, 1);
+ MCOS->emitULEB128IntValue(2);
+ MCOS->emitULEB128IntValue(dwarf::DW_TAG_label);
+ MCOS->emitInt8(dwarf::DW_CHILDREN_no);
EmitAbbrev(MCOS, dwarf::DW_AT_name, dwarf::DW_FORM_string);
EmitAbbrev(MCOS, dwarf::DW_AT_decl_file, dwarf::DW_FORM_data4);
EmitAbbrev(MCOS, dwarf::DW_AT_decl_line, dwarf::DW_FORM_data4);
EmitAbbrev(MCOS, dwarf::DW_AT_low_pc, dwarf::DW_FORM_addr);
- EmitAbbrev(MCOS, dwarf::DW_AT_prototyped, dwarf::DW_FORM_flag);
- EmitAbbrev(MCOS, 0, 0);
-
- // DW_TAG_unspecified_parameters DIE abbrev (3).
- MCOS->EmitULEB128IntValue(3);
- MCOS->EmitULEB128IntValue(dwarf::DW_TAG_unspecified_parameters);
- MCOS->EmitIntValue(dwarf::DW_CHILDREN_no, 1);
EmitAbbrev(MCOS, 0, 0);
// Terminate the abbreviations for this compilation unit.
- MCOS->EmitIntValue(0, 1);
+ MCOS->emitInt8(0);
}
// When generating dwarf for assembly source files this emits the data for
@@ -859,9 +882,13 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
+ unsigned UnitLengthBytes =
+ dwarf::getUnitLengthFieldByteSize(context.getDwarfFormat());
+ unsigned OffsetSize = dwarf::getDwarfOffsetByteSize(context.getDwarfFormat());
+
// This will be the length of the .debug_aranges section; first account for
// the size of each item in the header (see below where we emit these items).
- int Length = 4 + 2 + 4 + 1 + 1;
+ int Length = UnitLengthBytes + 2 + OffsetSize + 1 + 1;
// Figure the padding after the header before the table of address and size
// pairs whose values are PointerSize'ed.
@@ -879,24 +906,28 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
Length += 2 * AddrSize;
// Emit the header for this section.
- // The 4 byte length not including the 4 byte value for the length.
- MCOS->EmitIntValue(Length - 4, 4);
+ if (context.getDwarfFormat() == dwarf::DWARF64)
+ // The DWARF64 mark.
+ MCOS->emitInt32(dwarf::DW_LENGTH_DWARF64);
+ // The 4 (8 for DWARF64) byte length not including the length of the unit
+ // length field itself.
+ MCOS->emitIntValue(Length - UnitLengthBytes, OffsetSize);
// The 2 byte version, which is 2.
- MCOS->EmitIntValue(2, 2);
- // The 4 byte offset to the compile unit in the .debug_info from the start
- // of the .debug_info.
+ MCOS->emitInt16(2);
+ // The 4 (8 for DWARF64) byte offset to the compile unit in the .debug_info
+ // from the start of the .debug_info.
if (InfoSectionSymbol)
- MCOS->EmitSymbolValue(InfoSectionSymbol, 4,
+ MCOS->emitSymbolValue(InfoSectionSymbol, OffsetSize,
asmInfo->needsDwarfSectionOffsetDirective());
else
- MCOS->EmitIntValue(0, 4);
+ MCOS->emitIntValue(0, OffsetSize);
// The 1 byte size of an address.
- MCOS->EmitIntValue(AddrSize, 1);
+ MCOS->emitInt8(AddrSize);
// The 1 byte size of a segment descriptor, we use a value of zero.
- MCOS->EmitIntValue(0, 1);
+ MCOS->emitInt8(0);
// Align the header with the padding if needed, before we put out the table.
for(int i = 0; i < Pad; i++)
- MCOS->EmitIntValue(0, 1);
+ MCOS->emitInt8(0);
// Now emit the table of pairs of PointerSize'ed values for the section
// addresses and sizes.
@@ -908,15 +939,15 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
const MCExpr *Addr = MCSymbolRefExpr::create(
StartSymbol, MCSymbolRefExpr::VK_None, context);
- const MCExpr *Size = MakeStartMinusEndExpr(*MCOS,
- *StartSymbol, *EndSymbol, 0);
- MCOS->EmitValue(Addr, AddrSize);
+ const MCExpr *Size =
+ makeEndMinusStartExpr(context, *StartSymbol, *EndSymbol, 0);
+ MCOS->emitValue(Addr, AddrSize);
emitAbsValue(*MCOS, Size, AddrSize);
}
// And finally the pair of terminating zeros.
- MCOS->EmitIntValue(0, AddrSize);
- MCOS->EmitIntValue(0, AddrSize);
+ MCOS->emitIntValue(0, AddrSize);
+ MCOS->emitIntValue(0, AddrSize);
}
// When generating dwarf for assembly source files this emits the data for
@@ -925,7 +956,7 @@ static void EmitGenDwarfAranges(MCStreamer *MCOS,
static void EmitGenDwarfInfo(MCStreamer *MCOS,
const MCSymbol *AbbrevSectionSymbol,
const MCSymbol *LineSectionSymbol,
- const MCSymbol *RangesSectionSymbol) {
+ const MCSymbol *RangesSymbol) {
MCContext &context = MCOS->getContext();
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
@@ -933,57 +964,66 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
// Create a symbol at the start and end of this section used in here for the
// expression to calculate the length in the header.
MCSymbol *InfoStart = context.createTempSymbol();
- MCOS->EmitLabel(InfoStart);
+ MCOS->emitLabel(InfoStart);
MCSymbol *InfoEnd = context.createTempSymbol();
// First part: the header.
- // The 4 byte total length of the information for this compilation unit, not
- // including these 4 bytes.
- const MCExpr *Length = MakeStartMinusEndExpr(*MCOS, *InfoStart, *InfoEnd, 4);
- emitAbsValue(*MCOS, Length, 4);
+ unsigned UnitLengthBytes =
+ dwarf::getUnitLengthFieldByteSize(context.getDwarfFormat());
+ unsigned OffsetSize = dwarf::getDwarfOffsetByteSize(context.getDwarfFormat());
+
+ if (context.getDwarfFormat() == dwarf::DWARF64)
+ // Emit DWARF64 mark.
+ MCOS->emitInt32(dwarf::DW_LENGTH_DWARF64);
+
+ // The 4 (8 for DWARF64) byte total length of the information for this
+ // compilation unit, not including the unit length field itself.
+ const MCExpr *Length =
+ makeEndMinusStartExpr(context, *InfoStart, *InfoEnd, UnitLengthBytes);
+ emitAbsValue(*MCOS, Length, OffsetSize);
// The 2 byte DWARF version.
- MCOS->EmitIntValue(context.getDwarfVersion(), 2);
+ MCOS->emitInt16(context.getDwarfVersion());
// The DWARF v5 header has unit type, address size, abbrev offset.
// Earlier versions have abbrev offset, address size.
const MCAsmInfo &AsmInfo = *context.getAsmInfo();
int AddrSize = AsmInfo.getCodePointerSize();
if (context.getDwarfVersion() >= 5) {
- MCOS->EmitIntValue(dwarf::DW_UT_compile, 1);
- MCOS->EmitIntValue(AddrSize, 1);
+ MCOS->emitInt8(dwarf::DW_UT_compile);
+ MCOS->emitInt8(AddrSize);
}
- // The 4 byte offset to the debug abbrevs from the start of the .debug_abbrev,
- // it is at the start of that section so this is zero.
- if (AbbrevSectionSymbol == nullptr)
- MCOS->EmitIntValue(0, 4);
- else
- MCOS->EmitSymbolValue(AbbrevSectionSymbol, 4,
+ // The 4 (8 for DWARF64) byte offset to the debug abbrevs from the start of
+ // the .debug_abbrev.
+ if (AbbrevSectionSymbol)
+ MCOS->emitSymbolValue(AbbrevSectionSymbol, OffsetSize,
AsmInfo.needsDwarfSectionOffsetDirective());
+ else
+ // Since the abbrevs are at the start of the section, the offset is zero.
+ MCOS->emitIntValue(0, OffsetSize);
if (context.getDwarfVersion() <= 4)
- MCOS->EmitIntValue(AddrSize, 1);
+ MCOS->emitInt8(AddrSize);
// Second part: the compile_unit DIE.
// The DW_TAG_compile_unit DIE abbrev (1).
- MCOS->EmitULEB128IntValue(1);
+ MCOS->emitULEB128IntValue(1);
- // DW_AT_stmt_list, a 4 byte offset from the start of the .debug_line section,
- // which is at the start of that section so this is zero.
+ // DW_AT_stmt_list, a 4 (8 for DWARF64) byte offset from the start of the
+ // .debug_line section.
if (LineSectionSymbol)
- MCOS->EmitSymbolValue(LineSectionSymbol, 4,
+ MCOS->emitSymbolValue(LineSectionSymbol, OffsetSize,
AsmInfo.needsDwarfSectionOffsetDirective());
else
- MCOS->EmitIntValue(0, 4);
-
- if (RangesSectionSymbol) {
- // There are multiple sections containing code, so we must use the
- // .debug_ranges sections.
-
- // AT_ranges, the 4 byte offset from the start of the .debug_ranges section
- // to the address range list for this compilation unit.
- MCOS->EmitSymbolValue(RangesSectionSymbol, 4);
+ // The line table is at the start of the section, so the offset is zero.
+ MCOS->emitIntValue(0, OffsetSize);
+
+ if (RangesSymbol) {
+ // There are multiple sections containing code, so we must use
+ // .debug_ranges/.debug_rnglists. AT_ranges, the 4/8 byte offset from the
+ // start of the .debug_ranges/.debug_rnglists.
+ MCOS->emitSymbolValue(RangesSymbol, OffsetSize);
} else {
// If we only have one non-empty code section, we can use the simpler
// AT_low_pc and AT_high_pc attributes.
@@ -1001,20 +1041,20 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
// AT_low_pc, the first address of the default .text section.
const MCExpr *Start = MCSymbolRefExpr::create(
StartSymbol, MCSymbolRefExpr::VK_None, context);
- MCOS->EmitValue(Start, AddrSize);
+ MCOS->emitValue(Start, AddrSize);
// AT_high_pc, the last address of the default .text section.
const MCExpr *End = MCSymbolRefExpr::create(
EndSymbol, MCSymbolRefExpr::VK_None, context);
- MCOS->EmitValue(End, AddrSize);
+ MCOS->emitValue(End, AddrSize);
}
// AT_name, the name of the source file. Reconstruct from the first directory
// and file table entries.
const SmallVectorImpl<std::string> &MCDwarfDirs = context.getMCDwarfDirs();
if (MCDwarfDirs.size() > 0) {
- MCOS->EmitBytes(MCDwarfDirs[0]);
- MCOS->EmitBytes(sys::path::get_separator());
+ MCOS->emitBytes(MCDwarfDirs[0]);
+ MCOS->emitBytes(sys::path::get_separator());
}
const SmallVectorImpl<MCDwarfFile> &MCDwarfFiles = context.getMCDwarfFiles();
// MCDwarfFiles might be empty if we have an empty source file.
@@ -1024,33 +1064,33 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
MCDwarfFiles.empty()
? context.getMCDwarfLineTable(/*CUID=*/0).getRootFile()
: MCDwarfFiles[1];
- MCOS->EmitBytes(RootFile.Name);
- MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+ MCOS->emitBytes(RootFile.Name);
+ MCOS->emitInt8(0); // NULL byte to terminate the string.
// AT_comp_dir, the working directory the assembly was done in.
if (!context.getCompilationDir().empty()) {
- MCOS->EmitBytes(context.getCompilationDir());
- MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+ MCOS->emitBytes(context.getCompilationDir());
+ MCOS->emitInt8(0); // NULL byte to terminate the string.
}
// AT_APPLE_flags, the command line arguments of the assembler tool.
StringRef DwarfDebugFlags = context.getDwarfDebugFlags();
if (!DwarfDebugFlags.empty()){
- MCOS->EmitBytes(DwarfDebugFlags);
- MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+ MCOS->emitBytes(DwarfDebugFlags);
+ MCOS->emitInt8(0); // NULL byte to terminate the string.
}
// AT_producer, the version of the assembler tool.
StringRef DwarfDebugProducer = context.getDwarfDebugProducer();
if (!DwarfDebugProducer.empty())
- MCOS->EmitBytes(DwarfDebugProducer);
+ MCOS->emitBytes(DwarfDebugProducer);
else
- MCOS->EmitBytes(StringRef("llvm-mc (based on LLVM " PACKAGE_VERSION ")"));
- MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+ MCOS->emitBytes(StringRef("llvm-mc (based on LLVM " PACKAGE_VERSION ")"));
+ MCOS->emitInt8(0); // NULL byte to terminate the string.
// AT_language, a 2 byte value. We use DW_LANG_Mips_Assembler as the dwarf2
// draft has no standard code for assembler.
- MCOS->EmitIntValue(dwarf::DW_LANG_Mips_Assembler, 2);
+ MCOS->emitInt16(dwarf::DW_LANG_Mips_Assembler);
// Third part: the list of label DIEs.
@@ -1059,74 +1099,89 @@ static void EmitGenDwarfInfo(MCStreamer *MCOS,
MCOS->getContext().getMCGenDwarfLabelEntries();
for (const auto &Entry : Entries) {
// The DW_TAG_label DIE abbrev (2).
- MCOS->EmitULEB128IntValue(2);
+ MCOS->emitULEB128IntValue(2);
// AT_name, of the label without any leading underbar.
- MCOS->EmitBytes(Entry.getName());
- MCOS->EmitIntValue(0, 1); // NULL byte to terminate the string.
+ MCOS->emitBytes(Entry.getName());
+ MCOS->emitInt8(0); // NULL byte to terminate the string.
// AT_decl_file, index into the file table.
- MCOS->EmitIntValue(Entry.getFileNumber(), 4);
+ MCOS->emitInt32(Entry.getFileNumber());
// AT_decl_line, source line number.
- MCOS->EmitIntValue(Entry.getLineNumber(), 4);
+ MCOS->emitInt32(Entry.getLineNumber());
// AT_low_pc, start address of the label.
const MCExpr *AT_low_pc = MCSymbolRefExpr::create(Entry.getLabel(),
MCSymbolRefExpr::VK_None, context);
- MCOS->EmitValue(AT_low_pc, AddrSize);
-
- // DW_AT_prototyped, a one byte flag value of 0 saying we have no prototype.
- MCOS->EmitIntValue(0, 1);
-
- // The DW_TAG_unspecified_parameters DIE abbrev (3).
- MCOS->EmitULEB128IntValue(3);
-
- // Add the NULL DIE terminating the DW_TAG_unspecified_parameters DIE's.
- MCOS->EmitIntValue(0, 1);
+ MCOS->emitValue(AT_low_pc, AddrSize);
}
// Add the NULL DIE terminating the Compile Unit DIE's.
- MCOS->EmitIntValue(0, 1);
+ MCOS->emitInt8(0);
// Now set the value of the symbol at the end of the info section.
- MCOS->EmitLabel(InfoEnd);
+ MCOS->emitLabel(InfoEnd);
}
// When generating dwarf for assembly source files this emits the data for
// .debug_ranges section. We only emit one range list, which spans all of the
// executable sections of this file.
-static void EmitGenDwarfRanges(MCStreamer *MCOS) {
+static MCSymbol *emitGenDwarfRanges(MCStreamer *MCOS) {
MCContext &context = MCOS->getContext();
auto &Sections = context.getGenDwarfSectionSyms();
const MCAsmInfo *AsmInfo = context.getAsmInfo();
int AddrSize = AsmInfo->getCodePointerSize();
+ MCSymbol *RangesSymbol;
+
+ if (MCOS->getContext().getDwarfVersion() >= 5) {
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRnglistsSection());
+ MCSymbol *EndSymbol = mcdwarf::emitListsTableHeaderStart(*MCOS);
+ MCOS->AddComment("Offset entry count");
+ MCOS->emitInt32(0);
+ RangesSymbol = context.createTempSymbol("debug_rnglist0_start", true, true);
+ MCOS->emitLabel(RangesSymbol);
+ for (MCSection *Sec : Sections) {
+ const MCSymbol *StartSymbol = Sec->getBeginSymbol();
+ const MCSymbol *EndSymbol = Sec->getEndSymbol(context);
+ const MCExpr *SectionStartAddr = MCSymbolRefExpr::create(
+ StartSymbol, MCSymbolRefExpr::VK_None, context);
+ const MCExpr *SectionSize =
+ makeEndMinusStartExpr(context, *StartSymbol, *EndSymbol, 0);
+ MCOS->emitInt8(dwarf::DW_RLE_start_length);
+ MCOS->emitValue(SectionStartAddr, AddrSize);
+ MCOS->emitULEB128Value(SectionSize);
+ }
+ MCOS->emitInt8(dwarf::DW_RLE_end_of_list);
+ MCOS->emitLabel(EndSymbol);
+ } else {
+ MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection());
+ RangesSymbol = context.createTempSymbol("debug_ranges_start", true, true);
+ MCOS->emitLabel(RangesSymbol);
+ for (MCSection *Sec : Sections) {
+ const MCSymbol *StartSymbol = Sec->getBeginSymbol();
+ const MCSymbol *EndSymbol = Sec->getEndSymbol(context);
+
+ // Emit a base address selection entry for the section start.
+ const MCExpr *SectionStartAddr = MCSymbolRefExpr::create(
+ StartSymbol, MCSymbolRefExpr::VK_None, context);
+ MCOS->emitFill(AddrSize, 0xFF);
+ MCOS->emitValue(SectionStartAddr, AddrSize);
+
+ // Emit a range list entry spanning this section.
+ const MCExpr *SectionSize =
+ makeEndMinusStartExpr(context, *StartSymbol, *EndSymbol, 0);
+ MCOS->emitIntValue(0, AddrSize);
+ emitAbsValue(*MCOS, SectionSize, AddrSize);
+ }
- MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection());
-
- for (MCSection *Sec : Sections) {
- const MCSymbol *StartSymbol = Sec->getBeginSymbol();
- MCSymbol *EndSymbol = Sec->getEndSymbol(context);
- assert(StartSymbol && "StartSymbol must not be NULL");
- assert(EndSymbol && "EndSymbol must not be NULL");
-
- // Emit a base address selection entry for the start of this section
- const MCExpr *SectionStartAddr = MCSymbolRefExpr::create(
- StartSymbol, MCSymbolRefExpr::VK_None, context);
- MCOS->emitFill(AddrSize, 0xFF);
- MCOS->EmitValue(SectionStartAddr, AddrSize);
-
- // Emit a range list entry spanning this section
- const MCExpr *SectionSize = MakeStartMinusEndExpr(*MCOS,
- *StartSymbol, *EndSymbol, 0);
- MCOS->EmitIntValue(0, AddrSize);
- emitAbsValue(*MCOS, SectionSize, AddrSize);
+ // Emit end of list entry
+ MCOS->emitIntValue(0, AddrSize);
+ MCOS->emitIntValue(0, AddrSize);
}
- // Emit end of list entry
- MCOS->EmitIntValue(0, AddrSize);
- MCOS->EmitIntValue(0, AddrSize);
+ return RangesSymbol;
}
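For reference, here is a toy encoder for the DWARF 5 entry kind emitted in the v5 branch above (DW_RLE_start_length: a one-byte opcode, a full-size start address, then the range length as a ULEB128). It is a sketch only; a little-endian target and the DWARF 5 opcode value 0x07 are assumed:

#include <cstdint>
#include <vector>

// Append V as an unsigned LEB128.
static void appendULEB128(std::vector<uint8_t> &Out, uint64_t V) {
  do {
    uint8_t Byte = V & 0x7f;
    V >>= 7;
    if (V)
      Byte |= 0x80;
    Out.push_back(Byte);
  } while (V);
}

// Encode one DW_RLE_start_length entry (opcode 0x07 in DWARF 5).
std::vector<uint8_t> encodeStartLength(uint64_t Start, uint64_t Size,
                                       unsigned AddrSize) {
  std::vector<uint8_t> Out;
  Out.push_back(0x07);                        // DW_RLE_start_length
  for (unsigned I = 0; I < AddrSize; ++I)
    Out.push_back(uint8_t(Start >> (8 * I))); // start address, little-endian
  appendULEB128(Out, Size);                   // length of the range
  return Out;
}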
//
@@ -1145,7 +1200,7 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
LineSectionSymbol = MCOS->getDwarfLineTableSymbol(0);
MCSymbol *AbbrevSectionSymbol = nullptr;
MCSymbol *InfoSectionSymbol = nullptr;
- MCSymbol *RangesSectionSymbol = nullptr;
+ MCSymbol *RangesSymbol = nullptr;
// Create end symbols for each section, and remove empty sections
MCOS->getContext().finalizeDwarfSections(*MCOS);
@@ -1165,37 +1220,29 @@ void MCGenDwarfInfo::Emit(MCStreamer *MCOS) {
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfInfoSection());
if (CreateDwarfSectionSymbols) {
InfoSectionSymbol = context.createTempSymbol();
- MCOS->EmitLabel(InfoSectionSymbol);
+ MCOS->emitLabel(InfoSectionSymbol);
}
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfAbbrevSection());
if (CreateDwarfSectionSymbols) {
AbbrevSectionSymbol = context.createTempSymbol();
- MCOS->EmitLabel(AbbrevSectionSymbol);
+ MCOS->emitLabel(AbbrevSectionSymbol);
}
- if (UseRangesSection) {
- MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfRangesSection());
- if (CreateDwarfSectionSymbols) {
- RangesSectionSymbol = context.createTempSymbol();
- MCOS->EmitLabel(RangesSectionSymbol);
- }
- }
-
- assert((RangesSectionSymbol != nullptr) || !UseRangesSection);
MCOS->SwitchSection(context.getObjectFileInfo()->getDwarfARangesSection());
// Output the data for .debug_aranges section.
EmitGenDwarfAranges(MCOS, InfoSectionSymbol);
- if (UseRangesSection)
- EmitGenDwarfRanges(MCOS);
+ if (UseRangesSection) {
+ RangesSymbol = emitGenDwarfRanges(MCOS);
+ assert(RangesSymbol);
+ }
// Output the data for .debug_abbrev section.
EmitGenDwarfAbbrev(MCOS);
// Output the data for .debug_info section.
- EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol,
- RangesSectionSymbol);
+ EmitGenDwarfInfo(MCOS, AbbrevSectionSymbol, LineSectionSymbol, RangesSymbol);
}
//
@@ -1234,7 +1281,7 @@ void MCGenDwarfLabelEntry::Make(MCSymbol *Symbol, MCStreamer *MCOS,
// original symbol. So when used they won't get a low bit set after
// relocation.
MCSymbol *Label = context.createTempSymbol();
- MCOS->EmitLabel(Label);
+ MCOS->emitLabel(Label);
// Create an entry for the info and add it to the other entries.
MCOS->getContext().addMCGenDwarfLabelEntry(
@@ -1283,7 +1330,7 @@ static void emitFDESymbol(MCObjectStreamer &streamer, const MCSymbol &symbol,
if (asmInfo->doDwarfFDESymbolsUseAbsDiff() && isEH)
emitAbsValue(streamer, v, size);
else
- streamer.EmitValue(v, size);
+ streamer.emitValue(v, size);
}
static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
@@ -1294,7 +1341,7 @@ static void EmitPersonality(MCStreamer &streamer, const MCSymbol &symbol,
symbolEncoding,
streamer);
unsigned size = getSizeForEncoding(streamer, symbolEncoding);
- streamer.EmitValue(v, size);
+ streamer.emitValue(v, size);
}
namespace {
@@ -1315,18 +1362,18 @@ public:
const MCSymbol &EmitCIE(const MCDwarfFrameInfo &F);
void EmitFDE(const MCSymbol &cieStart, const MCDwarfFrameInfo &frame,
bool LastInSection, const MCSymbol &SectionStart);
- void EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs,
+ void emitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs,
MCSymbol *BaseLabel);
- void EmitCFIInstruction(const MCCFIInstruction &Instr);
+ void emitCFIInstruction(const MCCFIInstruction &Instr);
};
} // end anonymous namespace
static void emitEncodingByte(MCObjectStreamer &Streamer, unsigned Encoding) {
- Streamer.EmitIntValue(Encoding, 1);
+ Streamer.emitInt8(Encoding);
}
-void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
+void FrameEmitterImpl::emitCFIInstruction(const MCCFIInstruction &Instr) {
int dataAlignmentFactor = getDataAlignmentFactor(Streamer);
auto *MRI = Streamer.getContext().getRegisterInfo();
@@ -1338,23 +1385,23 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Reg1 = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg1);
Reg2 = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg2);
}
- Streamer.EmitIntValue(dwarf::DW_CFA_register, 1);
- Streamer.EmitULEB128IntValue(Reg1);
- Streamer.EmitULEB128IntValue(Reg2);
+ Streamer.emitInt8(dwarf::DW_CFA_register);
+ Streamer.emitULEB128IntValue(Reg1);
+ Streamer.emitULEB128IntValue(Reg2);
return;
}
case MCCFIInstruction::OpWindowSave:
- Streamer.EmitIntValue(dwarf::DW_CFA_GNU_window_save, 1);
+ Streamer.emitInt8(dwarf::DW_CFA_GNU_window_save);
return;
case MCCFIInstruction::OpNegateRAState:
- Streamer.EmitIntValue(dwarf::DW_CFA_AARCH64_negate_ra_state, 1);
+ Streamer.emitInt8(dwarf::DW_CFA_AARCH64_negate_ra_state);
return;
case MCCFIInstruction::OpUndefined: {
unsigned Reg = Instr.getRegister();
- Streamer.EmitIntValue(dwarf::DW_CFA_undefined, 1);
- Streamer.EmitULEB128IntValue(Reg);
+ Streamer.emitInt8(dwarf::DW_CFA_undefined);
+ Streamer.emitULEB128IntValue(Reg);
return;
}
case MCCFIInstruction::OpAdjustCfaOffset:
@@ -1362,14 +1409,14 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
const bool IsRelative =
Instr.getOperation() == MCCFIInstruction::OpAdjustCfaOffset;
- Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_offset, 1);
+ Streamer.emitInt8(dwarf::DW_CFA_def_cfa_offset);
if (IsRelative)
CFAOffset += Instr.getOffset();
else
- CFAOffset = -Instr.getOffset();
+ CFAOffset = Instr.getOffset();
- Streamer.EmitULEB128IntValue(CFAOffset);
+ Streamer.emitULEB128IntValue(CFAOffset);
return;
}
@@ -1377,10 +1424,10 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
unsigned Reg = Instr.getRegister();
if (!IsEH)
Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
- Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa, 1);
- Streamer.EmitULEB128IntValue(Reg);
- CFAOffset = -Instr.getOffset();
- Streamer.EmitULEB128IntValue(CFAOffset);
+ Streamer.emitInt8(dwarf::DW_CFA_def_cfa);
+ Streamer.emitULEB128IntValue(Reg);
+ CFAOffset = Instr.getOffset();
+ Streamer.emitULEB128IntValue(CFAOffset);
return;
}
@@ -1388,8 +1435,8 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
unsigned Reg = Instr.getRegister();
if (!IsEH)
Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
- Streamer.EmitIntValue(dwarf::DW_CFA_def_cfa_register, 1);
- Streamer.EmitULEB128IntValue(Reg);
+ Streamer.emitInt8(dwarf::DW_CFA_def_cfa_register);
+ Streamer.emitULEB128IntValue(Reg);
return;
}
@@ -1408,29 +1455,29 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
Offset = Offset / dataAlignmentFactor;
if (Offset < 0) {
- Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended_sf, 1);
- Streamer.EmitULEB128IntValue(Reg);
- Streamer.EmitSLEB128IntValue(Offset);
+ Streamer.emitInt8(dwarf::DW_CFA_offset_extended_sf);
+ Streamer.emitULEB128IntValue(Reg);
+ Streamer.emitSLEB128IntValue(Offset);
} else if (Reg < 64) {
- Streamer.EmitIntValue(dwarf::DW_CFA_offset + Reg, 1);
- Streamer.EmitULEB128IntValue(Offset);
+ Streamer.emitInt8(dwarf::DW_CFA_offset + Reg);
+ Streamer.emitULEB128IntValue(Offset);
} else {
- Streamer.EmitIntValue(dwarf::DW_CFA_offset_extended, 1);
- Streamer.EmitULEB128IntValue(Reg);
- Streamer.EmitULEB128IntValue(Offset);
+ Streamer.emitInt8(dwarf::DW_CFA_offset_extended);
+ Streamer.emitULEB128IntValue(Reg);
+ Streamer.emitULEB128IntValue(Offset);
}
return;
}
case MCCFIInstruction::OpRememberState:
- Streamer.EmitIntValue(dwarf::DW_CFA_remember_state, 1);
+ Streamer.emitInt8(dwarf::DW_CFA_remember_state);
return;
case MCCFIInstruction::OpRestoreState:
- Streamer.EmitIntValue(dwarf::DW_CFA_restore_state, 1);
+ Streamer.emitInt8(dwarf::DW_CFA_restore_state);
return;
case MCCFIInstruction::OpSameValue: {
unsigned Reg = Instr.getRegister();
- Streamer.EmitIntValue(dwarf::DW_CFA_same_value, 1);
- Streamer.EmitULEB128IntValue(Reg);
+ Streamer.emitInt8(dwarf::DW_CFA_same_value);
+ Streamer.emitULEB128IntValue(Reg);
return;
}
case MCCFIInstruction::OpRestore: {
@@ -1438,27 +1485,27 @@ void FrameEmitterImpl::EmitCFIInstruction(const MCCFIInstruction &Instr) {
if (!IsEH)
Reg = MRI->getDwarfRegNumFromDwarfEHRegNum(Reg);
if (Reg < 64) {
- Streamer.EmitIntValue(dwarf::DW_CFA_restore | Reg, 1);
+ Streamer.emitInt8(dwarf::DW_CFA_restore | Reg);
} else {
- Streamer.EmitIntValue(dwarf::DW_CFA_restore_extended, 1);
- Streamer.EmitULEB128IntValue(Reg);
+ Streamer.emitInt8(dwarf::DW_CFA_restore_extended);
+ Streamer.emitULEB128IntValue(Reg);
}
return;
}
case MCCFIInstruction::OpGnuArgsSize:
- Streamer.EmitIntValue(dwarf::DW_CFA_GNU_args_size, 1);
- Streamer.EmitULEB128IntValue(Instr.getOffset());
+ Streamer.emitInt8(dwarf::DW_CFA_GNU_args_size);
+ Streamer.emitULEB128IntValue(Instr.getOffset());
return;
case MCCFIInstruction::OpEscape:
- Streamer.EmitBytes(Instr.getValues());
+ Streamer.emitBytes(Instr.getValues());
return;
}
llvm_unreachable("Unhandled case in switch");
}
/// Emit frame instructions to describe the layout of the frame.
-void FrameEmitterImpl::EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs,
+void FrameEmitterImpl::emitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs,
MCSymbol *BaseLabel) {
for (const MCCFIInstruction &Instr : Instrs) {
MCSymbol *Label = Instr.getLabel();
@@ -1469,12 +1516,12 @@ void FrameEmitterImpl::EmitCFIInstructions(ArrayRef<MCCFIInstruction> Instrs,
if (BaseLabel && Label) {
MCSymbol *ThisSym = Label;
if (ThisSym != BaseLabel) {
- Streamer.EmitDwarfAdvanceFrameAddr(BaseLabel, ThisSym);
+ Streamer.emitDwarfAdvanceFrameAddr(BaseLabel, ThisSym);
BaseLabel = ThisSym;
}
}
- EmitCFIInstruction(Instr);
+ emitCFIInstruction(Instr);
}
}
@@ -1516,30 +1563,30 @@ void FrameEmitterImpl::EmitCompactUnwind(const MCDwarfFrameInfo &Frame) {
// Range Start
unsigned FDEEncoding = MOFI->getFDEEncoding();
unsigned Size = getSizeForEncoding(Streamer, FDEEncoding);
- Streamer.EmitSymbolValue(Frame.Begin, Size);
+ Streamer.emitSymbolValue(Frame.Begin, Size);
// Range Length
- const MCExpr *Range = MakeStartMinusEndExpr(Streamer, *Frame.Begin,
- *Frame.End, 0);
+ const MCExpr *Range =
+ makeEndMinusStartExpr(Context, *Frame.Begin, *Frame.End, 0);
emitAbsValue(Streamer, Range, 4);
// Compact Encoding
Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_udata4);
- Streamer.EmitIntValue(Encoding, Size);
+ Streamer.emitIntValue(Encoding, Size);
// Personality Function
Size = getSizeForEncoding(Streamer, dwarf::DW_EH_PE_absptr);
if (!DwarfEHFrameOnly && Frame.Personality)
- Streamer.EmitSymbolValue(Frame.Personality, Size);
+ Streamer.emitSymbolValue(Frame.Personality, Size);
else
- Streamer.EmitIntValue(0, Size); // No personality fn
+ Streamer.emitIntValue(0, Size); // No personality fn
// LSDA
Size = getSizeForEncoding(Streamer, Frame.LsdaEncoding);
if (!DwarfEHFrameOnly && Frame.Lsda)
- Streamer.EmitSymbolValue(Frame.Lsda, Size);
+ Streamer.emitSymbolValue(Frame.Lsda, Size);
else
- Streamer.EmitIntValue(0, Size); // No LSDA
+ Streamer.emitIntValue(0, Size); // No LSDA
}
static unsigned getCIEVersion(bool IsEH, unsigned DwarfVersion) {
@@ -1563,22 +1610,32 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCDwarfFrameInfo &Frame) {
const MCObjectFileInfo *MOFI = context.getObjectFileInfo();
MCSymbol *sectionStart = context.createTempSymbol();
- Streamer.EmitLabel(sectionStart);
+ Streamer.emitLabel(sectionStart);
MCSymbol *sectionEnd = context.createTempSymbol();
+ dwarf::DwarfFormat Format = IsEH ? dwarf::DWARF32 : context.getDwarfFormat();
+ unsigned UnitLengthBytes = dwarf::getUnitLengthFieldByteSize(Format);
+ unsigned OffsetSize = dwarf::getDwarfOffsetByteSize(Format);
+ bool IsDwarf64 = Format == dwarf::DWARF64;
+
+ if (IsDwarf64)
+ // DWARF64 mark
+ Streamer.emitInt32(dwarf::DW_LENGTH_DWARF64);
+
// Length
- const MCExpr *Length =
- MakeStartMinusEndExpr(Streamer, *sectionStart, *sectionEnd, 4);
- emitAbsValue(Streamer, Length, 4);
+ const MCExpr *Length = makeEndMinusStartExpr(context, *sectionStart,
+ *sectionEnd, UnitLengthBytes);
+ emitAbsValue(Streamer, Length, OffsetSize);
// CIE ID
- unsigned CIE_ID = IsEH ? 0 : -1;
- Streamer.EmitIntValue(CIE_ID, 4);
+ uint64_t CIE_ID =
+ IsEH ? 0 : (IsDwarf64 ? dwarf::DW64_CIE_ID : dwarf::DW_CIE_ID);
+ Streamer.emitIntValue(CIE_ID, OffsetSize);
// Version
uint8_t CIEVersion = getCIEVersion(IsEH, context.getDwarfVersion());
- Streamer.EmitIntValue(CIEVersion, 1);
+ Streamer.emitInt8(CIEVersion);
if (IsEH) {
SmallString<8> Augmentation;
@@ -1592,23 +1649,23 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCDwarfFrameInfo &Frame) {
Augmentation += "S";
if (Frame.IsBKeyFrame)
Augmentation += "B";
- Streamer.EmitBytes(Augmentation);
+ Streamer.emitBytes(Augmentation);
}
- Streamer.EmitIntValue(0, 1);
+ Streamer.emitInt8(0);
if (CIEVersion >= 4) {
// Address Size
- Streamer.EmitIntValue(context.getAsmInfo()->getCodePointerSize(), 1);
+ Streamer.emitInt8(context.getAsmInfo()->getCodePointerSize());
// Segment Descriptor Size
- Streamer.EmitIntValue(0, 1);
+ Streamer.emitInt8(0);
}
// Code Alignment Factor
- Streamer.EmitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment());
+ Streamer.emitULEB128IntValue(context.getAsmInfo()->getMinInstAlignment());
// Data Alignment Factor
- Streamer.EmitSLEB128IntValue(getDataAlignmentFactor(Streamer));
+ Streamer.emitSLEB128IntValue(getDataAlignmentFactor(Streamer));
// Return Address Register
unsigned RAReg = Frame.RAReg;
@@ -1618,9 +1675,9 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCDwarfFrameInfo &Frame) {
if (CIEVersion == 1) {
assert(RAReg <= 255 &&
"DWARF 2 encodes return_address_register in one byte");
- Streamer.EmitIntValue(RAReg, 1);
+ Streamer.emitInt8(RAReg);
} else {
- Streamer.EmitULEB128IntValue(RAReg);
+ Streamer.emitULEB128IntValue(RAReg);
}
// Augmentation Data Length (optional)
@@ -1638,7 +1695,7 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCDwarfFrameInfo &Frame) {
// Encoding of the FDE pointers
augmentationLength += 1;
- Streamer.EmitULEB128IntValue(augmentationLength);
+ Streamer.emitULEB128IntValue(augmentationLength);
// Augmentation Data (optional)
if (Frame.Personality) {
@@ -1661,15 +1718,15 @@ const MCSymbol &FrameEmitterImpl::EmitCIE(const MCDwarfFrameInfo &Frame) {
if (!Frame.IsSimple) {
const std::vector<MCCFIInstruction> &Instructions =
MAI->getInitialFrameState();
- EmitCFIInstructions(Instructions, nullptr);
+ emitCFIInstructions(Instructions, nullptr);
}
InitialCFAOffset = CFAOffset;
// Padding
- Streamer.EmitValueToAlignment(IsEH ? 4 : MAI->getCodePointerSize());
+ Streamer.emitValueToAlignment(IsEH ? 4 : MAI->getCodePointerSize());
- Streamer.EmitLabel(sectionEnd);
+ Streamer.emitLabel(sectionEnd);
return *sectionStart;
}
@@ -1684,24 +1741,31 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
CFAOffset = InitialCFAOffset;
+ dwarf::DwarfFormat Format = IsEH ? dwarf::DWARF32 : context.getDwarfFormat();
+ unsigned OffsetSize = dwarf::getDwarfOffsetByteSize(Format);
+
+ if (Format == dwarf::DWARF64)
+ // DWARF64 mark
+ Streamer.emitInt32(dwarf::DW_LENGTH_DWARF64);
+
// Length
- const MCExpr *Length = MakeStartMinusEndExpr(Streamer, *fdeStart, *fdeEnd, 0);
- emitAbsValue(Streamer, Length, 4);
+ const MCExpr *Length = makeEndMinusStartExpr(context, *fdeStart, *fdeEnd, 0);
+ emitAbsValue(Streamer, Length, OffsetSize);
- Streamer.EmitLabel(fdeStart);
+ Streamer.emitLabel(fdeStart);
// CIE Pointer
const MCAsmInfo *asmInfo = context.getAsmInfo();
if (IsEH) {
const MCExpr *offset =
- MakeStartMinusEndExpr(Streamer, cieStart, *fdeStart, 0);
- emitAbsValue(Streamer, offset, 4);
+ makeEndMinusStartExpr(context, cieStart, *fdeStart, 0);
+ emitAbsValue(Streamer, offset, OffsetSize);
} else if (!asmInfo->doesDwarfUseRelocationsAcrossSections()) {
const MCExpr *offset =
- MakeStartMinusEndExpr(Streamer, SectionStart, cieStart, 0);
- emitAbsValue(Streamer, offset, 4);
+ makeEndMinusStartExpr(context, SectionStart, cieStart, 0);
+ emitAbsValue(Streamer, offset, OffsetSize);
} else {
- Streamer.EmitSymbolValue(&cieStart, 4,
+ Streamer.emitSymbolValue(&cieStart, OffsetSize,
asmInfo->needsDwarfSectionOffsetDirective());
}
@@ -1713,7 +1777,7 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
// PC Range
const MCExpr *Range =
- MakeStartMinusEndExpr(Streamer, *frame.Begin, *frame.End, 0);
+ makeEndMinusStartExpr(context, *frame.Begin, *frame.End, 0);
emitAbsValue(Streamer, Range, PCSize);
if (IsEH) {
@@ -1723,7 +1787,7 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
if (frame.Lsda)
augmentationLength += getSizeForEncoding(Streamer, frame.LsdaEncoding);
- Streamer.EmitULEB128IntValue(augmentationLength);
+ Streamer.emitULEB128IntValue(augmentationLength);
// Augmentation Data
if (frame.Lsda)
@@ -1731,16 +1795,16 @@ void FrameEmitterImpl::EmitFDE(const MCSymbol &cieStart,
}
// Call Frame Instructions
- EmitCFIInstructions(frame.Instructions, frame.Begin);
+ emitCFIInstructions(frame.Instructions, frame.Begin);
// Padding
// The size of a .eh_frame section has to be a multiple of the alignment
// since a null CIE is interpreted as the end. Old systems overaligned
// .eh_frame, so we do too and account for it in the last FDE.
unsigned Align = LastInSection ? asmInfo->getCodePointerSize() : PCSize;
- Streamer.EmitValueToAlignment(Align);
+ Streamer.emitValueToAlignment(Align);
- Streamer.EmitLabel(fdeEnd);
+ Streamer.emitLabel(fdeEnd);
}
namespace {
@@ -1837,7 +1901,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
if (Frame.CompactUnwindEncoding == 0) continue;
if (!SectionEmitted) {
Streamer.SwitchSection(MOFI->getCompactUnwindSection());
- Streamer.EmitValueToAlignment(AsmInfo->getCodePointerSize());
+ Streamer.emitValueToAlignment(AsmInfo->getCodePointerSize());
SectionEmitted = true;
}
NeedsEHFrameSection |=
@@ -1855,7 +1919,7 @@ void MCDwarfFrameEmitter::Emit(MCObjectStreamer &Streamer, MCAsmBackend *MAB,
Streamer.SwitchSection(&Section);
MCSymbol *SectionStart = Context.createTempSymbol();
- Streamer.EmitLabel(SectionStart);
+ Streamer.emitLabel(SectionStart);
DenseMap<CIEKey, const MCSymbol *> CIEStarts;
@@ -1894,7 +1958,7 @@ void MCDwarfFrameEmitter::EmitAdvanceLoc(MCObjectStreamer &Streamer,
SmallString<256> Tmp;
raw_svector_ostream OS(Tmp);
MCDwarfFrameEmitter::EncodeAdvanceLoc(Context, AddrDelta, OS);
- Streamer.EmitBytes(OS.str());
+ Streamer.emitBytes(OS.str());
}
void MCDwarfFrameEmitter::EncodeAdvanceLoc(MCContext &Context,
diff --git a/llvm/lib/MC/MCELFStreamer.cpp b/llvm/lib/MC/MCELFStreamer.cpp
index 0a0c30df9c07..49d863f258bf 100644
--- a/llvm/lib/MC/MCELFStreamer.cpp
+++ b/llvm/lib/MC/MCELFStreamer.cpp
@@ -90,15 +90,15 @@ void MCELFStreamer::mergeFragment(MCDataFragment *DF,
void MCELFStreamer::InitSections(bool NoExecStack) {
MCContext &Ctx = getContext();
SwitchSection(Ctx.getObjectFileInfo()->getTextSection());
- EmitCodeAlignment(4);
+ emitCodeAlignment(4);
if (NoExecStack)
SwitchSection(Ctx.getAsmInfo()->getNonexecutableStackSection(Ctx));
}
-void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
+void MCELFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
auto *Symbol = cast<MCSymbolELF>(S);
- MCObjectStreamer::EmitLabel(Symbol, Loc);
+ MCObjectStreamer::emitLabel(Symbol, Loc);
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
@@ -106,10 +106,10 @@ void MCELFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
Symbol->setType(ELF::STT_TLS);
}
-void MCELFStreamer::EmitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment *F,
+void MCELFStreamer::emitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment *F,
uint64_t Offset) {
auto *Symbol = cast<MCSymbolELF>(S);
- MCObjectStreamer::EmitLabelAtPos(Symbol, Loc, F, Offset);
+ MCObjectStreamer::emitLabelAtPos(Symbol, Loc, F, Offset);
const MCSectionELF &Section =
static_cast<const MCSectionELF &>(*getCurrentSectionOnly());
@@ -117,7 +117,7 @@ void MCELFStreamer::EmitLabelAtPos(MCSymbol *S, SMLoc Loc, MCFragment *F,
Symbol->setType(ELF::STT_TLS);
}
-void MCELFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+void MCELFStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
// Do any generic stuff we need to do.
@@ -143,7 +143,7 @@ static void setSectionAlignmentForBundling(const MCAssembler &Assembler,
Section->setAlignment(Align(Assembler.getBundleAlignSize()));
}
-void MCELFStreamer::ChangeSection(MCSection *Section,
+void MCELFStreamer::changeSection(MCSection *Section,
const MCExpr *Subsection) {
MCSection *CurSection = getCurrentSectionOnly();
if (CurSection && isBundleLocked())
@@ -161,7 +161,7 @@ void MCELFStreamer::ChangeSection(MCSection *Section,
Asm.registerSymbol(*Section->getBeginSymbol());
}
-void MCELFStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
+void MCELFStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {
getAssembler().registerSymbol(*Symbol);
const MCExpr *Value = MCSymbolRefExpr::create(
Symbol, MCSymbolRefExpr::VK_WEAKREF, getContext());
@@ -187,7 +187,7 @@ static unsigned CombineSymbolTypes(unsigned T1, unsigned T2) {
return T2;
}
-bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
+bool MCELFStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolELF>(S);
// Adding a symbol attribute always introduces the symbol, note that an
@@ -203,6 +203,7 @@ bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
// defined.
switch (Attribute) {
case MCSA_Cold:
+ case MCSA_Extern:
case MCSA_LazyReference:
case MCSA_Reference:
case MCSA_SymbolResolver:
@@ -286,7 +287,7 @@ bool MCELFStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
return true;
}
-void MCELFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
+void MCELFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
auto *Symbol = cast<MCSymbolELF>(S);
getAssembler().registerSymbol(*Symbol);
@@ -304,9 +305,9 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
MCSectionSubPair P = getCurrentSection();
SwitchSection(&Section);
- EmitValueToAlignment(ByteAlignment, 0, 1, 0);
- EmitLabel(Symbol);
- EmitZeros(Size);
+ emitValueToAlignment(ByteAlignment, 0, 1, 0);
+ emitLabel(Symbol);
+ emitZeros(Size);
SwitchSection(P.first, P.second);
} else {
@@ -328,31 +329,31 @@ void MCELFStreamer::emitELFSymverDirective(StringRef AliasName,
getAssembler().Symvers.push_back({AliasName, Aliasee});
}
-void MCELFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
+void MCELFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
auto *Symbol = cast<MCSymbolELF>(S);
// FIXME: Should this be caught and done earlier?
getAssembler().registerSymbol(*Symbol);
Symbol->setBinding(ELF::STB_LOCAL);
Symbol->setExternal(false);
- EmitCommonSymbol(Symbol, Size, ByteAlignment);
+ emitCommonSymbol(Symbol, Size, ByteAlignment);
}
-void MCELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+void MCELFStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
if (isBundleLocked())
report_fatal_error("Emitting values inside a locked bundle is forbidden");
fixSymbolsInTLSFixups(Value);
- MCObjectStreamer::EmitValueImpl(Value, Size, Loc);
+ MCObjectStreamer::emitValueImpl(Value, Size, Loc);
}
-void MCELFStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+void MCELFStreamer::emitValueToAlignment(unsigned ByteAlignment,
int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {
if (isBundleLocked())
report_fatal_error("Emitting values inside a locked bundle is forbidden");
- MCObjectStreamer::EmitValueToAlignment(ByteAlignment, Value,
+ MCObjectStreamer::emitValueToAlignment(ByteAlignment, Value,
ValueSize, MaxBytesToEmit);
}
@@ -362,17 +363,17 @@ void MCELFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
getAssembler().CGProfile.push_back({From, To, Count});
}
-void MCELFStreamer::EmitIdent(StringRef IdentString) {
+void MCELFStreamer::emitIdent(StringRef IdentString) {
MCSection *Comment = getAssembler().getContext().getELFSection(
".comment", ELF::SHT_PROGBITS, ELF::SHF_MERGE | ELF::SHF_STRINGS, 1, "");
PushSection();
SwitchSection(Comment);
if (!SeenIdent) {
- EmitIntValue(0, 1);
+ emitInt8(0);
SeenIdent = true;
}
- EmitBytes(IdentString);
- EmitIntValue(0, 1);
+ emitBytes(IdentString);
+ emitInt8(0);
PopSection();
}
@@ -491,9 +492,9 @@ void MCELFStreamer::finalizeCGProfile() {
}
}
-void MCELFStreamer::EmitInstToFragment(const MCInst &Inst,
+void MCELFStreamer::emitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- this->MCObjectStreamer::EmitInstToFragment(Inst, STI);
+ this->MCObjectStreamer::emitInstToFragment(Inst, STI);
MCRelaxableFragment &F = *cast<MCRelaxableFragment>(getCurrentFragment());
for (unsigned i = 0, e = F.getFixups().size(); i != e; ++i)
@@ -509,7 +510,7 @@ static void CheckBundleSubtargets(const MCSubtargetInfo *OldSTI,
report_fatal_error("A Bundle can only have one Subtarget.");
}
-void MCELFStreamer::EmitInstToData(const MCInst &Inst,
+void MCELFStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCAssembler &Assembler = getAssembler();
SmallVector<MCFixup, 4> Fixups;
@@ -600,7 +601,7 @@ void MCELFStreamer::EmitInstToData(const MCInst &Inst,
}
}
-void MCELFStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
+void MCELFStreamer::emitBundleAlignMode(unsigned AlignPow2) {
assert(AlignPow2 <= 30 && "Invalid bundle alignment");
MCAssembler &Assembler = getAssembler();
if (AlignPow2 > 0 && (Assembler.getBundleAlignSize() == 0 ||
@@ -610,7 +611,7 @@ void MCELFStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
report_fatal_error(".bundle_align_mode cannot be changed once set");
}
-void MCELFStreamer::EmitBundleLock(bool AlignToEnd) {
+void MCELFStreamer::emitBundleLock(bool AlignToEnd) {
MCSection &Sec = *getCurrentSectionOnly();
// Sanity checks
@@ -631,7 +632,7 @@ void MCELFStreamer::EmitBundleLock(bool AlignToEnd) {
: MCSection::BundleLocked);
}
-void MCELFStreamer::EmitBundleUnlock() {
+void MCELFStreamer::emitBundleUnlock() {
MCSection &Sec = *getCurrentSectionOnly();
// Sanity checks
@@ -665,32 +666,32 @@ void MCELFStreamer::EmitBundleUnlock() {
Sec.setBundleLockState(MCSection::NotBundleLocked);
}
-void MCELFStreamer::FinishImpl() {
+void MCELFStreamer::finishImpl() {
// Ensure the last section gets aligned if necessary.
MCSection *CurSection = getCurrentSectionOnly();
setSectionAlignmentForBundling(getAssembler(), CurSection);
finalizeCGProfile();
- EmitFrames(nullptr);
+ emitFrames(nullptr);
- this->MCObjectStreamer::FinishImpl();
+ this->MCObjectStreamer::finishImpl();
}
-void MCELFStreamer::EmitThumbFunc(MCSymbol *Func) {
+void MCELFStreamer::emitThumbFunc(MCSymbol *Func) {
llvm_unreachable("Generic ELF doesn't support this directive");
}
-void MCELFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+void MCELFStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
llvm_unreachable("ELF doesn't support this directive");
}
-void MCELFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void MCELFStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
llvm_unreachable("ELF doesn't support this directive");
}
-void MCELFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+void MCELFStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("ELF doesn't support this directive");
}
diff --git a/llvm/lib/MC/MCExpr.cpp b/llvm/lib/MC/MCExpr.cpp
index 7f25fd4e90a7..ecf63b10f73f 100644
--- a/llvm/lib/MC/MCExpr.cpp
+++ b/llvm/lib/MC/MCExpr.cpp
@@ -46,8 +46,25 @@ void MCExpr::print(raw_ostream &OS, const MCAsmInfo *MAI, bool InParens) const {
case MCExpr::Constant: {
auto Value = cast<MCConstantExpr>(*this).getValue();
auto PrintInHex = cast<MCConstantExpr>(*this).useHexFormat();
+ auto SizeInBytes = cast<MCConstantExpr>(*this).getSizeInBytes();
if (PrintInHex)
- OS << "0x" << Twine::utohexstr(Value);
+ switch (SizeInBytes) {
+ default:
+ OS << "0x" << Twine::utohexstr(Value);
+ break;
+ case 1:
+ OS << format("0x%02" PRIx64, Value);
+ break;
+ case 2:
+ OS << format("0x%04" PRIx64, Value);
+ break;
+ case 4:
+ OS << format("0x%08" PRIx64, Value);
+ break;
+ case 8:
+ OS << format("0x%016" PRIx64, Value);
+ break;
+ }
else
OS << Value;
return;
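The switch above keys the printed width off getSizeInBytes(), so a constant whose size is known to be N bytes is rendered with 2*N hex digits instead of the shortest form. A minimal standalone sketch of the resulting formatting, using plain C stdio rather than llvm::format and an arbitrary example value:

#include <cinttypes>
#include <cstdio>

int main() {
  uint64_t Value = 0x2A; // example value only
  printf("0x%02" PRIx64 "\n", Value);  // SizeInBytes == 1  -> 0x2a
  printf("0x%04" PRIx64 "\n", Value);  // SizeInBytes == 2  -> 0x002a
  printf("0x%08" PRIx64 "\n", Value);  // SizeInBytes == 4  -> 0x0000002a
  printf("0x%016" PRIx64 "\n", Value); // SizeInBytes == 8  -> 0x000000000000002a
  return 0;                            // any other size falls back to Twine::utohexstr
}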
@@ -167,17 +184,18 @@ const MCUnaryExpr *MCUnaryExpr::create(Opcode Opc, const MCExpr *Expr,
}
const MCConstantExpr *MCConstantExpr::create(int64_t Value, MCContext &Ctx,
- bool PrintInHex) {
- return new (Ctx) MCConstantExpr(Value, PrintInHex);
+ bool PrintInHex,
+ unsigned SizeInBytes) {
+ return new (Ctx) MCConstantExpr(Value, PrintInHex, SizeInBytes);
}
/* *** */
MCSymbolRefExpr::MCSymbolRefExpr(const MCSymbol *Symbol, VariantKind Kind,
const MCAsmInfo *MAI, SMLoc Loc)
- : MCExpr(MCExpr::SymbolRef, Loc), Kind(Kind),
- UseParensForSymbolVariant(MAI->useParensForSymbolVariant()),
- HasSubsectionsViaSymbols(MAI->hasSubsectionsViaSymbols()),
+ : MCExpr(MCExpr::SymbolRef, Loc,
+ encodeSubclassData(Kind, MAI->useParensForSymbolVariant(),
+ MAI->hasSubsectionsViaSymbols())),
Symbol(Symbol) {
assert(Symbol);
}
@@ -203,6 +221,7 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_GOT: return "GOT";
case VK_GOTOFF: return "GOTOFF";
case VK_GOTREL: return "GOTREL";
+ case VK_PCREL: return "PCREL";
case VK_GOTPCREL: return "GOTPCREL";
case VK_GOTTPOFF: return "GOTTPOFF";
case VK_INDNTPOFF: return "INDNTPOFF";
@@ -298,10 +317,12 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_PPC_GOT_TLSLD_LO: return "got@tlsld@l";
case VK_PPC_GOT_TLSLD_HI: return "got@tlsld@h";
case VK_PPC_GOT_TLSLD_HA: return "got@tlsld@ha";
+ case VK_PPC_GOT_PCREL:
+ return "got@pcrel";
case VK_PPC_TLSLD: return "tlsld";
case VK_PPC_LOCAL: return "local";
+ case VK_PPC_NOTOC: return "notoc";
case VK_COFF_IMGREL32: return "IMGREL";
- case VK_Hexagon_PCREL: return "PCREL";
case VK_Hexagon_LO16: return "LO16";
case VK_Hexagon_HI16: return "HI16";
case VK_Hexagon_GPREL: return "GPREL";
@@ -321,6 +342,20 @@ StringRef MCSymbolRefExpr::getVariantKindName(VariantKind Kind) {
case VK_AMDGPU_REL64: return "rel64";
case VK_AMDGPU_ABS32_LO: return "abs32@lo";
case VK_AMDGPU_ABS32_HI: return "abs32@hi";
+ case VK_VE_HI32: return "hi";
+ case VK_VE_LO32: return "lo";
+ case VK_VE_PC_HI32: return "pc_hi";
+ case VK_VE_PC_LO32: return "pc_lo";
+ case VK_VE_GOT_HI32: return "got_hi";
+ case VK_VE_GOT_LO32: return "got_lo";
+ case VK_VE_GOTOFF_HI32: return "gotoff_hi";
+ case VK_VE_GOTOFF_LO32: return "gotoff_lo";
+ case VK_VE_PLT_HI32: return "plt_hi";
+ case VK_VE_PLT_LO32: return "plt_lo";
+ case VK_VE_TLS_GD_HI32: return "tls_gd_hi";
+ case VK_VE_TLS_GD_LO32: return "tls_gd_lo";
+ case VK_VE_TPOFF_HI32: return "tpoff_hi";
+ case VK_VE_TPOFF_LO32: return "tpoff_lo";
}
llvm_unreachable("Invalid variant kind");
}
@@ -333,6 +368,7 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("got", VK_GOT)
.Case("gotoff", VK_GOTOFF)
.Case("gotrel", VK_GOTREL)
+ .Case("pcrel", VK_PCREL)
.Case("gotpcrel", VK_GOTPCREL)
.Case("gottpoff", VK_GOTTPOFF)
.Case("indntpoff", VK_INDNTPOFF)
@@ -413,13 +449,14 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("got@tlsld@l", VK_PPC_GOT_TLSLD_LO)
.Case("got@tlsld@h", VK_PPC_GOT_TLSLD_HI)
.Case("got@tlsld@ha", VK_PPC_GOT_TLSLD_HA)
+ .Case("got@pcrel", VK_PPC_GOT_PCREL)
+ .Case("notoc", VK_PPC_NOTOC)
.Case("gdgot", VK_Hexagon_GD_GOT)
.Case("gdplt", VK_Hexagon_GD_PLT)
.Case("iegot", VK_Hexagon_IE_GOT)
.Case("ie", VK_Hexagon_IE)
.Case("ldgot", VK_Hexagon_LD_GOT)
.Case("ldplt", VK_Hexagon_LD_PLT)
- .Case("pcrel", VK_Hexagon_PCREL)
.Case("none", VK_ARM_NONE)
.Case("got_prel", VK_ARM_GOT_PREL)
.Case("target1", VK_ARM_TARGET1)
@@ -440,11 +477,25 @@ MCSymbolRefExpr::getVariantKindForName(StringRef Name) {
.Case("rel64", VK_AMDGPU_REL64)
.Case("abs32@lo", VK_AMDGPU_ABS32_LO)
.Case("abs32@hi", VK_AMDGPU_ABS32_HI)
+ .Case("hi", VK_VE_HI32)
+ .Case("lo", VK_VE_LO32)
+ .Case("pc_hi", VK_VE_PC_HI32)
+ .Case("pc_lo", VK_VE_PC_LO32)
+ .Case("got_hi", VK_VE_GOT_HI32)
+ .Case("got_lo", VK_VE_GOT_LO32)
+ .Case("gotoff_hi", VK_VE_GOTOFF_HI32)
+ .Case("gotoff_lo", VK_VE_GOTOFF_LO32)
+ .Case("plt_hi", VK_VE_PLT_HI32)
+ .Case("plt_lo", VK_VE_PLT_LO32)
+ .Case("tls_gd_hi", VK_VE_TLS_GD_HI32)
+ .Case("tls_gd_lo", VK_VE_TLS_GD_LO32)
+ .Case("tpoff_hi", VK_VE_TPOFF_HI32)
+ .Case("tpoff_lo", VK_VE_TPOFF_LO32)
.Default(VK_Invalid);
}
void MCSymbolRefExpr::printVariantKind(raw_ostream &OS) const {
- if (UseParensForSymbolVariant)
+ if (useParensForSymbolVariant())
OS << '(' << MCSymbolRefExpr::getVariantKindName(getKind()) << ')';
else
OS << '@' << MCSymbolRefExpr::getVariantKindName(getKind());
@@ -524,8 +575,10 @@ static void AttemptToFoldSymbolOffsetDifference(
if (!Asm->getWriter().isSymbolRefDifferenceFullyResolved(*Asm, A, B, InSet))
return;
- if (SA.getFragment() == SB.getFragment() && !SA.isVariable() &&
- !SA.isUnset() && !SB.isVariable() && !SB.isUnset()) {
+ MCFragment *FA = SA.getFragment();
+ MCFragment *FB = SB.getFragment();
+ if (FA == FB && !SA.isVariable() && !SA.isUnset() && !SB.isVariable() &&
+ !SB.isUnset()) {
Addend += (SA.getOffset() - SB.getOffset());
// Pointers to Thumb symbols need to have their low-bit set to allow
@@ -547,12 +600,17 @@ static void AttemptToFoldSymbolOffsetDifference(
if (!Layout)
return;
- const MCSection &SecA = *SA.getFragment()->getParent();
- const MCSection &SecB = *SB.getFragment()->getParent();
+ const MCSection &SecA = *FA->getParent();
+ const MCSection &SecB = *FB->getParent();
if ((&SecA != &SecB) && !Addrs)
return;
+ // One of the symbols involved is part of a fragment being laid out. Quit now
+ // to avoid a self loop.
+ if (!Layout->canGetFragmentOffset(FA) || !Layout->canGetFragmentOffset(FB))
+ return;
+
// Eagerly evaluate.
Addend += Layout->getSymbolOffset(A->getSymbol()) -
Layout->getSymbolOffset(B->getSymbol());
diff --git a/llvm/lib/MC/MCFragment.cpp b/llvm/lib/MC/MCFragment.cpp
index a96b8e86aed3..8e90e07a4dbf 100644
--- a/llvm/lib/MC/MCFragment.cpp
+++ b/llvm/lib/MC/MCFragment.cpp
@@ -48,6 +48,25 @@ bool MCAsmLayout::isFragmentValid(const MCFragment *F) const {
return F->getLayoutOrder() <= LastValid->getLayoutOrder();
}
+bool MCAsmLayout::canGetFragmentOffset(const MCFragment *F) const {
+ MCSection *Sec = F->getParent();
+ MCSection::iterator I;
+ if (MCFragment *LastValid = LastValidFragment[Sec]) {
+ // Fragment already valid, offset is available.
+ if (F->getLayoutOrder() <= LastValid->getLayoutOrder())
+ return true;
+ I = ++MCSection::iterator(LastValid);
+ } else
+ I = Sec->begin();
+
+ // A fragment ordered before F is currently being laid out.
+ const MCFragment *FirstInvalidFragment = &*I;
+ if (FirstInvalidFragment->IsBeingLaidOut)
+ return false;
+
+ return true;
+}
+
void MCAsmLayout::invalidateFragmentsFrom(MCFragment *F) {
// If this fragment wasn't already valid, we don't need to do anything.
if (!isFragmentValid(F))
@@ -235,7 +254,7 @@ void ilist_alloc_traits<MCFragment>::deleteNode(MCFragment *V) { V->destroy(); }
MCFragment::MCFragment(FragmentType Kind, bool HasInstructions,
MCSection *Parent)
: Parent(Parent), Atom(nullptr), Offset(~UINT64_C(0)), LayoutOrder(0),
- Kind(Kind), HasInstructions(HasInstructions) {
+ Kind(Kind), IsBeingLaidOut(false), HasInstructions(HasInstructions) {
if (Parent && !isa<MCDummyFragment>(*this))
Parent->getFragmentList().push_back(this);
}
@@ -394,6 +413,7 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
OS << "\n ";
OS << " Inst:";
F->getInst().dump_pretty(OS);
+ OS << " (" << F->getContents().size() << " bytes)";
break;
}
case MCFragment::FT_Org: {
@@ -424,14 +444,9 @@ LLVM_DUMP_METHOD void MCFragment::dump() const {
}
case MCFragment::FT_BoundaryAlign: {
const auto *BF = cast<MCBoundaryAlignFragment>(this);
- if (BF->canEmitNops())
- OS << " (can emit nops to align";
- if (BF->isFused())
- OS << " fused branch)";
- else
- OS << " unfused branch)";
OS << "\n ";
OS << " BoundarySize:" << BF->getAlignment().value()
+ << " LastFragment:" << BF->getLastFragment()
<< " Size:" << BF->getSize();
break;
}
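canGetFragmentOffset() lets the symbol-difference folding in MCExpr.cpp give up when the queried fragment lies past the fragment currently being laid out, since asking for its offset would recurse back into layout. A toy model of that check, with a plain vector standing in for a section's fragment list (illustration only, not LLVM code):

#include <iostream>
#include <vector>

struct Fragment {
  unsigned LayoutOrder;
  bool IsBeingLaidOut;
};

// Offsets are known up to and including index LastValid (-1 means none yet).
// A query past that point is only safe if the first not-yet-valid fragment is
// not the one currently being laid out.
bool canGetFragmentOffset(const std::vector<Fragment> &Frags, int LastValid,
                          unsigned QueryOrder) {
  if (LastValid >= 0 && QueryOrder <= Frags[LastValid].LayoutOrder)
    return true;                               // already laid out
  return !Frags[LastValid + 1].IsBeingLaidOut; // would self-loop otherwise
}

int main() {
  std::vector<Fragment> Frags = {{0, false}, {1, true}, {2, false}};
  std::cout << canGetFragmentOffset(Frags, 0, 0) << "\n"; // 1: offset available
  std::cout << canGetFragmentOffset(Frags, 0, 2) << "\n"; // 0: fragment 1 is mid-layout
  return 0;
}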
diff --git a/llvm/lib/MC/MCInstPrinter.cpp b/llvm/lib/MC/MCInstPrinter.cpp
index 8bf699279ada..7ce92b968f47 100644
--- a/llvm/lib/MC/MCInstPrinter.cpp
+++ b/llvm/lib/MC/MCInstPrinter.cpp
@@ -62,12 +62,29 @@ void MCInstPrinter::printAnnotation(raw_ostream &OS, StringRef Annot) {
static bool matchAliasCondition(const MCInst &MI, const MCSubtargetInfo *STI,
const MCRegisterInfo &MRI, unsigned &OpIdx,
const AliasMatchingData &M,
- const AliasPatternCond &C) {
+ const AliasPatternCond &C,
+ bool &OrPredicateResult) {
// Feature tests are special, they don't consume operands.
if (C.Kind == AliasPatternCond::K_Feature)
return STI->getFeatureBits().test(C.Value);
if (C.Kind == AliasPatternCond::K_NegFeature)
return !STI->getFeatureBits().test(C.Value);
+ // For feature tests where just one feature is required in a list, set the
+ // predicate result bit to whether the expression will return true, and only
+ // return the real result at the end-of-list marker.
+ if (C.Kind == AliasPatternCond::K_OrFeature) {
+ OrPredicateResult |= STI->getFeatureBits().test(C.Value);
+ return true;
+ }
+ if (C.Kind == AliasPatternCond::K_OrNegFeature) {
+ OrPredicateResult |= !(STI->getFeatureBits().test(C.Value));
+ return true;
+ }
+ if (C.Kind == AliasPatternCond::K_EndOrFeatures) {
+ bool Res = OrPredicateResult;
+ OrPredicateResult = false;
+ return Res;
+ }
// Get and consume an operand.
const MCOperand &Opnd = MI.getOperand(OpIdx);
@@ -95,6 +112,9 @@ static bool matchAliasCondition(const MCInst &MI, const MCSubtargetInfo *STI,
return true;
case AliasPatternCond::K_Feature:
case AliasPatternCond::K_NegFeature:
+ case AliasPatternCond::K_OrFeature:
+ case AliasPatternCond::K_OrNegFeature:
+ case AliasPatternCond::K_EndOrFeatures:
llvm_unreachable("handled earlier");
}
llvm_unreachable("invalid kind");
@@ -125,8 +145,10 @@ const char *MCInstPrinter::matchAliasPatterns(const MCInst *MI,
ArrayRef<AliasPatternCond> Conds =
M.PatternConds.slice(P.AliasCondStart, P.NumConds);
unsigned OpIdx = 0;
+ bool OrPredicateResult = false;
if (llvm::all_of(Conds, [&](const AliasPatternCond &C) {
- return matchAliasCondition(*MI, STI, MRI, OpIdx, M, C);
+ return matchAliasCondition(*MI, STI, MRI, OpIdx, M, C,
+ OrPredicateResult);
})) {
// If all conditions matched, use this asm string.
AsmStrOffset = P.AsmStrOffset;
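The K_OrFeature / K_OrNegFeature conditions always report a match while OR-ing their feature test into OrPredicateResult; the trailing K_EndOrFeatures marker is what actually decides. That is how a single all_of() walk over the condition list can express "any one of these features". A standalone illustration of the protocol with simplified types (not the LLVM data structures):

#include <algorithm>
#include <iostream>
#include <vector>

enum Kind { OrFeature, EndOrFeatures };
struct Cond {
  Kind K;
  bool FeatureEnabled; // stand-in for STI->getFeatureBits().test(C.Value)
};

bool matches(const Cond &C, bool &OrResult) {
  if (C.K == OrFeature) {
    OrResult |= C.FeatureEnabled; // accumulate, never fail here
    return true;
  }
  bool Res = OrResult;            // EndOrFeatures: report the accumulated OR
  OrResult = false;
  return Res;
}

int main() {
  // Equivalent to requiring (featureA || featureB); only featureB is enabled.
  std::vector<Cond> Conds = {
      {OrFeature, false}, {OrFeature, true}, {EndOrFeatures, false}};
  bool OrResult = false;
  bool Matched = std::all_of(Conds.begin(), Conds.end(),
                             [&](const Cond &C) { return matches(C, OrResult); });
  std::cout << (Matched ? "alias applies\n" : "alias rejected\n"); // alias applies
  return 0;
}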
diff --git a/llvm/lib/MC/MCInstrAnalysis.cpp b/llvm/lib/MC/MCInstrAnalysis.cpp
index 54741fdd686d..a7dc0626d0ab 100644
--- a/llvm/lib/MC/MCInstrAnalysis.cpp
+++ b/llvm/lib/MC/MCInstrAnalysis.cpp
@@ -23,15 +23,10 @@ bool MCInstrAnalysis::clearsSuperRegisters(const MCRegisterInfo &MRI,
return false;
}
-bool MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
- uint64_t Size, uint64_t &Target) const {
- if (Inst.getNumOperands() == 0 ||
- Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
- return false;
-
- int64_t Imm = Inst.getOperand(0).getImm();
- Target = Addr+Size+Imm;
- return true;
+bool MCInstrAnalysis::evaluateBranch(const MCInst & /*Inst*/, uint64_t /*Addr*/,
+ uint64_t /*Size*/,
+ uint64_t & /*Target*/) const {
+ return false;
}
Optional<uint64_t>
diff --git a/llvm/lib/MC/MCInstrDesc.cpp b/llvm/lib/MC/MCInstrDesc.cpp
index d54aeba89edc..b5c43f5edc0d 100644
--- a/llvm/lib/MC/MCInstrDesc.cpp
+++ b/llvm/lib/MC/MCInstrDesc.cpp
@@ -18,17 +18,6 @@
using namespace llvm;
-bool MCInstrDesc::getDeprecatedInfo(MCInst &MI, const MCSubtargetInfo &STI,
- std::string &Info) const {
- if (ComplexDeprecationInfo)
- return ComplexDeprecationInfo(MI, STI, Info);
- if (DeprecatedFeature != -1 && STI.getFeatureBits()[DeprecatedFeature]) {
- // FIXME: it would be nice to include the subtarget feature here.
- Info = "deprecated";
- return true;
- }
- return false;
-}
bool MCInstrDesc::mayAffectControlFlow(const MCInst &MI,
const MCRegisterInfo &RI) const {
if (isBranch() || isCall() || isReturn() || isIndirectBranch())
diff --git a/llvm/lib/MC/MCInstrInfo.cpp b/llvm/lib/MC/MCInstrInfo.cpp
new file mode 100644
index 000000000000..ab63db040532
--- /dev/null
+++ b/llvm/lib/MC/MCInstrInfo.cpp
@@ -0,0 +1,27 @@
+//===- lib/MC/MCInstrInfo.cpp - Target Instruction Info -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+
+using namespace llvm;
+
+bool MCInstrInfo::getDeprecatedInfo(MCInst &MI, const MCSubtargetInfo &STI,
+ std::string &Info) const {
+ unsigned Opcode = MI.getOpcode();
+ if (ComplexDeprecationInfos && ComplexDeprecationInfos[Opcode])
+ return ComplexDeprecationInfos[Opcode](MI, STI, Info);
+ if (DeprecatedFeatures && DeprecatedFeatures[Opcode] != uint8_t(-1U) &&
+ STI.getFeatureBits()[DeprecatedFeatures[Opcode]]) {
+ // FIXME: it would be nice to include the subtarget feature here.
+ Info = "deprecated";
+ return true;
+ }
+ return false;
+}
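getDeprecatedInfo() now consults two per-opcode tables owned by MCInstrInfo: an optional complex-check callback first, then a deprecated-feature index where uint8_t(-1) means "no entry". A simplified, self-contained sketch of that lookup order; the table contents, the featureEnabled() helper, and the callback signature here are hypothetical stand-ins for the TableGen-generated data:

#include <cstdint>
#include <iostream>
#include <string>

using ComplexFn = bool (*)(unsigned Opcode, std::string &Info);

bool complexCheckForOpcode2(unsigned, std::string &Info) {
  Info = "deprecated on this subtarget"; // custom predicate wins over the table
  return true;
}

// Index = opcode. 0xFF (uint8_t(-1)) marks "no deprecated feature".
const uint8_t DeprecatedFeatures[] = {0xFF, 3, 0xFF};
const ComplexFn ComplexDeprecationInfos[] = {nullptr, nullptr, complexCheckForOpcode2};

bool featureEnabled(uint8_t Feature) { return Feature == 3; } // stand-in for STI

bool getDeprecatedInfo(unsigned Opcode, std::string &Info) {
  if (ComplexDeprecationInfos[Opcode])
    return ComplexDeprecationInfos[Opcode](Opcode, Info);
  if (DeprecatedFeatures[Opcode] != uint8_t(-1) &&
      featureEnabled(DeprecatedFeatures[Opcode])) {
    Info = "deprecated";
    return true;
  }
  return false;
}

int main() {
  std::string Info;
  for (unsigned Op = 0; Op != 3; ++Op)
    std::cout << "opcode " << Op << ": "
              << (getDeprecatedInfo(Op, Info) ? Info : "not deprecated") << "\n";
  return 0;
}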
diff --git a/llvm/lib/MC/MCMachOStreamer.cpp b/llvm/lib/MC/MCMachOStreamer.cpp
index 8e558a36b7a1..2b1d1b28ea18 100644
--- a/llvm/lib/MC/MCMachOStreamer.cpp
+++ b/llvm/lib/MC/MCMachOStreamer.cpp
@@ -56,10 +56,10 @@ private:
/// labels in the middle of the section.
DenseMap<const MCSection*, bool> HasSectionLabel;
- void EmitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void emitInstToData(const MCInst &Inst, const MCSubtargetInfo &STI) override;
- void EmitDataRegion(DataRegionData::KindTy Kind);
- void EmitDataRegionEnd();
+ void emitDataRegion(DataRegionData::KindTy Kind);
+ void emitDataRegionEnd();
public:
MCMachOStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
@@ -81,40 +81,40 @@ public:
/// @name MCStreamer Interface
/// @{
- void ChangeSection(MCSection *Sect, const MCExpr *Subsect) override;
- void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
- void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
- void EmitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override;
- void EmitLinkerOptions(ArrayRef<std::string> Options) override;
- void EmitDataRegion(MCDataRegionType Kind) override;
- void EmitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor,
+ void changeSection(MCSection *Sect, const MCExpr *Subsect) override;
+ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+ void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+ void emitEHSymAttributes(const MCSymbol *Symbol, MCSymbol *EHSymbol) override;
+ void emitAssemblerFlag(MCAssemblerFlag Flag) override;
+ void emitLinkerOptions(ArrayRef<std::string> Options) override;
+ void emitDataRegion(MCDataRegionType Kind) override;
+ void emitVersionMin(MCVersionMinType Kind, unsigned Major, unsigned Minor,
unsigned Update, VersionTuple SDKVersion) override;
- void EmitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor,
+ void emitBuildVersion(unsigned Platform, unsigned Major, unsigned Minor,
unsigned Update, VersionTuple SDKVersion) override;
- void EmitThumbFunc(MCSymbol *Func) override;
- bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
- void EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
- void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ void emitThumbFunc(MCSymbol *Func) override;
+ bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
+ void emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) override;
+ void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
- void EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ void emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
- void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+ void emitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
uint64_t Size = 0, unsigned ByteAlignment = 0,
SMLoc Loc = SMLoc()) override;
- void EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+ void emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment = 0) override;
- void EmitIdent(StringRef IdentString) override {
+ void emitIdent(StringRef IdentString) override {
llvm_unreachable("macho doesn't support this directive");
}
- void EmitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override {
+ void emitLOHDirective(MCLOHType Kind, const MCLOHArgs &Args) override {
getAssembler().getLOHContainer().addDirective(Kind, Args);
}
- void FinishImpl() override;
+ void finishImpl() override;
};
} // end anonymous namespace.
@@ -123,7 +123,7 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
// These sections are created by the assembler itself after the end of
// the .s file.
StringRef SegName = MSec.getSegmentName();
- StringRef SecName = MSec.getSectionName();
+ StringRef SecName = MSec.getName();
if (SegName == "__LD" && SecName == "__compact_unwind")
return true;
@@ -146,7 +146,7 @@ static bool canGoAfterDWARF(const MCSectionMachO &MSec) {
return false;
}
-void MCMachOStreamer::ChangeSection(MCSection *Section,
+void MCMachOStreamer::changeSection(MCSection *Section,
const MCExpr *Subsection) {
// Change the section normally.
bool Created = changeSectionImpl(Section, Subsection);
@@ -155,7 +155,9 @@ void MCMachOStreamer::ChangeSection(MCSection *Section,
if (SegName == "__DWARF")
CreatedADWARFSection = true;
else if (Created && DWARFMustBeAtTheEnd && !canGoAfterDWARF(MSec))
- assert(!CreatedADWARFSection && "Creating regular section after DWARF");
+ assert((!CreatedADWARFSection ||
+ Section == getContext().getObjectFileInfo()->getStackMapSection())
+ && "Creating regular section after DWARF");
// Output a linker-local symbol so we don't need section-relative local
// relocations. The linker hates us when we do that.
@@ -167,24 +169,24 @@ void MCMachOStreamer::ChangeSection(MCSection *Section,
}
}
-void MCMachOStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+void MCMachOStreamer::emitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol) {
getAssembler().registerSymbol(*Symbol);
if (Symbol->isExternal())
- EmitSymbolAttribute(EHSymbol, MCSA_Global);
+ emitSymbolAttribute(EHSymbol, MCSA_Global);
if (cast<MCSymbolMachO>(Symbol)->isWeakDefinition())
- EmitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
+ emitSymbolAttribute(EHSymbol, MCSA_WeakDefinition);
if (Symbol->isPrivateExtern())
- EmitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
+ emitSymbolAttribute(EHSymbol, MCSA_PrivateExtern);
}
-void MCMachOStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+void MCMachOStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
// We have to create a new fragment if this is an atom defining symbol,
// fragments cannot span atoms.
if (getAssembler().isSymbolLinkerVisible(*Symbol))
insert(new MCDataFragment());
- MCObjectStreamer::EmitLabel(Symbol, Loc);
+ MCObjectStreamer::emitLabel(Symbol, Loc);
// This causes the reference type flag to be cleared. Darwin 'as' was "trying"
// to clear the weak reference and weak definition bits too, but the
@@ -196,7 +198,7 @@ void MCMachOStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
cast<MCSymbolMachO>(Symbol)->clearReferenceType();
}
-void MCMachOStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+void MCMachOStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
MCValue Res;
if (Value->evaluateAsRelocatable(Res, nullptr, nullptr)) {
@@ -206,30 +208,30 @@ void MCMachOStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
cast<MCSymbolMachO>(Symbol)->setAltEntry();
}
}
- MCObjectStreamer::EmitAssignment(Symbol, Value);
+ MCObjectStreamer::emitAssignment(Symbol, Value);
}
-void MCMachOStreamer::EmitDataRegion(DataRegionData::KindTy Kind) {
+void MCMachOStreamer::emitDataRegion(DataRegionData::KindTy Kind) {
// Create a temporary label to mark the start of the data region.
MCSymbol *Start = getContext().createTempSymbol();
- EmitLabel(Start);
+ emitLabel(Start);
// Record the region for the object writer to use.
DataRegionData Data = { Kind, Start, nullptr };
std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
Regions.push_back(Data);
}
-void MCMachOStreamer::EmitDataRegionEnd() {
+void MCMachOStreamer::emitDataRegionEnd() {
std::vector<DataRegionData> &Regions = getAssembler().getDataRegions();
assert(!Regions.empty() && "Mismatched .end_data_region!");
DataRegionData &Data = Regions.back();
assert(!Data.End && "Mismatched .end_data_region!");
// Create a temporary label to mark the end of the data region.
Data.End = getContext().createTempSymbol();
- EmitLabel(Data.End);
+ emitLabel(Data.End);
}
-void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+void MCMachOStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
// Do any generic stuff we need to do.
@@ -244,51 +246,51 @@ void MCMachOStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
}
}
-void MCMachOStreamer::EmitLinkerOptions(ArrayRef<std::string> Options) {
+void MCMachOStreamer::emitLinkerOptions(ArrayRef<std::string> Options) {
getAssembler().getLinkerOptions().push_back(Options);
}
-void MCMachOStreamer::EmitDataRegion(MCDataRegionType Kind) {
+void MCMachOStreamer::emitDataRegion(MCDataRegionType Kind) {
switch (Kind) {
case MCDR_DataRegion:
- EmitDataRegion(DataRegionData::Data);
+ emitDataRegion(DataRegionData::Data);
return;
case MCDR_DataRegionJT8:
- EmitDataRegion(DataRegionData::JumpTable8);
+ emitDataRegion(DataRegionData::JumpTable8);
return;
case MCDR_DataRegionJT16:
- EmitDataRegion(DataRegionData::JumpTable16);
+ emitDataRegion(DataRegionData::JumpTable16);
return;
case MCDR_DataRegionJT32:
- EmitDataRegion(DataRegionData::JumpTable32);
+ emitDataRegion(DataRegionData::JumpTable32);
return;
case MCDR_DataRegionEnd:
- EmitDataRegionEnd();
+ emitDataRegionEnd();
return;
}
}
-void MCMachOStreamer::EmitVersionMin(MCVersionMinType Kind, unsigned Major,
+void MCMachOStreamer::emitVersionMin(MCVersionMinType Kind, unsigned Major,
unsigned Minor, unsigned Update,
VersionTuple SDKVersion) {
getAssembler().setVersionMin(Kind, Major, Minor, Update, SDKVersion);
}
-void MCMachOStreamer::EmitBuildVersion(unsigned Platform, unsigned Major,
+void MCMachOStreamer::emitBuildVersion(unsigned Platform, unsigned Major,
unsigned Minor, unsigned Update,
VersionTuple SDKVersion) {
getAssembler().setBuildVersion((MachO::PlatformType)Platform, Major, Minor,
Update, SDKVersion);
}
-void MCMachOStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+void MCMachOStreamer::emitThumbFunc(MCSymbol *Symbol) {
// Remember that the function is a thumb function. Fixup and relocation
// values will need to be adjusted.
getAssembler().setIsThumbFunc(Symbol);
cast<MCSymbolMachO>(Symbol)->setThumbFunc();
}
-bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Sym,
+bool MCMachOStreamer::emitSymbolAttribute(MCSymbol *Sym,
MCSymbolAttr Attribute) {
MCSymbolMachO *Symbol = cast<MCSymbolMachO>(Sym);
@@ -324,6 +326,7 @@ bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Sym,
case MCSA_ELF_TypeCommon:
case MCSA_ELF_TypeNoType:
case MCSA_ELF_TypeGnuUniqueObject:
+ case MCSA_Extern:
case MCSA_Hidden:
case MCSA_IndirectSymbol:
case MCSA_Internal:
@@ -396,13 +399,13 @@ bool MCMachOStreamer::EmitSymbolAttribute(MCSymbol *Sym,
return true;
}
-void MCMachOStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+void MCMachOStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
// Encode the 'desc' value into the lowest implementation defined bits.
getAssembler().registerSymbol(*Symbol);
cast<MCSymbolMachO>(Symbol)->setDesc(DescValue);
}
-void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCMachOStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
// FIXME: Darwin 'as' does appear to allow redef of a .comm by itself.
assert(Symbol->isUndefined() && "Cannot define a symbol twice!");
@@ -412,14 +415,14 @@ void MCMachOStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
Symbol->setCommon(Size, ByteAlignment);
}
-void MCMachOStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCMachOStreamer::emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
// '.lcomm' is equivalent to '.zerofill'.
- return EmitZerofill(getContext().getObjectFileInfo()->getDataBSSSection(),
+ return emitZerofill(getContext().getObjectFileInfo()->getDataBSSSection(),
Symbol, Size, ByteAlignment);
}
-void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void MCMachOStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
// On darwin all virtual sections have zerofill type. Disallow the usage of
@@ -438,21 +441,21 @@ void MCMachOStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
// The symbol may not be present, which only creates the section.
if (Symbol) {
- EmitValueToAlignment(ByteAlignment, 0, 1, 0);
- EmitLabel(Symbol);
- EmitZeros(Size);
+ emitValueToAlignment(ByteAlignment, 0, 1, 0);
+ emitLabel(Symbol);
+ emitZeros(Size);
}
PopSection();
}
// This should always be called with the thread local bss section. Like the
// .zerofill directive this doesn't actually switch sections on us.
-void MCMachOStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+void MCMachOStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
- EmitZerofill(Section, Symbol, Size, ByteAlignment);
+ emitZerofill(Section, Symbol, Size, ByteAlignment);
}
-void MCMachOStreamer::EmitInstToData(const MCInst &Inst,
+void MCMachOStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCDataFragment *DF = getOrCreateDataFragment();
@@ -470,8 +473,8 @@ void MCMachOStreamer::EmitInstToData(const MCInst &Inst,
DF->getContents().append(Code.begin(), Code.end());
}
-void MCMachOStreamer::FinishImpl() {
- EmitFrames(&getAssembler().getBackend());
+void MCMachOStreamer::finishImpl() {
+ emitFrames(&getAssembler().getBackend());
// We have to set the fragment atom associations so we can relax properly for
// Mach-O.
@@ -500,7 +503,7 @@ void MCMachOStreamer::FinishImpl() {
}
}
- this->MCObjectStreamer::FinishImpl();
+ this->MCObjectStreamer::finishImpl();
}
MCStreamer *llvm::createMachOStreamer(MCContext &Context,
@@ -513,7 +516,7 @@ MCStreamer *llvm::createMachOStreamer(MCContext &Context,
new MCMachOStreamer(Context, std::move(MAB), std::move(OW), std::move(CE),
DWARFMustBeAtTheEnd, LabelSections);
const Triple &Target = Context.getObjectFileInfo()->getTargetTriple();
- S->EmitVersionForTarget(Target, Context.getObjectFileInfo()->getSDKVersion());
+ S->emitVersionForTarget(Target, Context.getObjectFileInfo()->getSDKVersion());
if (RelaxAll)
S->getAssembler().setRelaxAll(true);
return S;
diff --git a/llvm/lib/MC/MCNullStreamer.cpp b/llvm/lib/MC/MCNullStreamer.cpp
index 8452317c8c6b..291d840b4f4b 100644
--- a/llvm/lib/MC/MCNullStreamer.cpp
+++ b/llvm/lib/MC/MCNullStreamer.cpp
@@ -23,19 +23,19 @@ namespace {
/// @{
bool hasRawTextSupport() const override { return true; }
- void EmitRawTextImpl(StringRef String) override {}
+ void emitRawTextImpl(StringRef String) override {}
- bool EmitSymbolAttribute(MCSymbol *Symbol,
+ bool emitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) override {
return true;
}
- void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override {}
- void EmitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
+ void emitZerofill(MCSection *Section, MCSymbol *Symbol = nullptr,
uint64_t Size = 0, unsigned ByteAlignment = 0,
SMLoc Loc = SMLoc()) override {}
- void EmitGPRel32Value(const MCExpr *Value) override {}
+ void emitGPRel32Value(const MCExpr *Value) override {}
void BeginCOFFSymbolDef(const MCSymbol *Symbol) override {}
void EmitCOFFSymbolStorageClass(int StorageClass) override {}
void EmitCOFFSymbolType(int Type) override {}
diff --git a/llvm/lib/MC/MCObjectFileInfo.cpp b/llvm/lib/MC/MCObjectFileInfo.cpp
index d567cc14a830..b77a9635f64c 100644
--- a/llvm/lib/MC/MCObjectFileInfo.cpp
+++ b/llvm/lib/MC/MCObjectFileInfo.cpp
@@ -276,6 +276,9 @@ void MCObjectFileInfo::initMachOMCObjectFileInfo(const Triple &T) {
DwarfMacinfoSection =
Ctx->getMachOSection("__DWARF", "__debug_macinfo", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata(), "debug_macinfo");
+ DwarfMacroSection =
+ Ctx->getMachOSection("__DWARF", "__debug_macro", MachO::S_ATTR_DEBUG,
+ SectionKind::getMetadata(), "debug_macro");
DwarfDebugInlineSection =
Ctx->getMachOSection("__DWARF", "__debug_inlined", MachO::S_ATTR_DEBUG,
SectionKind::getMetadata());
@@ -427,6 +430,7 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
Ctx->getELFSection(".debug_ranges", DebugSecType, 0);
DwarfMacinfoSection =
Ctx->getELFSection(".debug_macinfo", DebugSecType, 0);
+ DwarfMacroSection = Ctx->getELFSection(".debug_macro", DebugSecType, 0);
// DWARF5 Experimental Debug Info
@@ -469,6 +473,8 @@ void MCObjectFileInfo::initELFMCObjectFileInfo(const Triple &T, bool Large) {
Ctx->getELFSection(".debug_rnglists.dwo", DebugSecType, ELF::SHF_EXCLUDE);
DwarfMacinfoDWOSection =
Ctx->getELFSection(".debug_macinfo.dwo", DebugSecType, ELF::SHF_EXCLUDE);
+ DwarfMacroDWOSection =
+ Ctx->getELFSection(".debug_macro.dwo", DebugSecType, ELF::SHF_EXCLUDE);
DwarfLoclistsDWOSection =
Ctx->getELFSection(".debug_loclists.dwo", DebugSecType, ELF::SHF_EXCLUDE);
@@ -610,6 +616,11 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata(), "section_debug_loc");
+ DwarfLoclistsSection = Ctx->getCOFFSection(
+ ".debug_loclists",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "section_debug_loclists");
DwarfARangesSection = Ctx->getCOFFSection(
".debug_aranges",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -620,16 +631,31 @@ void MCObjectFileInfo::initCOFFMCObjectFileInfo(const Triple &T) {
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata(), "debug_range");
+ DwarfRnglistsSection = Ctx->getCOFFSection(
+ ".debug_rnglists",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "debug_rnglists");
DwarfMacinfoSection = Ctx->getCOFFSection(
".debug_macinfo",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata(), "debug_macinfo");
+ DwarfMacroSection = Ctx->getCOFFSection(
+ ".debug_macro",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "debug_macro");
DwarfMacinfoDWOSection = Ctx->getCOFFSection(
".debug_macinfo.dwo",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
COFF::IMAGE_SCN_MEM_READ,
SectionKind::getMetadata(), "debug_macinfo.dwo");
+ DwarfMacroDWOSection = Ctx->getCOFFSection(
+ ".debug_macro.dwo",
+ COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getMetadata(), "debug_macro.dwo");
DwarfInfoDWOSection = Ctx->getCOFFSection(
".debug_info.dwo",
COFF::IMAGE_SCN_MEM_DISCARDABLE | COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
@@ -761,7 +787,8 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
Ctx->getWasmSection(".debug_ranges", SectionKind::getMetadata());
DwarfMacinfoSection =
Ctx->getWasmSection(".debug_macinfo", SectionKind::getMetadata());
- DwarfAddrSection = Ctx->getWasmSection(".debug_addr", SectionKind::getMetadata());
+ DwarfMacroSection =
+ Ctx->getWasmSection(".debug_macro", SectionKind::getMetadata());
DwarfCUIndexSection = Ctx->getWasmSection(".debug_cu_index", SectionKind::getMetadata());
DwarfTUIndexSection = Ctx->getWasmSection(".debug_tu_index", SectionKind::getMetadata());
DwarfInfoSection =
@@ -770,6 +797,17 @@ void MCObjectFileInfo::initWasmMCObjectFileInfo(const Triple &T) {
DwarfPubNamesSection = Ctx->getWasmSection(".debug_pubnames", SectionKind::getMetadata());
DwarfPubTypesSection = Ctx->getWasmSection(".debug_pubtypes", SectionKind::getMetadata());
+ DwarfDebugNamesSection =
+ Ctx->getWasmSection(".debug_names", SectionKind::getMetadata());
+ DwarfStrOffSection =
+ Ctx->getWasmSection(".debug_str_offsets", SectionKind::getMetadata());
+ DwarfAddrSection =
+ Ctx->getWasmSection(".debug_addr", SectionKind::getMetadata());
+ DwarfRnglistsSection =
+ Ctx->getWasmSection(".debug_rnglists", SectionKind::getMetadata());
+ DwarfLoclistsSection =
+ Ctx->getWasmSection(".debug_loclists", SectionKind::getMetadata());
+
// Wasm uses the data section for LSDA.
// TODO Consider putting each function's exception table in a separate
// section, as in -function-sections, to facilitate lld's --gc-sections.
@@ -795,6 +833,29 @@ void MCObjectFileInfo::initXCOFFMCObjectFileInfo(const Triple &T) {
ReadOnlySection = Ctx->getXCOFFSection(
".rodata", XCOFF::StorageMappingClass::XMC_RO, XCOFF::XTY_SD,
XCOFF::C_HIDEXT, SectionKind::getReadOnly());
+
+ TOCBaseSection = Ctx->getXCOFFSection(
+ "TOC", XCOFF::StorageMappingClass::XMC_TC0, XCOFF::XTY_SD,
+ XCOFF::C_HIDEXT, SectionKind::getData());
+
+ // The TOC-base always has 0 size, but 4 byte alignment.
+ TOCBaseSection->setAlignment(Align(4));
+
+ // DWARF sections for XCOFF are not csects. They are special STYP_DWARF
+ // sections, and the individual DWARF sections are distinguished by their
+ // section subtype.
+ // TODO: Populate the DWARF sections appropriately.
+ DwarfAbbrevSection = nullptr; // SSUBTYP_DWABREV
+ DwarfInfoSection = nullptr; // SSUBTYP_DWINFO
+ DwarfLineSection = nullptr; // SSUBTYP_DWLINE
+ DwarfFrameSection = nullptr; // SSUBTYP_DWFRAME
+ DwarfPubNamesSection = nullptr; // SSUBTYP_DWPBNMS
+ DwarfPubTypesSection = nullptr; // SSUBTYP_DWPBTYP
+ DwarfStrSection = nullptr; // SSUBTYP_DWSTR
+ DwarfLocSection = nullptr; // SSUBTYP_DWLOC
+ DwarfARangesSection = nullptr; // SSUBTYP_DWARNGE
+ DwarfRangesSection = nullptr; // SSUBTYP_DWRNGES
+ DwarfMacinfoSection = nullptr; // SSUBTYP_DWMAC
}
void MCObjectFileInfo::InitMCObjectFileInfo(const Triple &TheTriple, bool PIC,
@@ -884,10 +945,7 @@ MCObjectFileInfo::getStackSizesSection(const MCSection &TextSec) const {
Flags |= ELF::SHF_GROUP;
}
- const MCSymbol *Link = TextSec.getBeginSymbol();
- auto It = StackSizesUniquing.insert({Link, StackSizesUniquing.size()});
- unsigned UniqueID = It.first->second;
-
return Ctx->getELFSection(".stack_sizes", ELF::SHT_PROGBITS, Flags, 0,
- GroupName, UniqueID, cast<MCSymbolELF>(Link));
+ GroupName, MCSection::NonUniqueID,
+ cast<MCSymbolELF>(TextSec.getBeginSymbol()));
}
diff --git a/llvm/lib/MC/MCObjectStreamer.cpp b/llvm/lib/MC/MCObjectStreamer.cpp
index 3d1358df475f..e39c4a03bc1e 100644
--- a/llvm/lib/MC/MCObjectStreamer.cpp
+++ b/llvm/lib/MC/MCObjectStreamer.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSection.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SourceMgr.h"
using namespace llvm;
@@ -59,12 +60,8 @@ void MCObjectStreamer::addPendingLabel(MCSymbol* S) {
CurSection->addPendingLabel(S, CurSubsectionIdx);
// Add this Section to the list of PendingLabelSections.
- auto SecIt = std::find(PendingLabelSections.begin(),
- PendingLabelSections.end(), CurSection);
- if (SecIt == PendingLabelSections.end())
- PendingLabelSections.push_back(CurSection);
- }
- else
+ PendingLabelSections.insert(CurSection);
+ } else
// There is no Section / Subsection for this label yet.
PendingLabels.push_back(S);
}
@@ -145,7 +142,7 @@ void MCObjectStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi,
const MCSymbol *Lo,
unsigned Size) {
if (Optional<uint64_t> Diff = absoluteSymbolDiff(getAssembler(), Hi, Lo)) {
- EmitIntValue(*Diff, Size);
+ emitIntValue(*Diff, Size);
return;
}
MCStreamer::emitAbsoluteSymbolDiff(Hi, Lo, Size);
@@ -154,7 +151,7 @@ void MCObjectStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi,
void MCObjectStreamer::emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
const MCSymbol *Lo) {
if (Optional<uint64_t> Diff = absoluteSymbolDiff(getAssembler(), Hi, Lo)) {
- EmitULEB128IntValue(*Diff);
+ emitULEB128IntValue(*Diff);
return;
}
MCStreamer::emitAbsoluteSymbolDiffAsULEB128(Hi, Lo);
@@ -171,7 +168,7 @@ void MCObjectStreamer::reset() {
MCStreamer::reset();
}
-void MCObjectStreamer::EmitFrames(MCAsmBackend *MAB) {
+void MCObjectStreamer::emitFrames(MCAsmBackend *MAB) {
if (!getNumFrameInfos())
return;
@@ -191,13 +188,13 @@ MCFragment *MCObjectStreamer::getCurrentFragment() const {
return nullptr;
}
-static bool CanReuseDataFragment(const MCDataFragment &F,
+static bool canReuseDataFragment(const MCDataFragment &F,
const MCAssembler &Assembler,
const MCSubtargetInfo *STI) {
if (!F.hasInstructions())
return true;
// When bundling is enabled, we don't want to add data to a fragment that
- // already has instructions (see MCELFStreamer::EmitInstToData for details)
+ // already has instructions (see MCELFStreamer::emitInstToData for details)
if (Assembler.isBundlingEnabled())
return Assembler.getRelaxAll();
// If the subtarget is changed mid fragment we start a new fragment to record
@@ -208,7 +205,7 @@ static bool CanReuseDataFragment(const MCDataFragment &F,
MCDataFragment *
MCObjectStreamer::getOrCreateDataFragment(const MCSubtargetInfo *STI) {
MCDataFragment *F = dyn_cast_or_null<MCDataFragment>(getCurrentFragment());
- if (!F || !CanReuseDataFragment(*F, *Assembler, STI)) {
+ if (!F || !canReuseDataFragment(*F, *Assembler, STI)) {
F = new MCDataFragment();
insert(F);
}
@@ -219,15 +216,15 @@ void MCObjectStreamer::visitUsedSymbol(const MCSymbol &Sym) {
Assembler->registerSymbol(Sym);
}
-void MCObjectStreamer::EmitCFISections(bool EH, bool Debug) {
- MCStreamer::EmitCFISections(EH, Debug);
+void MCObjectStreamer::emitCFISections(bool EH, bool Debug) {
+ MCStreamer::emitCFISections(EH, Debug);
EmitEHFrame = EH;
EmitDebugFrame = Debug;
}
-void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+void MCObjectStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
- MCStreamer::EmitValueImpl(Value, Size, Loc);
+ MCStreamer::emitValueImpl(Value, Size, Loc);
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -241,7 +238,7 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
Loc, "value evaluated as " + Twine(AbsValue) + " is out of range.");
return;
}
- EmitIntValue(AbsValue, Size);
+ emitIntValue(AbsValue, Size);
return;
}
DF->getFixups().push_back(
@@ -250,25 +247,25 @@ void MCObjectStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
DF->getContents().resize(DF->getContents().size() + Size, 0);
}
-MCSymbol *MCObjectStreamer::EmitCFILabel() {
+MCSymbol *MCObjectStreamer::emitCFILabel() {
MCSymbol *Label = getContext().createTempSymbol("cfi", true);
- EmitLabel(Label);
+ emitLabel(Label);
return Label;
}
-void MCObjectStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+void MCObjectStreamer::emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
// We need to create a local symbol to avoid relocations.
Frame.Begin = getContext().createTempSymbol();
- EmitLabel(Frame.Begin);
+ emitLabel(Frame.Begin);
}
-void MCObjectStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+void MCObjectStreamer::emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
Frame.End = getContext().createTempSymbol();
- EmitLabel(Frame.End);
+ emitLabel(Frame.End);
}
-void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
- MCStreamer::EmitLabel(Symbol, Loc);
+void MCObjectStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::emitLabel(Symbol, Loc);
getAssembler().registerSymbol(*Symbol);
@@ -291,11 +288,11 @@ void MCObjectStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
// Emit a label at a previously emitted fragment/offset position. This must be
// within the currently-active section.
-void MCObjectStreamer::EmitLabelAtPos(MCSymbol *Symbol, SMLoc Loc,
+void MCObjectStreamer::emitLabelAtPos(MCSymbol *Symbol, SMLoc Loc,
MCFragment *F, uint64_t Offset) {
assert(F->getParent() == getCurrentSectionOnly());
- MCStreamer::EmitLabel(Symbol, Loc);
+ MCStreamer::emitLabel(Symbol, Loc);
getAssembler().registerSymbol(*Symbol);
auto *DF = dyn_cast_or_null<MCDataFragment>(F);
Symbol->setOffset(Offset);
@@ -309,30 +306,30 @@ void MCObjectStreamer::EmitLabelAtPos(MCSymbol *Symbol, SMLoc Loc,
}
}
-void MCObjectStreamer::EmitULEB128Value(const MCExpr *Value) {
+void MCObjectStreamer::emitULEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
- EmitULEB128IntValue(IntValue);
+ emitULEB128IntValue(IntValue);
return;
}
insert(new MCLEBFragment(*Value, false));
}
-void MCObjectStreamer::EmitSLEB128Value(const MCExpr *Value) {
+void MCObjectStreamer::emitSLEB128Value(const MCExpr *Value) {
int64_t IntValue;
if (Value->evaluateAsAbsolute(IntValue, getAssemblerPtr())) {
- EmitSLEB128IntValue(IntValue);
+ emitSLEB128IntValue(IntValue);
return;
}
insert(new MCLEBFragment(*Value, true));
}
-void MCObjectStreamer::EmitWeakReference(MCSymbol *Alias,
+void MCObjectStreamer::emitWeakReference(MCSymbol *Alias,
const MCSymbol *Symbol) {
report_fatal_error("This file format doesn't support weak aliases.");
}
-void MCObjectStreamer::ChangeSection(MCSection *Section,
+void MCObjectStreamer::changeSection(MCSection *Section,
const MCExpr *Subsection) {
changeSectionImpl(Section, Subsection);
}
@@ -356,25 +353,32 @@ bool MCObjectStreamer::changeSectionImpl(MCSection *Section,
return Created;
}
-void MCObjectStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+void MCObjectStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
getAssembler().registerSymbol(*Symbol);
- MCStreamer::EmitAssignment(Symbol, Value);
+ MCStreamer::emitAssignment(Symbol, Value);
}
bool MCObjectStreamer::mayHaveInstructions(MCSection &Sec) const {
return Sec.hasInstructions();
}
-void MCObjectStreamer::EmitInstruction(const MCInst &Inst,
+void MCObjectStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- getAssembler().getBackend().alignBranchesBegin(*this, Inst);
- EmitInstructionImpl(Inst, STI);
- getAssembler().getBackend().alignBranchesEnd(*this, Inst);
+ const MCSection &Sec = *getCurrentSectionOnly();
+ if (Sec.isVirtualSection()) {
+ getContext().reportError(Inst.getLoc(), Twine(Sec.getVirtualSectionKind()) +
+ " section '" + Sec.getName() +
+ "' cannot have instructions");
+ return;
+ }
+ getAssembler().getBackend().emitInstructionBegin(*this, Inst);
+ emitInstructionImpl(Inst, STI);
+ getAssembler().getBackend().emitInstructionEnd(*this, Inst);
}
-void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
+void MCObjectStreamer::emitInstructionImpl(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- MCStreamer::EmitInstruction(Inst, STI);
+ MCStreamer::emitInstruction(Inst, STI);
MCSection *Sec = getCurrentSectionOnly();
Sec->setHasInstructions(true);
@@ -385,8 +389,10 @@ void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
// If this instruction doesn't need relaxation, just emit it as data.
MCAssembler &Assembler = getAssembler();
- if (!Assembler.getBackend().mayNeedRelaxation(Inst, STI)) {
- EmitInstToData(Inst, STI);
+ MCAsmBackend &Backend = Assembler.getBackend();
+ if (!(Backend.mayNeedRelaxation(Inst, STI) ||
+ Backend.allowEnhancedRelaxation())) {
+ emitInstToData(Inst, STI);
return;
}
@@ -397,19 +403,18 @@ void MCObjectStreamer::EmitInstructionImpl(const MCInst &Inst,
// fragment.
if (Assembler.getRelaxAll() ||
(Assembler.isBundlingEnabled() && Sec->isBundleLocked())) {
- MCInst Relaxed;
- getAssembler().getBackend().relaxInstruction(Inst, STI, Relaxed);
- while (getAssembler().getBackend().mayNeedRelaxation(Relaxed, STI))
- getAssembler().getBackend().relaxInstruction(Relaxed, STI, Relaxed);
- EmitInstToData(Relaxed, STI);
+ MCInst Relaxed = Inst;
+ while (Backend.mayNeedRelaxation(Relaxed, STI))
+ Backend.relaxInstruction(Relaxed, STI);
+ emitInstToData(Relaxed, STI);
return;
}
// Otherwise emit to a separate fragment.
- EmitInstToFragment(Inst, STI);
+ emitInstToFragment(Inst, STI);
}
-void MCObjectStreamer::EmitInstToFragment(const MCInst &Inst,
+void MCObjectStreamer::emitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
if (getAssembler().getRelaxAll() && getAssembler().isBundlingEnabled())
llvm_unreachable("All instructions should have already been relaxed");
@@ -431,19 +436,19 @@ static const char *const BundlingNotImplementedMsg =
"Aligned bundling is not implemented for this object format";
#endif
-void MCObjectStreamer::EmitBundleAlignMode(unsigned AlignPow2) {
+void MCObjectStreamer::emitBundleAlignMode(unsigned AlignPow2) {
llvm_unreachable(BundlingNotImplementedMsg);
}
-void MCObjectStreamer::EmitBundleLock(bool AlignToEnd) {
+void MCObjectStreamer::emitBundleLock(bool AlignToEnd) {
llvm_unreachable(BundlingNotImplementedMsg);
}
-void MCObjectStreamer::EmitBundleUnlock() {
+void MCObjectStreamer::emitBundleUnlock() {
llvm_unreachable(BundlingNotImplementedMsg);
}
-void MCObjectStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+void MCObjectStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
unsigned Isa,
unsigned Discriminator,
@@ -452,8 +457,8 @@ void MCObjectStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
// first one gets a line entry.
MCDwarfLineEntry::Make(this, getCurrentSectionOnly());
- this->MCStreamer::EmitDwarfLocDirective(FileNo, Line, Column, Flags,
- Isa, Discriminator, FileName);
+ this->MCStreamer::emitDwarfLocDirective(FileNo, Line, Column, Flags, Isa,
+ Discriminator, FileName);
}
static const MCExpr *buildSymbolDiff(MCObjectStreamer &OS, const MCSymbol *A,
@@ -472,16 +477,16 @@ static void emitDwarfSetLineAddr(MCObjectStreamer &OS,
int64_t LineDelta, const MCSymbol *Label,
int PointerSize) {
// emit the sequence to set the address
- OS.EmitIntValue(dwarf::DW_LNS_extended_op, 1);
- OS.EmitULEB128IntValue(PointerSize + 1);
- OS.EmitIntValue(dwarf::DW_LNE_set_address, 1);
- OS.EmitSymbolValue(Label, PointerSize);
+ OS.emitIntValue(dwarf::DW_LNS_extended_op, 1);
+ OS.emitULEB128IntValue(PointerSize + 1);
+ OS.emitIntValue(dwarf::DW_LNE_set_address, 1);
+ OS.emitSymbolValue(Label, PointerSize);
// emit the sequence for the LineDelta (from 1) and a zero address delta.
MCDwarfLineAddr::Emit(&OS, Params, LineDelta, 0);
}
-void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
+void MCObjectStreamer::emitDwarfAdvanceLineAddr(int64_t LineDelta,
const MCSymbol *LastLabel,
const MCSymbol *Label,
unsigned PointerSize) {
@@ -500,7 +505,7 @@ void MCObjectStreamer::EmitDwarfAdvanceLineAddr(int64_t LineDelta,
insert(new MCDwarfLineAddrFragment(LineDelta, *AddrDelta));
}
-void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
+void MCObjectStreamer::emitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
const MCSymbol *Label) {
const MCExpr *AddrDelta = buildSymbolDiff(*this, Label, LastLabel);
int64_t Res;
@@ -511,7 +516,7 @@ void MCObjectStreamer::EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel,
insert(new MCDwarfCallFrameFragment(*AddrDelta));
}
-void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
+void MCObjectStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
unsigned Line, unsigned Column,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) {
@@ -521,31 +526,31 @@ void MCObjectStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
// Emit a label at the current position and record it in the CodeViewContext.
MCSymbol *LineSym = getContext().createTempSymbol();
- EmitLabel(LineSym);
+ emitLabel(LineSym);
getContext().getCVContext().recordCVLoc(getContext(), LineSym, FunctionId,
FileNo, Line, Column, PrologueEnd,
IsStmt);
}
-void MCObjectStreamer::EmitCVLinetableDirective(unsigned FunctionId,
+void MCObjectStreamer::emitCVLinetableDirective(unsigned FunctionId,
const MCSymbol *Begin,
const MCSymbol *End) {
getContext().getCVContext().emitLineTableForFunction(*this, FunctionId, Begin,
End);
- this->MCStreamer::EmitCVLinetableDirective(FunctionId, Begin, End);
+ this->MCStreamer::emitCVLinetableDirective(FunctionId, Begin, End);
}
-void MCObjectStreamer::EmitCVInlineLinetableDirective(
+void MCObjectStreamer::emitCVInlineLinetableDirective(
unsigned PrimaryFunctionId, unsigned SourceFileId, unsigned SourceLineNum,
const MCSymbol *FnStartSym, const MCSymbol *FnEndSym) {
getContext().getCVContext().emitInlineLineTableForFunction(
*this, PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym,
FnEndSym);
- this->MCStreamer::EmitCVInlineLinetableDirective(
+ this->MCStreamer::emitCVInlineLinetableDirective(
PrimaryFunctionId, SourceFileId, SourceLineNum, FnStartSym, FnEndSym);
}
-void MCObjectStreamer::EmitCVDefRangeDirective(
+void MCObjectStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
StringRef FixedSizePortion) {
MCFragment *Frag =
@@ -553,28 +558,28 @@ void MCObjectStreamer::EmitCVDefRangeDirective(
// Attach labels that were pending before we created the defrange fragment to
// the beginning of the new fragment.
flushPendingLabels(Frag, 0);
- this->MCStreamer::EmitCVDefRangeDirective(Ranges, FixedSizePortion);
+ this->MCStreamer::emitCVDefRangeDirective(Ranges, FixedSizePortion);
}
-void MCObjectStreamer::EmitCVStringTableDirective() {
+void MCObjectStreamer::emitCVStringTableDirective() {
getContext().getCVContext().emitStringTable(*this);
}
-void MCObjectStreamer::EmitCVFileChecksumsDirective() {
+void MCObjectStreamer::emitCVFileChecksumsDirective() {
getContext().getCVContext().emitFileChecksums(*this);
}
-void MCObjectStreamer::EmitCVFileChecksumOffsetDirective(unsigned FileNo) {
+void MCObjectStreamer::emitCVFileChecksumOffsetDirective(unsigned FileNo) {
getContext().getCVContext().emitFileChecksumOffset(*this, FileNo);
}
-void MCObjectStreamer::EmitBytes(StringRef Data) {
+void MCObjectStreamer::emitBytes(StringRef Data) {
MCDwarfLineEntry::Make(this, getCurrentSectionOnly());
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
DF->getContents().append(Data.begin(), Data.end());
}
-void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment,
+void MCObjectStreamer::emitValueToAlignment(unsigned ByteAlignment,
int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {
@@ -588,9 +593,9 @@ void MCObjectStreamer::EmitValueToAlignment(unsigned ByteAlignment,
CurSec->setAlignment(Align(ByteAlignment));
}
-void MCObjectStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+void MCObjectStreamer::emitCodeAlignment(unsigned ByteAlignment,
unsigned MaxBytesToEmit) {
- EmitValueToAlignment(ByteAlignment, 0, 1, MaxBytesToEmit);
+ emitValueToAlignment(ByteAlignment, 0, 1, MaxBytesToEmit);
cast<MCAlignFragment>(getCurrentFragment())->setEmitNops(true);
}
@@ -601,7 +606,7 @@ void MCObjectStreamer::emitValueToOffset(const MCExpr *Offset,
}
// Associate DTPRel32 fixup with data and resize data area
-void MCObjectStreamer::EmitDTPRel32Value(const MCExpr *Value) {
+void MCObjectStreamer::emitDTPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -611,7 +616,7 @@ void MCObjectStreamer::EmitDTPRel32Value(const MCExpr *Value) {
}
// Associate DTPRel64 fixup with data and resize data area
-void MCObjectStreamer::EmitDTPRel64Value(const MCExpr *Value) {
+void MCObjectStreamer::emitDTPRel64Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -621,7 +626,7 @@ void MCObjectStreamer::EmitDTPRel64Value(const MCExpr *Value) {
}
// Associate TPRel32 fixup with data and resize data area
-void MCObjectStreamer::EmitTPRel32Value(const MCExpr *Value) {
+void MCObjectStreamer::emitTPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -631,7 +636,7 @@ void MCObjectStreamer::EmitTPRel32Value(const MCExpr *Value) {
}
// Associate TPRel64 fixup with data and resize data area
-void MCObjectStreamer::EmitTPRel64Value(const MCExpr *Value) {
+void MCObjectStreamer::emitTPRel64Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -641,7 +646,7 @@ void MCObjectStreamer::EmitTPRel64Value(const MCExpr *Value) {
}
// Associate GPRel32 fixup with data and resize data area
-void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
+void MCObjectStreamer::emitGPRel32Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -651,7 +656,7 @@ void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) {
}
// Associate GPRel64 fixup with data and resize data area
-void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
+void MCObjectStreamer::emitGPRel64Value(const MCExpr *Value) {
MCDataFragment *DF = getOrCreateDataFragment();
flushPendingLabels(DF, DF->getContents().size());
@@ -660,12 +665,13 @@ void MCObjectStreamer::EmitGPRel64Value(const MCExpr *Value) {
DF->getContents().resize(DF->getContents().size() + 8, 0);
}
-bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
- const MCExpr *Expr, SMLoc Loc,
- const MCSubtargetInfo &STI) {
+Optional<std::pair<bool, std::string>>
+MCObjectStreamer::emitRelocDirective(const MCExpr &Offset, StringRef Name,
+ const MCExpr *Expr, SMLoc Loc,
+ const MCSubtargetInfo &STI) {
Optional<MCFixupKind> MaybeKind = Assembler->getBackend().getFixupKind(Name);
if (!MaybeKind.hasValue())
- return true;
+ return std::make_pair(true, std::string("unknown relocation name"));
MCFixupKind Kind = *MaybeKind;
@@ -676,27 +682,33 @@ bool MCObjectStreamer::EmitRelocDirective(const MCExpr &Offset, StringRef Name,
MCDataFragment *DF = getOrCreateDataFragment(&STI);
flushPendingLabels(DF, DF->getContents().size());
- int64_t OffsetValue;
- if (Offset.evaluateAsAbsolute(OffsetValue)) {
- if (OffsetValue < 0)
- llvm_unreachable(".reloc offset is negative");
- DF->getFixups().push_back(MCFixup::create(OffsetValue, Expr, Kind, Loc));
- return false;
+ MCValue OffsetVal;
+ if (!Offset.evaluateAsRelocatable(OffsetVal, nullptr, nullptr))
+ return std::make_pair(false,
+ std::string(".reloc offset is not relocatable"));
+ if (OffsetVal.isAbsolute()) {
+ if (OffsetVal.getConstant() < 0)
+ return std::make_pair(false, std::string(".reloc offset is negative"));
+ DF->getFixups().push_back(
+ MCFixup::create(OffsetVal.getConstant(), Expr, Kind, Loc));
+ return None;
}
+ if (OffsetVal.getSymB())
+ return std::make_pair(false,
+ std::string(".reloc offset is not representable"));
- if (Offset.getKind() != llvm::MCExpr::SymbolRef)
- llvm_unreachable(".reloc offset is not absolute nor a label");
-
- const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(Offset);
+ const MCSymbolRefExpr &SRE = cast<MCSymbolRefExpr>(*OffsetVal.getSymA());
if (SRE.getSymbol().isDefined()) {
- DF->getFixups().push_back(MCFixup::create(SRE.getSymbol().getOffset(),
- Expr, Kind, Loc));
- return false;
+ // FIXME SRE.getSymbol() may not be relative to DF.
+ DF->getFixups().push_back(
+ MCFixup::create(SRE.getSymbol().getOffset() + OffsetVal.getConstant(),
+ Expr, Kind, Loc));
+ return None;
}
PendingFixups.emplace_back(&SRE.getSymbol(), DF,
- MCFixup::create(-1, Expr, Kind, Loc));
- return false;
+ MCFixup::create(-1, Expr, Kind, Loc));
+ return None;
}
void MCObjectStreamer::emitFill(const MCExpr &NumBytes, uint64_t FillValue,
@@ -723,9 +735,9 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
int64_t NonZeroSize = Size > 4 ? 4 : Size;
Expr &= ~0ULL >> (64 - NonZeroSize * 8);
for (uint64_t i = 0, e = IntNumValues; i != e; ++i) {
- EmitIntValue(Expr, NonZeroSize);
+ emitIntValue(Expr, NonZeroSize);
if (NonZeroSize < Size)
- EmitIntValue(0, Size - NonZeroSize);
+ emitIntValue(0, Size - NonZeroSize);
}
return;
}
@@ -738,20 +750,20 @@ void MCObjectStreamer::emitFill(const MCExpr &NumValues, int64_t Size,
insert(new MCFillFragment(Expr, Size, NumValues, Loc));
}
-void MCObjectStreamer::EmitFileDirective(StringRef Filename) {
+void MCObjectStreamer::emitFileDirective(StringRef Filename) {
getAssembler().addFileName(Filename);
}
-void MCObjectStreamer::EmitAddrsig() {
+void MCObjectStreamer::emitAddrsig() {
getAssembler().getWriter().emitAddrsigSection();
}
-void MCObjectStreamer::EmitAddrsigSym(const MCSymbol *Sym) {
+void MCObjectStreamer::emitAddrsigSym(const MCSymbol *Sym) {
getAssembler().registerSymbol(*Sym);
getAssembler().getWriter().addAddrsigSymbol(Sym);
}
-void MCObjectStreamer::FinishImpl() {
+void MCObjectStreamer::finishImpl() {
getContext().RemapDebugPaths();
// If we are generating dwarf for assembly source files dump out the sections.
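The hunks above change emitRelocDirective to return Optional<std::pair<bool, std::string>>: None means the fixup was recorded, otherwise the pair carries whether to report the error at the relocation name (true) or at the offset expression (false), plus the message. A minimal sketch of a caller consuming that result; Streamer, NameLoc, OffsetLoc and Error() are illustrative stand-ins, and the real call site appears in the AsmParser.cpp hunk further down:

// Sketch only: handle the new Optional error result of emitRelocDirective.
if (Optional<std::pair<bool, std::string>> Err =
        Streamer.emitRelocDirective(*Offset, Name, Expr, DirectiveLoc, STI))
  return Error(Err->first ? NameLoc : OffsetLoc, Err->second);
return false; // None: the .reloc fixup was emitted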
diff --git a/llvm/lib/MC/MCParser/AsmLexer.cpp b/llvm/lib/MC/MCParser/AsmLexer.cpp
index 9155ae05d29d..5a571c7c0c0e 100644
--- a/llvm/lib/MC/MCParser/AsmLexer.cpp
+++ b/llvm/lib/MC/MCParser/AsmLexer.cpp
@@ -36,7 +36,8 @@ AsmLexer::AsmLexer(const MCAsmInfo &MAI) : MAI(MAI) {
AsmLexer::~AsmLexer() = default;
-void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
+void AsmLexer::setBuffer(StringRef Buf, const char *ptr,
+ bool EndStatementAtEOF) {
CurBuf = Buf;
if (ptr)
@@ -45,6 +46,7 @@ void AsmLexer::setBuffer(StringRef Buf, const char *ptr) {
CurPtr = CurBuf.begin();
TokStart = nullptr;
+ this->EndStatementAtEOF = EndStatementAtEOF;
}
/// ReturnError - Set the error to the specified string at the specified
@@ -584,7 +586,7 @@ AsmToken AsmLexer::LexToken() {
// If we're missing a newline at EOF, make sure we still get an
// EndOfStatement token before the Eof token.
- if (CurChar == EOF && !IsAtStartOfStatement) {
+ if (CurChar == EOF && !IsAtStartOfStatement && EndStatementAtEOF) {
IsAtStartOfLine = true;
IsAtStartOfStatement = true;
return AsmToken(AsmToken::EndOfStatement, StringRef(TokStart, 1));
@@ -594,15 +596,24 @@ AsmToken AsmLexer::LexToken() {
IsAtStartOfStatement = false;
switch (CurChar) {
default:
- // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
- if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
- return LexIdentifier();
+ if (MAI.doesAllowSymbolAtNameStart()) {
+ // Handle Microsoft-style identifier: [a-zA-Z_$.@?][a-zA-Z0-9_$.@?]*
+ if (!isDigit(CurChar) &&
+ IsIdentifierChar(CurChar, MAI.doesAllowAtInName()))
+ return LexIdentifier();
+ } else {
+ // Handle identifier: [a-zA-Z_.][a-zA-Z0-9_$.@]*
+ if (isalpha(CurChar) || CurChar == '_' || CurChar == '.')
+ return LexIdentifier();
+ }
// Unknown character, emit an error.
return ReturnError(TokStart, "invalid character in input");
case EOF:
- IsAtStartOfLine = true;
- IsAtStartOfStatement = true;
+ if (EndStatementAtEOF) {
+ IsAtStartOfLine = true;
+ IsAtStartOfStatement = true;
+ }
return AsmToken(AsmToken::Eof, StringRef(TokStart, 0));
case 0:
case ' ':
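setBuffer gains an EndStatementAtEOF flag above, and LexToken only synthesizes the trailing EndOfStatement (and resets the start-of-line/statement state) when that flag is set. A minimal sketch of lexing a sub-buffer with the flag cleared, assuming an MCAsmInfo is already available; consume() is a hypothetical sink for the tokens:

// Sketch only: with EndStatementAtEOF=false, no EndOfStatement token is
// synthesized before Eof when the buffer lacks a trailing newline.
AsmLexer Lexer(MAI);
Lexer.setBuffer("mov eax, 1", /*ptr=*/nullptr, /*EndStatementAtEOF=*/false);
for (AsmToken Tok = Lexer.Lex(); Tok.isNot(AsmToken::Eof); Tok = Lexer.Lex())
  consume(Tok); // hypothetical; Eof arrives directly after the last operand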
diff --git a/llvm/lib/MC/MCParser/AsmParser.cpp b/llvm/lib/MC/MCParser/AsmParser.cpp
index 94a44c1f93b1..c05f26cbdda5 100644
--- a/llvm/lib/MC/MCParser/AsmParser.cpp
+++ b/llvm/lib/MC/MCParser/AsmParser.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This class implements the parser for assembly files.
+// This class implements a parser for assembly files, using gas-like syntax.
//
//===----------------------------------------------------------------------===//
@@ -74,9 +74,7 @@ using namespace llvm;
MCAsmParserSemaCallback::~MCAsmParserSemaCallback() = default;
-static cl::opt<unsigned> AsmMacroMaxNestingDepth(
- "asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
- cl::desc("The maximum nesting depth allowed for assembly macros."));
+extern cl::opt<unsigned> AsmMacroMaxNestingDepth;
namespace {
@@ -176,7 +174,7 @@ private:
bool IsDarwin = false;
/// Are we parsing ms-style inline assembly?
- bool ParsingInlineAsm = false;
+ bool ParsingMSInlineAsm = false;
/// Did we already inform the user about inconsistent MD5 usage?
bool ReportedInconsistentMD5 = false;
@@ -199,7 +197,7 @@ public:
}
void addAliasForDirective(StringRef Directive, StringRef Alias) override {
- DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
+ DirectiveKindMap[Directive.lower()] = DirectiveKindMap[Alias.lower()];
}
/// @name MCAsmParser Interface
@@ -228,13 +226,13 @@ public:
const AsmToken &Lex() override;
- void setParsingInlineAsm(bool V) override {
- ParsingInlineAsm = V;
+ void setParsingMSInlineAsm(bool V) override {
+ ParsingMSInlineAsm = V;
// When parsing MS inline asm, we must lex 0b1101 and 0ABCH as binary and
// hex integer literals.
Lexer.setLexMasmIntegers(V);
}
- bool isParsingInlineAsm() override { return ParsingInlineAsm; }
+ bool isParsingMSInlineAsm() override { return ParsingMSInlineAsm; }
bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
unsigned &NumOutputs, unsigned &NumInputs,
@@ -269,7 +267,7 @@ private:
bool parseStatement(ParseStatementInfo &Info,
MCAsmParserSemaCallback *SI);
bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
- bool parseCppHashLineFilenameComment(SMLoc L);
+ bool parseCppHashLineFilenameComment(SMLoc L, bool SaveLocInfo = true);
void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
ArrayRef<MCAsmMacroParameter> Parameters);
@@ -645,6 +643,7 @@ private:
bool parseDirectiveElse(SMLoc DirectiveLoc); // ".else"
bool parseDirectiveEndIf(SMLoc DirectiveLoc); // .endif
bool parseEscapedString(std::string &Data) override;
+ bool parseAngleBracketString(std::string &Data) override;
const MCExpr *applyModifierToExpr(const MCExpr *E,
MCSymbolRefExpr::VariantKind Variant);
@@ -814,7 +813,7 @@ bool AsmParser::processIncbinFile(const std::string &Filename, int64_t Skip,
return Warning(Loc, "negative count has no effect");
Bytes = Bytes.take_front(Res);
}
- getStreamer().EmitBytes(Bytes);
+ getStreamer().emitBytes(Bytes);
return false;
}
@@ -875,7 +874,7 @@ bool AsmParser::enabledGenDwarfForAssembly() {
/*Cksum=*/None, /*Source=*/None);
const MCDwarfFile &RootFile =
getContext().getMCDwarfLineTable(/*CUID=*/0).getRootFile();
- getContext().setGenDwarfFileNumber(getStreamer().EmitDwarfFileDirective(
+ getContext().setGenDwarfFileNumber(getStreamer().emitDwarfFileDirective(
/*CUID=*/0, getContext().getCompilationDir(), RootFile.Name,
RootFile.Checksum, RootFile.Source));
}
@@ -902,7 +901,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
MCSection *Sec = getStreamer().getCurrentSectionOnly();
if (!Sec->getBeginSymbol()) {
MCSymbol *SectionStartSym = getContext().createTempSymbol();
- getStreamer().EmitLabel(SectionStartSym);
+ getStreamer().emitLabel(SectionStartSym);
Sec->setBeginSymbol(SectionStartSym);
}
bool InsertResult = getContext().addGenDwarfSection(Sec);
@@ -995,7 +994,7 @@ bool AsmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
}
bool AsmParser::checkForValidSection() {
- if (!ParsingInlineAsm && !getStreamer().getCurrentSectionOnly()) {
+ if (!ParsingMSInlineAsm && !getStreamer().getCurrentSectionOnly()) {
Out.InitSections(false);
return Error(getTok().getLoc(),
"expected section directive before assembly directive");
@@ -1097,7 +1096,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
// This is a '$' reference, which references the current PC. Emit a
// temporary label to the streamer and refer to it.
MCSymbol *Sym = Ctx.createTempSymbol();
- Out.EmitLabel(Sym);
+ Out.emitLabel(Sym);
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
getContext());
EndLoc = FirstTokenLoc;
@@ -1223,7 +1222,7 @@ bool AsmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
// This is a '.' reference, which references the current PC. Emit a
// temporary label to the streamer and refer to it.
MCSymbol *Sym = Ctx.createTempSymbol();
- Out.EmitLabel(Sym);
+ Out.emitLabel(Sym);
Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
EndLoc = Lexer.getTok().getEndLoc();
Lex(); // Eat identifier.
@@ -1365,7 +1364,7 @@ AsmParser::applyModifierToExpr(const MCExpr *E,
/// implementation. GCC does not fully support this feature and so we will not
/// support it.
/// TODO: Adding single quote as a string.
-static bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
+static bool isAngleBracketString(SMLoc &StrLoc, SMLoc &EndLoc) {
assert((StrLoc.getPointer() != nullptr) &&
"Argument to the function cannot be a NULL value");
const char *CharPtr = StrLoc.getPointer();
@@ -1383,7 +1382,7 @@ static bool isAltmacroString(SMLoc &StrLoc, SMLoc &EndLoc) {
}
/// Creates a string with the '!' escape characters removed.
-static std::string altMacroString(StringRef AltMacroStr) {
+static std::string angleBracketString(StringRef AltMacroStr) {
std::string Res;
for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
if (AltMacroStr[Pos] == '!')
@@ -1700,7 +1699,9 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
StringRef IDVal;
int64_t LocalLabelVal = -1;
if (Lexer.is(AsmToken::HashDirective))
- return parseCppHashLineFilenameComment(IDLoc);
+ return parseCppHashLineFilenameComment(IDLoc,
+ !isInsideMacroInstantiation());
+
// Allow an integer followed by a ':' as a directional local label.
if (Lexer.is(AsmToken::Integer)) {
LocalLabelVal = getTok().getIntVal();
@@ -1750,7 +1751,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// have to do this so that .endif isn't skipped in a ".if 0" block for
// example.
StringMap<DirectiveKind>::const_iterator DirKindIt =
- DirectiveKindMap.find(IDVal);
+ DirectiveKindMap.find(IDVal.lower());
DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
? DK_NO_DIRECTIVE
@@ -1822,7 +1823,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// implicitly marked as external.
MCSymbol *Sym;
if (LocalLabelVal == -1) {
- if (ParsingInlineAsm && SI) {
+ if (ParsingMSInlineAsm && SI) {
StringRef RewrittenLabel =
SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
assert(!RewrittenLabel.empty() &&
@@ -1853,8 +1854,8 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
getTargetParser().doBeforeLabelEmit(Sym);
// Emit the label.
- if (!getTargetParser().isParsingInlineAsm())
- Out.EmitLabel(Sym, IDLoc);
+ if (!getTargetParser().isParsingMSInlineAsm())
+ Out.emitLabel(Sym, IDLoc);
// If we are generating dwarf for assembly source files then gather the
// info to make a dwarf label entry for this label if needed.
@@ -2194,15 +2195,15 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
}
// __asm _emit or __asm __emit
- if (ParsingInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
- IDVal == "_EMIT" || IDVal == "__EMIT"))
+ if (ParsingMSInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
+ IDVal == "_EMIT" || IDVal == "__EMIT"))
return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());
// __asm align
- if (ParsingInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
+ if (ParsingMSInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
return parseDirectiveMSAlign(IDLoc, Info);
- if (ParsingInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
+ if (ParsingMSInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
if (checkForValidSection())
return true;
@@ -2249,7 +2250,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
// current Dwarf File is for the CppHashFilename if not then emit the
// Dwarf File table for it and adjust the line number for the .loc.
if (!CppHashInfo.Filename.empty()) {
- unsigned FileNumber = getStreamer().EmitDwarfFileDirective(
+ unsigned FileNumber = getStreamer().emitDwarfFileDirective(
0, StringRef(), CppHashInfo.Filename);
getContext().setGenDwarfFileNumber(FileNumber);
@@ -2258,7 +2259,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
}
- getStreamer().EmitDwarfLocDirective(
+ getStreamer().emitDwarfLocDirective(
getContext().getGenDwarfFileNumber(), Line, 0,
DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
StringRef());
@@ -2269,7 +2270,7 @@ bool AsmParser::parseStatement(ParseStatementInfo &Info,
uint64_t ErrorInfo;
if (getTargetParser().MatchAndEmitInstruction(
IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
- getTargetParser().isParsingInlineAsm()))
+ getTargetParser().isParsingMSInlineAsm()))
return true;
}
return false;
@@ -2295,7 +2296,7 @@ AsmParser::parseCurlyBlockScope(SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
/// parseCppHashLineFilenameComment as this:
/// ::= # number "filename"
-bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
+bool AsmParser::parseCppHashLineFilenameComment(SMLoc L, bool SaveLocInfo) {
Lex(); // Eat the hash token.
// The lexer only ever emits a HashDirective token once it is fully formed
// and the checking has already been done, so this is an internal error.
@@ -2308,6 +2309,9 @@ bool AsmParser::parseCppHashLineFilenameComment(SMLoc L) {
StringRef Filename = getTok().getString();
Lex();
+ if (!SaveLocInfo)
+ return false;
+
// Get rid of the enclosing quotes.
Filename = Filename.substr(1, Filename.size() - 2);
@@ -2358,7 +2362,7 @@ void AsmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
// Use the CppHashFilename and calculate a line number based on the
// CppHashInfo.Loc and CppHashInfo.LineNumber relative to this Diag's SMLoc
// for the diagnostic.
- const std::string &Filename = Parser->CppHashInfo.Filename;
+ const std::string &Filename = std::string(Parser->CppHashInfo.Filename);
int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
int CppHashLocLineNo =
@@ -2497,7 +2501,7 @@ bool AsmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
// is considered altMacroString!!!
else if (AltMacroMode && Token.getString().front() == '<' &&
Token.is(AsmToken::String)) {
- OS << altMacroString(Token.getStringContents());
+ OS << angleBracketString(Token.getStringContents());
}
// We expect no quotes around the string's contents when
// parsing for varargs.
@@ -2690,7 +2694,7 @@ bool AsmParser::parseMacroArguments(const MCAsmMacro *M,
StringRef(StrChar, EndChar - StrChar), Value);
FA.Value.push_back(newToken);
} else if (AltMacroMode && Lexer.is(AsmToken::Less) &&
- isAltmacroString(StrLoc, EndLoc)) {
+ isAngleBracketString(StrLoc, EndLoc)) {
const char *StrChar = StrLoc.getPointer();
const char *EndChar = EndLoc.getPointer();
jumpToLoc(EndLoc, CurBuffer);
@@ -2831,9 +2835,9 @@ bool AsmParser::parseAssignment(StringRef Name, bool allow_redef,
}
// Do the assignment.
- Out.EmitAssignment(Sym, Value);
+ Out.emitAssignment(Sym, Value);
if (NoDeadStrip)
- Out.EmitSymbolAttribute(Sym, MCSA_NoDeadStrip);
+ Out.emitSymbolAttribute(Sym, MCSA_NoDeadStrip);
return false;
}
@@ -2855,18 +2859,18 @@ bool AsmParser::parseIdentifier(StringRef &Res) {
AsmToken Buf[1];
Lexer.peekTokens(Buf, false);
- if (Buf[0].isNot(AsmToken::Identifier))
+ if (Buf[0].isNot(AsmToken::Identifier) && Buf[0].isNot(AsmToken::Integer))
return true;
- // We have a '$' or '@' followed by an identifier, make sure they are adjacent.
+ // We have a '$' or '@' followed by an identifier or integer token, make
+ // sure they are adjacent.
if (PrefixLoc.getPointer() + 1 != Buf[0].getLoc().getPointer())
return true;
// eat $ or @
Lexer.Lex(); // Lexer's Lex guarantees consecutive token.
// Construct the joined identifier and consume the token.
- Res =
- StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
+ Res = StringRef(PrefixLoc.getPointer(), getTok().getString().size() + 1);
Lex(); // Parser Lex to maintain invariants.
return false;
}
@@ -2969,6 +2973,21 @@ bool AsmParser::parseEscapedString(std::string &Data) {
return false;
}
+bool AsmParser::parseAngleBracketString(std::string &Data) {
+ SMLoc EndLoc, StartLoc = getTok().getLoc();
+ if (isAngleBracketString(StartLoc, EndLoc)) {
+ const char *StartChar = StartLoc.getPointer() + 1;
+ const char *EndChar = EndLoc.getPointer() - 1;
+ jumpToLoc(EndLoc, CurBuffer);
+ /// Eat from '<' to '>'
+ Lex();
+
+ Data = angleBracketString(StringRef(StartChar, EndChar - StartChar));
+ return false;
+ }
+ return true;
+}
+
/// parseDirectiveAscii:
/// ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
@@ -2976,9 +2995,9 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
std::string Data;
if (checkForValidSection() || parseEscapedString(Data))
return true;
- getStreamer().EmitBytes(Data);
+ getStreamer().emitBytes(Data);
if (ZeroTerminated)
- getStreamer().EmitBytes(StringRef("\0", 1));
+ getStreamer().emitBytes(StringRef("\0", 1));
return false;
};
@@ -2992,20 +3011,12 @@ bool AsmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
const MCExpr *Offset;
const MCExpr *Expr = nullptr;
- int64_t OffsetValue;
SMLoc OffsetLoc = Lexer.getTok().getLoc();
if (parseExpression(Offset))
return true;
-
- if ((Offset->evaluateAsAbsolute(OffsetValue,
- getStreamer().getAssemblerPtr()) &&
- check(OffsetValue < 0, OffsetLoc, "expression is negative")) ||
- (check(Offset->getKind() != llvm::MCExpr::Constant &&
- Offset->getKind() != llvm::MCExpr::SymbolRef,
- OffsetLoc, "expected non-negative number or a label")) ||
- (parseToken(AsmToken::Comma, "expected comma") ||
- check(getTok().isNot(AsmToken::Identifier), "expected relocation name")))
+ if (parseToken(AsmToken::Comma, "expected comma") ||
+ check(getTok().isNot(AsmToken::Identifier), "expected relocation name"))
return true;
SMLoc NameLoc = Lexer.getTok().getLoc();
@@ -3029,8 +3040,10 @@ bool AsmParser::parseDirectiveReloc(SMLoc DirectiveLoc) {
const MCTargetAsmParser &MCT = getTargetParser();
const MCSubtargetInfo &STI = MCT.getSTI();
- if (getStreamer().EmitRelocDirective(*Offset, Name, Expr, DirectiveLoc, STI))
- return Error(NameLoc, "unknown relocation name");
+ if (Optional<std::pair<bool, std::string>> Err =
+ getStreamer().emitRelocDirective(*Offset, Name, Expr, DirectiveLoc,
+ STI))
+ return Error(Err->first ? NameLoc : OffsetLoc, Err->second);
return false;
}
@@ -3049,9 +3062,9 @@ bool AsmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) {
uint64_t IntValue = MCE->getValue();
if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
return Error(ExprLoc, "out of range literal value");
- getStreamer().EmitIntValue(IntValue, Size);
+ getStreamer().emitIntValue(IntValue, Size);
} else
- getStreamer().EmitValue(Value, Size, ExprLoc);
+ getStreamer().emitValue(Value, Size, ExprLoc);
return false;
};
@@ -3090,11 +3103,11 @@ bool AsmParser::parseDirectiveOctaValue(StringRef IDVal) {
if (parseHexOcta(*this, hi, lo))
return true;
if (MAI.isLittleEndian()) {
- getStreamer().EmitIntValue(lo, 8);
- getStreamer().EmitIntValue(hi, 8);
+ getStreamer().emitInt64(lo);
+ getStreamer().emitInt64(hi);
} else {
- getStreamer().EmitIntValue(hi, 8);
- getStreamer().EmitIntValue(lo, 8);
+ getStreamer().emitInt64(hi);
+ getStreamer().emitInt64(lo);
}
return false;
};
@@ -3153,7 +3166,7 @@ bool AsmParser::parseDirectiveRealValue(StringRef IDVal,
APInt AsInt;
if (checkForValidSection() || parseRealValue(Semantics, AsInt))
return true;
- getStreamer().EmitIntValue(AsInt.getLimitedValue(),
+ getStreamer().emitIntValue(AsInt.getLimitedValue(),
AsInt.getBitWidth() / 8);
return false;
};
@@ -3335,10 +3348,10 @@ bool AsmParser::parseDirectiveAlign(bool IsPow2, unsigned ValueSize) {
bool UseCodeAlign = Section->UseCodeAlign();
if ((!HasFillExpr || Lexer.getMAI().getTextAlignFillValue() == FillExpr) &&
ValueSize == 1 && UseCodeAlign) {
- getStreamer().EmitCodeAlignment(Alignment, MaxBytesToFill);
+ getStreamer().emitCodeAlignment(Alignment, MaxBytesToFill);
} else {
// FIXME: Target specific behavior about how the "extra" bytes are filled.
- getStreamer().EmitValueToAlignment(Alignment, FillExpr, ValueSize,
+ getStreamer().emitValueToAlignment(Alignment, FillExpr, ValueSize,
MaxBytesToFill);
}
@@ -3419,7 +3432,7 @@ bool AsmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
// numberless .file directives. This allows some portability of assembler
// between different object file formats.
if (getContext().getAsmInfo()->hasSingleParameterDotFile())
- getStreamer().EmitFileDirective(Filename);
+ getStreamer().emitFileDirective(Filename);
} else {
// In case there is a -g option as well as debug info from directive .file,
// we turn off the -g option, directly use the existing debug info instead.
@@ -3514,7 +3527,8 @@ bool AsmParser::parseDirectiveLoc() {
Lex();
}
- unsigned Flags = DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0;
+ auto PrevFlags = getContext().getCurrentDwarfLoc().getFlags();
+ unsigned Flags = PrevFlags & DWARF2_FLAG_IS_STMT;
unsigned Isa = 0;
int64_t Discriminator = 0;
@@ -3573,7 +3587,7 @@ bool AsmParser::parseDirectiveLoc() {
if (parseMany(parseLocOp, false /*hasComma*/))
return true;
- getStreamer().EmitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
+ getStreamer().emitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
Isa, Discriminator, StringRef());
return false;
@@ -3787,7 +3801,7 @@ bool AsmParser::parseDirectiveCVLoc() {
if (parseMany(parseOp, false /*hasComma*/))
return true;
- getStreamer().EmitCVLocDirective(FunctionId, FileNumber, LineNumber,
+ getStreamer().emitCVLocDirective(FunctionId, FileNumber, LineNumber,
ColumnPos, PrologueEnd, IsStmt, StringRef(),
DirectiveLoc);
return false;
@@ -3813,7 +3827,7 @@ bool AsmParser::parseDirectiveCVLinetable() {
MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
- getStreamer().EmitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
+ getStreamer().emitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
return false;
}
@@ -3847,7 +3861,7 @@ bool AsmParser::parseDirectiveCVInlineLinetable() {
MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
- getStreamer().EmitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
+ getStreamer().emitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
SourceLineNum, FnStartSym,
FnEndSym);
return false;
@@ -3904,7 +3918,7 @@ bool AsmParser::parseDirectiveCVDefRange() {
codeview::DefRangeRegisterHeader DRHdr;
DRHdr.Register = DRRegister;
DRHdr.MayHaveNoName = 0;
- getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr);
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
break;
}
case CVDR_DEFRANGE_FRAMEPOINTER_REL: {
@@ -3916,7 +3930,7 @@ bool AsmParser::parseDirectiveCVDefRange() {
codeview::DefRangeFramePointerRelHeader DRHdr;
DRHdr.Offset = DROffset;
- getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr);
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
break;
}
case CVDR_DEFRANGE_SUBFIELD_REGISTER: {
@@ -3935,7 +3949,7 @@ bool AsmParser::parseDirectiveCVDefRange() {
DRHdr.Register = DRRegister;
DRHdr.MayHaveNoName = 0;
DRHdr.OffsetInParent = DROffsetInParent;
- getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr);
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
break;
}
case CVDR_DEFRANGE_REGISTER_REL: {
@@ -3960,7 +3974,7 @@ bool AsmParser::parseDirectiveCVDefRange() {
DRHdr.Register = DRRegister;
DRHdr.Flags = DRFlags;
DRHdr.BasePointerOffset = DRBasePointerOffset;
- getStreamer().EmitCVDefRangeDirective(Ranges, DRHdr);
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
break;
}
default:
@@ -3979,21 +3993,21 @@ bool AsmParser::parseDirectiveCVString() {
// Put the string in the table and emit the offset.
std::pair<StringRef, unsigned> Insertion =
getCVContext().addToStringTable(Data);
- getStreamer().EmitIntValue(Insertion.second, 4);
+ getStreamer().emitInt32(Insertion.second);
return false;
}
/// parseDirectiveCVStringTable
/// ::= .cv_stringtable
bool AsmParser::parseDirectiveCVStringTable() {
- getStreamer().EmitCVStringTableDirective();
+ getStreamer().emitCVStringTableDirective();
return false;
}
/// parseDirectiveCVFileChecksums
/// ::= .cv_filechecksums
bool AsmParser::parseDirectiveCVFileChecksums() {
- getStreamer().EmitCVFileChecksumsDirective();
+ getStreamer().emitCVFileChecksumsDirective();
return false;
}
@@ -4005,7 +4019,7 @@ bool AsmParser::parseDirectiveCVFileChecksumOffset() {
return true;
if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
return true;
- getStreamer().EmitCVFileChecksumOffsetDirective(FileNo);
+ getStreamer().emitCVFileChecksumOffsetDirective(FileNo);
return false;
}
@@ -4050,7 +4064,7 @@ bool AsmParser::parseDirectiveCFISections() {
Debug = true;
}
- getStreamer().EmitCFISections(EH, Debug);
+ getStreamer().emitCFISections(EH, Debug);
return false;
}
@@ -4070,14 +4084,14 @@ bool AsmParser::parseDirectiveCFIStartProc() {
// expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
// Tools like llvm-mc on the other hand are not affected by it, and report
// correct context information.
- getStreamer().EmitCFIStartProc(!Simple.empty(), Lexer.getLoc());
+ getStreamer().emitCFIStartProc(!Simple.empty(), Lexer.getLoc());
return false;
}
/// parseDirectiveCFIEndProc
/// ::= .cfi_endproc
bool AsmParser::parseDirectiveCFIEndProc() {
- getStreamer().EmitCFIEndProc();
+ getStreamer().emitCFIEndProc();
return false;
}
@@ -4105,7 +4119,7 @@ bool AsmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
parseAbsoluteExpression(Offset))
return true;
- getStreamer().EmitCFIDefCfa(Register, Offset);
+ getStreamer().emitCFIDefCfa(Register, Offset);
return false;
}
@@ -4116,7 +4130,7 @@ bool AsmParser::parseDirectiveCFIDefCfaOffset() {
if (parseAbsoluteExpression(Offset))
return true;
- getStreamer().EmitCFIDefCfaOffset(Offset);
+ getStreamer().emitCFIDefCfaOffset(Offset);
return false;
}
@@ -4129,14 +4143,14 @@ bool AsmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
return true;
- getStreamer().EmitCFIRegister(Register1, Register2);
+ getStreamer().emitCFIRegister(Register1, Register2);
return false;
}
/// parseDirectiveCFIWindowSave
/// ::= .cfi_window_save
bool AsmParser::parseDirectiveCFIWindowSave() {
- getStreamer().EmitCFIWindowSave();
+ getStreamer().emitCFIWindowSave();
return false;
}
@@ -4147,7 +4161,7 @@ bool AsmParser::parseDirectiveCFIAdjustCfaOffset() {
if (parseAbsoluteExpression(Adjustment))
return true;
- getStreamer().EmitCFIAdjustCfaOffset(Adjustment);
+ getStreamer().emitCFIAdjustCfaOffset(Adjustment);
return false;
}
@@ -4158,7 +4172,7 @@ bool AsmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
- getStreamer().EmitCFIDefCfaRegister(Register);
+ getStreamer().emitCFIDefCfaRegister(Register);
return false;
}
@@ -4173,7 +4187,7 @@ bool AsmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
parseAbsoluteExpression(Offset))
return true;
- getStreamer().EmitCFIOffset(Register, Offset);
+ getStreamer().emitCFIOffset(Register, Offset);
return false;
}
@@ -4187,7 +4201,7 @@ bool AsmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
parseAbsoluteExpression(Offset))
return true;
- getStreamer().EmitCFIRelOffset(Register, Offset);
+ getStreamer().emitCFIRelOffset(Register, Offset);
return false;
}
@@ -4233,23 +4247,23 @@ bool AsmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
if (IsPersonality)
- getStreamer().EmitCFIPersonality(Sym, Encoding);
+ getStreamer().emitCFIPersonality(Sym, Encoding);
else
- getStreamer().EmitCFILsda(Sym, Encoding);
+ getStreamer().emitCFILsda(Sym, Encoding);
return false;
}
/// parseDirectiveCFIRememberState
/// ::= .cfi_remember_state
bool AsmParser::parseDirectiveCFIRememberState() {
- getStreamer().EmitCFIRememberState();
+ getStreamer().emitCFIRememberState();
return false;
}
/// parseDirectiveCFIRestoreState
/// ::= .cfi_remember_state
bool AsmParser::parseDirectiveCFIRestoreState() {
- getStreamer().EmitCFIRestoreState();
+ getStreamer().emitCFIRestoreState();
return false;
}
@@ -4261,7 +4275,7 @@ bool AsmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
- getStreamer().EmitCFISameValue(Register);
+ getStreamer().emitCFISameValue(Register);
return false;
}
@@ -4272,7 +4286,7 @@ bool AsmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
- getStreamer().EmitCFIRestore(Register);
+ getStreamer().emitCFIRestore(Register);
return false;
}
@@ -4295,7 +4309,7 @@ bool AsmParser::parseDirectiveCFIEscape() {
Values.push_back((uint8_t)CurrValue);
}
- getStreamer().EmitCFIEscape(Values);
+ getStreamer().emitCFIEscape(Values);
return false;
}
@@ -4305,7 +4319,7 @@ bool AsmParser::parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc) {
int64_t Register = 0;
if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
- getStreamer().EmitCFIReturnColumn(Register);
+ getStreamer().emitCFIReturnColumn(Register);
return false;
}
@@ -4316,7 +4330,7 @@ bool AsmParser::parseDirectiveCFISignalFrame() {
"unexpected token in '.cfi_signal_frame'"))
return true;
- getStreamer().EmitCFISignalFrame();
+ getStreamer().emitCFISignalFrame();
return false;
}
@@ -4328,7 +4342,7 @@ bool AsmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
return true;
- getStreamer().EmitCFIUndefined(Register);
+ getStreamer().emitCFIUndefined(Register);
return false;
}
@@ -4368,9 +4382,9 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
while (getLexer().isNot(AsmToken::EndOfStatement)) {
if (!Parameters.empty() && Parameters.back().Vararg)
- return Error(Lexer.getLoc(),
- "Vararg parameter '" + Parameters.back().Name +
- "' should be last one in the list of parameters.");
+ return Error(Lexer.getLoc(), "vararg parameter '" +
+ Parameters.back().Name +
+ "' should be the last parameter");
MCAsmMacroParameter Parameter;
if (parseIdentifier(Parameter.Name))
@@ -4439,7 +4453,8 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
if (getLexer().is(AsmToken::Eof))
return Error(DirectiveLoc, "no matching '.endmacro' in definition");
- // Otherwise, check whether we have reach the .endmacro.
+  // Otherwise, check whether we have reached the .endmacro or the start of a
+ // preprocessor line marker.
if (getLexer().is(AsmToken::Identifier)) {
if (getTok().getIdentifier() == ".endm" ||
getTok().getIdentifier() == ".endmacro") {
@@ -4459,6 +4474,8 @@ bool AsmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
// macro is expanded so just ignore them for now.
++MacroDepth;
}
+ } else if (Lexer.is(AsmToken::HashDirective)) {
+ (void)parseCppHashLineFilenameComment(getLexer().getLoc());
}
// Otherwise, scan til the end of the statement.
@@ -4661,7 +4678,7 @@ bool AsmParser::parseDirectiveBundleAlignMode() {
// Because of AlignSizePow2's verified range we can safely truncate it to
// unsigned.
- getStreamer().EmitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
+ getStreamer().emitBundleAlignMode(static_cast<unsigned>(AlignSizePow2));
return false;
}
@@ -4686,7 +4703,7 @@ bool AsmParser::parseDirectiveBundleLock() {
AlignToEnd = true;
}
- getStreamer().EmitBundleLock(AlignToEnd);
+ getStreamer().emitBundleLock(AlignToEnd);
return false;
}
@@ -4698,7 +4715,7 @@ bool AsmParser::parseDirectiveBundleUnlock() {
"unexpected token in '.bundle_unlock' directive"))
return true;
- getStreamer().EmitBundleUnlock();
+ getStreamer().emitBundleUnlock();
return false;
}
@@ -4752,10 +4769,10 @@ bool AsmParser::parseDirectiveDCB(StringRef IDVal, unsigned Size) {
if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
return Error(ExprLoc, "literal value out of range for directive");
for (uint64_t i = 0, e = NumValues; i != e; ++i)
- getStreamer().EmitIntValue(IntValue, Size);
+ getStreamer().emitIntValue(IntValue, Size);
} else {
for (uint64_t i = 0, e = NumValues; i != e; ++i)
- getStreamer().EmitValue(Value, Size, ExprLoc);
+ getStreamer().emitValue(Value, Size, ExprLoc);
}
if (parseToken(AsmToken::EndOfStatement,
@@ -4791,7 +4808,7 @@ bool AsmParser::parseDirectiveRealDCB(StringRef IDVal, const fltSemantics &Seman
return true;
for (uint64_t i = 0, e = NumValues; i != e; ++i)
- getStreamer().EmitIntValue(AsInt.getLimitedValue(),
+ getStreamer().emitIntValue(AsInt.getLimitedValue(),
AsInt.getBitWidth() / 8);
return false;
@@ -4831,9 +4848,9 @@ bool AsmParser::parseDirectiveLEB128(bool Signed) {
if (parseExpression(Value))
return true;
if (Signed)
- getStreamer().EmitSLEB128Value(Value);
+ getStreamer().emitSLEB128Value(Value);
else
- getStreamer().EmitULEB128Value(Value);
+ getStreamer().emitULEB128Value(Value);
return false;
};
@@ -4857,7 +4874,7 @@ bool AsmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
if (Sym->isTemporary())
return Error(Loc, "non-local symbol required");
- if (!getStreamer().EmitSymbolAttribute(Sym, Attr))
+ if (!getStreamer().emitSymbolAttribute(Sym, Attr))
return Error(Loc, "unable to emit symbol attribute");
return false;
};
@@ -4934,11 +4951,11 @@ bool AsmParser::parseDirectiveComm(bool IsLocal) {
// Create the Symbol as a common or local common with Size and Pow2Alignment
if (IsLocal) {
- getStreamer().EmitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+ getStreamer().emitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
return false;
}
- getStreamer().EmitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+ getStreamer().emitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
return false;
}
@@ -5320,6 +5337,12 @@ bool AsmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
}
void AsmParser::initializeDirectiveKindMap() {
+ /* Lookup will be done with the directive
+ * converted to lower case, so all these
+ * keys should be lower case.
+ * (target specific directives are handled
+ * elsewhere)
+ */
DirectiveKindMap[".set"] = DK_SET;
DirectiveKindMap[".equ"] = DK_EQU;
DirectiveKindMap[".equiv"] = DK_EQUIV;
@@ -5720,7 +5743,7 @@ bool AsmParser::parseDirectivePrint(SMLoc DirectiveLoc) {
}
bool AsmParser::parseDirectiveAddrsig() {
- getStreamer().EmitAddrsig();
+ getStreamer().emitAddrsig();
return false;
}
@@ -5730,7 +5753,7 @@ bool AsmParser::parseDirectiveAddrsigSym() {
"expected identifier in '.addrsig_sym' directive"))
return true;
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- getStreamer().EmitAddrsigSym(Sym);
+ getStreamer().emitAddrsigSym(Sym);
return false;
}
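With the .lower() calls added above, both addAliasForDirective and the parseStatement lookup go through lower-cased keys, so directive matching becomes case-insensitive as long as initializeDirectiveKindMap stores lower-case keys (which its new comment now requires). A tiny self-contained illustration of that behaviour, using a local StringMap as a stand-in for the parser's private DirectiveKindMap:

#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include <cassert>

int main() {
  llvm::StringMap<int> DirectiveKindMap;
  DirectiveKindMap[".byte"] = 1;                 // keys are stored lower-case
  llvm::StringRef IDVal = ".BYTE";               // as written in the source
  assert(DirectiveKindMap.count(IDVal.lower())); // lookup is lower-cased too
  return 0;
}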
diff --git a/llvm/lib/MC/MCParser/COFFAsmParser.cpp b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
index 51bb1fe92b73..2104fb83b309 100644
--- a/llvm/lib/MC/MCParser/COFFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/COFFAsmParser.cpp
@@ -70,6 +70,7 @@ class COFFAsmParser : public MCAsmParserExtension {
addDirectiveHandler<&COFFAsmParser::ParseDirectiveLinkOnce>(".linkonce");
addDirectiveHandler<&COFFAsmParser::ParseDirectiveRVA>(".rva");
addDirectiveHandler<&COFFAsmParser::ParseDirectiveSymbolAttribute>(".weak");
+ addDirectiveHandler<&COFFAsmParser::ParseDirectiveCGProfile>(".cg_profile");
// Win64 EH directives.
addDirectiveHandler<&COFFAsmParser::ParseSEHDirectiveStartProc>(
@@ -125,6 +126,7 @@ class COFFAsmParser : public MCAsmParserExtension {
bool parseCOMDATType(COFF::COMDATType &Type);
bool ParseDirectiveLinkOnce(StringRef, SMLoc);
bool ParseDirectiveRVA(StringRef, SMLoc);
+ bool ParseDirectiveCGProfile(StringRef, SMLoc);
// Win64 EH directives.
bool ParseSEHDirectiveStartProc(StringRef, SMLoc);
@@ -284,7 +286,7 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- getStreamer().EmitSymbolAttribute(Sym, Attr);
+ getStreamer().emitSymbolAttribute(Sym, Attr);
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -299,6 +301,10 @@ bool COFFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
return false;
}
+bool COFFAsmParser::ParseDirectiveCGProfile(StringRef S, SMLoc Loc) {
+ return MCAsmParserExtension::ParseDirectiveCGProfile(S, Loc);
+}
+
bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
unsigned Characteristics,
SectionKind Kind) {
@@ -321,7 +327,7 @@ bool COFFAsmParser::ParseSectionSwitch(StringRef Section,
}
bool COFFAsmParser::ParseSectionName(StringRef &SectionName) {
- if (!getLexer().is(AsmToken::Identifier))
+ if (!getLexer().is(AsmToken::Identifier) && !getLexer().is(AsmToken::String))
return true;
SectionName = getTok().getIdentifier();
@@ -591,8 +597,8 @@ bool COFFAsmParser::ParseDirectiveLinkOnce(StringRef, SMLoc Loc) {
return Error(Loc, "cannot make section associative with .linkonce");
if (Current->getCharacteristics() & COFF::IMAGE_SCN_LNK_COMDAT)
- return Error(Loc, Twine("section '") + Current->getSectionName() +
- "' is already linkonce");
+ return Error(Loc, Twine("section '") + Current->getName() +
+ "' is already linkonce");
Current->setSelection(Type);
diff --git a/llvm/lib/MC/MCParser/COFFMasmParser.cpp b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
new file mode 100644
index 000000000000..b7c48e92961b
--- /dev/null
+++ b/llvm/lib/MC/MCParser/COFFMasmParser.cpp
@@ -0,0 +1,386 @@
+//===- COFFMasmParser.cpp - COFF MASM Assembly Parser ---------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/COFF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCAsmParserUtils.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSectionCOFF.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/SectionKind.h"
+#include "llvm/Support/SMLoc.h"
+#include <cassert>
+#include <cstdint>
+#include <limits>
+#include <utility>
+
+using namespace llvm;
+
+namespace {
+
+class COFFMasmParser : public MCAsmParserExtension {
+ template <bool (COFFMasmParser::*HandlerMethod)(StringRef, SMLoc)>
+ void addDirectiveHandler(StringRef Directive) {
+ MCAsmParser::ExtensionDirectiveHandler Handler =
+ std::make_pair(this, HandleDirective<COFFMasmParser, HandlerMethod>);
+ getParser().addDirectiveHandler(Directive, Handler);
+ }
+
+ bool ParseSectionSwitch(StringRef Section, unsigned Characteristics,
+ SectionKind Kind);
+
+ bool ParseSectionSwitch(StringRef Section, unsigned Characteristics,
+ SectionKind Kind, StringRef COMDATSymName,
+ COFF::COMDATType Type);
+
+ bool ParseDirectiveProc(StringRef, SMLoc);
+ bool ParseDirectiveEndProc(StringRef, SMLoc);
+ bool ParseDirectiveSegment(StringRef, SMLoc);
+ bool ParseDirectiveSegmentEnd(StringRef, SMLoc);
+ bool ParseDirectiveIncludelib(StringRef, SMLoc);
+
+ bool IgnoreDirective(StringRef, SMLoc) {
+ while (!getLexer().is(AsmToken::EndOfStatement)) {
+ Lex();
+ }
+ return false;
+ }
+
+ void Initialize(MCAsmParser &Parser) override {
+ // Call the base implementation.
+ MCAsmParserExtension::Initialize(Parser);
+
+ // x64 directives
+ // .allocstack
+ // .endprolog
+ // .pushframe
+ // .pushreg
+ // .savereg
+ // .savexmm128
+ // .setframe
+
+ // Code label directives
+ // label
+ // org
+
+ // Conditional control flow directives
+ // .break
+ // .continue
+ // .else
+ // .elseif
+ // .endif
+ // .endw
+ // .if
+ // .repeat
+ // .until
+ // .untilcxz
+ // .while
+
+ // Data allocation directives
+ // align
+ // byte/sbyte
+ // dword/sdword
+ // even
+ // fword
+ // qword
+ // real4
+ // real8
+ // real10
+ // tbyte
+ // word/sword
+
+ // Listing control directives
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".cref");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".list");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".listall");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".listif");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".listmacro");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".listmacroall");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".nocref");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".nolist");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".nolistif");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".nolistmacro");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>("page");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>("subtitle");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".tfcond");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>("title");
+
+ // Macro directives
+ // endm
+ // exitm
+ // goto
+ // local
+ // macro
+ // purge
+
+ // Miscellaneous directives
+ // alias
+ // assume
+ // .fpo
+ addDirectiveHandler<&COFFMasmParser::ParseDirectiveIncludelib>(
+ "includelib");
+ // mmword
+ // option
+ // popcontext
+ // pushcontext
+ // .radix
+ // .safeseh
+ // xmmword
+ // ymmword
+
+ // Procedure directives
+ addDirectiveHandler<&COFFMasmParser::ParseDirectiveEndProc>("endp");
+ // invoke (32-bit only)
+ addDirectiveHandler<&COFFMasmParser::ParseDirectiveProc>("proc");
+ // proto
+
+ // Processor directives
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".386");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".386P");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".387");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".486");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".486P");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".586");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".586P");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".686");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".686P");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".k3d");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".mmx");
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".xmm");
+
+ // Repeat blocks directives
+ // for
+ // forc
+ // goto
+ // repeat
+ // while
+
+ // Scope directives
+ // comm
+ // externdef
+
+ // Segment directives
+ // .alpha (32-bit only, order segments alphabetically)
+ // .dosseg (32-bit only, order segments in DOS convention)
+ // .seq (32-bit only, order segments sequentially)
+ addDirectiveHandler<&COFFMasmParser::ParseDirectiveSegmentEnd>("ends");
+ // group (32-bit only)
+ addDirectiveHandler<&COFFMasmParser::ParseDirectiveSegment>("segment");
+
+ // Simplified segment directives
+ addDirectiveHandler<&COFFMasmParser::ParseSectionDirectiveCode>(".code");
+ // .const
+ addDirectiveHandler<
+ &COFFMasmParser::ParseSectionDirectiveInitializedData>(".data");
+ addDirectiveHandler<
+ &COFFMasmParser::ParseSectionDirectiveUninitializedData>(".data?");
+ // .exit
+ // .fardata
+ // .fardata?
+ addDirectiveHandler<&COFFMasmParser::IgnoreDirective>(".model");
+ // .stack
+ // .startup
+
+ // String directives, written <name> <directive> <params>
+ // catstr (equivalent to <name> TEXTEQU <params>)
+ // instr (equivalent to <name> = @InStr(<params>))
+ // sizestr (equivalent to <name> = @SizeStr(<params>))
+ // substr (equivalent to <name> TEXTEQU @SubStr(<params>))
+
+ // Structure and record directives
+ // ends
+ // record
+ // struct
+ // typedef
+ // union
+ }
+
+ bool ParseSectionDirectiveCode(StringRef, SMLoc) {
+ return ParseSectionSwitch(".text",
+ COFF::IMAGE_SCN_CNT_CODE
+ | COFF::IMAGE_SCN_MEM_EXECUTE
+ | COFF::IMAGE_SCN_MEM_READ,
+ SectionKind::getText());
+ }
+
+ bool ParseSectionDirectiveInitializedData(StringRef, SMLoc) {
+ return ParseSectionSwitch(".data",
+ COFF::IMAGE_SCN_CNT_INITIALIZED_DATA
+ | COFF::IMAGE_SCN_MEM_READ
+ | COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getData());
+ }
+
+ bool ParseSectionDirectiveUninitializedData(StringRef, SMLoc) {
+ return ParseSectionSwitch(".bss",
+ COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA
+ | COFF::IMAGE_SCN_MEM_READ
+ | COFF::IMAGE_SCN_MEM_WRITE,
+ SectionKind::getBSS());
+ }
+
+ StringRef CurrentProcedure;
+
+public:
+ COFFMasmParser() = default;
+};
+
+} // end anonymous namespace.
+
+static SectionKind computeSectionKind(unsigned Flags) {
+ if (Flags & COFF::IMAGE_SCN_MEM_EXECUTE)
+ return SectionKind::getText();
+ if (Flags & COFF::IMAGE_SCN_MEM_READ &&
+ (Flags & COFF::IMAGE_SCN_MEM_WRITE) == 0)
+ return SectionKind::getReadOnly();
+ return SectionKind::getData();
+}
+
+bool COFFMasmParser::ParseSectionSwitch(StringRef Section,
+ unsigned Characteristics,
+ SectionKind Kind) {
+ return ParseSectionSwitch(Section, Characteristics, Kind, "",
+ (COFF::COMDATType)0);
+}
+
+bool COFFMasmParser::ParseSectionSwitch(StringRef Section,
+ unsigned Characteristics,
+ SectionKind Kind,
+ StringRef COMDATSymName,
+ COFF::COMDATType Type) {
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in section switching directive");
+ Lex();
+
+ getStreamer().SwitchSection(getContext().getCOFFSection(
+ Section, Characteristics, Kind, COMDATSymName, Type));
+
+ return false;
+}
+
+bool COFFMasmParser::ParseDirectiveSegment(StringRef Directive, SMLoc Loc) {
+ StringRef SegmentName;
+ if (!getLexer().is(AsmToken::Identifier))
+ return TokError("expected identifier in directive");
+ SegmentName = getTok().getIdentifier();
+ Lex();
+
+ StringRef SectionName = SegmentName;
+ SmallVector<char, 247> SectionNameVector;
+ unsigned Flags = COFF::IMAGE_SCN_CNT_INITIALIZED_DATA |
+ COFF::IMAGE_SCN_MEM_READ | COFF::IMAGE_SCN_MEM_WRITE;
+ if (SegmentName == "_TEXT" || SegmentName.startswith("_TEXT$")) {
+ if (SegmentName.size() == 5) {
+ SectionName = ".text";
+ } else {
+ SectionName =
+ (".text$" + SegmentName.substr(6)).toStringRef(SectionNameVector);
+ }
+ Flags = COFF::IMAGE_SCN_CNT_CODE | COFF::IMAGE_SCN_MEM_EXECUTE |
+ COFF::IMAGE_SCN_MEM_READ;
+ }
+ SectionKind Kind = computeSectionKind(Flags);
+ getStreamer().SwitchSection(getContext().getCOFFSection(
+ SectionName, Flags, Kind, "", (COFF::COMDATType)(0)));
+ return false;
+}
+
+/// ParseDirectiveSegmentEnd
+/// ::= identifier "ends"
+bool COFFMasmParser::ParseDirectiveSegmentEnd(StringRef Directive, SMLoc Loc) {
+ StringRef SegmentName;
+ if (!getLexer().is(AsmToken::Identifier))
+ return TokError("expected identifier in directive");
+ SegmentName = getTok().getIdentifier();
+
+ // Ignore; no action necessary.
+ Lex();
+ return false;
+}
+
+/// ParseDirectiveIncludelib
+/// ::= "includelib" identifier
+bool COFFMasmParser::ParseDirectiveIncludelib(StringRef Directive, SMLoc Loc) {
+ StringRef Lib;
+ if (getParser().parseIdentifier(Lib))
+ return TokError("expected identifier in includelib directive");
+
+ unsigned Flags = COFF::IMAGE_SCN_MEM_PRELOAD | COFF::IMAGE_SCN_MEM_16BIT;
+ SectionKind Kind = computeSectionKind(Flags);
+ getStreamer().PushSection();
+ getStreamer().SwitchSection(getContext().getCOFFSection(
+ ".drectve", Flags, Kind, "", (COFF::COMDATType)(0)));
+ getStreamer().emitBytes("/DEFAULTLIB:");
+ getStreamer().emitBytes(Lib);
+ getStreamer().emitBytes(" ");
+ getStreamer().PopSection();
+ return false;
+}
+
+/// ParseDirectiveProc
+/// TODO(epastor): Implement parameters and other attributes.
+/// ::= label "proc" [[distance]]
+/// statements
+/// label "endp"
+bool COFFMasmParser::ParseDirectiveProc(StringRef Directive, SMLoc Loc) {
+ StringRef Label;
+ if (getParser().parseIdentifier(Label))
+ return Error(Loc, "expected identifier for procedure");
+ if (getLexer().is(AsmToken::Identifier)) {
+ StringRef nextVal = getTok().getString();
+ SMLoc nextLoc = getTok().getLoc();
+ if (nextVal.equals_lower("far")) {
+ // TODO(epastor): Handle far procedure definitions.
+ Lex();
+ return Error(nextLoc, "far procedure definitions not yet supported");
+ } else if (nextVal.equals_lower("near")) {
+ Lex();
+ nextVal = getTok().getString();
+ nextLoc = getTok().getLoc();
+ }
+ }
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Label);
+
+ // Define symbol as simple function
+ getStreamer().BeginCOFFSymbolDef(Sym);
+ getStreamer().EmitCOFFSymbolStorageClass(2);
+ getStreamer().EmitCOFFSymbolType(0x20);
+ getStreamer().EndCOFFSymbolDef();
+
+ getStreamer().emitLabel(Sym, Loc);
+ CurrentProcedure = Label;
+ return false;
+}
+bool COFFMasmParser::ParseDirectiveEndProc(StringRef Directive, SMLoc Loc) {
+ StringRef Label;
+ SMLoc LabelLoc = getTok().getLoc();
+ if (getParser().parseIdentifier(Label))
+ return Error(LabelLoc, "expected identifier for procedure end");
+
+ if (CurrentProcedure.empty())
+ return Error(Loc, "endp outside of procedure block");
+ else if (CurrentProcedure != Label)
+ return Error(LabelLoc, "endp does not match current procedure '" +
+ CurrentProcedure + "'");
+ return false;
+}
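+// For illustration, a minimal procedure accepted by the proc/endp handlers
+// above (the label name is arbitrary):
+//   main proc
+//   ret
+//   main endp
+// A missing or mismatched label on "endp" is diagnosed as an error.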
+
+namespace llvm {
+
+MCAsmParserExtension *createCOFFMasmParser() { return new COFFMasmParser; }
+
+} // end namespace llvm
diff --git a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
index bd66e5f39c0d..b670355a392b 100644
--- a/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/DarwinAsmParser.cpp
@@ -492,7 +492,7 @@ bool DarwinAsmParser::parseSectionSwitch(StringRef Segment, StringRef Section,
// is no good reason for someone to intentionally emit incorrectly sized
// values into the implicitly aligned sections.
if (Align)
- getStreamer().EmitValueToAlignment(Align);
+ getStreamer().emitValueToAlignment(Align);
return false;
}
@@ -510,7 +510,7 @@ bool DarwinAsmParser::parseDirectiveAltEntry(StringRef, SMLoc) {
if (Sym->isDefined())
return TokError(".alt_entry must preceed symbol definition");
- if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_AltEntry))
+ if (!getStreamer().emitSymbolAttribute(Sym, MCSA_AltEntry))
return TokError("unable to emit symbol attribute");
Lex();
@@ -541,7 +541,7 @@ bool DarwinAsmParser::parseDirectiveDesc(StringRef, SMLoc) {
Lex();
// Set the n_desc field of this Symbol to this DescValue
- getStreamer().EmitSymbolDesc(Sym, DescValue);
+ getStreamer().emitSymbolDesc(Sym, DescValue);
return false;
}
@@ -569,7 +569,7 @@ bool DarwinAsmParser::parseDirectiveIndirectSymbol(StringRef, SMLoc Loc) {
if (Sym->isTemporary())
return TokError("non-local symbol required in directive");
- if (!getStreamer().EmitSymbolAttribute(Sym, MCSA_IndirectSymbol))
+ if (!getStreamer().emitSymbolAttribute(Sym, MCSA_IndirectSymbol))
return TokError("unable to emit indirect symbol attribute for: " + Name);
if (getLexer().isNot(AsmToken::EndOfStatement))
@@ -625,7 +625,7 @@ bool DarwinAsmParser::parseDirectiveLinkerOption(StringRef IDVal, SMLoc) {
Lex();
}
- getStreamer().EmitLinkerOptions(Args);
+ getStreamer().emitLinkerOptions(Args);
return false;
}
@@ -672,7 +672,7 @@ bool DarwinAsmParser::parseDirectiveSection(StringRef, SMLoc) {
if (!getLexer().is(AsmToken::Comma))
return TokError("unexpected token in '.section' directive");
- std::string SectionSpec = SectionName;
+ std::string SectionSpec = std::string(SectionName);
SectionSpec += ",";
// Add all the tokens until the end of the line, ParseSectionSpecifier will
@@ -819,7 +819,7 @@ bool DarwinAsmParser::parseDirectiveSubsectionsViaSymbols(StringRef, SMLoc) {
Lex();
- getStreamer().EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ getStreamer().emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
return false;
}
@@ -870,11 +870,11 @@ bool DarwinAsmParser::parseDirectiveTBSS(StringRef, SMLoc) {
if (!Sym->isUndefined())
return Error(IDLoc, "invalid symbol redefinition");
- getStreamer().EmitTBSSSymbol(getContext().getMachOSection(
- "__DATA", "__thread_bss",
- MachO::S_THREAD_LOCAL_ZEROFILL,
- 0, SectionKind::getThreadBSS()),
- Sym, Size, 1 << Pow2Alignment);
+ getStreamer().emitTBSSSymbol(
+ getContext().getMachOSection("__DATA", "__thread_bss",
+ MachO::S_THREAD_LOCAL_ZEROFILL, 0,
+ SectionKind::getThreadBSS()),
+ Sym, Size, 1 << Pow2Alignment);
return false;
}
@@ -901,7 +901,7 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) {
// the section but with no symbol.
if (getLexer().is(AsmToken::EndOfStatement)) {
// Create the zerofill section but no symbol
- getStreamer().EmitZerofill(
+ getStreamer().emitZerofill(
getContext().getMachOSection(Segment, Section, MachO::S_ZEROFILL, 0,
SectionKind::getBSS()),
/*Symbol=*/nullptr, /*Size=*/0, /*ByteAlignment=*/0, SectionLoc);
@@ -960,7 +960,7 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) {
// Create the zerofill Symbol with Size and Pow2Alignment
//
// FIXME: Arch specific.
- getStreamer().EmitZerofill(getContext().getMachOSection(
+ getStreamer().emitZerofill(getContext().getMachOSection(
Segment, Section, MachO::S_ZEROFILL,
0, SectionKind::getBSS()),
Sym, Size, 1 << Pow2Alignment, SectionLoc);
@@ -973,7 +973,7 @@ bool DarwinAsmParser::parseDirectiveZerofill(StringRef, SMLoc) {
bool DarwinAsmParser::parseDirectiveDataRegion(StringRef, SMLoc) {
if (getLexer().is(AsmToken::EndOfStatement)) {
Lex();
- getStreamer().EmitDataRegion(MCDR_DataRegion);
+ getStreamer().emitDataRegion(MCDR_DataRegion);
return false;
}
StringRef RegionType;
@@ -989,7 +989,7 @@ bool DarwinAsmParser::parseDirectiveDataRegion(StringRef, SMLoc) {
return Error(Loc, "unknown region type in '.data_region' directive");
Lex();
- getStreamer().EmitDataRegion((MCDataRegionType)Kind);
+ getStreamer().emitDataRegion((MCDataRegionType)Kind);
return false;
}
@@ -1000,7 +1000,7 @@ bool DarwinAsmParser::parseDirectiveDataRegionEnd(StringRef, SMLoc) {
return TokError("unexpected token in '.end_data_region' directive");
Lex();
- getStreamer().EmitDataRegion(MCDR_DataRegionEnd);
+ getStreamer().emitDataRegion(MCDR_DataRegionEnd);
return false;
}
@@ -1137,7 +1137,7 @@ bool DarwinAsmParser::parseVersionMin(StringRef Directive, SMLoc Loc,
Triple::OSType ExpectedOS = getOSTypeFromMCVM(Type);
checkVersion(Directive, StringRef(), Loc, ExpectedOS);
- getStreamer().EmitVersionMin(Type, Major, Minor, Update, SDKVersion);
+ getStreamer().emitVersionMin(Type, Major, Minor, Update, SDKVersion);
return false;
}
@@ -1194,7 +1194,7 @@ bool DarwinAsmParser::parseBuildVersion(StringRef Directive, SMLoc Loc) {
Triple::OSType ExpectedOS
= getOSTypeFromPlatform((MachO::PlatformType)Platform);
checkVersion(Directive, PlatformName, Loc, ExpectedOS);
- getStreamer().EmitBuildVersion(Platform, Major, Minor, Update, SDKVersion);
+ getStreamer().emitBuildVersion(Platform, Major, Minor, Update, SDKVersion);
return false;
}
diff --git a/llvm/lib/MC/MCParser/ELFAsmParser.cpp b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
index a55bdd5364cb..e5ab13bc719d 100644
--- a/llvm/lib/MC/MCParser/ELFAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/ELFAsmParser.cpp
@@ -158,7 +158,7 @@ private:
bool maybeParseSectionType(StringRef &TypeName);
bool parseMergeSize(int64_t &Size);
bool parseGroup(StringRef &GroupName);
- bool parseMetadataSym(MCSymbolELF *&Associated);
+ bool parseLinkedToSym(MCSymbolELF *&LinkedToSym);
bool maybeParseUniqueID(int64_t &UniqueID);
};
@@ -184,7 +184,7 @@ bool ELFAsmParser::ParseDirectiveSymbolAttribute(StringRef Directive, SMLoc) {
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- getStreamer().EmitSymbolAttribute(Sym, Attr);
+ getStreamer().emitSymbolAttribute(Sym, Attr);
if (getLexer().is(AsmToken::EndOfStatement))
break;
@@ -443,17 +443,18 @@ bool ELFAsmParser::parseGroup(StringRef &GroupName) {
return false;
}
-bool ELFAsmParser::parseMetadataSym(MCSymbolELF *&Associated) {
+bool ELFAsmParser::parseLinkedToSym(MCSymbolELF *&LinkedToSym) {
MCAsmLexer &L = getLexer();
if (L.isNot(AsmToken::Comma))
- return TokError("expected metadata symbol");
+ return TokError("expected linked-to symbol");
Lex();
StringRef Name;
+ SMLoc StartLoc = L.getLoc();
if (getParser().parseIdentifier(Name))
- return TokError("invalid metadata symbol");
- Associated = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
- if (!Associated || !Associated->isInSection())
- return TokError("symbol is not in a section: " + Name);
+ return TokError("invalid linked-to symbol");
+ LinkedToSym = dyn_cast_or_null<MCSymbolELF>(getContext().lookupSymbol(Name));
+ if (!LinkedToSym || !LinkedToSym->isInSection())
+ return Error(StartLoc, "linked-to symbol is not in a section: " + Name);
return false;
}
@@ -495,7 +496,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
unsigned Flags = 0;
const MCExpr *Subsection = nullptr;
bool UseLastGroup = false;
- MCSymbolELF *Associated = nullptr;
+ MCSymbolELF *LinkedToSym = nullptr;
int64_t UniqueID = ~0;
// Set the defaults first.
@@ -568,7 +569,7 @@ bool ELFAsmParser::ParseSectionArguments(bool IsPush, SMLoc loc) {
if (parseGroup(GroupName))
return true;
if (Flags & ELF::SHF_LINK_ORDER)
- if (parseMetadataSym(Associated))
+ if (parseLinkedToSym(LinkedToSym))
return true;
if (maybeParseUniqueID(UniqueID))
return true;
@@ -633,21 +634,33 @@ EndStmt:
}
}
- MCSection *ELFSection =
- getContext().getELFSection(SectionName, Type, Flags, Size, GroupName,
- UniqueID, Associated);
- getStreamer().SwitchSection(ELFSection, Subsection);
+ MCSectionELF *Section = getContext().getELFSection(
+ SectionName, Type, Flags, Size, GroupName, UniqueID, LinkedToSym);
+ getStreamer().SwitchSection(Section, Subsection);
+ // x86-64 psABI names SHT_X86_64_UNWIND as the canonical type for .eh_frame,
+ // but GNU as emits SHT_PROGBITS .eh_frame for .cfi_* directives. Don't error
+ // for SHT_PROGBITS .eh_frame
+ if (Section->getType() != Type &&
+ !(SectionName == ".eh_frame" && Type == ELF::SHT_PROGBITS))
+ Error(loc, "changed section type for " + SectionName + ", expected: 0x" +
+ utohexstr(Section->getType()));
+ if (Section->getFlags() != Flags)
+ Error(loc, "changed section flags for " + SectionName + ", expected: 0x" +
+ utohexstr(Section->getFlags()));
+ if (Section->getEntrySize() != Size)
+ Error(loc, "changed section entsize for " + SectionName +
+ ", expected: " + Twine(Section->getEntrySize()));
if (getContext().getGenDwarfForAssembly()) {
- bool InsertResult = getContext().addGenDwarfSection(ELFSection);
+ bool InsertResult = getContext().addGenDwarfSection(Section);
if (InsertResult) {
if (getContext().getDwarfVersion() <= 2)
Warning(loc, "DWARF2 only supports one section per compilation unit");
- if (!ELFSection->getBeginSymbol()) {
+ if (!Section->getBeginSymbol()) {
MCSymbol *SectionStartSymbol = getContext().createTempSymbol();
- getStreamer().EmitLabel(SectionStartSymbol);
- ELFSection->setBeginSymbol(SectionStartSymbol);
+ getStreamer().emitLabel(SectionStartSymbol);
+ Section->setBeginSymbol(SectionStartSymbol);
}
}
}
@@ -729,7 +742,7 @@ bool ELFAsmParser::ParseDirectiveType(StringRef, SMLoc) {
return TokError("unexpected token in '.type' directive");
Lex();
- getStreamer().EmitSymbolAttribute(Sym, Attr);
+ getStreamer().emitSymbolAttribute(Sym, Attr);
return false;
}
@@ -748,7 +761,7 @@ bool ELFAsmParser::ParseDirectiveIdent(StringRef, SMLoc) {
return TokError("unexpected token in '.ident' directive");
Lex();
- getStreamer().EmitIdent(Data);
+ getStreamer().emitIdent(Data);
return false;
}
@@ -797,12 +810,12 @@ bool ELFAsmParser::ParseDirectiveVersion(StringRef, SMLoc) {
getStreamer().PushSection();
getStreamer().SwitchSection(Note);
- getStreamer().EmitIntValue(Data.size()+1, 4); // namesz.
- getStreamer().EmitIntValue(0, 4); // descsz = 0 (no description).
- getStreamer().EmitIntValue(1, 4); // type = NT_VERSION.
- getStreamer().EmitBytes(Data); // name.
- getStreamer().EmitIntValue(0, 1); // terminate the string.
- getStreamer().EmitValueToAlignment(4); // ensure 4 byte alignment.
+ getStreamer().emitInt32(Data.size() + 1); // namesz
+ getStreamer().emitInt32(0); // descsz = 0 (no description).
+ getStreamer().emitInt32(1); // type = NT_VERSION
+ getStreamer().emitBytes(Data); // name
+ getStreamer().emitInt8(0); // NUL
+ getStreamer().emitValueToAlignment(4);
getStreamer().PopSection();
return false;
}
@@ -829,7 +842,7 @@ bool ELFAsmParser::ParseDirectiveWeakref(StringRef, SMLoc) {
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- getStreamer().EmitWeakReference(Alias, Sym);
+ getStreamer().emitWeakReference(Alias, Sym);
return false;
}
@@ -849,45 +862,8 @@ bool ELFAsmParser::ParseDirectiveSubsection(StringRef, SMLoc) {
return false;
}
-/// ParseDirectiveCGProfile
-/// ::= .cg_profile identifier, identifier, <number>
-bool ELFAsmParser::ParseDirectiveCGProfile(StringRef, SMLoc) {
- StringRef From;
- SMLoc FromLoc = getLexer().getLoc();
- if (getParser().parseIdentifier(From))
- return TokError("expected identifier in directive");
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected a comma");
- Lex();
-
- StringRef To;
- SMLoc ToLoc = getLexer().getLoc();
- if (getParser().parseIdentifier(To))
- return TokError("expected identifier in directive");
-
- if (getLexer().isNot(AsmToken::Comma))
- return TokError("expected a comma");
- Lex();
-
- int64_t Count;
- if (getParser().parseIntToken(
- Count, "expected integer count in '.cg_profile' directive"))
- return true;
-
- if (getLexer().isNot(AsmToken::EndOfStatement))
- return TokError("unexpected token in directive");
-
- MCSymbol *FromSym = getContext().getOrCreateSymbol(From);
- MCSymbol *ToSym = getContext().getOrCreateSymbol(To);
-
- getStreamer().emitCGProfileEntry(
- MCSymbolRefExpr::create(FromSym, MCSymbolRefExpr::VK_None, getContext(),
- FromLoc),
- MCSymbolRefExpr::create(ToSym, MCSymbolRefExpr::VK_None, getContext(),
- ToLoc),
- Count);
- return false;
+bool ELFAsmParser::ParseDirectiveCGProfile(StringRef S, SMLoc Loc) {
+ return MCAsmParserExtension::ParseDirectiveCGProfile(S, Loc);
}
namespace llvm {
diff --git a/llvm/lib/MC/MCParser/MCAsmParser.cpp b/llvm/lib/MC/MCParser/MCAsmParser.cpp
index 41a1ee555d6f..c2fa7be56ad2 100644
--- a/llvm/lib/MC/MCParser/MCAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/MCAsmParser.cpp
@@ -13,6 +13,7 @@
#include "llvm/MC/MCParser/MCAsmLexer.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/SMLoc.h"
#include "llvm/Support/raw_ostream.h"
@@ -20,6 +21,10 @@
using namespace llvm;
+cl::opt<unsigned> AsmMacroMaxNestingDepth(
+ "asm-macro-max-nesting-depth", cl::init(20), cl::Hidden,
+ cl::desc("The maximum nesting depth allowed for assembly macros."));
+
MCAsmParser::MCAsmParser() {}
MCAsmParser::~MCAsmParser() = default;
diff --git a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
index 18d18f0cf6ed..0b5046cd8fad 100644
--- a/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
+++ b/llvm/lib/MC/MCParser/MCAsmParserExtension.cpp
@@ -7,6 +7,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCStreamer.h"
using namespace llvm;
@@ -17,3 +19,44 @@ MCAsmParserExtension::~MCAsmParserExtension() = default;
void MCAsmParserExtension::Initialize(MCAsmParser &Parser) {
this->Parser = &Parser;
}
+
+/// ParseDirectiveCGProfile
+/// ::= .cg_profile identifier, identifier, <number>
+bool MCAsmParserExtension::ParseDirectiveCGProfile(StringRef, SMLoc) {
+ StringRef From;
+ SMLoc FromLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(From))
+ return TokError("expected identifier in directive");
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected a comma");
+ Lex();
+
+ StringRef To;
+ SMLoc ToLoc = getLexer().getLoc();
+ if (getParser().parseIdentifier(To))
+ return TokError("expected identifier in directive");
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("expected a comma");
+ Lex();
+
+ int64_t Count;
+ if (getParser().parseIntToken(
+ Count, "expected integer count in '.cg_profile' directive"))
+ return true;
+
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in directive");
+
+ MCSymbol *FromSym = getContext().getOrCreateSymbol(From);
+ MCSymbol *ToSym = getContext().getOrCreateSymbol(To);
+
+ getStreamer().emitCGProfileEntry(
+ MCSymbolRefExpr::create(FromSym, MCSymbolRefExpr::VK_None, getContext(),
+ FromLoc),
+ MCSymbolRefExpr::create(ToSym, MCSymbolRefExpr::VK_None, getContext(),
+ ToLoc),
+ Count);
+ return false;
+}
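+// For example (symbol names are illustrative), the directive
+//   .cg_profile _foo, _bar, 32
+// emits a call-graph profile entry from _foo to _bar with count 32.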
diff --git a/llvm/lib/MC/MCParser/MasmParser.cpp b/llvm/lib/MC/MCParser/MasmParser.cpp
new file mode 100644
index 000000000000..58c22b2ccef2
--- /dev/null
+++ b/llvm/lib/MC/MCParser/MasmParser.cpp
@@ -0,0 +1,6876 @@
+//===- MasmParser.cpp - Parser for Assembly Files -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class implements the parser for MASM assembly files.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/APFloat.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/BinaryFormat/Dwarf.h"
+#include "llvm/DebugInfo/CodeView/SymbolRecord.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCCodeView.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDirectives.h"
+#include "llvm/MC/MCDwarf.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInstPrinter.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectFileInfo.h"
+#include "llvm/MC/MCParser/AsmCond.h"
+#include "llvm/MC/MCParser/AsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCAsmParserExtension.h"
+#include "llvm/MC/MCParser/MCAsmParserUtils.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSection.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <cassert>
+#include <cctype>
+#include <climits>
+#include <cstddef>
+#include <cstdint>
+#include <deque>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+using namespace llvm;
+
+extern cl::opt<unsigned> AsmMacroMaxNestingDepth;
+
+namespace {
+
+/// Helper types for tracking macro definitions.
+typedef std::vector<AsmToken> MCAsmMacroArgument;
+typedef std::vector<MCAsmMacroArgument> MCAsmMacroArguments;
+
+/// Helper class for storing information about an active macro instantiation.
+struct MacroInstantiation {
+ /// The location of the instantiation.
+ SMLoc InstantiationLoc;
+
+ /// The buffer where parsing should resume upon instantiation completion.
+ unsigned ExitBuffer;
+
+ /// The location where parsing should resume upon instantiation completion.
+ SMLoc ExitLoc;
+
+ /// The depth of TheCondStack at the start of the instantiation.
+ size_t CondStackDepth;
+};
+
+struct ParseStatementInfo {
+ /// The parsed operands from the last parsed statement.
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 8> ParsedOperands;
+
+ /// The opcode from the last parsed instruction.
+ unsigned Opcode = ~0U;
+
+ /// Was there an error parsing the inline assembly?
+ bool ParseError = false;
+
+ SmallVectorImpl<AsmRewrite> *AsmRewrites = nullptr;
+
+ ParseStatementInfo() = delete;
+ ParseStatementInfo(SmallVectorImpl<AsmRewrite> *rewrites)
+ : AsmRewrites(rewrites) {}
+};
+
+enum FieldType {
+ FT_INTEGRAL, // Initializer: integer expression, stored as an MCExpr.
+ FT_REAL, // Initializer: real number, stored as an APInt.
+ FT_STRUCT // Initializer: struct initializer, stored recursively.
+};
+
+struct FieldInfo;
+struct StructInfo {
+ StringRef Name;
+ bool IsUnion = false;
+ size_t Alignment = 0;
+ size_t Size = 0;
+ std::vector<FieldInfo> Fields;
+ StringMap<size_t> FieldsByName;
+
+ FieldInfo &addField(StringRef FieldName, FieldType FT);
+
+ StructInfo() = default;
+
+ StructInfo(StringRef StructName, bool Union, unsigned AlignmentValue)
+ : Name(StructName), IsUnion(Union), Alignment(AlignmentValue) {}
+};
+
+// FIXME: This should probably use a class hierarchy, raw pointers between the
+// objects, and dynamic type resolution instead of a union. On the other hand,
+// ownership then becomes much more complicated; the obvious thing would be to
+// use BumpPtrAllocator, but the lack of a destructor makes that messy.
+
+struct StructInitializer;
+struct IntFieldInfo {
+ SmallVector<const MCExpr *, 1> Values;
+
+ IntFieldInfo() = default;
+ IntFieldInfo(const SmallVector<const MCExpr *, 1> &V) { Values = V; }
+ IntFieldInfo(SmallVector<const MCExpr *, 1> &&V) { Values = V; }
+};
+struct RealFieldInfo {
+ SmallVector<APInt, 1> AsIntValues;
+
+ RealFieldInfo() = default;
+ RealFieldInfo(const SmallVector<APInt, 1> &V) { AsIntValues = V; }
+ RealFieldInfo(SmallVector<APInt, 1> &&V) { AsIntValues = V; }
+};
+struct StructFieldInfo {
+ std::vector<StructInitializer> Initializers;
+ StructInfo Structure;
+
+ StructFieldInfo() = default;
+ StructFieldInfo(const std::vector<StructInitializer> &V, StructInfo S) {
+ Initializers = V;
+ Structure = S;
+ }
+ StructFieldInfo(std::vector<StructInitializer> &&V, StructInfo S) {
+ Initializers = V;
+ Structure = S;
+ }
+};
+
+class FieldInitializer {
+public:
+ FieldType FT;
+ union {
+ IntFieldInfo IntInfo;
+ RealFieldInfo RealInfo;
+ StructFieldInfo StructInfo;
+ };
+
+ ~FieldInitializer() {
+ switch (FT) {
+ case FT_INTEGRAL:
+ IntInfo.~IntFieldInfo();
+ break;
+ case FT_REAL:
+ RealInfo.~RealFieldInfo();
+ break;
+ case FT_STRUCT:
+ StructInfo.~StructFieldInfo();
+ break;
+ }
+ }
+
+ FieldInitializer(FieldType FT) : FT(FT) {
+ switch (FT) {
+ case FT_INTEGRAL:
+ new (&IntInfo) IntFieldInfo();
+ break;
+ case FT_REAL:
+ new (&RealInfo) RealFieldInfo();
+ break;
+ case FT_STRUCT:
+ new (&StructInfo) StructFieldInfo();
+ break;
+ }
+ }
+
+ FieldInitializer(SmallVector<const MCExpr *, 1> &&Values) : FT(FT_INTEGRAL) {
+ new (&IntInfo) IntFieldInfo(Values);
+ }
+
+ FieldInitializer(SmallVector<APInt, 1> &&AsIntValues) : FT(FT_REAL) {
+ new (&RealInfo) RealFieldInfo(AsIntValues);
+ }
+
+ FieldInitializer(std::vector<StructInitializer> &&Initializers,
+ struct StructInfo Structure)
+ : FT(FT_STRUCT) {
+ new (&StructInfo) StructFieldInfo(Initializers, Structure);
+ }
+
+ FieldInitializer(const FieldInitializer &Initializer) : FT(Initializer.FT) {
+ switch (FT) {
+ case FT_INTEGRAL:
+ new (&IntInfo) IntFieldInfo(Initializer.IntInfo);
+ break;
+ case FT_REAL:
+ new (&RealInfo) RealFieldInfo(Initializer.RealInfo);
+ break;
+ case FT_STRUCT:
+ new (&StructInfo) StructFieldInfo(Initializer.StructInfo);
+ break;
+ }
+ }
+
+ FieldInitializer(FieldInitializer &&Initializer) : FT(Initializer.FT) {
+ switch (FT) {
+ case FT_INTEGRAL:
+ new (&IntInfo) IntFieldInfo(Initializer.IntInfo);
+ break;
+ case FT_REAL:
+ new (&RealInfo) RealFieldInfo(Initializer.RealInfo);
+ break;
+ case FT_STRUCT:
+ new (&StructInfo) StructFieldInfo(Initializer.StructInfo);
+ break;
+ }
+ }
+
+ FieldInitializer &operator=(const FieldInitializer &Initializer) {
+ if (FT != Initializer.FT) {
+ switch (FT) {
+ case FT_INTEGRAL:
+ IntInfo.~IntFieldInfo();
+ break;
+ case FT_REAL:
+ RealInfo.~RealFieldInfo();
+ break;
+ case FT_STRUCT:
+ StructInfo.~StructFieldInfo();
+ break;
+ }
+ }
+ FT = Initializer.FT;
+ switch (FT) {
+ case FT_INTEGRAL:
+ IntInfo = Initializer.IntInfo;
+ break;
+ case FT_REAL:
+ RealInfo = Initializer.RealInfo;
+ break;
+ case FT_STRUCT:
+ StructInfo = Initializer.StructInfo;
+ break;
+ }
+ return *this;
+ }
+
+ FieldInitializer &operator=(FieldInitializer &&Initializer) {
+ if (FT != Initializer.FT) {
+ switch (FT) {
+ case FT_INTEGRAL:
+ IntInfo.~IntFieldInfo();
+ break;
+ case FT_REAL:
+ RealInfo.~RealFieldInfo();
+ break;
+ case FT_STRUCT:
+ StructInfo.~StructFieldInfo();
+ break;
+ }
+ }
+ FT = Initializer.FT;
+ switch (FT) {
+ case FT_INTEGRAL:
+ IntInfo = Initializer.IntInfo;
+ break;
+ case FT_REAL:
+ RealInfo = Initializer.RealInfo;
+ break;
+ case FT_STRUCT:
+ StructInfo = Initializer.StructInfo;
+ break;
+ }
+ return *this;
+ }
+};
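+// FieldInitializer is a hand-rolled tagged union: FT selects the live member,
+// the constructors placement-new the corresponding member, and the destructor
+// and assignment operators destroy or replace it explicitly (see the FIXME
+// above about preferring a class hierarchy).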
+
+struct StructInitializer {
+ std::vector<FieldInitializer> FieldInitializers;
+};
+
+struct FieldInfo {
+ // Offset of the field within the containing STRUCT.
+ size_t Offset = 0;
+
+ // Total size of the field (= LengthOf * Type).
+ size_t SizeOf = 0;
+
+ // Number of elements in the field (1 if scalar, >1 if an array).
+ size_t LengthOf = 0;
+
+ // Size of a single entry in this field, in bytes ("type" in MASM standards).
+ size_t Type = 0;
+
+ FieldInitializer Contents;
+
+ FieldInfo(FieldType FT) : Contents(FT) {}
+};
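+// For illustration: a field declared with three DWORD initializers would have
+// Type = 4, LengthOf = 3, and SizeOf = 12; a single BYTE field would have
+// Type = 1, LengthOf = 1, and SizeOf = 1.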
+
+FieldInfo &StructInfo::addField(StringRef FieldName, FieldType FT) {
+ if (!FieldName.empty())
+ FieldsByName[FieldName] = Fields.size();
+ Fields.emplace_back(FT);
+ FieldInfo &Field = Fields.back();
+ if (IsUnion) {
+ Field.Offset = 0;
+ } else {
+ Size = llvm::alignTo(Size, Alignment);
+ Field.Offset = Size;
+ }
+ return Field;
+}
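+// Worked example: in a non-union STRUCT with Alignment = 4 and current
+// Size = 6, the next field is placed at Offset = alignTo(6, 4) = 8; in a
+// UNION, every field is placed at Offset = 0.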
+
+/// The concrete assembly parser instance.
+// Note that this is a full MCAsmParser, not an MCAsmParserExtension!
+// It's a peer of AsmParser, not of COFFAsmParser, WasmAsmParser, etc.
+class MasmParser : public MCAsmParser {
+private:
+ AsmLexer Lexer;
+ MCContext &Ctx;
+ MCStreamer &Out;
+ const MCAsmInfo &MAI;
+ SourceMgr &SrcMgr;
+ SourceMgr::DiagHandlerTy SavedDiagHandler;
+ void *SavedDiagContext;
+ std::unique_ptr<MCAsmParserExtension> PlatformParser;
+
+ /// This is the current buffer index we're lexing from as managed by the
+ /// SourceMgr object.
+ unsigned CurBuffer;
+
+ AsmCond TheCondState;
+ std::vector<AsmCond> TheCondStack;
+
+ /// maps directive names to handler methods in parser
+ /// extensions. Extensions register themselves in this map by calling
+ /// addDirectiveHandler.
+ StringMap<ExtensionDirectiveHandler> ExtensionDirectiveMap;
+
+ /// maps assembly-time variable names to variables.
+ struct Variable {
+ StringRef Name;
+ bool Redefinable = true;
+ bool IsText = false;
+ int64_t NumericValue = 0;
+ std::string TextValue;
+ };
+ StringMap<Variable> Variables;
+
+ /// Stack of active struct definitions.
+ SmallVector<StructInfo, 1> StructInProgress;
+
+ /// Maps struct tags to struct definitions.
+ StringMap<StructInfo> Structs;
+
+ /// Maps data location names to user-defined types.
+ StringMap<const StructInfo *> KnownType;
+
+ /// Stack of active macro instantiations.
+ std::vector<MacroInstantiation*> ActiveMacros;
+
+ /// List of bodies of anonymous macros.
+ std::deque<MCAsmMacro> MacroLikeBodies;
+
+ /// Keeps track of how many .macro's have been instantiated.
+ unsigned NumOfMacroInstantiations;
+
+ /// The values from the last parsed cpp hash file line comment if any.
+ struct CppHashInfoTy {
+ StringRef Filename;
+ int64_t LineNumber;
+ SMLoc Loc;
+ unsigned Buf;
+ CppHashInfoTy() : Filename(), LineNumber(0), Loc(), Buf(0) {}
+ };
+ CppHashInfoTy CppHashInfo;
+
+ /// The filename from the first cpp hash file line comment, if any.
+ StringRef FirstCppHashFilename;
+
+ /// List of forward directional labels for diagnosis at the end.
+ SmallVector<std::tuple<SMLoc, CppHashInfoTy, MCSymbol *>, 4> DirLabels;
+
+ /// AssemblerDialect. ~0U means unset value, so use the value provided by MAI.
+ /// Defaults to 1U, meaning Intel.
+ unsigned AssemblerDialect = 1U;
+
+ /// is Darwin compatibility enabled?
+ bool IsDarwin = false;
+
+ /// Are we parsing ms-style inline assembly?
+ bool ParsingMSInlineAsm = false;
+
+ /// Did we already inform the user about inconsistent MD5 usage?
+ bool ReportedInconsistentMD5 = false;
+
+ // Is alt macro mode enabled.
+ bool AltMacroMode = false;
+
+ // Current <...> expression depth.
+ unsigned AngleBracketDepth = 0U;
+
+public:
+ MasmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
+ const MCAsmInfo &MAI, unsigned CB);
+ MasmParser(const MasmParser &) = delete;
+ MasmParser &operator=(const MasmParser &) = delete;
+ ~MasmParser() override;
+
+ bool Run(bool NoInitialTextSection, bool NoFinalize = false) override;
+
+ void addDirectiveHandler(StringRef Directive,
+ ExtensionDirectiveHandler Handler) override {
+ ExtensionDirectiveMap[Directive] = Handler;
+ if (DirectiveKindMap.find(Directive) == DirectiveKindMap.end()) {
+ DirectiveKindMap[Directive] = DK_HANDLER_DIRECTIVE;
+ }
+ }
+
+ void addAliasForDirective(StringRef Directive, StringRef Alias) override {
+ DirectiveKindMap[Directive] = DirectiveKindMap[Alias];
+ }
+
+ /// @name MCAsmParser Interface
+ /// {
+
+ SourceMgr &getSourceManager() override { return SrcMgr; }
+ MCAsmLexer &getLexer() override { return Lexer; }
+ MCContext &getContext() override { return Ctx; }
+ MCStreamer &getStreamer() override { return Out; }
+
+ CodeViewContext &getCVContext() { return Ctx.getCVContext(); }
+
+ unsigned getAssemblerDialect() override {
+ if (AssemblerDialect == ~0U)
+ return MAI.getAssemblerDialect();
+ else
+ return AssemblerDialect;
+ }
+ void setAssemblerDialect(unsigned i) override {
+ AssemblerDialect = i;
+ }
+
+ void Note(SMLoc L, const Twine &Msg, SMRange Range = None) override;
+ bool Warning(SMLoc L, const Twine &Msg, SMRange Range = None) override;
+ bool printError(SMLoc L, const Twine &Msg, SMRange Range = None) override;
+
+ const AsmToken &Lex() override;
+
+ void setParsingMSInlineAsm(bool V) override {
+ ParsingMSInlineAsm = V;
+ // When parsing MS inline asm, we must lex 0b1101 and 0ABCH as binary and
+ // hex integer literals.
+ Lexer.setLexMasmIntegers(V);
+ }
+ bool isParsingMSInlineAsm() override { return ParsingMSInlineAsm; }
+
+ bool isParsingMasm() const override { return true; }
+
+ bool lookUpField(StringRef Name, StringRef &Type,
+ unsigned &Offset) const override;
+ bool lookUpField(StringRef Base, StringRef Member, StringRef &Type,
+ unsigned &Offset) const override;
+
+ bool parseMSInlineAsm(void *AsmLoc, std::string &AsmString,
+ unsigned &NumOutputs, unsigned &NumInputs,
+ SmallVectorImpl<std::pair<void *,bool>> &OpDecls,
+ SmallVectorImpl<std::string> &Constraints,
+ SmallVectorImpl<std::string> &Clobbers,
+ const MCInstrInfo *MII, const MCInstPrinter *IP,
+ MCAsmParserSemaCallback &SI) override;
+
+ bool parseExpression(const MCExpr *&Res);
+ bool parseExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
+ bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
+ bool parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) override;
+ bool parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
+ SMLoc &EndLoc) override;
+ bool parseAbsoluteExpression(int64_t &Res) override;
+
+ /// Parse a floating point expression using the float \p Semantics
+ /// and set \p Res to the value.
+ bool parseRealValue(const fltSemantics &Semantics, APInt &Res);
+
+ /// Parse an identifier or string (as a quoted identifier)
+ /// and set \p Res to the identifier contents.
+ bool parseIdentifier(StringRef &Res) override;
+ void eatToEndOfStatement() override;
+
+ bool checkForValidSection() override;
+
+ /// }
+
+private:
+ bool parseStatement(ParseStatementInfo &Info,
+ MCAsmParserSemaCallback *SI);
+ bool parseCurlyBlockScope(SmallVectorImpl<AsmRewrite>& AsmStrRewrites);
+ bool parseCppHashLineFilenameComment(SMLoc L);
+
+ void checkForBadMacro(SMLoc DirectiveLoc, StringRef Name, StringRef Body,
+ ArrayRef<MCAsmMacroParameter> Parameters);
+ bool expandMacro(raw_svector_ostream &OS, StringRef Body,
+ ArrayRef<MCAsmMacroParameter> Parameters,
+ ArrayRef<MCAsmMacroArgument> A, bool EnableAtPseudoVariable,
+ SMLoc L);
+
+ /// Are we inside a macro instantiation?
+ bool isInsideMacroInstantiation() {return !ActiveMacros.empty();}
+
+ /// Handle entry to macro instantiation.
+ ///
+ /// \param M The macro.
+ /// \param NameLoc Instantiation location.
+ bool handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc);
+
+ /// Handle exit from macro instantiation.
+ void handleMacroExit();
+
+ /// Extract AsmTokens for a macro argument.
+ bool parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg);
+
+ /// Parse all macro arguments for a given macro.
+ bool parseMacroArguments(const MCAsmMacro *M, MCAsmMacroArguments &A);
+
+ void printMacroInstantiations();
+ void printMessage(SMLoc Loc, SourceMgr::DiagKind Kind, const Twine &Msg,
+ SMRange Range = None) const {
+ ArrayRef<SMRange> Ranges(Range);
+ SrcMgr.PrintMessage(Loc, Kind, Msg, Ranges);
+ }
+ static void DiagHandler(const SMDiagnostic &Diag, void *Context);
+
+ bool lookUpField(const StructInfo &Structure, StringRef Member,
+ StringRef &Type, unsigned &Offset) const;
+
+ /// Should we emit DWARF describing this assembler source? (Returns false if
+ /// the source has .file directives, which means we don't want to generate
+ /// info describing the assembler source itself.)
+ bool enabledGenDwarfForAssembly();
+
+ /// Enter the specified file. This returns true on failure.
+ bool enterIncludeFile(const std::string &Filename);
+
+ /// Reset the current lexer position to that given by \p Loc. The
+ /// current token is not set; clients should ensure Lex() is called
+ /// subsequently.
+ ///
+ /// \param InBuffer If not 0, should be the known buffer id that contains the
+ /// location.
+ void jumpToLoc(SMLoc Loc, unsigned InBuffer = 0);
+
+ /// Parse up to the end of statement and return the contents from the
+ /// current token until the end of the statement; the current token on exit
+ /// will be either the EndOfStatement or EOF.
+ StringRef parseStringToEndOfStatement() override;
+
+ bool parseTextItem(std::string &Data);
+
+ unsigned getBinOpPrecedence(AsmToken::TokenKind K,
+ MCBinaryExpr::Opcode &Kind);
+
+ bool parseBinOpRHS(unsigned Precedence, const MCExpr *&Res, SMLoc &EndLoc);
+ bool parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc);
+ bool parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc);
+
+ bool parseRegisterOrRegisterNumber(int64_t &Register, SMLoc DirectiveLoc);
+
+ bool parseCVFunctionId(int64_t &FunctionId, StringRef DirectiveName);
+ bool parseCVFileId(int64_t &FileId, StringRef DirectiveName);
+
+ // Generic (target and platform independent) directive parsing.
+ enum DirectiveKind {
+ DK_NO_DIRECTIVE, // Placeholder
+ DK_HANDLER_DIRECTIVE,
+ DK_ASSIGN,
+ DK_EQU,
+ DK_TEXTEQU,
+ DK_ASCII,
+ DK_ASCIZ,
+ DK_STRING,
+ DK_BYTE,
+ DK_SBYTE,
+ DK_WORD,
+ DK_SWORD,
+ DK_DWORD,
+ DK_SDWORD,
+ DK_FWORD,
+ DK_QWORD,
+ DK_SQWORD,
+ DK_DB,
+ DK_DD,
+ DK_DQ,
+ DK_DW,
+ DK_REAL4,
+ DK_REAL8,
+ DK_ALIGN,
+ DK_ORG,
+ DK_ENDR,
+ DK_EXTERN,
+ DK_PUBLIC,
+ DK_COMM,
+ DK_COMMENT,
+ DK_INCLUDE,
+ DK_REPT,
+ DK_IRP,
+ DK_IRPC,
+ DK_IF,
+ DK_IFE,
+ DK_IFB,
+ DK_IFNB,
+ DK_IFDEF,
+ DK_IFNDEF,
+ DK_IFDIF,
+ DK_IFDIFI,
+ DK_IFIDN,
+ DK_IFIDNI,
+ DK_ELSEIF,
+ DK_ELSEIFE,
+ DK_ELSEIFB,
+ DK_ELSEIFNB,
+ DK_ELSEIFDEF,
+ DK_ELSEIFNDEF,
+ DK_ELSEIFDIF,
+ DK_ELSEIFDIFI,
+ DK_ELSEIFIDN,
+ DK_ELSEIFIDNI,
+ DK_ELSE,
+ DK_ENDIF,
+ DK_FILE,
+ DK_LINE,
+ DK_LOC,
+ DK_STABS,
+ DK_CV_FILE,
+ DK_CV_FUNC_ID,
+ DK_CV_INLINE_SITE_ID,
+ DK_CV_LOC,
+ DK_CV_LINETABLE,
+ DK_CV_INLINE_LINETABLE,
+ DK_CV_DEF_RANGE,
+ DK_CV_STRINGTABLE,
+ DK_CV_STRING,
+ DK_CV_FILECHECKSUMS,
+ DK_CV_FILECHECKSUM_OFFSET,
+ DK_CV_FPO_DATA,
+ DK_CFI_SECTIONS,
+ DK_CFI_STARTPROC,
+ DK_CFI_ENDPROC,
+ DK_CFI_DEF_CFA,
+ DK_CFI_DEF_CFA_OFFSET,
+ DK_CFI_ADJUST_CFA_OFFSET,
+ DK_CFI_DEF_CFA_REGISTER,
+ DK_CFI_OFFSET,
+ DK_CFI_REL_OFFSET,
+ DK_CFI_PERSONALITY,
+ DK_CFI_LSDA,
+ DK_CFI_REMEMBER_STATE,
+ DK_CFI_RESTORE_STATE,
+ DK_CFI_SAME_VALUE,
+ DK_CFI_RESTORE,
+ DK_CFI_ESCAPE,
+ DK_CFI_RETURN_COLUMN,
+ DK_CFI_SIGNAL_FRAME,
+ DK_CFI_UNDEFINED,
+ DK_CFI_REGISTER,
+ DK_CFI_WINDOW_SAVE,
+ DK_CFI_B_KEY_FRAME,
+ DK_ALTMACRO,
+ DK_NOALTMACRO,
+ DK_MACRO,
+ DK_EXITM,
+ DK_ENDM,
+ DK_PURGEM,
+ DK_ERR,
+ DK_ERRB,
+ DK_ERRNB,
+ DK_ERRDEF,
+ DK_ERRNDEF,
+ DK_ERRDIF,
+ DK_ERRDIFI,
+ DK_ERRIDN,
+ DK_ERRIDNI,
+ DK_ERRE,
+ DK_ERRNZ,
+ DK_ECHO,
+ DK_STRUCT,
+ DK_UNION,
+ DK_ENDS,
+ DK_END
+ };
+
+ /// Maps directive name --> DirectiveKind enum, for directives parsed by this
+ /// class.
+ StringMap<DirectiveKind> DirectiveKindMap;
+
+ // Codeview def_range type parsing.
+ enum CVDefRangeType {
+ CVDR_DEFRANGE = 0, // Placeholder
+ CVDR_DEFRANGE_REGISTER,
+ CVDR_DEFRANGE_FRAMEPOINTER_REL,
+ CVDR_DEFRANGE_SUBFIELD_REGISTER,
+ CVDR_DEFRANGE_REGISTER_REL
+ };
+
+ /// Maps Codeview def_range types --> CVDefRangeType enum, for Codeview
+ /// def_range types parsed by this class.
+ StringMap<CVDefRangeType> CVDefRangeTypeMap;
+
+ bool parseInitValue(unsigned Size);
+
+ // ".ascii", ".asciz", ".string"
+ bool parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated);
+
+ // "byte", "word", ...
+ bool emitIntValue(const MCExpr *Value, unsigned Size);
+ bool parseScalarInitializer(unsigned Size,
+ SmallVectorImpl<const MCExpr *> &Values,
+ unsigned StringPadLength = 0);
+ bool parseScalarInstList(
+ unsigned Size, SmallVectorImpl<const MCExpr *> &Values,
+ const AsmToken::TokenKind EndToken = AsmToken::EndOfStatement);
+ bool emitIntegralValues(unsigned Size);
+ bool addIntegralField(StringRef Name, unsigned Size);
+ bool parseDirectiveValue(StringRef IDVal, unsigned Size);
+ bool parseDirectiveNamedValue(StringRef IDVal, unsigned Size, StringRef Name,
+ SMLoc NameLoc);
+
+ // "real4", "real8"
+ bool emitRealValues(const fltSemantics &Semantics);
+ bool addRealField(StringRef Name, const fltSemantics &Semantics);
+ bool parseDirectiveRealValue(StringRef IDVal, const fltSemantics &Semantics);
+ bool parseRealInstList(
+ const fltSemantics &Semantics, SmallVectorImpl<APInt> &Values,
+ const AsmToken::TokenKind EndToken = AsmToken::EndOfStatement);
+ bool parseDirectiveNamedRealValue(StringRef IDVal,
+ const fltSemantics &Semantics,
+ StringRef Name, SMLoc NameLoc);
+
+ bool parseOptionalAngleBracketOpen();
+ bool parseAngleBracketClose(const Twine &Msg = "expected '>'");
+
+ bool parseFieldInitializer(const FieldInfo &Field,
+ FieldInitializer &Initializer);
+ bool parseFieldInitializer(const FieldInfo &Field,
+ const IntFieldInfo &Contents,
+ FieldInitializer &Initializer);
+ bool parseFieldInitializer(const FieldInfo &Field,
+ const RealFieldInfo &Contents,
+ FieldInitializer &Initializer);
+ bool parseFieldInitializer(const FieldInfo &Field,
+ const StructFieldInfo &Contents,
+ FieldInitializer &Initializer);
+
+ bool parseStructInitializer(const StructInfo &Structure,
+ StructInitializer &Initializer);
+ bool parseStructInstList(
+ const StructInfo &Structure, std::vector<StructInitializer> &Initializers,
+ const AsmToken::TokenKind EndToken = AsmToken::EndOfStatement);
+
+ bool emitFieldValue(const FieldInfo &Field);
+ bool emitFieldValue(const FieldInfo &Field, const IntFieldInfo &Contents);
+ bool emitFieldValue(const FieldInfo &Field, const RealFieldInfo &Contents);
+ bool emitFieldValue(const FieldInfo &Field, const StructFieldInfo &Contents);
+
+ bool emitStructValue(const StructInfo &Structure);
+
+ bool emitFieldInitializer(const FieldInfo &Field,
+ const FieldInitializer &Initializer);
+ bool emitFieldInitializer(const FieldInfo &Field,
+ const IntFieldInfo &Contents,
+ const IntFieldInfo &Initializer);
+ bool emitFieldInitializer(const FieldInfo &Field,
+ const RealFieldInfo &Contents,
+ const RealFieldInfo &Initializer);
+ bool emitFieldInitializer(const FieldInfo &Field,
+ const StructFieldInfo &Contents,
+ const StructFieldInfo &Initializer);
+
+ bool emitStructInitializer(const StructInfo &Structure,
+ const StructInitializer &Initializer);
+
+ // User-defined types (structs, unions):
+ bool emitStructValue(const StructInfo &Structure,
+ const StructInitializer &Initializer,
+ size_t InitialOffset = 0, size_t InitialField = 0);
+ bool emitStructValues(const StructInfo &Structure);
+ bool addStructField(StringRef Name, const StructInfo &Structure);
+ bool parseDirectiveStructValue(const StructInfo &Structure,
+ StringRef Directive, SMLoc DirLoc);
+ bool parseDirectiveNamedStructValue(const StructInfo &Structure,
+ StringRef Directive, SMLoc DirLoc,
+ StringRef Name);
+
+ // "=", "equ", "textequ"
+ bool parseDirectiveEquate(StringRef IDVal, StringRef Name,
+ DirectiveKind DirKind);
+
+ bool parseDirectiveOrg(); // ".org"
+ bool parseDirectiveAlign(); // "align"
+
+ // ".file", ".line", ".loc", ".stabs"
+ bool parseDirectiveFile(SMLoc DirectiveLoc);
+ bool parseDirectiveLine();
+ bool parseDirectiveLoc();
+ bool parseDirectiveStabs();
+
+ // ".cv_file", ".cv_func_id", ".cv_inline_site_id", ".cv_loc", ".cv_linetable",
+ // ".cv_inline_linetable", ".cv_def_range", ".cv_string"
+ bool parseDirectiveCVFile();
+ bool parseDirectiveCVFuncId();
+ bool parseDirectiveCVInlineSiteId();
+ bool parseDirectiveCVLoc();
+ bool parseDirectiveCVLinetable();
+ bool parseDirectiveCVInlineLinetable();
+ bool parseDirectiveCVDefRange();
+ bool parseDirectiveCVString();
+ bool parseDirectiveCVStringTable();
+ bool parseDirectiveCVFileChecksums();
+ bool parseDirectiveCVFileChecksumOffset();
+ bool parseDirectiveCVFPOData();
+
+ // .cfi directives
+ bool parseDirectiveCFIRegister(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIWindowSave();
+ bool parseDirectiveCFISections();
+ bool parseDirectiveCFIStartProc();
+ bool parseDirectiveCFIEndProc();
+ bool parseDirectiveCFIDefCfaOffset();
+ bool parseDirectiveCFIDefCfa(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIAdjustCfaOffset();
+ bool parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIOffset(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIRelOffset(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIPersonalityOrLsda(bool IsPersonality);
+ bool parseDirectiveCFIRememberState();
+ bool parseDirectiveCFIRestoreState();
+ bool parseDirectiveCFISameValue(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIRestore(SMLoc DirectiveLoc);
+ bool parseDirectiveCFIEscape();
+ bool parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc);
+ bool parseDirectiveCFISignalFrame();
+ bool parseDirectiveCFIUndefined(SMLoc DirectiveLoc);
+
+ // macro directives
+ bool parseDirectivePurgeMacro(SMLoc DirectiveLoc);
+ bool parseDirectiveExitMacro(StringRef Directive);
+ bool parseDirectiveEndMacro(StringRef Directive);
+ bool parseDirectiveMacro(SMLoc DirectiveLoc);
+ // alternate macro mode directives
+ bool parseDirectiveAltmacro(StringRef Directive);
+
+ bool parseDirectiveStruct(StringRef Directive, DirectiveKind DirKind,
+ StringRef Name, SMLoc NameLoc);
+ bool parseDirectiveNestedStruct(StringRef Directive, DirectiveKind DirKind);
+ bool parseDirectiveEnds(StringRef Name, SMLoc NameLoc);
+ bool parseDirectiveNestedEnds();
+
+ /// Parse a directive like ".globl" which accepts a single symbol (which
+ /// should be a label or an external).
+ bool parseDirectiveSymbolAttribute(MCSymbolAttr Attr);
+
+ bool parseDirectiveComm(bool IsLocal); // ".comm" and ".lcomm"
+
+ bool parseDirectiveComment(SMLoc DirectiveLoc); // "comment"
+
+ bool parseDirectiveInclude(); // "include"
+
+ // "if" or "ife"
+ bool parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind);
+ // "ifb" or "ifnb", depending on ExpectBlank.
+ bool parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+ // "ifidn", "ifdif", "ifidni", or "ifdifi", depending on ExpectEqual and
+ // CaseInsensitive.
+ bool parseDirectiveIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
+ bool CaseInsensitive);
+ // "ifdef" or "ifndef", depending on expect_defined
+ bool parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined);
+ // "elseif" or "elseife"
+ bool parseDirectiveElseIf(SMLoc DirectiveLoc, DirectiveKind DirKind);
+ // "elseifb" or "elseifnb", depending on ExpectBlank.
+ bool parseDirectiveElseIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+ // ".elseifdef" or ".elseifndef", depending on expect_defined
+ bool parseDirectiveElseIfdef(SMLoc DirectiveLoc, bool expect_defined);
+ // "elseifidn", "elseifdif", "elseifidni", or "elseifdifi", depending on
+ // ExpectEqual and CaseInsensitive.
+ bool parseDirectiveElseIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
+ bool CaseInsensitive);
+ bool parseDirectiveElse(SMLoc DirectiveLoc); // "else"
+ bool parseDirectiveEndIf(SMLoc DirectiveLoc); // "endif"
+ bool parseEscapedString(std::string &Data) override;
+ bool parseAngleBracketString(std::string &Data) override;
+
+ // Macro-like directives
+ MCAsmMacro *parseMacroLikeBody(SMLoc DirectiveLoc);
+ void instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+ raw_svector_ostream &OS);
+ bool parseDirectiveRept(SMLoc DirectiveLoc, StringRef Directive);
+ bool parseDirectiveIrp(SMLoc DirectiveLoc); // ".irp"
+ bool parseDirectiveIrpc(SMLoc DirectiveLoc); // ".irpc"
+ bool parseDirectiveEndr(SMLoc DirectiveLoc); // ".endr"
+
+ // "_emit" or "__emit"
+ bool parseDirectiveMSEmit(SMLoc DirectiveLoc, ParseStatementInfo &Info,
+ size_t Len);
+
+ // "align"
+ bool parseDirectiveMSAlign(SMLoc DirectiveLoc, ParseStatementInfo &Info);
+
+ // "end"
+ bool parseDirectiveEnd(SMLoc DirectiveLoc);
+
+ // ".err"
+ bool parseDirectiveError(SMLoc DirectiveLoc);
+ // ".errb" or ".errnb", depending on ExpectBlank.
+ bool parseDirectiveErrorIfb(SMLoc DirectiveLoc, bool ExpectBlank);
+ // ".errdef" or ".errndef", depending on ExpectBlank.
+ bool parseDirectiveErrorIfdef(SMLoc DirectiveLoc, bool ExpectDefined);
+ // ".erridn", ".errdif", ".erridni", or ".errdifi", depending on ExpectEqual
+ // and CaseInsensitive.
+ bool parseDirectiveErrorIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
+ bool CaseInsensitive);
+ // ".erre" or ".errnz", depending on ExpectZero.
+ bool parseDirectiveErrorIfe(SMLoc DirectiveLoc, bool ExpectZero);
+
+ // "echo"
+ bool parseDirectiveEcho();
+
+ void initializeDirectiveKindMap();
+ void initializeCVDefRangeTypeMap();
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+extern MCAsmParserExtension *createCOFFMasmParser();
+
+} // end namespace llvm
+
+enum { DEFAULT_ADDRSPACE = 0 };
+
+MasmParser::MasmParser(SourceMgr &SM, MCContext &Ctx, MCStreamer &Out,
+ const MCAsmInfo &MAI, unsigned CB = 0)
+ : Lexer(MAI), Ctx(Ctx), Out(Out), MAI(MAI), SrcMgr(SM),
+ CurBuffer(CB ? CB : SM.getMainFileID()) {
+ HadError = false;
+ // Save the old handler.
+ SavedDiagHandler = SrcMgr.getDiagHandler();
+ SavedDiagContext = SrcMgr.getDiagContext();
+ // Set our own handler which calls the saved handler.
+ SrcMgr.setDiagHandler(DiagHandler, this);
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
+
+ // Initialize the platform / file format parser.
+ switch (Ctx.getObjectFileInfo()->getObjectFileType()) {
+ case MCObjectFileInfo::IsCOFF:
+ PlatformParser.reset(createCOFFMasmParser());
+ break;
+ default:
+ report_fatal_error("llvm-ml currently supports only COFF output.");
+ break;
+ }
+
+ initializeDirectiveKindMap();
+ PlatformParser->Initialize(*this);
+ initializeCVDefRangeTypeMap();
+
+ NumOfMacroInstantiations = 0;
+}
+
+MasmParser::~MasmParser() {
+ assert((HadError || ActiveMacros.empty()) &&
+ "Unexpected active macro instantiation!");
+
+ // Restore the saved diagnostics handler and context for use during
+ // finalization.
+ SrcMgr.setDiagHandler(SavedDiagHandler, SavedDiagContext);
+}
+
+void MasmParser::printMacroInstantiations() {
+ // Print the active macro instantiation stack.
+ for (std::vector<MacroInstantiation *>::const_reverse_iterator
+ it = ActiveMacros.rbegin(),
+ ie = ActiveMacros.rend();
+ it != ie; ++it)
+ printMessage((*it)->InstantiationLoc, SourceMgr::DK_Note,
+ "while in macro instantiation");
+}
+
+void MasmParser::Note(SMLoc L, const Twine &Msg, SMRange Range) {
+ printPendingErrors();
+ printMessage(L, SourceMgr::DK_Note, Msg, Range);
+ printMacroInstantiations();
+}
+
+bool MasmParser::Warning(SMLoc L, const Twine &Msg, SMRange Range) {
+ if (getTargetParser().getTargetOptions().MCNoWarn)
+ return false;
+ if (getTargetParser().getTargetOptions().MCFatalWarnings)
+ return Error(L, Msg, Range);
+ printMessage(L, SourceMgr::DK_Warning, Msg, Range);
+ printMacroInstantiations();
+ return false;
+}
+
+bool MasmParser::printError(SMLoc L, const Twine &Msg, SMRange Range) {
+ HadError = true;
+ printMessage(L, SourceMgr::DK_Error, Msg, Range);
+ printMacroInstantiations();
+ return true;
+}
+
+bool MasmParser::enterIncludeFile(const std::string &Filename) {
+ std::string IncludedFile;
+ unsigned NewBuf =
+ SrcMgr.AddIncludeFile(Filename, Lexer.getLoc(), IncludedFile);
+ if (!NewBuf)
+ return true;
+
+ CurBuffer = NewBuf;
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
+ return false;
+}
+
+void MasmParser::jumpToLoc(SMLoc Loc, unsigned InBuffer) {
+ CurBuffer = InBuffer ? InBuffer : SrcMgr.FindBufferContainingLoc(Loc);
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(),
+ Loc.getPointer());
+}
+
+const AsmToken &MasmParser::Lex() {
+ if (Lexer.getTok().is(AsmToken::Error))
+ Error(Lexer.getErrLoc(), Lexer.getErr());
+
+ // If it's an end of statement with a comment in it:
+ if (getTok().is(AsmToken::EndOfStatement)) {
+ // If this is a line comment, output it.
+ if (!getTok().getString().empty() && getTok().getString().front() != '\n' &&
+ getTok().getString().front() != '\r' && MAI.preserveAsmComments())
+ Out.addExplicitComment(Twine(getTok().getString()));
+ }
+
+ const AsmToken *tok = &Lexer.Lex();
+
+ while (tok->is(AsmToken::Identifier)) {
+ auto it = Variables.find(tok->getIdentifier());
+ if (it != Variables.end() && it->second.IsText) {
+ std::unique_ptr<MemoryBuffer> Instantiation =
+ MemoryBuffer::getMemBufferCopy(it->second.TextValue,
+ "<instantiation>");
+
+ // Jump to the macro instantiation and prime the lexer.
+ CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation),
+ getTok().getEndLoc());
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer(), nullptr,
+ /*EndStatementAtEOF=*/false);
+ tok = &Lexer.Lex();
+ } else {
+ break;
+ }
+ }
+
+ // Parse comments here; they are deferred until the end of the next statement.
+ while (tok->is(AsmToken::Comment)) {
+ if (MAI.preserveAsmComments())
+ Out.addExplicitComment(Twine(tok->getString()));
+ tok = &Lexer.Lex();
+ }
+
+ if (tok->is(AsmToken::Eof)) {
+ // If this is the end of an included file, pop the parent file off the
+ // include stack.
+ SMLoc ParentIncludeLoc = SrcMgr.getParentIncludeLoc(CurBuffer);
+ if (ParentIncludeLoc != SMLoc()) {
+ jumpToLoc(ParentIncludeLoc);
+ return Lex();
+ }
+ }
+
+ return *tok;
+}
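+// Note on the identifier loop above: when a lexed identifier names a text
+// variable (IsText set, e.g. defined via TEXTEQU), its TextValue is pushed as
+// an "<instantiation>" buffer and lexing resumes inside it, so callers only
+// ever see the expanded tokens.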
+
+bool MasmParser::enabledGenDwarfForAssembly() {
+ // Check whether the user specified -g.
+ if (!getContext().getGenDwarfForAssembly())
+ return false;
+ // If we haven't encountered any .file directives (which would imply that
+ // the assembler source was produced with debug info already) then emit one
+ // describing the assembler source file itself.
+ if (getContext().getGenDwarfFileNumber() == 0) {
+ // Use the first #line directive for this, if any. It's preprocessed, so
+ // there is no checksum, and of course no source directive.
+ if (!FirstCppHashFilename.empty())
+ getContext().setMCLineTableRootFile(/*CUID=*/0,
+ getContext().getCompilationDir(),
+ FirstCppHashFilename,
+ /*Cksum=*/None, /*Source=*/None);
+ const MCDwarfFile &RootFile =
+ getContext().getMCDwarfLineTable(/*CUID=*/0).getRootFile();
+ getContext().setGenDwarfFileNumber(getStreamer().emitDwarfFileDirective(
+ /*CUID=*/0, getContext().getCompilationDir(), RootFile.Name,
+ RootFile.Checksum, RootFile.Source));
+ }
+ return true;
+}
+
+bool MasmParser::Run(bool NoInitialTextSection, bool NoFinalize) {
+ // Create the initial section, if requested.
+ if (!NoInitialTextSection)
+ Out.InitSections(false);
+
+ // Prime the lexer.
+ Lex();
+
+ HadError = false;
+ AsmCond StartingCondState = TheCondState;
+ SmallVector<AsmRewrite, 4> AsmStrRewrites;
+
+ // If we are generating dwarf for assembly source files save the initial text
+ // section. (Don't use enabledGenDwarfForAssembly() here, as we aren't
+ // emitting any actual debug info yet and haven't had a chance to parse any
+ // embedded .file directives.)
+ if (getContext().getGenDwarfForAssembly()) {
+ MCSection *Sec = getStreamer().getCurrentSectionOnly();
+ if (!Sec->getBeginSymbol()) {
+ MCSymbol *SectionStartSym = getContext().createTempSymbol();
+ getStreamer().emitLabel(SectionStartSym);
+ Sec->setBeginSymbol(SectionStartSym);
+ }
+ bool InsertResult = getContext().addGenDwarfSection(Sec);
+ assert(InsertResult && ".text section should not have debug info yet");
+ (void)InsertResult;
+ }
+
+ // While we have input, parse each statement.
+ while (Lexer.isNot(AsmToken::Eof)) {
+ ParseStatementInfo Info(&AsmStrRewrites);
+ bool Parsed = parseStatement(Info, nullptr);
+
+ // If we have a lexer error, we are on an error token. Load the lexer error
+ // for printing its message via Lex(), but only if no (presumably better)
+ // parser error exists.
+ if (Parsed && !hasPendingError() && Lexer.getTok().is(AsmToken::Error)) {
+ Lex();
+ }
+
+ // parseStatement returned true, so we may need to emit an error.
+ printPendingErrors();
+
+ // Skip to the next line if needed.
+ if (Parsed && !getLexer().isAtStartOfStatement())
+ eatToEndOfStatement();
+ }
+
+ getTargetParser().onEndOfFile();
+ printPendingErrors();
+
+ // All errors should have been emitted.
+ assert(!hasPendingError() && "unexpected error from parseStatement");
+
+ getTargetParser().flushPendingInstructions(getStreamer());
+
+ if (TheCondState.TheCond != StartingCondState.TheCond ||
+ TheCondState.Ignore != StartingCondState.Ignore)
+ printError(getTok().getLoc(), "unmatched .ifs or .elses");
+ // Check to see there are no empty DwarfFile slots.
+ const auto &LineTables = getContext().getMCDwarfLineTables();
+ if (!LineTables.empty()) {
+ unsigned Index = 0;
+ for (const auto &File : LineTables.begin()->second.getMCDwarfFiles()) {
+ if (File.Name.empty() && Index != 0)
+ printError(getTok().getLoc(), "unassigned file number: " +
+ Twine(Index) +
+ " for .file directives");
+ ++Index;
+ }
+ }
+
+ // Check to see that all assembler local symbols were actually defined.
+ // Targets that don't do subsections via symbols may not want this, though,
+ // so conservatively exclude them. Only do this if we're finalizing, though,
+ // as otherwise we won't necessarily have seen everything yet.
+ if (!NoFinalize) {
+ if (MAI.hasSubsectionsViaSymbols()) {
+ for (const auto &TableEntry : getContext().getSymbols()) {
+ MCSymbol *Sym = TableEntry.getValue();
+ // Variable symbols may not be marked as defined, so check those
+ // explicitly. If we know it's a variable, we have a definition for
+ // the purposes of this check.
+ if (Sym->isTemporary() && !Sym->isVariable() && !Sym->isDefined())
+ // FIXME: We would really like to refer back to where the symbol was
+ // first referenced for a source location. We need to add something
+ // to track that. Currently, we just point to the end of the file.
+ printError(getTok().getLoc(), "assembler local symbol '" +
+ Sym->getName() + "' not defined");
+ }
+ }
+
+ // Temporary symbols like the ones for directional jumps don't go in the
+ // symbol table. They also need to be diagnosed in all (final) cases.
+ for (std::tuple<SMLoc, CppHashInfoTy, MCSymbol *> &LocSym : DirLabels) {
+ if (std::get<2>(LocSym)->isUndefined()) {
+ // Reset the state of any "# line file" directives we've seen to the
+ // context as it was at the diagnostic site.
+ CppHashInfo = std::get<1>(LocSym);
+ printError(std::get<0>(LocSym), "directional label undefined");
+ }
+ }
+ }
+
+ // Finalize the output stream if there are no errors and if the client wants
+ // us to.
+ if (!HadError && !NoFinalize)
+ Out.Finish();
+
+ return HadError || getContext().hadError();
+}
+
+bool MasmParser::checkForValidSection() {
+ if (!ParsingMSInlineAsm && !getStreamer().getCurrentSectionOnly()) {
+ Out.InitSections(false);
+ return Error(getTok().getLoc(),
+ "expected section directive before assembly directive");
+ }
+ return false;
+}
+
+/// Throw away the rest of the current statement, up to and including the
+/// end-of-statement token.
+void MasmParser::eatToEndOfStatement() {
+ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
+ Lexer.Lex();
+
+ // Eat EOL.
+ if (Lexer.is(AsmToken::EndOfStatement))
+ Lexer.Lex();
+}
+
+StringRef MasmParser::parseStringToEndOfStatement() {
+ const char *Start = getTok().getLoc().getPointer();
+
+ while (Lexer.isNot(AsmToken::EndOfStatement) && Lexer.isNot(AsmToken::Eof))
+ Lexer.Lex();
+
+ const char *End = getTok().getLoc().getPointer();
+ return StringRef(Start, End - Start);
+}
+
+/// Parse a paren expression and return it.
+/// NOTE: This assumes the leading '(' has already been consumed.
+///
+/// parenexpr ::= expr)
+///
+bool MasmParser::parseParenExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+ if (parseExpression(Res))
+ return true;
+ if (Lexer.isNot(AsmToken::RParen))
+ return TokError("expected ')' in parentheses expression");
+ EndLoc = Lexer.getTok().getEndLoc();
+ Lex();
+ return false;
+}
+
+/// Parse a bracket expression and return it.
+/// NOTE: This assumes the leading '[' has already been consumed.
+///
+/// bracketexpr ::= expr]
+///
+bool MasmParser::parseBracketExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+ if (parseExpression(Res))
+ return true;
+ EndLoc = getTok().getEndLoc();
+ if (parseToken(AsmToken::RBrac, "expected ']' in brackets expression"))
+ return true;
+ return false;
+}
+
+/// Parse a primary expression and return it.
+/// primaryexpr ::= (parenexpr
+/// primaryexpr ::= symbol
+/// primaryexpr ::= number
+/// primaryexpr ::= '.'
+/// primaryexpr ::= ~,+,- primaryexpr
+bool MasmParser::parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) {
+ SMLoc FirstTokenLoc = getLexer().getLoc();
+ AsmToken::TokenKind FirstTokenKind = Lexer.getKind();
+ switch (FirstTokenKind) {
+ default:
+ return TokError("unknown token in expression");
+ // If we have an error assume that we've already handled it.
+ case AsmToken::Error:
+ return true;
+ case AsmToken::Exclaim:
+ Lex(); // Eat the operator.
+ if (parsePrimaryExpr(Res, EndLoc))
+ return true;
+ Res = MCUnaryExpr::createLNot(Res, getContext(), FirstTokenLoc);
+ return false;
+ case AsmToken::Dollar:
+ case AsmToken::At:
+ case AsmToken::String:
+ case AsmToken::Identifier: {
+ StringRef Identifier;
+ if (parseIdentifier(Identifier)) {
+ // We may have failed but $ may be a valid token.
+ if (getTok().is(AsmToken::Dollar)) {
+ if (Lexer.getMAI().getDollarIsPC()) {
+ Lex();
+ // This is a '$' reference, which references the current PC. Emit a
+ // temporary label to the streamer and refer to it.
+ MCSymbol *Sym = Ctx.createTempSymbol();
+ Out.emitLabel(Sym);
+ Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
+ getContext());
+ EndLoc = FirstTokenLoc;
+ return false;
+ }
+ return Error(FirstTokenLoc, "invalid token in expression");
+ }
+ }
+ // Parse symbol variant.
+ std::pair<StringRef, StringRef> Split;
+ if (!MAI.useParensForSymbolVariant()) {
+ if (FirstTokenKind == AsmToken::String) {
+ if (Lexer.is(AsmToken::At)) {
+ Lex(); // eat @
+ SMLoc AtLoc = getLexer().getLoc();
+ StringRef VName;
+ if (parseIdentifier(VName))
+ return Error(AtLoc, "expected symbol variant after '@'");
+
+ Split = std::make_pair(Identifier, VName);
+ }
+ } else {
+ Split = Identifier.split('@');
+ }
+ } else if (Lexer.is(AsmToken::LParen)) {
+ Lex(); // eat '('.
+ StringRef VName;
+ parseIdentifier(VName);
+ // eat ')'.
+ if (parseToken(AsmToken::RParen,
+ "unexpected token in variant, expected ')'"))
+ return true;
+ Split = std::make_pair(Identifier, VName);
+ }
+
+ EndLoc = SMLoc::getFromPointer(Identifier.end());
+
+ // This is a symbol reference.
+ StringRef SymbolName = Identifier;
+ if (SymbolName.empty())
+ return Error(getLexer().getLoc(), "expected a symbol reference");
+
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+
+ // Look up the symbol variant if used.
+ if (!Split.second.empty()) {
+ Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
+ if (Variant != MCSymbolRefExpr::VK_Invalid) {
+ SymbolName = Split.first;
+ } else if (MAI.doesAllowAtInName() && !MAI.useParensForSymbolVariant()) {
+ Variant = MCSymbolRefExpr::VK_None;
+ } else {
+ return Error(SMLoc::getFromPointer(Split.second.begin()),
+ "invalid variant '" + Split.second + "'");
+ }
+ }
+
+ // Find the field offset if used.
+ StringRef Type;
+ unsigned Offset = 0;
+ Split = SymbolName.split('.');
+ if (!Split.second.empty()) {
+ SymbolName = Split.first;
+ if (Structs.count(SymbolName.lower()) &&
+ !lookUpField(SymbolName, Split.second, Type, Offset)) {
+ // This is actually a reference to a field offset.
+ Res = MCConstantExpr::create(Offset, getContext());
+ return false;
+ }
+
+ auto TypeIt = KnownType.find(SymbolName);
+ if (TypeIt == KnownType.end() ||
+ lookUpField(*TypeIt->second, Split.second, Type, Offset)) {
+ std::pair<StringRef, StringRef> BaseMember = Split.second.split('.');
+ StringRef Base = BaseMember.first, Member = BaseMember.second;
+ lookUpField(Base, Member, Type, Offset);
+ }
+ }
+
+ MCSymbol *Sym = getContext().getInlineAsmLabel(SymbolName);
+ if (!Sym)
+ Sym = getContext().getOrCreateSymbol(SymbolName);
+
+ // If this is an absolute variable reference, substitute it now to preserve
+ // semantics in the face of reassignment.
+ if (Sym->isVariable()) {
+ auto V = Sym->getVariableValue(/*SetUsed*/ false);
+ bool DoInline = isa<MCConstantExpr>(V) && !Variant;
+ if (auto TV = dyn_cast<MCTargetExpr>(V))
+ DoInline = TV->inlineAssignedExpr();
+ if (DoInline) {
+ if (Variant)
+ return Error(EndLoc, "unexpected modifier on variable reference");
+ Res = Sym->getVariableValue(/*SetUsed*/ false);
+ return false;
+ }
+ }
+
+ // Otherwise create a symbol ref.
+ const MCExpr *SymRef =
+ MCSymbolRefExpr::create(Sym, Variant, getContext(), FirstTokenLoc);
+ if (Offset) {
+ Res = MCBinaryExpr::create(MCBinaryExpr::Add, SymRef,
+ MCConstantExpr::create(Offset, getContext()),
+ getContext());
+ } else {
+ Res = SymRef;
+ }
+ return false;
+ }
+ case AsmToken::BigNum:
+ return TokError("literal value out of range for directive");
+ case AsmToken::Integer: {
+ SMLoc Loc = getTok().getLoc();
+ int64_t IntVal = getTok().getIntVal();
+ Res = MCConstantExpr::create(IntVal, getContext());
+ EndLoc = Lexer.getTok().getEndLoc();
+ Lex(); // Eat token.
+ // Look for 'b' or 'f' following an Integer as a directional label.
+ if (Lexer.getKind() == AsmToken::Identifier) {
+ StringRef IDVal = getTok().getString();
+ // Look up the symbol variant if used.
+ std::pair<StringRef, StringRef> Split = IDVal.split('@');
+ MCSymbolRefExpr::VariantKind Variant = MCSymbolRefExpr::VK_None;
+ if (Split.first.size() != IDVal.size()) {
+ Variant = MCSymbolRefExpr::getVariantKindForName(Split.second);
+ if (Variant == MCSymbolRefExpr::VK_Invalid)
+ return TokError("invalid variant '" + Split.second + "'");
+ IDVal = Split.first;
+ }
+ if (IDVal == "f" || IDVal == "b") {
+ MCSymbol *Sym =
+ Ctx.getDirectionalLocalSymbol(IntVal, IDVal == "b");
+ Res = MCSymbolRefExpr::create(Sym, Variant, getContext());
+ if (IDVal == "b" && Sym->isUndefined())
+ return Error(Loc, "directional label undefined");
+ DirLabels.push_back(std::make_tuple(Loc, CppHashInfo, Sym));
+ EndLoc = Lexer.getTok().getEndLoc();
+ Lex(); // Eat identifier.
+ }
+ }
+ return false;
+ }
+ case AsmToken::Real: {
+ APFloat RealVal(APFloat::IEEEdouble(), getTok().getString());
+ uint64_t IntVal = RealVal.bitcastToAPInt().getZExtValue();
+ Res = MCConstantExpr::create(IntVal, getContext());
+ EndLoc = Lexer.getTok().getEndLoc();
+ Lex(); // Eat token.
+ return false;
+ }
+ case AsmToken::Dot: {
+ // This is a '.' reference, which references the current PC. Emit a
+ // temporary label to the streamer and refer to it.
+ MCSymbol *Sym = Ctx.createTempSymbol();
+ Out.emitLabel(Sym);
+ Res = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None, getContext());
+ EndLoc = Lexer.getTok().getEndLoc();
+ Lex(); // Eat identifier.
+ return false;
+ }
+ case AsmToken::LParen:
+ Lex(); // Eat the '('.
+ return parseParenExpr(Res, EndLoc);
+ case AsmToken::LBrac:
+ if (!PlatformParser->HasBracketExpressions())
+ return TokError("brackets expression not supported on this target");
+ Lex(); // Eat the '['.
+ return parseBracketExpr(Res, EndLoc);
+ case AsmToken::Minus:
+ Lex(); // Eat the operator.
+ if (parsePrimaryExpr(Res, EndLoc))
+ return true;
+ Res = MCUnaryExpr::createMinus(Res, getContext(), FirstTokenLoc);
+ return false;
+ case AsmToken::Plus:
+ Lex(); // Eat the operator.
+ if (parsePrimaryExpr(Res, EndLoc))
+ return true;
+ Res = MCUnaryExpr::createPlus(Res, getContext(), FirstTokenLoc);
+ return false;
+ case AsmToken::Tilde:
+ Lex(); // Eat the operator.
+ if (parsePrimaryExpr(Res, EndLoc))
+ return true;
+ Res = MCUnaryExpr::createNot(Res, getContext(), FirstTokenLoc);
+ return false;
+ // MIPS unary expression operators. The lexer won't generate these tokens if
+ // MCAsmInfo::HasMipsExpressions is false for the target.
+ case AsmToken::PercentCall16:
+ case AsmToken::PercentCall_Hi:
+ case AsmToken::PercentCall_Lo:
+ case AsmToken::PercentDtprel_Hi:
+ case AsmToken::PercentDtprel_Lo:
+ case AsmToken::PercentGot:
+ case AsmToken::PercentGot_Disp:
+ case AsmToken::PercentGot_Hi:
+ case AsmToken::PercentGot_Lo:
+ case AsmToken::PercentGot_Ofst:
+ case AsmToken::PercentGot_Page:
+ case AsmToken::PercentGottprel:
+ case AsmToken::PercentGp_Rel:
+ case AsmToken::PercentHi:
+ case AsmToken::PercentHigher:
+ case AsmToken::PercentHighest:
+ case AsmToken::PercentLo:
+ case AsmToken::PercentNeg:
+ case AsmToken::PercentPcrel_Hi:
+ case AsmToken::PercentPcrel_Lo:
+ case AsmToken::PercentTlsgd:
+ case AsmToken::PercentTlsldm:
+ case AsmToken::PercentTprel_Hi:
+ case AsmToken::PercentTprel_Lo:
+ Lex(); // Eat the operator.
+ if (Lexer.isNot(AsmToken::LParen))
+ return TokError("expected '(' after operator");
+ Lex(); // Eat the operator.
+ if (parseExpression(Res, EndLoc))
+ return true;
+ if (Lexer.isNot(AsmToken::RParen))
+ return TokError("expected ')'");
+ Lex(); // Eat the operator.
+ Res = getTargetParser().createTargetUnaryExpr(Res, FirstTokenKind, Ctx);
+ return !Res;
+ }
+}
+
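+/// Parse an expression, discarding the end location.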
+bool MasmParser::parseExpression(const MCExpr *&Res) {
+ SMLoc EndLoc;
+ return parseExpression(Res, EndLoc);
+}
+
+/// Check whether the next token is a <string> or an arithmetic expression.
+/// A string that begins with the character '<' must end with the character
+/// '>'; otherwise the token is arithmetic. If the function returns true, the
+/// EndLoc argument is set to the location just past the closing '>'.
+
+/// There is a gap between the AltMacro documentation and the single-quote
+/// implementation. GCC does not fully support this feature, so neither do we.
+/// TODO: Add support for single quotes as strings.
+static bool isAngleBracketString(SMLoc &StrLoc, SMLoc &EndLoc) {
+ assert((StrLoc.getPointer() != nullptr) &&
+ "Argument to the function cannot be a NULL value");
+ const char *CharPtr = StrLoc.getPointer();
+ while ((*CharPtr != '>') && (*CharPtr != '\n') && (*CharPtr != '\r') &&
+ (*CharPtr != '\0')) {
+ if (*CharPtr == '!')
+ CharPtr++;
+ CharPtr++;
+ }
+ if (*CharPtr == '>') {
+ EndLoc = StrLoc.getFromPointer(CharPtr + 1);
+ return true;
+ }
+ return false;
+}
+
+/// Create a string with the '!' escape characters removed.
+static std::string angleBracketString(StringRef AltMacroStr) {
+ std::string Res;
+ for (size_t Pos = 0; Pos < AltMacroStr.size(); Pos++) {
+ if (AltMacroStr[Pos] == '!')
+ Pos++;
+ Res += AltMacroStr[Pos];
+ }
+ return Res;
+}
+
+/// Parse an expression and return it.
+///
+/// expr ::= expr &&,|| expr -> lowest.
+/// expr ::= expr |,^,&,! expr
+/// expr ::= expr ==,!=,<>,<,<=,>,>= expr
+/// expr ::= expr <<,>> expr
+/// expr ::= expr +,- expr
+/// expr ::= expr *,/,% expr -> highest.
+/// expr ::= primaryexpr
+///
+bool MasmParser::parseExpression(const MCExpr *&Res, SMLoc &EndLoc) {
+ // Parse the expression.
+ Res = nullptr;
+ if (getTargetParser().parsePrimaryExpr(Res, EndLoc) ||
+ parseBinOpRHS(1, Res, EndLoc))
+ return true;
+
+ // Try to constant fold it up front, if possible. Do not rely on the
+ // assembler here.
+ int64_t Value;
+ if (Res->evaluateAsAbsolute(Value))
+ Res = MCConstantExpr::create(Value, getContext());
+
+ return false;
+}
+
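+/// Parse a parenthesized expression (the leading '(' is assumed to have been
+/// consumed already), followed by any trailing binary operators.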
+bool MasmParser::parseParenExpression(const MCExpr *&Res, SMLoc &EndLoc) {
+ Res = nullptr;
+ return parseParenExpr(Res, EndLoc) || parseBinOpRHS(1, Res, EndLoc);
+}
+
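+/// Parse an expression wrapped in ParenDepth levels of parentheses, where the
+/// opening '(' tokens have already been consumed by the caller.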
+bool MasmParser::parseParenExprOfDepth(unsigned ParenDepth, const MCExpr *&Res,
+ SMLoc &EndLoc) {
+ if (parseParenExpr(Res, EndLoc))
+ return true;
+
+ for (; ParenDepth > 0; --ParenDepth) {
+ if (parseBinOpRHS(1, Res, EndLoc))
+ return true;
+
+ // We don't Lex() the last RParen.
+ // This is the same behavior as parseParenExpression().
+ if (ParenDepth - 1 > 0) {
+ EndLoc = getTok().getEndLoc();
+ if (parseToken(AsmToken::RParen,
+ "expected ')' in parentheses expression"))
+ return true;
+ }
+ }
+ return false;
+}
+
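+/// Parse an expression and require it to evaluate to an absolute value;
+/// otherwise report an error at the start of the expression.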
+bool MasmParser::parseAbsoluteExpression(int64_t &Res) {
+ const MCExpr *Expr;
+
+ SMLoc StartLoc = Lexer.getLoc();
+ if (parseExpression(Expr))
+ return true;
+
+ if (!Expr->evaluateAsAbsolute(Res, getStreamer().getAssemblerPtr()))
+ return Error(StartLoc, "expected absolute expression");
+
+ return false;
+}
+
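+/// Return the precedence of the binary operator token K (higher binds more
+/// tightly), or 0 if K is not a binary operator. The corresponding
+/// MCBinaryExpr opcode is returned through Kind. When EndExpressionAtGreater
+/// is set, '>' and '>>' terminate the expression instead of acting as
+/// operators.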
+static unsigned getGNUBinOpPrecedence(AsmToken::TokenKind K,
+ MCBinaryExpr::Opcode &Kind,
+ bool ShouldUseLogicalShr,
+ bool EndExpressionAtGreater) {
+ switch (K) {
+ default:
+ return 0; // not a binop.
+
+ // Lowest Precedence: &&, ||
+ case AsmToken::AmpAmp:
+ Kind = MCBinaryExpr::LAnd;
+ return 2;
+ case AsmToken::PipePipe:
+ Kind = MCBinaryExpr::LOr;
+ return 1;
+
+ // Low Precedence: ==, !=, <>, <, <=, >, >=
+ case AsmToken::EqualEqual:
+ Kind = MCBinaryExpr::EQ;
+ return 3;
+ case AsmToken::ExclaimEqual:
+ case AsmToken::LessGreater:
+ Kind = MCBinaryExpr::NE;
+ return 3;
+ case AsmToken::Less:
+ Kind = MCBinaryExpr::LT;
+ return 3;
+ case AsmToken::LessEqual:
+ Kind = MCBinaryExpr::LTE;
+ return 3;
+ case AsmToken::Greater:
+ if (EndExpressionAtGreater)
+ return 0;
+ Kind = MCBinaryExpr::GT;
+ return 3;
+ case AsmToken::GreaterEqual:
+ Kind = MCBinaryExpr::GTE;
+ return 3;
+
+ // Low Intermediate Precedence: +, -
+ case AsmToken::Plus:
+ Kind = MCBinaryExpr::Add;
+ return 4;
+ case AsmToken::Minus:
+ Kind = MCBinaryExpr::Sub;
+ return 4;
+
+ // High Intermediate Precedence: |, &, ^
+ //
+ // FIXME: gas seems to support '!' as an infix operator?
+ case AsmToken::Pipe:
+ Kind = MCBinaryExpr::Or;
+ return 5;
+ case AsmToken::Caret:
+ Kind = MCBinaryExpr::Xor;
+ return 5;
+ case AsmToken::Amp:
+ Kind = MCBinaryExpr::And;
+ return 5;
+
+ // Highest Precedence: *, /, %, <<, >>
+ case AsmToken::Star:
+ Kind = MCBinaryExpr::Mul;
+ return 6;
+ case AsmToken::Slash:
+ Kind = MCBinaryExpr::Div;
+ return 6;
+ case AsmToken::Percent:
+ Kind = MCBinaryExpr::Mod;
+ return 6;
+ case AsmToken::LessLess:
+ Kind = MCBinaryExpr::Shl;
+ return 6;
+ case AsmToken::GreaterGreater:
+ if (EndExpressionAtGreater)
+ return 0;
+ Kind = ShouldUseLogicalShr ? MCBinaryExpr::LShr : MCBinaryExpr::AShr;
+ return 6;
+ }
+}
+
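+/// Use GNU-style operator precedence, treating '>' and '>>' as expression
+/// terminators while inside an angle-bracket (<...>) string.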
+unsigned MasmParser::getBinOpPrecedence(AsmToken::TokenKind K,
+ MCBinaryExpr::Opcode &Kind) {
+ bool ShouldUseLogicalShr = MAI.shouldUseLogicalShr();
+ return getGNUBinOpPrecedence(K, Kind, ShouldUseLogicalShr,
+ AngleBracketDepth > 0);
+}
+
+/// Parse all binary operators with precedence >= 'Precedence'.
+/// Res contains the LHS of the expression on input.
+bool MasmParser::parseBinOpRHS(unsigned Precedence, const MCExpr *&Res,
+ SMLoc &EndLoc) {
+ SMLoc StartLoc = Lexer.getLoc();
+ while (true) {
+ MCBinaryExpr::Opcode Kind = MCBinaryExpr::Add;
+ unsigned TokPrec = getBinOpPrecedence(Lexer.getKind(), Kind);
+
+ // If the next token is lower precedence than we are allowed to eat, return
+ // successfully with what we ate already.
+ if (TokPrec < Precedence)
+ return false;
+
+ Lex();
+
+ // Eat the next primary expression.
+ const MCExpr *RHS;
+ if (getTargetParser().parsePrimaryExpr(RHS, EndLoc))
+ return true;
+
+ // If BinOp binds less tightly with RHS than the operator after RHS, let
+ // the pending operator take RHS as its LHS.
+ MCBinaryExpr::Opcode Dummy;
+ unsigned NextTokPrec = getBinOpPrecedence(Lexer.getKind(), Dummy);
+ if (TokPrec < NextTokPrec && parseBinOpRHS(TokPrec + 1, RHS, EndLoc))
+ return true;
+
+ // Merge LHS and RHS according to operator.
+ Res = MCBinaryExpr::create(Kind, Res, RHS, getContext(), StartLoc);
+ }
+}
+
+/// ParseStatement:
+/// ::= EndOfStatement
+/// ::= Label* Directive ...Operands... EndOfStatement
+/// ::= Label* Identifier OperandList* EndOfStatement
+bool MasmParser::parseStatement(ParseStatementInfo &Info,
+ MCAsmParserSemaCallback *SI) {
+ assert(!hasPendingError() && "parseStatement started with pending error");
+ // Eat initial spaces and comments.
+ while (Lexer.is(AsmToken::Space))
+ Lex();
+ if (Lexer.is(AsmToken::EndOfStatement)) {
+ // If this is a line comment we can drop it safely.
+ if (getTok().getString().empty() || getTok().getString().front() == '\r' ||
+ getTok().getString().front() == '\n')
+ Out.AddBlankLine();
+ Lex();
+ return false;
+ }
+ // Statements always start with an identifier, unless we're dealing with a
+ // processor directive (.386, .686, etc.) that lexes as a real.
+ AsmToken ID = getTok();
+ SMLoc IDLoc = ID.getLoc();
+ StringRef IDVal;
+ int64_t LocalLabelVal = -1;
+ if (Lexer.is(AsmToken::HashDirective))
+ return parseCppHashLineFilenameComment(IDLoc);
+ // Allow an integer followed by a ':' as a directional local label.
+ if (Lexer.is(AsmToken::Integer)) {
+ LocalLabelVal = getTok().getIntVal();
+ if (LocalLabelVal < 0) {
+ if (!TheCondState.Ignore) {
+ Lex(); // always eat a token
+ return Error(IDLoc, "unexpected token at start of statement");
+ }
+ IDVal = "";
+ } else {
+ IDVal = getTok().getString();
+ Lex(); // Consume the integer token to be used as an identifier token.
+ if (Lexer.getKind() != AsmToken::Colon) {
+ if (!TheCondState.Ignore) {
+ Lex(); // always eat a token
+ return Error(IDLoc, "unexpected token at start of statement");
+ }
+ }
+ }
+ } else if (Lexer.is(AsmToken::Dot)) {
+ // Treat '.' as a valid identifier in this context.
+ Lex();
+ IDVal = ".";
+ } else if (Lexer.is(AsmToken::LCurly)) {
+ // Treat '{' as a valid identifier in this context.
+ Lex();
+ IDVal = "{";
+
+ } else if (Lexer.is(AsmToken::RCurly)) {
+ // Treat '}' as a valid identifier in this context.
+ Lex();
+ IDVal = "}";
+ } else if (Lexer.is(AsmToken::Star) &&
+ getTargetParser().starIsStartOfStatement()) {
+ // Accept '*' as a valid start of statement.
+ Lex();
+ IDVal = "*";
+ } else if (Lexer.is(AsmToken::Real)) {
+ // Treat ".<number>" as a valid identifier in this context.
+ IDVal = getTok().getString();
+ Lex(); // always eat a token
+ if (!IDVal.startswith("."))
+ return Error(IDLoc, "unexpected token at start of statement");
+ } else if (parseIdentifier(IDVal)) {
+ if (!TheCondState.Ignore) {
+ Lex(); // always eat a token
+ return Error(IDLoc, "unexpected token at start of statement");
+ }
+ IDVal = "";
+ }
+
+ // Handle conditional assembly here before checking for skipping. We
+ // have to do this so that .endif isn't skipped in a ".if 0" block for
+ // example.
+ StringMap<DirectiveKind>::const_iterator DirKindIt =
+ DirectiveKindMap.find(IDVal.lower());
+ DirectiveKind DirKind = (DirKindIt == DirectiveKindMap.end())
+ ? DK_NO_DIRECTIVE
+ : DirKindIt->getValue();
+ switch (DirKind) {
+ default:
+ break;
+ case DK_IF:
+ case DK_IFE:
+ return parseDirectiveIf(IDLoc, DirKind);
+ case DK_IFB:
+ return parseDirectiveIfb(IDLoc, true);
+ case DK_IFNB:
+ return parseDirectiveIfb(IDLoc, false);
+ case DK_IFDEF:
+ return parseDirectiveIfdef(IDLoc, true);
+ case DK_IFNDEF:
+ return parseDirectiveIfdef(IDLoc, false);
+ case DK_IFDIF:
+ return parseDirectiveIfidn(IDLoc, /*ExpectEqual=*/false,
+ /*CaseInsensitive=*/false);
+ case DK_IFDIFI:
+ return parseDirectiveIfidn(IDLoc, /*ExpectEqual=*/false,
+ /*CaseInsensitive=*/true);
+ case DK_IFIDN:
+ return parseDirectiveIfidn(IDLoc, /*ExpectEqual=*/true,
+ /*CaseInsensitive=*/false);
+ case DK_IFIDNI:
+ return parseDirectiveIfidn(IDLoc, /*ExpectEqual=*/true,
+ /*CaseInsensitive=*/true);
+ case DK_ELSEIF:
+ case DK_ELSEIFE:
+ return parseDirectiveElseIf(IDLoc, DirKind);
+ case DK_ELSEIFB:
+ return parseDirectiveElseIfb(IDLoc, true);
+ case DK_ELSEIFNB:
+ return parseDirectiveElseIfb(IDLoc, false);
+ case DK_ELSEIFDEF:
+ return parseDirectiveElseIfdef(IDLoc, true);
+ case DK_ELSEIFNDEF:
+ return parseDirectiveElseIfdef(IDLoc, false);
+ case DK_ELSEIFDIF:
+ return parseDirectiveElseIfidn(IDLoc, /*ExpectEqual=*/false,
+ /*CaseInsensitive=*/false);
+ case DK_ELSEIFDIFI:
+ return parseDirectiveElseIfidn(IDLoc, /*ExpectEqual=*/false,
+ /*CaseInsensitive=*/true);
+ case DK_ELSEIFIDN:
+ return parseDirectiveElseIfidn(IDLoc, /*ExpectEqual=*/true,
+ /*CaseInsensitive=*/false);
+ case DK_ELSEIFIDNI:
+ return parseDirectiveElseIfidn(IDLoc, /*ExpectEqual=*/true,
+ /*CaseInsensitive=*/true);
+ case DK_ELSE:
+ return parseDirectiveElse(IDLoc);
+ case DK_ENDIF:
+ return parseDirectiveEndIf(IDLoc);
+ }
+
+ // Ignore the statement if in the middle of inactive conditional
+ // (e.g. ".if 0").
+ if (TheCondState.Ignore) {
+ eatToEndOfStatement();
+ return false;
+ }
+
+ // FIXME: Recurse on local labels?
+
+ // See what kind of statement we have.
+ switch (Lexer.getKind()) {
+ case AsmToken::Colon: {
+ if (!getTargetParser().isLabel(ID))
+ break;
+ if (checkForValidSection())
+ return true;
+
+ // identifier ':' -> Label.
+ Lex();
+
+ // Diagnose attempt to use '.' as a label.
+ if (IDVal == ".")
+ return Error(IDLoc, "invalid use of pseudo-symbol '.' as a label");
+
+ // Diagnose attempt to use a variable as a label.
+ //
+ // FIXME: Diagnostics. Note the location of the definition as a label.
+ // FIXME: This doesn't diagnose assignment to a symbol which has been
+ // implicitly marked as external.
+ MCSymbol *Sym;
+ if (LocalLabelVal == -1) {
+ if (ParsingMSInlineAsm && SI) {
+ StringRef RewrittenLabel =
+ SI->LookupInlineAsmLabel(IDVal, getSourceManager(), IDLoc, true);
+ assert(!RewrittenLabel.empty() &&
+ "We should have an internal name here.");
+ Info.AsmRewrites->emplace_back(AOK_Label, IDLoc, IDVal.size(),
+ RewrittenLabel);
+ IDVal = RewrittenLabel;
+ }
+ Sym = getContext().getOrCreateSymbol(IDVal);
+ } else
+ Sym = Ctx.createDirectionalLocalSymbol(LocalLabelVal);
+ // The end of a label should be treated as the end of a line for lexing
+ // purposes, but that information is not available to the lexer, which
+ // does not understand labels. This may cause us to see a Hash here
+ // instead of a preprocessor line comment.
+ if (getTok().is(AsmToken::Hash)) {
+ StringRef CommentStr = parseStringToEndOfStatement();
+ Lexer.Lex();
+ Lexer.UnLex(AsmToken(AsmToken::EndOfStatement, CommentStr));
+ }
+
+ // Consume any end of statement token, if present, to avoid spurious
+ // AddBlankLine() calls.
+ if (getTok().is(AsmToken::EndOfStatement)) {
+ Lex();
+ }
+
+ getTargetParser().doBeforeLabelEmit(Sym);
+
+ // Emit the label.
+ if (!getTargetParser().isParsingMSInlineAsm())
+ Out.emitLabel(Sym, IDLoc);
+
+ // If we are generating dwarf for assembly source files then gather the
+ // info to make a dwarf label entry for this label if needed.
+ if (enabledGenDwarfForAssembly())
+ MCGenDwarfLabelEntry::Make(Sym, &getStreamer(), getSourceManager(),
+ IDLoc);
+
+ getTargetParser().onLabelParsed(Sym);
+
+ return false;
+ }
+
+ default: // Normal instruction or directive.
+ break;
+ }
+
+ // If macros are enabled, check to see if this is a macro instantiation.
+ if (const MCAsmMacro *M = getContext().lookupMacro(IDVal)) {
+ return handleMacroEntry(M, IDLoc);
+ }
+
+ // Otherwise, we have a normal instruction or directive.
+
+ if (DirKind != DK_NO_DIRECTIVE) {
+ // There are several entities interested in parsing directives:
+ //
+ // 1. Asm parser extensions. For example, platform-specific parsers
+ // (like the ELF parser) register themselves as extensions.
+ // 2. The target-specific assembly parser. Some directives are target
+ // specific or may potentially behave differently on certain targets.
+ // 3. The generic directive parser implemented by this class. These are
+ // all the directives that behave in a target and platform independent
+ // manner, or at least have a default behavior that's shared between
+ // all targets and platforms.
+
+ getTargetParser().flushPendingInstructions(getStreamer());
+
+ // Special-case handling of structure-end directives at higher priority,
+ // since ENDS is overloaded as a segment-end directive.
+ if (IDVal.equals_lower("ends") && StructInProgress.size() > 1 &&
+ getTok().is(AsmToken::EndOfStatement)) {
+ return parseDirectiveNestedEnds();
+ }
+
+ // First, check the extension directive map to see if any extension has
+ // registered itself to parse this directive.
+ std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
+ ExtensionDirectiveMap.lookup(IDVal.lower());
+ if (Handler.first)
+ return (*Handler.second)(Handler.first, IDVal, IDLoc);
+
+ // Next, let the target-specific assembly parser try.
+ SMLoc StartTokLoc = getTok().getLoc();
+ bool TPDirectiveReturn =
+ ID.is(AsmToken::Identifier) && getTargetParser().ParseDirective(ID);
+
+ if (hasPendingError())
+ return true;
+ // Currently the return value should be true if we are
+ // uninterested, but as this is at odds with the standard parsing
+ // convention (return true == error), we have instances of a parsed
+ // directive that fails by returning true as an error. Catch these
+ // cases as best as possible here.
+ if (TPDirectiveReturn && StartTokLoc != getTok().getLoc())
+ return true;
+ // Return if we did some parsing or believe we succeeded.
+ if (!TPDirectiveReturn || StartTokLoc != getTok().getLoc())
+ return false;
+
+ // Finally, if no one else is interested in this directive, it must be
+ // generic and familiar to this class.
+ switch (DirKind) {
+ default:
+ break;
+ case DK_ASCII:
+ return parseDirectiveAscii(IDVal, false);
+ case DK_ASCIZ:
+ case DK_STRING:
+ return parseDirectiveAscii(IDVal, true);
+ case DK_BYTE:
+ case DK_SBYTE:
+ case DK_DB:
+ return parseDirectiveValue(IDVal, 1);
+ case DK_WORD:
+ case DK_SWORD:
+ case DK_DW:
+ return parseDirectiveValue(IDVal, 2);
+ case DK_DWORD:
+ case DK_SDWORD:
+ case DK_DD:
+ return parseDirectiveValue(IDVal, 4);
+ case DK_FWORD:
+ return parseDirectiveValue(IDVal, 6);
+ case DK_QWORD:
+ case DK_SQWORD:
+ case DK_DQ:
+ return parseDirectiveValue(IDVal, 8);
+ case DK_REAL4:
+ return parseDirectiveRealValue(IDVal, APFloat::IEEEsingle());
+ case DK_REAL8:
+ return parseDirectiveRealValue(IDVal, APFloat::IEEEdouble());
+ case DK_STRUCT:
+ case DK_UNION:
+ return parseDirectiveNestedStruct(IDVal, DirKind);
+ case DK_ENDS:
+ return parseDirectiveNestedEnds();
+ case DK_ALIGN:
+ return parseDirectiveAlign();
+ case DK_ORG:
+ return parseDirectiveOrg();
+ case DK_EXTERN:
+ eatToEndOfStatement(); // .extern is the default, ignore it.
+ return false;
+ case DK_PUBLIC:
+ return parseDirectiveSymbolAttribute(MCSA_Global);
+ case DK_COMM:
+ return parseDirectiveComm(/*IsLocal=*/false);
+ case DK_COMMENT:
+ return parseDirectiveComment(IDLoc);
+ case DK_INCLUDE:
+ return parseDirectiveInclude();
+ case DK_REPT:
+ return parseDirectiveRept(IDLoc, IDVal);
+ case DK_IRP:
+ return parseDirectiveIrp(IDLoc);
+ case DK_IRPC:
+ return parseDirectiveIrpc(IDLoc);
+ case DK_ENDR:
+ return parseDirectiveEndr(IDLoc);
+ case DK_FILE:
+ return parseDirectiveFile(IDLoc);
+ case DK_LINE:
+ return parseDirectiveLine();
+ case DK_LOC:
+ return parseDirectiveLoc();
+ case DK_STABS:
+ return parseDirectiveStabs();
+ case DK_CV_FILE:
+ return parseDirectiveCVFile();
+ case DK_CV_FUNC_ID:
+ return parseDirectiveCVFuncId();
+ case DK_CV_INLINE_SITE_ID:
+ return parseDirectiveCVInlineSiteId();
+ case DK_CV_LOC:
+ return parseDirectiveCVLoc();
+ case DK_CV_LINETABLE:
+ return parseDirectiveCVLinetable();
+ case DK_CV_INLINE_LINETABLE:
+ return parseDirectiveCVInlineLinetable();
+ case DK_CV_DEF_RANGE:
+ return parseDirectiveCVDefRange();
+ case DK_CV_STRING:
+ return parseDirectiveCVString();
+ case DK_CV_STRINGTABLE:
+ return parseDirectiveCVStringTable();
+ case DK_CV_FILECHECKSUMS:
+ return parseDirectiveCVFileChecksums();
+ case DK_CV_FILECHECKSUM_OFFSET:
+ return parseDirectiveCVFileChecksumOffset();
+ case DK_CV_FPO_DATA:
+ return parseDirectiveCVFPOData();
+ case DK_CFI_SECTIONS:
+ return parseDirectiveCFISections();
+ case DK_CFI_STARTPROC:
+ return parseDirectiveCFIStartProc();
+ case DK_CFI_ENDPROC:
+ return parseDirectiveCFIEndProc();
+ case DK_CFI_DEF_CFA:
+ return parseDirectiveCFIDefCfa(IDLoc);
+ case DK_CFI_DEF_CFA_OFFSET:
+ return parseDirectiveCFIDefCfaOffset();
+ case DK_CFI_ADJUST_CFA_OFFSET:
+ return parseDirectiveCFIAdjustCfaOffset();
+ case DK_CFI_DEF_CFA_REGISTER:
+ return parseDirectiveCFIDefCfaRegister(IDLoc);
+ case DK_CFI_OFFSET:
+ return parseDirectiveCFIOffset(IDLoc);
+ case DK_CFI_REL_OFFSET:
+ return parseDirectiveCFIRelOffset(IDLoc);
+ case DK_CFI_PERSONALITY:
+ return parseDirectiveCFIPersonalityOrLsda(true);
+ case DK_CFI_LSDA:
+ return parseDirectiveCFIPersonalityOrLsda(false);
+ case DK_CFI_REMEMBER_STATE:
+ return parseDirectiveCFIRememberState();
+ case DK_CFI_RESTORE_STATE:
+ return parseDirectiveCFIRestoreState();
+ case DK_CFI_SAME_VALUE:
+ return parseDirectiveCFISameValue(IDLoc);
+ case DK_CFI_RESTORE:
+ return parseDirectiveCFIRestore(IDLoc);
+ case DK_CFI_ESCAPE:
+ return parseDirectiveCFIEscape();
+ case DK_CFI_RETURN_COLUMN:
+ return parseDirectiveCFIReturnColumn(IDLoc);
+ case DK_CFI_SIGNAL_FRAME:
+ return parseDirectiveCFISignalFrame();
+ case DK_CFI_UNDEFINED:
+ return parseDirectiveCFIUndefined(IDLoc);
+ case DK_CFI_REGISTER:
+ return parseDirectiveCFIRegister(IDLoc);
+ case DK_CFI_WINDOW_SAVE:
+ return parseDirectiveCFIWindowSave();
+ case DK_MACRO:
+ return parseDirectiveMacro(IDLoc);
+ case DK_ALTMACRO:
+ case DK_NOALTMACRO:
+ return parseDirectiveAltmacro(IDVal);
+ case DK_EXITM:
+ return parseDirectiveExitMacro(IDVal);
+ case DK_ENDM:
+ return parseDirectiveEndMacro(IDVal);
+ case DK_PURGEM:
+ return parseDirectivePurgeMacro(IDLoc);
+ case DK_END:
+ return parseDirectiveEnd(IDLoc);
+ case DK_ERR:
+ return parseDirectiveError(IDLoc);
+ case DK_ERRB:
+ return parseDirectiveErrorIfb(IDLoc, true);
+ case DK_ERRNB:
+ return parseDirectiveErrorIfb(IDLoc, false);
+ case DK_ERRDEF:
+ return parseDirectiveErrorIfdef(IDLoc, true);
+ case DK_ERRNDEF:
+ return parseDirectiveErrorIfdef(IDLoc, false);
+ case DK_ERRDIF:
+ return parseDirectiveErrorIfidn(IDLoc, /*ExpectEqual=*/false,
+ /*CaseInsensitive=*/false);
+ case DK_ERRDIFI:
+ return parseDirectiveErrorIfidn(IDLoc, /*ExpectEqual=*/false,
+ /*CaseInsensitive=*/true);
+ case DK_ERRIDN:
+ return parseDirectiveErrorIfidn(IDLoc, /*ExpectEqual=*/true,
+ /*CaseInsensitive=*/false);
+ case DK_ERRIDNI:
+ return parseDirectiveErrorIfidn(IDLoc, /*ExpectEqual=*/true,
+ /*CaseInsensitive=*/true);
+ case DK_ERRE:
+ return parseDirectiveErrorIfe(IDLoc, true);
+ case DK_ERRNZ:
+ return parseDirectiveErrorIfe(IDLoc, false);
+ case DK_ECHO:
+ return parseDirectiveEcho();
+ }
+
+ return Error(IDLoc, "unknown directive");
+ }
+
+ // We also check if this is allocating memory with a user-defined type.
+ auto IDIt = Structs.find(IDVal.lower());
+ if (IDIt != Structs.end())
+ return parseDirectiveStructValue(/*Structure=*/IDIt->getValue(), IDVal,
+ IDLoc);
+
+ // Non-conditional Microsoft directives sometimes follow their first argument.
+ const AsmToken nextTok = getTok();
+ const StringRef nextVal = nextTok.getString();
+ const SMLoc nextLoc = nextTok.getLoc();
+
+ // There are several entities interested in parsing infix directives:
+ //
+ // 1. Asm parser extensions. For example, platform-specific parsers
+ // (like the ELF parser) register themselves as extensions.
+ // 2. The generic directive parser implemented by this class. These are
+ // all the directives that behave in a target and platform independent
+ // manner, or at least have a default behavior that's shared between
+ // all targets and platforms.
+
+ getTargetParser().flushPendingInstructions(getStreamer());
+
+ // Special-case handling of structure-end directives at higher priority, since
+ // ENDS is overloaded as a segment-end directive.
+ if (nextVal.equals_lower("ends") && StructInProgress.size() == 1) {
+ Lex();
+ return parseDirectiveEnds(IDVal, IDLoc);
+ }
+
+ // First, check the extension directive map to see if any extension has
+ // registered itself to parse this directive.
+ std::pair<MCAsmParserExtension *, DirectiveHandler> Handler =
+ ExtensionDirectiveMap.lookup(nextVal.lower());
+ if (Handler.first) {
+ Lex();
+ Lexer.UnLex(ID);
+ return (*Handler.second)(Handler.first, nextVal, nextLoc);
+ }
+
+ // If no one else is interested in this directive, it must be
+ // generic and familiar to this class.
+ DirKindIt = DirectiveKindMap.find(nextVal.lower());
+ DirKind = (DirKindIt == DirectiveKindMap.end())
+ ? DK_NO_DIRECTIVE
+ : DirKindIt->getValue();
+ switch (DirKind) {
+ default:
+ break;
+ case DK_ASSIGN:
+ case DK_EQU:
+ case DK_TEXTEQU:
+ Lex();
+ return parseDirectiveEquate(nextVal, IDVal, DirKind);
+ case DK_BYTE:
+ case DK_DB:
+ Lex();
+ return parseDirectiveNamedValue(nextVal, 1, IDVal, IDLoc);
+ case DK_WORD:
+ case DK_DW:
+ Lex();
+ return parseDirectiveNamedValue(nextVal, 2, IDVal, IDLoc);
+ case DK_DWORD:
+ case DK_DD:
+ Lex();
+ return parseDirectiveNamedValue(nextVal, 4, IDVal, IDLoc);
+ case DK_FWORD:
+ Lex();
+ return parseDirectiveNamedValue(nextVal, 6, IDVal, IDLoc);
+ case DK_QWORD:
+ case DK_DQ:
+ Lex();
+ return parseDirectiveNamedValue(nextVal, 8, IDVal, IDLoc);
+ case DK_REAL4:
+ Lex();
+ return parseDirectiveNamedRealValue(nextVal, APFloat::IEEEsingle(), IDVal,
+ IDLoc);
+ case DK_REAL8:
+ Lex();
+ return parseDirectiveNamedRealValue(nextVal, APFloat::IEEEdouble(), IDVal,
+ IDLoc);
+ case DK_STRUCT:
+ case DK_UNION:
+ Lex();
+ return parseDirectiveStruct(nextVal, DirKind, IDVal, IDLoc);
+ case DK_ENDS:
+ Lex();
+ return parseDirectiveEnds(IDVal, IDLoc);
+ }
+
+ // Finally, we check if this is allocating a variable with a user-defined type.
+ auto NextIt = Structs.find(nextVal.lower());
+ if (NextIt != Structs.end()) {
+ Lex();
+ return parseDirectiveNamedStructValue(/*Structure=*/NextIt->getValue(),
+ nextVal, nextLoc, IDVal);
+ }
+
+ // __asm _emit or __asm __emit
+ if (ParsingMSInlineAsm && (IDVal == "_emit" || IDVal == "__emit" ||
+ IDVal == "_EMIT" || IDVal == "__EMIT"))
+ return parseDirectiveMSEmit(IDLoc, Info, IDVal.size());
+
+ // __asm align
+ if (ParsingMSInlineAsm && (IDVal == "align" || IDVal == "ALIGN"))
+ return parseDirectiveMSAlign(IDLoc, Info);
+
+ if (ParsingMSInlineAsm && (IDVal == "even" || IDVal == "EVEN"))
+ Info.AsmRewrites->emplace_back(AOK_EVEN, IDLoc, 4);
+ if (checkForValidSection())
+ return true;
+
+ // Canonicalize the opcode to lower case.
+ std::string OpcodeStr = IDVal.lower();
+ ParseInstructionInfo IInfo(Info.AsmRewrites);
+ bool ParseHadError = getTargetParser().ParseInstruction(IInfo, OpcodeStr, ID,
+ Info.ParsedOperands);
+ Info.ParseError = ParseHadError;
+
+ // Dump the parsed representation, if requested.
+ if (getShowParsedOperands()) {
+ SmallString<256> Str;
+ raw_svector_ostream OS(Str);
+ OS << "parsed instruction: [";
+ for (unsigned i = 0; i != Info.ParsedOperands.size(); ++i) {
+ if (i != 0)
+ OS << ", ";
+ Info.ParsedOperands[i]->print(OS);
+ }
+ OS << "]";
+
+ printMessage(IDLoc, SourceMgr::DK_Note, OS.str());
+ }
+
+ // Fail even if ParseInstruction erroneously returns false.
+ if (hasPendingError() || ParseHadError)
+ return true;
+
+ // If we are generating dwarf for the current section then generate a .loc
+ // directive for the instruction.
+ if (!ParseHadError && enabledGenDwarfForAssembly() &&
+ getContext().getGenDwarfSectionSyms().count(
+ getStreamer().getCurrentSectionOnly())) {
+ unsigned Line;
+ if (ActiveMacros.empty())
+ Line = SrcMgr.FindLineNumber(IDLoc, CurBuffer);
+ else
+ Line = SrcMgr.FindLineNumber(ActiveMacros.front()->InstantiationLoc,
+ ActiveMacros.front()->ExitBuffer);
+
+ // If we previously parsed a cpp hash file line comment, then make sure the
+ // current Dwarf File is for the CppHashFilename; if not, then emit the
+ // Dwarf File table entry for it and adjust the line number for the .loc.
+ if (!CppHashInfo.Filename.empty()) {
+ unsigned FileNumber = getStreamer().emitDwarfFileDirective(
+ 0, StringRef(), CppHashInfo.Filename);
+ getContext().setGenDwarfFileNumber(FileNumber);
+
+ unsigned CppHashLocLineNo =
+ SrcMgr.FindLineNumber(CppHashInfo.Loc, CppHashInfo.Buf);
+ Line = CppHashInfo.LineNumber - 1 + (Line - CppHashLocLineNo);
+ }
+
+ getStreamer().emitDwarfLocDirective(
+ getContext().getGenDwarfFileNumber(), Line, 0,
+ DWARF2_LINE_DEFAULT_IS_STMT ? DWARF2_FLAG_IS_STMT : 0, 0, 0,
+ StringRef());
+ }
+
+ // If parsing succeeded, match the instruction.
+ if (!ParseHadError) {
+ uint64_t ErrorInfo;
+ if (getTargetParser().MatchAndEmitInstruction(
+ IDLoc, Info.Opcode, Info.ParsedOperands, Out, ErrorInfo,
+ getTargetParser().isParsingMSInlineAsm()))
+ return true;
+ }
+ return false;
+}
+
+// Parse and erase curly braces marking block start/end.
+bool MasmParser::parseCurlyBlockScope(
+ SmallVectorImpl<AsmRewrite> &AsmStrRewrites) {
+ // Identify curly brace marking block start/end.
+ if (Lexer.isNot(AsmToken::LCurly) && Lexer.isNot(AsmToken::RCurly))
+ return false;
+
+ SMLoc StartLoc = Lexer.getLoc();
+ Lex(); // Eat the brace.
+ if (Lexer.is(AsmToken::EndOfStatement))
+ Lex(); // Eat EndOfStatement following the brace.
+
+ // Erase the block start/end brace from the output asm string.
+ AsmStrRewrites.emplace_back(AOK_Skip, StartLoc, Lexer.getLoc().getPointer() -
+ StartLoc.getPointer());
+ return true;
+}
+
+/// parseCppHashLineFilenameComment:
+/// ::= # number "filename"
+bool MasmParser::parseCppHashLineFilenameComment(SMLoc L) {
+ Lex(); // Eat the hash token.
+ // The lexer only ever emits a HashDirective token once it is fully formed
+ // and has already done the checking, so a failure here is an internal error.
+ assert(getTok().is(AsmToken::Integer) &&
+ "Lexing Cpp line comment: Expected Integer");
+ int64_t LineNumber = getTok().getIntVal();
+ Lex();
+ assert(getTok().is(AsmToken::String) &&
+ "Lexing Cpp line comment: Expected String");
+ StringRef Filename = getTok().getString();
+ Lex();
+
+ // Get rid of the enclosing quotes.
+ Filename = Filename.substr(1, Filename.size() - 2);
+
+ // Save the SMLoc, Filename and LineNumber for later use by diagnostics
+ // and possibly DWARF file info.
+ CppHashInfo.Loc = L;
+ CppHashInfo.Filename = Filename;
+ CppHashInfo.LineNumber = LineNumber;
+ CppHashInfo.Buf = CurBuffer;
+ if (FirstCppHashFilename.empty())
+ FirstCppHashFilename = Filename;
+ return false;
+}
+
+/// Use the last parsed cpp hash line filename comment, if any, for the
+/// Filename and LineNo in the diagnostic.
+void MasmParser::DiagHandler(const SMDiagnostic &Diag, void *Context) {
+ const MasmParser *Parser = static_cast<const MasmParser *>(Context);
+ raw_ostream &OS = errs();
+
+ const SourceMgr &DiagSrcMgr = *Diag.getSourceMgr();
+ SMLoc DiagLoc = Diag.getLoc();
+ unsigned DiagBuf = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+ unsigned CppHashBuf =
+ Parser->SrcMgr.FindBufferContainingLoc(Parser->CppHashInfo.Loc);
+
+ // Like SourceMgr::printMessage() we need to print the include stack if any
+ // before printing the message.
+ unsigned DiagCurBuffer = DiagSrcMgr.FindBufferContainingLoc(DiagLoc);
+ if (!Parser->SavedDiagHandler && DiagCurBuffer &&
+ DiagCurBuffer != DiagSrcMgr.getMainFileID()) {
+ SMLoc ParentIncludeLoc = DiagSrcMgr.getParentIncludeLoc(DiagCurBuffer);
+ DiagSrcMgr.PrintIncludeStack(ParentIncludeLoc, OS);
+ }
+
+ // If we have not parsed a cpp hash line filename comment or the source
+ // manager changed or buffer changed (like in a nested include) then just
+ // print the normal diagnostic using its Filename and LineNo.
+ if (!Parser->CppHashInfo.LineNumber || &DiagSrcMgr != &Parser->SrcMgr ||
+ DiagBuf != CppHashBuf) {
+ if (Parser->SavedDiagHandler)
+ Parser->SavedDiagHandler(Diag, Parser->SavedDiagContext);
+ else
+ Diag.print(nullptr, OS);
+ return;
+ }
+
+ // Use the CppHashFilename and calculate a line number based on the
+ // CppHashInfo.Loc and CppHashInfo.LineNumber relative to this Diag's SMLoc
+ // for the diagnostic.
+ const std::string &Filename = std::string(Parser->CppHashInfo.Filename);
+
+ int DiagLocLineNo = DiagSrcMgr.FindLineNumber(DiagLoc, DiagBuf);
+ int CppHashLocLineNo =
+ Parser->SrcMgr.FindLineNumber(Parser->CppHashInfo.Loc, CppHashBuf);
+ int LineNo =
+ Parser->CppHashInfo.LineNumber - 1 + (DiagLocLineNo - CppHashLocLineNo);
+
+ SMDiagnostic NewDiag(*Diag.getSourceMgr(), Diag.getLoc(), Filename, LineNo,
+ Diag.getColumnNo(), Diag.getKind(), Diag.getMessage(),
+ Diag.getLineContents(), Diag.getRanges());
+
+ if (Parser->SavedDiagHandler)
+ Parser->SavedDiagHandler(NewDiag, Parser->SavedDiagContext);
+ else
+ NewDiag.print(nullptr, OS);
+}
+
+// FIXME: This is mostly duplicated from the function in AsmLexer.cpp. The
+// difference is that that function accepts '@' as part of identifiers and
+// we can't do that. AsmLexer.cpp should probably be changed to handle
+// '@' as a special case when needed.
+static bool isIdentifierChar(char c) {
+ return isalnum(static_cast<unsigned char>(c)) || c == '_' || c == '$' ||
+ c == '.';
+}
+
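+/// Expand the macro body Body into OS, substituting formal parameter
+/// references with the corresponding argument token lists in A. In Darwin
+/// mode, a macro defined without parameters instead substitutes $0-$9, $n
+/// (the argument count), and $$ (a literal '$').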
+bool MasmParser::expandMacro(raw_svector_ostream &OS, StringRef Body,
+ ArrayRef<MCAsmMacroParameter> Parameters,
+ ArrayRef<MCAsmMacroArgument> A,
+ bool EnableAtPseudoVariable, SMLoc L) {
+ unsigned NParameters = Parameters.size();
+ bool HasVararg = NParameters ? Parameters.back().Vararg : false;
+ if ((!IsDarwin || NParameters != 0) && NParameters != A.size())
+ return Error(L, "Wrong number of arguments");
+
+ // A macro without parameters is handled differently on Darwin:
+ // gas accepts no arguments and does no substitutions
+ while (!Body.empty()) {
+ // Scan for the next substitution.
+ std::size_t End = Body.size(), Pos = 0;
+ for (; Pos != End; ++Pos) {
+ // Check for a substitution or escape.
+ if (IsDarwin && !NParameters) {
+ // This macro has no parameters, look for $0, $1, etc.
+ if (Body[Pos] != '$' || Pos + 1 == End)
+ continue;
+
+ char Next = Body[Pos + 1];
+ if (Next == '$' || Next == 'n' ||
+ isdigit(static_cast<unsigned char>(Next)))
+ break;
+ } else {
+ // This macro has parameters, look for \foo, \bar, etc.
+ if (Body[Pos] == '\\' && Pos + 1 != End)
+ break;
+ }
+ }
+
+ // Add the prefix.
+ OS << Body.slice(0, Pos);
+
+ // Check if we reached the end.
+ if (Pos == End)
+ break;
+
+ if (IsDarwin && !NParameters) {
+ switch (Body[Pos + 1]) {
+ // $$ => $
+ case '$':
+ OS << '$';
+ break;
+
+ // $n => number of arguments
+ case 'n':
+ OS << A.size();
+ break;
+
+ // $[0-9] => argument
+ default: {
+ // Missing arguments are ignored.
+ unsigned Index = Body[Pos + 1] - '0';
+ if (Index >= A.size())
+ break;
+
+ // Otherwise substitute with the token values, with spaces eliminated.
+ for (const AsmToken &Token : A[Index])
+ OS << Token.getString();
+ break;
+ }
+ }
+ Pos += 2;
+ } else {
+ unsigned I = Pos + 1;
+
+ // Check for the \@ pseudo-variable.
+ if (EnableAtPseudoVariable && Body[I] == '@' && I + 1 != End)
+ ++I;
+ else
+ while (isIdentifierChar(Body[I]) && I + 1 != End)
+ ++I;
+
+ const char *Begin = Body.data() + Pos + 1;
+ StringRef Argument(Begin, I - (Pos + 1));
+ unsigned Index = 0;
+
+ if (Argument == "@") {
+ OS << NumOfMacroInstantiations;
+ Pos += 2;
+ } else {
+ for (; Index < NParameters; ++Index)
+ if (Parameters[Index].Name == Argument)
+ break;
+
+ if (Index == NParameters) {
+ if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+ Pos += 3;
+ else {
+ OS << '\\' << Argument;
+ Pos = I;
+ }
+ } else {
+ bool VarargParameter = HasVararg && Index == (NParameters - 1);
+ for (const AsmToken &Token : A[Index])
+ // For altmacro mode, you can write '%expr'.
+ // The prefix '%' evaluates the expression 'expr'
+ // and uses the result as a string (e.g. replace %(1+2) with the
+ // string "3").
+ // Here, we identify the integer token which is the result of the
+ // absolute expression evaluation and replace it with its string
+ // representation.
+ if (AltMacroMode && Token.getString().front() == '%' &&
+ Token.is(AsmToken::Integer))
+ // Emit an integer value to the buffer.
+ OS << Token.getIntVal();
+ // Only a token that was validated as a string and begins with '<'
+ // is considered an altmacro string.
+ else if (AltMacroMode && Token.getString().front() == '<' &&
+ Token.is(AsmToken::String)) {
+ OS << angleBracketString(Token.getStringContents());
+ }
+ // We expect no quotes around the string's contents when
+ // parsing for varargs.
+ else if (Token.isNot(AsmToken::String) || VarargParameter)
+ OS << Token.getString();
+ else
+ OS << Token.getStringContents();
+
+ Pos += 1 + Argument.size();
+ }
+ }
+ }
+ // Update the scan point.
+ Body = Body.substr(Pos);
+ }
+
+ return false;
+}
+
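+/// Check whether a token kind is an expression operator; used to decide
+/// whether whitespace inside a macro argument delimits the argument or is
+/// part of an expression.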
+static bool isOperator(AsmToken::TokenKind kind) {
+ switch (kind) {
+ default:
+ return false;
+ case AsmToken::Plus:
+ case AsmToken::Minus:
+ case AsmToken::Tilde:
+ case AsmToken::Slash:
+ case AsmToken::Star:
+ case AsmToken::Dot:
+ case AsmToken::Equal:
+ case AsmToken::EqualEqual:
+ case AsmToken::Pipe:
+ case AsmToken::PipePipe:
+ case AsmToken::Caret:
+ case AsmToken::Amp:
+ case AsmToken::AmpAmp:
+ case AsmToken::Exclaim:
+ case AsmToken::ExclaimEqual:
+ case AsmToken::Less:
+ case AsmToken::LessEqual:
+ case AsmToken::LessLess:
+ case AsmToken::LessGreater:
+ case AsmToken::Greater:
+ case AsmToken::GreaterEqual:
+ case AsmToken::GreaterGreater:
+ return true;
+ }
+}
+
+namespace {
+
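+// RAII helper that sets whether the lexer skips whitespace, and re-enables
+// space skipping when destroyed.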
+class AsmLexerSkipSpaceRAII {
+public:
+ AsmLexerSkipSpaceRAII(AsmLexer &Lexer, bool SkipSpace) : Lexer(Lexer) {
+ Lexer.setSkipSpace(SkipSpace);
+ }
+
+ ~AsmLexerSkipSpaceRAII() {
+ Lexer.setSkipSpace(true);
+ }
+
+private:
+ AsmLexer &Lexer;
+};
+
+} // end anonymous namespace
+
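+/// Parse a single macro invocation argument into MA. A vararg argument takes
+/// the rest of the statement verbatim; otherwise tokens are collected until
+/// an unparenthesized comma, a delimiting space, or the end of the statement.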
+bool MasmParser::parseMacroArgument(MCAsmMacroArgument &MA, bool Vararg) {
+
+ if (Vararg) {
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ StringRef Str = parseStringToEndOfStatement();
+ MA.emplace_back(AsmToken::String, Str);
+ }
+ return false;
+ }
+
+ unsigned ParenLevel = 0;
+
+ // Darwin doesn't use spaces to delimit arguments.
+ AsmLexerSkipSpaceRAII ScopedSkipSpace(Lexer, IsDarwin);
+
+ bool SpaceEaten;
+
+ while (true) {
+ SpaceEaten = false;
+ if (Lexer.is(AsmToken::Eof) || Lexer.is(AsmToken::Equal))
+ return TokError("unexpected token in macro instantiation");
+
+ if (ParenLevel == 0) {
+
+ if (Lexer.is(AsmToken::Comma))
+ break;
+
+ if (Lexer.is(AsmToken::Space)) {
+ SpaceEaten = true;
+ Lexer.Lex(); // Eat spaces.
+ }
+
+ // Spaces can delimit parameters, but could also be part of an expression.
+ // If the token after a space is an operator, add the token and the next
+ // one into this argument.
+ if (!IsDarwin) {
+ if (isOperator(Lexer.getKind())) {
+ MA.push_back(getTok());
+ Lexer.Lex();
+
+ // Whitespace after an operator can be ignored.
+ if (Lexer.is(AsmToken::Space))
+ Lexer.Lex();
+
+ continue;
+ }
+ }
+ if (SpaceEaten)
+ break;
+ }
+
+ // handleMacroEntry relies on not advancing the lexer here
+ // to be able to fill in the remaining default parameter values.
+ if (Lexer.is(AsmToken::EndOfStatement))
+ break;
+
+ // Adjust the current parentheses level.
+ if (Lexer.is(AsmToken::LParen))
+ ++ParenLevel;
+ else if (Lexer.is(AsmToken::RParen) && ParenLevel)
+ --ParenLevel;
+
+ // Append the token to the current argument list.
+ MA.push_back(getTok());
+ Lexer.Lex();
+ }
+
+ if (ParenLevel != 0)
+ return TokError("unbalanced parentheses in macro argument");
+ return false;
+}
+
+// Parse the macro instantiation arguments.
+bool MasmParser::parseMacroArguments(const MCAsmMacro *M,
+ MCAsmMacroArguments &A) {
+ const unsigned NParameters = M ? M->Parameters.size() : 0;
+ bool NamedParametersFound = false;
+ SmallVector<SMLoc, 4> FALocs;
+
+ A.resize(NParameters);
+ FALocs.resize(NParameters);
+
+ // Parse two kinds of macro invocations:
+ // - macros defined without any parameters accept an arbitrary number of them
+ // - macros defined with parameters accept at most that many of them
+ bool HasVararg = NParameters ? M->Parameters.back().Vararg : false;
+ for (unsigned Parameter = 0; !NParameters || Parameter < NParameters;
+ ++Parameter) {
+ SMLoc IDLoc = Lexer.getLoc();
+ MCAsmMacroParameter FA;
+
+ if (Lexer.is(AsmToken::Identifier) && Lexer.peekTok().is(AsmToken::Equal)) {
+ if (parseIdentifier(FA.Name))
+ return Error(IDLoc, "invalid argument identifier for formal argument");
+
+ if (Lexer.isNot(AsmToken::Equal))
+ return TokError("expected '=' after formal parameter identifier");
+
+ Lex();
+
+ NamedParametersFound = true;
+ }
+ bool Vararg = HasVararg && Parameter == (NParameters - 1);
+
+ if (NamedParametersFound && FA.Name.empty())
+ return Error(IDLoc, "cannot mix positional and keyword arguments");
+
+ SMLoc StrLoc = Lexer.getLoc();
+ SMLoc EndLoc;
+ if (AltMacroMode && Lexer.is(AsmToken::Percent)) {
+ const MCExpr *AbsoluteExp;
+ int64_t Value;
+ // Eat '%'.
+ Lex();
+ if (parseExpression(AbsoluteExp, EndLoc))
+ return false;
+ if (!AbsoluteExp->evaluateAsAbsolute(Value,
+ getStreamer().getAssemblerPtr()))
+ return Error(StrLoc, "expected absolute expression");
+ const char *StrChar = StrLoc.getPointer();
+ const char *EndChar = EndLoc.getPointer();
+ AsmToken newToken(AsmToken::Integer,
+ StringRef(StrChar, EndChar - StrChar), Value);
+ FA.Value.push_back(newToken);
+ } else if (AltMacroMode && Lexer.is(AsmToken::Less) &&
+ isAngleBracketString(StrLoc, EndLoc)) {
+ const char *StrChar = StrLoc.getPointer();
+ const char *EndChar = EndLoc.getPointer();
+ jumpToLoc(EndLoc, CurBuffer);
+ // Eat from '<' to '>'.
+ Lex();
+ AsmToken newToken(AsmToken::String,
+ StringRef(StrChar, EndChar - StrChar));
+ FA.Value.push_back(newToken);
+ } else if(parseMacroArgument(FA.Value, Vararg))
+ return true;
+
+ unsigned PI = Parameter;
+ if (!FA.Name.empty()) {
+ unsigned FAI = 0;
+ for (FAI = 0; FAI < NParameters; ++FAI)
+ if (M->Parameters[FAI].Name == FA.Name)
+ break;
+
+ if (FAI >= NParameters) {
+ assert(M && "expected macro to be defined");
+ return Error(IDLoc, "parameter named '" + FA.Name +
+ "' does not exist for macro '" + M->Name + "'");
+ }
+ PI = FAI;
+ }
+
+ if (!FA.Value.empty()) {
+ if (A.size() <= PI)
+ A.resize(PI + 1);
+ A[PI] = FA.Value;
+
+ if (FALocs.size() <= PI)
+ FALocs.resize(PI + 1);
+
+ FALocs[PI] = Lexer.getLoc();
+ }
+
+ // At the end of the statement, fill in remaining arguments that have
+ // default values. Report an error for any required parameter that is
+ // still missing a value.
+ if (Lexer.is(AsmToken::EndOfStatement)) {
+ bool Failure = false;
+ for (unsigned FAI = 0; FAI < NParameters; ++FAI) {
+ if (A[FAI].empty()) {
+ if (M->Parameters[FAI].Required) {
+ Error(FALocs[FAI].isValid() ? FALocs[FAI] : Lexer.getLoc(),
+ "missing value for required parameter "
+ "'" + M->Parameters[FAI].Name + "' in macro '" + M->Name + "'");
+ Failure = true;
+ }
+
+ if (!M->Parameters[FAI].Value.empty())
+ A[FAI] = M->Parameters[FAI].Value;
+ }
+ }
+ return Failure;
+ }
+
+ if (Lexer.is(AsmToken::Comma))
+ Lex();
+ }
+
+ return TokError("too many positional arguments");
+}
+
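+/// Instantiate the macro M at NameLoc: parse its arguments, expand the body
+/// into a new "<instantiation>" buffer, and switch the lexer to that buffer
+/// until the closing .endmacro is reached.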
+bool MasmParser::handleMacroEntry(const MCAsmMacro *M, SMLoc NameLoc) {
+ // Arbitrarily limit macro nesting depth (default matches 'as'). We can
+ // eliminate this, although we should protect against infinite loops.
+ unsigned MaxNestingDepth = AsmMacroMaxNestingDepth;
+ if (ActiveMacros.size() == MaxNestingDepth) {
+ std::ostringstream MaxNestingDepthError;
+ MaxNestingDepthError << "macros cannot be nested more than "
+ << MaxNestingDepth << " levels deep."
+ << " Use -asm-macro-max-nesting-depth to increase "
+ "this limit.";
+ return TokError(MaxNestingDepthError.str());
+ }
+
+ MCAsmMacroArguments A;
+ if (parseMacroArguments(M, A))
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ StringRef Body = M->Body;
+ raw_svector_ostream OS(Buf);
+
+ if (expandMacro(OS, Body, M->Parameters, A, true, getTok().getLoc()))
+ return true;
+
+ // We include the .endmacro in the buffer as our cue to exit the macro
+ // instantiation.
+ OS << ".endmacro\n";
+
+ std::unique_ptr<MemoryBuffer> Instantiation =
+ MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+
+ // Create the macro instantiation object and add to the current macro
+ // instantiation stack.
+ MacroInstantiation *MI = new MacroInstantiation{
+ NameLoc, CurBuffer, getTok().getLoc(), TheCondStack.size()};
+ ActiveMacros.push_back(MI);
+
+ ++NumOfMacroInstantiations;
+
+ // Jump to the macro instantiation and prime the lexer.
+ CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
+ Lex();
+
+ return false;
+}
+
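+/// Tear down the innermost macro instantiation and resume lexing just past
+/// the statement that invoked the macro.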
+void MasmParser::handleMacroExit() {
+ // Jump to the EndOfStatement we should return to, and consume it.
+ jumpToLoc(ActiveMacros.back()->ExitLoc, ActiveMacros.back()->ExitBuffer);
+ Lex();
+
+ // Pop the instantiation entry.
+ delete ActiveMacros.back();
+ ActiveMacros.pop_back();
+}
+
+/// parseIdentifier:
+/// ::= identifier
+/// ::= string
+bool MasmParser::parseIdentifier(StringRef &Res) {
+ // The assembler has relaxed rules for accepting identifiers, in particular we
+ // allow things like '.globl $foo' and '.def @feat.00', which would normally
+ // be separate tokens. At this level, we have already lexed so we cannot
+ // (currently) handle this as a context dependent token, instead we detect
+ // adjacent tokens and return the combined identifier.
+ if (Lexer.is(AsmToken::Dollar) || Lexer.is(AsmToken::At)) {
+ SMLoc PrefixLoc = getLexer().getLoc();
+
+ // Consume the prefix character, and check for a following identifier.
+
+ AsmToken Buf[1];
+ Lexer.peekTokens(Buf, false);
+
+ if (Buf[0].isNot(AsmToken::Identifier))
+ return true;
+
+ // We have a '$' or '@' followed by an identifier; make sure they are adjacent.
+ if (PrefixLoc.getPointer() + 1 != Buf[0].getLoc().getPointer())
+ return true;
+
+ // eat $ or @
+ Lexer.Lex(); // Lexer's Lex guarantees consecutive token.
+ // Construct the joined identifier and consume the token.
+ Res =
+ StringRef(PrefixLoc.getPointer(), getTok().getIdentifier().size() + 1);
+ Lex(); // Parser Lex to maintain invariants.
+ return false;
+ }
+
+ if (Lexer.isNot(AsmToken::Identifier) && Lexer.isNot(AsmToken::String))
+ return true;
+
+ Res = getTok().getIdentifier();
+
+ Lex(); // Consume the identifier token.
+
+ return false;
+}
+
+/// parseDirectiveEquate:
+/// ::= name "=" expression
+/// | name "equ" expression (not redefinable)
+/// | name "equ" text-list
+/// | name "textequ" text-list
+bool MasmParser::parseDirectiveEquate(StringRef IDVal, StringRef Name,
+ DirectiveKind DirKind) {
+ Variable &Var = Variables[Name];
+ if (Var.Name.empty()) {
+ Var.Name = Name;
+ } else if (!Var.Redefinable) {
+ return TokError("invalid variable redefinition");
+ }
+ Var.Redefinable = (DirKind != DK_EQU);
+
+ if (DirKind == DK_EQU || DirKind == DK_TEXTEQU) {
+ // "equ" and "textequ" both allow text expressions.
+ std::string Value;
+ if (!parseTextItem(Value)) {
+ Var.IsText = true;
+ Var.TextValue = Value;
+
+ // Accept a text-list, not just one text-item.
+ auto parseItem = [&]() -> bool {
+ if (parseTextItem(Value))
+ return true;
+ Var.TextValue += Value;
+ return false;
+ };
+ if (parseOptionalToken(AsmToken::Comma) && parseMany(parseItem))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+
+ return false;
+ }
+ }
+ if (DirKind == DK_TEXTEQU)
+ return TokError("expected <text> in '" + Twine(IDVal) + "' directive");
+
+ // Parse as expression assignment.
+ const MCExpr *Expr;
+ SMLoc EndLoc, StartLoc = Lexer.getLoc();
+ if (parseExpression(Expr, EndLoc))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ if (Expr->evaluateAsAbsolute(Var.NumericValue,
+ getStreamer().getAssemblerPtr()))
+ return false;
+
+ // Not an absolute expression; define as a text replacement.
+ Var.IsText = true;
+ Var.TextValue = StringRef(StartLoc.getPointer(),
+ EndLoc.getPointer() - StartLoc.getPointer()).str();
+ return false;
+}
+
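+/// Parse a quoted string token into Data, expanding backslash escapes
+/// (hexadecimal, octal, and single-character escapes).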
+bool MasmParser::parseEscapedString(std::string &Data) {
+ if (check(getTok().isNot(AsmToken::String), "expected string"))
+ return true;
+
+ Data = "";
+ StringRef Str = getTok().getStringContents();
+ for (unsigned i = 0, e = Str.size(); i != e; ++i) {
+ if (Str[i] != '\\') {
+ Data += Str[i];
+ continue;
+ }
+
+ // Recognize escaped characters. Note that these escape semantics currently
+ // loosely follow Darwin 'as'.
+ ++i;
+ if (i == e)
+ return TokError("unexpected backslash at end of string");
+
+ // Recognize hex sequences similarly to GNU 'as'.
+ if (Str[i] == 'x' || Str[i] == 'X') {
+ size_t length = Str.size();
+ if (i + 1 >= length || !isHexDigit(Str[i + 1]))
+ return TokError("invalid hexadecimal escape sequence");
+
+ // Consume hex characters. GNU 'as' reads all hexadecimal characters and
+ // then truncates to the lower 16 bits. Seems reasonable.
+ unsigned Value = 0;
+ while (i + 1 < length && isHexDigit(Str[i + 1]))
+ Value = Value * 16 + hexDigitValue(Str[++i]);
+
+ Data += (unsigned char)(Value & 0xFF);
+ continue;
+ }
+
+ // Recognize octal sequences.
+ if ((unsigned)(Str[i] - '0') <= 7) {
+ // Consume up to three octal characters.
+ unsigned Value = Str[i] - '0';
+
+ if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
+ ++i;
+ Value = Value * 8 + (Str[i] - '0');
+
+ if (i + 1 != e && ((unsigned)(Str[i + 1] - '0')) <= 7) {
+ ++i;
+ Value = Value * 8 + (Str[i] - '0');
+ }
+ }
+
+ if (Value > 255)
+ return TokError("invalid octal escape sequence (out of range)");
+
+ Data += (unsigned char)Value;
+ continue;
+ }
+
+ // Otherwise recognize individual escapes.
+ switch (Str[i]) {
+ default:
+ // Just reject invalid escape sequences for now.
+ return TokError("invalid escape sequence (unrecognized character)");
+
+ case 'b': Data += '\b'; break;
+ case 'f': Data += '\f'; break;
+ case 'n': Data += '\n'; break;
+ case 'r': Data += '\r'; break;
+ case 't': Data += '\t'; break;
+ case '"': Data += '"'; break;
+ case '\\': Data += '\\'; break;
+ }
+ }
+
+ Lex();
+ return false;
+}
+
+bool MasmParser::parseAngleBracketString(std::string &Data) {
+ SMLoc EndLoc, StartLoc = getTok().getLoc();
+ if (isAngleBracketString(StartLoc, EndLoc)) {
+ const char *StartChar = StartLoc.getPointer() + 1;
+ const char *EndChar = EndLoc.getPointer() - 1;
+ jumpToLoc(EndLoc, CurBuffer);
+ // Eat from '<' to '>'.
+ Lex();
+
+ Data = angleBracketString(StringRef(StartChar, EndChar - StartChar));
+ return false;
+ }
+ return true;
+}
+
+/// textItem ::= textLiteral | textMacroID | % constExpr
+bool MasmParser::parseTextItem(std::string &Data) {
+ // TODO(epastor): Support textMacroID and % expansion of expressions.
+ return parseAngleBracketString(Data);
+}
+
+/// parseDirectiveAscii:
+/// ::= ( .ascii | .asciz | .string ) [ "string" ( , "string" )* ]
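+///
+/// For illustration (GAS-style spelling, per the grammar above):
+///   .asciz "hi"             ; emits the bytes of "hi" plus a terminating NUL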
+bool MasmParser::parseDirectiveAscii(StringRef IDVal, bool ZeroTerminated) {
+ auto parseOp = [&]() -> bool {
+ std::string Data;
+ if (checkForValidSection() || parseEscapedString(Data))
+ return true;
+ getStreamer().emitBytes(Data);
+ if (ZeroTerminated)
+ getStreamer().emitBytes(StringRef("\0", 1));
+ return false;
+ };
+
+ if (parseMany(parseOp))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ return false;
+}
+
+bool MasmParser::emitIntValue(const MCExpr *Value, unsigned Size) {
+ // Special case constant expressions to match code generator.
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ assert(Size <= 8 && "Invalid size");
+ int64_t IntValue = MCE->getValue();
+ if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
+ return Error(MCE->getLoc(), "out of range literal value");
+ getStreamer().emitIntValue(IntValue, Size);
+ } else {
+ const MCSymbolRefExpr *MSE = dyn_cast<MCSymbolRefExpr>(Value);
+ if (MSE && MSE->getSymbol().getName() == "?") {
+ // ? initializer; treat as 0.
+ getStreamer().emitIntValue(0, Size);
+ } else {
+ getStreamer().emitValue(Value, Size, Value->getLoc());
+ }
+ }
+ return false;
+}
+
+bool MasmParser::parseScalarInitializer(unsigned Size,
+ SmallVectorImpl<const MCExpr *> &Values,
+ unsigned StringPadLength) {
+ if (getTok().is(AsmToken::String)) {
+ StringRef Value = getTok().getStringContents();
+ if (Size == 1) {
+ // Treat each character as an initializer.
+ for (const char CharVal : Value)
+ Values.push_back(MCConstantExpr::create(CharVal, getContext()));
+
+ // Pad the string with spaces to the specified length.
+ for (size_t i = Value.size(); i < StringPadLength; ++i)
+ Values.push_back(MCConstantExpr::create(' ', getContext()));
+ } else {
+ // Treat the string as an initial value in big-endian representation.
+ if (Value.size() > Size)
+ return Error(getTok().getLoc(), "out of range literal value");
+
+ uint64_t IntValue = 0;
+ for (const unsigned char CharVal : Value.bytes())
+ IntValue = (IntValue << 8) | CharVal;
+ Values.push_back(MCConstantExpr::create(IntValue, getContext()));
+ }
+ Lex();
+ } else {
+ const MCExpr *Value;
+ if (checkForValidSection() || parseExpression(Value))
+ return true;
+ if (getTok().is(AsmToken::Identifier) &&
+ getTok().getString().equals_lower("dup")) {
+ Lex(); // Eat 'dup'.
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(Value->getLoc(),
+ "cannot repeat value a non-constant number of times");
+ const int64_t Repetitions = MCE->getValue();
+ if (Repetitions < 0)
+ return Error(Value->getLoc(),
+ "cannot repeat value a negative number of times");
+
+ SmallVector<const MCExpr *, 1> DuplicatedValues;
+ if (parseToken(AsmToken::LParen,
+ "parentheses required for 'dup' contents") ||
+ parseScalarInstList(Size, DuplicatedValues) ||
+ parseToken(AsmToken::RParen, "unmatched parentheses"))
+ return true;
+
+ for (int i = 0; i < Repetitions; ++i)
+ Values.append(DuplicatedValues.begin(), DuplicatedValues.end());
+ } else {
+ Values.push_back(Value);
+ }
+ }
+ return false;
+}
+
+bool MasmParser::parseScalarInstList(unsigned Size,
+ SmallVectorImpl<const MCExpr *> &Values,
+ const AsmToken::TokenKind EndToken) {
+ while (getTok().isNot(EndToken) &&
+ (EndToken != AsmToken::Greater ||
+ getTok().isNot(AsmToken::GreaterGreater))) {
+ parseScalarInitializer(Size, Values);
+
+ // If we see a comma, continue, and allow line continuation.
+ if (!parseOptionalToken(AsmToken::Comma))
+ break;
+ parseOptionalToken(AsmToken::EndOfStatement);
+ }
+ return false;
+}
+
+bool MasmParser::emitIntegralValues(unsigned Size) {
+ SmallVector<const MCExpr *, 1> Values;
+ if (checkForValidSection() || parseScalarInstList(Size, Values))
+ return true;
+
+ for (auto Value : Values) {
+ emitIntValue(Value, Size);
+ }
+ return false;
+}
+
+// Add a field to the current structure.
+bool MasmParser::addIntegralField(StringRef Name, unsigned Size) {
+ StructInfo &Struct = StructInProgress.back();
+ FieldInfo &Field = Struct.addField(Name, FT_INTEGRAL);
+ IntFieldInfo &IntInfo = Field.Contents.IntInfo;
+
+ Field.Type = Size;
+
+ if (parseScalarInstList(Size, IntInfo.Values))
+ return true;
+
+ Field.SizeOf = Field.Type * IntInfo.Values.size();
+ Field.LengthOf = IntInfo.Values.size();
+ if (Struct.IsUnion)
+ Struct.Size = std::max(Struct.Size, Field.SizeOf);
+ else
+ Struct.Size += Field.SizeOf;
+ return false;
+}
+
+/// parseDirectiveValue
+/// ::= (byte | word | ... ) [ expression (, expression)* ]
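+///
+/// For illustration:
+///   BYTE 1, 2, 3            ; three byte values
+///   DWORD 4 DUP (?)         ; four doublewords; '?' is emitted as zero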
+bool MasmParser::parseDirectiveValue(StringRef IDVal, unsigned Size) {
+ if (StructInProgress.empty()) {
+ // Initialize data value.
+ if (emitIntegralValues(Size))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ } else if (addIntegralField("", Size)) {
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ }
+
+ return false;
+}
+
+/// parseDirectiveNamedValue
+/// ::= name (byte | word | ... ) [ expression (, expression)* ]
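+///
+/// For illustration (hypothetical name):
+///   msg BYTE "Hello", 0     ; labels 'msg' and emits the characters plus a 0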
+bool MasmParser::parseDirectiveNamedValue(StringRef IDVal, unsigned Size,
+ StringRef Name, SMLoc NameLoc) {
+ if (StructInProgress.empty()) {
+ // Initialize named data value.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+ getStreamer().emitLabel(Sym);
+ if (emitIntegralValues(Size))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ } else if (addIntegralField(Name, Size)) {
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ }
+
+ return false;
+}
+
+static bool parseHexOcta(MasmParser &Asm, uint64_t &hi, uint64_t &lo) {
+ if (Asm.getTok().isNot(AsmToken::Integer) &&
+ Asm.getTok().isNot(AsmToken::BigNum))
+ return Asm.TokError("unknown token in expression");
+ SMLoc ExprLoc = Asm.getTok().getLoc();
+ APInt IntValue = Asm.getTok().getAPIntVal();
+ Asm.Lex();
+ if (!IntValue.isIntN(128))
+ return Asm.Error(ExprLoc, "out of range literal value");
+ if (!IntValue.isIntN(64)) {
+ hi = IntValue.getHiBits(IntValue.getBitWidth() - 64).getZExtValue();
+ lo = IntValue.getLoBits(64).getZExtValue();
+ } else {
+ hi = 0;
+ lo = IntValue.getZExtValue();
+ }
+ return false;
+}
+
+bool MasmParser::parseRealValue(const fltSemantics &Semantics, APInt &Res) {
+ // We don't truly support arithmetic on floating point expressions, so we
+ // have to manually parse unary prefixes.
+ bool IsNeg = false;
+ if (getLexer().is(AsmToken::Minus)) {
+ Lexer.Lex();
+ IsNeg = true;
+ } else if (getLexer().is(AsmToken::Plus)) {
+ Lexer.Lex();
+ }
+
+ if (Lexer.is(AsmToken::Error))
+ return TokError(Lexer.getErr());
+ if (Lexer.isNot(AsmToken::Integer) && Lexer.isNot(AsmToken::Real) &&
+ Lexer.isNot(AsmToken::Identifier))
+ return TokError("unexpected token in directive");
+
+ // Convert to an APFloat.
+ APFloat Value(Semantics);
+ StringRef IDVal = getTok().getString();
+ if (getLexer().is(AsmToken::Identifier)) {
+ if (IDVal.equals_lower("infinity") || IDVal.equals_lower("inf"))
+ Value = APFloat::getInf(Semantics);
+ else if (IDVal.equals_lower("nan"))
+ Value = APFloat::getNaN(Semantics, false, ~0);
+ else if (IDVal.equals_lower("?"))
+ Value = APFloat::getZero(Semantics);
+ else
+ return TokError("invalid floating point literal");
+ } else if (errorToBool(
+ Value.convertFromString(IDVal, APFloat::rmNearestTiesToEven)
+ .takeError())) {
+ return TokError("invalid floating point literal");
+ }
+ if (IsNeg)
+ Value.changeSign();
+
+ // Consume the numeric token.
+ Lex();
+
+ Res = Value.bitcastToAPInt();
+
+ return false;
+}
+
+bool MasmParser::parseRealInstList(const fltSemantics &Semantics,
+ SmallVectorImpl<APInt> &ValuesAsInt,
+ const AsmToken::TokenKind EndToken) {
+ while (getTok().isNot(EndToken) &&
+ (EndToken != AsmToken::Greater ||
+ getTok().isNot(AsmToken::GreaterGreater))) {
+ const AsmToken NextTok = Lexer.peekTok();
+ if (NextTok.is(AsmToken::Identifier) &&
+ NextTok.getString().equals_lower("dup")) {
+ const MCExpr *Value;
+ if (parseExpression(Value) || parseToken(AsmToken::Identifier))
+ return true;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(Value->getLoc(),
+ "cannot repeat value a non-constant number of times");
+ const int64_t Repetitions = MCE->getValue();
+ if (Repetitions < 0)
+ return Error(Value->getLoc(),
+ "cannot repeat value a negative number of times");
+
+ SmallVector<APInt, 1> DuplicatedValues;
+ if (parseToken(AsmToken::LParen,
+ "parentheses required for 'dup' contents") ||
+ parseRealInstList(Semantics, DuplicatedValues) ||
+ parseToken(AsmToken::RParen, "unmatched parentheses"))
+ return true;
+
+ for (int i = 0; i < Repetitions; ++i)
+ ValuesAsInt.append(DuplicatedValues.begin(), DuplicatedValues.end());
+ } else {
+ APInt AsInt;
+ if (parseRealValue(Semantics, AsInt))
+ return true;
+ ValuesAsInt.push_back(AsInt);
+ }
+
+ // Continue if we see a comma. (Also, allow line continuation.)
+ if (!parseOptionalToken(AsmToken::Comma))
+ break;
+ parseOptionalToken(AsmToken::EndOfStatement);
+ }
+
+ return false;
+}
+
+// Initialize real data values.
+bool MasmParser::emitRealValues(const fltSemantics &Semantics) {
+ SmallVector<APInt, 1> ValuesAsInt;
+ if (parseRealInstList(Semantics, ValuesAsInt))
+ return true;
+
+ for (const APInt &AsInt : ValuesAsInt) {
+ getStreamer().emitIntValue(AsInt.getLimitedValue(),
+ AsInt.getBitWidth() / 8);
+ }
+ return false;
+}
+
+// Add a real field to the current struct.
+bool MasmParser::addRealField(StringRef Name, const fltSemantics &Semantics) {
+ StructInfo &Struct = StructInProgress.back();
+ FieldInfo &Field = Struct.addField(Name, FT_REAL);
+ RealFieldInfo &RealInfo = Field.Contents.RealInfo;
+
+ Field.SizeOf = 0;
+
+ if (checkForValidSection() ||
+ parseRealInstList(Semantics, RealInfo.AsIntValues))
+ return true;
+
+ Field.Type = RealInfo.AsIntValues.back().getBitWidth() / 8;
+ Field.LengthOf = RealInfo.AsIntValues.size();
+ Field.SizeOf = Field.Type * Field.LengthOf;
+ if (Struct.IsUnion)
+ Struct.Size = std::max(Struct.Size, Field.SizeOf);
+ else
+ Struct.Size += Field.SizeOf;
+ return false;
+}
+
+/// parseDirectiveRealValue
+/// ::= (real4 | real8) [ expression (, expression)* ]
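+///
+/// For illustration:
+///   REAL4 3.5, 2 DUP (1.0)  ; three single-precision values
+///   REAL8 ?                 ; '?' is treated as +0.0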
+bool MasmParser::parseDirectiveRealValue(StringRef IDVal,
+ const fltSemantics &Semantics) {
+ if (checkForValidSection())
+ return true;
+
+ if (StructInProgress.empty()) {
+ // Initialize data value.
+ if (emitRealValues(Semantics))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ } else if (addRealField("", Semantics)) {
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ }
+ return false;
+}
+
+/// parseDirectiveNamedRealValue
+/// ::= name (real4 | real8) [ expression (, expression)* ]
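+///
+/// For illustration (hypothetical name):
+///   pi REAL8 3.141592653589793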
+bool MasmParser::parseDirectiveNamedRealValue(StringRef IDVal,
+ const fltSemantics &Semantics,
+ StringRef Name, SMLoc NameLoc) {
+ if (checkForValidSection())
+ return true;
+
+ if (StructInProgress.empty()) {
+ // Initialize named data value.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+ getStreamer().emitLabel(Sym);
+ if (emitRealValues(Semantics))
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ } else if (addRealField(Name, Semantics)) {
+ return addErrorSuffix(" in '" + Twine(IDVal) + "' directive");
+ }
+ return false;
+}
+
+bool MasmParser::parseOptionalAngleBracketOpen() {
+ const AsmToken Tok = getTok();
+ if (parseOptionalToken(AsmToken::LessLess)) {
+ AngleBracketDepth++;
+ Lexer.UnLex(AsmToken(AsmToken::Less, Tok.getString().substr(1)));
+ return true;
+ } else if (parseOptionalToken(AsmToken::LessGreater)) {
+ AngleBracketDepth++;
+ Lexer.UnLex(AsmToken(AsmToken::Greater, Tok.getString().substr(1)));
+ return true;
+ } else if (parseOptionalToken(AsmToken::Less)) {
+ AngleBracketDepth++;
+ return true;
+ }
+
+ return false;
+}
+
+bool MasmParser::parseAngleBracketClose(const Twine &Msg) {
+ const AsmToken Tok = getTok();
+ if (parseOptionalToken(AsmToken::GreaterGreater)) {
+ Lexer.UnLex(AsmToken(AsmToken::Greater, Tok.getString().substr(1)));
+ } else if (parseToken(AsmToken::Greater, Msg)) {
+ return true;
+ }
+ AngleBracketDepth--;
+ return false;
+}
+
+bool MasmParser::parseFieldInitializer(const FieldInfo &Field,
+ const IntFieldInfo &Contents,
+ FieldInitializer &Initializer) {
+ SMLoc Loc = getTok().getLoc();
+
+ SmallVector<const MCExpr *, 1> Values;
+ if (parseOptionalToken(AsmToken::LCurly)) {
+ if (Field.LengthOf == 1 && Field.Type > 1)
+ return Error(Loc, "Cannot initialize scalar field with array value");
+ if (parseScalarInstList(Field.Type, Values, AsmToken::RCurly) ||
+ parseToken(AsmToken::RCurly))
+ return true;
+ } else if (parseOptionalAngleBracketOpen()) {
+ if (Field.LengthOf == 1 && Field.Type > 1)
+ return Error(Loc, "Cannot initialize scalar field with array value");
+ if (parseScalarInstList(Field.Type, Values, AsmToken::Greater) ||
+ parseAngleBracketClose())
+ return true;
+ } else if (Field.LengthOf > 1 && Field.Type > 1) {
+ return Error(Loc, "Cannot initialize array field with scalar value");
+ } else if (parseScalarInitializer(Field.Type, Values,
+ /*StringPadLength=*/Field.LengthOf)) {
+ return true;
+ }
+
+ if (Values.size() > Field.LengthOf) {
+ return Error(Loc, "Initializer too long for field; expected at most " +
+ std::to_string(Field.LengthOf) + " elements, got " +
+ std::to_string(Values.size()));
+ }
+ // Default-initialize all remaining values.
+ Values.append(Contents.Values.begin() + Values.size(), Contents.Values.end());
+
+ Initializer = FieldInitializer(std::move(Values));
+ return false;
+}
+
+bool MasmParser::parseFieldInitializer(const FieldInfo &Field,
+ const RealFieldInfo &Contents,
+ FieldInitializer &Initializer) {
+ const fltSemantics &Semantics =
+ (Field.Type == 4) ? APFloat::IEEEsingle() : APFloat::IEEEdouble();
+
+ SMLoc Loc = getTok().getLoc();
+
+ SmallVector<APInt, 1> AsIntValues;
+ if (parseOptionalToken(AsmToken::LCurly)) {
+ if (Field.LengthOf == 1)
+ return Error(Loc, "Cannot initialize scalar field with array value");
+ if (parseRealInstList(Semantics, AsIntValues, AsmToken::RCurly) ||
+ parseToken(AsmToken::RCurly))
+ return true;
+ } else if (parseOptionalAngleBracketOpen()) {
+ if (Field.LengthOf == 1)
+ return Error(Loc, "Cannot initialize scalar field with array value");
+ if (parseRealInstList(Semantics, AsIntValues, AsmToken::Greater) ||
+ parseAngleBracketClose())
+ return true;
+ } else if (Field.LengthOf > 1) {
+ return Error(Loc, "Cannot initialize array field with scalar value");
+ } else {
+ AsIntValues.emplace_back();
+ if (parseRealValue(Semantics, AsIntValues.back()))
+ return true;
+ }
+
+ if (AsIntValues.size() > Field.LengthOf) {
+ return Error(Loc, "Initializer too long for field; expected at most " +
+ std::to_string(Field.LengthOf) + " elements, got " +
+ std::to_string(AsIntValues.size()));
+ }
+ // Default-initialize all remaining values.
+ AsIntValues.append(Contents.AsIntValues.begin() + AsIntValues.size(),
+ Contents.AsIntValues.end());
+
+ Initializer = FieldInitializer(std::move(AsIntValues));
+ return false;
+}
+
+bool MasmParser::parseFieldInitializer(const FieldInfo &Field,
+ const StructFieldInfo &Contents,
+ FieldInitializer &Initializer) {
+ SMLoc Loc = getTok().getLoc();
+
+ std::vector<StructInitializer> Initializers;
+ if (Field.LengthOf > 1) {
+ if (parseOptionalToken(AsmToken::LCurly)) {
+ if (parseStructInstList(Contents.Structure, Initializers,
+ AsmToken::RCurly) ||
+ parseToken(AsmToken::RCurly))
+ return true;
+ } else if (parseOptionalAngleBracketOpen()) {
+ if (parseStructInstList(Contents.Structure, Initializers,
+ AsmToken::Greater) ||
+ parseAngleBracketClose())
+ return true;
+ } else {
+ return Error(Loc, "Cannot initialize array field with scalar value");
+ }
+ } else {
+ Initializers.emplace_back();
+ if (parseStructInitializer(Contents.Structure, Initializers.back()))
+ return true;
+ }
+
+ if (Initializers.size() > Field.LengthOf) {
+ return Error(Loc, "Initializer too long for field; expected at most " +
+ std::to_string(Field.LengthOf) + " elements, got " +
+ std::to_string(Initializers.size()));
+ }
+ // Default-initialize all remaining values.
+ Initializers.insert(Initializers.end(),
+ Contents.Initializers.begin() + Initializers.size(),
+ Contents.Initializers.end());
+
+ Initializer = FieldInitializer(std::move(Initializers), Contents.Structure);
+ return false;
+}
+
+bool MasmParser::parseFieldInitializer(const FieldInfo &Field,
+ FieldInitializer &Initializer) {
+ switch (Field.Contents.FT) {
+ case FT_INTEGRAL:
+ return parseFieldInitializer(Field, Field.Contents.IntInfo, Initializer);
+ case FT_REAL:
+ return parseFieldInitializer(Field, Field.Contents.RealInfo, Initializer);
+ case FT_STRUCT:
+ return parseFieldInitializer(Field, Field.Contents.StructInfo, Initializer);
+ }
+ llvm_unreachable("Unhandled FieldType enum");
+}
+
+bool MasmParser::parseStructInitializer(const StructInfo &Structure,
+ StructInitializer &Initializer) {
+ const AsmToken FirstToken = getTok();
+
+ Optional<AsmToken::TokenKind> EndToken;
+ if (parseOptionalToken(AsmToken::LCurly)) {
+ EndToken = AsmToken::RCurly;
+ } else if (parseOptionalAngleBracketOpen()) {
+ // Note: parseOptionalAngleBracketOpen has already bumped AngleBracketDepth.
+ EndToken = AsmToken::Greater;
+ } else if (FirstToken.is(AsmToken::Identifier) &&
+ FirstToken.getString() == "?") {
+ // ? initializer; leave EndToken uninitialized to treat as empty.
+ if (parseToken(AsmToken::Identifier))
+ return true;
+ } else {
+ return Error(FirstToken.getLoc(), "Expected struct initializer");
+ }
+
+ auto &FieldInitializers = Initializer.FieldInitializers;
+ size_t FieldIndex = 0;
+ if (EndToken.hasValue()) {
+ // Initialize all fields with given initializers.
+ while (getTok().isNot(EndToken.getValue()) &&
+ FieldIndex < Structure.Fields.size()) {
+ const FieldInfo &Field = Structure.Fields[FieldIndex++];
+ if (parseOptionalToken(AsmToken::Comma)) {
+ // Empty initializer; use the default and continue. (Also, allow line
+ // continuation.)
+ FieldInitializers.push_back(Field.Contents);
+ parseOptionalToken(AsmToken::EndOfStatement);
+ continue;
+ }
+ FieldInitializers.emplace_back(Field.Contents.FT);
+ if (parseFieldInitializer(Field, FieldInitializers.back()))
+ return true;
+
+ // Continue if we see a comma. (Also, allow line continuation.)
+ SMLoc CommaLoc = getTok().getLoc();
+ if (!parseOptionalToken(AsmToken::Comma))
+ break;
+ if (FieldIndex == Structure.Fields.size())
+ return Error(CommaLoc, "'" + Structure.Name +
+ "' initializer initializes too many fields");
+ parseOptionalToken(AsmToken::EndOfStatement);
+ }
+ }
+ // Default-initialize all remaining fields.
+ for (auto It = Structure.Fields.begin() + FieldIndex;
+ It != Structure.Fields.end(); ++It) {
+ const FieldInfo &Field = *It;
+ FieldInitializers.push_back(Field.Contents);
+ }
+
+ if (EndToken.hasValue()) {
+ if (EndToken.getValue() == AsmToken::Greater)
+ return parseAngleBracketClose();
+
+ return parseToken(EndToken.getValue());
+ }
+
+ return false;
+}
+
+bool MasmParser::parseStructInstList(
+ const StructInfo &Structure, std::vector<StructInitializer> &Initializers,
+ const AsmToken::TokenKind EndToken) {
+ while (getTok().isNot(EndToken) &&
+ (EndToken != AsmToken::Greater ||
+ getTok().isNot(AsmToken::GreaterGreater))) {
+ const AsmToken NextTok = Lexer.peekTok();
+ if (NextTok.is(AsmToken::Identifier) &&
+ NextTok.getString().equals_lower("dup")) {
+ const MCExpr *Value;
+ if (parseExpression(Value) || parseToken(AsmToken::Identifier))
+ return true;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(Value->getLoc(),
+ "cannot repeat value a non-constant number of times");
+ const int64_t Repetitions = MCE->getValue();
+ if (Repetitions < 0)
+ return Error(Value->getLoc(),
+ "cannot repeat value a negative number of times");
+
+ std::vector<StructInitializer> DuplicatedValues;
+ if (parseToken(AsmToken::LParen,
+ "parentheses required for 'dup' contents") ||
+ parseStructInstList(Structure, DuplicatedValues) ||
+ parseToken(AsmToken::RParen, "unmatched parentheses"))
+ return true;
+
+ for (int i = 0; i < Repetitions; ++i)
+ Initializers.insert(Initializers.end(), DuplicatedValues.begin(),
+ DuplicatedValues.end());
+ } else {
+ Initializers.emplace_back();
+ if (parseStructInitializer(Structure, Initializers.back()))
+ return true;
+ }
+
+ // Continue if we see a comma. (Also, allow line continuation.)
+ if (!parseOptionalToken(AsmToken::Comma))
+ break;
+ parseOptionalToken(AsmToken::EndOfStatement);
+ }
+
+ return false;
+}
+
+bool MasmParser::emitFieldValue(const FieldInfo &Field,
+ const IntFieldInfo &Contents) {
+ // Default-initialize all values.
+ for (const MCExpr *Value : Contents.Values) {
+ if (emitIntValue(Value, Field.Type))
+ return true;
+ }
+ return false;
+}
+
+bool MasmParser::emitFieldValue(const FieldInfo &Field,
+ const RealFieldInfo &Contents) {
+ for (const APInt &AsInt : Contents.AsIntValues) {
+ getStreamer().emitIntValue(AsInt.getLimitedValue(),
+ AsInt.getBitWidth() / 8);
+ }
+ return false;
+}
+
+bool MasmParser::emitFieldValue(const FieldInfo &Field,
+ const StructFieldInfo &Contents) {
+ for (const auto &Initializer : Contents.Initializers) {
+ size_t Index = 0, Offset = 0;
+ for (const auto &SubField : Contents.Structure.Fields) {
+ getStreamer().emitZeros(SubField.Offset - Offset);
+ Offset = SubField.Offset + SubField.SizeOf;
+ emitFieldInitializer(SubField, Initializer.FieldInitializers[Index++]);
+ }
+ }
+ return false;
+}
+
+bool MasmParser::emitFieldValue(const FieldInfo &Field) {
+ switch (Field.Contents.FT) {
+ case FT_INTEGRAL:
+ return emitFieldValue(Field, Field.Contents.IntInfo);
+ case FT_REAL:
+ return emitFieldValue(Field, Field.Contents.RealInfo);
+ case FT_STRUCT:
+ return emitFieldValue(Field, Field.Contents.StructInfo);
+ }
+ llvm_unreachable("Unhandled FieldType enum");
+}
+
+bool MasmParser::emitStructValue(const StructInfo &Structure) {
+ size_t Offset = 0;
+ for (const auto &Field : Structure.Fields) {
+ getStreamer().emitZeros(Field.Offset - Offset);
+ if (emitFieldValue(Field))
+ return true;
+ Offset = Field.Offset + Field.SizeOf;
+ }
+ // Add final padding.
+ if (Offset != Structure.Size)
+ getStreamer().emitZeros(Structure.Size - Offset);
+ return false;
+}
+
+bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
+ const IntFieldInfo &Contents,
+ const IntFieldInfo &Initializer) {
+ for (const auto &Value : Initializer.Values) {
+ if (emitIntValue(Value, Field.Type))
+ return true;
+ }
+ // Default-initialize all remaining values.
+ for (auto it = Contents.Values.begin() + Initializer.Values.size();
+ it != Contents.Values.end(); ++it) {
+ const auto &Value = *it;
+ if (emitIntValue(Value, Field.Type))
+ return true;
+ }
+ return false;
+}
+
+bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
+ const RealFieldInfo &Contents,
+ const RealFieldInfo &Initializer) {
+ for (const auto &AsInt : Initializer.AsIntValues) {
+ getStreamer().emitIntValue(AsInt.getLimitedValue(),
+ AsInt.getBitWidth() / 8);
+ }
+ // Default-initialize all remaining values.
+ for (auto It = Contents.AsIntValues.begin() + Initializer.AsIntValues.size();
+ It != Contents.AsIntValues.end(); ++It) {
+ const auto &AsInt = *It;
+ getStreamer().emitIntValue(AsInt.getLimitedValue(),
+ AsInt.getBitWidth() / 8);
+ }
+ return false;
+}
+
+bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
+ const StructFieldInfo &Contents,
+ const StructFieldInfo &Initializer) {
+ for (const auto &Init : Initializer.Initializers) {
+ emitStructInitializer(Contents.Structure, Init);
+ }
+ // Default-initialize all remaining values.
+ for (auto It =
+ Contents.Initializers.begin() + Initializer.Initializers.size();
+ It != Contents.Initializers.end(); ++It) {
+ const auto &Init = *It;
+ emitStructInitializer(Contents.Structure, Init);
+ }
+ return false;
+}
+
+bool MasmParser::emitFieldInitializer(const FieldInfo &Field,
+ const FieldInitializer &Initializer) {
+ switch (Field.Contents.FT) {
+ case FT_INTEGRAL:
+ return emitFieldInitializer(Field, Field.Contents.IntInfo,
+ Initializer.IntInfo);
+ case FT_REAL:
+ return emitFieldInitializer(Field, Field.Contents.RealInfo,
+ Initializer.RealInfo);
+ case FT_STRUCT:
+ return emitFieldInitializer(Field, Field.Contents.StructInfo,
+ Initializer.StructInfo);
+ }
+ llvm_unreachable("Unhandled FieldType enum");
+}
+
+bool MasmParser::emitStructInitializer(const StructInfo &Structure,
+ const StructInitializer &Initializer) {
+ size_t Index = 0, Offset = 0;
+ for (const auto &Init : Initializer.FieldInitializers) {
+ const auto &Field = Structure.Fields[Index++];
+ getStreamer().emitZeros(Field.Offset - Offset);
+ Offset = Field.Offset + Field.SizeOf;
+ if (emitFieldInitializer(Field, Init))
+ return true;
+ }
+ // Default-initialize all remaining fields.
+ for (auto It =
+ Structure.Fields.begin() + Initializer.FieldInitializers.size();
+ It != Structure.Fields.end(); ++It) {
+ const auto &Field = *It;
+ getStreamer().emitZeros(Field.Offset - Offset);
+ Offset = Field.Offset + Field.SizeOf;
+ if (emitFieldValue(Field))
+ return true;
+ }
+ // Add final padding.
+ if (Offset != Structure.Size)
+ getStreamer().emitZeros(Structure.Size - Offset);
+ return false;
+}
+
+// Set data values from initializers.
+bool MasmParser::emitStructValues(const StructInfo &Structure) {
+ std::vector<StructInitializer> Initializers;
+ if (parseStructInstList(Structure, Initializers))
+ return true;
+
+ for (const auto &Initializer : Initializers) {
+ if (emitStructInitializer(Structure, Initializer))
+ return true;
+ }
+
+ return false;
+}
+
+// Declare a field in the current struct.
+bool MasmParser::addStructField(StringRef Name, const StructInfo &Structure) {
+ StructInfo &OwningStruct = StructInProgress.back();
+ FieldInfo &Field = OwningStruct.addField(Name, FT_STRUCT);
+ StructFieldInfo &StructInfo = Field.Contents.StructInfo;
+
+ StructInfo.Structure = Structure;
+ Field.Type = Structure.Size;
+
+ if (parseStructInstList(Structure, StructInfo.Initializers))
+ return true;
+
+ Field.LengthOf = StructInfo.Initializers.size();
+ Field.SizeOf = Field.Type * Field.LengthOf;
+ if (OwningStruct.IsUnion)
+ OwningStruct.Size = std::max(OwningStruct.Size, Field.SizeOf);
+ else
+ OwningStruct.Size += Field.SizeOf;
+
+ return false;
+}
+
+/// parseDirectiveStructValue
+/// ::= struct-id (<struct-initializer> | {struct-initializer})
+/// [, (<struct-initializer> | {struct-initializer})]*
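+///
+/// For illustration, assuming a previously defined POINT structure:
+///   POINT <1, 2>            ; angle-bracket initializer
+///   POINT {1, 2}, {3, 4}    ; two instances with brace initializers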
+bool MasmParser::parseDirectiveStructValue(const StructInfo &Structure,
+ StringRef Directive, SMLoc DirLoc) {
+ if (StructInProgress.empty()) {
+ if (emitStructValues(Structure))
+ return true;
+ } else if (addStructField("", Structure)) {
+ return addErrorSuffix(" in '" + Twine(Directive) + "' directive");
+ }
+
+ return false;
+}
+
+/// parseDirectiveNamedStructValue
+/// ::= name struct-id (<struct-initializer> | {struct-initializer})
+/// [, (<struct-initializer> | {struct-initializer})]*
+bool MasmParser::parseDirectiveNamedStructValue(const StructInfo &Structure,
+ StringRef Directive,
+ SMLoc DirLoc, StringRef Name) {
+ if (StructInProgress.empty()) {
+ // Initialize named data value.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+ getStreamer().emitLabel(Sym);
+ KnownType[Name] = &Structure;
+ if (emitStructValues(Structure))
+ return true;
+ } else if (addStructField(Name, Structure)) {
+ return addErrorSuffix(" in '" + Twine(Directive) + "' directive");
+ }
+
+ return false;
+}
+
+/// parseDirectiveStruct
+/// ::= <name> (STRUC | STRUCT | UNION) [fieldAlign] [, NONUNIQUE]
+/// (dataDir | generalDir | offsetDir | nestedStruct)+
+/// <name> ENDS
+////// dataDir = data declaration
+////// offsetDir = EVEN, ORG, ALIGN
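+///
+/// For illustration (hypothetical names):
+///   POINT STRUCT 4          ; 4-byte field alignment
+///     x DWORD ?
+///     y DWORD ?
+///   POINT ENDS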
+bool MasmParser::parseDirectiveStruct(StringRef Directive,
+ DirectiveKind DirKind, StringRef Name,
+ SMLoc NameLoc) {
+ // We ignore NONUNIQUE; we do not support OPTION M510 or OPTION OLDSTRUCTS
+ // anyway, so all field accesses must be qualified.
+ AsmToken NextTok = getTok();
+ int64_t AlignmentValue = 1;
+ if (NextTok.isNot(AsmToken::Comma) &&
+ NextTok.isNot(AsmToken::EndOfStatement) &&
+ parseAbsoluteExpression(AlignmentValue)) {
+ return addErrorSuffix(" in alignment value for '" + Twine(Directive) +
+ "' directive");
+ }
+ if (!isPowerOf2_64(AlignmentValue)) {
+ return Error(NextTok.getLoc(), "alignment must be a power of two; was " +
+ std::to_string(AlignmentValue));
+ }
+
+ StringRef Qualifier;
+ SMLoc QualifierLoc;
+ if (parseOptionalToken(AsmToken::Comma)) {
+ QualifierLoc = getTok().getLoc();
+ if (parseIdentifier(Qualifier))
+ return addErrorSuffix(" in '" + Twine(Directive) + "' directive");
+ if (!Qualifier.equals_lower("nonunique"))
+ return Error(QualifierLoc, "Unrecognized qualifier for '" +
+ Twine(Directive) +
+ "' directive; expected none or NONUNIQUE");
+ }
+
+ if (parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in '" + Twine(Directive) + "' directive");
+
+ StructInProgress.emplace_back(Name, DirKind == DK_UNION, AlignmentValue);
+ return false;
+}
+
+/// parseDirectiveNestedStruct
+/// ::= (STRUC | STRUCT | UNION) [name]
+/// (dataDir | generalDir | offsetDir | nestedStruct)+
+/// ENDS
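+///
+/// For illustration (hypothetical names):
+///   VARIANT STRUCT
+///     kind DWORD ?
+///     UNION                 ; anonymous nested union, closed by a bare ENDS
+///       i DWORD ?
+///       f REAL4 ?
+///     ENDS
+///   VARIANT ENDS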
+bool MasmParser::parseDirectiveNestedStruct(StringRef Directive,
+ DirectiveKind DirKind) {
+ if (StructInProgress.empty())
+ return TokError("missing name in top-level '" + Twine(Directive) +
+ "' directive");
+
+ StringRef Name;
+ if (getTok().is(AsmToken::Identifier)) {
+ Name = getTok().getIdentifier();
+ parseToken(AsmToken::Identifier);
+ }
+ if (parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in '" + Twine(Directive) + "' directive");
+
+ StructInProgress.emplace_back(Name, DirKind == DK_UNION,
+ StructInProgress.back().Alignment);
+ return false;
+}
+
+bool MasmParser::parseDirectiveEnds(StringRef Name, SMLoc NameLoc) {
+ if (StructInProgress.empty())
+ return Error(NameLoc, "ENDS directive without matching STRUC/STRUCT/UNION");
+ if (StructInProgress.size() > 1)
+ return Error(NameLoc, "unexpected name in nested ENDS directive");
+ if (StructInProgress.back().Name.compare_lower(Name))
+ return Error(NameLoc, "mismatched name in ENDS directive; expected '" +
+ StructInProgress.back().Name + "'");
+ StructInfo Structure = StructInProgress.pop_back_val();
+ // Pad to make the structure's size divisible by its alignment.
+ Structure.Size = llvm::alignTo(Structure.Size, Structure.Alignment);
+ Structs[Name.lower()] = Structure;
+
+ if (parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in ENDS directive");
+
+ return false;
+}
+
+bool MasmParser::parseDirectiveNestedEnds() {
+ if (StructInProgress.empty())
+ return TokError("ENDS directive without matching STRUC/STRUCT/UNION");
+ if (StructInProgress.size() == 1)
+ return TokError("missing name in top-level ENDS directive");
+
+ if (parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in nested ENDS directive");
+
+ StructInfo Structure = StructInProgress.pop_back_val();
+ // Pad to make the structure's size divisible by its alignment.
+ Structure.Size = llvm::alignTo(Structure.Size, Structure.Alignment);
+
+ StructInfo &ParentStruct = StructInProgress.back();
+ if (Structure.Name.empty()) {
+ const size_t OldFields = ParentStruct.Fields.size();
+ ParentStruct.Fields.insert(
+ ParentStruct.Fields.end(),
+ std::make_move_iterator(Structure.Fields.begin()),
+ std::make_move_iterator(Structure.Fields.end()));
+ for (const auto &FieldByName : Structure.FieldsByName) {
+ ParentStruct.FieldsByName[FieldByName.getKey()] =
+ FieldByName.getValue() + OldFields;
+ }
+ if (!ParentStruct.IsUnion) {
+ for (auto FieldIter = ParentStruct.Fields.begin() + OldFields;
+ FieldIter != ParentStruct.Fields.end(); ++FieldIter) {
+ FieldIter->Offset += ParentStruct.Size;
+ }
+ }
+
+ if (ParentStruct.IsUnion)
+ ParentStruct.Size = std::max(ParentStruct.Size, Structure.Size);
+ else
+ ParentStruct.Size += Structure.Size;
+ } else {
+ FieldInfo &Field = ParentStruct.addField(Structure.Name, FT_STRUCT);
+ StructFieldInfo &StructInfo = Field.Contents.StructInfo;
+ Field.Type = Structure.Size;
+ Field.LengthOf = 1;
+ Field.SizeOf = Structure.Size;
+
+ if (ParentStruct.IsUnion)
+ ParentStruct.Size = std::max(ParentStruct.Size, Field.SizeOf);
+ else
+ ParentStruct.Size += Field.SizeOf;
+
+ StructInfo.Structure = Structure;
+ StructInfo.Initializers.emplace_back();
+ auto &FieldInitializers = StructInfo.Initializers.back().FieldInitializers;
+ for (const auto &SubField : Structure.Fields) {
+ FieldInitializers.push_back(SubField.Contents);
+ }
+ }
+
+ return false;
+}
+
+/// parseDirectiveOrg
+/// ::= .org expression [ , expression ]
+bool MasmParser::parseDirectiveOrg() {
+ const MCExpr *Offset;
+ SMLoc OffsetLoc = Lexer.getLoc();
+ if (checkForValidSection() || parseExpression(Offset))
+ return true;
+
+ // Parse optional fill expression.
+ int64_t FillExpr = 0;
+ if (parseOptionalToken(AsmToken::Comma))
+ if (parseAbsoluteExpression(FillExpr))
+ return addErrorSuffix(" in '.org' directive");
+ if (parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in '.org' directive");
+
+ getStreamer().emitValueToOffset(Offset, FillExpr, OffsetLoc);
+ return false;
+}
+
+/// parseDirectiveAlign
+/// ::= align expression
+bool MasmParser::parseDirectiveAlign() {
+ SMLoc AlignmentLoc = getLexer().getLoc();
+ int64_t Alignment;
+
+ if (checkForValidSection())
+ return addErrorSuffix(" in align directive");
+ // Ignore empty 'align' directives.
+ if (getTok().is(AsmToken::EndOfStatement)) {
+ Warning(AlignmentLoc, "align directive with no operand is ignored");
+ return parseToken(AsmToken::EndOfStatement);
+ }
+ if (parseAbsoluteExpression(Alignment) ||
+ parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in align directive");
+
+ // Always emit an alignment here, even if we end up reporting an error below.
+ bool ReturnVal = false;
+
+ // Reject alignments that aren't either a power of two or zero, for gas
+ // compatibility. Alignment of zero is silently rounded up to one.
+ if (Alignment == 0)
+ Alignment = 1;
+ if (!isPowerOf2_64(Alignment))
+ ReturnVal |= Error(AlignmentLoc, "alignment must be a power of 2");
+
+ // Check whether we should use optimal code alignment for this align
+ // directive.
+ const MCSection *Section = getStreamer().getCurrentSectionOnly();
+ assert(Section && "must have section to emit alignment");
+ if (Section->UseCodeAlign()) {
+ getStreamer().emitCodeAlignment(Alignment, /*MaxBytesToEmit=*/0);
+ } else {
+ // FIXME: Target specific behavior about how the "extra" bytes are filled.
+ getStreamer().emitValueToAlignment(Alignment, /*Value=*/0, /*ValueSize=*/1,
+ /*MaxBytesToEmit=*/0);
+ }
+
+ return ReturnVal;
+}
+
+/// parseDirectiveFile
+/// ::= .file filename
+/// ::= .file number [directory] filename [md5 checksum] [source source-text]
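+///
+/// For illustration:
+///   .file "a.asm"
+///   .file 1 "dir" "a.asm"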
+bool MasmParser::parseDirectiveFile(SMLoc DirectiveLoc) {
+ // FIXME: I'm not sure what this is.
+ int64_t FileNumber = -1;
+ if (getLexer().is(AsmToken::Integer)) {
+ FileNumber = getTok().getIntVal();
+ Lex();
+
+ if (FileNumber < 0)
+ return TokError("negative file number");
+ }
+
+ std::string Path;
+
+ // Usually the directory and filename together, otherwise just the directory.
+ // Allow the strings to have escaped octal character sequence.
+ if (check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.file' directive") ||
+ parseEscapedString(Path))
+ return true;
+
+ StringRef Directory;
+ StringRef Filename;
+ std::string FilenameData;
+ if (getLexer().is(AsmToken::String)) {
+ if (check(FileNumber == -1,
+ "explicit path specified, but no file number") ||
+ parseEscapedString(FilenameData))
+ return true;
+ Filename = FilenameData;
+ Directory = Path;
+ } else {
+ Filename = Path;
+ }
+
+ uint64_t MD5Hi, MD5Lo;
+ bool HasMD5 = false;
+
+ Optional<StringRef> Source;
+ bool HasSource = false;
+ std::string SourceString;
+
+ while (!parseOptionalToken(AsmToken::EndOfStatement)) {
+ StringRef Keyword;
+ if (check(getTok().isNot(AsmToken::Identifier),
+ "unexpected token in '.file' directive") ||
+ parseIdentifier(Keyword))
+ return true;
+ if (Keyword == "md5") {
+ HasMD5 = true;
+ if (check(FileNumber == -1,
+ "MD5 checksum specified, but no file number") ||
+ parseHexOcta(*this, MD5Hi, MD5Lo))
+ return true;
+ } else if (Keyword == "source") {
+ HasSource = true;
+ if (check(FileNumber == -1,
+ "source specified, but no file number") ||
+ check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.file' directive") ||
+ parseEscapedString(SourceString))
+ return true;
+ } else {
+ return TokError("unexpected token in '.file' directive");
+ }
+ }
+
+ if (FileNumber == -1) {
+ // Ignore the directive if there is no number and the target doesn't support
+ // numberless .file directives. This allows some portability of assembler
+ // between different object file formats.
+ if (getContext().getAsmInfo()->hasSingleParameterDotFile())
+ getStreamer().emitFileDirective(Filename);
+ } else {
+ // If the -g option was given along with debug info from .file directives,
+ // turn off -g and use the existing debug info instead. Throw away any
+ // implicit file table for the assembler source.
+ if (Ctx.getGenDwarfForAssembly()) {
+ Ctx.getMCDwarfLineTable(0).resetFileTable();
+ Ctx.setGenDwarfForAssembly(false);
+ }
+
+ Optional<MD5::MD5Result> CKMem;
+ if (HasMD5) {
+ MD5::MD5Result Sum;
+ for (unsigned i = 0; i != 8; ++i) {
+ Sum.Bytes[i] = uint8_t(MD5Hi >> ((7 - i) * 8));
+ Sum.Bytes[i + 8] = uint8_t(MD5Lo >> ((7 - i) * 8));
+ }
+ CKMem = Sum;
+ }
+ if (HasSource) {
+ char *SourceBuf = static_cast<char *>(Ctx.allocate(SourceString.size()));
+ memcpy(SourceBuf, SourceString.data(), SourceString.size());
+ Source = StringRef(SourceBuf, SourceString.size());
+ }
+ if (FileNumber == 0) {
+ if (Ctx.getDwarfVersion() < 5)
+ return Warning(DirectiveLoc, "file 0 not supported prior to DWARF-5");
+ getStreamer().emitDwarfFile0Directive(Directory, Filename, CKMem, Source);
+ } else {
+ Expected<unsigned> FileNumOrErr = getStreamer().tryEmitDwarfFileDirective(
+ FileNumber, Directory, Filename, CKMem, Source);
+ if (!FileNumOrErr)
+ return Error(DirectiveLoc, toString(FileNumOrErr.takeError()));
+ }
+ // Alert the user if there are some .file directives with MD5 and some not.
+ // But only do that once.
+ if (!ReportedInconsistentMD5 && !Ctx.isDwarfMD5UsageConsistent(0)) {
+ ReportedInconsistentMD5 = true;
+ return Warning(DirectiveLoc, "inconsistent use of MD5 checksums");
+ }
+ }
+
+ return false;
+}
+
+/// parseDirectiveLine
+/// ::= .line [number]
+bool MasmParser::parseDirectiveLine() {
+ int64_t LineNumber;
+ if (getLexer().is(AsmToken::Integer)) {
+ if (parseIntToken(LineNumber, "unexpected token in '.line' directive"))
+ return true;
+ (void)LineNumber;
+ // FIXME: Do something with the .line.
+ }
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.line' directive"))
+ return true;
+
+ return false;
+}
+
+/// parseDirectiveLoc
+/// ::= .loc FileNumber [LineNumber] [ColumnPos] [basic_block] [prologue_end]
+/// [epilogue_begin] [is_stmt VALUE] [isa VALUE]
+/// The first number is a file number, which must have been previously assigned
+/// with a .file directive; the second number is the line number; and the
+/// optional third number is a column position (zero if not specified). The
+/// remaining optional items are .loc sub-directives.
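+///
+/// For illustration:
+///   .loc 1 42 7 prologue_end is_stmt 1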
+bool MasmParser::parseDirectiveLoc() {
+ int64_t FileNumber = 0, LineNumber = 0;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIntToken(FileNumber, "unexpected token in '.loc' directive") ||
+ check(FileNumber < 1 && Ctx.getDwarfVersion() < 5, Loc,
+ "file number less than one in '.loc' directive") ||
+ check(!getContext().isValidDwarfFileNumber(FileNumber), Loc,
+ "unassigned file number in '.loc' directive"))
+ return true;
+
+ // optional
+ if (getLexer().is(AsmToken::Integer)) {
+ LineNumber = getTok().getIntVal();
+ if (LineNumber < 0)
+ return TokError("line number less than zero in '.loc' directive");
+ Lex();
+ }
+
+ int64_t ColumnPos = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ ColumnPos = getTok().getIntVal();
+ if (ColumnPos < 0)
+ return TokError("column position less than zero in '.loc' directive");
+ Lex();
+ }
+
+ auto PrevFlags = getContext().getCurrentDwarfLoc().getFlags();
+ unsigned Flags = PrevFlags & DWARF2_FLAG_IS_STMT;
+ unsigned Isa = 0;
+ int64_t Discriminator = 0;
+
+ auto parseLocOp = [&]() -> bool {
+ StringRef Name;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIdentifier(Name))
+ return TokError("unexpected token in '.loc' directive");
+
+ if (Name == "basic_block")
+ Flags |= DWARF2_FLAG_BASIC_BLOCK;
+ else if (Name == "prologue_end")
+ Flags |= DWARF2_FLAG_PROLOGUE_END;
+ else if (Name == "epilogue_begin")
+ Flags |= DWARF2_FLAG_EPILOGUE_BEGIN;
+ else if (Name == "is_stmt") {
+ Loc = getTok().getLoc();
+ const MCExpr *Value;
+ if (parseExpression(Value))
+ return true;
+ // The expression must be the constant 0 or 1.
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ int Value = MCE->getValue();
+ if (Value == 0)
+ Flags &= ~DWARF2_FLAG_IS_STMT;
+ else if (Value == 1)
+ Flags |= DWARF2_FLAG_IS_STMT;
+ else
+ return Error(Loc, "is_stmt value not 0 or 1");
+ } else {
+ return Error(Loc, "is_stmt value not the constant value of 0 or 1");
+ }
+ } else if (Name == "isa") {
+ Loc = getTok().getLoc();
+ const MCExpr *Value;
+ if (parseExpression(Value))
+ return true;
+ // The expression must be a constant greater or equal to 0.
+ if (const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value)) {
+ int Value = MCE->getValue();
+ if (Value < 0)
+ return Error(Loc, "isa number less than zero");
+ Isa = Value;
+ } else {
+ return Error(Loc, "isa number not a constant value");
+ }
+ } else if (Name == "discriminator") {
+ if (parseAbsoluteExpression(Discriminator))
+ return true;
+ } else {
+ return Error(Loc, "unknown sub-directive in '.loc' directive");
+ }
+ return false;
+ };
+
+ if (parseMany(parseLocOp, false /*hasComma*/))
+ return true;
+
+ getStreamer().emitDwarfLocDirective(FileNumber, LineNumber, ColumnPos, Flags,
+ Isa, Discriminator, StringRef());
+
+ return false;
+}
+
+/// parseDirectiveStabs
+/// ::= .stabs string, number, number, number
+bool MasmParser::parseDirectiveStabs() {
+ return TokError("unsupported directive '.stabs'");
+}
+
+/// parseDirectiveCVFile
+/// ::= .cv_file number filename [checksum] [checksumkind]
+bool MasmParser::parseDirectiveCVFile() {
+ SMLoc FileNumberLoc = getTok().getLoc();
+ int64_t FileNumber;
+ std::string Filename;
+ std::string Checksum;
+ int64_t ChecksumKind = 0;
+
+ if (parseIntToken(FileNumber,
+ "expected file number in '.cv_file' directive") ||
+ check(FileNumber < 1, FileNumberLoc, "file number less than one") ||
+ check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.cv_file' directive") ||
+ parseEscapedString(Filename))
+ return true;
+ if (!parseOptionalToken(AsmToken::EndOfStatement)) {
+ if (check(getTok().isNot(AsmToken::String),
+ "unexpected token in '.cv_file' directive") ||
+ parseEscapedString(Checksum) ||
+ parseIntToken(ChecksumKind,
+ "expected checksum kind in '.cv_file' directive") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.cv_file' directive"))
+ return true;
+ }
+
+ Checksum = fromHex(Checksum);
+ void *CKMem = Ctx.allocate(Checksum.size(), 1);
+ memcpy(CKMem, Checksum.data(), Checksum.size());
+ ArrayRef<uint8_t> ChecksumAsBytes(reinterpret_cast<const uint8_t *>(CKMem),
+ Checksum.size());
+
+ if (!getStreamer().EmitCVFileDirective(FileNumber, Filename, ChecksumAsBytes,
+ static_cast<uint8_t>(ChecksumKind)))
+ return Error(FileNumberLoc, "file number already allocated");
+
+ return false;
+}
+
+bool MasmParser::parseCVFunctionId(int64_t &FunctionId,
+ StringRef DirectiveName) {
+ SMLoc Loc;
+ return parseTokenLoc(Loc) ||
+ parseIntToken(FunctionId, "expected function id in '" + DirectiveName +
+ "' directive") ||
+ check(FunctionId < 0 || FunctionId >= UINT_MAX, Loc,
+ "expected function id within range [0, UINT_MAX)");
+}
+
+bool MasmParser::parseCVFileId(int64_t &FileNumber, StringRef DirectiveName) {
+ SMLoc Loc;
+ return parseTokenLoc(Loc) ||
+ parseIntToken(FileNumber, "expected integer in '" + DirectiveName +
+ "' directive") ||
+ check(FileNumber < 1, Loc, "file number less than one in '" +
+ DirectiveName + "' directive") ||
+ check(!getCVContext().isValidFileNumber(FileNumber), Loc,
+ "unassigned file number in '" + DirectiveName + "' directive");
+}
+
+/// parseDirectiveCVFuncId
+/// ::= .cv_func_id FunctionId
+///
+/// Introduces a function ID that can be used with .cv_loc.
+bool MasmParser::parseDirectiveCVFuncId() {
+ SMLoc FunctionIdLoc = getTok().getLoc();
+ int64_t FunctionId;
+
+ if (parseCVFunctionId(FunctionId, ".cv_func_id") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.cv_func_id' directive"))
+ return true;
+
+ if (!getStreamer().EmitCVFuncIdDirective(FunctionId))
+ return Error(FunctionIdLoc, "function id already allocated");
+
+ return false;
+}
+
+/// parseDirectiveCVInlineSiteId
+/// ::= .cv_inline_site_id FunctionId
+/// "within" IAFunc
+/// "inlined_at" IAFile IALine [IACol]
+///
+/// Introduces a function ID that can be used with .cv_loc. Includes "inlined
+/// at" source location information for use in the line table of the caller,
+/// whether the caller is a real function or another inlined call site.
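+///
+/// For illustration, function id 3 inlined into function id 1 at file 1,
+/// line 10, column 2:
+///   .cv_inline_site_id 3 within 1 inlined_at 1 10 2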
+bool MasmParser::parseDirectiveCVInlineSiteId() {
+ SMLoc FunctionIdLoc = getTok().getLoc();
+ int64_t FunctionId;
+ int64_t IAFunc;
+ int64_t IAFile;
+ int64_t IALine;
+ int64_t IACol = 0;
+
+ // FunctionId
+ if (parseCVFunctionId(FunctionId, ".cv_inline_site_id"))
+ return true;
+
+ // "within"
+ if (check((getLexer().isNot(AsmToken::Identifier) ||
+ getTok().getIdentifier() != "within"),
+ "expected 'within' identifier in '.cv_inline_site_id' directive"))
+ return true;
+ Lex();
+
+ // IAFunc
+ if (parseCVFunctionId(IAFunc, ".cv_inline_site_id"))
+ return true;
+
+ // "inlined_at"
+ if (check((getLexer().isNot(AsmToken::Identifier) ||
+ getTok().getIdentifier() != "inlined_at"),
+ "expected 'inlined_at' identifier in '.cv_inline_site_id' "
+ "directive") )
+ return true;
+ Lex();
+
+ // IAFile IALine
+ if (parseCVFileId(IAFile, ".cv_inline_site_id") ||
+ parseIntToken(IALine, "expected line number after 'inlined_at'"))
+ return true;
+
+ // [IACol]
+ if (getLexer().is(AsmToken::Integer)) {
+ IACol = getTok().getIntVal();
+ Lex();
+ }
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.cv_inline_site_id' directive"))
+ return true;
+
+ if (!getStreamer().EmitCVInlineSiteIdDirective(FunctionId, IAFunc, IAFile,
+ IALine, IACol, FunctionIdLoc))
+ return Error(FunctionIdLoc, "function id already allocated");
+
+ return false;
+}
+
+/// parseDirectiveCVLoc
+/// ::= .cv_loc FunctionId FileNumber [LineNumber] [ColumnPos] [prologue_end]
+/// [is_stmt VALUE]
+/// The first number is a function id, and the second is a file number that
+/// must have been previously assigned with a .cv_file directive. The optional
+/// third number is the line number, and the optional fourth is a column
+/// position (zero if not specified). The remaining optional items are
+/// sub-directives.
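+///
+/// For illustration, function id 2 in file 1 at line 7, column 3:
+///   .cv_loc 2 1 7 3 prologue_end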
+bool MasmParser::parseDirectiveCVLoc() {
+ SMLoc DirectiveLoc = getTok().getLoc();
+ int64_t FunctionId, FileNumber;
+ if (parseCVFunctionId(FunctionId, ".cv_loc") ||
+ parseCVFileId(FileNumber, ".cv_loc"))
+ return true;
+
+ int64_t LineNumber = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ LineNumber = getTok().getIntVal();
+ if (LineNumber < 0)
+ return TokError("line number less than zero in '.cv_loc' directive");
+ Lex();
+ }
+
+ int64_t ColumnPos = 0;
+ if (getLexer().is(AsmToken::Integer)) {
+ ColumnPos = getTok().getIntVal();
+ if (ColumnPos < 0)
+ return TokError("column position less than zero in '.cv_loc' directive");
+ Lex();
+ }
+
+ bool PrologueEnd = false;
+ uint64_t IsStmt = 0;
+
+ auto parseOp = [&]() -> bool {
+ StringRef Name;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIdentifier(Name))
+ return TokError("unexpected token in '.cv_loc' directive");
+ if (Name == "prologue_end")
+ PrologueEnd = true;
+ else if (Name == "is_stmt") {
+ Loc = getTok().getLoc();
+ const MCExpr *Value;
+ if (parseExpression(Value))
+ return true;
+ // The expression must be the constant 0 or 1.
+ IsStmt = ~0ULL;
+ if (const auto *MCE = dyn_cast<MCConstantExpr>(Value))
+ IsStmt = MCE->getValue();
+
+ if (IsStmt > 1)
+ return Error(Loc, "is_stmt value not 0 or 1");
+ } else {
+ return Error(Loc, "unknown sub-directive in '.cv_loc' directive");
+ }
+ return false;
+ };
+
+ if (parseMany(parseOp, false /*hasComma*/))
+ return true;
+
+ getStreamer().emitCVLocDirective(FunctionId, FileNumber, LineNumber,
+ ColumnPos, PrologueEnd, IsStmt, StringRef(),
+ DirectiveLoc);
+ return false;
+}
+
+/// parseDirectiveCVLinetable
+/// ::= .cv_linetable FunctionId, FnStart, FnEnd
+bool MasmParser::parseDirectiveCVLinetable() {
+ int64_t FunctionId;
+ StringRef FnStartName, FnEndName;
+ SMLoc Loc = getTok().getLoc();
+ if (parseCVFunctionId(FunctionId, ".cv_linetable") ||
+ parseToken(AsmToken::Comma,
+ "unexpected token in '.cv_linetable' directive") ||
+ parseTokenLoc(Loc) || check(parseIdentifier(FnStartName), Loc,
+ "expected identifier in directive") ||
+ parseToken(AsmToken::Comma,
+ "unexpected token in '.cv_linetable' directive") ||
+ parseTokenLoc(Loc) || check(parseIdentifier(FnEndName), Loc,
+ "expected identifier in directive"))
+ return true;
+
+ MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
+ MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
+
+ getStreamer().emitCVLinetableDirective(FunctionId, FnStartSym, FnEndSym);
+ return false;
+}
+
+/// parseDirectiveCVInlineLinetable
+/// ::= .cv_inline_linetable PrimaryFunctionId FileId LineNum FnStart FnEnd
+bool MasmParser::parseDirectiveCVInlineLinetable() {
+ int64_t PrimaryFunctionId, SourceFileId, SourceLineNum;
+ StringRef FnStartName, FnEndName;
+ SMLoc Loc = getTok().getLoc();
+ if (parseCVFunctionId(PrimaryFunctionId, ".cv_inline_linetable") ||
+ parseTokenLoc(Loc) ||
+ parseIntToken(
+ SourceFileId,
+ "expected SourceField in '.cv_inline_linetable' directive") ||
+ check(SourceFileId <= 0, Loc,
+ "File id less than zero in '.cv_inline_linetable' directive") ||
+ parseTokenLoc(Loc) ||
+ parseIntToken(
+ SourceLineNum,
+ "expected SourceLineNum in '.cv_inline_linetable' directive") ||
+ check(SourceLineNum < 0, Loc,
+ "Line number less than zero in '.cv_inline_linetable' directive") ||
+ parseTokenLoc(Loc) || check(parseIdentifier(FnStartName), Loc,
+ "expected identifier in directive") ||
+ parseTokenLoc(Loc) || check(parseIdentifier(FnEndName), Loc,
+ "expected identifier in directive"))
+ return true;
+
+ if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+ return true;
+
+ MCSymbol *FnStartSym = getContext().getOrCreateSymbol(FnStartName);
+ MCSymbol *FnEndSym = getContext().getOrCreateSymbol(FnEndName);
+ getStreamer().emitCVInlineLinetableDirective(PrimaryFunctionId, SourceFileId,
+ SourceLineNum, FnStartSym,
+ FnEndSym);
+ return false;
+}
+
+void MasmParser::initializeCVDefRangeTypeMap() {
+ CVDefRangeTypeMap["reg"] = CVDR_DEFRANGE_REGISTER;
+ CVDefRangeTypeMap["frame_ptr_rel"] = CVDR_DEFRANGE_FRAMEPOINTER_REL;
+ CVDefRangeTypeMap["subfield_reg"] = CVDR_DEFRANGE_SUBFIELD_REGISTER;
+ CVDefRangeTypeMap["reg_rel"] = CVDR_DEFRANGE_REGISTER_REL;
+}
+
+/// parseDirectiveCVDefRange
+/// ::= .cv_def_range RangeStart RangeEnd (GapStart GapEnd)*, bytes*
+bool MasmParser::parseDirectiveCVDefRange() {
+ SMLoc Loc;
+ std::vector<std::pair<const MCSymbol *, const MCSymbol *>> Ranges;
+ while (getLexer().is(AsmToken::Identifier)) {
+ Loc = getLexer().getLoc();
+ StringRef GapStartName;
+ if (parseIdentifier(GapStartName))
+ return Error(Loc, "expected identifier in directive");
+ MCSymbol *GapStartSym = getContext().getOrCreateSymbol(GapStartName);
+
+ Loc = getLexer().getLoc();
+ StringRef GapEndName;
+ if (parseIdentifier(GapEndName))
+ return Error(Loc, "expected identifier in directive");
+ MCSymbol *GapEndSym = getContext().getOrCreateSymbol(GapEndName);
+
+ Ranges.push_back({GapStartSym, GapEndSym});
+ }
+
+ StringRef CVDefRangeTypeStr;
+ if (parseToken(
+ AsmToken::Comma,
+ "expected comma before def_range type in .cv_def_range directive") ||
+ parseIdentifier(CVDefRangeTypeStr))
+ return Error(Loc, "expected def_range type in directive");
+
+ StringMap<CVDefRangeType>::const_iterator CVTypeIt =
+ CVDefRangeTypeMap.find(CVDefRangeTypeStr);
+ CVDefRangeType CVDRType = (CVTypeIt == CVDefRangeTypeMap.end())
+ ? CVDR_DEFRANGE
+ : CVTypeIt->getValue();
+ switch (CVDRType) {
+ case CVDR_DEFRANGE_REGISTER: {
+ int64_t DRRegister;
+ if (parseToken(AsmToken::Comma, "expected comma before register number in "
+ ".cv_def_range directive") ||
+ parseAbsoluteExpression(DRRegister))
+ return Error(Loc, "expected register number");
+
+ codeview::DefRangeRegisterHeader DRHdr;
+ DRHdr.Register = DRRegister;
+ DRHdr.MayHaveNoName = 0;
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
+ break;
+ }
+ case CVDR_DEFRANGE_FRAMEPOINTER_REL: {
+ int64_t DROffset;
+ if (parseToken(AsmToken::Comma,
+ "expected comma before offset in .cv_def_range directive") ||
+ parseAbsoluteExpression(DROffset))
+ return Error(Loc, "expected offset value");
+
+ codeview::DefRangeFramePointerRelHeader DRHdr;
+ DRHdr.Offset = DROffset;
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
+ break;
+ }
+ case CVDR_DEFRANGE_SUBFIELD_REGISTER: {
+ int64_t DRRegister;
+ int64_t DROffsetInParent;
+ if (parseToken(AsmToken::Comma, "expected comma before register number in "
+ ".cv_def_range directive") ||
+ parseAbsoluteExpression(DRRegister))
+ return Error(Loc, "expected register number");
+ if (parseToken(AsmToken::Comma,
+ "expected comma before offset in .cv_def_range directive") ||
+ parseAbsoluteExpression(DROffsetInParent))
+ return Error(Loc, "expected offset value");
+
+ codeview::DefRangeSubfieldRegisterHeader DRHdr;
+ DRHdr.Register = DRRegister;
+ DRHdr.MayHaveNoName = 0;
+ DRHdr.OffsetInParent = DROffsetInParent;
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
+ break;
+ }
+ case CVDR_DEFRANGE_REGISTER_REL: {
+ int64_t DRRegister;
+ int64_t DRFlags;
+ int64_t DRBasePointerOffset;
+ if (parseToken(AsmToken::Comma, "expected comma before register number in "
+ ".cv_def_range directive") ||
+ parseAbsoluteExpression(DRRegister))
+ return Error(Loc, "expected register value");
+ if (parseToken(
+ AsmToken::Comma,
+ "expected comma before flag value in .cv_def_range directive") ||
+ parseAbsoluteExpression(DRFlags))
+ return Error(Loc, "expected flag value");
+ if (parseToken(AsmToken::Comma, "expected comma before base pointer offset "
+ "in .cv_def_range directive") ||
+ parseAbsoluteExpression(DRBasePointerOffset))
+ return Error(Loc, "expected base pointer offset value");
+
+ codeview::DefRangeRegisterRelHeader DRHdr;
+ DRHdr.Register = DRRegister;
+ DRHdr.Flags = DRFlags;
+ DRHdr.BasePointerOffset = DRBasePointerOffset;
+ getStreamer().emitCVDefRangeDirective(Ranges, DRHdr);
+ break;
+ }
+ default:
+ return Error(Loc, "unexpected def_range type in .cv_def_range directive");
+ }
+ return false;
+}
+
+/// parseDirectiveCVString
+/// ::= .cv_stringtable "string"
+bool MasmParser::parseDirectiveCVString() {
+ std::string Data;
+ if (checkForValidSection() || parseEscapedString(Data))
+ return addErrorSuffix(" in '.cv_string' directive");
+
+ // Put the string in the table and emit the offset.
+ std::pair<StringRef, unsigned> Insertion =
+ getCVContext().addToStringTable(Data);
+ getStreamer().emitIntValue(Insertion.second, 4);
+ return false;
+}
+
+/// parseDirectiveCVStringTable
+/// ::= .cv_stringtable
+bool MasmParser::parseDirectiveCVStringTable() {
+ getStreamer().emitCVStringTableDirective();
+ return false;
+}
+
+/// parseDirectiveCVFileChecksums
+/// ::= .cv_filechecksums
+bool MasmParser::parseDirectiveCVFileChecksums() {
+ getStreamer().emitCVFileChecksumsDirective();
+ return false;
+}
+
+/// parseDirectiveCVFileChecksumOffset
+/// ::= .cv_filechecksumoffset fileno
+bool MasmParser::parseDirectiveCVFileChecksumOffset() {
+ int64_t FileNo;
+ if (parseIntToken(FileNo, "expected identifier in directive"))
+ return true;
+ if (parseToken(AsmToken::EndOfStatement, "Expected End of Statement"))
+ return true;
+ getStreamer().emitCVFileChecksumOffsetDirective(FileNo);
+ return false;
+}
+
+/// parseDirectiveCVFPOData
+/// ::= .cv_fpo_data procsym
+bool MasmParser::parseDirectiveCVFPOData() {
+ SMLoc DirLoc = getLexer().getLoc();
+ StringRef ProcName;
+ if (parseIdentifier(ProcName))
+ return TokError("expected symbol name");
+ if (parseEOL("unexpected tokens"))
+ return addErrorSuffix(" in '.cv_fpo_data' directive");
+ MCSymbol *ProcSym = getContext().getOrCreateSymbol(ProcName);
+ getStreamer().EmitCVFPOData(ProcSym, DirLoc);
+ return false;
+}
+
+/// parseDirectiveCFISections
+/// ::= .cfi_sections section [, section]
+bool MasmParser::parseDirectiveCFISections() {
+ StringRef Name;
+ bool EH = false;
+ bool Debug = false;
+
+ if (parseIdentifier(Name))
+ return TokError("Expected an identifier");
+
+ if (Name == ".eh_frame")
+ EH = true;
+ else if (Name == ".debug_frame")
+ Debug = true;
+
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+
+ if (parseIdentifier(Name))
+ return TokError("Expected an identifier");
+
+ if (Name == ".eh_frame")
+ EH = true;
+ else if (Name == ".debug_frame")
+ Debug = true;
+ }
+
+ getStreamer().emitCFISections(EH, Debug);
+ return false;
+}
+
+/// parseDirectiveCFIStartProc
+/// ::= .cfi_startproc [simple]
+bool MasmParser::parseDirectiveCFIStartProc() {
+ StringRef Simple;
+ if (!parseOptionalToken(AsmToken::EndOfStatement)) {
+ if (check(parseIdentifier(Simple) || Simple != "simple",
+ "unexpected token") ||
+ parseToken(AsmToken::EndOfStatement))
+ return addErrorSuffix(" in '.cfi_startproc' directive");
+ }
+
+ // TODO(kristina): Deal with a corner case of incorrect diagnostic context
+ // being produced if this directive is emitted as part of preprocessor macro
+ // expansion which can *ONLY* happen if Clang's cc1as is the API consumer.
+ // Tools like llvm-mc on the other hand are not affected by it, and report
+ // correct context information.
+ getStreamer().emitCFIStartProc(!Simple.empty(), Lexer.getLoc());
+ return false;
+}
+
+/// parseDirectiveCFIEndProc
+/// ::= .cfi_endproc
+bool MasmParser::parseDirectiveCFIEndProc() {
+ getStreamer().emitCFIEndProc();
+ return false;
+}
+
+/// Parse a register name or number.
+bool MasmParser::parseRegisterOrRegisterNumber(int64_t &Register,
+ SMLoc DirectiveLoc) {
+ unsigned RegNo;
+
+ if (getLexer().isNot(AsmToken::Integer)) {
+ if (getTargetParser().ParseRegister(RegNo, DirectiveLoc, DirectiveLoc))
+ return true;
+ Register = getContext().getRegisterInfo()->getDwarfRegNum(RegNo, true);
+ } else
+ return parseAbsoluteExpression(Register);
+
+ return false;
+}
+
+/// parseDirectiveCFIDefCfa
+/// ::= .cfi_def_cfa register, offset
+bool MasmParser::parseDirectiveCFIDefCfa(SMLoc DirectiveLoc) {
+ int64_t Register = 0, Offset = 0;
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseAbsoluteExpression(Offset))
+ return true;
+
+ getStreamer().emitCFIDefCfa(Register, Offset);
+ return false;
+}
+
+/// parseDirectiveCFIDefCfaOffset
+/// ::= .cfi_def_cfa_offset offset
+bool MasmParser::parseDirectiveCFIDefCfaOffset() {
+ int64_t Offset = 0;
+ if (parseAbsoluteExpression(Offset))
+ return true;
+
+ getStreamer().emitCFIDefCfaOffset(Offset);
+ return false;
+}
+
+/// parseDirectiveCFIRegister
+/// ::= .cfi_register register, register
+bool MasmParser::parseDirectiveCFIRegister(SMLoc DirectiveLoc) {
+ int64_t Register1 = 0, Register2 = 0;
+ if (parseRegisterOrRegisterNumber(Register1, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseRegisterOrRegisterNumber(Register2, DirectiveLoc))
+ return true;
+
+ getStreamer().emitCFIRegister(Register1, Register2);
+ return false;
+}
+
+/// parseDirectiveCFIWindowSave
+/// ::= .cfi_window_save
+bool MasmParser::parseDirectiveCFIWindowSave() {
+ getStreamer().emitCFIWindowSave();
+ return false;
+}
+
+/// parseDirectiveCFIAdjustCfaOffset
+/// ::= .cfi_adjust_cfa_offset adjustment
+bool MasmParser::parseDirectiveCFIAdjustCfaOffset() {
+ int64_t Adjustment = 0;
+ if (parseAbsoluteExpression(Adjustment))
+ return true;
+
+ getStreamer().emitCFIAdjustCfaOffset(Adjustment);
+ return false;
+}
+
+/// parseDirectiveCFIDefCfaRegister
+/// ::= .cfi_def_cfa_register register
+bool MasmParser::parseDirectiveCFIDefCfaRegister(SMLoc DirectiveLoc) {
+ int64_t Register = 0;
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ return true;
+
+ getStreamer().emitCFIDefCfaRegister(Register);
+ return false;
+}
+
+/// parseDirectiveCFIOffset
+/// ::= .cfi_offset register, offset
+bool MasmParser::parseDirectiveCFIOffset(SMLoc DirectiveLoc) {
+ int64_t Register = 0;
+ int64_t Offset = 0;
+
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseAbsoluteExpression(Offset))
+ return true;
+
+ getStreamer().emitCFIOffset(Register, Offset);
+ return false;
+}
+
+/// parseDirectiveCFIRelOffset
+/// ::= .cfi_rel_offset register, offset
+bool MasmParser::parseDirectiveCFIRelOffset(SMLoc DirectiveLoc) {
+ int64_t Register = 0, Offset = 0;
+
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc) ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ parseAbsoluteExpression(Offset))
+ return true;
+
+ getStreamer().emitCFIRelOffset(Register, Offset);
+ return false;
+}
+
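+// Check whether an encoding value is an acceptable DWARF exception-handling
+// pointer encoding: DW_EH_PE_omit, or one of the listed formats combined with
+// an absptr or pcrel application. For example, DW_EH_PE_pcrel |
+// DW_EH_PE_sdata4 (0x1b) is accepted, while any value with bits set outside
+// the low byte is rejected.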
+static bool isValidEncoding(int64_t Encoding) {
+ if (Encoding & ~0xff)
+ return false;
+
+ if (Encoding == dwarf::DW_EH_PE_omit)
+ return true;
+
+ const unsigned Format = Encoding & 0xf;
+ if (Format != dwarf::DW_EH_PE_absptr && Format != dwarf::DW_EH_PE_udata2 &&
+ Format != dwarf::DW_EH_PE_udata4 && Format != dwarf::DW_EH_PE_udata8 &&
+ Format != dwarf::DW_EH_PE_sdata2 && Format != dwarf::DW_EH_PE_sdata4 &&
+ Format != dwarf::DW_EH_PE_sdata8 && Format != dwarf::DW_EH_PE_signed)
+ return false;
+
+ const unsigned Application = Encoding & 0x70;
+ if (Application != dwarf::DW_EH_PE_absptr &&
+ Application != dwarf::DW_EH_PE_pcrel)
+ return false;
+
+ return true;
+}
+
+/// parseDirectiveCFIPersonalityOrLsda
+/// IsPersonality true for cfi_personality, false for cfi_lsda
+/// ::= .cfi_personality encoding, [symbol_name]
+/// ::= .cfi_lsda encoding, [symbol_name]
+bool MasmParser::parseDirectiveCFIPersonalityOrLsda(bool IsPersonality) {
+ int64_t Encoding = 0;
+ if (parseAbsoluteExpression(Encoding))
+ return true;
+ if (Encoding == dwarf::DW_EH_PE_omit)
+ return false;
+
+ StringRef Name;
+ if (check(!isValidEncoding(Encoding), "unsupported encoding.") ||
+ parseToken(AsmToken::Comma, "unexpected token in directive") ||
+ check(parseIdentifier(Name), "expected identifier in directive"))
+ return true;
+
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+ if (IsPersonality)
+ getStreamer().emitCFIPersonality(Sym, Encoding);
+ else
+ getStreamer().emitCFILsda(Sym, Encoding);
+ return false;
+}
+
+/// parseDirectiveCFIRememberState
+/// ::= .cfi_remember_state
+bool MasmParser::parseDirectiveCFIRememberState() {
+ getStreamer().emitCFIRememberState();
+ return false;
+}
+
+/// parseDirectiveCFIRestoreState
+/// ::= .cfi_restore_state
+bool MasmParser::parseDirectiveCFIRestoreState() {
+ getStreamer().emitCFIRestoreState();
+ return false;
+}
+
+/// parseDirectiveCFISameValue
+/// ::= .cfi_same_value register
+bool MasmParser::parseDirectiveCFISameValue(SMLoc DirectiveLoc) {
+ int64_t Register = 0;
+
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ return true;
+
+ getStreamer().emitCFISameValue(Register);
+ return false;
+}
+
+/// parseDirectiveCFIRestore
+/// ::= .cfi_restore register
+bool MasmParser::parseDirectiveCFIRestore(SMLoc DirectiveLoc) {
+ int64_t Register = 0;
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ return true;
+
+ getStreamer().emitCFIRestore(Register);
+ return false;
+}
+
+/// parseDirectiveCFIEscape
+/// ::= .cfi_escape expression[,...]
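+/// Each operand must be an absolute expression; only the low byte of each
+/// value is kept. For example (illustrative), ".cfi_escape 6, 16" appends the
+/// two raw bytes 6 and 16 to the current frame's CFI instructions.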
+bool MasmParser::parseDirectiveCFIEscape() {
+ std::string Values;
+ int64_t CurrValue;
+ if (parseAbsoluteExpression(CurrValue))
+ return true;
+
+ Values.push_back((uint8_t)CurrValue);
+
+ while (getLexer().is(AsmToken::Comma)) {
+ Lex();
+
+ if (parseAbsoluteExpression(CurrValue))
+ return true;
+
+ Values.push_back((uint8_t)CurrValue);
+ }
+
+ getStreamer().emitCFIEscape(Values);
+ return false;
+}
+
+/// parseDirectiveCFIReturnColumn
+/// ::= .cfi_return_column register
+bool MasmParser::parseDirectiveCFIReturnColumn(SMLoc DirectiveLoc) {
+ int64_t Register = 0;
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ return true;
+ getStreamer().emitCFIReturnColumn(Register);
+ return false;
+}
+
+/// parseDirectiveCFISignalFrame
+/// ::= .cfi_signal_frame
+bool MasmParser::parseDirectiveCFISignalFrame() {
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.cfi_signal_frame'"))
+ return true;
+
+ getStreamer().emitCFISignalFrame();
+ return false;
+}
+
+/// parseDirectiveCFIUndefined
+/// ::= .cfi_undefined register
+bool MasmParser::parseDirectiveCFIUndefined(SMLoc DirectiveLoc) {
+ int64_t Register = 0;
+
+ if (parseRegisterOrRegisterNumber(Register, DirectiveLoc))
+ return true;
+
+ getStreamer().emitCFIUndefined(Register);
+ return false;
+}
+
+/// parseDirectiveAltmacro
+/// ::= .altmacro
+/// ::= .noaltmacro
+bool MasmParser::parseDirectiveAltmacro(StringRef Directive) {
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + Directive + "' directive");
+ AltMacroMode = (Directive == ".altmacro");
+ return false;
+}
+
+/// parseDirectiveMacro
+/// ::= .macro name[,] [parameters]
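+/// A parameter may be qualified with ":req" (an argument must be supplied) or
+/// ":vararg" (collects the remaining arguments and must come last), and may be
+/// given a default with "=value". Illustrative definition (the body
+/// instruction is arbitrary):
+///   .macro store dst:req, val=0
+///   mov dword ptr [\dst], \val
+///   .endm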
+bool MasmParser::parseDirectiveMacro(SMLoc DirectiveLoc) {
+ StringRef Name;
+ if (parseIdentifier(Name))
+ return TokError("expected identifier in '.macro' directive");
+
+ if (getLexer().is(AsmToken::Comma))
+ Lex();
+
+ MCAsmMacroParameters Parameters;
+ while (getLexer().isNot(AsmToken::EndOfStatement)) {
+
+ if (!Parameters.empty() && Parameters.back().Vararg)
+ return Error(Lexer.getLoc(),
+ "Vararg parameter '" + Parameters.back().Name +
+ "' should be last one in the list of parameters.");
+
+ MCAsmMacroParameter Parameter;
+ if (parseIdentifier(Parameter.Name))
+ return TokError("expected identifier in '.macro' directive");
+
+ // Emit an error if two (or more) named parameters share the same name.
+ for (const MCAsmMacroParameter& CurrParam : Parameters)
+ if (CurrParam.Name.equals(Parameter.Name))
+ return TokError("macro '" + Name + "' has multiple parameters"
+ " named '" + Parameter.Name + "'");
+
+ if (Lexer.is(AsmToken::Colon)) {
+ Lex(); // consume ':'
+
+ SMLoc QualLoc;
+ StringRef Qualifier;
+
+ QualLoc = Lexer.getLoc();
+ if (parseIdentifier(Qualifier))
+ return Error(QualLoc, "missing parameter qualifier for "
+ "'" + Parameter.Name + "' in macro '" + Name + "'");
+
+ if (Qualifier == "req")
+ Parameter.Required = true;
+ else if (Qualifier == "vararg")
+ Parameter.Vararg = true;
+ else
+ return Error(QualLoc, Qualifier + " is not a valid parameter qualifier "
+ "for '" + Parameter.Name + "' in macro '" + Name + "'");
+ }
+
+ if (getLexer().is(AsmToken::Equal)) {
+ Lex();
+
+ SMLoc ParamLoc;
+
+ ParamLoc = Lexer.getLoc();
+      if (parseMacroArgument(Parameter.Value, /*Vararg=*/false))
+ return true;
+
+ if (Parameter.Required)
+ Warning(ParamLoc, "pointless default value for required parameter "
+ "'" + Parameter.Name + "' in macro '" + Name + "'");
+ }
+
+ Parameters.push_back(std::move(Parameter));
+
+ if (getLexer().is(AsmToken::Comma))
+ Lex();
+ }
+
+ // Eat just the end of statement.
+ Lexer.Lex();
+
+ // Consuming deferred text, so use Lexer.Lex to ignore Lexing Errors.
+ AsmToken EndToken, StartToken = getTok();
+ unsigned MacroDepth = 0;
+ // Lex the macro definition.
+ while (true) {
+ // Ignore Lexing errors in macros.
+ while (Lexer.is(AsmToken::Error)) {
+ Lexer.Lex();
+ }
+
+ // Check whether we have reached the end of the file.
+ if (getLexer().is(AsmToken::Eof))
+ return Error(DirectiveLoc, "no matching '.endmacro' in definition");
+
+    // Otherwise, check whether we have reached the .endmacro.
+ if (getLexer().is(AsmToken::Identifier)) {
+ if (getTok().getIdentifier() == ".endm" ||
+ getTok().getIdentifier() == ".endmacro") {
+ if (MacroDepth == 0) { // Outermost macro.
+ EndToken = getTok();
+ Lexer.Lex();
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + EndToken.getIdentifier() +
+ "' directive");
+ break;
+ } else {
+ // Otherwise we just found the end of an inner macro.
+ --MacroDepth;
+ }
+ } else if (getTok().getIdentifier() == ".macro") {
+ // We allow nested macros. Those aren't instantiated until the outermost
+ // macro is expanded so just ignore them for now.
+ ++MacroDepth;
+ }
+ }
+
+    // Otherwise, scan to the end of the statement.
+ eatToEndOfStatement();
+ }
+
+ if (getContext().lookupMacro(Name)) {
+ return Error(DirectiveLoc, "macro '" + Name + "' is already defined");
+ }
+
+ const char *BodyStart = StartToken.getLoc().getPointer();
+ const char *BodyEnd = EndToken.getLoc().getPointer();
+ StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
+ checkForBadMacro(DirectiveLoc, Name, Body, Parameters);
+ MCAsmMacro Macro(Name, Body, std::move(Parameters));
+ DEBUG_WITH_TYPE("asm-macros", dbgs() << "Defining new macro:\n";
+ Macro.dump());
+ getContext().defineMacro(Name, std::move(Macro));
+ return false;
+}
+
+/// checkForBadMacro
+///
+/// With the support added for named parameters, there may be code out there
+/// that is transitioning from positional parameters. In versions of gas that
+/// did not support named parameters they were simply ignored in the macro
+/// definition, but supporting both styles at once is not possible. So if a
+/// macro definition has named parameters but never uses them, and its body
+/// contains what appear to be positional parameters (strings like $1, $2, ...
+/// and $n), issue a warning that the positional parameters found in the body
+/// will have no effect. The hope is that the developer will either remove the
+/// named parameters from the macro definition (so the positional parameters
+/// are used, if that was the intent) or change the macro to use the named
+/// parameters. It is possible this warning will trigger even when none of the
+/// named parameters are used and strings like $1 are in fact simply meant to
+/// be passed through unchanged.
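+/// For example (illustrative), a definition that names a parameter "count"
+/// but never references "\count", while its body uses "$1", triggers this
+/// warning, since "$1" is never substituted.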
+void MasmParser::checkForBadMacro(SMLoc DirectiveLoc, StringRef Name,
+ StringRef Body,
+ ArrayRef<MCAsmMacroParameter> Parameters) {
+ // If this macro is not defined with named parameters the warning we are
+ // checking for here doesn't apply.
+ unsigned NParameters = Parameters.size();
+ if (NParameters == 0)
+ return;
+
+ bool NamedParametersFound = false;
+ bool PositionalParametersFound = false;
+
+ // Look at the body of the macro for use of both the named parameters and what
+ // are likely to be positional parameters. This is what expandMacro() is
+ // doing when it finds the parameters in the body.
+ while (!Body.empty()) {
+ // Scan for the next possible parameter.
+ std::size_t End = Body.size(), Pos = 0;
+ for (; Pos != End; ++Pos) {
+ // Check for a substitution or escape.
+ // This macro is defined with parameters, look for \foo, \bar, etc.
+ if (Body[Pos] == '\\' && Pos + 1 != End)
+ break;
+
+ // This macro should have parameters, but look for $0, $1, ..., $n too.
+ if (Body[Pos] != '$' || Pos + 1 == End)
+ continue;
+ char Next = Body[Pos + 1];
+ if (Next == '$' || Next == 'n' ||
+ isdigit(static_cast<unsigned char>(Next)))
+ break;
+ }
+
+ // Check if we reached the end.
+ if (Pos == End)
+ break;
+
+ if (Body[Pos] == '$') {
+ switch (Body[Pos + 1]) {
+ // $$ => $
+ case '$':
+ break;
+
+ // $n => number of arguments
+ case 'n':
+ PositionalParametersFound = true;
+ break;
+
+ // $[0-9] => argument
+ default: {
+ PositionalParametersFound = true;
+ break;
+ }
+ }
+ Pos += 2;
+ } else {
+ unsigned I = Pos + 1;
+ while (isIdentifierChar(Body[I]) && I + 1 != End)
+ ++I;
+
+ const char *Begin = Body.data() + Pos + 1;
+ StringRef Argument(Begin, I - (Pos + 1));
+ unsigned Index = 0;
+ for (; Index < NParameters; ++Index)
+ if (Parameters[Index].Name == Argument)
+ break;
+
+ if (Index == NParameters) {
+ if (Body[Pos + 1] == '(' && Body[Pos + 2] == ')')
+ Pos += 3;
+ else {
+ Pos = I;
+ }
+ } else {
+ NamedParametersFound = true;
+ Pos += 1 + Argument.size();
+ }
+ }
+ // Update the scan point.
+ Body = Body.substr(Pos);
+ }
+
+ if (!NamedParametersFound && PositionalParametersFound)
+ Warning(DirectiveLoc, "macro defined with named parameters which are not "
+ "used in macro body, possible positional parameter "
+ "found in body which will have no effect");
+}
+
+/// parseDirectiveExitMacro
+/// ::= .exitm
+bool MasmParser::parseDirectiveExitMacro(StringRef Directive) {
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '" + Directive + "' directive"))
+ return true;
+
+ if (!isInsideMacroInstantiation())
+ return TokError("unexpected '" + Directive + "' in file, "
+ "no current macro definition");
+
+ // Exit all conditionals that are active in the current macro.
+ while (TheCondStack.size() != ActiveMacros.back()->CondStackDepth) {
+ TheCondState = TheCondStack.back();
+ TheCondStack.pop_back();
+ }
+
+ handleMacroExit();
+ return false;
+}
+
+/// parseDirectiveEndMacro
+/// ::= .endm
+/// ::= .endmacro
+bool MasmParser::parseDirectiveEndMacro(StringRef Directive) {
+ if (getLexer().isNot(AsmToken::EndOfStatement))
+ return TokError("unexpected token in '" + Directive + "' directive");
+
+ // If we are inside a macro instantiation, terminate the current
+ // instantiation.
+ if (isInsideMacroInstantiation()) {
+ handleMacroExit();
+ return false;
+ }
+
+ // Otherwise, this .endmacro is a stray entry in the file; well formed
+ // .endmacro directives are handled during the macro definition parsing.
+ return TokError("unexpected '" + Directive + "' in file, "
+ "no current macro definition");
+}
+
+/// parseDirectivePurgeMacro
+/// ::= .purgem
+bool MasmParser::parseDirectivePurgeMacro(SMLoc DirectiveLoc) {
+ StringRef Name;
+ SMLoc Loc;
+ if (parseTokenLoc(Loc) ||
+ check(parseIdentifier(Name), Loc,
+ "expected identifier in '.purgem' directive") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.purgem' directive"))
+ return true;
+
+ if (!getContext().lookupMacro(Name))
+ return Error(DirectiveLoc, "macro '" + Name + "' is not defined");
+
+ getContext().undefineMacro(Name);
+ DEBUG_WITH_TYPE("asm-macros", dbgs()
+ << "Un-defining macro: " << Name << "\n");
+ return false;
+}
+
+/// parseDirectiveSymbolAttribute
+/// ::= { ".globl", ".weak", ... } [ identifier ( , identifier )* ]
+bool MasmParser::parseDirectiveSymbolAttribute(MCSymbolAttr Attr) {
+ auto parseOp = [&]() -> bool {
+ StringRef Name;
+ SMLoc Loc = getTok().getLoc();
+ if (parseIdentifier(Name))
+ return Error(Loc, "expected identifier");
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+ // Assembler local symbols don't make any sense here. Complain loudly.
+ if (Sym->isTemporary())
+ return Error(Loc, "non-local symbol required");
+
+ if (!getStreamer().emitSymbolAttribute(Sym, Attr))
+ return Error(Loc, "unable to emit symbol attribute");
+ return false;
+ };
+
+ if (parseMany(parseOp))
+ return addErrorSuffix(" in directive");
+ return false;
+}
+
+/// parseDirectiveComm
+/// ::= ( .comm | .lcomm ) identifier , size_expression [ , align_expression ]
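+/// For example (illustrative), ".comm buf, 64, 4" requests a 64-byte common
+/// symbol named "buf". The alignment operand is treated as a log2 value by
+/// default (2^4 = 16-byte alignment here); targets that take byte alignments
+/// instead require a power of two and convert it to its log2 (see below).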
+bool MasmParser::parseDirectiveComm(bool IsLocal) {
+ if (checkForValidSection())
+ return true;
+
+ SMLoc IDLoc = getLexer().getLoc();
+ StringRef Name;
+ if (parseIdentifier(Name))
+ return TokError("expected identifier in directive");
+
+ // Handle the identifier as the key symbol.
+ MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
+
+ if (getLexer().isNot(AsmToken::Comma))
+ return TokError("unexpected token in directive");
+ Lex();
+
+ int64_t Size;
+ SMLoc SizeLoc = getLexer().getLoc();
+ if (parseAbsoluteExpression(Size))
+ return true;
+
+ int64_t Pow2Alignment = 0;
+ SMLoc Pow2AlignmentLoc;
+ if (getLexer().is(AsmToken::Comma)) {
+ Lex();
+ Pow2AlignmentLoc = getLexer().getLoc();
+ if (parseAbsoluteExpression(Pow2Alignment))
+ return true;
+
+ LCOMM::LCOMMType LCOMM = Lexer.getMAI().getLCOMMDirectiveAlignmentType();
+ if (IsLocal && LCOMM == LCOMM::NoAlignment)
+ return Error(Pow2AlignmentLoc, "alignment not supported on this target");
+
+ // If this target takes alignments in bytes (not log) validate and convert.
+ if ((!IsLocal && Lexer.getMAI().getCOMMDirectiveAlignmentIsInBytes()) ||
+ (IsLocal && LCOMM == LCOMM::ByteAlignment)) {
+ if (!isPowerOf2_64(Pow2Alignment))
+ return Error(Pow2AlignmentLoc, "alignment must be a power of 2");
+ Pow2Alignment = Log2_64(Pow2Alignment);
+ }
+ }
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.comm' or '.lcomm' directive"))
+ return true;
+
+  // NOTE: a size of zero for a .comm should create an undefined symbol
+  // but a zero-size .lcomm creates a bss symbol of size zero.
+ if (Size < 0)
+ return Error(SizeLoc, "invalid '.comm' or '.lcomm' directive size, can't "
+ "be less than zero");
+
+  // NOTE: The alignment in the directive is a power of 2 value; the assembler
+  // may internally end up wanting an alignment in bytes.
+ // FIXME: Diagnose overflow.
+ if (Pow2Alignment < 0)
+ return Error(Pow2AlignmentLoc, "invalid '.comm' or '.lcomm' directive "
+ "alignment, can't be less than zero");
+
+ Sym->redefineIfPossible();
+ if (!Sym->isUndefined())
+ return Error(IDLoc, "invalid symbol redefinition");
+
+ // Create the Symbol as a common or local common with Size and Pow2Alignment.
+ if (IsLocal) {
+ getStreamer().emitLocalCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+ return false;
+ }
+
+ getStreamer().emitCommonSymbol(Sym, Size, 1 << Pow2Alignment);
+ return false;
+}
+
+/// parseDirectiveComment
+/// ::= comment delimiter [[text]]
+/// [[text]]
+/// [[text]] delimiter [[text]]
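+/// The first whitespace-delimited token after "comment" is the delimiter;
+/// everything up to and including the next line that contains the delimiter
+/// is discarded. Illustrative use:
+///   comment ~
+///   these lines are ignored by the assembler
+///   ~ text after the closing delimiter on this line is ignored as well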
+bool MasmParser::parseDirectiveComment(SMLoc DirectiveLoc) {
+ StringRef FirstLine = parseStringToEndOfStatement();
+ size_t DelimiterEnd = FirstLine.find_first_of("\b\t\v\f\r\x1A ");
+ StringRef Delimiter = FirstLine.take_front(DelimiterEnd);
+ if (Delimiter.empty())
+ return Error(DirectiveLoc, "no delimiter in 'comment' directive");
+ do {
+ if (getTok().is(AsmToken::Eof))
+ return Error(DirectiveLoc, "unmatched delimiter in 'comment' directive");
+ Lex(); // eat end of statement
+ } while (!parseStringToEndOfStatement().contains(Delimiter));
+ return parseToken(AsmToken::EndOfStatement,
+ "unexpected token in 'comment' directive");
+}
+
+/// parseDirectiveInclude
+/// ::= include <filename>
+/// | include filename
+bool MasmParser::parseDirectiveInclude() {
+ // Allow the strings to have escaped octal character sequence.
+ std::string Filename;
+ SMLoc IncludeLoc = getTok().getLoc();
+
+ if (!parseAngleBracketString(Filename))
+ Filename = parseStringToEndOfStatement().str();
+ if (check(!Filename.empty(), "missing filename in 'include' directive") ||
+ check(getTok().isNot(AsmToken::EndOfStatement),
+ "unexpected token in 'include' directive") ||
+ // Attempt to switch the lexer to the included file before consuming the
+ // end of statement to avoid losing it when we switch.
+ check(enterIncludeFile(Filename), IncludeLoc,
+ "Could not find include file '" + Filename + "'"))
+ return true;
+
+ return false;
+}
+
+/// parseDirectiveIf
+/// ::= .if{,e} expression
+bool MasmParser::parseDirectiveIf(SMLoc DirectiveLoc, DirectiveKind DirKind) {
+ TheCondStack.push_back(TheCondState);
+ TheCondState.TheCond = AsmCond::IfCond;
+ if (TheCondState.Ignore) {
+ eatToEndOfStatement();
+ } else {
+ int64_t ExprValue;
+ if (parseAbsoluteExpression(ExprValue) ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.if' directive"))
+ return true;
+
+ switch (DirKind) {
+ default:
+ llvm_unreachable("unsupported directive");
+ case DK_IF:
+ break;
+ case DK_IFE:
+ ExprValue = ExprValue == 0;
+ break;
+ }
+
+ TheCondState.CondMet = ExprValue;
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveIfb
+/// ::= .ifb string
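+/// The condition is met when the text item is blank (ifb) or non-blank (ifnb).
+/// For example (illustrative), "ifb <>" assembles the guarded block, while
+/// "ifb <x>" does not.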
+bool MasmParser::parseDirectiveIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+ TheCondStack.push_back(TheCondState);
+ TheCondState.TheCond = AsmCond::IfCond;
+
+ if (TheCondState.Ignore) {
+ eatToEndOfStatement();
+ } else {
+ std::string Str;
+ if (parseTextItem(Str))
+ return TokError("expected string parameter for 'ifb' directive");
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in 'ifb' directive"))
+ return true;
+
+ TheCondState.CondMet = ExpectBlank == Str.empty();
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveIfidn
+/// ::= ifidn string1, string2
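+/// The condition is met when the two text items are identical (ifidn) or
+/// different (ifdif); the "i"-suffixed forms compare case-insensitively. For
+/// example (illustrative), "ifidni <AX>, <ax>" is met, while
+/// "ifidn <AX>, <ax>" is not.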
+bool MasmParser::parseDirectiveIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
+                                     bool CaseInsensitive) {
+ std::string String1, String2;
+
+ if (parseTextItem(String1)) {
+ if (ExpectEqual)
+ return TokError("expected string parameter for 'ifidn' directive");
+ return TokError("expected string parameter for 'ifdif' directive");
+ }
+
+ if (Lexer.isNot(AsmToken::Comma)) {
+ if (ExpectEqual)
+ return TokError(
+ "expected comma after first string for 'ifidn' directive");
+ return TokError("expected comma after first string for 'ifdif' directive");
+ }
+ Lex();
+
+ if (parseTextItem(String2)) {
+ if (ExpectEqual)
+ return TokError("expected string parameter for 'ifidn' directive");
+ return TokError("expected string parameter for 'ifdif' directive");
+ }
+
+ TheCondStack.push_back(TheCondState);
+ TheCondState.TheCond = AsmCond::IfCond;
+ if (CaseInsensitive)
+ TheCondState.CondMet =
+ ExpectEqual == (StringRef(String1).equals_lower(String2));
+ else
+ TheCondState.CondMet = ExpectEqual == (String1 == String2);
+ TheCondState.Ignore = !TheCondState.CondMet;
+
+ return false;
+}
+
+/// parseDirectiveIfdef
+/// ::= ifdef symbol
+/// | ifdef variable
+bool MasmParser::parseDirectiveIfdef(SMLoc DirectiveLoc, bool expect_defined) {
+ TheCondStack.push_back(TheCondState);
+ TheCondState.TheCond = AsmCond::IfCond;
+
+ if (TheCondState.Ignore) {
+ eatToEndOfStatement();
+ } else {
+ bool is_defined = false;
+ unsigned RegNo;
+ SMLoc StartLoc, EndLoc;
+ is_defined = (getTargetParser().tryParseRegister(
+ RegNo, StartLoc, EndLoc) == MatchOperand_Success);
+ if (!is_defined) {
+ StringRef Name;
+ if (check(parseIdentifier(Name), "expected identifier after 'ifdef'") ||
+ parseToken(AsmToken::EndOfStatement, "unexpected token in 'ifdef'"))
+ return true;
+
+ if (Variables.find(Name) != Variables.end()) {
+ is_defined = true;
+ } else {
+ MCSymbol *Sym = getContext().lookupSymbol(Name);
+ is_defined = (Sym && !Sym->isUndefined(false));
+ }
+ }
+
+ TheCondState.CondMet = (is_defined == expect_defined);
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveElseIf
+/// ::= elseif expression
+bool MasmParser::parseDirectiveElseIf(SMLoc DirectiveLoc,
+ DirectiveKind DirKind) {
+ if (TheCondState.TheCond != AsmCond::IfCond &&
+ TheCondState.TheCond != AsmCond::ElseIfCond)
+ return Error(DirectiveLoc, "Encountered a .elseif that doesn't follow an"
+ " .if or an .elseif");
+ TheCondState.TheCond = AsmCond::ElseIfCond;
+
+ bool LastIgnoreState = false;
+ if (!TheCondStack.empty())
+ LastIgnoreState = TheCondStack.back().Ignore;
+ if (LastIgnoreState || TheCondState.CondMet) {
+ TheCondState.Ignore = true;
+ eatToEndOfStatement();
+ } else {
+ int64_t ExprValue;
+ if (parseAbsoluteExpression(ExprValue))
+ return true;
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.elseif' directive"))
+ return true;
+
+ switch (DirKind) {
+ default:
+ llvm_unreachable("unsupported directive");
+ case DK_ELSEIF:
+ break;
+ case DK_ELSEIFE:
+ ExprValue = ExprValue == 0;
+ break;
+ }
+
+ TheCondState.CondMet = ExprValue;
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveElseIfb
+/// ::= elseifb expression
+bool MasmParser::parseDirectiveElseIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+ if (TheCondState.TheCond != AsmCond::IfCond &&
+ TheCondState.TheCond != AsmCond::ElseIfCond)
+ return Error(DirectiveLoc, "Encountered an elseif that doesn't follow an"
+ " if or an elseif");
+ TheCondState.TheCond = AsmCond::ElseIfCond;
+
+ bool LastIgnoreState = false;
+ if (!TheCondStack.empty())
+ LastIgnoreState = TheCondStack.back().Ignore;
+ if (LastIgnoreState || TheCondState.CondMet) {
+ TheCondState.Ignore = true;
+ eatToEndOfStatement();
+ } else {
+ std::string Str;
+ if (parseTextItem(Str))
+ return TokError("expected string parameter for 'elseifb' directive");
+
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in 'elseifb' directive"))
+ return true;
+
+ TheCondState.CondMet = ExpectBlank == Str.empty();
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveElseIfdef
+/// ::= elseifdef symbol
+/// | elseifdef variable
+bool MasmParser::parseDirectiveElseIfdef(SMLoc DirectiveLoc,
+ bool expect_defined) {
+ if (TheCondState.TheCond != AsmCond::IfCond &&
+ TheCondState.TheCond != AsmCond::ElseIfCond)
+ return Error(DirectiveLoc, "Encountered an elseif that doesn't follow an"
+ " if or an elseif");
+ TheCondState.TheCond = AsmCond::ElseIfCond;
+
+ bool LastIgnoreState = false;
+ if (!TheCondStack.empty())
+ LastIgnoreState = TheCondStack.back().Ignore;
+ if (LastIgnoreState || TheCondState.CondMet) {
+ TheCondState.Ignore = true;
+ eatToEndOfStatement();
+ } else {
+ bool is_defined = false;
+ unsigned RegNo;
+ SMLoc StartLoc, EndLoc;
+ is_defined = (getTargetParser().tryParseRegister(RegNo, StartLoc, EndLoc) ==
+ MatchOperand_Success);
+ if (!is_defined) {
+ StringRef Name;
+ if (check(parseIdentifier(Name),
+ "expected identifier after 'elseifdef'") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in 'elseifdef'"))
+ return true;
+
+ if (Variables.find(Name) != Variables.end()) {
+ is_defined = true;
+ } else {
+ MCSymbol *Sym = getContext().lookupSymbol(Name);
+ is_defined = (Sym && !Sym->isUndefined(false));
+ }
+ }
+
+ TheCondState.CondMet = (is_defined == expect_defined);
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveElseIfidn
+/// ::= elseifidn string1, string2
+bool MasmParser::parseDirectiveElseIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
+ bool CaseInsensitive) {
+ if (TheCondState.TheCond != AsmCond::IfCond &&
+ TheCondState.TheCond != AsmCond::ElseIfCond)
+ return Error(DirectiveLoc, "Encountered an elseif that doesn't follow an"
+ " if or an elseif");
+ TheCondState.TheCond = AsmCond::ElseIfCond;
+
+ bool LastIgnoreState = false;
+ if (!TheCondStack.empty())
+ LastIgnoreState = TheCondStack.back().Ignore;
+ if (LastIgnoreState || TheCondState.CondMet) {
+ TheCondState.Ignore = true;
+ eatToEndOfStatement();
+ } else {
+ std::string String1, String2;
+
+ if (parseTextItem(String1)) {
+ if (ExpectEqual)
+ return TokError("expected string parameter for 'elseifidn' directive");
+ return TokError("expected string parameter for 'elseifdif' directive");
+ }
+
+ if (Lexer.isNot(AsmToken::Comma)) {
+ if (ExpectEqual)
+ return TokError(
+ "expected comma after first string for 'elseifidn' directive");
+ return TokError(
+ "expected comma after first string for 'elseifdif' directive");
+ }
+ Lex();
+
+ if (parseTextItem(String2)) {
+ if (ExpectEqual)
+ return TokError("expected string parameter for 'elseifidn' directive");
+ return TokError("expected string parameter for 'elseifdif' directive");
+ }
+
+ if (CaseInsensitive)
+ TheCondState.CondMet =
+ ExpectEqual == (StringRef(String1).equals_lower(String2));
+ else
+ TheCondState.CondMet = ExpectEqual == (String1 == String2);
+ TheCondState.Ignore = !TheCondState.CondMet;
+ }
+
+ return false;
+}
+
+/// parseDirectiveElse
+/// ::= else
+bool MasmParser::parseDirectiveElse(SMLoc DirectiveLoc) {
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in 'else' directive"))
+ return true;
+
+ if (TheCondState.TheCond != AsmCond::IfCond &&
+ TheCondState.TheCond != AsmCond::ElseIfCond)
+ return Error(DirectiveLoc, "Encountered an else that doesn't follow an if"
+ " or an elseif");
+ TheCondState.TheCond = AsmCond::ElseCond;
+ bool LastIgnoreState = false;
+ if (!TheCondStack.empty())
+ LastIgnoreState = TheCondStack.back().Ignore;
+  TheCondState.Ignore = LastIgnoreState || TheCondState.CondMet;
+
+ return false;
+}
+
+/// parseDirectiveEnd
+/// ::= end
+bool MasmParser::parseDirectiveEnd(SMLoc DirectiveLoc) {
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in 'end' directive"))
+ return true;
+
+ while (Lexer.isNot(AsmToken::Eof))
+ Lexer.Lex();
+
+ return false;
+}
+
+/// parseDirectiveError
+/// ::= .err [message]
+bool MasmParser::parseDirectiveError(SMLoc DirectiveLoc) {
+ if (!TheCondStack.empty()) {
+ if (TheCondStack.back().Ignore) {
+ eatToEndOfStatement();
+ return false;
+ }
+ }
+
+ StringRef Message = ".err directive invoked in source file";
+ if (Lexer.isNot(AsmToken::EndOfStatement))
+ Message = parseStringToEndOfStatement();
+ Lex();
+
+ return Error(DirectiveLoc, Message);
+}
+
+/// parseDirectiveErrorIfb
+/// ::= .errb textitem[, message]
+bool MasmParser::parseDirectiveErrorIfb(SMLoc DirectiveLoc, bool ExpectBlank) {
+ if (!TheCondStack.empty()) {
+ if (TheCondStack.back().Ignore) {
+ eatToEndOfStatement();
+ return false;
+ }
+ }
+
+ std::string Text;
+ if (parseTextItem(Text))
+ return Error(getTok().getLoc(), "missing text item in '.errb' directive");
+
+ StringRef Message = ".errb directive invoked in source file";
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ if (parseToken(AsmToken::Comma))
+ return addErrorSuffix(" in '.errb' directive");
+ Message = parseStringToEndOfStatement();
+ }
+ Lex();
+
+ if (Text.empty() == ExpectBlank)
+ return Error(DirectiveLoc, Message);
+ return false;
+}
+
+/// parseDirectiveErrorIfdef
+/// ::= .errdef name[, message]
+bool MasmParser::parseDirectiveErrorIfdef(SMLoc DirectiveLoc,
+ bool ExpectDefined) {
+ if (!TheCondStack.empty()) {
+ if (TheCondStack.back().Ignore) {
+ eatToEndOfStatement();
+ return false;
+ }
+ }
+
+ bool IsDefined = false;
+ unsigned RegNo;
+ SMLoc StartLoc, EndLoc;
+ IsDefined = (getTargetParser().tryParseRegister(RegNo, StartLoc, EndLoc) ==
+ MatchOperand_Success);
+ if (!IsDefined) {
+ StringRef Name;
+ if (check(parseIdentifier(Name), "expected identifier after '.errdef'"))
+ return true;
+
+ if (Variables.find(Name) != Variables.end()) {
+ IsDefined = true;
+ } else {
+ MCSymbol *Sym = getContext().lookupSymbol(Name);
+ IsDefined = (Sym && !Sym->isUndefined(false));
+ }
+ }
+
+ StringRef Message = ".errdef directive invoked in source file";
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ if (parseToken(AsmToken::Comma))
+ return addErrorSuffix(" in '.errdef' directive");
+ Message = parseStringToEndOfStatement();
+ }
+ Lex();
+
+ if (IsDefined == ExpectDefined)
+ return Error(DirectiveLoc, Message);
+ return false;
+}
+
+/// parseDirectiveErrorIfidn
+/// ::= .erridn textitem1, textitem2[, message]
+bool MasmParser::parseDirectiveErrorIfidn(SMLoc DirectiveLoc, bool ExpectEqual,
+ bool CaseInsensitive) {
+ if (!TheCondStack.empty()) {
+ if (TheCondStack.back().Ignore) {
+ eatToEndOfStatement();
+ return false;
+ }
+ }
+
+ std::string String1, String2;
+
+ if (parseTextItem(String1)) {
+ if (ExpectEqual)
+ return TokError("expected string parameter for '.erridn' directive");
+ return TokError("expected string parameter for '.errdif' directive");
+ }
+
+ if (Lexer.isNot(AsmToken::Comma)) {
+ if (ExpectEqual)
+ return TokError(
+ "expected comma after first string for '.erridn' directive");
+ return TokError(
+ "expected comma after first string for '.errdif' directive");
+ }
+ Lex();
+
+ if (parseTextItem(String2)) {
+ if (ExpectEqual)
+ return TokError("expected string parameter for '.erridn' directive");
+ return TokError("expected string parameter for '.errdif' directive");
+ }
+
+ StringRef Message;
+ if (ExpectEqual)
+ Message = ".erridn directive invoked in source file";
+ else
+ Message = ".errdif directive invoked in source file";
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ if (parseToken(AsmToken::Comma))
+ return addErrorSuffix(" in '.erridn' directive");
+ Message = parseStringToEndOfStatement();
+ }
+ Lex();
+
+  if (CaseInsensitive ? ExpectEqual == StringRef(String1).equals_lower(String2)
+                      : ExpectEqual == (String1 == String2))
+    return Error(DirectiveLoc, Message);
+ return false;
+}
+
+/// parseDirectiveErrorIfe
+/// ::= .erre expression[, message]
+bool MasmParser::parseDirectiveErrorIfe(SMLoc DirectiveLoc, bool ExpectZero) {
+ if (!TheCondStack.empty()) {
+ if (TheCondStack.back().Ignore) {
+ eatToEndOfStatement();
+ return false;
+ }
+ }
+
+ int64_t ExprValue;
+ if (parseAbsoluteExpression(ExprValue))
+ return addErrorSuffix(" in '.erre' directive");
+
+ StringRef Message = ".erre directive invoked in source file";
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ if (parseToken(AsmToken::Comma))
+ return addErrorSuffix(" in '.erre' directive");
+ Message = parseStringToEndOfStatement();
+ }
+ Lex();
+
+ if ((ExprValue == 0) == ExpectZero)
+ return Error(DirectiveLoc, Message);
+ return false;
+}
+
+/// parseDirectiveEndIf
+/// ::= .endif
+bool MasmParser::parseDirectiveEndIf(SMLoc DirectiveLoc) {
+ if (parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.endif' directive"))
+ return true;
+
+ if ((TheCondState.TheCond == AsmCond::NoCond) || TheCondStack.empty())
+ return Error(DirectiveLoc, "Encountered a .endif that doesn't follow "
+ "an .if or .else");
+ if (!TheCondStack.empty()) {
+ TheCondState = TheCondStack.back();
+ TheCondStack.pop_back();
+ }
+
+ return false;
+}
+
+void MasmParser::initializeDirectiveKindMap() {
+ DirectiveKindMap["="] = DK_ASSIGN;
+ DirectiveKindMap["equ"] = DK_EQU;
+ DirectiveKindMap["textequ"] = DK_TEXTEQU;
+ // DirectiveKindMap[".ascii"] = DK_ASCII;
+ // DirectiveKindMap[".asciz"] = DK_ASCIZ;
+ // DirectiveKindMap[".string"] = DK_STRING;
+ DirectiveKindMap["byte"] = DK_BYTE;
+ DirectiveKindMap["sbyte"] = DK_SBYTE;
+ DirectiveKindMap["word"] = DK_WORD;
+ DirectiveKindMap["sword"] = DK_SWORD;
+ DirectiveKindMap["dword"] = DK_DWORD;
+ DirectiveKindMap["sdword"] = DK_SDWORD;
+ DirectiveKindMap["fword"] = DK_FWORD;
+ DirectiveKindMap["qword"] = DK_QWORD;
+ DirectiveKindMap["sqword"] = DK_SQWORD;
+ DirectiveKindMap["real4"] = DK_REAL4;
+ DirectiveKindMap["real8"] = DK_REAL8;
+ DirectiveKindMap["align"] = DK_ALIGN;
+ // DirectiveKindMap[".org"] = DK_ORG;
+ DirectiveKindMap["extern"] = DK_EXTERN;
+ DirectiveKindMap["public"] = DK_PUBLIC;
+ // DirectiveKindMap[".comm"] = DK_COMM;
+ DirectiveKindMap["comment"] = DK_COMMENT;
+ DirectiveKindMap["include"] = DK_INCLUDE;
+ // DirectiveKindMap[".rept"] = DK_REPT;
+ // DirectiveKindMap[".rep"] = DK_REPT;
+ // DirectiveKindMap[".irp"] = DK_IRP;
+ // DirectiveKindMap[".irpc"] = DK_IRPC;
+ // DirectiveKindMap[".endr"] = DK_ENDR;
+ DirectiveKindMap["if"] = DK_IF;
+ DirectiveKindMap["ife"] = DK_IFE;
+ DirectiveKindMap["ifb"] = DK_IFB;
+ DirectiveKindMap["ifnb"] = DK_IFNB;
+ DirectiveKindMap["ifdef"] = DK_IFDEF;
+ DirectiveKindMap["ifndef"] = DK_IFNDEF;
+ DirectiveKindMap["ifdif"] = DK_IFDIF;
+ DirectiveKindMap["ifdifi"] = DK_IFDIFI;
+ DirectiveKindMap["ifidn"] = DK_IFIDN;
+ DirectiveKindMap["ifidni"] = DK_IFIDNI;
+ DirectiveKindMap["elseif"] = DK_ELSEIF;
+ DirectiveKindMap["elseifdef"] = DK_ELSEIFDEF;
+ DirectiveKindMap["elseifndef"] = DK_ELSEIFNDEF;
+ DirectiveKindMap["elseifdif"] = DK_ELSEIFDIF;
+ DirectiveKindMap["elseifidn"] = DK_ELSEIFIDN;
+ DirectiveKindMap["else"] = DK_ELSE;
+ DirectiveKindMap["end"] = DK_END;
+ DirectiveKindMap["endif"] = DK_ENDIF;
+ // DirectiveKindMap[".file"] = DK_FILE;
+ // DirectiveKindMap[".line"] = DK_LINE;
+ // DirectiveKindMap[".loc"] = DK_LOC;
+ // DirectiveKindMap[".stabs"] = DK_STABS;
+ // DirectiveKindMap[".cv_file"] = DK_CV_FILE;
+ // DirectiveKindMap[".cv_func_id"] = DK_CV_FUNC_ID;
+ // DirectiveKindMap[".cv_loc"] = DK_CV_LOC;
+ // DirectiveKindMap[".cv_linetable"] = DK_CV_LINETABLE;
+ // DirectiveKindMap[".cv_inline_linetable"] = DK_CV_INLINE_LINETABLE;
+ // DirectiveKindMap[".cv_inline_site_id"] = DK_CV_INLINE_SITE_ID;
+ // DirectiveKindMap[".cv_def_range"] = DK_CV_DEF_RANGE;
+ // DirectiveKindMap[".cv_string"] = DK_CV_STRING;
+ // DirectiveKindMap[".cv_stringtable"] = DK_CV_STRINGTABLE;
+ // DirectiveKindMap[".cv_filechecksums"] = DK_CV_FILECHECKSUMS;
+ // DirectiveKindMap[".cv_filechecksumoffset"] = DK_CV_FILECHECKSUM_OFFSET;
+ // DirectiveKindMap[".cv_fpo_data"] = DK_CV_FPO_DATA;
+ // DirectiveKindMap[".cfi_sections"] = DK_CFI_SECTIONS;
+ // DirectiveKindMap[".cfi_startproc"] = DK_CFI_STARTPROC;
+ // DirectiveKindMap[".cfi_endproc"] = DK_CFI_ENDPROC;
+ // DirectiveKindMap[".cfi_def_cfa"] = DK_CFI_DEF_CFA;
+ // DirectiveKindMap[".cfi_def_cfa_offset"] = DK_CFI_DEF_CFA_OFFSET;
+ // DirectiveKindMap[".cfi_adjust_cfa_offset"] = DK_CFI_ADJUST_CFA_OFFSET;
+ // DirectiveKindMap[".cfi_def_cfa_register"] = DK_CFI_DEF_CFA_REGISTER;
+ // DirectiveKindMap[".cfi_offset"] = DK_CFI_OFFSET;
+ // DirectiveKindMap[".cfi_rel_offset"] = DK_CFI_REL_OFFSET;
+ // DirectiveKindMap[".cfi_personality"] = DK_CFI_PERSONALITY;
+ // DirectiveKindMap[".cfi_lsda"] = DK_CFI_LSDA;
+ // DirectiveKindMap[".cfi_remember_state"] = DK_CFI_REMEMBER_STATE;
+ // DirectiveKindMap[".cfi_restore_state"] = DK_CFI_RESTORE_STATE;
+ // DirectiveKindMap[".cfi_same_value"] = DK_CFI_SAME_VALUE;
+ // DirectiveKindMap[".cfi_restore"] = DK_CFI_RESTORE;
+ // DirectiveKindMap[".cfi_escape"] = DK_CFI_ESCAPE;
+ // DirectiveKindMap[".cfi_return_column"] = DK_CFI_RETURN_COLUMN;
+ // DirectiveKindMap[".cfi_signal_frame"] = DK_CFI_SIGNAL_FRAME;
+ // DirectiveKindMap[".cfi_undefined"] = DK_CFI_UNDEFINED;
+ // DirectiveKindMap[".cfi_register"] = DK_CFI_REGISTER;
+ // DirectiveKindMap[".cfi_window_save"] = DK_CFI_WINDOW_SAVE;
+ // DirectiveKindMap[".cfi_b_key_frame"] = DK_CFI_B_KEY_FRAME;
+ // DirectiveKindMap[".macro"] = DK_MACRO;
+ // DirectiveKindMap[".exitm"] = DK_EXITM;
+ // DirectiveKindMap[".endm"] = DK_ENDM;
+ // DirectiveKindMap[".purgem"] = DK_PURGEM;
+ DirectiveKindMap[".err"] = DK_ERR;
+ DirectiveKindMap[".errb"] = DK_ERRB;
+ DirectiveKindMap[".errnb"] = DK_ERRNB;
+ DirectiveKindMap[".errdef"] = DK_ERRDEF;
+ DirectiveKindMap[".errndef"] = DK_ERRNDEF;
+ DirectiveKindMap[".errdif"] = DK_ERRDIF;
+ DirectiveKindMap[".errdifi"] = DK_ERRDIFI;
+ DirectiveKindMap[".erridn"] = DK_ERRIDN;
+ DirectiveKindMap[".erridni"] = DK_ERRIDNI;
+ DirectiveKindMap[".erre"] = DK_ERRE;
+ DirectiveKindMap[".errnz"] = DK_ERRNZ;
+ // DirectiveKindMap[".altmacro"] = DK_ALTMACRO;
+ // DirectiveKindMap[".noaltmacro"] = DK_NOALTMACRO;
+ DirectiveKindMap["db"] = DK_DB;
+ DirectiveKindMap["dd"] = DK_DD;
+ DirectiveKindMap["dq"] = DK_DQ;
+ DirectiveKindMap["dw"] = DK_DW;
+ DirectiveKindMap["echo"] = DK_ECHO;
+ DirectiveKindMap["struc"] = DK_STRUCT;
+ DirectiveKindMap["struct"] = DK_STRUCT;
+ DirectiveKindMap["union"] = DK_UNION;
+ DirectiveKindMap["ends"] = DK_ENDS;
+}
+
+MCAsmMacro *MasmParser::parseMacroLikeBody(SMLoc DirectiveLoc) {
+ AsmToken EndToken, StartToken = getTok();
+
+ unsigned NestLevel = 0;
+ while (true) {
+ // Check whether we have reached the end of the file.
+ if (getLexer().is(AsmToken::Eof)) {
+ printError(DirectiveLoc, "no matching '.endr' in definition");
+ return nullptr;
+ }
+
+ if (Lexer.is(AsmToken::Identifier) &&
+ (getTok().getIdentifier() == ".rep" ||
+ getTok().getIdentifier() == ".rept" ||
+ getTok().getIdentifier() == ".irp" ||
+ getTok().getIdentifier() == ".irpc")) {
+ ++NestLevel;
+ }
+
+ // Otherwise, check whether we have reached the .endr.
+ if (Lexer.is(AsmToken::Identifier) && getTok().getIdentifier() == ".endr") {
+ if (NestLevel == 0) {
+ EndToken = getTok();
+ Lex();
+ if (Lexer.isNot(AsmToken::EndOfStatement)) {
+ printError(getTok().getLoc(),
+ "unexpected token in '.endr' directive");
+ return nullptr;
+ }
+ break;
+ }
+ --NestLevel;
+ }
+
+ // Otherwise, scan till the end of the statement.
+ eatToEndOfStatement();
+ }
+
+ const char *BodyStart = StartToken.getLoc().getPointer();
+ const char *BodyEnd = EndToken.getLoc().getPointer();
+ StringRef Body = StringRef(BodyStart, BodyEnd - BodyStart);
+
+ // We Are Anonymous.
+ MacroLikeBodies.emplace_back(StringRef(), Body, MCAsmMacroParameters());
+ return &MacroLikeBodies.back();
+}
+
+void MasmParser::instantiateMacroLikeBody(MCAsmMacro *M, SMLoc DirectiveLoc,
+ raw_svector_ostream &OS) {
+ OS << ".endr\n";
+
+ std::unique_ptr<MemoryBuffer> Instantiation =
+ MemoryBuffer::getMemBufferCopy(OS.str(), "<instantiation>");
+
+ // Create the macro instantiation object and add to the current macro
+ // instantiation stack.
+ MacroInstantiation *MI = new MacroInstantiation{
+ DirectiveLoc, CurBuffer, getTok().getLoc(), TheCondStack.size()};
+ ActiveMacros.push_back(MI);
+
+ // Jump to the macro instantiation and prime the lexer.
+ CurBuffer = SrcMgr.AddNewSourceBuffer(std::move(Instantiation), SMLoc());
+ Lexer.setBuffer(SrcMgr.getMemoryBuffer(CurBuffer)->getBuffer());
+ Lex();
+}
+
+/// parseDirectiveRept
+/// ::= .rep | .rept count
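+/// The body between this directive and the matching ".endr" is expanded
+/// "count" times; for example (illustrative), a count of 3 over a one-byte
+/// body emits three bytes.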
+bool MasmParser::parseDirectiveRept(SMLoc DirectiveLoc, StringRef Dir) {
+ const MCExpr *CountExpr;
+ SMLoc CountLoc = getTok().getLoc();
+ if (parseExpression(CountExpr))
+ return true;
+
+ int64_t Count;
+ if (!CountExpr->evaluateAsAbsolute(Count, getStreamer().getAssemblerPtr())) {
+ return Error(CountLoc, "unexpected token in '" + Dir + "' directive");
+ }
+
+ if (check(Count < 0, CountLoc, "Count is negative") ||
+ parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '" + Dir + "' directive"))
+ return true;
+
+ // Lex the rept definition.
+ MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
+ if (!M)
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ raw_svector_ostream OS(Buf);
+ while (Count--) {
+ // Note that the AtPseudoVariable is disabled for instantiations of .rep(t).
+ if (expandMacro(OS, M->Body, None, None, false, getTok().getLoc()))
+ return true;
+ }
+ instantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+ return false;
+}
+
+/// parseDirectiveIrp
+/// ::= .irp symbol,values
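+/// The body between this directive and the matching ".endr" is expanded once
+/// per listed value, with "\symbol" replaced by that value. Illustrative use
+/// (the instruction is arbitrary):
+///   .irp reg, eax, ebx, ecx
+///   push \reg
+///   .endr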
+bool MasmParser::parseDirectiveIrp(SMLoc DirectiveLoc) {
+ MCAsmMacroParameter Parameter;
+ MCAsmMacroArguments A;
+ if (check(parseIdentifier(Parameter.Name),
+ "expected identifier in '.irp' directive") ||
+ parseToken(AsmToken::Comma, "expected comma in '.irp' directive") ||
+ parseMacroArguments(nullptr, A) ||
+ parseToken(AsmToken::EndOfStatement, "expected End of Statement"))
+ return true;
+
+ // Lex the irp definition.
+ MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
+ if (!M)
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ raw_svector_ostream OS(Buf);
+
+ for (const MCAsmMacroArgument &Arg : A) {
+ // Note that the AtPseudoVariable is enabled for instantiations of .irp.
+ // This is undocumented, but GAS seems to support it.
+ if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc()))
+ return true;
+ }
+
+ instantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+ return false;
+}
+
+/// parseDirectiveIrpc
+/// ::= .irpc symbol,values
+bool MasmParser::parseDirectiveIrpc(SMLoc DirectiveLoc) {
+ MCAsmMacroParameter Parameter;
+ MCAsmMacroArguments A;
+
+ if (check(parseIdentifier(Parameter.Name),
+ "expected identifier in '.irpc' directive") ||
+ parseToken(AsmToken::Comma, "expected comma in '.irpc' directive") ||
+ parseMacroArguments(nullptr, A))
+ return true;
+
+ if (A.size() != 1 || A.front().size() != 1)
+ return TokError("unexpected token in '.irpc' directive");
+
+ // Eat the end of statement.
+ if (parseToken(AsmToken::EndOfStatement, "expected end of statement"))
+ return true;
+
+ // Lex the irpc definition.
+ MCAsmMacro *M = parseMacroLikeBody(DirectiveLoc);
+ if (!M)
+ return true;
+
+ // Macro instantiation is lexical, unfortunately. We construct a new buffer
+ // to hold the macro body with substitutions.
+ SmallString<256> Buf;
+ raw_svector_ostream OS(Buf);
+
+ StringRef Values = A.front().front().getString();
+ for (std::size_t I = 0, End = Values.size(); I != End; ++I) {
+ MCAsmMacroArgument Arg;
+ Arg.emplace_back(AsmToken::Identifier, Values.slice(I, I + 1));
+
+ // Note that the AtPseudoVariable is enabled for instantiations of .irpc.
+ // This is undocumented, but GAS seems to support it.
+ if (expandMacro(OS, M->Body, Parameter, Arg, true, getTok().getLoc()))
+ return true;
+ }
+
+ instantiateMacroLikeBody(M, DirectiveLoc, OS);
+
+ return false;
+}
+
+bool MasmParser::parseDirectiveEndr(SMLoc DirectiveLoc) {
+ if (ActiveMacros.empty())
+ return TokError("unmatched '.endr' directive");
+
+  // The only .endr directives that should get here are the ones created by
+  // instantiateMacroLikeBody.
+ assert(getLexer().is(AsmToken::EndOfStatement));
+
+ handleMacroExit();
+ return false;
+}
+
+bool MasmParser::parseDirectiveMSEmit(SMLoc IDLoc, ParseStatementInfo &Info,
+ size_t Len) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = getLexer().getLoc();
+ if (parseExpression(Value))
+ return true;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(ExprLoc, "unexpected expression in _emit");
+ uint64_t IntValue = MCE->getValue();
+ if (!isUInt<8>(IntValue) && !isInt<8>(IntValue))
+ return Error(ExprLoc, "literal value out of range for directive");
+
+ Info.AsmRewrites->emplace_back(AOK_Emit, IDLoc, Len);
+ return false;
+}
+
+bool MasmParser::parseDirectiveMSAlign(SMLoc IDLoc, ParseStatementInfo &Info) {
+ const MCExpr *Value;
+ SMLoc ExprLoc = getLexer().getLoc();
+ if (parseExpression(Value))
+ return true;
+ const MCConstantExpr *MCE = dyn_cast<MCConstantExpr>(Value);
+ if (!MCE)
+ return Error(ExprLoc, "unexpected expression in align");
+ uint64_t IntValue = MCE->getValue();
+ if (!isPowerOf2_64(IntValue))
+ return Error(ExprLoc, "literal value not a power of two greater then zero");
+
+ Info.AsmRewrites->emplace_back(AOK_Align, IDLoc, 5, Log2_64(IntValue));
+ return false;
+}
+
+bool MasmParser::parseDirectiveEcho() {
+ StringRef Message = parseStringToEndOfStatement();
+ Lex(); // eat end of statement
+ llvm::outs() << Message << '\n';
+ return false;
+}
+
+// We are comparing pointers, but the pointers are relative to a single string.
+// Thus, this should always be deterministic.
+static int rewritesSort(const AsmRewrite *AsmRewriteA,
+ const AsmRewrite *AsmRewriteB) {
+ if (AsmRewriteA->Loc.getPointer() < AsmRewriteB->Loc.getPointer())
+ return -1;
+ if (AsmRewriteB->Loc.getPointer() < AsmRewriteA->Loc.getPointer())
+ return 1;
+
+ // It's possible to have a SizeDirective, Imm/ImmPrefix and an Input/Output
+ // rewrite to the same location. Make sure the SizeDirective rewrite is
+ // performed first, then the Imm/ImmPrefix and finally the Input/Output. This
+ // ensures the sort algorithm is stable.
+ if (AsmRewritePrecedence[AsmRewriteA->Kind] >
+ AsmRewritePrecedence[AsmRewriteB->Kind])
+ return -1;
+
+ if (AsmRewritePrecedence[AsmRewriteA->Kind] <
+ AsmRewritePrecedence[AsmRewriteB->Kind])
+ return 1;
+ llvm_unreachable("Unstable rewrite sort.");
+}
+
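+/// lookUpField
+/// Resolve a dotted field reference such as "base.member.submember" against
+/// the known variable types and STRUCT definitions, accumulating the byte
+/// offset of the named member; returns false on success. Illustrative example
+/// with hypothetical types: if "outer" is a STRUCT whose member "pt" (of a
+/// nested STRUCT type) sits at offset 4, and that nested type has a member
+/// "x" at offset 2, then looking up "outer.pt.x" adds 6 to Offset.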
+bool MasmParser::lookUpField(StringRef Name, StringRef &Type,
+ unsigned &Offset) const {
+ const std::pair<StringRef, StringRef> BaseMember = Name.split('.');
+ const StringRef Base = BaseMember.first, Member = BaseMember.second;
+ return lookUpField(Base, Member, Type, Offset);
+}
+
+bool MasmParser::lookUpField(StringRef Base, StringRef Member, StringRef &Type,
+ unsigned &Offset) const {
+ if (Base.empty())
+ return true;
+
+ unsigned BaseOffset = 0;
+ if (Base.contains('.') && !lookUpField(Base, Type, BaseOffset))
+ Base = Type;
+
+ auto TypeIt = KnownType.find(Base);
+ if (TypeIt != KnownType.end())
+ return lookUpField(*TypeIt->second, Member, Type, Offset);
+
+ auto StructIt = Structs.find(Base.lower());
+ if (StructIt != Structs.end())
+ return lookUpField(StructIt->second, Member, Type, Offset);
+
+ return true;
+}
+
+bool MasmParser::lookUpField(const StructInfo &Structure, StringRef Member,
+ StringRef &Type, unsigned &Offset) const {
+ if (Member.empty()) {
+ Type = Structure.Name;
+ return false;
+ }
+
+ std::pair<StringRef, StringRef> Split = Member.split('.');
+ const StringRef FieldName = Split.first, FieldMember = Split.second;
+
+ auto StructIt = Structs.find(FieldName.lower());
+ if (StructIt != Structs.end())
+ return lookUpField(StructIt->second, FieldMember, Type, Offset);
+
+ auto FieldIt = Structure.FieldsByName.find(FieldName.lower());
+ if (FieldIt == Structure.FieldsByName.end())
+ return true;
+
+ const FieldInfo &Field = Structure.Fields[FieldIt->second];
+ if (FieldMember.empty()) {
+ Offset += Field.Offset;
+ if (Field.Contents.FT == FT_STRUCT)
+ Type = Field.Contents.StructInfo.Structure.Name;
+ return false;
+ }
+
+ if (Field.Contents.FT != FT_STRUCT)
+ return true;
+ const StructFieldInfo &StructInfo = Field.Contents.StructInfo;
+
+ bool Result = lookUpField(StructInfo.Structure, FieldMember, Type, Offset);
+ if (Result)
+ return true;
+
+ Offset += Field.Offset;
+ return false;
+}
+
+bool MasmParser::parseMSInlineAsm(
+ void *AsmLoc, std::string &AsmString, unsigned &NumOutputs,
+ unsigned &NumInputs, SmallVectorImpl<std::pair<void *, bool>> &OpDecls,
+ SmallVectorImpl<std::string> &Constraints,
+ SmallVectorImpl<std::string> &Clobbers, const MCInstrInfo *MII,
+ const MCInstPrinter *IP, MCAsmParserSemaCallback &SI) {
+ SmallVector<void *, 4> InputDecls;
+ SmallVector<void *, 4> OutputDecls;
+ SmallVector<bool, 4> InputDeclsAddressOf;
+ SmallVector<bool, 4> OutputDeclsAddressOf;
+ SmallVector<std::string, 4> InputConstraints;
+ SmallVector<std::string, 4> OutputConstraints;
+ SmallVector<unsigned, 4> ClobberRegs;
+
+ SmallVector<AsmRewrite, 4> AsmStrRewrites;
+
+ // Prime the lexer.
+ Lex();
+
+ // While we have input, parse each statement.
+ unsigned InputIdx = 0;
+ unsigned OutputIdx = 0;
+ while (getLexer().isNot(AsmToken::Eof)) {
+ // Parse curly braces marking block start/end.
+ if (parseCurlyBlockScope(AsmStrRewrites))
+ continue;
+
+ ParseStatementInfo Info(&AsmStrRewrites);
+ bool StatementErr = parseStatement(Info, &SI);
+
+ if (StatementErr || Info.ParseError) {
+ // Emit pending errors if any exist.
+ printPendingErrors();
+ return true;
+ }
+
+ // No pending error should exist here.
+ assert(!hasPendingError() && "unexpected error from parseStatement");
+
+ if (Info.Opcode == ~0U)
+ continue;
+
+ const MCInstrDesc &Desc = MII->get(Info.Opcode);
+
+ // Build the list of clobbers, outputs and inputs.
+ for (unsigned i = 1, e = Info.ParsedOperands.size(); i != e; ++i) {
+ MCParsedAsmOperand &Operand = *Info.ParsedOperands[i];
+
+ // Register operand.
+ if (Operand.isReg() && !Operand.needAddressOf() &&
+ !getTargetParser().OmitRegisterFromClobberLists(Operand.getReg())) {
+ unsigned NumDefs = Desc.getNumDefs();
+ // Clobber.
+ if (NumDefs && Operand.getMCOperandNum() < NumDefs)
+ ClobberRegs.push_back(Operand.getReg());
+ continue;
+ }
+
+ // Expr/Input or Output.
+ StringRef SymName = Operand.getSymName();
+ if (SymName.empty())
+ continue;
+
+ void *OpDecl = Operand.getOpDecl();
+ if (!OpDecl)
+ continue;
+
+ StringRef Constraint = Operand.getConstraint();
+ if (Operand.isImm()) {
+ // Offset as immediate.
+ if (Operand.isOffsetOfLocal())
+ Constraint = "r";
+ else
+ Constraint = "i";
+ }
+
+ bool isOutput = (i == 1) && Desc.mayStore();
+ SMLoc Start = SMLoc::getFromPointer(SymName.data());
+ if (isOutput) {
+ ++InputIdx;
+ OutputDecls.push_back(OpDecl);
+ OutputDeclsAddressOf.push_back(Operand.needAddressOf());
+ OutputConstraints.push_back(("=" + Constraint).str());
+ AsmStrRewrites.emplace_back(AOK_Output, Start, SymName.size());
+ } else {
+ InputDecls.push_back(OpDecl);
+ InputDeclsAddressOf.push_back(Operand.needAddressOf());
+ InputConstraints.push_back(Constraint.str());
+ if (Desc.OpInfo[i - 1].isBranchTarget())
+ AsmStrRewrites.emplace_back(AOK_CallInput, Start, SymName.size());
+ else
+ AsmStrRewrites.emplace_back(AOK_Input, Start, SymName.size());
+ }
+ }
+
+ // Consider implicit defs to be clobbers. Think of cpuid and push.
+ ArrayRef<MCPhysReg> ImpDefs(Desc.getImplicitDefs(),
+ Desc.getNumImplicitDefs());
+ ClobberRegs.insert(ClobberRegs.end(), ImpDefs.begin(), ImpDefs.end());
+ }
+
+ // Set the number of Outputs and Inputs.
+ NumOutputs = OutputDecls.size();
+ NumInputs = InputDecls.size();
+
+ // Set the unique clobbers.
+ array_pod_sort(ClobberRegs.begin(), ClobberRegs.end());
+ ClobberRegs.erase(std::unique(ClobberRegs.begin(), ClobberRegs.end()),
+ ClobberRegs.end());
+ Clobbers.assign(ClobberRegs.size(), std::string());
+ for (unsigned I = 0, E = ClobberRegs.size(); I != E; ++I) {
+ raw_string_ostream OS(Clobbers[I]);
+ IP->printRegName(OS, ClobberRegs[I]);
+ }
+
+ // Merge the various outputs and inputs. Outputs are expected first.
+ if (NumOutputs || NumInputs) {
+ unsigned NumExprs = NumOutputs + NumInputs;
+ OpDecls.resize(NumExprs);
+ Constraints.resize(NumExprs);
+ for (unsigned i = 0; i < NumOutputs; ++i) {
+ OpDecls[i] = std::make_pair(OutputDecls[i], OutputDeclsAddressOf[i]);
+ Constraints[i] = OutputConstraints[i];
+ }
+ for (unsigned i = 0, j = NumOutputs; i < NumInputs; ++i, ++j) {
+ OpDecls[j] = std::make_pair(InputDecls[i], InputDeclsAddressOf[i]);
+ Constraints[j] = InputConstraints[i];
+ }
+ }
+
+ // Build the IR assembly string.
+ std::string AsmStringIR;
+ raw_string_ostream OS(AsmStringIR);
+ StringRef ASMString =
+ SrcMgr.getMemoryBuffer(SrcMgr.getMainFileID())->getBuffer();
+ const char *AsmStart = ASMString.begin();
+ const char *AsmEnd = ASMString.end();
+ array_pod_sort(AsmStrRewrites.begin(), AsmStrRewrites.end(), rewritesSort);
+ for (auto it = AsmStrRewrites.begin(); it != AsmStrRewrites.end(); ++it) {
+ const AsmRewrite &AR = *it;
+ // Check if this has already been covered by another rewrite...
+ if (AR.Done)
+ continue;
+ AsmRewriteKind Kind = AR.Kind;
+
+ const char *Loc = AR.Loc.getPointer();
+ assert(Loc >= AsmStart && "Expected Loc to be at or after Start!");
+
+ // Emit everything up to the immediate/expression.
+ if (unsigned Len = Loc - AsmStart)
+ OS << StringRef(AsmStart, Len);
+
+ // Skip the original expression.
+ if (Kind == AOK_Skip) {
+ AsmStart = Loc + AR.Len;
+ continue;
+ }
+
+ unsigned AdditionalSkip = 0;
+ // Rewrite expressions in $N notation.
+ switch (Kind) {
+ default:
+ break;
+ case AOK_IntelExpr:
+ assert(AR.IntelExp.isValid() && "cannot write invalid intel expression");
+ if (AR.IntelExp.NeedBracs)
+ OS << "[";
+ if (AR.IntelExp.hasBaseReg())
+ OS << AR.IntelExp.BaseReg;
+ if (AR.IntelExp.hasIndexReg())
+ OS << (AR.IntelExp.hasBaseReg() ? " + " : "")
+ << AR.IntelExp.IndexReg;
+ if (AR.IntelExp.Scale > 1)
+ OS << " * $$" << AR.IntelExp.Scale;
+ if (AR.IntelExp.hasOffset()) {
+ if (AR.IntelExp.hasRegs())
+ OS << " + ";
+ // Fuse this rewrite with a rewrite of the offset name, if present.
+ StringRef OffsetName = AR.IntelExp.OffsetName;
+ SMLoc OffsetLoc = SMLoc::getFromPointer(AR.IntelExp.OffsetName.data());
+ size_t OffsetLen = OffsetName.size();
+ auto rewrite_it = std::find_if(
+ it, AsmStrRewrites.end(), [&](const AsmRewrite &FusingAR) {
+ return FusingAR.Loc == OffsetLoc && FusingAR.Len == OffsetLen &&
+ (FusingAR.Kind == AOK_Input ||
+ FusingAR.Kind == AOK_CallInput);
+ });
+ if (rewrite_it == AsmStrRewrites.end()) {
+ OS << "offset " << OffsetName;
+ } else if (rewrite_it->Kind == AOK_CallInput) {
+ OS << "${" << InputIdx++ << ":P}";
+ rewrite_it->Done = true;
+ } else {
+ OS << '$' << InputIdx++;
+ rewrite_it->Done = true;
+ }
+ }
+ if (AR.IntelExp.Imm || AR.IntelExp.emitImm())
+ OS << (AR.IntelExp.emitImm() ? "$$" : " + $$") << AR.IntelExp.Imm;
+ if (AR.IntelExp.NeedBracs)
+ OS << "]";
+ break;
+ case AOK_Label:
+ OS << Ctx.getAsmInfo()->getPrivateLabelPrefix() << AR.Label;
+ break;
+ case AOK_Input:
+ OS << '$' << InputIdx++;
+ break;
+ case AOK_CallInput:
+ OS << "${" << InputIdx++ << ":P}";
+ break;
+ case AOK_Output:
+ OS << '$' << OutputIdx++;
+ break;
+ case AOK_SizeDirective:
+ switch (AR.Val) {
+ default: break;
+ case 8: OS << "byte ptr "; break;
+ case 16: OS << "word ptr "; break;
+ case 32: OS << "dword ptr "; break;
+ case 64: OS << "qword ptr "; break;
+ case 80: OS << "xword ptr "; break;
+ case 128: OS << "xmmword ptr "; break;
+ case 256: OS << "ymmword ptr "; break;
+ }
+ break;
+ case AOK_Emit:
+ OS << ".byte";
+ break;
+ case AOK_Align: {
+ // MS alignment directives are measured in bytes. If the native assembler
+ // measures alignment in bytes, we can pass it straight through.
+ OS << ".align";
+ if (getContext().getAsmInfo()->getAlignmentIsInBytes())
+ break;
+
+ // Alignment is in log2 form, so print that instead and skip the original
+ // immediate.
+ unsigned Val = AR.Val;
+ OS << ' ' << Val;
+ assert(Val < 10 && "Expected alignment less than 2^10.");
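+ // Also skip the byte-count operand (a space plus its decimal digits) that
+ // followed the MS-style align directive in the original source.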
+ AdditionalSkip = (Val < 4) ? 2 : Val < 7 ? 3 : 4;
+ break;
+ }
+ case AOK_EVEN:
+ OS << ".even";
+ break;
+ case AOK_EndOfStatement:
+ OS << "\n\t";
+ break;
+ }
+
+ // Skip the original expression.
+ AsmStart = Loc + AR.Len + AdditionalSkip;
+ }
+
+ // Emit the remainder of the asm string.
+ if (AsmStart != AsmEnd)
+ OS << StringRef(AsmStart, AsmEnd - AsmStart);
+
+ AsmString = OS.str();
+ return false;
+}
+
+/// Create an MCAsmParser instance.
+MCAsmParser *llvm::createMCMasmParser(SourceMgr &SM, MCContext &C,
+ MCStreamer &Out, const MCAsmInfo &MAI,
+ unsigned CB) {
+ return new MasmParser(SM, C, Out, MAI, CB);
+}
diff --git a/llvm/lib/MC/MCParser/WasmAsmParser.cpp b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
index 0c242aed706d..05f23e143341 100644
--- a/llvm/lib/MC/MCParser/WasmAsmParser.cpp
+++ b/llvm/lib/MC/MCParser/WasmAsmParser.cpp
@@ -210,7 +210,7 @@ public:
if (getLexer().isNot(AsmToken::EndOfStatement))
return TokError("unexpected token in '.ident' directive");
Lex();
- getStreamer().EmitIdent(Data);
+ getStreamer().emitIdent(Data);
return false;
}
@@ -232,7 +232,7 @@ public:
if (getParser().parseIdentifier(Name))
return TokError("expected identifier in directive");
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- getStreamer().EmitSymbolAttribute(Sym, Attr);
+ getStreamer().emitSymbolAttribute(Sym, Attr);
if (getLexer().is(AsmToken::EndOfStatement))
break;
if (getLexer().isNot(AsmToken::Comma))
diff --git a/llvm/lib/MC/MCSection.cpp b/llvm/lib/MC/MCSection.cpp
index 074534bd73db..ba256102080a 100644
--- a/llvm/lib/MC/MCSection.cpp
+++ b/llvm/lib/MC/MCSection.cpp
@@ -20,9 +20,11 @@
using namespace llvm;
-MCSection::MCSection(SectionVariant V, SectionKind K, MCSymbol *Begin)
+MCSection::MCSection(SectionVariant V, StringRef Name, SectionKind K,
+ MCSymbol *Begin)
: Begin(Begin), BundleGroupBeforeFirstInst(false), HasInstructions(false),
- IsRegistered(false), DummyFragment(this), Variant(V), Kind(K) {}
+ IsRegistered(false), DummyFragment(this), Name(Name), Variant(V),
+ Kind(K) {}
MCSymbol *MCSection::getEndSymbol(MCContext &Ctx) {
if (!End)
@@ -85,7 +87,9 @@ MCSection::getSubsectionInsertionPoint(unsigned Subsection) {
return IP;
}
-void MCSection::addPendingLabel(MCSymbol* label, unsigned Subsection) {
+StringRef MCSection::getVirtualSectionKind() const { return "virtual"; }
+
+void MCSection::addPendingLabel(MCSymbol *label, unsigned Subsection) {
PendingLabels.push_back(PendingLabel(label, Subsection));
}
diff --git a/llvm/lib/MC/MCSectionCOFF.cpp b/llvm/lib/MC/MCSectionCOFF.cpp
index f0c06f70bd73..387bf2c884e5 100644
--- a/llvm/lib/MC/MCSectionCOFF.cpp
+++ b/llvm/lib/MC/MCSectionCOFF.cpp
@@ -38,12 +38,12 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
// standard sections don't require the '.section'
- if (ShouldOmitSectionDirective(SectionName, MAI)) {
- OS << '\t' << getSectionName() << '\n';
+ if (ShouldOmitSectionDirective(getName(), MAI)) {
+ OS << '\t' << getName() << '\n';
return;
}
- OS << "\t.section\t" << getSectionName() << ",\"";
+ OS << "\t.section\t" << getName() << ",\"";
if (getCharacteristics() & COFF::IMAGE_SCN_CNT_INITIALIZED_DATA)
OS << 'd';
if (getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA)
@@ -61,7 +61,7 @@ void MCSectionCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
if (getCharacteristics() & COFF::IMAGE_SCN_MEM_SHARED)
OS << 's';
if ((getCharacteristics() & COFF::IMAGE_SCN_MEM_DISCARDABLE) &&
- !isImplicitlyDiscardable(SectionName))
+ !isImplicitlyDiscardable(getName()))
OS << 'D';
OS << '"';
@@ -111,3 +111,7 @@ bool MCSectionCOFF::UseCodeAlign() const {
bool MCSectionCOFF::isVirtualSection() const {
return getCharacteristics() & COFF::IMAGE_SCN_CNT_UNINITIALIZED_DATA;
}
+
+StringRef MCSectionCOFF::getVirtualSectionKind() const {
+ return "IMAGE_SCN_CNT_UNINITIALIZED_DATA";
+}
diff --git a/llvm/lib/MC/MCSectionELF.cpp b/llvm/lib/MC/MCSectionELF.cpp
index efe504b2024c..77c259c27a04 100644
--- a/llvm/lib/MC/MCSectionELF.cpp
+++ b/llvm/lib/MC/MCSectionELF.cpp
@@ -53,8 +53,8 @@ static void printName(raw_ostream &OS, StringRef Name) {
void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
- if (ShouldOmitSectionDirective(SectionName, MAI)) {
- OS << '\t' << getSectionName();
+ if (ShouldOmitSectionDirective(getName(), MAI)) {
+ OS << '\t' << getName();
if (Subsection) {
OS << '\t';
Subsection->print(OS, &MAI);
@@ -64,7 +64,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
OS << "\t.section\t";
- printName(OS, getSectionName());
+ printName(OS, getName());
// Handle the weird solaris syntax if desired.
if (MAI.usesSunStyleELFSectionSwitchSyntax() &&
@@ -158,7 +158,7 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
OS << "llvm_sympart";
else
report_fatal_error("unsupported type 0x" + Twine::utohexstr(Type) +
- " for section " + getSectionName());
+ " for section " + getName());
if (EntrySize) {
assert(Flags & ELF::SHF_MERGE);
@@ -172,9 +172,9 @@ void MCSectionELF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
if (Flags & ELF::SHF_LINK_ORDER) {
- assert(AssociatedSymbol);
+ assert(LinkedToSym);
OS << ",";
- printName(OS, AssociatedSymbol->getName());
+ printName(OS, LinkedToSym->getName());
}
if (isUnique())
@@ -196,3 +196,5 @@ bool MCSectionELF::UseCodeAlign() const {
bool MCSectionELF::isVirtualSection() const {
return getType() == ELF::SHT_NOBITS;
}
+
+StringRef MCSectionELF::getVirtualSectionKind() const { return "SHT_NOBITS"; }
diff --git a/llvm/lib/MC/MCSectionMachO.cpp b/llvm/lib/MC/MCSectionMachO.cpp
index 0fd89dcbe5fa..21a63ce83330 100644
--- a/llvm/lib/MC/MCSectionMachO.cpp
+++ b/llvm/lib/MC/MCSectionMachO.cpp
@@ -83,7 +83,7 @@ ENTRY("" /*FIXME*/, S_ATTR_LOC_RELOC)
MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
unsigned TAA, unsigned reserved2, SectionKind K,
MCSymbol *Begin)
- : MCSection(SV_MachO, K, Begin), TypeAndAttributes(TAA),
+ : MCSection(SV_MachO, Section, K, Begin), TypeAndAttributes(TAA),
Reserved2(reserved2) {
assert(Segment.size() <= 16 && Section.size() <= 16 &&
"Segment or section string too long");
@@ -92,18 +92,13 @@ MCSectionMachO::MCSectionMachO(StringRef Segment, StringRef Section,
SegmentName[i] = Segment[i];
else
SegmentName[i] = 0;
-
- if (i < Section.size())
- SectionName[i] = Section[i];
- else
- SectionName[i] = 0;
}
}
void MCSectionMachO::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
- OS << "\t.section\t" << getSegmentName() << ',' << getSectionName();
+ OS << "\t.section\t" << getSegmentName() << ',' << getName();
// Get the section type and attributes.
unsigned TAA = getTypeAndAttributes();
diff --git a/llvm/lib/MC/MCSectionWasm.cpp b/llvm/lib/MC/MCSectionWasm.cpp
index 8633c10a73fd..27ed51802a2e 100644
--- a/llvm/lib/MC/MCSectionWasm.cpp
+++ b/llvm/lib/MC/MCSectionWasm.cpp
@@ -10,6 +10,7 @@
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/MCSymbolWasm.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -48,8 +49,8 @@ void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
const MCExpr *Subsection) const {
- if (shouldOmitSectionDirective(SectionName, MAI)) {
- OS << '\t' << getSectionName();
+ if (shouldOmitSectionDirective(getName(), MAI)) {
+ OS << '\t' << getName();
if (Subsection) {
OS << '\t';
Subsection->print(OS, &MAI);
@@ -59,7 +60,7 @@ void MCSectionWasm::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
}
OS << "\t.section\t";
- printName(OS, getSectionName());
+ printName(OS, getName());
OS << ",\"";
if (IsPassive)
diff --git a/llvm/lib/MC/MCSectionXCOFF.cpp b/llvm/lib/MC/MCSectionXCOFF.cpp
index 8377e295532a..1fa495239f74 100644
--- a/llvm/lib/MC/MCSectionXCOFF.cpp
+++ b/llvm/lib/MC/MCSectionXCOFF.cpp
@@ -15,6 +15,10 @@ using namespace llvm;
MCSectionXCOFF::~MCSectionXCOFF() = default;
+void MCSectionXCOFF::printCsectDirective(raw_ostream &OS) const {
+ OS << "\t.csect " << QualName->getName() << "," << Log2_32(getAlignment())
+ << '\n';
+}
void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
raw_ostream &OS,
@@ -23,14 +27,14 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
if (getMappingClass() != XCOFF::XMC_PR)
report_fatal_error("Unhandled storage-mapping class for .text csect");
- OS << "\t.csect " << QualName->getName() << '\n';
+ printCsectDirective(OS);
return;
}
if (getKind().isReadOnly()) {
if (getMappingClass() != XCOFF::XMC_RO)
report_fatal_error("Unhandled storage-mapping class for .rodata csect.");
- OS << "\t.csect " << QualName->getName() << '\n';
+ printCsectDirective(OS);
return;
}
@@ -38,7 +42,7 @@ void MCSectionXCOFF::PrintSwitchToSection(const MCAsmInfo &MAI, const Triple &T,
switch (getMappingClass()) {
case XCOFF::XMC_RW:
case XCOFF::XMC_DS:
- OS << "\t.csect " << QualName->getName() << '\n';
+ printCsectDirective(OS);
break;
case XCOFF::XMC_TC:
break;
diff --git a/llvm/lib/MC/MCStreamer.cpp b/llvm/lib/MC/MCStreamer.cpp
index 0ab883536779..6d3a933c96a3 100644
--- a/llvm/lib/MC/MCStreamer.cpp
+++ b/llvm/lib/MC/MCStreamer.cpp
@@ -63,7 +63,7 @@ void MCTargetStreamer::changeSection(const MCSection *CurSection,
}
void MCTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
- Streamer.EmitRawText(Directive);
+ Streamer.emitRawText(Directive);
}
void MCTargetStreamer::emitValue(const MCExpr *Value) {
@@ -71,7 +71,7 @@ void MCTargetStreamer::emitValue(const MCExpr *Value) {
raw_svector_ostream OS(Str);
Value->print(OS, Streamer.getContext().getAsmInfo());
- Streamer.EmitRawText(OS.str());
+ Streamer.emitRawText(OS.str());
}
void MCTargetStreamer::emitRawBytes(StringRef Data) {
@@ -82,7 +82,7 @@ void MCTargetStreamer::emitRawBytes(StringRef Data) {
raw_svector_ostream OS(Str);
OS << Directive << (unsigned)C;
- Streamer.EmitRawText(OS.str());
+ Streamer.emitRawText(OS.str());
}
}
@@ -128,73 +128,71 @@ void MCStreamer::generateCompactUnwindEncodings(MCAsmBackend *MAB) {
/// EmitIntValue - Special case of EmitValue that avoids the client having to
/// pass in a MCExpr for constant integers.
-void MCStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
+void MCStreamer::emitIntValue(uint64_t Value, unsigned Size) {
assert(1 <= Size && Size <= 8 && "Invalid size");
assert((isUIntN(8 * Size, Value) || isIntN(8 * Size, Value)) &&
"Invalid size");
- char buf[8];
- const bool isLittleEndian = Context.getAsmInfo()->isLittleEndian();
- for (unsigned i = 0; i != Size; ++i) {
- unsigned index = isLittleEndian ? i : (Size - i - 1);
- buf[i] = uint8_t(Value >> (index * 8));
- }
- EmitBytes(StringRef(buf, Size));
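+ // Byte-swap the value into the target's endianness, then emit its Size
+ // significant bytes: they sit at the start of the 8-byte buffer for
+ // little-endian targets and at the end for big-endian targets.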
+ const bool IsLittleEndian = Context.getAsmInfo()->isLittleEndian();
+ uint64_t Swapped = support::endian::byte_swap(
+ Value, IsLittleEndian ? support::little : support::big);
+ unsigned Index = IsLittleEndian ? 0 : 8 - Size;
+ emitBytes(StringRef(reinterpret_cast<char *>(&Swapped) + Index, Size));
}
/// EmitULEB128IntValue - Special case of EmitULEB128Value that avoids the
/// client having to pass in a MCExpr for constant integers.
-void MCStreamer::EmitULEB128IntValue(uint64_t Value, unsigned PadTo) {
+void MCStreamer::emitULEB128IntValue(uint64_t Value, unsigned PadTo) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
encodeULEB128(Value, OSE, PadTo);
- EmitBytes(OSE.str());
+ emitBytes(OSE.str());
}
/// EmitSLEB128IntValue - Special case of EmitSLEB128Value that avoids the
/// client having to pass in a MCExpr for constant integers.
-void MCStreamer::EmitSLEB128IntValue(int64_t Value) {
+void MCStreamer::emitSLEB128IntValue(int64_t Value) {
SmallString<128> Tmp;
raw_svector_ostream OSE(Tmp);
encodeSLEB128(Value, OSE);
- EmitBytes(OSE.str());
+ emitBytes(OSE.str());
}
-void MCStreamer::EmitValue(const MCExpr *Value, unsigned Size, SMLoc Loc) {
- EmitValueImpl(Value, Size, Loc);
+void MCStreamer::emitValue(const MCExpr *Value, unsigned Size, SMLoc Loc) {
+ emitValueImpl(Value, Size, Loc);
}
-void MCStreamer::EmitSymbolValue(const MCSymbol *Sym, unsigned Size,
+void MCStreamer::emitSymbolValue(const MCSymbol *Sym, unsigned Size,
bool IsSectionRelative) {
assert((!IsSectionRelative || Size == 4) &&
"SectionRelative value requires 4-bytes");
if (!IsSectionRelative)
- EmitValueImpl(MCSymbolRefExpr::create(Sym, getContext()), Size);
+ emitValueImpl(MCSymbolRefExpr::create(Sym, getContext()), Size);
else
EmitCOFFSecRel32(Sym, /*Offset=*/0);
}
-void MCStreamer::EmitDTPRel64Value(const MCExpr *Value) {
+void MCStreamer::emitDTPRel64Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
-void MCStreamer::EmitDTPRel32Value(const MCExpr *Value) {
+void MCStreamer::emitDTPRel32Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
-void MCStreamer::EmitTPRel64Value(const MCExpr *Value) {
+void MCStreamer::emitTPRel64Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
-void MCStreamer::EmitTPRel32Value(const MCExpr *Value) {
+void MCStreamer::emitTPRel32Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
-void MCStreamer::EmitGPRel64Value(const MCExpr *Value) {
+void MCStreamer::emitGPRel64Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
-void MCStreamer::EmitGPRel32Value(const MCExpr *Value) {
+void MCStreamer::emitGPRel32Value(const MCExpr *Value) {
report_fatal_error("unsupported directive in streamer");
}
@@ -205,9 +203,7 @@ void MCStreamer::emitFill(uint64_t NumBytes, uint8_t FillValue) {
}
/// The implementation in this class just redirects to emitFill.
-void MCStreamer::EmitZeros(uint64_t NumBytes) {
- emitFill(NumBytes, 0);
-}
+void MCStreamer::emitZeros(uint64_t NumBytes) { emitFill(NumBytes, 0); }
Expected<unsigned>
MCStreamer::tryEmitDwarfFileDirective(unsigned FileNo, StringRef Directory,
@@ -228,17 +224,16 @@ void MCStreamer::emitDwarfFile0Directive(StringRef Directory,
Source);
}
-void MCStreamer::EmitCFIBKeyFrame() {
+void MCStreamer::emitCFIBKeyFrame() {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
CurFrame->IsBKeyFrame = true;
}
-void MCStreamer::EmitDwarfLocDirective(unsigned FileNo, unsigned Line,
+void MCStreamer::emitDwarfLocDirective(unsigned FileNo, unsigned Line,
unsigned Column, unsigned Flags,
- unsigned Isa,
- unsigned Discriminator,
+ unsigned Isa, unsigned Discriminator,
StringRef FileName) {
getContext().setCurrentDwarfLoc(FileNo, Line, Column, Flags, Isa,
Discriminator);
@@ -293,7 +288,7 @@ bool MCStreamer::EmitCVInlineSiteIdDirective(unsigned FunctionId,
FunctionId, IAFunc, IAFile, IALine, IACol);
}
-void MCStreamer::EmitCVLocDirective(unsigned FunctionId, unsigned FileNo,
+void MCStreamer::emitCVLocDirective(unsigned FunctionId, unsigned FileNo,
unsigned Line, unsigned Column,
bool PrologueEnd, bool IsStmt,
StringRef FileName, SMLoc Loc) {}
@@ -320,11 +315,11 @@ bool MCStreamer::checkCVLocSection(unsigned FuncId, unsigned FileNo,
return true;
}
-void MCStreamer::EmitCVLinetableDirective(unsigned FunctionId,
+void MCStreamer::emitCVLinetableDirective(unsigned FunctionId,
const MCSymbol *Begin,
const MCSymbol *End) {}
-void MCStreamer::EmitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
+void MCStreamer::emitCVInlineLinetableDirective(unsigned PrimaryFunctionId,
unsigned SourceFileId,
unsigned SourceLineNum,
const MCSymbol *FnStartSym,
@@ -342,45 +337,45 @@ static void copyBytesForDefRange(SmallString<20> &BytePrefix,
memcpy(&BytePrefix[2], &DefRangeHeader, sizeof(T));
}
-void MCStreamer::EmitCVDefRangeDirective(
+void MCStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
StringRef FixedSizePortion) {}
-void MCStreamer::EmitCVDefRangeDirective(
+void MCStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeRegisterRelHeader DRHdr) {
SmallString<20> BytePrefix;
copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_REGISTER_REL, DRHdr);
- EmitCVDefRangeDirective(Ranges, BytePrefix);
+ emitCVDefRangeDirective(Ranges, BytePrefix);
}
-void MCStreamer::EmitCVDefRangeDirective(
+void MCStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeSubfieldRegisterHeader DRHdr) {
SmallString<20> BytePrefix;
copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_SUBFIELD_REGISTER,
DRHdr);
- EmitCVDefRangeDirective(Ranges, BytePrefix);
+ emitCVDefRangeDirective(Ranges, BytePrefix);
}
-void MCStreamer::EmitCVDefRangeDirective(
+void MCStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeRegisterHeader DRHdr) {
SmallString<20> BytePrefix;
copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_REGISTER, DRHdr);
- EmitCVDefRangeDirective(Ranges, BytePrefix);
+ emitCVDefRangeDirective(Ranges, BytePrefix);
}
-void MCStreamer::EmitCVDefRangeDirective(
+void MCStreamer::emitCVDefRangeDirective(
ArrayRef<std::pair<const MCSymbol *, const MCSymbol *>> Ranges,
codeview::DefRangeFramePointerRelHeader DRHdr) {
SmallString<20> BytePrefix;
copyBytesForDefRange(BytePrefix, codeview::S_DEFRANGE_FRAMEPOINTER_REL,
DRHdr);
- EmitCVDefRangeDirective(Ranges, BytePrefix);
+ emitCVDefRangeDirective(Ranges, BytePrefix);
}
-void MCStreamer::EmitEHSymAttributes(const MCSymbol *Symbol,
+void MCStreamer::emitEHSymAttributes(const MCSymbol *Symbol,
MCSymbol *EHSymbol) {
}
@@ -397,7 +392,7 @@ void MCStreamer::AssignFragment(MCSymbol *Symbol, MCFragment *Fragment) {
SymbolOrdering[Symbol] = 1 + SymbolOrdering.size();
}
-void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
+void MCStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
Symbol->redefineIfPossible();
if (!Symbol->isUndefined() || Symbol->isVariable())
@@ -415,18 +410,18 @@ void MCStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
TS->emitLabel(Symbol);
}
-void MCStreamer::EmitCFISections(bool EH, bool Debug) {
+void MCStreamer::emitCFISections(bool EH, bool Debug) {
assert(EH || Debug);
}
-void MCStreamer::EmitCFIStartProc(bool IsSimple, SMLoc Loc) {
+void MCStreamer::emitCFIStartProc(bool IsSimple, SMLoc Loc) {
if (hasUnfinishedDwarfFrameInfo())
return getContext().reportError(
Loc, "starting new .cfi frame before finishing the previous one");
MCDwarfFrameInfo Frame;
Frame.IsSimple = IsSimple;
- EmitCFIStartProcImpl(Frame);
+ emitCFIStartProcImpl(Frame);
const MCAsmInfo* MAI = Context.getAsmInfo();
if (MAI) {
@@ -441,32 +436,32 @@ void MCStreamer::EmitCFIStartProc(bool IsSimple, SMLoc Loc) {
DwarfFrameInfos.push_back(Frame);
}
-void MCStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+void MCStreamer::emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
}
-void MCStreamer::EmitCFIEndProc() {
+void MCStreamer::emitCFIEndProc() {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
- EmitCFIEndProcImpl(*CurFrame);
+ emitCFIEndProcImpl(*CurFrame);
}
-void MCStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+void MCStreamer::emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
// Put a dummy non-null value in Frame.End to mark that this frame has been
// closed.
Frame.End = (MCSymbol *)1;
}
-MCSymbol *MCStreamer::EmitCFILabel() {
+MCSymbol *MCStreamer::emitCFILabel() {
// Return a dummy non-null value so that label fields appear filled in when
// generating textual assembly.
return (MCSymbol *)1;
}
-void MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIDefCfa(int64_t Register, int64_t Offset) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
- MCCFIInstruction::createDefCfa(Label, Register, Offset);
+ MCCFIInstruction::cfiDefCfa(Label, Register, Offset);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
@@ -474,18 +469,18 @@ void MCStreamer::EmitCFIDefCfa(int64_t Register, int64_t Offset) {
CurFrame->CurrentCfaRegister = static_cast<unsigned>(Register);
}
-void MCStreamer::EmitCFIDefCfaOffset(int64_t Offset) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIDefCfaOffset(int64_t Offset) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
- MCCFIInstruction::createDefCfaOffset(Label, Offset);
+ MCCFIInstruction::cfiDefCfaOffset(Label, Offset);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIAdjustCfaOffset(int64_t Adjustment) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createAdjustCfaOffset(Label, Adjustment);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -494,8 +489,8 @@ void MCStreamer::EmitCFIAdjustCfaOffset(int64_t Adjustment) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIDefCfaRegister(int64_t Register) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIDefCfaRegister(int64_t Register) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createDefCfaRegister(Label, Register);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -505,8 +500,8 @@ void MCStreamer::EmitCFIDefCfaRegister(int64_t Register) {
CurFrame->CurrentCfaRegister = static_cast<unsigned>(Register);
}
-void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIOffset(int64_t Register, int64_t Offset) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createOffset(Label, Register, Offset);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -515,8 +510,8 @@ void MCStreamer::EmitCFIOffset(int64_t Register, int64_t Offset) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIRelOffset(int64_t Register, int64_t Offset) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createRelOffset(Label, Register, Offset);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -525,7 +520,7 @@ void MCStreamer::EmitCFIRelOffset(int64_t Register, int64_t Offset) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym,
+void MCStreamer::emitCFIPersonality(const MCSymbol *Sym,
unsigned Encoding) {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
@@ -534,7 +529,7 @@ void MCStreamer::EmitCFIPersonality(const MCSymbol *Sym,
CurFrame->PersonalityEncoding = Encoding;
}
-void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
+void MCStreamer::emitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
@@ -542,8 +537,8 @@ void MCStreamer::EmitCFILsda(const MCSymbol *Sym, unsigned Encoding) {
CurFrame->LsdaEncoding = Encoding;
}
-void MCStreamer::EmitCFIRememberState() {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIRememberState() {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction = MCCFIInstruction::createRememberState(Label);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
@@ -551,9 +546,9 @@ void MCStreamer::EmitCFIRememberState() {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIRestoreState() {
+void MCStreamer::emitCFIRestoreState() {
// FIXME: Error if there is no matching cfi_remember_state.
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction = MCCFIInstruction::createRestoreState(Label);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
@@ -561,8 +556,8 @@ void MCStreamer::EmitCFIRestoreState() {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFISameValue(int64_t Register) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFISameValue(int64_t Register) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createSameValue(Label, Register);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -571,8 +566,8 @@ void MCStreamer::EmitCFISameValue(int64_t Register) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIRestore(int64_t Register) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIRestore(int64_t Register) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createRestore(Label, Register);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -581,8 +576,8 @@ void MCStreamer::EmitCFIRestore(int64_t Register) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIEscape(StringRef Values) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIEscape(StringRef Values) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction = MCCFIInstruction::createEscape(Label, Values);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
@@ -590,8 +585,8 @@ void MCStreamer::EmitCFIEscape(StringRef Values) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIGnuArgsSize(int64_t Size) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createGnuArgsSize(Label, Size);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -600,15 +595,15 @@ void MCStreamer::EmitCFIGnuArgsSize(int64_t Size) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFISignalFrame() {
+void MCStreamer::emitCFISignalFrame() {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
CurFrame->IsSignalFrame = true;
}
-void MCStreamer::EmitCFIUndefined(int64_t Register) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIUndefined(int64_t Register) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createUndefined(Label, Register);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -617,8 +612,8 @@ void MCStreamer::EmitCFIUndefined(int64_t Register) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIRegister(int64_t Register1, int64_t Register2) {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createRegister(Label, Register1, Register2);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -627,8 +622,8 @@ void MCStreamer::EmitCFIRegister(int64_t Register1, int64_t Register2) {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIWindowSave() {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFIWindowSave() {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction =
MCCFIInstruction::createWindowSave(Label);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
@@ -637,8 +632,8 @@ void MCStreamer::EmitCFIWindowSave() {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFINegateRAState() {
- MCSymbol *Label = EmitCFILabel();
+void MCStreamer::emitCFINegateRAState() {
+ MCSymbol *Label = emitCFILabel();
MCCFIInstruction Instruction = MCCFIInstruction::createNegateRAState(Label);
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
@@ -646,7 +641,7 @@ void MCStreamer::EmitCFINegateRAState() {
CurFrame->Instructions.push_back(Instruction);
}
-void MCStreamer::EmitCFIReturnColumn(int64_t Register) {
+void MCStreamer::emitCFIReturnColumn(int64_t Register) {
MCDwarfFrameInfo *CurFrame = getCurrentDwarfFrameInfo();
if (!CurFrame)
return;
@@ -677,7 +672,7 @@ void MCStreamer::EmitWinCFIStartProc(const MCSymbol *Symbol, SMLoc Loc) {
getContext().reportError(
Loc, "Starting a function before ending the previous one!");
- MCSymbol *StartProc = EmitCFILabel();
+ MCSymbol *StartProc = emitCFILabel();
WinFrameInfos.emplace_back(
std::make_unique<WinEH::FrameInfo>(Symbol, StartProc));
@@ -692,7 +687,7 @@ void MCStreamer::EmitWinCFIEndProc(SMLoc Loc) {
if (CurFrame->ChainedParent)
getContext().reportError(Loc, "Not all chained regions terminated!");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
CurFrame->End = Label;
}
@@ -703,7 +698,7 @@ void MCStreamer::EmitWinCFIFuncletOrFuncEnd(SMLoc Loc) {
if (CurFrame->ChainedParent)
getContext().reportError(Loc, "Not all chained regions terminated!");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
CurFrame->FuncletOrFuncEnd = Label;
}
@@ -712,7 +707,7 @@ void MCStreamer::EmitWinCFIStartChained(SMLoc Loc) {
if (!CurFrame)
return;
- MCSymbol *StartProc = EmitCFILabel();
+ MCSymbol *StartProc = emitCFILabel();
WinFrameInfos.emplace_back(std::make_unique<WinEH::FrameInfo>(
CurFrame->Function, StartProc, CurFrame));
@@ -728,7 +723,7 @@ void MCStreamer::EmitWinCFIEndChained(SMLoc Loc) {
return getContext().reportError(
Loc, "End of a chained region outside a chained region!");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
CurFrame->End = Label;
CurrentWinFrameInfo = const_cast<WinEH::FrameInfo *>(CurFrame->ChainedParent);
@@ -784,10 +779,9 @@ static MCSection *getWinCFISection(MCContext &Context, unsigned *NextWinCFIID,
// GCC does, which is to make plain comdat selectany section named like
// ".[px]data$_Z3foov".
if (!Context.getAsmInfo()->hasCOFFAssociativeComdats()) {
- std::string SectionName =
- (MainCFISecCOFF->getSectionName() + "$" +
- TextSecCOFF->getSectionName().split('$').second)
- .str();
+ std::string SectionName = (MainCFISecCOFF->getName() + "$" +
+ TextSecCOFF->getName().split('$').second)
+ .str();
return Context.getCOFFSection(
SectionName,
MainCFISecCOFF->getCharacteristics() | COFF::IMAGE_SCN_LNK_COMDAT,
@@ -810,7 +804,7 @@ MCSection *MCStreamer::getAssociatedXDataSection(const MCSection *TextSec) {
TextSec);
}
-void MCStreamer::EmitSyntaxDirective() {}
+void MCStreamer::emitSyntaxDirective() {}
static unsigned encodeSEHRegNum(MCContext &Ctx, MCRegister Reg) {
return Ctx.getRegisterInfo()->getSEHRegNum(Reg);
@@ -821,7 +815,7 @@ void MCStreamer::EmitWinCFIPushReg(MCRegister Register, SMLoc Loc) {
if (!CurFrame)
return;
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
WinEH::Instruction Inst = Win64EH::Instruction::PushNonVol(
Label, encodeSEHRegNum(Context, Register));
@@ -842,7 +836,7 @@ void MCStreamer::EmitWinCFISetFrame(MCRegister Register, unsigned Offset,
return getContext().reportError(
Loc, "frame offset must be less than or equal to 240");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
WinEH::Instruction Inst = Win64EH::Instruction::SetFPReg(
Label, encodeSEHRegNum(getContext(), Register), Offset);
@@ -861,7 +855,7 @@ void MCStreamer::EmitWinCFIAllocStack(unsigned Size, SMLoc Loc) {
return getContext().reportError(
Loc, "stack allocation size is not a multiple of 8");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
WinEH::Instruction Inst = Win64EH::Instruction::Alloc(Label, Size);
CurFrame->Instructions.push_back(Inst);
@@ -877,7 +871,7 @@ void MCStreamer::EmitWinCFISaveReg(MCRegister Register, unsigned Offset,
return getContext().reportError(
Loc, "register save offset is not 8 byte aligned");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
WinEH::Instruction Inst = Win64EH::Instruction::SaveNonVol(
Label, encodeSEHRegNum(Context, Register), Offset);
@@ -892,7 +886,7 @@ void MCStreamer::EmitWinCFISaveXMM(MCRegister Register, unsigned Offset,
if (Offset & 0x0F)
return getContext().reportError(Loc, "offset is not a multiple of 16");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
WinEH::Instruction Inst = Win64EH::Instruction::SaveXMM(
Label, encodeSEHRegNum(Context, Register), Offset);
@@ -907,7 +901,7 @@ void MCStreamer::EmitWinCFIPushFrame(bool Code, SMLoc Loc) {
return getContext().reportError(
Loc, "If present, PushMachFrame must be the first UOP");
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
WinEH::Instruction Inst = Win64EH::Instruction::PushMachFrame(Label, Code);
CurFrame->Instructions.push_back(Inst);
@@ -918,7 +912,7 @@ void MCStreamer::EmitWinCFIEndProlog(SMLoc Loc) {
if (!CurFrame)
return;
- MCSymbol *Label = EmitCFILabel();
+ MCSymbol *Label = emitCFILabel();
CurFrame->PrologEnd = Label;
}
@@ -936,7 +930,7 @@ void MCStreamer::EmitCOFFImgRel32(MCSymbol const *Symbol, int64_t Offset) {}
/// EmitRawText - If this file is backed by an assembly streamer, this dumps
/// the specified string in the output .s file. This capability is
/// indicated by the hasRawTextSupport() predicate.
-void MCStreamer::EmitRawTextImpl(StringRef String) {
+void MCStreamer::emitRawTextImpl(StringRef String) {
// This is not llvm_unreachable for the sake of out of tree backend
// developers who may not have assembly streamers and should serve as a
// reminder to not accidentally call EmitRawText in the absence of such.
@@ -945,9 +939,9 @@ void MCStreamer::EmitRawTextImpl(StringRef String) {
"implementation)");
}
-void MCStreamer::EmitRawText(const Twine &T) {
+void MCStreamer::emitRawText(const Twine &T) {
SmallString<128> Str;
- EmitRawTextImpl(T.toStringRef(Str));
+ emitRawTextImpl(T.toStringRef(Str));
}
void MCStreamer::EmitWindowsUnwindTables() {
@@ -964,10 +958,10 @@ void MCStreamer::Finish() {
if (TS)
TS->finish();
- FinishImpl();
+ finishImpl();
}
-void MCStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+void MCStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
visitUsedExpr(*Value);
Symbol->setVariableValue(Value);
@@ -1012,7 +1006,7 @@ void MCStreamer::visitUsedExpr(const MCExpr &Expr) {
}
}
-void MCStreamer::EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &) {
+void MCStreamer::emitInstruction(const MCInst &Inst, const MCSubtargetInfo &) {
// Scan for values.
for (unsigned i = Inst.getNumOperands(); i--;)
if (Inst.getOperand(i).isExpr())
@@ -1028,14 +1022,14 @@ void MCStreamer::emitAbsoluteSymbolDiff(const MCSymbol *Hi, const MCSymbol *Lo,
const MCAsmInfo *MAI = Context.getAsmInfo();
if (!MAI->doesSetDirectiveSuppressReloc()) {
- EmitValue(Diff, Size);
+ emitValue(Diff, Size);
return;
}
// Otherwise, emit with .set (aka assignment).
MCSymbol *SetLabel = Context.createTempSymbol("set", true);
- EmitAssignment(SetLabel, Diff);
- EmitSymbolValue(SetLabel, Size);
+ emitAssignment(SetLabel, Diff);
+ emitSymbolValue(SetLabel, Size);
}
void MCStreamer::emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
@@ -1045,72 +1039,86 @@ void MCStreamer::emitAbsoluteSymbolDiffAsULEB128(const MCSymbol *Hi,
MCBinaryExpr::createSub(MCSymbolRefExpr::create(Hi, Context),
MCSymbolRefExpr::create(Lo, Context), Context);
- EmitULEB128Value(Diff);
+ emitULEB128Value(Diff);
}
-void MCStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {}
-void MCStreamer::EmitThumbFunc(MCSymbol *Func) {}
-void MCStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
+void MCStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {}
+void MCStreamer::emitThumbFunc(MCSymbol *Func) {}
+void MCStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {}
void MCStreamer::BeginCOFFSymbolDef(const MCSymbol *Symbol) {
llvm_unreachable("this directive only supported on COFF targets");
}
void MCStreamer::EndCOFFSymbolDef() {
llvm_unreachable("this directive only supported on COFF targets");
}
-void MCStreamer::EmitFileDirective(StringRef Filename) {}
+void MCStreamer::emitFileDirective(StringRef Filename) {}
void MCStreamer::EmitCOFFSymbolStorageClass(int StorageClass) {
llvm_unreachable("this directive only supported on COFF targets");
}
void MCStreamer::EmitCOFFSymbolType(int Type) {
llvm_unreachable("this directive only supported on COFF targets");
}
-void MCStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size,
+void MCStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym, uint64_t Size,
MCSymbol *CsectSym,
unsigned ByteAlign) {
llvm_unreachable("this directive only supported on XCOFF targets");
}
+
+void MCStreamer::emitXCOFFSymbolLinkageWithVisibility(MCSymbol *Symbol,
+ MCSymbolAttr Linkage,
+ MCSymbolAttr Visibility) {
+ llvm_unreachable("emitXCOFFSymbolLinkageWithVisibility is only supported on "
+ "XCOFF targets");
+}
+
+void MCStreamer::emitXCOFFRenameDirective(const MCSymbol *Name,
+ StringRef Rename) {
+ llvm_unreachable("emitXCOFFRenameDirective is only supported on "
+ "XCOFF targets");
+}
+
void MCStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {}
void MCStreamer::emitELFSymverDirective(StringRef AliasName,
const MCSymbol *Aliasee) {}
-void MCStreamer::EmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCStreamer::emitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {}
-void MCStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+void MCStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {}
-void MCStreamer::ChangeSection(MCSection *, const MCExpr *) {}
-void MCStreamer::EmitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
-void MCStreamer::EmitBytes(StringRef Data) {}
-void MCStreamer::EmitBinaryData(StringRef Data) { EmitBytes(Data); }
-void MCStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) {
+void MCStreamer::changeSection(MCSection *, const MCExpr *) {}
+void MCStreamer::emitWeakReference(MCSymbol *Alias, const MCSymbol *Symbol) {}
+void MCStreamer::emitBytes(StringRef Data) {}
+void MCStreamer::emitBinaryData(StringRef Data) { emitBytes(Data); }
+void MCStreamer::emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) {
visitUsedExpr(*Value);
}
-void MCStreamer::EmitULEB128Value(const MCExpr *Value) {}
-void MCStreamer::EmitSLEB128Value(const MCExpr *Value) {}
+void MCStreamer::emitULEB128Value(const MCExpr *Value) {}
+void MCStreamer::emitSLEB128Value(const MCExpr *Value) {}
void MCStreamer::emitFill(const MCExpr &NumBytes, uint64_t Value, SMLoc Loc) {}
void MCStreamer::emitFill(const MCExpr &NumValues, int64_t Size, int64_t Expr,
SMLoc Loc) {}
-void MCStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+void MCStreamer::emitValueToAlignment(unsigned ByteAlignment, int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {}
-void MCStreamer::EmitCodeAlignment(unsigned ByteAlignment,
+void MCStreamer::emitCodeAlignment(unsigned ByteAlignment,
unsigned MaxBytesToEmit) {}
void MCStreamer::emitValueToOffset(const MCExpr *Offset, unsigned char Value,
SMLoc Loc) {}
-void MCStreamer::EmitBundleAlignMode(unsigned AlignPow2) {}
-void MCStreamer::EmitBundleLock(bool AlignToEnd) {}
-void MCStreamer::FinishImpl() {}
-void MCStreamer::EmitBundleUnlock() {}
+void MCStreamer::emitBundleAlignMode(unsigned AlignPow2) {}
+void MCStreamer::emitBundleLock(bool AlignToEnd) {}
+void MCStreamer::finishImpl() {}
+void MCStreamer::emitBundleUnlock() {}
void MCStreamer::SwitchSection(MCSection *Section, const MCExpr *Subsection) {
assert(Section && "Cannot switch to a null section!");
MCSectionSubPair curSection = SectionStack.back().first;
SectionStack.back().second = curSection;
if (MCSectionSubPair(Section, Subsection) != curSection) {
- ChangeSection(Section, Subsection);
+ changeSection(Section, Subsection);
SectionStack.back().first = MCSectionSubPair(Section, Subsection);
assert(!Section->hasEnded() && "Section already ended");
MCSymbol *Sym = Section->getBeginSymbol();
if (Sym && !Sym->isInSection())
- EmitLabel(Sym);
+ emitLabel(Sym);
}
}
@@ -1122,11 +1130,84 @@ MCSymbol *MCStreamer::endSection(MCSection *Section) {
return Sym;
SwitchSection(Section);
- EmitLabel(Sym);
+ emitLabel(Sym);
return Sym;
}
-void MCStreamer::EmitVersionForTarget(const Triple &Target,
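+// Return the triple's OS version, raised to the platform's minimum supported
+// OS version when that minimum is higher.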
+static VersionTuple
+targetVersionOrMinimumSupportedOSVersion(const Triple &Target,
+ VersionTuple TargetVersion) {
+ VersionTuple Min = Target.getMinimumSupportedOSVersion();
+ return !Min.empty() && Min > TargetVersion ? Min : TargetVersion;
+}
+
+static MCVersionMinType
+getMachoVersionMinLoadCommandType(const Triple &Target) {
+ assert(Target.isOSDarwin() && "expected a darwin OS");
+ switch (Target.getOS()) {
+ case Triple::MacOSX:
+ case Triple::Darwin:
+ return MCVM_OSXVersionMin;
+ case Triple::IOS:
+ assert(!Target.isMacCatalystEnvironment() &&
+ "mac Catalyst should use LC_BUILD_VERSION");
+ return MCVM_IOSVersionMin;
+ case Triple::TvOS:
+ return MCVM_TvOSVersionMin;
+ case Triple::WatchOS:
+ return MCVM_WatchOSVersionMin;
+ default:
+ break;
+ }
+ llvm_unreachable("unexpected OS type");
+}
+
+static VersionTuple getMachoBuildVersionSupportedOS(const Triple &Target) {
+ assert(Target.isOSDarwin() && "expected a darwin OS");
+ switch (Target.getOS()) {
+ case Triple::MacOSX:
+ case Triple::Darwin:
+ return VersionTuple(10, 14);
+ case Triple::IOS:
+ // Mac Catalyst always uses the build version load command.
+ if (Target.isMacCatalystEnvironment())
+ return VersionTuple();
+ LLVM_FALLTHROUGH;
+ case Triple::TvOS:
+ return VersionTuple(12);
+ case Triple::WatchOS:
+ return VersionTuple(5);
+ default:
+ break;
+ }
+ llvm_unreachable("unexpected OS type");
+}
+
+static MachO::PlatformType
+getMachoBuildVersionPlatformType(const Triple &Target) {
+ assert(Target.isOSDarwin() && "expected a darwin OS");
+ switch (Target.getOS()) {
+ case Triple::MacOSX:
+ case Triple::Darwin:
+ return MachO::PLATFORM_MACOS;
+ case Triple::IOS:
+ if (Target.isMacCatalystEnvironment())
+ return MachO::PLATFORM_MACCATALYST;
+ return Target.isSimulatorEnvironment() ? MachO::PLATFORM_IOSSIMULATOR
+ : MachO::PLATFORM_IOS;
+ case Triple::TvOS:
+ return Target.isSimulatorEnvironment() ? MachO::PLATFORM_TVOSSIMULATOR
+ : MachO::PLATFORM_TVOS;
+ case Triple::WatchOS:
+ return Target.isSimulatorEnvironment() ? MachO::PLATFORM_WATCHOSSIMULATOR
+ : MachO::PLATFORM_WATCHOS;
+ default:
+ break;
+ }
+ llvm_unreachable("unexpected OS type");
+}
+
+void MCStreamer::emitVersionForTarget(const Triple &Target,
const VersionTuple &SDKVersion) {
if (!Target.isOSBinFormatMachO() || !Target.isOSDarwin())
return;
@@ -1134,33 +1215,37 @@ void MCStreamer::EmitVersionForTarget(const Triple &Target,
if (Target.getOSMajorVersion() == 0)
return;
- unsigned Major;
- unsigned Minor;
- unsigned Update;
- if (Target.isMacCatalystEnvironment()) {
- // Mac Catalyst always uses the build version load command.
+ unsigned Major = 0;
+ unsigned Minor = 0;
+ unsigned Update = 0;
+ switch (Target.getOS()) {
+ case Triple::MacOSX:
+ case Triple::Darwin:
+ Target.getMacOSXVersion(Major, Minor, Update);
+ break;
+ case Triple::IOS:
+ case Triple::TvOS:
Target.getiOSVersion(Major, Minor, Update);
- assert(Major && "A non-zero major version is expected");
- EmitBuildVersion(MachO::PLATFORM_MACCATALYST, Major, Minor, Update,
- SDKVersion);
- return;
- }
-
- MCVersionMinType VersionType;
- if (Target.isWatchOS()) {
- VersionType = MCVM_WatchOSVersionMin;
+ break;
+ case Triple::WatchOS:
Target.getWatchOSVersion(Major, Minor, Update);
- } else if (Target.isTvOS()) {
- VersionType = MCVM_TvOSVersionMin;
- Target.getiOSVersion(Major, Minor, Update);
- } else if (Target.isMacOSX()) {
- VersionType = MCVM_OSXVersionMin;
- if (!Target.getMacOSXVersion(Major, Minor, Update))
- Major = 0;
- } else {
- VersionType = MCVM_IOSVersionMin;
- Target.getiOSVersion(Major, Minor, Update);
+ break;
+ default:
+ llvm_unreachable("unexpected OS type");
}
- if (Major != 0)
- EmitVersionMin(VersionType, Major, Minor, Update, SDKVersion);
+ assert(Major != 0 && "A non-zero major version is expected");
+ auto LinkedTargetVersion = targetVersionOrMinimumSupportedOSVersion(
+ Target, VersionTuple(Major, Minor, Update));
+ auto BuildVersionOSVersion = getMachoBuildVersionSupportedOS(Target);
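+ // Emit the newer LC_BUILD_VERSION load command when the effective
+ // deployment target supports it; otherwise fall back to the legacy
+ // version-min load command.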
+ if (BuildVersionOSVersion.empty() ||
+ LinkedTargetVersion >= BuildVersionOSVersion)
+ return emitBuildVersion(getMachoBuildVersionPlatformType(Target),
+ LinkedTargetVersion.getMajor(),
+ *LinkedTargetVersion.getMinor(),
+ *LinkedTargetVersion.getSubminor(), SDKVersion);
+
+ emitVersionMin(getMachoVersionMinLoadCommandType(Target),
+ LinkedTargetVersion.getMajor(),
+ *LinkedTargetVersion.getMinor(),
+ *LinkedTargetVersion.getSubminor(), SDKVersion);
}
diff --git a/llvm/lib/MC/MCSubtargetInfo.cpp b/llvm/lib/MC/MCSubtargetInfo.cpp
index c8678df02bfd..1c187d616e4e 100644
--- a/llvm/lib/MC/MCSubtargetInfo.cpp
+++ b/llvm/lib/MC/MCSubtargetInfo.cpp
@@ -155,10 +155,8 @@ static FeatureBitset getFeatures(StringRef CPU, StringRef FS,
if (ProcDesc.empty() || ProcFeatures.empty())
return FeatureBitset();
- assert(std::is_sorted(std::begin(ProcDesc), std::end(ProcDesc)) &&
- "CPU table is not sorted");
- assert(std::is_sorted(std::begin(ProcFeatures), std::end(ProcFeatures)) &&
- "CPU features table is not sorted");
+ assert(llvm::is_sorted(ProcDesc) && "CPU table is not sorted");
+ assert(llvm::is_sorted(ProcFeatures) && "CPU features table is not sorted");
// Resulting bits
FeatureBitset Bits;
@@ -185,7 +183,7 @@ static FeatureBitset getFeatures(StringRef CPU, StringRef FS,
// Check for help
if (Feature == "+help")
Help(ProcDesc, ProcFeatures);
- else if (Feature == "+cpuHelp")
+ else if (Feature == "+cpuhelp")
cpuHelp(ProcDesc);
else
ApplyFeatureFlag(Bits, Feature, ProcFeatures);
@@ -206,15 +204,17 @@ void MCSubtargetInfo::setDefaultFeatures(StringRef CPU, StringRef FS) {
FeatureBits = getFeatures(CPU, FS, ProcDesc, ProcFeatures);
}
-MCSubtargetInfo::MCSubtargetInfo(
- const Triple &TT, StringRef C, StringRef FS,
- ArrayRef<SubtargetFeatureKV> PF, ArrayRef<SubtargetSubTypeKV> PD,
- const MCWriteProcResEntry *WPR,
- const MCWriteLatencyEntry *WL, const MCReadAdvanceEntry *RA,
- const InstrStage *IS, const unsigned *OC, const unsigned *FP)
- : TargetTriple(TT), CPU(C), ProcFeatures(PF), ProcDesc(PD),
- WriteProcResTable(WPR), WriteLatencyTable(WL),
- ReadAdvanceTable(RA), Stages(IS), OperandCycles(OC), ForwardingPaths(FP) {
+MCSubtargetInfo::MCSubtargetInfo(const Triple &TT, StringRef C, StringRef FS,
+ ArrayRef<SubtargetFeatureKV> PF,
+ ArrayRef<SubtargetSubTypeKV> PD,
+ const MCWriteProcResEntry *WPR,
+ const MCWriteLatencyEntry *WL,
+ const MCReadAdvanceEntry *RA,
+ const InstrStage *IS, const unsigned *OC,
+ const unsigned *FP)
+ : TargetTriple(TT), CPU(std::string(C)), ProcFeatures(PF), ProcDesc(PD),
+ WriteProcResTable(WPR), WriteLatencyTable(WL), ReadAdvanceTable(RA),
+ Stages(IS), OperandCycles(OC), ForwardingPaths(FP) {
InitMCProcessorInfo(CPU, FS);
}
@@ -288,7 +288,7 @@ bool MCSubtargetInfo::checkFeatures(StringRef FS) const {
}
const MCSchedModel &MCSubtargetInfo::getSchedModelForCPU(StringRef CPU) const {
- assert(std::is_sorted(ProcDesc.begin(), ProcDesc.end()) &&
+ assert(llvm::is_sorted(ProcDesc) &&
"Processor machine model table is not sorted");
// Find entry
@@ -337,6 +337,13 @@ unsigned MCSubtargetInfo::getMaxPrefetchIterationsAhead() const {
return UINT_MAX;
}
-unsigned MCSubtargetInfo::getMinPrefetchStride() const {
+bool MCSubtargetInfo::enableWritePrefetching() const {
+ return false;
+}
+
+unsigned MCSubtargetInfo::getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const {
return 1;
}
diff --git a/llvm/lib/MC/MCSymbolXCOFF.cpp b/llvm/lib/MC/MCSymbolXCOFF.cpp
new file mode 100644
index 000000000000..536153e5518b
--- /dev/null
+++ b/llvm/lib/MC/MCSymbolXCOFF.cpp
@@ -0,0 +1,39 @@
+//===- lib/MC/MCSymbolXCOFF.cpp - XCOFF Code Symbol Representation --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCSectionXCOFF.h"
+
+using namespace llvm;
+
+MCSectionXCOFF *MCSymbolXCOFF::getRepresentedCsect() const {
+ assert(RepresentedCsect &&
+ "Trying to get csect representation of this symbol but none was set.");
+ assert((!getName().equals(getUnqualifiedName()) ||
+ RepresentedCsect->getCSectType() == XCOFF::XTY_ER) &&
+ "Symbol does not represent a csect; MCSectionXCOFF that represents "
+ "the symbol should not be (but is) set.");
+ assert(getSymbolTableName().equals(RepresentedCsect->getSymbolTableName()) &&
+ "SymbolTableNames need to be the same for this symbol and its csect "
+ "representation.");
+ return RepresentedCsect;
+}
+
+void MCSymbolXCOFF::setRepresentedCsect(MCSectionXCOFF *C) {
+ assert(C && "Assigned csect should not be null.");
+ assert((!RepresentedCsect || RepresentedCsect == C) &&
+ "Trying to set a csect that doesn't match the one that"
+ "this symbol is already mapped to.");
+ assert((!getName().equals(getUnqualifiedName()) ||
+ C->getCSectType() == XCOFF::XTY_ER) &&
+ "Symbol does not represent a csect; can only set a MCSectionXCOFF "
+ "representation for a csect.");
+ assert(getSymbolTableName().equals(C->getSymbolTableName()) &&
+ "SymbolTableNames need to be the same for this symbol and its csect "
+ "representation.");
+ RepresentedCsect = C;
+}
diff --git a/llvm/lib/MC/MCTargetOptions.cpp b/llvm/lib/MC/MCTargetOptions.cpp
index 5848e3ecadbe..d35ef942d2db 100644
--- a/llvm/lib/MC/MCTargetOptions.cpp
+++ b/llvm/lib/MC/MCTargetOptions.cpp
@@ -16,8 +16,12 @@ MCTargetOptions::MCTargetOptions()
MCNoWarn(false), MCNoDeprecatedWarn(false), MCSaveTempLabels(false),
MCUseDwarfDirectory(false), MCIncrementalLinkerCompatible(false),
ShowMCEncoding(false), ShowMCInst(false), AsmVerbose(false),
- PreserveAsmComments(true) {}
+ PreserveAsmComments(true), Dwarf64(false) {}
StringRef MCTargetOptions::getABIName() const {
return ABIName;
}
+
+StringRef MCTargetOptions::getAssemblyLanguage() const {
+ return AssemblyLanguage;
+}
diff --git a/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
new file mode 100644
index 000000000000..38996f90006e
--- /dev/null
+++ b/llvm/lib/MC/MCTargetOptionsCommandFlags.cpp
@@ -0,0 +1,114 @@
+//===-- MCTargetOptionsCommandFlags.cpp ------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains machine code-specific flags that are shared between
+// different command line tools.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/MC/MCTargetOptionsCommandFlags.h"
+#include "llvm/MC/MCTargetOptions.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
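+// MCOPT declares a static pointer to the cl::opt backing a flag plus a
+// getter for its value; MCOPT_EXP additionally exposes a getter that returns
+// a value only when the flag was explicitly given on the command line.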
+#define MCOPT(TY, NAME) \
+ static cl::opt<TY> *NAME##View; \
+ TY llvm::mc::get##NAME() { \
+ assert(NAME##View && "RegisterMCTargetOptionsFlags not created."); \
+ return *NAME##View; \
+ }
+
+#define MCOPT_EXP(TY, NAME) \
+ MCOPT(TY, NAME) \
+ Optional<TY> llvm::mc::getExplicit##NAME() { \
+ if (NAME##View->getNumOccurrences()) { \
+ TY res = *NAME##View; \
+ return res; \
+ } \
+ return None; \
+ }
+
+MCOPT_EXP(bool, RelaxAll)
+MCOPT(bool, IncrementalLinkerCompatible)
+MCOPT(int, DwarfVersion)
+MCOPT(bool, Dwarf64)
+MCOPT(bool, ShowMCInst)
+MCOPT(bool, FatalWarnings)
+MCOPT(bool, NoWarn)
+MCOPT(bool, NoDeprecatedWarn)
+MCOPT(std::string, ABIName)
+
+llvm::mc::RegisterMCTargetOptionsFlags::RegisterMCTargetOptionsFlags() {
+#define MCBINDOPT(NAME) \
+ do { \
+ NAME##View = std::addressof(NAME); \
+ } while (0)
+
+ static cl::opt<bool> RelaxAll(
+ "mc-relax-all", cl::desc("When used with filetype=obj, relax all fixups "
+ "in the emitted object file"));
+ MCBINDOPT(RelaxAll);
+
+ static cl::opt<bool> IncrementalLinkerCompatible(
+ "incremental-linker-compatible",
+ cl::desc(
+ "When used with filetype=obj, "
+ "emit an object file which can be used with an incremental linker"));
+ MCBINDOPT(IncrementalLinkerCompatible);
+
+ static cl::opt<int> DwarfVersion("dwarf-version", cl::desc("Dwarf version"),
+ cl::init(0));
+ MCBINDOPT(DwarfVersion);
+
+ static cl::opt<bool> Dwarf64(
+ "dwarf64",
+ cl::desc("Generate debugging info in the 64-bit DWARF format"));
+ MCBINDOPT(Dwarf64);
+
+ static cl::opt<bool> ShowMCInst(
+ "asm-show-inst",
+ cl::desc("Emit internal instruction representation to assembly file"));
+ MCBINDOPT(ShowMCInst);
+
+ static cl::opt<bool> FatalWarnings("fatal-warnings",
+ cl::desc("Treat warnings as errors"));
+ MCBINDOPT(FatalWarnings);
+
+ static cl::opt<bool> NoWarn("no-warn", cl::desc("Suppress all warnings"));
+ static cl::alias NoWarnW("W", cl::desc("Alias for --no-warn"),
+ cl::aliasopt(NoWarn));
+ MCBINDOPT(NoWarn);
+
+ static cl::opt<bool> NoDeprecatedWarn(
+ "no-deprecated-warn", cl::desc("Suppress all deprecated warnings"));
+ MCBINDOPT(NoDeprecatedWarn);
+
+ static cl::opt<std::string> ABIName(
+ "target-abi", cl::Hidden,
+ cl::desc("The name of the ABI to be targeted from the backend."),
+ cl::init(""));
+ MCBINDOPT(ABIName);
+
+#undef MCBINDOPT
+}
+
+MCTargetOptions llvm::mc::InitMCTargetOptionsFromFlags() {
+ MCTargetOptions Options;
+ Options.MCRelaxAll = getRelaxAll();
+ Options.MCIncrementalLinkerCompatible = getIncrementalLinkerCompatible();
+ Options.Dwarf64 = getDwarf64();
+ Options.DwarfVersion = getDwarfVersion();
+ Options.ShowMCInst = getShowMCInst();
+ Options.ABIName = getABIName();
+ Options.MCFatalWarnings = getFatalWarnings();
+ Options.MCNoWarn = getNoWarn();
+ Options.MCNoDeprecatedWarn = getNoDeprecatedWarn();
+ return Options;
+}
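
The registration pattern above keeps the cl::opt instances local to this file: constructing mc::RegisterMCTargetOptionsFlags binds the static *View pointers, and InitMCTargetOptionsFromFlags() then snapshots the parsed values into an MCTargetOptions. A minimal sketch of how a command-line tool would consume it (the banner string and tool structure here are illustrative, not taken from an existing tool):

#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCTargetOptionsCommandFlags.h"
#include "llvm/Support/CommandLine.h"

// Constructing the registrar creates -mc-relax-all, -dwarf-version, -dwarf64,
// -asm-show-inst, -target-abi, etc. as options of this tool.
static llvm::mc::RegisterMCTargetOptionsFlags MCOptionsFlags;

int main(int argc, char **argv) {
  llvm::cl::ParseCommandLineOptions(argc, argv, "hypothetical MC tool\n");

  // Snapshot the parsed flags into an MCTargetOptions for the MC layer.
  llvm::MCTargetOptions Options = llvm::mc::InitMCTargetOptionsFromFlags();
  (void)Options; // hand off to backend/streamer construction as needed
  return 0;
}
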
diff --git a/llvm/lib/MC/MCWasmStreamer.cpp b/llvm/lib/MC/MCWasmStreamer.cpp
index e7e96ecbb3a0..bf8b142b355a 100644
--- a/llvm/lib/MC/MCWasmStreamer.cpp
+++ b/llvm/lib/MC/MCWasmStreamer.cpp
@@ -49,7 +49,7 @@ void MCWasmStreamer::mergeFragment(MCDataFragment *DF, MCDataFragment *EF) {
DF->getContents().append(EF->getContents().begin(), EF->getContents().end());
}
-void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+void MCWasmStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
@@ -57,7 +57,7 @@ void MCWasmStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
llvm_unreachable("invalid assembler flag!");
}
-void MCWasmStreamer::ChangeSection(MCSection *Section,
+void MCWasmStreamer::changeSection(MCSection *Section,
const MCExpr *Subsection) {
MCAssembler &Asm = getAssembler();
auto *SectionWasm = cast<MCSectionWasm>(Section);
@@ -65,11 +65,11 @@ void MCWasmStreamer::ChangeSection(MCSection *Section,
if (Grp)
Asm.registerSymbol(*Grp);
- this->MCObjectStreamer::ChangeSection(Section, Subsection);
+ this->MCObjectStreamer::changeSection(Section, Subsection);
Asm.registerSymbol(*Section->getBeginSymbol());
}
-void MCWasmStreamer::EmitWeakReference(MCSymbol *Alias,
+void MCWasmStreamer::emitWeakReference(MCSymbol *Alias,
const MCSymbol *Symbol) {
getAssembler().registerSymbol(*Symbol);
const MCExpr *Value = MCSymbolRefExpr::create(
@@ -77,7 +77,7 @@ void MCWasmStreamer::EmitWeakReference(MCSymbol *Alias,
Alias->setVariableValue(Value);
}
-bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
+bool MCWasmStreamer::emitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
assert(Attribute != MCSA_IndirectSymbol && "indirect symbols not supported");
auto *Symbol = cast<MCSymbolWasm>(S);
@@ -134,7 +134,7 @@ bool MCWasmStreamer::EmitSymbolAttribute(MCSymbol *S, MCSymbolAttr Attribute) {
return true;
}
-void MCWasmStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
+void MCWasmStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
llvm_unreachable("Common symbols are not yet implemented for Wasm");
}
@@ -143,34 +143,34 @@ void MCWasmStreamer::emitELFSize(MCSymbol *Symbol, const MCExpr *Value) {
cast<MCSymbolWasm>(Symbol)->setSize(Value);
}
-void MCWasmStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
+void MCWasmStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
llvm_unreachable("Local common symbols are not yet implemented for Wasm");
}
-void MCWasmStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+void MCWasmStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
- MCObjectStreamer::EmitValueImpl(Value, Size, Loc);
+ MCObjectStreamer::emitValueImpl(Value, Size, Loc);
}
-void MCWasmStreamer::EmitValueToAlignment(unsigned ByteAlignment, int64_t Value,
+void MCWasmStreamer::emitValueToAlignment(unsigned ByteAlignment, int64_t Value,
unsigned ValueSize,
unsigned MaxBytesToEmit) {
- MCObjectStreamer::EmitValueToAlignment(ByteAlignment, Value, ValueSize,
+ MCObjectStreamer::emitValueToAlignment(ByteAlignment, Value, ValueSize,
MaxBytesToEmit);
}
-void MCWasmStreamer::EmitIdent(StringRef IdentString) {
+void MCWasmStreamer::emitIdent(StringRef IdentString) {
// TODO(sbc): Add the ident section once we support mergable strings
// sections in the object format
}
-void MCWasmStreamer::EmitInstToFragment(const MCInst &Inst,
+void MCWasmStreamer::emitInstToFragment(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- this->MCObjectStreamer::EmitInstToFragment(Inst, STI);
+ this->MCObjectStreamer::emitInstToFragment(Inst, STI);
}
-void MCWasmStreamer::EmitInstToData(const MCInst &Inst,
+void MCWasmStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCAssembler &Assembler = getAssembler();
SmallVector<MCFixup, 4> Fixups;
@@ -191,10 +191,10 @@ void MCWasmStreamer::EmitInstToData(const MCInst &Inst,
DF->getContents().append(Code.begin(), Code.end());
}
-void MCWasmStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void MCWasmStreamer::finishImpl() {
+ emitFrames(nullptr);
- this->MCObjectStreamer::FinishImpl();
+ this->MCObjectStreamer::finishImpl();
}
MCStreamer *llvm::createWasmStreamer(MCContext &Context,
@@ -209,21 +209,21 @@ MCStreamer *llvm::createWasmStreamer(MCContext &Context,
return S;
}
-void MCWasmStreamer::EmitThumbFunc(MCSymbol *Func) {
+void MCWasmStreamer::emitThumbFunc(MCSymbol *Func) {
llvm_unreachable("Generic Wasm doesn't support this directive");
}
-void MCWasmStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+void MCWasmStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
llvm_unreachable("Wasm doesn't support this directive");
}
-void MCWasmStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void MCWasmStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
llvm_unreachable("Wasm doesn't support this directive");
}
-void MCWasmStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+void MCWasmStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("Wasm doesn't support this directive");
}
diff --git a/llvm/lib/MC/MCWin64EH.cpp b/llvm/lib/MC/MCWin64EH.cpp
index 4e9a29667097..ac288ca08c93 100644
--- a/llvm/lib/MC/MCWin64EH.cpp
+++ b/llvm/lib/MC/MCWin64EH.cpp
@@ -55,7 +55,7 @@ static void EmitAbsDifference(MCStreamer &Streamer, const MCSymbol *LHS,
const MCExpr *Diff =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(LHS, Context),
MCSymbolRefExpr::create(RHS, Context), Context);
- Streamer.EmitValue(Diff, 1);
+ Streamer.emitValue(Diff, 1);
}
static void EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
@@ -69,59 +69,59 @@ static void EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
case Win64EH::UOP_PushNonVol:
EmitAbsDifference(streamer, inst.Label, begin);
b2 |= (inst.Register & 0x0F) << 4;
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
break;
case Win64EH::UOP_AllocLarge:
EmitAbsDifference(streamer, inst.Label, begin);
if (inst.Offset > 512 * 1024 - 8) {
b2 |= 0x10;
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
w = inst.Offset & 0xFFF8;
- streamer.EmitIntValue(w, 2);
+ streamer.emitInt16(w);
w = inst.Offset >> 16;
} else {
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
w = inst.Offset >> 3;
}
- streamer.EmitIntValue(w, 2);
+ streamer.emitInt16(w);
break;
case Win64EH::UOP_AllocSmall:
b2 |= (((inst.Offset - 8) >> 3) & 0x0F) << 4;
EmitAbsDifference(streamer, inst.Label, begin);
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
break;
case Win64EH::UOP_SetFPReg:
EmitAbsDifference(streamer, inst.Label, begin);
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
break;
case Win64EH::UOP_SaveNonVol:
case Win64EH::UOP_SaveXMM128:
b2 |= (inst.Register & 0x0F) << 4;
EmitAbsDifference(streamer, inst.Label, begin);
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
w = inst.Offset >> 3;
if (inst.Operation == Win64EH::UOP_SaveXMM128)
w >>= 1;
- streamer.EmitIntValue(w, 2);
+ streamer.emitInt16(w);
break;
case Win64EH::UOP_SaveNonVolBig:
case Win64EH::UOP_SaveXMM128Big:
b2 |= (inst.Register & 0x0F) << 4;
EmitAbsDifference(streamer, inst.Label, begin);
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
if (inst.Operation == Win64EH::UOP_SaveXMM128Big)
w = inst.Offset & 0xFFF0;
else
w = inst.Offset & 0xFFF8;
- streamer.EmitIntValue(w, 2);
+ streamer.emitInt16(w);
w = inst.Offset >> 16;
- streamer.EmitIntValue(w, 2);
+ streamer.emitInt16(w);
break;
case Win64EH::UOP_PushMachFrame:
if (inst.Offset == 1)
b2 |= 0x10;
EmitAbsDifference(streamer, inst.Label, begin);
- streamer.EmitIntValue(b2, 1);
+ streamer.emitInt8(b2);
break;
}
}
@@ -136,17 +136,17 @@ static void EmitSymbolRefWithOfs(MCStreamer &streamer,
const MCSymbolRefExpr *BaseRefRel = MCSymbolRefExpr::create(Base,
MCSymbolRefExpr::VK_COFF_IMGREL32,
Context);
- streamer.EmitValue(MCBinaryExpr::createAdd(BaseRefRel, Ofs, Context), 4);
+ streamer.emitValue(MCBinaryExpr::createAdd(BaseRefRel, Ofs, Context), 4);
}
static void EmitRuntimeFunction(MCStreamer &streamer,
const WinEH::FrameInfo *info) {
MCContext &context = streamer.getContext();
- streamer.EmitValueToAlignment(4);
+ streamer.emitValueToAlignment(4);
EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
EmitSymbolRefWithOfs(streamer, info->Function, info->End);
- streamer.EmitValue(MCSymbolRefExpr::create(info->Symbol,
+ streamer.emitValue(MCSymbolRefExpr::create(info->Symbol,
MCSymbolRefExpr::VK_COFF_IMGREL32,
context), 4);
}
@@ -159,8 +159,8 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
MCContext &context = streamer.getContext();
MCSymbol *Label = context.createTempSymbol();
- streamer.EmitValueToAlignment(4);
- streamer.EmitLabel(Label);
+ streamer.emitValueToAlignment(4);
+ streamer.emitLabel(Label);
info->Symbol = Label;
// Upper 3 bits are the version number (currently 1).
@@ -173,15 +173,15 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
if (info->HandlesExceptions)
flags |= Win64EH::UNW_ExceptionHandler << 3;
}
- streamer.EmitIntValue(flags, 1);
+ streamer.emitInt8(flags);
if (info->PrologEnd)
EmitAbsDifference(streamer, info->PrologEnd, info->Begin);
else
- streamer.EmitIntValue(0, 1);
+ streamer.emitInt8(0);
uint8_t numCodes = CountOfUnwindCodes(info->Instructions);
- streamer.EmitIntValue(numCodes, 1);
+ streamer.emitInt8(numCodes);
uint8_t frame = 0;
if (info->LastFrameInst >= 0) {
@@ -189,7 +189,7 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
assert(frameInst.Operation == Win64EH::UOP_SetFPReg);
frame = (frameInst.Register & 0x0F) | (frameInst.Offset & 0xF0);
}
- streamer.EmitIntValue(frame, 1);
+ streamer.emitInt8(frame);
// Emit unwind instructions (in reverse order).
uint8_t numInst = info->Instructions.size();
@@ -204,21 +204,21 @@ static void EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
// the array will be one longer than indicated by the count of unwind codes
// field).
if (numCodes & 1) {
- streamer.EmitIntValue(0, 2);
+ streamer.emitInt16(0);
}
if (flags & (Win64EH::UNW_ChainInfo << 3))
EmitRuntimeFunction(streamer, info->ChainedParent);
else if (flags &
((Win64EH::UNW_TerminateHandler|Win64EH::UNW_ExceptionHandler) << 3))
- streamer.EmitValue(MCSymbolRefExpr::create(info->ExceptionHandler,
+ streamer.emitValue(MCSymbolRefExpr::create(info->ExceptionHandler,
MCSymbolRefExpr::VK_COFF_IMGREL32,
context), 4);
else if (numCodes == 0) {
// The minimum size of an UNWIND_INFO struct is 8 bytes. If we're not
// a chained unwind info, if there is no handler, and if there are fewer
// than 2 slots used in the unwind code array, we have to pad to 8 bytes.
- streamer.EmitIntValue(0, 4);
+ streamer.emitInt32(0);
}
}
@@ -337,121 +337,121 @@ static void ARM64EmitUnwindCode(MCStreamer &streamer, const MCSymbol *begin,
llvm_unreachable("Unsupported ARM64 unwind code");
case Win64EH::UOP_AllocSmall:
b = (inst.Offset >> 4) & 0x1F;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_AllocMedium: {
uint16_t hw = (inst.Offset >> 4) & 0x7FF;
b = 0xC0;
b |= (hw >> 8);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = hw & 0xFF;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
}
case Win64EH::UOP_AllocLarge: {
uint32_t w;
b = 0xE0;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
w = inst.Offset >> 4;
b = (w & 0x00FF0000) >> 16;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = (w & 0x0000FF00) >> 8;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = w & 0x000000FF;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
}
case Win64EH::UOP_SetFP:
b = 0xE1;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_AddFP:
b = 0xE2;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = (inst.Offset >> 3);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_Nop:
b = 0xE3;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFPLRX:
b = 0x80;
b |= ((inst.Offset - 1) >> 3) & 0x3F;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFPLR:
b = 0x40;
b |= (inst.Offset >> 3) & 0x3F;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveReg:
assert(inst.Register >= 19 && "Saved reg must be >= 19");
reg = inst.Register - 19;
b = 0xD0 | ((reg & 0xC) >> 2);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveRegX:
assert(inst.Register >= 19 && "Saved reg must be >= 19");
reg = inst.Register - 19;
b = 0xD4 | ((reg & 0x8) >> 3);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x7) << 5) | ((inst.Offset >> 3) - 1);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveRegP:
assert(inst.Register >= 19 && "Saved registers must be >= 19");
reg = inst.Register - 19;
b = 0xC8 | ((reg & 0xC) >> 2);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveRegPX:
assert(inst.Register >= 19 && "Saved registers must be >= 19");
reg = inst.Register - 19;
b = 0xCC | ((reg & 0xC) >> 2);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x3) << 6) | ((inst.Offset >> 3) - 1);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFReg:
assert(inst.Register >= 8 && "Saved dreg must be >= 8");
reg = inst.Register - 8;
b = 0xDC | ((reg & 0x4) >> 2);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFRegX:
assert(inst.Register >= 8 && "Saved dreg must be >= 8");
reg = inst.Register - 8;
b = 0xDE;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x7) << 5) | ((inst.Offset >> 3) - 1);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFRegP:
assert(inst.Register >= 8 && "Saved dregs must be >= 8");
reg = inst.Register - 8;
b = 0xD8 | ((reg & 0x4) >> 2);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x3) << 6) | (inst.Offset >> 3);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_SaveFRegPX:
assert(inst.Register >= 8 && "Saved dregs must be >= 8");
reg = inst.Register - 8;
b = 0xDA | ((reg & 0x4) >> 2);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
b = ((reg & 0x3) << 6) | ((inst.Offset >> 3) - 1);
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
case Win64EH::UOP_End:
b = 0xE4;
- streamer.EmitIntValue(b, 1);
+ streamer.emitInt8(b);
break;
}
}
@@ -498,8 +498,8 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
MCContext &context = streamer.getContext();
MCSymbol *Label = context.createTempSymbol();
- streamer.EmitValueToAlignment(4);
- streamer.EmitLabel(Label);
+ streamer.emitValueToAlignment(4);
+ streamer.emitLabel(Label);
info->Symbol = Label;
int64_t RawFuncLength;
@@ -585,7 +585,7 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
if (info->HandlesExceptions) // X
row1 |= 1 << 20;
row1 |= FuncLength & 0x3FFFF;
- streamer.EmitIntValue(row1, 4);
+ streamer.emitInt32(row1);
// Extended Code Words, Extended Epilog Count
if (ExtensionWord) {
@@ -597,7 +597,7 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
uint32_t row2 = 0x0;
row2 |= (CodeWords & 0xFF) << 16;
row2 |= (EpilogCount & 0xFFFF);
- streamer.EmitIntValue(row2, 4);
+ streamer.emitInt32(row2);
}
// Epilog Start Index, Epilog Start Offset
@@ -610,7 +610,7 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
EpilogOffset /= 4;
uint32_t row3 = EpilogOffset;
row3 |= (EpilogIndex & 0x3FF) << 22;
- streamer.EmitIntValue(row3, 4);
+ streamer.emitInt32(row3);
}
// Emit prolog unwind instructions (in reverse order).
@@ -633,10 +633,10 @@ static void ARM64EmitUnwindInfo(MCStreamer &streamer, WinEH::FrameInfo *info) {
int32_t BytesMod = CodeWords * 4 - TotalCodeBytes;
assert(BytesMod >= 0);
for (int i = 0; i < BytesMod; i++)
- streamer.EmitIntValue(0xE3, 1);
+ streamer.emitInt8(0xE3);
if (info->HandlesExceptions)
- streamer.EmitValue(
+ streamer.emitValue(
MCSymbolRefExpr::create(info->ExceptionHandler,
MCSymbolRefExpr::VK_COFF_IMGREL32, context),
4);
@@ -646,9 +646,9 @@ static void ARM64EmitRuntimeFunction(MCStreamer &streamer,
const WinEH::FrameInfo *info) {
MCContext &context = streamer.getContext();
- streamer.EmitValueToAlignment(4);
+ streamer.emitValueToAlignment(4);
EmitSymbolRefWithOfs(streamer, info->Function, info->Begin);
- streamer.EmitValue(MCSymbolRefExpr::create(info->Symbol,
+ streamer.emitValue(MCSymbolRefExpr::create(info->Symbol,
MCSymbolRefExpr::VK_COFF_IMGREL32,
context),
4);
diff --git a/llvm/lib/MC/MCWinCOFFStreamer.cpp b/llvm/lib/MC/MCWinCOFFStreamer.cpp
index c5a21312140b..d8fde4004d44 100644
--- a/llvm/lib/MC/MCWinCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCWinCOFFStreamer.cpp
@@ -48,7 +48,7 @@ MCWinCOFFStreamer::MCWinCOFFStreamer(MCContext &Context,
: MCObjectStreamer(Context, std::move(MAB), std::move(OW), std::move(CE)),
CurSymbol(nullptr) {}
-void MCWinCOFFStreamer::EmitInstToData(const MCInst &Inst,
+void MCWinCOFFStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCDataFragment *DF = getOrCreateDataFragment();
@@ -71,23 +71,23 @@ void MCWinCOFFStreamer::InitSections(bool NoExecStack) {
// This emulates the same behavior of GNU as. This makes it easier
// to compare the output as the major sections are in the same order.
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
- EmitCodeAlignment(4);
+ emitCodeAlignment(4);
SwitchSection(getContext().getObjectFileInfo()->getDataSection());
- EmitCodeAlignment(4);
+ emitCodeAlignment(4);
SwitchSection(getContext().getObjectFileInfo()->getBSSSection());
- EmitCodeAlignment(4);
+ emitCodeAlignment(4);
SwitchSection(getContext().getObjectFileInfo()->getTextSection());
}
-void MCWinCOFFStreamer::EmitLabel(MCSymbol *S, SMLoc Loc) {
+void MCWinCOFFStreamer::emitLabel(MCSymbol *S, SMLoc Loc) {
auto *Symbol = cast<MCSymbolCOFF>(S);
- MCObjectStreamer::EmitLabel(Symbol, Loc);
+ MCObjectStreamer::emitLabel(Symbol, Loc);
}
-void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
+void MCWinCOFFStreamer::emitAssemblerFlag(MCAssemblerFlag Flag) {
// Let the target do whatever target specific stuff it needs to do.
getAssembler().getBackend().handleAssemblerFlag(Flag);
@@ -103,11 +103,11 @@ void MCWinCOFFStreamer::EmitAssemblerFlag(MCAssemblerFlag Flag) {
}
}
-void MCWinCOFFStreamer::EmitThumbFunc(MCSymbol *Func) {
+void MCWinCOFFStreamer::emitThumbFunc(MCSymbol *Func) {
llvm_unreachable("not implemented");
}
-bool MCWinCOFFStreamer::EmitSymbolAttribute(MCSymbol *S,
+bool MCWinCOFFStreamer::emitSymbolAttribute(MCSymbol *S,
MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolCOFF>(S);
getAssembler().registerSymbol(*Symbol);
@@ -129,7 +129,7 @@ bool MCWinCOFFStreamer::EmitSymbolAttribute(MCSymbol *S,
return true;
}
-void MCWinCOFFStreamer::EmitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
+void MCWinCOFFStreamer::emitSymbolDesc(MCSymbol *Symbol, unsigned DescValue) {
llvm_unreachable("not implemented");
}
@@ -262,7 +262,7 @@ void MCWinCOFFStreamer::EmitCOFFImgRel32(const MCSymbol *Symbol,
DF->getContents().resize(DF->getContents().size() + 4, 0);
}
-void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
+void MCWinCOFFStreamer::emitCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
auto *Symbol = cast<MCSymbolCOFF>(S);
@@ -289,38 +289,38 @@ void MCWinCOFFStreamer::EmitCommonSymbol(MCSymbol *S, uint64_t Size,
PushSection();
SwitchSection(MFI->getDrectveSection());
- EmitBytes(Directive);
+ emitBytes(Directive);
PopSection();
}
}
-void MCWinCOFFStreamer::EmitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
+void MCWinCOFFStreamer::emitLocalCommonSymbol(MCSymbol *S, uint64_t Size,
unsigned ByteAlignment) {
auto *Symbol = cast<MCSymbolCOFF>(S);
MCSection *Section = getContext().getObjectFileInfo()->getBSSSection();
PushSection();
SwitchSection(Section);
- EmitValueToAlignment(ByteAlignment, 0, 1, 0);
- EmitLabel(Symbol);
+ emitValueToAlignment(ByteAlignment, 0, 1, 0);
+ emitLabel(Symbol);
Symbol->setExternal(false);
- EmitZeros(Size);
+ emitZeros(Size);
PopSection();
}
-void MCWinCOFFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void MCWinCOFFStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
llvm_unreachable("not implemented");
}
-void MCWinCOFFStreamer::EmitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
+void MCWinCOFFStreamer::emitTBSSSymbol(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment) {
llvm_unreachable("not implemented");
}
// TODO: Implement this if you want to emit .comment section in COFF obj files.
-void MCWinCOFFStreamer::EmitIdent(StringRef IdentString) {
+void MCWinCOFFStreamer::emitIdent(StringRef IdentString) {
llvm_unreachable("not implemented");
}
@@ -328,8 +328,35 @@ void MCWinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
llvm_unreachable("not implemented");
}
-void MCWinCOFFStreamer::FinishImpl() {
- MCObjectStreamer::FinishImpl();
+void MCWinCOFFStreamer::emitCGProfileEntry(const MCSymbolRefExpr *From,
+ const MCSymbolRefExpr *To,
+ uint64_t Count) {
+ // Ignore temporary symbols for now.
+ if (!From->getSymbol().isTemporary() && !To->getSymbol().isTemporary())
+ getAssembler().CGProfile.push_back({From, To, Count});
+}
+
+void MCWinCOFFStreamer::finalizeCGProfileEntry(const MCSymbolRefExpr *&SRE) {
+ const MCSymbol *S = &SRE->getSymbol();
+ bool Created;
+ getAssembler().registerSymbol(*S, &Created);
+ if (Created) {
+ cast<MCSymbolCOFF>(S)->setIsWeakExternal();
+ cast<MCSymbolCOFF>(S)->setExternal(true);
+ }
+}
+
+void MCWinCOFFStreamer::finalizeCGProfile() {
+ for (MCAssembler::CGProfileEntry &E : getAssembler().CGProfile) {
+ finalizeCGProfileEntry(E.From);
+ finalizeCGProfileEntry(E.To);
+ }
+}
+
+void MCWinCOFFStreamer::finishImpl() {
+ finalizeCGProfile();
+
+ MCObjectStreamer::finishImpl();
}
void MCWinCOFFStreamer::Error(const Twine &Msg) const {
diff --git a/llvm/lib/MC/MCXCOFFStreamer.cpp b/llvm/lib/MC/MCXCOFFStreamer.cpp
index 6efa167ced42..ec9e89fac416 100644
--- a/llvm/lib/MC/MCXCOFFStreamer.cpp
+++ b/llvm/lib/MC/MCXCOFFStreamer.cpp
@@ -10,12 +10,14 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/MC/MCXCOFFStreamer.h"
#include "llvm/BinaryFormat/XCOFF.h"
#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
-#include "llvm/MC/MCXCOFFStreamer.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -27,41 +29,73 @@ MCXCOFFStreamer::MCXCOFFStreamer(MCContext &Context,
: MCObjectStreamer(Context, std::move(MAB), std::move(OW),
std::move(Emitter)) {}
-bool MCXCOFFStreamer::EmitSymbolAttribute(MCSymbol *Sym,
+bool MCXCOFFStreamer::emitSymbolAttribute(MCSymbol *Sym,
MCSymbolAttr Attribute) {
auto *Symbol = cast<MCSymbolXCOFF>(Sym);
getAssembler().registerSymbol(*Symbol);
switch (Attribute) {
case MCSA_Global:
+ case MCSA_Extern:
Symbol->setStorageClass(XCOFF::C_EXT);
Symbol->setExternal(true);
break;
+ case MCSA_LGlobal:
+ Symbol->setStorageClass(XCOFF::C_HIDEXT);
+ Symbol->setExternal(true);
+ break;
+ case llvm::MCSA_Weak:
+ Symbol->setStorageClass(XCOFF::C_WEAKEXT);
+ Symbol->setExternal(true);
+ break;
+ case llvm::MCSA_Hidden:
+ Symbol->setVisibilityType(XCOFF::SYM_V_HIDDEN);
+ break;
+ case llvm::MCSA_Protected:
+ Symbol->setVisibilityType(XCOFF::SYM_V_PROTECTED);
+ break;
default:
report_fatal_error("Not implemented yet.");
}
return true;
}
-void MCXCOFFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void MCXCOFFStreamer::emitXCOFFSymbolLinkageWithVisibility(
+ MCSymbol *Symbol, MCSymbolAttr Linkage, MCSymbolAttr Visibility) {
+
+ emitSymbolAttribute(Symbol, Linkage);
+
+ // When the caller passes `MCSA_Invalid` for the visibility, do not emit one.
+ if (Visibility == MCSA_Invalid)
+ return;
+
+ emitSymbolAttribute(Symbol, Visibility);
+}
+
+void MCXCOFFStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
getAssembler().registerSymbol(*Symbol);
Symbol->setExternal(cast<MCSymbolXCOFF>(Symbol)->getStorageClass() !=
XCOFF::C_HIDEXT);
Symbol->setCommon(Size, ByteAlignment);
+ // The default csect alignment is 4, but common symbols carry an explicit
+ // alignment value, and we should honor it.
+ cast<MCSymbolXCOFF>(Symbol)->getRepresentedCsect()->setAlignment(
+ Align(ByteAlignment));
+
// Emit the alignment and storage for the variable to the section.
- EmitValueToAlignment(ByteAlignment);
- EmitZeros(Size);
+ emitValueToAlignment(ByteAlignment);
+ emitZeros(Size);
}
-void MCXCOFFStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void MCXCOFFStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
report_fatal_error("Zero fill not implemented for XCOFF.");
}
-void MCXCOFFStreamer::EmitInstToData(const MCInst &Inst,
+void MCXCOFFStreamer::emitInstToData(const MCInst &Inst,
const MCSubtargetInfo &STI) {
MCAssembler &Assembler = getAssembler();
SmallVector<MCFixup, 4> Fixups;
@@ -69,9 +103,15 @@ void MCXCOFFStreamer::EmitInstToData(const MCInst &Inst,
raw_svector_ostream VecOS(Code);
Assembler.getEmitter().encodeInstruction(Inst, VecOS, Fixups, STI);
- // TODO: Handle Fixups later
-
+ // Add the fixups and data.
MCDataFragment *DF = getOrCreateDataFragment(&STI);
+ const size_t ContentsSize = DF->getContents().size();
+ auto &DataFragmentFixups = DF->getFixups();
+ for (auto &Fixup : Fixups) {
+ Fixup.setOffset(Fixup.getOffset() + ContentsSize);
+ DataFragmentFixups.push_back(Fixup);
+ }
+
DF->setHasInstructions(STI);
DF->getContents().append(Code.begin(), Code.end());
}
@@ -88,9 +128,9 @@ MCStreamer *llvm::createXCOFFStreamer(MCContext &Context,
return S;
}
-void MCXCOFFStreamer::EmitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
+void MCXCOFFStreamer::emitXCOFFLocalCommonSymbol(MCSymbol *LabelSym,
uint64_t Size,
MCSymbol *CsectSym,
unsigned ByteAlignment) {
- EmitCommonSymbol(CsectSym, Size, ByteAlignment);
+ emitCommonSymbol(CsectSym, Size, ByteAlignment);
}
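
The fixup handling added to emitInstToData above follows a common rebasing pattern: the encoder reports fixup offsets relative to the start of the instruction it just emitted, so before the bytes are appended to a data fragment that may already hold earlier instructions, each offset is shifted by the fragment's current size. A self-contained sketch of that pattern (the types and names below are simplified stand-ins, not the MC classes):

#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

struct Fixup { uint64_t Offset; };

static void appendWithFixups(std::vector<char> &Fragment,
                             std::vector<Fixup> &FragmentFixups,
                             const std::vector<char> &Code,
                             std::vector<Fixup> InstFixups) {
  const size_t ContentsSize = Fragment.size();
  for (Fixup &F : InstFixups) {
    F.Offset += ContentsSize;      // rebase against the fragment start
    FragmentFixups.push_back(F);
  }
  Fragment.insert(Fragment.end(), Code.begin(), Code.end());
}

int main() {
  std::vector<char> Frag(8, 0);    // fragment already holds 8 bytes
  std::vector<Fixup> Fixups;
  appendWithFixups(Frag, Fixups, {0, 0, 0, 0} /* 4-byte insn */, {{2}});
  assert(Fixups[0].Offset == 10);  // 8 existing bytes + 2 within the insn
  return 0;
}
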
diff --git a/llvm/lib/MC/MachObjectWriter.cpp b/llvm/lib/MC/MachObjectWriter.cpp
index 9f6af981aca1..10ae27c2acc2 100644
--- a/llvm/lib/MC/MachObjectWriter.cpp
+++ b/llvm/lib/MC/MachObjectWriter.cpp
@@ -231,7 +231,7 @@ void MachObjectWriter::writeSection(const MCAsmLayout &Layout,
uint64_t Start = W.OS.tell();
(void) Start;
- writeWithPadding(Section.getSectionName(), 16);
+ writeWithPadding(Section.getName(), 16);
writeWithPadding(Section.getSegmentName(), 16);
if (is64Bit()) {
W.write<uint64_t>(VMAddr); // address
@@ -831,11 +831,11 @@ uint64_t MachObjectWriter::writeObject(MCAssembler &Asm,
SectionDataFileSize = std::max(SectionDataFileSize, Address + FileSize);
}
- // The section data is padded to 4 bytes.
+ // The section data is padded to the pointer size.
//
// FIXME: Is this machine dependent?
unsigned SectionDataPadding =
- offsetToAlignment(SectionDataFileSize, Align(4));
+ offsetToAlignment(SectionDataFileSize, is64Bit() ? Align(8) : Align(4));
SectionDataFileSize += SectionDataPadding;
// Write the prolog, starting with the header and load command...
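
The padding computed above is simply the distance from the current file size up to the next aligned boundary (8 bytes for 64-bit Mach-O after this change, 4 bytes otherwise). A small standalone illustration of that arithmetic (a sketch, not the llvm::offsetToAlignment helper itself):

#include <cassert>
#include <cstdint>

static uint64_t offsetToAlignmentSketch(uint64_t Value, uint64_t Align) {
  // Bytes to add so that Value becomes a multiple of Align.
  return (Align - (Value % Align)) % Align;
}

int main() {
  assert(offsetToAlignmentSketch(30, 4) == 2); // 30 -> 32
  assert(offsetToAlignmentSketch(30, 8) == 2); // 30 -> 32
  assert(offsetToAlignmentSketch(32, 8) == 0); // already aligned
  return 0;
}
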
diff --git a/llvm/lib/MC/SubtargetFeature.cpp b/llvm/lib/MC/SubtargetFeature.cpp
index c4dd77359b24..3155adcf2674 100644
--- a/llvm/lib/MC/SubtargetFeature.cpp
+++ b/llvm/lib/MC/SubtargetFeature.cpp
@@ -33,7 +33,9 @@ using namespace llvm;
void SubtargetFeatures::Split(std::vector<std::string> &V, StringRef S) {
SmallVector<StringRef, 3> Tmp;
S.split(Tmp, ',', -1, false /* KeepEmpty */);
- V.assign(Tmp.begin(), Tmp.end());
+ V.reserve(Tmp.size());
+ for (StringRef T : Tmp)
+ V.push_back(std::string(T));
}
void SubtargetFeatures::AddFeature(StringRef String, bool Enable) {
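
The change above only makes the StringRef-to-std::string conversion explicit; the splitting behaviour itself is unchanged (comma-separated, empty pieces dropped). A standalone sketch of that behaviour using only the standard library (not the LLVM helpers):

#include <iostream>
#include <string>
#include <vector>

static std::vector<std::string> splitFeatures(const std::string &S) {
  std::vector<std::string> Out;
  size_t Start = 0;
  while (Start <= S.size()) {
    size_t Comma = S.find(',', Start);
    if (Comma == std::string::npos)
      Comma = S.size();
    if (Comma > Start)                       // KeepEmpty == false
      Out.push_back(S.substr(Start, Comma - Start));
    Start = Comma + 1;
  }
  return Out;
}

int main() {
  for (const std::string &F : splitFeatures("+sse2,,-avx"))
    std::cout << F << "\n";                  // prints "+sse2" then "-avx"
  return 0;
}
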
diff --git a/llvm/lib/MC/WasmObjectWriter.cpp b/llvm/lib/MC/WasmObjectWriter.cpp
index 321f93d76092..f51d908c53e1 100644
--- a/llvm/lib/MC/WasmObjectWriter.cpp
+++ b/llvm/lib/MC/WasmObjectWriter.cpp
@@ -27,6 +27,7 @@
#include "llvm/MC/MCWasmObjectWriter.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/StringSaver.h"
@@ -107,7 +108,7 @@ struct WasmDataSegment {
MCSectionWasm *Section;
StringRef Name;
uint32_t InitFlags;
- uint32_t Offset;
+ uint64_t Offset;
uint32_t Alignment;
uint32_t LinkerFlags;
SmallVector<char, 4> Data;
@@ -152,7 +153,7 @@ struct WasmRelocationEntry {
void print(raw_ostream &Out) const {
Out << wasm::relocTypetoString(Type) << " Off=" << Offset
<< ", Sym=" << *Symbol << ", Addend=" << Addend
- << ", FixupSection=" << FixupSection->getSectionName();
+ << ", FixupSection=" << FixupSection->getName();
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -184,31 +185,37 @@ raw_ostream &operator<<(raw_ostream &OS, const WasmRelocationEntry &Rel) {
// Write X as an (unsigned) LEB value at offset Offset in Stream, padded
// to allow patching.
-static void writePatchableLEB(raw_pwrite_stream &Stream, uint32_t X,
- uint64_t Offset) {
- uint8_t Buffer[5];
- unsigned SizeLen = encodeULEB128(X, Buffer, 5);
- assert(SizeLen == 5);
+template <int W>
+void writePatchableLEB(raw_pwrite_stream &Stream, uint64_t X, uint64_t Offset) {
+ uint8_t Buffer[W];
+ unsigned SizeLen = encodeULEB128(X, Buffer, W);
+ assert(SizeLen == W);
Stream.pwrite((char *)Buffer, SizeLen, Offset);
}
// Write X as a signed LEB value at offset Offset in Stream, padded
// to allow patching.
-static void writePatchableSLEB(raw_pwrite_stream &Stream, int32_t X,
- uint64_t Offset) {
- uint8_t Buffer[5];
- unsigned SizeLen = encodeSLEB128(X, Buffer, 5);
- assert(SizeLen == 5);
+template <int W>
+void writePatchableSLEB(raw_pwrite_stream &Stream, int64_t X, uint64_t Offset) {
+ uint8_t Buffer[W];
+ unsigned SizeLen = encodeSLEB128(X, Buffer, W);
+ assert(SizeLen == W);
Stream.pwrite((char *)Buffer, SizeLen, Offset);
}
// Write X as a plain integer value at offset Offset in Stream.
-static void writeI32(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
+static void patchI32(raw_pwrite_stream &Stream, uint32_t X, uint64_t Offset) {
uint8_t Buffer[4];
support::endian::write32le(Buffer, X);
Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
}
+static void patchI64(raw_pwrite_stream &Stream, uint64_t X, uint64_t Offset) {
+ uint8_t Buffer[8];
+ support::endian::write64le(Buffer, X);
+ Stream.pwrite((char *)Buffer, sizeof(Buffer), Offset);
+}
+
class WasmObjectWriter : public MCObjectWriter {
support::endian::Writer W;
@@ -217,11 +224,8 @@ class WasmObjectWriter : public MCObjectWriter {
// Relocations for fixing up references in the code section.
std::vector<WasmRelocationEntry> CodeRelocations;
- uint32_t CodeSectionIndex;
-
// Relocations for fixing up references in the data section.
std::vector<WasmRelocationEntry> DataRelocations;
- uint32_t DataSectionIndex;
// Index values to use for fixing up call_indirect type indices.
// Maps function symbols to the index of the type of the function
@@ -307,19 +311,32 @@ private:
W.OS << Str;
}
+ void writeI32(int32_t val) {
+ char Buffer[4];
+ support::endian::write32le(Buffer, val);
+ W.OS.write(Buffer, sizeof(Buffer));
+ }
+
+ void writeI64(int64_t val) {
+ char Buffer[8];
+ support::endian::write64le(Buffer, val);
+ W.OS.write(Buffer, sizeof(Buffer));
+ }
+
void writeValueType(wasm::ValType Ty) { W.OS << static_cast<char>(Ty); }
void writeTypeSection(ArrayRef<WasmSignature> Signatures);
- void writeImportSection(ArrayRef<wasm::WasmImport> Imports, uint32_t DataSize,
+ void writeImportSection(ArrayRef<wasm::WasmImport> Imports, uint64_t DataSize,
uint32_t NumElements);
void writeFunctionSection(ArrayRef<WasmFunction> Functions);
void writeExportSection(ArrayRef<wasm::WasmExport> Exports);
void writeElemSection(ArrayRef<uint32_t> TableElems);
void writeDataCountSection();
- void writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
- ArrayRef<WasmFunction> Functions);
- void writeDataSection();
+ uint32_t writeCodeSection(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ ArrayRef<WasmFunction> Functions);
+ uint32_t writeDataSection(const MCAsmLayout &Layout);
void writeEventSection(ArrayRef<wasm::WasmEventType> Events);
+ void writeGlobalSection(ArrayRef<wasm::WasmGlobal> Globals);
void writeRelocSection(uint32_t SectionIndex, StringRef Name,
std::vector<WasmRelocationEntry> &Relocations);
void writeLinkingMetaDataSection(
@@ -333,9 +350,10 @@ private:
updateCustomSectionRelocations(const SmallVector<WasmFunction, 4> &Functions,
const MCAsmLayout &Layout);
- uint32_t getProvisionalValue(const WasmRelocationEntry &RelEntry);
+ uint64_t getProvisionalValue(const WasmRelocationEntry &RelEntry,
+ const MCAsmLayout &Layout);
void applyRelocations(ArrayRef<WasmRelocationEntry> Relocations,
- uint64_t ContentsOffset);
+ uint64_t ContentsOffset, const MCAsmLayout &Layout);
uint32_t getRelocationIndexValue(const WasmRelocationEntry &RelEntry);
uint32_t getFunctionType(const MCSymbolWasm &Symbol);
@@ -396,8 +414,8 @@ void WasmObjectWriter::endSection(SectionBookkeeping &Section) {
// Write the final section size to the payload_len field, which follows
// the section id byte.
- writePatchableLEB(static_cast<raw_pwrite_stream &>(W.OS), Size,
- Section.SizeOffset);
+ writePatchableLEB<5>(static_cast<raw_pwrite_stream &>(W.OS), Size,
+ Section.SizeOffset);
}
// Emit the Wasm header.
@@ -417,7 +435,7 @@ void WasmObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
auto Pair = SectionFunctions.insert(std::make_pair(&Sec, &S));
if (!Pair.second)
report_fatal_error("section already has a defining function: " +
- Sec.getSectionName());
+ Sec.getName());
}
}
}
@@ -436,10 +454,6 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
uint64_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
MCContext &Ctx = Asm.getContext();
- // The .init_array isn't translated as data, so don't do relocations in it.
- if (FixupSection.getSectionName().startswith(".init_array"))
- return;
-
if (const MCSymbolRefExpr *RefB = Target.getSymB()) {
// To get here the A - B expression must have failed evaluateAsRelocatable.
// This means either A or B must be undefined and in WebAssembly we can't
@@ -456,11 +470,17 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
const MCSymbolRefExpr *RefA = Target.getSymA();
const auto *SymA = cast<MCSymbolWasm>(&RefA->getSymbol());
+ // The .init_array isn't translated as data, so don't do relocations in it.
+ if (FixupSection.getName().startswith(".init_array")) {
+ SymA->setUsedInInitArray();
+ return;
+ }
+
if (SymA->isVariable()) {
const MCExpr *Expr = SymA->getVariableValue();
- const auto *Inner = cast<MCSymbolRefExpr>(Expr);
- if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF)
- llvm_unreachable("weakref used in reloc not yet implemented");
+ if (const auto *Inner = dyn_cast<MCSymbolRefExpr>(Expr))
+ if (Inner->getKind() == MCSymbolRefExpr::VK_WEAKREF)
+ llvm_unreachable("weakref used in reloc not yet implemented");
}
// Put any constant offset in an addend. Offsets can be negative, and
@@ -519,23 +539,16 @@ void WasmObjectWriter::recordRelocation(MCAssembler &Asm,
}
}
-static const MCSymbolWasm *resolveSymbol(const MCSymbolWasm &Symbol) {
- const MCSymbolWasm* Ret = &Symbol;
- while (Ret->isVariable()) {
- const MCExpr *Expr = Ret->getVariableValue();
- auto *Inner = cast<MCSymbolRefExpr>(Expr);
- Ret = cast<MCSymbolWasm>(&Inner->getSymbol());
- }
- return Ret;
-}
-
// Compute a value to write into the code at the location covered
// by RelEntry. This value isn't used by the static linker; it just serves
// to make the object format more readable and more likely to be directly
// useable.
-uint32_t
-WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
- if (RelEntry.Type == wasm::R_WASM_GLOBAL_INDEX_LEB && !RelEntry.Symbol->isGlobal()) {
+uint64_t
+WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry,
+ const MCAsmLayout &Layout) {
+ if ((RelEntry.Type == wasm::R_WASM_GLOBAL_INDEX_LEB ||
+ RelEntry.Type == wasm::R_WASM_GLOBAL_INDEX_I32) &&
+ !RelEntry.Symbol->isGlobal()) {
assert(GOTIndices.count(RelEntry.Symbol) > 0 && "symbol not found in GOT index space");
return GOTIndices[RelEntry.Symbol];
}
@@ -545,15 +558,20 @@ WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
case wasm::R_WASM_TABLE_INDEX_SLEB:
case wasm::R_WASM_TABLE_INDEX_I32: {
// Provisional value is table address of the resolved symbol itself
- const MCSymbolWasm *Sym = resolveSymbol(*RelEntry.Symbol);
- assert(Sym->isFunction());
- return TableIndices[Sym];
+ const MCSymbolWasm *Base =
+ cast<MCSymbolWasm>(Layout.getBaseSymbol(*RelEntry.Symbol));
+ assert(Base->isFunction());
+ if (RelEntry.Type == wasm::R_WASM_TABLE_INDEX_REL_SLEB)
+ return TableIndices[Base] - InitialTableOffset;
+ else
+ return TableIndices[Base];
}
case wasm::R_WASM_TYPE_INDEX_LEB:
// Provisional value is same as the index
return getRelocationIndexValue(RelEntry);
case wasm::R_WASM_FUNCTION_INDEX_LEB:
case wasm::R_WASM_GLOBAL_INDEX_LEB:
+ case wasm::R_WASM_GLOBAL_INDEX_I32:
case wasm::R_WASM_EVENT_INDEX_LEB:
// Provisional value is function/global/event Wasm index
assert(WasmIndices.count(RelEntry.Symbol) > 0 && "symbol not found in wasm index space");
@@ -565,15 +583,20 @@ WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
return Section.getSectionOffset() + RelEntry.Addend;
}
case wasm::R_WASM_MEMORY_ADDR_LEB:
- case wasm::R_WASM_MEMORY_ADDR_I32:
+ case wasm::R_WASM_MEMORY_ADDR_LEB64:
+ case wasm::R_WASM_MEMORY_ADDR_SLEB:
+ case wasm::R_WASM_MEMORY_ADDR_SLEB64:
case wasm::R_WASM_MEMORY_ADDR_REL_SLEB:
- case wasm::R_WASM_MEMORY_ADDR_SLEB: {
+ case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64:
+ case wasm::R_WASM_MEMORY_ADDR_I32:
+ case wasm::R_WASM_MEMORY_ADDR_I64: {
// Provisional value is address of the global
- const MCSymbolWasm *Sym = resolveSymbol(*RelEntry.Symbol);
+ const MCSymbolWasm *Base =
+ cast<MCSymbolWasm>(Layout.getBaseSymbol(*RelEntry.Symbol));
// For undefined symbols, use zero
- if (!Sym->isDefined())
+ if (!Base->isDefined())
return 0;
- const wasm::WasmDataReference &Ref = DataLocations[Sym];
+ const wasm::WasmDataReference &Ref = DataLocations[Base];
const WasmDataSegment &Segment = DataSegments[Ref.Segment];
// Ignore overflow. LLVM allows address arithmetic to silently wrap.
return Segment.Offset + Ref.Offset + RelEntry.Addend;
@@ -585,7 +608,7 @@ WasmObjectWriter::getProvisionalValue(const WasmRelocationEntry &RelEntry) {
static void addData(SmallVectorImpl<char> &DataBytes,
MCSectionWasm &DataSection) {
- LLVM_DEBUG(errs() << "addData: " << DataSection.getSectionName() << "\n");
+ LLVM_DEBUG(errs() << "addData: " << DataSection.getName() << "\n");
DataBytes.resize(alignTo(DataBytes.size(), DataSection.getAlignment()));
@@ -636,7 +659,8 @@ WasmObjectWriter::getRelocationIndexValue(const WasmRelocationEntry &RelEntry) {
// Apply the portions of the relocation records that we can handle ourselves
// directly.
void WasmObjectWriter::applyRelocations(
- ArrayRef<WasmRelocationEntry> Relocations, uint64_t ContentsOffset) {
+ ArrayRef<WasmRelocationEntry> Relocations, uint64_t ContentsOffset,
+ const MCAsmLayout &Layout) {
auto &Stream = static_cast<raw_pwrite_stream &>(W.OS);
for (const WasmRelocationEntry &RelEntry : Relocations) {
uint64_t Offset = ContentsOffset +
@@ -644,7 +668,7 @@ void WasmObjectWriter::applyRelocations(
RelEntry.Offset;
LLVM_DEBUG(dbgs() << "applyRelocation: " << RelEntry << "\n");
- uint32_t Value = getProvisionalValue(RelEntry);
+ auto Value = getProvisionalValue(RelEntry, Layout);
switch (RelEntry.Type) {
case wasm::R_WASM_FUNCTION_INDEX_LEB:
@@ -652,19 +676,30 @@ void WasmObjectWriter::applyRelocations(
case wasm::R_WASM_GLOBAL_INDEX_LEB:
case wasm::R_WASM_MEMORY_ADDR_LEB:
case wasm::R_WASM_EVENT_INDEX_LEB:
- writePatchableLEB(Stream, Value, Offset);
+ writePatchableLEB<5>(Stream, Value, Offset);
+ break;
+ case wasm::R_WASM_MEMORY_ADDR_LEB64:
+ writePatchableLEB<10>(Stream, Value, Offset);
break;
case wasm::R_WASM_TABLE_INDEX_I32:
case wasm::R_WASM_MEMORY_ADDR_I32:
case wasm::R_WASM_FUNCTION_OFFSET_I32:
case wasm::R_WASM_SECTION_OFFSET_I32:
- writeI32(Stream, Value, Offset);
+ case wasm::R_WASM_GLOBAL_INDEX_I32:
+ patchI32(Stream, Value, Offset);
+ break;
+ case wasm::R_WASM_MEMORY_ADDR_I64:
+ patchI64(Stream, Value, Offset);
break;
case wasm::R_WASM_TABLE_INDEX_SLEB:
case wasm::R_WASM_TABLE_INDEX_REL_SLEB:
case wasm::R_WASM_MEMORY_ADDR_SLEB:
case wasm::R_WASM_MEMORY_ADDR_REL_SLEB:
- writePatchableSLEB(Stream, Value, Offset);
+ writePatchableSLEB<5>(Stream, Value, Offset);
+ break;
+ case wasm::R_WASM_MEMORY_ADDR_SLEB64:
+ case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64:
+ writePatchableSLEB<10>(Stream, Value, Offset);
break;
default:
llvm_unreachable("invalid relocation type");
@@ -695,12 +730,12 @@ void WasmObjectWriter::writeTypeSection(ArrayRef<WasmSignature> Signatures) {
}
void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
- uint32_t DataSize,
+ uint64_t DataSize,
uint32_t NumElements) {
if (Imports.empty())
return;
- uint32_t NumPages = (DataSize + wasm::WasmPageSize - 1) / wasm::WasmPageSize;
+ uint64_t NumPages = (DataSize + wasm::WasmPageSize - 1) / wasm::WasmPageSize;
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_IMPORT);
@@ -720,8 +755,8 @@ void WasmObjectWriter::writeImportSection(ArrayRef<wasm::WasmImport> Imports,
W.OS << char(Import.Global.Mutable ? 1 : 0);
break;
case wasm::WASM_EXTERNAL_MEMORY:
- encodeULEB128(0, W.OS); // flags
- encodeULEB128(NumPages, W.OS); // initial
+ encodeULEB128(Import.Memory.Flags, W.OS);
+ encodeULEB128(NumPages, W.OS); // initial
break;
case wasm::WASM_EXTERNAL_TABLE:
W.OS << char(Import.Table.ElemType);
@@ -770,6 +805,43 @@ void WasmObjectWriter::writeEventSection(ArrayRef<wasm::WasmEventType> Events) {
endSection(Section);
}
+void WasmObjectWriter::writeGlobalSection(ArrayRef<wasm::WasmGlobal> Globals) {
+ if (Globals.empty())
+ return;
+
+ SectionBookkeeping Section;
+ startSection(Section, wasm::WASM_SEC_GLOBAL);
+
+ encodeULEB128(Globals.size(), W.OS);
+ for (const wasm::WasmGlobal &Global : Globals) {
+ encodeULEB128(Global.Type.Type, W.OS);
+ W.OS << char(Global.Type.Mutable);
+ W.OS << char(Global.InitExpr.Opcode);
+ switch (Global.Type.Type) {
+ case wasm::WASM_TYPE_I32:
+ encodeSLEB128(0, W.OS);
+ break;
+ case wasm::WASM_TYPE_I64:
+ encodeSLEB128(0, W.OS);
+ break;
+ case wasm::WASM_TYPE_F32:
+ writeI32(0);
+ break;
+ case wasm::WASM_TYPE_F64:
+ writeI64(0);
+ break;
+ case wasm::WASM_TYPE_EXTERNREF:
+ writeValueType(wasm::ValType::EXTERNREF);
+ break;
+ default:
+ llvm_unreachable("unexpected type");
+ }
+ W.OS << char(wasm::WASM_OPCODE_END);
+ }
+
+ endSection(Section);
+}
+
void WasmObjectWriter::writeExportSection(ArrayRef<wasm::WasmExport> Exports) {
if (Exports.empty())
return;
@@ -819,15 +891,14 @@ void WasmObjectWriter::writeDataCountSection() {
endSection(Section);
}
-void WasmObjectWriter::writeCodeSection(const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- ArrayRef<WasmFunction> Functions) {
+uint32_t WasmObjectWriter::writeCodeSection(const MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ ArrayRef<WasmFunction> Functions) {
if (Functions.empty())
- return;
+ return 0;
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_CODE);
- CodeSectionIndex = Section.Index;
encodeULEB128(Functions.size(), W.OS);
@@ -844,18 +915,18 @@ void WasmObjectWriter::writeCodeSection(const MCAssembler &Asm,
}
// Apply fixups.
- applyRelocations(CodeRelocations, Section.ContentsOffset);
+ applyRelocations(CodeRelocations, Section.ContentsOffset, Layout);
endSection(Section);
+ return Section.Index;
}
-void WasmObjectWriter::writeDataSection() {
+uint32_t WasmObjectWriter::writeDataSection(const MCAsmLayout &Layout) {
if (DataSegments.empty())
- return;
+ return 0;
SectionBookkeeping Section;
startSection(Section, wasm::WASM_SEC_DATA);
- DataSectionIndex = Section.Index;
encodeULEB128(DataSegments.size(), W.OS); // count
@@ -864,7 +935,9 @@ void WasmObjectWriter::writeDataSection() {
if (Segment.InitFlags & wasm::WASM_SEGMENT_HAS_MEMINDEX)
encodeULEB128(0, W.OS); // memory index
if ((Segment.InitFlags & wasm::WASM_SEGMENT_IS_PASSIVE) == 0) {
- W.OS << char(wasm::WASM_OPCODE_I32_CONST);
+ W.OS << char(Segment.Offset > std::numeric_limits<int32_t>().max()
+ ? wasm::WASM_OPCODE_I64_CONST
+ : wasm::WASM_OPCODE_I32_CONST);
encodeSLEB128(Segment.Offset, W.OS); // offset
W.OS << char(wasm::WASM_OPCODE_END);
}
@@ -874,9 +947,10 @@ void WasmObjectWriter::writeDataSection() {
}
// Apply fixups.
- applyRelocations(DataRelocations, Section.ContentsOffset);
+ applyRelocations(DataRelocations, Section.ContentsOffset, Layout);
endSection(Section);
+ return Section.Index;
}
void WasmObjectWriter::writeRelocSection(
@@ -1027,7 +1101,7 @@ void WasmObjectWriter::writeCustomSection(WasmCustomSection &CustomSection,
// Apply fixups.
auto &Relocations = CustomSectionsRelocations[CustomSection.Section];
- applyRelocations(Relocations, CustomSection.OutputContentsOffset);
+ applyRelocations(Relocations, CustomSection.OutputContentsOffset, Layout);
}
uint32_t WasmObjectWriter::getFunctionType(const MCSymbolWasm &Symbol) {
@@ -1046,8 +1120,8 @@ void WasmObjectWriter::registerFunctionType(const MCSymbolWasm &Symbol) {
assert(Symbol.isFunction());
WasmSignature S;
- const MCSymbolWasm *ResolvedSym = resolveSymbol(Symbol);
- if (auto *Sig = ResolvedSym->getSignature()) {
+
+ if (auto *Sig = Symbol.getSignature()) {
S.Returns = Sig->Returns;
S.Params = Sig->Params;
}
@@ -1084,16 +1158,13 @@ void WasmObjectWriter::registerEventType(const MCSymbolWasm &Symbol) {
}
static bool isInSymtab(const MCSymbolWasm &Sym) {
- if (Sym.isUsedInReloc())
+ if (Sym.isUsedInReloc() || Sym.isUsedInInitArray())
return true;
if (Sym.isComdat() && !Sym.isDefined())
return false;
- if (Sym.isTemporary() && Sym.getName().empty())
- return false;
-
- if (Sym.isTemporary() && Sym.isData() && !Sym.getSize())
+ if (Sym.isTemporary())
return false;
if (Sym.isSection())
@@ -1114,10 +1185,11 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
SmallVector<wasm::WasmImport, 4> Imports;
SmallVector<wasm::WasmExport, 4> Exports;
SmallVector<wasm::WasmEventType, 1> Events;
+ SmallVector<wasm::WasmGlobal, 1> Globals;
SmallVector<wasm::WasmSymbolInfo, 4> SymbolInfos;
SmallVector<std::pair<uint16_t, uint32_t>, 2> InitFuncs;
std::map<StringRef, std::vector<WasmComdatEntry>> Comdats;
- uint32_t DataSize = 0;
+ uint64_t DataSize = 0;
// For now, always emit the memory import, since loads and stores are not
// valid without it. In the future, we could perhaps be more clever and omit
@@ -1126,6 +1198,8 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
MemImport.Module = "env";
MemImport.Field = "__linear_memory";
MemImport.Kind = wasm::WASM_EXTERNAL_MEMORY;
+ MemImport.Memory.Flags = is64Bit() ? wasm::WASM_LIMITS_FLAG_IS_64
+ : wasm::WASM_LIMITS_FLAG_NONE;
Imports.push_back(MemImport);
// For now, always emit the table section, since indirect calls are not
@@ -1146,8 +1220,10 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
// Register types for all functions, including those with private linkage
// (because wasm always needs a type signature).
- if (WS.isFunction())
- registerFunctionType(WS);
+ if (WS.isFunction()) {
+ const MCSymbolWasm *Base = cast<MCSymbolWasm>(Layout.getBaseSymbol(S));
+ registerFunctionType(*Base);
+ }
if (WS.isEvent())
registerEventType(WS);
@@ -1217,7 +1293,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
// populating DataLocations.
for (MCSection &Sec : Asm) {
auto &Section = static_cast<MCSectionWasm &>(Sec);
- StringRef SectionName = Section.getSectionName();
+ StringRef SectionName = Section.getName();
// .init_array sections are handled specially elsewhere.
if (SectionName.startswith(".init_array"))
@@ -1365,30 +1441,53 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
// For each data symbol, export it in the symtab as a reference to the
// corresponding Wasm data segment.
wasm::WasmDataReference Ref = wasm::WasmDataReference{
- DataSection.getSegmentIndex(),
- static_cast<uint32_t>(Layout.getSymbolOffset(WS)),
- static_cast<uint32_t>(Size)};
+ DataSection.getSegmentIndex(), Layout.getSymbolOffset(WS),
+ static_cast<uint64_t>(Size)};
DataLocations[&WS] = Ref;
LLVM_DEBUG(dbgs() << " -> segment index: " << Ref.Segment << "\n");
} else if (WS.isGlobal()) {
// A "true" Wasm global (currently just __stack_pointer)
- if (WS.isDefined())
- report_fatal_error("don't yet support defined globals");
-
- // An import; the index was assigned above
- LLVM_DEBUG(dbgs() << " -> global index: "
- << WasmIndices.find(&WS)->second << "\n");
-
+ if (WS.isDefined()) {
+ assert(WasmIndices.count(&WS) == 0);
+ wasm::WasmGlobal Global;
+ Global.Type = WS.getGlobalType();
+ Global.Index = NumGlobalImports + Globals.size();
+ switch (Global.Type.Type) {
+ case wasm::WASM_TYPE_I32:
+ Global.InitExpr.Opcode = wasm::WASM_OPCODE_I32_CONST;
+ break;
+ case wasm::WASM_TYPE_I64:
+ Global.InitExpr.Opcode = wasm::WASM_OPCODE_I64_CONST;
+ break;
+ case wasm::WASM_TYPE_F32:
+ Global.InitExpr.Opcode = wasm::WASM_OPCODE_F32_CONST;
+ break;
+ case wasm::WASM_TYPE_F64:
+ Global.InitExpr.Opcode = wasm::WASM_OPCODE_F64_CONST;
+ break;
+ case wasm::WASM_TYPE_EXTERNREF:
+ Global.InitExpr.Opcode = wasm::WASM_OPCODE_REF_NULL;
+ break;
+ default:
+ llvm_unreachable("unexpected type");
+ }
+ WasmIndices[&WS] = Global.Index;
+ Globals.push_back(Global);
+ } else {
+ // An import; the index was assigned above
+ LLVM_DEBUG(dbgs() << " -> global index: "
+ << WasmIndices.find(&WS)->second << "\n");
+ }
} else if (WS.isEvent()) {
// C++ exception symbol (__cpp_exception)
unsigned Index;
if (WS.isDefined()) {
+ assert(WasmIndices.count(&WS) == 0);
Index = NumEventImports + Events.size();
wasm::WasmEventType Event;
Event.SigIndex = getEventType(WS);
Event.Attribute = wasm::WASM_EVENT_ATTRIBUTE_EXCEPTION;
- assert(WasmIndices.count(&WS) == 0);
WasmIndices[&WS] = Index;
Events.push_back(Event);
} else {
@@ -1413,22 +1512,36 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
assert(S.isDefined());
+ const MCSymbolWasm *Base = cast<MCSymbolWasm>(Layout.getBaseSymbol(S));
+
// Find the target symbol of this weak alias and export that index
const auto &WS = static_cast<const MCSymbolWasm &>(S);
- const MCSymbolWasm *ResolvedSym = resolveSymbol(WS);
- LLVM_DEBUG(dbgs() << WS.getName() << ": weak alias of '" << *ResolvedSym
- << "'\n");
+ LLVM_DEBUG(dbgs() << WS.getName() << ": weak alias of '" << *Base << "'\n");
- if (ResolvedSym->isFunction()) {
- assert(WasmIndices.count(ResolvedSym) > 0);
- uint32_t WasmIndex = WasmIndices.find(ResolvedSym)->second;
+ if (Base->isFunction()) {
+ assert(WasmIndices.count(Base) > 0);
+ uint32_t WasmIndex = WasmIndices.find(Base)->second;
assert(WasmIndices.count(&WS) == 0);
WasmIndices[&WS] = WasmIndex;
LLVM_DEBUG(dbgs() << " -> index:" << WasmIndex << "\n");
- } else if (ResolvedSym->isData()) {
- assert(DataLocations.count(ResolvedSym) > 0);
- const wasm::WasmDataReference &Ref =
- DataLocations.find(ResolvedSym)->second;
+ } else if (Base->isData()) {
+ auto &DataSection = static_cast<MCSectionWasm &>(WS.getSection());
+ uint64_t Offset = Layout.getSymbolOffset(S);
+ int64_t Size = 0;
+ // For a data symbol alias we use the size of the base symbol as the
+ // size of the alias. When an offset from the base is involved, this
+ // can result in offset + size going past the end of the data section,
+ // which our object format doesn't support, so we must clamp it.
+ if (!Base->getSize()->evaluateAsAbsolute(Size, Layout))
+ report_fatal_error(".size expression must be evaluatable");
+ const WasmDataSegment &Segment =
+ DataSegments[DataSection.getSegmentIndex()];
+ Size =
+ std::min(static_cast<uint64_t>(Size), Segment.Data.size() - Offset);
+ wasm::WasmDataReference Ref = wasm::WasmDataReference{
+ DataSection.getSegmentIndex(),
+ static_cast<uint32_t>(Layout.getSymbolOffset(S)),
+ static_cast<uint32_t>(Size)};
DataLocations[&WS] = Ref;
LLVM_DEBUG(dbgs() << " -> index:" << Ref.Segment << "\n");
} else {
@@ -1486,17 +1599,19 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
// purely to make the object file's provisional values readable, and is
// ignored by the linker, which re-calculates the relocations itself.
if (Rel.Type != wasm::R_WASM_TABLE_INDEX_I32 &&
- Rel.Type != wasm::R_WASM_TABLE_INDEX_SLEB)
+ Rel.Type != wasm::R_WASM_TABLE_INDEX_SLEB &&
+ Rel.Type != wasm::R_WASM_TABLE_INDEX_REL_SLEB)
return;
assert(Rel.Symbol->isFunction());
- const MCSymbolWasm &WS = *resolveSymbol(*Rel.Symbol);
- uint32_t FunctionIndex = WasmIndices.find(&WS)->second;
+ const MCSymbolWasm *Base =
+ cast<MCSymbolWasm>(Layout.getBaseSymbol(*Rel.Symbol));
+ uint32_t FunctionIndex = WasmIndices.find(Base)->second;
uint32_t TableIndex = TableElems.size() + InitialTableOffset;
- if (TableIndices.try_emplace(&WS, TableIndex).second) {
- LLVM_DEBUG(dbgs() << " -> adding " << WS.getName()
+ if (TableIndices.try_emplace(Base, TableIndex).second) {
+ LLVM_DEBUG(dbgs() << " -> adding " << Base->getName()
<< " to table: " << TableIndex << "\n");
TableElems.push_back(FunctionIndex);
- registerFunctionType(WS);
+ registerFunctionType(*Base);
}
};
@@ -1509,9 +1624,9 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
// Translate .init_array section contents into start functions.
for (const MCSection &S : Asm) {
const auto &WS = static_cast<const MCSectionWasm &>(S);
- if (WS.getSectionName().startswith(".fini_array"))
+ if (WS.getName().startswith(".fini_array"))
report_fatal_error(".fini_array sections are unsupported");
- if (!WS.getSectionName().startswith(".init_array"))
+ if (!WS.getName().startswith(".init_array"))
continue;
if (WS.getFragmentList().empty())
continue;
@@ -1538,13 +1653,11 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
uint16_t Priority = UINT16_MAX;
unsigned PrefixLength = strlen(".init_array");
- if (WS.getSectionName().size() > PrefixLength) {
- if (WS.getSectionName()[PrefixLength] != '.')
+ if (WS.getName().size() > PrefixLength) {
+ if (WS.getName()[PrefixLength] != '.')
report_fatal_error(
".init_array section priority should start with '.'");
- if (WS.getSectionName()
- .substr(PrefixLength + 1)
- .getAsInteger(10, Priority))
+ if (WS.getName().substr(PrefixLength + 1).getAsInteger(10, Priority))
report_fatal_error("invalid .init_array section priority");
}
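As a rough illustration of the naming convention parsed above (the helper below is hypothetical, not part of the patch): a section named .init_array.42 yields priority 42, while a bare .init_array keeps the default priority of UINT16_MAX.

#include <cstdint>
#include <string>

// Parse an optional ".<priority>" suffix after ".init_array". Returns false
// for a malformed suffix; a bare ".init_array" keeps the default UINT16_MAX.
bool parseInitArrayPriority(const std::string &Name, uint16_t &Priority) {
  const std::string Prefix = ".init_array";
  Priority = UINT16_MAX;
  if (Name.size() <= Prefix.size())
    return true;
  if (Name[Prefix.size()] != '.')
    return false;
  Priority = static_cast<uint16_t>(std::stoul(Name.substr(Prefix.size() + 1)));
  return true;
}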
const auto &DataFrag = cast<MCDataFragment>(Frag);
@@ -1565,7 +1678,7 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
report_fatal_error("fixups in .init_array should be symbol references");
const auto &TargetSym = cast<const MCSymbolWasm>(SymRef->getSymbol());
if (TargetSym.getIndex() == InvalidIndex)
- report_fatal_error("symbols in .init_array should exist in symbtab");
+ report_fatal_error("symbols in .init_array should exist in symtab");
if (!TargetSym.isFunction())
report_fatal_error("symbols in .init_array should be for functions");
InitFuncs.push_back(
@@ -1582,11 +1695,12 @@ uint64_t WasmObjectWriter::writeObject(MCAssembler &Asm,
// Skip the "table" section; we import the table instead.
// Skip the "memory" section; we import the memory instead.
writeEventSection(Events);
+ writeGlobalSection(Globals);
writeExportSection(Exports);
writeElemSection(TableElems);
writeDataCountSection();
- writeCodeSection(Asm, Layout, Functions);
- writeDataSection();
+ uint32_t CodeSectionIndex = writeCodeSection(Asm, Layout, Functions);
+ uint32_t DataSectionIndex = writeDataSection(Layout);
for (auto &CustomSection : CustomSections)
writeCustomSection(CustomSection, Asm, Layout);
writeLinkingMetaDataSection(SymbolInfos, InitFuncs, Comdats);
diff --git a/llvm/lib/MC/WinCOFFObjectWriter.cpp b/llvm/lib/MC/WinCOFFObjectWriter.cpp
index 749ed8badfaa..4796ef531054 100644
--- a/llvm/lib/MC/WinCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/WinCOFFObjectWriter.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
@@ -33,7 +34,7 @@
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Support/CRC.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/Endian.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
@@ -118,7 +119,7 @@ public:
COFFSymbol *Symbol = nullptr;
relocations Relocations;
- COFFSection(StringRef Name) : Name(Name) {}
+ COFFSection(StringRef Name) : Name(std::string(Name)) {}
};
class WinCOFFObjectWriter : public MCObjectWriter {
@@ -131,6 +132,8 @@ public:
using symbol_map = DenseMap<MCSymbol const *, COFFSymbol *>;
using section_map = DenseMap<MCSection const *, COFFSection *>;
+ using symbol_list = DenseSet<COFFSymbol *>;
+
std::unique_ptr<MCWinCOFFObjectTargetWriter> TargetObjectWriter;
// Root level file contents.
@@ -143,12 +146,16 @@ public:
section_map SectionMap;
symbol_map SymbolMap;
+ symbol_list WeakDefaults;
+
bool UseBigObj;
bool EmitAddrsigSection = false;
MCSectionCOFF *AddrsigSection;
std::vector<const MCSymbol *> AddrsigSyms;
+ MCSectionCOFF *CGProfileSection = nullptr;
+
WinCOFFObjectWriter(std::unique_ptr<MCWinCOFFObjectTargetWriter> MOTW,
raw_pwrite_stream &OS);
@@ -205,6 +212,7 @@ public:
MCValue Target, uint64_t &FixedValue) override;
void createFileSymbols(MCAssembler &Asm);
+ void setWeakDefaultNames();
void assignSectionNumbers();
void assignFileOffsets(MCAssembler &Asm, const MCAsmLayout &Layout);
@@ -292,8 +300,8 @@ static uint32_t getAlignment(const MCSectionCOFF &Sec) {
/// This function takes a section data object from the assembler
/// and creates the associated COFF section staging object.
void WinCOFFObjectWriter::defineSection(const MCSectionCOFF &MCSec) {
- COFFSection *Section = createSection(MCSec.getSectionName());
- COFFSymbol *Symbol = createSymbol(MCSec.getSectionName());
+ COFFSection *Section = createSection(MCSec.getName());
+ COFFSymbol *Symbol = createSymbol(MCSec.getName());
Section->Symbol = Symbol;
Symbol->Section = Section;
Symbol->Data.StorageClass = COFF::IMAGE_SYM_CLASS_STATIC;
@@ -376,6 +384,7 @@ void WinCOFFObjectWriter::DefineSymbol(const MCSymbol &MCSym,
WeakDefault->Data.SectionNumber = COFF::IMAGE_SYM_ABSOLUTE;
else
WeakDefault->Section = Sec;
+ WeakDefaults.insert(WeakDefault);
Local = WeakDefault;
}
@@ -667,6 +676,13 @@ void WinCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
Asm.registerSection(*AddrsigSection);
}
+ if (!Asm.CGProfile.empty()) {
+ CGProfileSection = Asm.getContext().getCOFFSection(
+ ".llvm.call-graph-profile", COFF::IMAGE_SCN_LNK_REMOVE,
+ SectionKind::getMetadata());
+ Asm.registerSection(*CGProfileSection);
+ }
+
// "Define" each section & symbol. This creates section & symbol
// entries in the staging area.
for (const auto &Section : Asm)
@@ -863,6 +879,47 @@ void WinCOFFObjectWriter::createFileSymbols(MCAssembler &Asm) {
}
}
+void WinCOFFObjectWriter::setWeakDefaultNames() {
+ if (WeakDefaults.empty())
+ return;
+
+  // If multiple object files use a weak symbol (either with a regular
+  // defined default, or an absolute zero symbol as default), the defaults
+  // cause duplicate definitions unless their names are made unique. Look
+  // for a defined external symbol that isn't comdat; that should be unique
+  // unless there are other duplicate definitions. If none is found, allow
+  // picking a comdat symbol, as that's still better than nothing.
+
+ COFFSymbol *Unique = nullptr;
+ for (bool AllowComdat : {false, true}) {
+ for (auto &Sym : Symbols) {
+ // Don't include the names of the defaults themselves
+ if (WeakDefaults.count(Sym.get()))
+ continue;
+ // Only consider external symbols
+ if (Sym->Data.StorageClass != COFF::IMAGE_SYM_CLASS_EXTERNAL)
+ continue;
+ // Only consider symbols defined in a section or that are absolute
+ if (!Sym->Section && Sym->Data.SectionNumber != COFF::IMAGE_SYM_ABSOLUTE)
+ continue;
+ if (!AllowComdat && Sym->Section &&
+ Sym->Section->Header.Characteristics & COFF::IMAGE_SCN_LNK_COMDAT)
+ continue;
+ Unique = Sym.get();
+ break;
+ }
+ if (Unique)
+ break;
+ }
+ // If we didn't find any unique symbol to use for the names, just skip this.
+ if (!Unique)
+ return;
+ for (auto *Sym : WeakDefaults) {
+ Sym->Name.append(".");
+ Sym->Name.append(Unique->Name);
+ }
+}
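A minimal sketch of the renaming scheme the comment above describes; the function and the example names are made up, not the writer's API. Each weak default gets the chosen unique external symbol's name appended, so defaults coming from different object files no longer collide.

#include <string>
#include <vector>

// Append a unique external symbol's name to every weak-default name; for
// example, a default named ".weak.foo.default" might become
// ".weak.foo.default.main" when "main" is the chosen unique symbol.
void renameWeakDefaults(std::vector<std::string> &WeakDefaultNames,
                        const std::string &UniqueExternalName) {
  for (std::string &Name : WeakDefaultNames)
    Name += "." + UniqueExternalName;
}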
+
static bool isAssociative(const COFFSection &Section) {
return Section.Symbol->Aux[0].Aux.SectionDefinition.Selection ==
COFF::IMAGE_COMDAT_SELECT_ASSOCIATIVE;
@@ -961,6 +1018,7 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
Header.NumberOfSections = Sections.size();
Header.NumberOfSymbols = 0;
+ setWeakDefaultNames();
assignSectionNumbers();
createFileSymbols(Asm);
@@ -1014,7 +1072,7 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
// without a section.
if (!AssocMCSym->isInSection()) {
Asm.getContext().reportError(
- SMLoc(), Twine("cannot make section ") + MCSec.getSectionName() +
+ SMLoc(), Twine("cannot make section ") + MCSec.getName() +
Twine(" associative with sectionless symbol ") +
AssocMCSym->getName());
continue;
@@ -1050,6 +1108,20 @@ uint64_t WinCOFFObjectWriter::writeObject(MCAssembler &Asm,
}
}
+ // Create the contents of the .llvm.call-graph-profile section.
+ if (CGProfileSection) {
+ auto *Frag = new MCDataFragment(CGProfileSection);
+ Frag->setLayoutOrder(0);
+ raw_svector_ostream OS(Frag->getContents());
+ for (const MCAssembler::CGProfileEntry &CGPE : Asm.CGProfile) {
+ uint32_t FromIndex = CGPE.From->getSymbol().getIndex();
+ uint32_t ToIndex = CGPE.To->getSymbol().getIndex();
+ support::endian::write(OS, FromIndex, W.Endian);
+ support::endian::write(OS, ToIndex, W.Endian);
+ support::endian::write(OS, CGPE.Count, W.Endian);
+ }
+ }
+
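Assuming CGProfileEntry::Count is a 64-bit value, as the writes above suggest, each .llvm.call-graph-profile record is 16 bytes: two 32-bit symbol table indices followed by a 64-bit count, emitted in the target's byte order. A small standalone sketch of that layout (little-endian only, hypothetical names):

#include <cstdint>
#include <vector>

// Serialize one call-graph-profile record as little-endian bytes:
// uint32 FromIndex, uint32 ToIndex, uint64 Count.
void appendCGProfileRecord(std::vector<uint8_t> &Out, uint32_t FromIndex,
                           uint32_t ToIndex, uint64_t Count) {
  auto AppendLE = [&Out](uint64_t V, unsigned NumBytes) {
    for (unsigned I = 0; I < NumBytes; ++I)
      Out.push_back(static_cast<uint8_t>(V >> (8 * I)));
  };
  AppendLE(FromIndex, 4);
  AppendLE(ToIndex, 4);
  AppendLE(Count, 8);
}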
assignFileOffsets(Asm, Layout);
// MS LINK expects to be able to use this timestamp to implement their
diff --git a/llvm/lib/MC/XCOFFObjectWriter.cpp b/llvm/lib/MC/XCOFFObjectWriter.cpp
index e584c6222a5a..0dabdc9777d6 100644
--- a/llvm/lib/MC/XCOFFObjectWriter.cpp
+++ b/llvm/lib/MC/XCOFFObjectWriter.cpp
@@ -11,14 +11,18 @@
//===----------------------------------------------------------------------===//
#include "llvm/BinaryFormat/XCOFF.h"
+#include "llvm/MC/MCAsmBackend.h"
#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSectionXCOFF.h"
#include "llvm/MC/MCSymbolXCOFF.h"
#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCXCOFFObjectWriter.h"
#include "llvm/MC/StringTableBuilder.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/MathExtras.h"
@@ -49,6 +53,13 @@ constexpr int16_t MaxSectionIndex = INT16_MAX;
// Packs the csect's alignment and type into a byte.
uint8_t getEncodedType(const MCSectionXCOFF *);
+struct XCOFFRelocation {
+ uint32_t SymbolTableIndex;
+ uint32_t FixupOffsetInCsect;
+ uint8_t SignAndSize;
+ uint8_t Type;
+};
+
// Wrapper around an MCSymbolXCOFF.
struct Symbol {
const MCSymbolXCOFF *const MCSym;
@@ -57,7 +68,7 @@ struct Symbol {
XCOFF::StorageClass getStorageClass() const {
return MCSym->getStorageClass();
}
- StringRef getName() const { return MCSym->getName(); }
+ StringRef getSymbolTableName() const { return MCSym->getSymbolTableName(); }
Symbol(const MCSymbolXCOFF *MCSym) : MCSym(MCSym), SymbolTableIndex(-1) {}
};
@@ -69,7 +80,8 @@ struct ControlSection {
uint32_t Size;
SmallVector<Symbol, 1> Syms;
- StringRef getName() const { return MCCsect->getSectionName(); }
+ SmallVector<XCOFFRelocation, 1> Relocations;
+ StringRef getSymbolTableName() const { return MCCsect->getSymbolTableName(); }
ControlSection(const MCSectionXCOFF *MCSec)
: MCCsect(MCSec), SymbolTableIndex(-1), Address(-1), Size(0) {}
};
@@ -79,7 +91,6 @@ struct ControlSection {
// with a storage mapping class of `xmc_pr` will get placed into the same
// container.
using CsectGroup = std::deque<ControlSection>;
-
using CsectGroups = std::deque<CsectGroup *>;
// Represents the data related to a section excluding the csects that make up
@@ -141,11 +152,21 @@ class XCOFFObjectWriter : public MCObjectWriter {
uint32_t SymbolTableEntryCount = 0;
uint32_t SymbolTableOffset = 0;
uint16_t SectionCount = 0;
+ uint32_t RelocationEntryOffset = 0;
support::endian::Writer W;
std::unique_ptr<MCXCOFFObjectTargetWriter> TargetObjectWriter;
StringTableBuilder Strings;
+ // Maps the MCSection representation to its corresponding ControlSection
+ // wrapper. Needed for finding the ControlSection to insert an MCSymbol into
+ // from its containing MCSectionXCOFF.
+ DenseMap<const MCSectionXCOFF *, ControlSection *> SectionMap;
+
+  // Maps the MCSymbol representation to its corresponding symbol table index.
+ // Needed for relocation.
+ DenseMap<const MCSymbol *, uint32_t> SymbolIndexMap;
+
// CsectGroups. These store the csects which make up different parts of
// the sections. Should have one for each set of csects that get mapped into
// the same section and get handled in a 'similar' way.
@@ -188,6 +209,8 @@ class XCOFFObjectWriter : public MCObjectWriter {
void writeSectionHeaderTable();
void writeSections(const MCAssembler &Asm, const MCAsmLayout &Layout);
void writeSymbolTable(const MCAsmLayout &Layout);
+ void writeRelocations();
+ void writeRelocation(XCOFFRelocation Reloc, const ControlSection &CSection);
// Called after all the csects and symbols have been processed by
// `executePostLayoutBinding`, this function handles building up the majority
@@ -198,6 +221,7 @@ class XCOFFObjectWriter : public MCObjectWriter {
// *) Builds up the section header table by adding any non-empty sections to
// `Sections`.
void assignAddressesAndIndices(const MCAsmLayout &);
+ void finalizeSectionInfo();
bool
needsAuxiliaryHeader() const { /* TODO aux header support not implemented. */
@@ -228,16 +252,20 @@ XCOFFObjectWriter::XCOFFObjectWriter(
CsectGroups{&BSSCsects}) {}
void XCOFFObjectWriter::reset() {
- UndefinedCsects.clear();
+ // Clear the mappings we created.
+ SymbolIndexMap.clear();
+ SectionMap.clear();
+ UndefinedCsects.clear();
// Reset any sections we have written to, and empty the section header table.
for (auto *Sec : Sections)
Sec->reset();
- // Reset the symbol table and string table.
+ // Reset states in XCOFFObjectWriter.
SymbolTableEntryCount = 0;
SymbolTableOffset = 0;
SectionCount = 0;
+ RelocationEntryOffset = 0;
Strings.clear();
MCObjectWriter::reset();
@@ -286,31 +314,32 @@ CsectGroup &XCOFFObjectWriter::getCsectGroup(const MCSectionXCOFF *MCSec) {
}
}
+static MCSectionXCOFF *getContainingCsect(const MCSymbolXCOFF *XSym) {
+ if (XSym->isDefined())
+ return cast<MCSectionXCOFF>(XSym->getFragment()->getParent());
+ return XSym->getRepresentedCsect();
+}
+
void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
const MCAsmLayout &Layout) {
if (TargetObjectWriter->is64Bit())
report_fatal_error("64-bit XCOFF object files are not supported yet.");
- // Maps the MC Section representation to its corresponding ControlSection
- // wrapper. Needed for finding the ControlSection to insert an MCSymbol into
- // from its containing MCSectionXCOFF.
- DenseMap<const MCSectionXCOFF *, ControlSection *> WrapperMap;
-
for (const auto &S : Asm) {
const auto *MCSec = cast<const MCSectionXCOFF>(&S);
- assert(WrapperMap.find(MCSec) == WrapperMap.end() &&
+ assert(SectionMap.find(MCSec) == SectionMap.end() &&
"Cannot add a csect twice.");
assert(XCOFF::XTY_ER != MCSec->getCSectType() &&
"An undefined csect should not get registered.");
// If the name does not fit in the storage provided in the symbol table
// entry, add it to the string table.
- if (nameShouldBeInStringTable(MCSec->getSectionName()))
- Strings.add(MCSec->getSectionName());
+ if (nameShouldBeInStringTable(MCSec->getSymbolTableName()))
+ Strings.add(MCSec->getSymbolTableName());
CsectGroup &Group = getCsectGroup(MCSec);
Group.emplace_back(MCSec);
- WrapperMap[MCSec] = &Group.back();
+ SectionMap[MCSec] = &Group.back();
}
for (const MCSymbol &S : Asm.symbols()) {
@@ -319,11 +348,14 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
continue;
const MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(&S);
- const MCSectionXCOFF *ContainingCsect = XSym->getContainingCsect();
+ const MCSectionXCOFF *ContainingCsect = getContainingCsect(XSym);
- // Handle undefined symbol.
if (ContainingCsect->getCSectType() == XCOFF::XTY_ER) {
+ // Handle undefined symbol.
UndefinedCsects.emplace_back(ContainingCsect);
+ SectionMap[ContainingCsect] = &UndefinedCsects.back();
+ if (nameShouldBeInStringTable(ContainingCsect->getSymbolTableName()))
+ Strings.add(ContainingCsect->getSymbolTableName());
continue;
}
@@ -332,26 +364,112 @@ void XCOFFObjectWriter::executePostLayoutBinding(MCAssembler &Asm,
if (XSym == ContainingCsect->getQualNameSymbol())
continue;
- assert(WrapperMap.find(ContainingCsect) != WrapperMap.end() &&
- "Expected containing csect to exist in map");
+ // Only put a label into the symbol table when it is an external label.
+ if (!XSym->isExternal())
+ continue;
+ assert(SectionMap.find(ContainingCsect) != SectionMap.end() &&
+ "Expected containing csect to exist in map");
// Lookup the containing csect and add the symbol to it.
- WrapperMap[ContainingCsect]->Syms.emplace_back(XSym);
+ SectionMap[ContainingCsect]->Syms.emplace_back(XSym);
// If the name does not fit in the storage provided in the symbol table
// entry, add it to the string table.
- if (nameShouldBeInStringTable(XSym->getName()))
- Strings.add(XSym->getName());
- }
+ if (nameShouldBeInStringTable(XSym->getSymbolTableName()))
+ Strings.add(XSym->getSymbolTableName());
+ }
Strings.finalize();
assignAddressesAndIndices(Layout);
}
-void XCOFFObjectWriter::recordRelocation(MCAssembler &, const MCAsmLayout &,
- const MCFragment *, const MCFixup &,
- MCValue, uint64_t &) {
- // TODO: recordRelocation is not yet implemented.
+void XCOFFObjectWriter::recordRelocation(MCAssembler &Asm,
+ const MCAsmLayout &Layout,
+ const MCFragment *Fragment,
+ const MCFixup &Fixup, MCValue Target,
+ uint64_t &FixedValue) {
+ auto getIndex = [this](const MCSymbol *Sym,
+ const MCSectionXCOFF *ContainingCsect) {
+ // If we could not find the symbol directly in SymbolIndexMap, this symbol
+ // could either be a temporary symbol or an undefined symbol. In this case,
+ // we would need to have the relocation reference its csect instead.
+ return SymbolIndexMap.find(Sym) != SymbolIndexMap.end()
+ ? SymbolIndexMap[Sym]
+ : SymbolIndexMap[ContainingCsect->getQualNameSymbol()];
+ };
+
+ auto getVirtualAddress = [this,
+ &Layout](const MCSymbol *Sym,
+ const MCSectionXCOFF *ContainingCsect) {
+    // If Sym is a csect, return the csect's address.
+    // If Sym is a label, return the csect's address plus the label's offset
+    // from the csect.
+ return SectionMap[ContainingCsect]->Address +
+ (Sym->isDefined() ? Layout.getSymbolOffset(*Sym) : 0);
+ };
+
+ const MCSymbol *const SymA = &Target.getSymA()->getSymbol();
+
+ MCAsmBackend &Backend = Asm.getBackend();
+ bool IsPCRel = Backend.getFixupKindInfo(Fixup.getKind()).Flags &
+ MCFixupKindInfo::FKF_IsPCRel;
+
+ uint8_t Type;
+ uint8_t SignAndSize;
+ std::tie(Type, SignAndSize) =
+ TargetObjectWriter->getRelocTypeAndSignSize(Target, Fixup, IsPCRel);
+
+ const MCSectionXCOFF *SymASec = getContainingCsect(cast<MCSymbolXCOFF>(SymA));
+ assert(SectionMap.find(SymASec) != SectionMap.end() &&
+ "Expected containing csect to exist in map.");
+
+ const uint32_t Index = getIndex(SymA, SymASec);
+ if (Type == XCOFF::RelocationType::R_POS)
+    // The FixedValue should be the symbol's virtual address in this object
+    // file plus any constant value that we might get.
+ FixedValue = getVirtualAddress(SymA, SymASec) + Target.getConstant();
+ else if (Type == XCOFF::RelocationType::R_TOC)
+ // The FixedValue should be the TC entry offset from TOC-base.
+ FixedValue = SectionMap[SymASec]->Address - TOCCsects.front().Address;
+
+ assert(
+ (TargetObjectWriter->is64Bit() ||
+ Fixup.getOffset() <= UINT32_MAX - Layout.getFragmentOffset(Fragment)) &&
+ "Fragment offset + fixup offset is overflowed in 32-bit mode.");
+ uint32_t FixupOffsetInCsect =
+ Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
+
+ XCOFFRelocation Reloc = {Index, FixupOffsetInCsect, SignAndSize, Type};
+ MCSectionXCOFF *RelocationSec = cast<MCSectionXCOFF>(Fragment->getParent());
+ assert(SectionMap.find(RelocationSec) != SectionMap.end() &&
+ "Expected containing csect to exist in map.");
+ SectionMap[RelocationSec]->Relocations.push_back(Reloc);
+
+ if (!Target.getSymB())
+ return;
+
+ const MCSymbol *const SymB = &Target.getSymB()->getSymbol();
+ if (SymA == SymB)
+ report_fatal_error("relocation for opposite term is not yet supported");
+
+ const MCSectionXCOFF *SymBSec = getContainingCsect(cast<MCSymbolXCOFF>(SymB));
+ assert(SectionMap.find(SymBSec) != SectionMap.end() &&
+ "Expected containing csect to exist in map.");
+ if (SymASec == SymBSec)
+ report_fatal_error(
+ "relocation for paired relocatable term is not yet supported");
+
+ assert(Type == XCOFF::RelocationType::R_POS &&
+ "SymA must be R_POS here if it's not opposite term or paired "
+ "relocatable term.");
+ const uint32_t IndexB = getIndex(SymB, SymBSec);
+ // SymB must be R_NEG here, given the general form of Target(MCValue) is
+ // "SymbolA - SymbolB + imm64".
+ const uint8_t TypeB = XCOFF::RelocationType::R_NEG;
+ XCOFFRelocation RelocB = {IndexB, FixupOffsetInCsect, SignAndSize, TypeB};
+ SectionMap[RelocationSec]->Relocations.push_back(RelocB);
+  // We already folded "SymbolA + imm64" above when Type is R_POS for SymbolA;
+  // now we just need to fold "- SymbolB" here.
+ FixedValue -= getVirtualAddress(SymB, SymBSec);
}
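A compact sketch of how the "SymbolA - SymbolB + imm64" fold above works out; the helper name and addresses are illustrative only. The R_POS side contributes SymbolA's virtual address plus the constant, and the R_NEG side subtracts SymbolB's virtual address, giving the provisional value patched at the shared fixup offset.

#include <cstdint>

// Fold A - B + Constant into the value stored at the fixup location. The two
// relocation entries (R_POS for A, R_NEG for B) share the same offset so the
// linker can redo this computation with final addresses.
uint64_t foldPairedRelocation(uint64_t VirtualAddressA,
                              uint64_t VirtualAddressB, int64_t Constant) {
  uint64_t FixedValue = VirtualAddressA + Constant; // R_POS contribution
  FixedValue -= VirtualAddressB;                    // R_NEG contribution
  return FixedValue;
}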
void XCOFFObjectWriter::writeSections(const MCAssembler &Asm,
@@ -362,8 +480,14 @@ void XCOFFObjectWriter::writeSections(const MCAssembler &Asm,
if (Section->Index == Section::UninitializedIndex || Section->IsVirtual)
continue;
- assert(CurrentAddressLocation == Section->Address &&
- "Sections should be written consecutively.");
+ // There could be a gap (without corresponding zero padding) between
+ // sections.
+ assert(CurrentAddressLocation <= Section->Address &&
+ "CurrentAddressLocation should be less than or equal to section "
+ "address.");
+
+ CurrentAddressLocation = Section->Address;
+
for (const auto *Group : Section->Groups) {
for (const auto &Csect : *Group) {
if (uint32_t PaddingSize = Csect.Address - CurrentAddressLocation)
@@ -396,12 +520,13 @@ uint64_t XCOFFObjectWriter::writeObject(MCAssembler &Asm,
if (TargetObjectWriter->is64Bit())
report_fatal_error("64-bit XCOFF object files are not supported yet.");
+ finalizeSectionInfo();
uint64_t StartOffset = W.OS.tell();
writeFileHeader();
writeSectionHeaderTable();
writeSections(Asm, Layout);
- // TODO writeRelocations();
+ writeRelocations();
writeSymbolTable(Layout);
// Write the string table.
@@ -430,7 +555,7 @@ void XCOFFObjectWriter::writeSymbolTableEntryForCsectMemberLabel(
const Symbol &SymbolRef, const ControlSection &CSectionRef,
int16_t SectionIndex, uint64_t SymbolOffset) {
// Name or Zeros and string table offset
- writeSymbolName(SymbolRef.getName());
+ writeSymbolName(SymbolRef.getSymbolTableName());
assert(SymbolOffset <= UINT32_MAX - CSectionRef.Address &&
"Symbol address overflows.");
W.write<uint32_t>(CSectionRef.Address + SymbolOffset);
@@ -467,7 +592,7 @@ void XCOFFObjectWriter::writeSymbolTableEntryForControlSection(
const ControlSection &CSectionRef, int16_t SectionIndex,
XCOFF::StorageClass StorageClass) {
// n_name, n_zeros, n_offset
- writeSymbolName(CSectionRef.getName());
+ writeSymbolName(CSectionRef.getSymbolTableName());
// n_value
W.write<uint32_t>(CSectionRef.Address);
// n_scnum
@@ -536,19 +661,46 @@ void XCOFFObjectWriter::writeSectionHeaderTable() {
W.write<uint32_t>(Sec->Size);
W.write<uint32_t>(Sec->FileOffsetToData);
+ W.write<uint32_t>(Sec->FileOffsetToRelocations);
- // Relocation pointer and Lineno pointer. Not supported yet.
- W.write<uint32_t>(0);
+ // Line number pointer. Not supported yet.
W.write<uint32_t>(0);
- // Relocation and line-number counts. Not supported yet.
- W.write<uint16_t>(0);
+ W.write<uint16_t>(Sec->RelocationCount);
+
+ // Line number counts. Not supported yet.
W.write<uint16_t>(0);
W.write<int32_t>(Sec->Flags);
}
}
+void XCOFFObjectWriter::writeRelocation(XCOFFRelocation Reloc,
+ const ControlSection &CSection) {
+ W.write<uint32_t>(CSection.Address + Reloc.FixupOffsetInCsect);
+ W.write<uint32_t>(Reloc.SymbolTableIndex);
+ W.write<uint8_t>(Reloc.SignAndSize);
+ W.write<uint8_t>(Reloc.Type);
+}
+
+void XCOFFObjectWriter::writeRelocations() {
+ for (const auto *Section : Sections) {
+ if (Section->Index == Section::UninitializedIndex)
+ // Nothing to write for this Section.
+ continue;
+
+ for (const auto *Group : Section->Groups) {
+ if (Group->empty())
+ continue;
+
+ for (const auto &Csect : *Group) {
+ for (const auto Reloc : Csect.Relocations)
+ writeRelocation(Reloc, Csect);
+ }
+ }
+ }
+}
+
void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) {
for (const auto &Csect : UndefinedCsects) {
writeSymbolTableEntryForControlSection(
@@ -556,8 +708,8 @@ void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) {
}
for (const auto *Section : Sections) {
- // Nothing to write for this Section.
if (Section->Index == Section::UninitializedIndex)
+ // Nothing to write for this Section.
continue;
for (const auto *Group : Section->Groups) {
@@ -578,6 +730,49 @@ void XCOFFObjectWriter::writeSymbolTable(const MCAsmLayout &Layout) {
}
}
+void XCOFFObjectWriter::finalizeSectionInfo() {
+ for (auto *Section : Sections) {
+ if (Section->Index == Section::UninitializedIndex)
+ // Nothing to record for this Section.
+ continue;
+
+ for (const auto *Group : Section->Groups) {
+ if (Group->empty())
+ continue;
+
+ for (auto &Csect : *Group) {
+ const size_t CsectRelocCount = Csect.Relocations.size();
+ if (CsectRelocCount >= XCOFF::RelocOverflow ||
+ Section->RelocationCount >= XCOFF::RelocOverflow - CsectRelocCount)
+ report_fatal_error(
+ "relocation entries overflowed; overflow section is "
+ "not implemented yet");
+
+ Section->RelocationCount += CsectRelocCount;
+ }
+ }
+ }
+
+ // Calculate the file offset to the relocation entries.
+ uint64_t RawPointer = RelocationEntryOffset;
+ for (auto Sec : Sections) {
+ if (Sec->Index == Section::UninitializedIndex || !Sec->RelocationCount)
+ continue;
+
+ Sec->FileOffsetToRelocations = RawPointer;
+ const uint32_t RelocationSizeInSec =
+ Sec->RelocationCount * XCOFF::RelocationSerializationSize32;
+ RawPointer += RelocationSizeInSec;
+ if (RawPointer > UINT32_MAX)
+ report_fatal_error("Relocation data overflowed this object file.");
+ }
+
+  // TODO Error check that the number of symbol table entries fits in a
+  // signed 32-bit value ...
+ if (SymbolTableEntryCount)
+ SymbolTableOffset = RawPointer;
+}
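The file-offset bookkeeping above is plain running arithmetic; here is a small sketch with assumed inputs, using 10 bytes per entry as XCOFF::RelocationSerializationSize32 suggests for 32-bit relocation entries.

#include <cstdint>
#include <vector>

// Lay relocation blocks out back to back starting at RawPointer and return
// the offset just past them (where the symbol table would begin).
uint64_t assignRelocationOffsets(std::vector<uint32_t> &FileOffsets,
                                 const std::vector<uint16_t> &RelocCounts,
                                 uint64_t RawPointer,
                                 uint32_t EntrySize /* e.g. 10 */) {
  FileOffsets.assign(RelocCounts.size(), 0);
  for (size_t I = 0; I < RelocCounts.size(); ++I) {
    if (!RelocCounts[I])
      continue;
    FileOffsets[I] = static_cast<uint32_t>(RawPointer);
    RawPointer += static_cast<uint64_t>(RelocCounts[I]) * EntrySize;
  }
  return RawPointer;
}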
+
void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) {
// The first symbol table entry is for the file name. We are not emitting it
// yet, so start at index 0.
@@ -588,6 +783,7 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) {
Csect.Size = 0;
Csect.Address = 0;
Csect.SymbolTableIndex = SymbolTableIndex;
+ SymbolIndexMap[Csect.MCCsect->getQualNameSymbol()] = Csect.SymbolTableIndex;
// 1 main and 1 auxiliary symbol table entry for each contained symbol.
SymbolTableIndex += 2;
}
@@ -622,11 +818,13 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) {
Csect.Size = Layout.getSectionAddressSize(MCSec);
Address = Csect.Address + Csect.Size;
Csect.SymbolTableIndex = SymbolTableIndex;
+ SymbolIndexMap[MCSec->getQualNameSymbol()] = Csect.SymbolTableIndex;
// 1 main and 1 auxiliary symbol table entry for the csect.
SymbolTableIndex += 2;
-
+
for (auto &Sym : Csect.Syms) {
Sym.SymbolTableIndex = SymbolTableIndex;
+ SymbolIndexMap[Sym.MCSym] = Sym.SymbolTableIndex;
// 1 main and 1 auxiliary symbol table entry for each contained
// symbol.
SymbolTableIndex += 2;
@@ -656,14 +854,11 @@ void XCOFFObjectWriter::assignAddressesAndIndices(const MCAsmLayout &Layout) {
Sec->FileOffsetToData = RawPointer;
RawPointer += Sec->Size;
+ if (RawPointer > UINT32_MAX)
+ report_fatal_error("Section raw data overflowed this object file.");
}
- // TODO Add in Relocation storage to the RawPointer Calculation.
- // TODO What to align the SymbolTable to?
- // TODO Error check that the number of symbol table entries fits in 32-bits
- // signed ...
- if (SymbolTableEntryCount)
- SymbolTableOffset = RawPointer;
+ RelocationEntryOffset = RawPointer;
}
// Takes the log base 2 of the alignment and shifts the result into the 5 most
diff --git a/llvm/lib/MCA/CodeEmitter.cpp b/llvm/lib/MCA/CodeEmitter.cpp
index 294107219cb0..dcb92d253bae 100644
--- a/llvm/lib/MCA/CodeEmitter.cpp
+++ b/llvm/lib/MCA/CodeEmitter.cpp
@@ -25,7 +25,7 @@ CodeEmitter::getOrCreateEncodingInfo(unsigned MCID) {
const MCInst &Inst = Sequence[MCID];
MCInst Relaxed(Sequence[MCID]);
if (MAB.mayNeedRelaxation(Inst, STI))
- MAB.relaxInstruction(Inst, STI, Relaxed);
+ MAB.relaxInstruction(Relaxed, STI);
EI.first = Code.size();
MCE.encodeInstruction(Relaxed, VecOS, Fixups, STI);
diff --git a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
index 0ee084c7ce1a..e945e8cecce9 100644
--- a/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
+++ b/llvm/lib/MCA/HardwareUnits/LSUnit.cpp
@@ -77,9 +77,6 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
acquireSQSlot();
if (Desc.MayStore) {
- // Always create a new group for store operations.
-
- // A store may not pass a previous store or store barrier.
unsigned NewGID = createMemoryGroup();
MemoryGroup &NewGroup = getGroup(NewGID);
NewGroup.addInstruction();
@@ -91,16 +88,32 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
MemoryGroup &IDom = getGroup(ImmediateLoadDominator);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << ImmediateLoadDominator
<< ") --> (" << NewGID << ")\n");
- IDom.addSuccessor(&NewGroup);
+ IDom.addSuccessor(&NewGroup, !assumeNoAlias());
+ }
+
+ // A store may not pass a previous store barrier.
+ if (CurrentStoreBarrierGroupID) {
+ MemoryGroup &StoreGroup = getGroup(CurrentStoreBarrierGroupID);
+ LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
+ << CurrentStoreBarrierGroupID
+ << ") --> (" << NewGID << ")\n");
+ StoreGroup.addSuccessor(&NewGroup, true);
}
- if (CurrentStoreGroupID) {
+
+ // A store may not pass a previous store.
+ if (CurrentStoreGroupID &&
+ (CurrentStoreGroupID != CurrentStoreBarrierGroupID)) {
MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID
<< ") --> (" << NewGID << ")\n");
- StoreGroup.addSuccessor(&NewGroup);
+ StoreGroup.addSuccessor(&NewGroup, !assumeNoAlias());
}
+
CurrentStoreGroupID = NewGID;
+ if (IsMemBarrier)
+ CurrentStoreBarrierGroupID = NewGID;
+
if (Desc.MayLoad) {
CurrentLoadGroupID = NewGID;
if (IsMemBarrier)
@@ -112,31 +125,59 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
assert(Desc.MayLoad && "Expected a load!");
- // Always create a new memory group if this is the first load of the sequence.
+ unsigned ImmediateLoadDominator =
+ std::max(CurrentLoadGroupID, CurrentLoadBarrierGroupID);
+
+ // A new load group is created if we are in one of the following situations:
+ // 1) This is a load barrier (by construction, a load barrier is always
+ // assigned to a different memory group).
+ // 2) There is no load in flight (by construction we always keep loads and
+  //    stores in separate memory groups).
+ // 3) There is a load barrier in flight. This load depends on it.
+ // 4) There is an intervening store between the last load dispatched to the
+ // LSU and this load. We always create a new group even if this load
+ // does not alias the last dispatched store.
+ // 5) There is no intervening store and there is an active load group.
+  //    However, that group has already started execution, so we cannot add
+ // this load to it.
+ bool ShouldCreateANewGroup =
+ IsMemBarrier || !ImmediateLoadDominator ||
+ CurrentLoadBarrierGroupID == ImmediateLoadDominator ||
+ ImmediateLoadDominator <= CurrentStoreGroupID ||
+ getGroup(ImmediateLoadDominator).isExecuting();
- // A load may not pass a previous store unless flag 'NoAlias' is set.
- // A load may pass a previous load.
- // A younger load cannot pass a older load barrier.
- // A load barrier cannot pass a older load.
- bool ShouldCreateANewGroup = !CurrentLoadGroupID || IsMemBarrier ||
- CurrentLoadGroupID <= CurrentStoreGroupID ||
- CurrentLoadGroupID <= CurrentLoadBarrierGroupID;
if (ShouldCreateANewGroup) {
unsigned NewGID = createMemoryGroup();
MemoryGroup &NewGroup = getGroup(NewGID);
NewGroup.addInstruction();
+ // A load may not pass a previous store or store barrier
+ // unless flag 'NoAlias' is set.
if (!assumeNoAlias() && CurrentStoreGroupID) {
- MemoryGroup &StGroup = getGroup(CurrentStoreGroupID);
+ MemoryGroup &StoreGroup = getGroup(CurrentStoreGroupID);
LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentStoreGroupID
<< ") --> (" << NewGID << ")\n");
- StGroup.addSuccessor(&NewGroup);
+ StoreGroup.addSuccessor(&NewGroup, true);
}
- if (CurrentLoadBarrierGroupID) {
- MemoryGroup &LdGroup = getGroup(CurrentLoadBarrierGroupID);
- LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: (" << CurrentLoadBarrierGroupID
- << ") --> (" << NewGID << ")\n");
- LdGroup.addSuccessor(&NewGroup);
+
+ // A load barrier may not pass a previous load or load barrier.
+ if (IsMemBarrier) {
+ if (ImmediateLoadDominator) {
+ MemoryGroup &LoadGroup = getGroup(ImmediateLoadDominator);
+ LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
+ << ImmediateLoadDominator
+ << ") --> (" << NewGID << ")\n");
+ LoadGroup.addSuccessor(&NewGroup, true);
+ }
+ } else {
+      // A younger load cannot pass an older load barrier.
+ if (CurrentLoadBarrierGroupID) {
+ MemoryGroup &LoadGroup = getGroup(CurrentLoadBarrierGroupID);
+ LLVM_DEBUG(dbgs() << "[LSUnit]: GROUP DEP: ("
+ << CurrentLoadBarrierGroupID
+ << ") --> (" << NewGID << ")\n");
+ LoadGroup.addSuccessor(&NewGroup, true);
+ }
}
CurrentLoadGroupID = NewGID;
@@ -145,6 +186,7 @@ unsigned LSUnit::dispatch(const InstRef &IR) {
return NewGID;
}
+ // A load may pass a previous load.
MemoryGroup &Group = getGroup(CurrentLoadGroupID);
Group.addInstruction();
return CurrentLoadGroupID;
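The five numbered conditions in the comment above can be restated as one predicate; the sketch below uses assumed parameters, with a group ID of 0 meaning "no such group" and IDs growing in dispatch order, mirroring the code.

#include <algorithm>

// Returns true if an incoming load must start a new memory group.
bool shouldCreateNewLoadGroup(bool IsMemBarrier, unsigned LoadGroupID,
                              unsigned LoadBarrierGroupID,
                              unsigned StoreGroupID,
                              bool DominatorIsExecuting) {
  unsigned ImmediateLoadDominator = std::max(LoadGroupID, LoadBarrierGroupID);
  return IsMemBarrier ||                                 // 1) load barrier
         !ImmediateLoadDominator ||                      // 2) no load in flight
         LoadBarrierGroupID == ImmediateLoadDominator || // 3) barrier in flight
         ImmediateLoadDominator <= StoreGroupID ||       // 4) intervening store
         DominatorIsExecuting;                           // 5) group is executing
}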
diff --git a/llvm/lib/MCA/InstrBuilder.cpp b/llvm/lib/MCA/InstrBuilder.cpp
index c137f1da8a44..24e2a9d2f0ce 100644
--- a/llvm/lib/MCA/InstrBuilder.cpp
+++ b/llvm/lib/MCA/InstrBuilder.cpp
@@ -160,8 +160,11 @@ static void initializeUsedResources(InstrDesc &ID,
if (countPopulation(RPC.first) > 1 && !RPC.second.isReserved()) {
// Remove the leading 1 from the resource group mask.
uint64_t Mask = RPC.first ^ PowerOf2Floor(RPC.first);
- if ((Mask & UsedResourceUnits) == Mask)
+ uint64_t MaxResourceUnits = countPopulation(Mask);
+ if (RPC.second.NumUnits > countPopulation(Mask)) {
RPC.second.setReserved();
+ RPC.second.NumUnits = MaxResourceUnits;
+ }
}
}
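A concrete reading of the change above: if a resource group's mask covers, say, two units but the scheduling class asks for three, the group is now marked reserved and its unit count is clamped to two. A tiny standalone sketch of the clamping (names assumed):

#include <bitset>
#include <cstdint>

// Clamp a requested unit count to the number of units actually present in the
// group mask; returns true when the caller should mark the group reserved.
bool clampGroupUnits(uint64_t GroupMaskWithoutLeadingBit, unsigned &NumUnits) {
  unsigned MaxUnits = static_cast<unsigned>(
      std::bitset<64>(GroupMaskWithoutLeadingBit).count());
  if (NumUnits <= MaxUnits)
    return false;
  NumUnits = MaxUnits;
  return true;
}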
@@ -485,24 +488,16 @@ Error InstrBuilder::verifyInstrDesc(const InstrDesc &ID,
if (ID.NumMicroOps != 0)
return ErrorSuccess();
- bool UsesMemory = ID.MayLoad || ID.MayStore;
bool UsesBuffers = ID.UsedBuffers;
bool UsesResources = !ID.Resources.empty();
- if (!UsesMemory && !UsesBuffers && !UsesResources)
+ if (!UsesBuffers && !UsesResources)
return ErrorSuccess();
- StringRef Message;
- if (UsesMemory) {
- Message = "found an inconsistent instruction that decodes "
- "into zero opcodes and that consumes load/store "
- "unit resources.";
- } else {
- Message = "found an inconsistent instruction that decodes "
- "to zero opcodes and that consumes scheduler "
- "resources.";
- }
-
- return make_error<InstructionError<MCInst>>(Message, MCI);
+ // FIXME: see PR44797. We should revisit these checks and possibly move them
+ // in CodeGenSchedule.cpp.
+ StringRef Message = "found an inconsistent instruction that decodes to zero "
+ "opcodes and that consumes scheduler resources.";
+ return make_error<InstructionError<MCInst>>(std::string(Message), MCI);
}
Expected<const InstrDesc &>
diff --git a/llvm/lib/Object/Archive.cpp b/llvm/lib/Object/Archive.cpp
index 148c011d9cd4..c18dd11a72cc 100644
--- a/llvm/lib/Object/Archive.cpp
+++ b/llvm/lib/Object/Archive.cpp
@@ -392,12 +392,8 @@ Archive::Child::Child(const Archive *Parent, const char *Start, Error *Err)
}
Expected<uint64_t> Archive::Child::getSize() const {
- if (Parent->IsThin) {
- Expected<uint32_t> Size = Header.getSize();
- if (!Size)
- return Size.takeError();
- return Size.get();
- }
+ if (Parent->IsThin)
+ return Header.getSize();
return Data.size() - StartOfFile;
}
@@ -423,12 +419,12 @@ Expected<std::string> Archive::Child::getFullName() const {
return NameOrErr.takeError();
StringRef Name = *NameOrErr;
if (sys::path::is_absolute(Name))
- return Name;
+ return std::string(Name);
SmallString<128> FullName = sys::path::parent_path(
Parent->getMemoryBufferRef().getBufferIdentifier());
sys::path::append(FullName, Name);
- return StringRef(FullName);
+ return std::string(FullName.str());
}
Expected<StringRef> Archive::Child::getBuffer() const {
@@ -437,7 +433,7 @@ Expected<StringRef> Archive::Child::getBuffer() const {
return isThinOrErr.takeError();
bool isThin = isThinOrErr.get();
if (!isThin) {
- Expected<uint32_t> Size = getSize();
+ Expected<uint64_t> Size = getSize();
if (!Size)
return Size.takeError();
return StringRef(Data.data() + StartOfFile, Size.get());
diff --git a/llvm/lib/Object/ArchiveWriter.cpp b/llvm/lib/Object/ArchiveWriter.cpp
index 5234b0e18233..6f92c547164b 100644
--- a/llvm/lib/Object/ArchiveWriter.cpp
+++ b/llvm/lib/Object/ArchiveWriter.cpp
@@ -12,6 +12,7 @@
#include "llvm/Object/ArchiveWriter.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/Magic.h"
#include "llvm/IR/LLVMContext.h"
@@ -263,12 +264,15 @@ static sys::TimePoint<std::chrono::seconds> now(bool Deterministic) {
}
static bool isArchiveSymbol(const object::BasicSymbolRef &S) {
- uint32_t Symflags = S.getFlags();
- if (Symflags & object::SymbolRef::SF_FormatSpecific)
+ Expected<uint32_t> SymFlagsOrErr = S.getFlags();
+ if (!SymFlagsOrErr)
+ // TODO: Actually report errors helpfully.
+ report_fatal_error(SymFlagsOrErr.takeError());
+ if (*SymFlagsOrErr & object::SymbolRef::SF_FormatSpecific)
return false;
- if (!(Symflags & object::SymbolRef::SF_Global))
+ if (!(*SymFlagsOrErr & object::SymbolRef::SF_Global))
return false;
- if (Symflags & object::SymbolRef::SF_Undefined)
+ if (*SymFlagsOrErr & object::SymbolRef::SF_Undefined)
return false;
return true;
}
@@ -545,7 +549,7 @@ Expected<std::string> computeArchiveRelativePath(StringRef From, StringRef To) {
for (auto ToE = sys::path::end(PathTo); ToI != ToE; ++ToI)
sys::path::append(Relative, sys::path::Style::posix, *ToI);
- return Relative.str();
+ return std::string(Relative.str());
}
Error writeArchive(StringRef ArcName, ArrayRef<NewArchiveMember> NewMembers,
diff --git a/llvm/lib/Object/COFFImportFile.cpp b/llvm/lib/Object/COFFImportFile.cpp
index ff4a799be60c..69bbf70b43a1 100644
--- a/llvm/lib/Object/COFFImportFile.cpp
+++ b/llvm/lib/Object/COFFImportFile.cpp
@@ -600,7 +600,7 @@ Error writeImportLibrary(StringRef ImportName, StringRef Path,
: getNameType(SymbolName, E.Name,
Machine, MinGW);
Expected<std::string> Name = E.ExtName.empty()
- ? SymbolName
+ ? std::string(SymbolName)
: replace(SymbolName, E.Name, E.ExtName);
if (!Name)
diff --git a/llvm/lib/Object/COFFModuleDefinition.cpp b/llvm/lib/Object/COFFModuleDefinition.cpp
index 64d4cf0efda2..8f29f7a658fd 100644
--- a/llvm/lib/Object/COFFModuleDefinition.cpp
+++ b/llvm/lib/Object/COFFModuleDefinition.cpp
@@ -229,14 +229,14 @@ private:
Error parseExport() {
COFFShortExport E;
- E.Name = Tok.Value;
+ E.Name = std::string(Tok.Value);
read();
if (Tok.K == Equal) {
read();
if (Tok.K != Identifier)
return createError("identifier expected, but got " + Tok.Value);
E.ExtName = E.Name;
- E.Name = Tok.Value;
+ E.Name = std::string(Tok.Value);
} else {
unget();
}
@@ -285,7 +285,7 @@ private:
}
if (Tok.K == EqualEqual) {
read();
- E.AliasTarget = Tok.Value;
+ E.AliasTarget = std::string(Tok.Value);
if (Machine == IMAGE_FILE_MACHINE_I386 && !isDecorated(E.AliasTarget, MingwDef))
E.AliasTarget = std::string("_").append(E.AliasTarget);
continue;
@@ -315,7 +315,7 @@ private:
Error parseName(std::string *Out, uint64_t *Baseaddr) {
read();
if (Tok.K == Identifier) {
- *Out = Tok.Value;
+ *Out = std::string(Tok.Value);
} else {
*Out = "";
unget();
diff --git a/llvm/lib/Object/COFFObjectFile.cpp b/llvm/lib/Object/COFFObjectFile.cpp
index 2c0f6dc2b1e9..c26d7721b3fe 100644
--- a/llvm/lib/Object/COFFObjectFile.cpp
+++ b/llvm/lib/Object/COFFObjectFile.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/BinaryFormat/COFF.h"
@@ -54,14 +55,13 @@ static bool checkSize(MemoryBufferRef M, std::error_code &EC, uint64_t Size) {
// Sets Obj unless any bytes in [addr, addr + size) fall outside of M.
// Returns unexpected_eof if error.
template <typename T>
-static std::error_code getObject(const T *&Obj, MemoryBufferRef M,
- const void *Ptr,
- const uint64_t Size = sizeof(T)) {
+static Error getObject(const T *&Obj, MemoryBufferRef M, const void *Ptr,
+ const uint64_t Size = sizeof(T)) {
uintptr_t Addr = uintptr_t(Ptr);
- if (std::error_code EC = Binary::checkOffset(M, Addr, Size))
- return EC;
+ if (Error E = Binary::checkOffset(M, Addr, Size))
+ return E;
Obj = reinterpret_cast<const T *>(Addr);
- return std::error_code();
+ return Error::success();
}
// Decode a string table entry in base 64 (//AAAAAA). Expects \arg Str without
@@ -147,11 +147,7 @@ void COFFObjectFile::moveSymbolNext(DataRefImpl &Ref) const {
}
Expected<StringRef> COFFObjectFile::getSymbolName(DataRefImpl Ref) const {
- COFFSymbolRef Symb = getCOFFSymbol(Ref);
- StringRef Result;
- if (std::error_code EC = getSymbolName(Symb, Result))
- return errorCodeToError(EC);
- return Result;
+ return getSymbolName(getCOFFSymbol(Ref));
}
uint64_t COFFObjectFile::getSymbolValueImpl(DataRefImpl Ref) const {
@@ -166,7 +162,7 @@ uint32_t COFFObjectFile::getSymbolAlignment(DataRefImpl Ref) const {
}
Expected<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
- uint64_t Result = getSymbolValue(Ref);
+ uint64_t Result = cantFail(getSymbolValue(Ref));
COFFSymbolRef Symb = getCOFFSymbol(Ref);
int32_t SectionNumber = Symb.getSectionNumber();
@@ -174,10 +170,10 @@ Expected<uint64_t> COFFObjectFile::getSymbolAddress(DataRefImpl Ref) const {
COFF::isReservedSectionNumber(SectionNumber))
return Result;
- const coff_section *Section = nullptr;
- if (std::error_code EC = getSection(SectionNumber, Section))
- return errorCodeToError(EC);
- Result += Section->VirtualAddress;
+ Expected<const coff_section *> Section = getSection(SectionNumber);
+ if (!Section)
+ return Section.takeError();
+ Result += (*Section)->VirtualAddress;
// The section VirtualAddress does not include ImageBase, and we want to
// return virtual addresses.
@@ -209,7 +205,7 @@ Expected<SymbolRef::Type> COFFObjectFile::getSymbolType(DataRefImpl Ref) const {
return SymbolRef::ST_Other;
}
-uint32_t COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
+Expected<uint32_t> COFFObjectFile::getSymbolFlags(DataRefImpl Ref) const {
COFFSymbolRef Symb = getCOFFSymbol(Ref);
uint32_t Result = SymbolRef::SF_None;
@@ -250,11 +246,11 @@ COFFObjectFile::getSymbolSection(DataRefImpl Ref) const {
COFFSymbolRef Symb = getCOFFSymbol(Ref);
if (COFF::isReservedSectionNumber(Symb.getSectionNumber()))
return section_end();
- const coff_section *Sec = nullptr;
- if (std::error_code EC = getSection(Symb.getSectionNumber(), Sec))
- return errorCodeToError(EC);
+ Expected<const coff_section *> Sec = getSection(Symb.getSectionNumber());
+ if (!Sec)
+ return Sec.takeError();
DataRefImpl Ret;
- Ret.p = reinterpret_cast<uintptr_t>(Sec);
+ Ret.p = reinterpret_cast<uintptr_t>(*Sec);
return section_iterator(SectionRef(Ret, this));
}
@@ -328,6 +324,12 @@ bool COFFObjectFile::isSectionBSS(DataRefImpl Ref) const {
return (Sec->Characteristics & BssFlags) == BssFlags;
}
+// The .debug sections are the only debug sections for COFF
+// (\see MCObjectFileInfo.cpp).
+bool COFFObjectFile::isDebugSection(StringRef SectionName) const {
+ return SectionName.startswith(".debug");
+}
+
unsigned COFFObjectFile::getSectionID(SectionRef Sec) const {
uintptr_t Offset =
uintptr_t(Sec.getRawDataRefImpl().p) - uintptr_t(SectionTable);
@@ -350,9 +352,12 @@ static uint32_t getNumberOfRelocations(const coff_section *Sec,
// VirtualAddress field in the first relocation entry.
if (Sec->hasExtendedRelocations()) {
const coff_relocation *FirstReloc;
- if (getObject(FirstReloc, M, reinterpret_cast<const coff_relocation*>(
- base + Sec->PointerToRelocations)))
+ if (Error E = getObject(FirstReloc, M,
+ reinterpret_cast<const coff_relocation *>(
+ base + Sec->PointerToRelocations))) {
+ consumeError(std::move(E));
return 0;
+ }
// -1 to exclude this first relocation entry.
return FirstReloc->VirtualAddress - 1;
}
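The extended-relocation convention handled above can be summarized in a small sketch (field names loosely follow the COFF spec): when a section carries the IMAGE_SCN_LNK_NRELOC_OVFL flag, the true count is stored in the VirtualAddress field of relocation entry zero, and that entry is not a real relocation.

#include <cstdint>

// Recover the real relocation count for a COFF section. With extended
// relocations, entry 0 only carries the count, so it is excluded.
uint32_t realRelocationCount(bool HasExtendedRelocations,
                             uint16_t NumberOfRelocations,
                             uint32_t FirstRelocVirtualAddress) {
  if (HasExtendedRelocations)
    return FirstRelocVirtualAddress - 1;
  return NumberOfRelocations;
}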
@@ -371,9 +376,11 @@ getFirstReloc(const coff_section *Sec, MemoryBufferRef M, const uint8_t *Base) {
// relocations.
begin++;
}
- if (Binary::checkOffset(M, uintptr_t(begin),
- sizeof(coff_relocation) * NumRelocs))
+ if (auto E = Binary::checkOffset(M, uintptr_t(begin),
+ sizeof(coff_relocation) * NumRelocs)) {
+ consumeError(std::move(E));
return nullptr;
+ }
return begin;
}
@@ -398,18 +405,18 @@ relocation_iterator COFFObjectFile::section_rel_end(DataRefImpl Ref) const {
}
// Initialize the pointer to the symbol table.
-std::error_code COFFObjectFile::initSymbolTablePtr() {
+Error COFFObjectFile::initSymbolTablePtr() {
if (COFFHeader)
- if (std::error_code EC = getObject(
+ if (Error E = getObject(
SymbolTable16, Data, base() + getPointerToSymbolTable(),
(uint64_t)getNumberOfSymbols() * getSymbolTableEntrySize()))
- return EC;
+ return E;
if (COFFBigObjHeader)
- if (std::error_code EC = getObject(
+ if (Error E = getObject(
SymbolTable32, Data, base() + getPointerToSymbolTable(),
(uint64_t)getNumberOfSymbols() * getSymbolTableEntrySize()))
- return EC;
+ return E;
  // Find string table. The first four bytes of the string table contain the
// total size of the string table, including the size field itself. If the
@@ -418,22 +425,21 @@ std::error_code COFFObjectFile::initSymbolTablePtr() {
getNumberOfSymbols() * getSymbolTableEntrySize();
const uint8_t *StringTableAddr = base() + StringTableOffset;
const ulittle32_t *StringTableSizePtr;
- if (std::error_code EC = getObject(StringTableSizePtr, Data, StringTableAddr))
- return EC;
+ if (Error E = getObject(StringTableSizePtr, Data, StringTableAddr))
+ return E;
StringTableSize = *StringTableSizePtr;
- if (std::error_code EC =
- getObject(StringTable, Data, StringTableAddr, StringTableSize))
- return EC;
+ if (Error E = getObject(StringTable, Data, StringTableAddr, StringTableSize))
+ return E;
// Treat table sizes < 4 as empty because contrary to the PECOFF spec, some
// tools like cvtres write a size of 0 for an empty table instead of 4.
if (StringTableSize < 4)
- StringTableSize = 4;
+ StringTableSize = 4;
  // Check that the string table is null terminated if it has anything in it.
if (StringTableSize > 4 && StringTable[StringTableSize - 1] != 0)
- return object_error::parse_failed;
- return std::error_code();
+ return errorCodeToError(object_error::parse_failed);
+ return Error::success();
}
uint64_t COFFObjectFile::getImageBase() const {
@@ -446,7 +452,7 @@ uint64_t COFFObjectFile::getImageBase() const {
}
// Returns the file offset for the given VA.
-std::error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const {
+Error COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const {
uint64_t ImageBase = getImageBase();
uint64_t Rva = Addr - ImageBase;
assert(Rva <= UINT32_MAX);
@@ -454,7 +460,7 @@ std::error_code COFFObjectFile::getVaPtr(uint64_t Addr, uintptr_t &Res) const {
}
// Returns the file offset for the given RVA.
-std::error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
+Error COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
for (const SectionRef &S : sections()) {
const coff_section *Section = getCOFFSection(S);
uint32_t SectionStart = Section->VirtualAddress;
@@ -462,15 +468,14 @@ std::error_code COFFObjectFile::getRvaPtr(uint32_t Addr, uintptr_t &Res) const {
if (SectionStart <= Addr && Addr < SectionEnd) {
uint32_t Offset = Addr - SectionStart;
Res = uintptr_t(base()) + Section->PointerToRawData + Offset;
- return std::error_code();
+ return Error::success();
}
}
- return object_error::parse_failed;
+ return errorCodeToError(object_error::parse_failed);
}
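The RVA-to-file-pointer translation above reduces to one range check and one offset computation once the containing section is known; a minimal sketch with assumed section fields:

#include <cstdint>

struct SectionInfo {
  uint32_t VirtualAddress;   // section start RVA
  uint32_t VirtualSize;      // size of the section in memory
  uint32_t PointerToRawData; // file offset of the section's raw data
};

// Map an RVA inside Sec to an offset into the file buffer; returns false when
// the RVA is not covered by this section.
bool rvaToFileOffset(const SectionInfo &Sec, uint32_t Rva, uint64_t &FileOff) {
  if (Rva < Sec.VirtualAddress || Rva >= Sec.VirtualAddress + Sec.VirtualSize)
    return false;
  FileOff = Sec.PointerToRawData + (Rva - Sec.VirtualAddress);
  return true;
}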
-std::error_code
-COFFObjectFile::getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size,
- ArrayRef<uint8_t> &Contents) const {
+Error COFFObjectFile::getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size,
+ ArrayRef<uint8_t> &Contents) const {
for (const SectionRef &S : sections()) {
const coff_section *Section = getCOFFSection(S);
uint32_t SectionStart = Section->VirtualAddress;
@@ -483,196 +488,207 @@ COFFObjectFile::getRvaAndSizeAsBytes(uint32_t RVA, uint32_t Size,
uintptr_t(base()) + Section->PointerToRawData + OffsetIntoSection;
Contents =
ArrayRef<uint8_t>(reinterpret_cast<const uint8_t *>(Begin), Size);
- return std::error_code();
+ return Error::success();
}
}
- return object_error::parse_failed;
+ return errorCodeToError(object_error::parse_failed);
}
// Returns hint and name fields, assuming \p Rva is pointing to a Hint/Name
// table entry.
-std::error_code COFFObjectFile::getHintName(uint32_t Rva, uint16_t &Hint,
- StringRef &Name) const {
+Error COFFObjectFile::getHintName(uint32_t Rva, uint16_t &Hint,
+ StringRef &Name) const {
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(Rva, IntPtr))
- return EC;
+ if (Error E = getRvaPtr(Rva, IntPtr))
+ return E;
const uint8_t *Ptr = reinterpret_cast<const uint8_t *>(IntPtr);
Hint = *reinterpret_cast<const ulittle16_t *>(Ptr);
Name = StringRef(reinterpret_cast<const char *>(Ptr + 2));
- return std::error_code();
+ return Error::success();
}
-std::error_code
-COFFObjectFile::getDebugPDBInfo(const debug_directory *DebugDir,
- const codeview::DebugInfo *&PDBInfo,
- StringRef &PDBFileName) const {
+Error COFFObjectFile::getDebugPDBInfo(const debug_directory *DebugDir,
+ const codeview::DebugInfo *&PDBInfo,
+ StringRef &PDBFileName) const {
ArrayRef<uint8_t> InfoBytes;
- if (std::error_code EC = getRvaAndSizeAsBytes(
+ if (Error E = getRvaAndSizeAsBytes(
DebugDir->AddressOfRawData, DebugDir->SizeOfData, InfoBytes))
- return EC;
+ return E;
if (InfoBytes.size() < sizeof(*PDBInfo) + 1)
- return object_error::parse_failed;
+ return errorCodeToError(object_error::parse_failed);
PDBInfo = reinterpret_cast<const codeview::DebugInfo *>(InfoBytes.data());
InfoBytes = InfoBytes.drop_front(sizeof(*PDBInfo));
PDBFileName = StringRef(reinterpret_cast<const char *>(InfoBytes.data()),
InfoBytes.size());
// Truncate the name at the first null byte. Ignore any padding.
PDBFileName = PDBFileName.split('\0').first;
- return std::error_code();
+ return Error::success();
}
-std::error_code
-COFFObjectFile::getDebugPDBInfo(const codeview::DebugInfo *&PDBInfo,
- StringRef &PDBFileName) const {
+Error COFFObjectFile::getDebugPDBInfo(const codeview::DebugInfo *&PDBInfo,
+ StringRef &PDBFileName) const {
for (const debug_directory &D : debug_directories())
if (D.Type == COFF::IMAGE_DEBUG_TYPE_CODEVIEW)
return getDebugPDBInfo(&D, PDBInfo, PDBFileName);
// If we get here, there is no PDB info to return.
PDBInfo = nullptr;
PDBFileName = StringRef();
- return std::error_code();
+ return Error::success();
}
// Find the import table.
-std::error_code COFFObjectFile::initImportTablePtr() {
+Error COFFObjectFile::initImportTablePtr() {
// First, we get the RVA of the import table. If the file lacks a pointer to
// the import table, do nothing.
- const data_directory *DataEntry;
- if (getDataDirectory(COFF::IMPORT_TABLE, DataEntry))
- return std::error_code();
+ const data_directory *DataEntry = getDataDirectory(COFF::IMPORT_TABLE);
+ if (!DataEntry)
+ return Error::success();
// Do nothing if the pointer to import table is NULL.
if (DataEntry->RelativeVirtualAddress == 0)
- return std::error_code();
+ return Error::success();
uint32_t ImportTableRva = DataEntry->RelativeVirtualAddress;
// Find the section that contains the RVA. This is needed because the RVA is
// the import table's memory address which is different from its file offset.
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(ImportTableRva, IntPtr))
- return EC;
- if (std::error_code EC = checkOffset(Data, IntPtr, DataEntry->Size))
- return EC;
+ if (Error E = getRvaPtr(ImportTableRva, IntPtr))
+ return E;
+ if (Error E = checkOffset(Data, IntPtr, DataEntry->Size))
+ return E;
ImportDirectory = reinterpret_cast<
const coff_import_directory_table_entry *>(IntPtr);
- return std::error_code();
+ return Error::success();
}
// Initializes DelayImportDirectory and NumberOfDelayImportDirectory.
-std::error_code COFFObjectFile::initDelayImportTablePtr() {
- const data_directory *DataEntry;
- if (getDataDirectory(COFF::DELAY_IMPORT_DESCRIPTOR, DataEntry))
- return std::error_code();
+Error COFFObjectFile::initDelayImportTablePtr() {
+ const data_directory *DataEntry =
+ getDataDirectory(COFF::DELAY_IMPORT_DESCRIPTOR);
+ if (!DataEntry)
+ return Error::success();
if (DataEntry->RelativeVirtualAddress == 0)
- return std::error_code();
+ return Error::success();
uint32_t RVA = DataEntry->RelativeVirtualAddress;
NumberOfDelayImportDirectory = DataEntry->Size /
sizeof(delay_import_directory_table_entry) - 1;
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(RVA, IntPtr))
- return EC;
+ if (Error E = getRvaPtr(RVA, IntPtr))
+ return E;
DelayImportDirectory = reinterpret_cast<
const delay_import_directory_table_entry *>(IntPtr);
- return std::error_code();
+ return Error::success();
}
// Find the export table.
-std::error_code COFFObjectFile::initExportTablePtr() {
+Error COFFObjectFile::initExportTablePtr() {
// First, we get the RVA of the export table. If the file lacks a pointer to
// the export table, do nothing.
- const data_directory *DataEntry;
- if (getDataDirectory(COFF::EXPORT_TABLE, DataEntry))
- return std::error_code();
+ const data_directory *DataEntry = getDataDirectory(COFF::EXPORT_TABLE);
+ if (!DataEntry)
+ return Error::success();
// Do nothing if the pointer to export table is NULL.
if (DataEntry->RelativeVirtualAddress == 0)
- return std::error_code();
+ return Error::success();
uint32_t ExportTableRva = DataEntry->RelativeVirtualAddress;
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(ExportTableRva, IntPtr))
- return EC;
+ if (Error E = getRvaPtr(ExportTableRva, IntPtr))
+ return E;
ExportDirectory =
reinterpret_cast<const export_directory_table_entry *>(IntPtr);
- return std::error_code();
+ return Error::success();
}
-std::error_code COFFObjectFile::initBaseRelocPtr() {
- const data_directory *DataEntry;
- if (getDataDirectory(COFF::BASE_RELOCATION_TABLE, DataEntry))
- return std::error_code();
+Error COFFObjectFile::initBaseRelocPtr() {
+ const data_directory *DataEntry =
+ getDataDirectory(COFF::BASE_RELOCATION_TABLE);
+ if (!DataEntry)
+ return Error::success();
if (DataEntry->RelativeVirtualAddress == 0)
- return std::error_code();
+ return Error::success();
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
- return EC;
+ if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
+ return E;
BaseRelocHeader = reinterpret_cast<const coff_base_reloc_block_header *>(
IntPtr);
BaseRelocEnd = reinterpret_cast<coff_base_reloc_block_header *>(
IntPtr + DataEntry->Size);
// FIXME: Verify the section containing BaseRelocHeader has at least
// DataEntry->Size bytes after DataEntry->RelativeVirtualAddress.
- return std::error_code();
+ return Error::success();
}
-std::error_code COFFObjectFile::initDebugDirectoryPtr() {
+Error COFFObjectFile::initDebugDirectoryPtr() {
// Get the RVA of the debug directory. Do nothing if it does not exist.
- const data_directory *DataEntry;
- if (getDataDirectory(COFF::DEBUG_DIRECTORY, DataEntry))
- return std::error_code();
+ const data_directory *DataEntry = getDataDirectory(COFF::DEBUG_DIRECTORY);
+ if (!DataEntry)
+ return Error::success();
// Do nothing if the RVA is NULL.
if (DataEntry->RelativeVirtualAddress == 0)
- return std::error_code();
+ return Error::success();
// Check that the size is a multiple of the entry size.
if (DataEntry->Size % sizeof(debug_directory) != 0)
- return object_error::parse_failed;
+ return errorCodeToError(object_error::parse_failed);
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
- return EC;
+ if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
+ return E;
DebugDirectoryBegin = reinterpret_cast<const debug_directory *>(IntPtr);
DebugDirectoryEnd = reinterpret_cast<const debug_directory *>(
IntPtr + DataEntry->Size);
// FIXME: Verify the section containing DebugDirectoryBegin has at least
// DataEntry->Size bytes after DataEntry->RelativeVirtualAddress.
- return std::error_code();
+ return Error::success();
}
-std::error_code COFFObjectFile::initLoadConfigPtr() {
+Error COFFObjectFile::initLoadConfigPtr() {
// Get the RVA of the debug directory. Do nothing if it does not exist.
- const data_directory *DataEntry;
- if (getDataDirectory(COFF::LOAD_CONFIG_TABLE, DataEntry))
- return std::error_code();
+ const data_directory *DataEntry = getDataDirectory(COFF::LOAD_CONFIG_TABLE);
+ if (!DataEntry)
+ return Error::success();
// Do nothing if the RVA is NULL.
if (DataEntry->RelativeVirtualAddress == 0)
- return std::error_code();
+ return Error::success();
uintptr_t IntPtr = 0;
- if (std::error_code EC = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
- return EC;
+ if (Error E = getRvaPtr(DataEntry->RelativeVirtualAddress, IntPtr))
+ return E;
LoadConfig = (const void *)IntPtr;
- return std::error_code();
+ return Error::success();
+}
+
+Expected<std::unique_ptr<COFFObjectFile>>
+COFFObjectFile::create(MemoryBufferRef Object) {
+ std::unique_ptr<COFFObjectFile> Obj(new COFFObjectFile(std::move(Object)));
+ if (Error E = Obj->initialize())
+ return std::move(E);
+ return std::move(Obj);
}
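With the constructor no longer reporting failures through a std::error_code out-parameter, callers go through the new factory and must consume or propagate the Error. A short usage sketch follows; the wrapper name and the exact includes are assumptions, only COFFObjectFile::create comes from the patch.

#include "llvm/Object/COFF.h"
#include "llvm/Support/MemoryBuffer.h"
#include <memory>

using namespace llvm;
using namespace llvm::object;

// Parse a COFF object via the Expected-based factory.
Expected<std::unique_ptr<COFFObjectFile>> parseCOFF(MemoryBufferRef Buffer) {
  Expected<std::unique_ptr<COFFObjectFile>> ObjOrErr =
      COFFObjectFile::create(Buffer);
  if (!ObjOrErr)
    return ObjOrErr.takeError();
  return std::move(*ObjOrErr);
}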
-COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
+COFFObjectFile::COFFObjectFile(MemoryBufferRef Object)
: ObjectFile(Binary::ID_COFF, Object), COFFHeader(nullptr),
COFFBigObjHeader(nullptr), PE32Header(nullptr), PE32PlusHeader(nullptr),
DataDirectory(nullptr), SectionTable(nullptr), SymbolTable16(nullptr),
SymbolTable32(nullptr), StringTable(nullptr), StringTableSize(0),
- ImportDirectory(nullptr),
- DelayImportDirectory(nullptr), NumberOfDelayImportDirectory(0),
- ExportDirectory(nullptr), BaseRelocHeader(nullptr), BaseRelocEnd(nullptr),
- DebugDirectoryBegin(nullptr), DebugDirectoryEnd(nullptr) {
+ ImportDirectory(nullptr), DelayImportDirectory(nullptr),
+ NumberOfDelayImportDirectory(0), ExportDirectory(nullptr),
+ BaseRelocHeader(nullptr), BaseRelocEnd(nullptr),
+ DebugDirectoryBegin(nullptr), DebugDirectoryEnd(nullptr) {}
+
+Error COFFObjectFile::initialize() {
// Check that we at least have enough room for a header.
+ std::error_code EC;
if (!checkSize(Data, EC, sizeof(coff_file_header)))
- return;
+ return errorCodeToError(EC);
// The current location in the file where we are looking at.
uint64_t CurPtr = 0;
@@ -690,24 +706,23 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
CurPtr = DH->AddressOfNewExeHeader;
// Check the PE magic bytes. ("PE\0\0")
if (memcmp(base() + CurPtr, COFF::PEMagic, sizeof(COFF::PEMagic)) != 0) {
- EC = object_error::parse_failed;
- return;
+ return errorCodeToError(object_error::parse_failed);
}
CurPtr += sizeof(COFF::PEMagic); // Skip the PE magic bytes.
HasPEHeader = true;
}
}
- if ((EC = getObject(COFFHeader, Data, base() + CurPtr)))
- return;
+ if (Error E = getObject(COFFHeader, Data, base() + CurPtr))
+ return E;
// It might be a bigobj file, let's check. Note that COFF bigobj and COFF
// import libraries share a common prefix but bigobj is more restrictive.
if (!HasPEHeader && COFFHeader->Machine == COFF::IMAGE_FILE_MACHINE_UNKNOWN &&
COFFHeader->NumberOfSections == uint16_t(0xffff) &&
checkSize(Data, EC, sizeof(coff_bigobj_file_header))) {
- if ((EC = getObject(COFFBigObjHeader, Data, base() + CurPtr)))
- return;
+ if (Error E = getObject(COFFBigObjHeader, Data, base() + CurPtr))
+ return E;
// Verify that we are dealing with bigobj.
if (COFFBigObjHeader->Version >= COFF::BigObjHeader::MinBigObjectVersion &&
@@ -727,13 +742,13 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
CurPtr += sizeof(coff_file_header);
if (COFFHeader->isImportLibrary())
- return;
+ return errorCodeToError(EC);
}
if (HasPEHeader) {
const pe32_header *Header;
- if ((EC = getObject(Header, Data, base() + CurPtr)))
- return;
+ if (Error E = getObject(Header, Data, base() + CurPtr))
+ return E;
const uint8_t *DataDirAddr;
uint64_t DataDirSize;
@@ -747,23 +762,27 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
DataDirSize = sizeof(data_directory) * PE32PlusHeader->NumberOfRvaAndSize;
} else {
// It's neither PE32 nor PE32+.
- EC = object_error::parse_failed;
- return;
+ return errorCodeToError(object_error::parse_failed);
}
- if ((EC = getObject(DataDirectory, Data, DataDirAddr, DataDirSize)))
- return;
+ if (Error E = getObject(DataDirectory, Data, DataDirAddr, DataDirSize))
+ return E;
}
if (COFFHeader)
CurPtr += COFFHeader->SizeOfOptionalHeader;
- if ((EC = getObject(SectionTable, Data, base() + CurPtr,
- (uint64_t)getNumberOfSections() * sizeof(coff_section))))
- return;
+ assert(COFFHeader || COFFBigObjHeader);
+
+ if (Error E =
+ getObject(SectionTable, Data, base() + CurPtr,
+ (uint64_t)getNumberOfSections() * sizeof(coff_section)))
+ return E;
// Initialize the pointer to the symbol table.
if (getPointerToSymbolTable() != 0) {
- if ((EC = initSymbolTablePtr())) {
+ if (Error E = initSymbolTablePtr()) {
+ // Recover from errors reading the symbol table.
+ consumeError(std::move(E));
SymbolTable16 = nullptr;
SymbolTable32 = nullptr;
StringTable = nullptr;
@@ -772,33 +791,32 @@ COFFObjectFile::COFFObjectFile(MemoryBufferRef Object, std::error_code &EC)
} else {
// We had better not have any symbols if we don't have a symbol table.
if (getNumberOfSymbols() != 0) {
- EC = object_error::parse_failed;
- return;
+ return errorCodeToError(object_error::parse_failed);
}
}
// Initialize the pointer to the beginning of the import table.
- if ((EC = initImportTablePtr()))
- return;
- if ((EC = initDelayImportTablePtr()))
- return;
+ if (Error E = initImportTablePtr())
+ return E;
+ if (Error E = initDelayImportTablePtr())
+ return E;
// Initialize the pointer to the export table.
- if ((EC = initExportTablePtr()))
- return;
+ if (Error E = initExportTablePtr())
+ return E;
// Initialize the pointer to the base relocation table.
- if ((EC = initBaseRelocPtr()))
- return;
+ if (Error E = initBaseRelocPtr())
+ return E;
   // Initialize the pointer to the debug directory.
- if ((EC = initDebugDirectoryPtr()))
- return;
+ if (Error E = initDebugDirectoryPtr())
+ return E;
- if ((EC = initLoadConfigPtr()))
- return;
+ if (Error E = initLoadConfigPtr())
+ return E;
- EC = std::error_code();
+ return Error::success();
}
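
The initialize() body above applies the same conversion step after step. A minimal, self-contained sketch of that shape for reference; the helper names here are invented, not from the tree.

    #include "llvm/Support/Error.h"
    #include <system_error>

    using namespace llvm;

    static std::error_code legacyStep() { return std::error_code(); } // old-style API
    static Error modernStep() { return Error::success(); }            // converted API

    static Error initSomething() {
      // Wrap std::error_code results so they can travel as llvm::Error.
      if (std::error_code EC = legacyStep())
        return errorCodeToError(EC);
      // Errors from already-converted helpers are simply forwarded.
      if (Error E = modernStep())
        return E;
      return Error::success();
    }
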
basic_symbol_iterator COFFObjectFile::symbol_begin() const {
@@ -936,86 +954,54 @@ iterator_range<base_reloc_iterator> COFFObjectFile::base_relocs() const {
return make_range(base_reloc_begin(), base_reloc_end());
}
-std::error_code
-COFFObjectFile::getDataDirectory(uint32_t Index,
- const data_directory *&Res) const {
- // Error if there's no data directory or the index is out of range.
- if (!DataDirectory) {
- Res = nullptr;
- return object_error::parse_failed;
- }
+const data_directory *COFFObjectFile::getDataDirectory(uint32_t Index) const {
+ if (!DataDirectory)
+ return nullptr;
assert(PE32Header || PE32PlusHeader);
uint32_t NumEnt = PE32Header ? PE32Header->NumberOfRvaAndSize
: PE32PlusHeader->NumberOfRvaAndSize;
- if (Index >= NumEnt) {
- Res = nullptr;
- return object_error::parse_failed;
- }
- Res = &DataDirectory[Index];
- return std::error_code();
+ if (Index >= NumEnt)
+ return nullptr;
+ return &DataDirectory[Index];
}
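
A hypothetical caller of the reworked getDataDirectory() above; the function and struct fields are from the patch, the printing is illustrative only. Callers now test the returned pointer instead of an error code.

    #include "llvm/ADT/Twine.h"
    #include "llvm/BinaryFormat/COFF.h"
    #include "llvm/Object/COFF.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::object;

    static void printExportTableRVA(const COFFObjectFile &Obj) {
      const data_directory *Dir = Obj.getDataDirectory(COFF::EXPORT_TABLE);
      if (!Dir || Dir->RelativeVirtualAddress == 0) {
        outs() << "no export table\n";
        return;
      }
      outs() << "export table RVA: 0x"
             << Twine::utohexstr(Dir->RelativeVirtualAddress) << "\n";
    }
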
-std::error_code COFFObjectFile::getSection(int32_t Index,
- const coff_section *&Result) const {
- Result = nullptr;
+Expected<const coff_section *> COFFObjectFile::getSection(int32_t Index) const {
+ // Perhaps getting the section of a reserved section index should be an error,
+ // but callers rely on this to return null.
if (COFF::isReservedSectionNumber(Index))
- return std::error_code();
+ return (const coff_section *)nullptr;
if (static_cast<uint32_t>(Index) <= getNumberOfSections()) {
// We already verified the section table data, so no need to check again.
- Result = SectionTable + (Index - 1);
- return std::error_code();
- }
- return object_error::parse_failed;
-}
-
-std::error_code COFFObjectFile::getSection(StringRef SectionName,
- const coff_section *&Result) const {
- Result = nullptr;
- for (const SectionRef &Section : sections()) {
- auto NameOrErr = Section.getName();
- if (!NameOrErr)
- return errorToErrorCode(NameOrErr.takeError());
-
- if (*NameOrErr == SectionName) {
- Result = getCOFFSection(Section);
- return std::error_code();
- }
+ return SectionTable + (Index - 1);
}
- return object_error::parse_failed;
+ return errorCodeToError(object_error::parse_failed);
}
-std::error_code COFFObjectFile::getString(uint32_t Offset,
- StringRef &Result) const {
+Expected<StringRef> COFFObjectFile::getString(uint32_t Offset) const {
if (StringTableSize <= 4)
// Tried to get a string from an empty string table.
- return object_error::parse_failed;
+ return errorCodeToError(object_error::parse_failed);
if (Offset >= StringTableSize)
- return object_error::unexpected_eof;
- Result = StringRef(StringTable + Offset);
- return std::error_code();
+ return errorCodeToError(object_error::unexpected_eof);
+ return StringRef(StringTable + Offset);
}
-std::error_code COFFObjectFile::getSymbolName(COFFSymbolRef Symbol,
- StringRef &Res) const {
- return getSymbolName(Symbol.getGeneric(), Res);
+Expected<StringRef> COFFObjectFile::getSymbolName(COFFSymbolRef Symbol) const {
+ return getSymbolName(Symbol.getGeneric());
}
-std::error_code COFFObjectFile::getSymbolName(const coff_symbol_generic *Symbol,
- StringRef &Res) const {
+Expected<StringRef>
+COFFObjectFile::getSymbolName(const coff_symbol_generic *Symbol) const {
// Check for string table entry. First 4 bytes are 0.
- if (Symbol->Name.Offset.Zeroes == 0) {
- if (std::error_code EC = getString(Symbol->Name.Offset.Offset, Res))
- return EC;
- return std::error_code();
- }
+ if (Symbol->Name.Offset.Zeroes == 0)
+ return getString(Symbol->Name.Offset.Offset);
+ // Null terminated, let ::strlen figure out the length.
if (Symbol->Name.ShortName[COFF::NameSize - 1] == 0)
- // Null terminated, let ::strlen figure out the length.
- Res = StringRef(Symbol->Name.ShortName);
- else
- // Not null terminated, use all 8 bytes.
- Res = StringRef(Symbol->Name.ShortName, COFF::NameSize);
- return std::error_code();
+ return StringRef(Symbol->Name.ShortName);
+
+ // Not null terminated, use all 8 bytes.
+ return StringRef(Symbol->Name.ShortName, COFF::NameSize);
}
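
A sketch of how a caller adapts to the Expected<StringRef> signatures above (getSymbolName/getString). Obtaining the object and symbol is assumed to happen elsewhere; only the error-handling pattern is the point.

    #include "llvm/Object/COFF.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::object;

    static void printSymbol(const COFFObjectFile &Obj, COFFSymbolRef Sym) {
      Expected<StringRef> NameOrErr = Obj.getSymbolName(Sym);
      if (!NameOrErr) {
        // toString() consumes the Error and renders it for diagnostics.
        errs() << "bad symbol name: " << toString(NameOrErr.takeError()) << "\n";
        return;
      }
      outs() << *NameOrErr << "\n";
    }
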
ArrayRef<uint8_t>
@@ -1067,14 +1053,13 @@ COFFObjectFile::getSectionName(const coff_section *Sec) const {
if (Name.startswith("//")) {
if (decodeBase64StringEntry(Name.substr(2), Offset))
return createStringError(object_error::parse_failed,
- "inalid section name");
+ "invalid section name");
} else {
if (Name.substr(1).getAsInteger(10, Offset))
return createStringError(object_error::parse_failed,
"invalid section name");
}
- if (std::error_code EC = getString(Offset, Name))
- return errorCodeToError(EC);
+ return getString(Offset);
}
return Name;
@@ -1107,8 +1092,8 @@ Error COFFObjectFile::getSectionContents(const coff_section *Sec,
// data, as there's nothing that says that is not allowed.
uintptr_t ConStart = uintptr_t(base()) + Sec->PointerToRawData;
uint32_t SectionSize = getSectionSize(Sec);
- if (checkOffset(Data, ConStart, SectionSize))
- return make_error<BinaryError>();
+ if (Error E = checkOffset(Data, ConStart, SectionSize))
+ return E;
Res = makeArrayRef(reinterpret_cast<const uint8_t *>(ConStart), SectionSize);
return Error::success();
}
@@ -1304,7 +1289,7 @@ void ImportDirectoryEntryRef::moveNext() {
}
}
-std::error_code ImportDirectoryEntryRef::getImportTableEntry(
+Error ImportDirectoryEntryRef::getImportTableEntry(
const coff_import_directory_table_entry *&Result) const {
return getObject(Result, OwningObject->Data, ImportTable + Index);
}
@@ -1323,14 +1308,16 @@ makeImportedSymbolIterator(const COFFObjectFile *Object,
static imported_symbol_iterator
importedSymbolBegin(uint32_t RVA, const COFFObjectFile *Object) {
uintptr_t IntPtr = 0;
- Object->getRvaPtr(RVA, IntPtr);
+ // FIXME: Handle errors.
+ cantFail(Object->getRvaPtr(RVA, IntPtr));
return makeImportedSymbolIterator(Object, IntPtr, 0);
}
static imported_symbol_iterator
importedSymbolEnd(uint32_t RVA, const COFFObjectFile *Object) {
uintptr_t IntPtr = 0;
- Object->getRvaPtr(RVA, IntPtr);
+ // FIXME: Handle errors.
+ cantFail(Object->getRvaPtr(RVA, IntPtr));
// Forward the pointer to the last entry which is null.
int Index = 0;
if (Object->getBytesInAddress() == 4) {
@@ -1377,25 +1364,24 @@ ImportDirectoryEntryRef::lookup_table_symbols() const {
return make_range(lookup_table_begin(), lookup_table_end());
}
-std::error_code ImportDirectoryEntryRef::getName(StringRef &Result) const {
+Error ImportDirectoryEntryRef::getName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (std::error_code EC =
- OwningObject->getRvaPtr(ImportTable[Index].NameRVA, IntPtr))
- return EC;
+ if (Error E = OwningObject->getRvaPtr(ImportTable[Index].NameRVA, IntPtr))
+ return E;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
- return std::error_code();
+ return Error::success();
}
-std::error_code
+Error
ImportDirectoryEntryRef::getImportLookupTableRVA(uint32_t &Result) const {
Result = ImportTable[Index].ImportLookupTableRVA;
- return std::error_code();
+ return Error::success();
}
-std::error_code
-ImportDirectoryEntryRef::getImportAddressTableRVA(uint32_t &Result) const {
+Error ImportDirectoryEntryRef::getImportAddressTableRVA(
+ uint32_t &Result) const {
Result = ImportTable[Index].ImportAddressTableRVA;
- return std::error_code();
+ return Error::success();
}
bool DelayImportDirectoryEntryRef::
@@ -1424,32 +1410,32 @@ DelayImportDirectoryEntryRef::imported_symbols() const {
return make_range(imported_symbol_begin(), imported_symbol_end());
}
-std::error_code DelayImportDirectoryEntryRef::getName(StringRef &Result) const {
+Error DelayImportDirectoryEntryRef::getName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (std::error_code EC = OwningObject->getRvaPtr(Table[Index].Name, IntPtr))
- return EC;
+ if (Error E = OwningObject->getRvaPtr(Table[Index].Name, IntPtr))
+ return E;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
- return std::error_code();
+ return Error::success();
}
-std::error_code DelayImportDirectoryEntryRef::
-getDelayImportTable(const delay_import_directory_table_entry *&Result) const {
+Error DelayImportDirectoryEntryRef::getDelayImportTable(
+ const delay_import_directory_table_entry *&Result) const {
Result = &Table[Index];
- return std::error_code();
+ return Error::success();
}
-std::error_code DelayImportDirectoryEntryRef::
-getImportAddress(int AddrIndex, uint64_t &Result) const {
+Error DelayImportDirectoryEntryRef::getImportAddress(int AddrIndex,
+ uint64_t &Result) const {
uint32_t RVA = Table[Index].DelayImportAddressTable +
AddrIndex * (OwningObject->is64() ? 8 : 4);
uintptr_t IntPtr = 0;
- if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
- return EC;
+ if (Error E = OwningObject->getRvaPtr(RVA, IntPtr))
+ return E;
if (OwningObject->is64())
Result = *reinterpret_cast<const ulittle64_t *>(IntPtr);
else
Result = *reinterpret_cast<const ulittle32_t *>(IntPtr);
- return std::error_code();
+ return Error::success();
}
bool ExportDirectoryEntryRef::
@@ -1463,46 +1449,44 @@ void ExportDirectoryEntryRef::moveNext() {
 // Returns the name of the DLL this export table refers to.
-std::error_code ExportDirectoryEntryRef::getDllName(StringRef &Result) const {
+Error ExportDirectoryEntryRef::getDllName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (std::error_code EC =
- OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr))
- return EC;
+ if (Error E = OwningObject->getRvaPtr(ExportTable->NameRVA, IntPtr))
+ return E;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
- return std::error_code();
+ return Error::success();
}
// Returns the starting ordinal number.
-std::error_code
-ExportDirectoryEntryRef::getOrdinalBase(uint32_t &Result) const {
+Error ExportDirectoryEntryRef::getOrdinalBase(uint32_t &Result) const {
Result = ExportTable->OrdinalBase;
- return std::error_code();
+ return Error::success();
}
// Returns the export ordinal of the current export symbol.
-std::error_code ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const {
+Error ExportDirectoryEntryRef::getOrdinal(uint32_t &Result) const {
Result = ExportTable->OrdinalBase + Index;
- return std::error_code();
+ return Error::success();
}
// Returns the address of the current export symbol.
-std::error_code ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const {
+Error ExportDirectoryEntryRef::getExportRVA(uint32_t &Result) const {
uintptr_t IntPtr = 0;
- if (std::error_code EC =
+ if (Error EC =
OwningObject->getRvaPtr(ExportTable->ExportAddressTableRVA, IntPtr))
return EC;
const export_address_table_entry *entry =
reinterpret_cast<const export_address_table_entry *>(IntPtr);
Result = entry[Index].ExportRVA;
- return std::error_code();
+ return Error::success();
}
// Returns the name of the current export symbol. If the symbol is exported only
// by ordinal, the empty string is set as a result.
-std::error_code
+Error
ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const {
uintptr_t IntPtr = 0;
- if (std::error_code EC =
+ if (Error EC =
OwningObject->getRvaPtr(ExportTable->OrdinalTableRVA, IntPtr))
return EC;
const ulittle16_t *Start = reinterpret_cast<const ulittle16_t *>(IntPtr);
@@ -1513,33 +1497,34 @@ ExportDirectoryEntryRef::getSymbolName(StringRef &Result) const {
I < E; ++I, ++Offset) {
if (*I != Index)
continue;
- if (std::error_code EC =
+ if (Error EC =
OwningObject->getRvaPtr(ExportTable->NamePointerRVA, IntPtr))
return EC;
const ulittle32_t *NamePtr = reinterpret_cast<const ulittle32_t *>(IntPtr);
- if (std::error_code EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr))
+ if (Error EC = OwningObject->getRvaPtr(NamePtr[Offset], IntPtr))
return EC;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
- return std::error_code();
+ return Error::success();
}
Result = "";
- return std::error_code();
+ return Error::success();
}
-std::error_code ExportDirectoryEntryRef::isForwarder(bool &Result) const {
- const data_directory *DataEntry;
- if (auto EC = OwningObject->getDataDirectory(COFF::EXPORT_TABLE, DataEntry))
- return EC;
+Error ExportDirectoryEntryRef::isForwarder(bool &Result) const {
+ const data_directory *DataEntry =
+ OwningObject->getDataDirectory(COFF::EXPORT_TABLE);
+ if (!DataEntry)
+ return errorCodeToError(object_error::parse_failed);
uint32_t RVA;
if (auto EC = getExportRVA(RVA))
return EC;
uint32_t Begin = DataEntry->RelativeVirtualAddress;
uint32_t End = DataEntry->RelativeVirtualAddress + DataEntry->Size;
Result = (Begin <= RVA && RVA < End);
- return std::error_code();
+ return Error::success();
}
-std::error_code ExportDirectoryEntryRef::getForwardTo(StringRef &Result) const {
+Error ExportDirectoryEntryRef::getForwardTo(StringRef &Result) const {
uint32_t RVA;
if (auto EC = getExportRVA(RVA))
return EC;
@@ -1547,7 +1532,7 @@ std::error_code ExportDirectoryEntryRef::getForwardTo(StringRef &Result) const {
if (auto EC = OwningObject->getRvaPtr(RVA, IntPtr))
return EC;
Result = StringRef(reinterpret_cast<const char *>(IntPtr));
- return std::error_code();
+ return Error::success();
}
bool ImportedSymbolRef::
@@ -1560,72 +1545,67 @@ void ImportedSymbolRef::moveNext() {
++Index;
}
-std::error_code
-ImportedSymbolRef::getSymbolName(StringRef &Result) const {
+Error ImportedSymbolRef::getSymbolName(StringRef &Result) const {
uint32_t RVA;
if (Entry32) {
// If a symbol is imported only by ordinal, it has no name.
if (Entry32[Index].isOrdinal())
- return std::error_code();
+ return Error::success();
RVA = Entry32[Index].getHintNameRVA();
} else {
if (Entry64[Index].isOrdinal())
- return std::error_code();
+ return Error::success();
RVA = Entry64[Index].getHintNameRVA();
}
uintptr_t IntPtr = 0;
- if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
+ if (Error EC = OwningObject->getRvaPtr(RVA, IntPtr))
return EC;
// +2 because the first two bytes is hint.
Result = StringRef(reinterpret_cast<const char *>(IntPtr + 2));
- return std::error_code();
+ return Error::success();
}
-std::error_code ImportedSymbolRef::isOrdinal(bool &Result) const {
+Error ImportedSymbolRef::isOrdinal(bool &Result) const {
if (Entry32)
Result = Entry32[Index].isOrdinal();
else
Result = Entry64[Index].isOrdinal();
- return std::error_code();
+ return Error::success();
}
-std::error_code ImportedSymbolRef::getHintNameRVA(uint32_t &Result) const {
+Error ImportedSymbolRef::getHintNameRVA(uint32_t &Result) const {
if (Entry32)
Result = Entry32[Index].getHintNameRVA();
else
Result = Entry64[Index].getHintNameRVA();
- return std::error_code();
+ return Error::success();
}
-std::error_code ImportedSymbolRef::getOrdinal(uint16_t &Result) const {
+Error ImportedSymbolRef::getOrdinal(uint16_t &Result) const {
uint32_t RVA;
if (Entry32) {
if (Entry32[Index].isOrdinal()) {
Result = Entry32[Index].getOrdinal();
- return std::error_code();
+ return Error::success();
}
RVA = Entry32[Index].getHintNameRVA();
} else {
if (Entry64[Index].isOrdinal()) {
Result = Entry64[Index].getOrdinal();
- return std::error_code();
+ return Error::success();
}
RVA = Entry64[Index].getHintNameRVA();
}
uintptr_t IntPtr = 0;
- if (std::error_code EC = OwningObject->getRvaPtr(RVA, IntPtr))
+ if (Error EC = OwningObject->getRvaPtr(RVA, IntPtr))
return EC;
Result = *reinterpret_cast<const ulittle16_t *>(IntPtr);
- return std::error_code();
+ return Error::success();
}
Expected<std::unique_ptr<COFFObjectFile>>
ObjectFile::createCOFFObjectFile(MemoryBufferRef Object) {
- std::error_code EC;
- std::unique_ptr<COFFObjectFile> Ret(new COFFObjectFile(Object, EC));
- if (EC)
- return errorCodeToError(EC);
- return std::move(Ret);
+ return COFFObjectFile::create(Object);
}
bool BaseRelocRef::operator==(const BaseRelocRef &Other) const {
@@ -1650,16 +1630,16 @@ void BaseRelocRef::moveNext() {
}
}
-std::error_code BaseRelocRef::getType(uint8_t &Type) const {
+Error BaseRelocRef::getType(uint8_t &Type) const {
auto *Entry = reinterpret_cast<const coff_base_reloc_block_entry *>(Header + 1);
Type = Entry[Index].getType();
- return std::error_code();
+ return Error::success();
}
-std::error_code BaseRelocRef::getRVA(uint32_t &Result) const {
+Error BaseRelocRef::getRVA(uint32_t &Result) const {
auto *Entry = reinterpret_cast<const coff_base_reloc_block_entry *>(Header + 1);
Result = Header->PageRVA + Entry[Index].getOffset();
- return std::error_code();
+ return Error::success();
}
#define RETURN_IF_ERROR(Expr) \
@@ -1823,15 +1803,16 @@ ResourceSectionRef::getContents(const coff_resource_data_entry &Entry) {
Expected<COFFSymbolRef> Sym = Obj->getSymbol(R.SymbolTableIndex);
if (!Sym)
return Sym.takeError();
- const coff_section *Section = nullptr;
// And the symbol's section
- if (std::error_code EC = Obj->getSection(Sym->getSectionNumber(), Section))
- return errorCodeToError(EC);
+ Expected<const coff_section *> Section =
+ Obj->getSection(Sym->getSectionNumber());
+ if (!Section)
+ return Section.takeError();
// Add the initial value of DataRVA to the symbol's offset to find the
// data it points at.
uint64_t Offset = Entry.DataRVA + Sym->getValue();
ArrayRef<uint8_t> Contents;
- if (Error E = Obj->getSectionContents(Section, Contents))
+ if (Error E = Obj->getSectionContents(*Section, Contents))
return std::move(E);
if (Offset + Entry.DataSize > Contents.size())
return createStringError(object_error::parse_failed,
diff --git a/llvm/lib/Object/ELF.cpp b/llvm/lib/Object/ELF.cpp
index f17a6da23d7d..2515695095a1 100644
--- a/llvm/lib/Object/ELF.cpp
+++ b/llvm/lib/Object/ELF.cpp
@@ -145,6 +145,13 @@ StringRef llvm::object::getELFRelocationTypeName(uint32_t Machine,
break;
}
break;
+ case ELF::EM_VE:
+ switch (Type) {
+#include "llvm/BinaryFormat/ELFRelocs/VE.def"
+ default:
+ break;
+ }
+ break;
default:
break;
}
@@ -223,6 +230,9 @@ StringRef llvm::object::getELFSectionTypeName(uint32_t Machine, unsigned Type) {
STRINGIFY_ENUM_CASE(ELF, SHT_MIPS_ABIFLAGS);
}
break;
+ case ELF::EM_RISCV:
+ switch (Type) { STRINGIFY_ENUM_CASE(ELF, SHT_RISCV_ATTRIBUTES); }
+ break;
default:
break;
}
@@ -499,7 +509,6 @@ std::string ELFFile<ELFT>::getDynamicTagAsString(uint64_t Type) const {
template <class ELFT>
Expected<typename ELFT::DynRange> ELFFile<ELFT>::dynamicEntries() const {
ArrayRef<Elf_Dyn> Dyn;
- size_t DynSecSize = 0;
auto ProgramHeadersOrError = program_headers();
if (!ProgramHeadersOrError)
@@ -510,7 +519,6 @@ Expected<typename ELFT::DynRange> ELFFile<ELFT>::dynamicEntries() const {
Dyn = makeArrayRef(
reinterpret_cast<const Elf_Dyn *>(base() + Phdr.p_offset),
Phdr.p_filesz / sizeof(Elf_Dyn));
- DynSecSize = Phdr.p_filesz;
break;
}
}
@@ -529,7 +537,6 @@ Expected<typename ELFT::DynRange> ELFFile<ELFT>::dynamicEntries() const {
if (!DynOrError)
return DynOrError.takeError();
Dyn = *DynOrError;
- DynSecSize = Sec.sh_size;
break;
}
}
@@ -542,10 +549,6 @@ Expected<typename ELFT::DynRange> ELFFile<ELFT>::dynamicEntries() const {
// TODO: this error is untested.
return createError("invalid empty dynamic section");
- if (DynSecSize % sizeof(Elf_Dyn) != 0)
- // TODO: this error is untested.
- return createError("malformed dynamic section");
-
if (Dyn.back().d_tag != ELF::DT_NULL)
// TODO: this error is untested.
return createError("dynamic sections must be DT_NULL terminated");
@@ -580,7 +583,18 @@ Expected<const uint8_t *> ELFFile<ELFT>::toMappedAddr(uint64_t VAddr) const {
if (Delta >= Phdr.p_filesz)
return createError("virtual address is not in any segment: 0x" +
Twine::utohexstr(VAddr));
- return base() + Phdr.p_offset + Delta;
+
+ uint64_t Offset = Phdr.p_offset + Delta;
+ if (Offset >= getBufSize())
+ return createError("can't map virtual address 0x" +
+ Twine::utohexstr(VAddr) + " to the segment with index " +
+ Twine(&Phdr - (*ProgramHeadersOrError).data() + 1) +
+ ": the segment ends at 0x" +
+ Twine::utohexstr(Phdr.p_offset + Phdr.p_filesz) +
+ ", which is greater than the file size (0x" +
+ Twine::utohexstr(getBufSize()) + ")");
+
+ return base() + Offset;
}
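
The added check rejects segments whose p_offset/p_filesz place the computed offset past the end of the buffer. A small standalone sketch of the same mapping rule with plain integers (not the templated ELFFile code):

    #include <cstdint>
    #include <optional>

    static std::optional<uint64_t> mapVAddr(uint64_t VAddr, uint64_t PVAddr,
                                            uint64_t POffset, uint64_t PFilesz,
                                            uint64_t BufSize) {
      if (VAddr < PVAddr || VAddr - PVAddr >= PFilesz)
        return std::nullopt;            // not covered by this segment
      uint64_t Offset = POffset + (VAddr - PVAddr);
      if (Offset >= BufSize)
        return std::nullopt;            // segment claims bytes past EOF
      return Offset;
    }
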
template class llvm::object::ELFFile<ELF32LE>;
diff --git a/llvm/lib/Object/ELFObjectFile.cpp b/llvm/lib/Object/ELFObjectFile.cpp
index bf6ffd6c37b9..c919d25855d2 100644
--- a/llvm/lib/Object/ELFObjectFile.cpp
+++ b/llvm/lib/Object/ELFObjectFile.cpp
@@ -23,6 +23,8 @@
#include "llvm/Support/Endian.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/RISCVAttributeParser.h"
+#include "llvm/Support/RISCVAttributes.h"
#include "llvm/Support/TargetRegistry.h"
#include <algorithm>
#include <cstddef>
@@ -157,17 +159,21 @@ SubtargetFeatures ELFObjectFileBase::getMIPSFeatures() const {
SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
SubtargetFeatures Features;
ARMAttributeParser Attributes;
- if (Error E = getBuildAttributes(Attributes))
+ if (Error E = getBuildAttributes(Attributes)) {
+ consumeError(std::move(E));
return SubtargetFeatures();
+ }
// both ARMv7-M and R have to support thumb hardware div
bool isV7 = false;
- if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch))
- isV7 = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch)
- == ARMBuildAttrs::v7;
-
- if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch_profile)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile)) {
+ Optional<unsigned> Attr =
+ Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch);
+ if (Attr.hasValue())
+ isV7 = Attr.getValue() == ARMBuildAttrs::v7;
+
+ Attr = Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch_profile);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
case ARMBuildAttrs::ApplicationProfile:
Features.AddFeature("aclass");
break;
@@ -184,8 +190,9 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
}
}
- if (Attributes.hasAttribute(ARMBuildAttrs::THUMB_ISA_use)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use)) {
+ Attr = Attributes.getAttributeValue(ARMBuildAttrs::THUMB_ISA_use);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
default:
break;
case ARMBuildAttrs::Not_Allowed:
@@ -198,8 +205,9 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
}
}
- if (Attributes.hasAttribute(ARMBuildAttrs::FP_arch)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::FP_arch)) {
+ Attr = Attributes.getAttributeValue(ARMBuildAttrs::FP_arch);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
default:
break;
case ARMBuildAttrs::Not_Allowed:
@@ -221,8 +229,9 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
}
}
- if (Attributes.hasAttribute(ARMBuildAttrs::Advanced_SIMD_arch)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::Advanced_SIMD_arch)) {
+ Attr = Attributes.getAttributeValue(ARMBuildAttrs::Advanced_SIMD_arch);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
default:
break;
case ARMBuildAttrs::Not_Allowed:
@@ -239,8 +248,9 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
}
}
- if (Attributes.hasAttribute(ARMBuildAttrs::MVE_arch)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::MVE_arch)) {
+ Attr = Attributes.getAttributeValue(ARMBuildAttrs::MVE_arch);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
default:
break;
case ARMBuildAttrs::Not_Allowed:
@@ -257,8 +267,9 @@ SubtargetFeatures ELFObjectFileBase::getARMFeatures() const {
}
}
- if (Attributes.hasAttribute(ARMBuildAttrs::DIV_use)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::DIV_use)) {
+ Attr = Attributes.getAttributeValue(ARMBuildAttrs::DIV_use);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
default:
break;
case ARMBuildAttrs::DisallowDIV:
@@ -283,6 +294,51 @@ SubtargetFeatures ELFObjectFileBase::getRISCVFeatures() const {
Features.AddFeature("c");
}
+ // Add features according to the ELF attribute section.
+ // If there are any unrecognized features, ignore them.
+ RISCVAttributeParser Attributes;
+ if (Error E = getBuildAttributes(Attributes)) {
+ // TODO Propagate Error.
+ consumeError(std::move(E));
+ return Features; // Keep "c" feature if there is one in PlatformFlags.
+ }
+
+ Optional<StringRef> Attr = Attributes.getAttributeString(RISCVAttrs::ARCH);
+ if (Attr.hasValue()) {
+ // The Arch pattern is [rv32|rv64][i|e]version(_[m|a|f|d|c]version)*
+ // Version string pattern is (major)p(minor). Major and minor are optional.
+ // For example, a version number could be 2p0, 2, or p92.
+ StringRef Arch = Attr.getValue();
+ if (Arch.consume_front("rv32"))
+ Features.AddFeature("64bit", false);
+ else if (Arch.consume_front("rv64"))
+ Features.AddFeature("64bit");
+
+ while (!Arch.empty()) {
+ switch (Arch[0]) {
+ default:
+ break; // Ignore unexpected features.
+ case 'i':
+ Features.AddFeature("e", false);
+ break;
+ case 'd':
+ Features.AddFeature("f"); // D-ext will imply F-ext.
+ LLVM_FALLTHROUGH;
+ case 'e':
+ case 'm':
+ case 'a':
+ case 'f':
+ case 'c':
+ Features.AddFeature(Arch.take_front());
+ break;
+ }
+
+ // FIXME: Handle version numbers.
+ Arch = Arch.drop_until([](char c) { return c == '_' || c == '\0'; });
+ Arch = Arch.drop_while([](char c) { return c == '_'; });
+ }
+ }
+
return Features;
}
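
A standalone sketch of the same arch-string walk using the StringRef helpers the patch relies on (consume_front, drop_until, drop_while). It only collects the single-letter extensions and skips version suffixes, which is a simplification of the feature mapping above.

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/ADT/StringRef.h"

    using namespace llvm;

    // Walks a plain arch string such as "rv64i2p0_m2p0_a2p0_c2p0".
    static SmallVector<char, 8> scanRISCVArch(StringRef Arch) {
      SmallVector<char, 8> Exts;
      if (!Arch.consume_front("rv32") && !Arch.consume_front("rv64"))
        return Exts; // not a RISC-V arch string
      while (!Arch.empty()) {
        char C = Arch.front();
        if (StringRef("iemafdc").contains(C))
          Exts.push_back(C);
        // Drop the version suffix ("2p0") up to the next '_' separator, then
        // the separator itself.
        Arch = Arch.drop_until([](char Ch) { return Ch == '_'; });
        Arch = Arch.drop_while([](char Ch) { return Ch == '_'; });
      }
      return Exts;
    }
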
@@ -305,8 +361,11 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const {
return;
ARMAttributeParser Attributes;
- if (Error E = getBuildAttributes(Attributes))
+ if (Error E = getBuildAttributes(Attributes)) {
+ // TODO Propagate Error.
+ consumeError(std::move(E));
return;
+ }
std::string Triple;
// Default to ARM, but use the triple if it's been set.
@@ -315,8 +374,10 @@ void ELFObjectFileBase::setARMSubArch(Triple &TheTriple) const {
else
Triple = "arm";
- if (Attributes.hasAttribute(ARMBuildAttrs::CPU_arch)) {
- switch(Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch)) {
+ Optional<unsigned> Attr =
+ Attributes.getAttributeValue(ARMBuildAttrs::CPU_arch);
+ if (Attr.hasValue()) {
+ switch (Attr.getValue()) {
case ARMBuildAttrs::v4:
Triple += "v4";
break;
diff --git a/llvm/lib/Object/Error.cpp b/llvm/lib/Object/Error.cpp
index 010c5b42dac2..bc75bc6c0445 100644
--- a/llvm/lib/Object/Error.cpp
+++ b/llvm/lib/Object/Error.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/Error.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/ManagedStatic.h"
@@ -60,9 +61,10 @@ void BinaryError::anchor() {}
char BinaryError::ID = 0;
char GenericBinaryError::ID = 0;
-GenericBinaryError::GenericBinaryError(Twine Msg) : Msg(Msg.str()) {}
+GenericBinaryError::GenericBinaryError(const Twine &Msg) : Msg(Msg.str()) {}
-GenericBinaryError::GenericBinaryError(Twine Msg, object_error ECOverride)
+GenericBinaryError::GenericBinaryError(const Twine &Msg,
+ object_error ECOverride)
: Msg(Msg.str()) {
setErrorCode(make_error_code(ECOverride));
}
diff --git a/llvm/lib/Object/IRObjectFile.cpp b/llvm/lib/Object/IRObjectFile.cpp
index 636f1521262f..befba5d57127 100644
--- a/llvm/lib/Object/IRObjectFile.cpp
+++ b/llvm/lib/Object/IRObjectFile.cpp
@@ -47,7 +47,7 @@ Error IRObjectFile::printSymbolName(raw_ostream &OS, DataRefImpl Symb) const {
return Error::success();
}
-uint32_t IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
+Expected<uint32_t> IRObjectFile::getSymbolFlags(DataRefImpl Symb) const {
return SymTab.getSymbolFlags(getSym(Symb));
}
@@ -94,6 +94,7 @@ IRObjectFile::findBitcodeInMemBuffer(MemoryBufferRef Object) {
return Object;
case file_magic::elf_relocatable:
case file_magic::macho_object:
+ case file_magic::wasm_object:
case file_magic::coff_object: {
Expected<std::unique_ptr<ObjectFile>> ObjFile =
ObjectFile::createObjectFile(Object, Type);
diff --git a/llvm/lib/Object/IRSymtab.cpp b/llvm/lib/Object/IRSymtab.cpp
index e4282b9d6bd3..e39cb732add1 100644
--- a/llvm/lib/Object/IRSymtab.cpp
+++ b/llvm/lib/Object/IRSymtab.cpp
@@ -181,7 +181,7 @@ Expected<int> Builder::getComdatIndex(const Comdat *C, const Module *M) {
llvm::raw_string_ostream OS(Name);
Mang.getNameWithPrefix(OS, GV, false);
} else {
- Name = C->getName();
+ Name = std::string(C->getName());
}
storage::Comdat Comdat;
@@ -264,9 +264,13 @@ Error Builder::addSymbol(const ModuleSymbolTable &Msymtab,
Sym.Flags |= unsigned(GV->getVisibility()) << storage::Symbol::FB_visibility;
if (Flags & object::BasicSymbolRef::SF_Common) {
+ auto *GVar = dyn_cast<GlobalVariable>(GV);
+ if (!GVar)
+ return make_error<StringError>("Only variables can have common linkage!",
+ inconvertibleErrorCode());
Uncommon().CommonSize = GV->getParent()->getDataLayout().getTypeAllocSize(
GV->getType()->getElementType());
- Uncommon().CommonAlign = GV->getAlignment();
+ Uncommon().CommonAlign = GVar->getAlignment();
}
const GlobalObject *Base = GV->getBaseObject();
diff --git a/llvm/lib/Object/MachOObjectFile.cpp b/llvm/lib/Object/MachOObjectFile.cpp
index 8540b7ab03cd..4d85e6f40ec4 100644
--- a/llvm/lib/Object/MachOObjectFile.cpp
+++ b/llvm/lib/Object/MachOObjectFile.cpp
@@ -1804,8 +1804,8 @@ Expected<uint64_t> MachOObjectFile::getSymbolAddress(DataRefImpl Sym) const {
}
uint32_t MachOObjectFile::getSymbolAlignment(DataRefImpl DRI) const {
- uint32_t flags = getSymbolFlags(DRI);
- if (flags & SymbolRef::SF_Common) {
+ uint32_t Flags = cantFail(getSymbolFlags(DRI));
+ if (Flags & SymbolRef::SF_Common) {
MachO::nlist_base Entry = getSymbolTableEntryBase(*this, DRI);
return 1 << MachO::GET_COMM_ALIGN(Entry.n_desc);
}
@@ -1840,7 +1840,7 @@ MachOObjectFile::getSymbolType(DataRefImpl Symb) const {
return SymbolRef::ST_Other;
}
-uint32_t MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
+Expected<uint32_t> MachOObjectFile::getSymbolFlags(DataRefImpl DRI) const {
MachO::nlist_base Entry = getSymbolTableEntryBase(*this, DRI);
uint8_t MachOType = Entry.n_type;
@@ -2030,6 +2030,11 @@ bool MachOObjectFile::isSectionBSS(DataRefImpl Sec) const {
SectionType == MachO::S_GB_ZEROFILL);
}
+bool MachOObjectFile::isDebugSection(StringRef SectionName) const {
+ return SectionName.startswith("__debug") ||
+ SectionName.startswith("__zdebug") || SectionName == "__gdb_index";
+}
+
unsigned MachOObjectFile::getSectionID(SectionRef Sec) const {
return Sec.getRawDataRefImpl().d.a;
}
@@ -3214,6 +3219,7 @@ void MachORebaseEntry::moveNext() {
SegmentOffset) << "\n");
break;
case MachO::REBASE_OPCODE_ADD_ADDR_IMM_SCALED:
+ SegmentOffset += ImmValue * PointerSize;
error = O->RebaseEntryCheckSegAndOffsets(SegmentIndex, SegmentOffset,
PointerSize);
if (error) {
@@ -3223,18 +3229,6 @@ void MachORebaseEntry::moveNext() {
moveToEnd();
return;
}
- SegmentOffset += ImmValue * PointerSize;
- error = O->RebaseEntryCheckSegAndOffsets(SegmentIndex, SegmentOffset,
- PointerSize);
- if (error) {
- *E =
- malformedError("for REBASE_OPCODE_ADD_ADDR_IMM_SCALED "
- " (after adding immediate times the pointer size) " +
- Twine(error) + " for opcode at: 0x" +
- Twine::utohexstr(OpcodeStart - Opcodes.begin()));
- moveToEnd();
- return;
- }
DEBUG_WITH_TYPE("mach-o-rebase",
dbgs() << "REBASE_OPCODE_ADD_ADDR_IMM_SCALED: "
<< format("SegmentOffset=0x%06X",
@@ -3803,15 +3797,6 @@ void MachOBindEntry::moveNext() {
moveToEnd();
return;
}
- error = O->BindEntryCheckSegAndOffsets(SegmentIndex, SegmentOffset,
- PointerSize);
- if (error) {
- *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED " +
- Twine(error) + " for opcode at: 0x" +
- Twine::utohexstr(OpcodeStart - Opcodes.begin()));
- moveToEnd();
- return;
- }
if (SymbolName == StringRef()) {
*E = malformedError(
"for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
@@ -3835,11 +3820,9 @@ void MachOBindEntry::moveNext() {
error = O->BindEntryCheckSegAndOffsets(SegmentIndex, SegmentOffset +
AdvanceAmount, PointerSize);
if (error) {
- *E =
- malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED "
- " (after adding immediate times the pointer size) " +
- Twine(error) + " for opcode at: 0x" +
- Twine::utohexstr(OpcodeStart - Opcodes.begin()));
+ *E = malformedError("for BIND_OPCODE_DO_BIND_ADD_ADDR_IMM_SCALED " +
+ Twine(error) + " for opcode at: 0x" +
+ Twine::utohexstr(OpcodeStart - Opcodes.begin()));
moveToEnd();
return;
}
diff --git a/llvm/lib/Object/ModuleSymbolTable.cpp b/llvm/lib/Object/ModuleSymbolTable.cpp
index 17ac4afda2d6..7f3055b5dcfa 100644
--- a/llvm/lib/Object/ModuleSymbolTable.cpp
+++ b/llvm/lib/Object/ModuleSymbolTable.cpp
@@ -23,6 +23,7 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/InlineAsm.h"
#include "llvm/IR/Module.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCContext.h"
@@ -63,7 +64,8 @@ void ModuleSymbolTable::addModule(Module *M) {
SymTab.push_back(&GV);
CollectAsmSymbols(*M, [this](StringRef Name, BasicSymbolRef::Flags Flags) {
- SymTab.push_back(new (AsmSymbols.Allocate()) AsmSymbol(Name, Flags));
+ SymTab.push_back(new (AsmSymbols.Allocate())
+ AsmSymbol(std::string(Name), Flags));
});
}
@@ -115,6 +117,10 @@ initializeRecordStreamer(const Module &M,
if (!TAP)
return;
+  // Module-level inline asm is assumed to use AT&T syntax (see
+ // AsmPrinter::doInitialization()).
+ Parser->setAssemblerDialect(InlineAsm::AD_ATT);
+
Parser->setTargetParser(*TAP);
if (Parser->Run(false))
return;
diff --git a/llvm/lib/Object/ObjectFile.cpp b/llvm/lib/Object/ObjectFile.cpp
index 098b3d8f8dd0..61b36ea0f448 100644
--- a/llvm/lib/Object/ObjectFile.cpp
+++ b/llvm/lib/Object/ObjectFile.cpp
@@ -54,12 +54,15 @@ bool SectionRef::containsSymbol(SymbolRef S) const {
return *this == **SymSec;
}
-uint64_t ObjectFile::getSymbolValue(DataRefImpl Ref) const {
- uint32_t Flags = getSymbolFlags(Ref);
- if (Flags & SymbolRef::SF_Undefined)
- return 0;
- if (Flags & SymbolRef::SF_Common)
- return getCommonSymbolSize(Ref);
+Expected<uint64_t> ObjectFile::getSymbolValue(DataRefImpl Ref) const {
+ if (Expected<uint32_t> FlagsOrErr = getSymbolFlags(Ref)) {
+ if (*FlagsOrErr & SymbolRef::SF_Undefined)
+ return 0;
+ if (*FlagsOrErr & SymbolRef::SF_Common)
+ return getCommonSymbolSize(Ref);
+ } else
+ // TODO: Test this error.
+ return FlagsOrErr.takeError();
return getSymbolValueImpl(Ref);
}
@@ -91,6 +94,10 @@ bool ObjectFile::isBerkeleyData(DataRefImpl Sec) const {
return isSectionData(Sec);
}
+bool ObjectFile::isDebugSection(StringRef SectionName) const {
+ return false;
+}
+
Expected<section_iterator>
ObjectFile::getRelocatedSection(DataRefImpl Sec) const {
return section_iterator(SectionRef(Sec, this));
@@ -108,14 +115,17 @@ Triple ObjectFile::makeTriple() const {
setARMSubArch(TheTriple);
// TheTriple defaults to ELF, and COFF doesn't have an environment:
- // the best we can do here is indicate that it is mach-o.
- if (isMachO())
+ // something we can do here is indicate that it is mach-o.
+ if (isMachO()) {
TheTriple.setObjectFormat(Triple::MachO);
-
- if (isCOFF()) {
+ } else if (isCOFF()) {
const auto COFFObj = cast<COFFObjectFile>(this);
if (COFFObj->getArch() == Triple::thumb)
TheTriple.setTriple("thumbv7-windows");
+ } else if (isXCOFF()) {
+ // XCOFF implies AIX.
+ TheTriple.setOS(Triple::AIX);
+ TheTriple.setObjectFormat(Triple::XCOFF);
}
return TheTriple;
diff --git a/llvm/lib/Object/RecordStreamer.cpp b/llvm/lib/Object/RecordStreamer.cpp
index f39a6c28ed50..b2f973eff361 100644
--- a/llvm/lib/Object/RecordStreamer.cpp
+++ b/llvm/lib/Object/RecordStreamer.cpp
@@ -81,22 +81,22 @@ RecordStreamer::const_iterator RecordStreamer::begin() {
RecordStreamer::const_iterator RecordStreamer::end() { return Symbols.end(); }
-void RecordStreamer::EmitInstruction(const MCInst &Inst,
+void RecordStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- MCStreamer::EmitInstruction(Inst, STI);
+ MCStreamer::emitInstruction(Inst, STI);
}
-void RecordStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
- MCStreamer::EmitLabel(Symbol);
+void RecordStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCStreamer::emitLabel(Symbol);
markDefined(*Symbol);
}
-void RecordStreamer::EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
+void RecordStreamer::emitAssignment(MCSymbol *Symbol, const MCExpr *Value) {
markDefined(*Symbol);
- MCStreamer::EmitAssignment(Symbol, Value);
+ MCStreamer::emitAssignment(Symbol, Value);
}
-bool RecordStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
+bool RecordStreamer::emitSymbolAttribute(MCSymbol *Symbol,
MCSymbolAttr Attribute) {
if (Attribute == MCSA_Global || Attribute == MCSA_Weak)
markGlobal(*Symbol, Attribute);
@@ -105,13 +105,13 @@ bool RecordStreamer::EmitSymbolAttribute(MCSymbol *Symbol,
return true;
}
-void RecordStreamer::EmitZerofill(MCSection *Section, MCSymbol *Symbol,
+void RecordStreamer::emitZerofill(MCSection *Section, MCSymbol *Symbol,
uint64_t Size, unsigned ByteAlignment,
SMLoc Loc) {
markDefined(*Symbol);
}
-void RecordStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+void RecordStreamer::emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) {
markDefined(*Symbol);
}
@@ -224,9 +224,9 @@ void RecordStreamer::flushSymverDirectives() {
if (IsDefined)
markDefined(*Alias);
// Don't use EmitAssignment override as it always marks alias as defined.
- MCStreamer::EmitAssignment(Alias, Value);
+ MCStreamer::emitAssignment(Alias, Value);
if (Attr != MCSA_Invalid)
- EmitSymbolAttribute(Alias, Attr);
+ emitSymbolAttribute(Alias, Attr);
}
}
}
diff --git a/llvm/lib/Object/RecordStreamer.h b/llvm/lib/Object/RecordStreamer.h
index c8b75bcc6d1d..99d15f790a15 100644
--- a/llvm/lib/Object/RecordStreamer.h
+++ b/llvm/lib/Object/RecordStreamer.h
@@ -13,13 +13,12 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/MC/MCDirectives.h"
#include "llvm/MC/MCStreamer.h"
-#include "llvm/MC/MCSymbol.h"
#include "llvm/Support/SMLoc.h"
#include <vector>
namespace llvm {
-class GlobalValue;
+class MCSymbol;
class Module;
class RecordStreamer : public MCStreamer {
@@ -46,13 +45,13 @@ private:
public:
RecordStreamer(MCContext &Context, const Module &M);
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
- void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
- void EmitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
- bool EmitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
- void EmitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
+ void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+ void emitAssignment(MCSymbol *Symbol, const MCExpr *Value) override;
+ bool emitSymbolAttribute(MCSymbol *Symbol, MCSymbolAttr Attribute) override;
+ void emitZerofill(MCSection *Section, MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment, SMLoc Loc = SMLoc()) override;
- void EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
+ void emitCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment) override;
// Ignore COFF-specific directives; we do not need any information from them,
diff --git a/llvm/lib/Object/RelocationResolver.cpp b/llvm/lib/Object/RelocationResolver.cpp
index 31478be7899e..3f3f79b0f4ff 100644
--- a/llvm/lib/Object/RelocationResolver.cpp
+++ b/llvm/lib/Object/RelocationResolver.cpp
@@ -127,6 +127,27 @@ static uint64_t resolveMips64(RelocationRef R, uint64_t S, uint64_t A) {
}
}
+static bool supportsMSP430(uint64_t Type) {
+ switch (Type) {
+ case ELF::R_MSP430_32:
+ case ELF::R_MSP430_16_BYTE:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static uint64_t resolveMSP430(RelocationRef R, uint64_t S, uint64_t A) {
+ switch (R.getType()) {
+ case ELF::R_MSP430_32:
+ return (S + getELFAddend(R)) & 0xFFFFFFFF;
+ case ELF::R_MSP430_16_BYTE:
+ return (S + getELFAddend(R)) & 0xFFFF;
+ default:
+ llvm_unreachable("Invalid relocation type");
+ }
+}
+
static bool supportsPPC64(uint64_t Type) {
switch (Type) {
case ELF::R_PPC64_ADDR32:
@@ -498,12 +519,24 @@ static bool supportsWasm32(uint64_t Type) {
case wasm::R_WASM_FUNCTION_OFFSET_I32:
case wasm::R_WASM_SECTION_OFFSET_I32:
case wasm::R_WASM_EVENT_INDEX_LEB:
+ case wasm::R_WASM_GLOBAL_INDEX_I32:
return true;
default:
return false;
}
}
+static bool supportsWasm64(uint64_t Type) {
+ switch (Type) {
+ case wasm::R_WASM_MEMORY_ADDR_LEB64:
+ case wasm::R_WASM_MEMORY_ADDR_SLEB64:
+ case wasm::R_WASM_MEMORY_ADDR_I64:
+ return true;
+ default:
+ return supportsWasm32(Type);
+ }
+}
+
static uint64_t resolveWasm32(RelocationRef R, uint64_t S, uint64_t A) {
switch (R.getType()) {
case wasm::R_WASM_FUNCTION_INDEX_LEB:
@@ -517,6 +550,7 @@ static uint64_t resolveWasm32(RelocationRef R, uint64_t S, uint64_t A) {
case wasm::R_WASM_FUNCTION_OFFSET_I32:
case wasm::R_WASM_SECTION_OFFSET_I32:
case wasm::R_WASM_EVENT_INDEX_LEB:
+ case wasm::R_WASM_GLOBAL_INDEX_I32:
// For wasm section, its offset at 0 -- ignoring Value
return A;
default:
@@ -524,6 +558,18 @@ static uint64_t resolveWasm32(RelocationRef R, uint64_t S, uint64_t A) {
}
}
+static uint64_t resolveWasm64(RelocationRef R, uint64_t S, uint64_t A) {
+ switch (R.getType()) {
+ case wasm::R_WASM_MEMORY_ADDR_LEB64:
+ case wasm::R_WASM_MEMORY_ADDR_SLEB64:
+ case wasm::R_WASM_MEMORY_ADDR_I64:
+ // For wasm section, its offset at 0 -- ignoring Value
+ return A;
+ default:
+ return resolveWasm32(R, S, A);
+ }
+}
+
std::pair<bool (*)(uint64_t), RelocationResolver>
getRelocationResolver(const ObjectFile &Obj) {
if (Obj.isCOFF()) {
@@ -589,6 +635,8 @@ getRelocationResolver(const ObjectFile &Obj) {
case Triple::mipsel:
case Triple::mips:
return {supportsMips32, resolveMips32};
+ case Triple::msp430:
+ return {supportsMSP430, resolveMSP430};
case Triple::sparc:
return {supportsSparc32, resolveSparc32};
case Triple::hexagon:
@@ -605,6 +653,8 @@ getRelocationResolver(const ObjectFile &Obj) {
} else if (Obj.isWasm()) {
if (Obj.getArch() == Triple::wasm32)
return {supportsWasm32, resolveWasm32};
+ if (Obj.getArch() == Triple::wasm64)
+ return {supportsWasm64, resolveWasm64};
return {nullptr, nullptr};
}
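
An assumed caller-side sketch of the resolver table extended above (MSP430 and wasm64): getRelocationResolver() hands back a supports/resolve pair, and SymValue/Addend are presumed to be computed by the caller as in the diff's own resolvers.

    #include "llvm/Object/ObjectFile.h"
    #include "llvm/Object/RelocationResolver.h"

    using namespace llvm;
    using namespace llvm::object;

    static uint64_t resolveOne(const ObjectFile &Obj, RelocationRef Reloc,
                               uint64_t SymValue, uint64_t Addend) {
      auto SupportsAndResolver = getRelocationResolver(Obj);
      bool (*Supports)(uint64_t) = SupportsAndResolver.first;
      RelocationResolver Resolve = SupportsAndResolver.second;
      if (Supports && Supports(Reloc.getType()))
        return Resolve(Reloc, SymValue, Addend);
      return Addend; // unsupported relocation: leave the addend as-is
    }
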
diff --git a/llvm/lib/Object/SymbolSize.cpp b/llvm/lib/Object/SymbolSize.cpp
index bdf4dc55cf3c..84eed4d169d3 100644
--- a/llvm/lib/Object/SymbolSize.cpp
+++ b/llvm/lib/Object/SymbolSize.cpp
@@ -11,6 +11,7 @@
#include "llvm/Object/COFF.h"
#include "llvm/Object/ELFObjectFile.h"
#include "llvm/Object/MachO.h"
+#include "llvm/Object/Wasm.h"
using namespace llvm;
using namespace object;
@@ -27,12 +28,17 @@ int llvm::object::compareAddress(const SymEntry *A, const SymEntry *B) {
static unsigned getSectionID(const ObjectFile &O, SectionRef Sec) {
if (auto *M = dyn_cast<MachOObjectFile>(&O))
return M->getSectionID(Sec);
+ if (isa<WasmObjectFile>(&O))
+ return Sec.getIndex();
+
return cast<COFFObjectFile>(O).getSectionID(Sec);
}
static unsigned getSymbolSectionID(const ObjectFile &O, SymbolRef Sym) {
if (auto *M = dyn_cast<MachOObjectFile>(&O))
return M->getSymbolSectionID(Sym);
+ if (const auto *M = dyn_cast<WasmObjectFile>(&O))
+ return M->getSymbolSectionId(Sym);
return cast<COFFObjectFile>(O).getSymbolSectionID(Sym);
}
@@ -55,8 +61,11 @@ llvm::object::computeSymbolSizes(const ObjectFile &O) {
unsigned SymNum = 0;
for (symbol_iterator I = O.symbol_begin(), E = O.symbol_end(); I != E; ++I) {
SymbolRef Sym = *I;
- uint64_t Value = Sym.getValue();
- Addresses.push_back({I, Value, SymNum, getSymbolSectionID(O, Sym)});
+ Expected<uint64_t> ValueOrErr = Sym.getValue();
+ if (!ValueOrErr)
+ // TODO: Actually report errors helpfully.
+ report_fatal_error(ValueOrErr.takeError());
+ Addresses.push_back({I, *ValueOrErr, SymNum, getSymbolSectionID(O, Sym)});
++SymNum;
}
for (SectionRef Sec : O.sections()) {
diff --git a/llvm/lib/Object/TapiFile.cpp b/llvm/lib/Object/TapiFile.cpp
index c409bd8e5995..7a361990ba5d 100644
--- a/llvm/lib/Object/TapiFile.cpp
+++ b/llvm/lib/Object/TapiFile.cpp
@@ -40,7 +40,7 @@ static uint32_t getFlags(const Symbol *Sym) {
TapiFile::TapiFile(MemoryBufferRef Source, const InterfaceFile &interface,
Architecture Arch)
- : SymbolicFile(ID_TapiFile, Source) {
+ : SymbolicFile(ID_TapiFile, Source), Arch(Arch) {
for (const auto *Symbol : interface.symbols()) {
if (!Symbol->getArchitectures().has(Arch))
continue;
@@ -75,30 +75,28 @@ TapiFile::TapiFile(MemoryBufferRef Source, const InterfaceFile &interface,
TapiFile::~TapiFile() = default;
-void TapiFile::moveSymbolNext(DataRefImpl &DRI) const {
- const auto *Sym = reinterpret_cast<const Symbol *>(DRI.p);
- DRI.p = reinterpret_cast<uintptr_t>(++Sym);
-}
+void TapiFile::moveSymbolNext(DataRefImpl &DRI) const { DRI.d.a++; }
Error TapiFile::printSymbolName(raw_ostream &OS, DataRefImpl DRI) const {
- const auto *Sym = reinterpret_cast<const Symbol *>(DRI.p);
- OS << Sym->Prefix << Sym->Name;
+ assert(DRI.d.a < Symbols.size() && "Attempt to access symbol out of bounds");
+ const Symbol &Sym = Symbols[DRI.d.a];
+ OS << Sym.Prefix << Sym.Name;
return Error::success();
}
-uint32_t TapiFile::getSymbolFlags(DataRefImpl DRI) const {
- const auto *Sym = reinterpret_cast<const Symbol *>(DRI.p);
- return Sym->Flags;
+Expected<uint32_t> TapiFile::getSymbolFlags(DataRefImpl DRI) const {
+ assert(DRI.d.a < Symbols.size() && "Attempt to access symbol out of bounds");
+ return Symbols[DRI.d.a].Flags;
}
basic_symbol_iterator TapiFile::symbol_begin() const {
DataRefImpl DRI;
- DRI.p = reinterpret_cast<uintptr_t>(&*Symbols.begin());
+ DRI.d.a = 0;
return BasicSymbolRef{DRI, this};
}
basic_symbol_iterator TapiFile::symbol_end() const {
DataRefImpl DRI;
- DRI.p = reinterpret_cast<uintptr_t>(&*Symbols.end());
+ DRI.d.a = Symbols.size();
return BasicSymbolRef{DRI, this};
}
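
A sketch of a generic symbol walk that copes with getSymbolFlags()/getFlags() now returning Expected<uint32_t>; it should work for any SymbolicFile, including the index-based TapiFile above. The error handling here simply skips problematic symbols.

    #include "llvm/Object/SymbolicFile.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/raw_ostream.h"

    using namespace llvm;
    using namespace llvm::object;

    static void listDefinedSymbols(const SymbolicFile &File) {
      for (const BasicSymbolRef &Sym : File.symbols()) {
        Expected<uint32_t> FlagsOrErr = Sym.getFlags();
        if (!FlagsOrErr) {
          consumeError(FlagsOrErr.takeError()); // skip symbols we cannot classify
          continue;
        }
        if (*FlagsOrErr & BasicSymbolRef::SF_Undefined)
          continue;
        if (Error E = Sym.printName(outs()))
          consumeError(std::move(E));
        outs() << "\n";
      }
    }
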
diff --git a/llvm/lib/Object/TapiUniversal.cpp b/llvm/lib/Object/TapiUniversal.cpp
index b3273e345a61..48cb949cb6f4 100644
--- a/llvm/lib/Object/TapiUniversal.cpp
+++ b/llvm/lib/Object/TapiUniversal.cpp
@@ -22,7 +22,7 @@ using namespace object;
TapiUniversal::TapiUniversal(MemoryBufferRef Source, Error &Err)
: Binary(ID_TapiUniversal, Source) {
- auto Result = TextAPIReader::get(Source);
+ Expected<std::unique_ptr<InterfaceFile>> Result = TextAPIReader::get(Source);
ErrorAsOutParameter ErrAsOuParam(&Err);
if (!Result) {
Err = Result.takeError();
@@ -30,9 +30,16 @@ TapiUniversal::TapiUniversal(MemoryBufferRef Source, Error &Err)
}
ParsedFile = std::move(Result.get());
- auto Archs = ParsedFile->getArchitectures();
- for (auto Arch : Archs)
- Architectures.emplace_back(Arch);
+ auto FlattenObjectInfo = [this](const auto &File) {
+ StringRef Name = File->getInstallName();
+ for (const Architecture Arch : File->getArchitectures())
+ Libraries.emplace_back(Library({Name, Arch}));
+ };
+
+ FlattenObjectInfo(ParsedFile);
+ // Get inlined documents from tapi file.
+ for (const std::shared_ptr<InterfaceFile> &File : ParsedFile->documents())
+ FlattenObjectInfo(File);
}
TapiUniversal::~TapiUniversal() = default;
@@ -41,7 +48,7 @@ Expected<std::unique_ptr<TapiFile>>
TapiUniversal::ObjectForArch::getAsObjectFile() const {
return std::unique_ptr<TapiFile>(new TapiFile(Parent->getMemoryBufferRef(),
*Parent->ParsedFile.get(),
- Parent->Architectures[Index]));
+ Parent->Libraries[Index].Arch));
}
Expected<std::unique_ptr<TapiUniversal>>
diff --git a/llvm/lib/Object/WasmObjectFile.cpp b/llvm/lib/Object/WasmObjectFile.cpp
index ab8918ce1919..bb2e81d64047 100644
--- a/llvm/lib/Object/WasmObjectFile.cpp
+++ b/llvm/lib/Object/WasmObjectFile.cpp
@@ -12,6 +12,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/Wasm.h"
#include "llvm/MC/SubtargetFeature.h"
@@ -155,6 +156,10 @@ static int64_t readVarint64(WasmObjectFile::ReadContext &Ctx) {
return readLEB128(Ctx);
}
+static uint64_t readVaruint64(WasmObjectFile::ReadContext &Ctx) {
+ return readULEB128(Ctx);
+}
+
static uint8_t readOpcode(WasmObjectFile::ReadContext &Ctx) {
return readUint8(Ctx);
}
@@ -179,6 +184,14 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr,
case wasm::WASM_OPCODE_GLOBAL_GET:
Expr.Value.Global = readULEB128(Ctx);
break;
+ case wasm::WASM_OPCODE_REF_NULL: {
+ wasm::ValType Ty = static_cast<wasm::ValType>(readULEB128(Ctx));
+ if (Ty != wasm::ValType::EXTERNREF) {
+ return make_error<GenericBinaryError>("Invalid type for ref.null",
+ object_error::parse_failed);
+ }
+ break;
+ }
default:
return make_error<GenericBinaryError>("Invalid opcode in init_expr",
object_error::parse_failed);
@@ -195,9 +208,9 @@ static Error readInitExpr(wasm::WasmInitExpr &Expr,
static wasm::WasmLimits readLimits(WasmObjectFile::ReadContext &Ctx) {
wasm::WasmLimits Result;
Result.Flags = readVaruint32(Ctx);
- Result.Initial = readVaruint32(Ctx);
+ Result.Initial = readVaruint64(Ctx);
if (Result.Flags & wasm::WASM_LIMITS_FLAG_HAS_MAX)
- Result.Maximum = readVaruint32(Ctx);
+ Result.Maximum = readVaruint64(Ctx);
return Result;
}
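
readVaruint64 above is a thin alias for the file's readULEB128 helper. For reference, an equivalent minimal ULEB128 decoder; the real code uses llvm/Support/LEB128.h and bounds-checks against the section end, which this sketch omits.

    #include <cstdint>

    static uint64_t decodeULEB128Sketch(const uint8_t *&P) {
      uint64_t Value = 0;
      unsigned Shift = 0;
      uint8_t Byte;
      do {
        Byte = *P++;
        Value |= uint64_t(Byte & 0x7f) << Shift; // low 7 bits carry payload
        Shift += 7;
      } while (Byte & 0x80);                     // high bit set: more bytes follow
      return Value;
    }
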
@@ -302,10 +315,10 @@ Error WasmObjectFile::parseSection(WasmSection &Sec) {
return parseTableSection(Ctx);
case wasm::WASM_SEC_MEMORY:
return parseMemorySection(Ctx);
- case wasm::WASM_SEC_GLOBAL:
- return parseGlobalSection(Ctx);
case wasm::WASM_SEC_EVENT:
return parseEventSection(Ctx);
+ case wasm::WASM_SEC_GLOBAL:
+ return parseGlobalSection(Ctx);
case wasm::WASM_SEC_EXPORT:
return parseExportSection(Ctx);
case wasm::WASM_SEC_START:
@@ -508,13 +521,16 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
Function.SymbolName = Info.Name;
} else {
wasm::WasmImport &Import = *ImportedFunctions[Info.ElementIndex];
- if ((Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0)
+ if ((Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0) {
Info.Name = readString(Ctx);
- else
+ Info.ImportName = Import.Field;
+ } else {
Info.Name = Import.Field;
+ }
Signature = &Signatures[Import.SigIndex];
- Info.ImportName = Import.Field;
- Info.ImportModule = Import.Module;
+ if (!Import.Module.empty()) {
+ Info.ImportModule = Import.Module;
+ }
}
break;
@@ -537,25 +553,29 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
Global.SymbolName = Info.Name;
} else {
wasm::WasmImport &Import = *ImportedGlobals[Info.ElementIndex];
- if ((Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0)
+ if ((Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0) {
Info.Name = readString(Ctx);
- else
+ Info.ImportName = Import.Field;
+ } else {
Info.Name = Import.Field;
+ }
GlobalType = &Import.Global;
Info.ImportName = Import.Field;
- Info.ImportModule = Import.Module;
+ if (!Import.Module.empty()) {
+ Info.ImportModule = Import.Module;
+ }
}
break;
case wasm::WASM_SYMBOL_TYPE_DATA:
Info.Name = readString(Ctx);
if (IsDefined) {
- uint32_t Index = readVaruint32(Ctx);
+ auto Index = readVaruint32(Ctx);
if (Index >= DataSegments.size())
return make_error<GenericBinaryError>("invalid data symbol index",
object_error::parse_failed);
- uint32_t Offset = readVaruint32(Ctx);
- uint32_t Size = readVaruint32(Ctx);
+ auto Offset = readVaruint64(Ctx);
+ auto Size = readVaruint64(Ctx);
if (Offset + Size > DataSegments[Index].Data.Content.size())
return make_error<GenericBinaryError>("invalid data symbol offset",
object_error::parse_failed);
@@ -597,14 +617,17 @@ Error WasmObjectFile::parseLinkingSectionSymtab(ReadContext &Ctx) {
} else {
wasm::WasmImport &Import = *ImportedEvents[Info.ElementIndex];
- if ((Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0)
+ if ((Info.Flags & wasm::WASM_SYMBOL_EXPLICIT_NAME) != 0) {
Info.Name = readString(Ctx);
- else
+ Info.ImportName = Import.Field;
+ } else {
Info.Name = Import.Field;
+ }
EventType = &Import.Event;
Signature = &Signatures[EventType->SigIndex];
- Info.ImportName = Import.Field;
- Info.ImportModule = Import.Module;
+ if (!Import.Module.empty()) {
+ Info.ImportModule = Import.Module;
+ }
}
break;
}
@@ -708,7 +731,7 @@ Error WasmObjectFile::parseProducersSection(ReadContext &Ctx) {
"Producers section contains repeated producer",
object_error::parse_failed);
}
- ProducerVec->emplace_back(Name, Version);
+ ProducerVec->emplace_back(std::string(Name), std::string(Version));
}
}
if (Ctx.Ptr != Ctx.End)
@@ -732,7 +755,7 @@ Error WasmObjectFile::parseTargetFeaturesSection(ReadContext &Ctx) {
return make_error<GenericBinaryError>("Unknown feature policy prefix",
object_error::parse_failed);
}
- Feature.Name = readString(Ctx);
+ Feature.Name = std::string(readString(Ctx));
if (!FeaturesSeen.insert(Feature.Name).second)
return make_error<GenericBinaryError>(
"Target features section contains repeated feature \"" +
@@ -788,6 +811,11 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
return make_error<GenericBinaryError>("Bad relocation global index",
object_error::parse_failed);
break;
+ case wasm::R_WASM_GLOBAL_INDEX_I32:
+ if (!isValidGlobalSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation global index",
+ object_error::parse_failed);
+ break;
case wasm::R_WASM_EVENT_INDEX_LEB:
if (!isValidEventSymbol(Reloc.Index))
return make_error<GenericBinaryError>("Bad relocation event index",
@@ -802,6 +830,15 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
object_error::parse_failed);
Reloc.Addend = readVarint32(Ctx);
break;
+ case wasm::R_WASM_MEMORY_ADDR_LEB64:
+ case wasm::R_WASM_MEMORY_ADDR_SLEB64:
+ case wasm::R_WASM_MEMORY_ADDR_I64:
+ case wasm::R_WASM_MEMORY_ADDR_REL_SLEB64:
+ if (!isValidDataSymbol(Reloc.Index))
+ return make_error<GenericBinaryError>("Bad relocation data index",
+ object_error::parse_failed);
+ Reloc.Addend = readVarint64(Ctx);
+ break;
case wasm::R_WASM_FUNCTION_OFFSET_I32:
if (!isValidFunctionSymbol(Reloc.Index))
return make_error<GenericBinaryError>("Bad relocation function index",
@@ -824,11 +861,18 @@ Error WasmObjectFile::parseRelocSection(StringRef Name, ReadContext &Ctx) {
// also shouldn't overlap a function/element boundary, but we don't bother
// to check that.
uint64_t Size = 5;
+ if (Reloc.Type == wasm::R_WASM_MEMORY_ADDR_LEB64 ||
+ Reloc.Type == wasm::R_WASM_MEMORY_ADDR_SLEB64 ||
+ Reloc.Type == wasm::R_WASM_MEMORY_ADDR_REL_SLEB64)
+ Size = 10;
if (Reloc.Type == wasm::R_WASM_TABLE_INDEX_I32 ||
Reloc.Type == wasm::R_WASM_MEMORY_ADDR_I32 ||
Reloc.Type == wasm::R_WASM_SECTION_OFFSET_I32 ||
- Reloc.Type == wasm::R_WASM_FUNCTION_OFFSET_I32)
+ Reloc.Type == wasm::R_WASM_FUNCTION_OFFSET_I32 ||
+ Reloc.Type == wasm::R_WASM_GLOBAL_INDEX_I32)
Size = 4;
+ if (Reloc.Type == wasm::R_WASM_MEMORY_ADDR_I64)
+ Size = 8;
if (Reloc.Offset + Size > EndOffset)
return make_error<GenericBinaryError>("Bad relocation offset",
object_error::parse_failed);
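
Aside (not part of the patch): the sizes checked above follow from the encodings. A relocation field stored as padded LEB128 reserves ceil(bits / 7) bytes so it can be patched in place later, which gives 5 bytes for 32-bit fields and 10 bytes for the new 64-bit ones, while the *_I32/*_I64 forms are plain fixed-width words of 4 and 8 bytes. A minimal standalone sketch, with a hypothetical helper name:

#include <cstdio>

// A padded LEB128 field reserves one byte per 7 payload bits, independent of
// the value actually stored.
constexpr unsigned paddedLEBWidth(unsigned PayloadBits) {
  return (PayloadBits + 6) / 7;
}

int main() {
  std::printf("32-bit LEB field: %u bytes\n", paddedLEBWidth(32)); // 5
  std::printf("64-bit LEB field: %u bytes\n", paddedLEBWidth(64)); // 10
  std::printf("fixed-width I32:  %u bytes\n", 4u);
  std::printf("fixed-width I64:  %u bytes\n", 8u);
}
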
@@ -983,6 +1027,24 @@ Error WasmObjectFile::parseMemorySection(ReadContext &Ctx) {
return Error::success();
}
+Error WasmObjectFile::parseEventSection(ReadContext &Ctx) {
+ EventSection = Sections.size();
+ uint32_t Count = readVarint32(Ctx);
+ Events.reserve(Count);
+ while (Count--) {
+ wasm::WasmEvent Event;
+ Event.Index = NumImportedEvents + Events.size();
+ Event.Type.Attribute = readVaruint32(Ctx);
+ Event.Type.SigIndex = readVarint32(Ctx);
+ Events.push_back(Event);
+ }
+
+ if (Ctx.Ptr != Ctx.End)
+ return make_error<GenericBinaryError>("Event section ended prematurely",
+ object_error::parse_failed);
+ return Error::success();
+}
+
Error WasmObjectFile::parseGlobalSection(ReadContext &Ctx) {
GlobalSection = Sections.size();
uint32_t Count = readVaruint32(Ctx);
@@ -1002,24 +1064,6 @@ Error WasmObjectFile::parseGlobalSection(ReadContext &Ctx) {
return Error::success();
}
-Error WasmObjectFile::parseEventSection(ReadContext &Ctx) {
- EventSection = Sections.size();
- uint32_t Count = readVarint32(Ctx);
- Events.reserve(Count);
- while (Count--) {
- wasm::WasmEvent Event;
- Event.Index = NumImportedEvents + Events.size();
- Event.Type.Attribute = readVaruint32(Ctx);
- Event.Type.SigIndex = readVarint32(Ctx);
- Events.push_back(Event);
- }
-
- if (Ctx.Ptr != Ctx.End)
- return make_error<GenericBinaryError>("Event section ended prematurely",
- object_error::parse_failed);
- return Error::success();
-}
-
Error WasmObjectFile::parseExportSection(ReadContext &Ctx) {
uint32_t Count = readVaruint32(Ctx);
Exports.reserve(Count);
@@ -1250,7 +1294,7 @@ const wasm::WasmObjectHeader &WasmObjectFile::getHeader() const {
void WasmObjectFile::moveSymbolNext(DataRefImpl &Symb) const { Symb.d.b++; }
-uint32_t WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const {
+Expected<uint32_t> WasmObjectFile::getSymbolFlags(DataRefImpl Symb) const {
uint32_t Result = SymbolRef::SF_None;
const WasmSymbol &Sym = getWasmSymbol(Symb);
@@ -1314,8 +1358,13 @@ uint64_t WasmObjectFile::getWasmSymbolValue(const WasmSymbol &Sym) const {
// offset within the segment.
uint32_t SegmentIndex = Sym.Info.DataRef.Segment;
const wasm::WasmDataSegment &Segment = DataSegments[SegmentIndex].Data;
- assert(Segment.Offset.Opcode == wasm::WASM_OPCODE_I32_CONST);
- return Segment.Offset.Value.Int32 + Sym.Info.DataRef.Offset;
+ if (Segment.Offset.Opcode == wasm::WASM_OPCODE_I32_CONST) {
+ return Segment.Offset.Value.Int32 + Sym.Info.DataRef.Offset;
+ } else if (Segment.Offset.Opcode == wasm::WASM_OPCODE_I64_CONST) {
+ return Segment.Offset.Value.Int64 + Sym.Info.DataRef.Offset;
+ } else {
+ llvm_unreachable("unknown init expr opcode");
+ }
}
case wasm::WASM_SYMBOL_TYPE_SECTION:
return 0;
@@ -1365,26 +1414,30 @@ WasmObjectFile::getSymbolSection(DataRefImpl Symb) const {
return section_end();
DataRefImpl Ref;
+ Ref.d.a = getSymbolSectionIdImpl(Sym);
+ return section_iterator(SectionRef(Ref, this));
+}
+
+uint32_t WasmObjectFile::getSymbolSectionId(SymbolRef Symb) const {
+ const WasmSymbol &Sym = getWasmSymbol(Symb);
+ return getSymbolSectionIdImpl(Sym);
+}
+
+uint32_t WasmObjectFile::getSymbolSectionIdImpl(const WasmSymbol &Sym) const {
switch (Sym.Info.Kind) {
case wasm::WASM_SYMBOL_TYPE_FUNCTION:
- Ref.d.a = CodeSection;
- break;
+ return CodeSection;
case wasm::WASM_SYMBOL_TYPE_GLOBAL:
- Ref.d.a = GlobalSection;
- break;
+ return GlobalSection;
case wasm::WASM_SYMBOL_TYPE_DATA:
- Ref.d.a = DataSection;
- break;
+ return DataSection;
case wasm::WASM_SYMBOL_TYPE_SECTION:
- Ref.d.a = Sym.Info.ElementIndex;
- break;
+ return Sym.Info.ElementIndex;
case wasm::WASM_SYMBOL_TYPE_EVENT:
- Ref.d.a = EventSection;
- break;
+ return EventSection;
default:
llvm_unreachable("Unknown WasmSymbol::SymbolType");
}
- return section_iterator(SectionRef(Ref, this));
}
void WasmObjectFile::moveSectionNext(DataRefImpl &Sec) const { Sec.d.a++; }
@@ -1455,8 +1508,6 @@ bool WasmObjectFile::isSectionBSS(DataRefImpl Sec) const { return false; }
bool WasmObjectFile::isSectionVirtual(DataRefImpl Sec) const { return false; }
-bool WasmObjectFile::isSectionBitcode(DataRefImpl Sec) const { return false; }
-
relocation_iterator WasmObjectFile::section_rel_begin(DataRefImpl Ref) const {
DataRefImpl RelocRef;
RelocRef.d.a = Ref.d.a;
@@ -1608,30 +1659,50 @@ int WasmSectionOrderChecker::getSectionOrder(unsigned ID,
// Represents the edges in a directed graph where any node B reachable from node
// A is not allowed to appear before A in the section ordering, but may appear
// afterward.
-int WasmSectionOrderChecker::DisallowedPredecessors[WASM_NUM_SEC_ORDERS][WASM_NUM_SEC_ORDERS] = {
- {}, // WASM_SEC_ORDER_NONE
- {WASM_SEC_ORDER_TYPE, WASM_SEC_ORDER_IMPORT}, // WASM_SEC_ORDER_TYPE,
- {WASM_SEC_ORDER_IMPORT, WASM_SEC_ORDER_FUNCTION}, // WASM_SEC_ORDER_IMPORT,
- {WASM_SEC_ORDER_FUNCTION, WASM_SEC_ORDER_TABLE}, // WASM_SEC_ORDER_FUNCTION,
- {WASM_SEC_ORDER_TABLE, WASM_SEC_ORDER_MEMORY}, // WASM_SEC_ORDER_TABLE,
- {WASM_SEC_ORDER_MEMORY, WASM_SEC_ORDER_GLOBAL}, // WASM_SEC_ORDER_MEMORY,
- {WASM_SEC_ORDER_GLOBAL, WASM_SEC_ORDER_EVENT}, // WASM_SEC_ORDER_GLOBAL,
- {WASM_SEC_ORDER_EVENT, WASM_SEC_ORDER_EXPORT}, // WASM_SEC_ORDER_EVENT,
- {WASM_SEC_ORDER_EXPORT, WASM_SEC_ORDER_START}, // WASM_SEC_ORDER_EXPORT,
- {WASM_SEC_ORDER_START, WASM_SEC_ORDER_ELEM}, // WASM_SEC_ORDER_START,
- {WASM_SEC_ORDER_ELEM, WASM_SEC_ORDER_DATACOUNT}, // WASM_SEC_ORDER_ELEM,
- {WASM_SEC_ORDER_DATACOUNT, WASM_SEC_ORDER_CODE}, // WASM_SEC_ORDER_DATACOUNT,
- {WASM_SEC_ORDER_CODE, WASM_SEC_ORDER_DATA}, // WASM_SEC_ORDER_CODE,
- {WASM_SEC_ORDER_DATA, WASM_SEC_ORDER_LINKING}, // WASM_SEC_ORDER_DATA,
-
- // Custom Sections
- {WASM_SEC_ORDER_DYLINK, WASM_SEC_ORDER_TYPE}, // WASM_SEC_ORDER_DYLINK,
- {WASM_SEC_ORDER_LINKING, WASM_SEC_ORDER_RELOC, WASM_SEC_ORDER_NAME}, // WASM_SEC_ORDER_LINKING,
- {}, // WASM_SEC_ORDER_RELOC (can be repeated),
- {WASM_SEC_ORDER_NAME, WASM_SEC_ORDER_PRODUCERS}, // WASM_SEC_ORDER_NAME,
- {WASM_SEC_ORDER_PRODUCERS, WASM_SEC_ORDER_TARGET_FEATURES}, // WASM_SEC_ORDER_PRODUCERS,
- {WASM_SEC_ORDER_TARGET_FEATURES} // WASM_SEC_ORDER_TARGET_FEATURES
-};
+int WasmSectionOrderChecker::DisallowedPredecessors
+ [WASM_NUM_SEC_ORDERS][WASM_NUM_SEC_ORDERS] = {
+ // WASM_SEC_ORDER_NONE
+ {},
+ // WASM_SEC_ORDER_TYPE
+ {WASM_SEC_ORDER_TYPE, WASM_SEC_ORDER_IMPORT},
+ // WASM_SEC_ORDER_IMPORT
+ {WASM_SEC_ORDER_IMPORT, WASM_SEC_ORDER_FUNCTION},
+ // WASM_SEC_ORDER_FUNCTION
+ {WASM_SEC_ORDER_FUNCTION, WASM_SEC_ORDER_TABLE},
+ // WASM_SEC_ORDER_TABLE
+ {WASM_SEC_ORDER_TABLE, WASM_SEC_ORDER_MEMORY},
+ // WASM_SEC_ORDER_MEMORY
+ {WASM_SEC_ORDER_MEMORY, WASM_SEC_ORDER_EVENT},
+ // WASM_SEC_ORDER_EVENT
+ {WASM_SEC_ORDER_EVENT, WASM_SEC_ORDER_GLOBAL},
+ // WASM_SEC_ORDER_GLOBAL
+ {WASM_SEC_ORDER_GLOBAL, WASM_SEC_ORDER_EXPORT},
+ // WASM_SEC_ORDER_EXPORT
+ {WASM_SEC_ORDER_EXPORT, WASM_SEC_ORDER_START},
+ // WASM_SEC_ORDER_START
+ {WASM_SEC_ORDER_START, WASM_SEC_ORDER_ELEM},
+ // WASM_SEC_ORDER_ELEM
+ {WASM_SEC_ORDER_ELEM, WASM_SEC_ORDER_DATACOUNT},
+ // WASM_SEC_ORDER_DATACOUNT
+ {WASM_SEC_ORDER_DATACOUNT, WASM_SEC_ORDER_CODE},
+ // WASM_SEC_ORDER_CODE
+ {WASM_SEC_ORDER_CODE, WASM_SEC_ORDER_DATA},
+ // WASM_SEC_ORDER_DATA
+ {WASM_SEC_ORDER_DATA, WASM_SEC_ORDER_LINKING},
+
+ // Custom Sections
+ // WASM_SEC_ORDER_DYLINK
+ {WASM_SEC_ORDER_DYLINK, WASM_SEC_ORDER_TYPE},
+ // WASM_SEC_ORDER_LINKING
+ {WASM_SEC_ORDER_LINKING, WASM_SEC_ORDER_RELOC, WASM_SEC_ORDER_NAME},
+ // WASM_SEC_ORDER_RELOC (can be repeated)
+ {},
+ // WASM_SEC_ORDER_NAME
+ {WASM_SEC_ORDER_NAME, WASM_SEC_ORDER_PRODUCERS},
+ // WASM_SEC_ORDER_PRODUCERS
+ {WASM_SEC_ORDER_PRODUCERS, WASM_SEC_ORDER_TARGET_FEATURES},
+ // WASM_SEC_ORDER_TARGET_FEATURES
+ {WASM_SEC_ORDER_TARGET_FEATURES}};
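
Aside (not part of the patch): the table above only stores edges. The following simplified, standalone sketch shows how such a "disallowed predecessors" graph is typically consumed: when a section with a given order appears, every order reachable from it in the table is checked against the set of orders already seen. The four-entry graph and every name below are invented for illustration; this is not the checker in this file.

#include <bitset>
#include <iostream>
#include <vector>

constexpr int NumOrders = 4;

// Seeing order I forbids any order reachable from I from having appeared
// earlier in the file.
const std::vector<int> Disallowed[NumOrders] = {
    /*0*/ {1}, /*1*/ {2}, /*2*/ {3}, /*3*/ {}};

struct OrderChecker {
  std::bitset<NumOrders> Seen;

  bool isValidNext(int Order) {
    std::bitset<NumOrders> Visited;
    std::vector<int> WorkList{Order};
    while (!WorkList.empty()) {
      int Curr = WorkList.back();
      WorkList.pop_back();
      for (int Next : Disallowed[Curr]) {
        if (Visited[Next])
          continue;
        Visited[Next] = true;
        if (Seen[Next])
          return false; // a disallowed predecessor already appeared
        WorkList.push_back(Next);
      }
    }
    Seen[Order] = true;
    return true;
  }
};

int main() {
  OrderChecker C;
  std::cout << C.isValidNext(1) << "\n"; // 1: first section is fine
  std::cout << C.isValidNext(0) << "\n"; // 0: order 1 is reachable from 0 and was already seen
  std::cout << C.isValidNext(3) << "\n"; // 1: nothing reachable from 3 has been seen
}
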
bool WasmSectionOrderChecker::isValidSectionOrder(unsigned ID,
StringRef CustomSectionName) {
diff --git a/llvm/lib/Object/WindowsResource.cpp b/llvm/lib/Object/WindowsResource.cpp
index 10717718b201..2a69c6c46b59 100644
--- a/llvm/lib/Object/WindowsResource.cpp
+++ b/llvm/lib/Object/WindowsResource.cpp
@@ -346,7 +346,7 @@ Error WindowsResourceParser::parse(WindowsResource *WR,
ResourceEntryRef Entry = EntryOrErr.get();
uint32_t Origin = InputFilenames.size();
- InputFilenames.push_back(WR->getFileName());
+ InputFilenames.push_back(std::string(WR->getFileName()));
bool End = false;
while (!End) {
@@ -368,7 +368,7 @@ Error WindowsResourceParser::parse(ResourceSectionRef &RSR, StringRef Filename,
std::vector<std::string> &Duplicates) {
UNWRAP_REF_OR_RETURN(BaseTable, RSR.getBaseTable());
uint32_t Origin = InputFilenames.size();
- InputFilenames.push_back(Filename);
+ InputFilenames.push_back(std::string(Filename));
std::vector<StringOrID> Context;
return addChildren(Root, RSR, BaseTable, Origin, Context, Duplicates);
}
@@ -721,8 +721,10 @@ WindowsResourceCOFFWriter::write(uint32_t TimeDateStamp) {
// it's okay to *not* copy the trailing zero.
static void coffnamecpy(char (&Dest)[COFF::NameSize], StringRef Src) {
assert(Src.size() <= COFF::NameSize &&
- "Src is not larger than COFF::NameSize");
- strncpy(Dest, Src.data(), (size_t)COFF::NameSize);
+ "Src is larger than COFF::NameSize");
+ assert((Src.size() == COFF::NameSize || Dest[Src.size()] == '\0') &&
+ "Dest not zeroed upon initialization");
+ memcpy(Dest, Src.data(), Src.size());
}
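
Aside (not part of the patch): a standalone sketch of why copying exactly Src.size() bytes into a zero-initialized, fixed-size name field is the safer pattern here. strncpy with a count of NameSize may read up to NameSize bytes from a source that is not null-terminated and tends to trigger truncation warnings, whereas memcpy of the known size touches only the bytes that exist and leaves the required zero padding in place. NameSize = 8 below stands in for COFF::NameSize; all names are hypothetical.

#include <array>
#include <cassert>
#include <cstddef>
#include <cstring>
#include <string_view>

constexpr std::size_t NameSize = 8;

static void namecpy(std::array<char, NameSize> &Dest, std::string_view Src) {
  assert(Src.size() <= NameSize && "Src is larger than the name field");
  // Dest is expected to be zero-initialized, so shorter names stay padded
  // with zeros and a name of exactly NameSize characters simply has no
  // trailing zero, matching the COFF short-name convention.
  std::memcpy(Dest.data(), Src.data(), Src.size());
}

int main() {
  std::array<char, NameSize> Full{}; // zero-initialized
  namecpy(Full, ".rsrc$01");         // exactly 8 characters, no trailing zero
  assert(std::memcmp(Full.data(), ".rsrc$01", NameSize) == 0);

  std::array<char, NameSize> Short{};
  namecpy(Short, ".text");
  assert(Short[5] == '\0' && Short[7] == '\0'); // padding stays zero
}
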
void WindowsResourceCOFFWriter::writeCOFFHeader(uint32_t TimeDateStamp) {
diff --git a/llvm/lib/Object/XCOFFObjectFile.cpp b/llvm/lib/Object/XCOFFObjectFile.cpp
index f98cd69a0d37..533361666cf2 100644
--- a/llvm/lib/Object/XCOFFObjectFile.cpp
+++ b/llvm/lib/Object/XCOFFObjectFile.cpp
@@ -11,13 +11,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/Object/XCOFFObjectFile.h"
+#include "llvm/MC/SubtargetFeature.h"
#include <cstddef>
#include <cstring>
namespace llvm {
namespace object {
-enum { FUNCTION_SYM = 0x20, SYM_TYPE_MASK = 0x07, RELOC_OVERFLOW = 65535 };
+static const uint8_t FunctionSym = 0x20;
+static const uint8_t SymTypeMask = 0x07;
+static const uint16_t NoRelMask = 0x0001;
// Checks that [Ptr, Ptr + Size) bytes fall inside the memory buffer
// 'M'. Returns a pointer to the underlying object on success.
@@ -25,8 +28,8 @@ template <typename T>
static Expected<const T *> getObject(MemoryBufferRef M, const void *Ptr,
const uint64_t Size = sizeof(T)) {
uintptr_t Addr = uintptr_t(Ptr);
- if (std::error_code EC = Binary::checkOffset(M, Addr, Size))
- return errorCodeToError(EC);
+ if (Error E = Binary::checkOffset(M, Addr, Size))
+ return std::move(E);
return reinterpret_cast<const T *>(Addr);
}
@@ -314,58 +317,98 @@ bool XCOFFObjectFile::isSectionVirtual(DataRefImpl Sec) const {
}
relocation_iterator XCOFFObjectFile::section_rel_begin(DataRefImpl Sec) const {
- llvm_unreachable("Not yet implemented!");
- return relocation_iterator(RelocationRef());
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ const XCOFFSectionHeader32 *SectionEntPtr = toSection32(Sec);
+ auto RelocationsOrErr = relocations(*SectionEntPtr);
+ if (Error E = RelocationsOrErr.takeError())
+ return relocation_iterator(RelocationRef());
+ DataRefImpl Ret;
+ Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().begin());
+ return relocation_iterator(RelocationRef(Ret, this));
}
relocation_iterator XCOFFObjectFile::section_rel_end(DataRefImpl Sec) const {
- llvm_unreachable("Not yet implemented!");
- return relocation_iterator(RelocationRef());
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ const XCOFFSectionHeader32 *SectionEntPtr = toSection32(Sec);
+ auto RelocationsOrErr = relocations(*SectionEntPtr);
+ if (Error E = RelocationsOrErr.takeError())
+ return relocation_iterator(RelocationRef());
+ DataRefImpl Ret;
+ Ret.p = reinterpret_cast<uintptr_t>(&*RelocationsOrErr.get().end());
+ return relocation_iterator(RelocationRef(Ret, this));
}
void XCOFFObjectFile::moveRelocationNext(DataRefImpl &Rel) const {
- llvm_unreachable("Not yet implemented!");
- return;
+ Rel.p = reinterpret_cast<uintptr_t>(viewAs<XCOFFRelocation32>(Rel.p) + 1);
}
uint64_t XCOFFObjectFile::getRelocationOffset(DataRefImpl Rel) const {
- llvm_unreachable("Not yet implemented!");
- uint64_t Result = 0;
- return Result;
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
+ const XCOFFSectionHeader32 *Sec32 = sectionHeaderTable32();
+ const uint32_t RelocAddress = Reloc->VirtualAddress;
+ const uint16_t NumberOfSections = getNumberOfSections();
+ for (uint16_t i = 0; i < NumberOfSections; ++i) {
+ // Find which section this relocation belongs to, and get the relocation
+ // offset relative to the start of the section.
+ if (Sec32->VirtualAddress <= RelocAddress &&
+ RelocAddress < Sec32->VirtualAddress + Sec32->SectionSize) {
+ return RelocAddress - Sec32->VirtualAddress;
+ }
+ ++Sec32;
+ }
+ return InvalidRelocOffset;
}
symbol_iterator XCOFFObjectFile::getRelocationSymbol(DataRefImpl Rel) const {
- llvm_unreachable("Not yet implemented!");
- return symbol_iterator(SymbolRef());
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
+ const uint32_t Index = Reloc->SymbolIndex;
+
+ if (Index >= getLogicalNumberOfSymbolTableEntries32())
+ return symbol_end();
+
+ DataRefImpl SymDRI;
+ SymDRI.p = reinterpret_cast<uintptr_t>(getPointerToSymbolTable() + Index);
+ return symbol_iterator(SymbolRef(SymDRI, this));
}
uint64_t XCOFFObjectFile::getRelocationType(DataRefImpl Rel) const {
- llvm_unreachable("Not yet implemented!");
- uint64_t Result = 0;
- return Result;
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ return viewAs<XCOFFRelocation32>(Rel.p)->Type;
}
void XCOFFObjectFile::getRelocationTypeName(
DataRefImpl Rel, SmallVectorImpl<char> &Result) const {
- llvm_unreachable("Not yet implemented!");
- return;
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ const XCOFFRelocation32 *Reloc = viewAs<XCOFFRelocation32>(Rel.p);
+ StringRef Res = XCOFF::getRelocationTypeString(Reloc->Type);
+ Result.append(Res.begin(), Res.end());
}
-uint32_t XCOFFObjectFile::getSymbolFlags(DataRefImpl Symb) const {
+Expected<uint32_t> XCOFFObjectFile::getSymbolFlags(DataRefImpl Symb) const {
uint32_t Result = 0;
llvm_unreachable("Not yet implemented!");
return Result;
}
basic_symbol_iterator XCOFFObjectFile::symbol_begin() const {
- assert(!is64Bit() && "64-bit support not implemented yet.");
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
DataRefImpl SymDRI;
SymDRI.p = reinterpret_cast<uintptr_t>(SymbolTblPtr);
return basic_symbol_iterator(SymbolRef(SymDRI, this));
}
basic_symbol_iterator XCOFFObjectFile::symbol_end() const {
- assert(!is64Bit() && "64-bit support not implemented yet.");
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
DataRefImpl SymDRI;
SymDRI.p = reinterpret_cast<uintptr_t>(
SymbolTblPtr + getLogicalNumberOfSymbolTableEntries32());
@@ -400,9 +443,9 @@ SubtargetFeatures XCOFFObjectFile::getFeatures() const {
}
bool XCOFFObjectFile::isRelocatableObject() const {
- bool Result = false;
- llvm_unreachable("Not yet implemented!");
- return Result;
+ if (is64Bit())
+ report_fatal_error("64-bit support not implemented yet");
+ return !(fileHeader32()->Flags & NoRelMask);
}
Expected<uint64_t> XCOFFObjectFile::getStartAddress() const {
@@ -588,7 +631,7 @@ Expected<uint32_t> XCOFFObjectFile::getLogicalNumberOfRelocationEntries(
uint16_t SectionIndex = &Sec - sectionHeaderTable32() + 1;
- if (Sec.NumberOfRelocations < RELOC_OVERFLOW)
+ if (Sec.NumberOfRelocations < XCOFF::RelocOverflow)
return Sec.NumberOfRelocations;
for (const auto &Sec : sections32()) {
if (Sec.Flags == XCOFF::STYP_OVRFLO &&
@@ -608,6 +651,7 @@ XCOFFObjectFile::relocations(const XCOFFSectionHeader32 &Sec) const {
uint32_t NumRelocEntries = NumRelocEntriesOrErr.get();
+ assert(sizeof(XCOFFRelocation32) == XCOFF::RelocationSerializationSize32);
auto RelocationOrErr =
getObject<XCOFFRelocation32>(Data, reinterpret_cast<void *>(RelocAddr),
NumRelocEntries * sizeof(XCOFFRelocation32));
@@ -623,9 +667,11 @@ Expected<XCOFFStringTable>
XCOFFObjectFile::parseStringTable(const XCOFFObjectFile *Obj, uint64_t Offset) {
// If there is a string table, then the buffer must contain at least 4 bytes
// for the string table's size. Not having a string table is not an error.
- if (auto EC = Binary::checkOffset(
- Obj->Data, reinterpret_cast<uintptr_t>(Obj->base() + Offset), 4))
+ if (Error E = Binary::checkOffset(
+ Obj->Data, reinterpret_cast<uintptr_t>(Obj->base() + Offset), 4)) {
+ consumeError(std::move(E));
return XCOFFStringTable{0, nullptr};
+ }
// Read the size out of the buffer.
uint32_t Size = support::endian::read32be(Obj->base() + Offset);
@@ -722,6 +768,8 @@ uint8_t XCOFFSymbolRef::getNumberOfAuxEntries() const {
return OwningObjectPtr->toSymbolEntry(SymEntDataRef)->NumberOfAuxEntries;
}
+// TODO: The function needs to return an error if there is no csect auxiliary
+// entry.
const XCOFFCsectAuxEnt32 *XCOFFSymbolRef::getXCOFFCsectAuxEnt32() const {
assert(!OwningObjectPtr->is64Bit() &&
"32-bit interface called on 64-bit object file.");
@@ -747,6 +795,8 @@ int16_t XCOFFSymbolRef::getSectionNumber() const {
return OwningObjectPtr->toSymbolEntry(SymEntDataRef)->SectionNumber;
}
+// TODO: The function name needs to be changed to express the purpose of the
+// function.
bool XCOFFSymbolRef::hasCsectAuxEnt() const {
XCOFF::StorageClass SC = getStorageClass();
return (SC == XCOFF::C_EXT || SC == XCOFF::C_WEAKEXT ||
@@ -757,7 +807,7 @@ bool XCOFFSymbolRef::isFunction() const {
if (OwningObjectPtr->is64Bit())
report_fatal_error("64-bit support is unimplemented yet.");
- if (getType() & FUNCTION_SYM)
+ if (getType() & FunctionSym)
return true;
if (!hasCsectAuxEnt())
@@ -766,7 +816,7 @@ bool XCOFFSymbolRef::isFunction() const {
const XCOFFCsectAuxEnt32 *CsectAuxEnt = getXCOFFCsectAuxEnt32();
// A function definition should be a label definition.
- if ((CsectAuxEnt->SymbolAlignmentAndType & SYM_TYPE_MASK) != XCOFF::XTY_LD)
+ if ((CsectAuxEnt->SymbolAlignmentAndType & SymTypeMask) != XCOFF::XTY_LD)
return false;
if (CsectAuxEnt->StorageMappingClass != XCOFF::XMC_PR)
diff --git a/llvm/lib/ObjectYAML/COFFEmitter.cpp b/llvm/lib/ObjectYAML/COFFEmitter.cpp
index ec3ec55011f9..734e1be4b2d5 100644
--- a/llvm/lib/ObjectYAML/COFFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/COFFEmitter.cpp
@@ -187,7 +187,7 @@ toDebugS(ArrayRef<CodeViewYAML::YAMLDebugSubsection> Subsections,
std::vector<DebugSubsectionRecordBuilder> Builders;
uint32_t Size = sizeof(uint32_t);
for (auto &SS : CVSS) {
- DebugSubsectionRecordBuilder B(SS, CodeViewContainer::ObjectFile);
+ DebugSubsectionRecordBuilder B(SS);
Size += B.calculateSerializedLength();
Builders.push_back(std::move(B));
}
@@ -197,7 +197,7 @@ toDebugS(ArrayRef<CodeViewYAML::YAMLDebugSubsection> Subsections,
Err(Writer.writeInteger<uint32_t>(COFF::DEBUG_SECTION_MAGIC));
for (const auto &B : Builders) {
- Err(B.commit(Writer));
+ Err(B.commit(Writer, CodeViewContainer::ObjectFile));
}
return {Output};
}
diff --git a/llvm/lib/ObjectYAML/DWARFEmitter.cpp b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
index b410fed16f09..ed3732ba29f6 100644
--- a/llvm/lib/ObjectYAML/DWARFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/DWARFEmitter.cpp
@@ -15,12 +15,15 @@
#include "DWARFVisitor.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/Host.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
@@ -41,8 +44,8 @@ static void writeInteger(T Integer, raw_ostream &OS, bool IsLittleEndian) {
OS.write(reinterpret_cast<char *>(&Integer), sizeof(T));
}
-static void writeVariableSizedInteger(uint64_t Integer, size_t Size,
- raw_ostream &OS, bool IsLittleEndian) {
+static Error writeVariableSizedInteger(uint64_t Integer, size_t Size,
+ raw_ostream &OS, bool IsLittleEndian) {
if (8 == Size)
writeInteger((uint64_t)Integer, OS, IsLittleEndian);
else if (4 == Size)
@@ -52,7 +55,10 @@ static void writeVariableSizedInteger(uint64_t Integer, size_t Size,
else if (1 == Size)
writeInteger((uint8_t)Integer, OS, IsLittleEndian);
else
- assert(false && "Invalid integer write size.");
+ return createStringError(errc::not_supported,
+ "invalid integer write size: %zu", Size);
+
+ return Error::success();
}
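
Aside (not part of the patch): a minimal standalone sketch of the Error-returning idiom introduced here. Call sites whose size comes from input propagate the llvm::Error to their caller, while call sites that pass a size known to be valid wrap the call in cantFail(), which turns an unexpected failure into a hard crash instead of a silently dropped error. The helper names writeSized and emitPair are invented; llvm::Error, createStringError and cantFail are real LLVMSupport APIs, so the sketch only needs LLVMSupport to build.

#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"
#include <cstddef>
#include <cstdint>
#include <string>

using namespace llvm;

static Error writeSized(uint64_t Value, size_t Size, raw_ostream &OS) {
  if (Size != 1 && Size != 2 && Size != 4 && Size != 8)
    return createStringError(inconvertibleErrorCode(),
                             "invalid integer write size: %zu", Size);
  OS.write(reinterpret_cast<const char *>(&Value), Size); // host endianness
  return Error::success();
}

static Error emitPair(uint64_t Value, size_t SizeFromInput, raw_ostream &OS) {
  // The size comes from untrusted input: propagate the error to the caller.
  if (Error E = writeSized(Value, SizeFromInput, OS))
    return E;
  // The size is a constant that is always valid: cantFail() documents that
  // and aborts loudly if the assumption is ever broken.
  cantFail(writeSized(Value, 4, OS));
  return Error::success();
}

int main() {
  std::string Buf;
  raw_string_ostream OS(Buf);
  if (Error E = emitPair(42, /*SizeFromInput=*/3, OS)) // 3 is rejected
    errs() << toString(std::move(E)) << "\n";
}
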
static void ZeroFillBytes(raw_ostream &OS, size_t Size) {
@@ -68,16 +74,31 @@ static void writeInitialLength(const DWARFYAML::InitialLength &Length,
writeInteger((uint64_t)Length.TotalLength64, OS, IsLittleEndian);
}
-void DWARFYAML::EmitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) {
+static void writeInitialLength(const dwarf::DwarfFormat Format,
+ const uint64_t Length, raw_ostream &OS,
+ bool IsLittleEndian) {
+ bool IsDWARF64 = Format == dwarf::DWARF64;
+ if (IsDWARF64)
+ cantFail(writeVariableSizedInteger(dwarf::DW_LENGTH_DWARF64, 4, OS,
+ IsLittleEndian));
+ cantFail(
+ writeVariableSizedInteger(Length, IsDWARF64 ? 8 : 4, OS, IsLittleEndian));
+}
+
+Error DWARFYAML::emitDebugStr(raw_ostream &OS, const DWARFYAML::Data &DI) {
for (auto Str : DI.DebugStrings) {
OS.write(Str.data(), Str.size());
OS.write('\0');
}
+
+ return Error::success();
}
-void DWARFYAML::EmitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
+Error DWARFYAML::emitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ uint64_t AbbrevCode = 0;
for (auto AbbrevDecl : DI.AbbrevDecls) {
- encodeULEB128(AbbrevDecl.Code, OS);
+ AbbrevCode = AbbrevDecl.Code ? (uint64_t)*AbbrevDecl.Code : AbbrevCode + 1;
+ encodeULEB128(AbbrevCode, OS);
encodeULEB128(AbbrevDecl.Tag, OS);
OS.write(AbbrevDecl.Children);
for (auto Attr : AbbrevDecl.Attributes) {
@@ -89,14 +110,23 @@ void DWARFYAML::EmitDebugAbbrev(raw_ostream &OS, const DWARFYAML::Data &DI) {
encodeULEB128(0, OS);
encodeULEB128(0, OS);
}
+
+ // The abbreviations for a given compilation unit end with an entry consisting
+ // of a 0 byte for the abbreviation code.
+ OS.write_zeros(1);
+
+ return Error::success();
}
-void DWARFYAML::EmitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
+Error DWARFYAML::emitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
for (auto Range : DI.ARanges) {
auto HeaderStart = OS.tell();
- writeInitialLength(Range.Length, OS, DI.IsLittleEndian);
+ writeInitialLength(Range.Format, Range.Length, OS, DI.IsLittleEndian);
writeInteger((uint16_t)Range.Version, OS, DI.IsLittleEndian);
- writeInteger((uint32_t)Range.CuOffset, OS, DI.IsLittleEndian);
+ if (Range.Format == dwarf::DWARF64)
+ writeInteger((uint64_t)Range.CuOffset, OS, DI.IsLittleEndian);
+ else
+ writeInteger((uint32_t)Range.CuOffset, OS, DI.IsLittleEndian);
writeInteger((uint8_t)Range.AddrSize, OS, DI.IsLittleEndian);
writeInteger((uint8_t)Range.SegSize, OS, DI.IsLittleEndian);
@@ -105,29 +135,73 @@ void DWARFYAML::EmitDebugAranges(raw_ostream &OS, const DWARFYAML::Data &DI) {
ZeroFillBytes(OS, FirstDescriptor - HeaderSize);
for (auto Descriptor : Range.Descriptors) {
- writeVariableSizedInteger(Descriptor.Address, Range.AddrSize, OS,
- DI.IsLittleEndian);
- writeVariableSizedInteger(Descriptor.Length, Range.AddrSize, OS,
- DI.IsLittleEndian);
+ if (Error Err = writeVariableSizedInteger(
+ Descriptor.Address, Range.AddrSize, OS, DI.IsLittleEndian))
+ return createStringError(errc::not_supported,
+ "unable to write debug_aranges address: %s",
+ toString(std::move(Err)).c_str());
+ cantFail(writeVariableSizedInteger(Descriptor.Length, Range.AddrSize, OS,
+ DI.IsLittleEndian));
}
ZeroFillBytes(OS, Range.AddrSize * 2);
}
+
+ return Error::success();
}
-void DWARFYAML::EmitPubSection(raw_ostream &OS,
- const DWARFYAML::PubSection &Sect,
- bool IsLittleEndian) {
+Error DWARFYAML::emitDebugRanges(raw_ostream &OS, const DWARFYAML::Data &DI) {
+ const size_t RangesOffset = OS.tell();
+ uint64_t EntryIndex = 0;
+ for (auto DebugRanges : DI.DebugRanges) {
+ const size_t CurrOffset = OS.tell() - RangesOffset;
+ if (DebugRanges.Offset && (uint64_t)*DebugRanges.Offset < CurrOffset)
+ return createStringError(errc::invalid_argument,
+ "'Offset' for 'debug_ranges' with index " +
+ Twine(EntryIndex) +
+ " must be greater than or equal to the "
+ "number of bytes written already (0x" +
+ Twine::utohexstr(CurrOffset) + ")");
+ if (DebugRanges.Offset)
+ ZeroFillBytes(OS, *DebugRanges.Offset - CurrOffset);
+
+ uint8_t AddrSize;
+ if (DebugRanges.AddrSize)
+ AddrSize = *DebugRanges.AddrSize;
+ else
+ AddrSize = DI.Is64BitAddrSize ? 8 : 4;
+ for (auto Entry : DebugRanges.Entries) {
+ if (Error Err = writeVariableSizedInteger(Entry.LowOffset, AddrSize, OS,
+ DI.IsLittleEndian))
+ return createStringError(
+ errc::not_supported,
+ "unable to write debug_ranges address offset: %s",
+ toString(std::move(Err)).c_str());
+ cantFail(writeVariableSizedInteger(Entry.HighOffset, AddrSize, OS,
+ DI.IsLittleEndian));
+ }
+ ZeroFillBytes(OS, AddrSize * 2);
+ ++EntryIndex;
+ }
+
+ return Error::success();
+}
+
+Error DWARFYAML::emitPubSection(raw_ostream &OS,
+ const DWARFYAML::PubSection &Sect,
+ bool IsLittleEndian, bool IsGNUPubSec) {
writeInitialLength(Sect.Length, OS, IsLittleEndian);
writeInteger((uint16_t)Sect.Version, OS, IsLittleEndian);
writeInteger((uint32_t)Sect.UnitOffset, OS, IsLittleEndian);
writeInteger((uint32_t)Sect.UnitSize, OS, IsLittleEndian);
for (auto Entry : Sect.Entries) {
writeInteger((uint32_t)Entry.DieOffset, OS, IsLittleEndian);
- if (Sect.IsGNUStyle)
- writeInteger((uint32_t)Entry.Descriptor, OS, IsLittleEndian);
+ if (IsGNUPubSec)
+ writeInteger((uint8_t)Entry.Descriptor, OS, IsLittleEndian);
OS.write(Entry.Name.data(), Entry.Name.size());
OS.write('\0');
}
+
+ return Error::success();
}
namespace {
@@ -138,14 +212,18 @@ class DumpVisitor : public DWARFYAML::ConstVisitor {
protected:
void onStartCompileUnit(const DWARFYAML::Unit &CU) override {
- writeInitialLength(CU.Length, OS, DebugInfo.IsLittleEndian);
+ writeInitialLength(CU.Format, CU.Length, OS, DebugInfo.IsLittleEndian);
writeInteger((uint16_t)CU.Version, OS, DebugInfo.IsLittleEndian);
- if(CU.Version >= 5) {
+ if (CU.Version >= 5) {
writeInteger((uint8_t)CU.Type, OS, DebugInfo.IsLittleEndian);
writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
- writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
- }else {
- writeInteger((uint32_t)CU.AbbrOffset, OS, DebugInfo.IsLittleEndian);
+ cantFail(writeVariableSizedInteger(CU.AbbrOffset,
+ CU.Format == dwarf::DWARF64 ? 8 : 4,
+ OS, DebugInfo.IsLittleEndian));
+ } else {
+ cantFail(writeVariableSizedInteger(CU.AbbrOffset,
+ CU.Format == dwarf::DWARF64 ? 8 : 4,
+ OS, DebugInfo.IsLittleEndian));
writeInteger((uint8_t)CU.AddrSize, OS, DebugInfo.IsLittleEndian);
}
}
@@ -196,12 +274,12 @@ public:
};
} // namespace
-void DWARFYAML::EmitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) {
+Error DWARFYAML::emitDebugInfo(raw_ostream &OS, const DWARFYAML::Data &DI) {
DumpVisitor Visitor(DI, OS);
- Visitor.traverseDebugInfo();
+ return Visitor.traverseDebugInfo();
}
-static void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
+static void emitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
OS.write(File.Name.data(), File.Name.size());
OS.write('\0');
encodeULEB128(File.DirIdx, OS);
@@ -209,13 +287,14 @@ static void EmitFileEntry(raw_ostream &OS, const DWARFYAML::File &File) {
encodeULEB128(File.Length, OS);
}
-void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
+Error DWARFYAML::emitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
for (const auto &LineTable : DI.DebugLines) {
- writeInitialLength(LineTable.Length, OS, DI.IsLittleEndian);
- uint64_t SizeOfPrologueLength = LineTable.Length.isDWARF64() ? 8 : 4;
+ writeInitialLength(LineTable.Format, LineTable.Length, OS,
+ DI.IsLittleEndian);
+ uint64_t SizeOfPrologueLength = LineTable.Format == dwarf::DWARF64 ? 8 : 4;
writeInteger((uint16_t)LineTable.Version, OS, DI.IsLittleEndian);
- writeVariableSizedInteger(LineTable.PrologueLength, SizeOfPrologueLength,
- OS, DI.IsLittleEndian);
+ cantFail(writeVariableSizedInteger(
+ LineTable.PrologueLength, SizeOfPrologueLength, OS, DI.IsLittleEndian));
writeInteger((uint8_t)LineTable.MinInstLength, OS, DI.IsLittleEndian);
if (LineTable.Version >= 4)
writeInteger((uint8_t)LineTable.MaxOpsPerInst, OS, DI.IsLittleEndian);
@@ -234,7 +313,7 @@ void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
OS.write('\0');
for (auto File : LineTable.Files)
- EmitFileEntry(OS, File);
+ emitFileEntry(OS, File);
OS.write('\0');
for (auto Op : LineTable.Opcodes) {
@@ -245,11 +324,13 @@ void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
switch (Op.SubOpcode) {
case dwarf::DW_LNE_set_address:
case dwarf::DW_LNE_set_discriminator:
- writeVariableSizedInteger(Op.Data, DI.CompileUnits[0].AddrSize, OS,
- DI.IsLittleEndian);
+ // TODO: Test this error.
+ if (Error Err = writeVariableSizedInteger(
+ Op.Data, DI.CompileUnits[0].AddrSize, OS, DI.IsLittleEndian))
+ return Err;
break;
case dwarf::DW_LNE_define_file:
- EmitFileEntry(OS, Op.FileEntry);
+ emitFileEntry(OS, Op.FileEntry);
break;
case dwarf::DW_LNE_end_sequence:
break;
@@ -290,20 +371,66 @@ void DWARFYAML::EmitDebugLine(raw_ostream &OS, const DWARFYAML::Data &DI) {
}
}
}
+
+ return Error::success();
+}
+
+Error DWARFYAML::emitDebugAddr(raw_ostream &OS, const Data &DI) {
+ for (const AddrTableEntry &TableEntry : DI.DebugAddr) {
+ uint8_t AddrSize;
+ if (TableEntry.AddrSize)
+ AddrSize = *TableEntry.AddrSize;
+ else
+ AddrSize = DI.Is64BitAddrSize ? 8 : 4;
+
+ uint64_t Length;
+ if (TableEntry.Length)
+ Length = (uint64_t)*TableEntry.Length;
+ else
+ // 2 (version) + 1 (address_size) + 1 (segment_selector_size) = 4
+ Length = 4 + (AddrSize + TableEntry.SegSelectorSize) *
+ TableEntry.SegAddrPairs.size();
+
+ writeInitialLength(TableEntry.Format, Length, OS, DI.IsLittleEndian);
+ writeInteger((uint16_t)TableEntry.Version, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)AddrSize, OS, DI.IsLittleEndian);
+ writeInteger((uint8_t)TableEntry.SegSelectorSize, OS, DI.IsLittleEndian);
+
+ for (const SegAddrPair &Pair : TableEntry.SegAddrPairs) {
+ if (TableEntry.SegSelectorSize != 0)
+ if (Error Err = writeVariableSizedInteger(Pair.Segment,
+ TableEntry.SegSelectorSize,
+ OS, DI.IsLittleEndian))
+ return createStringError(errc::not_supported,
+ "unable to write debug_addr segment: %s",
+ toString(std::move(Err)).c_str());
+ if (AddrSize != 0)
+ if (Error Err = writeVariableSizedInteger(Pair.Address, AddrSize, OS,
+ DI.IsLittleEndian))
+ return createStringError(errc::not_supported,
+ "unable to write debug_addr address: %s",
+ toString(std::move(Err)).c_str());
+ }
+ }
+
+ return Error::success();
}
-using EmitFuncType = void (*)(raw_ostream &, const DWARFYAML::Data &);
+using EmitFuncType = Error (*)(raw_ostream &, const DWARFYAML::Data &);
-static void
-EmitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc,
+static Error
+emitDebugSectionImpl(const DWARFYAML::Data &DI, EmitFuncType EmitFunc,
StringRef Sec,
StringMap<std::unique_ptr<MemoryBuffer>> &OutputBuffers) {
std::string Data;
raw_string_ostream DebugInfoStream(Data);
- EmitFunc(DebugInfoStream, DI);
+ if (Error Err = EmitFunc(DebugInfoStream, DI))
+ return Err;
DebugInfoStream.flush();
if (!Data.empty())
OutputBuffers[Sec] = MemoryBuffer::getMemBufferCopy(Data);
+
+ return Error::success();
}
namespace {
@@ -313,69 +440,84 @@ class DIEFixupVisitor : public DWARFYAML::Visitor {
public:
DIEFixupVisitor(DWARFYAML::Data &DI) : DWARFYAML::Visitor(DI){};
-private:
- virtual void onStartCompileUnit(DWARFYAML::Unit &CU) {
+protected:
+ void onStartCompileUnit(DWARFYAML::Unit &CU) override {
// Size of the unit header, excluding the length field itself.
Length = CU.Version >= 5 ? 8 : 7;
}
- virtual void onEndCompileUnit(DWARFYAML::Unit &CU) {
- CU.Length.setLength(Length);
- }
+ void onEndCompileUnit(DWARFYAML::Unit &CU) override { CU.Length = Length; }
- virtual void onStartDIE(DWARFYAML::Unit &CU, DWARFYAML::Entry &DIE) {
+ void onStartDIE(DWARFYAML::Unit &CU, DWARFYAML::Entry &DIE) override {
Length += getULEB128Size(DIE.AbbrCode);
}
- virtual void onValue(const uint8_t U) { Length += 1; }
- virtual void onValue(const uint16_t U) { Length += 2; }
- virtual void onValue(const uint32_t U) { Length += 4; }
- virtual void onValue(const uint64_t U, const bool LEB = false) {
+ void onValue(const uint8_t U) override { Length += 1; }
+ void onValue(const uint16_t U) override { Length += 2; }
+ void onValue(const uint32_t U) override { Length += 4; }
+ void onValue(const uint64_t U, const bool LEB = false) override {
if (LEB)
Length += getULEB128Size(U);
else
Length += 8;
}
- virtual void onValue(const int64_t S, const bool LEB = false) {
+ void onValue(const int64_t S, const bool LEB = false) override {
if (LEB)
Length += getSLEB128Size(S);
else
Length += 8;
}
- virtual void onValue(const StringRef String) { Length += String.size() + 1; }
+ void onValue(const StringRef String) override { Length += String.size() + 1; }
- virtual void onValue(const MemoryBufferRef MBR) {
+ void onValue(const MemoryBufferRef MBR) override {
Length += MBR.getBufferSize();
}
};
} // namespace
Expected<StringMap<std::unique_ptr<MemoryBuffer>>>
-DWARFYAML::EmitDebugSections(StringRef YAMLString, bool ApplyFixups,
+DWARFYAML::emitDebugSections(StringRef YAMLString, bool ApplyFixups,
bool IsLittleEndian) {
- yaml::Input YIn(YAMLString);
+ auto CollectDiagnostic = [](const SMDiagnostic &Diag, void *DiagContext) {
+ *static_cast<SMDiagnostic *>(DiagContext) = Diag;
+ };
+
+ SMDiagnostic GeneratedDiag;
+ yaml::Input YIn(YAMLString, /*Ctxt=*/nullptr, CollectDiagnostic,
+ &GeneratedDiag);
DWARFYAML::Data DI;
DI.IsLittleEndian = IsLittleEndian;
YIn >> DI;
if (YIn.error())
- return errorCodeToError(YIn.error());
+ return createStringError(YIn.error(), GeneratedDiag.getMessage());
if (ApplyFixups) {
DIEFixupVisitor DIFixer(DI);
- DIFixer.traverseDebugInfo();
+ if (Error Err = DIFixer.traverseDebugInfo())
+ return std::move(Err);
}
StringMap<std::unique_ptr<MemoryBuffer>> DebugSections;
- EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugInfo, "debug_info",
- DebugSections);
- EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugLine, "debug_line",
- DebugSections);
- EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugStr, "debug_str",
- DebugSections);
- EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugAbbrev, "debug_abbrev",
- DebugSections);
- EmitDebugSectionImpl(DI, &DWARFYAML::EmitDebugAranges, "debug_aranges",
- DebugSections);
+ Error Err = emitDebugSectionImpl(DI, &DWARFYAML::emitDebugInfo, "debug_info",
+ DebugSections);
+ Err = joinErrors(std::move(Err),
+ emitDebugSectionImpl(DI, &DWARFYAML::emitDebugLine,
+ "debug_line", DebugSections));
+ Err = joinErrors(std::move(Err),
+ emitDebugSectionImpl(DI, &DWARFYAML::emitDebugStr,
+ "debug_str", DebugSections));
+ Err = joinErrors(std::move(Err),
+ emitDebugSectionImpl(DI, &DWARFYAML::emitDebugAbbrev,
+ "debug_abbrev", DebugSections));
+ Err = joinErrors(std::move(Err),
+ emitDebugSectionImpl(DI, &DWARFYAML::emitDebugAranges,
+ "debug_aranges", DebugSections));
+ Err = joinErrors(std::move(Err),
+ emitDebugSectionImpl(DI, &DWARFYAML::emitDebugRanges,
+ "debug_ranges", DebugSections));
+
+ if (Err)
+ return std::move(Err);
return std::move(DebugSections);
}
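
Aside (not part of the patch): a short standalone illustration of the joinErrors() aggregation used above. Each step returns an llvm::Error; all failures are folded into a single Error and reported together once at the end, instead of stopping at the first one. The step() helper and the section names passed to it are invented for the example; joinErrors, createStringError and toString are real LLVMSupport APIs.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

static Error step(StringRef Name, bool Fail) {
  if (Fail)
    return createStringError(inconvertibleErrorCode(), "step '%s' failed",
                             Name.str().c_str());
  return Error::success();
}

int main() {
  Error Err = step("debug_info", /*Fail=*/false);
  Err = joinErrors(std::move(Err), step("debug_line", /*Fail=*/true));
  Err = joinErrors(std::move(Err), step("debug_ranges", /*Fail=*/true));

  // Both failures are preserved and printed together.
  if (Err)
    errs() << toString(std::move(Err)) << "\n";
}
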
diff --git a/llvm/lib/ObjectYAML/DWARFVisitor.cpp b/llvm/lib/ObjectYAML/DWARFVisitor.cpp
index ecb5967ac532..a2dd37b5fe32 100644
--- a/llvm/lib/ObjectYAML/DWARFVisitor.cpp
+++ b/llvm/lib/ObjectYAML/DWARFVisitor.cpp
@@ -9,7 +9,10 @@
//===----------------------------------------------------------------------===//
#include "DWARFVisitor.h"
+#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
using namespace llvm;
@@ -34,7 +37,7 @@ void DWARFYAML::VisitorImpl<T>::onVariableSizeValue(uint64_t U, unsigned Size) {
}
static unsigned getOffsetSize(const DWARFYAML::Unit &Unit) {
- return Unit.Length.isDWARF64() ? 8 : 4;
+ return Unit.Format == dwarf::DWARF64 ? 8 : 4;
}
static unsigned getRefSize(const DWARFYAML::Unit &Unit) {
@@ -43,16 +46,24 @@ static unsigned getRefSize(const DWARFYAML::Unit &Unit) {
return getOffsetSize(Unit);
}
-template <typename T> void DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
+template <typename T> Error DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
for (auto &Unit : DebugInfo.CompileUnits) {
onStartCompileUnit(Unit);
- auto FirstAbbrevCode = Unit.Entries[0].AbbrCode;
+ if (Unit.Entries.empty())
+ continue;
for (auto &Entry : Unit.Entries) {
onStartDIE(Unit, Entry);
- if (Entry.AbbrCode == 0u)
+ uint32_t AbbrCode = Entry.AbbrCode;
+ if (AbbrCode == 0 || Entry.Values.empty())
continue;
- auto &Abbrev = DebugInfo.AbbrevDecls[Entry.AbbrCode - FirstAbbrevCode];
+
+ if (AbbrCode > DebugInfo.AbbrevDecls.size())
+ return createStringError(
+ errc::invalid_argument,
+ "abbrev code must be less than or equal to the number of "
+ "entries in abbreviation table");
+ const DWARFYAML::Abbrev &Abbrev = DebugInfo.AbbrevDecls[AbbrCode - 1];
auto FormVal = Entry.Values.begin();
auto AbbrForm = Abbrev.Attributes.begin();
for (;
@@ -105,6 +116,12 @@ template <typename T> void DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
""));
break;
}
+ case dwarf::DW_FORM_strx:
+ case dwarf::DW_FORM_addrx:
+ case dwarf::DW_FORM_rnglistx:
+ case dwarf::DW_FORM_loclistx:
+ onValue((uint64_t)FormVal->Value, /*LEB=*/true);
+ break;
case dwarf::DW_FORM_data1:
case dwarf::DW_FORM_ref1:
case dwarf::DW_FORM_flag:
@@ -170,6 +187,8 @@ template <typename T> void DWARFYAML::VisitorImpl<T>::traverseDebugInfo() {
}
onEndCompileUnit(Unit);
}
+
+ return Error::success();
}
// Explicitly instantiate the two template expansions.
diff --git a/llvm/lib/ObjectYAML/DWARFVisitor.h b/llvm/lib/ObjectYAML/DWARFVisitor.h
index 50e88aa7a26b..3b2c4303c7f7 100644
--- a/llvm/lib/ObjectYAML/DWARFVisitor.h
+++ b/llvm/lib/ObjectYAML/DWARFVisitor.h
@@ -16,6 +16,7 @@
#include "llvm/Support/MemoryBuffer.h"
namespace llvm {
+class Error;
namespace DWARFYAML {
@@ -68,7 +69,7 @@ public:
virtual ~VisitorImpl() {}
- void traverseDebugInfo();
+ Error traverseDebugInfo();
private:
void onVariableSizeValue(uint64_t U, unsigned Size);
diff --git a/llvm/lib/ObjectYAML/DWARFYAML.cpp b/llvm/lib/ObjectYAML/DWARFYAML.cpp
index bb3b1422eb62..bedf31dc8179 100644
--- a/llvm/lib/ObjectYAML/DWARFYAML.cpp
+++ b/llvm/lib/ObjectYAML/DWARFYAML.cpp
@@ -12,38 +12,69 @@
//===----------------------------------------------------------------------===//
#include "llvm/ObjectYAML/DWARFYAML.h"
+#include "llvm/BinaryFormat/Dwarf.h"
namespace llvm {
bool DWARFYAML::Data::isEmpty() const {
- return 0 == DebugStrings.size() + AbbrevDecls.size();
+ return DebugStrings.empty() && AbbrevDecls.empty() && ARanges.empty() &&
+ DebugRanges.empty() && !PubNames && !PubTypes && !GNUPubNames &&
+ !GNUPubTypes && CompileUnits.empty() && DebugLines.empty();
+}
+
+SetVector<StringRef> DWARFYAML::Data::getUsedSectionNames() const {
+ SetVector<StringRef> SecNames;
+ if (!DebugStrings.empty())
+ SecNames.insert("debug_str");
+ if (!ARanges.empty())
+ SecNames.insert("debug_aranges");
+ if (!DebugRanges.empty())
+ SecNames.insert("debug_ranges");
+ if (!DebugLines.empty())
+ SecNames.insert("debug_line");
+ if (!DebugAddr.empty())
+ SecNames.insert("debug_addr");
+ if (!AbbrevDecls.empty())
+ SecNames.insert("debug_abbrev");
+ if (!CompileUnits.empty())
+ SecNames.insert("debug_info");
+ if (PubNames)
+ SecNames.insert("debug_pubnames");
+ if (PubTypes)
+ SecNames.insert("debug_pubtypes");
+ if (GNUPubNames)
+ SecNames.insert("debug_gnu_pubnames");
+ if (GNUPubTypes)
+ SecNames.insert("debug_gnu_pubtypes");
+ return SecNames;
}
namespace yaml {
void MappingTraits<DWARFYAML::Data>::mapping(IO &IO, DWARFYAML::Data &DWARF) {
- auto oldContext = IO.getContext();
- IO.setContext(&DWARF);
+ void *OldContext = IO.getContext();
+ DWARFYAML::DWARFContext DWARFCtx;
+ IO.setContext(&DWARFCtx);
IO.mapOptional("debug_str", DWARF.DebugStrings);
IO.mapOptional("debug_abbrev", DWARF.AbbrevDecls);
if (!DWARF.ARanges.empty() || !IO.outputting())
IO.mapOptional("debug_aranges", DWARF.ARanges);
- if (!DWARF.PubNames.Entries.empty() || !IO.outputting())
- IO.mapOptional("debug_pubnames", DWARF.PubNames);
- if (!DWARF.PubTypes.Entries.empty() || !IO.outputting())
- IO.mapOptional("debug_pubtypes", DWARF.PubTypes);
- if (!DWARF.GNUPubNames.Entries.empty() || !IO.outputting())
- IO.mapOptional("debug_gnu_pubnames", DWARF.GNUPubNames);
- if (!DWARF.GNUPubTypes.Entries.empty() || !IO.outputting())
- IO.mapOptional("debug_gnu_pubtypes", DWARF.GNUPubTypes);
+ if (!DWARF.DebugRanges.empty() || !IO.outputting())
+ IO.mapOptional("debug_ranges", DWARF.DebugRanges);
+ IO.mapOptional("debug_pubnames", DWARF.PubNames);
+ IO.mapOptional("debug_pubtypes", DWARF.PubTypes);
+ DWARFCtx.IsGNUPubSec = true;
+ IO.mapOptional("debug_gnu_pubnames", DWARF.GNUPubNames);
+ IO.mapOptional("debug_gnu_pubtypes", DWARF.GNUPubTypes);
IO.mapOptional("debug_info", DWARF.CompileUnits);
IO.mapOptional("debug_line", DWARF.DebugLines);
- IO.setContext(&oldContext);
+ IO.mapOptional("debug_addr", DWARF.DebugAddr);
+ IO.setContext(OldContext);
}
void MappingTraits<DWARFYAML::Abbrev>::mapping(IO &IO,
DWARFYAML::Abbrev &Abbrev) {
- IO.mapRequired("Code", Abbrev.Code);
+ IO.mapOptional("Code", Abbrev.Code);
IO.mapRequired("Tag", Abbrev.Tag);
IO.mapRequired("Children", Abbrev.Children);
IO.mapRequired("Attributes", Abbrev.Attributes);
@@ -64,38 +95,48 @@ void MappingTraits<DWARFYAML::ARangeDescriptor>::mapping(
}
void MappingTraits<DWARFYAML::ARange>::mapping(IO &IO,
- DWARFYAML::ARange &Range) {
- IO.mapRequired("Length", Range.Length);
- IO.mapRequired("Version", Range.Version);
- IO.mapRequired("CuOffset", Range.CuOffset);
- IO.mapRequired("AddrSize", Range.AddrSize);
- IO.mapRequired("SegSize", Range.SegSize);
- IO.mapRequired("Descriptors", Range.Descriptors);
+ DWARFYAML::ARange &ARange) {
+ IO.mapOptional("Format", ARange.Format, dwarf::DWARF32);
+ IO.mapRequired("Length", ARange.Length);
+ IO.mapRequired("Version", ARange.Version);
+ IO.mapRequired("CuOffset", ARange.CuOffset);
+ IO.mapRequired("AddrSize", ARange.AddrSize);
+ IO.mapRequired("SegSize", ARange.SegSize);
+ IO.mapRequired("Descriptors", ARange.Descriptors);
+}
+
+void MappingTraits<DWARFYAML::RangeEntry>::mapping(
+ IO &IO, DWARFYAML::RangeEntry &Descriptor) {
+ IO.mapRequired("LowOffset", Descriptor.LowOffset);
+ IO.mapRequired("HighOffset", Descriptor.HighOffset);
+}
+
+void MappingTraits<DWARFYAML::Ranges>::mapping(IO &IO,
+ DWARFYAML::Ranges &DebugRanges) {
+ IO.mapOptional("Offset", DebugRanges.Offset);
+ IO.mapOptional("AddrSize", DebugRanges.AddrSize);
+ IO.mapRequired("Entries", DebugRanges.Entries);
}
void MappingTraits<DWARFYAML::PubEntry>::mapping(IO &IO,
DWARFYAML::PubEntry &Entry) {
IO.mapRequired("DieOffset", Entry.DieOffset);
- if (reinterpret_cast<DWARFYAML::PubSection *>(IO.getContext())->IsGNUStyle)
+ if (static_cast<DWARFYAML::DWARFContext *>(IO.getContext())->IsGNUPubSec)
IO.mapRequired("Descriptor", Entry.Descriptor);
IO.mapRequired("Name", Entry.Name);
}
void MappingTraits<DWARFYAML::PubSection>::mapping(
IO &IO, DWARFYAML::PubSection &Section) {
- auto OldContext = IO.getContext();
- IO.setContext(&Section);
-
IO.mapRequired("Length", Section.Length);
IO.mapRequired("Version", Section.Version);
IO.mapRequired("UnitOffset", Section.UnitOffset);
IO.mapRequired("UnitSize", Section.UnitSize);
IO.mapRequired("Entries", Section.Entries);
-
- IO.setContext(OldContext);
}
void MappingTraits<DWARFYAML::Unit>::mapping(IO &IO, DWARFYAML::Unit &Unit) {
+ IO.mapOptional("Format", Unit.Format, dwarf::DWARF32);
IO.mapRequired("Length", Unit.Length);
IO.mapRequired("Version", Unit.Version);
if (Unit.Version >= 5)
@@ -147,6 +188,7 @@ void MappingTraits<DWARFYAML::LineTableOpcode>::mapping(
void MappingTraits<DWARFYAML::LineTable>::mapping(
IO &IO, DWARFYAML::LineTable &LineTable) {
+ IO.mapOptional("Format", LineTable.Format, dwarf::DWARF32);
IO.mapRequired("Length", LineTable.Length);
IO.mapRequired("Version", LineTable.Version);
IO.mapRequired("PrologueLength", LineTable.PrologueLength);
@@ -163,6 +205,22 @@ void MappingTraits<DWARFYAML::LineTable>::mapping(
IO.mapRequired("Opcodes", LineTable.Opcodes);
}
+void MappingTraits<DWARFYAML::SegAddrPair>::mapping(
+ IO &IO, DWARFYAML::SegAddrPair &SegAddrPair) {
+ IO.mapOptional("Segment", SegAddrPair.Segment, 0);
+ IO.mapOptional("Address", SegAddrPair.Address, 0);
+}
+
+void MappingTraits<DWARFYAML::AddrTableEntry>::mapping(
+ IO &IO, DWARFYAML::AddrTableEntry &AddrTable) {
+ IO.mapOptional("Format", AddrTable.Format, dwarf::DWARF32);
+ IO.mapOptional("Length", AddrTable.Length);
+ IO.mapRequired("Version", AddrTable.Version);
+ IO.mapOptional("AddressSize", AddrTable.AddrSize);
+ IO.mapOptional("SegmentSelectorSize", AddrTable.SegSelectorSize, 0);
+ IO.mapOptional("Entries", AddrTable.SegAddrPairs);
+}
+
void MappingTraits<DWARFYAML::InitialLength>::mapping(
IO &IO, DWARFYAML::InitialLength &InitialLength) {
IO.mapRequired("TotalLength", InitialLength.TotalLength);
diff --git a/llvm/lib/ObjectYAML/ELFEmitter.cpp b/llvm/lib/ObjectYAML/ELFEmitter.cpp
index ee7d5f616a73..f9f2f128e2e8 100644
--- a/llvm/lib/ObjectYAML/ELFEmitter.cpp
+++ b/llvm/lib/ObjectYAML/ELFEmitter.cpp
@@ -13,13 +13,18 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/StringTableBuilder.h"
#include "llvm/Object/ELFObjectFile.h"
+#include "llvm/ObjectYAML/DWARFEmitter.h"
+#include "llvm/ObjectYAML/DWARFYAML.h"
#include "llvm/ObjectYAML/ELFYAML.h"
#include "llvm/ObjectYAML/yaml2obj.h"
#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/WithColor.h"
@@ -31,33 +36,94 @@ using namespace llvm;
// This class is used to build up a contiguous binary blob while keeping
// track of an offset in the output (which notionally begins at
// `InitialOffset`).
+// The blob might be limited to an arbitrary size. Once that limit is reached,
+// further writes are ignored and the error condition is remembered.
+// Such an approach allows us to simplify the code by delaying error reporting
+// and doing it at a convenient time.
namespace {
class ContiguousBlobAccumulator {
const uint64_t InitialOffset;
+ const uint64_t MaxSize;
+
SmallVector<char, 128> Buf;
raw_svector_ostream OS;
+ Error ReachedLimitErr = Error::success();
+
+ bool checkLimit(uint64_t Size) {
+ if (!ReachedLimitErr && getOffset() + Size <= MaxSize)
+ return true;
+ if (!ReachedLimitErr)
+ ReachedLimitErr = createStringError(errc::invalid_argument,
+ "reached the output size limit");
+ return false;
+ }
public:
- ContiguousBlobAccumulator(uint64_t InitialOffset_)
- : InitialOffset(InitialOffset_), Buf(), OS(Buf) {}
+ ContiguousBlobAccumulator(uint64_t BaseOffset, uint64_t SizeLimit)
+ : InitialOffset(BaseOffset), MaxSize(SizeLimit), OS(Buf) {}
+
+ uint64_t tell() const { return OS.tell(); }
+ uint64_t getOffset() const { return InitialOffset + OS.tell(); }
+ void writeBlobToStream(raw_ostream &Out) const { Out << OS.str(); }
- template <class Integer>
- raw_ostream &getOSAndAlignedOffset(Integer &Offset, unsigned Align) {
- Offset = padToAlignment(Align);
- return OS;
+ Error takeLimitError() {
+ // Request to write 0 bytes to check we did not reach the limit.
+ checkLimit(0);
+ return std::move(ReachedLimitErr);
}
/// \returns The new offset.
uint64_t padToAlignment(unsigned Align) {
- if (Align == 0)
- Align = 1;
- uint64_t CurrentOffset = InitialOffset + OS.tell();
- uint64_t AlignedOffset = alignTo(CurrentOffset, Align);
- OS.write_zeros(AlignedOffset - CurrentOffset);
- return AlignedOffset; // == CurrentOffset;
+ uint64_t CurrentOffset = getOffset();
+ if (ReachedLimitErr)
+ return CurrentOffset;
+
+ uint64_t AlignedOffset = alignTo(CurrentOffset, Align == 0 ? 1 : Align);
+ uint64_t PaddingSize = AlignedOffset - CurrentOffset;
+ if (!checkLimit(PaddingSize))
+ return CurrentOffset;
+
+ writeZeros(PaddingSize);
+ return AlignedOffset;
+ }
+
+ raw_ostream *getRawOS(uint64_t Size) {
+ if (checkLimit(Size))
+ return &OS;
+ return nullptr;
+ }
+
+ void writeAsBinary(const yaml::BinaryRef &Bin, uint64_t N = UINT64_MAX) {
+ if (!checkLimit(Bin.binary_size()))
+ return;
+ Bin.writeAsBinary(OS, N);
+ }
+
+ void writeZeros(uint64_t Num) {
+ if (checkLimit(Num))
+ OS.write_zeros(Num);
+ }
+
+ void write(const char *Ptr, size_t Size) {
+ if (checkLimit(Size))
+ OS.write(Ptr, Size);
+ }
+
+ void write(unsigned char C) {
+ if (checkLimit(1))
+ OS.write(C);
+ }
+
+ unsigned writeULEB128(uint64_t Val) {
+ if (!checkLimit(sizeof(uint64_t)))
+ return 0;
+ return encodeULEB128(Val, OS);
}
- void writeBlobToStream(raw_ostream &Out) { Out << OS.str(); }
+ template <typename T> void write(T Val, support::endianness E) {
+ if (checkLimit(sizeof(T)))
+ support::endian::write<T>(OS, Val, E);
+ }
};
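
Aside (not part of the patch): to make the deferred-error idea concrete, here is a minimal standalone sketch of the same pattern using only standard-library types, with all names invented. Once the size limit is exceeded, a single error is remembered, every later write becomes a no-op, and the caller collects the error once, at a convenient point, via takeLimitError().

#include <cstdint>
#include <iostream>
#include <optional>
#include <string>
#include <utility>

class LimitedBuffer {
  std::string Buf;
  const uint64_t MaxSize;
  std::optional<std::string> DeferredError;

  bool checkLimit(uint64_t Size) {
    if (!DeferredError && Buf.size() + Size <= MaxSize)
      return true;
    if (!DeferredError)
      DeferredError = "reached the output size limit";
    return false; // drop this write and every later one
  }

public:
  explicit LimitedBuffer(uint64_t SizeLimit) : MaxSize(SizeLimit) {}

  void write(const std::string &Data) {
    if (checkLimit(Data.size()))
      Buf += Data;
  }

  // Called once at the end to surface the deferred error, if any.
  std::optional<std::string> takeLimitError() {
    checkLimit(0);
    return std::exchange(DeferredError, std::nullopt);
  }
};

int main() {
  LimitedBuffer B(/*SizeLimit=*/8);
  B.write("12345");
  B.write("67890"); // exceeds the limit: dropped, error recorded
  if (auto Err = B.takeLimitError())
    std::cout << *Err << "\n";
}
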
// Used to keep track of section and symbol names, so that in the YAML file
@@ -128,9 +194,13 @@ template <class ELFT> class ELFState {
NameToIdxMap DynSymN2I;
ELFYAML::Object &Doc;
+ StringSet<> ExcludedSectionHeaders;
+
+ uint64_t LocationCounter = 0;
bool HasError = false;
yaml::ErrorHandler ErrHandler;
void reportError(const Twine &Msg);
+ void reportError(Error Err);
std::vector<Elf_Sym> toELFSymbols(ArrayRef<ELFYAML::Symbol> Symbols,
const StringTableBuilder &Strtab);
@@ -151,6 +221,9 @@ template <class ELFT> class ELFState {
StringTableBuilder &STB,
ContiguousBlobAccumulator &CBA,
ELFYAML::Section *YAMLSec);
+ void initDWARFSectionHeader(Elf_Shdr &SHeader, StringRef Name,
+ ContiguousBlobAccumulator &CBA,
+ ELFYAML::Section *YAMLSec);
void setProgramHeaderLayout(std::vector<Elf_Phdr> &PHeaders,
std::vector<Elf_Shdr> &SHeaders);
@@ -159,7 +232,10 @@ template <class ELFT> class ELFState {
ArrayRef<typename ELFT::Shdr> SHeaders);
void finalizeStrings();
- void writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream &OS);
+ void writeELFHeader(raw_ostream &OS, uint64_t SHOff);
+ void writeSectionContent(Elf_Shdr &SHeader,
+ const ELFYAML::NoBitsSection &Section,
+ ContiguousBlobAccumulator &CBA);
void writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::RawContentSection &Section,
ContiguousBlobAccumulator &CBA);
@@ -210,14 +286,27 @@ template <class ELFT> class ELFState {
void writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::DependentLibrariesSection &Section,
ContiguousBlobAccumulator &CBA);
+ void writeSectionContent(Elf_Shdr &SHeader,
+ const ELFYAML::CallGraphProfileSection &Section,
+ ContiguousBlobAccumulator &CBA);
void writeFill(ELFYAML::Fill &Fill, ContiguousBlobAccumulator &CBA);
ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH);
+ void assignSectionAddress(Elf_Shdr &SHeader, ELFYAML::Section *YAMLSec);
+
+ DenseMap<StringRef, size_t> buildSectionHeaderReorderMap();
+
+ BumpPtrAllocator StringAlloc;
+ uint64_t alignToOffset(ContiguousBlobAccumulator &CBA, uint64_t Align,
+ llvm::Optional<llvm::yaml::Hex64> Offset);
+
+ uint64_t getSectionNameOffset(StringRef Name);
+
public:
static bool writeELF(raw_ostream &OS, ELFYAML::Object &Doc,
- yaml::ErrorHandler EH);
+ yaml::ErrorHandler EH, uint64_t MaxSize);
};
} // end anonymous namespace
@@ -235,11 +324,6 @@ template <class ELFT>
ELFState<ELFT>::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH)
: Doc(D), ErrHandler(EH) {
std::vector<ELFYAML::Section *> Sections = Doc.getSections();
- StringSet<> DocSections;
- for (const ELFYAML::Section *Sec : Sections)
- if (!Sec->Name.empty())
- DocSections.insert(Sec->Name);
-
// Insert SHT_NULL section implicitly when it is not defined in YAML.
if (Sections.empty() || Sections.front()->Type != ELF::SHT_NULL)
Doc.Chunks.insert(
@@ -247,14 +331,36 @@ ELFState<ELFT>::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH)
std::make_unique<ELFYAML::Section>(
ELFYAML::Chunk::ChunkKind::RawContent, /*IsImplicit=*/true));
+ // We add a technical suffix for each unnamed section/fill. It does not affect
+ // the output, but allows us to map them by name in the code and report better
+ // error messages.
+ StringSet<> DocSections;
+ for (size_t I = 0; I < Doc.Chunks.size(); ++I) {
+ const std::unique_ptr<ELFYAML::Chunk> &C = Doc.Chunks[I];
+ if (C->Name.empty()) {
+ std::string NewName = ELFYAML::appendUniqueSuffix(
+ /*Name=*/"", "index " + Twine(I));
+ C->Name = StringRef(NewName).copy(StringAlloc);
+ assert(ELFYAML::dropUniqueSuffix(C->Name).empty());
+ }
+
+ if (!DocSections.insert(C->Name).second)
+ reportError("repeated section/fill name: '" + C->Name +
+ "' at YAML section/fill number " + Twine(I));
+ }
+
std::vector<StringRef> ImplicitSections;
+ if (Doc.DynamicSymbols)
+ ImplicitSections.insert(ImplicitSections.end(), {".dynsym", ".dynstr"});
if (Doc.Symbols)
ImplicitSections.push_back(".symtab");
+ if (Doc.DWARF)
+ for (StringRef DebugSecName : Doc.DWARF->getUsedSectionNames()) {
+ std::string SecName = ("." + DebugSecName).str();
+ ImplicitSections.push_back(StringRef(SecName).copy(StringAlloc));
+ }
ImplicitSections.insert(ImplicitSections.end(), {".strtab", ".shstrtab"});
- if (Doc.DynamicSymbols)
- ImplicitSections.insert(ImplicitSections.end(), {".dynsym", ".dynstr"});
-
// Insert placeholders for implicit sections that are not
// defined explicitly in YAML.
for (StringRef SecName : ImplicitSections) {
@@ -269,7 +375,7 @@ ELFState<ELFT>::ELFState(ELFYAML::Object &D, yaml::ErrorHandler EH)
}
template <class ELFT>
-void ELFState<ELFT>::writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream &OS) {
+void ELFState<ELFT>::writeELFHeader(raw_ostream &OS, uint64_t SHOff) {
using namespace llvm::ELF;
Elf_Ehdr Header;
@@ -287,55 +393,134 @@ void ELFState<ELFT>::writeELFHeader(ContiguousBlobAccumulator &CBA, raw_ostream
Header.e_machine = Doc.Header.Machine;
Header.e_version = EV_CURRENT;
Header.e_entry = Doc.Header.Entry;
- Header.e_phoff = Doc.ProgramHeaders.size() ? sizeof(Header) : 0;
Header.e_flags = Doc.Header.Flags;
Header.e_ehsize = sizeof(Elf_Ehdr);
- Header.e_phentsize = Doc.ProgramHeaders.size() ? sizeof(Elf_Phdr) : 0;
- Header.e_phnum = Doc.ProgramHeaders.size();
-
- Header.e_shentsize =
- Doc.Header.SHEntSize ? (uint16_t)*Doc.Header.SHEntSize : sizeof(Elf_Shdr);
- // Immediately following the ELF header and program headers.
- // Align the start of the section header and write the ELF header.
- uint64_t SHOff;
- CBA.getOSAndAlignedOffset(SHOff, sizeof(typename ELFT::uint));
- Header.e_shoff =
- Doc.Header.SHOff ? typename ELFT::uint(*Doc.Header.SHOff) : SHOff;
- Header.e_shnum =
- Doc.Header.SHNum ? (uint16_t)*Doc.Header.SHNum : Doc.getSections().size();
- Header.e_shstrndx = Doc.Header.SHStrNdx ? (uint16_t)*Doc.Header.SHStrNdx
- : SN2I.get(".shstrtab");
+
+ if (Doc.Header.EPhOff)
+ Header.e_phoff = *Doc.Header.EPhOff;
+ else if (!Doc.ProgramHeaders.empty())
+ Header.e_phoff = sizeof(Header);
+ else
+ Header.e_phoff = 0;
+
+ if (Doc.Header.EPhEntSize)
+ Header.e_phentsize = *Doc.Header.EPhEntSize;
+ else if (!Doc.ProgramHeaders.empty())
+ Header.e_phentsize = sizeof(Elf_Phdr);
+ else
+ Header.e_phentsize = 0;
+
+ if (Doc.Header.EPhNum)
+ Header.e_phnum = *Doc.Header.EPhNum;
+ else if (!Doc.ProgramHeaders.empty())
+ Header.e_phnum = Doc.ProgramHeaders.size();
+ else
+ Header.e_phnum = 0;
+
+ Header.e_shentsize = Doc.Header.EShEntSize ? (uint16_t)*Doc.Header.EShEntSize
+ : sizeof(Elf_Shdr);
+
+ const bool NoShdrs =
+ Doc.SectionHeaders && Doc.SectionHeaders->NoHeaders.getValueOr(false);
+
+ if (Doc.Header.EShOff)
+ Header.e_shoff = *Doc.Header.EShOff;
+ else if (NoShdrs)
+ Header.e_shoff = 0;
+ else
+ Header.e_shoff = SHOff;
+
+ if (Doc.Header.EShNum)
+ Header.e_shnum = *Doc.Header.EShNum;
+ else if (!Doc.SectionHeaders)
+ Header.e_shnum = Doc.getSections().size();
+ else if (NoShdrs)
+ Header.e_shnum = 0;
+ else
+ Header.e_shnum =
+ (Doc.SectionHeaders->Sections ? Doc.SectionHeaders->Sections->size()
+ : 0) +
+ /*Null section*/ 1;
+
+ if (Doc.Header.EShStrNdx)
+ Header.e_shstrndx = *Doc.Header.EShStrNdx;
+ else if (NoShdrs || ExcludedSectionHeaders.count(".shstrtab"))
+ Header.e_shstrndx = 0;
+ else
+ Header.e_shstrndx = SN2I.get(".shstrtab");
OS.write((const char *)&Header, sizeof(Header));
}
template <class ELFT>
void ELFState<ELFT>::initProgramHeaders(std::vector<Elf_Phdr> &PHeaders) {
- for (const auto &YamlPhdr : Doc.ProgramHeaders) {
+ DenseMap<StringRef, ELFYAML::Fill *> NameToFill;
+ for (const std::unique_ptr<ELFYAML::Chunk> &D : Doc.Chunks)
+ if (auto S = dyn_cast<ELFYAML::Fill>(D.get()))
+ NameToFill[S->Name] = S;
+
+ std::vector<ELFYAML::Section *> Sections = Doc.getSections();
+ for (ELFYAML::ProgramHeader &YamlPhdr : Doc.ProgramHeaders) {
Elf_Phdr Phdr;
+ zero(Phdr);
Phdr.p_type = YamlPhdr.Type;
Phdr.p_flags = YamlPhdr.Flags;
Phdr.p_vaddr = YamlPhdr.VAddr;
Phdr.p_paddr = YamlPhdr.PAddr;
PHeaders.push_back(Phdr);
+
+ // Map Sections list to corresponding chunks.
+ for (const ELFYAML::SectionName &SecName : YamlPhdr.Sections) {
+ if (ELFYAML::Fill *Fill = NameToFill.lookup(SecName.Section)) {
+ YamlPhdr.Chunks.push_back(Fill);
+ continue;
+ }
+
+ unsigned Index;
+ if (SN2I.lookup(SecName.Section, Index)) {
+ YamlPhdr.Chunks.push_back(Sections[Index]);
+ continue;
+ }
+
+ reportError("unknown section or fill referenced: '" + SecName.Section +
+ "' by program header");
+ }
}
}
template <class ELFT>
unsigned ELFState<ELFT>::toSectionIndex(StringRef S, StringRef LocSec,
StringRef LocSym) {
+ assert(LocSec.empty() || LocSym.empty());
+
unsigned Index;
- if (SN2I.lookup(S, Index) || to_integer(S, Index))
+ if (!SN2I.lookup(S, Index) && !to_integer(S, Index)) {
+ if (!LocSym.empty())
+ reportError("unknown section referenced: '" + S + "' by YAML symbol '" +
+ LocSym + "'");
+ else
+ reportError("unknown section referenced: '" + S + "' by YAML section '" +
+ LocSec + "'");
+ return 0;
+ }
+
+ if (!Doc.SectionHeaders || (Doc.SectionHeaders->NoHeaders &&
+ !Doc.SectionHeaders->NoHeaders.getValue()))
return Index;
- assert(LocSec.empty() || LocSym.empty());
- if (!LocSym.empty())
- reportError("unknown section referenced: '" + S + "' by YAML symbol '" +
- LocSym + "'");
- else
- reportError("unknown section referenced: '" + S + "' by YAML section '" +
- LocSec + "'");
- return 0;
+ assert(!Doc.SectionHeaders->NoHeaders.getValueOr(false) ||
+ !Doc.SectionHeaders->Sections);
+ size_t FirstExcluded =
+ Doc.SectionHeaders->Sections ? Doc.SectionHeaders->Sections->size() : 0;
+ if (Index >= FirstExcluded) {
+ if (LocSym.empty())
+ reportError("unable to link '" + LocSec + "' to excluded section '" + S +
+ "'");
+ else
+ reportError("excluded section referenced: '" + S + "' by symbol '" +
+ LocSym + "'");
+ }
+ return Index;
}
template <class ELFT>
@@ -385,19 +570,53 @@ bool ELFState<ELFT>::initImplicitHeader(ContiguousBlobAccumulator &CBA,
initSymtabSectionHeader(Header, SymtabType::Dynamic, CBA, YAMLSec);
else if (SecName == ".dynstr")
initStrtabSectionHeader(Header, SecName, DotDynstr, CBA, YAMLSec);
- else
+ else if (SecName.startswith(".debug_")) {
+ // If a ".debug_*" section's type is a preserved one, e.g., SHT_DYNAMIC, we
+ // will not treat it as a debug section.
+ if (YAMLSec && !isa<ELFYAML::RawContentSection>(YAMLSec))
+ return false;
+ initDWARFSectionHeader(Header, SecName, CBA, YAMLSec);
+ } else
return false;
+ LocationCounter += Header.sh_size;
+
// Override section fields if requested.
overrideFields<ELFT>(YAMLSec, Header);
return true;
}
+constexpr char SuffixStart = '(';
+constexpr char SuffixEnd = ')';
+
+std::string llvm::ELFYAML::appendUniqueSuffix(StringRef Name,
+ const Twine &Msg) {
+ // Do not add a space when a Name is empty.
+ std::string Ret = Name.empty() ? "" : Name.str() + ' ';
+ return Ret + (Twine(SuffixStart) + Msg + Twine(SuffixEnd)).str();
+}
+
StringRef llvm::ELFYAML::dropUniqueSuffix(StringRef S) {
- size_t SuffixPos = S.rfind(" [");
- if (SuffixPos == StringRef::npos)
+ if (S.empty() || S.back() != SuffixEnd)
return S;
- return S.substr(0, SuffixPos);
+
+ // A special case for empty names. See appendUniqueSuffix() above.
+ size_t SuffixPos = S.rfind(SuffixStart);
+ if (SuffixPos == 0)
+ return "";
+
+ if (SuffixPos == StringRef::npos || S[SuffixPos - 1] != ' ')
+ return S;
+ return S.substr(0, SuffixPos - 1);
+}
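+
+ As an illustration of the suffix convention that appendUniqueSuffix() and
+ dropUniqueSuffix() implement, here is a minimal yaml2obj sketch (the section
+ names and types are invented for the example). The parenthesized suffix only
+ keeps names unique inside the YAML document and is stripped before the name
+ reaches .shstrtab, so both sections below are emitted with the name ".foo".
+
+ --- !ELF
+ FileHeader:
+   Class:   ELFCLASS64
+   Data:    ELFDATA2LSB
+   Type:    ET_REL
+   Machine: EM_X86_64
+ Sections:
+   # Both sections get sh_name pointing at ".foo"; the " (1)"/" (2)"
+   # suffixes exist only to disambiguate them in the YAML description.
+   - Name: '.foo (1)'
+     Type: SHT_PROGBITS
+   - Name: '.foo (2)'
+     Type: SHT_PROGBITS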
+
+template <class ELFT>
+uint64_t ELFState<ELFT>::getSectionNameOffset(StringRef Name) {
+ // If a section is excluded from section headers, we do not save its name in
+ // the string table.
+ if (ExcludedSectionHeaders.count(Name))
+ return 0;
+ return DotShStrtab.getOffset(Name);
}
template <class ELFT>
@@ -407,23 +626,24 @@ void ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
// valid SHN_UNDEF entry since SHT_NULL == 0.
SHeaders.resize(Doc.getSections().size());
- size_t SecNdx = -1;
for (const std::unique_ptr<ELFYAML::Chunk> &D : Doc.Chunks) {
- if (auto S = dyn_cast<ELFYAML::Fill>(D.get())) {
+ if (ELFYAML::Fill *S = dyn_cast<ELFYAML::Fill>(D.get())) {
+ S->Offset = alignToOffset(CBA, /*Align=*/1, S->Offset);
writeFill(*S, CBA);
+ LocationCounter += S->Size;
continue;
}
- ++SecNdx;
ELFYAML::Section *Sec = cast<ELFYAML::Section>(D.get());
- if (SecNdx == 0 && Sec->IsImplicit)
+ bool IsFirstUndefSection = D == Doc.Chunks.front();
+ if (IsFirstUndefSection && Sec->IsImplicit)
continue;
// We have a few sections like string or symbol tables that are usually
// added implicitly to the end. However, if they are explicitly specified
// in the YAML, we need to write them here. This ensures the file offset
// remains correct.
- Elf_Shdr &SHeader = SHeaders[SecNdx];
+ Elf_Shdr &SHeader = SHeaders[SN2I.get(Sec->Name)];
if (initImplicitHeader(CBA, SHeader, Sec->Name,
Sec->IsImplicit ? nullptr : Sec))
continue;
@@ -432,17 +652,23 @@ void ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
"implicit sections should already have been handled above.");
SHeader.sh_name =
- DotShStrtab.getOffset(ELFYAML::dropUniqueSuffix(Sec->Name));
+ getSectionNameOffset(ELFYAML::dropUniqueSuffix(Sec->Name));
SHeader.sh_type = Sec->Type;
if (Sec->Flags)
SHeader.sh_flags = *Sec->Flags;
- SHeader.sh_addr = Sec->Address;
SHeader.sh_addralign = Sec->AddressAlign;
+  // Set the offset for all sections. The SHN_UNDEF section at index 0 only
+  // gets an offset when one is explicitly requested.
+ if (!IsFirstUndefSection || Sec->Offset)
+ SHeader.sh_offset = alignToOffset(CBA, SHeader.sh_addralign, Sec->Offset);
+
+ assignSectionAddress(SHeader, Sec);
+
if (!Sec->Link.empty())
SHeader.sh_link = toSectionIndex(Sec->Link, Sec->Name);
- if (SecNdx == 0) {
+ if (IsFirstUndefSection) {
if (auto RawSec = dyn_cast<ELFYAML::RawContentSection>(Sec)) {
// We do not write any content for special SHN_UNDEF section.
if (RawSec->Size)
@@ -465,11 +691,7 @@ void ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
} else if (auto S = dyn_cast<ELFYAML::MipsABIFlags>(Sec)) {
writeSectionContent(SHeader, *S, CBA);
} else if (auto S = dyn_cast<ELFYAML::NoBitsSection>(Sec)) {
- SHeader.sh_entsize = 0;
- SHeader.sh_size = S->Size;
- // SHT_NOBITS section does not have content
- // so just to setup the section offset.
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
+ writeSectionContent(SHeader, *S, CBA);
} else if (auto S = dyn_cast<ELFYAML::DynamicSection>(Sec)) {
writeSectionContent(SHeader, *S, CBA);
} else if (auto S = dyn_cast<ELFYAML::SymverSection>(Sec)) {
@@ -492,15 +714,40 @@ void ELFState<ELFT>::initSectionHeaders(std::vector<Elf_Shdr> &SHeaders,
writeSectionContent(SHeader, *S, CBA);
} else if (auto S = dyn_cast<ELFYAML::DependentLibrariesSection>(Sec)) {
writeSectionContent(SHeader, *S, CBA);
+ } else if (auto S = dyn_cast<ELFYAML::CallGraphProfileSection>(Sec)) {
+ writeSectionContent(SHeader, *S, CBA);
} else {
llvm_unreachable("Unknown section type");
}
+ LocationCounter += SHeader.sh_size;
+
// Override section fields if requested.
overrideFields<ELFT>(Sec, SHeader);
}
}
+template <class ELFT>
+void ELFState<ELFT>::assignSectionAddress(Elf_Shdr &SHeader,
+ ELFYAML::Section *YAMLSec) {
+ if (YAMLSec && YAMLSec->Address) {
+ SHeader.sh_addr = *YAMLSec->Address;
+ LocationCounter = *YAMLSec->Address;
+ return;
+ }
+
+ // sh_addr represents the address in the memory image of a process. Sections
+ // in a relocatable object file or non-allocatable sections do not need
+ // sh_addr assignment.
+ if (Doc.Header.Type.value == ELF::ET_REL ||
+ !(SHeader.sh_flags & ELF::SHF_ALLOC))
+ return;
+
+ LocationCounter =
+ alignTo(LocationCounter, SHeader.sh_addralign ? SHeader.sh_addralign : 1);
+ SHeader.sh_addr = LocationCounter;
+}
+
static size_t findFirstNonGlobal(ArrayRef<ELFYAML::Symbol> Symbols) {
for (size_t I = 0; I < Symbols.size(); ++I)
if (Symbols[I].Binding.value != ELF::STB_LOCAL)
@@ -508,19 +755,19 @@ static size_t findFirstNonGlobal(ArrayRef<ELFYAML::Symbol> Symbols) {
return Symbols.size();
}
-static uint64_t writeContent(raw_ostream &OS,
+static uint64_t writeContent(ContiguousBlobAccumulator &CBA,
const Optional<yaml::BinaryRef> &Content,
const Optional<llvm::yaml::Hex64> &Size) {
size_t ContentSize = 0;
if (Content) {
- Content->writeAsBinary(OS);
+ CBA.writeAsBinary(*Content);
ContentSize = Content->binary_size();
}
if (!Size)
return ContentSize;
- OS.write_zeros(*Size - ContentSize);
+ CBA.writeZeros(*Size - ContentSize);
return *Size;
}
@@ -538,8 +785,8 @@ ELFState<ELFT>::toELFSymbols(ArrayRef<ELFYAML::Symbol> Symbols,
// If NameIndex, which contains the name offset, is explicitly specified, we
// use it. This is useful for preparing broken objects. Otherwise, we add
// the specified Name to the string table builder to get its offset.
- if (Sym.NameIndex)
- Symbol.st_name = *Sym.NameIndex;
+ if (Sym.StName)
+ Symbol.st_name = *Sym.StName;
else if (!Sym.Name.empty())
Symbol.st_name = Strtab.getOffset(ELFYAML::dropUniqueSuffix(Sym.Name));
@@ -588,7 +835,7 @@ void ELFState<ELFT>::initSymtabSectionHeader(Elf_Shdr &SHeader,
}
zero(SHeader);
- SHeader.sh_name = DotShStrtab.getOffset(IsStatic ? ".symtab" : ".dynsym");
+ SHeader.sh_name = getSectionNameOffset(IsStatic ? ".symtab" : ".dynsym");
if (YAMLSec)
SHeader.sh_type = YAMLSec->Type;
@@ -605,10 +852,13 @@ void ELFState<ELFT>::initSymtabSectionHeader(Elf_Shdr &SHeader,
// added implicitly and we should be able to leave the Link zeroed if
// .dynstr is not defined.
unsigned Link = 0;
- if (IsStatic)
- Link = SN2I.get(".strtab");
- else
- SN2I.lookup(".dynstr", Link);
+ if (IsStatic) {
+ if (!ExcludedSectionHeaders.count(".strtab"))
+ Link = SN2I.get(".strtab");
+ } else {
+ if (!ExcludedSectionHeaders.count(".dynstr"))
+ SN2I.lookup(".dynstr", Link);
+ }
SHeader.sh_link = Link;
}
@@ -625,19 +875,21 @@ void ELFState<ELFT>::initSymtabSectionHeader(Elf_Shdr &SHeader,
? (uint64_t)(*YAMLSec->EntSize)
: sizeof(Elf_Sym);
SHeader.sh_addralign = YAMLSec ? (uint64_t)YAMLSec->AddressAlign : 8;
- SHeader.sh_addr = YAMLSec ? (uint64_t)YAMLSec->Address : 0;
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
+ assignSectionAddress(SHeader, YAMLSec);
+
+ SHeader.sh_offset = alignToOffset(CBA, SHeader.sh_addralign, /*Offset=*/None);
+
if (RawSec && (RawSec->Content || RawSec->Size)) {
assert(Symbols.empty());
- SHeader.sh_size = writeContent(OS, RawSec->Content, RawSec->Size);
+ SHeader.sh_size = writeContent(CBA, RawSec->Content, RawSec->Size);
return;
}
std::vector<Elf_Sym> Syms =
toELFSymbols(Symbols, IsStatic ? DotStrtab : DotDynstr);
- writeArrayData(OS, makeArrayRef(Syms));
- SHeader.sh_size = arrayDataSize(makeArrayRef(Syms));
+ SHeader.sh_size = Syms.size() * sizeof(Elf_Sym);
+ CBA.write((const char *)Syms.data(), SHeader.sh_size);
}
template <class ELFT>
@@ -646,18 +898,20 @@ void ELFState<ELFT>::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name,
ContiguousBlobAccumulator &CBA,
ELFYAML::Section *YAMLSec) {
zero(SHeader);
- SHeader.sh_name = DotShStrtab.getOffset(Name);
+ SHeader.sh_name = getSectionNameOffset(Name);
SHeader.sh_type = YAMLSec ? YAMLSec->Type : ELF::SHT_STRTAB;
SHeader.sh_addralign = YAMLSec ? (uint64_t)YAMLSec->AddressAlign : 1;
ELFYAML::RawContentSection *RawSec =
dyn_cast_or_null<ELFYAML::RawContentSection>(YAMLSec);
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
+ SHeader.sh_offset = alignToOffset(CBA, SHeader.sh_addralign, /*Offset=*/None);
+
if (RawSec && (RawSec->Content || RawSec->Size)) {
- SHeader.sh_size = writeContent(OS, RawSec->Content, RawSec->Size);
+ SHeader.sh_size = writeContent(CBA, RawSec->Content, RawSec->Size);
} else {
- STB.write(OS);
+ if (raw_ostream *OS = CBA.getRawOS(STB.getSize()))
+ STB.write(*OS);
SHeader.sh_size = STB.getSize();
}
@@ -672,10 +926,110 @@ void ELFState<ELFT>::initStrtabSectionHeader(Elf_Shdr &SHeader, StringRef Name,
else if (Name == ".dynstr")
SHeader.sh_flags = ELF::SHF_ALLOC;
- // If the section is explicitly described in the YAML
- // then we want to use its section address.
- if (YAMLSec)
- SHeader.sh_addr = YAMLSec->Address;
+ assignSectionAddress(SHeader, YAMLSec);
+}
+
+static bool shouldEmitDWARF(DWARFYAML::Data &DWARF, StringRef Name) {
+ SetVector<StringRef> DebugSecNames = DWARF.getUsedSectionNames();
+ return Name.consume_front(".") && DebugSecNames.count(Name);
+}
+
+template <class ELFT>
+Expected<uint64_t> emitDWARF(typename ELFT::Shdr &SHeader, StringRef Name,
+ const DWARFYAML::Data &DWARF,
+ ContiguousBlobAccumulator &CBA) {
+ // We are unable to predict the size of debug data, so we request to write 0
+ // bytes. This should always return us an output stream unless CBA is already
+ // in an error state.
+ raw_ostream *OS = CBA.getRawOS(0);
+ if (!OS)
+ return 0;
+
+ uint64_t BeginOffset = CBA.tell();
+ Error Err = Error::success();
+ cantFail(std::move(Err));
+
+ if (Name == ".debug_str")
+ Err = DWARFYAML::emitDebugStr(*OS, DWARF);
+ else if (Name == ".debug_aranges")
+ Err = DWARFYAML::emitDebugAranges(*OS, DWARF);
+ else if (Name == ".debug_ranges")
+ Err = DWARFYAML::emitDebugRanges(*OS, DWARF);
+ else if (Name == ".debug_line")
+ Err = DWARFYAML::emitDebugLine(*OS, DWARF);
+ else if (Name == ".debug_addr")
+ Err = DWARFYAML::emitDebugAddr(*OS, DWARF);
+ else if (Name == ".debug_abbrev")
+ Err = DWARFYAML::emitDebugAbbrev(*OS, DWARF);
+ else if (Name == ".debug_info")
+ Err = DWARFYAML::emitDebugInfo(*OS, DWARF);
+ else if (Name == ".debug_pubnames")
+ Err = DWARFYAML::emitPubSection(*OS, *DWARF.PubNames, DWARF.IsLittleEndian);
+ else if (Name == ".debug_pubtypes")
+ Err = DWARFYAML::emitPubSection(*OS, *DWARF.PubTypes, DWARF.IsLittleEndian);
+ else if (Name == ".debug_gnu_pubnames")
+ Err = DWARFYAML::emitPubSection(*OS, *DWARF.GNUPubNames,
+ DWARF.IsLittleEndian, /*IsGNUStyle=*/true);
+ else if (Name == ".debug_gnu_pubtypes")
+ Err = DWARFYAML::emitPubSection(*OS, *DWARF.GNUPubTypes,
+ DWARF.IsLittleEndian, /*IsGNUStyle=*/true);
+ else
+ llvm_unreachable("unexpected emitDWARF() call");
+
+ if (Err)
+ return std::move(Err);
+
+ return CBA.tell() - BeginOffset;
+}
+
+template <class ELFT>
+void ELFState<ELFT>::initDWARFSectionHeader(Elf_Shdr &SHeader, StringRef Name,
+ ContiguousBlobAccumulator &CBA,
+ ELFYAML::Section *YAMLSec) {
+ zero(SHeader);
+ SHeader.sh_name = getSectionNameOffset(ELFYAML::dropUniqueSuffix(Name));
+ SHeader.sh_type = YAMLSec ? YAMLSec->Type : ELF::SHT_PROGBITS;
+ SHeader.sh_addralign = YAMLSec ? (uint64_t)YAMLSec->AddressAlign : 1;
+ SHeader.sh_offset = alignToOffset(CBA, SHeader.sh_addralign,
+ YAMLSec ? YAMLSec->Offset : None);
+
+ ELFYAML::RawContentSection *RawSec =
+ dyn_cast_or_null<ELFYAML::RawContentSection>(YAMLSec);
+ if (Doc.DWARF && shouldEmitDWARF(*Doc.DWARF, Name)) {
+ if (RawSec && (RawSec->Content || RawSec->Size))
+ reportError("cannot specify section '" + Name +
+ "' contents in the 'DWARF' entry and the 'Content' "
+ "or 'Size' in the 'Sections' entry at the same time");
+ else {
+ if (Expected<uint64_t> ShSizeOrErr =
+ emitDWARF<ELFT>(SHeader, Name, *Doc.DWARF, CBA))
+ SHeader.sh_size = *ShSizeOrErr;
+ else
+ reportError(ShSizeOrErr.takeError());
+ }
+ } else if (RawSec)
+ SHeader.sh_size = writeContent(CBA, RawSec->Content, RawSec->Size);
+ else
+ llvm_unreachable("debug sections can only be initialized via the 'DWARF' "
+ "entry or a RawContentSection");
+
+ if (YAMLSec && YAMLSec->EntSize)
+ SHeader.sh_entsize = *YAMLSec->EntSize;
+ else if (Name == ".debug_str")
+ SHeader.sh_entsize = 1;
+
+ if (RawSec && RawSec->Info)
+ SHeader.sh_info = *RawSec->Info;
+
+ if (YAMLSec && YAMLSec->Flags)
+ SHeader.sh_flags = *YAMLSec->Flags;
+ else if (Name == ".debug_str")
+ SHeader.sh_flags = ELF::SHF_MERGE | ELF::SHF_STRINGS;
+
+ if (YAMLSec && !YAMLSec->Link.empty())
+ SHeader.sh_link = toSectionIndex(YAMLSec->Link, Name);
+
+ assignSectionAddress(SHeader, YAMLSec);
}
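 
  For reference, a hedged sketch of how an implicit debug section could be
  produced through the top-level DWARF entry. The 'DWARF' and 'debug_str' key
  names come from DWARFYAML and are not shown in this hunk, so treat them as
  assumptions rather than a definitive description.
 
  --- !ELF
  FileHeader:
    Class:   ELFCLASS64
    Data:    ELFDATA2LSB
    Type:    ET_EXEC
    Machine: EM_X86_64
  DWARF:
    # Assumed DWARFYAML key; yaml2obj then emits an implicit .debug_str
    # section with SHF_MERGE|SHF_STRINGS and sh_entsize of 1.
    debug_str:
      - 'foo'
      - 'bar'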
template <class ELFT> void ELFState<ELFT>::reportError(const Twine &Msg) {
@@ -683,34 +1037,28 @@ template <class ELFT> void ELFState<ELFT>::reportError(const Twine &Msg) {
HasError = true;
}
+template <class ELFT> void ELFState<ELFT>::reportError(Error Err) {
+ handleAllErrors(std::move(Err), [&](const ErrorInfoBase &Err) {
+ reportError(Err.message());
+ });
+}
+
template <class ELFT>
std::vector<Fragment>
ELFState<ELFT>::getPhdrFragments(const ELFYAML::ProgramHeader &Phdr,
- ArrayRef<typename ELFT::Shdr> SHeaders) {
- DenseMap<StringRef, ELFYAML::Fill *> NameToFill;
- for (const std::unique_ptr<ELFYAML::Chunk> &D : Doc.Chunks)
- if (auto S = dyn_cast<ELFYAML::Fill>(D.get()))
- NameToFill[S->Name] = S;
-
+ ArrayRef<Elf_Shdr> SHeaders) {
std::vector<Fragment> Ret;
- for (const ELFYAML::SectionName &SecName : Phdr.Sections) {
- unsigned Index;
- if (SN2I.lookup(SecName.Section, Index)) {
- const typename ELFT::Shdr &H = SHeaders[Index];
- Ret.push_back({H.sh_offset, H.sh_size, H.sh_type, H.sh_addralign});
- continue;
- }
-
- if (ELFYAML::Fill *Fill = NameToFill.lookup(SecName.Section)) {
- Ret.push_back({Fill->ShOffset, Fill->Size, llvm::ELF::SHT_PROGBITS,
+ for (const ELFYAML::Chunk *C : Phdr.Chunks) {
+ if (const ELFYAML::Fill *F = dyn_cast<ELFYAML::Fill>(C)) {
+ Ret.push_back({*F->Offset, F->Size, llvm::ELF::SHT_PROGBITS,
/*ShAddrAlign=*/1});
continue;
}
- reportError("unknown section or fill referenced: '" + SecName.Section +
- "' by program header");
+ const ELFYAML::Section *S = cast<ELFYAML::Section>(C);
+ const Elf_Shdr &H = SHeaders[SN2I.get(S->Name)];
+ Ret.push_back({H.sh_offset, H.sh_size, H.sh_type, H.sh_addralign});
}
-
return Ret;
}
@@ -721,35 +1069,41 @@ void ELFState<ELFT>::setProgramHeaderLayout(std::vector<Elf_Phdr> &PHeaders,
for (auto &YamlPhdr : Doc.ProgramHeaders) {
Elf_Phdr &PHeader = PHeaders[PhdrIdx++];
std::vector<Fragment> Fragments = getPhdrFragments(YamlPhdr, SHeaders);
+ if (!llvm::is_sorted(Fragments, [](const Fragment &A, const Fragment &B) {
+ return A.Offset < B.Offset;
+ }))
+ reportError("sections in the program header with index " +
+ Twine(PhdrIdx) + " are not sorted by their file offset");
if (YamlPhdr.Offset) {
+ if (!Fragments.empty() && *YamlPhdr.Offset > Fragments.front().Offset)
+ reportError("'Offset' for segment with index " + Twine(PhdrIdx) +
+ " must be less than or equal to the minimum file offset of "
+ "all included sections (0x" +
+ Twine::utohexstr(Fragments.front().Offset) + ")");
PHeader.p_offset = *YamlPhdr.Offset;
- } else {
- if (YamlPhdr.Sections.size())
- PHeader.p_offset = UINT32_MAX;
- else
- PHeader.p_offset = 0;
-
- // Find the minimum offset for the program header.
- for (const Fragment &F : Fragments)
- PHeader.p_offset = std::min((uint64_t)PHeader.p_offset, F.Offset);
+ } else if (!Fragments.empty()) {
+ PHeader.p_offset = Fragments.front().Offset;
}
- // Find the maximum offset of the end of a section in order to set p_filesz
- // and p_memsz. When setting p_filesz, trailing SHT_NOBITS sections are not
- // counted.
- uint64_t FileOffset = PHeader.p_offset, MemOffset = PHeader.p_offset;
- for (const Fragment &F : Fragments) {
- uint64_t End = F.Offset + F.Size;
- MemOffset = std::max(MemOffset, End);
-
- if (F.Type != llvm::ELF::SHT_NOBITS)
- FileOffset = std::max(FileOffset, End);
+ // Set the file size if not set explicitly.
+ if (YamlPhdr.FileSize) {
+ PHeader.p_filesz = *YamlPhdr.FileSize;
+ } else if (!Fragments.empty()) {
+ uint64_t FileSize = Fragments.back().Offset - PHeader.p_offset;
+      // SHT_NOBITS sections occupy no physical space in a file, so we should
+      // not take their sizes into account when calculating the file size of a
+      // segment.
+ if (Fragments.back().Type != llvm::ELF::SHT_NOBITS)
+ FileSize += Fragments.back().Size;
+ PHeader.p_filesz = FileSize;
}
- // Set the file size and the memory size if not set explicitly.
- PHeader.p_filesz = YamlPhdr.FileSize ? uint64_t(*YamlPhdr.FileSize)
- : FileOffset - PHeader.p_offset;
+ // Find the maximum offset of the end of a section in order to set p_memsz.
+ uint64_t MemOffset = PHeader.p_offset;
+ for (const Fragment &F : Fragments)
+ MemOffset = std::max(MemOffset, F.Offset + F.Size);
+ // Set the memory size if not set explicitly.
PHeader.p_memsz = YamlPhdr.MemSize ? uint64_t(*YamlPhdr.MemSize)
: MemOffset - PHeader.p_offset;
@@ -766,13 +1120,40 @@ void ELFState<ELFT>::setProgramHeaderLayout(std::vector<Elf_Phdr> &PHeaders,
}
}
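 
  A minimal sketch of the derived program header layout, using invented section
  names: when Offset, FileSize and MemSize are omitted, p_offset, p_filesz and
  p_memsz are computed from the listed chunks, which must appear in ascending
  file-offset order.
 
  --- !ELF
  FileHeader:
    Class:   ELFCLASS64
    Data:    ELFDATA2LSB
    Type:    ET_EXEC
    Machine: EM_X86_64
  ProgramHeaders:
    - Type:  PT_LOAD
      Flags: [ PF_R, PF_X ]
      # p_offset/p_filesz/p_memsz are derived from .text and .data unless
      # the optional Offset/FileSize/MemSize keys override them.
      Sections:
        - Section: .text
        - Section: .data
  Sections:
    - Name:    .text
      Type:    SHT_PROGBITS
      Flags:   [ SHF_ALLOC, SHF_EXECINSTR ]
      Content: "c3"
    - Name:    .data
      Type:    SHT_PROGBITS
      Flags:   [ SHF_ALLOC, SHF_WRITE ]
      Content: "00"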
+static bool shouldAllocateFileSpace(ArrayRef<ELFYAML::ProgramHeader> Phdrs,
+ const ELFYAML::NoBitsSection &S) {
+ for (const ELFYAML::ProgramHeader &PH : Phdrs) {
+ auto It = llvm::find_if(
+ PH.Chunks, [&](ELFYAML::Chunk *C) { return C->Name == S.Name; });
+ if (std::any_of(It, PH.Chunks.end(), [](ELFYAML::Chunk *C) {
+ return (isa<ELFYAML::Fill>(C) ||
+ cast<ELFYAML::Section>(C)->Type != ELF::SHT_NOBITS);
+ }))
+ return true;
+ }
+ return false;
+}
+
+template <class ELFT>
+void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
+ const ELFYAML::NoBitsSection &S,
+ ContiguousBlobAccumulator &CBA) {
+ // SHT_NOBITS sections do not have any content to write.
+ SHeader.sh_entsize = 0;
+ SHeader.sh_size = S.Size;
+
+  // When a SHT_NOBITS section is followed by a non-nobits section or a fill
+  // in the same segment, we allocate file space for it. This matches the
+  // behavior of linkers.
+ if (shouldAllocateFileSpace(Doc.ProgramHeaders, S))
+ CBA.writeZeros(S.Size);
+}
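+
+ To make the shouldAllocateFileSpace() behavior concrete, a rough sketch with
+ invented names: because .bss is followed by a non-nobits section in the same
+ PT_LOAD, yaml2obj zero-fills its file space so that the offsets of later
+ chunks match what a linker would produce.
+
+ --- !ELF
+ FileHeader:
+   Class:   ELFCLASS64
+   Data:    ELFDATA2LSB
+   Type:    ET_EXEC
+   Machine: EM_X86_64
+ ProgramHeaders:
+   - Type:  PT_LOAD
+     Flags: [ PF_R, PF_W ]
+     Sections:
+       - Section: .bss
+       - Section: .data
+ Sections:
+   - Name:  .bss
+     Type:  SHT_NOBITS
+     Flags: [ SHF_ALLOC, SHF_WRITE ]
+     Size:  0x10
+   - Name:    .data
+     Type:    SHT_PROGBITS
+     Flags:   [ SHF_ALLOC, SHF_WRITE ]
+     Content: "00"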
+
template <class ELFT>
void ELFState<ELFT>::writeSectionContent(
Elf_Shdr &SHeader, const ELFYAML::RawContentSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
- SHeader.sh_size = writeContent(OS, Section.Content, Section.Size);
+ SHeader.sh_size = writeContent(CBA, Section.Content, Section.Size);
if (Section.EntSize)
SHeader.sh_entsize = *Section.EntSize;
@@ -796,18 +1177,22 @@ void ELFState<ELFT>::writeSectionContent(
"Section type is not SHT_REL nor SHT_RELA");
bool IsRela = Section.Type == llvm::ELF::SHT_RELA;
- SHeader.sh_entsize = IsRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel);
- SHeader.sh_size = SHeader.sh_entsize * Section.Relocations.size();
+ if (Section.EntSize)
+ SHeader.sh_entsize = *Section.EntSize;
+ else
+ SHeader.sh_entsize = IsRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel);
+ SHeader.sh_size = (IsRela ? sizeof(Elf_Rela) : sizeof(Elf_Rel)) *
+ Section.Relocations.size();
// For relocation section set link to .symtab by default.
unsigned Link = 0;
- if (Section.Link.empty() && SN2I.lookup(".symtab", Link))
+ if (Section.Link.empty() && !ExcludedSectionHeaders.count(".symtab") &&
+ SN2I.lookup(".symtab", Link))
SHeader.sh_link = Link;
if (!Section.RelocatableSec.empty())
SHeader.sh_info = toSectionIndex(Section.RelocatableSec, Section.Name);
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
for (const auto &Rel : Section.Relocations) {
unsigned SymIdx = Rel.Symbol ? toSymbolIndex(*Rel.Symbol, Section.Name,
Section.Link == ".dynsym")
@@ -818,13 +1203,13 @@ void ELFState<ELFT>::writeSectionContent(
REntry.r_offset = Rel.Offset;
REntry.r_addend = Rel.Addend;
REntry.setSymbolAndType(SymIdx, Rel.Type, isMips64EL(Doc));
- OS.write((const char *)&REntry, sizeof(REntry));
+ CBA.write((const char *)&REntry, sizeof(REntry));
} else {
Elf_Rel REntry;
zero(REntry);
REntry.r_offset = Rel.Offset;
REntry.setSymbolAndType(SymIdx, Rel.Type, isMips64EL(Doc));
- OS.write((const char *)&REntry, sizeof(REntry));
+ CBA.write((const char *)&REntry, sizeof(REntry));
}
}
}
@@ -833,13 +1218,11 @@ template <class ELFT>
void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::RelrSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
SHeader.sh_entsize =
Section.EntSize ? uint64_t(*Section.EntSize) : sizeof(Elf_Relr);
if (Section.Content) {
- SHeader.sh_size = writeContent(OS, Section.Content, None);
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
return;
}
@@ -850,7 +1233,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
if (!ELFT::Is64Bits && E > UINT32_MAX)
reportError(Section.Name + ": the value is too large for 32-bits: 0x" +
Twine::utohexstr(E));
- support::endian::write<uintX_t>(OS, E, ELFT::TargetEndianness);
+ CBA.write<uintX_t>(E, ELFT::TargetEndianness);
}
SHeader.sh_size = sizeof(uintX_t) * Section.Entries->size();
@@ -860,11 +1243,8 @@ template <class ELFT>
void ELFState<ELFT>::writeSectionContent(
Elf_Shdr &SHeader, const ELFYAML::SymtabShndxSection &Shndx,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
for (uint32_t E : Shndx.Entries)
- support::endian::write<uint32_t>(OS, E, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(E, ELFT::TargetEndianness);
SHeader.sh_entsize = Shndx.EntSize ? (uint64_t)*Shndx.EntSize : 4;
SHeader.sh_size = Shndx.Entries.size() * SHeader.sh_entsize;
@@ -878,7 +1258,8 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
"Section type is not SHT_GROUP");
unsigned Link = 0;
- if (Section.Link.empty() && SN2I.lookup(".symtab", Link))
+ if (Section.Link.empty() && !ExcludedSectionHeaders.count(".symtab") &&
+ SN2I.lookup(".symtab", Link))
SHeader.sh_link = Link;
SHeader.sh_entsize = 4;
@@ -888,16 +1269,13 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
SHeader.sh_info =
toSymbolIndex(*Section.Signature, Section.Name, /*IsDynamic=*/false);
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
for (const ELFYAML::SectionOrType &Member : Section.Members) {
unsigned int SectionIndex = 0;
if (Member.sectionNameOrType == "GRP_COMDAT")
SectionIndex = llvm::ELF::GRP_COMDAT;
else
SectionIndex = toSectionIndex(Member.sectionNameOrType, Section.Name);
- support::endian::write<uint32_t>(OS, SectionIndex, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(SectionIndex, ELFT::TargetEndianness);
}
}
@@ -905,10 +1283,8 @@ template <class ELFT>
void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::SymverSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
for (uint16_t Version : Section.Entries)
- support::endian::write<uint16_t>(OS, Version, ELFT::TargetEndianness);
+ CBA.write<uint16_t>(Version, ELFT::TargetEndianness);
SHeader.sh_entsize = Section.EntSize ? (uint64_t)*Section.EntSize : 2;
SHeader.sh_size = Section.Entries.size() * SHeader.sh_entsize;
@@ -918,17 +1294,14 @@ template <class ELFT>
void ELFState<ELFT>::writeSectionContent(
Elf_Shdr &SHeader, const ELFYAML::StackSizesSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
if (Section.Content || Section.Size) {
- SHeader.sh_size = writeContent(OS, Section.Content, Section.Size);
+ SHeader.sh_size = writeContent(CBA, Section.Content, Section.Size);
return;
}
for (const ELFYAML::StackSizeEntry &E : *Section.Entries) {
- support::endian::write<uintX_t>(OS, E.Address, ELFT::TargetEndianness);
- SHeader.sh_size += sizeof(uintX_t) + encodeULEB128(E.Size, OS);
+ CBA.write<uintX_t>(E.Address, ELFT::TargetEndianness);
+ SHeader.sh_size += sizeof(uintX_t) + CBA.writeULEB128(E.Size);
}
}
@@ -936,11 +1309,8 @@ template <class ELFT>
void ELFState<ELFT>::writeSectionContent(
Elf_Shdr &SHeader, const ELFYAML::LinkerOptionsSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
if (Section.Content) {
- SHeader.sh_size = writeContent(OS, Section.Content, None);
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
return;
}
@@ -948,10 +1318,10 @@ void ELFState<ELFT>::writeSectionContent(
return;
for (const ELFYAML::LinkerOption &LO : *Section.Options) {
- OS.write(LO.Key.data(), LO.Key.size());
- OS.write('\0');
- OS.write(LO.Value.data(), LO.Value.size());
- OS.write('\0');
+ CBA.write(LO.Key.data(), LO.Key.size());
+ CBA.write('\0');
+ CBA.write(LO.Value.data(), LO.Value.size());
+ CBA.write('\0');
SHeader.sh_size += (LO.Key.size() + LO.Value.size() + 2);
}
}
@@ -960,11 +1330,8 @@ template <class ELFT>
void ELFState<ELFT>::writeSectionContent(
Elf_Shdr &SHeader, const ELFYAML::DependentLibrariesSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
if (Section.Content) {
- SHeader.sh_size = writeContent(OS, Section.Content, None);
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
return;
}
@@ -972,36 +1339,94 @@ void ELFState<ELFT>::writeSectionContent(
return;
for (StringRef Lib : *Section.Libs) {
- OS.write(Lib.data(), Lib.size());
- OS.write('\0');
+ CBA.write(Lib.data(), Lib.size());
+ CBA.write('\0');
SHeader.sh_size += Lib.size() + 1;
}
}
template <class ELFT>
+uint64_t
+ELFState<ELFT>::alignToOffset(ContiguousBlobAccumulator &CBA, uint64_t Align,
+ llvm::Optional<llvm::yaml::Hex64> Offset) {
+ uint64_t CurrentOffset = CBA.getOffset();
+ uint64_t AlignedOffset;
+
+ if (Offset) {
+ if ((uint64_t)*Offset < CurrentOffset) {
+ reportError("the 'Offset' value (0x" +
+ Twine::utohexstr((uint64_t)*Offset) + ") goes backward");
+ return CurrentOffset;
+ }
+
+    // We ignore the alignment when an explicit offset has been requested.
+ AlignedOffset = *Offset;
+ } else {
+ AlignedOffset = alignTo(CurrentOffset, std::max(Align, (uint64_t)1));
+ }
+
+ CBA.writeZeros(AlignedOffset - CurrentOffset);
+ return AlignedOffset;
+}
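+
+ An illustrative use of the per-section Offset key handled by alignToOffset(),
+ with made-up sections: zero padding is written up to the requested offset,
+ the alignment is ignored when an explicit offset is given, and an offset that
+ goes backward is rejected.
+
+ --- !ELF
+ FileHeader:
+   Class:   ELFCLASS64
+   Data:    ELFDATA2LSB
+   Type:    ET_REL
+   Machine: EM_X86_64
+ Sections:
+   - Name:    .foo
+     Type:    SHT_PROGBITS
+     Content: "AABB"
+   # Padding is emitted up to 0x200; requesting an offset smaller than the
+   # current one would trigger the "goes backward" error.
+   - Name:    .bar
+     Type:    SHT_PROGBITS
+     Offset:  0x200
+     Content: "CC"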
+
+template <class ELFT>
+void ELFState<ELFT>::writeSectionContent(
+ Elf_Shdr &SHeader, const ELFYAML::CallGraphProfileSection &Section,
+ ContiguousBlobAccumulator &CBA) {
+ if (Section.EntSize)
+ SHeader.sh_entsize = *Section.EntSize;
+ else
+ SHeader.sh_entsize = 16;
+
+ unsigned Link = 0;
+ if (Section.Link.empty() && !ExcludedSectionHeaders.count(".symtab") &&
+ SN2I.lookup(".symtab", Link))
+ SHeader.sh_link = Link;
+
+ if (Section.Content) {
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
+ return;
+ }
+
+ if (!Section.Entries)
+ return;
+
+ for (const ELFYAML::CallGraphEntry &E : *Section.Entries) {
+ unsigned From = toSymbolIndex(E.From, Section.Name, /*IsDynamic=*/false);
+ unsigned To = toSymbolIndex(E.To, Section.Name, /*IsDynamic=*/false);
+
+ CBA.write<uint32_t>(From, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(To, ELFT::TargetEndianness);
+ CBA.write<uint64_t>(E.Weight, ELFT::TargetEndianness);
+ SHeader.sh_size += 16;
+ }
+}
+
+template <class ELFT>
void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::HashSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
unsigned Link = 0;
- if (Section.Link.empty() && SN2I.lookup(".dynsym", Link))
+ if (Section.Link.empty() && !ExcludedSectionHeaders.count(".dynsym") &&
+ SN2I.lookup(".dynsym", Link))
SHeader.sh_link = Link;
if (Section.Content || Section.Size) {
- SHeader.sh_size = writeContent(OS, Section.Content, Section.Size);
+ SHeader.sh_size = writeContent(CBA, Section.Content, Section.Size);
return;
}
- support::endian::write<uint32_t>(OS, Section.Bucket->size(),
- ELFT::TargetEndianness);
- support::endian::write<uint32_t>(OS, Section.Chain->size(),
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(
+ Section.NBucket.getValueOr(llvm::yaml::Hex64(Section.Bucket->size())),
+ ELFT::TargetEndianness);
+ CBA.write<uint32_t>(
+ Section.NChain.getValueOr(llvm::yaml::Hex64(Section.Chain->size())),
+ ELFT::TargetEndianness);
+
for (uint32_t Val : *Section.Bucket)
- support::endian::write<uint32_t>(OS, Val, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Val, ELFT::TargetEndianness);
for (uint32_t Val : *Section.Chain)
- support::endian::write<uint32_t>(OS, Val, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Val, ELFT::TargetEndianness);
SHeader.sh_size = (2 + Section.Bucket->size() + Section.Chain->size()) * 4;
}
@@ -1012,13 +1437,11 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
ContiguousBlobAccumulator &CBA) {
typedef typename ELFT::Verdef Elf_Verdef;
typedef typename ELFT::Verdaux Elf_Verdaux;
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
SHeader.sh_info = Section.Info;
if (Section.Content) {
- SHeader.sh_size = writeContent(OS, Section.Content, None);
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
return;
}
@@ -1041,7 +1464,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
else
VerDef.vd_next =
sizeof(Elf_Verdef) + E.VerNames.size() * sizeof(Elf_Verdaux);
- OS.write((const char *)&VerDef, sizeof(Elf_Verdef));
+ CBA.write((const char *)&VerDef, sizeof(Elf_Verdef));
for (size_t J = 0; J < E.VerNames.size(); ++J, ++AuxCnt) {
Elf_Verdaux VernAux;
@@ -1050,7 +1473,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
VernAux.vda_next = 0;
else
VernAux.vda_next = sizeof(Elf_Verdaux);
- OS.write((const char *)&VernAux, sizeof(Elf_Verdaux));
+ CBA.write((const char *)&VernAux, sizeof(Elf_Verdaux));
}
}
@@ -1065,11 +1488,10 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
typedef typename ELFT::Verneed Elf_Verneed;
typedef typename ELFT::Vernaux Elf_Vernaux;
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
SHeader.sh_info = Section.Info;
if (Section.Content) {
- SHeader.sh_size = writeContent(OS, Section.Content, None);
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
return;
}
@@ -1090,7 +1512,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
sizeof(Elf_Verneed) + VE.AuxV.size() * sizeof(Elf_Vernaux);
VerNeed.vn_cnt = VE.AuxV.size();
VerNeed.vn_aux = sizeof(Elf_Verneed);
- OS.write((const char *)&VerNeed, sizeof(Elf_Verneed));
+ CBA.write((const char *)&VerNeed, sizeof(Elf_Verneed));
for (size_t J = 0; J < VE.AuxV.size(); ++J, ++AuxCnt) {
const ELFYAML::VernauxEntry &VAuxE = VE.AuxV[J];
@@ -1104,7 +1526,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
VernAux.vna_next = 0;
else
VernAux.vna_next = sizeof(Elf_Vernaux);
- OS.write((const char *)&VernAux, sizeof(Elf_Vernaux));
+ CBA.write((const char *)&VernAux, sizeof(Elf_Vernaux));
}
}
@@ -1124,7 +1546,6 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
SHeader.sh_entsize = sizeof(Flags);
SHeader.sh_size = SHeader.sh_entsize;
- auto &OS = CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
Flags.version = Section.Version;
Flags.isa_level = Section.ISALevel;
Flags.isa_rev = Section.ISARevision;
@@ -1136,7 +1557,7 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
Flags.ases = Section.ASEs;
Flags.flags1 = Section.Flags1;
Flags.flags2 = Section.Flags2;
- OS.write((const char *)&Flags, sizeof(Flags));
+ CBA.write((const char *)&Flags, sizeof(Flags));
}
template <class ELFT>
@@ -1160,102 +1581,87 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
else
SHeader.sh_entsize = sizeof(Elf_Dyn);
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
for (const ELFYAML::DynamicEntry &DE : Section.Entries) {
- support::endian::write<uintX_t>(OS, DE.Tag, ELFT::TargetEndianness);
- support::endian::write<uintX_t>(OS, DE.Val, ELFT::TargetEndianness);
+ CBA.write<uintX_t>(DE.Tag, ELFT::TargetEndianness);
+ CBA.write<uintX_t>(DE.Val, ELFT::TargetEndianness);
}
if (Section.Content)
- Section.Content->writeAsBinary(OS);
+ CBA.writeAsBinary(*Section.Content);
}
template <class ELFT>
void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::AddrsigSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
unsigned Link = 0;
- if (Section.Link.empty() && SN2I.lookup(".symtab", Link))
+ if (Section.Link.empty() && !ExcludedSectionHeaders.count(".symtab") &&
+ SN2I.lookup(".symtab", Link))
SHeader.sh_link = Link;
if (Section.Content || Section.Size) {
- SHeader.sh_size = writeContent(OS, Section.Content, Section.Size);
+ SHeader.sh_size = writeContent(CBA, Section.Content, Section.Size);
return;
}
- for (const ELFYAML::AddrsigSymbol &Sym : *Section.Symbols) {
- uint64_t Val =
- Sym.Name ? toSymbolIndex(*Sym.Name, Section.Name, /*IsDynamic=*/false)
- : (uint32_t)*Sym.Index;
- SHeader.sh_size += encodeULEB128(Val, OS);
- }
+ for (StringRef Sym : *Section.Symbols)
+ SHeader.sh_size +=
+ CBA.writeULEB128(toSymbolIndex(Sym, Section.Name, /*IsDynamic=*/false));
}
template <class ELFT>
void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::NoteSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
- uint64_t Offset = OS.tell();
-
+ uint64_t Offset = CBA.tell();
if (Section.Content || Section.Size) {
- SHeader.sh_size = writeContent(OS, Section.Content, Section.Size);
+ SHeader.sh_size = writeContent(CBA, Section.Content, Section.Size);
return;
}
for (const ELFYAML::NoteEntry &NE : *Section.Notes) {
// Write name size.
if (NE.Name.empty())
- support::endian::write<uint32_t>(OS, 0, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(0, ELFT::TargetEndianness);
else
- support::endian::write<uint32_t>(OS, NE.Name.size() + 1,
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(NE.Name.size() + 1, ELFT::TargetEndianness);
// Write description size.
if (NE.Desc.binary_size() == 0)
- support::endian::write<uint32_t>(OS, 0, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(0, ELFT::TargetEndianness);
else
- support::endian::write<uint32_t>(OS, NE.Desc.binary_size(),
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(NE.Desc.binary_size(), ELFT::TargetEndianness);
// Write type.
- support::endian::write<uint32_t>(OS, NE.Type, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(NE.Type, ELFT::TargetEndianness);
// Write name, null terminator and padding.
if (!NE.Name.empty()) {
- support::endian::write<uint8_t>(OS, arrayRefFromStringRef(NE.Name),
- ELFT::TargetEndianness);
- support::endian::write<uint8_t>(OS, 0, ELFT::TargetEndianness);
+ CBA.write(NE.Name.data(), NE.Name.size());
+ CBA.write('\0');
CBA.padToAlignment(4);
}
// Write description and padding.
if (NE.Desc.binary_size() != 0) {
- NE.Desc.writeAsBinary(OS);
+ CBA.writeAsBinary(NE.Desc);
CBA.padToAlignment(4);
}
}
- SHeader.sh_size = OS.tell() - Offset;
+ SHeader.sh_size = CBA.tell() - Offset;
}
template <class ELFT>
void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
const ELFYAML::GnuHashSection &Section,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS =
- CBA.getOSAndAlignedOffset(SHeader.sh_offset, SHeader.sh_addralign);
-
unsigned Link = 0;
- if (Section.Link.empty() && SN2I.lookup(".dynsym", Link))
+ if (Section.Link.empty() && !ExcludedSectionHeaders.count(".dynsym") &&
+ SN2I.lookup(".dynsym", Link))
SHeader.sh_link = Link;
if (Section.Content) {
- SHeader.sh_size = writeContent(OS, Section.Content, None);
+ SHeader.sh_size = writeContent(CBA, Section.Content, None);
return;
}
@@ -1264,42 +1670,35 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
// be used to override this field, which is useful for producing broken
// objects.
if (Section.Header->NBuckets)
- support::endian::write<uint32_t>(OS, *Section.Header->NBuckets,
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(*Section.Header->NBuckets, ELFT::TargetEndianness);
else
- support::endian::write<uint32_t>(OS, Section.HashBuckets->size(),
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Section.HashBuckets->size(), ELFT::TargetEndianness);
// Write the index of the first symbol in the dynamic symbol table accessible
// via the hash table.
- support::endian::write<uint32_t>(OS, Section.Header->SymNdx,
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Section.Header->SymNdx, ELFT::TargetEndianness);
// Write the number of words in the Bloom filter. As above, the "MaskWords"
// property can be used to set this field to any value.
if (Section.Header->MaskWords)
- support::endian::write<uint32_t>(OS, *Section.Header->MaskWords,
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(*Section.Header->MaskWords, ELFT::TargetEndianness);
else
- support::endian::write<uint32_t>(OS, Section.BloomFilter->size(),
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Section.BloomFilter->size(), ELFT::TargetEndianness);
// Write the shift constant used by the Bloom filter.
- support::endian::write<uint32_t>(OS, Section.Header->Shift2,
- ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Section.Header->Shift2, ELFT::TargetEndianness);
// We've finished writing the header. Now write the Bloom filter.
for (llvm::yaml::Hex64 Val : *Section.BloomFilter)
- support::endian::write<typename ELFT::uint>(OS, Val,
- ELFT::TargetEndianness);
+ CBA.write<uintX_t>(Val, ELFT::TargetEndianness);
// Write an array of hash buckets.
for (llvm::yaml::Hex32 Val : *Section.HashBuckets)
- support::endian::write<uint32_t>(OS, Val, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Val, ELFT::TargetEndianness);
// Write an array of hash values.
for (llvm::yaml::Hex32 Val : *Section.HashValues)
- support::endian::write<uint32_t>(OS, Val, ELFT::TargetEndianness);
+ CBA.write<uint32_t>(Val, ELFT::TargetEndianness);
SHeader.sh_size = 16 /*Header size*/ +
Section.BloomFilter->size() * sizeof(typename ELFT::uint) +
@@ -1310,42 +1709,91 @@ void ELFState<ELFT>::writeSectionContent(Elf_Shdr &SHeader,
template <class ELFT>
void ELFState<ELFT>::writeFill(ELFYAML::Fill &Fill,
ContiguousBlobAccumulator &CBA) {
- raw_ostream &OS = CBA.getOSAndAlignedOffset(Fill.ShOffset, /*Align=*/1);
-
size_t PatternSize = Fill.Pattern ? Fill.Pattern->binary_size() : 0;
if (!PatternSize) {
- OS.write_zeros(Fill.Size);
+ CBA.writeZeros(Fill.Size);
return;
}
// Fill the content with the specified pattern.
uint64_t Written = 0;
for (; Written + PatternSize <= Fill.Size; Written += PatternSize)
- Fill.Pattern->writeAsBinary(OS);
- Fill.Pattern->writeAsBinary(OS, Fill.Size - Written);
+ CBA.writeAsBinary(*Fill.Pattern);
+ CBA.writeAsBinary(*Fill.Pattern, Fill.Size - Written);
}
-template <class ELFT> void ELFState<ELFT>::buildSectionIndex() {
- size_t SecNdx = -1;
+template <class ELFT>
+DenseMap<StringRef, size_t> ELFState<ELFT>::buildSectionHeaderReorderMap() {
+ if (!Doc.SectionHeaders || Doc.SectionHeaders->NoHeaders)
+ return DenseMap<StringRef, size_t>();
+
+ DenseMap<StringRef, size_t> Ret;
+ size_t SecNdx = 0;
StringSet<> Seen;
- for (size_t I = 0; I < Doc.Chunks.size(); ++I) {
- const std::unique_ptr<ELFYAML::Chunk> &C = Doc.Chunks[I];
- bool IsSection = isa<ELFYAML::Section>(C.get());
- if (IsSection)
- ++SecNdx;
- if (C->Name.empty())
- continue;
+ auto AddSection = [&](const ELFYAML::SectionHeader &Hdr) {
+ if (!Ret.try_emplace(Hdr.Name, ++SecNdx).second)
+ reportError("repeated section name: '" + Hdr.Name +
+ "' in the section header description");
+ Seen.insert(Hdr.Name);
+ };
- if (!Seen.insert(C->Name).second)
- reportError("repeated section/fill name: '" + C->Name +
- "' at YAML section/fill number " + Twine(I));
- if (!IsSection || HasError)
+ if (Doc.SectionHeaders->Sections)
+ for (const ELFYAML::SectionHeader &Hdr : *Doc.SectionHeaders->Sections)
+ AddSection(Hdr);
+
+ if (Doc.SectionHeaders->Excluded)
+ for (const ELFYAML::SectionHeader &Hdr : *Doc.SectionHeaders->Excluded)
+ AddSection(Hdr);
+
+ for (const ELFYAML::Section *S : Doc.getSections()) {
+ // Ignore special first SHT_NULL section.
+ if (S == Doc.getSections().front())
continue;
+ if (!Seen.count(S->Name))
+ reportError("section '" + S->Name +
+ "' should be present in the 'Sections' or 'Excluded' lists");
+ Seen.erase(S->Name);
+ }
- if (!SN2I.addName(C->Name, SecNdx))
+ for (const auto &It : Seen)
+ reportError("section header contains undefined section '" + It.getKey() +
+ "'");
+ return Ret;
+}
+
+template <class ELFT> void ELFState<ELFT>::buildSectionIndex() {
+  // A YAML description can have an explicit section header declaration that
+  // allows changing the order of section headers.
+ DenseMap<StringRef, size_t> ReorderMap = buildSectionHeaderReorderMap();
+
+ if (HasError)
+ return;
+
+ // Build excluded section headers map.
+ std::vector<ELFYAML::Section *> Sections = Doc.getSections();
+ if (Doc.SectionHeaders) {
+ if (Doc.SectionHeaders->Excluded)
+ for (const ELFYAML::SectionHeader &Hdr : *Doc.SectionHeaders->Excluded)
+ if (!ExcludedSectionHeaders.insert(Hdr.Name).second)
+ llvm_unreachable("buildSectionIndex() failed");
+
+ if (Doc.SectionHeaders->NoHeaders.getValueOr(false))
+ for (const ELFYAML::Section *S : Sections)
+ if (!ExcludedSectionHeaders.insert(S->Name).second)
+ llvm_unreachable("buildSectionIndex() failed");
+ }
+
+ size_t SecNdx = -1;
+ for (const ELFYAML::Section *S : Sections) {
+ ++SecNdx;
+
+ size_t Index = ReorderMap.empty() ? SecNdx : ReorderMap.lookup(S->Name);
+ if (!SN2I.addName(S->Name, Index))
llvm_unreachable("buildSectionIndex() failed");
- DotShStrtab.add(ELFYAML::dropUniqueSuffix(C->Name));
+
+ if (!ExcludedSectionHeaders.count(S->Name))
+ DotShStrtab.add(ELFYAML::dropUniqueSuffix(S->Name));
}
DotShStrtab.finalize();
@@ -1402,8 +1850,10 @@ template <class ELFT> void ELFState<ELFT>::finalizeStrings() {
template <class ELFT>
bool ELFState<ELFT>::writeELF(raw_ostream &OS, ELFYAML::Object &Doc,
- yaml::ErrorHandler EH) {
+ yaml::ErrorHandler EH, uint64_t MaxSize) {
ELFState<ELFT> State(Doc, EH);
+ if (State.HasError)
+ return false;
// Finalize .strtab and .dynstr sections. We do that early because want to
// finalize the string table builders before writing the content of the
@@ -1411,11 +1861,11 @@ bool ELFState<ELFT>::writeELF(raw_ostream &OS, ELFYAML::Object &Doc,
State.finalizeStrings();
State.buildSectionIndex();
+ State.buildSymbolIndexes();
+
if (State.HasError)
return false;
- State.buildSymbolIndexes();
-
std::vector<Elf_Phdr> PHeaders;
State.initProgramHeaders(PHeaders);
@@ -1423,7 +1873,11 @@ bool ELFState<ELFT>::writeELF(raw_ostream &OS, ELFYAML::Object &Doc,
// things to `OS`.
const size_t SectionContentBeginOffset =
sizeof(Elf_Ehdr) + sizeof(Elf_Phdr) * Doc.ProgramHeaders.size();
- ContiguousBlobAccumulator CBA(SectionContentBeginOffset);
+  // It is quite easy to accidentally create an output with yaml2obj that is
+  // larger than intended, for example, due to an issue in the YAML
+  // description. We limit the maximum allowed output size, but also provide a
+  // command line option to change this limit.
+ ContiguousBlobAccumulator CBA(SectionContentBeginOffset, MaxSize);
std::vector<Elf_Shdr> SHeaders;
State.initSectionHeaders(SHeaders, CBA);
@@ -1431,10 +1885,26 @@ bool ELFState<ELFT>::writeELF(raw_ostream &OS, ELFYAML::Object &Doc,
// Now we can decide segment offsets.
State.setProgramHeaderLayout(PHeaders, SHeaders);
+ // Align the start of the section header table, which is written after all
+ // section data.
+ uint64_t SHOff =
+ State.alignToOffset(CBA, sizeof(typename ELFT::uint), /*Offset=*/None);
+ bool ReachedLimit = SHOff + arrayDataSize(makeArrayRef(SHeaders)) > MaxSize;
+ if (Error E = CBA.takeLimitError()) {
+ // We report a custom error message instead below.
+ consumeError(std::move(E));
+ ReachedLimit = true;
+ }
+
+ if (ReachedLimit)
+ State.reportError(
+ "the desired output size is greater than permitted. Use the "
+ "--max-size option to change the limit");
+
if (State.HasError)
return false;
- State.writeELFHeader(CBA, OS);
+ State.writeELFHeader(OS, SHOff);
writeArrayData(OS, makeArrayRef(PHeaders));
CBA.writeBlobToStream(OS);
writeArrayData(OS, makeArrayRef(SHeaders));
@@ -1444,17 +1914,18 @@ bool ELFState<ELFT>::writeELF(raw_ostream &OS, ELFYAML::Object &Doc,
namespace llvm {
namespace yaml {
-bool yaml2elf(llvm::ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH) {
+bool yaml2elf(llvm::ELFYAML::Object &Doc, raw_ostream &Out, ErrorHandler EH,
+ uint64_t MaxSize) {
bool IsLE = Doc.Header.Data == ELFYAML::ELF_ELFDATA(ELF::ELFDATA2LSB);
bool Is64Bit = Doc.Header.Class == ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64);
if (Is64Bit) {
if (IsLE)
- return ELFState<object::ELF64LE>::writeELF(Out, Doc, EH);
- return ELFState<object::ELF64BE>::writeELF(Out, Doc, EH);
+ return ELFState<object::ELF64LE>::writeELF(Out, Doc, EH, MaxSize);
+ return ELFState<object::ELF64BE>::writeELF(Out, Doc, EH, MaxSize);
}
if (IsLE)
- return ELFState<object::ELF32LE>::writeELF(Out, Doc, EH);
- return ELFState<object::ELF32BE>::writeELF(Out, Doc, EH);
+ return ELFState<object::ELF32LE>::writeELF(Out, Doc, EH, MaxSize);
+ return ELFState<object::ELF32BE>::writeELF(Out, Doc, EH, MaxSize);
}
} // namespace yaml
diff --git a/llvm/lib/ObjectYAML/ELFYAML.cpp b/llvm/lib/ObjectYAML/ELFYAML.cpp
index efa7ecb4728b..2353b34f188b 100644
--- a/llvm/lib/ObjectYAML/ELFYAML.cpp
+++ b/llvm/lib/ObjectYAML/ELFYAML.cpp
@@ -221,6 +221,7 @@ void ScalarEnumerationTraits<ELFYAML::ELF_EM>::enumeration(
ECase(EM_RISCV);
ECase(EM_LANAI);
ECase(EM_BPF);
+ ECase(EM_VE);
#undef ECase
IO.enumFallback<Hex16>(Value);
}
@@ -349,6 +350,9 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCase(EF_HEXAGON_MACH_V60);
BCase(EF_HEXAGON_MACH_V62);
BCase(EF_HEXAGON_MACH_V65);
+ BCase(EF_HEXAGON_MACH_V66);
+ BCase(EF_HEXAGON_MACH_V67);
+ BCase(EF_HEXAGON_MACH_V67T);
BCase(EF_HEXAGON_ISA_V2);
BCase(EF_HEXAGON_ISA_V3);
BCase(EF_HEXAGON_ISA_V4);
@@ -357,6 +361,8 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCase(EF_HEXAGON_ISA_V60);
BCase(EF_HEXAGON_ISA_V62);
BCase(EF_HEXAGON_ISA_V65);
+ BCase(EF_HEXAGON_ISA_V66);
+ BCase(EF_HEXAGON_ISA_V67);
break;
case ELF::EM_AVR:
BCase(EF_AVR_ARCH_AVR1);
@@ -423,6 +429,7 @@ void ScalarBitSetTraits<ELFYAML::ELF_EF>::bitset(IO &IO,
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1010, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1011, EF_AMDGPU_MACH);
BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1012, EF_AMDGPU_MACH);
+ BCaseMask(EF_AMDGPU_MACH_AMDGCN_GFX1030, EF_AMDGPU_MACH);
BCase(EF_AMDGPU_XNACK);
BCase(EF_AMDGPU_SRAM_ECC);
break;
@@ -495,6 +502,9 @@ void ScalarEnumerationTraits<ELFYAML::ELF_SHT>::enumeration(
ECase(SHT_MIPS_DWARF);
ECase(SHT_MIPS_ABIFLAGS);
break;
+ case ELF::EM_RISCV:
+ ECase(SHT_RISCV_ATTRIBUTES);
+ break;
default:
// Nothing to do.
break;
@@ -654,6 +664,9 @@ void ScalarEnumerationTraits<ELFYAML::ELF_REL>::enumeration(
case ELF::EM_BPF:
#include "llvm/BinaryFormat/ELFRelocs/BPF.def"
break;
+ case ELF::EM_VE:
+#include "llvm/BinaryFormat/ELFRelocs/VE.def"
+ break;
case ELF::EM_PPC64:
#include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
break;
@@ -820,6 +833,28 @@ void ScalarBitSetTraits<ELFYAML::MIPS_AFL_FLAGS1>::bitset(
#undef BCase
}
+void MappingTraits<ELFYAML::SectionHeader>::mapping(
+ IO &IO, ELFYAML::SectionHeader &SHdr) {
+ IO.mapRequired("Name", SHdr.Name);
+}
+
+void MappingTraits<ELFYAML::SectionHeaderTable>::mapping(
+ IO &IO, ELFYAML::SectionHeaderTable &SectionHeader) {
+ IO.mapOptional("Sections", SectionHeader.Sections);
+ IO.mapOptional("Excluded", SectionHeader.Excluded);
+ IO.mapOptional("NoHeaders", SectionHeader.NoHeaders);
+}
+
+StringRef MappingTraits<ELFYAML::SectionHeaderTable>::validate(
+ IO &IO, ELFYAML::SectionHeaderTable &SecHdrTable) {
+ if (SecHdrTable.NoHeaders && (SecHdrTable.Sections || SecHdrTable.Excluded))
+ return "NoHeaders can't be used together with Sections/Excluded";
+ if (!SecHdrTable.NoHeaders && !SecHdrTable.Sections && !SecHdrTable.Excluded)
+ return "SectionHeaderTable can't be empty. Use 'NoHeaders' key to drop the "
+ "section header table";
+ return StringRef();
+}
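+
+ A hedged sketch of the section header table description these traits parse;
+ the top-level SectionHeaderTable key name is an assumption, since only the
+ mapping itself is visible in this hunk. Every non-null section, including the
+ implicit ones, has to appear in one of the two lists; NoHeaders: true would
+ instead drop the table entirely.
+
+ --- !ELF
+ FileHeader:
+   Class:   ELFCLASS64
+   Data:    ELFDATA2LSB
+   Type:    ET_REL
+   Machine: EM_X86_64
+ Sections:
+   - Name: .foo
+     Type: SHT_PROGBITS
+   - Name: .bar
+     Type: SHT_PROGBITS
+ SectionHeaderTable:
+   # .bar is emitted before .foo in the section header table; .strtab is
+   # excluded, so e_shnum only counts the listed headers plus the null one.
+   Sections:
+     - Name: .bar
+     - Name: .foo
+     - Name: .shstrtab
+   Excluded:
+     - Name: .strtab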
+
void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO,
ELFYAML::FileHeader &FileHdr) {
IO.mapRequired("Class", FileHdr.Class);
@@ -831,10 +866,16 @@ void MappingTraits<ELFYAML::FileHeader>::mapping(IO &IO,
IO.mapOptional("Flags", FileHdr.Flags, ELFYAML::ELF_EF(0));
IO.mapOptional("Entry", FileHdr.Entry, Hex64(0));
- IO.mapOptional("SHEntSize", FileHdr.SHEntSize);
- IO.mapOptional("SHOff", FileHdr.SHOff);
- IO.mapOptional("SHNum", FileHdr.SHNum);
- IO.mapOptional("SHStrNdx", FileHdr.SHStrNdx);
+ // obj2yaml does not dump these fields.
+ assert(!IO.outputting() ||
+ (!FileHdr.EPhOff && !FileHdr.EPhEntSize && !FileHdr.EPhNum));
+ IO.mapOptional("EPhOff", FileHdr.EPhOff);
+ IO.mapOptional("EPhEntSize", FileHdr.EPhEntSize);
+ IO.mapOptional("EPhNum", FileHdr.EPhNum);
+ IO.mapOptional("EShEntSize", FileHdr.EShEntSize);
+ IO.mapOptional("EShOff", FileHdr.EShOff);
+ IO.mapOptional("EShNum", FileHdr.EShNum);
+ IO.mapOptional("EShStrNdx", FileHdr.EShStrNdx);
}
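 
  A small sketch of the override keys mapped above, useful for crafting broken
  or unusual headers; the values below are arbitrary.
 
  --- !ELF
  FileHeader:
    Class:     ELFCLASS64
    Data:      ELFDATA2LSB
    Type:      ET_REL
    Machine:   EM_X86_64
    # Raw overrides for the corresponding e_* fields of the ELF header.
    EShOff:    0x0
    EShNum:    0x0
    EShStrNdx: 0xFF
  Sections:
    - Name: .foo
      Type: SHT_PROGBITS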
void MappingTraits<ELFYAML::ProgramHeader>::mapping(
@@ -843,7 +884,7 @@ void MappingTraits<ELFYAML::ProgramHeader>::mapping(
IO.mapOptional("Flags", Phdr.Flags, ELFYAML::ELF_PF(0));
IO.mapOptional("Sections", Phdr.Sections);
IO.mapOptional("VAddr", Phdr.VAddr, Hex64(0));
- IO.mapOptional("PAddr", Phdr.PAddr, Hex64(0));
+ IO.mapOptional("PAddr", Phdr.PAddr, Phdr.VAddr);
IO.mapOptional("Align", Phdr.Align);
IO.mapOptional("FileSize", Phdr.FileSize);
IO.mapOptional("MemSize", Phdr.MemSize);
@@ -977,9 +1018,41 @@ struct NormalizedOther {
} // end anonymous namespace
+void ScalarTraits<ELFYAML::YAMLIntUInt>::output(const ELFYAML::YAMLIntUInt &Val,
+ void *Ctx, raw_ostream &Out) {
+ Out << Val;
+}
+
+StringRef ScalarTraits<ELFYAML::YAMLIntUInt>::input(StringRef Scalar, void *Ctx,
+ ELFYAML::YAMLIntUInt &Val) {
+ const bool Is64 = static_cast<ELFYAML::Object *>(Ctx)->Header.Class ==
+ ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64);
+ StringRef ErrMsg = "invalid number";
+ // We do not accept negative hex numbers because their meaning is ambiguous.
+ // For example, would -0xfffffffff mean 1 or INT32_MIN?
+ if (Scalar.empty() || Scalar.startswith("-0x"))
+ return ErrMsg;
+
+ if (Scalar.startswith("-")) {
+ const int64_t MinVal = Is64 ? INT64_MIN : INT32_MIN;
+ long long Int;
+ if (getAsSignedInteger(Scalar, /*Radix=*/0, Int) || (Int < MinVal))
+ return ErrMsg;
+ Val = Int;
+ return "";
+ }
+
+ const uint64_t MaxVal = Is64 ? UINT64_MAX : UINT32_MAX;
+ unsigned long long UInt;
+ if (getAsUnsignedInteger(Scalar, /*Radix=*/0, UInt) || (UInt > MaxVal))
+ return ErrMsg;
+ Val = UInt;
+ return "";
+}
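+
+ To illustrate the accepted forms, a sketch that assumes YAMLIntUInt is the
+ type behind relocation addends (the Addend key is not shown in this hunk):
+ signed decimal and unsigned hex are both accepted, while negative hex such as
+ -0x8 is rejected as ambiguous.
+
+ --- !ELF
+ FileHeader:
+   Class:   ELFCLASS64
+   Data:    ELFDATA2LSB
+   Type:    ET_REL
+   Machine: EM_X86_64
+ Sections:
+   - Name: .rela.text
+     Type: SHT_RELA
+     Relocations:
+       - Offset: 0x0
+         Type:   R_X86_64_64
+         Addend: -8
+       - Offset: 0x8
+         Type:   R_X86_64_64
+         Addend: 0xFFFFFFFFFFFFFFF8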
+
void MappingTraits<ELFYAML::Symbol>::mapping(IO &IO, ELFYAML::Symbol &Symbol) {
IO.mapOptional("Name", Symbol.Name, StringRef());
- IO.mapOptional("NameIndex", Symbol.NameIndex);
+ IO.mapOptional("StName", Symbol.StName);
IO.mapOptional("Type", Symbol.Type, ELFYAML::ELF_STT(0));
IO.mapOptional("Section", Symbol.Section, StringRef());
IO.mapOptional("Index", Symbol.Index);
@@ -1001,8 +1074,6 @@ StringRef MappingTraits<ELFYAML::Symbol>::validate(IO &IO,
ELFYAML::Symbol &Symbol) {
if (Symbol.Index && Symbol.Section.data())
return "Index and Section cannot both be specified for Symbol";
- if (Symbol.NameIndex && !Symbol.Name.empty())
- return "Name and NameIndex cannot both be specified for Symbol";
return StringRef();
}
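 
  A short sketch of the renamed key: unlike the old NameIndex, StName may now
  be combined with Name, and its value is written into st_name verbatim.
 
  --- !ELF
  FileHeader:
    Class:   ELFCLASS64
    Data:    ELFDATA2LSB
    Type:    ET_REL
    Machine: EM_X86_64
  Symbols:
    # st_name is set to 0x1234 regardless of where "foo" lands in .strtab.
    - Name:   foo
      StName: 0x1234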
@@ -1010,10 +1081,11 @@ static void commonSectionMapping(IO &IO, ELFYAML::Section &Section) {
IO.mapOptional("Name", Section.Name, StringRef());
IO.mapRequired("Type", Section.Type);
IO.mapOptional("Flags", Section.Flags);
- IO.mapOptional("Address", Section.Address, Hex64(0));
+ IO.mapOptional("Address", Section.Address);
IO.mapOptional("Link", Section.Link, StringRef());
IO.mapOptional("AddressAlign", Section.AddressAlign, Hex64(0));
IO.mapOptional("EntSize", Section.EntSize);
+ IO.mapOptional("Offset", Section.Offset);
// obj2yaml does not dump these fields. They are expected to be empty when we
// are producing YAML, because yaml2obj sets appropriate values for them
@@ -1036,6 +1108,17 @@ static void sectionMapping(IO &IO, ELFYAML::DynamicSection &Section) {
static void sectionMapping(IO &IO, ELFYAML::RawContentSection &Section) {
commonSectionMapping(IO, Section);
IO.mapOptional("Content", Section.Content);
+
+ // We also support reading the content as an array of bytes using the
+ // ContentArray key. obj2yaml never prints this field.

+ assert(!IO.outputting() || !Section.ContentBuf.hasValue());
+ IO.mapOptional("ContentArray", Section.ContentBuf);
+ if (Section.ContentBuf) {
+ if (Section.Content)
+ IO.setError("Content and ContentArray can't be used together");
+ Section.Content = yaml::BinaryRef(*Section.ContentBuf);
+ }
+
IO.mapOptional("Size", Section.Size);
IO.mapOptional("Info", Section.Info);
}
@@ -1053,6 +1136,13 @@ static void sectionMapping(IO &IO, ELFYAML::HashSection &Section) {
IO.mapOptional("Bucket", Section.Bucket);
IO.mapOptional("Chain", Section.Chain);
IO.mapOptional("Size", Section.Size);
+
+ // obj2yaml does not dump these fields. They can be used to override nchain
+ // and nbucket values for creating broken sections.
+ assert(!IO.outputting() ||
+ (!Section.NBucket.hasValue() && !Section.NChain.hasValue()));
+ IO.mapOptional("NChain", Section.NChain);
+ IO.mapOptional("NBucket", Section.NBucket);
}
static void sectionMapping(IO &IO, ELFYAML::NoteSection &Section) {
@@ -1128,6 +1218,7 @@ static void sectionMapping(IO &IO, ELFYAML::AddrsigSection &Section) {
static void fillMapping(IO &IO, ELFYAML::Fill &Fill) {
IO.mapOptional("Name", Fill.Name, StringRef());
IO.mapOptional("Pattern", Fill.Pattern);
+ IO.mapOptional("Offset", Fill.Offset);
IO.mapRequired("Size", Fill.Size);
}
@@ -1144,6 +1235,12 @@ static void sectionMapping(IO &IO,
IO.mapOptional("Content", Section.Content);
}
+static void sectionMapping(IO &IO, ELFYAML::CallGraphProfileSection &Section) {
+ commonSectionMapping(IO, Section);
+ IO.mapOptional("Entries", Section.Entries);
+ IO.mapOptional("Content", Section.Content);
+}
+
void MappingTraits<ELFYAML::SectionOrType>::mapping(
IO &IO, ELFYAML::SectionOrType &sectionOrType) {
IO.mapRequired("SectionOrType", sectionOrType.sectionNameOrType);
@@ -1277,6 +1374,11 @@ void MappingTraits<std::unique_ptr<ELFYAML::Chunk>>::mapping(
sectionMapping(IO,
*cast<ELFYAML::DependentLibrariesSection>(Section.get()));
break;
+ case ELF::SHT_LLVM_CALL_GRAPH_PROFILE:
+ if (!IO.outputting())
+ Section.reset(new ELFYAML::CallGraphProfileSection());
+ sectionMapping(IO, *cast<ELFYAML::CallGraphProfileSection>(Section.get()));
+ break;
default:
if (!IO.outputting()) {
StringRef Name;
@@ -1367,11 +1469,6 @@ StringRef MappingTraits<std::unique_ptr<ELFYAML::Chunk>>::validate(
if (!Sec->Symbols)
return {};
-
- for (const ELFYAML::AddrsigSymbol &AS : *Sec->Symbols)
- if (AS.Index && AS.Name)
- return "\"Index\" and \"Name\" cannot be used together when defining a "
- "symbol";
return {};
}
@@ -1458,6 +1555,12 @@ StringRef MappingTraits<std::unique_ptr<ELFYAML::Chunk>>::validate(
return {};
}
+ if (const auto *CGP = dyn_cast<ELFYAML::CallGraphProfileSection>(C.get())) {
+ if (CGP->Entries && CGP->Content)
+ return "\"Entries\" and \"Content\" can't be used together";
+ return {};
+ }
+
return {};
}
@@ -1553,7 +1656,7 @@ void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO,
const auto *Object = static_cast<ELFYAML::Object *>(IO.getContext());
assert(Object && "The IO context is not initialized");
- IO.mapRequired("Offset", Rel.Offset);
+ IO.mapOptional("Offset", Rel.Offset, (Hex64)0);
IO.mapOptional("Symbol", Rel.Symbol);
if (Object->Header.Machine == ELFYAML::ELF_EM(ELF::EM_MIPS) &&
@@ -1567,7 +1670,7 @@ void MappingTraits<ELFYAML::Relocation>::mapping(IO &IO,
} else
IO.mapRequired("Type", Rel.Type);
- IO.mapOptional("Addend", Rel.Addend, (int64_t)0);
+ IO.mapOptional("Addend", Rel.Addend, (ELFYAML::YAMLIntUInt)0);
}
void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
@@ -1575,19 +1678,21 @@ void MappingTraits<ELFYAML::Object>::mapping(IO &IO, ELFYAML::Object &Object) {
IO.setContext(&Object);
IO.mapTag("!ELF", true);
IO.mapRequired("FileHeader", Object.Header);
+ IO.mapOptional("SectionHeaderTable", Object.SectionHeaders);
IO.mapOptional("ProgramHeaders", Object.ProgramHeaders);
IO.mapOptional("Sections", Object.Chunks);
IO.mapOptional("Symbols", Object.Symbols);
IO.mapOptional("DynamicSymbols", Object.DynamicSymbols);
+ IO.mapOptional("DWARF", Object.DWARF);
+ if (Object.DWARF) {
+ Object.DWARF->IsLittleEndian =
+ Object.Header.Data == ELFYAML::ELF_ELFDATA(ELF::ELFDATA2LSB);
+ Object.DWARF->Is64BitAddrSize =
+ Object.Header.Class == ELFYAML::ELF_ELFCLASS(ELF::ELFCLASS64);
+ }
IO.setContext(nullptr);
}
-void MappingTraits<ELFYAML::AddrsigSymbol>::mapping(IO &IO, ELFYAML::AddrsigSymbol &Sym) {
- assert(IO.getContext() && "The IO context is not initialized");
- IO.mapOptional("Name", Sym.Name);
- IO.mapOptional("Index", Sym.Index);
-}
-
void MappingTraits<ELFYAML::LinkerOption>::mapping(IO &IO,
ELFYAML::LinkerOption &Opt) {
assert(IO.getContext() && "The IO context is not initialized");
@@ -1595,6 +1700,14 @@ void MappingTraits<ELFYAML::LinkerOption>::mapping(IO &IO,
IO.mapRequired("Value", Opt.Value);
}
+void MappingTraits<ELFYAML::CallGraphEntry>::mapping(
+ IO &IO, ELFYAML::CallGraphEntry &E) {
+ assert(IO.getContext() && "The IO context is not initialized");
+ IO.mapRequired("From", E.From);
+ IO.mapRequired("To", E.To);
+ IO.mapRequired("Weight", E.Weight);
+}
+
LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_AFL_REG)
LLVM_YAML_STRONG_TYPEDEF(uint8_t, MIPS_ABI_FP)
LLVM_YAML_STRONG_TYPEDEF(uint32_t, MIPS_AFL_EXT)
diff --git a/llvm/lib/ObjectYAML/MachOEmitter.cpp b/llvm/lib/ObjectYAML/MachOEmitter.cpp
index bda4aed885b4..680264484704 100644
--- a/llvm/lib/ObjectYAML/MachOEmitter.cpp
+++ b/llvm/lib/ObjectYAML/MachOEmitter.cpp
@@ -15,6 +15,8 @@
#include "llvm/ObjectYAML/DWARFEmitter.h"
#include "llvm/ObjectYAML/ObjectYAML.h"
#include "llvm/ObjectYAML/yaml2obj.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/Error.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"
@@ -33,12 +35,13 @@ public:
memset(reinterpret_cast<void *>(&Header), 0, sizeof(MachO::mach_header_64));
}
- void writeMachO(raw_ostream &OS);
+ Error writeMachO(raw_ostream &OS);
private:
void writeHeader(raw_ostream &OS);
void writeLoadCommands(raw_ostream &OS);
- void writeSectionData(raw_ostream &OS);
+ Error writeSectionData(raw_ostream &OS);
+ void writeRelocations(raw_ostream &OS);
void writeLinkEditData(raw_ostream &OS);
void writeBindOpcodes(raw_ostream &OS,
@@ -58,15 +61,23 @@ private:
MachOYAML::Object &Obj;
bool is64Bit;
uint64_t fileStart;
-
MachO::mach_header_64 Header;
+
+ // Old PPC Object Files didn't have __LINKEDIT segments, the data was just
+ // stuck at the end of the file.
+ bool FoundLinkEditSeg = false;
};
-void MachOWriter::writeMachO(raw_ostream &OS) {
+Error MachOWriter::writeMachO(raw_ostream &OS) {
fileStart = OS.tell();
writeHeader(OS);
writeLoadCommands(OS);
- writeSectionData(OS);
+ if (Error Err = writeSectionData(OS))
+ return Err;
+ writeRelocations(OS);
+ if (!FoundLinkEditSeg)
+ writeLinkEditData(OS);
+ return Error::success();
}
void MachOWriter::writeHeader(raw_ostream &OS) {
@@ -254,8 +265,7 @@ void MachOWriter::writeLoadCommands(raw_ostream &OS) {
}
}
-void MachOWriter::writeSectionData(raw_ostream &OS) {
- bool FoundLinkEditSeg = false;
+Error MachOWriter::writeSectionData(raw_ostream &OS) {
for (auto &LC : Obj.LoadCommands) {
switch (LC.Data.load_command_data.cmd) {
case MachO::LC_SEGMENT:
@@ -271,27 +281,37 @@ void MachOWriter::writeSectionData(raw_ostream &OS) {
ZeroToOffset(OS, Sec.offset);
// Zero Fill any data between the end of the last thing we wrote and the
// start of this section.
- assert((OS.tell() - fileStart <= Sec.offset ||
- Sec.offset == (uint32_t)0) &&
- "Wrote too much data somewhere, section offsets don't line up.");
+ if (OS.tell() - fileStart > Sec.offset && Sec.offset != (uint32_t)0)
+ return createStringError(
+ errc::invalid_argument,
+ "wrote too much data somewhere, section offsets don't line up");
if (0 == strncmp(&Sec.segname[0], "__DWARF", 16)) {
- if (0 == strncmp(&Sec.sectname[0], "__debug_str", 16)) {
- DWARFYAML::EmitDebugStr(OS, Obj.DWARF);
- } else if (0 == strncmp(&Sec.sectname[0], "__debug_abbrev", 16)) {
- DWARFYAML::EmitDebugAbbrev(OS, Obj.DWARF);
- } else if (0 == strncmp(&Sec.sectname[0], "__debug_aranges", 16)) {
- DWARFYAML::EmitDebugAranges(OS, Obj.DWARF);
- } else if (0 == strncmp(&Sec.sectname[0], "__debug_pubnames", 16)) {
- DWARFYAML::EmitPubSection(OS, Obj.DWARF.PubNames,
- Obj.IsLittleEndian);
+ Error Err = Error::success();
+ cantFail(std::move(Err));
+
+ if (0 == strncmp(&Sec.sectname[0], "__debug_str", 16))
+ Err = DWARFYAML::emitDebugStr(OS, Obj.DWARF);
+ else if (0 == strncmp(&Sec.sectname[0], "__debug_abbrev", 16))
+ Err = DWARFYAML::emitDebugAbbrev(OS, Obj.DWARF);
+ else if (0 == strncmp(&Sec.sectname[0], "__debug_aranges", 16))
+ Err = DWARFYAML::emitDebugAranges(OS, Obj.DWARF);
+ else if (0 == strncmp(&Sec.sectname[0], "__debug_ranges", 16))
+ Err = DWARFYAML::emitDebugRanges(OS, Obj.DWARF);
+ else if (0 == strncmp(&Sec.sectname[0], "__debug_pubnames", 16)) {
+ if (Obj.DWARF.PubNames)
+ Err = DWARFYAML::emitPubSection(OS, *Obj.DWARF.PubNames,
+ Obj.IsLittleEndian);
} else if (0 == strncmp(&Sec.sectname[0], "__debug_pubtypes", 16)) {
- DWARFYAML::EmitPubSection(OS, Obj.DWARF.PubTypes,
- Obj.IsLittleEndian);
- } else if (0 == strncmp(&Sec.sectname[0], "__debug_info", 16)) {
- DWARFYAML::EmitDebugInfo(OS, Obj.DWARF);
- } else if (0 == strncmp(&Sec.sectname[0], "__debug_line", 16)) {
- DWARFYAML::EmitDebugLine(OS, Obj.DWARF);
- }
+ if (Obj.DWARF.PubTypes)
+ Err = DWARFYAML::emitPubSection(OS, *Obj.DWARF.PubTypes,
+ Obj.IsLittleEndian);
+ } else if (0 == strncmp(&Sec.sectname[0], "__debug_info", 16))
+ Err = DWARFYAML::emitDebugInfo(OS, Obj.DWARF);
+ else if (0 == strncmp(&Sec.sectname[0], "__debug_line", 16))
+ Err = DWARFYAML::emitDebugLine(OS, Obj.DWARF);
+
+ if (Err)
+ return Err;
continue;
}
@@ -315,10 +335,62 @@ void MachOWriter::writeSectionData(raw_ostream &OS) {
break;
}
}
- // Old PPC Object Files didn't have __LINKEDIT segments, the data was just
- // stuck at the end of the file.
- if (!FoundLinkEditSeg)
- writeLinkEditData(OS);
+
+ return Error::success();
+}
+
+// The implementation of makeRelocationInfo and makeScatteredRelocationInfo is
+// consistent with how libObject parses MachO binary files. For reference,
+// see getStruct, getRelocation, getPlainRelocationPCRel,
+// getPlainRelocationLength, and related methods in MachOObjectFile.cpp.
+static MachO::any_relocation_info
+makeRelocationInfo(const MachOYAML::Relocation &R, bool IsLE) {
+ assert(!R.is_scattered && "non-scattered relocation expected");
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = R.address;
+ if (IsLE)
+ MRE.r_word1 = ((unsigned)R.symbolnum << 0) | ((unsigned)R.is_pcrel << 24) |
+ ((unsigned)R.length << 25) | ((unsigned)R.is_extern << 27) |
+ ((unsigned)R.type << 28);
+ else
+ MRE.r_word1 = ((unsigned)R.symbolnum << 8) | ((unsigned)R.is_pcrel << 7) |
+ ((unsigned)R.length << 5) | ((unsigned)R.is_extern << 4) |
+ ((unsigned)R.type << 0);
+ return MRE;
+}
+
+static MachO::any_relocation_info
+makeScatteredRelocationInfo(const MachOYAML::Relocation &R) {
+ assert(R.is_scattered && "scattered relocation expected");
+ MachO::any_relocation_info MRE;
+ MRE.r_word0 = (((unsigned)R.address << 0) | ((unsigned)R.type << 24) |
+ ((unsigned)R.length << 28) | ((unsigned)R.is_pcrel << 30) |
+ MachO::R_SCATTERED);
+ MRE.r_word1 = R.value;
+ return MRE;
+}
+
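The two helpers above pack the classic MachO relocation bitfields by hand so the result matches what libObject expects when reading the file back. A self-contained illustration of the little-endian, non-scattered layout (arbitrary field values):

  #include <cstdint>
  #include <cstdio>

  int main() {
    // Arbitrary non-scattered relocation fields.
    unsigned Symbolnum = 5; // 24-bit symbol/section index
    unsigned IsPCRel   = 1; // 1 bit
    unsigned Length    = 2; // 2 bits (log2 of the relocation width)
    unsigned IsExtern  = 1; // 1 bit
    unsigned Type      = 0; // 4 bits
    // Same shifts as makeRelocationInfo's little-endian branch above.
    uint32_t Word1 = (Symbolnum << 0) | (IsPCRel << 24) | (Length << 25) |
                     (IsExtern << 27) | (Type << 28);
    std::printf("r_word1 = 0x%08x\n", Word1); // prints r_word1 = 0x0d000005
  }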
+void MachOWriter::writeRelocations(raw_ostream &OS) {
+ for (const MachOYAML::LoadCommand &LC : Obj.LoadCommands) {
+ switch (LC.Data.load_command_data.cmd) {
+ case MachO::LC_SEGMENT:
+ case MachO::LC_SEGMENT_64:
+ for (const MachOYAML::Section &Sec : LC.Sections) {
+ if (Sec.relocations.empty())
+ continue;
+ ZeroToOffset(OS, Sec.reloff);
+ for (const MachOYAML::Relocation &R : Sec.relocations) {
+ MachO::any_relocation_info MRE =
+ R.is_scattered ? makeScatteredRelocationInfo(R)
+ : makeRelocationInfo(R, Obj.IsLittleEndian);
+ if (Obj.IsLittleEndian != sys::IsLittleEndianHost)
+ MachO::swapStruct(MRE);
+ OS.write(reinterpret_cast<const char *>(&MRE),
+ sizeof(MachO::any_relocation_info));
+ }
+ }
+ }
+ }
}
void MachOWriter::writeBindOpcodes(
@@ -470,7 +542,7 @@ public:
UniversalWriter(yaml::YamlObjectFile &ObjectFile)
: ObjectFile(ObjectFile), fileStart(0) {}
- void writeMachO(raw_ostream &OS);
+ Error writeMachO(raw_ostream &OS);
private:
void writeFatHeader(raw_ostream &OS);
@@ -482,28 +554,33 @@ private:
uint64_t fileStart;
};
-void UniversalWriter::writeMachO(raw_ostream &OS) {
+Error UniversalWriter::writeMachO(raw_ostream &OS) {
fileStart = OS.tell();
if (ObjectFile.MachO) {
MachOWriter Writer(*ObjectFile.MachO);
- Writer.writeMachO(OS);
- return;
+ return Writer.writeMachO(OS);
}
writeFatHeader(OS);
writeFatArchs(OS);
auto &FatFile = *ObjectFile.FatMachO;
- assert(FatFile.FatArchs.size() >= FatFile.Slices.size() &&
- "Cannot write Slices if not decribed in FatArches");
+ if (FatFile.FatArchs.size() < FatFile.Slices.size())
+ return createStringError(
+ errc::invalid_argument,
+ "cannot write 'Slices' if not described in 'FatArches'");
+
for (size_t i = 0; i < FatFile.Slices.size(); i++) {
ZeroToOffset(OS, FatFile.FatArchs[i].offset);
MachOWriter Writer(FatFile.Slices[i]);
- Writer.writeMachO(OS);
+ if (Error Err = Writer.writeMachO(OS))
+ return Err;
auto SliceEnd = FatFile.FatArchs[i].offset + FatFile.FatArchs[i].size;
ZeroToOffset(OS, SliceEnd);
}
+
+ return Error::success();
}
void UniversalWriter::writeFatHeader(raw_ostream &OS) {
@@ -571,9 +648,13 @@ void UniversalWriter::ZeroToOffset(raw_ostream &OS, size_t Offset) {
namespace llvm {
namespace yaml {
-bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler /*EH*/) {
+bool yaml2macho(YamlObjectFile &Doc, raw_ostream &Out, ErrorHandler EH) {
UniversalWriter Writer(Doc);
- Writer.writeMachO(Out);
+ if (Error Err = Writer.writeMachO(Out)) {
+ handleAllErrors(std::move(Err),
+ [&](const ErrorInfoBase &Err) { EH(Err.message()); });
+ return false;
+ }
return true;
}
diff --git a/llvm/lib/ObjectYAML/MachOYAML.cpp b/llvm/lib/ObjectYAML/MachOYAML.cpp
index 0f7cd1e1495c..86aad0233767 100644
--- a/llvm/lib/ObjectYAML/MachOYAML.cpp
+++ b/llvm/lib/ObjectYAML/MachOYAML.cpp
@@ -107,6 +107,8 @@ void MappingTraits<MachOYAML::Object>::mapping(IO &IO,
Object.DWARF.IsLittleEndian = Object.IsLittleEndian;
IO.mapRequired("FileHeader", Object.Header);
+ Object.DWARF.Is64BitAddrSize = Object.Header.magic == MachO::MH_MAGIC_64 ||
+ Object.Header.magic == MachO::MH_CIGAM_64;
IO.mapOptional("LoadCommands", Object.LoadCommands);
if(!Object.LinkEdit.isEmpty() || !IO.outputting())
IO.mapOptional("LinkEditData", Object.LinkEdit);
@@ -273,6 +275,18 @@ void MappingTraits<MachO::dyld_info_command>::mapping(
IO.mapRequired("export_size", LoadCommand.export_size);
}
+void MappingTraits<MachOYAML::Relocation>::mapping(
+ IO &IO, MachOYAML::Relocation &Relocation) {
+ IO.mapRequired("address", Relocation.address);
+ IO.mapRequired("symbolnum", Relocation.symbolnum);
+ IO.mapRequired("pcrel", Relocation.is_pcrel);
+ IO.mapRequired("length", Relocation.length);
+ IO.mapRequired("extern", Relocation.is_extern);
+ IO.mapRequired("type", Relocation.type);
+ IO.mapRequired("scattered", Relocation.is_scattered);
+ IO.mapRequired("value", Relocation.value);
+}
+
void MappingTraits<MachOYAML::Section>::mapping(IO &IO,
MachOYAML::Section &Section) {
IO.mapRequired("sectname", Section.sectname);
@@ -288,6 +302,7 @@ void MappingTraits<MachOYAML::Section>::mapping(IO &IO,
IO.mapRequired("reserved2", Section.reserved2);
IO.mapOptional("reserved3", Section.reserved3);
IO.mapOptional("content", Section.content);
+ IO.mapOptional("relocations", Section.relocations);
}
StringRef
diff --git a/llvm/lib/ObjectYAML/WasmEmitter.cpp b/llvm/lib/ObjectYAML/WasmEmitter.cpp
index debc040587a8..cbb062d87ae6 100644
--- a/llvm/lib/ObjectYAML/WasmEmitter.cpp
+++ b/llvm/lib/ObjectYAML/WasmEmitter.cpp
@@ -41,8 +41,8 @@ private:
void writeSectionContent(raw_ostream &OS, WasmYAML::FunctionSection &Section);
void writeSectionContent(raw_ostream &OS, WasmYAML::TableSection &Section);
void writeSectionContent(raw_ostream &OS, WasmYAML::MemorySection &Section);
- void writeSectionContent(raw_ostream &OS, WasmYAML::GlobalSection &Section);
void writeSectionContent(raw_ostream &OS, WasmYAML::EventSection &Section);
+ void writeSectionContent(raw_ostream &OS, WasmYAML::GlobalSection &Section);
void writeSectionContent(raw_ostream &OS, WasmYAML::ExportSection &Section);
void writeSectionContent(raw_ostream &OS, WasmYAML::StartSection &Section);
void writeSectionContent(raw_ostream &OS, WasmYAML::ElemSection &Section);
@@ -415,6 +415,21 @@ void WasmWriter::writeSectionContent(raw_ostream &OS,
}
void WasmWriter::writeSectionContent(raw_ostream &OS,
+ WasmYAML::EventSection &Section) {
+ encodeULEB128(Section.Events.size(), OS);
+ uint32_t ExpectedIndex = NumImportedEvents;
+ for (auto &Event : Section.Events) {
+ if (Event.Index != ExpectedIndex) {
+ reportError("unexpected event index: " + Twine(Event.Index));
+ return;
+ }
+ ++ExpectedIndex;
+ encodeULEB128(Event.Attribute, OS);
+ encodeULEB128(Event.SigIndex, OS);
+ }
+}
+
+void WasmWriter::writeSectionContent(raw_ostream &OS,
WasmYAML::GlobalSection &Section) {
encodeULEB128(Section.Globals.size(), OS);
uint32_t ExpectedIndex = NumImportedGlobals;
@@ -431,21 +446,6 @@ void WasmWriter::writeSectionContent(raw_ostream &OS,
}
void WasmWriter::writeSectionContent(raw_ostream &OS,
- WasmYAML::EventSection &Section) {
- encodeULEB128(Section.Events.size(), OS);
- uint32_t ExpectedIndex = NumImportedEvents;
- for (auto &Event : Section.Events) {
- if (Event.Index != ExpectedIndex) {
- reportError("unexpected event index: " + Twine(Event.Index));
- return;
- }
- ++ExpectedIndex;
- encodeULEB128(Event.Attribute, OS);
- encodeULEB128(Event.SigIndex, OS);
- }
-}
-
-void WasmWriter::writeSectionContent(raw_ostream &OS,
WasmYAML::ElemSection &Section) {
encodeULEB128(Section.Segments.size(), OS);
for (auto &Segment : Section.Segments) {
@@ -532,8 +532,11 @@ void WasmWriter::writeRelocSection(raw_ostream &OS, WasmYAML::Section &Sec,
encodeULEB128(Reloc.Index, OS);
switch (Reloc.Type) {
case wasm::R_WASM_MEMORY_ADDR_LEB:
+ case wasm::R_WASM_MEMORY_ADDR_LEB64:
case wasm::R_WASM_MEMORY_ADDR_SLEB:
+ case wasm::R_WASM_MEMORY_ADDR_SLEB64:
case wasm::R_WASM_MEMORY_ADDR_I32:
+ case wasm::R_WASM_MEMORY_ADDR_I64:
case wasm::R_WASM_FUNCTION_OFFSET_I32:
case wasm::R_WASM_SECTION_OFFSET_I32:
encodeULEB128(Reloc.Addend, OS);
@@ -571,10 +574,10 @@ bool WasmWriter::writeWasm(raw_ostream &OS) {
writeSectionContent(StringStream, *S);
else if (auto S = dyn_cast<WasmYAML::MemorySection>(Sec.get()))
writeSectionContent(StringStream, *S);
- else if (auto S = dyn_cast<WasmYAML::GlobalSection>(Sec.get()))
- writeSectionContent(StringStream, *S);
else if (auto S = dyn_cast<WasmYAML::EventSection>(Sec.get()))
writeSectionContent(StringStream, *S);
+ else if (auto S = dyn_cast<WasmYAML::GlobalSection>(Sec.get()))
+ writeSectionContent(StringStream, *S);
else if (auto S = dyn_cast<WasmYAML::ExportSection>(Sec.get()))
writeSectionContent(StringStream, *S);
else if (auto S = dyn_cast<WasmYAML::StartSection>(Sec.get()))
diff --git a/llvm/lib/ObjectYAML/WasmYAML.cpp b/llvm/lib/ObjectYAML/WasmYAML.cpp
index 232d5122004a..d1aa1181a344 100644
--- a/llvm/lib/ObjectYAML/WasmYAML.cpp
+++ b/llvm/lib/ObjectYAML/WasmYAML.cpp
@@ -118,14 +118,14 @@ static void sectionMapping(IO &IO, WasmYAML::MemorySection &Section) {
IO.mapOptional("Memories", Section.Memories);
}
-static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
+static void sectionMapping(IO &IO, WasmYAML::EventSection &Section) {
commonSectionMapping(IO, Section);
- IO.mapOptional("Globals", Section.Globals);
+ IO.mapOptional("Events", Section.Events);
}
-static void sectionMapping(IO &IO, WasmYAML::EventSection &Section) {
+static void sectionMapping(IO &IO, WasmYAML::GlobalSection &Section) {
commonSectionMapping(IO, Section);
- IO.mapOptional("Events", Section.Events);
+ IO.mapOptional("Globals", Section.Globals);
}
static void sectionMapping(IO &IO, WasmYAML::ExportSection &Section) {
@@ -227,16 +227,16 @@ void MappingTraits<std::unique_ptr<WasmYAML::Section>>::mapping(
Section.reset(new WasmYAML::MemorySection());
sectionMapping(IO, *cast<WasmYAML::MemorySection>(Section.get()));
break;
- case wasm::WASM_SEC_GLOBAL:
- if (!IO.outputting())
- Section.reset(new WasmYAML::GlobalSection());
- sectionMapping(IO, *cast<WasmYAML::GlobalSection>(Section.get()));
- break;
case wasm::WASM_SEC_EVENT:
if (!IO.outputting())
Section.reset(new WasmYAML::EventSection());
sectionMapping(IO, *cast<WasmYAML::EventSection>(Section.get()));
break;
+ case wasm::WASM_SEC_GLOBAL:
+ if (!IO.outputting())
+ Section.reset(new WasmYAML::GlobalSection());
+ sectionMapping(IO, *cast<WasmYAML::GlobalSection>(Section.get()));
+ break;
case wasm::WASM_SEC_EXPORT:
if (!IO.outputting())
Section.reset(new WasmYAML::ExportSection());
@@ -433,6 +433,11 @@ void MappingTraits<wasm::WasmInitExpr>::mapping(IO &IO,
case wasm::WASM_OPCODE_GLOBAL_GET:
IO.mapRequired("Index", Expr.Value.Global);
break;
+ case wasm::WASM_OPCODE_REF_NULL: {
+ WasmYAML::ValueType Ty = wasm::WASM_TYPE_EXTERNREF;
+ IO.mapRequired("Type", Ty);
+ break;
+ }
}
}
@@ -517,6 +522,7 @@ void ScalarBitSetTraits<WasmYAML::LimitFlags>::bitset(
#define BCase(X) IO.bitSetCase(Value, #X, wasm::WASM_LIMITS_FLAG_##X)
BCase(HAS_MAX);
BCase(IS_SHARED);
+ BCase(IS_64);
#undef BCase
}
@@ -559,6 +565,8 @@ void ScalarEnumerationTraits<WasmYAML::ValueType>::enumeration(
ECase(F64);
ECase(V128);
ECase(FUNCREF);
+ ECase(EXNREF);
+ ECase(EXTERNREF);
ECase(FUNC);
#undef ECase
}
@@ -583,6 +591,7 @@ void ScalarEnumerationTraits<WasmYAML::Opcode>::enumeration(
ECase(F64_CONST);
ECase(F32_CONST);
ECase(GLOBAL_GET);
+ ECase(REF_NULL);
#undef ECase
}
diff --git a/llvm/lib/ObjectYAML/yaml2obj.cpp b/llvm/lib/ObjectYAML/yaml2obj.cpp
index c18fa5cfdb5e..a04345f1294a 100644
--- a/llvm/lib/ObjectYAML/yaml2obj.cpp
+++ b/llvm/lib/ObjectYAML/yaml2obj.cpp
@@ -19,7 +19,7 @@ namespace llvm {
namespace yaml {
bool convertYAML(yaml::Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler,
- unsigned DocNum) {
+ unsigned DocNum, uint64_t MaxSize) {
unsigned CurDocNum = 0;
do {
if (++CurDocNum != DocNum)
@@ -33,7 +33,7 @@ bool convertYAML(yaml::Input &YIn, raw_ostream &Out, ErrorHandler ErrHandler,
}
if (Doc.Elf)
- return yaml2elf(*Doc.Elf, Out, ErrHandler);
+ return yaml2elf(*Doc.Elf, Out, ErrHandler, MaxSize);
if (Doc.Coff)
return yaml2coff(*Doc.Coff, Out, ErrHandler);
if (Doc.MachO || Doc.FatMachO)
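A sketch of driving the updated entry point with the new MaxSize parameter, which is forwarded to yaml2elf above. It assumes the ErrorHandler callback takes a Twine, as suggested by the EH(Err.message()) call in the MachO emitter earlier in this diff; exact signatures may differ between versions:

  #include "llvm/ADT/Twine.h"
  #include "llvm/ObjectYAML/yaml2obj.h"
  #include "llvm/Support/YAMLTraits.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  // Converts the first YAML document in Yaml to a binary object, passing the
  // size limit through to the ELF emitter.
  static bool emitFromYAML(StringRef Yaml, raw_ostream &Out, uint64_t MaxSize) {
    yaml::Input YIn(Yaml);
    return yaml::convertYAML(
        YIn, Out, [](const Twine &Msg) { errs() << Msg << "\n"; },
        /*DocNum=*/1, MaxSize);
  }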
diff --git a/llvm/lib/Option/Arg.cpp b/llvm/lib/Option/Arg.cpp
index ea382b347345..2da32bfacf30 100644
--- a/llvm/lib/Option/Arg.cpp
+++ b/llvm/lib/Option/Arg.cpp
@@ -81,7 +81,7 @@ std::string Arg::getAsString(const ArgList &Args) const {
OS << *it;
}
- return OS.str();
+ return std::string(OS.str());
}
void Arg::renderAsInput(const ArgList &Args, ArgStringList &Output) const {
diff --git a/llvm/lib/Option/ArgList.cpp b/llvm/lib/Option/ArgList.cpp
index 09e921502eb6..ad7be5fbec19 100644
--- a/llvm/lib/Option/ArgList.cpp
+++ b/llvm/lib/Option/ArgList.cpp
@@ -209,7 +209,7 @@ unsigned InputArgList::MakeIndex(StringRef String0) const {
unsigned Index = ArgStrings.size();
// Tuck away so we have a reliable const char *.
- SynthesizedStrings.push_back(String0);
+ SynthesizedStrings.push_back(std::string(String0));
ArgStrings.push_back(SynthesizedStrings.back().c_str());
return Index;
diff --git a/llvm/lib/Option/OptTable.cpp b/llvm/lib/Option/OptTable.cpp
index 5833d03069f8..926eb8e0437f 100644
--- a/llvm/lib/Option/OptTable.cpp
+++ b/llvm/lib/Option/OptTable.cpp
@@ -219,7 +219,7 @@ OptTable::suggestValueCompletions(StringRef Option, StringRef Arg) const {
std::vector<std::string> Result;
for (StringRef Val : Candidates)
if (Val.startswith(Arg) && Arg.compare(Val))
- Result.push_back(Val);
+ Result.push_back(std::string(Val));
return Result;
}
return {};
@@ -283,10 +283,10 @@ unsigned OptTable::findNearest(StringRef Option, std::string &NearestString,
StringRef LHS, RHS;
char Last = CandidateName.back();
bool CandidateHasDelimiter = Last == '=' || Last == ':';
- std::string NormalizedName = Option;
+ std::string NormalizedName = std::string(Option);
if (CandidateHasDelimiter) {
std::tie(LHS, RHS) = Option.split(Last);
- NormalizedName = LHS;
+ NormalizedName = std::string(LHS);
if (Option.find(Last) == LHS.size())
NormalizedName += Last;
}
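The std::string(...) wrappers added in the Option changes above follow from StringRef's conversion to std::string becoming explicit; implicit copies now have to be spelled out. A minimal self-contained illustration of the pattern, using a stand-in Ref type rather than llvm::StringRef:

  #include <string>
  #include <vector>

  // Stand-in for a string-view-like type whose conversion to std::string is
  // explicit, as with llvm::StringRef after this change.
  struct Ref {
    const char *Data;
    explicit operator std::string() const { return std::string(Data); }
  };

  int main() {
    Ref R{"hello"};
    std::vector<std::string> V;
    // V.push_back(R);           // no longer compiles: the conversion is explicit
    V.push_back(std::string(R)); // spell the copy out, as in the hunks above
  }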
diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp
index 53b7db8689c4..4db7bebcb77c 100644
--- a/llvm/lib/Passes/PassBuilder.cpp
+++ b/llvm/lib/Passes/PassBuilder.cpp
@@ -33,11 +33,15 @@
#include "llvm/Analysis/DominanceFrontier.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/InlineAdvisor.h"
+#include "llvm/Analysis/InlineFeaturesAnalysis.h"
+#include "llvm/Analysis/InlineSizeEstimatorAnalysis.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/LazyValueInfo.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopCacheAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopNestAnalysis.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ModuleSummaryAnalysis.h"
@@ -49,13 +53,11 @@
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/ScopedNoAliasAA.h"
+#include "llvm/Analysis/StackLifetime.h"
#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/TypeBasedAliasAnalysis.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/PreISelIntrinsicLowering.h"
-#include "llvm/CodeGen/UnreachableBlockElim.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRPrintingPasses.h"
#include "llvm/IR/PassManager.h"
@@ -67,6 +69,10 @@
#include "llvm/Support/Regex.h"
#include "llvm/Target/TargetMachine.h"
#include "llvm/Transforms/AggressiveInstCombine/AggressiveInstCombine.h"
+#include "llvm/Transforms/Coroutines/CoroCleanup.h"
+#include "llvm/Transforms/Coroutines/CoroEarly.h"
+#include "llvm/Transforms/Coroutines/CoroElide.h"
+#include "llvm/Transforms/Coroutines/CoroSplit.h"
#include "llvm/Transforms/IPO/AlwaysInliner.h"
#include "llvm/Transforms/IPO/ArgumentPromotion.h"
#include "llvm/Transforms/IPO/Attributor.h"
@@ -87,6 +93,7 @@
#include "llvm/Transforms/IPO/Internalize.h"
#include "llvm/Transforms/IPO/LowerTypeTests.h"
#include "llvm/Transforms/IPO/MergeFunctions.h"
+#include "llvm/Transforms/IPO/OpenMPOpt.h"
#include "llvm/Transforms/IPO/PartialInlining.h"
#include "llvm/Transforms/IPO/SCCP.h"
#include "llvm/Transforms/IPO/SampleProfile.h"
@@ -169,8 +176,10 @@
#include "llvm/Transforms/Scalar/TailRecursionElimination.h"
#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
#include "llvm/Transforms/Utils/AddDiscriminators.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/BreakCriticalEdges.h"
#include "llvm/Transforms/Utils/CanonicalizeAliases.h"
+#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
#include "llvm/Transforms/Utils/EntryExitInstrumenter.h"
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LCSSA.h"
@@ -183,6 +192,7 @@
#include "llvm/Transforms/Vectorize/LoadStoreVectorizer.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
using namespace llvm;
@@ -207,6 +217,16 @@ static cl::opt<bool> EnableGVNHoist(
"enable-npm-gvn-hoist", cl::init(false), cl::Hidden,
cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
+static cl::opt<InliningAdvisorMode> UseInlineAdvisor(
+ "enable-ml-inliner", cl::init(InliningAdvisorMode::Default), cl::Hidden,
+ cl::desc("Enable ML policy for inliner. Currently trained for -Oz only"),
+ cl::values(clEnumValN(InliningAdvisorMode::Default, "default",
+ "Heuristics-based inliner version."),
+ clEnumValN(InliningAdvisorMode::Development, "development",
+ "Use development mode (runtime-loadable model)."),
+ clEnumValN(InliningAdvisorMode::Release, "release",
+ "Use release mode (AOT-compiled model).")));
+
static cl::opt<bool> EnableGVNSink(
"enable-npm-gvn-sink", cl::init(false), cl::Hidden,
cl::desc("Enable the GVN hoisting pass for the new PM (default = off)"));
@@ -229,14 +249,22 @@ static cl::opt<bool>
EnableCHR("enable-chr-npm", cl::init(true), cl::Hidden,
cl::desc("Enable control height reduction optimization (CHR)"));
+/// Flag to enable inline deferral during PGO.
+static cl::opt<bool>
+ EnablePGOInlineDeferral("enable-npm-pgo-inline-deferral", cl::init(true),
+ cl::Hidden,
+ cl::desc("Enable inline deferral during PGO"));
+
PipelineTuningOptions::PipelineTuningOptions() {
- LoopInterleaving = EnableLoopInterleaving;
- LoopVectorization = EnableLoopVectorization;
- SLPVectorization = RunSLPVectorization;
+ LoopInterleaving = true;
+ LoopVectorization = true;
+ SLPVectorization = false;
LoopUnrolling = true;
ForgetAllSCEVInLoopUnroll = ForgetSCEVInLoopUnroll;
+ Coroutines = false;
LicmMssaOptCap = SetLicmMssaOptCap;
LicmMssaNoAccForPromotionCap = SetLicmMssaNoAccForPromotionCap;
+ CallGraphProfile = true;
}
extern cl::opt<bool> EnableHotColdSplit;
@@ -244,28 +272,40 @@ extern cl::opt<bool> EnableOrderFileInstrumentation;
extern cl::opt<bool> FlattenedProfileUsed;
-static bool isOptimizingForSize(PassBuilder::OptimizationLevel Level) {
- switch (Level) {
- case PassBuilder::O0:
- case PassBuilder::O1:
- case PassBuilder::O2:
- case PassBuilder::O3:
- return false;
-
- case PassBuilder::Os:
- case PassBuilder::Oz:
- return true;
- }
- llvm_unreachable("Invalid optimization level!");
-}
+extern cl::opt<AttributorRunOption> AttributorRun;
+extern cl::opt<bool> EnableKnowledgeRetention;
+
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O0 = {
+ /*SpeedLevel*/ 0,
+ /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O1 = {
+ /*SpeedLevel*/ 1,
+ /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O2 = {
+ /*SpeedLevel*/ 2,
+ /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::O3 = {
+ /*SpeedLevel*/ 3,
+ /*SizeLevel*/ 0};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Os = {
+ /*SpeedLevel*/ 2,
+ /*SizeLevel*/ 1};
+const PassBuilder::OptimizationLevel PassBuilder::OptimizationLevel::Oz = {
+ /*SpeedLevel*/ 2,
+ /*SizeLevel*/ 2};
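With levels now encoded as a (SpeedLevel, SizeLevel) pair, comparisons like Level > O3 give way to explicit queries. A small sketch of the encoding, assuming the accessors used elsewhere in this patch (getSpeedupLevel, getSizeLevel, isOptimizingForSize) behave as their names suggest:

  #include <cassert>

  struct OptLevel {
    unsigned SpeedLevel;
    unsigned SizeLevel;
    unsigned getSpeedupLevel() const { return SpeedLevel; }
    unsigned getSizeLevel() const { return SizeLevel; }
    bool isOptimizingForSize() const { return SizeLevel > 0; }
  };

  int main() {
    const OptLevel O2{/*SpeedLevel=*/2, /*SizeLevel=*/0};
    const OptLevel Os{/*SpeedLevel=*/2, /*SizeLevel=*/1};
    const OptLevel Oz{/*SpeedLevel=*/2, /*SizeLevel=*/2};
    // -Os and -Oz keep O2's speed level and differ only in the size level.
    assert(!O2.isOptimizingForSize());
    assert(Os.isOptimizingForSize() && Os.getSpeedupLevel() == 2);
    assert(Oz.getSizeLevel() == 2);
  }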
namespace {
+// The following passes/analyses have custom names, otherwise their name will
+// include `(anonymous namespace)`. These are special since they are only for
+// testing purposes and don't live in a header file.
+
/// No-op module pass which does nothing.
-struct NoOpModulePass {
+struct NoOpModulePass : PassInfoMixin<NoOpModulePass> {
PreservedAnalyses run(Module &M, ModuleAnalysisManager &) {
return PreservedAnalyses::all();
}
+
static StringRef name() { return "NoOpModulePass"; }
};
@@ -281,7 +321,7 @@ public:
};
/// No-op CGSCC pass which does nothing.
-struct NoOpCGSCCPass {
+struct NoOpCGSCCPass : PassInfoMixin<NoOpCGSCCPass> {
PreservedAnalyses run(LazyCallGraph::SCC &C, CGSCCAnalysisManager &,
LazyCallGraph &, CGSCCUpdateResult &UR) {
return PreservedAnalyses::all();
@@ -303,7 +343,7 @@ public:
};
/// No-op function pass which does nothing.
-struct NoOpFunctionPass {
+struct NoOpFunctionPass : PassInfoMixin<NoOpFunctionPass> {
PreservedAnalyses run(Function &F, FunctionAnalysisManager &) {
return PreservedAnalyses::all();
}
@@ -322,7 +362,7 @@ public:
};
/// No-op loop pass which does nothing.
-struct NoOpLoopPass {
+struct NoOpLoopPass : PassInfoMixin<NoOpLoopPass> {
PreservedAnalyses run(Loop &L, LoopAnalysisManager &,
LoopStandardAnalysisResults &, LPMUpdater &) {
return PreservedAnalyses::all();
@@ -348,7 +388,7 @@ AnalysisKey NoOpCGSCCAnalysis::Key;
AnalysisKey NoOpFunctionAnalysis::Key;
AnalysisKey NoOpLoopAnalysis::Key;
-} // End anonymous namespace.
+} // namespace
void PassBuilder::invokePeepholeEPCallbacks(
FunctionPassManager &FPM, PassBuilder::OptimizationLevel Level) {
@@ -392,11 +432,138 @@ void PassBuilder::registerLoopAnalyses(LoopAnalysisManager &LAM) {
C(LAM);
}
+// TODO: Investigate the cost/benefit of tail call elimination on debugging.
+FunctionPassManager PassBuilder::buildO1FunctionSimplificationPipeline(
+ OptimizationLevel Level, ThinLTOPhase Phase, bool DebugLogging) {
+
+ FunctionPassManager FPM(DebugLogging);
+
+ // Form SSA out of local memory accesses after breaking apart aggregates into
+ // scalars.
+ FPM.addPass(SROA());
+
+ // Catch trivial redundancies
+ FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
+
+ // Hoisting of scalars and load expressions.
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+
+ FPM.addPass(LibCallsShrinkWrapPass());
+
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ FPM.addPass(SimplifyCFGPass());
+
+ // Form canonically associated expression trees, and simplify the trees using
+ // basic mathematical properties. For example, this will form (nearly)
+ // minimal multiplication trees.
+ FPM.addPass(ReassociatePass());
+
+ // Add the primary loop simplification pipeline.
+ // FIXME: Currently this is split into two loop pass pipelines because we run
+ // some function passes in between them. These can and should be removed
+ // and/or replaced by scheduling the loop pass equivalents in the correct
+ // positions. But those equivalent passes aren't powerful enough yet.
+ // Specifically, `SimplifyCFGPass` and `InstCombinePass` are currently still
+ // used. We have `LoopSimplifyCFGPass` which isn't powerful enough yet to
+ // fully replace `SimplifyCFGPass`, and the closest to the other we have is
+ // `LoopInstSimplify`.
+ LoopPassManager LPM1(DebugLogging), LPM2(DebugLogging);
+
+ // Simplify the loop body. We do this initially to clean up after other loop
+ // passes run, either when iterating on a loop or on inner loops with
+ // implications on the outer loop.
+ LPM1.addPass(LoopInstSimplifyPass());
+ LPM1.addPass(LoopSimplifyCFGPass());
+
+ LPM1.addPass(LoopRotatePass(/* Disable header duplication */ true));
+ // TODO: Investigate promotion cap for O1.
+ LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
+ LPM1.addPass(SimpleLoopUnswitchPass());
+ LPM2.addPass(IndVarSimplifyPass());
+ LPM2.addPass(LoopIdiomRecognizePass());
+
+ for (auto &C : LateLoopOptimizationsEPCallbacks)
+ C(LPM2, Level);
+
+ LPM2.addPass(LoopDeletionPass());
+ // Do not enable unrolling in PreLinkThinLTO phase during sample PGO
+ // because it changes the IR in a way that makes profile annotation in the
+ // backend compile inaccurate. The normal unroller doesn't pay attention to
+ // forced full-unroll attributes, so we need to make sure the full unroll
+ // pass pays attention to them.
+ if (Phase != ThinLTOPhase::PreLink || !PGOOpt ||
+ PGOOpt->Action != PGOOptions::SampleUse)
+ LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+ /* OnlyWhenForced= */ !PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll));
+
+ for (auto &C : LoopOptimizerEndEPCallbacks)
+ C(LPM2, Level);
+
+ // We provide the opt remark emitter pass for LICM to use. We only need to do
+ // this once as it is immutable.
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ std::move(LPM1), EnableMSSALoopDependency, DebugLogging));
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+ // The loop passes in LPM2 (LoopFullUnrollPass) do not preserve MemorySSA.
+ // *All* loop passes must preserve it, in order to be able to use it.
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ std::move(LPM2), /*UseMemorySSA=*/false, DebugLogging));
+
+ // Delete small array after loop unroll.
+ FPM.addPass(SROA());
+
+ // Specially optimize memory movement as it doesn't look like dataflow in SSA.
+ FPM.addPass(MemCpyOptPass());
+
+ // Sparse conditional constant propagation.
+ // FIXME: It isn't clear why we do this *after* loop passes rather than
+ // before...
+ FPM.addPass(SCCPPass());
+
+ // Delete dead bit computations (instcombine runs after to fold away the dead
+ // computations, and then ADCE will run later to exploit any new DCE
+ // opportunities that creates).
+ FPM.addPass(BDCEPass());
+
+ // Run instcombine after redundancy and dead bit elimination to exploit
+ // opportunities opened up by them.
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ if (PTO.Coroutines)
+ FPM.addPass(CoroElidePass());
+
+ for (auto &C : ScalarOptimizerLateEPCallbacks)
+ C(FPM, Level);
+
+ // Finally, do an expensive DCE pass to catch all the dead code exposed by
+ // the simplifications and basic cleanup after all the simplifications.
+ // TODO: Investigate if this is too expensive.
+ FPM.addPass(ADCEPass());
+ FPM.addPass(SimplifyCFGPass());
+ FPM.addPass(InstCombinePass());
+ invokePeepholeEPCallbacks(FPM, Level);
+
+ return FPM;
+}
+
FunctionPassManager
PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
ThinLTOPhase Phase,
bool DebugLogging) {
- assert(Level != O0 && "Must request optimizations!");
+ assert(Level != OptimizationLevel::O0 && "Must request optimizations!");
+
+ // The O1 pipeline has a separate pipeline creation function to simplify
+ // construction readability.
+ if (Level.getSpeedupLevel() == 1)
+ return buildO1FunctionSimplificationPipeline(Level, Phase, DebugLogging);
+
FunctionPassManager FPM(DebugLogging);
// Form SSA out of local memory accesses after breaking apart aggregates into
@@ -405,33 +572,32 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// Catch trivial redundancies
FPM.addPass(EarlyCSEPass(true /* Enable mem-ssa. */));
+ if (EnableKnowledgeRetention)
+ FPM.addPass(AssumeSimplifyPass());
// Hoisting of scalars and load expressions.
- if (Level > O1) {
- if (EnableGVNHoist)
- FPM.addPass(GVNHoistPass());
-
- // Global value numbering based sinking.
- if (EnableGVNSink) {
- FPM.addPass(GVNSinkPass());
- FPM.addPass(SimplifyCFGPass());
- }
+ if (EnableGVNHoist)
+ FPM.addPass(GVNHoistPass());
+
+ // Global value numbering based sinking.
+ if (EnableGVNSink) {
+ FPM.addPass(GVNSinkPass());
+ FPM.addPass(SimplifyCFGPass());
}
// Speculative execution if the target has divergent branches; otherwise nop.
- if (Level > O1) {
- FPM.addPass(SpeculativeExecutionPass());
+ FPM.addPass(SpeculativeExecutionPass(/* OnlyIfDivergentTarget =*/true));
+
+ // Optimize based on known information about branches, and cleanup afterward.
+ FPM.addPass(JumpThreadingPass());
+ FPM.addPass(CorrelatedValuePropagationPass());
- // Optimize based on known information about branches, and cleanup afterward.
- FPM.addPass(JumpThreadingPass());
- FPM.addPass(CorrelatedValuePropagationPass());
- }
FPM.addPass(SimplifyCFGPass());
- if (Level == O3)
+ if (Level == OptimizationLevel::O3)
FPM.addPass(AggressiveInstCombinePass());
FPM.addPass(InstCombinePass());
- if (!isOptimizingForSize(Level))
+ if (!Level.isOptimizingForSize())
FPM.addPass(LibCallsShrinkWrapPass());
invokePeepholeEPCallbacks(FPM, Level);
@@ -439,12 +605,10 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// For PGO use pipeline, try to optimize memory intrinsics such as memcpy
// using the size value profile. Don't perform this when optimizing for size.
if (PGOOpt && PGOOpt->Action == PGOOptions::IRUse &&
- !isOptimizingForSize(Level) && Level > O1)
+ !Level.isOptimizingForSize())
FPM.addPass(PGOMemOPSizeOpt());
- // TODO: Investigate the cost/benefit of tail call elimination on debugging.
- if (Level > O1)
- FPM.addPass(TailCallElimPass());
+ FPM.addPass(TailCallElimPass());
FPM.addPass(SimplifyCFGPass());
// Form canonically associated expression trees, and simplify the trees using
@@ -470,7 +634,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
LPM1.addPass(LoopSimplifyCFGPass());
// Rotate Loop - disable header duplication at -Oz
- LPM1.addPass(LoopRotatePass(Level != Oz));
+ LPM1.addPass(LoopRotatePass(Level != OptimizationLevel::Oz));
// TODO: Investigate promotion cap for O1.
LPM1.addPass(LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap));
LPM1.addPass(SimpleLoopUnswitchPass());
@@ -483,11 +647,13 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
LPM2.addPass(LoopDeletionPass());
// Do not enable unrolling in PreLinkThinLTO phase during sample PGO
// because it changes IR to makes profile annotation in back compile
- // inaccurate.
- if ((Phase != ThinLTOPhase::PreLink || !PGOOpt ||
- PGOOpt->Action != PGOOptions::SampleUse) &&
- PTO.LoopUnrolling)
- LPM2.addPass(LoopFullUnrollPass(Level, /*OnlyWhenForced=*/false,
+ // inaccurate. The normal unroller doesn't pay attention to forced full unroll
+ // attributes so we need to make sure and allow the full unroll pass to pay
+ // attention to it.
+ if (Phase != ThinLTOPhase::PreLink || !PGOOpt ||
+ PGOOpt->Action != PGOOptions::SampleUse)
+ LPM2.addPass(LoopFullUnrollPass(Level.getSpeedupLevel(),
+ /* OnlyWhenForced= */ !PTO.LoopUnrolling,
PTO.ForgetAllSCEVInLoopUnroll));
for (auto &C : LoopOptimizerEndEPCallbacks)
@@ -495,7 +661,8 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// We provide the opt remark emitter pass for LICM to use. We only need to do
// this once as it is immutable.
- FPM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
+ FPM.addPass(
+ RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
FPM.addPass(createFunctionToLoopPassAdaptor(
std::move(LPM1), EnableMSSALoopDependency, DebugLogging));
FPM.addPass(SimplifyCFGPass());
@@ -510,14 +677,11 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(SROA());
// Eliminate redundancies.
- if (Level != O1) {
- // These passes add substantial compile time so skip them at O1.
- FPM.addPass(MergedLoadStoreMotionPass());
- if (RunNewGVN)
- FPM.addPass(NewGVNPass());
- else
- FPM.addPass(GVN());
- }
+ FPM.addPass(MergedLoadStoreMotionPass());
+ if (RunNewGVN)
+ FPM.addPass(NewGVNPass());
+ else
+ FPM.addPass(GVN());
// Specially optimize memory movement as it doesn't look like dataflow in SSA.
FPM.addPass(MemCpyOptPass());
@@ -539,14 +703,15 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
// Re-consider control flow based optimizations after redundancy elimination,
// redo DCE, etc.
- if (Level > O1) {
- FPM.addPass(JumpThreadingPass());
- FPM.addPass(CorrelatedValuePropagationPass());
- FPM.addPass(DSEPass());
- FPM.addPass(createFunctionToLoopPassAdaptor(
- LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
- EnableMSSALoopDependency, DebugLogging));
- }
+ FPM.addPass(JumpThreadingPass());
+ FPM.addPass(CorrelatedValuePropagationPass());
+ FPM.addPass(DSEPass());
+ FPM.addPass(createFunctionToLoopPassAdaptor(
+ LICMPass(PTO.LicmMssaOptCap, PTO.LicmMssaNoAccForPromotionCap),
+ EnableMSSALoopDependency, DebugLogging));
+
+ if (PTO.Coroutines)
+ FPM.addPass(CoroElidePass());
for (auto &C : ScalarOptimizerLateEPCallbacks)
C(FPM, Level);
@@ -559,7 +724,7 @@ PassBuilder::buildFunctionSimplificationPipeline(OptimizationLevel Level,
FPM.addPass(InstCombinePass());
invokePeepholeEPCallbacks(FPM, Level);
- if (EnableCHR && Level == O3 && PGOOpt &&
+ if (EnableCHR && Level == OptimizationLevel::O3 && PGOOpt &&
(PGOOpt->Action == PGOOptions::IRUse ||
PGOOpt->Action == PGOOptions::SampleUse))
FPM.addPass(ControlHeightReductionPass());
@@ -572,13 +737,13 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
bool RunProfileGen, bool IsCS,
std::string ProfileFile,
std::string ProfileRemappingFile) {
- assert(Level != O0 && "Not expecting O0 here!");
+ assert(Level != OptimizationLevel::O0 && "Not expecting O0 here!");
// Generally running simplification passes and the inliner with an high
// threshold results in smaller executables, but there may be cases where
// the size grows, so let's be conservative here and skip this simplification
// at -Os/Oz. We will not do this inline for context sensistive PGO (when
// IsCS is true).
- if (!isOptimizingForSize(Level) && !IsCS) {
+ if (!Level.isOptimizingForSize() && !IsCS) {
InlineParams IP;
IP.DefaultThreshold = PreInlineThreshold;
@@ -587,10 +752,8 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
// This should probably be lowered after performance testing.
// FIXME: this comment is cargo culted from the old pass manager, revisit).
IP.HintThreshold = 325;
-
- CGSCCPassManager CGPipeline(DebugLogging);
-
- CGPipeline.addPass(InlinerPass(IP));
+ ModuleInlinerWrapperPass MIWP(IP, DebugLogging);
+ CGSCCPassManager &CGPipeline = MIWP.getPM();
FunctionPassManager FPM;
FPM.addPass(SROA());
@@ -601,7 +764,7 @@ void PassBuilder::addPGOInstrPasses(ModulePassManager &MPM, bool DebugLogging,
CGPipeline.addPass(createCGSCCToFunctionPassAdaptor(std::move(FPM)));
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPipeline)));
+ MPM.addPass(std::move(MIWP));
// Delete anything that is now dead to make sure that we don't instrument
// dead code. Instrumentation can end up keeping dead code around and
@@ -663,16 +826,74 @@ void PassBuilder::addPGOInstrPassesForO0(ModulePassManager &MPM,
static InlineParams
getInlineParamsFromOptLevel(PassBuilder::OptimizationLevel Level) {
- auto O3 = PassBuilder::O3;
- unsigned OptLevel = Level > O3 ? 2 : Level;
- unsigned SizeLevel = Level > O3 ? Level - O3 : 0;
- return getInlineParams(OptLevel, SizeLevel);
+ return getInlineParams(Level.getSpeedupLevel(), Level.getSizeLevel());
}
-ModulePassManager
-PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
- ThinLTOPhase Phase,
- bool DebugLogging) {
+ModuleInlinerWrapperPass
+PassBuilder::buildInlinerPipeline(OptimizationLevel Level, ThinLTOPhase Phase,
+ bool DebugLogging) {
+ InlineParams IP = getInlineParamsFromOptLevel(Level);
+ if (Phase == PassBuilder::ThinLTOPhase::PreLink && PGOOpt &&
+ PGOOpt->Action == PGOOptions::SampleUse)
+ IP.HotCallSiteThreshold = 0;
+
+ if (PGOOpt)
+ IP.EnableDeferral = EnablePGOInlineDeferral;
+
+ ModuleInlinerWrapperPass MIWP(IP, DebugLogging, UseInlineAdvisor,
+ MaxDevirtIterations);
+
+ // Require the GlobalsAA analysis for the module so we can query it within
+ // the CGSCC pipeline.
+ MIWP.addRequiredModuleAnalysis<GlobalsAA>();
+
+ // Require the ProfileSummaryAnalysis for the module so we can query it within
+ // the inliner pass.
+ MIWP.addRequiredModuleAnalysis<ProfileSummaryAnalysis>();
+
+ // Now begin the main postorder CGSCC pipeline.
+ // FIXME: The current CGSCC pipeline has its origins in the legacy pass
+ // manager and trying to emulate its precise behavior. Much of this doesn't
+ // make a lot of sense and we should revisit the core CGSCC structure.
+ CGSCCPassManager &MainCGPipeline = MIWP.getPM();
+
+ // Note: historically, the PruneEH pass was run first to deduce nounwind and
+ // generally clean up exception handling overhead. It isn't clear this is
+ // valuable as the inliner doesn't currently care whether it is inlining an
+ // invoke or a call.
+
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ MainCGPipeline.addPass(AttributorCGSCCPass());
+
+ if (PTO.Coroutines)
+ MainCGPipeline.addPass(CoroSplitPass());
+
+ // Now deduce any function attributes based on the current code.
+ MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
+
+ // When at O3 add argument promotion to the pass pipeline.
+ // FIXME: It isn't at all clear why this should be limited to O3.
+ if (Level == OptimizationLevel::O3)
+ MainCGPipeline.addPass(ArgumentPromotionPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (Level == OptimizationLevel::O2 || Level == OptimizationLevel::O3)
+ MainCGPipeline.addPass(OpenMPOptPass());
+
+ // Lastly, add the core function simplification pipeline nested inside the
+ // CGSCC walk.
+ MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
+ buildFunctionSimplificationPipeline(Level, Phase, DebugLogging)));
+
+ for (auto &C : CGSCCOptimizerLateEPCallbacks)
+ C(MainCGPipeline, Level);
+
+ return MIWP;
+}
+
+ModulePassManager PassBuilder::buildModuleSimplificationPipeline(
+ OptimizationLevel Level, ThinLTOPhase Phase, bool DebugLogging) {
ModulePassManager MPM(DebugLogging);
bool HasSampleProfile = PGOOpt && (PGOOpt->Action == PGOOptions::SampleUse);
@@ -712,7 +933,9 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
EarlyFPM.addPass(SROA());
EarlyFPM.addPass(EarlyCSEPass());
EarlyFPM.addPass(LowerExpectIntrinsicPass());
- if (Level == O3)
+ if (PTO.Coroutines)
+ EarlyFPM.addPass(CoroEarlyPass());
+ if (Level == OptimizationLevel::O3)
EarlyFPM.addPass(CallSiteSplittingPass());
// In SamplePGO ThinLTO backend, we need instcombine before profile annotation
@@ -745,6 +968,15 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
true /* SamplePGO */));
}
+ if (AttributorRun & AttributorRunOption::MODULE)
+ MPM.addPass(AttributorPass());
+
+ // Lower type metadata and the type.test intrinsic in the ThinLTO
+ // post link pipeline after ICP. This is to enable usage of the type
+ // tests in ICP sequences.
+ if (Phase == ThinLTOPhase::PostLink)
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
+
// Interprocedural constant propagation now that basic cleanup has occurred
// and prior to optimizing globals.
// FIXME: This position in the pipeline hasn't been carefully considered in
@@ -765,7 +997,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
// constants.
MPM.addPass(createModuleToFunctionPassAdaptor(PromotePass()));
- // Remove any dead arguments exposed by cleanups and constand folding
+ // Remove any dead arguments exposed by cleanups and constant folding
// globals.
MPM.addPass(DeadArgumentEliminationPass());
@@ -796,61 +1028,7 @@ PassBuilder::buildModuleSimplificationPipeline(OptimizationLevel Level,
if (EnableSyntheticCounts && !PGOOpt)
MPM.addPass(SyntheticCountsPropagation());
- // Require the GlobalsAA analysis for the module so we can query it within
- // the CGSCC pipeline.
- MPM.addPass(RequireAnalysisPass<GlobalsAA, Module>());
-
- // Require the ProfileSummaryAnalysis for the module so we can query it within
- // the inliner pass.
- MPM.addPass(RequireAnalysisPass<ProfileSummaryAnalysis, Module>());
-
- // Now begin the main postorder CGSCC pipeline.
- // FIXME: The current CGSCC pipeline has its origins in the legacy pass
- // manager and trying to emulate its precise behavior. Much of this doesn't
- // make a lot of sense and we should revisit the core CGSCC structure.
- CGSCCPassManager MainCGPipeline(DebugLogging);
-
- // Note: historically, the PruneEH pass was run first to deduce nounwind and
- // generally clean up exception handling overhead. It isn't clear this is
- // valuable as the inliner doesn't currently care whether it is inlining an
- // invoke or a call.
-
- // Run the inliner first. The theory is that we are walking bottom-up and so
- // the callees have already been fully optimized, and we want to inline them
- // into the callers so that our optimizations can reflect that.
- // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
- // because it makes profile annotation in the backend inaccurate.
- InlineParams IP = getInlineParamsFromOptLevel(Level);
- if (Phase == ThinLTOPhase::PreLink && PGOOpt &&
- PGOOpt->Action == PGOOptions::SampleUse)
- IP.HotCallSiteThreshold = 0;
- MainCGPipeline.addPass(InlinerPass(IP));
-
- // Now deduce any function attributes based in the current code.
- MainCGPipeline.addPass(PostOrderFunctionAttrsPass());
-
- // When at O3 add argument promotion to the pass pipeline.
- // FIXME: It isn't at all clear why this should be limited to O3.
- if (Level == O3)
- MainCGPipeline.addPass(ArgumentPromotionPass());
-
- // Lastly, add the core function simplification pipeline nested inside the
- // CGSCC walk.
- MainCGPipeline.addPass(createCGSCCToFunctionPassAdaptor(
- buildFunctionSimplificationPipeline(Level, Phase, DebugLogging)));
-
- for (auto &C : CGSCCOptimizerLateEPCallbacks)
- C(MainCGPipeline, Level);
-
- // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
- // to detect when we devirtualize indirect calls and iterate the SCC passes
- // in that case to try and catch knock-on inlining or function attrs
- // opportunities. Then we add it to the module pipeline by walking the SCCs
- // in postorder (or bottom-up).
- MPM.addPass(
- createModuleToPostOrderCGSCCPassAdaptor(createDevirtSCCRepeatedPass(
- std::move(MainCGPipeline), MaxDevirtIterations)));
-
+ MPM.addPass(buildInlinerPipeline(Level, Phase, DebugLogging));
return MPM;
}
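A sketch of how a client might build and run one of these default pipelines end to end, assuming the usual PassBuilder registration sequence and default arguments (exact signatures can vary between LLVM versions):

  #include "llvm/IR/Module.h"
  #include "llvm/IR/PassManager.h"
  #include "llvm/Passes/PassBuilder.h"

  using namespace llvm;

  // Runs the default -O1 module pipeline over M.
  static void runO1Pipeline(Module &M) {
    PipelineTuningOptions PTO;
    PTO.Coroutines = true; // opt into the coroutine passes wired up in this patch
    PassBuilder PB(/*TM=*/nullptr, PTO);

    LoopAnalysisManager LAM;
    FunctionAnalysisManager FAM;
    CGSCCAnalysisManager CGAM;
    ModuleAnalysisManager MAM;
    PB.registerModuleAnalyses(MAM);
    PB.registerCGSCCAnalyses(CGAM);
    PB.registerFunctionAnalyses(FAM);
    PB.registerLoopAnalyses(LAM);
    PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

    ModulePassManager MPM =
        PB.buildPerModuleDefaultPipeline(PassBuilder::OptimizationLevel::O1);
    MPM.run(M, MAM);
  }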
@@ -935,6 +1113,10 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
// llvm.loop.distribute=true or when -enable-loop-distribute is specified.
OptimizePM.addPass(LoopDistributePass());
+ // Populates the VFABI attribute with the scalar-to-vector mappings
+ // from the TargetLibraryInfo.
+ OptimizePM.addPass(InjectTLIMappings());
+
// Now run the core loop vectorizer.
OptimizePM.addPass(LoopVectorizePass(
LoopVectorizeOptions(!PTO.LoopInterleaving, !PTO.LoopVectorization)));
@@ -965,6 +1147,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
if (PTO.SLPVectorization)
OptimizePM.addPass(SLPVectorizerPass());
+ // Enhance/cleanup vector code.
+ OptimizePM.addPass(VectorCombinePass());
OptimizePM.addPass(InstCombinePass());
// Unroll small loops to hide loop backedge latency and saturate any parallel
@@ -975,11 +1159,11 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
// across the loop nests.
// We do UnrollAndJam in a separate LPM to ensure it happens before unroll
if (EnableUnrollAndJam && PTO.LoopUnrolling) {
- OptimizePM.addPass(LoopUnrollAndJamPass(Level));
+ OptimizePM.addPass(LoopUnrollAndJamPass(Level.getSpeedupLevel()));
}
- OptimizePM.addPass(LoopUnrollPass(
- LoopUnrollOptions(Level, /*OnlyWhenForced=*/!PTO.LoopUnrolling,
- PTO.ForgetAllSCEVInLoopUnroll)));
+ OptimizePM.addPass(LoopUnrollPass(LoopUnrollOptions(
+ Level.getSpeedupLevel(), /*OnlyWhenForced=*/!PTO.LoopUnrolling,
+ PTO.ForgetAllSCEVInLoopUnroll)));
OptimizePM.addPass(WarnMissedTransformationsPass());
OptimizePM.addPass(InstCombinePass());
OptimizePM.addPass(RequireAnalysisPass<OptimizationRemarkEmitterAnalysis, Function>());
@@ -1020,13 +1204,17 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
// inserting redundancies into the program. This even includes SimplifyCFG.
OptimizePM.addPass(SpeculateAroundPHIsPass());
- for (auto &C : OptimizerLastEPCallbacks)
- C(OptimizePM, Level);
+ if (PTO.Coroutines)
+ OptimizePM.addPass(CoroCleanupPass());
// Add the core optimizing pipeline.
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(OptimizePM)));
- MPM.addPass(CGProfilePass());
+ for (auto &C : OptimizerLastEPCallbacks)
+ C(MPM, Level);
+
+ if (PTO.CallGraphProfile)
+ MPM.addPass(CGProfilePass());
// Now we need to do some global optimization transforms.
// FIXME: It would seem like these should come first in the optimization
@@ -1041,7 +1229,8 @@ ModulePassManager PassBuilder::buildModuleOptimizationPipeline(
ModulePassManager
PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
bool DebugLogging, bool LTOPreLink) {
- assert(Level != O0 && "Must request optimizations for the default pipeline!");
+ assert(Level != OptimizationLevel::O0 &&
+ "Must request optimizations for the default pipeline!");
ModulePassManager MPM(DebugLogging);
@@ -1068,7 +1257,8 @@ PassBuilder::buildPerModuleDefaultPipeline(OptimizationLevel Level,
ModulePassManager
PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
bool DebugLogging) {
- assert(Level != O0 && "Must request optimizations for the default pipeline!");
+ assert(Level != OptimizationLevel::O0 &&
+ "Must request optimizations for the default pipeline!");
ModulePassManager MPM(DebugLogging);
@@ -1101,6 +1291,12 @@ PassBuilder::buildThinLTOPreLinkDefaultPipeline(OptimizationLevel Level,
// Reduce the size of the IR as much as possible.
MPM.addPass(GlobalOptPass());
+ // Module simplification splits coroutines, but does not fully clean up
+ // coroutine intrinsics. To ensure ThinLTO optimization passes don't trip up
+ // on these, we schedule the cleanup here.
+ if (PTO.Coroutines)
+ MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
+
return MPM;
}
@@ -1129,7 +1325,7 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
MPM.addPass(LowerTypeTestsPass(nullptr, ImportSummary));
}
- if (Level == O0)
+ if (Level == OptimizationLevel::O0)
return MPM;
// Force any function attributes we want the rest of the pipeline to observe.
@@ -1148,10 +1344,11 @@ ModulePassManager PassBuilder::buildThinLTODefaultPipeline(
ModulePassManager
PassBuilder::buildLTOPreLinkDefaultPipeline(OptimizationLevel Level,
bool DebugLogging) {
- assert(Level != O0 && "Must request optimizations for the default pipeline!");
+ assert(Level != OptimizationLevel::O0 &&
+ "Must request optimizations for the default pipeline!");
// FIXME: We should use a customized pre-link pipeline!
return buildPerModuleDefaultPipeline(Level, DebugLogging,
- /* LTOPreLink */true);
+ /* LTOPreLink */ true);
}
ModulePassManager
@@ -1159,11 +1356,14 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
ModuleSummaryIndex *ExportSummary) {
ModulePassManager MPM(DebugLogging);
- if (Level == O0) {
+ if (Level == OptimizationLevel::O0) {
// The WPD and LowerTypeTest passes need to run at -O0 to lower type
// metadata and intrinsics.
MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP.
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
return MPM;
}
@@ -1188,7 +1388,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
// libraries and other oracles.
MPM.addPass(InferFunctionAttrsPass());
- if (Level > 1) {
+ if (Level.getSpeedupLevel() > 1) {
FunctionPassManager EarlyFPM(DebugLogging);
EarlyFPM.addPass(CallSiteSplittingPass());
MPM.addPass(createModuleToFunctionPassAdaptor(std::move(EarlyFPM)));
@@ -1202,11 +1402,11 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
// Propagate constants at call sites into the functions they call. This
// opens opportunities for globalopt (and inlining) by substituting function
// pointers passed as arguments to direct uses of functions.
- MPM.addPass(IPSCCPPass());
+ MPM.addPass(IPSCCPPass());
- // Attach metadata to indirect call sites indicating the set of functions
- // they may target at run-time. This should follow IPSCCP.
- MPM.addPass(CalledValuePropagationPass());
+ // Attach metadata to indirect call sites indicating the set of functions
+ // they may target at run-time. This should follow IPSCCP.
+ MPM.addPass(CalledValuePropagationPass());
}
// Now deduce any function attributes based in the current code.
@@ -1226,10 +1426,14 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
MPM.addPass(WholeProgramDevirtPass(ExportSummary, nullptr));
// Stop here at -O1.
- if (Level == 1) {
+ if (Level == OptimizationLevel::O1) {
// The LowerTypeTestsPass needs to run to lower type metadata and the
// type.test intrinsics. The pass does nothing if CFI is disabled.
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO
+ // pipeline).
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
return MPM;
}
@@ -1251,7 +1455,7 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
// function pointers. When this happens, we often have to resolve varargs
// calls, etc, so let instcombine do this.
FunctionPassManager PeepholeFPM(DebugLogging);
- if (Level == O3)
+ if (Level == OptimizationLevel::O3)
PeepholeFPM.addPass(AggressiveInstCombinePass());
PeepholeFPM.addPass(InstCombinePass());
invokePeepholeEPCallbacks(PeepholeFPM, Level);
@@ -1263,8 +1467,8 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
// valuable as the inliner doesn't currently care whether it is inlining an
// invoke or a call.
// Run the inliner now.
- MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
- InlinerPass(getInlineParamsFromOptLevel(Level))));
+ MPM.addPass(ModuleInlinerWrapperPass(getInlineParamsFromOptLevel(Level),
+ DebugLogging));
// Optimize globals again after we ran the inliner.
MPM.addPass(GlobalOptPass());
@@ -1357,6 +1561,9 @@ PassBuilder::buildLTODefaultPipeline(OptimizationLevel Level, bool DebugLogging,
// to be run at link time if CFI is enabled. This pass does nothing if
// CFI is disabled.
MPM.addPass(LowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO pipeline).
+ MPM.addPass(LowerTypeTestsPass(nullptr, nullptr, true));
// Enable splitting late in the FullLTO post-link pipeline. This is done in
// the same stage in the old pass manager (\ref addLateLTOOptimizationPasses).
@@ -1635,6 +1842,49 @@ Expected<bool> parseMergedLoadStoreMotionOptions(StringRef Params) {
}
return Result;
}
+
+Expected<GVNOptions> parseGVNOptions(StringRef Params) {
+ GVNOptions Result;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ bool Enable = !ParamName.consume_front("no-");
+ if (ParamName == "pre") {
+ Result.setPRE(Enable);
+ } else if (ParamName == "load-pre") {
+ Result.setLoadPRE(Enable);
+ } else if (ParamName == "memdep") {
+ Result.setMemDep(Enable);
+ } else {
+ return make_error<StringError>(
+ formatv("invalid GVN pass parameter '{0}' ", ParamName).str(),
+ inconvertibleErrorCode());
+ }
+ }
+ return Result;
+}
+
+Expected<StackLifetime::LivenessType>
+parseStackLifetimeOptions(StringRef Params) {
+ StackLifetime::LivenessType Result = StackLifetime::LivenessType::May;
+ while (!Params.empty()) {
+ StringRef ParamName;
+ std::tie(ParamName, Params) = Params.split(';');
+
+ if (ParamName == "may") {
+ Result = StackLifetime::LivenessType::May;
+ } else if (ParamName == "must") {
+ Result = StackLifetime::LivenessType::Must;
+ } else {
+ return make_error<StringError>(
+ formatv("invalid StackLifetime parameter '{0}' ", ParamName).str(),
+ inconvertibleErrorCode());
+ }
+ }
+ return Result;
+}
+
} // namespace
/// Tests whether a pass name starts with a valid prefix for a default pipeline
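For illustration, the two parsers above share one pattern: split the parameter string on ';', treat an optional "no-" prefix as a disable flag, and reject unknown names with an error. A minimal standalone sketch of that pattern, assuming a hypothetical GVNOpts struct and using std::string_view in place of llvm::StringRef (not part of the patch):

#include <iostream>
#include <optional>
#include <string_view>

struct GVNOpts {
  bool PRE = true, LoadPRE = true, MemDep = true;
};

// Parse a ';'-separated list such as "pre;no-load-pre;memdep".
// Returns std::nullopt on an unrecognized parameter name.
std::optional<GVNOpts> parseOpts(std::string_view Params) {
  GVNOpts Result;
  while (!Params.empty()) {
    size_t Split = Params.find(';');
    std::string_view Name = Params.substr(0, Split);
    Params = Split == std::string_view::npos ? std::string_view()
                                             : Params.substr(Split + 1);

    bool Enable = true;
    if (Name.substr(0, 3) == "no-") { // optional "no-" prefix disables
      Enable = false;
      Name.remove_prefix(3);
    }
    if (Name == "pre")
      Result.PRE = Enable;
    else if (Name == "load-pre")
      Result.LoadPRE = Enable;
    else if (Name == "memdep")
      Result.MemDep = Enable;
    else
      return std::nullopt; // unknown parameter
  }
  return Result;
}

int main() {
  if (auto O = parseOpts("pre;no-load-pre"))
    std::cout << O->PRE << ' ' << O->LoadPRE << ' ' << O->MemDep << '\n'; // 1 0 1
}

In the patch these parsers back the new gvn<...> and print<stack-lifetime><...> entries in PassRegistry.def, so a pipeline parameter string along the lines of gvn<no-load-pre> toggles the corresponding GVNOptions bit, with unknown names surfaced as a StringError.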
@@ -1887,13 +2137,13 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
assert(Matches.size() == 3 && "Must capture two matched strings!");
OptimizationLevel L = StringSwitch<OptimizationLevel>(Matches[2])
- .Case("O0", O0)
- .Case("O1", O1)
- .Case("O2", O2)
- .Case("O3", O3)
- .Case("Os", Os)
- .Case("Oz", Oz);
- if (L == O0) {
+ .Case("O0", OptimizationLevel::O0)
+ .Case("O1", OptimizationLevel::O1)
+ .Case("O2", OptimizationLevel::O2)
+ .Case("O3", OptimizationLevel::O3)
+ .Case("Os", OptimizationLevel::Os)
+ .Case("Oz", OptimizationLevel::Oz);
+ if (L == OptimizationLevel::O0) {
// Add instrumentation PGO passes -- at O0 we can still do PGO.
if (PGOOpt && Matches[1] != "thinlto" &&
(PGOOpt->Action == PGOOptions::IRInstr ||
@@ -1903,6 +2153,20 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
/* RunProfileGen */ (PGOOpt->Action == PGOOptions::IRInstr),
/* IsCS */ false, PGOOpt->ProfileFile,
PGOOpt->ProfileRemappingFile);
+
+ // For IR that makes use of coroutine intrinsics, the coroutine passes must
+ // be run, even at -O0.
+ if (PTO.Coroutines) {
+ MPM.addPass(createModuleToFunctionPassAdaptor(CoroEarlyPass()));
+
+ CGSCCPassManager CGPM(DebugLogging);
+ CGPM.addPass(CoroSplitPass());
+ CGPM.addPass(createCGSCCToFunctionPassAdaptor(CoroElidePass()));
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
+
+ MPM.addPass(createModuleToFunctionPassAdaptor(CoroCleanupPass()));
+ }
+
// Do nothing else at all!
return Error::success();
}
@@ -1910,8 +2174,10 @@ Error PassBuilder::parseModulePass(ModulePassManager &MPM,
// This is consistent with old pass manager invoked via opt, but
// inconsistent with clang. Clang doesn't enable loop vectorization
// but does enable slp vectorization at Oz.
- PTO.LoopVectorization = L > O1 && L < Oz;
- PTO.SLPVectorization = L > O1 && L < Oz;
+ PTO.LoopVectorization =
+ L.getSpeedupLevel() > 1 && L != OptimizationLevel::Oz;
+ PTO.SLPVectorization =
+ L.getSpeedupLevel() > 1 && L != OptimizationLevel::Oz;
if (Matches[1] == "default") {
MPM.addPass(buildPerModuleDefaultPipeline(L, DebugLogging));
@@ -2408,3 +2674,28 @@ Error PassBuilder::parseAAPipeline(AAManager &AA, StringRef PipelineText) {
return Error::success();
}
+
+bool PassBuilder::isAAPassName(StringRef PassName) {
+#define FUNCTION_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
+ if (PassName == NAME) \
+ return true;
+#include "PassRegistry.def"
+ return false;
+}
+
+bool PassBuilder::isAnalysisPassName(StringRef PassName) {
+#define MODULE_ANALYSIS(NAME, CREATE_PASS) \
+ if (PassName == NAME) \
+ return true;
+#define FUNCTION_ANALYSIS(NAME, CREATE_PASS) \
+ if (PassName == NAME) \
+ return true;
+#define LOOP_ANALYSIS(NAME, CREATE_PASS) \
+ if (PassName == NAME) \
+ return true;
+#define CGSCC_ANALYSIS(NAME, CREATE_PASS) \
+ if (PassName == NAME) \
+ return true;
+#include "PassRegistry.def"
+ return false;
+}
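For illustration, isAAPassName and isAnalysisPassName rely on the usual PassRegistry.def X-macro trick: define only the macros you care about so each entry expands into a name comparison, then include the .def file. A self-contained illustration of that pattern with a tiny inline registry (the registry contents and macro names here are made up for the sketch):

#include <cassert>
#include <string_view>

// Normally this list lives in a separate .def file that is #included many
// times under different macro definitions; it is inlined here for brevity.
#define MY_PASS_REGISTRY                                                       \
  MODULE_ANALYSIS_X("no-op-module")                                            \
  FUNCTION_ANALYSIS_X("domtree")                                               \
  FUNCTION_ANALYSIS_X("loops")

bool isAnalysisPassName(std::string_view PassName) {
#define MODULE_ANALYSIS_X(NAME)                                                \
  if (PassName == NAME)                                                        \
    return true;
#define FUNCTION_ANALYSIS_X(NAME)                                              \
  if (PassName == NAME)                                                        \
    return true;
  MY_PASS_REGISTRY
#undef MODULE_ANALYSIS_X
#undef FUNCTION_ANALYSIS_X
  return false;
}

int main() {
  assert(isAnalysisPassName("domtree"));
  assert(!isAnalysisPassName("instcombine"));
}

Because PassRegistry.def supplies #ifndef defaults and #undefs its macros at the end, each helper only has to define the handful of macros whose entries it wants to expand.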
diff --git a/llvm/lib/Passes/PassRegistry.def b/llvm/lib/Passes/PassRegistry.def
index 355dd6f96812..dfdfc3d05976 100644
--- a/llvm/lib/Passes/PassRegistry.def
+++ b/llvm/lib/Passes/PassRegistry.def
@@ -27,6 +27,7 @@ MODULE_ANALYSIS("stack-safety", StackSafetyGlobalAnalysis())
MODULE_ANALYSIS("verify", VerifierAnalysis())
MODULE_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
MODULE_ANALYSIS("asan-globals-md", ASanGlobalsMetadataAnalysis())
+MODULE_ANALYSIS("inline-advisor", InlineAdvisorAnalysis())
#ifndef MODULE_ALIAS_ANALYSIS
#define MODULE_ALIAS_ANALYSIS(NAME, CREATE_PASS) \
@@ -57,6 +58,7 @@ MODULE_PASS("hotcoldsplit", HotColdSplittingPass())
MODULE_PASS("hwasan", HWAddressSanitizerPass(false, false))
MODULE_PASS("khwasan", HWAddressSanitizerPass(true, true))
MODULE_PASS("inferattrs", InferFunctionAttrsPass())
+MODULE_PASS("inliner-wrapper", ModuleInlinerWrapperPass())
MODULE_PASS("insert-gcov-profiling", GCOVProfilerPass())
MODULE_PASS("instrorderfile", InstrOrderFilePass())
MODULE_PASS("instrprof", InstrProfiling())
@@ -71,7 +73,6 @@ MODULE_PASS("partial-inliner", PartialInlinerPass())
MODULE_PASS("pgo-icall-prom", PGOIndirectCallPromotion())
MODULE_PASS("pgo-instr-gen", PGOInstrumentationGen())
MODULE_PASS("pgo-instr-use", PGOInstrumentationUse())
-MODULE_PASS("pre-isel-intrinsic-lowering", PreISelIntrinsicLoweringPass())
MODULE_PASS("print-profile-summary", ProfileSummaryPrinterPass(dbgs()))
MODULE_PASS("print-callgraph", CallGraphPrinterPass(dbgs()))
MODULE_PASS("print", PrintModulePass(dbgs()))
@@ -82,6 +83,8 @@ MODULE_PASS("rewrite-statepoints-for-gc", RewriteStatepointsForGC())
MODULE_PASS("rewrite-symbols", RewriteSymbolPass())
MODULE_PASS("rpo-functionattrs", ReversePostOrderFunctionAttrsPass())
MODULE_PASS("sample-profile", SampleProfileLoaderPass())
+MODULE_PASS("scc-oz-module-inliner",
+ buildInlinerPipeline(OptimizationLevel::Oz, ThinLTOPhase::None, DebugLogging))
MODULE_PASS("strip-dead-prototypes", StripDeadPrototypesPass())
MODULE_PASS("synthetic-counts-propagation", SyntheticCountsPropagation())
MODULE_PASS("wholeprogramdevirt", WholeProgramDevirtPass(nullptr, nullptr))
@@ -108,7 +111,10 @@ CGSCC_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
CGSCC_PASS("argpromotion", ArgumentPromotionPass())
CGSCC_PASS("invalidate<all>", InvalidateAllAnalysesPass())
CGSCC_PASS("function-attrs", PostOrderFunctionAttrsPass())
+CGSCC_PASS("attributor-cgscc", AttributorCGSCCPass())
CGSCC_PASS("inline", InlinerPass())
+CGSCC_PASS("openmpopt", OpenMPOptPass())
+CGSCC_PASS("coro-split", CoroSplitPass())
CGSCC_PASS("no-op-cgscc", NoOpCGSCCPass())
#undef CGSCC_PASS
@@ -126,6 +132,8 @@ FUNCTION_ANALYSIS("domfrontier", DominanceFrontierAnalysis())
FUNCTION_ANALYSIS("loops", LoopAnalysis())
FUNCTION_ANALYSIS("lazy-value-info", LazyValueAnalysis())
FUNCTION_ANALYSIS("da", DependenceAnalysis())
+FUNCTION_ANALYSIS("inliner-features", InlineFeaturesAnalysis())
+FUNCTION_ANALYSIS("inliner-size-estimator", InlineSizeEstimatorAnalysis())
FUNCTION_ANALYSIS("memdep", MemoryDependenceAnalysis())
FUNCTION_ANALYSIS("memoryssa", MemorySSAAnalysis())
FUNCTION_ANALYSIS("phi-values", PhiValuesAnalysis())
@@ -160,6 +168,8 @@ FUNCTION_PASS("aa-eval", AAEvaluator())
FUNCTION_PASS("adce", ADCEPass())
FUNCTION_PASS("add-discriminators", AddDiscriminatorsPass())
FUNCTION_PASS("aggressive-instcombine", AggressiveInstCombinePass())
+FUNCTION_PASS("assume-builder", AssumeBuilderPass())
+FUNCTION_PASS("assume-simplify", AssumeSimplifyPass())
FUNCTION_PASS("alignment-from-assumptions", AlignmentFromAssumptionsPass())
FUNCTION_PASS("bdce", BDCEPass())
FUNCTION_PASS("bounds-checking", BoundsCheckingPass())
@@ -167,6 +177,9 @@ FUNCTION_PASS("break-crit-edges", BreakCriticalEdgesPass())
FUNCTION_PASS("callsite-splitting", CallSiteSplittingPass())
FUNCTION_PASS("consthoist", ConstantHoistingPass())
FUNCTION_PASS("chr", ControlHeightReductionPass())
+FUNCTION_PASS("coro-early", CoroEarlyPass())
+FUNCTION_PASS("coro-elide", CoroElidePass())
+FUNCTION_PASS("coro-cleanup", CoroCleanupPass())
FUNCTION_PASS("correlated-propagation", CorrelatedValuePropagationPass())
FUNCTION_PASS("dce", DCEPass())
FUNCTION_PASS("div-rem-pairs", DivRemPairsPass())
@@ -182,6 +195,7 @@ FUNCTION_PASS("gvn-hoist", GVNHoistPass())
FUNCTION_PASS("instcombine", InstCombinePass())
FUNCTION_PASS("instsimplify", InstSimplifyPass())
FUNCTION_PASS("invalidate<all>", InvalidateAllAnalysesPass())
+FUNCTION_PASS("irce", IRCEPass())
FUNCTION_PASS("float2int", Float2IntPass())
FUNCTION_PASS("no-op-function", NoOpFunctionPass())
FUNCTION_PASS("libcalls-shrinkwrap", LibCallsShrinkWrapPass())
@@ -193,10 +207,10 @@ FUNCTION_PASS("lower-constant-intrinsics", LowerConstantIntrinsicsPass())
FUNCTION_PASS("lower-matrix-intrinsics", LowerMatrixIntrinsicsPass())
FUNCTION_PASS("lower-widenable-condition", LowerWidenableConditionPass())
FUNCTION_PASS("guard-widening", GuardWideningPass())
-FUNCTION_PASS("gvn", GVN())
FUNCTION_PASS("load-store-vectorizer", LoadStoreVectorizerPass())
FUNCTION_PASS("loop-simplify", LoopSimplifyPass())
FUNCTION_PASS("loop-sink", LoopSinkPass())
+FUNCTION_PASS("loop-unroll-and-jam", LoopUnrollAndJamPass())
FUNCTION_PASS("lowerinvoke", LowerInvokePass())
FUNCTION_PASS("mem2reg", PromotePass())
FUNCTION_PASS("memcpyopt", MemCpyOptPass())
@@ -208,7 +222,7 @@ FUNCTION_PASS("partially-inline-libcalls", PartiallyInlineLibCallsPass())
FUNCTION_PASS("lcssa", LCSSAPass())
FUNCTION_PASS("loop-data-prefetch", LoopDataPrefetchPass())
FUNCTION_PASS("loop-load-elim", LoopLoadEliminationPass())
-FUNCTION_PASS("loop-fuse", LoopFusePass())
+FUNCTION_PASS("loop-fusion", LoopFusePass())
FUNCTION_PASS("loop-distribute", LoopDistributePass())
FUNCTION_PASS("pgo-memop-opt", PGOMemOPSizeOpt())
FUNCTION_PASS("print", PrintFunctionPass(dbgs()))
@@ -220,23 +234,25 @@ FUNCTION_PASS("print<domtree>", DominatorTreePrinterPass(dbgs()))
FUNCTION_PASS("print<postdomtree>", PostDominatorTreePrinterPass(dbgs()))
FUNCTION_PASS("print<demanded-bits>", DemandedBitsPrinterPass(dbgs()))
FUNCTION_PASS("print<domfrontier>", DominanceFrontierPrinterPass(dbgs()))
+FUNCTION_PASS("print<inline-cost>", InlineCostAnnotationPrinterPass(dbgs()))
FUNCTION_PASS("print<loops>", LoopPrinterPass(dbgs()))
FUNCTION_PASS("print<memoryssa>", MemorySSAPrinterPass(dbgs()))
FUNCTION_PASS("print<phi-values>", PhiValuesPrinterPass(dbgs()))
FUNCTION_PASS("print<regions>", RegionInfoPrinterPass(dbgs()))
FUNCTION_PASS("print<scalar-evolution>", ScalarEvolutionPrinterPass(dbgs()))
FUNCTION_PASS("print<stack-safety-local>", StackSafetyPrinterPass(dbgs()))
+FUNCTION_PASS("print-predicateinfo", PredicateInfoPrinterPass(dbgs()))
FUNCTION_PASS("reassociate", ReassociatePass())
FUNCTION_PASS("scalarizer", ScalarizerPass())
FUNCTION_PASS("sccp", SCCPPass())
+FUNCTION_PASS("simplifycfg", SimplifyCFGPass())
FUNCTION_PASS("sink", SinkingPass())
FUNCTION_PASS("slp-vectorizer", SLPVectorizerPass())
FUNCTION_PASS("speculative-execution", SpeculativeExecutionPass())
FUNCTION_PASS("spec-phis", SpeculateAroundPHIsPass())
FUNCTION_PASS("sroa", SROA())
FUNCTION_PASS("tailcallelim", TailCallElimPass())
-FUNCTION_PASS("unreachableblockelim", UnreachableBlockElimPass())
-FUNCTION_PASS("unroll-and-jam", LoopUnrollAndJamPass())
+FUNCTION_PASS("vector-combine", VectorCombinePass())
FUNCTION_PASS("verify", VerifierPass())
FUNCTION_PASS("verify<domtree>", DominatorTreeVerifierPass())
FUNCTION_PASS("verify<loops>", LoopVerifierPass())
@@ -257,7 +273,7 @@ FUNCTION_PASS("tsan", ThreadSanitizerPass())
#ifndef FUNCTION_PASS_WITH_PARAMS
#define FUNCTION_PASS_WITH_PARAMS(NAME, CREATE_PASS, PARSER)
#endif
-FUNCTION_PASS_WITH_PARAMS("unroll",
+FUNCTION_PASS_WITH_PARAMS("loop-unroll",
[](LoopUnrollOptions Opts) {
return LoopUnrollPass(Opts);
},
@@ -282,6 +298,16 @@ FUNCTION_PASS_WITH_PARAMS("mldst-motion",
return MergedLoadStoreMotionPass(Opts);
},
parseMergedLoadStoreMotionOptions)
+FUNCTION_PASS_WITH_PARAMS("gvn",
+ [](GVNOptions Opts) {
+ return GVN(Opts);
+ },
+ parseGVNOptions)
+FUNCTION_PASS_WITH_PARAMS("print<stack-lifetime>",
+ [](StackLifetime::LivenessType Type) {
+ return StackLifetimePrinterPass(dbgs(), Type);
+ },
+ parseStackLifetimeOptions)
#undef FUNCTION_PASS_WITH_PARAMS
#ifndef LOOP_ANALYSIS
@@ -297,6 +323,7 @@ LOOP_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC))
#ifndef LOOP_PASS
#define LOOP_PASS(NAME, CREATE_PASS)
#endif
+LOOP_PASS("canon-freeze", CanonicalizeFreezeInLoopsPass())
LOOP_PASS("invalidate<all>", InvalidateAllAnalysesPass())
LOOP_PASS("licm", LICMPass())
LOOP_PASS("loop-idiom", LoopIdiomRecognizePass())
@@ -306,16 +333,17 @@ LOOP_PASS("no-op-loop", NoOpLoopPass())
LOOP_PASS("print", PrintLoopPass(dbgs()))
LOOP_PASS("loop-deletion", LoopDeletionPass())
LOOP_PASS("simplify-cfg", LoopSimplifyCFGPass())
-LOOP_PASS("strength-reduce", LoopStrengthReducePass())
+LOOP_PASS("loop-reduce", LoopStrengthReducePass())
LOOP_PASS("indvars", IndVarSimplifyPass())
-LOOP_PASS("irce", IRCEPass())
-LOOP_PASS("unroll-full", LoopFullUnrollPass())
+LOOP_PASS("loop-unroll-full", LoopFullUnrollPass())
LOOP_PASS("print-access-info", LoopAccessInfoPrinterPass(dbgs()))
LOOP_PASS("print<ddg>", DDGAnalysisPrinterPass(dbgs()))
LOOP_PASS("print<ivusers>", IVUsersPrinterPass(dbgs()))
+LOOP_PASS("print<loopnest>", LoopNestPrinterPass(dbgs()))
LOOP_PASS("print<loop-cache-cost>", LoopCachePrinterPass(dbgs()))
LOOP_PASS("loop-predication", LoopPredicationPass())
LOOP_PASS("guard-widening", GuardWideningPass())
+LOOP_PASS("simple-loop-unswitch", SimpleLoopUnswitchPass())
#undef LOOP_PASS
#ifndef LOOP_PASS_WITH_PARAMS
diff --git a/llvm/lib/Passes/StandardInstrumentations.cpp b/llvm/lib/Passes/StandardInstrumentations.cpp
index 5cf0ca8e28f6..1e1a6b98a65a 100644
--- a/llvm/lib/Passes/StandardInstrumentations.cpp
+++ b/llvm/lib/Passes/StandardInstrumentations.cpp
@@ -70,16 +70,24 @@ Optional<std::pair<const Module *, std::string>> unwrapModule(Any IR) {
llvm_unreachable("Unknown IR unit");
}
-void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef()) {
- dbgs() << Banner << Extra << "\n";
- M->print(dbgs(), nullptr, false);
-}
void printIR(const Function *F, StringRef Banner,
StringRef Extra = StringRef()) {
if (!llvm::isFunctionInPrintList(F->getName()))
return;
dbgs() << Banner << Extra << "\n" << static_cast<const Value &>(*F);
}
+
+void printIR(const Module *M, StringRef Banner, StringRef Extra = StringRef()) {
+ if (llvm::isFunctionInPrintList("*") || llvm::forcePrintModuleIR()) {
+ dbgs() << Banner << Extra << "\n";
+ M->print(dbgs(), nullptr, false);
+ } else {
+ for (const auto &F : M->functions()) {
+ printIR(&F, Banner, Extra);
+ }
+ }
+}
+
void printIR(const LazyCallGraph::SCC *C, StringRef Banner,
StringRef Extra = StringRef()) {
bool BannerPrinted = false;
@@ -98,7 +106,7 @@ void printIR(const Loop *L, StringRef Banner) {
const Function *F = L->getHeader()->getParent();
if (!llvm::isFunctionInPrintList(F->getName()))
return;
- llvm::printLoop(const_cast<Loop &>(*L), dbgs(), Banner);
+ llvm::printLoop(const_cast<Loop &>(*L), dbgs(), std::string(Banner));
}
/// Generic IR-printing helper that unpacks a pointer to IRUnit wrapped into
@@ -127,7 +135,7 @@ void unwrapAndPrint(Any IR, StringRef Banner, bool ForceModule = false) {
if (any_isa<const LazyCallGraph::SCC *>(IR)) {
const LazyCallGraph::SCC *C = any_cast<const LazyCallGraph::SCC *>(IR);
assert(C && "scc should be valid for printing");
- std::string Extra = formatv(" (scc: {0})", C->getName());
+ std::string Extra = std::string(formatv(" (scc: {0})", C->getName()));
printIR(C, Banner, Extra);
return;
}
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
index 8d5e56e26c0f..70f00d333db1 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMapping.cpp
@@ -222,7 +222,8 @@ Error CoverageMapping::loadFunctionRecord(
Record.FunctionHash, Counts)) {
instrprof_error IPE = InstrProfError::take(std::move(E));
if (IPE == instrprof_error::hash_mismatch) {
- FuncHashMismatches.emplace_back(Record.FunctionName, Record.FunctionHash);
+ FuncHashMismatches.emplace_back(std::string(Record.FunctionName),
+ Record.FunctionHash);
return Error::success();
} else if (IPE != instrprof_error::unknown_function)
return make_error<InstrProfError>(IPE);
@@ -804,6 +805,8 @@ static std::string getCoverageMapErrString(coveragemap_error Err) {
return "Truncated coverage data";
case coveragemap_error::malformed:
return "Malformed coverage data";
+ case coveragemap_error::decompression_failed:
+ return "Failed to decompress coverage data (zlib)";
}
llvm_unreachable("A value of coveragemap_error has no message.");
}
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 679ff3525eeb..b75738bc360c 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
#include "llvm/Object/Binary.h"
@@ -25,6 +26,7 @@
#include "llvm/Object/COFF.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/Compression.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Error.h"
@@ -40,6 +42,9 @@ using namespace object;
#define DEBUG_TYPE "coverage-mapping"
+STATISTIC(CovMapNumRecords, "The # of coverage function records");
+STATISTIC(CovMapNumUsedRecords, "The # of used coverage function records");
+
void CoverageMappingIterator::increment() {
if (ReadErr != coveragemap_error::success)
return;
@@ -92,10 +97,60 @@ Error RawCoverageReader::readString(StringRef &Result) {
return Error::success();
}
-Error RawCoverageFilenamesReader::read() {
+Error RawCoverageFilenamesReader::read(
+ CovMapVersion Version,
+ BinaryCoverageReader::DecompressedData &Decompressed) {
uint64_t NumFilenames;
if (auto Err = readSize(NumFilenames))
return Err;
+ if (!NumFilenames)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+
+ if (Version < CovMapVersion::Version4)
+ return readUncompressed(NumFilenames);
+
+ // The uncompressed length may exceed the size of the encoded filenames.
+ // Skip size validation.
+ uint64_t UncompressedLen;
+ if (auto Err = readULEB128(UncompressedLen))
+ return Err;
+
+ uint64_t CompressedLen;
+ if (auto Err = readSize(CompressedLen))
+ return Err;
+
+ if (CompressedLen > 0) {
+ if (!zlib::isAvailable())
+ return make_error<CoverageMapError>(
+ coveragemap_error::decompression_failed);
+
+ // Allocate memory for the decompressed filenames. Transfer ownership of
+ // the memory to BinaryCoverageReader.
+ auto DecompressedStorage = std::make_unique<SmallVector<char, 0>>();
+ SmallVectorImpl<char> &StorageBuf = *DecompressedStorage.get();
+ Decompressed.push_back(std::move(DecompressedStorage));
+
+ // Read compressed filenames.
+ StringRef CompressedFilenames = Data.substr(0, CompressedLen);
+ Data = Data.substr(CompressedLen);
+ auto Err =
+ zlib::uncompress(CompressedFilenames, StorageBuf, UncompressedLen);
+ if (Err) {
+ consumeError(std::move(Err));
+ return make_error<CoverageMapError>(
+ coveragemap_error::decompression_failed);
+ }
+
+ StringRef UncompressedFilenames(StorageBuf.data(), StorageBuf.size());
+ RawCoverageFilenamesReader Delegate(UncompressedFilenames, Filenames);
+ return Delegate.readUncompressed(NumFilenames);
+ }
+
+ return readUncompressed(NumFilenames);
+}
+
+Error RawCoverageFilenamesReader::readUncompressed(uint64_t NumFilenames) {
+ // Read uncompressed filenames.
for (size_t I = 0; I < NumFilenames; ++I) {
StringRef Filename;
if (auto Err = readString(Filename))
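For illustration, in the Version4 format the filename blob carries a small header before the filename entries: a ULEB128 filename count, the uncompressed payload length, and the compressed length (zero when the payload is stored raw). A standalone sketch of decoding the raw case, with a hand-written ULEB128 reader and std::string_view standing in for the readSize/readULEB128 helpers and StringRef; the zlib branch and CoverageMapError handling are left out:

#include <cstdint>
#include <iostream>
#include <string>
#include <string_view>
#include <vector>

// Minimal ULEB128 decoder: little-endian base-128, high bit marks "more".
static bool readULEB128(std::string_view &Buf, uint64_t &Value) {
  Value = 0;
  unsigned Shift = 0;
  while (!Buf.empty()) {
    uint8_t Byte = static_cast<uint8_t>(Buf.front());
    Buf.remove_prefix(1);
    Value |= uint64_t(Byte & 0x7f) << Shift;
    if (!(Byte & 0x80))
      return true;
    Shift += 7;
  }
  return false; // ran off the end of the buffer
}

// Decode: <num-filenames> <uncompressed-len> <compressed-len-or-zero>
//         (<compressed-filenames> | <uncompressed-filenames>)
// Only the uncompressed (compressed-len == 0) case is handled here.
static bool decodeFilenames(std::string_view Buf,
                            std::vector<std::string> &Out) {
  uint64_t NumFilenames, UncompressedLen, CompressedLen;
  if (!readULEB128(Buf, NumFilenames) || !readULEB128(Buf, UncompressedLen) ||
      !readULEB128(Buf, CompressedLen))
    return false;
  if (CompressedLen != 0)
    return false; // would need zlib::uncompress here
  for (uint64_t I = 0; I < NumFilenames; ++I) {
    uint64_t Size;
    if (!readULEB128(Buf, Size) || Buf.size() < Size)
      return false;
    Out.emplace_back(std::string(Buf.substr(0, Size)));
    Buf.remove_prefix(Size);
  }
  return true;
}

int main() {
  // 2 filenames, uncompressed-len 8, compressed-len 0, then "a.c" and "b.h".
  std::string Blob{'\x02', '\x08', '\x00', '\x03', 'a', '.', 'c',
                   '\x03', 'b', '.', 'h'};
  std::vector<std::string> Names;
  if (decodeFilenames(Blob, Names))
    for (auto &N : Names)
      std::cout << N << '\n';
}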
@@ -380,20 +435,51 @@ static Expected<bool> isCoverageMappingDummy(uint64_t Hash, StringRef Mapping) {
return RawCoverageMappingDummyChecker(Mapping).isDummy();
}
+/// A range of filename indices. Used to specify the location of a batch of
+/// filenames in a vector-like container.
+struct FilenameRange {
+ unsigned StartingIndex;
+ unsigned Length;
+
+ FilenameRange(unsigned StartingIndex, unsigned Length)
+ : StartingIndex(StartingIndex), Length(Length) {}
+
+ void markInvalid() { Length = 0; }
+ bool isInvalid() const { return Length == 0; }
+};
+
namespace {
+/// The interface to read coverage mapping function records for a module.
struct CovMapFuncRecordReader {
virtual ~CovMapFuncRecordReader() = default;
- // The interface to read coverage mapping function records for a module.
+ // Read a coverage header.
//
- // \p Buf points to the buffer containing the \c CovHeader of the coverage
+ // \p CovBuf points to the buffer containing the \c CovHeader of the coverage
// mapping data associated with the module.
//
- // Returns a pointer to the next \c CovHeader if it exists, or a pointer
- // greater than \p End if not.
- virtual Expected<const char *> readFunctionRecords(const char *Buf,
- const char *End) = 0;
+ // Returns a pointer to the next \c CovHeader if it exists, or to an address
+ // greater than \p CovBufEnd if not.
+ virtual Expected<const char *>
+ readCoverageHeader(const char *CovBuf, const char *CovBufEnd,
+ BinaryCoverageReader::DecompressedData &Decompressed) = 0;
+
+ // Read function records.
+ //
+ // \p FuncRecBuf points to the buffer containing a batch of function records.
+ // \p FuncRecBufEnd points past the end of the batch of records.
+ //
+ // Prior to Version4, \p OutOfLineFileRange identifies the filenames
+ // associated with the function records. It is unused in Version4.
+ //
+ // Prior to Version4, \p OutOfLineMappingBuf points to a sequence of coverage
+ // mappings associated with the function records. It is unused in Version4.
+ virtual Error readFunctionRecords(const char *FuncRecBuf,
+ const char *FuncRecBufEnd,
+ Optional<FilenameRange> OutOfLineFileRange,
+ const char *OutOfLineMappingBuf,
+ const char *OutOfLineMappingBufEnd) = 0;
template <class IntPtrT, support::endianness Endian>
static Expected<std::unique_ptr<CovMapFuncRecordReader>>
@@ -416,6 +502,10 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
std::vector<StringRef> &Filenames;
std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records;
+ // Maps a hash of the filenames in a TU to a \c FileRange. The range
+ // specifies the location of the hashed filenames in \c Filenames.
+ DenseMap<uint64_t, FilenameRange> FileRangeMap;
+
// Add the record to the collection if we don't already have a record that
// points to the same function name. This is useful to ignore the redundant
// records for the functions with ODR linkage.
@@ -423,7 +513,9 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
// records, which were emitted for inline functions which were seen but
// not used in the corresponding translation unit.
Error insertFunctionRecordIfNeeded(const FuncRecordType *CFR,
- StringRef Mapping, size_t FilenamesBegin) {
+ StringRef Mapping,
+ FilenameRange FileRange) {
+ ++CovMapNumRecords;
uint64_t FuncHash = CFR->template getFuncHash<Endian>();
NameRefType NameRef = CFR->template getFuncNameRef<Endian>();
auto InsertResult =
@@ -434,8 +526,9 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
return Err;
if (FuncName.empty())
return make_error<InstrProfError>(instrprof_error::malformed);
- Records.emplace_back(Version, FuncName, FuncHash, Mapping, FilenamesBegin,
- Filenames.size() - FilenamesBegin);
+ ++CovMapNumUsedRecords;
+ Records.emplace_back(Version, FuncName, FuncHash, Mapping,
+ FileRange.StartingIndex, FileRange.Length);
return Error::success();
}
// Update the existing record if it's a dummy and the new record is real.
@@ -454,10 +547,11 @@ class VersionedCovMapFuncRecordReader : public CovMapFuncRecordReader {
return Err;
if (*NewIsDummyExpected)
return Error::success();
+ ++CovMapNumUsedRecords;
OldRecord.FunctionHash = FuncHash;
OldRecord.CoverageMapping = Mapping;
- OldRecord.FilenamesBegin = FilenamesBegin;
- OldRecord.FilenamesSize = Filenames.size() - FilenamesBegin;
+ OldRecord.FilenamesBegin = FileRange.StartingIndex;
+ OldRecord.FilenamesSize = FileRange.Length;
return Error::success();
}
@@ -470,61 +564,134 @@ public:
~VersionedCovMapFuncRecordReader() override = default;
- Expected<const char *> readFunctionRecords(const char *Buf,
- const char *End) override {
+ Expected<const char *> readCoverageHeader(
+ const char *CovBuf, const char *CovBufEnd,
+ BinaryCoverageReader::DecompressedData &Decompressed) override {
using namespace support;
- if (Buf + sizeof(CovMapHeader) > End)
+ if (CovBuf + sizeof(CovMapHeader) > CovBufEnd)
return make_error<CoverageMapError>(coveragemap_error::malformed);
- auto CovHeader = reinterpret_cast<const CovMapHeader *>(Buf);
+ auto CovHeader = reinterpret_cast<const CovMapHeader *>(CovBuf);
uint32_t NRecords = CovHeader->getNRecords<Endian>();
uint32_t FilenamesSize = CovHeader->getFilenamesSize<Endian>();
uint32_t CoverageSize = CovHeader->getCoverageSize<Endian>();
assert((CovMapVersion)CovHeader->getVersion<Endian>() == Version);
- Buf = reinterpret_cast<const char *>(CovHeader + 1);
+ CovBuf = reinterpret_cast<const char *>(CovHeader + 1);
// Skip past the function records, saving the start and end for later.
- const char *FunBuf = Buf;
- Buf += NRecords * sizeof(FuncRecordType);
- const char *FunEnd = Buf;
+ // This is a no-op in Version4 (function records are read after all headers
+ // are read).
+ const char *FuncRecBuf = nullptr;
+ const char *FuncRecBufEnd = nullptr;
+ if (Version < CovMapVersion::Version4)
+ FuncRecBuf = CovBuf;
+ CovBuf += NRecords * sizeof(FuncRecordType);
+ if (Version < CovMapVersion::Version4)
+ FuncRecBufEnd = CovBuf;
// Get the filenames.
- if (Buf + FilenamesSize > End)
+ if (CovBuf + FilenamesSize > CovBufEnd)
return make_error<CoverageMapError>(coveragemap_error::malformed);
size_t FilenamesBegin = Filenames.size();
- RawCoverageFilenamesReader Reader(StringRef(Buf, FilenamesSize), Filenames);
- if (auto Err = Reader.read())
+ StringRef FilenameRegion(CovBuf, FilenamesSize);
+ RawCoverageFilenamesReader Reader(FilenameRegion, Filenames);
+ if (auto Err = Reader.read(Version, Decompressed))
return std::move(Err);
- Buf += FilenamesSize;
+ CovBuf += FilenamesSize;
+ FilenameRange FileRange(FilenamesBegin, Filenames.size() - FilenamesBegin);
+
+ if (Version == CovMapVersion::Version4) {
+ // Map a hash of the filenames region to the filename range associated
+ // with this coverage header.
+ int64_t FilenamesRef =
+ llvm::IndexedInstrProf::ComputeHash(FilenameRegion);
+ auto Insert =
+ FileRangeMap.insert(std::make_pair(FilenamesRef, FileRange));
+ if (!Insert.second) {
+ // The same filenames ref was encountered twice. It's possible that
+ // the associated filenames are the same.
+ auto It = Filenames.begin();
+ FilenameRange &OrigRange = Insert.first->getSecond();
+ if (std::equal(It + OrigRange.StartingIndex,
+ It + OrigRange.StartingIndex + OrigRange.Length,
+ It + FileRange.StartingIndex,
+ It + FileRange.StartingIndex + FileRange.Length))
+ // Map the new range to the original one.
+ FileRange = OrigRange;
+ else
+ // This is a hash collision. Mark the filenames ref invalid.
+ OrigRange.markInvalid();
+ }
+ }
// We'll read the coverage mapping records in the loop below.
- const char *CovBuf = Buf;
- Buf += CoverageSize;
- const char *CovEnd = Buf;
+ // This is a no-op in Version4 (coverage mappings are not affixed to the
+ // coverage header).
+ const char *MappingBuf = CovBuf;
+ if (Version == CovMapVersion::Version4 && CoverageSize != 0)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ CovBuf += CoverageSize;
+ const char *MappingEnd = CovBuf;
- if (Buf > End)
+ if (CovBuf > CovBufEnd)
return make_error<CoverageMapError>(coveragemap_error::malformed);
+
+ if (Version < CovMapVersion::Version4) {
+ // Read each function record.
+ if (Error E = readFunctionRecords(FuncRecBuf, FuncRecBufEnd, FileRange,
+ MappingBuf, MappingEnd))
+ return std::move(E);
+ }
+
// Each coverage map has an alignment of 8, so we need to adjust alignment
// before reading the next map.
- Buf += offsetToAlignedAddr(Buf, Align(8));
-
- auto CFR = reinterpret_cast<const FuncRecordType *>(FunBuf);
- while ((const char *)CFR < FunEnd) {
- // Read the function information
- uint32_t DataSize = CFR->template getDataSize<Endian>();
-
- // Now use that to read the coverage data.
- if (CovBuf + DataSize > CovEnd)
- return make_error<CoverageMapError>(coveragemap_error::malformed);
- auto Mapping = StringRef(CovBuf, DataSize);
- CovBuf += DataSize;
-
- if (Error Err =
- insertFunctionRecordIfNeeded(CFR, Mapping, FilenamesBegin))
- return std::move(Err);
- CFR++;
+ CovBuf += offsetToAlignedAddr(CovBuf, Align(8));
+
+ return CovBuf;
+ }
+
+ Error readFunctionRecords(const char *FuncRecBuf, const char *FuncRecBufEnd,
+ Optional<FilenameRange> OutOfLineFileRange,
+ const char *OutOfLineMappingBuf,
+ const char *OutOfLineMappingBufEnd) override {
+ auto CFR = reinterpret_cast<const FuncRecordType *>(FuncRecBuf);
+ while ((const char *)CFR < FuncRecBufEnd) {
+ // Validate the length of the coverage mapping for this function.
+ const char *NextMappingBuf;
+ const FuncRecordType *NextCFR;
+ std::tie(NextMappingBuf, NextCFR) =
+ CFR->template advanceByOne<Endian>(OutOfLineMappingBuf);
+ if (Version < CovMapVersion::Version4)
+ if (NextMappingBuf > OutOfLineMappingBufEnd)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+
+ // Look up the set of filenames associated with this function record.
+ Optional<FilenameRange> FileRange;
+ if (Version < CovMapVersion::Version4) {
+ FileRange = OutOfLineFileRange;
+ } else {
+ uint64_t FilenamesRef = CFR->template getFilenamesRef<Endian>();
+ auto It = FileRangeMap.find(FilenamesRef);
+ if (It == FileRangeMap.end())
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ else
+ FileRange = It->getSecond();
+ }
+
+ // Now, read the coverage data.
+ if (FileRange && !FileRange->isInvalid()) {
+ StringRef Mapping =
+ CFR->template getCoverageMapping<Endian>(OutOfLineMappingBuf);
+ if (Version == CovMapVersion::Version4 &&
+ Mapping.data() + Mapping.size() > FuncRecBufEnd)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ if (Error Err = insertFunctionRecordIfNeeded(CFR, Mapping, *FileRange))
+ return Err;
+ }
+
+ std::tie(OutOfLineMappingBuf, CFR) = std::tie(NextMappingBuf, NextCFR);
}
- return Buf;
+ return Error::success();
}
};
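For illustration, with Version4 a function record locates its filenames through a 64-bit hash of the filenames blob, so readCoverageHeader keeps a hash-to-FilenameRange map and has to handle two headers that hash identically. A standalone sketch of that insert-or-verify step, using std::unordered_map and a caller-supplied hash in place of IndexedInstrProf::ComputeHash (the names here are illustrative, not the reader's):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <string>
#include <unordered_map>
#include <vector>

struct FilenameRange {
  unsigned StartingIndex;
  unsigned Length;
  void markInvalid() { Length = 0; }
  bool isInvalid() const { return Length == 0; }
};

struct FilenameTable {
  std::vector<std::string> Filenames;                   // flat filename pool
  std::unordered_map<uint64_t, FilenameRange> RangeMap; // hash -> range

  // Append a decoded filename batch and map its hash to the new range.
  // On a duplicate hash, reuse the original range when the filenames really
  // match; otherwise poison the entry so records using that hash are skipped.
  FilenameRange addBatch(uint64_t Hash, const std::vector<std::string> &Batch) {
    FilenameRange Range{unsigned(Filenames.size()), unsigned(Batch.size())};
    Filenames.insert(Filenames.end(), Batch.begin(), Batch.end());
    auto Insert = RangeMap.insert({Hash, Range});
    if (!Insert.second) {
      FilenameRange &Orig = Insert.first->second;
      bool Same =
          std::equal(Filenames.begin() + Orig.StartingIndex,
                     Filenames.begin() + Orig.StartingIndex + Orig.Length,
                     Filenames.begin() + Range.StartingIndex,
                     Filenames.begin() + Range.StartingIndex + Range.Length);
      if (Same)
        Range = Orig;       // same filenames: map the new range to the first
      else
        Orig.markInvalid(); // genuine hash collision: drop the entry
    }
    return Range;
  }
};

int main() {
  FilenameTable T;
  FilenameRange A = T.addBatch(/*Hash=*/1, {"a.c", "b.h"});
  FilenameRange B = T.addBatch(/*Hash=*/1, {"a.c", "b.h"}); // duplicate blob
  assert(B.StartingIndex == A.StartingIndex && !B.isInvalid());
  T.addBatch(/*Hash=*/1, {"c.c"});                          // collision
  assert(T.RangeMap.at(1).isInvalid());
}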
@@ -543,29 +710,34 @@ Expected<std::unique_ptr<CovMapFuncRecordReader>> CovMapFuncRecordReader::get(
CovMapVersion::Version1, IntPtrT, Endian>>(P, R, F);
case CovMapVersion::Version2:
case CovMapVersion::Version3:
+ case CovMapVersion::Version4:
// Decompress the name data.
if (Error E = P.create(P.getNameData()))
return std::move(E);
if (Version == CovMapVersion::Version2)
return std::make_unique<VersionedCovMapFuncRecordReader<
CovMapVersion::Version2, IntPtrT, Endian>>(P, R, F);
- else
+ else if (Version == CovMapVersion::Version3)
return std::make_unique<VersionedCovMapFuncRecordReader<
CovMapVersion::Version3, IntPtrT, Endian>>(P, R, F);
+ else if (Version == CovMapVersion::Version4)
+ return std::make_unique<VersionedCovMapFuncRecordReader<
+ CovMapVersion::Version4, IntPtrT, Endian>>(P, R, F);
}
llvm_unreachable("Unsupported version");
}
template <typename T, support::endianness Endian>
static Error readCoverageMappingData(
- InstrProfSymtab &ProfileNames, StringRef Data,
+ InstrProfSymtab &ProfileNames, StringRef CovMap, StringRef FuncRecords,
std::vector<BinaryCoverageReader::ProfileMappingRecord> &Records,
- std::vector<StringRef> &Filenames) {
+ std::vector<StringRef> &Filenames,
+ BinaryCoverageReader::DecompressedData &Decompressed) {
using namespace coverage;
// Read the records in the coverage data section.
auto CovHeader =
- reinterpret_cast<const CovMapHeader *>(Data.data());
+ reinterpret_cast<const CovMapHeader *>(CovMap.data());
CovMapVersion Version = (CovMapVersion)CovHeader->getVersion<Endian>();
if (Version > CovMapVersion::CurrentVersion)
return make_error<CoverageMapError>(coveragemap_error::unsupported_version);
@@ -575,12 +747,28 @@ static Error readCoverageMappingData(
if (Error E = ReaderExpected.takeError())
return E;
auto Reader = std::move(ReaderExpected.get());
- for (const char *Buf = Data.data(), *End = Buf + Data.size(); Buf < End;) {
- auto NextHeaderOrErr = Reader->readFunctionRecords(Buf, End);
- if (auto E = NextHeaderOrErr.takeError())
+ const char *CovBuf = CovMap.data();
+ const char *CovBufEnd = CovBuf + CovMap.size();
+ const char *FuncRecBuf = FuncRecords.data();
+ const char *FuncRecBufEnd = FuncRecords.data() + FuncRecords.size();
+ while (CovBuf < CovBufEnd) {
+ // Read the current coverage header & filename data.
+ //
+ // Prior to Version4, this also reads all function records affixed to the
+ // header.
+ //
+ // Return a pointer to the next coverage header.
+ auto NextOrErr =
+ Reader->readCoverageHeader(CovBuf, CovBufEnd, Decompressed);
+ if (auto E = NextOrErr.takeError())
return E;
- Buf = NextHeaderOrErr.get();
+ CovBuf = NextOrErr.get();
}
+ // In Version4, function records are not affixed to coverage headers. Read
+ // the records from their dedicated section.
+ if (Version == CovMapVersion::Version4)
+ return Reader->readFunctionRecords(FuncRecBuf, FuncRecBufEnd, None, nullptr,
+ nullptr);
return Error::success();
}
@@ -588,31 +776,35 @@ static const char *TestingFormatMagic = "llvmcovmtestdata";
Expected<std::unique_ptr<BinaryCoverageReader>>
BinaryCoverageReader::createCoverageReaderFromBuffer(
- StringRef Coverage, InstrProfSymtab &&ProfileNames, uint8_t BytesInAddress,
- support::endianness Endian) {
- std::unique_ptr<BinaryCoverageReader> Reader(new BinaryCoverageReader());
+ StringRef Coverage, std::string &&FuncRecords, InstrProfSymtab &&ProfileNames,
+ uint8_t BytesInAddress, support::endianness Endian) {
+ std::unique_ptr<BinaryCoverageReader> Reader(
+ new BinaryCoverageReader(std::move(FuncRecords)));
Reader->ProfileNames = std::move(ProfileNames);
+ StringRef FuncRecordsRef = Reader->FuncRecords;
if (BytesInAddress == 4 && Endian == support::endianness::little) {
if (Error E =
readCoverageMappingData<uint32_t, support::endianness::little>(
- Reader->ProfileNames, Coverage, Reader->MappingRecords,
- Reader->Filenames))
+ Reader->ProfileNames, Coverage, FuncRecordsRef,
+ Reader->MappingRecords, Reader->Filenames,
+ Reader->Decompressed))
return std::move(E);
} else if (BytesInAddress == 4 && Endian == support::endianness::big) {
if (Error E = readCoverageMappingData<uint32_t, support::endianness::big>(
- Reader->ProfileNames, Coverage, Reader->MappingRecords,
- Reader->Filenames))
+ Reader->ProfileNames, Coverage, FuncRecordsRef,
+ Reader->MappingRecords, Reader->Filenames, Reader->Decompressed))
return std::move(E);
} else if (BytesInAddress == 8 && Endian == support::endianness::little) {
if (Error E =
readCoverageMappingData<uint64_t, support::endianness::little>(
- Reader->ProfileNames, Coverage, Reader->MappingRecords,
- Reader->Filenames))
+ Reader->ProfileNames, Coverage, FuncRecordsRef,
+ Reader->MappingRecords, Reader->Filenames,
+ Reader->Decompressed))
return std::move(E);
} else if (BytesInAddress == 8 && Endian == support::endianness::big) {
if (Error E = readCoverageMappingData<uint64_t, support::endianness::big>(
- Reader->ProfileNames, Coverage, Reader->MappingRecords,
- Reader->Filenames))
+ Reader->ProfileNames, Coverage, FuncRecordsRef,
+ Reader->MappingRecords, Reader->Filenames, Reader->Decompressed))
return std::move(E);
} else
return make_error<CoverageMapError>(coveragemap_error::malformed);
@@ -653,10 +845,13 @@ loadTestingFormat(StringRef Data) {
return make_error<CoverageMapError>(coveragemap_error::malformed);
CoverageMapping = CoverageMapping.substr(Pad);
return BinaryCoverageReader::createCoverageReaderFromBuffer(
- CoverageMapping, std::move(ProfileNames), BytesInAddress, Endian);
+ CoverageMapping, "", std::move(ProfileNames), BytesInAddress, Endian);
}
-static Expected<SectionRef> lookupSection(ObjectFile &OF, StringRef Name) {
+/// Find all sections that match \p Name. There may be more than one if comdats
+/// are in use, e.g. for the __llvm_covfun section on ELF.
+static Expected<std::vector<SectionRef>> lookupSections(ObjectFile &OF,
+ StringRef Name) {
// On COFF, the object file section name may end in "$M". This tells the
// linker to sort these sections between "$A" and "$Z". The linker removes the
// dollar and everything after it in the final binary. Do the same to match.
@@ -666,14 +861,17 @@ static Expected<SectionRef> lookupSection(ObjectFile &OF, StringRef Name) {
};
Name = stripSuffix(Name);
+ std::vector<SectionRef> Sections;
for (const auto &Section : OF.sections()) {
Expected<StringRef> NameOrErr = Section.getName();
if (!NameOrErr)
return NameOrErr.takeError();
if (stripSuffix(*NameOrErr) == Name)
- return Section;
+ Sections.push_back(Section);
}
- return make_error<CoverageMapError>(coveragemap_error::no_data_found);
+ if (Sections.empty())
+ return make_error<CoverageMapError>(coveragemap_error::no_data_found);
+ return Sections;
}
static Expected<std::unique_ptr<BinaryCoverageReader>>
@@ -705,28 +903,51 @@ loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch) {
// Look for the sections that we are interested in.
auto ObjFormat = OF->getTripleObjectFormat();
auto NamesSection =
- lookupSection(*OF, getInstrProfSectionName(IPSK_name, ObjFormat,
+ lookupSections(*OF, getInstrProfSectionName(IPSK_name, ObjFormat,
/*AddSegmentInfo=*/false));
if (auto E = NamesSection.takeError())
return std::move(E);
auto CoverageSection =
- lookupSection(*OF, getInstrProfSectionName(IPSK_covmap, ObjFormat,
- /*AddSegmentInfo=*/false));
+ lookupSections(*OF, getInstrProfSectionName(IPSK_covmap, ObjFormat,
+ /*AddSegmentInfo=*/false));
if (auto E = CoverageSection.takeError())
return std::move(E);
-
- // Get the contents of the given sections.
- auto CoverageMappingOrErr = CoverageSection->getContents();
+ std::vector<SectionRef> CoverageSectionRefs = *CoverageSection;
+ if (CoverageSectionRefs.size() != 1)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ auto CoverageMappingOrErr = CoverageSectionRefs.back().getContents();
if (!CoverageMappingOrErr)
return CoverageMappingOrErr.takeError();
+ StringRef CoverageMapping = CoverageMappingOrErr.get();
InstrProfSymtab ProfileNames;
- if (Error E = ProfileNames.create(*NamesSection))
+ std::vector<SectionRef> NamesSectionRefs = *NamesSection;
+ if (NamesSectionRefs.size() != 1)
+ return make_error<CoverageMapError>(coveragemap_error::malformed);
+ if (Error E = ProfileNames.create(NamesSectionRefs.back()))
return std::move(E);
+ // Look for the coverage records section (Version4 only).
+ std::string FuncRecords;
+ auto CoverageRecordsSections =
+ lookupSections(*OF, getInstrProfSectionName(IPSK_covfun, ObjFormat,
+ /*AddSegmentInfo=*/false));
+ if (auto E = CoverageRecordsSections.takeError())
+ consumeError(std::move(E));
+ else {
+ for (SectionRef Section : *CoverageRecordsSections) {
+ auto CoverageRecordsOrErr = Section.getContents();
+ if (!CoverageRecordsOrErr)
+ return CoverageRecordsOrErr.takeError();
+ FuncRecords += CoverageRecordsOrErr.get();
+ while (FuncRecords.size() % 8 != 0)
+ FuncRecords += '\0';
+ }
+ }
+
return BinaryCoverageReader::createCoverageReaderFromBuffer(
- CoverageMappingOrErr.get(), std::move(ProfileNames), BytesInAddress,
- Endian);
+ CoverageMapping, std::move(FuncRecords), std::move(ProfileNames),
+ BytesInAddress, Endian);
}
Expected<std::vector<std::unique_ptr<BinaryCoverageReader>>>
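For illustration, since __llvm_covfun may be emitted once per comdat, the loader now collects every matching section and glues the contents together, padding each piece to an 8-byte boundary so the record walk keeps its alignment assumption. A tiny sketch of that concatenation with plain strings in place of SectionRef contents (illustrative only):

#include <cassert>
#include <string>
#include <vector>

// Append each section's bytes, padding to an 8-byte boundary after every
// piece so record parsing can assume aligned record starts.
std::string concatFuncRecordSections(const std::vector<std::string> &Sections) {
  std::string FuncRecords;
  for (const std::string &Contents : Sections) {
    FuncRecords += Contents;
    while (FuncRecords.size() % 8 != 0)
      FuncRecords += '\0';
  }
  return FuncRecords;
}

int main() {
  std::string Buf = concatFuncRecordSections({"12345", "abcdefgh"});
  assert(Buf.size() == 16 && Buf.substr(8, 8) == "abcdefgh");
}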
diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
index d75854a60d1e..8d3c429c4484 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingWriter.cpp
@@ -11,9 +11,11 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ProfileData/InstrProf.h"
#include "llvm/ProfileData/Coverage/CoverageMappingWriter.h"
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Support/Compression.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -34,12 +36,34 @@ CoverageFilenamesSectionWriter::CoverageFilenamesSectionWriter(
#endif
}
-void CoverageFilenamesSectionWriter::write(raw_ostream &OS) {
- encodeULEB128(Filenames.size(), OS);
- for (const auto &Filename : Filenames) {
- encodeULEB128(Filename.size(), OS);
- OS << Filename;
+void CoverageFilenamesSectionWriter::write(raw_ostream &OS, bool Compress) {
+ std::string FilenamesStr;
+ {
+ raw_string_ostream FilenamesOS{FilenamesStr};
+ for (const auto &Filename : Filenames) {
+ encodeULEB128(Filename.size(), FilenamesOS);
+ FilenamesOS << Filename;
+ }
+ }
+
+ SmallString<128> CompressedStr;
+ bool doCompression =
+ Compress && zlib::isAvailable() && DoInstrProfNameCompression;
+ if (doCompression) {
+ auto E =
+ zlib::compress(FilenamesStr, CompressedStr, zlib::BestSizeCompression);
+ if (E)
+ report_bad_alloc_error("Failed to zlib compress coverage data");
}
+
+ // ::= <num-filenames>
+ // <uncompressed-len>
+ // <compressed-len-or-zero>
+ // (<compressed-filenames> | <uncompressed-filenames>)
+ encodeULEB128(Filenames.size(), OS);
+ encodeULEB128(FilenamesStr.size(), OS);
+ encodeULEB128(doCompression ? CompressedStr.size() : 0U, OS);
+ OS << (doCompression ? StringRef(CompressedStr) : StringRef(FilenamesStr));
}
namespace {
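For illustration, on the writer side the filename section now always begins with the <num-filenames> <uncompressed-len> <compressed-len-or-zero> header, and the payload is zlib-compressed only when compression is available and enabled. A standalone sketch of the uncompressed path, with a small ULEB128 emitter standing in for encodeULEB128; the compressed branch, which would call zlib::compress, is omitted:

#include <cstdint>
#include <iostream>
#include <string>
#include <vector>

// Emit a ULEB128-encoded unsigned integer.
static void writeULEB128(uint64_t Value, std::string &Out) {
  do {
    uint8_t Byte = Value & 0x7f;
    Value >>= 7;
    if (Value)
      Byte |= 0x80;
    Out.push_back(static_cast<char>(Byte));
  } while (Value);
}

// ::= <num-filenames> <uncompressed-len> <compressed-len-or-zero>
//     (<compressed-filenames> | <uncompressed-filenames>)
std::string encodeFilenames(const std::vector<std::string> &Filenames) {
  std::string Payload;
  for (const std::string &F : Filenames) {
    writeULEB128(F.size(), Payload); // each entry is <size><bytes>
    Payload += F;
  }
  std::string Out;
  writeULEB128(Filenames.size(), Out);
  writeULEB128(Payload.size(), Out); // uncompressed length
  writeULEB128(0, Out);              // 0 => payload is stored uncompressed
  Out += Payload;
  return Out;
}

int main() {
  std::string Blob = encodeFilenames({"a.c", "b.h"});
  std::cout << Blob.size() << " bytes\n"; // header (3) + 2 * (1 + 3) = 11
}

The output of this sketch matches the layout that the reader-side sketch earlier in this diff decodes when the compressed length is zero.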
diff --git a/llvm/lib/ProfileData/GCOV.cpp b/llvm/lib/ProfileData/GCOV.cpp
index 228c1b3b442a..71ea44a1a722 100644
--- a/llvm/lib/ProfileData/GCOV.cpp
+++ b/llvm/lib/ProfileData/GCOV.cpp
@@ -25,25 +25,118 @@
using namespace llvm;
+enum : uint32_t {
+ GCOV_ARC_ON_TREE = 1 << 0,
+ GCOV_ARC_FALLTHROUGH = 1 << 2,
+
+ GCOV_TAG_FUNCTION = 0x01000000,
+ GCOV_TAG_BLOCKS = 0x01410000,
+ GCOV_TAG_ARCS = 0x01430000,
+ GCOV_TAG_LINES = 0x01450000,
+ GCOV_TAG_COUNTER_ARCS = 0x01a10000,
+ // GCOV_TAG_OBJECT_SUMMARY superseded GCOV_TAG_PROGRAM_SUMMARY in GCC 9.
+ GCOV_TAG_OBJECT_SUMMARY = 0xa1000000,
+ GCOV_TAG_PROGRAM_SUMMARY = 0xa3000000,
+};
+
//===----------------------------------------------------------------------===//
// GCOVFile implementation.
/// readGCNO - Read GCNO buffer.
-bool GCOVFile::readGCNO(GCOVBuffer &Buffer) {
- if (!Buffer.readGCNOFormat())
+bool GCOVFile::readGCNO(GCOVBuffer &buf) {
+ if (!buf.readGCNOFormat())
return false;
- if (!Buffer.readGCOVVersion(Version))
+ if (!buf.readGCOVVersion(Version))
return false;
- if (!Buffer.readInt(Checksum))
- return false;
- while (true) {
- if (!Buffer.readFunctionTag())
- break;
- auto GFun = std::make_unique<GCOVFunction>(*this);
- if (!GFun->readGCNO(Buffer, Version))
+ Checksum = buf.getWord();
+ if (Version >= GCOV::V900)
+ cwd = buf.getString();
+ if (Version >= GCOV::V800)
+ buf.getWord(); // hasUnexecutedBlocks
+
+ uint32_t tag, length;
+ GCOVFunction *fn = nullptr;
+ while ((tag = buf.getWord())) {
+ if (!buf.readInt(length))
return false;
- Functions.push_back(std::move(GFun));
+ if (tag == GCOV_TAG_FUNCTION) {
+ Functions.push_back(std::make_unique<GCOVFunction>(*this));
+ fn = Functions.back().get();
+ fn->ident = buf.getWord();
+ fn->linenoChecksum = buf.getWord();
+ if (Version >= GCOV::V407)
+ fn->cfgChecksum = buf.getWord();
+ buf.readString(fn->Name);
+ StringRef filename;
+ if (Version < GCOV::V800) {
+ filename = buf.getString();
+ fn->startLine = buf.getWord();
+ } else {
+ fn->artificial = buf.getWord();
+ filename = buf.getString();
+ fn->startLine = buf.getWord();
+ fn->startColumn = buf.getWord();
+ fn->endLine = buf.getWord();
+ if (Version >= GCOV::V900)
+ fn->endColumn = buf.getWord();
+ }
+ auto r = filenameToIdx.try_emplace(filename, filenameToIdx.size());
+ if (r.second)
+ filenames.emplace_back(filename);
+ fn->srcIdx = r.first->second;
+ IdentToFunction[fn->ident] = fn;
+ } else if (tag == GCOV_TAG_BLOCKS && fn) {
+ if (Version < GCOV::V800) {
+ for (uint32_t i = 0; i != length; ++i) {
+ buf.getWord(); // Ignored block flags
+ fn->Blocks.push_back(std::make_unique<GCOVBlock>(*fn, i));
+ }
+ } else {
+ uint32_t num = buf.getWord();
+ for (uint32_t i = 0; i != num; ++i)
+ fn->Blocks.push_back(std::make_unique<GCOVBlock>(*fn, i));
+ }
+ } else if (tag == GCOV_TAG_ARCS && fn) {
+ uint32_t srcNo = buf.getWord();
+ if (srcNo >= fn->Blocks.size()) {
+ errs() << "unexpected block number: " << srcNo << " (in "
+ << fn->Blocks.size() << ")\n";
+ return false;
+ }
+ GCOVBlock *src = fn->Blocks[srcNo].get();
+ for (uint32_t i = 0, e = (length - 1) / 2; i != e; ++i) {
+ uint32_t dstNo = buf.getWord(), flags = buf.getWord();
+ GCOVBlock *dst = fn->Blocks[dstNo].get();
+ auto arc =
+ std::make_unique<GCOVArc>(*src, *dst, flags & GCOV_ARC_FALLTHROUGH);
+ src->addDstEdge(arc.get());
+ dst->addSrcEdge(arc.get());
+ if (flags & GCOV_ARC_ON_TREE)
+ fn->treeArcs.push_back(std::move(arc));
+ else
+ fn->arcs.push_back(std::move(arc));
+ }
+ } else if (tag == GCOV_TAG_LINES && fn) {
+ uint32_t srcNo = buf.getWord();
+ if (srcNo >= fn->Blocks.size()) {
+ errs() << "unexpected block number: " << srcNo << " (in "
+ << fn->Blocks.size() << ")\n";
+ return false;
+ }
+ GCOVBlock &Block = *fn->Blocks[srcNo];
+ for (;;) {
+ uint32_t line = buf.getWord();
+ if (line)
+ Block.addLine(line);
+ else {
+ StringRef filename = buf.getString();
+ if (filename.empty())
+ break;
+ // TODO Unhandled
+ }
+ }
+ }
}
GCNOInitialized = true;
@@ -52,12 +145,12 @@ bool GCOVFile::readGCNO(GCOVBuffer &Buffer) {
/// readGCDA - Read GCDA buffer. It is required that readGCDA() can only be
/// called after readGCNO().
-bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
+bool GCOVFile::readGCDA(GCOVBuffer &buf) {
assert(GCNOInitialized && "readGCDA() can only be called after readGCNO()");
- if (!Buffer.readGCDAFormat())
+ if (!buf.readGCDAFormat())
return false;
GCOV::GCOVVersion GCDAVersion;
- if (!Buffer.readGCOVVersion(GCDAVersion))
+ if (!buf.readGCOVVersion(GCDAVersion))
return false;
if (Version != GCDAVersion) {
errs() << "GCOV versions do not match.\n";
@@ -65,48 +158,87 @@ bool GCOVFile::readGCDA(GCOVBuffer &Buffer) {
}
uint32_t GCDAChecksum;
- if (!Buffer.readInt(GCDAChecksum))
+ if (!buf.readInt(GCDAChecksum))
return false;
if (Checksum != GCDAChecksum) {
errs() << "File checksums do not match: " << Checksum
<< " != " << GCDAChecksum << ".\n";
return false;
}
- for (size_t i = 0, e = Functions.size(); i < e; ++i) {
- if (!Buffer.readFunctionTag()) {
- errs() << "Unexpected number of functions.\n";
+ uint32_t dummy, tag, length;
+ uint32_t ident;
+ GCOVFunction *fn = nullptr;
+ while ((tag = buf.getWord())) {
+ if (!buf.readInt(length))
return false;
+ uint32_t pos = buf.cursor.tell();
+ if (tag == GCOV_TAG_OBJECT_SUMMARY) {
+ buf.readInt(RunCount);
+ buf.readInt(dummy);
+ // clang<11 uses a fake 4.2 format which sets length to 9.
+ if (length == 9)
+ buf.readInt(RunCount);
+ } else if (tag == GCOV_TAG_PROGRAM_SUMMARY) {
+ // clang<11 uses a fake 4.2 format which sets length to 0.
+ if (length > 0) {
+ buf.readInt(dummy);
+ buf.readInt(dummy);
+ buf.readInt(RunCount);
+ }
+ ++ProgramCount;
+ } else if (tag == GCOV_TAG_FUNCTION) {
+ if (length == 0) // Placeholder
+ continue;
+ // As of GCC 10, GCOV_TAG_FUNCTION_LENGTH has never been larger than 3.
+ // However, clang<11 uses a fake 4.2 format which may set length larger
+ // than 3.
+ if (length < 2 || !buf.readInt(ident))
+ return false;
+ auto It = IdentToFunction.find(ident);
+ uint32_t linenoChecksum, cfgChecksum = 0;
+ buf.readInt(linenoChecksum);
+ if (Version >= GCOV::V407)
+ buf.readInt(cfgChecksum);
+ if (It != IdentToFunction.end()) {
+ fn = It->second;
+ if (linenoChecksum != fn->linenoChecksum ||
+ cfgChecksum != fn->cfgChecksum) {
+ errs() << fn->Name
+ << format(": checksum mismatch, (%u, %u) != (%u, %u)\n",
+ linenoChecksum, cfgChecksum, fn->linenoChecksum,
+ fn->cfgChecksum);
+ return false;
+ }
+ }
+ } else if (tag == GCOV_TAG_COUNTER_ARCS && fn) {
+ if (length != 2 * fn->arcs.size()) {
+ errs() << fn->Name
+ << format(
+ ": GCOV_TAG_COUNTER_ARCS mismatch, got %u, expected %u\n",
+ length, unsigned(2 * fn->arcs.size()));
+ return false;
+ }
+ for (std::unique_ptr<GCOVArc> &arc : fn->arcs) {
+ if (!buf.readInt64(arc->Count))
+ return false;
+ // FIXME Fix counters
+ arc->src.Counter += arc->Count;
+ if (arc->dst.succ.empty())
+ arc->dst.Counter += arc->Count;
+ }
}
- if (!Functions[i]->readGCDA(Buffer, Version))
- return false;
- }
- if (Buffer.readObjectTag()) {
- uint32_t Length;
- uint32_t Dummy;
- if (!Buffer.readInt(Length))
- return false;
- if (!Buffer.readInt(Dummy))
- return false; // checksum
- if (!Buffer.readInt(Dummy))
- return false; // num
- if (!Buffer.readInt(RunCount))
- return false;
- Buffer.advanceCursor(Length - 3);
- }
- while (Buffer.readProgramTag()) {
- uint32_t Length;
- if (!Buffer.readInt(Length))
+ pos += 4 * length;
+ if (pos < buf.cursor.tell())
return false;
- Buffer.advanceCursor(Length);
- ++ProgramCount;
+ buf.de.skip(buf.cursor, pos - buf.cursor.tell());
}
return true;
}
void GCOVFile::print(raw_ostream &OS) const {
- for (const auto &FPtr : Functions)
- FPtr->print(OS);
+ for (const GCOVFunction &f : *this)
+ f.print(OS);
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
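For illustration, the rewritten gcno/gcda readers treat the files as a flat stream of (tag, length) records, where length counts 32-bit words, so a record that is unknown or only partially consumed can be skipped by advancing length words past the payload start, which is what the new readGCDA loop does with pos += 4 * length. A standalone sketch of that record walk over an in-memory word stream; endianness and the real GCOV payload layouts are ignored here:

#include <cstdint>
#include <iostream>
#include <vector>

enum : uint32_t { TAG_FUNCTION = 0x01000000, TAG_ARCS = 0x01430000 };

// Walk (tag, length-in-words, payload...) records, skipping anything we do
// not understand by advancing length words past the payload start.
void walkRecords(const std::vector<uint32_t> &Words) {
  size_t Pos = 0;
  while (Pos + 1 < Words.size() && Words[Pos] != 0) {
    uint32_t Tag = Words[Pos], Length = Words[Pos + 1];
    size_t Payload = Pos + 2;
    if (Payload + Length > Words.size())
      return; // truncated record
    if (Tag == TAG_FUNCTION && Length >= 1)
      std::cout << "function record, ident=" << Words[Payload] << '\n';
    else
      std::cout << "skipping tag 0x" << std::hex << Tag << std::dec << '\n';
    Pos = Payload + Length; // jump to the next record regardless of tag
  }
}

int main() {
  // A function record (3 payload words), then an arcs record we just skip.
  walkRecords({TAG_FUNCTION, 3, 42, 0xdead, 0xbeef, TAG_ARCS, 2, 1, 7, 0});
}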
@@ -116,225 +248,22 @@ LLVM_DUMP_METHOD void GCOVFile::dump() const { print(dbgs()); }
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
-void GCOVFile::collectLineCounts(FileInfo &FI) {
- for (const auto &FPtr : Functions)
- FPtr->collectLineCounts(FI);
- FI.setRunCount(RunCount);
- FI.setProgramCount(ProgramCount);
+void GCOVFile::collectLineCounts(FileInfo &fi) {
+ assert(fi.sources.empty());
+ for (StringRef filename : filenames)
+ fi.sources.emplace_back(filename);
+ for (GCOVFunction &f : *this) {
+ f.collectLineCounts(fi);
+ fi.sources[f.srcIdx].functions.push_back(&f);
+ }
+ fi.setRunCount(RunCount);
+ fi.setProgramCount(ProgramCount);
}
//===----------------------------------------------------------------------===//
// GCOVFunction implementation.
-/// readGCNO - Read a function from the GCNO buffer. Return false if an error
-/// occurs.
-bool GCOVFunction::readGCNO(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
- uint32_t Dummy;
- if (!Buff.readInt(Dummy))
- return false; // Function header length
- if (!Buff.readInt(Ident))
- return false;
- if (!Buff.readInt(Checksum))
- return false;
- if (Version != GCOV::V402) {
- uint32_t CfgChecksum;
- if (!Buff.readInt(CfgChecksum))
- return false;
- if (Parent.getChecksum() != CfgChecksum) {
- errs() << "File checksums do not match: " << Parent.getChecksum()
- << " != " << CfgChecksum << " in (" << Name << ").\n";
- return false;
- }
- }
- if (!Buff.readString(Name))
- return false;
- if (!Buff.readString(Filename))
- return false;
- if (!Buff.readInt(LineNumber))
- return false;
-
- // read blocks.
- if (!Buff.readBlockTag()) {
- errs() << "Block tag not found.\n";
- return false;
- }
- uint32_t BlockCount;
- if (!Buff.readInt(BlockCount))
- return false;
- for (uint32_t i = 0, e = BlockCount; i != e; ++i) {
- if (!Buff.readInt(Dummy))
- return false; // Block flags;
- Blocks.push_back(std::make_unique<GCOVBlock>(*this, i));
- }
-
- // read edges.
- while (Buff.readEdgeTag()) {
- uint32_t EdgeCount;
- if (!Buff.readInt(EdgeCount))
- return false;
- EdgeCount = (EdgeCount - 1) / 2;
- uint32_t BlockNo;
- if (!Buff.readInt(BlockNo))
- return false;
- if (BlockNo >= BlockCount) {
- errs() << "Unexpected block number: " << BlockNo << " (in " << Name
- << ").\n";
- return false;
- }
- for (uint32_t i = 0, e = EdgeCount; i != e; ++i) {
- uint32_t Dst;
- if (!Buff.readInt(Dst))
- return false;
- Edges.push_back(std::make_unique<GCOVEdge>(*Blocks[BlockNo], *Blocks[Dst]));
- GCOVEdge *Edge = Edges.back().get();
- Blocks[BlockNo]->addDstEdge(Edge);
- Blocks[Dst]->addSrcEdge(Edge);
- if (!Buff.readInt(Dummy))
- return false; // Edge flag
- }
- }
-
- // read line table.
- while (Buff.readLineTag()) {
- uint32_t LineTableLength;
- // Read the length of this line table.
- if (!Buff.readInt(LineTableLength))
- return false;
- uint32_t EndPos = Buff.getCursor() + LineTableLength * 4;
- uint32_t BlockNo;
- // Read the block number this table is associated with.
- if (!Buff.readInt(BlockNo))
- return false;
- if (BlockNo >= BlockCount) {
- errs() << "Unexpected block number: " << BlockNo << " (in " << Name
- << ").\n";
- return false;
- }
- GCOVBlock &Block = *Blocks[BlockNo];
- // Read the word that pads the beginning of the line table. This may be a
- // flag of some sort, but seems to always be zero.
- if (!Buff.readInt(Dummy))
- return false;
-
- // Line information starts here and continues up until the last word.
- if (Buff.getCursor() != (EndPos - sizeof(uint32_t))) {
- StringRef F;
- // Read the source file name.
- if (!Buff.readString(F))
- return false;
- if (Filename != F) {
- errs() << "Multiple sources for a single basic block: " << Filename
- << " != " << F << " (in " << Name << ").\n";
- return false;
- }
- // Read lines up to, but not including, the null terminator.
- while (Buff.getCursor() < (EndPos - 2 * sizeof(uint32_t))) {
- uint32_t Line;
- if (!Buff.readInt(Line))
- return false;
- // Line 0 means this instruction was injected by the compiler. Skip it.
- if (!Line)
- continue;
- Block.addLine(Line);
- }
- // Read the null terminator.
- if (!Buff.readInt(Dummy))
- return false;
- }
- // The last word is either a flag or padding, it isn't clear which. Skip
- // over it.
- if (!Buff.readInt(Dummy))
- return false;
- }
- return true;
-}
-
-/// readGCDA - Read a function from the GCDA buffer. Return false if an error
-/// occurs.
-bool GCOVFunction::readGCDA(GCOVBuffer &Buff, GCOV::GCOVVersion Version) {
- uint32_t HeaderLength;
- if (!Buff.readInt(HeaderLength))
- return false; // Function header length
-
- uint64_t EndPos = Buff.getCursor() + HeaderLength * sizeof(uint32_t);
-
- uint32_t GCDAIdent;
- if (!Buff.readInt(GCDAIdent))
- return false;
- if (Ident != GCDAIdent) {
- errs() << "Function identifiers do not match: " << Ident
- << " != " << GCDAIdent << " (in " << Name << ").\n";
- return false;
- }
-
- uint32_t GCDAChecksum;
- if (!Buff.readInt(GCDAChecksum))
- return false;
- if (Checksum != GCDAChecksum) {
- errs() << "Function checksums do not match: " << Checksum
- << " != " << GCDAChecksum << " (in " << Name << ").\n";
- return false;
- }
-
- uint32_t CfgChecksum;
- if (Version != GCOV::V402) {
- if (!Buff.readInt(CfgChecksum))
- return false;
- if (Parent.getChecksum() != CfgChecksum) {
- errs() << "File checksums do not match: " << Parent.getChecksum()
- << " != " << CfgChecksum << " (in " << Name << ").\n";
- return false;
- }
- }
-
- if (Buff.getCursor() < EndPos) {
- StringRef GCDAName;
- if (!Buff.readString(GCDAName))
- return false;
- if (Name != GCDAName) {
- errs() << "Function names do not match: " << Name << " != " << GCDAName
- << ".\n";
- return false;
- }
- }
-
- if (!Buff.readArcTag()) {
- errs() << "Arc tag not found (in " << Name << ").\n";
- return false;
- }
-
- uint32_t Count;
- if (!Buff.readInt(Count))
- return false;
- Count /= 2;
-
- // This for loop adds the counts for each block. A second nested loop is
- // required to combine the edge counts that are contained in the GCDA file.
- for (uint32_t BlockNo = 0; Count > 0; ++BlockNo) {
- // The last block is always reserved for exit block
- if (BlockNo >= Blocks.size()) {
- errs() << "Unexpected number of edges (in " << Name << ").\n";
- return false;
- }
- if (BlockNo == Blocks.size() - 1)
- errs() << "(" << Name << ") has arcs from exit block.\n";
- GCOVBlock &Block = *Blocks[BlockNo];
- for (size_t EdgeNo = 0, End = Block.getNumDstEdges(); EdgeNo < End;
- ++EdgeNo) {
- if (Count == 0) {
- errs() << "Unexpected number of edges (in " << Name << ").\n";
- return false;
- }
- uint64_t ArcCount;
- if (!Buff.readInt64(ArcCount))
- return false;
- Block.addCount(EdgeNo, ArcCount);
- --Count;
- }
- Block.sortDstEdges();
- }
- return true;
-}
+StringRef GCOVFunction::getFilename() const { return file.filenames[srcIdx]; }
/// getEntryCount - Get the number of times the function was called by
/// retrieving the entry block's count.
@@ -349,8 +278,8 @@ uint64_t GCOVFunction::getExitCount() const {
}
void GCOVFunction::print(raw_ostream &OS) const {
- OS << "===== " << Name << " (" << Ident << ") @ " << Filename << ":"
- << LineNumber << "\n";
+ OS << "===== " << Name << " (" << ident << ") @ " << getFilename() << ":"
+ << startLine << "\n";
for (const auto &Block : Blocks)
Block->print(OS);
}
@@ -365,43 +294,17 @@ LLVM_DUMP_METHOD void GCOVFunction::dump() const { print(dbgs()); }
void GCOVFunction::collectLineCounts(FileInfo &FI) {
// If the line number is zero, this is a function that doesn't actually appear
// in the source file, so there isn't anything we can do with it.
- if (LineNumber == 0)
+ if (startLine == 0)
return;
for (const auto &Block : Blocks)
Block->collectLineCounts(FI);
- FI.addFunctionLine(Filename, LineNumber, this);
+ FI.addFunctionLine(getFilename(), startLine, this);
}
//===----------------------------------------------------------------------===//
// GCOVBlock implementation.
-/// ~GCOVBlock - Delete GCOVBlock and its content.
-GCOVBlock::~GCOVBlock() {
- SrcEdges.clear();
- DstEdges.clear();
- Lines.clear();
-}
-
-/// addCount - Add to block counter while storing the edge count. If the
-/// destination has no outgoing edges, also update that block's count too.
-void GCOVBlock::addCount(size_t DstEdgeNo, uint64_t N) {
- assert(DstEdgeNo < DstEdges.size()); // up to caller to ensure EdgeNo is valid
- DstEdges[DstEdgeNo]->Count = N;
- Counter += N;
- if (!DstEdges[DstEdgeNo]->Dst.getNumDstEdges())
- DstEdges[DstEdgeNo]->Dst.Counter += N;
-}
-
-/// sortDstEdges - Sort destination edges by block number, nop if already
-/// sorted. This is required for printing branch info in the correct order.
-void GCOVBlock::sortDstEdges() {
- if (!DstEdgesAreSorted)
- llvm::stable_sort(DstEdges, [](const GCOVEdge *E1, const GCOVEdge *E2) {
- return E1->Dst.Number < E2->Dst.Number;
- });
-}
-
/// collectLineCounts - Collect line counts. This must be used after
/// reading .gcno and .gcda files.
void GCOVBlock::collectLineCounts(FileInfo &FI) {
@@ -411,16 +314,16 @@ void GCOVBlock::collectLineCounts(FileInfo &FI) {
void GCOVBlock::print(raw_ostream &OS) const {
OS << "Block : " << Number << " Counter : " << Counter << "\n";
- if (!SrcEdges.empty()) {
+ if (!pred.empty()) {
OS << "\tSource Edges : ";
- for (const GCOVEdge *Edge : SrcEdges)
- OS << Edge->Src.Number << " (" << Edge->Count << "), ";
+ for (const GCOVArc *Edge : pred)
+ OS << Edge->src.Number << " (" << Edge->Count << "), ";
OS << "\n";
}
- if (!DstEdges.empty()) {
+ if (!succ.empty()) {
OS << "\tDestination Edges : ";
- for (const GCOVEdge *Edge : DstEdges)
- OS << Edge->Dst.Number << " (" << Edge->Count << "), ";
+ for (const GCOVArc *Edge : succ)
+ OS << Edge->dst.Number << " (" << Edge->Count << "), ";
OS << "\n";
}
if (!Lines.empty()) {
@@ -482,7 +385,7 @@ bool GCOVBlock::lookForCircuit(const GCOVBlock *V, const GCOVBlock *Start,
bool FoundCircuit = false;
for (auto E : V->dsts()) {
- const GCOVBlock *W = &E->Dst;
+ const GCOVBlock *W = &E->dst;
if (W < Start || find(Blocks, W) == Blocks.end()) {
continue;
}
@@ -506,7 +409,7 @@ bool GCOVBlock::lookForCircuit(const GCOVBlock *V, const GCOVBlock *Start,
GCOVBlock::unblock(V, Blocked, BlockLists);
} else {
for (auto E : V->dsts()) {
- const GCOVBlock *W = &E->Dst;
+ const GCOVBlock *W = &E->dst;
if (W < Start || find(Blocks, W) == Blocks.end()) {
continue;
}
@@ -545,7 +448,7 @@ uint64_t GCOVBlock::getLineCount(const BlockVector &Blocks) {
} else {
// Add counts from predecessors that are not on the same line.
for (auto E : Block->srcs()) {
- const GCOVBlock *W = &E->Src;
+ const GCOVBlock *W = &E->src;
if (find(Blocks, W) == Blocks.end()) {
Count += E->Count;
}
@@ -617,6 +520,7 @@ class LineConsumer {
StringRef Remaining;
public:
+ LineConsumer() = default;
LineConsumer(StringRef Filename) {
ErrorOr<std::unique_ptr<MemoryBuffer>> BufferOrErr =
MemoryBuffer::getFileOrSTDIN(Filename);
@@ -672,7 +576,7 @@ static std::string mangleCoveragePath(StringRef Filename, bool PreservePaths) {
if (S < I)
Result.append(S, I);
- return Result.str();
+ return std::string(Result.str());
}
std::string FileInfo::getCoveragePath(StringRef Filename,
@@ -681,7 +585,7 @@ std::string FileInfo::getCoveragePath(StringRef Filename,
// This is probably a bug in gcov, but when -n is specified, paths aren't
// mangled at all, and the -l and -p options are ignored. Here, we do the
// same.
- return Filename;
+ return std::string(Filename);
std::string CoveragePath;
if (Options.LongFileNames && !Filename.equals(MainFilename))
@@ -693,7 +597,7 @@ std::string FileInfo::getCoveragePath(StringRef Filename,
MD5::MD5Result Result;
Hasher.update(Filename.str());
Hasher.final(Result);
- CoveragePath += "##" + Result.digest().str().str();
+ CoveragePath += "##" + std::string(Result.digest());
}
CoveragePath += ".gcov";
return CoveragePath;
@@ -701,9 +605,6 @@ std::string FileInfo::getCoveragePath(StringRef Filename,
std::unique_ptr<raw_ostream>
FileInfo::openCoveragePath(StringRef CoveragePath) {
- if (Options.NoOutput)
- return std::make_unique<raw_null_ostream>();
-
std::error_code EC;
auto OS =
std::make_unique<raw_fd_ostream>(CoveragePath, EC, sys::fs::OF_Text);
@@ -716,24 +617,30 @@ FileInfo::openCoveragePath(StringRef CoveragePath) {
/// print - Print source files with collected line count information.
void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
- StringRef GCNOFile, StringRef GCDAFile) {
+ StringRef GCNOFile, StringRef GCDAFile, GCOVFile &file) {
SmallVector<StringRef, 4> Filenames;
for (const auto &LI : LineInfo)
Filenames.push_back(LI.first());
llvm::sort(Filenames);
for (StringRef Filename : Filenames) {
- auto AllLines = LineConsumer(Filename);
-
+ auto AllLines =
+ Options.Intermediate ? LineConsumer() : LineConsumer(Filename);
std::string CoveragePath = getCoveragePath(Filename, MainFilename);
- std::unique_ptr<raw_ostream> CovStream = openCoveragePath(CoveragePath);
- raw_ostream &CovOS = *CovStream;
+ std::unique_ptr<raw_ostream> CovStream;
+ if (Options.NoOutput || Options.Intermediate)
+ CovStream = std::make_unique<raw_null_ostream>();
+ else if (!Options.UseStdout)
+ CovStream = openCoveragePath(CoveragePath);
+ raw_ostream &CovOS =
+ !Options.NoOutput && Options.UseStdout ? llvm::outs() : *CovStream;
CovOS << " -: 0:Source:" << Filename << "\n";
CovOS << " -: 0:Graph:" << GCNOFile << "\n";
CovOS << " -: 0:Data:" << GCDAFile << "\n";
CovOS << " -: 0:Runs:" << RunCount << "\n";
- CovOS << " -: 0:Programs:" << ProgramCount << "\n";
+ if (file.getVersion() < GCOV::V900)
+ CovOS << " -: 0:Programs:" << ProgramCount << "\n";
const LineData &Line = LineInfo[Filename];
GCOVCoverage FileCoverage(Filename);
@@ -815,19 +722,67 @@ void FileInfo::print(raw_ostream &InfoOS, StringRef MainFilename,
if (NumEdges > 1)
printBranchInfo(CovOS, *Block, FileCoverage, EdgeNo);
else if (Options.UncondBranch && NumEdges == 1)
- printUncondBranchInfo(CovOS, EdgeNo,
- (*Block->dst_begin())->Count);
+ printUncondBranchInfo(CovOS, EdgeNo, Block->succ[0]->Count);
+ }
+ }
+ }
+ }
+ SourceInfo &source = sources[file.filenameToIdx.find(Filename)->second];
+ source.name = CoveragePath;
+ source.coverage = FileCoverage;
+ }
+
+ if (Options.Intermediate && !Options.NoOutput) {
+    // gcov 7.* unexpectedly creates multiple .gcov files, which was fixed in 8.0
+ // (PR GCC/82702). We create just one file.
+ std::string outputPath(sys::path::filename(MainFilename));
+ std::error_code ec;
+ raw_fd_ostream os(outputPath + ".gcov", ec, sys::fs::OF_Text);
+ if (ec) {
+ errs() << ec.message() << "\n";
+ return;
+ }
+
+ for (const SourceInfo &source : sources) {
+ os << "file:" << source.filename << '\n';
+ for (const GCOVFunction *f : source.functions)
+ os << "function:" << f->startLine << ',' << f->getEntryCount() << ','
+ << f->Name << '\n';
+ const LineData &line = LineInfo[source.filename];
+ for (uint32_t lineNum = 0; lineNum != line.LastLine; ++lineNum) {
+ BlockLines::const_iterator BlocksIt = line.Blocks.find(lineNum);
+ if (BlocksIt == line.Blocks.end())
+ continue;
+ const BlockVector &blocks = BlocksIt->second;
+        // GCC 8 (r254259) added a third field for Ada:
+ // lcount:<line>,<count>,<has_unexecuted_blocks>
+ // We don't need the third field.
+ os << "lcount:" << (lineNum + 1) << ','
+ << GCOVBlock::getLineCount(blocks) << '\n';
+
+ if (!Options.BranchInfo)
+ continue;
+ for (const GCOVBlock *block : blocks) {
+ if (block->getLastLine() != lineNum + 1 ||
+ block->getNumDstEdges() < 2)
+ continue;
+ for (const GCOVArc *arc : block->dsts()) {
+ const char *type = block->getCount()
+ ? arc->Count ? "taken" : "nottaken"
+ : "notexec";
+ os << "branch:" << (lineNum + 1) << ',' << type << '\n';
}
}
}
}
- FileCoverages.push_back(std::make_pair(CoveragePath, FileCoverage));
}
- // FIXME: There is no way to detect calls given current instrumentation.
- if (Options.FuncCoverage)
- printFuncCoverage(InfoOS);
- printFileCoverage(InfoOS);
+ if (!Options.UseStdout) {
+ // FIXME: There is no way to detect calls given current instrumentation.
+ if (Options.FuncCoverage)
+ printFuncCoverage(InfoOS);
+ printFileCoverage(InfoOS);
+ }
}
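
For reference, the single intermediate file written above is line oriented: a file: record, followed by function:, lcount: and (when branch info is requested) branch: records, exactly as emitted by the loop over sources. A purely illustrative excerpt, with an invented file name, line numbers and counts, might look like:

  file:a.c
  function:3,1,main
  lcount:3,1
  lcount:5,2
  branch:5,taken
  branch:5,nottaken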
/// printFunctionSummary - Print function and block summary.
@@ -862,7 +817,7 @@ void FileInfo::printBranchInfo(raw_ostream &OS, const GCOVBlock &Block,
GCOVCoverage &Coverage, uint32_t &EdgeNo) {
SmallVector<uint64_t, 16> BranchCounts;
uint64_t TotalCounts = 0;
- for (const GCOVEdge *Edge : Block.dsts()) {
+ for (const GCOVArc *Edge : Block.dsts()) {
BranchCounts.push_back(Edge->Count);
TotalCounts += Edge->Count;
if (Block.getCount())
@@ -928,13 +883,12 @@ void FileInfo::printFuncCoverage(raw_ostream &OS) const {
// printFileCoverage - Print per-file coverage info.
void FileInfo::printFileCoverage(raw_ostream &OS) const {
- for (const auto &FC : FileCoverages) {
- const std::string &Filename = FC.first;
- const GCOVCoverage &Coverage = FC.second;
+ for (const SourceInfo &source : sources) {
+ const GCOVCoverage &Coverage = source.coverage;
OS << "File '" << Coverage.Name << "'\n";
printCoverage(OS, Coverage);
- if (!Options.NoOutput)
- OS << Coverage.Name << ":creating '" << Filename << "'\n";
+ if (!Options.NoOutput && !Options.Intermediate)
+ OS << "Creating '" << source.name << "'\n";
OS << "\n";
}
}
diff --git a/llvm/lib/ProfileData/InstrProf.cpp b/llvm/lib/ProfileData/InstrProf.cpp
index 57d4fbc59f83..b9d8ae9ba60d 100644
--- a/llvm/lib/ProfileData/InstrProf.cpp
+++ b/llvm/lib/ProfileData/InstrProf.cpp
@@ -162,6 +162,10 @@ const char *InstrProfSectNamePrefix[] = {
namespace llvm {
+cl::opt<bool> DoInstrProfNameCompression(
+ "enable-name-compression",
+ cl::desc("Enable name/filename string compression"), cl::init(true));
+
std::string getInstrProfSectionName(InstrProfSectKind IPSK,
Triple::ObjectFormatType OF,
bool AddSegmentInfo) {
@@ -286,7 +290,7 @@ StringRef getFuncNameWithoutPrefix(StringRef PGOFuncName, StringRef FileName) {
// symbol is created to hold the name. Return the legalized symbol name.
std::string getPGOFuncNameVarName(StringRef FuncName,
GlobalValue::LinkageTypes Linkage) {
- std::string VarName = getInstrProfNameVarPrefix();
+ std::string VarName = std::string(getInstrProfNameVarPrefix());
VarName += FuncName;
if (!GlobalValue::isLocalLinkage(Linkage))
@@ -427,7 +431,7 @@ Error collectPGOFuncNameStrings(ArrayRef<GlobalVariable *> NameVars,
std::string &Result, bool doCompression) {
std::vector<std::string> NameStrs;
for (auto *NameVar : NameVars) {
- NameStrs.push_back(getPGOFuncNameVarInitializer(NameVar));
+ NameStrs.push_back(std::string(getPGOFuncNameVarInitializer(NameVar)));
}
return collectPGOFuncNameStrings(
NameStrs, zlib::isAvailable() && doCompression, Result);
diff --git a/llvm/lib/ProfileData/InstrProfReader.cpp b/llvm/lib/ProfileData/InstrProfReader.cpp
index b904f983dceb..16a69cb5457b 100644
--- a/llvm/lib/ProfileData/InstrProfReader.cpp
+++ b/llvm/lib/ProfileData/InstrProfReader.cpp
@@ -15,6 +15,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/ProfileSummary.h"
#include "llvm/ProfileData/InstrProf.h"
@@ -144,7 +145,7 @@ bool TextInstrProfReader::hasFormat(const MemoryBuffer &Buffer) {
StringRef buffer = Buffer.getBufferStart();
return count == 0 ||
std::all_of(buffer.begin(), buffer.begin() + count,
- [](char c) { return isPrint(c) || ::isspace(c); });
+ [](char c) { return isPrint(c) || isSpace(c); });
}
// Read the profile variant flag from the header: ":FE" means this is a FE
@@ -422,11 +423,11 @@ Error RawInstrProfReader<IntPtrT>::readRawCounts(
// Check bounds. Note that the counter pointer embedded in the data record
// may itself be corrupt.
- if (NumCounters > MaxNumCounters)
+ if (MaxNumCounters < 0 || NumCounters > (uint32_t)MaxNumCounters)
return error(instrprof_error::malformed);
ptrdiff_t CounterOffset = getCounterOffset(CounterPtr);
if (CounterOffset < 0 || CounterOffset > MaxNumCounters ||
- (CounterOffset + NumCounters) > MaxNumCounters)
+ ((uint32_t)CounterOffset + NumCounters) > (uint32_t)MaxNumCounters)
return error(instrprof_error::malformed);
auto RawCounts = makeArrayRef(getCounter(CounterOffset), NumCounters);
diff --git a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
index 3299b5f92069..5d3a07640942 100644
--- a/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
+++ b/llvm/lib/ProfileData/ProfileSummaryBuilder.cpp
@@ -31,6 +31,19 @@ static const uint32_t DefaultCutoffsData[] = {
const ArrayRef<uint32_t> ProfileSummaryBuilder::DefaultCutoffs =
DefaultCutoffsData;
+const ProfileSummaryEntry &
+ProfileSummaryBuilder::getEntryForPercentile(SummaryEntryVector &DS,
+ uint64_t Percentile) {
+ auto It = partition_point(DS, [=](const ProfileSummaryEntry &Entry) {
+ return Entry.Cutoff < Percentile;
+ });
+ // The required percentile has to be <= one of the percentiles in the
+ // detailed summary.
+ if (It == DS.end())
+ report_fatal_error("Desired percentile exceeds the maximum cutoff");
+ return *It;
+}
+
void InstrProfSummaryBuilder::addRecord(const InstrProfRecord &R) {
// The first counter is not necessarily an entry count for IR
// instrumentation profiles.
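
getEntryForPercentile above assumes the detailed summary is sorted by increasing Cutoff and uses partition_point to find the first entry at or above the requested percentile. A minimal standalone sketch of the same lookup idea, using only the standard library, a simplified Entry type instead of the real ProfileSummaryEntry, and a made-up entryForPercentile name:

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <vector>

struct Entry {
  uint64_t Cutoff;   // scaled percentile, e.g. 990000 == 99%
  uint64_t MinCount; // minimum count at this cutoff
};

// Return the first entry whose Cutoff is >= the requested percentile.
const Entry &entryForPercentile(const std::vector<Entry> &DS,
                                uint64_t Percentile) {
  auto It = std::partition_point(
      DS.begin(), DS.end(),
      [=](const Entry &E) { return E.Cutoff < Percentile; });
  assert(It != DS.end() && "percentile exceeds the maximum cutoff");
  return *It;
}

int main() {
  std::vector<Entry> DS = {{100000, 9000}, {500000, 120}, {990000, 5}};
  assert(entryForPercentile(DS, 500000).MinCount == 120);
  assert(entryForPercentile(DS, 600000).MinCount == 5);
}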
diff --git a/llvm/lib/ProfileData/SampleProf.cpp b/llvm/lib/ProfileData/SampleProf.cpp
index 003e8d4d4296..e5d0fdba5fc4 100644
--- a/llvm/lib/ProfileData/SampleProf.cpp
+++ b/llvm/lib/ProfileData/SampleProf.cpp
@@ -30,6 +30,7 @@ using namespace sampleprof;
namespace llvm {
namespace sampleprof {
SampleProfileFormat FunctionSamples::Format;
+bool FunctionSamples::UseMD5;
} // namespace sampleprof
} // namespace llvm
diff --git a/llvm/lib/ProfileData/SampleProfReader.cpp b/llvm/lib/ProfileData/SampleProfReader.cpp
index 001aafce7bfd..03f1ac190b91 100644
--- a/llvm/lib/ProfileData/SampleProfReader.cpp
+++ b/llvm/lib/ProfileData/SampleProfReader.cpp
@@ -245,7 +245,7 @@ std::error_code SampleProfileReaderText::readImpl() {
InlineStack.pop_back();
}
FunctionSamples &FSamples = InlineStack.back()->functionSamplesAt(
- LineLocation(LineOffset, Discriminator))[FName];
+ LineLocation(LineOffset, Discriminator))[std::string(FName)];
FSamples.setName(FName);
MergeResult(Result, FSamples.addTotalSamples(NumSamples));
InlineStack.push_back(&FSamples);
@@ -430,7 +430,7 @@ SampleProfileReaderBinary::readProfile(FunctionSamples &FProfile) {
return EC;
FunctionSamples &CalleeProfile = FProfile.functionSamplesAt(
- LineLocation(*LineOffset, *Discriminator))[*FName];
+ LineLocation(*LineOffset, *Discriminator))[std::string(*FName)];
CalleeProfile.setName(*FName);
if (std::error_code EC = readProfile(CalleeProfile))
return EC;
@@ -470,18 +470,20 @@ std::error_code SampleProfileReaderBinary::readImpl() {
return sampleprof_error::success;
}
-std::error_code
-SampleProfileReaderExtBinary::readOneSection(const uint8_t *Start,
- uint64_t Size, SecType Type) {
+std::error_code SampleProfileReaderExtBinary::readOneSection(
+ const uint8_t *Start, uint64_t Size, const SecHdrTableEntry &Entry) {
Data = Start;
End = Start + Size;
- switch (Type) {
+ switch (Entry.Type) {
case SecProfSummary:
if (std::error_code EC = readSummary())
return EC;
+ if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial))
+ Summary->setPartialProfile(true);
break;
case SecNameTable:
- if (std::error_code EC = readNameTable())
+ if (std::error_code EC = readNameTableSec(
+ hasSecFlag(Entry, SecNameTableFlags::SecFlagMD5Name)))
return EC;
break;
case SecLBRProfile:
@@ -546,15 +548,28 @@ std::error_code SampleProfileReaderExtBinary::readFuncProfiles() {
}
}
- for (auto NameOffset : FuncOffsetTable) {
- auto FuncName = NameOffset.first;
- if (!FuncsToUse.count(FuncName) &&
- (!Remapper || !Remapper->exist(FuncName)))
- continue;
- const uint8_t *FuncProfileAddr = Start + NameOffset.second;
- assert(FuncProfileAddr < End && "out of LBRProfile section");
- if (std::error_code EC = readFuncProfile(FuncProfileAddr))
- return EC;
+ if (useMD5()) {
+ for (auto Name : FuncsToUse) {
+ auto GUID = std::to_string(MD5Hash(Name));
+ auto iter = FuncOffsetTable.find(StringRef(GUID));
+ if (iter == FuncOffsetTable.end())
+ continue;
+ const uint8_t *FuncProfileAddr = Start + iter->second;
+ assert(FuncProfileAddr < End && "out of LBRProfile section");
+ if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+ return EC;
+ }
+ } else {
+ for (auto NameOffset : FuncOffsetTable) {
+ auto FuncName = NameOffset.first;
+ if (!FuncsToUse.count(FuncName) &&
+ (!Remapper || !Remapper->exist(FuncName)))
+ continue;
+ const uint8_t *FuncProfileAddr = Start + NameOffset.second;
+ assert(FuncProfileAddr < End && "out of LBRProfile section");
+ if (std::error_code EC = readFuncProfile(FuncProfileAddr))
+ return EC;
+ }
}
Data = End;
@@ -617,7 +632,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readImpl() {
// DecompressBuf before reading the actual data. The pointee of
// 'Data' will be changed to buffer hold by DecompressBuf
// temporarily when reading the actual data.
- bool isCompressed = hasSecFlag(Entry, SecFlagCompress);
+ bool isCompressed = hasSecFlag(Entry, SecCommonFlags::SecFlagCompress);
if (isCompressed) {
const uint8_t *DecompressBuf;
uint64_t DecompressBufSize;
@@ -628,7 +643,7 @@ std::error_code SampleProfileReaderExtBinaryBase::readImpl() {
SecSize = DecompressBufSize;
}
- if (std::error_code EC = readOneSection(SecStart, SecSize, Entry.Type))
+ if (std::error_code EC = readOneSection(SecStart, SecSize, Entry))
return EC;
if (Data != SecStart + SecSize)
return sampleprof_error::malformed;
@@ -705,6 +720,31 @@ std::error_code SampleProfileReaderBinary::readNameTable() {
return sampleprof_error::success;
}
+std::error_code SampleProfileReaderExtBinary::readMD5NameTable() {
+ auto Size = readNumber<uint64_t>();
+ if (std::error_code EC = Size.getError())
+ return EC;
+ NameTable.reserve(*Size);
+ MD5StringBuf = std::make_unique<std::vector<std::string>>();
+ MD5StringBuf->reserve(*Size);
+ for (uint32_t I = 0; I < *Size; ++I) {
+ auto FID = readNumber<uint64_t>();
+ if (std::error_code EC = FID.getError())
+ return EC;
+ MD5StringBuf->push_back(std::to_string(*FID));
+    // NameTable is a vector of StringRef. Here we push back a StringRef
+    // initialized with the last string in MD5StringBuf.
+ NameTable.push_back(MD5StringBuf->back());
+ }
+ return sampleprof_error::success;
+}
+
+std::error_code SampleProfileReaderExtBinary::readNameTableSec(bool IsMD5) {
+ if (IsMD5)
+ return readMD5NameTable();
+ return SampleProfileReaderBinary::readNameTable();
+}
+
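
When SecFlagMD5Name is set, the name table holds only 64-bit MD5 hashes and the reader represents each name as the decimal string of its hash, which is why readFuncProfiles above hashes the requested names the same way before consulting FuncOffsetTable. A minimal sketch of that naming convention; md5NameKey is a hypothetical helper, not part of the reader:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MD5.h"
#include <cstdio>
#include <string>

// Map a real function name to the key used with an MD5 name table.
static std::string md5NameKey(llvm::StringRef FuncName) {
  // Same convention as readFuncProfiles: decimal string of the 64-bit hash.
  return std::to_string(llvm::MD5Hash(FuncName));
}

int main() {
  std::printf("%s\n", md5NameKey("main").c_str());
}

This is also the reason remapping is rejected for MD5-based readers further down: the original names cannot be recovered from the hashes.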
std::error_code SampleProfileReaderCompactBinary::readNameTable() {
auto Size = readNumber<uint64_t>();
if (std::error_code EC = Size.getError())
@@ -793,11 +833,40 @@ uint64_t SampleProfileReaderExtBinaryBase::getFileSize() {
return FileSize;
}
+static std::string getSecFlagsStr(const SecHdrTableEntry &Entry) {
+ std::string Flags;
+ if (hasSecFlag(Entry, SecCommonFlags::SecFlagCompress))
+ Flags.append("{compressed,");
+ else
+ Flags.append("{");
+
+ switch (Entry.Type) {
+ case SecNameTable:
+ if (hasSecFlag(Entry, SecNameTableFlags::SecFlagMD5Name))
+ Flags.append("md5,");
+ break;
+ case SecProfSummary:
+ if (hasSecFlag(Entry, SecProfSummaryFlags::SecFlagPartial))
+ Flags.append("partial,");
+ break;
+ default:
+ break;
+ }
+ char &last = Flags.back();
+ if (last == ',')
+ last = '}';
+ else
+ Flags.append("}");
+ return Flags;
+}
+
bool SampleProfileReaderExtBinaryBase::dumpSectionInfo(raw_ostream &OS) {
uint64_t TotalSecsSize = 0;
for (auto &Entry : SecHdrTable) {
OS << getSecName(Entry.Type) << " - Offset: " << Entry.Offset
- << ", Size: " << Entry.Size << "\n";
+ << ", Size: " << Entry.Size << ", Flags: " << getSecFlagsStr(Entry)
+ << "\n";
TotalSecsSize += getSectionSize(Entry.Type);
}
uint64_t HeaderSize = SecHdrTable.front().Offset;
@@ -1007,7 +1076,7 @@ std::error_code SampleProfileReaderGCC::readHeader() {
if (!GcovBuffer.readGCOVVersion(version))
return sampleprof_error::unrecognized_format;
- if (version != GCOV::V704)
+ if (version != GCOV::V407)
return sampleprof_error::unsupported_version;
// Skip the empty integer.
@@ -1043,7 +1112,7 @@ std::error_code SampleProfileReaderGCC::readNameTable() {
StringRef Str;
if (!GcovBuffer.readString(Str))
return sampleprof_error::truncated;
- Names.push_back(Str);
+ Names.push_back(std::string(Str));
}
return sampleprof_error::success;
@@ -1107,7 +1176,7 @@ std::error_code SampleProfileReaderGCC::readOneFunctionProfile(
uint32_t LineOffset = Offset >> 16;
uint32_t Discriminator = Offset & 0xffff;
FProfile = &CallerProfile->functionSamplesAt(
- LineLocation(LineOffset, Discriminator))[Name];
+ LineLocation(LineOffset, Discriminator))[std::string(Name)];
}
FProfile->setName(Name);
@@ -1210,9 +1279,9 @@ bool SampleProfileReaderGCC::hasFormat(const MemoryBuffer &Buffer) {
}
void SampleProfileReaderItaniumRemapper::applyRemapping(LLVMContext &Ctx) {
- // If the reader is in compact format, we can't remap it because
+ // If the reader uses MD5 to represent string, we can't remap it because
// we don't know what the original function names were.
- if (Reader.getFormat() == SPF_Compact_Binary) {
+ if (Reader.useMD5()) {
Ctx.diagnose(DiagnosticInfoSampleProfile(
Reader.getBuffer()->getBufferIdentifier(),
"Profile data remapping cannot be applied to profile data "
diff --git a/llvm/lib/ProfileData/SampleProfWriter.cpp b/llvm/lib/ProfileData/SampleProfWriter.cpp
index 8d09af31f94b..48d3faa6cd2f 100644
--- a/llvm/lib/ProfileData/SampleProfWriter.cpp
+++ b/llvm/lib/ProfileData/SampleProfWriter.cpp
@@ -87,7 +87,7 @@ uint64_t SampleProfileWriterExtBinaryBase::markSectionStart(SecType Type) {
uint64_t SectionStart = OutputStream->tell();
auto &Entry = getEntryInLayout(Type);
  // Use LocalBuf as a temporary output for writing data.
- if (hasSecFlag(Entry, SecFlagCompress))
+ if (hasSecFlag(Entry, SecCommonFlags::SecFlagCompress))
LocalBufStream.swap(OutputStream);
return SectionStart;
}
@@ -117,7 +117,7 @@ std::error_code
SampleProfileWriterExtBinaryBase::addNewSection(SecType Type,
uint64_t SectionStart) {
auto Entry = getEntryInLayout(Type);
- if (hasSecFlag(Entry, SecFlagCompress)) {
+ if (hasSecFlag(Entry, SecCommonFlags::SecFlagCompress)) {
LocalBufStream.swap(OutputStream);
if (std::error_code EC = compressAndOutput())
return EC;
@@ -166,6 +166,22 @@ std::error_code SampleProfileWriterExtBinary::writeFuncOffsetTable() {
return sampleprof_error::success;
}
+std::error_code SampleProfileWriterExtBinary::writeNameTable() {
+ if (!UseMD5)
+ return SampleProfileWriterBinary::writeNameTable();
+
+ auto &OS = *OutputStream;
+ std::set<StringRef> V;
+ stablizeNameTable(V);
+
+ // Write out the name table.
+ encodeULEB128(NameTable.size(), OS);
+ for (auto N : V) {
+ encodeULEB128(MD5Hash(N), OS);
+ }
+ return sampleprof_error::success;
+}
+
std::error_code SampleProfileWriterExtBinary::writeSections(
const StringMap<FunctionSamples> &ProfileMap) {
uint64_t SectionStart = markSectionStart(SecProfSummary);
@@ -390,19 +406,11 @@ std::error_code SampleProfileWriterBinary::writeHeader(
void SampleProfileWriterExtBinaryBase::setToCompressAllSections() {
for (auto &Entry : SectionHdrLayout)
- addSecFlags(Entry, SecFlagCompress);
+ addSecFlag(Entry, SecCommonFlags::SecFlagCompress);
}
void SampleProfileWriterExtBinaryBase::setToCompressSection(SecType Type) {
- addSectionFlags(Type, SecFlagCompress);
-}
-
-void SampleProfileWriterExtBinaryBase::addSectionFlags(SecType Type,
- SecFlags Flags) {
- for (auto &Entry : SectionHdrLayout) {
- if (Entry.Type == Type)
- addSecFlags(Entry, Flags);
- }
+ addSectionFlag(Type, SecCommonFlags::SecFlagCompress);
}
void SampleProfileWriterExtBinaryBase::allocSecHdrTable() {
diff --git a/llvm/lib/Remarks/BitstreamRemarkParser.cpp b/llvm/lib/Remarks/BitstreamRemarkParser.cpp
index 4c4508879114..25fbea7d31c2 100644
--- a/llvm/lib/Remarks/BitstreamRemarkParser.cpp
+++ b/llvm/lib/Remarks/BitstreamRemarkParser.cpp
@@ -323,7 +323,7 @@ remarks::createBitstreamParserFromMeta(
: std::make_unique<BitstreamRemarkParser>(Buf);
if (ExternalFilePrependPath)
- Parser->ExternalFilePrependPath = *ExternalFilePrependPath;
+ Parser->ExternalFilePrependPath = std::string(*ExternalFilePrependPath);
return std::move(Parser);
}
diff --git a/llvm/lib/Remarks/Remark.cpp b/llvm/lib/Remarks/Remark.cpp
index 401ac514b011..057d1a378599 100644
--- a/llvm/lib/Remarks/Remark.cpp
+++ b/llvm/lib/Remarks/Remark.cpp
@@ -11,8 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Remarks/Remark.h"
-#include "llvm-c/Remarks.h"
-#include "llvm/Support/CBindingWrapping.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
diff --git a/llvm/lib/Remarks/RemarkLinker.cpp b/llvm/lib/Remarks/RemarkLinker.cpp
index 617ce770af66..dd1bba3d1762 100644
--- a/llvm/lib/Remarks/RemarkLinker.cpp
+++ b/llvm/lib/Remarks/RemarkLinker.cpp
@@ -57,7 +57,7 @@ Remark &RemarkLinker::keep(std::unique_ptr<Remark> Remark) {
}
void RemarkLinker::setExternalFilePrependPath(StringRef PrependPathIn) {
- PrependPath = PrependPathIn;
+ PrependPath = std::string(PrependPathIn);
}
// Discard remarks with no source location.
diff --git a/llvm/lib/Remarks/RemarkStreamer.cpp b/llvm/lib/Remarks/RemarkStreamer.cpp
new file mode 100644
index 000000000000..2f00b8e73670
--- /dev/null
+++ b/llvm/lib/Remarks/RemarkStreamer.cpp
@@ -0,0 +1,72 @@
+//===- llvm/Remarks/RemarkStreamer.cpp - Remark Streamer -*- C++ --------*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the main remark streamer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Remarks/RemarkStreamer.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+using namespace llvm::remarks;
+
+static cl::opt<cl::boolOrDefault> EnableRemarksSection(
+ "remarks-section",
+ cl::desc(
+ "Emit a section containing remark diagnostics metadata. By default, "
+ "this is enabled for the following formats: yaml-strtab, bitstream."),
+ cl::init(cl::BOU_UNSET), cl::Hidden);
+
+RemarkStreamer::RemarkStreamer(
+ std::unique_ptr<remarks::RemarkSerializer> RemarkSerializer,
+ Optional<StringRef> FilenameIn)
+ : PassFilter(), RemarkSerializer(std::move(RemarkSerializer)),
+ Filename(FilenameIn ? Optional<std::string>(FilenameIn->str()) : None) {}
+
+Error RemarkStreamer::setFilter(StringRef Filter) {
+ Regex R = Regex(Filter);
+ std::string RegexError;
+ if (!R.isValid(RegexError))
+ return createStringError(std::make_error_code(std::errc::invalid_argument),
+ RegexError.data());
+ PassFilter = std::move(R);
+ return Error::success();
+}
+
+bool RemarkStreamer::matchesFilter(StringRef Str) {
+ if (PassFilter)
+ return PassFilter->match(Str);
+ // No filter means all strings pass.
+ return true;
+}
+
+bool RemarkStreamer::needsSection() const {
+ if (EnableRemarksSection == cl::BOU_TRUE)
+ return true;
+
+ if (EnableRemarksSection == cl::BOU_FALSE)
+ return false;
+
+ assert(EnableRemarksSection == cl::BOU_UNSET);
+
+ // We only need a section if we're in separate mode.
+ if (RemarkSerializer->Mode != remarks::SerializerMode::Separate)
+ return false;
+
+ // Only some formats need a section:
+ // * bitstream
+ // * yaml-strtab
+ switch (RemarkSerializer->SerializerFormat) {
+ case remarks::Format::YAMLStrTab:
+ case remarks::Format::Bitstream:
+ return true;
+ default:
+ return false;
+ }
+}
diff --git a/llvm/lib/Remarks/RemarkStringTable.cpp b/llvm/lib/Remarks/RemarkStringTable.cpp
index 51156465be51..5f462f01bb9a 100644
--- a/llvm/lib/Remarks/RemarkStringTable.cpp
+++ b/llvm/lib/Remarks/RemarkStringTable.cpp
@@ -11,10 +11,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Remarks/RemarkStringTable.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Remarks/Remark.h"
#include "llvm/Remarks/RemarkParser.h"
-#include "llvm/Support/EndianStream.h"
-#include "llvm/Support/Error.h"
+#include "llvm/Support/raw_ostream.h"
#include <vector>
using namespace llvm;
diff --git a/llvm/lib/Remarks/YAMLRemarkParser.cpp b/llvm/lib/Remarks/YAMLRemarkParser.cpp
index dd834d85676e..3d9996c931ae 100644
--- a/llvm/lib/Remarks/YAMLRemarkParser.cpp
+++ b/llvm/lib/Remarks/YAMLRemarkParser.cpp
@@ -13,7 +13,6 @@
#include "YAMLRemarkParser.h"
#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Remarks/RemarkParser.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Path.h"
diff --git a/llvm/lib/Remarks/YAMLRemarkParser.h b/llvm/lib/Remarks/YAMLRemarkParser.h
index 03707433bc03..df3b908f4779 100644
--- a/llvm/lib/Remarks/YAMLRemarkParser.h
+++ b/llvm/lib/Remarks/YAMLRemarkParser.h
@@ -35,7 +35,7 @@ public:
YAMLParseError(StringRef Message, SourceMgr &SM, yaml::Stream &Stream,
yaml::Node &Node);
- YAMLParseError(StringRef Message) : Message(Message) {}
+ YAMLParseError(StringRef Message) : Message(std::string(Message)) {}
void log(raw_ostream &OS) const override { OS << Message; }
std::error_code convertToErrorCode() const override {
diff --git a/llvm/lib/Support/AArch64TargetParser.cpp b/llvm/lib/Support/AArch64TargetParser.cpp
index b5cd4af0eb3d..a6de44605675 100644
--- a/llvm/lib/Support/AArch64TargetParser.cpp
+++ b/llvm/lib/Support/AArch64TargetParser.cpp
@@ -12,8 +12,8 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/AArch64TargetParser.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
#include <cctype>
using namespace llvm;
@@ -116,6 +116,8 @@ bool AArch64::getArchFeatures(AArch64::ArchKind AK,
Features.push_back("+v8.4a");
if (AK == ArchKind::ARMV8_5A)
Features.push_back("+v8.5a");
+ if (AK == AArch64::ArchKind::ARMV8_6A)
+ Features.push_back("+v8.6a");
return AK != ArchKind::INVALID;
}
@@ -191,7 +193,7 @@ AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
return ArchKind::INVALID;
StringRef Syn = ARM::getArchSynonym(Arch);
- for (const auto A : AArch64ARCHNames) {
+ for (const auto &A : AArch64ARCHNames) {
if (A.getName().endswith(Syn))
return A.ID;
}
@@ -199,7 +201,7 @@ AArch64::ArchKind AArch64::parseArch(StringRef Arch) {
}
AArch64::ArchExtKind AArch64::parseArchExt(StringRef ArchExt) {
- for (const auto A : AArch64ARCHExtNames) {
+ for (const auto &A : AArch64ARCHExtNames) {
if (ArchExt == A.getName())
return static_cast<ArchExtKind>(A.ID);
}
@@ -207,7 +209,7 @@ AArch64::ArchExtKind AArch64::parseArchExt(StringRef ArchExt) {
}
AArch64::ArchKind AArch64::parseCPUArch(StringRef CPU) {
- for (const auto C : AArch64CPUNames) {
+ for (const auto &C : AArch64CPUNames) {
if (CPU == C.getName())
return C.ArchID;
}
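
The only change in the three loops above is binding the loop variable by reference; with a plain const auto the compiler copies each table entry on every iteration. A generic illustration of the difference, using a made-up Big struct rather than the real AArch64 tables:

#include <array>
#include <cstdio>

struct Big {
  char Name[64];
  int ID;
};

int main() {
  std::array<Big, 3> Table = {
      {{"armv8-a", 1}, {"armv8.1-a", 2}, {"armv8.2-a", 3}}};

  for (const auto E : Table)   // copies sizeof(Big) bytes per iteration
    (void)E;
  for (const auto &E : Table)  // binds a reference, no copy
    std::printf("%s -> %d\n", E.Name, E.ID);
}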
diff --git a/llvm/lib/Support/AMDGPUMetadata.cpp b/llvm/lib/Support/AMDGPUMetadata.cpp
index 4ea197a97389..bfa1fe86cd3e 100644
--- a/llvm/lib/Support/AMDGPUMetadata.cpp
+++ b/llvm/lib/Support/AMDGPUMetadata.cpp
@@ -111,7 +111,11 @@ struct MappingTraits<Kernel::Arg::Metadata> {
YIO.mapRequired(Kernel::Arg::Key::Size, MD.mSize);
YIO.mapRequired(Kernel::Arg::Key::Align, MD.mAlign);
YIO.mapRequired(Kernel::Arg::Key::ValueKind, MD.mValueKind);
- YIO.mapRequired(Kernel::Arg::Key::ValueType, MD.mValueType);
+
+ // Removed. Accepted for parsing compatibility, but not emitted.
+ Optional<ValueType> Unused;
+ YIO.mapOptional(Kernel::Arg::Key::ValueType, Unused);
+
YIO.mapOptional(Kernel::Arg::Key::PointeeAlign, MD.mPointeeAlign,
uint32_t(0));
YIO.mapOptional(Kernel::Arg::Key::AddrSpaceQual, MD.mAddrSpaceQual,
diff --git a/llvm/lib/Support/APFloat.cpp b/llvm/lib/Support/APFloat.cpp
index 050c37baefb8..569cac790af9 100644
--- a/llvm/lib/Support/APFloat.cpp
+++ b/llvm/lib/Support/APFloat.cpp
@@ -69,6 +69,7 @@ namespace llvm {
};
static const fltSemantics semIEEEhalf = {15, -14, 11, 16};
+ static const fltSemantics semBFloat = {127, -126, 8, 16};
static const fltSemantics semIEEEsingle = {127, -126, 24, 32};
static const fltSemantics semIEEEdouble = {1023, -1022, 53, 64};
static const fltSemantics semIEEEquad = {16383, -16382, 113, 128};
@@ -117,6 +118,8 @@ namespace llvm {
switch (S) {
case S_IEEEhalf:
return IEEEhalf();
+ case S_BFloat:
+ return BFloat();
case S_IEEEsingle:
return IEEEsingle();
case S_IEEEdouble:
@@ -135,6 +138,8 @@ namespace llvm {
APFloatBase::SemanticsToEnum(const llvm::fltSemantics &Sem) {
if (&Sem == &llvm::APFloat::IEEEhalf())
return S_IEEEhalf;
+ else if (&Sem == &llvm::APFloat::BFloat())
+ return S_BFloat;
else if (&Sem == &llvm::APFloat::IEEEsingle())
return S_IEEEsingle;
else if (&Sem == &llvm::APFloat::IEEEdouble())
@@ -152,6 +157,9 @@ namespace llvm {
const fltSemantics &APFloatBase::IEEEhalf() {
return semIEEEhalf;
}
+ const fltSemantics &APFloatBase::BFloat() {
+ return semBFloat;
+ }
const fltSemantics &APFloatBase::IEEEsingle() {
return semIEEEsingle;
}
@@ -171,6 +179,12 @@ namespace llvm {
return semPPCDoubleDouble;
}
+ constexpr RoundingMode APFloatBase::rmNearestTiesToEven;
+ constexpr RoundingMode APFloatBase::rmTowardPositive;
+ constexpr RoundingMode APFloatBase::rmTowardNegative;
+ constexpr RoundingMode APFloatBase::rmTowardZero;
+ constexpr RoundingMode APFloatBase::rmNearestTiesToAway;
+
/* A tight upper bound on number of parts required to hold the value
pow(5, power) is
@@ -1323,6 +1337,9 @@ bool IEEEFloat::roundAwayFromZero(roundingMode rounding_mode,
case rmTowardNegative:
return sign;
+
+ default:
+ break;
}
llvm_unreachable("Invalid rounding mode found");
}
@@ -1439,25 +1456,26 @@ IEEEFloat::opStatus IEEEFloat::addOrSubtractSpecials(const IEEEFloat &rhs,
default:
llvm_unreachable(nullptr);
+ case PackCategoriesIntoKey(fcZero, fcNaN):
+ case PackCategoriesIntoKey(fcNormal, fcNaN):
+ case PackCategoriesIntoKey(fcInfinity, fcNaN):
+ assign(rhs);
+ LLVM_FALLTHROUGH;
case PackCategoriesIntoKey(fcNaN, fcZero):
case PackCategoriesIntoKey(fcNaN, fcNormal):
case PackCategoriesIntoKey(fcNaN, fcInfinity):
case PackCategoriesIntoKey(fcNaN, fcNaN):
+ if (isSignaling()) {
+ makeQuiet();
+ return opInvalidOp;
+ }
+ return rhs.isSignaling() ? opInvalidOp : opOK;
+
case PackCategoriesIntoKey(fcNormal, fcZero):
case PackCategoriesIntoKey(fcInfinity, fcNormal):
case PackCategoriesIntoKey(fcInfinity, fcZero):
return opOK;
- case PackCategoriesIntoKey(fcZero, fcNaN):
- case PackCategoriesIntoKey(fcNormal, fcNaN):
- case PackCategoriesIntoKey(fcInfinity, fcNaN):
- // We need to be sure to flip the sign here for subtraction because we
- // don't have a separate negate operation so -NaN becomes 0 - NaN here.
- sign = rhs.sign ^ subtract;
- category = fcNaN;
- copySignificand(rhs);
- return opOK;
-
case PackCategoriesIntoKey(fcNormal, fcInfinity):
case PackCategoriesIntoKey(fcZero, fcInfinity):
category = fcInfinity;
@@ -1562,20 +1580,22 @@ IEEEFloat::opStatus IEEEFloat::multiplySpecials(const IEEEFloat &rhs) {
default:
llvm_unreachable(nullptr);
- case PackCategoriesIntoKey(fcNaN, fcZero):
- case PackCategoriesIntoKey(fcNaN, fcNormal):
- case PackCategoriesIntoKey(fcNaN, fcInfinity):
- case PackCategoriesIntoKey(fcNaN, fcNaN):
- sign = false;
- return opOK;
-
case PackCategoriesIntoKey(fcZero, fcNaN):
case PackCategoriesIntoKey(fcNormal, fcNaN):
case PackCategoriesIntoKey(fcInfinity, fcNaN):
+ assign(rhs);
sign = false;
- category = fcNaN;
- copySignificand(rhs);
- return opOK;
+ LLVM_FALLTHROUGH;
+ case PackCategoriesIntoKey(fcNaN, fcZero):
+ case PackCategoriesIntoKey(fcNaN, fcNormal):
+ case PackCategoriesIntoKey(fcNaN, fcInfinity):
+ case PackCategoriesIntoKey(fcNaN, fcNaN):
+ sign ^= rhs.sign; // restore the original sign
+ if (isSignaling()) {
+ makeQuiet();
+ return opInvalidOp;
+ }
+ return rhs.isSignaling() ? opInvalidOp : opOK;
case PackCategoriesIntoKey(fcNormal, fcInfinity):
case PackCategoriesIntoKey(fcInfinity, fcNormal):
@@ -1607,15 +1627,20 @@ IEEEFloat::opStatus IEEEFloat::divideSpecials(const IEEEFloat &rhs) {
case PackCategoriesIntoKey(fcZero, fcNaN):
case PackCategoriesIntoKey(fcNormal, fcNaN):
case PackCategoriesIntoKey(fcInfinity, fcNaN):
- category = fcNaN;
- copySignificand(rhs);
+ assign(rhs);
+ sign = false;
LLVM_FALLTHROUGH;
case PackCategoriesIntoKey(fcNaN, fcZero):
case PackCategoriesIntoKey(fcNaN, fcNormal):
case PackCategoriesIntoKey(fcNaN, fcInfinity):
case PackCategoriesIntoKey(fcNaN, fcNaN):
- sign = false;
- LLVM_FALLTHROUGH;
+ sign ^= rhs.sign; // restore the original sign
+ if (isSignaling()) {
+ makeQuiet();
+ return opInvalidOp;
+ }
+ return rhs.isSignaling() ? opInvalidOp : opOK;
+
case PackCategoriesIntoKey(fcInfinity, fcZero):
case PackCategoriesIntoKey(fcInfinity, fcNormal):
case PackCategoriesIntoKey(fcZero, fcInfinity):
@@ -1645,21 +1670,62 @@ IEEEFloat::opStatus IEEEFloat::modSpecials(const IEEEFloat &rhs) {
default:
llvm_unreachable(nullptr);
+ case PackCategoriesIntoKey(fcZero, fcNaN):
+ case PackCategoriesIntoKey(fcNormal, fcNaN):
+ case PackCategoriesIntoKey(fcInfinity, fcNaN):
+ assign(rhs);
+ LLVM_FALLTHROUGH;
case PackCategoriesIntoKey(fcNaN, fcZero):
case PackCategoriesIntoKey(fcNaN, fcNormal):
case PackCategoriesIntoKey(fcNaN, fcInfinity):
case PackCategoriesIntoKey(fcNaN, fcNaN):
+ if (isSignaling()) {
+ makeQuiet();
+ return opInvalidOp;
+ }
+ return rhs.isSignaling() ? opInvalidOp : opOK;
+
case PackCategoriesIntoKey(fcZero, fcInfinity):
case PackCategoriesIntoKey(fcZero, fcNormal):
case PackCategoriesIntoKey(fcNormal, fcInfinity):
return opOK;
+ case PackCategoriesIntoKey(fcNormal, fcZero):
+ case PackCategoriesIntoKey(fcInfinity, fcZero):
+ case PackCategoriesIntoKey(fcInfinity, fcNormal):
+ case PackCategoriesIntoKey(fcInfinity, fcInfinity):
+ case PackCategoriesIntoKey(fcZero, fcZero):
+ makeNaN();
+ return opInvalidOp;
+
+ case PackCategoriesIntoKey(fcNormal, fcNormal):
+ return opOK;
+ }
+}
+
+IEEEFloat::opStatus IEEEFloat::remainderSpecials(const IEEEFloat &rhs) {
+ switch (PackCategoriesIntoKey(category, rhs.category)) {
+ default:
+ llvm_unreachable(nullptr);
+
case PackCategoriesIntoKey(fcZero, fcNaN):
case PackCategoriesIntoKey(fcNormal, fcNaN):
case PackCategoriesIntoKey(fcInfinity, fcNaN):
- sign = false;
- category = fcNaN;
- copySignificand(rhs);
+ assign(rhs);
+ LLVM_FALLTHROUGH;
+ case PackCategoriesIntoKey(fcNaN, fcZero):
+ case PackCategoriesIntoKey(fcNaN, fcNormal):
+ case PackCategoriesIntoKey(fcNaN, fcInfinity):
+ case PackCategoriesIntoKey(fcNaN, fcNaN):
+ if (isSignaling()) {
+ makeQuiet();
+ return opInvalidOp;
+ }
+ return rhs.isSignaling() ? opInvalidOp : opOK;
+
+ case PackCategoriesIntoKey(fcZero, fcInfinity):
+ case PackCategoriesIntoKey(fcZero, fcNormal):
+ case PackCategoriesIntoKey(fcNormal, fcInfinity):
return opOK;
case PackCategoriesIntoKey(fcNormal, fcZero):
@@ -1671,7 +1737,7 @@ IEEEFloat::opStatus IEEEFloat::modSpecials(const IEEEFloat &rhs) {
return opInvalidOp;
case PackCategoriesIntoKey(fcNormal, fcNormal):
- return opOK;
+ return opDivByZero; // fake status, indicating this is not a special case
}
}
@@ -1759,40 +1825,108 @@ IEEEFloat::opStatus IEEEFloat::divide(const IEEEFloat &rhs,
return fs;
}
-/* Normalized remainder. This is not currently correct in all cases. */
+/* Normalized remainder. */
IEEEFloat::opStatus IEEEFloat::remainder(const IEEEFloat &rhs) {
opStatus fs;
- IEEEFloat V = *this;
unsigned int origSign = sign;
- fs = V.divide(rhs, rmNearestTiesToEven);
- if (fs == opDivByZero)
+ // First handle the special cases.
+ fs = remainderSpecials(rhs);
+ if (fs != opDivByZero)
return fs;
- int parts = partCount();
- integerPart *x = new integerPart[parts];
- bool ignored;
- fs = V.convertToInteger(makeMutableArrayRef(x, parts),
- parts * integerPartWidth, true, rmNearestTiesToEven,
- &ignored);
- if (fs == opInvalidOp) {
- delete[] x;
- return fs;
+ fs = opOK;
+
+  // Make sure the current value is less than twice the denom. If the addition
+  // does not succeed (an overflow happens), then the finite value we currently
+  // possess must already be less than twice the denom (as both values use the
+  // same semantics).
+ IEEEFloat P2 = rhs;
+ if (P2.add(rhs, rmNearestTiesToEven) == opOK) {
+ fs = mod(P2);
+ assert(fs == opOK);
}
- fs = V.convertFromZeroExtendedInteger(x, parts * integerPartWidth, true,
- rmNearestTiesToEven);
- assert(fs==opOK); // should always work
+  // Let's work with absolute values.
+ IEEEFloat P = rhs;
+ P.sign = false;
+ sign = false;
- fs = V.multiply(rhs, rmNearestTiesToEven);
- assert(fs==opOK || fs==opInexact); // should not overflow or underflow
+ //
+ // To calculate the remainder we use the following scheme.
+ //
+  // The remainder is defined as follows:
+  //
+  //   remainder = numer - rquot * denom = x - r * p
+  //
+  // where r is the result of x/p, rounded toward the nearest integral value
+  // (with halfway cases rounded toward the even number).
+  //
+  // Currently (after x mod 2p), r is the number of 2p's present inside x,
+  // which is inherently an even number of p's.
+  //
+  // We may split the remaining calculation into 4 options:
+  // - if x < 0.5p then we round to the nearest number, which is 0, and are
+  //   done.
+  // - if x == 0.5p then we round to the nearest even number, which is 0, and
+  //   we are done as well.
+  // - if 0.5p < x < p then we round to the nearest number, which is 1, and we
+  //   have to subtract p at least once.
+  // - if x >= p then we must subtract p at least once, as the result must be
+  //   a proper remainder.
+  //
+  // By now, we are either done, or we have added 1 to r, which makes r an odd
+  // number.
+  //
+  // We can now split the remaining calculation into the following 3 options:
+  // - if x < 0.5p then we round to the nearest number, which is 0, and are
+  //   done.
+  // - if x == 0.5p then we round to the nearest even number. As r is odd, we
+  //   must round up to the next even number, so we must subtract p once more.
+  // - if x > 0.5p (and inherently x < p) then we must round r up to the next
+  //   integer, and subtract p once more.
+ //
- fs = subtract(V, rmNearestTiesToEven);
- assert(fs==opOK || fs==opInexact); // likewise
+ // Extend the semantics to prevent an overflow/underflow or inexact result.
+ bool losesInfo;
+ fltSemantics extendedSemantics = *semantics;
+ extendedSemantics.maxExponent++;
+ extendedSemantics.minExponent--;
+ extendedSemantics.precision += 2;
+
+ IEEEFloat VEx = *this;
+ fs = VEx.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+ IEEEFloat PEx = P;
+ fs = PEx.convert(extendedSemantics, rmNearestTiesToEven, &losesInfo);
+ assert(fs == opOK && !losesInfo);
+
+  // It is simpler to work with 2x instead of 0.5p, and this way we do not
+  // lose any fraction.
+ fs = VEx.add(VEx, rmNearestTiesToEven);
+ assert(fs == opOK);
+
+ if (VEx.compare(PEx) == cmpGreaterThan) {
+ fs = subtract(P, rmNearestTiesToEven);
+ assert(fs == opOK);
+
+    // Make VEx = this.add(this), but because we have different semantics, we
+    // do not want to `convert` again, so we just subtract PEx twice (which
+    // yields the desired value).
+ fs = VEx.subtract(PEx, rmNearestTiesToEven);
+ assert(fs == opOK);
+ fs = VEx.subtract(PEx, rmNearestTiesToEven);
+ assert(fs == opOK);
+
+ cmpResult result = VEx.compare(PEx);
+ if (result == cmpGreaterThan || result == cmpEqual) {
+ fs = subtract(P, rmNearestTiesToEven);
+ assert(fs == opOK);
+ }
+ }
if (isZero())
sign = origSign; // IEEE754 requires this
- delete[] x;
+ else
+ sign ^= origSign;
return fs;
}
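
The rewritten remainder() above implements IEEE-754 remainder semantics (quotient rounded to nearest, ties to even), which differ from fmod's truncated quotient. A small standard-library comparison of the two behaviors the new code is meant to match:

#include <cmath>
#include <cstdio>

int main() {
  // IEEE remainder: 5/3 rounds to 2 (nearest), so 5 - 2*3 = -1.
  std::printf("remainder(5, 3)   = %g\n", std::remainder(5.0, 3.0)); // -1
  // fmod truncates the quotient to 1, so 5 - 1*3 = 2.
  std::printf("fmod(5, 3)        = %g\n", std::fmod(5.0, 3.0));      // 2
  // Ties go to the even quotient: 2.5/1 -> quotient 2, remainder 0.5.
  std::printf("remainder(2.5, 1) = %g\n", std::remainder(2.5, 1.0)); // 0.5
}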
@@ -1860,14 +1994,59 @@ IEEEFloat::opStatus IEEEFloat::fusedMultiplyAdd(const IEEEFloat &multiplicand,
return fs;
}
-/* Rounding-mode corrrect round to integral value. */
+/* Rounding-mode correct round to integral value. */
IEEEFloat::opStatus IEEEFloat::roundToIntegral(roundingMode rounding_mode) {
opStatus fs;
+ if (isInfinity())
+ // [IEEE Std 754-2008 6.1]:
+ // The behavior of infinity in floating-point arithmetic is derived from the
+ // limiting cases of real arithmetic with operands of arbitrarily
+ // large magnitude, when such a limit exists.
+ // ...
+ // Operations on infinite operands are usually exact and therefore signal no
+ // exceptions ...
+ return opOK;
+
+ if (isNaN()) {
+ if (isSignaling()) {
+ // [IEEE Std 754-2008 6.2]:
+ // Under default exception handling, any operation signaling an invalid
+ // operation exception and for which a floating-point result is to be
+ // delivered shall deliver a quiet NaN.
+ makeQuiet();
+ // [IEEE Std 754-2008 6.2]:
+ // Signaling NaNs shall be reserved operands that, under default exception
+ // handling, signal the invalid operation exception(see 7.2) for every
+ // general-computational and signaling-computational operation except for
+ // the conversions described in 5.12.
+ return opInvalidOp;
+ } else {
+ // [IEEE Std 754-2008 6.2]:
+ // For an operation with quiet NaN inputs, other than maximum and minimum
+ // operations, if a floating-point result is to be delivered the result
+ // shall be a quiet NaN which should be one of the input NaNs.
+ // ...
+ // Every general-computational and quiet-computational operation involving
+ // one or more input NaNs, none of them signaling, shall signal no
+ // exception, except fusedMultiplyAdd might signal the invalid operation
+ // exception(see 7.2).
+ return opOK;
+ }
+ }
+
+ if (isZero()) {
+ // [IEEE Std 754-2008 6.3]:
+ // ... the sign of the result of conversions, the quantize operation, the
+ // roundToIntegral operations, and the roundToIntegralExact(see 5.3.1) is
+ // the sign of the first or only operand.
+ return opOK;
+ }
+
// If the exponent is large enough, we know that this value is already
// integral, and the arithmetic below would potentially cause it to saturate
// to +/-Inf. Bail out early instead.
- if (isFiniteNonZero() && exponent+1 >= (int)semanticsPrecision(*semantics))
+ if (exponent+1 >= (int)semanticsPrecision(*semantics))
return opOK;
// The algorithm here is quite simple: we add 2^(p-1), where p is the
@@ -1881,19 +2060,18 @@ IEEEFloat::opStatus IEEEFloat::roundToIntegral(roundingMode rounding_mode) {
IEEEFloat MagicConstant(*semantics);
fs = MagicConstant.convertFromAPInt(IntegerConstant, false,
rmNearestTiesToEven);
+ assert(fs == opOK);
MagicConstant.sign = sign;
- if (fs != opOK)
- return fs;
-
- // Preserve the input sign so that we can handle 0.0/-0.0 cases correctly.
+ // Preserve the input sign so that we can handle the case of zero result
+ // correctly.
bool inputSign = isNegative();
fs = add(MagicConstant, rounding_mode);
- if (fs != opOK && fs != opInexact)
- return fs;
- fs = subtract(MagicConstant, rounding_mode);
+ // Current value and 'MagicConstant' are both integers, so the result of the
+ // subtraction is always exact according to Sterbenz' lemma.
+ subtract(MagicConstant, rounding_mode);
// Restore the input sign.
if (inputSign != isNegative())
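
The 2^(p-1) trick used by roundToIntegral can be seen with ordinary doubles (p = 53): adding and then subtracting 2^52 flushes the fraction bits out in the current rounding mode, and, as the comment above notes, Sterbenz's lemma makes the subtraction exact. A plain-double sketch of the same idea, assuming the default round-to-nearest-even mode; roundToIntegralRNE is a stand-in analogue, not the APFloat code:

#include <cmath>
#include <cstdio>

// Round x to an integral value in round-to-nearest-even, for |x| < 2^52.
double roundToIntegralRNE(double x) {
  const double Magic = std::copysign(4503599627370496.0, x); // +/- 2^52
  volatile double t = x + Magic; // fraction bits are rounded away here
  return t - Magic;              // exact by Sterbenz's lemma
}

int main() {
  std::printf("%g %g %g\n",
              roundToIntegralRNE(2.5),    // 2 (ties to even)
              roundToIntegralRNE(3.5),    // 4
              roundToIntegralRNE(-1.25)); // -1
}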
@@ -2621,24 +2799,70 @@ IEEEFloat::convertFromDecimalString(StringRef str, roundingMode rounding_mode) {
}
bool IEEEFloat::convertFromStringSpecials(StringRef str) {
+ const size_t MIN_NAME_SIZE = 3;
+
+ if (str.size() < MIN_NAME_SIZE)
+ return false;
+
if (str.equals("inf") || str.equals("INFINITY") || str.equals("+Inf")) {
makeInf(false);
return true;
}
- if (str.equals("-inf") || str.equals("-INFINITY") || str.equals("-Inf")) {
- makeInf(true);
- return true;
+ bool IsNegative = str.front() == '-';
+ if (IsNegative) {
+ str = str.drop_front();
+ if (str.size() < MIN_NAME_SIZE)
+ return false;
+
+ if (str.equals("inf") || str.equals("INFINITY") || str.equals("Inf")) {
+ makeInf(true);
+ return true;
+ }
}
- if (str.equals("nan") || str.equals("NaN")) {
- makeNaN(false, false);
- return true;
+  // If we have an 's' (or 'S') prefix, then this is a signaling NaN.
+ bool IsSignaling = str.front() == 's' || str.front() == 'S';
+ if (IsSignaling) {
+ str = str.drop_front();
+ if (str.size() < MIN_NAME_SIZE)
+ return false;
}
- if (str.equals("-nan") || str.equals("-NaN")) {
- makeNaN(false, true);
- return true;
+ if (str.startswith("nan") || str.startswith("NaN")) {
+ str = str.drop_front(3);
+
+ // A NaN without payload.
+ if (str.empty()) {
+ makeNaN(IsSignaling, IsNegative);
+ return true;
+ }
+
+ // Allow the payload to be inside parentheses.
+ if (str.front() == '(') {
+ // Parentheses should be balanced (and not empty).
+ if (str.size() <= 2 || str.back() != ')')
+ return false;
+
+ str = str.slice(1, str.size() - 1);
+ }
+
+ // Determine the payload number's radix.
+ unsigned Radix = 10;
+ if (str[0] == '0') {
+ if (str.size() > 1 && tolower(str[1]) == 'x') {
+ str = str.drop_front(2);
+ Radix = 16;
+ } else
+ Radix = 8;
+ }
+
+ // Parse the payload and make the NaN.
+ APInt Payload;
+ if (!str.getAsInteger(Radix, Payload)) {
+ makeNaN(IsSignaling, IsNegative, &Payload);
+ return true;
+ }
}
return false;
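
Taken together, the code above now accepts an optional leading '-', an optional 's'/'S' marking a signaling NaN, the literal "nan"/"NaN", and an optional payload in decimal, octal (leading 0) or hex (leading 0x), optionally in parentheses. A few illustrative inputs and how this routine classifies them:

  "inf", "INFINITY", "+Inf"  -> +infinity
  "-Inf"                     -> -infinity
  "nan", "NaN"               -> quiet NaN
  "sNaN", "-snan"            -> signaling NaN (negative for the latter)
  "nan(0x1234)"              -> quiet NaN, hex payload 0x1234
  "nan(017)"                 -> quiet NaN, octal payload 15
  "nan123"                   -> quiet NaN, decimal payload 123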
@@ -3039,6 +3263,33 @@ APInt IEEEFloat::convertFloatAPFloatToAPInt() const {
(mysignificand & 0x7fffff)));
}
+APInt IEEEFloat::convertBFloatAPFloatToAPInt() const {
+ assert(semantics == (const llvm::fltSemantics *)&semBFloat);
+ assert(partCount() == 1);
+
+ uint32_t myexponent, mysignificand;
+
+ if (isFiniteNonZero()) {
+ myexponent = exponent + 127; // bias
+ mysignificand = (uint32_t)*significandParts();
+ if (myexponent == 1 && !(mysignificand & 0x80))
+ myexponent = 0; // denormal
+ } else if (category == fcZero) {
+ myexponent = 0;
+ mysignificand = 0;
+ } else if (category == fcInfinity) {
+ myexponent = 0xff;
+ mysignificand = 0;
+ } else {
+ assert(category == fcNaN && "Unknown category!");
+ myexponent = 0xff;
+ mysignificand = (uint32_t)*significandParts();
+ }
+
+ return APInt(16, (((sign & 1) << 15) | ((myexponent & 0xff) << 7) |
+ (mysignificand & 0x7f)));
+}
+
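
The semBFloat format added above (1 sign bit, 8 exponent bits, 7 stored significand bits, bias 127) is simply the top half of an IEEE-754 binary32. A small sketch of that correspondence; toBFloatBits is an illustrative helper, not an LLVM API, and it truncates rather than rounds:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Truncating conversion from float to bfloat16 bits (illustration only).
uint16_t toBFloatBits(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  return static_cast<uint16_t>(Bits >> 16); // sign, 8 exp bits, top 7 mantissa bits
}

int main() {
  std::printf("1.0f   -> 0x%04x\n", toBFloatBits(1.0f));   // 0x3f80
  std::printf("-2.0f  -> 0x%04x\n", toBFloatBits(-2.0f));  // 0xc000
  std::printf("0.375f -> 0x%04x\n", toBFloatBits(0.375f)); // 0x3ec0
}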
APInt IEEEFloat::convertHalfAPFloatToAPInt() const {
assert(semantics == (const llvm::fltSemantics*)&semIEEEhalf);
assert(partCount()==1);
@@ -3074,6 +3325,9 @@ APInt IEEEFloat::bitcastToAPInt() const {
if (semantics == (const llvm::fltSemantics*)&semIEEEhalf)
return convertHalfAPFloatToAPInt();
+ if (semantics == (const llvm::fltSemantics *)&semBFloat)
+ return convertBFloatAPFloatToAPInt();
+
if (semantics == (const llvm::fltSemantics*)&semIEEEsingle)
return convertFloatAPFloatToAPInt();
@@ -3270,6 +3524,37 @@ void IEEEFloat::initFromFloatAPInt(const APInt &api) {
}
}
+void IEEEFloat::initFromBFloatAPInt(const APInt &api) {
+ assert(api.getBitWidth() == 16);
+ uint32_t i = (uint32_t)*api.getRawData();
+ uint32_t myexponent = (i >> 7) & 0xff;
+ uint32_t mysignificand = i & 0x7f;
+
+ initialize(&semBFloat);
+ assert(partCount() == 1);
+
+ sign = i >> 15;
+ if (myexponent == 0 && mysignificand == 0) {
+ // exponent, significand meaningless
+ category = fcZero;
+ } else if (myexponent == 0xff && mysignificand == 0) {
+ // exponent, significand meaningless
+ category = fcInfinity;
+ } else if (myexponent == 0xff && mysignificand != 0) {
+ // sign, exponent, significand meaningless
+ category = fcNaN;
+ *significandParts() = mysignificand;
+ } else {
+ category = fcNormal;
+ exponent = myexponent - 127; // bias
+ *significandParts() = mysignificand;
+ if (myexponent == 0) // denormal
+ exponent = -126;
+ else
+ *significandParts() |= 0x80; // integer bit
+ }
+}
+
void IEEEFloat::initFromHalfAPInt(const APInt &api) {
assert(api.getBitWidth()==16);
uint32_t i = (uint32_t)*api.getRawData();
@@ -3308,6 +3593,8 @@ void IEEEFloat::initFromHalfAPInt(const APInt &api) {
void IEEEFloat::initFromAPInt(const fltSemantics *Sem, const APInt &api) {
if (Sem == &semIEEEhalf)
return initFromHalfAPInt(api);
+ if (Sem == &semBFloat)
+ return initFromBFloatAPInt(api);
if (Sem == &semIEEEsingle)
return initFromFloatAPInt(api);
if (Sem == &semIEEEdouble)
@@ -4425,7 +4712,7 @@ bool DoubleAPFloat::isDenormal() const {
return getCategory() == fcNormal &&
(Floats[0].isDenormal() || Floats[1].isDenormal() ||
// (double)(Hi + Lo) == Hi defines a normal number.
- Floats[0].compare(Floats[0] + Floats[1]) != cmpEqual);
+ Floats[0] != Floats[0] + Floats[1]);
}
bool DoubleAPFloat::isSmallest() const {
@@ -4547,26 +4834,9 @@ APFloat::opStatus APFloat::convert(const fltSemantics &ToSemantics,
llvm_unreachable("Unexpected semantics");
}
-APFloat APFloat::getAllOnesValue(unsigned BitWidth, bool isIEEE) {
- if (isIEEE) {
- switch (BitWidth) {
- case 16:
- return APFloat(semIEEEhalf, APInt::getAllOnesValue(BitWidth));
- case 32:
- return APFloat(semIEEEsingle, APInt::getAllOnesValue(BitWidth));
- case 64:
- return APFloat(semIEEEdouble, APInt::getAllOnesValue(BitWidth));
- case 80:
- return APFloat(semX87DoubleExtended, APInt::getAllOnesValue(BitWidth));
- case 128:
- return APFloat(semIEEEquad, APInt::getAllOnesValue(BitWidth));
- default:
- llvm_unreachable("Unknown floating bit width");
- }
- } else {
- assert(BitWidth == 128);
- return APFloat(semPPCDoubleDouble, APInt::getAllOnesValue(BitWidth));
- }
+APFloat APFloat::getAllOnesValue(const fltSemantics &Semantics,
+ unsigned BitWidth) {
+ return APFloat(Semantics, APInt::getAllOnesValue(BitWidth));
}
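A minimal sketch of a call site after this signature change; the semantics accessors are the usual APFloat static helpers, assumed here rather than taken from the patch:

#include "llvm/ADT/APFloat.h"
using namespace llvm;

void allOnesSketch() {
  // Previously: APFloat::getAllOnesValue(32, /*isIEEE=*/true)
  APFloat AllOnesF32 = APFloat::getAllOnesValue(APFloat::IEEEsingle(), 32);
  // Previously: APFloat::getAllOnesValue(128, /*isIEEE=*/false)
  APFloat AllOnesPPC = APFloat::getAllOnesValue(APFloat::PPCDoubleDouble(), 128);
  (void)AllOnesF32;
  (void)AllOnesPPC;
}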
void APFloat::print(raw_ostream &OS) const {
diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 9b9cd70078b3..9a6f93feaa29 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -548,9 +548,11 @@ unsigned APInt::getBitsNeeded(StringRef str, uint8_t radix) {
hash_code llvm::hash_value(const APInt &Arg) {
if (Arg.isSingleWord())
- return hash_combine(Arg.U.VAL);
+ return hash_combine(Arg.BitWidth, Arg.U.VAL);
- return hash_combine_range(Arg.U.pVal, Arg.U.pVal + Arg.getNumWords());
+ return hash_combine(
+ Arg.BitWidth,
+ hash_combine_range(Arg.U.pVal, Arg.U.pVal + Arg.getNumWords()));
}
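With the bit width folded into the hash, APInts that share the same word values but differ in width no longer hash identically by construction. A sketch (names are illustrative only):

#include "llvm/ADT/APInt.h"
#include "llvm/ADT/Hashing.h"
using namespace llvm;

void hashWidthSketch() {
  // Before this change both calls hashed the same single 64-bit word.
  hash_code H32 = hash_value(APInt(/*numBits=*/32, /*val=*/1));
  hash_code H64 = hash_value(APInt(/*numBits=*/64, /*val=*/1));
  (void)H32;
  (void)H64; // no longer systematically equal, though collisions remain possible
}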
bool APInt::isSplat(unsigned SplatSizeInBits) const {
@@ -670,20 +672,16 @@ bool APInt::isSubsetOfSlowCase(const APInt &RHS) const {
}
APInt APInt::byteSwap() const {
- assert(BitWidth >= 16 && BitWidth % 16 == 0 && "Cannot byteswap!");
+ assert(BitWidth >= 16 && BitWidth % 8 == 0 && "Cannot byteswap!");
if (BitWidth == 16)
return APInt(BitWidth, ByteSwap_16(uint16_t(U.VAL)));
if (BitWidth == 32)
return APInt(BitWidth, ByteSwap_32(unsigned(U.VAL)));
- if (BitWidth == 48) {
- unsigned Tmp1 = unsigned(U.VAL >> 16);
- Tmp1 = ByteSwap_32(Tmp1);
- uint16_t Tmp2 = uint16_t(U.VAL);
- Tmp2 = ByteSwap_16(Tmp2);
- return APInt(BitWidth, (uint64_t(Tmp2) << 32) | Tmp1);
+ if (BitWidth <= 64) {
+ uint64_t Tmp1 = ByteSwap_64(U.VAL);
+ Tmp1 >>= (64 - BitWidth);
+ return APInt(BitWidth, Tmp1);
}
- if (BitWidth == 64)
- return APInt(BitWidth, ByteSwap_64(U.VAL));
APInt Result(getNumWords() * APINT_BITS_PER_WORD, 0);
for (unsigned I = 0, N = getNumWords(); I != N; ++I)
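A worked example of the generalized <= 64-bit path: the value is byte-swapped as a full 64-bit word and then shifted right so only the requested width remains. The numbers below follow from that arithmetic and are not taken from the patch:

#include "llvm/ADT/APInt.h"
#include <cassert>
using namespace llvm;

void byteSwap24Sketch() {
  // BitWidth = 24, value 0x123456:
  //   ByteSwap_64(0x0000000000123456) == 0x5634120000000000
  //   0x5634120000000000 >> (64 - 24) == 0x563412
  APInt X(24, 0x123456);
  assert(X.byteSwap() == APInt(24, 0x563412) && "bytes 12 34 56 -> 56 34 12");
}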
@@ -2283,7 +2281,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix,
std::string APInt::toString(unsigned Radix = 10, bool Signed = true) const {
SmallString<40> S;
toString(S, Radix, Signed, /* formatAsCLiteral = */false);
- return S.str();
+ return std::string(S.str());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -3088,7 +3086,8 @@ void llvm::StoreIntToMemory(const APInt &IntVal, uint8_t *Dst,
/// LoadIntFromMemory - Loads the integer stored in the LoadBytes bytes starting
/// from Src into IntVal, which is assumed to be wide enough and to hold zero.
-void llvm::LoadIntFromMemory(APInt &IntVal, uint8_t *Src, unsigned LoadBytes) {
+void llvm::LoadIntFromMemory(APInt &IntVal, const uint8_t *Src,
+ unsigned LoadBytes) {
assert((IntVal.getBitWidth()+7)/8 >= LoadBytes && "Integer too small!");
uint8_t *Dst = reinterpret_cast<uint8_t *>(
const_cast<uint64_t *>(IntVal.getRawData()));
diff --git a/llvm/lib/Support/APSInt.cpp b/llvm/lib/Support/APSInt.cpp
index 7c48880f96ea..b65b6824eaf8 100644
--- a/llvm/lib/Support/APSInt.cpp
+++ b/llvm/lib/Support/APSInt.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/StringRef.h"
+#include <cassert>
using namespace llvm;
@@ -25,14 +26,14 @@ APSInt::APSInt(StringRef Str) {
APInt Tmp(NumBits, Str, /*radix=*/10);
if (Str[0] == '-') {
unsigned MinBits = Tmp.getMinSignedBits();
- if (MinBits > 0 && MinBits < NumBits)
- Tmp = Tmp.trunc(MinBits);
+ if (MinBits < NumBits)
+ Tmp = Tmp.trunc(std::max<unsigned>(1, MinBits));
*this = APSInt(Tmp, /*isUnsigned=*/false);
return;
}
unsigned ActiveBits = Tmp.getActiveBits();
- if (ActiveBits > 0 && ActiveBits < NumBits)
- Tmp = Tmp.trunc(ActiveBits);
+ if (ActiveBits < NumBits)
+ Tmp = Tmp.trunc(std::max<unsigned>(1, ActiveBits));
*this = APSInt(Tmp, /*isUnsigned=*/true);
}
diff --git a/llvm/lib/Support/ARMAttributeParser.cpp b/llvm/lib/Support/ARMAttributeParser.cpp
index 8a89f4c45fb9..17ad38d22614 100644
--- a/llvm/lib/Support/ARMAttributeParser.cpp
+++ b/llvm/lib/Support/ARMAttributeParser.cpp
@@ -1,4 +1,4 @@
-//===--- ARMAttributeParser.cpp - ARM Attribute Information Printer -------===//
+//===- ARMAttributeParser.cpp - ARM Attribute Information Printer ---------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -9,719 +9,365 @@
#include "llvm/Support/ARMAttributeParser.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/LEB128.h"
#include "llvm/Support/ScopedPrinter.h"
using namespace llvm;
using namespace llvm::ARMBuildAttrs;
-
-static const EnumEntry<unsigned> TagNames[] = {
- { "Tag_File", ARMBuildAttrs::File },
- { "Tag_Section", ARMBuildAttrs::Section },
- { "Tag_Symbol", ARMBuildAttrs::Symbol },
-};
-
-namespace llvm {
-#define ATTRIBUTE_HANDLER(Attr_) \
- { ARMBuildAttrs::Attr_, &ARMAttributeParser::Attr_ }
-
-const ARMAttributeParser::DisplayHandler
-ARMAttributeParser::DisplayRoutines[] = {
- { ARMBuildAttrs::CPU_raw_name, &ARMAttributeParser::StringAttribute, },
- { ARMBuildAttrs::CPU_name, &ARMAttributeParser::StringAttribute },
- ATTRIBUTE_HANDLER(CPU_arch),
- ATTRIBUTE_HANDLER(CPU_arch_profile),
- ATTRIBUTE_HANDLER(ARM_ISA_use),
- ATTRIBUTE_HANDLER(THUMB_ISA_use),
- ATTRIBUTE_HANDLER(FP_arch),
- ATTRIBUTE_HANDLER(WMMX_arch),
- ATTRIBUTE_HANDLER(Advanced_SIMD_arch),
- ATTRIBUTE_HANDLER(MVE_arch),
- ATTRIBUTE_HANDLER(PCS_config),
- ATTRIBUTE_HANDLER(ABI_PCS_R9_use),
- ATTRIBUTE_HANDLER(ABI_PCS_RW_data),
- ATTRIBUTE_HANDLER(ABI_PCS_RO_data),
- ATTRIBUTE_HANDLER(ABI_PCS_GOT_use),
- ATTRIBUTE_HANDLER(ABI_PCS_wchar_t),
- ATTRIBUTE_HANDLER(ABI_FP_rounding),
- ATTRIBUTE_HANDLER(ABI_FP_denormal),
- ATTRIBUTE_HANDLER(ABI_FP_exceptions),
- ATTRIBUTE_HANDLER(ABI_FP_user_exceptions),
- ATTRIBUTE_HANDLER(ABI_FP_number_model),
- ATTRIBUTE_HANDLER(ABI_align_needed),
- ATTRIBUTE_HANDLER(ABI_align_preserved),
- ATTRIBUTE_HANDLER(ABI_enum_size),
- ATTRIBUTE_HANDLER(ABI_HardFP_use),
- ATTRIBUTE_HANDLER(ABI_VFP_args),
- ATTRIBUTE_HANDLER(ABI_WMMX_args),
- ATTRIBUTE_HANDLER(ABI_optimization_goals),
- ATTRIBUTE_HANDLER(ABI_FP_optimization_goals),
- ATTRIBUTE_HANDLER(compatibility),
- ATTRIBUTE_HANDLER(CPU_unaligned_access),
- ATTRIBUTE_HANDLER(FP_HP_extension),
- ATTRIBUTE_HANDLER(ABI_FP_16bit_format),
- ATTRIBUTE_HANDLER(MPextension_use),
- ATTRIBUTE_HANDLER(DIV_use),
- ATTRIBUTE_HANDLER(DSP_extension),
- ATTRIBUTE_HANDLER(T2EE_use),
- ATTRIBUTE_HANDLER(Virtualization_use),
- ATTRIBUTE_HANDLER(nodefaults)
+#define ATTRIBUTE_HANDLER(attr) \
+ { ARMBuildAttrs::attr, &ARMAttributeParser::attr }
+
+const ARMAttributeParser::DisplayHandler ARMAttributeParser::displayRoutines[] =
+ {
+ {ARMBuildAttrs::CPU_raw_name, &ARMAttributeParser::stringAttribute},
+ {ARMBuildAttrs::CPU_name, &ARMAttributeParser::stringAttribute},
+ ATTRIBUTE_HANDLER(CPU_arch),
+ ATTRIBUTE_HANDLER(CPU_arch_profile),
+ ATTRIBUTE_HANDLER(ARM_ISA_use),
+ ATTRIBUTE_HANDLER(THUMB_ISA_use),
+ ATTRIBUTE_HANDLER(FP_arch),
+ ATTRIBUTE_HANDLER(WMMX_arch),
+ ATTRIBUTE_HANDLER(Advanced_SIMD_arch),
+ ATTRIBUTE_HANDLER(MVE_arch),
+ ATTRIBUTE_HANDLER(PCS_config),
+ ATTRIBUTE_HANDLER(ABI_PCS_R9_use),
+ ATTRIBUTE_HANDLER(ABI_PCS_RW_data),
+ ATTRIBUTE_HANDLER(ABI_PCS_RO_data),
+ ATTRIBUTE_HANDLER(ABI_PCS_GOT_use),
+ ATTRIBUTE_HANDLER(ABI_PCS_wchar_t),
+ ATTRIBUTE_HANDLER(ABI_FP_rounding),
+ ATTRIBUTE_HANDLER(ABI_FP_denormal),
+ ATTRIBUTE_HANDLER(ABI_FP_exceptions),
+ ATTRIBUTE_HANDLER(ABI_FP_user_exceptions),
+ ATTRIBUTE_HANDLER(ABI_FP_number_model),
+ ATTRIBUTE_HANDLER(ABI_align_needed),
+ ATTRIBUTE_HANDLER(ABI_align_preserved),
+ ATTRIBUTE_HANDLER(ABI_enum_size),
+ ATTRIBUTE_HANDLER(ABI_HardFP_use),
+ ATTRIBUTE_HANDLER(ABI_VFP_args),
+ ATTRIBUTE_HANDLER(ABI_WMMX_args),
+ ATTRIBUTE_HANDLER(ABI_optimization_goals),
+ ATTRIBUTE_HANDLER(ABI_FP_optimization_goals),
+ ATTRIBUTE_HANDLER(compatibility),
+ ATTRIBUTE_HANDLER(CPU_unaligned_access),
+ ATTRIBUTE_HANDLER(FP_HP_extension),
+ ATTRIBUTE_HANDLER(ABI_FP_16bit_format),
+ ATTRIBUTE_HANDLER(MPextension_use),
+ ATTRIBUTE_HANDLER(DIV_use),
+ ATTRIBUTE_HANDLER(DSP_extension),
+ ATTRIBUTE_HANDLER(T2EE_use),
+ ATTRIBUTE_HANDLER(Virtualization_use),
+ ATTRIBUTE_HANDLER(nodefaults),
};
#undef ATTRIBUTE_HANDLER
-uint64_t ARMAttributeParser::ParseInteger(const uint8_t *Data,
- uint32_t &Offset) {
- unsigned DecodeLength;
- uint64_t Value = decodeULEB128(Data + Offset, &DecodeLength);
- Offset += DecodeLength;
- return Value;
-}
-
-StringRef ARMAttributeParser::ParseString(const uint8_t *Data,
- uint32_t &Offset) {
- const char *String = reinterpret_cast<const char*>(Data + Offset);
- size_t Length = std::strlen(String);
- Offset = Offset + Length + 1;
- return StringRef(String, Length);
-}
-
-void ARMAttributeParser::IntegerAttribute(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
-
- uint64_t Value = ParseInteger(Data, Offset);
- Attributes.insert(std::make_pair(Tag, Value));
-
- if (SW)
- SW->printNumber(ARMBuildAttrs::AttrTypeAsString(Tag), Value);
-}
-
-void ARMAttributeParser::StringAttribute(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Tag, /*TagPrefix*/false);
- StringRef ValueDesc = ParseString(Data, Offset);
-
- if (SW) {
- DictScope AS(*SW, "Attribute");
- SW->printNumber("Tag", Tag);
- if (!TagName.empty())
- SW->printString("TagName", TagName);
- SW->printString("Value", ValueDesc);
+Error ARMAttributeParser::stringAttribute(AttrType tag) {
+ StringRef tagName =
+ ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*TagPrefix=*/false);
+ StringRef desc = de.getCStrRef(cursor);
+
+ if (sw) {
+ DictScope scope(*sw, "Attribute");
+ sw->printNumber("Tag", tag);
+ if (!tagName.empty())
+ sw->printString("TagName", tagName);
+ sw->printString("Value", desc);
}
+ return Error::success();
}
-void ARMAttributeParser::PrintAttribute(unsigned Tag, unsigned Value,
- StringRef ValueDesc) {
- Attributes.insert(std::make_pair(Tag, Value));
-
- if (SW) {
- StringRef TagName = ARMBuildAttrs::AttrTypeAsString(Tag,
- /*TagPrefix*/false);
- DictScope AS(*SW, "Attribute");
- SW->printNumber("Tag", Tag);
- SW->printNumber("Value", Value);
- if (!TagName.empty())
- SW->printString("TagName", TagName);
- if (!ValueDesc.empty())
- SW->printString("Description", ValueDesc);
- }
-}
-
-void ARMAttributeParser::CPU_arch(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
+Error ARMAttributeParser::CPU_arch(AttrType tag) {
+ static const char *strings[] = {
"Pre-v4", "ARM v4", "ARM v4T", "ARM v5T", "ARM v5TE", "ARM v5TEJ", "ARM v6",
"ARM v6KZ", "ARM v6T2", "ARM v6K", "ARM v7", "ARM v6-M", "ARM v6S-M",
"ARM v7E-M", "ARM v8", nullptr,
"ARM v8-M Baseline", "ARM v8-M Mainline", nullptr, nullptr, nullptr,
"ARM v8.1-M Mainline"
};
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+ return parseStringAttribute("CPU_arch", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::CPU_arch_profile(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- uint64_t Encoded = ParseInteger(Data, Offset);
+Error ARMAttributeParser::CPU_arch_profile(AttrType tag) {
+ uint64_t value = de.getULEB128(cursor);
- StringRef Profile;
- switch (Encoded) {
- default: Profile = "Unknown"; break;
- case 'A': Profile = "Application"; break;
- case 'R': Profile = "Real-time"; break;
- case 'M': Profile = "Microcontroller"; break;
- case 'S': Profile = "Classic"; break;
- case 0: Profile = "None"; break;
+ StringRef profile;
+ switch (value) {
+ default: profile = "Unknown"; break;
+ case 'A': profile = "Application"; break;
+ case 'R': profile = "Real-time"; break;
+ case 'M': profile = "Microcontroller"; break;
+ case 'S': profile = "Classic"; break;
+ case 0: profile = "None"; break;
}
- PrintAttribute(Tag, Encoded, Profile);
+ printAttribute(tag, value, profile);
+ return Error::success();
}
-void ARMAttributeParser::ARM_ISA_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "Permitted" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ARM_ISA_use(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Permitted"};
+ return parseStringAttribute("ARM_ISA_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::THUMB_ISA_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "Thumb-1", "Thumb-2" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::THUMB_ISA_use(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Thumb-1", "Thumb-2"};
+ return parseStringAttribute("THUMB_ISA_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::FP_arch(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "VFPv1", "VFPv2", "VFPv3", "VFPv3-D16", "VFPv4",
- "VFPv4-D16", "ARMv8-a FP", "ARMv8-a FP-D16"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::FP_arch(AttrType tag) {
+ static const char *strings[] = {
+ "Not Permitted", "VFPv1", "VFPv2", "VFPv3", "VFPv3-D16",
+ "VFPv4", "VFPv4-D16", "ARMv8-a FP", "ARMv8-a FP-D16"};
+ return parseStringAttribute("FP_arch", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::WMMX_arch(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "WMMXv1", "WMMXv2" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::WMMX_arch(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "WMMXv1", "WMMXv2"};
+ return parseStringAttribute("WMMX_arch", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::Advanced_SIMD_arch(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "NEONv1", "NEONv2+FMA", "ARMv8-a NEON", "ARMv8.1-a NEON"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::Advanced_SIMD_arch(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "NEONv1", "NEONv2+FMA",
+ "ARMv8-a NEON", "ARMv8.1-a NEON"};
+ return parseStringAttribute("Advanced_SIMD_arch", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::MVE_arch(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "MVE integer", "MVE integer and float"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::MVE_arch(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "MVE integer",
+ "MVE integer and float"};
+ return parseStringAttribute("MVE_arch", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::PCS_config(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
+Error ARMAttributeParser::PCS_config(AttrType tag) {
+ static const char *strings[] = {
"None", "Bare Platform", "Linux Application", "Linux DSO", "Palm OS 2004",
- "Reserved (Palm OS)", "Symbian OS 2004", "Reserved (Symbian OS)"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+ "Reserved (Palm OS)", "Symbian OS 2004", "Reserved (Symbian OS)"};
+ return parseStringAttribute("PCS_config", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_PCS_R9_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "v6", "Static Base", "TLS", "Unused" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_PCS_R9_use(AttrType tag) {
+ static const char *strings[] = {"v6", "Static Base", "TLS", "Unused"};
+ return parseStringAttribute("ABI_PCS_R9_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_PCS_RW_data(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Absolute", "PC-relative", "SB-relative", "Not Permitted"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_PCS_RW_data(AttrType tag) {
+ static const char *strings[] = {"Absolute", "PC-relative", "SB-relative",
+ "Not Permitted"};
+ return parseStringAttribute("ABI_PCS_RW_data", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_PCS_RO_data(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Absolute", "PC-relative", "Not Permitted"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_PCS_RO_data(AttrType tag) {
+ static const char *strings[] = {"Absolute", "PC-relative", "Not Permitted"};
+ return parseStringAttribute("ABI_PCS_RO_data", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_PCS_GOT_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "Direct", "GOT-Indirect"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_PCS_GOT_use(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Direct", "GOT-Indirect"};
+ return parseStringAttribute("ABI_PCS_GOT_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_PCS_wchar_t(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "Unknown", "2-byte", "Unknown", "4-byte"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_PCS_wchar_t(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Unknown", "2-byte",
+ "Unknown", "4-byte"};
+ return parseStringAttribute("ABI_PCS_wchar_t", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_FP_rounding(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "IEEE-754", "Runtime" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_FP_rounding(AttrType tag) {
+ static const char *strings[] = {"IEEE-754", "Runtime"};
+ return parseStringAttribute("ABI_FP_rounding", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_FP_denormal(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Unsupported", "IEEE-754", "Sign Only"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_FP_denormal(AttrType tag) {
+ static const char *strings[] = {"Unsupported", "IEEE-754", "Sign Only"};
+ return parseStringAttribute("ABI_FP_denormal", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_FP_exceptions(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "IEEE-754" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_FP_exceptions(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "IEEE-754"};
+ return parseStringAttribute("ABI_FP_exceptions", tag, makeArrayRef(strings));
}
-
-void ARMAttributeParser::ABI_FP_user_exceptions(AttrType Tag,
- const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "IEEE-754" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_FP_user_exceptions(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "IEEE-754"};
+ return parseStringAttribute("ABI_FP_user_exceptions", tag,
+ makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_FP_number_model(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "Finite Only", "RTABI", "IEEE-754"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_FP_number_model(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Finite Only", "RTABI",
+ "IEEE-754"};
+ return parseStringAttribute("ABI_FP_number_model", tag,
+ makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_align_needed(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "8-byte alignment", "4-byte alignment", "Reserved"
- };
+Error ARMAttributeParser::ABI_align_needed(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "8-byte alignment",
+ "4-byte alignment", "Reserved"};
- uint64_t Value = ParseInteger(Data, Offset);
+ uint64_t value = de.getULEB128(cursor);
- std::string Description;
- if (Value < array_lengthof(Strings))
- Description = std::string(Strings[Value]);
- else if (Value <= 12)
- Description = std::string("8-byte alignment, ") + utostr(1ULL << Value)
- + std::string("-byte extended alignment");
+ std::string description;
+ if (value < array_lengthof(strings))
+ description = strings[value];
+ else if (value <= 12)
+ description = "8-byte alignment, " + utostr(1ULL << value) +
+ "-byte extended alignment";
else
- Description = "Invalid";
+ description = "Invalid";
- PrintAttribute(Tag, Value, Description);
+ printAttribute(tag, value, description);
+ return Error::success();
}
-void ARMAttributeParser::ABI_align_preserved(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Required", "8-byte data alignment", "8-byte data and code alignment",
- "Reserved"
- };
+Error ARMAttributeParser::ABI_align_preserved(AttrType tag) {
+ static const char *strings[] = {"Not Required", "8-byte data alignment",
+ "8-byte data and code alignment", "Reserved"};
- uint64_t Value = ParseInteger(Data, Offset);
+ uint64_t value = de.getULEB128(cursor);
- std::string Description;
- if (Value < array_lengthof(Strings))
- Description = std::string(Strings[Value]);
- else if (Value <= 12)
- Description = std::string("8-byte stack alignment, ") +
- utostr(1ULL << Value) + std::string("-byte data alignment");
+ std::string description;
+ if (value < array_lengthof(strings))
+ description = std::string(strings[value]);
+ else if (value <= 12)
+ description = std::string("8-byte stack alignment, ") +
+ utostr(1ULL << value) + std::string("-byte data alignment");
else
- Description = "Invalid";
+ description = "Invalid";
- PrintAttribute(Tag, Value, Description);
+ printAttribute(tag, value, description);
+ return Error::success();
}
-void ARMAttributeParser::ABI_enum_size(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "Packed", "Int32", "External Int32"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_enum_size(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Packed", "Int32",
+ "External Int32"};
+ return parseStringAttribute("ABI_enum_size", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_HardFP_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Tag_FP_arch", "Single-Precision", "Reserved", "Tag_FP_arch (deprecated)"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_HardFP_use(AttrType tag) {
+ static const char *strings[] = {"Tag_FP_arch", "Single-Precision", "Reserved",
+ "Tag_FP_arch (deprecated)"};
+ return parseStringAttribute("ABI_HardFP_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_VFP_args(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "AAPCS", "AAPCS VFP", "Custom", "Not Permitted"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_VFP_args(AttrType tag) {
+ static const char *strings[] = {"AAPCS", "AAPCS VFP", "Custom",
+ "Not Permitted"};
+ return parseStringAttribute("ABI_VFP_args", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_WMMX_args(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "AAPCS", "iWMMX", "Custom" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_WMMX_args(AttrType tag) {
+ static const char *strings[] = {"AAPCS", "iWMMX", "Custom"};
+ return parseStringAttribute("ABI_WMMX_args", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_optimization_goals(AttrType Tag,
- const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
+Error ARMAttributeParser::ABI_optimization_goals(AttrType tag) {
+ static const char *strings[] = {
"None", "Speed", "Aggressive Speed", "Size", "Aggressive Size", "Debugging",
"Best Debugging"
};
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
-}
-
-void ARMAttributeParser::ABI_FP_optimization_goals(AttrType Tag,
- const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "None", "Speed", "Aggressive Speed", "Size", "Aggressive Size", "Accuracy",
- "Best Accuracy"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
-}
-
-void ARMAttributeParser::compatibility(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- uint64_t Integer = ParseInteger(Data, Offset);
- StringRef String = ParseString(Data, Offset);
-
- if (SW) {
- DictScope AS(*SW, "Attribute");
- SW->printNumber("Tag", Tag);
- SW->startLine() << "Value: " << Integer << ", " << String << '\n';
- SW->printString("TagName", AttrTypeAsString(Tag, /*TagPrefix*/false));
- switch (Integer) {
+ return parseStringAttribute("ABI_optimization_goals", tag,
+ makeArrayRef(strings));
+}
+
+Error ARMAttributeParser::ABI_FP_optimization_goals(AttrType tag) {
+ static const char *strings[] = {
+ "None", "Speed", "Aggressive Speed", "Size", "Aggressive Size",
+ "Accuracy", "Best Accuracy"};
+ return parseStringAttribute("ABI_FP_optimization_goals", tag,
+ makeArrayRef(strings));
+}
+
+Error ARMAttributeParser::compatibility(AttrType tag) {
+ uint64_t integer = de.getULEB128(cursor);
+ StringRef string = de.getCStrRef(cursor);
+
+ if (sw) {
+ DictScope scope(*sw, "Attribute");
+ sw->printNumber("Tag", tag);
+ sw->startLine() << "Value: " << integer << ", " << string << '\n';
+ sw->printString("TagName",
+ ELFAttrs::attrTypeAsString(tag, tagToStringMap,
+ /*hasTagPrefix=*/false));
+ switch (integer) {
case 0:
- SW->printString("Description", StringRef("No Specific Requirements"));
+ sw->printString("Description", StringRef("No Specific Requirements"));
break;
case 1:
- SW->printString("Description", StringRef("AEABI Conformant"));
+ sw->printString("Description", StringRef("AEABI Conformant"));
break;
default:
- SW->printString("Description", StringRef("AEABI Non-Conformant"));
+ sw->printString("Description", StringRef("AEABI Non-Conformant"));
break;
}
}
+ return Error::success();
}
-void ARMAttributeParser::CPU_unaligned_access(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "v6-style" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::CPU_unaligned_access(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "v6-style"};
+ return parseStringAttribute("CPU_unaligned_access", tag,
+ makeArrayRef(strings));
}
-void ARMAttributeParser::FP_HP_extension(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "If Available", "Permitted" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::FP_HP_extension(AttrType tag) {
+ static const char *strings[] = {"If Available", "Permitted"};
+ return parseStringAttribute("FP_HP_extension", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::ABI_FP_16bit_format(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "IEEE-754", "VFPv3" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::ABI_FP_16bit_format(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "IEEE-754", "VFPv3"};
+ return parseStringAttribute("ABI_FP_16bit_format", tag,
+ makeArrayRef(strings));
}
-void ARMAttributeParser::MPextension_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "Permitted" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::MPextension_use(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Permitted"};
+ return parseStringAttribute("MPextension_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::DIV_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "If Available", "Not Permitted", "Permitted"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::DIV_use(AttrType tag) {
+ static const char *strings[] = {"If Available", "Not Permitted", "Permitted"};
+ return parseStringAttribute("DIV_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::DSP_extension(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "Permitted" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::DSP_extension(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Permitted"};
+ return parseStringAttribute("DSP_extension", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::T2EE_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = { "Not Permitted", "Permitted" };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::T2EE_use(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "Permitted"};
+ return parseStringAttribute("T2EE_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::Virtualization_use(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- static const char *const Strings[] = {
- "Not Permitted", "TrustZone", "Virtualization Extensions",
- "TrustZone + Virtualization Extensions"
- };
-
- uint64_t Value = ParseInteger(Data, Offset);
- StringRef ValueDesc =
- (Value < array_lengthof(Strings)) ? Strings[Value] : nullptr;
- PrintAttribute(Tag, Value, ValueDesc);
+Error ARMAttributeParser::Virtualization_use(AttrType tag) {
+ static const char *strings[] = {"Not Permitted", "TrustZone",
+ "Virtualization Extensions",
+ "TrustZone + Virtualization Extensions"};
+ return parseStringAttribute("Virtualization_use", tag, makeArrayRef(strings));
}
-void ARMAttributeParser::nodefaults(AttrType Tag, const uint8_t *Data,
- uint32_t &Offset) {
- uint64_t Value = ParseInteger(Data, Offset);
- PrintAttribute(Tag, Value, "Unspecified Tags UNDEFINED");
-}
-
-void ARMAttributeParser::ParseIndexList(const uint8_t *Data, uint32_t &Offset,
- SmallVectorImpl<uint8_t> &IndexList) {
- for (;;) {
- unsigned DecodeLength;
- uint64_t Value = decodeULEB128(Data + Offset, &DecodeLength);
- Offset += DecodeLength;
- if (Value == 0)
- break;
- IndexList.push_back(Value);
- }
+Error ARMAttributeParser::nodefaults(AttrType tag) {
+ uint64_t value = de.getULEB128(cursor);
+ printAttribute(tag, value, "Unspecified Tags UNDEFINED");
+ return Error::success();
}
-void ARMAttributeParser::ParseAttributeList(const uint8_t *Data,
- uint32_t &Offset, uint32_t Length) {
- while (Offset < Length) {
- unsigned DecodeLength;
- uint64_t Tag = decodeULEB128(Data + Offset, &DecodeLength);
- Offset += DecodeLength;
-
- bool Handled = false;
- for (unsigned AHI = 0, AHE = array_lengthof(DisplayRoutines);
- AHI != AHE && !Handled; ++AHI) {
- if (uint64_t(DisplayRoutines[AHI].Attribute) == Tag) {
- (this->*DisplayRoutines[AHI].Routine)(ARMBuildAttrs::AttrType(Tag),
- Data, Offset);
- Handled = true;
- break;
- }
- }
- if (!Handled) {
- if (Tag < 32) {
- errs() << "unhandled AEABI Tag " << Tag
- << " (" << ARMBuildAttrs::AttrTypeAsString(Tag) << ")\n";
- continue;
- }
-
- if (Tag % 2 == 0)
- IntegerAttribute(ARMBuildAttrs::AttrType(Tag), Data, Offset);
- else
- StringAttribute(ARMBuildAttrs::AttrType(Tag), Data, Offset);
- }
- }
-}
-
-void ARMAttributeParser::ParseSubsection(const uint8_t *Data, uint32_t Length) {
- uint32_t Offset = sizeof(uint32_t); /* SectionLength */
-
- const char *VendorName = reinterpret_cast<const char*>(Data + Offset);
- size_t VendorNameLength = std::strlen(VendorName);
- Offset = Offset + VendorNameLength + 1;
-
- if (SW) {
- SW->printNumber("SectionLength", Length);
- SW->printString("Vendor", StringRef(VendorName, VendorNameLength));
- }
-
- if (StringRef(VendorName, VendorNameLength).lower() != "aeabi") {
- return;
- }
-
- while (Offset < Length) {
- /// Tag_File | Tag_Section | Tag_Symbol uleb128:byte-size
- uint8_t Tag = Data[Offset];
- Offset = Offset + sizeof(Tag);
-
- uint32_t Size =
- *reinterpret_cast<const support::ulittle32_t*>(Data + Offset);
- Offset = Offset + sizeof(Size);
-
- if (SW) {
- SW->printEnum("Tag", Tag, makeArrayRef(TagNames));
- SW->printNumber("Size", Size);
- }
-
- if (Size > Length) {
- errs() << "subsection length greater than section length\n";
- return;
- }
-
- StringRef ScopeName, IndexName;
- SmallVector<uint8_t, 8> Indicies;
- switch (Tag) {
- case ARMBuildAttrs::File:
- ScopeName = "FileAttributes";
+Error ARMAttributeParser::handler(uint64_t tag, bool &handled) {
+ handled = false;
+ for (unsigned AHI = 0, AHE = array_lengthof(displayRoutines); AHI != AHE;
+ ++AHI) {
+ if (uint64_t(displayRoutines[AHI].attribute) == tag) {
+ if (Error e =
+ (this->*displayRoutines[AHI].routine)(static_cast<AttrType>(tag)))
+ return e;
+ handled = true;
break;
- case ARMBuildAttrs::Section:
- ScopeName = "SectionAttributes";
- IndexName = "Sections";
- ParseIndexList(Data, Offset, Indicies);
- break;
- case ARMBuildAttrs::Symbol:
- ScopeName = "SymbolAttributes";
- IndexName = "Symbols";
- ParseIndexList(Data, Offset, Indicies);
- break;
- default:
- errs() << "unrecognised tag: 0x" << Twine::utohexstr(Tag) << '\n';
- return;
- }
-
- if (SW) {
- DictScope ASS(*SW, ScopeName);
- if (!Indicies.empty())
- SW->printList(IndexName, Indicies);
- ParseAttributeList(Data, Offset, Length);
- } else {
- ParseAttributeList(Data, Offset, Length);
}
}
-}
-
-void ARMAttributeParser::Parse(ArrayRef<uint8_t> Section, bool isLittle) {
- uint64_t Offset = 1;
- unsigned SectionNumber = 0;
-
- while (Offset < Section.size()) {
- uint32_t SectionLength = isLittle ?
- support::endian::read32le(Section.data() + Offset) :
- support::endian::read32be(Section.data() + Offset);
-
- if (SW) {
- SW->startLine() << "Section " << ++SectionNumber << " {\n";
- SW->indent();
- }
- if (SectionLength == 0 || (SectionLength + Offset) > Section.size()) {
- errs() << "invalid subsection length " << SectionLength << " at offset "
- << Offset << "\n";
- return;
- }
-
- ParseSubsection(Section.data() + Offset, SectionLength);
- Offset = Offset + SectionLength;
-
- if (SW) {
- SW->unindent();
- SW->startLine() << "}\n";
- }
- }
-}
+ return Error::success();
}
diff --git a/llvm/lib/Support/ARMBuildAttrs.cpp b/llvm/lib/Support/ARMBuildAttrs.cpp
index d0c4fb792cb8..5aaf0a4e7c62 100644
--- a/llvm/lib/Support/ARMBuildAttrs.cpp
+++ b/llvm/lib/Support/ARMBuildAttrs.cpp
@@ -6,97 +6,63 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/StringRef.h"
#include "llvm/Support/ARMBuildAttributes.h"
using namespace llvm;
-namespace {
-const struct {
- ARMBuildAttrs::AttrType Attr;
- StringRef TagName;
-} ARMAttributeTags[] = {
- { ARMBuildAttrs::File, "Tag_File" },
- { ARMBuildAttrs::Section, "Tag_Section" },
- { ARMBuildAttrs::Symbol, "Tag_Symbol" },
- { ARMBuildAttrs::CPU_raw_name, "Tag_CPU_raw_name" },
- { ARMBuildAttrs::CPU_name, "Tag_CPU_name" },
- { ARMBuildAttrs::CPU_arch, "Tag_CPU_arch" },
- { ARMBuildAttrs::CPU_arch_profile, "Tag_CPU_arch_profile" },
- { ARMBuildAttrs::ARM_ISA_use, "Tag_ARM_ISA_use" },
- { ARMBuildAttrs::THUMB_ISA_use, "Tag_THUMB_ISA_use" },
- { ARMBuildAttrs::FP_arch, "Tag_FP_arch" },
- { ARMBuildAttrs::WMMX_arch, "Tag_WMMX_arch" },
- { ARMBuildAttrs::Advanced_SIMD_arch, "Tag_Advanced_SIMD_arch" },
- { ARMBuildAttrs::MVE_arch, "Tag_MVE_arch" },
- { ARMBuildAttrs::PCS_config, "Tag_PCS_config" },
- { ARMBuildAttrs::ABI_PCS_R9_use, "Tag_ABI_PCS_R9_use" },
- { ARMBuildAttrs::ABI_PCS_RW_data, "Tag_ABI_PCS_RW_data" },
- { ARMBuildAttrs::ABI_PCS_RO_data, "Tag_ABI_PCS_RO_data" },
- { ARMBuildAttrs::ABI_PCS_GOT_use, "Tag_ABI_PCS_GOT_use" },
- { ARMBuildAttrs::ABI_PCS_wchar_t, "Tag_ABI_PCS_wchar_t" },
- { ARMBuildAttrs::ABI_FP_rounding, "Tag_ABI_FP_rounding" },
- { ARMBuildAttrs::ABI_FP_denormal, "Tag_ABI_FP_denormal" },
- { ARMBuildAttrs::ABI_FP_exceptions, "Tag_ABI_FP_exceptions" },
- { ARMBuildAttrs::ABI_FP_user_exceptions, "Tag_ABI_FP_user_exceptions" },
- { ARMBuildAttrs::ABI_FP_number_model, "Tag_ABI_FP_number_model" },
- { ARMBuildAttrs::ABI_align_needed, "Tag_ABI_align_needed" },
- { ARMBuildAttrs::ABI_align_preserved, "Tag_ABI_align_preserved" },
- { ARMBuildAttrs::ABI_enum_size, "Tag_ABI_enum_size" },
- { ARMBuildAttrs::ABI_HardFP_use, "Tag_ABI_HardFP_use" },
- { ARMBuildAttrs::ABI_VFP_args, "Tag_ABI_VFP_args" },
- { ARMBuildAttrs::ABI_WMMX_args, "Tag_ABI_WMMX_args" },
- { ARMBuildAttrs::ABI_optimization_goals, "Tag_ABI_optimization_goals" },
- { ARMBuildAttrs::ABI_FP_optimization_goals, "Tag_ABI_FP_optimization_goals" },
- { ARMBuildAttrs::compatibility, "Tag_compatibility" },
- { ARMBuildAttrs::CPU_unaligned_access, "Tag_CPU_unaligned_access" },
- { ARMBuildAttrs::FP_HP_extension, "Tag_FP_HP_extension" },
- { ARMBuildAttrs::ABI_FP_16bit_format, "Tag_ABI_FP_16bit_format" },
- { ARMBuildAttrs::MPextension_use, "Tag_MPextension_use" },
- { ARMBuildAttrs::DIV_use, "Tag_DIV_use" },
- { ARMBuildAttrs::DSP_extension, "Tag_DSP_extension" },
- { ARMBuildAttrs::nodefaults, "Tag_nodefaults" },
- { ARMBuildAttrs::also_compatible_with, "Tag_also_compatible_with" },
- { ARMBuildAttrs::T2EE_use, "Tag_T2EE_use" },
- { ARMBuildAttrs::conformance, "Tag_conformance" },
- { ARMBuildAttrs::Virtualization_use, "Tag_Virtualization_use" },
+static const TagNameItem tagData[] = {
+ {ARMBuildAttrs::File, "Tag_File"},
+ {ARMBuildAttrs::Section, "Tag_Section"},
+ {ARMBuildAttrs::Symbol, "Tag_Symbol"},
+ {ARMBuildAttrs::CPU_raw_name, "Tag_CPU_raw_name"},
+ {ARMBuildAttrs::CPU_name, "Tag_CPU_name"},
+ {ARMBuildAttrs::CPU_arch, "Tag_CPU_arch"},
+ {ARMBuildAttrs::CPU_arch_profile, "Tag_CPU_arch_profile"},
+ {ARMBuildAttrs::ARM_ISA_use, "Tag_ARM_ISA_use"},
+ {ARMBuildAttrs::THUMB_ISA_use, "Tag_THUMB_ISA_use"},
+ {ARMBuildAttrs::FP_arch, "Tag_FP_arch"},
+ {ARMBuildAttrs::WMMX_arch, "Tag_WMMX_arch"},
+ {ARMBuildAttrs::Advanced_SIMD_arch, "Tag_Advanced_SIMD_arch"},
+ {ARMBuildAttrs::MVE_arch, "Tag_MVE_arch"},
+ {ARMBuildAttrs::PCS_config, "Tag_PCS_config"},
+ {ARMBuildAttrs::ABI_PCS_R9_use, "Tag_ABI_PCS_R9_use"},
+ {ARMBuildAttrs::ABI_PCS_RW_data, "Tag_ABI_PCS_RW_data"},
+ {ARMBuildAttrs::ABI_PCS_RO_data, "Tag_ABI_PCS_RO_data"},
+ {ARMBuildAttrs::ABI_PCS_GOT_use, "Tag_ABI_PCS_GOT_use"},
+ {ARMBuildAttrs::ABI_PCS_wchar_t, "Tag_ABI_PCS_wchar_t"},
+ {ARMBuildAttrs::ABI_FP_rounding, "Tag_ABI_FP_rounding"},
+ {ARMBuildAttrs::ABI_FP_denormal, "Tag_ABI_FP_denormal"},
+ {ARMBuildAttrs::ABI_FP_exceptions, "Tag_ABI_FP_exceptions"},
+ {ARMBuildAttrs::ABI_FP_user_exceptions, "Tag_ABI_FP_user_exceptions"},
+ {ARMBuildAttrs::ABI_FP_number_model, "Tag_ABI_FP_number_model"},
+ {ARMBuildAttrs::ABI_align_needed, "Tag_ABI_align_needed"},
+ {ARMBuildAttrs::ABI_align_preserved, "Tag_ABI_align_preserved"},
+ {ARMBuildAttrs::ABI_enum_size, "Tag_ABI_enum_size"},
+ {ARMBuildAttrs::ABI_HardFP_use, "Tag_ABI_HardFP_use"},
+ {ARMBuildAttrs::ABI_VFP_args, "Tag_ABI_VFP_args"},
+ {ARMBuildAttrs::ABI_WMMX_args, "Tag_ABI_WMMX_args"},
+ {ARMBuildAttrs::ABI_optimization_goals, "Tag_ABI_optimization_goals"},
+ {ARMBuildAttrs::ABI_FP_optimization_goals, "Tag_ABI_FP_optimization_goals"},
+ {ARMBuildAttrs::compatibility, "Tag_compatibility"},
+ {ARMBuildAttrs::CPU_unaligned_access, "Tag_CPU_unaligned_access"},
+ {ARMBuildAttrs::FP_HP_extension, "Tag_FP_HP_extension"},
+ {ARMBuildAttrs::ABI_FP_16bit_format, "Tag_ABI_FP_16bit_format"},
+ {ARMBuildAttrs::MPextension_use, "Tag_MPextension_use"},
+ {ARMBuildAttrs::DIV_use, "Tag_DIV_use"},
+ {ARMBuildAttrs::DSP_extension, "Tag_DSP_extension"},
+ {ARMBuildAttrs::nodefaults, "Tag_nodefaults"},
+ {ARMBuildAttrs::also_compatible_with, "Tag_also_compatible_with"},
+ {ARMBuildAttrs::T2EE_use, "Tag_T2EE_use"},
+ {ARMBuildAttrs::conformance, "Tag_conformance"},
+ {ARMBuildAttrs::Virtualization_use, "Tag_Virtualization_use"},
- // Legacy Names
- { ARMBuildAttrs::FP_arch, "Tag_VFP_arch" },
- { ARMBuildAttrs::FP_HP_extension, "Tag_VFP_HP_extension" },
- { ARMBuildAttrs::ABI_align_needed, "Tag_ABI_align8_needed" },
- { ARMBuildAttrs::ABI_align_preserved, "Tag_ABI_align8_preserved" },
+ // Legacy Names
+ {ARMBuildAttrs::FP_arch, "Tag_VFP_arch"},
+ {ARMBuildAttrs::FP_HP_extension, "Tag_VFP_HP_extension"},
+ {ARMBuildAttrs::ABI_align_needed, "Tag_ABI_align8_needed"},
+ {ARMBuildAttrs::ABI_align_preserved, "Tag_ABI_align8_preserved"},
};
-}
-
-namespace llvm {
-namespace ARMBuildAttrs {
-StringRef AttrTypeAsString(unsigned Attr, bool HasTagPrefix) {
- return AttrTypeAsString(static_cast<AttrType>(Attr), HasTagPrefix);
-}
-
-StringRef AttrTypeAsString(AttrType Attr, bool HasTagPrefix) {
- for (unsigned TI = 0, TE = sizeof(ARMAttributeTags) / sizeof(*ARMAttributeTags);
- TI != TE; ++TI)
- if (ARMAttributeTags[TI].Attr == Attr) {
- auto TagName = ARMAttributeTags[TI].TagName;
- return HasTagPrefix ? TagName : TagName.drop_front(4);
- }
- return "";
-}
-
-int AttrTypeFromString(StringRef Tag) {
- bool HasTagPrefix = Tag.startswith("Tag_");
- for (unsigned TI = 0,
- TE = sizeof(ARMAttributeTags) / sizeof(*ARMAttributeTags);
- TI != TE; ++TI) {
- auto TagName = ARMAttributeTags[TI].TagName;
- if (TagName.drop_front(HasTagPrefix ? 0 : 4) == Tag) {
- return ARMAttributeTags[TI].Attr;
- }
- }
- return -1;
-}
-}
-}
+const TagNameMap llvm::ARMBuildAttrs::ARMAttributeTags(tagData,
+ sizeof(tagData) /
+ sizeof(TagNameItem));
diff --git a/llvm/lib/Support/ARMTargetParser.cpp b/llvm/lib/Support/ARMTargetParser.cpp
index f2c22fd93c8b..56a91f7dc787 100644
--- a/llvm/lib/Support/ARMTargetParser.cpp
+++ b/llvm/lib/Support/ARMTargetParser.cpp
@@ -13,6 +13,7 @@
#include "llvm/Support/ARMTargetParser.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
#include <cctype>
using namespace llvm;
@@ -27,7 +28,7 @@ static StringRef getHWDivSynonym(StringRef HWDiv) {
ARM::ArchKind ARM::parseArch(StringRef Arch) {
Arch = getCanonicalArchName(Arch);
StringRef Syn = getArchSynonym(Arch);
- for (const auto A : ARCHNames) {
+ for (const auto &A : ARCHNames) {
if (A.getName().endswith(Syn))
return A.ID;
}
@@ -74,6 +75,7 @@ unsigned ARM::parseArchVersion(StringRef Arch) {
case ArchKind::ARMV8_3A:
case ArchKind::ARMV8_4A:
case ArchKind::ARMV8_5A:
+ case ArchKind::ARMV8_6A:
case ArchKind::ARMV8R:
case ArchKind::ARMV8MBaseline:
case ArchKind::ARMV8MMainline:
@@ -108,6 +110,7 @@ ARM::ProfileKind ARM::parseArchProfile(StringRef Arch) {
case ArchKind::ARMV8_3A:
case ArchKind::ARMV8_4A:
case ArchKind::ARMV8_5A:
+ case ArchKind::ARMV8_6A:
return ProfileKind::A;
case ArchKind::ARMV2:
case ArchKind::ARMV2A:
@@ -150,6 +153,7 @@ StringRef ARM::getArchSynonym(StringRef Arch) {
.Case("v8.3a", "v8.3-a")
.Case("v8.4a", "v8.4-a")
.Case("v8.5a", "v8.5-a")
+ .Case("v8.6a", "v8.6-a")
.Case("v8r", "v8-r")
.Case("v8m.base", "v8-m.base")
.Case("v8m.main", "v8-m.main")
@@ -367,11 +371,11 @@ unsigned ARM::getDefaultFPU(StringRef CPU, ARM::ArchKind AK) {
.Default(ARM::FK_INVALID);
}
-unsigned ARM::getDefaultExtensions(StringRef CPU, ARM::ArchKind AK) {
+uint64_t ARM::getDefaultExtensions(StringRef CPU, ARM::ArchKind AK) {
if (CPU == "generic")
return ARM::ARCHNames[static_cast<unsigned>(AK)].ArchBaseExtensions;
- return StringSwitch<unsigned>(CPU)
+ return StringSwitch<uint64_t>(CPU)
#define ARM_CPU_NAME(NAME, ID, DEFAULT_FPU, IS_DEFAULT, DEFAULT_EXT) \
.Case(NAME, \
ARCHNames[static_cast<unsigned>(ArchKind::ID)].ArchBaseExtensions | \
@@ -380,7 +384,7 @@ unsigned ARM::getDefaultExtensions(StringRef CPU, ARM::ArchKind AK) {
.Default(ARM::AEK_INVALID);
}
-bool ARM::getHWDivFeatures(unsigned HWDivKind,
+bool ARM::getHWDivFeatures(uint64_t HWDivKind,
std::vector<StringRef> &Features) {
if (HWDivKind == AEK_INVALID)
@@ -399,7 +403,7 @@ bool ARM::getHWDivFeatures(unsigned HWDivKind,
return true;
}
-bool ARM::getExtensionFeatures(unsigned Extensions,
+bool ARM::getExtensionFeatures(uint64_t Extensions,
std::vector<StringRef> &Features) {
if (Extensions == AEK_INVALID)
@@ -431,7 +435,7 @@ unsigned ARM::getArchAttr(ARM::ArchKind AK) {
return ARCHNames[static_cast<unsigned>(AK)].ArchAttr;
}
-StringRef ARM::getArchExtName(unsigned ArchExtKind) {
+StringRef ARM::getArchExtName(uint64_t ArchExtKind) {
for (const auto AE : ARCHExtNames) {
if (ArchExtKind == AE.ID)
return AE.getName();
@@ -486,29 +490,25 @@ static unsigned findDoublePrecisionFPU(unsigned InputFPUKind) {
return ARM::FK_INVALID;
}
-static unsigned getAEKID(StringRef ArchExtName) {
- for (const auto AE : ARM::ARCHExtNames)
- if (AE.getName() == ArchExtName)
- return AE.ID;
- return ARM::AEK_INVALID;
-}
-
bool ARM::appendArchExtFeatures(
StringRef CPU, ARM::ArchKind AK, StringRef ArchExt,
std::vector<StringRef> &Features) {
size_t StartingNumFeatures = Features.size();
const bool Negated = stripNegationPrefix(ArchExt);
- unsigned ID = getAEKID(ArchExt);
+ uint64_t ID = parseArchExt(ArchExt);
if (ID == AEK_INVALID)
return false;
for (const auto AE : ARCHExtNames) {
- if (Negated && (AE.ID & ID) == ID && AE.NegFeature)
- Features.push_back(AE.NegFeature);
- else if (AE.ID == ID && AE.Feature)
- Features.push_back(AE.Feature);
+ if (Negated) {
+ if ((AE.ID & ID) == ID && AE.NegFeature)
+ Features.push_back(AE.NegFeature);
+ } else {
+ if ((AE.ID & ID) == AE.ID && AE.Feature)
+ Features.push_back(AE.Feature);
+ }
}
if (CPU == "")
@@ -532,7 +532,7 @@ bool ARM::appendArchExtFeatures(
return StartingNumFeatures != Features.size();
}
-StringRef ARM::getHWDivName(unsigned HWDivKind) {
+StringRef ARM::getHWDivName(uint64_t HWDivKind) {
for (const auto D : HWDivNames) {
if (HWDivKind == D.ID)
return D.getName();
@@ -555,7 +555,7 @@ StringRef ARM::getDefaultCPU(StringRef Arch) {
return "generic";
}
-unsigned ARM::parseHWDiv(StringRef HWDiv) {
+uint64_t ARM::parseHWDiv(StringRef HWDiv) {
StringRef Syn = getHWDivSynonym(HWDiv);
for (const auto D : HWDivNames) {
if (Syn == D.getName())
@@ -564,7 +564,7 @@ unsigned ARM::parseHWDiv(StringRef HWDiv) {
return AEK_INVALID;
}
-unsigned ARM::parseArchExt(StringRef ArchExt) {
+uint64_t ARM::parseArchExt(StringRef ArchExt) {
for (const auto A : ARCHExtNames) {
if (ArchExt == A.getName())
return A.ID;
diff --git a/llvm/lib/Support/BranchProbability.cpp b/llvm/lib/Support/BranchProbability.cpp
index 195e2d58d8e1..60d5478a9052 100644
--- a/llvm/lib/Support/BranchProbability.cpp
+++ b/llvm/lib/Support/BranchProbability.cpp
@@ -19,7 +19,7 @@
using namespace llvm;
-const uint32_t BranchProbability::D;
+constexpr uint32_t BranchProbability::D;
raw_ostream &BranchProbability::print(raw_ostream &OS) const {
if (isUnknown())
diff --git a/llvm/lib/Support/CRC.cpp b/llvm/lib/Support/CRC.cpp
index a3dba1a3aa10..7ff09debe3b7 100644
--- a/llvm/lib/Support/CRC.cpp
+++ b/llvm/lib/Support/CRC.cpp
@@ -25,7 +25,7 @@
using namespace llvm;
-#if !LLVM_ENABLE_ZLIB
+#if LLVM_ENABLE_ZLIB == 0 || !HAVE_ZLIB_H
static const uint32_t CRCTable[256] = {
0x00000000, 0x77073096, 0xee0e612c, 0x990951ba, 0x076dc419, 0x706af48f,
@@ -85,7 +85,15 @@ uint32_t llvm::crc32(uint32_t CRC, ArrayRef<uint8_t> Data) {
#include <zlib.h>
uint32_t llvm::crc32(uint32_t CRC, ArrayRef<uint8_t> Data) {
- return ::crc32(CRC, (const Bytef *)Data.data(), Data.size());
+ // Zlib's crc32() only takes a 32-bit length, so we have to iterate for larger
+ // sizes. One could use crc32_z() instead, but that's a recent (2017) addition
+ // and may not be available on all systems.
+ do {
+ ArrayRef<uint8_t> Slice = Data.take_front(UINT32_MAX);
+ CRC = ::crc32(CRC, (const Bytef *)Slice.data(), (uInt)Slice.size());
+ Data = Data.drop_front(Slice.size());
+ } while (Data.size() > 0);
+ return CRC;
}
#endif
diff --git a/llvm/lib/Support/CachePruning.cpp b/llvm/lib/Support/CachePruning.cpp
index 7a2f6c53435a..7663644db558 100644
--- a/llvm/lib/Support/CachePruning.cpp
+++ b/llvm/lib/Support/CachePruning.cpp
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/CachePruning.h"
-
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
diff --git a/llvm/lib/Support/CodeGenCoverage.cpp b/llvm/lib/Support/CodeGenCoverage.cpp
index 2db4193ce382..93f386b6e23d 100644
--- a/llvm/lib/Support/CodeGenCoverage.cpp
+++ b/llvm/lib/Support/CodeGenCoverage.cpp
@@ -11,20 +11,14 @@
#include "llvm/Support/CodeGenCoverage.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/Mutex.h"
+#include "llvm/Support/Process.h"
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/ToolOutputFile.h"
-#if LLVM_ON_UNIX
-#include <unistd.h>
-#elif defined(_WIN32)
-#include <windows.h>
-#endif
-
using namespace llvm;
static sys::SmartMutex<true> OutputMutex;
@@ -89,14 +83,7 @@ bool CodeGenCoverage::emit(StringRef CoveragePrefix,
// We can handle locking within a process easily enough but we don't want to
// manage it between multiple processes. Use the process ID to ensure no
// more than one process is ever writing to the same file at the same time.
- std::string Pid =
-#if LLVM_ON_UNIX
- llvm::to_string(::getpid());
-#elif defined(_WIN32)
- llvm::to_string(::GetCurrentProcessId());
-#else
- "";
-#endif
+ std::string Pid = llvm::to_string(sys::Process::getProcessId());
std::string CoverageFilename = (CoveragePrefix + Pid).str();
diff --git a/llvm/lib/Support/CommandLine.cpp b/llvm/lib/Support/CommandLine.cpp
index cb73380ba383..12ef0d511b14 100644
--- a/llvm/lib/Support/CommandLine.cpp
+++ b/llvm/lib/Support/CommandLine.cpp
@@ -592,6 +592,10 @@ static Option *LookupNearestOption(StringRef Arg,
ie = OptionsMap.end();
it != ie; ++it) {
Option *O = it->second;
+ // Do not suggest really hidden options (not shown in any help).
+ if (O->getOptionHiddenFlag() == ReallyHidden)
+ continue;
+
SmallVector<StringRef, 16> OptionNames;
O->getExtraOptionNames(OptionNames);
if (O->hasArgStr())
@@ -606,7 +610,7 @@ static Option *LookupNearestOption(StringRef Arg,
Best = O;
BestDistance = Distance;
if (RHS.empty() || !PermitValue)
- NearestString = Name;
+ NearestString = std::string(Name);
else
NearestString = (Twine(Name) + "=" + RHS).str();
}
@@ -919,91 +923,118 @@ static size_t parseBackslash(StringRef Src, size_t I, SmallString<128> &Token) {
return I - 1;
}
-void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
- SmallVectorImpl<const char *> &NewArgv,
- bool MarkEOLs) {
+// Windows treats whitespace, double quotes, and backslashes specially.
+static bool isWindowsSpecialChar(char C) {
+ return isWhitespaceOrNull(C) || C == '\\' || C == '\"';
+}
+
+// Windows tokenization implementation. The implementation is designed to be
+// inlined and specialized for the two user entry points.
+static inline void
+tokenizeWindowsCommandLineImpl(StringRef Src, StringSaver &Saver,
+ function_ref<void(StringRef)> AddToken,
+ bool AlwaysCopy, function_ref<void()> MarkEOL) {
SmallString<128> Token;
- // This is a small state machine to consume characters until it reaches the
- // end of the source string.
+ // Try to do as much work inside the state machine as possible.
enum { INIT, UNQUOTED, QUOTED } State = INIT;
- for (size_t I = 0, E = Src.size(); I != E; ++I) {
- char C = Src[I];
-
- // INIT state indicates that the current input index is at the start of
- // the string or between tokens.
- if (State == INIT) {
- if (isWhitespaceOrNull(C)) {
- // Mark the end of lines in response files
- if (MarkEOLs && C == '\n')
- NewArgv.push_back(nullptr);
- continue;
+ for (size_t I = 0, E = Src.size(); I < E; ++I) {
+ switch (State) {
+ case INIT: {
+ assert(Token.empty() && "token should be empty in initial state");
+ // Eat whitespace before a token.
+ while (I < E && isWhitespaceOrNull(Src[I])) {
+ if (Src[I] == '\n')
+ MarkEOL();
+ ++I;
}
- if (C == '"') {
+ // Stop if this was trailing whitespace.
+ if (I >= E)
+ break;
+ size_t Start = I;
+ while (I < E && !isWindowsSpecialChar(Src[I]))
+ ++I;
+ StringRef NormalChars = Src.slice(Start, I);
+ if (I >= E || isWhitespaceOrNull(Src[I])) {
+ if (I < E && Src[I] == '\n')
+ MarkEOL();
+ // No special characters: slice out the substring and start the next
+ // token. Copy the string if the caller asks us to.
+ AddToken(AlwaysCopy ? Saver.save(NormalChars) : NormalChars);
+ } else if (Src[I] == '\"') {
+ Token += NormalChars;
State = QUOTED;
- continue;
- }
- if (C == '\\') {
+ } else if (Src[I] == '\\') {
+ Token += NormalChars;
I = parseBackslash(Src, I, Token);
State = UNQUOTED;
- continue;
+ } else {
+ llvm_unreachable("unexpected special character");
}
- Token.push_back(C);
- State = UNQUOTED;
- continue;
+ break;
}
- // UNQUOTED state means that it's reading a token not quoted by double
- // quotes.
- if (State == UNQUOTED) {
- // Whitespace means the end of the token.
- if (isWhitespaceOrNull(C)) {
- NewArgv.push_back(Saver.save(StringRef(Token)).data());
+ case UNQUOTED:
+ if (isWhitespaceOrNull(Src[I])) {
+ // Whitespace means the end of the token. If we are in this state, the
+ // token must have contained a special character, so we must copy the
+ // token.
+ AddToken(Saver.save(Token.str()));
Token.clear();
+ if (Src[I] == '\n')
+ MarkEOL();
State = INIT;
- // Mark the end of lines in response files
- if (MarkEOLs && C == '\n')
- NewArgv.push_back(nullptr);
- continue;
- }
- if (C == '"') {
+ } else if (Src[I] == '\"') {
State = QUOTED;
- continue;
- }
- if (C == '\\') {
+ } else if (Src[I] == '\\') {
I = parseBackslash(Src, I, Token);
- continue;
+ } else {
+ Token.push_back(Src[I]);
}
- Token.push_back(C);
- continue;
- }
+ break;
- // QUOTED state means that it's reading a token quoted by double quotes.
- if (State == QUOTED) {
- if (C == '"') {
+ case QUOTED:
+ if (Src[I] == '\"') {
if (I < (E - 1) && Src[I + 1] == '"') {
// Consecutive double-quotes inside a quoted string imply one
// double-quote.
Token.push_back('"');
- I = I + 1;
- continue;
+ ++I;
+ } else {
+ // Otherwise, end the quoted portion and return to the unquoted state.
+ State = UNQUOTED;
}
- State = UNQUOTED;
- continue;
- }
- if (C == '\\') {
+ } else if (Src[I] == '\\') {
I = parseBackslash(Src, I, Token);
- continue;
+ } else {
+ Token.push_back(Src[I]);
}
- Token.push_back(C);
+ break;
}
}
- // Append the last token after hitting EOF with no whitespace.
- if (!Token.empty())
- NewArgv.push_back(Saver.save(StringRef(Token)).data());
- // Mark the end of response files
- if (MarkEOLs)
- NewArgv.push_back(nullptr);
+
+ if (State == UNQUOTED)
+ AddToken(Saver.save(Token.str()));
+}
+
+void cl::TokenizeWindowsCommandLine(StringRef Src, StringSaver &Saver,
+ SmallVectorImpl<const char *> &NewArgv,
+ bool MarkEOLs) {
+ auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok.data()); };
+ auto OnEOL = [&]() {
+ if (MarkEOLs)
+ NewArgv.push_back(nullptr);
+ };
+ tokenizeWindowsCommandLineImpl(Src, Saver, AddToken,
+ /*AlwaysCopy=*/true, OnEOL);
+}
+
+void cl::TokenizeWindowsCommandLineNoCopy(StringRef Src, StringSaver &Saver,
+ SmallVectorImpl<StringRef> &NewArgv) {
+ auto AddToken = [&](StringRef Tok) { NewArgv.push_back(Tok); };
+ auto OnEOL = []() {};
+ tokenizeWindowsCommandLineImpl(Src, Saver, AddToken, /*AlwaysCopy=*/false,
+ OnEOL);
}
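
An illustrative driver for the two entry points above (not part of the change; the command line literal is made up). It exercises the quoting rules the state machine implements, including the doubled-quote escape inside a quoted region.

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/StringSaver.h"

void demoWindowsTokenizer() {
  llvm::BumpPtrAllocator Alloc;
  llvm::StringSaver Saver(Alloc);

  // Expected tokens: prog.exe | a b | say "hi"
  llvm::StringRef Src = R"(prog.exe "a b" "say ""hi""")";

  llvm::SmallVector<const char *, 4> Argv;
  llvm::cl::TokenizeWindowsCommandLine(Src, Saver, Argv, /*MarkEOLs=*/false);

  // The NoCopy flavour hands out StringRefs; tokens that contain no special
  // characters point straight into Src instead of being copied into Saver.
  llvm::SmallVector<llvm::StringRef, 4> Tokens;
  llvm::cl::TokenizeWindowsCommandLineNoCopy(Src, Saver, Tokens);
}
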
void cl::tokenizeConfigFile(StringRef Source, StringSaver &Saver,
@@ -1324,7 +1355,7 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
argc = static_cast<int>(newArgv.size());
// Copy the program name into ProgName, making sure not to overflow it.
- ProgramName = sys::path::filename(StringRef(argv[0]));
+ ProgramName = std::string(sys::path::filename(StringRef(argv[0])));
ProgramOverview = Overview;
bool IgnoreErrors = Errs;
@@ -1581,9 +1612,9 @@ bool CommandLineParser::ParseCommandLineOptions(int argc,
} else {
assert(ConsumeAfterOpt && NumPositionalRequired <= PositionalVals.size());
unsigned ValNo = 0;
- for (size_t j = 1, e = PositionalOpts.size(); j != e; ++j)
- if (RequiresValue(PositionalOpts[j])) {
- ErrorParsing |= ProvidePositionalOption(PositionalOpts[j],
+ for (size_t J = 0, E = PositionalOpts.size(); J != E; ++J)
+ if (RequiresValue(PositionalOpts[J])) {
+ ErrorParsing |= ProvidePositionalOption(PositionalOpts[J],
PositionalVals[ValNo].first,
PositionalVals[ValNo].second);
ValNo++;
@@ -1751,9 +1782,10 @@ void basic_parser_impl::printOptionInfo(const Option &O,
if (!ValName.empty()) {
if (O.getMiscFlags() & PositionalEatsArgs) {
outs() << " <" << getValueStr(O, ValName) << ">...";
- } else {
+ } else if (O.getValueExpectedFlag() == ValueOptional)
+ outs() << "[=<" << getValueStr(O, ValName) << ">]";
+ else
outs() << "=<" << getValueStr(O, ValName) << '>';
- }
}
Option::printHelpStr(O.HelpStr, GlobalWidth, getOptionWidth(O));
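
With the branch above, an option marked ValueOptional now advertises its value as optional in --help output. An illustrative declaration (the option name, value name, and description are made up); its help line renders roughly as "--remark-filter[=<regex>] - Only show remarks matching the filter" instead of forcing "=<regex>".

#include "llvm/Support/CommandLine.h"
#include <string>

// Hypothetical option used only to show the [=<...>] rendering.
static llvm::cl::opt<std::string>
    RemarkFilter("remark-filter",
                 llvm::cl::desc("Only show remarks matching the filter"),
                 llvm::cl::value_desc("regex"), llvm::cl::ValueOptional,
                 llvm::cl::init(""));
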
@@ -2482,7 +2514,7 @@ public:
OS << " with assertions";
#endif
#if LLVM_VERSION_PRINTER_SHOW_HOST_TARGET_INFO
- std::string CPU = sys::getHostCPUName();
+ std::string CPU = std::string(sys::getHostCPUName());
if (CPU == "generic")
CPU = "(unknown)";
OS << ".\n"
@@ -2505,7 +2537,7 @@ public:
// information.
if (ExtraVersionPrinters != nullptr) {
outs() << '\n';
- for (auto I : *ExtraVersionPrinters)
+ for (const auto &I : *ExtraVersionPrinters)
I(outs());
}
diff --git a/llvm/lib/Support/Compression.cpp b/llvm/lib/Support/Compression.cpp
index 4165a2740cd0..27d92f0e0aec 100644
--- a/llvm/lib/Support/Compression.cpp
+++ b/llvm/lib/Support/Compression.cpp
@@ -17,13 +17,13 @@
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorHandling.h"
-#if LLVM_ENABLE_ZLIB
+#if LLVM_ENABLE_ZLIB == 1 && HAVE_ZLIB_H
#include <zlib.h>
#endif
using namespace llvm;
-#if LLVM_ENABLE_ZLIB
+#if LLVM_ENABLE_ZLIB == 1 && HAVE_LIBZ
static Error createError(StringRef Err) {
return make_error<StringError>(Err, inconvertibleErrorCode());
}
@@ -74,10 +74,10 @@ Error zlib::uncompress(StringRef InputBuffer, char *UncompressedBuffer,
Error zlib::uncompress(StringRef InputBuffer,
SmallVectorImpl<char> &UncompressedBuffer,
size_t UncompressedSize) {
- UncompressedBuffer.resize(UncompressedSize);
+ UncompressedBuffer.reserve(UncompressedSize);
Error E =
uncompress(InputBuffer, UncompressedBuffer.data(), UncompressedSize);
- UncompressedBuffer.resize(UncompressedSize);
+ UncompressedBuffer.set_size(UncompressedSize);
return E;
}
diff --git a/llvm/lib/Support/ConvertUTFWrapper.cpp b/llvm/lib/Support/ConvertUTFWrapper.cpp
index eb4ead6b46b4..6ec567882ea6 100644
--- a/llvm/lib/Support/ConvertUTFWrapper.cpp
+++ b/llvm/lib/Support/ConvertUTFWrapper.cpp
@@ -102,7 +102,7 @@ bool convertUTF16ToUTF8String(ArrayRef<char> SrcBytes, std::string &Out) {
if (Src[0] == UNI_UTF16_BYTE_ORDER_MARK_SWAPPED) {
ByteSwapped.insert(ByteSwapped.end(), Src, SrcEnd);
for (unsigned I = 0, E = ByteSwapped.size(); I != E; ++I)
- ByteSwapped[I] = llvm::sys::SwapByteOrder_16(ByteSwapped[I]);
+ ByteSwapped[I] = llvm::ByteSwap_16(ByteSwapped[I]);
Src = &ByteSwapped[0];
SrcEnd = &ByteSwapped[ByteSwapped.size() - 1] + 1;
}
diff --git a/llvm/lib/Support/CrashRecoveryContext.cpp b/llvm/lib/Support/CrashRecoveryContext.cpp
index b9031f52375c..ec7d7d641dce 100644
--- a/llvm/lib/Support/CrashRecoveryContext.cpp
+++ b/llvm/lib/Support/CrashRecoveryContext.cpp
@@ -14,9 +14,6 @@
#include "llvm/Support/ThreadLocal.h"
#include <mutex>
#include <setjmp.h>
-#ifdef _WIN32
-#include <excpt.h> // for GetExceptionInformation
-#endif
#if LLVM_ON_UNIX
#include <sysexits.h> // EX_IOERR
#endif
@@ -41,11 +38,11 @@ struct CrashRecoveryContextImpl {
::jmp_buf JumpBuffer;
volatile unsigned Failed : 1;
unsigned SwitchedThread : 1;
+ unsigned ValidJumpBuffer : 1;
public:
- CrashRecoveryContextImpl(CrashRecoveryContext *CRC) : CRC(CRC),
- Failed(false),
- SwitchedThread(false) {
+ CrashRecoveryContextImpl(CrashRecoveryContext *CRC) noexcept
+ : CRC(CRC), Failed(false), SwitchedThread(false), ValidJumpBuffer(false) {
Next = CurrentContext->get();
CurrentContext->set(this);
}
@@ -80,10 +77,13 @@ public:
CRC->RetCode = RetCode;
// Jump back to the RunSafely we were called under.
- longjmp(JumpBuffer, 1);
+ if (ValidJumpBuffer)
+ longjmp(JumpBuffer, 1);
+
+ // Otherwise let the caller decide the outcome of the crash. Currently
+ // this occurs when using SEH on Windows with MSVC or clang-cl.
}
};
-
}
static ManagedStatic<std::mutex> gCrashRecoveryContextMutex;
@@ -175,6 +175,9 @@ CrashRecoveryContext::unregisterCleanup(CrashRecoveryContextCleanup *cleanup) {
}
#if defined(_MSC_VER)
+
+#include <windows.h> // for GetExceptionInformation
+
// If _MSC_VER is defined, we must have SEH. Use it if it's available. It's way
// better than VEH. Vectored exception handling catches all exceptions happening
// on the thread with installed exception handlers, so it can interfere with
@@ -188,30 +191,45 @@ static void uninstallExceptionOrSignalHandlers() {}
// We need this function because the call to GetExceptionInformation() can only
// occur inside the __except evaluation block
-static int ExceptionFilter(bool DumpStackAndCleanup,
- _EXCEPTION_POINTERS *Except) {
- if (DumpStackAndCleanup)
- sys::CleanupOnSignal((uintptr_t)Except);
- return EXCEPTION_EXECUTE_HANDLER;
-}
+static int ExceptionFilter(_EXCEPTION_POINTERS *Except) {
+ // Lookup the current thread local recovery object.
+ const CrashRecoveryContextImpl *CRCI = CurrentContext->get();
-static bool InvokeFunctionCall(function_ref<void()> Fn,
- bool DumpStackAndCleanup, int &RetCode) {
- __try {
- Fn();
- } __except (ExceptionFilter(DumpStackAndCleanup, GetExceptionInformation())) {
- RetCode = GetExceptionCode();
- return false;
+ if (!CRCI) {
+ // Something has gone horribly wrong, so let's just tell everyone
+ // to keep searching
+ CrashRecoveryContext::Disable();
+ return EXCEPTION_CONTINUE_SEARCH;
}
- return true;
+
+ int RetCode = (int)Except->ExceptionRecord->ExceptionCode;
+ if ((RetCode & 0xF0000000) == 0xE0000000)
+ RetCode &= ~0xF0000000; // this crash was generated by sys::Process::Exit
+
+ // Handle the crash
+ const_cast<CrashRecoveryContextImpl *>(CRCI)->HandleCrash(
+ RetCode, reinterpret_cast<uintptr_t>(Except));
+
+ return EXCEPTION_EXECUTE_HANDLER;
}
+#if defined(__clang__) && defined(_M_IX86)
+// Work around PR44697.
+__attribute__((optnone))
+#endif
bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
if (!gCrashRecoveryEnabled) {
Fn();
return true;
}
- return InvokeFunctionCall(Fn, DumpStackAndCleanupOnFailure, RetCode);
+ assert(!Impl && "Crash recovery context already initialized!");
+ Impl = new CrashRecoveryContextImpl(this);
+ __try {
+ Fn();
+ } __except (ExceptionFilter(GetExceptionInformation())) {
+ return false;
+ }
+ return true;
}
#else // !_MSC_VER
@@ -236,7 +254,7 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
// XP, so if support for older versions of Windows is required,
// it will have to be added.
-#include "Windows/WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
{
@@ -264,10 +282,13 @@ static LONG CALLBACK ExceptionHandler(PEXCEPTION_POINTERS ExceptionInfo)
// TODO: We can capture the stack backtrace here and store it on the
// implementation if we so choose.
+ int RetCode = (int)ExceptionInfo->ExceptionRecord->ExceptionCode;
+ if ((RetCode & 0xF0000000) == 0xE0000000)
+ RetCode &= ~0xF0000000; // this crash was generated by sys::Process::Exit
+
// Handle the crash
const_cast<CrashRecoveryContextImpl *>(CRCI)->HandleCrash(
- (int)ExceptionInfo->ExceptionRecord->ExceptionCode,
- reinterpret_cast<uintptr_t>(ExceptionInfo));
+ RetCode, reinterpret_cast<uintptr_t>(ExceptionInfo));
// Note that we don't actually get here because HandleCrash calls
// longjmp, which means the HandleCrash function never returns.
@@ -388,6 +409,7 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
CrashRecoveryContextImpl *CRCI = new CrashRecoveryContextImpl(this);
Impl = CRCI;
+ CRCI->ValidJumpBuffer = true;
if (setjmp(CRCI->JumpBuffer) != 0) {
return false;
}
@@ -399,12 +421,19 @@ bool CrashRecoveryContext::RunSafely(function_ref<void()> Fn) {
#endif // !_MSC_VER
-void CrashRecoveryContext::HandleCrash() {
- CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *) Impl;
+LLVM_ATTRIBUTE_NORETURN
+void CrashRecoveryContext::HandleExit(int RetCode) {
+#if defined(_WIN32)
+ // SEH and VEH
+ ::RaiseException(0xE0000000 | RetCode, 0, 0, NULL);
+#else
+ // On Unix we don't need to raise an exception; we go directly to
+ // HandleCrash(), and longjmp unwinds the stack for us.
+ CrashRecoveryContextImpl *CRCI = (CrashRecoveryContextImpl *)Impl;
assert(CRCI && "Crash recovery context never initialized!");
- // As per convention, -2 indicates a crash or timeout as opposed to failure to
- // execute (see llvm/include/llvm/Support/Program.h)
- CRCI->HandleCrash(-2, 0);
+ CRCI->HandleCrash(RetCode, 0 /*no sig num*/);
+#endif
+ llvm_unreachable("Most likely setjmp wasn't called!");
}
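
The exit-code encoding introduced here can be exercised without any SEH machinery: HandleExit() raises 0xE0000000 | RetCode, and both exception filters strip the prefix back off before calling HandleCrash(). A minimal round-trip sketch (not part of the change):

#include <cassert>
#include <cstdint>

// Decode side, mirroring the masking done in the exception filters above.
int decodeCrashCode(uint32_t ExceptionCode) {
  int RetCode = (int)ExceptionCode;
  if ((RetCode & 0xF0000000) == 0xE0000000)
    RetCode &= ~0xF0000000; // generated by sys::Process::Exit / HandleExit
  return RetCode;
}

void demoCrashCodeRoundTrip() {
  const int ExitCode = 3;                             // illustrative value
  uint32_t Raised = 0xE0000000u | (uint32_t)ExitCode; // what HandleExit raises
  assert(decodeCrashCode(Raised) == ExitCode);
}
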
// FIXME: Portability.
diff --git a/llvm/lib/Support/DataExtractor.cpp b/llvm/lib/Support/DataExtractor.cpp
index a98297cdb35f..133d674275e8 100644
--- a/llvm/lib/Support/DataExtractor.cpp
+++ b/llvm/lib/Support/DataExtractor.cpp
@@ -15,29 +15,40 @@
using namespace llvm;
-static void unexpectedEndReached(Error *E) {
- if (E)
- *E = createStringError(errc::illegal_byte_sequence,
- "unexpected end of data");
+bool DataExtractor::prepareRead(uint64_t Offset, uint64_t Size,
+ Error *E) const {
+ if (isValidOffsetForDataOfSize(Offset, Size))
+ return true;
+ if (E) {
+ if (Offset <= Data.size())
+ *E = createStringError(
+ errc::illegal_byte_sequence,
+ "unexpected end of data at offset 0x%zx while reading [0x%" PRIx64
+ ", 0x%" PRIx64 ")",
+ Data.size(), Offset, Offset + Size);
+ else
+ *E = createStringError(errc::invalid_argument,
+ "offset 0x%" PRIx64
+ " is beyond the end of data at 0x%zx",
+ Offset, Data.size());
+ }
+ return false;
}
static bool isError(Error *E) { return E && *E; }
template <typename T>
-static T getU(uint64_t *offset_ptr, const DataExtractor *de,
- bool isLittleEndian, const char *Data, llvm::Error *Err) {
+T DataExtractor::getU(uint64_t *offset_ptr, Error *Err) const {
ErrorAsOutParameter ErrAsOut(Err);
T val = 0;
if (isError(Err))
return val;
uint64_t offset = *offset_ptr;
- if (!de->isValidOffsetForDataOfSize(offset, sizeof(T))) {
- unexpectedEndReached(Err);
+ if (!prepareRead(offset, sizeof(T), Err))
return val;
- }
- std::memcpy(&val, &Data[offset], sizeof(val));
- if (sys::IsLittleEndianHost != isLittleEndian)
+ std::memcpy(&val, &Data.data()[offset], sizeof(val));
+ if (sys::IsLittleEndianHost != IsLittleEndian)
sys::swapByteOrder(val);
// Advance the offset
@@ -46,22 +57,19 @@ static T getU(uint64_t *offset_ptr, const DataExtractor *de,
}
template <typename T>
-static T *getUs(uint64_t *offset_ptr, T *dst, uint32_t count,
- const DataExtractor *de, bool isLittleEndian, const char *Data,
- llvm::Error *Err) {
+T *DataExtractor::getUs(uint64_t *offset_ptr, T *dst, uint32_t count,
+ Error *Err) const {
ErrorAsOutParameter ErrAsOut(Err);
if (isError(Err))
return nullptr;
uint64_t offset = *offset_ptr;
- if (!de->isValidOffsetForDataOfSize(offset, sizeof(*dst) * count)) {
- unexpectedEndReached(Err);
+ if (!prepareRead(offset, sizeof(*dst) * count, Err))
return nullptr;
- }
for (T *value_ptr = dst, *end = dst + count; value_ptr != end;
++value_ptr, offset += sizeof(*dst))
- *value_ptr = getU<T>(offset_ptr, de, isLittleEndian, Data, Err);
+ *value_ptr = getU<T>(offset_ptr, Err);
// Advance the offset
*offset_ptr = offset;
// Return a non-NULL pointer to the converted data as an indicator of
@@ -70,55 +78,49 @@ static T *getUs(uint64_t *offset_ptr, T *dst, uint32_t count,
}
uint8_t DataExtractor::getU8(uint64_t *offset_ptr, llvm::Error *Err) const {
- return getU<uint8_t>(offset_ptr, this, IsLittleEndian, Data.data(), Err);
+ return getU<uint8_t>(offset_ptr, Err);
}
-uint8_t *
-DataExtractor::getU8(uint64_t *offset_ptr, uint8_t *dst, uint32_t count) const {
- return getUs<uint8_t>(offset_ptr, dst, count, this, IsLittleEndian,
- Data.data(), nullptr);
+uint8_t *DataExtractor::getU8(uint64_t *offset_ptr, uint8_t *dst,
+ uint32_t count) const {
+ return getUs<uint8_t>(offset_ptr, dst, count, nullptr);
}
uint8_t *DataExtractor::getU8(Cursor &C, uint8_t *Dst, uint32_t Count) const {
- return getUs<uint8_t>(&C.Offset, Dst, Count, this, IsLittleEndian,
- Data.data(), &C.Err);
+ return getUs<uint8_t>(&C.Offset, Dst, Count, &C.Err);
}
uint16_t DataExtractor::getU16(uint64_t *offset_ptr, llvm::Error *Err) const {
- return getU<uint16_t>(offset_ptr, this, IsLittleEndian, Data.data(), Err);
+ return getU<uint16_t>(offset_ptr, Err);
}
uint16_t *DataExtractor::getU16(uint64_t *offset_ptr, uint16_t *dst,
uint32_t count) const {
- return getUs<uint16_t>(offset_ptr, dst, count, this, IsLittleEndian,
- Data.data(), nullptr);
+ return getUs<uint16_t>(offset_ptr, dst, count, nullptr);
}
-uint32_t DataExtractor::getU24(uint64_t *offset_ptr) const {
- uint24_t ExtractedVal =
- getU<uint24_t>(offset_ptr, this, IsLittleEndian, Data.data(), nullptr);
+uint32_t DataExtractor::getU24(uint64_t *OffsetPtr, Error *Err) const {
+ uint24_t ExtractedVal = getU<uint24_t>(OffsetPtr, Err);
// The 3 bytes are in the correct byte order for the host.
return ExtractedVal.getAsUint32(sys::IsLittleEndianHost);
}
uint32_t DataExtractor::getU32(uint64_t *offset_ptr, llvm::Error *Err) const {
- return getU<uint32_t>(offset_ptr, this, IsLittleEndian, Data.data(), Err);
+ return getU<uint32_t>(offset_ptr, Err);
}
uint32_t *DataExtractor::getU32(uint64_t *offset_ptr, uint32_t *dst,
uint32_t count) const {
- return getUs<uint32_t>(offset_ptr, dst, count, this, IsLittleEndian,
- Data.data(), nullptr);
+ return getUs<uint32_t>(offset_ptr, dst, count, nullptr);
}
uint64_t DataExtractor::getU64(uint64_t *offset_ptr, llvm::Error *Err) const {
- return getU<uint64_t>(offset_ptr, this, IsLittleEndian, Data.data(), Err);
+ return getU<uint64_t>(offset_ptr, Err);
}
uint64_t *DataExtractor::getU64(uint64_t *offset_ptr, uint64_t *dst,
uint32_t count) const {
- return getUs<uint64_t>(offset_ptr, dst, count, this, IsLittleEndian,
- Data.data(), nullptr);
+ return getUs<uint64_t>(offset_ptr, dst, count, nullptr);
}
uint64_t DataExtractor::getUnsigned(uint64_t *offset_ptr, uint32_t byte_size,
@@ -151,59 +153,77 @@ DataExtractor::getSigned(uint64_t *offset_ptr, uint32_t byte_size) const {
llvm_unreachable("getSigned unhandled case!");
}
-const char *DataExtractor::getCStr(uint64_t *offset_ptr) const {
- uint64_t offset = *offset_ptr;
- StringRef::size_type pos = Data.find('\0', offset);
- if (pos != StringRef::npos) {
- *offset_ptr = pos + 1;
- return Data.data() + offset;
- }
- return nullptr;
-}
+StringRef DataExtractor::getCStrRef(uint64_t *OffsetPtr, Error *Err) const {
+ ErrorAsOutParameter ErrAsOut(Err);
+ if (isError(Err))
+ return StringRef();
-StringRef DataExtractor::getCStrRef(uint64_t *offset_ptr) const {
- uint64_t Start = *offset_ptr;
+ uint64_t Start = *OffsetPtr;
StringRef::size_type Pos = Data.find('\0', Start);
if (Pos != StringRef::npos) {
- *offset_ptr = Pos + 1;
+ *OffsetPtr = Pos + 1;
return StringRef(Data.data() + Start, Pos - Start);
}
+ if (Err)
+ *Err = createStringError(errc::illegal_byte_sequence,
+ "no null terminated string at offset 0x%" PRIx64,
+ Start);
return StringRef();
}
-uint64_t DataExtractor::getULEB128(uint64_t *offset_ptr,
- llvm::Error *Err) const {
- assert(*offset_ptr <= Data.size());
+StringRef DataExtractor::getFixedLengthString(uint64_t *OffsetPtr,
+ uint64_t Length,
+ StringRef TrimChars) const {
+ StringRef Bytes(getBytes(OffsetPtr, Length));
+ return Bytes.trim(TrimChars);
+}
+
+StringRef DataExtractor::getBytes(uint64_t *OffsetPtr, uint64_t Length,
+ Error *Err) const {
ErrorAsOutParameter ErrAsOut(Err);
if (isError(Err))
- return 0;
+ return StringRef();
+
+ if (!prepareRead(*OffsetPtr, Length, Err))
+ return StringRef();
+
+ StringRef Result = Data.substr(*OffsetPtr, Length);
+ *OffsetPtr += Length;
+ return Result;
+}
+
+template <typename T>
+static T getLEB128(StringRef Data, uint64_t *OffsetPtr, Error *Err,
+ T (&Decoder)(const uint8_t *p, unsigned *n,
+ const uint8_t *end, const char **error)) {
+ ArrayRef<uint8_t> Bytes = arrayRefFromStringRef(Data);
+ assert(*OffsetPtr <= Bytes.size());
+ ErrorAsOutParameter ErrAsOut(Err);
+ if (isError(Err))
+ return T();
const char *error;
unsigned bytes_read;
- uint64_t result = decodeULEB128(
- reinterpret_cast<const uint8_t *>(Data.data() + *offset_ptr), &bytes_read,
- reinterpret_cast<const uint8_t *>(Data.data() + Data.size()), &error);
+ T result =
+ Decoder(Bytes.data() + *OffsetPtr, &bytes_read, Bytes.end(), &error);
if (error) {
if (Err)
- *Err = createStringError(errc::illegal_byte_sequence, error);
- return 0;
+ *Err = createStringError(errc::illegal_byte_sequence,
+ "unable to decode LEB128 at offset 0x%8.8" PRIx64
+ ": %s",
+ *OffsetPtr, error);
+ return T();
}
- *offset_ptr += bytes_read;
+ *OffsetPtr += bytes_read;
return result;
}
-int64_t DataExtractor::getSLEB128(uint64_t *offset_ptr) const {
- assert(*offset_ptr <= Data.size());
+uint64_t DataExtractor::getULEB128(uint64_t *offset_ptr, Error *Err) const {
+ return getLEB128(Data, offset_ptr, Err, decodeULEB128);
+}
- const char *error;
- unsigned bytes_read;
- int64_t result = decodeSLEB128(
- reinterpret_cast<const uint8_t *>(Data.data() + *offset_ptr), &bytes_read,
- reinterpret_cast<const uint8_t *>(Data.data() + Data.size()), &error);
- if (error)
- return 0;
- *offset_ptr += bytes_read;
- return result;
+int64_t DataExtractor::getSLEB128(uint64_t *offset_ptr, Error *Err) const {
+ return getLEB128(Data, offset_ptr, Err, decodeSLEB128);
}
void DataExtractor::skip(Cursor &C, uint64_t Length) const {
@@ -211,8 +231,6 @@ void DataExtractor::skip(Cursor &C, uint64_t Length) const {
if (isError(&C.Err))
return;
- if (isValidOffsetForDataOfSize(C.Offset, Length))
+ if (prepareRead(C.Offset, Length, &C.Err))
C.Offset += Length;
- else
- unexpectedEndReached(&C.Err);
}
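
A sketch of the Cursor-based reading style these refactored primitives serve (not part of the change; the "header" layout is invented). The first failing read parks its Error in the Cursor and turns the remaining reads into no-ops, so the error is checked once at the end.

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/DataExtractor.h"
#include "llvm/Support/Error.h"

llvm::Error readHypotheticalHeader(llvm::StringRef Bytes) {
  llvm::DataExtractor DE(Bytes, /*IsLittleEndian=*/true, /*AddressSize=*/8);
  llvm::DataExtractor::Cursor C(0);
  uint8_t Version = DE.getU8(C);           // u8 version
  uint64_t Size = DE.getULEB128(C);        // ULEB128 payload size
  llvm::StringRef Name = DE.getCStrRef(C); // NUL-terminated name
  if (!C)
    return C.takeError(); // e.g. "unexpected end of data at offset 0x..."
  (void)Version; (void)Size; (void)Name;
  return llvm::Error::success();
}
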
diff --git a/llvm/lib/Support/Debug.cpp b/llvm/lib/Support/Debug.cpp
index 737cd576ed80..73b25d55237b 100644
--- a/llvm/lib/Support/Debug.cpp
+++ b/llvm/lib/Support/Debug.cpp
@@ -105,7 +105,7 @@ struct DebugOnlyOpt {
SmallVector<StringRef,8> dbgTypes;
StringRef(Val).split(dbgTypes, ',', -1, false);
for (auto dbgType : dbgTypes)
- CurrentDebugType->push_back(dbgType);
+ CurrentDebugType->push_back(std::string(dbgType));
}
};
diff --git a/llvm/lib/Support/DebugCounter.cpp b/llvm/lib/Support/DebugCounter.cpp
index 1e3ec300964c..8c579f395282 100644
--- a/llvm/lib/Support/DebugCounter.cpp
+++ b/llvm/lib/Support/DebugCounter.cpp
@@ -31,7 +31,7 @@ private:
// width, so we do the same.
Option::printHelpStr(HelpStr, GlobalWidth, ArgStr.size() + 6);
const auto &CounterInstance = DebugCounter::instance();
- for (auto Name : CounterInstance) {
+ for (const auto &Name : CounterInstance) {
const auto Info =
CounterInstance.getCounterInfo(CounterInstance.getCounterId(Name));
size_t NumSpaces = GlobalWidth - Info.first.size() - 8;
@@ -85,7 +85,7 @@ void DebugCounter::push_back(const std::string &Val) {
// add it to the counter values.
if (CounterPair.first.endswith("-skip")) {
auto CounterName = CounterPair.first.drop_back(5);
- unsigned CounterID = getCounterId(CounterName);
+ unsigned CounterID = getCounterId(std::string(CounterName));
if (!CounterID) {
errs() << "DebugCounter Error: " << CounterName
<< " is not a registered counter\n";
@@ -98,7 +98,7 @@ void DebugCounter::push_back(const std::string &Val) {
Counter.IsSet = true;
} else if (CounterPair.first.endswith("-count")) {
auto CounterName = CounterPair.first.drop_back(6);
- unsigned CounterID = getCounterId(CounterName);
+ unsigned CounterID = getCounterId(std::string(CounterName));
if (!CounterID) {
errs() << "DebugCounter Error: " << CounterName
<< " is not a registered counter\n";
@@ -123,7 +123,7 @@ void DebugCounter::print(raw_ostream &OS) const {
auto &Us = instance();
OS << "Counters and values:\n";
for (auto &CounterName : CounterNames) {
- unsigned CounterID = getCounterId(CounterName);
+ unsigned CounterID = getCounterId(std::string(CounterName));
OS << left_justify(RegisteredCounters[CounterID], 32) << ": {"
<< Us.Counters[CounterID].Count << "," << Us.Counters[CounterID].Skip
<< "," << Us.Counters[CounterID].StopAfter << "}\n";
diff --git a/llvm/lib/Support/ELFAttributeParser.cpp b/llvm/lib/Support/ELFAttributeParser.cpp
new file mode 100644
index 000000000000..df955cdf5d30
--- /dev/null
+++ b/llvm/lib/Support/ELFAttributeParser.cpp
@@ -0,0 +1,233 @@
+//===--- ELFAttributeParser.cpp - ELF Attribute Parser --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ELFAttributeParser.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/Support/Errc.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/ScopedPrinter.h"
+
+using namespace llvm;
+using namespace llvm::ELFAttrs;
+
+static const EnumEntry<unsigned> tagNames[] = {
+ {"Tag_File", ELFAttrs::File},
+ {"Tag_Section", ELFAttrs::Section},
+ {"Tag_Symbol", ELFAttrs::Symbol},
+};
+
+Error ELFAttributeParser::parseStringAttribute(const char *name, unsigned tag,
+ ArrayRef<const char *> strings) {
+ uint64_t value = de.getULEB128(cursor);
+ if (value >= strings.size()) {
+ printAttribute(tag, value, "");
+ return createStringError(errc::invalid_argument,
+ "unknown " + Twine(name) +
+ " value: " + Twine(value));
+ }
+ printAttribute(tag, value, strings[value]);
+ return Error::success();
+}
+
+Error ELFAttributeParser::integerAttribute(unsigned tag) {
+ StringRef tagName =
+ ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*hasTagPrefix=*/false);
+ uint64_t value = de.getULEB128(cursor);
+ attributes.insert(std::make_pair(tag, value));
+
+ if (sw) {
+ DictScope scope(*sw, "Attribute");
+ sw->printNumber("Tag", tag);
+ if (!tagName.empty())
+ sw->printString("TagName", tagName);
+ sw->printNumber("Value", value);
+ }
+ return Error::success();
+}
+
+Error ELFAttributeParser::stringAttribute(unsigned tag) {
+ StringRef tagName =
+ ELFAttrs::attrTypeAsString(tag, tagToStringMap, /*hasTagPrefix=*/false);
+ StringRef desc = de.getCStrRef(cursor);
+ attributesStr.insert(std::make_pair(tag, desc));
+
+ if (sw) {
+ DictScope scope(*sw, "Attribute");
+ sw->printNumber("Tag", tag);
+ if (!tagName.empty())
+ sw->printString("TagName", tagName);
+ sw->printString("Value", desc);
+ }
+ return Error::success();
+}
+
+void ELFAttributeParser::printAttribute(unsigned tag, unsigned value,
+ StringRef valueDesc) {
+ attributes.insert(std::make_pair(tag, value));
+
+ if (sw) {
+ StringRef tagName = ELFAttrs::attrTypeAsString(tag, tagToStringMap,
+ /*hasTagPrefix=*/false);
+ DictScope as(*sw, "Attribute");
+ sw->printNumber("Tag", tag);
+ sw->printNumber("Value", value);
+ if (!tagName.empty())
+ sw->printString("TagName", tagName);
+ if (!valueDesc.empty())
+ sw->printString("Description", valueDesc);
+ }
+}
+
+void ELFAttributeParser::parseIndexList(SmallVectorImpl<uint8_t> &indexList) {
+ for (;;) {
+ uint64_t value = de.getULEB128(cursor);
+ if (!cursor || !value)
+ break;
+ indexList.push_back(value);
+ }
+}
+
+Error ELFAttributeParser::parseAttributeList(uint32_t length) {
+ uint64_t pos;
+ uint64_t end = cursor.tell() + length;
+ while ((pos = cursor.tell()) < end) {
+ uint64_t tag = de.getULEB128(cursor);
+ bool handled;
+ if (Error e = handler(tag, handled))
+ return e;
+
+ if (!handled) {
+ if (tag < 32) {
+ return createStringError(errc::invalid_argument,
+ "invalid tag 0x" + Twine::utohexstr(tag) +
+ " at offset 0x" + Twine::utohexstr(pos));
+ }
+
+ if (tag % 2 == 0) {
+ if (Error e = integerAttribute(tag))
+ return e;
+ } else {
+ if (Error e = stringAttribute(tag))
+ return e;
+ }
+ }
+ }
+ return Error::success();
+}
+
+Error ELFAttributeParser::parseSubsection(uint32_t length) {
+ uint64_t end = cursor.tell() - sizeof(length) + length;
+ StringRef vendorName = de.getCStrRef(cursor);
+ if (sw) {
+ sw->printNumber("SectionLength", length);
+ sw->printString("Vendor", vendorName);
+ }
+
+ // Reject an unrecognized vendor-name.
+ if (vendorName.lower() != vendor)
+ return createStringError(errc::invalid_argument,
+ "unrecognized vendor-name: " + vendorName);
+
+ while (cursor.tell() < end) {
+ /// Tag_File | Tag_Section | Tag_Symbol uleb128:byte-size
+ uint8_t tag = de.getU8(cursor);
+ uint32_t size = de.getU32(cursor);
+ if (!cursor)
+ return cursor.takeError();
+
+ if (sw) {
+ sw->printEnum("Tag", tag, makeArrayRef(tagNames));
+ sw->printNumber("Size", size);
+ }
+ if (size < 5)
+ return createStringError(errc::invalid_argument,
+ "invalid attribute size " + Twine(size) +
+ " at offset 0x" +
+ Twine::utohexstr(cursor.tell() - 5));
+
+ StringRef scopeName, indexName;
+ SmallVector<uint8_t, 8> indicies;
+ switch (tag) {
+ case ELFAttrs::File:
+ scopeName = "FileAttributes";
+ break;
+ case ELFAttrs::Section:
+ scopeName = "SectionAttributes";
+ indexName = "Sections";
+ parseIndexList(indicies);
+ break;
+ case ELFAttrs::Symbol:
+ scopeName = "SymbolAttributes";
+ indexName = "Symbols";
+ parseIndexList(indicies);
+ break;
+ default:
+ return createStringError(errc::invalid_argument,
+ "unrecognized tag 0x" + Twine::utohexstr(tag) +
+ " at offset 0x" +
+ Twine::utohexstr(cursor.tell() - 5));
+ }
+
+ if (sw) {
+ DictScope scope(*sw, scopeName);
+ if (!indicies.empty())
+ sw->printList(indexName, indicies);
+ if (Error e = parseAttributeList(size - 5))
+ return e;
+ } else if (Error e = parseAttributeList(size - 5))
+ return e;
+ }
+ return Error::success();
+}
+
+Error ELFAttributeParser::parse(ArrayRef<uint8_t> section,
+ support::endianness endian) {
+ unsigned sectionNumber = 0;
+ de = DataExtractor(section, endian == support::little, 0);
+
+ // For early returns we have more specific errors; consume the Error stored
+ // in the cursor.
+ struct ClearCursorError {
+ DataExtractor::Cursor &cursor;
+ ~ClearCursorError() { consumeError(cursor.takeError()); }
+ } clear{cursor};
+
+ // Unrecognized format-version.
+ uint8_t formatVersion = de.getU8(cursor);
+ if (formatVersion != 'A')
+ return createStringError(errc::invalid_argument,
+ "unrecognized format-version: 0x" +
+ utohexstr(formatVersion));
+
+ while (!de.eof(cursor)) {
+ uint32_t sectionLength = de.getU32(cursor);
+ if (!cursor)
+ return cursor.takeError();
+
+ if (sw) {
+ sw->startLine() << "Section " << ++sectionNumber << " {\n";
+ sw->indent();
+ }
+
+ if (sectionLength < 4 || cursor.tell() - 4 + sectionLength > section.size())
+ return createStringError(errc::invalid_argument,
+ "invalid section length " +
+ Twine(sectionLength) + " at offset 0x" +
+ utohexstr(cursor.tell() - 4));
+
+ if (Error e = parseSubsection(sectionLength))
+ return e;
+ if (sw) {
+ sw->unindent();
+ sw->startLine() << "}\n";
+ }
+ }
+
+ return cursor.takeError();
+}
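
For orientation, the byte layout parse() and parseSubsection() walk can be hand-assembled. The sketch below is illustrative only: the vendor name, tag value, and sizes are made up, and a real consumer would feed such bytes to a concrete subclass (the ARM parser expects the "aeabi" vendor, for instance).

#include <cstdint>
#include <vector>

std::vector<uint8_t> buildTinyAttributeSection() {
  std::vector<uint8_t> Sec;
  Sec.push_back('A');                     // format-version
  // section-length covers itself: 4 + 4 ("gnu\0") + 1 (tag) + 4 (size) + 2.
  const uint8_t SecLen[] = {15, 0, 0, 0}; // little-endian u32
  Sec.insert(Sec.end(), SecLen, SecLen + 4);
  const char Vendor[] = "gnu";            // must match the subclass's vendor
  Sec.insert(Sec.end(), Vendor, Vendor + 4);
  Sec.push_back(1);                       // Tag_File (assumed to be 1)
  const uint8_t Size[] = {7, 0, 0, 0};    // tag + size field + 2 payload bytes
  Sec.insert(Sec.end(), Size, Size + 4);
  Sec.push_back(32);                      // even generic tag: integer attribute
  Sec.push_back(0);                       // ULEB128 value 0
  return Sec;
}
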
diff --git a/llvm/lib/Support/ELFAttributes.cpp b/llvm/lib/Support/ELFAttributes.cpp
new file mode 100644
index 000000000000..5be38825d6c6
--- /dev/null
+++ b/llvm/lib/Support/ELFAttributes.cpp
@@ -0,0 +1,34 @@
+//===-- ELFAttributes.cpp - ELF Attributes --------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ELFAttributes.h"
+#include "llvm/ADT/StringRef.h"
+
+using namespace llvm;
+
+StringRef ELFAttrs::attrTypeAsString(unsigned attr, TagNameMap tagNameMap,
+ bool hasTagPrefix) {
+ auto tagNameIt = find_if(
+ tagNameMap, [attr](const TagNameItem item) { return item.attr == attr; });
+ if (tagNameIt == tagNameMap.end())
+ return "";
+ StringRef tagName = tagNameIt->tagName;
+ return hasTagPrefix ? tagName : tagName.drop_front(4);
+}
+
+Optional<unsigned> ELFAttrs::attrTypeFromString(StringRef tag,
+ TagNameMap tagNameMap) {
+ bool hasTagPrefix = tag.startswith("Tag_");
+ auto tagNameIt =
+ find_if(tagNameMap, [tag, hasTagPrefix](const TagNameItem item) {
+ return item.tagName.drop_front(hasTagPrefix ? 0 : 4) == tag;
+ });
+ if (tagNameIt == tagNameMap.end())
+ return None;
+ return tagNameIt->attr;
+}
diff --git a/llvm/lib/Support/ErrorHandling.cpp b/llvm/lib/Support/ErrorHandling.cpp
index 0f13f7a536f1..f70a6921a41a 100644
--- a/llvm/lib/Support/ErrorHandling.cpp
+++ b/llvm/lib/Support/ErrorHandling.cpp
@@ -19,6 +19,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
+#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/Threading.h"
#include "llvm/Support/WindowsError.h"
@@ -122,7 +123,7 @@ void llvm::report_fatal_error(const Twine &Reason, bool GenCrashDiag) {
// files registered with RemoveFileOnSignal.
sys::RunInterruptHandlers();
- exit(1);
+ abort();
}
void llvm::install_bad_alloc_error_handler(fatal_error_handler_t handler,
diff --git a/llvm/lib/Support/ExtensibleRTTI.cpp b/llvm/lib/Support/ExtensibleRTTI.cpp
new file mode 100644
index 000000000000..1c98d1bb8feb
--- /dev/null
+++ b/llvm/lib/Support/ExtensibleRTTI.cpp
@@ -0,0 +1,13 @@
+//===----- lib/Support/ExtensibleRTTI.cpp - ExtensibleRTTI utilities ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/ExtensibleRTTI.h"
+
+void llvm::RTTIRoot::anchor() {}
+char llvm::RTTIRoot::ID = 0;
diff --git a/llvm/lib/Support/FileCheck.cpp b/llvm/lib/Support/FileCheck.cpp
index 2261ecc236c2..d0e79c675bcb 100644
--- a/llvm/lib/Support/FileCheck.cpp
+++ b/llvm/lib/Support/FileCheck.cpp
@@ -17,6 +17,7 @@
#include "FileCheckImpl.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Support/CheckedArithmetic.h"
#include "llvm/Support/FormatVariadic.h"
#include <cstdint>
#include <list>
@@ -25,17 +26,301 @@
using namespace llvm;
-Expected<uint64_t> NumericVariableUse::eval() const {
- Optional<uint64_t> Value = Variable->getValue();
+StringRef ExpressionFormat::toString() const {
+ switch (Value) {
+ case Kind::NoFormat:
+ return StringRef("<none>");
+ case Kind::Unsigned:
+ return StringRef("%u");
+ case Kind::Signed:
+ return StringRef("%d");
+ case Kind::HexUpper:
+ return StringRef("%X");
+ case Kind::HexLower:
+ return StringRef("%x");
+ }
+ llvm_unreachable("unknown expression format");
+}
+
+Expected<StringRef> ExpressionFormat::getWildcardRegex() const {
+ switch (Value) {
+ case Kind::Unsigned:
+ return StringRef("[0-9]+");
+ case Kind::Signed:
+ return StringRef("-?[0-9]+");
+ case Kind::HexUpper:
+ return StringRef("[0-9A-F]+");
+ case Kind::HexLower:
+ return StringRef("[0-9a-f]+");
+ default:
+ return createStringError(std::errc::invalid_argument,
+ "trying to match value with invalid format");
+ }
+}
+
+Expected<std::string>
+ExpressionFormat::getMatchingString(ExpressionValue IntegerValue) const {
+ if (Value == Kind::Signed) {
+ Expected<int64_t> SignedValue = IntegerValue.getSignedValue();
+ if (!SignedValue)
+ return SignedValue.takeError();
+ return itostr(*SignedValue);
+ }
+
+ Expected<uint64_t> UnsignedValue = IntegerValue.getUnsignedValue();
+ if (!UnsignedValue)
+ return UnsignedValue.takeError();
+ switch (Value) {
+ case Kind::Unsigned:
+ return utostr(*UnsignedValue);
+ case Kind::HexUpper:
+ return utohexstr(*UnsignedValue, /*LowerCase=*/false);
+ case Kind::HexLower:
+ return utohexstr(*UnsignedValue, /*LowerCase=*/true);
+ default:
+ return createStringError(std::errc::invalid_argument,
+ "trying to match value with invalid format");
+ }
+}
+
+Expected<ExpressionValue>
+ExpressionFormat::valueFromStringRepr(StringRef StrVal,
+ const SourceMgr &SM) const {
+ bool ValueIsSigned = Value == Kind::Signed;
+ StringRef OverflowErrorStr = "unable to represent numeric value";
+ if (ValueIsSigned) {
+ int64_t SignedValue;
+
+ if (StrVal.getAsInteger(10, SignedValue))
+ return ErrorDiagnostic::get(SM, StrVal, OverflowErrorStr);
+
+ return ExpressionValue(SignedValue);
+ }
+
+ bool Hex = Value == Kind::HexUpper || Value == Kind::HexLower;
+ uint64_t UnsignedValue;
+ if (StrVal.getAsInteger(Hex ? 16 : 10, UnsignedValue))
+ return ErrorDiagnostic::get(SM, StrVal, OverflowErrorStr);
+
+ return ExpressionValue(UnsignedValue);
+}
+
+static int64_t getAsSigned(uint64_t UnsignedValue) {
+ // Use memcpy to reinterpret the bit pattern in Value, since casting to
+ // signed is implementation-defined if the unsigned value is too big to be
+ // represented in the signed type and using a union violates type aliasing
+ // rules.
+ int64_t SignedValue;
+ memcpy(&SignedValue, &UnsignedValue, sizeof(SignedValue));
+ return SignedValue;
+}
+
+Expected<int64_t> ExpressionValue::getSignedValue() const {
+ if (Negative)
+ return getAsSigned(Value);
+
+ if (Value > (uint64_t)std::numeric_limits<int64_t>::max())
+ return make_error<OverflowError>();
+
+ // Value is in the representable range of int64_t, so the cast is safe.
+ return static_cast<int64_t>(Value);
+}
+
+Expected<uint64_t> ExpressionValue::getUnsignedValue() const {
+ if (Negative)
+ return make_error<OverflowError>();
+
+ return Value;
+}
+
+ExpressionValue ExpressionValue::getAbsolute() const {
+ if (!Negative)
+ return *this;
+
+ int64_t SignedValue = getAsSigned(Value);
+ int64_t MaxInt64 = std::numeric_limits<int64_t>::max();
+ // Absolute value can be represented as int64_t.
+ if (SignedValue >= -MaxInt64)
+ return ExpressionValue(-getAsSigned(Value));
+
+ // -X == -(max int64_t + Rem), negate each component independently.
+ SignedValue += MaxInt64;
+ uint64_t RemainingValueAbsolute = -SignedValue;
+ return ExpressionValue(MaxInt64 + RemainingValueAbsolute);
+}
+
+Expected<ExpressionValue> llvm::operator+(const ExpressionValue &LeftOperand,
+ const ExpressionValue &RightOperand) {
+ if (LeftOperand.isNegative() && RightOperand.isNegative()) {
+ int64_t LeftValue = cantFail(LeftOperand.getSignedValue());
+ int64_t RightValue = cantFail(RightOperand.getSignedValue());
+ Optional<int64_t> Result = checkedAdd<int64_t>(LeftValue, RightValue);
+ if (!Result)
+ return make_error<OverflowError>();
+
+ return ExpressionValue(*Result);
+ }
+
+ // (-A) + B == B - A.
+ if (LeftOperand.isNegative())
+ return RightOperand - LeftOperand.getAbsolute();
+
+ // A + (-B) == A - B.
+ if (RightOperand.isNegative())
+ return LeftOperand - RightOperand.getAbsolute();
+
+ // Both values are positive at this point.
+ uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+ uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+ Optional<uint64_t> Result =
+ checkedAddUnsigned<uint64_t>(LeftValue, RightValue);
+ if (!Result)
+ return make_error<OverflowError>();
+
+ return ExpressionValue(*Result);
+}
+
+Expected<ExpressionValue> llvm::operator-(const ExpressionValue &LeftOperand,
+ const ExpressionValue &RightOperand) {
+ // Result will be negative and thus might underflow.
+ if (LeftOperand.isNegative() && !RightOperand.isNegative()) {
+ int64_t LeftValue = cantFail(LeftOperand.getSignedValue());
+ uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+ // Result <= -1 - (max int64_t), which overflows in 1's and 2's complement.
+ if (RightValue > (uint64_t)std::numeric_limits<int64_t>::max())
+ return make_error<OverflowError>();
+ Optional<int64_t> Result =
+ checkedSub(LeftValue, static_cast<int64_t>(RightValue));
+ if (!Result)
+ return make_error<OverflowError>();
+
+ return ExpressionValue(*Result);
+ }
+
+ // (-A) - (-B) == B - A.
+ if (LeftOperand.isNegative())
+ return RightOperand.getAbsolute() - LeftOperand.getAbsolute();
+
+ // A - (-B) == A + B.
+ if (RightOperand.isNegative())
+ return LeftOperand + RightOperand.getAbsolute();
+
+ // Both values are positive at this point.
+ uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+ uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+ if (LeftValue >= RightValue)
+ return ExpressionValue(LeftValue - RightValue);
+ else {
+ uint64_t AbsoluteDifference = RightValue - LeftValue;
+ uint64_t MaxInt64 = std::numeric_limits<int64_t>::max();
+ // Value might underflow.
+ if (AbsoluteDifference > MaxInt64) {
+ AbsoluteDifference -= MaxInt64;
+ int64_t Result = -MaxInt64;
+ int64_t MinInt64 = std::numeric_limits<int64_t>::min();
+ // Underflow, tested by:
+ // abs(Result + (max int64_t)) > abs((min int64_t) + (max int64_t))
+ if (AbsoluteDifference > static_cast<uint64_t>(-(MinInt64 - Result)))
+ return make_error<OverflowError>();
+ Result -= static_cast<int64_t>(AbsoluteDifference);
+ return ExpressionValue(Result);
+ }
+
+ return ExpressionValue(-static_cast<int64_t>(AbsoluteDifference));
+ }
+}
+
+Expected<ExpressionValue> llvm::operator*(const ExpressionValue &LeftOperand,
+ const ExpressionValue &RightOperand) {
+ // -A * -B == A * B
+ if (LeftOperand.isNegative() && RightOperand.isNegative())
+ return LeftOperand.getAbsolute() * RightOperand.getAbsolute();
+
+ // A * -B == -B * A
+ if (RightOperand.isNegative())
+ return RightOperand * LeftOperand;
+
+ assert(!RightOperand.isNegative() && "Unexpected negative operand!");
+
+ // Result will be negative and can underflow.
+ if (LeftOperand.isNegative()) {
+ auto Result = LeftOperand.getAbsolute() * RightOperand.getAbsolute();
+ if (!Result)
+ return Result;
+
+ return ExpressionValue(0) - *Result;
+ }
+
+ // Result will be positive and can overflow.
+ uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+ uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+ Optional<uint64_t> Result =
+ checkedMulUnsigned<uint64_t>(LeftValue, RightValue);
+ if (!Result)
+ return make_error<OverflowError>();
+
+ return ExpressionValue(*Result);
+}
+
+Expected<ExpressionValue> llvm::operator/(const ExpressionValue &LeftOperand,
+ const ExpressionValue &RightOperand) {
+ // -A / -B == A / B
+ if (LeftOperand.isNegative() && RightOperand.isNegative())
+ return LeftOperand.getAbsolute() / RightOperand.getAbsolute();
+
+ // Check for divide by zero.
+ if (RightOperand == ExpressionValue(0))
+ return make_error<OverflowError>();
+
+ // Result will be negative and can underflow.
+ if (LeftOperand.isNegative() || RightOperand.isNegative())
+ return ExpressionValue(0) -
+ cantFail(LeftOperand.getAbsolute() / RightOperand.getAbsolute());
+
+ uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+ uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+ return ExpressionValue(LeftValue / RightValue);
+}
+
+Expected<ExpressionValue> llvm::max(const ExpressionValue &LeftOperand,
+ const ExpressionValue &RightOperand) {
+ if (LeftOperand.isNegative() && RightOperand.isNegative()) {
+ int64_t LeftValue = cantFail(LeftOperand.getSignedValue());
+ int64_t RightValue = cantFail(RightOperand.getSignedValue());
+ return ExpressionValue(std::max(LeftValue, RightValue));
+ }
+
+ if (!LeftOperand.isNegative() && !RightOperand.isNegative()) {
+ uint64_t LeftValue = cantFail(LeftOperand.getUnsignedValue());
+ uint64_t RightValue = cantFail(RightOperand.getUnsignedValue());
+ return ExpressionValue(std::max(LeftValue, RightValue));
+ }
+
+ if (LeftOperand.isNegative())
+ return RightOperand;
+
+ return LeftOperand;
+}
+
+Expected<ExpressionValue> llvm::min(const ExpressionValue &LeftOperand,
+ const ExpressionValue &RightOperand) {
+ if (cantFail(max(LeftOperand, RightOperand)) == LeftOperand)
+ return RightOperand;
+
+ return LeftOperand;
+}
+
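
Two ideas above are worth isolating: the memcpy-based reinterpretation of the stored bit pattern, and the rewriting of mixed-sign operations into single-sign ones. A standalone sketch using only the standard library (deliberately not the FileCheckImpl types):

#include <cassert>
#include <cstdint>
#include <cstring>

// Recover a negative value stored in an unsigned payload without an
// implementation-defined cast or a union pun.
static int64_t asSigned(uint64_t Bits) {
  int64_t Signed;
  std::memcpy(&Signed, &Bits, sizeof(Signed));
  return Signed;
}

void demoSignHandling() {
  int64_t Neg = -42;
  uint64_t Bits;
  std::memcpy(&Bits, &Neg, sizeof(Bits));
  assert(asSigned(Bits) == -42);

  // Mixed-sign cases reduce to single-sign ones: (-A) + B is evaluated as
  // B - A, so (-5) + 7 becomes 7 - 5 == 2, with overflow checked per branch.
  assert((-5) + 7 == 7 - 5);
}
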
+Expected<ExpressionValue> NumericVariableUse::eval() const {
+ Optional<ExpressionValue> Value = Variable->getValue();
if (Value)
return *Value;
- return make_error<UndefVarError>(Name);
+ return make_error<UndefVarError>(getExpressionStr());
}
-Expected<uint64_t> BinaryOperation::eval() const {
- Expected<uint64_t> LeftOp = LeftOperand->eval();
- Expected<uint64_t> RightOp = RightOperand->eval();
+Expected<ExpressionValue> BinaryOperation::eval() const {
+ Expected<ExpressionValue> LeftOp = LeftOperand->eval();
+ Expected<ExpressionValue> RightOp = RightOperand->eval();
// Bubble up any error (e.g. undefined variables) in the recursive
// evaluation.
@@ -51,11 +336,42 @@ Expected<uint64_t> BinaryOperation::eval() const {
return EvalBinop(*LeftOp, *RightOp);
}
+Expected<ExpressionFormat>
+BinaryOperation::getImplicitFormat(const SourceMgr &SM) const {
+ Expected<ExpressionFormat> LeftFormat = LeftOperand->getImplicitFormat(SM);
+ Expected<ExpressionFormat> RightFormat = RightOperand->getImplicitFormat(SM);
+ if (!LeftFormat || !RightFormat) {
+ Error Err = Error::success();
+ if (!LeftFormat)
+ Err = joinErrors(std::move(Err), LeftFormat.takeError());
+ if (!RightFormat)
+ Err = joinErrors(std::move(Err), RightFormat.takeError());
+ return std::move(Err);
+ }
+
+ if (*LeftFormat != ExpressionFormat::Kind::NoFormat &&
+ *RightFormat != ExpressionFormat::Kind::NoFormat &&
+ *LeftFormat != *RightFormat)
+ return ErrorDiagnostic::get(
+ SM, getExpressionStr(),
+ "implicit format conflict between '" + LeftOperand->getExpressionStr() +
+ "' (" + LeftFormat->toString() + ") and '" +
+ RightOperand->getExpressionStr() + "' (" + RightFormat->toString() +
+ "), need an explicit format specifier");
+
+ return *LeftFormat != ExpressionFormat::Kind::NoFormat ? *LeftFormat
+ : *RightFormat;
+}
+
Expected<std::string> NumericSubstitution::getResult() const {
- Expected<uint64_t> EvaluatedValue = ExpressionASTPointer->eval();
+ assert(ExpressionPointer->getAST() != nullptr &&
+ "Substituting empty expression");
+ Expected<ExpressionValue> EvaluatedValue =
+ ExpressionPointer->getAST()->eval();
if (!EvaluatedValue)
return EvaluatedValue.takeError();
- return utostr(*EvaluatedValue);
+ ExpressionFormat Format = ExpressionPointer->getFormat();
+ return Format.getMatchingString(*EvaluatedValue);
}
Expected<std::string> StringSubstitution::getResult() const {
@@ -66,30 +382,27 @@ Expected<std::string> StringSubstitution::getResult() const {
return Regex::escape(*VarVal);
}
-bool Pattern::isValidVarNameStart(char C) { return C == '_' || isalpha(C); }
+bool Pattern::isValidVarNameStart(char C) { return C == '_' || isAlpha(C); }
Expected<Pattern::VariableProperties>
Pattern::parseVariable(StringRef &Str, const SourceMgr &SM) {
if (Str.empty())
return ErrorDiagnostic::get(SM, Str, "empty variable name");
- bool ParsedOneChar = false;
- unsigned I = 0;
+ size_t I = 0;
bool IsPseudo = Str[0] == '@';
// Global vars start with '$'.
if (Str[0] == '$' || IsPseudo)
++I;
- for (unsigned E = Str.size(); I != E; ++I) {
- if (!ParsedOneChar && !isValidVarNameStart(Str[I]))
- return ErrorDiagnostic::get(SM, Str, "invalid variable name");
+ if (!isValidVarNameStart(Str[I++]))
+ return ErrorDiagnostic::get(SM, Str, "invalid variable name");
+ for (size_t E = Str.size(); I != E; ++I)
// Variable names are composed of alphanumeric characters and underscores.
- if (Str[I] != '_' && !isalnum(Str[I]))
+ if (Str[I] != '_' && !isAlnum(Str[I]))
break;
- ParsedOneChar = true;
- }
StringRef Name = Str.take_front(I);
Str = Str.substr(I);
@@ -107,13 +420,15 @@ static char popFront(StringRef &S) {
return C;
}
+char OverflowError::ID = 0;
char UndefVarError::ID = 0;
char ErrorDiagnostic::ID = 0;
char NotFoundError::ID = 0;
Expected<NumericVariable *> Pattern::parseNumericVariableDefinition(
StringRef &Expr, FileCheckPatternContext *Context,
- Optional<size_t> LineNumber, const SourceMgr &SM) {
+ Optional<size_t> LineNumber, ExpressionFormat ImplicitFormat,
+ const SourceMgr &SM) {
Expected<VariableProperties> ParseVarResult = parseVariable(Expr, SM);
if (!ParseVarResult)
return ParseVarResult.takeError();
@@ -137,10 +452,14 @@ Expected<NumericVariable *> Pattern::parseNumericVariableDefinition(
NumericVariable *DefinedNumericVariable;
auto VarTableIter = Context->GlobalNumericVariableTable.find(Name);
- if (VarTableIter != Context->GlobalNumericVariableTable.end())
+ if (VarTableIter != Context->GlobalNumericVariableTable.end()) {
DefinedNumericVariable = VarTableIter->second;
- else
- DefinedNumericVariable = Context->makeNumericVariable(Name, LineNumber);
+ if (DefinedNumericVariable->getImplicitFormat() != ImplicitFormat)
+ return ErrorDiagnostic::get(
+ SM, Expr, "format different from previous variable definition");
+ } else
+ DefinedNumericVariable =
+ Context->makeNumericVariable(Name, ImplicitFormat, LineNumber);
return DefinedNumericVariable;
}
@@ -165,7 +484,8 @@ Expected<std::unique_ptr<NumericVariableUse>> Pattern::parseNumericVariableUse(
if (VarTableIter != Context->GlobalNumericVariableTable.end())
NumericVariable = VarTableIter->second;
else {
- NumericVariable = Context->makeNumericVariable(Name);
+ NumericVariable = Context->makeNumericVariable(
+ Name, ExpressionFormat(ExpressionFormat::Kind::Unsigned));
Context->GlobalNumericVariableTable[Name] = NumericVariable;
}
@@ -180,16 +500,36 @@ Expected<std::unique_ptr<NumericVariableUse>> Pattern::parseNumericVariableUse(
}
Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericOperand(
- StringRef &Expr, AllowedOperand AO, Optional<size_t> LineNumber,
- FileCheckPatternContext *Context, const SourceMgr &SM) {
+ StringRef &Expr, AllowedOperand AO, bool MaybeInvalidConstraint,
+ Optional<size_t> LineNumber, FileCheckPatternContext *Context,
+ const SourceMgr &SM) {
+ if (Expr.startswith("(")) {
+ if (AO != AllowedOperand::Any)
+ return ErrorDiagnostic::get(
+ SM, Expr, "parenthesized expression not permitted here");
+ return parseParenExpr(Expr, LineNumber, Context, SM);
+ }
+
if (AO == AllowedOperand::LineVar || AO == AllowedOperand::Any) {
// Try to parse as a numeric variable use.
Expected<Pattern::VariableProperties> ParseVarResult =
parseVariable(Expr, SM);
- if (ParseVarResult)
+ if (ParseVarResult) {
+ // Try to parse a function call.
+ if (Expr.ltrim(SpaceChars).startswith("(")) {
+ if (AO != AllowedOperand::Any)
+ return ErrorDiagnostic::get(SM, ParseVarResult->Name,
+ "unexpected function call");
+
+ return parseCallExpr(Expr, ParseVarResult->Name, LineNumber, Context,
+ SM);
+ }
+
return parseNumericVariableUse(ParseVarResult->Name,
ParseVarResult->IsPseudo, LineNumber,
Context, SM);
+ }
+
if (AO == AllowedOperand::LineVar)
return ParseVarResult.takeError();
// Ignore the error and retry parsing as a literal.
@@ -197,41 +537,79 @@ Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericOperand(
}
// Otherwise, parse it as a literal.
- uint64_t LiteralValue;
- if (!Expr.consumeInteger(/*Radix=*/10, LiteralValue))
- return std::make_unique<ExpressionLiteral>(LiteralValue);
-
- return ErrorDiagnostic::get(SM, Expr,
- "invalid operand format '" + Expr + "'");
+ int64_t SignedLiteralValue;
+ uint64_t UnsignedLiteralValue;
+ StringRef SaveExpr = Expr;
+ // Accept both signed and unsigned literals; try unsigned first, then signed.
+ if (!Expr.consumeInteger((AO == AllowedOperand::LegacyLiteral) ? 10 : 0,
+ UnsignedLiteralValue))
+ return std::make_unique<ExpressionLiteral>(SaveExpr.drop_back(Expr.size()),
+ UnsignedLiteralValue);
+ Expr = SaveExpr;
+ if (AO == AllowedOperand::Any && !Expr.consumeInteger(0, SignedLiteralValue))
+ return std::make_unique<ExpressionLiteral>(SaveExpr.drop_back(Expr.size()),
+ SignedLiteralValue);
+
+ return ErrorDiagnostic::get(
+ SM, Expr,
+ Twine("invalid ") +
+ (MaybeInvalidConstraint ? "matching constraint or " : "") +
+ "operand format");
}
-static uint64_t add(uint64_t LeftOp, uint64_t RightOp) {
- return LeftOp + RightOp;
-}
+Expected<std::unique_ptr<ExpressionAST>>
+Pattern::parseParenExpr(StringRef &Expr, Optional<size_t> LineNumber,
+ FileCheckPatternContext *Context, const SourceMgr &SM) {
+ Expr = Expr.ltrim(SpaceChars);
+ assert(Expr.startswith("("));
+
+ // Parse the inner expression.
+ Expr.consume_front("(");
+ Expr = Expr.ltrim(SpaceChars);
+ if (Expr.empty())
+ return ErrorDiagnostic::get(SM, Expr, "missing operand in expression");
-static uint64_t sub(uint64_t LeftOp, uint64_t RightOp) {
- return LeftOp - RightOp;
+ // Note: parseNumericOperand handles nested opening parentheses.
+ Expected<std::unique_ptr<ExpressionAST>> SubExprResult = parseNumericOperand(
+ Expr, AllowedOperand::Any, /*MaybeInvalidConstraint=*/false, LineNumber,
+ Context, SM);
+ Expr = Expr.ltrim(SpaceChars);
+ while (SubExprResult && !Expr.empty() && !Expr.startswith(")")) {
+ StringRef OrigExpr = Expr;
+ SubExprResult = parseBinop(OrigExpr, Expr, std::move(*SubExprResult), false,
+ LineNumber, Context, SM);
+ Expr = Expr.ltrim(SpaceChars);
+ }
+ if (!SubExprResult)
+ return SubExprResult;
+
+ if (!Expr.consume_front(")")) {
+ return ErrorDiagnostic::get(SM, Expr,
+ "missing ')' at end of nested expression");
+ }
+ return SubExprResult;
}
Expected<std::unique_ptr<ExpressionAST>>
-Pattern::parseBinop(StringRef &Expr, std::unique_ptr<ExpressionAST> LeftOp,
+Pattern::parseBinop(StringRef Expr, StringRef &RemainingExpr,
+ std::unique_ptr<ExpressionAST> LeftOp,
bool IsLegacyLineExpr, Optional<size_t> LineNumber,
FileCheckPatternContext *Context, const SourceMgr &SM) {
- Expr = Expr.ltrim(SpaceChars);
- if (Expr.empty())
+ RemainingExpr = RemainingExpr.ltrim(SpaceChars);
+ if (RemainingExpr.empty())
return std::move(LeftOp);
// Check if this is a supported operation and select a function to perform
// it.
- SMLoc OpLoc = SMLoc::getFromPointer(Expr.data());
- char Operator = popFront(Expr);
+ SMLoc OpLoc = SMLoc::getFromPointer(RemainingExpr.data());
+ char Operator = popFront(RemainingExpr);
binop_eval_t EvalBinop;
switch (Operator) {
case '+':
- EvalBinop = add;
+ EvalBinop = operator+;
break;
case '-':
- EvalBinop = sub;
+ EvalBinop = operator-;
break;
default:
return ErrorDiagnostic::get(
@@ -239,29 +617,145 @@ Pattern::parseBinop(StringRef &Expr, std::unique_ptr<ExpressionAST> LeftOp,
}
// Parse right operand.
- Expr = Expr.ltrim(SpaceChars);
- if (Expr.empty())
- return ErrorDiagnostic::get(SM, Expr, "missing operand in expression");
+ RemainingExpr = RemainingExpr.ltrim(SpaceChars);
+ if (RemainingExpr.empty())
+ return ErrorDiagnostic::get(SM, RemainingExpr,
+ "missing operand in expression");
// The second operand in a legacy @LINE expression is always a literal.
AllowedOperand AO =
- IsLegacyLineExpr ? AllowedOperand::Literal : AllowedOperand::Any;
+ IsLegacyLineExpr ? AllowedOperand::LegacyLiteral : AllowedOperand::Any;
Expected<std::unique_ptr<ExpressionAST>> RightOpResult =
- parseNumericOperand(Expr, AO, LineNumber, Context, SM);
+ parseNumericOperand(RemainingExpr, AO, /*MaybeInvalidConstraint=*/false,
+ LineNumber, Context, SM);
if (!RightOpResult)
return RightOpResult;
- Expr = Expr.ltrim(SpaceChars);
- return std::make_unique<BinaryOperation>(EvalBinop, std::move(LeftOp),
+ Expr = Expr.drop_back(RemainingExpr.size());
+ return std::make_unique<BinaryOperation>(Expr, EvalBinop, std::move(LeftOp),
std::move(*RightOpResult));
}
-Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericSubstitutionBlock(
+Expected<std::unique_ptr<ExpressionAST>>
+Pattern::parseCallExpr(StringRef &Expr, StringRef FuncName,
+ Optional<size_t> LineNumber,
+ FileCheckPatternContext *Context, const SourceMgr &SM) {
+ Expr = Expr.ltrim(SpaceChars);
+ assert(Expr.startswith("("));
+
+ auto OptFunc = StringSwitch<Optional<binop_eval_t>>(FuncName)
+ .Case("add", operator+)
+ .Case("div", operator/)
+ .Case("max", max)
+ .Case("min", min)
+ .Case("mul", operator*)
+ .Case("sub", operator-)
+ .Default(None);
+
+ if (!OptFunc)
+ return ErrorDiagnostic::get(
+ SM, FuncName, Twine("call to undefined function '") + FuncName + "'");
+
+ Expr.consume_front("(");
+ Expr = Expr.ltrim(SpaceChars);
+
+ // Parse call arguments, which are comma separated.
+ SmallVector<std::unique_ptr<ExpressionAST>, 4> Args;
+ while (!Expr.empty() && !Expr.startswith(")")) {
+ if (Expr.startswith(","))
+ return ErrorDiagnostic::get(SM, Expr, "missing argument");
+
+    // Parse the argument, which is an arbitrary expression.
+ StringRef OuterBinOpExpr = Expr;
+ Expected<std::unique_ptr<ExpressionAST>> Arg = parseNumericOperand(
+ Expr, AllowedOperand::Any, /*MaybeInvalidConstraint=*/false, LineNumber,
+ Context, SM);
+ while (Arg && !Expr.empty()) {
+ Expr = Expr.ltrim(SpaceChars);
+ // Have we reached an argument terminator?
+ if (Expr.startswith(",") || Expr.startswith(")"))
+ break;
+
+ // Arg = Arg <op> <expr>
+ Arg = parseBinop(OuterBinOpExpr, Expr, std::move(*Arg), false, LineNumber,
+ Context, SM);
+ }
+
+ // Prefer an expression error over a generic invalid argument message.
+ if (!Arg)
+ return Arg.takeError();
+ Args.push_back(std::move(*Arg));
+
+ // Have we parsed all available arguments?
+ Expr = Expr.ltrim(SpaceChars);
+ if (!Expr.consume_front(","))
+ break;
+
+ Expr = Expr.ltrim(SpaceChars);
+ if (Expr.startswith(")"))
+ return ErrorDiagnostic::get(SM, Expr, "missing argument");
+ }
+
+ if (!Expr.consume_front(")"))
+ return ErrorDiagnostic::get(SM, Expr,
+ "missing ')' at end of call expression");
+
+ const unsigned NumArgs = Args.size();
+ if (NumArgs == 2)
+ return std::make_unique<BinaryOperation>(Expr, *OptFunc, std::move(Args[0]),
+ std::move(Args[1]));
+
+ // TODO: Support more than binop_eval_t.
+ return ErrorDiagnostic::get(SM, FuncName,
+ Twine("function '") + FuncName +
+ Twine("' takes 2 arguments but ") +
+ Twine(NumArgs) + " given");
+}
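In terms of check lines, the call syntax accepted by this function looks roughly as follows; these are illustrative examples rather than tests from the patch, and they assume the usual [[#...]] numeric block syntax:

  // CHECK: [[#add(VAR, 5)]]       ; value of VAR plus 5
  // CHECK: [[#max(VAR1, VAR2)]]   ; the larger of the two variables
  // CHECK: [[#mul(VAR, 2) + 1]]   ; a call can itself be a binop operand
  // CHECK: [[#sub(VAR)]]          ; "function 'sub' takes 2 arguments but 1 given"
  // CHECK: [[#foo(VAR, 1)]]       ; "call to undefined function 'foo'"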
+
+Expected<std::unique_ptr<Expression>> Pattern::parseNumericSubstitutionBlock(
StringRef Expr, Optional<NumericVariable *> &DefinedNumericVariable,
bool IsLegacyLineExpr, Optional<size_t> LineNumber,
FileCheckPatternContext *Context, const SourceMgr &SM) {
std::unique_ptr<ExpressionAST> ExpressionASTPointer = nullptr;
StringRef DefExpr = StringRef();
DefinedNumericVariable = None;
+ ExpressionFormat ExplicitFormat = ExpressionFormat();
+
+  // Parse format specifier (NOTE: ',' is also an argument separator).
+ size_t FormatSpecEnd = Expr.find(',');
+ size_t FunctionStart = Expr.find('(');
+ if (FormatSpecEnd != StringRef::npos && FormatSpecEnd < FunctionStart) {
+ Expr = Expr.ltrim(SpaceChars);
+ if (!Expr.consume_front("%"))
+ return ErrorDiagnostic::get(
+ SM, Expr, "invalid matching format specification in expression");
+
+    // Check for an unknown matching format specifier and set the matching
+    // format of this expression accordingly.
+    SMLoc FmtLoc = SMLoc::getFromPointer(Expr.data());
+ switch (popFront(Expr)) {
+ case 'u':
+ ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::Unsigned);
+ break;
+ case 'd':
+ ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::Signed);
+ break;
+ case 'x':
+ ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::HexLower);
+ break;
+ case 'X':
+ ExplicitFormat = ExpressionFormat(ExpressionFormat::Kind::HexUpper);
+ break;
+ default:
+      return ErrorDiagnostic::get(SM, FmtLoc,
+ "invalid format specifier in expression");
+ }
+
+ Expr = Expr.ltrim(SpaceChars);
+ if (!Expr.consume_front(","))
+ return ErrorDiagnostic::get(
+ SM, Expr, "invalid matching format specification in expression");
+ }
+
// Save variable definition expression if any.
size_t DefEnd = Expr.find(':');
if (DefEnd != StringRef::npos) {
@@ -269,18 +763,30 @@ Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericSubstitutionBlock(
Expr = Expr.substr(DefEnd + 1);
}
+ // Parse matching constraint.
+ Expr = Expr.ltrim(SpaceChars);
+ bool HasParsedValidConstraint = false;
+ if (Expr.consume_front("=="))
+ HasParsedValidConstraint = true;
+
// Parse the expression itself.
Expr = Expr.ltrim(SpaceChars);
- if (!Expr.empty()) {
+ if (Expr.empty()) {
+ if (HasParsedValidConstraint)
+ return ErrorDiagnostic::get(
+ SM, Expr, "empty numeric expression should not have a constraint");
+ } else {
+ Expr = Expr.rtrim(SpaceChars);
+ StringRef OuterBinOpExpr = Expr;
// The first operand in a legacy @LINE expression is always the @LINE
// pseudo variable.
AllowedOperand AO =
IsLegacyLineExpr ? AllowedOperand::LineVar : AllowedOperand::Any;
- Expected<std::unique_ptr<ExpressionAST>> ParseResult =
- parseNumericOperand(Expr, AO, LineNumber, Context, SM);
+ Expected<std::unique_ptr<ExpressionAST>> ParseResult = parseNumericOperand(
+ Expr, AO, !HasParsedValidConstraint, LineNumber, Context, SM);
while (ParseResult && !Expr.empty()) {
- ParseResult = parseBinop(Expr, std::move(*ParseResult), IsLegacyLineExpr,
- LineNumber, Context, SM);
+ ParseResult = parseBinop(OuterBinOpExpr, Expr, std::move(*ParseResult),
+ IsLegacyLineExpr, LineNumber, Context, SM);
// Legacy @LINE expressions only allow 2 operands.
if (ParseResult && IsLegacyLineExpr && !Expr.empty())
return ErrorDiagnostic::get(
@@ -288,22 +794,42 @@ Expected<std::unique_ptr<ExpressionAST>> Pattern::parseNumericSubstitutionBlock(
"unexpected characters at end of expression '" + Expr + "'");
}
if (!ParseResult)
- return ParseResult;
+ return ParseResult.takeError();
ExpressionASTPointer = std::move(*ParseResult);
}
+ // Select format of the expression, i.e. (i) its explicit format, if any,
+ // otherwise (ii) its implicit format, if any, otherwise (iii) the default
+ // format (unsigned). Error out in case of conflicting implicit format
+ // without explicit format.
+ ExpressionFormat Format;
+ if (ExplicitFormat)
+ Format = ExplicitFormat;
+ else if (ExpressionASTPointer) {
+ Expected<ExpressionFormat> ImplicitFormat =
+ ExpressionASTPointer->getImplicitFormat(SM);
+ if (!ImplicitFormat)
+ return ImplicitFormat.takeError();
+ Format = *ImplicitFormat;
+ }
+ if (!Format)
+ Format = ExpressionFormat(ExpressionFormat::Kind::Unsigned);
+
+ std::unique_ptr<Expression> ExpressionPointer =
+ std::make_unique<Expression>(std::move(ExpressionASTPointer), Format);
+
// Parse the numeric variable definition.
if (DefEnd != StringRef::npos) {
DefExpr = DefExpr.ltrim(SpaceChars);
- Expected<NumericVariable *> ParseResult =
- parseNumericVariableDefinition(DefExpr, Context, LineNumber, SM);
+ Expected<NumericVariable *> ParseResult = parseNumericVariableDefinition(
+ DefExpr, Context, LineNumber, ExpressionPointer->getFormat(), SM);
if (!ParseResult)
return ParseResult.takeError();
DefinedNumericVariable = *ParseResult;
}
- return std::move(ExpressionASTPointer);
+ return std::move(ExpressionPointer);
}
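Taken together, the text of a numeric block is now parsed as: an optional %-format specifier, an optional variable definition, an optional '==' matching constraint, and the expression itself, with the matching format chosen by precedence explicit > implicit > unsigned. A few illustrative block contents, assuming the syntax parsed above:

  // [[#%X,ADDR:]]      ; define ADDR and match it as an uppercase hex number
  // [[#%u, VAR + 1]]   ; force unsigned decimal for this expression
  // [[#VAR + 1]]       ; no specifier: VAR's implicit format applies,
  //                    ; falling back to unsigned when there is none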
bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
@@ -476,10 +1002,10 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
}
// Parse numeric substitution block.
- std::unique_ptr<ExpressionAST> ExpressionASTPointer;
+ std::unique_ptr<Expression> ExpressionPointer;
Optional<NumericVariable *> DefinedNumericVariable;
if (IsNumBlock) {
- Expected<std::unique_ptr<ExpressionAST>> ParseResult =
+ Expected<std::unique_ptr<Expression>> ParseResult =
parseNumericSubstitutionBlock(MatchStr, DefinedNumericVariable,
IsLegacyLineExpr, LineNumber, Context,
SM);
@@ -487,16 +1013,18 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
logAllUnhandledErrors(ParseResult.takeError(), errs());
return true;
}
- ExpressionASTPointer = std::move(*ParseResult);
- SubstNeeded = ExpressionASTPointer != nullptr;
+ ExpressionPointer = std::move(*ParseResult);
+ SubstNeeded = ExpressionPointer->getAST() != nullptr;
if (DefinedNumericVariable) {
IsDefinition = true;
DefName = (*DefinedNumericVariable)->getName();
}
if (SubstNeeded)
SubstStr = MatchStr;
- else
- MatchRegexp = "[0-9]+";
+ else {
+ ExpressionFormat Format = ExpressionPointer->getFormat();
+ MatchRegexp = cantFail(Format.getWildcardRegex());
+ }
}
// Handle variable definition: [[<def>:(...)]] and [[#(...)<def>:(...)]].
@@ -554,8 +1082,7 @@ bool Pattern::parsePattern(StringRef PatternStr, StringRef Prefix,
Substitution *Substitution =
IsNumBlock
? Context->makeNumericSubstitution(
- SubstStr, std::move(ExpressionASTPointer),
- SubstInsertIdx)
+ SubstStr, std::move(ExpressionPointer), SubstInsertIdx)
: Context->makeStringSubstitution(SubstStr, SubstInsertIdx);
Substitutions.push_back(Substitution);
}
@@ -626,7 +1153,7 @@ Expected<size_t> Pattern::match(StringRef Buffer, size_t &MatchLen,
if (!Substitutions.empty()) {
TmpStr = RegExStr;
if (LineNumber)
- Context->LineVariable->setValue(*LineNumber);
+ Context->LineVariable->setValue(ExpressionValue(*LineNumber));
size_t InsertOffset = 0;
// Substitute all string variables and expressions whose values are only
@@ -635,8 +1162,18 @@ Expected<size_t> Pattern::match(StringRef Buffer, size_t &MatchLen,
for (const auto &Substitution : Substitutions) {
// Substitute and check for failure (e.g. use of undefined variable).
Expected<std::string> Value = Substitution->getResult();
- if (!Value)
- return Value.takeError();
+ if (!Value) {
+ // Convert to an ErrorDiagnostic to get location information. This is
+ // done here rather than PrintNoMatch since now we know which
+ // substitution block caused the overflow.
+ Error Err =
+ handleErrors(Value.takeError(), [&](const OverflowError &E) {
+ return ErrorDiagnostic::get(SM, Substitution->getFromString(),
+ "unable to substitute variable or "
+ "numeric expression: overflow error");
+ });
+ return std::move(Err);
+ }
// Plop it into the regex at the adjusted offset.
TmpStr.insert(TmpStr.begin() + Substitution->getIndex() + InsertOffset,
@@ -676,11 +1213,12 @@ Expected<size_t> Pattern::match(StringRef Buffer, size_t &MatchLen,
NumericVariableMatch.DefinedNumericVariable;
StringRef MatchedValue = MatchInfo[CaptureParenGroup];
- uint64_t Val;
- if (MatchedValue.getAsInteger(10, Val))
- return ErrorDiagnostic::get(SM, MatchedValue,
- "Unable to represent numeric value");
- DefinedNumericVariable->setValue(Val);
+ ExpressionFormat Format = DefinedNumericVariable->getImplicitFormat();
+ Expected<ExpressionValue> Value =
+ Format.valueFromStringRepr(MatchedValue, SM);
+ if (!Value)
+ return Value.takeError();
+ DefinedNumericVariable->setValue(*Value);
}
// Like CHECK-NEXT, CHECK-EMPTY's match range is considered to start after
@@ -721,17 +1259,20 @@ void Pattern::printSubstitutions(const SourceMgr &SM, StringRef Buffer,
// variables it uses.
if (!MatchedValue) {
bool UndefSeen = false;
- handleAllErrors(MatchedValue.takeError(), [](const NotFoundError &E) {},
- // Handled in PrintNoMatch().
- [](const ErrorDiagnostic &E) {},
- [&](const UndefVarError &E) {
- if (!UndefSeen) {
- OS << "uses undefined variable(s):";
- UndefSeen = true;
- }
- OS << " ";
- E.log(OS);
- });
+ handleAllErrors(
+ MatchedValue.takeError(), [](const NotFoundError &E) {},
+ // Handled in PrintNoMatch().
+ [](const ErrorDiagnostic &E) {},
+ // Handled in match().
+ [](const OverflowError &E) {},
+ [&](const UndefVarError &E) {
+ if (!UndefSeen) {
+ OS << "uses undefined variable(s):";
+ UndefSeen = true;
+ }
+ OS << " ";
+ E.log(OS);
+ });
} else {
// Substitution succeeded. Print substituted value.
OS << "with \"";
@@ -837,10 +1378,10 @@ FileCheckPatternContext::makeStringSubstitution(StringRef VarName,
}
Substitution *FileCheckPatternContext::makeNumericSubstitution(
- StringRef ExpressionStr,
- std::unique_ptr<ExpressionAST> ExpressionASTPointer, size_t InsertIdx) {
+ StringRef ExpressionStr, std::unique_ptr<Expression> Expression,
+ size_t InsertIdx) {
Substitutions.push_back(std::make_unique<NumericSubstitution>(
- this, ExpressionStr, std::move(ExpressionASTPointer), InsertIdx));
+ this, ExpressionStr, std::move(Expression), InsertIdx));
return Substitutions.back().get();
}
@@ -915,20 +1456,17 @@ FileCheckDiag::FileCheckDiag(const SourceMgr &SM,
const Check::FileCheckType &CheckTy,
SMLoc CheckLoc, MatchType MatchTy,
SMRange InputRange)
- : CheckTy(CheckTy), MatchTy(MatchTy) {
+ : CheckTy(CheckTy), CheckLoc(CheckLoc), MatchTy(MatchTy) {
auto Start = SM.getLineAndColumn(InputRange.Start);
auto End = SM.getLineAndColumn(InputRange.End);
InputStartLine = Start.first;
InputStartCol = Start.second;
InputEndLine = End.first;
InputEndCol = End.second;
- Start = SM.getLineAndColumn(CheckLoc);
- CheckLine = Start.first;
- CheckCol = Start.second;
}
static bool IsPartOfWord(char c) {
- return (isalnum(c) || c == '-' || c == '_');
+ return (isAlnum(c) || c == '-' || c == '_');
}
Check::FileCheckType &Check::FileCheckType::setCount(int C) {
@@ -946,7 +1484,7 @@ std::string Check::FileCheckType::getDescription(StringRef Prefix) const {
case Check::CheckPlain:
if (Count > 1)
return Prefix.str() + "-COUNT";
- return Prefix;
+ return std::string(Prefix);
case Check::CheckNext:
return Prefix.str() + "-NEXT";
case Check::CheckSame:
@@ -959,6 +1497,8 @@ std::string Check::FileCheckType::getDescription(StringRef Prefix) const {
return Prefix.str() + "-LABEL";
case Check::CheckEmpty:
return Prefix.str() + "-EMPTY";
+ case Check::CheckComment:
+ return std::string(Prefix);
case Check::CheckEOF:
return "implicit EOF";
case Check::CheckBadNot:
@@ -970,13 +1510,24 @@ std::string Check::FileCheckType::getDescription(StringRef Prefix) const {
}
static std::pair<Check::FileCheckType, StringRef>
-FindCheckType(StringRef Buffer, StringRef Prefix) {
+FindCheckType(const FileCheckRequest &Req, StringRef Buffer, StringRef Prefix) {
if (Buffer.size() <= Prefix.size())
return {Check::CheckNone, StringRef()};
char NextChar = Buffer[Prefix.size()];
StringRef Rest = Buffer.drop_front(Prefix.size() + 1);
+
+ // Check for comment.
+  if (std::find(Req.CommentPrefixes.begin(), Req.CommentPrefixes.end(),
+                Prefix) != Req.CommentPrefixes.end()) {
+ if (NextChar == ':')
+ return {Check::CheckComment, Rest};
+ // Ignore a comment prefix if it has a suffix like "-NOT".
+ return {Check::CheckNone, StringRef()};
+ }
+
// Verify that the : is present after the prefix.
if (NextChar == ':')
return {Check::CheckPlain, Rest};
@@ -1055,8 +1606,9 @@ static size_t SkipWord(StringRef Str, size_t Loc) {
/// If no valid prefix is found, the state of Buffer, LineNumber, and CheckTy
/// is unspecified.
static std::pair<StringRef, StringRef>
-FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
- unsigned &LineNumber, Check::FileCheckType &CheckTy) {
+FindFirstMatchingPrefix(const FileCheckRequest &Req, Regex &PrefixRE,
+ StringRef &Buffer, unsigned &LineNumber,
+ Check::FileCheckType &CheckTy) {
SmallVector<StringRef, 2> Matches;
while (!Buffer.empty()) {
@@ -1084,7 +1636,7 @@ FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
if (Skipped.empty() || !IsPartOfWord(Skipped.back())) {
// Now extract the type.
StringRef AfterSuffix;
- std::tie(CheckTy, AfterSuffix) = FindCheckType(Buffer, Prefix);
+ std::tie(CheckTy, AfterSuffix) = FindCheckType(Req, Buffer, Prefix);
// If we've found a valid check type for this prefix, we're done.
if (CheckTy != Check::CheckNone)
@@ -1104,7 +1656,8 @@ FindFirstMatchingPrefix(Regex &PrefixRE, StringRef &Buffer,
void FileCheckPatternContext::createLineVariable() {
assert(!LineVariable && "@LINE pseudo numeric variable already created");
StringRef LineName = "@LINE";
- LineVariable = makeNumericVariable(LineName);
+ LineVariable = makeNumericVariable(
+ LineName, ExpressionFormat(ExpressionFormat::Kind::Unsigned));
GlobalNumericVariableTable[LineName] = LineVariable;
}
@@ -1114,8 +1667,12 @@ FileCheck::FileCheck(FileCheckRequest Req)
FileCheck::~FileCheck() = default;
-bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
- Regex &PrefixRE) {
+bool FileCheck::readCheckFile(
+ SourceMgr &SM, StringRef Buffer, Regex &PrefixRE,
+ std::pair<unsigned, unsigned> *ImpPatBufferIDRange) {
+ if (ImpPatBufferIDRange)
+ ImpPatBufferIDRange->first = ImpPatBufferIDRange->second = 0;
+
Error DefineError =
PatternContext->defineCmdlineVariables(Req.GlobalDefines, SM);
if (DefineError) {
@@ -1126,17 +1683,27 @@ bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
PatternContext->createLineVariable();
std::vector<Pattern> ImplicitNegativeChecks;
- for (const auto &PatternString : Req.ImplicitCheckNot) {
+ for (StringRef PatternString : Req.ImplicitCheckNot) {
// Create a buffer with fake command line content in order to display the
// command line option responsible for the specific implicit CHECK-NOT.
std::string Prefix = "-implicit-check-not='";
std::string Suffix = "'";
std::unique_ptr<MemoryBuffer> CmdLine = MemoryBuffer::getMemBufferCopy(
- Prefix + PatternString + Suffix, "command line");
+ (Prefix + PatternString + Suffix).str(), "command line");
StringRef PatternInBuffer =
CmdLine->getBuffer().substr(Prefix.size(), PatternString.size());
- SM.AddNewSourceBuffer(std::move(CmdLine), SMLoc());
+ unsigned BufferID = SM.AddNewSourceBuffer(std::move(CmdLine), SMLoc());
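+    // Implicit pattern buffers are registered back to back, so the IDs they
+    // receive form a single contiguous [first, second) range.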
+ if (ImpPatBufferIDRange) {
+ if (ImpPatBufferIDRange->first == ImpPatBufferIDRange->second) {
+ ImpPatBufferIDRange->first = BufferID;
+ ImpPatBufferIDRange->second = BufferID + 1;
+ } else {
+ assert(BufferID == ImpPatBufferIDRange->second &&
+ "expected consecutive source buffer IDs");
+ ++ImpPatBufferIDRange->second;
+ }
+ }
ImplicitNegativeChecks.push_back(
Pattern(Check::CheckNot, PatternContext.get()));
@@ -1150,6 +1717,7 @@ bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
// found.
unsigned LineNumber = 1;
+ bool FoundUsedCheckPrefix = false;
while (1) {
Check::FileCheckType CheckTy;
@@ -1157,9 +1725,12 @@ bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
StringRef UsedPrefix;
StringRef AfterSuffix;
std::tie(UsedPrefix, AfterSuffix) =
- FindFirstMatchingPrefix(PrefixRE, Buffer, LineNumber, CheckTy);
+ FindFirstMatchingPrefix(Req, PrefixRE, Buffer, LineNumber, CheckTy);
if (UsedPrefix.empty())
break;
+ if (CheckTy != Check::CheckComment)
+ FoundUsedCheckPrefix = true;
+
assert(UsedPrefix.data() == Buffer.data() &&
"Failed to move Buffer's start forward, or pointed prefix outside "
"of the buffer!");
@@ -1201,9 +1772,17 @@ bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
// Remember the location of the start of the pattern, for diagnostics.
SMLoc PatternLoc = SMLoc::getFromPointer(Buffer.data());
+ // Extract the pattern from the buffer.
+ StringRef PatternBuffer = Buffer.substr(0, EOL);
+ Buffer = Buffer.substr(EOL);
+
+ // If this is a comment, we're done.
+ if (CheckTy == Check::CheckComment)
+ continue;
+
// Parse the pattern.
Pattern P(CheckTy, PatternContext.get(), LineNumber);
- if (P.parsePattern(Buffer.substr(0, EOL), UsedPrefix, SM, Req))
+ if (P.parsePattern(PatternBuffer, UsedPrefix, SM, Req))
return true;
// Verify that CHECK-LABEL lines do not define or use variables
@@ -1215,8 +1794,6 @@ bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
return true;
}
- Buffer = Buffer.substr(EOL);
-
// Verify that CHECK-NEXT/SAME/EMPTY lines have at least one CHECK line before them.
if ((CheckTy == Check::CheckNext || CheckTy == Check::CheckSame ||
CheckTy == Check::CheckEmpty) &&
@@ -1243,31 +1820,30 @@ bool FileCheck::readCheckFile(SourceMgr &SM, StringRef Buffer,
DagNotMatches = ImplicitNegativeChecks;
}
- // Add an EOF pattern for any trailing CHECK-DAG/-NOTs, and use the first
- // prefix as a filler for the error message.
- if (!DagNotMatches.empty()) {
- CheckStrings->emplace_back(
- Pattern(Check::CheckEOF, PatternContext.get(), LineNumber + 1),
- *Req.CheckPrefixes.begin(), SMLoc::getFromPointer(Buffer.data()));
- std::swap(DagNotMatches, CheckStrings->back().DagNotStrings);
- }
-
- if (CheckStrings->empty()) {
+  // If no check prefix was used, report an error, unless no prefix was
+  // specified explicitly and -implicit-check-not was given.
+ if (!FoundUsedCheckPrefix &&
+ (ImplicitNegativeChecks.empty() || !Req.IsDefaultCheckPrefix)) {
errs() << "error: no check strings found with prefix"
<< (Req.CheckPrefixes.size() > 1 ? "es " : " ");
- auto I = Req.CheckPrefixes.begin();
- auto E = Req.CheckPrefixes.end();
- if (I != E) {
- errs() << "\'" << *I << ":'";
- ++I;
+ for (size_t I = 0, E = Req.CheckPrefixes.size(); I != E; ++I) {
+ if (I != 0)
+ errs() << ", ";
+ errs() << "\'" << Req.CheckPrefixes[I] << ":'";
}
- for (; I != E; ++I)
- errs() << ", \'" << *I << ":'";
-
errs() << '\n';
return true;
}
+ // Add an EOF pattern for any trailing --implicit-check-not/CHECK-DAG/-NOTs,
+ // and use the first prefix as a filler for the error message.
+ if (!DagNotMatches.empty()) {
+ CheckStrings->emplace_back(
+ Pattern(Check::CheckEOF, PatternContext.get(), LineNumber + 1),
+ *Req.CheckPrefixes.begin(), SMLoc::getFromPointer(Buffer.data()));
+ std::swap(DagNotMatches, CheckStrings->back().DagNotStrings);
+ }
+
return false;
}
@@ -1706,43 +2282,74 @@ size_t FileCheckString::CheckDag(const SourceMgr &SM, StringRef Buffer,
return StartPos;
}
-// A check prefix must contain only alphanumeric, hyphens and underscores.
-static bool ValidateCheckPrefix(StringRef CheckPrefix) {
- static const Regex Validator("^[a-zA-Z0-9_-]*$");
- return Validator.match(CheckPrefix);
-}
-
-bool FileCheck::ValidateCheckPrefixes() {
- StringSet<> PrefixSet;
-
- for (StringRef Prefix : Req.CheckPrefixes) {
- // Reject empty prefixes.
- if (Prefix == "")
+static bool ValidatePrefixes(StringRef Kind, StringSet<> &UniquePrefixes,
+ ArrayRef<StringRef> SuppliedPrefixes) {
+ for (StringRef Prefix : SuppliedPrefixes) {
+ if (Prefix.empty()) {
+ errs() << "error: supplied " << Kind << " prefix must not be the empty "
+ << "string\n";
return false;
-
- if (!PrefixSet.insert(Prefix).second)
+ }
+ static const Regex Validator("^[a-zA-Z0-9_-]*$");
+ if (!Validator.match(Prefix)) {
+ errs() << "error: supplied " << Kind << " prefix must start with a "
+ << "letter and contain only alphanumeric characters, hyphens, and "
+ << "underscores: '" << Prefix << "'\n";
return false;
-
- if (!ValidateCheckPrefix(Prefix))
+ }
+ if (!UniquePrefixes.insert(Prefix).second) {
+ errs() << "error: supplied " << Kind << " prefix must be unique among "
+ << "check and comment prefixes: '" << Prefix << "'\n";
return false;
+ }
}
+ return true;
+}
+
+static const char *DefaultCheckPrefixes[] = {"CHECK"};
+static const char *DefaultCommentPrefixes[] = {"COM", "RUN"};
+bool FileCheck::ValidateCheckPrefixes() {
+ StringSet<> UniquePrefixes;
+ // Add default prefixes to catch user-supplied duplicates of them below.
+ if (Req.CheckPrefixes.empty()) {
+ for (const char *Prefix : DefaultCheckPrefixes)
+ UniquePrefixes.insert(Prefix);
+ }
+ if (Req.CommentPrefixes.empty()) {
+ for (const char *Prefix : DefaultCommentPrefixes)
+ UniquePrefixes.insert(Prefix);
+ }
+ // Do not validate the default prefixes, or diagnostics about duplicates might
+ // incorrectly indicate that they were supplied by the user.
+ if (!ValidatePrefixes("check", UniquePrefixes, Req.CheckPrefixes))
+ return false;
+ if (!ValidatePrefixes("comment", UniquePrefixes, Req.CommentPrefixes))
+ return false;
return true;
}
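The rules enforced here boil down to: a supplied prefix must be non-empty, must be built only from letters, digits, hyphens and underscores, and must be unique across the combined set of check and comment prefixes, with the defaults (CHECK; COM and RUN) pre-seeded whenever the corresponding option was left untouched. A standalone approximation of that check, using only the standard library:

  #include <cctype>
  #include <set>
  #include <string>

  // Mirrors ValidatePrefixes above: non-empty, restricted character set,
  // unique across the union of check and comment prefixes seen so far.
  static bool isAcceptablePrefix(const std::string &P,
                                 std::set<std::string> &Seen) {
    if (P.empty())
      return false;
    for (unsigned char C : P)
      if (!std::isalnum(C) && C != '-' && C != '_')
        return false;
    return Seen.insert(P).second; // false if it duplicates an earlier prefix
  }

Seeding Seen with {"CHECK"} or {"COM", "RUN"} before validating reproduces the duplicate-against-default behaviour, e.g. a user-supplied comment prefix of CHECK is rejected while the check prefixes are left at their default.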
Regex FileCheck::buildCheckPrefixRegex() {
- // I don't think there's a way to specify an initial value for cl::list,
- // so if nothing was specified, add the default
- if (Req.CheckPrefixes.empty())
- Req.CheckPrefixes.push_back("CHECK");
+ if (Req.CheckPrefixes.empty()) {
+ for (const char *Prefix : DefaultCheckPrefixes)
+ Req.CheckPrefixes.push_back(Prefix);
+ Req.IsDefaultCheckPrefix = true;
+ }
+ if (Req.CommentPrefixes.empty()) {
+ for (const char *Prefix : DefaultCommentPrefixes)
+ Req.CommentPrefixes.push_back(Prefix);
+ }
- // We already validated the contents of CheckPrefixes so just concatenate
- // them as alternatives.
+ // We already validated the contents of CheckPrefixes and CommentPrefixes so
+ // just concatenate them as alternatives.
SmallString<32> PrefixRegexStr;
- for (StringRef Prefix : Req.CheckPrefixes) {
- if (Prefix != Req.CheckPrefixes.front())
+ for (size_t I = 0, E = Req.CheckPrefixes.size(); I != E; ++I) {
+ if (I != 0)
PrefixRegexStr.push_back('|');
-
+ PrefixRegexStr.append(Req.CheckPrefixes[I]);
+ }
+ for (StringRef Prefix : Req.CommentPrefixes) {
+ PrefixRegexStr.push_back('|');
PrefixRegexStr.append(Prefix);
}
@@ -1750,7 +2357,7 @@ Regex FileCheck::buildCheckPrefixRegex() {
}
Error FileCheckPatternContext::defineCmdlineVariables(
- std::vector<std::string> &CmdlineDefines, SourceMgr &SM) {
+ ArrayRef<StringRef> CmdlineDefines, SourceMgr &SM) {
assert(GlobalVariableTable.empty() && GlobalNumericVariableTable.empty() &&
"Overriding defined variable with command-line variable definitions");
@@ -1777,7 +2384,7 @@ Error FileCheckPatternContext::defineCmdlineVariables(
// format as in the input file to be able to reuse
// parseNumericSubstitutionBlock.
CmdlineDefsDiag += (DefPrefix + CmdlineDef + " (parsed as: [[").str();
- std::string SubstitutionStr = CmdlineDef;
+ std::string SubstitutionStr = std::string(CmdlineDef);
SubstitutionStr[EqIdx] = ':';
CmdlineDefsIndices.push_back(
std::make_pair(CmdlineDefsDiag.size(), SubstitutionStr.size()));
@@ -1815,20 +2422,19 @@ Error FileCheckPatternContext::defineCmdlineVariables(
// to create the necessary class instance.
StringRef CmdlineDefExpr = CmdlineDef.substr(1);
Optional<NumericVariable *> DefinedNumericVariable;
- Expected<std::unique_ptr<ExpressionAST>> ExpressionASTResult =
+ Expected<std::unique_ptr<Expression>> ExpressionResult =
Pattern::parseNumericSubstitutionBlock(
CmdlineDefExpr, DefinedNumericVariable, false, None, this, SM);
- if (!ExpressionASTResult) {
- Errs = joinErrors(std::move(Errs), ExpressionASTResult.takeError());
+ if (!ExpressionResult) {
+ Errs = joinErrors(std::move(Errs), ExpressionResult.takeError());
continue;
}
- std::unique_ptr<ExpressionAST> ExpressionASTPointer =
- std::move(*ExpressionASTResult);
+ std::unique_ptr<Expression> Expression = std::move(*ExpressionResult);
// Now evaluate the expression whose value this variable should be set
// to, since the expression of a command-line variable definition should
// only use variables defined earlier on the command-line. If not, this
// is an error and we report it.
- Expected<uint64_t> Value = ExpressionASTPointer->eval();
+ Expected<ExpressionValue> Value = Expression->getAST()->eval();
if (!Value) {
Errs = joinErrors(std::move(Errs), Value.takeError());
continue;
diff --git a/llvm/lib/Support/FileCheckImpl.h b/llvm/lib/Support/FileCheckImpl.h
index dc07d22aefd8..6ca67ec2964c 100644
--- a/llvm/lib/Support/FileCheckImpl.h
+++ b/llvm/lib/Support/FileCheckImpl.h
@@ -15,6 +15,7 @@
#ifndef LLVM_LIB_SUPPORT_FILECHECKIMPL_H
#define LLVM_LIB_SUPPORT_FILECHECKIMPL_H
+#include "llvm/Support/FileCheck.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
@@ -30,28 +31,175 @@ namespace llvm {
// Numeric substitution handling code.
//===----------------------------------------------------------------------===//
+class ExpressionValue;
+
+/// Type representing the format an expression value should be textualized into
+/// for matching. Used to represent both explicit format specifiers as well as
+/// implicit format from using numeric variables.
+struct ExpressionFormat {
+ enum class Kind {
+ /// Denote absence of format. Used for implicit format of literals and
+ /// empty expressions.
+ NoFormat,
+ /// Value is an unsigned integer and should be printed as a decimal number.
+ Unsigned,
+ /// Value is a signed integer and should be printed as a decimal number.
+ Signed,
+ /// Value should be printed as an uppercase hex number.
+ HexUpper,
+ /// Value should be printed as a lowercase hex number.
+ HexLower
+ };
+
+private:
+ Kind Value;
+
+public:
+ /// Evaluates a format to true if it can be used in a match.
+ explicit operator bool() const { return Value != Kind::NoFormat; }
+
+ /// Define format equality: formats are equal if neither is NoFormat and
+ /// their kinds are the same.
+ bool operator==(const ExpressionFormat &Other) const {
+ return Value != Kind::NoFormat && Value == Other.Value;
+ }
+
+ bool operator!=(const ExpressionFormat &Other) const {
+ return !(*this == Other);
+ }
+
+ bool operator==(Kind OtherValue) const { return Value == OtherValue; }
+
+ bool operator!=(Kind OtherValue) const { return !(*this == OtherValue); }
+
+ /// \returns the format specifier corresponding to this format as a string.
+ StringRef toString() const;
+
+  ExpressionFormat() : Value(Kind::NoFormat) {}
+  explicit ExpressionFormat(Kind Value) : Value(Value) {}
+
+ /// \returns a wildcard regular expression StringRef that matches any value
+ /// in the format represented by this instance, or an error if the format is
+ /// NoFormat.
+ Expected<StringRef> getWildcardRegex() const;
+
+ /// \returns the string representation of \p Value in the format represented
+ /// by this instance, or an error if conversion to this format failed or the
+ /// format is NoFormat.
+ Expected<std::string> getMatchingString(ExpressionValue Value) const;
+
+ /// \returns the value corresponding to string representation \p StrVal
+ /// according to the matching format represented by this instance or an error
+ /// with diagnostic against \p SM if \p StrVal does not correspond to a valid
+ /// and representable value.
+ Expected<ExpressionValue> valueFromStringRepr(StringRef StrVal,
+ const SourceMgr &SM) const;
+};
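The two conversion directions above only differ per kind in alphabet and sign handling. The exact regexes live in FileCheck.cpp and are not part of this hunk, so the table below is an assumption extrapolated from the pre-existing "[0-9]+" wildcard:

  // Kind       getWildcardRegex()   getMatchingString(30)
  // Unsigned   "[0-9]+"             "30"
  // Signed     "-?[0-9]+"           "30" (or "-30" for a negative value)
  // HexUpper   "[0-9A-F]+"          "1E"
  // HexLower   "[0-9a-f]+"          "1e"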
+
+/// Class to represent an overflow error that might result when manipulating a
+/// value.
+class OverflowError : public ErrorInfo<OverflowError> {
+public:
+ static char ID;
+
+ std::error_code convertToErrorCode() const override {
+ return std::make_error_code(std::errc::value_too_large);
+ }
+
+ void log(raw_ostream &OS) const override { OS << "overflow error"; }
+};
+
+/// Class representing a numeric value.
+class ExpressionValue {
+private:
+ uint64_t Value;
+ bool Negative;
+
+public:
+ template <class T>
+ explicit ExpressionValue(T Val) : Value(Val), Negative(Val < 0) {}
+
+ bool operator==(const ExpressionValue &Other) const {
+ return Value == Other.Value && isNegative() == Other.isNegative();
+ }
+
+ bool operator!=(const ExpressionValue &Other) const {
+ return !(*this == Other);
+ }
+
+ /// Returns true if value is signed and negative, false otherwise.
+ bool isNegative() const {
+ assert((Value != 0 || !Negative) && "Unexpected negative zero!");
+ return Negative;
+ }
+
+ /// \returns the value as a signed integer or an error if the value is out of
+ /// range.
+ Expected<int64_t> getSignedValue() const;
+
+ /// \returns the value as an unsigned integer or an error if the value is out
+ /// of range.
+ Expected<uint64_t> getUnsignedValue() const;
+
+  /// \returns an unsigned ExpressionValue instance whose value is the
+  /// absolute value of this object's value.
+ ExpressionValue getAbsolute() const;
+};
+
+/// Performs the operation and \returns its result, or an error in case of
+/// failure such as an overflow.
+Expected<ExpressionValue> operator+(const ExpressionValue &Lhs,
+ const ExpressionValue &Rhs);
+Expected<ExpressionValue> operator-(const ExpressionValue &Lhs,
+ const ExpressionValue &Rhs);
+Expected<ExpressionValue> operator*(const ExpressionValue &Lhs,
+ const ExpressionValue &Rhs);
+Expected<ExpressionValue> operator/(const ExpressionValue &Lhs,
+ const ExpressionValue &Rhs);
+Expected<ExpressionValue> max(const ExpressionValue &Lhs,
+ const ExpressionValue &Rhs);
+Expected<ExpressionValue> min(const ExpressionValue &Lhs,
+ const ExpressionValue &Rhs);
+
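Each of these helpers returns Expected so that wrap-around is reported as an OverflowError instead of silently producing a wrong match value. A minimal sketch of that checked-arithmetic style, assuming the GCC/Clang overflow builtins and ignoring the sign handling the real operators need:

  #include <cstdint>
  #include <optional>

  static std::optional<uint64_t> checkedAdd(uint64_t L, uint64_t R) {
    uint64_t Result;
    if (__builtin_add_overflow(L, R, &Result))
      return std::nullopt; // the real operator+ would return an OverflowError
    return Result;
  }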
/// Base class representing the AST of a given expression.
class ExpressionAST {
+private:
+ StringRef ExpressionStr;
+
public:
+ ExpressionAST(StringRef ExpressionStr) : ExpressionStr(ExpressionStr) {}
+
virtual ~ExpressionAST() = default;
+ StringRef getExpressionStr() const { return ExpressionStr; }
+
/// Evaluates and \returns the value of the expression represented by this
/// AST or an error if evaluation fails.
- virtual Expected<uint64_t> eval() const = 0;
+ virtual Expected<ExpressionValue> eval() const = 0;
+
+ /// \returns either the implicit format of this AST, a diagnostic against
+ /// \p SM if implicit formats of the AST's components conflict, or NoFormat
+ /// if the AST has no implicit format (e.g. AST is made up of a single
+ /// literal).
+ virtual Expected<ExpressionFormat>
+ getImplicitFormat(const SourceMgr &SM) const {
+ return ExpressionFormat();
+ }
};
/// Class representing an unsigned literal in the AST of an expression.
class ExpressionLiteral : public ExpressionAST {
private:
/// Actual value of the literal.
- uint64_t Value;
+ ExpressionValue Value;
public:
- /// Constructs a literal with the specified value.
- ExpressionLiteral(uint64_t Val) : Value(Val) {}
+ template <class T>
+ explicit ExpressionLiteral(StringRef ExpressionStr, T Val)
+ : ExpressionAST(ExpressionStr), Value(Val) {}
/// \returns the literal's value.
- Expected<uint64_t> eval() const override { return Value; }
+ Expected<ExpressionValue> eval() const override { return Value; }
};
/// Class to represent an undefined variable error, which quotes that
@@ -78,14 +226,40 @@ public:
}
};
+/// Class representing an expression and its matching format.
+class Expression {
+private:
+ /// Pointer to AST of the expression.
+ std::unique_ptr<ExpressionAST> AST;
+
+ /// Format to use (e.g. hex upper case letters) when matching the value.
+ ExpressionFormat Format;
+
+public:
+ /// Generic constructor for an expression represented by the given \p AST and
+ /// whose matching format is \p Format.
+ Expression(std::unique_ptr<ExpressionAST> AST, ExpressionFormat Format)
+ : AST(std::move(AST)), Format(Format) {}
+
+ /// \returns pointer to AST of the expression. Pointer is guaranteed to be
+ /// valid as long as this object is.
+ ExpressionAST *getAST() const { return AST.get(); }
+
+ ExpressionFormat getFormat() const { return Format; }
+};
+
/// Class representing a numeric variable and its associated current value.
class NumericVariable {
private:
/// Name of the numeric variable.
StringRef Name;
+ /// Format to use for expressions using this variable without an explicit
+ /// format.
+ ExpressionFormat ImplicitFormat;
+
/// Value of numeric variable, if defined, or None otherwise.
- Optional<uint64_t> Value;
+ Optional<ExpressionValue> Value;
/// Line number where this variable is defined, or None if defined before
/// input is parsed. Used to determine whether a variable is defined on the
@@ -93,20 +267,25 @@ private:
Optional<size_t> DefLineNumber;
public:
- /// Constructor for a variable \p Name defined at line \p DefLineNumber or
- /// defined before input is parsed if \p DefLineNumber is None.
- explicit NumericVariable(StringRef Name,
+ /// Constructor for a variable \p Name with implicit format \p ImplicitFormat
+ /// defined at line \p DefLineNumber or defined before input is parsed if
+ /// \p DefLineNumber is None.
+ explicit NumericVariable(StringRef Name, ExpressionFormat ImplicitFormat,
Optional<size_t> DefLineNumber = None)
- : Name(Name), DefLineNumber(DefLineNumber) {}
+ : Name(Name), ImplicitFormat(ImplicitFormat),
+ DefLineNumber(DefLineNumber) {}
/// \returns name of this numeric variable.
StringRef getName() const { return Name; }
+ /// \returns implicit format of this numeric variable.
+ ExpressionFormat getImplicitFormat() const { return ImplicitFormat; }
+
/// \returns this variable's value.
- Optional<uint64_t> getValue() const { return Value; }
+ Optional<ExpressionValue> getValue() const { return Value; }
/// Sets value of this numeric variable to \p NewValue.
- void setValue(uint64_t NewValue) { Value = NewValue; }
+ void setValue(ExpressionValue NewValue) { Value = NewValue; }
/// Clears value of this numeric variable, regardless of whether it is
/// currently defined or not.
@@ -121,22 +300,25 @@ public:
/// expression.
class NumericVariableUse : public ExpressionAST {
private:
- /// Name of the numeric variable.
- StringRef Name;
-
/// Pointer to the class instance for the variable this use is about.
NumericVariable *Variable;
public:
NumericVariableUse(StringRef Name, NumericVariable *Variable)
- : Name(Name), Variable(Variable) {}
-
+ : ExpressionAST(Name), Variable(Variable) {}
/// \returns the value of the variable referenced by this instance.
- Expected<uint64_t> eval() const override;
+ Expected<ExpressionValue> eval() const override;
+
+ /// \returns implicit format of this numeric variable.
+ Expected<ExpressionFormat>
+ getImplicitFormat(const SourceMgr &SM) const override {
+ return Variable->getImplicitFormat();
+ }
};
/// Type of functions evaluating a given binary operation.
-using binop_eval_t = uint64_t (*)(uint64_t, uint64_t);
+using binop_eval_t = Expected<ExpressionValue> (*)(const ExpressionValue &,
+ const ExpressionValue &);
/// Class representing a single binary operation in the AST of an expression.
class BinaryOperation : public ExpressionAST {
@@ -151,9 +333,10 @@ private:
binop_eval_t EvalBinop;
public:
- BinaryOperation(binop_eval_t EvalBinop, std::unique_ptr<ExpressionAST> LeftOp,
+ BinaryOperation(StringRef ExpressionStr, binop_eval_t EvalBinop,
+ std::unique_ptr<ExpressionAST> LeftOp,
std::unique_ptr<ExpressionAST> RightOp)
- : EvalBinop(EvalBinop) {
+ : ExpressionAST(ExpressionStr), EvalBinop(EvalBinop) {
LeftOperand = std::move(LeftOp);
RightOperand = std::move(RightOp);
}
@@ -162,7 +345,14 @@ public:
/// using EvalBinop on the result of recursively evaluating the operands.
/// \returns the expression value or an error if an undefined numeric
/// variable is used in one of the operands.
- Expected<uint64_t> eval() const override;
+ Expected<ExpressionValue> eval() const override;
+
+ /// \returns the implicit format of this AST, if any, a diagnostic against
+ /// \p SM if the implicit formats of the AST's components conflict, or no
+ /// format if the AST has no implicit format (e.g. AST is made of a single
+ /// literal).
+ Expected<ExpressionFormat>
+ getImplicitFormat(const SourceMgr &SM) const override;
};
class FileCheckPatternContext;
@@ -218,14 +408,14 @@ class NumericSubstitution : public Substitution {
private:
/// Pointer to the class representing the expression whose value is to be
/// substituted.
- std::unique_ptr<ExpressionAST> ExpressionASTPointer;
+ std::unique_ptr<Expression> ExpressionPointer;
public:
- NumericSubstitution(FileCheckPatternContext *Context, StringRef Expr,
- std::unique_ptr<ExpressionAST> ExprAST, size_t InsertIdx)
- : Substitution(Context, Expr, InsertIdx) {
- ExpressionASTPointer = std::move(ExprAST);
- }
+ NumericSubstitution(FileCheckPatternContext *Context, StringRef ExpressionStr,
+ std::unique_ptr<Expression> ExpressionPointer,
+ size_t InsertIdx)
+ : Substitution(Context, ExpressionStr, InsertIdx),
+ ExpressionPointer(std::move(ExpressionPointer)) {}
/// \returns a string containing the result of evaluating the expression in
/// this substitution, or an error if evaluation failed.
@@ -236,8 +426,6 @@ public:
// Pattern handling code.
//===----------------------------------------------------------------------===//
-struct FileCheckDiag;
-
/// Class holding the Pattern global state, shared by all patterns: tables
/// holding values of variables and whether they are defined or not at any
/// given time in the matching process.
@@ -270,6 +458,10 @@ private:
/// automatically free them once they are guaranteed to no longer be used.
std::vector<std::unique_ptr<NumericVariable>> NumericVariables;
+ /// Vector holding pointers to all parsed expressions. Used to automatically
+ /// free the expressions once they are guaranteed to no longer be used.
+ std::vector<std::unique_ptr<Expression>> Expressions;
+
/// Vector holding pointers to all substitutions. Used to automatically free
/// them once they are guaranteed to no longer be used.
std::vector<std::unique_ptr<Substitution>> Substitutions;
@@ -283,7 +475,7 @@ public:
/// command line, passed as a vector of [#]VAR=VAL strings in
/// \p CmdlineDefines. \returns an error list containing diagnostics against
/// \p SM for all definition parsing failures, if any, or Success otherwise.
- Error defineCmdlineVariables(std::vector<std::string> &CmdlineDefines,
+ Error defineCmdlineVariables(ArrayRef<StringRef> CmdlineDefines,
SourceMgr &SM);
/// Create @LINE pseudo variable. Value is set when pattern are being
@@ -307,10 +499,9 @@ private:
/// Makes a new numeric substitution and registers it for destruction when
/// the context is destroyed.
- Substitution *
- makeNumericSubstitution(StringRef ExpressionStr,
- std::unique_ptr<ExpressionAST> ExpressionAST,
- size_t InsertIdx);
+ Substitution *makeNumericSubstitution(StringRef ExpressionStr,
+ std::unique_ptr<Expression> Expression,
+ size_t InsertIdx);
};
/// Class to represent an error holding a diagnostic with location information
@@ -388,12 +579,12 @@ class Pattern {
std::map<StringRef, unsigned> VariableDefs;
/// Structure representing the definition of a numeric variable in a pattern.
- /// It holds the pointer to the class representing the numeric variable whose
- /// value is being defined and the number of the parenthesis group in
- /// RegExStr to capture that value.
+ /// It holds the pointer to the class instance holding the value and matching
+ /// format of the numeric variable whose value is being defined and the
+ /// number of the parenthesis group in RegExStr to capture that value.
struct NumericVariableMatch {
- /// Pointer to class representing the numeric variable whose value is being
- /// defined.
+ /// Pointer to class instance holding the value and matching format of the
+ /// numeric variable being defined.
NumericVariable *DefinedNumericVariable;
/// Number of the parenthesis group in RegExStr that captures the value of
@@ -457,12 +648,12 @@ public:
/// \p IsLegacyLineExpr indicates whether \p Expr should be a legacy @LINE
/// expression and \p Context points to the class instance holding the live
/// string and numeric variables. \returns a pointer to the class instance
- /// representing the AST of the expression whose value must be substitued, or
- /// an error holding a diagnostic against \p SM if parsing fails. If
- /// substitution was successful, sets \p DefinedNumericVariable to point to
- /// the class representing the numeric variable defined in this numeric
- /// substitution block, or None if this block does not define any variable.
- static Expected<std::unique_ptr<ExpressionAST>> parseNumericSubstitutionBlock(
+  /// representing the expression whose value must be substituted, or an error
+ /// holding a diagnostic against \p SM if parsing fails. If substitution was
+ /// successful, sets \p DefinedNumericVariable to point to the class
+ /// representing the numeric variable defined in this numeric substitution
+ /// block, or None if this block does not define any variable.
+ static Expected<std::unique_ptr<Expression>> parseNumericSubstitutionBlock(
StringRef Expr, Optional<NumericVariable *> &DefinedNumericVariable,
bool IsLegacyLineExpr, Optional<size_t> LineNumber,
FileCheckPatternContext *Context, const SourceMgr &SM);
@@ -526,7 +717,8 @@ private:
/// should defining such a variable be invalid.
static Expected<NumericVariable *> parseNumericVariableDefinition(
StringRef &Expr, FileCheckPatternContext *Context,
- Optional<size_t> LineNumber, const SourceMgr &SM);
+ Optional<size_t> LineNumber, ExpressionFormat ImplicitFormat,
+ const SourceMgr &SM);
/// Parses \p Name as a (pseudo if \p IsPseudo is true) numeric variable use
/// at line \p LineNumber, or before input is parsed if \p LineNumber is
/// None. Parameter \p Context points to the class instance holding the live
@@ -536,29 +728,56 @@ private:
static Expected<std::unique_ptr<NumericVariableUse>> parseNumericVariableUse(
StringRef Name, bool IsPseudo, Optional<size_t> LineNumber,
FileCheckPatternContext *Context, const SourceMgr &SM);
- enum class AllowedOperand { LineVar, Literal, Any };
+ enum class AllowedOperand { LineVar, LegacyLiteral, Any };
/// Parses \p Expr for use of a numeric operand at line \p LineNumber, or
- /// before input is parsed if \p LineNumber is None. Accepts both literal
- /// values and numeric variables, depending on the value of \p AO. Parameter
- /// \p Context points to the class instance holding the live string and
- /// numeric variables. \returns the class representing that operand in the
- /// AST of the expression or an error holding a diagnostic against \p SM
- /// otherwise.
+ /// before input is parsed if \p LineNumber is None. Accepts literal values,
+ /// numeric variables and function calls, depending on the value of \p AO.
+ /// \p MaybeInvalidConstraint indicates whether the text being parsed could
+ /// be an invalid constraint. \p Context points to the class instance holding
+ /// the live string and numeric variables. \returns the class representing
+ /// that operand in the AST of the expression or an error holding a
+ /// diagnostic against \p SM otherwise. If \p Expr starts with a "(" this
+ /// function will attempt to parse a parenthesized expression.
static Expected<std::unique_ptr<ExpressionAST>>
- parseNumericOperand(StringRef &Expr, AllowedOperand AO,
+  parseNumericOperand(StringRef &Expr, AllowedOperand AO,
+                      bool MaybeInvalidConstraint,
Optional<size_t> LineNumber,
FileCheckPatternContext *Context, const SourceMgr &SM);
- /// Parses \p Expr for a binary operation at line \p LineNumber, or before
- /// input is parsed if \p LineNumber is None. The left operand of this binary
- /// operation is given in \p LeftOp and \p IsLegacyLineExpr indicates whether
- /// we are parsing a legacy @LINE expression. Parameter \p Context points to
- /// the class instance holding the live string and numeric variables.
- /// \returns the class representing the binary operation in the AST of the
- /// expression, or an error holding a diagnostic against \p SM otherwise.
+ /// Parses and updates \p RemainingExpr for a binary operation at line
+ /// \p LineNumber, or before input is parsed if \p LineNumber is None. The
+ /// left operand of this binary operation is given in \p LeftOp and \p Expr
+ /// holds the string for the full expression, including the left operand.
+ /// Parameter \p IsLegacyLineExpr indicates whether we are parsing a legacy
+ /// @LINE expression. Parameter \p Context points to the class instance
+ /// holding the live string and numeric variables. \returns the class
+ /// representing the binary operation in the AST of the expression, or an
+ /// error holding a diagnostic against \p SM otherwise.
+ static Expected<std::unique_ptr<ExpressionAST>>
+ parseBinop(StringRef Expr, StringRef &RemainingExpr,
+ std::unique_ptr<ExpressionAST> LeftOp, bool IsLegacyLineExpr,
+ Optional<size_t> LineNumber, FileCheckPatternContext *Context,
+ const SourceMgr &SM);
+
+ /// Parses a parenthesized expression inside \p Expr at line \p LineNumber, or
+ /// before input is parsed if \p LineNumber is None. \p Expr must start with
+ /// a '('. Accepts both literal values and numeric variables. Parameter \p
+ /// Context points to the class instance holding the live string and numeric
+ /// variables. \returns the class representing that operand in the AST of the
+ /// expression or an error holding a diagnostic against \p SM otherwise.
+ static Expected<std::unique_ptr<ExpressionAST>>
+ parseParenExpr(StringRef &Expr, Optional<size_t> LineNumber,
+ FileCheckPatternContext *Context, const SourceMgr &SM);
+
+  /// Parses \p Expr for an argument list belonging to a call to function
+  /// \p FuncName at line \p LineNumber, or before input is parsed if
+  /// \p LineNumber is None. The source range of \p FuncName is used for
+  /// diagnostics. Parameter \p Context points to the class instance holding
+  /// the live string and numeric variables. \returns the class representing
+  /// that call in the AST of the expression or an error holding a diagnostic
+  /// against \p SM otherwise.
static Expected<std::unique_ptr<ExpressionAST>>
- parseBinop(StringRef &Expr, std::unique_ptr<ExpressionAST> LeftOp,
- bool IsLegacyLineExpr, Optional<size_t> LineNumber,
- FileCheckPatternContext *Context, const SourceMgr &SM);
+ parseCallExpr(StringRef &Expr, StringRef FuncName,
+ Optional<size_t> LineNumber, FileCheckPatternContext *Context,
+ const SourceMgr &SM);
};
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Support/FileCollector.cpp b/llvm/lib/Support/FileCollector.cpp
index 47fca6413722..59755556a5a3 100644
--- a/llvm/lib/Support/FileCollector.cpp
+++ b/llvm/lib/Support/FileCollector.cpp
@@ -8,6 +8,7 @@
#include "llvm/Support/FileCollector.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Process.h"
@@ -34,7 +35,6 @@ static bool isCaseSensitivePath(StringRef Path) {
FileCollector::FileCollector(std::string Root, std::string OverlayRoot)
: Root(std::move(Root)), OverlayRoot(std::move(OverlayRoot)) {
- sys::fs::create_directories(this->Root, true);
}
bool FileCollector::getRealPath(StringRef SrcPath,
@@ -51,7 +51,7 @@ bool FileCollector::getRealPath(StringRef SrcPath,
auto EC = sys::fs::real_path(Directory, RealPath);
if (EC)
return false;
- SymlinkMap[Directory] = RealPath.str();
+ SymlinkMap[Directory] = std::string(RealPath.str());
} else {
RealPath = DirWithSymlink->second;
}
@@ -61,13 +61,19 @@ bool FileCollector::getRealPath(StringRef SrcPath,
return true;
}
-void FileCollector::addFile(const Twine &file) {
+void FileCollector::addFile(const Twine &File) {
std::lock_guard<std::mutex> lock(Mutex);
- std::string FileStr = file.str();
+ std::string FileStr = File.str();
if (markAsSeen(FileStr))
addFileImpl(FileStr);
}
+void FileCollector::addDirectory(const Twine &Dir) {
+ assert(sys::fs::is_directory(Dir));
+ std::error_code EC;
+ addDirectoryImpl(Dir, vfs::getRealFileSystem(), EC);
+}
+
void FileCollector::addFileImpl(StringRef SrcPath) {
// We need an absolute src path to append to the root.
SmallString<256> AbsoluteSrc = SrcPath;
@@ -101,6 +107,27 @@ void FileCollector::addFileImpl(StringRef SrcPath) {
addFileToMapping(VirtualPath, DstPath);
}
+llvm::vfs::directory_iterator
+FileCollector::addDirectoryImpl(const llvm::Twine &Dir,
+ IntrusiveRefCntPtr<vfs::FileSystem> FS,
+ std::error_code &EC) {
+ auto It = FS->dir_begin(Dir, EC);
+ if (EC)
+ return It;
+ addFile(Dir);
+ for (; !EC && It != llvm::vfs::directory_iterator(); It.increment(EC)) {
+ if (It->type() == sys::fs::file_type::regular_file ||
+ It->type() == sys::fs::file_type::directory_file ||
+ It->type() == sys::fs::file_type::symlink_file) {
+ addFile(It->path());
+ }
+ }
+ if (EC)
+ return It;
+ // Return a new iterator.
+ return FS->dir_begin(Dir, EC);
+}
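addDirectoryImpl records the directory itself plus its immediate regular files, sub-directories and symlinks, then returns a fresh iterator so the caller sees an unconsumed listing; deeper levels are collected as the collector VFS below iterates into them. A hypothetical caller of the public entry point, assuming the matching addDirectory declaration this patch adds to FileCollector.h:

  #include "llvm/Support/FileCollector.h"

  void collectHeaders(llvm::FileCollector &Collector) {
    // Records the directory and its direct children in the VFS mapping;
    // nested directories are picked up when the collector VFS walks them.
    Collector.addDirectory("/tmp/sysroot/usr/include"); // path is illustrative
  }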
+
/// Set the access and modification time for the given file from the given
/// status object.
static std::error_code
@@ -123,6 +150,13 @@ copyAccessAndModificationTime(StringRef Filename,
}
std::error_code FileCollector::copyFiles(bool StopOnError) {
+ auto Err = sys::fs::create_directories(Root, /*IgnoreExisting=*/true);
+ if (Err) {
+ return Err;
+ }
+
+ std::lock_guard<std::mutex> lock(Mutex);
+
for (auto &entry : VFSWriter.getMappings()) {
// Create directory tree.
if (std::error_code EC =
@@ -171,7 +205,7 @@ std::error_code FileCollector::copyFiles(bool StopOnError) {
return {};
}
-std::error_code FileCollector::writeMapping(StringRef mapping_file) {
+std::error_code FileCollector::writeMapping(StringRef MappingFile) {
std::lock_guard<std::mutex> lock(Mutex);
VFSWriter.setOverlayDir(OverlayRoot);
@@ -179,7 +213,7 @@ std::error_code FileCollector::writeMapping(StringRef mapping_file) {
VFSWriter.setUseExternalNames(false);
std::error_code EC;
- raw_fd_ostream os(mapping_file, EC, sys::fs::OF_Text);
+ raw_fd_ostream os(MappingFile, EC, sys::fs::OF_Text);
if (EC)
return EC;
@@ -188,7 +222,7 @@ std::error_code FileCollector::writeMapping(StringRef mapping_file) {
return {};
}
-namespace {
+namespace llvm {
class FileCollectorFileSystem : public vfs::FileSystem {
public:
@@ -213,22 +247,7 @@ public:
llvm::vfs::directory_iterator dir_begin(const llvm::Twine &Dir,
std::error_code &EC) override {
- auto It = FS->dir_begin(Dir, EC);
- if (EC)
- return It;
- // Collect everything that's listed in case the user needs it.
- Collector->addFile(Dir);
- for (; !EC && It != llvm::vfs::directory_iterator(); It.increment(EC)) {
- if (It->type() == sys::fs::file_type::regular_file ||
- It->type() == sys::fs::file_type::directory_file ||
- It->type() == sys::fs::file_type::symlink_file) {
- Collector->addFile(It->path());
- }
- }
- if (EC)
- return It;
- // Return a new iterator.
- return FS->dir_begin(Dir, EC);
+ return Collector->addDirectoryImpl(Dir, FS, EC);
}
std::error_code getRealPath(const Twine &Path,
@@ -259,7 +278,7 @@ private:
std::shared_ptr<FileCollector> Collector;
};
-} // end anonymous namespace
+} // namespace llvm
IntrusiveRefCntPtr<vfs::FileSystem>
FileCollector::createCollectorVFS(IntrusiveRefCntPtr<vfs::FileSystem> BaseFS,
diff --git a/llvm/lib/Support/FileOutputBuffer.cpp b/llvm/lib/Support/FileOutputBuffer.cpp
index 0a5306f684d4..3342682270dc 100644
--- a/llvm/lib/Support/FileOutputBuffer.cpp
+++ b/llvm/lib/Support/FileOutputBuffer.cpp
@@ -12,8 +12,8 @@
#include "llvm/Support/FileOutputBuffer.h"
#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Errc.h"
+#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Memory.h"
#include "llvm/Support/Path.h"
#include <system_error>
@@ -172,6 +172,10 @@ FileOutputBuffer::create(StringRef Path, size_t Size, unsigned Flags) {
if (Flags & F_executable)
Mode |= fs::all_exe;
+ // If Size is zero, don't use mmap which will fail with EINVAL.
+ if (Size == 0)
+ return createInMemoryBuffer(Path, Size, Mode);
+
fs::file_status Stat;
fs::status(Path, Stat);
diff --git a/llvm/lib/Support/FileUtilities.cpp b/llvm/lib/Support/FileUtilities.cpp
index d11fbb54dc0d..e4a86bb69de4 100644
--- a/llvm/lib/Support/FileUtilities.cpp
+++ b/llvm/lib/Support/FileUtilities.cpp
@@ -14,6 +14,7 @@
#include "llvm/Support/FileUtilities.h"
#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/MemoryBuffer.h"
@@ -92,9 +93,9 @@ static bool CompareNumbers(const char *&F1P, const char *&F2P,
// If one of the positions is at a space and the other isn't, chomp up 'til
// the end of the space.
- while (isspace(static_cast<unsigned char>(*F1P)) && F1P != F1End)
+ while (isSpace(static_cast<unsigned char>(*F1P)) && F1P != F1End)
++F1P;
- while (isspace(static_cast<unsigned char>(*F2P)) && F2P != F2End)
+ while (isSpace(static_cast<unsigned char>(*F2P)) && F2P != F2End)
++F2P;
// If we stop on numbers, compare their difference.
@@ -318,9 +319,8 @@ llvm::Error llvm::writeFileAtomically(
atomic_write_error::output_stream_error);
}
- if (const std::error_code Error =
- sys::fs::rename(/*from=*/GeneratedUniqPath.c_str(),
- /*to=*/FinalPath.str().c_str())) {
+ if (sys::fs::rename(/*from=*/GeneratedUniqPath.c_str(),
+ /*to=*/FinalPath.str().c_str())) {
return llvm::make_error<AtomicFileWriteError>(
atomic_write_error::failed_to_rename_temp_file);
}
diff --git a/llvm/lib/Support/FoldingSet.cpp b/llvm/lib/Support/FoldingSet.cpp
index ce6f196e1060..e3d7168305af 100644
--- a/llvm/lib/Support/FoldingSet.cpp
+++ b/llvm/lib/Support/FoldingSet.cpp
@@ -13,6 +13,7 @@
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/Hashing.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Allocator.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
@@ -85,6 +86,10 @@ void FoldingSetNodeID::AddInteger(unsigned long long I) {
void FoldingSetNodeID::AddString(StringRef String) {
unsigned Size = String.size();
+
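+  // AddString packs four characters per element and stores the size in one
+  // extra element, so 1 + ceil(Size / 4) insertions are needed at most.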
+ unsigned NumInserts = 1 + divideCeil(Size, 4);
+ Bits.reserve(Bits.size() + NumInserts);
+
Bits.push_back(Size);
if (!Size) return;
@@ -223,8 +228,6 @@ static void **AllocateBuckets(unsigned NumBuckets) {
//===----------------------------------------------------------------------===//
// FoldingSetBase Implementation
-void FoldingSetBase::anchor() {}
-
FoldingSetBase::FoldingSetBase(unsigned Log2InitSize) {
assert(5 < Log2InitSize && Log2InitSize < 32 &&
"Initial hash table size out of range");
@@ -266,8 +269,10 @@ void FoldingSetBase::clear() {
NumNodes = 0;
}
-void FoldingSetBase::GrowBucketCount(unsigned NewBucketCount) {
- assert((NewBucketCount > NumBuckets) && "Can't shrink a folding set with GrowBucketCount");
+void FoldingSetBase::GrowBucketCount(unsigned NewBucketCount,
+ const FoldingSetInfo &Info) {
+ assert((NewBucketCount > NumBuckets) &&
+ "Can't shrink a folding set with GrowBucketCount");
assert(isPowerOf2_32(NewBucketCount) && "Bad bucket count!");
void **OldBuckets = Buckets;
unsigned OldNumBuckets = NumBuckets;
@@ -290,8 +295,9 @@ void FoldingSetBase::GrowBucketCount(unsigned NewBucketCount) {
// Insert the node into the new bucket, after recomputing the hash.
InsertNode(NodeInBucket,
- GetBucketFor(ComputeNodeHash(NodeInBucket, TempID),
- Buckets, NumBuckets));
+ GetBucketFor(Info.ComputeNodeHash(this, NodeInBucket, TempID),
+ Buckets, NumBuckets),
+ Info);
TempID.clear();
}
}
@@ -301,25 +307,24 @@ void FoldingSetBase::GrowBucketCount(unsigned NewBucketCount) {
/// GrowHashTable - Double the size of the hash table and rehash everything.
///
-void FoldingSetBase::GrowHashTable() {
- GrowBucketCount(NumBuckets * 2);
+void FoldingSetBase::GrowHashTable(const FoldingSetInfo &Info) {
+ GrowBucketCount(NumBuckets * 2, Info);
}
-void FoldingSetBase::reserve(unsigned EltCount) {
+void FoldingSetBase::reserve(unsigned EltCount, const FoldingSetInfo &Info) {
// This will give us somewhere between EltCount / 2 and
// EltCount buckets. This puts us in the load factor
// range of 1.0 - 2.0.
if(EltCount < capacity())
return;
- GrowBucketCount(PowerOf2Floor(EltCount));
+ GrowBucketCount(PowerOf2Floor(EltCount), Info);
}
/// FindNodeOrInsertPos - Look up the node specified by ID. If it exists,
/// return it. If not, return the insertion token that will make insertion
/// faster.
-FoldingSetBase::Node *
-FoldingSetBase::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
- void *&InsertPos) {
+FoldingSetBase::Node *FoldingSetBase::FindNodeOrInsertPos(
+ const FoldingSetNodeID &ID, void *&InsertPos, const FoldingSetInfo &Info) {
unsigned IDHash = ID.ComputeHash();
void **Bucket = GetBucketFor(IDHash, Buckets, NumBuckets);
void *Probe = *Bucket;
@@ -328,7 +333,7 @@ FoldingSetBase::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
FoldingSetNodeID TempID;
while (Node *NodeInBucket = GetNextPtr(Probe)) {
- if (NodeEquals(NodeInBucket, ID, IDHash, TempID))
+ if (Info.NodeEquals(this, NodeInBucket, ID, IDHash, TempID))
return NodeInBucket;
TempID.clear();
@@ -343,13 +348,15 @@ FoldingSetBase::FindNodeOrInsertPos(const FoldingSetNodeID &ID,
/// InsertNode - Insert the specified node into the folding set, knowing that it
/// is not already in the map. InsertPos must be obtained from
/// FindNodeOrInsertPos.
-void FoldingSetBase::InsertNode(Node *N, void *InsertPos) {
+void FoldingSetBase::InsertNode(Node *N, void *InsertPos,
+ const FoldingSetInfo &Info) {
assert(!N->getNextInBucket());
// Do we need to grow the hashtable?
if (NumNodes+1 > capacity()) {
- GrowHashTable();
+ GrowHashTable(Info);
FoldingSetNodeID TempID;
- InsertPos = GetBucketFor(ComputeNodeHash(N, TempID), Buckets, NumBuckets);
+ InsertPos = GetBucketFor(Info.ComputeNodeHash(this, N, TempID), Buckets,
+ NumBuckets);
}
++NumNodes;
@@ -413,13 +420,15 @@ bool FoldingSetBase::RemoveNode(Node *N) {
/// GetOrInsertNode - If there is an existing simple Node exactly
/// equal to the specified node, return it. Otherwise, insert 'N' and it
/// instead.
-FoldingSetBase::Node *FoldingSetBase::GetOrInsertNode(FoldingSetBase::Node *N) {
+FoldingSetBase::Node *
+FoldingSetBase::GetOrInsertNode(FoldingSetBase::Node *N,
+ const FoldingSetInfo &Info) {
FoldingSetNodeID ID;
- GetNodeProfile(N, ID);
+ Info.GetNodeProfile(this, N, ID);
void *IP;
- if (Node *E = FindNodeOrInsertPos(ID, IP))
+ if (Node *E = FindNodeOrInsertPos(ID, IP, Info))
return E;
- InsertNode(N, IP);
+ InsertNode(N, IP, Info);
return N;
}
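
Two things change in this file: the FoldingSetBase primitives now take an explicit FoldingSetInfo argument carrying the profile/hash/equality callbacks (the removed anchor() is consistent with dropping the previous virtual dispatch), and AddString reserves its scratch space up front. The reserve amount follows from how a string is packed into 32-bit words: one word for the length plus one word per four bytes of payload, i.e. 1 + divideCeil(Size, 4) insertions. A standalone sketch of that sizing arithmetic; the packing below is a simplified little-endian illustration, not the exact AddString encoding:

  #include <cstdint>
  #include <cstdio>
  #include <string>
  #include <vector>

  // Sketch of the AddString sizing math: one 32-bit word for the length,
  // then ceil(Size / 4) words of character payload, so reserving
  // 1 + (Size + 3) / 4 extra slots avoids repeated reallocation.
  static void packString(std::vector<std::uint32_t> &Bits, const std::string &S) {
    std::size_t Size = S.size();
    Bits.reserve(Bits.size() + 1 + (Size + 3) / 4);
    Bits.push_back(static_cast<std::uint32_t>(Size));
    for (std::size_t I = 0; I < Size; I += 4) {
      std::uint32_t Word = 0;
      for (std::size_t J = 0; J < 4 && I + J < Size; ++J)
        Word |= std::uint32_t(static_cast<unsigned char>(S[I + J])) << (8 * J);
      Bits.push_back(Word);
    }
  }

  int main() {
    std::vector<std::uint32_t> Bits;
    packString(Bits, "hello world");                 // 11 bytes of payload
    std::printf("%zu words\n", Bits.size());         // 1 length word + 3 payload words
  }
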
diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp
index f9e89f69b528..632e879e540d 100644
--- a/llvm/lib/Support/FormatVariadic.cpp
+++ b/llvm/lib/Support/FormatVariadic.cpp
@@ -6,6 +6,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/FormatVariadic.h"
+#include <cassert>
using namespace llvm;
@@ -140,9 +141,9 @@ formatv_object_base::splitLiteralAndReplacement(StringRef Fmt) {
return std::make_pair(ReplacementItem{Fmt}, StringRef());
}
-std::vector<ReplacementItem>
+SmallVector<ReplacementItem, 2>
formatv_object_base::parseFormatString(StringRef Fmt) {
- std::vector<ReplacementItem> Replacements;
+ SmallVector<ReplacementItem, 2> Replacements;
ReplacementItem I;
while (!Fmt.empty()) {
std::tie(I, Fmt) = splitLiteralAndReplacement(Fmt);
diff --git a/llvm/lib/Support/FormattedStream.cpp b/llvm/lib/Support/FormattedStream.cpp
index 4eb747038bb9..5716afc187e4 100644
--- a/llvm/lib/Support/FormattedStream.cpp
+++ b/llvm/lib/Support/FormattedStream.cpp
@@ -11,7 +11,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/Unicode.h"
#include "llvm/Support/raw_ostream.h"
#include <algorithm>
@@ -19,16 +21,22 @@ using namespace llvm;
/// UpdatePosition - Examine the given char sequence and figure out which
/// column we end up in after output, and how many line breaks are contained.
-///
-static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *Ptr, size_t Size) {
+/// This assumes that the input string is well-formed UTF-8, and takes into
+/// account Unicode characters which render as multiple columns wide.
+void formatted_raw_ostream::UpdatePosition(const char *Ptr, size_t Size) {
unsigned &Column = Position.first;
unsigned &Line = Position.second;
- // Keep track of the current column and line by scanning the string for
- // special characters
- for (const char *End = Ptr + Size; Ptr != End; ++Ptr) {
- ++Column;
- switch (*Ptr) {
+ auto ProcessUTF8CodePoint = [&Line, &Column](StringRef CP) {
+ int Width = sys::unicode::columnWidthUTF8(CP);
+ if (Width != sys::unicode::ErrorNonPrintableCharacter)
+ Column += Width;
+
+ // The only special whitespace characters we care about are single-byte.
+ if (CP.size() > 1)
+ return;
+
+ switch (CP[0]) {
case '\n':
Line += 1;
LLVM_FALLTHROUGH;
@@ -40,6 +48,46 @@ static void UpdatePosition(std::pair<unsigned, unsigned> &Position, const char *
Column += (8 - (Column & 0x7)) & 0x7;
break;
}
+ };
+
+ // If we have a partial UTF-8 sequence from the previous buffer, check that
+ // first.
+ if (PartialUTF8Char.size()) {
+ size_t BytesFromBuffer =
+ getNumBytesForUTF8(PartialUTF8Char[0]) - PartialUTF8Char.size();
+ if (Size < BytesFromBuffer) {
+ // If we still don't have enough bytes for a complete code point, just
+ // append what we have.
+ PartialUTF8Char.append(StringRef(Ptr, Size));
+ return;
+ } else {
+ // The first few bytes from the buffer will complete the code point.
+ // Concatenate them and process their effect on the line and column
+ // numbers.
+ PartialUTF8Char.append(StringRef(Ptr, BytesFromBuffer));
+ ProcessUTF8CodePoint(PartialUTF8Char);
+ PartialUTF8Char.clear();
+ Ptr += BytesFromBuffer;
+ Size -= BytesFromBuffer;
+ }
+ }
+
+ // Now scan the rest of the buffer.
+ unsigned NumBytes;
+ for (const char *End = Ptr + Size; Ptr < End; Ptr += NumBytes) {
+ NumBytes = getNumBytesForUTF8(*Ptr);
+
+ // The buffer might end part way through a UTF-8 code unit sequence for a
+ // Unicode scalar value if it got flushed. If this happens, we can't know
+ // the display width until we see the rest of the code point. Stash the
+ // bytes we do have, so that we can reconstruct the whole code point later,
+ // even if the buffer is being flushed.
+ if ((unsigned)(End - Ptr) < NumBytes) {
+ PartialUTF8Char = StringRef(Ptr, End - Ptr);
+ return;
+ }
+
+ ProcessUTF8CodePoint(StringRef(Ptr, NumBytes));
}
}
@@ -52,9 +100,9 @@ void formatted_raw_ostream::ComputePosition(const char *Ptr, size_t Size) {
if (Ptr <= Scanned && Scanned <= Ptr + Size)
// Scan all characters added since our last scan to determine the new
// column.
- UpdatePosition(Position, Scanned, Size - (Scanned - Ptr));
+ UpdatePosition(Scanned, Size - (Scanned - Ptr));
else
- UpdatePosition(Position, Ptr, Size);
+ UpdatePosition(Ptr, Size);
// Update the scanning pointer.
Scanned = Ptr + Size;
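
With this change UpdatePosition can no longer assume one byte per column: it asks columnWidthUTF8 for the display width of each code point, and because the stream may flush in the middle of a multi-byte sequence, the incomplete bytes are stashed in PartialUTF8Char until the next buffer completes them. A rough standalone sketch of the boundary handling; utf8SequenceLength plays the role of getNumBytesForUTF8 and is simplified:

  #include <cstddef>
  #include <string>

  // Length of a UTF-8 sequence from its leading byte (simplified stand-in
  // for getNumBytesForUTF8).
  static unsigned utf8SequenceLength(unsigned char Lead) {
    if (Lead < 0x80)           return 1; // ASCII
    if ((Lead & 0xE0) == 0xC0) return 2;
    if ((Lead & 0xF0) == 0xE0) return 3;
    if ((Lead & 0xF8) == 0xF0) return 4;
    return 1;                            // invalid lead byte: consume one byte
  }

  // Walk a buffer that may be cut off in the middle of a code point; the
  // incomplete tail is stashed so the next buffer can complete it, which is
  // the same idea as PartialUTF8Char above.
  static void scanBuffer(const char *Ptr, std::size_t Size, std::string &Partial) {
    const char *End = Ptr + Size;
    while (Ptr < End) {
      unsigned Len = utf8SequenceLength(static_cast<unsigned char>(*Ptr));
      if (static_cast<std::size_t>(End - Ptr) < Len) {
        Partial.assign(Ptr, End); // incomplete code point, keep it for later
        return;
      }
      // ... measure the code point at [Ptr, Ptr + Len) and update the column ...
      Ptr += Len;
    }
  }
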
diff --git a/llvm/lib/Support/GraphWriter.cpp b/llvm/lib/Support/GraphWriter.cpp
index c689a81925d4..d8aae9260323 100644
--- a/llvm/lib/Support/GraphWriter.cpp
+++ b/llvm/lib/Support/GraphWriter.cpp
@@ -76,17 +76,42 @@ StringRef llvm::DOT::getColorString(unsigned ColorNumber) {
return Colors[ColorNumber % NumColors];
}
+static std::string replaceIllegalFilenameChars(std::string Filename,
+ const char ReplacementChar) {
+#ifdef _WIN32
+ std::string IllegalChars = "\\/:?\"<>|";
+#else
+ std::string IllegalChars = "/";
+#endif
+
+ for (char IllegalChar : IllegalChars) {
+ std::replace(Filename.begin(), Filename.end(), IllegalChar,
+ ReplacementChar);
+ }
+
+ return Filename;
+}
+
std::string llvm::createGraphFilename(const Twine &Name, int &FD) {
FD = -1;
SmallString<128> Filename;
- std::error_code EC = sys::fs::createTemporaryFile(Name, "dot", FD, Filename);
+
+ // Windows can't always handle long paths, so limit the length of the name.
+ std::string N = Name.str();
+ N = N.substr(0, std::min<std::size_t>(N.size(), 140));
+
+ // Replace illegal characters in graph Filename with '_' if needed
+ std::string CleansedName = replaceIllegalFilenameChars(N, '_');
+
+ std::error_code EC =
+ sys::fs::createTemporaryFile(CleansedName, "dot", FD, Filename);
if (EC) {
errs() << "Error: " << EC.message() << "\n";
return "";
}
errs() << "Writing '" << Filename << "'... ";
- return Filename.str();
+ return std::string(Filename.str());
}
// Execute the graph viewer. Return true if there were errors.
@@ -147,7 +172,7 @@ static const char *getProgramName(GraphProgram::Name program) {
bool llvm::DisplayGraph(StringRef FilenameRef, bool wait,
GraphProgram::Name program) {
- std::string Filename = FilenameRef;
+ std::string Filename = std::string(FilenameRef);
std::string ErrMsg;
std::string ViewerPath;
GraphSession S;
diff --git a/llvm/lib/Support/Host.cpp b/llvm/lib/Support/Host.cpp
index ef38c1c09413..658c1ee74cfe 100644
--- a/llvm/lib/Support/Host.cpp
+++ b/llvm/lib/Support/Host.cpp
@@ -11,9 +11,9 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/Host.h"
-#include "llvm/Support/TargetParser.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
@@ -21,6 +21,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/X86TargetParser.h"
#include "llvm/Support/raw_ostream.h"
#include <assert.h>
#include <string.h>
@@ -28,6 +29,7 @@
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Host.inc"
+#include <sched.h>
#endif
#ifdef _WIN32
#include "Windows/Host.inc"
@@ -140,6 +142,7 @@ StringRef sys::detail::getHostCPUNameForPowerPC(StringRef ProcCpuinfoContent) {
.Case("POWER8E", "pwr8")
.Case("POWER8NVL", "pwr8")
.Case("POWER9", "pwr9")
+ .Case("POWER10", "pwr10")
// FIXME: If we get a simulator or machine with the capabilities of
// mcpu=future, we should revisit this and add the name reported by the
// simulator/machine.
@@ -178,6 +181,8 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
// The CPU part is a 3 digit hexadecimal number with a 0x prefix. The
// values correspond to the "Part number" in the CP15/c0 register. The
// contents are specified in the various processor manuals.
+ // This corresponds to the Main ID Register in Technical Reference Manuals
+ // and is used in programs like sys-utils.
return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
.Case("0x926", "arm926ej-s")
.Case("0xb02", "mpcore")
@@ -190,6 +195,8 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xc20", "cortex-m0")
.Case("0xc23", "cortex-m3")
.Case("0xc24", "cortex-m4")
+ .Case("0xd22", "cortex-m55")
+ .Case("0xd02", "cortex-a34")
.Case("0xd04", "cortex-a35")
.Case("0xd03", "cortex-a53")
.Case("0xd07", "cortex-a57")
@@ -197,6 +204,10 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
.Case("0xd09", "cortex-a73")
.Case("0xd0a", "cortex-a75")
.Case("0xd0b", "cortex-a76")
+ .Case("0xd0d", "cortex-a77")
+ .Case("0xd41", "cortex-a78")
+ .Case("0xd44", "cortex-x1")
+ .Case("0xd0c", "neoverse-n1")
.Default("generic");
}
@@ -215,6 +226,26 @@ StringRef sys::detail::getHostCPUNameForARM(StringRef ProcCpuinfoContent) {
}
}
+ if (Implementer == "0x46") { // Fujitsu Ltd.
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("CPU part")) {
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x001", "a64fx")
+ .Default("generic");
+ }
+ }
+ }
+
+ if (Implementer == "0x4e") { // NVIDIA Corporation
+ for (unsigned I = 0, E = Lines.size(); I != E; ++I) {
+ if (Lines[I].startswith("CPU part")) {
+ return StringSwitch<const char *>(Lines[I].substr(8).ltrim("\t :"))
+ .Case("0x004", "carmel")
+ .Default("generic");
+ }
+ }
+ }
+
if (Implementer == "0x48") // HiSilicon Technologies, Inc.
// Look for the CPU part line.
for (unsigned I = 0, E = Lines.size(); I != E; ++I)
@@ -552,58 +583,32 @@ static void detectX86FamilyModel(unsigned EAX, unsigned *Family,
}
}
-static void
+static StringRef
getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
- unsigned Brand_id, unsigned Features,
- unsigned Features2, unsigned Features3,
+ const unsigned *Features,
unsigned *Type, unsigned *Subtype) {
- if (Brand_id != 0)
- return;
+ auto testFeature = [&](unsigned F) {
+ return (Features[F / 32] & (1U << (F % 32))) != 0;
+ };
+
+ StringRef CPU;
+
switch (Family) {
case 3:
- *Type = X86::INTEL_i386;
+ CPU = "i386";
break;
case 4:
- *Type = X86::INTEL_i486;
+ CPU = "i486";
break;
case 5:
- if (Features & (1 << X86::FEATURE_MMX)) {
- *Type = X86::INTEL_PENTIUM_MMX;
+ if (testFeature(X86::FEATURE_MMX)) {
+ CPU = "pentium-mmx";
break;
}
- *Type = X86::INTEL_PENTIUM;
+ CPU = "pentium";
break;
case 6:
switch (Model) {
- case 0x01: // Pentium Pro processor
- *Type = X86::INTEL_PENTIUM_PRO;
- break;
- case 0x03: // Intel Pentium II OverDrive processor, Pentium II processor,
- // model 03
- case 0x05: // Pentium II processor, model 05, Pentium II Xeon processor,
- // model 05, and Intel Celeron processor, model 05
- case 0x06: // Celeron processor, model 06
- *Type = X86::INTEL_PENTIUM_II;
- break;
- case 0x07: // Pentium III processor, model 07, and Pentium III Xeon
- // processor, model 07
- case 0x08: // Pentium III processor, model 08, Pentium III Xeon processor,
- // model 08, and Celeron processor, model 08
- case 0x0a: // Pentium III Xeon processor, model 0Ah
- case 0x0b: // Pentium III processor, model 0Bh
- *Type = X86::INTEL_PENTIUM_III;
- break;
- case 0x09: // Intel Pentium M processor, Intel Celeron M processor model 09.
- case 0x0d: // Intel Pentium M processor, Intel Celeron M processor, model
- // 0Dh. All processors are manufactured using the 90 nm process.
- case 0x15: // Intel EP80579 Integrated Processor and Intel EP80579
- // Integrated Processor with Intel QuickAssist Technology
- *Type = X86::INTEL_PENTIUM_M;
- break;
- case 0x0e: // Intel Core Duo processor, Intel Core Solo processor, model
- // 0Eh. All processors are manufactured using the 65 nm process.
- *Type = X86::INTEL_CORE_DUO;
- break; // yonah
case 0x0f: // Intel Core 2 Duo processor, Intel Core 2 Duo mobile
// processor, Intel Core 2 Quad processor, Intel Core 2 Quad
// mobile processor, Intel Core 2 Extreme processor, Intel
@@ -611,8 +616,8 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
// 0Fh. All processors are manufactured using the 65 nm process.
case 0x16: // Intel Celeron processor model 16h. All processors are
// manufactured using the 65 nm process
- *Type = X86::INTEL_CORE2; // "core2"
- *Subtype = X86::INTEL_CORE2_65;
+ CPU = "core2";
+ *Type = X86::INTEL_CORE2;
break;
case 0x17: // Intel Core 2 Extreme processor, Intel Xeon processor, model
// 17h. All processors are manufactured using the 45 nm process.
@@ -620,34 +625,38 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
// 45nm: Penryn , Wolfdale, Yorkfield (XE)
case 0x1d: // Intel Xeon processor MP. All processors are manufactured using
// the 45 nm process.
- *Type = X86::INTEL_CORE2; // "penryn"
- *Subtype = X86::INTEL_CORE2_45;
+ CPU = "penryn";
+ *Type = X86::INTEL_CORE2;
break;
case 0x1a: // Intel Core i7 processor and Intel Xeon processor. All
// processors are manufactured using the 45 nm process.
case 0x1e: // Intel(R) Core(TM) i7 CPU 870 @ 2.93GHz.
// As found in a Summer 2010 model iMac.
case 0x1f:
- case 0x2e: // Nehalem EX
- *Type = X86::INTEL_COREI7; // "nehalem"
+ case 0x2e: // Nehalem EX
+ CPU = "nehalem";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_NEHALEM;
break;
case 0x25: // Intel Core i7, laptop version.
case 0x2c: // Intel Core i7 processor and Intel Xeon processor. All
// processors are manufactured using the 32 nm process.
case 0x2f: // Westmere EX
- *Type = X86::INTEL_COREI7; // "westmere"
+ CPU = "westmere";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_WESTMERE;
break;
case 0x2a: // Intel Core i7 processor. All processors are manufactured
// using the 32 nm process.
case 0x2d:
- *Type = X86::INTEL_COREI7; //"sandybridge"
+ CPU = "sandybridge";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_SANDYBRIDGE;
break;
case 0x3a:
- case 0x3e: // Ivy Bridge EP
- *Type = X86::INTEL_COREI7; // "ivybridge"
+ case 0x3e: // Ivy Bridge EP
+ CPU = "ivybridge";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_IVYBRIDGE;
break;
@@ -656,7 +665,8 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
case 0x3f:
case 0x45:
case 0x46:
- *Type = X86::INTEL_COREI7; // "haswell"
+ CPU = "haswell";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_HASWELL;
break;
@@ -665,7 +675,8 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
case 0x47:
case 0x4f:
case 0x56:
- *Type = X86::INTEL_COREI7; // "broadwell"
+ CPU = "broadwell";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_BROADWELL;
break;
@@ -674,39 +685,49 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
case 0x5e: // Skylake desktop
case 0x8e: // Kaby Lake mobile
case 0x9e: // Kaby Lake desktop
- *Type = X86::INTEL_COREI7; // "skylake"
+ case 0xa5: // Comet Lake-H/S
+ case 0xa6: // Comet Lake-U
+ CPU = "skylake";
+ *Type = X86::INTEL_COREI7;
*Subtype = X86::INTEL_COREI7_SKYLAKE;
break;
// Skylake Xeon:
case 0x55:
*Type = X86::INTEL_COREI7;
- if (Features2 & (1 << (X86::FEATURE_AVX512BF16 - 32)))
- *Subtype = X86::INTEL_COREI7_COOPERLAKE; // "cooperlake"
- else if (Features2 & (1 << (X86::FEATURE_AVX512VNNI - 32)))
- *Subtype = X86::INTEL_COREI7_CASCADELAKE; // "cascadelake"
- else
- *Subtype = X86::INTEL_COREI7_SKYLAKE_AVX512; // "skylake-avx512"
+ if (testFeature(X86::FEATURE_AVX512BF16)) {
+ CPU = "cooperlake";
+ *Subtype = X86::INTEL_COREI7_COOPERLAKE;
+ } else if (testFeature(X86::FEATURE_AVX512VNNI)) {
+ CPU = "cascadelake";
+ *Subtype = X86::INTEL_COREI7_CASCADELAKE;
+ } else {
+ CPU = "skylake-avx512";
+ *Subtype = X86::INTEL_COREI7_SKYLAKE_AVX512;
+ }
break;
// Cannonlake:
case 0x66:
+ CPU = "cannonlake";
*Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_CANNONLAKE; // "cannonlake"
+ *Subtype = X86::INTEL_COREI7_CANNONLAKE;
break;
// Icelake:
case 0x7d:
case 0x7e:
+ CPU = "icelake-client";
*Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT; // "icelake-client"
+ *Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT;
break;
// Icelake Xeon:
case 0x6a:
case 0x6c:
+ CPU = "icelake-server";
*Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_ICELAKE_SERVER; // "icelake-server"
+ *Subtype = X86::INTEL_COREI7_ICELAKE_SERVER;
break;
case 0x1c: // Most 45 nm Intel Atom processors
@@ -714,8 +735,9 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
case 0x27: // 32 nm Atom Medfield
case 0x35: // 32 nm Atom Midview
case 0x36: // 32 nm Atom Midview
+ CPU = "bonnell";
*Type = X86::INTEL_BONNELL;
- break; // "bonnell"
+ break;
// Atom Silvermont codes from the Intel software optimization guide.
case 0x37:
@@ -724,14 +746,17 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
case 0x5a:
case 0x5d:
case 0x4c: // really airmont
+ CPU = "silvermont";
*Type = X86::INTEL_SILVERMONT;
- break; // "silvermont"
+ break;
// Goldmont:
case 0x5c: // Apollo Lake
case 0x5f: // Denverton
+ CPU = "goldmont";
*Type = X86::INTEL_GOLDMONT;
- break; // "goldmont"
+ break;
case 0x7a:
+ CPU = "goldmont-plus";
*Type = X86::INTEL_GOLDMONT_PLUS;
break;
     case 0x86:
+    CPU = "tremont";
     *Type = X86::INTEL_TREMONT;
       break;
@@ -739,189 +764,140 @@ getIntelProcessorTypeAndSubtype(unsigned Family, unsigned Model,
     case 0x57:
-    *Type = X86::INTEL_KNL; // knl
+    CPU = "knl";
+    *Type = X86::INTEL_KNL;
       break;
case 0x85:
- *Type = X86::INTEL_KNM; // knm
+ CPU = "knm";
+ *Type = X86::INTEL_KNM;
break;
default: // Unknown family 6 CPU, try to guess.
- // TODO detect tigerlake host
- if (Features3 & (1 << (X86::FEATURE_AVX512VP2INTERSECT - 64))) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_TIGERLAKE;
- break;
- }
-
- if (Features & (1 << X86::FEATURE_AVX512VBMI2)) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_ICELAKE_CLIENT;
- break;
- }
-
- if (Features & (1 << X86::FEATURE_AVX512VBMI)) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_CANNONLAKE;
- break;
- }
-
- if (Features2 & (1 << (X86::FEATURE_AVX512BF16 - 32))) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_COOPERLAKE;
- break;
- }
-
- if (Features2 & (1 << (X86::FEATURE_AVX512VNNI - 32))) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_CASCADELAKE;
- break;
+      // Don't bother with Type/Subtype here; they aren't used by the caller.
+ // They're used above to keep the code in sync with compiler-rt.
+ // TODO detect tigerlake host from model
+ if (testFeature(X86::FEATURE_AVX512VP2INTERSECT)) {
+ CPU = "tigerlake";
+ } else if (testFeature(X86::FEATURE_AVX512VBMI2)) {
+ CPU = "icelake-client";
+ } else if (testFeature(X86::FEATURE_AVX512VBMI)) {
+ CPU = "cannonlake";
+ } else if (testFeature(X86::FEATURE_AVX512BF16)) {
+ CPU = "cooperlake";
+ } else if (testFeature(X86::FEATURE_AVX512VNNI)) {
+ CPU = "cascadelake";
+ } else if (testFeature(X86::FEATURE_AVX512VL)) {
+ CPU = "skylake-avx512";
+ } else if (testFeature(X86::FEATURE_AVX512ER)) {
+ CPU = "knl";
+ } else if (testFeature(X86::FEATURE_CLFLUSHOPT)) {
+ if (testFeature(X86::FEATURE_SHA))
+ CPU = "goldmont";
+ else
+ CPU = "skylake";
+ } else if (testFeature(X86::FEATURE_ADX)) {
+ CPU = "broadwell";
+ } else if (testFeature(X86::FEATURE_AVX2)) {
+ CPU = "haswell";
+ } else if (testFeature(X86::FEATURE_AVX)) {
+ CPU = "sandybridge";
+ } else if (testFeature(X86::FEATURE_SSE4_2)) {
+ if (testFeature(X86::FEATURE_MOVBE))
+ CPU = "silvermont";
+ else
+ CPU = "nehalem";
+ } else if (testFeature(X86::FEATURE_SSE4_1)) {
+ CPU = "penryn";
+ } else if (testFeature(X86::FEATURE_SSSE3)) {
+ if (testFeature(X86::FEATURE_MOVBE))
+ CPU = "bonnell";
+ else
+ CPU = "core2";
+ } else if (testFeature(X86::FEATURE_64BIT)) {
+ CPU = "core2";
+ } else if (testFeature(X86::FEATURE_SSE3)) {
+ CPU = "yonah";
+ } else if (testFeature(X86::FEATURE_SSE2)) {
+ CPU = "pentium-m";
+ } else if (testFeature(X86::FEATURE_SSE)) {
+ CPU = "pentium3";
+ } else if (testFeature(X86::FEATURE_MMX)) {
+ CPU = "pentium2";
+ } else {
+ CPU = "pentiumpro";
}
-
- if (Features & (1 << X86::FEATURE_AVX512VL)) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_SKYLAKE_AVX512;
- break;
- }
-
- if (Features & (1 << X86::FEATURE_AVX512ER)) {
- *Type = X86::INTEL_KNL; // knl
- break;
- }
-
- if (Features3 & (1 << (X86::FEATURE_CLFLUSHOPT - 64))) {
- if (Features3 & (1 << (X86::FEATURE_SHA - 64))) {
- *Type = X86::INTEL_GOLDMONT;
- } else {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_SKYLAKE;
- }
- break;
- }
- if (Features3 & (1 << (X86::FEATURE_ADX - 64))) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_BROADWELL;
- break;
- }
- if (Features & (1 << X86::FEATURE_AVX2)) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_HASWELL;
- break;
- }
- if (Features & (1 << X86::FEATURE_AVX)) {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_SANDYBRIDGE;
- break;
- }
- if (Features & (1 << X86::FEATURE_SSE4_2)) {
- if (Features3 & (1 << (X86::FEATURE_MOVBE - 64))) {
- *Type = X86::INTEL_SILVERMONT;
- } else {
- *Type = X86::INTEL_COREI7;
- *Subtype = X86::INTEL_COREI7_NEHALEM;
- }
- break;
- }
- if (Features & (1 << X86::FEATURE_SSE4_1)) {
- *Type = X86::INTEL_CORE2; // "penryn"
- *Subtype = X86::INTEL_CORE2_45;
- break;
- }
- if (Features & (1 << X86::FEATURE_SSSE3)) {
- if (Features3 & (1 << (X86::FEATURE_MOVBE - 64))) {
- *Type = X86::INTEL_BONNELL; // "bonnell"
- } else {
- *Type = X86::INTEL_CORE2; // "core2"
- *Subtype = X86::INTEL_CORE2_65;
- }
- break;
- }
- if (Features3 & (1 << (X86::FEATURE_EM64T - 64))) {
- *Type = X86::INTEL_CORE2; // "core2"
- *Subtype = X86::INTEL_CORE2_65;
- break;
- }
- if (Features & (1 << X86::FEATURE_SSE3)) {
- *Type = X86::INTEL_CORE_DUO;
- break;
- }
- if (Features & (1 << X86::FEATURE_SSE2)) {
- *Type = X86::INTEL_PENTIUM_M;
- break;
- }
- if (Features & (1 << X86::FEATURE_SSE)) {
- *Type = X86::INTEL_PENTIUM_III;
- break;
- }
- if (Features & (1 << X86::FEATURE_MMX)) {
- *Type = X86::INTEL_PENTIUM_II;
- break;
- }
- *Type = X86::INTEL_PENTIUM_PRO;
break;
}
break;
case 15: {
- if (Features3 & (1 << (X86::FEATURE_EM64T - 64))) {
- *Type = X86::INTEL_NOCONA;
+ if (testFeature(X86::FEATURE_64BIT)) {
+ CPU = "nocona";
break;
}
- if (Features & (1 << X86::FEATURE_SSE3)) {
- *Type = X86::INTEL_PRESCOTT;
+ if (testFeature(X86::FEATURE_SSE3)) {
+ CPU = "prescott";
break;
}
- *Type = X86::INTEL_PENTIUM_IV;
+ CPU = "pentium4";
break;
}
default:
- break; /*"generic"*/
+ break; // Unknown.
}
+
+ return CPU;
}
-static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
- unsigned Features, unsigned *Type,
- unsigned *Subtype) {
- // FIXME: this poorly matches the generated SubtargetFeatureKV table. There
- // appears to be no way to generate the wide variety of AMD-specific targets
- // from the information returned from CPUID.
+static StringRef
+getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
+ const unsigned *Features,
+ unsigned *Type, unsigned *Subtype) {
+ auto testFeature = [&](unsigned F) {
+ return (Features[F / 32] & (1U << (F % 32))) != 0;
+ };
+
+ StringRef CPU;
+
switch (Family) {
case 4:
- *Type = X86::AMD_i486;
+ CPU = "i486";
break;
case 5:
- *Type = X86::AMDPENTIUM;
+ CPU = "pentium";
switch (Model) {
case 6:
case 7:
- *Subtype = X86::AMDPENTIUM_K6;
- break; // "k6"
+ CPU = "k6";
+ break;
case 8:
- *Subtype = X86::AMDPENTIUM_K62;
- break; // "k6-2"
+ CPU = "k6-2";
+ break;
case 9:
case 13:
- *Subtype = X86::AMDPENTIUM_K63;
- break; // "k6-3"
+ CPU = "k6-3";
+ break;
case 10:
- *Subtype = X86::AMDPENTIUM_GEODE;
- break; // "geode"
+ CPU = "geode";
+ break;
}
break;
case 6:
- if (Features & (1 << X86::FEATURE_SSE)) {
- *Type = X86::AMD_ATHLON_XP;
- break; // "athlon-xp"
+ if (testFeature(X86::FEATURE_SSE)) {
+ CPU = "athlon-xp";
+ break;
}
- *Type = X86::AMD_ATHLON;
- break; // "athlon"
+ CPU = "athlon";
+ break;
case 15:
- if (Features & (1 << X86::FEATURE_SSE3)) {
- *Type = X86::AMD_K8SSE3;
- break; // "k8-sse3"
+ if (testFeature(X86::FEATURE_SSE3)) {
+ CPU = "k8-sse3";
+ break;
}
- *Type = X86::AMD_K8;
- break; // "k8"
+ CPU = "k8";
+ break;
case 16:
+ CPU = "amdfam10";
*Type = X86::AMDFAM10H; // "amdfam10"
switch (Model) {
case 2:
@@ -936,63 +912,62 @@ static void getAMDProcessorTypeAndSubtype(unsigned Family, unsigned Model,
}
break;
case 20:
+ CPU = "btver1";
*Type = X86::AMD_BTVER1;
- break; // "btver1";
+ break;
case 21:
+ CPU = "bdver1";
*Type = X86::AMDFAM15H;
if (Model >= 0x60 && Model <= 0x7f) {
+ CPU = "bdver4";
*Subtype = X86::AMDFAM15H_BDVER4;
- break; // "bdver4"; 60h-7Fh: Excavator
+ break; // 60h-7Fh: Excavator
}
if (Model >= 0x30 && Model <= 0x3f) {
+ CPU = "bdver3";
*Subtype = X86::AMDFAM15H_BDVER3;
- break; // "bdver3"; 30h-3Fh: Steamroller
+ break; // 30h-3Fh: Steamroller
}
if ((Model >= 0x10 && Model <= 0x1f) || Model == 0x02) {
+ CPU = "bdver2";
*Subtype = X86::AMDFAM15H_BDVER2;
- break; // "bdver2"; 02h, 10h-1Fh: Piledriver
+ break; // 02h, 10h-1Fh: Piledriver
}
if (Model <= 0x0f) {
*Subtype = X86::AMDFAM15H_BDVER1;
- break; // "bdver1"; 00h-0Fh: Bulldozer
+ break; // 00h-0Fh: Bulldozer
}
break;
case 22:
+ CPU = "btver2";
*Type = X86::AMD_BTVER2;
- break; // "btver2"
+ break;
case 23:
+ CPU = "znver1";
*Type = X86::AMDFAM17H;
if ((Model >= 0x30 && Model <= 0x3f) || Model == 0x71) {
+ CPU = "znver2";
*Subtype = X86::AMDFAM17H_ZNVER2;
- break; // "znver2"; 30h-3fh, 71h: Zen2
+ break; // 30h-3fh, 71h: Zen2
}
if (Model <= 0x0f) {
*Subtype = X86::AMDFAM17H_ZNVER1;
- break; // "znver1"; 00h-0Fh: Zen1
+ break; // 00h-0Fh: Zen1
}
break;
default:
- break; // "generic"
+ break; // Unknown AMD CPU.
}
+
+ return CPU;
}
static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
- unsigned *FeaturesOut, unsigned *Features2Out,
- unsigned *Features3Out) {
- unsigned Features = 0;
- unsigned Features2 = 0;
- unsigned Features3 = 0;
+ unsigned *Features) {
unsigned EAX, EBX;
auto setFeature = [&](unsigned F) {
- if (F < 32)
- Features |= 1U << (F & 0x1f);
- else if (F < 64)
- Features2 |= 1U << ((F - 32) & 0x1f);
- else if (F < 96)
- Features3 |= 1U << ((F - 64) & 0x1f);
- else
- llvm_unreachable("Unexpected FeatureBit");
+ Features[F / 32] |= 1U << (F % 32);
};
if ((EDX >> 15) & 1)
@@ -1115,56 +1090,42 @@ static void getAvailableFeatures(unsigned ECX, unsigned EDX, unsigned MaxLeaf,
setFeature(X86::FEATURE_FMA4);
if (HasExtLeaf1 && ((EDX >> 29) & 1))
- setFeature(X86::FEATURE_EM64T);
-
- *FeaturesOut = Features;
- *Features2Out = Features2;
- *Features3Out = Features3;
+ setFeature(X86::FEATURE_64BIT);
}
StringRef sys::getHostCPUName() {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLeaf, Vendor;
-#if defined(__GNUC__) || defined(__clang__)
- //FIXME: include cpuid.h from clang or copy __get_cpuid_max here
- // and simplify it to not invoke __cpuid (like cpu_model.c in
- // compiler-rt/lib/builtins/cpu_model.c?
- // Opting for the second option.
- if(!isCpuIdSupported())
+ if (!isCpuIdSupported())
return "generic";
-#endif
+
if (getX86CpuIDAndInfo(0, &MaxLeaf, &Vendor, &ECX, &EDX) || MaxLeaf < 1)
return "generic";
getX86CpuIDAndInfo(0x1, &EAX, &EBX, &ECX, &EDX);
- unsigned Brand_id = EBX & 0xff;
unsigned Family = 0, Model = 0;
- unsigned Features = 0, Features2 = 0, Features3 = 0;
+ unsigned Features[(X86::CPU_FEATURE_MAX + 31) / 32] = {0};
detectX86FamilyModel(EAX, &Family, &Model);
- getAvailableFeatures(ECX, EDX, MaxLeaf, &Features, &Features2, &Features3);
+ getAvailableFeatures(ECX, EDX, MaxLeaf, Features);
+ // These aren't consumed in this file, but we try to keep some source code the
+ // same or similar to compiler-rt.
unsigned Type = 0;
unsigned Subtype = 0;
+ StringRef CPU;
+
if (Vendor == SIG_INTEL) {
- getIntelProcessorTypeAndSubtype(Family, Model, Brand_id, Features,
- Features2, Features3, &Type, &Subtype);
+ CPU = getIntelProcessorTypeAndSubtype(Family, Model, Features, &Type,
+ &Subtype);
} else if (Vendor == SIG_AMD) {
- getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type, &Subtype);
+ CPU = getAMDProcessorTypeAndSubtype(Family, Model, Features, &Type,
+ &Subtype);
}
- // Check subtypes first since those are more specific.
-#define X86_CPU_SUBTYPE(ARCHNAME, ENUM) \
- if (Subtype == X86::ENUM) \
- return ARCHNAME;
-#include "llvm/Support/X86TargetParser.def"
-
- // Now check types.
-#define X86_CPU_TYPE(ARCHNAME, ENUM) \
- if (Type == X86::ENUM) \
- return ARCHNAME;
-#include "llvm/Support/X86TargetParser.def"
+ if (!CPU.empty())
+ return CPU;
return "generic";
}
@@ -1255,18 +1216,25 @@ StringRef sys::getHostCPUName() {
return "swift";
default:;
}
-
+
return "generic";
}
#else
StringRef sys::getHostCPUName() { return "generic"; }
#endif
-#if defined(__linux__) && defined(__x86_64__)
+#if defined(__linux__) && (defined(__i386__) || defined(__x86_64__))
// On Linux, the number of physical cores can be computed from /proc/cpuinfo,
// using the number of unique physical/core id pairs. The following
// implementation reads the /proc/cpuinfo format on an x86_64 system.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
+ // Enabled represents the set of physical id/core id pairs with at least
+ // one processor id enabled by the CPU affinity mask.
+ cpu_set_t Affinity, Enabled;
+ if (sched_getaffinity(0, sizeof(Affinity), &Affinity) != 0)
+ return -1;
+ CPU_ZERO(&Enabled);
+
// Read /proc/cpuinfo as a stream (until EOF reached). It cannot be
// mmapped because it appears to have 0 size.
llvm::ErrorOr<std::unique_ptr<llvm::MemoryBuffer>> Text =
@@ -1279,40 +1247,36 @@ static int computeHostNumPhysicalCores() {
SmallVector<StringRef, 8> strs;
(*Text)->getBuffer().split(strs, "\n", /*MaxSplit=*/-1,
/*KeepEmpty=*/false);
+ int CurProcessor = -1;
int CurPhysicalId = -1;
+ int CurSiblings = -1;
int CurCoreId = -1;
- SmallSet<std::pair<int, int>, 32> UniqueItems;
- for (auto &Line : strs) {
- Line = Line.trim();
- if (!Line.startswith("physical id") && !Line.startswith("core id"))
- continue;
+ for (StringRef Line : strs) {
std::pair<StringRef, StringRef> Data = Line.split(':');
auto Name = Data.first.trim();
auto Val = Data.second.trim();
- if (Name == "physical id") {
- assert(CurPhysicalId == -1 &&
- "Expected a core id before seeing another physical id");
+ // These fields are available if the kernel is configured with CONFIG_SMP.
+ if (Name == "processor")
+ Val.getAsInteger(10, CurProcessor);
+ else if (Name == "physical id")
Val.getAsInteger(10, CurPhysicalId);
- }
- if (Name == "core id") {
- assert(CurCoreId == -1 &&
- "Expected a physical id before seeing another core id");
+ else if (Name == "siblings")
+ Val.getAsInteger(10, CurSiblings);
+ else if (Name == "core id") {
Val.getAsInteger(10, CurCoreId);
- }
- if (CurPhysicalId != -1 && CurCoreId != -1) {
- UniqueItems.insert(std::make_pair(CurPhysicalId, CurCoreId));
- CurPhysicalId = -1;
- CurCoreId = -1;
+ // The processor id corresponds to an index into cpu_set_t.
+ if (CPU_ISSET(CurProcessor, &Affinity))
+ CPU_SET(CurPhysicalId * CurSiblings + CurCoreId, &Enabled);
}
}
- return UniqueItems.size();
+ return CPU_COUNT(&Enabled);
}
#elif defined(__APPLE__) && defined(__x86_64__)
#include <sys/param.h>
#include <sys/sysctl.h>
// Gets the number of *physical cores* on the machine.
-static int computeHostNumPhysicalCores() {
+int computeHostNumPhysicalCores() {
uint32_t count;
size_t len = sizeof(count);
sysctlbyname("hw.physicalcpu", &count, &len, NULL, 0);
@@ -1326,6 +1290,9 @@ static int computeHostNumPhysicalCores() {
}
return count;
}
+#elif defined(_WIN32) && LLVM_ENABLE_THREADS != 0
+// Defined in llvm/lib/Support/Windows/Threading.inc
+int computeHostNumPhysicalCores();
#else
// On other systems, return -1 to indicate unknown.
static int computeHostNumPhysicalCores() { return -1; }
@@ -1341,13 +1308,8 @@ int sys::getHostNumPhysicalCores() {
bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
unsigned EAX = 0, EBX = 0, ECX = 0, EDX = 0;
unsigned MaxLevel;
- union {
- unsigned u[3];
- char c[12];
- } text;
- if (getX86CpuIDAndInfo(0, &MaxLevel, text.u + 0, text.u + 2, text.u + 1) ||
- MaxLevel < 1)
+ if (getX86CpuIDAndInfo(0, &MaxLevel, &EBX, &ECX, &EDX) || MaxLevel < 1)
return false;
getX86CpuIDAndInfo(1, &EAX, &EBX, &ECX, &EDX);
@@ -1373,8 +1335,8 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
// If CPUID indicates support for XSAVE, XRESTORE and AVX, and XGETBV
// indicates that the AVX registers will be saved and restored on context
// switch, then we have full AVX support.
- bool HasAVXSave = ((ECX >> 27) & 1) && ((ECX >> 28) & 1) &&
- !getX86XCR0(&EAX, &EDX) && ((EAX & 0x6) == 0x6);
+ bool HasXSave = ((ECX >> 27) & 1) && !getX86XCR0(&EAX, &EDX);
+ bool HasAVXSave = HasXSave && ((ECX >> 28) & 1) && ((EAX & 0x6) == 0x6);
#if defined(__APPLE__)
// Darwin lazily saves the AVX512 context on first use: trust that the OS will
// save the AVX512 context if we use AVX512 instructions, even the bit is not
@@ -1384,6 +1346,9 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
// AVX512 requires additional context to be saved by the OS.
bool HasAVX512Save = HasAVXSave && ((EAX & 0xe0) == 0xe0);
#endif
+ // AMX requires additional context to be saved by the OS.
+ const unsigned AMXBits = (1 << 17) | (1 << 18);
+ bool HasAMXSave = HasXSave && ((EAX & AMXBits) == AMXBits);
Features["avx"] = HasAVXSave;
Features["fma"] = ((ECX >> 12) & 1) && HasAVXSave;
@@ -1459,6 +1424,10 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
Features["movdir64b"] = HasLeaf7 && ((ECX >> 28) & 1);
Features["enqcmd"] = HasLeaf7 && ((ECX >> 29) & 1);
+ Features["avx512vp2intersect"] =
+ HasLeaf7 && ((EDX >> 8) & 1) && HasAVX512Save;
+ Features["serialize"] = HasLeaf7 && ((EDX >> 14) & 1);
+ Features["tsxldtrk"] = HasLeaf7 && ((EDX >> 16) & 1);
// There are two CPUID leafs which information associated with the pconfig
// instruction:
// EAX=0x7, ECX=0x0 indicates the availability of the instruction (via the 18th
@@ -1470,6 +1439,9 @@ bool sys::getHostCPUFeatures(StringMap<bool> &Features) {
// detecting features using the "-march=native" flag.
// For more info, see X86 ISA docs.
Features["pconfig"] = HasLeaf7 && ((EDX >> 18) & 1);
+ Features["amx-bf16"] = HasLeaf7 && ((EDX >> 22) & 1) && HasAMXSave;
+ Features["amx-tile"] = HasLeaf7 && ((EDX >> 24) & 1) && HasAMXSave;
+ Features["amx-int8"] = HasLeaf7 && ((EDX >> 25) & 1) && HasAMXSave;
bool HasLeaf7Subleaf1 =
MaxLevel >= 7 && !getX86CpuIDAndInfoEx(0x7, 0x1, &EAX, &EBX, &ECX, &EDX);
Features["avx512bf16"] = HasLeaf7Subleaf1 && ((EAX >> 5) & 1) && HasAVX512Save;
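
The main structural change in Host.cpp is that the three separate Features/Features2/Features3 words become a single array of 32-bit words indexed by feature number, and the Intel/AMD helpers return the CPU name directly instead of mapping Type/Subtype through the X86_CPU_TYPE/SUBTYPE tables. The bit-array indexing is simply F / 32 to pick the word and F % 32 to pick the bit; a self-contained sketch, where MAX_FEATURES stands in for X86::CPU_FEATURE_MAX:

  #include <cstdio>

  // Pack feature bits into an array of 32-bit words, mirroring the reworked
  // setFeature/testFeature pair in Host.cpp.
  constexpr unsigned MAX_FEATURES = 96;
  constexpr unsigned NumWords = (MAX_FEATURES + 31) / 32;

  static void setFeature(unsigned Features[NumWords], unsigned F) {
    Features[F / 32] |= 1U << (F % 32);
  }

  static bool testFeature(const unsigned Features[NumWords], unsigned F) {
    return (Features[F / 32] & (1U << (F % 32))) != 0;
  }

  int main() {
    unsigned Features[NumWords] = {0};
    setFeature(Features, 70); // a bit that previously lived in Features3
    std::printf("%d %d\n", testFeature(Features, 70), testFeature(Features, 5));
  }
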
diff --git a/llvm/lib/Support/InitLLVM.cpp b/llvm/lib/Support/InitLLVM.cpp
index bb9b569d2de6..5c56b773ea69 100644
--- a/llvm/lib/Support/InitLLVM.cpp
+++ b/llvm/lib/Support/InitLLVM.cpp
@@ -15,7 +15,7 @@
#include <string>
#ifdef _WIN32
-#include "Windows/WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#endif
using namespace llvm;
diff --git a/llvm/lib/Support/IntEqClasses.cpp b/llvm/lib/Support/IntEqClasses.cpp
index 4a976dcefc65..ebb02e6c01e5 100644
--- a/llvm/lib/Support/IntEqClasses.cpp
+++ b/llvm/lib/Support/IntEqClasses.cpp
@@ -18,6 +18,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/IntEqClasses.h"
+#include <cassert>
using namespace llvm;
diff --git a/llvm/lib/Support/IntervalMap.cpp b/llvm/lib/Support/IntervalMap.cpp
index f15c7c9403c3..674e0f962fa1 100644
--- a/llvm/lib/Support/IntervalMap.cpp
+++ b/llvm/lib/Support/IntervalMap.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/IntervalMap.h"
+#include <cassert>
namespace llvm {
namespace IntervalMapImpl {
diff --git a/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp b/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp
index bbc06d186fba..9d3cf61459dd 100644
--- a/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp
+++ b/llvm/lib/Support/ItaniumManglingCanonicalizer.cpp
@@ -7,16 +7,12 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/ItaniumManglingCanonicalizer.h"
-
+#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/FoldingSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Demangle/ItaniumDemangle.h"
#include "llvm/Support/Allocator.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/ADT/FoldingSet.h"
-#include "llvm/ADT/StringRef.h"
-
using namespace llvm;
using llvm::itanium_demangle::ForwardTemplateReference;
using llvm::itanium_demangle::Node;
@@ -30,9 +26,8 @@ struct FoldingSetNodeIDBuilder {
void operator()(StringView Str) {
ID.AddString(llvm::StringRef(Str.begin(), Str.size()));
}
- template<typename T>
- typename std::enable_if<std::is_integral<T>::value ||
- std::is_enum<T>::value>::type
+ template <typename T>
+ std::enable_if_t<std::is_integral<T>::value || std::is_enum<T>::value>
operator()(T V) {
ID.AddInteger((unsigned long long)V);
}
diff --git a/llvm/lib/Support/KnownBits.cpp b/llvm/lib/Support/KnownBits.cpp
index 8f3f4aa8caea..1ff66d504cbe 100644
--- a/llvm/lib/Support/KnownBits.cpp
+++ b/llvm/lib/Support/KnownBits.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/KnownBits.h"
+#include <cassert>
using namespace llvm;
@@ -81,3 +82,28 @@ KnownBits KnownBits::computeForAddSub(bool Add, bool NSW,
return KnownOut;
}
+
+KnownBits &KnownBits::operator&=(const KnownBits &RHS) {
+ // Result bit is 0 if either operand bit is 0.
+ Zero |= RHS.Zero;
+ // Result bit is 1 if both operand bits are 1.
+ One &= RHS.One;
+ return *this;
+}
+
+KnownBits &KnownBits::operator|=(const KnownBits &RHS) {
+ // Result bit is 0 if both operand bits are 0.
+ Zero &= RHS.Zero;
+ // Result bit is 1 if either operand bit is 1.
+ One |= RHS.One;
+ return *this;
+}
+
+KnownBits &KnownBits::operator^=(const KnownBits &RHS) {
+ // Result bit is 0 if both operand bits are 0 or both are 1.
+ APInt Z = (Zero & RHS.Zero) | (One & RHS.One);
+ // Result bit is 1 if one operand bit is 0 and the other is 1.
+ One = (Zero & RHS.One) | (One & RHS.Zero);
+ Zero = std::move(Z);
+ return *this;
+}
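
The new operators combine per-bit knowledge from two values: for AND, a bit is known zero if it is known zero in either operand and known one only if it is known one in both; OR is the dual; XOR derives known bits only from positions where both operands are known. A small sketch with plain bit masks standing in for APInt (the Known struct here is illustrative, not the real KnownBits API):

  #include <cstdio>

  struct Known { unsigned Zero, One; }; // bit set => that bit is known 0 / known 1

  static Known knownAnd(Known A, Known B) {
    return {A.Zero | B.Zero, A.One & B.One};
  }
  static Known knownOr(Known A, Known B) {
    return {A.Zero & B.Zero, A.One | B.One};
  }
  static Known knownXor(Known A, Known B) {
    unsigned Zero = (A.Zero & B.Zero) | (A.One & B.One); // equal known bits
    unsigned One  = (A.Zero & B.One) | (A.One & B.Zero); // differing known bits
    return {Zero, One};
  }

  int main() {
    // x = 0b10?? (top two bits known), y = 0b?100 (low three bits known).
    Known X{0x4, 0x8}, Y{0x3, 0x4};
    Known R = knownAnd(X, Y); // bits 2..0 of x & y are known 0, bit 3 unknown
    std::printf("Zero=0x%x One=0x%x\n", R.Zero, R.One);
  }
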
diff --git a/llvm/lib/Support/LockFileManager.cpp b/llvm/lib/Support/LockFileManager.cpp
index 5c6508c3b007..a2b56ab295c4 100644
--- a/llvm/lib/Support/LockFileManager.cpp
+++ b/llvm/lib/Support/LockFileManager.cpp
@@ -14,15 +14,20 @@
#include "llvm/Support/ErrorOr.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/Process.h"
#include "llvm/Support/Signals.h"
#include "llvm/Support/raw_ostream.h"
#include <cerrno>
+#include <chrono>
#include <ctime>
#include <memory>
+#include <random>
#include <sys/stat.h>
#include <sys/types.h>
#include <system_error>
+#include <thread>
#include <tuple>
+
#ifdef _WIN32
#include <windows.h>
#endif
@@ -158,7 +163,7 @@ LockFileManager::LockFileManager(StringRef FileName)
this->FileName = FileName;
if (std::error_code EC = sys::fs::make_absolute(this->FileName)) {
std::string S("failed to obtain absolute path for ");
- S.append(this->FileName.str());
+ S.append(std::string(this->FileName.str()));
setError(EC, S);
return;
}
@@ -177,7 +182,7 @@ LockFileManager::LockFileManager(StringRef FileName)
if (std::error_code EC = sys::fs::createUniqueFile(
UniqueLockFileName, UniqueLockFileID, UniqueLockFileName)) {
std::string S("failed to create unique file ");
- S.append(UniqueLockFileName.str());
+ S.append(std::string(UniqueLockFileName.str()));
setError(EC, S);
return;
}
@@ -191,19 +196,14 @@ LockFileManager::LockFileManager(StringRef FileName)
}
raw_fd_ostream Out(UniqueLockFileID, /*shouldClose=*/true);
- Out << HostID << ' ';
-#if LLVM_ON_UNIX
- Out << getpid();
-#else
- Out << "1";
-#endif
+ Out << HostID << ' ' << sys::Process::getProcessId();
Out.close();
if (Out.has_error()) {
// We failed to write out PID, so report the error, remove the
// unique lock file, and fail.
std::string S("failed to write to ");
- S.append(UniqueLockFileName.str());
+ S.append(std::string(UniqueLockFileName.str()));
setError(Out.error(), S);
sys::fs::remove(UniqueLockFileName);
return;
@@ -249,7 +249,7 @@ LockFileManager::LockFileManager(StringRef FileName)
// ownership.
if ((EC = sys::fs::remove(LockFileName))) {
std::string S("failed to remove lockfile ");
- S.append(UniqueLockFileName.str());
+ S.append(std::string(UniqueLockFileName.str()));
setError(EC, S);
return;
}
@@ -295,23 +295,29 @@ LockFileManager::waitForUnlock(const unsigned MaxSeconds) {
if (getState() != LFS_Shared)
return Res_Success;
-#ifdef _WIN32
- unsigned long Interval = 1;
-#else
- struct timespec Interval;
- Interval.tv_sec = 0;
- Interval.tv_nsec = 1000000;
-#endif
+ // Since we don't yet have an event-based method to wait for the lock file,
+ // implement randomized exponential backoff, similar to the Ethernet
+ // collision algorithm. This improves performance on machines with high core
+ // counts when the file lock is heavily contended by multiple clang processes.
+ const unsigned long MinWaitDurationMS = 10;
+ const unsigned long MaxWaitMultiplier = 50; // 500ms max wait
+ unsigned long WaitMultiplier = 1;
+ unsigned long ElapsedTimeSeconds = 0;
+
+ std::random_device Device;
+ std::default_random_engine Engine(Device());
+
+ auto StartTime = std::chrono::steady_clock::now();
+
do {
+ // FIXME: implement event-based waiting
+
// Sleep for the designated interval, to allow the owning process time to
// finish up and remove the lock file.
- // FIXME: Should we hook in to system APIs to get a notification when the
- // lock file is deleted?
-#ifdef _WIN32
- Sleep(Interval);
-#else
- nanosleep(&Interval, nullptr);
-#endif
+ std::uniform_int_distribution<unsigned long> Distribution(1,
+ WaitMultiplier);
+ unsigned long WaitDurationMS = MinWaitDurationMS * Distribution(Engine);
+ std::this_thread::sleep_for(std::chrono::milliseconds(WaitDurationMS));
if (sys::fs::access(LockFileName.c_str(), sys::fs::AccessMode::Exist) ==
errc::no_such_file_or_directory) {
@@ -325,24 +331,16 @@ LockFileManager::waitForUnlock(const unsigned MaxSeconds) {
if (!processStillExecuting((*Owner).first, (*Owner).second))
return Res_OwnerDied;
- // Exponentially increase the time we wait for the lock to be removed.
-#ifdef _WIN32
- Interval *= 2;
-#else
- Interval.tv_sec *= 2;
- Interval.tv_nsec *= 2;
- if (Interval.tv_nsec >= 1000000000) {
- ++Interval.tv_sec;
- Interval.tv_nsec -= 1000000000;
+ WaitMultiplier *= 2;
+ if (WaitMultiplier > MaxWaitMultiplier) {
+ WaitMultiplier = MaxWaitMultiplier;
}
-#endif
- } while (
-#ifdef _WIN32
- Interval < MaxSeconds * 1000
-#else
- Interval.tv_sec < (time_t)MaxSeconds
-#endif
- );
+
+ ElapsedTimeSeconds = std::chrono::duration_cast<std::chrono::seconds>(
+ std::chrono::steady_clock::now() - StartTime)
+ .count();
+
+ } while (ElapsedTimeSeconds < MaxSeconds);
// Give up.
return Res_Timeout;
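
The waitForUnlock rewrite drops the per-platform sleep calls in favour of std::this_thread and a randomized exponential backoff: each round sleeps a random multiple of 10 ms, and the cap on the multiplier doubles per round up to 500 ms, which spreads out retries when many processes poll the same lock file. A condensed sketch of just that loop; lockStillHeld is a hypothetical stand-in for the file-existence and owner checks:

  #include <algorithm>
  #include <chrono>
  #include <random>
  #include <thread>

  // Randomized exponential backoff: sleep between 10 ms and Multiplier * 10 ms,
  // doubling the cap each round up to 500 ms, until the lock clears or
  // MaxSeconds elapse.
  static bool waitWithBackoff(unsigned MaxSeconds, bool (*lockStillHeld)()) {
    const unsigned long MinWaitMS = 10, MaxMultiplier = 50; // 500 ms cap
    unsigned long Multiplier = 1;
    std::random_device Device;
    std::default_random_engine Engine(Device());
    auto Start = std::chrono::steady_clock::now();
    do {
      std::uniform_int_distribution<unsigned long> Dist(1, Multiplier);
      std::this_thread::sleep_for(std::chrono::milliseconds(MinWaitMS * Dist(Engine)));
      if (!lockStillHeld())
        return true;                                  // lock released
      Multiplier = std::min(Multiplier * 2, MaxMultiplier);
    } while (std::chrono::duration_cast<std::chrono::seconds>(
                 std::chrono::steady_clock::now() - Start).count() <
             static_cast<long long>(MaxSeconds));
    return false;                                     // timed out
  }
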
diff --git a/llvm/lib/Support/MD5.cpp b/llvm/lib/Support/MD5.cpp
index 9b02f62912fa..5e0b076f176e 100644
--- a/llvm/lib/Support/MD5.cpp
+++ b/llvm/lib/Support/MD5.cpp
@@ -39,6 +39,7 @@
#include "llvm/Support/MD5.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Format.h"
diff --git a/llvm/lib/Support/MemAlloc.cpp b/llvm/lib/Support/MemAlloc.cpp
new file mode 100644
index 000000000000..7aaa0dc6e205
--- /dev/null
+++ b/llvm/lib/Support/MemAlloc.cpp
@@ -0,0 +1,34 @@
+//===- MemAlloc.cpp - Memory allocation functions -------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/MemAlloc.h"
+
+// These are out of line to have __cpp_aligned_new not affect ABI.
+
+LLVM_ATTRIBUTE_RETURNS_NONNULL LLVM_ATTRIBUTE_RETURNS_NOALIAS void *
+llvm::allocate_buffer(size_t Size, size_t Alignment) {
+ return ::operator new(Size
+#ifdef __cpp_aligned_new
+ ,
+ std::align_val_t(Alignment)
+#endif
+ );
+}
+
+void llvm::deallocate_buffer(void *Ptr, size_t Size, size_t Alignment) {
+ ::operator delete(Ptr
+#ifdef __cpp_sized_deallocation
+ ,
+ Size
+#endif
+#ifdef __cpp_aligned_new
+ ,
+ std::align_val_t(Alignment)
+#endif
+ );
+}
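
These two functions are kept out of line so that whether the build has __cpp_aligned_new and sized deallocation does not leak into the ABI of inline callers. Callers pass a size and alignment and must hand the same pair back when freeing. A minimal usage sketch (the demo function is illustrative, not part of the patch):

  #include "llvm/Support/MemAlloc.h"
  #include <cstddef>

  void demo() {
    constexpr std::size_t Size = 256, Alignment = 64;
    void *Ptr = llvm::allocate_buffer(Size, Alignment);
    // ... use the 64-byte-aligned, 256-byte buffer ...
    llvm::deallocate_buffer(Ptr, Size, Alignment); // same Size/Alignment as above
  }
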
diff --git a/llvm/lib/Support/MemoryBuffer.cpp b/llvm/lib/Support/MemoryBuffer.cpp
index e4027ca7bbfd..248fb72c4968 100644
--- a/llvm/lib/Support/MemoryBuffer.cpp
+++ b/llvm/lib/Support/MemoryBuffer.cpp
@@ -162,6 +162,20 @@ MemoryBuffer::getFileSlice(const Twine &FilePath, uint64_t MapSize,
//===----------------------------------------------------------------------===//
namespace {
+
+template <typename MB>
+constexpr sys::fs::mapped_file_region::mapmode Mapmode =
+ sys::fs::mapped_file_region::readonly;
+template <>
+constexpr sys::fs::mapped_file_region::mapmode Mapmode<MemoryBuffer> =
+ sys::fs::mapped_file_region::readonly;
+template <>
+constexpr sys::fs::mapped_file_region::mapmode Mapmode<WritableMemoryBuffer> =
+ sys::fs::mapped_file_region::priv;
+template <>
+constexpr sys::fs::mapped_file_region::mapmode
+ Mapmode<WriteThroughMemoryBuffer> = sys::fs::mapped_file_region::readwrite;
+
/// Memory maps a file descriptor using sys::fs::mapped_file_region.
///
/// This handles converting the offset into a legal offset on the platform.
@@ -184,7 +198,7 @@ class MemoryBufferMMapFile : public MB {
public:
MemoryBufferMMapFile(bool RequiresNullTerminator, sys::fs::file_t FD, uint64_t Len,
uint64_t Offset, std::error_code &EC)
- : MFR(FD, MB::Mapmode, getLegalMapSize(Len, Offset),
+ : MFR(FD, Mapmode<MB>, getLegalMapSize(Len, Offset),
getLegalMapOffset(Offset), EC) {
if (!EC) {
const char *Start = getStart(Len, Offset);
@@ -315,7 +329,7 @@ static bool shouldUseMmap(sys::fs::file_t FD,
// mmap may leave the buffer without null terminator if the file size changed
// by the time the last page is mapped in, so avoid it if the file size is
// likely to change.
- if (IsVolatile)
+ if (IsVolatile && RequiresNullTerminator)
return false;
// We don't use mmap for small files because this can severely fragment our
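
The mapping mode used to be a static member of each concrete buffer class; here it becomes a variable template in the anonymous namespace, specialized once per buffer type, so MemoryBufferMMapFile<MB> can simply say Mapmode<MB>. The same pattern in isolation, with illustrative type and enum names rather than LLVM's:

  #include <cstdio>

  enum class MapMode { ReadOnly, Private, ReadWrite };

  struct ReadOnlyBuffer {};
  struct CopyOnWriteBuffer {};
  struct WriteThroughBuffer {};

  // Primary template plus one explicit specialization per buffer type,
  // mirroring the Mapmode<MB> variable template in MemoryBuffer.cpp.
  template <typename MB> constexpr MapMode MapModeFor = MapMode::ReadOnly;
  template <> constexpr MapMode MapModeFor<CopyOnWriteBuffer> = MapMode::Private;
  template <> constexpr MapMode MapModeFor<WriteThroughBuffer> = MapMode::ReadWrite;

  int main() {
    std::printf("%d %d %d\n", (int)MapModeFor<ReadOnlyBuffer>,
                (int)MapModeFor<CopyOnWriteBuffer>,
                (int)MapModeFor<WriteThroughBuffer>);
  }
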
diff --git a/llvm/lib/Support/NativeFormatting.cpp b/llvm/lib/Support/NativeFormatting.cpp
index 3731e0c56359..ae4bffbd94c1 100644
--- a/llvm/lib/Support/NativeFormatting.cpp
+++ b/llvm/lib/Support/NativeFormatting.cpp
@@ -7,12 +7,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/NativeFormatting.h"
-
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/Format.h"
-
+#include "llvm/Support/raw_ostream.h"
#include <float.h>
using namespace llvm;
@@ -89,7 +88,7 @@ static void write_signed(raw_ostream &S, T N, size_t MinDigits,
IntegerStyle Style) {
static_assert(std::is_signed<T>::value, "Value is not signed!");
- using UnsignedT = typename std::make_unsigned<T>::type;
+ using UnsignedT = std::make_unsigned_t<T>;
if (N >= 0) {
write_unsigned(S, static_cast<UnsignedT>(N), MinDigits, Style);
diff --git a/llvm/lib/Support/OptimizedStructLayout.cpp b/llvm/lib/Support/OptimizedStructLayout.cpp
new file mode 100644
index 000000000000..9bbd767c5ce9
--- /dev/null
+++ b/llvm/lib/Support/OptimizedStructLayout.cpp
@@ -0,0 +1,449 @@
+//===--- OptimizedStructLayout.cpp - Optimal data layout algorithm ----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the performOptimizedStructLayout interface.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/OptimizedStructLayout.h"
+
+using namespace llvm;
+
+using Field = OptimizedStructLayoutField;
+
+#ifndef NDEBUG
+static void checkValidLayout(ArrayRef<Field> Fields, uint64_t Size,
+ Align MaxAlign) {
+ uint64_t LastEnd = 0;
+ Align ComputedMaxAlign;
+ for (auto &Field : Fields) {
+ assert(Field.hasFixedOffset() &&
+ "didn't assign a fixed offset to field");
+ assert(isAligned(Field.Alignment, Field.Offset) &&
+ "didn't assign a correctly-aligned offset to field");
+ assert(Field.Offset >= LastEnd &&
+ "didn't assign offsets in ascending order");
+ LastEnd = Field.getEndOffset();
+ assert(Field.Alignment <= MaxAlign &&
+ "didn't compute MaxAlign correctly");
+ ComputedMaxAlign = std::max(Field.Alignment, MaxAlign);
+ }
+ assert(LastEnd == Size && "didn't compute LastEnd correctly");
+ assert(ComputedMaxAlign == MaxAlign && "didn't compute MaxAlign correctly");
+}
+#endif
+
+std::pair<uint64_t, Align>
+llvm::performOptimizedStructLayout(MutableArrayRef<Field> Fields) {
+#ifndef NDEBUG
+ // Do some simple precondition checks.
+ {
+ bool InFixedPrefix = true;
+ size_t LastEnd = 0;
+ for (auto &Field : Fields) {
+ assert(Field.Size > 0 && "field of zero size");
+ if (Field.hasFixedOffset()) {
+ assert(InFixedPrefix &&
+ "fixed-offset fields are not a strict prefix of array");
+ assert(LastEnd <= Field.Offset &&
+ "fixed-offset fields overlap or are not in order");
+ LastEnd = Field.getEndOffset();
+ assert(LastEnd > Field.Offset &&
+ "overflow in fixed-offset end offset");
+ } else {
+ InFixedPrefix = false;
+ }
+ }
+ }
+#endif
+
+ // Do an initial pass over the fields.
+ Align MaxAlign;
+
+ // Find the first flexible-offset field, tracking MaxAlign.
+ auto FirstFlexible = Fields.begin(), E = Fields.end();
+ while (FirstFlexible != E && FirstFlexible->hasFixedOffset()) {
+ MaxAlign = std::max(MaxAlign, FirstFlexible->Alignment);
+ ++FirstFlexible;
+ }
+
+ // If there are no flexible fields, we're done.
+ if (FirstFlexible == E) {
+ uint64_t Size = 0;
+ if (!Fields.empty())
+ Size = Fields.back().getEndOffset();
+
+#ifndef NDEBUG
+ checkValidLayout(Fields, Size, MaxAlign);
+#endif
+ return std::make_pair(Size, MaxAlign);
+ }
+
+ // Walk over the flexible-offset fields, tracking MaxAlign and
+ // assigning them a unique number in order of their appearance.
+ // We'll use this unique number in the comparison below so that
+ // we can use array_pod_sort, which isn't stable. We won't use it
+ // past that point.
+ {
+ uintptr_t UniqueNumber = 0;
+ for (auto I = FirstFlexible; I != E; ++I) {
+ I->Scratch = reinterpret_cast<void*>(UniqueNumber++);
+ MaxAlign = std::max(MaxAlign, I->Alignment);
+ }
+ }
+
+ // Sort the flexible elements in order of decreasing alignment,
+ // then decreasing size, and then the original order as recorded
+ // in Scratch. The decreasing-size aspect of this is only really
+ // important if we get into the gap-filling stage below, but it
+ // doesn't hurt here.
+ array_pod_sort(FirstFlexible, E,
+ [](const Field *lhs, const Field *rhs) -> int {
+ // Decreasing alignment.
+ if (lhs->Alignment != rhs->Alignment)
+ return (lhs->Alignment < rhs->Alignment ? 1 : -1);
+
+ // Decreasing size.
+ if (lhs->Size != rhs->Size)
+ return (lhs->Size < rhs->Size ? 1 : -1);
+
+ // Original order.
+ auto lhsNumber = reinterpret_cast<uintptr_t>(lhs->Scratch);
+ auto rhsNumber = reinterpret_cast<uintptr_t>(rhs->Scratch);
+ if (lhsNumber != rhsNumber)
+ return (lhsNumber < rhsNumber ? -1 : 1);
+
+ return 0;
+ });
+
+ // Do a quick check for whether that sort alone has given us a perfect
+ // layout with no interior padding. This is very common: if the
+ // fixed-layout fields have no interior padding, and they end at a
+ // sufficiently-aligned offset for all the flexible-layout fields,
+ // and the flexible-layout fields all have sizes that are multiples
+ // of their alignment, then this will reliably trigger.
+ {
+ bool HasPadding = false;
+ uint64_t LastEnd = 0;
+
+ // Walk the fixed-offset fields.
+ for (auto I = Fields.begin(); I != FirstFlexible; ++I) {
+ assert(I->hasFixedOffset());
+ if (LastEnd != I->Offset) {
+ HasPadding = true;
+ break;
+ }
+ LastEnd = I->getEndOffset();
+ }
+
+ // Walk the flexible-offset fields, optimistically assigning fixed
+ // offsets. Note that we maintain a strict division between the
+ // fixed-offset and flexible-offset fields, so if we end up
+ // discovering padding later in this loop, we can just abandon this
+ // work and we'll ignore the offsets we already assigned.
+ if (!HasPadding) {
+ for (auto I = FirstFlexible; I != E; ++I) {
+ auto Offset = alignTo(LastEnd, I->Alignment);
+ if (LastEnd != Offset) {
+ HasPadding = true;
+ break;
+ }
+ I->Offset = Offset;
+ LastEnd = I->getEndOffset();
+ }
+ }
+
+ // If we already have a perfect layout, we're done.
+ if (!HasPadding) {
+#ifndef NDEBUG
+ checkValidLayout(Fields, LastEnd, MaxAlign);
+#endif
+ return std::make_pair(LastEnd, MaxAlign);
+ }
+ }
+
+ // The algorithm sketch at this point is as follows.
+ //
+ // Consider the padding gaps between fixed-offset fields in ascending
+ // order. Let LastEnd be the offset of the first byte following the
+ // field before the gap, or 0 if the gap is at the beginning of the
+ // structure. Find the "best" flexible-offset field according to the
+ // criteria below. If no such field exists, proceed to the next gap.
+ // Otherwise, add the field at the first properly-aligned offset for
+ // that field that is >= LastEnd, then update LastEnd and repeat in
+ // order to fill any remaining gap following that field.
+ //
+  // Next, let LastEnd be the offset of the first byte following the
+ // last fixed-offset field, or 0 if there are no fixed-offset fields.
+ // While there are flexible-offset fields remaining, find the "best"
+ // flexible-offset field according to the criteria below, add it at
+ // the first properly-aligned offset for that field that is >= LastEnd,
+ // and update LastEnd to the first byte following the field.
+ //
+ // The "best" field is chosen by the following criteria, considered
+ // strictly in order:
+ //
+  // - When filling a gap between fields, the field must fit.
+ // - A field is preferred if it requires less padding following LastEnd.
+ // - A field is preferred if it is more aligned.
+ // - A field is preferred if it is larger.
+ // - A field is preferred if it appeared earlier in the initial order.
+ //
+ // Minimizing leading padding is a greedy attempt to avoid padding
+ // entirely. Preferring more-aligned fields is an attempt to eliminate
+ // stricter constraints earlier, with the idea that weaker alignment
+  // constraints may be resolvable with less padding elsewhere. These
+  // two rules are sufficient to ensure that we get the optimal
+ // layout in the "C-style" case. Preferring larger fields tends to take
+ // better advantage of large gaps and may be more likely to have a size
+ // that's a multiple of a useful alignment. Preferring the initial
+ // order may help somewhat with locality but is mostly just a way of
+ // ensuring deterministic output.
+ //
+ // Note that this algorithm does not guarantee a minimal layout. Picking
+ // a larger object greedily may leave a gap that cannot be filled as
+ // efficiently. Unfortunately, solving this perfectly is an NP-complete
+ // problem (by reduction from bin-packing: let B_i be the bin sizes and
+ // O_j be the object sizes; add fixed-offset fields such that the gaps
+ // between them have size B_i, and add flexible-offset fields with
+ // alignment 1 and size O_j; if the layout size is equal to the end of
+ // the last fixed-layout field, the objects fit in the bins; note that
+ // this doesn't even require the complexity of alignment).
+
+ // The implementation below is essentially just an optimized version of
+ // scanning the list of remaining fields looking for the best, which
+ // would be O(n^2). In the worst case, it doesn't improve on that.
+ // However, in practice it'll just scan the array of alignment bins
+ // and consider the first few elements from one or two bins. The
+ // number of bins is bounded by a small constant: alignments are powers
+ // of two that are vanishingly unlikely to be over 64 and fairly unlikely
+ // to be over 8. And multiple elements only need to be considered when
+ // filling a gap between fixed-offset fields, which doesn't happen very
+ // often. We could use a data structure within bins that optimizes for
+ // finding the best-sized match, but it would require allocating memory
+ // and copying data, so it's unlikely to be worthwhile.
+
+
+ // Start by organizing the flexible-offset fields into bins according to
+ // their alignment. We expect a small enough number of bins that we
+ // don't care about the asymptotic costs of walking this.
+ struct AlignmentQueue {
+ /// The minimum size of anything currently in this queue.
+ uint64_t MinSize;
+
+ /// The head of the queue. A singly-linked list. The order here should
+ /// be consistent with the earlier sort, i.e. the elements should be
+ /// monotonically descending in size and otherwise in the original order.
+ ///
+ /// We remove the queue from the array as soon as this is empty.
+ OptimizedStructLayoutField *Head;
+
+ /// The alignment requirement of the queue.
+ Align Alignment;
+
+ static Field *getNext(Field *Cur) {
+ return static_cast<Field *>(Cur->Scratch);
+ }
+ };
+ SmallVector<AlignmentQueue, 8> FlexibleFieldsByAlignment;
+ for (auto I = FirstFlexible; I != E; ) {
+ auto Head = I;
+ auto Alignment = I->Alignment;
+
+ uint64_t MinSize = I->Size;
+ auto LastInQueue = I;
+ for (++I; I != E && I->Alignment == Alignment; ++I) {
+ LastInQueue->Scratch = I;
+ LastInQueue = I;
+ MinSize = std::min(MinSize, I->Size);
+ }
+ LastInQueue->Scratch = nullptr;
+
+ FlexibleFieldsByAlignment.push_back({MinSize, Head, Alignment});
+ }
+
+#ifndef NDEBUG
+ // Verify that we set the queues up correctly.
+ auto checkQueues = [&]{
+ bool FirstQueue = true;
+ Align LastQueueAlignment;
+ for (auto &Queue : FlexibleFieldsByAlignment) {
+ assert((FirstQueue || Queue.Alignment < LastQueueAlignment) &&
+ "bins not in order of descending alignment");
+ LastQueueAlignment = Queue.Alignment;
+ FirstQueue = false;
+
+ assert(Queue.Head && "queue was empty");
+ uint64_t LastSize = ~(uint64_t)0;
+ for (auto I = Queue.Head; I; I = Queue.getNext(I)) {
+ assert(I->Alignment == Queue.Alignment && "bad field in queue");
+ assert(I->Size <= LastSize && "queue not in descending size order");
+ LastSize = I->Size;
+ }
+ }
+ };
+ checkQueues();
+#endif
+
+ /// Helper function to remove a field from a queue.
+ auto spliceFromQueue = [&](AlignmentQueue *Queue, Field *Last, Field *Cur) {
+ assert(Last ? Queue->getNext(Last) == Cur : Queue->Head == Cur);
+
+ // If we're removing Cur from a non-initial position, splice it out
+ // of the linked list.
+ if (Last) {
+ Last->Scratch = Cur->Scratch;
+
+ // If Cur was the last field in the list, we need to update MinSize.
+ // We can just use the last field's size because the list is in
+ // descending order of size.
+ if (!Cur->Scratch)
+ Queue->MinSize = Last->Size;
+
+ // Otherwise, replace the head.
+ } else {
+ if (auto NewHead = Queue->getNext(Cur))
+ Queue->Head = NewHead;
+
+ // If we just emptied the queue, destroy its bin.
+ else
+ FlexibleFieldsByAlignment.erase(Queue);
+ }
+ };
+
+ // Do layout into a local array. Doing this in-place on Fields is
+ // not really feasible.
+ SmallVector<Field, 16> Layout;
+ Layout.reserve(Fields.size());
+
+ // The offset that we're currently looking to insert at (or after).
+ uint64_t LastEnd = 0;
+
+ // Helper function to splice Cur out of the given queue and add it
+ // to the layout at the given offset.
+ auto addToLayout = [&](AlignmentQueue *Queue, Field *Last, Field *Cur,
+ uint64_t Offset) -> bool {
+ assert(Offset == alignTo(LastEnd, Cur->Alignment));
+
+ // Splice out. This potentially invalidates Queue.
+ spliceFromQueue(Queue, Last, Cur);
+
+ // Add Cur to the layout.
+ Layout.push_back(*Cur);
+ Layout.back().Offset = Offset;
+ LastEnd = Layout.back().getEndOffset();
+
+ // Always return true so that we can be tail-called.
+ return true;
+ };
+
+ // Helper function to try to find a field in the given queue that'll
+ // fit starting at StartOffset but before EndOffset (if present).
+ // Note that this never fails if EndOffset is not provided.
+ auto tryAddFillerFromQueue = [&](AlignmentQueue *Queue,
+ uint64_t StartOffset,
+ Optional<uint64_t> EndOffset) -> bool {
+ assert(Queue->Head);
+ assert(StartOffset == alignTo(LastEnd, Queue->Alignment));
+
+ // Figure out the maximum size that a field can be, and ignore this
+ // queue if there's nothing in it that small.
+ auto MaxViableSize =
+ (EndOffset ? *EndOffset - StartOffset : ~(uint64_t)0);
+ if (Queue->MinSize > MaxViableSize) return false;
+
+ // Find the matching field. Note that this should always find
+ // something because of the MinSize check above.
+ for (Field *Cur = Queue->Head, *Last = nullptr; true;
+ Last = Cur, Cur = Queue->getNext(Cur)) {
+ assert(Cur && "didn't find a match in queue despite its MinSize");
+ if (Cur->Size <= MaxViableSize)
+ return addToLayout(Queue, Last, Cur, StartOffset);
+ }
+
+ llvm_unreachable("didn't find a match in queue despite its MinSize");
+ };
+
+ // Helper function to find the "best" flexible-offset field according
+ // to the criteria described above.
+ auto tryAddBestField = [&](Optional<uint64_t> BeforeOffset) -> bool {
+ auto QueueB = FlexibleFieldsByAlignment.begin();
+ auto QueueE = FlexibleFieldsByAlignment.end();
+
+ // Start by looking for the most-aligned queue that doesn't need any
+ // leading padding after LastEnd.
+ auto FirstQueueToSearch = QueueB;
+ for (; FirstQueueToSearch != QueueE; ++FirstQueueToSearch) {
+ if (isAligned(FirstQueueToSearch->Alignment, LastEnd))
+ break;
+ }
+
+ uint64_t Offset = LastEnd;
+ while (true) {
+ // Invariant: all of the queues in [FirstQueueToSearch, QueueE)
+ // require the same initial padding offset.
+
+ // Search those queues in descending order of alignment for a
+ // satisfactory field.
+ for (auto Queue = FirstQueueToSearch; Queue != QueueE; ++Queue) {
+ if (tryAddFillerFromQueue(Queue, Offset, BeforeOffset))
+ return true;
+ }
+
+ // Okay, we don't need to scan those again.
+ QueueE = FirstQueueToSearch;
+
+ // If we started from the first queue, we're done.
+ if (FirstQueueToSearch == QueueB)
+ return false;
+
+ // Otherwise, scan backwards to find the most-aligned queue that
+ // still has minimal leading padding after LastEnd.
+ --FirstQueueToSearch;
+ Offset = alignTo(LastEnd, FirstQueueToSearch->Alignment);
+ while (FirstQueueToSearch != QueueB &&
+ Offset == alignTo(LastEnd, FirstQueueToSearch[-1].Alignment))
+ --FirstQueueToSearch;
+ }
+ };
+
+ // Phase 1: fill the gaps between fixed-offset fields with the best
+ // flexible-offset field that fits.
+ for (auto I = Fields.begin(); I != FirstFlexible; ++I) {
+ while (LastEnd != I->Offset) {
+ if (!tryAddBestField(I->Offset))
+ break;
+ }
+ Layout.push_back(*I);
+ LastEnd = I->getEndOffset();
+ }
+
+#ifndef NDEBUG
+ checkQueues();
+#endif
+
+ // Phase 2: repeatedly add the best flexible-offset field until
+ // they're all gone.
+ while (!FlexibleFieldsByAlignment.empty()) {
+ bool Success = tryAddBestField(None);
+ assert(Success && "didn't find a field with no fixed limit?");
+ (void) Success;
+ }
+
+ // Copy the layout back into place.
+ assert(Layout.size() == Fields.size());
+ memcpy(Fields.data(), Layout.data(),
+ Fields.size() * sizeof(OptimizedStructLayoutField));
+
+#ifndef NDEBUG
+ // Make a final check that the layout is valid.
+ checkValidLayout(Fields, LastEnd, MaxAlign);
+#endif
+
+ return std::make_pair(LastEnd, MaxAlign);
+}
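
The comment block above states the selection criteria in prose; the following standalone sketch (illustrative only, with made-up names, and without the per-alignment queues the real code uses for speed) applies the same rules -- least leading padding, then highest alignment, then largest size, then original order -- to lay out the remaining flexible-offset fields once the fixed-offset fields are exhausted:

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    // Illustrative only: a flexible-offset field with power-of-two alignment.
    struct DemoField {
      uint64_t Size;
      uint64_t Align;
      uint64_t Offset = 0;
    };

    static uint64_t alignUp(uint64_t N, uint64_t A) {
      return (N + A - 1) & ~(A - 1);
    }

    // Greedy layout of all fields starting at LastEnd, picking at each step
    // the field that needs the least leading padding, then the highest
    // alignment, then the largest size, then the earliest original position.
    static uint64_t layoutGreedy(std::vector<DemoField> &Fields,
                                 uint64_t LastEnd) {
      std::vector<bool> Used(Fields.size(), false);
      for (size_t Placed = 0; Placed != Fields.size(); ++Placed) {
        size_t Best = Fields.size(); // sentinel: nothing chosen yet
        for (size_t I = 0; I != Fields.size(); ++I) {
          if (Used[I])
            continue;
          if (Best == Fields.size()) {
            Best = I;
            continue;
          }
          uint64_t PadI = alignUp(LastEnd, Fields[I].Align) - LastEnd;
          uint64_t PadB = alignUp(LastEnd, Fields[Best].Align) - LastEnd;
          if (PadI != PadB) {
            if (PadI < PadB)
              Best = I;
          } else if (Fields[I].Align != Fields[Best].Align) {
            if (Fields[I].Align > Fields[Best].Align)
              Best = I;
          } else if (Fields[I].Size > Fields[Best].Size) {
            Best = I;
          }
        }
        Fields[Best].Offset = alignUp(LastEnd, Fields[Best].Align);
        LastEnd = Fields[Best].Offset + Fields[Best].Size;
        Used[Best] = true;
      }
      return LastEnd; // end offset before any tail padding
    }

The queues in the implementation above exist purely to avoid rescanning every remaining field at each step; they do not change which field is considered best.
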
diff --git a/llvm/lib/Support/Parallel.cpp b/llvm/lib/Support/Parallel.cpp
index 523665d14b02..9a2e1003da5a 100644
--- a/llvm/lib/Support/Parallel.cpp
+++ b/llvm/lib/Support/Parallel.cpp
@@ -9,9 +9,6 @@
#include "llvm/Support/Parallel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/ManagedStatic.h"
-
-#if LLVM_ENABLE_THREADS
-
#include "llvm/Support/Threading.h"
#include <atomic>
@@ -20,6 +17,10 @@
#include <thread>
#include <vector>
+llvm::ThreadPoolStrategy llvm::parallel::strategy;
+
+#if LLVM_ENABLE_THREADS
+
namespace llvm {
namespace parallel {
namespace detail {
@@ -39,20 +40,21 @@ public:
/// in filo order.
class ThreadPoolExecutor : public Executor {
public:
- explicit ThreadPoolExecutor(unsigned ThreadCount = hardware_concurrency()) {
+ explicit ThreadPoolExecutor(ThreadPoolStrategy S = hardware_concurrency()) {
+ unsigned ThreadCount = S.compute_thread_count();
// Spawn all but one of the threads in another thread as spawning threads
// can take a while.
Threads.reserve(ThreadCount);
Threads.resize(1);
std::lock_guard<std::mutex> Lock(Mutex);
- Threads[0] = std::thread([&, ThreadCount] {
- for (unsigned i = 1; i < ThreadCount; ++i) {
- Threads.emplace_back([=] { work(); });
+ Threads[0] = std::thread([this, ThreadCount, S] {
+ for (unsigned I = 1; I < ThreadCount; ++I) {
+ Threads.emplace_back([=] { work(S, I); });
if (Stop)
break;
}
ThreadsCreated.set_value();
- work();
+ work(S, 0);
});
}
@@ -77,6 +79,9 @@ public:
T.join();
}
+ struct Creator {
+ static void *call() { return new ThreadPoolExecutor(strategy); }
+ };
struct Deleter {
static void call(void *Ptr) { ((ThreadPoolExecutor *)Ptr)->stop(); }
};
@@ -90,7 +95,8 @@ public:
}
private:
- void work() {
+ void work(ThreadPoolStrategy S, unsigned ThreadID) {
+ S.apply_thread_strategy(ThreadID);
while (true) {
std::unique_lock<std::mutex> Lock(Mutex);
Cond.wait(Lock, [&] { return Stop || !WorkStack.empty(); });
@@ -129,7 +135,8 @@ Executor *Executor::getDefaultExecutor() {
// are more frequent with the debug static runtime.
//
// This also prevents intermittent deadlocks on exit with the MinGW runtime.
- static ManagedStatic<ThreadPoolExecutor, object_creator<ThreadPoolExecutor>,
+
+ static ManagedStatic<ThreadPoolExecutor, ThreadPoolExecutor::Creator,
ThreadPoolExecutor::Deleter>
ManagedExec;
static std::unique_ptr<ThreadPoolExecutor> Exec(&(*ManagedExec));
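
The change above routes a ThreadPoolStrategy from the new llvm::parallel::strategy global into the executor: the strategy decides how many workers to create, and each worker applies it (affinity and the like) before entering its work loop. A minimal standalone sketch of that bootstrap pattern, using a hypothetical Strategy type rather than the real ThreadPoolStrategy:

    #include <functional>
    #include <thread>
    #include <vector>

    // Hypothetical stand-in for llvm::ThreadPoolStrategy.
    struct Strategy {
      unsigned ThreadCount = std::thread::hardware_concurrency();
      unsigned computeThreadCount() const {
        return ThreadCount ? ThreadCount : 1;
      }
      void applyThreadStrategy(unsigned /*ThreadID*/) const {
        // Real strategies set affinity, priority, etc.; the sketch does not.
      }
    };

    struct MiniExecutor {
      // Work must be safe to call concurrently from several threads.
      MiniExecutor(Strategy S, std::function<void(unsigned)> Work) {
        unsigned N = S.computeThreadCount();
        // Spawn thread 0 first; it creates the remaining workers so this
        // constructor returns quickly even when thread creation is slow.
        Threads.emplace_back([=] {
          std::vector<std::thread> Rest;
          for (unsigned I = 1; I < N; ++I)
            Rest.emplace_back([=] {
              S.applyThreadStrategy(I);
              Work(I);
            });
          S.applyThreadStrategy(0);
          Work(0);
          for (auto &T : Rest)
            T.join();
        });
      }
      ~MiniExecutor() {
        for (auto &T : Threads)
          T.join();
      }
      std::vector<std::thread> Threads;
    };

Spawning the remaining workers from thread 0, as the real constructor does, keeps the caller from blocking on slow thread creation.
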
diff --git a/llvm/lib/Support/Path.cpp b/llvm/lib/Support/Path.cpp
index 3c9a08cb4077..37b3086fddf5 100644
--- a/llvm/lib/Support/Path.cpp
+++ b/llvm/lib/Support/Path.cpp
@@ -496,49 +496,44 @@ void replace_extension(SmallVectorImpl<char> &path, const Twine &extension,
path.append(ext.begin(), ext.end());
}
-bool replace_path_prefix(SmallVectorImpl<char> &Path,
- const StringRef &OldPrefix, const StringRef &NewPrefix,
- Style style, bool strict) {
+static bool starts_with(StringRef Path, StringRef Prefix,
+ Style style = Style::native) {
+  // Windows prefix matching: case- and separator-insensitive.
+ if (real_style(style) == Style::windows) {
+ if (Path.size() < Prefix.size())
+ return false;
+ for (size_t I = 0, E = Prefix.size(); I != E; ++I) {
+ bool SepPath = is_separator(Path[I], style);
+ bool SepPrefix = is_separator(Prefix[I], style);
+ if (SepPath != SepPrefix)
+ return false;
+ if (!SepPath && toLower(Path[I]) != toLower(Prefix[I]))
+ return false;
+ }
+ return true;
+ }
+ return Path.startswith(Prefix);
+}
+
+bool replace_path_prefix(SmallVectorImpl<char> &Path, StringRef OldPrefix,
+ StringRef NewPrefix, Style style) {
if (OldPrefix.empty() && NewPrefix.empty())
return false;
StringRef OrigPath(Path.begin(), Path.size());
- StringRef OldPrefixDir;
-
- if (!strict && OldPrefix.size() > OrigPath.size())
+ if (!starts_with(OrigPath, OldPrefix, style))
return false;
- // Ensure OldPrefixDir does not have a trailing separator.
- if (!OldPrefix.empty() && is_separator(OldPrefix.back()))
- OldPrefixDir = parent_path(OldPrefix, style);
- else
- OldPrefixDir = OldPrefix;
-
- if (!OrigPath.startswith(OldPrefixDir))
- return false;
-
- if (OrigPath.size() > OldPrefixDir.size())
- if (!is_separator(OrigPath[OldPrefixDir.size()], style) && strict)
- return false;
-
// If prefixes have the same size we can simply copy the new one over.
- if (OldPrefixDir.size() == NewPrefix.size() && !strict) {
+ if (OldPrefix.size() == NewPrefix.size()) {
llvm::copy(NewPrefix, Path.begin());
return true;
}
- StringRef RelPath = OrigPath.substr(OldPrefixDir.size());
+ StringRef RelPath = OrigPath.substr(OldPrefix.size());
SmallString<256> NewPath;
- path::append(NewPath, style, NewPrefix);
- if (!RelPath.empty()) {
- if (!is_separator(RelPath[0], style) || !strict)
- path::append(NewPath, style, RelPath);
- else
- path::append(NewPath, style, relative_path(RelPath, style));
- }
-
+ (Twine(NewPrefix) + RelPath).toVector(NewPath);
Path.swap(NewPath);
-
return true;
}
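
With the strict/parent-path handling removed, replace_path_prefix is now a plain textual prefix swap (case- and separator-insensitive on Windows via the starts_with helper). A usage sketch, assuming the declaration in llvm/Support/Path.h matches the definition above:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"

    using namespace llvm;

    void demoReplacePrefix() {
      SmallString<128> P("/old/root/lib/a.c");
      // Succeeds: P starts with the old prefix, so P becomes "/new/lib/a.c".
      bool Changed = sys::path::replace_path_prefix(P, "/old/root", "/new",
                                                    sys::path::Style::posix);
      (void)Changed;

      SmallString<128> Q("/other/lib/a.c");
      // Returns false and leaves Q untouched: no textual prefix match.
      sys::path::replace_path_prefix(Q, "/old/root", "/new",
                                     sys::path::Style::posix);
    }
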
@@ -564,21 +559,15 @@ void native(SmallVectorImpl<char> &Path, Style style) {
Path = PathHome;
}
} else {
- for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI) {
- if (*PI == '\\') {
- auto PN = PI + 1;
- if (PN < PE && *PN == '\\')
- ++PI; // increment once, the for loop will move over the escaped slash
- else
- *PI = '/';
- }
- }
+ for (auto PI = Path.begin(), PE = Path.end(); PI < PE; ++PI)
+ if (*PI == '\\')
+ *PI = '/';
}
}
std::string convert_to_slash(StringRef path, Style style) {
if (real_style(style) != Style::windows)
- return path;
+ return std::string(path);
std::string s = path.str();
std::replace(s.begin(), s.end(), '\\', '/');
@@ -708,43 +697,69 @@ StringRef remove_leading_dotslash(StringRef Path, Style style) {
return Path;
}
-static SmallString<256> remove_dots(StringRef path, bool remove_dot_dot,
- Style style) {
+// Remove path traversal components ("." and "..") when possible, and
+// canonicalize slashes.
+bool remove_dots(SmallVectorImpl<char> &the_path, bool remove_dot_dot,
+ Style style) {
+ style = real_style(style);
+ StringRef remaining(the_path.data(), the_path.size());
+ bool needs_change = false;
SmallVector<StringRef, 16> components;
- // Skip the root path, then look for traversal in the components.
- StringRef rel = path::relative_path(path, style);
- for (StringRef C :
- llvm::make_range(path::begin(rel, style), path::end(rel))) {
- if (C == ".")
- continue;
- // Leading ".." will remain in the path unless it's at the root.
- if (remove_dot_dot && C == "..") {
+ // Consume the root path, if present.
+ StringRef root = path::root_path(remaining, style);
+ bool absolute = !root.empty();
+ if (absolute)
+ remaining = remaining.drop_front(root.size());
+
+ // Loop over path components manually. This makes it easier to detect
+ // non-preferred slashes and double separators that must be canonicalized.
+ while (!remaining.empty()) {
+ size_t next_slash = remaining.find_first_of(separators(style));
+ if (next_slash == StringRef::npos)
+ next_slash = remaining.size();
+ StringRef component = remaining.take_front(next_slash);
+ remaining = remaining.drop_front(next_slash);
+
+ // Eat the slash, and check if it is the preferred separator.
+ if (!remaining.empty()) {
+ needs_change |= remaining.front() != preferred_separator(style);
+ remaining = remaining.drop_front();
+ // The path needs to be rewritten if it has a trailing slash.
+ // FIXME: This is emergent behavior that could be removed.
+ needs_change |= remaining.empty();
+ }
+
+ // Check for path traversal components or double separators.
+ if (component.empty() || component == ".") {
+ needs_change = true;
+ } else if (remove_dot_dot && component == "..") {
+ needs_change = true;
+ // Do not allow ".." to remove the root component. If this is the
+ // beginning of a relative path, keep the ".." component.
if (!components.empty() && components.back() != "..") {
components.pop_back();
- continue;
+ } else if (!absolute) {
+ components.push_back(component);
}
- if (path::is_absolute(path, style))
- continue;
+ } else {
+ components.push_back(component);
}
- components.push_back(C);
}
- SmallString<256> buffer = path::root_path(path, style);
- for (StringRef C : components)
- path::append(buffer, style, C);
- return buffer;
-}
-
-bool remove_dots(SmallVectorImpl<char> &path, bool remove_dot_dot,
- Style style) {
- StringRef p(path.data(), path.size());
-
- SmallString<256> result = remove_dots(p, remove_dot_dot, style);
- if (result == path)
+ // Avoid rewriting the path unless we have to.
+ if (!needs_change)
return false;
- path.swap(result);
+ SmallString<256> buffer = root;
+ if (!components.empty()) {
+ buffer += components[0];
+ for (StringRef C : makeArrayRef(components).drop_front()) {
+ buffer += preferred_separator(style);
+ buffer += C;
+ }
+ }
+ the_path.swap(buffer);
return true;
}
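
The rewritten remove_dots canonicalizes in a single pass: it drops "." components and the empty components produced by doubled separators, cancels ".." against a preceding component (never past the root), rewrites non-preferred slashes, and returns false when it detects nothing to rewrite. A usage sketch, assuming the declared signature matches the definition above:

    #include "llvm/ADT/SmallString.h"
    #include "llvm/Support/Path.h"

    using namespace llvm;

    void demoRemoveDots() {
      SmallString<128> P("a/./b/../c");
      // Returns true and rewrites P to "a/c": "." is dropped, "b/.." cancels.
      sys::path::remove_dots(P, /*remove_dot_dot=*/true,
                             sys::path::Style::posix);

      SmallString<128> Q("x//y");
      // Returns true and rewrites Q to "x/y": the empty component between
      // the doubled separators is removed.
      sys::path::remove_dots(Q, /*remove_dot_dot=*/true,
                             sys::path::Style::posix);
    }
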
@@ -1114,7 +1129,7 @@ void directory_entry::replace_filename(const Twine &Filename, file_type Type,
basic_file_status Status) {
SmallString<128> PathStr = path::parent_path(Path);
path::append(PathStr, Filename);
- this->Path = PathStr.str();
+ this->Path = std::string(PathStr.str());
this->Type = Type;
this->Status = Status;
}
@@ -1142,7 +1157,8 @@ ErrorOr<perms> getPermissions(const Twine &Path) {
namespace llvm {
namespace sys {
namespace fs {
-TempFile::TempFile(StringRef Name, int FD) : TmpName(Name), FD(FD) {}
+TempFile::TempFile(StringRef Name, int FD)
+ : TmpName(std::string(Name)), FD(FD) {}
TempFile::TempFile(TempFile &&Other) { *this = std::move(Other); }
TempFile &TempFile::operator=(TempFile &&Other) {
TmpName = std::move(Other.TmpName);
diff --git a/llvm/lib/Support/PrettyStackTrace.cpp b/llvm/lib/Support/PrettyStackTrace.cpp
index bfb238cc8539..9072f9d2d2ee 100644
--- a/llvm/lib/Support/PrettyStackTrace.cpp
+++ b/llvm/lib/Support/PrettyStackTrace.cpp
@@ -22,6 +22,7 @@
#include "llvm/Support/raw_ostream.h"
#include <atomic>
+#include <cassert>
#include <cstdarg>
#include <cstdio>
#include <tuple>
@@ -32,6 +33,10 @@
using namespace llvm;
+static const char *BugReportMsg =
+ "PLEASE submit a bug report to " BUG_REPORT_URL
+ " and include the crash backtrace.\n";
+
// If backtrace support is not enabled, compile out support for pretty stack
// traces. This has the secondary effect of not requiring thread local storage
// when backtrace support is disabled.
@@ -144,6 +149,8 @@ static CrashHandlerStringStorage crashHandlerStringStorage;
/// This callback is run if a fatal signal is delivered to the process, it
/// prints the pretty stack trace.
static void CrashHandler(void *) {
+  errs() << BugReportMsg;
+
#ifndef __APPLE__
// On non-apple systems, just emit the crash stack trace to stderr.
PrintCurStackTrace(errs());
@@ -195,6 +202,14 @@ static void printForSigInfoIfNeeded() {
#endif // ENABLE_BACKTRACES
+void llvm::setBugReportMsg(const char *Msg) {
+ BugReportMsg = Msg;
+}
+
+const char *llvm::getBugReportMsg() {
+ return BugReportMsg;
+}
+
PrettyStackTraceEntry::PrettyStackTraceEntry() {
#if ENABLE_BACKTRACES
// Handle SIGINFO first, because we haven't finished constructing yet.
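
setBugReportMsg/getBugReportMsg let a tool replace the default PLEASE-submit-a-bug-report line that CrashHandler now prints before the backtrace. A sketch of how a driver might override it; the URL and setup here are illustrative, and this assumes the declarations live in llvm/Support/PrettyStackTrace.h alongside the definitions above:

    #include "llvm/Support/InitLLVM.h"
    #include "llvm/Support/PrettyStackTrace.h"

    int main(int argc, char **argv) {
      llvm::InitLLVM X(argc, argv); // installs the crash handlers
      llvm::setBugReportMsg(
          "PLEASE submit a bug report to https://example.org/bugs "
          "and include the crash backtrace.\n");
      // ... tool logic; on a crash the custom message is printed first.
      return 0;
    }
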
diff --git a/llvm/lib/Support/Process.cpp b/llvm/lib/Support/Process.cpp
index 5b6471008159..9e6e233b26ac 100644
--- a/llvm/lib/Support/Process.cpp
+++ b/llvm/lib/Support/Process.cpp
@@ -13,8 +13,9 @@
#include "llvm/Support/Process.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Config/llvm-config.h"
#include "llvm/Config/config.h"
+#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/CrashRecoveryContext.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/Program.h"
@@ -55,7 +56,7 @@ Optional<std::string> Process::FindInEnvPath(StringRef EnvName,
SmallString<128> FilePath(Dir);
path::append(FilePath, FileName);
if (fs::exists(Twine(FilePath))) {
- FoundPath = FilePath.str();
+ FoundPath = std::string(FilePath.str());
break;
}
}
@@ -88,6 +89,13 @@ static bool coreFilesPrevented = !LLVM_ENABLE_CRASH_DUMPS;
bool Process::AreCoreFilesPrevented() { return coreFilesPrevented; }
+LLVM_ATTRIBUTE_NORETURN
+void Process::Exit(int RetCode) {
+ if (CrashRecoveryContext *CRC = CrashRecoveryContext::GetCurrent())
+ CRC->HandleExit(RetCode);
+ ::exit(RetCode);
+}
+
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Process.inc"
diff --git a/llvm/lib/Support/Program.cpp b/llvm/lib/Support/Program.cpp
index 0a9363c59fc6..5294f65bd5a5 100644
--- a/llvm/lib/Support/Program.cpp
+++ b/llvm/lib/Support/Program.cpp
@@ -13,6 +13,7 @@
#include "llvm/Support/Program.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Config/llvm-config.h"
+#include "llvm/Support/raw_ostream.h"
#include <system_error>
using namespace llvm;
using namespace sys;
@@ -31,14 +32,16 @@ int sys::ExecuteAndWait(StringRef Program, ArrayRef<StringRef> Args,
Optional<ArrayRef<StringRef>> Env,
ArrayRef<Optional<StringRef>> Redirects,
unsigned SecondsToWait, unsigned MemoryLimit,
- std::string *ErrMsg, bool *ExecutionFailed) {
+ std::string *ErrMsg, bool *ExecutionFailed,
+ Optional<ProcessStatistics> *ProcStat) {
assert(Redirects.empty() || Redirects.size() == 3);
ProcessInfo PI;
if (Execute(PI, Program, Args, Env, Redirects, MemoryLimit, ErrMsg)) {
if (ExecutionFailed)
*ExecutionFailed = false;
- ProcessInfo Result = Wait(
- PI, SecondsToWait, /*WaitUntilTerminates=*/SecondsToWait == 0, ErrMsg);
+ ProcessInfo Result =
+ Wait(PI, SecondsToWait, /*WaitUntilTerminates=*/SecondsToWait == 0,
+ ErrMsg, ProcStat);
return Result.ReturnCode;
}
@@ -73,6 +76,24 @@ bool sys::commandLineFitsWithinSystemLimits(StringRef Program,
return commandLineFitsWithinSystemLimits(Program, StringRefArgs);
}
+void sys::printArg(raw_ostream &OS, StringRef Arg, bool Quote) {
+ const bool Escape = Arg.find_first_of(" \"\\$") != StringRef::npos;
+
+ if (!Quote && !Escape) {
+ OS << Arg;
+ return;
+ }
+
+ // Quote and escape. This isn't really complete, but good enough.
+ OS << '"';
+ for (const auto c : Arg) {
+ if (c == '"' || c == '\\' || c == '$')
+ OS << '\\';
+ OS << c;
+ }
+ OS << '"';
+}
+
// Include the platform-specific parts of this class.
#ifdef LLVM_ON_UNIX
#include "Unix/Program.inc"
diff --git a/llvm/lib/Support/RISCVAttributeParser.cpp b/llvm/lib/Support/RISCVAttributeParser.cpp
new file mode 100644
index 000000000000..393861c73a4a
--- /dev/null
+++ b/llvm/lib/Support/RISCVAttributeParser.cpp
@@ -0,0 +1,67 @@
+//===-- RISCVAttributeParser.cpp - RISCV Attribute Parser -----------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/RISCVAttributeParser.h"
+#include "llvm/ADT/StringExtras.h"
+
+using namespace llvm;
+
+const RISCVAttributeParser::DisplayHandler
+ RISCVAttributeParser::displayRoutines[] = {
+ {
+ RISCVAttrs::ARCH,
+ &ELFAttributeParser::stringAttribute,
+ },
+ {
+ RISCVAttrs::PRIV_SPEC,
+ &ELFAttributeParser::integerAttribute,
+ },
+ {
+ RISCVAttrs::PRIV_SPEC_MINOR,
+ &ELFAttributeParser::integerAttribute,
+ },
+ {
+ RISCVAttrs::PRIV_SPEC_REVISION,
+ &ELFAttributeParser::integerAttribute,
+ },
+ {
+ RISCVAttrs::STACK_ALIGN,
+ &RISCVAttributeParser::stackAlign,
+ },
+ {
+ RISCVAttrs::UNALIGNED_ACCESS,
+ &RISCVAttributeParser::unalignedAccess,
+ }};
+
+Error RISCVAttributeParser::unalignedAccess(unsigned tag) {
+ static const char *strings[] = {"No unaligned access", "Unaligned access"};
+ return parseStringAttribute("Unaligned_access", tag, makeArrayRef(strings));
+}
+
+Error RISCVAttributeParser::stackAlign(unsigned tag) {
+ uint64_t value = de.getULEB128(cursor);
+ std::string description =
+ "Stack alignment is " + utostr(value) + std::string("-bytes");
+ printAttribute(tag, value, description);
+ return Error::success();
+}
+
+Error RISCVAttributeParser::handler(uint64_t tag, bool &handled) {
+ handled = false;
+ for (unsigned AHI = 0, AHE = array_lengthof(displayRoutines); AHI != AHE;
+ ++AHI) {
+ if (uint64_t(displayRoutines[AHI].attribute) == tag) {
+ if (Error e = (this->*displayRoutines[AHI].routine)(tag))
+ return e;
+ handled = true;
+ break;
+ }
+ }
+
+ return Error::success();
+}
diff --git a/llvm/lib/Support/RISCVAttributes.cpp b/llvm/lib/Support/RISCVAttributes.cpp
new file mode 100644
index 000000000000..201048e03009
--- /dev/null
+++ b/llvm/lib/Support/RISCVAttributes.cpp
@@ -0,0 +1,25 @@
+//===-- RISCVAttributes.cpp - RISCV Attributes ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/RISCVAttributes.h"
+
+using namespace llvm;
+using namespace llvm::RISCVAttrs;
+
+static const TagNameItem tagData[] = {
+ {STACK_ALIGN, "Tag_stack_align"},
+ {ARCH, "Tag_arch"},
+ {UNALIGNED_ACCESS, "Tag_unaligned_access"},
+ {PRIV_SPEC, "Tag_priv_spec"},
+ {PRIV_SPEC_MINOR, "Tag_priv_spec_minor"},
+ {PRIV_SPEC_REVISION, "Tag_priv_spec_revision"},
+};
+
+const TagNameMap llvm::RISCVAttrs::RISCVAttributeTags(tagData,
+ sizeof(tagData) /
+ sizeof(TagNameItem));
diff --git a/llvm/lib/Support/RandomNumberGenerator.cpp b/llvm/lib/Support/RandomNumberGenerator.cpp
index 09fad1979985..f9c41ee5eaaf 100644
--- a/llvm/lib/Support/RandomNumberGenerator.cpp
+++ b/llvm/lib/Support/RandomNumberGenerator.cpp
@@ -17,7 +17,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#ifdef _WIN32
-#include "Windows/WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#else
#include "Unix/Unix.h"
#endif
diff --git a/llvm/lib/Support/Regex.cpp b/llvm/lib/Support/Regex.cpp
index 8da345d4f140..0d5cc1c00db1 100644
--- a/llvm/lib/Support/Regex.cpp
+++ b/llvm/lib/Support/Regex.cpp
@@ -14,6 +14,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Twine.h"
+#include <cassert>
#include <string>
// Important this comes last because it defines "_REGEX_H_". At least on
@@ -25,7 +26,7 @@ using namespace llvm;
Regex::Regex() : preg(nullptr), error(REG_BADPAT) {}
-Regex::Regex(StringRef regex, unsigned Flags) {
+Regex::Regex(StringRef regex, RegexFlags Flags) {
unsigned flags = 0;
preg = new llvm_regex();
preg->re_endp = regex.end();
@@ -38,6 +39,9 @@ Regex::Regex(StringRef regex, unsigned Flags) {
error = llvm_regcomp(preg, regex.data(), flags|REG_PEND);
}
+Regex::Regex(StringRef regex, unsigned Flags)
+ : Regex(regex, static_cast<RegexFlags>(Flags)) {}
+
Regex::Regex(Regex &&regex) {
preg = regex.preg;
error = regex.error;
@@ -135,7 +139,7 @@ std::string Regex::sub(StringRef Repl, StringRef String,
// Return the input if there was no match.
if (!match(String, &Matches, Error))
- return String;
+ return std::string(String);
// Otherwise splice in the replacement string, starting with the prefix before
// the match.
diff --git a/llvm/lib/Support/SHA1.cpp b/llvm/lib/Support/SHA1.cpp
index a98ca41a3354..417b13fea05a 100644
--- a/llvm/lib/Support/SHA1.cpp
+++ b/llvm/lib/Support/SHA1.cpp
@@ -16,13 +16,13 @@
#include "llvm/Support/SHA1.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Endian.h"
#include "llvm/Support/Host.h"
-using namespace llvm;
-
-#include <stdint.h>
#include <string.h>
+using namespace llvm;
+
#if defined(BYTE_ORDER) && defined(BIG_ENDIAN) && BYTE_ORDER == BIG_ENDIAN
#define SHA_BIG_ENDIAN
#endif
@@ -238,6 +238,11 @@ void SHA1::update(ArrayRef<uint8_t> Data) {
addUncounted(C);
}
+void SHA1::update(StringRef Str) {
+ update(
+ ArrayRef<uint8_t>((uint8_t *)const_cast<char *>(Str.data()), Str.size()));
+}
+
void SHA1::pad() {
// Implement SHA-1 padding (fips180-2 5.1.1)
diff --git a/llvm/lib/Support/Signals.cpp b/llvm/lib/Support/Signals.cpp
index add6fde0eb5e..2cfdf2d42a4a 100644
--- a/llvm/lib/Support/Signals.cpp
+++ b/llvm/lib/Support/Signals.cpp
@@ -131,7 +131,7 @@ static bool printSymbolizedStackTrace(StringRef Argv0, void **StackTrace,
// If we don't know argv0 or the address of main() at this point, try
// to guess it anyway (it's possible on some platforms).
std::string MainExecutableName =
- sys::fs::exists(Argv0) ? (std::string)Argv0
+      sys::fs::exists(Argv0) ? std::string(Argv0)
: sys::fs::getMainExecutable(nullptr, nullptr);
BumpPtrAllocator Allocator;
StringSaver StrPool(Allocator);
diff --git a/llvm/lib/Support/SmallVector.cpp b/llvm/lib/Support/SmallVector.cpp
index 36f0a81f6b00..6d5fe7165f63 100644
--- a/llvm/lib/Support/SmallVector.cpp
+++ b/llvm/lib/Support/SmallVector.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/ADT/SmallVector.h"
+#include <cstdint>
using namespace llvm;
// Check that no bytes are wasted and everything is well-aligned.
@@ -37,17 +38,30 @@ static_assert(sizeof(SmallVector<void *, 1>) ==
sizeof(unsigned) * 2 + sizeof(void *) * 2,
"wasted space in SmallVector size 1");
-/// grow_pod - This is an implementation of the grow() method which only works
-/// on POD-like datatypes and is out of line to reduce code duplication.
-void SmallVectorBase::grow_pod(void *FirstEl, size_t MinCapacity,
- size_t TSize) {
- // Ensure we can fit the new capacity in 32 bits.
- if (MinCapacity > UINT32_MAX)
+static_assert(sizeof(SmallVector<char, 0>) ==
+ sizeof(void *) * 2 + sizeof(void *),
+ "1 byte elements have word-sized type for size and capacity");
+
+// Note: Moving this function into the header may cause a performance regression.
+template <class Size_T>
+void SmallVectorBase<Size_T>::grow_pod(void *FirstEl, size_t MinCapacity,
+ size_t TSize) {
+ // Ensure we can fit the new capacity.
+ // This is only going to be applicable when the capacity is 32 bit.
+ if (MinCapacity > SizeTypeMax())
report_bad_alloc_error("SmallVector capacity overflow during allocation");
+ // Ensure we can meet the guarantee of space for at least one more element.
+ // The above check alone will not catch the case where grow is called with a
+ // default MinCapacity of 0, but the current capacity cannot be increased.
+ // This is only going to be applicable when the capacity is 32 bit.
+ if (capacity() == SizeTypeMax())
+ report_bad_alloc_error("SmallVector capacity unable to grow");
+
+ // In theory 2*capacity can overflow if the capacity is 64 bit, but the
+ // original capacity would never be large enough for this to be a problem.
size_t NewCapacity = 2 * capacity() + 1; // Always grow.
- NewCapacity =
- std::min(std::max(NewCapacity, MinCapacity), size_t(UINT32_MAX));
+ NewCapacity = std::min(std::max(NewCapacity, MinCapacity), SizeTypeMax());
void *NewElts;
if (BeginX == FirstEl) {
@@ -63,3 +77,20 @@ void SmallVectorBase::grow_pod(void *FirstEl, size_t MinCapacity,
this->BeginX = NewElts;
this->Capacity = NewCapacity;
}
+
+template class llvm::SmallVectorBase<uint32_t>;
+
+// Disable the uint64_t instantiation for 32-bit builds.
+// Both uint32_t and uint64_t instantiations are needed for 64-bit builds.
+// This instantiation will never be used in 32-bit builds, and will cause
+// warnings when sizeof(Size_T) > sizeof(size_t).
+#if SIZE_MAX > UINT32_MAX
+template class llvm::SmallVectorBase<uint64_t>;
+
+// Assertions to ensure this #if stays in sync with SmallVectorSizeType.
+static_assert(sizeof(SmallVectorSizeType<char>) == sizeof(uint64_t),
+ "Expected SmallVectorBase<uint64_t> variant to be in use.");
+#else
+static_assert(sizeof(SmallVectorSizeType<char>) == sizeof(uint32_t),
+ "Expected SmallVectorBase<uint32_t> variant to be in use.");
+#endif
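
The grow_pod rewrite turns the size/capacity type into a template parameter (uint32_t normally, uint64_t for one-byte elements on 64-bit hosts) and adds a second guard for a capacity that is already saturated. A standalone sketch of just the capacity computation, with hypothetical names and exceptions standing in for report_bad_alloc_error:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <limits>
    #include <stdexcept>

    // Illustrative only: mirrors the two overflow checks and the doubling
    // policy of grow_pod above, parameterized over the size/capacity type.
    template <class SizeT>
    SizeT computeNewCapacity(SizeT CurrentCapacity, size_t MinCapacity) {
      const uint64_t SizeTypeMax = std::numeric_limits<SizeT>::max();
      if (uint64_t(MinCapacity) > SizeTypeMax)
        throw std::length_error("capacity overflow during allocation");
      if (uint64_t(CurrentCapacity) == SizeTypeMax)
        throw std::length_error("capacity unable to grow");
      // Always grow; 2*capacity+1 cannot meaningfully overflow here because a
      // capacity that large could never have been allocated.
      uint64_t NewCapacity = 2 * uint64_t(CurrentCapacity) + 1;
      NewCapacity = std::min(std::max(NewCapacity, uint64_t(MinCapacity)),
                             SizeTypeMax);
      return SizeT(NewCapacity);
    }
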
diff --git a/llvm/lib/Support/SourceMgr.cpp b/llvm/lib/Support/SourceMgr.cpp
index 2a241f18c362..9cc69732a964 100644
--- a/llvm/lib/Support/SourceMgr.cpp
+++ b/llvm/lib/Support/SourceMgr.cpp
@@ -42,7 +42,7 @@ unsigned SourceMgr::AddIncludeFile(const std::string &Filename,
std::string &IncludedFile) {
IncludedFile = Filename;
ErrorOr<std::unique_ptr<MemoryBuffer>> NewBufOrErr =
- MemoryBuffer::getFile(IncludedFile);
+ MemoryBuffer::getFile(IncludedFile);
// If the file didn't exist directly, see if it's in an include path.
for (unsigned i = 0, e = IncludeDirectories.size(); i != e && !NewBufOrErr;
@@ -69,54 +69,109 @@ unsigned SourceMgr::FindBufferContainingLoc(SMLoc Loc) const {
}
template <typename T>
-unsigned SourceMgr::SrcBuffer::getLineNumber(const char *Ptr) const {
-
- // Ensure OffsetCache is allocated and populated with offsets of all the
- // '\n' bytes.
- std::vector<T> *Offsets = nullptr;
- if (OffsetCache.isNull()) {
- Offsets = new std::vector<T>();
- OffsetCache = Offsets;
- size_t Sz = Buffer->getBufferSize();
- assert(Sz <= std::numeric_limits<T>::max());
- StringRef S = Buffer->getBuffer();
- for (size_t N = 0; N < Sz; ++N) {
- if (S[N] == '\n') {
- Offsets->push_back(static_cast<T>(N));
- }
- }
- } else {
- Offsets = OffsetCache.get<std::vector<T> *>();
+static std::vector<T> &GetOrCreateOffsetCache(void *&OffsetCache,
+ MemoryBuffer *Buffer) {
+ if (OffsetCache)
+ return *static_cast<std::vector<T> *>(OffsetCache);
+
+ // Lazily fill in the offset cache.
+ auto *Offsets = new std::vector<T>();
+ size_t Sz = Buffer->getBufferSize();
+ assert(Sz <= std::numeric_limits<T>::max());
+ StringRef S = Buffer->getBuffer();
+ for (size_t N = 0; N < Sz; ++N) {
+ if (S[N] == '\n')
+ Offsets->push_back(static_cast<T>(N));
}
+ OffsetCache = Offsets;
+ return *Offsets;
+}
+
+template <typename T>
+unsigned SourceMgr::SrcBuffer::getLineNumberSpecialized(const char *Ptr) const {
+ std::vector<T> &Offsets =
+ GetOrCreateOffsetCache<T>(OffsetCache, Buffer.get());
+
const char *BufStart = Buffer->getBufferStart();
assert(Ptr >= BufStart && Ptr <= Buffer->getBufferEnd());
ptrdiff_t PtrDiff = Ptr - BufStart;
- assert(PtrDiff >= 0 && static_cast<size_t>(PtrDiff) <= std::numeric_limits<T>::max());
+ assert(PtrDiff >= 0 &&
+ static_cast<size_t>(PtrDiff) <= std::numeric_limits<T>::max());
T PtrOffset = static_cast<T>(PtrDiff);
// llvm::lower_bound gives the number of EOL before PtrOffset. Add 1 to get
// the line number.
- return llvm::lower_bound(*Offsets, PtrOffset) - Offsets->begin() + 1;
+ return llvm::lower_bound(Offsets, PtrOffset) - Offsets.begin() + 1;
+}
+
+/// Look up a given \p Ptr in the buffer, determining which line it came
+/// from.
+unsigned SourceMgr::SrcBuffer::getLineNumber(const char *Ptr) const {
+ size_t Sz = Buffer->getBufferSize();
+ if (Sz <= std::numeric_limits<uint8_t>::max())
+ return getLineNumberSpecialized<uint8_t>(Ptr);
+ else if (Sz <= std::numeric_limits<uint16_t>::max())
+ return getLineNumberSpecialized<uint16_t>(Ptr);
+ else if (Sz <= std::numeric_limits<uint32_t>::max())
+ return getLineNumberSpecialized<uint32_t>(Ptr);
+ else
+ return getLineNumberSpecialized<uint64_t>(Ptr);
+}
+
+template <typename T>
+const char *SourceMgr::SrcBuffer::getPointerForLineNumberSpecialized(
+ unsigned LineNo) const {
+ std::vector<T> &Offsets =
+ GetOrCreateOffsetCache<T>(OffsetCache, Buffer.get());
+
+ // We start counting line and column numbers from 1.
+ if (LineNo != 0)
+ --LineNo;
+
+ const char *BufStart = Buffer->getBufferStart();
+
+  // The offset cache contains the location of the \n for the specified line;
+ // we want the start of the line. As such, we look for the previous entry.
+ if (LineNo == 0)
+ return BufStart;
+ if (LineNo > Offsets.size())
+ return nullptr;
+ return BufStart + Offsets[LineNo - 1] + 1;
+}
+
+/// Return a pointer to the first character of the specified line number or
+/// null if the line number is invalid.
+const char *
+SourceMgr::SrcBuffer::getPointerForLineNumber(unsigned LineNo) const {
+ size_t Sz = Buffer->getBufferSize();
+ if (Sz <= std::numeric_limits<uint8_t>::max())
+ return getPointerForLineNumberSpecialized<uint8_t>(LineNo);
+ else if (Sz <= std::numeric_limits<uint16_t>::max())
+ return getPointerForLineNumberSpecialized<uint16_t>(LineNo);
+ else if (Sz <= std::numeric_limits<uint32_t>::max())
+ return getPointerForLineNumberSpecialized<uint32_t>(LineNo);
+ else
+ return getPointerForLineNumberSpecialized<uint64_t>(LineNo);
}
SourceMgr::SrcBuffer::SrcBuffer(SourceMgr::SrcBuffer &&Other)
- : Buffer(std::move(Other.Buffer)),
- OffsetCache(Other.OffsetCache),
- IncludeLoc(Other.IncludeLoc) {
+ : Buffer(std::move(Other.Buffer)), OffsetCache(Other.OffsetCache),
+ IncludeLoc(Other.IncludeLoc) {
Other.OffsetCache = nullptr;
}
SourceMgr::SrcBuffer::~SrcBuffer() {
- if (!OffsetCache.isNull()) {
- if (OffsetCache.is<std::vector<uint8_t>*>())
- delete OffsetCache.get<std::vector<uint8_t>*>();
- else if (OffsetCache.is<std::vector<uint16_t>*>())
- delete OffsetCache.get<std::vector<uint16_t>*>();
- else if (OffsetCache.is<std::vector<uint32_t>*>())
- delete OffsetCache.get<std::vector<uint32_t>*>();
+ if (OffsetCache) {
+ size_t Sz = Buffer->getBufferSize();
+ if (Sz <= std::numeric_limits<uint8_t>::max())
+ delete static_cast<std::vector<uint8_t> *>(OffsetCache);
+ else if (Sz <= std::numeric_limits<uint16_t>::max())
+ delete static_cast<std::vector<uint16_t> *>(OffsetCache);
+ else if (Sz <= std::numeric_limits<uint32_t>::max())
+ delete static_cast<std::vector<uint32_t> *>(OffsetCache);
else
- delete OffsetCache.get<std::vector<uint64_t>*>();
+ delete static_cast<std::vector<uint64_t> *>(OffsetCache);
OffsetCache = nullptr;
}
}
@@ -130,39 +185,58 @@ SourceMgr::getLineAndColumn(SMLoc Loc, unsigned BufferID) const {
auto &SB = getBufferInfo(BufferID);
const char *Ptr = Loc.getPointer();
- size_t Sz = SB.Buffer->getBufferSize();
- unsigned LineNo;
- if (Sz <= std::numeric_limits<uint8_t>::max())
- LineNo = SB.getLineNumber<uint8_t>(Ptr);
- else if (Sz <= std::numeric_limits<uint16_t>::max())
- LineNo = SB.getLineNumber<uint16_t>(Ptr);
- else if (Sz <= std::numeric_limits<uint32_t>::max())
- LineNo = SB.getLineNumber<uint32_t>(Ptr);
- else
- LineNo = SB.getLineNumber<uint64_t>(Ptr);
-
+ unsigned LineNo = SB.getLineNumber(Ptr);
const char *BufStart = SB.Buffer->getBufferStart();
- size_t NewlineOffs = StringRef(BufStart, Ptr-BufStart).find_last_of("\n\r");
- if (NewlineOffs == StringRef::npos) NewlineOffs = ~(size_t)0;
- return std::make_pair(LineNo, Ptr-BufStart-NewlineOffs);
+ size_t NewlineOffs = StringRef(BufStart, Ptr - BufStart).find_last_of("\n\r");
+ if (NewlineOffs == StringRef::npos)
+ NewlineOffs = ~(size_t)0;
+ return std::make_pair(LineNo, Ptr - BufStart - NewlineOffs);
+}
+
+/// Given a line and column number in a mapped buffer, turn it into an SMLoc.
+/// This will return a null SMLoc if the line/column location is invalid.
+SMLoc SourceMgr::FindLocForLineAndColumn(unsigned BufferID, unsigned LineNo,
+ unsigned ColNo) {
+ auto &SB = getBufferInfo(BufferID);
+ const char *Ptr = SB.getPointerForLineNumber(LineNo);
+ if (!Ptr)
+ return SMLoc();
+
+ // We start counting line and column numbers from 1.
+ if (ColNo != 0)
+ --ColNo;
+
+ // If we have a column number, validate it.
+ if (ColNo) {
+ // Make sure the location is within the current line.
+ if (Ptr + ColNo > SB.Buffer->getBufferEnd())
+ return SMLoc();
+
+ // Make sure there is no newline in the way.
+ if (StringRef(Ptr, ColNo).find_first_of("\n\r") != StringRef::npos)
+ return SMLoc();
+
+ Ptr += ColNo;
+ }
+
+ return SMLoc::getFromPointer(Ptr);
}
void SourceMgr::PrintIncludeStack(SMLoc IncludeLoc, raw_ostream &OS) const {
- if (IncludeLoc == SMLoc()) return; // Top of stack.
+ if (IncludeLoc == SMLoc())
+ return; // Top of stack.
unsigned CurBuf = FindBufferContainingLoc(IncludeLoc);
assert(CurBuf && "Invalid or unspecified location!");
PrintIncludeStack(getBufferInfo(CurBuf).IncludeLoc, OS);
- OS << "Included from "
- << getBufferInfo(CurBuf).Buffer->getBufferIdentifier()
+ OS << "Included from " << getBufferInfo(CurBuf).Buffer->getBufferIdentifier()
<< ":" << FindLineNumber(IncludeLoc, CurBuf) << ":\n";
}
SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
- const Twine &Msg,
- ArrayRef<SMRange> Ranges,
+ const Twine &Msg, ArrayRef<SMRange> Ranges,
ArrayRef<SMFixIt> FixIts) const {
// First thing to do: find the current buffer containing the specified
// location to pull out the source line.
@@ -196,7 +270,8 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
// location.
for (unsigned i = 0, e = Ranges.size(); i != e; ++i) {
SMRange R = Ranges[i];
- if (!R.isValid()) continue;
+ if (!R.isValid())
+ continue;
// If the line doesn't contain any part of the range, then ignore it.
if (R.Start.getPointer() > LineEnd || R.End.getPointer() < LineStart)
@@ -210,16 +285,16 @@ SMDiagnostic SourceMgr::GetMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
// Translate from SMLoc ranges to column ranges.
// FIXME: Handle multibyte characters.
- ColRanges.push_back(std::make_pair(R.Start.getPointer()-LineStart,
- R.End.getPointer()-LineStart));
+ ColRanges.push_back(std::make_pair(R.Start.getPointer() - LineStart,
+ R.End.getPointer() - LineStart));
}
LineAndCol = getLineAndColumn(Loc, CurBuf);
}
return SMDiagnostic(*this, Loc, BufferID, LineAndCol.first,
- LineAndCol.second-1, Kind, Msg.str(),
- LineStr, ColRanges, FixIts);
+ LineAndCol.second - 1, Kind, Msg.str(), LineStr,
+ ColRanges, FixIts);
}
void SourceMgr::PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic,
@@ -240,9 +315,9 @@ void SourceMgr::PrintMessage(raw_ostream &OS, const SMDiagnostic &Diagnostic,
}
void SourceMgr::PrintMessage(raw_ostream &OS, SMLoc Loc,
- SourceMgr::DiagKind Kind,
- const Twine &Msg, ArrayRef<SMRange> Ranges,
- ArrayRef<SMFixIt> FixIts, bool ShowColors) const {
+ SourceMgr::DiagKind Kind, const Twine &Msg,
+ ArrayRef<SMRange> Ranges, ArrayRef<SMFixIt> FixIts,
+ bool ShowColors) const {
PrintMessage(OS, GetMessage(Loc, Kind, Msg, Ranges, FixIts), ShowColors);
}
@@ -253,22 +328,32 @@ void SourceMgr::PrintMessage(SMLoc Loc, SourceMgr::DiagKind Kind,
}
//===----------------------------------------------------------------------===//
+// SMFixIt Implementation
+//===----------------------------------------------------------------------===//
+
+SMFixIt::SMFixIt(SMRange R, const Twine &Replacement)
+ : Range(R), Text(Replacement.str()) {
+ assert(R.isValid());
+}
+
+//===----------------------------------------------------------------------===//
// SMDiagnostic Implementation
//===----------------------------------------------------------------------===//
-SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN,
- int Line, int Col, SourceMgr::DiagKind Kind,
- StringRef Msg, StringRef LineStr,
- ArrayRef<std::pair<unsigned,unsigned>> Ranges,
+SMDiagnostic::SMDiagnostic(const SourceMgr &sm, SMLoc L, StringRef FN, int Line,
+ int Col, SourceMgr::DiagKind Kind, StringRef Msg,
+ StringRef LineStr,
+ ArrayRef<std::pair<unsigned, unsigned>> Ranges,
ArrayRef<SMFixIt> Hints)
- : SM(&sm), Loc(L), Filename(FN), LineNo(Line), ColumnNo(Col), Kind(Kind),
- Message(Msg), LineContents(LineStr), Ranges(Ranges.vec()),
- FixIts(Hints.begin(), Hints.end()) {
+ : SM(&sm), Loc(L), Filename(std::string(FN)), LineNo(Line), ColumnNo(Col),
+ Kind(Kind), Message(std::string(Msg)), LineContents(std::string(LineStr)),
+ Ranges(Ranges.vec()), FixIts(Hints.begin(), Hints.end()) {
llvm::sort(FixIts);
}
static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
- ArrayRef<SMFixIt> FixIts, ArrayRef<char> SourceLine){
+ ArrayRef<SMFixIt> FixIts,
+ ArrayRef<char> SourceLine) {
if (FixIts.empty())
return;
@@ -277,8 +362,8 @@ static void buildFixItLine(std::string &CaretLine, std::string &FixItLine,
size_t PrevHintEndCol = 0;
- for (ArrayRef<SMFixIt>::iterator I = FixIts.begin(), E = FixIts.end();
- I != E; ++I) {
+ for (ArrayRef<SMFixIt>::iterator I = FixIts.begin(), E = FixIts.end(); I != E;
+ ++I) {
// If the fixit contains a newline or tab, ignore it.
if (I->getText().find_first_of("\n\r\t") != StringRef::npos)
continue;
@@ -361,14 +446,14 @@ static void printSourceLine(raw_ostream &S, StringRef LineContents) {
S << '\n';
}
-static bool isNonASCII(char c) {
- return c & 0x80;
-}
+static bool isNonASCII(char c) { return c & 0x80; }
+
+void SMDiagnostic::print(const char *ProgName, raw_ostream &OS, bool ShowColors,
+ bool ShowKindLabel) const {
+ ColorMode Mode = ShowColors ? ColorMode::Auto : ColorMode::Disable;
-void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
- bool ShowColors, bool ShowKindLabel) const {
{
- WithColor S(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors);
+ WithColor S(OS, raw_ostream::SAVEDCOLOR, true, false, Mode);
if (ProgName && ProgName[0])
S << ProgName << ": ";
@@ -405,8 +490,7 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
}
}
- WithColor(OS, raw_ostream::SAVEDCOLOR, true, false, !ShowColors)
- << Message << '\n';
+ WithColor(OS, raw_ostream::SAVEDCOLOR, true, false, Mode) << Message << '\n';
if (LineNo == -1 || ColumnNo == -1)
return;
@@ -423,22 +507,21 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
size_t NumColumns = LineContents.size();
// Build the line with the caret and ranges.
- std::string CaretLine(NumColumns+1, ' ');
+ std::string CaretLine(NumColumns + 1, ' ');
// Expand any ranges.
for (unsigned r = 0, e = Ranges.size(); r != e; ++r) {
std::pair<unsigned, unsigned> R = Ranges[r];
std::fill(&CaretLine[R.first],
- &CaretLine[std::min((size_t)R.second, CaretLine.size())],
- '~');
+ &CaretLine[std::min((size_t)R.second, CaretLine.size())], '~');
}
// Add any fix-its.
// FIXME: Find the beginning of the line properly for multibyte characters.
std::string FixItInsertionLine;
- buildFixItLine(CaretLine, FixItInsertionLine, FixIts,
- makeArrayRef(Loc.getPointer() - ColumnNo,
- LineContents.size()));
+ buildFixItLine(
+ CaretLine, FixItInsertionLine, FixIts,
+ makeArrayRef(Loc.getPointer() - ColumnNo, LineContents.size()));
// Finally, plop on the caret.
if (unsigned(ColumnNo) <= NumColumns)
@@ -449,12 +532,13 @@ void SMDiagnostic::print(const char *ProgName, raw_ostream &OS,
// ... and remove trailing whitespace so the output doesn't wrap for it. We
// know that the line isn't completely empty because it has the caret in it at
// least.
- CaretLine.erase(CaretLine.find_last_not_of(' ')+1);
+ CaretLine.erase(CaretLine.find_last_not_of(' ') + 1);
printSourceLine(OS, LineContents);
{
- WithColor S(OS, raw_ostream::GREEN, true, false, !ShowColors);
+ ColorMode Mode = ShowColors ? ColorMode::Auto : ColorMode::Disable;
+ WithColor S(OS, raw_ostream::GREEN, true, false, Mode);
// Print out the caret line, matching tabs in the source line.
for (unsigned i = 0, e = CaretLine.size(), OutCol = 0; i != e; ++i) {
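
Both getLineNumber and the new getPointerForLineNumber dispatch on the buffer size so the cached newline offsets use the narrowest integer type that can hold them; the lookups themselves are a lower_bound (pointer to line) or an index into the cache (line to pointer). A standalone sketch of that core, with hypothetical names and a fixed uint32_t offset type:

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>
    #include <string>
    #include <vector>

    // Build the newline-offset cache once, then map byte offsets to 1-based
    // line numbers and back, as the specialized helpers above do.
    struct LineTable {
      explicit LineTable(const std::string &Text) {
        for (size_t I = 0, E = Text.size(); I != E; ++I)
          if (Text[I] == '\n')
            NewlineOffsets.push_back(uint32_t(I));
      }

      unsigned lineNumber(uint32_t Offset) const {
        // Number of '\n' bytes strictly before Offset, plus one.
        return unsigned(std::lower_bound(NewlineOffsets.begin(),
                                         NewlineOffsets.end(), Offset) -
                        NewlineOffsets.begin()) + 1;
      }

      // Byte offset of the first character of line LineNo (1-based), or npos
      // if the line number is out of range.
      size_t lineStart(unsigned LineNo) const {
        if (LineNo <= 1)
          return 0;
        if (LineNo - 1 > NewlineOffsets.size())
          return std::string::npos;
        return size_t(NewlineOffsets[LineNo - 2]) + 1;
      }

      std::vector<uint32_t> NewlineOffsets;
    };
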
diff --git a/llvm/lib/Support/SpecialCaseList.cpp b/llvm/lib/Support/SpecialCaseList.cpp
index d1ff44cefb08..73f852624a69 100644
--- a/llvm/lib/Support/SpecialCaseList.cpp
+++ b/llvm/lib/Support/SpecialCaseList.cpp
@@ -126,7 +126,7 @@ bool SpecialCaseList::createInternal(const MemoryBuffer *MB,
bool SpecialCaseList::parse(const MemoryBuffer *MB,
StringMap<size_t> &SectionsMap,
std::string &Error) {
- // Iterate through each line in the blacklist file.
+ // Iterate through each line in the exclusion list file.
SmallVector<StringRef, 16> Lines;
MB->getBuffer().split(Lines, '\n');
@@ -172,14 +172,14 @@ bool SpecialCaseList::parse(const MemoryBuffer *MB,
}
std::pair<StringRef, StringRef> SplitRegexp = SplitLine.second.split("=");
- std::string Regexp = SplitRegexp.first;
+ std::string Regexp = std::string(SplitRegexp.first);
StringRef Category = SplitRegexp.second;
// Create this section if it has not been seen before.
if (SectionsMap.find(Section) == SectionsMap.end()) {
std::unique_ptr<Matcher> M = std::make_unique<Matcher>();
std::string REError;
- if (!M->insert(Section, LineNo, REError)) {
+ if (!M->insert(std::string(Section), LineNo, REError)) {
Error = (Twine("malformed section ") + Section + ": '" + REError).str();
return false;
}
diff --git a/llvm/lib/Support/Statistic.cpp b/llvm/lib/Support/Statistic.cpp
index 25f13871e2e4..e9308ab575ab 100644
--- a/llvm/lib/Support/Statistic.cpp
+++ b/llvm/lib/Support/Statistic.cpp
@@ -246,7 +246,7 @@ void llvm::PrintStatistics() {
// Get the stream to write to.
std::unique_ptr<raw_ostream> OutStream = CreateInfoOutputFile();
(*OutStream) << "Statistics are disabled. "
- << "Build with asserts or with -DLLVM_ENABLE_STATS\n";
+ << "Build with asserts or with -DLLVM_FORCE_ENABLE_STATS\n";
}
#endif
}
diff --git a/llvm/lib/Support/StringExtras.cpp b/llvm/lib/Support/StringExtras.cpp
index af8dd463e125..c206bd214519 100644
--- a/llvm/lib/Support/StringExtras.cpp
+++ b/llvm/lib/Support/StringExtras.cpp
@@ -13,6 +13,8 @@
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/raw_ostream.h"
+#include <cctype>
+
using namespace llvm;
/// StrInStrNoCase - Portable version of strcasestr. Locates the first
@@ -90,3 +92,46 @@ void llvm::printLowerCase(StringRef String, raw_ostream &Out) {
for (const char C : String)
Out << toLower(C);
}
+
+std::string llvm::convertToSnakeFromCamelCase(StringRef input) {
+ if (input.empty())
+ return "";
+
+ std::string snakeCase;
+ snakeCase.reserve(input.size());
+ for (char c : input) {
+ if (!std::isupper(c)) {
+ snakeCase.push_back(c);
+ continue;
+ }
+
+ if (!snakeCase.empty() && snakeCase.back() != '_')
+ snakeCase.push_back('_');
+ snakeCase.push_back(llvm::toLower(c));
+ }
+ return snakeCase;
+}
+
+std::string llvm::convertToCamelFromSnakeCase(StringRef input,
+ bool capitalizeFirst) {
+ if (input.empty())
+ return "";
+
+ std::string output;
+ output.reserve(input.size());
+
+  // Push the first character, capitalizing if necessary.
+ if (capitalizeFirst && std::islower(input.front()))
+ output.push_back(llvm::toUpper(input.front()));
+ else
+ output.push_back(input.front());
+
+ // Walk the input converting any `*_[a-z]` snake case into `*[A-Z]` camelCase.
+ for (size_t pos = 1, e = input.size(); pos < e; ++pos) {
+ if (input[pos] == '_' && pos != (e - 1) && std::islower(input[pos + 1]))
+ output.push_back(llvm::toUpper(input[++pos]));
+ else
+ output.push_back(input[pos]);
+ }
+ return output;
+}
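
A few examples of the two new case-conversion helpers, following directly from the loops above (declared in llvm/ADT/StringExtras.h):

    #include "llvm/ADT/StringExtras.h"
    #include <cassert>

    void demoCaseConversion() {
      // CamelCase -> snake_case: an underscore is inserted before each
      // upper-case letter (except at the start), and the letter is lowered.
      assert(llvm::convertToSnakeFromCamelCase("convertToSnake") ==
             "convert_to_snake");

      // snake_case -> camelCase: "_x" becomes "X"; the first letter is
      // optionally capitalized.
      assert(llvm::convertToCamelFromSnakeCase("convert_to_camel",
                                               /*capitalizeFirst=*/false) ==
             "convertToCamel");
      assert(llvm::convertToCamelFromSnakeCase("convert_to_camel",
                                               /*capitalizeFirst=*/true) ==
             "ConvertToCamel");
    }
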
diff --git a/llvm/lib/Support/StringMap.cpp b/llvm/lib/Support/StringMap.cpp
index 6b5ea020dd46..f65d3846623c 100644
--- a/llvm/lib/Support/StringMap.cpp
+++ b/llvm/lib/Support/StringMap.cpp
@@ -12,10 +12,8 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/ADT/StringExtras.h"
-#include "llvm/Support/Compiler.h"
#include "llvm/Support/DJB.h"
#include "llvm/Support/MathExtras.h"
-#include <cassert>
using namespace llvm;
@@ -50,23 +48,22 @@ StringMapImpl::StringMapImpl(unsigned InitSize, unsigned itemSize) {
}
void StringMapImpl::init(unsigned InitSize) {
- assert((InitSize & (InitSize-1)) == 0 &&
+ assert((InitSize & (InitSize - 1)) == 0 &&
"Init Size must be a power of 2 or zero!");
unsigned NewNumBuckets = InitSize ? InitSize : 16;
NumItems = 0;
NumTombstones = 0;
- TheTable = static_cast<StringMapEntryBase **>(
- safe_calloc(NewNumBuckets+1,
- sizeof(StringMapEntryBase **) + sizeof(unsigned)));
+ TheTable = static_cast<StringMapEntryBase **>(safe_calloc(
+ NewNumBuckets + 1, sizeof(StringMapEntryBase **) + sizeof(unsigned)));
// Set the member only if TheTable was successfully allocated
NumBuckets = NewNumBuckets;
// Allocate one extra bucket, set it to look filled so the iterators stop at
// end.
- TheTable[NumBuckets] = (StringMapEntryBase*)2;
+ TheTable[NumBuckets] = (StringMapEntryBase *)2;
}
/// LookupBucketFor - Look up the bucket that the specified string should end
@@ -76,12 +73,12 @@ void StringMapImpl::init(unsigned InitSize) {
/// of the string.
unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
unsigned HTSize = NumBuckets;
- if (HTSize == 0) { // Hash table unallocated so far?
+ if (HTSize == 0) { // Hash table unallocated so far?
init(16);
HTSize = NumBuckets;
}
unsigned FullHashValue = djbHash(Name, 0);
- unsigned BucketNo = FullHashValue & (HTSize-1);
+ unsigned BucketNo = FullHashValue & (HTSize - 1);
unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
unsigned ProbeAmt = 1;
@@ -103,7 +100,8 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
if (BucketItem == getTombstoneVal()) {
// Skip over tombstones. However, remember the first one we see.
- if (FirstTombstone == -1) FirstTombstone = BucketNo;
+ if (FirstTombstone == -1)
+ FirstTombstone = BucketNo;
} else if (LLVM_LIKELY(HashTable[BucketNo] == FullHashValue)) {
// If the full hash value matches, check deeply for a match. The common
// case here is that we are only looking at the buckets (for item info
@@ -112,7 +110,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
// Do the comparison like this because Name isn't necessarily
// null-terminated!
- char *ItemStr = (char*)BucketItem+ItemSize;
+ char *ItemStr = (char *)BucketItem + ItemSize;
if (Name == StringRef(ItemStr, BucketItem->getKeyLength())) {
// We found a match!
return BucketNo;
@@ -120,7 +118,7 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
}
// Okay, we didn't find the item. Probe to the next bucket.
- BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+ BucketNo = (BucketNo + ProbeAmt) & (HTSize - 1);
// Use quadratic probing, it has fewer clumping artifacts than linear
// probing and has good cache behavior in the common case.
@@ -133,9 +131,10 @@ unsigned StringMapImpl::LookupBucketFor(StringRef Name) {
/// This does not modify the map.
int StringMapImpl::FindKey(StringRef Key) const {
unsigned HTSize = NumBuckets;
- if (HTSize == 0) return -1; // Really empty table?
+ if (HTSize == 0)
+ return -1; // Really empty table?
unsigned FullHashValue = djbHash(Key, 0);
- unsigned BucketNo = FullHashValue & (HTSize-1);
+ unsigned BucketNo = FullHashValue & (HTSize - 1);
unsigned *HashTable = (unsigned *)(TheTable + NumBuckets + 1);
unsigned ProbeAmt = 1;
@@ -155,7 +154,7 @@ int StringMapImpl::FindKey(StringRef Key) const {
// Do the comparison like this because NameStart isn't necessarily
// null-terminated!
- char *ItemStr = (char*)BucketItem+ItemSize;
+ char *ItemStr = (char *)BucketItem + ItemSize;
if (Key == StringRef(ItemStr, BucketItem->getKeyLength())) {
// We found a match!
return BucketNo;
@@ -163,7 +162,7 @@ int StringMapImpl::FindKey(StringRef Key) const {
}
// Okay, we didn't find the item. Probe to the next bucket.
- BucketNo = (BucketNo+ProbeAmt) & (HTSize-1);
+ BucketNo = (BucketNo + ProbeAmt) & (HTSize - 1);
// Use quadratic probing, it has fewer clumping artifacts than linear
// probing and has good cache behavior in the common case.
@@ -174,7 +173,7 @@ int StringMapImpl::FindKey(StringRef Key) const {
/// RemoveKey - Remove the specified StringMapEntry from the table, but do not
/// delete it. This aborts if the value isn't in the table.
void StringMapImpl::RemoveKey(StringMapEntryBase *V) {
- const char *VStr = (char*)V + ItemSize;
+ const char *VStr = (char *)V + ItemSize;
StringMapEntryBase *V2 = RemoveKey(StringRef(VStr, V->getKeyLength()));
(void)V2;
assert(V == V2 && "Didn't find key?");
@@ -184,7 +183,8 @@ void StringMapImpl::RemoveKey(StringMapEntryBase *V) {
/// table, returning it. If the key is not in the table, this returns null.
StringMapEntryBase *StringMapImpl::RemoveKey(StringRef Key) {
int Bucket = FindKey(Key);
- if (Bucket == -1) return nullptr;
+ if (Bucket == -1)
+ return nullptr;
StringMapEntryBase *Result = TheTable[Bucket];
TheTable[Bucket] = getTombstoneVal();
@@ -205,7 +205,7 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
// the buckets are empty (meaning that many are filled with tombstones),
// grow/rehash the table.
if (LLVM_UNLIKELY(NumItems * 4 > NumBuckets * 3)) {
- NewSize = NumBuckets*2;
+ NewSize = NumBuckets * 2;
} else if (LLVM_UNLIKELY(NumBuckets - (NumItems + NumTombstones) <=
NumBuckets / 8)) {
NewSize = NumBuckets;
@@ -216,11 +216,11 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
unsigned NewBucketNo = BucketNo;
// Allocate one extra bucket which will always be non-empty. This allows the
// iterators to stop at end.
- auto NewTableArray = static_cast<StringMapEntryBase **>(
- safe_calloc(NewSize+1, sizeof(StringMapEntryBase *) + sizeof(unsigned)));
+ auto NewTableArray = static_cast<StringMapEntryBase **>(safe_calloc(
+ NewSize + 1, sizeof(StringMapEntryBase *) + sizeof(unsigned)));
unsigned *NewHashArray = (unsigned *)(NewTableArray + NewSize + 1);
- NewTableArray[NewSize] = (StringMapEntryBase*)2;
+ NewTableArray[NewSize] = (StringMapEntryBase *)2;
// Rehash all the items into their new buckets. Luckily :) we already have
// the hash values available, so we don't have to rehash any strings.
@@ -229,10 +229,10 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
if (Bucket && Bucket != getTombstoneVal()) {
// Fast case, bucket available.
unsigned FullHash = HashTable[I];
- unsigned NewBucket = FullHash & (NewSize-1);
+ unsigned NewBucket = FullHash & (NewSize - 1);
if (!NewTableArray[NewBucket]) {
- NewTableArray[FullHash & (NewSize-1)] = Bucket;
- NewHashArray[FullHash & (NewSize-1)] = FullHash;
+ NewTableArray[FullHash & (NewSize - 1)] = Bucket;
+ NewHashArray[FullHash & (NewSize - 1)] = FullHash;
if (I == BucketNo)
NewBucketNo = NewBucket;
continue;
@@ -241,7 +241,7 @@ unsigned StringMapImpl::RehashTable(unsigned BucketNo) {
// Otherwise probe for a spot.
unsigned ProbeSize = 1;
do {
- NewBucket = (NewBucket + ProbeSize++) & (NewSize-1);
+ NewBucket = (NewBucket + ProbeSize++) & (NewSize - 1);
} while (NewTableArray[NewBucket]);
// Finally found a slot. Fill it in.
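
Aside on the change above: the lookup and rehash loops rely on the bucket count being a power of two, so masking with (Size - 1) replaces a modulo, and bumping the probe amount by one per collision gives triangular-number ("quadratic") probing, which visits every bucket of a power-of-two table before repeating. A standalone sketch of just that probing scheme, independent of the StringMap classes:

#include <cassert>
#include <cstddef>
#include <vector>

// Find a free slot with the same probe sequence as LookupBucketFor and
// RehashTable: start at Hash & (Size - 1), then step by 1, 2, 3, ... on
// successive collisions.
static size_t findFreeBucket(const std::vector<const void *> &Table,
                             unsigned FullHash) {
  size_t Size = Table.size();
  assert(Size != 0 && (Size & (Size - 1)) == 0 && "size must be a power of 2");
  size_t Bucket = FullHash & (Size - 1);
  size_t ProbeAmt = 1;
  while (Table[Bucket] != nullptr) // non-null means the slot is taken
    Bucket = (Bucket + ProbeAmt++) & (Size - 1);
  return Bucket;
}
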
diff --git a/llvm/lib/Support/StringPool.cpp b/llvm/lib/Support/StringPool.cpp
deleted file mode 100644
index 82351017b8cc..000000000000
--- a/llvm/lib/Support/StringPool.cpp
+++ /dev/null
@@ -1,34 +0,0 @@
-//===-- StringPool.cpp - Interned string pool -----------------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file implements the StringPool class.
-//
-//===----------------------------------------------------------------------===//
-
-#include "llvm/Support/StringPool.h"
-#include "llvm/ADT/StringRef.h"
-
-using namespace llvm;
-
-StringPool::StringPool() {}
-
-StringPool::~StringPool() {
- assert(InternTable.empty() && "PooledStringPtr leaked!");
-}
-
-PooledStringPtr StringPool::intern(StringRef Key) {
- table_t::iterator I = InternTable.find(Key);
- if (I != InternTable.end())
- return PooledStringPtr(&*I);
-
- entry_t *S = entry_t::Create(Key);
- S->getValue().Pool = this;
- InternTable.insert(S);
-
- return PooledStringPtr(S);
-}
diff --git a/llvm/lib/Support/StringRef.cpp b/llvm/lib/Support/StringRef.cpp
index 104482de4ad7..ab67ef9ce85c 100644
--- a/llvm/lib/Support/StringRef.cpp
+++ b/llvm/lib/Support/StringRef.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
// MSVC emits references to this into the translation units which reference it.
#ifndef _MSC_VER
-const size_t StringRef::npos;
+constexpr size_t StringRef::npos;
#endif
// strncasecmp() is not available on non-POSIX systems, so define an
@@ -106,19 +106,13 @@ unsigned StringRef::edit_distance(llvm::StringRef Other,
//===----------------------------------------------------------------------===//
std::string StringRef::lower() const {
- std::string Result(size(), char());
- for (size_type i = 0, e = size(); i != e; ++i) {
- Result[i] = toLower(Data[i]);
- }
- return Result;
+ return std::string(map_iterator(begin(), toLower),
+ map_iterator(end(), toLower));
}
std::string StringRef::upper() const {
- std::string Result(size(), char());
- for (size_type i = 0, e = size(); i != e; ++i) {
- Result[i] = toUpper(Data[i]);
- }
- return Result;
+ return std::string(map_iterator(begin(), toUpper),
+ map_iterator(end(), toUpper));
}
//===----------------------------------------------------------------------===//
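
The new lower()/upper() bodies build the result in a single pass from a mapping iterator instead of indexing into a pre-sized buffer. A portable sketch of the same idea, with std::transform standing in for llvm::map_iterator:

#include <algorithm>
#include <cctype>
#include <iterator>
#include <string>

// One-pass lower-casing, equivalent in spirit to the map_iterator version
// above.
static std::string toLowerCopy(const std::string &S) {
  std::string Result;
  Result.reserve(S.size());
  std::transform(S.begin(), S.end(), std::back_inserter(Result),
                 [](unsigned char C) { return static_cast<char>(std::tolower(C)); });
  return Result;
}
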
diff --git a/llvm/lib/Support/SuffixTree.cpp b/llvm/lib/Support/SuffixTree.cpp
new file mode 100644
index 000000000000..0d419f12cd1d
--- /dev/null
+++ b/llvm/lib/Support/SuffixTree.cpp
@@ -0,0 +1,210 @@
+//===- llvm/Support/SuffixTree.cpp - Implement Suffix Tree ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the Suffix Tree class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/SuffixTree.h"
+#include "llvm/Support/Allocator.h"
+#include <vector>
+
+using namespace llvm;
+
+SuffixTree::SuffixTree(const std::vector<unsigned> &Str) : Str(Str) {
+ Root = insertInternalNode(nullptr, EmptyIdx, EmptyIdx, 0);
+ Active.Node = Root;
+
+ // Keep track of the number of suffixes we have to add of the current
+ // prefix.
+ unsigned SuffixesToAdd = 0;
+
+ // Construct the suffix tree iteratively on each prefix of the string.
+ // PfxEndIdx is the end index of the current prefix.
+ // End is one past the last element in the string.
+ for (unsigned PfxEndIdx = 0, End = Str.size(); PfxEndIdx < End; PfxEndIdx++) {
+ SuffixesToAdd++;
+ LeafEndIdx = PfxEndIdx; // Extend each of the leaves.
+ SuffixesToAdd = extend(PfxEndIdx, SuffixesToAdd);
+ }
+
+ // Set the suffix indices of each leaf.
+ assert(Root && "Root node can't be nullptr!");
+ setSuffixIndices();
+}
+
+SuffixTreeNode *SuffixTree::insertLeaf(SuffixTreeNode &Parent,
+ unsigned StartIdx, unsigned Edge) {
+
+ assert(StartIdx <= LeafEndIdx && "String can't start after it ends!");
+
+ SuffixTreeNode *N = new (NodeAllocator.Allocate())
+ SuffixTreeNode(StartIdx, &LeafEndIdx, nullptr);
+ Parent.Children[Edge] = N;
+
+ return N;
+}
+
+SuffixTreeNode *SuffixTree::insertInternalNode(SuffixTreeNode *Parent,
+ unsigned StartIdx,
+ unsigned EndIdx, unsigned Edge) {
+
+ assert(StartIdx <= EndIdx && "String can't start after it ends!");
+ assert(!(!Parent && StartIdx != EmptyIdx) &&
+ "Non-root internal nodes must have parents!");
+
+ unsigned *E = new (InternalEndIdxAllocator) unsigned(EndIdx);
+ SuffixTreeNode *N =
+ new (NodeAllocator.Allocate()) SuffixTreeNode(StartIdx, E, Root);
+ if (Parent)
+ Parent->Children[Edge] = N;
+
+ return N;
+}
+
+void SuffixTree::setSuffixIndices() {
+ // List of nodes we need to visit along with the current length of the
+ // string.
+ std::vector<std::pair<SuffixTreeNode *, unsigned>> ToVisit;
+
+ // Current node being visited.
+ SuffixTreeNode *CurrNode = Root;
+
+ // Sum of the lengths of the nodes down the path to the current one.
+ unsigned CurrNodeLen = 0;
+ ToVisit.push_back({CurrNode, CurrNodeLen});
+ while (!ToVisit.empty()) {
+ std::tie(CurrNode, CurrNodeLen) = ToVisit.back();
+ ToVisit.pop_back();
+ CurrNode->ConcatLen = CurrNodeLen;
+ for (auto &ChildPair : CurrNode->Children) {
+ assert(ChildPair.second && "Node had a null child!");
+ ToVisit.push_back(
+ {ChildPair.second, CurrNodeLen + ChildPair.second->size()});
+ }
+
+ // No children, so we are at the end of the string.
+ if (CurrNode->Children.size() == 0 && !CurrNode->isRoot())
+ CurrNode->SuffixIdx = Str.size() - CurrNodeLen;
+ }
+}
+
+unsigned SuffixTree::extend(unsigned EndIdx, unsigned SuffixesToAdd) {
+ SuffixTreeNode *NeedsLink = nullptr;
+
+ while (SuffixesToAdd > 0) {
+
+ // Are we waiting to add anything other than just the last character?
+ if (Active.Len == 0) {
+ // If not, then say the active index is the end index.
+ Active.Idx = EndIdx;
+ }
+
+ assert(Active.Idx <= EndIdx && "Start index can't be after end index!");
+
+ // The first character in the current substring we're looking at.
+ unsigned FirstChar = Str[Active.Idx];
+
+ // Have we inserted anything starting with FirstChar at the current node?
+ if (Active.Node->Children.count(FirstChar) == 0) {
+ // If not, then we can just insert a leaf and move to the next step.
+ insertLeaf(*Active.Node, EndIdx, FirstChar);
+
+ // The active node is an internal node, and we visited it, so it must
+ // need a link if it doesn't have one.
+ if (NeedsLink) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+ } else {
+ // There's a match with FirstChar, so look for the point in the tree to
+ // insert a new node.
+ SuffixTreeNode *NextNode = Active.Node->Children[FirstChar];
+
+ unsigned SubstringLen = NextNode->size();
+
+ // Is the current suffix we're trying to insert longer than the size of
+ // the child we want to move to?
+ if (Active.Len >= SubstringLen) {
+ // If yes, then consume the characters we've seen and move to the next
+ // node.
+ Active.Idx += SubstringLen;
+ Active.Len -= SubstringLen;
+ Active.Node = NextNode;
+ continue;
+ }
+
+ // Otherwise, the suffix we're trying to insert must be contained in the
+ // next node we want to move to.
+ unsigned LastChar = Str[EndIdx];
+
+ // Is the string we're trying to insert a substring of the next node?
+ if (Str[NextNode->StartIdx + Active.Len] == LastChar) {
+ // If yes, then we're done for this step. Remember our insertion point
+ // and move to the next end index. At this point, we have an implicit
+ // suffix tree.
+ if (NeedsLink && !Active.Node->isRoot()) {
+ NeedsLink->Link = Active.Node;
+ NeedsLink = nullptr;
+ }
+
+ Active.Len++;
+ break;
+ }
+
+ // The string we're trying to insert isn't a substring of the next node,
+ // but matches up to a point. Split the node.
+ //
+ // For example, say we ended our search at a node n and we're trying to
+ // insert ABD. Then we'll create a new node s for AB, reduce n to just
+ // representing C, and insert a new leaf node l to represent D. This
+ // allows us to ensure that if n was a leaf, it remains a leaf.
+ //
+ //   | ABC  ---split--->  | AB
+ //   n                    s
+ //                      C / \ D
+ //                       n   l
+
+ // The node s from the diagram
+ SuffixTreeNode *SplitNode =
+ insertInternalNode(Active.Node, NextNode->StartIdx,
+ NextNode->StartIdx + Active.Len - 1, FirstChar);
+
+ // Insert the new node representing the new substring into the tree as
+ // a child of the split node. This is the node l from the diagram.
+ insertLeaf(*SplitNode, EndIdx, LastChar);
+
+ // Make the old node a child of the split node and update its start
+ // index. This is the node n from the diagram.
+ NextNode->StartIdx += Active.Len;
+ SplitNode->Children[Str[NextNode->StartIdx]] = NextNode;
+
+ // SplitNode is an internal node, update the suffix link.
+ if (NeedsLink)
+ NeedsLink->Link = SplitNode;
+
+ NeedsLink = SplitNode;
+ }
+
+ // We've added something new to the tree, so there's one less suffix to
+ // add.
+ SuffixesToAdd--;
+
+ if (Active.Node->isRoot()) {
+ if (Active.Len > 0) {
+ Active.Len--;
+ Active.Idx = EndIdx - SuffixesToAdd + 1;
+ }
+ } else {
+ // Start the next phase at the next smallest suffix.
+ Active.Node = Active.Node->Link;
+ }
+ }
+
+ return SuffixesToAdd;
+}
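
The constructor above is Ukkonen's online construction: each iteration extends every existing leaf through the shared LeafEndIdx, and extend() then inserts whichever suffixes of the current prefix are still missing, following suffix links between phases. A minimal driver, assuming only the constructor shown in this file (the accompanying header also exposes iteration over repeated substrings, which this sketch does not use):

#include "llvm/Support/SuffixTree.h"
#include <string>
#include <vector>

// Map characters onto the unsigned alphabet the tree expects and build it;
// the machine outliner does the same with instruction hashes. The tree keeps
// a reference to Mapped, so Mapped has to outlive it.
static void buildTreeOverString(const std::string &S) {
  std::vector<unsigned> Mapped(S.begin(), S.end());
  llvm::SuffixTree Tree(Mapped); // Ukkonen construction runs in here.
  (void)Tree;
}
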
diff --git a/llvm/lib/Support/SystemUtils.cpp b/llvm/lib/Support/SystemUtils.cpp
index 47e0c72ec7c1..f1149e48dce5 100644
--- a/llvm/lib/Support/SystemUtils.cpp
+++ b/llvm/lib/Support/SystemUtils.cpp
@@ -15,15 +15,12 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-bool llvm::CheckBitcodeOutputToConsole(raw_ostream &stream_to_check,
- bool print_warning) {
+bool llvm::CheckBitcodeOutputToConsole(raw_ostream &stream_to_check) {
if (stream_to_check.is_displayed()) {
- if (print_warning) {
- errs() << "WARNING: You're attempting to print out a bitcode file.\n"
- "This is inadvisable as it may cause display problems. If\n"
- "you REALLY want to taste LLVM bitcode first-hand, you\n"
- "can force output with the `-f' option.\n\n";
- }
+ errs() << "WARNING: You're attempting to print out a bitcode file.\n"
+ "This is inadvisable as it may cause display problems. If\n"
+ "you REALLY want to taste LLVM bitcode first-hand, you\n"
+ "can force output with the `-f' option.\n\n";
return true;
}
return false;
diff --git a/llvm/lib/Support/TarWriter.cpp b/llvm/lib/Support/TarWriter.cpp
index 6136e9219767..c7a744f0fc98 100644
--- a/llvm/lib/Support/TarWriter.cpp
+++ b/llvm/lib/Support/TarWriter.cpp
@@ -131,7 +131,17 @@ static bool splitUstar(StringRef Path, StringRef &Prefix, StringRef &Name) {
return true;
}
- size_t Sep = Path.rfind('/', sizeof(UstarHeader::Prefix) + 1);
+ // tar 1.13 and earlier unconditionally look at the tar header interpreted
+ // as an 'oldgnu_header', which has an 'isextended' byte at offset 482 in the
+ // header, corresponding to offset 137 in the prefix. That's the version of
+ // tar in gnuwin, so only use 137 of the 155 bytes in the prefix. This means
+ // we'll need a pax header after 237 bytes of path instead of after 255,
+ // but in return paths up to 237 bytes work with gnuwin, instead of just
+ // 137 bytes of directory + 100 bytes of basename previously.
+ // (tar-1.13 also doesn't support pax headers, but in practice all paths in
+ // llvm's test suite are short enough for that to not matter.)
+ const int MaxPrefix = 137;
+ size_t Sep = Path.rfind('/', MaxPrefix + 1);
if (Sep == StringRef::npos)
return false;
if (Path.size() - Sep - 1 >= sizeof(UstarHeader::Name))
@@ -167,7 +177,8 @@ Expected<std::unique_ptr<TarWriter>> TarWriter::create(StringRef OutputPath,
}
TarWriter::TarWriter(int FD, StringRef BaseDir)
- : OS(FD, /*shouldClose=*/true, /*unbuffered=*/false), BaseDir(BaseDir) {}
+ : OS(FD, /*shouldClose=*/true, /*unbuffered=*/false),
+ BaseDir(std::string(BaseDir)) {}
// Append a given file to an archive.
void TarWriter::append(StringRef Path, StringRef Data) {
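
The prefix/name split is internal to the writer; with the reduced 137-byte prefix, paths up to 237 bytes still fit the ustar header and longer ones fall back to a pax header. A usage sketch with made-up file names:

#include "llvm/Support/Error.h"
#include "llvm/Support/TarWriter.h"
#include <memory>

// Create an archive and add one member; "repro.tar" and "repro" are
// placeholder names, not anything from this patch.
static void archiveOne() {
  llvm::Expected<std::unique_ptr<llvm::TarWriter>> TW =
      llvm::TarWriter::create("repro.tar", "repro");
  if (!TW) {
    llvm::consumeError(TW.takeError());
    return;
  }
  (*TW)->append("dir/file.txt", "contents\n");
}
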
diff --git a/llvm/lib/Support/TargetParser.cpp b/llvm/lib/Support/TargetParser.cpp
index 84ead58b98cd..be9b541237c7 100644
--- a/llvm/lib/Support/TargetParser.cpp
+++ b/llvm/lib/Support/TargetParser.cpp
@@ -62,7 +62,7 @@ constexpr GPUInfo R600GPUs[26] = {
// This table should be sorted by the value of GPUKind
// Don't bother listing the implicitly true features
-constexpr GPUInfo AMDGCNGPUs[37] = {
+constexpr GPUInfo AMDGCNGPUs[38] = {
// Name Canonical Kind Features
// Name
{{"gfx600"}, {"gfx600"}, GK_GFX600, FEATURE_FAST_FMA_F32},
@@ -99,9 +99,10 @@ constexpr GPUInfo AMDGCNGPUs[37] = {
{{"gfx906"}, {"gfx906"}, GK_GFX906, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
{{"gfx908"}, {"gfx908"}, GK_GFX908, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
{{"gfx909"}, {"gfx909"}, GK_GFX909, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
- {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
- {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
- {{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32},
+ {{"gfx1010"}, {"gfx1010"}, GK_GFX1010, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
+ {{"gfx1011"}, {"gfx1011"}, GK_GFX1011, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
+ {{"gfx1012"}, {"gfx1012"}, GK_GFX1012, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
+ {{"gfx1030"}, {"gfx1030"}, GK_GFX1030, FEATURE_FAST_FMA_F32|FEATURE_FAST_DENORMAL_F32|FEATURE_WAVE32},
};
const GPUInfo *getArchEntry(AMDGPU::GPUKind AK, ArrayRef<GPUInfo> Table) {
@@ -203,6 +204,7 @@ AMDGPU::IsaVersion AMDGPU::getIsaVersion(StringRef GPU) {
case GK_GFX1010: return {10, 1, 0};
case GK_GFX1011: return {10, 1, 1};
case GK_GFX1012: return {10, 1, 2};
+ case GK_GFX1030: return {10, 3, 0};
default: return {0, 0, 0};
}
}
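
The new row makes gfx1030 parse as a wave32-capable GFX10.3 target with ISA version 10.3.0. A quick query through the public helpers; the Major/Minor/Stepping field names are taken from the existing AMDGPU::IsaVersion struct and should be treated as an assumption about the header, since the switch above only returns brace-initialized values:

#include "llvm/Support/TargetParser.h"
#include "llvm/Support/raw_ostream.h"

// Print the ISA version the new entry reports; expected output: 10.3.0.
static void printGfx1030Isa() {
  llvm::AMDGPU::IsaVersion V = llvm::AMDGPU::getIsaVersion("gfx1030");
  llvm::outs() << V.Major << '.' << V.Minor << '.' << V.Stepping << '\n';
}
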
diff --git a/llvm/lib/Support/ThreadPool.cpp b/llvm/lib/Support/ThreadPool.cpp
index 40982d777914..46a1990cd719 100644
--- a/llvm/lib/Support/ThreadPool.cpp
+++ b/llvm/lib/Support/ThreadPool.cpp
@@ -20,16 +20,14 @@ using namespace llvm;
#if LLVM_ENABLE_THREADS
-// Default to hardware_concurrency
-ThreadPool::ThreadPool() : ThreadPool(hardware_concurrency()) {}
-
-ThreadPool::ThreadPool(unsigned ThreadCount)
- : ActiveThreads(0), EnableFlag(true) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+ : ThreadCount(S.compute_thread_count()) {
// Create ThreadCount threads that will loop forever, wait on QueueCondition
// for tasks to be queued or the Pool to be destroyed.
Threads.reserve(ThreadCount);
for (unsigned ThreadID = 0; ThreadID < ThreadCount; ++ThreadID) {
- Threads.emplace_back([&] {
+ Threads.emplace_back([S, ThreadID, this] {
+ S.apply_thread_strategy(ThreadID);
while (true) {
PackagedTaskTy Task;
{
@@ -45,24 +43,24 @@ ThreadPool::ThreadPool(unsigned ThreadCount)
// We first need to signal that we are active before popping the queue
// in order for wait() to properly detect that even if the queue is
// empty, there is still a task in flight.
- {
- std::unique_lock<std::mutex> LockGuard(CompletionLock);
- ++ActiveThreads;
- }
+ ++ActiveThreads;
Task = std::move(Tasks.front());
Tasks.pop();
}
// Run the task we just grabbed
Task();
+ bool Notify;
{
// Adjust `ActiveThreads`, in case someone waits on ThreadPool::wait()
- std::unique_lock<std::mutex> LockGuard(CompletionLock);
+ std::lock_guard<std::mutex> LockGuard(QueueLock);
--ActiveThreads;
+ Notify = workCompletedUnlocked();
}
-
- // Notify task completion, in case someone waits on ThreadPool::wait()
- CompletionCondition.notify_all();
+ // Notify task completion if this is the last active thread, in case
+ // someone waits on ThreadPool::wait().
+ if (Notify)
+ CompletionCondition.notify_all();
}
});
}
@@ -70,12 +68,8 @@ ThreadPool::ThreadPool(unsigned ThreadCount)
void ThreadPool::wait() {
// Wait for all threads to complete and the queue to be empty
- std::unique_lock<std::mutex> LockGuard(CompletionLock);
- // The order of the checks for ActiveThreads and Tasks.empty() matters because
- // any active threads might be modifying the Tasks queue, and this would be a
- // race.
- CompletionCondition.wait(LockGuard,
- [&] { return !ActiveThreads && Tasks.empty(); });
+ std::unique_lock<std::mutex> LockGuard(QueueLock);
+ CompletionCondition.wait(LockGuard, [&] { return workCompletedUnlocked(); });
}
std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) {
@@ -108,12 +102,10 @@ ThreadPool::~ThreadPool() {
#else // LLVM_ENABLE_THREADS Disabled
-ThreadPool::ThreadPool() : ThreadPool(0) {}
-
// No threads are launched, issue a warning if ThreadCount is not 0
-ThreadPool::ThreadPool(unsigned ThreadCount)
- : ActiveThreads(0) {
- if (ThreadCount) {
+ThreadPool::ThreadPool(ThreadPoolStrategy S)
+ : ThreadCount(S.compute_thread_count()) {
+ if (ThreadCount != 1) {
errs() << "Warning: request a ThreadPool with " << ThreadCount
<< " threads, but LLVM_ENABLE_THREADS has been turned off\n";
}
@@ -138,8 +130,6 @@ std::shared_future<void> ThreadPool::asyncImpl(TaskTy Task) {
return Future;
}
-ThreadPool::~ThreadPool() {
- wait();
-}
+ThreadPool::~ThreadPool() { wait(); }
#endif
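
The pool is now sized by a ThreadPoolStrategy instead of a raw thread count, and completion is tracked under the single QueueLock, so wait() only wakes when the queue is empty and no task is in flight. A small usage sketch; hardware_concurrency() returning a strategy comes from the Threading.cpp changes later in this patch:

#include "llvm/Support/ThreadPool.h"
#include "llvm/Support/Threading.h"
#include <atomic>

// Queue a few tasks on a pool sized by the default strategy, then block
// until every task has finished.
static void runTasks() {
  llvm::ThreadPool Pool(llvm::hardware_concurrency());
  std::atomic<int> Sum{0};
  for (int I = 0; I < 8; ++I)
    Pool.async([&Sum, I] { Sum += I; });
  Pool.wait(); // returns once the queue is drained and no thread is active
}
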
diff --git a/llvm/lib/Support/Threading.cpp b/llvm/lib/Support/Threading.cpp
index 48750cef5ec2..61f8ee5be5b3 100644
--- a/llvm/lib/Support/Threading.cpp
+++ b/llvm/lib/Support/Threading.cpp
@@ -45,10 +45,6 @@ void llvm::llvm_execute_on_thread(void (*Fn)(void *), void *UserData,
Fn(UserData);
}
-unsigned llvm::heavyweight_hardware_concurrency() { return 1; }
-
-unsigned llvm::hardware_concurrency() { return 1; }
-
uint64_t llvm::get_threadid() { return 0; }
uint32_t llvm::get_max_thread_name_length() { return 0; }
@@ -57,6 +53,13 @@ void llvm::set_thread_name(const Twine &Name) {}
void llvm::get_thread_name(SmallVectorImpl<char> &Name) { Name.clear(); }
+llvm::BitVector llvm::get_thread_affinity_mask() { return {}; }
+
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+ // When threads are disabled, ensure clients will loop at least once.
+ return 1;
+}
+
#if LLVM_ENABLE_THREADS == 0
void llvm::llvm_execute_on_thread_async(
llvm::unique_function<void()> Func,
@@ -78,30 +81,18 @@ void llvm::llvm_execute_on_thread_async(
#else
-#include <thread>
-unsigned llvm::heavyweight_hardware_concurrency() {
- // Since we can't get here unless LLVM_ENABLE_THREADS == 1, it is safe to use
- // `std::thread` directly instead of `llvm::thread` (and indeed, doing so
- // allows us to not define `thread` in the llvm namespace, which conflicts
- // with some platforms such as FreeBSD whose headers also define a struct
- // called `thread` in the global namespace which can cause ambiguity due to
- // ADL.
- int NumPhysical = sys::getHostNumPhysicalCores();
- if (NumPhysical == -1)
- return std::thread::hardware_concurrency();
- return NumPhysical;
-}
-
-unsigned llvm::hardware_concurrency() {
-#if defined(HAVE_SCHED_GETAFFINITY) && defined(HAVE_CPU_COUNT)
- cpu_set_t Set;
- if (sched_getaffinity(0, sizeof(Set), &Set))
- return CPU_COUNT(&Set);
-#endif
- // Guard against std::thread::hardware_concurrency() returning 0.
- if (unsigned Val = std::thread::hardware_concurrency())
- return Val;
- return 1;
+int computeHostNumHardwareThreads();
+
+unsigned llvm::ThreadPoolStrategy::compute_thread_count() const {
+ int MaxThreadCount = UseHyperThreads ? computeHostNumHardwareThreads()
+ : sys::getHostNumPhysicalCores();
+ if (MaxThreadCount <= 0)
+ MaxThreadCount = 1;
+ if (ThreadsRequested == 0)
+ return MaxThreadCount;
+ if (!Limit)
+ return ThreadsRequested;
+ return std::min((unsigned)MaxThreadCount, ThreadsRequested);
}
namespace {
@@ -140,3 +131,23 @@ void llvm::llvm_execute_on_thread_async(
}
#endif
+
+Optional<ThreadPoolStrategy>
+llvm::get_threadpool_strategy(StringRef Num, ThreadPoolStrategy Default) {
+ if (Num == "all")
+ return llvm::hardware_concurrency();
+ if (Num.empty())
+ return Default;
+ unsigned V;
+ if (Num.getAsInteger(10, V))
+ return None; // malformed 'Num' value
+ if (V == 0)
+ return Default;
+
+ // Do not take the Default into account. This effectively disables
+ // heavyweight_hardware_concurrency() if the user asks for any number of
+ // threads on the cmd-line.
+ ThreadPoolStrategy S = llvm::hardware_concurrency();
+ S.ThreadsRequested = V;
+ return S;
+}
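
get_threadpool_strategy maps a command-line string onto a strategy: "all" selects every logical CPU, an empty string or "0" keeps the caller's default, any other integer pins ThreadsRequested, and a malformed value returns None. A sketch of resolving a -threads=N style option:

#include "llvm/ADT/Optional.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Threading.h"

// Turn a user-supplied string into a concrete thread count, treating a
// malformed value as "run single-threaded".
static unsigned resolveThreadCount(llvm::StringRef Arg) {
  llvm::Optional<llvm::ThreadPoolStrategy> S =
      llvm::get_threadpool_strategy(Arg, llvm::hardware_concurrency());
  if (!S)
    return 1; // e.g. -threads=abc
  return S->compute_thread_count();
}
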
diff --git a/llvm/lib/Support/TimeProfiler.cpp b/llvm/lib/Support/TimeProfiler.cpp
index a7c85509064e..93bf6f57e348 100644
--- a/llvm/lib/Support/TimeProfiler.cpp
+++ b/llvm/lib/Support/TimeProfiler.cpp
@@ -11,20 +11,33 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/TimeProfiler.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/StringMap.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/JSON.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/Process.h"
+#include "llvm/Support/Threading.h"
+#include <algorithm>
#include <cassert>
#include <chrono>
+#include <mutex>
#include <string>
#include <vector>
using namespace std::chrono;
+using namespace llvm;
-namespace llvm {
+static std::mutex Mu;
+// List of all instances
+static std::vector<TimeTraceProfiler *>
+ ThreadTimeTraceProfilerInstances; // GUARDED_BY(Mu)
+// Per Thread instance
+static LLVM_THREAD_LOCAL TimeTraceProfiler *TimeTraceProfilerInstance = nullptr;
-TimeTraceProfiler *TimeTraceProfilerInstance = nullptr;
+TimeTraceProfiler *llvm::getTimeTraceProfilerInstance() {
+ return TimeTraceProfilerInstance;
+}
typedef duration<steady_clock::rep, steady_clock::period> DurationType;
typedef time_point<steady_clock> TimePointType;
@@ -32,6 +45,7 @@ typedef std::pair<size_t, DurationType> CountAndDurationType;
typedef std::pair<std::string, CountAndDurationType>
NameAndCountAndDurationType;
+namespace {
struct Entry {
const TimePointType Start;
TimePointType End;
@@ -57,11 +71,15 @@ struct Entry {
.count();
}
};
+} // namespace
-struct TimeTraceProfiler {
+struct llvm::TimeTraceProfiler {
TimeTraceProfiler(unsigned TimeTraceGranularity = 0, StringRef ProcName = "")
- : StartTime(steady_clock::now()), ProcName(ProcName),
- TimeTraceGranularity(TimeTraceGranularity) {}
+ : BeginningOfTime(system_clock::now()), StartTime(steady_clock::now()),
+ ProcName(ProcName), Pid(sys::Process::getProcessId()),
+ Tid(llvm::get_threadid()), TimeTraceGranularity(TimeTraceGranularity) {
+ llvm::get_thread_name(ThreadName);
+ }
void begin(std::string Name, llvm::function_ref<std::string()> Detail) {
Stack.emplace_back(steady_clock::now(), TimePointType(), std::move(Name),
@@ -70,7 +88,7 @@ struct TimeTraceProfiler {
void end() {
assert(!Stack.empty() && "Must call begin() first");
- auto &E = Stack.back();
+ Entry &E = Stack.back();
E.End = steady_clock::now();
// Check that end times monotonically increase.
@@ -103,22 +121,30 @@ struct TimeTraceProfiler {
Stack.pop_back();
}
- void Write(raw_pwrite_stream &OS) {
+ // Write events from this TimeTraceProfilerInstance and
+ // ThreadTimeTraceProfilerInstances.
+ void write(raw_pwrite_stream &OS) {
+ // Acquire the lock while reading ThreadTimeTraceProfilerInstances.
+ std::lock_guard<std::mutex> Lock(Mu);
assert(Stack.empty() &&
- "All profiler sections should be ended when calling Write");
+ "All profiler sections should be ended when calling write");
+ assert(llvm::all_of(ThreadTimeTraceProfilerInstances,
+ [](const auto &TTP) { return TTP->Stack.empty(); }) &&
+ "All profiler sections should be ended when calling write");
+
json::OStream J(OS);
J.objectBegin();
J.attributeBegin("traceEvents");
J.arrayBegin();
// Emit all events for the main flame graph.
- for (const auto &E : Entries) {
+ auto writeEvent = [&](const auto &E, uint64_t Tid) {
auto StartUs = E.getFlameGraphStartUs(StartTime);
auto DurUs = E.getFlameGraphDurUs();
- J.object([&]{
- J.attribute("pid", 1);
- J.attribute("tid", 0);
+ J.object([&] {
+ J.attribute("pid", Pid);
+ J.attribute("tid", int64_t(Tid));
J.attribute("ph", "X");
J.attribute("ts", StartUs);
J.attribute("dur", DurUs);
@@ -127,100 +153,178 @@ struct TimeTraceProfiler {
J.attributeObject("args", [&] { J.attribute("detail", E.Detail); });
}
});
- }
+ };
+ for (const Entry &E : Entries)
+ writeEvent(E, this->Tid);
+ for (const TimeTraceProfiler *TTP : ThreadTimeTraceProfilerInstances)
+ for (const Entry &E : TTP->Entries)
+ writeEvent(E, TTP->Tid);
// Emit totals by section name as additional "thread" events, sorted from
// longest one.
- int Tid = 1;
+ // Find highest used thread id.
+ uint64_t MaxTid = this->Tid;
+ for (const TimeTraceProfiler *TTP : ThreadTimeTraceProfilerInstances)
+ MaxTid = std::max(MaxTid, TTP->Tid);
+
+ // Combine all CountAndTotalPerName from threads into one.
+ StringMap<CountAndDurationType> AllCountAndTotalPerName;
+ auto combineStat = [&](const auto &Stat) {
+ StringRef Key = Stat.getKey();
+ auto Value = Stat.getValue();
+ auto &CountAndTotal = AllCountAndTotalPerName[Key];
+ CountAndTotal.first += Value.first;
+ CountAndTotal.second += Value.second;
+ };
+ for (const auto &Stat : CountAndTotalPerName)
+ combineStat(Stat);
+ for (const TimeTraceProfiler *TTP : ThreadTimeTraceProfilerInstances)
+ for (const auto &Stat : TTP->CountAndTotalPerName)
+ combineStat(Stat);
+
std::vector<NameAndCountAndDurationType> SortedTotals;
- SortedTotals.reserve(CountAndTotalPerName.size());
- for (const auto &E : CountAndTotalPerName)
- SortedTotals.emplace_back(E.getKey(), E.getValue());
-
- llvm::sort(SortedTotals.begin(), SortedTotals.end(),
- [](const NameAndCountAndDurationType &A,
- const NameAndCountAndDurationType &B) {
- return A.second.second > B.second.second;
- });
- for (const auto &E : SortedTotals) {
- auto DurUs = duration_cast<microseconds>(E.second.second).count();
- auto Count = CountAndTotalPerName[E.first].first;
-
- J.object([&]{
- J.attribute("pid", 1);
- J.attribute("tid", Tid);
+ SortedTotals.reserve(AllCountAndTotalPerName.size());
+ for (const auto &Total : AllCountAndTotalPerName)
+ SortedTotals.emplace_back(std::string(Total.getKey()), Total.getValue());
+
+ llvm::sort(SortedTotals, [](const NameAndCountAndDurationType &A,
+ const NameAndCountAndDurationType &B) {
+ return A.second.second > B.second.second;
+ });
+
+ // Report totals on separate threads of tracing file.
+ uint64_t TotalTid = MaxTid + 1;
+ for (const NameAndCountAndDurationType &Total : SortedTotals) {
+ auto DurUs = duration_cast<microseconds>(Total.second.second).count();
+ auto Count = AllCountAndTotalPerName[Total.first].first;
+
+ J.object([&] {
+ J.attribute("pid", Pid);
+ J.attribute("tid", int64_t(TotalTid));
J.attribute("ph", "X");
J.attribute("ts", 0);
J.attribute("dur", DurUs);
- J.attribute("name", "Total " + E.first);
+ J.attribute("name", "Total " + Total.first);
J.attributeObject("args", [&] {
J.attribute("count", int64_t(Count));
J.attribute("avg ms", int64_t(DurUs / Count / 1000));
});
});
- ++Tid;
+ ++TotalTid;
}
- // Emit metadata event with process name.
- J.object([&] {
- J.attribute("cat", "");
- J.attribute("pid", 1);
- J.attribute("tid", 0);
- J.attribute("ts", 0);
- J.attribute("ph", "M");
- J.attribute("name", "process_name");
- J.attributeObject("args", [&] { J.attribute("name", ProcName); });
- });
+ auto writeMetadataEvent = [&](const char *Name, uint64_t Tid,
+ StringRef arg) {
+ J.object([&] {
+ J.attribute("cat", "");
+ J.attribute("pid", Pid);
+ J.attribute("tid", int64_t(Tid));
+ J.attribute("ts", 0);
+ J.attribute("ph", "M");
+ J.attribute("name", Name);
+ J.attributeObject("args", [&] { J.attribute("name", arg); });
+ });
+ };
+
+ writeMetadataEvent("process_name", Tid, ProcName);
+ writeMetadataEvent("thread_name", Tid, ThreadName);
+ for (const TimeTraceProfiler *TTP : ThreadTimeTraceProfilerInstances)
+ writeMetadataEvent("thread_name", TTP->Tid, TTP->ThreadName);
J.arrayEnd();
J.attributeEnd();
+
+ // Emit the absolute time when this TimeProfiler started.
+ // This can be used to combine the profiling data from
+ // multiple processes and preserve actual time intervals.
+ J.attribute("beginningOfTime",
+ time_point_cast<microseconds>(BeginningOfTime)
+ .time_since_epoch()
+ .count());
+
J.objectEnd();
}
SmallVector<Entry, 16> Stack;
SmallVector<Entry, 128> Entries;
StringMap<CountAndDurationType> CountAndTotalPerName;
+ const time_point<system_clock> BeginningOfTime;
const TimePointType StartTime;
const std::string ProcName;
+ const sys::Process::Pid Pid;
+ SmallString<0> ThreadName;
+ const uint64_t Tid;
// Minimum time granularity (in microseconds)
const unsigned TimeTraceGranularity;
};
-void timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
- StringRef ProcName) {
+void llvm::timeTraceProfilerInitialize(unsigned TimeTraceGranularity,
+ StringRef ProcName) {
assert(TimeTraceProfilerInstance == nullptr &&
"Profiler should not be initialized");
TimeTraceProfilerInstance = new TimeTraceProfiler(
TimeTraceGranularity, llvm::sys::path::filename(ProcName));
}
-void timeTraceProfilerCleanup() {
+// Removes all TimeTraceProfilerInstances.
+// Called from main thread.
+void llvm::timeTraceProfilerCleanup() {
delete TimeTraceProfilerInstance;
+ std::lock_guard<std::mutex> Lock(Mu);
+ for (auto TTP : ThreadTimeTraceProfilerInstances)
+ delete TTP;
+ ThreadTimeTraceProfilerInstances.clear();
+}
+
+// Finish TimeTraceProfilerInstance on a worker thread.
+// This doesn't remove the instance, just moves the pointer to the global vector.
+void llvm::timeTraceProfilerFinishThread() {
+ std::lock_guard<std::mutex> Lock(Mu);
+ ThreadTimeTraceProfilerInstances.push_back(TimeTraceProfilerInstance);
TimeTraceProfilerInstance = nullptr;
}
-void timeTraceProfilerWrite(raw_pwrite_stream &OS) {
+void llvm::timeTraceProfilerWrite(raw_pwrite_stream &OS) {
assert(TimeTraceProfilerInstance != nullptr &&
"Profiler object can't be null");
- TimeTraceProfilerInstance->Write(OS);
+ TimeTraceProfilerInstance->write(OS);
}
-void timeTraceProfilerBegin(StringRef Name, StringRef Detail) {
+Error llvm::timeTraceProfilerWrite(StringRef PreferredFileName,
+ StringRef FallbackFileName) {
+ assert(TimeTraceProfilerInstance != nullptr &&
+ "Profiler object can't be null");
+
+ std::string Path = PreferredFileName.str();
+ if (Path.empty()) {
+ Path = FallbackFileName == "-" ? "out" : FallbackFileName.str();
+ Path += ".time-trace";
+ }
+
+ std::error_code EC;
+ raw_fd_ostream OS(Path, EC, sys::fs::OF_Text);
+ if (EC)
+ return createStringError(EC, "Could not open " + Path);
+
+ timeTraceProfilerWrite(OS);
+ return Error::success();
+}
+
+void llvm::timeTraceProfilerBegin(StringRef Name, StringRef Detail) {
if (TimeTraceProfilerInstance != nullptr)
- TimeTraceProfilerInstance->begin(Name, [&]() { return Detail; });
+ TimeTraceProfilerInstance->begin(std::string(Name),
+ [&]() { return std::string(Detail); });
}
-void timeTraceProfilerBegin(StringRef Name,
- llvm::function_ref<std::string()> Detail) {
+void llvm::timeTraceProfilerBegin(StringRef Name,
+ llvm::function_ref<std::string()> Detail) {
if (TimeTraceProfilerInstance != nullptr)
- TimeTraceProfilerInstance->begin(Name, Detail);
+ TimeTraceProfilerInstance->begin(std::string(Name), Detail);
}
-void timeTraceProfilerEnd() {
+void llvm::timeTraceProfilerEnd() {
if (TimeTraceProfilerInstance != nullptr)
TimeTraceProfilerInstance->end();
}
-
-} // namespace llvm
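
With the thread-local instances, the intended call sequence is: initialize on the main thread, initialize again on each worker thread, call timeTraceProfilerFinishThread() before a worker exits so its entries move onto the global list, then write one combined trace and clean up from the main thread. A sketch using only the entry points shown above ("trace.json" and the section names are placeholders):

#include "llvm/Support/Error.h"
#include "llvm/Support/TimeProfiler.h"
#include <thread>
#include <utility>

static void profileAcrossThreads() {
  llvm::timeTraceProfilerInitialize(/*TimeTraceGranularity=*/500, "mytool");

  std::thread Worker([] {
    llvm::timeTraceProfilerInitialize(500, "mytool"); // per-thread instance
    llvm::timeTraceProfilerBegin("worker-section", "detail");
    // ... work ...
    llvm::timeTraceProfilerEnd();
    llvm::timeTraceProfilerFinishThread(); // park entries on the global list
  });
  Worker.join();

  llvm::timeTraceProfilerBegin("main-section", "detail");
  // ... work ...
  llvm::timeTraceProfilerEnd();

  // Emits events from the main instance plus every finished worker instance.
  if (llvm::Error E = llvm::timeTraceProfilerWrite("trace.json", "-"))
    llvm::consumeError(std::move(E));
  llvm::timeTraceProfilerCleanup();
}
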
diff --git a/llvm/lib/Support/Timer.cpp b/llvm/lib/Support/Timer.cpp
index 613d2eaae6d3..c97538cb560a 100644
--- a/llvm/lib/Support/Timer.cpp
+++ b/llvm/lib/Support/Timer.cpp
@@ -247,7 +247,8 @@ TimerGroup::TimerGroup(StringRef Name, StringRef Description,
: TimerGroup(Name, Description) {
TimersToPrint.reserve(Records.size());
for (const auto &P : Records)
- TimersToPrint.emplace_back(P.getValue(), P.getKey(), P.getKey());
+ TimersToPrint.emplace_back(P.getValue(), std::string(P.getKey()),
+ std::string(P.getKey()));
assert(TimersToPrint.size() == Records.size() && "Size mismatch");
}
@@ -441,3 +442,7 @@ const char *TimerGroup::printAllJSONValues(raw_ostream &OS, const char *delim) {
void TimerGroup::ConstructTimerLists() {
(void)*NamedGroupedTimers;
}
+
+std::unique_ptr<TimerGroup> TimerGroup::aquireDefaultGroup() {
+ return std::unique_ptr<TimerGroup>(DefaultTimerGroup.claim());
+}
diff --git a/llvm/lib/Support/ToolOutputFile.cpp b/llvm/lib/Support/ToolOutputFile.cpp
index ed3a247f0115..c2ca97a59c62 100644
--- a/llvm/lib/Support/ToolOutputFile.cpp
+++ b/llvm/lib/Support/ToolOutputFile.cpp
@@ -15,31 +15,45 @@
#include "llvm/Support/Signals.h"
using namespace llvm;
+static bool isStdout(StringRef Filename) { return Filename == "-"; }
+
ToolOutputFile::CleanupInstaller::CleanupInstaller(StringRef Filename)
- : Filename(Filename), Keep(false) {
+ : Filename(std::string(Filename)), Keep(false) {
// Arrange for the file to be deleted if the process is killed.
- if (Filename != "-")
+ if (!isStdout(Filename))
sys::RemoveFileOnSignal(Filename);
}
ToolOutputFile::CleanupInstaller::~CleanupInstaller() {
+ if (isStdout(Filename))
+ return;
+
// Delete the file if the client hasn't told us not to.
- if (!Keep && Filename != "-")
+ if (!Keep)
sys::fs::remove(Filename);
// Ok, the file is successfully written and closed, or deleted. There's no
// further need to clean it up on signals.
- if (Filename != "-")
- sys::DontRemoveFileOnSignal(Filename);
+ sys::DontRemoveFileOnSignal(Filename);
}
ToolOutputFile::ToolOutputFile(StringRef Filename, std::error_code &EC,
sys::fs::OpenFlags Flags)
- : Installer(Filename), OS(Filename, EC, Flags) {
+ : Installer(Filename) {
+ if (isStdout(Filename)) {
+ OS = &outs();
+ EC = std::error_code();
+ return;
+ }
+ OSHolder.emplace(Filename, EC, Flags);
+ OS = OSHolder.getPointer();
// If open fails, no cleanup is needed.
if (EC)
Installer.Keep = true;
}
ToolOutputFile::ToolOutputFile(StringRef Filename, int FD)
- : Installer(Filename), OS(FD, true) {}
+ : Installer(Filename) {
+ OSHolder.emplace(FD, true);
+ OS = OSHolder.getPointer();
+}
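
ToolOutputFile now special-cases "-": os() aliases outs(), nothing is registered for signal cleanup, and the destructor leaves stdout alone. For a real path the behavior is unchanged, so callers still have to keep() the file on success. Usage sketch:

#include "llvm/ADT/StringRef.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/ToolOutputFile.h"

// Write to either a real file or stdout ("-") through the same interface.
static bool emitTo(llvm::StringRef Path) {
  std::error_code EC;
  llvm::ToolOutputFile Out(Path, EC, llvm::sys::fs::OF_Text);
  if (EC)
    return false;
  Out.os() << "hello\n";
  Out.keep(); // without this, a real output file is deleted on destruction
  return true;
}
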
diff --git a/llvm/lib/Support/TrigramIndex.cpp b/llvm/lib/Support/TrigramIndex.cpp
index 94810b56db8e..88375e6e7863 100644
--- a/llvm/lib/Support/TrigramIndex.cpp
+++ b/llvm/lib/Support/TrigramIndex.cpp
@@ -16,6 +16,7 @@
#include "llvm/Support/TrigramIndex.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
#include <set>
#include <string>
diff --git a/llvm/lib/Support/Triple.cpp b/llvm/lib/Support/Triple.cpp
index 2c480c1094a5..fec1985ccaca 100644
--- a/llvm/lib/Support/Triple.cpp
+++ b/llvm/lib/Support/Triple.cpp
@@ -9,10 +9,14 @@
#include "llvm/ADT/Triple.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Host.h"
+#include "llvm/Support/SwapByteOrder.h"
#include "llvm/Support/TargetParser.h"
+#include "llvm/Support/VersionTuple.h"
+#include <cassert>
#include <cstring>
using namespace llvm;
@@ -625,6 +629,8 @@ static Triple::SubArchType parseSubArch(StringRef SubArchName) {
return Triple::ARMSubArch_v8_4a;
case ARM::ArchKind::ARMV8_5A:
return Triple::ARMSubArch_v8_5a;
+ case ARM::ArchKind::ARMV8_6A:
+ return Triple::ARMSubArch_v8_6a;
case ARM::ArchKind::ARMV8R:
return Triple::ARMSubArch_v8r;
case ARM::ArchKind::ARMV8MBaseline:
@@ -710,9 +716,7 @@ static Triple::ObjectFormatType getDefaultFormat(const Triple &T) {
case Triple::ppc64:
case Triple::ppc:
- if (T.isOSDarwin())
- return Triple::MachO;
- else if (T.isOSAIX())
+ if (T.isOSAIX())
return Triple::XCOFF;
return Triple::ELF;
@@ -983,12 +987,7 @@ std::string Triple::normalize(StringRef Str) {
}
// Stick the corrected components back together to form the normalized string.
- std::string Normalized;
- for (unsigned i = 0, e = Components.size(); i != e; ++i) {
- if (i) Normalized += '-';
- Normalized += Components[i];
- }
- return Normalized;
+ return join(Components, "-");
}
StringRef Triple::getArchName() const {
@@ -1088,17 +1087,23 @@ bool Triple::getMacOSXVersion(unsigned &Major, unsigned &Minor,
// Darwin version numbers are skewed from OS X versions.
if (Major < 4)
return false;
- Micro = 0;
- Minor = Major - 4;
- Major = 10;
+ if (Major <= 19) {
+ Micro = 0;
+ Minor = Major - 4;
+ Major = 10;
+ } else {
+ Micro = 0;
+ Minor = 0;
+ // darwin20+ corresponds to macOS 11+.
+ Major = 11 + Major - 20;
+ }
break;
case MacOSX:
// Default to 10.4.
if (Major == 0) {
Major = 10;
Minor = 4;
- }
- if (Major != 10)
+ } else if (Major < 10)
return false;
break;
case IOS:
@@ -1602,6 +1607,52 @@ std::string Triple::merge(const Triple &Other) const {
return Other.str();
}
+bool Triple::isMacOSXVersionLT(unsigned Major, unsigned Minor,
+ unsigned Micro) const {
+ assert(isMacOSX() && "Not an OS X triple!");
+
+ // If this is OS X, expect a sane version number.
+ if (getOS() == Triple::MacOSX)
+ return isOSVersionLT(Major, Minor, Micro);
+
+ // Otherwise, compare to the "Darwin" number.
+ if (Major == 10) {
+ return isOSVersionLT(Minor + 4, Micro, 0);
+ } else {
+ assert(Major >= 11 && "Unexpected major version");
+ return isOSVersionLT(Major - 11 + 20, Minor, Micro);
+ }
+}
+
+VersionTuple Triple::getMinimumSupportedOSVersion() const {
+ if (getVendor() != Triple::Apple || getArch() != Triple::aarch64)
+ return VersionTuple();
+ switch (getOS()) {
+ case Triple::MacOSX:
+ // ARM64 slice is supported starting from macOS 11.0+.
+ return VersionTuple(11, 0, 0);
+ case Triple::IOS:
+ // ARM64 slice is supported starting from Mac Catalyst 14 (macOS 11).
+ // ARM64 simulators are supported for iOS 14+.
+ if (isMacCatalystEnvironment() || isSimulatorEnvironment())
+ return VersionTuple(14, 0, 0);
+ break;
+ case Triple::TvOS:
+ // ARM64 simulators are supported for tvOS 14+.
+ if (isSimulatorEnvironment())
+ return VersionTuple(14, 0, 0);
+ break;
+ case Triple::WatchOS:
+ // ARM64 simulators are supported for watchOS 7+.
+ if (isSimulatorEnvironment())
+ return VersionTuple(7, 0, 0);
+ break;
+ default:
+ break;
+ }
+ return VersionTuple();
+}
+
StringRef Triple::getARMCPUForArch(StringRef MArch) const {
if (MArch.empty())
MArch = getArchName();
@@ -1664,3 +1715,16 @@ StringRef Triple::getARMCPUForArch(StringRef MArch) const {
llvm_unreachable("invalid arch name");
}
+
+VersionTuple Triple::getCanonicalVersionForOS(OSType OSKind,
+ const VersionTuple &Version) {
+ switch (OSKind) {
+ case MacOSX:
+ // macOS 10.16 is canonicalized to macOS 11.
+ if (Version == VersionTuple(10, 16))
+ return VersionTuple(11, 0);
+ LLVM_FALLTHROUGH;
+ default:
+ return Version;
+ }
+}
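
getMacOSXVersion now maps darwin20 and later onto macOS 11 and later, isMacOSXVersionLT understands both numbering schemes, and getCanonicalVersionForOS folds the 10.16 compatibility alias into 11.0. A sketch of the new mappings; the expected results follow from the code above:

#include "llvm/ADT/Triple.h"
#include "llvm/Support/VersionTuple.h"

static void checkBigSurMapping() {
  llvm::Triple T("arm64-apple-darwin20.1.0");
  unsigned Major = 0, Minor = 0, Micro = 0;
  T.getMacOSXVersion(Major, Minor, Micro); // darwin20.x -> 11.0.0

  bool OlderThan11 = T.isMacOSXVersionLT(11, 0, 0); // false for darwin20.1
  (void)OlderThan11;

  // The 10.16 alias used by some SDKs canonicalizes to 11.0.
  llvm::VersionTuple V = llvm::Triple::getCanonicalVersionForOS(
      llvm::Triple::MacOSX, llvm::VersionTuple(10, 16));
  (void)V;
}
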
diff --git a/llvm/lib/Support/Unix/Host.inc b/llvm/lib/Support/Unix/Host.inc
index 17d78dc18be7..dfcfdd0dee68 100644
--- a/llvm/lib/Support/Unix/Host.inc
+++ b/llvm/lib/Support/Unix/Host.inc
@@ -56,7 +56,7 @@ static std::string updateTripleOSVersion(std::string TargetTripleString) {
if (TT.getOS() == Triple::AIX && !TT.getOSMajorVersion()) {
struct utsname name;
if (uname(&name) != -1) {
- std::string NewOSName = Triple::getOSTypeName(Triple::AIX);
+ std::string NewOSName = std::string(Triple::getOSTypeName(Triple::AIX));
NewOSName += name.version;
NewOSName += '.';
NewOSName += name.release;
diff --git a/llvm/lib/Support/Unix/Memory.inc b/llvm/lib/Support/Unix/Memory.inc
index 79b1759359e1..be88e7db1400 100644
--- a/llvm/lib/Support/Unix/Memory.inc
+++ b/llvm/lib/Support/Unix/Memory.inc
@@ -12,6 +12,7 @@
#include "Unix.h"
#include "llvm/Config/config.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/DataTypes.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/Process.h"
diff --git a/llvm/lib/Support/Unix/Path.inc b/llvm/lib/Support/Unix/Path.inc
index 2a03dc682bce..d91b269cc6d3 100644
--- a/llvm/lib/Support/Unix/Path.inc
+++ b/llvm/lib/Support/Unix/Path.inc
@@ -48,6 +48,8 @@ extern char **environ;
#endif
#elif defined(__DragonFly__)
#include <sys/mount.h>
+#elif defined(__MVS__)
+#include <sys/ps.h>
#endif
// Both stdio.h and cstdio are included via different paths and
@@ -56,10 +58,13 @@ extern char **environ;
#undef ferror
#undef feof
+#if !defined(PATH_MAX)
// For GNU Hurd
-#if defined(__GNU__) && !defined(PATH_MAX)
-# define PATH_MAX 4096
-# define MAXPATHLEN 4096
+#if defined(__GNU__)
+#define PATH_MAX 4096
+#elif defined(__MVS__)
+#define PATH_MAX _XOPEN_PATH_MAX
+#endif
#endif
#include <sys/types.h>
@@ -101,7 +106,8 @@ typedef uint_t uint;
#define STATVFS_F_FRSIZE(vfs) static_cast<uint64_t>(vfs.f_bsize)
#endif
-#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__GNU__)
+#if defined(__NetBSD__) || defined(__DragonFly__) || defined(__GNU__) || \
+ defined(__MVS__)
#define STATVFS_F_FLAG(vfs) (vfs).f_flag
#else
#define STATVFS_F_FLAG(vfs) (vfs).f_flags
@@ -184,10 +190,10 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
// On OS X the executable path is saved to the stack by dyld. Reading it
// from there is much faster than calling dladdr, especially for large
// binaries with symbols.
- char exe_path[MAXPATHLEN];
+ char exe_path[PATH_MAX];
uint32_t size = sizeof(exe_path);
if (_NSGetExecutablePath(exe_path, &size) == 0) {
- char link_path[MAXPATHLEN];
+ char link_path[PATH_MAX];
if (realpath(exe_path, link_path))
return link_path;
}
@@ -208,14 +214,9 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
while (*p++ != 0)
;
// Iterate through auxiliary vectors for AT_EXECPATH.
- for (;;) {
- switch (*(uintptr_t *)p++) {
- case AT_EXECPATH:
+ for (; *(uintptr_t *)p != AT_NULL; p++) {
+ if (*(uintptr_t *)p++ == AT_EXECPATH)
return *p;
- case AT_NULL:
- break;
- }
- p++;
}
#endif
// Fall back to argv[0] if auxiliary vectors are not available.
@@ -239,7 +240,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
if (getprogpath(exe_path, argv0) != NULL)
return exe_path;
#elif defined(__linux__) || defined(__CYGWIN__) || defined(__gnu_hurd__)
- char exe_path[MAXPATHLEN];
+ char exe_path[PATH_MAX];
const char *aPath = "/proc/self/exe";
if (sys::fs::exists(aPath)) {
// /proc is not always mounted under Linux (chroot for example).
@@ -263,7 +264,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
return ret;
}
#else
- char real_path[MAXPATHLEN];
+ char real_path[PATH_MAX];
if (realpath(exe_path, real_path))
return std::string(real_path);
#endif
@@ -271,6 +272,26 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
// Fall back to the classical detection.
if (getprogpath(exe_path, argv0))
return exe_path;
+#elif defined(__MVS__)
+ int token = 0;
+ W_PSPROC buf;
+ char exe_path[PS_PATHBLEN];
+ pid_t pid = getpid();
+
+ memset(&buf, 0, sizeof(buf));
+ buf.ps_pathptr = exe_path;
+ buf.ps_pathlen = sizeof(exe_path);
+
+ while (true) {
+ if ((token = w_getpsent(token, &buf, sizeof(buf))) <= 0)
+ break;
+ if (buf.ps_pid != pid)
+ continue;
+ char real_path[PATH_MAX];
+ if (realpath(exe_path, real_path))
+ return std::string(real_path);
+ break; // Found entry, but realpath failed.
+ }
#elif defined(HAVE_DLFCN_H) && defined(HAVE_DLADDR)
// Use dladdr to get executable path if available.
Dl_info DLInfo;
@@ -280,7 +301,7 @@ std::string getMainExecutable(const char *argv0, void *MainAddr) {
// If the filename is a symlink, we need to resolve and return the location of
// the actual executable.
- char link_path[MAXPATHLEN];
+ char link_path[PATH_MAX];
if (realpath(DLInfo.dli_fname, link_path))
return link_path;
#else
@@ -330,12 +351,7 @@ std::error_code current_path(SmallVectorImpl<char> &result) {
return std::error_code();
}
-#ifdef MAXPATHLEN
- result.reserve(MAXPATHLEN);
-#else
-// For GNU Hurd
- result.reserve(1024);
-#endif
+ result.reserve(PATH_MAX);
while (true) {
if (::getcwd(result.data(), result.capacity()) == nullptr) {
@@ -504,6 +520,10 @@ static bool is_local_impl(struct STATVFS &Vfs) {
// vmount entry not found; "remote" is the conservative answer.
return false;
+#elif defined(__MVS__)
+ // The file system can have an arbitrary structure on z/OS; must go with the
+ // conservative answer.
+ return false;
#else
return !!(STATVFS_F_FLAG(Vfs) & MNT_LOCAL);
#endif
@@ -998,7 +1018,7 @@ std::error_code openFileForRead(const Twine &Name, int &ResultFD,
#if defined(F_GETPATH)
// When F_GETPATH is available, it is the quickest way to get
// the real path name.
- char Buffer[MAXPATHLEN];
+ char Buffer[PATH_MAX];
if (::fcntl(ResultFD, F_GETPATH, Buffer) != -1)
RealPath->append(Buffer, Buffer + strlen(Buffer));
#else
@@ -1169,6 +1189,51 @@ static bool getDarwinConfDir(bool TempDir, SmallVectorImpl<char> &Result) {
return false;
}
+bool user_config_directory(SmallVectorImpl<char> &result) {
+#ifdef __APPLE__
+ // Mac: ~/Library/Preferences/
+ if (home_directory(result)) {
+ append(result, "Library", "Preferences");
+ return true;
+ }
+#else
+ // XDG_CONFIG_HOME as defined in the XDG Base Directory Specification:
+ // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
+ if (const char *RequestedDir = getenv("XDG_CONFIG_HOME")) {
+ result.clear();
+ result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return true;
+ }
+#endif
+ // Fallback: ~/.config
+ if (!home_directory(result)) {
+ return false;
+ }
+ append(result, ".config");
+ return true;
+}
+
+bool cache_directory(SmallVectorImpl<char> &result) {
+#ifdef __APPLE__
+ if (getDarwinConfDir(false/*tempDir*/, result)) {
+ return true;
+ }
+#else
+ // XDG_CACHE_HOME as defined in the XDG Base Directory Specification:
+ // http://standards.freedesktop.org/basedir-spec/basedir-spec-latest.html
+ if (const char *RequestedDir = getenv("XDG_CACHE_HOME")) {
+ result.clear();
+ result.append(RequestedDir, RequestedDir + strlen(RequestedDir));
+ return true;
+ }
+#endif
+ if (!home_directory(result)) {
+ return false;
+ }
+ append(result, ".cache");
+ return true;
+}
+
static const char *getEnvTempDir() {
// Check whether the temporary directory is specified by an environment
// variable.
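
The new user_config_directory() and cache_directory() helpers prefer XDG_CONFIG_HOME / XDG_CACHE_HOME on non-Apple Unixes and fall back to ~/.config and ~/.cache (macOS uses Library/Preferences and the Darwin confstr path). A sketch of resolving a per-tool cache directory; it assumes these land in sys::path next to home_directory(), as the unqualified calls above imply, and "my-tool" is a made-up name:

#include "llvm/ADT/SmallString.h"
#include "llvm/Support/Path.h"
#include "llvm/Support/raw_ostream.h"

// Resolve the user cache directory and append a tool-specific subdirectory.
static void printToolCacheDir() {
  llvm::SmallString<128> Dir;
  if (!llvm::sys::path::cache_directory(Dir))
    return; // no XDG override and no home directory
  llvm::sys::path::append(Dir, "my-tool");
  llvm::outs() << Dir.str() << '\n';
}
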
diff --git a/llvm/lib/Support/Unix/Process.inc b/llvm/lib/Support/Unix/Process.inc
index dfe81d7e2833..24f16b51af7b 100644
--- a/llvm/lib/Support/Unix/Process.inc
+++ b/llvm/lib/Support/Unix/Process.inc
@@ -66,6 +66,12 @@ static std::pair<std::chrono::microseconds, std::chrono::microseconds> getRUsage
#endif
}
+Process::Pid Process::getProcessId() {
+ static_assert(sizeof(Pid) >= sizeof(pid_t),
+ "Process::Pid should be big enough to store pid_t");
+ return Pid(::getpid());
+}
+
// On Cygwin, getpagesize() returns 64k(AllocationGranularity) and
// offset in mmap(3) should be aligned to the AllocationGranularity.
Expected<unsigned> Process::getPageSize() {
@@ -280,7 +286,7 @@ bool Process::FileDescriptorIsDisplayed(int fd) {
#endif
}
-static unsigned getColumns(int FileID) {
+static unsigned getColumns() {
// If COLUMNS is defined in the environment, wrap to that many columns.
if (const char *ColumnsStr = std::getenv("COLUMNS")) {
int Columns = std::atoi(ColumnsStr);
@@ -288,31 +294,23 @@ static unsigned getColumns(int FileID) {
return Columns;
}
- unsigned Columns = 0;
-
-#if defined(HAVE_SYS_IOCTL_H) && defined(HAVE_TERMIOS_H) \
- && !(defined(_XOPEN_SOURCE) || defined(_POSIX_C_SOURCE))
- // Try to determine the width of the terminal.
- struct winsize ws;
- if (ioctl(FileID, TIOCGWINSZ, &ws) == 0)
- Columns = ws.ws_col;
-#endif
-
- return Columns;
+ // We used to call ioctl TIOCGWINSZ to determine the terminal width, but
+ // it was deemed not useful.
+ return 0;
}
unsigned Process::StandardOutColumns() {
if (!StandardOutIsDisplayed())
return 0;
- return getColumns(1);
+ return getColumns();
}
unsigned Process::StandardErrColumns() {
if (!StandardErrIsDisplayed())
return 0;
- return getColumns(2);
+ return getColumns();
}
#ifdef HAVE_TERMINFO
diff --git a/llvm/lib/Support/Unix/Program.inc b/llvm/lib/Support/Unix/Program.inc
index 520685a0e987..8f41fc015163 100644
--- a/llvm/lib/Support/Unix/Program.inc
+++ b/llvm/lib/Support/Unix/Program.inc
@@ -15,6 +15,8 @@
//=== is guaranteed to work on *all* UNIX variants.
//===----------------------------------------------------------------------===//
+#include "llvm/Support/Program.h"
+
#include "Unix.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Config/config.h"
@@ -59,8 +61,7 @@
#endif
#endif
-namespace llvm {
-
+using namespace llvm;
using namespace sys;
ProcessInfo::ProcessInfo() : Pid(0), ReturnCode(0) {}
@@ -70,8 +71,7 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
assert(!Name.empty() && "Must have a name!");
// Use the given path verbatim if it contains any slashes; this matches
// the behavior of sh(1) and friends.
- if (Name.find('/') != StringRef::npos)
- return std::string(Name);
+ if (Name.find('/') != StringRef::npos) return std::string(Name);
SmallVector<StringRef, 16> EnvironmentPaths;
if (Paths.empty())
@@ -88,7 +88,7 @@ ErrorOr<std::string> sys::findProgramByName(StringRef Name,
SmallString<128> FilePath(Path);
sys::path::append(FilePath, Name);
if (sys::fs::can_execute(FilePath.c_str()))
- return std::string(FilePath.str()); // Found the executable!
+ return std::string(FilePath.str()); // Found the executable!
}
return errc::no_such_file_or_directory;
}
@@ -101,7 +101,7 @@ static bool RedirectIO(Optional<StringRef> Path, int FD, std::string* ErrMsg) {
// Redirect empty paths to /dev/null
File = "/dev/null";
else
- File = *Path;
+ File = std::string(*Path);
// Open the file
int InFD = open(File.c_str(), FD == 0 ? O_RDONLY : O_WRONLY|O_CREAT, 0666);
@@ -162,8 +162,6 @@ static void SetMemoryLimits(unsigned size) {
#endif
}
-}
-
static std::vector<const char *>
toNullTerminatedCStringArray(ArrayRef<StringRef> Strings, StringSaver &Saver) {
std::vector<const char *> Result;
@@ -213,7 +211,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program,
std::string *RedirectsStr[3] = {nullptr, nullptr, nullptr};
for (int I = 0; I < 3; ++I) {
if (Redirects[I]) {
- RedirectsStorage[I] = *Redirects[I];
+ RedirectsStorage[I] = std::string(*Redirects[I]);
RedirectsStr[I] = &RedirectsStorage[I];
}
}
@@ -304,7 +302,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program,
}
// Execute!
- std::string PathStr = Program;
+ std::string PathStr = std::string(Program);
if (Envp != nullptr)
execve(PathStr.c_str(), const_cast<char **>(Argv),
const_cast<char **>(Envp));
@@ -331,9 +329,54 @@ static bool Execute(ProcessInfo &PI, StringRef Program,
}
namespace llvm {
+namespace sys {
+
+#ifndef _AIX
+using ::wait4;
+#else
+static pid_t (wait4)(pid_t pid, int *status, int options, struct rusage *usage);
+#endif
-ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
- bool WaitUntilTerminates, std::string *ErrMsg) {
+} // namespace sys
+} // namespace llvm
+
+#ifdef _AIX
+#ifndef _ALL_SOURCE
+extern "C" pid_t (wait4)(pid_t pid, int *status, int options,
+ struct rusage *usage);
+#endif
+pid_t (llvm::sys::wait4)(pid_t pid, int *status, int options,
+ struct rusage *usage) {
+ assert(pid > 0 && "Only expecting to handle actual PID values!");
+ assert((options & ~WNOHANG) == 0 && "Expecting WNOHANG at most!");
+ assert(usage && "Expecting usage collection!");
+
+ // AIX wait4 does not work well with WNOHANG.
+ if (!(options & WNOHANG))
+ return ::wait4(pid, status, options, usage);
+
+ // For WNOHANG, we use waitid (which supports WNOWAIT) until the child process
+ // has terminated.
+ siginfo_t WaitIdInfo;
+ WaitIdInfo.si_pid = 0;
+ int WaitIdRetVal =
+ waitid(P_PID, pid, &WaitIdInfo, WNOWAIT | WEXITED | options);
+
+ if (WaitIdRetVal == -1 || WaitIdInfo.si_pid == 0)
+ return WaitIdRetVal;
+
+ assert(WaitIdInfo.si_pid == pid);
+
+ // The child has already terminated, so a blocking wait on it is okay in the
+ // absence of indiscriminate `wait` calls from the current process (which
+ // would cause the call here to fail with ECHILD).
+ return ::wait4(pid, status, options & ~WNOHANG, usage);
+}
+#endif
+
+ProcessInfo llvm::sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
+ bool WaitUntilTerminates, std::string *ErrMsg,
+ Optional<ProcessStatistics> *ProcStat) {
struct sigaction Act, Old;
assert(PI.Pid && "invalid pid to wait on, process not started?");
@@ -349,6 +392,7 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
Act.sa_handler = TimeOutHandler;
sigemptyset(&Act.sa_mask);
sigaction(SIGALRM, &Act, &Old);
+ // FIXME The alarm signal may be delivered to another thread.
alarm(SecondsToWait);
} else if (SecondsToWait == 0)
WaitPidOptions = WNOHANG;
@@ -356,9 +400,12 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
// Parent process: Wait for the child process to terminate.
int status;
ProcessInfo WaitResult;
+ rusage Info;
+ if (ProcStat)
+ ProcStat->reset();
do {
- WaitResult.Pid = waitpid(ChildPid, &status, WaitPidOptions);
+ WaitResult.Pid = sys::wait4(ChildPid, &status, WaitPidOptions, &Info);
} while (WaitUntilTerminates && WaitResult.Pid == -1 && errno == EINTR);
if (WaitResult.Pid != PI.Pid) {
@@ -375,6 +422,8 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
sigaction(SIGALRM, &Old, nullptr);
// Wait for child to die
+ // FIXME This could grab some other child process out from another
+ // waiting thread and then leave a zombie anyway.
if (wait(&status) != ChildPid)
MakeErrMsg(ErrMsg, "Child timed out but wouldn't die");
else
@@ -396,6 +445,13 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
sigaction(SIGALRM, &Old, nullptr);
}
+ if (ProcStat) {
+ std::chrono::microseconds UserT = toDuration(Info.ru_utime);
+ std::chrono::microseconds KernelT = toDuration(Info.ru_stime);
+ uint64_t PeakMemory = static_cast<uint64_t>(Info.ru_maxrss);
+ *ProcStat = ProcessStatistics{UserT + KernelT, UserT, PeakMemory};
+ }
+
// Return the proper exit status. Detect error conditions
// so we can return -1 for them and set ErrMsg informatively.
int result = 0;
@@ -430,12 +486,12 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
return WaitResult;
}
-std::error_code sys::ChangeStdinToBinary() {
+std::error_code llvm::sys::ChangeStdinToBinary() {
// Do nothing, as Unix doesn't differentiate between text and binary.
return std::error_code();
}
-std::error_code sys::ChangeStdoutToBinary() {
+std::error_code llvm::sys::ChangeStdoutToBinary() {
// Do nothing, as Unix doesn't differentiate between text and binary.
return std::error_code();
}
@@ -497,4 +553,3 @@ bool llvm::sys::commandLineFitsWithinSystemLimits(StringRef Program,
return true;
}
-}
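
The Unix Program.inc hunk above emulates a non-blocking wait on AIX by probing with waitid(WNOWAIT) and only reaping with a blocking wait4 once the child is known to have exited. A minimal standalone sketch of that probe-then-reap pattern follows; probeAndReap is a hypothetical helper written for illustration, not code from the patch, and it assumes a platform that provides wait4.

  // Sketch only: check for termination without reaping, then reap and collect
  // rusage. Mirrors the AIX workaround above in spirit, not verbatim.
  #include <sys/resource.h>
  #include <sys/wait.h>

  static pid_t probeAndReap(pid_t Pid, int *Status, struct rusage *Usage) {
    siginfo_t Info;
    Info.si_pid = 0;
    // WNOWAIT leaves the child reapable; WNOHANG keeps the probe non-blocking.
    if (waitid(P_PID, Pid, &Info, WEXITED | WNOWAIT | WNOHANG) == -1)
      return -1;
    if (Info.si_pid == 0)
      return 0; // Child still running, nothing was reaped.
    // The child has terminated, so a blocking reap cannot hang here.
    return ::wait4(Pid, Status, 0, Usage);
  }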
diff --git a/llvm/lib/Support/Unix/Threading.inc b/llvm/lib/Support/Unix/Threading.inc
index afb887fc1096..2d0aacabf092 100644
--- a/llvm/lib/Support/Unix/Threading.inc
+++ b/llvm/lib/Support/Unix/Threading.inc
@@ -37,7 +37,12 @@
#include <lwp.h> // For _lwp_self()
#endif
+#if defined(__OpenBSD__)
+#include <unistd.h> // For getthrid()
+#endif
+
#if defined(__linux__)
+#include <sched.h> // For sched_getaffinity
#include <sys/syscall.h> // For syscall codes
#include <unistd.h> // For syscall()
#endif
@@ -89,6 +94,10 @@ llvm_execute_on_thread_impl(void *(*ThreadFunc)(void *), void *Arg,
if ((errnum = ::pthread_join(Thread, nullptr)) != 0) {
ReportErrnumFatal("pthread_join failed", errnum);
}
+ } else if (JP == JoiningPolicy::Detach) {
+ if ((errnum = ::pthread_detach(Thread)) != 0) {
+ ReportErrnumFatal("pthread_detach failed", errnum);
+ }
}
}
@@ -104,6 +113,8 @@ uint64_t llvm::get_threadid() {
return uint64_t(pthread_getthreadid_np());
#elif defined(__NetBSD__)
return uint64_t(_lwp_self());
+#elif defined(__OpenBSD__)
+ return uint64_t(getthrid());
#elif defined(__ANDROID__)
return uint64_t(gettid());
#elif defined(__linux__)
@@ -267,3 +278,27 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
#endif
return SetThreadPriorityResult::FAILURE;
}
+
+#include <thread>
+
+int computeHostNumHardwareThreads() {
+#ifdef __linux__
+ cpu_set_t Set;
+ if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
+ return CPU_COUNT(&Set);
+#endif
+ // Guard against std::thread::hardware_concurrency() returning 0.
+ if (unsigned Val = std::thread::hardware_concurrency())
+ return Val;
+ return 1;
+}
+
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+ unsigned ThreadPoolNum) const {}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+ // FIXME: Implement
+ llvm_unreachable("Not implemented!");
+}
+
+unsigned llvm::get_cpus() { return 1; }
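
computeHostNumHardwareThreads() above prefers the process affinity mask over the raw hardware count on Linux. The short Linux-only sketch below shows the difference; it relies on the GNU cpu_set_t macros (g++ defines _GNU_SOURCE for C++ by default) and its output depends on how the process was launched.

  #include <cstdio>
  #include <thread>
  #ifdef __linux__
  #include <sched.h>
  #endif

  int main() {
  #ifdef __linux__
    cpu_set_t Set;
    if (sched_getaffinity(0, sizeof(Set), &Set) == 0)
      std::printf("affinity-restricted CPUs: %d\n", CPU_COUNT(&Set));
  #endif
    std::printf("hardware_concurrency():   %u\n",
                std::thread::hardware_concurrency());
    return 0;
  }

Launched as `taskset -c 0-3 ./a.out` on a larger machine, the first number drops to 4 while the second still reports the full hardware thread count.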
diff --git a/llvm/lib/Support/Unix/Unix.h b/llvm/lib/Support/Unix/Unix.h
index 1fc9a414f749..60929139598b 100644
--- a/llvm/lib/Support/Unix/Unix.h
+++ b/llvm/lib/Support/Unix/Unix.h
@@ -36,10 +36,6 @@
#include <unistd.h>
#endif
-#ifdef HAVE_SYS_PARAM_H
-#include <sys/param.h>
-#endif
-
#ifdef HAVE_SYS_TIME_H
# include <sys/time.h>
#endif
diff --git a/llvm/lib/Support/VersionTuple.cpp b/llvm/lib/Support/VersionTuple.cpp
index 60b59424fbb4..6a516481ac25 100644
--- a/llvm/lib/Support/VersionTuple.cpp
+++ b/llvm/lib/Support/VersionTuple.cpp
@@ -10,8 +10,11 @@
// the form major[.minor[.subminor]].
//
//===----------------------------------------------------------------------===//
+
#include "llvm/Support/VersionTuple.h"
+#include "llvm/ADT/StringRef.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
diff --git a/llvm/lib/Support/VirtualFileSystem.cpp b/llvm/lib/Support/VirtualFileSystem.cpp
index edd4234fe501..5b757c9ea80d 100644
--- a/llvm/lib/Support/VirtualFileSystem.cpp
+++ b/llvm/lib/Support/VirtualFileSystem.cpp
@@ -306,12 +306,12 @@ RealFileSystem::openFileForRead(const Twine &Name) {
llvm::ErrorOr<std::string> RealFileSystem::getCurrentWorkingDirectory() const {
if (WD)
- return WD->Specified.str();
+ return std::string(WD->Specified.str());
SmallString<128> Dir;
if (std::error_code EC = llvm::sys::fs::current_path(Dir))
return EC;
- return Dir.str();
+ return std::string(Dir.str());
}
std::error_code RealFileSystem::setCurrentWorkingDirectory(const Twine &Path) {
@@ -535,7 +535,8 @@ class InMemoryNode {
public:
InMemoryNode(llvm::StringRef FileName, InMemoryNodeKind Kind)
- : Kind(Kind), FileName(llvm::sys::path::filename(FileName)) {}
+ : Kind(Kind), FileName(std::string(llvm::sys::path::filename(FileName))) {
+ }
virtual ~InMemoryNode() = default;
/// Get the filename of this node (the name without the directory part).
@@ -904,7 +905,7 @@ class InMemoryDirIterator : public llvm::vfs::detail::DirIterImpl {
Type = sys::fs::file_type::directory_file;
break;
}
- CurrentEntry = directory_entry(Path.str(), Type);
+ CurrentEntry = directory_entry(std::string(Path.str()), Type);
} else {
// When we're at the end, make CurrentEntry invalid and DirIterImpl will
// do the rest.
@@ -960,7 +961,7 @@ std::error_code InMemoryFileSystem::setCurrentWorkingDirectory(const Twine &P) {
llvm::sys::path::remove_dots(Path, /*remove_dot_dot=*/true);
if (!Path.empty())
- WorkingDirectory = Path.str();
+ WorkingDirectory = std::string(Path.str());
return {};
}
@@ -989,6 +990,28 @@ std::error_code InMemoryFileSystem::isLocal(const Twine &Path, bool &Result) {
// RedirectingFileSystem implementation
//===-----------------------------------------------------------------------===/
+namespace {
+
+/// Removes leading "./" as well as path components like ".." and ".".
+static llvm::SmallString<256> canonicalize(llvm::StringRef Path) {
+ // First detect the path style in use by checking the first separator.
+ llvm::sys::path::Style style = llvm::sys::path::Style::native;
+ const size_t n = Path.find_first_of("/\\");
+ if (n != static_cast<size_t>(-1))
+ style = (Path[n] == '/') ? llvm::sys::path::Style::posix
+ : llvm::sys::path::Style::windows;
+
+ // Now remove the dots. Explicitly specifying the path style prevents the
+ // direction of the slashes from changing.
+ llvm::SmallString<256> result =
+ llvm::sys::path::remove_leading_dotslash(Path, style);
+ llvm::sys::path::remove_dots(result, /*remove_dot_dot=*/true, style);
+ return result;
+}
+
+} // anonymous namespace
+
+
RedirectingFileSystem::RedirectingFileSystem(IntrusiveRefCntPtr<FileSystem> FS)
: ExternalFS(std::move(FS)) {
if (ExternalFS)
@@ -1064,7 +1087,7 @@ RedirectingFileSystem::setCurrentWorkingDirectory(const Twine &Path) {
Path.toVector(AbsolutePath);
if (std::error_code EC = makeAbsolute(AbsolutePath))
return EC;
- WorkingDirectory = AbsolutePath.str();
+ WorkingDirectory = std::string(AbsolutePath.str());
return {};
}
@@ -1082,7 +1105,23 @@ std::error_code RedirectingFileSystem::makeAbsolute(SmallVectorImpl<char> &Path)
if (!WorkingDir)
return WorkingDir.getError();
- llvm::sys::fs::make_absolute(WorkingDir.get(), Path);
+ // We can't use sys::fs::make_absolute because that assumes the path style
+ // is native and there is no way to override that. Since we know WorkingDir
+ // is absolute, we can use it to determine which style we actually have and
+ // append Path ourselves.
+ sys::path::Style style = sys::path::Style::windows;
+ if (sys::path::is_absolute(WorkingDir.get(), sys::path::Style::posix)) {
+ style = sys::path::Style::posix;
+ }
+
+ std::string Result = WorkingDir.get();
+ StringRef Dir(Result);
+ if (!Dir.endswith(sys::path::get_separator(style))) {
+ Result += sys::path::get_separator(style);
+ }
+ Result.append(Path.data(), Path.size());
+ Path.assign(Result.begin(), Result.end());
+
return {};
}
@@ -1317,8 +1356,8 @@ class llvm::vfs::RedirectingFileSystemParser {
bool HasContents = false; // external or otherwise
std::vector<std::unique_ptr<RedirectingFileSystem::Entry>>
EntryArrayContents;
- std::string ExternalContentsPath;
- std::string Name;
+ SmallString<256> ExternalContentsPath;
+ SmallString<256> Name;
yaml::Node *NameValueNode = nullptr;
auto UseExternalName =
RedirectingFileSystem::RedirectingFileEntry::NK_NotSet;
@@ -1341,16 +1380,9 @@ class llvm::vfs::RedirectingFileSystemParser {
return nullptr;
NameValueNode = I.getValue();
- if (FS->UseCanonicalizedPaths) {
- SmallString<256> Path(Value);
- // Guarantee that old YAML files containing paths with ".." and "."
- // are properly canonicalized before read into the VFS.
- Path = sys::path::remove_leading_dotslash(Path);
- sys::path::remove_dots(Path, /*remove_dot_dot=*/true);
- Name = Path.str();
- } else {
- Name = Value;
- }
+ // Guarantee that old YAML files containing paths with ".." and "."
+ // are properly canonicalized before being read into the VFS.
+ Name = canonicalize(Value).str();
} else if (Key == "type") {
if (!parseScalarString(I.getValue(), Value, Buffer))
return nullptr;
@@ -1403,12 +1435,9 @@ class llvm::vfs::RedirectingFileSystemParser {
FullPath = Value;
}
- if (FS->UseCanonicalizedPaths) {
- // Guarantee that old YAML files containing paths with ".." and "."
- // are properly canonicalized before read into the VFS.
- FullPath = sys::path::remove_leading_dotslash(FullPath);
- sys::path::remove_dots(FullPath, /*remove_dot_dot=*/true);
- }
+ // Guarantee that old YAML files containing paths with ".." and "."
+ // are properly canonicalized before being read into the VFS.
+ FullPath = canonicalize(FullPath);
ExternalContentsPath = FullPath.str();
} else if (Key == "use-external-name") {
bool Val;
@@ -1653,14 +1682,10 @@ RedirectingFileSystem::lookupPath(const Twine &Path_) const {
if (std::error_code EC = makeAbsolute(Path))
return EC;
- // Canonicalize path by removing ".", "..", "./", etc components. This is
- // a VFS request, do bot bother about symlinks in the path components
+ // Canonicalize the path by removing ".", "..", and "./" components. This is
+ // a VFS request; do not bother about symlinks in the path components,
+ // but canonicalize in order to perform the correct entry search.
- if (UseCanonicalizedPaths) {
- Path = sys::path::remove_leading_dotslash(Path);
- sys::path::remove_dots(Path, /*remove_dot_dot=*/true);
- }
-
+ Path = canonicalize(Path);
if (Path.empty())
return make_error_code(llvm::errc::invalid_argument);
@@ -1679,16 +1704,9 @@ ErrorOr<RedirectingFileSystem::Entry *>
RedirectingFileSystem::lookupPath(sys::path::const_iterator Start,
sys::path::const_iterator End,
RedirectingFileSystem::Entry *From) const {
-#ifndef _WIN32
assert(!isTraversalComponent(*Start) &&
!isTraversalComponent(From->getName()) &&
"Paths should not contain traversal components");
-#else
- // FIXME: this is here to support windows, remove it once canonicalized
- // paths become globally default.
- if (Start->equals("."))
- ++Start;
-#endif
StringRef FromName = From->getName();
@@ -1894,11 +1912,21 @@ UniqueID vfs::getNextVirtualUniqueID() {
return UniqueID(std::numeric_limits<uint64_t>::max(), ID);
}
-void YAMLVFSWriter::addFileMapping(StringRef VirtualPath, StringRef RealPath) {
+void YAMLVFSWriter::addEntry(StringRef VirtualPath, StringRef RealPath,
+ bool IsDirectory) {
assert(sys::path::is_absolute(VirtualPath) && "virtual path not absolute");
assert(sys::path::is_absolute(RealPath) && "real path not absolute");
assert(!pathHasTraversal(VirtualPath) && "path traversal is not supported");
- Mappings.emplace_back(VirtualPath, RealPath);
+ Mappings.emplace_back(VirtualPath, RealPath, IsDirectory);
+}
+
+void YAMLVFSWriter::addFileMapping(StringRef VirtualPath, StringRef RealPath) {
+ addEntry(VirtualPath, RealPath, /*IsDirectory=*/false);
+}
+
+void YAMLVFSWriter::addDirectoryMapping(StringRef VirtualPath,
+ StringRef RealPath) {
+ addEntry(VirtualPath, RealPath, /*IsDirectory=*/true);
}
namespace {
@@ -1999,7 +2027,10 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
if (!Entries.empty()) {
const YAMLVFSEntry &Entry = Entries.front();
- startDirectory(path::parent_path(Entry.VPath));
+
+ startDirectory(
+ Entry.IsDirectory ? Entry.VPath : path::parent_path(Entry.VPath)
+ );
StringRef RPath = Entry.RPath;
if (UseOverlayRelative) {
@@ -2009,19 +2040,31 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
RPath = RPath.slice(OverlayDirLen, RPath.size());
}
- writeEntry(path::filename(Entry.VPath), RPath);
+ bool IsCurrentDirEmpty = true;
+ if (!Entry.IsDirectory) {
+ writeEntry(path::filename(Entry.VPath), RPath);
+ IsCurrentDirEmpty = false;
+ }
for (const auto &Entry : Entries.slice(1)) {
- StringRef Dir = path::parent_path(Entry.VPath);
- if (Dir == DirStack.back())
- OS << ",\n";
- else {
+ StringRef Dir =
+ Entry.IsDirectory ? Entry.VPath : path::parent_path(Entry.VPath);
+ if (Dir == DirStack.back()) {
+ if (!IsCurrentDirEmpty) {
+ OS << ",\n";
+ }
+ } else {
+ bool IsDirPoppedFromStack = false;
while (!DirStack.empty() && !containedIn(DirStack.back(), Dir)) {
OS << "\n";
endDirectory();
+ IsDirPoppedFromStack = true;
+ }
+ if (IsDirPoppedFromStack || !IsCurrentDirEmpty) {
+ OS << ",\n";
}
- OS << ",\n";
startDirectory(Dir);
+ IsCurrentDirEmpty = true;
}
StringRef RPath = Entry.RPath;
if (UseOverlayRelative) {
@@ -2030,7 +2073,10 @@ void JSONWriter::write(ArrayRef<YAMLVFSEntry> Entries,
"Overlay dir must be contained in RPath");
RPath = RPath.slice(OverlayDirLen, RPath.size());
}
- writeEntry(path::filename(Entry.VPath), RPath);
+ if (!Entry.IsDirectory) {
+ writeEntry(path::filename(Entry.VPath), RPath);
+ IsCurrentDirEmpty = false;
+ }
}
while (!DirStack.empty()) {
@@ -2104,7 +2150,7 @@ std::error_code VFSFromYamlDirIterImpl::incrementContent(bool IsFirstTime) {
Type = sys::fs::file_type::regular_file;
break;
}
- CurrentEntry = directory_entry(PathStr.str(), Type);
+ CurrentEntry = directory_entry(std::string(PathStr.str()), Type);
return {};
}
return incrementExternal();
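
The new canonicalize() helper above infers the path style from the first separator so that dot removal never flips slash direction. A hedged sketch of the expected behaviour, using only llvm::sys::path (links against LLVMSupport; the sample paths are invented):

  #include "llvm/ADT/SmallString.h"
  #include "llvm/Support/Path.h"
  #include "llvm/Support/raw_ostream.h"

  // Mirrors the anonymous-namespace canonicalize() above, for illustration.
  static llvm::SmallString<256> canon(llvm::StringRef Path,
                                      llvm::sys::path::Style S) {
    llvm::SmallString<256> Out =
        llvm::sys::path::remove_leading_dotslash(Path, S);
    llvm::sys::path::remove_dots(Out, /*remove_dot_dot=*/true, S);
    return Out;
  }

  int main() {
    using llvm::sys::path::Style;
    llvm::outs() << canon("./a/../b", Style::posix) << "\n";      // "b"
    llvm::outs() << canon(".\\a\\..\\b", Style::windows) << "\n"; // "b"
  }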
diff --git a/llvm/lib/Support/Windows/DynamicLibrary.inc b/llvm/lib/Support/Windows/DynamicLibrary.inc
index 71b206c4cf9e..a3f78fb0d6ba 100644
--- a/llvm/lib/Support/Windows/DynamicLibrary.inc
+++ b/llvm/lib/Support/Windows/DynamicLibrary.inc
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Support/Windows/Host.inc b/llvm/lib/Support/Windows/Host.inc
index 21b947f26df3..5583db909045 100644
--- a/llvm/lib/Support/Windows/Host.inc
+++ b/llvm/lib/Support/Windows/Host.inc
@@ -10,7 +10,7 @@
//
//===----------------------------------------------------------------------===//
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include <cstdio>
#include <string>
diff --git a/llvm/lib/Support/Windows/Memory.inc b/llvm/lib/Support/Windows/Memory.inc
index c5566f9910a5..1b2de1915ec4 100644
--- a/llvm/lib/Support/Windows/Memory.inc
+++ b/llvm/lib/Support/Windows/Memory.inc
@@ -17,7 +17,7 @@
#include "llvm/Support/WindowsError.h"
// The Windows.h header must be the last one included.
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
static DWORD getWindowsProtectionFlags(unsigned Flags) {
switch (Flags & llvm::sys::Memory::MF_RWE_MASK) {
diff --git a/llvm/lib/Support/Windows/Path.inc b/llvm/lib/Support/Windows/Path.inc
index c3b13abef5de..e352beb77616 100644
--- a/llvm/lib/Support/Windows/Path.inc
+++ b/llvm/lib/Support/Windows/Path.inc
@@ -25,7 +25,7 @@
// These two headers must be included last, and make sure shlobj is required
// after Windows.h to make sure it picks up our definition of _WIN32_WINNT
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include <shellapi.h>
#include <shlobj.h>
@@ -47,7 +47,7 @@ using namespace llvm;
using llvm::sys::windows::UTF8ToUTF16;
using llvm::sys::windows::CurCPToUTF16;
using llvm::sys::windows::UTF16ToUTF8;
-using llvm::sys::path::widenPath;
+using llvm::sys::windows::widenPath;
static bool is_separator(const wchar_t value) {
switch (value) {
@@ -61,64 +61,64 @@ static bool is_separator(const wchar_t value) {
namespace llvm {
namespace sys {
-namespace path {
+namespace windows {
-// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the
-// path is longer than CreateDirectory can tolerate, make it absolute and
-// prefixed by '\\?\'.
-std::error_code widenPath(const Twine &Path8,
- SmallVectorImpl<wchar_t> &Path16) {
- const size_t MaxDirLen = MAX_PATH - 12; // Must leave room for 8.3 filename.
+// Convert a UTF-8 path to UTF-16. Also, if the absolute equivalent of the path
+// is longer than the limit that the Win32 Unicode File API can tolerate, make
+// it an absolute normalized path prefixed by '\\?\'.
+std::error_code widenPath(const Twine &Path8, SmallVectorImpl<wchar_t> &Path16,
+ size_t MaxPathLen) {
+ assert(MaxPathLen <= MAX_PATH);
- // Several operations would convert Path8 to SmallString; more efficient to
- // do it once up front.
- SmallString<128> Path8Str;
+ // Several operations would convert Path8 to SmallString; more efficient to do
+ // it once up front.
+ SmallString<MAX_PATH> Path8Str;
Path8.toVector(Path8Str);
- // If we made this path absolute, how much longer would it get?
+ if (std::error_code EC = UTF8ToUTF16(Path8Str, Path16))
+ return EC;
+
+ const bool IsAbsolute = llvm::sys::path::is_absolute(Path8);
size_t CurPathLen;
- if (llvm::sys::path::is_absolute(Twine(Path8Str)))
+ if (IsAbsolute)
CurPathLen = 0; // No contribution from current_path needed.
else {
- CurPathLen = ::GetCurrentDirectoryW(0, NULL);
+ CurPathLen = ::GetCurrentDirectoryW(
+ 0, NULL); // Returns the size including the null terminator.
if (CurPathLen == 0)
return mapWindowsError(::GetLastError());
}
- // Would the absolute path be longer than our limit?
- if ((Path8Str.size() + CurPathLen) >= MaxDirLen &&
- !Path8Str.startswith("\\\\?\\")) {
- SmallString<2*MAX_PATH> FullPath("\\\\?\\");
- if (CurPathLen) {
- SmallString<80> CurPath;
- if (std::error_code EC = llvm::sys::fs::current_path(CurPath))
- return EC;
- FullPath.append(CurPath);
- }
- // Traverse the requested path, canonicalizing . and .. (because the \\?\
- // prefix is documented to treat them as real components). Ignore
- // separators, which can be returned from the iterator if the path has a
- // drive name. We don't need to call native() on the result since append()
- // always attaches preferred_separator.
- for (llvm::sys::path::const_iterator I = llvm::sys::path::begin(Path8Str),
- E = llvm::sys::path::end(Path8Str);
- I != E; ++I) {
- if (I->size() == 1 && is_separator((*I)[0]))
- continue;
- if (I->size() == 1 && *I == ".")
- continue;
- if (I->size() == 2 && *I == "..")
- llvm::sys::path::remove_filename(FullPath);
- else
- llvm::sys::path::append(FullPath, *I);
- }
- return UTF8ToUTF16(FullPath, Path16);
+ const char *const LongPathPrefix = "\\\\?\\";
+
+ if ((Path16.size() + CurPathLen) < MaxPathLen ||
+ Path8Str.startswith(LongPathPrefix))
+ return std::error_code();
+
+ if (!IsAbsolute) {
+ if (std::error_code EC = llvm::sys::fs::make_absolute(Path8Str))
+ return EC;
}
- // Just use the caller's original path.
- return UTF8ToUTF16(Path8Str, Path16);
+ // Remove '.' and '..' because long paths treat these as real path components.
+ llvm::sys::path::native(Path8Str, path::Style::windows);
+ llvm::sys::path::remove_dots(Path8Str, true);
+
+ const StringRef RootName = llvm::sys::path::root_name(Path8Str);
+ assert(!RootName.empty() &&
+ "Root name cannot be empty for an absolute path!");
+
+ SmallString<2 * MAX_PATH> FullPath(LongPathPrefix);
+ if (RootName[1] != ':') { // Check if UNC.
+ FullPath.append("UNC\\");
+ FullPath.append(Path8Str.begin() + 2, Path8Str.end());
+ } else
+ FullPath.append(Path8Str);
+
+ return UTF8ToUTF16(FullPath, Path16);
}
-} // end namespace path
+
+} // end namespace windows
namespace fs {
@@ -227,7 +227,9 @@ std::error_code create_directory(const Twine &path, bool IgnoreExisting,
perms Perms) {
SmallVector<wchar_t, 128> path_utf16;
- if (std::error_code ec = widenPath(path, path_utf16))
+ // CreateDirectoryW has a lower maximum path length as it must leave room for
+ // an 8.3 filename.
+ if (std::error_code ec = widenPath(path, path_utf16, MAX_PATH - 12))
return ec;
if (!::CreateDirectoryW(path_utf16.begin(), NULL)) {
@@ -553,6 +555,11 @@ std::error_code rename(const Twine &From, const Twine &To) {
NULL, OPEN_EXISTING, FILE_ATTRIBUTE_NORMAL, NULL);
if (FromHandle)
break;
+
+ // We don't want to loop if the file doesn't exist.
+ auto EC = mapWindowsError(GetLastError());
+ if (EC == errc::no_such_file_or_directory)
+ return EC;
}
if (!FromHandle)
return mapWindowsError(GetLastError());
@@ -950,9 +957,9 @@ std::error_code detail::directory_iterator_construct(detail::DirIterState &IT,
return EC;
// Convert path to the format that Windows is happy with.
- if (PathUTF16.size() > 0 &&
- !is_separator(PathUTF16[Path.size() - 1]) &&
- PathUTF16[Path.size() - 1] != L':') {
+ size_t PathUTF16Len = PathUTF16.size();
+ if (PathUTF16Len > 0 && !is_separator(PathUTF16[PathUTF16Len - 1]) &&
+ PathUTF16[PathUTF16Len - 1] != L':') {
PathUTF16.push_back(L'\\');
PathUTF16.push_back(L'*');
} else {
@@ -1365,6 +1372,16 @@ bool home_directory(SmallVectorImpl<char> &result) {
return getKnownFolderPath(FOLDERID_Profile, result);
}
+bool user_config_directory(SmallVectorImpl<char> &result) {
+ // Either local or roaming appdata may be suitable in some cases, depending
+ // on the data. Local is more conservative; Roaming may not always be correct.
+ return getKnownFolderPath(FOLDERID_LocalAppData, result);
+}
+
+bool cache_directory(SmallVectorImpl<char> &result) {
+ return getKnownFolderPath(FOLDERID_LocalAppData, result);
+}
+
static bool getTempDirEnvVar(const wchar_t *Var, SmallVectorImpl<char> &Res) {
SmallVector<wchar_t, 1024> Buf;
size_t Size = 1024;
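
The rewritten widenPath() above only adds the long-path prefix when the absolute path would exceed the caller-provided limit, and uses the "\\?\UNC\" form for UNC paths. The pure-string sketch below mirrors just that prefix decision; addLongPathPrefix is a hypothetical helper, makes no Win32 calls, and assumes its input is already absolute and dot-free.

  #include <cassert>
  #include <string>

  static std::string addLongPathPrefix(const std::string &Abs, size_t MaxLen) {
    const std::string Prefix = "\\\\?\\";
    if (Abs.size() < MaxLen || Abs.compare(0, Prefix.size(), Prefix) == 0)
      return Abs;                          // Short enough, or already prefixed.
    if (Abs.size() >= 2 && Abs[1] != ':')  // UNC: "\\server\share\..."
      return Prefix + "UNC\\" + Abs.substr(2);
    return Prefix + Abs;                   // Drive-letter form, e.g. "C:\...".
  }

  int main() {
    assert(addLongPathPrefix("C:\\short", 260) == "C:\\short");
    std::string Long = "C:\\" + std::string(300, 'a');
    assert(addLongPathPrefix(Long, 260).rfind("\\\\?\\C:\\", 0) == 0);
    std::string Unc = "\\\\srv\\share\\" + std::string(300, 'a');
    assert(addLongPathPrefix(Unc, 260).rfind("\\\\?\\UNC\\srv\\share\\", 0) == 0);
  }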
diff --git a/llvm/lib/Support/Windows/Process.inc b/llvm/lib/Support/Windows/Process.inc
index 3526e3dee6fa..8064d4e17b29 100644
--- a/llvm/lib/Support/Windows/Process.inc
+++ b/llvm/lib/Support/Windows/Process.inc
@@ -19,7 +19,7 @@
#include <malloc.h>
// The Windows.h header must be after LLVM and standard headers.
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include <direct.h>
#include <io.h>
@@ -43,6 +43,12 @@
using namespace llvm;
+Process::Pid Process::getProcessId() {
+ static_assert(sizeof(Pid) >= sizeof(DWORD),
+ "Process::Pid should be big enough to store DWORD");
+ return Pid(::GetCurrentProcessId());
+}
+
// This function retrieves the page size using GetNativeSystemInfo() and is
// present solely so it can be called once to initialize the self_process member
// below.
@@ -439,18 +445,38 @@ const char *Process::ResetColor() {
return 0;
}
+static unsigned GetRandomNumberSeed() {
+ // Generate a random number seed from the millisecond-resolution Windows
+ // system clock and the current process id.
+ FILETIME Time;
+ GetSystemTimeAsFileTime(&Time);
+ DWORD Pid = GetCurrentProcessId();
+ return hash_combine(Time.dwHighDateTime, Time.dwLowDateTime, Pid);
+}
+
+static unsigned GetPseudoRandomNumber() {
+ // Arrange to call srand once when this function is first used, and
+ // otherwise (if GetRandomNumber always succeeds in using
+ // CryptGenRandom) don't bother at all.
+ static int x = (static_cast<void>(::srand(GetRandomNumberSeed())), 0);
+ (void)x;
+ return ::rand();
+}
+
unsigned Process::GetRandomNumber() {
+ // Try to use CryptGenRandom.
HCRYPTPROV HCPC;
- if (!::CryptAcquireContextW(&HCPC, NULL, NULL, PROV_RSA_FULL,
- CRYPT_VERIFYCONTEXT))
- ReportLastErrorFatal("Could not acquire a cryptographic context");
-
- ScopedCryptContext CryptoProvider(HCPC);
- unsigned Ret;
- if (!::CryptGenRandom(CryptoProvider, sizeof(Ret),
- reinterpret_cast<BYTE *>(&Ret)))
- ReportLastErrorFatal("Could not generate a random number");
- return Ret;
+ if (::CryptAcquireContextW(&HCPC, NULL, NULL, PROV_RSA_FULL,
+ CRYPT_VERIFYCONTEXT)) {
+ ScopedCryptContext CryptoProvider(HCPC);
+ unsigned Ret;
+ if (::CryptGenRandom(CryptoProvider, sizeof(Ret),
+ reinterpret_cast<BYTE *>(&Ret)))
+ return Ret;
+ }
+
+ // If that fails, fall back to pseudo-random numbers.
+ return GetPseudoRandomNumber();
}
typedef NTSTATUS(WINAPI* RtlGetVersionPtr)(PRTL_OSVERSIONINFOW);
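
GetRandomNumber() above now degrades gracefully: if CryptAcquireContextW or CryptGenRandom fails, it falls back to rand() seeded once via a function-local static. The portable sketch below shows the same lazily-seeded-fallback pattern; tryCryptoRandom is a stand-in that always fails here, not a real API.

  #include <cstdio>
  #include <cstdlib>
  #include <ctime>

  // Stand-in for a cryptographic RNG; always "fails" in this sketch.
  static bool tryCryptoRandom(unsigned &Out) { (void)Out; return false; }

  static unsigned pseudoRandomFallback() {
    // The comma operator seeds srand() exactly once, when the function-local
    // static is initialized on first use.
    static int Once = (std::srand(unsigned(std::time(nullptr))), 0);
    (void)Once;
    return unsigned(std::rand());
  }

  unsigned getRandomNumber() {
    unsigned R;
    if (tryCryptoRandom(R))
      return R;                      // Preferred: cryptographic randomness.
    return pseudoRandomFallback();   // Fallback: seeded pseudo-random numbers.
  }

  int main() { std::printf("%u\n", getRandomNumber()); }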
diff --git a/llvm/lib/Support/Windows/Program.inc b/llvm/lib/Support/Windows/Program.inc
index a1482bf17c60..9fe05d24ec2e 100644
--- a/llvm/lib/Support/Windows/Program.inc
+++ b/llvm/lib/Support/Windows/Program.inc
@@ -10,14 +10,15 @@
//
//===----------------------------------------------------------------------===//
-#include "WindowsSupport.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/Support/ConvertUTF.h"
#include "llvm/Support/Errc.h"
#include "llvm/Support/FileSystem.h"
#include "llvm/Support/Path.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include "llvm/Support/WindowsError.h"
#include "llvm/Support/raw_ostream.h"
+#include <psapi.h>
#include <cstdio>
#include <fcntl.h>
#include <io.h>
@@ -138,7 +139,7 @@ static HANDLE RedirectIO(Optional<StringRef> Path, int fd,
if (Path->empty())
fname = "NUL";
else
- fname = *Path;
+ fname = std::string(*Path);
SECURITY_ATTRIBUTES sa;
sa.nLength = sizeof(sa);
@@ -151,7 +152,7 @@ static HANDLE RedirectIO(Optional<StringRef> Path, int fd,
if (windows::UTF8ToUTF16(fname, fnameUnicode))
return INVALID_HANDLE_VALUE;
} else {
- if (path::widenPath(fname, fnameUnicode))
+ if (sys::windows::widenPath(fname, fnameUnicode))
return INVALID_HANDLE_VALUE;
}
h = CreateFileW(fnameUnicode.data(), fd ? GENERIC_WRITE : GENERIC_READ,
@@ -263,7 +264,7 @@ static bool Execute(ProcessInfo &PI, StringRef Program,
fflush(stderr);
SmallVector<wchar_t, MAX_PATH> ProgramUtf16;
- if (std::error_code ec = path::widenPath(Program, ProgramUtf16)) {
+ if (std::error_code ec = sys::windows::widenPath(Program, ProgramUtf16)) {
SetLastError(ec.value());
MakeErrMsg(ErrMsg,
std::string("Unable to convert application name to UTF-16"));
@@ -390,7 +391,8 @@ std::string sys::flattenWindowsCommandLine(ArrayRef<StringRef> Args) {
}
ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
- bool WaitUntilChildTerminates, std::string *ErrMsg) {
+ bool WaitUntilChildTerminates, std::string *ErrMsg,
+ Optional<ProcessStatistics> *ProcStat) {
assert(PI.Pid && "invalid pid to wait on, process not started?");
assert((PI.Process && PI.Process != INVALID_HANDLE_VALUE) &&
"invalid process handle to wait on, process not started?");
@@ -401,6 +403,8 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
milliSecondsToWait = SecondsToWait * 1000;
ProcessInfo WaitResult = PI;
+ if (ProcStat)
+ ProcStat->reset();
DWORD WaitStatus = WaitForSingleObject(PI.Process, milliSecondsToWait);
if (WaitStatus == WAIT_TIMEOUT) {
if (SecondsToWait) {
@@ -421,6 +425,22 @@ ProcessInfo sys::Wait(const ProcessInfo &PI, unsigned SecondsToWait,
}
}
+ // Get process execution statistics.
+ if (ProcStat) {
+ FILETIME CreationTime, ExitTime, KernelTime, UserTime;
+ PROCESS_MEMORY_COUNTERS MemInfo;
+ if (GetProcessTimes(PI.Process, &CreationTime, &ExitTime, &KernelTime,
+ &UserTime) &&
+ GetProcessMemoryInfo(PI.Process, &MemInfo, sizeof(MemInfo))) {
+ auto UserT = std::chrono::duration_cast<std::chrono::microseconds>(
+ toDuration(UserTime));
+ auto KernelT = std::chrono::duration_cast<std::chrono::microseconds>(
+ toDuration(KernelTime));
+ uint64_t PeakMemory = MemInfo.PeakPagefileUsage / 1024;
+ *ProcStat = ProcessStatistics{UserT + KernelT, UserT, PeakMemory};
+ }
+ }
+
// Get its exit status.
DWORD status;
BOOL rc = GetExitCodeProcess(PI.Process, &status);
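
The statistics block above combines two FILETIME intervals (counts of 100-nanosecond ticks) into user, kernel, and total time, and reports PeakPagefileUsage in KiB. A small arithmetic sketch of those conversions using std::chrono only; the tick values are invented.

  #include <chrono>
  #include <cstdint>
  #include <cstdio>

  int main() {
    using namespace std::chrono;
    uint64_t UserTicks = 1250000;   // 125 ms of user time, in 100 ns ticks.
    uint64_t KernelTicks = 250000;  //  25 ms of kernel time.
    auto User = duration_cast<microseconds>(nanoseconds(100 * UserTicks));
    auto Kernel = duration_cast<microseconds>(nanoseconds(100 * KernelTicks));
    auto Total = User + Kernel;     // TotalTime = user + kernel, as above.
    uint64_t PeakKiB = uint64_t(10 * 1024 * 1024) / 1024; // 10 MiB -> 10240 KiB.
    std::printf("user=%lld us kernel=%lld us total=%lld us peak=%llu KiB\n",
                (long long)User.count(), (long long)Kernel.count(),
                (long long)Total.count(), (unsigned long long)PeakKiB);
  }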
diff --git a/llvm/lib/Support/Windows/Signals.inc b/llvm/lib/Support/Windows/Signals.inc
index 8b525f1bd4ac..0c3681fa9654 100644
--- a/llvm/lib/Support/Windows/Signals.inc
+++ b/llvm/lib/Support/Windows/Signals.inc
@@ -23,7 +23,7 @@
#include "llvm/Support/raw_ostream.h"
// The Windows.h header must be after LLVM and standard headers.
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#ifdef __MINGW32__
#include <imagehlp.h>
@@ -460,7 +460,7 @@ bool sys::RemoveFileOnSignal(StringRef Filename, std::string* ErrMsg) {
if (FilesToRemove == NULL)
FilesToRemove = new std::vector<std::string>;
- FilesToRemove->push_back(Filename);
+ FilesToRemove->push_back(std::string(Filename));
LeaveCriticalSection(&CriticalSection);
return false;
@@ -584,7 +584,7 @@ void llvm::sys::AddSignalHandler(sys::SignalHandlerCallback FnPtr,
LeaveCriticalSection(&CriticalSection);
}
-static void Cleanup() {
+static void Cleanup(bool ExecuteSignalHandlers) {
if (CleanupExecuted)
return;
@@ -600,7 +600,10 @@ static void Cleanup() {
llvm::sys::fs::remove(FilesToRemove->back());
FilesToRemove->pop_back();
}
- llvm::sys::RunSignalHandlers();
+
+ if (ExecuteSignalHandlers)
+ llvm::sys::RunSignalHandlers();
+
LeaveCriticalSection(&CriticalSection);
}
@@ -610,7 +613,7 @@ void llvm::sys::RunInterruptHandlers() {
// error handler). We must ensure that the critical section is properly
// initialized.
InitializeThreading();
- Cleanup();
+ Cleanup(true);
}
/// Find the Windows Registry Key for a given location.
@@ -803,7 +806,7 @@ void sys::CleanupOnSignal(uintptr_t Context) {
}
static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
- Cleanup();
+ Cleanup(true);
// We'll automatically write a Minidump file here to help diagnose
// the nasty sorts of crashes that aren't 100% reproducible from a set of
@@ -820,7 +823,13 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
<< "\n";
}
- LocalPrintStackTrace(llvm::errs(), ep ? ep->ContextRecord : nullptr);
+ // Stack unwinding appears to modify the context. Copy it to preserve the
+ // caller's context.
+ CONTEXT ContextCopy;
+ if (ep)
+ memcpy(&ContextCopy, ep->ContextRecord, sizeof(ContextCopy));
+
+ LocalPrintStackTrace(llvm::errs(), ep ? &ContextCopy : nullptr);
return EXCEPTION_EXECUTE_HANDLER;
}
@@ -828,7 +837,10 @@ static LONG WINAPI LLVMUnhandledExceptionFilter(LPEXCEPTION_POINTERS ep) {
static BOOL WINAPI LLVMConsoleCtrlHandler(DWORD dwCtrlType) {
// We are running in our very own thread, courtesy of Windows.
EnterCriticalSection(&CriticalSection);
- Cleanup();
+ // This function is only ever called when a CTRL-C or similar control signal
+ // is fired. Killing a process in this way is normal, so don't trigger the
+ // signal handlers.
+ Cleanup(false);
// If an interrupt function has been set, go and run it; otherwise,
// the process dies.
diff --git a/llvm/lib/Support/Windows/ThreadLocal.inc b/llvm/lib/Support/Windows/ThreadLocal.inc
index 1e0ed955e9ab..696e5c843ead 100644
--- a/llvm/lib/Support/Windows/ThreadLocal.inc
+++ b/llvm/lib/Support/Windows/ThreadLocal.inc
@@ -15,7 +15,7 @@
//=== is guaranteed to work on *all* Win32 variants.
//===----------------------------------------------------------------------===//
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include "llvm/Support/ThreadLocal.h"
namespace llvm {
diff --git a/llvm/lib/Support/Windows/Threading.inc b/llvm/lib/Support/Windows/Threading.inc
index 9456efa686ff..296e87b77695 100644
--- a/llvm/lib/Support/Windows/Threading.inc
+++ b/llvm/lib/Support/Windows/Threading.inc
@@ -13,9 +13,11 @@
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
-#include "WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#include <process.h>
+#include <bitset>
+
// Windows will at times define MemoryFence.
#ifdef MemoryFence
#undef MemoryFence
@@ -122,3 +124,175 @@ SetThreadPriorityResult llvm::set_thread_priority(ThreadPriority Priority) {
? SetThreadPriorityResult::SUCCESS
: SetThreadPriorityResult::FAILURE;
}
+
+struct ProcessorGroup {
+ unsigned ID;
+ unsigned AllThreads;
+ unsigned UsableThreads;
+ unsigned ThreadsPerCore;
+ uint64_t Affinity;
+
+ unsigned useableCores() const {
+ return std::max(1U, UsableThreads / ThreadsPerCore);
+ }
+};
+
+template <typename F>
+static bool IterateProcInfo(LOGICAL_PROCESSOR_RELATIONSHIP Relationship, F Fn) {
+ DWORD Len = 0;
+ BOOL R = ::GetLogicalProcessorInformationEx(Relationship, NULL, &Len);
+ if (R || GetLastError() != ERROR_INSUFFICIENT_BUFFER) {
+ return false;
+ }
+ auto *Info = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)calloc(1, Len);
+ R = ::GetLogicalProcessorInformationEx(Relationship, Info, &Len);
+ if (R) {
+ auto *End =
+ (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Info + Len);
+ for (auto *Curr = Info; Curr < End;
+ Curr = (SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *)((uint8_t *)Curr +
+ Curr->Size)) {
+ if (Curr->Relationship != Relationship)
+ continue;
+ Fn(Curr);
+ }
+ }
+ free(Info);
+ return true;
+}
+
+static ArrayRef<ProcessorGroup> getProcessorGroups() {
+ auto computeGroups = []() {
+ SmallVector<ProcessorGroup, 4> Groups;
+
+ auto HandleGroup = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+ GROUP_RELATIONSHIP &El = ProcInfo->Group;
+ for (unsigned J = 0; J < El.ActiveGroupCount; ++J) {
+ ProcessorGroup G;
+ G.ID = Groups.size();
+ G.AllThreads = El.GroupInfo[J].MaximumProcessorCount;
+ G.UsableThreads = El.GroupInfo[J].ActiveProcessorCount;
+ assert(G.UsableThreads <= 64);
+ G.Affinity = El.GroupInfo[J].ActiveProcessorMask;
+ Groups.push_back(G);
+ }
+ };
+
+ if (!IterateProcInfo(RelationGroup, HandleGroup))
+ return std::vector<ProcessorGroup>();
+
+ auto HandleProc = [&](SYSTEM_LOGICAL_PROCESSOR_INFORMATION_EX *ProcInfo) {
+ PROCESSOR_RELATIONSHIP &El = ProcInfo->Processor;
+ assert(El.GroupCount == 1);
+ unsigned NumHyperThreads = 1;
+ // If the flag is set, each core supports more than one hyper-thread.
+ if (El.Flags & LTP_PC_SMT)
+ NumHyperThreads = std::bitset<64>(El.GroupMask[0].Mask).count();
+ unsigned I = El.GroupMask[0].Group;
+ Groups[I].ThreadsPerCore = NumHyperThreads;
+ };
+
+ if (!IterateProcInfo(RelationProcessorCore, HandleProc))
+ return std::vector<ProcessorGroup>();
+
+ // If there's an affinity mask set on one of the CPUs, then assume the user
+ // wants to constrain the current process to only a single CPU.
+ for (auto &G : Groups) {
+ if (G.UsableThreads != G.AllThreads) {
+ ProcessorGroup NewG{G};
+ Groups.clear();
+ Groups.push_back(NewG);
+ break;
+ }
+ }
+
+ return std::vector<ProcessorGroup>(Groups.begin(), Groups.end());
+ };
+ static auto Groups = computeGroups();
+ return ArrayRef<ProcessorGroup>(Groups);
+}
+
+template <typename R, typename UnaryPredicate>
+static unsigned aggregate(R &&Range, UnaryPredicate P) {
+ unsigned I{};
+ for (const auto &It : Range)
+ I += P(It);
+ return I;
+}
+
+// for sys::getHostNumPhysicalCores
+int computeHostNumPhysicalCores() {
+ static unsigned Cores =
+ aggregate(getProcessorGroups(), [](const ProcessorGroup &G) {
+ return G.UsableThreads / G.ThreadsPerCore;
+ });
+ return Cores;
+}
+
+int computeHostNumHardwareThreads() {
+ static unsigned Threads =
+ aggregate(getProcessorGroups(),
+ [](const ProcessorGroup &G) { return G.UsableThreads; });
+ return Threads;
+}
+
+// Finds the proper CPU socket where a thread number should go. Returns 'None'
+// if the thread should remain on the current CPU socket.
+Optional<unsigned>
+llvm::ThreadPoolStrategy::compute_cpu_socket(unsigned ThreadPoolNum) const {
+ ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+ // Only one CPU socket in the system, or a process affinity mask was set; no
+ // need to move the thread(s) to another CPU socket.
+ if (Groups.size() <= 1)
+ return None;
+
+ // We ask for fewer threads than there are hardware threads per CPU socket;
+ // no need to dispatch threads to other CPU sockets.
+ unsigned MaxThreadsPerSocket =
+ UseHyperThreads ? Groups[0].UsableThreads : Groups[0].useableCores();
+ if (compute_thread_count() <= MaxThreadsPerSocket)
+ return None;
+
+ assert(ThreadPoolNum < compute_thread_count() &&
+ "The thread index is not within thread strategy's range!");
+
+ // Assumes the same number of hardware threads per CPU socket.
+ return (ThreadPoolNum * Groups.size()) / compute_thread_count();
+}
+
+// Assign the current thread to a more appropriate CPU socket or CPU group
+void llvm::ThreadPoolStrategy::apply_thread_strategy(
+ unsigned ThreadPoolNum) const {
+ Optional<unsigned> Socket = compute_cpu_socket(ThreadPoolNum);
+ if (!Socket)
+ return;
+ ArrayRef<ProcessorGroup> Groups = getProcessorGroups();
+ GROUP_AFFINITY Affinity{};
+ Affinity.Group = Groups[*Socket].ID;
+ Affinity.Mask = Groups[*Socket].Affinity;
+ SetThreadGroupAffinity(GetCurrentThread(), &Affinity, nullptr);
+}
+
+llvm::BitVector llvm::get_thread_affinity_mask() {
+ GROUP_AFFINITY Affinity{};
+ GetThreadGroupAffinity(GetCurrentThread(), &Affinity);
+
+ static unsigned All =
+ aggregate(getProcessorGroups(),
+ [](const ProcessorGroup &G) { return G.AllThreads; });
+
+ unsigned StartOffset =
+ aggregate(getProcessorGroups(), [&](const ProcessorGroup &G) {
+ return G.ID < Affinity.Group ? G.AllThreads : 0;
+ });
+
+ llvm::BitVector V;
+ V.resize(All);
+ for (unsigned I = 0; I < sizeof(KAFFINITY) * 8; ++I) {
+ if ((Affinity.Mask >> I) & 1)
+ V.set(StartOffset + I);
+ }
+ return V;
+}
+
+unsigned llvm::get_cpus() { return getProcessorGroups().size(); }
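
compute_cpu_socket() above spreads pool threads across sockets with the integer mapping (ThreadPoolNum * NumGroups) / ThreadCount. A quick standalone check of that mapping with invented numbers (2 sockets, a pool of 8):

  #include <cstdio>

  int main() {
    const unsigned NumGroups = 2;   // CPU sockets (invented).
    const unsigned ThreadCount = 8; // Thread pool size (invented).
    for (unsigned N = 0; N < ThreadCount; ++N)
      std::printf("pool thread %u -> socket %u\n", N,
                  (N * NumGroups) / ThreadCount);
    // Threads 0-3 land on socket 0, threads 4-7 on socket 1.
    return 0;
  }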
diff --git a/llvm/lib/Support/Windows/WindowsSupport.h b/llvm/lib/Support/Windows/WindowsSupport.h
deleted file mode 100644
index bb7e79b86018..000000000000
--- a/llvm/lib/Support/Windows/WindowsSupport.h
+++ /dev/null
@@ -1,243 +0,0 @@
-//===- WindowsSupport.h - Common Windows Include File -----------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file defines things specific to Windows implementations. In addition to
-// providing some helpers for working with win32 APIs, this header wraps
-// <windows.h> with some portability macros. Always include WindowsSupport.h
-// instead of including <windows.h> directly.
-//
-//===----------------------------------------------------------------------===//
-
-//===----------------------------------------------------------------------===//
-//=== WARNING: Implementation here must contain only generic Win32 code that
-//=== is guaranteed to work on *all* Win32 variants.
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_SUPPORT_WINDOWSSUPPORT_H
-#define LLVM_SUPPORT_WINDOWSSUPPORT_H
-
-// mingw-w64 tends to define it as 0x0502 in its headers.
-#undef _WIN32_WINNT
-#undef _WIN32_IE
-
-// Require at least Windows 7 API.
-#define _WIN32_WINNT 0x0601
-#define _WIN32_IE 0x0800 // MinGW at it again. FIXME: verify if still needed.
-#define WIN32_LEAN_AND_MEAN
-#ifndef NOMINMAX
-#define NOMINMAX
-#endif
-
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/ADT/StringExtras.h"
-#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Config/config.h" // Get build system configuration settings
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/Chrono.h"
-#include "llvm/Support/Compiler.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/VersionTuple.h"
-#include <cassert>
-#include <string>
-#include <system_error>
-#include <windows.h>
-
-// Must be included after windows.h
-#include <wincrypt.h>
-
-namespace llvm {
-
-/// Determines if the program is running on Windows 8 or newer. This
-/// reimplements one of the helpers in the Windows 8.1 SDK, which are intended
-/// to supercede raw calls to GetVersionEx. Old SDKs, Cygwin, and MinGW don't
-/// yet have VersionHelpers.h, so we have our own helper.
-bool RunningWindows8OrGreater();
-
-/// Returns the Windows version as Major.Minor.0.BuildNumber. Uses
-/// RtlGetVersion or GetVersionEx under the hood depending on what is available.
-/// GetVersionEx is deprecated, but this API exposes the build number which can
-/// be useful for working around certain kernel bugs.
-llvm::VersionTuple GetWindowsOSVersion();
-
-bool MakeErrMsg(std::string *ErrMsg, const std::string &prefix);
-
-// Include GetLastError() in a fatal error message.
-LLVM_ATTRIBUTE_NORETURN inline void ReportLastErrorFatal(const char *Msg) {
- std::string ErrMsg;
- MakeErrMsg(&ErrMsg, Msg);
- llvm::report_fatal_error(ErrMsg);
-}
-
-template <typename HandleTraits>
-class ScopedHandle {
- typedef typename HandleTraits::handle_type handle_type;
- handle_type Handle;
-
- ScopedHandle(const ScopedHandle &other) = delete;
- void operator=(const ScopedHandle &other) = delete;
-public:
- ScopedHandle()
- : Handle(HandleTraits::GetInvalid()) {}
-
- explicit ScopedHandle(handle_type h)
- : Handle(h) {}
-
- ~ScopedHandle() {
- if (HandleTraits::IsValid(Handle))
- HandleTraits::Close(Handle);
- }
-
- handle_type take() {
- handle_type t = Handle;
- Handle = HandleTraits::GetInvalid();
- return t;
- }
-
- ScopedHandle &operator=(handle_type h) {
- if (HandleTraits::IsValid(Handle))
- HandleTraits::Close(Handle);
- Handle = h;
- return *this;
- }
-
- // True if Handle is valid.
- explicit operator bool() const {
- return HandleTraits::IsValid(Handle) ? true : false;
- }
-
- operator handle_type() const {
- return Handle;
- }
-};
-
-struct CommonHandleTraits {
- typedef HANDLE handle_type;
-
- static handle_type GetInvalid() {
- return INVALID_HANDLE_VALUE;
- }
-
- static void Close(handle_type h) {
- ::CloseHandle(h);
- }
-
- static bool IsValid(handle_type h) {
- return h != GetInvalid();
- }
-};
-
-struct JobHandleTraits : CommonHandleTraits {
- static handle_type GetInvalid() {
- return NULL;
- }
-};
-
-struct CryptContextTraits : CommonHandleTraits {
- typedef HCRYPTPROV handle_type;
-
- static handle_type GetInvalid() {
- return 0;
- }
-
- static void Close(handle_type h) {
- ::CryptReleaseContext(h, 0);
- }
-
- static bool IsValid(handle_type h) {
- return h != GetInvalid();
- }
-};
-
-struct RegTraits : CommonHandleTraits {
- typedef HKEY handle_type;
-
- static handle_type GetInvalid() {
- return NULL;
- }
-
- static void Close(handle_type h) {
- ::RegCloseKey(h);
- }
-
- static bool IsValid(handle_type h) {
- return h != GetInvalid();
- }
-};
-
-struct FindHandleTraits : CommonHandleTraits {
- static void Close(handle_type h) {
- ::FindClose(h);
- }
-};
-
-struct FileHandleTraits : CommonHandleTraits {};
-
-typedef ScopedHandle<CommonHandleTraits> ScopedCommonHandle;
-typedef ScopedHandle<FileHandleTraits> ScopedFileHandle;
-typedef ScopedHandle<CryptContextTraits> ScopedCryptContext;
-typedef ScopedHandle<RegTraits> ScopedRegHandle;
-typedef ScopedHandle<FindHandleTraits> ScopedFindHandle;
-typedef ScopedHandle<JobHandleTraits> ScopedJobHandle;
-
-template <class T>
-class SmallVectorImpl;
-
-template <class T>
-typename SmallVectorImpl<T>::const_pointer
-c_str(SmallVectorImpl<T> &str) {
- str.push_back(0);
- str.pop_back();
- return str.data();
-}
-
-namespace sys {
-
-inline std::chrono::nanoseconds toDuration(FILETIME Time) {
- ULARGE_INTEGER TimeInteger;
- TimeInteger.LowPart = Time.dwLowDateTime;
- TimeInteger.HighPart = Time.dwHighDateTime;
-
- // FILETIME's are # of 100 nanosecond ticks (1/10th of a microsecond)
- return std::chrono::nanoseconds(100 * TimeInteger.QuadPart);
-}
-
-inline TimePoint<> toTimePoint(FILETIME Time) {
- ULARGE_INTEGER TimeInteger;
- TimeInteger.LowPart = Time.dwLowDateTime;
- TimeInteger.HighPart = Time.dwHighDateTime;
-
- // Adjust for different epoch
- TimeInteger.QuadPart -= 11644473600ll * 10000000;
-
- // FILETIME's are # of 100 nanosecond ticks (1/10th of a microsecond)
- return TimePoint<>(std::chrono::nanoseconds(100 * TimeInteger.QuadPart));
-}
-
-inline FILETIME toFILETIME(TimePoint<> TP) {
- ULARGE_INTEGER TimeInteger;
- TimeInteger.QuadPart = TP.time_since_epoch().count() / 100;
- TimeInteger.QuadPart += 11644473600ll * 10000000;
-
- FILETIME Time;
- Time.dwLowDateTime = TimeInteger.LowPart;
- Time.dwHighDateTime = TimeInteger.HighPart;
- return Time;
-}
-
-namespace windows {
-// Returns command line arguments. Unlike arguments given to main(),
-// this function guarantees that the returned arguments are encoded in
-// UTF-8 regardless of the current code page setting.
-std::error_code GetCommandLineArguments(SmallVectorImpl<const char *> &Args,
- BumpPtrAllocator &Alloc);
-} // end namespace windows
-} // end namespace sys
-} // end namespace llvm.
-
-#endif
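
The toTimePoint/toFILETIME helpers in the header being moved shift between the Windows epoch (1601-01-01) and the Unix epoch (1970-01-01) with the constant 11644473600 seconds, then scale by 10^7 because FILETIME counts 100 ns ticks. A short check of where that constant comes from (369 years, 89 of which contain a leap day):

  #include <cassert>
  #include <cstdint>

  int main() {
    const uint64_t Years = 1970 - 1601;                  // 369 whole years.
    // Leap days in 1601..1969: every 4th year, minus 1700, 1800 and 1900.
    const uint64_t LeapDays = (1968 - 1604) / 4 + 1 - 3; // 89
    const uint64_t Seconds = (Years * 365 + LeapDays) * 86400;
    assert(Seconds == 11644473600ull);
    (void)Seconds;
    return 0;
  }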
diff --git a/llvm/lib/Support/WithColor.cpp b/llvm/lib/Support/WithColor.cpp
index 345dd9cf3949..cb5f413d44b7 100644
--- a/llvm/lib/Support/WithColor.cpp
+++ b/llvm/lib/Support/WithColor.cpp
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Support/WithColor.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -18,8 +18,8 @@ static cl::opt<cl::boolOrDefault>
cl::desc("Use colors in output (default=autodetect)"),
cl::init(cl::BOU_UNSET));
-WithColor::WithColor(raw_ostream &OS, HighlightColor Color, bool DisableColors)
- : OS(OS), DisableColors(DisableColors) {
+WithColor::WithColor(raw_ostream &OS, HighlightColor Color, ColorMode Mode)
+ : OS(OS), Mode(Mode) {
// Detect color from terminal type unless the user passed the --color option.
if (colorsEnabled()) {
switch (Color) {
@@ -69,7 +69,9 @@ raw_ostream &WithColor::error(raw_ostream &OS, StringRef Prefix,
bool DisableColors) {
if (!Prefix.empty())
OS << Prefix << ": ";
- return WithColor(OS, HighlightColor::Error, DisableColors).get()
+ return WithColor(OS, HighlightColor::Error,
+ DisableColors ? ColorMode::Disable : ColorMode::Auto)
+ .get()
<< "error: ";
}
@@ -77,7 +79,9 @@ raw_ostream &WithColor::warning(raw_ostream &OS, StringRef Prefix,
bool DisableColors) {
if (!Prefix.empty())
OS << Prefix << ": ";
- return WithColor(OS, HighlightColor::Warning, DisableColors).get()
+ return WithColor(OS, HighlightColor::Warning,
+ DisableColors ? ColorMode::Disable : ColorMode::Auto)
+ .get()
<< "warning: ";
}
@@ -85,23 +89,33 @@ raw_ostream &WithColor::note(raw_ostream &OS, StringRef Prefix,
bool DisableColors) {
if (!Prefix.empty())
OS << Prefix << ": ";
- return WithColor(OS, HighlightColor::Note, DisableColors).get() << "note: ";
+ return WithColor(OS, HighlightColor::Note,
+ DisableColors ? ColorMode::Disable : ColorMode::Auto)
+ .get()
+ << "note: ";
}
raw_ostream &WithColor::remark(raw_ostream &OS, StringRef Prefix,
bool DisableColors) {
if (!Prefix.empty())
OS << Prefix << ": ";
- return WithColor(OS, HighlightColor::Remark, DisableColors).get()
+ return WithColor(OS, HighlightColor::Remark,
+ DisableColors ? ColorMode::Disable : ColorMode::Auto)
+ .get()
<< "remark: ";
}
bool WithColor::colorsEnabled() {
- if (DisableColors)
+ switch (Mode) {
+ case ColorMode::Enable:
+ return true;
+ case ColorMode::Disable:
return false;
- if (UseColor == cl::BOU_UNSET)
- return OS.has_colors();
- return UseColor == cl::BOU_TRUE;
+ case ColorMode::Auto:
+ return UseColor == cl::BOU_UNSET ? OS.has_colors()
+ : UseColor == cl::BOU_TRUE;
+ }
+ llvm_unreachable("All cases handled above.");
}
WithColor &WithColor::changeColor(raw_ostream::Colors Color, bool Bold,
@@ -118,3 +132,15 @@ WithColor &WithColor::resetColor() {
}
WithColor::~WithColor() { resetColor(); }
+
+void WithColor::defaultErrorHandler(Error Err) {
+ handleAllErrors(std::move(Err), [](ErrorInfoBase &Info) {
+ WithColor::error() << Info.message() << '\n';
+ });
+}
+
+void WithColor::defaultWarningHandler(Error Warning) {
+ handleAllErrors(std::move(Warning), [](ErrorInfoBase &Info) {
+ WithColor::warning() << Info.message() << '\n';
+ });
+}
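
With the switch to ColorMode, callers of WithColor can force colors on as well as off instead of only disabling them. A hedged usage sketch, assuming the three-argument constructor and the ColorMode values shown in the hunk above:

  #include "llvm/Support/WithColor.h"
  #include "llvm/Support/raw_ostream.h"

  using namespace llvm;

  int main() {
    // Auto: colorize only when errs() is a terminal (subject to --color).
    WithColor(errs(), HighlightColor::Error, ColorMode::Auto).get()
        << "error: auto-detected\n";
    // Disable: never emit escape codes, e.g. when writing to a log file.
    WithColor(errs(), HighlightColor::Warning, ColorMode::Disable).get()
        << "warning: plain text\n";
    // Enable: force escape codes even when the output is piped.
    WithColor(errs(), HighlightColor::Remark, ColorMode::Enable).get()
        << "remark: forced color\n";
    return 0;
  }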
diff --git a/llvm/lib/Support/X86TargetParser.cpp b/llvm/lib/Support/X86TargetParser.cpp
new file mode 100644
index 000000000000..572d1203aaf2
--- /dev/null
+++ b/llvm/lib/Support/X86TargetParser.cpp
@@ -0,0 +1,595 @@
+//===-- X86TargetParser - Parser for X86 features ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements a target parser to recognise X86 hardware features.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Support/X86TargetParser.h"
+#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
+
+using namespace llvm;
+using namespace llvm::X86;
+
+namespace {
+
+/// Container class for CPU features.
+/// This is a constexpr reimplementation of a subset of std::bitset. It would be
+/// nice to use std::bitset directly, but it doesn't support constant
+/// initialization.
+class FeatureBitset {
+ static constexpr unsigned NUM_FEATURE_WORDS =
+ (X86::CPU_FEATURE_MAX + 31) / 32;
+
+ // This cannot be a std::array; operator[] is not constexpr until C++17.
+ uint32_t Bits[NUM_FEATURE_WORDS] = {};
+
+public:
+ constexpr FeatureBitset() = default;
+ constexpr FeatureBitset(std::initializer_list<unsigned> Init) {
+ for (auto I : Init)
+ set(I);
+ }
+
+ constexpr FeatureBitset &set(unsigned I) {
+ // GCC <6.2 crashes if this is written in a single statement.
+ uint32_t NewBits = Bits[I / 32] | (uint32_t(1) << (I % 32));
+ Bits[I / 32] = NewBits;
+ return *this;
+ }
+
+ constexpr bool operator[](unsigned I) const {
+ uint32_t Mask = uint32_t(1) << (I % 32);
+ return (Bits[I / 32] & Mask) != 0;
+ }
+
+ constexpr FeatureBitset &operator&=(const FeatureBitset &RHS) {
+ for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) {
+ // GCC <6.2 crashes if this is written in a single statement.
+ uint32_t NewBits = Bits[I] & RHS.Bits[I];
+ Bits[I] = NewBits;
+ }
+ return *this;
+ }
+
+ constexpr FeatureBitset &operator|=(const FeatureBitset &RHS) {
+ for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I) {
+ // GCC <6.2 crashes if this is written in a single statement.
+ uint32_t NewBits = Bits[I] | RHS.Bits[I];
+ Bits[I] = NewBits;
+ }
+ return *this;
+ }
+
+ // gcc 5.3 miscompiles this if we try to write this using operator&=.
+ constexpr FeatureBitset operator&(const FeatureBitset &RHS) const {
+ FeatureBitset Result;
+ for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
+ Result.Bits[I] = Bits[I] & RHS.Bits[I];
+ return Result;
+ }
+
+ // gcc 5.3 miscompiles this if we try to write this using operator&=.
+ constexpr FeatureBitset operator|(const FeatureBitset &RHS) const {
+ FeatureBitset Result;
+ for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
+ Result.Bits[I] = Bits[I] | RHS.Bits[I];
+ return Result;
+ }
+
+ constexpr FeatureBitset operator~() const {
+ FeatureBitset Result;
+ for (unsigned I = 0, E = array_lengthof(Bits); I != E; ++I)
+ Result.Bits[I] = ~Bits[I];
+ return Result;
+ }
+};
+
+struct ProcInfo {
+ StringLiteral Name;
+ X86::CPUKind Kind;
+ unsigned KeyFeature;
+ FeatureBitset Features;
+};
+
+struct FeatureInfo {
+ StringLiteral Name;
+ FeatureBitset ImpliedFeatures;
+};
+
+} // end anonymous namespace
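
// Aside for the reader, not part of the patch: the hand-rolled FeatureBitset
// above exists because std::bitset cannot be composed as a compile-time
// constant in C++14. A minimal standalone illustration with a two-word bitset
// and invented feature indices (TinyBitset is illustrative only):
#include <cstdint>
#include <initializer_list>

class TinyBitset {
  uint32_t Bits[2] = {};

public:
  constexpr TinyBitset() = default;
  constexpr TinyBitset(std::initializer_list<unsigned> Init) {
    for (unsigned I : Init)
      Bits[I / 32] |= uint32_t(1) << (I % 32);
  }
  constexpr bool operator[](unsigned I) const {
    return (Bits[I / 32] >> (I % 32)) & 1;
  }
  constexpr TinyBitset operator|(const TinyBitset &RHS) const {
    TinyBitset R;
    for (unsigned I = 0; I != 2; ++I)
      R.Bits[I] = Bits[I] | RHS.Bits[I];
    return R;
  }
};

constexpr TinyBitset FeatureA{1}, FeatureB{40};
constexpr TinyBitset Both = FeatureA | FeatureB;
static_assert(Both[1] && Both[40], "composable in constant initialization");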
+
+#define X86_FEATURE(ENUM, STRING) \
+ static constexpr FeatureBitset Feature##ENUM = {X86::FEATURE_##ENUM};
+#include "llvm/Support/X86TargetParser.def"
+
+// Pentium with MMX.
+static constexpr FeatureBitset FeaturesPentiumMMX =
+ FeatureX87 | FeatureCMPXCHG8B | FeatureMMX;
+
+// Pentium 2 and 3.
+static constexpr FeatureBitset FeaturesPentium2 =
+ FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | FeatureFXSR;
+static constexpr FeatureBitset FeaturesPentium3 = FeaturesPentium2 | FeatureSSE;
+
+// Pentium 4 CPUs
+static constexpr FeatureBitset FeaturesPentium4 =
+ FeaturesPentium3 | FeatureSSE2;
+static constexpr FeatureBitset FeaturesPrescott =
+ FeaturesPentium4 | FeatureSSE3;
+static constexpr FeatureBitset FeaturesNocona =
+ FeaturesPrescott | Feature64BIT | FeatureCMPXCHG16B;
+
+// Basic 64-bit capable CPU.
+static constexpr FeatureBitset FeaturesX86_64 = FeaturesPentium4 | Feature64BIT;
+
+// Intel Core CPUs
+static constexpr FeatureBitset FeaturesCore2 =
+ FeaturesNocona | FeatureSAHF | FeatureSSSE3;
+static constexpr FeatureBitset FeaturesPenryn = FeaturesCore2 | FeatureSSE4_1;
+static constexpr FeatureBitset FeaturesNehalem =
+ FeaturesPenryn | FeaturePOPCNT | FeatureSSE4_2;
+static constexpr FeatureBitset FeaturesWestmere =
+ FeaturesNehalem | FeaturePCLMUL;
+static constexpr FeatureBitset FeaturesSandyBridge =
+ FeaturesWestmere | FeatureAVX | FeatureXSAVE | FeatureXSAVEOPT;
+static constexpr FeatureBitset FeaturesIvyBridge =
+ FeaturesSandyBridge | FeatureF16C | FeatureFSGSBASE | FeatureRDRND;
+static constexpr FeatureBitset FeaturesHaswell =
+ FeaturesIvyBridge | FeatureAVX2 | FeatureBMI | FeatureBMI2 | FeatureFMA |
+ FeatureINVPCID | FeatureLZCNT | FeatureMOVBE;
+static constexpr FeatureBitset FeaturesBroadwell =
+ FeaturesHaswell | FeatureADX | FeaturePRFCHW | FeatureRDSEED;
+
+// Intel Knights Landing and Knights Mill
+// Knights Landing has feature parity with Broadwell.
+static constexpr FeatureBitset FeaturesKNL =
+ FeaturesBroadwell | FeatureAES | FeatureAVX512F | FeatureAVX512CD |
+ FeatureAVX512ER | FeatureAVX512PF | FeaturePREFETCHWT1;
+static constexpr FeatureBitset FeaturesKNM =
+ FeaturesKNL | FeatureAVX512VPOPCNTDQ;
+
+// Intel Skylake processors.
+static constexpr FeatureBitset FeaturesSkylakeClient =
+ FeaturesBroadwell | FeatureAES | FeatureCLFLUSHOPT | FeatureXSAVEC |
+ FeatureXSAVES | FeatureSGX;
+// SkylakeServer inherits all SkylakeClient features except SGX.
+// FIXME: That doesn't match gcc.
+static constexpr FeatureBitset FeaturesSkylakeServer =
+ (FeaturesSkylakeClient & ~FeatureSGX) | FeatureAVX512F | FeatureAVX512CD |
+ FeatureAVX512DQ | FeatureAVX512BW | FeatureAVX512VL | FeatureCLWB |
+ FeaturePKU;
+static constexpr FeatureBitset FeaturesCascadeLake =
+ FeaturesSkylakeServer | FeatureAVX512VNNI;
+static constexpr FeatureBitset FeaturesCooperLake =
+ FeaturesCascadeLake | FeatureAVX512BF16;
+
+// Intel 10nm processors.
+static constexpr FeatureBitset FeaturesCannonlake =
+ FeaturesSkylakeClient | FeatureAVX512F | FeatureAVX512CD | FeatureAVX512DQ |
+ FeatureAVX512BW | FeatureAVX512VL | FeatureAVX512IFMA | FeatureAVX512VBMI |
+ FeaturePKU | FeatureSHA;
+static constexpr FeatureBitset FeaturesICLClient =
+ FeaturesCannonlake | FeatureAVX512BITALG | FeatureAVX512VBMI2 |
+ FeatureAVX512VNNI | FeatureAVX512VPOPCNTDQ | FeatureCLWB | FeatureGFNI |
+ FeatureRDPID | FeatureVAES | FeatureVPCLMULQDQ;
+static constexpr FeatureBitset FeaturesICLServer =
+ FeaturesICLClient | FeaturePCONFIG | FeatureWBNOINVD;
+static constexpr FeatureBitset FeaturesTigerlake =
+ FeaturesICLClient | FeatureAVX512VP2INTERSECT | FeatureMOVDIR64B |
+ FeatureMOVDIRI | FeatureSHSTK;
+
+// Intel Atom processors.
+// Bonnell has feature parity with Core2 and adds MOVBE.
+static constexpr FeatureBitset FeaturesBonnell = FeaturesCore2 | FeatureMOVBE;
+// Silvermont has parity with Westmere and Bonnell plus PRFCHW and RDRND.
+static constexpr FeatureBitset FeaturesSilvermont =
+ FeaturesBonnell | FeaturesWestmere | FeaturePRFCHW | FeatureRDRND;
+static constexpr FeatureBitset FeaturesGoldmont =
+ FeaturesSilvermont | FeatureAES | FeatureCLFLUSHOPT | FeatureFSGSBASE |
+ FeatureRDSEED | FeatureSHA | FeatureXSAVE | FeatureXSAVEC |
+ FeatureXSAVEOPT | FeatureXSAVES;
+static constexpr FeatureBitset FeaturesGoldmontPlus =
+ FeaturesGoldmont | FeaturePTWRITE | FeatureRDPID | FeatureSGX;
+static constexpr FeatureBitset FeaturesTremont =
+ FeaturesGoldmontPlus | FeatureCLWB | FeatureGFNI;
+
+// Geode Processor.
+static constexpr FeatureBitset FeaturesGeode =
+ FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | Feature3DNOW | Feature3DNOWA;
+
+// K6 processor.
+static constexpr FeatureBitset FeaturesK6 =
+ FeatureX87 | FeatureCMPXCHG8B | FeatureMMX;
+
+// K7 and K8 architecture processors.
+static constexpr FeatureBitset FeaturesAthlon =
+ FeatureX87 | FeatureCMPXCHG8B | FeatureMMX | Feature3DNOW | Feature3DNOWA;
+static constexpr FeatureBitset FeaturesAthlonXP =
+ FeaturesAthlon | FeatureFXSR | FeatureSSE;
+static constexpr FeatureBitset FeaturesK8 =
+ FeaturesAthlonXP | FeatureSSE2 | Feature64BIT;
+static constexpr FeatureBitset FeaturesK8SSE3 = FeaturesK8 | FeatureSSE3;
+static constexpr FeatureBitset FeaturesAMDFAM10 =
+ FeaturesK8SSE3 | FeatureCMPXCHG16B | FeatureLZCNT | FeaturePOPCNT |
+ FeaturePRFCHW | FeatureSAHF | FeatureSSE4_A;
+
+// Bobcat architecture processors.
+static constexpr FeatureBitset FeaturesBTVER1 =
+ FeatureX87 | FeatureCMPXCHG8B | FeatureCMPXCHG16B | Feature64BIT |
+ FeatureFXSR | FeatureLZCNT | FeatureMMX | FeaturePOPCNT | FeaturePRFCHW |
+ FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 | FeatureSSE4_A |
+ FeatureSAHF;
+static constexpr FeatureBitset FeaturesBTVER2 =
+ FeaturesBTVER1 | FeatureAES | FeatureAVX | FeatureBMI | FeatureF16C |
+ FeatureMOVBE | FeaturePCLMUL | FeatureXSAVE | FeatureXSAVEOPT;
+
+// AMD Bulldozer architecture processors.
+static constexpr FeatureBitset FeaturesBDVER1 =
+ FeatureX87 | FeatureAES | FeatureAVX | FeatureCMPXCHG8B |
+ FeatureCMPXCHG16B | Feature64BIT | FeatureFMA4 | FeatureFXSR | FeatureLWP |
+ FeatureLZCNT | FeatureMMX | FeaturePCLMUL | FeaturePOPCNT | FeaturePRFCHW |
+ FeatureSAHF | FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 |
+ FeatureSSE4_1 | FeatureSSE4_2 | FeatureSSE4_A | FeatureXOP | FeatureXSAVE;
+static constexpr FeatureBitset FeaturesBDVER2 =
+ FeaturesBDVER1 | FeatureBMI | FeatureFMA | FeatureF16C | FeatureTBM;
+static constexpr FeatureBitset FeaturesBDVER3 =
+ FeaturesBDVER2 | FeatureFSGSBASE | FeatureXSAVEOPT;
+static constexpr FeatureBitset FeaturesBDVER4 =
+ FeaturesBDVER3 | FeatureAVX2 | FeatureBMI2 | FeatureMOVBE | FeatureMWAITX |
+ FeatureRDRND;
+
+// AMD Zen architecture processors.
+static constexpr FeatureBitset FeaturesZNVER1 =
+ FeatureX87 | FeatureADX | FeatureAES | FeatureAVX | FeatureAVX2 |
+ FeatureBMI | FeatureBMI2 | FeatureCLFLUSHOPT | FeatureCLZERO |
+ FeatureCMPXCHG8B | FeatureCMPXCHG16B | Feature64BIT | FeatureF16C |
+ FeatureFMA | FeatureFSGSBASE | FeatureFXSR | FeatureLZCNT | FeatureMMX |
+ FeatureMOVBE | FeatureMWAITX | FeaturePCLMUL | FeaturePOPCNT |
+ FeaturePRFCHW | FeatureRDRND | FeatureRDSEED | FeatureSAHF | FeatureSHA |
+ FeatureSSE | FeatureSSE2 | FeatureSSE3 | FeatureSSSE3 | FeatureSSE4_1 |
+ FeatureSSE4_2 | FeatureSSE4_A | FeatureXSAVE | FeatureXSAVEC |
+ FeatureXSAVEOPT | FeatureXSAVES;
+static constexpr FeatureBitset FeaturesZNVER2 =
+ FeaturesZNVER1 | FeatureCLWB | FeatureRDPID | FeatureWBNOINVD;
+
+static constexpr ProcInfo Processors[] = {
+ // Empty processor. Include X87 and CMPXCHG8B for backwards compatibility.
+ { {""}, CK_None, ~0U, FeatureX87 | FeatureCMPXCHG8B },
+ // i386-generation processors.
+ { {"i386"}, CK_i386, ~0U, FeatureX87 },
+ // i486-generation processors.
+ { {"i486"}, CK_i486, ~0U, FeatureX87 },
+ { {"winchip-c6"}, CK_WinChipC6, ~0U, FeaturesPentiumMMX },
+ { {"winchip2"}, CK_WinChip2, ~0U, FeaturesPentiumMMX | Feature3DNOW },
+ { {"c3"}, CK_C3, ~0U, FeaturesPentiumMMX | Feature3DNOW },
+ // i586-generation processors, P5 microarchitecture based.
+ { {"i586"}, CK_i586, ~0U, FeatureX87 | FeatureCMPXCHG8B },
+ { {"pentium"}, CK_Pentium, ~0U, FeatureX87 | FeatureCMPXCHG8B },
+ { {"pentium-mmx"}, CK_PentiumMMX, ~0U, FeaturesPentiumMMX },
+ // i686-generation processors, P6 / Pentium M microarchitecture based.
+ { {"pentiumpro"}, CK_PentiumPro, ~0U, FeatureX87 | FeatureCMPXCHG8B },
+ { {"i686"}, CK_i686, ~0U, FeatureX87 | FeatureCMPXCHG8B },
+ { {"pentium2"}, CK_Pentium2, ~0U, FeaturesPentium2 },
+ { {"pentium3"}, CK_Pentium3, ~0U, FeaturesPentium3 },
+ { {"pentium3m"}, CK_Pentium3, ~0U, FeaturesPentium3 },
+ { {"pentium-m"}, CK_PentiumM, ~0U, FeaturesPentium4 },
+ { {"c3-2"}, CK_C3_2, ~0U, FeaturesPentium3 },
+ { {"yonah"}, CK_Yonah, ~0U, FeaturesPrescott },
+ // Netburst microarchitecture based processors.
+ { {"pentium4"}, CK_Pentium4, ~0U, FeaturesPentium4 },
+ { {"pentium4m"}, CK_Pentium4, ~0U, FeaturesPentium4 },
+ { {"prescott"}, CK_Prescott, ~0U, FeaturesPrescott },
+ { {"nocona"}, CK_Nocona, ~0U, FeaturesNocona },
+ // Core microarchitecture based processors.
+ { {"core2"}, CK_Core2, ~0U, FeaturesCore2 },
+ { {"penryn"}, CK_Penryn, ~0U, FeaturesPenryn },
+ // Atom processors
+ { {"bonnell"}, CK_Bonnell, FEATURE_SSSE3, FeaturesBonnell },
+ { {"atom"}, CK_Bonnell, FEATURE_SSSE3, FeaturesBonnell },
+ { {"silvermont"}, CK_Silvermont, FEATURE_SSE4_2, FeaturesSilvermont },
+ { {"slm"}, CK_Silvermont, FEATURE_SSE4_2, FeaturesSilvermont },
+ { {"goldmont"}, CK_Goldmont, FEATURE_SSE4_2, FeaturesGoldmont },
+ { {"goldmont-plus"}, CK_GoldmontPlus, FEATURE_SSE4_2, FeaturesGoldmontPlus },
+ { {"tremont"}, CK_Tremont, FEATURE_SSE4_2, FeaturesTremont },
+ // Nehalem microarchitecture based processors.
+ { {"nehalem"}, CK_Nehalem, FEATURE_SSE4_2, FeaturesNehalem },
+ { {"corei7"}, CK_Nehalem, FEATURE_SSE4_2, FeaturesNehalem },
+ // Westmere microarchitecture based processors.
+ { {"westmere"}, CK_Westmere, FEATURE_PCLMUL, FeaturesWestmere },
+ // Sandy Bridge microarchitecture based processors.
+ { {"sandybridge"}, CK_SandyBridge, FEATURE_AVX, FeaturesSandyBridge },
+ { {"corei7-avx"}, CK_SandyBridge, FEATURE_AVX, FeaturesSandyBridge },
+ // Ivy Bridge microarchitecture based processors.
+ { {"ivybridge"}, CK_IvyBridge, FEATURE_AVX, FeaturesIvyBridge },
+ { {"core-avx-i"}, CK_IvyBridge, FEATURE_AVX, FeaturesIvyBridge },
+ // Haswell microarchitecture based processors.
+ { {"haswell"}, CK_Haswell, FEATURE_AVX2, FeaturesHaswell },
+ { {"core-avx2"}, CK_Haswell, FEATURE_AVX2, FeaturesHaswell },
+ // Broadwell microarchitecture based processors.
+ { {"broadwell"}, CK_Broadwell, FEATURE_AVX2, FeaturesBroadwell },
+ // Skylake client microarchitecture based processors.
+ { {"skylake"}, CK_SkylakeClient, FEATURE_AVX2, FeaturesSkylakeClient },
+ // Skylake server microarchitecture based processors.
+ { {"skylake-avx512"}, CK_SkylakeServer, FEATURE_AVX512F, FeaturesSkylakeServer },
+ { {"skx"}, CK_SkylakeServer, FEATURE_AVX512F, FeaturesSkylakeServer },
+ // Cascadelake Server microarchitecture based processors.
+ { {"cascadelake"}, CK_Cascadelake, FEATURE_AVX512VNNI, FeaturesCascadeLake },
+ // Cooperlake Server microarchitecture based processors.
+ { {"cooperlake"}, CK_Cooperlake, FEATURE_AVX512BF16, FeaturesCooperLake },
+ // Cannonlake client microarchitecture based processors.
+ { {"cannonlake"}, CK_Cannonlake, FEATURE_AVX512VBMI, FeaturesCannonlake },
+ // Icelake client microarchitecture based processors.
+ { {"icelake-client"}, CK_IcelakeClient, FEATURE_AVX512VBMI2, FeaturesICLClient },
+ // Icelake server microarchitecture based processors.
+ { {"icelake-server"}, CK_IcelakeServer, FEATURE_AVX512VBMI2, FeaturesICLServer },
+ // Tigerlake microarchitecture based processors.
+ { {"tigerlake"}, CK_Tigerlake, FEATURE_AVX512VP2INTERSECT, FeaturesTigerlake },
+ // Knights Landing processor.
+ { {"knl"}, CK_KNL, FEATURE_AVX512F, FeaturesKNL },
+ // Knights Mill processor.
+ { {"knm"}, CK_KNM, FEATURE_AVX5124FMAPS, FeaturesKNM },
+ // Lakemont microarchitecture based processors.
+ { {"lakemont"}, CK_Lakemont, ~0U, FeatureCMPXCHG8B },
+ // K6 architecture processors.
+ { {"k6"}, CK_K6, ~0U, FeaturesK6 },
+ { {"k6-2"}, CK_K6_2, ~0U, FeaturesK6 | Feature3DNOW },
+ { {"k6-3"}, CK_K6_3, ~0U, FeaturesK6 | Feature3DNOW },
+ // K7 architecture processors.
+ { {"athlon"}, CK_Athlon, ~0U, FeaturesAthlon },
+ { {"athlon-tbird"}, CK_Athlon, ~0U, FeaturesAthlon },
+ { {"athlon-xp"}, CK_AthlonXP, ~0U, FeaturesAthlonXP },
+ { {"athlon-mp"}, CK_AthlonXP, ~0U, FeaturesAthlonXP },
+ { {"athlon-4"}, CK_AthlonXP, ~0U, FeaturesAthlonXP },
+ // K8 architecture processors.
+ { {"k8"}, CK_K8, ~0U, FeaturesK8 },
+ { {"athlon64"}, CK_K8, ~0U, FeaturesK8 },
+ { {"athlon-fx"}, CK_K8, ~0U, FeaturesK8 },
+ { {"opteron"}, CK_K8, ~0U, FeaturesK8 },
+ { {"k8-sse3"}, CK_K8SSE3, ~0U, FeaturesK8SSE3 },
+ { {"athlon64-sse3"}, CK_K8SSE3, ~0U, FeaturesK8SSE3 },
+ { {"opteron-sse3"}, CK_K8SSE3, ~0U, FeaturesK8SSE3 },
+ { {"amdfam10"}, CK_AMDFAM10, FEATURE_SSE4_A, FeaturesAMDFAM10 },
+ { {"barcelona"}, CK_AMDFAM10, FEATURE_SSE4_A, FeaturesAMDFAM10 },
+ // Bobcat architecture processors.
+ { {"btver1"}, CK_BTVER1, FEATURE_SSE4_A, FeaturesBTVER1 },
+ { {"btver2"}, CK_BTVER2, FEATURE_BMI, FeaturesBTVER2 },
+ // Bulldozer architecture processors.
+ { {"bdver1"}, CK_BDVER1, FEATURE_XOP, FeaturesBDVER1 },
+ { {"bdver2"}, CK_BDVER2, FEATURE_FMA, FeaturesBDVER2 },
+ { {"bdver3"}, CK_BDVER3, FEATURE_FMA, FeaturesBDVER3 },
+ { {"bdver4"}, CK_BDVER4, FEATURE_AVX2, FeaturesBDVER4 },
+ // Zen architecture processors.
+ { {"znver1"}, CK_ZNVER1, FEATURE_AVX2, FeaturesZNVER1 },
+ { {"znver2"}, CK_ZNVER2, FEATURE_AVX2, FeaturesZNVER2 },
+ // Generic 64-bit processor.
+ { {"x86-64"}, CK_x86_64, ~0U, FeaturesX86_64 },
+ // Geode processors.
+ { {"geode"}, CK_Geode, ~0U, FeaturesGeode },
+};
+
+X86::CPUKind llvm::X86::parseArchX86(StringRef CPU, bool Only64Bit) {
+ for (const auto &P : Processors)
+ if (P.Name == CPU && (P.Features[FEATURE_64BIT] || !Only64Bit))
+ return P.Kind;
+
+ return CK_None;
+}
+
+void llvm::X86::fillValidCPUArchList(SmallVectorImpl<StringRef> &Values,
+ bool Only64Bit) {
+ for (const auto &P : Processors)
+ if (!P.Name.empty() && (P.Features[FEATURE_64BIT] || !Only64Bit))
+ Values.emplace_back(P.Name);
+}
+
+ProcessorFeatures llvm::X86::getKeyFeature(X86::CPUKind Kind) {
+ // FIXME: Can we avoid a linear search here? The table might be sorted by
+ // CPUKind so we could binary search?
+ for (const auto &P : Processors) {
+ if (P.Kind == Kind) {
+ assert(P.KeyFeature != ~0U && "Processor does not have a key feature.");
+ return static_cast<ProcessorFeatures>(P.KeyFeature);
+ }
+ }
+
+ llvm_unreachable("Unable to find CPU kind!");
+}
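Illustrative usage sketch of the lookup helpers above (not from the patch itself; assumes using namespace llvm and an arbitrary CPU string):

    // Map a -march= style name to a CPUKind; reject 32-bit-only CPUs when
    // targeting 64-bit mode.
    X86::CPUKind Kind = X86::parseArchX86("skylake", /*Only64Bit=*/true);
    if (Kind != X86::CK_None) {
      // Key feature associated with this CPU (FEATURE_AVX2 for "skylake").
      auto KeyFeature = X86::getKeyFeature(Kind);
      (void)KeyFeature;
    }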
+
+// Features with no dependencies.
+static constexpr FeatureBitset ImpliedFeatures64BIT = {};
+static constexpr FeatureBitset ImpliedFeaturesADX = {};
+static constexpr FeatureBitset ImpliedFeaturesBMI = {};
+static constexpr FeatureBitset ImpliedFeaturesBMI2 = {};
+static constexpr FeatureBitset ImpliedFeaturesCLDEMOTE = {};
+static constexpr FeatureBitset ImpliedFeaturesCLFLUSHOPT = {};
+static constexpr FeatureBitset ImpliedFeaturesCLWB = {};
+static constexpr FeatureBitset ImpliedFeaturesCLZERO = {};
+static constexpr FeatureBitset ImpliedFeaturesCMOV = {};
+static constexpr FeatureBitset ImpliedFeaturesCMPXCHG16B = {};
+static constexpr FeatureBitset ImpliedFeaturesCMPXCHG8B = {};
+static constexpr FeatureBitset ImpliedFeaturesENQCMD = {};
+static constexpr FeatureBitset ImpliedFeaturesFSGSBASE = {};
+static constexpr FeatureBitset ImpliedFeaturesFXSR = {};
+static constexpr FeatureBitset ImpliedFeaturesINVPCID = {};
+static constexpr FeatureBitset ImpliedFeaturesLWP = {};
+static constexpr FeatureBitset ImpliedFeaturesLZCNT = {};
+static constexpr FeatureBitset ImpliedFeaturesMWAITX = {};
+static constexpr FeatureBitset ImpliedFeaturesMOVBE = {};
+static constexpr FeatureBitset ImpliedFeaturesMOVDIR64B = {};
+static constexpr FeatureBitset ImpliedFeaturesMOVDIRI = {};
+static constexpr FeatureBitset ImpliedFeaturesPCONFIG = {};
+static constexpr FeatureBitset ImpliedFeaturesPOPCNT = {};
+static constexpr FeatureBitset ImpliedFeaturesPKU = {};
+static constexpr FeatureBitset ImpliedFeaturesPREFETCHWT1 = {};
+static constexpr FeatureBitset ImpliedFeaturesPRFCHW = {};
+static constexpr FeatureBitset ImpliedFeaturesPTWRITE = {};
+static constexpr FeatureBitset ImpliedFeaturesRDPID = {};
+static constexpr FeatureBitset ImpliedFeaturesRDRND = {};
+static constexpr FeatureBitset ImpliedFeaturesRDSEED = {};
+static constexpr FeatureBitset ImpliedFeaturesRTM = {};
+static constexpr FeatureBitset ImpliedFeaturesSAHF = {};
+static constexpr FeatureBitset ImpliedFeaturesSERIALIZE = {};
+static constexpr FeatureBitset ImpliedFeaturesSGX = {};
+static constexpr FeatureBitset ImpliedFeaturesSHSTK = {};
+static constexpr FeatureBitset ImpliedFeaturesTBM = {};
+static constexpr FeatureBitset ImpliedFeaturesTSXLDTRK = {};
+static constexpr FeatureBitset ImpliedFeaturesWAITPKG = {};
+static constexpr FeatureBitset ImpliedFeaturesWBNOINVD = {};
+static constexpr FeatureBitset ImpliedFeaturesVZEROUPPER = {};
+static constexpr FeatureBitset ImpliedFeaturesX87 = {};
+static constexpr FeatureBitset ImpliedFeaturesXSAVE = {};
+
+// Not really CPU features, but they need to be in the table because clang
+// uses target features to communicate them to the backend.
+static constexpr FeatureBitset ImpliedFeaturesRETPOLINE_EXTERNAL_THUNK = {};
+static constexpr FeatureBitset ImpliedFeaturesRETPOLINE_INDIRECT_BRANCHES = {};
+static constexpr FeatureBitset ImpliedFeaturesRETPOLINE_INDIRECT_CALLS = {};
+static constexpr FeatureBitset ImpliedFeaturesLVI_CFI = {};
+static constexpr FeatureBitset ImpliedFeaturesLVI_LOAD_HARDENING = {};
+
+// XSAVE features are dependent on basic XSAVE.
+static constexpr FeatureBitset ImpliedFeaturesXSAVEC = FeatureXSAVE;
+static constexpr FeatureBitset ImpliedFeaturesXSAVEOPT = FeatureXSAVE;
+static constexpr FeatureBitset ImpliedFeaturesXSAVES = FeatureXSAVE;
+
+// MMX->3DNOW->3DNOWA chain.
+static constexpr FeatureBitset ImpliedFeaturesMMX = {};
+static constexpr FeatureBitset ImpliedFeatures3DNOW = FeatureMMX;
+static constexpr FeatureBitset ImpliedFeatures3DNOWA = Feature3DNOW;
+
+// SSE/AVX/AVX512F chain.
+static constexpr FeatureBitset ImpliedFeaturesSSE = {};
+static constexpr FeatureBitset ImpliedFeaturesSSE2 = FeatureSSE;
+static constexpr FeatureBitset ImpliedFeaturesSSE3 = FeatureSSE2;
+static constexpr FeatureBitset ImpliedFeaturesSSSE3 = FeatureSSE3;
+static constexpr FeatureBitset ImpliedFeaturesSSE4_1 = FeatureSSSE3;
+static constexpr FeatureBitset ImpliedFeaturesSSE4_2 = FeatureSSE4_1;
+static constexpr FeatureBitset ImpliedFeaturesAVX = FeatureSSE4_2;
+static constexpr FeatureBitset ImpliedFeaturesAVX2 = FeatureAVX;
+static constexpr FeatureBitset ImpliedFeaturesAVX512F =
+ FeatureAVX2 | FeatureF16C | FeatureFMA;
+
+// Vector extensions that build on SSE or AVX.
+static constexpr FeatureBitset ImpliedFeaturesAES = FeatureSSE2;
+static constexpr FeatureBitset ImpliedFeaturesF16C = FeatureAVX;
+static constexpr FeatureBitset ImpliedFeaturesFMA = FeatureAVX;
+static constexpr FeatureBitset ImpliedFeaturesGFNI = FeatureSSE2;
+static constexpr FeatureBitset ImpliedFeaturesPCLMUL = FeatureSSE2;
+static constexpr FeatureBitset ImpliedFeaturesSHA = FeatureSSE2;
+static constexpr FeatureBitset ImpliedFeaturesVAES = FeatureAES | FeatureAVX;
+static constexpr FeatureBitset ImpliedFeaturesVPCLMULQDQ =
+ FeatureAVX | FeaturePCLMUL;
+
+// AVX512 features.
+static constexpr FeatureBitset ImpliedFeaturesAVX512CD = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512BW = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512DQ = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512ER = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512PF = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512VL = FeatureAVX512F;
+
+static constexpr FeatureBitset ImpliedFeaturesAVX512BF16 = FeatureAVX512BW;
+static constexpr FeatureBitset ImpliedFeaturesAVX512BITALG = FeatureAVX512BW;
+static constexpr FeatureBitset ImpliedFeaturesAVX512IFMA = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512VNNI = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512VPOPCNTDQ = FeatureAVX512F;
+static constexpr FeatureBitset ImpliedFeaturesAVX512VBMI = FeatureAVX512BW;
+static constexpr FeatureBitset ImpliedFeaturesAVX512VBMI2 = FeatureAVX512BW;
+static constexpr FeatureBitset ImpliedFeaturesAVX512VP2INTERSECT =
+ FeatureAVX512F;
+
+// FIXME: These two aren't really implemented and just exist in the feature
+// list for __builtin_cpu_supports. So omit their dependencies.
+static constexpr FeatureBitset ImpliedFeaturesAVX5124FMAPS = {};
+static constexpr FeatureBitset ImpliedFeaturesAVX5124VNNIW = {};
+
+// SSE4_A->FMA4->XOP chain.
+static constexpr FeatureBitset ImpliedFeaturesSSE4_A = FeatureSSSE3;
+static constexpr FeatureBitset ImpliedFeaturesFMA4 = FeatureAVX | FeatureSSE4_A;
+static constexpr FeatureBitset ImpliedFeaturesXOP = FeatureFMA4;
+
+// AMX Features
+static constexpr FeatureBitset ImpliedFeaturesAMX_TILE = {};
+static constexpr FeatureBitset ImpliedFeaturesAMX_BF16 = FeatureAMX_TILE;
+static constexpr FeatureBitset ImpliedFeaturesAMX_INT8 = FeatureAMX_TILE;
+
+static constexpr FeatureInfo FeatureInfos[X86::CPU_FEATURE_MAX] = {
+#define X86_FEATURE(ENUM, STR) {{STR}, ImpliedFeatures##ENUM},
+#include "llvm/Support/X86TargetParser.def"
+};
+
+// Convert the set bits in FeatureBitset to a list of strings.
+static void getFeatureBitsAsStrings(const FeatureBitset &Bits,
+ SmallVectorImpl<StringRef> &Features) {
+ for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i)
+ if (Bits[i] && !FeatureInfos[i].Name.empty())
+ Features.push_back(FeatureInfos[i].Name);
+}
+
+void llvm::X86::getFeaturesForCPU(StringRef CPU,
+ SmallVectorImpl<StringRef> &EnabledFeatures) {
+ auto I = llvm::find_if(Processors,
+ [&](const ProcInfo &P) { return P.Name == CPU; });
+ assert(I != std::end(Processors) && "Processor not found!");
+
+ FeatureBitset Bits = I->Features;
+
+ // Remove the 64-bit feature, which we only use to validate whether a CPU
+ // can be used in 64-bit mode.
+ Bits &= ~Feature64BIT;
+
+ // Add the string version of all set bits.
+ getFeatureBitsAsStrings(Bits, EnabledFeatures);
+}
+
+// Set, in Bits, every feature that is (transitively) implied by Implies.
+static void getImpliedEnabledFeatures(FeatureBitset &Bits,
+ const FeatureBitset &Implies) {
+ Bits |= Implies;
+ for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) {
+ if (Implies[i])
+ getImpliedEnabledFeatures(Bits, FeatureInfos[i].ImpliedFeatures);
+ }
+}
+
+/// Create a bit vector of the features that become (transitively) disabled
+/// when the feature passed in Value is disabled.
+static void getImpliedDisabledFeatures(FeatureBitset &Bits, unsigned Value) {
+ // Check all features, looking for any that depend on this feature. If we
+ // find one, mark it and recursively find any features that depend on it.
+ for (unsigned i = 0; i != CPU_FEATURE_MAX; ++i) {
+ if (FeatureInfos[i].ImpliedFeatures[Value]) {
+ Bits.set(i);
+ getImpliedDisabledFeatures(Bits, i);
+ }
+ }
+}
+
+void llvm::X86::getImpliedFeatures(
+ StringRef Feature, bool Enabled,
+ SmallVectorImpl<StringRef> &ImpliedFeatures) {
+ auto I = llvm::find_if(
+ FeatureInfos, [&](const FeatureInfo &FI) { return FI.Name == Feature; });
+ if (I == std::end(FeatureInfos)) {
+ // FIXME: This shouldn't happen, but the table may not have all features
+ // yet.
+ return;
+ }
+
+ FeatureBitset ImpliedBits;
+ if (Enabled)
+ getImpliedEnabledFeatures(ImpliedBits, I->ImpliedFeatures);
+ else
+ getImpliedDisabledFeatures(ImpliedBits,
+ std::distance(std::begin(FeatureInfos), I));
+
+ // Convert all the found bits into strings.
+ getFeatureBitsAsStrings(ImpliedBits, ImpliedFeatures);
+}
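For reference, a short sketch of how the implied-feature expansion above plays out (illustrative only; the feature strings follow X86TargetParser.def):

    SmallVector<StringRef, 8> Implied;
    // Enabling avx2 pulls in the whole chain below it: avx, sse4.2, ..., sse.
    X86::getImpliedFeatures("avx2", /*Enabled=*/true, Implied);

    Implied.clear();
    // Disabling sse2 also disables everything that (transitively) needs it,
    // e.g. ssse3, aes, avx, avx512f.
    X86::getImpliedFeatures("sse2", /*Enabled=*/false, Implied);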
diff --git a/llvm/lib/Support/YAMLParser.cpp b/llvm/lib/Support/YAMLParser.cpp
index d17e7b227f4a..ca8ffdc47afa 100644
--- a/llvm/lib/Support/YAMLParser.cpp
+++ b/llvm/lib/Support/YAMLParser.cpp
@@ -268,8 +268,8 @@ public:
}
void setError(const Twine &Message, StringRef::iterator Position) {
- if (Current >= End)
- Current = End - 1;
+ if (Position >= End)
+ Position = End - 1;
// propagate the error if possible
if (EC)
@@ -278,14 +278,10 @@ public:
// Don't print out more errors after the first one we encounter. The rest
// are just the result of the first, and have no meaning.
if (!Failed)
- printError(SMLoc::getFromPointer(Current), SourceMgr::DK_Error, Message);
+ printError(SMLoc::getFromPointer(Position), SourceMgr::DK_Error, Message);
Failed = true;
}
- void setError(const Twine &Message) {
- setError(Message, Current);
- }
-
/// Returns true if an error occurred while parsing.
bool failed() {
return Failed;
@@ -934,13 +930,13 @@ void Scanner::scan_ns_uri_char() {
bool Scanner::consume(uint32_t Expected) {
if (Expected >= 0x80) {
- setError("Cannot consume non-ascii characters");
+ setError("Cannot consume non-ascii characters", Current);
return false;
}
if (Current == End)
return false;
if (uint8_t(*Current) >= 0x80) {
- setError("Cannot consume non-ascii characters");
+ setError("Cannot consume non-ascii characters", Current);
return false;
}
if (uint8_t(*Current) == Expected) {
@@ -1642,7 +1638,7 @@ bool Scanner::scanBlockScalar(bool IsLiteral) {
Token T;
T.Kind = Token::TK_BlockScalar;
T.Range = StringRef(Start, Current - Start);
- T.Value = Str.str().str();
+ T.Value = std::string(Str);
TokenQueue.push_back(T);
return true;
}
@@ -1763,7 +1759,7 @@ bool Scanner::fetchMoreTokens() {
&& !isBlankOrBreak(Current + 2)))
return scanPlainScalar();
- setError("Unrecognized character while tokenizing.");
+ setError("Unrecognized character while tokenizing.", Current);
return false;
}
@@ -1819,11 +1815,11 @@ std::string Node::getVerbatimTag() const {
if (!Raw.empty() && Raw != "!") {
std::string Ret;
if (Raw.find_last_of('!') == 0) {
- Ret = Doc->getTagMap().find("!")->second;
+ Ret = std::string(Doc->getTagMap().find("!")->second);
Ret += Raw.substr(1);
return Ret;
} else if (Raw.startswith("!!")) {
- Ret = Doc->getTagMap().find("!!")->second;
+ Ret = std::string(Doc->getTagMap().find("!!")->second);
Ret += Raw.substr(2);
return Ret;
} else {
@@ -1831,7 +1827,7 @@ std::string Node::getVerbatimTag() const {
std::map<StringRef, StringRef>::const_iterator It =
Doc->getTagMap().find(TagHandle);
if (It != Doc->getTagMap().end())
- Ret = It->second;
+ Ret = std::string(It->second);
else {
Token T;
T.Kind = Token::TK_Tag;
diff --git a/llvm/lib/Support/YAMLTraits.cpp b/llvm/lib/Support/YAMLTraits.cpp
index 5f0cedc71829..9ac7c65e19f7 100644
--- a/llvm/lib/Support/YAMLTraits.cpp
+++ b/llvm/lib/Support/YAMLTraits.cpp
@@ -166,6 +166,8 @@ bool Input::preflightKey(const char *Key, bool Required, bool, bool &UseDefault,
if (!MN) {
if (Required || !isa<EmptyHNode>(CurrentNode))
setError(CurrentNode, "not a mapping");
+ else
+ UseDefault = true;
return false;
}
MN->ValidKeys.push_back(Key);
@@ -738,7 +740,7 @@ bool Output::canElideEmptySequence() {
// the whole key/value can be not written. But, that produces wrong yaml
// if the key/value is the only thing in the map and the map is used in
 // a sequence. This detects if this sequence is the first key/value
- // in map that itself is embedded in a sequnce.
+ // in map that itself is embedded in a sequence.
if (StateStack.size() < 2)
return true;
if (StateStack.back() != inMapFirstKey)
@@ -876,12 +878,12 @@ StringRef ScalarTraits<StringRef>::input(StringRef Scalar, void *,
}
void ScalarTraits<std::string>::output(const std::string &Val, void *,
- raw_ostream &Out) {
+ raw_ostream &Out) {
Out << Val;
}
StringRef ScalarTraits<std::string>::input(StringRef Scalar, void *,
- std::string &Val) {
+ std::string &Val) {
Val = Scalar.str();
return StringRef();
}
diff --git a/llvm/lib/Support/Z3Solver.cpp b/llvm/lib/Support/Z3Solver.cpp
index a83d0f441a4b..9485536d1312 100644
--- a/llvm/lib/Support/Z3Solver.cpp
+++ b/llvm/lib/Support/Z3Solver.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h"
#include "llvm/Support/SMTAPI.h"
@@ -516,16 +517,16 @@ public:
SMTExprRef RoundingMode = getFloatRoundingMode();
return newExprRef(
Z3Expr(Context,
- Z3_mk_fpa_mul(Context.Context, toZ3Expr(*LHS).AST,
- toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST)));
+ Z3_mk_fpa_mul(Context.Context, toZ3Expr(*RoundingMode).AST,
+ toZ3Expr(*LHS).AST, toZ3Expr(*RHS).AST)));
}
SMTExprRef mkFPDiv(const SMTExprRef &LHS, const SMTExprRef &RHS) override {
SMTExprRef RoundingMode = getFloatRoundingMode();
return newExprRef(
Z3Expr(Context,
- Z3_mk_fpa_div(Context.Context, toZ3Expr(*LHS).AST,
- toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST)));
+ Z3_mk_fpa_div(Context.Context, toZ3Expr(*RoundingMode).AST,
+ toZ3Expr(*LHS).AST, toZ3Expr(*RHS).AST)));
}
SMTExprRef mkFPRem(const SMTExprRef &LHS, const SMTExprRef &RHS) override {
@@ -538,16 +539,16 @@ public:
SMTExprRef RoundingMode = getFloatRoundingMode();
return newExprRef(
Z3Expr(Context,
- Z3_mk_fpa_add(Context.Context, toZ3Expr(*LHS).AST,
- toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST)));
+ Z3_mk_fpa_add(Context.Context, toZ3Expr(*RoundingMode).AST,
+ toZ3Expr(*LHS).AST, toZ3Expr(*RHS).AST)));
}
SMTExprRef mkFPSub(const SMTExprRef &LHS, const SMTExprRef &RHS) override {
SMTExprRef RoundingMode = getFloatRoundingMode();
return newExprRef(
Z3Expr(Context,
- Z3_mk_fpa_sub(Context.Context, toZ3Expr(*LHS).AST,
- toZ3Expr(*RHS).AST, toZ3Expr(*RoundingMode).AST)));
+ Z3_mk_fpa_sub(Context.Context, toZ3Expr(*RoundingMode).AST,
+ toZ3Expr(*LHS).AST, toZ3Expr(*RHS).AST)));
}
SMTExprRef mkFPLt(const SMTExprRef &LHS, const SMTExprRef &RHS) override {
@@ -723,10 +724,25 @@ public:
}
SMTExprRef mkBitvector(const llvm::APSInt Int, unsigned BitWidth) override {
- const SMTSortRef Sort = getBitvectorSort(BitWidth);
- return newExprRef(
- Z3Expr(Context, Z3_mk_numeral(Context.Context, Int.toString(10).c_str(),
- toZ3Sort(*Sort).Sort)));
+ const Z3_sort Z3Sort = toZ3Sort(*getBitvectorSort(BitWidth)).Sort;
+
+ // Slow path, when 64 bits are not enough.
+ if (LLVM_UNLIKELY(Int.getBitWidth() > 64u)) {
+ SmallString<40> Buffer;
+ Int.toString(Buffer, 10);
+ return newExprRef(Z3Expr(
+ Context, Z3_mk_numeral(Context.Context, Buffer.c_str(), Z3Sort)));
+ }
+
+ const int64_t BitReprAsSigned = Int.getExtValue();
+ const uint64_t BitReprAsUnsigned =
+ reinterpret_cast<const uint64_t &>(BitReprAsSigned);
+
+ Z3_ast Literal =
+ Int.isSigned()
+ ? Z3_mk_int64(Context.Context, BitReprAsSigned, Z3Sort)
+ : Z3_mk_unsigned_int64(Context.Context, BitReprAsUnsigned, Z3Sort);
+ return newExprRef(Z3Expr(Context, Literal));
}
SMTExprRef mkFloat(const llvm::APFloat Float) override {
diff --git a/llvm/lib/Support/raw_ostream.cpp b/llvm/lib/Support/raw_ostream.cpp
index 4bb315f824af..f2d78d773239 100644
--- a/llvm/lib/Support/raw_ostream.cpp
+++ b/llvm/lib/Support/raw_ostream.cpp
@@ -60,21 +60,21 @@
#ifdef _WIN32
#include "llvm/Support/ConvertUTF.h"
-#include "Windows/WindowsSupport.h"
+#include "llvm/Support/Windows/WindowsSupport.h"
#endif
using namespace llvm;
-const raw_ostream::Colors raw_ostream::BLACK;
-const raw_ostream::Colors raw_ostream::RED;
-const raw_ostream::Colors raw_ostream::GREEN;
-const raw_ostream::Colors raw_ostream::YELLOW;
-const raw_ostream::Colors raw_ostream::BLUE;
-const raw_ostream::Colors raw_ostream::MAGENTA;
-const raw_ostream::Colors raw_ostream::CYAN;
-const raw_ostream::Colors raw_ostream::WHITE;
-const raw_ostream::Colors raw_ostream::SAVEDCOLOR;
-const raw_ostream::Colors raw_ostream::RESET;
+constexpr raw_ostream::Colors raw_ostream::BLACK;
+constexpr raw_ostream::Colors raw_ostream::RED;
+constexpr raw_ostream::Colors raw_ostream::GREEN;
+constexpr raw_ostream::Colors raw_ostream::YELLOW;
+constexpr raw_ostream::Colors raw_ostream::BLUE;
+constexpr raw_ostream::Colors raw_ostream::MAGENTA;
+constexpr raw_ostream::Colors raw_ostream::CYAN;
+constexpr raw_ostream::Colors raw_ostream::WHITE;
+constexpr raw_ostream::Colors raw_ostream::SAVEDCOLOR;
+constexpr raw_ostream::Colors raw_ostream::RESET;
raw_ostream::~raw_ostream() {
// raw_ostream's subclasses should take care to flush the buffer
@@ -216,7 +216,7 @@ void raw_ostream::flush_nonempty() {
assert(OutBufCur > OutBufStart && "Invalid call to flush_nonempty.");
size_t Length = OutBufCur - OutBufStart;
OutBufCur = OutBufStart;
- write_impl(OutBufStart, Length);
+ flush_tied_then_write(OutBufStart, Length);
}
raw_ostream &raw_ostream::write(unsigned char C) {
@@ -224,7 +224,7 @@ raw_ostream &raw_ostream::write(unsigned char C) {
if (LLVM_UNLIKELY(OutBufCur >= OutBufEnd)) {
if (LLVM_UNLIKELY(!OutBufStart)) {
if (BufferMode == BufferKind::Unbuffered) {
- write_impl(reinterpret_cast<char*>(&C), 1);
+ flush_tied_then_write(reinterpret_cast<char *>(&C), 1);
return *this;
}
// Set up a buffer and start over.
@@ -244,7 +244,7 @@ raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
if (LLVM_UNLIKELY(size_t(OutBufEnd - OutBufCur) < Size)) {
if (LLVM_UNLIKELY(!OutBufStart)) {
if (BufferMode == BufferKind::Unbuffered) {
- write_impl(Ptr, Size);
+ flush_tied_then_write(Ptr, Size);
return *this;
}
// Set up a buffer and start over.
@@ -260,7 +260,7 @@ raw_ostream &raw_ostream::write(const char *Ptr, size_t Size) {
if (LLVM_UNLIKELY(OutBufCur == OutBufStart)) {
assert(NumBytes != 0 && "undefined behavior");
size_t BytesToWrite = Size - (Size % NumBytes);
- write_impl(Ptr, BytesToWrite);
+ flush_tied_then_write(Ptr, BytesToWrite);
size_t BytesRemaining = Size - BytesToWrite;
if (BytesRemaining > size_t(OutBufEnd - OutBufCur)) {
// Too much left over to copy into our buffer.
@@ -301,6 +301,12 @@ void raw_ostream::copy_to_buffer(const char *Ptr, size_t Size) {
OutBufCur += Size;
}
+void raw_ostream::flush_tied_then_write(const char *Ptr, size_t Size) {
+ if (TiedStream)
+ TiedStream->flush();
+ write_impl(Ptr, Size);
+}
+
// Formatted output.
raw_ostream &raw_ostream::operator<<(const format_object_base &Fmt) {
// If we have more than a few bytes left in our output buffer, try
@@ -343,36 +349,33 @@ raw_ostream &raw_ostream::operator<<(const format_object_base &Fmt) {
}
raw_ostream &raw_ostream::operator<<(const formatv_object_base &Obj) {
- SmallString<128> S;
Obj.format(*this);
return *this;
}
raw_ostream &raw_ostream::operator<<(const FormattedString &FS) {
- if (FS.Str.size() >= FS.Width || FS.Justify == FormattedString::JustifyNone) {
- this->operator<<(FS.Str);
- return *this;
- }
- const size_t Difference = FS.Width - FS.Str.size();
- switch (FS.Justify) {
- case FormattedString::JustifyLeft:
- this->operator<<(FS.Str);
- this->indent(Difference);
- break;
- case FormattedString::JustifyRight:
- this->indent(Difference);
- this->operator<<(FS.Str);
- break;
- case FormattedString::JustifyCenter: {
- int PadAmount = Difference / 2;
- this->indent(PadAmount);
- this->operator<<(FS.Str);
- this->indent(Difference - PadAmount);
- break;
- }
- default:
- llvm_unreachable("Bad Justification");
+ unsigned LeftIndent = 0;
+ unsigned RightIndent = 0;
+ const ssize_t Difference = FS.Width - FS.Str.size();
+ if (Difference > 0) {
+ switch (FS.Justify) {
+ case FormattedString::JustifyNone:
+ break;
+ case FormattedString::JustifyLeft:
+ RightIndent = Difference;
+ break;
+ case FormattedString::JustifyRight:
+ LeftIndent = Difference;
+ break;
+ case FormattedString::JustifyCenter:
+ LeftIndent = Difference / 2;
+ RightIndent = Difference - LeftIndent;
+ break;
+ }
}
+ indent(LeftIndent);
+ (*this) << FS.Str;
+ indent(RightIndent);
return *this;
}
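A small sketch of the justification behaviour implemented above, assuming the left_justify/right_justify/center_justify helpers from llvm/Support/Format.h:

    outs() << '[' << left_justify("abc", 10) << "]\n";   // [abc       ]
    outs() << '[' << right_justify("abc", 10) << "]\n";  // [       abc]
    outs() << '[' << center_justify("abc", 10) << "]\n"; // [   abc    ]  (extra pad on the right)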
@@ -502,6 +505,53 @@ raw_ostream &raw_ostream::write_zeros(unsigned NumZeros) {
return write_padding<'\0'>(*this, NumZeros);
}
+bool raw_ostream::prepare_colors() {
+ // Colors were explicitly disabled.
+ if (!ColorEnabled)
+ return false;
+
+ // Colors require changing the terminal but this stream is not going to a
+ // terminal.
+ if (sys::Process::ColorNeedsFlush() && !is_displayed())
+ return false;
+
+ if (sys::Process::ColorNeedsFlush())
+ flush();
+
+ return true;
+}
+
+raw_ostream &raw_ostream::changeColor(enum Colors colors, bool bold, bool bg) {
+ if (!prepare_colors())
+ return *this;
+
+ const char *colorcode =
+ (colors == SAVEDCOLOR)
+ ? sys::Process::OutputBold(bg)
+ : sys::Process::OutputColor(static_cast<char>(colors), bold, bg);
+ if (colorcode)
+ write(colorcode, strlen(colorcode));
+ return *this;
+}
+
+raw_ostream &raw_ostream::resetColor() {
+ if (!prepare_colors())
+ return *this;
+
+ if (const char *colorcode = sys::Process::ResetColor())
+ write(colorcode, strlen(colorcode));
+ return *this;
+}
+
+raw_ostream &raw_ostream::reverseColor() {
+ if (!prepare_colors())
+ return *this;
+
+ if (const char *colorcode = sys::Process::OutputReverse())
+ write(colorcode, strlen(colorcode));
+ return *this;
+}
+
void raw_ostream::anchor() {}
//===----------------------------------------------------------------------===//
@@ -577,6 +627,8 @@ raw_fd_ostream::raw_fd_ostream(int fd, bool shouldClose, bool unbuffered)
return;
}
+ enable_colors(true);
+
// Do not attempt to close stdout or stderr. We used to try to maintain the
// property that tools that support writing file to stdout should not also
// write informational output to stdout, but in practice we were never able to
@@ -792,7 +844,7 @@ size_t raw_fd_ostream::preferred_buffer_size() const {
// If this is a terminal, don't use buffering. Line buffering
// would be a more traditional thing to do, but it's not worth
// the complexity.
- if (S_ISCHR(statbuf.st_mode) && isatty(FD))
+ if (S_ISCHR(statbuf.st_mode) && is_displayed())
return 0;
// Return the preferred block size.
return statbuf.st_blksize;
@@ -801,58 +853,6 @@ size_t raw_fd_ostream::preferred_buffer_size() const {
#endif
}
-raw_ostream &raw_fd_ostream::changeColor(enum Colors colors, bool bold,
- bool bg) {
- if (!ColorEnabled)
- return *this;
-
- if (sys::Process::ColorNeedsFlush())
- flush();
- const char *colorcode =
- (colors == SAVEDCOLOR)
- ? sys::Process::OutputBold(bg)
- : sys::Process::OutputColor(static_cast<char>(colors), bold, bg);
- if (colorcode) {
- size_t len = strlen(colorcode);
- write(colorcode, len);
- // don't account colors towards output characters
- pos -= len;
- }
- return *this;
-}
-
-raw_ostream &raw_fd_ostream::resetColor() {
- if (!ColorEnabled)
- return *this;
-
- if (sys::Process::ColorNeedsFlush())
- flush();
- const char *colorcode = sys::Process::ResetColor();
- if (colorcode) {
- size_t len = strlen(colorcode);
- write(colorcode, len);
- // don't account colors towards output characters
- pos -= len;
- }
- return *this;
-}
-
-raw_ostream &raw_fd_ostream::reverseColor() {
- if (!ColorEnabled)
- return *this;
-
- if (sys::Process::ColorNeedsFlush())
- flush();
- const char *colorcode = sys::Process::OutputReverse();
- if (colorcode) {
- size_t len = strlen(colorcode);
- write(colorcode, len);
- // don't account colors towards output characters
- pos -= len;
- }
- return *this;
-}
-
bool raw_fd_ostream::is_displayed() const {
return sys::Process::FileDescriptorIsDisplayed(FD);
}
@@ -867,9 +867,7 @@ void raw_fd_ostream::anchor() {}
// outs(), errs(), nulls()
//===----------------------------------------------------------------------===//
-/// outs() - This returns a reference to a raw_ostream for standard output.
-/// Use it like: outs() << "foo" << "bar";
-raw_ostream &llvm::outs() {
+raw_fd_ostream &llvm::outs() {
// Set buffer settings to model stdout behavior.
std::error_code EC;
static raw_fd_ostream S("-", EC, sys::fs::OF_None);
@@ -877,10 +875,8 @@ raw_ostream &llvm::outs() {
return S;
}
-/// errs() - This returns a reference to a raw_ostream for standard error.
-/// Use it like: errs() << "foo" << "bar";
-raw_ostream &llvm::errs() {
- // Set standard error to be unbuffered by default.
+raw_fd_ostream &llvm::errs() {
+ // Set standard error to be unbuffered and tied to outs() by default.
static raw_fd_ostream S(STDERR_FILENO, false, true);
return S;
}
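Illustrative sketch of the new tie behaviour (not from the patch itself): because errs() is now tied to outs(), buffered stdout output is flushed before anything is written to stderr, so the two streams stay in order.

    outs() << "step 1 of 3";            // may still sit in outs()'s buffer
    errs() << "\nwarning: disk slow\n"; // flush_tied_then_write() flushes outs() first
    // Without the tie, the warning could have appeared before "step 1 of 3".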
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 427bd6778577..77f1b61cf930 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -80,7 +80,7 @@ static int createDependencyFile(const TGParser &Parser, const char *argv0) {
return 0;
}
-int llvm::TableGenMain(char *argv0, TableGenMainFn *MainFn) {
+int llvm::TableGenMain(const char *argv0, TableGenMainFn *MainFn) {
RecordKeeper Records;
// Parse the input file.
diff --git a/llvm/lib/TableGen/Record.cpp b/llvm/lib/TableGen/Record.cpp
index 9db842dc678e..d3db004196b8 100644
--- a/llvm/lib/TableGen/Record.cpp
+++ b/llvm/lib/TableGen/Record.cpp
@@ -1030,7 +1030,7 @@ Init *BinOpInit::Fold(Record *CurRec) const {
case MUL: Result = LHSv * RHSv; break;
case AND: Result = LHSv & RHSv; break;
case OR: Result = LHSv | RHSv; break;
- case SHL: Result = LHSv << RHSv; break;
+ case SHL: Result = (uint64_t)LHSv << (uint64_t)RHSv; break;
case SRA: Result = LHSv >> RHSv; break;
case SRL: Result = (uint64_t)LHSv >> (uint64_t)RHSv; break;
}
@@ -1183,21 +1183,22 @@ Init *TernOpInit::Fold(Record *CurRec) const {
return DefInit::get(Val);
}
if (LHSv && MHSv && RHSv) {
- std::string Val = RHSv->getName();
+ std::string Val = std::string(RHSv->getName());
if (LHSv->getAsString() == RHSv->getAsString())
- Val = MHSv->getName();
+ Val = std::string(MHSv->getName());
return VarInit::get(Val, getType());
}
if (LHSs && MHSs && RHSs) {
- std::string Val = RHSs->getValue();
+ std::string Val = std::string(RHSs->getValue());
std::string::size_type found;
std::string::size_type idx = 0;
while (true) {
- found = Val.find(LHSs->getValue(), idx);
+ found = Val.find(std::string(LHSs->getValue()), idx);
if (found == std::string::npos)
break;
- Val.replace(found, LHSs->getValue().size(), MHSs->getValue());
+ Val.replace(found, LHSs->getValue().size(),
+ std::string(MHSs->getValue()));
idx = found + MHSs->getValue().size();
}
@@ -1612,9 +1613,7 @@ RecTy *DefInit::getFieldType(StringInit *FieldName) const {
return nullptr;
}
-std::string DefInit::getAsString() const {
- return Def->getName();
-}
+std::string DefInit::getAsString() const { return std::string(Def->getName()); }
static void ProfileVarDefInit(FoldingSetNodeID &ID,
Record *Class,
@@ -1779,6 +1778,14 @@ Init *FieldInit::Fold(Record *CurRec) const {
return const_cast<FieldInit *>(this);
}
+bool FieldInit::isConcrete() const {
+ if (DefInit *DI = dyn_cast<DefInit>(Rec)) {
+ Init *FieldVal = DI->getDef()->getValue(FieldName)->getValue();
+ return FieldVal->isConcrete();
+ }
+ return false;
+}
+
static void ProfileCondOpInit(FoldingSetNodeID &ID,
ArrayRef<Init *> CondRange,
ArrayRef<Init *> ValRange,
@@ -2149,11 +2156,6 @@ void Record::resolveReferences() {
resolveReferences(R);
}
-void Record::resolveReferencesTo(const RecordVal *RV) {
- RecordValResolver R(*this, RV);
- resolveReferences(R, RV);
-}
-
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
LLVM_DUMP_METHOD void Record::dump() const { errs() << *this; }
#endif
@@ -2292,6 +2294,8 @@ Record::getValueAsListOfStrings(StringRef FieldName) const {
for (Init *I : List->getValues()) {
if (StringInit *SI = dyn_cast<StringInit>(I))
Strings.push_back(SI->getValue());
+ else if (CodeInit *CI = dyn_cast<CodeInit>(I))
+ Strings.push_back(CI->getValue());
else
PrintFatalError(getLoc(),
Twine("Record `") + getName() + "', field `" + FieldName +
diff --git a/llvm/lib/TableGen/SetTheory.cpp b/llvm/lib/TableGen/SetTheory.cpp
index 5a30ee98cce9..0389bd3ac830 100644
--- a/llvm/lib/TableGen/SetTheory.cpp
+++ b/llvm/lib/TableGen/SetTheory.cpp
@@ -191,7 +191,7 @@ struct SequenceOp : public SetTheory::Operator {
std::string Format;
if (StringInit *SI = dyn_cast<StringInit>(Expr->arg_begin()[0]))
- Format = SI->getValue();
+ Format = std::string(SI->getValue());
else
PrintFatalError(Loc, "Format must be a string: " + Expr->getAsString());
diff --git a/llvm/lib/TableGen/TGLexer.cpp b/llvm/lib/TableGen/TGLexer.cpp
index 1a3f5a7392d5..9e6cc947925d 100644
--- a/llvm/lib/TableGen/TGLexer.cpp
+++ b/llvm/lib/TableGen/TGLexer.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "TGLexer.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Config/config.h" // for strtoull()/strtoll() define
diff --git a/llvm/lib/TableGen/TGLexer.h b/llvm/lib/TableGen/TGLexer.h
index 6d10af348674..5b3b0a44e3ef 100644
--- a/llvm/lib/TableGen/TGLexer.h
+++ b/llvm/lib/TableGen/TGLexer.h
@@ -13,7 +13,6 @@
#ifndef LLVM_LIB_TABLEGEN_TGLEXER_H
#define LLVM_LIB_TABLEGEN_TGLEXER_H
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSet.h"
#include "llvm/Support/DataTypes.h"
@@ -22,10 +21,11 @@
#include <memory>
#include <set>
#include <string>
+#include <vector>
namespace llvm {
+template <typename T> class ArrayRef;
class SourceMgr;
-class SMLoc;
class Twine;
namespace tgtok {
diff --git a/llvm/lib/TableGen/TGParser.cpp b/llvm/lib/TableGen/TGParser.cpp
index 01cc1af34ab6..47f471ae2c4b 100644
--- a/llvm/lib/TableGen/TGParser.cpp
+++ b/llvm/lib/TableGen/TGParser.cpp
@@ -15,12 +15,13 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/Twine.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
-#include "llvm/TableGen/Record.h"
+#include "llvm/Support/SourceMgr.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -487,6 +488,14 @@ static bool isObjectStart(tgtok::TokKind K) {
K == tgtok::Defset || K == tgtok::Defvar || K == tgtok::If;
}
+bool TGParser::consume(tgtok::TokKind K) {
+ if (Lex.getCode() == K) {
+ Lex.Lex();
+ return true;
+ }
+ return false;
+}
+
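For context, a minimal sketch of the check-then-eat pattern that consume() replaces throughout this parser (token kind chosen arbitrarily):

    // Before: peek at the token, report on mismatch, then advance by hand.
    //   if (Lex.getCode() != tgtok::comma)
    //     return TokError("expected ','");
    //   Lex.Lex(); // eat the ','
    // After: one call that both tests and, on a match, advances.
    if (!consume(tgtok::comma))
      return TokError("expected ','");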
/// ParseObjectName - If a valid object name is specified, return it. If no
/// name is specified, return the unset initializer. Return nullptr on parse
/// error.
@@ -590,11 +599,10 @@ ParseSubClassReference(Record *CurRec, bool isDefm) {
if (!Result.Rec) return Result;
// If there is no template arg list, we're done.
- if (Lex.getCode() != tgtok::less) {
+ if (!consume(tgtok::less)) {
Result.RefRange.End = Lex.getLoc();
return Result;
}
- Lex.Lex(); // Eat the '<'
if (Lex.getCode() == tgtok::greater) {
TokError("subclass reference requires a non-empty list of template values");
@@ -608,12 +616,11 @@ ParseSubClassReference(Record *CurRec, bool isDefm) {
return Result;
}
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected '>' in template value list");
Result.Rec = nullptr;
return Result;
}
- Lex.Lex();
Result.RefRange.End = Lex.getLoc();
return Result;
@@ -635,11 +642,10 @@ ParseSubMultiClassReference(MultiClass *CurMC) {
if (!Result.MC) return Result;
// If there is no template arg list, we're done.
- if (Lex.getCode() != tgtok::less) {
+ if (!consume(tgtok::less)) {
Result.RefRange.End = Lex.getLoc();
return Result;
}
- Lex.Lex(); // Eat the '<'
if (Lex.getCode() == tgtok::greater) {
TokError("subclass reference requires a non-empty list of template values");
@@ -653,12 +659,11 @@ ParseSubMultiClassReference(MultiClass *CurMC) {
return Result;
}
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected '>' in template value list");
Result.MC = nullptr;
return Result;
}
- Lex.Lex();
Result.RefRange.End = Lex.getLoc();
return Result;
@@ -730,36 +735,30 @@ void TGParser::ParseRangeList(SmallVectorImpl<unsigned> &Result) {
Result.clear();
return;
}
- while (Lex.getCode() == tgtok::comma) {
- Lex.Lex(); // Eat the comma.
-
+ while (consume(tgtok::comma))
// Parse the next range piece.
if (ParseRangePiece(Result)) {
Result.clear();
return;
}
- }
}
/// ParseOptionalRangeList - Parse either a range list in <>'s or nothing.
/// OptionalRangeList ::= '<' RangeList '>'
/// OptionalRangeList ::= /*empty*/
bool TGParser::ParseOptionalRangeList(SmallVectorImpl<unsigned> &Ranges) {
- if (Lex.getCode() != tgtok::less)
- return false;
-
SMLoc StartLoc = Lex.getLoc();
- Lex.Lex(); // eat the '<'
+ if (!consume(tgtok::less))
+ return false;
// Parse the range list.
ParseRangeList(Ranges);
if (Ranges.empty()) return true;
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected '>' at end of range list");
return Error(StartLoc, "to match this '<'");
}
- Lex.Lex(); // eat the '>'.
return false;
}
@@ -767,21 +766,18 @@ bool TGParser::ParseOptionalRangeList(SmallVectorImpl<unsigned> &Ranges) {
/// OptionalBitList ::= '{' RangeList '}'
/// OptionalBitList ::= /*empty*/
bool TGParser::ParseOptionalBitList(SmallVectorImpl<unsigned> &Ranges) {
- if (Lex.getCode() != tgtok::l_brace)
- return false;
-
SMLoc StartLoc = Lex.getLoc();
- Lex.Lex(); // eat the '{'
+ if (!consume(tgtok::l_brace))
+ return false;
// Parse the range list.
ParseRangeList(Ranges);
if (Ranges.empty()) return true;
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of bit list");
return Error(StartLoc, "to match this '{'");
}
- Lex.Lex(); // eat the '}'.
return false;
}
@@ -834,11 +830,10 @@ RecTy *TGParser::ParseType() {
RecTy *SubType = ParseType();
if (!SubType) return nullptr;
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected '>' at end of list<ty> type");
return nullptr;
}
- Lex.Lex(); // Eat '>'
return ListRecTy::get(SubType);
}
}
@@ -973,11 +968,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
Code = UnOpInit::GETOP;
break;
}
- if (Lex.getCode() != tgtok::l_paren) {
+ if (!consume(tgtok::l_paren)) {
TokError("expected '(' after unary operator");
return nullptr;
}
- Lex.Lex(); // eat the '('
Init *LHS = ParseValue(CurRec);
if (!LHS) return nullptr;
@@ -1035,11 +1029,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
}
}
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in unary operator");
return nullptr;
}
- Lex.Lex(); // eat the ')'
return (UnOpInit::get(Code, LHS, Type))->Fold(CurRec);
}
@@ -1051,21 +1044,19 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
if (!Type)
return nullptr;
- if (Lex.getCode() != tgtok::l_paren) {
+ if (!consume(tgtok::l_paren)) {
TokError("expected '(' after type of !isa");
return nullptr;
}
- Lex.Lex(); // eat the '('
Init *LHS = ParseValue(CurRec);
if (!LHS)
return nullptr;
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in !isa");
return nullptr;
}
- Lex.Lex(); // eat the ')'
return (IsAOpInit::get(Type, LHS))->Fold();
}
@@ -1167,11 +1158,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
}
- if (Lex.getCode() != tgtok::l_paren) {
+ if (!consume(tgtok::l_paren)) {
TokError("expected '(' after binary operator");
return nullptr;
}
- Lex.Lex(); // eat the '('
SmallVector<Init*, 2> InitList;
@@ -1180,7 +1170,13 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
InitList.push_back(ParseValue(CurRec, ArgType));
if (!InitList.back()) return nullptr;
- RecTy *ListType = cast<TypedInit>(InitList.back())->getType();
+ TypedInit *InitListBack = dyn_cast<TypedInit>(InitList.back());
+ if (!InitListBack) {
+ Error(OpLoc, Twine("expected value to be a typed value, got '" +
+ InitList.back()->getAsString() + "'"));
+ return nullptr;
+ }
+ RecTy *ListType = InitListBack->getType();
if (!ArgType) {
ArgType = ListType;
@@ -1257,16 +1253,14 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
break;
}
- if (Lex.getCode() != tgtok::comma)
+ if (!consume(tgtok::comma))
break;
- Lex.Lex(); // eat the ','
}
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in operator");
return nullptr;
}
- Lex.Lex(); // eat the ')'
// listconcat returns a list with type of the argument.
if (Code == BinOpInit::LISTCONCAT)
@@ -1310,6 +1304,7 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
}
Init *LHS = StringInit::get(Lex.getCurStrVal());
+ Lex.Lex();
if (CurRec && CurRec->getValue(LHS)) {
TokError((Twine("iteration variable '") + LHS->getAsString() +
@@ -1318,21 +1313,19 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
}
- if (Lex.Lex() != tgtok::comma) { // eat the id
+ if (!consume(tgtok::comma)) {
TokError("expected ',' in ternary operator");
return nullptr;
}
- Lex.Lex(); // eat the ','
Init *MHS = ParseValue(CurRec);
if (!MHS)
return nullptr;
- if (Lex.getCode() != tgtok::comma) {
+ if (!consume(tgtok::comma)) {
TokError("expected ',' in ternary operator");
return nullptr;
}
- Lex.Lex(); // eat the ','
TypedInit *MHSt = dyn_cast<TypedInit>(MHS);
if (!MHSt) {
@@ -1385,11 +1378,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
if (!RHS)
return nullptr;
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in binary operator");
return nullptr;
}
- Lex.Lex(); // eat the ')'
RecTy *OutType;
if (IsDAG) {
@@ -1429,42 +1421,38 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
Code = TernOpInit::SUBST;
break;
}
- if (Lex.getCode() != tgtok::l_paren) {
+ if (!consume(tgtok::l_paren)) {
TokError("expected '(' after ternary operator");
return nullptr;
}
- Lex.Lex(); // eat the '('
Init *LHS = ParseValue(CurRec);
if (!LHS) return nullptr;
- if (Lex.getCode() != tgtok::comma) {
+ if (!consume(tgtok::comma)) {
TokError("expected ',' in ternary operator");
return nullptr;
}
- Lex.Lex(); // eat the ','
SMLoc MHSLoc = Lex.getLoc();
Init *MHS = ParseValue(CurRec, ItemType);
if (!MHS)
return nullptr;
- if (Lex.getCode() != tgtok::comma) {
+ if (!consume(tgtok::comma)) {
TokError("expected ',' in ternary operator");
return nullptr;
}
- Lex.Lex(); // eat the ','
SMLoc RHSLoc = Lex.getLoc();
Init *RHS = ParseValue(CurRec, ItemType);
if (!RHS)
return nullptr;
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in binary operator");
return nullptr;
}
- Lex.Lex(); // eat the ')'
switch (LexCode) {
default: llvm_unreachable("Unhandled code!");
@@ -1554,11 +1542,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
case tgtok::XFoldl: {
// Value ::= !foldl '(' Id ',' Id ',' Value ',' Value ',' Value ')'
Lex.Lex(); // eat the operation
- if (Lex.getCode() != tgtok::l_paren) {
+ if (!consume(tgtok::l_paren)) {
TokError("expected '(' after !foldl");
return nullptr;
}
- Lex.Lex(); // eat the '('
Init *StartUntyped = ParseValue(CurRec);
if (!StartUntyped)
@@ -1571,11 +1558,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
}
- if (Lex.getCode() != tgtok::comma) {
+ if (!consume(tgtok::comma)) {
TokError("expected ',' in !foldl");
return nullptr;
}
- Lex.Lex(); // eat the ','
Init *ListUntyped = ParseValue(CurRec);
if (!ListUntyped)
@@ -1667,11 +1653,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
return nullptr;
}
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in fold operator");
return nullptr;
}
- Lex.Lex(); // eat the ')'
return FoldOpInit::get(Start, List, A, B, Expr, Start->getType())
->Fold(CurRec);
@@ -1687,11 +1672,10 @@ Init *TGParser::ParseOperation(Record *CurRec, RecTy *ItemType) {
RecTy *TGParser::ParseOperatorType() {
RecTy *Type = nullptr;
- if (Lex.getCode() != tgtok::less) {
+ if (!consume(tgtok::less)) {
TokError("expected type name for operator");
return nullptr;
}
- Lex.Lex(); // eat the <
Type = ParseType();
@@ -1700,11 +1684,10 @@ RecTy *TGParser::ParseOperatorType() {
return nullptr;
}
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected type name for operator");
return nullptr;
}
- Lex.Lex(); // eat the >
return Type;
}
@@ -1712,47 +1695,40 @@ RecTy *TGParser::ParseOperatorType() {
Init *TGParser::ParseOperationCond(Record *CurRec, RecTy *ItemType) {
Lex.Lex(); // eat the operation 'cond'
- if (Lex.getCode() != tgtok::l_paren) {
- TokError("expected '(' after !cond operator");
- return nullptr;
+ if (!consume(tgtok::l_paren)) {
+ TokError("expected '(' after !cond operator");
+ return nullptr;
}
- Lex.Lex(); // eat the '('
// Parse through '[Case: Val,]+'
SmallVector<Init *, 4> Case;
SmallVector<Init *, 4> Val;
while (true) {
- if (Lex.getCode() == tgtok::r_paren) {
- Lex.Lex(); // eat the ')'
+ if (consume(tgtok::r_paren))
break;
- }
Init *V = ParseValue(CurRec);
if (!V)
return nullptr;
Case.push_back(V);
- if (Lex.getCode() != tgtok::colon) {
+ if (!consume(tgtok::colon)) {
TokError("expected ':' following a condition in !cond operator");
return nullptr;
}
- Lex.Lex(); // eat the ':'
V = ParseValue(CurRec, ItemType);
if (!V)
return nullptr;
Val.push_back(V);
- if (Lex.getCode() == tgtok::r_paren) {
- Lex.Lex(); // eat the ')'
+ if (consume(tgtok::r_paren))
break;
- }
- if (Lex.getCode() != tgtok::comma) {
+ if (!consume(tgtok::comma)) {
TokError("expected ',' or ')' following a value in !cond operator");
return nullptr;
}
- Lex.Lex(); // eat the ','
}
if (Case.size() < 1) {
@@ -1820,11 +1796,6 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
Init *R = nullptr;
switch (Lex.getCode()) {
default: TokError("Unknown token when parsing a value"); break;
- case tgtok::paste:
- // This is a leading paste operation. This is deprecated but
- // still exists in some .td files. Ignore it.
- Lex.Lex(); // Skip '#'.
- return ParseSimpleValue(CurRec, ItemType, Mode);
case tgtok::IntVal: R = IntInit::get(Lex.getCurIntVal()); Lex.Lex(); break;
case tgtok::BinaryIntVal: {
auto BinaryVal = Lex.getCurBinaryIntVal();
@@ -1881,11 +1852,10 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
ParseValueList(Args, CurRec, Class);
if (Args.empty()) return nullptr;
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected '>' at end of value list");
return nullptr;
}
- Lex.Lex(); // eat the '>'
// Typecheck the template arguments list
ArrayRef<Init *> ExpectedArgs = Class->getTemplateArgs();
@@ -1930,11 +1900,10 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
ParseValueList(Vals, CurRec);
if (Vals.empty()) return nullptr;
}
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of bit list value");
return nullptr;
}
- Lex.Lex(); // eat the '}'
SmallVector<Init *, 16> NewBits;
@@ -1994,28 +1963,24 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
GivenListTy ? GivenListTy->getElementType() : nullptr);
if (Vals.empty()) return nullptr;
}
- if (Lex.getCode() != tgtok::r_square) {
+ if (!consume(tgtok::r_square)) {
TokError("expected ']' at end of list value");
return nullptr;
}
- Lex.Lex(); // eat the ']'
RecTy *GivenEltTy = nullptr;
- if (Lex.getCode() == tgtok::less) {
+ if (consume(tgtok::less)) {
// Optional list element type
- Lex.Lex(); // eat the '<'
-
GivenEltTy = ParseType();
if (!GivenEltTy) {
// Couldn't parse element type
return nullptr;
}
- if (Lex.getCode() != tgtok::greater) {
+ if (!consume(tgtok::greater)) {
TokError("expected '>' at end of list element type");
return nullptr;
}
- Lex.Lex(); // eat the '>'
}
// Check elements
@@ -2080,8 +2045,8 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
// If the operator name is present, parse it.
StringInit *OperatorName = nullptr;
- if (Lex.getCode() == tgtok::colon) {
- if (Lex.Lex() != tgtok::VarName) { // eat the ':'
+ if (consume(tgtok::colon)) {
+ if (Lex.getCode() != tgtok::VarName) {
TokError("expected variable name in dag operator");
return nullptr;
}
@@ -2095,11 +2060,10 @@ Init *TGParser::ParseSimpleValue(Record *CurRec, RecTy *ItemType,
if (DagArgs.empty()) return nullptr;
}
- if (Lex.getCode() != tgtok::r_paren) {
+ if (!consume(tgtok::r_paren)) {
TokError("expected ')' in dag init");
return nullptr;
}
- Lex.Lex(); // eat the ')'
return DagInit::get(Operator, OperatorName, DagArgs);
}
@@ -2177,11 +2141,10 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
}
// Eat the '}'.
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of bit range list");
return nullptr;
}
- Lex.Lex();
break;
}
case tgtok::l_square: {
@@ -2198,11 +2161,10 @@ Init *TGParser::ParseValue(Record *CurRec, RecTy *ItemType, IDParseMode Mode) {
}
// Eat the ']'.
- if (Lex.getCode() != tgtok::r_square) {
+ if (!consume(tgtok::r_square)) {
TokError("expected ']' at end of list slice");
return nullptr;
}
- Lex.Lex();
break;
}
case tgtok::period: {
@@ -2343,8 +2305,8 @@ void TGParser::ParseDagArgList(
Result.push_back(std::make_pair(Val, VarName));
}
- if (Lex.getCode() != tgtok::comma) break;
- Lex.Lex(); // eat the ','
+ if (!consume(tgtok::comma))
+ break;
}
}
@@ -2380,9 +2342,7 @@ void TGParser::ParseValueList(SmallVectorImpl<Init*> &Result, Record *CurRec,
return;
}
- while (Lex.getCode() == tgtok::comma) {
- Lex.Lex(); // Eat the comma
-
+ while (consume(tgtok::comma)) {
// ignore trailing comma for lists
if (Lex.getCode() == tgtok::r_square)
return;
@@ -2420,8 +2380,7 @@ void TGParser::ParseValueList(SmallVectorImpl<Init*> &Result, Record *CurRec,
Init *TGParser::ParseDeclaration(Record *CurRec,
bool ParsingTemplateArgs) {
// Read the field prefix if present.
- bool HasField = Lex.getCode() == tgtok::Field;
- if (HasField) Lex.Lex();
+ bool HasField = consume(tgtok::Field);
RecTy *Type = ParseType();
if (!Type) return nullptr;
@@ -2456,8 +2415,7 @@ Init *TGParser::ParseDeclaration(Record *CurRec,
return nullptr;
// If a value is present, parse it.
- if (Lex.getCode() == tgtok::equal) {
- Lex.Lex();
+ if (consume(tgtok::equal)) {
SMLoc ValLoc = Lex.getLoc();
Init *Val = ParseValue(CurRec, Type);
if (!Val ||
@@ -2489,11 +2447,10 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) {
Lex.Lex();
// If a value is present, parse it.
- if (Lex.getCode() != tgtok::equal) {
+ if (!consume(tgtok::equal)) {
TokError("Expected '=' in foreach declaration");
return nullptr;
}
- Lex.Lex(); // Eat the '='
RecTy *IterType = nullptr;
SmallVector<unsigned, 16> Ranges;
@@ -2502,11 +2459,10 @@ VarInit *TGParser::ParseForeachDeclaration(Init *&ForeachListValue) {
case tgtok::l_brace: { // '{' RangeList '}'
Lex.Lex(); // eat the '{'
ParseRangeList(Ranges);
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of bit range list");
return nullptr;
}
- Lex.Lex();
break;
}
@@ -2577,9 +2533,7 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) {
TheRecToAddTo->addTemplateArg(TemplArg);
- while (Lex.getCode() == tgtok::comma) {
- Lex.Lex(); // eat the ','
-
+ while (consume(tgtok::comma)) {
// Read the following declarations.
SMLoc Loc = Lex.getLoc();
TemplArg = ParseDeclaration(CurRec, true/*templateargs*/);
@@ -2593,9 +2547,8 @@ bool TGParser::ParseTemplateArgList(Record *CurRec) {
TheRecToAddTo->addTemplateArg(TemplArg);
}
- if (Lex.getCode() != tgtok::greater)
+ if (!consume(tgtok::greater))
return TokError("expected '>' at end of template argument list");
- Lex.Lex(); // eat the '>'.
return false;
}
@@ -2612,9 +2565,8 @@ bool TGParser::ParseBodyItem(Record *CurRec) {
if (!ParseDeclaration(CurRec, false))
return true;
- if (Lex.getCode() != tgtok::semi)
+ if (!consume(tgtok::semi))
return TokError("expected ';' after declaration");
- Lex.Lex();
return false;
}
@@ -2631,22 +2583,25 @@ bool TGParser::ParseBodyItem(Record *CurRec) {
return true;
std::reverse(BitList.begin(), BitList.end());
- if (Lex.getCode() != tgtok::equal)
+ if (!consume(tgtok::equal))
return TokError("expected '=' in let expression");
- Lex.Lex(); // eat the '='.
RecordVal *Field = CurRec->getValue(FieldName);
if (!Field)
return TokError("Value '" + FieldName->getValue() + "' unknown!");
RecTy *Type = Field->getType();
+ if (!BitList.empty() && isa<BitsRecTy>(Type)) {
+ // When assigning to a subset of a 'bits' object, expect the RHS to have
+ // the type of that subset instead of the type of the whole object.
+ Type = BitsRecTy::get(BitList.size());
+ }
Init *Val = ParseValue(CurRec, Type);
if (!Val) return true;
- if (Lex.getCode() != tgtok::semi)
+ if (!consume(tgtok::semi))
return TokError("expected ';' after let expression");
- Lex.Lex();
return SetValue(CurRec, IdLoc, FieldName, BitList, Val);
}
@@ -2660,15 +2615,11 @@ bool TGParser::ParseBodyItem(Record *CurRec) {
///
bool TGParser::ParseBody(Record *CurRec) {
// If this is a null definition, just eat the semi and return.
- if (Lex.getCode() == tgtok::semi) {
- Lex.Lex();
+ if (consume(tgtok::semi))
return false;
- }
- if (Lex.getCode() != tgtok::l_brace)
+ if (!consume(tgtok::l_brace))
return TokError("Expected ';' or '{' to start body");
- // Eat the '{'.
- Lex.Lex();
// An object body introduces a new scope for local variables.
TGLocalVarScope *BodyScope = PushLocalScope();
@@ -2717,8 +2668,7 @@ bool TGParser::ApplyLetStack(RecordsEntry &Entry) {
///
bool TGParser::ParseObjectBody(Record *CurRec) {
// If there is a baseclass list, read it.
- if (Lex.getCode() == tgtok::colon) {
- Lex.Lex();
+ if (consume(tgtok::colon)) {
// Read all of the subclasses.
SubClassReference SubClass = ParseSubClassReference(CurRec, false);
@@ -2730,8 +2680,8 @@ bool TGParser::ParseObjectBody(Record *CurRec) {
if (AddSubClass(CurRec, SubClass))
return true;
- if (Lex.getCode() != tgtok::comma) break;
- Lex.Lex(); // eat ','.
+ if (!consume(tgtok::comma))
+ break;
SubClass = ParseSubClassReference(CurRec, false);
}
}
@@ -2806,11 +2756,10 @@ bool TGParser::ParseDefset() {
if (Err)
return true;
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of defset");
return Error(BraceLoc, "to match this '{'");
}
- Lex.Lex(); // Eat the '}'
Records.addExtraGlobal(DeclName->getValue(),
ListInit::get(Defset.Elements, Defset.EltTy));
@@ -2836,17 +2785,16 @@ bool TGParser::ParseDefvar() {
return TokError("def or global variable of this name already exists");
}
- if (Lex.Lex() != tgtok::equal) // Eat the identifier
+ Lex.Lex();
+ if (!consume(tgtok::equal))
return TokError("expected '='");
- Lex.Lex(); // Eat the '='
Init *Value = ParseValue(nullptr);
if (!Value)
return true;
- if (Lex.getCode() != tgtok::semi)
+ if (!consume(tgtok::semi))
return TokError("expected ';'");
- Lex.Lex(); // Eat the ';'
if (CurLocalScope)
CurLocalScope->addVar(DeclName->getValue(), Value);
@@ -2874,9 +2822,8 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
if (!IterName)
return TokError("expected declaration in for");
- if (Lex.getCode() != tgtok::In)
+ if (!consume(tgtok::In))
return TokError("Unknown tok");
- Lex.Lex(); // Eat the in
// Create a loop object and remember it.
Loops.push_back(std::make_unique<ForeachLoop>(Loc, IterName, ListValue));
@@ -2897,11 +2844,10 @@ bool TGParser::ParseForeach(MultiClass *CurMultiClass) {
if (ParseObjectList(CurMultiClass))
return true;
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of foreach command");
return Error(BraceLoc, "to match this '{'");
}
- Lex.Lex(); // Eat the }
}
PopLocalScope(ForeachScope);
@@ -2929,9 +2875,8 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) {
if (!Condition)
return true;
- if (Lex.getCode() != tgtok::Then)
+ if (!consume(tgtok::Then))
return TokError("Unknown tok");
- Lex.Lex(); // Eat the 'then'
// We have to be able to save if statements to execute later, and they have
// to live on the same stack as foreach loops. The simplest implementation
@@ -2964,9 +2909,7 @@ bool TGParser::ParseIf(MultiClass *CurMultiClass) {
// dangling-else ambiguity, and by greedily matching an else here if we can,
// we implement the usual resolution of pairing with the innermost unmatched
// if.
- if (Lex.getCode() == tgtok::ElseKW) {
- Lex.Lex(); // Eat the 'else'
-
+ if (consume(tgtok::ElseKW)) {
// The foreach containing the else-clause uses the same pair of lists as
// above, but this time, selects SingletonList if the condition is *false*.
Init *ElseClauseList =
@@ -3010,12 +2953,10 @@ bool TGParser::ParseIfBody(MultiClass *CurMultiClass, StringRef Kind) {
if (ParseObjectList(CurMultiClass))
return true;
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of '" + Kind + "' clause");
return Error(BraceLoc, "to match this '{'");
}
-
- Lex.Lex(); // Eat the }
}
PopLocalScope(BodyScope);
@@ -3066,7 +3007,7 @@ bool TGParser::ParseClass() {
/// LetItem ::= ID OptionalRangeList '=' Value
///
void TGParser::ParseLetList(SmallVectorImpl<LetRecord> &Result) {
- while (true) {
+ do {
if (Lex.getCode() != tgtok::Id) {
TokError("expected identifier in let definition");
Result.clear();
@@ -3085,12 +3026,11 @@ void TGParser::ParseLetList(SmallVectorImpl<LetRecord> &Result) {
}
std::reverse(Bits.begin(), Bits.end());
- if (Lex.getCode() != tgtok::equal) {
+ if (!consume(tgtok::equal)) {
TokError("expected '=' in let expression");
Result.clear();
return;
}
- Lex.Lex(); // eat the '='.
Init *Val = ParseValue(nullptr);
if (!Val) {
@@ -3100,11 +3040,7 @@ void TGParser::ParseLetList(SmallVectorImpl<LetRecord> &Result) {
// Now that we have everything, add the record.
Result.emplace_back(Name, Bits, Val, NameLoc);
-
- if (Lex.getCode() != tgtok::comma)
- return;
- Lex.Lex(); // eat the comma.
- }
+ } while (consume(tgtok::comma));
}
/// ParseTopLevelLet - Parse a 'let' at top level. This can be a couple of
@@ -3123,9 +3059,8 @@ bool TGParser::ParseTopLevelLet(MultiClass *CurMultiClass) {
if (LetInfo.empty()) return true;
LetStack.push_back(std::move(LetInfo));
- if (Lex.getCode() != tgtok::In)
+ if (!consume(tgtok::In))
return TokError("expected 'in' at end of top-level 'let'");
- Lex.Lex();
TGLocalVarScope *LetScope = PushLocalScope();
@@ -3143,11 +3078,10 @@ bool TGParser::ParseTopLevelLet(MultiClass *CurMultiClass) {
if (ParseObjectList(CurMultiClass))
return true;
- if (Lex.getCode() != tgtok::r_brace) {
+ if (!consume(tgtok::r_brace)) {
TokError("expected '}' at end of top level let command");
return Error(BraceLoc, "to match this '{'");
}
- Lex.Lex();
}
PopLocalScope(LetScope);
@@ -3193,11 +3127,9 @@ bool TGParser::ParseMultiClass() {
bool inherits = false;
// If there are submulticlasses, parse them.
- if (Lex.getCode() == tgtok::colon) {
+ if (consume(tgtok::colon)) {
inherits = true;
- Lex.Lex();
-
// Read all of the submulticlasses.
SubMultiClassReference SubMultiClass =
ParseSubMultiClassReference(CurMultiClass);
@@ -3209,8 +3141,8 @@ bool TGParser::ParseMultiClass() {
if (AddSubMultiClass(CurMultiClass, SubMultiClass))
return true;
- if (Lex.getCode() != tgtok::comma) break;
- Lex.Lex(); // eat ','.
+ if (!consume(tgtok::comma))
+ break;
SubMultiClass = ParseSubMultiClassReference(CurMultiClass);
}
}
@@ -3218,9 +3150,8 @@ bool TGParser::ParseMultiClass() {
if (Lex.getCode() != tgtok::l_brace) {
if (!inherits)
return TokError("expected '{' in multiclass definition");
- if (Lex.getCode() != tgtok::semi)
+ if (!consume(tgtok::semi))
return TokError("expected ';' in multiclass definition");
- Lex.Lex(); // eat the ';'.
} else {
if (Lex.Lex() == tgtok::r_brace) // eat the '{'.
return TokError("multiclass must contain at least one def");
@@ -3294,7 +3225,7 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
// To instantiate a multiclass, we need to first get the multiclass, then
// instantiate each def contained in the multiclass with the SubClassRef
// template parameters.
- MultiClass *MC = MultiClasses[Ref.Rec->getName()].get();
+ MultiClass *MC = MultiClasses[std::string(Ref.Rec->getName())].get();
assert(MC && "Didn't lookup multiclass correctly?");
ArrayRef<Init*> TemplateVals = Ref.TemplateArgs;
@@ -3327,8 +3258,8 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
&SubClassLoc))
return true;
- if (Lex.getCode() != tgtok::comma) break;
- Lex.Lex(); // eat ','.
+ if (!consume(tgtok::comma))
+ break;
if (Lex.getCode() != tgtok::Id)
return TokError("expected identifier");
@@ -3361,8 +3292,8 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
return true;
}
- if (Lex.getCode() != tgtok::comma) break;
- Lex.Lex(); // eat ','.
+ if (!consume(tgtok::comma))
+ break;
SubClass = ParseSubClassReference(nullptr, false);
}
}
@@ -3374,9 +3305,8 @@ bool TGParser::ParseDefm(MultiClass *CurMultiClass) {
addEntry(std::move(E));
}
- if (Lex.getCode() != tgtok::semi)
+ if (!consume(tgtok::semi))
return TokError("expected ';' at end of defm");
- Lex.Lex();
return false;
}
diff --git a/llvm/lib/TableGen/TGParser.h b/llvm/lib/TableGen/TGParser.h
index c66c79771298..07a4003219f5 100644
--- a/llvm/lib/TableGen/TGParser.h
+++ b/llvm/lib/TableGen/TGParser.h
@@ -14,18 +14,13 @@
#define LLVM_LIB_TABLEGEN_TGPARSER_H
#include "TGLexer.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/Support/SourceMgr.h"
#include "llvm/TableGen/Error.h"
#include "llvm/TableGen/Record.h"
#include <map>
namespace llvm {
- class Record;
- class RecordVal;
- class RecordKeeper;
- class RecTy;
- class Init;
+ class SourceMgr;
+ class Twine;
struct ForeachLoop;
struct MultiClass;
struct SubClassReference;
@@ -112,7 +107,7 @@ public:
}
void addVar(StringRef Name, Init *I) {
- bool Ins = vars.insert(std::make_pair(Name, I)).second;
+ bool Ins = vars.insert(std::make_pair(std::string(Name), I)).second;
(void)Ins;
assert(Ins && "Local variable already exists");
}
@@ -215,6 +210,7 @@ private: // Semantic analysis methods.
bool addDefOne(std::unique_ptr<Record> Rec);
private: // Parser methods.
+ bool consume(tgtok::TokKind K);
bool ParseObjectList(MultiClass *MC = nullptr);
bool ParseObject(MultiClass *MC);
bool ParseClass();
diff --git a/llvm/lib/TableGen/TableGenBackend.cpp b/llvm/lib/TableGen/TableGenBackend.cpp
index e11b28e8cff9..252f126d2d00 100644
--- a/llvm/lib/TableGen/TableGenBackend.cpp
+++ b/llvm/lib/TableGen/TableGenBackend.cpp
@@ -13,6 +13,7 @@
#include "llvm/TableGen/TableGenBackend.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Support/raw_ostream.h"
+#include <cassert>
using namespace llvm;
diff --git a/llvm/lib/Target/AArch64/AArch64.h b/llvm/lib/Target/AArch64/AArch64.h
index ac765ebcddc0..fd35b530e3ce 100644
--- a/llvm/lib/Target/AArch64/AArch64.h
+++ b/llvm/lib/Target/AArch64/AArch64.h
@@ -38,6 +38,8 @@ FunctionPass *createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel);
FunctionPass *createAArch64StorePairSuppressPass();
FunctionPass *createAArch64ExpandPseudoPass();
+FunctionPass *createAArch64SLSHardeningPass();
+FunctionPass *createAArch64IndirectThunks();
FunctionPass *createAArch64SpeculationHardeningPass();
FunctionPass *createAArch64LoadStoreOptimizationPass();
FunctionPass *createAArch64SIMDInstrOptPass();
@@ -52,11 +54,13 @@ FunctionPass *createAArch64BranchTargetsPass();
FunctionPass *createAArch64CleanupLocalDynamicTLSPass();
FunctionPass *createAArch64CollectLOHPass();
+ModulePass *createSVEIntrinsicOptsPass();
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &,
AArch64Subtarget &, AArch64RegisterBankInfo &);
FunctionPass *createAArch64PreLegalizeCombiner(bool IsOptNone);
-FunctionPass *createAArch64StackTaggingPass(bool MergeInit);
+FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAArch64StackTaggingPass(bool IsOptNone);
FunctionPass *createAArch64StackTaggingPreRAPass();
void initializeAArch64A53Fix835769Pass(PassRegistry&);
@@ -70,16 +74,19 @@ void initializeAArch64ConditionalComparesPass(PassRegistry&);
void initializeAArch64ConditionOptimizerPass(PassRegistry&);
void initializeAArch64DeadRegisterDefinitionsPass(PassRegistry&);
void initializeAArch64ExpandPseudoPass(PassRegistry&);
+void initializeAArch64SLSHardeningPass(PassRegistry&);
void initializeAArch64SpeculationHardeningPass(PassRegistry&);
void initializeAArch64LoadStoreOptPass(PassRegistry&);
void initializeAArch64SIMDInstrOptPass(PassRegistry&);
void initializeAArch64PreLegalizerCombinerPass(PassRegistry&);
+void initializeAArch64PostLegalizerCombinerPass(PassRegistry &);
void initializeAArch64PromoteConstantPass(PassRegistry&);
void initializeAArch64RedundantCopyEliminationPass(PassRegistry&);
void initializeAArch64StorePairSuppressPass(PassRegistry&);
void initializeFalkorHWPFFixPass(PassRegistry&);
void initializeFalkorMarkStridedAccessesLegacyPass(PassRegistry&);
void initializeLDTLSCleanupPass(PassRegistry&);
+void initializeSVEIntrinsicOptsPass(PassRegistry&);
void initializeAArch64StackTaggingPass(PassRegistry&);
void initializeAArch64StackTaggingPreRAPass(PassRegistry&);
} // end namespace llvm
diff --git a/llvm/lib/Target/AArch64/AArch64.td b/llvm/lib/Target/AArch64/AArch64.td
index 0106355b1a44..534af9686af0 100644
--- a/llvm/lib/Target/AArch64/AArch64.td
+++ b/llvm/lib/Target/AArch64/AArch64.td
@@ -42,11 +42,11 @@ def FeatureAES : SubtargetFeature<
"Enable AES support", [FeatureNEON]>;
// Crypto has been split up and any combination is now valid (see the
-// crypto defintions above). Also, crypto is now context sensitive:
+// crypto definitions above). Also, crypto is now context sensitive:
// it has a different meaning for e.g. Armv8.4 than it has for Armv8.2.
// Therefore, we rely on Clang, the user interfacing tool, to pass on the
// appropriate crypto options. But here in the backend, crypto has very little
-// meaning anymore. We kept the Crypto defintion here for backward
+// meaning anymore. We kept the Crypto definition here for backward
// compatibility, and now imply features SHA2 and AES, which was the
// "traditional" meaning of Crypto.
def FeatureCrypto : SubtargetFeature<"crypto", "HasCrypto", "true",
@@ -101,7 +101,25 @@ def FeatureCCPP : SubtargetFeature<"ccpp", "HasCCPP",
"true", "Enable v8.2 data Cache Clean to Point of Persistence" >;
def FeatureSVE : SubtargetFeature<"sve", "HasSVE", "true",
- "Enable Scalable Vector Extension (SVE) instructions">;
+ "Enable Scalable Vector Extension (SVE) instructions", [FeatureFullFP16]>;
+
+// This flag is currently still labeled as Experimental, but when fully
+// implemented this should tell the compiler to use the zeroing pseudos to
+// benefit from the reverse instructions (e.g. SUB vs SUBR) if the inactive
+// lanes are known to be zero. The pseudos will then be expanded using the
+// MOVPRFX instruction to zero the inactive lanes. This feature should only be
+// enabled if MOVPRFX instructions are known to merge with the destructive
+// operations they prefix.
+//
+// This feature could similarly be extended to support cheap merging of _any_
+// value into the inactive lanes using the MOVPRFX instruction that uses
+// merging-predication.
+def FeatureExperimentalZeroingPseudos
+ : SubtargetFeature<"use-experimental-zeroing-pseudos",
+ "UseExperimentalZeroingPseudos", "true",
+ "Hint to the compiler that the MOVPRFX instruction is "
+ "merged with destructive operations",
+ []>;
def FeatureSVE2 : SubtargetFeature<"sve2", "HasSVE2", "true",
"Enable Scalable Vector Extension 2 (SVE2) instructions", [FeatureSVE]>;
@@ -142,7 +160,7 @@ def FeatureStrictAlign : SubtargetFeature<"strict-align",
"Disallow all unaligned memory "
"access">;
-foreach i = {1-7,9-15,18,20-28} in
+foreach i = {1-7,9-15,18,20-28,30} in
def FeatureReserveX#i : SubtargetFeature<"reserve-x"#i, "ReserveXRegister["#i#"]", "true",
"Reserve X"#i#", making it unavailable "
"as a GPR">;
@@ -240,11 +258,11 @@ def FeatureDotProd : SubtargetFeature<
def FeaturePA : SubtargetFeature<
"pa", "HasPA", "true",
- "Enable v8.3-A Pointer Authentication enchancement">;
+ "Enable v8.3-A Pointer Authentication extension">;
def FeatureJS : SubtargetFeature<
"jsconv", "HasJS", "true",
- "Enable v8.3-A JavaScript FP conversion enchancement",
+ "Enable v8.3-A JavaScript FP conversion instructions",
[FeatureFPARMv8]>;
def FeatureCCIDX : SubtargetFeature<
@@ -281,6 +299,11 @@ def FeatureAM : SubtargetFeature<
"am", "HasAM", "true",
"Enable v8.4-A Activity Monitors extension">;
+def FeatureAMVS : SubtargetFeature<
+ "amvs", "HasAMVS", "true",
+ "Enable v8.6-A Activity Monitors Virtualization support",
+ [FeatureAM]>;
+
def FeatureSEL2 : SubtargetFeature<
"sel2", "HasSEL2", "true",
"Enable v8.4-A Secure Exception Level 2 extension">;
@@ -365,6 +388,25 @@ def FeatureTaggedGlobals : SubtargetFeature<"tagged-globals",
"true", "Use an instruction sequence for taking the address of a global "
"that allows a memory tag in the upper address bits">;
+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16",
+ "true", "Enable BFloat16 Extension" >;
+
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+ "true", "Enable Matrix Multiply Int8 Extension">;
+
+def FeatureMatMulFP32 : SubtargetFeature<"f32mm", "HasMatMulFP32",
+ "true", "Enable Matrix Multiply FP32 Extension", [FeatureSVE]>;
+
+def FeatureMatMulFP64 : SubtargetFeature<"f64mm", "HasMatMulFP64",
+ "true", "Enable Matrix Multiply FP64 Extension", [FeatureSVE]>;
+
+def FeatureFineGrainedTraps : SubtargetFeature<"fgt", "HasFineGrainedTraps",
+ "true", "Enable fine grained virtualization traps extension">;
+
+def FeatureEnhancedCounterVirtualization :
+ SubtargetFeature<"ecv", "HasEnhancedCounterVirtualization",
+ "true", "Enable enhanced counter virtualization extension">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//
@@ -391,8 +433,13 @@ def HasV8_5aOps : SubtargetFeature<
"v8.5a", "HasV8_5aOps", "true", "Support ARM v8.5a instructions",
[HasV8_4aOps, FeatureAltFPCmp, FeatureFRInt3264, FeatureSpecRestrict,
FeatureSSBS, FeatureSB, FeaturePredRes, FeatureCacheDeepPersist,
- FeatureBranchTargetId]
->;
+ FeatureBranchTargetId]>;
+
+def HasV8_6aOps : SubtargetFeature<
+ "v8.6a", "HasV8_6aOps", "true", "Support ARM v8.6a instructions",
+
+ [HasV8_5aOps, FeatureAMVS, FeatureBF16, FeatureFineGrainedTraps,
+ FeatureEnhancedCounterVirtualization, FeatureMatMulInt8]>;
//===----------------------------------------------------------------------===//
// Register File Description
@@ -429,6 +476,17 @@ def FeatureUseEL#i#ForTP : SubtargetFeature<"tpidr-el"#i, "UseEL"#i#"ForTP",
"true", "Permit use of TPIDR_EL"#i#" for the TLS base">;
//===----------------------------------------------------------------------===//
+// Control codegen mitigation against Straight Line Speculation vulnerability.
+//===----------------------------------------------------------------------===//
+
+def FeatureHardenSlsRetBr : SubtargetFeature<"harden-sls-retbr",
+ "HardenSlsRetBr", "true",
+ "Harden against straight line speculation across RET and BR instructions">;
+def FeatureHardenSlsBlr : SubtargetFeature<"harden-sls-blr",
+ "HardenSlsBlr", "true",
+ "Harden against straight line speculation across BLR instructions">;
+
+//===----------------------------------------------------------------------===//
// AArch64 Processors supported.
//
@@ -443,6 +501,10 @@ def SVEUnsupported : AArch64Unsupported {
HasSVE2BitPerm];
}
+def PAUnsupported : AArch64Unsupported {
+ let F = [HasPA];
+}
+
include "AArch64SchedA53.td"
include "AArch64SchedA57.td"
include "AArch64SchedCyclone.td"
@@ -453,6 +515,7 @@ include "AArch64SchedExynosM4.td"
include "AArch64SchedExynosM5.td"
include "AArch64SchedThunderX.td"
include "AArch64SchedThunderX2T99.td"
+include "AArch64SchedThunderX3T110.td"
def ProcA35 : SubtargetFeature<"a35", "ARMProcFamily", "CortexA35",
"Cortex-A35 ARM processors", [
@@ -563,6 +626,67 @@ def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
FeatureSSBS
]>;
+def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
+ "Cortex-A77 ARM processors", [
+ HasV8_2aOps,
+ FeatureFPARMv8,
+ FeatureNEON, FeatureRCPC,
+ FeatureCrypto,
+ FeatureFullFP16,
+ FeatureDotProd
+ ]>;
+
+def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily",
+ "CortexA78",
+ "Cortex-A78 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureNEON,
+ FeatureRCPC,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureSPE,
+ FeatureFullFP16,
+ FeatureSSBS,
+ FeatureDotProd]>;
+
+def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
+ "Cortex-X1 ARM processors", [
+ HasV8_2aOps,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureFuseAES,
+ FeatureNEON,
+ FeatureRCPC,
+ FeaturePerfMon,
+ FeaturePostRAScheduler,
+ FeatureSPE,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def ProcA64FX : SubtargetFeature<"a64fx", "ARMProcFamily", "A64FX",
+ "Fujitsu A64FX processors", [
+ HasV8_2aOps,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureSHA2,
+ FeaturePerfMon,
+ FeatureFullFP16,
+ FeatureSVE,
+ FeaturePostRAScheduler,
+ FeatureComplxNum
+ ]>;
+
+def ProcCarmel : SubtargetFeature<"carmel", "ARMProcFamily", "Carmel",
+ "Nvidia Carmel processors", [
+ HasV8_2aOps,
+ FeatureNEON,
+ FeatureCrypto,
+ FeatureFullFP16
+ ]>;
+
// Note that cyclone does not fuse AES instructions, but newer apple chips do
// perform the fusion and cyclone is used by default when targeting apple OSes.
def ProcAppleA7 : SubtargetFeature<"apple-a7", "ARMProcFamily", "AppleA7",
@@ -780,6 +904,25 @@ def ProcThunderX2T99 : SubtargetFeature<"thunderx2t99", "ARMProcFamily",
FeatureLSE,
HasV8_1aOps]>;
+def ProcThunderX3T110 : SubtargetFeature<"thunderx3t110", "ARMProcFamily",
+ "ThunderX3T110",
+ "Marvell ThunderX3 processors", [
+ FeatureAggressiveFMA,
+ FeatureCRC,
+ FeatureCrypto,
+ FeatureFPARMv8,
+ FeatureArithmeticBccFusion,
+ FeatureNEON,
+ FeaturePostRAScheduler,
+ FeaturePredictableSelectIsExpensive,
+ FeatureLSE,
+ FeaturePA,
+ FeatureUseAA,
+ FeatureBalanceFPOps,
+ FeaturePerfMon,
+ FeatureStrictAlign,
+ HasV8_3aOps]>;
+
def ProcThunderX : SubtargetFeature<"thunderx", "ARMProcFamily", "ThunderX",
"Cavium ThunderX processors", [
FeatureCRC,
@@ -844,7 +987,7 @@ def : ProcessorModel<"generic", NoSchedModel, [
FeatureNEON,
FeaturePerfMon,
FeaturePostRAScheduler,
-// ETE and TRBE are future architecture extensions. We temporariliy enable them
+// ETE and TRBE are future architecture extensions. We temporarily enable them
// by default for users targeting generic AArch64, until it is decided in which
// armv8.x-a architecture revision they will end up. The extensions do not
// affect code generated by the compiler and can be used only by explicitly
@@ -853,6 +996,7 @@ def : ProcessorModel<"generic", NoSchedModel, [
]>;
def : ProcessorModel<"cortex-a35", CortexA53Model, [ProcA35]>;
+def : ProcessorModel<"cortex-a34", CortexA53Model, [ProcA35]>;
def : ProcessorModel<"cortex-a53", CortexA53Model, [ProcA53]>;
def : ProcessorModel<"cortex-a55", CortexA53Model, [ProcA55]>;
def : ProcessorModel<"cortex-a57", CortexA57Model, [ProcA57]>;
@@ -863,6 +1007,9 @@ def : ProcessorModel<"cortex-a73", CortexA57Model, [ProcA73]>;
def : ProcessorModel<"cortex-a75", CortexA57Model, [ProcA75]>;
def : ProcessorModel<"cortex-a76", CortexA57Model, [ProcA76]>;
def : ProcessorModel<"cortex-a76ae", CortexA57Model, [ProcA76]>;
+def : ProcessorModel<"cortex-a77", CortexA57Model, [ProcA77]>;
+def : ProcessorModel<"cortex-a78", CortexA57Model, [ProcA78]>;
+def : ProcessorModel<"cortex-x1", CortexA57Model, [ProcX1]>;
def : ProcessorModel<"neoverse-e1", CortexA53Model, [ProcNeoverseE1]>;
def : ProcessorModel<"neoverse-n1", CortexA57Model, [ProcNeoverseN1]>;
def : ProcessorModel<"exynos-m3", ExynosM3Model, [ProcExynosM3]>;
@@ -878,6 +1025,8 @@ def : ProcessorModel<"thunderxt81", ThunderXT8XModel, [ProcThunderXT81]>;
def : ProcessorModel<"thunderxt83", ThunderXT8XModel, [ProcThunderXT83]>;
// Cavium ThunderX2T9X Processors. Formerly Broadcom Vulcan.
def : ProcessorModel<"thunderx2t99", ThunderX2T99Model, [ProcThunderX2T99]>;
+// Marvell ThunderX3T110 Processors.
+def : ProcessorModel<"thunderx3t110", ThunderX3T110Model, [ProcThunderX3T110]>;
// FIXME: HiSilicon TSV110 is currently modeled as a Cortex-A57.
def : ProcessorModel<"tsv110", CortexA57Model, [ProcTSV110]>;
@@ -900,6 +1049,13 @@ def : ProcessorModel<"apple-s5", CycloneModel, [ProcAppleA12]>;
// Alias for the latest Apple processor model supported by LLVM.
def : ProcessorModel<"apple-latest", CycloneModel, [ProcAppleA13]>;
+// Fujitsu A64FX
+// FIXME: Scheduling model is not implemented yet.
+def : ProcessorModel<"a64fx", NoSchedModel, [ProcA64FX]>;
+
+// Nvidia Carmel
+def : ProcessorModel<"carmel", NoSchedModel, [ProcCarmel]>;
+
//===----------------------------------------------------------------------===//
// Assembly parser
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
index 00e321f9b850..3a94820dac8d 100644
--- a/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
+++ b/llvm/lib/Target/AArch64/AArch64AsmPrinter.cpp
@@ -84,8 +84,8 @@ public:
return MCInstLowering.lowerOperand(MO, MCOp);
}
- void EmitStartOfAsmFile(Module &M) override;
- void EmitJumpTableInfo() override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitJumpTableInfo() override;
void emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
const MachineBasicBlock *MBB, unsigned JTI);
@@ -112,7 +112,9 @@ public:
bool emitPseudoExpansionLowering(MCStreamer &OutStreamer,
const MachineInstr *MI);
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
+
+ void emitFunctionHeaderComment() override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AsmPrinter::getAnalysisUsage(AU);
@@ -139,7 +141,7 @@ public:
}
// Emit the rest of the function body.
- EmitFunctionBody();
+ emitFunctionBody();
// Emit the XRay table for this function.
emitXRayTable();
@@ -162,10 +164,10 @@ private:
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- void EmitFunctionBodyEnd() override;
+ void emitFunctionBodyEnd() override;
MCSymbol *GetCPISymbol(unsigned CPID) const override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
AArch64FunctionInfo *AArch64FI = nullptr;
@@ -182,7 +184,7 @@ private:
} // end anonymous namespace
-void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) {
+void AArch64AsmPrinter::emitStartOfAsmFile(Module &M) {
if (!TM.getTargetTriple().isOSBinFormatELF())
return;
@@ -225,22 +227,29 @@ void AArch64AsmPrinter::EmitStartOfAsmFile(Module &M) {
OutStreamer->SwitchSection(Nt);
// Emit the note header.
- EmitAlignment(Align(8));
- OutStreamer->EmitIntValue(4, 4); // data size for "GNU\0"
- OutStreamer->EmitIntValue(4 * 4, 4); // Elf_Prop size
- OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4);
- OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+ emitAlignment(Align(8));
+ OutStreamer->emitInt32(4); // data size for "GNU\0"
+ OutStreamer->emitInt32(4 * 4); // Elf_Prop size
+ OutStreamer->emitInt32(ELF::NT_GNU_PROPERTY_TYPE_0);
+ OutStreamer->emitBytes(StringRef("GNU", 4)); // note name
// Emit the PAC/BTI properties.
- OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND, 4);
- OutStreamer->EmitIntValue(4, 4); // data size
- OutStreamer->EmitIntValue(Flags, 4); // data
- OutStreamer->EmitIntValue(0, 4); // pad
+ OutStreamer->emitInt32(ELF::GNU_PROPERTY_AARCH64_FEATURE_1_AND);
+ OutStreamer->emitInt32(4); // data size
+ OutStreamer->emitInt32(Flags); // data
+ OutStreamer->emitInt32(0); // pad
OutStreamer->endSection(Nt);
OutStreamer->SwitchSection(Cur);
}
+void AArch64AsmPrinter::emitFunctionHeaderComment() {
+ const AArch64FunctionInfo *FI = MF->getInfo<AArch64FunctionInfo>();
+ Optional<std::string> OutlinerString = FI->getOutliningStyle();
+ if (OutlinerString != None)
+ OutStreamer->GetCommentOS() << ' ' << OutlinerString;
+}
+
void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
{
const Function &F = MF->getFunction();
@@ -250,8 +259,7 @@ void AArch64AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
.getValueAsString()
.getAsInteger(10, Num))
return;
- for (; Num; --Num)
- EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
+ emitNops(Num);
return;
}
@@ -291,9 +299,9 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
// ;DATA: higher 32 bits of the address of the trampoline
// LDP X0, X30, [SP], #16 ; pop X0 and the link register from the stack
//
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Emit "B #32" instruction, which jumps over the next 28 bytes.
@@ -304,8 +312,8 @@ void AArch64AsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
for (int8_t I = 0; I < NoopsInSledCount; I++)
EmitToStreamer(*OutStreamer, MCInstBuilder(AArch64::HINT).addImm(0));
- OutStreamer->EmitLabel(Target);
- recordSled(CurSled, MI, Kind);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, Kind, 2);
}
void AArch64AsmPrinter::LowerHWASAN_CHECK_MEMACCESS(const MachineInstr &MI) {
@@ -364,25 +372,25 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
ELF::SHF_EXECINSTR | ELF::SHF_ALLOC | ELF::SHF_GROUP, 0,
Sym->getName()));
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Weak);
- OutStreamer->EmitSymbolAttribute(Sym, MCSA_Hidden);
- OutStreamer->EmitLabel(Sym);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_ELF_TypeFunction);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Weak);
+ OutStreamer->emitSymbolAttribute(Sym, MCSA_Hidden);
+ OutStreamer->emitLabel(Sym);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::UBFMXri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::UBFMXri)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(4)
.addImm(55),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBroX)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBroX)
.addReg(AArch64::W16)
.addReg(AArch64::X9)
.addReg(AArch64::X16)
.addImm(0)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::SUBSXrs)
.addReg(AArch64::XZR)
.addReg(AArch64::X16)
@@ -390,33 +398,33 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
*STI);
MCSymbol *HandleMismatchOrPartialSym = OutContext.createTempSymbol();
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::NE)
.addExpr(MCSymbolRefExpr::create(HandleMismatchOrPartialSym,
OutContext)),
*STI);
MCSymbol *ReturnSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(ReturnSym);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitLabel(ReturnSym);
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::RET).addReg(AArch64::LR), *STI);
- OutStreamer->EmitLabel(HandleMismatchOrPartialSym);
+ OutStreamer->emitLabel(HandleMismatchOrPartialSym);
if (IsShort) {
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWri)
.addReg(AArch64::WZR)
.addReg(AArch64::W16)
.addImm(15)
.addImm(0),
*STI);
MCSymbol *HandleMismatchSym = OutContext.createTempSymbol();
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::HI)
.addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::ANDXri)
.addReg(AArch64::X17)
.addReg(Reg)
@@ -424,59 +432,59 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
*STI);
unsigned Size = 1 << (AccessInfo & 0xf);
if (Size != 1)
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ADDXri)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::ADDXri)
.addReg(AArch64::X17)
.addReg(AArch64::X17)
.addImm(Size - 1)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::SUBSWrs)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::SUBSWrs)
.addReg(AArch64::WZR)
.addReg(AArch64::W16)
.addReg(AArch64::W17)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::LS)
.addExpr(MCSymbolRefExpr::create(HandleMismatchSym, OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::ORRXri)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(AArch64_AM::encodeLogicalImmediate(0xf, 64)),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::LDRBBui)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::LDRBBui)
.addReg(AArch64::W16)
.addReg(AArch64::X16)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::SUBSXrs)
.addReg(AArch64::XZR)
.addReg(AArch64::X16)
.addReg(Reg)
.addImm(AArch64_AM::getShifterImm(AArch64_AM::LSR, 56)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::Bcc)
.addImm(AArch64CC::EQ)
.addExpr(MCSymbolRefExpr::create(ReturnSym, OutContext)),
*STI);
- OutStreamer->EmitLabel(HandleMismatchSym);
+ OutStreamer->emitLabel(HandleMismatchSym);
}
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXpre)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXpre)
.addReg(AArch64::SP)
.addReg(AArch64::X0)
.addReg(AArch64::X1)
.addReg(AArch64::SP)
.addImm(-32),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::STPXi)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::STPXi)
.addReg(AArch64::FP)
.addReg(AArch64::LR)
.addReg(AArch64::SP)
@@ -484,13 +492,13 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
*STI);
if (Reg != AArch64::X0)
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::ORRXrs)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::ORRXrs)
.addReg(AArch64::X0)
.addReg(AArch64::XZR)
.addReg(Reg)
.addImm(0),
*STI);
- OutStreamer->EmitInstruction(MCInstBuilder(AArch64::MOVZXi)
+ OutStreamer->emitInstruction(MCInstBuilder(AArch64::MOVZXi)
.addReg(AArch64::X1)
.addImm(AccessInfo)
.addImm(0),
@@ -499,14 +507,14 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
// Intentionally load the GOT entry and branch to it, rather than possibly
// late binding the function, which may clobber the registers before we have
// a chance to save them.
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::ADRP)
.addReg(AArch64::X16)
.addExpr(AArch64MCExpr::create(
HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_PAGE,
OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::LDRXui)
.addReg(AArch64::X16)
.addReg(AArch64::X16)
@@ -514,12 +522,12 @@ void AArch64AsmPrinter::EmitHwasanMemaccessSymbols(Module &M) {
HwasanTagMismatchRef, AArch64MCExpr::VariantKind::VK_GOT_LO12,
OutContext)),
*STI);
- OutStreamer->EmitInstruction(
+ OutStreamer->emitInstruction(
MCInstBuilder(AArch64::BR).addReg(AArch64::X16), *STI);
}
}
-void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
+void AArch64AsmPrinter::emitEndOfAsmFile(Module &M) {
EmitHwasanMemaccessSymbols(M);
const Triple &TT = TM.getTargetTriple();
@@ -529,7 +537,7 @@ void AArch64AsmPrinter::EmitEndOfAsmFile(Module &M) {
// implementation of multiple entry points). If this doesn't occur, the
// linker can safely perform dead code stripping. Since LLVM never
// generates code that does this, it is always safe to set.
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
emitStackMaps(SM);
}
@@ -544,12 +552,12 @@ void AArch64AsmPrinter::EmitLOHs() {
"Label hasn't been inserted for LOH related instruction");
MCArgs.push_back(LabelIt->second);
}
- OutStreamer->EmitLOHDirective(D.getKind(), MCArgs);
+ OutStreamer->emitLOHDirective(D.getKind(), MCArgs);
MCArgs.clear();
}
}
-void AArch64AsmPrinter::EmitFunctionBodyEnd() {
+void AArch64AsmPrinter::emitFunctionBodyEnd() {
if (!AArch64FI->getLOHRelated().empty())
EmitLOHs();
}
@@ -741,11 +749,10 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
assert(NOps == 4);
OS << '\t' << MAI->getCommentString() << "DEBUG_VALUE: ";
// cast away const; DIetc do not take const operands for some reason.
- OS << cast<DILocalVariable>(MI->getOperand(NOps - 2).getMetadata())
- ->getName();
+ OS << MI->getDebugVariable()->getName();
OS << " <- ";
// Frame address. Currently handles register +- offset only.
- assert(MI->getOperand(0).isReg() && MI->getOperand(1).isImm());
+ assert(MI->getDebugOperand(0).isReg() && MI->isDebugOffsetImm());
OS << '[';
printOperand(MI, 0, OS);
OS << '+';
@@ -755,7 +762,7 @@ void AArch64AsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
printOperand(MI, NOps - 2, OS);
}
-void AArch64AsmPrinter::EmitJumpTableInfo() {
+void AArch64AsmPrinter::emitJumpTableInfo() {
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
if (!MJTI) return;
@@ -783,8 +790,8 @@ void AArch64AsmPrinter::EmitJumpTableInfo() {
if (JTBBs.empty()) continue;
unsigned Size = AFI->getJumpTableEntrySize(JTI);
- EmitAlignment(Align(Size));
- OutStreamer->EmitLabel(GetJTISymbol(JTI));
+ emitAlignment(Align(Size));
+ OutStreamer->emitLabel(GetJTISymbol(JTI));
for (auto *JTBB : JTBBs)
emitJumpTableEntry(MJTI, JTBB, JTI);
@@ -812,7 +819,7 @@ void AArch64AsmPrinter::emitJumpTableEntry(const MachineJumpTableInfo *MJTI,
Value, MCConstantExpr::create(2, OutContext), OutContext);
}
- OutStreamer->EmitValue(Value, Size);
+ OutStreamer->emitValue(Value, Size);
}
/// Small jump tables contain an unsigned byte or half, representing the offset
@@ -868,7 +875,7 @@ void AArch64AsmPrinter::LowerSTACKMAP(MCStreamer &OutStreamer, StackMaps &SM,
auto &Ctx = OutStreamer.getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer.EmitLabel(MILabel);
+ OutStreamer.emitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
@@ -898,7 +905,7 @@ void AArch64AsmPrinter::LowerPATCHPOINT(MCStreamer &OutStreamer, StackMaps &SM,
const MachineInstr &MI) {
auto &Ctx = OutStreamer.getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer.EmitLabel(MILabel);
+ OutStreamer.emitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers Opers(&MI);
@@ -982,7 +989,7 @@ void AArch64AsmPrinter::EmitFMov0(const MachineInstr &MI) {
// instructions) auto-generated.
#include "AArch64GenMCPseudoLowering.inc"
-void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void AArch64AsmPrinter::emitInstruction(const MachineInstr *MI) {
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -992,7 +999,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCSymbol *LOHLabel = createTempSymbol("loh");
// Associate the instruction with the label
LOHInstToLabel[MI] = LOHLabel;
- OutStreamer->EmitLabel(LOHLabel);
+ OutStreamer->emitLabel(LOHLabel);
}
AArch64TargetStreamer *TS =
@@ -1001,6 +1008,26 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
break;
+ case AArch64::HINT: {
+ // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
+ // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
+ // non-empty. If MI is the initial BTI, place the
+ // __patchable_function_entries label after BTI.
+ if (CurrentPatchableFunctionEntrySym &&
+ CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
+ MI == &MF->front().front()) {
+ int64_t Imm = MI->getOperand(0).getImm();
+ if ((Imm & 32) && (Imm & 6)) {
+ MCInst Inst;
+ MCInstLowering.Lower(MI, Inst);
+ EmitToStreamer(*OutStreamer, Inst);
+ CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ return;
+ }
+ }
+ break;
+ }
case AArch64::MOVMCSym: {
Register DestReg = MI->getOperand(0).getReg();
const MachineOperand &MO_Sym = MI->getOperand(1);
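The (Imm & 32) && (Imm & 6) test in the HINT case above relies on the AArch64 HINT immediate encoding of BTI: HINT #32 is plain BTI, #34 is BTI c, #36 is BTI j, and #38 is BTI jc, so the test matches only the BTI forms that carry a target qualifier. A hypothetical helper (not part of this patch) that spells out the same check:

static bool isTargetedBTIHint(int64_t Imm) {
  // HINT #32 = BTI, #34 = BTI c, #36 = BTI j, #38 = BTI jc.
  // Bit 5 selects the BTI hint space; bits 1-2 encode the c/j qualifier.
  return (Imm & 32) && (Imm & 6);
}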
@@ -1048,7 +1075,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallString<128> TmpStr;
raw_svector_ostream OS(TmpStr);
PrintDebugValueComment(MI, OS);
- OutStreamer->EmitRawText(StringRef(OS.str()));
+ OutStreamer->emitRawText(StringRef(OS.str()));
}
return;
@@ -1061,7 +1088,7 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
if (needsCFIMoves() == CFI_M_None)
return;
- OutStreamer->EmitCFIBKeyFrame();
+ OutStreamer->emitCFIBKeyFrame();
return;
}
}
@@ -1087,6 +1114,25 @@ void AArch64AsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
return;
}
+ case AArch64::SpeculationBarrierISBDSBEndBB: {
+ // Print DSB SYS + ISB
+ MCInst TmpInstDSB;
+ TmpInstDSB.setOpcode(AArch64::DSB);
+ TmpInstDSB.addOperand(MCOperand::createImm(0xf));
+ EmitToStreamer(*OutStreamer, TmpInstDSB);
+ MCInst TmpInstISB;
+ TmpInstISB.setOpcode(AArch64::ISB);
+ TmpInstISB.addOperand(MCOperand::createImm(0xf));
+ EmitToStreamer(*OutStreamer, TmpInstISB);
+ return;
+ }
+ case AArch64::SpeculationBarrierSBEndBB: {
+ // Print SB
+ MCInst TmpInstSB;
+ TmpInstSB.setOpcode(AArch64::SB);
+ EmitToStreamer(*OutStreamer, TmpInstSB);
+ return;
+ }
case AArch64::TLSDESC_CALLSEQ: {
/// lower this to:
/// adrp x0, :tlsdesc:var
diff --git a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
index 6fa3a462bc71..1956014b738d 100644
--- a/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
+++ b/llvm/lib/Target/AArch64/AArch64BranchTargets.cpp
@@ -118,9 +118,15 @@ void AArch64BranchTargets::addBTI(MachineBasicBlock &MBB, bool CouldCall,
auto MBBI = MBB.begin();
- // PACI[AB]SP are implicitly BTI JC, so no BTI instruction needed there.
- if (MBBI != MBB.end() && (MBBI->getOpcode() == AArch64::PACIASP ||
- MBBI->getOpcode() == AArch64::PACIBSP))
+ // Skip the meta instructions, those will be removed anyway.
+ for (; MBBI != MBB.end() && MBBI->isMetaInstruction(); ++MBBI)
+ ;
+
+ // SCTLR_EL1.BT[01] is set to 0 by default which means
+ // PACI[AB]SP are implicitly BTI C so no BTI C instruction is needed there.
+ if (MBBI != MBB.end() && HintNum == 34 &&
+ (MBBI->getOpcode() == AArch64::PACIASP ||
+ MBBI->getOpcode() == AArch64::PACIBSP))
return;
BuildMI(MBB, MBB.begin(), MBB.findDebugLoc(MBB.begin()),
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
index a0695cef615f..84ec5afcc9c1 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.cpp
@@ -38,18 +38,17 @@ static const MCPhysReg QRegList[] = {AArch64::Q0, AArch64::Q1, AArch64::Q2,
static bool finishStackBlock(SmallVectorImpl<CCValAssign> &PendingMembers,
MVT LocVT, ISD::ArgFlagsTy &ArgFlags,
- CCState &State, unsigned SlotAlign) {
+ CCState &State, Align SlotAlign) {
unsigned Size = LocVT.getSizeInBits() / 8;
const Align StackAlign =
State.getMachineFunction().getDataLayout().getStackAlignment();
- const Align OrigAlign(ArgFlags.getOrigAlign());
- const Align Align = std::min(OrigAlign, StackAlign);
+ const Align OrigAlign = ArgFlags.getNonZeroOrigAlign();
+ const Align Alignment = std::min(OrigAlign, StackAlign);
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(
- Size, std::max((unsigned)Align.value(), SlotAlign)));
+ It.convertToMem(State.AllocateStack(Size, std::max(Alignment, SlotAlign)));
State.addLoc(It);
- SlotAlign = 1;
+ SlotAlign = Align(1);
}
// All pending members have now been allocated
@@ -72,7 +71,7 @@ static bool CC_AArch64_Custom_Stack_Block(
if (!ArgFlags.isInConsecutiveRegsLast())
return true;
- return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, 8);
+ return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, Align(8));
}
/// Given an [N x Ty] block, it should be passed in a consecutive sequence of
@@ -146,7 +145,7 @@ static bool CC_AArch64_Custom_Block(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
for (auto Reg : RegList)
State.AllocateReg(Reg);
- unsigned SlotAlign = Subtarget.isTargetDarwin() ? 1 : 8;
+ const Align SlotAlign = Subtarget.isTargetDarwin() ? Align(1) : Align(8);
return finishStackBlock(PendingMembers, LocVT, ArgFlags, State, SlotAlign);
}
diff --git a/llvm/lib/Target/AArch64/AArch64CallingConvention.td b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
index a0b2d7712b66..fdcc890bf589 100644
--- a/llvm/lib/Target/AArch64/AArch64CallingConvention.td
+++ b/llvm/lib/Target/AArch64/AArch64CallingConvention.td
@@ -10,9 +10,6 @@
//
//===----------------------------------------------------------------------===//
-/// CCIfAlign - Match of the original alignment of the arg
-class CCIfAlign<string Align, CCAction A> :
- CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
/// CCIfBigEndian - Match only if we're in big endian mode.
class CCIfBigEndian<CCAction A> :
CCIf<"State.getMachineFunction().getDataLayout().isBigEndian()", A>;
@@ -33,9 +30,9 @@ def CC_AArch64_AAPCS : CallingConv<[
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
- CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8],
CCBitConvertToType<f64>>>,
- CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
CCBitConvertToType<f128>>>,
// In AAPCS, an SRet is passed in X8, not X0 like a normal pointer parameter.
@@ -75,10 +72,10 @@ def CC_AArch64_AAPCS : CallingConv<[
CCIfConsecutiveRegs<CCCustom<"CC_AArch64_Custom_Block">>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv2f32, nxv4f32, nxv2f64],
+ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv2f32, nxv4f32, nxv2f64],
+ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCPassIndirect<i64>>,
CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -102,22 +99,24 @@ def CC_AArch64_AAPCS : CallingConv<[
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
- CCIfType<[i1, i8, i16, f16], CCAssignToStack<8, 8>>,
+ CCIfType<[i1, i8, i16, f16, bf16], CCAssignToStack<8, 8>>,
CCIfType<[i32, f32], CCAssignToStack<8, 8>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -132,9 +131,9 @@ def RetCC_AArch64_AAPCS : CallingConv<[
// Big endian vectors must be passed as if they were 1-element vectors so that
// their lanes are in a consistent order.
- CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v8i8],
+ CCIfBigEndian<CCIfType<[v2i32, v2f32, v4i16, v4f16, v4bf16, v8i8],
CCBitConvertToType<f64>>>,
- CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v16i8],
+ CCIfBigEndian<CCIfType<[v2i64, v2f64, v4i32, v4f32, v8i16, v8f16, v8bf16, v16i8],
CCBitConvertToType<f128>>>,
CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
@@ -144,18 +143,20 @@ def RetCC_AArch64_AAPCS : CallingConv<[
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[f128, v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[nxv16i8, nxv8i16, nxv4i32, nxv2i64, nxv2f16, nxv4f16, nxv8f16,
- nxv2f32, nxv4f32, nxv2f64],
+ nxv2bf16, nxv4bf16, nxv8bf16, nxv2f32, nxv4f32, nxv2f64],
CCAssignToReg<[Z0, Z1, Z2, Z3, Z4, Z5, Z6, Z7]>>,
CCIfType<[nxv2i1, nxv4i1, nxv8i1, nxv16i1],
@@ -165,7 +166,7 @@ def RetCC_AArch64_AAPCS : CallingConv<[
// Vararg functions on windows pass floats in integer registers
let Entry = 1 in
def CC_AArch64_Win64_VarArg : CallingConv<[
- CCIfType<[f16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
CCIfType<[f64], CCBitConvertToType<i64>>,
CCDelegateTo<CC_AArch64_AAPCS>
]>;
@@ -219,19 +220,22 @@ def CC_AArch64_DarwinPCS : CallingConv<[
[W0, W1, W2, W3, W4, W5, W6, W7]>>,
CCIfType<[f16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
+ CCIfType<[bf16], CCAssignToRegWithShadow<[H0, H1, H2, H3, H4, H5, H6, H7],
+ [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f32], CCAssignToRegWithShadow<[S0, S1, S2, S3, S4, S5, S6, S7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
CCIfType<[f64], CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToRegWithShadow<[D0, D1, D2, D3, D4, D5, D6, D7],
[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToReg<[Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7]>>,
// If more than will fit in registers, pass them on the stack instead.
CCIf<"ValVT == MVT::i1 || ValVT == MVT::i8", CCAssignToStack<1, 1>>,
- CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16", CCAssignToStack<2, 2>>,
+ CCIf<"ValVT == MVT::i16 || ValVT == MVT::f16 || ValVT == MVT::bf16",
+ CCAssignToStack<2, 2>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
// Re-demote pointers to 32-bits so we don't end up storing 64-bit
@@ -239,9 +243,9 @@ def CC_AArch64_DarwinPCS : CallingConv<[
CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
CCIfPtr<CCIfILP32<CCAssignToStack<4, 4>>>,
- CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16],
+ CCIfType<[i64, f64, v1f64, v2f32, v1i64, v2i32, v4i16, v8i8, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -255,14 +259,14 @@ def CC_AArch64_DarwinPCS_VarArg : CallingConv<[
// Handle all scalar types as either i64 or f64.
CCIfType<[i8, i16, i32], CCPromoteToType<i64>>,
- CCIfType<[f16, f32], CCPromoteToType<f64>>,
+ CCIfType<[f16, bf16, f32], CCPromoteToType<f64>>,
// Everything is on the stack.
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -275,16 +279,16 @@ def CC_AArch64_DarwinPCS_ILP32_VarArg : CallingConv<[
// Handle all scalar types as either i32 or f32.
CCIfType<[i8, i16], CCPromoteToType<i32>>,
- CCIfType<[f16], CCPromoteToType<f32>>,
+ CCIfType<[f16, bf16], CCPromoteToType<f32>>,
// Everything is on the stack.
// i128 is split to two i64s, and its stack alignment is 16 bytes.
CCIfPtr<CCIfILP32<CCTruncToType<i32>>>,
CCIfType<[i32, f32], CCAssignToStack<4, 4>>,
CCIfType<[i64], CCIfSplit<CCAssignToStack<8, 16>>>,
- CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16],
+ CCIfType<[i64, f64, v1i64, v2i32, v4i16, v8i8, v1f64, v2f32, v4f16, v4bf16],
CCAssignToStack<8, 8>>,
- CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16],
+ CCIfType<[v2i64, v4i32, v8i16, v16i8, v4f32, v2f64, v8f16, v8bf16],
CCAssignToStack<16, 16>>
]>;
@@ -377,11 +381,9 @@ def CSR_AArch64_AAPCS : CalleeSavedRegs<(add X19, X20, X21, X22, X23, X24,
D8, D9, D10, D11,
D12, D13, D14, D15)>;
-// Darwin puts the frame-record at the top of the callee-save area.
-def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
- X23, X24, X25, X26, X27, X28,
- D8, D9, D10, D11,
- D12, D13, D14, D15)>;
+// A variant for treating X18 as callee saved, when interfacing with
+// code that needs X18 to be preserved.
+def CSR_AArch64_AAPCS_X18 : CalleeSavedRegs<(add X18, CSR_AArch64_AAPCS)>;
// Win64 has unwinding codes for an (FP,LR) pair, save_fplr and save_fplr_x.
// We put FP before LR, so that frame lowering logic generates (FP,LR) pairs,
@@ -421,33 +423,7 @@ def CSR_AArch64_SVE_AAPCS : CalleeSavedRegs<(add (sequence "Z%u", 8, 23),
def CSR_AArch64_AAPCS_ThisReturn : CalleeSavedRegs<(add CSR_AArch64_AAPCS, X0)>;
def CSR_AArch64_AAPCS_SwiftError
- : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
-
-// The function used by Darwin to obtain the address of a thread-local variable
-// guarantees more than a normal AAPCS function. x16 and x17 are used on the
-// fast path for calculation, but other registers except X0 (argument/return)
-// and LR (it is a call, after all) are preserved.
-def CSR_AArch64_TLS_Darwin
- : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
- FP,
- (sequence "Q%u", 0, 31))>;
-
-// We can only handle a register pair with adjacent registers, the register pair
-// should belong to the same class as well. Since the access function on the
-// fast path calls a function that follows CSR_AArch64_TLS_Darwin,
-// CSR_AArch64_CXX_TLS_Darwin should be a subset of CSR_AArch64_TLS_Darwin.
-def CSR_AArch64_CXX_TLS_Darwin
- : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
- (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
- (sequence "D%u", 0, 31))>;
-
-// CSRs that are handled by prologue, epilogue.
-def CSR_AArch64_CXX_TLS_Darwin_PE
- : CalleeSavedRegs<(add LR, FP)>;
-
-// CSRs that are handled explicitly via copies.
-def CSR_AArch64_CXX_TLS_Darwin_ViaCopy
- : CalleeSavedRegs<(sub CSR_AArch64_CXX_TLS_Darwin, LR, FP)>;
+ : CalleeSavedRegs<(sub CSR_AArch64_AAPCS, X21)>;
// The ELF stub used for TLS-descriptor access saves every feasible
// register. Only X0 and LR are clobbered.
@@ -472,14 +448,57 @@ def CSR_AArch64_StackProbe_Windows
(sequence "X%u", 18, 28), FP, SP,
(sequence "Q%u", 0, 31))>;
+// Darwin variants of AAPCS.
+// Darwin puts the frame-record at the top of the callee-save area.
+def CSR_Darwin_AArch64_AAPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21, X22,
+ X23, X24, X25, X26, X27, X28,
+ D8, D9, D10, D11,
+ D12, D13, D14, D15)>;
+
+def CSR_Darwin_AArch64_AAVPCS : CalleeSavedRegs<(add LR, FP, X19, X20, X21,
+ X22, X23, X24, X25, X26, X27,
+ X28, (sequence "Q%u", 8, 23))>;
+def CSR_Darwin_AArch64_AAPCS_ThisReturn
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, X0)>;
+
+def CSR_Darwin_AArch64_AAPCS_SwiftError
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_AAPCS, X21)>;
+
+// The function used by Darwin to obtain the address of a thread-local variable
+// guarantees more than a normal AAPCS function. x16 and x17 are used on the
+// fast path for calculation, but other registers except X0 (argument/return)
+// and LR (it is a call, after all) are preserved.
+def CSR_Darwin_AArch64_TLS
+ : CalleeSavedRegs<(add (sub (sequence "X%u", 1, 28), X16, X17),
+ FP,
+ (sequence "Q%u", 0, 31))>;
+
+// We can only handle a register pair with adjacent registers, the register pair
+// should belong to the same class as well. Since the access function on the
+// fast path calls a function that follows CSR_Darwin_AArch64_TLS,
+// CSR_Darwin_AArch64_CXX_TLS should be a subset of CSR_Darwin_AArch64_TLS.
+def CSR_Darwin_AArch64_CXX_TLS
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS,
+ (sub (sequence "X%u", 1, 28), X15, X16, X17, X18),
+ (sequence "D%u", 0, 31))>;
+
+// CSRs that are handled by prologue, epilogue.
+def CSR_Darwin_AArch64_CXX_TLS_PE
+ : CalleeSavedRegs<(add LR, FP)>;
+
+// CSRs that are handled explicitly via copies.
+def CSR_Darwin_AArch64_CXX_TLS_ViaCopy
+ : CalleeSavedRegs<(sub CSR_Darwin_AArch64_CXX_TLS, LR, FP)>;
+
+def CSR_Darwin_AArch64_RT_MostRegs
+ : CalleeSavedRegs<(add CSR_Darwin_AArch64_AAPCS, (sequence "X%u", 9, 15))>;
+
// Variants of the standard calling conventions for shadow call stack.
// These all preserve x18 in addition to any other registers.
def CSR_AArch64_NoRegs_SCS
: CalleeSavedRegs<(add CSR_AArch64_NoRegs, X18)>;
def CSR_AArch64_AllRegs_SCS
: CalleeSavedRegs<(add CSR_AArch64_AllRegs, X18)>;
-def CSR_AArch64_CXX_TLS_Darwin_SCS
- : CalleeSavedRegs<(add CSR_AArch64_CXX_TLS_Darwin, X18)>;
def CSR_AArch64_AAPCS_SwiftError_SCS
: CalleeSavedRegs<(add CSR_AArch64_AAPCS_SwiftError, X18)>;
def CSR_AArch64_RT_MostRegs_SCS
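As a reading aid for the CalleeSavedRegs definitions above: the (add ...) and (sub ...) TableGen operators simply compose register sets, so variants such as CSR_AArch64_AAPCS_X18 and CSR_AArch64_AAPCS_SwiftError are unions and differences of the base AAPCS set. Below is a minimal, self-contained C++ analogue of that composition; the helper names and the abbreviated register list are invented for illustration only.

#include <cassert>
#include <set>
#include <string>

using RegSet = std::set<std::string>;

// Analogue of TableGen's (add A, B): set union.
RegSet add(RegSet A, const RegSet &B) {
  A.insert(B.begin(), B.end());
  return A;
}

// Analogue of TableGen's (sub A, B): set difference.
RegSet sub(RegSet A, const RegSet &B) {
  for (const std::string &R : B)
    A.erase(R);
  return A;
}

int main() {
  RegSet AAPCS = {"X19", "X20", "X21", "LR", "FP" /* ... */};
  // (add X18, CSR_AArch64_AAPCS)
  RegSet AAPCS_X18 = add(AAPCS, {"X18"});
  // (sub CSR_AArch64_AAPCS, X21)
  RegSet SwiftError = sub(AAPCS, {"X21"});
  assert(AAPCS_X18.count("X18") == 1);
  assert(SwiftError.count("X21") == 0);
  return 0;
}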
diff --git a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
index 688bd1b28e85..3f244ba10102 100644
--- a/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CleanupLocalDynamicTLSPass.cpp
@@ -105,6 +105,10 @@ struct LDTLSCleanup : public MachineFunctionPass {
TII->get(TargetOpcode::COPY), AArch64::X0)
.addReg(TLSBaseAddrReg);
+ // Update the call site info.
+ if (I.shouldUpdateCallSiteInfo())
+ I.getMF()->eraseCallSiteInfo(&I);
+
// Erase the TLS_base_addr instruction.
I.eraseFromParent();
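A minimal sketch of why the call-site info is dropped before the instruction itself in the hunk above: the function keeps a side table keyed by the call instruction, and deleting the instruction without clearing its entry would leave a stale key behind. This is a plain C++ analogue with invented types, not the LLVM API (the real calls are the shouldUpdateCallSiteInfo/eraseCallSiteInfo pair shown above).

#include <cassert>
#include <map>

struct Instr { int Id; };

struct Func {
  // Extra per-call information keyed by instruction pointer.
  std::map<const Instr *, int> CallSiteInfo;
  void eraseCallSiteInfo(const Instr *I) { CallSiteInfo.erase(I); }
};

void eraseCall(Func &F, Instr *I) {
  // Drop the side-table entry first; erasing the instruction alone would
  // leave a dangling key in the map.
  F.eraseCallSiteInfo(I);
  delete I;
}

int main() {
  Func F;
  Instr *Call = new Instr{7};
  F.CallSiteInfo[Call] = 42;
  eraseCall(F, Call);
  assert(F.CallSiteInfo.empty());
  return 0;
}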
diff --git a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
index 35e6fef24363..efdb1131abc9 100644
--- a/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CollectLOH.cpp
@@ -382,7 +382,7 @@ static bool handleMiddleInst(const MachineInstr &MI, LOHInfo &DefInfo,
/// Update state when seeing an ADRP instruction.
static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
- LOHInfo &Info) {
+ LOHInfo &Info, LOHInfo *LOHInfos) {
if (Info.LastADRP != nullptr) {
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdrp:\n"
<< '\t' << MI << '\t' << *Info.LastADRP);
@@ -393,12 +393,24 @@ static void handleADRP(const MachineInstr &MI, AArch64FunctionInfo &AFI,
// Produce LOH directive if possible.
if (Info.IsCandidate) {
switch (Info.Type) {
- case MCLOH_AdrpAdd:
+ case MCLOH_AdrpAdd: {
+ // ADRPs and ADDs for this candidate may be split apart when using
+ // GlobalISel instead of being pseudo-expanded. If that happens, the
+ // def register of the ADD may have a use in between. Adding an LOH in
+ // this case can cause the linker to rewrite the ADRP to write to that
+ // register, clobbering the use.
+ const MachineInstr *AddMI = Info.MI0;
+ int DefIdx = mapRegToGPRIndex(MI.getOperand(0).getReg());
+ int OpIdx = mapRegToGPRIndex(AddMI->getOperand(0).getReg());
+ LOHInfo DefInfo = LOHInfos[OpIdx];
+ if (DefIdx != OpIdx && (DefInfo.OneUser || DefInfo.MultiUsers))
+ break;
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpAdd:\n"
<< '\t' << MI << '\t' << *Info.MI0);
AFI.addLOHDirective(MCLOH_AdrpAdd, {&MI, Info.MI0});
++NumADRSimpleCandidate;
break;
+ }
case MCLOH_AdrpLdr:
if (supportLoadFromLiteral(*Info.MI0)) {
LLVM_DEBUG(dbgs() << "Adding MCLOH_AdrpLdr:\n"
@@ -522,7 +534,8 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
// Walk the basic block backwards and update the per register state machine
// in the process.
- for (const MachineInstr &MI : make_range(MBB.rbegin(), MBB.rend())) {
+ for (const MachineInstr &MI :
+ instructionsWithoutDebug(MBB.rbegin(), MBB.rend())) {
unsigned Opcode = MI.getOpcode();
switch (Opcode) {
case AArch64::ADDXri:
@@ -544,7 +557,7 @@ bool AArch64CollectLOH::runOnMachineFunction(MachineFunction &MF) {
const MachineOperand &Op0 = MI.getOperand(0);
int Idx = mapRegToGPRIndex(Op0.getReg());
if (Idx >= 0) {
- handleADRP(MI, AFI, LOHInfos[Idx]);
+ handleADRP(MI, AFI, LOHInfos[Idx], LOHInfos);
continue;
}
break;
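A small self-contained sketch of the guard added to handleADRP above: the AdrpAdd LOH is only emitted when the ADD writes the same register as the ADRP, or when the ADD's destination register has no recorded users that a linker rewrite of the ADRP could clobber. The type and function names below are invented for illustration; this is a simplified model, not the pass logic itself.

#include <cassert>

// Per-register bookkeeping, loosely modelled on the LOHInfo fields used above.
struct RegUseInfo {
  bool OneUser;
  bool MultiUsers;
};

// Returns true when it is safe to emit the AdrpAdd LOH.
bool canEmitAdrpAddLOH(int AdrpDefReg, int AddDefReg,
                       const RegUseInfo &AddDef) {
  if (AdrpDefReg == AddDefReg)
    return true; // rewriting the ADRP cannot clobber a different value
  return !(AddDef.OneUser || AddDef.MultiUsers);
}

int main() {
  assert(!canEmitAdrpAddLOH(/*x8*/ 8, /*x9*/ 9, {/*OneUser=*/true, false}));
  assert(canEmitAdrpAddLOH(8, 8, {true, false}));
  assert(canEmitAdrpAddLOH(8, 9, {false, false}));
  return 0;
}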
diff --git a/llvm/lib/Target/AArch64/AArch64Combine.td b/llvm/lib/Target/AArch64/AArch64Combine.td
index bb99f2516ecf..aa41cae289e8 100644
--- a/llvm/lib/Target/AArch64/AArch64Combine.td
+++ b/llvm/lib/Target/AArch64/AArch64Combine.td
@@ -11,8 +11,74 @@
include "llvm/Target/GlobalISel/Combine.td"
+def fconstant_to_constant : GICombineRule<
+ (defs root:$root),
+ (match (wip_match_opcode G_FCONSTANT):$root,
+ [{ return matchFConstantToConstant(*${root}, MRI); }]),
+ (apply [{ applyFConstantToConstant(*${root}); }])>;
+
def AArch64PreLegalizerCombinerHelper: GICombinerHelper<
"AArch64GenPreLegalizerCombinerHelper", [all_combines,
- elide_br_by_inverting_cond]> {
+ elide_br_by_inverting_cond,
+ fconstant_to_constant]> {
let DisableRuleOption = "aarch64prelegalizercombiner-disable-rule";
+ let StateClass = "AArch64PreLegalizerCombinerHelperState";
+ let AdditionalArguments = [];
+}
+
+// Matchdata for combines which replace a G_SHUFFLE_VECTOR with a
+// target-specific opcode.
+def shuffle_matchdata : GIDefMatchData<"ShuffleVectorPseudo">;
+
+def rev : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchREV(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def zip : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchZip(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def uzp : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchUZP(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def dup: GICombineRule <
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchDup(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def trn : GICombineRule<
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchTRN(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyShuffleVectorPseudo(*${root}, ${matchinfo}); }])
+>;
+
+def ext: GICombineRule <
+ (defs root:$root, shuffle_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SHUFFLE_VECTOR):$root,
+ [{ return matchEXT(*${root}, MRI, ${matchinfo}); }]),
+ (apply [{ applyEXT(*${root}, ${matchinfo}); }])
+>;
+
+// Combines which replace a G_SHUFFLE_VECTOR with a target-specific pseudo
+// instruction.
+def shuffle_vector_pseudos : GICombineGroup<[dup, rev, ext, zip, uzp, trn]>;
+
+def AArch64PostLegalizerCombinerHelper
+ : GICombinerHelper<"AArch64GenPostLegalizerCombinerHelper",
+ [erase_undef_store, combines_for_extload,
+ sext_already_extended, shuffle_vector_pseudos]> {
+ let DisableRuleOption = "aarch64postlegalizercombiner-disable-rule";
}
diff --git a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
index 259238705965..57dc8a4061f1 100644
--- a/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CompressJumpTables.cpp
@@ -79,7 +79,7 @@ void AArch64CompressJumpTables::scanFunction() {
for (MachineBasicBlock &MBB : *MF) {
const Align Alignment = MBB.getAlignment();
unsigned AlignedOffset;
- if (Alignment == Align::None())
+ if (Alignment == Align(1))
AlignedOffset = Offset;
else
AlignedOffset = alignTo(Offset, Alignment);
diff --git a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
index 25e23e4623de..e90e8e3da057 100644
--- a/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
+++ b/llvm/lib/Target/AArch64/AArch64CondBrTuning.cpp
@@ -194,12 +194,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
// There must not be any instruction between DefMI and MI that clobbers or
// reads NZCV.
- MachineBasicBlock::iterator I(DefMI), E(MI);
- for (I = std::next(I); I != E; ++I) {
- if (I->modifiesRegister(AArch64::NZCV, TRI) ||
- I->readsRegister(AArch64::NZCV, TRI))
- return false;
- }
+ if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI))
+ return false;
LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
LLVM_DEBUG(DefMI.print(dbgs()));
LLVM_DEBUG(dbgs() << " ");
@@ -253,12 +249,8 @@ bool AArch64CondBrTuning::tryToTuneBranch(MachineInstr &MI,
return false;
// There must not be any instruction between DefMI and MI that clobbers or
// reads NZCV.
- MachineBasicBlock::iterator I(DefMI), E(MI);
- for (I = std::next(I); I != E; ++I) {
- if (I->modifiesRegister(AArch64::NZCV, TRI) ||
- I->readsRegister(AArch64::NZCV, TRI))
- return false;
- }
+ if (isNZCVTouchedInInstructionRange(DefMI, MI, TRI))
+ return false;
LLVM_DEBUG(dbgs() << " Replacing instructions:\n ");
LLVM_DEBUG(DefMI.print(dbgs()));
LLVM_DEBUG(dbgs() << " ");
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
index 51b2ce029701..64f0bb63762d 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionOptimizer.cpp
@@ -145,11 +145,11 @@ void AArch64ConditionOptimizer::getAnalysisUsage(AnalysisUsage &AU) const {
// instructions.
MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
MachineBasicBlock *MBB) {
- MachineBasicBlock::iterator I = MBB->getFirstTerminator();
- if (I == MBB->end())
+ MachineBasicBlock::iterator Term = MBB->getFirstTerminator();
+ if (Term == MBB->end())
return nullptr;
- if (I->getOpcode() != AArch64::Bcc)
+ if (Term->getOpcode() != AArch64::Bcc)
return nullptr;
// Since we may modify cmp of this MBB, make sure NZCV does not live out.
@@ -158,32 +158,33 @@ MachineInstr *AArch64ConditionOptimizer::findSuitableCompare(
return nullptr;
// Now find the instruction controlling the terminator.
- for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
- --I;
- assert(!I->isTerminator() && "Spurious terminator");
+ for (MachineBasicBlock::iterator B = MBB->begin(), It = Term; It != B;) {
+ It = prev_nodbg(It, B);
+ MachineInstr &I = *It;
+ assert(!I.isTerminator() && "Spurious terminator");
// Check if there is any use of NZCV between CMP and Bcc.
- if (I->readsRegister(AArch64::NZCV))
+ if (I.readsRegister(AArch64::NZCV))
return nullptr;
- switch (I->getOpcode()) {
+ switch (I.getOpcode()) {
// cmp is an alias for subs with a dead destination register.
case AArch64::SUBSWri:
case AArch64::SUBSXri:
// cmn is an alias for adds with a dead destination register.
case AArch64::ADDSWri:
case AArch64::ADDSXri: {
- unsigned ShiftAmt = AArch64_AM::getShiftValue(I->getOperand(3).getImm());
- if (!I->getOperand(2).isImm()) {
- LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << *I << '\n');
+ unsigned ShiftAmt = AArch64_AM::getShiftValue(I.getOperand(3).getImm());
+ if (!I.getOperand(2).isImm()) {
+ LLVM_DEBUG(dbgs() << "Immediate of cmp is symbolic, " << I << '\n');
return nullptr;
- } else if (I->getOperand(2).getImm() << ShiftAmt >= 0xfff) {
- LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << *I
+ } else if (I.getOperand(2).getImm() << ShiftAmt >= 0xfff) {
+ LLVM_DEBUG(dbgs() << "Immediate of cmp may be out of range, " << I
<< '\n');
return nullptr;
- } else if (!MRI->use_empty(I->getOperand(0).getReg())) {
- LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << *I << '\n');
+ } else if (!MRI->use_nodbg_empty(I.getOperand(0).getReg())) {
+ LLVM_DEBUG(dbgs() << "Destination of cmp is not dead, " << I << '\n');
return nullptr;
}
- return &*I;
+ return &I;
}
// Prevent false positive case like:
// cmp w19, #0
@@ -294,12 +295,10 @@ void AArch64ConditionOptimizer::modifyCmp(MachineInstr *CmpMI,
.add(BrMI.getOperand(1));
BrMI.eraseFromParent();
- MBB->updateTerminator();
-
++NumConditionsAdjusted;
}
-// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// Parse a condition code returned by analyzeBranch, and compute the CondCode
// corresponding to TBB.
// Returns true if parsing was successful, otherwise false is returned.
static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
index 054ef8f482ca..82e8df3b73f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp
@@ -157,7 +157,7 @@ public:
MachineInstr *CmpMI;
private:
- /// The branch condition in Head as determined by AnalyzeBranch.
+ /// The branch condition in Head as determined by analyzeBranch.
SmallVector<MachineOperand, 4> HeadCond;
/// The condition code that makes Head branch to CmpBB.
@@ -267,7 +267,7 @@ bool SSACCmpConv::isDeadDef(unsigned DstReg) {
return MRI->use_nodbg_empty(DstReg);
}
-// Parse a condition code returned by AnalyzeBranch, and compute the CondCode
+// Parse a condition code returned by analyzeBranch, and compute the CondCode
// corresponding to TBB.
// Returns true if parsing was successful, otherwise false is returned.
static bool parseCond(ArrayRef<MachineOperand> Cond, AArch64CC::CondCode &CC) {
@@ -317,7 +317,7 @@ MachineInstr *SSACCmpConv::findConvertibleCompare(MachineBasicBlock *MBB) {
// Now find the instruction controlling the terminator.
for (MachineBasicBlock::iterator B = MBB->begin(); I != B;) {
- --I;
+ I = prev_nodbg(I, MBB->begin());
assert(!I->isTerminator() && "Spurious terminator");
switch (I->getOpcode()) {
// cmp is an alias for subs with a dead destination register.
@@ -509,7 +509,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
// landing pad.
if (!TBB || HeadCond.empty()) {
LLVM_DEBUG(
- dbgs() << "AnalyzeBranch didn't find conditional branch in Head.\n");
+ dbgs() << "analyzeBranch didn't find conditional branch in Head.\n");
++NumHeadBranchRejs;
return false;
}
@@ -536,7 +536,7 @@ bool SSACCmpConv::canConvert(MachineBasicBlock *MBB) {
if (!TBB || CmpBBCond.empty()) {
LLVM_DEBUG(
- dbgs() << "AnalyzeBranch didn't find conditional branch in CmpBB.\n");
+ dbgs() << "analyzeBranch didn't find conditional branch in CmpBB.\n");
++NumCmpBranchRejs;
return false;
}
@@ -710,7 +710,7 @@ void SSACCmpConv::convert(SmallVectorImpl<MachineBasicBlock *> &RemovedBlocks) {
.add(CmpMI->getOperand(1)); // Branch target.
}
CmpMI->eraseFromParent();
- Head->updateTerminator();
+ Head->updateTerminator(CmpBB->getNextNode());
RemovedBlocks.push_back(CmpBB);
CmpBB->eraseFromParent();
@@ -828,7 +828,7 @@ void AArch64ConditionalCompares::updateDomTree(
assert(Node != HeadNode && "Cannot erase the head node");
assert(Node->getIDom() == HeadNode && "CmpBB should be dominated by Head");
while (Node->getNumChildren())
- DomTree->changeImmediateDominator(Node->getChildren().back(), HeadNode);
+ DomTree->changeImmediateDominator(Node->back(), HeadNode);
DomTree->eraseNode(RemovedMBB);
}
}
diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
index 3b8f8a19fe49..9e65ad2e18f9 100644
--- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp
@@ -68,6 +68,8 @@ private:
bool expandMOVImm(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned BitSize);
+ bool expand_DestructiveOp(MachineInstr &MI, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
bool expandCMP_SWAP(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
unsigned LdarOp, unsigned StlrOp, unsigned CmpOp,
unsigned ExtendImm, unsigned ZeroReg,
@@ -78,6 +80,9 @@ private:
bool expandSetTagLoop(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
+ bool expandSVESpillFill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, unsigned Opc,
+ unsigned N);
};
} // end anonymous namespace
@@ -344,27 +349,225 @@ bool AArch64ExpandPseudo::expandCMP_SWAP_128(
return true;
}
+/// \brief Expand Pseudos to Instructions with destructive operands.
+///
+/// This mechanism uses MOVPRFX instructions for zeroing the false lanes
+/// or for fixing relaxed register allocation conditions to comply with
+/// the instructions register constraints. The latter case may be cheaper
+/// than setting the register constraints in the register allocator,
+/// since that will insert regular MOV instructions rather than MOVPRFX.
+///
+/// Example (after register allocation):
+///
+/// FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0
+///
+/// * The Pseudo FSUB_ZPZZ_ZERO_B maps to FSUB_ZPmZ_B.
+/// * We cannot map directly to FSUB_ZPmZ_B because the register
+/// constraints of the instruction are not met.
+/// * Also, the _ZERO suffix specifies that the false lanes need to be zeroed.
+///
+/// We first check whether the destructive operand == result operand;
+/// if not, we try to swap the operands, e.g.
+///
+/// FSUB_ZPmZ_B Z0, Pg/m, Z0, Z1
+///
+/// But because FSUB_ZPmZ is not commutative, this is semantically
+/// different, so we need a reverse instruction:
+///
+/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1
+///
+/// Then we implement the zeroing of the false lanes of Z0 by adding
+/// a zeroing MOVPRFX instruction:
+///
+/// MOVPRFX_ZPzZ_B Z0, Pg/z, Z0
+/// FSUBR_ZPmZ_B Z0, Pg/m, Z0, Z1
+///
+/// Note that this can only be done for the _ZERO or _UNDEF variants, where
+/// we can guarantee that the false lanes are zeroed (by emitting the zeroing
+/// MOVPRFX) or that they are undef (don't care / not used); otherwise,
+/// swapping the operands is illegal because the operation is not
+/// (or cannot be emulated to be) fully commutative.
+bool AArch64ExpandPseudo::expand_DestructiveOp(
+ MachineInstr &MI,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ unsigned Opcode = AArch64::getSVEPseudoMap(MI.getOpcode());
+ uint64_t DType = TII->get(Opcode).TSFlags & AArch64::DestructiveInstTypeMask;
+ uint64_t FalseLanes = MI.getDesc().TSFlags & AArch64::FalseLanesMask;
+ bool FalseZero = FalseLanes == AArch64::FalseLanesZero;
+
+ unsigned DstReg = MI.getOperand(0).getReg();
+ bool DstIsDead = MI.getOperand(0).isDead();
+
+ if (DType == AArch64::DestructiveBinary)
+ assert(DstReg != MI.getOperand(3).getReg());
+
+ bool UseRev = false;
+ unsigned PredIdx, DOPIdx, SrcIdx;
+ switch (DType) {
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1
+ std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 3, 2);
+ UseRev = true;
+ break;
+ }
+ LLVM_FALLTHROUGH;
+ case AArch64::DestructiveBinary:
+ case AArch64::DestructiveBinaryImm:
+ std::tie(PredIdx, DOPIdx, SrcIdx) = std::make_tuple(1, 2, 3);
+ break;
+ default:
+ llvm_unreachable("Unsupported Destructive Operand type");
+ }
+
+#ifndef NDEBUG
+ // MOVPRFX can only be used if the destination operand
+ // is the destructive operand, not as any other operand,
+ // so the Destructive Operand must be unique.
+ bool DOPRegIsUnique = false;
+ switch (DType) {
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ DOPRegIsUnique =
+ DstReg != MI.getOperand(DOPIdx).getReg() ||
+ MI.getOperand(DOPIdx).getReg() != MI.getOperand(SrcIdx).getReg();
+ break;
+ case AArch64::DestructiveBinaryImm:
+ DOPRegIsUnique = true;
+ break;
+ }
+#endif
+
+ // Resolve the reverse opcode
+ if (UseRev) {
+ int NewOpcode;
+ // e.g. DIV -> DIVR
+ if ((NewOpcode = AArch64::getSVERevInstr(Opcode)) != -1)
+ Opcode = NewOpcode;
+ // e.g. DIVR -> DIV
+ else if ((NewOpcode = AArch64::getSVENonRevInstr(Opcode)) != -1)
+ Opcode = NewOpcode;
+ }
+
+ // Get the right MOVPRFX
+ uint64_t ElementSize = TII->getElementSizeForOpcode(Opcode);
+ unsigned MovPrfx, MovPrfxZero;
+ switch (ElementSize) {
+ case AArch64::ElementSizeNone:
+ case AArch64::ElementSizeB:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_B;
+ break;
+ case AArch64::ElementSizeH:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_H;
+ break;
+ case AArch64::ElementSizeS:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_S;
+ break;
+ case AArch64::ElementSizeD:
+ MovPrfx = AArch64::MOVPRFX_ZZ;
+ MovPrfxZero = AArch64::MOVPRFX_ZPzZ_D;
+ break;
+ default:
+ llvm_unreachable("Unsupported ElementSize");
+ }
+
+ //
+ // Create the destructive operation (if required)
+ //
+ MachineInstrBuilder PRFX, DOP;
+ if (FalseZero) {
+#ifndef NDEBUG
+ assert(DOPRegIsUnique && "The destructive operand should be unique");
+#endif
+ assert(ElementSize != AArch64::ElementSizeNone &&
+ "This instruction is unpredicated");
+
+ // Merge source operand into destination register
+ PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfxZero))
+ .addReg(DstReg, RegState::Define)
+ .addReg(MI.getOperand(PredIdx).getReg())
+ .addReg(MI.getOperand(DOPIdx).getReg());
+
+ // After the movprfx, the destructive operand is the same as Dst.
+ DOPIdx = 0;
+ } else if (DstReg != MI.getOperand(DOPIdx).getReg()) {
+#ifndef NDEBUG
+ assert(DOPRegIsUnique && "The destructive operand should be unique");
+#endif
+ PRFX = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(MovPrfx))
+ .addReg(DstReg, RegState::Define)
+ .addReg(MI.getOperand(DOPIdx).getReg());
+ DOPIdx = 0;
+ }
+
+ //
+ // Create the destructive operation
+ //
+ DOP = BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opcode))
+ .addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead));
+
+ switch (DType) {
+ case AArch64::DestructiveBinaryImm:
+ case AArch64::DestructiveBinaryComm:
+ case AArch64::DestructiveBinaryCommWithRev:
+ DOP.add(MI.getOperand(PredIdx))
+ .addReg(MI.getOperand(DOPIdx).getReg(), RegState::Kill)
+ .add(MI.getOperand(SrcIdx));
+ break;
+ }
+
+ if (PRFX) {
+ finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator());
+ transferImpOps(MI, PRFX, DOP);
+ } else
+ transferImpOps(MI, DOP, DOP);
+
+ MI.eraseFromParent();
+ return true;
+}
+
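The operand-selection rule described in the comment of expand_DestructiveOp above, reduced to a self-contained sketch. This is a simplified model, not the pass itself; the struct and function names are invented, and only the commutative-with-reverse binary case is shown.

#include <cassert>

// What the expansion has to decide for a predicated, destructive binary
// operation Zd = op(Pg, Zs1, Zs2) that has a reverse form (e.g. FSUB/FSUBR).
struct Expansion {
  bool UseReverseOpcode;  // swap operands and use e.g. FSUBR instead of FSUB
  bool NeedMovPrfx;       // emit a (possibly zeroing) MOVPRFX first
  int DestructiveOperand; // source register that is tied to the destination
};

Expansion pickExpansion(int DstReg, int Src1, int Src2, bool ZeroFalseLanes) {
  Expansion E{};
  if (DstReg == Src2) {
    // FSUB Zd, Pg, Zs1, Zd ==> FSUBR Zd, Pg/m, Zd, Zs1
    E.UseReverseOpcode = true;
    E.DestructiveOperand = Src2;
  } else {
    E.DestructiveOperand = Src1;
  }
  // A MOVPRFX is needed either to zero the false lanes, or to move the
  // destructive operand into the destination register when they differ.
  E.NeedMovPrfx = ZeroFalseLanes || DstReg != E.DestructiveOperand;
  return E;
}

int main() {
  // The FSUB_ZPZZ_ZERO_B Z0, Pg, Z1, Z0 example from the comment above.
  Expansion E = pickExpansion(/*Z0*/ 0, /*Z1*/ 1, /*Z0*/ 0, /*Zero=*/true);
  assert(E.UseReverseOpcode && E.NeedMovPrfx && E.DestructiveOperand == 0);
  return 0;
}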
bool AArch64ExpandPseudo::expandSetTagLoop(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
DebugLoc DL = MI.getDebugLoc();
- Register SizeReg = MI.getOperand(2).getReg();
- Register AddressReg = MI.getOperand(3).getReg();
+ Register SizeReg = MI.getOperand(0).getReg();
+ Register AddressReg = MI.getOperand(1).getReg();
MachineFunction *MF = MBB.getParent();
- bool ZeroData = MI.getOpcode() == AArch64::STZGloop;
- const unsigned OpCode =
+ bool ZeroData = MI.getOpcode() == AArch64::STZGloop_wback;
+ const unsigned OpCode1 =
+ ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex;
+ const unsigned OpCode2 =
ZeroData ? AArch64::STZ2GPostIndex : AArch64::ST2GPostIndex;
+ unsigned Size = MI.getOperand(2).getImm();
+ assert(Size > 0 && Size % 16 == 0);
+ if (Size % (16 * 2) != 0) {
+ BuildMI(MBB, MBBI, DL, TII->get(OpCode1), AddressReg)
+ .addReg(AddressReg)
+ .addReg(AddressReg)
+ .addImm(1);
+ Size -= 16;
+ }
+ MachineBasicBlock::iterator I =
+ BuildMI(MBB, MBBI, DL, TII->get(AArch64::MOVi64imm), SizeReg)
+ .addImm(Size);
+ expandMOVImm(MBB, I, 64);
+
auto LoopBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
auto DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
MF->insert(++MBB.getIterator(), LoopBB);
MF->insert(++LoopBB->getIterator(), DoneBB);
- BuildMI(LoopBB, DL, TII->get(OpCode))
+ BuildMI(LoopBB, DL, TII->get(OpCode2))
.addDef(AddressReg)
.addReg(AddressReg)
.addReg(AddressReg)
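A minimal sketch of the size arithmetic used by the rewritten expandSetTagLoop above, assuming Size is the immediate operand of the writeback pseudo: an odd number of 16-byte granules is handled with one single-granule post-indexed store before the loop, and the remainder is written 32 bytes per loop iteration. The names below are invented for illustration.

#include <cassert>
#include <cstdint>

struct TagLoopPlan {
  bool LeadingSingleStore; // one 16-byte STG/STZG post-indexed before the loop
  uint64_t LoopBytes;      // remainder, written 32 bytes per ST2G/STZ2G iteration
};

TagLoopPlan planSetTagLoop(uint64_t Size) {
  assert(Size > 0 && Size % 16 == 0);
  TagLoopPlan P{false, 0};
  if (Size % 32 != 0) { // odd number of 16-byte granules
    P.LeadingSingleStore = true;
    Size -= 16;
  }
  P.LoopBytes = Size;
  return P;
}

int main() {
  TagLoopPlan P = planSetTagLoop(80); // 80 = 16 + 2 * 32
  assert(P.LeadingSingleStore && P.LoopBytes == 64);
  TagLoopPlan Q = planSetTagLoop(64);
  assert(!Q.LeadingSingleStore && Q.LoopBytes == 64);
  return 0;
}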
@@ -402,6 +605,28 @@ bool AArch64ExpandPseudo::expandSetTagLoop(
return true;
}
+bool AArch64ExpandPseudo::expandSVESpillFill(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned Opc, unsigned N) {
+ const TargetRegisterInfo *TRI =
+ MBB.getParent()->getSubtarget().getRegisterInfo();
+ MachineInstr &MI = *MBBI;
+ for (unsigned Offset = 0; Offset < N; ++Offset) {
+ int ImmOffset = MI.getOperand(2).getImm() + Offset;
+ bool Kill = (Offset + 1 == N) ? MI.getOperand(1).isKill() : false;
+ assert(ImmOffset >= -256 && ImmOffset < 256 &&
+ "Immediate spill offset out of range");
+ BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(Opc))
+ .addReg(
+ TRI->getSubReg(MI.getOperand(0).getReg(), AArch64::zsub0 + Offset),
+ Opc == AArch64::LDR_ZXI ? RegState::Define : 0)
+ .addReg(MI.getOperand(1).getReg(), getKillRegState(Kill))
+ .addImm(ImmOffset);
+ }
+ MI.eraseFromParent();
+ return true;
+}
+
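A self-contained sketch of the unrolling pattern in expandSVESpillFill above: N consecutive single-register accesses with incrementing sub-register index and immediate offset, where the base pointer is only marked killed on the last access. The helper and field names are invented; this models the bookkeeping, not the instruction emission.

#include <cassert>
#include <vector>

struct UnrolledAccess {
  unsigned SubRegIndex; // zsub0 + I in the expansion above
  int ImmOffset;        // base immediate + I (in vector-register units)
  bool KillBase;        // base pointer killed only on the last access
};

std::vector<UnrolledAccess> unrollSpillFill(int BaseImm, unsigned N,
                                            bool BaseIsKilled) {
  std::vector<UnrolledAccess> Out;
  for (unsigned I = 0; I < N; ++I) {
    int Imm = BaseImm + static_cast<int>(I);
    assert(Imm >= -256 && Imm < 256 && "Immediate spill offset out of range");
    Out.push_back({I, Imm, (I + 1 == N) && BaseIsKilled});
  }
  return Out;
}

int main() {
  auto A = unrollSpillFill(/*BaseImm=*/-4, /*N=*/4, /*BaseIsKilled=*/true);
  assert(A.size() == 4 && !A.front().KillBase && A.back().KillBase);
  assert(A.back().ImmOffset == -1);
  return 0;
}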
/// If MBBI references a pseudo instruction that should be expanded here,
/// do the expansion and return true. Otherwise return false.
bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
@@ -409,10 +634,76 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &NextMBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
+
+ // Check if we can expand the destructive op
+ int OrigInstr = AArch64::getSVEPseudoMap(MI.getOpcode());
+ if (OrigInstr != -1) {
+ auto &Orig = TII->get(OrigInstr);
+ if ((Orig.TSFlags & AArch64::DestructiveInstTypeMask)
+ != AArch64::NotDestructive) {
+ return expand_DestructiveOp(MI, MBB, MBBI);
+ }
+ }
+
switch (Opcode) {
default:
break;
+ case AArch64::BSPv8i8:
+ case AArch64::BSPv16i8: {
+ Register DstReg = MI.getOperand(0).getReg();
+ if (DstReg == MI.getOperand(3).getReg()) {
+ // Expand to BIT
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BITv8i8
+ : AArch64::BITv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(1));
+ } else if (DstReg == MI.getOperand(2).getReg()) {
+ // Expand to BIF
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BIFv8i8
+ : AArch64::BIFv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3))
+ .add(MI.getOperand(1));
+ } else {
+ // Expand to BSL, use additional move if required
+ if (DstReg == MI.getOperand(1).getReg()) {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ } else {
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::ORRv8i8
+ : AArch64::ORRv16i8))
+ .addReg(DstReg,
+ RegState::Define |
+ getRenamableRegState(MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(1))
+ .add(MI.getOperand(1));
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Opcode == AArch64::BSPv8i8 ? AArch64::BSLv8i8
+ : AArch64::BSLv16i8))
+ .add(MI.getOperand(0))
+ .addReg(DstReg,
+ RegState::Kill |
+ getRenamableRegState(MI.getOperand(0).isRenamable()))
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ }
+ }
+ MI.eraseFromParent();
+ return true;
+ }
+
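A minimal model of how the BSP pseudo expansion above picks its target instruction, based purely on which source operand the destination register aliases. This is a sketch with invented names, not the lowering itself; Dst is operand 0 and Op1..Op3 are the three BSP sources in the order they appear in the pseudo.

#include <cassert>

enum class BspLowering { BIT, BIF, BSL, ORR_then_BSL };

BspLowering selectBspLowering(int Dst, int Op1, int Op2, int Op3) {
  if (Dst == Op3)
    return BspLowering::BIT; // destination already holds the third operand
  if (Dst == Op2)
    return BspLowering::BIF; // destination already holds the second operand
  if (Dst == Op1)
    return BspLowering::BSL; // destination already holds the first operand
  return BspLowering::ORR_then_BSL; // copy Op1 into Dst with ORR, then BSL
}

int main() {
  assert(selectBspLowering(0, 1, 2, 0) == BspLowering::BIT);
  assert(selectBspLowering(0, 1, 0, 3) == BspLowering::BIF);
  assert(selectBspLowering(0, 0, 2, 3) == BspLowering::BSL);
  assert(selectBspLowering(0, 1, 2, 3) == BspLowering::ORR_then_BSL);
  return 0;
}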
case AArch64::ADDWrr:
case AArch64::SUBWrr:
case AArch64::ADDXrr:
@@ -599,10 +890,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
Register DstReg = MI.getOperand(0).getReg();
auto SysReg = AArch64SysReg::TPIDR_EL0;
MachineFunction *MF = MBB.getParent();
- if (MF->getTarget().getTargetTriple().isOSFuchsia() &&
- MF->getTarget().getCodeModel() == CodeModel::Kernel)
- SysReg = AArch64SysReg::TPIDR_EL1;
- else if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP())
+ if (MF->getSubtarget<AArch64Subtarget>().useEL3ForTP())
SysReg = AArch64SysReg::TPIDR_EL3;
else if (MF->getSubtarget<AArch64Subtarget>().useEL2ForTP())
SysReg = AArch64SysReg::TPIDR_EL2;
@@ -676,7 +964,7 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
// almost always point to SP-after-prologue; if not, emit a longer
// instruction sequence.
int BaseOffset = -AFI->getTaggedBasePointerOffset();
- unsigned FrameReg;
+ Register FrameReg;
StackOffset FrameRegOffset = TFI->resolveFrameOffsetReference(
MF, BaseOffset, false /*isFixed*/, false /*isSVE*/, FrameReg,
/*PreferFP=*/false,
@@ -706,9 +994,26 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB,
MI.eraseFromParent();
return true;
}
+ case AArch64::STGloop_wback:
+ case AArch64::STZGloop_wback:
+ return expandSetTagLoop(MBB, MBBI, NextMBBI);
case AArch64::STGloop:
case AArch64::STZGloop:
- return expandSetTagLoop(MBB, MBBI, NextMBBI);
+ report_fatal_error(
+ "Non-writeback variants of STGloop / STZGloop should not "
+ "survive past PrologEpilogInserter.");
+ case AArch64::STR_ZZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4);
+ case AArch64::STR_ZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3);
+ case AArch64::STR_ZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2);
+ case AArch64::LDR_ZZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4);
+ case AArch64::LDR_ZZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3);
+ case AArch64::LDR_ZZXI:
+ return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2);
}
return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
index c1fc183b04f6..538863ebe95a 100644
--- a/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FalkorHWPFFix.cpp
@@ -823,9 +823,6 @@ bool FalkorHWPFFix::runOnMachineFunction(MachineFunction &Fn) {
TII = static_cast<const AArch64InstrInfo *>(ST.getInstrInfo());
TRI = ST.getRegisterInfo();
- assert(TRI->trackLivenessAfterRegAlloc(Fn) &&
- "Register liveness not available!");
-
MachineLoopInfo &LI = getAnalysis<MachineLoopInfo>();
Modified = false;
diff --git a/llvm/lib/Target/AArch64/AArch64FastISel.cpp b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
index 7e9c68f2bb30..0f63f4ca62e5 100644
--- a/llvm/lib/Target/AArch64/AArch64FastISel.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FastISel.cpp
@@ -434,11 +434,9 @@ unsigned AArch64FastISel::materializeFP(const ConstantFP *CFP, MVT VT) {
// Materialize via constant pool. MachineConstantPool wants an explicit
// alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0)
- Align = DL.getTypeAllocSize(CFP->getType());
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
- unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ unsigned CPI = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
unsigned ADRPReg = createResultReg(&AArch64::GPR64commonRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AArch64::ADRP),
ADRPReg).addConstantPoolIndex(CPI, 0, AArch64II::MO_PAGE);
@@ -1130,7 +1128,7 @@ void AArch64FastISel::addLoadStoreOperands(Address &Addr,
// and alignment should be based on the VT.
MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI).addImm(Offset);
} else {
@@ -3137,7 +3135,7 @@ bool AArch64FastISel::processCallArgs(CallLoweringInfo &CLI,
Addr.setReg(AArch64::SP);
Addr.setOffset(VA.getLocMemOffset() + BEAlign);
- unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ Align Alignment = DL.getABITypeAlign(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
@@ -3272,7 +3270,8 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Issue the call.
MachineInstrBuilder MIB;
if (Subtarget->useSmallAddressing()) {
- const MCInstrDesc &II = TII.get(Addr.getReg() ? AArch64::BLR : AArch64::BL);
+ const MCInstrDesc &II =
+ TII.get(Addr.getReg() ? getBLRCallOpcode(*MF) : (unsigned)AArch64::BL);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II);
if (Symbol)
MIB.addSym(Symbol, 0);
@@ -3305,7 +3304,7 @@ bool AArch64FastISel::fastLowerCall(CallLoweringInfo &CLI) {
if (!CallReg)
return false;
- const MCInstrDesc &II = TII.get(AArch64::BLR);
+ const MCInstrDesc &II = TII.get(getBLRCallOpcode(*MF));
CallReg = constrainOperandRegClass(II, CallReg, 0);
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II).addReg(CallReg);
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
index ea3e800a1ad2..efa3fd5ca9ce 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp
@@ -170,8 +170,45 @@ static cl::opt<bool>
cl::desc("reverse the CSR restore sequence"),
cl::init(false), cl::Hidden);
+static cl::opt<bool> StackTaggingMergeSetTag(
+ "stack-tagging-merge-settag",
+ cl::desc("merge settag instruction in function epilog"), cl::init(true),
+ cl::Hidden);
+
STATISTIC(NumRedZoneFunctions, "Number of functions using red zone");
+/// Returns the argument pop size.
+static uint64_t getArgumentPopSize(MachineFunction &MF,
+ MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator MBBI = MBB.getLastNonDebugInstr();
+ bool IsTailCallReturn = false;
+ if (MBB.end() != MBBI) {
+ unsigned RetOpcode = MBBI->getOpcode();
+ IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
+ RetOpcode == AArch64::TCRETURNri ||
+ RetOpcode == AArch64::TCRETURNriBTI;
+ }
+ AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
+
+ uint64_t ArgumentPopSize = 0;
+ if (IsTailCallReturn) {
+ MachineOperand &StackAdjust = MBBI->getOperand(1);
+
+ // For a tail-call in a callee-pops-arguments environment, some or all of
+ // the stack may actually be in use for the call's arguments; this is
+ // calculated during LowerCall and consumed here...
+ ArgumentPopSize = StackAdjust.getImm();
+ } else {
+ // ... otherwise the amount to pop is *all* of the argument space,
+ // conveniently stored in the MachineFunctionInfo by
+ // LowerFormalArguments. This will, of course, be zero for the C calling
+ // convention.
+ ArgumentPopSize = AFI->getArgumentStackToRestore();
+ }
+
+ return ArgumentPopSize;
+}
+
/// This is the biggest offset to the stack pointer we can encode in aarch64
/// instructions (without using a separate calculation and a temp register).
/// Note that the exception here are vector stores/loads which cannot encode any
@@ -211,6 +248,24 @@ AArch64FrameLowering::getStackIDForScalableVectors() const {
return TargetStackID::SVEVector;
}
+/// Returns the size of the fixed object area (allocated next to sp on entry)
+/// On Win64 this may include a var args area and an UnwindHelp object for EH.
+static unsigned getFixedObjectSize(const MachineFunction &MF,
+ const AArch64FunctionInfo *AFI, bool IsWin64,
+ bool IsFunclet) {
+ if (!IsWin64 || IsFunclet) {
+ // Only Win64 uses fixed objects, and then only for the function (not
+ // funclets)
+ return 0;
+ } else {
+ // Var args are stored here in the primary function.
+ const unsigned VarArgsArea = AFI->getVarArgsGPRSize();
+ // To support EH funclets we allocate an UnwindHelp object
+ const unsigned UnwindHelpObject = (MF.hasEHFunclets() ? 8 : 0);
+ return alignTo(VarArgsArea + UnwindHelpObject, 16);
+ }
+}
+
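A self-contained sketch of the arithmetic in getFixedObjectSize above, assuming the 8-byte UnwindHelp slot and 16-byte rounding stated in the comments; the helper names are invented and the function is only an illustration of the computation.

#include <cassert>
#include <cstdint>

uint64_t alignTo16(uint64_t X) { return (X + 15) & ~uint64_t(15); }

// Only the primary Win64 frame owns the fixed-object area; it holds the saved
// var-args GPRs plus an optional 8-byte UnwindHelp slot, rounded up to 16.
uint64_t fixedObjectSize(bool IsWin64, bool IsFunclet, uint64_t VarArgsGPRSize,
                         bool HasEHFunclets) {
  if (!IsWin64 || IsFunclet)
    return 0;
  uint64_t UnwindHelp = HasEHFunclets ? 8 : 0;
  return alignTo16(VarArgsGPRSize + UnwindHelp);
}

int main() {
  assert(fixedObjectSize(true, false, 56, true) == 64); // 56 + 8 -> 64
  assert(fixedObjectSize(true, false, 0, false) == 0);
  assert(fixedObjectSize(false, false, 56, true) == 0); // not Win64
  return 0;
}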
/// Returns the size of the entire SVE stackframe (calleesaves + spills).
static StackOffset getSVEStackSize(const MachineFunction &MF) {
const AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
@@ -286,10 +341,8 @@ MachineBasicBlock::iterator AArch64FrameLowering::eliminateCallFramePseudoInstr(
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
if (!hasReservedCallFrame(MF)) {
- unsigned Align = getStackAlignment();
-
int64_t Amount = I->getOperand(0).getImm();
- Amount = alignTo(Amount, Align);
+ Amount = alignTo(Amount, getStackAlign());
if (!IsDestroy)
Amount = -Amount;
@@ -480,6 +533,39 @@ bool AArch64FrameLowering::shouldCombineCSRLocalStackBump(
return true;
}
+bool AArch64FrameLowering::shouldCombineCSRLocalStackBumpInEpilogue(
+ MachineBasicBlock &MBB, unsigned StackBumpBytes) const {
+ if (!shouldCombineCSRLocalStackBump(*MBB.getParent(), StackBumpBytes))
+ return false;
+
+ if (MBB.empty())
+ return true;
+
+ // Disable combined SP bump if the last instruction is an MTE tag store. It
+ // is almost always better to merge SP adjustment into those instructions.
+ MachineBasicBlock::iterator LastI = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator Begin = MBB.begin();
+ while (LastI != Begin) {
+ --LastI;
+ if (LastI->isTransient())
+ continue;
+ if (!LastI->getFlag(MachineInstr::FrameDestroy))
+ break;
+ }
+ switch (LastI->getOpcode()) {
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
+ case AArch64::STGOffset:
+ case AArch64::STZGOffset:
+ case AArch64::ST2GOffset:
+ case AArch64::STZ2GOffset:
+ return false;
+ default:
+ return true;
+ }
+ llvm_unreachable("unreachable");
+}
+
// Given a load or a store instruction, generate an appropriate unwinding SEH
// code on Windows.
static MachineBasicBlock::iterator InsertSEH(MachineBasicBlock::iterator MBBI,
@@ -940,11 +1026,11 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Label used to tie together the PROLOG_LABEL and the MachineMoves.
MCSymbol *FrameLabel = MMI.getContext().createTempSymbol();
// Encode the stack size of the leaf function.
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(FrameLabel, -NumBytes));
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex)
- .setMIFlags(MachineInstr::FrameSetup);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfaOffset(FrameLabel, NumBytes));
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex)
+ .setMIFlags(MachineInstr::FrameSetup);
}
}
@@ -959,10 +1045,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
- // Var args are accounted for in the containing function, so don't
- // include them for funclets.
- unsigned FixedObject = (IsWin64 && !IsFunclet) ?
- alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
// All of the remaining stack allocations are for locals.
@@ -993,32 +1076,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
++MBBI;
}
- // The code below is not applicable to funclets. We have emitted all the SEH
- // opcodes that we needed to emit. The FP and BP belong to the containing
- // function.
- if (IsFunclet) {
- if (NeedsWinCFI) {
- HasWinCFI = true;
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_PrologEnd))
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
- // SEH funclets are passed the frame pointer in X1. If the parent
- // function uses the base register, then the base register is used
- // directly, and is not retrieved from X1.
- if (F.hasPersonalityFn()) {
- EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
- if (isAsynchronousEHPersonality(Per)) {
- BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
- .addReg(AArch64::X1).setMIFlag(MachineInstr::FrameSetup);
- MBB.addLiveIn(AArch64::X1);
- }
- }
-
- return;
- }
-
- if (HasFP) {
+ // For funclets the FP belongs to the containing function.
+ if (!IsFunclet && HasFP) {
// Only set up FP if we actually need to.
int64_t FPOffset = isTargetDarwin(MF) ? (AFI->getCalleeSavedStackSize() - 16) : 0;
@@ -1099,7 +1158,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
- BuildMI(MBB, MBBI, DL, TII->get(AArch64::BLR))
+ BuildMI(MBB, MBBI, DL, TII->get(getBLRCallOpcode(MF)))
.addReg(AArch64::X16, RegState::Kill)
.addReg(AArch64::X15, RegState::Implicit | RegState::Define)
.addReg(AArch64::X16, RegState::Implicit | RegState::Define | RegState::Dead)
@@ -1161,7 +1220,9 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// Allocate space for the rest of the frame.
if (NumBytes) {
- const bool NeedsRealignment = RegInfo->needsStackRealignment(MF);
+ // Alignment is required for the parent frame, not the funclet
+ const bool NeedsRealignment =
+ !IsFunclet && RegInfo->needsStackRealignment(MF);
unsigned scratchSPReg = AArch64::SP;
if (NeedsRealignment) {
@@ -1179,8 +1240,7 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
false, NeedsWinCFI, &HasWinCFI);
if (NeedsRealignment) {
- const unsigned Alignment = MFI.getMaxAlignment();
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
assert(NrBitsToZero > 1);
assert(scratchSPReg != AArch64::SP);
@@ -1215,7 +1275,8 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
// FIXME: Clarify FrameSetup flags here.
// Note: Use emitFrameOffset() like above for FP if the FrameSetup flag is
// needed.
- if (RegInfo->hasBasePointer(MF)) {
+ // For funclets the BP belongs to the containing function.
+ if (!IsFunclet && RegInfo->hasBasePointer(MF)) {
TII->copyPhysReg(MBB, MBBI, DL, RegInfo->getBaseRegister(), AArch64::SP,
false);
if (NeedsWinCFI) {
@@ -1232,6 +1293,19 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlag(MachineInstr::FrameSetup);
}
+ // SEH funclets are passed the frame pointer in X1. If the parent
+ // function uses the base register, then the base register is used
+ // directly, and is not retrieved from X1.
+ if (IsFunclet && F.hasPersonalityFn()) {
+ EHPersonality Per = classifyEHPersonality(F.getPersonalityFn());
+ if (isAsynchronousEHPersonality(Per)) {
+ BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::COPY), AArch64::FP)
+ .addReg(AArch64::X1)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MBB.addLiveIn(AArch64::X1);
+ }
+ }
+
if (needsFrameMoves) {
const DataLayout &TD = MF.getDataLayout();
const int StackGrowth = isTargetDarwin(MF)
@@ -1307,15 +1381,15 @@ void AArch64FrameLowering::emitPrologue(MachineFunction &MF,
if (HasFP) {
// Define the current CFA rule to use the provided FP.
unsigned Reg = RegInfo->getDwarfRegNum(FramePtr, true);
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
- nullptr, Reg, StackGrowth - FixedObject));
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::cfiDefCfa(nullptr, Reg, FixedObject - StackGrowth));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
} else {
// Encode the stack size of the leaf function.
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -1374,7 +1448,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL;
- bool IsTailCallReturn = false;
bool NeedsWinCFI = needsWinCFI(MF);
bool HasWinCFI = false;
bool IsFunclet = false;
@@ -1385,10 +1458,6 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
if (MBB.end() != MBBI) {
DL = MBBI->getDebugLoc();
- unsigned RetOpcode = MBBI->getOpcode();
- IsTailCallReturn = RetOpcode == AArch64::TCRETURNdi ||
- RetOpcode == AArch64::TCRETURNri ||
- RetOpcode == AArch64::TCRETURNriBTI;
IsFunclet = isFuncletReturnInstr(*MBBI);
}
@@ -1403,21 +1472,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// Initial and residual are named for consistency with the prologue. Note that
// in the epilogue, the residual adjustment is executed first.
- uint64_t ArgumentPopSize = 0;
- if (IsTailCallReturn) {
- MachineOperand &StackAdjust = MBBI->getOperand(1);
-
- // For a tail-call in a callee-pops-arguments environment, some or all of
- // the stack may actually be in use for the call's arguments, this is
- // calculated during LowerCall and consumed here...
- ArgumentPopSize = StackAdjust.getImm();
- } else {
- // ... otherwise the amount to pop is *all* of the argument space,
- // conveniently stored in the MachineFunctionInfo by
- // LowerFormalArguments. This will, of course, be zero for the C calling
- // convention.
- ArgumentPopSize = AFI->getArgumentStackToRestore();
- }
+ uint64_t ArgumentPopSize = getArgumentPopSize(MF, MBB);
// The stack frame should be like below,
//
@@ -1450,10 +1505,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
- // Var args are accounted for in the containing function, so don't
- // include them for funclets.
- unsigned FixedObject =
- (IsWin64 && !IsFunclet) ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+ unsigned FixedObject = getFixedObjectSize(MF, AFI, IsWin64, IsFunclet);
uint64_t AfterCSRPopSize = ArgumentPopSize;
auto PrologueSaveSize = AFI->getCalleeSavedStackSize() + FixedObject;
@@ -1463,7 +1515,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
// function.
if (MF.hasEHFunclets())
AFI->setLocalStackSize(NumBytes - PrologueSaveSize);
- bool CombineSPBump = shouldCombineCSRLocalStackBump(MF, NumBytes);
+ bool CombineSPBump = shouldCombineCSRLocalStackBumpInEpilogue(MBB, NumBytes);
// Assume we can't combine the last pop with the sp restore.
if (!CombineSPBump && PrologueSaveSize != 0) {
@@ -1660,7 +1712,7 @@ void AArch64FrameLowering::emitEpilogue(MachineFunction &MF,
/// SP-relative and simple call frames aren't used.
int AArch64FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
return resolveFrameIndexReference(
MF, FI, FrameReg,
/*PreferFP=*/
@@ -1679,7 +1731,9 @@ static StackOffset getFPOffset(const MachineFunction &MF, int64_t ObjectOffset)
const auto &Subtarget = MF.getSubtarget<AArch64Subtarget>();
bool IsWin64 =
Subtarget.isCallingConvWin64(MF.getFunction().getCallingConv());
- unsigned FixedObject = IsWin64 ? alignTo(AFI->getVarArgsGPRSize(), 16) : 0;
+
+ unsigned FixedObject =
+ getFixedObjectSize(MF, AFI, IsWin64, /*IsFunclet=*/false);
unsigned FPAdjust = isTargetDarwin(MF)
? 16 : AFI->getCalleeSavedStackSize(MF.getFrameInfo());
return {ObjectOffset + FixedObject + FPAdjust, MVT::i8};
@@ -1701,7 +1755,7 @@ int AArch64FrameLowering::getSEHFrameIndexOffset(const MachineFunction &MF,
}
StackOffset AArch64FrameLowering::resolveFrameIndexReference(
- const MachineFunction &MF, int FI, unsigned &FrameReg, bool PreferFP,
+ const MachineFunction &MF, int FI, Register &FrameReg, bool PreferFP,
bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
int64_t ObjectOffset = MFI.getObjectOffset(FI);
@@ -1713,7 +1767,7 @@ StackOffset AArch64FrameLowering::resolveFrameIndexReference(
StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
const MachineFunction &MF, int64_t ObjectOffset, bool isFixed, bool isSVE,
- unsigned &FrameReg, bool PreferFP, bool ForSimm) const {
+ Register &FrameReg, bool PreferFP, bool ForSimm) const {
const auto &MFI = MF.getFrameInfo();
const auto *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
@@ -1764,10 +1818,8 @@ StackOffset AArch64FrameLowering::resolveFrameOffsetReference(
bool CanUseBP = RegInfo->hasBasePointer(MF);
if (FPOffsetFits && CanUseBP) // Both are ok. Pick the best.
UseFP = PreferFP;
- else if (!CanUseBP) { // Can't use BP. Forced to use FP.
- assert(!SVEStackSize && "Expected BP to be available");
+ else if (!CanUseBP) // Can't use BP. Forced to use FP.
UseFP = true;
- }
// else we can use BP and FP, but the offset from FP won't fit.
// That will make us scavenge registers which we can probably avoid by
// using BP. If it won't fit for BP either, we'll scavenge anyway.
@@ -1933,7 +1985,7 @@ struct RegPairInfo {
} // end anonymous namespace
static void computeCalleeSaveRegisterPairs(
- MachineFunction &MF, const std::vector<CalleeSavedInfo> &CSI,
+ MachineFunction &MF, ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI, SmallVectorImpl<RegPairInfo> &RegPairs,
bool &NeedShadowCallStackProlog, bool NeedsFrameRecord) {
@@ -2058,8 +2110,8 @@ static void computeCalleeSaveRegisterPairs(
FixupDone = true;
ByteOffset -= 8;
assert(ByteOffset % 16 == 0);
- assert(MFI.getObjectAlignment(RPI.FrameIdx) <= 16);
- MFI.setObjectAlignment(RPI.FrameIdx, 16);
+ assert(MFI.getObjectAlign(RPI.FrameIdx) <= Align(16));
+ MFI.setObjectAlignment(RPI.FrameIdx, Align(16));
}
int Offset = RPI.isScalable() ? ScalableByteOffset : ByteOffset;
@@ -2078,8 +2130,7 @@ static void computeCalleeSaveRegisterPairs(
bool AArch64FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
bool NeedsWinCFI = needsWinCFI(MF);
@@ -2142,32 +2193,33 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// Rationale: This sequence saves uop updates compared to a sequence of
// pre-increment spills like stp xi,xj,[sp,#-16]!
// Note: Similar rationale and sequence for restores in epilog.
- unsigned Size, Align;
+ unsigned Size;
+ Align Alignment;
switch (RPI.Type) {
case RegPairInfo::GPR:
StrOpc = RPI.isPaired() ? AArch64::STPXi : AArch64::STRXui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR64:
StrOpc = RPI.isPaired() ? AArch64::STPDi : AArch64::STRDui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR128:
StrOpc = RPI.isPaired() ? AArch64::STPQi : AArch64::STRQui;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::ZPR:
StrOpc = AArch64::STR_ZXI;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::PPR:
StrOpc = AArch64::STR_PXI;
Size = 2;
- Align = 2;
+ Alignment = Align(2);
break;
}
LLVM_DEBUG(dbgs() << "CSR spill: (" << printReg(Reg1, TRI);
@@ -2196,7 +2248,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
MIB.addReg(Reg2, getPrologueDeath(MF, Reg2));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
- MachineMemOperand::MOStore, Size, Align));
+ MachineMemOperand::MOStore, Size, Alignment));
}
MIB.addReg(Reg1, getPrologueDeath(MF, Reg1))
.addReg(AArch64::SP)
@@ -2204,8 +2256,8 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
// where factor*scale is implicit
.setMIFlag(MachineInstr::FrameSetup);
MIB.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo::getFixedStack(MF,FrameIdxReg1),
- MachineMemOperand::MOStore, Size, Align));
+ MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
+ MachineMemOperand::MOStore, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameSetup);
@@ -2220,8 +2272,7 @@ bool AArch64FrameLowering::spillCalleeSavedRegisters(
bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
DebugLoc DL;
@@ -2248,32 +2299,33 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
// ldp x22, x21, [sp, #0] // addImm(+0)
// Note: see comment in spillCalleeSavedRegisters()
unsigned LdrOpc;
- unsigned Size, Align;
+ unsigned Size;
+ Align Alignment;
switch (RPI.Type) {
case RegPairInfo::GPR:
LdrOpc = RPI.isPaired() ? AArch64::LDPXi : AArch64::LDRXui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR64:
LdrOpc = RPI.isPaired() ? AArch64::LDPDi : AArch64::LDRDui;
Size = 8;
- Align = 8;
+ Alignment = Align(8);
break;
case RegPairInfo::FPR128:
LdrOpc = RPI.isPaired() ? AArch64::LDPQi : AArch64::LDRQui;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::ZPR:
LdrOpc = AArch64::LDR_ZXI;
Size = 16;
- Align = 16;
+ Alignment = Align(16);
break;
case RegPairInfo::PPR:
LdrOpc = AArch64::LDR_PXI;
Size = 2;
- Align = 2;
+ Alignment = Align(2);
break;
}
LLVM_DEBUG(dbgs() << "CSR restore: (" << printReg(Reg1, TRI);
@@ -2296,7 +2348,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
MIB.addReg(Reg2, getDefRegState(true));
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg2),
- MachineMemOperand::MOLoad, Size, Align));
+ MachineMemOperand::MOLoad, Size, Alignment));
}
MIB.addReg(Reg1, getDefRegState(true))
.addReg(AArch64::SP)
@@ -2305,7 +2357,7 @@ bool AArch64FrameLowering::restoreCalleeSavedRegisters(
.setMIFlag(MachineInstr::FrameDestroy);
MIB.addMemOperand(MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdxReg1),
- MachineMemOperand::MOLoad, Size, Align));
+ MachineMemOperand::MOLoad, Size, Alignment));
if (NeedsWinCFI)
InsertSEH(MIB, TII, MachineInstr::FrameDestroy);
};
@@ -2348,6 +2400,7 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
const AArch64RegisterInfo *RegInfo = static_cast<const AArch64RegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
+ const AArch64Subtarget &Subtarget = MF.getSubtarget<AArch64Subtarget>();
AArch64FunctionInfo *AFI = MF.getInfo<AArch64FunctionInfo>();
unsigned UnspilledCSGPR = AArch64::NoRegister;
unsigned UnspilledCSGPRPaired = AArch64::NoRegister;
@@ -2396,6 +2449,16 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
}
}
+ if (MF.getFunction().getCallingConv() == CallingConv::Win64 &&
+ !Subtarget.isTargetWindows()) {
+ // For Windows calling convention on a non-windows OS, where X18 is treated
+ // as reserved, back up X18 when entering non-windows code (marked with the
+ // Windows calling convention) and restore when returning regardless of
+ // whether the individual function uses it - it might call other functions
+ // that clobber it.
+ SavedRegs.set(AArch64::X18);
+ }
+
// Calculates the callee saved stack size.
unsigned CSStackSize = 0;
unsigned SVECSStackSize = 0;
@@ -2467,8 +2530,8 @@ void AArch64FrameLowering::determineCalleeSaves(MachineFunction &MF,
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass &RC = AArch64::GPR64RegClass;
unsigned Size = TRI->getSpillSize(RC);
- unsigned Align = TRI->getSpillAlignment(RC);
- int FI = MFI.CreateStackObject(Size, Align, false);
+ Align Alignment = TRI->getSpillAlign(RC);
+ int FI = MFI.CreateStackObject(Size, Alignment, false);
RS->addScavengingFrameIndex(FI);
LLVM_DEBUG(dbgs() << "No available CS registers, allocated fi#" << FI
<< " as the emergency spill slot.\n");
@@ -2549,12 +2612,12 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
// Then process all callee saved slots.
if (getSVECalleeSaveSlotRange(MFI, MinCSFrameIndex, MaxCSFrameIndex)) {
// Make sure to align the last callee save slot.
- MFI.setObjectAlignment(MaxCSFrameIndex, 16U);
+ MFI.setObjectAlignment(MaxCSFrameIndex, Align(16));
// Assign offsets to the callee save slots.
for (int I = MinCSFrameIndex; I <= MaxCSFrameIndex; ++I) {
Offset += MFI.getObjectSize(I);
- Offset = alignTo(Offset, MFI.getObjectAlignment(I));
+ Offset = alignTo(Offset, MFI.getObjectAlign(I));
if (AssignOffsets)
Assign(I, -Offset);
}
@@ -2576,15 +2639,15 @@ static int64_t determineSVEStackObjectOffsets(MachineFrameInfo &MFI,
// Allocate all SVE locals and spills
for (unsigned FI : ObjectsToAllocate) {
- unsigned Align = MFI.getObjectAlignment(FI);
+ Align Alignment = MFI.getObjectAlign(FI);
// FIXME: Given that the length of SVE vectors is not necessarily a power of
// two, we'd need to align every object dynamically at runtime if the
// alignment is larger than 16. This is not yet supported.
- if (Align > 16)
+ if (Alignment > Align(16))
report_fatal_error(
"Alignment of scalable vectors > 16 bytes is not yet supported");
- Offset = alignTo(Offset + MFI.getObjectSize(FI), Align);
+ Offset = alignTo(Offset + MFI.getObjectSize(FI), Alignment);
if (AssignOffsets)
Assign(FI, -Offset);
}
@@ -2632,9 +2695,14 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
++MBBI;
// Create an UnwindHelp object.
- int UnwindHelpFI =
- MFI.CreateStackObject(/*size*/8, /*alignment*/16, false);
+ // The UnwindHelp object is allocated at the start of the fixed object area
+ int64_t FixedObject =
+ getFixedObjectSize(MF, AFI, /*IsWin64*/ true, /*IsFunclet*/ false);
+ int UnwindHelpFI = MFI.CreateFixedObject(/*Size*/ 8,
+ /*SPOffset*/ -FixedObject,
+ /*IsImmutable=*/false);
EHInfo.UnwindHelpFrameIdx = UnwindHelpFI;
+
// We need to store -2 into the UnwindHelp object at the start of the
// function.
DebugLoc DL;
@@ -2649,17 +2717,411 @@ void AArch64FrameLowering::processFunctionBeforeFrameFinalized(
.addImm(0);
}
-/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP before
-/// the update. This is easily retrieved as it is exactly the offset that is set
-/// in processFunctionBeforeFrameFinalized.
+namespace {
+struct TagStoreInstr {
+ MachineInstr *MI;
+ int64_t Offset, Size;
+ explicit TagStoreInstr(MachineInstr *MI, int64_t Offset, int64_t Size)
+ : MI(MI), Offset(Offset), Size(Size) {}
+};
+
+class TagStoreEdit {
+ MachineFunction *MF;
+ MachineBasicBlock *MBB;
+ MachineRegisterInfo *MRI;
+ // Tag store instructions that are being replaced.
+ SmallVector<TagStoreInstr, 8> TagStores;
+ // Combined memref arguments of the above instructions.
+ SmallVector<MachineMemOperand *, 8> CombinedMemRefs;
+
+ // Replace allocation tags in [FrameReg + FrameRegOffset, FrameReg +
+ // FrameRegOffset + Size) with the address tag of SP.
+ Register FrameReg;
+ StackOffset FrameRegOffset;
+ int64_t Size;
+ // If not None, move FrameReg to (FrameReg + FrameRegUpdate) at the end.
+ Optional<int64_t> FrameRegUpdate;
+ // MIFlags for any FrameReg updating instructions.
+ unsigned FrameRegUpdateFlags;
+
+ // Use zeroing instruction variants.
+ bool ZeroData;
+ DebugLoc DL;
+
+ void emitUnrolled(MachineBasicBlock::iterator InsertI);
+ void emitLoop(MachineBasicBlock::iterator InsertI);
+
+public:
+ TagStoreEdit(MachineBasicBlock *MBB, bool ZeroData)
+ : MBB(MBB), ZeroData(ZeroData) {
+ MF = MBB->getParent();
+ MRI = &MF->getRegInfo();
+ }
+ // Add an instruction to be replaced. Instructions must be added in
+ // ascending order of Offset and must be adjacent to each other.
+ void addInstruction(TagStoreInstr I) {
+ assert((TagStores.empty() ||
+ TagStores.back().Offset + TagStores.back().Size == I.Offset) &&
+ "Non-adjacent tag store instructions.");
+ TagStores.push_back(I);
+ }
+ void clear() { TagStores.clear(); }
+ // Emit equivalent code at the given location, and erase the current set of
+ // instructions. The replacement may be skipped when it is not profitable.
+ // May invalidate the input iterator and replace it with a valid one.
+ void emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast);
+};
+
+void TagStoreEdit::emitUnrolled(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ const int64_t kMinOffset = -256 * 16;
+ const int64_t kMaxOffset = 255 * 16;
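+ // STG and ST2G take a signed 9-bit immediate scaled by 16 bytes, so the
+ // window directly addressable from BaseReg is [-4096, 4080]; offsets outside
+ // that window are rebased onto a scratch register below.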
+
+ Register BaseReg = FrameReg;
+ int64_t BaseRegOffsetBytes = FrameRegOffset.getBytes();
+ if (BaseRegOffsetBytes < kMinOffset ||
+ BaseRegOffsetBytes + (Size - Size % 32) > kMaxOffset) {
+ Register ScratchReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ emitFrameOffset(*MBB, InsertI, DL, ScratchReg, BaseReg,
+ {BaseRegOffsetBytes, MVT::i8}, TII);
+ BaseReg = ScratchReg;
+ BaseRegOffsetBytes = 0;
+ }
+
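+ // Worked example: with Size = 48 and BaseRegOffsetBytes = 32, the loop below
+ // emits ST2G (or STZ2G) at [BaseReg, #2] covering bytes 32..63, then STG (or
+ // STZG) at [BaseReg, #4] covering bytes 64..79; the immediates are in units
+ // of 16-byte granules.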
+ MachineInstr *LastI = nullptr;
+ while (Size) {
+ int64_t InstrSize = (Size > 16) ? 32 : 16;
+ unsigned Opcode =
+ InstrSize == 16
+ ? (ZeroData ? AArch64::STZGOffset : AArch64::STGOffset)
+ : (ZeroData ? AArch64::STZ2GOffset : AArch64::ST2GOffset);
+ MachineInstr *I = BuildMI(*MBB, InsertI, DL, TII->get(Opcode))
+ .addReg(AArch64::SP)
+ .addReg(BaseReg)
+ .addImm(BaseRegOffsetBytes / 16)
+ .setMemRefs(CombinedMemRefs);
+ // A store to [BaseReg, #0] should go last for an opportunity to fold the
+ // final SP adjustment in the epilogue.
+ if (BaseRegOffsetBytes == 0)
+ LastI = I;
+ BaseRegOffsetBytes += InstrSize;
+ Size -= InstrSize;
+ }
+
+ if (LastI)
+ MBB->splice(InsertI, MBB, LastI);
+}
+
+void TagStoreEdit::emitLoop(MachineBasicBlock::iterator InsertI) {
+ const AArch64InstrInfo *TII =
+ MF->getSubtarget<AArch64Subtarget>().getInstrInfo();
+
+ Register BaseReg = FrameRegUpdate
+ ? FrameReg
+ : MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+ Register SizeReg = MRI->createVirtualRegister(&AArch64::GPR64RegClass);
+
+ emitFrameOffset(*MBB, InsertI, DL, BaseReg, FrameReg, FrameRegOffset, TII);
+
+ int64_t LoopSize = Size;
+ // If the loop size is not a multiple of 32, split off one 16-byte store at
+ // the end to fold the BaseReg update into.
+ if (FrameRegUpdate && *FrameRegUpdate)
+ LoopSize -= LoopSize % 32;
+ MachineInstr *LoopI = BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGloop_wback
+ : AArch64::STGloop_wback))
+ .addDef(SizeReg)
+ .addDef(BaseReg)
+ .addImm(LoopSize)
+ .addReg(BaseReg)
+ .setMemRefs(CombinedMemRefs);
+ if (FrameRegUpdate)
+ LoopI->setFlags(FrameRegUpdateFlags);
+
+ int64_t ExtraBaseRegUpdate =
+ FrameRegUpdate ? (*FrameRegUpdate - FrameRegOffset.getBytes() - Size) : 0;
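+ // E.g. tagging 272 bytes while folding a pending SP bump of 288 (with
+ // FrameRegOffset == 0): LoopSize becomes 256, the STGloop above tags the
+ // first 256 bytes and advances BaseReg by 256, and the trailing STGPostIndex
+ // below tags the last 16 bytes while advancing BaseReg by another
+ // 16 + ExtraBaseRegUpdate = 32, for the full 288-byte update.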
+ if (LoopSize < Size) {
+ assert(FrameRegUpdate);
+ assert(Size - LoopSize == 16);
+ // Tag 16 more bytes at BaseReg and update BaseReg.
+ BuildMI(*MBB, InsertI, DL,
+ TII->get(ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addReg(BaseReg)
+ .addImm(1 + ExtraBaseRegUpdate / 16)
+ .setMemRefs(CombinedMemRefs)
+ .setMIFlags(FrameRegUpdateFlags);
+ } else if (ExtraBaseRegUpdate) {
+ // Update BaseReg.
+ BuildMI(
+ *MBB, InsertI, DL,
+ TII->get(ExtraBaseRegUpdate > 0 ? AArch64::ADDXri : AArch64::SUBXri))
+ .addDef(BaseReg)
+ .addReg(BaseReg)
+ .addImm(std::abs(ExtraBaseRegUpdate))
+ .addImm(0)
+ .setMIFlags(FrameRegUpdateFlags);
+ }
+}
+
+// Check if *II is a register update that can be merged into the STGloop that
+// ends at (Reg + Size). *TotalOffset is set to the full offset of the update
+// instruction; the adjustment still required after the end of the loop,
+// *TotalOffset - Size, must be a multiple of 16 that fits an unshifted
+// ADDXri/SUBXri immediate.
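+// For example, if the loop ends at SP + 512 and the next instruction is
+// "add sp, sp, #512", then Offset == Size == 512, the post-loop adjustment is
+// zero, and the whole SP bump can be folded into the loop (*TotalOffset is set
+// to 512).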
+bool canMergeRegUpdate(MachineBasicBlock::iterator II, unsigned Reg,
+ int64_t Size, int64_t *TotalOffset) {
+ MachineInstr &MI = *II;
+ if ((MI.getOpcode() == AArch64::ADDXri ||
+ MI.getOpcode() == AArch64::SUBXri) &&
+ MI.getOperand(0).getReg() == Reg && MI.getOperand(1).getReg() == Reg) {
+ unsigned Shift = AArch64_AM::getShiftValue(MI.getOperand(3).getImm());
+ int64_t Offset = MI.getOperand(2).getImm() << Shift;
+ if (MI.getOpcode() == AArch64::SUBXri)
+ Offset = -Offset;
+ int64_t AbsPostOffset = std::abs(Offset - Size);
+ const int64_t kMaxOffset =
+ 0xFFF; // Max encoding for unshifted ADDXri / SUBXri
+ if (AbsPostOffset <= kMaxOffset && AbsPostOffset % 16 == 0) {
+ *TotalOffset = Offset;
+ return true;
+ }
+ }
+ return false;
+}
+
+void mergeMemRefs(const SmallVectorImpl<TagStoreInstr> &TSE,
+ SmallVectorImpl<MachineMemOperand *> &MemRefs) {
+ MemRefs.clear();
+ for (auto &TS : TSE) {
+ MachineInstr *MI = TS.MI;
+ // An instruction without memory operands may access anything. Be
+ // conservative and return an empty list.
+ if (MI->memoperands_empty()) {
+ MemRefs.clear();
+ return;
+ }
+ MemRefs.append(MI->memoperands_begin(), MI->memoperands_end());
+ }
+}
+
+void TagStoreEdit::emitCode(MachineBasicBlock::iterator &InsertI,
+ const AArch64FrameLowering *TFI, bool IsLast) {
+ if (TagStores.empty())
+ return;
+ TagStoreInstr &FirstTagStore = TagStores[0];
+ TagStoreInstr &LastTagStore = TagStores[TagStores.size() - 1];
+ Size = LastTagStore.Offset - FirstTagStore.Offset + LastTagStore.Size;
+ DL = TagStores[0].MI->getDebugLoc();
+
+ Register Reg;
+ FrameRegOffset = TFI->resolveFrameOffsetReference(
+ *MF, FirstTagStore.Offset, false /*isFixed*/, false /*isSVE*/, Reg,
+ /*PreferFP=*/false, /*ForSimm=*/true);
+ FrameReg = Reg;
+ FrameRegUpdate = None;
+
+ mergeMemRefs(TagStores, CombinedMemRefs);
+
+ LLVM_DEBUG(dbgs() << "Replacing adjacent STG instructions:\n";
+ for (const auto &Instr
+ : TagStores) { dbgs() << " " << *Instr.MI; });
+
+ // Size threshold where a loop becomes shorter than a linear sequence of
+ // tagging instructions.
+ const int kSetTagLoopThreshold = 176;
+ if (Size < kSetTagLoopThreshold) {
+ if (TagStores.size() < 2)
+ return;
+ emitUnrolled(InsertI);
+ } else {
+ MachineInstr *UpdateInstr = nullptr;
+ int64_t TotalOffset;
+ if (IsLast) {
+ // See if we can merge the base register update into the STGloop.
+ // This is done in AArch64LoadStoreOptimizer for "normal" stores, but
+ // STGloop is too unusual for that pass, it only realistically occurs in
+ // the function epilogue, and STGloop is expanded before that pass runs
+ // anyway.
+ if (InsertI != MBB->end() &&
+ canMergeRegUpdate(InsertI, FrameReg, FrameRegOffset.getBytes() + Size,
+ &TotalOffset)) {
+ UpdateInstr = &*InsertI++;
+ LLVM_DEBUG(dbgs() << "Folding SP update into loop:\n "
+ << *UpdateInstr);
+ }
+ }
+
+ if (!UpdateInstr && TagStores.size() < 2)
+ return;
+
+ if (UpdateInstr) {
+ FrameRegUpdate = TotalOffset;
+ FrameRegUpdateFlags = UpdateInstr->getFlags();
+ }
+ emitLoop(InsertI);
+ if (UpdateInstr)
+ UpdateInstr->eraseFromParent();
+ }
+
+ for (auto &TS : TagStores)
+ TS.MI->eraseFromParent();
+}
+
+bool isMergeableStackTaggingInstruction(MachineInstr &MI, int64_t &Offset,
+ int64_t &Size, bool &ZeroData) {
+ MachineFunction &MF = *MI.getParent()->getParent();
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ unsigned Opcode = MI.getOpcode();
+ ZeroData = (Opcode == AArch64::STZGloop || Opcode == AArch64::STZGOffset ||
+ Opcode == AArch64::STZ2GOffset);
+
+ if (Opcode == AArch64::STGloop || Opcode == AArch64::STZGloop) {
+ if (!MI.getOperand(0).isDead() || !MI.getOperand(1).isDead())
+ return false;
+ if (!MI.getOperand(2).isImm() || !MI.getOperand(3).isFI())
+ return false;
+ Offset = MFI.getObjectOffset(MI.getOperand(3).getIndex());
+ Size = MI.getOperand(2).getImm();
+ return true;
+ }
+
+ if (Opcode == AArch64::STGOffset || Opcode == AArch64::STZGOffset)
+ Size = 16;
+ else if (Opcode == AArch64::ST2GOffset || Opcode == AArch64::STZ2GOffset)
+ Size = 32;
+ else
+ return false;
+
+ if (MI.getOperand(0).getReg() != AArch64::SP || !MI.getOperand(1).isFI())
+ return false;
+
+ Offset = MFI.getObjectOffset(MI.getOperand(1).getIndex()) +
+ 16 * MI.getOperand(2).getImm();
+ return true;
+}
+
+// Detect a run of memory tagging instructions for adjacent stack frame slots,
+// and replace them with a shorter instruction sequence:
+// * replace STG + STG with ST2G
+// * replace STGloop + STGloop with STGloop
+// This code needs to run when stack slot offsets are already known, but before
+// FrameIndex operands in STG instructions are eliminated.
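+// For example, two STG stores to adjacent 16-byte slots collapse into a single
+// ST2G, while a longer adjacent run (>= 176 bytes, see kSetTagLoopThreshold)
+// collapses into one STGloop covering the whole range.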
+MachineBasicBlock::iterator tryMergeAdjacentSTG(MachineBasicBlock::iterator II,
+ const AArch64FrameLowering *TFI,
+ RegScavenger *RS) {
+ bool FirstZeroData;
+ int64_t Size, Offset;
+ MachineInstr &MI = *II;
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineBasicBlock::iterator NextI = ++II;
+ if (&MI == &MBB->instr_back())
+ return II;
+ if (!isMergeableStackTaggingInstruction(MI, Offset, Size, FirstZeroData))
+ return II;
+
+ SmallVector<TagStoreInstr, 4> Instrs;
+ Instrs.emplace_back(&MI, Offset, Size);
+
+ constexpr int kScanLimit = 10;
+ int Count = 0;
+ for (MachineBasicBlock::iterator E = MBB->end();
+ NextI != E && Count < kScanLimit; ++NextI) {
+ MachineInstr &MI = *NextI;
+ bool ZeroData;
+ int64_t Size, Offset;
+ // Collect instructions that update memory tags with a FrameIndex operand
+ // and (when applicable) constant size, and whose output registers are dead
+ // (the latter is almost always the case in practice). Since these
+ // instructions effectively have no inputs or outputs, we are free to skip
+ // any non-aliasing instructions in between without tracking used registers.
+ if (isMergeableStackTaggingInstruction(MI, Offset, Size, ZeroData)) {
+ if (ZeroData != FirstZeroData)
+ break;
+ Instrs.emplace_back(&MI, Offset, Size);
+ continue;
+ }
+
+ // Only count non-transient, non-tagging instructions toward the scan
+ // limit.
+ if (!MI.isTransient())
+ ++Count;
+
+ // Just in case, stop before the epilogue code starts.
+ if (MI.getFlag(MachineInstr::FrameSetup) ||
+ MI.getFlag(MachineInstr::FrameDestroy))
+ break;
+
+ // Reject anything that may alias the collected instructions.
+ if (MI.mayLoadOrStore() || MI.hasUnmodeledSideEffects())
+ break;
+ }
+
+ // New code will be inserted after the last tagging instruction we've found.
+ MachineBasicBlock::iterator InsertI = Instrs.back().MI;
+ InsertI++;
+
+ llvm::stable_sort(Instrs,
+ [](const TagStoreInstr &Left, const TagStoreInstr &Right) {
+ return Left.Offset < Right.Offset;
+ });
+
+ // Make sure that we don't have any overlapping stores.
+ int64_t CurOffset = Instrs[0].Offset;
+ for (auto &Instr : Instrs) {
+ if (CurOffset > Instr.Offset)
+ return NextI;
+ CurOffset = Instr.Offset + Instr.Size;
+ }
+
+ // Find contiguous runs of tagged memory and emit shorter instruction
+ // sequences for them when possible.
+ TagStoreEdit TSE(MBB, FirstZeroData);
+ Optional<int64_t> EndOffset;
+ for (auto &Instr : Instrs) {
+ if (EndOffset && *EndOffset != Instr.Offset) {
+ // Found a gap.
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ false);
+ TSE.clear();
+ }
+
+ TSE.addInstruction(Instr);
+ EndOffset = Instr.Offset + Instr.Size;
+ }
+
+ TSE.emitCode(InsertI, TFI, /*IsLast = */ true);
+
+ return InsertI;
+}
+} // namespace
+
+void AArch64FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS = nullptr) const {
+ if (StackTaggingMergeSetTag)
+ for (auto &BB : MF)
+ for (MachineBasicBlock::iterator II = BB.begin(); II != BB.end();)
+ II = tryMergeAdjacentSTG(II, this, RS);
+}
+
+/// For Win64 AArch64 EH, the offset to the Unwind object is from the SP
+/// before the update. This is easily retrieved as it is exactly the offset
+/// that is set in processFunctionBeforeFrameFinalized.
int AArch64FrameLowering::getFrameIndexReferencePreferSP(
- const MachineFunction &MF, int FI, unsigned &FrameReg,
+ const MachineFunction &MF, int FI, Register &FrameReg,
bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
- << MFI.getObjectOffset(FI) << "\n");
- FrameReg = AArch64::SP;
- return MFI.getObjectOffset(FI);
+ if (IgnoreSPUpdates) {
+ LLVM_DEBUG(dbgs() << "Offset from the SP for " << FI << " is "
+ << MFI.getObjectOffset(FI) << "\n");
+ FrameReg = AArch64::SP;
+ return MFI.getObjectOffset(FI);
+ }
+
+ return getFrameIndexReference(MF, FI, FrameReg);
}
/// The parent frame offset (aka dispFrame) is only used on X86_64 to retrieve
@@ -2678,5 +3140,5 @@ unsigned AArch64FrameLowering::getWinEHFuncletFrameSize(
MF.getInfo<AArch64FunctionInfo>()->getCalleeSavedStackSize();
// This is the amount of stack a funclet needs to allocate.
return alignTo(CSSize + MF.getFrameInfo().getMaxCallFrameSize(),
- getStackAlignment());
+ getStackAlign());
}
diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
index b5719feb6b15..9d0a6d9eaf25 100644
--- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h
@@ -24,8 +24,9 @@ public:
: TargetFrameLowering(StackGrowsDown, Align(16), 0, Align(16),
true /*StackRealignable*/) {}
- void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI) const;
+ void
+ emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
@@ -39,23 +40,24 @@ public:
bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
StackOffset resolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg, bool PreferFP,
+ Register &FrameReg, bool PreferFP,
bool ForSimm) const;
StackOffset resolveFrameOffsetReference(const MachineFunction &MF,
int64_t ObjectOffset, bool isFixed,
- bool isSVE, unsigned &FrameReg,
+ bool isSVE, Register &FrameReg,
bool PreferFP, bool ForSimm) const;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
/// Can this function use the red zone for local allocations.
bool canUseRedZone(const MachineFunction &MF) const;
@@ -77,12 +79,16 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
unsigned getWinEHParentFrameOffset(const MachineFunction &MF) const override;
unsigned getWinEHFuncletFrameSize(const MachineFunction &MF) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg,
+ Register &FrameReg,
bool IgnoreSPUpdates) const override;
int getNonLocalFrameIndexReference(const MachineFunction &MF,
int FI) const override;
@@ -107,6 +113,8 @@ private:
int64_t assignSVEStackObjectOffsets(MachineFrameInfo &MF,
int &MinCSFrameIndex,
int &MaxCSFrameIndex) const;
+ bool shouldCombineCSRLocalStackBumpInEpilogue(MachineBasicBlock &MBB,
+ unsigned StackBumpBytes) const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index a51aa85a931c..10c477853353 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -62,6 +62,9 @@ public:
unsigned ConstraintID,
std::vector<SDValue> &OutOps) override;
+ template <signed Low, signed High, signed Scale>
+ bool SelectRDVLImm(SDValue N, SDValue &Imm);
+
bool tryMLAV64LaneV128(SDNode *N);
bool tryMULLV64LaneV128(unsigned IntNo, SDNode *N);
bool SelectArithExtendedRegister(SDValue N, SDValue &Reg, SDValue &Shift);
@@ -159,6 +162,24 @@ public:
return false;
}
+ bool SelectDupZero(SDValue N) {
+ switch (N->getOpcode()) {
+ case AArch64ISD::DUP:
+ case ISD::SPLAT_VECTOR: {
+ auto Opnd0 = N->getOperand(0);
+ if (auto CN = dyn_cast<ConstantSDNode>(Opnd0))
+ if (CN->isNullValue())
+ return true;
+ if (auto CN = dyn_cast<ConstantFPSDNode>(Opnd0))
+ if (CN->isZero())
+ return true;
+ break;
+ }
+ }
+
+ return false;
+ }
+
template<MVT::SimpleValueType VT>
bool SelectSVEAddSubImm(SDValue N, SDValue &Imm, SDValue &Shift) {
return SelectSVEAddSubImm(N, VT, Imm, Shift);
@@ -169,6 +190,11 @@ public:
return SelectSVELogicalImm(N, VT, Imm);
}
+ template <unsigned Low, unsigned High>
+ bool SelectSVEShiftImm64(SDValue N, SDValue &Imm) {
+ return SelectSVEShiftImm64(N, Low, High, Imm);
+ }
+
// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
template<signed Min, signed Max, signed Scale, bool Shift>
bool SelectCntImm(SDValue N, SDValue &Imm) {
@@ -197,6 +223,9 @@ public:
/// unchanged; otherwise a REG_SEQUENCE value is returned.
SDValue createDTuple(ArrayRef<SDValue> Vecs);
SDValue createQTuple(ArrayRef<SDValue> Vecs);
+ // Form a sequence of SVE registers for instructions that use a list of
+ // vectors, e.g. structured loads and stores (ldN, stN).
+ SDValue createZTuple(ArrayRef<SDValue> Vecs);
/// Generic helper for the createDTuple/createQTuple
/// functions. Those should almost always be called instead.
@@ -216,11 +245,31 @@ public:
unsigned SubRegIdx);
void SelectLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostLoadLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ void SelectPredicatedLoad(SDNode *N, unsigned NumVecs, const unsigned Opc);
+
+ bool SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base, SDValue &OffImm);
+ /// SVE Reg+Imm addressing mode.
+ template <int64_t Min, int64_t Max>
+ bool SelectAddrModeIndexedSVE(SDNode *Root, SDValue N, SDValue &Base,
+ SDValue &OffImm);
+ /// SVE Reg+Reg address mode.
+ template <unsigned Scale>
+ bool SelectSVERegRegAddrMode(SDValue N, SDValue &Base, SDValue &Offset) {
+ return SelectSVERegRegAddrMode(N, Scale, Base, Offset);
+ }
void SelectStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStore(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
void SelectPostStoreLane(SDNode *N, unsigned NumVecs, unsigned Opc);
+ template <unsigned Scale>
+ void SelectPredicatedStore(SDNode *N, unsigned NumVecs, const unsigned Opc_rr,
+ const unsigned Opc_ri);
+ template <unsigned Scale>
+ std::tuple<unsigned, SDValue, SDValue>
+ findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
+ const unsigned Opc_ri, const SDValue &OldBase,
+ const SDValue &OldOffset);
bool tryBitfieldExtractOp(SDNode *N);
bool tryBitfieldExtractOpFromSExt(SDNode *N);
@@ -268,13 +317,19 @@ private:
bool SelectCMP_SWAP(SDNode *N);
+ bool SelectSVE8BitLslImm(SDValue N, SDValue &Imm, SDValue &Shift);
+
bool SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift);
bool SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm);
bool SelectSVESignedArithImm(SDValue N, SDValue &Imm);
+ bool SelectSVEShiftImm64(SDValue N, uint64_t Low, uint64_t High,
+ SDValue &Imm);
bool SelectSVEArithImm(SDValue N, SDValue &Imm);
+ bool SelectSVERegRegAddrMode(SDValue N, unsigned Scale, SDValue &Base,
+ SDValue &Offset);
};
} // end anonymous namespace
@@ -679,6 +734,23 @@ static SDValue narrowIfNeeded(SelectionDAG *CurDAG, SDValue N) {
return SDValue(Node, 0);
}
+// Returns a suitable CNT/INC/DEC/RDVL multiplier to calculate VSCALE*N.
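+// For example, with Scale = 16 (RDVL counts multiples of the 16-byte minimum
+// vector length) a request for VSCALE * 80 becomes the multiplier 5, provided
+// 5 lies within [Low, High].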
+template<signed Low, signed High, signed Scale>
+bool AArch64DAGToDAGISel::SelectRDVLImm(SDValue N, SDValue &Imm) {
+ if (!isa<ConstantSDNode>(N))
+ return false;
+
+ int64_t MulImm = cast<ConstantSDNode>(N)->getSExtValue();
+ if ((MulImm % std::abs(Scale)) == 0) {
+ int64_t RDVLImm = MulImm / Scale;
+ if ((RDVLImm >= Low) && (RDVLImm <= High)) {
+ Imm = CurDAG->getTargetConstant(RDVLImm, SDLoc(N), MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
+}
/// SelectArithExtendedRegister - Select a "extended register" operand. This
/// operand folds in an extend followed by an optional left shift.
@@ -832,16 +904,9 @@ bool AArch64DAGToDAGISel::SelectAddrModeIndexed(SDValue N, unsigned Size,
if (!GAN)
return true;
- if (GAN->getOffset() % Size == 0) {
- const GlobalValue *GV = GAN->getGlobal();
- unsigned Alignment = GV->getAlignment();
- Type *Ty = GV->getValueType();
- if (Alignment == 0 && Ty->isSized())
- Alignment = DL.getABITypeAlignment(Ty);
-
- if (Alignment >= Size)
- return true;
- }
+ if (GAN->getOffset() % Size == 0 &&
+ GAN->getGlobal()->getPointerAlignment(DL) >= Size)
+ return true;
}
if (CurDAG->isBaseWithConstantOffset(N)) {
@@ -1132,6 +1197,16 @@ SDValue AArch64DAGToDAGISel::createQTuple(ArrayRef<SDValue> Regs) {
return createTuple(Regs, RegClassIDs, SubRegs);
}
+SDValue AArch64DAGToDAGISel::createZTuple(ArrayRef<SDValue> Regs) {
+ static const unsigned RegClassIDs[] = {AArch64::ZPR2RegClassID,
+ AArch64::ZPR3RegClassID,
+ AArch64::ZPR4RegClassID};
+ static const unsigned SubRegs[] = {AArch64::zsub0, AArch64::zsub1,
+ AArch64::zsub2, AArch64::zsub3};
+
+ return createTuple(Regs, RegClassIDs, SubRegs);
+}
+
SDValue AArch64DAGToDAGISel::createTuple(ArrayRef<SDValue> Regs,
const unsigned RegClassIDs[],
const unsigned SubRegs[]) {
@@ -1240,6 +1315,8 @@ bool AArch64DAGToDAGISel::tryIndexedLoad(SDNode *N) {
}
} else if (VT == MVT::f16) {
Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
+ } else if (VT == MVT::bf16) {
+ Opcode = IsPre ? AArch64::LDRHpre : AArch64::LDRHpost;
} else if (VT == MVT::f32) {
Opcode = IsPre ? AArch64::LDRSpre : AArch64::LDRSpost;
} else if (VT == MVT::f64 || VT.is64BitVector()) {
@@ -1334,6 +1411,54 @@ void AArch64DAGToDAGISel::SelectPostLoad(SDNode *N, unsigned NumVecs,
CurDAG->RemoveDeadNode(N);
}
+/// Optimize \p OldBase and \p OldOffset by selecting the best addressing mode.
+/// Returns a tuple consisting of an Opcode, an SDValue representing the new
+/// Base and an SDValue representing the new offset.
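+/// Roughly: a base that splits into a register plus a small multiple of the
+/// vector length is selected as the reg+imm ("mul vl") form, a base that
+/// splits into two registers is selected as the scaled reg+reg form, and
+/// otherwise the original base with a zero offset is kept.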
+template <unsigned Scale>
+std::tuple<unsigned, SDValue, SDValue>
+AArch64DAGToDAGISel::findAddrModeSVELoadStore(SDNode *N, const unsigned Opc_rr,
+ const unsigned Opc_ri,
+ const SDValue &OldBase,
+ const SDValue &OldOffset) {
+ SDValue NewBase = OldBase;
+ SDValue NewOffset = OldOffset;
+ // Detect a possible Reg+Imm addressing mode.
+ const bool IsRegImm = SelectAddrModeIndexedSVE</*Min=*/-8, /*Max=*/7>(
+ N, OldBase, NewBase, NewOffset);
+
+ // Detect a possible reg+reg addressing mode, but only if we haven't already
+ // detected a Reg+Imm one.
+ const bool IsRegReg =
+ !IsRegImm && SelectSVERegRegAddrMode<Scale>(OldBase, NewBase, NewOffset);
+
+ // Select the instruction.
+ return std::make_tuple(IsRegReg ? Opc_rr : Opc_ri, NewBase, NewOffset);
+}
+
+void AArch64DAGToDAGISel::SelectPredicatedLoad(SDNode *N, unsigned NumVecs,
+ const unsigned Opc) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+ SDValue Chain = N->getOperand(0);
+
+ SDValue Ops[] = {N->getOperand(1), // Predicate
+ N->getOperand(2), // Memory operand
+ CurDAG->getTargetConstant(0, DL, MVT::i64), Chain};
+
+ const EVT ResTys[] = {MVT::Untyped, MVT::Other};
+
+ SDNode *Load = CurDAG->getMachineNode(Opc, DL, ResTys, Ops);
+ SDValue SuperReg = SDValue(Load, 0);
+ for (unsigned i = 0; i < NumVecs; ++i)
+ ReplaceUses(SDValue(N, i), CurDAG->getTargetExtractSubreg(
+ AArch64::zsub0 + i, DL, VT, SuperReg));
+
+ // Copy chain
+ unsigned ChainIdx = NumVecs;
+ ReplaceUses(SDValue(N, ChainIdx), SDValue(Load, 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
@@ -1354,6 +1479,49 @@ void AArch64DAGToDAGISel::SelectStore(SDNode *N, unsigned NumVecs,
ReplaceNode(N, St);
}
+template <unsigned Scale>
+void AArch64DAGToDAGISel::SelectPredicatedStore(SDNode *N, unsigned NumVecs,
+ const unsigned Opc_rr,
+ const unsigned Opc_ri) {
+ SDLoc dl(N);
+
+ // Form a REG_SEQUENCE to force register allocation.
+ SmallVector<SDValue, 4> Regs(N->op_begin() + 2, N->op_begin() + 2 + NumVecs);
+ SDValue RegSeq = createZTuple(Regs);
+
+ // Optimize addressing mode.
+ unsigned Opc;
+ SDValue Offset, Base;
+ std::tie(Opc, Base, Offset) = findAddrModeSVELoadStore<Scale>(
+ N, Opc_rr, Opc_ri, N->getOperand(NumVecs + 3),
+ CurDAG->getTargetConstant(0, dl, MVT::i64));
+
+ SDValue Ops[] = {RegSeq, N->getOperand(NumVecs + 2), // predicate
+ Base, // address
+ Offset, // offset
+ N->getOperand(0)}; // chain
+ SDNode *St = CurDAG->getMachineNode(Opc, dl, N->getValueType(0), Ops);
+
+ ReplaceNode(N, St);
+}
+
+bool AArch64DAGToDAGISel::SelectAddrModeFrameIndexSVE(SDValue N, SDValue &Base,
+ SDValue &OffImm) {
+ SDLoc dl(N);
+ const DataLayout &DL = CurDAG->getDataLayout();
+ const TargetLowering *TLI = getTargetLowering();
+
+ // Try to match it for the frame address
+ if (auto FINode = dyn_cast<FrameIndexSDNode>(N)) {
+ int FI = FINode->getIndex();
+ Base = CurDAG->getTargetFrameIndex(FI, TLI->getPointerTy(DL));
+ OffImm = CurDAG->getTargetConstant(0, dl, MVT::i64);
+ return true;
+ }
+
+ return false;
+}
+
void AArch64DAGToDAGISel::SelectPostStore(SDNode *N, unsigned NumVecs,
unsigned Opc) {
SDLoc dl(N);
@@ -2632,7 +2800,8 @@ bool AArch64DAGToDAGISel::tryShiftAmountMod(SDNode *N) {
// bits that are implicitly ANDed off by the above opcodes and if so, skip
// the AND.
uint64_t MaskImm;
- if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm))
+ if (!isOpcWithIntImmediate(ShiftAmt.getNode(), ISD::AND, MaskImm) &&
+ !isOpcWithIntImmediate(ShiftAmt.getNode(), AArch64ISD::ANDS, MaskImm))
return false;
if (countTrailingOnes(MaskImm) < Bits)
@@ -2879,6 +3048,32 @@ bool AArch64DAGToDAGISel::SelectCMP_SWAP(SDNode *N) {
return true;
}
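+// Select an immediate that SVE encodes as a signed 8-bit value with an
+// optional left shift by 8 (e.g. for DUP/CPY-style immediates). For example,
+// 100 is selected as (#100, LSL #0) and 4352 (= 17 * 256) as (#17, LSL #8).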
+bool AArch64DAGToDAGISel::SelectSVE8BitLslImm(SDValue N, SDValue &Base,
+ SDValue &Offset) {
+ auto C = dyn_cast<ConstantSDNode>(N);
+ if (!C)
+ return false;
+
+ auto Ty = N->getValueType(0);
+
+ int64_t Imm = C->getSExtValue();
+ SDLoc DL(N);
+
+ if ((Imm >= -128) && (Imm <= 127)) {
+ Base = CurDAG->getTargetConstant(Imm, DL, Ty);
+ Offset = CurDAG->getTargetConstant(0, DL, Ty);
+ return true;
+ }
+
+ if (((Imm % 256) == 0) && (Imm >= -32768) && (Imm <= 32512)) {
+ Base = CurDAG->getTargetConstant(Imm / 256, DL, Ty);
+ Offset = CurDAG->getTargetConstant(8, DL, Ty);
+ return true;
+ }
+
+ return false;
+}
+
bool AArch64DAGToDAGISel::SelectSVEAddSubImm(SDValue N, MVT VT, SDValue &Imm, SDValue &Shift) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
const int64_t ImmVal = CNode->getZExtValue();
@@ -2917,7 +3112,7 @@ bool AArch64DAGToDAGISel::SelectSVESignedArithImm(SDValue N, SDValue &Imm) {
if (auto CNode = dyn_cast<ConstantSDNode>(N)) {
int64_t ImmVal = CNode->getSExtValue();
SDLoc DL(N);
- if (ImmVal >= -127 && ImmVal < 127) {
+ if (ImmVal >= -128 && ImmVal < 128) {
Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
return true;
}
@@ -2975,6 +3170,24 @@ bool AArch64DAGToDAGISel::SelectSVELogicalImm(SDValue N, MVT VT, SDValue &Imm) {
return false;
}
+// This method is only needed to "cast" i64s into i32s when the value
+// is a valid shift which has been splatted into a vector with i64 elements.
+// Every other type is fine in tablegen.
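+// For example, with an assumed range of Low = 0 and High = 63, a shift amount
+// of 63 splatted across an i64-element vector is accepted and rebuilt as the
+// i32 target constant 63.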
+bool AArch64DAGToDAGISel::SelectSVEShiftImm64(SDValue N, uint64_t Low,
+ uint64_t High, SDValue &Imm) {
+ if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
+ uint64_t ImmVal = CN->getZExtValue();
+ SDLoc DL(N);
+
+ if (ImmVal >= Low && ImmVal <= High) {
+ Imm = CurDAG->getTargetConstant(ImmVal, DL, MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
+}
+
bool AArch64DAGToDAGISel::trySelectStackSlotTagP(SDNode *N) {
// tagp(FrameIndex, IRGstack, tag_offset):
// since the offset between FrameIndex and IRGstack is a compile-time
@@ -3027,6 +3240,63 @@ void AArch64DAGToDAGISel::SelectTagP(SDNode *N) {
ReplaceNode(N, N3);
}
+// NOTE: We cannot use EXTRACT_SUBREG in all cases because the fixed length
+// vector types larger than a single NEON (128-bit) register don't have a
+// matching SubRegIndex.
+static SDNode *extractSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+ assert(V.getValueType().isScalableVector() &&
+ V.getValueType().getSizeInBits().getKnownMinSize() ==
+ AArch64::SVEBitsPerBlock &&
+ "Expected to extract from a packed scalable vector!");
+ assert(VT.isFixedLengthVector() &&
+ "Expected to extract a fixed length vector!");
+
+ SDLoc DL(V);
+ switch (VT.getSizeInBits()) {
+ case 64: {
+ auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+ return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+ }
+ case 128: {
+ auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+ return DAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, VT, V, SubReg);
+ }
+ default: {
+ auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+ return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+ }
+ }
+}
+
+// NOTE: We cannot use INSERT_SUBREG in all cases because the fixed length
+// vector types larger than a single NEON (128-bit) register don't have a
+// matching SubRegIndex.
+static SDNode *insertSubReg(SelectionDAG *DAG, EVT VT, SDValue V) {
+ assert(VT.isScalableVector() &&
+ VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock &&
+ "Expected to insert into a packed scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected to insert a fixed length vector!");
+
+ SDLoc DL(V);
+ switch (V.getValueType().getSizeInBits()) {
+ case 64: {
+ auto SubReg = DAG->getTargetConstant(AArch64::dsub, DL, MVT::i32);
+ auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+ SDValue(Container, 0), V, SubReg);
+ }
+ case 128: {
+ auto SubReg = DAG->getTargetConstant(AArch64::zsub, DL, MVT::i32);
+ auto Container = DAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, DL, VT);
+ return DAG->getMachineNode(TargetOpcode::INSERT_SUBREG, DL, VT,
+ SDValue(Container, 0), V, SubReg);
+ }
+ default: {
+ auto RC = DAG->getTargetConstant(AArch64::ZPRRegClassID, DL, MVT::i64);
+ return DAG->getMachineNode(TargetOpcode::COPY_TO_REGCLASS, DL, VT, V, RC);
+ }
+ }
+}
+
void AArch64DAGToDAGISel::Select(SDNode *Node) {
// If we have a custom node, we already have selected!
if (Node->isMachineOpcode()) {
@@ -3100,6 +3370,52 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
return;
break;
+ case ISD::EXTRACT_SUBVECTOR: {
+ // Bail when not a "cast" like extract_subvector.
+ if (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue() != 0)
+ break;
+
+ // Bail when normal isel can do the job.
+ EVT InVT = Node->getOperand(0).getValueType();
+ if (VT.isScalableVector() || InVT.isFixedLengthVector())
+ break;
+
+ // NOTE: We can only get here when doing fixed length SVE code generation.
+ // We do manual selection because the types involved are not linked to real
+ // registers (despite being legal) and must be coerced into SVE registers.
+ //
+ // NOTE: If the above changes, be aware that selection will still not work
+ // because the td definition of extract_vector does not support extracting
+ // a fixed length vector from a scalable vector.
+
+ ReplaceNode(Node, extractSubReg(CurDAG, VT, Node->getOperand(0)));
+ return;
+ }
+
+ case ISD::INSERT_SUBVECTOR: {
+ // Bail when not a "cast" like insert_subvector.
+ if (cast<ConstantSDNode>(Node->getOperand(2))->getZExtValue() != 0)
+ break;
+ if (!Node->getOperand(0).isUndef())
+ break;
+
+ // Bail when normal isel should do the job.
+ EVT InVT = Node->getOperand(1).getValueType();
+ if (VT.isFixedLengthVector() || InVT.isScalableVector())
+ break;
+
+ // NOTE: We can only get here when doing fixed length SVE code generation.
+ // We do manual selection because the types involved are not linked to real
+ // registers (despite being legal) and must be coerced into SVE registers.
+ //
+ // NOTE: If the above changes, be aware that selection will still not work
+ // because the td definition of insert_vector does not support inserting a
+ // fixed length vector into a scalable vector.
+
+ ReplaceNode(Node, insertSubReg(CurDAG, VT, Node->getOperand(1)));
+ return;
+ }
+
case ISD::Constant: {
// Materialize zero constants as copies from WZR/XZR. This allows
// the coalescer to propagate these into other instructions.
@@ -3185,10 +3501,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD1Twov16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD1Twov8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3212,10 +3528,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD1Threev16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD1Threev8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3239,10 +3555,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD1Fourv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD1Fourv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3266,10 +3582,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Twov16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Twov8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3293,10 +3609,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Threev16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Threev8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3320,10 +3636,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Fourv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Fourv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3347,10 +3663,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 2, AArch64::LD2Rv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 2, AArch64::LD2Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3374,10 +3690,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 3, AArch64::LD3Rv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 3, AArch64::LD3Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3401,10 +3717,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectLoad(Node, 4, AArch64::LD4Rv16b, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv4h, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectLoad(Node, 4, AArch64::LD4Rv8h, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3426,7 +3742,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectLoadLane(Node, 2, AArch64::LD2i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 2, AArch64::LD2i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3444,7 +3760,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectLoadLane(Node, 3, AArch64::LD3i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 3, AArch64::LD3i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3462,7 +3778,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectLoadLane(Node, 4, AArch64::LD4i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectLoadLane(Node, 4, AArch64::LD4i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3537,10 +3853,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST1Twov16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST1Twov4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST1Twov8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3565,10 +3883,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST1Threev16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST1Threev4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST1Threev8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3593,10 +3913,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST1Fourv16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST1Fourv8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3621,10 +3943,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 2, AArch64::ST2Twov16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 2, AArch64::ST2Twov4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 2, AArch64::ST2Twov8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3649,10 +3973,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 3, AArch64::ST3Threev16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 3, AArch64::ST3Threev4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 3, AArch64::ST3Threev8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3677,10 +4003,12 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectStore(Node, 4, AArch64::ST4Fourv16b);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 ||
+ VT == MVT::v4bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv4h);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 ||
+ VT == MVT::v8bf16) {
SelectStore(Node, 4, AArch64::ST4Fourv8h);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3703,7 +4031,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectStoreLane(Node, 2, AArch64::ST2i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 2, AArch64::ST2i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3722,7 +4050,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectStoreLane(Node, 3, AArch64::ST3i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 3, AArch64::ST3i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3741,7 +4069,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectStoreLane(Node, 4, AArch64::ST4i8);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectStoreLane(Node, 4, AArch64::ST4i16);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -3755,6 +4083,69 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case Intrinsic::aarch64_sve_st2: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedStore</*Scale=*/0>(Node, 2, AArch64::ST2B,
+ AArch64::ST2B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedStore</*Scale=*/1>(Node, 2, AArch64::ST2H,
+ AArch64::ST2H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedStore</*Scale=*/2>(Node, 2, AArch64::ST2W,
+ AArch64::ST2W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedStore</*Scale=*/3>(Node, 2, AArch64::ST2D,
+ AArch64::ST2D_IMM);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_st3: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedStore</*Scale=*/0>(Node, 3, AArch64::ST3B,
+ AArch64::ST3B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedStore</*Scale=*/1>(Node, 3, AArch64::ST3H,
+ AArch64::ST3H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedStore</*Scale=*/2>(Node, 3, AArch64::ST3W,
+ AArch64::ST3W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedStore</*Scale=*/3>(Node, 3, AArch64::ST3D,
+ AArch64::ST3D_IMM);
+ return;
+ }
+ break;
+ }
+ case Intrinsic::aarch64_sve_st4: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedStore</*Scale=*/0>(Node, 4, AArch64::ST4B,
+ AArch64::ST4B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedStore</*Scale=*/1>(Node, 4, AArch64::ST4H,
+ AArch64::ST4H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedStore</*Scale=*/2>(Node, 4, AArch64::ST4W,
+ AArch64::ST4W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedStore</*Scale=*/3>(Node, 4, AArch64::ST4D,
+ AArch64::ST4D_IMM);
+ return;
+ }
+ break;
+ }
}
break;
}
@@ -3765,10 +4156,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Twov16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Twov8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3793,10 +4184,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Threev16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Threev8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3821,10 +4212,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Fourv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3849,10 +4240,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD1Twov16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD1Twov8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3877,10 +4268,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD1Threev16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD1Threev8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3905,10 +4296,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD1Fourv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3933,10 +4324,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 1, AArch64::LD1Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 1, AArch64::LD1Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3961,10 +4352,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 2, AArch64::LD2Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 2, AArch64::LD2Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -3989,10 +4380,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 3, AArch64::LD3Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 3, AArch64::LD3Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4017,10 +4408,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostLoad(Node, 4, AArch64::LD4Rv16b_POST, AArch64::qsub0);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv4h_POST, AArch64::dsub0);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostLoad(Node, 4, AArch64::LD4Rv8h_POST, AArch64::qsub0);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4043,7 +4434,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 1, AArch64::LD1i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 1, AArch64::LD1i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4062,7 +4453,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 2, AArch64::LD2i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 2, AArch64::LD2i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4081,7 +4472,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 3, AArch64::LD3i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 3, AArch64::LD3i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4100,7 +4491,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostLoadLane(Node, 4, AArch64::LD4i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostLoadLane(Node, 4, AArch64::LD4i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4122,10 +4513,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST2Twov16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST2Twov8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4151,10 +4542,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST3Threev16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST3Threev8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4180,10 +4571,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST4Fourv16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST4Fourv8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4209,10 +4600,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 2, AArch64::ST1Twov16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 2, AArch64::ST1Twov8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4238,10 +4629,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 3, AArch64::ST1Threev16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 3, AArch64::ST1Threev8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4267,10 +4658,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
} else if (VT == MVT::v16i8) {
SelectPostStore(Node, 4, AArch64::ST1Fourv16b_POST);
return;
- } else if (VT == MVT::v4i16 || VT == MVT::v4f16) {
+ } else if (VT == MVT::v4i16 || VT == MVT::v4f16 || VT == MVT::v4bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv4h_POST);
return;
- } else if (VT == MVT::v8i16 || VT == MVT::v8f16) {
+ } else if (VT == MVT::v8i16 || VT == MVT::v8f16 || VT == MVT::v8bf16) {
SelectPostStore(Node, 4, AArch64::ST1Fourv8h_POST);
return;
} else if (VT == MVT::v2i32 || VT == MVT::v2f32) {
@@ -4294,7 +4685,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostStoreLane(Node, 2, AArch64::ST2i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 2, AArch64::ST2i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4314,7 +4705,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostStoreLane(Node, 3, AArch64::ST3i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 3, AArch64::ST3i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4334,7 +4725,7 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
SelectPostStoreLane(Node, 4, AArch64::ST4i8_POST);
return;
} else if (VT == MVT::v8i16 || VT == MVT::v4i16 || VT == MVT::v4f16 ||
- VT == MVT::v8f16) {
+ VT == MVT::v8f16 || VT == MVT::v4bf16 || VT == MVT::v8bf16) {
SelectPostStoreLane(Node, 4, AArch64::ST4i16_POST);
return;
} else if (VT == MVT::v4i32 || VT == MVT::v2i32 || VT == MVT::v4f32 ||
@@ -4348,6 +4739,57 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case AArch64ISD::SVE_LD2_MERGE_ZERO: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 2, AArch64::LD2D_IMM);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::SVE_LD3_MERGE_ZERO: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 3, AArch64::LD3D_IMM);
+ return;
+ }
+ break;
+ }
+ case AArch64ISD::SVE_LD4_MERGE_ZERO: {
+ if (VT == MVT::nxv16i8) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4B_IMM);
+ return;
+ } else if (VT == MVT::nxv8i16 || VT == MVT::nxv8f16 ||
+ (VT == MVT::nxv8bf16 && Subtarget->hasBF16())) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4H_IMM);
+ return;
+ } else if (VT == MVT::nxv4i32 || VT == MVT::nxv4f32) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4W_IMM);
+ return;
+ } else if (VT == MVT::nxv2i64 || VT == MVT::nxv2f64) {
+ SelectPredicatedLoad(Node, 4, AArch64::LD4D_IMM);
+ return;
+ }
+ break;
+ }
}
// Select the default instruction
@@ -4360,3 +4802,130 @@ FunctionPass *llvm::createAArch64ISelDag(AArch64TargetMachine &TM,
CodeGenOpt::Level OptLevel) {
return new AArch64DAGToDAGISel(TM, OptLevel);
}
+
+/// When \p PredVT is a scalable vector predicate in the form
+/// MVT::nx<M>xi1, it builds the corresponding scalable vector of
+/// integers MVT::nx<M>xi<bits> s.t. M x bits = 128. If the input
+/// PredVT is not in the form MVT::nx<M>xi1, it returns an invalid
+/// EVT.
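+/// For example (with M x bits = 128 as above): nxv16i1 maps to nxv16i8,
+/// nxv8i1 to nxv8i16, nxv4i1 to nxv4i32, and nxv2i1 to nxv2i64.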
+static EVT getPackedVectorTypeFromPredicateType(LLVMContext &Ctx, EVT PredVT) {
+ if (!PredVT.isScalableVector() || PredVT.getVectorElementType() != MVT::i1)
+ return EVT();
+
+ if (PredVT != MVT::nxv16i1 && PredVT != MVT::nxv8i1 &&
+ PredVT != MVT::nxv4i1 && PredVT != MVT::nxv2i1)
+ return EVT();
+
+ ElementCount EC = PredVT.getVectorElementCount();
+ EVT ScalarVT = EVT::getIntegerVT(Ctx, AArch64::SVEBitsPerBlock / EC.Min);
+ EVT MemVT = EVT::getVectorVT(Ctx, ScalarVT, EC);
+ return MemVT;
+}
+
+/// Return the EVT of the data associated with a memory operation in \p
+/// Root. If such an EVT cannot be retrieved, it returns an invalid EVT.
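+/// For example, an aarch64_sve_prf prefetch governed by an nxv4i1 predicate
+/// has its data type inferred as nxv4i32 via the helper above.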
+static EVT getMemVTFromNode(LLVMContext &Ctx, SDNode *Root) {
+ if (isa<MemSDNode>(Root))
+ return cast<MemSDNode>(Root)->getMemoryVT();
+
+ if (isa<MemIntrinsicSDNode>(Root))
+ return cast<MemIntrinsicSDNode>(Root)->getMemoryVT();
+
+ const unsigned Opcode = Root->getOpcode();
+ // For custom ISD nodes, we have to look at them individually to extract the
+ // type of the data moved to/from memory.
+ switch (Opcode) {
+ case AArch64ISD::LD1_MERGE_ZERO:
+ case AArch64ISD::LD1S_MERGE_ZERO:
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ case AArch64ISD::LDNF1S_MERGE_ZERO:
+ return cast<VTSDNode>(Root->getOperand(3))->getVT();
+ case AArch64ISD::ST1_PRED:
+ return cast<VTSDNode>(Root->getOperand(4))->getVT();
+ default:
+ break;
+ }
+
+ if (Opcode != ISD::INTRINSIC_VOID)
+ return EVT();
+
+ const unsigned IntNo =
+ cast<ConstantSDNode>(Root->getOperand(1))->getZExtValue();
+ if (IntNo != Intrinsic::aarch64_sve_prf)
+ return EVT();
+
+ // We are using an SVE prefetch intrinsic. Type must be inferred
+ // from the width of the predicate.
+ return getPackedVectorTypeFromPredicateType(
+ Ctx, Root->getOperand(2)->getValueType(0));
+}
+
+/// SelectAddrModeIndexedSVE - Attempt selection of the addressing mode:
+/// Base + OffImm * sizeof(MemVT) for Min <= OffImm <= Max
+/// where Root is the memory access using N for its address.
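+/// For example, with MemVT = nxv4i32 (16 data bytes per increment of vscale),
+/// an address of the form (add Base, (vscale 32)) selects Base with
+/// OffImm = 2, provided 2 lies within [Min, Max].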
+template <int64_t Min, int64_t Max>
+bool AArch64DAGToDAGISel::SelectAddrModeIndexedSVE(SDNode *Root, SDValue N,
+ SDValue &Base,
+ SDValue &OffImm) {
+ const EVT MemVT = getMemVTFromNode(*(CurDAG->getContext()), Root);
+
+ if (MemVT == EVT())
+ return false;
+
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ SDValue VScale = N.getOperand(1);
+ if (VScale.getOpcode() != ISD::VSCALE)
+ return false;
+
+ TypeSize TS = MemVT.getSizeInBits();
+ int64_t MemWidthBytes = static_cast<int64_t>(TS.getKnownMinSize()) / 8;
+ int64_t MulImm = cast<ConstantSDNode>(VScale.getOperand(0))->getSExtValue();
+
+ if ((MulImm % MemWidthBytes) != 0)
+ return false;
+
+ int64_t Offset = MulImm / MemWidthBytes;
+ if (Offset < Min || Offset > Max)
+ return false;
+
+ Base = N.getOperand(0);
+ OffImm = CurDAG->getTargetConstant(Offset, SDLoc(N), MVT::i64);
+ return true;
+}
+
+/// Select register plus register addressing mode for SVE, with scaled
+/// offset.
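+/// For example, with Scale == 1 (16-bit elements), an address of the form
+/// (add Base, (shl Idx, 1)) yields Base and Idx as the two registers; with
+/// Scale == 0 the RHS of the add is used as the offset unscaled.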
+bool AArch64DAGToDAGISel::SelectSVERegRegAddrMode(SDValue N, unsigned Scale,
+ SDValue &Base,
+ SDValue &Offset) {
+ if (N.getOpcode() != ISD::ADD)
+ return false;
+
+ // Process an ADD node.
+ const SDValue LHS = N.getOperand(0);
+ const SDValue RHS = N.getOperand(1);
+
+ // 8-bit data does not come with a SHL node for the offset, so it is
+ // treated separately.
+ if (Scale == 0) {
+ Base = LHS;
+ Offset = RHS;
+ return true;
+ }
+
+ // Check if the RHS is a shift node with a constant.
+ if (RHS.getOpcode() != ISD::SHL)
+ return false;
+
+ const SDValue ShiftRHS = RHS.getOperand(1);
+ if (auto *C = dyn_cast<ConstantSDNode>(ShiftRHS))
+ if (C->getZExtValue() == Scale) {
+ Base = LHS;
+ Offset = RHS.getOperand(0);
+ return true;
+ }
+
+ return false;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index d45a80057564..85db14ab66fe 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -99,11 +99,6 @@ STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumShiftInserts, "Number of vector shift inserts");
STATISTIC(NumOptimizedImms, "Number of times immediates were optimized");
-static cl::opt<bool>
-EnableAArch64SlrGeneration("aarch64-shift-insert-generation", cl::Hidden,
- cl::desc("Allow AArch64 SLI/SRI formation"),
- cl::init(false));
-
// FIXME: The necessary dtprel relocations don't seem to be supported
// well in the GNU bfd and gold linkers at the moment. Therefore, by
// default, for now, fall back to GeneralDynamic code generation.
@@ -121,6 +116,18 @@ EnableOptimizeLogicalImm("aarch64-enable-logical-imm", cl::Hidden,
/// Value type used for condition codes.
static const MVT MVT_CC = MVT::i32;
+/// Returns true if VT's elements occupy the lowest bit positions of its
+/// associated register class without any intervening space.
+///
+/// For example, nxv2f16, nxv4f16 and nxv8f16 are legal types that belong to the
+/// same register class, but only nxv8f16 can be treated as a packed vector.
+static inline bool isPackedVectorType(EVT VT, SelectionDAG &DAG) {
+ assert(VT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal vector type!");
+ return VT.isFixedLengthVector() ||
+ VT.getSizeInBits().getKnownMinSize() == AArch64::SVEBitsPerBlock;
+}
+
AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
const AArch64Subtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -137,6 +144,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
if (Subtarget->hasFPARMv8()) {
addRegisterClass(MVT::f16, &AArch64::FPR16RegClass);
+ addRegisterClass(MVT::bf16, &AArch64::FPR16RegClass);
addRegisterClass(MVT::f32, &AArch64::FPR32RegClass);
addRegisterClass(MVT::f64, &AArch64::FPR64RegClass);
addRegisterClass(MVT::f128, &AArch64::FPR128RegClass);
@@ -153,6 +161,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addDRTypeForNEON(MVT::v1i64);
addDRTypeForNEON(MVT::v1f64);
addDRTypeForNEON(MVT::v4f16);
+ addDRTypeForNEON(MVT::v4bf16);
addQRTypeForNEON(MVT::v4f32);
addQRTypeForNEON(MVT::v2f64);
@@ -161,6 +170,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v4i32);
addQRTypeForNEON(MVT::v2i64);
addQRTypeForNEON(MVT::v8f16);
+ addQRTypeForNEON(MVT::v8bf16);
}
if (Subtarget->hasSVE()) {
@@ -183,21 +193,51 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::nxv4f32, &AArch64::ZPRRegClass);
addRegisterClass(MVT::nxv2f64, &AArch64::ZPRRegClass);
+ if (Subtarget->hasBF16()) {
+ addRegisterClass(MVT::nxv2bf16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv4bf16, &AArch64::ZPRRegClass);
+ addRegisterClass(MVT::nxv8bf16, &AArch64::ZPRRegClass);
+ }
+
+ if (useSVEForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addRegisterClass(VT, &AArch64::ZPRRegClass);
+
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addRegisterClass(VT, &AArch64::ZPRRegClass);
+ }
+
for (auto VT : { MVT::nxv16i8, MVT::nxv8i16, MVT::nxv4i32, MVT::nxv2i64 }) {
setOperationAction(ISD::SADDSAT, VT, Legal);
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
+ setOperationAction(ISD::UREM, VT, Expand);
+ setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
}
for (auto VT :
{ MVT::nxv2i8, MVT::nxv2i16, MVT::nxv2i32, MVT::nxv2i64, MVT::nxv4i8,
MVT::nxv4i16, MVT::nxv4i32, MVT::nxv8i8, MVT::nxv8i16 })
setOperationAction(ISD::SIGN_EXTEND_INREG, VT, Legal);
+
+ for (auto VT :
+ { MVT::nxv2f16, MVT::nxv4f16, MVT::nxv8f16, MVT::nxv2f32, MVT::nxv4f32,
+ MVT::nxv2f64 }) {
+ setCondCodeAction(ISD::SETO, VT, Expand);
+ setCondCodeAction(ISD::SETOLT, VT, Expand);
+ setCondCodeAction(ISD::SETOLE, VT, Expand);
+ setCondCodeAction(ISD::SETULT, VT, Expand);
+ setCondCodeAction(ISD::SETULE, VT, Expand);
+ setCondCodeAction(ISD::SETUGE, VT, Expand);
+ setCondCodeAction(ISD::SETUGT, VT, Expand);
+ setCondCodeAction(ISD::SETUEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
}
// Compute derived properties from the register classes
@@ -211,6 +251,12 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SETCC, MVT::f16, Custom);
setOperationAction(ISD::SETCC, MVT::f32, Custom);
setOperationAction(ISD::SETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i64, Legal);
setOperationAction(ISD::BRCOND, MVT::Other, Expand);
@@ -266,6 +312,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FSUB, MVT::f128, Custom);
setOperationAction(ISD::FTRUNC, MVT::f128, Expand);
setOperationAction(ISD::SETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f128, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f128, Custom);
setOperationAction(ISD::BR_CC, MVT::f128, Custom);
setOperationAction(ISD::SELECT, MVT::f128, Custom);
setOperationAction(ISD::SELECT_CC, MVT::f128, Custom);
@@ -276,17 +324,31 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_SINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i128, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i128, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i128, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i64, Custom);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i128, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f32, Custom);
setOperationAction(ISD::FP_ROUND, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Custom);
// Variable arguments.
setOperationAction(ISD::VASTART, MVT::Other, Custom);
@@ -327,12 +389,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ROTR, VT, Expand);
}
+ // AArch64 doesn't have i32 MULH{S|U}.
+ setOperationAction(ISD::MULHU, MVT::i32, Expand);
+ setOperationAction(ISD::MULHS, MVT::i32, Expand);
+
// AArch64 doesn't have {U|S}MUL_LOHI.
setOperationAction(ISD::UMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::SMUL_LOHI, MVT::i64, Expand);
setOperationAction(ISD::CTPOP, MVT::i32, Custom);
setOperationAction(ISD::CTPOP, MVT::i64, Custom);
+ setOperationAction(ISD::CTPOP, MVT::i128, Custom);
setOperationAction(ISD::SDIVREM, MVT::i32, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
@@ -525,6 +592,17 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::i128, Custom);
setOperationAction(ISD::STORE, MVT::i128, Custom);
+ // 256 bit non-temporal stores can be lowered to STNP. Do this as part of the
+ // custom lowering, as there are no un-paired non-temporal stores and
+ // legalization will break up 256 bit inputs.
+ setOperationAction(ISD::STORE, MVT::v32i8, Custom);
+ setOperationAction(ISD::STORE, MVT::v16i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v16f16, Custom);
+ setOperationAction(ISD::STORE, MVT::v8i32, Custom);
+ setOperationAction(ISD::STORE, MVT::v8f32, Custom);
+ setOperationAction(ISD::STORE, MVT::v4f64, Custom);
+ setOperationAction(ISD::STORE, MVT::v4i64, Custom);
+
// Lower READCYCLECOUNTER using an mrs from PMCCNTR_EL0.
// This requires the Performance Monitors extension.
if (Subtarget->hasPerfMon())
@@ -574,6 +652,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
// Indexed loads and stores are supported.
for (unsigned im = (unsigned)ISD::PRE_INC;
@@ -585,6 +664,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedLoadAction(im, MVT::f64, Legal);
setIndexedLoadAction(im, MVT::f32, Legal);
setIndexedLoadAction(im, MVT::f16, Legal);
+ setIndexedLoadAction(im, MVT::bf16, Legal);
setIndexedStoreAction(im, MVT::i8, Legal);
setIndexedStoreAction(im, MVT::i16, Legal);
setIndexedStoreAction(im, MVT::i32, Legal);
@@ -592,6 +672,7 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setIndexedStoreAction(im, MVT::f64, Legal);
setIndexedStoreAction(im, MVT::f32, Legal);
setIndexedStoreAction(im, MVT::f16, Legal);
+ setIndexedStoreAction(im, MVT::bf16, Legal);
}
// Trap.
@@ -769,6 +850,8 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UADDSAT, VT, Legal);
setOperationAction(ISD::SSUBSAT, VT, Legal);
setOperationAction(ISD::USUBSAT, VT, Legal);
+
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
}
for (MVT VT : { MVT::v4f16, MVT::v2f32,
MVT::v8f16, MVT::v4f32, MVT::v2f64 }) {
@@ -825,6 +908,9 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
}
}
+ if (Subtarget->hasSVE())
+ setOperationAction(ISD::VSCALE, MVT::i32, Custom);
+
setTruncStoreAction(MVT::v4i16, MVT::v4i8, Custom);
}
@@ -833,11 +919,60 @@ AArch64TargetLowering::AArch64TargetLowering(const TargetMachine &TM,
// splat of 0 or undef) once vector selects supported in SVE codegen. See
// D68877 for more details.
for (MVT VT : MVT::integer_scalable_vector_valuetypes()) {
- if (isTypeLegal(VT))
+ if (isTypeLegal(VT)) {
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::SDIV, VT, Custom);
+ setOperationAction(ISD::UDIV, VT, Custom);
+ setOperationAction(ISD::SMIN, VT, Custom);
+ setOperationAction(ISD::UMIN, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, Custom);
+ setOperationAction(ISD::UMAX, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ if (VT.getScalarType() == MVT::i1)
+ setOperationAction(ISD::SETCC, VT, Custom);
+ }
}
+
+ for (auto VT : {MVT::nxv8i8, MVT::nxv4i16, MVT::nxv2i32})
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i8, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::i16, Custom);
+
+ for (MVT VT : MVT::fp_scalable_vector_valuetypes()) {
+ if (isTypeLegal(VT)) {
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
+ setOperationAction(ISD::SPLAT_VECTOR, VT, Custom);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FMA, VT, Custom);
+ }
+ }
+
+ // NOTE: Currently this has to happen after computeRegisterProperties rather
+ // than the preferred option of combining it with the addRegisterClass call.
+ if (useSVEForFixedLengthVectors()) {
+ for (MVT VT : MVT::integer_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addTypeForFixedLengthSVE(VT);
+ for (MVT VT : MVT::fp_fixedlen_vector_valuetypes())
+ if (useSVEForFixedLengthVectorVT(VT))
+ addTypeForFixedLengthSVE(VT);
+
+ // A 64-bit result can come from an input that is wider than NEON.
+ for (auto VT : {MVT::v8i8, MVT::v4i16})
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f16, Custom);
+
+ // A 128-bit result implies an input that is wider than NEON.
+ for (auto VT : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+ for (auto VT : {MVT::v8f16, MVT::v4f32})
+ setOperationAction(ISD::FP_ROUND, VT, Expand);
+ }
}
PredictableSelectIsExpensive = Subtarget->predictableSelectIsExpensive();
@@ -922,6 +1057,24 @@ void AArch64TargetLowering::addTypeForNEON(MVT VT, MVT PromotedBitwiseVT) {
}
}
+void AArch64TargetLowering::addTypeForFixedLengthSVE(MVT VT) {
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ // By default everything must be expanded.
+ for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op)
+ setOperationAction(Op, VT, Expand);
+
+ // We use EXTRACT_SUBVECTOR to "cast" a scalable vector to a fixed length one.
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
+
+ // Lower fixed length vector operations to scalable equivalents.
+ setOperationAction(ISD::ADD, VT, Custom);
+ setOperationAction(ISD::FADD, VT, Custom);
+ setOperationAction(ISD::LOAD, VT, Custom);
+ setOperationAction(ISD::STORE, VT, Custom);
+ setOperationAction(ISD::TRUNCATE, VT, Custom);
+}
+
void AArch64TargetLowering::addDRTypeForNEON(MVT VT) {
addRegisterClass(VT, &AArch64::FPR64RegClass);
addTypeForNEON(VT, MVT::v2i32);
@@ -932,10 +1085,12 @@ void AArch64TargetLowering::addQRTypeForNEON(MVT VT) {
addTypeForNEON(VT, MVT::v4i32);
}
-EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
- EVT VT) const {
+EVT AArch64TargetLowering::getSetCCResultType(const DataLayout &,
+ LLVMContext &C, EVT VT) const {
if (!VT.isVector())
return MVT::i32;
+ if (VT.isScalableVector())
+ return EVT::getVectorVT(C, MVT::i1, VT.getVectorElementCount());
return VT.changeVectorElementTypeToInteger();
}
@@ -1035,7 +1190,8 @@ static bool optimizeLogicalImm(SDValue Op, unsigned Size, uint64_t Imm,
}
bool AArch64TargetLowering::targetShrinkDemandedConstant(
- SDValue Op, const APInt &Demanded, TargetLoweringOpt &TLO) const {
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
// Delay this optimization to as late as possible.
if (!TLO.LegalOps)
return false;
@@ -1052,7 +1208,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
"i32 or i64 is expected after legalization.");
// Exit early if we demand all bits.
- if (Demanded.countPopulation() == Size)
+ if (DemandedBits.countPopulation() == Size)
return false;
unsigned NewOpc;
@@ -1073,7 +1229,7 @@ bool AArch64TargetLowering::targetShrinkDemandedConstant(
if (!C)
return false;
uint64_t Imm = C->getZExtValue();
- return optimizeLogicalImm(Op, Size, Imm, Demanded, TLO, NewOpc);
+ return optimizeLogicalImm(Op, Size, Imm, DemandedBits, TLO, NewOpc);
}
/// computeKnownBitsForTargetNode - Determine which of the bits specified in
@@ -1177,7 +1333,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
// Same as above but handling LLTs instead.
bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
- LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
+ LLT Ty, unsigned AddrSpace, Align Alignment, MachineMemOperand::Flags Flags,
bool *Fast) const {
if (Subtarget->requiresStrictAlign())
return false;
@@ -1192,7 +1348,7 @@ bool AArch64TargetLowering::allowsMisalignedMemoryAccesses(
// Code that uses clang vector extensions can mark that it
// wants unaligned accesses to be treated as fast by
// underspecifying alignment to be 1 or 2.
- Align <= 2 ||
+ Alignment <= 2 ||
// Disregard v2i64. Memcpy lowering produces those and splitting
// them regresses performance on micro-benchmarks and olden/bh.
@@ -1208,181 +1364,246 @@ AArch64TargetLowering::createFastISel(FunctionLoweringInfo &funcInfo,
}
const char *AArch64TargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define MAKE_CASE(V) \
+ case V: \
+ return #V;
switch ((AArch64ISD::NodeType)Opcode) {
- case AArch64ISD::FIRST_NUMBER: break;
- case AArch64ISD::CALL: return "AArch64ISD::CALL";
- case AArch64ISD::ADRP: return "AArch64ISD::ADRP";
- case AArch64ISD::ADR: return "AArch64ISD::ADR";
- case AArch64ISD::ADDlow: return "AArch64ISD::ADDlow";
- case AArch64ISD::LOADgot: return "AArch64ISD::LOADgot";
- case AArch64ISD::RET_FLAG: return "AArch64ISD::RET_FLAG";
- case AArch64ISD::BRCOND: return "AArch64ISD::BRCOND";
- case AArch64ISD::CSEL: return "AArch64ISD::CSEL";
- case AArch64ISD::FCSEL: return "AArch64ISD::FCSEL";
- case AArch64ISD::CSINV: return "AArch64ISD::CSINV";
- case AArch64ISD::CSNEG: return "AArch64ISD::CSNEG";
- case AArch64ISD::CSINC: return "AArch64ISD::CSINC";
- case AArch64ISD::THREAD_POINTER: return "AArch64ISD::THREAD_POINTER";
- case AArch64ISD::TLSDESC_CALLSEQ: return "AArch64ISD::TLSDESC_CALLSEQ";
- case AArch64ISD::ADC: return "AArch64ISD::ADC";
- case AArch64ISD::SBC: return "AArch64ISD::SBC";
- case AArch64ISD::ADDS: return "AArch64ISD::ADDS";
- case AArch64ISD::SUBS: return "AArch64ISD::SUBS";
- case AArch64ISD::ADCS: return "AArch64ISD::ADCS";
- case AArch64ISD::SBCS: return "AArch64ISD::SBCS";
- case AArch64ISD::ANDS: return "AArch64ISD::ANDS";
- case AArch64ISD::CCMP: return "AArch64ISD::CCMP";
- case AArch64ISD::CCMN: return "AArch64ISD::CCMN";
- case AArch64ISD::FCCMP: return "AArch64ISD::FCCMP";
- case AArch64ISD::FCMP: return "AArch64ISD::FCMP";
- case AArch64ISD::DUP: return "AArch64ISD::DUP";
- case AArch64ISD::DUPLANE8: return "AArch64ISD::DUPLANE8";
- case AArch64ISD::DUPLANE16: return "AArch64ISD::DUPLANE16";
- case AArch64ISD::DUPLANE32: return "AArch64ISD::DUPLANE32";
- case AArch64ISD::DUPLANE64: return "AArch64ISD::DUPLANE64";
- case AArch64ISD::MOVI: return "AArch64ISD::MOVI";
- case AArch64ISD::MOVIshift: return "AArch64ISD::MOVIshift";
- case AArch64ISD::MOVIedit: return "AArch64ISD::MOVIedit";
- case AArch64ISD::MOVImsl: return "AArch64ISD::MOVImsl";
- case AArch64ISD::FMOV: return "AArch64ISD::FMOV";
- case AArch64ISD::MVNIshift: return "AArch64ISD::MVNIshift";
- case AArch64ISD::MVNImsl: return "AArch64ISD::MVNImsl";
- case AArch64ISD::BICi: return "AArch64ISD::BICi";
- case AArch64ISD::ORRi: return "AArch64ISD::ORRi";
- case AArch64ISD::BSL: return "AArch64ISD::BSL";
- case AArch64ISD::NEG: return "AArch64ISD::NEG";
- case AArch64ISD::EXTR: return "AArch64ISD::EXTR";
- case AArch64ISD::ZIP1: return "AArch64ISD::ZIP1";
- case AArch64ISD::ZIP2: return "AArch64ISD::ZIP2";
- case AArch64ISD::UZP1: return "AArch64ISD::UZP1";
- case AArch64ISD::UZP2: return "AArch64ISD::UZP2";
- case AArch64ISD::TRN1: return "AArch64ISD::TRN1";
- case AArch64ISD::TRN2: return "AArch64ISD::TRN2";
- case AArch64ISD::REV16: return "AArch64ISD::REV16";
- case AArch64ISD::REV32: return "AArch64ISD::REV32";
- case AArch64ISD::REV64: return "AArch64ISD::REV64";
- case AArch64ISD::EXT: return "AArch64ISD::EXT";
- case AArch64ISD::VSHL: return "AArch64ISD::VSHL";
- case AArch64ISD::VLSHR: return "AArch64ISD::VLSHR";
- case AArch64ISD::VASHR: return "AArch64ISD::VASHR";
- case AArch64ISD::CMEQ: return "AArch64ISD::CMEQ";
- case AArch64ISD::CMGE: return "AArch64ISD::CMGE";
- case AArch64ISD::CMGT: return "AArch64ISD::CMGT";
- case AArch64ISD::CMHI: return "AArch64ISD::CMHI";
- case AArch64ISD::CMHS: return "AArch64ISD::CMHS";
- case AArch64ISD::FCMEQ: return "AArch64ISD::FCMEQ";
- case AArch64ISD::FCMGE: return "AArch64ISD::FCMGE";
- case AArch64ISD::FCMGT: return "AArch64ISD::FCMGT";
- case AArch64ISD::CMEQz: return "AArch64ISD::CMEQz";
- case AArch64ISD::CMGEz: return "AArch64ISD::CMGEz";
- case AArch64ISD::CMGTz: return "AArch64ISD::CMGTz";
- case AArch64ISD::CMLEz: return "AArch64ISD::CMLEz";
- case AArch64ISD::CMLTz: return "AArch64ISD::CMLTz";
- case AArch64ISD::FCMEQz: return "AArch64ISD::FCMEQz";
- case AArch64ISD::FCMGEz: return "AArch64ISD::FCMGEz";
- case AArch64ISD::FCMGTz: return "AArch64ISD::FCMGTz";
- case AArch64ISD::FCMLEz: return "AArch64ISD::FCMLEz";
- case AArch64ISD::FCMLTz: return "AArch64ISD::FCMLTz";
- case AArch64ISD::SADDV: return "AArch64ISD::SADDV";
- case AArch64ISD::UADDV: return "AArch64ISD::UADDV";
- case AArch64ISD::SMINV: return "AArch64ISD::SMINV";
- case AArch64ISD::UMINV: return "AArch64ISD::UMINV";
- case AArch64ISD::SMAXV: return "AArch64ISD::SMAXV";
- case AArch64ISD::UMAXV: return "AArch64ISD::UMAXV";
- case AArch64ISD::SMAXV_PRED: return "AArch64ISD::SMAXV_PRED";
- case AArch64ISD::UMAXV_PRED: return "AArch64ISD::UMAXV_PRED";
- case AArch64ISD::SMINV_PRED: return "AArch64ISD::SMINV_PRED";
- case AArch64ISD::UMINV_PRED: return "AArch64ISD::UMINV_PRED";
- case AArch64ISD::ORV_PRED: return "AArch64ISD::ORV_PRED";
- case AArch64ISD::EORV_PRED: return "AArch64ISD::EORV_PRED";
- case AArch64ISD::ANDV_PRED: return "AArch64ISD::ANDV_PRED";
- case AArch64ISD::CLASTA_N: return "AArch64ISD::CLASTA_N";
- case AArch64ISD::CLASTB_N: return "AArch64ISD::CLASTB_N";
- case AArch64ISD::LASTA: return "AArch64ISD::LASTA";
- case AArch64ISD::LASTB: return "AArch64ISD::LASTB";
- case AArch64ISD::REV: return "AArch64ISD::REV";
- case AArch64ISD::TBL: return "AArch64ISD::TBL";
- case AArch64ISD::NOT: return "AArch64ISD::NOT";
- case AArch64ISD::BIT: return "AArch64ISD::BIT";
- case AArch64ISD::CBZ: return "AArch64ISD::CBZ";
- case AArch64ISD::CBNZ: return "AArch64ISD::CBNZ";
- case AArch64ISD::TBZ: return "AArch64ISD::TBZ";
- case AArch64ISD::TBNZ: return "AArch64ISD::TBNZ";
- case AArch64ISD::TC_RETURN: return "AArch64ISD::TC_RETURN";
- case AArch64ISD::PREFETCH: return "AArch64ISD::PREFETCH";
- case AArch64ISD::SITOF: return "AArch64ISD::SITOF";
- case AArch64ISD::UITOF: return "AArch64ISD::UITOF";
- case AArch64ISD::NVCAST: return "AArch64ISD::NVCAST";
- case AArch64ISD::SQSHL_I: return "AArch64ISD::SQSHL_I";
- case AArch64ISD::UQSHL_I: return "AArch64ISD::UQSHL_I";
- case AArch64ISD::SRSHR_I: return "AArch64ISD::SRSHR_I";
- case AArch64ISD::URSHR_I: return "AArch64ISD::URSHR_I";
- case AArch64ISD::SQSHLU_I: return "AArch64ISD::SQSHLU_I";
- case AArch64ISD::WrapperLarge: return "AArch64ISD::WrapperLarge";
- case AArch64ISD::LD2post: return "AArch64ISD::LD2post";
- case AArch64ISD::LD3post: return "AArch64ISD::LD3post";
- case AArch64ISD::LD4post: return "AArch64ISD::LD4post";
- case AArch64ISD::ST2post: return "AArch64ISD::ST2post";
- case AArch64ISD::ST3post: return "AArch64ISD::ST3post";
- case AArch64ISD::ST4post: return "AArch64ISD::ST4post";
- case AArch64ISD::LD1x2post: return "AArch64ISD::LD1x2post";
- case AArch64ISD::LD1x3post: return "AArch64ISD::LD1x3post";
- case AArch64ISD::LD1x4post: return "AArch64ISD::LD1x4post";
- case AArch64ISD::ST1x2post: return "AArch64ISD::ST1x2post";
- case AArch64ISD::ST1x3post: return "AArch64ISD::ST1x3post";
- case AArch64ISD::ST1x4post: return "AArch64ISD::ST1x4post";
- case AArch64ISD::LD1DUPpost: return "AArch64ISD::LD1DUPpost";
- case AArch64ISD::LD2DUPpost: return "AArch64ISD::LD2DUPpost";
- case AArch64ISD::LD3DUPpost: return "AArch64ISD::LD3DUPpost";
- case AArch64ISD::LD4DUPpost: return "AArch64ISD::LD4DUPpost";
- case AArch64ISD::LD1LANEpost: return "AArch64ISD::LD1LANEpost";
- case AArch64ISD::LD2LANEpost: return "AArch64ISD::LD2LANEpost";
- case AArch64ISD::LD3LANEpost: return "AArch64ISD::LD3LANEpost";
- case AArch64ISD::LD4LANEpost: return "AArch64ISD::LD4LANEpost";
- case AArch64ISD::ST2LANEpost: return "AArch64ISD::ST2LANEpost";
- case AArch64ISD::ST3LANEpost: return "AArch64ISD::ST3LANEpost";
- case AArch64ISD::ST4LANEpost: return "AArch64ISD::ST4LANEpost";
- case AArch64ISD::SMULL: return "AArch64ISD::SMULL";
- case AArch64ISD::UMULL: return "AArch64ISD::UMULL";
- case AArch64ISD::FRECPE: return "AArch64ISD::FRECPE";
- case AArch64ISD::FRECPS: return "AArch64ISD::FRECPS";
- case AArch64ISD::FRSQRTE: return "AArch64ISD::FRSQRTE";
- case AArch64ISD::FRSQRTS: return "AArch64ISD::FRSQRTS";
- case AArch64ISD::STG: return "AArch64ISD::STG";
- case AArch64ISD::STZG: return "AArch64ISD::STZG";
- case AArch64ISD::ST2G: return "AArch64ISD::ST2G";
- case AArch64ISD::STZ2G: return "AArch64ISD::STZ2G";
- case AArch64ISD::SUNPKHI: return "AArch64ISD::SUNPKHI";
- case AArch64ISD::SUNPKLO: return "AArch64ISD::SUNPKLO";
- case AArch64ISD::UUNPKHI: return "AArch64ISD::UUNPKHI";
- case AArch64ISD::UUNPKLO: return "AArch64ISD::UUNPKLO";
- case AArch64ISD::INSR: return "AArch64ISD::INSR";
- case AArch64ISD::PTEST: return "AArch64ISD::PTEST";
- case AArch64ISD::PTRUE: return "AArch64ISD::PTRUE";
- case AArch64ISD::GLD1: return "AArch64ISD::GLD1";
- case AArch64ISD::GLD1_SCALED: return "AArch64ISD::GLD1_SCALED";
- case AArch64ISD::GLD1_SXTW: return "AArch64ISD::GLD1_SXTW";
- case AArch64ISD::GLD1_UXTW: return "AArch64ISD::GLD1_UXTW";
- case AArch64ISD::GLD1_SXTW_SCALED: return "AArch64ISD::GLD1_SXTW_SCALED";
- case AArch64ISD::GLD1_UXTW_SCALED: return "AArch64ISD::GLD1_UXTW_SCALED";
- case AArch64ISD::GLD1_IMM: return "AArch64ISD::GLD1_IMM";
- case AArch64ISD::GLD1S: return "AArch64ISD::GLD1S";
- case AArch64ISD::GLD1S_SCALED: return "AArch64ISD::GLD1S_SCALED";
- case AArch64ISD::GLD1S_SXTW: return "AArch64ISD::GLD1S_SXTW";
- case AArch64ISD::GLD1S_UXTW: return "AArch64ISD::GLD1S_UXTW";
- case AArch64ISD::GLD1S_SXTW_SCALED: return "AArch64ISD::GLD1S_SXTW_SCALED";
- case AArch64ISD::GLD1S_UXTW_SCALED: return "AArch64ISD::GLD1S_UXTW_SCALED";
- case AArch64ISD::GLD1S_IMM: return "AArch64ISD::GLD1S_IMM";
- case AArch64ISD::SST1: return "AArch64ISD::SST1";
- case AArch64ISD::SST1_SCALED: return "AArch64ISD::SST1_SCALED";
- case AArch64ISD::SST1_SXTW: return "AArch64ISD::SST1_SXTW";
- case AArch64ISD::SST1_UXTW: return "AArch64ISD::SST1_UXTW";
- case AArch64ISD::SST1_SXTW_SCALED: return "AArch64ISD::SST1_SXTW_SCALED";
- case AArch64ISD::SST1_UXTW_SCALED: return "AArch64ISD::SST1_UXTW_SCALED";
- case AArch64ISD::SST1_IMM: return "AArch64ISD::SST1_IMM";
- case AArch64ISD::LDP: return "AArch64ISD::LDP";
- case AArch64ISD::STP: return "AArch64ISD::STP";
- }
+ case AArch64ISD::FIRST_NUMBER:
+ break;
+ MAKE_CASE(AArch64ISD::CALL)
+ MAKE_CASE(AArch64ISD::ADRP)
+ MAKE_CASE(AArch64ISD::ADR)
+ MAKE_CASE(AArch64ISD::ADDlow)
+ MAKE_CASE(AArch64ISD::LOADgot)
+ MAKE_CASE(AArch64ISD::RET_FLAG)
+ MAKE_CASE(AArch64ISD::BRCOND)
+ MAKE_CASE(AArch64ISD::CSEL)
+ MAKE_CASE(AArch64ISD::FCSEL)
+ MAKE_CASE(AArch64ISD::CSINV)
+ MAKE_CASE(AArch64ISD::CSNEG)
+ MAKE_CASE(AArch64ISD::CSINC)
+ MAKE_CASE(AArch64ISD::THREAD_POINTER)
+ MAKE_CASE(AArch64ISD::TLSDESC_CALLSEQ)
+ MAKE_CASE(AArch64ISD::ADD_PRED)
+ MAKE_CASE(AArch64ISD::SDIV_PRED)
+ MAKE_CASE(AArch64ISD::UDIV_PRED)
+ MAKE_CASE(AArch64ISD::SMIN_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMIN_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SMAX_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::UMAX_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SHL_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SRL_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SRA_MERGE_OP1)
+ MAKE_CASE(AArch64ISD::SETCC_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::ADC)
+ MAKE_CASE(AArch64ISD::SBC)
+ MAKE_CASE(AArch64ISD::ADDS)
+ MAKE_CASE(AArch64ISD::SUBS)
+ MAKE_CASE(AArch64ISD::ADCS)
+ MAKE_CASE(AArch64ISD::SBCS)
+ MAKE_CASE(AArch64ISD::ANDS)
+ MAKE_CASE(AArch64ISD::CCMP)
+ MAKE_CASE(AArch64ISD::CCMN)
+ MAKE_CASE(AArch64ISD::FCCMP)
+ MAKE_CASE(AArch64ISD::FCMP)
+ MAKE_CASE(AArch64ISD::STRICT_FCMP)
+ MAKE_CASE(AArch64ISD::STRICT_FCMPE)
+ MAKE_CASE(AArch64ISD::DUP)
+ MAKE_CASE(AArch64ISD::DUPLANE8)
+ MAKE_CASE(AArch64ISD::DUPLANE16)
+ MAKE_CASE(AArch64ISD::DUPLANE32)
+ MAKE_CASE(AArch64ISD::DUPLANE64)
+ MAKE_CASE(AArch64ISD::MOVI)
+ MAKE_CASE(AArch64ISD::MOVIshift)
+ MAKE_CASE(AArch64ISD::MOVIedit)
+ MAKE_CASE(AArch64ISD::MOVImsl)
+ MAKE_CASE(AArch64ISD::FMOV)
+ MAKE_CASE(AArch64ISD::MVNIshift)
+ MAKE_CASE(AArch64ISD::MVNImsl)
+ MAKE_CASE(AArch64ISD::BICi)
+ MAKE_CASE(AArch64ISD::ORRi)
+ MAKE_CASE(AArch64ISD::BSP)
+ MAKE_CASE(AArch64ISD::NEG)
+ MAKE_CASE(AArch64ISD::EXTR)
+ MAKE_CASE(AArch64ISD::ZIP1)
+ MAKE_CASE(AArch64ISD::ZIP2)
+ MAKE_CASE(AArch64ISD::UZP1)
+ MAKE_CASE(AArch64ISD::UZP2)
+ MAKE_CASE(AArch64ISD::TRN1)
+ MAKE_CASE(AArch64ISD::TRN2)
+ MAKE_CASE(AArch64ISD::REV16)
+ MAKE_CASE(AArch64ISD::REV32)
+ MAKE_CASE(AArch64ISD::REV64)
+ MAKE_CASE(AArch64ISD::EXT)
+ MAKE_CASE(AArch64ISD::VSHL)
+ MAKE_CASE(AArch64ISD::VLSHR)
+ MAKE_CASE(AArch64ISD::VASHR)
+ MAKE_CASE(AArch64ISD::VSLI)
+ MAKE_CASE(AArch64ISD::VSRI)
+ MAKE_CASE(AArch64ISD::CMEQ)
+ MAKE_CASE(AArch64ISD::CMGE)
+ MAKE_CASE(AArch64ISD::CMGT)
+ MAKE_CASE(AArch64ISD::CMHI)
+ MAKE_CASE(AArch64ISD::CMHS)
+ MAKE_CASE(AArch64ISD::FCMEQ)
+ MAKE_CASE(AArch64ISD::FCMGE)
+ MAKE_CASE(AArch64ISD::FCMGT)
+ MAKE_CASE(AArch64ISD::CMEQz)
+ MAKE_CASE(AArch64ISD::CMGEz)
+ MAKE_CASE(AArch64ISD::CMGTz)
+ MAKE_CASE(AArch64ISD::CMLEz)
+ MAKE_CASE(AArch64ISD::CMLTz)
+ MAKE_CASE(AArch64ISD::FCMEQz)
+ MAKE_CASE(AArch64ISD::FCMGEz)
+ MAKE_CASE(AArch64ISD::FCMGTz)
+ MAKE_CASE(AArch64ISD::FCMLEz)
+ MAKE_CASE(AArch64ISD::FCMLTz)
+ MAKE_CASE(AArch64ISD::SADDV)
+ MAKE_CASE(AArch64ISD::UADDV)
+ MAKE_CASE(AArch64ISD::SRHADD)
+ MAKE_CASE(AArch64ISD::URHADD)
+ MAKE_CASE(AArch64ISD::SMINV)
+ MAKE_CASE(AArch64ISD::UMINV)
+ MAKE_CASE(AArch64ISD::SMAXV)
+ MAKE_CASE(AArch64ISD::UMAXV)
+ MAKE_CASE(AArch64ISD::SMAXV_PRED)
+ MAKE_CASE(AArch64ISD::UMAXV_PRED)
+ MAKE_CASE(AArch64ISD::SMINV_PRED)
+ MAKE_CASE(AArch64ISD::UMINV_PRED)
+ MAKE_CASE(AArch64ISD::ORV_PRED)
+ MAKE_CASE(AArch64ISD::EORV_PRED)
+ MAKE_CASE(AArch64ISD::ANDV_PRED)
+ MAKE_CASE(AArch64ISD::CLASTA_N)
+ MAKE_CASE(AArch64ISD::CLASTB_N)
+ MAKE_CASE(AArch64ISD::LASTA)
+ MAKE_CASE(AArch64ISD::LASTB)
+ MAKE_CASE(AArch64ISD::REV)
+ MAKE_CASE(AArch64ISD::REINTERPRET_CAST)
+ MAKE_CASE(AArch64ISD::TBL)
+ MAKE_CASE(AArch64ISD::FADD_PRED)
+ MAKE_CASE(AArch64ISD::FADDA_PRED)
+ MAKE_CASE(AArch64ISD::FADDV_PRED)
+ MAKE_CASE(AArch64ISD::FMA_PRED)
+ MAKE_CASE(AArch64ISD::FMAXV_PRED)
+ MAKE_CASE(AArch64ISD::FMAXNMV_PRED)
+ MAKE_CASE(AArch64ISD::FMINV_PRED)
+ MAKE_CASE(AArch64ISD::FMINNMV_PRED)
+ MAKE_CASE(AArch64ISD::NOT)
+ MAKE_CASE(AArch64ISD::BIT)
+ MAKE_CASE(AArch64ISD::CBZ)
+ MAKE_CASE(AArch64ISD::CBNZ)
+ MAKE_CASE(AArch64ISD::TBZ)
+ MAKE_CASE(AArch64ISD::TBNZ)
+ MAKE_CASE(AArch64ISD::TC_RETURN)
+ MAKE_CASE(AArch64ISD::PREFETCH)
+ MAKE_CASE(AArch64ISD::SITOF)
+ MAKE_CASE(AArch64ISD::UITOF)
+ MAKE_CASE(AArch64ISD::NVCAST)
+ MAKE_CASE(AArch64ISD::SQSHL_I)
+ MAKE_CASE(AArch64ISD::UQSHL_I)
+ MAKE_CASE(AArch64ISD::SRSHR_I)
+ MAKE_CASE(AArch64ISD::URSHR_I)
+ MAKE_CASE(AArch64ISD::SQSHLU_I)
+ MAKE_CASE(AArch64ISD::WrapperLarge)
+ MAKE_CASE(AArch64ISD::LD2post)
+ MAKE_CASE(AArch64ISD::LD3post)
+ MAKE_CASE(AArch64ISD::LD4post)
+ MAKE_CASE(AArch64ISD::ST2post)
+ MAKE_CASE(AArch64ISD::ST3post)
+ MAKE_CASE(AArch64ISD::ST4post)
+ MAKE_CASE(AArch64ISD::LD1x2post)
+ MAKE_CASE(AArch64ISD::LD1x3post)
+ MAKE_CASE(AArch64ISD::LD1x4post)
+ MAKE_CASE(AArch64ISD::ST1x2post)
+ MAKE_CASE(AArch64ISD::ST1x3post)
+ MAKE_CASE(AArch64ISD::ST1x4post)
+ MAKE_CASE(AArch64ISD::LD1DUPpost)
+ MAKE_CASE(AArch64ISD::LD2DUPpost)
+ MAKE_CASE(AArch64ISD::LD3DUPpost)
+ MAKE_CASE(AArch64ISD::LD4DUPpost)
+ MAKE_CASE(AArch64ISD::LD1LANEpost)
+ MAKE_CASE(AArch64ISD::LD2LANEpost)
+ MAKE_CASE(AArch64ISD::LD3LANEpost)
+ MAKE_CASE(AArch64ISD::LD4LANEpost)
+ MAKE_CASE(AArch64ISD::ST2LANEpost)
+ MAKE_CASE(AArch64ISD::ST3LANEpost)
+ MAKE_CASE(AArch64ISD::ST4LANEpost)
+ MAKE_CASE(AArch64ISD::SMULL)
+ MAKE_CASE(AArch64ISD::UMULL)
+ MAKE_CASE(AArch64ISD::FRECPE)
+ MAKE_CASE(AArch64ISD::FRECPS)
+ MAKE_CASE(AArch64ISD::FRSQRTE)
+ MAKE_CASE(AArch64ISD::FRSQRTS)
+ MAKE_CASE(AArch64ISD::STG)
+ MAKE_CASE(AArch64ISD::STZG)
+ MAKE_CASE(AArch64ISD::ST2G)
+ MAKE_CASE(AArch64ISD::STZ2G)
+ MAKE_CASE(AArch64ISD::SUNPKHI)
+ MAKE_CASE(AArch64ISD::SUNPKLO)
+ MAKE_CASE(AArch64ISD::UUNPKHI)
+ MAKE_CASE(AArch64ISD::UUNPKLO)
+ MAKE_CASE(AArch64ISD::INSR)
+ MAKE_CASE(AArch64ISD::PTEST)
+ MAKE_CASE(AArch64ISD::PTRUE)
+ MAKE_CASE(AArch64ISD::LD1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDNF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDNF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDFF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LDFF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1RQ_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::LD1RO_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD2_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD3_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::SVE_LD4_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLD1S_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDFF1S_IMM_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1_INDEX_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::GLDNT1S_MERGE_ZERO)
+ MAKE_CASE(AArch64ISD::ST1_PRED)
+ MAKE_CASE(AArch64ISD::SST1_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SXTW_PRED)
+ MAKE_CASE(AArch64ISD::SST1_UXTW_PRED)
+ MAKE_CASE(AArch64ISD::SST1_SXTW_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_UXTW_SCALED_PRED)
+ MAKE_CASE(AArch64ISD::SST1_IMM_PRED)
+ MAKE_CASE(AArch64ISD::SSTNT1_PRED)
+ MAKE_CASE(AArch64ISD::SSTNT1_INDEX_PRED)
+ MAKE_CASE(AArch64ISD::LDP)
+ MAKE_CASE(AArch64ISD::STP)
+ MAKE_CASE(AArch64ISD::STNP)
+ MAKE_CASE(AArch64ISD::DUP_MERGE_PASSTHRU)
+ MAKE_CASE(AArch64ISD::INDEX_VECTOR)
+ }
+#undef MAKE_CASE
return nullptr;
}
@@ -1454,12 +1675,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchRet(
return BB;
}
-MachineBasicBlock *AArch64TargetLowering::EmitLoweredCatchPad(
- MachineInstr &MI, MachineBasicBlock *BB) const {
- MI.eraseFromParent();
- return BB;
-}
-
MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
switch (MI.getOpcode()) {
@@ -1478,8 +1693,6 @@ MachineBasicBlock *AArch64TargetLowering::EmitInstrWithCustomInserter(
case AArch64::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
- case AArch64::CATCHPAD:
- return EmitLoweredCatchPad(MI, BB);
}
}
@@ -1668,6 +1881,17 @@ static bool isCMN(SDValue Op, ISD::CondCode CC) {
(CC == ISD::SETEQ || CC == ISD::SETNE);
}
+static SDValue emitStrictFPComparison(SDValue LHS, SDValue RHS, const SDLoc &dl,
+ SelectionDAG &DAG, SDValue Chain,
+ bool IsSignaling) {
+ EVT VT = LHS.getValueType();
+ assert(VT != MVT::f128);
+ assert(VT != MVT::f16 && "Lowering of strict fp16 not yet implemented");
+ unsigned Opcode =
+ IsSignaling ? AArch64ISD::STRICT_FCMPE : AArch64ISD::STRICT_FCMP;
+ return DAG.getNode(Opcode, dl, {VT, MVT::Other}, {Chain, LHS, RHS});
+}
+
static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG) {
EVT VT = LHS.getValueType();
@@ -1699,14 +1923,22 @@ static SDValue emitComparison(SDValue LHS, SDValue RHS, ISD::CondCode CC,
// we combine a (CMP (sub 0, op1), op2) into a CMN instruction ?
Opcode = AArch64ISD::ADDS;
LHS = LHS.getOperand(1);
- } else if (LHS.getOpcode() == ISD::AND && isNullConstant(RHS) &&
- !isUnsignedIntSetCC(CC)) {
- // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
- // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
- // of the signed comparisons.
- Opcode = AArch64ISD::ANDS;
- RHS = LHS.getOperand(1);
- LHS = LHS.getOperand(0);
+ } else if (isNullConstant(RHS) && !isUnsignedIntSetCC(CC)) {
+ if (LHS.getOpcode() == ISD::AND) {
+ // Similarly, (CMP (and X, Y), 0) can be implemented with a TST
+ // (a.k.a. ANDS) except that the flags are only guaranteed to work for one
+ // of the signed comparisons.
+ const SDValue ANDSNode = DAG.getNode(AArch64ISD::ANDS, dl,
+ DAG.getVTList(VT, MVT_CC),
+ LHS.getOperand(0),
+ LHS.getOperand(1));
+ // Replace all users of (and X, Y) with newly generated (ands X, Y)
+ DAG.ReplaceAllUsesWith(LHS, ANDSNode);
+ return ANDSNode.getValue(1);
+ } else if (LHS.getOpcode() == AArch64ISD::ANDS) {
+ // Use result of ANDS
+ return LHS.getValue(1);
+ }
}
return DAG.getNode(Opcode, dl, DAG.getVTList(VT, MVT_CC), LHS, RHS)
@@ -2284,18 +2516,16 @@ getAArch64XALUOOp(AArch64CC::CondCode &CC, SDValue Op, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
RTLIB::Libcall Call) const {
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
+ bool IsStrict = Op->isStrictFPOpcode();
+ unsigned Offset = IsStrict ? 1 : 0;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SmallVector<SDValue, 2> Ops(Op->op_begin() + Offset, Op->op_end());
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, Call, MVT::f128, Ops, CallOptions, SDLoc(Op)).first;
-}
-
-// Returns true if the given Op is the overflow flag result of an overflow
-// intrinsic operation.
-static bool isOverflowIntrOpRes(SDValue Op) {
- unsigned Opc = Op.getOpcode();
- return (Op.getResNo() == 1 &&
- (Opc == ISD::SADDO || Opc == ISD::UADDO || Opc == ISD::SSUBO ||
- Opc == ISD::USUBO || Opc == ISD::SMULO || Opc == ISD::UMULO));
+ SDValue Result;
+ SDLoc dl(Op);
+ std::tie(Result, Chain) = makeLibCall(DAG, Call, Op.getValueType(), Ops,
+ CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
@@ -2310,7 +2540,7 @@ static SDValue LowerXOR(SDValue Op, SelectionDAG &DAG) {
// (csel 1, 0, invert(cc), overflow_op_bool)
// ... which later gets transformed to just a cset instruction with an
// inverted condition code, rather than a cset + eor sequence.
- if (isOneConstant(Other) && isOverflowIntrOpRes(Sel)) {
+ if (isOneConstant(Other) && ISD::isOverflowIntrOpRes(Sel)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(Sel->getValueType(0)))
return SDValue();
@@ -2483,21 +2713,32 @@ SDValue AArch64TargetLowering::LowerFP_EXTEND(SDValue Op,
SDValue AArch64TargetLowering::LowerFP_ROUND(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType() != MVT::f128) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = SrcVal.getValueType();
+
+ if (SrcVT != MVT::f128) {
+ // Expand cases where the input is a vector bigger than NEON.
+ if (useSVEForFixedLengthVectorVT(SrcVT))
+ return SDValue();
+
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
- LC = RTLIB::getFPROUND(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getFPROUND(SrcVT, Op.getValueType());
// FP_ROUND node has a second operand indicating whether it is known to be
// precise. That doesn't take part in the LibCall so we can't directly use
// LowerF128Call.
- SDValue SrcVal = Op.getOperand(0);
MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), SrcVal, CallOptions,
- SDLoc(Op)).first;
+ SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+ SDValue Result;
+ SDLoc dl(Op);
+ std::tie(Result, Chain) = makeLibCall(DAG, LC, Op.getValueType(), SrcVal,
+ CallOptions, dl, Chain);
+ return IsStrict ? DAG.getMergeValues({Result, Chain}, dl) : Result;
}
SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
@@ -2542,32 +2783,34 @@ SDValue AArch64TargetLowering::LowerVectorFP_TO_INT(SDValue Op,
SDValue AArch64TargetLowering::LowerFP_TO_INT(SDValue Op,
SelectionDAG &DAG) const {
- if (Op.getOperand(0).getValueType().isVector())
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
+ if (SrcVal.getValueType().isVector())
return LowerVectorFP_TO_INT(Op, DAG);
// f16 conversions are promoted to f32 when full fp16 is not supported.
- if (Op.getOperand(0).getValueType() == MVT::f16 &&
- !Subtarget->hasFullFP16()) {
+ if (SrcVal.getValueType() == MVT::f16 && !Subtarget->hasFullFP16()) {
+ assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
Op.getOpcode(), dl, Op.getValueType(),
- DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, Op.getOperand(0)));
+ DAG.getNode(ISD::FP_EXTEND, dl, MVT::f32, SrcVal));
}
- if (Op.getOperand(0).getValueType() != MVT::f128) {
+ if (SrcVal.getValueType() != MVT::f128) {
// It's legal except when f128 is involved
return Op;
}
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::FP_TO_SINT)
- LC = RTLIB::getFPTOSINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getOpcode() == ISD::FP_TO_SINT ||
+ Op.getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(SrcVal.getValueType(), Op.getValueType());
else
- LC = RTLIB::getFPTOUINT(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getFPTOUINT(SrcVal.getValueType(), Op.getValueType());
- SmallVector<SDValue, 2> Ops(Op->op_begin(), Op->op_end());
- MakeLibCallOptions CallOptions;
- return makeLibCall(DAG, LC, Op.getValueType(), Ops, CallOptions, SDLoc(Op)).first;
+ return LowerF128Call(Op, DAG, LC);
}
static SDValue LowerVectorINT_TO_FP(SDValue Op, SelectionDAG &DAG) {
@@ -2603,18 +2846,22 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
if (Op.getValueType().isVector())
return LowerVectorINT_TO_FP(Op, DAG);
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue SrcVal = Op.getOperand(IsStrict ? 1 : 0);
+
// f16 conversions are promoted to f32 when full fp16 is not supported.
if (Op.getValueType() == MVT::f16 &&
!Subtarget->hasFullFP16()) {
+ assert(!IsStrict && "Lowering of strict fp16 not yet implemented");
SDLoc dl(Op);
return DAG.getNode(
ISD::FP_ROUND, dl, MVT::f16,
- DAG.getNode(Op.getOpcode(), dl, MVT::f32, Op.getOperand(0)),
+ DAG.getNode(Op.getOpcode(), dl, MVT::f32, SrcVal),
DAG.getIntPtrConstant(0, dl));
}
// i128 conversions are libcalls.
- if (Op.getOperand(0).getValueType() == MVT::i128)
+ if (SrcVal.getValueType() == MVT::i128)
return SDValue();
// Other conversions are legal, unless it's to the completely software-based
@@ -2623,10 +2870,11 @@ SDValue AArch64TargetLowering::LowerINT_TO_FP(SDValue Op,
return Op;
RTLIB::Libcall LC;
- if (Op.getOpcode() == ISD::SINT_TO_FP)
- LC = RTLIB::getSINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ if (Op.getOpcode() == ISD::SINT_TO_FP ||
+ Op.getOpcode() == ISD::STRICT_SINT_TO_FP)
+ LC = RTLIB::getSINTTOFP(SrcVal.getValueType(), Op.getValueType());
else
- LC = RTLIB::getUINTTOFP(Op.getOperand(0).getValueType(), Op.getValueType());
+ LC = RTLIB::getUINTTOFP(SrcVal.getValueType(), Op.getValueType());
return LowerF128Call(Op, DAG, LC);
}
@@ -2666,7 +2914,8 @@ SDValue AArch64TargetLowering::LowerFSINCOS(SDValue Op,
}
static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
- if (Op.getValueType() != MVT::f16)
+ EVT OpVT = Op.getValueType();
+ if (OpVT != MVT::f16 && OpVT != MVT::bf16)
return SDValue();
assert(Op.getOperand(0).getValueType() == MVT::i16);
@@ -2675,7 +2924,7 @@ static SDValue LowerBITCAST(SDValue Op, SelectionDAG &DAG) {
Op = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op.getOperand(0));
Op = DAG.getNode(ISD::BITCAST, DL, MVT::f32, Op);
return SDValue(
- DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, MVT::f16, Op,
+ DAG.getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL, OpVT, Op,
DAG.getTargetConstant(AArch64::hsub, DL, MVT::i32)),
0);
}
@@ -2804,16 +3053,19 @@ SDValue AArch64TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// so that the shift + and get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue FPCR_64 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, MVT::i64,
- DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl,
- MVT::i64));
+ SDValue Chain = Op.getOperand(0);
+ SDValue FPCR_64 = DAG.getNode(
+ ISD::INTRINSIC_W_CHAIN, dl, {MVT::i64, MVT::Other},
+ {Chain, DAG.getConstant(Intrinsic::aarch64_get_fpcr, dl, MVT::i64)});
+ Chain = FPCR_64.getValue(1);
SDValue FPCR_32 = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, FPCR_64);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPCR_32,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
- return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
- DAG.getConstant(3, dl, MVT::i32));
+ SDValue AND = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+ return DAG.getMergeValues({AND, Chain}, dl);
}
static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
@@ -2885,6 +3137,12 @@ static SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) {
DAG.getNode(ISD::BITCAST, DL, Op1VT, N01), Op1));
}
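+// Helper for building an SVE predicate from a ptrue pattern. For example,
+// getPTrue(DAG, dl, MVT::nxv16i1, AArch64SVEPredPattern::all) materializes an
+// all-active nxv16i1 predicate, as used below when zeroing the widened lanes
+// in the convert_to_svbool lowering.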
+static inline SDValue getPTrue(SelectionDAG &DAG, SDLoc DL, EVT VT,
+ int Pattern) {
+ return DAG.getNode(AArch64ISD::PTRUE, DL, VT,
+ DAG.getTargetConstant(Pattern, DL, MVT::i32));
+}
+
SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
@@ -2972,6 +3230,26 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::aarch64_sve_ptrue:
return DAG.getNode(AArch64ISD::PTRUE, dl, Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::aarch64_sve_dupq_lane:
+ return LowerDUPQLane(Op, DAG);
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ return DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::aarch64_sve_convert_to_svbool: {
+ EVT OutVT = Op.getValueType();
+ EVT InVT = Op.getOperand(1).getValueType();
+ // Return the operand if the cast isn't changing type,
+ // i.e. <n x 16 x i1> -> <n x 16 x i1>
+ if (InVT == OutVT)
+ return Op.getOperand(1);
+ // Otherwise, zero the newly introduced lanes.
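+ // For example (illustrative): converting an nxv4i1 to an nxv16i1 reinterprets
+ // the predicate register and then ANDs it with a reinterpreted ptrue(nxv4i1),
+ // so every bit with no counterpart in the nxv4i1 input is forced to zero.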
+ SDValue Reinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Op.getOperand(1));
+ SDValue Mask = getPTrue(DAG, dl, InVT, AArch64SVEPredPattern::all);
+ SDValue MaskReinterpret =
+ DAG.getNode(AArch64ISD::REINTERPRET_CAST, dl, OutVT, Mask);
+ return DAG.getNode(ISD::AND, dl, OutVT, Reinterpret, MaskReinterpret);
+ }
case Intrinsic::aarch64_sve_insr: {
SDValue Scalar = Op.getOperand(2);
@@ -3004,6 +3282,29 @@ SDValue AArch64TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
"llvm.eh.recoverfp must take a function as the first argument");
return IncomingFPOp;
}
+
+ case Intrinsic::aarch64_neon_vsri:
+ case Intrinsic::aarch64_neon_vsli: {
+ EVT Ty = Op.getValueType();
+
+ if (!Ty.isVector())
+ report_fatal_error("Unexpected type for aarch64_neon_vsli");
+
+ assert(Op.getConstantOperandVal(3) <= Ty.getScalarSizeInBits());
+
+ bool IsShiftRight = IntNo == Intrinsic::aarch64_neon_vsri;
+ unsigned Opcode = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ return DAG.getNode(Opcode, dl, Ty, Op.getOperand(1), Op.getOperand(2),
+ Op.getOperand(3));
+ }
+
+ case Intrinsic::aarch64_neon_srhadd:
+ case Intrinsic::aarch64_neon_urhadd: {
+ bool IsSignedAdd = IntNo == Intrinsic::aarch64_neon_srhadd;
+ unsigned Opcode = IsSignedAdd ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ return DAG.getNode(Opcode, dl, Op.getValueType(), Op.getOperand(1),
+ Op.getOperand(2));
+ }
}
}
@@ -3058,10 +3359,13 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
EVT MemVT = StoreNode->getMemoryVT();
if (VT.isVector()) {
+ if (useSVEForFixedLengthVectorVT(VT))
+ return LowerFixedLengthVectorStoreToSVE(Op, DAG);
+
unsigned AS = StoreNode->getAddressSpace();
- unsigned Align = StoreNode->getAlignment();
- if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(MemVT, AS, Align,
+ Align Alignment = StoreNode->getAlign();
+ if (Alignment < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
StoreNode->getMemOperand()->getFlags(),
nullptr)) {
return scalarizeVectorStore(StoreNode, DAG);
@@ -3070,6 +3374,30 @@ SDValue AArch64TargetLowering::LowerSTORE(SDValue Op,
if (StoreNode->isTruncatingStore()) {
return LowerTruncateVectorStore(Dl, StoreNode, VT, MemVT, DAG);
}
+ // 256-bit non-temporal stores can be lowered to STNP. Do this as part of
+ // the custom lowering, as there are no unpaired non-temporal stores and
+ // legalization will break up 256-bit inputs.
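+ // For example (illustrative): a non-temporal store of a v4i64 value is split
+ // into v2i64 halves below and emitted as a single STNP of the pair.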
+ if (StoreNode->isNonTemporal() && MemVT.getSizeInBits() == 256u &&
+ MemVT.getVectorElementCount().Min % 2u == 0 &&
+ ((MemVT.getScalarSizeInBits() == 8u ||
+ MemVT.getScalarSizeInBits() == 16u ||
+ MemVT.getScalarSizeInBits() == 32u ||
+ MemVT.getScalarSizeInBits() == 64u))) {
+ SDValue Lo =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(), DAG.getConstant(0, Dl, MVT::i64));
+ SDValue Hi = DAG.getNode(
+ ISD::EXTRACT_SUBVECTOR, Dl,
+ MemVT.getHalfNumVectorElementsVT(*DAG.getContext()),
+ StoreNode->getValue(),
+ DAG.getConstant(MemVT.getVectorElementCount().Min / 2, Dl, MVT::i64));
+ SDValue Result = DAG.getMemIntrinsicNode(
+ AArch64ISD::STNP, Dl, DAG.getVTList(MVT::Other),
+ {StoreNode->getChain(), Lo, Hi, StoreNode->getBasePtr()},
+ StoreNode->getMemoryVT(), StoreNode->getMemOperand());
+ return Result;
+ }
} else if (MemVT == MVT::i128 && StoreNode->isVolatile()) {
assert(StoreNode->getValue()->getValueType(0) == MVT::i128);
SDValue Lo =
@@ -3104,6 +3432,8 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::GlobalTLSAddress:
return LowerGlobalTLSAddress(Op, DAG);
case ISD::SETCC:
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS:
return LowerSETCC(Op, DAG);
case ISD::BR_CC:
return LowerBR_CC(Op, DAG);
@@ -3138,14 +3468,19 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
case ISD::UMULO:
return LowerXALUO(Op, DAG);
case ISD::FADD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FADD_PRED);
return LowerF128Call(Op, DAG, RTLIB::ADD_F128);
case ISD::FSUB:
return LowerF128Call(Op, DAG, RTLIB::SUB_F128);
case ISD::FMUL:
return LowerF128Call(Op, DAG, RTLIB::MUL_F128);
+ case ISD::FMA:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::FMA_PRED);
case ISD::FDIV:
return LowerF128Call(Op, DAG, RTLIB::DIV_F128);
case ISD::FP_ROUND:
+ case ISD::STRICT_FP_ROUND:
return LowerFP_ROUND(Op, DAG);
case ISD::FP_EXTEND:
return LowerFP_EXTEND(Op, DAG);
@@ -3169,6 +3504,20 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerSPLAT_VECTOR(Op, DAG);
case ISD::EXTRACT_SUBVECTOR:
return LowerEXTRACT_SUBVECTOR(Op, DAG);
+ case ISD::INSERT_SUBVECTOR:
+ return LowerINSERT_SUBVECTOR(Op, DAG);
+ case ISD::SDIV:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SDIV_PRED);
+ case ISD::UDIV:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UDIV_PRED);
+ case ISD::SMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMIN_MERGE_OP1);
+ case ISD::UMIN:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMIN_MERGE_OP1);
+ case ISD::SMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SMAX_MERGE_OP1);
+ case ISD::UMAX:
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::UMAX_MERGE_OP1);
case ISD::SRA:
case ISD::SRL:
case ISD::SHL:
@@ -3190,9 +3539,13 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerPREFETCH(Op, DAG);
case ISD::SINT_TO_FP:
case ISD::UINT_TO_FP:
+ case ISD::STRICT_SINT_TO_FP:
+ case ISD::STRICT_UINT_TO_FP:
return LowerINT_TO_FP(Op, DAG);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
return LowerFP_TO_INT(Op, DAG);
case ISD::FSINCOS:
return LowerFSINCOS(Op, DAG);
@@ -3218,9 +3571,68 @@ SDValue AArch64TargetLowering::LowerOperation(SDValue Op,
return LowerATOMIC_LOAD_AND(Op, DAG);
case ISD::DYNAMIC_STACKALLOC:
return LowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::VSCALE:
+ return LowerVSCALE(Op, DAG);
+ case ISD::TRUNCATE:
+ return LowerTRUNCATE(Op, DAG);
+ case ISD::LOAD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerFixedLengthVectorLoadToSVE(Op, DAG);
+ llvm_unreachable("Unexpected request to lower ISD::LOAD");
+ case ISD::ADD:
+ if (useSVEForFixedLengthVectorVT(Op.getValueType()))
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::ADD_PRED);
+ llvm_unreachable("Unexpected request to lower ISD::ADD");
}
}
+bool AArch64TargetLowering::useSVEForFixedLengthVectors() const {
+ // Prefer NEON unless larger SVE registers are available.
+ return Subtarget->hasSVE() && Subtarget->getMinSVEVectorSizeInBits() >= 256;
+}
+
+bool AArch64TargetLowering::useSVEForFixedLengthVectorVT(EVT VT) const {
+ if (!useSVEForFixedLengthVectors())
+ return false;
+
+ if (!VT.isFixedLengthVector())
+ return false;
+
+ // Fixed length predicates should be promoted to i8.
+ // NOTE: This is consistent with how NEON (and thus 64/128-bit vectors) work.
+ if (VT.getVectorElementType() == MVT::i1)
+ return false;
+
+ // Don't use SVE for vectors we cannot scalarize if required.
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ return false;
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ case MVT::i64:
+ case MVT::f16:
+ case MVT::f32:
+ case MVT::f64:
+ break;
+ }
+
+ // Ensure NEON MVTs only belong to a single register class.
+ if (VT.getSizeInBits() <= 128)
+ return false;
+
+ // Don't use SVE for types that don't fit.
+ if (VT.getSizeInBits() > Subtarget->getMinSVEVectorSizeInBits())
+ return false;
+
+ // TODO: Perhaps an artificial restriction, but worth having whilst getting
+ // the base fixed length SVE support in place.
+ if (!VT.isPow2VectorType())
+ return false;
+
+ return true;
+}
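+// For example (illustrative): with a minimum SVE vector size of 256 bits,
+// useSVEForFixedLengthVectorVT returns true for v8i32 (256 bits), while v4i32
+// (128 bits) keeps its NEON lowering and a non-power-of-2 type such as v6i32
+// is rejected.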
+
//===----------------------------------------------------------------------===//
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
@@ -3231,9 +3643,6 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
switch (CC) {
default:
report_fatal_error("Unsupported calling convention.");
- case CallingConv::AArch64_SVE_VectorCall:
- // Calling SVE functions is currently not yet supported.
- report_fatal_error("Unsupported calling convention.");
case CallingConv::WebKit_JS:
return CC_AArch64_WebKit_JS;
case CallingConv::GHC:
@@ -3256,6 +3665,7 @@ CCAssignFn *AArch64TargetLowering::CCAssignFnForCall(CallingConv::ID CC,
case CallingConv::CFGuard_Check:
return CC_AArch64_Win64_CFGuard_Check;
case CallingConv::AArch64_VectorCall:
+ case CallingConv::AArch64_SVE_VectorCall:
return CC_AArch64_AAPCS;
}
}
@@ -3343,7 +3753,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
RC = &AArch64::GPR32RegClass;
else if (RegVT == MVT::i64)
RC = &AArch64::GPR64RegClass;
- else if (RegVT == MVT::f16)
+ else if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &AArch64::FPR16RegClass;
else if (RegVT == MVT::f32)
RC = &AArch64::FPR32RegClass;
@@ -3374,7 +3784,7 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ break;
case CCValAssign::BCvt:
ArgValue = DAG.getNode(ISD::BITCAST, DL, VA.getValVT(), ArgValue);
break;
@@ -3391,7 +3801,9 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
} else { // VA.isRegLoc()
assert(VA.isMemLoc() && "CCValAssign is neither reg nor mem");
unsigned ArgOffset = VA.getLocMemOffset();
- unsigned ArgSize = VA.getValVT().getSizeInBits() / 8;
+ unsigned ArgSize = (VA.getLocInfo() == CCValAssign::Indirect
+ ? VA.getLocVT().getSizeInBits()
+ : VA.getValVT().getSizeInBits()) / 8;
uint32_t BEAlign = 0;
if (!Subtarget->isLittleEndian() && ArgSize < 8 &&
@@ -3417,7 +3829,8 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ MemVT = VA.getLocVT();
+ break;
case CCValAssign::SExt:
ExtType = ISD::SEXTLOAD;
break;
@@ -3435,6 +3848,15 @@ SDValue AArch64TargetLowering::LowerFormalArguments(
MemVT);
}
+
+ if (VA.getLocInfo() == CCValAssign::Indirect) {
+ assert(VA.getValVT().isScalableVector() &&
+ "Only scalable vectors can be passed indirectly");
+ // If value is passed via pointer - do a load.
+ ArgValue =
+ DAG.getLoad(VA.getValVT(), DL, Chain, ArgValue, MachinePointerInfo());
+ }
+
if (Subtarget->isTargetILP32() && Ins[i].Flags.isPointer())
ArgValue = DAG.getNode(ISD::AssertZext, DL, ArgValue.getValueType(),
ArgValue, DAG.getValueType(MVT::i32));
@@ -3550,7 +3972,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
// The extra size here, if triggered, will always be 8.
MFI.CreateFixedObject(16 - (GPRSaveSize & 15), -(int)alignTo(GPRSaveSize, 16), false);
} else
- GPRIdx = MFI.CreateStackObject(GPRSaveSize, 8, false);
+ GPRIdx = MFI.CreateStackObject(GPRSaveSize, Align(8), false);
SDValue FIN = DAG.getFrameIndex(GPRIdx, PtrVT);
@@ -3582,7 +4004,7 @@ void AArch64TargetLowering::saveVarArgRegisters(CCState &CCInfo,
unsigned FPRSaveSize = 16 * (NumFPRArgRegs - FirstVariadicFPR);
int FPRIdx = 0;
if (FPRSaveSize != 0) {
- FPRIdx = MFI.CreateStackObject(FPRSaveSize, 16, false);
+ FPRIdx = MFI.CreateStackObject(FPRSaveSize, Align(16), false);
SDValue FIN = DAG.getFrameIndex(FPRIdx, PtrVT);
@@ -3703,6 +4125,13 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
CallingConv::ID CallerCC = CallerF.getCallingConv();
bool CCMatch = CallerCC == CalleeCC;
+ // When using the Windows calling convention on a non-Windows OS, we want
+ // to back up and restore X18 in such functions; we can't do a tail call
+ // from those functions.
+ if (CallerCC == CallingConv::Win64 && !Subtarget->isTargetWindows() &&
+ CalleeCC != CallingConv::Win64)
+ return false;
+
// Byval parameters hand the function a pointer directly into the stack area
// we want to reuse during a tail call. Working around this *is* possible (see
// X86) but less efficient and uglier in LowerCall.
@@ -3795,6 +4224,18 @@ bool AArch64TargetLowering::isEligibleForTailCallOptimization(
const AArch64FunctionInfo *FuncInfo = MF.getInfo<AArch64FunctionInfo>();
+ // If any of the arguments is passed indirectly, it must be SVE, so the
+ // 'getBytesInStackArgArea' is not sufficient to determine whether we need to
+ // allocate space on the stack. That is why we explicitly determine here that
+ // such a call cannot be a tail call.
+ if (llvm::any_of(ArgLocs, [](CCValAssign &A) {
+ assert((A.getLocInfo() != CCValAssign::Indirect ||
+ A.getValVT().isScalableVector()) &&
+ "Expected value to be scalable");
+ return A.getLocInfo() == CCValAssign::Indirect;
+ }))
+ return false;
+
// If the stack arguments for this call do not fit into our own save area then
// the call cannot be made tail.
if (CCInfo.getNextStackOffset() > FuncInfo->getBytesInStackArgArea())
@@ -3873,7 +4314,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Check if it's really possible to do a tail call.
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -3983,7 +4424,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SmallVector<SDValue, 8> MemOpChains;
auto PtrVT = getPointerTy(DAG.getDataLayout());
- if (IsVarArg && CLI.CS && CLI.CS.isMustTailCall()) {
+ if (IsVarArg && CLI.CB && CLI.CB->isMustTailCall()) {
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, DL, F.VReg, F.VT);
@@ -4035,7 +4476,20 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
case CCValAssign::Indirect:
assert(VA.getValVT().isScalableVector() &&
"Only scalable vectors can be passed indirectly");
- llvm_unreachable("Spilling of SVE vectors not yet implemented");
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ Type *Ty = EVT(VA.getValVT()).getTypeForEVT(*DAG.getContext());
+ Align Alignment = DAG.getDataLayout().getPrefTypeAlign(Ty);
+ int FI = MFI.CreateStackObject(
+ VA.getValVT().getStoreSize().getKnownMinSize(), Alignment, false);
+ MFI.setStackID(FI, TargetStackID::SVEVector);
+
+ SDValue SpillSlot = DAG.getFrameIndex(
+ FI, DAG.getTargetLoweringInfo().getFrameIndexTy(DAG.getDataLayout()));
+ Chain = DAG.getStore(
+ Chain, DL, Arg, SpillSlot,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ Arg = SpillSlot;
+ break;
}
if (VA.isRegLoc()) {
@@ -4071,7 +4525,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
RegsToPass.emplace_back(VA.getLocReg(), Arg);
RegsUsed.insert(VA.getLocReg());
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
}
} else {
@@ -4083,8 +4537,12 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// FIXME: This works on big-endian for composite byvals, which are the
// common case. It should also work for fundamental types too.
uint32_t BEAlign = 0;
- unsigned OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
- : VA.getValVT().getSizeInBits();
+ unsigned OpSize;
+ if (VA.getLocInfo() == CCValAssign::Indirect)
+ OpSize = VA.getLocVT().getSizeInBits();
+ else
+ OpSize = Flags.isByVal() ? Flags.getByValSize() * 8
+ : VA.getValVT().getSizeInBits();
OpSize = (OpSize + 7) / 8;
if (!Subtarget->isLittleEndian() && !Flags.isByVal() &&
!Flags.isInConsecutiveRegs()) {
@@ -4120,10 +4578,10 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i64);
SDValue Cpy = DAG.getMemcpy(
- Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
+ Chain, DL, DstAddr, Arg, SizeNode,
+ Outs[i].Flags.getNonZeroByValAlign(),
/*isVol = */ false, /*AlwaysInline = */ false,
- /*isTailCall = */ false,
- DstInfo, MachinePointerInfo());
+ /*isTailCall = */ false, DstInfo, MachinePointerInfo());
MemOpChains.push_back(Cpy);
} else {
@@ -4257,6 +4715,7 @@ AArch64TargetLowering::LowerCall(CallLoweringInfo &CLI,
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(AArch64ISD::CALL, DL, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -4422,7 +4881,7 @@ SDValue AArch64TargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SDValue AArch64TargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
- return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flag);
}
@@ -4913,7 +5372,7 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a branch
// instruction.
- if (isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
+ if (ISD::isOverflowIntrOpRes(LHS) && isOneConstant(RHS) &&
(CC == ISD::SETEQ || CC == ISD::SETNE)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(LHS->getValueType(0)))
@@ -4997,8 +5456,8 @@ SDValue AArch64TargetLowering::LowerBR_CC(SDValue Op, SelectionDAG &DAG) const {
Cmp);
}
- assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::f32 ||
- LHS.getValueType() == MVT::f64);
+ assert(LHS.getValueType() == MVT::f16 || LHS.getValueType() == MVT::bf16 ||
+ LHS.getValueType() == MVT::f32 || LHS.getValueType() == MVT::f64);
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't totally
// clean. Some of them require two branches to implement.
@@ -5124,6 +5583,15 @@ SDValue AArch64TargetLowering::LowerCTPOP(SDValue Op, SelectionDAG &DAG) const {
if (VT == MVT::i64)
UaddLV = DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i64, UaddLV);
return UaddLV;
+ } else if (VT == MVT::i128) {
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Val);
+
+ SDValue CtPop = DAG.getNode(ISD::CTPOP, DL, MVT::v16i8, Val);
+ SDValue UaddLV = DAG.getNode(
+ ISD::INTRINSIC_WO_CHAIN, DL, MVT::i32,
+ DAG.getConstant(Intrinsic::aarch64_neon_uaddlv, DL, MVT::i32), CtPop);
+
+ return DAG.getNode(ISD::ZERO_EXTEND, DL, MVT::i128, UaddLV);
}
assert((VT == MVT::v1i64 || VT == MVT::v2i64 || VT == MVT::v2i32 ||
@@ -5154,9 +5622,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType().isVector())
return LowerVSETCC(Op, DAG);
- SDValue LHS = Op.getOperand(0);
- SDValue RHS = Op.getOperand(1);
- ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
+ bool IsStrict = Op->isStrictFPOpcode();
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ unsigned OpNo = IsStrict ? 1 : 0;
+ SDValue Chain;
+ if (IsStrict)
+ Chain = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(OpNo + 0);
+ SDValue RHS = Op.getOperand(OpNo + 1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(OpNo + 2))->get();
SDLoc dl(Op);
// We chose ZeroOrOneBooleanContents, so use zero and one.
@@ -5167,13 +5641,14 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Handle f128 first, since one possible outcome is a normal integer
// comparison which gets picked up by the next if statement.
if (LHS.getValueType() == MVT::f128) {
- softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS);
+ softenSetCCOperands(DAG, MVT::f128, LHS, RHS, CC, dl, LHS, RHS, Chain,
+ IsSignaling);
// If softenSetCCOperands returned a scalar, use it.
if (!RHS.getNode()) {
assert(LHS.getValueType() == Op.getValueType() &&
"Unexpected setcc expansion!");
- return LHS;
+ return IsStrict ? DAG.getMergeValues({LHS, Chain}, dl) : LHS;
}
}
@@ -5185,7 +5660,8 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ SDValue Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CCVal, Cmp);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
// Now we know we're dealing with FP values.
@@ -5194,10 +5670,15 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// If that fails, we'll need to perform an FCMP + CSEL sequence. Go ahead
// and do the comparison.
- SDValue Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
+ SDValue Cmp;
+ if (IsStrict)
+ Cmp = emitStrictFPComparison(LHS, RHS, dl, DAG, Chain, IsSignaling);
+ else
+ Cmp = emitComparison(LHS, RHS, CC, dl, DAG);
AArch64CC::CondCode CC1, CC2;
changeFPCCToAArch64CC(CC, CC1, CC2);
+ SDValue Res;
if (CC2 == AArch64CC::AL) {
changeFPCCToAArch64CC(ISD::getSetCCInverse(CC, LHS.getValueType()), CC1,
CC2);
@@ -5206,7 +5687,7 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
// Note that we inverted the condition above, so we reverse the order of
// the true and false operands here. This will allow the setcc to be
// matched to a single CSINC instruction.
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
+ Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, FVal, TVal, CC1Val, Cmp);
} else {
// Unfortunately, the mapping of LLVM FP CC's onto AArch64 CC's isn't
// totally clean. Some of them require two CSELs to implement. As is in
@@ -5219,8 +5700,9 @@ SDValue AArch64TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, FVal, CC1Val, Cmp);
SDValue CC2Val = DAG.getConstant(CC2, dl, MVT::i32);
- return DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
+ Res = DAG.getNode(AArch64ISD::CSEL, dl, VT, TVal, CS1, CC2Val, Cmp);
}
+ return IsStrict ? DAG.getMergeValues({Res, Cmp.getValue(1)}, dl) : Res;
}
SDValue AArch64TargetLowering::LowerSELECT_CC(ISD::CondCode CC, SDValue LHS,
@@ -5429,9 +5911,17 @@ SDValue AArch64TargetLowering::LowerSELECT(SDValue Op,
SDValue FVal = Op->getOperand(2);
SDLoc DL(Op);
+ EVT Ty = Op.getValueType();
+ if (Ty.isScalableVector()) {
+ SDValue TruncCC = DAG.getNode(ISD::TRUNCATE, DL, MVT::i1, CCVal);
+ MVT PredVT = MVT::getVectorVT(MVT::i1, Ty.getVectorElementCount());
+ SDValue SplatPred = DAG.getNode(ISD::SPLAT_VECTOR, DL, PredVT, TruncCC);
+ return DAG.getNode(ISD::VSELECT, DL, Ty, SplatPred, TVal, FVal);
+ }
+
// Optimize {s|u}{add|sub|mul}.with.overflow feeding into a select
// instruction.
- if (isOverflowIntrOpRes(CCVal)) {
+ if (ISD::isOverflowIntrOpRes(CCVal)) {
// Only lower legal XALUO ops.
if (!DAG.getTargetLoweringInfo().isTypeLegal(CCVal->getValueType(0)))
return SDValue();
@@ -5642,9 +6132,9 @@ SDValue AArch64TargetLowering::LowerVACOPY(SDValue Op,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
return DAG.getMemcpy(Op.getOperand(0), DL, Op.getOperand(1), Op.getOperand(2),
- DAG.getConstant(VaListSize, DL, MVT::i32), PtrSize,
- false, false, false, MachinePointerInfo(DestSV),
- MachinePointerInfo(SrcSV));
+ DAG.getConstant(VaListSize, DL, MVT::i32),
+ Align(PtrSize), false, false, false,
+ MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
@@ -5656,7 +6146,7 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
SDLoc DL(Op);
SDValue Chain = Op.getOperand(0);
SDValue Addr = Op.getOperand(1);
- unsigned Align = Op.getConstantOperandVal(3);
+ MaybeAlign Align(Op.getConstantOperandVal(3));
unsigned MinSlotSize = Subtarget->isTargetILP32() ? 4 : 8;
auto PtrVT = getPointerTy(DAG.getDataLayout());
auto PtrMemVT = getPointerMemTy(DAG.getDataLayout());
@@ -5665,12 +6155,11 @@ SDValue AArch64TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
Chain = VAList.getValue(1);
VAList = DAG.getZExtOrTrunc(VAList, DL, PtrVT);
- if (Align > MinSlotSize) {
- assert(((Align & (Align - 1)) == 0) && "Expected Align to be a power of 2");
+ if (Align && *Align > MinSlotSize) {
VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
- DAG.getConstant(Align - 1, DL, PtrVT));
+ DAG.getConstant(Align->value() - 1, DL, PtrVT));
VAList = DAG.getNode(ISD::AND, DL, PtrVT, VAList,
- DAG.getConstant(-(int64_t)Align, DL, PtrVT));
+ DAG.getConstant(-(int64_t)Align->value(), DL, PtrVT));
}
Type *ArgTy = VT.getTypeForEVT(*DAG.getContext());
@@ -7001,7 +7490,8 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
return DAG.getNode(AArch64ISD::REV64, dl, VT, OpLHS);
// vrev <4 x i16> -> REV32
if (VT.getVectorElementType() == MVT::i16 ||
- VT.getVectorElementType() == MVT::f16)
+ VT.getVectorElementType() == MVT::f16 ||
+ VT.getVectorElementType() == MVT::bf16)
return DAG.getNode(AArch64ISD::REV32, dl, VT, OpLHS);
// vrev <4 x i8> -> REV16
assert(VT.getVectorElementType() == MVT::i8);
@@ -7014,7 +7504,7 @@ static SDValue GeneratePerfectShuffle(unsigned PFEntry, SDValue LHS,
unsigned Opcode;
if (EltTy == MVT::i8)
Opcode = AArch64ISD::DUPLANE8;
- else if (EltTy == MVT::i16 || EltTy == MVT::f16)
+ else if (EltTy == MVT::i16 || EltTy == MVT::f16 || EltTy == MVT::bf16)
Opcode = AArch64ISD::DUPLANE16;
else if (EltTy == MVT::i32 || EltTy == MVT::f32)
Opcode = AArch64ISD::DUPLANE32;
@@ -7121,7 +7611,7 @@ static SDValue GenerateTBL(SDValue Op, ArrayRef<int> ShuffleMask,
static unsigned getDUPLANEOp(EVT EltType) {
if (EltType == MVT::i8)
return AArch64ISD::DUPLANE8;
- if (EltType == MVT::i16 || EltType == MVT::f16)
+ if (EltType == MVT::i16 || EltType == MVT::f16 || EltType == MVT::bf16)
return AArch64ISD::DUPLANE16;
if (EltType == MVT::i32 || EltType == MVT::f32)
return AArch64ISD::DUPLANE32;
@@ -7330,18 +7820,16 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
// Extend input splat value where needed to fit into a GPR (32b or 64b only)
// FPRs don't have this restriction.
switch (ElemVT.getSimpleVT().SimpleTy) {
- case MVT::i8:
- case MVT::i16:
- case MVT::i32:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
- case MVT::i64:
- SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
- return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
case MVT::i1: {
+ // The only legal i1 vectors are SVE vectors, so we can use SVE-specific
+ // lowering code.
+ if (auto *ConstVal = dyn_cast<ConstantSDNode>(SplatVal)) {
+ if (ConstVal->isOne())
+ return getPTrue(DAG, dl, VT, AArch64SVEPredPattern::all);
+ // TODO: Add special case for constant false
+ }
// The general case of i1. There isn't any natural way to do this,
// so we use some trickery with whilelo.
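// (Sign-extending the i1 yields 0 or all-ones; whilelo(0, 0) then produces an
// all-false predicate, while whilelo(0, ~0) activates every lane, which is
// exactly the splat we need.)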
- // TODO: Add special cases for splat of constant true/false.
SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
SplatVal = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::i64, SplatVal,
DAG.getValueType(MVT::i1));
@@ -7350,15 +7838,76 @@ SDValue AArch64TargetLowering::LowerSPLAT_VECTOR(SDValue Op,
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, ID,
DAG.getConstant(0, dl, MVT::i64), SplatVal);
}
- // TODO: we can support float types, but haven't added patterns yet.
+ case MVT::i8:
+ case MVT::i16:
+ case MVT::i32:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i32);
+ break;
+ case MVT::i64:
+ SplatVal = DAG.getAnyExtOrTrunc(SplatVal, dl, MVT::i64);
+ break;
case MVT::f16:
+ case MVT::bf16:
case MVT::f32:
case MVT::f64:
+ // Fine as is
+ break;
default:
report_fatal_error("Unsupported SPLAT_VECTOR input operand type");
}
+
+ return DAG.getNode(AArch64ISD::DUP, dl, VT, SplatVal);
+}
+
+SDValue AArch64TargetLowering::LowerDUPQLane(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+
+ EVT VT = Op.getValueType();
+ if (!isTypeLegal(VT) || !VT.isScalableVector())
+ return SDValue();
+
+ // Current lowering only supports the SVE-ACLE types.
+ if (VT.getSizeInBits().getKnownMinSize() != AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ // The DUPQ operation is independent of element type, so normalise to i64s.
+ SDValue V = DAG.getNode(ISD::BITCAST, DL, MVT::nxv2i64, Op.getOperand(1));
+ SDValue Idx128 = Op.getOperand(2);
+
+ // DUPQ can be used when idx is in range.
+ auto *CIdx = dyn_cast<ConstantSDNode>(Idx128);
+ if (CIdx && (CIdx->getZExtValue() <= 3)) {
+ SDValue CI = DAG.getTargetConstant(CIdx->getZExtValue(), DL, MVT::i64);
+ SDNode *DUPQ =
+ DAG.getMachineNode(AArch64::DUP_ZZI_Q, DL, MVT::nxv2i64, V, CI);
+ return DAG.getNode(ISD::BITCAST, DL, VT, SDValue(DUPQ, 0));
+ }
+
+ // The ACLE says this must produce the same result as:
+ // svtbl(data, svadd_x(svptrue_b64(),
+ // svand_x(svptrue_b64(), svindex_u64(0, 1), 1),
+ // index * 2))
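+ // For example (illustrative): with index == 1 the mask built below is
+ // <2,3,2,3,...>, so the TBL replicates the second 128-bit quadword across
+ // the whole vector.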
+ SDValue One = DAG.getConstant(1, DL, MVT::i64);
+ SDValue SplatOne = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, One);
+
+ // create the vector 0,1,0,1,...
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ SDValue SV = DAG.getNode(AArch64ISD::INDEX_VECTOR,
+ DL, MVT::nxv2i64, Zero, One);
+ SV = DAG.getNode(ISD::AND, DL, MVT::nxv2i64, SV, SplatOne);
+
+ // create the vector idx64,idx64+1,idx64,idx64+1,...
+ SDValue Idx64 = DAG.getNode(ISD::ADD, DL, MVT::i64, Idx128, Idx128);
+ SDValue SplatIdx64 = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Idx64);
+ SDValue ShuffleMask = DAG.getNode(ISD::ADD, DL, MVT::nxv2i64, SV, SplatIdx64);
+
+ // create the vector Val[idx64],Val[idx64+1],Val[idx64],Val[idx64+1],...
+ SDValue TBL = DAG.getNode(AArch64ISD::TBL, DL, MVT::nxv2i64, V, ShuffleMask);
+ return DAG.getNode(ISD::BITCAST, DL, VT, TBL);
}
+
static bool resolveBuildVector(BuildVectorSDNode *BVN, APInt &CnstBits,
APInt &UndefBits) {
EVT VT = BVN->getValueType(0);
@@ -7609,8 +8158,10 @@ static unsigned getIntrinsicID(const SDNode *N) {
// Attempt to form a vector S[LR]I from (or (and X, BvecC1), (lsl Y, C2)),
// to (SLI X, Y, C2), where X and Y have matching vector types, BvecC1 is a
-// BUILD_VECTORs with constant element C1, C2 is a constant, and C1 == ~C2.
-// Also, logical shift right -> sri, with the same structure.
+// BUILD_VECTORs with constant element C1, C2 is a constant, and:
+// - for the SLI case: C1 == ~(Ones(ElemSizeInBits) << C2)
+// - for the SRI case: C1 == ~(Ones(ElemSizeInBits) >> C2)
+// The (or (lsl Y, C2), (and X, BvecC1)) case is also handled.
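+// For example (illustrative), with 8-bit elements and C2 == 3: the SLI form
+// requires C1 == 0x07 (~(0xff << 3)) and the SRI form requires C1 == 0xe0
+// (~(0xff >> 3)).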
static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
@@ -7619,49 +8170,70 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
- // Is the first op an AND?
- const SDValue And = N->getOperand(0);
- if (And.getOpcode() != ISD::AND)
+ SDValue And;
+ SDValue Shift;
+
+ SDValue FirstOp = N->getOperand(0);
+ unsigned FirstOpc = FirstOp.getOpcode();
+ SDValue SecondOp = N->getOperand(1);
+ unsigned SecondOpc = SecondOp.getOpcode();
+
+ // Is one of the operands an AND or a BICi? The AND may have been optimised to
+ // a BICi in order to use an immediate instead of a register.
+ // Is the other operand a shl or lshr? This will have been turned into:
+ // AArch64ISD::VSHL vector, #shift or AArch64ISD::VLSHR vector, #shift.
+ if ((FirstOpc == ISD::AND || FirstOpc == AArch64ISD::BICi) &&
+ (SecondOpc == AArch64ISD::VSHL || SecondOpc == AArch64ISD::VLSHR)) {
+ And = FirstOp;
+ Shift = SecondOp;
+
+ } else if ((SecondOpc == ISD::AND || SecondOpc == AArch64ISD::BICi) &&
+ (FirstOpc == AArch64ISD::VSHL || FirstOpc == AArch64ISD::VLSHR)) {
+ And = SecondOp;
+ Shift = FirstOp;
+ } else
return SDValue();
- // Is the second op an shl or lshr?
- SDValue Shift = N->getOperand(1);
- // This will have been turned into: AArch64ISD::VSHL vector, #shift
- // or AArch64ISD::VLSHR vector, #shift
- unsigned ShiftOpc = Shift.getOpcode();
- if ((ShiftOpc != AArch64ISD::VSHL && ShiftOpc != AArch64ISD::VLSHR))
- return SDValue();
- bool IsShiftRight = ShiftOpc == AArch64ISD::VLSHR;
+ bool IsAnd = And.getOpcode() == ISD::AND;
+ bool IsShiftRight = Shift.getOpcode() == AArch64ISD::VLSHR;
// Is the shift amount constant?
ConstantSDNode *C2node = dyn_cast<ConstantSDNode>(Shift.getOperand(1));
if (!C2node)
return SDValue();
- // Is the and mask vector all constant?
uint64_t C1;
- if (!isAllConstantBuildVector(And.getOperand(1), C1))
- return SDValue();
+ if (IsAnd) {
+ // Is the and mask vector all constant?
+ if (!isAllConstantBuildVector(And.getOperand(1), C1))
+ return SDValue();
+ } else {
+ // Reconstruct the corresponding AND immediate from the two BICi immediates.
+ ConstantSDNode *C1nodeImm = dyn_cast<ConstantSDNode>(And.getOperand(1));
+ ConstantSDNode *C1nodeShift = dyn_cast<ConstantSDNode>(And.getOperand(2));
+ assert(C1nodeImm && C1nodeShift);
+ C1 = ~(C1nodeImm->getZExtValue() << C1nodeShift->getZExtValue());
+ }
- // Is C1 == ~C2, taking into account how much one can shift elements of a
- // particular size?
+ // Is C1 == ~(Ones(ElemSizeInBits) << C2) or
+ // C1 == ~(Ones(ElemSizeInBits) >> C2), taking into account
+ // how much one can shift elements of a particular size?
uint64_t C2 = C2node->getZExtValue();
unsigned ElemSizeInBits = VT.getScalarSizeInBits();
if (C2 > ElemSizeInBits)
return SDValue();
- unsigned ElemMask = (1 << ElemSizeInBits) - 1;
- if ((C1 & ElemMask) != (~C2 & ElemMask))
+
+ APInt C1AsAPInt(ElemSizeInBits, C1);
+ APInt RequiredC1 = IsShiftRight ? APInt::getHighBitsSet(ElemSizeInBits, C2)
+ : APInt::getLowBitsSet(ElemSizeInBits, C2);
+ if (C1AsAPInt != RequiredC1)
return SDValue();
SDValue X = And.getOperand(0);
SDValue Y = Shift.getOperand(0);
- unsigned Intrin =
- IsShiftRight ? Intrinsic::aarch64_neon_vsri : Intrinsic::aarch64_neon_vsli;
- SDValue ResultSLI =
- DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- DAG.getConstant(Intrin, DL, MVT::i32), X, Y,
- Shift.getOperand(1));
+ unsigned Inst = IsShiftRight ? AArch64ISD::VSRI : AArch64ISD::VSLI;
+ SDValue ResultSLI = DAG.getNode(Inst, DL, VT, X, Y, Shift.getOperand(1));
LLVM_DEBUG(dbgs() << "aarch64-lower: transformed: \n");
LLVM_DEBUG(N->dump(&DAG));
@@ -7675,10 +8247,8 @@ static SDValue tryLowerToSLI(SDNode *N, SelectionDAG &DAG) {
SDValue AArch64TargetLowering::LowerVectorOR(SDValue Op,
SelectionDAG &DAG) const {
// Attempt to form a vector S[LR]I from (or (and X, C1), (lsl Y, C2))
- if (EnableAArch64SlrGeneration) {
- if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
- return Res;
- }
+ if (SDValue Res = tryLowerToSLI(Op.getNode(), DAG))
+ return Res;
EVT VT = Op.getValueType();
@@ -7966,8 +8536,8 @@ SDValue AArch64TargetLowering::LowerBUILD_VECTOR(SDValue Op,
if (VT.getVectorElementType().isFloatingPoint()) {
SmallVector<SDValue, 8> Ops;
EVT EltTy = VT.getVectorElementType();
- assert ((EltTy == MVT::f16 || EltTy == MVT::f32 || EltTy == MVT::f64) &&
- "Unsupported floating-point vector type");
+ assert ((EltTy == MVT::f16 || EltTy == MVT::bf16 || EltTy == MVT::f32 ||
+ EltTy == MVT::f64) && "Unsupported floating-point vector type");
LLVM_DEBUG(
dbgs() << "LowerBUILD_VECTOR: float constant splats, creating int "
"BITCASTS, and try again\n");
@@ -8086,11 +8656,12 @@ SDValue AArch64TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16)
+ VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+ VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform insertion by expanding the value
@@ -8120,11 +8691,12 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
// Insertion/extraction are legal for V128 types.
if (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
VT == MVT::v2i64 || VT == MVT::v4f32 || VT == MVT::v2f64 ||
- VT == MVT::v8f16)
+ VT == MVT::v8f16 || VT == MVT::v8bf16)
return Op;
if (VT != MVT::v8i8 && VT != MVT::v4i16 && VT != MVT::v2i32 &&
- VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16)
+ VT != MVT::v1i64 && VT != MVT::v2f32 && VT != MVT::v4f16 &&
+ VT != MVT::v4bf16)
return SDValue();
// For V64 types, we perform extraction by expanding the value
@@ -8144,32 +8716,57 @@ AArch64TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue AArch64TargetLowering::LowerEXTRACT_SUBVECTOR(SDValue Op,
SelectionDAG &DAG) const {
- EVT VT = Op.getOperand(0).getValueType();
- SDLoc dl(Op);
- // Just in case...
- if (!VT.isVector())
- return SDValue();
-
- ConstantSDNode *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1));
- if (!Cst)
- return SDValue();
- unsigned Val = Cst->getZExtValue();
+ assert(Op.getValueType().isFixedLengthVector() &&
+ "Only cases that extract a fixed length vector are supported!");
+ EVT InVT = Op.getOperand(0).getValueType();
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
unsigned Size = Op.getValueSizeInBits();
+ if (InVT.isScalableVector()) {
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Idx == 0 && isPackedVectorType(InVT, DAG))
+ return Op;
+
+ return SDValue();
+ }
+
// This will get lowered to an appropriate EXTRACT_SUBREG in ISel.
- if (Val == 0)
+ if (Idx == 0 && InVT.getSizeInBits() <= 128)
return Op;
// If this is extracting the upper 64-bits of a 128-bit vector, we match
// that directly.
- if (Size == 64 && Val * VT.getScalarSizeInBits() == 64)
+ if (Size == 64 && Idx * InVT.getScalarSizeInBits() == 64)
+ return Op;
+
+ return SDValue();
+}
+
+SDValue AArch64TargetLowering::LowerINSERT_SUBVECTOR(SDValue Op,
+ SelectionDAG &DAG) const {
+ assert(Op.getValueType().isScalableVector() &&
+ "Only expect to lower inserts into scalable vectors!");
+
+ EVT InVT = Op.getOperand(1).getValueType();
+ unsigned Idx = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+
+ // We don't have any patterns for scalable vectors yet.
+ if (InVT.isScalableVector() || !useSVEForFixedLengthVectorVT(InVT))
+ return SDValue();
+
+ // This will be matched by custom code during ISelDAGToDAG.
+ if (Idx == 0 && isPackedVectorType(InVT, DAG) && Op.getOperand(0).isUndef())
return Op;
return SDValue();
}
bool AArch64TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+ // Currently no fixed length shuffles that require SVE are legal.
+ if (useSVEForFixedLengthVectorVT(VT))
+ return false;
+
if (VT.getVectorNumElements() == 4 &&
(VT.is128BitVector() || VT.is64BitVector())) {
unsigned PFIndexes[4];
@@ -8249,6 +8846,81 @@ static bool isVShiftRImm(SDValue Op, EVT VT, bool isNarrow, int64_t &Cnt) {
return (Cnt >= 1 && Cnt <= (isNarrow ? ElementBits / 2 : ElementBits));
}
+// Attempt to form urhadd(OpA, OpB) from
+// truncate(vlshr(sub(zext(OpB), xor(zext(OpA), Ones(ElemSizeInBits))), 1)).
+// The original form of this expression is
+// truncate(srl(add(zext(OpB), add(zext(OpA), 1)), 1)) and before this function
+// is called the srl will have been lowered to AArch64ISD::VLSHR and the
+// ((OpA + OpB + 1) >> 1) expression will have been changed to (OpB - (~OpA)).
+// This function can also recognize a variant of this pattern that uses sign
+// extension instead of zero extension and form an srhadd(OpA, OpB) from it.
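+// For example (illustrative), for v8i8 inputs zero-extended to v8i16: with
+// OpA == 250 and OpB == 101, zext(OpB) - ~zext(OpA) == 250 + 101 + 1 == 352,
+// and (352 >> 1) truncated back to i8 is 176, i.e. urhadd(250, 101).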
+SDValue AArch64TargetLowering::LowerTRUNCATE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+
+ if (!VT.isVector() || VT.isScalableVector())
+ return Op;
+
+ if (useSVEForFixedLengthVectorVT(Op.getOperand(0).getValueType()))
+ return LowerFixedLengthVectorTruncateToSVE(Op, DAG);
+
+ // Since we are looking for a right shift by a constant value of 1 and we are
+ // operating on types at least 16 bits in length (sign/zero extended OpA and
+ // OpB, which are at least 8 bits), it follows that the truncate will always
+ // discard the shifted-in bit and therefore the right shift will be logical
+ // regardless of the signedness of OpA and OpB.
+ SDValue Shift = Op.getOperand(0);
+ if (Shift.getOpcode() != AArch64ISD::VLSHR)
+ return Op;
+
+ // Is the right shift using an immediate value of 1?
+ uint64_t ShiftAmount = Shift.getConstantOperandVal(1);
+ if (ShiftAmount != 1)
+ return Op;
+
+ SDValue Sub = Shift->getOperand(0);
+ if (Sub.getOpcode() != ISD::SUB)
+ return Op;
+
+ SDValue Xor = Sub.getOperand(1);
+ if (Xor.getOpcode() != ISD::XOR)
+ return Op;
+
+ SDValue ExtendOpA = Xor.getOperand(0);
+ SDValue ExtendOpB = Sub.getOperand(0);
+ unsigned ExtendOpAOpc = ExtendOpA.getOpcode();
+ unsigned ExtendOpBOpc = ExtendOpB.getOpcode();
+ if (!(ExtendOpAOpc == ExtendOpBOpc &&
+ (ExtendOpAOpc == ISD::ZERO_EXTEND || ExtendOpAOpc == ISD::SIGN_EXTEND)))
+ return Op;
+
+ // Is the result of the right shift being truncated to the same value type as
+ // the original operands, OpA and OpB?
+ SDValue OpA = ExtendOpA.getOperand(0);
+ SDValue OpB = ExtendOpB.getOperand(0);
+ EVT OpAVT = OpA.getValueType();
+ assert(ExtendOpA.getValueType() == ExtendOpB.getValueType());
+ if (!(VT == OpAVT && OpAVT == OpB.getValueType()))
+ return Op;
+
+ // Is the right-hand side of the XOR an all-ones constant?
+ uint64_t C;
+ if (!isAllConstantBuildVector(Xor.getOperand(1), C))
+ return Op;
+
+ unsigned ElemSizeInBits = VT.getScalarSizeInBits();
+ APInt CAsAPInt(ElemSizeInBits, C);
+ if (CAsAPInt != APInt::getAllOnesValue(ElemSizeInBits))
+ return Op;
+
+ SDLoc DL(Op);
+ bool IsSignExtend = ExtendOpAOpc == ISD::SIGN_EXTEND;
+ unsigned RHADDOpc = IsSignExtend ? AArch64ISD::SRHADD : AArch64ISD::URHADD;
+ SDValue ResultURHADD = DAG.getNode(RHADDOpc, DL, VT, OpA, OpB);
+
+ return ResultURHADD;
+}
+
SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
SelectionDAG &DAG) const {
EVT VT = Op.getValueType();
@@ -8264,6 +8936,9 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
llvm_unreachable("unexpected shift opcode");
case ISD::SHL:
+ if (VT.isScalableVector())
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SHL_MERGE_OP1);
+
if (isVShiftLImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize)
return DAG.getNode(AArch64ISD::VSHL, DL, VT, Op.getOperand(0),
DAG.getConstant(Cnt, DL, MVT::i32));
@@ -8273,6 +8948,12 @@ SDValue AArch64TargetLowering::LowerVectorSRA_SRL_SHL(SDValue Op,
Op.getOperand(0), Op.getOperand(1));
case ISD::SRA:
case ISD::SRL:
+ if (VT.isScalableVector()) {
+ unsigned Opc = Op.getOpcode() == ISD::SRA ? AArch64ISD::SRA_MERGE_OP1
+ : AArch64ISD::SRL_MERGE_OP1;
+ return LowerToPredicatedOp(Op, DAG, Opc);
+ }
+
// Right shift immediate
if (isVShiftRImm(Op.getOperand(1), VT, false, Cnt) && Cnt < EltSize) {
unsigned Opc =
@@ -8395,6 +9076,12 @@ static SDValue EmitVectorComparison(SDValue LHS, SDValue RHS,
SDValue AArch64TargetLowering::LowerVSETCC(SDValue Op,
SelectionDAG &DAG) const {
+ if (Op.getValueType().isScalableVector()) {
+ if (Op.getOperand(0).getValueType().isFloatingPoint())
+ return Op;
+ return LowerToPredicatedOp(Op, DAG, AArch64ISD::SETCC_MERGE_ZERO);
+ }
+
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(2))->get();
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -8570,7 +9257,8 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SDNode *Node = Op.getNode();
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
EVT VT = Node->getValueType(0);
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
@@ -8580,7 +9268,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
SDValue Ops[2] = {SP, Chain};
return DAG.getMergeValues(Ops, dl);
@@ -8595,7 +9283,7 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SP = DAG.getNode(ISD::SUB, dl, MVT::i64, SP, Size);
if (Align)
SP = DAG.getNode(ISD::AND, dl, VT, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, dl, VT));
+ DAG.getConstant(-(uint64_t)Align->value(), dl, VT));
Chain = DAG.getCopyToReg(Chain, dl, AArch64::SP, SP);
Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, dl, true),
@@ -8605,6 +9293,41 @@ AArch64TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
return DAG.getMergeValues(Ops, dl);
}
+SDValue AArch64TargetLowering::LowerVSCALE(SDValue Op,
+ SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT != MVT::i64 && "Expected illegal VSCALE node");
+
+ SDLoc DL(Op);
+ APInt MulImm = cast<ConstantSDNode>(Op.getOperand(0))->getAPIntValue();
+ return DAG.getZExtOrTrunc(DAG.getVScale(DL, MVT::i64, MulImm.sextOrSelf(64)),
+ DL, VT);
+}
+
+/// Set the IntrinsicInfo for the `aarch64_sve_st<N>` intrinsics.
+template <unsigned NumVecs>
+static bool setInfoSVEStN(AArch64TargetLowering::IntrinsicInfo &Info,
+ const CallInst &CI) {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Retrieve EC from first vector argument.
+ const EVT VT = EVT::getEVT(CI.getArgOperand(0)->getType());
+ ElementCount EC = VT.getVectorElementCount();
+#ifndef NDEBUG
+ // Check the assumption that all input vectors are the same type.
+ for (unsigned I = 0; I < NumVecs; ++I)
+ assert(VT == EVT::getEVT(CI.getArgOperand(I)->getType()) &&
+ "Invalid type.");
+#endif
+ // memVT is `NumVecs * VT`.
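+ // For example (illustrative), an aarch64_sve_st2 of two nxv4i32 values is
+ // described as a single nxv8i32 store.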
+ Info.memVT = EVT::getVectorVT(CI.getType()->getContext(), VT.getScalarType(),
+ EC * NumVecs);
+ Info.ptrVal = CI.getArgOperand(CI.getNumArgOperands() - 1);
+ Info.offset = 0;
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+}
+
/// getTgtMemIntrinsic - Represent NEON load and store intrinsics as
/// MemIntrinsicNodes. The associated MachineMemOperands record the alignment
/// specified in the intrinsic calls.
@@ -8614,6 +9337,12 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned Intrinsic) const {
auto &DL = I.getModule()->getDataLayout();
switch (Intrinsic) {
+ case Intrinsic::aarch64_sve_st2:
+ return setInfoSVEStN<2>(Info, I);
+ case Intrinsic::aarch64_sve_st3:
+ return setInfoSVEStN<3>(Info, I);
+ case Intrinsic::aarch64_sve_st4:
+ return setInfoSVEStN<4>(Info, I);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -8670,7 +9399,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -8681,7 +9410,7 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -8706,21 +9435,25 @@ bool AArch64TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
case Intrinsic::aarch64_sve_ldnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(1)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(I.getType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
- Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MONonTemporal;
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.flags = MachineMemOperand::MOLoad;
+ if (Intrinsic == Intrinsic::aarch64_sve_ldnt1)
+ Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
case Intrinsic::aarch64_sve_stnt1: {
PointerType *PtrTy = cast<PointerType>(I.getArgOperand(2)->getType());
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(PtrTy->getElementType());
+ Info.memVT = MVT::getVT(I.getOperand(0)->getType());
Info.ptrVal = I.getArgOperand(2);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
- Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MONonTemporal;
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
+ Info.flags = MachineMemOperand::MOStore;
+ if (Intrinsic == Intrinsic::aarch64_sve_stnt1)
+ Info.flags |= MachineMemOperand::MONonTemporal;
return true;
}
default:
@@ -8895,21 +9628,22 @@ bool AArch64TargetLowering::isExtFreeImpl(const Instruction *Ext) const {
/// or upper half of the vector elements.
static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
auto areTypesHalfed = [](Value *FullV, Value *HalfV) {
- auto *FullVT = cast<VectorType>(FullV->getType());
- auto *HalfVT = cast<VectorType>(HalfV->getType());
- return FullVT->getBitWidth() == 2 * HalfVT->getBitWidth();
+ auto *FullTy = FullV->getType();
+ auto *HalfTy = HalfV->getType();
+ return FullTy->getPrimitiveSizeInBits().getFixedSize() ==
+ 2 * HalfTy->getPrimitiveSizeInBits().getFixedSize();
};
auto extractHalf = [](Value *FullV, Value *HalfV) {
- auto *FullVT = cast<VectorType>(FullV->getType());
- auto *HalfVT = cast<VectorType>(HalfV->getType());
+ auto *FullVT = cast<FixedVectorType>(FullV->getType());
+ auto *HalfVT = cast<FixedVectorType>(HalfV->getType());
return FullVT->getNumElements() == 2 * HalfVT->getNumElements();
};
- Constant *M1, *M2;
+ ArrayRef<int> M1, M2;
Value *S1Op1, *S2Op1;
- if (!match(Op1, m_ShuffleVector(m_Value(S1Op1), m_Undef(), m_Constant(M1))) ||
- !match(Op2, m_ShuffleVector(m_Value(S2Op1), m_Undef(), m_Constant(M2))))
+ if (!match(Op1, m_Shuffle(m_Value(S1Op1), m_Undef(), m_Mask(M1))) ||
+ !match(Op2, m_Shuffle(m_Value(S2Op1), m_Undef(), m_Mask(M2))))
return false;
// Check that the operands are half as wide as the result and we extract
@@ -8922,7 +9656,7 @@ static bool areExtractShuffleVectors(Value *Op1, Value *Op2) {
// elements.
int M1Start = -1;
int M2Start = -1;
- int NumElements = cast<VectorType>(Op1->getType())->getNumElements() * 2;
+ int NumElements = cast<FixedVectorType>(Op1->getType())->getNumElements() * 2;
if (!ShuffleVectorInst::isExtractSubvectorMask(M1, NumElements, M1Start) ||
!ShuffleVectorInst::isExtractSubvectorMask(M2, NumElements, M2Start) ||
M1Start != M2Start || (M1Start != 0 && M2Start != (NumElements / 2)))
@@ -8948,6 +9682,22 @@ static bool areExtractExts(Value *Ext1, Value *Ext2) {
return true;
}
+/// Check if Op could be used with vmull_high_p64 intrinsic.
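+/// (Illustratively, Op must be an extractelement of lane 1 from a <2 x i64>
+/// vector, i.e. the high half that a pmull2 can consume directly.)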
+static bool isOperandOfVmullHighP64(Value *Op) {
+ Value *VectorOperand = nullptr;
+ ConstantInt *ElementIndex = nullptr;
+ return match(Op, m_ExtractElt(m_Value(VectorOperand),
+ m_ConstantInt(ElementIndex))) &&
+ ElementIndex->getValue() == 1 &&
+ isa<FixedVectorType>(VectorOperand->getType()) &&
+ cast<FixedVectorType>(VectorOperand->getType())->getNumElements() == 2;
+}
+
+/// Check if Op1 and Op2 could be used with vmull_high_p64 intrinsic.
+static bool areOperandsOfVmullHighP64(Value *Op1, Value *Op2) {
+ return isOperandOfVmullHighP64(Op1) && isOperandOfVmullHighP64(Op2);
+}
+
/// Check if sinking \p I's operands to I's basic block is profitable, because
/// the operands can be folded into a target instruction, e.g.
/// shufflevectors extracts and/or sext/zext can be folded into (u,s)subl(2).
@@ -8964,6 +9714,15 @@ bool AArch64TargetLowering::shouldSinkOperands(
Ops.push_back(&II->getOperandUse(0));
Ops.push_back(&II->getOperandUse(1));
return true;
+
+ case Intrinsic::aarch64_neon_pmull64:
+ if (!areOperandsOfVmullHighP64(II->getArgOperand(0),
+ II->getArgOperand(1)))
+ return false;
+ Ops.push_back(&II->getArgOperandUse(0));
+ Ops.push_back(&II->getArgOperandUse(1));
+ return true;
+
default:
return false;
}
@@ -8996,12 +9755,12 @@ bool AArch64TargetLowering::shouldSinkOperands(
}
bool AArch64TargetLowering::hasPairedLoad(EVT LoadedType,
- unsigned &RequiredAligment) const {
+ Align &RequiredAligment) const {
if (!LoadedType.isSimple() ||
(!LoadedType.isInteger() && !LoadedType.isFloatingPoint()))
return false;
// Cyclone supports unaligned accesses.
- RequiredAligment = 0;
+ RequiredAligment = Align(1);
unsigned NumBits = LoadedType.getSizeInBits();
return NumBits == 32 || NumBits == 64;
}
@@ -9015,7 +9774,7 @@ AArch64TargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
MachineMemOperand::Flags
-AArch64TargetLowering::getMMOFlags(const Instruction &I) const {
+AArch64TargetLowering::getTargetMMOFlags(const Instruction &I) const {
if (Subtarget->getProcFamily() == AArch64Subtarget::Falkor &&
I.getMetadata(FALKOR_STRIDED_ACCESS_MD) != nullptr)
return MOStridedAccess;
@@ -9029,7 +9788,7 @@ bool AArch64TargetLowering::isLegalInterleavedAccessType(
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
// Ensure the number of vector elements is greater than 1.
- if (VecTy->getNumElements() < 2)
+ if (cast<FixedVectorType>(VecTy)->getNumElements() < 2)
return false;
// Ensure the element type is legal.
@@ -9063,22 +9822,24 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
const DataLayout &DL = LI->getModule()->getDataLayout();
- VectorType *VecTy = Shuffles[0]->getType();
+ VectorType *VTy = Shuffles[0]->getType();
// Skip if we do not have NEON and skip illegal vector types. We can
// "legalize" wide vector types into multiple interleaved accesses as long as
// the vector types are divisible by 128.
- if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VecTy, DL))
+ if (!Subtarget->hasNEON() || !isLegalInterleavedAccessType(VTy, DL))
return false;
- unsigned NumLoads = getNumInterleavedAccesses(VecTy, DL);
+ unsigned NumLoads = getNumInterleavedAccesses(VTy, DL);
+
+ auto *FVTy = cast<FixedVectorType>(VTy);
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
- Type *EltTy = VecTy->getVectorElementType();
+ Type *EltTy = FVTy->getElementType();
if (EltTy->isPointerTy())
- VecTy =
- VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ FVTy =
+ FixedVectorType::get(DL.getIntPtrType(EltTy), FVTy->getNumElements());
IRBuilder<> Builder(LI);
@@ -9088,19 +9849,19 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
- VecTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / NumLoads);
+ FVTy = FixedVectorType::get(FVTy->getElementType(),
+ FVTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace()));
+ BaseAddr,
+ FVTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
- Type *PtrTy = VecTy->getPointerTo(LI->getPointerAddressSpace());
- Type *Tys[2] = {VecTy, PtrTy};
+ Type *PtrTy = FVTy->getPointerTo(LI->getPointerAddressSpace());
+ Type *Tys[2] = {FVTy, PtrTy};
static const Intrinsic::ID LoadInts[3] = {Intrinsic::aarch64_neon_ld2,
Intrinsic::aarch64_neon_ld3,
Intrinsic::aarch64_neon_ld4};
@@ -9117,9 +9878,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr =
- Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
- VecTy->getVectorNumElements() * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(FVTy->getElementType(), BaseAddr,
+ FVTy->getNumElements() * Factor);
CallInst *LdN = Builder.CreateCall(
LdNFunc, Builder.CreateBitCast(BaseAddr, PtrTy), "ldN");
@@ -9134,8 +9894,8 @@ bool AArch64TargetLowering::lowerInterleavedLoad(
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
- SubVec, VectorType::get(SVI->getType()->getVectorElementType(),
- VecTy->getVectorNumElements()));
+ SubVec, FixedVectorType::get(SVI->getType()->getElementType(),
+ FVTy->getNumElements()));
SubVecs[SVI].push_back(SubVec);
}
}
@@ -9186,13 +9946,12 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- VectorType *VecTy = SVI->getType();
- assert(VecTy->getVectorNumElements() % Factor == 0 &&
- "Invalid interleaved store");
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
- unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
- Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
@@ -9212,14 +9971,15 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// vectors to integer vectors.
if (EltTy->isPointerTy()) {
Type *IntTy = DL.getIntPtrType(EltTy);
- unsigned NumOpElts = Op0->getType()->getVectorNumElements();
+ unsigned NumOpElts =
+ cast<FixedVectorType>(Op0->getType())->getNumElements();
// Convert to the corresponding integer vector.
- Type *IntVecTy = VectorType::get(IntTy, NumOpElts);
+ auto *IntVecTy = FixedVectorType::get(IntTy, NumOpElts);
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, LaneLen);
+ SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
// The base address of the store.
@@ -9229,14 +9989,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
- SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
- SI->getPointerAddressSpace()));
+ BaseAddr,
+ SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
}
auto Mask = SVI->getShuffleMask();
@@ -9258,7 +10018,7 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
@@ -9274,14 +10034,14 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
Ops.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
}
}
// If we're generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
BaseAddr, LaneLen * Factor);
Ops.push_back(Builder.CreateBitCast(BaseAddr, PtrTy));
@@ -9290,16 +10050,59 @@ bool AArch64TargetLowering::lowerInterleavedStore(StoreInst *SI,
return true;
}
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
- unsigned AlignCheck) {
- return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
- (DstAlign == 0 || DstAlign % AlignCheck == 0));
+// Lower an SVE structured load intrinsic returning a tuple type to a
+// target-specific intrinsic taking the same input but returning a multi-result
+// value of the split tuple type.
+//
+// E.g. Lowering an LD3:
+//
+// call <vscale x 12 x i32> @llvm.aarch64.sve.ld3.nxv12i32(
+// <vscale x 4 x i1> %pred,
+// <vscale x 4 x i32>* %addr)
+//
+// Output DAG:
+//
+// t0: ch = EntryToken
+// t2: nxv4i1,ch = CopyFromReg t0, Register:nxv4i1 %0
+// t4: i64,ch = CopyFromReg t0, Register:i64 %1
+// t5: nxv4i32,nxv4i32,nxv4i32,ch = AArch64ISD::SVE_LD3 t0, t2, t4
+// t6: nxv12i32 = concat_vectors t5, t5:1, t5:2
+//
+// This is called pre-legalization to avoid widening/splitting issues with
+// non-power-of-2 tuple types used for LD3, such as nxv12i32.
+SDValue AArch64TargetLowering::LowerSVEStructLoad(unsigned Intrinsic,
+ ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG,
+ const SDLoc &DL) const {
+ assert(VT.isScalableVector() && "Can only lower scalable vectors");
+
+ unsigned N, Opcode;
+ static std::map<unsigned, std::pair<unsigned, unsigned>> IntrinsicMap = {
+ {Intrinsic::aarch64_sve_ld2, {2, AArch64ISD::SVE_LD2_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld3, {3, AArch64ISD::SVE_LD3_MERGE_ZERO}},
+ {Intrinsic::aarch64_sve_ld4, {4, AArch64ISD::SVE_LD4_MERGE_ZERO}}};
+
+ std::tie(N, Opcode) = IntrinsicMap[Intrinsic];
+ assert(VT.getVectorElementCount().Min % N == 0 &&
+ "invalid tuple vector type!");
+
+ EVT SplitVT = EVT::getVectorVT(*DAG.getContext(), VT.getVectorElementType(),
+ VT.getVectorElementCount() / N);
+ assert(isTypeLegal(SplitVT));
+
+ SmallVector<EVT, 5> VTs(N, SplitVT);
+ VTs.push_back(MVT::Other); // Chain
+ SDVTList NodeTys = DAG.getVTList(VTs);
+
+ SDValue PseudoLoad = DAG.getNode(Opcode, DL, NodeTys, LoadOps);
+ SmallVector<SDValue, 4> PseudoLoadOps;
+ for (unsigned I = 0; I < N; ++I)
+ PseudoLoadOps.push_back(SDValue(PseudoLoad.getNode(), I));
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, PseudoLoadOps);
}
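// Illustrative sketch (not part of the patch): the tuple-splitting arithmetic
// used by LowerSVEStructLoad, restated as a standalone helper. The check
// mirrors the assert above; the helper name is hypothetical.
#include <cassert>
// For an LD3 of nxv12i32 (TupleMinElts = 12, N = 3) each SVE_LD3 result has
// 12 / 3 = 4 elements per vscale chunk, i.e. nxv4i32.
static unsigned splitTupleMinElementCount(unsigned TupleMinElts, unsigned N) {
  assert(TupleMinElts % N == 0 && "invalid tuple vector type!");
  return TupleMinElts / N;
}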
EVT AArch64TargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
@@ -9307,9 +10110,9 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = IsMemset && Size < 32;
- auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
- if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
@@ -9317,22 +10120,20 @@ EVT AArch64TargetLowering::getOptimalMemOpType(
Fast;
};
- if (CanUseNEON && IsMemset && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, 16))
+ if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return MVT::v2i64;
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return MVT::f128;
- if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return MVT::i64;
- if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return MVT::i32;
return MVT::Other;
}
LLT AArch64TargetLowering::getOptimalMemOpLLT(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
bool CanImplicitFloat =
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat);
bool CanUseNEON = Subtarget->hasNEON() && CanImplicitFloat;
@@ -9340,9 +10141,9 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
// Only use AdvSIMD to implement memset of 32-byte and above. It would have
// taken one instruction to materialize the v2i64 zero and one store (with
// restrictive addressing mode). Just do i64 stores.
- bool IsSmallMemset = IsMemset && Size < 32;
- auto AlignmentIsAcceptable = [&](EVT VT, unsigned AlignCheck) {
- if (memOpAlign(SrcAlign, DstAlign, AlignCheck))
+ bool IsSmallMemset = Op.isMemset() && Op.size() < 32;
+ auto AlignmentIsAcceptable = [&](EVT VT, Align AlignCheck) {
+ if (Op.isAligned(AlignCheck))
return true;
bool Fast;
return allowsMisalignedMemoryAccesses(VT, 0, 1, MachineMemOperand::MONone,
@@ -9350,14 +10151,14 @@ LLT AArch64TargetLowering::getOptimalMemOpLLT(
Fast;
};
- if (CanUseNEON && IsMemset && !IsSmallMemset &&
- AlignmentIsAcceptable(MVT::v2i64, 16))
+ if (CanUseNEON && Op.isMemset() && !IsSmallMemset &&
+ AlignmentIsAcceptable(MVT::v2i64, Align(16)))
return LLT::vector(2, 64);
- if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, 16))
+ if (CanUseFP && !IsSmallMemset && AlignmentIsAcceptable(MVT::f128, Align(16)))
return LLT::scalar(128);
- if (Size >= 8 && AlignmentIsAcceptable(MVT::i64, 8))
+ if (Op.size() >= 8 && AlignmentIsAcceptable(MVT::i64, Align(8)))
return LLT::scalar(64);
- if (Size >= 4 && AlignmentIsAcceptable(MVT::i32, 4))
+ if (Op.size() >= 4 && AlignmentIsAcceptable(MVT::i32, Align(4)))
return LLT::scalar(32);
return LLT();
}
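// Illustrative sketch (not part of the patch): the type-selection ladder shared
// by getOptimalMemOpType and getOptimalMemOpLLT, simplified to plain integers.
// It assumes alignment alone decides acceptability; the real hooks also accept
// a misaligned access when allowsMisalignedMemoryAccesses reports it as fast.
#include <cstdint>
enum class MemOpWidth { V2i64, F128, I64, I32, Other };
static MemOpWidth pickMemOpWidth(uint64_t Size, uint64_t Alignment,
                                 bool IsMemset, bool HasNEON, bool HasFP) {
  bool IsSmallMemset = IsMemset && Size < 32;
  if (HasNEON && IsMemset && !IsSmallMemset && Alignment % 16 == 0)
    return MemOpWidth::V2i64; // one 128-bit vector store per 16 bytes
  if (HasFP && !IsSmallMemset && Alignment % 16 == 0)
    return MemOpWidth::F128;
  if (Size >= 8 && Alignment % 8 == 0)
    return MemOpWidth::I64;
  if (Size >= 4 && Alignment % 4 == 0)
    return MemOpWidth::I32;
  return MemOpWidth::Other;
}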
@@ -9404,6 +10205,10 @@ bool AArch64TargetLowering::isLegalAddressingMode(const DataLayout &DL,
if (AM.HasBaseReg && AM.BaseOffs && AM.Scale)
return false;
+ // FIXME: Update this method to support scalable addressing modes.
+ if (isa<ScalableVectorType>(Ty))
+ return AM.HasBaseReg && !AM.BaseOffs && !AM.Scale;
+
// check reg + imm case:
// i.e., reg + 0, reg + imm9, reg + SIZE_IN_BYTES * uimm12
uint64_t NumBytes = 0;
@@ -10110,7 +10915,7 @@ static SDValue tryCombineToBSL(SDNode *N,
}
if (FoundMatch)
- return DAG.getNode(AArch64ISD::BSL, DL, VT, SDValue(BVN0, 0),
+ return DAG.getNode(AArch64ISD::BSP, DL, VT, SDValue(BVN0, 0),
N0->getOperand(1 - i), N1->getOperand(1 - j));
}
@@ -10167,29 +10972,81 @@ static SDValue performSVEAndCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SelectionDAG &DAG = DCI.DAG;
SDValue Src = N->getOperand(0);
+ unsigned Opc = Src->getOpcode();
+
+ // Zero/any extend of an unsigned unpack
+ if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
+ SDValue UnpkOp = Src->getOperand(0);
+ SDValue Dup = N->getOperand(1);
+
+ if (Dup.getOpcode() != AArch64ISD::DUP)
+ return SDValue();
+
+ SDLoc DL(N);
+ ConstantSDNode *C = dyn_cast<ConstantSDNode>(Dup->getOperand(0));
+ uint64_t ExtVal = C->getZExtValue();
+
+ // If the mask is fully covered by the unpack, we don't need to push
+ // a new AND onto the operand
+ EVT EltTy = UnpkOp->getValueType(0).getVectorElementType();
+ if ((ExtVal == 0xFF && EltTy == MVT::i8) ||
+ (ExtVal == 0xFFFF && EltTy == MVT::i16) ||
+ (ExtVal == 0xFFFFFFFF && EltTy == MVT::i32))
+ return Src;
+
+ // Truncate to prevent a DUP with an overly wide constant
+ APInt Mask = C->getAPIntValue().trunc(EltTy.getSizeInBits());
+
+ // Otherwise, make sure we propagate the AND to the operand
+ // of the unpack
+ Dup = DAG.getNode(AArch64ISD::DUP, DL,
+ UnpkOp->getValueType(0),
+ DAG.getConstant(Mask.zextOrTrunc(32), DL, MVT::i32));
+
+ SDValue And = DAG.getNode(ISD::AND, DL,
+ UnpkOp->getValueType(0), UnpkOp, Dup);
+
+ return DAG.getNode(Opc, DL, N->getValueType(0), And);
+ }
+
SDValue Mask = N->getOperand(1);
if (!Src.hasOneUse())
return SDValue();
- // GLD1* instructions perform an implicit zero-extend, which makes them
+ EVT MemVT;
+
+ // SVE load instructions perform an implicit zero-extend, which makes them
// perfect candidates for combining.
- switch (Src->getOpcode()) {
- case AArch64ISD::GLD1:
- case AArch64ISD::GLD1_SCALED:
- case AArch64ISD::GLD1_SXTW:
- case AArch64ISD::GLD1_SXTW_SCALED:
- case AArch64ISD::GLD1_UXTW:
- case AArch64ISD::GLD1_UXTW_SCALED:
- case AArch64ISD::GLD1_IMM:
+ switch (Opc) {
+ case AArch64ISD::LD1_MERGE_ZERO:
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ case AArch64ISD::LDFF1_MERGE_ZERO:
+ MemVT = cast<VTSDNode>(Src->getOperand(3))->getVT();
+ break;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
+ case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
+ case AArch64ISD::GLDNT1_MERGE_ZERO:
+ MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
break;
default:
return SDValue();
}
- EVT MemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
-
if (isConstantSplatVectorMaskForType(Mask.getNode(), MemVT))
return Src;
@@ -10273,6 +11130,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
SDLoc dl(N);
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0), N1 = N->getOperand(1);
+ unsigned N0Opc = N0->getOpcode(), N1Opc = N1->getOpcode();
// Optimize concat_vectors of truncated vectors, where the intermediate
// type is illegal, to avoid said illegality, e.g.,
@@ -10285,9 +11143,8 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// This isn't really target-specific, but ISD::TRUNCATE legality isn't keyed
// on both input and result type, so we might generate worse code.
// On AArch64 we know it's fine for v2i64->v4i16 and v4i32->v8i8.
- if (N->getNumOperands() == 2 &&
- N0->getOpcode() == ISD::TRUNCATE &&
- N1->getOpcode() == ISD::TRUNCATE) {
+ if (N->getNumOperands() == 2 && N0Opc == ISD::TRUNCATE &&
+ N1Opc == ISD::TRUNCATE) {
SDValue N00 = N0->getOperand(0);
SDValue N10 = N1->getOperand(0);
EVT N00VT = N00.getValueType();
@@ -10312,6 +11169,52 @@ static SDValue performConcatVectorsCombine(SDNode *N,
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ // Optimise concat_vectors of two [us]rhadds that use extracted subvectors
+ // from the same original vectors. Combine these into a single [us]rhadd that
+ // operates on the two original vectors. Example:
+ // (v16i8 (concat_vectors (v8i8 (urhadd (extract_subvector (v16i8 OpA, <0>),
+ // extract_subvector (v16i8 OpB,
+ // <0>))),
+ // (v8i8 (urhadd (extract_subvector (v16i8 OpA, <8>),
+ // extract_subvector (v16i8 OpB,
+ // <8>)))))
+ // ->
+ // (v16i8(urhadd(v16i8 OpA, v16i8 OpB)))
+ if (N->getNumOperands() == 2 && N0Opc == N1Opc &&
+ (N0Opc == AArch64ISD::URHADD || N0Opc == AArch64ISD::SRHADD)) {
+ SDValue N00 = N0->getOperand(0);
+ SDValue N01 = N0->getOperand(1);
+ SDValue N10 = N1->getOperand(0);
+ SDValue N11 = N1->getOperand(1);
+
+ EVT N00VT = N00.getValueType();
+ EVT N10VT = N10.getValueType();
+
+ if (N00->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N01->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N10->getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N11->getOpcode() == ISD::EXTRACT_SUBVECTOR && N00VT == N10VT) {
+ SDValue N00Source = N00->getOperand(0);
+ SDValue N01Source = N01->getOperand(0);
+ SDValue N10Source = N10->getOperand(0);
+ SDValue N11Source = N11->getOperand(0);
+
+ if (N00Source == N10Source && N01Source == N11Source &&
+ N00Source.getValueType() == VT && N01Source.getValueType() == VT) {
+ assert(N0.getValueType() == N1.getValueType());
+
+ uint64_t N00Index = N00.getConstantOperandVal(1);
+ uint64_t N01Index = N01.getConstantOperandVal(1);
+ uint64_t N10Index = N10.getConstantOperandVal(1);
+ uint64_t N11Index = N11.getConstantOperandVal(1);
+
+ if (N00Index == N01Index && N10Index == N11Index && N00Index == 0 &&
+ N10Index == N00VT.getVectorNumElements())
+ return DAG.getNode(N0Opc, dl, VT, N00Source, N01Source);
+ }
+ }
+ }
+
// If we see a (concat_vectors (v1x64 A), (v1x64 A)) it's really a vector
// splat. The indexed instructions are going to be expecting a DUPLANE64, so
// canonicalise to that.
@@ -10330,7 +11233,7 @@ static SDValue performConcatVectorsCombine(SDNode *N,
// becomes
// (bitconvert (concat_vectors (v4i16 (bitconvert LHS)), RHS))
- if (N1->getOpcode() != ISD::BITCAST)
+ if (N1Opc != ISD::BITCAST)
return SDValue();
SDValue RHS = N1->getOperand(0);
MVT RHSTy = RHS.getValueType().getSimpleVT();
@@ -10794,6 +11697,35 @@ static SDValue LowerSVEIntReduction(SDNode *N, unsigned Opc,
return SDValue();
}
+static SDValue LowerSVEIntrinsicIndex(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Op1 = N->getOperand(1);
+ SDValue Op2 = N->getOperand(2);
+ EVT ScalarTy = Op1.getValueType();
+
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16)) {
+ Op1 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op1);
+ Op2 = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Op2);
+ }
+
+ return DAG.getNode(AArch64ISD::INDEX_VECTOR, DL, N->getValueType(0),
+ Op1, Op2);
+}
+
+static SDValue LowerSVEIntrinsicDUP(SDNode *N, SelectionDAG &DAG) {
+ SDLoc dl(N);
+ SDValue Scalar = N->getOperand(3);
+ EVT ScalarTy = Scalar.getValueType();
+
+ if ((ScalarTy == MVT::i8) || (ScalarTy == MVT::i16))
+ Scalar = DAG.getNode(ISD::ANY_EXTEND, dl, MVT::i32, Scalar);
+
+ SDValue Passthru = N->getOperand(1);
+ SDValue Pred = N->getOperand(2);
+ return DAG.getNode(AArch64ISD::DUP_MERGE_PASSTHRU, dl, N->getValueType(0),
+ Pred, Scalar, Passthru);
+}
+
static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
SDLoc dl(N);
LLVMContext &Ctx = *DAG.getContext();
@@ -10819,8 +11751,7 @@ static SDValue LowerSVEIntrinsicEXT(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(ISD::BITCAST, dl, VT, EXT);
}
-static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
- bool Invert,
+static SDValue tryConvertSVEWideCompare(SDNode *N, ISD::CondCode CC,
TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalize())
@@ -10873,18 +11804,12 @@ static SDValue tryConvertSVEWideCompare(SDNode *N, unsigned ReplacementIID,
}
}
+ if (!Imm)
+ return SDValue();
+
SDValue Splat = DAG.getNode(ISD::SPLAT_VECTOR, DL, CmpVT, Imm);
- SDValue ID = DAG.getTargetConstant(ReplacementIID, DL, MVT::i64);
- SDValue Op0, Op1;
- if (Invert) {
- Op0 = Splat;
- Op1 = N->getOperand(2);
- } else {
- Op0 = N->getOperand(2);
- Op1 = Splat;
- }
- return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, VT,
- ID, Pred, Op0, Op1);
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, DL, VT, Pred,
+ N->getOperand(2), Splat, DAG.getCondCode(CC));
}
return SDValue();
@@ -10914,6 +11839,46 @@ static SDValue getPTest(SelectionDAG &DAG, EVT VT, SDValue Pg, SDValue Op,
return DAG.getZExtOrTrunc(Res, DL, VT);
}
+static SDValue combineSVEReductionFP(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue VecToReduce = N->getOperand(2);
+
+ EVT ReduceVT = VecToReduce.getValueType();
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, VecToReduce);
+
+ // SVE reductions set the whole vector register with the first element
+ // containing the reduction result, which we'll now extract.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
+
+static SDValue combineSVEReductionOrderedFP(SDNode *N, unsigned Opc,
+ SelectionDAG &DAG) {
+ SDLoc DL(N);
+
+ SDValue Pred = N->getOperand(1);
+ SDValue InitVal = N->getOperand(2);
+ SDValue VecToReduce = N->getOperand(3);
+ EVT ReduceVT = VecToReduce.getValueType();
+
+ // Ordered reductions use the first lane of the result vector as the
+ // reduction's initial value.
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ InitVal = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, ReduceVT,
+ DAG.getUNDEF(ReduceVT), InitVal, Zero);
+
+ SDValue Reduce = DAG.getNode(Opc, DL, ReduceVT, Pred, InitVal, VecToReduce);
+
+ // SVE reductions set the whole vector register with the first element
+ // containing the reduction result, which we'll now extract.
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, N->getValueType(0), Reduce,
+ Zero);
+}
+
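// Illustrative sketch (not part of the patch): the scalar semantics of the
// ordered (FADDA-style) reduction built by combineSVEReductionOrderedFP. The
// initial value seeds the accumulation, inactive lanes are skipped, and the
// accumulation is strictly in-order, which matters for floating point.
#include <cstddef>
static float faddaReference(float Init, const bool *Pg, const float *Zn,
                            size_t NumLanes) {
  float Acc = Init;  // lane 0 of the initial-value vector
  for (size_t I = 0; I < NumLanes; ++I)
    if (Pg[I])       // predicate selects the active lanes
      Acc += Zn[I];  // strictly ordered accumulation
  return Acc;        // extracted from lane 0 of the result vector
}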
static SDValue performIntrinsicCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const AArch64Subtarget *Subtarget) {
@@ -10982,38 +11947,107 @@ static SDValue performIntrinsicCombine(SDNode *N,
return LowerSVEIntReduction(N, AArch64ISD::EORV_PRED, DAG);
case Intrinsic::aarch64_sve_andv:
return LowerSVEIntReduction(N, AArch64ISD::ANDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_index:
+ return LowerSVEIntrinsicIndex(N, DAG);
+ case Intrinsic::aarch64_sve_dup:
+ return LowerSVEIntrinsicDUP(N, DAG);
+ case Intrinsic::aarch64_sve_dup_x:
+ return DAG.getNode(ISD::SPLAT_VECTOR, SDLoc(N), N->getValueType(0),
+ N->getOperand(1));
case Intrinsic::aarch64_sve_ext:
return LowerSVEIntrinsicEXT(N, DAG);
+ case Intrinsic::aarch64_sve_smin:
+ return DAG.getNode(AArch64ISD::SMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_umin:
+ return DAG.getNode(AArch64ISD::UMIN_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_smax:
+ return DAG.getNode(AArch64ISD::SMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_umax:
+ return DAG.getNode(AArch64ISD::UMAX_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_lsl:
+ return DAG.getNode(AArch64ISD::SHL_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_lsr:
+ return DAG.getNode(AArch64ISD::SRL_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_asr:
+ return DAG.getNode(AArch64ISD::SRA_MERGE_OP1, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
+ case Intrinsic::aarch64_sve_cmphs:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUGE));
+ break;
+ case Intrinsic::aarch64_sve_cmphi:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETUGT));
+ break;
+ case Intrinsic::aarch64_sve_cmpge:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGE));
+ break;
+ case Intrinsic::aarch64_sve_cmpgt:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETGT));
+ break;
+ case Intrinsic::aarch64_sve_cmpeq:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETEQ));
+ break;
+ case Intrinsic::aarch64_sve_cmpne:
+ if (!N->getOperand(2).getValueType().isFloatingPoint())
+ return DAG.getNode(AArch64ISD::SETCC_MERGE_ZERO, SDLoc(N),
+ N->getValueType(0), N->getOperand(1), N->getOperand(2),
+ N->getOperand(3), DAG.getCondCode(ISD::SETNE));
+ break;
+ case Intrinsic::aarch64_sve_fadda:
+ return combineSVEReductionOrderedFP(N, AArch64ISD::FADDA_PRED, DAG);
+ case Intrinsic::aarch64_sve_faddv:
+ return combineSVEReductionFP(N, AArch64ISD::FADDV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmaxnmv:
+ return combineSVEReductionFP(N, AArch64ISD::FMAXNMV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fmaxv:
+ return combineSVEReductionFP(N, AArch64ISD::FMAXV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fminnmv:
+ return combineSVEReductionFP(N, AArch64ISD::FMINNMV_PRED, DAG);
+ case Intrinsic::aarch64_sve_fminv:
+ return combineSVEReductionFP(N, AArch64ISD::FMINV_PRED, DAG);
+ case Intrinsic::aarch64_sve_sel:
+ return DAG.getNode(ISD::VSELECT, SDLoc(N), N->getValueType(0),
+ N->getOperand(1), N->getOperand(2), N->getOperand(3));
case Intrinsic::aarch64_sve_cmpeq_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpeq,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETEQ, DCI, DAG);
case Intrinsic::aarch64_sve_cmpne_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpne,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETNE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpge_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmpgt_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplt_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpgt,
- true, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETLT, DCI, DAG);
case Intrinsic::aarch64_sve_cmple_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmpge,
- true, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETLE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphs_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETUGE, DCI, DAG);
case Intrinsic::aarch64_sve_cmphi_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi,
- false, DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETUGT, DCI, DAG);
case Intrinsic::aarch64_sve_cmplo_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphi, true,
- DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETULT, DCI, DAG);
case Intrinsic::aarch64_sve_cmpls_wide:
- return tryConvertSVEWideCompare(N, Intrinsic::aarch64_sve_cmphs, true,
- DCI, DAG);
+ return tryConvertSVEWideCompare(N, ISD::SETULE, DCI, DAG);
case Intrinsic::aarch64_sve_ptest_any:
return getPTest(DAG, N->getValueType(0), N->getOperand(1), N->getOperand(2),
AArch64CC::ANY_ACTIVE);
@@ -11091,14 +12125,14 @@ static SDValue performExtendCombine(SDNode *N,
if (!ResVT.isSimple() || !SrcVT.isSimple())
return SDValue();
- // If the source VT is a 64-bit vector, we can play games and get the
- // better results we want.
- if (SrcVT.getSizeInBits() != 64)
+ // If the source VT is a 64-bit fixed or scalable vector, we can play games
+ // and get the better results we want.
+ if (SrcVT.getSizeInBits().getKnownMinSize() != 64)
return SDValue();
unsigned SrcEltSize = SrcVT.getScalarSizeInBits();
- unsigned ElementCount = SrcVT.getVectorNumElements();
- SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), ElementCount);
+ ElementCount SrcEC = SrcVT.getVectorElementCount();
+ SrcVT = MVT::getVectorVT(MVT::getIntegerVT(SrcEltSize * 2), SrcEC);
SDLoc DL(N);
Src = DAG.getNode(N->getOpcode(), DL, SrcVT, Src);
@@ -11106,17 +12140,14 @@ static SDValue performExtendCombine(SDNode *N,
// bit source.
EVT LoVT, HiVT;
SDValue Lo, Hi;
- unsigned NumElements = ResVT.getVectorNumElements();
- assert(!(NumElements & 1) && "Splitting vector, but not in half!");
- LoVT = HiVT = EVT::getVectorVT(*DAG.getContext(),
- ResVT.getVectorElementType(), NumElements / 2);
+ LoVT = HiVT = ResVT.getHalfNumVectorElementsVT(*DAG.getContext());
EVT InNVT = EVT::getVectorVT(*DAG.getContext(), SrcVT.getVectorElementType(),
- LoVT.getVectorNumElements());
+ LoVT.getVectorElementCount());
Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
DAG.getConstant(0, DL, MVT::i64));
Hi = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, InNVT, Src,
- DAG.getConstant(InNVT.getVectorNumElements(), DL, MVT::i64));
+ DAG.getConstant(InNVT.getVectorMinNumElements(), DL, MVT::i64));
Lo = DAG.getNode(N->getOpcode(), DL, LoVT, Lo);
Hi = DAG.getNode(N->getOpcode(), DL, HiVT, Hi);
@@ -11165,11 +12196,71 @@ static SDValue splitStoreSplat(SelectionDAG &DAG, StoreSDNode &St,
return NewST1;
}
+// Returns an SVE type that ContentTy can be trivially sign or zero extended
+// into.
+static MVT getSVEContainerType(EVT ContentTy) {
+ assert(ContentTy.isSimple() && "No SVE containers for extended types");
+
+ switch (ContentTy.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("No known SVE container for this MVT type");
+ case MVT::nxv2i8:
+ case MVT::nxv2i16:
+ case MVT::nxv2i32:
+ case MVT::nxv2i64:
+ case MVT::nxv2f32:
+ case MVT::nxv2f64:
+ return MVT::nxv2i64;
+ case MVT::nxv4i8:
+ case MVT::nxv4i16:
+ case MVT::nxv4i32:
+ case MVT::nxv4f32:
+ return MVT::nxv4i32;
+ case MVT::nxv8i8:
+ case MVT::nxv8i16:
+ case MVT::nxv8f16:
+ case MVT::nxv8bf16:
+ return MVT::nxv8i16;
+ case MVT::nxv16i8:
+ return MVT::nxv16i8;
+ }
+}
+
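// Illustrative sketch (not part of the patch): the pattern behind the mapping
// in getSVEContainerType. Every container keeps the lane count and widens the
// element so the whole type fills one 128-bit SVE granule; the helper name is
// hypothetical.
static unsigned containerElementBits(unsigned MinNumElements) {
  // nxv2* -> 64-bit lanes, nxv4* -> 32, nxv8* -> 16, nxv16* -> 8.
  return 128 / MinNumElements;
}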
+static SDValue performLD1Combine(SDNode *N, SelectionDAG &DAG, unsigned Opc) {
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ if (VT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
+ return SDValue();
+
+ EVT ContainerVT = VT;
+ if (ContainerVT.isInteger())
+ ContainerVT = getSVEContainerType(ContainerVT);
+
+ SDVTList VTs = DAG.getVTList(ContainerVT, MVT::Other);
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ N->getOperand(2), // Pg
+ N->getOperand(3), // Base
+ DAG.getValueType(VT) };
+
+ SDValue Load = DAG.getNode(Opc, DL, VTs, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (ContainerVT.isInteger() && (VT != ContainerVT))
+ Load = DAG.getNode(ISD::TRUNCATE, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({ Load, LoadChain }, DL);
+}
+
static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
EVT VT = N->getValueType(0);
EVT PtrTy = N->getOperand(3).getValueType();
+ if (VT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
EVT LoadVT = VT;
if (VT.isFloatingPoint())
LoadVT = VT.changeTypeToInteger();
@@ -11190,6 +12281,58 @@ static SDValue performLDNT1Combine(SDNode *N, SelectionDAG &DAG) {
return L;
}
+template <unsigned Opcode>
+static SDValue performLD1ReplicateCombine(SDNode *N, SelectionDAG &DAG) {
+ static_assert(Opcode == AArch64ISD::LD1RQ_MERGE_ZERO ||
+ Opcode == AArch64ISD::LD1RO_MERGE_ZERO,
+ "Unsupported opcode.");
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ EVT LoadVT = VT;
+ if (VT.isFloatingPoint())
+ LoadVT = VT.changeTypeToInteger();
+
+ SDValue Ops[] = {N->getOperand(0), N->getOperand(2), N->getOperand(3)};
+ SDValue Load = DAG.getNode(Opcode, DL, {LoadVT, MVT::Other}, Ops);
+ SDValue LoadChain = SDValue(Load.getNode(), 1);
+
+ if (VT.isFloatingPoint())
+ Load = DAG.getNode(ISD::BITCAST, DL, VT, Load.getValue(0));
+
+ return DAG.getMergeValues({Load, LoadChain}, DL);
+}
+
+static SDValue performST1Combine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Data = N->getOperand(2);
+ EVT DataVT = Data.getValueType();
+ EVT HwSrcVt = getSVEContainerType(DataVT);
+ SDValue InputVT = DAG.getValueType(DataVT);
+
+ if (DataVT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
+ if (DataVT.isFloatingPoint())
+ InputVT = DAG.getValueType(HwSrcVt);
+
+ SDValue SrcNew;
+ if (Data.getValueType().isFloatingPoint())
+ SrcNew = DAG.getNode(ISD::BITCAST, DL, HwSrcVt, Data);
+ else
+ SrcNew = DAG.getNode(ISD::ANY_EXTEND, DL, HwSrcVt, Data);
+
+ SDValue Ops[] = { N->getOperand(0), // Chain
+ SrcNew,
+ N->getOperand(4), // Base
+ N->getOperand(3), // Pg
+ InputVT
+ };
+
+ return DAG.getNode(AArch64ISD::ST1_PRED, DL, N->getValueType(0), Ops);
+}
+
static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
SDLoc DL(N);
@@ -11197,6 +12340,10 @@ static SDValue performSTNT1Combine(SDNode *N, SelectionDAG &DAG) {
EVT DataVT = Data.getValueType();
EVT PtrTy = N->getOperand(4).getValueType();
+ if (DataVT == MVT::nxv8bf16 &&
+ !static_cast<const AArch64Subtarget &>(DAG.getSubtarget()).hasBF16())
+ return SDValue();
+
if (DataVT.isFloatingPoint())
Data = DAG.getNode(ISD::BITCAST, DL, DataVT.changeTypeToInteger(), Data);
@@ -11226,6 +12373,10 @@ static SDValue replaceZeroVectorStore(SelectionDAG &DAG, StoreSDNode &St) {
SDValue StVal = St.getValue();
EVT VT = StVal.getValueType();
+ // Avoid scalarizing zero splat stores for scalable vectors.
+ if (VT.isScalableVector())
+ return SDValue();
+
// It is beneficial to scalarize a zero splat store for 2 or 3 i64 elements or
// 2, 3 or 4 i32 elements.
int NumVecElts = VT.getVectorNumElements();
@@ -11348,7 +12499,8 @@ static SDValue splitStores(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SDValue StVal = S->getValue();
EVT VT = StVal.getValueType();
- if (!VT.isVector())
+
+ if (!VT.isFixedLengthVector())
return SDValue();
// If we get a splat of zeros, convert this vector store to a store of
@@ -11419,6 +12571,9 @@ static SDValue performPostLD1Combine(SDNode *N,
SelectionDAG &DAG = DCI.DAG;
EVT VT = N->getValueType(0);
+ if (VT.isScalableVector())
+ return SDValue();
+
unsigned LoadIdx = IsLaneOp ? 1 : 0;
SDNode *LD = N->getOperand(LoadIdx).getNode();
// If it is not LOAD, can not do such combine.
@@ -12258,32 +13413,57 @@ static SDValue performGlobalAddressCombine(SDNode *N, SelectionDAG &DAG,
DAG.getConstant(MinOffset, DL, MVT::i64));
}
-// Returns an SVE type that ContentTy can be trivially sign or zero extended
-// into.
-static MVT getSVEContainerType(EVT ContentTy) {
- assert(ContentTy.isSimple() && "No SVE containers for extended types");
+// Turns the vector of indices into a vector of byte offsets by scaling Offset
+// by (BitWidth / 8).
+static SDValue getScaledOffsetForBitWidth(SelectionDAG &DAG, SDValue Offset,
+ SDLoc DL, unsigned BitWidth) {
+ assert(Offset.getValueType().isScalableVector() &&
+ "This method is only for scalable vectors of offsets");
- switch (ContentTy.getSimpleVT().SimpleTy) {
- default:
- llvm_unreachable("No known SVE container for this MVT type");
- case MVT::nxv2i8:
- case MVT::nxv2i16:
- case MVT::nxv2i32:
- case MVT::nxv2i64:
- case MVT::nxv2f32:
- case MVT::nxv2f64:
- return MVT::nxv2i64;
- case MVT::nxv4i8:
- case MVT::nxv4i16:
- case MVT::nxv4i32:
- case MVT::nxv4f32:
- return MVT::nxv4i32;
- }
+ SDValue Shift = DAG.getConstant(Log2_32(BitWidth / 8), DL, MVT::i64);
+ SDValue SplatShift = DAG.getNode(ISD::SPLAT_VECTOR, DL, MVT::nxv2i64, Shift);
+
+ return DAG.getNode(ISD::SHL, DL, MVT::nxv2i64, Offset, SplatShift);
}
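// Illustrative sketch (not part of the patch): the per-lane arithmetic that the
// splatted SHL above performs. Each index is converted to a byte offset by
// multiplying by the element size in bytes (a power of two, hence the shift).
#include <cstdint>
static uint64_t indexToByteOffset(uint64_t Index, unsigned BitWidth) {
  uint64_t ElementBytes = BitWidth / 8; // e.g. 32-bit elements -> 4 bytes
  return Index * ElementBytes;          // e.g. index 5 -> byte offset 20
}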
-static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
- unsigned Opcode,
- bool OnlyPackedOffsets = true) {
+/// Check if the value of \p OffsetInBytes can be used as an immediate for
+/// the gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+inline static bool isValidImmForSVEVecImmAddrMode(unsigned OffsetInBytes,
+ unsigned ScalarSizeInBytes) {
+ // The immediate is not a multiple of the scalar size.
+ if (OffsetInBytes % ScalarSizeInBytes)
+ return false;
+
+ // The immediate is out of range.
+ if (OffsetInBytes / ScalarSizeInBytes > 31)
+ return false;
+
+ return true;
+}
+
+/// Check if the value of \p Offset represents a valid immediate for the SVE
+/// gather load/prefetch and scatter store instructions with vector base and
+/// immediate offset addressing mode:
+///
+/// [<Zn>.[S|D]{, #<imm>}]
+///
+/// where <imm> = sizeof(<T>) * k, for k = 0, 1, ..., 31.
+static bool isValidImmForSVEVecImmAddrMode(SDValue Offset,
+ unsigned ScalarSizeInBytes) {
+ ConstantSDNode *OffsetConst = dyn_cast<ConstantSDNode>(Offset.getNode());
+ return OffsetConst && isValidImmForSVEVecImmAddrMode(
+ OffsetConst->getZExtValue(), ScalarSizeInBytes);
+}
+
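// Illustrative sketch (not part of the patch): the immediate range accepted by
// the vector-plus-immediate addressing mode, spelled out. For 4-byte elements
// the valid byte offsets are 0, 4, 8, ..., 124 (i.e. sizeof(T) * k, k <= 31).
static bool isValidSVEVecImmOffset(unsigned OffsetInBytes,
                                   unsigned ScalarSizeInBytes) {
  return OffsetInBytes % ScalarSizeInBytes == 0 &&
         OffsetInBytes / ScalarSizeInBytes <= 31;
}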
+static SDValue performScatterStoreCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
const SDValue Src = N->getOperand(2);
const EVT SrcVT = Src->getValueType(0);
assert(SrcVT.isScalableVector() &&
@@ -12303,11 +13483,46 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
- const SDValue Base = N->getOperand(4);
+ SDValue Base = N->getOperand(4);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(5);
+ // For "scalar + vector of indices", just scale the indices. This only
+ // applies to non-temporal scatters because there's no instruction that takes
+ // indices.
+ if (Opcode == AArch64ISD::SSTNT1_INDEX_PRED) {
+ Offset =
+ getScaledOffsetForBitWidth(DAG, Offset, DL, SrcElVT.getSizeInBits());
+ Opcode = AArch64ISD::SSTNT1_PRED;
+ }
+
+ // In the case of non-temporal scatter stores there's only one SVE instruction
+ // per data-size: "scalar + vector", i.e.
+ // * stnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+ // Since we do have intrinsics that allow the arguments to be in a different
+ // order, we may need to swap them to match the spec.
+ if (Opcode == AArch64ISD::SSTNT1_PRED && Offset.getValueType().isVector())
+ std::swap(Base, Offset);
+
+ // SST1_IMM requires that the offset is an immediate that is:
+ // * a multiple of #SizeInBytes,
+ // * in the range [0, 31 x #SizeInBytes],
+ // where #SizeInBytes is the size in bytes of the stored items. For
+ // immediates outside that range and non-immediate scalar offsets use SST1 or
+ // SST1_UXTW instead.
+ if (Opcode == AArch64ISD::SST1_IMM_PRED) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ SrcVT.getScalarSizeInBits() / 8)) {
+ if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
+ Opcode = AArch64ISD::SST1_UXTW_PRED;
+ else
+ Opcode = AArch64ISD::SST1_PRED;
+
+ std::swap(Base, Offset);
+ }
+ }
+
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
@@ -12325,9 +13540,9 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
// Source value type that is representable in hardware
EVT HwSrcVt = getSVEContainerType(SrcVT);
- // Keep the original type of the input data to store - this is needed to
- // differentiate between ST1B, ST1H, ST1W and ST1D. For FP values we want the
- // integer equivalent, so just use HwSrcVt.
+ // Keep the original type of the input data to store - this is needed to be
+ // able to select the correct instruction, e.g. ST1B, ST1H, ST1W and ST1D. For
+ // FP values we want the integer equivalent, so just use HwSrcVt.
SDValue InputVT = DAG.getValueType(SrcVT);
if (SrcVT.isFloatingPoint())
InputVT = DAG.getValueType(HwSrcVt);
@@ -12350,24 +13565,67 @@ static SDValue performST1ScatterCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(Opcode, DL, VTs, Ops);
}
-static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
- unsigned Opcode,
- bool OnlyPackedOffsets = true) {
- EVT RetVT = N->getValueType(0);
+static SDValue performGatherLoadCombine(SDNode *N, SelectionDAG &DAG,
+ unsigned Opcode,
+ bool OnlyPackedOffsets = true) {
+ const EVT RetVT = N->getValueType(0);
assert(RetVT.isScalableVector() &&
"Gather loads are only possible for SVE vectors");
+
SDLoc DL(N);
+ // Make sure that the loaded data will fit into an SVE register
if (RetVT.getSizeInBits().getKnownMinSize() > AArch64::SVEBitsPerBlock)
return SDValue();
// Depending on the addressing mode, this is either a pointer or a vector of
// pointers (that fits into one register)
- const SDValue Base = N->getOperand(3);
+ SDValue Base = N->getOperand(3);
// Depending on the addressing mode, this is either a single offset or a
// vector of offsets (that fits into one register)
SDValue Offset = N->getOperand(4);
+ // For "scalar + vector of indices", just scale the indices. This only
+ // applies to non-temporal gathers because there's no instruction that takes
+ // indices.
+ if (Opcode == AArch64ISD::GLDNT1_INDEX_MERGE_ZERO) {
+ Offset = getScaledOffsetForBitWidth(DAG, Offset, DL,
+ RetVT.getScalarSizeInBits());
+ Opcode = AArch64ISD::GLDNT1_MERGE_ZERO;
+ }
+
+ // In the case of non-temporal gather loads there's only one SVE instruction
+ // per data-size: "scalar + vector", i.e.
+ // * ldnt1{b|h|w|d} { z0.s }, p0/z, [z0.s, x0]
+ // Since we do have intrinsics that allow the arguments to be in a different
+ // order, we may need to swap them to match the spec.
+ if (Opcode == AArch64ISD::GLDNT1_MERGE_ZERO &&
+ Offset.getValueType().isVector())
+ std::swap(Base, Offset);
+
+ // GLD{FF}1_IMM requires that the offset is an immediate that is:
+ // * a multiple of #SizeInBytes,
+ // * in the range [0, 31 x #SizeInBytes],
+ // where #SizeInBytes is the size in bytes of the loaded items. For
+ // immediates outside that range and non-immediate scalar offsets use
+ // GLD1_MERGE_ZERO or GLD1_UXTW_MERGE_ZERO instead.
+ if (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO ||
+ Opcode == AArch64ISD::GLDFF1_IMM_MERGE_ZERO) {
+ if (!isValidImmForSVEVecImmAddrMode(Offset,
+ RetVT.getScalarSizeInBits() / 8)) {
+ if (MVT::nxv4i32 == Base.getValueType().getSimpleVT().SimpleTy)
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ ? AArch64ISD::GLD1_UXTW_MERGE_ZERO
+ : AArch64ISD::GLDFF1_UXTW_MERGE_ZERO;
+ else
+ Opcode = (Opcode == AArch64ISD::GLD1_IMM_MERGE_ZERO)
+ ? AArch64ISD::GLD1_MERGE_ZERO
+ : AArch64ISD::GLDFF1_MERGE_ZERO;
+
+ std::swap(Base, Offset);
+ }
+ }
+
auto &TLI = DAG.getTargetLoweringInfo();
if (!TLI.isTypeLegal(Base.getValueType()))
return SDValue();
@@ -12382,10 +13640,9 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
// Return value type that is representable in hardware
EVT HwRetVt = getSVEContainerType(RetVT);
- // Keep the original output value type around - this will better inform
- // optimisations (e.g. instruction folding when load is followed by
- // zext/sext). This will only be used for ints, so the value for FPs
- // doesn't matter.
+ // Keep the original output value type around - this is needed to be able to
+ // select the correct instruction, e.g. LD1B, LD1H, LD1W and LD1D. For FP
+ // values we want the integer equivalent, so just use HwRetVt.
SDValue OutVT = DAG.getValueType(RetVT);
if (RetVT.isFloatingPoint())
OutVT = DAG.getValueType(HwRetVt);
@@ -12409,55 +13666,126 @@ static SDValue performLD1GatherCombine(SDNode *N, SelectionDAG &DAG,
return DAG.getMergeValues({Load, LoadChain}, DL);
}
-
static SDValue
performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
SelectionDAG &DAG) {
if (DCI.isBeforeLegalizeOps())
return SDValue();
+ SDLoc DL(N);
SDValue Src = N->getOperand(0);
unsigned Opc = Src->getOpcode();
- // Gather load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
+ // Sign extend of an unsigned unpack -> signed unpack
+ if (Opc == AArch64ISD::UUNPKHI || Opc == AArch64ISD::UUNPKLO) {
+
+ unsigned SOpc = Opc == AArch64ISD::UUNPKHI ? AArch64ISD::SUNPKHI
+ : AArch64ISD::SUNPKLO;
+
+ // Push the sign extend to the operand of the unpack
+ // This is necessary where, for example, the operand of the unpack
+ // is another unpack:
+ // 4i32 sign_extend_inreg (4i32 uunpklo(8i16 uunpklo (16i8 opnd)), from 4i8)
+ // ->
+ // 4i32 sunpklo (8i16 sign_extend_inreg(8i16 uunpklo (16i8 opnd), from 8i8)
+ // ->
+ // 4i32 sunpklo(8i16 sunpklo(16i8 opnd))
+ SDValue ExtOp = Src->getOperand(0);
+ auto VT = cast<VTSDNode>(N->getOperand(1))->getVT();
+ EVT EltTy = VT.getVectorElementType();
+ (void)EltTy;
+
+ assert((EltTy == MVT::i8 || EltTy == MVT::i16 || EltTy == MVT::i32) &&
+ "Sign extending from an invalid type");
+
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ VT.getVectorElementType(),
+ VT.getVectorElementCount() * 2);
+
+ SDValue Ext = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, ExtOp.getValueType(),
+ ExtOp, DAG.getValueType(ExtVT));
+
+ return DAG.getNode(SOpc, DL, N->getValueType(0), Ext);
+ }
+
+ // SVE load nodes (e.g. AArch64ISD::GLD1) are straightforward candidates
// for DAG Combine with SIGN_EXTEND_INREG. Bail out for all other nodes.
unsigned NewOpc;
+ unsigned MemVTOpNum = 4;
switch (Opc) {
- case AArch64ISD::GLD1:
- NewOpc = AArch64ISD::GLD1S;
+ case AArch64ISD::LD1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LD1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::LDNF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LDNF1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::LDFF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::LDFF1S_MERGE_ZERO;
+ MemVTOpNum = 3;
+ break;
+ case AArch64ISD::GLD1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SCALED:
- NewOpc = AArch64ISD::GLD1S_SCALED;
+ case AArch64ISD::GLD1_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SXTW:
- NewOpc = AArch64ISD::GLD1S_SXTW;
+ case AArch64ISD::GLD1_SXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SXTW_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_SXTW_SCALED:
- NewOpc = AArch64ISD::GLD1S_SXTW_SCALED;
+ case AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_UXTW:
- NewOpc = AArch64ISD::GLD1S_UXTW;
+ case AArch64ISD::GLD1_UXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_UXTW_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_UXTW_SCALED:
- NewOpc = AArch64ISD::GLD1S_UXTW_SCALED;
+ case AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO;
break;
- case AArch64ISD::GLD1_IMM:
- NewOpc = AArch64ISD::GLD1S_IMM;
+ case AArch64ISD::GLD1_IMM_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLD1S_IMM_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDFF1_IMM_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDFF1S_IMM_MERGE_ZERO;
+ break;
+ case AArch64ISD::GLDNT1_MERGE_ZERO:
+ NewOpc = AArch64ISD::GLDNT1S_MERGE_ZERO;
break;
default:
return SDValue();
}
EVT SignExtSrcVT = cast<VTSDNode>(N->getOperand(1))->getVT();
- EVT GLD1SrcMemVT = cast<VTSDNode>(Src->getOperand(4))->getVT();
+ EVT SrcMemVT = cast<VTSDNode>(Src->getOperand(MemVTOpNum))->getVT();
- if ((SignExtSrcVT != GLD1SrcMemVT) || !Src.hasOneUse())
+ if ((SignExtSrcVT != SrcMemVT) || !Src.hasOneUse())
return SDValue();
EVT DstVT = N->getValueType(0);
SDVTList VTs = DAG.getVTList(DstVT, MVT::Other);
- SDValue Ops[] = {Src->getOperand(0), Src->getOperand(1), Src->getOperand(2),
- Src->getOperand(3), Src->getOperand(4)};
+
+ SmallVector<SDValue, 5> Ops;
+ for (unsigned I = 0; I < Src->getNumOperands(); ++I)
+ Ops.push_back(Src->getOperand(I));
SDValue ExtLoad = DAG.getNode(NewOpc, SDLoc(N), VTs, Ops);
DCI.CombineTo(N, ExtLoad);
@@ -12467,6 +13795,51 @@ performSignExtendInRegCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
return SDValue(N, 0);
}
+/// Legalize the gather prefetch (scalar + vector addressing mode) when the
+/// offset vector is an unpacked 32-bit scalable vector. The other cases (offset
+/// type != nxv2i32) do not need legalization.
+static SDValue legalizeSVEGatherPrefetchOffsVec(SDNode *N, SelectionDAG &DAG) {
+ const unsigned OffsetPos = 4;
+ SDValue Offset = N->getOperand(OffsetPos);
+
+ // Not an unpacked vector, bail out.
+ if (Offset.getValueType().getSimpleVT().SimpleTy != MVT::nxv2i32)
+ return SDValue();
+
+ // Extend the unpacked offset vector to 64-bit lanes.
+ SDLoc DL(N);
+ Offset = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::nxv2i64, Offset);
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ // Replace the offset operand with the 64-bit one.
+ Ops[OffsetPos] = Offset;
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
+/// Combines a node carrying the intrinsic
+/// `aarch64_sve_prf<T>_gather_scalar_offset` into a node that uses
+/// `aarch64_sve_prfb_gather_uxtw_index` when the scalar offset passed to
+/// `aarch64_sve_prf<T>_gather_scalar_offset` is not a valid immediate for the
+/// SVE gather prefetch instruction with vector plus immediate addressing mode.
+static SDValue combineSVEPrefetchVecBaseImmOff(SDNode *N, SelectionDAG &DAG,
+ unsigned ScalarSizeInBytes) {
+ const unsigned ImmPos = 4, OffsetPos = 3;
+ // No need to combine the node if the immediate is valid...
+ if (isValidImmForSVEVecImmAddrMode(N->getOperand(ImmPos), ScalarSizeInBytes))
+ return SDValue();
+
+ // ...otherwise swap the offset base with the offset...
+ SmallVector<SDValue, 5> Ops(N->op_begin(), N->op_end());
+ std::swap(Ops[ImmPos], Ops[OffsetPos]);
+ // ...and remap the intrinsic `aarch64_sve_prf<T>_gather_scalar_offset` to
+ // `aarch64_sve_prfb_gather_uxtw_index`.
+ SDLoc DL(N);
+ Ops[1] = DAG.getConstant(Intrinsic::aarch64_sve_prfb_gather_uxtw_index, DL,
+ MVT::i64);
+
+ return DAG.getNode(N->getOpcode(), DL, DAG.getVTList(MVT::Other), Ops);
+}
+
SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -12531,6 +13904,23 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::INTRINSIC_VOID:
case ISD::INTRINSIC_W_CHAIN:
switch (cast<ConstantSDNode>(N->getOperand(1))->getZExtValue()) {
+ case Intrinsic::aarch64_sve_prfb_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 1 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfh_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 2 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfw_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 4 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfd_gather_scalar_offset:
+ return combineSVEPrefetchVecBaseImmOff(N, DAG, 8 /*=ScalarSizeInBytes*/);
+ case Intrinsic::aarch64_sve_prfb_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfb_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfh_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfh_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfw_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfw_gather_sxtw_index:
+ case Intrinsic::aarch64_sve_prfd_gather_uxtw_index:
+ case Intrinsic::aarch64_sve_prfd_gather_sxtw_index:
+ return legalizeSVEGatherPrefetchOffsVec(N, DAG);
case Intrinsic::aarch64_neon_ld2:
case Intrinsic::aarch64_neon_ld3:
case Intrinsic::aarch64_neon_ld4:
@@ -12555,44 +13945,180 @@ SDValue AArch64TargetLowering::PerformDAGCombine(SDNode *N,
return performNEONPostLDSTCombine(N, DCI, DAG);
case Intrinsic::aarch64_sve_ldnt1:
return performLDNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_ld1rq:
+ return performLD1ReplicateCombine<AArch64ISD::LD1RQ_MERGE_ZERO>(N, DAG);
+ case Intrinsic::aarch64_sve_ld1ro:
+ return performLD1ReplicateCombine<AArch64ISD::LD1RO_MERGE_ZERO>(N, DAG);
+ case Intrinsic::aarch64_sve_ldnt1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDNT1_INDEX_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnt1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDNT1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ld1:
+ return performLD1Combine(N, DAG, AArch64ISD::LD1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldnf1:
+ return performLD1Combine(N, DAG, AArch64ISD::LDNF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1:
+ return performLD1Combine(N, DAG, AArch64ISD::LDFF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_st1:
+ return performST1Combine(N, DAG);
case Intrinsic::aarch64_sve_stnt1:
return performSTNT1Combine(N, DAG);
+ case Intrinsic::aarch64_sve_stnt1_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter_uxtw:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_PRED);
+ case Intrinsic::aarch64_sve_stnt1_scatter_index:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SSTNT1_INDEX_PRED);
case Intrinsic::aarch64_sve_ld1_gather:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1);
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SCALED);
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_SCALED_MERGE_ZERO);
case Intrinsic::aarch64_sve_ld1_gather_sxtw:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW,
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_SXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW,
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_UXTW_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_sxtw_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_SXTW_SCALED,
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_ld1_gather_uxtw_index:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_UXTW_SCALED,
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ld1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLD1_IMM_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather:
+ return performGatherLoadCombine(N, DAG, AArch64ISD::GLDFF1_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SCALED_MERGE_ZERO);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SXTW_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_UXTW_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_sxtw_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_ldff1_gather_uxtw_index:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO,
/*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_ld1_gather_imm:
- return performLD1GatherCombine(N, DAG, AArch64ISD::GLD1_IMM);
+ case Intrinsic::aarch64_sve_ldff1_gather_scalar_offset:
+ return performGatherLoadCombine(N, DAG,
+ AArch64ISD::GLDFF1_IMM_MERGE_ZERO);
case Intrinsic::aarch64_sve_st1_scatter:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_PRED);
case Intrinsic::aarch64_sve_st1_scatter_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SCALED);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SCALED_PRED);
case Intrinsic::aarch64_sve_st1_scatter_sxtw:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_SXTW_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_UXTW_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_sxtw_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_SXTW_SCALED,
- /*OnlyPackedOffsets=*/false);
+ return performScatterStoreCombine(N, DAG,
+ AArch64ISD::SST1_SXTW_SCALED_PRED,
+ /*OnlyPackedOffsets=*/false);
case Intrinsic::aarch64_sve_st1_scatter_uxtw_index:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_UXTW_SCALED,
- /*OnlyPackedOffsets=*/false);
- case Intrinsic::aarch64_sve_st1_scatter_imm:
- return performST1ScatterCombine(N, DAG, AArch64ISD::SST1_IMM);
+ return performScatterStoreCombine(N, DAG,
+ AArch64ISD::SST1_UXTW_SCALED_PRED,
+ /*OnlyPackedOffsets=*/false);
+ case Intrinsic::aarch64_sve_st1_scatter_scalar_offset:
+ return performScatterStoreCombine(N, DAG, AArch64ISD::SST1_IMM_PRED);
+ case Intrinsic::aarch64_sve_tuple_get: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Src1 = N->getOperand(2);
+ SDValue Idx = N->getOperand(3);
+
+ uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
+ EVT ResVT = N->getValueType(0);
+ uint64_t NumLanes = ResVT.getVectorElementCount().Min;
+ SDValue Val =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, ResVT, Src1,
+ DAG.getConstant(IdxConst * NumLanes, DL, MVT::i32));
+ return DAG.getMergeValues({Val, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_tuple_set: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Tuple = N->getOperand(2);
+ SDValue Idx = N->getOperand(3);
+ SDValue Vec = N->getOperand(4);
+
+ EVT TupleVT = Tuple.getValueType();
+ uint64_t TupleLanes = TupleVT.getVectorElementCount().Min;
+
+ uint64_t IdxConst = cast<ConstantSDNode>(Idx)->getZExtValue();
+ uint64_t NumLanes = Vec.getValueType().getVectorElementCount().Min;
+
+ if ((TupleLanes % NumLanes) != 0)
+ report_fatal_error("invalid tuple vector!");
+
+ uint64_t NumVecs = TupleLanes / NumLanes;
+
+ SmallVector<SDValue, 4> Opnds;
+ for (unsigned I = 0; I < NumVecs; ++I) {
+ if (I == IdxConst)
+ Opnds.push_back(Vec);
+ else {
+ Opnds.push_back(
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, Vec.getValueType(), Tuple,
+ DAG.getConstant(I * NumLanes, DL, MVT::i32)));
+ }
+ }
+ SDValue Concat =
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, Tuple.getValueType(), Opnds);
+ return DAG.getMergeValues({Concat, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_tuple_create2:
+ case Intrinsic::aarch64_sve_tuple_create3:
+ case Intrinsic::aarch64_sve_tuple_create4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+
+ SmallVector<SDValue, 4> Opnds;
+ for (unsigned I = 2; I < N->getNumOperands(); ++I)
+ Opnds.push_back(N->getOperand(I));
+
+ EVT VT = Opnds[0].getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ EVT DestVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
+ VT.getVectorElementCount() *
+ (N->getNumOperands() - 2));
+ SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, DestVT, Opnds);
+ return DAG.getMergeValues({Concat, Chain}, DL);
+ }
+ case Intrinsic::aarch64_sve_ld2:
+ case Intrinsic::aarch64_sve_ld3:
+ case Intrinsic::aarch64_sve_ld4: {
+ SDLoc DL(N);
+ SDValue Chain = N->getOperand(0);
+ SDValue Mask = N->getOperand(2);
+ SDValue BasePtr = N->getOperand(3);
+ SDValue LoadOps[] = {Chain, Mask, BasePtr};
+ unsigned IntrinsicID =
+ cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ SDValue Result =
+ LowerSVEStructLoad(IntrinsicID, LoadOps, N->getValueType(0), DAG, DL);
+ return DAG.getMergeValues({Result, Chain}, DL);
+ }
default:
break;
}
@@ -12724,7 +14250,8 @@ static void ReplaceBITCASTResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SDLoc DL(N);
SDValue Op = N->getOperand(0);
- if (N->getValueType(0) != MVT::i16 || Op.getValueType() != MVT::f16)
+ if (N->getValueType(0) != MVT::i16 ||
+ (Op.getValueType() != MVT::f16 && Op.getValueType() != MVT::bf16))
return;
Op = SDValue(
@@ -12759,6 +14286,40 @@ static std::pair<SDValue, SDValue> splitInt128(SDValue N, SelectionDAG &DAG) {
return std::make_pair(Lo, Hi);
}
+void AArch64TargetLowering::ReplaceExtractSubVectorResults(
+ SDNode *N, SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ SDValue In = N->getOperand(0);
+ EVT InVT = In.getValueType();
+
+ // Common code will handle these just fine.
+ if (!InVT.isScalableVector() || !InVT.isInteger())
+ return;
+
+ SDLoc DL(N);
+ EVT VT = N->getValueType(0);
+
+ // The following checks bail if this is not a halving operation.
+
+ ElementCount ResEC = VT.getVectorElementCount();
+
+ if (InVT.getVectorElementCount().Min != (ResEC.Min * 2))
+ return;
+
+ auto *CIndex = dyn_cast<ConstantSDNode>(N->getOperand(1));
+ if (!CIndex)
+ return;
+
+ unsigned Index = CIndex->getZExtValue();
+ if ((Index != 0) && (Index != ResEC.Min))
+ return;
+
+ unsigned Opcode = (Index == 0) ? AArch64ISD::UUNPKLO : AArch64ISD::UUNPKHI;
+ EVT ExtendedHalfVT = VT.widenIntegerVectorElementType(*DAG.getContext());
+
+ SDValue Half = DAG.getNode(Opcode, DL, ExtendedHalfVT, N->getOperand(0));
+ Results.push_back(DAG.getNode(ISD::TRUNCATE, DL, VT, Half));
+}
+
// Create an even/odd pair of X registers holding integer value V.
static SDValue createGPRPairNode(SelectionDAG &DAG, SDValue V) {
SDLoc dl(V.getNode());
@@ -12822,10 +14383,12 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
unsigned SubReg1 = AArch64::sube64, SubReg2 = AArch64::subo64;
if (DAG.getDataLayout().isBigEndian())
std::swap(SubReg1, SubReg2);
- Results.push_back(DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0)));
- Results.push_back(DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
- SDValue(CmpSwap, 0)));
+ SDValue Lo = DAG.getTargetExtractSubreg(SubReg1, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0));
+ SDValue Hi = DAG.getTargetExtractSubreg(SubReg2, SDLoc(N), MVT::i64,
+ SDValue(CmpSwap, 0));
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 1)); // Chain out
return;
}
@@ -12841,8 +14404,8 @@ static void ReplaceCMP_SWAP_128Results(SDNode *N,
MachineMemOperand *MemOp = cast<MemSDNode>(N)->getMemOperand();
DAG.setNodeMemRefs(cast<MachineSDNode>(CmpSwap), {MemOp});
- Results.push_back(SDValue(CmpSwap, 0));
- Results.push_back(SDValue(CmpSwap, 1));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i128,
+ SDValue(CmpSwap, 0), SDValue(CmpSwap, 1)));
Results.push_back(SDValue(CmpSwap, 3));
}
@@ -12862,6 +14425,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.push_back(LowerVECREDUCE(SDValue(N, 0), DAG));
return;
+ case ISD::CTPOP:
+ Results.push_back(LowerCTPOP(SDValue(N, 0), DAG));
+ return;
case AArch64ISD::SADDV:
ReplaceReductionResults(N, Results, DAG, ISD::ADD, AArch64ISD::SADDV);
return;
@@ -12909,6 +14475,9 @@ void AArch64TargetLowering::ReplaceNodeResults(
Results.append({Pair, Result.getValue(2) /* Chain */});
return;
}
+ case ISD::EXTRACT_SUBVECTOR:
+ ReplaceExtractSubVectorResults(N, Results, DAG);
+ return;
case ISD::INTRINSIC_WO_CHAIN: {
EVT VT = N->getValueType(0);
assert((VT == MVT::i8 || VT == MVT::i16) &&
@@ -13019,7 +14588,7 @@ AArch64TargetLowering::shouldExpandAtomicCmpXchgInIR(
// on the stack and close enough to the spill slot, this can lead to a
// situation where the monitor always gets cleared and the atomic operation
// can never succeed. So at -O0 we need a late-expanded pseudo-inst instead.
- if (getTargetMachine().getOptLevel() == 0)
+ if (getTargetMachine().getOptLevel() == CodeGenOpt::None)
return AtomicExpansionKind::None;
return AtomicExpansionKind::LLSC;
}
@@ -13278,8 +14847,7 @@ bool AArch64TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -13309,3 +14877,280 @@ void AArch64TargetLowering::finalizeLowering(MachineFunction &MF) const {
bool AArch64TargetLowering::needsFixedCatchObjects() const {
return false;
}
+
+bool AArch64TargetLowering::shouldLocalize(
+ const MachineInstr &MI, const TargetTransformInfo *TTI) const {
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_GLOBAL_VALUE: {
+ // On Darwin, TLS global vars get selected into function calls, which
+ // we don't want localized, as they can get moved into the middle of
+ // another call sequence.
+ const GlobalValue &GV = *MI.getOperand(1).getGlobal();
+ if (GV.isThreadLocal() && Subtarget->isTargetMachO())
+ return false;
+ break;
+ }
+ // If we legalized G_GLOBAL_VALUE into ADRP + G_ADD_LOW, mark both as being
+ // localizable.
+ case AArch64::ADRP:
+ case AArch64::G_ADD_LOW:
+ return true;
+ default:
+ break;
+ }
+ return TargetLoweringBase::shouldLocalize(MI, TTI);
+}
+
+bool AArch64TargetLowering::fallBackToDAGISel(const Instruction &Inst) const {
+ if (isa<ScalableVectorType>(Inst.getType()))
+ return true;
+
+ for (unsigned i = 0; i < Inst.getNumOperands(); ++i)
+ if (isa<ScalableVectorType>(Inst.getOperand(i)->getType()))
+ return true;
+
+ return false;
+}
+
+// Return the largest legal scalable vector type that matches VT's element type.
+static EVT getContainerForFixedLengthVector(SelectionDAG &DAG, EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE container");
+ case MVT::i8:
+ return EVT(MVT::nxv16i8);
+ case MVT::i16:
+ return EVT(MVT::nxv8i16);
+ case MVT::i32:
+ return EVT(MVT::nxv4i32);
+ case MVT::i64:
+ return EVT(MVT::nxv2i64);
+ case MVT::f16:
+ return EVT(MVT::nxv8f16);
+ case MVT::f32:
+ return EVT(MVT::nxv4f32);
+ case MVT::f64:
+ return EVT(MVT::nxv2f64);
+ }
+}
+
+// Return a PTRUE with active lanes corresponding to the extent of VT.
+static SDValue getPredicateForFixedLengthVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isFixedLengthVector() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal fixed length vector!");
+
+ int PgPattern;
+ switch (VT.getVectorNumElements()) {
+ default:
+ llvm_unreachable("unexpected element count for SVE predicate");
+ case 1:
+ PgPattern = AArch64SVEPredPattern::vl1;
+ break;
+ case 2:
+ PgPattern = AArch64SVEPredPattern::vl2;
+ break;
+ case 4:
+ PgPattern = AArch64SVEPredPattern::vl4;
+ break;
+ case 8:
+ PgPattern = AArch64SVEPredPattern::vl8;
+ break;
+ case 16:
+ PgPattern = AArch64SVEPredPattern::vl16;
+ break;
+ case 32:
+ PgPattern = AArch64SVEPredPattern::vl32;
+ break;
+ case 64:
+ PgPattern = AArch64SVEPredPattern::vl64;
+ break;
+ case 128:
+ PgPattern = AArch64SVEPredPattern::vl128;
+ break;
+ case 256:
+ PgPattern = AArch64SVEPredPattern::vl256;
+ break;
+ }
+
+ // TODO: For vectors that are exactly getMaxSVEVectorSizeInBits big, we can
+ // use AArch64SVEPredPattern::all, which can enable the use of unpredicated
+ // variants of instructions when available.
+
+ MVT MaskVT;
+ switch (VT.getVectorElementType().getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unexpected element type for SVE predicate");
+ case MVT::i8:
+ MaskVT = MVT::nxv16i1;
+ break;
+ case MVT::i16:
+ case MVT::f16:
+ MaskVT = MVT::nxv8i1;
+ break;
+ case MVT::i32:
+ case MVT::f32:
+ MaskVT = MVT::nxv4i1;
+ break;
+ case MVT::i64:
+ case MVT::f64:
+ MaskVT = MVT::nxv2i1;
+ break;
+ }
+
+ return DAG.getNode(AArch64ISD::PTRUE, DL, MaskVT,
+ DAG.getTargetConstant(PgPattern, DL, MVT::i64));
+}
+
+static SDValue getPredicateForScalableVector(SelectionDAG &DAG, SDLoc &DL,
+ EVT VT) {
+ assert(VT.isScalableVector() && DAG.getTargetLoweringInfo().isTypeLegal(VT) &&
+ "Expected legal scalable vector!");
+ auto PredTy = VT.changeVectorElementType(MVT::i1);
+ return getPTrue(DAG, DL, PredTy, AArch64SVEPredPattern::all);
+}
+
+static SDValue getPredicateForVector(SelectionDAG &DAG, SDLoc &DL, EVT VT) {
+ if (VT.isFixedLengthVector())
+ return getPredicateForFixedLengthVector(DAG, DL, VT);
+
+ return getPredicateForScalableVector(DAG, DL, VT);
+}
+
+// Grow V to consume an entire SVE register.
+static SDValue convertToScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isScalableVector() &&
+ "Expected to convert into a scalable vector!");
+ assert(V.getValueType().isFixedLengthVector() &&
+ "Expected a fixed length vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, DAG.getUNDEF(VT), V, Zero);
+}
+
+// Shrink V so it's just big enough to maintain a VT's worth of data.
+static SDValue convertFromScalableVector(SelectionDAG &DAG, EVT VT, SDValue V) {
+ assert(VT.isFixedLengthVector() &&
+ "Expected to convert into a fixed length vector!");
+ assert(V.getValueType().isScalableVector() &&
+ "Expected a scalable vector operand!");
+ SDLoc DL(V);
+ SDValue Zero = DAG.getConstant(0, DL, MVT::i64);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, V, Zero);
+}
+
+// Convert all fixed length vector loads larger than NEON to masked_loads.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorLoadToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Load = cast<LoadSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewLoad = DAG.getMaskedLoad(
+ ContainerVT, DL, Load->getChain(), Load->getBasePtr(), Load->getOffset(),
+ getPredicateForFixedLengthVector(DAG, DL, VT), DAG.getUNDEF(ContainerVT),
+ Load->getMemoryVT(), Load->getMemOperand(), Load->getAddressingMode(),
+ Load->getExtensionType());
+
+ auto Result = convertFromScalableVector(DAG, VT, NewLoad);
+ SDValue MergedValues[2] = {Result, Load->getChain()};
+ return DAG.getMergeValues(MergedValues, DL);
+}
+
+// Convert all fixed length vector stores larger than NEON to masked_stores.
+SDValue AArch64TargetLowering::LowerFixedLengthVectorStoreToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ auto Store = cast<StoreSDNode>(Op);
+
+ SDLoc DL(Op);
+ EVT VT = Store->getValue().getValueType();
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ auto NewValue = convertToScalableVector(DAG, ContainerVT, Store->getValue());
+ return DAG.getMaskedStore(
+ Store->getChain(), DL, NewValue, Store->getBasePtr(), Store->getOffset(),
+ getPredicateForFixedLengthVector(DAG, DL, VT), Store->getMemoryVT(),
+ Store->getMemOperand(), Store->getAddressingMode(),
+ Store->isTruncatingStore());
+}
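
A conceptual sketch of the effect of these two hooks, not part of the patch and assuming an SVE register wide enough that v8i32 is handled via SVE (the value names are illustrative, not from the source): a plain fixed-length store becomes a masked store on the container type, gated by a constant-VL predicate.

    // store <8 x i32> %v, <8 x i32>* %p   is rewritten, in DAG terms, as roughly:
    //   %pg   = PTRUE nxv4i1, vl8                        ; predicate with exactly 8 active lanes
    //   %wide = INSERT_SUBVECTOR undef(nxv4i32), %v, 0   ; grow %v to the SVE container type
    //   masked_store %wide -> %p, mask %pg               ; inactive lanes are never written

Loads are handled symmetrically: a masked load with an undef passthru on the container type, followed by an extract back to the fixed-length type.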
+
+SDValue AArch64TargetLowering::LowerFixedLengthVectorTruncateToSVE(
+ SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ assert(VT.isFixedLengthVector() && "Expected fixed length vector type!");
+
+ SDLoc DL(Op);
+ SDValue Val = Op.getOperand(0);
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, Val.getValueType());
+ Val = convertToScalableVector(DAG, ContainerVT, Val);
+
+ // Repeatedly truncate Val until the result is of the desired element type.
+ switch (ContainerVT.getSimpleVT().SimpleTy) {
+ default:
+ llvm_unreachable("unimplemented container type");
+ case MVT::nxv2i64:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv4i32, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv4i32, Val, Val);
+ if (VT.getVectorElementType() == MVT::i32)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv4i32:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv8i16, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv8i16, Val, Val);
+ if (VT.getVectorElementType() == MVT::i16)
+ break;
+ LLVM_FALLTHROUGH;
+ case MVT::nxv8i16:
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::nxv16i8, Val);
+ Val = DAG.getNode(AArch64ISD::UZP1, DL, MVT::nxv16i8, Val, Val);
+ assert(VT.getVectorElementType() == MVT::i8 && "Unexpected element type!");
+ break;
+ }
+
+ return convertFromScalableVector(DAG, VT, Val);
+}
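
A hedged walk-through of the cascade above, not part of the patch: for a fixed-length v8i16 value truncated to v8i8 on a little-endian target, the steps are

    // Assumed trace for VT = v8i8, source value of type v8i16:
    //   widen   : INSERT_SUBVECTOR undef(nxv8i16), Val, 0   ; pad into the SVE container
    //   bitcast : nxv8i16 -> nxv16i8                        ; each i16 lane becomes two bytes
    //   UZP1    : nxv16i8 = UZP1(Val, Val)                  ; keep even bytes, i.e. the low byte of each i16
    //   narrow  : EXTRACT_SUBVECTOR v8i8, result, 0         ; take the leading fixed-length part

Each additional level in the switch simply repeats the bitcast-and-UZP1 step until the element width matches the destination type.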
+
+SDValue AArch64TargetLowering::LowerToPredicatedOp(SDValue Op,
+ SelectionDAG &DAG,
+ unsigned NewOp) const {
+ EVT VT = Op.getValueType();
+ SDLoc DL(Op);
+ auto Pg = getPredicateForVector(DAG, DL, VT);
+
+ if (useSVEForFixedLengthVectorVT(VT)) {
+ EVT ContainerVT = getContainerForFixedLengthVector(DAG, VT);
+
+ // Create list of operands by converting existing ones to scalable types.
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ if (isa<CondCodeSDNode>(V)) {
+ Operands.push_back(V);
+ continue;
+ }
+
+ assert(useSVEForFixedLengthVectorVT(V.getValueType()) &&
+ "Only fixed length vectors are supported!");
+ Operands.push_back(convertToScalableVector(DAG, ContainerVT, V));
+ }
+
+ auto ScalableRes = DAG.getNode(NewOp, DL, ContainerVT, Operands);
+ return convertFromScalableVector(DAG, VT, ScalableRes);
+ }
+
+ assert(VT.isScalableVector() && "Only expect to lower scalable vector op!");
+
+ SmallVector<SDValue, 4> Operands = {Pg};
+ for (const SDValue &V : Op->op_values()) {
+ assert((isa<CondCodeSDNode>(V) || V.getValueType().isScalableVector()) &&
+ "Only scalable vectors are supported!");
+ Operands.push_back(V);
+ }
+
+ return DAG.getNode(NewOp, DL, VT, Operands);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
index 672dfc4fcbc0..4fe77481706b 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h
@@ -25,6 +25,26 @@ namespace llvm {
namespace AArch64ISD {
+// For predicated nodes where the result is a vector, the operation is
+// controlled by a governing predicate and the inactive lanes are explicitly
+// defined with a value, please stick to the following naming convention:
+//
+// _MERGE_OP<n> The result value is a vector with inactive lanes equal
+// to source operand OP<n>.
+//
+// _MERGE_ZERO The result value is a vector with inactive lanes
+// actively zeroed.
+//
+// _MERGE_PASSTHRU The result value is a vector with inactive lanes equal
+// to the last source operand, whose only purpose is being
+// a passthru value.
+//
+// For other cases where no explicit action is needed to set the inactive lanes,
+// or when the result is not a vector and it is needed or helpful to
+// distinguish a node from similar unpredicated nodes, use:
+//
+// _PRED
+//
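
For illustration only, and not part of the patch itself: a minimal reading of the convention above, using opcode names that this diff introduces; the glosses are inferred from the comment block rather than taken from the source.

    // Assumed glosses, based on the suffix rules stated above:
    //   GLD1_MERGE_ZERO      gather load; inactive lanes of the result are zeroed.
    //   DUP_MERGE_PASSTHRU   inactive lanes take the value of the trailing passthru operand.
    //   SMIN_MERGE_OP1       inactive lanes keep the value of source operand 1.
    //   SST1_PRED            scatter store; no vector result, so the plain _PRED suffix is used.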
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
WrapperLarge, // 4-instruction MOVZ/MOVK sequence for 64-bit addresses.
@@ -52,6 +72,22 @@ enum NodeType : unsigned {
ADC,
SBC, // adc, sbc instructions
+ // Arithmetic instructions
+ ADD_PRED,
+ FADD_PRED,
+ SDIV_PRED,
+ UDIV_PRED,
+ FMA_PRED,
+ SMIN_MERGE_OP1,
+ UMIN_MERGE_OP1,
+ SMAX_MERGE_OP1,
+ UMAX_MERGE_OP1,
+ SHL_MERGE_OP1,
+ SRL_MERGE_OP1,
+ SRA_MERGE_OP1,
+
+ SETCC_MERGE_ZERO,
+
// Arithmetic instructions which write flags.
ADDS,
SUBS,
@@ -90,9 +126,9 @@ enum NodeType : unsigned {
BICi,
ORRi,
- // Vector bit select: similar to ISD::VSELECT but not all bits within an
+ // Vector bitwise select: similar to ISD::VSELECT but not all bits within an
// element must be identical.
- BSL,
+ BSP,
// Vector arithmetic negation
NEG,
@@ -121,6 +157,10 @@ enum NodeType : unsigned {
SRSHR_I,
URSHR_I,
+ // Vector shift by constant and insert
+ VSLI,
+ VSRI,
+
// Vector comparisons
CMEQ,
CMGE,
@@ -148,6 +188,10 @@ enum NodeType : unsigned {
SADDV,
UADDV,
+ // Vector rounding halving addition
+ SRHADD,
+ URHADD,
+
// Vector across-lanes min/max
// Only the lower result lane is defined.
SMINV,
@@ -166,7 +210,7 @@ enum NodeType : unsigned {
// Vector bitwise negation
NOT,
- // Vector bitwise selection
+ // Vector bitwise insertion
BIT,
// Compare-and-branch
@@ -196,8 +240,10 @@ enum NodeType : unsigned {
UMULL,
// Reciprocal estimates and steps.
- FRECPE, FRECPS,
- FRSQRTE, FRSQRTS,
+ FRECPE,
+ FRECPS,
+ FRSQRTE,
+ FRSQRTS,
SUNPKHI,
SUNPKLO,
@@ -211,35 +257,97 @@ enum NodeType : unsigned {
REV,
TBL,
+ // Floating-point reductions.
+ FADDA_PRED,
+ FADDV_PRED,
+ FMAXV_PRED,
+ FMAXNMV_PRED,
+ FMINV_PRED,
+ FMINNMV_PRED,
+
INSR,
PTEST,
PTRUE,
+ DUP_MERGE_PASSTHRU,
+ INDEX_VECTOR,
+
+ REINTERPRET_CAST,
+
+ LD1_MERGE_ZERO,
+ LD1S_MERGE_ZERO,
+ LDNF1_MERGE_ZERO,
+ LDNF1S_MERGE_ZERO,
+ LDFF1_MERGE_ZERO,
+ LDFF1S_MERGE_ZERO,
+ LD1RQ_MERGE_ZERO,
+ LD1RO_MERGE_ZERO,
+
+ // Structured loads.
+ SVE_LD2_MERGE_ZERO,
+ SVE_LD3_MERGE_ZERO,
+ SVE_LD4_MERGE_ZERO,
+
// Unsigned gather loads.
- GLD1,
- GLD1_SCALED,
- GLD1_UXTW,
- GLD1_SXTW,
- GLD1_UXTW_SCALED,
- GLD1_SXTW_SCALED,
- GLD1_IMM,
+ GLD1_MERGE_ZERO,
+ GLD1_SCALED_MERGE_ZERO,
+ GLD1_UXTW_MERGE_ZERO,
+ GLD1_SXTW_MERGE_ZERO,
+ GLD1_UXTW_SCALED_MERGE_ZERO,
+ GLD1_SXTW_SCALED_MERGE_ZERO,
+ GLD1_IMM_MERGE_ZERO,
// Signed gather loads
- GLD1S,
- GLD1S_SCALED,
- GLD1S_UXTW,
- GLD1S_SXTW,
- GLD1S_UXTW_SCALED,
- GLD1S_SXTW_SCALED,
- GLD1S_IMM,
+ GLD1S_MERGE_ZERO,
+ GLD1S_SCALED_MERGE_ZERO,
+ GLD1S_UXTW_MERGE_ZERO,
+ GLD1S_SXTW_MERGE_ZERO,
+ GLD1S_UXTW_SCALED_MERGE_ZERO,
+ GLD1S_SXTW_SCALED_MERGE_ZERO,
+ GLD1S_IMM_MERGE_ZERO,
+
+ // Unsigned first-faulting gather loads.
+ GLDFF1_MERGE_ZERO,
+ GLDFF1_SCALED_MERGE_ZERO,
+ GLDFF1_UXTW_MERGE_ZERO,
+ GLDFF1_SXTW_MERGE_ZERO,
+ GLDFF1_UXTW_SCALED_MERGE_ZERO,
+ GLDFF1_SXTW_SCALED_MERGE_ZERO,
+ GLDFF1_IMM_MERGE_ZERO,
+
+ // Signed first-faulting gather loads.
+ GLDFF1S_MERGE_ZERO,
+ GLDFF1S_SCALED_MERGE_ZERO,
+ GLDFF1S_UXTW_MERGE_ZERO,
+ GLDFF1S_SXTW_MERGE_ZERO,
+ GLDFF1S_UXTW_SCALED_MERGE_ZERO,
+ GLDFF1S_SXTW_SCALED_MERGE_ZERO,
+ GLDFF1S_IMM_MERGE_ZERO,
+
+ // Non-temporal gather loads
+ GLDNT1_MERGE_ZERO,
+ GLDNT1_INDEX_MERGE_ZERO,
+ GLDNT1S_MERGE_ZERO,
+
+ // Contiguous masked store.
+ ST1_PRED,
+
// Scatter store
- SST1,
- SST1_SCALED,
- SST1_UXTW,
- SST1_SXTW,
- SST1_UXTW_SCALED,
- SST1_SXTW_SCALED,
- SST1_IMM,
+ SST1_PRED,
+ SST1_SCALED_PRED,
+ SST1_UXTW_PRED,
+ SST1_SXTW_PRED,
+ SST1_UXTW_SCALED_PRED,
+ SST1_SXTW_SCALED_PRED,
+ SST1_IMM_PRED,
+
+ // Non-temporal scatter store
+ SSTNT1_PRED,
+ SSTNT1_INDEX_PRED,
+
+ // Strict (exception-raising) floating point comparison
+ STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCMPE,
// NEON Load/Store with post-increment base updates
LD2post = ISD::FIRST_TARGET_MEMORY_OPCODE,
@@ -272,7 +380,8 @@ enum NodeType : unsigned {
STZ2G,
LDP,
- STP
+ STP,
+ STNP
};
} // end namespace AArch64ISD
@@ -321,7 +430,8 @@ public:
return MVT::getIntegerVT(64);
}
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
MVT getScalarShiftAmountTy(const DataLayout &DL, EVT) const override;
@@ -333,9 +443,10 @@ public:
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *Fast = nullptr) const override;
/// LLT variant.
- bool allowsMisalignedMemoryAccesses(
- LLT Ty, unsigned AddrSpace, unsigned Align, MachineMemOperand::Flags Flags,
- bool *Fast = nullptr) const override;
+ bool allowsMisalignedMemoryAccesses(LLT Ty, unsigned AddrSpace,
+ Align Alignment,
+ MachineMemOperand::Flags Flags,
+ bool *Fast = nullptr) const override;
/// Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
@@ -376,9 +487,6 @@ public:
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *
EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *MBB) const override;
@@ -402,7 +510,7 @@ public:
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
- bool hasPairedLoad(EVT LoadedType, unsigned &RequiredAligment) const override;
+ bool hasPairedLoad(EVT LoadedType, Align &RequiredAligment) const override;
unsigned getMaxSupportedInterleaveFactor() const override { return 4; }
@@ -418,13 +526,11 @@ public:
bool shouldConsiderGEPOffsetSplit() const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
- LLT getOptimalMemOpLLT(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const override;
+ LLT getOptimalMemOpLLT(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
/// Return true if the addressing mode represented by AM is legal for this
/// target, for a load/store of the specified type.
@@ -463,6 +569,13 @@ public:
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ // Using overflow ops for overflow checks only should be beneficial on
+ // AArch64.
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
+ }
+
Value *emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
AtomicOrdering Ord) const override;
Value *emitStoreConditional(IRBuilder<> &Builder, Value *Val,
@@ -497,7 +610,7 @@ public:
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
// FIXME: This is a guess. Has this been defined yet?
return AArch64::X0;
@@ -505,7 +618,7 @@ public:
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
// FIXME: This is a guess. Has this been defined yet?
return AArch64::X1;
@@ -611,13 +724,27 @@ public:
unsigned getNumInterleavedAccesses(VectorType *VecTy,
const DataLayout &DL) const;
- MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+ MachineMemOperand::Flags getTargetMMOFlags(
+ const Instruction &I) const override;
bool functionArgumentNeedsConsecutiveRegisters(Type *Ty,
CallingConv::ID CallConv,
bool isVarArg) const override;
/// Used for exception handling on Win64.
bool needsFixedCatchObjects() const override;
+
+ bool fallBackToDAGISel(const Instruction &Inst) const override;
+
+ /// SVE code generation for fixed length vectors does not custom lower
+ /// BUILD_VECTOR. This makes BUILD_VECTOR legalisation a source of stores to
+ /// merge. However, merging them creates a BUILD_VECTOR that is just as
+ /// illegal as the original, thus leading to an infinite legalisation loop.
+ /// NOTE: Once BUILD_VECTOR is legal or can be custom lowered for all legal
+ /// vector types this override can be removed.
+ bool mergeStoresAfterLegalization(EVT VT) const override {
+ return !useSVEForFixedLengthVectors();
+ }
+
private:
/// Keep a pointer to the AArch64Subtarget around so that we can
/// make the right decision when generating code for different targets.
@@ -626,6 +753,7 @@ private:
bool isExtFreeImpl(const Instruction *Ext) const override;
void addTypeForNEON(MVT VT, MVT PromotedBitwiseVT);
+ void addTypeForFixedLengthSVE(MVT VT);
void addDRTypeForNEON(MVT VT);
void addQRTypeForNEON(MVT VT);
@@ -729,7 +857,11 @@ private:
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSPLAT_VECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDUPQLane(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToPredicatedOp(SDValue Op, SelectionDAG &DAG,
+ unsigned NewOp) const;
SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerINSERT_SUBVECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorSRA_SRL_SHL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShiftRightParts(SDValue Op, SelectionDAG &DAG) const;
@@ -746,6 +878,8 @@ private:
SDValue LowerVectorOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCONCAT_VECTORS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVSCALE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVECREDUCE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_SUB(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_LOAD_AND(SDValue Op, SelectionDAG &DAG) const;
@@ -753,6 +887,13 @@ private:
SDValue LowerWindowsDYNAMIC_STACKALLOC(SDValue Op, SDValue Chain,
SDValue &Size,
SelectionDAG &DAG) const;
+ SDValue LowerSVEStructLoad(unsigned Intrinsic, ArrayRef<SDValue> LoadOps,
+ EVT VT, SelectionDAG &DAG, const SDLoc &DL) const;
+
+ SDValue LowerFixedLengthVectorLoadToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorStoreToSVE(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFixedLengthVectorTruncateToSVE(SDValue Op,
+ SelectionDAG &DAG) const;
SDValue BuildSDIVPow2(SDNode *N, const APInt &Divisor, SelectionDAG &DAG,
SmallVectorImpl<SDNode *> &Created) const override;
@@ -807,10 +948,19 @@ private:
void ReplaceNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const override;
+ void ReplaceExtractSubVectorResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
bool shouldNormalizeToSelectSequence(LLVMContext &, EVT) const override;
void finalizeLowering(MachineFunction &MF) const override;
+
+ bool shouldLocalize(const MachineInstr &MI,
+ const TargetTransformInfo *TTI) const override;
+
+ bool useSVEForFixedLengthVectors() const;
+ bool useSVEForFixedLengthVectorVT(EVT VT) const;
};
namespace AArch64 {
diff --git a/llvm/lib/Target/AArch64/AArch64InstrFormats.td b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
index c3efe03a0987..6df7970f4d82 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrFormats.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrFormats.td
@@ -20,6 +20,30 @@ class Format<bits<2> val> {
def PseudoFrm : Format<0>;
def NormalFrm : Format<1>; // Do we need any others?
+// Enum describing whether an instruction is
+// destructive in its first source operand.
+class DestructiveInstTypeEnum<bits<4> val> {
+ bits<4> Value = val;
+}
+def NotDestructive : DestructiveInstTypeEnum<0>;
+// Destructive in its first operand and can be MOVPRFX'd, but has no other
+// special properties.
+def DestructiveOther : DestructiveInstTypeEnum<1>;
+def DestructiveUnary : DestructiveInstTypeEnum<2>;
+def DestructiveBinaryImm : DestructiveInstTypeEnum<3>;
+def DestructiveBinaryShImmUnpred : DestructiveInstTypeEnum<4>;
+def DestructiveBinary : DestructiveInstTypeEnum<5>;
+def DestructiveBinaryComm : DestructiveInstTypeEnum<6>;
+def DestructiveBinaryCommWithRev : DestructiveInstTypeEnum<7>;
+def DestructiveTernaryCommWithRev : DestructiveInstTypeEnum<8>;
+
+class FalseLanesEnum<bits<2> val> {
+ bits<2> Value = val;
+}
+def FalseLanesNone : FalseLanesEnum<0>;
+def FalseLanesZero : FalseLanesEnum<1>;
+def FalseLanesUndef : FalseLanesEnum<2>;
+
// AArch64 Instruction Format
class AArch64Inst<Format f, string cstr> : Instruction {
field bits<32> Inst; // Instruction encoding.
@@ -34,6 +58,16 @@ class AArch64Inst<Format f, string cstr> : Instruction {
let Namespace = "AArch64";
Format F = f;
bits<2> Form = F.Value;
+
+ // Defaults
+ FalseLanesEnum FalseLanes = FalseLanesNone;
+ DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
+ ElementSizeEnum ElementSize = ElementSizeNone;
+
+ let TSFlags{8-7} = FalseLanes.Value;
+ let TSFlags{6-3} = DestructiveInstType.Value;
+ let TSFlags{2-0} = ElementSize.Value;
+
let Pattern = [];
let Constraints = cstr;
}
@@ -48,6 +82,7 @@ class Pseudo<dag oops, dag iops, list<dag> pattern, string cstr = "">
dag InOperandList = iops;
let Pattern = pattern;
let isCodeGenOnly = 1;
+ let isPseudo = 1;
}
// Real instructions (have encoding information)
@@ -56,14 +91,6 @@ class EncodedI<string cstr, list<dag> pattern> : AArch64Inst<NormalFrm, cstr> {
let Size = 4;
}
-// Enum describing whether an instruction is
-// destructive in its first source operand.
-class DestructiveInstTypeEnum<bits<1> val> {
- bits<1> Value = val;
-}
-def NotDestructive : DestructiveInstTypeEnum<0>;
-def Destructive : DestructiveInstTypeEnum<1>;
-
// Normal instructions
class I<dag oops, dag iops, string asm, string operands, string cstr,
list<dag> pattern>
@@ -71,13 +98,6 @@ class I<dag oops, dag iops, string asm, string operands, string cstr,
dag OutOperandList = oops;
dag InOperandList = iops;
let AsmString = !strconcat(asm, operands);
-
- // Destructive operations (SVE)
- DestructiveInstTypeEnum DestructiveInstType = NotDestructive;
- ElementSizeEnum ElementSize = ElementSizeB;
-
- let TSFlags{3} = DestructiveInstType.Value;
- let TSFlags{2-0} = ElementSize.Value;
}
class TriOpFrag<dag res> : PatFrag<(ops node:$LHS, node:$MHS, node:$RHS), res>;
@@ -327,6 +347,18 @@ def simm5_32b : Operand<i32>, ImmLeaf<i32, [{ return Imm >= -16 && Imm < 16; }]>
let DecoderMethod = "DecodeSImm<5>";
}
+def simm5_8b : Operand<i32>, ImmLeaf<i32, [{ return (int8_t)Imm >= -16 && (int8_t)Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
+ let PrintMethod = "printSImm<8>";
+}
+
+def simm5_16b : Operand<i32>, ImmLeaf<i32, [{ return (int16_t)Imm >= -16 && (int16_t)Imm < 16; }]> {
+ let ParserMatchClass = SImm5Operand;
+ let DecoderMethod = "DecodeSImm<5>";
+ let PrintMethod = "printSImm<16>";
+}
+
// simm7sN predicate - True if the immediate is a multiple of N in the range
// [-64 * N, 63 * N].
@@ -349,6 +381,8 @@ def simm7s16 : Operand<i32> {
let PrintMethod = "printImmScale<16>";
}
+def am_sve_fi : ComplexPattern<i64, 2, "SelectAddrModeFrameIndexSVE", []>;
+
def am_indexed7s8 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S8", []>;
def am_indexed7s16 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S16", []>;
def am_indexed7s32 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S32", []>;
@@ -358,6 +392,9 @@ def am_indexed7s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexed7S128", []>;
def am_indexedu6s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedU6S128", []>;
def am_indexeds9s128 : ComplexPattern<i64, 2, "SelectAddrModeIndexedS9S128", []>;
+def UImmS1XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i64);
+}]>;
def UImmS2XForm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue() / 2, SDLoc(N), MVT::i64);
}]>;
@@ -446,6 +483,19 @@ def uimm6s16 : Operand<i64>, ImmLeaf<i64,
let ParserMatchClass = UImm6s16Operand;
}
+def SImmS2XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 2, SDLoc(N), MVT::i64);
+}]>;
+def SImmS3XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 3, SDLoc(N), MVT::i64);
+}]>;
+def SImmS4XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 4, SDLoc(N), MVT::i64);
+}]>;
+def SImmS16XForm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue() / 16, SDLoc(N), MVT::i64);
+}]>;
+
// simm6sN predicate - True if the immediate is a multiple of N in the range
// [-32 * N, 31 * N].
def SImm6s1Operand : SImmScaledMemoryIndexed<6, 1>;
@@ -461,6 +511,7 @@ def SImm4s2Operand : SImmScaledMemoryIndexed<4, 2>;
def SImm4s3Operand : SImmScaledMemoryIndexed<4, 3>;
def SImm4s4Operand : SImmScaledMemoryIndexed<4, 4>;
def SImm4s16Operand : SImmScaledMemoryIndexed<4, 16>;
+def SImm4s32Operand : SImmScaledMemoryIndexed<4, 32>;
def simm4s1 : Operand<i64>, ImmLeaf<i64,
[{ return Imm >=-8 && Imm <= 7; }]> {
@@ -469,31 +520,37 @@ def simm4s1 : Operand<i64>, ImmLeaf<i64,
}
def simm4s2 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }]> {
+[{ return Imm >=-16 && Imm <= 14 && (Imm % 2) == 0x0; }], SImmS2XForm> {
let PrintMethod = "printImmScale<2>";
let ParserMatchClass = SImm4s2Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s3 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }]> {
+[{ return Imm >=-24 && Imm <= 21 && (Imm % 3) == 0x0; }], SImmS3XForm> {
let PrintMethod = "printImmScale<3>";
let ParserMatchClass = SImm4s3Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s4 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }]> {
+[{ return Imm >=-32 && Imm <= 28 && (Imm % 4) == 0x0; }], SImmS4XForm> {
let PrintMethod = "printImmScale<4>";
let ParserMatchClass = SImm4s4Operand;
let DecoderMethod = "DecodeSImm<4>";
}
def simm4s16 : Operand<i64>, ImmLeaf<i64,
-[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }]> {
+[{ return Imm >=-128 && Imm <= 112 && (Imm % 16) == 0x0; }], SImmS16XForm> {
let PrintMethod = "printImmScale<16>";
let ParserMatchClass = SImm4s16Operand;
let DecoderMethod = "DecodeSImm<4>";
}
+def simm4s32 : Operand<i64>, ImmLeaf<i64,
+[{ return Imm >=-256 && Imm <= 224 && (Imm % 32) == 0x0; }]> {
+ let PrintMethod = "printImmScale<32>";
+ let ParserMatchClass = SImm4s32Operand;
+ let DecoderMethod = "DecodeSImm<4>";
+}
def Imm1_8Operand : AsmImmRange<1, 8>;
def Imm1_16Operand : AsmImmRange<1, 16>;
@@ -647,6 +704,13 @@ def tvecshiftR32 : Operand<i32>, TImmLeaf<i32, [{
let DecoderMethod = "DecodeVecShiftR32Imm";
let ParserMatchClass = Imm1_32Operand;
}
+def tvecshiftR64 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) > 0) && (((uint32_t)Imm) < 65);
+}]> {
+ let EncoderMethod = "getVecShiftR64OpValue";
+ let DecoderMethod = "DecodeVecShiftR64Imm";
+ let ParserMatchClass = Imm1_64Operand;
+}
def Imm0_1Operand : AsmImmRange<0, 1>;
def Imm0_7Operand : AsmImmRange<0, 7>;
@@ -683,6 +747,36 @@ def vecshiftL64 : Operand<i32>, ImmLeaf<i32, [{
let ParserMatchClass = Imm0_63Operand;
}
+// Same as vecshiftL#N, but use TargetConstant (TimmLeaf) instead of Constant
+// (ImmLeaf)
+def tvecshiftL8 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 8);
+}]> {
+ let EncoderMethod = "getVecShiftL8OpValue";
+ let DecoderMethod = "DecodeVecShiftL8Imm";
+ let ParserMatchClass = Imm0_7Operand;
+}
+def tvecshiftL16 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 16);
+}]> {
+ let EncoderMethod = "getVecShiftL16OpValue";
+ let DecoderMethod = "DecodeVecShiftL16Imm";
+ let ParserMatchClass = Imm0_15Operand;
+}
+def tvecshiftL32 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 32);
+}]> {
+ let EncoderMethod = "getVecShiftL32OpValue";
+ let DecoderMethod = "DecodeVecShiftL32Imm";
+ let ParserMatchClass = Imm0_31Operand;
+}
+def tvecshiftL64 : Operand<i32>, TImmLeaf<i32, [{
+ return (((uint32_t)Imm) < 64);
+}]> {
+ let EncoderMethod = "getVecShiftL64OpValue";
+ let DecoderMethod = "DecodeVecShiftL64Imm";
+ let ParserMatchClass = Imm0_63Operand;
+}
// Crazy immediate formats used by 32-bit and 64-bit logical immediate
// instructions for splatting repeating bit patterns across the immediate.
@@ -796,7 +890,7 @@ def imm0_31 : Operand<i64>, ImmLeaf<i64, [{
}
// timm0_31 predicate - same as imm0_31, but use TargetConstant (TimmLeaf)
-// instead of Contant (ImmLeaf)
+// instead of Constant (ImmLeaf)
def timm0_31 : Operand<i64>, TImmLeaf<i64, [{
return ((uint64_t)Imm) < 32;
}]> {
@@ -832,7 +926,7 @@ def imm0_7 : Operand<i64>, ImmLeaf<i64, [{
}
// imm32_0_7 predicate - True if the 32-bit immediate is in the range [0,7]
-def imm32_0_7 : Operand<i32>, ImmLeaf<i32, [{
+def imm32_0_7 : Operand<i32>, TImmLeaf<i32, [{
return ((uint32_t)Imm) < 8;
}]> {
let ParserMatchClass = Imm0_7Operand;
@@ -1091,29 +1185,44 @@ class AsmVectorIndex<int Min, int Max, string NamePrefix=""> : AsmOperandClass {
let RenderMethod = "addVectorIndexOperands";
}
-class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc, code pred>
- : Operand<ty>, ImmLeaf<ty, pred> {
+class AsmVectorIndexOpnd<ValueType ty, AsmOperandClass mc>
+ : Operand<ty> {
let ParserMatchClass = mc;
let PrintMethod = "printVectorIndex";
}
+multiclass VectorIndex<ValueType ty, AsmOperandClass mc, code pred> {
+ def "" : AsmVectorIndexOpnd<ty, mc>, ImmLeaf<ty, pred>;
+ def _timm : AsmVectorIndexOpnd<ty, mc>, TImmLeaf<ty, pred>;
+}
+
def VectorIndex1Operand : AsmVectorIndex<1, 1>;
def VectorIndexBOperand : AsmVectorIndex<0, 15>;
def VectorIndexHOperand : AsmVectorIndex<0, 7>;
def VectorIndexSOperand : AsmVectorIndex<0, 3>;
def VectorIndexDOperand : AsmVectorIndex<0, 1>;
-def VectorIndex1 : AsmVectorIndexOpnd<i64, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB : AsmVectorIndexOpnd<i64, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH : AsmVectorIndexOpnd<i64, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS : AsmVectorIndexOpnd<i64, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD : AsmVectorIndexOpnd<i64, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
-
-def VectorIndex132b : AsmVectorIndexOpnd<i32, VectorIndex1Operand, [{ return ((uint64_t)Imm) == 1; }]>;
-def VectorIndexB32b : AsmVectorIndexOpnd<i32, VectorIndexBOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def VectorIndexH32b : AsmVectorIndexOpnd<i32, VectorIndexHOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def VectorIndexS32b : AsmVectorIndexOpnd<i32, VectorIndexSOperand, [{ return ((uint64_t)Imm) < 4; }]>;
-def VectorIndexD32b : AsmVectorIndexOpnd<i32, VectorIndexDOperand, [{ return ((uint64_t)Imm) < 2; }]>;
+defm VectorIndex1 : VectorIndex<i64, VectorIndex1Operand,
+ [{ return ((uint64_t)Imm) == 1; }]>;
+defm VectorIndexB : VectorIndex<i64, VectorIndexBOperand,
+ [{ return ((uint64_t)Imm) < 16; }]>;
+defm VectorIndexH : VectorIndex<i64, VectorIndexHOperand,
+ [{ return ((uint64_t)Imm) < 8; }]>;
+defm VectorIndexS : VectorIndex<i64, VectorIndexSOperand,
+ [{ return ((uint64_t)Imm) < 4; }]>;
+defm VectorIndexD : VectorIndex<i64, VectorIndexDOperand,
+ [{ return ((uint64_t)Imm) < 2; }]>;
+
+defm VectorIndex132b : VectorIndex<i32, VectorIndex1Operand,
+ [{ return ((uint64_t)Imm) == 1; }]>;
+defm VectorIndexB32b : VectorIndex<i32, VectorIndexBOperand,
+ [{ return ((uint64_t)Imm) < 16; }]>;
+defm VectorIndexH32b : VectorIndex<i32, VectorIndexHOperand,
+ [{ return ((uint64_t)Imm) < 8; }]>;
+defm VectorIndexS32b : VectorIndex<i32, VectorIndexSOperand,
+ [{ return ((uint64_t)Imm) < 4; }]>;
+defm VectorIndexD32b : VectorIndex<i32, VectorIndexDOperand,
+ [{ return ((uint64_t)Imm) < 2; }]>;
def SVEVectorIndexExtDupBOperand : AsmVectorIndex<0, 63, "SVE">;
def SVEVectorIndexExtDupHOperand : AsmVectorIndex<0, 31, "SVE">;
@@ -1121,16 +1230,21 @@ def SVEVectorIndexExtDupSOperand : AsmVectorIndex<0, 15, "SVE">;
def SVEVectorIndexExtDupDOperand : AsmVectorIndex<0, 7, "SVE">;
def SVEVectorIndexExtDupQOperand : AsmVectorIndex<0, 3, "SVE">;
-def sve_elm_idx_extdup_b
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupBOperand, [{ return ((uint64_t)Imm) < 64; }]>;
-def sve_elm_idx_extdup_h
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupHOperand, [{ return ((uint64_t)Imm) < 32; }]>;
-def sve_elm_idx_extdup_s
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupSOperand, [{ return ((uint64_t)Imm) < 16; }]>;
-def sve_elm_idx_extdup_d
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupDOperand, [{ return ((uint64_t)Imm) < 8; }]>;
-def sve_elm_idx_extdup_q
- : AsmVectorIndexOpnd<i64, SVEVectorIndexExtDupQOperand, [{ return ((uint64_t)Imm) < 4; }]>;
+defm sve_elm_idx_extdup_b
+ : VectorIndex<i64, SVEVectorIndexExtDupBOperand,
+ [{ return ((uint64_t)Imm) < 64; }]>;
+defm sve_elm_idx_extdup_h
+ : VectorIndex<i64, SVEVectorIndexExtDupHOperand,
+ [{ return ((uint64_t)Imm) < 32; }]>;
+defm sve_elm_idx_extdup_s
+ : VectorIndex<i64, SVEVectorIndexExtDupSOperand,
+ [{ return ((uint64_t)Imm) < 16; }]>;
+defm sve_elm_idx_extdup_d
+ : VectorIndex<i64, SVEVectorIndexExtDupDOperand,
+ [{ return ((uint64_t)Imm) < 8; }]>;
+defm sve_elm_idx_extdup_q
+ : VectorIndex<i64, SVEVectorIndexExtDupQOperand,
+ [{ return ((uint64_t)Imm) < 4; }]>;
// 8-bit immediate for AdvSIMD where 64-bit values of the form:
// aaaaaaaa bbbbbbbb cccccccc dddddddd eeeeeeee ffffffff gggggggg hhhhhhhh
@@ -1533,6 +1647,8 @@ class BaseAuthLoad<bit M, bit W, dag oops, dag iops, string asm,
let Inst{10} = 1;
let Inst{9-5} = Rn;
let Inst{4-0} = Rt;
+
+ let DecoderMethod = "DecodeAuthLoadInstruction";
}
multiclass AuthLoad<bit M, string asm, Operand opr> {
@@ -4333,14 +4449,14 @@ multiclass FPToIntegerUnscaled<bits<2> rmode, bits<3> opcode, string asm,
SDPatternOperator OpN> {
// Unscaled half-precision to 32-bit
def UWHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR32, asm,
- [(set GPR32:$Rd, (OpN FPR16:$Rn))]> {
+ [(set GPR32:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let Predicates = [HasFullFP16];
}
// Unscaled half-precision to 64-bit
def UXHr : BaseFPToIntegerUnscaled<0b11, rmode, opcode, FPR16, GPR64, asm,
- [(set GPR64:$Rd, (OpN FPR16:$Rn))]> {
+ [(set GPR64:$Rd, (OpN (f16 FPR16:$Rn)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
}
@@ -4375,7 +4491,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
// Scaled half-precision to 32-bit
def SWHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR32,
fixedpoint_f16_i32, asm,
- [(set GPR32:$Rd, (OpN (fmul FPR16:$Rn,
+ [(set GPR32:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i32:$scale)))]> {
let Inst{31} = 0; // 32-bit GPR flag
let scale{5} = 1;
@@ -4385,7 +4501,7 @@ multiclass FPToIntegerScaled<bits<2> rmode, bits<3> opcode, string asm,
// Scaled half-precision to 64-bit
def SXHri : BaseFPToInteger<0b11, rmode, opcode, FPR16, GPR64,
fixedpoint_f16_i64, asm,
- [(set GPR64:$Rd, (OpN (fmul FPR16:$Rn,
+ [(set GPR64:$Rd, (OpN (fmul (f16 FPR16:$Rn),
fixedpoint_f16_i64:$scale)))]> {
let Inst{31} = 1; // 64-bit GPR flag
let Predicates = [HasFullFP16];
@@ -4501,7 +4617,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
// Scaled
def SWHri: BaseIntegerToFP<isUnsigned, GPR32, FPR16, fixedpoint_f16_i32, asm,
- [(set FPR16:$Rd,
+ [(set (f16 FPR16:$Rd),
(fdiv (node GPR32:$Rn),
fixedpoint_f16_i32:$scale))]> {
let Inst{31} = 0; // 32-bit GPR flag
@@ -4529,7 +4645,7 @@ multiclass IntegerToFP<bit isUnsigned, string asm, SDNode node> {
}
def SXHri: BaseIntegerToFP<isUnsigned, GPR64, FPR16, fixedpoint_f16_i64, asm,
- [(set FPR16:$Rd,
+ [(set (f16 FPR16:$Rd),
(fdiv (node GPR64:$Rn),
fixedpoint_f16_i64:$scale))]> {
let Inst{31} = 1; // 64-bit GPR flag
@@ -4702,19 +4818,19 @@ class BaseFPConversion<bits<2> type, bits<2> opcode, RegisterClass dstType,
multiclass FPConversion<string asm> {
// Double-precision to Half-precision
def HDr : BaseFPConversion<0b01, 0b11, FPR16, FPR64, asm,
- [(set FPR16:$Rd, (fpround FPR64:$Rn))]>;
+ [(set (f16 FPR16:$Rd), (any_fpround FPR64:$Rn))]>;
// Double-precision to Single-precision
def SDr : BaseFPConversion<0b01, 0b00, FPR32, FPR64, asm,
- [(set FPR32:$Rd, (fpround FPR64:$Rn))]>;
+ [(set FPR32:$Rd, (any_fpround FPR64:$Rn))]>;
// Half-precision to Double-precision
def DHr : BaseFPConversion<0b11, 0b01, FPR64, FPR16, asm,
- [(set FPR64:$Rd, (fpextend FPR16:$Rn))]>;
+ [(set FPR64:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Half-precision to Single-precision
def SHr : BaseFPConversion<0b11, 0b00, FPR32, FPR16, asm,
- [(set FPR32:$Rd, (fpextend FPR16:$Rn))]>;
+ [(set FPR32:$Rd, (fpextend (f16 FPR16:$Rn)))]>;
// Single-precision to Double-precision
def DSr : BaseFPConversion<0b00, 0b01, FPR64, FPR32, asm,
@@ -4722,7 +4838,7 @@ multiclass FPConversion<string asm> {
// Single-precision to Half-precision
def HSr : BaseFPConversion<0b00, 0b11, FPR16, FPR32, asm,
- [(set FPR16:$Rd, (fpround FPR32:$Rn))]>;
+ [(set (f16 FPR16:$Rd), (any_fpround FPR32:$Rn))]>;
}
//---
@@ -4824,7 +4940,7 @@ multiclass TwoOperandFPData<bits<4> opcode, string asm,
multiclass TwoOperandFPDataNeg<bits<4> opcode, string asm, SDNode node> {
def Hrr : BaseTwoOperandFPData<opcode, FPR16, asm,
- [(set FPR16:$Rd, (fneg (node FPR16:$Rn, (f16 FPR16:$Rm))))]> {
+ [(set (f16 FPR16:$Rd), (fneg (node (f16 FPR16:$Rn), (f16 FPR16:$Rm))))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
}
@@ -4866,7 +4982,7 @@ class BaseThreeOperandFPData<bit isNegated, bit isSub,
multiclass ThreeOperandFPData<bit isNegated, bit isSub,string asm,
SDPatternOperator node> {
def Hrrr : BaseThreeOperandFPData<isNegated, isSub, FPR16, asm,
- [(set FPR16:$Rd,
+ [(set (f16 FPR16:$Rd),
(node (f16 FPR16:$Rn), (f16 FPR16:$Rm), (f16 FPR16:$Ra)))]> {
let Inst{23-22} = 0b11; // 16-bit size flag
let Predicates = [HasFullFP16];
@@ -4928,7 +5044,7 @@ multiclass FPComparison<bit signalAllNans, string asm,
SDPatternOperator OpNode = null_frag> {
let Defs = [NZCV] in {
def Hrr : BaseTwoOperandFPComparison<signalAllNans, FPR16, asm,
- [(OpNode FPR16:$Rn, (f16 FPR16:$Rm)), (implicit NZCV)]> {
+ [(OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)), (implicit NZCV)]> {
let Inst{23-22} = 0b11;
let Predicates = [HasFullFP16];
}
@@ -5142,6 +5258,47 @@ class BaseSIMDThreeSameVectorTied<bit Q, bit U, bits<3> size, bits<5> opcode,
let Inst{4-0} = Rd;
}
+let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in
+class BaseSIMDThreeSameVectorPseudo<RegisterOperand regtype, list<dag> pattern>
+ : Pseudo<(outs regtype:$dst), (ins regtype:$Rd, regtype:$Rn, regtype:$Rm), pattern>,
+ Sched<[WriteV]>;
+
+multiclass SIMDLogicalThreeVectorPseudo<SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorPseudo<V64,
+ [(set (v8i8 V64:$dst),
+ (OpNode (v8i8 V64:$Rd), (v8i8 V64:$Rn), (v8i8 V64:$Rm)))]>;
+ def v16i8 : BaseSIMDThreeSameVectorPseudo<V128,
+ [(set (v16i8 V128:$dst),
+ (OpNode (v16i8 V128:$Rd), (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]>;
+
+ def : Pat<(v4i16 (OpNode (v4i16 V64:$LHS), (v4i16 V64:$MHS),
+ (v4i16 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v2i32 (OpNode (v2i32 V64:$LHS), (v2i32 V64:$MHS),
+ (v2i32 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+ def : Pat<(v1i64 (OpNode (v1i64 V64:$LHS), (v1i64 V64:$MHS),
+ (v1i64 V64:$RHS))),
+ (!cast<Instruction>(NAME#"v8i8")
+ V64:$LHS, V64:$MHS, V64:$RHS)>;
+
+ def : Pat<(v8i16 (OpNode (v8i16 V128:$LHS), (v8i16 V128:$MHS),
+ (v8i16 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v4i32 (OpNode (v4i32 V128:$LHS), (v4i32 V128:$MHS),
+ (v4i32 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+ def : Pat<(v2i64 (OpNode (v2i64 V128:$LHS), (v2i64 V128:$MHS),
+ (v2i64 V128:$RHS))),
+ (!cast<Instruction>(NAME#"v16i8")
+ V128:$LHS, V128:$MHS, V128:$RHS)>;
+}
+
// All operand sizes distinguished in the encoding.
multiclass SIMDThreeSameVector<bit U, bits<5> opc, string asm,
SDPatternOperator OpNode> {
@@ -5362,7 +5519,7 @@ multiclass SIMDLogicalThreeVector<bit U, bits<2> size, string asm,
}
multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
- string asm, SDPatternOperator OpNode> {
+ string asm, SDPatternOperator OpNode = null_frag> {
def v8i8 : BaseSIMDThreeSameVectorTied<0, U, {size,1}, 0b00011, V64,
asm, ".8b",
[(set (v8i8 V64:$dst),
@@ -5402,11 +5559,11 @@ multiclass SIMDLogicalThreeVectorTied<bit U, bits<2> size,
// ARMv8.2-A Dot Product Instructions (Vector): These instructions extract
// bytes from S-sized elements.
-class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
+class BaseSIMDThreeSameVectorDot<bit Q, bit U, bit Mixed, string asm, string kind1,
string kind2, RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
- BaseSIMDThreeSameVectorTied<Q, U, 0b100, 0b10010, RegType, asm, kind1,
+ BaseSIMDThreeSameVectorTied<Q, U, 0b100, {0b1001, Mixed}, RegType, asm, kind1,
[(set (AccumType RegType:$dst),
(OpNode (AccumType RegType:$Rd),
(InputType RegType:$Rn),
@@ -5414,10 +5571,10 @@ class BaseSIMDThreeSameVectorDot<bit Q, bit U, string asm, string kind1,
let AsmString = !strconcat(asm, "{\t$Rd" # kind1 # ", $Rn" # kind2 # ", $Rm" # kind2 # "}");
}
-multiclass SIMDThreeSameVectorDot<bit U, string asm, SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorDot<0, U, asm, ".2s", ".8b", V64,
+multiclass SIMDThreeSameVectorDot<bit U, bit Mixed, string asm, SDPatternOperator OpNode> {
+ def v8i8 : BaseSIMDThreeSameVectorDot<0, U, Mixed, asm, ".2s", ".8b", V64,
v2i32, v8i8, OpNode>;
- def v16i8 : BaseSIMDThreeSameVectorDot<1, U, asm, ".4s", ".16b", V128,
+ def v16i8 : BaseSIMDThreeSameVectorDot<1, U, Mixed, asm, ".4s", ".16b", V128,
v4i32, v16i8, OpNode>;
}
@@ -6581,13 +6738,13 @@ multiclass SIMDThreeScalarHSTied<bit U, bit R, bits<5> opc, string asm,
multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+ def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (f64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+ def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set FPR32:$Rd, (OpNode FPR32:$Rn, FPR32:$Rm))]>;
let Predicates = [HasNEON, HasFullFP16] in {
- def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
- [(set FPR16:$Rd, (OpNode FPR16:$Rn, FPR16:$Rm))]>;
+ def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn), (f16 FPR16:$Rm)))]>;
} // Predicates = [HasNEON, HasFullFP16]
}
@@ -6598,12 +6755,12 @@ multiclass SIMDFPThreeScalar<bit U, bit S, bits<3> opc, string asm,
multiclass SIMDThreeScalarFPCmp<bit U, bit S, bits<3> opc, string asm,
SDPatternOperator OpNode = null_frag> {
let mayLoad = 0, mayStore = 0, hasSideEffects = 0 in {
- def #NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
+ def NAME#64 : BaseSIMDThreeScalar<U, {S,0b11}, {0b11,opc}, FPR64, asm,
[(set (i64 FPR64:$Rd), (OpNode (f64 FPR64:$Rn), (f64 FPR64:$Rm)))]>;
- def #NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
+ def NAME#32 : BaseSIMDThreeScalar<U, {S,0b01}, {0b11,opc}, FPR32, asm,
[(set (i32 FPR32:$Rd), (OpNode (f32 FPR32:$Rn), (f32 FPR32:$Rm)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
- def #NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
+ def NAME#16 : BaseSIMDThreeScalar<U, {S,0b10}, {0b00,opc}, FPR16, asm,
[]>;
} // Predicates = [HasNEON, HasFullFP16]
}
@@ -6794,7 +6951,7 @@ multiclass SIMDFPTwoScalarCVT<bit U, bit S, bits<5> opc, string asm,
[(set FPR32:$Rd, (OpNode (f32 FPR32:$Rn)))]>;
let Predicates = [HasNEON, HasFullFP16] in {
def v1i16 : BaseSIMDTwoScalar<U, {S,1}, 0b11, opc, FPR16, FPR16, asm,
- [(set FPR16:$Rd, (OpNode (f16 FPR16:$Rn)))]>;
+ [(set (f16 FPR16:$Rd), (OpNode (f16 FPR16:$Rn)))]>;
}
}
@@ -6936,10 +7093,10 @@ multiclass SIMDFPAcrossLanes<bits<5> opcode, bit sz1, string asm,
let Predicates = [HasNEON, HasFullFP16] in {
def v4i16v : BaseSIMDAcrossLanes<0, 0, {sz1, 0}, opcode, FPR16, V64,
asm, ".4h",
- [(set FPR16:$Rd, (intOp (v4f16 V64:$Rn)))]>;
+ [(set (f16 FPR16:$Rd), (intOp (v4f16 V64:$Rn)))]>;
def v8i16v : BaseSIMDAcrossLanes<1, 0, {sz1, 0}, opcode, FPR16, V128,
asm, ".8h",
- [(set FPR16:$Rd, (intOp (v8f16 V128:$Rn)))]>;
+ [(set (f16 FPR16:$Rd), (intOp (v8f16 V128:$Rn)))]>;
} // Predicates = [HasNEON, HasFullFP16]
def v4i32v : BaseSIMDAcrossLanes<1, 1, {sz1, 0}, opcode, FPR32, V128,
asm, ".4s",
@@ -7136,7 +7293,7 @@ class SIMDInsMainMovAlias<string size, Instruction inst,
(inst V128:$dst, idxtype:$idx, regtype:$src)>;
class SIMDInsElementMovAlias<string size, Instruction inst,
Operand idxtype>
- : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2" #
+ : InstAlias<"mov" # "{\t$dst" # size # "$idx, $src" # size # "$idx2"
# "|" # size #"\t$dst$idx, $src$idx2}",
(inst V128:$dst, idxtype:$idx, V128:$src, idxtype:$idx2)>;
@@ -7377,7 +7534,7 @@ class BaseSIMDScalarCPY<RegisterClass regtype, RegisterOperand vectype,
class SIMDScalarCPYAlias<string asm, string size, Instruction inst,
RegisterClass regtype, RegisterOperand vectype, Operand idxtype>
- : InstAlias<asm # "{\t$dst, $src" # size # "$index" #
+ : InstAlias<asm # "{\t$dst, $src" # size # "$index"
# "|\t$dst, $src$index}",
(inst regtype:$dst, vectype:$src, idxtype:$index), 0>;
@@ -7651,13 +7808,152 @@ class BaseSIMDIndexedTied<bit Q, bit U, bit Scalar, bits<2> size, bits<4> opc,
let Inst{4-0} = Rd;
}
+
+//----------------------------------------------------------------------------
+// Armv8.6 BFloat16 Extension
+//----------------------------------------------------------------------------
+let mayStore = 0, mayLoad = 0, hasSideEffects = 0 in {
+
+class BaseSIMDThreeSameVectorBFDot<bit Q, bit U, string asm, string kind1,
+ string kind2, RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType>
+ : BaseSIMDThreeSameVectorTied<Q, U, 0b010, 0b11111, RegType, asm, kind1, [(set (AccumType RegType:$dst),
+ (int_aarch64_neon_bfdot (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType RegType:$Rm)))]> {
+ let AsmString = !strconcat(asm,
+ "{\t$Rd" # kind1 # ", $Rn" # kind2 #
+ ", $Rm" # kind2 # "}");
+}
+
+multiclass SIMDThreeSameVectorBFDot<bit U, string asm> {
+ def v4bf16 : BaseSIMDThreeSameVectorBFDot<0, U, asm, ".2s", ".4h", V64,
+ v2f32, v8i8>;
+ def v8bf16 : BaseSIMDThreeSameVectorBFDot<1, U, asm, ".4s", ".8h", V128,
+ v4f32, v16i8>;
+}
+
+class BaseSIMDThreeSameVectorBF16DotI<bit Q, bit U, string asm,
+ string dst_kind, string lhs_kind,
+ string rhs_kind,
+ RegisterOperand RegType,
+ ValueType AccumType,
+ ValueType InputType>
+ : BaseSIMDIndexedTied<Q, U, 0b0, 0b01, 0b1111,
+ RegType, RegType, V128, VectorIndexS,
+ asm, "", dst_kind, lhs_kind, rhs_kind,
+ [(set (AccumType RegType:$dst),
+ (AccumType (int_aarch64_neon_bfdot
+ (AccumType RegType:$Rd),
+ (InputType RegType:$Rn),
+ (InputType (bitconvert (AccumType
+ (AArch64duplane32 (v4f32 V128:$Rm),
+ VectorIndexH:$idx)))))))]> {
+
+ bits<2> idx;
+ let Inst{21} = idx{0}; // L
+ let Inst{11} = idx{1}; // H
+}
+
+multiclass SIMDThreeSameVectorBF16DotI<bit U, string asm> {
+
+ def v4bf16 : BaseSIMDThreeSameVectorBF16DotI<0, U, asm, ".2s", ".4h",
+ ".2h", V64, v2f32, v8i8>;
+ def v8bf16 : BaseSIMDThreeSameVectorBF16DotI<1, U, asm, ".4s", ".8h",
+ ".2h", V128, v4f32, v16i8>;
+}
+
+class SIMDBF16MLAL<bit Q, string asm, SDPatternOperator OpNode>
+ : BaseSIMDThreeSameVectorTied<Q, 0b1, 0b110, 0b11111, V128, asm, ".4s",
+ [(set (v4f32 V128:$dst), (OpNode (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
+ let AsmString = !strconcat(asm, "{\t$Rd.4s, $Rn.8h, $Rm.8h}");
+}
+
+class SIMDBF16MLALIndex<bit Q, string asm, SDPatternOperator OpNode>
+ : I<(outs V128:$dst),
+ (ins V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx), asm,
+ "{\t$Rd.4s, $Rn.8h, $Rm.h$idx}", "$Rd = $dst",
+ [(set (v4f32 V128:$dst),
+ (v4f32 (OpNode (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 (bitconvert (v8bf16
+ (AArch64duplane16 (v8bf16 V128_lo:$Rm),
+ VectorIndexH:$idx)))))))]>,
+ Sched<[WriteV]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ bits<4> Rm;
+ bits<3> idx;
+
+ let Inst{31} = 0;
+ let Inst{30} = Q;
+ let Inst{29-22} = 0b00111111;
+ let Inst{21-20} = idx{1-0};
+ let Inst{19-16} = Rm;
+ let Inst{15-12} = 0b1111;
+ let Inst{11} = idx{2}; // H
+ let Inst{10} = 0;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+
+class SIMDThreeSameVectorBF16MatrixMul<string asm>
+ : BaseSIMDThreeSameVectorTied<1, 1, 0b010, 0b11101,
+ V128, asm, ".4s",
+ [(set (v4f32 V128:$dst),
+ (int_aarch64_neon_bfmmla (v4f32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
+ let AsmString = !strconcat(asm, "{\t$Rd", ".4s", ", $Rn", ".8h",
+ ", $Rm", ".8h", "}");
+}
+
+class SIMD_BFCVTN
+ : BaseSIMDMixedTwoVector<0, 0, 0b10, 0b10110, V128, V128,
+ "bfcvtn", ".4h", ".4s",
+ [(set (v8bf16 V128:$Rd),
+ (int_aarch64_neon_bfcvtn (v4f32 V128:$Rn)))]>;
+
+class SIMD_BFCVTN2
+ : BaseSIMDMixedTwoVectorTied<1, 0, 0b10, 0b10110, V128, V128,
+ "bfcvtn2", ".8h", ".4s",
+ [(set (v8bf16 V128:$dst),
+ (int_aarch64_neon_bfcvtn2 (v8bf16 V128:$Rd), (v4f32 V128:$Rn)))]>;
+
+class BF16ToSinglePrecision<string asm>
+ : I<(outs FPR16:$Rd), (ins FPR32:$Rn), asm, "\t$Rd, $Rn", "",
+ [(set (bf16 FPR16:$Rd), (int_aarch64_neon_bfcvt (f32 FPR32:$Rn)))]>,
+ Sched<[WriteFCvt]> {
+ bits<5> Rd;
+ bits<5> Rn;
+ let Inst{31-10} = 0b0001111001100011010000;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Rd;
+}
+} // End of let mayStore = 0, mayLoad = 0, hasSideEffects = 0
+
+//----------------------------------------------------------------------------
+// Armv8.6 Matrix Multiply Extension
+//----------------------------------------------------------------------------
+
+class SIMDThreeSameVectorMatMul<bit B, bit U, string asm, SDPatternOperator OpNode>
+ : BaseSIMDThreeSameVectorTied<1, U, 0b100, {0b1010, B}, V128, asm, ".4s",
+ [(set (v4i32 V128:$dst), (OpNode (v4i32 V128:$Rd),
+ (v16i8 V128:$Rn),
+ (v16i8 V128:$Rm)))]> {
+ let AsmString = asm # "{\t$Rd.4s, $Rn.16b, $Rm.16b}";
+}
+
+//----------------------------------------------------------------------------
// ARMv8.2-A Dot Product Instructions (Indexed)
-class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
- string lhs_kind, string rhs_kind,
+class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, bit Mixed, bits<2> size, string asm,
+ string dst_kind, string lhs_kind, string rhs_kind,
RegisterOperand RegType,
ValueType AccumType, ValueType InputType,
SDPatternOperator OpNode> :
- BaseSIMDIndexedTied<Q, U, 0b0, 0b10, 0b1110, RegType, RegType, V128,
+ BaseSIMDIndexedTied<Q, U, 0b0, size, {0b111, Mixed}, RegType, RegType, V128,
VectorIndexS, asm, "", dst_kind, lhs_kind, rhs_kind,
[(set (AccumType RegType:$dst),
(AccumType (OpNode (AccumType RegType:$Rd),
@@ -7670,11 +7966,11 @@ class BaseSIMDThreeSameVectorDotIndex<bit Q, bit U, string asm, string dst_kind,
let Inst{11} = idx{1}; // H
}
-multiclass SIMDThreeSameVectorDotIndex<bit U, string asm,
+multiclass SIMDThreeSameVectorDotIndex<bit U, bit Mixed, bits<2> size, string asm,
SDPatternOperator OpNode> {
- def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, asm, ".2s", ".8b", ".4b",
+ def v8i8 : BaseSIMDThreeSameVectorDotIndex<0, U, Mixed, size, asm, ".2s", ".8b", ".4b",
V64, v2i32, v8i8, OpNode>;
- def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, asm, ".4s", ".16b", ".4b",
+ def v16i8 : BaseSIMDThreeSameVectorDotIndex<1, U, Mixed, size, asm, ".4s", ".16b", ".4b",
V128, v4i32, v16i8, OpNode>;
}
@@ -7813,6 +8109,34 @@ multiclass SIMDFPIndexed<bit U, bits<4> opc, string asm,
}
multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
+ let Predicates = [HasNEON, HasFullFP16] in {
+ // Patterns for f16: DUPLANE, DUP scalar and vector_extract.
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+ (AArch64duplane16 (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v8i16_indexed")
+ V128:$Rd, V128:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
+ def : Pat<(v8f16 (OpNode (v8f16 V128:$Rd), (v8f16 V128:$Rn),
+ (AArch64dup (f16 FPR16Op_lo:$Rm)))),
+ (!cast<Instruction>(INST # "v8i16_indexed") V128:$Rd, V128:$Rn,
+ (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
+
+ def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
+ (AArch64duplane16 (v8f16 V128_lo:$Rm),
+ VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v4i16_indexed")
+ V64:$Rd, V64:$Rn, V128_lo:$Rm, VectorIndexH:$idx)>;
+ def : Pat<(v4f16 (OpNode (v4f16 V64:$Rd), (v4f16 V64:$Rn),
+ (AArch64dup (f16 FPR16Op_lo:$Rm)))),
+ (!cast<Instruction>(INST # "v4i16_indexed") V64:$Rd, V64:$Rn,
+ (SUBREG_TO_REG (i32 0), (f16 FPR16Op_lo:$Rm), hsub), (i64 0))>;
+
+ def : Pat<(f16 (OpNode (f16 FPR16:$Rd), (f16 FPR16:$Rn),
+ (vector_extract (v8f16 V128_lo:$Rm), VectorIndexH:$idx))),
+ (!cast<Instruction>(INST # "v1i16_indexed") FPR16:$Rd, FPR16:$Rn,
+ V128_lo:$Rm, VectorIndexH:$idx)>;
+ } // Predicates = [HasNEON, HasFullFP16]
+
// 2 variants for the .2s version: DUPLANE from 128-bit and DUP scalar.
def : Pat<(v2f32 (OpNode (v2f32 V64:$Rd), (v2f32 V64:$Rn),
(AArch64duplane32 (v4f32 V128:$Rm),
@@ -7847,15 +8171,11 @@ multiclass SIMDFPIndexedTiedPatterns<string INST, SDPatternOperator OpNode> {
(!cast<Instruction>(INST # "v2i64_indexed") V128:$Rd, V128:$Rn,
(SUBREG_TO_REG (i32 0), FPR64Op:$Rm, dsub), (i64 0))>;
- // 2 variants for 32-bit scalar version: extract from .2s or from .4s
+ // Covers 2 variants for 32-bit scalar version: extract from .2s or from .4s
def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
(vector_extract (v4f32 V128:$Rm), VectorIndexS:$idx))),
(!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
V128:$Rm, VectorIndexS:$idx)>;
- def : Pat<(f32 (OpNode (f32 FPR32:$Rd), (f32 FPR32:$Rn),
- (vector_extract (v2f32 V64:$Rm), VectorIndexS:$idx))),
- (!cast<Instruction>(INST # "v1i32_indexed") FPR32:$Rd, FPR32:$Rn,
- (SUBREG_TO_REG (i32 0), V64:$Rm, dsub), VectorIndexS:$idx)>;
// 1 variant for 64-bit scalar version: extract from .1d or from .2d
def : Pat<(f64 (OpNode (f64 FPR64:$Rd), (f64 FPR64:$Rn),
@@ -7940,6 +8260,64 @@ multiclass SIMDFPIndexedTied<bit U, bits<4> opc, string asm> {
}
}
+multiclass SIMDIndexedHSPatterns<SDPatternOperator OpNodeLane,
+ SDPatternOperator OpNodeLaneQ> {
+
+ def : Pat<(v4i16 (OpNodeLane
+ (v4i16 V64:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v4i16 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i16 (OpNodeLaneQ
+ (v4i16 V64:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v4i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLane
+ (v8i16 V128:$Rn), (v4i16 V64_lo:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v8i16 (OpNodeLaneQ
+ (v8i16 V128:$Rn), (v8i16 V128_lo:$Rm),
+ VectorIndexH32b:$idx)),
+ (!cast<Instruction>(NAME # v8i16_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLane
+ (v2i32 V64:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), (v2i32 V64_lo:$Rm), dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v2i32 (OpNodeLaneQ
+ (v2i32 V64:$Rn), (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v2i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLane
+ (v4i32 V128:$Rn), (v2i32 V64:$Rm),
+ VectorIndexD32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn,
+ (SUBREG_TO_REG (i32 0), $Rm, dsub),
+ (UImmS1XForm $idx))>;
+
+ def : Pat<(v4i32 (OpNodeLaneQ
+ (v4i32 V128:$Rn),
+ (v4i32 V128:$Rm),
+ VectorIndexS32b:$idx)),
+ (!cast<Instruction>(NAME # v4i32_indexed) $Rn, $Rm,
+ (UImmS1XForm $idx))>;
+
+}
+
multiclass SIMDIndexedHS<bit U, bits<4> opc, string asm,
SDPatternOperator OpNode> {
def v4i16_indexed : BaseSIMDIndexed<0, U, 0, 0b01, opc, V64, V64,
@@ -10154,15 +10532,15 @@ class ComplexRotationOperand<int Angle, int Remainder, string Type>
let DiagnosticType = "InvalidComplexRotation" # Type;
let Name = "ComplexRotation" # Type;
}
-def complexrotateop : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
- SDNodeXForm<imm, [{
+def complexrotateop : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
+ SDNodeXForm<imm, [{
return CurDAG->getTargetConstant((N->getSExtValue() / 90), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<90, 0, "Even">;
let PrintMethod = "printComplexRotationOp<90, 0>";
}
-def complexrotateopodd : Operand<i32>, ImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
- SDNodeXForm<imm, [{
+def complexrotateopodd : Operand<i32>, TImmLeaf<i32, [{ return Imm >= 0 && Imm <= 270; }],
+ SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(((N->getSExtValue() - 90) / 180), SDLoc(N), MVT::i32);
}]>> {
let ParserMatchClass = ComplexRotationOperand<180, 90, "Odd">;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrGISel.td b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
new file mode 100644
index 000000000000..a0e7c782f68c
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64InstrGISel.td
@@ -0,0 +1,124 @@
+//=----- AArch64InstrGISel.td - AArch64 GISel target pseudos -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// AArch64 GlobalISel target pseudo instruction definitions. This is kept
+// separately from the other tablegen files for organizational purposes, but
+// shares the same infrastructure.
+//
+//===----------------------------------------------------------------------===//
+
+
+class AArch64GenericInstruction : GenericInstruction {
+ let Namespace = "AArch64";
+}
+
+// A pseudo to represent a relocatable add instruction as part of address
+// computation.
+def G_ADD_LOW : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src, type2:$imm);
+ let hasSideEffects = 0;
+}
+
+// Pseudo for a rev16 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_REV16 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+// Pseudo for a rev32 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_REV32 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+// Pseudo for a rev64 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_REV64 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src);
+ let hasSideEffects = 0;
+}
+
+// Represents an uzp1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_UZP1 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents an uzp2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_UZP2 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a zip1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_ZIP1 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a zip2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_ZIP2 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a dup instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_DUP: AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$lane);
+ let hasSideEffects = 0;
+}
+
+// Represents a trn1 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_TRN1 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents a trn2 instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_TRN2 : AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2);
+ let hasSideEffects = 0;
+}
+
+// Represents an ext instruction. Produced post-legalization from
+// G_SHUFFLE_VECTORs with appropriate masks.
+def G_EXT: AArch64GenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$v1, type0:$v2, untyped_imm_0:$imm);
+}
+
+def : GINodeEquiv<G_REV16, AArch64rev16>;
+def : GINodeEquiv<G_REV32, AArch64rev32>;
+def : GINodeEquiv<G_REV64, AArch64rev64>;
+def : GINodeEquiv<G_UZP1, AArch64uzp1>;
+def : GINodeEquiv<G_UZP2, AArch64uzp2>;
+def : GINodeEquiv<G_ZIP1, AArch64zip1>;
+def : GINodeEquiv<G_ZIP2, AArch64zip2>;
+def : GINodeEquiv<G_DUP, AArch64dup>;
+def : GINodeEquiv<G_TRN1, AArch64trn1>;
+def : GINodeEquiv<G_TRN2, AArch64trn2>;
+def : GINodeEquiv<G_EXT, AArch64ext>;
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
index 54f3f7c10132..5139ae5ccaf1 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp
@@ -24,9 +24,9 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/StackMaps.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
@@ -111,6 +111,14 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
// This gets lowered to an instruction sequence which takes 16 bytes
NumBytes = 16;
break;
+ case AArch64::SpeculationBarrierISBDSBEndBB:
+ // This gets lowered to 2 4-byte instructions.
+ NumBytes = 8;
+ break;
+ case AArch64::SpeculationBarrierSBEndBB:
+ // This gets lowered to a single 4-byte instruction.
+ NumBytes = 4;
+ break;
case AArch64::JumpTableDest32:
case AArch64::JumpTableDest16:
case AArch64::JumpTableDest8:
@@ -119,11 +127,25 @@ unsigned AArch64InstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case AArch64::SPACE:
NumBytes = MI.getOperand(1).getImm();
break;
+ case TargetOpcode::BUNDLE:
+ NumBytes = getInstBundleLength(MI);
+ break;
}
return NumBytes;
}
+unsigned AArch64InstrInfo::getInstBundleLength(const MachineInstr &MI) const {
+ unsigned Size = 0;
+ MachineBasicBlock::const_instr_iterator I = MI.getIterator();
+ MachineBasicBlock::const_instr_iterator E = MI.getParent()->instr_end();
+ while (++I != E && I->isInsideBundle()) {
+ assert(!I->isBundle() && "No nested bundle!");
+ Size += getInstSizeInBytes(*I);
+ }
+ return Size;
+}
+
static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
SmallVectorImpl<MachineOperand> &Cond) {
// Block ends with fall-through condbranch.
@@ -216,6 +238,12 @@ bool AArch64InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
if (I == MBB.end())
return false;
+ // Skip over SpeculationBarrierEndBB terminators
+ if (I->getOpcode() == AArch64::SpeculationBarrierISBDSBEndBB ||
+ I->getOpcode() == AArch64::SpeculationBarrierSBEndBB) {
+ --I;
+ }
+
if (!isUnpredicatedTerminator(*I))
return false;
@@ -496,8 +524,9 @@ static unsigned canFoldIntoCSel(const MachineRegisterInfo &MRI, unsigned VReg,
bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles, int &TrueCycles,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles,
int &FalseCycles) const {
// Check register classes.
const MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -506,6 +535,12 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
if (!RC)
return false;
+ // Also need to check the dest regclass, in case we're trying to optimize
+ // something like:
+ // %1(gpr) = PHI %2(fpr), bb1, %(fpr), bb2
+ if (!RI.getCommonSubClass(RC, MRI.getRegClass(DstReg)))
+ return false;
+
// Expanding cbz/tbz requires an extra cycle of latency on the condition.
unsigned ExtraCondLat = Cond.size() != 1;
@@ -538,9 +573,9 @@ bool AArch64InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void AArch64InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
+ const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
+ Register TrueReg, Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
// Parse the condition code, see parseCondBranch() above.
@@ -910,7 +945,7 @@ bool AArch64InstrInfo::isSEHInstruction(const MachineInstr &MI) {
}
bool AArch64InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
+ Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default:
@@ -935,6 +970,7 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
const MachineOperand *BaseOpA = nullptr, *BaseOpB = nullptr;
int64_t OffsetA = 0, OffsetB = 0;
unsigned WidthA = 0, WidthB = 0;
+ bool OffsetAIsScalable = false, OffsetBIsScalable = false;
assert(MIa.mayLoadOrStore() && "MIa must be a load or store.");
assert(MIb.mayLoadOrStore() && "MIb must be a load or store.");
@@ -948,9 +984,14 @@ bool AArch64InstrInfo::areMemAccessesTriviallyDisjoint(
// base are identical, and the offset of a lower memory access +
// the width doesn't overlap the offset of a higher memory access,
// then the memory accesses are different.
- if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, WidthA, TRI) &&
- getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, WidthB, TRI)) {
- if (BaseOpA->isIdenticalTo(*BaseOpB)) {
+ // If OffsetAIsScalable and OffsetBIsScalable are both true, they
+ // are assumed to have the same scale (vscale).
+ if (getMemOperandWithOffsetWidth(MIa, BaseOpA, OffsetA, OffsetAIsScalable,
+ WidthA, TRI) &&
+ getMemOperandWithOffsetWidth(MIb, BaseOpB, OffsetB, OffsetBIsScalable,
+ WidthB, TRI)) {
+ if (BaseOpA->isIdenticalTo(*BaseOpB) &&
+ OffsetAIsScalable == OffsetBIsScalable) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
int HighOffset = OffsetA < OffsetB ? OffsetB : OffsetA;
int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
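An illustrative sketch, not part of the patch, of the interval check the hunk above relies on once both offsets are expressed in the same unit (fixed bytes, or vscale-scaled bytes when both scalable flags agree); the helper name is hypothetical:

// Hypothetical helper, for illustration only (assumes <algorithm>, <cstdint>).
static bool accessesAreTriviallyDisjoint(int64_t OffsetA, unsigned WidthA,
                                         int64_t OffsetB, unsigned WidthB) {
  int64_t LowOffset = std::min(OffsetA, OffsetB);
  int64_t HighOffset = std::max(OffsetA, OffsetB);
  unsigned LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  // The lower access ends at LowOffset + LowWidth; if that end does not reach
  // the start of the higher access, the two accesses cannot overlap.
  return LowOffset + static_cast<int64_t>(LowWidth) <= HighOffset;
}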
@@ -984,8 +1025,8 @@ bool AArch64InstrInfo::isSchedulingBoundary(const MachineInstr &MI,
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
-bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool AArch64InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
// The first operand can be a frame index where we'd normally expect a
// register.
@@ -1156,10 +1197,9 @@ static bool areCFlagsAccessedBetweenInstrs(
return MI.getIterator() == From;
}) != To->getParent()->rend());
- // We iterate backward starting \p To until we hit \p From.
- for (--To; To != From; --To) {
- const MachineInstr &Instr = *To;
-
+ // We iterate backward starting at \p To until we hit \p From.
+ for (const MachineInstr &Instr :
+ instructionsWithoutDebug(++To.getReverse(), From.getReverse())) {
if (((AccessToCheck & AK_Write) &&
Instr.modifiesRegister(AArch64::NZCV, TRI)) ||
((AccessToCheck & AK_Read) && Instr.readsRegister(AArch64::NZCV, TRI)))
@@ -1180,7 +1220,7 @@ static bool areCFlagsAccessedBetweenInstrs(
/// instruction.
/// Only comparison with zero is supported.
bool AArch64InstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
assert(CmpInstr.getParent());
assert(MRI);
@@ -1416,10 +1456,9 @@ static bool canInstrSubstituteCmpInstr(MachineInstr *MI, MachineInstr *CmpInstr,
return false;
UsedNZCV NZCVUsedAfterCmp;
- for (auto I = std::next(CmpInstr->getIterator()),
- E = CmpInstr->getParent()->instr_end();
- I != E; ++I) {
- const MachineInstr &Instr = *I;
+ for (const MachineInstr &Instr :
+ instructionsWithoutDebug(std::next(CmpInstr->getIterator()),
+ CmpInstr->getParent()->instr_end())) {
if (Instr.readsRegister(AArch64::NZCV, TRI)) {
AArch64CC::CondCode CC = findCondCodeUsedByInstr(Instr);
if (CC == AArch64CC::Invalid) // Unsupported conditional instruction
@@ -1684,6 +1723,8 @@ unsigned AArch64InstrInfo::isStoreToStackSlot(const MachineInstr &MI,
case AArch64::STRSui:
case AArch64::STRDui:
case AArch64::STRQui:
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
if (MI.getOperand(0).getSubReg() == 0 && MI.getOperand(1).isFI() &&
MI.getOperand(2).isImm() && MI.getOperand(2).getImm() == 0) {
FrameIndex = MI.getOperand(1).getIndex();
@@ -1796,9 +1837,37 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) {
case AArch64::STNPSi:
case AArch64::LDG:
case AArch64::STGPi:
+ case AArch64::LD1B_IMM:
+ case AArch64::LD1H_IMM:
+ case AArch64::LD1W_IMM:
+ case AArch64::LD1D_IMM:
+ case AArch64::ST1B_IMM:
+ case AArch64::ST1H_IMM:
+ case AArch64::ST1W_IMM:
+ case AArch64::ST1D_IMM:
+ case AArch64::LD1B_H_IMM:
+ case AArch64::LD1SB_H_IMM:
+ case AArch64::LD1H_S_IMM:
+ case AArch64::LD1SH_S_IMM:
+ case AArch64::LD1W_D_IMM:
+ case AArch64::LD1SW_D_IMM:
+ case AArch64::ST1B_H_IMM:
+ case AArch64::ST1H_S_IMM:
+ case AArch64::ST1W_D_IMM:
+ case AArch64::LD1B_S_IMM:
+ case AArch64::LD1SB_S_IMM:
+ case AArch64::LD1H_D_IMM:
+ case AArch64::LD1SH_D_IMM:
+ case AArch64::ST1B_S_IMM:
+ case AArch64::ST1H_D_IMM:
+ case AArch64::LD1B_D_IMM:
+ case AArch64::LD1SB_D_IMM:
+ case AArch64::ST1B_D_IMM:
return 3;
case AArch64::ADDG:
case AArch64::STGOffset:
+ case AArch64::LDR_PXI:
+ case AArch64::STR_PXI:
return 2;
}
}
@@ -1978,20 +2047,25 @@ bool AArch64InstrInfo::isCandidateToMergeOrPair(const MachineInstr &MI) const {
return true;
}
-bool AArch64InstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool AArch64InstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
- unsigned Width;
- return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
+ const MachineOperand *BaseOp;
+ if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, OffsetIsScalable,
+ Width, TRI))
+ return false;
+ BaseOps.push_back(BaseOp);
+ return true;
}
bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
- unsigned &Width, const TargetRegisterInfo *TRI) const {
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
assert(LdSt.mayLoadOrStore() && "Expected a memory operation.");
// Handle only loads/stores with base register followed by immediate offset.
if (LdSt.getNumExplicitOperands() == 3) {
@@ -2010,7 +2084,7 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
// Get the scaling factor for the instruction and set the width for the
// instruction.
- unsigned Scale = 0;
+ TypeSize Scale(0U, false);
int64_t Dummy1, Dummy2;
// If this returns false, then it's an instruction we don't want to handle.
@@ -2022,12 +2096,13 @@ bool AArch64InstrInfo::getMemOperandWithOffsetWidth(
// set to 1.
if (LdSt.getNumExplicitOperands() == 3) {
BaseOp = &LdSt.getOperand(1);
- Offset = LdSt.getOperand(2).getImm() * Scale;
+ Offset = LdSt.getOperand(2).getImm() * Scale.getKnownMinSize();
} else {
assert(LdSt.getNumExplicitOperands() == 4 && "invalid number of operands");
BaseOp = &LdSt.getOperand(2);
- Offset = LdSt.getOperand(3).getImm() * Scale;
+ Offset = LdSt.getOperand(3).getImm() * Scale.getKnownMinSize();
}
+ OffsetIsScalable = Scale.isScalable();
if (!BaseOp->isReg() && !BaseOp->isFI())
return false;
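A worked example of the scaling above, using the values this patch assigns later for SVE vector fills (LDR_ZXI gets Scale = TypeSize::Scalable(16)); the instruction shown is only illustrative:

// For "ldr z0, [x0, #3, mul vl]" the immediate operand is 3 and the scale is
// Scalable(16), so Offset = 3 * 16 = 48 and OffsetIsScalable is true: the
// effective address is base + 48 * vscale bytes, not base + 48 bytes.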
@@ -2043,26 +2118,28 @@ AArch64InstrInfo::getMemOpBaseRegImmOfsOffsetOperand(MachineInstr &LdSt) const {
return OfsOp;
}
-bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
+bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale,
unsigned &Width, int64_t &MinOffset,
int64_t &MaxOffset) {
+ const unsigned SVEMaxBytesPerVector = AArch64::SVEMaxBitsPerVector / 8;
switch (Opcode) {
// Not a memory operation or something we want to handle.
default:
- Scale = Width = 0;
+ Scale = TypeSize::Fixed(0);
+ Width = 0;
MinOffset = MaxOffset = 0;
return false;
case AArch64::STRWpost:
case AArch64::LDRWpost:
Width = 32;
- Scale = 4;
+ Scale = TypeSize::Fixed(4);
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDURQi:
case AArch64::STURQi:
Width = 16;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2072,7 +2149,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURXi:
case AArch64::STURDi:
Width = 8;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2082,7 +2159,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURWi:
case AArch64::STURSi:
Width = 4;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2093,7 +2170,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURHi:
case AArch64::STURHHi:
Width = 2;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2104,7 +2181,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STURBi:
case AArch64::STURBBi:
Width = 1;
- Scale = 1;
+ Scale = TypeSize::Fixed(1);
MinOffset = -256;
MaxOffset = 255;
break;
@@ -2112,14 +2189,15 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDNPQi:
case AArch64::STPQi:
case AArch64::STNPQi:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -64;
MaxOffset = 63;
break;
case AArch64::LDRQui:
case AArch64::STRQui:
- Scale = Width = 16;
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2131,7 +2209,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STPDi:
case AArch64::STNPXi:
case AArch64::STNPDi:
- Scale = 8;
+ Scale = TypeSize::Fixed(8);
Width = 16;
MinOffset = -64;
MaxOffset = 63;
@@ -2141,7 +2219,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRDui:
case AArch64::STRXui:
case AArch64::STRDui:
- Scale = Width = 8;
+ Scale = TypeSize::Fixed(8);
+ Width = 8;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2153,7 +2232,7 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::STPSi:
case AArch64::STNPWi:
case AArch64::STNPSi:
- Scale = 4;
+ Scale = TypeSize::Fixed(4);
Width = 8;
MinOffset = -64;
MaxOffset = 63;
@@ -2163,7 +2242,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRSWui:
case AArch64::STRWui:
case AArch64::STRSui:
- Scale = Width = 4;
+ Scale = TypeSize::Fixed(4);
+ Width = 4;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2173,7 +2253,8 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRSHXui:
case AArch64::STRHui:
case AArch64::STRHHui:
- Scale = Width = 2;
+ Scale = TypeSize::Fixed(2);
+ Width = 2;
MinOffset = 0;
MaxOffset = 4095;
break;
@@ -2183,18 +2264,19 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDRSBXui:
case AArch64::STRBui:
case AArch64::STRBBui:
- Scale = Width = 1;
+ Scale = TypeSize::Fixed(1);
+ Width = 1;
MinOffset = 0;
MaxOffset = 4095;
break;
case AArch64::ADDG:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 0;
MinOffset = 0;
MaxOffset = 63;
break;
case AArch64::TAGPstack:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 0;
// TAGP with a negative offset turns into SUBP, which has a maximum offset
// of 63 (not 64!).
@@ -2204,31 +2286,110 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, unsigned &Scale,
case AArch64::LDG:
case AArch64::STGOffset:
case AArch64::STZGOffset:
- Scale = Width = 16;
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::STR_ZZZZXI:
+ case AArch64::LDR_ZZZZXI:
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector * 4;
+ MinOffset = -256;
+ MaxOffset = 252;
+ break;
+ case AArch64::STR_ZZZXI:
+ case AArch64::LDR_ZZZXI:
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector * 3;
+ MinOffset = -256;
+ MaxOffset = 253;
+ break;
+ case AArch64::STR_ZZXI:
+ case AArch64::LDR_ZZXI:
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector * 2;
+ MinOffset = -256;
+ MaxOffset = 254;
+ break;
case AArch64::LDR_PXI:
case AArch64::STR_PXI:
- Scale = Width = 2;
+ Scale = TypeSize::Scalable(2);
+ Width = SVEMaxBytesPerVector / 8;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::LDR_ZXI:
case AArch64::STR_ZXI:
- Scale = Width = 16;
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector;
MinOffset = -256;
MaxOffset = 255;
break;
+ case AArch64::LD1B_IMM:
+ case AArch64::LD1H_IMM:
+ case AArch64::LD1W_IMM:
+ case AArch64::LD1D_IMM:
+ case AArch64::ST1B_IMM:
+ case AArch64::ST1H_IMM:
+ case AArch64::ST1W_IMM:
+ case AArch64::ST1D_IMM:
+ // A full vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(16);
+ Width = SVEMaxBytesPerVector;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD1B_H_IMM:
+ case AArch64::LD1SB_H_IMM:
+ case AArch64::LD1H_S_IMM:
+ case AArch64::LD1SH_S_IMM:
+ case AArch64::LD1W_D_IMM:
+ case AArch64::LD1SW_D_IMM:
+ case AArch64::ST1B_H_IMM:
+ case AArch64::ST1H_S_IMM:
+ case AArch64::ST1W_D_IMM:
+ // Half a vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(8);
+ Width = SVEMaxBytesPerVector / 2;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD1B_S_IMM:
+ case AArch64::LD1SB_S_IMM:
+ case AArch64::LD1H_D_IMM:
+ case AArch64::LD1SH_D_IMM:
+ case AArch64::ST1B_S_IMM:
+ case AArch64::ST1H_D_IMM:
+ // A quarter of a vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(4);
+ Width = SVEMaxBytesPerVector / 4;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
+ case AArch64::LD1B_D_IMM:
+ case AArch64::LD1SB_D_IMM:
+ case AArch64::ST1B_D_IMM:
+ // An eighth of a vector's worth of data
+ // Width = mbytes * elements
+ Scale = TypeSize::Scalable(2);
+ Width = SVEMaxBytesPerVector / 8;
+ MinOffset = -8;
+ MaxOffset = 7;
+ break;
case AArch64::ST2GOffset:
case AArch64::STZ2GOffset:
- Scale = 16;
+ Scale = TypeSize::Fixed(16);
Width = 32;
MinOffset = -256;
MaxOffset = 255;
break;
case AArch64::STGPi:
- Scale = Width = 16;
+ Scale = TypeSize::Fixed(16);
+ Width = 16;
MinOffset = -64;
MaxOffset = 63;
break;
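A minimal usage sketch of the TypeSize-based interface above; the opcode and its bounds come from the cases added in this patch, while the surrounding function is hypothetical:

// Hypothetical caller, for illustration only.
void querySVEFillSpillRange() {
  TypeSize Scale = TypeSize::Fixed(0);
  unsigned Width;
  int64_t MinOff, MaxOff;
  if (AArch64InstrInfo::getMemOpInfo(AArch64::LDR_ZXI, Scale, Width, MinOff,
                                     MaxOff)) {
    // For LDR_ZXI: Scale is Scalable(16) and MinOff/MaxOff are -256/255, so
    // the reachable addresses are base + imm * 16 * vscale bytes for any imm
    // in that range.
    assert(Scale.isScalable() && Scale.getKnownMinSize() == 16);
  }
}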
@@ -2363,9 +2524,13 @@ static bool shouldClusterFI(const MachineFrameInfo &MFI, int FI1,
/// Detect opportunities for ldp/stp formation.
///
/// Only called for LdSt for which getMemOperandWithOffset returns true.
-bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const {
+bool AArch64InstrInfo::shouldClusterMemOps(
+ ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2, unsigned NumLoads,
+ unsigned NumBytes) const {
+ assert(BaseOps1.size() == 1 && BaseOps2.size() == 1);
+ const MachineOperand &BaseOp1 = *BaseOps1.front();
+ const MachineOperand &BaseOp2 = *BaseOps2.front();
const MachineInstr &FirstLdSt = *BaseOp1.getParent();
const MachineInstr &SecondLdSt = *BaseOp2.getParent();
if (BaseOp1.getType() != BaseOp2.getType())
@@ -2379,7 +2544,7 @@ bool AArch64InstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
return false;
// Only cluster up to a single pair.
- if (NumLoads > 1)
+ if (NumLoads > 2)
return false;
if (!isPairableLdStInst(FirstLdSt) || !isPairableLdStInst(SecondLdSt))
@@ -2822,11 +2987,11 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
- unsigned SrcReg, bool IsKill,
+ Register SrcReg, bool IsKill,
unsigned SubIdx0, unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
- unsigned SrcReg0 = SrcReg;
- unsigned SrcReg1 = SrcReg;
+ Register SrcReg0 = SrcReg;
+ Register SrcReg1 = SrcReg;
if (Register::isPhysicalRegister(SrcReg)) {
SrcReg0 = TRI.getSubReg(SrcReg, SubIdx0);
SubIdx0 = 0;
@@ -2842,18 +3007,19 @@ static void storeRegPairToStackSlot(const TargetRegisterInfo &TRI,
}
void AArch64InstrInfo::storeRegToStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
unsigned Opc = 0;
bool Offset = true;
+ unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
@@ -2862,6 +3028,11 @@ void AArch64InstrInfo::storeRegToStackSlot(
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::STRHui;
+ else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_PXI;
+ StackID = TargetStackID::SVEVector;
+ }
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
@@ -2901,6 +3072,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
get(AArch64::STPXi), SrcReg, isKill,
AArch64::sube64, AArch64::subo64, FI, MMO);
return;
+ } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 24:
@@ -2919,6 +3094,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Twov2d;
Offset = false;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 48:
@@ -2926,6 +3105,10 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Threev2d;
Offset = false;
+ } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 64:
@@ -2933,19 +3116,13 @@ void AArch64InstrInfo::storeRegToStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register store without NEON");
Opc = AArch64::ST1Fourv2d;
Offset = false;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
+ Opc = AArch64::STR_ZZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
}
- unsigned StackID = TargetStackID::Default;
- if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
- Opc = AArch64::STR_PXI;
- StackID = TargetStackID::SVEVector;
- } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register store without SVE");
- Opc = AArch64::STR_ZXI;
- StackID = TargetStackID::SVEVector;
- }
assert(Opc && "Unknown register class");
MFI.setStackID(FI, StackID);
@@ -2962,11 +3139,11 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator InsertBefore,
const MCInstrDesc &MCID,
- unsigned DestReg, unsigned SubIdx0,
+ Register DestReg, unsigned SubIdx0,
unsigned SubIdx1, int FI,
MachineMemOperand *MMO) {
- unsigned DestReg0 = DestReg;
- unsigned DestReg1 = DestReg;
+ Register DestReg0 = DestReg;
+ Register DestReg1 = DestReg;
bool IsUndef = true;
if (Register::isPhysicalRegister(DestReg)) {
DestReg0 = TRI.getSubReg(DestReg, SubIdx0);
@@ -2984,18 +3161,19 @@ static void loadRegPairFromStackSlot(const TargetRegisterInfo &TRI,
}
void AArch64InstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
MachinePointerInfo PtrInfo = MachinePointerInfo::getFixedStack(MF, FI);
- MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align);
+ MachineMemOperand *MMO =
+ MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
unsigned Opc = 0;
bool Offset = true;
+ unsigned StackID = TargetStackID::Default;
switch (TRI->getSpillSize(*RC)) {
case 1:
if (AArch64::FPR8RegClass.hasSubClassEq(RC))
@@ -3004,6 +3182,11 @@ void AArch64InstrInfo::loadRegFromStackSlot(
case 2:
if (AArch64::FPR16RegClass.hasSubClassEq(RC))
Opc = AArch64::LDRHui;
+ else if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_PXI;
+ StackID = TargetStackID::SVEVector;
+ }
break;
case 4:
if (AArch64::GPR32allRegClass.hasSubClassEq(RC)) {
@@ -3043,6 +3226,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
get(AArch64::LDPXi), DestReg, AArch64::sube64,
AArch64::subo64, FI, MMO);
return;
+ } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 24:
@@ -3061,6 +3248,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Twov2d;
Offset = false;
+ } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 48:
@@ -3068,6 +3259,10 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Threev2d;
Offset = false;
+ } else if (AArch64::ZPR3RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
case 64:
@@ -3075,20 +3270,14 @@ void AArch64InstrInfo::loadRegFromStackSlot(
assert(Subtarget.hasNEON() && "Unexpected register load without NEON");
Opc = AArch64::LD1Fourv2d;
Offset = false;
+ } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) {
+ assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
+ Opc = AArch64::LDR_ZZZZXI;
+ StackID = TargetStackID::SVEVector;
}
break;
}
- unsigned StackID = TargetStackID::Default;
- if (AArch64::PPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
- Opc = AArch64::LDR_PXI;
- StackID = TargetStackID::SVEVector;
- } else if (AArch64::ZPRRegClass.hasSubClassEq(RC)) {
- assert(Subtarget.hasSVE() && "Unexpected register load without SVE");
- Opc = AArch64::LDR_ZXI;
- StackID = TargetStackID::SVEVector;
- }
assert(Opc && "Unknown register class");
MFI.setStackID(FI, StackID);
@@ -3100,6 +3289,17 @@ void AArch64InstrInfo::loadRegFromStackSlot(
MI.addMemOperand(MMO);
}
+bool llvm::isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
+ const MachineInstr &UseMI,
+ const TargetRegisterInfo *TRI) {
+ return any_of(instructionsWithoutDebug(std::next(DefMI.getIterator()),
+ UseMI.getIterator()),
+ [TRI](const MachineInstr &I) {
+ return I.modifiesRegister(AArch64::NZCV, TRI) ||
+ I.readsRegister(AArch64::NZCV, TRI);
+ });
+}
+
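A hedged usage sketch for the new helper above; the surrounding peephole and variable names are hypothetical:

// Hypothetical use in a transform that wants to fold DefMI into UseMI:
//
//   if (isNZCVTouchedInInstructionRange(DefMI, UseMI, &getRegisterInfo()))
//     return false; // NZCV is read or clobbered in between; not safe to fold.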
// Helper function to emit a frame offset adjustment from a given
// pointer (SrcReg), stored into DestReg. This function is explicit
// in that it requires the opcode.
@@ -3146,6 +3346,10 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
// assert(Offset < (1 << 24) && "unimplemented reg plus immediate");
const unsigned MaxEncodableValue = MaxEncoding << ShiftSize;
+ Register TmpReg = DestReg;
+ if (TmpReg == AArch64::XZR)
+ TmpReg = MBB.getParent()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
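  // Rationale, inferred from the code rather than stated in the patch: writes
  // to XZR are discarded, so when the destination is XZR the intermediate
  // partial sums of a multi-instruction sequence need a scratch virtual
  // register; only the instruction consuming the last chunk of the offset
  // (Offset == 0 below) writes the real destination.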
do {
uint64_t ThisVal = std::min<uint64_t>(Offset, MaxEncodableValue);
unsigned LocalShiftSize = 0;
@@ -3155,7 +3359,11 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
}
assert((ThisVal >> ShiftSize) <= MaxEncoding &&
"Encoding cannot handle value that big");
- auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), DestReg)
+
+ Offset -= ThisVal << LocalShiftSize;
+ if (Offset == 0)
+ TmpReg = DestReg;
+ auto MBI = BuildMI(MBB, MBBI, DL, TII->get(Opc), TmpReg)
.addReg(SrcReg)
.addImm(Sign * (int)ThisVal);
if (ShiftSize)
@@ -3176,8 +3384,8 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
BuildMI(MBB, MBBI, DL, TII->get(AArch64::SEH_AddFP))
.addImm(Imm)
.setMIFlag(Flag);
- assert((Offset - Imm) == 0 && "Expected remaining offset to be zero to "
- "emit a single SEH directive");
+ assert(Offset == 0 && "Expected remaining offset to be zero to "
+ "emit a single SEH directive");
} else if (DestReg == AArch64::SP) {
if (HasWinCFI)
*HasWinCFI = true;
@@ -3190,8 +3398,7 @@ static void emitFrameOffsetAdj(MachineBasicBlock &MBB,
*HasWinCFI = true;
}
- SrcReg = DestReg;
- Offset -= ThisVal << LocalShiftSize;
+ SrcReg = TmpReg;
} while (Offset);
}
@@ -3414,18 +3621,6 @@ MachineInstr *AArch64InstrInfo::foldMemoryOperandImpl(
return nullptr;
}
-static bool isSVEScaledImmInstruction(unsigned Opcode) {
- switch (Opcode) {
- case AArch64::LDR_ZXI:
- case AArch64::STR_ZXI:
- case AArch64::LDR_PXI:
- case AArch64::STR_PXI:
- return true;
- default:
- return false;
- }
-}
-
int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
StackOffset &SOffset,
bool *OutUseUnscaledOp,
@@ -3458,20 +3653,23 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
case AArch64::ST1Fourv1d:
case AArch64::IRG:
case AArch64::IRGstack:
+ case AArch64::STGloop:
+ case AArch64::STZGloop:
return AArch64FrameOffsetCannotUpdate;
}
// Get the min/max offset and the scale.
- unsigned Scale, Width;
+ TypeSize ScaleValue(0U, false);
+ unsigned Width;
int64_t MinOff, MaxOff;
- if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), Scale, Width, MinOff,
+ if (!AArch64InstrInfo::getMemOpInfo(MI.getOpcode(), ScaleValue, Width, MinOff,
MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
// Construct the complete offset.
- bool IsMulVL = isSVEScaledImmInstruction(MI.getOpcode());
- int64_t Offset =
- IsMulVL ? (SOffset.getScalableBytes()) : (SOffset.getBytes());
+ bool IsMulVL = ScaleValue.isScalable();
+ unsigned Scale = ScaleValue.getKnownMinSize();
+ int64_t Offset = IsMulVL ? SOffset.getScalableBytes() : SOffset.getBytes();
const MachineOperand &ImmOpnd =
MI.getOperand(AArch64InstrInfo::getLoadStoreImmIdx(MI.getOpcode()));
@@ -3484,9 +3682,14 @@ int llvm::isAArch64FrameOffsetLegal(const MachineInstr &MI,
AArch64InstrInfo::getUnscaledLdSt(MI.getOpcode());
bool useUnscaledOp = UnscaledOp && (Offset % Scale || Offset < 0);
if (useUnscaledOp &&
- !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, Scale, Width, MinOff, MaxOff))
+ !AArch64InstrInfo::getMemOpInfo(*UnscaledOp, ScaleValue, Width, MinOff,
+ MaxOff))
llvm_unreachable("unhandled opcode in isAArch64FrameOffsetLegal");
+ Scale = ScaleValue.getKnownMinSize();
+ assert(IsMulVL == ScaleValue.isScalable() &&
+ "Unscaled opcode has different value for scalable");
+
int64_t Remainder = Offset % Scale;
assert(!(Remainder && useUnscaledOp) &&
"Cannot have remainder when using unscaled op");
@@ -5791,6 +5994,35 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
return C.getMF()->getFunction().hasFnAttribute("branch-target-enforcement");
});
+ // Check whether CFI instructions are present; if so, count how many appear
+ // in the candidate sequence.
+ unsigned CFICount = 0;
+ MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
+ for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
+ Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
+ const std::vector<MCCFIInstruction> &CFIInstructions =
+ RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
+ if (MBBI->isCFIInstruction()) {
+ unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ CFICount++;
+ }
+ MBBI++;
+ }
+
+ // For each candidate, compare the number of CFI instructions found in the
+ // sequence to the number of CFI instructions in its parent function. This
+ // check is needed because outlining one of a function's CFI instructions
+ // requires outlining them all for correctness; otherwise the address offsets
+ // between the outlined and remaining code would be incorrect.
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ std::vector<MCCFIInstruction> CFIInstructions =
+ C.getMF()->getFrameInstructions();
+
+ if (CFICount > 0 && CFICount != CFIInstructions.size())
+ return outliner::OutlinedFunction();
+ }
+
// Returns true if an instructions is safe to fix up, false otherwise.
auto IsSafeToFixup = [this, &TRI](MachineInstr &MI) {
if (MI.isCall())
@@ -5811,23 +6043,29 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
if (MI.mayLoadOrStore()) {
const MachineOperand *Base; // Filled with the base operand of MI.
int64_t Offset; // Filled with the offset of MI.
+ bool OffsetIsScalable;
// Does it allow us to offset the base operand and is the base the
// register SP?
- if (!getMemOperandWithOffset(MI, Base, Offset, &TRI) || !Base->isReg() ||
- Base->getReg() != AArch64::SP)
+ if (!getMemOperandWithOffset(MI, Base, Offset, OffsetIsScalable, &TRI) ||
+ !Base->isReg() || Base->getReg() != AArch64::SP)
+ return false;
+
+ // The fix-up code below assumes byte offsets.
+ if (OffsetIsScalable)
return false;
// Find the minimum/maximum offset for this instruction and check
// if fixing it up would be in range.
int64_t MinOffset,
MaxOffset; // Unscaled offsets for the instruction.
- unsigned Scale; // The scale to multiply the offsets by.
+ TypeSize Scale(0U, false); // The scale to multiply the offsets by.
unsigned DummyWidth;
getMemOpInfo(MI.getOpcode(), Scale, DummyWidth, MinOffset, MaxOffset);
Offset += 16; // Update the offset to what it would be if we outlined.
- if (Offset < MinOffset * Scale || Offset > MaxOffset * Scale)
+ if (Offset < MinOffset * (int64_t)Scale.getFixedSize() ||
+ Offset > MaxOffset * (int64_t)Scale.getFixedSize())
return false;
// It's in range, so we can outline it.
@@ -5854,7 +6092,9 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
}
else if (LastInstrOpcode == AArch64::BL ||
- (LastInstrOpcode == AArch64::BLR && !HasBTI)) {
+ ((LastInstrOpcode == AArch64::BLR ||
+ LastInstrOpcode == AArch64::BLRNoIP) &&
+ !HasBTI)) {
// FIXME: Do we need to check if the code after this uses the value of LR?
FrameID = MachineOutlinerThunk;
NumBytesToCreateFrame = 0;
@@ -5960,6 +6200,11 @@ outliner::OutlinedFunction AArch64InstrInfo::getOutliningCandidateInfo(
}
}
+ // If we have CFI instructions, we can only outline if the outlined section
+ // can be a tail call.
+ if (FrameID != MachineOutlinerTailCall && CFICount > 0)
+ return outliner::OutlinedFunction();
+
return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
NumBytesToCreateFrame, FrameID);
}
@@ -5986,6 +6231,10 @@ bool AArch64InstrInfo::isFunctionSafeToOutlineFrom(
if (!AFI || AFI->hasRedZone().getValueOr(true))
return false;
+ // FIXME: Teach the outliner to generate/handle Windows unwind info.
+ if (MF.getTarget().getMCAsmInfo()->usesWindowsCFI())
+ return false;
+
// It's safe to outline from MF.
return true;
}
@@ -6081,6 +6330,15 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
if (FuncInfo->getLOHRelated().count(&MI))
return outliner::InstrType::Illegal;
+ // We can only outline these if we will tail call the outlined function, or
+ // fix up the CFI offsets. Currently, CFI instructions are outlined only when
+ // the outlined section is a tail call.
+ //
+ // FIXME: If the proper fixups for the offset are implemented, this should be
+ // possible.
+ if (MI.isCFIInstruction())
+ return outliner::InstrType::Legal;
+
// Don't allow debug values to impact outlining type.
if (MI.isDebugInstr() || MI.isIndirectDebugValue())
return outliner::InstrType::Invisible;
@@ -6150,10 +6408,11 @@ AArch64InstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
// If we don't know anything about the callee, assume it depends on the
// stack layout of the caller. In that case, it's only legal to outline
- // as a tail-call. Whitelist the call instructions we know about so we
+ // as a tail-call. Explicitly list the call instructions we know about so we
// don't get unexpected results with call pseudo-instructions.
auto UnknownCallOutlineType = outliner::InstrType::Illegal;
- if (MI.getOpcode() == AArch64::BLR || MI.getOpcode() == AArch64::BL)
+ if (MI.getOpcode() == AArch64::BLR ||
+ MI.getOpcode() == AArch64::BLRNoIP || MI.getOpcode() == AArch64::BL)
UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
if (!Callee)
@@ -6205,26 +6464,29 @@ void AArch64InstrInfo::fixupPostOutline(MachineBasicBlock &MBB) const {
const MachineOperand *Base;
unsigned Width;
int64_t Offset;
+ bool OffsetIsScalable;
// Is this a load or store with an immediate offset with SP as the base?
if (!MI.mayLoadOrStore() ||
- !getMemOperandWithOffsetWidth(MI, Base, Offset, Width, &RI) ||
+ !getMemOperandWithOffsetWidth(MI, Base, Offset, OffsetIsScalable, Width,
+ &RI) ||
(Base->isReg() && Base->getReg() != AArch64::SP))
continue;
// It is, so we have to fix it up.
- unsigned Scale;
+ TypeSize Scale(0U, false);
int64_t Dummy1, Dummy2;
MachineOperand &StackOffsetOperand = getMemOpBaseRegImmOfsOffsetOperand(MI);
assert(StackOffsetOperand.isImm() && "Stack offset wasn't immediate!");
getMemOpInfo(MI.getOpcode(), Scale, Width, Dummy1, Dummy2);
assert(Scale != 0 && "Unexpected opcode!");
+ assert(!OffsetIsScalable && "Expected offset to be a byte offset");
// We've pushed the return address to the stack, so add 16 to the offset.
// This is safe, since we already checked if it would overflow when we
// checked if this instruction was legal to outline.
- int64_t NewImm = (Offset + 16) / Scale;
+ int64_t NewImm = (Offset + 16) / (int64_t)Scale.getFixedSize();
StackOffsetOperand.setImm(NewImm);
}
}
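A worked example of the fix-up arithmetic above; the spill instruction shown is illustrative, and STRXui's scale comes from the getMemOpInfo cases in this patch:

// "str x19, [sp, #2]" (STRXui, Scale = Fixed(8)) addresses sp + 16 bytes.
// After the outliner's 16-byte LR save the same slot sits at sp + 32 bytes,
// so NewImm = (16 + 16) / 8 = 4.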
@@ -6285,15 +6547,21 @@ static void signOutlinedFunction(MachineFunction &MF, MachineBasicBlock &MBB,
void AArch64InstrInfo::buildOutlinedFrame(
MachineBasicBlock &MBB, MachineFunction &MF,
const outliner::OutlinedFunction &OF) const {
- // For thunk outlining, rewrite the last instruction from a call to a
- // tail-call.
- if (OF.FrameConstructionID == MachineOutlinerThunk) {
+
+ AArch64FunctionInfo *FI = MF.getInfo<AArch64FunctionInfo>();
+
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
+ FI->setOutliningStyle("Tail Call");
+ else if (OF.FrameConstructionID == MachineOutlinerThunk) {
+ // For thunk outlining, rewrite the last instruction from a call to a
+ // tail-call.
MachineInstr *Call = &*--MBB.instr_end();
unsigned TailOpcode;
if (Call->getOpcode() == AArch64::BL) {
TailOpcode = AArch64::TCRETURNdi;
} else {
- assert(Call->getOpcode() == AArch64::BLR);
+ assert(Call->getOpcode() == AArch64::BLR ||
+ Call->getOpcode() == AArch64::BLRNoIP);
TailOpcode = AArch64::TCRETURNriALL;
}
MachineInstr *TC = BuildMI(MF, DebugLoc(), get(TailOpcode))
@@ -6301,6 +6569,8 @@ void AArch64InstrInfo::buildOutlinedFrame(
.addImm(0);
MBB.insert(MBB.end(), TC);
Call->eraseFromParent();
+
+ FI->setOutliningStyle("Thunk");
}
bool IsLeafFunction = true;
@@ -6320,7 +6590,8 @@ void AArch64InstrInfo::buildOutlinedFrame(
IsLeafFunction = false;
// LR has to be a live in so that we can save it.
- MBB.addLiveIn(AArch64::LR);
+ if (!MBB.isLiveIn(AArch64::LR))
+ MBB.addLiveIn(AArch64::LR);
MachineBasicBlock::iterator It = MBB.begin();
MachineBasicBlock::iterator Et = MBB.end();
@@ -6343,7 +6614,7 @@ void AArch64InstrInfo::buildOutlinedFrame(
// Add a CFI saying the stack was moved 16 B down.
int64_t StackPosEntry =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 16));
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(StackPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
@@ -6351,7 +6622,7 @@ void AArch64InstrInfo::buildOutlinedFrame(
// Add a CFI saying that the LR that we want to find is now 16 B higher than
// before.
int64_t LRPosEntry =
- MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, 16));
+ MF.addFrameInst(MCCFIInstruction::createOffset(nullptr, DwarfReg, -16));
BuildMI(MBB, It, DebugLoc(), get(AArch64::CFI_INSTRUCTION))
.addCFIIndex(LRPosEntry)
.setMIFlags(MachineInstr::FrameSetup);
@@ -6399,13 +6670,20 @@ void AArch64InstrInfo::buildOutlinedFrame(
}
// It's not a tail call, so we have to insert the return ourselves.
+
+ // LR has to be a live in so that we can return to it.
+ if (!MBB.isLiveIn(AArch64::LR))
+ MBB.addLiveIn(AArch64::LR);
+
MachineInstr *ret = BuildMI(MF, DebugLoc(), get(AArch64::RET))
- .addReg(AArch64::LR, RegState::Undef);
+ .addReg(AArch64::LR);
MBB.insert(MBB.end(), ret);
signOutlinedFunction(MF, MBB, ShouldSignReturnAddr,
ShouldSignReturnAddrWithAKey);
+ FI->setOutliningStyle("Function");
+
// Did we have to modify the stack by saving the link register?
if (OF.FrameConstructionID != MachineOutlinerDefault)
return;
@@ -6519,7 +6797,8 @@ Optional<RegImmPair> AArch64InstrInfo::isAddImmediate(const MachineInstr &MI,
// TODO: Handle cases where Reg is a super- or sub-register of the
// destination register.
- if (Reg != MI.getOperand(0).getReg())
+ const MachineOperand &Op0 = MI.getOperand(0);
+ if (!Op0.isReg() || Reg != Op0.getReg())
return None;
switch (MI.getOpcode()) {
@@ -6614,5 +6893,17 @@ AArch64InstrInfo::describeLoadedValue(const MachineInstr &MI,
return TargetInstrInfo::describeLoadedValue(MI, Reg);
}
+uint64_t AArch64InstrInfo::getElementSizeForOpcode(unsigned Opc) const {
+ return get(Opc).TSFlags & AArch64::ElementSizeMask;
+}
+
+unsigned llvm::getBLRCallOpcode(const MachineFunction &MF) {
+ if (MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr())
+ return AArch64::BLRNoIP;
+ else
+ return AArch64::BLR;
+}
+
#define GET_INSTRINFO_HELPERS
+#define GET_INSTRMAP_INFO
#include "AArch64GenInstrInfo.inc"
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.h b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
index 66e517e54903..298c04d81708 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.h
@@ -19,6 +19,7 @@
#include "llvm/ADT/Optional.h"
#include "llvm/CodeGen/MachineCombinerPattern.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/Support/TypeSize.h"
#define GET_INSTRINFO_HEADER
#include "AArch64GenInstrInfo.inc"
@@ -51,8 +52,8 @@ public:
bool isAsCheapAsAMove(const MachineInstr &MI) const override;
- bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const override;
+ bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg,
+ Register &DstReg, unsigned &SubIdx) const override;
bool
areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
@@ -112,14 +113,19 @@ public:
/// Hint that pairing the given load or store is unprofitable.
static void suppressLdStPair(MachineInstr &MI);
- bool getMemOperandWithOffset(const MachineInstr &MI,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const override;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &MI, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
+ /// If \p OffsetIsScalable is set to 'true', the offset is scaled by `vscale`.
+ /// This is true for some SVE instructions like ldr/str that have a
+ /// 'reg + imm' addressing mode where the immediate is an index to the
+ /// scalable vector located at 'reg + imm * vscale x #bytes'.
bool getMemOperandWithOffsetWidth(const MachineInstr &MI,
const MachineOperand *&BaseOp,
- int64_t &Offset, unsigned &Width,
+ int64_t &Offset, bool &OffsetIsScalable,
+ unsigned &Width,
const TargetRegisterInfo *TRI) const;
/// Return the immediate offset of the base register in a load/store \p LdSt.
@@ -129,12 +135,12 @@ public:
/// \p Scale, \p Width, \p MinOffset, and \p MaxOffset accordingly.
///
/// For unscaled instructions, \p Scale is set to 1.
- static bool getMemOpInfo(unsigned Opcode, unsigned &Scale, unsigned &Width,
+ static bool getMemOpInfo(unsigned Opcode, TypeSize &Scale, unsigned &Width,
int64_t &MinOffset, int64_t &MaxOffset);
- bool shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const override;
+ bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads, unsigned NumBytes) const override;
void copyPhysRegTuple(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
@@ -149,13 +155,13 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ MachineBasicBlock::iterator MBBI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -191,11 +197,12 @@ public:
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int &, int &, int &) const override;
+ Register, Register, Register, int &, int &,
+ int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const override;
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
void getNoop(MCInst &NopInst) const override;
bool isSchedulingBoundary(const MachineInstr &MI,
@@ -205,13 +212,13 @@ public:
/// analyzeCompare - For a comparison instruction, return the source registers
/// in SrcReg and SrcReg2, and the value it compares against in CmpValue.
/// Return true if the comparison instruction can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction supplying the argument to
/// the comparison into one that sets the zero bit in the flags register.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
bool optimizeCondBranch(MachineInstr &MI) const override;
@@ -264,6 +271,8 @@ public:
MachineBasicBlock::iterator &It, MachineFunction &MF,
const outliner::Candidate &C) const override;
bool shouldOutlineFromFunctionByDefault(MachineFunction &MF) const override;
+ /// Returns the vector element size (B, H, S or D) of an SVE opcode.
+ uint64_t getElementSizeForOpcode(unsigned Opc) const;
/// Returns true if the instruction has a shift by immediate that can be
/// executed in one cycle less.
static bool isFalkorShiftExtFast(const MachineInstr &MI);
@@ -288,6 +297,8 @@ protected:
isCopyInstrImpl(const MachineInstr &MI) const override;
private:
+ unsigned getInstBundleLength(const MachineInstr &MI) const;
+
/// Sets the offsets on outlined instructions in \p MBB which use SP
/// so that they will be valid post-outlining.
///
@@ -305,6 +316,12 @@ private:
unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
};
+/// Return true if there is an instruction /after/ \p DefMI and before \p UseMI
+/// which either reads or clobbers NZCV.
+bool isNZCVTouchedInInstructionRange(const MachineInstr &DefMI,
+ const MachineInstr &UseMI,
+ const TargetRegisterInfo *TRI);
+
/// emitFrameOffset - Emit instructions as needed to set DestReg to SrcReg
/// plus Offset. This is intended to be used from within the prolog/epilog
/// insertion (PEI) pass, where a virtual scratch register may be allocated
@@ -369,12 +386,24 @@ static inline bool isCondBranchOpcode(int Opc) {
}
static inline bool isIndirectBranchOpcode(int Opc) {
- return Opc == AArch64::BR;
+ switch (Opc) {
+ case AArch64::BR:
+ case AArch64::BRAA:
+ case AArch64::BRAB:
+ case AArch64::BRAAZ:
+ case AArch64::BRABZ:
+ return true;
+ }
+ return false;
}
+/// Return opcode to be used for indirect calls.
+unsigned getBLRCallOpcode(const MachineFunction &MF);
+
// struct TSFlags {
#define TSFLAG_ELEMENT_SIZE_TYPE(X) (X) // 3-bits
-#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 1-bit
+#define TSFLAG_DESTRUCTIVE_INST_TYPE(X) ((X) << 3) // 4-bit
+#define TSFLAG_FALSE_LANE_TYPE(X) ((X) << 7) // 2-bits
// }
namespace AArch64 {
@@ -389,13 +418,31 @@ enum ElementSizeType {
};
enum DestructiveInstType {
- DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
- NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0),
- Destructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
+ DestructiveInstTypeMask = TSFLAG_DESTRUCTIVE_INST_TYPE(0xf),
+ NotDestructive = TSFLAG_DESTRUCTIVE_INST_TYPE(0x0),
+ DestructiveOther = TSFLAG_DESTRUCTIVE_INST_TYPE(0x1),
+ DestructiveUnary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x2),
+ DestructiveBinaryImm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x3),
+ DestructiveBinaryShImmUnpred = TSFLAG_DESTRUCTIVE_INST_TYPE(0x4),
+ DestructiveBinary = TSFLAG_DESTRUCTIVE_INST_TYPE(0x5),
+ DestructiveBinaryComm = TSFLAG_DESTRUCTIVE_INST_TYPE(0x6),
+ DestructiveBinaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x7),
+ DestructiveTernaryCommWithRev = TSFLAG_DESTRUCTIVE_INST_TYPE(0x8),
+};
+
+enum FalseLaneType {
+ FalseLanesMask = TSFLAG_FALSE_LANE_TYPE(0x3),
+ FalseLanesZero = TSFLAG_FALSE_LANE_TYPE(0x1),
+ FalseLanesUndef = TSFLAG_FALSE_LANE_TYPE(0x2),
};
#undef TSFLAG_ELEMENT_SIZE_TYPE
#undef TSFLAG_DESTRUCTIVE_INST_TYPE
+#undef TSFLAG_FALSE_LANE_TYPE
+
+int getSVEPseudoMap(uint16_t Opcode);
+int getSVERevInstr(uint16_t Opcode);
+int getSVENonRevInstr(uint16_t Opcode);
}
} // end namespace llvm
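
The TSFLAG_* macros and enums added above pack several per-instruction fields into TSFlags: a 3-bit element size at bit 0, a 4-bit destructive-instruction type at bit 3, and a 2-bit false-lane policy at bit 7; getElementSizeForOpcode() in the .cpp diff then recovers a field with a simple mask. A minimal standalone C++ sketch of that layout follows (not LLVM code; ElementSizeMask's exact value is not shown in this diff and is assumed to be 0x7 here).

#include <cstdint>
#include <cstdio>

// Field masks mirroring the TSFLAG_* shifts above.
constexpr uint64_t ElementSizeMask         = 0x7;         // 3 bits at bit 0 (assumed)
constexpr uint64_t DestructiveInstTypeMask = 0xfULL << 3; // 4 bits at bit 3
constexpr uint64_t FalseLanesMask          = 0x3ULL << 7; // 2 bits at bit 7

// Two of the enumerators from the header, for illustration.
constexpr uint64_t DestructiveBinaryImm = 0x3ULL << 3;
constexpr uint64_t FalseLanesZero       = 0x1ULL << 7;

int main() {
  // A hypothetical TSFlags word: element size 0x3, destructive binary-immediate,
  // zeroing false lanes.
  uint64_t TSFlags = 0x3 | DestructiveBinaryImm | FalseLanesZero;

  // Extraction mirrors getElementSizeForOpcode(): AND with the field's mask.
  printf("element size bits: 0x%llx\n",
         (unsigned long long)(TSFlags & ElementSizeMask));
  printf("destructive type:  0x%llx\n",
         (unsigned long long)((TSFlags & DestructiveInstTypeMask) >> 3));
  printf("false-lane policy: 0x%llx\n",
         (unsigned long long)((TSFlags & FalseLanesMask) >> 7));
  return 0;
}
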
diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
index d590d4d913ff..f4a5f639e497 100644
--- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td
@@ -14,142 +14,154 @@
// ARM Instruction Predicate Definitions.
//
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
- AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+ AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
- AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+ AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
- AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+ AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
- AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+ AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
- AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+ AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
+ AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
def HasVH : Predicate<"Subtarget->hasVH()">,
- AssemblerPredicate<"FeatureVH", "vh">;
+ AssemblerPredicate<(all_of FeatureVH), "vh">;
def HasLOR : Predicate<"Subtarget->hasLOR()">,
- AssemblerPredicate<"FeatureLOR", "lor">;
+ AssemblerPredicate<(all_of FeatureLOR), "lor">;
def HasPA : Predicate<"Subtarget->hasPA()">,
- AssemblerPredicate<"FeaturePA", "pa">;
+ AssemblerPredicate<(all_of FeaturePA), "pa">;
def HasJS : Predicate<"Subtarget->hasJS()">,
- AssemblerPredicate<"FeatureJS", "jsconv">;
+ AssemblerPredicate<(all_of FeatureJS), "jsconv">;
def HasCCIDX : Predicate<"Subtarget->hasCCIDX()">,
- AssemblerPredicate<"FeatureCCIDX", "ccidx">;
+ AssemblerPredicate<(all_of FeatureCCIDX), "ccidx">;
def HasComplxNum : Predicate<"Subtarget->hasComplxNum()">,
- AssemblerPredicate<"FeatureComplxNum", "complxnum">;
+ AssemblerPredicate<(all_of FeatureComplxNum), "complxnum">;
def HasNV : Predicate<"Subtarget->hasNV()">,
- AssemblerPredicate<"FeatureNV", "nv">;
+ AssemblerPredicate<(all_of FeatureNV), "nv">;
def HasRASv8_4 : Predicate<"Subtarget->hasRASv8_4()">,
- AssemblerPredicate<"FeatureRASv8_4", "rasv8_4">;
+ AssemblerPredicate<(all_of FeatureRASv8_4), "rasv8_4">;
def HasMPAM : Predicate<"Subtarget->hasMPAM()">,
- AssemblerPredicate<"FeatureMPAM", "mpam">;
+ AssemblerPredicate<(all_of FeatureMPAM), "mpam">;
def HasDIT : Predicate<"Subtarget->hasDIT()">,
- AssemblerPredicate<"FeatureDIT", "dit">;
+ AssemblerPredicate<(all_of FeatureDIT), "dit">;
def HasTRACEV8_4 : Predicate<"Subtarget->hasTRACEV8_4()">,
- AssemblerPredicate<"FeatureTRACEV8_4", "tracev8.4">;
+ AssemblerPredicate<(all_of FeatureTRACEV8_4), "tracev8.4">;
def HasAM : Predicate<"Subtarget->hasAM()">,
- AssemblerPredicate<"FeatureAM", "am">;
+ AssemblerPredicate<(all_of FeatureAM), "am">;
def HasSEL2 : Predicate<"Subtarget->hasSEL2()">,
- AssemblerPredicate<"FeatureSEL2", "sel2">;
+ AssemblerPredicate<(all_of FeatureSEL2), "sel2">;
def HasPMU : Predicate<"Subtarget->hasPMU()">,
- AssemblerPredicate<"FeaturePMU", "pmu">;
+ AssemblerPredicate<(all_of FeaturePMU), "pmu">;
def HasTLB_RMI : Predicate<"Subtarget->hasTLB_RMI()">,
- AssemblerPredicate<"FeatureTLB_RMI", "tlb-rmi">;
+ AssemblerPredicate<(all_of FeatureTLB_RMI), "tlb-rmi">;
def HasFMI : Predicate<"Subtarget->hasFMI()">,
- AssemblerPredicate<"FeatureFMI", "fmi">;
+ AssemblerPredicate<(all_of FeatureFMI), "fmi">;
def HasRCPC_IMMO : Predicate<"Subtarget->hasRCPCImm()">,
- AssemblerPredicate<"FeatureRCPC_IMMO", "rcpc-immo">;
+ AssemblerPredicate<(all_of FeatureRCPC_IMMO), "rcpc-immo">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8()">,
- AssemblerPredicate<"FeatureFPARMv8", "fp-armv8">;
+ AssemblerPredicate<(all_of FeatureFPARMv8), "fp-armv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<"FeatureNEON", "neon">;
+ AssemblerPredicate<(all_of FeatureNEON), "neon">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<"FeatureCrypto", "crypto">;
+ AssemblerPredicate<(all_of FeatureCrypto), "crypto">;
def HasSM4 : Predicate<"Subtarget->hasSM4()">,
- AssemblerPredicate<"FeatureSM4", "sm4">;
+ AssemblerPredicate<(all_of FeatureSM4), "sm4">;
def HasSHA3 : Predicate<"Subtarget->hasSHA3()">,
- AssemblerPredicate<"FeatureSHA3", "sha3">;
+ AssemblerPredicate<(all_of FeatureSHA3), "sha3">;
def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
- AssemblerPredicate<"FeatureSHA2", "sha2">;
+ AssemblerPredicate<(all_of FeatureSHA2), "sha2">;
def HasAES : Predicate<"Subtarget->hasAES()">,
- AssemblerPredicate<"FeatureAES", "aes">;
+ AssemblerPredicate<(all_of FeatureAES), "aes">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
- AssemblerPredicate<"FeatureDotProd", "dotprod">;
+ AssemblerPredicate<(all_of FeatureDotProd), "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<"FeatureCRC", "crc">;
+ AssemblerPredicate<(all_of FeatureCRC), "crc">;
def HasLSE : Predicate<"Subtarget->hasLSE()">,
- AssemblerPredicate<"FeatureLSE", "lse">;
+ AssemblerPredicate<(all_of FeatureLSE), "lse">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
- AssemblerPredicate<"FeatureRAS", "ras">;
+ AssemblerPredicate<(all_of FeatureRAS), "ras">;
def HasRDM : Predicate<"Subtarget->hasRDM()">,
- AssemblerPredicate<"FeatureRDM", "rdm">;
+ AssemblerPredicate<(all_of FeatureRDM), "rdm">;
def HasPerfMon : Predicate<"Subtarget->hasPerfMon()">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
- AssemblerPredicate<"FeatureFullFP16", "fullfp16">;
+ AssemblerPredicate<(all_of FeatureFullFP16), "fullfp16">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
- AssemblerPredicate<"FeatureFP16FML", "fp16fml">;
+ AssemblerPredicate<(all_of FeatureFP16FML), "fp16fml">;
def HasSPE : Predicate<"Subtarget->hasSPE()">,
- AssemblerPredicate<"FeatureSPE", "spe">;
+ AssemblerPredicate<(all_of FeatureSPE), "spe">;
def HasFuseAES : Predicate<"Subtarget->hasFuseAES()">,
- AssemblerPredicate<"FeatureFuseAES",
+ AssemblerPredicate<(all_of FeatureFuseAES),
"fuse-aes">;
def HasSVE : Predicate<"Subtarget->hasSVE()">,
- AssemblerPredicate<"FeatureSVE", "sve">;
+ AssemblerPredicate<(all_of FeatureSVE), "sve">;
def HasSVE2 : Predicate<"Subtarget->hasSVE2()">,
- AssemblerPredicate<"FeatureSVE2", "sve2">;
+ AssemblerPredicate<(all_of FeatureSVE2), "sve2">;
def HasSVE2AES : Predicate<"Subtarget->hasSVE2AES()">,
- AssemblerPredicate<"FeatureSVE2AES", "sve2-aes">;
+ AssemblerPredicate<(all_of FeatureSVE2AES), "sve2-aes">;
def HasSVE2SM4 : Predicate<"Subtarget->hasSVE2SM4()">,
- AssemblerPredicate<"FeatureSVE2SM4", "sve2-sm4">;
+ AssemblerPredicate<(all_of FeatureSVE2SM4), "sve2-sm4">;
def HasSVE2SHA3 : Predicate<"Subtarget->hasSVE2SHA3()">,
- AssemblerPredicate<"FeatureSVE2SHA3", "sve2-sha3">;
+ AssemblerPredicate<(all_of FeatureSVE2SHA3), "sve2-sha3">;
def HasSVE2BitPerm : Predicate<"Subtarget->hasSVE2BitPerm()">,
- AssemblerPredicate<"FeatureSVE2BitPerm", "sve2-bitperm">;
+ AssemblerPredicate<(all_of FeatureSVE2BitPerm), "sve2-bitperm">;
def HasRCPC : Predicate<"Subtarget->hasRCPC()">,
- AssemblerPredicate<"FeatureRCPC", "rcpc">;
+ AssemblerPredicate<(all_of FeatureRCPC), "rcpc">;
def HasAltNZCV : Predicate<"Subtarget->hasAlternativeNZCV()">,
- AssemblerPredicate<"FeatureAltFPCmp", "altnzcv">;
+ AssemblerPredicate<(all_of FeatureAltFPCmp), "altnzcv">;
def HasFRInt3264 : Predicate<"Subtarget->hasFRInt3264()">,
- AssemblerPredicate<"FeatureFRInt3264", "frint3264">;
+ AssemblerPredicate<(all_of FeatureFRInt3264), "frint3264">;
def HasSB : Predicate<"Subtarget->hasSB()">,
- AssemblerPredicate<"FeatureSB", "sb">;
+ AssemblerPredicate<(all_of FeatureSB), "sb">;
def HasPredRes : Predicate<"Subtarget->hasPredRes()">,
- AssemblerPredicate<"FeaturePredRes", "predres">;
+ AssemblerPredicate<(all_of FeaturePredRes), "predres">;
def HasCCDP : Predicate<"Subtarget->hasCCDP()">,
- AssemblerPredicate<"FeatureCacheDeepPersist", "ccdp">;
+ AssemblerPredicate<(all_of FeatureCacheDeepPersist), "ccdp">;
def HasBTI : Predicate<"Subtarget->hasBTI()">,
- AssemblerPredicate<"FeatureBranchTargetId", "bti">;
+ AssemblerPredicate<(all_of FeatureBranchTargetId), "bti">;
def HasMTE : Predicate<"Subtarget->hasMTE()">,
- AssemblerPredicate<"FeatureMTE", "mte">;
+ AssemblerPredicate<(all_of FeatureMTE), "mte">;
def HasTME : Predicate<"Subtarget->hasTME()">,
- AssemblerPredicate<"FeatureTME", "tme">;
+ AssemblerPredicate<(all_of FeatureTME), "tme">;
def HasETE : Predicate<"Subtarget->hasETE()">,
- AssemblerPredicate<"FeatureETE", "ete">;
+ AssemblerPredicate<(all_of FeatureETE), "ete">;
def HasTRBE : Predicate<"Subtarget->hasTRBE()">,
- AssemblerPredicate<"FeatureTRBE", "trbe">;
+ AssemblerPredicate<(all_of FeatureTRBE), "trbe">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">,
+ AssemblerPredicate<(all_of FeatureBF16), "bf16">;
+def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
+ AssemblerPredicate<(all_of FeatureMatMulInt8), "i8mm">;
+def HasMatMulFP32 : Predicate<"Subtarget->hasMatMulFP32()">,
+ AssemblerPredicate<(all_of FeatureMatMulFP32), "f32mm">;
+def HasMatMulFP64 : Predicate<"Subtarget->hasMatMulFP64()">,
+ AssemblerPredicate<(all_of FeatureMatMulFP64), "f64mm">;
def IsLE : Predicate<"Subtarget->isLittleEndian()">;
def IsBE : Predicate<"!Subtarget->isLittleEndian()">;
def IsWindows : Predicate<"Subtarget->isTargetWindows()">;
+def UseExperimentalZeroingPseudos
+ : Predicate<"Subtarget->useExperimentalZeroingPseudos()">;
def UseAlternateSExtLoadCVTF32
: Predicate<"Subtarget->useAlternateSExtLoadCVTF32Pattern()">;
def UseNegativeImmediates
- : Predicate<"false">, AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ : Predicate<"false">, AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
"NegativeImmediates">;
def AArch64LocalRecover : SDNode<"ISD::LOCAL_RECOVER",
@@ -227,6 +239,10 @@ def SDT_AArch64ExtVec: SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>, SDTCisInt<3>]>;
def SDT_AArch64vshift : SDTypeProfile<1, 2, [SDTCisSameAs<0,1>, SDTCisInt<2>]>;
+def SDT_AArch64vshiftinsert : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisInt<3>,
+ SDTCisSameAs<0,1>,
+ SDTCisSameAs<0,2>]>;
+
def SDT_AArch64unvec : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def SDT_AArch64fcmpz : SDTypeProfile<1, 1, []>;
def SDT_AArch64fcmp : SDTypeProfile<1, 2, [SDTCisSameAs<1,2>]>;
@@ -245,6 +261,7 @@ def SDT_AArch64TLSDescCall : SDTypeProfile<0, -2, [SDTCisPtrTy<0>,
def SDT_AArch64ldp : SDTypeProfile<2, 1, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
def SDT_AArch64stp : SDTypeProfile<0, 3, [SDTCisVT<0, i64>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
+def SDT_AArch64stnp : SDTypeProfile<0, 3, [SDTCisVT<0, v4i32>, SDTCisSameAs<0, 1>, SDTCisPtrTy<2>]>;
// Generates the general dynamic sequences, i.e.
// adrp x0, :tlsdesc:var
@@ -419,7 +436,14 @@ def AArch64fccmp : SDNode<"AArch64ISD::FCCMP", SDT_AArch64FCCMP>;
def AArch64threadpointer : SDNode<"AArch64ISD::THREAD_POINTER", SDTPtrLeaf>;
-def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+def AArch64fcmp : SDNode<"AArch64ISD::FCMP", SDT_AArch64FCmp>;
+def AArch64strict_fcmp : SDNode<"AArch64ISD::STRICT_FCMP", SDT_AArch64FCmp,
+ [SDNPHasChain]>;
+def AArch64strict_fcmpe : SDNode<"AArch64ISD::STRICT_FCMPE", SDT_AArch64FCmp,
+ [SDNPHasChain]>;
+def AArch64any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
+ [(AArch64strict_fcmp node:$lhs, node:$rhs),
+ (AArch64fcmp node:$lhs, node:$rhs)]>;
def AArch64dup : SDNode<"AArch64ISD::DUP", SDT_AArch64Dup>;
def AArch64duplane8 : SDNode<"AArch64ISD::DUPLANE8", SDT_AArch64DupLane>;
@@ -457,10 +481,12 @@ def AArch64uqshli : SDNode<"AArch64ISD::UQSHL_I", SDT_AArch64vshift>;
def AArch64sqshlui : SDNode<"AArch64ISD::SQSHLU_I", SDT_AArch64vshift>;
def AArch64srshri : SDNode<"AArch64ISD::SRSHR_I", SDT_AArch64vshift>;
def AArch64urshri : SDNode<"AArch64ISD::URSHR_I", SDT_AArch64vshift>;
+def AArch64vsli : SDNode<"AArch64ISD::VSLI", SDT_AArch64vshiftinsert>;
+def AArch64vsri : SDNode<"AArch64ISD::VSRI", SDT_AArch64vshiftinsert>;
def AArch64not: SDNode<"AArch64ISD::NOT", SDT_AArch64unvec>;
def AArch64bit: SDNode<"AArch64ISD::BIT", SDT_AArch64trivec>;
-def AArch64bsl: SDNode<"AArch64ISD::BSL", SDT_AArch64trivec>;
+def AArch64bsp: SDNode<"AArch64ISD::BSP", SDT_AArch64trivec>;
def AArch64cmeq: SDNode<"AArch64ISD::CMEQ", SDT_AArch64binvec>;
def AArch64cmge: SDNode<"AArch64ISD::CMGE", SDT_AArch64binvec>;
@@ -528,6 +554,9 @@ def AArch64uminv : SDNode<"AArch64ISD::UMINV", SDT_AArch64UnaryVec>;
def AArch64smaxv : SDNode<"AArch64ISD::SMAXV", SDT_AArch64UnaryVec>;
def AArch64umaxv : SDNode<"AArch64ISD::UMAXV", SDT_AArch64UnaryVec>;
+def AArch64srhadd : SDNode<"AArch64ISD::SRHADD", SDT_AArch64binvec>;
+def AArch64urhadd : SDNode<"AArch64ISD::URHADD", SDT_AArch64binvec>;
+
def SDT_AArch64SETTAG : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisPtrTy<1>]>;
def AArch64stg : SDNode<"AArch64ISD::STG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64stzg : SDNode<"AArch64ISD::STZG", SDT_AArch64SETTAG, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
@@ -544,6 +573,7 @@ def AArch64uunpklo : SDNode<"AArch64ISD::UUNPKLO", SDT_AArch64unpk>;
def AArch64ldp : SDNode<"AArch64ISD::LDP", SDT_AArch64ldp, [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def AArch64stp : SDNode<"AArch64ISD::STP", SDT_AArch64stp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
+def AArch64stnp : SDNode<"AArch64ISD::STNP", SDT_AArch64stnp, [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def AArch64tbl : SDNode<"AArch64ISD::TBL", SDT_AArch64TBL>;
@@ -564,6 +594,8 @@ let RecomputePerFunction = 1 in {
def UseBTI : Predicate<[{ MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
def NotUseBTI : Predicate<[{ !MF->getFunction().hasFnAttribute("branch-target-enforcement") }]>;
+ def SLSBLRMitigation : Predicate<[{ MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
+ def NoSLSBLRMitigation : Predicate<[{ !MF->getSubtarget<AArch64Subtarget>().hardenSlsBlr() }]>;
// Toggles patterns which aren't beneficial in GlobalISel when we aren't
// optimizing. This allows us to selectively use patterns without impacting
// SelectionDAG's behaviour.
@@ -686,6 +718,14 @@ let hasSideEffects = 1, isCodeGenOnly = 1 in {
: Pseudo<(outs GPR32:$dst), (ins GPR32:$src), []>, Sched<[]>;
}
+// SpeculationBarrierEndBB must only be used after an unconditional control
+// flow, i.e. after a terminator for which isBarrier is True.
+let hasSideEffects = 1, isCodeGenOnly = 1, isTerminator = 1, isBarrier = 1 in {
+ def SpeculationBarrierISBDSBEndBB
+ : Pseudo<(outs), (ins), []>, Sched<[]>;
+ def SpeculationBarrierSBEndBB
+ : Pseudo<(outs), (ins), []>, Sched<[]>;
+}
//===----------------------------------------------------------------------===//
// System instructions.
@@ -698,8 +738,15 @@ def : InstAlias<"wfe", (HINT 0b010)>;
def : InstAlias<"wfi", (HINT 0b011)>;
def : InstAlias<"sev", (HINT 0b100)>;
def : InstAlias<"sevl", (HINT 0b101)>;
+def : InstAlias<"dgh", (HINT 0b110)>;
def : InstAlias<"esb", (HINT 0b10000)>, Requires<[HasRAS]>;
def : InstAlias<"csdb", (HINT 20)>;
+// In order to be able to write readable assembly, LLVM should accept assembly
+// inputs that use Branch Target Identification mnemonics, even with BTI disabled.
+// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
+// should not emit these mnemonics unless BTI is enabled.
+def : InstAlias<"bti", (HINT 32), 0>;
+def : InstAlias<"bti $op", (HINT btihint_op:$op), 0>;
def : InstAlias<"bti", (HINT 32)>, Requires<[HasBTI]>;
def : InstAlias<"bti $op", (HINT btihint_op:$op)>, Requires<[HasBTI]>;
@@ -731,10 +778,58 @@ def TSB : CRmSystemI<barrier_op, 0b010, "tsb", []> {
// ARMv8.2-A Dot Product
let Predicates = [HasDotProd] in {
-defm SDOT : SIMDThreeSameVectorDot<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOT : SIMDThreeSameVectorDot<1, "udot", int_aarch64_neon_udot>;
-defm SDOTlane : SIMDThreeSameVectorDotIndex<0, "sdot", int_aarch64_neon_sdot>;
-defm UDOTlane : SIMDThreeSameVectorDotIndex<1, "udot", int_aarch64_neon_udot>;
+defm SDOT : SIMDThreeSameVectorDot<0, 0, "sdot", int_aarch64_neon_sdot>;
+defm UDOT : SIMDThreeSameVectorDot<1, 0, "udot", int_aarch64_neon_udot>;
+defm SDOTlane : SIMDThreeSameVectorDotIndex<0, 0, 0b10, "sdot", int_aarch64_neon_sdot>;
+defm UDOTlane : SIMDThreeSameVectorDotIndex<1, 0, 0b10, "udot", int_aarch64_neon_udot>;
+}
+
+// ARMv8.6-A BFloat
+let Predicates = [HasBF16] in {
+defm BFDOT : SIMDThreeSameVectorBFDot<1, "bfdot">;
+defm BF16DOTlane : SIMDThreeSameVectorBF16DotI<0, "bfdot">;
+def BFMMLA : SIMDThreeSameVectorBF16MatrixMul<"bfmmla">;
+def BFMLALB : SIMDBF16MLAL<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
+def BFMLALT : SIMDBF16MLAL<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
+def BFMLALBIdx : SIMDBF16MLALIndex<0, "bfmlalb", int_aarch64_neon_bfmlalb>;
+def BFMLALTIdx : SIMDBF16MLALIndex<1, "bfmlalt", int_aarch64_neon_bfmlalt>;
+def BFCVTN : SIMD_BFCVTN;
+def BFCVTN2 : SIMD_BFCVTN2;
+def BFCVT : BF16ToSinglePrecision<"bfcvt">;
+}
+
+// ARMv8.6A AArch64 matrix multiplication
+let Predicates = [HasMatMulInt8] in {
+def SMMLA : SIMDThreeSameVectorMatMul<0, 0, "smmla", int_aarch64_neon_smmla>;
+def UMMLA : SIMDThreeSameVectorMatMul<0, 1, "ummla", int_aarch64_neon_ummla>;
+def USMMLA : SIMDThreeSameVectorMatMul<1, 0, "usmmla", int_aarch64_neon_usmmla>;
+defm USDOT : SIMDThreeSameVectorDot<0, 1, "usdot", int_aarch64_neon_usdot>;
+defm USDOTlane : SIMDThreeSameVectorDotIndex<0, 1, 0b10, "usdot", int_aarch64_neon_usdot>;
+
+// sudot lane has a pattern where usdot is expected (there is no sudot).
+// The second operand is used in the dup operation to repeat the indexed
+// element.
+class BaseSIMDSUDOTIndex<bit Q, string dst_kind, string lhs_kind,
+ string rhs_kind, RegisterOperand RegType,
+ ValueType AccumType, ValueType InputType>
+ : BaseSIMDThreeSameVectorDotIndex<Q, 0, 1, 0b00, "sudot", dst_kind,
+ lhs_kind, rhs_kind, RegType, AccumType,
+ InputType, null_frag> {
+ let Pattern = [(set (AccumType RegType:$dst),
+ (AccumType (int_aarch64_neon_usdot (AccumType RegType:$Rd),
+ (InputType (bitconvert (AccumType
+ (AArch64duplane32 (v4i32 V128:$Rm),
+ VectorIndexS:$idx)))),
+ (InputType RegType:$Rn))))];
+}
+
+multiclass SIMDSUDOTIndex {
+ def v8i8 : BaseSIMDSUDOTIndex<0, ".2s", ".8b", ".4b", V64, v2i32, v8i8>;
+ def v16i8 : BaseSIMDSUDOTIndex<1, ".4s", ".16b", ".4b", V128, v4i32, v16i8>;
+}
+
+defm SUDOTlane : SIMDSUDOTIndex;
+
}
// ARMv8.2-A FP16 Fused Multiply-Add Long
@@ -819,38 +914,56 @@ let Predicates = [HasComplxNum, HasNEON] in {
// important for compatibility with other assemblers (e.g. GAS) when building
// software compatible with both CPUs that do or don't implement PA.
let Uses = [LR], Defs = [LR] in {
- def PACIAZ : SystemNoOperands<0b000, "hint #24">;
- def PACIBZ : SystemNoOperands<0b010, "hint #26">;
+ def PACIAZ : SystemNoOperands<0b000, "hint\t#24">;
+ def PACIBZ : SystemNoOperands<0b010, "hint\t#26">;
let isAuthenticated = 1 in {
- def AUTIAZ : SystemNoOperands<0b100, "hint #28">;
- def AUTIBZ : SystemNoOperands<0b110, "hint #30">;
+ def AUTIAZ : SystemNoOperands<0b100, "hint\t#28">;
+ def AUTIBZ : SystemNoOperands<0b110, "hint\t#30">;
}
}
let Uses = [LR, SP], Defs = [LR] in {
- def PACIASP : SystemNoOperands<0b001, "hint #25">;
- def PACIBSP : SystemNoOperands<0b011, "hint #27">;
+ def PACIASP : SystemNoOperands<0b001, "hint\t#25">;
+ def PACIBSP : SystemNoOperands<0b011, "hint\t#27">;
let isAuthenticated = 1 in {
- def AUTIASP : SystemNoOperands<0b101, "hint #29">;
- def AUTIBSP : SystemNoOperands<0b111, "hint #31">;
+ def AUTIASP : SystemNoOperands<0b101, "hint\t#29">;
+ def AUTIBSP : SystemNoOperands<0b111, "hint\t#31">;
}
}
let Uses = [X16, X17], Defs = [X17], CRm = 0b0001 in {
- def PACIA1716 : SystemNoOperands<0b000, "hint #8">;
- def PACIB1716 : SystemNoOperands<0b010, "hint #10">;
+ def PACIA1716 : SystemNoOperands<0b000, "hint\t#8">;
+ def PACIB1716 : SystemNoOperands<0b010, "hint\t#10">;
let isAuthenticated = 1 in {
- def AUTIA1716 : SystemNoOperands<0b100, "hint #12">;
- def AUTIB1716 : SystemNoOperands<0b110, "hint #14">;
+ def AUTIA1716 : SystemNoOperands<0b100, "hint\t#12">;
+ def AUTIB1716 : SystemNoOperands<0b110, "hint\t#14">;
}
}
let Uses = [LR], Defs = [LR], CRm = 0b0000 in {
- def XPACLRI : SystemNoOperands<0b111, "hint #7">;
-}
+ def XPACLRI : SystemNoOperands<0b111, "hint\t#7">;
+}
+
+// In order to be able to write readable assembly, LLVM should accept assembly
+// inputs that use pointer authentication mnemonics, even with PA disabled.
+// However, in order to be compatible with other assemblers (e.g. GAS), LLVM
+// should not emit these mnemonics unless PA is enabled.
+def : InstAlias<"paciaz", (PACIAZ), 0>;
+def : InstAlias<"pacibz", (PACIBZ), 0>;
+def : InstAlias<"autiaz", (AUTIAZ), 0>;
+def : InstAlias<"autibz", (AUTIBZ), 0>;
+def : InstAlias<"paciasp", (PACIASP), 0>;
+def : InstAlias<"pacibsp", (PACIBSP), 0>;
+def : InstAlias<"autiasp", (AUTIASP), 0>;
+def : InstAlias<"autibsp", (AUTIBSP), 0>;
+def : InstAlias<"pacia1716", (PACIA1716), 0>;
+def : InstAlias<"pacib1716", (PACIB1716), 0>;
+def : InstAlias<"autia1716", (AUTIA1716), 0>;
+def : InstAlias<"autib1716", (AUTIB1716), 0>;
+def : InstAlias<"xpaclri", (XPACLRI), 0>;
// These pointer authentication instructions require armv8.3a
let Predicates = [HasPA] in {
- // When compiling with PA, there is a better mnemonic for these instructions.
+ // When PA is enabled, a better mnemonic should be emitted.
def : InstAlias<"paciaz", (PACIAZ), 1>;
def : InstAlias<"pacibz", (PACIBZ), 1>;
def : InstAlias<"autiaz", (AUTIAZ), 1>;
@@ -884,15 +997,23 @@ let Predicates = [HasPA] in {
def PACGA : SignAuthTwoOperand<0b1100, "pacga", null_frag>;
// Combined Instructions
- def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
- def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
- def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
- def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;
+ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def BRAA : AuthBranchTwoOperands<0, 0, "braa">;
+ def BRAB : AuthBranchTwoOperands<0, 1, "brab">;
+ }
+ let isCall = 1, Defs = [LR], Uses = [SP] in {
+ def BLRAA : AuthBranchTwoOperands<1, 0, "blraa">;
+ def BLRAB : AuthBranchTwoOperands<1, 1, "blrab">;
+ }
- def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
- def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
- def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
- def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
+ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
+ def BRAAZ : AuthOneOperand<0b000, 0, "braaz">;
+ def BRABZ : AuthOneOperand<0b000, 1, "brabz">;
+ }
+ let isCall = 1, Defs = [LR], Uses = [SP] in {
+ def BLRAAZ : AuthOneOperand<0b001, 0, "blraaz">;
+ def BLRABZ : AuthOneOperand<0b001, 1, "blrabz">;
+ }
let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def RETAA : AuthReturn<0b010, 0, "retaa">;
@@ -1538,17 +1659,29 @@ def TAGPstack
// register / expression for the tagged base pointer of the current function.
def : Pat<(int_aarch64_irg_sp i64:$Rm), (IRGstack SP, i64:$Rm)>;
-// Large STG to be expanded into a loop. $Rm is the size, $Rn is start address.
-// $Rn_wback is one past the end of the range.
+// Large STG to be expanded into a loop. $sz is the size, $Rn is start address.
+// $Rn_wback is one past the end of the range. $Rm is the loop counter.
let isCodeGenOnly=1, mayStore=1 in {
+def STGloop_wback
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
+ Sched<[WriteAdr, WriteST]>;
+
+def STZGloop_wback
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn_wback), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,@earlyclobber $Rm" >,
+ Sched<[WriteAdr, WriteST]>;
+
+// Variants of the above where $Rn2 is an independent register not tied to the input register $Rn.
+// Their purpose is to allow a FrameIndex operand to be used as $Rn (which of course cannot be written back).
def STGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "@earlyclobber $Rn2,@earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
def STZGloop
- : Pseudo<(outs GPR64common:$Rm_wback, GPR64sp:$Rn_wback), (ins GPR64common:$Rm, GPR64sp:$Rn),
- [], "$Rn = $Rn_wback,@earlyclobber $Rn_wback,$Rm = $Rm_wback,@earlyclobber $Rm_wback" >,
+ : Pseudo<(outs GPR64common:$Rm, GPR64sp:$Rn2), (ins i64imm:$sz, GPR64sp:$Rn),
+ [], "@earlyclobber $Rn2,@earlyclobber $Rm" >,
Sched<[WriteAdr, WriteST]>;
}
@@ -1894,9 +2027,19 @@ def ERET : SpecialReturn<0b0100, "eret">;
def : InstAlias<"ret", (RET LR)>;
let isCall = 1, Defs = [LR], Uses = [SP] in {
-def BLR : BranchReg<0b0001, "blr", [(AArch64call GPR64:$Rn)]>;
+ def BLR : BranchReg<0b0001, "blr", []>;
+ def BLRNoIP : Pseudo<(outs), (ins GPR64noip:$Rn), []>,
+ Sched<[WriteBrReg]>,
+ PseudoInstExpansion<(BLR GPR64:$Rn)>;
} // isCall
+def : Pat<(AArch64call GPR64:$Rn),
+ (BLR GPR64:$Rn)>,
+ Requires<[NoSLSBLRMitigation]>;
+def : Pat<(AArch64call GPR64noip:$Rn),
+ (BLRNoIP GPR64noip:$Rn)>,
+ Requires<[SLSBLRMitigation]>;
+
let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
def BR : BranchReg<0b0000, "br", [(brind GPR64:$Rn)]>;
} // isBranch, isTerminator, isBarrier, isIndirectBranch
@@ -2129,6 +2272,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro64, v8i8, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v4i16, LDRDroW, LDRDroX>;
defm : VecROLoadPat<ro64, v4f16, LDRDroW, LDRDroX>;
+ defm : VecROLoadPat<ro64, v4bf16, LDRDroW, LDRDroX>;
}
defm : VecROLoadPat<ro64, v1i64, LDRDroW, LDRDroX>;
@@ -2143,6 +2287,7 @@ let Predicates = [IsLE] in {
defm : VecROLoadPat<ro128, v4f32, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v8i16, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v8f16, LDRQroW, LDRQroX>;
+ defm : VecROLoadPat<ro128, v8bf16, LDRQroW, LDRQroX>;
defm : VecROLoadPat<ro128, v16i8, LDRQroW, LDRQroX>;
}
} // AddedComplexity = 10
@@ -2225,6 +2370,10 @@ defm LDRQ : LoadUI<0b00, 1, 0b11, FPR128Op, uimm12s16, "ldr",
[(set (f128 FPR128Op:$Rt),
(load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)))]>;
+// bf16 load pattern
+def : Pat <(bf16 (load (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset))),
+ (LDRHui GPR64sp:$Rn, uimm12s2:$offset)>;
+
// For regular load, we do not have any alignment requirement.
// Thus, it is safe to directly map the vector loads with interesting
// addressing modes.
@@ -2274,6 +2423,8 @@ let Predicates = [IsLE] in {
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
def : Pat<(v4f16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(v4bf16 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
+ (LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
}
def : Pat<(v1f64 (load (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset))),
(LDRDui GPR64sp:$Rn, uimm12s8:$offset)>;
@@ -2297,6 +2448,8 @@ let Predicates = [IsLE] in {
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
def : Pat<(v8f16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(v8bf16 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
+ (LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
}
def : Pat<(f128 (load (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset))),
(LDRQui GPR64sp:$Rn, uimm12s16:$offset)>;
@@ -2381,11 +2534,11 @@ def : InstAlias<"prfm $Rt, [$Rn]", (PRFMui prfop:$Rt, GPR64sp:$Rn, 0)>;
def alignedglobal : PatLeaf<(iPTR iPTR:$label), [{
if (auto *G = dyn_cast<GlobalAddressSDNode>(N)) {
const DataLayout &DL = MF->getDataLayout();
- MaybeAlign Align = G->getGlobal()->getPointerAlignment(DL);
- return Align && *Align >= 4 && G->getOffset() % 4 == 0;
+ Align Align = G->getGlobal()->getPointerAlignment(DL);
+ return Align >= 4 && G->getOffset() % 4 == 0;
}
if (auto *C = dyn_cast<ConstantPoolSDNode>(N))
- return C->getAlignment() >= 4 && C->getOffset() % 4 == 0;
+ return C->getAlign() >= 4 && C->getOffset() % 4 == 0;
return false;
}]>;
@@ -2425,7 +2578,7 @@ defm LDURB : LoadUnscaled<0b00, 1, 0b01, FPR8Op, "ldur",
[(set FPR8Op:$Rt,
(load (am_unscaled8 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURH : LoadUnscaled<0b01, 1, 0b01, FPR16Op, "ldur",
- [(set FPR16Op:$Rt,
+ [(set (f16 FPR16Op:$Rt),
(load (am_unscaled16 GPR64sp:$Rn, simm9:$offset)))]>;
defm LDURS : LoadUnscaled<0b10, 1, 0b01, FPR32Op, "ldur",
[(set (f32 FPR32Op:$Rt),
@@ -2722,6 +2875,10 @@ defm STNPQ : StorePairNoAlloc<0b10, 1, FPR128Op, simm7s16, "stnp">;
def : Pat<(AArch64stp GPR64z:$Rt, GPR64z:$Rt2, (am_indexed7s64 GPR64sp:$Rn, simm7s8:$offset)),
(STPXi GPR64z:$Rt, GPR64z:$Rt2, GPR64sp:$Rn, simm7s8:$offset)>;
+def : Pat<(AArch64stnp FPR128:$Rt, FPR128:$Rt2, (am_indexed7s128 GPR64sp:$Rn, simm7s16:$offset)),
+ (STNPQi FPR128:$Rt, FPR128:$Rt2, GPR64sp:$Rn, simm7s16:$offset)>;
+
+
//---
// (Register offset)
@@ -2791,6 +2948,7 @@ let Predicates = [IsLE] in {
defm : VecROStorePat<ro64, v4i16, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v8i8, FPR64, STRDroW, STRDroX>;
defm : VecROStorePat<ro64, v4f16, FPR64, STRDroW, STRDroX>;
+ defm : VecROStorePat<ro64, v4bf16, FPR64, STRDroW, STRDroX>;
}
defm : VecROStorePat<ro64, v1i64, FPR64, STRDroW, STRDroX>;
@@ -2806,6 +2964,7 @@ let Predicates = [IsLE, UseSTRQro] in {
defm : VecROStorePat<ro128, v8i16, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v16i8, FPR128, STRQroW, STRQroX>;
defm : VecROStorePat<ro128, v8f16, FPR128, STRQroW, STRQroX>;
+ defm : VecROStorePat<ro128, v8bf16, FPR128, STRQroW, STRQroX>;
}
} // AddedComplexity = 10
@@ -2866,6 +3025,11 @@ defm STRBB : StoreUIz<0b00, 0, 0b00, GPR32z, uimm12s1, "strb",
(am_indexed8 GPR64sp:$Rn,
uimm12s1:$offset))]>;
+// bf16 store pattern
+def : Pat<(store (bf16 FPR16Op:$Rt),
+ (am_indexed16 GPR64sp:$Rn, uimm12s2:$offset)),
+ (STRHui FPR16:$Rt, GPR64sp:$Rn, uimm12s2:$offset)>;
+
let AddedComplexity = 10 in {
// Match all store 64 bits width whose type is compatible with FPR64
@@ -2893,6 +3057,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v4f16 FPR64:$Rt),
(am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
(STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
+ def : Pat<(store (v4bf16 FPR64:$Rt),
+ (am_indexed64 GPR64sp:$Rn, uimm12s8:$offset)),
+ (STRDui FPR64:$Rt, GPR64sp:$Rn, uimm12s8:$offset)>;
}
// Match all store 128 bits width whose type is compatible with FPR128
@@ -2923,6 +3090,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v8f16 FPR128:$Rt),
(am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
(STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
+ def : Pat<(store (v8bf16 FPR128:$Rt),
+ (am_indexed128 GPR64sp:$Rn, uimm12s16:$offset)),
+ (STRQui FPR128:$Rt, GPR64sp:$Rn, uimm12s16:$offset)>;
}
// truncstore i64
@@ -3030,6 +3200,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v4f16 FPR64:$Rt),
(am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
(STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v4bf16 FPR64:$Rt),
+ (am_unscaled64 GPR64sp:$Rn, simm9:$offset)),
+ (STURDi FPR64:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
// Match all store 128 bits width whose type is compatible with FPR128
@@ -3062,6 +3235,9 @@ let Predicates = [IsLE] in {
def : Pat<(store (v8f16 FPR128:$Rt),
(am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
(STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
+ def : Pat<(store (v8bf16 FPR128:$Rt),
+ (am_unscaled128 GPR64sp:$Rn, simm9:$offset)),
+ (STURQi FPR128:$Rt, GPR64sp:$Rn, simm9:$offset)>;
}
} // AddedComplexity = 10
@@ -3300,10 +3476,10 @@ defm FCVTNS : FPToIntegerUnscaled<0b00, 0b000, "fcvtns", int_aarch64_neon_fcvtns
defm FCVTNU : FPToIntegerUnscaled<0b00, 0b001, "fcvtnu", int_aarch64_neon_fcvtnu>;
defm FCVTPS : FPToIntegerUnscaled<0b01, 0b000, "fcvtps", int_aarch64_neon_fcvtps>;
defm FCVTPU : FPToIntegerUnscaled<0b01, 0b001, "fcvtpu", int_aarch64_neon_fcvtpu>;
-defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
-defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", fp_to_sint>;
-defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", fp_to_uint>;
+defm FCVTZS : FPToIntegerUnscaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : FPToIntegerUnscaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
+defm FCVTZS : FPToIntegerScaled<0b11, 0b000, "fcvtzs", any_fp_to_sint>;
+defm FCVTZU : FPToIntegerScaled<0b11, 0b001, "fcvtzu", any_fp_to_uint>;
multiclass FPToIntegerIntPats<Intrinsic round, string INST> {
def : Pat<(i32 (round f16:$Rn)), (!cast<Instruction>(INST # UWHr) $Rn)>;
@@ -3375,8 +3551,8 @@ def : Pat<(i64 (llround f64:$Rn)),
// Scaled integer to floating point conversion instructions.
//===----------------------------------------------------------------------===//
-defm SCVTF : IntegerToFP<0, "scvtf", sint_to_fp>;
-defm UCVTF : IntegerToFP<1, "ucvtf", uint_to_fp>;
+defm SCVTF : IntegerToFP<0, "scvtf", any_sint_to_fp>;
+defm UCVTF : IntegerToFP<1, "ucvtf", any_uint_to_fp>;
//===----------------------------------------------------------------------===//
// Unscaled integer to floating point conversion instruction.
@@ -3541,8 +3717,8 @@ def : Pat<(f64 (fma FPR64:$Rn, (fneg FPR64:$Rm), (fneg FPR64:$Ra))),
// Floating point comparison instructions.
//===----------------------------------------------------------------------===//
-defm FCMPE : FPComparison<1, "fcmpe">;
-defm FCMP : FPComparison<0, "fcmp", AArch64fcmp>;
+defm FCMPE : FPComparison<1, "fcmpe", AArch64strict_fcmpe>;
+defm FCMP : FPComparison<0, "fcmp", AArch64any_fcmp>;
//===----------------------------------------------------------------------===//
// Floating point conditional comparison instructions.
@@ -3603,10 +3779,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
Sched<[]>;
}
-let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
- usesCustomInserter = 1 in
-def CATCHPAD : Pseudo<(outs), (ins), [(catchpad)]>, Sched<[]>;
-
//===----------------------------------------------------------------------===//
// Floating point immediate move.
//===----------------------------------------------------------------------===//
@@ -3788,12 +3960,16 @@ defm URSQRTE: SIMDTwoVectorS<1, 1, 0b11100, "ursqrte", int_aarch64_neon_ursqrte>
defm USQADD : SIMDTwoVectorBHSDTied<1, 0b00011, "usqadd",int_aarch64_neon_usqadd>;
defm XTN : SIMDMixedTwoVector<0, 0b10010, "xtn", trunc>;
-def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
-def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
-def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
-def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
-def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
-def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
+def : Pat<(v4f16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4f16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v4bf16 (AArch64rev32 V64:$Rn)), (REV32v4i16 V64:$Rn)>;
+def : Pat<(v4bf16 (AArch64rev64 V64:$Rn)), (REV64v4i16 V64:$Rn)>;
+def : Pat<(v8f16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8f16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
+def : Pat<(v8bf16 (AArch64rev32 V128:$Rn)), (REV32v8i16 V128:$Rn)>;
+def : Pat<(v8bf16 (AArch64rev64 V128:$Rn)), (REV64v8i16 V128:$Rn)>;
+def : Pat<(v2f32 (AArch64rev64 V64:$Rn)), (REV64v2i32 V64:$Rn)>;
+def : Pat<(v4f32 (AArch64rev64 V128:$Rn)), (REV64v4i32 V128:$Rn)>;
// Patterns for vector long shift (by element width). These need to match all
// three of zext, sext and anyext so it's easier to pull the patterns out of the
@@ -3900,7 +4076,7 @@ defm SQRDMULH : SIMDThreeSameVectorHS<1,0b10110,"sqrdmulh",int_aarch64_neon_sqrd
defm SQRSHL : SIMDThreeSameVector<0,0b01011,"sqrshl", int_aarch64_neon_sqrshl>;
defm SQSHL : SIMDThreeSameVector<0,0b01001,"sqshl", int_aarch64_neon_sqshl>;
defm SQSUB : SIMDThreeSameVector<0,0b00101,"sqsub", int_aarch64_neon_sqsub>;
-defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd",int_aarch64_neon_srhadd>;
+defm SRHADD : SIMDThreeSameVectorBHS<0,0b00010,"srhadd", AArch64srhadd>;
defm SRSHL : SIMDThreeSameVector<0,0b01010,"srshl", int_aarch64_neon_srshl>;
defm SSHL : SIMDThreeSameVector<0,0b01000,"sshl", int_aarch64_neon_sshl>;
defm SUB : SIMDThreeSameVector<1,0b10000,"sub", sub>;
@@ -3917,7 +4093,7 @@ defm UQADD : SIMDThreeSameVector<1,0b00001,"uqadd", int_aarch64_neon_uqadd>;
defm UQRSHL : SIMDThreeSameVector<1,0b01011,"uqrshl", int_aarch64_neon_uqrshl>;
defm UQSHL : SIMDThreeSameVector<1,0b01001,"uqshl", int_aarch64_neon_uqshl>;
defm UQSUB : SIMDThreeSameVector<1,0b00101,"uqsub", int_aarch64_neon_uqsub>;
-defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", int_aarch64_neon_urhadd>;
+defm URHADD : SIMDThreeSameVectorBHS<1,0b00010,"urhadd", AArch64urhadd>;
defm URSHL : SIMDThreeSameVector<1,0b01010,"urshl", int_aarch64_neon_urshl>;
defm USHL : SIMDThreeSameVector<1,0b01000,"ushl", int_aarch64_neon_ushl>;
defm SQRDMLAH : SIMDThreeSameVectorSQRDMLxHTiedHS<1,0b10000,"sqrdmlah",
@@ -3934,33 +4110,36 @@ defm : SIMDThreeSameVectorExtraPatterns<"UQSUB", usubsat>;
defm AND : SIMDLogicalThreeVector<0, 0b00, "and", and>;
defm BIC : SIMDLogicalThreeVector<0, 0b01, "bic",
BinOpFrag<(and node:$LHS, (vnot node:$RHS))> >;
-defm BIF : SIMDLogicalThreeVector<1, 0b11, "bif">;
-defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
-defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl",
- TriOpFrag<(or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS))>>;
defm EOR : SIMDLogicalThreeVector<1, 0b00, "eor", xor>;
defm ORN : SIMDLogicalThreeVector<0, 0b11, "orn",
BinOpFrag<(or node:$LHS, (vnot node:$RHS))> >;
defm ORR : SIMDLogicalThreeVector<0, 0b10, "orr", or>;
-
-def : Pat<(AArch64bsl (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-def : Pat<(AArch64bsl (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
- (BSLv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
-
-def : Pat<(AArch64bsl (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
-def : Pat<(AArch64bsl (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
- (BSLv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+// Pseudo bitwise select pattern BSP.
+// It is expanded into BSL/BIT/BIF after register allocation.
+defm BSP : SIMDLogicalThreeVectorPseudo<TriOpFrag<(or (and node:$LHS, node:$MHS),
+ (and (vnot node:$LHS), node:$RHS))>>;
+defm BSL : SIMDLogicalThreeVectorTied<1, 0b01, "bsl">;
+defm BIT : SIMDLogicalThreeVectorTied<1, 0b10, "bit", AArch64bit>;
+defm BIF : SIMDLogicalThreeVectorTied<1, 0b11, "bif">;
+
+def : Pat<(AArch64bsp (v8i8 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v4i16 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v2i32 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+def : Pat<(AArch64bsp (v1i64 V64:$Rd), V64:$Rn, V64:$Rm),
+ (BSPv8i8 V64:$Rd, V64:$Rn, V64:$Rm)>;
+
+def : Pat<(AArch64bsp (v16i8 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v8i16 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v4i32 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
+def : Pat<(AArch64bsp (v2i64 V128:$Rd), V128:$Rn, V128:$Rm),
+ (BSPv16i8 V128:$Rd, V128:$Rn, V128:$Rm)>;
def : InstAlias<"mov{\t$dst.16b, $src.16b|.16b\t$dst, $src}",
(ORRv16i8 V128:$dst, V128:$src, V128:$src), 1>;
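
The BSP pseudo introduced above selects between two inputs bit by bit according to a mask, per the TriOpFrag (or (and node:$LHS, node:$MHS), (and (vnot node:$LHS), node:$RHS)), and is only expanded into a concrete BSL/BIT/BIF after register allocation. A tiny standalone C++ sketch of that select formula, purely illustrative and not LLVM code:

#include <cassert>
#include <cstdint>

// The result takes a bit from IfSet where the corresponding Mask bit is 1,
// and from IfClear where it is 0 -- the same formula as the TriOpFrag above.
static uint64_t bitwiseSelect(uint64_t Mask, uint64_t IfSet, uint64_t IfClear) {
  return (Mask & IfSet) | (~Mask & IfClear);
}

int main() {
  // High nibble comes from 0xAA, low nibble from 0x55.
  assert(bitwiseSelect(0xF0, 0xAA, 0x55) == 0xA5);
  return 0;
}
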
@@ -4669,6 +4848,7 @@ multiclass ExtPat<ValueType VT64, ValueType VT128, int N> {
defm : ExtPat<v8i8, v16i8, 8>;
defm : ExtPat<v4i16, v8i16, 4>;
defm : ExtPat<v4f16, v8f16, 4>;
+defm : ExtPat<v4bf16, v8bf16, 4>;
defm : ExtPat<v2i32, v4i32, 2>;
defm : ExtPat<v2f32, v4f32, 2>;
defm : ExtPat<v1i64, v2i64, 1>;
@@ -4790,16 +4970,29 @@ def : Pat<(v4f16 (AArch64dup (f16 FPR16:$Rn))),
(v4f16 (DUPv4i16lane
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
(i64 0)))>;
+def : Pat<(v4bf16 (AArch64dup (bf16 FPR16:$Rn))),
+ (v4bf16 (DUPv4i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
def : Pat<(v8f16 (AArch64dup (f16 FPR16:$Rn))),
(v8f16 (DUPv8i16lane
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
(i64 0)))>;
+def : Pat<(v8bf16 (AArch64dup (bf16 FPR16:$Rn))),
+ (v8bf16 (DUPv8i16lane
+ (INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR16:$Rn, hsub),
+ (i64 0)))>;
def : Pat<(v4f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
(DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
def : Pat<(v8f16 (AArch64duplane16 (v8f16 V128:$Rn), VectorIndexH:$imm)),
(DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v4bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv4i16lane V128:$Rn, VectorIndexH:$imm)>;
+def : Pat<(v8bf16 (AArch64duplane16 (v8bf16 V128:$Rn), VectorIndexH:$imm)),
+ (DUPv8i16lane V128:$Rn, VectorIndexH:$imm)>;
+
def : Pat<(v2f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
(DUPv2i32lane V128:$Rn, VectorIndexS:$imm)>;
def : Pat<(v4f32 (AArch64duplane32 (v4f32 V128:$Rn), VectorIndexS:$imm)),
@@ -4915,6 +5108,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v2i32 (scalar_to_vector (i32 FPR32:$Rn))),
(v2i32 (INSERT_SUBREG (v2i32 (IMPLICIT_DEF)),
(i32 FPR32:$Rn), ssub))>;
@@ -4931,6 +5129,11 @@ def : Pat<(v4f16 (scalar_to_vector (f16 FPR16:$Rn))),
def : Pat<(v8f16 (scalar_to_vector (f16 FPR16:$Rn))),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v4bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+def : Pat<(v8bf16 (scalar_to_vector (bf16 FPR16:$Rn))),
+ (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rn, hsub)>;
+
def : Pat<(v4f32 (scalar_to_vector (f32 FPR32:$Rn))),
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), FPR32:$Rn, ssub)>;
def : Pat<(v2f32 (scalar_to_vector (f32 FPR32:$Rn))),
@@ -4956,6 +5159,23 @@ def : Pat<(v8f16 (vector_insert (v8f16 V128:$Rn),
(v8f16 (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
(i64 0))>;
+def : Pat<(v4bf16 (vector_insert (v4bf16 V64:$Rn),
+ (bf16 FPR16:$Rm), (i64 VectorIndexS:$imm))),
+ (EXTRACT_SUBREG
+ (INSvi16lane
+ (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), V64:$Rn, dsub)),
+ VectorIndexS:$imm,
+ (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0)),
+ dsub)>;
+
+def : Pat<(v8bf16 (vector_insert (v8bf16 V128:$Rn),
+ (bf16 FPR16:$Rm), (i64 VectorIndexH:$imm))),
+ (INSvi16lane
+ V128:$Rn, VectorIndexH:$imm,
+ (v8bf16 (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR16:$Rm, hsub)),
+ (i64 0))>;
+
def : Pat<(v2f32 (vector_insert (v2f32 V64:$Rn),
(f32 FPR32:$Rm), (i64 VectorIndexS:$imm))),
(EXTRACT_SUBREG
@@ -5037,6 +5257,7 @@ multiclass Neon_INS_elt_pattern<ValueType VT128, ValueType VT64,
}
defm : Neon_INS_elt_pattern<v8f16, v4f16, f16, INSvi16lane>;
+defm : Neon_INS_elt_pattern<v8bf16, v4bf16, bf16, INSvi16lane>;
defm : Neon_INS_elt_pattern<v4f32, v2f32, f32, INSvi32lane>;
defm : Neon_INS_elt_pattern<v2f64, v1f64, f64, INSvi64lane>;
@@ -5050,6 +5271,9 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), 0),
(f32 (EXTRACT_SUBREG V128:$Rn, ssub))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), 0),
(f16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+def : Pat<(vector_extract (v8bf16 V128:$Rn), 0),
+ (bf16 (EXTRACT_SUBREG V128:$Rn, hsub))>;
+
def : Pat<(vector_extract (v2f64 V128:$Rn), VectorIndexD:$idx),
(f64 (CPYi64 V128:$Rn, VectorIndexD:$idx))>;
@@ -5057,6 +5281,8 @@ def : Pat<(vector_extract (v4f32 V128:$Rn), VectorIndexS:$idx),
(f32 (CPYi32 V128:$Rn, VectorIndexS:$idx))>;
def : Pat<(vector_extract (v8f16 V128:$Rn), VectorIndexH:$idx),
(f16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
+def : Pat<(vector_extract (v8bf16 V128:$Rn), VectorIndexH:$idx),
+ (bf16 (CPYi16 V128:$Rn, VectorIndexH:$idx))>;
// All concat_vectors operations are canonicalised to act on i64 vectors for
// AArch64. In the general case we need an instruction, which had just as well be
@@ -5072,6 +5298,7 @@ def : ConcatPat<v4i32, v2i32>;
def : ConcatPat<v4f32, v2f32>;
def : ConcatPat<v8i16, v4i16>;
def : ConcatPat<v8f16, v4f16>;
+def : ConcatPat<v8bf16, v4bf16>;
def : ConcatPat<v16i8, v8i8>;
// If the high lanes are undef, though, we can just ignore them:
@@ -5613,6 +5840,11 @@ def : Pat<(v2f64 (fmul V128:$Rn, (AArch64dup (f64 FPR64:$Rm)))),
defm SQDMULH : SIMDIndexedHS<0, 0b1100, "sqdmulh", int_aarch64_neon_sqdmulh>;
defm SQRDMULH : SIMDIndexedHS<0, 0b1101, "sqrdmulh", int_aarch64_neon_sqrdmulh>;
+defm SQDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqdmulh_lane,
+ int_aarch64_neon_sqdmulh_laneq>;
+defm SQRDMULH : SIMDIndexedHSPatterns<int_aarch64_neon_sqrdmulh_lane,
+ int_aarch64_neon_sqrdmulh_laneq>;
+
// Generated by MachineCombine
defm MLA : SIMDVectorIndexedHSTied<1, 0b0000, "mla", null_frag>;
defm MLS : SIMDVectorIndexedHSTied<1, 0b0100, "mls", null_frag>;
@@ -5780,8 +6012,8 @@ defm RSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10001, "rshrn",
defm SHL : SIMDVectorLShiftBHSD<0, 0b01010, "shl", AArch64vshl>;
defm SHRN : SIMDVectorRShiftNarrowBHS<0, 0b10000, "shrn",
BinOpFrag<(trunc (AArch64vashr node:$LHS, node:$RHS))>>;
-defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", int_aarch64_neon_vsli>;
-def : Pat<(v1i64 (int_aarch64_neon_vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+defm SLI : SIMDVectorLShiftBHSDTied<1, 0b01010, "sli", AArch64vsli>;
+def : Pat<(v1i64 (AArch64vsli (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftL64:$imm))),
(SLId FPR64:$Rd, FPR64:$Rn, vecshiftL64:$imm)>;
defm SQRSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10011, "sqrshrn",
@@ -5794,8 +6026,8 @@ defm SQSHRN : SIMDVectorRShiftNarrowBHS<0, 0b10010, "sqshrn",
int_aarch64_neon_sqshrn>;
defm SQSHRUN : SIMDVectorRShiftNarrowBHS<1, 0b10000, "sqshrun",
int_aarch64_neon_sqshrun>;
-defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", int_aarch64_neon_vsri>;
-def : Pat<(v1i64 (int_aarch64_neon_vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
+defm SRI : SIMDVectorRShiftBHSDTied<1, 0b01000, "sri", AArch64vsri>;
+def : Pat<(v1i64 (AArch64vsri (v1i64 FPR64:$Rd), (v1i64 FPR64:$Rn),
(i32 vecshiftR64:$imm))),
(SRId FPR64:$Rd, FPR64:$Rn, vecshiftR64:$imm)>;
defm SRSHR : SIMDVectorRShiftBHSD<0, 0b00100, "srshr", AArch64srshri>;
@@ -6147,6 +6379,10 @@ def : Pat<(v4f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
(LD1Rv4h GPR64sp:$Rn)>;
def : Pat<(v8f16 (AArch64dup (f16 (load GPR64sp:$Rn)))),
(LD1Rv8h GPR64sp:$Rn)>;
+def : Pat<(v4bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
+ (LD1Rv4h GPR64sp:$Rn)>;
+def : Pat<(v8bf16 (AArch64dup (bf16 (load GPR64sp:$Rn)))),
+ (LD1Rv8h GPR64sp:$Rn)>;
class Ld1Lane128Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -6161,6 +6397,7 @@ def : Ld1Lane128Pat<load, VectorIndexS, v4f32, f32, LD1i32>;
def : Ld1Lane128Pat<load, VectorIndexD, v2i64, i64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexD, v2f64, f64, LD1i64>;
def : Ld1Lane128Pat<load, VectorIndexH, v8f16, f16, LD1i16>;
+def : Ld1Lane128Pat<load, VectorIndexH, v8bf16, bf16, LD1i16>;
class Ld1Lane64Pat<SDPatternOperator scalar_load, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction LD1>
@@ -6176,6 +6413,7 @@ def : Ld1Lane64Pat<extloadi16, VectorIndexH, v4i16, i32, LD1i16>;
def : Ld1Lane64Pat<load, VectorIndexS, v2i32, i32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexS, v2f32, f32, LD1i32>;
def : Ld1Lane64Pat<load, VectorIndexH, v4f16, f16, LD1i16>;
+def : Ld1Lane64Pat<load, VectorIndexH, v4bf16, bf16, LD1i16>;
defm LD1 : SIMDLdSt1SingleAliases<"ld1">;
@@ -6204,6 +6442,7 @@ def : St1Lane128Pat<store, VectorIndexS, v4f32, f32, ST1i32>;
def : St1Lane128Pat<store, VectorIndexD, v2i64, i64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexD, v2f64, f64, ST1i64>;
def : St1Lane128Pat<store, VectorIndexH, v8f16, f16, ST1i16>;
+def : St1Lane128Pat<store, VectorIndexH, v8bf16, bf16, ST1i16>;
let AddedComplexity = 19 in
class St1Lane64Pat<SDPatternOperator scalar_store, Operand VecIndex,
@@ -6219,6 +6458,7 @@ def : St1Lane64Pat<truncstorei16, VectorIndexH, v4i16, i32, ST1i16>;
def : St1Lane64Pat<store, VectorIndexS, v2i32, i32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexS, v2f32, f32, ST1i32>;
def : St1Lane64Pat<store, VectorIndexH, v4f16, f16, ST1i16>;
+def : St1Lane64Pat<store, VectorIndexH, v4bf16, bf16, ST1i16>;
multiclass St1LanePost64Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -6244,6 +6484,7 @@ defm : St1LanePost64Pat<post_store, VectorIndexS, v2f32, f32, ST1i32_POST, 4>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1i64, i64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexD, v1f64, f64, ST1i64_POST, 8>;
defm : St1LanePost64Pat<post_store, VectorIndexH, v4f16, f16, ST1i16_POST, 2>;
+defm : St1LanePost64Pat<post_store, VectorIndexH, v4bf16, bf16, ST1i16_POST, 2>;
multiclass St1LanePost128Pat<SDPatternOperator scalar_store, Operand VecIndex,
ValueType VTy, ValueType STy, Instruction ST1,
@@ -6268,6 +6509,7 @@ defm : St1LanePost128Pat<post_store, VectorIndexS, v4f32, f32, ST1i32_POST, 4>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2i64, i64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexD, v2f64, f64, ST1i64_POST, 8>;
defm : St1LanePost128Pat<post_store, VectorIndexH, v8f16, f16, ST1i16_POST, 2>;
+defm : St1LanePost128Pat<post_store, VectorIndexH, v8bf16, bf16, ST1i16_POST, 2>;
let mayStore = 1, hasSideEffects = 0 in {
defm ST2 : SIMDStSingleB<1, 0b000, "st2", VecListTwob, GPR64pi2>;
@@ -6508,6 +6750,7 @@ def : Pat<(v4i32 (mulhu V128:$Rn, V128:$Rm)),
def : Pat<(v8i8 (AArch64NvCast (v2i32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v2i32 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v2i32 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v2i32 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6515,12 +6758,14 @@ def : Pat<(v1i64 (AArch64NvCast (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (v4i16 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v4i16 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v4i16 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (v8i8 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (v8i8 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6528,6 +6773,7 @@ def : Pat<(v1i64 (AArch64NvCast (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v8i8 (AArch64NvCast (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v4i16 (AArch64NvCast (f64 FPR64:$src))), (v4i16 FPR64:$src)>;
def : Pat<(v4f16 (AArch64NvCast (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (AArch64NvCast (f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
def : Pat<(v2i32 (AArch64NvCast (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2f32 (AArch64NvCast (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v1i64 (AArch64NvCast (f64 FPR64:$src))), (v1i64 FPR64:$src)>;
@@ -6544,6 +6790,7 @@ def : Pat<(v1f64 (AArch64NvCast (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v4i32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v4i32 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v4i32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4i32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4i32 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -6552,6 +6799,7 @@ def : Pat<(v2f64 (AArch64NvCast (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v8i16 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6560,6 +6808,7 @@ def : Pat<(v2f64 (AArch64NvCast (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v16i8 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v16i8 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6568,6 +6817,7 @@ def : Pat<(v2f64 (AArch64NvCast (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v2i64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v8i16 (AArch64NvCast (v2i64 FPR128:$src))), (v8i16 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v2i64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4i32 (AArch64NvCast (v2i64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2i64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6579,6 +6829,7 @@ def : Pat<(v4i32 (AArch64NvCast (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v4f32 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v16i8 (AArch64NvCast (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -6587,6 +6838,7 @@ def : Pat<(v4i32 (AArch64NvCast (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v2i64 (AArch64NvCast (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2f64 (AArch64NvCast (v2f64 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v8f16 (AArch64NvCast (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (AArch64NvCast (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
def : Pat<(v4f32 (AArch64NvCast (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
let Predicates = [IsLE] in {
@@ -6594,6 +6846,7 @@ def : Pat<(v8i8 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v4i16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2i32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v4f16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
+def : Pat<(v4bf16 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)), (COPY_TO_REGCLASS GPR64:$Xn, FPR64)>;
def : Pat<(i64 (bitconvert (v8i8 V64:$Vn))),
@@ -6604,6 +6857,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
+def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))),
+ (COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(COPY_TO_REGCLASS V64:$Vn, GPR64)>;
def : Pat<(i64 (bitconvert (v1f64 V64:$Vn))),
@@ -6618,6 +6873,8 @@ def : Pat<(v2i32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v4f16 (bitconvert GPR64:$Xn)),
(REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
+def : Pat<(v4bf16 (bitconvert GPR64:$Xn)),
+ (REV64v4i16 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
def : Pat<(v2f32 (bitconvert GPR64:$Xn)),
(REV64v2i32 (COPY_TO_REGCLASS GPR64:$Xn, FPR64))>;
@@ -6629,6 +6886,8 @@ def : Pat<(i64 (bitconvert (v2i32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v4f16 V64:$Vn))),
(REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
+def : Pat<(i64 (bitconvert (v4bf16 V64:$Vn))),
+ (REV64v4i16 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
def : Pat<(i64 (bitconvert (v2f32 V64:$Vn))),
(REV64v2i32 (COPY_TO_REGCLASS V64:$Vn, GPR64))>;
}
@@ -6658,6 +6917,7 @@ def : Pat<(v1i64 (bitconvert (v2i32 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))), (v1i64 FPR64:$src)>;
+def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))), (v1i64 FPR64:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))), (v1i64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
@@ -6669,6 +6929,8 @@ def : Pat<(v1i64 (bitconvert (v8i8 FPR64:$src))),
(v1i64 (REV64v8i8 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v4f16 FPR64:$src))),
(v1i64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1i64 (bitconvert (v4bf16 FPR64:$src))),
+ (v1i64 (REV64v4i16 FPR64:$src))>;
def : Pat<(v1i64 (bitconvert (v2f32 FPR64:$src))),
(v1i64 (REV64v2i32 FPR64:$src))>;
}
@@ -6682,6 +6944,7 @@ def : Pat<(v2i32 (bitconvert (v8i8 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))), (v2i32 FPR64:$src)>;
def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))), (v2i32 FPR64:$src)>;
+def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))), (v2i32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i32 (bitconvert (v1i64 FPR64:$src))),
@@ -6696,6 +6959,8 @@ def : Pat<(v2i32 (bitconvert (v1f64 FPR64:$src))),
(v2i32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2i32 (bitconvert (v4f16 FPR64:$src))),
(v2i32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2i32 (bitconvert (v4bf16 FPR64:$src))),
+ (v2i32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2i32 (bitconvert (v2f32 FPR64:$src))), (v2i32 FPR64:$src)>;
@@ -6722,6 +6987,7 @@ def : Pat<(v4i16 (bitconvert (v1f64 FPR64:$src))),
(v4i16 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v4i16 (bitconvert (v4f16 FPR64:$src))), (v4i16 FPR64:$src)>;
+def : Pat<(v4i16 (bitconvert (v4bf16 FPR64:$src))), (v4i16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))), (v4f16 FPR64:$src)>;
@@ -6730,6 +6996,13 @@ def : Pat<(v4f16 (bitconvert (v8i8 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (f64 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))), (v4f16 FPR64:$src)>;
def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))), (v4f16 FPR64:$src)>;
+
+def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))), (v4bf16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))), (v4bf16 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v4f16 (bitconvert (v1i64 FPR64:$src))),
@@ -6744,8 +7017,22 @@ def : Pat<(v4f16 (bitconvert (v2f32 FPR64:$src))),
(v4f16 (REV32v4i16 FPR64:$src))>;
def : Pat<(v4f16 (bitconvert (v1f64 FPR64:$src))),
(v4f16 (REV64v4i16 FPR64:$src))>;
+
+def : Pat<(v4bf16 (bitconvert (v1i64 FPR64:$src))),
+ (v4bf16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v2i32 FPR64:$src))),
+ (v4bf16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v8i8 FPR64:$src))),
+ (v4bf16 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (f64 FPR64:$src))),
+ (v4bf16 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v2f32 FPR64:$src))),
+ (v4bf16 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v4bf16 (bitconvert (v1f64 FPR64:$src))),
+ (v4bf16 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v4f16 (bitconvert (v4i16 FPR64:$src))), (v4f16 FPR64:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 FPR64:$src))), (v4bf16 FPR64:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))), (v8i8 FPR64:$src)>;
@@ -6755,6 +7042,7 @@ def : Pat<(v8i8 (bitconvert (f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v2f32 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))), (v8i8 FPR64:$src)>;
def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))), (v8i8 FPR64:$src)>;
+def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))), (v8i8 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8i8 (bitconvert (v1i64 FPR64:$src))),
@@ -6771,6 +7059,8 @@ def : Pat<(v8i8 (bitconvert (v1f64 FPR64:$src))),
(v8i8 (REV64v8i8 FPR64:$src))>;
def : Pat<(v8i8 (bitconvert (v4f16 FPR64:$src))),
(v8i8 (REV16v8i8 FPR64:$src))>;
+def : Pat<(v8i8 (bitconvert (v4bf16 FPR64:$src))),
+ (v8i8 (REV16v8i8 FPR64:$src))>;
}
let Predicates = [IsLE] in {
@@ -6779,6 +7069,7 @@ def : Pat<(f64 (bitconvert (v4i16 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v2f32 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))), (f64 FPR64:$src)>;
+def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))), (f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(f64 (bitconvert (v2i32 FPR64:$src))),
@@ -6791,6 +7082,8 @@ def : Pat<(f64 (bitconvert (v8i8 FPR64:$src))),
(f64 (REV64v8i8 FPR64:$src))>;
def : Pat<(f64 (bitconvert (v4f16 FPR64:$src))),
(f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(f64 (bitconvert (v4bf16 FPR64:$src))),
+ (f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(f64 (bitconvert (v1i64 FPR64:$src))), (f64 FPR64:$src)>;
def : Pat<(f64 (bitconvert (v1f64 FPR64:$src))), (f64 FPR64:$src)>;
@@ -6801,6 +7094,7 @@ def : Pat<(v1f64 (bitconvert (v4i16 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v8i8 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))), (v1f64 FPR64:$src)>;
+def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))), (v1f64 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v1f64 (bitconvert (v2i32 FPR64:$src))),
@@ -6813,6 +7107,8 @@ def : Pat<(v1f64 (bitconvert (v2f32 FPR64:$src))),
(v1f64 (REV64v2i32 FPR64:$src))>;
def : Pat<(v1f64 (bitconvert (v4f16 FPR64:$src))),
(v1f64 (REV64v4i16 FPR64:$src))>;
+def : Pat<(v1f64 (bitconvert (v4bf16 FPR64:$src))),
+ (v1f64 (REV64v4i16 FPR64:$src))>;
}
def : Pat<(v1f64 (bitconvert (v1i64 FPR64:$src))), (v1f64 FPR64:$src)>;
def : Pat<(v1f64 (bitconvert (f64 FPR64:$src))), (v1f64 FPR64:$src)>;
@@ -6824,6 +7120,7 @@ def : Pat<(v2f32 (bitconvert (v8i8 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v1f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))), (v2f32 FPR64:$src)>;
def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))), (v2f32 FPR64:$src)>;
+def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))), (v2f32 FPR64:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2f32 (bitconvert (v1i64 FPR64:$src))),
@@ -6838,6 +7135,8 @@ def : Pat<(v2f32 (bitconvert (f64 FPR64:$src))),
(v2f32 (REV64v2i32 FPR64:$src))>;
def : Pat<(v2f32 (bitconvert (v4f16 FPR64:$src))),
(v2f32 (REV32v4i16 FPR64:$src))>;
+def : Pat<(v2f32 (bitconvert (v4bf16 FPR64:$src))),
+ (v2f32 (REV32v4i16 FPR64:$src))>;
}
def : Pat<(v2f32 (bitconvert (v2i32 FPR64:$src))), (v2f32 FPR64:$src)>;
@@ -6848,6 +7147,7 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))), (f128 FPR128:$src)>;
+def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))), (f128 FPR128:$src)>;
def : Pat<(f128 (bitconvert (v16i8 FPR128:$src))), (f128 FPR128:$src)>;
}
let Predicates = [IsBE] in {
@@ -6862,6 +7162,9 @@ def : Pat<(f128 (bitconvert (v8i16 FPR128:$src))),
def : Pat<(f128 (bitconvert (v8f16 FPR128:$src))),
(f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
(REV64v8i16 FPR128:$src), (i32 8)))>;
+def : Pat<(f128 (bitconvert (v8bf16 FPR128:$src))),
+ (f128 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src), (i32 8)))>;
def : Pat<(f128 (bitconvert (v2f64 FPR128:$src))),
(f128 (EXTv16i8 FPR128:$src, FPR128:$src, (i32 8)))>;
def : Pat<(f128 (bitconvert (v4f32 FPR128:$src))),
@@ -6877,6 +7180,7 @@ def : Pat<(v2f64 (bitconvert (f128 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))), (v2f64 FPR128:$src)>;
+def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))), (v2f64 FPR128:$src)>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))), (v2f64 FPR128:$src)>;
}
@@ -6890,6 +7194,8 @@ def : Pat<(v2f64 (bitconvert (v8i16 FPR128:$src))),
(v2f64 (REV64v8i16 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v8f16 FPR128:$src))),
(v2f64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2f64 (bitconvert (v8bf16 FPR128:$src))),
+ (v2f64 (REV64v8i16 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v16i8 FPR128:$src))),
(v2f64 (REV64v16i8 FPR128:$src))>;
def : Pat<(v2f64 (bitconvert (v4f32 FPR128:$src))),
@@ -6901,6 +7207,7 @@ let Predicates = [IsLE] in {
def : Pat<(v4f32 (bitconvert (f128 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))), (v4f32 FPR128:$src)>;
+def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))), (v4f32 FPR128:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 FPR128:$src))), (v4f32 FPR128:$src)>;
@@ -6913,6 +7220,8 @@ def : Pat<(v4f32 (bitconvert (v8i16 FPR128:$src))),
(v4f32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v8f16 FPR128:$src))),
(v4f32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4f32 (bitconvert (v8bf16 FPR128:$src))),
+ (v4f32 (REV32v8i16 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v16i8 FPR128:$src))),
(v4f32 (REV32v16i8 FPR128:$src))>;
def : Pat<(v4f32 (bitconvert (v2i64 FPR128:$src))),
@@ -6929,6 +7238,7 @@ def : Pat<(v2i64 (bitconvert (v8i16 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))), (v2i64 FPR128:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))), (v2i64 FPR128:$src)>;
+def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))), (v2i64 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v2i64 (bitconvert (f128 FPR128:$src))),
@@ -6944,6 +7254,8 @@ def : Pat<(v2i64 (bitconvert (v4f32 FPR128:$src))),
(v2i64 (REV64v4i32 FPR128:$src))>;
def : Pat<(v2i64 (bitconvert (v8f16 FPR128:$src))),
(v2i64 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v2i64 (bitconvert (v8bf16 FPR128:$src))),
+ (v2i64 (REV64v8i16 FPR128:$src))>;
}
def : Pat<(v2i64 (bitconvert (v2f64 FPR128:$src))), (v2i64 FPR128:$src)>;
@@ -6954,6 +7266,7 @@ def : Pat<(v4i32 (bitconvert (v8i16 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))), (v4i32 FPR128:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))), (v4i32 FPR128:$src)>;
+def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))), (v4i32 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v4i32 (bitconvert (f128 FPR128:$src))),
@@ -6970,6 +7283,8 @@ def : Pat<(v4i32 (bitconvert (v2f64 FPR128:$src))),
(v4i32 (REV64v4i32 FPR128:$src))>;
def : Pat<(v4i32 (bitconvert (v8f16 FPR128:$src))),
(v4i32 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v4i32 (bitconvert (v8bf16 FPR128:$src))),
+ (v4i32 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v4i32 (bitconvert (v4f32 FPR128:$src))), (v4i32 FPR128:$src)>;
@@ -6998,6 +7313,7 @@ def : Pat<(v8i16 (bitconvert (v4f32 FPR128:$src))),
(v8i16 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v8i16 (bitconvert (v8f16 FPR128:$src))), (v8i16 FPR128:$src)>;
+def : Pat<(v8i16 (bitconvert (v8bf16 FPR128:$src))), (v8i16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))), (v8f16 FPR128:$src)>;
@@ -7006,6 +7322,13 @@ def : Pat<(v8f16 (bitconvert (v4i32 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))), (v8f16 FPR128:$src)>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))), (v8f16 FPR128:$src)>;
+
+def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))), (v8bf16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))), (v8bf16 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v8f16 (bitconvert (f128 FPR128:$src))),
@@ -7022,8 +7345,24 @@ def : Pat<(v8f16 (bitconvert (v2f64 FPR128:$src))),
(v8f16 (REV64v8i16 FPR128:$src))>;
def : Pat<(v8f16 (bitconvert (v4f32 FPR128:$src))),
(v8f16 (REV32v8i16 FPR128:$src))>;
+
+def : Pat<(v8bf16 (bitconvert (f128 FPR128:$src))),
+ (v8bf16 (EXTv16i8 (REV64v8i16 FPR128:$src),
+ (REV64v8i16 FPR128:$src),
+ (i32 8)))>;
+def : Pat<(v8bf16 (bitconvert (v2i64 FPR128:$src))),
+ (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4i32 FPR128:$src))),
+ (v8bf16 (REV32v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v16i8 FPR128:$src))),
+ (v8bf16 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v2f64 FPR128:$src))),
+ (v8bf16 (REV64v8i16 FPR128:$src))>;
+def : Pat<(v8bf16 (bitconvert (v4f32 FPR128:$src))),
+ (v8bf16 (REV32v8i16 FPR128:$src))>;
}
def : Pat<(v8f16 (bitconvert (v8i16 FPR128:$src))), (v8f16 FPR128:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 FPR128:$src))), (v8bf16 FPR128:$src)>;
let Predicates = [IsLE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))), (v16i8 FPR128:$src)>;
@@ -7033,6 +7372,7 @@ def : Pat<(v16i8 (bitconvert (v8i16 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v2f64 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))), (v16i8 FPR128:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))), (v16i8 FPR128:$src)>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))), (v16i8 FPR128:$src)>;
}
let Predicates = [IsBE] in {
def : Pat<(v16i8 (bitconvert (f128 FPR128:$src))),
@@ -7051,6 +7391,8 @@ def : Pat<(v16i8 (bitconvert (v4f32 FPR128:$src))),
(v16i8 (REV32v16i8 FPR128:$src))>;
def : Pat<(v16i8 (bitconvert (v8f16 FPR128:$src))),
(v16i8 (REV16v16i8 FPR128:$src))>;
+def : Pat<(v16i8 (bitconvert (v8bf16 FPR128:$src))),
+ (v16i8 (REV16v16i8 FPR128:$src))>;
}
def : Pat<(v4i16 (extract_subvector V128:$Rn, (i64 0))),
@@ -7061,6 +7403,8 @@ def : Pat<(v2f32 (extract_subvector V128:$Rn, (i64 0))),
(EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v4f16 (extract_subvector V128:$Rn, (i64 0))),
(EXTRACT_SUBREG V128:$Rn, dsub)>;
+def : Pat<(v4bf16 (extract_subvector V128:$Rn, (i64 0))),
+ (EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v2i32 (extract_subvector V128:$Rn, (i64 0))),
(EXTRACT_SUBREG V128:$Rn, dsub)>;
def : Pat<(v1i64 (extract_subvector V128:$Rn, (i64 0))),
@@ -7092,6 +7436,8 @@ multiclass InsertSubvectorUndef<ValueType Ty> {
(INSERT_SUBREG (v8i16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v4f16 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+ def : Pat<(insert_subvector undef, (v4bf16 FPR64:$src), (Ty 0)),
+ (INSERT_SUBREG (v8bf16 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
def : Pat<(insert_subvector undef, (v8i8 FPR64:$src), (Ty 0)),
(INSERT_SUBREG (v16i8 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
}
@@ -7317,3 +7663,5 @@ let AddedComplexity = 10 in {
include "AArch64InstrAtomics.td"
include "AArch64SVEInstrInfo.td"
+
+include "AArch64InstrGISel.td"
diff --git a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
index 3156bb446963..d975b8bd04fe 100644
--- a/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AArch64/AArch64LoadStoreOptimizer.cpp
@@ -29,6 +29,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -66,6 +67,10 @@ static cl::opt<unsigned> LdStLimit("aarch64-load-store-scan-limit",
static cl::opt<unsigned> UpdateLimit("aarch64-update-scan-limit", cl::init(100),
cl::Hidden);
+// Enable register renaming to find additional store pairing opportunities.
+static cl::opt<bool> EnableRenaming("aarch64-load-store-renaming",
+ cl::init(true), cl::Hidden);
+
#define AARCH64_LOAD_STORE_OPT_NAME "AArch64 load / store optimization pass"
namespace {
@@ -673,14 +678,14 @@ AArch64LoadStoreOpt::mergeNarrowZeroStores(MachineBasicBlock::iterator I,
assert(isPromotableZeroStoreInst(*I) && isPromotableZeroStoreInst(*MergeMI) &&
"Expected promotable zero stores.");
- MachineBasicBlock::iterator NextI = I;
- ++NextI;
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way we merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
if (NextI == MergeMI)
- ++NextI;
+ NextI = next_nodbg(NextI, E);
unsigned Opc = I->getOpcode();
bool IsScaled = !TII->isUnscaledLdSt(Opc);
@@ -743,18 +748,17 @@ static bool forAllMIsUntilDef(MachineInstr &MI, MCPhysReg DefReg,
const TargetRegisterInfo *TRI, unsigned Limit,
std::function<bool(MachineInstr &, bool)> &Fn) {
auto MBB = MI.getParent();
- for (MachineBasicBlock::reverse_iterator I = MI.getReverseIterator(),
- E = MBB->rend();
- I != E; I++) {
+ for (MachineInstr &I :
+ instructionsWithoutDebug(MI.getReverseIterator(), MBB->instr_rend())) {
if (!Limit)
return false;
--Limit;
- bool isDef = any_of(I->operands(), [DefReg, TRI](MachineOperand &MOP) {
+ bool isDef = any_of(I.operands(), [DefReg, TRI](MachineOperand &MOP) {
return MOP.isReg() && MOP.isDef() && !MOP.isDebug() && MOP.getReg() &&
TRI->regsOverlap(MOP.getReg(), DefReg);
});
- if (!Fn(*I, isDef))
+ if (!Fn(I, isDef))
return false;
if (isDef)
break;
@@ -778,14 +782,14 @@ MachineBasicBlock::iterator
AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator Paired,
const LdStPairFlags &Flags) {
- MachineBasicBlock::iterator NextI = I;
- ++NextI;
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
// If NextI is the second of the two instructions to be merged, we need
// to skip one further. Either way we merge will invalidate the iterator,
// and we don't need to scan the new instruction, as it's a pairwise
// instruction, which we're not considering for further action anyway.
if (NextI == Paired)
- ++NextI;
+ NextI = next_nodbg(NextI, E);
int SExtIdx = Flags.getSExtIdx();
unsigned Opc =
@@ -1004,8 +1008,8 @@ AArch64LoadStoreOpt::mergePairedInsns(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator
AArch64LoadStoreOpt::promoteLoadFromStore(MachineBasicBlock::iterator LoadI,
MachineBasicBlock::iterator StoreI) {
- MachineBasicBlock::iterator NextI = LoadI;
- ++NextI;
+ MachineBasicBlock::iterator NextI =
+ next_nodbg(LoadI, LoadI->getParent()->end());
int LoadSize = TII->getMemScale(*LoadI);
int StoreSize = TII->getMemScale(*StoreI);
@@ -1140,24 +1144,11 @@ static int alignTo(int Num, int PowOf2) {
return (Num + PowOf2 - 1) & ~(PowOf2 - 1);
}
-static bool mayAlias(MachineInstr &MIa, MachineInstr &MIb,
- AliasAnalysis *AA) {
- // One of the instructions must modify memory.
- if (!MIa.mayStore() && !MIb.mayStore())
- return false;
-
- // Both instructions must be memory operations.
- if (!MIa.mayLoadOrStore() && !MIb.mayLoadOrStore())
- return false;
-
- return MIa.mayAlias(AA, MIb, /*UseTBAA*/false);
-}
-
static bool mayAlias(MachineInstr &MIa,
SmallVectorImpl<MachineInstr *> &MemInsns,
AliasAnalysis *AA) {
for (MachineInstr *MIb : MemInsns)
- if (mayAlias(MIa, *MIb, AA))
+ if (MIa.mayAlias(AA, *MIb, /*UseTBAA*/ false))
return true;
return false;
@@ -1183,7 +1174,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
unsigned Count = 0;
do {
- --MBBI;
+ MBBI = prev_nodbg(MBBI, B);
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
@@ -1215,7 +1206,7 @@ bool AArch64LoadStoreOpt::findMatchingStore(
return false;
// If we encounter a store aliased with the load, return early.
- if (MI.mayStore() && mayAlias(LoadMI, MI, AA))
+ if (MI.mayStore() && LoadMI.mayAlias(AA, MI, /*UseTBAA*/ false))
return false;
} while (MBBI != B && Count < Limit);
return false;
@@ -1296,7 +1287,23 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
LLVM_DEBUG(dbgs() << " Operand not killed at " << FirstMI << "\n");
return false;
}
- auto canRenameMOP = [](const MachineOperand &MOP) {
+ auto canRenameMOP = [TRI](const MachineOperand &MOP) {
+ if (MOP.isReg()) {
+ auto *RegClass = TRI->getMinimalPhysRegClass(MOP.getReg());
+ // Renaming registers with multiple disjunct sub-registers (e.g. the
+ // result of a LD3) means that all sub-registers are renamed, potentially
+ // impacting other instructions we did not check. Bail out.
+ // Note that this relies on the structure of the AArch64 register file. In
+ // particular, a subregister cannot be written without overwriting the
+ // whole register.
+ if (RegClass->HasDisjunctSubRegs) {
+ LLVM_DEBUG(
+ dbgs()
+ << " Cannot rename operands with multiple disjunct subregisters ("
+ << MOP << ")\n");
+ return false;
+ }
+ }
return MOP.isImplicit() ||
(MOP.isRenamable() && !MOP.isEarlyClobber() && !MOP.isTied());
};
@@ -1325,6 +1332,19 @@ canRenameUpToDef(MachineInstr &FirstMI, LiveRegUnits &UsedInBetween,
// For defs, check if we can rename the first def of RegToRename.
if (FoundDef) {
+ // For some pseudo instructions, we might not generate code in the end
+ // (e.g. KILL) and we would end up without a correct def for the rename
+ // register.
+ // TODO: This might be overly conservative and we could handle those cases
+ // in multiple ways:
+ // 1. Insert an extra copy, to materialize the def.
+  //   2. Skip pseudo-defs until we find a non-pseudo def.
+ if (MI.isPseudo()) {
+ LLVM_DEBUG(dbgs() << " Cannot rename pseudo instruction " << MI
+ << "\n");
+ return false;
+ }
+
for (auto &MOP : MI.operands()) {
if (!MOP.isReg() || !MOP.isDef() || MOP.isDebug() || !MOP.getReg() ||
!TRI->regsOverlap(MOP.getReg(), RegToRename))
@@ -1422,7 +1442,7 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
MachineBasicBlock::iterator MBBI = I;
MachineBasicBlock::iterator MBBIWithRenameReg;
MachineInstr &FirstMI = *I;
- ++MBBI;
+ MBBI = next_nodbg(MBBI, E);
bool MayLoad = FirstMI.mayLoad();
bool IsUnscaled = TII->isUnscaledLdSt(FirstMI);
@@ -1433,6 +1453,9 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
bool IsPromotableZeroStore = isPromotableZeroStoreInst(FirstMI);
Optional<bool> MaybeCanRename = None;
+ if (!EnableRenaming)
+ MaybeCanRename = {false};
+
SmallPtrSet<const TargetRegisterClass *, 5> RequiredClasses;
LiveRegUnits UsedInBetween;
UsedInBetween.init(*TRI);
@@ -1447,7 +1470,8 @@ AArch64LoadStoreOpt::findMatchingInsn(MachineBasicBlock::iterator I,
// Remember any instructions that read/write memory between FirstMI and MI.
SmallVector<MachineInstr *, 4> MemInsns;
- for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ for (unsigned Count = 0; MBBI != E && Count < Limit;
+ MBBI = next_nodbg(MBBI, E)) {
MachineInstr &MI = *MBBI;
UsedInBetween.accumulate(MI);
@@ -1616,12 +1640,13 @@ AArch64LoadStoreOpt::mergeUpdateInsn(MachineBasicBlock::iterator I,
assert((Update->getOpcode() == AArch64::ADDXri ||
Update->getOpcode() == AArch64::SUBXri) &&
"Unexpected base register update instruction to merge!");
- MachineBasicBlock::iterator NextI = I;
+ MachineBasicBlock::iterator E = I->getParent()->end();
+ MachineBasicBlock::iterator NextI = next_nodbg(I, E);
// Return the instruction following the merged instruction, which is
// the instruction following our unmerged load. Unless that's the add/sub
// instruction we're merging, in which case it's the one after that.
- if (++NextI == Update)
- ++NextI;
+ if (NextI == Update)
+ NextI = next_nodbg(NextI, E);
int Value = Update->getOperand(2).getImm();
assert(AArch64_AM::getShiftValue(Update->getOperand(3).getImm()) == 0 &&
@@ -1759,8 +1784,24 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// insn (inclusive) and the second insn.
ModifiedRegUnits.clear();
UsedRegUnits.clear();
- ++MBBI;
- for (unsigned Count = 0; MBBI != E && Count < Limit; ++MBBI) {
+ MBBI = next_nodbg(MBBI, E);
+
+ // We can't post-increment the stack pointer if any instruction between
+ // the memory access (I) and the increment (MBBI) can access the memory
+ // region defined by [SP, MBBI].
+ const bool BaseRegSP = BaseReg == AArch64::SP;
+ if (BaseRegSP) {
+ // FIXME: For now, we always block the optimization over SP in windows
+ // targets as it requires to adjust the unwind/debug info, messing up
+ // the unwind info can actually cause a miscompile.
+ const MCAsmInfo *MAI = I->getMF()->getTarget().getMCAsmInfo();
+ if (MAI->usesWindowsCFI() &&
+ I->getMF()->getFunction().needsUnwindTableEntry())
+ return E;
+ }
+
+ for (unsigned Count = 0; MBBI != E && Count < Limit;
+ MBBI = next_nodbg(MBBI, E)) {
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
@@ -1777,8 +1818,11 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnForward(
// Otherwise, if the base register is used or modified, we have no match, so
// return early.
+ // If we are optimizing SP, do not allow instructions that may load or store
+ // in between the load and the optimized value update.
if (!ModifiedRegUnits.available(BaseReg) ||
- !UsedRegUnits.available(BaseReg))
+ !UsedRegUnits.available(BaseReg) ||
+ (BaseRegSP && MBBI->mayLoadOrStore()))
return E;
}
return E;
@@ -1815,7 +1859,7 @@ MachineBasicBlock::iterator AArch64LoadStoreOpt::findMatchingUpdateInsnBackward(
UsedRegUnits.clear();
unsigned Count = 0;
do {
- --MBBI;
+ MBBI = prev_nodbg(MBBI, B);
MachineInstr &MI = *MBBI;
// Don't count transient instructions towards the search limit since there
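
A recurring change in this file is replacing bare ++/-- iterator steps with next_nodbg/prev_nodbg, so that DBG_VALUE instructions are skipped and the presence of debug info cannot change pairing or update-merging decisions. A simplified, standalone C++ sketch of that idiom; the Instr type and IsDebug flag are invented for illustration and are not the LLVM iterators:

#include <iostream>
#include <list>
#include <string>

struct Instr {
  std::string Name;
  bool IsDebug = false; // stand-in for MI.isDebugInstr()
};

using InstrList = std::list<Instr>;
using Iter = InstrList::iterator;

// Advance one "real" instruction, skipping debug instructions, in the spirit
// of next_nodbg(It, End) used throughout the patch.
static Iter nextNoDbg(Iter It, Iter End) {
  do {
    ++It;
  } while (It != End && It->IsDebug);
  return It;
}

int main() {
  InstrList MBB = {{"ldr x0"}, {"DBG_VALUE", true}, {"ldr x1"}, {"add x2"}};
  // The DBG_VALUE is invisible to the scan, so a candidate pairing
  // (ldr x0 / ldr x1) is found whether or not debug info is present.
  for (Iter It = MBB.begin(); It != MBB.end(); It = nextNoDbg(It, MBB.end()))
    std::cout << It->Name << "\n";
  return 0;
}
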
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
new file mode 100644
index 000000000000..a37e38072554
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.cpp
@@ -0,0 +1,32 @@
+//=- AArch64MachineFunctionInfo.cpp - AArch64 Machine Function Info ---------=//
+
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements AArch64-specific per-machine-function
+/// information.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AArch64MachineFunctionInfo.h"
+
+using namespace llvm;
+
+yaml::AArch64FunctionInfo::AArch64FunctionInfo(
+ const llvm::AArch64FunctionInfo &MFI)
+ : HasRedZone(MFI.hasRedZone()) {}
+
+void yaml::AArch64FunctionInfo::mappingImpl(yaml::IO &YamlIO) {
+ MappingTraits<AArch64FunctionInfo>::mapping(YamlIO, *this);
+}
+
+void AArch64FunctionInfo::initializeBaseYamlFields(
+ const yaml::AArch64FunctionInfo &YamlMFI) {
+ if (YamlMFI.HasRedZone.hasValue())
+ HasRedZone = YamlMFI.HasRedZone;
+}
diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 6ddb3fdb0046..84aa53f2bece 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -18,6 +18,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
+#include "llvm/CodeGen/MIRYamlMapping.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
#include "llvm/IR/Function.h"
@@ -26,6 +27,10 @@
namespace llvm {
+namespace yaml {
+struct AArch64FunctionInfo;
+} // end namespace yaml
+
class MachineInstr;
/// AArch64FunctionInfo - This class is derived from MachineFunctionInfo and
@@ -126,6 +131,10 @@ class AArch64FunctionInfo final : public MachineFunctionInfo {
// stack slot.
unsigned TaggedBasePointerOffset = 0;
+ /// OutliningStyle denotes, if a function was outlined, how it was outlined,
+ /// e.g. Tail Call, Thunk, or Function if none apply.
+ Optional<std::string> OutliningStyle;
+
public:
AArch64FunctionInfo() = default;
@@ -137,6 +146,7 @@ public:
if (MF.getFunction().hasFnAttribute(Attribute::NoRedZone))
HasRedZone = false;
}
+ void initializeBaseYamlFields(const yaml::AArch64FunctionInfo &YamlMFI);
unsigned getBytesInStackArgArea() const { return BytesInStackArgArea; }
void setBytesInStackArgArea(unsigned bytes) { BytesInStackArgArea = bytes; }
@@ -173,6 +183,9 @@ public:
void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; }
uint64_t getLocalStackSize() const { return LocalStackSize; }
+ void setOutliningStyle(std::string Style) { OutliningStyle = Style; }
+ Optional<std::string> getOutliningStyle() const { return OutliningStyle; }
+
void setCalleeSavedStackSize(unsigned Size) {
CalleeSavedStackSize = Size;
HasCalleeSavedStackSize = true;
@@ -333,6 +346,25 @@ private:
DenseMap<int, std::pair<unsigned, MCSymbol *>> JumpTableEntryInfo;
};
+namespace yaml {
+struct AArch64FunctionInfo final : public yaml::MachineFunctionInfo {
+ Optional<bool> HasRedZone;
+
+ AArch64FunctionInfo() = default;
+ AArch64FunctionInfo(const llvm::AArch64FunctionInfo &MFI);
+
+ void mappingImpl(yaml::IO &YamlIO) override;
+ ~AArch64FunctionInfo() = default;
+};
+
+template <> struct MappingTraits<AArch64FunctionInfo> {
+ static void mapping(IO &YamlIO, AArch64FunctionInfo &MFI) {
+ YamlIO.mapOptional("hasRedZone", MFI.HasRedZone);
+ }
+};
+
+} // end namespace yaml
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_AARCH64_AARCH64MACHINEFUNCTIONINFO_H
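
The new yaml::AArch64FunctionInfo plus its MappingTraits specialization is what lets MIR serialize and parse per-function target state such as hasRedZone. A minimal, self-contained sketch of the same LLVM YAML I/O mechanism, using a made-up DemoFunctionInfo struct rather than the in-tree class:

#include "llvm/Support/YAMLTraits.h"
#include "llvm/Support/raw_ostream.h"

// Hypothetical demo struct, standing in for the per-function info above.
struct DemoFunctionInfo {
  bool HasRedZone = false;
};

namespace llvm {
namespace yaml {
template <> struct MappingTraits<DemoFunctionInfo> {
  static void mapping(IO &YamlIO, DemoFunctionInfo &Info) {
    // Absent keys keep the in-memory default; present keys override it.
    YamlIO.mapOptional("hasRedZone", Info.HasRedZone, false);
  }
};
} // namespace yaml
} // namespace llvm

int main() {
  DemoFunctionInfo Info;
  Info.HasRedZone = true;
  llvm::yaml::Output Out(llvm::outs());
  Out << Info; // emits roughly: "hasRedZone: true" inside a YAML document
  return 0;
}
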
diff --git a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
index 9135f1b40122..9044c94bc4fe 100644
--- a/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
+++ b/llvm/lib/Target/AArch64/AArch64PromoteConstant.cpp
@@ -250,6 +250,20 @@ static bool isConstantUsingVectorTy(const Type *CstTy) {
return false;
}
+// Returns true if \p C contains only ConstantData leaves and no global values,
+// block addresses or constant expressions. Traverses ConstantAggregates.
+static bool containsOnlyConstantData(const Constant *C) {
+ if (isa<ConstantData>(C))
+ return true;
+
+ if (isa<GlobalValue>(C) || isa<BlockAddress>(C) || isa<ConstantExpr>(C))
+ return false;
+
+ return all_of(C->operands(), [](const Use &U) {
+ return containsOnlyConstantData(cast<Constant>(&U));
+ });
+}
+
/// Check if the given use (Instruction + OpIdx) of Cst should be converted into
/// a load of a global variable initialized with Cst.
/// A use should be converted if it is legal to do so.
@@ -304,7 +318,7 @@ static bool shouldConvertUse(const Constant *Cst, const Instruction *Instr,
// Do not mess with inline asm.
const CallInst *CI = dyn_cast<const CallInst>(Instr);
- return !(CI && isa<const InlineAsm>(CI->getCalledValue()));
+ return !(CI && CI->isInlineAsm());
}
/// Check if the given Cst should be converted into
@@ -550,9 +564,10 @@ bool AArch64PromoteConstant::runOnFunction(Function &F,
for (Use &U : I.operands()) {
Constant *Cst = dyn_cast<Constant>(U);
// There is no point in promoting global values as they are already
- // global. Do not promote constant expressions either, as they may
- // require some code expansion.
- if (!Cst || isa<GlobalValue>(Cst) || isa<ConstantExpr>(Cst))
+ // global. Do not promote constants containing constant expressions, global
+ // values or block addresses either, as they may require some code
+ // expansion.
+ if (!Cst || isa<GlobalValue>(Cst) || !containsOnlyConstantData(Cst))
continue;
// Check if this constant is worth promoting.
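
containsOnlyConstantData generalizes the old isa<ConstantExpr> check: instead of rejecting only a top-level constant expression, it walks the whole constant tree and promotes a constant only if every leaf is plain data. A standalone sketch of that recursive shape; the Kind/Node types are invented for illustration and are not the LLVM Constant hierarchy:

#include <algorithm>
#include <iostream>
#include <vector>

// Stand-ins for the constant kinds the real check distinguishes.
enum class Kind { Data, Global, BlockAddress, Expr, Aggregate };

struct Node {
  Kind K;
  std::vector<const Node *> Ops; // operands, only used for aggregates
};

// Accept a constant only if every leaf is plain data: no globals, block
// addresses or constant expressions anywhere in the tree.
static bool containsOnlyConstantData(const Node *C) {
  if (C->K == Kind::Data)
    return true;
  if (C->K == Kind::Global || C->K == Kind::BlockAddress || C->K == Kind::Expr)
    return false;
  return std::all_of(C->Ops.begin(), C->Ops.end(),
                     [](const Node *Op) { return containsOnlyConstantData(Op); });
}

int main() {
  Node D{Kind::Data, {}};
  Node G{Kind::Global, {}};
  Node AllData{Kind::Aggregate, {&D, &D}};
  Node Mixed{Kind::Aggregate, {&D, &G}};
  std::cout << containsOnlyConstantData(&AllData) << "\n"; // 1: promotable
  std::cout << containsOnlyConstantData(&Mixed) << "\n";   // 0: has a global
  return 0;
}
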
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
index 14f839cd4f81..886158ca4490 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.cpp
@@ -43,24 +43,27 @@ AArch64RegisterInfo::AArch64RegisterInfo(const Triple &TT)
const MCPhysReg *
AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
- if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
- return CSR_Win_AArch64_CFGuard_Check_SaveList;
- if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
- return CSR_Win_AArch64_AAPCS_SaveList;
+
if (MF->getFunction().getCallingConv() == CallingConv::GHC)
// GHC set of callee saved regs is empty as all those regs are
// used for passing STG regs around
return CSR_AArch64_NoRegs_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg)
return CSR_AArch64_AllRegs_SaveList;
+
+ // Darwin has its own CSR_AArch64_AAPCS_SaveList, which means most CSR save
+ // lists depending on that will need to have their Darwin variant as well.
+ if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return getDarwinCalleeSavedRegs(MF);
+
+ if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
+ return CSR_Win_AArch64_CFGuard_Check_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().isTargetWindows())
+ return CSR_Win_AArch64_AAPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
return CSR_AArch64_AAVPCS_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
return CSR_AArch64_SVE_AAPCS_SaveList;
- if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
- return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR() ?
- CSR_AArch64_CXX_TLS_Darwin_PE_SaveList :
- CSR_AArch64_CXX_TLS_Darwin_SaveList;
if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
->supportSwiftError() &&
MF->getFunction().getAttributes().hasAttrSomewhere(
@@ -68,17 +71,47 @@ AArch64RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_AArch64_AAPCS_SwiftError_SaveList;
if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
return CSR_AArch64_RT_MostRegs_SaveList;
- if (MF->getSubtarget<AArch64Subtarget>().isTargetDarwin())
- return CSR_Darwin_AArch64_AAPCS_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::Win64)
+ // This is for OSes other than Windows; Windows is a separate case further
+ // above.
+ return CSR_AArch64_AAPCS_X18_SaveList;
return CSR_AArch64_AAPCS_SaveList;
}
+const MCPhysReg *
+AArch64RegisterInfo::getDarwinCalleeSavedRegs(const MachineFunction *MF) const {
+ assert(MF && "Invalid MachineFunction pointer.");
+ assert(MF->getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
+ "Invalid subtarget for getDarwinCalleeSavedRegs");
+
+ if (MF->getFunction().getCallingConv() == CallingConv::CFGuard_Check)
+ report_fatal_error(
+ "Calling convention CFGuard_Check is unsupported on Darwin.");
+ if (MF->getFunction().getCallingConv() == CallingConv::AArch64_VectorCall)
+ return CSR_Darwin_AArch64_AAVPCS_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::AArch64_SVE_VectorCall)
+ report_fatal_error(
+ "Calling convention SVE_VectorCall is unsupported on Darwin.");
+ if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS)
+ return MF->getInfo<AArch64FunctionInfo>()->isSplitCSR()
+ ? CSR_Darwin_AArch64_CXX_TLS_PE_SaveList
+ : CSR_Darwin_AArch64_CXX_TLS_SaveList;
+ if (MF->getSubtarget<AArch64Subtarget>().getTargetLowering()
+ ->supportSwiftError() &&
+ MF->getFunction().getAttributes().hasAttrSomewhere(
+ Attribute::SwiftError))
+ return CSR_Darwin_AArch64_AAPCS_SwiftError_SaveList;
+ if (MF->getFunction().getCallingConv() == CallingConv::PreserveMost)
+ return CSR_Darwin_AArch64_RT_MostRegs_SaveList;
+ return CSR_Darwin_AArch64_AAPCS_SaveList;
+}
+
const MCPhysReg *AArch64RegisterInfo::getCalleeSavedRegsViaCopy(
const MachineFunction *MF) const {
assert(MF && "Invalid MachineFunction pointer.");
if (MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
MF->getInfo<AArch64FunctionInfo>()->isSplitCSR())
- return CSR_AArch64_CXX_TLS_Darwin_ViaCopy_SaveList;
+ return CSR_Darwin_AArch64_CXX_TLS_ViaCopy_SaveList;
return nullptr;
}
@@ -113,6 +146,32 @@ AArch64RegisterInfo::getSubClassWithSubReg(const TargetRegisterClass *RC,
}
const uint32_t *
+AArch64RegisterInfo::getDarwinCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ assert(MF.getSubtarget<AArch64Subtarget>().isTargetDarwin() &&
+ "Invalid subtarget for getDarwinCallPreservedMask");
+
+ if (CC == CallingConv::CXX_FAST_TLS)
+ return CSR_Darwin_AArch64_CXX_TLS_RegMask;
+ if (CC == CallingConv::AArch64_VectorCall)
+ return CSR_Darwin_AArch64_AAVPCS_RegMask;
+ if (CC == CallingConv::AArch64_SVE_VectorCall)
+ report_fatal_error(
+ "Calling convention SVE_VectorCall is unsupported on Darwin.");
+ if (CC == CallingConv::CFGuard_Check)
+ report_fatal_error(
+ "Calling convention CFGuard_Check is unsupported on Darwin.");
+ if (MF.getSubtarget<AArch64Subtarget>()
+ .getTargetLowering()
+ ->supportSwiftError() &&
+ MF.getFunction().getAttributes().hasAttrSomewhere(Attribute::SwiftError))
+ return CSR_Darwin_AArch64_AAPCS_SwiftError_RegMask;
+ if (CC == CallingConv::PreserveMost)
+ return CSR_Darwin_AArch64_RT_MostRegs_RegMask;
+ return CSR_Darwin_AArch64_AAPCS_RegMask;
+}
+
+const uint32_t *
AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
bool SCS = MF.getFunction().hasFnAttribute(Attribute::ShadowCallStack);
@@ -121,9 +180,14 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return SCS ? CSR_AArch64_NoRegs_SCS_RegMask : CSR_AArch64_NoRegs_RegMask;
if (CC == CallingConv::AnyReg)
return SCS ? CSR_AArch64_AllRegs_SCS_RegMask : CSR_AArch64_AllRegs_RegMask;
- if (CC == CallingConv::CXX_FAST_TLS)
- return SCS ? CSR_AArch64_CXX_TLS_Darwin_SCS_RegMask
- : CSR_AArch64_CXX_TLS_Darwin_RegMask;
+
+ // All the following calling conventions are handled differently on Darwin.
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin()) {
+ if (SCS)
+ report_fatal_error("ShadowCallStack attribute not supported on Darwin.");
+ return getDarwinCallPreservedMask(MF, CC);
+ }
+
if (CC == CallingConv::AArch64_VectorCall)
return SCS ? CSR_AArch64_AAVPCS_SCS_RegMask : CSR_AArch64_AAVPCS_RegMask;
if (CC == CallingConv::AArch64_SVE_VectorCall)
@@ -145,7 +209,7 @@ AArch64RegisterInfo::getCallPreservedMask(const MachineFunction &MF,
const uint32_t *AArch64RegisterInfo::getTLSCallPreservedMask() const {
if (TT.isOSDarwin())
- return CSR_AArch64_TLS_Darwin_RegMask;
+ return CSR_Darwin_AArch64_TLS_RegMask;
assert(TT.isOSBinFormatELF() && "Invalid target");
return CSR_AArch64_TLS_ELF_RegMask;
@@ -186,6 +250,8 @@ AArch64RegisterInfo::getThisReturnPreservedMask(const MachineFunction &MF,
// In case that the calling convention does not use the same register for
// both, the function should return NULL (does not currently apply)
assert(CC != CallingConv::GHC && "should not be GHC calling convention.");
+ if (MF.getSubtarget<AArch64Subtarget>().isTargetDarwin())
+ return CSR_Darwin_AArch64_AAPCS_ThisReturn_RegMask;
return CSR_AArch64_AAPCS_ThisReturn_RegMask;
}
@@ -222,7 +288,7 @@ AArch64RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
bool AArch64RegisterInfo::isReservedReg(const MachineFunction &MF,
- unsigned Reg) const {
+ MCRegister Reg) const {
return getReservedRegs(MF)[Reg];
}
@@ -240,11 +306,11 @@ void AArch64RegisterInfo::emitReservedArgRegCallError(
}
bool AArch64RegisterInfo::isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const {
+ MCRegister PhysReg) const {
return !isReservedReg(MF, PhysReg);
}
-bool AArch64RegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+bool AArch64RegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
return PhysReg == AArch64::WZR || PhysReg == AArch64::XZR;
}
@@ -390,12 +456,16 @@ bool AArch64RegisterInfo::needsFrameBaseReg(MachineInstr *MI,
if (isFrameOffsetLegal(MI, AArch64::SP, Offset))
return false;
+ // If even offset 0 is illegal, we don't want a virtual base register.
+ if (!isFrameOffsetLegal(MI, AArch64::SP, 0))
+ return false;
+
// The offset likely isn't legal; we want to allocate a virtual base register.
return true;
}
bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
- unsigned BaseReg,
+ Register BaseReg,
int64_t Offset) const {
assert(MI && "Unable to get the legal offset for nil instruction.");
StackOffset SaveOffset(Offset, MVT::i8);
@@ -405,7 +475,7 @@ bool AArch64RegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
/// Insert defining instruction(s) for BaseReg to be a pointer to FrameIdx
/// at the beginning of the basic block.
void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg,
+ Register BaseReg,
int FrameIdx,
int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
@@ -426,7 +496,7 @@ void AArch64RegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addImm(Shifter);
}
-void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
// ARM doesn't need the general 64-bit offsets
StackOffset Off(Offset, MVT::i8);
@@ -445,6 +515,27 @@ void AArch64RegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
+// Create a scratch register for the frame index elimination in an instruction.
+// This function has special handling of stack tagging loop pseudos, in which
+// case it can also change the instruction opcode (but not the operands).
+static Register
+createScratchRegisterForInstruction(MachineInstr &MI,
+ const AArch64InstrInfo *TII) {
+ // ST*Gloop have a reserved scratch register in operand 1. Use it, and also
+ // replace the instruction with the writeback variant because it will now
+ // satisfy the operand constraints for it.
+ if (MI.getOpcode() == AArch64::STGloop) {
+ MI.setDesc(TII->get(AArch64::STGloop_wback));
+ return MI.getOperand(1).getReg();
+ } else if (MI.getOpcode() == AArch64::STZGloop) {
+ MI.setDesc(TII->get(AArch64::STZGloop_wback));
+ return MI.getOperand(1).getReg();
+ } else {
+ return MI.getMF()->getRegInfo().createVirtualRegister(
+ &AArch64::GPR64RegClass);
+ }
+}
+
void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -461,7 +552,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
bool Tagged =
MI.getOperand(FIOperandNum).getTargetFlags() & AArch64II::MO_TAGGED;
- unsigned FrameReg;
+ Register FrameReg;
// Special handling of dbg_value, stackmap and patchpoint instructions.
if (MI.isDebugValue() || MI.getOpcode() == TargetOpcode::STACKMAP ||
@@ -531,8 +622,7 @@ void AArch64RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// If we get here, the immediate doesn't fit into the instruction. We folded
// as much as possible above. Handle the rest, providing a register that is
// SP+LargeImm.
- Register ScratchReg =
- MF.getRegInfo().createVirtualRegister(&AArch64::GPR64RegClass);
+ Register ScratchReg = createScratchRegisterForInstruction(MI, TII);
emitFrameOffset(MBB, II, MI.getDebugLoc(), ScratchReg, FrameReg, Offset, TII);
MI.getOperand(FIOperandNum).ChangeToRegister(ScratchReg, false, false, true);
}
@@ -572,6 +662,8 @@ unsigned AArch64RegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
return 32;
case AArch64::FPR128_loRegClassID:
+ case AArch64::FPR64_loRegClassID:
+ case AArch64::FPR16_loRegClassID:
return 16;
}
}
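
The restructured getCalleeSavedRegs/getCallPreservedMask now branch to Darwin-specific helpers early, and those helpers reject calling conventions the platform does not support instead of silently falling through to a generic table. A compact C++ sketch of that dispatch shape; the enum values and table names below are placeholders, not the generated CSR_* symbols:

#include <cstdio>
#include <cstdlib>

enum class CC { C, CFGuardCheck, SVEVectorCall, PreserveMost };

[[noreturn]] static void reportFatalError(const char *Msg) {
  std::fprintf(stderr, "fatal: %s\n", Msg);
  std::abort();
}

// Darwin-specific save list: handle what Darwin supports, reject the rest.
static const char *getDarwinCalleeSavedRegs(CC Conv) {
  if (Conv == CC::CFGuardCheck)
    reportFatalError("CFGuard_Check is unsupported on Darwin");
  if (Conv == CC::SVEVectorCall)
    reportFatalError("SVE_VectorCall is unsupported on Darwin");
  if (Conv == CC::PreserveMost)
    return "Darwin_RT_MostRegs_SaveList";
  return "Darwin_AAPCS_SaveList";
}

// Generic entry point: dispatch to the Darwin tables first, then fall back to
// the common (ELF/Windows) tables, mirroring the control flow in the patch.
static const char *getCalleeSavedRegs(bool IsDarwin, CC Conv) {
  if (IsDarwin)
    return getDarwinCalleeSavedRegs(Conv);
  if (Conv == CC::CFGuardCheck)
    return "Win_CFGuard_Check_SaveList";
  if (Conv == CC::PreserveMost)
    return "RT_MostRegs_SaveList";
  return "AAPCS_SaveList";
}

int main() {
  std::printf("%s\n", getCalleeSavedRegs(/*IsDarwin=*/true, CC::PreserveMost));
  std::printf("%s\n", getCalleeSavedRegs(/*IsDarwin=*/false, CC::C));
  return 0;
}
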
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
index 2c3f82c530d8..22a8ba76c611 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.h
@@ -34,7 +34,7 @@ public:
return getEncodingValue(i);
}
- bool isReservedReg(const MachineFunction &MF, unsigned Reg) const;
+ bool isReservedReg(const MachineFunction &MF, MCRegister Reg) const;
bool isAnyArgRegReserved(const MachineFunction &MF) const;
void emitReservedArgRegCallError(const MachineFunction &MF) const;
@@ -44,10 +44,13 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
+ const MCPhysReg *getDarwinCalleeSavedRegs(const MachineFunction *MF) const;
const MCPhysReg *
getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID) const override;
+ const uint32_t *getDarwinCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID) const;
unsigned getCSRFirstUseCost() const override {
// The cost will be compared against BlockFrequency where entry has the
@@ -83,8 +86,8 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const override;
- bool isConstantPhysReg(unsigned PhysReg) const override;
+ MCRegister PhysReg) const override;
+ bool isConstantPhysReg(MCRegister PhysReg) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
unsigned Kind = 0) const override;
@@ -96,12 +99,12 @@ public:
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB, unsigned BaseReg,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum,
@@ -118,10 +121,6 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction&) const override {
- return true;
- }
-
unsigned getLocalAddressRegister(const MachineFunction &MF) const;
};
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
index f52feab03953..bd05c56009a1 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64RegisterInfo.td
@@ -422,25 +422,35 @@ def Q31 : AArch64Reg<31, "q31", [D31], ["v31", ""]>, DwarfRegAlias<B31>;
def FPR8 : RegisterClass<"AArch64", [untyped], 8, (sequence "B%u", 0, 31)> {
let Size = 8;
}
-def FPR16 : RegisterClass<"AArch64", [f16], 16, (sequence "H%u", 0, 31)> {
+def FPR16 : RegisterClass<"AArch64", [f16, bf16], 16, (sequence "H%u", 0, 31)> {
+ let Size = 16;
+}
+
+def FPR16_lo : RegisterClass<"AArch64", [f16], 16, (trunc FPR16, 16)> {
let Size = 16;
}
def FPR32 : RegisterClass<"AArch64", [f32, i32], 32,(sequence "S%u", 0, 31)>;
def FPR64 : RegisterClass<"AArch64", [f64, i64, v2f32, v1f64, v8i8, v4i16, v2i32,
- v1i64, v4f16],
- 64, (sequence "D%u", 0, 31)>;
+ v1i64, v4f16, v4bf16],
+ 64, (sequence "D%u", 0, 31)>;
+def FPR64_lo : RegisterClass<"AArch64",
+ [v8i8, v4i16, v2i32, v1i64, v4f16, v4bf16, v2f32,
+ v1f64],
+ 64, (trunc FPR64, 16)>;
+
// We don't (yet) have an f128 legal type, so don't use that here. We
// normalize 128-bit vectors to v2f64 for arg passing and such, so use
// that here.
def FPR128 : RegisterClass<"AArch64",
[v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, f128,
- v8f16],
+ v8f16, v8bf16],
128, (sequence "Q%u", 0, 31)>;
// The lower 16 vector registers. Some instructions can only take registers
// in this range.
def FPR128_lo : RegisterClass<"AArch64",
- [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16],
+ [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16,
+ v8bf16],
128, (trunc FPR128, 16)>;
// Pairs, triples, and quads of 64-bit vector registers.
@@ -503,6 +513,9 @@ def VectorRegLoAsmOperand : AsmOperandClass {
let Name = "VectorRegLo";
let PredicateMethod = "isNeonVectorRegLo";
}
+def V64_lo : RegisterOperand<FPR64_lo, "printVRegOperand"> {
+ let ParserMatchClass = VectorRegLoAsmOperand;
+}
def V128_lo : RegisterOperand<FPR128_lo, "printVRegOperand"> {
let ParserMatchClass = VectorRegLoAsmOperand;
}
@@ -641,6 +654,10 @@ def FPR16Op : RegisterOperand<FPR16, "printOperand"> {
let ParserMatchClass = FPRAsmOperand<"FPR16">;
}
+def FPR16Op_lo : RegisterOperand<FPR16_lo, "printOperand"> {
+ let ParserMatchClass = FPRAsmOperand<"FPR16_lo">;
+}
+
def FPR32Op : RegisterOperand<FPR32, "printOperand"> {
let ParserMatchClass = FPRAsmOperand<"FPR32">;
}
@@ -664,11 +681,11 @@ def XSeqPairs : RegisterTuples<[sube64, subo64],
[(decimate (rotl GPR64, 0), 2),
(decimate (rotl GPR64, 1), 2)]>;
-def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
+def WSeqPairsClass : RegisterClass<"AArch64", [untyped], 32,
(add WSeqPairs)>{
let Size = 64;
}
-def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64,
+def XSeqPairsClass : RegisterClass<"AArch64", [untyped], 64,
(add XSeqPairs)>{
let Size = 128;
}
@@ -780,7 +797,7 @@ def Z30 : AArch64Reg<30, "z30", [Q30, Z30_HI]>, DwarfRegNum<[126]>;
def Z31 : AArch64Reg<31, "z31", [Q31, Z31_HI]>, DwarfRegNum<[127]>;
}
-// Enum descibing the element size for destructive
+// Enum describing the element size for destructive
// operations.
class ElementSizeEnum<bits<3> val> {
bits<3> Value = val;
@@ -862,6 +879,7 @@ def PPR3b64 : PPRRegOp<"d", PPRAsmOp3b64, ElementSizeD, PPR_3b>;
class ZPRClass<int lastreg> : RegisterClass<"AArch64",
[nxv16i8, nxv8i16, nxv4i32, nxv2i64,
nxv2f16, nxv4f16, nxv8f16,
+ nxv2bf16, nxv4bf16, nxv8bf16,
nxv2f32, nxv4f32,
nxv2f64],
128, (sequence "Z%u", 0, lastreg)> {
diff --git a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
index 28a7e680849b..fc31e701d3af 100644
--- a/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SIMDInstrOpt.cpp
@@ -219,7 +219,7 @@ shouldReplaceInst(MachineFunction *MF, const MCInstrDesc *InstDesc,
SmallVectorImpl<const MCInstrDesc*> &InstDescRepl) {
// Check if replacement decision is already available in the cached table.
// if so, return it.
- std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+ std::string Subtarget = std::string(SchedModel.getSubtargetInfo()->getCPU());
auto InstID = std::make_pair(InstDesc->getOpcode(), Subtarget);
if (SIMDInstrTable.find(InstID) != SIMDInstrTable.end())
return SIMDInstrTable[InstID];
@@ -288,7 +288,8 @@ bool AArch64SIMDInstrOpt::shouldExitEarly(MachineFunction *MF, Subpass SP) {
// For this optimization, check for all concerned instructions.
case Interleave:
- std::string Subtarget = SchedModel.getSubtargetInfo()->getCPU();
+ std::string Subtarget =
+ std::string(SchedModel.getSubtargetInfo()->getCPU());
if (InterlEarlyExit.find(Subtarget) != InterlEarlyExit.end())
return InterlEarlyExit[Subtarget];
diff --git a/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
new file mode 100644
index 000000000000..cb4dc8462f68
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SLSHardening.cpp
@@ -0,0 +1,443 @@
+//===- AArch64SLSHardening.cpp - Harden Straight Line Misspeculation ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass to insert code to mitigate against side channel
+// vulnerabilities that may happen under straight line miss-speculation.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64InstrInfo.h"
+#include "AArch64Subtarget.h"
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/IndirectThunks.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RegisterScavenging.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CodeGen.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "aarch64-sls-hardening"
+
+#define AARCH64_SLS_HARDENING_NAME "AArch64 sls hardening pass"
+
+namespace {
+
+class AArch64SLSHardening : public MachineFunctionPass {
+public:
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+ const AArch64Subtarget *ST;
+
+ static char ID;
+
+ AArch64SLSHardening() : MachineFunctionPass(ID) {
+ initializeAArch64SLSHardeningPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override { return AARCH64_SLS_HARDENING_NAME; }
+
+private:
+ bool hardenReturnsAndBRs(MachineBasicBlock &MBB) const;
+ bool hardenBLRs(MachineBasicBlock &MBB) const;
+ MachineBasicBlock &ConvertBLRToBL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator) const;
+};
+
+} // end anonymous namespace
+
+char AArch64SLSHardening::ID = 0;
+
+INITIALIZE_PASS(AArch64SLSHardening, "aarch64-sls-hardening",
+ AARCH64_SLS_HARDENING_NAME, false, false)
+
+static void insertSpeculationBarrier(const AArch64Subtarget *ST,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc DL,
+ bool AlwaysUseISBDSB = false) {
+ assert(MBBI != MBB.begin() &&
+ "Must not insert SpeculationBarrierEndBB as only instruction in MBB.");
+ assert(std::prev(MBBI)->isBarrier() &&
+ "SpeculationBarrierEndBB must only follow unconditional control flow "
+ "instructions.");
+ assert(std::prev(MBBI)->isTerminator() &&
+ "SpeculationBarrierEndBB must only follow terminators.");
+ const TargetInstrInfo *TII = ST->getInstrInfo();
+ unsigned BarrierOpc = ST->hasSB() && !AlwaysUseISBDSB
+ ? AArch64::SpeculationBarrierSBEndBB
+ : AArch64::SpeculationBarrierISBDSBEndBB;
+ if (MBBI == MBB.end() ||
+ (MBBI->getOpcode() != AArch64::SpeculationBarrierSBEndBB &&
+ MBBI->getOpcode() != AArch64::SpeculationBarrierISBDSBEndBB))
+ BuildMI(MBB, MBBI, DL, TII->get(BarrierOpc));
+}
+
+bool AArch64SLSHardening::runOnMachineFunction(MachineFunction &MF) {
+ ST = &MF.getSubtarget<AArch64Subtarget>();
+ TII = MF.getSubtarget().getInstrInfo();
+ TRI = MF.getSubtarget().getRegisterInfo();
+
+ bool Modified = false;
+ for (auto &MBB : MF) {
+ Modified |= hardenReturnsAndBRs(MBB);
+ Modified |= hardenBLRs(MBB);
+ }
+
+ return Modified;
+}
+
+static bool isBLR(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case AArch64::BLR:
+ case AArch64::BLRNoIP:
+ return true;
+ case AArch64::BLRAA:
+ case AArch64::BLRAB:
+ case AArch64::BLRAAZ:
+ case AArch64::BLRABZ:
+ llvm_unreachable("Currently, LLVM's code generator does not support "
+ "producing BLRA* instructions. Therefore, there's no "
+ "support in this pass for those instructions.");
+ }
+ return false;
+}
+
+bool AArch64SLSHardening::hardenReturnsAndBRs(MachineBasicBlock &MBB) const {
+ if (!ST->hardenSlsRetBr())
+ return false;
+ bool Modified = false;
+ MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator(), E = MBB.end();
+ MachineBasicBlock::iterator NextMBBI;
+ for (; MBBI != E; MBBI = NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ NextMBBI = std::next(MBBI);
+ if (MI.isReturn() || isIndirectBranchOpcode(MI.getOpcode())) {
+ assert(MI.isTerminator());
+ insertSpeculationBarrier(ST, MBB, std::next(MBBI), MI.getDebugLoc());
+ Modified = true;
+ }
+ }
+ return Modified;
+}
+
+static const char SLSBLRNamePrefix[] = "__llvm_slsblr_thunk_";
+
+static const struct ThunkNameAndReg {
+ const char* Name;
+ Register Reg;
+} SLSBLRThunks[] = {
+ { "__llvm_slsblr_thunk_x0", AArch64::X0},
+ { "__llvm_slsblr_thunk_x1", AArch64::X1},
+ { "__llvm_slsblr_thunk_x2", AArch64::X2},
+ { "__llvm_slsblr_thunk_x3", AArch64::X3},
+ { "__llvm_slsblr_thunk_x4", AArch64::X4},
+ { "__llvm_slsblr_thunk_x5", AArch64::X5},
+ { "__llvm_slsblr_thunk_x6", AArch64::X6},
+ { "__llvm_slsblr_thunk_x7", AArch64::X7},
+ { "__llvm_slsblr_thunk_x8", AArch64::X8},
+ { "__llvm_slsblr_thunk_x9", AArch64::X9},
+ { "__llvm_slsblr_thunk_x10", AArch64::X10},
+ { "__llvm_slsblr_thunk_x11", AArch64::X11},
+ { "__llvm_slsblr_thunk_x12", AArch64::X12},
+ { "__llvm_slsblr_thunk_x13", AArch64::X13},
+ { "__llvm_slsblr_thunk_x14", AArch64::X14},
+ { "__llvm_slsblr_thunk_x15", AArch64::X15},
+ // X16 and X17 are deliberately missing, as the mitigation requires those
+ // registers to not be used in BLR. See comment in ConvertBLRToBL for more
+ // details.
+ { "__llvm_slsblr_thunk_x18", AArch64::X18},
+ { "__llvm_slsblr_thunk_x19", AArch64::X19},
+ { "__llvm_slsblr_thunk_x20", AArch64::X20},
+ { "__llvm_slsblr_thunk_x21", AArch64::X21},
+ { "__llvm_slsblr_thunk_x22", AArch64::X22},
+ { "__llvm_slsblr_thunk_x23", AArch64::X23},
+ { "__llvm_slsblr_thunk_x24", AArch64::X24},
+ { "__llvm_slsblr_thunk_x25", AArch64::X25},
+ { "__llvm_slsblr_thunk_x26", AArch64::X26},
+ { "__llvm_slsblr_thunk_x27", AArch64::X27},
+ { "__llvm_slsblr_thunk_x28", AArch64::X28},
+ { "__llvm_slsblr_thunk_x29", AArch64::FP},
+ // X30 is deliberately missing, for similar reasons as X16 and X17 are
+ // missing.
+ { "__llvm_slsblr_thunk_x31", AArch64::XZR},
+};
+
+namespace {
+struct SLSBLRThunkInserter : ThunkInserter<SLSBLRThunkInserter> {
+ const char *getThunkPrefix() { return SLSBLRNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ // FIXME: This could also check if there are any BLRs in the function
+ // to more accurately reflect if a thunk will be needed.
+ return MF.getSubtarget<AArch64Subtarget>().hardenSlsBlr();
+ }
+ void insertThunks(MachineModuleInfo &MMI);
+ void populateThunk(MachineFunction &MF);
+};
+} // namespace
+
+void SLSBLRThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+ // FIXME: It probably would be possible to filter which thunks to produce
+ // based on which registers are actually used in BLR instructions in this
+ // function. But would that be a worthwhile optimization?
+ for (auto T : SLSBLRThunks)
+ createThunkFunction(MMI, T.Name);
+}
+
+void SLSBLRThunkInserter::populateThunk(MachineFunction &MF) {
+ // FIXME: How to better communicate Register number, rather than through
+ // name and lookup table?
+ assert(MF.getName().startswith(getThunkPrefix()));
+ auto ThunkIt = llvm::find_if(
+ SLSBLRThunks, [&MF](auto T) { return T.Name == MF.getName(); });
+ assert(ThunkIt != std::end(SLSBLRThunks));
+ Register ThunkReg = ThunkIt->Reg;
+
+ const TargetInstrInfo *TII =
+ MF.getSubtarget<AArch64Subtarget>().getInstrInfo();
+ assert (MF.size() == 1);
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ // These thunks need to consist of the following instructions:
+ // __llvm_slsblr_thunk_xN:
+ // BR xN
+ // barrierInsts
+ Entry->addLiveIn(ThunkReg);
+ // MOV X16, ThunkReg == ORR X16, XZR, ThunkReg, LSL #0
+ BuildMI(Entry, DebugLoc(), TII->get(AArch64::ORRXrs), AArch64::X16)
+ .addReg(AArch64::XZR)
+ .addReg(ThunkReg)
+ .addImm(0);
+ BuildMI(Entry, DebugLoc(), TII->get(AArch64::BR)).addReg(AArch64::X16);
+ // Make sure the thunks do not make use of the SB extension in case there is
+ // a function somewhere that calls them but has, for some reason, disabled
+ // the SB extension locally for that function, even though it's enabled for
+ // the module otherwise. Therefore set AlwaysUseISBDSB to true.
+ insertSpeculationBarrier(&MF.getSubtarget<AArch64Subtarget>(), *Entry,
+ Entry->end(), DebugLoc(), true /*AlwaysUseISBDSB*/);
+}
+
+MachineBasicBlock &
+AArch64SLSHardening::ConvertBLRToBL(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const {
+ // Transform a BLR to a BL as follows:
+ // Before:
+ // |-----------------------------|
+ // | ... |
+ // | instI |
+ // | BLR xN |
+ // | instJ |
+ // | ... |
+ // |-----------------------------|
+ //
+ // After:
+ // |-----------------------------|
+ // | ... |
+ // | instI |
+ // | BL __llvm_slsblr_thunk_xN |
+ // | instJ |
+ // | ... |
+ // |-----------------------------|
+ //
+ // __llvm_slsblr_thunk_xN:
+ // |-----------------------------|
+ // | BR xN |
+ // | barrierInsts |
+ // |-----------------------------|
+ //
+ // The __llvm_slsblr_thunk_xN thunks are created by the SLSBLRThunkInserter.
+ // This function merely needs to transform BLR xN into BL
+ // __llvm_slsblr_thunk_xN.
+ //
+ // Since linkers are allowed to clobber X16 and X17 on function calls, the
+ // above mitigation only works if the original BLR instruction was neither
+ // BLR X16 nor BLR X17. Earlier code generation must make sure that no BLR
+ // X16|X17 was produced if the mitigation is enabled.
+
+ MachineInstr &BLR = *MBBI;
+ assert(isBLR(BLR));
+ unsigned BLOpcode;
+ Register Reg;
+ bool RegIsKilled;
+ switch (BLR.getOpcode()) {
+ case AArch64::BLR:
+ case AArch64::BLRNoIP:
+ BLOpcode = AArch64::BL;
+ Reg = BLR.getOperand(0).getReg();
+ assert(Reg != AArch64::X16 && Reg != AArch64::X17 && Reg != AArch64::LR);
+ RegIsKilled = BLR.getOperand(0).isKill();
+ break;
+ case AArch64::BLRAA:
+ case AArch64::BLRAB:
+ case AArch64::BLRAAZ:
+ case AArch64::BLRABZ:
+ llvm_unreachable("BLRA instructions cannot yet be produced by LLVM, "
+ "therefore there is no need to support them for now.");
+ default:
+ llvm_unreachable("unhandled BLR");
+ }
+ DebugLoc DL = BLR.getDebugLoc();
+
+ // If we'd like to also support BLRAA and BLRAB instructions, we'd need
+ // a lot more different kinds of thunks.
+ // For example, a
+ //
+ // BLRAA xN, xM
+ //
+ // instruction probably would need to be transformed to something like:
+ //
+ // BL __llvm_slsblraa_thunk_x<N>_x<M>
+ //
+ // __llvm_slsblraa_thunk_x<N>_x<M>:
+ // BRAA x<N>, x<M>
+ // barrierInsts
+ //
+ // Given that about 30 different values of N are possible and about 30
+ // different values of M are possible in the above, with the current way
+ // of producing indirect thunks, we'd be producing about 30 times 30, i.e.
+ // about 900 thunks (most of which might never actually be called). This would
+ // multiply further by two to support both BLRAA and BLRAB variants of those
+ // instructions.
+ // If we wanted to support this, we'd probably need to look into a different
+ // way to produce thunk functions, based on which variants are actually
+ // needed, rather than producing all possible variants.
+ // So far, LLVM never produces BLRA* instructions, so let's leave this
+ // for the future when LLVM can start producing BLRA* instructions.
+ MachineFunction &MF = *MBBI->getMF();
+ MCContext &Context = MBB.getParent()->getContext();
+ auto ThunkIt =
+ llvm::find_if(SLSBLRThunks, [Reg](auto T) { return T.Reg == Reg; });
+ assert (ThunkIt != std::end(SLSBLRThunks));
+ MCSymbol *Sym = Context.getOrCreateSymbol(ThunkIt->Name);
+
+ MachineInstr *BL = BuildMI(MBB, MBBI, DL, TII->get(BLOpcode)).addSym(Sym);
+
+ // Now copy the implicit operands from BLR to BL and copy other necessary
+ // info.
+ // However, both BLR and BL instructions implicitly use SP and implicitly
+ // define LR. Blindly copying implicit operands would result in SP and LR
+ // operands being present multiple times. While this may not be too much of
+ // an issue, let's avoid that for cleanliness, by removing those implicit
+ // operands from the BL created above before we copy over all implicit
+ // operands from the BLR.
+ int ImpLROpIdx = -1;
+ int ImpSPOpIdx = -1;
+ for (unsigned OpIdx = BL->getNumExplicitOperands();
+ OpIdx < BL->getNumOperands(); OpIdx++) {
+ MachineOperand Op = BL->getOperand(OpIdx);
+ if (!Op.isReg())
+ continue;
+ if (Op.getReg() == AArch64::LR && Op.isDef())
+ ImpLROpIdx = OpIdx;
+ if (Op.getReg() == AArch64::SP && !Op.isDef())
+ ImpSPOpIdx = OpIdx;
+ }
+ assert(ImpLROpIdx != -1);
+ assert(ImpSPOpIdx != -1);
+ int FirstOpIdxToRemove = std::max(ImpLROpIdx, ImpSPOpIdx);
+ int SecondOpIdxToRemove = std::min(ImpLROpIdx, ImpSPOpIdx);
+ BL->RemoveOperand(FirstOpIdxToRemove);
+ BL->RemoveOperand(SecondOpIdxToRemove);
+ // Now copy over the implicit operands from the original BLR
+ BL->copyImplicitOps(MF, BLR);
+ MF.moveCallSiteInfo(&BLR, BL);
+ // Also add the register called in the BLR as being used in the called thunk.
+ BL->addOperand(MachineOperand::CreateReg(Reg, false /*isDef*/, true /*isImp*/,
+ RegIsKilled /*isKill*/));
+ // Remove BLR instruction
+ MBB.erase(MBBI);
+
+ return MBB;
+}
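The table lookup that ConvertBLRToBL and populateThunk rely on above can be illustrated in isolation. The following is a minimal stand-alone sketch, not part of the patch: the ThunkEntry struct, the thunkNameForReg helper, and the plain unsigned register numbers are hypothetical stand-ins for the SLSBLRThunks array and llvm::Register used in the real pass.

// Minimal sketch: map a register number to its SLS BLR thunk name, mirroring
// the llvm::find_if lookup over SLSBLRThunks. All names here are illustrative.
#include <algorithm>
#include <cassert>
#include <cstdio>
#include <iterator>

namespace {
struct ThunkEntry {
  const char *Name;
  unsigned Reg; // stand-in for llvm::Register
};
// X16, X17 and X30 are deliberately absent, as in the real table.
const ThunkEntry Thunks[] = {
    {"__llvm_slsblr_thunk_x0", 0},
    {"__llvm_slsblr_thunk_x1", 1},
    {"__llvm_slsblr_thunk_x2", 2},
};

const char *thunkNameForReg(unsigned Reg) {
  auto It = std::find_if(std::begin(Thunks), std::end(Thunks),
                         [Reg](const ThunkEntry &T) { return T.Reg == Reg; });
  assert(It != std::end(Thunks) && "no thunk for this register");
  return It->Name;
}
} // namespace

int main() {
  std::printf("%s\n", thunkNameForReg(2)); // prints __llvm_slsblr_thunk_x2
  return 0;
}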
+
+bool AArch64SLSHardening::hardenBLRs(MachineBasicBlock &MBB) const {
+ if (!ST->hardenSlsBlr())
+ return false;
+ bool Modified = false;
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MachineBasicBlock::iterator NextMBBI;
+ for (; MBBI != E; MBBI = NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ NextMBBI = std::next(MBBI);
+ if (isBLR(MI)) {
+ ConvertBLRToBL(MBB, MBBI);
+ Modified = true;
+ }
+ }
+ return Modified;
+}
+
+FunctionPass *llvm::createAArch64SLSHardeningPass() {
+ return new AArch64SLSHardening();
+}
+
+namespace {
+class AArch64IndirectThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64IndirectThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "AArch64 Indirect Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ std::tuple<SLSBLRThunkInserter> TIs;
+
+ // FIXME: When LLVM moves to C++17, these can become folds
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (void)std::initializer_list<int>{
+ (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+ }
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ bool Modified = false;
+ (void)std::initializer_list<int>{
+ Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+ return Modified;
+ }
+};
+
+} // end anonymous namespace
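The FIXME above notes that initTIs and runTIs could become fold expressions once LLVM moves to C++17. A rough stand-alone sketch of what that might look like follows; the Module, MachineModuleInfo and MachineFunction types here are placeholders rather than LLVM's, and only the fold technique itself is the point.

// Hypothetical C++17 fold-expression version of initTIs/runTIs.
#include <tuple>

struct Module {};
struct MachineModuleInfo {};
struct MachineFunction {};

template <typename... ThunkInserterT>
static void initTIs(Module &M, std::tuple<ThunkInserterT...> &ThunkInserters) {
  // A comma fold replaces the initializer_list expansion trick.
  (std::get<ThunkInserterT>(ThunkInserters).init(M), ...);
}

template <typename... ThunkInserterT>
static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
                   std::tuple<ThunkInserterT...> &ThunkInserters) {
  // A bitwise-or fold keeps every inserter running (no short-circuiting),
  // matching the |= accumulation in the pre-C++17 version.
  return (false | ... | std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF));
}

This only shows the fold shape the FIXME alludes to; an actual change would keep the existing names and LLVM types.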
+
+char AArch64IndirectThunks::ID = 0;
+
+FunctionPass *llvm::createAArch64IndirectThunks() {
+ return new AArch64IndirectThunks();
+}
+
+bool AArch64IndirectThunks::doInitialization(Module &M) {
+ initTIs(M, TIs);
+ return false;
+}
+
+bool AArch64IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+}
diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
index c849d7af9a40..28a54e6f7d79 100644
--- a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
+++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td
@@ -10,65 +10,188 @@
//
//===----------------------------------------------------------------------===//
-def SDT_AArch64_GLD1 : SDTypeProfile<1, 4, [
+// For predicated nodes where the entire operation is controlled by a governing
+// predicate, please stick to a similar naming convention as used for the
+// ISD nodes:
+//
+// SDNode <=> AArch64ISD
+// -------------------------------
+// _m<n> <=> _MERGE_OP<n>
+// _mt <=> _MERGE_PASSTHRU
+// _z <=> _MERGE_ZERO
+// _p <=> _PRED
+//
+// Given the context of this file, it is not strictly necessary to use _p to
+// distinguish predicated from unpredicated nodes, because most SVE
+// instructions are predicated.
+
+// Contiguous loads - node definitions
+//
+def SDT_AArch64_LD1 : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ld1_z : SDNode<"AArch64ISD::LD1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64ld1s_z : SDNode<"AArch64ISD::LD1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+
+// Non-faulting & first-faulting loads - node definitions
+//
+def AArch64ldnf1_z : SDNode<"AArch64ISD::LDNF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_z : SDNode<"AArch64ISD::LDFF1_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64ldnf1s_z : SDNode<"AArch64ISD::LDNF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_z : SDNode<"AArch64ISD::LDFF1S_MERGE_ZERO", SDT_AArch64_LD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+// Contiguous load and replicate - node definitions
+//
+
+def SDT_AArch64_LD1Replicate : SDTypeProfile<1, 2, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
+]>;
+
+def AArch64ld1rq_z : SDNode<"AArch64ISD::LD1RQ_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1ro_z : SDNode<"AArch64ISD::LD1RO_MERGE_ZERO", SDT_AArch64_LD1Replicate, [SDNPHasChain, SDNPMayLoad]>;
+
+// Gather loads - node definitions
+//
+def SDT_AArch64_GATHER_SV : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def SDT_AArch64_GLD1_IMM : SDTypeProfile<1, 4, [
+def SDT_AArch64_GATHER_VS : SDTypeProfile<1, 4, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def SDT_AArch64_SST1 : SDTypeProfile<0, 5, [
+def AArch64ld1_gather_z : SDNode<"AArch64ISD::GLD1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_scaled_z : SDNode<"AArch64ISD::GLD1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_uxtw_z : SDNode<"AArch64ISD::GLD1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_sxtw_z : SDNode<"AArch64ISD::GLD1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1_gather_imm_z : SDNode<"AArch64ISD::GLD1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+
+def AArch64ld1s_gather_z : SDNode<"AArch64ISD::GLD1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_scaled_z : SDNode<"AArch64ISD::GLD1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_uxtw_z : SDNode<"AArch64ISD::GLD1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_sxtw_z : SDNode<"AArch64ISD::GLD1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ld1s_gather_imm_z : SDNode<"AArch64ISD::GLD1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+
+def AArch64ldff1_gather_z : SDNode<"AArch64ISD::GLDFF1_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1_gather_imm_z : SDNode<"AArch64ISD::GLDFF1_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64ldff1s_gather_z : SDNode<"AArch64ISD::GLDFF1S_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_uxtw_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_sxtw_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_uxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_UXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_sxtw_scaled_z : SDNode<"AArch64ISD::GLDFF1S_SXTW_SCALED_MERGE_ZERO", SDT_AArch64_GATHER_SV, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+def AArch64ldff1s_gather_imm_z : SDNode<"AArch64ISD::GLDFF1S_IMM_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue, SDNPOutGlue]>;
+
+def AArch64ldnt1_gather_z : SDNode<"AArch64ISD::GLDNT1_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+def AArch64ldnt1s_gather_z : SDNode<"AArch64ISD::GLDNT1S_MERGE_ZERO", SDT_AArch64_GATHER_VS, [SDNPHasChain, SDNPMayLoad]>;
+
+// Contiguous stores - node definitions
+//
+def SDT_AArch64_ST1 : SDTypeProfile<0, 4, [
+ SDTCisVec<0>, SDTCisPtrTy<1>, SDTCisVec<2>,
+ SDTCVecEltisVT<2,i1>, SDTCisSameNumEltsAs<0,2>
+]>;
+
+def AArch64st1 : SDNode<"AArch64ISD::ST1_PRED", SDT_AArch64_ST1, [SDNPHasChain, SDNPMayStore]>;
+
+// Scatter stores - node definitions
+//
+def SDT_AArch64_SCATTER_SV : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisPtrTy<2>, SDTCisVec<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def SDT_AArch64_SST1_IMM : SDTypeProfile<0, 5, [
+def SDT_AArch64_SCATTER_VS : SDTypeProfile<0, 5, [
SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisInt<3>, SDTCisVT<4, OtherVT>,
SDTCVecEltisVT<1,i1>, SDTCisSameNumEltsAs<0,1>
]>;
-def AArch64st1_scatter : SDNode<"AArch64ISD::SST1", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED", SDT_AArch64_SST1, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM", SDT_AArch64_SST1_IMM, [SDNPHasChain, SDNPMayStore, SDNPOptInGlue]>;
-
-def AArch64ld1_gather : SDNode<"AArch64ISD::GLD1", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_scaled : SDNode<"AArch64ISD::GLD1_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_uxtw : SDNode<"AArch64ISD::GLD1_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_sxtw : SDNode<"AArch64ISD::GLD1_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1_gather_imm : SDNode<"AArch64ISD::GLD1_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-
-def AArch64ld1s_gather : SDNode<"AArch64ISD::GLD1S", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_scaled : SDNode<"AArch64ISD::GLD1S_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_uxtw : SDNode<"AArch64ISD::GLD1S_UXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_sxtw : SDNode<"AArch64ISD::GLD1S_SXTW", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_uxtw_scaled : SDNode<"AArch64ISD::GLD1S_UXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_sxtw_scaled : SDNode<"AArch64ISD::GLD1S_SXTW_SCALED", SDT_AArch64_GLD1, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
-def AArch64ld1s_gather_imm : SDNode<"AArch64ISD::GLD1S_IMM", SDT_AArch64_GLD1_IMM, [SDNPHasChain, SDNPMayLoad, SDNPOptInGlue]>;
+def AArch64st1_scatter : SDNode<"AArch64ISD::SST1_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_scaled : SDNode<"AArch64ISD::SST1_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_uxtw : SDNode<"AArch64ISD::SST1_UXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_sxtw : SDNode<"AArch64ISD::SST1_SXTW_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_uxtw_scaled : SDNode<"AArch64ISD::SST1_UXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_sxtw_scaled : SDNode<"AArch64ISD::SST1_SXTW_SCALED_PRED", SDT_AArch64_SCATTER_SV, [SDNPHasChain, SDNPMayStore]>;
+def AArch64st1_scatter_imm : SDNode<"AArch64ISD::SST1_IMM_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
+def AArch64stnt1_scatter : SDNode<"AArch64ISD::SSTNT1_PRED", SDT_AArch64_SCATTER_VS, [SDNPHasChain, SDNPMayStore]>;
+
+// AArch64 SVE/SVE2 - the remaining node definitions
+//
+
+// SVE CNT/INC/RDVL
+def sve_rdvl_imm : ComplexPattern<i32, 1, "SelectRDVLImm<-32, 31, 16>">;
+def sve_cnth_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 8>">;
+def sve_cntw_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 4>">;
+def sve_cntd_imm : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, 2>">;
+
+// SVE DEC
+def sve_cnth_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -8>">;
+def sve_cntw_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -4>">;
+def sve_cntd_imm_neg : ComplexPattern<i32, 1, "SelectRDVLImm<1, 16, -2>">;
def SDT_AArch64Reduce : SDTypeProfile<1, 2, [SDTCisVec<1>, SDTCisVec<2>]>;
+def AArch64faddv_p : SDNode<"AArch64ISD::FADDV_PRED", SDT_AArch64Reduce>;
+def AArch64fmaxv_p : SDNode<"AArch64ISD::FMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64fmaxnmv_p : SDNode<"AArch64ISD::FMAXNMV_PRED", SDT_AArch64Reduce>;
+def AArch64fminv_p : SDNode<"AArch64ISD::FMINV_PRED", SDT_AArch64Reduce>;
+def AArch64fminnmv_p : SDNode<"AArch64ISD::FMINNMV_PRED", SDT_AArch64Reduce>;
+def AArch64smaxv_p : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64umaxv_p : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
+def AArch64sminv_p : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
+def AArch64uminv_p : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
+def AArch64orv_p : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
+def AArch64eorv_p : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
+def AArch64andv_p : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
+def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
+def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
+
+def SDT_AArch64Arith : SDTypeProfile<1, 3, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>
+]>;
+
+def SDT_AArch64FMA : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>, SDTCisVec<4>,
+ SDTCVecEltisVT<1,i1>, SDTCisSameAs<2,3>, SDTCisSameAs<3,4>
+]>;
-def AArch64smaxv_pred : SDNode<"AArch64ISD::SMAXV_PRED", SDT_AArch64Reduce>;
-def AArch64umaxv_pred : SDNode<"AArch64ISD::UMAXV_PRED", SDT_AArch64Reduce>;
-def AArch64sminv_pred : SDNode<"AArch64ISD::SMINV_PRED", SDT_AArch64Reduce>;
-def AArch64uminv_pred : SDNode<"AArch64ISD::UMINV_PRED", SDT_AArch64Reduce>;
-def AArch64orv_pred : SDNode<"AArch64ISD::ORV_PRED", SDT_AArch64Reduce>;
-def AArch64eorv_pred : SDNode<"AArch64ISD::EORV_PRED", SDT_AArch64Reduce>;
-def AArch64andv_pred : SDNode<"AArch64ISD::ANDV_PRED", SDT_AArch64Reduce>;
-def AArch64lasta : SDNode<"AArch64ISD::LASTA", SDT_AArch64Reduce>;
-def AArch64lastb : SDNode<"AArch64ISD::LASTB", SDT_AArch64Reduce>;
+// Predicated operations with the result of inactive lanes being unspecified.
+def AArch64add_p : SDNode<"AArch64ISD::ADD_PRED", SDT_AArch64Arith>;
+def AArch64fadd_p : SDNode<"AArch64ISD::FADD_PRED", SDT_AArch64Arith>;
+def AArch64fma_p : SDNode<"AArch64ISD::FMA_PRED", SDT_AArch64FMA>;
+def AArch64sdiv_p : SDNode<"AArch64ISD::SDIV_PRED", SDT_AArch64Arith>;
+def AArch64udiv_p : SDNode<"AArch64ISD::UDIV_PRED", SDT_AArch64Arith>;
+
+// Merging op1 into the inactive lanes.
+def AArch64smin_m1 : SDNode<"AArch64ISD::SMIN_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64umin_m1 : SDNode<"AArch64ISD::UMIN_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64smax_m1 : SDNode<"AArch64ISD::SMAX_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64umax_m1 : SDNode<"AArch64ISD::UMAX_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64lsl_m1 : SDNode<"AArch64ISD::SHL_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64lsr_m1 : SDNode<"AArch64ISD::SRL_MERGE_OP1", SDT_AArch64Arith>;
+def AArch64asr_m1 : SDNode<"AArch64ISD::SRA_MERGE_OP1", SDT_AArch64Arith>;
def SDT_AArch64ReduceWithInit : SDTypeProfile<1, 3, [SDTCisVec<1>, SDTCisVec<3>]>;
def AArch64clasta_n : SDNode<"AArch64ISD::CLASTA_N", SDT_AArch64ReduceWithInit>;
def AArch64clastb_n : SDNode<"AArch64ISD::CLASTB_N", SDT_AArch64ReduceWithInit>;
+def AArch64fadda_p : SDNode<"AArch64ISD::FADDA_PRED", SDT_AArch64ReduceWithInit>;
def SDT_AArch64Rev : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
@@ -76,42 +199,57 @@ def AArch64rev : SDNode<"AArch64ISD::REV", SDT_AArch64Rev>;
def SDT_AArch64PTest : SDTypeProfile<0, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>]>;
def AArch64ptest : SDNode<"AArch64ISD::PTEST", SDT_AArch64PTest>;
-let Predicates = [HasSVE] in {
+def SDT_AArch64DUP_PRED : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 3>, SDTCisVec<1>, SDTCVecEltisVT<1,i1>]>;
+def AArch64dup_mt : SDNode<"AArch64ISD::DUP_MERGE_PASSTHRU", SDT_AArch64DUP_PRED>;
+
+def SDT_IndexVector : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<1, 2>, SDTCisInt<2>]>;
+def index_vector : SDNode<"AArch64ISD::INDEX_VECTOR", SDT_IndexVector, []>;
- def RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr">;
- def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
- def RDFFR_P : sve_int_rdffr_unpred<"rdffr">;
- def SETFFR : sve_int_setffr<"setffr">;
- def WRFFR : sve_int_wrffr<"wrffr">;
+def reinterpret_cast : SDNode<"AArch64ISD::REINTERPRET_CAST", SDTUnaryOp>;
- defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add>;
- defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub>;
- defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat>;
- defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat>;
- defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat>;
- defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat>;
+let Predicates = [HasSVE] in {
+ defm RDFFR_PPz : sve_int_rdffr_pred<0b0, "rdffr", int_aarch64_sve_rdffr_z>;
+ def RDFFRS_PPz : sve_int_rdffr_pred<0b1, "rdffrs">;
+ defm RDFFR_P : sve_int_rdffr_unpred<"rdffr", int_aarch64_sve_rdffr>;
+ def SETFFR : sve_int_setffr<"setffr", int_aarch64_sve_setffr>;
+ def WRFFR : sve_int_wrffr<"wrffr", int_aarch64_sve_wrffr>;
+
+ defm ADD_ZZZ : sve_int_bin_cons_arit_0<0b000, "add", add, null_frag>;
+ defm SUB_ZZZ : sve_int_bin_cons_arit_0<0b001, "sub", sub, null_frag>;
+ defm SQADD_ZZZ : sve_int_bin_cons_arit_0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
+ defm UQADD_ZZZ : sve_int_bin_cons_arit_0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
+ defm SQSUB_ZZZ : sve_int_bin_cons_arit_0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
+ defm UQSUB_ZZZ : sve_int_bin_cons_arit_0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm AND_ZZZ : sve_int_bin_cons_log<0b00, "and", and>;
defm ORR_ZZZ : sve_int_bin_cons_log<0b01, "orr", or>;
defm EOR_ZZZ : sve_int_bin_cons_log<0b10, "eor", xor>;
defm BIC_ZZZ : sve_int_bin_cons_log<0b11, "bic", null_frag>;
- defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", int_aarch64_sve_add>;
- defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", int_aarch64_sve_sub>;
- defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", int_aarch64_sve_subr>;
+ defm ADD_ZPmZ : sve_int_bin_pred_arit_0<0b000, "add", "ADD_ZPZZ", int_aarch64_sve_add, DestructiveBinaryComm>;
+ defm SUB_ZPmZ : sve_int_bin_pred_arit_0<0b001, "sub", "SUB_ZPZZ", int_aarch64_sve_sub, DestructiveBinaryCommWithRev, "SUBR_ZPmZ">;
+ defm SUBR_ZPmZ : sve_int_bin_pred_arit_0<0b011, "subr", "SUBR_ZPZZ", int_aarch64_sve_subr, DestructiveBinaryCommWithRev, "SUB_ZPmZ", /*isReverseInstr*/ 1>;
+
+ defm ADD_ZPZZ : sve_int_bin_pred_bhsd<AArch64add_p>;
+
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm ADD_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_add>;
+ defm SUB_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_sub>;
+ defm SUBR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<int_aarch64_sve_subr>;
+ }
defm ORR_ZPmZ : sve_int_bin_pred_log<0b000, "orr", int_aarch64_sve_orr>;
defm EOR_ZPmZ : sve_int_bin_pred_log<0b001, "eor", int_aarch64_sve_eor>;
defm AND_ZPmZ : sve_int_bin_pred_log<0b010, "and", int_aarch64_sve_and>;
defm BIC_ZPmZ : sve_int_bin_pred_log<0b011, "bic", int_aarch64_sve_bic>;
- defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add>;
- defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub>;
+ defm ADD_ZI : sve_int_arith_imm0<0b000, "add", add, null_frag>;
+ defm SUB_ZI : sve_int_arith_imm0<0b001, "sub", sub, null_frag>;
defm SUBR_ZI : sve_int_arith_imm0_subr<0b011, "subr", sub>;
- defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat>;
- defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat>;
- defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat>;
- defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat>;
+ defm SQADD_ZI : sve_int_arith_imm0<0b100, "sqadd", saddsat, int_aarch64_sve_sqadd_x>;
+ defm UQADD_ZI : sve_int_arith_imm0<0b101, "uqadd", uaddsat, int_aarch64_sve_uqadd_x>;
+ defm SQSUB_ZI : sve_int_arith_imm0<0b110, "sqsub", ssubsat, int_aarch64_sve_sqsub_x>;
+ defm UQSUB_ZI : sve_int_arith_imm0<0b111, "uqsub", usubsat, int_aarch64_sve_uqsub_x>;
defm MAD_ZPmZZ : sve_int_mladdsub_vvv_pred<0b0, "mad", int_aarch64_sve_mad>;
defm MSB_ZPmZZ : sve_int_mladdsub_vvv_pred<0b1, "msb", int_aarch64_sve_msb>;
@@ -121,32 +259,45 @@ let Predicates = [HasSVE] in {
// SVE predicated integer reductions.
defm SADDV_VPZ : sve_int_reduce_0_saddv<0b000, "saddv", int_aarch64_sve_saddv>;
defm UADDV_VPZ : sve_int_reduce_0_uaddv<0b001, "uaddv", int_aarch64_sve_uaddv, int_aarch64_sve_saddv>;
- defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_pred>;
- defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_pred>;
- defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_pred>;
- defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_pred>;
- defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_pred>;
- defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_pred>;
- defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_pred>;
+ defm SMAXV_VPZ : sve_int_reduce_1<0b000, "smaxv", AArch64smaxv_p>;
+ defm UMAXV_VPZ : sve_int_reduce_1<0b001, "umaxv", AArch64umaxv_p>;
+ defm SMINV_VPZ : sve_int_reduce_1<0b010, "sminv", AArch64sminv_p>;
+ defm UMINV_VPZ : sve_int_reduce_1<0b011, "uminv", AArch64uminv_p>;
+ defm ORV_VPZ : sve_int_reduce_2<0b000, "orv", AArch64orv_p>;
+ defm EORV_VPZ : sve_int_reduce_2<0b001, "eorv", AArch64eorv_p>;
+ defm ANDV_VPZ : sve_int_reduce_2<0b010, "andv", AArch64andv_p>;
defm ORR_ZI : sve_int_log_imm<0b00, "orr", "orn", or>;
defm EOR_ZI : sve_int_log_imm<0b01, "eor", "eon", xor>;
defm AND_ZI : sve_int_log_imm<0b10, "and", "bic", and>;
- defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", smax>;
- defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", smin>;
- defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", umax>;
- defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", umin>;
+ defm SMAX_ZI : sve_int_arith_imm1<0b00, "smax", AArch64smax_m1>;
+ defm SMIN_ZI : sve_int_arith_imm1<0b10, "smin", AArch64smin_m1>;
+ defm UMAX_ZI : sve_int_arith_imm1_unsigned<0b01, "umax", AArch64umax_m1>;
+ defm UMIN_ZI : sve_int_arith_imm1_unsigned<0b11, "umin", AArch64umin_m1>;
- defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
- defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
+ defm MUL_ZI : sve_int_arith_imm2<"mul", mul>;
+ defm MUL_ZPmZ : sve_int_bin_pred_arit_2<0b000, "mul", int_aarch64_sve_mul>;
defm SMULH_ZPmZ : sve_int_bin_pred_arit_2<0b010, "smulh", int_aarch64_sve_smulh>;
defm UMULH_ZPmZ : sve_int_bin_pred_arit_2<0b011, "umulh", int_aarch64_sve_umulh>;
- defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", int_aarch64_sve_sdiv>;
- defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", int_aarch64_sve_udiv>;
- defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", int_aarch64_sve_sdivr>;
- defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", int_aarch64_sve_udivr>;
+ // Add an unpredicated alternative for the mul instruction.
+ def : Pat<(mul nxv16i8:$Op1, nxv16i8:$Op2),
+ (MUL_ZPmZ_B (PTRUE_B 31), $Op1, $Op2)>;
+ def : Pat<(mul nxv8i16:$Op1, nxv8i16:$Op2),
+ (MUL_ZPmZ_H (PTRUE_H 31), $Op1, $Op2)>;
+ def : Pat<(mul nxv4i32:$Op1, nxv4i32:$Op2),
+ (MUL_ZPmZ_S (PTRUE_S 31), $Op1, $Op2)>;
+ def : Pat<(mul nxv2i64:$Op1, nxv2i64:$Op2),
+ (MUL_ZPmZ_D (PTRUE_D 31), $Op1, $Op2)>;
+
+ defm SDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b100, "sdiv", "SDIV_ZPZZ", int_aarch64_sve_sdiv, DestructiveBinaryCommWithRev, "SDIVR_ZPmZ">;
+ defm UDIV_ZPmZ : sve_int_bin_pred_arit_2_div<0b101, "udiv", "UDIV_ZPZZ", int_aarch64_sve_udiv, DestructiveBinaryCommWithRev, "UDIVR_ZPmZ">;
+ defm SDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b110, "sdivr", "SDIVR_ZPZZ", int_aarch64_sve_sdivr, DestructiveBinaryCommWithRev, "SDIV_ZPmZ", /*isReverseInstr*/ 1>;
+ defm UDIVR_ZPmZ : sve_int_bin_pred_arit_2_div<0b111, "udivr", "UDIVR_ZPZZ", int_aarch64_sve_udivr, DestructiveBinaryCommWithRev, "UDIV_ZPmZ", /*isReverseInstr*/ 1>;
+
+ defm SDIV_ZPZZ : sve_int_bin_pred_sd<AArch64sdiv_p>;
+ defm UDIV_ZPZZ : sve_int_bin_pred_sd<AArch64udiv_p>;
defm SDOT_ZZZ : sve_intx_dot<0b0, "sdot", int_aarch64_sve_sdot>;
defm UDOT_ZZZ : sve_intx_dot<0b1, "udot", int_aarch64_sve_udot>;
@@ -166,15 +317,20 @@ let Predicates = [HasSVE] in {
defm CLS_ZPmZ : sve_int_un_pred_arit_1< 0b000, "cls", int_aarch64_sve_cls>;
defm CLZ_ZPmZ : sve_int_un_pred_arit_1< 0b001, "clz", int_aarch64_sve_clz>;
defm CNT_ZPmZ : sve_int_un_pred_arit_1< 0b010, "cnt", int_aarch64_sve_cnt>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<nxv8i16, int_aarch64_sve_cnt, nxv8i16, nxv8i1, nxv8bf16, !cast<Instruction>(CNT_ZPmZ_H)>;
+ }
+
defm CNOT_ZPmZ : sve_int_un_pred_arit_1< 0b011, "cnot", int_aarch64_sve_cnot>;
defm NOT_ZPmZ : sve_int_un_pred_arit_1< 0b110, "not", int_aarch64_sve_not>;
defm FABS_ZPmZ : sve_int_un_pred_arit_1_fp<0b100, "fabs", int_aarch64_sve_fabs>;
defm FNEG_ZPmZ : sve_int_un_pred_arit_1_fp<0b101, "fneg", int_aarch64_sve_fneg>;
- defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", int_aarch64_sve_smax>;
- defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", int_aarch64_sve_umax>;
- defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", int_aarch64_sve_smin>;
- defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", int_aarch64_sve_umin>;
+ defm SMAX_ZPmZ : sve_int_bin_pred_arit_1<0b000, "smax", AArch64smax_m1>;
+ defm UMAX_ZPmZ : sve_int_bin_pred_arit_1<0b001, "umax", AArch64umax_m1>;
+ defm SMIN_ZPmZ : sve_int_bin_pred_arit_1<0b010, "smin", AArch64smin_m1>;
+ defm UMIN_ZPmZ : sve_int_bin_pred_arit_1<0b011, "umin", AArch64umin_m1>;
defm SABD_ZPmZ : sve_int_bin_pred_arit_1<0b100, "sabd", int_aarch64_sve_sabd>;
defm UABD_ZPmZ : sve_int_bin_pred_arit_1<0b101, "uabd", int_aarch64_sve_uabd>;
@@ -190,19 +346,36 @@ let Predicates = [HasSVE] in {
defm FMAX_ZPmI : sve_fp_2op_i_p_zds<0b110, "fmax", sve_fpimm_zero_one>;
defm FMIN_ZPmI : sve_fp_2op_i_p_zds<0b111, "fmin", sve_fpimm_zero_one>;
- defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", int_aarch64_sve_fadd>;
- defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", int_aarch64_sve_fsub>;
- defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", int_aarch64_sve_fmul>;
- defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", int_aarch64_sve_fsubr>;
- defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", int_aarch64_sve_fmaxnm>;
- defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", int_aarch64_sve_fminnm>;
- defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", int_aarch64_sve_fmax>;
- defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", int_aarch64_sve_fmin>;
- defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", int_aarch64_sve_fabd>;
+ defm FADD_ZPmZ : sve_fp_2op_p_zds<0b0000, "fadd", "FADD_ZPZZ", int_aarch64_sve_fadd, DestructiveBinaryComm>;
+ defm FSUB_ZPmZ : sve_fp_2op_p_zds<0b0001, "fsub", "FSUB_ZPZZ", int_aarch64_sve_fsub, DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
+ defm FMUL_ZPmZ : sve_fp_2op_p_zds<0b0010, "fmul", "FMUL_ZPZZ", int_aarch64_sve_fmul, DestructiveBinaryComm>;
+ defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPZZ", int_aarch64_sve_fsubr, DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr*/ 1>;
+ defm FMAXNM_ZPmZ : sve_fp_2op_p_zds<0b0100, "fmaxnm", "FMAXNM_ZPZZ", int_aarch64_sve_fmaxnm, DestructiveBinaryComm>;
+ defm FMINNM_ZPmZ : sve_fp_2op_p_zds<0b0101, "fminnm", "FMINNM_ZPZZ", int_aarch64_sve_fminnm, DestructiveBinaryComm>;
+ defm FMAX_ZPmZ : sve_fp_2op_p_zds<0b0110, "fmax", "FMAX_ZPZZ", int_aarch64_sve_fmax, DestructiveBinaryComm>;
+ defm FMIN_ZPmZ : sve_fp_2op_p_zds<0b0111, "fmin", "FMIN_ZPZZ", int_aarch64_sve_fmin, DestructiveBinaryComm>;
+ defm FABD_ZPmZ : sve_fp_2op_p_zds<0b1000, "fabd", "FABD_ZPZZ", int_aarch64_sve_fabd, DestructiveBinaryComm>;
defm FSCALE_ZPmZ : sve_fp_2op_p_zds_fscale<0b1001, "fscale", int_aarch64_sve_fscale>;
- defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", int_aarch64_sve_fmulx>;
- defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", int_aarch64_sve_fdivr>;
- defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", int_aarch64_sve_fdiv>;
+ defm FMULX_ZPmZ : sve_fp_2op_p_zds<0b1010, "fmulx", "FMULX_ZPZZ", int_aarch64_sve_fmulx, DestructiveBinaryComm>;
+ defm FDIVR_ZPmZ : sve_fp_2op_p_zds<0b1100, "fdivr", "FDIVR_ZPZZ", int_aarch64_sve_fdivr, DestructiveBinaryCommWithRev, "FDIV_ZPmZ", /*isReverseInstr*/ 1>;
+ defm FDIV_ZPmZ : sve_fp_2op_p_zds<0b1101, "fdiv", "FDIV_ZPZZ", int_aarch64_sve_fdiv, DestructiveBinaryCommWithRev, "FDIVR_ZPmZ">;
+
+ defm FADD_ZPZZ : sve_fp_bin_pred_hfd<AArch64fadd_p>;
+
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm FADD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fadd>;
+ defm FSUB_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsub>;
+ defm FMUL_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmul>;
+ defm FSUBR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fsubr>;
+ defm FMAXNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmaxnm>;
+ defm FMINNM_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fminnm>;
+ defm FMAX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmax>;
+ defm FMIN_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmin>;
+ defm FABD_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fabd>;
+ defm FMULX_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fmulx>;
+ defm FDIVR_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdivr>;
+ defm FDIV_ZPZZ : sve_fp_2op_p_zds_zeroing_hsd<int_aarch64_sve_fdiv>;
+ }
defm FADD_ZZZ : sve_fp_3op_u_zd<0b000, "fadd", fadd>;
defm FSUB_ZZZ : sve_fp_3op_u_zd<0b001, "fsub", fsub>;
@@ -226,6 +399,16 @@ let Predicates = [HasSVE] in {
defm FNMAD_ZPmZZ : sve_fp_3op_p_zds_b<0b10, "fnmad", int_aarch64_sve_fnmad>;
defm FNMSB_ZPmZZ : sve_fp_3op_p_zds_b<0b11, "fnmsb", int_aarch64_sve_fnmsb>;
+ // Add patterns for FMA where disabled lanes are undef.
+ // FIXME: Implement a pseudo so we can choose a better instruction after
+ // regalloc.
+ def : Pat<(nxv8f16 (AArch64fma_p nxv8i1:$P, nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3)),
+ (FMLA_ZPmZZ_H $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv4f32 (AArch64fma_p nxv4i1:$P, nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3)),
+ (FMLA_ZPmZZ_S $P, $Op3, $Op1, $Op2)>;
+ def : Pat<(nxv2f64 (AArch64fma_p nxv2i1:$P, nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3)),
+ (FMLA_ZPmZZ_D $P, $Op3, $Op1, $Op2)>;
+
defm FTMAD_ZZI : sve_fp_ftmad<"ftmad", int_aarch64_sve_ftmad_x>;
defm FMLA_ZZZI : sve_fp_fma_by_indexed_elem<0b0, "fmla", int_aarch64_sve_fmla_lane>;
@@ -235,12 +418,21 @@ let Predicates = [HasSVE] in {
defm FMUL_ZZZI : sve_fp_fmul_by_indexed_elem<"fmul", int_aarch64_sve_fmul_lane>;
// SVE floating point reductions.
- defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", int_aarch64_sve_fadda>;
- defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", int_aarch64_sve_faddv>;
- defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", int_aarch64_sve_fmaxnmv>;
- defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", int_aarch64_sve_fminnmv>;
- defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", int_aarch64_sve_fmaxv>;
- defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", int_aarch64_sve_fminv>;
+ defm FADDA_VPZ : sve_fp_2op_p_vd<0b000, "fadda", AArch64fadda_p>;
+ defm FADDV_VPZ : sve_fp_fast_red<0b000, "faddv", AArch64faddv_p>;
+ defm FMAXNMV_VPZ : sve_fp_fast_red<0b100, "fmaxnmv", AArch64fmaxnmv_p>;
+ defm FMINNMV_VPZ : sve_fp_fast_red<0b101, "fminnmv", AArch64fminnmv_p>;
+ defm FMAXV_VPZ : sve_fp_fast_red<0b110, "fmaxv", AArch64fmaxv_p>;
+ defm FMINV_VPZ : sve_fp_fast_red<0b111, "fminv", AArch64fminv_p>;
+
+ // Use more efficient NEON instructions to extract elements within the NEON
+ // part (first 128 bits) of an SVE register.
+ def : Pat<(vector_extract (nxv8f16 ZPR:$Zs), (i64 0)),
+ (f16 (EXTRACT_SUBREG (v8f16 (EXTRACT_SUBREG ZPR:$Zs, zsub)), hsub))>;
+ def : Pat<(vector_extract (nxv4f32 ZPR:$Zs), (i64 0)),
+ (f32 (EXTRACT_SUBREG (v4f32 (EXTRACT_SUBREG ZPR:$Zs, zsub)), ssub))>;
+ def : Pat<(vector_extract (nxv2f64 ZPR:$Zs), (i64 0)),
+ (f64 (EXTRACT_SUBREG (v2f64 (EXTRACT_SUBREG ZPR:$Zs, zsub)), dsub))>;
// Splat immediate (unpredicated)
defm DUP_ZI : sve_int_dup_imm<"dup">;
@@ -257,18 +449,88 @@ let Predicates = [HasSVE] in {
defm DUP_ZZI : sve_int_perm_dup_i<"dup">;
// Splat scalar register (predicated)
- defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy">;
- defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy">;
+ defm CPY_ZPmR : sve_int_perm_cpy_r<"cpy", AArch64dup_mt>;
+ defm CPY_ZPmV : sve_int_perm_cpy_v<"cpy", AArch64dup_mt>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (AArch64dup_mt nxv8i1:$pg, bf16:$splat, nxv8bf16:$passthru)),
+ (CPY_ZPmV_H $passthru, $pg, $splat)>;
+ }
+
+ // Duplicate FP scalar into all vector elements
+ def : Pat<(nxv8f16 (AArch64dup (f16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv4f16 (AArch64dup (f16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv2f16 (AArch64dup (f16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 FPR32:$src))),
+ (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
+ def : Pat<(nxv2f32 (AArch64dup (f32 FPR32:$src))),
+ (DUP_ZZI_S (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), 0)>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 FPR64:$src))),
+ (DUP_ZZI_D (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), 0)>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 FPR16:$src))),
+ (DUP_ZZI_H (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), 0)>;
+ }
+
+ // Duplicate +0.0 into all vector elements
+ def : Pat<(nxv8f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv2f16 (AArch64dup (f16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ def : Pat<(nxv4f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+ def : Pat<(nxv2f32 (AArch64dup (f32 fpimm0))), (DUP_ZI_S 0, 0)>;
+ def : Pat<(nxv2f64 (AArch64dup (f64 fpimm0))), (DUP_ZI_D 0, 0)>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (AArch64dup (bf16 fpimm0))), (DUP_ZI_H 0, 0)>;
+ }
+
+ // Duplicate Int immediate into all vector elements
+ def : Pat<(nxv16i8 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_B $a, $b)>;
+ def : Pat<(nxv8i16 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_H $a, $b)>;
+ def : Pat<(nxv4i32 (AArch64dup (i32 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_S $a, $b)>;
+ def : Pat<(nxv2i64 (AArch64dup (i64 (SVE8BitLslImm i32:$a, i32:$b)))),
+ (DUP_ZI_D $a, $b)>;
+
+ // Duplicate FP immediate into all vector elements
+ let AddedComplexity = 2 in {
+ def : Pat<(nxv8f16 (AArch64dup fpimm16:$imm8)),
+ (FDUP_ZI_H fpimm16:$imm8)>;
+ def : Pat<(nxv4f16 (AArch64dup fpimm16:$imm8)),
+ (FDUP_ZI_H fpimm16:$imm8)>;
+ def : Pat<(nxv2f16 (AArch64dup fpimm16:$imm8)),
+ (FDUP_ZI_H fpimm16:$imm8)>;
+ def : Pat<(nxv4f32 (AArch64dup fpimm32:$imm8)),
+ (FDUP_ZI_S fpimm32:$imm8)>;
+ def : Pat<(nxv2f32 (AArch64dup fpimm32:$imm8)),
+ (FDUP_ZI_S fpimm32:$imm8)>;
+ def : Pat<(nxv2f64 (AArch64dup fpimm64:$imm8)),
+ (FDUP_ZI_D fpimm64:$imm8)>;
+ }
// Select elements from either vector (predicated)
defm SEL_ZPZZ : sve_int_sel_vvv<"sel", vselect>;
defm SPLICE_ZPZ : sve_int_perm_splice<"splice", int_aarch64_sve_splice>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<nxv8bf16, vselect, nxv8i1, nxv8bf16, nxv8bf16, SEL_ZPZZ_H>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_splice, nxv8i1, nxv8bf16, nxv8bf16, SPLICE_ZPZ_H>;
+ }
+
defm COMPACT_ZPZ : sve_int_perm_compact<"compact", int_aarch64_sve_compact>;
defm INSR_ZR : sve_int_perm_insrs<"insr", AArch64insr>;
defm INSR_ZV : sve_int_perm_insrv<"insr", AArch64insr>;
defm EXT_ZZI : sve_int_perm_extract_i<"ext", AArch64ext>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64insr, nxv8bf16, bf16, INSR_ZV_H>;
+ }
+
defm RBIT_ZPmZ : sve_int_perm_rev_rbit<"rbit", int_aarch64_sve_rbit>;
defm REVB_ZPmZ : sve_int_perm_rev_revb<"revb", int_aarch64_sve_revb, bswap>;
defm REVH_ZPmZ : sve_int_perm_rev_revh<"revh", int_aarch64_sve_revh>;
@@ -277,6 +539,10 @@ let Predicates = [HasSVE] in {
defm REV_PP : sve_int_perm_reverse_p<"rev", AArch64rev>;
defm REV_ZZ : sve_int_perm_reverse_z<"rev", AArch64rev>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_1_Op_Pat<nxv8bf16, AArch64rev, nxv8bf16, REV_ZZ_H>;
+ }
+
defm SUNPKLO_ZZ : sve_int_perm_unpk<0b00, "sunpklo", AArch64sunpklo>;
defm SUNPKHI_ZZ : sve_int_perm_unpk<0b01, "sunpkhi", AArch64sunpkhi>;
defm UUNPKLO_ZZ : sve_int_perm_unpk<0b10, "uunpklo", AArch64uunpklo>;
@@ -290,34 +556,34 @@ let Predicates = [HasSVE] in {
def MOVPRFX_ZZ : sve_int_bin_cons_misc_0_c<0b00000001, "movprfx", ZPRAny>;
defm FEXPA_ZZ : sve_int_bin_cons_misc_0_c_fexpa<"fexpa", int_aarch64_sve_fexpa_x>;
- def BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa">;
- def BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas">;
- def BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb">;
- def BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs">;
+ defm BRKPA_PPzPP : sve_int_brkp<0b00, "brkpa", int_aarch64_sve_brkpa_z>;
+ defm BRKPAS_PPzPP : sve_int_brkp<0b10, "brkpas", null_frag>;
+ defm BRKPB_PPzPP : sve_int_brkp<0b01, "brkpb", int_aarch64_sve_brkpb_z>;
+ defm BRKPBS_PPzPP : sve_int_brkp<0b11, "brkpbs", null_frag>;
- def BRKN_PPzP : sve_int_brkn<0b0, "brkn">;
- def BRKNS_PPzP : sve_int_brkn<0b1, "brkns">;
+ defm BRKN_PPzP : sve_int_brkn<0b0, "brkn", int_aarch64_sve_brkn_z>;
+ defm BRKNS_PPzP : sve_int_brkn<0b1, "brkns", null_frag>;
- defm BRKA_PPzP : sve_int_break_z<0b000, "brka">;
- defm BRKA_PPmP : sve_int_break_m<0b001, "brka">;
- defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas">;
- defm BRKB_PPzP : sve_int_break_z<0b100, "brkb">;
- defm BRKB_PPmP : sve_int_break_m<0b101, "brkb">;
- defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs">;
+ defm BRKA_PPzP : sve_int_break_z<0b000, "brka", int_aarch64_sve_brka_z>;
+ defm BRKA_PPmP : sve_int_break_m<0b001, "brka", int_aarch64_sve_brka>;
+ defm BRKAS_PPzP : sve_int_break_z<0b010, "brkas", null_frag>;
+ defm BRKB_PPzP : sve_int_break_z<0b100, "brkb", int_aarch64_sve_brkb_z>;
+ defm BRKB_PPmP : sve_int_break_m<0b101, "brkb", int_aarch64_sve_brkb>;
+ defm BRKBS_PPzP : sve_int_break_z<0b110, "brkbs", null_frag>;
def PTEST_PP : sve_int_ptest<0b010000, "ptest">;
def PFALSE : sve_int_pfalse<0b000000, "pfalse">;
defm PFIRST : sve_int_pfirst<0b00000, "pfirst", int_aarch64_sve_pfirst>;
defm PNEXT : sve_int_pnext<0b00110, "pnext", int_aarch64_sve_pnext>;
- defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z>;
+ defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
defm BIC_PPzPP : sve_int_pred_log<0b0001, "bic", int_aarch64_sve_bic_z>;
- defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z>;
+ defm EOR_PPzPP : sve_int_pred_log<0b0010, "eor", int_aarch64_sve_eor_z, xor>;
defm SEL_PPPP : sve_int_pred_log<0b0011, "sel", vselect>;
defm ANDS_PPzPP : sve_int_pred_log<0b0100, "ands", null_frag>;
defm BICS_PPzPP : sve_int_pred_log<0b0101, "bics", null_frag>;
defm EORS_PPzPP : sve_int_pred_log<0b0110, "eors", null_frag>;
- defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z>;
+ defm ORR_PPzPP : sve_int_pred_log<0b1000, "orr", int_aarch64_sve_orr_z, or>;
defm ORN_PPzPP : sve_int_pred_log<0b1001, "orn", int_aarch64_sve_orn_z>;
defm NOR_PPzPP : sve_int_pred_log<0b1010, "nor", int_aarch64_sve_nor_z>;
defm NAND_PPzPP : sve_int_pred_log<0b1011, "nand", int_aarch64_sve_nand_z>;
@@ -333,11 +599,23 @@ let Predicates = [HasSVE] in {
defm CLASTA_ZPZ : sve_int_perm_clast_zz<0, "clasta", int_aarch64_sve_clasta>;
defm CLASTB_ZPZ : sve_int_perm_clast_zz<1, "clastb", int_aarch64_sve_clastb>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<bf16, AArch64clasta_n, nxv8i1, bf16, nxv8bf16, CLASTA_VPZ_H>;
+ def : SVE_3_Op_Pat<bf16, AArch64clastb_n, nxv8i1, bf16, nxv8bf16, CLASTB_VPZ_H>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clasta, nxv8i1, nxv8bf16, nxv8bf16, CLASTA_ZPZ_H>;
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_clastb, nxv8i1, nxv8bf16, nxv8bf16, CLASTB_ZPZ_H>;
+ }
+
defm LASTA_RPZ : sve_int_perm_last_r<0, "lasta", AArch64lasta>;
defm LASTB_RPZ : sve_int_perm_last_r<1, "lastb", AArch64lastb>;
defm LASTA_VPZ : sve_int_perm_last_v<0, "lasta", AArch64lasta>;
defm LASTB_VPZ : sve_int_perm_last_v<1, "lastb", AArch64lastb>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<bf16, AArch64lasta, nxv8i1, nxv8bf16, LASTA_VPZ_H>;
+ def : SVE_2_Op_Pat<bf16, AArch64lastb, nxv8i1, nxv8bf16, LASTB_VPZ_H>;
+ }
+
+ // contiguous load with reg+immediate
defm LD1B_IMM : sve_mem_cld_si<0b0000, "ld1b", Z_b, ZPR8>;
defm LD1B_H_IMM : sve_mem_cld_si<0b0001, "ld1b", Z_h, ZPR16>;
@@ -468,115 +746,115 @@ let Predicates = [HasSVE] in {
// Gathers using unscaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw]
- defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
- defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLD1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR32ExtSXTW8, ZPR32ExtUXTW8, nxv4i32>;
// Gathers using scaled 32-bit offsets, e.g.
// ld1h z0.s, p0/z, [x0, z0.s, uxtw #1]
- defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLD1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW16, ZPR32ExtUXTW16, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR32ExtSXTW32, ZPR32ExtUXTW32, nxv4i32>;
// Gathers using 32-bit pointers with scaled offset, e.g.
// ld1h z0.s, p0/z, [z0.s, #16]
- defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv4i8>;
- defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv4i8>;
- defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv4i8>;
- defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv4i8>;
- defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv4i16>;
- defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv4i16>;
- defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv4i16>;
- defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv4i16>;
- defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv4i32>;
- defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv4i32>;
+ defm GLD1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv4i8>;
+ defm GLDFF1SB_S : sve_mem_32b_gld_vi_32_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv4i8>;
+ defm GLD1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv4i8>;
+ defm GLDFF1B_S : sve_mem_32b_gld_vi_32_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv4i8>;
+ defm GLD1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv4i16>;
+ defm GLDFF1SH_S : sve_mem_32b_gld_vi_32_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv4i16>;
+ defm GLD1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv4i16>;
+ defm GLDFF1H_S : sve_mem_32b_gld_vi_32_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv4i16>;
+ defm GLD1W : sve_mem_32b_gld_vi_32_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv4i32>;
+ defm GLDFF1W : sve_mem_32b_gld_vi_32_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv4i32>;
// Gathers using 64-bit pointers with scaled offset, e.g.
// ld1h z0.d, p0/z, [z0.d, #16]
- defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, null_frag, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, null_frag, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, null_frag, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, null_frag, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, null_frag, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, null_frag, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, null_frag, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0000, "ld1sb", imm0_31, AArch64ld1s_gather_imm_z, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vi_64_ptrs<0b0001, "ldff1sb", imm0_31, AArch64ldff1s_gather_imm_z, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0010, "ld1b", imm0_31, AArch64ld1_gather_imm_z, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vi_64_ptrs<0b0011, "ldff1b", imm0_31, AArch64ldff1_gather_imm_z, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0100, "ld1sh", uimm5s2, AArch64ld1s_gather_imm_z, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vi_64_ptrs<0b0101, "ldff1sh", uimm5s2, AArch64ldff1s_gather_imm_z, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0110, "ld1h", uimm5s2, AArch64ld1_gather_imm_z, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vi_64_ptrs<0b0111, "ldff1h", uimm5s2, AArch64ldff1_gather_imm_z, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1000, "ld1sw", uimm5s4, AArch64ld1s_gather_imm_z, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vi_64_ptrs<0b1001, "ldff1sw", uimm5s4, AArch64ldff1s_gather_imm_z, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1010, "ld1w", uimm5s4, AArch64ld1_gather_imm_z, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vi_64_ptrs<0b1011, "ldff1w", uimm5s4, AArch64ldff1_gather_imm_z, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vi_64_ptrs<0b1110, "ld1d", uimm5s8, AArch64ld1_gather_imm_z, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vi_64_ptrs<0b1111, "ldff1d", uimm5s8, AArch64ldff1_gather_imm_z, nxv2i64>;
// Gathers using unscaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d]
- defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", null_frag, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", null_frag, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", null_frag, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", null_frag, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", null_frag, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", null_frag, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", null_frag, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_z, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs2_64_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_z, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0010, "ld1b", AArch64ld1_gather_z, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs2_64_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_z, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_z, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs2_64_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_z, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0110, "ld1h", AArch64ld1_gather_z, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs2_64_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_z, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_z, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs2_64_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_z, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1010, "ld1w", AArch64ld1_gather_z, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs2_64_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_z, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs2_64_unscaled<0b1110, "ld1d", AArch64ld1_gather_z, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs2_64_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_z, nxv2i64>;
// Gathers using scaled 64-bit offsets, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, lsl #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", null_frag, ZPR64ExtLSL16, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled, ZPR64ExtLSL16, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", null_frag, ZPR64ExtLSL16, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", null_frag, ZPR64ExtLSL32, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled, ZPR64ExtLSL32, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", null_frag, ZPR64ExtLSL32, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled, ZPR64ExtLSL64, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", null_frag, ZPR64ExtLSL64, nxv2i64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0100, "ld1sh", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv2_64_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0110, "ld1h", AArch64ld1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv2_64_scaled<0b0111, "ldff1h", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1000, "ld1sw", AArch64ld1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv2_64_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1010, "ld1w", AArch64ld1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv2_64_scaled<0b1011, "ldff1w", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv2_64_scaled<0b1110, "ld1d", AArch64ld1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv2_64_scaled<0b1111, "ldff1d", AArch64ldff1_gather_scaled_z, ZPR64ExtLSL64, nxv2i64>;
// Gathers using unscaled 32-bit offsets unpacked in 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw]
- defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", null_frag, null_frag, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
- defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw, AArch64ld1s_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw, AArch64ld1_gather_uxtw, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+ defm GLD1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0000, "ld1sb", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1SB_D : sve_mem_64b_gld_vs_32_unscaled<0b0001, "ldff1sb", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLD1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0010, "ld1b", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLDFF1B_D : sve_mem_64b_gld_vs_32_unscaled<0b0011, "ldff1b", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, nxv2i8>;
+ defm GLD1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_vs_32_unscaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_vs_32_unscaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_z, AArch64ld1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_vs_32_unscaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_z, AArch64ldff1s_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_vs_32_unscaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_vs_32_unscaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_z, AArch64ld1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_vs_32_unscaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_z, AArch64ldff1_gather_uxtw_z, ZPR64ExtSXTW8, ZPR64ExtUXTW8, nxv2i64>;
// Gathers using scaled 32-bit offsets unpacked in 64-bit elements, e.g.
// ld1h z0.d, p0/z, [x0, z0.d, uxtw #1]
- defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", null_frag, null_frag, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
- defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled, AArch64ld1s_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", null_frag, null_frag, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
- defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled, AArch64ld1_gather_uxtw_scaled, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
- defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", null_frag, null_frag, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLD1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0100, "ld1sh", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1SH_D : sve_mem_64b_gld_sv_32_scaled<0b0101, "ldff1sh", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1H_D : sve_mem_64b_gld_sv_32_scaled<0b0110, "ld1h", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLDFF1H_D : sve_mem_64b_gld_sv_32_scaled<0b0111, "ldff1h", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW16, ZPR64ExtUXTW16, nxv2i16>;
+ defm GLD1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1000, "ld1sw", AArch64ld1s_gather_sxtw_scaled_z, AArch64ld1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1SW_D : sve_mem_64b_gld_sv_32_scaled<0b1001, "ldff1sw", AArch64ldff1s_gather_sxtw_scaled_z, AArch64ldff1s_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1W_D : sve_mem_64b_gld_sv_32_scaled<0b1010, "ld1w", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLDFF1W_D : sve_mem_64b_gld_sv_32_scaled<0b1011, "ldff1w", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW32, ZPR64ExtUXTW32, nxv2i32>;
+ defm GLD1D : sve_mem_64b_gld_sv_32_scaled<0b1110, "ld1d", AArch64ld1_gather_sxtw_scaled_z, AArch64ld1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
+ defm GLDFF1D : sve_mem_64b_gld_sv_32_scaled<0b1111, "ldff1d", AArch64ldff1_gather_sxtw_scaled_z, AArch64ldff1_gather_uxtw_scaled_z, ZPR64ExtSXTW64, ZPR64ExtUXTW64, nxv2i64>;
// Non-temporal contiguous loads (register + immediate)
defm LDNT1B_ZRI : sve_mem_cldnt_si<0b00, "ldnt1b", Z_b, ZPR8>;
@@ -640,16 +918,16 @@ let Predicates = [HasSVE] in {
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.s, p0, [z0.s, #16]
- defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", timm0_31, AArch64st1_scatter_imm, nxv4i8>;
- defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv4i16>;
- defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv4i32>;
+ defm SST1B_S : sve_mem_32b_sst_vi_ptrs<0b001, "st1b", imm0_31, AArch64st1_scatter_imm, nxv4i8>;
+ defm SST1H_S : sve_mem_32b_sst_vi_ptrs<0b011, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv4i16>;
+ defm SST1W : sve_mem_32b_sst_vi_ptrs<0b101, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv4i32>;
// Scatters using 32/64-bit pointers with offset, e.g.
// st1h z0.d, p0, [z0.d, #16]
- defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", timm0_31, AArch64st1_scatter_imm, nxv2i8>;
- defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", tuimm5s2, AArch64st1_scatter_imm, nxv2i16>;
- defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", tuimm5s4, AArch64st1_scatter_imm, nxv2i32>;
- defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", tuimm5s8, AArch64st1_scatter_imm, nxv2i64>;
+ defm SST1B_D : sve_mem_64b_sst_vi_ptrs<0b000, "st1b", imm0_31, AArch64st1_scatter_imm, nxv2i8>;
+ defm SST1H_D : sve_mem_64b_sst_vi_ptrs<0b010, "st1h", uimm5s2, AArch64st1_scatter_imm, nxv2i16>;
+ defm SST1W_D : sve_mem_64b_sst_vi_ptrs<0b100, "st1w", uimm5s4, AArch64st1_scatter_imm, nxv2i32>;
+ defm SST1D : sve_mem_64b_sst_vi_ptrs<0b110, "st1d", uimm5s8, AArch64st1_scatter_imm, nxv2i64>;
// Scatters using unscaled 64-bit offsets, e.g.
// st1h z0.d, p0, [x0, z0.d]
@@ -722,47 +1000,92 @@ let Predicates = [HasSVE] in {
def PRFS_PRR : sve_mem_prfm_ss<0b101, "prfw", GPR64NoXZRshifted32>;
def PRFD_PRR : sve_mem_prfm_ss<0b111, "prfd", GPR64NoXZRshifted64>;
+multiclass sve_prefetch<SDPatternOperator prefetch, ValueType PredTy, Instruction RegImmInst, Instruction RegRegInst, int scale, ComplexPattern AddrCP> {
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(prefetch (PredTy PPR_3b:$gp), (am_sve_indexed_s6 GPR64sp:$base, simm6s1:$offset), (i32 sve_prfop:$prfop)),
+ (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, simm6s1:$offset)>;
+ }
+
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(prefetch (PredTy PPR_3b:$gp), (AddrCP GPR64sp:$base, GPR64:$index), (i32 sve_prfop:$prfop)),
+ (RegRegInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, GPR64:$index)>;
+ }
+
+ // default fallback
+ def _default : Pat<(prefetch (PredTy PPR_3b:$gp), GPR64:$base, (i32 sve_prfop:$prfop)),
+ (RegImmInst sve_prfop:$prfop, PPR_3b:$gp, GPR64:$base, (i64 0))>;
+ }
+
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv16i1, PRFB_PRI, PRFB_PRR, 0, am_sve_regreg_lsl0>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv8i1, PRFH_PRI, PRFH_PRR, 1, am_sve_regreg_lsl1>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv4i1, PRFW_PRI, PRFS_PRR, 2, am_sve_regreg_lsl2>;
+ defm : sve_prefetch<int_aarch64_sve_prf, nxv2i1, PRFD_PRI, PRFD_PRR, 3, am_sve_regreg_lsl3>;
+
// Gather prefetch using scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.s, uxtw #1]
- defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only>;
- defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16>;
- defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32>;
- defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64>;
+ defm PRFB_S : sve_mem_32b_prfm_sv_scaled<0b00, "prfb", ZPR32ExtSXTW8Only, ZPR32ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
+ defm PRFH_S : sve_mem_32b_prfm_sv_scaled<0b01, "prfh", ZPR32ExtSXTW16, ZPR32ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
+ defm PRFW_S : sve_mem_32b_prfm_sv_scaled<0b10, "prfw", ZPR32ExtSXTW32, ZPR32ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
+ defm PRFD_S : sve_mem_32b_prfm_sv_scaled<0b11, "prfd", ZPR32ExtSXTW64, ZPR32ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using unpacked, scaled 32-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, uxtw #1]
- defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only>;
- defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16>;
- defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32>;
- defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64>;
+ defm PRFB_D : sve_mem_64b_prfm_sv_ext_scaled<0b00, "prfb", ZPR64ExtSXTW8Only, ZPR64ExtUXTW8Only, int_aarch64_sve_prfb_gather_sxtw_index, int_aarch64_sve_prfb_gather_uxtw_index>;
+ defm PRFH_D : sve_mem_64b_prfm_sv_ext_scaled<0b01, "prfh", ZPR64ExtSXTW16, ZPR64ExtUXTW16, int_aarch64_sve_prfh_gather_sxtw_index, int_aarch64_sve_prfh_gather_uxtw_index>;
+ defm PRFW_D : sve_mem_64b_prfm_sv_ext_scaled<0b10, "prfw", ZPR64ExtSXTW32, ZPR64ExtUXTW32, int_aarch64_sve_prfw_gather_sxtw_index, int_aarch64_sve_prfw_gather_uxtw_index>;
+ defm PRFD_D : sve_mem_64b_prfm_sv_ext_scaled<0b11, "prfd", ZPR64ExtSXTW64, ZPR64ExtUXTW64, int_aarch64_sve_prfd_gather_sxtw_index, int_aarch64_sve_prfd_gather_uxtw_index>;
// Gather prefetch using scaled 64-bit offsets, e.g.
// prfh pldl1keep, p0, [x0, z0.d, lsl #1]
- defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8>;
- defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16>;
- defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32>;
- defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64>;
+ defm PRFB_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b00, "prfb", ZPR64ExtLSL8, int_aarch64_sve_prfb_gather_index>;
+ defm PRFH_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b01, "prfh", ZPR64ExtLSL16, int_aarch64_sve_prfh_gather_index>;
+ defm PRFW_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b10, "prfw", ZPR64ExtLSL32, int_aarch64_sve_prfw_gather_index>;
+ defm PRFD_D_SCALED : sve_mem_64b_prfm_sv_lsl_scaled<0b11, "prfd", ZPR64ExtLSL64, int_aarch64_sve_prfd_gather_index>;
// Gather prefetch using 32/64-bit pointers with offset, e.g.
// prfh pldl1keep, p0, [z0.s, #16]
// prfh pldl1keep, p0, [z0.d, #16]
- defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31>;
- defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2>;
- defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4>;
- defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8>;
+ defm PRFB_S_PZI : sve_mem_32b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
+ defm PRFH_S_PZI : sve_mem_32b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
+ defm PRFW_S_PZI : sve_mem_32b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
+ defm PRFD_S_PZI : sve_mem_32b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
- defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31>;
- defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2>;
- defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4>;
- defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8>;
+ defm PRFB_D_PZI : sve_mem_64b_prfm_vi<0b00, "prfb", imm0_31, int_aarch64_sve_prfb_gather_scalar_offset>;
+ defm PRFH_D_PZI : sve_mem_64b_prfm_vi<0b01, "prfh", uimm5s2, int_aarch64_sve_prfh_gather_scalar_offset>;
+ defm PRFW_D_PZI : sve_mem_64b_prfm_vi<0b10, "prfw", uimm5s4, int_aarch64_sve_prfw_gather_scalar_offset>;
+ defm PRFD_D_PZI : sve_mem_64b_prfm_vi<0b11, "prfd", uimm5s8, int_aarch64_sve_prfd_gather_scalar_offset>;
defm ADR_SXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_sxtw<0b00, "adr">;
defm ADR_UXTW_ZZZ_D : sve_int_bin_cons_misc_0_a_uxtw<0b01, "adr">;
defm ADR_LSL_ZZZ_S : sve_int_bin_cons_misc_0_a_32_lsl<0b10, "adr">;
defm ADR_LSL_ZZZ_D : sve_int_bin_cons_misc_0_a_64_lsl<0b11, "adr">;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrb nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_0 $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrh nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_1 $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrw nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_2 $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_adrd nxv4i32:$Op1, nxv4i32:$Op2)),
+ (ADR_LSL_ZZZ_S_3 $Op1, $Op2)>;
+
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrb nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_0 $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrh nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_1 $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrw nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_2 $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_adrd nxv2i64:$Op1, nxv2i64:$Op2)),
+ (ADR_LSL_ZZZ_D_3 $Op1, $Op2)>;
+
defm TBL_ZZZ : sve_int_perm_tbl<"tbl", AArch64tbl>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64tbl, nxv8bf16, nxv8i16, TBL_ZZZ_H>;
+ }
+
defm ZIP1_ZZZ : sve_int_perm_bin_perm_zz<0b000, "zip1", AArch64zip1>;
defm ZIP2_ZZZ : sve_int_perm_bin_perm_zz<0b001, "zip2", AArch64zip2>;
defm UZP1_ZZZ : sve_int_perm_bin_perm_zz<0b010, "uzp1", AArch64uzp1>;
@@ -770,6 +1093,15 @@ let Predicates = [HasSVE] in {
defm TRN1_ZZZ : sve_int_perm_bin_perm_zz<0b100, "trn1", AArch64trn1>;
defm TRN2_ZZZ : sve_int_perm_bin_perm_zz<0b101, "trn2", AArch64trn2>;
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64zip1, nxv8bf16, nxv8bf16, ZIP1_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64zip2, nxv8bf16, nxv8bf16, ZIP2_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp1, nxv8bf16, nxv8bf16, UZP1_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64uzp2, nxv8bf16, nxv8bf16, UZP2_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64trn1, nxv8bf16, nxv8bf16, TRN1_ZZZ_H>;
+ def : SVE_2_Op_Pat<nxv8bf16, AArch64trn2, nxv8bf16, nxv8bf16, TRN2_ZZZ_H>;
+ }
+
defm ZIP1_PPP : sve_int_perm_bin_perm_pp<0b000, "zip1", AArch64zip1>;
defm ZIP2_PPP : sve_int_perm_bin_perm_pp<0b001, "zip2", AArch64zip2>;
defm UZP1_PPP : sve_int_perm_bin_perm_pp<0b010, "uzp1", AArch64uzp1>;
@@ -777,12 +1109,12 @@ let Predicates = [HasSVE] in {
defm TRN1_PPP : sve_int_perm_bin_perm_pp<0b100, "trn1", AArch64trn1>;
defm TRN2_PPP : sve_int_perm_bin_perm_pp<0b101, "trn2", AArch64trn2>;
- defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", int_aarch64_sve_cmphs, SETUGE>;
- defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", int_aarch64_sve_cmphi, SETUGT>;
- defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", int_aarch64_sve_cmpge, SETGE>;
- defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", int_aarch64_sve_cmpgt, SETGT>;
- defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", int_aarch64_sve_cmpeq, SETEQ>;
- defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", int_aarch64_sve_cmpne, SETNE>;
+ defm CMPHS_PPzZZ : sve_int_cmp_0<0b000, "cmphs", SETUGE, SETULE>;
+ defm CMPHI_PPzZZ : sve_int_cmp_0<0b001, "cmphi", SETUGT, SETULT>;
+ defm CMPGE_PPzZZ : sve_int_cmp_0<0b100, "cmpge", SETGE, SETLE>;
+ defm CMPGT_PPzZZ : sve_int_cmp_0<0b101, "cmpgt", SETGT, SETLT>;
+ defm CMPEQ_PPzZZ : sve_int_cmp_0<0b110, "cmpeq", SETEQ, SETEQ>;
+ defm CMPNE_PPzZZ : sve_int_cmp_0<0b111, "cmpne", SETNE, SETNE>;
defm CMPEQ_WIDE_PPzZZ : sve_int_cmp_0_wide<0b010, "cmpeq", int_aarch64_sve_cmpeq_wide>;
defm CMPNE_WIDE_PPzZZ : sve_int_cmp_0_wide<0b011, "cmpne", int_aarch64_sve_cmpne_wide>;
@@ -795,22 +1127,22 @@ let Predicates = [HasSVE] in {
defm CMPLO_WIDE_PPzZZ : sve_int_cmp_1_wide<0b110, "cmplo", int_aarch64_sve_cmplo_wide>;
defm CMPLS_WIDE_PPzZZ : sve_int_cmp_1_wide<0b111, "cmpls", int_aarch64_sve_cmpls_wide>;
- defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, int_aarch64_sve_cmpge>;
- defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, int_aarch64_sve_cmpgt>;
- defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, null_frag, int_aarch64_sve_cmpgt>;
- defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, null_frag, int_aarch64_sve_cmpge>;
- defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, int_aarch64_sve_cmpeq>;
- defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, int_aarch64_sve_cmpne>;
- defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, int_aarch64_sve_cmphs>;
- defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, int_aarch64_sve_cmphi>;
- defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, null_frag, int_aarch64_sve_cmphi>;
- defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, null_frag, int_aarch64_sve_cmphs>;
-
- defm FCMGE_PPzZZ : sve_fp_3op_p_pd<0b000, "fcmge", int_aarch64_sve_fcmpge>;
- defm FCMGT_PPzZZ : sve_fp_3op_p_pd<0b001, "fcmgt", int_aarch64_sve_fcmpgt>;
- defm FCMEQ_PPzZZ : sve_fp_3op_p_pd<0b010, "fcmeq", int_aarch64_sve_fcmpeq>;
- defm FCMNE_PPzZZ : sve_fp_3op_p_pd<0b011, "fcmne", int_aarch64_sve_fcmpne>;
- defm FCMUO_PPzZZ : sve_fp_3op_p_pd<0b100, "fcmuo", int_aarch64_sve_fcmpuo>;
+ defm CMPGE_PPzZI : sve_int_scmp_vi<0b000, "cmpge", SETGE, SETLE>;
+ defm CMPGT_PPzZI : sve_int_scmp_vi<0b001, "cmpgt", SETGT, SETLT>;
+ defm CMPLT_PPzZI : sve_int_scmp_vi<0b010, "cmplt", SETLT, SETGT>;
+ defm CMPLE_PPzZI : sve_int_scmp_vi<0b011, "cmple", SETLE, SETGE>;
+ defm CMPEQ_PPzZI : sve_int_scmp_vi<0b100, "cmpeq", SETEQ, SETEQ>;
+ defm CMPNE_PPzZI : sve_int_scmp_vi<0b101, "cmpne", SETNE, SETEQ>;
+ defm CMPHS_PPzZI : sve_int_ucmp_vi<0b00, "cmphs", SETUGE, SETULE>;
+ defm CMPHI_PPzZI : sve_int_ucmp_vi<0b01, "cmphi", SETUGT, SETULT>;
+ defm CMPLO_PPzZI : sve_int_ucmp_vi<0b10, "cmplo", SETULT, SETUGT>;
+ defm CMPLS_PPzZI : sve_int_ucmp_vi<0b11, "cmpls", SETULE, SETUGE>;
+
+ defm FCMGE_PPzZZ : sve_fp_3op_p_pd_cc<0b000, "fcmge", int_aarch64_sve_fcmpge, setoge>;
+ defm FCMGT_PPzZZ : sve_fp_3op_p_pd_cc<0b001, "fcmgt", int_aarch64_sve_fcmpgt, setogt>;
+ defm FCMEQ_PPzZZ : sve_fp_3op_p_pd_cc<0b010, "fcmeq", int_aarch64_sve_fcmpeq, setoeq>;
+ defm FCMNE_PPzZZ : sve_fp_3op_p_pd_cc<0b011, "fcmne", int_aarch64_sve_fcmpne, setone>;
+ defm FCMUO_PPzZZ : sve_fp_3op_p_pd_cc<0b100, "fcmuo", int_aarch64_sve_fcmpuo, setuo>;
defm FACGE_PPzZZ : sve_fp_3op_p_pd<0b101, "facge", int_aarch64_sve_facge>;
defm FACGT_PPzZZ : sve_fp_3op_p_pd<0b111, "facgt", int_aarch64_sve_facgt>;
@@ -928,71 +1260,78 @@ let Predicates = [HasSVE] in {
defm INCP_ZP : sve_int_count_v<0b10000, "incp">;
defm DECP_ZP : sve_int_count_v<0b10100, "decp">;
- defm INDEX_RR : sve_int_index_rr<"index">;
- defm INDEX_IR : sve_int_index_ir<"index">;
- defm INDEX_RI : sve_int_index_ri<"index">;
- defm INDEX_II : sve_int_index_ii<"index">;
+ defm INDEX_RR : sve_int_index_rr<"index", index_vector>;
+ defm INDEX_IR : sve_int_index_ir<"index", index_vector>;
+ defm INDEX_RI : sve_int_index_ri<"index", index_vector>;
+ defm INDEX_II : sve_int_index_ii<"index", index_vector>;
// Unpredicated shifts
- defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr">;
- defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr">;
- defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl">;
+ defm ASR_ZZI : sve_int_bin_cons_shift_imm_right<0b00, "asr", AArch64asr_m1>;
+ defm LSR_ZZI : sve_int_bin_cons_shift_imm_right<0b01, "lsr", AArch64lsr_m1>;
+ defm LSL_ZZI : sve_int_bin_cons_shift_imm_left< 0b11, "lsl", AArch64lsl_m1>;
defm ASR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b00, "asr">;
defm LSR_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b01, "lsr">;
defm LSL_WIDE_ZZZ : sve_int_bin_cons_shift_wide<0b11, "lsl">;
// Predicated shifts
- defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr">;
- defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr">;
+ defm ASR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0000, "asr", "ASR_ZPZI">;
+ defm LSR_ZPmI : sve_int_bin_pred_shift_imm_right<0b0001, "lsr", "LSR_ZPZI">;
defm LSL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0011, "lsl">;
- defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", int_aarch64_sve_asrd>;
+ defm ASRD_ZPmI : sve_int_bin_pred_shift_imm_right<0b0100, "asrd", "ASRD_ZPZI", int_aarch64_sve_asrd>;
+
+ let Predicates = [HasSVE, UseExperimentalZeroingPseudos] in {
+ defm ASR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64asr_m1>;
+ defm LSR_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsr_m1>;
+ defm LSL_ZPZZ : sve_int_bin_pred_zeroing_bhsd<AArch64lsl_m1>;
+ defm ASRD_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_asrd>;
+ }
- defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", int_aarch64_sve_asr>;
- defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", int_aarch64_sve_lsr>;
- defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", int_aarch64_sve_lsl>;
- defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", null_frag>;
- defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", null_frag>;
- defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", null_frag>;
+ defm ASR_ZPmZ : sve_int_bin_pred_shift<0b000, "asr", "ASR_ZPZZ", AArch64asr_m1, "ASRR_ZPmZ">;
+ defm LSR_ZPmZ : sve_int_bin_pred_shift<0b001, "lsr", "LSR_ZPZZ", AArch64lsr_m1, "LSRR_ZPmZ">;
+ defm LSL_ZPmZ : sve_int_bin_pred_shift<0b011, "lsl", "LSL_ZPZZ", AArch64lsl_m1, "LSLR_ZPmZ">;
+ defm ASRR_ZPmZ : sve_int_bin_pred_shift<0b100, "asrr", "ASRR_ZPZZ", null_frag, "ASR_ZPmZ", /*isReverseInstr*/ 1>;
+ defm LSRR_ZPmZ : sve_int_bin_pred_shift<0b101, "lsrr", "LSRR_ZPZZ", null_frag, "LSR_ZPmZ", /*isReverseInstr*/ 1>;
+ defm LSLR_ZPmZ : sve_int_bin_pred_shift<0b111, "lslr", "LSLR_ZPZZ", null_frag, "LSL_ZPmZ", /*isReverseInstr*/ 1>;
defm ASR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b000, "asr", int_aarch64_sve_asr_wide>;
defm LSR_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b001, "lsr", int_aarch64_sve_lsr_wide>;
defm LSL_WIDE_ZPmZ : sve_int_bin_pred_shift_wide<0b011, "lsl", int_aarch64_sve_lsl_wide>;
- defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv16i1, nxv4f32, ElementSizeS>;
- defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv16i1, nxv8f16, ElementSizeS>;
- defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
- defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
- defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
- defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
- defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
- defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
- defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
- defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv16i1, nxv8f16, ElementSizeD>;
- defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv16i1, nxv4f32, ElementSizeD>;
- defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>;
- defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv16i1, nxv4i32, ElementSizeD>;
- defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>;
- defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>;
- defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv16i1, nxv4i32, ElementSizeS>;
- defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv16i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv16i1, nxv2i64, ElementSizeD>;
- defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
- defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
- defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv16i1, nxv2f64, ElementSizeD>;
- defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>;
- defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>;
- defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>;
- defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv16i1, nxv8f16, ElementSizeS>;
- defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv16i1, nxv8f16, ElementSizeD>;
- defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv16i1, nxv4f32, ElementSizeD>;
- defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
- defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_StoH : sve_fp_2op_p_zd<0b1001000, "fcvt", ZPR32, ZPR16, int_aarch64_sve_fcvt_f16f32, nxv8f16, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_HtoS : sve_fp_2op_p_zd<0b1001001, "fcvt", ZPR16, ZPR32, int_aarch64_sve_fcvt_f32f16, nxv4f32, nxv4i1, nxv8f16, ElementSizeS>;
+ defm SCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110010, "scvtf", ZPR16, ZPR16, int_aarch64_sve_scvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm SCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010100, "scvtf", ZPR32, ZPR32, int_aarch64_sve_scvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_StoS : sve_fp_2op_p_zd<0b1010101, "ucvtf", ZPR32, ZPR32, int_aarch64_sve_ucvtf, nxv4f32, nxv4i1, nxv4i32, ElementSizeS>;
+ defm UCVTF_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0110011, "ucvtf", ZPR16, ZPR16, int_aarch64_sve_ucvtf, nxv8f16, nxv8i1, nxv8i16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111010, "fcvtzs", ZPR16, ZPR16, int_aarch64_sve_fcvtzs, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZS_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011100, "fcvtzs", ZPR32, ZPR32, int_aarch64_sve_fcvtzs, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoH : sve_fp_2op_p_zd<0b0111011, "fcvtzu", ZPR16, ZPR16, int_aarch64_sve_fcvtzu, nxv8i16, nxv8i1, nxv8f16, ElementSizeH>;
+ defm FCVTZU_ZPmZ_StoS : sve_fp_2op_p_zd<0b1011101, "fcvtzu", ZPR32, ZPR32, int_aarch64_sve_fcvtzu, nxv4i32, nxv4i1, nxv4f32, ElementSizeS>;
+ defm FCVT_ZPmZ_DtoH : sve_fp_2op_p_zd<0b1101000, "fcvt", ZPR64, ZPR16, int_aarch64_sve_fcvt_f16f64, nxv8f16, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_HtoD : sve_fp_2op_p_zd<0b1101001, "fcvt", ZPR16, ZPR64, int_aarch64_sve_fcvt_f64f16, nxv2f64, nxv2i1, nxv8f16, ElementSizeD>;
+ defm FCVT_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1101010, "fcvt", ZPR64, ZPR32, int_aarch64_sve_fcvt_f32f64, nxv4f32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVT_ZPmZ_StoD : sve_fp_2op_p_zd<0b1101011, "fcvt", ZPR32, ZPR64, int_aarch64_sve_fcvt_f64f32, nxv2f64, nxv2i1, nxv4f32, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110000, "scvtf", ZPR32, ZPR64, int_aarch64_sve_scvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoD : sve_fp_2op_p_zd<0b1110001, "ucvtf", ZPR32, ZPR64, int_aarch64_sve_ucvtf_f64i32, nxv2f64, nxv2i1, nxv4i32, ElementSizeD>;
+ defm UCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110101, "ucvtf", ZPR32, ZPR16, int_aarch64_sve_ucvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110100, "scvtf", ZPR64, ZPR32, int_aarch64_sve_scvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_StoH : sve_fp_2op_p_zd<0b0110100, "scvtf", ZPR32, ZPR16, int_aarch64_sve_scvtf_f16i32, nxv8f16, nxv4i1, nxv4i32, ElementSizeS>;
+ defm SCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110110, "scvtf", ZPR64, ZPR16, int_aarch64_sve_scvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1110101, "ucvtf", ZPR64, ZPR32, int_aarch64_sve_ucvtf_f32i64, nxv4f32, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoH : sve_fp_2op_p_zd<0b0110111, "ucvtf", ZPR64, ZPR16, int_aarch64_sve_ucvtf_f16i64, nxv8f16, nxv2i1, nxv2i64, ElementSizeD>;
+ defm SCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110110, "scvtf", ZPR64, ZPR64, int_aarch64_sve_scvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm UCVTF_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1110111, "ucvtf", ZPR64, ZPR64, int_aarch64_sve_ucvtf, nxv2f64, nxv2i1, nxv2i64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111000, "fcvtzs", ZPR64, ZPR32, int_aarch64_sve_fcvtzs_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoS : sve_fp_2op_p_zd<0b1111001, "fcvtzu", ZPR64, ZPR32, int_aarch64_sve_fcvtzu_i32f64, nxv4i32, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZS_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111100, "fcvtzs", ZPR32, ZPR64, int_aarch64_sve_fcvtzs_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111100, "fcvtzs", ZPR16, ZPR32, int_aarch64_sve_fcvtzs_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
+ defm FCVTZS_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111110, "fcvtzs", ZPR16, ZPR64, int_aarch64_sve_fcvtzs_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_HtoS : sve_fp_2op_p_zd<0b0111101, "fcvtzu", ZPR16, ZPR32, int_aarch64_sve_fcvtzu_i32f16, nxv4i32, nxv4i1, nxv8f16, ElementSizeS>;
+ defm FCVTZU_ZPmZ_HtoD : sve_fp_2op_p_zd<0b0111111, "fcvtzu", ZPR16, ZPR64, int_aarch64_sve_fcvtzu_i64f16, nxv2i64, nxv2i1, nxv8f16, ElementSizeD>;
+ defm FCVTZU_ZPmZ_StoD : sve_fp_2op_p_zd<0b1111101, "fcvtzu", ZPR32, ZPR64, int_aarch64_sve_fcvtzu_i64f32, nxv2i64, nxv2i1, nxv4f32, ElementSizeD>;
+ defm FCVTZS_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111110, "fcvtzs", ZPR64, ZPR64, int_aarch64_sve_fcvtzs, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
+ defm FCVTZU_ZPmZ_DtoD : sve_fp_2op_p_zd<0b1111111, "fcvtzu", ZPR64, ZPR64, int_aarch64_sve_fcvtzu, nxv2i64, nxv2i1, nxv2f64, ElementSizeD>;
defm FRINTN_ZPmZ : sve_fp_2op_p_zd_HSD<0b00000, "frintn", int_aarch64_sve_frintn>;
defm FRINTP_ZPmZ : sve_fp_2op_p_zd_HSD<0b00001, "frintp", int_aarch64_sve_frintp>;
@@ -1004,6 +1343,18 @@ let Predicates = [HasSVE] in {
defm FRECPX_ZPmZ : sve_fp_2op_p_zd_HSD<0b01100, "frecpx", int_aarch64_sve_frecpx>;
defm FSQRT_ZPmZ : sve_fp_2op_p_zd_HSD<0b01101, "fsqrt", int_aarch64_sve_fsqrt>;
+ let Predicates = [HasBF16, HasSVE] in {
+ defm BFDOT_ZZZ : sve_bfloat_dot<"bfdot", int_aarch64_sve_bfdot>;
+ defm BFDOT_ZZI : sve_bfloat_dot_indexed<"bfdot", int_aarch64_sve_bfdot_lane>;
+ defm BFMMLA_ZZZ : sve_bfloat_matmul<"bfmmla", int_aarch64_sve_bfmmla>;
+ defm BFMMLA_B_ZZZ : sve_bfloat_matmul_longvecl<0b0, "bfmlalb", int_aarch64_sve_bfmlalb>;
+ defm BFMMLA_T_ZZZ : sve_bfloat_matmul_longvecl<0b1, "bfmlalt", int_aarch64_sve_bfmlalt>;
+ defm BFMMLA_B_ZZI : sve_bfloat_matmul_longvecl_idx<0b0, "bfmlalb", int_aarch64_sve_bfmlalb_lane>;
+ defm BFMMLA_T_ZZI : sve_bfloat_matmul_longvecl_idx<0b1, "bfmlalt", int_aarch64_sve_bfmlalt_lane>;
+ defm BFCVT_ZPmZ : sve_bfloat_convert<0b1, "bfcvt", int_aarch64_sve_fcvt_bf16f32>;
+ defm BFCVTNT_ZPmZ : sve_bfloat_convert<0b0, "bfcvtnt", int_aarch64_sve_fcvtnt_bf16f32>;
+ }
+
// InstAliases
def : InstAlias<"mov $Zd, $Zn",
(ORR_ZZZ ZPR64:$Zd, ZPR64:$Zn, ZPR64:$Zn), 1>;
@@ -1089,6 +1440,20 @@ let Predicates = [HasSVE] in {
def : InstAlias<"fcmlt $Zd, $Pg/z, $Zm, $Zn",
(FCMGT_PPzZZ_D PPR64:$Zd, PPR3bAny:$Pg, ZPR64:$Zn, ZPR64:$Zm), 0>;
+ // Pseudo instructions representing unpredicated LDR and STR for ZPR2,3,4.
+ // These get expanded to individual LDR_ZXI/STR_ZXI instructions in
+ // AArch64ExpandPseudoInsts.
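+  // They are mainly used for spills and fills of Z-register tuples, and expand
+  // into one LDR_ZXI/STR_ZXI per sub-register at consecutive immediate offsets.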
+ let mayLoad = 1, hasSideEffects = 0 in {
+ def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ }
+ let mayStore = 1, hasSideEffects = 0 in {
+ def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>;
+ }
+
def : Pat<(AArch64ptest (nxv16i1 PPR:$pg), (nxv16i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
def : Pat<(AArch64ptest (nxv8i1 PPR:$pg), (nxv8i1 PPR:$src)),
@@ -1098,6 +1463,25 @@ let Predicates = [HasSVE] in {
def : Pat<(AArch64ptest (nxv2i1 PPR:$pg), (nxv2i1 PPR:$src)),
(PTEST_PP PPR:$pg, PPR:$src)>;
+  // LD1RQ of 128-bit masked data
+ def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_B_IMM $gp, $base, (i64 0))>;
+ def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_H_IMM $gp, $base, (i64 0))>;
+ def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_W_IMM $gp, $base, (i64 0))>;
+ def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, GPR64:$base)),
+ (LD1RQ_D_IMM $gp, $base, (i64 0))>;
+
+ def : Pat<(nxv16i8 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_B_IMM $gp, $base, simm4s16:$imm)>;
+ def : Pat<(nxv8i16 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_H_IMM $gp, $base, simm4s16:$imm)>;
+ def : Pat<(nxv4i32 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_W_IMM $gp, $base, simm4s16:$imm)>;
+ def : Pat<(nxv2i64 (AArch64ld1rq_z PPR:$gp, (add GPR64:$base, (i64 simm4s16:$imm)))),
+ (LD1RQ_D_IMM $gp, $base, simm4s16:$imm)>;
+
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i32), (SXTW_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i16), (SXTH_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv2i64 ZPR:$Zs), nxv2i8), (SXTB_ZPmZ_D (IMPLICIT_DEF), (PTRUE_D 31), ZPR:$Zs)>;
@@ -1105,346 +1489,899 @@ let Predicates = [HasSVE] in {
def : Pat<(sext_inreg (nxv4i32 ZPR:$Zs), nxv4i8), (SXTB_ZPmZ_S (IMPLICIT_DEF), (PTRUE_S 31), ZPR:$Zs)>;
def : Pat<(sext_inreg (nxv8i16 ZPR:$Zs), nxv8i8), (SXTB_ZPmZ_H (IMPLICIT_DEF), (PTRUE_H 31), ZPR:$Zs)>;
- def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
- def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
-
- def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
- def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
-
- def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
- def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
-
- def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
- def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
-
- def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
- def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
-
- def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
- def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
-
- def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
- def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ // General case that we ideally never want to match.
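+  // RDVL #1 returns the vector length in bytes (16 * vscale); the UBFM below is
+  // an LSR #4 that recovers vscale, which MADD then multiplies by the requested scale.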
+ def : Pat<(vscale GPR64:$scale), (MADDXrrr (UBFMXri (RDVLI_XI 1), 4, 63), $scale, XZR)>;
+
+ let AddedComplexity = 5 in {
+ def : Pat<(vscale (i64 1)), (UBFMXri (RDVLI_XI 1), 4, 63)>;
+ def : Pat<(vscale (i64 -1)), (SBFMXri (RDVLI_XI -1), 4, 63)>;
+
+ def : Pat<(vscale (sve_rdvl_imm i32:$imm)), (RDVLI_XI $imm)>;
+ def : Pat<(vscale (sve_cnth_imm i32:$imm)), (CNTH_XPiI 31, $imm)>;
+ def : Pat<(vscale (sve_cntw_imm i32:$imm)), (CNTW_XPiI 31, $imm)>;
+ def : Pat<(vscale (sve_cntd_imm i32:$imm)), (CNTD_XPiI 31, $imm)>;
+
+ def : Pat<(vscale (sve_cnth_imm_neg i32:$imm)), (SUBXrs XZR, (CNTH_XPiI 31, $imm), 0)>;
+ def : Pat<(vscale (sve_cntw_imm_neg i32:$imm)), (SUBXrs XZR, (CNTW_XPiI 31, $imm), 0)>;
+ def : Pat<(vscale (sve_cntd_imm_neg i32:$imm)), (SUBXrs XZR, (CNTD_XPiI 31, $imm), 0)>;
+ }
+
+ // FIXME: BigEndian requires an additional REV instruction to satisfy the
+ // constraint that none of the bits change when stored to memory as one
+  // type, and reloaded as another type.
+ let Predicates = [IsLE] in {
+ def : Pat<(nxv16i8 (bitconvert (nxv8i16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4i32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2i64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv8f16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv4f32 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv16i8 (bitconvert (nxv2f64 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+
+ def : Pat<(nxv8i16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+
+ def : Pat<(nxv4i32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv4f32 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+
+ def : Pat<(nxv2i64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv2f64 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+
+ def : Pat<(nxv8f16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+
+ def : Pat<(nxv4f32 (bitconvert (nxv16i8 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8i16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv4i32 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2i64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8f16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv2f64 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+
+ def : Pat<(nxv2f64 (bitconvert (nxv16i8 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8i16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4i32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv2i64 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8f16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv4f32 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+
+ }
+
+ let Predicates = [IsLE, HasBF16, HasSVE] in {
+ def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ }
+
+ let Predicates = [IsLE, HasSVE, HasBF16] in {
+ def : Pat<(nxv8bf16 (bitconvert (nxv16i8 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv8i16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv4i32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv2i64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv8f16 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv4f32 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+ def : Pat<(nxv8bf16 (bitconvert (nxv2f64 ZPR:$src))), (nxv8bf16 ZPR:$src)>;
+
+ def : Pat<(nxv16i8 (bitconvert (nxv8bf16 ZPR:$src))), (nxv16i8 ZPR:$src)>;
+ def : Pat<(nxv8i16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8i16 ZPR:$src)>;
+ def : Pat<(nxv4i32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4i32 ZPR:$src)>;
+ def : Pat<(nxv2i64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2i64 ZPR:$src)>;
+ def : Pat<(nxv8f16 (bitconvert (nxv8bf16 ZPR:$src))), (nxv8f16 ZPR:$src)>;
+ def : Pat<(nxv4f32 (bitconvert (nxv8bf16 ZPR:$src))), (nxv4f32 ZPR:$src)>;
+ def : Pat<(nxv2f64 (bitconvert (nxv8bf16 ZPR:$src))), (nxv2f64 ZPR:$src)>;
+ }
+
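+  // Predicate reinterpret_casts are free: every nxvNi1 type lives in the PPR
+  // register class, so these lower to plain register copies.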
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv16i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv8i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv4i1 (reinterpret_cast (nxv2i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv16i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv8i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+ def : Pat<(nxv2i1 (reinterpret_cast (nxv4i1 PPR:$src))), (COPY_TO_REGCLASS PPR:$src, PPR)>;
+
+ def : Pat<(nxv16i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_B 31), PPR:$Ps1, PPR:$Ps2)>;
+ def : Pat<(nxv8i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_H 31), PPR:$Ps1, PPR:$Ps2)>;
+ def : Pat<(nxv4i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_S 31), PPR:$Ps1, PPR:$Ps2)>;
+ def : Pat<(nxv2i1 (and PPR:$Ps1, PPR:$Ps2)),
+ (AND_PPzPP (PTRUE_D 31), PPR:$Ps1, PPR:$Ps2)>;
// Add more complex addressing modes here as required
multiclass pred_load<ValueType Ty, ValueType PredTy, SDPatternOperator Load,
- Instruction RegImmInst> {
-
+ Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg_z : Pat<(Ty (Load (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegRegInst PPR:$gp, GPR64:$base, GPR64:$offset)>;
+ }
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm_z : Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), (SVEDup0Undef))),
+ (RegImmInst PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+ }
def _default_z : Pat<(Ty (Load GPR64:$base, (PredTy PPR:$gp), (SVEDup0Undef))),
(RegImmInst PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous loads
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D_IMM>;
- defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D_IMM>;
- defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D_IMM>;
- defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D_IMM>;
- defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D_IMM>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i8, LD1B_D, LD1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i8, LD1SB_D, LD1SB_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i16, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i16, LD1SH_D, LD1SH_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2i64, nxv2i1, zext_masked_load_i32, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, asext_masked_load_i32, LD1SW_D, LD1SW_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_load<nxv2f16, nxv2i1, nonext_masked_load, LD1H_D, LD1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv2f32, nxv2i1, nonext_masked_load, LD1W_D, LD1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2f64, nxv2i1, nonext_masked_load, LD1D, LD1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous loads
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S_IMM>;
- defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W_IMM>;
- defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S_IMM>;
- defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W_IMM>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i8, LD1B_S, LD1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i8, LD1SB_S, LD1SB_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv4i32, nxv4i1, zext_masked_load_i16, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, asext_masked_load_i16, LD1SH_S, LD1SH_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv4f16, nxv4i1, nonext_masked_load, LD1H_S, LD1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4f32, nxv4i1, nonext_masked_load, LD1W, LD1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous loads
- defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H_IMM>;
- defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H_IMM>;
- defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H_IMM>;
- defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H_IMM>;
+ defm : pred_load<nxv8i16, nxv8i1, zext_masked_load_i8, LD1B_H, LD1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, asext_masked_load_i8, LD1SB_H, LD1SB_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv8f16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : pred_load<nxv8bf16, nxv8i1, nonext_masked_load, LD1H, LD1H_IMM, am_sve_regreg_lsl1>;
+ }
// 16-element contiguous loads
- defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B_IMM>;
+ defm : pred_load<nxv16i8, nxv16i1, nonext_masked_load, LD1B, LD1B_IMM, am_sve_regreg_lsl0>;
multiclass pred_store<ValueType Ty, ValueType PredTy, SDPatternOperator Store,
- Instruction RegImmInst> {
+ Instruction RegRegInst, Instruction RegImmInst, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def _reg_reg : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp)),
+ (RegRegInst ZPR:$vec, PPR:$gp, GPR64:$base, GPR64:$offset)>;
+ }
+ // reg + imm
+ let AddedComplexity = 2 in {
+ def _reg_imm : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp)),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, simm4s1:$offset)>;
+ }
def _default : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp)),
(RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
}
// 2-element contiguous stores
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D_IMM>;
- defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
- defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D_IMM>;
- defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D_IMM>;
- defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D_IMM>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i8, ST1B_D, ST1B_D_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i16, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2i64, nxv2i1, trunc_masked_store_i32, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
+ defm : pred_store<nxv2f16, nxv2i1, nontrunc_masked_store, ST1H_D, ST1H_D_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv2f32, nxv2i1, nontrunc_masked_store, ST1W_D, ST1W_D_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2f64, nxv2i1, nontrunc_masked_store, ST1D, ST1D_IMM, am_sve_regreg_lsl3>;
// 4-element contiguous stores
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S_IMM>;
- defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S_IMM>;
- defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
- defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S_IMM>;
- defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W_IMM>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i8, ST1B_S, ST1B_S_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv4i32, nxv4i1, trunc_masked_store_i16, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv4f16, nxv4i1, nontrunc_masked_store, ST1H_S, ST1H_S_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4f32, nxv4i1, nontrunc_masked_store, ST1W, ST1W_IMM, am_sve_regreg_lsl2>;
// 8-element contiguous stores
- defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H_IMM>;
- defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
- defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H_IMM>;
+ defm : pred_store<nxv8i16, nxv8i1, trunc_masked_store_i8, ST1B_H, ST1B_H_IMM, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv8f16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : pred_store<nxv8bf16, nxv8i1, nontrunc_masked_store, ST1H, ST1H_IMM, am_sve_regreg_lsl1>;
+ }
// 16-element contiguous stores
- defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B_IMM>;
+ defm : pred_store<nxv16i8, nxv16i1, nontrunc_masked_store, ST1B, ST1B_IMM, am_sve_regreg_lsl0>;
+
+ defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRR, LDNT1B_ZRI, am_sve_regreg_lsl0>;
+ defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRR, LDNT1H_ZRI, am_sve_regreg_lsl1>;
+ defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRR, LDNT1W_ZRI, am_sve_regreg_lsl2>;
+ defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRR, LDNT1D_ZRI, am_sve_regreg_lsl3>;
+
+ defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRR, STNT1B_ZRI, am_sve_regreg_lsl0>;
+ defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRR, STNT1H_ZRI, am_sve_regreg_lsl1>;
+ defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRR, STNT1W_ZRI, am_sve_regreg_lsl2>;
+ defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRR, STNT1D_ZRI, am_sve_regreg_lsl3>;
+
+ multiclass unpred_store<PatFrag Store, ValueType Ty, Instruction RegImmInst,
+ Instruction PTrue> {
+ let AddedComplexity = 1 in {
+ def _imm : Pat<(Store (Ty ZPR:$val), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset)),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+ let AddedComplexity = 2 in {
+ def _fi : Pat<(Store (Ty ZPR:$val), (am_sve_fi GPR64sp:$base, simm4s1:$offset)),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ def : Pat<(Store (Ty ZPR:$val), GPR64:$base),
+ (RegImmInst ZPR:$val, (PTrue 31), GPR64:$base, (i64 0))>;
+ }
+
+ defm : unpred_store< store, nxv16i8, ST1B_IMM, PTRUE_B>;
+ defm : unpred_store< truncstorevi8, nxv8i16, ST1B_H_IMM, PTRUE_H>;
+ defm : unpred_store< truncstorevi8, nxv4i32, ST1B_S_IMM, PTRUE_S>;
+ defm : unpred_store< truncstorevi8, nxv2i64, ST1B_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv8i16, ST1H_IMM, PTRUE_H>;
+ defm : unpred_store<truncstorevi16, nxv4i32, ST1H_S_IMM, PTRUE_S>;
+ defm : unpred_store<truncstorevi16, nxv2i64, ST1H_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv4i32, ST1W_IMM, PTRUE_S>;
+ defm : unpred_store<truncstorevi32, nxv2i64, ST1W_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv2i64, ST1D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv8f16, ST1H_IMM, PTRUE_H>;
+ defm : unpred_store< store, nxv8bf16, ST1H_IMM, PTRUE_H>;
+ defm : unpred_store< store, nxv4f16, ST1H_S_IMM, PTRUE_S>;
+ defm : unpred_store< store, nxv2f16, ST1H_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv4f32, ST1W_IMM, PTRUE_S>;
+  defm : unpred_store< store, nxv2f32, ST1W_D_IMM, PTRUE_D>;
+ defm : unpred_store< store, nxv2f64, ST1D_IMM, PTRUE_D>;
+
+ multiclass unpred_load<PatFrag Load, ValueType Ty, Instruction RegImmInst,
+ Instruction PTrue> {
+ let AddedComplexity = 1 in {
+ def _imm: Pat<(Ty (Load (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset))),
+ (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ let AddedComplexity = 2 in {
+ def _fi : Pat<(Ty (Load (am_sve_fi GPR64sp:$base, simm4s1:$offset))),
+ (RegImmInst (PTrue 31), GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ def : Pat<(Ty (Load GPR64:$base)),
+ (RegImmInst (PTrue 31), GPR64:$base, (i64 0))>;
+ }
+
+ defm : unpred_load< load, nxv16i8, LD1B_IMM, PTRUE_B>;
+ defm : unpred_load< zextloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
+ defm : unpred_load< zextloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
+ defm : unpred_load< zextloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
+ defm : unpred_load< extloadvi8, nxv8i16, LD1B_H_IMM, PTRUE_H>;
+ defm : unpred_load< extloadvi8, nxv4i32, LD1B_S_IMM, PTRUE_S>;
+ defm : unpred_load< extloadvi8, nxv2i64, LD1B_D_IMM, PTRUE_D>;
+ defm : unpred_load< sextloadvi8, nxv8i16, LD1SB_H_IMM, PTRUE_H>;
+ defm : unpred_load< sextloadvi8, nxv4i32, LD1SB_S_IMM, PTRUE_S>;
+ defm : unpred_load< sextloadvi8, nxv2i64, LD1SB_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv8i16, LD1H_IMM, PTRUE_H>;
+ defm : unpred_load<zextloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
+ defm : unpred_load<zextloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
+ defm : unpred_load< extloadvi16, nxv4i32, LD1H_S_IMM, PTRUE_S>;
+ defm : unpred_load< extloadvi16, nxv2i64, LD1H_D_IMM, PTRUE_D>;
+ defm : unpred_load<sextloadvi16, nxv4i32, LD1SH_S_IMM, PTRUE_S>;
+ defm : unpred_load<sextloadvi16, nxv2i64, LD1SH_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv4i32, LD1W_IMM, PTRUE_S>;
+ defm : unpred_load<zextloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
+ defm : unpred_load< extloadvi32, nxv2i64, LD1W_D_IMM, PTRUE_D>;
+ defm : unpred_load<sextloadvi32, nxv2i64, LD1SW_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv2i64, LD1D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv8f16, LD1H_IMM, PTRUE_H>;
+ defm : unpred_load< load, nxv8bf16, LD1H_IMM, PTRUE_H>;
+ defm : unpred_load< load, nxv4f16, LD1H_S_IMM, PTRUE_S>;
+ defm : unpred_load< load, nxv2f16, LD1H_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv4f32, LD1W_IMM, PTRUE_S>;
+ defm : unpred_load< load, nxv2f32, LD1W_D_IMM, PTRUE_D>;
+ defm : unpred_load< load, nxv2f64, LD1D_IMM, PTRUE_D>;
+
+ multiclass unpred_store_predicate<ValueType Ty, Instruction Store> {
+ def _fi : Pat<(store (Ty PPR:$val), (am_sve_fi GPR64sp:$base, simm9:$offset)),
+ (Store PPR:$val, GPR64sp:$base, simm9:$offset)>;
+
+ def _default : Pat<(store (Ty PPR:$Val), GPR64:$base),
+ (Store PPR:$Val, GPR64:$base, (i64 0))>;
+ }
+
+ defm Pat_Store_P16 : unpred_store_predicate<nxv16i1, STR_PXI>;
+ defm Pat_Store_P8 : unpred_store_predicate<nxv8i1, STR_PXI>;
+ defm Pat_Store_P4 : unpred_store_predicate<nxv4i1, STR_PXI>;
+ defm Pat_Store_P2 : unpred_store_predicate<nxv2i1, STR_PXI>;
+
+ multiclass unpred_load_predicate<ValueType Ty, Instruction Load> {
+ def _fi : Pat<(Ty (load (am_sve_fi GPR64sp:$base, simm9:$offset))),
+ (Load GPR64sp:$base, simm9:$offset)>;
+
+ def _default : Pat<(Ty (load GPR64:$base)),
+ (Load GPR64:$base, (i64 0))>;
+ }
+
+ defm Pat_Load_P16 : unpred_load_predicate<nxv16i1, LDR_PXI>;
+ defm Pat_Load_P8 : unpred_load_predicate<nxv8i1, LDR_PXI>;
+ defm Pat_Load_P4 : unpred_load_predicate<nxv4i1, LDR_PXI>;
+ defm Pat_Load_P2 : unpred_load_predicate<nxv2i1, LDR_PXI>;
+
+ multiclass ld1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
+ SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
+ (RegRegInst PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
+ }
+
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 2 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
+ (RegImmInst PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ // base
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (RegImmInst PPR:$gp, GPR64sp:$base, (i64 0))>;
+ }
+
+ // 2-element contiguous loads
+ defm : ld1<LD1B_D, LD1B_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_D, LD1SB_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H_D, LD1H_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1SH_D, LD1SH_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1W_D, LD1W_D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ld1<LD1SW_D, LD1SW_D_IMM, nxv2i64, AArch64ld1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ld1<LD1D, LD1D_IMM, nxv2i64, AArch64ld1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+ defm : ld1<LD1D, LD1D_IMM, nxv2f64, AArch64ld1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
+
+ // 4-element contiguous loads
+ defm : ld1<LD1B_S, LD1B_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_S, LD1SB_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H_S, LD1H_S_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1SH_S, LD1SH_S_IMM, nxv4i32, AArch64ld1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1W, LD1W_IMM, nxv4i32, AArch64ld1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+ defm : ld1<LD1W, LD1W_IMM, nxv4f32, AArch64ld1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
+
+ // 8-element contiguous loads
+ defm : ld1<LD1B_H, LD1B_H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1SB_H, LD1SB_H_IMM, nxv8i16, AArch64ld1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8i16, AArch64ld1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+ defm : ld1<LD1H, LD1H_IMM, nxv8f16, AArch64ld1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : ld1<LD1H, LD1H_IMM, nxv8bf16, AArch64ld1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
+ }
+
+ // 16-element contiguous loads
+ defm : ld1<LD1B, LD1B_IMM, nxv16i8, AArch64ld1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+
+ multiclass ldnf1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT> {
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 1 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), MemVT)),
+ (I PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ // base
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (I PPR:$gp, GPR64sp:$base, (i64 0))>;
+ }
+
+ // 2-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1SB_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i8>;
+ defm : ldnf1<LDNF1H_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1SH_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i16>;
+ defm : ldnf1<LDNF1W_D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1SW_D_IMM, nxv2i64, AArch64ldnf1s_z, nxv2i1, nxv2i32>;
+ defm : ldnf1<LDNF1D_IMM, nxv2i64, AArch64ldnf1_z, nxv2i1, nxv2i64>;
+ defm : ldnf1<LDNF1D_IMM, nxv2f64, AArch64ldnf1_z, nxv2i1, nxv2f64>;
+
+ // 4-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1SB_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i8>;
+ defm : ldnf1<LDNF1H_S_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1SH_S_IMM, nxv4i32, AArch64ldnf1s_z, nxv4i1, nxv4i16>;
+ defm : ldnf1<LDNF1W_IMM, nxv4i32, AArch64ldnf1_z, nxv4i1, nxv4i32>;
+ defm : ldnf1<LDNF1W_IMM, nxv4f32, AArch64ldnf1_z, nxv4i1, nxv4f32>;
+
+ // 8-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1SB_H_IMM, nxv8i16, AArch64ldnf1s_z, nxv8i1, nxv8i8>;
+ defm : ldnf1<LDNF1H_IMM, nxv8i16, AArch64ldnf1_z, nxv8i1, nxv8i16>;
+ defm : ldnf1<LDNF1H_IMM, nxv8f16, AArch64ldnf1_z, nxv8i1, nxv8f16>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : ldnf1<LDNF1H_IMM, nxv8bf16, AArch64ldnf1_z, nxv8i1, nxv8bf16>;
+ }
+
+ // 16-element contiguous non-faulting loads
+ defm : ldnf1<LDNF1B_IMM, nxv16i8, AArch64ldnf1_z, nxv16i1, nxv16i8>;
- defm : pred_load<nxv16i8, nxv16i1, non_temporal_load, LDNT1B_ZRI>;
- defm : pred_load<nxv8i16, nxv8i1, non_temporal_load, LDNT1H_ZRI>;
- defm : pred_load<nxv4i32, nxv4i1, non_temporal_load, LDNT1W_ZRI>;
- defm : pred_load<nxv2i64, nxv2i1, non_temporal_load, LDNT1D_ZRI>;
+ multiclass ldff1<Instruction I, ValueType Ty, SDPatternOperator Load, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Ty (Load (PredTy PPR:$gp), (AddrCP GPR64:$base, GPR64:$offset), MemVT)),
+ (I PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
+ }
- defm : pred_store<nxv16i8, nxv16i1, non_temporal_store, STNT1B_ZRI>;
- defm : pred_store<nxv8i16, nxv8i1, non_temporal_store, STNT1H_ZRI>;
- defm : pred_store<nxv4i32, nxv4i1, non_temporal_store, STNT1W_ZRI>;
- defm : pred_store<nxv2i64, nxv2i1, non_temporal_store, STNT1D_ZRI>;
+  // base
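+  // First-faulting contiguous loads have no reg+imm form, so the plain-base
+  // case reuses the reg+reg encoding with XZR as a zero offset.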
+ def : Pat<(Ty (Load (PredTy PPR:$gp), GPR64:$base, MemVT)),
+ (I PPR:$gp, GPR64sp:$base, XZR)>;
+ }
+
+ // 2-element contiguous first faulting loads
+ defm : ldff1<LDFF1B_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1SH_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1W_D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1SW_D, nxv2i64, AArch64ldff1s_z, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1D, nxv2i64, AArch64ldff1_z, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+ defm : ldff1<LDFF1W_D, nxv2f32, AArch64ldff1_z, nxv2i1, nxv2f32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1D, nxv2f64, AArch64ldff1_z, nxv2i1, nxv2f64, am_sve_regreg_lsl3>;
+
+ // 4-element contiguous first faulting loads
+ defm : ldff1<LDFF1B_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H_S, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1SH_S, nxv4i32, AArch64ldff1s_z, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1W, nxv4i32, AArch64ldff1_z, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+ defm : ldff1<LDFF1W, nxv4f32, AArch64ldff1_z, nxv4i1, nxv4f32, am_sve_regreg_lsl2>;
+
+ // 8-element contiguous first faulting loads
+ defm : ldff1<LDFF1B_H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1SB_H, nxv8i16, AArch64ldff1s_z, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : ldff1<LDFF1H, nxv8i16, AArch64ldff1_z, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+ defm : ldff1<LDFF1H, nxv8f16, AArch64ldff1_z, nxv8i1, nxv8f16, am_sve_regreg_lsl1>;
+
+ let Predicates = [HasBF16, HasSVE] in {
+ defm : ldff1<LDFF1H, nxv8bf16, AArch64ldff1_z, nxv8i1, nxv8bf16, am_sve_regreg_lsl1>;
+ }
+
+ // 16-element contiguous first faulting loads
+ defm : ldff1<LDFF1B, nxv16i8, AArch64ldff1_z, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+
+ multiclass st1<Instruction RegRegInst, Instruction RegImmInst, ValueType Ty,
+ SDPatternOperator Store, ValueType PredTy, ValueType MemVT, ComplexPattern AddrCP> {
+ // reg + reg
+ let AddedComplexity = 1 in {
+ def : Pat<(Store (Ty ZPR:$vec), (AddrCP GPR64:$base, GPR64:$offset), (PredTy PPR:$gp), MemVT),
+ (RegRegInst ZPR:$vec, PPR:$gp, GPR64sp:$base, GPR64:$offset)>;
+ }
+
+ // scalar + immediate (mul vl)
+ let AddedComplexity = 2 in {
+ def : Pat<(Store (Ty ZPR:$vec), (am_sve_indexed_s4 GPR64sp:$base, simm4s1:$offset), (PredTy PPR:$gp), MemVT),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64sp:$base, simm4s1:$offset)>;
+ }
+
+ // base
+ def : Pat<(Store (Ty ZPR:$vec), GPR64:$base, (PredTy PPR:$gp), MemVT),
+ (RegImmInst ZPR:$vec, PPR:$gp, GPR64:$base, (i64 0))>;
+ }
+
+ // 2-element contiguous store
+ defm : st1<ST1B_D, ST1B_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i8, am_sve_regreg_lsl0>;
+ defm : st1<ST1H_D, ST1H_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i16, am_sve_regreg_lsl1>;
+ defm : st1<ST1W_D, ST1W_D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i32, am_sve_regreg_lsl2>;
+ defm : st1<ST1D, ST1D_IMM, nxv2i64, AArch64st1, nxv2i1, nxv2i64, am_sve_regreg_lsl3>;
+
+ // 4-element contiguous store
+ defm : st1<ST1B_S, ST1B_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i8, am_sve_regreg_lsl0>;
+ defm : st1<ST1H_S, ST1H_S_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i16, am_sve_regreg_lsl1>;
+ defm : st1<ST1W, ST1W_IMM, nxv4i32, AArch64st1, nxv4i1, nxv4i32, am_sve_regreg_lsl2>;
+
+ // 8-element contiguous store
+ defm : st1<ST1B_H, ST1B_H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i8, am_sve_regreg_lsl0>;
+ defm : st1<ST1H, ST1H_IMM, nxv8i16, AArch64st1, nxv8i1, nxv8i16, am_sve_regreg_lsl1>;
+
+ // 16-element contiguous store
+ defm : st1<ST1B, ST1B_IMM, nxv16i8, AArch64st1, nxv16i1, nxv16i8, am_sve_regreg_lsl0>;
+
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv16i8 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv8i16 (vector_insert (nxv8i16 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv8i16 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv4i32 (vector_insert (nxv4i32 (undef)), (i32 FPR32:$src), 0)),
+ (INSERT_SUBREG (nxv4i32 (IMPLICIT_DEF)), FPR32:$src, ssub)>;
+ def : Pat<(nxv2i64 (vector_insert (nxv2i64 (undef)), (i64 FPR64:$src), 0)),
+ (INSERT_SUBREG (nxv2i64 (IMPLICIT_DEF)), FPR64:$src, dsub)>;
+
+ // Insert scalar into vector[0]
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), (i32 GPR32:$src), 0)),
+ (CPY_ZPmR_B ZPR:$vec, (PTRUE_B 1), GPR32:$src)>;
+ def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), (i32 GPR32:$src), 0)),
+ (CPY_ZPmR_H ZPR:$vec, (PTRUE_H 1), GPR32:$src)>;
+ def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), (i32 GPR32:$src), 0)),
+ (CPY_ZPmR_S ZPR:$vec, (PTRUE_S 1), GPR32:$src)>;
+ def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), (i64 GPR64:$src), 0)),
+ (CPY_ZPmR_D ZPR:$vec, (PTRUE_D 1), GPR64:$src)>;
+
+ def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), 0)),
+ (SEL_ZPZZ_H (PTRUE_H 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR16:$src, hsub), ZPR:$vec)>;
+ def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), 0)),
+ (SEL_ZPZZ_S (PTRUE_S 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR32:$src, ssub), ZPR:$vec)>;
+ def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), 0)),
+ (SEL_ZPZZ_D (PTRUE_D 1), (INSERT_SUBREG (IMPLICIT_DEF), FPR64:$src, dsub), ZPR:$vec)>;
+
+ // Insert scalar into vector with scalar index
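+  // The single active lane is selected by comparing an INDEX vector (0, 1, 2, ...)
+  // against a broadcast of the index; the predicated CPY then merges the scalar
+  // into just that lane.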
+ def : Pat<(nxv16i8 (vector_insert (nxv16i8 ZPR:$vec), GPR32:$src, GPR64:$index)),
+ (CPY_ZPmR_B ZPR:$vec,
+ (CMPEQ_PPzZZ_B (PTRUE_B 31),
+ (INDEX_II_B 0, 1),
+ (DUP_ZR_B (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ GPR32:$src)>;
+ def : Pat<(nxv8i16 (vector_insert (nxv8i16 ZPR:$vec), GPR32:$src, GPR64:$index)),
+ (CPY_ZPmR_H ZPR:$vec,
+ (CMPEQ_PPzZZ_H (PTRUE_H 31),
+ (INDEX_II_H 0, 1),
+ (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ GPR32:$src)>;
+ def : Pat<(nxv4i32 (vector_insert (nxv4i32 ZPR:$vec), GPR32:$src, GPR64:$index)),
+ (CPY_ZPmR_S ZPR:$vec,
+ (CMPEQ_PPzZZ_S (PTRUE_S 31),
+ (INDEX_II_S 0, 1),
+ (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ GPR32:$src)>;
+ def : Pat<(nxv2i64 (vector_insert (nxv2i64 ZPR:$vec), GPR64:$src, GPR64:$index)),
+ (CPY_ZPmR_D ZPR:$vec,
+ (CMPEQ_PPzZZ_D (PTRUE_D 31),
+ (INDEX_II_D 0, 1),
+ (DUP_ZR_D GPR64:$index)),
+ GPR64:$src)>;
+
+ // Insert FP scalar into vector with scalar index
+ def : Pat<(nxv8f16 (vector_insert (nxv8f16 ZPR:$vec), (f16 FPR16:$src), GPR64:$index)),
+ (CPY_ZPmV_H ZPR:$vec,
+ (CMPEQ_PPzZZ_H (PTRUE_H 31),
+ (INDEX_II_H 0, 1),
+ (DUP_ZR_H (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ $src)>;
+ def : Pat<(nxv4f32 (vector_insert (nxv4f32 ZPR:$vec), (f32 FPR32:$src), GPR64:$index)),
+ (CPY_ZPmV_S ZPR:$vec,
+ (CMPEQ_PPzZZ_S (PTRUE_S 31),
+ (INDEX_II_S 0, 1),
+ (DUP_ZR_S (i32 (EXTRACT_SUBREG GPR64:$index, sub_32)))),
+ $src)>;
+ def : Pat<(nxv2f64 (vector_insert (nxv2f64 ZPR:$vec), (f64 FPR64:$src), GPR64:$index)),
+ (CPY_ZPmV_D ZPR:$vec,
+ (CMPEQ_PPzZZ_D (PTRUE_D 31),
+ (INDEX_II_D 0, 1),
+ (DUP_ZR_D $index)),
+ $src)>;
+
+ // Extract element from vector with immediate index
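+  // DUP (indexed) replicates the chosen lane across the whole vector, so reading
+  // the low hsub/ssub/dsub subregister yields that element.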
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), sve_elm_idx_extdup_b:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_B ZPR:$vec, sve_elm_idx_extdup_b:$index), ssub)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), ssub)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), sve_elm_idx_extdup_h:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_H ZPR:$vec, sve_elm_idx_extdup_h:$index), hsub)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), sve_elm_idx_extdup_s:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_S ZPR:$vec, sve_elm_idx_extdup_s:$index), ssub)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), sve_elm_idx_extdup_d:$index)),
+ (EXTRACT_SUBREG (DUP_ZZI_D ZPR:$vec, sve_elm_idx_extdup_d:$index), dsub)>;
+
+ // Extract element from vector with scalar index
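+  // WHILELS starting from XZR builds a predicate covering lanes 0..index, so
+  // LASTB (last active element) extracts exactly lane 'index'.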
+ def : Pat<(i32 (vector_extract (nxv16i8 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_B (WHILELS_PXX_B XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv8i16 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i32 (vector_extract (nxv4i32 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(i64 (vector_extract (nxv2i64 ZPR:$vec), GPR64:$index)),
+ (LASTB_RPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+ ZPR:$vec)>;
+
+ def : Pat<(f16 (vector_extract (nxv8f16 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_H (WHILELS_PXX_H XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(f32 (vector_extract (nxv4f32 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_S (WHILELS_PXX_S XZR, GPR64:$index),
+ ZPR:$vec)>;
+ def : Pat<(f64 (vector_extract (nxv2f64 ZPR:$vec), GPR64:$index)),
+ (LASTB_VPZ_D (WHILELS_PXX_D XZR, GPR64:$index),
+ ZPR:$vec)>;
+}
+
+let Predicates = [HasSVE, HasMatMulInt8] in {
+ defm SMMLA_ZZZ : sve_int_matmul<0b00, "smmla", int_aarch64_sve_smmla>;
+ defm UMMLA_ZZZ : sve_int_matmul<0b11, "ummla", int_aarch64_sve_ummla>;
+ defm USMMLA_ZZZ : sve_int_matmul<0b10, "usmmla", int_aarch64_sve_usmmla>;
+ defm USDOT_ZZZ : sve_int_dot_mixed<"usdot", int_aarch64_sve_usdot>;
+ defm USDOT_ZZZI : sve_int_dot_mixed_indexed<0, "usdot", int_aarch64_sve_usdot_lane>;
+ defm SUDOT_ZZZI : sve_int_dot_mixed_indexed<1, "sudot", int_aarch64_sve_sudot_lane>;
+}
+
+let Predicates = [HasSVE, HasMatMulFP32] in {
+ defm FMMLA_ZZZ_S : sve_fp_matrix_mla<0, "fmmla", ZPR32, int_aarch64_sve_fmmla, nxv4f32>;
+}
+
+let Predicates = [HasSVE, HasMatMulFP64] in {
+ defm FMMLA_ZZZ_D : sve_fp_matrix_mla<1, "fmmla", ZPR64, int_aarch64_sve_fmmla, nxv2f64>;
+ defm LD1RO_B_IMM : sve_mem_ldor_si<0b00, "ld1rob", Z_b, ZPR8, nxv16i8, nxv16i1, AArch64ld1ro_z>;
+ defm LD1RO_H_IMM : sve_mem_ldor_si<0b01, "ld1roh", Z_h, ZPR16, nxv8i16, nxv8i1, AArch64ld1ro_z>;
+ defm LD1RO_W_IMM : sve_mem_ldor_si<0b10, "ld1row", Z_s, ZPR32, nxv4i32, nxv4i1, AArch64ld1ro_z>;
+ defm LD1RO_D_IMM : sve_mem_ldor_si<0b11, "ld1rod", Z_d, ZPR64, nxv2i64, nxv2i1, AArch64ld1ro_z>;
+ defm LD1RO_B : sve_mem_ldor_ss<0b00, "ld1rob", Z_b, ZPR8, GPR64NoXZRshifted8, nxv16i8, nxv16i1, AArch64ld1ro_z, am_sve_regreg_lsl0>;
+ defm LD1RO_H : sve_mem_ldor_ss<0b01, "ld1roh", Z_h, ZPR16, GPR64NoXZRshifted16, nxv8i16, nxv8i1, AArch64ld1ro_z, am_sve_regreg_lsl1>;
+ defm LD1RO_W : sve_mem_ldor_ss<0b10, "ld1row", Z_s, ZPR32, GPR64NoXZRshifted32, nxv4i32, nxv4i1, AArch64ld1ro_z, am_sve_regreg_lsl2>;
+ defm LD1RO_D : sve_mem_ldor_ss<0b11, "ld1rod", Z_d, ZPR64, GPR64NoXZRshifted64, nxv2i64, nxv2i1, AArch64ld1ro_z, am_sve_regreg_lsl3>;
+ defm ZIP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 0, "zip1", int_aarch64_sve_zip1q>;
+ defm ZIP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b00, 1, "zip2", int_aarch64_sve_zip2q>;
+ defm UZP1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 0, "uzp1", int_aarch64_sve_uzp1q>;
+ defm UZP2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b01, 1, "uzp2", int_aarch64_sve_uzp2q>;
+ defm TRN1_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 0, "trn1", int_aarch64_sve_trn1q>;
+ defm TRN2_ZZZ_Q : sve_int_perm_bin_perm_128_zz<0b11, 1, "trn2", int_aarch64_sve_trn2q>;
+}
+
+let Predicates = [HasSVE, HasMatMulFP64, HasBF16] in {
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip1q, nxv8bf16, nxv8bf16, ZIP1_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_zip2q, nxv8bf16, nxv8bf16, ZIP2_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp1q, nxv8bf16, nxv8bf16, UZP1_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_uzp2q, nxv8bf16, nxv8bf16, UZP2_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn1q, nxv8bf16, nxv8bf16, TRN1_ZZZ_Q>;
+ def : SVE_2_Op_Pat<nxv8bf16, int_aarch64_sve_trn2q, nxv8bf16, nxv8bf16, TRN2_ZZZ_Q>;
}
let Predicates = [HasSVE2] in {
// SVE2 integer multiply-add (indexed)
- defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla">;
- defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls">;
+ defm MLA_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b0, "mla", int_aarch64_sve_mla_lane>;
+ defm MLS_ZZZI : sve2_int_mla_by_indexed_elem<0b01, 0b1, "mls", int_aarch64_sve_mls_lane>;
// SVE2 saturating multiply-add high (indexed)
- defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah">;
- defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh">;
+ defm SQRDMLAH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah_lane>;
+ defm SQRDMLSH_ZZZI : sve2_int_mla_by_indexed_elem<0b10, 0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh_lane>;
// SVE2 saturating multiply-add high (vectors, unpredicated)
- defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah">;
- defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh">;
+ defm SQRDMLAH_ZZZ : sve2_int_mla<0b0, "sqrdmlah", int_aarch64_sve_sqrdmlah>;
+ defm SQRDMLSH_ZZZ : sve2_int_mla<0b1, "sqrdmlsh", int_aarch64_sve_sqrdmlsh>;
// SVE2 integer multiply (indexed)
- defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul">;
+ defm MUL_ZZZI : sve2_int_mul_by_indexed_elem<0b1110, "mul", int_aarch64_sve_mul_lane>;
// SVE2 saturating multiply high (indexed)
- defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh">;
- defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh">;
+ defm SQDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1100, "sqdmulh", int_aarch64_sve_sqdmulh_lane>;
+ defm SQRDMULH_ZZZI : sve2_int_mul_by_indexed_elem<0b1101, "sqrdmulh", int_aarch64_sve_sqrdmulh_lane>;
// SVE2 signed saturating doubling multiply high (unpredicated)
- defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh">;
- defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh">;
+ defm SQDMULH_ZZZ : sve2_int_mul<0b100, "sqdmulh", int_aarch64_sve_sqdmulh>;
+ defm SQRDMULH_ZZZ : sve2_int_mul<0b101, "sqrdmulh", int_aarch64_sve_sqrdmulh>;
// SVE2 integer multiply vectors (unpredicated)
- defm MUL_ZZZ : sve2_int_mul<0b000, "mul">;
- defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh">;
- defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh">;
- def PMUL_ZZZ_B : sve2_int_mul<0b00, 0b001, "pmul", ZPR8>;
-
+ defm MUL_ZZZ : sve2_int_mul<0b000, "mul", mul>;
+ defm SMULH_ZZZ : sve2_int_mul<0b010, "smulh", null_frag>;
+ defm UMULH_ZZZ : sve2_int_mul<0b011, "umulh", null_frag>;
+ defm PMUL_ZZZ : sve2_int_mul_single<0b001, "pmul", int_aarch64_sve_pmul>;
+
+  // Add patterns for unpredicated versions of smulh and umulh.
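+  // With an all-true governing predicate the predicated intrinsic is equivalent
+  // to the SVE2 unpredicated instruction.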
+ def : Pat<(nxv16i8 (int_aarch64_sve_smulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
+ (SMULH_ZZZ_B $Op1, $Op2)>;
+ def : Pat<(nxv8i16 (int_aarch64_sve_smulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
+ (SMULH_ZZZ_H $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_smulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
+ (SMULH_ZZZ_S $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_smulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
+ (SMULH_ZZZ_D $Op1, $Op2)>;
+ def : Pat<(nxv16i8 (int_aarch64_sve_umulh (nxv16i1 (AArch64ptrue 31)), nxv16i8:$Op1, nxv16i8:$Op2)),
+ (UMULH_ZZZ_B $Op1, $Op2)>;
+ def : Pat<(nxv8i16 (int_aarch64_sve_umulh (nxv8i1 (AArch64ptrue 31)), nxv8i16:$Op1, nxv8i16:$Op2)),
+ (UMULH_ZZZ_H $Op1, $Op2)>;
+ def : Pat<(nxv4i32 (int_aarch64_sve_umulh (nxv4i1 (AArch64ptrue 31)), nxv4i32:$Op1, nxv4i32:$Op2)),
+ (UMULH_ZZZ_S $Op1, $Op2)>;
+ def : Pat<(nxv2i64 (int_aarch64_sve_umulh (nxv2i1 (AArch64ptrue 31)), nxv2i64:$Op1, nxv2i64:$Op2)),
+ (UMULH_ZZZ_D $Op1, $Op2)>;
// SVE2 complex integer dot product (indexed)
- defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot">;
+ defm CDOT_ZZZI : sve2_cintx_dot_by_indexed_elem<"cdot", int_aarch64_sve_cdot_lane>;
// SVE2 complex integer dot product
- defm CDOT_ZZZ : sve2_cintx_dot<"cdot">;
+ defm CDOT_ZZZ : sve2_cintx_dot<"cdot", int_aarch64_sve_cdot>;
// SVE2 complex integer multiply-add (indexed)
- defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla">;
+ defm CMLA_ZZZI : sve2_cmla_by_indexed_elem<0b0, "cmla", int_aarch64_sve_cmla_lane_x>;
// SVE2 complex saturating multiply-add (indexed)
- defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah">;
+ defm SQRDCMLAH_ZZZI : sve2_cmla_by_indexed_elem<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_lane_x>;
// SVE2 complex integer multiply-add
- defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla">;
- defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah">;
+ defm CMLA_ZZZ : sve2_int_cmla<0b0, "cmla", int_aarch64_sve_cmla_x>;
+ defm SQRDCMLAH_ZZZ : sve2_int_cmla<0b1, "sqrdcmlah", int_aarch64_sve_sqrdcmlah_x>;
// SVE2 integer multiply long (indexed)
- defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb">;
- defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt">;
- defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb">;
- defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt">;
+ defm SMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b000, "smullb", int_aarch64_sve_smullb_lane>;
+ defm SMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b001, "smullt", int_aarch64_sve_smullt_lane>;
+ defm UMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b010, "umullb", int_aarch64_sve_umullb_lane>;
+ defm UMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b011, "umullt", int_aarch64_sve_umullt_lane>;
// SVE2 saturating multiply (indexed)
- defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb">;
- defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt">;
+ defm SQDMULLB_ZZZI : sve2_int_mul_long_by_indexed_elem<0b100, "sqdmullb", int_aarch64_sve_sqdmullb_lane>;
+ defm SQDMULLT_ZZZI : sve2_int_mul_long_by_indexed_elem<0b101, "sqdmullt", int_aarch64_sve_sqdmullt_lane>;
// SVE2 integer multiply-add long (indexed)
- defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb">;
- defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt">;
- defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb">;
- defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt">;
- defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb">;
- defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt">;
- defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb">;
- defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt">;
+ defm SMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1000, "smlalb", int_aarch64_sve_smlalb_lane>;
+ defm SMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1001, "smlalt", int_aarch64_sve_smlalt_lane>;
+ defm UMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1010, "umlalb", int_aarch64_sve_umlalb_lane>;
+ defm UMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1011, "umlalt", int_aarch64_sve_umlalt_lane>;
+ defm SMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1100, "smlslb", int_aarch64_sve_smlslb_lane>;
+ defm SMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1101, "smlslt", int_aarch64_sve_smlslt_lane>;
+ defm UMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1110, "umlslb", int_aarch64_sve_umlslb_lane>;
+ defm UMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b1111, "umlslt", int_aarch64_sve_umlslt_lane>;
// SVE2 integer multiply-add long (vectors, unpredicated)
- defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb">;
- defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt">;
- defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb">;
- defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt">;
- defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb">;
- defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt">;
- defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb">;
- defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt">;
+ defm SMLALB_ZZZ : sve2_int_mla_long<0b10000, "smlalb", int_aarch64_sve_smlalb>;
+ defm SMLALT_ZZZ : sve2_int_mla_long<0b10001, "smlalt", int_aarch64_sve_smlalt>;
+ defm UMLALB_ZZZ : sve2_int_mla_long<0b10010, "umlalb", int_aarch64_sve_umlalb>;
+ defm UMLALT_ZZZ : sve2_int_mla_long<0b10011, "umlalt", int_aarch64_sve_umlalt>;
+ defm SMLSLB_ZZZ : sve2_int_mla_long<0b10100, "smlslb", int_aarch64_sve_smlslb>;
+ defm SMLSLT_ZZZ : sve2_int_mla_long<0b10101, "smlslt", int_aarch64_sve_smlslt>;
+ defm UMLSLB_ZZZ : sve2_int_mla_long<0b10110, "umlslb", int_aarch64_sve_umlslb>;
+ defm UMLSLT_ZZZ : sve2_int_mla_long<0b10111, "umlslt", int_aarch64_sve_umlslt>;
// SVE2 saturating multiply-add long (indexed)
- defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb">;
- defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt">;
- defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb">;
- defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt">;
+ defm SQDMLALB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0100, "sqdmlalb", int_aarch64_sve_sqdmlalb_lane>;
+ defm SQDMLALT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0101, "sqdmlalt", int_aarch64_sve_sqdmlalt_lane>;
+ defm SQDMLSLB_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0110, "sqdmlslb", int_aarch64_sve_sqdmlslb_lane>;
+ defm SQDMLSLT_ZZZI : sve2_int_mla_long_by_indexed_elem<0b0111, "sqdmlslt", int_aarch64_sve_sqdmlslt_lane>;
// SVE2 saturating multiply-add long (vectors, unpredicated)
- defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb">;
- defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt">;
- defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb">;
- defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt">;
+ defm SQDMLALB_ZZZ : sve2_int_mla_long<0b11000, "sqdmlalb", int_aarch64_sve_sqdmlalb>;
+ defm SQDMLALT_ZZZ : sve2_int_mla_long<0b11001, "sqdmlalt", int_aarch64_sve_sqdmlalt>;
+ defm SQDMLSLB_ZZZ : sve2_int_mla_long<0b11010, "sqdmlslb", int_aarch64_sve_sqdmlslb>;
+ defm SQDMLSLT_ZZZ : sve2_int_mla_long<0b11011, "sqdmlslt", int_aarch64_sve_sqdmlslt>;
// SVE2 saturating multiply-add interleaved long
- defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt">;
- defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt">;
+ defm SQDMLALBT_ZZZ : sve2_int_mla_long<0b00010, "sqdmlalbt", int_aarch64_sve_sqdmlalbt>;
+ defm SQDMLSLBT_ZZZ : sve2_int_mla_long<0b00011, "sqdmlslbt", int_aarch64_sve_sqdmlslbt>;
// SVE2 integer halving add/subtract (predicated)
- defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd">;
- defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd">;
- defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub">;
- defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub">;
- defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd">;
- defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd">;
- defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr">;
- defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr">;
+ defm SHADD_ZPmZ : sve2_int_arith_pred<0b100000, "shadd", int_aarch64_sve_shadd>;
+ defm UHADD_ZPmZ : sve2_int_arith_pred<0b100010, "uhadd", int_aarch64_sve_uhadd>;
+ defm SHSUB_ZPmZ : sve2_int_arith_pred<0b100100, "shsub", int_aarch64_sve_shsub>;
+ defm UHSUB_ZPmZ : sve2_int_arith_pred<0b100110, "uhsub", int_aarch64_sve_uhsub>;
+ defm SRHADD_ZPmZ : sve2_int_arith_pred<0b101000, "srhadd", int_aarch64_sve_srhadd>;
+ defm URHADD_ZPmZ : sve2_int_arith_pred<0b101010, "urhadd", int_aarch64_sve_urhadd>;
+ defm SHSUBR_ZPmZ : sve2_int_arith_pred<0b101100, "shsubr", int_aarch64_sve_shsubr>;
+ defm UHSUBR_ZPmZ : sve2_int_arith_pred<0b101110, "uhsubr", int_aarch64_sve_uhsubr>;
// SVE2 integer pairwise add and accumulate long
- defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp">;
- defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp">;
+ defm SADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<0, "sadalp", int_aarch64_sve_sadalp>;
+ defm UADALP_ZPmZ : sve2_int_sadd_long_accum_pairwise<1, "uadalp", int_aarch64_sve_uadalp>;
// SVE2 integer pairwise arithmetic
- defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp">;
- defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp">;
- defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp">;
- defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp">;
- defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp">;
+ defm ADDP_ZPmZ : sve2_int_arith_pred<0b100011, "addp", int_aarch64_sve_addp>;
+ defm SMAXP_ZPmZ : sve2_int_arith_pred<0b101001, "smaxp", int_aarch64_sve_smaxp>;
+ defm UMAXP_ZPmZ : sve2_int_arith_pred<0b101011, "umaxp", int_aarch64_sve_umaxp>;
+ defm SMINP_ZPmZ : sve2_int_arith_pred<0b101101, "sminp", int_aarch64_sve_sminp>;
+ defm UMINP_ZPmZ : sve2_int_arith_pred<0b101111, "uminp", int_aarch64_sve_uminp>;
// SVE2 integer unary operations (predicated)
- defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe">;
- defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte">;
- defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs">;
- defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg">;
+ defm URECPE_ZPmZ : sve2_int_un_pred_arit_s<0b000, "urecpe", int_aarch64_sve_urecpe>;
+ defm URSQRTE_ZPmZ : sve2_int_un_pred_arit_s<0b001, "ursqrte", int_aarch64_sve_ursqrte>;
+ defm SQABS_ZPmZ : sve2_int_un_pred_arit<0b100, "sqabs", int_aarch64_sve_sqabs>;
+ defm SQNEG_ZPmZ : sve2_int_un_pred_arit<0b101, "sqneg", int_aarch64_sve_sqneg>;
// SVE2 saturating add/subtract
- defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd">;
- defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd">;
- defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub">;
- defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub">;
- defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd">;
- defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd">;
- defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr">;
- defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr">;
+ defm SQADD_ZPmZ : sve2_int_arith_pred<0b110000, "sqadd", int_aarch64_sve_sqadd>;
+ defm UQADD_ZPmZ : sve2_int_arith_pred<0b110010, "uqadd", int_aarch64_sve_uqadd>;
+ defm SQSUB_ZPmZ : sve2_int_arith_pred<0b110100, "sqsub", int_aarch64_sve_sqsub>;
+ defm UQSUB_ZPmZ : sve2_int_arith_pred<0b110110, "uqsub", int_aarch64_sve_uqsub>;
+ defm SUQADD_ZPmZ : sve2_int_arith_pred<0b111000, "suqadd", int_aarch64_sve_suqadd>;
+ defm USQADD_ZPmZ : sve2_int_arith_pred<0b111010, "usqadd", int_aarch64_sve_usqadd>;
+ defm SQSUBR_ZPmZ : sve2_int_arith_pred<0b111100, "sqsubr", int_aarch64_sve_sqsubr>;
+ defm UQSUBR_ZPmZ : sve2_int_arith_pred<0b111110, "uqsubr", int_aarch64_sve_uqsubr>;
// SVE2 saturating/rounding bitwise shift left (predicated)
- defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl">;
- defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl">;
- defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr">;
- defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr">;
- defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl">;
- defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl">;
- defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl">;
- defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl">;
- defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr">;
- defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr">;
- defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr">;
- defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr">;
+ defm SRSHL_ZPmZ : sve2_int_arith_pred<0b000100, "srshl", int_aarch64_sve_srshl>;
+ defm URSHL_ZPmZ : sve2_int_arith_pred<0b000110, "urshl", int_aarch64_sve_urshl>;
+ defm SRSHLR_ZPmZ : sve2_int_arith_pred<0b001100, "srshlr", null_frag>;
+ defm URSHLR_ZPmZ : sve2_int_arith_pred<0b001110, "urshlr", null_frag>;
+ defm SQSHL_ZPmZ : sve2_int_arith_pred<0b010000, "sqshl", int_aarch64_sve_sqshl>;
+ defm UQSHL_ZPmZ : sve2_int_arith_pred<0b010010, "uqshl", int_aarch64_sve_uqshl>;
+ defm SQRSHL_ZPmZ : sve2_int_arith_pred<0b010100, "sqrshl", int_aarch64_sve_sqrshl>;
+ defm UQRSHL_ZPmZ : sve2_int_arith_pred<0b010110, "uqrshl", int_aarch64_sve_uqrshl>;
+ defm SQSHLR_ZPmZ : sve2_int_arith_pred<0b011000, "sqshlr", null_frag>;
+ defm UQSHLR_ZPmZ : sve2_int_arith_pred<0b011010, "uqshlr", null_frag>;
+ defm SQRSHLR_ZPmZ : sve2_int_arith_pred<0b011100, "sqrshlr", null_frag>;
+ defm UQRSHLR_ZPmZ : sve2_int_arith_pred<0b011110, "uqrshlr", null_frag>;
+
+ let Predicates = [HasSVE2, UseExperimentalZeroingPseudos] in {
+ defm SQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
+ defm UQSHL_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<null_frag>;
+ defm SRSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_srshr>;
+ defm URSHR_ZPZI : sve_int_bin_pred_shift_imm_right_zeroing_bhsd<int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPZI : sve_int_bin_pred_shift_imm_left_zeroing_bhsd<int_aarch64_sve_sqshlu>;
+ }
// SVE2 predicated shifts
- defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl">;
- defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl">;
- defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr">;
- defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr">;
- defm SQSHLU_ZPmI : sve_int_bin_pred_shift_imm_left< 0b1111, "sqshlu">;
+ defm SQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0110, "sqshl", "SQSHL_ZPZI">;
+ defm UQSHL_ZPmI : sve_int_bin_pred_shift_imm_left< 0b0111, "uqshl", "UQSHL_ZPZI">;
+ defm SRSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1100, "srshr", "SRSHR_ZPZI", int_aarch64_sve_srshr>;
+ defm URSHR_ZPmI : sve_int_bin_pred_shift_imm_right<0b1101, "urshr", "URSHR_ZPZI", int_aarch64_sve_urshr>;
+ defm SQSHLU_ZPmI : sve2_int_bin_pred_shift_imm_left< 0b1111, "sqshlu", "SQSHLU_ZPZI", int_aarch64_sve_sqshlu>;
// SVE2 integer add/subtract long
- defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb">;
- defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt">;
- defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb">;
- defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt">;
- defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb">;
- defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt">;
- defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb">;
- defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt">;
- defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb">;
- defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt">;
- defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb">;
- defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt">;
+ defm SADDLB_ZZZ : sve2_wide_int_arith_long<0b00000, "saddlb", int_aarch64_sve_saddlb>;
+ defm SADDLT_ZZZ : sve2_wide_int_arith_long<0b00001, "saddlt", int_aarch64_sve_saddlt>;
+ defm UADDLB_ZZZ : sve2_wide_int_arith_long<0b00010, "uaddlb", int_aarch64_sve_uaddlb>;
+ defm UADDLT_ZZZ : sve2_wide_int_arith_long<0b00011, "uaddlt", int_aarch64_sve_uaddlt>;
+ defm SSUBLB_ZZZ : sve2_wide_int_arith_long<0b00100, "ssublb", int_aarch64_sve_ssublb>;
+ defm SSUBLT_ZZZ : sve2_wide_int_arith_long<0b00101, "ssublt", int_aarch64_sve_ssublt>;
+ defm USUBLB_ZZZ : sve2_wide_int_arith_long<0b00110, "usublb", int_aarch64_sve_usublb>;
+ defm USUBLT_ZZZ : sve2_wide_int_arith_long<0b00111, "usublt", int_aarch64_sve_usublt>;
+ defm SABDLB_ZZZ : sve2_wide_int_arith_long<0b01100, "sabdlb", int_aarch64_sve_sabdlb>;
+ defm SABDLT_ZZZ : sve2_wide_int_arith_long<0b01101, "sabdlt", int_aarch64_sve_sabdlt>;
+ defm UABDLB_ZZZ : sve2_wide_int_arith_long<0b01110, "uabdlb", int_aarch64_sve_uabdlb>;
+ defm UABDLT_ZZZ : sve2_wide_int_arith_long<0b01111, "uabdlt", int_aarch64_sve_uabdlt>;
// SVE2 integer add/subtract wide
- defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb">;
- defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt">;
- defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb">;
- defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt">;
- defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb">;
- defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt">;
- defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb">;
- defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt">;
+ defm SADDWB_ZZZ : sve2_wide_int_arith_wide<0b000, "saddwb", int_aarch64_sve_saddwb>;
+ defm SADDWT_ZZZ : sve2_wide_int_arith_wide<0b001, "saddwt", int_aarch64_sve_saddwt>;
+ defm UADDWB_ZZZ : sve2_wide_int_arith_wide<0b010, "uaddwb", int_aarch64_sve_uaddwb>;
+ defm UADDWT_ZZZ : sve2_wide_int_arith_wide<0b011, "uaddwt", int_aarch64_sve_uaddwt>;
+ defm SSUBWB_ZZZ : sve2_wide_int_arith_wide<0b100, "ssubwb", int_aarch64_sve_ssubwb>;
+ defm SSUBWT_ZZZ : sve2_wide_int_arith_wide<0b101, "ssubwt", int_aarch64_sve_ssubwt>;
+ defm USUBWB_ZZZ : sve2_wide_int_arith_wide<0b110, "usubwb", int_aarch64_sve_usubwb>;
+ defm USUBWT_ZZZ : sve2_wide_int_arith_wide<0b111, "usubwt", int_aarch64_sve_usubwt>;
// SVE2 integer multiply long
- defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb">;
- defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt">;
- defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb">;
- defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt">;
- defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb">;
- defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt">;
- defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb">;
- defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt">;
+ defm SQDMULLB_ZZZ : sve2_wide_int_arith_long<0b11000, "sqdmullb", int_aarch64_sve_sqdmullb>;
+ defm SQDMULLT_ZZZ : sve2_wide_int_arith_long<0b11001, "sqdmullt", int_aarch64_sve_sqdmullt>;
+ defm SMULLB_ZZZ : sve2_wide_int_arith_long<0b11100, "smullb", int_aarch64_sve_smullb>;
+ defm SMULLT_ZZZ : sve2_wide_int_arith_long<0b11101, "smullt", int_aarch64_sve_smullt>;
+ defm UMULLB_ZZZ : sve2_wide_int_arith_long<0b11110, "umullb", int_aarch64_sve_umullb>;
+ defm UMULLT_ZZZ : sve2_wide_int_arith_long<0b11111, "umullt", int_aarch64_sve_umullt>;
+ defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>;
+ defm PMULLT_ZZZ : sve2_pmul_long<0b1, "pmullt", int_aarch64_sve_pmullt_pair>;
// SVE2 bitwise shift and insert
- defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri">;
- defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli">;
+ defm SRI_ZZI : sve2_int_bin_shift_imm_right<0b0, "sri", int_aarch64_sve_sri>;
+ defm SLI_ZZI : sve2_int_bin_shift_imm_left< 0b1, "sli", int_aarch64_sve_sli>;
// SVE2 bitwise shift right and accumulate
- defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra">;
- defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra">;
- defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra">;
- defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra">;
+ defm SSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b00, "ssra", int_aarch64_sve_ssra>;
+ defm USRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b01, "usra", int_aarch64_sve_usra>;
+ defm SRSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b10, "srsra", int_aarch64_sve_srsra>;
+ defm URSRA_ZZI : sve2_int_bin_accum_shift_imm_right<0b11, "ursra", int_aarch64_sve_ursra>;
// SVE2 complex integer add
- defm CADD_ZZI : sve2_int_cadd<0b0, "cadd">;
- defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd">;
+ defm CADD_ZZI : sve2_int_cadd<0b0, "cadd", int_aarch64_sve_cadd_x>;
+ defm SQCADD_ZZI : sve2_int_cadd<0b1, "sqcadd", int_aarch64_sve_sqcadd_x>;
// SVE2 integer absolute difference and accumulate
- defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba">;
- defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba">;
+ defm SABA_ZZZ : sve2_int_absdiff_accum<0b0, "saba", int_aarch64_sve_saba>;
+ defm UABA_ZZZ : sve2_int_absdiff_accum<0b1, "uaba", int_aarch64_sve_uaba>;
// SVE2 integer absolute difference and accumulate long
- defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb">;
- defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt">;
- defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb">;
- defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt">;
+ defm SABALB_ZZZ : sve2_int_absdiff_accum_long<0b00, "sabalb", int_aarch64_sve_sabalb>;
+ defm SABALT_ZZZ : sve2_int_absdiff_accum_long<0b01, "sabalt", int_aarch64_sve_sabalt>;
+ defm UABALB_ZZZ : sve2_int_absdiff_accum_long<0b10, "uabalb", int_aarch64_sve_uabalb>;
+ defm UABALT_ZZZ : sve2_int_absdiff_accum_long<0b11, "uabalt", int_aarch64_sve_uabalt>;
// SVE2 integer add/subtract long with carry
- defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb">;
- defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt">;
- defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb">;
- defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt">;
+ defm ADCLB_ZZZ : sve2_int_addsub_long_carry<0b00, "adclb", int_aarch64_sve_adclb>;
+ defm ADCLT_ZZZ : sve2_int_addsub_long_carry<0b01, "adclt", int_aarch64_sve_adclt>;
+ defm SBCLB_ZZZ : sve2_int_addsub_long_carry<0b10, "sbclb", int_aarch64_sve_sbclb>;
+ defm SBCLT_ZZZ : sve2_int_addsub_long_carry<0b11, "sbclt", int_aarch64_sve_sbclt>;
// SVE2 bitwise shift right narrow (bottom)
defm SQSHRUNB_ZZI : sve2_int_bin_shift_imm_right_narrow_bottom<0b000, "sqshrunb", int_aarch64_sve_sqshrunb>;
@@ -1489,29 +2426,29 @@ let Predicates = [HasSVE2] in {
defm SQXTUNT_ZZ : sve2_int_sat_extract_narrow_top<0b10, "sqxtunt", int_aarch64_sve_sqxtunt>;
// SVE2 character match
- defm MATCH_PPzZZ : sve2_char_match<0b0, "match">;
- defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch">;
+ defm MATCH_PPzZZ : sve2_char_match<0b0, "match", int_aarch64_sve_match>;
+ defm NMATCH_PPzZZ : sve2_char_match<0b1, "nmatch", int_aarch64_sve_nmatch>;
// SVE2 bitwise exclusive-or interleaved
- defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt">;
- defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb">;
+ defm EORBT_ZZZ : sve2_bitwise_xor_interleaved<0b0, "eorbt", int_aarch64_sve_eorbt>;
+ defm EORTB_ZZZ : sve2_bitwise_xor_interleaved<0b1, "eortb", int_aarch64_sve_eortb>;
// SVE2 bitwise shift left long
- defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb">;
- defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt">;
- defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb">;
- defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt">;
+ defm SSHLLB_ZZI : sve2_bitwise_shift_left_long<0b00, "sshllb", int_aarch64_sve_sshllb>;
+ defm SSHLLT_ZZI : sve2_bitwise_shift_left_long<0b01, "sshllt", int_aarch64_sve_sshllt>;
+ defm USHLLB_ZZI : sve2_bitwise_shift_left_long<0b10, "ushllb", int_aarch64_sve_ushllb>;
+ defm USHLLT_ZZI : sve2_bitwise_shift_left_long<0b11, "ushllt", int_aarch64_sve_ushllt>;
// SVE2 integer add/subtract interleaved long
- defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt">;
- defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt">;
- defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb">;
+ defm SADDLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b00, "saddlbt", int_aarch64_sve_saddlbt>;
+ defm SSUBLBT_ZZZ : sve2_misc_int_addsub_long_interleaved<0b10, "ssublbt", int_aarch64_sve_ssublbt>;
+ defm SSUBLTB_ZZZ : sve2_misc_int_addsub_long_interleaved<0b11, "ssubltb", int_aarch64_sve_ssubltb>;
// SVE2 histogram generation (segment)
- def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg">;
+ def HISTSEG_ZZZ : sve2_hist_gen_segment<"histseg", int_aarch64_sve_histseg>;
// SVE2 histogram generation (vector)
- defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt">;
+ defm HISTCNT_ZPzZZ : sve2_hist_gen_vector<"histcnt", int_aarch64_sve_histcnt>;
// SVE2 floating-point base 2 logarithm as integer
defm FLOGB_ZPmZ : sve2_fp_flogb<"flogb", int_aarch64_sve_flogb>;
@@ -1542,50 +2479,57 @@ let Predicates = [HasSVE2] in {
defm FMLSLT_ZZZ_SHH : sve2_fp_mla_long<0b11, "fmlslt", int_aarch64_sve_fmlslt>;
// SVE2 bitwise ternary operations
- defm EOR3_ZZZZ_D : sve2_int_bitwise_ternary_op<0b000, "eor3">;
- defm BCAX_ZZZZ_D : sve2_int_bitwise_ternary_op<0b010, "bcax">;
- def BSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b001, "bsl">;
- def BSL1N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b011, "bsl1n">;
- def BSL2N_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b101, "bsl2n">;
- def NBSL_ZZZZ_D : sve2_int_bitwise_ternary_op_d<0b111, "nbsl">;
+ defm EOR3_ZZZZ : sve2_int_bitwise_ternary_op<0b000, "eor3", int_aarch64_sve_eor3>;
+ defm BCAX_ZZZZ : sve2_int_bitwise_ternary_op<0b010, "bcax", int_aarch64_sve_bcax>;
+ defm BSL_ZZZZ : sve2_int_bitwise_ternary_op<0b001, "bsl", int_aarch64_sve_bsl>;
+ defm BSL1N_ZZZZ : sve2_int_bitwise_ternary_op<0b011, "bsl1n", int_aarch64_sve_bsl1n>;
+ defm BSL2N_ZZZZ : sve2_int_bitwise_ternary_op<0b101, "bsl2n", int_aarch64_sve_bsl2n>;
+ defm NBSL_ZZZZ : sve2_int_bitwise_ternary_op<0b111, "nbsl", int_aarch64_sve_nbsl>;
// SVE2 bitwise xor and rotate right by immediate
- defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar">;
+ defm XAR_ZZZI : sve2_int_rotate_right_imm<"xar", int_aarch64_sve_xar>;
// SVE2 extract vector (immediate offset, constructive)
def EXT_ZZI_B : sve2_int_perm_extract_i_cons<"ext">;
// SVE2 non-temporal gather loads
- defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs<0b00000, "ldnt1sb", Z_s, ZPR32>;
- defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs<0b00001, "ldnt1b", Z_s, ZPR32>;
- defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs<0b00100, "ldnt1sh", Z_s, ZPR32>;
- defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs<0b00101, "ldnt1h", Z_s, ZPR32>;
- defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs<0b01001, "ldnt1w", Z_s, ZPR32>;
-
- defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs<0b10000, "ldnt1sb", Z_d, ZPR64>;
- defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs<0b10010, "ldnt1b", Z_d, ZPR64>;
- defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs<0b10100, "ldnt1sh", Z_d, ZPR64>;
- defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs<0b10110, "ldnt1h", Z_d, ZPR64>;
- defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs<0b11000, "ldnt1sw", Z_d, ZPR64>;
- defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs<0b11010, "ldnt1w", Z_d, ZPR64>;
- defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs<0b11110, "ldnt1d", Z_d, ZPR64>;
+ defm LDNT1SB_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv4i8>;
+ defm LDNT1B_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00001, "ldnt1b", AArch64ldnt1_gather_z, nxv4i8>;
+ defm LDNT1SH_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv4i16>;
+ defm LDNT1H_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b00101, "ldnt1h", AArch64ldnt1_gather_z, nxv4i16>;
+ defm LDNT1W_ZZR_S : sve2_mem_gldnt_vs_32_ptrs<0b01001, "ldnt1w", AArch64ldnt1_gather_z, nxv4i32>;
+
+ defm LDNT1SB_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10000, "ldnt1sb", AArch64ldnt1s_gather_z, nxv2i8>;
+ defm LDNT1B_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10010, "ldnt1b", AArch64ldnt1_gather_z, nxv2i8>;
+ defm LDNT1SH_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10100, "ldnt1sh", AArch64ldnt1s_gather_z, nxv2i16>;
+ defm LDNT1H_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b10110, "ldnt1h", AArch64ldnt1_gather_z, nxv2i16>;
+ defm LDNT1SW_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11000, "ldnt1sw", AArch64ldnt1s_gather_z, nxv2i32>;
+ defm LDNT1W_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11010, "ldnt1w", AArch64ldnt1_gather_z, nxv2i32>;
+ defm LDNT1D_ZZR_D : sve2_mem_gldnt_vs_64_ptrs<0b11110, "ldnt1d", AArch64ldnt1_gather_z, nxv2i64>;
// SVE2 vector splice (constructive)
defm SPLICE_ZPZZ : sve2_int_perm_splice_cons<"splice">;
// SVE2 non-temporal scatter stores
- defm STNT1B_ZZR_S : sve2_mem_sstnt_vs<0b001, "stnt1b", Z_s, ZPR32>;
- defm STNT1H_ZZR_S : sve2_mem_sstnt_vs<0b011, "stnt1h", Z_s, ZPR32>;
- defm STNT1W_ZZR_S : sve2_mem_sstnt_vs<0b101, "stnt1w", Z_s, ZPR32>;
+ defm STNT1B_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b001, "stnt1b", AArch64stnt1_scatter, nxv4i8>;
+ defm STNT1H_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b011, "stnt1h", AArch64stnt1_scatter, nxv4i16>;
+ defm STNT1W_ZZR_S : sve2_mem_sstnt_vs_32_ptrs<0b101, "stnt1w", AArch64stnt1_scatter, nxv4i32>;
- defm STNT1B_ZZR_D : sve2_mem_sstnt_vs<0b000, "stnt1b", Z_d, ZPR64>;
- defm STNT1H_ZZR_D : sve2_mem_sstnt_vs<0b010, "stnt1h", Z_d, ZPR64>;
- defm STNT1W_ZZR_D : sve2_mem_sstnt_vs<0b100, "stnt1w", Z_d, ZPR64>;
- defm STNT1D_ZZR_D : sve2_mem_sstnt_vs<0b110, "stnt1d", Z_d, ZPR64>;
+ defm STNT1B_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b000, "stnt1b", AArch64stnt1_scatter, nxv2i8>;
+ defm STNT1H_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b010, "stnt1h", AArch64stnt1_scatter, nxv2i16>;
+ defm STNT1W_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b100, "stnt1w", AArch64stnt1_scatter, nxv2i32>;
+ defm STNT1D_ZZR_D : sve2_mem_sstnt_vs_64_ptrs<0b110, "stnt1d", AArch64stnt1_scatter, nxv2i64>;
// SVE2 table lookup (three sources)
- defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl">;
- defm TBX_ZZZ : sve2_int_perm_tbx<"tbx">;
+ defm TBL_ZZZZ : sve2_int_perm_tbl<"tbl", int_aarch64_sve_tbl2>;
+ defm TBX_ZZZ : sve2_int_perm_tbx<"tbx", int_aarch64_sve_tbx>;
+
+ let Predicates = [HasSVE, HasBF16] in {
+ def : SVE_3_Op_Pat<nxv8bf16, int_aarch64_sve_tbx, nxv8bf16, nxv8bf16, nxv8i16, TBX_ZZZ_H>;
+ def : Pat<(nxv8bf16 (int_aarch64_sve_tbl2 nxv8bf16:$Op1, nxv8bf16:$Op2, nxv8i16:$Op3)),
+ (nxv8bf16 (TBL_ZZZZ_H (REG_SEQUENCE ZPR2, nxv8bf16:$Op1, zsub0, nxv8bf16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
+ }
// SVE2 integer compare scalar count and limit
defm WHILEGE_PWW : sve_int_while4_rr<0b000, "whilege", int_aarch64_sve_whilege>;
@@ -1599,43 +2543,41 @@ let Predicates = [HasSVE2] in {
defm WHILEHI_PXX : sve_int_while8_rr<0b101, "whilehi", int_aarch64_sve_whilehi>;
// SVE2 pointer conflict compare
- defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr">;
- defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw">;
+ defm WHILEWR_PXX : sve2_int_while_rr<0b0, "whilewr", "int_aarch64_sve_whilewr">;
+ defm WHILERW_PXX : sve2_int_while_rr<0b1, "whilerw", "int_aarch64_sve_whilerw">;
}
let Predicates = [HasSVE2AES] in {
// SVE2 crypto destructive binary operations
- def AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8>;
- def AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8>;
+ defm AESE_ZZZ_B : sve2_crypto_des_bin_op<0b00, "aese", ZPR8, int_aarch64_sve_aese, nxv16i8>;
+ defm AESD_ZZZ_B : sve2_crypto_des_bin_op<0b01, "aesd", ZPR8, int_aarch64_sve_aesd, nxv16i8>;
// SVE2 crypto unary operations
- def AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc">;
- def AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc">;
+ defm AESMC_ZZ_B : sve2_crypto_unary_op<0b0, "aesmc", int_aarch64_sve_aesmc>;
+ defm AESIMC_ZZ_B : sve2_crypto_unary_op<0b1, "aesimc", int_aarch64_sve_aesimc>;
// PMULLB and PMULLT instructions which operate with 64-bit source and
// 128-bit destination elements are enabled with crypto extensions, similar
// to NEON PMULL2 instruction.
- def PMULLB_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11010, "pmullb",
- ZPR128, ZPR64, ZPR64>;
- def PMULLT_ZZZ_Q : sve2_wide_int_arith<0b00, 0b11011, "pmullt",
- ZPR128, ZPR64, ZPR64>;
+ defm PMULLB_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11010, "pmullb", int_aarch64_sve_pmullb_pair>;
+ defm PMULLT_ZZZ_Q : sve2_wide_int_arith_pmul<0b00, 0b11011, "pmullt", int_aarch64_sve_pmullt_pair>;
}
let Predicates = [HasSVE2SM4] in {
// SVE2 crypto constructive binary operations
- def SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32>;
+ defm SM4EKEY_ZZZ_S : sve2_crypto_cons_bin_op<0b0, "sm4ekey", ZPR32, int_aarch64_sve_sm4ekey, nxv4i32>;
// SVE2 crypto destructive binary operations
- def SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32>;
+ defm SM4E_ZZZ_S : sve2_crypto_des_bin_op<0b10, "sm4e", ZPR32, int_aarch64_sve_sm4e, nxv4i32>;
}
let Predicates = [HasSVE2SHA3] in {
// SVE2 crypto constructive binary operations
- def RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64>;
+ defm RAX1_ZZZ_D : sve2_crypto_cons_bin_op<0b1, "rax1", ZPR64, int_aarch64_sve_rax1, nxv2i64>;
}
let Predicates = [HasSVE2BitPerm] in {
// SVE2 bitwise permute
- defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext">;
- defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep">;
- defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp">;
+ defm BEXT_ZZZ : sve2_misc_bitwise<0b1100, "bext", int_aarch64_sve_bext_x>;
+ defm BDEP_ZZZ : sve2_misc_bitwise<0b1101, "bdep", int_aarch64_sve_bdep_x>;
+ defm BGRP_ZZZ : sve2_misc_bitwise<0b1110, "bgrp", int_aarch64_sve_bgrp_x>;
}
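Nearly every hunk above makes the same kind of change: a defm gains one extra operator argument (an int_aarch64_sve_* intrinsic, or null_frag where no intrinsic applies yet), which the shared multiclass uses to emit a selection pattern next to the instruction encoding. The following is a minimal, generic TableGen sketch of that shape; the Ex* classes and the int_example_* name are invented stand-ins, not the real SVE multiclasses.

// Illustrative only: a multiclass that takes an operator name and expands
// into both an "instruction" record and a "pattern" record driven by it.
class ExInst<string asm>  { string AsmString = asm; }
class ExSelPat<string op> { string Operator  = op; }

multiclass ex_binop<string asm, string op> {
  def _INST : ExInst<asm>;    // the encoding record
  def _PAT  : ExSelPat<op>;   // the selection-pattern record wired to op
}

defm EX_SMLALB : ex_binop<"ex_smlalb", "int_example_smlalb">;

Dumping this with llvm-tblgen yields EX_SMLALB_INST and EX_SMLALB_PAT, which mirrors how the real multiclasses expand into per-element-size instructions plus SVE_*_Op_Pat records (compare the SVE_3_Op_Pat use earlier in this patch).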
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA53.td b/llvm/lib/Target/AArch64/AArch64SchedA53.td
index a6df0f3f083c..c5ff1fcb274b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA53.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA53.td
@@ -26,7 +26,8 @@ def CortexA53Model : SchedMachineModel {
// v 1.0 Spreadsheet
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
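This hunk and the matching ones in the scheduling models below all make one change: UnsupportedFeatures grows from the SVE predicate list to the concatenation of the SVE and pointer-authentication lists, so the CompleteModel check skips instructions guarded by either feature family. A small stand-alone sketch of the !listconcat operator follows; the Ex* records are invented for illustration.

// Illustrative only; parses with llvm-tblgen as a plain record dump.
class ExPredicate<string n> { string Cond = n; }
def ExHasSVE   : ExPredicate<"sve">;
def ExHasPAuth : ExPredicate<"pauth">;

def ExSchedModel {
  // !listconcat splices the operand lists into one flat list,
  // i.e. [ExHasSVE, ExHasPAuth], analogous to
  // !listconcat(SVEUnsupported.F, PAUnsupported.F) in these hunks.
  list<ExPredicate> UnsupportedFeatures =
      !listconcat([ExHasSVE], [ExHasPAuth]);
}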
diff --git a/llvm/lib/Target/AArch64/AArch64SchedA57.td b/llvm/lib/Target/AArch64/AArch64SchedA57.td
index 9f566d1c7079..7c40da05c305 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedA57.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedA57.td
@@ -31,7 +31,8 @@ def CortexA57Model : SchedMachineModel {
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -501,7 +502,7 @@ def : InstRW<[A57Write_5cyc_2V], (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
// Q form - v16i8, v8i16, v4i32, v2i64
// ASIMD bitwise insert, Q-form
-def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL)v16i8")>;
+def : InstRW<[A57Write_3cyc_2V], (instregex "^(BIF|BIT|BSL|BSP)v16i8")>;
// ASIMD duplicate, gen reg, D-form and Q-form
def : InstRW<[A57Write_8cyc_1L_1V], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
index 798ecb7508c0..8abcb804d5c7 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedCyclone.td
@@ -18,7 +18,8 @@ def CycloneModel : SchedMachineModel {
let MispredictPenalty = 16; // 14-19 cycles are typical.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -494,7 +495,7 @@ def : InstRW<[CyWriteV3], (instregex "SQRSHLv","UQRSHLv")>;
// WriteV includes:
// SHLL,SSHLL,USHLL
// SLI,SRI
-// BIF,BIT,BSL
+// BIF,BIT,BSL,BSP
// EXT
// CLS,CLZ,CNT,RBIT,REV16,REV32,REV64,XTN
// XTN2
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
index d1734c455b2b..8413a06ed391 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM3.td
@@ -24,7 +24,8 @@ def ExynosM3Model : SchedMachineModel {
let MispredictPenalty = 16; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -660,7 +661,7 @@ def : InstRW<[M3WriteNEONY], (instrs FSQRTv2f64)>;
// ASIMD miscellaneous instructions.
def : InstRW<[M3WriteNALU1], (instregex "^RBITv")>;
-def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M3WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M3WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M3WriteNSHF1], (instregex "^DUPv.+lane")>;
def : InstRW<[M3WriteNSHF1], (instregex "^EXTv")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
index d2284f9fa0b5..34e8beb423ce 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM4.td
@@ -24,7 +24,8 @@ def ExynosM4Model : SchedMachineModel {
let MispredictPenalty = 16; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -803,7 +804,7 @@ def : InstRW<[M4WriteNEONY], (instrs FSQRTv2f64)>;
// ASIMD miscellaneous instructions.
def : InstRW<[M4WriteNALU1], (instregex "^RBITv")>;
-def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M4WriteNALU1], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M4WriteNALU1], (instregex "^CL[STZ]v")>;
def : InstRW<[M4WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M4WriteNSHF1], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
index df7402591e7b..403aac80e47b 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedExynosM5.td
@@ -24,7 +24,8 @@ def ExynosM5Model : SchedMachineModel {
let MispredictPenalty = 15; // Minimum branch misprediction penalty.
let CompleteModel = 1; // Use the default model otherwise.
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
}
//===----------------------------------------------------------------------===//
@@ -841,7 +842,7 @@ def : InstRW<[M5WriteNEONY], (instrs FSQRTv2f64)>;
// ASIMD miscellaneous instructions.
def : InstRW<[M5WriteNALU2], (instregex "^RBITv")>;
-def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL)v")>;
+def : InstRW<[M5WriteNALU2], (instregex "^(BIF|BIT|BSL|BSP)v")>;
def : InstRW<[M5WriteNALU2], (instregex "^CL[STZ]v")>;
def : InstRW<[M5WriteNEONB], (instregex "^DUPv.+gpr")>;
def : InstRW<[M5WriteNSHF2], (instregex "^CPY")>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
index 92d03963de57..a17ab36d7f9e 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkor.td
@@ -23,8 +23,8 @@ def FalkorModel : SchedMachineModel {
let MispredictPenalty = 11; // Minimum branch misprediction penalty.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
index 697a0f69c58c..f2cd83caffa2 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedFalkorDetails.td
@@ -911,7 +911,7 @@ def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^DUP(v16i8|v8i16)(gpr|lane)$")
def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^CPY(i8|i16|i32|i64)$")>;
def : InstRW<[FalkorWr_1GTOV_1cyc], (instregex "^INSv(i8|i16)(gpr|lane)$")>;
def : InstRW<[FalkorWr_1VTOG_1cyc], (instregex "^(S|U)MOVv.*$")>;
-def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v8i8$")>;
+def : InstRW<[FalkorWr_1VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v8i8$")>;
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs EXTv8i8)>;
def : InstRW<[FalkorWr_1VXVY_0cyc], (instregex "(MOVI|MVNI)(D|v8b_ns|v2i32|v4i16|v2s_msl)$")>; // imm fwd
def : InstRW<[FalkorWr_1VXVY_1cyc], (instrs TBLv8i8One)>;
@@ -935,7 +935,7 @@ def : InstRW<[FalkorWr_FMUL64_1VXVY_6cyc],
def : InstRW<[FalkorWr_1GTOV_1VXVY_2cyc],
(instregex "^INSv(i32|i64)(gpr|lane)$")>;
def : InstRW<[FalkorWr_2GTOV_1cyc], (instregex "^DUP(v4i32|v2i64)(gpr|lane)$")>;
-def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL)v16i8$")>;
+def : InstRW<[FalkorWr_2VXVY_1cyc], (instregex "^(BIF|BIT|BSL|BSP)v16i8$")>;
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs EXTv16i8)>;
def : InstRW<[FalkorWr_2VXVY_0cyc], (instregex "(MOVI|MVNI)(v2d_ns|v16b_ns|v4i32|v8i16|v4s_msl)$")>; // imm fwd
def : InstRW<[FalkorWr_2VXVY_1cyc], (instrs NOTv16i8)>;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryo.td b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
index 0e1a24103121..ba14bf1f50de 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedKryo.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedKryo.td
@@ -27,8 +27,8 @@ def KryoModel : SchedMachineModel {
let LoopMicroOpBufferSize = 16;
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
index 4c60992e6351..bc5ad0f8bece 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedKryoDetails.td
@@ -462,13 +462,13 @@ def KryoWrite_1cyc_X_noRSV_74ln :
let Latency = 1; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_1cyc_X_noRSV_74ln],
- (instrs BIFv8i8, BITv8i8, BSLv8i8)>;
+ (instrs BIFv8i8, BITv8i8, BSLv8i8, BSPv8i8)>;
def KryoWrite_1cyc_X_X_75ln :
SchedWriteRes<[KryoUnitX, KryoUnitX]> {
let Latency = 1; let NumMicroOps = 2;
}
def : InstRW<[KryoWrite_1cyc_X_X_75ln],
- (instrs BIFv16i8, BITv16i8, BSLv16i8)>;
+ (instrs BIFv16i8, BITv16i8, BSLv16i8, BSPv16i8)>;
def KryoWrite_0cyc_noRSV_11ln :
SchedWriteRes<[]> {
let Latency = 0; let NumMicroOps = 1;
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
index 3b6aecf5c035..9c50f9708583 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX.td
@@ -25,8 +25,8 @@ def ThunderXT8XModel : SchedMachineModel {
let PostRAScheduler = 1; // Use PostRA scheduler.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
index e2a293c06877..95c29dd2a567 100644
--- a/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX2T99.td
@@ -25,8 +25,8 @@ def ThunderX2T99Model : SchedMachineModel {
let PostRAScheduler = 1; // Using PostRA sched.
let CompleteModel = 1;
- list<Predicate> UnsupportedFeatures = SVEUnsupported.F;
-
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
// FIXME: Remove when all errors have been fixed.
let FullInstRWOverlapCheck = 0;
}
@@ -1482,7 +1482,7 @@ def : InstRW<[THX2T99Write_5Cyc_F01], (instregex "^RBITv")>;
// ASIMD bitwise insert, D-form
// ASIMD bitwise insert, Q-form
def : InstRW<[THX2T99Write_5Cyc_F01],
- (instregex "^BIFv", "^BITv", "^BSLv")>;
+ (instregex "^BIFv", "^BITv", "^BSLv", "^BSPv")>;
// ASIMD count, D-form
// ASIMD count, Q-form
diff --git a/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
new file mode 100644
index 000000000000..00838cc4b9bd
--- /dev/null
+++ b/llvm/lib/Target/AArch64/AArch64SchedThunderX3T110.td
@@ -0,0 +1,1997 @@
+//=- AArch64SchedThunderX3T110.td - Marvell ThunderX3 T110 ---*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the scheduling model for Marvell ThunderX3T110
+// family of processors.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pipeline Description.
+
+def ThunderX3T110Model : SchedMachineModel {
+ let IssueWidth = 4; // 4 micro-ops dispatched at a time.
+ let MicroOpBufferSize = 70; // 70 entries in micro-op re-order buffer.
+ let LoadLatency = 4; // Optimistic load latency.
+ let MispredictPenalty = 12; // Extra cycles for mispredicted branch.
+ // Determined via a mix of micro-arch details and experimentation.
+ let LoopMicroOpBufferSize = 128; // FIXME: might be much bigger in TX3.
+ let PostRAScheduler = 1; // Using PostRA sched.
+ let CompleteModel = 1;
+
+ list<Predicate> UnsupportedFeatures = !listconcat(SVEUnsupported.F,
+ PAUnsupported.F);
+ // FIXME: Remove when all errors have been fixed.
+ let FullInstRWOverlapCheck = 0;
+}
+
+let SchedModel = ThunderX3T110Model in {
+
+// Issue ports.
+
+// Port 0: ALU.
+def THX3T110P0 : ProcResource<1>;
+
+// Port 1: ALU.
+def THX3T110P1 : ProcResource<1>;
+
+// Port 2: ALU/Branch.
+def THX3T110P2 : ProcResource<1>;
+
+// Port 3: ALU/Branch.
+def THX3T110P3 : ProcResource<1>;
+
+// Port 4: Load/Store.
+def THX3T110P4 : ProcResource<1>;
+
+// Port 5: Load/store.
+def THX3T110P5 : ProcResource<1>;
+
+// Port 6: FP/Neon/SIMD/Crypto.
+def THX3T110P6FP0 : ProcResource<1>;
+
+// Port 7: FP/Neon/SIMD/Crypto.
+def THX3T110P7FP1 : ProcResource<1>;
+
+// Port 8: FP/Neon/SIMD/Crypto.
+def THX3T110P8FP2 : ProcResource<1>;
+
+// Port 9: FP/Neon/SIMD/Crypto.
+def THX3T110P9FP3 : ProcResource<1>;
+
+// Port 10: Store Data Unit.
+def THX3T110SD0 : ProcResource<1>;
+
+// Define groups for the functional units on each issue port. Each group
+// created will be used by a WriteRes.
+
+// Integer divide/mulhi micro-ops only on port I1.
+def THX3T110I1 : ProcResGroup<[THX3T110P1]>;
+
+// Branch micro-ops on ports I2/I3.
+def THX3T110I23 : ProcResGroup<[THX3T110P2, THX3T110P3]>;
+
+// Branch micro-ops on ports I1/I2/I3.
+def THX3T110I123 : ProcResGroup<[THX3T110P1, THX3T110P2, THX3T110P3]>;
+
+// Integer micro-ops on ports I0/I1/I2.
+def THX3T110I012 : ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2]>;
+
+// Integer micro-ops on ports I0/I1/I2/I3.
+def THX3T110I0123 : ProcResGroup<[THX3T110P0, THX3T110P1,
+ THX3T110P2, THX3T110P3]>;
+
+// FP micro-ops on ports FP0/FP1/FP2/FP3.
+def THX3T110FP0123 : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1,
+ THX3T110P8FP2, THX3T110P9FP3]>;
+
+// FP micro-ops on ports FP2/FP3.
+def THX3T110FP23 : ProcResGroup<[THX3T110P8FP2, THX3T110P9FP3]>;
+
+// ASIMD micro-ops on ports FP0/FP1/FP2/FP3.
+def THX3T110SIMD : ProcResGroup<[THX3T110P6FP0, THX3T110P7FP1,
+ THX3T110P8FP2, THX3T110P9FP3]>;
+
+// Store data micro-ops only on port 10.
+def THX3T110SD : ProcResGroup<[THX3T110SD0]>;
+
+// Load/store micro-ops on ports P4/P5.
+def THX3T110LS : ProcResGroup<[THX3T110P4, THX3T110P5]>;
+
+// 70 entry unified scheduler.
+def THX3T110ANY: ProcResGroup<[THX3T110P0, THX3T110P1, THX3T110P2,
+ THX3T110P3, THX3T110P4, THX3T110P5,
+ THX3T110P6FP0, THX3T110P7FP1,
+ THX3T110P8FP2, THX3T110P9FP3]> {
+ let BufferSize = 70;
+}
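// Illustrative sketch, not part of this patch: the groups above feed the
// SchedWriteRes definitions that follow. A ProcResource models one issue
// port, a ProcResGroup lets a micro-op use any port in the set, and a
// SchedWriteRes attaches a latency and micro-op count to such a group.
// The model and port names below are invented; the classes come from the
// standard llvm/Target/Target.td infrastructure.

include "llvm/Target/Target.td"

def ExampleModel : SchedMachineModel {
  let IssueWidth = 4;
  let CompleteModel = 0;   // sketch only, so skip the completeness check
}

let SchedModel = ExampleModel in {
  def ExALU0 : ProcResource<1>;                 // issue port 0
  def ExALU1 : ProcResource<1>;                 // issue port 1
  def ExALU  : ProcResGroup<[ExALU0, ExALU1]>;  // micro-op may use either port

  // "2 cycles on either ALU port", in the same style as the THX3T110Write_*
  // definitions that follow.
  def ExWrite_2Cyc_ALU : SchedWriteRes<[ExALU]> {
    let Latency = 2;        // result ready after 2 cycles
    let NumMicroOps = 1;    // occupies one slot on the group
  }
}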
+
+// Define commonly used write types for InstRW specializations.
+// All definitions follow the format: THX3T110Write_<NumCycles>Cyc_<Resources>.
+
+// 3 cycles on I1.
+def THX3T110Write_3Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I1.
+def THX3T110Write_4Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on I1.
+def THX3T110Write_5Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 7 cycles on I1.
+def THX3T110Write_7Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 23 cycles on I1.
+def THX3T110Write_23Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 23;
+ let ResourceCycles = [13, 23];
+ let NumMicroOps = 4;
+}
+
+// 39 cycles on I1.
+def THX3T110Write_39Cyc_I1 : SchedWriteRes<[THX3T110I1]> {
+ let Latency = 39;
+ let ResourceCycles = [13, 39];
+ let NumMicroOps = 4;
+}
+
+// 1 cycle on I2/I3
+def THX3T110Write_1Cyc_I23 : SchedWriteRes<[THX3T110I23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on I2/I3
+def THX3T110Write_8Cyc_I23 : SchedWriteRes<[THX3T110I23]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on I1/I2/I3
+def THX3T110Write_1Cyc_I123 : SchedWriteRes<[THX3T110I123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 8 cycles on I1/I2/I3
+def THX3T110Write_8Cyc_I123 : SchedWriteRes<[THX3T110I123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on I0/I1/I2/I3.
+def THX3T110Write_1Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on I0/I1/I2/I3.
+def THX3T110Write_2Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 3 cycles on I0/I1/I2/I3.
+def THX3T110Write_3Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on I0/I1/I2/I3.
+def THX3T110Write_4Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on I0/I1/I2/I3.
+def THX3T110Write_5Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on I0/I1/I2/I3.
+def THX3T110Write_6Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on I0/I1/I2/I3.
+def THX3T110Write_8Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 8;
+ let NumMicroOps = 4;
+}
+
+// 13 cycles on I0/I1/I2/I3.
+def THX3T110Write_13Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 13;
+ let NumMicroOps = 3;
+}
+
+// 23 cycles on I0/I1/I2/I3.
+def THX3T110Write_23Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+}
+
+// 39 cycles on I0/I1/I2/I3.
+def THX3T110Write_39Cyc_I0123 : SchedWriteRes<[THX3T110I0123]> {
+ let Latency = 39;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on F2/F3.
+def THX3T110Write_4Cyc_F23 : SchedWriteRes<[THX3T110FP23]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on F0/F1/F2/F3.
+def THX3T110Write_5Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 6 cycles on F0/F1/F2/F3.
+def THX3T110Write_6Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on F0/F1/F2/F3.
+def THX3T110Write_7Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on F0/F1/F2/F3.
+def THX3T110Write_8Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 10 cycles on F0/F1/F2/F3.
+def THX3T110Write_10Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 10;
+ let NumMicroOps = 3;
+}
+
+// 16 cycles on F0/F1/F2/F3.
+def THX3T110Write_16Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+ let ResourceCycles = [8];
+}
+
+// 23 cycles on F0/F1/F2/F3.
+def THX3T110Write_23Cyc_F01 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let NumMicroOps = 3;
+ let ResourceCycles = [11];
+}
+
+// 1 cycle on LS0/LS1.
+def THX3T110Write_1Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 1;
+ let NumMicroOps = 1;
+}
+
+// 2 cycles on LS0/LS1.
+def THX3T110Write_2Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0/LS1.
+def THX3T110Write_4Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [2];
+}
+
+// 5 cycles on LS0/LS1.
+def THX3T110Write_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0/LS1.
+def THX3T110Write_6Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 4 + 5 cycles on LS0/LS1.
+// First resource is available after 4 cycles.
+// Second resource is available after 5 cycles.
+// Load vector pair, immed offset, Q-form [LDP/LDNP].
+def THX3T110Write_4_5Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [4, 5];
+}
+
+// 4 + 8 cycles on LS0/LS1.
+// First resource is available after 4 cycles.
+// Second resource is available after 8 cycles.
+// Load vector pair, immed offset, S/D-form [LDP/LDNP].
+def THX3T110Write_4_8Cyc_LS01 : SchedWriteRes<[THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+ let ResourceCycles = [4, 8];
+}
+
+// 11 cycles on LS0/LS1 and I1.
+def THX3T110Write_11Cyc_LS01_I1 :
+ SchedWriteRes<[THX3T110LS, THX3T110I1]> {
+ let Latency = 11;
+ let NumMicroOps = 4;
+}
+
+// 1 cycle on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_1Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 1 cycle on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_1Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_4Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 4 cycles on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_4Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_5Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_5Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_6Cyc_LS01_I012 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0/LS1 and 2 of I0/I1/I2/I3.
+def THX3T110Write_6Cyc_LS01_I0123_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123, THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 1 cycle on LS0/LS1 and SD.
+def THX3T110Write_1Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on LS0/LS1 and SD.
+def THX3T110Write_2Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0/LS1 and SD.
+def THX3T110Write_4Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1 and SD.
+def THX3T110Write_5Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0/LS1 and SD.
+def THX3T110Write_6Cyc_LS01_SD :
+ SchedWriteRes<[THX3T110LS, THX3T110SD]> {
+ let Latency = 6;
+ let NumMicroOps = 5;
+}
+
+// 1 cycle on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_1Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 2 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_2Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 2;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_4Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 4;
+ let NumMicroOps = 3;
+}
+
+// 5 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_5Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+}
+
+// 6 cycles on LS0/LS1, SD and I0/I1/I2/I3.
+def THX3T110Write_6Cyc_LS01_SD_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 6;
+ let NumMicroOps = 5;
+}
+
+// 1 cycle on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_1Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_5Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// 6 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_6Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 6;
+ let NumMicroOps = 3;
+}
+
+// 7 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_7Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on LS0/LS1 and F0/F1/F2/F3.
+def THX3T110Write_8Cyc_LS01_F0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110FP0123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 8 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_8Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 8;
+ let NumMicroOps = 3;
+}
+
+// 12 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_12Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 12;
+ let NumMicroOps = 4;
+}
+
+// 16 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_16Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 16;
+ let NumMicroOps = 5;
+}
+
+// 24 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_24Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 24;
+ let NumMicroOps = 10;
+}
+
+// 32 cycles on LS0/LS1 and I0/I1/I2/I3.
+def THX3T110Write_32Cyc_LS01_I0123 :
+ SchedWriteRes<[THX3T110LS, THX3T110I0123]> {
+ let Latency = 32;
+ let NumMicroOps = 14;
+}
+
+// 3 cycles on F0/F1/F2/F3.
+def THX3T110Write_3Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+}
+
+// 4 cycles on F0/F1/F2/F3.
+def THX3T110Write_4Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// 5 cycles on F0/F1/F2/F3.
+def THX3T110Write_5Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// 10 cycles on F0/F1/F2/F3.
+def THX3T110Write_10Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 10;
+ let NumMicroOps = 4;
+}
+
+// 15 cycles on F0/F1/F2/F3.
+def THX3T110Write_15Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 15;
+ let NumMicroOps = 7;
+}
+
+// 16 cycles on F0/F1/F2/F3.
+def THX3T110Write_16Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let NumMicroOps = 3;
+}
+
+// 18 cycles on F0/F1/F2/F3.
+def THX3T110Write_18Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 18;
+ let NumMicroOps = 3;
+}
+
+// 19 cycles on F0/F1/F2/F3.
+def THX3T110Write_19Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 19;
+ let NumMicroOps = 4;
+}
+
+// 20 cycles on F0/F1/F2/F3.
+def THX3T110Write_20Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 20;
+ let NumMicroOps = 4;
+}
+
+// 23 cycles on F0/F1/F2/F3.
+def THX3T110Write_23Cyc_F0123 : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let NumMicroOps = 4;
+}
+
+// 3 cycles on F2/F3 and 4 cycles on F0/F1/F2/F3.
+def THX3T110Write_3_4Cyc_F23_F0123 :
+ SchedWriteRes<[THX3T110FP23, THX3T110FP0123]> {
+ let Latency = 3;
+ let NumMicroOps = 2;
+ let ResourceCycles = [3, 4];
+}
+
+
+// Define commonly used read types.
+
+// No forwarding is provided for these types.
+def : ReadAdvance<ReadI, 0>;
+def : ReadAdvance<ReadISReg, 0>;
+def : ReadAdvance<ReadIEReg, 0>;
+def : ReadAdvance<ReadIM, 0>;
+def : ReadAdvance<ReadIMA, 0>;
+def : ReadAdvance<ReadID, 0>;
+def : ReadAdvance<ReadExtrHi, 0>;
+def : ReadAdvance<ReadAdrBase, 0>;
+def : ReadAdvance<ReadVLD, 0>;
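// Not part of this patch: the zero ReadAdvance values above mean no operand
// forwarding is modelled, so consumers always see the producer's full
// latency. For contrast, a nonzero advance lets the operand be read N cycles
// after issue, hiding N cycles of that latency. ExReadAcc below is an
// invented SchedRead used purely for illustration (it would sit inside the
// same `let SchedModel = ThunderX3T110Model in` scope).
def ExReadAcc : SchedRead;
def : ReadAdvance<ExReadAcc, 2>;   // 2 cycles of producer latency are hidden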
+
+//===----------------------------------------------------------------------===//
+// 3. Instruction Tables.
+
+//---
+// 3.1 Branch Instructions
+//---
+
+// Branch, immed
+// Branch and link, immed
+// Compare and branch
+def : WriteRes<WriteBr, [THX3T110I23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Branch, register
+// Branch and link, register != LR
+// Branch and link, register = LR
+def : WriteRes<WriteBrReg, [THX3T110I23]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : WriteRes<WriteSys, []> { let Latency = 1; }
+def : WriteRes<WriteBarrier, []> { let Latency = 1; }
+def : WriteRes<WriteHint, []> { let Latency = 1; }
+
+def : WriteRes<WriteAtomic, []> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+//---
+// Branch
+//---
+def : InstRW<[THX3T110Write_1Cyc_I23], (instrs B, BL, BR, BLR)>;
+def : InstRW<[THX3T110Write_1Cyc_I23], (instrs Bcc)>;
+def : InstRW<[THX3T110Write_1Cyc_I23], (instrs RET)>;
+def : InstRW<[THX3T110Write_1Cyc_I23],
+ (instrs CBZW, CBZX, CBNZW, CBNZX, TBZW, TBZX, TBNZW, TBNZX)>;
+
+//---
+// 3.2 Arithmetic and Logical Instructions
+// 3.3 Move and Shift Instructions
+//---
+
+
+// ALU, basic
+// Conditional compare
+// Conditional select
+// Address generation
+def : WriteRes<WriteI, [THX3T110I0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteI],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+def : InstRW<[WriteI], (instrs COPY)>;
+
+// ALU, extend and/or shift
+def : WriteRes<WriteISReg, [THX3T110I0123]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteISReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+def : WriteRes<WriteIEReg, [THX3T110I0123]> {
+ let Latency = 1;
+ let ResourceCycles = [1];
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[WriteIEReg],
+ (instregex "ADD?(W|X)r(i|r|s|x)", "ADDS?(W|X)r(i|r|s|x)(64)?",
+ "AND?(W|X)r(i|r|s|x)", "ANDS?(W|X)r(i|r|s|x)",
+ "ADC(W|X)r",
+ "BIC?(W|X)r(i|r|s|x)", "BICS?(W|X)r(i|r|s|x)",
+ "EON?(W|X)r(i|r|s|x)", "ORN?(W|X)r(i|r|s|x)",
+ "ORR?(W|X)r(i|r|s|x)", "SUB?(W|X)r(i|r|s|x)",
+ "SUBS?(W|X)r(i|r|s|x)", "SBC(W|X)r",
+ "SBCS(W|X)r", "CCMN(W|X)(i|r)",
+ "CCMP(W|X)(i|r)", "CSEL(W|X)r",
+ "CSINC(W|X)r", "CSINV(W|X)r",
+ "CSNEG(W|X)r")>;
+
+// Move immed
+def : WriteRes<WriteImm, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX3T110Write_1Cyc_I0123],
+ (instrs MOVKWi, MOVKXi, MOVNWi, MOVNXi, MOVZWi, MOVZXi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_I0123],
+ (instrs ASRVWr, ASRVXr, LSLVWr, LSLVXr, RORVWr, RORVXr)>;
+
+// Variable shift
+def : WriteRes<WriteIS, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+//---
+// 3.4 Divide and Multiply Instructions
+//---
+
+// Divide, W-form
+// Latency range of 13-23/13-39.
+def : WriteRes<WriteID32, [THX3T110I1]> {
+ let Latency = 39;
+ let ResourceCycles = [39];
+ let NumMicroOps = 4;
+}
+
+// Divide, X-form
+def : WriteRes<WriteID64, [THX3T110I1]> {
+ let Latency = 23;
+ let ResourceCycles = [23];
+ let NumMicroOps = 4;
+}
+
+// Multiply accumulate, W-form
+def : WriteRes<WriteIM32, [THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+// Multiply accumulate, X-form
+def : WriteRes<WriteIM64, [THX3T110I0123]> {
+ let Latency = 5;
+ let NumMicroOps = 3;
+}
+
+//def : InstRW<[WriteIM32, ReadIM, ReadIM, ReadIMA, THX3T110Write_5Cyc_I012],
+// (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDWrrr, MSUBWrrr)>;
+def : InstRW<[WriteIM32], (instrs MADDXrrr, MSUBXrrr)>;
+def : InstRW<[THX3T110Write_5Cyc_I0123],
+ (instregex "(S|U)(MADDL|MSUBL)rrr")>;
+
+def : InstRW<[WriteID32], (instrs SDIVWr, UDIVWr)>;
+def : InstRW<[WriteID64], (instrs SDIVXr, UDIVXr)>;
+
+// Bitfield extract, two reg
+def : WriteRes<WriteExtr, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Multiply high
+def : InstRW<[THX3T110Write_4Cyc_I1], (instrs SMULHrr, UMULHrr)>;
+
+// Miscellaneous Data-Processing Instructions
+// Bitfield extract
+def : InstRW<[THX3T110Write_1Cyc_I0123], (instrs EXTRWrri, EXTRXrri)>;
+
+// Bitfield move - basic
+def : InstRW<[THX3T110Write_1Cyc_I0123],
+ (instrs SBFMWri, SBFMXri, UBFMWri, UBFMXri)>;
+
+// Bitfield move, insert
+def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "^BFM")>;
+def : InstRW<[THX3T110Write_1Cyc_I0123], (instregex "(S|U)?BFM.*")>;
+
+// Count leading
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^CLS(W|X)r$", "^CLZ(W|X)r$")>;
+
+// Reverse bits
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instrs RBITWr, RBITXr)>;
+
+// Cryptography Extensions
+def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AES[DE]")>;
+def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^AESI?MC")>;
+def : InstRW<[THX3T110Write_4Cyc_F0123], (instregex "^PMULL")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1SU0")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1(H|SU1)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA1[CMP]")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256SU0")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SHA256(H|H2|SU1)")>;
+
+// CRC Instructions
+// def : InstRW<[THX3T110Write_4Cyc_I1], (instregex "^CRC32", "^CRC32C")>;
+def : InstRW<[THX3T110Write_4Cyc_I1],
+ (instrs CRC32Brr, CRC32Hrr, CRC32Wrr, CRC32Xrr)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I1],
+ (instrs CRC32CBrr, CRC32CHrr, CRC32CWrr, CRC32CXrr)>;
+
+// Reverse bits/bytes
+// NOTE: Handled by WriteI.
+
+//---
+// 3.6 Load Instructions
+// 3.10 FP Load Instructions
+//---
+
+// Load register, literal
+// Load register, unscaled immed
+// Load register, immed unprivileged
+// Load register, unsigned immed
+def : WriteRes<WriteLD, [THX3T110LS]> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// Load register, immed post-index
+// NOTE: Handled by WriteLD, WriteI.
+// Load register, immed pre-index
+// NOTE: Handled by WriteLD, WriteAdr.
+def : WriteRes<WriteAdr, [THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Load pair, immed offset, normal
+// Load pair, immed offset, signed words, base != SP
+// Load pair, immed offset, signed words, base = SP
+// LDP only breaks into *one* LS micro-op. Thus
+// the resources are handled by WriteLD.
+def : WriteRes<WriteLDHi, []> {
+ let Latency = 4;
+ let NumMicroOps = 4;
+}
+
+// Load register offset, basic
+// Load register, register offset, scale by 4/8
+// Load register, register offset, scale by 2
+// Load register offset, extend
+// Load register, register offset, extend, scale by 4/8
+// Load register, register offset, extend, scale by 2
+def THX3T110WriteLDIdx : SchedWriteVariant<[
+ SchedVar<ScaledIdxPred, [THX3T110Write_4Cyc_LS01_I0123_I0123]>,
+ SchedVar<NoSchedPred, [THX3T110Write_4Cyc_LS01_I0123]>]>;
+def : SchedAlias<WriteLDIdx, THX3T110WriteLDIdx>;
+
+def THX3T110ReadAdrBase : SchedReadVariant<[
+ SchedVar<ScaledIdxPred, [ReadDefault]>,
+ SchedVar<NoSchedPred, [ReadDefault]>]>;
+def : SchedAlias<ReadAdrBase, THX3T110ReadAdrBase>;
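+// NOTE: ScaledIdxPred steers scaled register-offset loads to the write class
+// with an additional I0123 resource for the address computation; both
+// ReadAdrBase variants resolve to ReadDefault, so no ReadAdvance is applied
+// to the base register.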
+
+// Load pair, immed pre-index, normal
+// Load pair, immed pre-index, signed words
+// Load pair, immed post-index, normal
+// Load pair, immed post-index, signed words
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPDi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPQi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPSi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDNPXi)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPDi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPQi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPSWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, WriteLDHi], (instrs LDPXi)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRBui)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDui)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRHui)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRQui)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01], (instrs LDRSui)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRDl)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRQl)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRWl)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDRXl)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRBi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRHi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRXi)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSBXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSHXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDTRSWi)>;
+
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPQpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPSpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPWpre)>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPXpre)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr],
+ (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre,
+ LDRSpre, LDRWpre, LDRXpre,
+ LDRSBWpre, LDRSBXpre, LDRSBWpost, LDRSBXpost,
+ LDRSHWpre, LDRSHXpre, LDRSHWpost, LDRSHXpost,
+ LDRBBpre, LDRBBpost, LDRHHpre, LDRHHpost)>;
+
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>;
+
+def : InstRW<[THX3T110Write_5Cyc_LS01_I0123, WriteI],
+ (instrs LDRBpost, LDRDpost, LDRHpost,
+ LDRQpost, LDRSpost, LDRWpost, LDRXpost)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpre, LDPQpre, LDPSpre, LDPWpre, LDPXpre)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteAdr],
+ (instrs LDRBpre, LDRDpre, LDRHpre, LDRQpre,
+ LDRSpre, LDRWpre, LDRXpre)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteLDHi, WriteAdr],
+ (instrs LDPDpost, LDPQpost, LDPSpost, LDPWpost, LDPXpost)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123_I0123, WriteI],
+ (instrs LDRBpost, LDRDpost, LDRHpost, LDRQpost,
+ LDRSpost, LDRWpost, LDRXpost)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroW)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroW)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRBroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRDroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHHroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRHroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRQroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHWroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRSHXroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRWroX)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_I0123, ReadAdrBase], (instrs LDRXroX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURBBi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURDi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURHHi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURQi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSBXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHWi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSHXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instrs LDURSWi)>;
+
+// Load exclusive
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDAXP(W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01], (instregex "^LDXP(W|X)$")>;
+
+//---
+// Prefetch
+//---
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMl)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFUMi)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMui)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroW)>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_I012], (instrs PRFMroX)>;
+
+//--
+// 3.7 Store Instructions
+// 3.11 FP Store Instructions
+//--
+
+// Store register, unscaled immed
+// Store register, immed unprivileged
+// Store register, unsigned immed
+def : WriteRes<WriteST, [THX3T110LS, THX3T110SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store register, immed post-index
+// NOTE: Handled by WriteAdr, WriteST, ReadAdrBase
+
+// Store register, immed pre-index
+// NOTE: Handled by WriteAdr, WriteST
+
+// Store register, register offset, basic
+// Store register, register offset, scaled by 4/8
+// Store register, register offset, scaled by 2
+// Store register, register offset, extend
+// Store register, register offset, extend, scale by 4/8
+// Store register, register offset, extend, scale by 1
+def : WriteRes<WriteSTIdx, [THX3T110LS, THX3T110SD, THX3T110I0123]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed offset, W-form
+// Store pair, immed offset, X-form
+def : WriteRes<WriteSTP, [THX3T110LS, THX3T110SD]> {
+ let Latency = 1;
+ let NumMicroOps = 2;
+}
+
+// Store pair, immed post-index, W-form
+// Store pair, immed post-index, X-form
+// Store pair, immed pre-index, W-form
+// Store pair, immed pre-index, X-form
+// NOTE: Handled by WriteAdr, WriteSTP.
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURBBi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURDi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURHHi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURQi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURSi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURWi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STURXi)>;
+
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRBi)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRHi)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRWi)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_SD], (instrs STTRXi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPDi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPQi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPXi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STNPWi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPDi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPQi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPXi)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_SD], (instrs STPWi)>;
+
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRBui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRDui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRHui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRQui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRXui)>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_I0123], (instrs STRWui)>;
+
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRBui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRDui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRHui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRQui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRXui)>;
+def : InstRW<[WriteSTP, THX3T110Write_1Cyc_LS01_SD], (instrs STRWui)>;
+
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRBui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRDui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRHui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRQui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRXui)>;
+def : InstRW<[WriteSTIdx, THX3T110Write_1Cyc_LS01_SD_I0123], (instrs STRWui)>;
+
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPDpre, STPDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPQpre, STPQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPSpre, STPSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPWpre, STPWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STPXpre, STPXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBpre, STRBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBBpre, STRBBpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRDpre, STRDpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHpre, STRHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHHpre, STRHHpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRQpre, STRQpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRSpre, STRSpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRWpre, STRWpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRXpre, STRXpost)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBroW, STRBroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRBBroW, STRBBroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRDroW, STRDroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHroW, STRHroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRHHroW, STRHHroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRQroW, STRQroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRSroW, STRSroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRWroW, STRWroX)>;
+def : InstRW<[WriteAdr, THX3T110Write_1Cyc_LS01_I0123, ReadAdrBase],
+ (instrs STRXroW, STRXroX)>;
+
+// Store exclusive
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instrs STNPWi, STNPXi)>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXP(W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STXR(B|H|W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXP(W|X)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01_SD], (instregex "^STLXR(B|H|W|X)$")>;
+
+//---
+// 3.8 FP Data Processing Instructions
+//---
+
+// FP absolute value
+// FP min/max
+// FP negate
+def : WriteRes<WriteF, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP arithmetic
+def : InstRW<[THX3T110Write_6Cyc_F01], (instregex "^FADD", "^FSUB")>;
+
+// FP compare
+def : WriteRes<WriteFCmp, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 2;
+}
+
+// FP Mul, Div, Sqrt
+def : WriteRes<WriteFDiv, [THX3T110FP0123]> {
+ let Latency = 22;
+ let ResourceCycles = [19];
+}
+
+def THX3T110XWriteFDiv : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFDivSP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFDivDP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFSqrtSP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 16;
+ let ResourceCycles = [8];
+ let NumMicroOps = 4;
+}
+
+def THX3T110XWriteFSqrtDP : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 23;
+ let ResourceCycles = [12];
+ let NumMicroOps = 4;
+}
+
+// FP divide, S-form
+// FP square root, S-form
+def : InstRW<[THX3T110XWriteFDivSP], (instrs FDIVSrr)>;
+def : InstRW<[THX3T110XWriteFSqrtSP], (instrs FSQRTSr)>;
+def : InstRW<[THX3T110XWriteFDivSP], (instregex "^FDIVv.*32$")>;
+def : InstRW<[THX3T110XWriteFSqrtSP], (instregex "^.*SQRT.*32$")>;
+def : InstRW<[THX3T110Write_16Cyc_F01], (instregex "^FDIVSrr", "^FSQRTSr")>;
+
+// FP divide, D-form
+// FP square root, D-form
+def : InstRW<[THX3T110XWriteFDivDP], (instrs FDIVDrr)>;
+def : InstRW<[THX3T110XWriteFSqrtDP], (instrs FSQRTDr)>;
+def : InstRW<[THX3T110XWriteFDivDP], (instregex "^FDIVv.*64$")>;
+def : InstRW<[THX3T110XWriteFSqrtDP], (instregex "^.*SQRT.*64$")>;
+def : InstRW<[THX3T110Write_23Cyc_F01], (instregex "^FDIVDrr", "^FSQRTDr")>;
+
+// FP multiply
+// FP multiply accumulate
+def : WriteRes<WriteFMul, [THX3T110FP0123]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX3T110XWriteFMul : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def THX3T110XWriteFMulAcc : SchedWriteRes<[THX3T110FP0123]> {
+ let Latency = 6;
+ let ResourceCycles = [2];
+ let NumMicroOps = 3;
+}
+
+def : InstRW<[THX3T110XWriteFMul], (instregex "^FMUL", "^FNMUL")>;
+def : InstRW<[THX3T110XWriteFMulAcc],
+ (instregex "^FMADD", "^FMSUB", "^FNMADD", "^FNMSUB")>;
+
+// FP round to integral
+def : InstRW<[THX3T110Write_7Cyc_F01],
+ (instregex "^FRINT(A|I|M|N|P|X|Z)(Sr|Dr)")>;
+
+// FP select
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FCSEL")>;
+
+//---
+// 3.9 FP Miscellaneous Instructions
+//---
+
+// FP convert, from vec to vec reg
+// FP convert, from gen to vec reg
+// FP convert, from vec to gen reg
+def : WriteRes<WriteFCvt, [THX3T110FP0123]> {
+ let Latency = 7;
+ let NumMicroOps = 3;
+}
+
+// FP move, immed
+// FP move, register
+def : WriteRes<WriteFImm, [THX3T110FP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+// FP transfer, from gen to vec reg
+// FP transfer, from vec to gen reg
+def : WriteRes<WriteFCopy, [THX3T110FP0123]> {
+ let Latency = 4;
+ let NumMicroOps = 2;
+}
+
+def : InstRW<[THX3T110Write_5Cyc_F01], (instrs FMOVXDHighr, FMOVDXHighr)>;
+
+//---
+// 3.12 ASIMD Integer Instructions
+//---
+
+// ASIMD absolute diff, D-form
+// ASIMD absolute diff, Q-form
+// ASIMD absolute diff accum, D-form
+// ASIMD absolute diff accum, Q-form
+// ASIMD absolute diff accum long
+// ASIMD absolute diff long
+// ASIMD arith, basic
+// ASIMD arith, complex
+// ASIMD compare
+// ASIMD logical (AND, BIC, EOR)
+// ASIMD max/min, basic
+// ASIMD max/min, reduce, 4H/4S
+// ASIMD max/min, reduce, 8B/8H
+// ASIMD max/min, reduce, 16B
+// ASIMD multiply, D-form
+// ASIMD multiply, Q-form
+// ASIMD multiply accumulate long
+// ASIMD multiply accumulate saturating long
+// ASIMD multiply long
+// ASIMD pairwise add and accumulate
+// ASIMD shift accumulate
+// ASIMD shift by immed, basic
+// ASIMD shift by immed and insert, basic, D-form
+// ASIMD shift by immed and insert, basic, Q-form
+// ASIMD shift by immed, complex
+// ASIMD shift by register, basic, D-form
+// ASIMD shift by register, basic, Q-form
+// ASIMD shift by register, complex, D-form
+// ASIMD shift by register, complex, Q-form
+def : WriteRes<WriteV, [THX3T110FP0123]> {
+ let Latency = 5;
+ let NumMicroOps = 4;
+ let ResourceCycles = [4];
+}
+
+// ASIMD arith, reduce, 4H/4S
+// ASIMD arith, reduce, 8B/8H
+// ASIMD arith, reduce, 16B
+
+// ASIMD logical (MVN (alias for NOT), ORN, ORR)
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ANDv", "^BICv", "^EORv", "^ORRv", "^ORNv", "^NOTv")>;
+
+// ASIMD arith, reduce
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ADDVv", "^SADDLVv", "^UADDLVv")>;
+
+// ASIMD polynomial (8x8) multiply long
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^(S|U|SQD)MULL")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(S|U|SQD)(MLAL|MLSL|MULL)v.*")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v8i8|v16i8)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^PMULL(v1i64|v2i64)")>;
+
+// ASIMD absolute diff accum, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]ABA(v8i8|v4i16|v2i32)$")>;
+// ASIMD absolute diff accum, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]ABA(v16i8|v8i16|v4i32)$")>;
+// ASIMD absolute diff accum long
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]ABAL")>;
+// ASIMD arith, reduce, 4H/4S
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]?ADDL?V(v8i8|v4i16|v2i32)v$")>;
+// ASIMD arith, reduce, 8B
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU]?ADDL?V(v8i16|v4i32)v$")>;
+// ASIMD arith, reduce, 16B/16H
+def : InstRW<[THX3T110Write_10Cyc_F0123],
+ (instregex "^[SU]?ADDL?Vv16i8v$")>;
+// ASIMD max/min, reduce, 4H/4S
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU](MIN|MAX)V(v4i16|v4i32)v$")>;
+// ASIMD max/min, reduce, 8B/8H
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU](MIN|MAX)V(v8i8|v8i16)v$")>;
+// ASIMD max/min, reduce, 16B/16H
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU](MIN|MAX)Vv16i8v$")>;
+// ASIMD multiply, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^(P?MUL|SQR?DMULH)" #
+ "(v8i8|v4i16|v2i32|v1i8|v1i16|v1i32|v1i64)" #
+ "(_indexed)?$")>;
+// ASIMD multiply, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^(P?MUL|SQR?DMULH)(v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ML[AS](v8i8|v4i16|v2i32)(_indexed)?$")>;
+// ASIMD multiply accumulate, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ML[AS](v16i8|v8i16|v4i32)(_indexed)?$")>;
+// ASIMD shift accumulate
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "SRSRAv","SSRAv","URSRAv","USRAv")>;
+
+// ASIMD shift by immed, basic
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "RSHRNv","SHRNv", "SQRSHRNv","SQRSHRUNv",
+ "SQSHRNv","SQSHRUNv", "UQRSHRNv",
+ "UQSHRNv","SQXTNv","SQXTUNv","UQXTNv")>;
+// ASIMD shift by immed, complex
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]?(Q|R){1,2}SHR")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SQSHLU")>;
+// ASIMD shift by register, basic, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F01],
+ (instregex "^[SU]SHL(v16i8|v8i16|v4i32|v2i64)")>;
+// ASIMD shift by register, complex, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU][QR]{1,2}SHL" #
+ "(v1i8|v1i16|v1i32|v1i64|v8i8|v4i16|v2i32|b|d|h|s)")>;
+// ASIMD shift by register, complex, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^[SU][QR]{1,2}SHL(v16i8|v8i16|v4i32|v2i64)")>;
+
+// ASIMD Arithmetic
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(ADD|SUB)(v8i8|v4i16|v2i32|v1i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(ADD|SUB)(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(ADD|SUB)HNv.*")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(RADD|RSUB)HNv.*")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SQADD", "^SQNEG", "^SQSUB", "^SRHADD",
+ "^SUQADD", "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "ADDP(v16i8|v8i16|v4i32|v2i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "((AND|ORN|EOR|EON)S?(Xr[rsi]|v16i8|v8i16|v4i32)|" #
+ "(ORR|BIC)S?(Xr[rs]|v16i8|v8i16|v4i32))")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "(CLS|CLZ|CNT)(v4i32|v8i16|v16i8)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADALP","^UADALP")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLPv","^UADDLPv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SADDLV","^UADDLV")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ADDVv","^SMAXVv","^UMAXVv","^SMINVv","^UMINVv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SABAv","^UABAv","^SABALv","^UABALv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SQADDv","^SQSUBv","^UQADDv","^UQSUBv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^SUQADDv","^USQADDv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^ADDHNv","^RADDHNv", "^RSUBHNv",
+ "^SQABS", "^SQADD", "^SQNEG", "^SQSUB",
+ "^SRHADD", "^SUBHNv", "^SUQADD",
+ "^UQADD", "^UQSUB", "^URHADD", "^USQADD")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^CMEQv","^CMGEv","^CMGTv",
+ "^CMLEv","^CMLTv", "^CMHIv","^CMHSv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SMAXv","^SMINv","^UMAXv","^UMINv",
+ "^SMAXPv","^SMINPv","^UMAXPv","^UMINPv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SABDv","^UABDv", "^SABDLv","^UABDLv")>;
+
+//---
+// 3.13 ASIMD Floating-point Instructions
+//---
+
+// ASIMD FP absolute value
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FABSv")>;
+
+// ASIMD FP arith, normal, D-form
+// ASIMD FP arith, normal, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^FABDv", "^FADDv", "^FSUBv")>;
+
+// ASIMD FP arith, pairwise, D-form
+// ASIMD FP arith, pairwise, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FADDPv")>;
+
+// ASIMD FP compare, D-form
+// ASIMD FP compare, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FACGEv", "^FACGTv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FCMEQv", "^FCMGEv",
+ "^FCMGTv", "^FCMLEv",
+ "^FCMLTv")>;
+
+// ASIMD FP round, D-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRINT[AIMNPXZ](v2f32)")>;
+// ASIMD FP round, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRINT[AIMNPXZ](v4f32|v2f64)")>;
+
+// ASIMD FP convert, long
+// ASIMD FP convert, narrow
+// ASIMD FP convert, other, D-form
+// ASIMD FP convert, other, Q-form
+// NOTE: Handled by WriteV.
+
+// ASIMD FP convert, long and narrow
+def : InstRW<[THX3T110Write_5Cyc_F01], (instregex "^FCVT(L|N|XN)v")>;
+// ASIMD FP convert, other, D-form
+def : InstRW<[THX3T110Write_5Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v2f32|v1i32|v2i32|v1i64)")>;
+// ASIMD FP convert, other, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F01],
+ (instregex "^[FVSU]CVT([AMNPZ][SU])?(_Int)?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP divide, D-form, F32
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv2f32)>;
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv2f32")>;
+
+// ASIMD FP divide, Q-form, F32
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instrs FDIVv4f32)>;
+def : InstRW<[THX3T110Write_16Cyc_F0123], (instregex "FDIVv4f32")>;
+
+// ASIMD FP divide, Q-form, F64
+def : InstRW<[THX3T110Write_23Cyc_F0123], (instrs FDIVv2f64)>;
+def : InstRW<[THX3T110Write_23Cyc_F0123], (instregex "FDIVv2f64")>;
+
+// ASIMD FP max/min, normal, D-form
+// ASIMD FP max/min, normal, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXv", "^FMAXNMv",
+ "^FMINv", "^FMINNMv")>;
+
+// ASIMD FP max/min, pairwise, D-form
+// ASIMD FP max/min, pairwise, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXPv", "^FMAXNMPv",
+ "^FMINPv", "^FMINNMPv")>;
+
+// ASIMD FP max/min, reduce
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FMAXVv", "^FMAXNMVv",
+ "^FMINVv", "^FMINNMVv")>;
+
+// ASIMD FP multiply, D-form, FZ
+// ASIMD FP multiply, D-form, no FZ
+// ASIMD FP multiply, Q-form, FZ
+// ASIMD FP multiply, Q-form, no FZ
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMULv", "^FMULXv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMULX?(v2f32|v1i32|v2i32|v1i64|32|64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMULX?(v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP multiply accumulate, D-form, FZ
+// ASIMD FP multiply accumulate, D-form, no FZ
+// ASIMD FP multiply accumulate, Q-form, FZ
+// ASIMD FP multiply accumulate, Q-form, no FZ
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FMLAv", "^FMLSv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FML[AS](v2f32|v1i32|v2i32|v1i64)")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FML[AS](v4f32|v2f64|v4i32|v2i64)")>;
+
+// ASIMD FP negate
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^FNEGv")>;
+
+//--
+// 3.14 ASIMD Miscellaneous Instructions
+//--
+
+// ASIMD bit reverse
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^RBITv")>;
+
+// ASIMD bitwise insert, D-form
+// ASIMD bitwise insert, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^BIFv", "^BITv", "^BSLv")>;
+
+// ASIMD count, D-form
+// ASIMD count, Q-form
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123],
+ (instregex "^CLSv", "^CLZv", "^CNTv")>;
+
+// ASIMD duplicate, gen reg
+// ASIMD duplicate, element
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^CPY")>;
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^DUPv.+gpr")>;
+
+// ASIMD extract
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^EXTv")>;
+
+// ASIMD extract narrow
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^XTNv")>;
+
+// ASIMD extract narrow, saturating
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^SQXTNv", "^SQXTUNv", "^UQXTNv")>;
+
+// ASIMD insert, element to element
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>;
+
+// ASIMD move, integer immed
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^MOVIv")>;
+
+// ASIMD move, FP immed
+def : InstRW<[THX3T110Write_3_4Cyc_F23_F0123], (instregex "^FMOVv")>;
+
+// ASIMD transpose
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^TRN1", "^TRN2")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^UZP1", "^UZP2", "^ZIP1", "^ZIP2")>;
+
+// ASIMD reciprocal estimate, D-form
+// ASIMD reciprocal estimate, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRECPEv", "^FRECPXv", "^URECPEv",
+ "^FRSQRTEv", "^URSQRTEv")>;
+
+// ASIMD reciprocal step, D-form, FZ
+// ASIMD reciprocal step, D-form, no FZ
+// ASIMD reciprocal step, Q-form, FZ
+// ASIMD reciprocal step, Q-form, no FZ
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^FRECPSv", "^FRSQRTSv")>;
+
+// ASIMD reverse
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^REV16v", "^REV32v", "^REV64v")>;
+
+// ASIMD table lookup, D-form
+// ASIMD table lookup, Q-form
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instrs TBLv8i8One, TBLv16i8One, TBXv8i8One, TBXv16i8One)>;
+def : InstRW<[THX3T110Write_10Cyc_F0123],
+ (instrs TBLv8i8Two, TBLv16i8Two, TBXv8i8Two, TBXv16i8Two)>;
+def : InstRW<[THX3T110Write_15Cyc_F0123],
+ (instrs TBLv8i8Three, TBLv16i8Three, TBXv8i8Three, TBXv16i8Three)>;
+def : InstRW<[THX3T110Write_20Cyc_F0123],
+ (instrs TBLv8i8Four, TBLv16i8Four, TBXv8i8Four, TBXv16i8Four)>;
+
+// ASIMD transfer, element to word or dword
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^[SU]MOVv")>;
+
+// ASIMD transfer, element to gen reg
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "(S|U)MOVv.*")>;
+
+// ASIMD transfer gen reg to element
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^INSv")>;
+
+// ASIMD transpose
+def : InstRW<[THX3T110Write_5Cyc_F0123],
+ (instregex "^TRN1v", "^TRN2v", "^UZP1v", "^UZP2v")>;
+
+// ASIMD unzip/zip
+def : InstRW<[THX3T110Write_5Cyc_F0123], (instregex "^ZIP1v", "^ZIP2v")>;
+
+//--
+// 3.15 ASIMD Load Instructions
+//--
+
+// ASIMD load, 1 element, multiple, 1 reg, D-form
+// ASIMD load, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX3T110Write_4Cyc_LS01],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 2 reg, D-form
+// ASIMD load, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX3T110Write_4Cyc_LS01],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_4Cyc_LS01, WriteAdr],
+ (instregex "^LD1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 3 reg, D-form
+// ASIMD load, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX3T110Write_5Cyc_LS01],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01, WriteAdr],
+ (instregex "^LD1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, multiple, 4 reg, D-form
+// ASIMD load, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX3T110Write_6Cyc_LS01],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_6Cyc_LS01, WriteAdr],
+ (instregex "^LD1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 1 element, one lane, B/H/S
+// ASIMD load, 1 element, one lane, D
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD1i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD1i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 1 element, all lanes, D-form, B/H/S
+// ASIMD load, 1 element, all lanes, D-form, D
+// ASIMD load, 1 element, all lanes, Q-form
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD1Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, multiple, D-form, B/H/S
+// ASIMD load, 2 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 2 element, one lane, B/H
+// ASIMD load, 2 element, one lane, S
+// ASIMD load, 2 element, one lane, D
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD2i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD2i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 2 element, all lanes, D-form, B/H/S
+// ASIMD load, 2 element, all lanes, D-form, D
+// ASIMD load, 2 element, all lanes, Q-form
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_5Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD2Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, multiple, D-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, B/H/S
+// ASIMD load, 3 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 3 element, one lane, B/H
+// ASIMD load, 3 element, one lane, S
+// ASIMD load, 3 element, one lane, D
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123],
+ (instregex "^LD3i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD3i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 3 element, all lanes, D-form, B/H/S
+// ASIMD load, 3 element, all lanes, D-form, D
+// ASIMD load, 3 element, all lanes, Q-form, B/H/S
+// ASIMD load, 3 element, all lanes, Q-form, D
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_7Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD3Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, multiple, D-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, B/H/S
+// ASIMD load, 4 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_8Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD load, 4 element, one lane, B/H
+// ASIMD load, 4 element, one lane, S
+// ASIMD load, 4 element, one lane, D
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123],
+ (instregex "^LD4i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD4i(8|16|32|64)_POST$")>;
+
+// ASIMD load, 4 element, all lanes, D-form, B/H/S
+// ASIMD load, 4 element, all lanes, D-form, D
+// ASIMD load, 4 element, all lanes, Q-form, B/H/S
+// ASIMD load, 4 element, all lanes, Q-form, D
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_6Cyc_LS01_F0123, WriteAdr],
+ (instregex "^LD4Rv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+//--
+// 3.16 ASIMD Store Instructions
+//--
+
+// ASIMD store, 1 element, multiple, 1 reg, D-form
+// ASIMD store, 1 element, multiple, 1 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Onev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 2 reg, D-form
+// ASIMD store, 1 element, multiple, 2 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Twov(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 3 reg, D-form
+// ASIMD store, 1 element, multiple, 3 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Threev(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, multiple, 4 reg, D-form
+// ASIMD store, 1 element, multiple, 4 reg, Q-form
+def : InstRW<[THX3T110Write_1Cyc_LS01],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01, WriteAdr],
+ (instregex "^ST1Fourv(8b|4h|2s|1d|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 1 element, one lane, B/H/S
+// ASIMD store, 1 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST1i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST1i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 2 element, multiple, D-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, B/H/S
+// ASIMD store, 2 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST2Twov(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 2 element, one lane, B/H/S
+// ASIMD store, 2 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST2i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST2i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 3 element, multiple, D-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, B/H/S
+// ASIMD store, 3 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST3Threev(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 3 element, one lane, B/H
+// ASIMD store, 3 element, one lane, S
+// ASIMD store, 3 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST3i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST3i(8|16|32|64)_POST$")>;
+
+// ASIMD store, 4 element, multiple, D-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, B/H/S
+// ASIMD store, 4 element, multiple, Q-form, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST4Fourv(8b|4h|2s|16b|8h|4s|2d)_POST$")>;
+
+// ASIMD store, 4 element, one lane, B/H
+// ASIMD store, 4 element, one lane, S
+// ASIMD store, 4 element, one lane, D
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123],
+ (instregex "^ST4i(8|16|32|64)$")>;
+def : InstRW<[THX3T110Write_1Cyc_LS01_F0123, WriteAdr],
+ (instregex "^ST4i(8|16|32|64)_POST$")>;
+
+// V8.1a Atomics (LSE)
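+// CAS, LDADD/LDCLR/LDEOR/LDSET and SWP are modeled at 4 cycles for the plain
+// forms, 6 cycles for acquire-only or release-only forms and 8 cycles for
+// acquire-release forms; LDSMAX/LDSMIN/LDUMAX/LDUMIN use 4 cycles throughout.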
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs CASB, CASH, CASW, CASX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs CASAB, CASAH, CASAW, CASAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs CASLB, CASLH, CASLW, CASLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs CASALB, CASALH, CASALW, CASALX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDLARB, LDLARH, LDLARW, LDLARX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDADDB, LDADDH, LDADDW, LDADDX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDADDAB, LDADDAH, LDADDAW, LDADDAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDADDLB, LDADDLH, LDADDLW, LDADDLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDADDALB, LDADDALH, LDADDALW, LDADDALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDCLRB, LDCLRH, LDCLRW, LDCLRX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDCLRAB, LDCLRAH, LDCLRAW, LDCLRAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDCLRLB, LDCLRLH, LDCLRLW, LDCLRLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDCLRALB, LDCLRALH, LDCLRALW, LDCLRALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDEORB, LDEORH, LDEORW, LDEORX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDEORAB, LDEORAH, LDEORAW, LDEORAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDEORLB, LDEORLH, LDEORLW, LDEORLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDEORALB, LDEORALH, LDEORALW, LDEORALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDSETB, LDSETH, LDSETW, LDSETX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDSETAB, LDSETAH, LDSETAW, LDSETAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs LDSETLB, LDSETLH, LDSETLW, LDSETLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs LDSETALB, LDSETALH, LDSETALW, LDSETALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDSMAXB, LDSMAXH, LDSMAXW, LDSMAXX,
+ LDSMAXAB, LDSMAXAH, LDSMAXAW, LDSMAXAX,
+ LDSMAXLB, LDSMAXLH, LDSMAXLW, LDSMAXLX,
+ LDSMAXALB, LDSMAXALH, LDSMAXALW, LDSMAXALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDSMINB, LDSMINH, LDSMINW, LDSMINX,
+ LDSMINAB, LDSMINAH, LDSMINAW, LDSMINAX,
+ LDSMINLB, LDSMINLH, LDSMINLW, LDSMINLX,
+ LDSMINALB, LDSMINALH, LDSMINALW, LDSMINALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDUMAXB, LDUMAXH, LDUMAXW, LDUMAXX,
+ LDUMAXAB, LDUMAXAH, LDUMAXAW, LDUMAXAX,
+ LDUMAXLB, LDUMAXLH, LDUMAXLW, LDUMAXLX,
+ LDUMAXALB, LDUMAXALH, LDUMAXALW, LDUMAXALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs LDUMINB, LDUMINH, LDUMINW, LDUMINX,
+ LDUMINAB, LDUMINAH, LDUMINAW, LDUMINAX,
+ LDUMINLB, LDUMINLH, LDUMINLW, LDUMINLX,
+ LDUMINALB, LDUMINALH, LDUMINALW, LDUMINALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs SWPB, SWPH, SWPW, SWPX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs SWPAB, SWPAH, SWPAW, SWPAX)>;
+
+def : InstRW<[THX3T110Write_6Cyc_I0123, WriteAtomic],
+ (instrs SWPLB, SWPLH, SWPLW, SWPLX)>;
+
+def : InstRW<[THX3T110Write_8Cyc_I0123, WriteAtomic],
+ (instrs SWPALB, SWPALH, SWPALW, SWPALX)>;
+
+def : InstRW<[THX3T110Write_4Cyc_I0123, WriteAtomic],
+ (instrs STLLRB, STLLRH, STLLRW, STLLRX)>;
+
+// V8.3a PAC
+def : InstRW<[THX3T110Write_11Cyc_LS01_I1], (instregex "^LDRAA", "^LDRAB")>;
+def : InstRW<[THX3T110Write_8Cyc_I123],
+ (instrs BLRAA, BLRAAZ, BLRAB, BLRABZ,
+ BRAA, BRAAZ, BRAB, BRABZ)>;
+def : InstRW<[THX3T110Write_8Cyc_I123], (instrs RETAA, RETAB)>;
+
+} // SchedModel = ThunderX3T110Model
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
index ba61ed726e84..8f814d185e85 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.cpp
@@ -17,7 +17,7 @@ using namespace llvm;
SDValue AArch64SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *V = dyn_cast<ConstantSDNode>(Src);
@@ -117,7 +117,7 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
MachineFunction &MF = DAG.getMachineFunction();
MachineMemOperand *BaseMemOperand = MF.getMachineMemOperand(
- DstPtrInfo, MachineMemOperand::MOStore, ObjSize, 16);
+ DstPtrInfo, MachineMemOperand::MOStore, ObjSize, Align(16));
bool UseSetTagRangeLoop =
kSetTagLoopThreshold >= 0 && (int)ObjSize >= kSetTagLoopThreshold;
@@ -125,21 +125,18 @@ SDValue AArch64SelectionDAGInfo::EmitTargetCodeForSetTag(
return EmitUnrolledSetTag(DAG, dl, Chain, Addr, ObjSize, BaseMemOperand,
ZeroData);
- if (ObjSize % 32 != 0) {
- SDNode *St1 = DAG.getMachineNode(
- ZeroData ? AArch64::STZGPostIndex : AArch64::STGPostIndex, dl,
- {MVT::i64, MVT::Other},
- {Addr, Addr, DAG.getTargetConstant(1, dl, MVT::i64), Chain});
- DAG.setNodeMemRefs(cast<MachineSDNode>(St1), {BaseMemOperand});
- ObjSize -= 16;
- Addr = SDValue(St1, 0);
- Chain = SDValue(St1, 1);
- }
-
const EVT ResTys[] = {MVT::i64, MVT::i64, MVT::Other};
- SDValue Ops[] = {DAG.getConstant(ObjSize, dl, MVT::i64), Addr, Chain};
- SDNode *St = DAG.getMachineNode(
- ZeroData ? AArch64::STZGloop : AArch64::STGloop, dl, ResTys, Ops);
+
+ unsigned Opcode;
+ if (Addr.getOpcode() == ISD::FrameIndex) {
+ int FI = cast<FrameIndexSDNode>(Addr)->getIndex();
+ Addr = DAG.getTargetFrameIndex(FI, MVT::i64);
+ Opcode = ZeroData ? AArch64::STZGloop : AArch64::STGloop;
+ } else {
+ Opcode = ZeroData ? AArch64::STZGloop_wback : AArch64::STGloop_wback;
+ }
+ SDValue Ops[] = {DAG.getTargetConstant(ObjSize, dl, MVT::i64), Addr, Chain};
+ SDNode *St = DAG.getMachineNode(Opcode, dl, ResTys, Ops);
DAG.setNodeMemRefs(cast<MachineSDNode>(St), {BaseMemOperand});
return SDValue(St, 2);
diff --git a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
index d0967fb973cc..d94fd8471b7b 100644
--- a/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64SelectionDAGInfo.h
@@ -21,7 +21,8 @@ class AArch64SelectionDAGInfo : public SelectionDAGTargetInfo {
public:
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment,
+ bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitTargetCodeForSetTag(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
diff --git a/llvm/lib/Target/AArch64/AArch64StackOffset.h b/llvm/lib/Target/AArch64/AArch64StackOffset.h
index f95b5dc5246e..6fa1c744f77e 100644
--- a/llvm/lib/Target/AArch64/AArch64StackOffset.h
+++ b/llvm/lib/Target/AArch64/AArch64StackOffset.h
@@ -16,6 +16,7 @@
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TypeSize.h"
+#include <cassert>
namespace llvm {
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index 975502818fcd..61f27cbc3b29 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -19,10 +19,13 @@
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/StackSafetyAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/LiveRegUnits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
@@ -44,6 +47,7 @@
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/IR/Metadata.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
@@ -61,6 +65,11 @@ static cl::opt<bool> ClMergeInit(
"stack-tagging-merge-init", cl::Hidden, cl::init(true), cl::ZeroOrMore,
cl::desc("merge stack variable initializers with tagging when possible"));
+static cl::opt<bool>
+ ClUseStackSafety("stack-tagging-use-stack-safety", cl::Hidden,
+ cl::init(true), cl::ZeroOrMore,
+ cl::desc("Use Stack Safety analysis results"));
+
static cl::opt<unsigned> ClScanLimit("stack-tagging-merge-init-scan-limit",
cl::init(40), cl::Hidden);
@@ -256,8 +265,9 @@ public:
Type *EltTy = VecTy->getElementType();
if (EltTy->isPointerTy()) {
uint32_t EltSize = DL->getTypeSizeInBits(EltTy);
- Type *NewTy = VectorType::get(IntegerType::get(Ctx, EltSize),
- VecTy->getNumElements());
+ auto *NewTy = FixedVectorType::get(
+ IntegerType::get(Ctx, EltSize),
+ cast<FixedVectorType>(VecTy)->getNumElements());
V = IRB.CreatePointerCast(V, NewTy);
}
}
@@ -275,15 +285,17 @@ class AArch64StackTagging : public FunctionPass {
int Tag; // -1 for non-tagged allocations
};
- bool MergeInit;
+ const bool MergeInit;
+ const bool UseStackSafety;
public:
static char ID; // Pass ID, replacement for typeid
- AArch64StackTagging(bool MergeInit = true)
+ AArch64StackTagging(bool IsOptNone = false)
: FunctionPass(ID),
- MergeInit(ClMergeInit.getNumOccurrences() > 0 ? ClMergeInit
- : MergeInit) {
+ MergeInit(ClMergeInit.getNumOccurrences() ? ClMergeInit : !IsOptNone),
+ UseStackSafety(ClUseStackSafety.getNumOccurrences() ? ClUseStackSafety
+ : !IsOptNone) {
initializeAArch64StackTaggingPass(*PassRegistry::getPassRegistry());
}
@@ -305,13 +317,16 @@ public:
StringRef getPassName() const override { return "AArch64 Stack Tagging"; }
private:
- Function *F;
- Function *SetTagFunc;
- const DataLayout *DL;
- AAResults *AA;
+ Function *F = nullptr;
+ Function *SetTagFunc = nullptr;
+ const DataLayout *DL = nullptr;
+ AAResults *AA = nullptr;
+ const StackSafetyGlobalInfo *SSI = nullptr;
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ if (UseStackSafety)
+ AU.addRequired<StackSafetyGlobalInfoWrapperPass>();
if (MergeInit)
AU.addRequired<AAResultsWrapperPass>();
}
@@ -323,11 +338,13 @@ char AArch64StackTagging::ID = 0;
INITIALIZE_PASS_BEGIN(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
false, false)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(StackSafetyGlobalInfoWrapperPass)
INITIALIZE_PASS_END(AArch64StackTagging, DEBUG_TYPE, "AArch64 Stack Tagging",
false, false)
-FunctionPass *llvm::createAArch64StackTaggingPass(bool MergeInit) {
- return new AArch64StackTagging(MergeInit);
+FunctionPass *llvm::createAArch64StackTaggingPass(bool IsOptNone) {
+ return new AArch64StackTagging(IsOptNone);
}
Instruction *AArch64StackTagging::collectInitializers(Instruction *StartInst,
@@ -400,7 +417,9 @@ bool AArch64StackTagging::isInterestingAlloca(const AllocaInst &AI) {
// dynamic alloca instrumentation for them as well.
!AI.isUsedWithInAlloca() &&
// swifterror allocas are register promoted by ISel
- !AI.isSwiftError();
+ !AI.isSwiftError() &&
+ // safe allocas are not interesting
+ !(SSI && SSI->isSafe(AI));
return IsInteresting;
}
@@ -482,7 +501,7 @@ void AArch64StackTagging::alignAndPadAlloca(AllocaInfo &Info) {
auto *NewAI = new AllocaInst(
TypeWithPadding, Info.AI->getType()->getAddressSpace(), nullptr, "", Info.AI);
NewAI->takeName(Info.AI);
- NewAI->setAlignment(MaybeAlign(Info.AI->getAlignment()));
+ NewAI->setAlignment(Info.AI->getAlign());
NewAI->setUsedWithInAlloca(Info.AI->isUsedWithInAlloca());
NewAI->setSwiftError(Info.AI->isSwiftError());
NewAI->copyMetadata(*Info.AI);
@@ -516,6 +535,8 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
if (!Fn.hasFnAttribute(Attribute::SanitizeMemTag))
return false;
+ if (UseStackSafety)
+ SSI = &getAnalysis<StackSafetyGlobalInfoWrapperPass>().getResult();
F = &Fn;
DL = &Fn.getParent()->getDataLayout();
if (MergeInit)
diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
index 5deb601822b8..a94856ef4fba 100644
--- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp
@@ -149,7 +149,9 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) {
continue;
const MachineOperand *BaseOp;
int64_t Offset;
- if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, TRI) &&
+ bool OffsetIsScalable;
+ if (TII->getMemOperandWithOffset(MI, BaseOp, Offset, OffsetIsScalable,
+ TRI) &&
BaseOp->isReg()) {
Register BaseReg = BaseOp->getReg();
if (PrevBaseReg == BaseReg) {
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
index 3636d8d2b628..029535cb98b5 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp
@@ -13,12 +13,12 @@
#include "AArch64Subtarget.h"
#include "AArch64.h"
-#include "AArch64CallLowering.h"
#include "AArch64InstrInfo.h"
-#include "AArch64LegalizerInfo.h"
#include "AArch64PBQPRegAlloc.h"
-#include "AArch64RegisterBankInfo.h"
#include "AArch64TargetMachine.h"
+#include "GISel/AArch64CallLowering.h"
+#include "GISel/AArch64LegalizerInfo.h"
+#include "GISel/AArch64RegisterBankInfo.h"
#include "MCTargetDesc/AArch64AddressingModes.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/MachineScheduler.h"
@@ -47,6 +47,18 @@ static cl::opt<bool>
cl::desc("Call nonlazybind functions via direct GOT load"),
cl::init(false), cl::Hidden);
+static cl::opt<unsigned> SVEVectorBitsMax(
+ "aarch64-sve-vector-bits-max",
+ cl::desc("Assume SVE vector registers are at most this big, "
+ "with zero meaning no maximum size is assumed."),
+ cl::init(0), cl::Hidden);
+
+static cl::opt<unsigned> SVEVectorBitsMin(
+ "aarch64-sve-vector-bits-min",
+ cl::desc("Assume SVE vector registers are at least this big, "
+ "with zero meaning no minimum size is assumed."),
+ cl::init(0), cl::Hidden);
+
AArch64Subtarget &
AArch64Subtarget::initializeSubtargetDependencies(StringRef FS,
StringRef CPUString) {
@@ -68,6 +80,9 @@ void AArch64Subtarget::initializeProperties() {
switch (ARMProcFamily) {
case Others:
break;
+ case Carmel:
+ CacheLineSize = 64;
+ break;
case CortexA35:
break;
case CortexA53:
@@ -86,8 +101,16 @@ void AArch64Subtarget::initializeProperties() {
case CortexA73:
case CortexA75:
case CortexA76:
+ case CortexA77:
+ case CortexA78:
+ case CortexX1:
PrefFunctionLogAlignment = 4;
break;
+ case A64FX:
+ CacheLineSize = 256;
+ PrefFunctionLogAlignment = 5;
+ PrefLoopLogAlignment = 5;
+ break;
case AppleA7:
case AppleA10:
case AppleA11:
@@ -160,6 +183,17 @@ void AArch64Subtarget::initializeProperties() {
PrefFunctionLogAlignment = 4;
PrefLoopLogAlignment = 2;
break;
+ case ThunderX3T110:
+ CacheLineSize = 64;
+ PrefFunctionLogAlignment = 4;
+ PrefLoopLogAlignment = 2;
+ MaxInterleaveFactor = 4;
+ PrefetchDistance = 128;
+ MinPrefetchStride = 1024;
+ MaxPrefetchIterationsAhead = 4;
+ // FIXME: remove this to enable 64-bit SLP if performance looks good.
+ MinVectorRegisterBitWidth = 128;
+ break;
}
}
@@ -177,6 +211,7 @@ AArch64Subtarget::AArch64Subtarget(const Triple &TT, const std::string &CPU,
ReserveXRegister.set(18);
CallLoweringInfo.reset(new AArch64CallLowering(*getTargetLowering()));
+ InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
Legalizer.reset(new AArch64LegalizerInfo(*this));
auto *RBI = new AArch64RegisterBankInfo(*getRegisterInfo());
@@ -194,6 +229,10 @@ const CallLowering *AArch64Subtarget::getCallLowering() const {
return CallLoweringInfo.get();
}
+const InlineAsmLowering *AArch64Subtarget::getInlineAsmLowering() const {
+ return InlineAsmLoweringInfo.get();
+}
+
InstructionSelector *AArch64Subtarget::getInstructionSelector() const {
return InstSelector.get();
}
@@ -305,3 +344,25 @@ void AArch64Subtarget::mirFileLoaded(MachineFunction &MF) const {
if (!MFI.isMaxCallFrameSizeComputed())
MFI.computeMaxCallFrameSize(MF);
}
+
+unsigned AArch64Subtarget::getMaxSVEVectorSizeInBits() const {
+ assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+ assert(SVEVectorBitsMax % 128 == 0 &&
+ "SVE requires vector length in multiples of 128!");
+ assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
+ "Minimum SVE vector size should not be larger than its maximum!");
+ if (SVEVectorBitsMax == 0)
+ return 0;
+ return (std::max(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
+}
+
+unsigned AArch64Subtarget::getMinSVEVectorSizeInBits() const {
+ assert(HasSVE && "Tried to get SVE vector length without SVE support!");
+ assert(SVEVectorBitsMin % 128 == 0 &&
+ "SVE requires vector length in multiples of 128!");
+ assert((SVEVectorBitsMax >= SVEVectorBitsMin || SVEVectorBitsMax == 0) &&
+ "Minimum SVE vector size should not be larger than its maximum!");
+ if (SVEVectorBitsMax == 0)
+ return (SVEVectorBitsMin / 128) * 128;
+ return (std::min(SVEVectorBitsMin, SVEVectorBitsMax) / 128) * 128;
+}
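A standalone sketch (not from the patch) mirroring the clamping above: 0 for either option means that bound is unknown, and both results round down to a multiple of 128 bits. minBits/maxBits stand in for the -aarch64-sve-vector-bits-min/-max values.

#include <algorithm>
#include <cassert>

unsigned maxSVEBits(unsigned minBits, unsigned maxBits) {
  if (maxBits == 0)
    return 0;                                      // no upper bound known
  return (std::max(minBits, maxBits) / 128) * 128;
}

unsigned minSVEBits(unsigned minBits, unsigned maxBits) {
  if (maxBits == 0)
    return (minBits / 128) * 128;
  return (std::min(minBits, maxBits) / 128) * 128;
}

int main() {
  // Only a minimum given: the maximum stays unknown (0).
  assert(minSVEBits(256, 0) == 256 && maxSVEBits(256, 0) == 0);
  // Equal bounds pin the assumed SVE register width.
  assert(minSVEBits(512, 512) == 512 && maxSVEBits(512, 512) == 512);
  return 0;
}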
diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.h b/llvm/lib/Target/AArch64/AArch64Subtarget.h
index 79c2c161d3cb..b111f0016948 100644
--- a/llvm/lib/Target/AArch64/AArch64Subtarget.h
+++ b/llvm/lib/Target/AArch64/AArch64Subtarget.h
@@ -19,6 +19,7 @@
#include "AArch64RegisterInfo.h"
#include "AArch64SelectionDAGInfo.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -38,11 +39,13 @@ class AArch64Subtarget final : public AArch64GenSubtargetInfo {
public:
enum ARMProcFamilyEnum : uint8_t {
Others,
+ A64FX,
AppleA7,
AppleA10,
AppleA11,
AppleA12,
AppleA13,
+ Carmel,
CortexA35,
CortexA53,
CortexA55,
@@ -52,6 +55,9 @@ public:
CortexA73,
CortexA75,
CortexA76,
+ CortexA77,
+ CortexA78,
+ CortexX1,
ExynosM3,
Falkor,
Kryo,
@@ -63,7 +69,8 @@ public:
ThunderXT81,
ThunderXT83,
ThunderXT88,
- TSV110
+ TSV110,
+ ThunderX3T110
};
protected:
@@ -75,6 +82,7 @@ protected:
bool HasV8_3aOps = false;
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
+ bool HasV8_6aOps = false;
bool HasFPARMv8 = false;
bool HasNEON = false;
@@ -99,6 +107,10 @@ protected:
bool HasPAN_RWV = false;
bool HasCCPP = false;
+ // SVE extensions
+ bool HasSVE = false;
+ bool UseExperimentalZeroingPseudos = false;
+
// Armv8.2 Crypto extensions
bool HasSM4 = false;
bool HasSHA3 = false;
@@ -125,8 +137,6 @@ protected:
bool HasRCPC_IMMO = false;
bool HasLSLFast = false;
- bool HasSVE = false;
- bool HasSVE2 = false;
bool HasRCPC = false;
bool HasAggressiveFMA = false;
@@ -143,7 +153,17 @@ protected:
bool HasMTE = false;
bool HasTME = false;
+ // Armv8.6-A Extensions
+ bool HasBF16 = false;
+ bool HasMatMulInt8 = false;
+ bool HasMatMulFP32 = false;
+ bool HasMatMulFP64 = false;
+ bool HasAMVS = false;
+ bool HasFineGrainedTraps = false;
+ bool HasEnhancedCounterVirtualization = false;
+
// Arm SVE2 extensions
+ bool HasSVE2 = false;
bool HasSVE2AES = false;
bool HasSVE2SM4 = false;
bool HasSVE2SHA3 = false;
@@ -196,6 +216,8 @@ protected:
bool UseEL2ForTP = false;
bool UseEL3ForTP = false;
bool AllowTaggedGlobals = false;
+ bool HardenSlsRetBr = false;
+ bool HardenSlsBlr = false;
uint8_t MaxInterleaveFactor = 2;
uint8_t VectorInsertExtractBaseCost = 3;
uint16_t CacheLineSize = 0;
@@ -225,6 +247,7 @@ protected:
/// GlobalISel related APIs.
std::unique_ptr<CallLowering> CallLoweringInfo;
+ std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;
@@ -260,6 +283,7 @@ public:
return &getInstrInfo()->getRegisterInfo();
}
const CallLowering *getCallLowering() const override;
+ const InlineAsmLowering *getInlineAsmLowering() const override;
InstructionSelector *getInstructionSelector() const override;
const LegalizerInfo *getLegalizerInfo() const override;
const RegisterBankInfo *getRegBankInfo() const override;
@@ -347,6 +371,9 @@ public:
hasFuseCCSelect() || hasFuseLiterals();
}
+ bool hardenSlsRetBr() const { return HardenSlsRetBr; }
+ bool hardenSlsBlr() const { return HardenSlsBlr; }
+
bool useEL1ForTP() const { return UseEL1ForTP; }
bool useEL2ForTP() const { return UseEL2ForTP; }
bool useEL3ForTP() const { return UseEL3ForTP; }
@@ -359,7 +386,12 @@ public:
}
unsigned getCacheLineSize() const override { return CacheLineSize; }
unsigned getPrefetchDistance() const override { return PrefetchDistance; }
- unsigned getMinPrefetchStride() const override { return MinPrefetchStride; }
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const override {
+ return MinPrefetchStride;
+ }
unsigned getMaxPrefetchIterationsAhead() const override {
return MaxPrefetchIterationsAhead;
}
@@ -372,6 +404,10 @@ public:
unsigned getWideningBaseCost() const { return WideningBaseCost; }
+ bool useExperimentalZeroingPseudos() const {
+ return UseExperimentalZeroingPseudos;
+ }
+
/// CPU has TBI (top byte of addresses is ignored during HW address
/// translation) and OS enables it.
bool supportsAddressTopByteIgnored() const;
@@ -401,6 +437,16 @@ public:
bool hasSVE2SM4() const { return HasSVE2SM4; }
bool hasSVE2SHA3() const { return HasSVE2SHA3; }
bool hasSVE2BitPerm() const { return HasSVE2BitPerm; }
+ bool hasMatMulInt8() const { return HasMatMulInt8; }
+ bool hasMatMulFP32() const { return HasMatMulFP32; }
+ bool hasMatMulFP64() const { return HasMatMulFP64; }
+
+ // Armv8.6-A Extensions
+ bool hasBF16() const { return HasBF16; }
+ bool hasFineGrainedTraps() const { return HasFineGrainedTraps; }
+ bool hasEnhancedCounterVirtualization() const {
+ return HasEnhancedCounterVirtualization;
+ }
bool isLittleEndian() const { return IsLittle; }
@@ -438,6 +484,7 @@ public:
bool hasDIT() const { return HasDIT; }
bool hasTRACEV8_4() const { return HasTRACEV8_4; }
bool hasAM() const { return HasAM; }
+ bool hasAMVS() const { return HasAMVS; }
bool hasSEL2() const { return HasSEL2; }
bool hasPMU() const { return HasPMU; }
bool hasTLB_RMI() const { return HasTLB_RMI; }
@@ -497,6 +544,12 @@ public:
}
void mirFileLoaded(MachineFunction &MF) const override;
+
+ // Return the known range for the bit length of SVE data registers. A value
+ // of 0 means nothing is known about that particular limit beyond what's
+ // implied by the architecture.
+ unsigned getMaxSVEVectorSizeInBits() const;
+ unsigned getMinSVEVectorSizeInBits() const;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/AArch64/AArch64SystemOperands.td b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
index 6e82d326e519..ceceabc6ff4e 100644
--- a/llvm/lib/Target/AArch64/AArch64SystemOperands.td
+++ b/llvm/lib/Target/AArch64/AArch64SystemOperands.td
@@ -18,18 +18,18 @@ include "llvm/TableGen/SearchableTable.td"
//===----------------------------------------------------------------------===//
def HasCCPP : Predicate<"Subtarget->hasCCPP()">,
- AssemblerPredicate<"FeatureCCPP", "ccpp">;
+ AssemblerPredicate<(all_of FeatureCCPP), "ccpp">;
def HasPAN : Predicate<"Subtarget->hasPAN()">,
- AssemblerPredicate<"FeaturePAN",
+ AssemblerPredicate<(all_of FeaturePAN),
"ARM v8.1 Privileged Access-Never extension">;
def HasPsUAO : Predicate<"Subtarget->hasPsUAO()">,
- AssemblerPredicate<"FeaturePsUAO",
+ AssemblerPredicate<(all_of FeaturePsUAO),
"ARM v8.2 UAO PState extension (psuao)">;
def HasPAN_RWV : Predicate<"Subtarget->hasPAN_RWV()">,
- AssemblerPredicate<"FeaturePAN_RWV",
+ AssemblerPredicate<(all_of FeaturePAN_RWV),
"ARM v8.2 PAN AT S1E1R and AT S1E1W Variation">;
//===----------------------------------------------------------------------===//
@@ -338,7 +338,7 @@ def : PState<"PAN", 0b00100>;
// v8.2a "User Access Override" extension-specific PStates
let Requires = [{ {AArch64::FeaturePsUAO} }] in
def : PState<"UAO", 0b00011>;
-// v8.4a timining insensitivity of data processing instructions
+// v8.4a timing insensitivity of data processing instructions
let Requires = [{ {AArch64::FeatureDIT} }] in
def : PState<"DIT", 0b11010>;
// v8.5a Spectre Mitigation
@@ -844,7 +844,7 @@ def : RWSysReg<"SP_EL2", 0b11, 0b110, 0b0100, 0b0001, 0b000>;
def : RWSysReg<"SPSel", 0b11, 0b000, 0b0100, 0b0010, 0b000>;
def : RWSysReg<"NZCV", 0b11, 0b011, 0b0100, 0b0010, 0b000>;
def : RWSysReg<"DAIF", 0b11, 0b011, 0b0100, 0b0010, 0b001>;
-def : RWSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
+def : ROSysReg<"CurrentEL", 0b11, 0b000, 0b0100, 0b0010, 0b010>;
def : RWSysReg<"SPSR_irq", 0b11, 0b100, 0b0100, 0b0011, 0b000>;
def : RWSysReg<"SPSR_abt", 0b11, 0b100, 0b0100, 0b0011, 0b001>;
def : RWSysReg<"SPSR_und", 0b11, 0b100, 0b0100, 0b0011, 0b010>;
@@ -1167,7 +1167,6 @@ def : RWSysReg<"ICC_SRE_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b101>;
def : RWSysReg<"ICC_IGRPEN0_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b110>;
def : RWSysReg<"ICC_IGRPEN1_EL1", 0b11, 0b000, 0b1100, 0b1100, 0b111>;
def : RWSysReg<"ICC_IGRPEN1_EL3", 0b11, 0b110, 0b1100, 0b1100, 0b111>;
-def : RWSysReg<"ICC_SEIEN_EL1", 0b11, 0b000, 0b1100, 0b1101, 0b000>;
def : RWSysReg<"ICC_AP0R0_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b100>;
def : RWSysReg<"ICC_AP0R1_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b101>;
def : RWSysReg<"ICC_AP0R2_EL1", 0b11, 0b000, 0b1100, 0b1000, 0b110>;
@@ -1185,9 +1184,8 @@ def : RWSysReg<"ICH_AP1R1_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b001>;
def : RWSysReg<"ICH_AP1R2_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b010>;
def : RWSysReg<"ICH_AP1R3_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b011>;
def : RWSysReg<"ICH_HCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b000>;
-def : RWSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
+def : ROSysReg<"ICH_MISR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b010>;
def : RWSysReg<"ICH_VMCR_EL2", 0b11, 0b100, 0b1100, 0b1011, 0b111>;
-def : RWSysReg<"ICH_VSEIR_EL2", 0b11, 0b100, 0b1100, 0b1001, 0b100>;
def : RWSysReg<"ICH_LR0_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b000>;
def : RWSysReg<"ICH_LR1_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b001>;
def : RWSysReg<"ICH_LR2_EL2", 0b11, 0b100, 0b1100, 0b1100, 0b010>;
@@ -1260,7 +1258,7 @@ let Requires = [{ {AArch64::FeatureSPE} }] in {
def : RWSysReg<"PMBLIMITR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b000>;
def : RWSysReg<"PMBPTR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b001>;
def : RWSysReg<"PMBSR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b011>;
-def : RWSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
+def : ROSysReg<"PMBIDR_EL1", 0b11, 0b000, 0b1001, 0b1010, 0b111>;
def : RWSysReg<"PMSCR_EL2", 0b11, 0b100, 0b1001, 0b1001, 0b000>;
def : RWSysReg<"PMSCR_EL12", 0b11, 0b101, 0b1001, 0b1001, 0b000>;
def : RWSysReg<"PMSCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b000>;
@@ -1269,7 +1267,7 @@ def : RWSysReg<"PMSIRR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b011>;
def : RWSysReg<"PMSFCR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b100>;
def : RWSysReg<"PMSEVFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b101>;
def : RWSysReg<"PMSLATFR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b110>;
-def : RWSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
+def : ROSysReg<"PMSIDR_EL1", 0b11, 0b000, 0b1001, 0b1001, 0b111>;
}
// v8.2a "RAS extension" registers
@@ -1333,7 +1331,6 @@ def : RWSysReg<"PMMIR_EL1", 0b11, 0b000, 0b1001, 0b1110, 0b110>;
let Requires = [{ {AArch64::FeatureRASv8_4} }] in {
def : RWSysReg<"ERXPFGCTL_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b101>;
def : RWSysReg<"ERXPFGCDN_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b110>;
-def : RWSysReg<"ERXTS_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b111>;
def : RWSysReg<"ERXMISC2_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b010>;
def : RWSysReg<"ERXMISC3_EL1", 0b11, 0b000, 0b0101, 0b0101, 0b011>;
def : ROSysReg<"ERXPFGF_EL1", 0b11, 0b000, 0b0101, 0b0100, 0b100>;
@@ -1360,7 +1357,7 @@ def : RWSysReg<"MPAMVPM7_EL2", 0b11, 0b100, 0b1010, 0b0110, 0b111>;
def : ROSysReg<"MPAMIDR_EL1", 0b11, 0b000, 0b1010, 0b0100, 0b100>;
} //FeatureMPAM
-// v8.4a Activitiy Monitor registers
+// v8.4a Activity Monitor registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureAM} }] in {
def : RWSysReg<"AMCR_EL0", 0b11, 0b011, 0b1101, 0b0010, 0b000>;
@@ -1426,7 +1423,7 @@ def : RWSysReg<"TRFCR_EL2", 0b11, 0b100, 0b0001, 0b0010, 0b001>;
def : RWSysReg<"TRFCR_EL12", 0b11, 0b101, 0b0001, 0b0010, 0b001>;
} //FeatureTRACEV8_4
-// v8.4a Timining insensitivity of data processing instructions
+// v8.4a Timing insensitivity of data processing instructions
// DIT: Data Independent Timing instructions
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::FeatureDIT} }] in {
@@ -1490,6 +1487,41 @@ def : RWSysReg<"TRBTRG_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b110>;
def : ROSysReg<"TRBIDR_EL1", 0b11, 0b000, 0b1001, 0b1011, 0b111>;
} // FeatureTRBE
+
+// v8.6a Activity Monitors Virtualization Support
+let Requires = [{ {AArch64::FeatureAMVS} }] in {
+foreach n = 0-15 in {
+ foreach x = 0-1 in {
+ def : RWSysReg<"AMEVCNTVOFF"#x#n#"_EL2",
+ 0b11, 0b100, 0b1101, 0b1000, 0b000>{
+ let Encoding{4} = x;
+ let Encoding{3-0} = n;
+ }
+ }
+}
+}
+
+// v8.6a Fine Grained Virtualization Traps
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureFineGrainedTraps} }] in {
+def : RWSysReg<"HFGRTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b100>;
+def : RWSysReg<"HFGWTR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b101>;
+def : RWSysReg<"HFGITR_EL2", 0b11, 0b100, 0b0001, 0b0001, 0b110>;
+def : RWSysReg<"HDFGRTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b100>;
+def : RWSysReg<"HDFGWTR_EL2", 0b11, 0b100, 0b0011, 0b0001, 0b101>;
+}
+
+// v8.6a Enhanced Counter Virtualization
+// Op0 Op1 CRn CRm Op2
+let Requires = [{ {AArch64::FeatureEnhancedCounterVirtualization} }] in {
+def : RWSysReg<"CNTSCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b100>;
+def : RWSysReg<"CNTISCALE_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b101>;
+def : RWSysReg<"CNTPOFF_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b110>;
+def : RWSysReg<"CNTVFRQ_EL2", 0b11, 0b100, 0b1110, 0b0000, 0b111>;
+def : RWSysReg<"CNTPCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b101>;
+def : RWSysReg<"CNTVCTSS_EL0", 0b11, 0b011, 0b1110, 0b0000, 0b110>;
+}
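A small sketch (not part of the patch) of what the AMEVCNTVOFF foreach above expands to: 32 registers named AMEVCNTVOFF<x><n>_EL2 for x in {0,1} and n in 0..15, each overriding only the low five encoding bits (bit 4 = x, bits 3..0 = n) of the base RWSysReg encoding.

#include <cstdio>

int main() {
  for (unsigned x = 0; x <= 1; ++x)
    for (unsigned n = 0; n <= 15; ++n)
      std::printf("AMEVCNTVOFF%u%u_EL2 : low encoding bits 0x%02x\n", x, n,
                  (x << 4) | n);
  return 0;
}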
+
// Cyclone specific system registers
// Op0 Op1 CRn CRm Op2
let Requires = [{ {AArch64::ProcAppleA7} }] in
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 115a7da8a6d9..a63b9a97ada5 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -11,6 +11,7 @@
#include "AArch64TargetMachine.h"
#include "AArch64.h"
+#include "AArch64MachineFunctionInfo.h"
#include "AArch64MacroFusion.h"
#include "AArch64Subtarget.h"
#include "AArch64TargetObjectFile.h"
@@ -26,6 +27,7 @@
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
+#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/MachineScheduler.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -146,6 +148,11 @@ static cl::opt<int> EnableGlobalISelAtO(
cl::desc("Enable GlobalISel at or below an opt level (-1 to disable)"),
cl::init(0));
+static cl::opt<bool> EnableSVEIntrinsicOpts(
+ "aarch64-sve-intrinsic-opts", cl::Hidden,
+ cl::desc("Enable SVE intrinsic opts"),
+ cl::init(true));
+
static cl::opt<bool> EnableFalkorHWPFFix("aarch64-enable-falkor-hwpf-fix",
cl::init(true), cl::Hidden);
@@ -176,13 +183,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAArch64Target() {
initializeAArch64LoadStoreOptPass(*PR);
initializeAArch64SIMDInstrOptPass(*PR);
initializeAArch64PreLegalizerCombinerPass(*PR);
+ initializeAArch64PostLegalizerCombinerPass(*PR);
initializeAArch64PromoteConstantPass(*PR);
initializeAArch64RedundantCopyEliminationPass(*PR);
initializeAArch64StorePairSuppressPass(*PR);
initializeFalkorHWPFFixPass(*PR);
initializeFalkorMarkStridedAccessesLegacyPass(*PR);
initializeLDTLSCleanupPass(*PR);
+ initializeSVEIntrinsicOptsPass(*PR);
initializeAArch64SpeculationHardeningPass(*PR);
+ initializeAArch64SLSHardeningPass(*PR);
initializeAArch64StackTaggingPass(*PR);
initializeAArch64StackTaggingPreRAPass(*PR);
}
@@ -236,12 +246,8 @@ getEffectiveAArch64CodeModel(const Triple &TT, Optional<CodeModel::Model> CM,
if (CM) {
if (*CM != CodeModel::Small && *CM != CodeModel::Tiny &&
*CM != CodeModel::Large) {
- if (!TT.isOSFuchsia())
- report_fatal_error(
- "Only small, tiny and large code models are allowed on AArch64");
- else if (*CM != CodeModel::Kernel)
- report_fatal_error("Only small, tiny, kernel, and large code models "
- "are allowed on AArch64");
+ report_fatal_error(
+ "Only small, tiny and large code models are allowed on AArch64");
} else if (*CM == CodeModel::Tiny && !TT.isOSBinFormatELF())
report_fatal_error("tiny code model is only supported on ELF");
return *CM;
@@ -313,6 +319,9 @@ AArch64TargetMachine::AArch64TargetMachine(const Target &T, const Triple &TT,
// AArch64 supports default outlining behaviour.
setSupportsDefaultOutlining(true);
+
+ // AArch64 supports the debug entry values.
+ setSupportsDebugEntryValues(true);
}
AArch64TargetMachine::~AArch64TargetMachine() = default;
@@ -403,6 +412,7 @@ public:
bool addIRTranslator() override;
void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
+ void addPreRegBankSelect() override;
bool addRegBankSelect() override;
void addPreGlobalInstructionSelect() override;
bool addGlobalInstructionSelect() override;
@@ -435,6 +445,10 @@ void AArch64PassConfig::addIRPasses() {
// ourselves.
addPass(createAtomicExpandPass());
+ // Expand any SVE vector library calls that we can't code generate directly.
+ if (EnableSVEIntrinsicOpts && TM->getOptLevel() == CodeGenOpt::Aggressive)
+ addPass(createSVEIntrinsicOptsPass());
+
// Cmpxchg instructions are often used with a subsequent comparison to
// determine whether it succeeded. We can exploit existing control-flow in
// ldrex/strex loops to simplify this, but it needs tidying up.
@@ -454,6 +468,9 @@ void AArch64PassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
+ addPass(createAArch64StackTaggingPass(
+ /*IsOptNone=*/TM->getOptLevel() == CodeGenOpt::None));
+
// Match interleaved memory accesses to ldN/stN intrinsics.
if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createInterleavedLoadCombinePass());
@@ -473,9 +490,6 @@ void AArch64PassConfig::addIRPasses() {
addPass(createLICMPass());
}
- addPass(createAArch64StackTaggingPass(/* MergeInit = */ TM->getOptLevel() !=
- CodeGenOpt::None));
-
// Add Control Flow Guard checks.
if (TM->getTargetTriple().isOSWindows())
addPass(createCFGuardCheckPass());
@@ -541,6 +555,14 @@ bool AArch64PassConfig::addLegalizeMachineIR() {
return false;
}
+void AArch64PassConfig::addPreRegBankSelect() {
+ // For now we don't add this to the pipeline for -O0. We could do so in the
+ // future if we split the combines into separate O0/opt groupings.
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ if (!IsOptNone)
+ addPass(createAArch64PostLegalizeCombiner(IsOptNone));
+}
+
bool AArch64PassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
@@ -614,6 +636,9 @@ void AArch64PassConfig::addPreSched2() {
// info.
addPass(createAArch64SpeculationHardeningPass());
+ addPass(createAArch64IndirectThunks());
+ addPass(createAArch64SLSHardeningPass());
+
if (TM->getOptLevel() != CodeGenOpt::None) {
if (EnableFalkorHWPFFix)
addPass(createFalkorHWPFFixPass());
@@ -648,4 +673,28 @@ void AArch64PassConfig::addPreEmitPass() {
if (TM->getOptLevel() != CodeGenOpt::None && EnableCollectLOH &&
TM->getTargetTriple().isOSBinFormatMachO())
addPass(createAArch64CollectLOHPass());
+
+ // SVE bundles move prefixes with destructive operations.
+ addPass(createUnpackMachineBundles(nullptr));
+}
+
+yaml::MachineFunctionInfo *
+AArch64TargetMachine::createDefaultFuncInfoYAML() const {
+ return new yaml::AArch64FunctionInfo();
+}
+
+yaml::MachineFunctionInfo *
+AArch64TargetMachine::convertFuncInfoToYAML(const MachineFunction &MF) const {
+ const auto *MFI = MF.getInfo<AArch64FunctionInfo>();
+ return new yaml::AArch64FunctionInfo(*MFI);
+}
+
+bool AArch64TargetMachine::parseMachineFunctionInfo(
+ const yaml::MachineFunctionInfo &MFI, PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error, SMRange &SourceRange) const {
+ const auto &YamlMFI =
+ reinterpret_cast<const yaml::AArch64FunctionInfo &>(MFI);
+ MachineFunction &MF = PFS.MF;
+ MF.getInfo<AArch64FunctionInfo>()->initializeBaseYamlFields(YamlMFI);
+ return false;
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.h b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
index 5264efb89b9c..7738a4229391 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.h
@@ -49,6 +49,14 @@ public:
return TLOF.get();
}
+ yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
+ yaml::MachineFunctionInfo *
+ convertFuncInfoToYAML(const MachineFunction &MF) const override;
+ bool parseMachineFunctionInfo(const yaml::MachineFunctionInfo &,
+ PerFunctionMIParsingState &PFS,
+ SMDiagnostic &Error,
+ SMRange &SourceRange) const override;
+
private:
bool isLittle;
};
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
index 54562094fcf5..dfc66f0cb4c1 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.cpp
@@ -20,7 +20,6 @@ using namespace dwarf;
void AArch64_ELFTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
// AARCH64 ELF ABI does not define static relocation type for TLS offset
// within a module. Do not generate AT_location for TLS variables.
SupportDebugThreadLocalLocation = false;
@@ -43,7 +42,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getTTypeGlobalReference(
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
MCSymbol *PCSym = getContext().createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
@@ -68,7 +67,7 @@ const MCExpr *AArch64_MachoTargetObjectFile::getIndirectSymViaGOTPCRel(
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, getContext());
MCSymbol *PCSym = getContext().createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Res, PC, getContext());
}
diff --git a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
index 1cb4c028c80d..28324c2ae608 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetObjectFile.h
@@ -18,6 +18,11 @@ class AArch64TargetMachine;
/// This implementation is used for AArch64 ELF targets (Linux in particular).
class AArch64_ELFTargetObjectFile : public TargetLoweringObjectFileELF {
void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
+
+public:
+ AArch64_ELFTargetObjectFile() {
+ PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
+ }
};
/// AArch64_MachoTargetObjectFile - This TLOF implementation is used for Darwin.
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 4724d6b8daea..cf6de797727b 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -57,7 +57,8 @@ int AArch64TTIImpl::getIntImmCost(int64_t Val) {
}
/// Calculate the cost of materializing the given constant.
-int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -82,7 +83,8 @@ int AArch64TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -139,16 +141,17 @@ int AArch64TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
if (Idx == ImmIdx) {
int NumConstants = (BitSize + 63) / 64;
- int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
- return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -161,7 +164,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
// selected instruction, so we compute the materialization cost for the
// immediate directly.
if (IID >= Intrinsic::aarch64_addg && IID <= Intrinsic::aarch64_udiv)
- return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
switch (IID) {
default:
@@ -174,7 +177,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
case Intrinsic::umul_with_overflow:
if (Idx == 1) {
int NumConstants = (BitSize + 63) / 64;
- int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
@@ -190,7 +193,7 @@ int AArch64TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return AArch64TTIImpl::getIntImmCost(Imm, Ty);
+ return AArch64TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
TargetTransformInfo::PopcntSupportKind
@@ -208,8 +211,8 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
// A helper that returns a vector type from the given type. The number of
// elements in type Ty determine the vector width.
auto toVectorTy = [&](Type *ArgTy) {
- return VectorType::get(ArgTy->getScalarType(),
- DstTy->getVectorNumElements());
+ return FixedVectorType::get(ArgTy->getScalarType(),
+ cast<FixedVectorType>(DstTy)->getNumElements());
};
// Exit early if DstTy is not a vector type whose elements are at least
@@ -251,7 +254,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
// Legalize the source type and ensure it can be used in a widening
// operation.
- Type *SrcTy = toVectorTy(Extend->getSrcTy());
+ auto *SrcTy = toVectorTy(Extend->getSrcTy());
auto SrcTyL = TLI->getTypeLegalizationCost(DL, SrcTy);
unsigned SrcElTySize = SrcTyL.second.getScalarSizeInBits();
if (!SrcTyL.second.isVector() || SrcElTySize != SrcTy->getScalarSizeInBits())
@@ -267,6 +270,7 @@ bool AArch64TTIImpl::isWideningInstruction(Type *DstTy, unsigned Opcode,
}
int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
@@ -291,11 +295,18 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
}
}
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
+ };
+
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
static const TypeConversionCostTblEntry
ConversionTbl[] = {
@@ -397,9 +408,9 @@ int AArch64TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(ConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
@@ -425,17 +436,18 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
auto VecLT = TLI->getTypeLegalizationCost(DL, VecTy);
auto DstVT = TLI->getValueType(DL, Dst);
auto SrcVT = TLI->getValueType(DL, Src);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If the resulting type is still a vector and the destination type is legal,
// we may get the extension for free. If not, get the default cost for the
// extend.
if (!VecLT.second.isVector() || !TLI->isTypeLegal(DstVT))
- return Cost + getCastInstrCost(Opcode, Dst, Src);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
// The destination type should be larger than the element type. If not, get
// the default cost for the extend.
if (DstVT.getSizeInBits() < SrcVT.getSizeInBits())
- return Cost + getCastInstrCost(Opcode, Dst, Src);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
switch (Opcode) {
default:
@@ -454,7 +466,16 @@ int AArch64TTIImpl::getExtractWithExtendCost(unsigned Opcode, Type *Dst,
}
// If we are unable to perform the extend for free, get the default cost.
- return Cost + getCastInstrCost(Opcode, Dst, Src);
+ return Cost + getCastInstrCost(Opcode, Dst, Src, CostKind);
+}
+
+unsigned AArch64TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ assert(CostKind == TTI::TCK_RecipThroughput && "unexpected CostKind");
+ // Branches are assumed to be predicted.
+ return 0;
}
int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
@@ -483,10 +504,17 @@ int AArch64TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val,
}
int AArch64TTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -504,7 +532,8 @@ int AArch64TTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
case ISD::SDIV:
if (Opd2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -513,16 +542,20 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// normally expanded to the sequence ADD + CMP + SELECT + SRA.
// The OperandValue properties many not be same as that of previous
// operation; conservatively assume OP_None.
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Select, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::Select, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info, Opd2Info,
+ Cost += getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return Cost;
@@ -535,31 +568,34 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// Vector signed division by constant are expanded to the
// sequence MULHS + ADD/SUB + SRA + SRL + ADD, and unsigned division
// to MULHS + SUB + SRL + ADD + SRL.
- int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, Opd1Info,
- Opd2Info,
+ int MulCost = getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, Opd1Info,
- Opd2Info,
+ int AddCost = getArithmeticInstrCost(Instruction::Add, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, Opd1Info,
- Opd2Info,
+ int ShrCost = getArithmeticInstrCost(Instruction::AShr, Ty, CostKind,
+ Opd1Info, Opd2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
return MulCost * 2 + AddCost * 2 + ShrCost * 2 + 1;
}
}
- Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ Cost += BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
if (Ty->isVectorTy()) {
// On AArch64, vector divisions are not supported natively and are
// expanded into scalar divisions of each pair of elements.
- Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, Opd1Info,
- Opd2Info, Opd1PropInfo, Opd2PropInfo);
- Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, Opd1Info,
- Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ Cost += getArithmeticInstrCost(Instruction::ExtractElement, Ty, CostKind,
+ Opd1Info, Opd2Info, Opd1PropInfo,
+ Opd2PropInfo);
+ Cost += getArithmeticInstrCost(Instruction::InsertElement, Ty, CostKind,
+ Opd1Info, Opd2Info, Opd1PropInfo,
+ Opd2PropInfo);
// TODO: if one of the arguments is scalar, then it's not necessary to
// double the cost of handling the vector elements.
Cost += Cost;
@@ -574,6 +610,16 @@ int AArch64TTIImpl::getArithmeticInstrCost(
// These nodes are marked as 'custom' for combining purposes only.
// We know that they are legal. See LowerAdd in ISelLowering.
return (Cost + 1) * LT.first;
+
+ case ISD::FADD:
+ // These nodes are marked as 'custom' just to lower them to SVE.
+ // We know said lowering will incur no additional cost.
+ if (isa<FixedVectorType>(Ty) && !Ty->getScalarType()->isFP128Ty())
+ return (Cost + 2) * LT.first;
+
+ return Cost + BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
+ Opd1PropInfo, Opd2PropInfo);
}
}
@@ -596,7 +642,12 @@ int AArch64TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
}
int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, const Instruction *I) {
+ Type *CondTy,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// We don't lower some vector selects well that are wider than the register
@@ -623,13 +674,18 @@ int AArch64TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return Entry->Cost;
}
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}
AArch64TTIImpl::TTI::MemCmpExpansionOptions
AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
- Options.AllowOverlappingLoads = !ST->requiresStrictAlign();
+ if (ST->requiresStrictAlign()) {
+ // TODO: Add cost modeling for strict align. Misaligned loads expand to
+ // a bunch of instructions when strict align is enabled.
+ return Options;
+ }
+ Options.AllowOverlappingLoads = true;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = Options.MaxNumLoads;
// TODO: Though vector loads usually perform well on AArch64, in some targets
@@ -641,7 +697,17 @@ AArch64TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Ty, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Ty, Alignment, AddressSpace,
+ CostKind);
+
auto LT = TLI->getTypeLegalizationCost(DL, Ty);
if (ST->isMisaligned128StoreSlow() && Opcode == Instruction::Store &&
@@ -656,7 +722,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first * 2 * AmortizationCost;
}
- if (Ty->isVectorTy() && Ty->getVectorElementType()->isIntegerTy(8)) {
+ if (Ty->isVectorTy() &&
+ cast<VectorType>(Ty)->getElementType()->isIntegerTy(8)) {
unsigned ProfitableNumElements;
if (Opcode == Instruction::Store)
// We use a custom trunc store lowering so v.4b should be profitable.
@@ -666,8 +733,8 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
// have to promote the elements to v.2.
ProfitableNumElements = 8;
- if (Ty->getVectorNumElements() < ProfitableNumElements) {
- unsigned NumVecElts = Ty->getVectorNumElements();
+ if (cast<FixedVectorType>(Ty)->getNumElements() < ProfitableNumElements) {
+ unsigned NumVecElts = cast<FixedVectorType>(Ty)->getNumElements();
unsigned NumVectorizableInstsToAmortize = NumVecElts * 2;
// We generate 2 instructions per vector element.
return NumVectorizableInstsToAmortize * NumVecElts * 2;
@@ -677,20 +744,18 @@ int AArch64TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Ty,
return LT.first;
}
-int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int AArch64TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
- assert(isa<VectorType>(VecTy) && "Expect a vector type");
+ auto *VecVTy = cast<FixedVectorType>(VecTy);
if (!UseMaskForCond && !UseMaskForGaps &&
Factor <= TLI->getMaxSupportedInterleaveFactor()) {
- unsigned NumElts = VecTy->getVectorNumElements();
- auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+ unsigned NumElts = VecVTy->getNumElements();
+ auto *SubVecTy =
+ FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
// ldN/stN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -701,18 +766,20 @@ int AArch64TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
int AArch64TTIImpl::getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys) {
int Cost = 0;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
for (auto *I : Tys) {
if (!I->isVectorTy())
continue;
- if (I->getScalarSizeInBits() * I->getVectorNumElements() == 128)
- Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0) +
- getMemoryOpCost(Instruction::Load, I, Align(128), 0);
+ if (I->getScalarSizeInBits() * cast<FixedVectorType>(I)->getNumElements() ==
+ 128)
+ Cost += getMemoryOpCost(Instruction::Store, I, Align(128), 0, CostKind) +
+ getMemoryOpCost(Instruction::Load, I, Align(128), 0, CostKind);
}
return Cost;
}
@@ -792,6 +859,11 @@ void AArch64TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
getFalkorUnrollingPreferences(L, SE, UP);
}
+void AArch64TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
+
Value *AArch64TTIImpl::getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType) {
switch (Inst->getIntrinsicID()) {
@@ -902,7 +974,7 @@ bool AArch64TTIImpl::shouldConsiderAddressTypePromotion(
bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
- assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
+ auto *VTy = cast<VectorType>(Ty);
unsigned ScalarBits = Ty->getScalarSizeInBits();
switch (Opcode) {
case Instruction::FAdd:
@@ -913,10 +985,10 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
case Instruction::Mul:
return false;
case Instruction::Add:
- return ScalarBits * Ty->getVectorNumElements() >= 128;
+ return ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128;
case Instruction::ICmp:
return (ScalarBits < 64) &&
- (ScalarBits * Ty->getVectorNumElements() >= 128);
+ (ScalarBits * cast<FixedVectorType>(VTy)->getNumElements() >= 128);
case Instruction::FCmp:
return Flags.NoNaN;
default:
@@ -925,11 +997,14 @@ bool AArch64TTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
return false;
}
-int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwiseForm) {
+int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode,
+ VectorType *ValTy,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind) {
if (IsPairwiseForm)
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
@@ -950,11 +1025,12 @@ int AArch64TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
if (const auto *Entry = CostTableLookup(CostTblNoPairwise, ISD, MTy))
return LT.first * Entry->Cost;
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm);
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwiseForm,
+ CostKind);
}
-int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int AArch64TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ int Index, VectorType *SubTp) {
if (Kind == TTI::SK_Broadcast || Kind == TTI::SK_Transpose ||
Kind == TTI::SK_Select || Kind == TTI::SK_PermuteSingleSrc) {
static const CostTblEntry ShuffleTbl[] = {
diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
index 6f4569a49783..1f029689a60e 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.h
@@ -72,11 +72,11 @@ public:
using BaseT::getIntImmCost;
int getIntImmCost(int64_t Val);
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
/// @}
@@ -98,6 +98,8 @@ public:
unsigned getRegisterBitWidth(bool Vector) const {
if (Vector) {
+ if (ST->hasSVE())
+ return std::max(ST->getMinSVEVectorSizeInBits(), 128u);
if (ST->hasNEON())
return 128;
return 0;
@@ -112,15 +114,19 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getExtractWithExtendCost(unsigned Opcode, Type *Dst, VectorType *VecTy,
unsigned Index);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -131,30 +137,37 @@ public:
int getAddressComputationCost(Type *Ty, ScalarEvolution *SE, const SCEV *Ptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
int getCostOfKeepingLiveOverCall(ArrayRef<Type *> Tys);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
Value *getOrCreateResultFromMemIntrinsic(IntrinsicInst *Inst,
Type *ExpectedType);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info);
- bool isLegalMaskedLoadStore(Type *DataType, MaybeAlign Alignment) {
- if (!isa<VectorType>(DataType) || !ST->hasSVE())
+ bool isLegalMaskedLoadStore(Type *DataType, Align Alignment) {
+ if (!isa<ScalableVectorType>(DataType) || !ST->hasSVE())
return false;
- Type *Ty = DataType->getVectorElementType();
- if (Ty->isHalfTy() || Ty->isFloatTy() || Ty->isDoubleTy())
+ Type *Ty = cast<ScalableVectorType>(DataType)->getElementType();
+ if (Ty->isBFloatTy() || Ty->isHalfTy() ||
+ Ty->isFloatTy() || Ty->isDoubleTy())
return true;
if (Ty->isIntegerTy(8) || Ty->isIntegerTy(16) ||
@@ -164,26 +177,58 @@ public:
return false;
}
- bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment) {
return isLegalMaskedLoadStore(DataType, Alignment);
}
- bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedStore(Type *DataType, Align Alignment) {
return isLegalMaskedLoadStore(DataType, Alignment);
}
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ bool isLegalNTStore(Type *DataType, Align Alignment) {
+ // NOTE: The logic below is mostly geared towards LV, which calls it with
+ // vectors with 2 elements. We might want to improve that, if other
+ // users show up.
+ // Nontemporal vector stores can be directly lowered to STNP, if the vector
+ // can be halved so that each half fits into a register. That's the case if
+ // the element type fits into a register and the number of elements is a
+ // power of 2 > 1.
+ if (auto *DataTypeVTy = dyn_cast<VectorType>(DataType)) {
+ unsigned NumElements =
+ cast<FixedVectorType>(DataTypeVTy)->getNumElements();
+ unsigned EltSize = DataTypeVTy->getElementType()->getScalarSizeInBits();
+ return NumElements > 1 && isPowerOf2_64(NumElements) && EltSize >= 8 &&
+ EltSize <= 128 && isPowerOf2_64(EltSize);
+ }
+ return BaseT::isLegalNTStore(DataType, Alignment);
+ }
+
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
bool
shouldConsiderAddressTypePromotion(const Instruction &I,
bool &AllowPromotionWithoutCommonHeader);
bool shouldExpandReduction(const IntrinsicInst *II) const {
- return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_vector_reduce_v2_fadd:
+ case Intrinsic::experimental_vector_reduce_v2_fmul:
+ // We don't have legalization support for ordered FP reductions.
+ return !II->getFastMathFlags().allowReassoc();
+
+ case Intrinsic::experimental_vector_reduce_fmax:
+ case Intrinsic::experimental_vector_reduce_fmin:
+ // Lowering asserts that there are no NaNs.
+ return !II->getFastMathFlags().noNaNs();
+
+ default:
+ // Don't expand anything else, let legalization deal with it.
+ return false;
+ }
}
unsigned getGISelRematGlobalCost() const {
@@ -193,10 +238,12 @@ public:
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
- int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm);
+ int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
/// @}
};
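An illustrative sketch (not part of the patch) restating the fixed-vector check in isLegalNTStore above over plain (element count, element size) pairs; ntStoreCandidate is a placeholder name, and the real code queries the IR vector type instead.

#include <cassert>
#include <cstdint>

static bool isPowerOf2(uint64_t V) { return V && (V & (V - 1)) == 0; }

// Power-of-two element count > 1 and a power-of-two element size between
// 8 and 128 bits, so the vector can be split into two register-sized halves.
static bool ntStoreCandidate(unsigned NumElements, unsigned EltSizeInBits) {
  return NumElements > 1 && isPowerOf2(NumElements) && EltSizeInBits >= 8 &&
         EltSizeInBits <= 128 && isPowerOf2(EltSizeInBits);
}

int main() {
  assert(ntStoreCandidate(2, 64));   // e.g. <2 x i64>: two 64-bit halves
  assert(ntStoreCandidate(4, 32));   // e.g. <4 x float>
  assert(!ntStoreCandidate(1, 64));  // a single element has nothing to pair
  assert(!ntStoreCandidate(3, 32));  // element count not a power of two
  return 0;
}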
diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
index be4c96022472..0ac09c4f96f0 100644
--- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
+++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp
@@ -260,6 +260,8 @@ public:
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseDirective(AsmToken DirectiveID) override;
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
@@ -755,12 +757,13 @@ public:
return false;
int64_t Val = MCE->getValue();
- int64_t SVal = typename std::make_signed<T>::type(Val);
- int64_t UVal = typename std::make_unsigned<T>::type(Val);
- if (Val != SVal && Val != UVal)
+ // Avoid left shift by 64 directly.
+ uint64_t Upper = UINT64_C(-1) << (sizeof(T) * 4) << (sizeof(T) * 4);
+ // Allow all-0 or all-1 in top bits to permit bitwise NOT.
+ if ((Val & Upper) && (Val & Upper) != Upper)
return false;
- return AArch64_AM::isLogicalImmediate(UVal, sizeof(T) * 8);
+ return AArch64_AM::isLogicalImmediate(Val & ~Upper, sizeof(T) * 8);
}
bool isShiftedImm() const { return Kind == k_ShiftedImm; }
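A worked sketch (not from the patch) of the Upper mask above: shifting an all-ones value twice by sizeof(T)*4 bits sums to a shift by the full width of T without ever shifting a 64-bit value by 64 (which would be undefined behaviour). With T = int64_t the mask is 0, so the all-0/all-1 top-bits check never rejects a 64-bit immediate.

#include <cassert>
#include <cstdint>

template <typename T> uint64_t upperMask() {
  // Same expression as in the predicate above, spelled with ~UINT64_C(0).
  return ~UINT64_C(0) << (sizeof(T) * 4) << (sizeof(T) * 4);
}

int main() {
  assert(upperMask<int32_t>() == 0xFFFFFFFF00000000ULL); // bits above bit 31
  assert(upperMask<int16_t>() == 0xFFFFFFFFFFFF0000ULL); // bits above bit 15
  assert(upperMask<int64_t>() == 0);                     // no bits above 63
  return 0;
}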
@@ -852,8 +855,7 @@ public:
if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
return DiagnosticPredicateTy::NoMatch;
- bool IsByte =
- std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value;
if (auto ShiftedImm = getShiftedVal<8>())
if (!(IsByte && ShiftedImm->second) &&
AArch64_AM::isSVECpyImm<T>(uint64_t(ShiftedImm->first)
@@ -870,8 +872,7 @@ public:
if (!isShiftedImm() && (!isImm() || !isa<MCConstantExpr>(getImm())))
return DiagnosticPredicateTy::NoMatch;
- bool IsByte =
- std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ bool IsByte = std::is_same<int8_t, std::make_signed_t<T>>::value;
if (auto ShiftedImm = getShiftedVal<8>())
if (!(IsByte && ShiftedImm->second) &&
AArch64_AM::isSVEAddSubImm<T>(ShiftedImm->first
@@ -969,11 +970,15 @@ public:
bool isMOVZMovAlias() const {
if (!isImm()) return false;
- const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
- if (!CE) return false;
- uint64_t Value = CE->getValue();
+ const MCExpr *E = getImm();
+ if (const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(E)) {
+ uint64_t Value = CE->getValue();
- return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
+ return AArch64_AM::isMOVZMovAlias(Value, Shift, RegWidth);
+ }
+ // Only supports the case of Shift being 0 if an expression is used as an
+ // operand
+ return !Shift && E;
}
template<int RegWidth, int Shift>
@@ -1033,8 +1038,10 @@ public:
bool isNeonVectorRegLo() const {
return Kind == k_Register && Reg.Kind == RegKind::NeonVector &&
- AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
- Reg.RegNum);
+ (AArch64MCRegisterClasses[AArch64::FPR128_loRegClassID].contains(
+ Reg.RegNum) ||
+ AArch64MCRegisterClasses[AArch64::FPR64_loRegClassID].contains(
+ Reg.RegNum));
}
template <unsigned Class> bool isSVEVectorReg() const {
@@ -1606,7 +1613,7 @@ public:
void addLogicalImmOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- typename std::make_unsigned<T>::type Val = MCE->getValue();
+ std::make_unsigned_t<T> Val = MCE->getValue();
uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
@@ -1615,7 +1622,7 @@ public:
void addLogicalImmNotOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
const MCConstantExpr *MCE = cast<MCConstantExpr>(getImm());
- typename std::make_unsigned<T>::type Val = ~MCE->getValue();
+ std::make_unsigned_t<T> Val = ~MCE->getValue();
uint64_t encoding = AArch64_AM::encodeLogicalImmediate(Val, sizeof(T) * 8);
Inst.addOperand(MCOperand::createImm(encoding));
}
@@ -1771,9 +1778,13 @@ public:
void addMOVZMovAliasOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
- const MCConstantExpr *CE = cast<MCConstantExpr>(getImm());
- uint64_t Value = CE->getValue();
- Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff));
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(getImm());
+ if (CE) {
+ uint64_t Value = CE->getValue();
+ Inst.addOperand(MCOperand::createImm((Value >> Shift) & 0xffff));
+ } else {
+ addExpr(Inst, getImm());
+ }
}
template<int Shift>
@@ -2243,10 +2254,16 @@ static unsigned matchSVEPredicateVectorRegName(StringRef Name) {
bool AArch64AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success;
+}
+
+OperandMatchResultTy AArch64AsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
StartLoc = getLoc();
auto Res = tryParseScalarRegister(RegNo);
EndLoc = SMLoc::getFromPointer(getLoc().getPointer() - 1);
- return Res != MatchOperand_Success;
+ return Res;
}
// Matches a register name or register alias previously defined by '.req'
@@ -2404,9 +2421,9 @@ AArch64AsmParser::tryParsePrefetch(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreatePrefetch(
*PRFM, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2427,9 +2444,9 @@ AArch64AsmParser::tryParsePSBHint(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreatePSBHint(
PSB->Encoding, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2450,9 +2467,9 @@ AArch64AsmParser::tryParseBTIHint(OperandVector &Operands) {
return MatchOperand_ParseFail;
}
- Parser.Lex(); // Eat identifier token.
Operands.push_back(AArch64Operand::CreateBTIHint(
BTI->Encoding, Tok.getString(), S, getContext()));
+ Parser.Lex(); // Eat identifier token.
return MatchOperand_Success;
}
@@ -2827,6 +2844,7 @@ static const struct Extension {
{"tlb-rmi", {AArch64::FeatureTLB_RMI}},
{"pan-rwv", {AArch64::FeaturePAN_RWV}},
{"ccpp", {AArch64::FeatureCCPP}},
+ {"rcpc", {AArch64::FeatureRCPC}},
{"sve", {AArch64::FeatureSVE}},
{"sve2", {AArch64::FeatureSVE2}},
{"sve2-aes", {AArch64::FeatureSVE2AES}},
@@ -2851,6 +2869,8 @@ static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) {
Str += "ARMv8.4a";
else if (FBS[AArch64::HasV8_5aOps])
Str += "ARMv8.5a";
+ else if (FBS[AArch64::HasV8_6aOps])
+ Str += "ARMv8.6a";
else {
auto ext = std::find_if(std::begin(ExtensionMap),
std::end(ExtensionMap),
@@ -3771,7 +3791,7 @@ bool AArch64AsmParser::ParseInstruction(ParseInstructionInfo &Info,
// First check for the AArch64-specific .req directive.
if (Parser.getTok().is(AsmToken::Identifier) &&
- Parser.getTok().getIdentifier() == ".req") {
+ Parser.getTok().getIdentifier().lower() == ".req") {
parseDirectiveReq(Name, NameLoc);
// We always return 'error' for this, as we're done with this
// statement and don't need to match the 'instruction'.
@@ -4106,6 +4126,16 @@ bool AArch64AsmParser::validateInstruction(MCInst &Inst, SMLoc &IDLoc,
"unpredictable STXP instruction, status is also a source");
break;
}
+ case AArch64::LDRABwriteback:
+ case AArch64::LDRAAwriteback: {
+ unsigned Xt = Inst.getOperand(0).getReg();
+ unsigned Xn = Inst.getOperand(1).getReg();
+ if (Xt == Xn)
+ return Error(Loc[0],
+ "unpredictable LDRA instruction, writeback base"
+ " is also a destination");
+ break;
+ }
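+ // For example, something along the lines of "ldraa x0, [x0, #8]!" is now
+ // diagnosed, because the writeback base register is also the load
+ // destination.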
}
@@ -4235,6 +4265,8 @@ bool AArch64AsmParser::showMatchError(SMLoc Loc, unsigned ErrCode,
return Error(Loc, "index must be a multiple of 4 in range [-32, 28].");
case Match_InvalidMemoryIndexed16SImm4:
return Error(Loc, "index must be a multiple of 16 in range [-128, 112].");
+ case Match_InvalidMemoryIndexed32SImm4:
+ return Error(Loc, "index must be a multiple of 32 in range [-256, 224].");
case Match_InvalidMemoryIndexed1SImm6:
return Error(Loc, "index must be an integer in range [-32, 31].");
case Match_InvalidMemoryIndexedSImm8:
@@ -4824,7 +4856,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
}
case Match_MissingFeature: {
@@ -4894,6 +4926,7 @@ bool AArch64AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
case Match_InvalidMemoryIndexed4SImm4:
case Match_InvalidMemoryIndexed1SImm6:
case Match_InvalidMemoryIndexed16SImm4:
+ case Match_InvalidMemoryIndexed32SImm4:
case Match_InvalidMemoryIndexed4SImm7:
case Match_InvalidMemoryIndexed8SImm7:
case Match_InvalidMemoryIndexed16SImm7:
@@ -5024,7 +5057,7 @@ bool AArch64AsmParser::ParseDirective(AsmToken DirectiveID) {
getContext().getObjectFileInfo()->getObjectFileType();
bool IsMachO = Format == MCObjectFileInfo::IsMachO;
- StringRef IDVal = DirectiveID.getIdentifier();
+ auto IDVal = DirectiveID.getIdentifier().lower();
SMLoc Loc = DirectiveID.getLoc();
if (IDVal == ".arch")
parseDirectiveArch(Loc);
@@ -5076,6 +5109,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
break;
case AArch64::ArchKind::ARMV8_4A:
case AArch64::ArchKind::ARMV8_5A:
+ case AArch64::ArchKind::ARMV8_6A:
RequestedExtensions.push_back("sm4");
RequestedExtensions.push_back("sha3");
RequestedExtensions.push_back("sha2");
@@ -5095,6 +5129,7 @@ static void ExpandCryptoAEK(AArch64::ArchKind ArchKind,
break;
case AArch64::ArchKind::ARMV8_4A:
case AArch64::ArchKind::ARMV8_5A:
+ case AArch64::ArchKind::ARMV8_6A:
RequestedExtensions.push_back("nosm4");
RequestedExtensions.push_back("nosha3");
RequestedExtensions.push_back("nosha2");
@@ -5314,7 +5349,7 @@ bool AArch64AsmParser::parseDirectiveTLSDescCall(SMLoc L) {
Inst.setOpcode(AArch64::TLSDESCCALL);
Inst.addOperand(MCOperand::createExpr(Expr));
- getParser().getStreamer().EmitInstruction(Inst, getSTI());
+ getParser().getStreamer().emitInstruction(Inst, getSTI());
return false;
}
@@ -5365,7 +5400,7 @@ bool AArch64AsmParser::parseDirectiveLOH(StringRef IDVal, SMLoc Loc) {
"unexpected token in '" + Twine(IDVal) + "' directive"))
return true;
- getStreamer().EmitLOHDirective((MCLOHType)Kind, Args);
+ getStreamer().emitLOHDirective((MCLOHType)Kind, Args);
return false;
}
@@ -5458,7 +5493,7 @@ bool AArch64AsmParser::parseDirectiveUnreq(SMLoc L) {
bool AArch64AsmParser::parseDirectiveCFINegateRAState() {
if (parseToken(AsmToken::EndOfStatement, "unexpected token in directive"))
return true;
- getStreamer().EmitCFINegateRAState();
+ getStreamer().emitCFINegateRAState();
return false;
}
@@ -5468,7 +5503,7 @@ bool AArch64AsmParser::parseDirectiveCFIBKeyFrame() {
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.cfi_b_key_frame'"))
return true;
- getStreamer().EmitCFIBKeyFrame();
+ getStreamer().emitCFIBKeyFrame();
return false;
}
diff --git a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
index d6db88603429..1ff4abb34054 100644
--- a/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
+++ b/llvm/lib/Target/AArch64/Disassembler/AArch64Disassembler.cpp
@@ -146,6 +146,9 @@ static DecodeStatus DecodeExclusiveLdStInstruction(MCInst &Inst, uint32_t insn,
static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
const void *Decoder);
+static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Address,
+ const void *Decoder);
static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
uint64_t Address,
const void *Decoder);
@@ -1501,6 +1504,39 @@ static DecodeStatus DecodePairLdStInstruction(MCInst &Inst, uint32_t insn,
return Success;
}
+static DecodeStatus DecodeAuthLoadInstruction(MCInst &Inst, uint32_t insn,
+ uint64_t Addr,
+ const void *Decoder) {
+ unsigned Rt = fieldFromInstruction(insn, 0, 5);
+ unsigned Rn = fieldFromInstruction(insn, 5, 5);
+ uint64_t offset = fieldFromInstruction(insn, 22, 1) << 9 |
+ fieldFromInstruction(insn, 12, 9);
+ unsigned writeback = fieldFromInstruction(insn, 11, 1);
+
+ switch (Inst.getOpcode()) {
+ default:
+ return Fail;
+ case AArch64::LDRAAwriteback:
+ case AArch64::LDRABwriteback:
+ DecodeGPR64spRegisterClass(Inst, Rn /* writeback register */, Addr,
+ Decoder);
+ break;
+ case AArch64::LDRAAindexed:
+ case AArch64::LDRABindexed:
+ break;
+ }
+
+ DecodeGPR64RegisterClass(Inst, Rt, Addr, Decoder);
+ DecodeGPR64spRegisterClass(Inst, Rn, Addr, Decoder);
+ DecodeSImm<10>(Inst, offset, Addr, Decoder);
+
+ if (writeback && Rt == Rn && Rn != 31) {
+ return SoftFail;
+ }
+
+ return Success;
+}
+
static DecodeStatus DecodeAddSubERegInstruction(MCInst &Inst, uint32_t insn,
uint64_t Addr,
const void *Decoder) {
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
index 76ff238234d9..11a8d5def429 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.cpp
@@ -62,10 +62,9 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 64));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
+ auto AddrReg = MIRBuilder.buildFrameIndex(LLT::pointer(0, 64), FI);
StackUsed = std::max(StackUsed, Size + Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -87,10 +86,10 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- // FIXME: Get alignment
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 1);
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -134,7 +133,7 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
int FPDiff = 0)
: ValueHandler(MIRBuilder, MRI, AssignFn), MIB(MIB),
AssignFnVarArg(AssignFnVarArg), IsTailCall(IsTailCall), FPDiff(FPDiff),
- StackSize(0) {}
+ StackSize(0), SPReg(0) {}
bool isIncomingArgumentHandler() const override { return false; }
@@ -147,23 +146,20 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
if (IsTailCall) {
Offset += FPDiff;
int FI = MF.getFrameInfo().CreateFixedObject(Size, Offset, true);
- Register FIReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildFrameIndex(FIReg, FI);
+ auto FIReg = MIRBuilder.buildFrameIndex(p0, FI);
MPO = MachinePointerInfo::getFixedStack(MF, FI);
- return FIReg;
+ return FIReg.getReg(0);
}
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, Register(AArch64::SP));
+ if (!SPReg)
+ SPReg = MIRBuilder.buildCopy(p0, Register(AArch64::SP)).getReg(0);
- Register OffsetReg = MRI.createGenericVirtualRegister(s64);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(s64, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MF, Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -175,17 +171,33 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- if (VA.getLocInfo() == CCValAssign::LocInfo::AExt) {
- Size = VA.getLocVT().getSizeInBits() / 8;
- ValVReg = MIRBuilder.buildAnyExt(LLT::scalar(Size * 8), ValVReg)
- ->getOperand(0)
- .getReg();
- }
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, Size, 1);
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size,
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ValVReg, Addr, *MMO);
}
+ void assignValueToAddress(const CallLowering::ArgInfo &Arg, Register Addr,
+ uint64_t Size, MachinePointerInfo &MPO,
+ CCValAssign &VA) override {
+ unsigned MaxSize = Size * 8;
+ // For varargs, we always want to extend them to 8 bytes, in which case
+ // we disable setting a max.
+ if (!Arg.IsFixed)
+ MaxSize = 0;
+
+ Register ValVReg = VA.getLocInfo() != CCValAssign::LocInfo::FPExt
+ ? extendRegister(Arg.Regs[0], VA, MaxSize)
+ : Arg.Regs[0];
+
+ // If we extended we might need to adjust the MMO's Size.
+ const LLT RegTy = MRI.getType(ValVReg);
+ if (RegTy.getSizeInBytes() > Size)
+ Size = RegTy.getSizeInBytes();
+
+ assignValueToAddress(ValVReg, Addr, Size, MPO, VA);
+ }
+
bool assignArg(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo,
const CallLowering::ArgInfo &Info,
@@ -209,6 +221,9 @@ struct OutgoingArgHandler : public CallLowering::ValueHandler {
/// callee's. Unused elsewhere.
int FPDiff;
uint64_t StackSize;
+
+ // Cache the SP register vreg if we need it more than once in this call site.
+ Register SPReg;
};
} // namespace
@@ -222,13 +237,13 @@ void AArch64CallLowering::splitToValueTypes(
const AArch64TargetLowering &TLI = *getTLI<AArch64TargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
- if (OrigArg.Ty->isVoidTy())
- return;
-
SmallVector<EVT, 4> SplitVTs;
SmallVector<uint64_t, 4> Offsets;
ComputeValueVTs(TLI, DL, OrigArg.Ty, SplitVTs, &Offsets, 0);
+ if (SplitVTs.size() == 0)
+ return;
+
if (SplitVTs.size() == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
@@ -322,8 +337,7 @@ bool AArch64CallLowering::lowerReturn(MachineIRBuilder &MIRBuilder,
}
auto Undef = MIRBuilder.buildUndef({OldLLT});
CurVReg =
- MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef.getReg(0)})
- .getReg(0);
+ MIRBuilder.buildMerge({NewLLT}, {CurVReg, Undef}).getReg(0);
} else {
// Just do a vector extend.
CurVReg = MIRBuilder.buildInstr(ExtendOp, {NewLLT}, {CurVReg})
@@ -413,6 +427,14 @@ static void handleMustTailForwardedRegisters(MachineIRBuilder &MIRBuilder,
}
}
+bool AArch64CallLowering::fallBackToDAGISel(const Function &F) const {
+ if (isa<ScalableVectorType>(F.getReturnType()))
+ return true;
+ return llvm::any_of(F.args(), [](const Argument &A) {
+ return isa<ScalableVectorType>(A.getType());
+ });
+}
+
bool AArch64CallLowering::lowerFormalArguments(
MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const {
@@ -424,7 +446,7 @@ bool AArch64CallLowering::lowerFormalArguments(
SmallVector<ArgInfo, 8> SplitArgs;
unsigned i = 0;
for (auto &Arg : F.args()) {
- if (DL.getTypeStoreSize(Arg.getType()) == 0)
+ if (DL.getTypeStoreSize(Arg.getType()).isZero())
continue;
ArgInfo OrigArg{VRegs[i], Arg.getType()};
@@ -759,17 +781,17 @@ bool AArch64CallLowering::isEligibleForTailCallOptimization(
return true;
}
-static unsigned getCallOpcode(const Function &CallerF, bool IsIndirect,
+static unsigned getCallOpcode(const MachineFunction &CallerF, bool IsIndirect,
bool IsTailCall) {
if (!IsTailCall)
- return IsIndirect ? AArch64::BLR : AArch64::BL;
+ return IsIndirect ? getBLRCallOpcode(CallerF) : (unsigned)AArch64::BL;
if (!IsIndirect)
return AArch64::TCRETURNdi;
// When BTI is enabled, we need to use TCRETURNriBTI to make sure that we use
// x16 or x17.
- if (CallerF.hasFnAttribute("branch-target-enforcement"))
+ if (CallerF.getFunction().hasFnAttribute("branch-target-enforcement"))
return AArch64::TCRETURNriBTI;
return AArch64::TCRETURNri;
@@ -805,7 +827,7 @@ bool AArch64CallLowering::lowerTailCall(
if (!IsSibCall)
CallSeqStart = MIRBuilder.buildInstr(AArch64::ADJCALLSTACKDOWN);
- unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), true);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), true);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.add(Info.Callee);
@@ -863,7 +885,6 @@ bool AArch64CallLowering::lowerTailCall(
const auto &Forwards = FuncInfo->getForwardedMustTailRegParms();
// Do the actual argument marshalling.
- SmallVector<unsigned, 8> PhysRegs;
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, true, FPDiff);
if (!handleAssignments(MIRBuilder, OutArgs, Handler))
@@ -965,7 +986,7 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
// Create a temporarily-floating call instruction so we can add the implicit
// uses of arg registers.
- unsigned Opc = getCallOpcode(F, Info.Callee.isReg(), false);
+ unsigned Opc = getCallOpcode(MF, Info.Callee.isReg(), false);
auto MIB = MIRBuilder.buildInstrNoInsert(Opc);
MIB.add(Info.Callee);
@@ -981,7 +1002,6 @@ bool AArch64CallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
TRI->emitReservedArgRegCallError(MF);
// Do the actual argument marshalling.
- SmallVector<unsigned, 8> PhysRegs;
OutgoingArgHandler Handler(MIRBuilder, MRI, MIB, AssignFnFixed,
AssignFnVarArg, false);
if (!handleAssignments(MIRBuilder, OutArgs, Handler))
diff --git a/llvm/lib/Target/AArch64/AArch64CallLowering.h b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
index b0c601c7062c..640a86253059 100644
--- a/llvm/lib/Target/AArch64/AArch64CallLowering.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64CallLowering.h
@@ -37,6 +37,8 @@ public:
ArrayRef<Register> VRegs,
Register SwiftErrorVReg) const override;
+ bool fallBackToDAGISel(const Function &F) const override;
+
bool lowerFormalArguments(MachineIRBuilder &MIRBuilder, const Function &F,
ArrayRef<ArrayRef<Register>> VRegs) const override;
diff --git a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
index b9ac2657e1c5..408f0cb77e73 100644
--- a/llvm/lib/Target/AArch64/AArch64InstructionSelector.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64InstructionSelector.cpp
@@ -31,6 +31,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetOpcodes.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/IntrinsicsAArch64.h"
#include "llvm/Support/Debug.h"
@@ -63,6 +65,9 @@ public:
// cache it here for each run of the selector.
ProduceNonFlagSettingCondBr =
!MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
+ MFReturnAddr = Register();
+
+ processPHIs(MF);
}
private:
@@ -71,23 +76,33 @@ private:
bool selectImpl(MachineInstr &I, CodeGenCoverage &CoverageInfo) const;
// A lowering phase that runs before any selection attempts.
-
- void preISelLower(MachineInstr &I) const;
+ // Returns true if the instruction was modified.
+ bool preISelLower(MachineInstr &I);
// An early selection function that runs before the selectImpl() call.
bool earlySelect(MachineInstr &I) const;
+ // Do some preprocessing of G_PHIs before we begin selection.
+ void processPHIs(MachineFunction &MF);
+
bool earlySelectSHL(MachineInstr &I, MachineRegisterInfo &MRI) const;
/// Eliminate same-sized cross-bank copies into stores before selectImpl().
- void contractCrossBankCopyIntoStore(MachineInstr &I,
- MachineRegisterInfo &MRI) const;
+ bool contractCrossBankCopyIntoStore(MachineInstr &I,
+ MachineRegisterInfo &MRI);
+
+ bool convertPtrAddToAdd(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectVaStartAAPCS(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
bool selectVaStartDarwin(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
+ bool tryOptAndIntoCompareBranch(MachineInstr *LHS,
+ int64_t CmpConstant,
+ const CmpInst::Predicate &Pred,
+ MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const;
bool selectCompareBranch(MachineInstr &I, MachineFunction &MF,
MachineRegisterInfo &MRI) const;
@@ -112,6 +127,8 @@ private:
const RegisterBank &RB,
MachineIRBuilder &MIRBuilder) const;
bool selectInsertElt(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool tryOptConstantBuildVec(MachineInstr &MI, LLT DstTy,
+ MachineRegisterInfo &MRI) const;
bool selectBuildVector(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectMergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectUnmergeValues(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -123,7 +140,7 @@ private:
MachineRegisterInfo &MRI) const;
bool selectIntrinsicWithSideEffects(MachineInstr &I,
MachineRegisterInfo &MRI) const;
- bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI) const;
+ bool selectIntrinsic(MachineInstr &I, MachineRegisterInfo &MRI);
bool selectVectorICmp(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicTrunc(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectIntrinsicRound(MachineInstr &I, MachineRegisterInfo &MRI) const;
@@ -131,17 +148,25 @@ private:
bool selectBrJT(MachineInstr &I, MachineRegisterInfo &MRI) const;
bool selectTLSGlobalValue(MachineInstr &I, MachineRegisterInfo &MRI) const;
- unsigned emitConstantPoolEntry(Constant *CPVal, MachineFunction &MF) const;
- MachineInstr *emitLoadFromConstantPool(Constant *CPVal,
+ unsigned emitConstantPoolEntry(const Constant *CPVal,
+ MachineFunction &MF) const;
+ MachineInstr *emitLoadFromConstantPool(const Constant *CPVal,
MachineIRBuilder &MIRBuilder) const;
// Emit a vector concat operation.
MachineInstr *emitVectorConcat(Optional<Register> Dst, Register Op1,
Register Op2,
MachineIRBuilder &MIRBuilder) const;
- MachineInstr *emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
- MachineOperand &Predicate,
- MachineIRBuilder &MIRBuilder) const;
+
+ // Emit an integer compare between LHS and RHS, which checks for Predicate.
+ //
+ // This returns the produced compare instruction, and the predicate which
+ // was ultimately used in the compare. The predicate may differ from what
+ // is passed in \p Predicate due to optimization.
+ std::pair<MachineInstr *, CmpInst::Predicate>
+ emitIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
+ MachineOperand &Predicate,
+ MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitADD(Register DefReg, MachineOperand &LHS, MachineOperand &RHS,
MachineIRBuilder &MIRBuilder) const;
MachineInstr *emitCMN(MachineOperand &LHS, MachineOperand &RHS,
@@ -163,6 +188,13 @@ private:
MachineInstr *emitCSetForICMP(Register DefReg, unsigned Pred,
MachineIRBuilder &MIRBuilder) const;
+ /// Emit a TB(N)Z instruction which tests \p Bit in \p TestReg.
+ /// \p IsNegative is true if the test should be "not zero".
+ /// This will also optimize the test bit instruction when possible.
+ MachineInstr *emitTestBit(Register TestReg, uint64_t Bit, bool IsNegative,
+ MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const;
+
// Equivalent to the i32shift_a and friends from AArch64InstrInfo.td.
// We use these manually instead of using the importer since it doesn't
// support SDNodeXForm.
@@ -194,6 +226,11 @@ private:
return selectAddrModeUnscaled(Root, 16);
}
+ /// Helper to try to fold a GISEL_ADD_LOW into an immediate, to be used
+ /// from complex pattern matchers like selectAddrModeIndexed().
+ ComplexRendererFns tryFoldAddLowIntoImm(MachineInstr &RootDef, unsigned Size,
+ MachineRegisterInfo &MRI) const;
+
ComplexRendererFns selectAddrModeIndexed(MachineOperand &Root,
unsigned Size) const;
template <int Width>
@@ -258,6 +295,8 @@ private:
/// new copy.
Register narrowExtendRegIfNeeded(Register ExtReg,
MachineIRBuilder &MIB) const;
+ Register widenGPRBankRegIfNeeded(Register Reg, unsigned Size,
+ MachineIRBuilder &MIB) const;
ComplexRendererFns selectArithExtendedRegister(MachineOperand &Root) const;
void renderTruncImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
@@ -272,12 +311,17 @@ private:
unsigned OpFlags) const;
// Optimization methods.
- bool tryOptVectorShuffle(MachineInstr &I) const;
- bool tryOptVectorDup(MachineInstr &MI) const;
bool tryOptSelect(MachineInstr &MI) const;
MachineInstr *tryFoldIntegerCompare(MachineOperand &LHS, MachineOperand &RHS,
MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const;
+ MachineInstr *tryOptArithImmedIntegerCompare(MachineOperand &LHS,
+ MachineOperand &RHS,
+ CmpInst::Predicate &Predicate,
+ MachineIRBuilder &MIB) const;
+ MachineInstr *tryOptArithShiftedCompare(MachineOperand &LHS,
+ MachineOperand &RHS,
+ MachineIRBuilder &MIB) const;
/// Return true if \p MI is a load or store of \p NumBytes bytes.
bool isLoadStoreOfNumBytes(const MachineInstr &MI, unsigned NumBytes) const;
@@ -295,6 +339,11 @@ private:
bool ProduceNonFlagSettingCondBr = false;
+ // Some cached values used during selection.
+ // We use LR as a live-in register, and we keep track of it here as it can be
+ // clobbered by calls.
+ Register MFReturnAddr;
+
#define GET_GLOBALISEL_PREDICATES_DECL
#include "AArch64GenGlobalISel.inc"
#undef GET_GLOBALISEL_PREDICATES_DECL
@@ -421,6 +470,39 @@ static bool getSubRegForClass(const TargetRegisterClass *RC,
return true;
}
+/// Returns the minimum size the given register bank can hold.
+static unsigned getMinSizeForRegBank(const RegisterBank &RB) {
+ switch (RB.getID()) {
+ case AArch64::GPRRegBankID:
+ return 32;
+ case AArch64::FPRRegBankID:
+ return 8;
+ default:
+ llvm_unreachable("Tried to get minimum size for unknown register bank.");
+ }
+}
+
+static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
+ auto &MI = *Root.getParent();
+ auto &MBB = *MI.getParent();
+ auto &MF = *MBB.getParent();
+ auto &MRI = MF.getRegInfo();
+ uint64_t Immed;
+ if (Root.isImm())
+ Immed = Root.getImm();
+ else if (Root.isCImm())
+ Immed = Root.getCImm()->getZExtValue();
+ else if (Root.isReg()) {
+ auto ValAndVReg =
+ getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
+ if (!ValAndVReg)
+ return None;
+ Immed = ValAndVReg->Value;
+ } else
+ return None;
+ return Immed;
+}
+
/// Check whether \p I is a currently unsupported binary operation:
/// - it has an unsized type
/// - an operand is not a vreg
@@ -609,23 +691,20 @@ static bool isValidCopy(const MachineInstr &I, const RegisterBank &DstBank,
}
#endif
-/// Helper function for selectCopy. Inserts a subregister copy from
-/// \p *From to \p *To, linking it up to \p I.
-///
-/// e.g, given I = "Dst = COPY SrcReg", we'll transform that into
+/// Helper function for selectCopy. Inserts a subregister copy from \p SrcReg
+/// to \p *To.
///
-/// CopyReg (From class) = COPY SrcReg
-/// SubRegCopy (To class) = COPY CopyReg:SubReg
-/// Dst = COPY SubRegCopy
-static bool selectSubregisterCopy(MachineInstr &I, MachineRegisterInfo &MRI,
- const RegisterBankInfo &RBI, Register SrcReg,
- const TargetRegisterClass *From,
- const TargetRegisterClass *To,
- unsigned SubReg) {
+/// E.g "To = COPY SrcReg:SubReg"
+static bool copySubReg(MachineInstr &I, MachineRegisterInfo &MRI,
+ const RegisterBankInfo &RBI, Register SrcReg,
+ const TargetRegisterClass *To, unsigned SubReg) {
+ assert(SrcReg.isValid() && "Expected a valid source register?");
+ assert(To && "Destination register class cannot be null");
+ assert(SubReg && "Expected a valid subregister");
+
MachineIRBuilder MIB(I);
- auto Copy = MIB.buildCopy({From}, {SrcReg});
- auto SubRegCopy = MIB.buildInstr(TargetOpcode::COPY, {To}, {})
- .addReg(Copy.getReg(0), 0, SubReg);
+ auto SubRegCopy =
+ MIB.buildInstr(TargetOpcode::COPY, {To}, {}).addReg(SrcReg, 0, SubReg);
MachineOperand &RegOp = I.getOperand(1);
RegOp.setReg(SubRegCopy.getReg(0));
@@ -670,7 +749,6 @@ getRegClassesForCopy(MachineInstr &I, const TargetInstrInfo &TII,
static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
MachineRegisterInfo &MRI, const TargetRegisterInfo &TRI,
const RegisterBankInfo &RBI) {
-
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
const RegisterBank &DstRegBank = *RBI.getRegBank(DstReg, MRI, TRI);
@@ -703,13 +781,15 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
(!Register::isPhysicalRegister(I.getOperand(0).getReg()) &&
!Register::isPhysicalRegister(I.getOperand(1).getReg()))) &&
"No phys reg on generic operator!");
- assert(KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI));
- (void)KnownValid;
- return true;
+ bool ValidCopy = true;
+#ifndef NDEBUG
+ ValidCopy = KnownValid || isValidCopy(I, DstRegBank, MRI, TRI, RBI);
+ assert(ValidCopy && "Invalid copy.");
+#endif
+ return ValidCopy;
};
- // Is this a copy? If so, then we may need to insert a subregister copy, or
- // a SUBREG_TO_REG.
+ // Is this a copy? If so, then we may need to insert a subregister copy.
if (I.isCopy()) {
// Yes. Check if there's anything to fix up.
if (!SrcRC) {
@@ -719,48 +799,43 @@ static bool selectCopy(MachineInstr &I, const TargetInstrInfo &TII,
unsigned SrcSize = TRI.getRegSizeInBits(*SrcRC);
unsigned DstSize = TRI.getRegSizeInBits(*DstRC);
+ unsigned SubReg;
- // If we're doing a cross-bank copy on different-sized registers, we need
- // to do a bit more work.
- if (SrcSize > DstSize) {
- // We're doing a cross-bank copy into a smaller register. We need a
- // subregister copy. First, get a register class that's on the same bank
- // as the destination, but the same size as the source.
- const TargetRegisterClass *SubregRC =
- getMinClassForRegBank(DstRegBank, SrcSize, true);
- assert(SubregRC && "Didn't get a register class for subreg?");
-
- // Get the appropriate subregister for the destination.
- unsigned SubReg = 0;
- if (!getSubRegForClass(DstRC, TRI, SubReg)) {
- LLVM_DEBUG(dbgs() << "Couldn't determine subregister for copy.\n");
- return false;
- }
-
- // Now, insert a subregister copy using the new register class.
- selectSubregisterCopy(I, MRI, RBI, SrcReg, SubregRC, DstRC, SubReg);
- return CheckCopy();
- }
+ // If the source bank doesn't support a subregister copy small enough,
+ // then we first need to copy to the destination bank.
+ if (getMinSizeForRegBank(SrcRegBank) > DstSize) {
+ const TargetRegisterClass *DstTempRC =
+ getMinClassForRegBank(DstRegBank, SrcSize, /* GetAllRegSet */ true);
+ getSubRegForClass(DstRC, TRI, SubReg);
- // Is this a cross-bank copy?
- if (DstRegBank.getID() != SrcRegBank.getID()) {
- if (DstRegBank.getID() == AArch64::GPRRegBankID && DstSize == 32 &&
- SrcSize == 16) {
- // Special case for FPR16 to GPR32.
- // FIXME: This can probably be generalized like the above case.
- Register PromoteReg =
- MRI.createVirtualRegister(&AArch64::FPR32RegClass);
- BuildMI(*I.getParent(), I, I.getDebugLoc(),
- TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
- .addImm(0)
- .addUse(SrcReg)
- .addImm(AArch64::hsub);
- MachineOperand &RegOp = I.getOperand(1);
- RegOp.setReg(PromoteReg);
+ MachineIRBuilder MIB(I);
+ auto Copy = MIB.buildCopy({DstTempRC}, {SrcReg});
+ copySubReg(I, MRI, RBI, Copy.getReg(0), DstRC, SubReg);
+ } else if (SrcSize > DstSize) {
+ // If the source register is bigger than the destination we need to
+ // perform a subregister copy.
+ const TargetRegisterClass *SubRegRC =
+ getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
+ getSubRegForClass(SubRegRC, TRI, SubReg);
+ copySubReg(I, MRI, RBI, SrcReg, DstRC, SubReg);
+ } else if (DstSize > SrcSize) {
+ // If the destination register is bigger than the source we need to do
+ // a promotion using SUBREG_TO_REG.
+ const TargetRegisterClass *PromotionRC =
+ getMinClassForRegBank(SrcRegBank, DstSize, /* GetAllRegSet */ true);
+ getSubRegForClass(SrcRC, TRI, SubReg);
+
+ Register PromoteReg = MRI.createVirtualRegister(PromotionRC);
+ BuildMI(*I.getParent(), I, I.getDebugLoc(),
+ TII.get(AArch64::SUBREG_TO_REG), PromoteReg)
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(SubReg);
+ MachineOperand &RegOp = I.getOperand(1);
+ RegOp.setReg(PromoteReg);
- // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
- KnownValid = true;
- }
+ // Promise that the copy is implicitly validated by the SUBREG_TO_REG.
+ KnownValid = true;
}
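+ // A sketch of the three cases above: a GPR32 value copied into an FPR16 is
+ // first copied across to a 32-bit FPR and then narrowed with a subregister
+ // copy; an FPR128-to-FPR64 copy becomes a single subregister COPY; and an
+ // FPR16-to-GPR32 copy is widened with SUBREG_TO_REG before the final
+ // cross-bank copy.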
// If the destination is a physical register, then there's nothing to
@@ -977,6 +1052,216 @@ static void changeFCMPPredToAArch64CC(CmpInst::Predicate P,
}
}
+/// Return a register which can be used as a bit to test in a TB(N)Z.
+static Register getTestBitReg(Register Reg, uint64_t &Bit, bool &Invert,
+ MachineRegisterInfo &MRI) {
+ assert(Reg.isValid() && "Expected valid register!");
+ while (MachineInstr *MI = getDefIgnoringCopies(Reg, MRI)) {
+ unsigned Opc = MI->getOpcode();
+
+ if (!MI->getOperand(0).isReg() ||
+ !MRI.hasOneNonDBGUse(MI->getOperand(0).getReg()))
+ break;
+
+ // (tbz (any_ext x), b) -> (tbz x, b) if we don't use the extended bits.
+ //
+ // (tbz (trunc x), b) -> (tbz x, b) is always safe, because the bit number
+ // on the truncated x is the same as the bit number on x.
+ if (Opc == TargetOpcode::G_ANYEXT || Opc == TargetOpcode::G_ZEXT ||
+ Opc == TargetOpcode::G_TRUNC) {
+ Register NextReg = MI->getOperand(1).getReg();
+ // Did we find something worth folding?
+ if (!NextReg.isValid() || !MRI.hasOneNonDBGUse(NextReg))
+ break;
+
+ // NextReg is worth folding. Keep looking.
+ Reg = NextReg;
+ continue;
+ }
+
+ // Attempt to find a suitable operation with a constant on one side.
+ Optional<uint64_t> C;
+ Register TestReg;
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_AND:
+ case TargetOpcode::G_XOR: {
+ TestReg = MI->getOperand(1).getReg();
+ Register ConstantReg = MI->getOperand(2).getReg();
+ auto VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ if (!VRegAndVal) {
+ // AND commutes, check the other side for a constant.
+ // FIXME: Can we canonicalize the constant so that it's always on the
+ // same side at some point earlier?
+ std::swap(ConstantReg, TestReg);
+ VRegAndVal = getConstantVRegValWithLookThrough(ConstantReg, MRI);
+ }
+ if (VRegAndVal)
+ C = VRegAndVal->Value;
+ break;
+ }
+ case TargetOpcode::G_ASHR:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_SHL: {
+ TestReg = MI->getOperand(1).getReg();
+ auto VRegAndVal =
+ getConstantVRegValWithLookThrough(MI->getOperand(2).getReg(), MRI);
+ if (VRegAndVal)
+ C = VRegAndVal->Value;
+ break;
+ }
+ }
+
+ // Didn't find a constant or viable register. Bail out of the loop.
+ if (!C || !TestReg.isValid())
+ break;
+
+ // We found a suitable instruction with a constant. Check to see if we can
+ // walk through the instruction.
+ Register NextReg;
+ unsigned TestRegSize = MRI.getType(TestReg).getSizeInBits();
+ switch (Opc) {
+ default:
+ break;
+ case TargetOpcode::G_AND:
+ // (tbz (and x, m), b) -> (tbz x, b) when the b-th bit of m is set.
+ if ((*C >> Bit) & 1)
+ NextReg = TestReg;
+ break;
+ case TargetOpcode::G_SHL:
+ // (tbz (shl x, c), b) -> (tbz x, b-c) when b-c is positive and fits in
+ // the type of the register.
+ if (*C <= Bit && (Bit - *C) < TestRegSize) {
+ NextReg = TestReg;
+ Bit = Bit - *C;
+ }
+ break;
+ case TargetOpcode::G_ASHR:
+ // (tbz (ashr x, c), b) -> (tbz x, b+c) or (tbz x, msb) if b+c is > # bits
+ // in x
+ NextReg = TestReg;
+ Bit = Bit + *C;
+ if (Bit >= TestRegSize)
+ Bit = TestRegSize - 1;
+ break;
+ case TargetOpcode::G_LSHR:
+ // (tbz (lshr x, c), b) -> (tbz x, b+c) when b + c is < # bits in x
+ if ((Bit + *C) < TestRegSize) {
+ NextReg = TestReg;
+ Bit = Bit + *C;
+ }
+ break;
+ case TargetOpcode::G_XOR:
+ // We can walk through a G_XOR by inverting whether we use tbz/tbnz when
+ // appropriate.
+ //
+ // e.g. If x' = xor x, c, and the b-th bit is set in c then
+ //
+ // tbz x', b -> tbnz x, b
+ //
+ // Because x' only has the b-th bit set if x does not.
+ if ((*C >> Bit) & 1)
+ Invert = !Invert;
+ NextReg = TestReg;
+ break;
+ }
+
+ // Check if we found anything worth folding.
+ if (!NextReg.isValid())
+ return Reg;
+ Reg = NextReg;
+ }
+
+ return Reg;
+}
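+// For illustration, one chain this walk can fold (a sketch):
+//   %shl = G_SHL %x, 3
+//   %xor = G_XOR %shl, 32
+// Testing bit 5 of %xor becomes an inverted test of bit 2 of %x: the G_XOR
+// flips bit 5 (32 has bit 5 set), so tbz turns into tbnz, and the G_SHL moves
+// the tested bit down by the shift amount.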
+
+MachineInstr *AArch64InstructionSelector::emitTestBit(
+ Register TestReg, uint64_t Bit, bool IsNegative, MachineBasicBlock *DstMBB,
+ MachineIRBuilder &MIB) const {
+ assert(TestReg.isValid());
+ assert(ProduceNonFlagSettingCondBr &&
+ "Cannot emit TB(N)Z with speculation tracking!");
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ // Attempt to optimize the test bit by walking over instructions.
+ TestReg = getTestBitReg(TestReg, Bit, IsNegative, MRI);
+ LLT Ty = MRI.getType(TestReg);
+ unsigned Size = Ty.getSizeInBits();
+ assert(!Ty.isVector() && "Expected a scalar!");
+ assert(Bit < 64 && "Bit is too large!");
+
+ // When the test register is a 64-bit register, we have to narrow to make
+ // TBNZW work.
+ bool UseWReg = Bit < 32;
+ unsigned NecessarySize = UseWReg ? 32 : 64;
+ if (Size < NecessarySize)
+ TestReg = widenGPRBankRegIfNeeded(TestReg, NecessarySize, MIB);
+ else if (Size > NecessarySize)
+ TestReg = narrowExtendRegIfNeeded(TestReg, MIB);
+
+ static const unsigned OpcTable[2][2] = {{AArch64::TBZX, AArch64::TBNZX},
+ {AArch64::TBZW, AArch64::TBNZW}};
+ unsigned Opc = OpcTable[UseWReg][IsNegative];
+ auto TestBitMI =
+ MIB.buildInstr(Opc).addReg(TestReg).addImm(Bit).addMBB(DstMBB);
+ constrainSelectedInstRegOperands(*TestBitMI, TII, TRI, RBI);
+ return &*TestBitMI;
+}
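+// A brief note on the opcode choice above: testing bit 3 of an s64 value
+// narrows the register and uses TBZW/TBNZW, while testing bit 35 keeps the
+// 64-bit register and uses TBZX/TBNZX, since only the X form can test bits 32
+// and above.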
+
+bool AArch64InstructionSelector::tryOptAndIntoCompareBranch(
+ MachineInstr *AndInst, int64_t CmpConstant, const CmpInst::Predicate &Pred,
+ MachineBasicBlock *DstMBB, MachineIRBuilder &MIB) const {
+ // Given something like this:
+ //
+ // %x = ...Something...
+ // %one = G_CONSTANT i64 1
+ // %zero = G_CONSTANT i64 0
+ // %and = G_AND %x, %one
+ // %cmp = G_ICMP intpred(ne), %and, %zero
+ // %cmp_trunc = G_TRUNC %cmp
+ // G_BRCOND %cmp_trunc, %bb.3
+ //
+ // We want to try and fold the AND into the G_BRCOND and produce either a
+ // TBNZ (when we have intpred(ne)) or a TBZ (when we have intpred(eq)).
+ //
+ // In this case, we'd get
+ //
+ // TBNZ %x %bb.3
+ //
+ if (!AndInst || AndInst->getOpcode() != TargetOpcode::G_AND)
+ return false;
+
+ // Need to be comparing against 0 to fold.
+ if (CmpConstant != 0)
+ return false;
+
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+
+ // Only support EQ and NE. If we have LT, then it *is* possible to fold, but
+ // we don't want to do this. When we have an AND and LT, we need a TST/ANDS,
+ // so folding would be redundant.
+ if (Pred != CmpInst::Predicate::ICMP_EQ &&
+ Pred != CmpInst::Predicate::ICMP_NE)
+ return false;
+
+ // Check if the AND has a constant on its RHS which we can use as a mask.
+ // If it's a power of 2, then it's the same as checking a specific bit.
+ // (e.g, ANDing with 8 == ANDing with 000...100 == testing if bit 3 is set)
+ auto MaybeBit =
+ getConstantVRegValWithLookThrough(AndInst->getOperand(2).getReg(), MRI);
+ if (!MaybeBit || !isPowerOf2_64(MaybeBit->Value))
+ return false;
+
+ uint64_t Bit = Log2_64(static_cast<uint64_t>(MaybeBit->Value));
+ Register TestReg = AndInst->getOperand(1).getReg();
+ bool Invert = Pred == CmpInst::Predicate::ICMP_NE;
+
+ // Emit a TB(N)Z.
+ emitTestBit(TestReg, Bit, Invert, DstMBB, MIB);
+ return true;
+}
+
bool AArch64InstructionSelector::selectCompareBranch(
MachineInstr &I, MachineFunction &MF, MachineRegisterInfo &MRI) const {
@@ -991,28 +1276,67 @@ bool AArch64InstructionSelector::selectCompareBranch(
Register LHS = CCMI->getOperand(2).getReg();
Register RHS = CCMI->getOperand(3).getReg();
auto VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
- if (!VRegAndVal)
+ MachineIRBuilder MIB(I);
+ CmpInst::Predicate Pred =
+ (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
+ MachineInstr *LHSMI = getDefIgnoringCopies(LHS, MRI);
+
+ // When we can emit a TB(N)Z, prefer that.
+ //
+ // Handle non-commutative condition codes first.
+ // Note that we don't want to do this when we have a G_AND because it can
+ // become a tst. The tst will make the test bit in the TB(N)Z redundant.
+ if (VRegAndVal && LHSMI->getOpcode() != TargetOpcode::G_AND) {
+ int64_t C = VRegAndVal->Value;
+
+ // When we have a greater-than comparison, we can just test if the msb is
+ // zero.
+ if (C == -1 && Pred == CmpInst::ICMP_SGT) {
+ uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
+ emitTestBit(LHS, Bit, /*IsNegative = */ false, DestMBB, MIB);
+ I.eraseFromParent();
+ return true;
+ }
+
+ // When we have a less than comparison, we can just test if the msb is not
+ // zero.
+ if (C == 0 && Pred == CmpInst::ICMP_SLT) {
+ uint64_t Bit = MRI.getType(LHS).getSizeInBits() - 1;
+ emitTestBit(LHS, Bit, /*IsNegative = */ true, DestMBB, MIB);
+ I.eraseFromParent();
+ return true;
+ }
+ }
+
+ if (!VRegAndVal) {
std::swap(RHS, LHS);
+ VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
+ LHSMI = getDefIgnoringCopies(LHS, MRI);
+ }
- VRegAndVal = getConstantVRegValWithLookThrough(RHS, MRI);
if (!VRegAndVal || VRegAndVal->Value != 0) {
- MachineIRBuilder MIB(I);
// If we can't select a CBZ then emit a cmp + Bcc.
- if (!emitIntegerCompare(CCMI->getOperand(2), CCMI->getOperand(3),
- CCMI->getOperand(1), MIB))
+ MachineInstr *Cmp;
+ std::tie(Cmp, Pred) = emitIntegerCompare(
+ CCMI->getOperand(2), CCMI->getOperand(3), CCMI->getOperand(1), MIB);
+ if (!Cmp)
return false;
- const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(
- (CmpInst::Predicate)CCMI->getOperand(1).getPredicate());
+ const AArch64CC::CondCode CC = changeICMPPredToAArch64CC(Pred);
MIB.buildInstr(AArch64::Bcc, {}, {}).addImm(CC).addMBB(DestMBB);
I.eraseFromParent();
return true;
}
+ // Try to emit a TB(N)Z for an eq or ne condition.
+ if (tryOptAndIntoCompareBranch(LHSMI, VRegAndVal->Value, Pred, DestMBB,
+ MIB)) {
+ I.eraseFromParent();
+ return true;
+ }
+
const RegisterBank &RB = *RBI.getRegBank(LHS, MRI, TRI);
if (RB.getID() != AArch64::GPRRegBankID)
return false;
-
- const auto Pred = (CmpInst::Predicate)CCMI->getOperand(1).getPredicate();
if (Pred != CmpInst::ICMP_NE && Pred != CmpInst::ICMP_EQ)
return false;
@@ -1247,7 +1571,7 @@ void AArch64InstructionSelector::materializeLargeCMVal(
return;
}
-void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
+bool AArch64InstructionSelector::preISelLower(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
@@ -1267,10 +1591,10 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
const LLT ShiftTy = MRI.getType(ShiftReg);
const LLT SrcTy = MRI.getType(SrcReg);
if (SrcTy.isVector())
- return;
+ return false;
assert(!ShiftTy.isVector() && "unexpected vector shift ty");
if (SrcTy.getSizeInBits() != 32 || ShiftTy.getSizeInBits() != 64)
- return;
+ return false;
auto *AmtMI = MRI.getVRegDef(ShiftReg);
assert(AmtMI && "could not find a vreg definition for shift amount");
if (AmtMI->getOpcode() != TargetOpcode::G_CONSTANT) {
@@ -1281,14 +1605,65 @@ void AArch64InstructionSelector::preISelLower(MachineInstr &I) const {
MRI.setRegBank(Trunc.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
I.getOperand(2).setReg(Trunc.getReg(0));
}
- return;
+ return true;
}
case TargetOpcode::G_STORE:
- contractCrossBankCopyIntoStore(I, MRI);
- return;
+ return contractCrossBankCopyIntoStore(I, MRI);
+ case TargetOpcode::G_PTR_ADD:
+ return convertPtrAddToAdd(I, MRI);
+ case TargetOpcode::G_LOAD: {
+ // For scalar loads of pointers, we try to convert the dest type from p0
+ // to s64 so that our imported patterns can match. Like with the G_PTR_ADD
+ // conversion, this should be ok because all users should have been
+ // selected already, so the type doesn't matter for them.
+ Register DstReg = I.getOperand(0).getReg();
+ const LLT DstTy = MRI.getType(DstReg);
+ if (!DstTy.isPointer())
+ return false;
+ MRI.setType(DstReg, LLT::scalar(64));
+ return true;
+ }
default:
- return;
+ return false;
+ }
+}
+
+/// This lowering tries to look for G_PTR_ADD instructions and then converts
+/// them to a standard G_ADD with a COPY on the source.
+///
+/// The motivation behind this is to expose the add semantics to the imported
+/// tablegen patterns. We shouldn't need to check for uses being loads/stores,
+/// because the selector works bottom up, uses before defs. By the time we
+/// end up trying to select a G_PTR_ADD, we should have already attempted to
+/// fold this into addressing modes and failed to do so.
+bool AArch64InstructionSelector::convertPtrAddToAdd(
+ MachineInstr &I, MachineRegisterInfo &MRI) {
+ assert(I.getOpcode() == TargetOpcode::G_PTR_ADD && "Expected G_PTR_ADD");
+ Register DstReg = I.getOperand(0).getReg();
+ Register AddOp1Reg = I.getOperand(1).getReg();
+ const LLT PtrTy = MRI.getType(DstReg);
+ if (PtrTy.getAddressSpace() != 0)
+ return false;
+
+ MachineIRBuilder MIB(I);
+ const LLT CastPtrTy = PtrTy.isVector() ? LLT::vector(2, 64) : LLT::scalar(64);
+ auto PtrToInt = MIB.buildPtrToInt(CastPtrTy, AddOp1Reg);
+ // Set regbanks on the registers.
+ if (PtrTy.isVector())
+ MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::FPRRegBankID));
+ else
+ MRI.setRegBank(PtrToInt.getReg(0), RBI.getRegBank(AArch64::GPRRegBankID));
+
+ // Now turn the %dst(p0) = G_PTR_ADD %base, off into:
+ // %dst(intty) = G_ADD %intbase, off
+ I.setDesc(TII.get(TargetOpcode::G_ADD));
+ MRI.setType(DstReg, CastPtrTy);
+ I.getOperand(1).setReg(PtrToInt.getReg(0));
+ if (!select(*PtrToInt)) {
+ LLVM_DEBUG(dbgs() << "Failed to select G_PTRTOINT in convertPtrAddToAdd");
+ return false;
}
+ return true;
}
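+// As a sketch of the rewrite above:
+//   %dst:gpr(p0) = G_PTR_ADD %base:gpr(p0), %off:gpr(s64)
+// becomes
+//   %cast:gpr(s64) = G_PTRTOINT %base:gpr(p0)
+//   %dst:gpr(s64)  = G_ADD %cast:gpr(s64), %off:gpr(s64)
+// with the destination retyped to s64 and the G_PTRTOINT selected right away.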
bool AArch64InstructionSelector::earlySelectSHL(
@@ -1326,8 +1701,8 @@ bool AArch64InstructionSelector::earlySelectSHL(
return constrainSelectedInstRegOperands(*NewI, TII, TRI, RBI);
}
-void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::contractCrossBankCopyIntoStore(
+ MachineInstr &I, MachineRegisterInfo &MRI) {
assert(I.getOpcode() == TargetOpcode::G_STORE && "Expected G_STORE");
// If we're storing a scalar, it doesn't matter what register bank that
// scalar is on. All that matters is the size.
@@ -1343,10 +1718,9 @@ void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
// G_STORE %x:gpr(s32)
//
// And then continue the selection process normally.
- MachineInstr *Def = getDefIgnoringCopies(I.getOperand(0).getReg(), MRI);
- if (!Def)
- return;
- Register DefDstReg = Def->getOperand(0).getReg();
+ Register DefDstReg = getSrcRegIgnoringCopies(I.getOperand(0).getReg(), MRI);
+ if (!DefDstReg.isValid())
+ return false;
LLT DefDstTy = MRI.getType(DefDstReg);
Register StoreSrcReg = I.getOperand(0).getReg();
LLT StoreSrcTy = MRI.getType(StoreSrcReg);
@@ -1354,18 +1728,19 @@ void AArch64InstructionSelector::contractCrossBankCopyIntoStore(
// If we get something strange like a physical register, then we shouldn't
// go any further.
if (!DefDstTy.isValid())
- return;
+ return false;
// Are the source and dst types the same size?
if (DefDstTy.getSizeInBits() != StoreSrcTy.getSizeInBits())
- return;
+ return false;
if (RBI.getRegBank(StoreSrcReg, MRI, TRI) ==
RBI.getRegBank(DefDstReg, MRI, TRI))
- return;
+ return false;
// We have a cross-bank copy, which is entering a store. Let's fold it.
I.getOperand(0).setReg(DefDstReg);
+ return true;
}
bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
@@ -1391,16 +1766,15 @@ bool AArch64InstructionSelector::earlySelect(MachineInstr &I) const {
Register DefReg = I.getOperand(0).getReg();
LLT Ty = MRI.getType(DefReg);
- if (Ty != LLT::scalar(64) && Ty != LLT::scalar(32))
- return false;
-
- if (Ty == LLT::scalar(64)) {
+ if (Ty.getSizeInBits() == 64) {
I.getOperand(1).ChangeToRegister(AArch64::XZR, false);
RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass, MRI);
- } else {
+ } else if (Ty.getSizeInBits() == 32) {
I.getOperand(1).ChangeToRegister(AArch64::WZR, false);
RBI.constrainGenericRegister(DefReg, AArch64::GPR32RegClass, MRI);
- }
+ } else
+ return false;
+
I.setDesc(TII.get(TargetOpcode::COPY));
return true;
}
@@ -1417,9 +1791,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const AArch64Subtarget *Subtarget =
+ &static_cast<const AArch64Subtarget &>(MF.getSubtarget());
+ if (Subtarget->requiresStrictAlign()) {
+ // We don't support this feature yet.
+ LLVM_DEBUG(dbgs() << "AArch64 GISel does not support strict-align yet\n");
+ return false;
+ }
+
unsigned Opcode = I.getOpcode();
// G_PHI requires same handling as PHI
- if (!isPreISelGenericOpcode(Opcode) || Opcode == TargetOpcode::G_PHI) {
+ if (!I.isPreISelOpcode() || Opcode == TargetOpcode::G_PHI) {
// Certain non-generic instructions also need some special handling.
if (Opcode == TargetOpcode::LOAD_STACK_GUARD)
@@ -1468,7 +1850,9 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Try to do some lowering before we start instruction selecting. These
// lowerings are purely transformations on the input G_MIR and so selection
// must continue after any modification of the instruction.
- preISelLower(I);
+ if (preISelLower(I)) {
+ Opcode = I.getOpcode(); // The opcode may have been modified, refresh it.
+ }
// There may be patterns where the importer can't deal with them optimally,
// but does select it to a suboptimal sequence so our custom C++ selection
@@ -1503,8 +1887,6 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Speculation tracking/SLH assumes that optimized TB(N)Z/CB(N)Z
// instructions will not be produced, as they are conditional branch
// instructions that do not set flags.
- bool ProduceNonFlagSettingCondBr =
- !MF.getFunction().hasFnAttribute(Attribute::SpeculativeLoadHardening);
if (ProduceNonFlagSettingCondBr && selectCompareBranch(I, MF, MRI))
return true;
@@ -1540,6 +1922,31 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_BRJT:
return selectBrJT(I, MRI);
+ case AArch64::G_ADD_LOW: {
+ // This op may have been separated from its ADRP companion by the localizer
+ // or some other code motion pass. Given that many CPUs will try to
+ // macro fuse these operations anyway, select this into a MOVaddr pseudo
+ // which will later be expanded into an ADRP+ADD pair after scheduling.
+ MachineInstr *BaseMI = MRI.getVRegDef(I.getOperand(1).getReg());
+ if (BaseMI->getOpcode() != AArch64::ADRP) {
+ I.setDesc(TII.get(AArch64::ADDXri));
+ I.addOperand(MachineOperand::CreateImm(0));
+ return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
+ }
+ assert(TM.getCodeModel() == CodeModel::Small &&
+ "Expected small code model");
+ MachineIRBuilder MIB(I);
+ auto Op1 = BaseMI->getOperand(1);
+ auto Op2 = I.getOperand(2);
+ auto MovAddr = MIB.buildInstr(AArch64::MOVaddr, {I.getOperand(0)}, {})
+ .addGlobalAddress(Op1.getGlobal(), Op1.getOffset(),
+ Op1.getTargetFlags())
+ .addGlobalAddress(Op2.getGlobal(), Op2.getOffset(),
+ Op2.getTargetFlags());
+ I.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MovAddr, TII, TRI, RBI);
+ }
+
case TargetOpcode::G_BSWAP: {
// Handle vector types for G_BSWAP directly.
Register DstReg = I.getOperand(0).getReg();
@@ -1644,6 +2051,20 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
if (emitFMovForFConstant(I, MRI))
return true;
+ // For 64b values, emit a constant pool load instead.
+ if (DefSize == 64) {
+ auto *FPImm = I.getOperand(1).getFPImm();
+ MachineIRBuilder MIB(I);
+ auto *LoadMI = emitLoadFromConstantPool(FPImm, MIB);
+ if (!LoadMI) {
+ LLVM_DEBUG(dbgs() << "Failed to load double constant pool entry\n");
+ return false;
+ }
+ MIB.buildCopy({DefReg}, {LoadMI->getOperand(0).getReg()});
+ I.eraseFromParent();
+ return RBI.constrainGenericRegister(DefReg, FPRRC, MRI);
+ }
+
// Nope. Emit a copy and use a normal mov instead.
const Register DefGPRReg = MRI.createVirtualRegister(&GPRRC);
MachineOperand &RegOp = I.getOperand(0);
@@ -2005,9 +2426,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// Add and set the set condition flag.
unsigned AddsOpc = OpSize == 32 ? AArch64::ADDSWrr : AArch64::ADDSXrr;
MachineIRBuilder MIRBuilder(I);
- auto AddsMI = MIRBuilder.buildInstr(
- AddsOpc, {I.getOperand(0).getReg()},
- {I.getOperand(2).getReg(), I.getOperand(3).getReg()});
+ auto AddsMI = MIRBuilder.buildInstr(AddsOpc, {I.getOperand(0)},
+ {I.getOperand(2), I.getOperand(3)});
constrainSelectedInstRegOperands(*AddsMI, TII, TRI, RBI);
// Now, put the overflow result in the register given by the first operand
@@ -2023,14 +2443,17 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return true;
}
- case TargetOpcode::G_PTR_MASK: {
- uint64_t Align = I.getOperand(2).getImm();
- if (Align >= 64 || Align == 0)
+ case TargetOpcode::G_PTRMASK: {
+ Register MaskReg = I.getOperand(2).getReg();
+ Optional<int64_t> MaskVal = getConstantVRegVal(MaskReg, MRI);
+ // TODO: Implement arbitrary cases
+ if (!MaskVal || !isShiftedMask_64(*MaskVal))
return false;
- uint64_t Mask = ~((1ULL << Align) - 1);
+ uint64_t Mask = *MaskVal;
I.setDesc(TII.get(AArch64::ANDXri));
- I.getOperand(2).setImm(AArch64_AM::encodeLogicalImmediate(Mask, 64));
+ I.getOperand(2).ChangeToImmediate(
+ AArch64_AM::encodeLogicalImmediate(Mask, 64));
return constrainSelectedInstRegOperands(I, TII, TRI, RBI);
}
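+ // For illustration: a mask constant such as 0xfffffffffffffff0 (align down to
+ // 16 bytes) is a shifted mask and is selected as an ANDXri with the matching
+ // logical-immediate encoding; non-constant or non-contiguous masks still bail
+ // out for now.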
@@ -2101,6 +2524,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
I.eraseFromParent();
return true;
}
+
+ // We might have a vector G_PTRTOINT, in which case just emit a COPY.
+ if (Opcode == TargetOpcode::G_PTRTOINT) {
+ assert(DstTy.isVector() && "Expected an FPR ptrtoint to be a vector");
+ I.setDesc(TII.get(TargetOpcode::COPY));
+ return true;
+ }
}
return false;
@@ -2151,16 +2581,22 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
case TargetOpcode::G_ZEXT:
+ case TargetOpcode::G_SEXT_INREG:
case TargetOpcode::G_SEXT: {
unsigned Opcode = I.getOpcode();
- const bool IsSigned = Opcode == TargetOpcode::G_SEXT;
+ const bool IsSigned = Opcode != TargetOpcode::G_ZEXT;
const Register DefReg = I.getOperand(0).getReg();
- const Register SrcReg = I.getOperand(1).getReg();
+ Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI.getType(DefReg);
const LLT SrcTy = MRI.getType(SrcReg);
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
+ // SEXT_INREG has the same src reg size as dst, the size of the value to be
+ // extended is encoded in the imm.
+ if (Opcode == TargetOpcode::G_SEXT_INREG)
+ SrcSize = I.getOperand(2).getImm();
+
if (DstTy.isVector())
return false; // Should be handled by imported patterns.
@@ -2179,31 +2615,65 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
// %v2(s32) = G_ZEXT %v(s8)
if (!IsSigned) {
auto *LoadMI = getOpcodeDef(TargetOpcode::G_LOAD, SrcReg, MRI);
- if (LoadMI &&
- RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID) {
+ bool IsGPR =
+ RBI.getRegBank(SrcReg, MRI, TRI)->getID() == AArch64::GPRRegBankID;
+ if (LoadMI && IsGPR) {
const MachineMemOperand *MemOp = *LoadMI->memoperands_begin();
unsigned BytesLoaded = MemOp->getSize();
if (BytesLoaded < 4 && SrcTy.getSizeInBytes() == BytesLoaded)
return selectCopy(I, TII, MRI, TRI, RBI);
}
- }
- if (DstSize == 64) {
- // FIXME: Can we avoid manually doing this?
- if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass, MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
- << " operand\n");
- return false;
- }
-
- auto SubregToReg =
- MIB.buildInstr(AArch64::SUBREG_TO_REG, {&AArch64::GPR64RegClass}, {})
+ // If we are zero extending from 32 bits to 64 bits, it's possible that
+ // the instruction implicitly does the zero extend for us. In that case,
+ // we can just emit a SUBREG_TO_REG.
+ if (IsGPR && SrcSize == 32 && DstSize == 64) {
+ // Unlike with the G_LOAD case, we don't want to look through copies
+ // here.
+ MachineInstr *Def = MRI.getVRegDef(SrcReg);
+ if (Def && isDef32(*Def)) {
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {DefReg}, {})
.addImm(0)
.addUse(SrcReg)
.addImm(AArch64::sub_32);
+ if (!RBI.constrainGenericRegister(DefReg, AArch64::GPR64RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT destination\n");
+ return false;
+ }
+
+ if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_ZEXT source\n");
+ return false;
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ }
+
+ if (DstSize == 64) {
+ if (Opcode != TargetOpcode::G_SEXT_INREG) {
+ // FIXME: Can we avoid manually doing this?
+ if (!RBI.constrainGenericRegister(SrcReg, AArch64::GPR32RegClass,
+ MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain " << TII.getName(Opcode)
+ << " operand\n");
+ return false;
+ }
+ SrcReg = MIB.buildInstr(AArch64::SUBREG_TO_REG,
+ {&AArch64::GPR64RegClass}, {})
+ .addImm(0)
+ .addUse(SrcReg)
+ .addImm(AArch64::sub_32)
+ .getReg(0);
+ }
+
ExtI = MIB.buildInstr(IsSigned ? AArch64::SBFMXri : AArch64::UBFMXri,
- {DefReg}, {SubregToReg})
+ {DefReg}, {SrcReg})
.addImm(0)
.addImm(SrcSize - 1);
} else if (DstSize <= 32) {
@@ -2236,6 +2706,8 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
return true;
}
+ case TargetOpcode::G_FREEZE:
+ return selectCopy(I, TII, MRI, TRI, RBI);
case TargetOpcode::G_INTTOPTR:
// The importer is currently unable to import pointer types since they
@@ -2294,11 +2766,13 @@ bool AArch64InstructionSelector::select(MachineInstr &I) {
}
MachineIRBuilder MIRBuilder(I);
- if (!emitIntegerCompare(I.getOperand(2), I.getOperand(3), I.getOperand(1),
- MIRBuilder))
+ MachineInstr *Cmp;
+ CmpInst::Predicate Pred;
+ std::tie(Cmp, Pred) = emitIntegerCompare(I.getOperand(2), I.getOperand(3),
+ I.getOperand(1), MIRBuilder);
+ if (!Cmp)
return false;
- emitCSetForICMP(I.getOperand(0).getReg(), I.getOperand(1).getPredicate(),
- MIRBuilder);
+ emitCSetForICMP(I.getOperand(0).getReg(), Pred, MIRBuilder);
I.eraseFromParent();
return true;
}
@@ -2435,14 +2909,13 @@ bool AArch64InstructionSelector::selectBrJT(MachineInstr &I,
Register TargetReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
Register ScratchReg = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
- MIB.buildInstr(AArch64::JumpTableDest32, {TargetReg, ScratchReg},
- {JTAddr, Index})
- .addJumpTableIndex(JTI);
-
+ auto JumpTableInst = MIB.buildInstr(AArch64::JumpTableDest32,
+ {TargetReg, ScratchReg}, {JTAddr, Index})
+ .addJumpTableIndex(JTI);
// Build the indirect branch.
MIB.buildInstr(AArch64::BR, {}, {TargetReg});
I.eraseFromParent();
- return true;
+ return constrainSelectedInstRegOperands(*JumpTableInst, TII, TRI, RBI);
}
bool AArch64InstructionSelector::selectJumpTable(
@@ -2482,7 +2955,7 @@ bool AArch64InstructionSelector::selectTLSGlobalValue(
// TLS calls preserve all registers except those that absolutely must be
// trashed: X0 (it takes an argument), LR (it's a call) and NZCV (let's not be
// silly).
- MIB.buildInstr(AArch64::BLR, {}, {Load})
+ MIB.buildInstr(getBLRCallOpcode(MF), {}, {Load})
.addDef(AArch64::X0, RegState::Implicit)
.addRegMask(TRI.getTLSCallPreservedMask());
@@ -3158,19 +3631,17 @@ bool AArch64InstructionSelector::selectConcatVectors(
}
unsigned
-AArch64InstructionSelector::emitConstantPoolEntry(Constant *CPVal,
+AArch64InstructionSelector::emitConstantPoolEntry(const Constant *CPVal,
MachineFunction &MF) const {
Type *CPTy = CPVal->getType();
- unsigned Align = MF.getDataLayout().getPrefTypeAlignment(CPTy);
- if (Align == 0)
- Align = MF.getDataLayout().getTypeAllocSize(CPTy);
+ Align Alignment = MF.getDataLayout().getPrefTypeAlign(CPTy);
MachineConstantPool *MCP = MF.getConstantPool();
- return MCP->getConstantPoolIndex(CPVal, Align);
+ return MCP->getConstantPoolIndex(CPVal, Alignment);
}
MachineInstr *AArch64InstructionSelector::emitLoadFromConstantPool(
- Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
+ const Constant *CPVal, MachineIRBuilder &MIRBuilder) const {
unsigned CPIdx = emitConstantPoolEntry(CPVal, MIRBuilder.getMF());
auto Adrp =
@@ -3248,7 +3719,7 @@ AArch64InstructionSelector::emitADD(Register DefReg, MachineOperand &LHS,
bool Is32Bit = MRI.getType(LHS.getReg()).getSizeInBits() == 32;
auto ImmFns = selectArithImmed(RHS);
unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
- auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS.getReg()});
+ auto AddMI = MIRBuilder.buildInstr(Opc, {DefReg}, {LHS});
// If we matched a valid constant immediate, add those operands.
if (ImmFns) {
@@ -3274,7 +3745,7 @@ AArch64InstructionSelector::emitCMN(MachineOperand &LHS, MachineOperand &RHS,
unsigned Opc = OpcTable[Is32Bit][ImmFns.hasValue()];
Register ZReg = Is32Bit ? AArch64::WZR : AArch64::XZR;
- auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+ auto CmpMI = MIRBuilder.buildInstr(Opc, {ZReg}, {LHS});
// If we matched a valid constant immediate, add those operands.
if (ImmFns) {
@@ -3316,17 +3787,21 @@ AArch64InstructionSelector::emitTST(const Register &LHS, const Register &RHS,
return &*TstMI;
}
-MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
+std::pair<MachineInstr *, CmpInst::Predicate>
+AArch64InstructionSelector::emitIntegerCompare(
MachineOperand &LHS, MachineOperand &RHS, MachineOperand &Predicate,
MachineIRBuilder &MIRBuilder) const {
assert(LHS.isReg() && RHS.isReg() && "Expected LHS and RHS to be registers!");
+ assert(Predicate.isPredicate() && "Expected predicate?");
MachineRegisterInfo &MRI = MIRBuilder.getMF().getRegInfo();
+ CmpInst::Predicate P = (CmpInst::Predicate)Predicate.getPredicate();
+
// Fold the compare if possible.
MachineInstr *FoldCmp =
tryFoldIntegerCompare(LHS, RHS, Predicate, MIRBuilder);
if (FoldCmp)
- return FoldCmp;
+ return {FoldCmp, P};
// Can't fold into a CMN. Just emit a normal compare.
unsigned CmpOpc = 0;
@@ -3337,31 +3812,31 @@ MachineInstr *AArch64InstructionSelector::emitIntegerCompare(
"Expected scalar or pointer");
if (CmpTy == LLT::scalar(32)) {
CmpOpc = AArch64::SUBSWrr;
- ZReg = AArch64::WZR;
+ ZReg = MRI.createVirtualRegister(&AArch64::GPR32RegClass);
} else if (CmpTy == LLT::scalar(64) || CmpTy.isPointer()) {
CmpOpc = AArch64::SUBSXrr;
- ZReg = AArch64::XZR;
+ ZReg = MRI.createVirtualRegister(&AArch64::GPR64RegClass);
} else {
- return nullptr;
+ return {nullptr, CmpInst::Predicate::BAD_ICMP_PREDICATE};
}
// Try to match immediate forms.
- auto ImmFns = selectArithImmed(RHS);
- if (ImmFns)
- CmpOpc = CmpOpc == AArch64::SUBSWrr ? AArch64::SUBSWri : AArch64::SUBSXri;
-
- auto CmpMI = MIRBuilder.buildInstr(CmpOpc).addDef(ZReg).addUse(LHS.getReg());
- // If we matched a valid constant immediate, add those operands.
- if (ImmFns) {
- for (auto &RenderFn : *ImmFns)
- RenderFn(CmpMI);
- } else {
- CmpMI.addUse(RHS.getReg());
- }
-
+ MachineInstr *ImmedCmp =
+ tryOptArithImmedIntegerCompare(LHS, RHS, P, MIRBuilder);
+ if (ImmedCmp)
+ return {ImmedCmp, P};
+
+ // If we don't have an immediate, we may have a shift which can be folded
+ // into the compare.
+ MachineInstr *ShiftedCmp = tryOptArithShiftedCompare(LHS, RHS, MIRBuilder);
+ if (ShiftedCmp)
+ return {ShiftedCmp, P};
+
+ auto CmpMI =
+ MIRBuilder.buildInstr(CmpOpc, {ZReg}, {LHS.getReg(), RHS.getReg()});
// Make sure that we can constrain the compare that we emitted.
constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
- return &*CmpMI;
+ return {&*CmpMI, P};
}
MachineInstr *AArch64InstructionSelector::emitVectorConcat(
@@ -3497,8 +3972,16 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
MachineInstr *CondDef = MRI.getVRegDef(I.getOperand(1).getReg());
while (CondDef) {
// We can only fold if all of the defs have one use.
- if (!MRI.hasOneUse(CondDef->getOperand(0).getReg()))
- return false;
+ Register CondDefReg = CondDef->getOperand(0).getReg();
+ if (!MRI.hasOneNonDBGUse(CondDefReg)) {
+ // Unless it's another select.
+ for (const MachineInstr &UI : MRI.use_nodbg_instructions(CondDefReg)) {
+ if (CondDef == &UI)
+ continue;
+ if (UI.getOpcode() != TargetOpcode::G_SELECT)
+ return false;
+ }
+ }
// We can skip over G_TRUNC since the condition is 1-bit.
// Truncating/extending can have no impact on the value.
@@ -3524,13 +4007,21 @@ bool AArch64InstructionSelector::tryOptSelect(MachineInstr &I) const {
AArch64CC::CondCode CondCode;
if (CondOpc == TargetOpcode::G_ICMP) {
- CondCode = changeICMPPredToAArch64CC(
- (CmpInst::Predicate)CondDef->getOperand(1).getPredicate());
- if (!emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
- CondDef->getOperand(1), MIB)) {
+ MachineInstr *Cmp;
+ CmpInst::Predicate Pred;
+
+ std::tie(Cmp, Pred) =
+ emitIntegerCompare(CondDef->getOperand(2), CondDef->getOperand(3),
+ CondDef->getOperand(1), MIB);
+
+ if (!Cmp) {
LLVM_DEBUG(dbgs() << "Couldn't emit compare for select!\n");
return false;
}
+
+ // Have to collect the CondCode after emitIntegerCompare, since it can
+ // update the predicate.
+ CondCode = changeICMPPredToAArch64CC(Pred);
} else {
// Get the condition code for the select.
AArch64CC::CondCode CondCode2;
@@ -3660,119 +4151,150 @@ MachineInstr *AArch64InstructionSelector::tryFoldIntegerCompare(
return nullptr;
}
-bool AArch64InstructionSelector::tryOptVectorDup(MachineInstr &I) const {
- // Try to match a vector splat operation into a dup instruction.
- // We're looking for this pattern:
- // %scalar:gpr(s64) = COPY $x0
- // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
- // %cst0:gpr(s32) = G_CONSTANT i32 0
- // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
- // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
- // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef,
- // %zerovec(<2 x s32>)
- //
- // ...into:
- // %splat = DUP %scalar
- // We use the regbank of the scalar to determine which kind of dup to use.
- MachineIRBuilder MIB(I);
+MachineInstr *AArch64InstructionSelector::tryOptArithImmedIntegerCompare(
+ MachineOperand &LHS, MachineOperand &RHS, CmpInst::Predicate &P,
+ MachineIRBuilder &MIB) const {
+ // Attempt to select the immediate form of an integer compare.
MachineRegisterInfo &MRI = *MIB.getMRI();
- const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
- using namespace TargetOpcode;
- using namespace MIPatternMatch;
-
- // Begin matching the insert.
- auto *InsMI =
- getOpcodeDef(G_INSERT_VECTOR_ELT, I.getOperand(1).getReg(), MRI);
- if (!InsMI)
- return false;
- // Match the undef vector operand.
- auto *UndefMI =
- getOpcodeDef(G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(), MRI);
- if (!UndefMI)
- return false;
- // Match the scalar being splatted.
- Register ScalarReg = InsMI->getOperand(2).getReg();
- const RegisterBank *ScalarRB = RBI.getRegBank(ScalarReg, MRI, TRI);
- // Match the index constant 0.
- int64_t Index = 0;
- if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
- return false;
-
- // The shuffle's second operand doesn't matter if the mask is all zero.
- ArrayRef<int> Mask = I.getOperand(3).getShuffleMask();
- if (!all_of(Mask, [](int Elem) { return Elem == 0; }))
- return false;
+ auto Ty = MRI.getType(LHS.getReg());
+ assert(!Ty.isVector() && "Expected scalar or pointer only?");
+ unsigned Size = Ty.getSizeInBits();
+ assert((Size == 32 || Size == 64) &&
+ "Expected 32 bit or 64 bit compare only?");
+
+ // Check if this is a case we can already handle.
+ InstructionSelector::ComplexRendererFns ImmFns;
+ ImmFns = selectArithImmed(RHS);
+
+ if (!ImmFns) {
+ // We didn't get a rendering function, but we may still have a constant.
+ auto MaybeImmed = getImmedFromMO(RHS);
+ if (!MaybeImmed)
+ return nullptr;
- // We're done, now find out what kind of splat we need.
- LLT VecTy = MRI.getType(I.getOperand(0).getReg());
- LLT EltTy = VecTy.getElementType();
- if (EltTy.getSizeInBits() < 32) {
- LLVM_DEBUG(dbgs() << "Could not optimize splat pattern < 32b elts yet");
- return false;
- }
- bool IsFP = ScalarRB->getID() == AArch64::FPRRegBankID;
- unsigned Opc = 0;
- if (IsFP) {
- switch (EltTy.getSizeInBits()) {
- case 32:
- if (VecTy.getNumElements() == 2) {
- Opc = AArch64::DUPv2i32lane;
- } else {
- Opc = AArch64::DUPv4i32lane;
- assert(VecTy.getNumElements() == 4);
- }
+ // We have a constant, but it doesn't fit. Try adjusting it by one and
+ // updating the predicate if possible.
+ uint64_t C = *MaybeImmed;
+ CmpInst::Predicate NewP;
+ switch (P) {
+ default:
+ return nullptr;
+ case CmpInst::ICMP_SLT:
+ case CmpInst::ICMP_SGE:
+ // Check for
+ //
+ // x slt c => x sle c - 1
+ // x sge c => x sgt c - 1
+ //
+ // When c is not the smallest possible negative number.
+ if ((Size == 64 && static_cast<int64_t>(C) == INT64_MIN) ||
+ (Size == 32 && static_cast<int32_t>(C) == INT32_MIN))
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_SLT) ? CmpInst::ICMP_SLE : CmpInst::ICMP_SGT;
+ C -= 1;
break;
- case 64:
- assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
- Opc = AArch64::DUPv2i64lane;
+ case CmpInst::ICMP_ULT:
+ case CmpInst::ICMP_UGE:
+ // Check for
+ //
+ // x ult c => x ule c - 1
+ // x uge c => x ugt c - 1
+ //
+ // When c is not zero.
+ if (C == 0)
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_ULT) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
+ C -= 1;
break;
- }
- } else {
- switch (EltTy.getSizeInBits()) {
- case 32:
- if (VecTy.getNumElements() == 2) {
- Opc = AArch64::DUPv2i32gpr;
- } else {
- Opc = AArch64::DUPv4i32gpr;
- assert(VecTy.getNumElements() == 4);
- }
+ case CmpInst::ICMP_SLE:
+ case CmpInst::ICMP_SGT:
+ // Check for
+ //
+ // x sle c => x slt c + 1
+ // x sgt c => x sge c + 1
+ //
+ // When c is not the largest possible signed integer.
+ if ((Size == 32 && static_cast<int32_t>(C) == INT32_MAX) ||
+ (Size == 64 && static_cast<int64_t>(C) == INT64_MAX))
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_SLE) ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGE;
+ C += 1;
break;
- case 64:
- assert(VecTy.getNumElements() == 2 && "Unexpected num elts");
- Opc = AArch64::DUPv2i64gpr;
+ case CmpInst::ICMP_ULE:
+ case CmpInst::ICMP_UGT:
+ // Check for
+ //
+ // x ule c => x ult c + 1
+ // x ugt c => x uge c + 1
+ //
+ // When c is not the largest possible unsigned integer.
+ if ((Size == 32 && static_cast<uint32_t>(C) == UINT32_MAX) ||
+ (Size == 64 && C == UINT64_MAX))
+ return nullptr;
+ NewP = (P == CmpInst::ICMP_ULE) ? CmpInst::ICMP_ULT : CmpInst::ICMP_UGE;
+ C += 1;
break;
}
+
+ // Check if the new constant is valid.
+ if (Size == 32)
+ C = static_cast<uint32_t>(C);
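+ // Editor's note: the +/-1 adjustment above was done in 64 bits; keep only
+ // the low 32 bits before re-checking the immediate for a 32-bit compare.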
+ ImmFns = select12BitValueWithLeftShift(C);
+ if (!ImmFns)
+ return nullptr;
+ P = NewP;
}
- assert(Opc && "Did not compute an opcode for a dup");
- // For FP splats, we need to widen the scalar reg via undef too.
- if (IsFP) {
- MachineInstr *Widen = emitScalarToVector(
- EltTy.getSizeInBits(), &AArch64::FPR128RegClass, ScalarReg, MIB);
- if (!Widen)
- return false;
- ScalarReg = Widen->getOperand(0).getReg();
+ // At this point, we know we can select an immediate form. Go ahead and do
+ // that.
+ Register ZReg;
+ unsigned Opc;
+ if (Size == 32) {
+ ZReg = AArch64::WZR;
+ Opc = AArch64::SUBSWri;
+ } else {
+ ZReg = AArch64::XZR;
+ Opc = AArch64::SUBSXri;
}
- auto Dup = MIB.buildInstr(Opc, {I.getOperand(0).getReg()}, {ScalarReg});
- if (IsFP)
- Dup.addImm(0);
- constrainSelectedInstRegOperands(*Dup, TII, TRI, RBI);
- I.eraseFromParent();
- return true;
+
+ auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(CmpMI);
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+ return &*CmpMI;
}
-bool AArch64InstructionSelector::tryOptVectorShuffle(MachineInstr &I) const {
- if (TM.getOptLevel() == CodeGenOpt::None)
- return false;
- if (tryOptVectorDup(I))
- return true;
- return false;
+MachineInstr *AArch64InstructionSelector::tryOptArithShiftedCompare(
+ MachineOperand &LHS, MachineOperand &RHS, MachineIRBuilder &MIB) const {
+ // We are looking for the following pattern:
+ //
+ // shift = G_SHL/G_ASHR/G_LSHR y, c
+ // ...
+ // cmp = G_ICMP pred, something, shift
+ //
+ // Since we will select the G_ICMP to a SUBS, we can potentially fold the
+ // shift into the subtract.
+ static const unsigned OpcTable[2] = {AArch64::SUBSWrs, AArch64::SUBSXrs};
+ static const Register ZRegTable[2] = {AArch64::WZR, AArch64::XZR};
+ auto ImmFns = selectShiftedRegister(RHS);
+ if (!ImmFns)
+ return nullptr;
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ auto Ty = MRI.getType(LHS.getReg());
+ assert(!Ty.isVector() && "Expected scalar or pointer only?");
+ unsigned Size = Ty.getSizeInBits();
+ bool Idx = (Size == 64);
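+ // Index 0 selects the 32-bit variants (SUBSWrs/WZR), index 1 the 64-bit
+ // ones (SUBSXrs/XZR).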
+ Register ZReg = ZRegTable[Idx];
+ unsigned Opc = OpcTable[Idx];
+ auto CmpMI = MIB.buildInstr(Opc, {ZReg}, {LHS.getReg()});
+ for (auto &RenderFn : *ImmFns)
+ RenderFn(CmpMI);
+ constrainSelectedInstRegOperands(*CmpMI, TII, TRI, RBI);
+ return &*CmpMI;
}
bool AArch64InstructionSelector::selectShuffleVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
- if (tryOptVectorShuffle(I))
- return true;
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
Register Src1Reg = I.getOperand(1).getReg();
const LLT Src1Ty = MRI.getType(Src1Reg);
@@ -3852,9 +4374,8 @@ bool AArch64InstructionSelector::selectShuffleVector(
.addUse(Src2Reg)
.addImm(AArch64::qsub1);
- auto TBL2 =
- MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0).getReg()},
- {RegSeq, IndexLoad->getOperand(0).getReg()});
+ auto TBL2 = MIRBuilder.buildInstr(AArch64::TBLv16i8Two, {I.getOperand(0)},
+ {RegSeq, IndexLoad->getOperand(0)});
constrainSelectedInstRegOperands(*RegSeq, TII, TRI, RBI);
constrainSelectedInstRegOperands(*TBL2, TII, TRI, RBI);
I.eraseFromParent();
@@ -3968,6 +4489,44 @@ bool AArch64InstructionSelector::selectInsertElt(
return true;
}
+bool AArch64InstructionSelector::tryOptConstantBuildVec(
+ MachineInstr &I, LLT DstTy, MachineRegisterInfo &MRI) const {
+ assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
+ assert(DstTy.getSizeInBits() <= 128 && "Unexpected build_vec type!");
+ if (DstTy.getSizeInBits() < 32)
+ return false;
+ // Check if we're building a constant vector, in which case we want to
+ // generate a constant pool load instead of a vector insert sequence.
+ SmallVector<Constant *, 16> Csts;
+ for (unsigned Idx = 1; Idx < I.getNumOperands(); ++Idx) {
+ // Try to find G_CONSTANT or G_FCONSTANT
+ auto *OpMI =
+ getOpcodeDef(TargetOpcode::G_CONSTANT, I.getOperand(Idx).getReg(), MRI);
+ if (OpMI)
+ Csts.emplace_back(
+ const_cast<ConstantInt *>(OpMI->getOperand(1).getCImm()));
+ else if ((OpMI = getOpcodeDef(TargetOpcode::G_FCONSTANT,
+ I.getOperand(Idx).getReg(), MRI)))
+ Csts.emplace_back(
+ const_cast<ConstantFP *>(OpMI->getOperand(1).getFPImm()));
+ else
+ return false;
+ }
+ Constant *CV = ConstantVector::get(Csts);
+ MachineIRBuilder MIB(I);
+ auto *CPLoad = emitLoadFromConstantPool(CV, MIB);
+ if (!CPLoad) {
+ LLVM_DEBUG(dbgs() << "Could not generate cp load for build_vector");
+ return false;
+ }
+ MIB.buildCopy(I.getOperand(0), CPLoad->getOperand(0));
+ RBI.constrainGenericRegister(I.getOperand(0).getReg(),
+ *MRI.getRegClass(CPLoad->getOperand(0).getReg()),
+ MRI);
+ I.eraseFromParent();
+ return true;
+}
+
bool AArch64InstructionSelector::selectBuildVector(
MachineInstr &I, MachineRegisterInfo &MRI) const {
assert(I.getOpcode() == TargetOpcode::G_BUILD_VECTOR);
@@ -3976,6 +4535,9 @@ bool AArch64InstructionSelector::selectBuildVector(
const LLT DstTy = MRI.getType(I.getOperand(0).getReg());
const LLT EltTy = MRI.getType(I.getOperand(1).getReg());
unsigned EltSize = EltTy.getSizeInBits();
+
+ if (tryOptConstantBuildVec(I, DstTy, MRI))
+ return true;
if (EltSize < 16 || EltSize > 64)
return false; // Don't support all element types yet.
const RegisterBank &RB = *RBI.getRegBank(I.getOperand(1).getReg(), MRI, TRI);
@@ -4081,8 +4643,8 @@ bool AArch64InstructionSelector::selectIntrinsicWithSideEffects(
return true;
}
-bool AArch64InstructionSelector::selectIntrinsic(
- MachineInstr &I, MachineRegisterInfo &MRI) const {
+bool AArch64InstructionSelector::selectIntrinsic(MachineInstr &I,
+ MachineRegisterInfo &MRI) {
unsigned IntrinID = findIntrinsicID(I);
if (!IntrinID)
return false;
@@ -4091,7 +4653,7 @@ bool AArch64InstructionSelector::selectIntrinsic(
switch (IntrinID) {
default:
break;
- case Intrinsic::aarch64_crypto_sha1h:
+ case Intrinsic::aarch64_crypto_sha1h: {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(2).getReg();
@@ -4130,28 +4692,59 @@ bool AArch64InstructionSelector::selectIntrinsic(
I.eraseFromParent();
return true;
}
- return false;
-}
+ case Intrinsic::frameaddress:
+ case Intrinsic::returnaddress: {
+ MachineFunction &MF = *I.getParent()->getParent();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
-static Optional<uint64_t> getImmedFromMO(const MachineOperand &Root) {
- auto &MI = *Root.getParent();
- auto &MBB = *MI.getParent();
- auto &MF = *MBB.getParent();
- auto &MRI = MF.getRegInfo();
- uint64_t Immed;
- if (Root.isImm())
- Immed = Root.getImm();
- else if (Root.isCImm())
- Immed = Root.getCImm()->getZExtValue();
- else if (Root.isReg()) {
- auto ValAndVReg =
- getConstantVRegValWithLookThrough(Root.getReg(), MRI, true);
- if (!ValAndVReg)
- return None;
- Immed = ValAndVReg->Value;
- } else
- return None;
- return Immed;
+ unsigned Depth = I.getOperand(2).getImm();
+ Register DstReg = I.getOperand(0).getReg();
+ RBI.constrainGenericRegister(DstReg, AArch64::GPR64RegClass, MRI);
+
+ if (Depth == 0 && IntrinID == Intrinsic::returnaddress) {
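+ // Reuse the return address if we have already materialized it for this
+ // function.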
+ if (MFReturnAddr) {
+ MIRBuilder.buildCopy({DstReg}, MFReturnAddr);
+ I.eraseFromParent();
+ return true;
+ }
+ MFI.setReturnAddressIsTaken(true);
+ MF.addLiveIn(AArch64::LR, &AArch64::GPR64spRegClass);
+ // Insert the copy from LR/X30 into the entry block, before it can be
+ // clobbered by anything.
+ MachineBasicBlock &EntryBlock = *MF.begin();
+ if (!EntryBlock.isLiveIn(AArch64::LR))
+ EntryBlock.addLiveIn(AArch64::LR);
+ MachineIRBuilder EntryBuilder(MF);
+ EntryBuilder.setInstr(*EntryBlock.begin());
+ EntryBuilder.buildCopy({DstReg}, {Register(AArch64::LR)});
+ MFReturnAddr = DstReg;
+ I.eraseFromParent();
+ return true;
+ }
+
+ MFI.setFrameAddressIsTaken(true);
+ Register FrameAddr(AArch64::FP);
+ while (Depth--) {
+ Register NextFrame = MRI.createVirtualRegister(&AArch64::GPR64spRegClass);
+ auto Ldr =
+ MIRBuilder.buildInstr(AArch64::LDRXui, {NextFrame}, {FrameAddr})
+ .addImm(0);
+ constrainSelectedInstRegOperands(*Ldr, TII, TRI, RBI);
+ FrameAddr = NextFrame;
+ }
+
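+ // Editor's note: FrameAddr now points at the requested frame record. For
+ // frameaddress that is the result itself; for returnaddress, the saved LR
+ // lives at [FP, #8], hence the load with a scaled offset of 1.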
+ if (IntrinID == Intrinsic::frameaddress)
+ MIRBuilder.buildCopy({DstReg}, {FrameAddr});
+ else {
+ MFI.setReturnAddressIsTaken(true);
+ MIRBuilder.buildInstr(AArch64::LDRXui, {DstReg}, {FrameAddr}).addImm(1);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+ }
+ return false;
}
InstructionSelector::ComplexRendererFns
@@ -4271,7 +4864,7 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
MachineInstr &MI, const MachineRegisterInfo &MRI) const {
// Always fold if there is one use, or if we're optimizing for size.
Register DefReg = MI.getOperand(0).getReg();
- if (MRI.hasOneUse(DefReg) ||
+ if (MRI.hasOneNonDBGUse(DefReg) ||
MI.getParent()->getParent()->getFunction().hasMinSize())
return true;
@@ -4283,10 +4876,21 @@ bool AArch64InstructionSelector::isWorthFoldingIntoExtendedReg(
// We have a fastpath, so folding a shift in and potentially computing it
// many times may be beneficial. Check if this is only used in memory ops.
// If it is, then we should fold.
- return all_of(MRI.use_instructions(DefReg),
+ return all_of(MRI.use_nodbg_instructions(DefReg),
[](MachineInstr &Use) { return Use.mayLoadOrStore(); });
}
+static bool isSignExtendShiftType(AArch64_AM::ShiftExtendType Type) {
+ switch (Type) {
+ case AArch64_AM::SXTB:
+ case AArch64_AM::SXTH:
+ case AArch64_AM::SXTW:
+ return true;
+ default:
+ return false;
+ }
+}
+
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectExtendedSHL(
MachineOperand &Root, MachineOperand &Base, MachineOperand &Offset,
@@ -4359,7 +4963,10 @@ AArch64InstructionSelector::selectExtendedSHL(
if (Ext == AArch64_AM::InvalidShiftExtend)
return None;
- SignExtend = Ext == AArch64_AM::SXTW;
+ SignExtend = isSignExtendShiftType(Ext) ? 1 : 0;
+ // We only support SXTW for signed extension here.
+ if (SignExtend && Ext != AArch64_AM::SXTW)
+ return None;
// Need a 32-bit wide register here.
MachineIRBuilder MIB(*MRI.getVRegDef(Root.getReg()));
@@ -4441,7 +5048,7 @@ AArch64InstructionSelector::selectAddrModeRegisterOffset(
// If this is used more than once, let's not bother folding.
// TODO: Check if they are memory ops. If they are, then we can still fold
// without having to recompute anything.
- if (!MRI.hasOneUse(Gep->getOperand(0).getReg()))
+ if (!MRI.hasOneNonDBGUse(Gep->getOperand(0).getReg()))
return None;
// Base is the GEP's LHS, offset is its RHS.
@@ -4595,14 +5202,46 @@ AArch64InstructionSelector::selectAddrModeUnscaled(MachineOperand &Root,
return None;
}
+InstructionSelector::ComplexRendererFns
+AArch64InstructionSelector::tryFoldAddLowIntoImm(MachineInstr &RootDef,
+ unsigned Size,
+ MachineRegisterInfo &MRI) const {
+ if (RootDef.getOpcode() != AArch64::G_ADD_LOW)
+ return None;
+ MachineInstr &Adrp = *MRI.getVRegDef(RootDef.getOperand(1).getReg());
+ if (Adrp.getOpcode() != AArch64::ADRP)
+ return None;
+
+ // TODO: add heuristics like isWorthFoldingADDlow() from SelectionDAG.
+ // TODO: Need to check GV's offset % size if doing offset folding into globals.
+ assert(Adrp.getOperand(1).getOffset() == 0 && "Unexpected offset in global");
+ auto GV = Adrp.getOperand(1).getGlobal();
+ if (GV->isThreadLocal())
+ return None;
+
+ auto &MF = *RootDef.getParent()->getParent();
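+ // Editor's note (assumption): the folded :lo12: offset lands in a scaled
+ // uimm12 field, so it must be a multiple of the access size; requiring the
+ // global to be at least as aligned as the access guarantees that.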
+ if (GV->getPointerAlignment(MF.getDataLayout()) < Size)
+ return None;
+
+ unsigned OpFlags = STI.ClassifyGlobalReference(GV, MF.getTarget());
+ MachineIRBuilder MIRBuilder(RootDef);
+ Register AdrpReg = Adrp.getOperand(0).getReg();
+ return {{[=](MachineInstrBuilder &MIB) { MIB.addUse(AdrpReg); },
+ [=](MachineInstrBuilder &MIB) {
+ MIB.addGlobalAddress(GV, /* Offset */ 0,
+ OpFlags | AArch64II::MO_PAGEOFF |
+ AArch64II::MO_NC);
+ }}};
+}
+
/// Select a "register plus scaled unsigned 12-bit immediate" address. The
/// "Size" argument is the size in bytes of the memory reference, which
/// determines the scale.
InstructionSelector::ComplexRendererFns
AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
unsigned Size) const {
- MachineRegisterInfo &MRI =
- Root.getParent()->getParent()->getParent()->getRegInfo();
+ MachineFunction &MF = *Root.getParent()->getParent()->getParent();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
if (!Root.isReg())
return None;
@@ -4618,6 +5257,14 @@ AArch64InstructionSelector::selectAddrModeIndexed(MachineOperand &Root,
}};
}
+ CodeModel::Model CM = MF.getTarget().getCodeModel();
+ // Check if we can fold in the ADD of small code model ADRP + ADD address.
+ if (CM == CodeModel::Small) {
+ auto OpFns = tryFoldAddLowIntoImm(*RootDef, Size, MRI);
+ if (OpFns)
+ return OpFns;
+ }
+
if (isBaseWithConstantOffset(Root, MRI)) {
MachineOperand &LHS = RootDef->getOperand(1);
MachineOperand &RHS = RootDef->getOperand(2);
@@ -4717,7 +5364,11 @@ AArch64_AM::ShiftExtendType AArch64InstructionSelector::getExtendTypeForInst(
// Handle explicit extend instructions first.
if (Opc == TargetOpcode::G_SEXT || Opc == TargetOpcode::G_SEXT_INREG) {
- unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned Size;
+ if (Opc == TargetOpcode::G_SEXT)
+ Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ else
+ Size = MI.getOperand(2).getImm();
assert(Size != 64 && "Extend from 64 bits?");
switch (Size) {
case 8:
@@ -4782,6 +5433,52 @@ Register AArch64InstructionSelector::narrowExtendRegIfNeeded(
return Copy.getReg(0);
}
+Register AArch64InstructionSelector::widenGPRBankRegIfNeeded(
+ Register Reg, unsigned WideSize, MachineIRBuilder &MIB) const {
+ assert(WideSize >= 8 && "WideSize is smaller than all possible registers?");
+ MachineRegisterInfo &MRI = *MIB.getMRI();
+ unsigned NarrowSize = MRI.getType(Reg).getSizeInBits();
+ assert(WideSize >= NarrowSize &&
+ "WideSize cannot be smaller than NarrowSize!");
+
+ // If the sizes match, just return the register.
+ //
+ // If NarrowSize is an s1, then we can select it to any size, so we'll treat
+ // it as a don't care.
+ if (NarrowSize == WideSize || NarrowSize == 1)
+ return Reg;
+
+ // Now check the register classes.
+ const RegisterBank *RB = RBI.getRegBank(Reg, MRI, TRI);
+ const TargetRegisterClass *OrigRC = getMinClassForRegBank(*RB, NarrowSize);
+ const TargetRegisterClass *WideRC = getMinClassForRegBank(*RB, WideSize);
+ assert(OrigRC && "Could not determine narrow RC?");
+ assert(WideRC && "Could not determine wide RC?");
+
+ // If the sizes differ, but the register classes are the same, there is no
+ // need to insert a SUBREG_TO_REG.
+ //
+ // For example, an s8 that's supposed to be a GPR will be selected to either
+ // a GPR32 or a GPR64 register. Note that this assumes that the s8 will
+ // always end up on a GPR32.
+ if (OrigRC == WideRC)
+ return Reg;
+
+ // We have two different register classes. Insert a SUBREG_TO_REG.
+ unsigned SubReg = 0;
+ getSubRegForClass(OrigRC, TRI, SubReg);
+ assert(SubReg && "Couldn't determine subregister?");
+
+ // Build the SUBREG_TO_REG and return the new, widened register.
+ auto SubRegToReg =
+ MIB.buildInstr(AArch64::SUBREG_TO_REG, {WideRC}, {})
+ .addImm(0)
+ .addUse(Reg)
+ .addImm(SubReg);
+ constrainSelectedInstRegOperands(*SubRegToReg, TII, TRI, RBI);
+ return SubRegToReg.getReg(0);
+}
+
/// Select an "extended register" operand. This operand folds in an extend
/// followed by an optional left shift.
InstructionSelector::ComplexRendererFns
@@ -4908,6 +5605,95 @@ bool AArch64InstructionSelector::isDef32(const MachineInstr &MI) const {
}
}
+
+// Perform fixups on the given PHI instruction's operands to force them all
+// to be the same as the destination regbank.
+static void fixupPHIOpBanks(MachineInstr &MI, MachineRegisterInfo &MRI,
+ const AArch64RegisterBankInfo &RBI) {
+ assert(MI.getOpcode() == TargetOpcode::G_PHI && "Expected a G_PHI");
+ Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = MRI.getRegBankOrNull(DstReg);
+ assert(DstRB && "Expected PHI dst to have regbank assigned");
+ MachineIRBuilder MIB(MI);
+
+ // Go through each operand and ensure it has the same regbank.
+ for (unsigned OpIdx = 1; OpIdx < MI.getNumOperands(); ++OpIdx) {
+ MachineOperand &MO = MI.getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ Register OpReg = MO.getReg();
+ const RegisterBank *RB = MRI.getRegBankOrNull(OpReg);
+ if (RB != DstRB) {
+ // Insert a cross-bank copy.
+ auto *OpDef = MRI.getVRegDef(OpReg);
+ const LLT &Ty = MRI.getType(OpReg);
+ MIB.setInsertPt(*OpDef->getParent(), std::next(OpDef->getIterator()));
+ auto Copy = MIB.buildCopy(Ty, OpReg);
+ MRI.setRegBank(Copy.getReg(0), *DstRB);
+ MO.setReg(Copy.getReg(0));
+ }
+ }
+}
+
+void AArch64InstructionSelector::processPHIs(MachineFunction &MF) {
+ // We're looking for PHIs, build a list so we don't invalidate iterators.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ SmallVector<MachineInstr *, 32> Phis;
+ for (auto &BB : MF) {
+ for (auto &MI : BB) {
+ if (MI.getOpcode() == TargetOpcode::G_PHI)
+ Phis.emplace_back(&MI);
+ }
+ }
+
+ for (auto *MI : Phis) {
+ // We need to do some work here if the operand types are < 32 bit and they
+ // are split across fpr/gpr banks. Since all types < 32b on gpr end up
+ // being assigned gpr32 regclasses, we can end up with PHIs here which try
+ // to select between a gpr32 and an fpr16. Ideally RBS shouldn't be
+ // assigning heterogeneous regbanks to operands, but we still need to be
+ // able to deal with it here.
+ //
+ // To fix this, if we have a gpr-bank operand < 32b in size and at least
+ // one other operand is on the fpr bank, then we add cross-bank copies
+ // to homogenize the operand banks. For simplicity the bank that we choose
+ // to settle on is whatever bank the def operand has. For example:
+ //
+ // %endbb:
+ // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2:fpr(s16), %bb2
+ // =>
+ // %bb2:
+ // ...
+ // %in2_copy:gpr(s16) = COPY %in2:fpr(s16)
+ // ...
+ // %endbb:
+ // %dst:gpr(s16) = G_PHI %in1:gpr(s16), %bb1, %in2_copy:gpr(s16), %bb2
+ bool HasGPROp = false, HasFPROp = false;
+ for (unsigned OpIdx = 1; OpIdx < MI->getNumOperands(); ++OpIdx) {
+ const auto &MO = MI->getOperand(OpIdx);
+ if (!MO.isReg())
+ continue;
+ const LLT &Ty = MRI.getType(MO.getReg());
+ if (!Ty.isValid() || !Ty.isScalar())
+ break;
+ if (Ty.getSizeInBits() >= 32)
+ break;
+ const RegisterBank *RB = MRI.getRegBankOrNull(MO.getReg());
+ // If for some reason we don't have a regbank yet, don't try anything.
+ if (!RB)
+ break;
+
+ if (RB->getID() == AArch64::GPRRegBankID)
+ HasGPROp = true;
+ else
+ HasFPROp = true;
+ }
+ // If the operand regbanks are heterogeneous, fix them up.
+ if (HasGPROp && HasFPROp)
+ fixupPHIOpBanks(*MI, MRI, RBI);
+ }
+}
+
namespace llvm {
InstructionSelector *
createAArch64InstructionSelector(const AArch64TargetMachine &TM,
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
index 95719a35c6da..2eaec0b970fa 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp
@@ -30,7 +30,8 @@ using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
-AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
+AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST)
+ : ST(&ST) {
using namespace TargetOpcode;
const LLT p0 = LLT::pointer(0, 64);
const LLT s1 = LLT::scalar(1);
@@ -52,13 +53,15 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
const LLT v2s64 = LLT::vector(2, 64);
const LLT v2p0 = LLT::vector(2, p0);
+ const TargetMachine &TM = ST.getTargetLowering()->getTargetMachine();
+
// FIXME: support subtargets which have neon/fp-armv8 disabled.
if (!ST.hasNEON() || !ST.hasFPARMv8()) {
computeTables();
return;
}
- getActionDefinitionsBuilder(G_IMPLICIT_DEF)
+ getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
.legalFor({p0, s1, s8, s16, s32, s64, v2s32, v4s32, v2s64})
.clampScalar(0, s1, s64)
.widenScalarToNextPow2(0, 8)
@@ -105,10 +108,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
.minScalarSameAs(1, 0);
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalFor({{p0, s64}})
+ .legalFor({{p0, s64}, {v2p0, v2s64}})
.clampScalar(1, s64, s64);
- getActionDefinitionsBuilder(G_PTR_MASK).legalFor({p0});
+ getActionDefinitionsBuilder(G_PTRMASK).legalFor({{p0, s64}});
getActionDefinitionsBuilder({G_SDIV, G_UDIV})
.legalFor({s32, s64})
@@ -375,7 +378,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
getActionDefinitionsBuilder(G_TRUNC).alwaysLegal();
- getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+ getActionDefinitionsBuilder(G_SEXT_INREG)
+ .legalFor({s32, s64})
+ .lower();
// FP conversions
getActionDefinitionsBuilder(G_FPTRUNC).legalFor(
@@ -413,7 +418,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
// Pointer-handling
getActionDefinitionsBuilder(G_FRAME_INDEX).legalFor({p0});
- getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
+
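+ // In the small code model, split G_GLOBAL_VALUE into ADRP + G_ADD_LOW (see
+ // legalizeSmallCMGlobalValue) so the low part can later be folded into
+ // addressing modes.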
+ if (TM.getCodeModel() == CodeModel::Small)
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).custom();
+ else
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE).legalFor({p0});
getActionDefinitionsBuilder(G_PTRTOINT)
.legalForCartesianProduct({s1, s8, s16, s32, s64}, {p0})
@@ -617,10 +626,11 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {
verify(*ST.getInstrInfo());
}
-bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
+bool AArch64LegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
switch (MI.getOpcode()) {
default:
// No idea what to do.
@@ -634,19 +644,53 @@ bool AArch64LegalizerInfo::legalizeCustom(MachineInstr &MI,
case TargetOpcode::G_ASHR:
case TargetOpcode::G_LSHR:
return legalizeShlAshrLshr(MI, MRI, MIRBuilder, Observer);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return legalizeSmallCMGlobalValue(MI, MRI, MIRBuilder, Observer);
}
llvm_unreachable("expected switch to return");
}
+bool AArch64LegalizerInfo::legalizeSmallCMGlobalValue(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const {
+ assert(MI.getOpcode() == TargetOpcode::G_GLOBAL_VALUE);
+ // We do this custom legalization to convert G_GLOBAL_VALUE into target ADRP +
+ // G_ADD_LOW instructions.
+ // By splitting this here, we can optimize accesses in the small code model
+ // by folding the G_ADD_LOW into the load/store offset.
+ auto GV = MI.getOperand(1).getGlobal();
+ if (GV->isThreadLocal())
+ return true; // Don't want to modify TLS vars.
+
+ auto &TM = ST->getTargetLowering()->getTargetMachine();
+ unsigned OpFlags = ST->ClassifyGlobalReference(GV, TM);
+
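+ // Editor's note: accesses through the GOT are left in generic form for the
+ // instruction selector to materialize.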
+ if (OpFlags & AArch64II::MO_GOT)
+ return true;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ auto ADRP = MIRBuilder.buildInstr(AArch64::ADRP, {LLT::pointer(0, 64)}, {})
+ .addGlobalAddress(GV, 0, OpFlags | AArch64II::MO_PAGE);
+ // Set the regclass on the dest reg too.
+ MRI.setRegClass(ADRP.getReg(0), &AArch64::GPR64RegClass);
+
+ MIRBuilder.buildInstr(AArch64::G_ADD_LOW, {DstReg}, {ADRP})
+ .addGlobalAddress(GV, 0,
+ OpFlags | AArch64II::MO_PAGEOFF | AArch64II::MO_NC);
+ MI.eraseFromParent();
+ return true;
+}
+
bool AArch64LegalizerInfo::legalizeIntrinsic(
- MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
+ LegalizerHelper &Helper, MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
switch (MI.getIntrinsicID()) {
case Intrinsic::memcpy:
case Intrinsic::memset:
case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) ==
LegalizerHelper::UnableToLegalize)
return false;
MI.eraseFromParent();
@@ -675,7 +719,6 @@ bool AArch64LegalizerInfo::legalizeShlAshrLshr(
if (Amount > 31)
return true; // This will have to remain a register variant.
assert(MRI.getType(AmtReg).getSizeInBits() == 32);
- MIRBuilder.setInstr(MI);
auto ExtCst = MIRBuilder.buildZExt(LLT::scalar(64), AmtReg);
MI.getOperand(2).setReg(ExtCst.getReg(0));
return true;
@@ -704,17 +747,15 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
return false;
}
- MIRBuilder.setInstr(MI);
unsigned PtrSize = ValTy.getElementType().getSizeInBits();
const LLT NewTy = LLT::vector(ValTy.getNumElements(), PtrSize);
auto &MMO = **MI.memoperands_begin();
if (MI.getOpcode() == TargetOpcode::G_STORE) {
- auto Bitcast = MIRBuilder.buildBitcast({NewTy}, {ValReg});
- MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1).getReg(), MMO);
+ auto Bitcast = MIRBuilder.buildBitcast(NewTy, ValReg);
+ MIRBuilder.buildStore(Bitcast.getReg(0), MI.getOperand(1), MMO);
} else {
- Register NewReg = MRI.createGenericVirtualRegister(NewTy);
- auto NewLoad = MIRBuilder.buildLoad(NewReg, MI.getOperand(1).getReg(), MMO);
- MIRBuilder.buildBitcast({ValReg}, {NewLoad});
+ auto NewLoad = MIRBuilder.buildLoad(NewTy, MI.getOperand(1), MMO);
+ MIRBuilder.buildBitcast(ValReg, NewLoad);
}
MI.eraseFromParent();
return true;
@@ -723,9 +764,8 @@ bool AArch64LegalizerInfo::legalizeLoadStore(
bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder) const {
- MIRBuilder.setInstr(MI);
MachineFunction &MF = MIRBuilder.getMF();
- unsigned Align = MI.getOperand(2).getImm();
+ Align Alignment(MI.getOperand(2).getImm());
Register Dst = MI.getOperand(0).getReg();
Register ListPtr = MI.getOperand(1).getReg();
@@ -733,21 +773,19 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
const unsigned PtrSize = PtrTy.getSizeInBits() / 8;
- Register List = MRI.createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildLoad(
- List, ListPtr,
+ const Align PtrAlign = Align(PtrSize);
+ auto List = MIRBuilder.buildLoad(
+ PtrTy, ListPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
- PtrSize, /* Align = */ PtrSize));
+ PtrSize, PtrAlign));
- Register DstPtr;
- if (Align > PtrSize) {
+ MachineInstrBuilder DstPtr;
+ if (Alignment > PtrAlign) {
// Realign the list to the actual required alignment.
- auto AlignMinus1 = MIRBuilder.buildConstant(IntPtrTy, Align - 1);
-
+ auto AlignMinus1 =
+ MIRBuilder.buildConstant(IntPtrTy, Alignment.value() - 1);
auto ListTmp = MIRBuilder.buildPtrAdd(PtrTy, List, AlignMinus1.getReg(0));
-
- DstPtr = MRI.createGenericVirtualRegister(PtrTy);
- MIRBuilder.buildPtrMask(DstPtr, ListTmp, Log2_64(Align));
+ DstPtr = MIRBuilder.buildMaskLowPtrBits(PtrTy, ListTmp, Log2(Alignment));
} else
DstPtr = List;
@@ -755,16 +793,16 @@ bool AArch64LegalizerInfo::legalizeVaArg(MachineInstr &MI,
MIRBuilder.buildLoad(
Dst, DstPtr,
*MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOLoad,
- ValSize, std::max(Align, PtrSize)));
+ ValSize, std::max(Alignment, PtrAlign)));
- auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrSize));
+ auto Size = MIRBuilder.buildConstant(IntPtrTy, alignTo(ValSize, PtrAlign));
auto NewList = MIRBuilder.buildPtrAdd(PtrTy, DstPtr, Size.getReg(0));
- MIRBuilder.buildStore(
- NewList, ListPtr,
- *MF.getMachineMemOperand(MachinePointerInfo(), MachineMemOperand::MOStore,
- PtrSize, /* Align = */ PtrSize));
+ MIRBuilder.buildStore(NewList, ListPtr,
+ *MF.getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOStore,
+ PtrSize, PtrAlign));
MI.eraseFromParent();
return true;
diff --git a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
index 15161bab466c..1cb24559c1ab 100644
--- a/llvm/lib/Target/AArch64/AArch64LegalizerInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.h
@@ -27,12 +27,10 @@ class AArch64LegalizerInfo : public LegalizerInfo {
public:
AArch64LegalizerInfo(const AArch64Subtarget &ST);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
private:
bool legalizeVaArg(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -43,6 +41,11 @@ private:
bool legalizeShlAshrLshr(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &MIRBuilder,
GISelChangeObserver &Observer) const;
+
+ bool legalizeSmallCMGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &MIRBuilder,
+ GISelChangeObserver &Observer) const;
+ const AArch64Subtarget *ST;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
new file mode 100644
index 000000000000..baa8515baf3e
--- /dev/null
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PostLegalizerCombiner.cpp
@@ -0,0 +1,507 @@
+//===- AArch64PostLegalizerCombiner.cpp ----------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This performs post-legalization combines on generic MachineInstrs.
+//
+// Any combine that this pass performs must preserve instruction legality.
+// Combines unconcerned with legality should be handled by the
+// PreLegalizerCombiner instead.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AArch64TargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+
+#define DEBUG_TYPE "aarch64-postlegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+/// Represents a pseudo instruction which replaces a G_SHUFFLE_VECTOR.
+///
+/// Used for matching target-supported shuffles before codegen.
+struct ShuffleVectorPseudo {
+ unsigned Opc; ///< Opcode for the instruction. (E.g. G_ZIP1)
+ Register Dst; ///< Destination register.
+ SmallVector<SrcOp, 2> SrcOps; ///< Source registers.
+ ShuffleVectorPseudo(unsigned Opc, Register Dst,
+ std::initializer_list<SrcOp> SrcOps)
+ : Opc(Opc), Dst(Dst), SrcOps(SrcOps) {}
+ ShuffleVectorPseudo() {}
+};
+
+/// \returns The splat index of a G_SHUFFLE_VECTOR \p MI when \p MI is a splat.
+/// If \p MI is not a splat, returns None.
+static Optional<int> getSplatIndex(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR &&
+ "Only G_SHUFFLE_VECTOR can have a splat index!");
+ ArrayRef<int> Mask = MI.getOperand(3).getShuffleMask();
+ auto FirstDefinedIdx = find_if(Mask, [](int Elt) { return Elt >= 0; });
+
+ // If all elements are undefined, this shuffle can be considered a splat.
+ // Return 0 to give callers the best chance to simplify.
+ if (FirstDefinedIdx == Mask.end())
+ return 0;
+
+ // Make sure all remaining elements are either undef or the same
+ // as the first non-undef value.
+ int SplatValue = *FirstDefinedIdx;
+ if (any_of(make_range(std::next(FirstDefinedIdx), Mask.end()),
+ [&SplatValue](int Elt) { return Elt >= 0 && Elt != SplatValue; }))
+ return None;
+
+ return SplatValue;
+}
+
+/// Check if a vector shuffle corresponds to a REV instruction with the
+/// specified blocksize.
+static bool isREVMask(ArrayRef<int> M, unsigned EltSize, unsigned NumElts,
+ unsigned BlockSize) {
+ assert((BlockSize == 16 || BlockSize == 32 || BlockSize == 64) &&
+ "Only possible block sizes for REV are: 16, 32, 64");
+ assert(EltSize != 64 && "EltSize cannot be 64 for REV mask.");
+
+ unsigned BlockElts = M[0] + 1;
+
+ // If the first shuffle index is UNDEF, be optimistic.
+ if (M[0] < 0)
+ BlockElts = BlockSize / EltSize;
+
+ if (BlockSize <= EltSize || BlockSize != BlockElts * EltSize)
+ return false;
+
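+ // Within each block of BlockElts elements, the mask must list the elements
+ // in reverse order.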
+ for (unsigned i = 0; i < NumElts; ++i) {
+ // Ignore undef indices.
+ if (M[i] < 0)
+ continue;
+ if (static_cast<unsigned>(M[i]) !=
+ (i - i % BlockElts) + (BlockElts - 1 - i % BlockElts))
+ return false;
+ }
+
+ return true;
+}
+
+/// Determines if \p M is a shuffle vector mask for a TRN of \p NumElts.
+/// Whether or not G_TRN1 or G_TRN2 should be used is stored in \p WhichResult.
+static bool isTRNMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ if (NumElts % 2 != 0)
+ return false;
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != i + WhichResult) ||
+ (M[i + 1] >= 0 &&
+ static_cast<unsigned>(M[i + 1]) != i + NumElts + WhichResult))
+ return false;
+ }
+ return true;
+}
+
+/// Check if a G_EXT instruction can handle a shuffle mask \p M when the vector
+/// sources of the shuffle are different.
+static Optional<std::pair<bool, uint64_t>> getExtMask(ArrayRef<int> M,
+ unsigned NumElts) {
+ // Look for the first non-undef element.
+ auto FirstRealElt = find_if(M, [](int Elt) { return Elt >= 0; });
+ if (FirstRealElt == M.end())
+ return None;
+
+ // Use APInt to handle overflow when calculating expected element.
+ unsigned MaskBits = APInt(32, NumElts * 2).logBase2();
+ APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1);
+
+ // The following shuffle indices must be the successive elements after the
+ // first real element.
+ if (any_of(
+ make_range(std::next(FirstRealElt), M.end()),
+ [&ExpectedElt](int Elt) { return Elt != ExpectedElt++ && Elt >= 0; }))
+ return None;
+
+ // The index of an EXT is the first element if it is not UNDEF.
+ // Watch out for the beginning UNDEFs. The EXT index should be the expected
+ // value of the first element. E.g.
+ // <-1, -1, 3, ...> is treated as <1, 2, 3, ...>.
+ // <-1, -1, 0, 1, ...> is treated as <2*NumElts-2, 2*NumElts-1, 0, 1, ...>.
+ // ExpectedElt is the last mask index plus 1.
+ uint64_t Imm = ExpectedElt.getZExtValue();
+ bool ReverseExt = false;
+
+ // There are two different cases that require reversing the input vectors.
+ // For example, for vector <4 x i32> we have the following cases,
+ // Case 1: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, -1, 0>)
+ // Case 2: shufflevector(<4 x i32>,<4 x i32>,<-1, -1, 7, 0>)
+ // For both cases, we finally use mask <5, 6, 7, 0>, which requires
+ // to reverse two input vectors.
+ if (Imm < NumElts)
+ ReverseExt = true;
+ else
+ Imm -= NumElts;
+ return std::make_pair(ReverseExt, Imm);
+}
+
+/// Determines if \p M is a shuffle vector mask for a UZP of \p NumElts.
+/// Whether or not G_UZP1 or G_UZP2 should be used is stored in \p WhichResult.
+static bool isUZPMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ for (unsigned i = 0; i != NumElts; ++i) {
+ // Skip undef indices.
+ if (M[i] < 0)
+ continue;
+ if (static_cast<unsigned>(M[i]) != 2 * i + WhichResult)
+ return false;
+ }
+ return true;
+}
+
+/// \return true if \p M is a zip mask for a shuffle vector of \p NumElts.
+/// Whether or not G_ZIP1 or G_ZIP2 should be used is stored in \p WhichResult.
+static bool isZipMask(ArrayRef<int> M, unsigned NumElts,
+ unsigned &WhichResult) {
+ if (NumElts % 2 != 0)
+ return false;
+
+ // 0 means use ZIP1, 1 means use ZIP2.
+ WhichResult = (M[0] == 0 ? 0 : 1);
+ unsigned Idx = WhichResult * NumElts / 2;
+ for (unsigned i = 0; i != NumElts; i += 2) {
+ if ((M[i] >= 0 && static_cast<unsigned>(M[i]) != Idx) ||
+ (M[i + 1] >= 0 && static_cast<unsigned>(M[i + 1]) != Idx + NumElts))
+ return false;
+ Idx += 1;
+ }
+ return true;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with a
+/// G_REV instruction. Returns the appropriate replacement pseudo in \p MatchInfo.
+static bool matchREV(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(Dst);
+ unsigned EltSize = Ty.getScalarSizeInBits();
+
+ // Element size for a rev cannot be 64.
+ if (EltSize == 64)
+ return false;
+
+ unsigned NumElts = Ty.getNumElements();
+
+ // Try to produce G_REV64
+ if (isREVMask(ShuffleMask, EltSize, NumElts, 64)) {
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_REV64, Dst, {Src});
+ return true;
+ }
+
+ // TODO: Produce G_REV32 and G_REV16 once we have proper legalization support.
+ // This should be identical to above, but with a constant 32 and constant
+ // 16.
+ return false;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_TRN1 or G_TRN2 instruction.
+static bool matchTRN(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isTRNMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_TRN1 : AArch64::G_TRN2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+/// \return true if a G_SHUFFLE_VECTOR instruction \p MI can be replaced with
+/// a G_UZP1 or G_UZP2 instruction.
+///
+/// \param [in] MI - The shuffle vector instruction.
+/// \param [out] MatchInfo - Either G_UZP1 or G_UZP2 on success.
+static bool matchUZP(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isUZPMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_UZP1 : AArch64::G_UZP2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+static bool matchZip(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ unsigned WhichResult;
+ ArrayRef<int> ShuffleMask = MI.getOperand(3).getShuffleMask();
+ Register Dst = MI.getOperand(0).getReg();
+ unsigned NumElts = MRI.getType(Dst).getNumElements();
+ if (!isZipMask(ShuffleMask, NumElts, WhichResult))
+ return false;
+ unsigned Opc = (WhichResult == 0) ? AArch64::G_ZIP1 : AArch64::G_ZIP2;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ MatchInfo = ShuffleVectorPseudo(Opc, Dst, {V1, V2});
+ return true;
+}
+
+/// Helper function for matchDup.
+static bool matchDupFromInsertVectorElt(int Lane, MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ if (Lane != 0)
+ return false;
+
+ // Try to match a vector splat operation into a dup instruction.
+ // We're looking for this pattern:
+ //
+ // %scalar:gpr(s64) = COPY $x0
+ // %undef:fpr(<2 x s64>) = G_IMPLICIT_DEF
+ // %cst0:gpr(s32) = G_CONSTANT i32 0
+ // %zerovec:fpr(<2 x s32>) = G_BUILD_VECTOR %cst0(s32), %cst0(s32)
+ // %ins:fpr(<2 x s64>) = G_INSERT_VECTOR_ELT %undef, %scalar(s64), %cst0(s32)
+ // %splat:fpr(<2 x s64>) = G_SHUFFLE_VECTOR %ins(<2 x s64>), %undef, %zerovec(<2 x s32>)
+ //
+ // ...into:
+ // %splat = G_DUP %scalar
+
+ // Begin matching the insert.
+ auto *InsMI = getOpcodeDef(TargetOpcode::G_INSERT_VECTOR_ELT,
+ MI.getOperand(1).getReg(), MRI);
+ if (!InsMI)
+ return false;
+ // Match the undef vector operand.
+ if (!getOpcodeDef(TargetOpcode::G_IMPLICIT_DEF, InsMI->getOperand(1).getReg(),
+ MRI))
+ return false;
+
+ // Match the index constant 0.
+ int64_t Index = 0;
+ if (!mi_match(InsMI->getOperand(3).getReg(), MRI, m_ICst(Index)) || Index)
+ return false;
+
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(),
+ {InsMI->getOperand(2).getReg()});
+ return true;
+}
+
+/// Helper function for matchDup.
+static bool matchDupFromBuildVector(int Lane, MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(Lane >= 0 && "Expected positive lane?");
+ // Test if the LHS is a BUILD_VECTOR. If it is, then we can just reference the
+ // lane's definition directly.
+ auto *BuildVecMI = getOpcodeDef(TargetOpcode::G_BUILD_VECTOR,
+ MI.getOperand(1).getReg(), MRI);
+ if (!BuildVecMI)
+ return false;
+ Register Reg = BuildVecMI->getOperand(Lane + 1).getReg();
+ MatchInfo =
+ ShuffleVectorPseudo(AArch64::G_DUP, MI.getOperand(0).getReg(), {Reg});
+ return true;
+}
+
+static bool matchDup(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ auto MaybeLane = getSplatIndex(MI);
+ if (!MaybeLane)
+ return false;
+ int Lane = *MaybeLane;
+ // If the splat index is undef, treat it as a splat of lane 0 so it can
+ // still be matched as a DUP.
+ if (Lane < 0)
+ Lane = 0;
+ if (matchDupFromInsertVectorElt(Lane, MI, MRI, MatchInfo))
+ return true;
+ if (matchDupFromBuildVector(Lane, MI, MRI, MatchInfo))
+ return true;
+ return false;
+}
+
+static bool matchEXT(MachineInstr &MI, MachineRegisterInfo &MRI,
+ ShuffleVectorPseudo &MatchInfo) {
+ assert(MI.getOpcode() == TargetOpcode::G_SHUFFLE_VECTOR);
+ Register Dst = MI.getOperand(0).getReg();
+ auto ExtInfo = getExtMask(MI.getOperand(3).getShuffleMask(),
+ MRI.getType(Dst).getNumElements());
+ if (!ExtInfo)
+ return false;
+ bool ReverseExt;
+ uint64_t Imm;
+ std::tie(ReverseExt, Imm) = *ExtInfo;
+ Register V1 = MI.getOperand(1).getReg();
+ Register V2 = MI.getOperand(2).getReg();
+ if (ReverseExt)
+ std::swap(V1, V2);
+ uint64_t ExtFactor = MRI.getType(V1).getScalarSizeInBits() / 8;
+ Imm *= ExtFactor;
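+ // The EXT immediate is a byte offset, so scale the element index by the
+ // element size in bytes.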
+ MatchInfo = ShuffleVectorPseudo(AArch64::G_EXT, Dst, {V1, V2, Imm});
+ return true;
+}
+
+/// Replace a G_SHUFFLE_VECTOR instruction with a pseudo.
+/// \p Opc is the opcode to use. \p MI is the G_SHUFFLE_VECTOR.
+static bool applyShuffleVectorPseudo(MachineInstr &MI,
+ ShuffleVectorPseudo &MatchInfo) {
+ MachineIRBuilder MIRBuilder(MI);
+ MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst}, MatchInfo.SrcOps);
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Replace a G_SHUFFLE_VECTOR instruction with G_EXT.
+/// Special-cased because the constant operand must be emitted as a G_CONSTANT
+/// for the imported tablegen patterns to work.
+static bool applyEXT(MachineInstr &MI, ShuffleVectorPseudo &MatchInfo) {
+ MachineIRBuilder MIRBuilder(MI);
+ // Tablegen patterns expect an i32 G_CONSTANT as the final op.
+ auto Cst =
+ MIRBuilder.buildConstant(LLT::scalar(32), MatchInfo.SrcOps[2].getImm());
+ MIRBuilder.buildInstr(MatchInfo.Opc, {MatchInfo.Dst},
+ {MatchInfo.SrcOps[0], MatchInfo.SrcOps[1], Cst});
+ MI.eraseFromParent();
+ return true;
+}
+
+#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AArch64GenPostLegalizeGICombiner.inc"
+#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AArch64GenPostLegalizeGICombiner.inc"
+#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AArch64PostLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AArch64GenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AArch64PostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB,
+ MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ virtual bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AArch64PostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ const auto *LI =
+ MI.getParent()->getParent()->getSubtarget().getLegalizerInfo();
+ CombinerHelper Helper(Observer, B, KB, MDT, LI);
+ AArch64GenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+ return Generated.tryCombineAll(Observer, MI, B, Helper);
+}
+
+#define AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AArch64GenPostLegalizeGICombiner.inc"
+#undef AARCH64POSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+class AArch64PostLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AArch64PostLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AArch64PostLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AArch64PostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AArch64PostLegalizerCombiner::AArch64PostLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAArch64PostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AArch64PostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ assert(MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Legalized) &&
+ "Expected a legalized function?");
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AArch64PostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AArch64PostLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AArch64PostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AArch64 MachineInstrs after legalization", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AArch64PostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AArch64 MachineInstrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAArch64PostLegalizeCombiner(bool IsOptNone) {
+ return new AArch64PostLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
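[editor note] The matchDup combine above relies on getSplatIndex to tell whether the G_SHUFFLE_VECTOR mask repeats a single lane (with an undef splat reported as a negative index). A minimal standalone sketch of that mask check, using a hypothetical helper rather than LLVM's own getSplatIndex, with -1 marking an undef lane as in shuffle-mask conventions:

    #include <optional>
    #include <vector>

    // Return the lane every defined mask element refers to, or nullopt if the
    // mask is not a splat. Undef elements (-1) are ignored, so an all-undef
    // mask reports -1 and callers may pick lane 0, as matchDup does above.
    std::optional<int> getSplatLane(const std::vector<int> &Mask) {
      int Splat = -1;
      for (int Elt : Mask) {
        if (Elt < 0)
          continue;            // undef lane, compatible with any splat
        if (Splat < 0)
          Splat = Elt;         // first defined lane fixes the candidate
        else if (Elt != Splat)
          return std::nullopt; // two different lanes -> not a splat
      }
      return Splat;            // may still be -1 if every lane was undef
    }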
diff --git a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
index 230fd514d022..9a1f200d5222 100644
--- a/llvm/lib/Target/AArch64/AArch64PreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64PreLegalizerCombiner.cpp
@@ -27,28 +27,62 @@
using namespace llvm;
using namespace MIPatternMatch;
+/// Return true if a G_FCONSTANT instruction is known to be better-represented
+/// as a G_CONSTANT.
+static bool matchFConstantToConstant(MachineInstr &MI,
+ MachineRegisterInfo &MRI) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
+ Register DstReg = MI.getOperand(0).getReg();
+ const unsigned DstSize = MRI.getType(DstReg).getSizeInBits();
+ if (DstSize != 32 && DstSize != 64)
+ return false;
+
+ // When we're storing a value, it doesn't matter what register bank it's on.
+ // Since not all floating point constants can be materialized using a fmov,
+ // it makes more sense to just use a GPR.
+ return all_of(MRI.use_nodbg_instructions(DstReg),
+ [](const MachineInstr &Use) { return Use.mayStore(); });
+}
+
+/// Change a G_FCONSTANT into a G_CONSTANT.
+static void applyFConstantToConstant(MachineInstr &MI) {
+ assert(MI.getOpcode() == TargetOpcode::G_FCONSTANT);
+ MachineIRBuilder MIB(MI);
+ const APFloat &ImmValAPF = MI.getOperand(1).getFPImm()->getValueAPF();
+ MIB.buildConstant(MI.getOperand(0).getReg(), ImmValAPF.bitcastToAPInt());
+ MI.eraseFromParent();
+}
+
+class AArch64PreLegalizerCombinerHelperState {
+protected:
+ CombinerHelper &Helper;
+
+public:
+ AArch64PreLegalizerCombinerHelperState(CombinerHelper &Helper)
+ : Helper(Helper) {}
+};
+
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
-#include "AArch64GenGICombiner.inc"
+#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
namespace {
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
-#include "AArch64GenGICombiner.inc"
+#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
class AArch64PreLegalizerCombinerInfo : public CombinerInfo {
GISelKnownBits *KB;
MachineDominatorTree *MDT;
+ AArch64GenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
public:
- AArch64GenPreLegalizerCombinerHelper Generated;
-
AArch64PreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
GISelKnownBits *KB, MachineDominatorTree *MDT)
: CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
/*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
KB(KB), MDT(MDT) {
- if (!Generated.parseCommandLineOption())
+ if (!GeneratedRuleCfg.parseCommandLineOption())
report_fatal_error("Invalid rule identifier");
}
@@ -60,6 +94,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
MachineInstr &MI,
MachineIRBuilder &B) const {
CombinerHelper Helper(Observer, B, KB, MDT);
+ AArch64GenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg, Helper);
switch (MI.getOpcode()) {
case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS:
@@ -79,7 +114,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
}
}
- if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ if (Generated.tryCombineAll(Observer, MI, B))
return true;
switch (MI.getOpcode()) {
@@ -93,7 +128,7 @@ bool AArch64PreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
}
#define AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
-#include "AArch64GenGICombiner.inc"
+#include "AArch64GenPreLegalizeGICombiner.inc"
#undef AARCH64PRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
// Pass boilerplate
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
index 40efac261fd9..7e3ff1948dad 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.cpp
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.cpp
@@ -38,58 +38,58 @@ using namespace llvm;
AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
: AArch64GenRegisterBankInfo() {
- static bool AlreadyInit = false;
- // We have only one set of register banks, whatever the subtarget
- // is. Therefore, the initialization of the RegBanks table should be
- // done only once. Indeed the table of all register banks
- // (AArch64::RegBanks) is unique in the compiler. At some point, it
- // will get tablegen'ed and the whole constructor becomes empty.
- if (AlreadyInit)
- return;
- AlreadyInit = true;
-
- const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
- (void)RBGPR;
- assert(&AArch64::GPRRegBank == &RBGPR &&
- "The order in RegBanks is messed up");
-
- const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
- (void)RBFPR;
- assert(&AArch64::FPRRegBank == &RBFPR &&
- "The order in RegBanks is messed up");
-
- const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID);
- (void)RBCCR;
- assert(&AArch64::CCRegBank == &RBCCR && "The order in RegBanks is messed up");
-
- // The GPR register bank is fully defined by all the registers in
- // GR64all + its subclasses.
- assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
-
- // The FPR register bank is fully defined by all the registers in
- // GR64all + its subclasses.
- assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
- "Subclass not added?");
- assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
- "Subclass not added?");
- assert(RBFPR.getSize() == 512 &&
- "FPRs should hold up to 512-bit via QQQQ sequence");
-
- assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
- "Class not added?");
- assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
-
- // Check that the TableGen'ed like file is in sync we our expectations.
- // First, the Idx.
- assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
- {PMI_GPR32, PMI_GPR64}) &&
- "PartialMappingIdx's are incorrectly ordered");
- assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
- {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
- PMI_FPR256, PMI_FPR512}) &&
- "PartialMappingIdx's are incorrectly ordered");
+ static llvm::once_flag InitializeRegisterBankFlag;
+
+ static auto InitializeRegisterBankOnce = [&]() {
+ // We have only one set of register banks, whatever the subtarget
+ // is. Therefore, the initialization of the RegBanks table should be
+ // done only once. Indeed the table of all register banks
+ // (AArch64::RegBanks) is unique in the compiler. At some point, it
+ // will get tablegen'ed and the whole constructor becomes empty.
+
+ const RegisterBank &RBGPR = getRegBank(AArch64::GPRRegBankID);
+ (void)RBGPR;
+ assert(&AArch64::GPRRegBank == &RBGPR &&
+ "The order in RegBanks is messed up");
+
+ const RegisterBank &RBFPR = getRegBank(AArch64::FPRRegBankID);
+ (void)RBFPR;
+ assert(&AArch64::FPRRegBank == &RBFPR &&
+ "The order in RegBanks is messed up");
+
+ const RegisterBank &RBCCR = getRegBank(AArch64::CCRegBankID);
+ (void)RBCCR;
+ assert(&AArch64::CCRegBank == &RBCCR &&
+ "The order in RegBanks is messed up");
+
+ // The GPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
+ assert(RBGPR.covers(*TRI.getRegClass(AArch64::GPR32RegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 64 && "GPRs should hold up to 64-bit");
+
+ // The FPR register bank is fully defined by all the registers in
+ // GR64all + its subclasses.
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::QQRegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.covers(*TRI.getRegClass(AArch64::FPR64RegClassID)) &&
+ "Subclass not added?");
+ assert(RBFPR.getSize() == 512 &&
+ "FPRs should hold up to 512-bit via QQQQ sequence");
+
+ assert(RBCCR.covers(*TRI.getRegClass(AArch64::CCRRegClassID)) &&
+ "Class not added?");
+ assert(RBCCR.getSize() == 32 && "CCR should hold up to 32-bit");
+
+    // Check that the TableGen'ed file is in sync with our expectations.
+ // First, the Idx.
+ assert(checkPartialMappingIdx(PMI_FirstGPR, PMI_LastGPR,
+ {PMI_GPR32, PMI_GPR64}) &&
+ "PartialMappingIdx's are incorrectly ordered");
+ assert(checkPartialMappingIdx(PMI_FirstFPR, PMI_LastFPR,
+ {PMI_FPR16, PMI_FPR32, PMI_FPR64, PMI_FPR128,
+ PMI_FPR256, PMI_FPR512}) &&
+ "PartialMappingIdx's are incorrectly ordered");
// Now, the content.
// Check partial mapping.
#define CHECK_PARTIALMAP(Idx, ValStartIdx, ValLength, RB) \
@@ -99,14 +99,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#Idx " is incorrectly initialized"); \
} while (false)
- CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
- CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
- CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
- CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
+ CHECK_PARTIALMAP(PMI_GPR32, 0, 32, RBGPR);
+ CHECK_PARTIALMAP(PMI_GPR64, 0, 64, RBGPR);
+ CHECK_PARTIALMAP(PMI_FPR16, 0, 16, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR32, 0, 32, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR64, 0, 64, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR128, 0, 128, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR256, 0, 256, RBFPR);
+ CHECK_PARTIALMAP(PMI_FPR512, 0, 512, RBFPR);
// Check value mapping.
#define CHECK_VALUEMAP_IMPL(RBName, Size, Offset) \
@@ -119,14 +119,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
#define CHECK_VALUEMAP(RBName, Size) CHECK_VALUEMAP_IMPL(RBName, Size, 0)
- CHECK_VALUEMAP(GPR, 32);
- CHECK_VALUEMAP(GPR, 64);
- CHECK_VALUEMAP(FPR, 16);
- CHECK_VALUEMAP(FPR, 32);
- CHECK_VALUEMAP(FPR, 64);
- CHECK_VALUEMAP(FPR, 128);
- CHECK_VALUEMAP(FPR, 256);
- CHECK_VALUEMAP(FPR, 512);
+ CHECK_VALUEMAP(GPR, 32);
+ CHECK_VALUEMAP(GPR, 64);
+ CHECK_VALUEMAP(FPR, 16);
+ CHECK_VALUEMAP(FPR, 32);
+ CHECK_VALUEMAP(FPR, 64);
+ CHECK_VALUEMAP(FPR, 128);
+ CHECK_VALUEMAP(FPR, 256);
+ CHECK_VALUEMAP(FPR, 512);
// Check the value mapping for 3-operands instructions where all the operands
// map to the same value mapping.
@@ -137,13 +137,13 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
CHECK_VALUEMAP_IMPL(RBName, Size, 2); \
} while (false)
- CHECK_VALUEMAP_3OPS(GPR, 32);
- CHECK_VALUEMAP_3OPS(GPR, 64);
- CHECK_VALUEMAP_3OPS(FPR, 32);
- CHECK_VALUEMAP_3OPS(FPR, 64);
- CHECK_VALUEMAP_3OPS(FPR, 128);
- CHECK_VALUEMAP_3OPS(FPR, 256);
- CHECK_VALUEMAP_3OPS(FPR, 512);
+ CHECK_VALUEMAP_3OPS(GPR, 32);
+ CHECK_VALUEMAP_3OPS(GPR, 64);
+ CHECK_VALUEMAP_3OPS(FPR, 32);
+ CHECK_VALUEMAP_3OPS(FPR, 64);
+ CHECK_VALUEMAP_3OPS(FPR, 128);
+ CHECK_VALUEMAP_3OPS(FPR, 256);
+ CHECK_VALUEMAP_3OPS(FPR, 512);
#define CHECK_VALUEMAP_CROSSREGCPY(RBNameDst, RBNameSrc, Size) \
do { \
@@ -165,14 +165,14 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
\
} while (false)
- CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64);
- CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
- CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, GPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(GPR, FPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 32);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, FPR, 64);
+ CHECK_VALUEMAP_CROSSREGCPY(FPR, GPR, 64);
#define CHECK_VALUEMAP_FPEXT(DstSize, SrcSize) \
do { \
@@ -193,12 +193,15 @@ AArch64RegisterBankInfo::AArch64RegisterBankInfo(const TargetRegisterInfo &TRI)
\
} while (false)
- CHECK_VALUEMAP_FPEXT(32, 16);
- CHECK_VALUEMAP_FPEXT(64, 16);
- CHECK_VALUEMAP_FPEXT(64, 32);
- CHECK_VALUEMAP_FPEXT(128, 64);
+ CHECK_VALUEMAP_FPEXT(32, 16);
+ CHECK_VALUEMAP_FPEXT(64, 16);
+ CHECK_VALUEMAP_FPEXT(64, 32);
+ CHECK_VALUEMAP_FPEXT(128, 64);
- assert(verify(TRI) && "Invalid register bank information");
+ assert(verify(TRI) && "Invalid register bank information");
+ };
+
+ llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
unsigned AArch64RegisterBankInfo::copyCost(const RegisterBank &A,
@@ -228,8 +231,11 @@ AArch64RegisterBankInfo::getRegBankFromRegClass(const TargetRegisterClass &RC,
switch (RC.getID()) {
case AArch64::FPR8RegClassID:
case AArch64::FPR16RegClassID:
+ case AArch64::FPR16_loRegClassID:
+ case AArch64::FPR32_with_hsub_in_FPR16_loRegClassID:
case AArch64::FPR32RegClassID:
case AArch64::FPR64RegClassID:
+ case AArch64::FPR64_loRegClassID:
case AArch64::FPR128RegClassID:
case AArch64::FPR128_loRegClassID:
case AArch64::DDRegClassID:
@@ -495,6 +501,7 @@ bool AArch64RegisterBankInfo::onlyDefinesFP(
const MachineInstr &MI, const MachineRegisterInfo &MRI,
const TargetRegisterInfo &TRI) const {
switch (MI.getOpcode()) {
+ case AArch64::G_DUP:
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
@@ -636,6 +643,16 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// Some of the floating-point instructions have mixed GPR and FPR operands:
// fine-tune the computed mapping.
switch (Opc) {
+ case AArch64::G_DUP: {
+ Register ScalarReg = MI.getOperand(1).getReg();
+ auto ScalarDef = MRI.getVRegDef(ScalarReg);
+ if (getRegBank(ScalarReg, MRI, TRI) == &AArch64::FPRRegBank ||
+ onlyDefinesFP(*ScalarDef, MRI, TRI))
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstFPR};
+ else
+ OpRegBankIdx = {PMI_FirstFPR, PMI_FirstGPR};
+ break;
+ }
case TargetOpcode::G_TRUNC: {
LLT SrcTy = MRI.getType(MI.getOperand(1).getReg());
if (!SrcTy.isVector() && SrcTy.getSizeInBits() == 128)
@@ -680,7 +697,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// In that case, we want the default mapping to be on FPR
// instead of blind map every scalar to GPR.
for (const MachineInstr &UseMI :
- MRI.use_instructions(MI.getOperand(0).getReg())) {
+ MRI.use_nodbg_instructions(MI.getOperand(0).getReg())) {
// If we have at least one direct use in a FP instruction,
// assume this was a floating point load in the IR.
// If it was not, we would have had a bitcast before
@@ -727,9 +744,8 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
//
// %z = G_SELECT %cond %x %y
// fpr = G_FOO %z ...
- if (any_of(
- MRI.use_instructions(MI.getOperand(0).getReg()),
- [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); }))
+ if (any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
+ [&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); }))
++NumFP;
// Check if the defs of the source values always produce floating point
@@ -770,7 +786,7 @@ AArch64RegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// UNMERGE into scalars from a vector should always use FPR.
// Likewise if any of the uses are FP instructions.
if (SrcTy.isVector() || SrcTy == LLT::scalar(128) ||
- any_of(MRI.use_instructions(MI.getOperand(0).getReg()),
+ any_of(MRI.use_nodbg_instructions(MI.getOperand(0).getReg()),
[&](MachineInstr &MI) { return onlyUsesFP(MI, MRI, TRI); })) {
// Set the register bank of every operand to FPR.
for (unsigned Idx = 0, NumOperands = MI.getNumOperands();
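[editor note] The constructor change above swaps a hand-rolled AlreadyInit flag for llvm::call_once, so the assert-only verification runs exactly once even if several RegisterBankInfo objects are constructed concurrently. A minimal sketch of the same pattern using std::call_once (which the llvm::call_once helper mirrors) and a hypothetical table type:

    #include <cstdio>
    #include <mutex>

    struct BankTable {
      int NumBanks = 3; // hypothetical stand-in for the tablegen'ed bank data
    };

    void initBanksOnce(const BankTable &Table) {
      static std::once_flag InitFlag;
      // The callable runs at most once across all callers; concurrent callers
      // block until the first invocation finishes, then simply return.
      std::call_once(InitFlag, [&Table] {
        std::printf("verifying %d register banks once\n", Table.NumBanks);
      });
    }

As with the [&] capture in the diff, only the arguments of the first caller are ever seen by the callable; later callers just wait and return.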
diff --git a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
index e956fca1aa10..e956fca1aa10 100644
--- a/llvm/lib/Target/AArch64/AArch64RegisterBankInfo.h
+++ b/llvm/lib/Target/AArch64/GISel/AArch64RegisterBankInfo.h
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
index 05a909f1780a..9814f7625853 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AddressingModes.h
@@ -763,10 +763,10 @@ static inline bool isSVECpyImm(int64_t Imm) {
bool IsImm8 = int8_t(Imm) == Imm;
bool IsImm16 = int16_t(Imm & ~0xff) == Imm;
- if (std::is_same<int8_t, typename std::make_signed<T>::type>::value)
+ if (std::is_same<int8_t, std::make_signed_t<T>>::value)
return IsImm8 || uint8_t(Imm) == Imm;
- if (std::is_same<int16_t, typename std::make_signed<T>::type>::value)
+ if (std::is_same<int16_t, std::make_signed_t<T>>::value)
return IsImm8 || IsImm16 || uint16_t(Imm & ~0xff) == Imm;
return IsImm8 || IsImm16;
@@ -775,8 +775,7 @@ static inline bool isSVECpyImm(int64_t Imm) {
/// Returns true if Imm is valid for ADD/SUB.
template <typename T>
static inline bool isSVEAddSubImm(int64_t Imm) {
- bool IsInt8t =
- std::is_same<int8_t, typename std::make_signed<T>::type>::value;
+ bool IsInt8t = std::is_same<int8_t, std::make_signed_t<T>>::value;
return uint8_t(Imm) == Imm || (!IsInt8t && uint16_t(Imm & ~0xff) == Imm);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
index 9db746733aa3..9f7dfdf62482 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64AsmBackend.cpp
@@ -24,6 +24,7 @@
#include "llvm/MC/MCSectionMachO.h"
#include "llvm/MC/MCTargetOptions.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -33,6 +34,7 @@ namespace {
class AArch64AsmBackend : public MCAsmBackend {
static const unsigned PCRelFlagVal =
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits | MCFixupKindInfo::FKF_IsPCRel;
+protected:
Triple TheTriple;
public:
@@ -68,6 +70,11 @@ public:
{"fixup_aarch64_pcrel_call26", 0, 26, PCRelFlagVal},
{"fixup_aarch64_tlsdesc_call", 0, 0, 0}};
+ // Fixup kinds from .reloc directive are like R_AARCH64_NONE. They do not
+ // require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -86,8 +93,8 @@ public:
bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
void HandleAssemblerFlag(MCAssemblerFlag Flag) {}
@@ -108,7 +115,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
case AArch64::fixup_aarch64_tlsdesc_call:
return 0;
@@ -237,11 +243,22 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
if (AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_ABS &&
AArch64MCExpr::getSymbolLoc(RefKind) != AArch64MCExpr::VK_SABS) {
- // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't
- // ever be resolved in the assembler.
- Ctx.reportError(Fixup.getLoc(),
- "relocation for a thread-local variable points to an "
- "absolute symbol");
+ if (!RefKind) {
+      // The fixup is a plain expression with no AArch64-specific modifier.
+ if (SignedValue > 0xFFFF || SignedValue < -0xFFFF)
+ Ctx.reportError(Fixup.getLoc(),
+ "fixup value out of range [-0xFFFF, 0xFFFF]");
+
+ // Invert the negative immediate because it will feed into a MOVN.
+ if (SignedValue < 0)
+ SignedValue = ~SignedValue;
+ Value = static_cast<uint64_t>(SignedValue);
+ } else
+ // VK_GOTTPREL, VK_TPREL, VK_DTPREL are movw fixups, but they can't
+ // ever be resolved in the assembler.
+ Ctx.reportError(Fixup.getLoc(),
+ "relocation for a thread-local variable points to an "
+ "absolute symbol");
return Value;
}
@@ -329,7 +346,6 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
if (!valueFitsIntoFixupKind(Fixup.getTargetKind(), Value))
Ctx.reportError(Fixup.getLoc(), "fixup value too large for data type!");
LLVM_FALLTHROUGH;
- case FK_NONE:
case FK_SecRel_2:
case FK_SecRel_4:
return Value;
@@ -337,9 +353,17 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, const MCValue &Target,
}
Optional<MCFixupKind> AArch64AsmBackend::getFixupKind(StringRef Name) const {
- if (TheTriple.isOSBinFormatELF() && Name == "R_AARCH64_NONE")
- return FK_NONE;
- return MCAsmBackend::getFixupKind(Name);
+ if (!TheTriple.isOSBinFormatELF())
+ return None;
+
+ unsigned Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/AArch64.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
/// getFixupKindContainereSizeInBytes - The number of bytes of the
@@ -386,9 +410,12 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo *STI) const {
- unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
if (!Value)
return; // Doesn't change encoding.
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned NumBytes = getFixupKindNumBytes(Kind);
MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
MCContext &Ctx = Asm.getContext();
int64_t SignedValue = static_cast<int64_t>(Value);
@@ -424,8 +451,9 @@ void AArch64AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// FIXME: getFixupKindInfo() and getFixupKindNumBytes() could be fixed to
// handle this more cleanly. This may affect the output of -show-mc-encoding.
AArch64MCExpr::VariantKind RefKind =
- static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
- if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS) {
+ static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
+ if (AArch64MCExpr::getSymbolLoc(RefKind) == AArch64MCExpr::VK_SABS ||
+ (!RefKind && Fixup.getTargetKind() == AArch64::fixup_aarch64_movw)) {
// If the immediate is negative, generate MOVN else MOVZ.
// (Bit 30 = 0) ==> MOVN, (Bit 30 = 1) ==> MOVZ.
if (SignedValue < 0)
@@ -451,9 +479,8 @@ bool AArch64AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
return int64_t(Value) != int64_t(int8_t(Value));
}
-void AArch64AsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void AArch64AsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
llvm_unreachable("AArch64AsmBackend::relaxInstruction() unimplemented");
}
@@ -474,7 +501,7 @@ bool AArch64AsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target) {
unsigned Kind = Fixup.getKind();
- if (Kind == FK_NONE)
+ if (Kind >= FirstLiteralRelocationKind)
return true;
// The ADRP instruction adds some multiple of 0x1000 to the current PC &
@@ -544,7 +571,6 @@ enum CompactUnwindEncodings {
// FIXME: This should be in a separate file.
class DarwinAArch64AsmBackend : public AArch64AsmBackend {
const MCRegisterInfo &MRI;
- bool IsILP32;
/// Encode compact unwind stack adjustment for frameless functions.
/// See UNWIND_ARM64_FRAMELESS_STACK_SIZE_MASK in compact_unwind_encoding.h.
@@ -555,18 +581,15 @@ class DarwinAArch64AsmBackend : public AArch64AsmBackend {
public:
DarwinAArch64AsmBackend(const Target &T, const Triple &TT,
- const MCRegisterInfo &MRI, bool IsILP32)
- : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI),
- IsILP32(IsILP32) {}
+ const MCRegisterInfo &MRI)
+ : AArch64AsmBackend(T, TT, /*IsLittleEndian*/ true), MRI(MRI) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- if (IsILP32)
- return createAArch64MachObjectWriter(
- MachO::CPU_TYPE_ARM64_32, MachO::CPU_SUBTYPE_ARM64_32_V8, true);
- else
- return createAArch64MachObjectWriter(MachO::CPU_TYPE_ARM64,
- MachO::CPU_SUBTYPE_ARM64_ALL, false);
+ uint32_t CPUType = cantFail(MachO::getCPUType(TheTriple));
+ uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TheTriple));
+ return createAArch64MachObjectWriter(CPUType, CPUSubType,
+ TheTriple.isArch32Bit());
}
/// Generate the compact unwind encoding from the CFI directives.
@@ -749,8 +772,7 @@ MCAsmBackend *llvm::createAArch64leAsmBackend(const Target &T,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO()) {
- const bool IsILP32 = TheTriple.isArch32Bit();
- return new DarwinAArch64AsmBackend(T, TheTriple, MRI, IsILP32);
+ return new DarwinAArch64AsmBackend(T, TheTriple, MRI);
}
if (TheTriple.isOSBinFormatCOFF())
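[editor note] The .reloc support added above works by folding the raw ELF relocation type into the fixup-kind space: getFixupKind returns FirstLiteralRelocationKind + Type, applyFixup and shouldForceRelocation treat such kinds as opaque, and the ELF object writer (next file) subtracts the base back out. A small sketch of that round trip using made-up constants rather than the real MC enums:

    // Hypothetical stand-ins for MC's fixup-kind space and an ELF reloc type.
    constexpr unsigned FirstLiteralRelocKind = 0x1000;
    constexpr unsigned R_EXAMPLE_PREL32 = 261;

    // Encoding: a .reloc directive's type is carried inside the fixup kind.
    constexpr unsigned encodeLiteralReloc(unsigned ElfType) {
      return FirstLiteralRelocKind + ElfType;
    }

    // Decoding: the object writer recovers the ELF type unchanged.
    constexpr unsigned decodeLiteralReloc(unsigned FixupKind) {
      return FixupKind - FirstLiteralRelocKind;
    }

    static_assert(decodeLiteralReloc(encodeLiteralReloc(R_EXAMPLE_PREL32)) ==
                      R_EXAMPLE_PREL32,
                  "literal relocation types round-trip through the fixup kind");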
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
index 0fd1ca187be7..e5637dcab941 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFObjectWriter.cpp
@@ -106,13 +106,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
+ unsigned Kind = Fixup.getTargetKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
AArch64MCExpr::VariantKind RefKind =
static_cast<AArch64MCExpr::VariantKind>(Target.getRefKind());
AArch64MCExpr::VariantKind SymLoc = AArch64MCExpr::getSymbolLoc(RefKind);
bool IsNC = AArch64MCExpr::isNotChecked(RefKind);
assert((!Target.getSymA() ||
- Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None) &&
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_None ||
+ Target.getSymA()->getKind() == MCSymbolRefExpr::VK_PLT) &&
"Should only be expression-level modifiers here");
assert((!Target.getSymB() ||
@@ -120,14 +124,17 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
"Should only be expression-level modifiers here");
if (IsPCRel) {
- switch (Fixup.getTargetKind()) {
+ switch (Kind) {
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
case FK_Data_2:
return R_CLS(PREL16);
- case FK_Data_4:
- return R_CLS(PREL32);
+ case FK_Data_4: {
+ return Target.getAccessVariant() == MCSymbolRefExpr::VK_PLT
+ ? R_CLS(PLT32)
+ : R_CLS(PREL32);
+ }
case FK_Data_8:
if (IsILP32) {
Ctx.reportError(Fixup.getLoc(),
@@ -185,8 +192,6 @@ unsigned AArch64ELFObjectWriter::getRelocType(MCContext &Ctx,
if (IsILP32 && isNonILP32reloc(Fixup, RefKind, Ctx))
return ELF::R_AARCH64_NONE;
switch (Fixup.getTargetKind()) {
- case FK_NONE:
- return ELF::R_AARCH64_NONE;
case FK_Data_1:
Ctx.reportError(Fixup.getLoc(), "1-byte data relocations not supported");
return ELF::R_AARCH64_NONE;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
index c33f7e957b54..fe4c34be1519 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64ELFStreamer.cpp
@@ -81,14 +81,14 @@ public:
std::move(Emitter)),
MappingSymbolCounter(0), LastEMS(EMS_None) {}
- void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+ void changeSection(MCSection *Section, const MCExpr *Subsection) override {
// We have to keep track of the mapping symbol state of any sections we
// use. Each one should start off as EMS_None, which is provided as the
// default constructor by DenseMap::lookup.
LastMappingSymbols[getPreviousSection().first] = LastEMS;
LastEMS = LastMappingSymbols.lookup(Section);
- MCELFStreamer::ChangeSection(Section, Subsection);
+ MCELFStreamer::changeSection(Section, Subsection);
}
// Reset state between object emissions
@@ -102,10 +102,10 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
+ void emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) override {
EmitA64MappingSymbol();
- MCELFStreamer::EmitInstruction(Inst, STI);
+ MCELFStreamer::emitInstruction(Inst, STI);
}
/// Emit a 32-bit value as an instruction. This is only used for the .inst
@@ -122,28 +122,28 @@ public:
}
EmitA64MappingSymbol();
- MCELFStreamer::EmitBytes(StringRef(Buffer, 4));
+ MCELFStreamer::emitBytes(StringRef(Buffer, 4));
}
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- void EmitBytes(StringRef Data) override {
- EmitDataMappingSymbol();
- MCELFStreamer::EmitBytes(Data);
+ void emitBytes(StringRef Data) override {
+ emitDataMappingSymbol();
+ MCELFStreamer::emitBytes(Data);
}
/// This is one of the functions used to emit data into an ELF section, so the
/// AArch64 streamer overrides it to add the appropriate mapping symbol ($d)
/// if necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
- EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ emitDataMappingSymbol();
+ MCELFStreamer::emitValueImpl(Value, Size, Loc);
}
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc) override {
- EmitDataMappingSymbol();
+ emitDataMappingSymbol();
MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
}
private:
@@ -153,7 +153,7 @@ private:
EMS_Data
};
- void EmitDataMappingSymbol() {
+ void emitDataMappingSymbol() {
if (LastEMS == EMS_Data)
return;
EmitMappingSymbol("$d");
@@ -170,7 +170,7 @@ private:
void EmitMappingSymbol(StringRef Name) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabel(Symbol);
+ emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
Symbol->setExternal(false);
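[editor note] The streamer renames above sit on top of AArch64's ELF mapping-symbol bookkeeping: a code mapping symbol (conventionally "$x" on AArch64) is emitted before A64 instructions and "$d" before data, but only when the kind of output changes since the last emission (the LastEMS state). A minimal sketch of that state check with a hypothetical tracker class:

    #include <iostream>
    #include <string>

    enum class MappingState { None, A64, Data };

    class MappingSymbolTracker {
      MappingState Last = MappingState::None;
      int Counter = 0;

      void emitSymbol(const std::string &Name) {
        // Real streamers create a local STT_NOTYPE symbol; here we just print.
        std::cout << Name << "." << Counter++ << "\n";
      }

    public:
      // Call before emitting an instruction.
      void beforeInstruction() {
        if (Last == MappingState::A64)
          return; // still in a code run, no new symbol needed
        emitSymbol("$x");
        Last = MappingState::A64;
      }

      // Call before emitting raw bytes or a fixed-up value.
      void beforeData() {
        if (Last == MappingState::Data)
          return;
        emitSymbol("$d");
        Last = MappingState::Data;
      }
    };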
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
index 469892213ef8..38474d31460d 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.cpp
@@ -283,7 +283,8 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
if (Opcode == AArch64::SPACE) {
- O << '\t' << MAI.getCommentString() << " SPACE";
+ O << '\t' << MAI.getCommentString() << " SPACE "
+ << MI->getOperand(1).getImm();
printAnnotation(O, Annot);
return;
}
@@ -295,7 +296,7 @@ void AArch64InstPrinter::printInst(const MCInst *MI, uint64_t Address,
return;
}
- if (!printAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
@@ -900,6 +901,19 @@ void AArch64InstPrinter::printImmHex(const MCInst *MI, unsigned OpNo,
O << format("#%#llx", Op.getImm());
}
+template<int Size>
+void AArch64InstPrinter::printSImm(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &Op = MI->getOperand(OpNo);
+ if (Size == 8)
+ O << "#" << formatImm((signed char)Op.getImm());
+ else if (Size == 16)
+ O << "#" << formatImm((signed short)Op.getImm());
+ else
+ O << "#" << formatImm(Op.getImm());
+}
+
void AArch64InstPrinter::printPostIncOperand(const MCInst *MI, unsigned OpNo,
unsigned Imm, raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
@@ -1334,7 +1348,8 @@ void AArch64InstPrinter::printVectorIndex(const MCInst *MI, unsigned OpNum,
O << "[" << MI->getOperand(OpNum).getImm() << "]";
}
-void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
+void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, uint64_t Address,
+ unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNum);
@@ -1342,17 +1357,20 @@ void AArch64InstPrinter::printAlignedLabel(const MCInst *MI, unsigned OpNum,
// If the label has already been resolved to an immediate offset (say, when
// we're running the disassembler), just print the immediate.
if (Op.isImm()) {
- O << "#" << formatImm(Op.getImm() * 4);
+ int64_t Offset = Op.getImm() * 4;
+ if (PrintBranchImmAsAddress)
+ O << formatHex(Address + Offset);
+ else
+ O << "#" << formatImm(Offset);
return;
}
// If the branch target is simply an address then print it in hex.
const MCConstantExpr *BranchTarget =
dyn_cast<MCConstantExpr>(MI->getOperand(OpNum).getExpr());
- int64_t Address;
- if (BranchTarget && BranchTarget->evaluateAsAbsolute(Address)) {
- O << "0x";
- O.write_hex(Address);
+ int64_t TargetAddress;
+ if (BranchTarget && BranchTarget->evaluateAsAbsolute(TargetAddress)) {
+ O << formatHex(TargetAddress);
} else {
// Otherwise, just print the expression.
MI->getOperand(OpNum).getExpr()->print(O, &MAI);
@@ -1411,6 +1429,12 @@ void AArch64InstPrinter::printMRSSystemRegister(const MCInst *MI, unsigned OpNo,
return;
}
+ // Horrible hack for two different registers having the same encoding.
+ if (Val == AArch64SysReg::TRCEXTINSELR) {
+ O << "TRCEXTINSELR";
+ return;
+ }
+
const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
if (Reg && Reg->Readable && Reg->haveFeatures(STI.getFeatureBits()))
O << Reg->Name;
@@ -1431,6 +1455,12 @@ void AArch64InstPrinter::printMSRSystemRegister(const MCInst *MI, unsigned OpNo,
return;
}
+ // Horrible hack for two different registers having the same encoding.
+ if (Val == AArch64SysReg::TRCEXTINSELR) {
+ O << "TRCEXTINSELR";
+ return;
+ }
+
const AArch64SysReg::SysReg *Reg = AArch64SysReg::lookupSysRegByEncoding(Val);
if (Reg && Reg->Writeable && Reg->haveFeatures(STI.getFeatureBits()))
O << Reg->Name;
@@ -1499,7 +1529,7 @@ void AArch64InstPrinter::printSVERegOp(const MCInst *MI, unsigned OpNum,
template <typename T>
void AArch64InstPrinter::printImmSVE(T Value, raw_ostream &O) {
- typename std::make_unsigned<T>::type HexValue = Value;
+ std::make_unsigned_t<T> HexValue = Value;
if (getPrintImmHex())
O << '#' << formatHex((uint64_t)HexValue);
@@ -1544,8 +1574,8 @@ template <typename T>
void AArch64InstPrinter::printSVELogicalImm(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- typedef typename std::make_signed<T>::type SignedT;
- typedef typename std::make_unsigned<T>::type UnsignedT;
+ typedef std::make_signed_t<T> SignedT;
+ typedef std::make_unsigned_t<T> UnsignedT;
uint64_t Val = MI->getOperand(OpNum).getImm();
UnsignedT PrintVal = AArch64_AM::decodeLogicalImmediate(Val, 64);
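[editor note] printSImm above narrows the stored 64-bit operand to the template width before printing, so an 8-bit immediate encoded as 0xF0 prints as -16 rather than 240. A tiny standalone illustration of that cast (not the printer itself):

    #include <cstdint>
    #include <cstdio>

    // Print an immediate as a signed value of the given bit width.
    void printSignedImm(int64_t Imm, int SizeInBits) {
      if (SizeInBits == 8)
        std::printf("#%d\n", static_cast<int8_t>(Imm));  // 0xF0   -> -16
      else if (SizeInBits == 16)
        std::printf("#%d\n", static_cast<int16_t>(Imm)); // 0xFFF0 -> -16
      else
        std::printf("#%lld\n", static_cast<long long>(Imm));
    }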
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
index 993f379b5343..6da5f0e81c80 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64InstPrinter.h
@@ -32,10 +32,10 @@ public:
// Autogenerated by tblgen.
virtual void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
- virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
- virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ virtual bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
@@ -56,6 +56,9 @@ protected:
raw_ostream &O);
void printImmHex(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ template <int Size>
+ void printSImm(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
template <typename T> void printImmSVE(T Value, raw_ostream &O);
void printPostIncOperand(const MCInst *MI, unsigned OpNo, unsigned Imm,
raw_ostream &O);
@@ -97,7 +100,7 @@ protected:
const MCSubtargetInfo &STI, raw_ostream &O);
void printInverseCondCode(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printAlignedLabel(const MCInst *MI, unsigned OpNum,
+ void printAlignedLabel(const MCInst *MI, uint64_t Address, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printUImm12Offset(const MCInst *MI, unsigned OpNum, unsigned Scale,
raw_ostream &O);
@@ -202,10 +205,10 @@ public:
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O) override;
- bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O) override;
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O) override;
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O) override;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
index 5926a4f81616..9a63e26dec19 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCAsmInfo.cpp
@@ -60,7 +60,7 @@ const MCExpr *AArch64MCAsmInfoDarwin::getExprForPersonalitySymbol(
const MCExpr *Res =
MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_GOT, Context);
MCSymbol *PCSym = Context.createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, Context);
return MCBinaryExpr::createSub(Res, PC, Context);
}
@@ -96,8 +96,6 @@ AArch64MCAsmInfoELF::AArch64MCAsmInfoELF(const Triple &T) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
- UseIntegratedAssembler = true;
-
HasIdentDirective = true;
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
index 8f4d9cb94d60..da8f511c650f 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCCodeEmitter.cpp
@@ -569,23 +569,24 @@ unsigned AArch64MCCodeEmitter::fixMOVZ(const MCInst &MI, unsigned EncodedValue,
if (UImm16MO.isImm())
return EncodedValue;
- const AArch64MCExpr *A64E = cast<AArch64MCExpr>(UImm16MO.getExpr());
- switch (A64E->getKind()) {
- case AArch64MCExpr::VK_DTPREL_G2:
- case AArch64MCExpr::VK_DTPREL_G1:
- case AArch64MCExpr::VK_DTPREL_G0:
- case AArch64MCExpr::VK_GOTTPREL_G1:
- case AArch64MCExpr::VK_TPREL_G2:
- case AArch64MCExpr::VK_TPREL_G1:
- case AArch64MCExpr::VK_TPREL_G0:
- return EncodedValue & ~(1u << 30);
- default:
- // Nothing to do for an unsigned fixup.
- return EncodedValue;
+ const MCExpr *E = UImm16MO.getExpr();
+ if (const AArch64MCExpr *A64E = dyn_cast<AArch64MCExpr>(E)) {
+ switch (A64E->getKind()) {
+ case AArch64MCExpr::VK_DTPREL_G2:
+ case AArch64MCExpr::VK_DTPREL_G1:
+ case AArch64MCExpr::VK_DTPREL_G0:
+ case AArch64MCExpr::VK_GOTTPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G2:
+ case AArch64MCExpr::VK_TPREL_G1:
+ case AArch64MCExpr::VK_TPREL_G0:
+ return EncodedValue & ~(1u << 30);
+ default:
+ // Nothing to do for an unsigned fixup.
+ return EncodedValue;
+ }
}
-
- return EncodedValue & ~(1u << 30);
+ return EncodedValue;
}
void AArch64MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
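[editor note] fixMOVZ above now clears bit 30 only when the operand is wrapped in an AArch64MCExpr carrying one of the signed TLS-style modifiers; as the comment in the AsmBackend hunk earlier notes, bit 30 distinguishes MOVZ (1) from MOVN (0), so clearing it retargets the encoding to MOVN. A tiny sketch of that bit manipulation with a hypothetical encoding value:

    #include <cstdint>

    // Bit 30 of the encoding selects between MOVN (0) and MOVZ (1).
    constexpr uint32_t MovzMovnSelectBit = 1u << 30;

    constexpr uint32_t retargetToMOVN(uint32_t EncodedValue) {
      return EncodedValue & ~MovzMovnSelectBit;
    }

    static_assert((retargetToMOVN(0x52800000u) & MovzMovnSelectBit) == 0,
                  "bit 30 cleared, so the instruction now reads as MOVN");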
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
index 7dc3665baabc..209bff3a2311 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MCTargetDesc.cpp
@@ -254,7 +254,7 @@ static MCAsmInfo *createAArch64MCAsmInfo(const MCRegisterInfo &MRI,
// Initial state of the frame pointer is SP.
unsigned Reg = MRI.getDwarfRegNum(AArch64::SP, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0);
MAI->addInitialFrameState(Inst);
return MAI;
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
index fc04d37eb362..b0f414bd27ed 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64MachObjectWriter.cpp
@@ -139,7 +139,7 @@ static bool canUseLocalRelocation(const MCSectionMachO &Section,
return false;
if (RefSec.getSegmentName() == "__DATA" &&
- RefSec.getSectionName() == "__objc_classrefs")
+ RefSec.getName() == "__objc_classrefs")
return false;
// FIXME: ld64 currently handles internal pointer-sized relocations
@@ -407,5 +407,5 @@ std::unique_ptr<MCObjectTargetWriter>
llvm::createAArch64MachObjectWriter(uint32_t CPUType, uint32_t CPUSubtype,
bool IsILP32) {
return std::make_unique<AArch64MachObjectWriter>(CPUType, CPUSubtype,
- IsILP32);
+ IsILP32);
}
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
index f70752f5303f..48ed68f49263 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64TargetStreamer.cpp
@@ -51,7 +51,7 @@ void AArch64TargetStreamer::emitInst(uint32_t Inst) {
Inst >>= 8;
}
- getStreamer().EmitBytes(StringRef(Buffer, 4));
+ getStreamer().emitBytes(StringRef(Buffer, 4));
}
namespace llvm {
diff --git a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
index 37c6fbb03908..03fbab5142a2 100644
--- a/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/AArch64/MCTargetDesc/AArch64WinCOFFStreamer.cpp
@@ -28,7 +28,7 @@ public:
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
- void FinishImpl() override;
+ void finishImpl() override;
};
void AArch64WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
@@ -45,11 +45,11 @@ void AArch64WinCOFFStreamer::EmitWindowsUnwindTables() {
EHStreamer.Emit(*this);
}
-void AArch64WinCOFFStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void AArch64WinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
EmitWindowsUnwindTables();
- MCWinCOFFStreamer::FinishImpl();
+ MCWinCOFFStreamer::finishImpl();
}
} // end anonymous namespace
@@ -68,7 +68,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinUnwindCode(unsigned UnwindCode,
WinEH::FrameInfo *CurFrame = S.EnsureValidWinFrameInfo(SMLoc());
if (!CurFrame)
return;
- MCSymbol *Label = S.EmitCFILabel();
+ MCSymbol *Label = S.emitCFILabel();
auto Inst = WinEH::Instruction(UnwindCode, Label, Reg, Offset);
if (InEpilogCFI)
CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
@@ -158,7 +158,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIPrologEnd() {
if (!CurFrame)
return;
- MCSymbol *Label = S.EmitCFILabel();
+ MCSymbol *Label = S.emitCFILabel();
CurFrame->PrologEnd = Label;
WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
auto it = CurFrame->Instructions.begin();
@@ -172,7 +172,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogStart() {
return;
InEpilogCFI = true;
- CurrentEpilog = S.EmitCFILabel();
+ CurrentEpilog = S.emitCFILabel();
}
void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
@@ -182,7 +182,7 @@ void AArch64TargetWinCOFFStreamer::EmitARM64WinCFIEpilogEnd() {
return;
InEpilogCFI = false;
- MCSymbol *Label = S.EmitCFILabel();
+ MCSymbol *Label = S.emitCFILabel();
WinEH::Instruction Inst = WinEH::Instruction(Win64EH::UOP_End, Label, -1, 0);
CurFrame->EpilogMap[CurrentEpilog].push_back(Inst);
CurrentEpilog = nullptr;
diff --git a/llvm/lib/Target/AArch64/SVEInstrFormats.td b/llvm/lib/Target/AArch64/SVEInstrFormats.td
index a172b8d7e6b0..a005d1e65abe 100644
--- a/llvm/lib/Target/AArch64/SVEInstrFormats.td
+++ b/llvm/lib/Target/AArch64/SVEInstrFormats.td
@@ -10,6 +10,14 @@
//
//===----------------------------------------------------------------------===//
+def SDT_AArch64Setcc : SDTypeProfile<1, 4, [
+ SDTCisVec<0>, SDTCisVec<1>, SDTCisVec<2>, SDTCisVec<3>,
+ SDTCVecEltisVT<0, i1>, SDTCVecEltisVT<1, i1>, SDTCisSameAs<2, 3>,
+ SDTCisVT<4, OtherVT>
+]>;
+
+def AArch64setcc_z : SDNode<"AArch64ISD::SETCC_MERGE_ZERO", SDT_AArch64Setcc>;
+
def SVEPatternOperand : AsmOperandClass {
let Name = "SVEPattern";
let ParserMethod = "tryParseSVEPattern";
@@ -33,7 +41,7 @@ def SVEPrefetchOperand : AsmOperandClass {
let RenderMethod = "addPrefetchOperands";
}
-def sve_prfop : Operand<i32>, ImmLeaf<i32, [{
+def sve_prfop : Operand<i32>, TImmLeaf<i32, [{
return (((uint32_t)Imm) <= 15);
}]> {
let PrintMethod = "printPrefetchOp<true>";
@@ -167,8 +175,8 @@ def SVEAddSubImmOperand32 : SVEShiftedImmOperand<32, "AddSub", "isSVEAddSubImm<i
def SVEAddSubImmOperand64 : SVEShiftedImmOperand<64, "AddSub", "isSVEAddSubImm<int64_t>">;
class imm8_opt_lsl<int ElementWidth, string printType,
- AsmOperandClass OpndClass, code Predicate>
- : Operand<i32>, ImmLeaf<i32, Predicate> {
+ AsmOperandClass OpndClass>
+ : Operand<i32> {
let EncoderMethod = "getImm8OptLsl";
let DecoderMethod = "DecodeImm8OptLsl<" # ElementWidth # ">";
let PrintMethod = "printImm8OptLsl<" # printType # ">";
@@ -176,31 +184,15 @@ class imm8_opt_lsl<int ElementWidth, string printType,
let MIOperandInfo = (ops i32imm, i32imm);
}
-def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8, [{
- return AArch64_AM::isSVECpyImm<int8_t>(Imm);
-}]>;
-def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16, [{
- return AArch64_AM::isSVECpyImm<int16_t>(Imm);
-}]>;
-def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32, [{
- return AArch64_AM::isSVECpyImm<int32_t>(Imm);
-}]>;
-def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64, [{
- return AArch64_AM::isSVECpyImm<int64_t>(Imm);
-}]>;
-
-def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8, [{
- return AArch64_AM::isSVEAddSubImm<int8_t>(Imm);
-}]>;
-def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16, [{
- return AArch64_AM::isSVEAddSubImm<int16_t>(Imm);
-}]>;
-def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32, [{
- return AArch64_AM::isSVEAddSubImm<int32_t>(Imm);
-}]>;
-def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64, [{
- return AArch64_AM::isSVEAddSubImm<int64_t>(Imm);
-}]>;
+def cpy_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "int8_t", SVECpyImmOperand8>;
+def cpy_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "int16_t", SVECpyImmOperand16>;
+def cpy_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "int32_t", SVECpyImmOperand32>;
+def cpy_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "int64_t", SVECpyImmOperand64>;
+
+def addsub_imm8_opt_lsl_i8 : imm8_opt_lsl<8, "uint8_t", SVEAddSubImmOperand8>;
+def addsub_imm8_opt_lsl_i16 : imm8_opt_lsl<16, "uint16_t", SVEAddSubImmOperand16>;
+def addsub_imm8_opt_lsl_i32 : imm8_opt_lsl<32, "uint32_t", SVEAddSubImmOperand32>;
+def addsub_imm8_opt_lsl_i64 : imm8_opt_lsl<64, "uint64_t", SVEAddSubImmOperand64>;
def SVEAddSubImm8Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i8>", []>;
def SVEAddSubImm16Pat : ComplexPattern<i32, 2, "SelectSVEAddSubImm<MVT::i16>", []>;
@@ -212,9 +204,13 @@ def SVELogicalImm16Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i16>",
def SVELogicalImm32Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i32>", []>;
def SVELogicalImm64Pat : ComplexPattern<i64, 1, "SelectSVELogicalImm<MVT::i64>", []>;
+def SVE8BitLslImm : ComplexPattern<i32, 2, "SelectSVE8BitLslImm", [imm]>;
+
def SVEArithUImmPat : ComplexPattern<i32, 1, "SelectSVEArithImm", []>;
def SVEArithSImmPat : ComplexPattern<i32, 1, "SelectSVESignedArithImm", []>;
+def SVEShiftImm64 : ComplexPattern<i32, 1, "SelectSVEShiftImm64<0, 64>", []>;
+
class SVEExactFPImm<string Suffix, string ValA, string ValB> : AsmOperandClass {
let Name = "SVEExactFPImmOperand" # Suffix;
let DiagnosticType = "Invalid" # Name;
@@ -324,6 +320,16 @@ class SVE_1_Op_Imm_Arith_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
(inst $Op1, i32:$imm)>;
+class SVE_1_Op_Imm_Shift_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+ ZPRRegOp zprty, Operand ImmTy, Instruction inst>
+ : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (ImmTy:$imm))))),
+ (inst $Op1, ImmTy:$imm)>;
+
+class SVE_1_Op_Imm_Arith_Pred_Pat<ValueType vt, ValueType pt, SDPatternOperator op,
+ ZPRRegOp zprty, ValueType it, ComplexPattern cpx, Instruction inst>
+ : Pat<(vt (op (pt (AArch64ptrue 31)), (vt zprty:$Op1), (vt (AArch64dup (it (cpx i32:$imm)))))),
+ (inst $Op1, i32:$imm)>;
+
class SVE_1_Op_Imm_Log_Pat<ValueType vt, SDPatternOperator op, ZPRRegOp zprty,
ValueType it, ComplexPattern cpx, Instruction inst>
: Pat<(vt (op (vt zprty:$Op1), (vt (AArch64dup (it (cpx i64:$imm)))))),
@@ -367,8 +373,22 @@ class SVE_4_Op_Imm_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1, vt2:$Op2, vt3:$Op3, (vt4 ImmTy:$Op4))),
(inst $Op1, $Op2, $Op3, ImmTy:$Op4)>;
+def SVEDup0 : ComplexPattern<i64, 0, "SelectDupZero", []>;
def SVEDup0Undef : ComplexPattern<i64, 0, "SelectDupZeroOrUndef", []>;
+let AddedComplexity = 1 in {
+class SVE_3_Op_Pat_SelZero<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, ValueType vt3, Instruction inst>
+: Pat<(vtd (vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), vt3:$Op3))),
+ (inst $Op1, $Op2, $Op3)>;
+
+class SVE_3_Op_Pat_Shift_Imm_SelZero<ValueType vtd, SDPatternOperator op,
+ ValueType vt1, ValueType vt2,
+ Operand vt3, Instruction inst>
+: Pat<(vtd (op vt1:$Op1, (vselect vt1:$Op1, vt2:$Op2, (SVEDup0)), (i32 (vt3:$Op3)))),
+ (inst $Op1, $Op2, vt3:$Op3)>;
+}
+
//
// Common but less generic patterns.
//
@@ -378,6 +398,69 @@ class SVE_1_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
: Pat<(vtd (op vt1:$Op1)),
(inst (IMPLICIT_DEF), (ptrue 31), $Op1)>;
+class SVE_2_Op_AllActive_Pat<ValueType vtd, SDPatternOperator op, ValueType vt1,
+ ValueType vt2, Instruction inst, Instruction ptrue>
+: Pat<(vtd (op vt1:$Op1, vt2:$Op2)),
+ (inst (ptrue 31), $Op1, $Op2)>;
+
+//
+// Pseudo -> Instruction mappings
+//
+def getSVEPseudoMap : InstrMapping {
+ let FilterClass = "SVEPseudo2Instr";
+ let RowFields = ["PseudoName"];
+ let ColFields = ["IsInstr"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+class SVEPseudo2Instr<string name, bit instr> {
+ string PseudoName = name;
+ bit IsInstr = instr;
+}
+
+// Lookup e.g. DIV -> DIVR
+def getSVERevInstr : InstrMapping {
+ let FilterClass = "SVEInstr2Rev";
+ let RowFields = ["InstrName"];
+ let ColFields = ["isReverseInstr"];
+ let KeyCol = ["0"];
+ let ValueCols = [["1"]];
+}
+
+// Lookup e.g. DIVR -> DIV
+def getSVENonRevInstr : InstrMapping {
+ let FilterClass = "SVEInstr2Rev";
+ let RowFields = ["InstrName"];
+ let ColFields = ["isReverseInstr"];
+ let KeyCol = ["1"];
+ let ValueCols = [["0"]];
+}
+
+class SVEInstr2Rev<string name1, string name2, bit name1IsReverseInstr> {
+ string InstrName = !if(name1IsReverseInstr, name1, name2);
+ bit isReverseInstr = name1IsReverseInstr;
+}
+
+//
+// Pseudos for destructive operands
+//
+let hasNoSchedulingInfo = 1 in {
+ class PredTwoOpPseudo<string name, ZPRRegOp zprty,
+ FalseLanesEnum flags = FalseLanesNone>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, zprty:$Zs2), []> {
+ let FalseLanes = flags;
+ }
+
+ class PredTwoOpImmPseudo<string name, ZPRRegOp zprty, Operand immty,
+ FalseLanesEnum flags = FalseLanesNone>
+ : SVEPseudo2Instr<name, 0>,
+ Pseudo<(outs zprty:$Zd), (ins PPR3bAny:$Pg, zprty:$Zs1, immty:$imm), []> {
+ let FalseLanes = flags;
+ }
+}
+
//===----------------------------------------------------------------------===//
// SVE Predicate Misc Group
//===----------------------------------------------------------------------===//
@@ -566,7 +649,7 @@ class sve_int_count_v<bits<2> sz8_64, bits<5> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -680,7 +763,7 @@ class sve_int_countvlv<bits<5> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -941,11 +1024,46 @@ multiclass sve_int_perm_tbl<string asm, SDPatternOperator op> {
def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_perm_tbl<string asm> {
+multiclass sve2_int_perm_tbl<string asm, SDPatternOperator op> {
def _B : sve_int_perm_tbl<0b00, 0b01, asm, ZPR8, ZZ_b>;
def _H : sve_int_perm_tbl<0b01, 0b01, asm, ZPR16, ZZ_h>;
def _S : sve_int_perm_tbl<0b10, 0b01, asm, ZPR32, ZZ_s>;
def _D : sve_int_perm_tbl<0b11, 0b01, asm, ZPR64, ZZ_d>;
+
+ def : Pat<(nxv16i8 (op nxv16i8:$Op1, nxv16i8:$Op2, nxv16i8:$Op3)),
+ (nxv16i8 (!cast<Instruction>(NAME # _B) (REG_SEQUENCE ZPR2, nxv16i8:$Op1, zsub0,
+ nxv16i8:$Op2, zsub1),
+ nxv16i8:$Op3))>;
+
+ def : Pat<(nxv8i16 (op nxv8i16:$Op1, nxv8i16:$Op2, nxv8i16:$Op3)),
+ (nxv8i16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8i16:$Op1, zsub0,
+ nxv8i16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
+
+ def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv4i32:$Op2, nxv4i32:$Op3)),
+ (nxv4i32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4i32:$Op1, zsub0,
+ nxv4i32:$Op2, zsub1),
+ nxv4i32:$Op3))>;
+
+ def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv2i64:$Op2, nxv2i64:$Op3)),
+ (nxv2i64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2i64:$Op1, zsub0,
+ nxv2i64:$Op2, zsub1),
+ nxv2i64:$Op3))>;
+
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8i16:$Op3)),
+ (nxv8f16 (!cast<Instruction>(NAME # _H) (REG_SEQUENCE ZPR2, nxv8f16:$Op1, zsub0,
+ nxv8f16:$Op2, zsub1),
+ nxv8i16:$Op3))>;
+
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4i32:$Op3)),
+ (nxv4f32 (!cast<Instruction>(NAME # _S) (REG_SEQUENCE ZPR2, nxv4f32:$Op1, zsub0,
+ nxv4f32:$Op2, zsub1),
+ nxv4i32:$Op3))>;
+
+ def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2i64:$Op3)),
+ (nxv2f64 (!cast<Instruction>(NAME # _D) (REG_SEQUENCE ZPR2, nxv2f64:$Op1, zsub0,
+ nxv2f64:$Op2, zsub1),
+ nxv2i64:$Op3))>;
}
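// Annotation (not part of the patch): the two-source TBL form takes its table
// as a consecutive pair of Z registers (the ZZ_b/ZZ_h/ZZ_s/ZZ_d operands
// above), so each of the new patterns first bundles $Op1 and $Op2 into a ZPR2
// tuple with REG_SEQUENCE and then passes that tuple plus the index vector
// $Op3 to the instruction.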
class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -967,11 +1085,20 @@ class sve2_int_perm_tbx<bits<2> sz8_64, string asm, ZPRRegOp zprty>
let Constraints = "$Zd = $_Zd";
}
-multiclass sve2_int_perm_tbx<string asm> {
+multiclass sve2_int_perm_tbx<string asm, SDPatternOperator op> {
def _B : sve2_int_perm_tbx<0b00, asm, ZPR8>;
def _H : sve2_int_perm_tbx<0b01, asm, ZPR16>;
def _S : sve2_int_perm_tbx<0b10, asm, ZPR32>;
def _D : sve2_int_perm_tbx<0b11, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve_int_perm_reverse_z<bits<2> sz8_64, string asm, ZPRRegOp zprty>
@@ -1072,7 +1199,7 @@ class sve_int_perm_insrs<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_int_perm_insrs<string asm, SDPatternOperator op> {
@@ -1102,7 +1229,7 @@ class sve_int_perm_insrv<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_int_perm_insrv<string asm, SDPatternOperator op> {
@@ -1135,7 +1262,7 @@ class sve_int_perm_extract_i<string asm>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1244,13 +1371,22 @@ class sve_int_pred_log<bits<4> opc, string asm>
}
-multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_pred_log<bits<4> opc, string asm, SDPatternOperator op,
+ SDPatternOperator op_nopred = null_frag> {
def NAME : sve_int_pred_log<opc, asm>;
def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i1, nxv8i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i1, nxv4i1, !cast<Instruction>(NAME)>;
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i1, nxv2i1, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_AllActive_Pat<nxv16i1, op_nopred, nxv16i1, nxv16i1,
+ !cast<Instruction>(NAME), PTRUE_B>;
+ def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8i1, nxv8i1,
+ !cast<Instruction>(NAME), PTRUE_H>;
+ def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4i1, nxv4i1,
+ !cast<Instruction>(NAME), PTRUE_S>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2i1, nxv2i1,
+ !cast<Instruction>(NAME), PTRUE_D>;
}
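// Annotation (not part of the patch): the optional op_nopred operator added
// above lets a plain, unpredicated IR operation on predicate vectors reuse the
// same predicated instruction by supplying an all-active governing predicate
// via PTRUE. A hypothetical instantiation (opcode bits and operator names
// assumed for illustration only):
//
//   defm AND_PPzPP : sve_int_pred_log<0b0000, "and", int_aarch64_sve_and_z, and>;
//   // (and nxv16i1:$a, nxv16i1:$b)  ==>  AND_PPzPP (PTRUE_B 31), $a, $b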
@@ -1272,7 +1408,7 @@ class sve_int_log_imm<bits<2> opc, string asm>
let Constraints = "$Zdn = $_Zdn";
let DecoderMethod = "DecodeSVELogicalImmInstruction";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1357,7 +1493,8 @@ class sve_int_bin_cons_arit_0<bits<2> sz8_64, bits<3> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm,
+ SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_bin_cons_arit_0<0b00, opc, asm, ZPR8>;
def _H : sve_int_bin_cons_arit_0<0b01, opc, asm, ZPR16>;
def _S : sve_int_bin_cons_arit_0<0b10, opc, asm, ZPR32>;
@@ -1367,6 +1504,12 @@ multiclass sve_int_bin_cons_arit_0<bits<3> opc, string asm, SDPatternOperator op
def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+
+ // Intrinsic version
+ def : SVE_2_Op_Pat<nxv16i8, int_op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, int_op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, int_op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, int_op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -1394,7 +1537,7 @@ class sve_fp_2op_i_p_zds<bits<2> sz, bits<3> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1423,15 +1566,21 @@ class sve_fp_2op_p_zds<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_fp_2op_p_zds<bits<4> opc, string asm,
- SDPatternOperator op> {
- def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>;
- def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>;
- def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>;
+multiclass sve_fp_2op_p_zds<bits<4> opc, string asm, string Ps,
+ SDPatternOperator op, DestructiveInstTypeEnum flags,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _H : sve_fp_2op_p_zds<0b01, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_fp_2op_p_zds<0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_fp_2op_p_zds<0b11, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
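// Annotation (not part of the patch): the new Ps, flags, revname and
// isReverseInstr parameters tie each real instruction to its destructive
// pseudo and to its reversed-operand twin. A hypothetical pair of
// instantiations (opcode bits, intrinsic names and the flag value are
// illustrative, not taken from this patch):
//
//   defm FSUB_ZPmZ  : sve_fp_2op_p_zds<0b0001, "fsub",  "FSUB_ZPmZ",  int_aarch64_sve_fsub,
//                                      DestructiveBinaryCommWithRev, "FSUBR_ZPmZ">;
//   defm FSUBR_ZPmZ : sve_fp_2op_p_zds<0b0011, "fsubr", "FSUBR_ZPmZ", int_aarch64_sve_fsubr,
//                                      DestructiveBinaryCommWithRev, "FSUB_ZPmZ", /*isReverseInstr=*/1>;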
@@ -1449,6 +1598,16 @@ multiclass sve_fp_2op_p_zds_fscale<bits<4> opc, string asm,
def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_2op_p_zds_zeroing_hsd<SDPatternOperator op> {
+ def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_SelZero<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_SelZero<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_SelZero<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
+
class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm, imm32_0_7:$imm3),
asm, "\t$Zdn, $_Zdn, $Zm, $imm3",
@@ -1466,7 +1625,7 @@ class sve_fp_ftmad<bits<2> sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1551,7 +1710,7 @@ class sve_fp_3op_p_zds_a<bits<2> sz, bits<2> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1586,7 +1745,7 @@ class sve_fp_3op_p_zds_b<bits<2> sz, bits<2> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1620,7 +1779,7 @@ class sve_fp_fma_by_indexed_elem<bits<2> sz, bit opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1646,12 +1805,12 @@ multiclass sve_fp_fma_by_indexed_elem<bit opc, string asm,
let Inst{19-16} = Zm;
}
- def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b:$idx))),
- (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b:$idx)>;
- def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
- def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexH32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexH32b_timm:$idx)>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexS32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, nxv2f64:$Op3, (i32 VectorIndexD32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
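// Annotation (not part of the patch): the VectorIndex*32b -> VectorIndex*32b_timm
// substitutions here, repeated throughout the rest of this patch, are needed
// because the immediate arguments of intrinsics reach instruction selection as
// TargetConstant nodes:
//
//   plain operand : matches (i32 imm)  - does not fire for an intrinsic's immediate
//   _timm operand : matches (i32 timm) - matches the TargetConstant produced for it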
@@ -1694,12 +1853,12 @@ multiclass sve_fp_fmul_by_indexed_elem<string asm, SDPatternOperator op> {
let Inst{19-16} = Zm;
}
- def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b:$idx))),
- (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b:$idx)>;
- def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b:$idx)>;
- def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b:$idx)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, (i32 VectorIndexH32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2, VectorIndexH32b_timm:$idx)>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, (i32 VectorIndexS32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(nxv2f64 (op nxv2f64:$Op1, nxv2f64:$Op2, (i32 VectorIndexD32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
@@ -1727,7 +1886,7 @@ class sve_fp_fcmla<bits<2> sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1767,7 +1926,7 @@ class sve_fp_fcmla_by_indexed_elem<bits<2> sz, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -1785,10 +1944,10 @@ multiclass sve_fp_fcmla_by_indexed_elem<string asm, SDPatternOperator op> {
let Inst{19-16} = Zm;
}
- def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b:$idx), (i32 complexrotateop:$imm))),
- (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b:$idx, complexrotateop:$imm)>;
- def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b:$idx), (i32 complexrotateop:$imm))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b:$idx, complexrotateop:$imm)>;
+ def : Pat<(nxv8f16 (op nxv8f16:$Op1, nxv8f16:$Op2, nxv8f16:$Op3, (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # _H) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
+ def : Pat<(nxv4f32 (op nxv4f32:$Op1, nxv4f32:$Op2, nxv4f32:$Op3, (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -1815,7 +1974,7 @@ class sve_fp_fcadd<bits<2> sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1861,22 +2020,22 @@ multiclass sve2_fp_convert_down_narrow<string asm, string op> {
def _StoH : sve2_fp_convert_precision<0b1000, asm, ZPR16, ZPR32>;
def _DtoS : sve2_fp_convert_precision<0b1110, asm, ZPR32, ZPR64>;
- def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv16i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+ def : SVE_3_Op_Pat<nxv8f16, !cast<SDPatternOperator>(op # _f16f32), nxv8f16, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _StoH)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
multiclass sve2_fp_convert_up_long<string asm, string op> {
def _HtoS : sve2_fp_convert_precision<0b1001, asm, ZPR32, ZPR16>;
def _StoD : sve2_fp_convert_precision<0b1111, asm, ZPR64, ZPR32>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv16i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
- def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv16i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f16), nxv4f32, nxv4i1, nxv8f16, !cast<Instruction>(NAME # _HtoS)>;
+ def : SVE_3_Op_Pat<nxv2f64, !cast<SDPatternOperator>(op # _f64f32), nxv2f64, nxv2i1, nxv4f32, !cast<Instruction>(NAME # _StoD)>;
}
multiclass sve2_fp_convert_down_odd_rounding_top<string asm, string op> {
def _DtoS : sve2_fp_convert_precision<0b0010, asm, ZPR32, ZPR64>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
//===----------------------------------------------------------------------===//
@@ -1902,7 +2061,7 @@ class sve2_fp_pairwise_pred<bits<2> sz, bits<3> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -1942,14 +2101,14 @@ class sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
multiclass sve2_fp_mla_long_by_indexed_elem<bits<2> opc, string asm,
SDPatternOperator op> {
def NAME : sve2_fp_mla_long_by_indexed_elem<opc, asm>;
- def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b, !cast<Instruction>(NAME)>;
+ def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8f16, nxv8f16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
@@ -1974,7 +2133,7 @@ class sve2_fp_mla_long<bits<2> opc, string asm>
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -2084,7 +2243,7 @@ class sve_fp_2op_p_zd<bits<7> opc, string asm, RegisterOperand i_zprtype,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = size;
}
@@ -2120,7 +2279,7 @@ multiclass sve2_fp_flogb<string asm, SDPatternOperator op> {
multiclass sve2_fp_convert_down_odd_rounding<string asm, string op> {
def _DtoS : sve_fp_2op_p_zd<0b0001010, asm, ZPR64, ZPR32, ElementSizeD>;
- def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv16i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
+ def : SVE_3_Op_Pat<nxv4f32, !cast<SDPatternOperator>(op # _f32f64), nxv4f32, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _DtoS)>;
}
//===----------------------------------------------------------------------===//
@@ -2176,7 +2335,7 @@ class sve_int_bin_pred_arit_log<bits<2> sz8_64, bits<2> fmt, bits<3> opc,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -2192,11 +2351,20 @@ multiclass sve_int_bin_pred_log<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, SDPatternOperator op> {
- def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>;
- def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>;
- def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_arit_0<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _B : sve_int_bin_pred_arit_log<0b00, 0b00, opc, asm, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+ def _H : sve_int_bin_pred_arit_log<0b01, 0b00, opc, asm, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b00, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b00, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
@@ -2229,9 +2397,16 @@ multiclass sve_int_bin_pred_arit_2<bits<3> opc, string asm, SDPatternOperator op
}
// Special case for divides which are not defined for 8b/16b elements.
-multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, SDPatternOperator op> {
- def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>;
- def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>;
+multiclass sve_int_bin_pred_arit_2_div<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op,
+ DestructiveInstTypeEnum flags,
+ string revname="", bit isReverseInstr=0> {
+ let DestructiveInstType = flags in {
+ def _S : sve_int_bin_pred_arit_log<0b10, 0b10, opc, asm, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_int_bin_pred_arit_log<0b11, 0b10, opc, asm, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
@@ -2262,7 +2437,7 @@ class sve_int_mladdsub_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -2299,7 +2474,7 @@ class sve_int_mlas_vvv_pred<bits<2> sz8_64, bits<1> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -2336,21 +2511,30 @@ class sve2_int_mla<bits<2> sz, bits<5> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_mla<bit S, string asm> {
+multiclass sve2_int_mla<bit S, string asm, SDPatternOperator op> {
def _B : sve2_int_mla<0b00, { 0b1110, S }, asm, ZPR8, ZPR8>;
def _H : sve2_int_mla<0b01, { 0b1110, S }, asm, ZPR16, ZPR16>;
def _S : sve2_int_mla<0b10, { 0b1110, S }, asm, ZPR32, ZPR32>;
def _D : sve2_int_mla<0b11, { 0b1110, S }, asm, ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_mla_long<bits<5> opc, string asm> {
+multiclass sve2_int_mla_long<bits<5> opc, string asm, SDPatternOperator op> {
def _H : sve2_int_mla<0b01, opc, asm, ZPR16, ZPR8>;
def _S : sve2_int_mla<0b10, opc, asm, ZPR32, ZPR16>;
def _D : sve2_int_mla<0b11, opc, asm, ZPR64, ZPR32>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2372,39 +2556,44 @@ class sve2_int_mla_by_indexed_elem<bits<2> sz, bits<6> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm> {
- def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
+multiclass sve2_int_mla_by_indexed_elem<bits<2> opc, bit S, string asm,
+ SDPatternOperator op> {
+ def _H : sve2_int_mla_by_indexed_elem<{0, ?}, { 0b000, opc, S }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
- def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
+ def _S : sve2_int_mla_by_indexed_elem<0b10, { 0b000, opc, S }, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
+ def _D : sve2_int_mla_by_indexed_elem<0b11, { 0b000, opc, S }, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Integer Multiply-Add Long - Indexed Group
//===----------------------------------------------------------------------===//
-multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
+multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_mla_by_indexed_elem<0b10, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
- asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ asm, ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
@@ -2412,13 +2601,16 @@ multiclass sve2_int_mla_long_by_indexed_elem<bits<4> opc, string asm> {
let Inst{11} = iop{0};
}
def _D : sve2_int_mla_by_indexed_elem<0b11, { opc{3}, 0b0, opc{2-1}, ?, opc{0} },
- asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ asm, ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
+
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2442,7 +2634,7 @@ class sve_intx_dot<bit sz, bit U, string asm, ZPRRegOp zprty1,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_intx_dot<bit opc, string asm, SDPatternOperator op> {
@@ -2474,28 +2666,28 @@ class sve_intx_dot_by_indexed_elem<bit sz, bit U, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
}
multiclass sve_intx_dot_by_indexed_elem<bit opc, string asm,
SDPatternOperator op> {
- def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
+ def _S : sve_intx_dot_by_indexed_elem<0b0, opc, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b_timm> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
+ def _D : sve_intx_dot_by_indexed_elem<0b1, opc, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b_timm> {
bits<1> iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
- def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b:$idx))),
- (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b:$idx)>;
- def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b:$idx))),
- (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b:$idx)>;
+ def : Pat<(nxv4i32 (op nxv4i32:$Op1, nxv16i8:$Op2, nxv16i8:$Op3, (i32 VectorIndexS32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _S) $Op1, $Op2, $Op3, VectorIndexS32b_timm:$idx)>;
+ def : Pat<(nxv2i64 (op nxv2i64:$Op1, nxv8i16:$Op2, nxv8i16:$Op3, (i32 VectorIndexD32b_timm:$idx))),
+ (!cast<Instruction>(NAME # _D) $Op1, $Op2, $Op3, VectorIndexD32b_timm:$idx)>;
}
//===----------------------------------------------------------------------===//
@@ -2521,24 +2713,36 @@ class sve2_complex_int_arith<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_cintx_dot<string asm> {
+multiclass sve2_cintx_dot<string asm, SDPatternOperator op> {
def _S : sve2_complex_int_arith<0b10, 0b0001, asm, ZPR32, ZPR8>;
def _D : sve2_complex_int_arith<0b11, 0b0001, asm, ZPR64, ZPR16>;
+
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
+ (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, complexrotateop:$imm)>;
+ def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
+ (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add Group
//===----------------------------------------------------------------------===//
-multiclass sve2_int_cmla<bit opc, string asm> {
+multiclass sve2_int_cmla<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_complex_int_arith<0b00, { 0b001, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_complex_int_arith<0b01, { 0b001, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_complex_int_arith<0b10, { 0b001, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_complex_int_arith<0b11, { 0b001, opc }, asm, ZPR64, ZPR64>;
+
+ def : SVE_4_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, i32, complexrotateop, !cast<Instruction>(NAME # _B)>;
+ def : SVE_4_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, i32, complexrotateop, !cast<Instruction>(NAME # _H)>;
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, i32, complexrotateop, !cast<Instruction>(NAME # _S)>;
+ def : SVE_4_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, i32, complexrotateop, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2563,42 +2767,58 @@ class sve2_complex_int_arith_indexed<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_cintx_dot_by_indexed_elem<string asm> {
- def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS> {
+multiclass sve2_cintx_dot_by_indexed_elem<string asm, SDPatternOperator op> {
+ def _S : sve2_complex_int_arith_indexed<0b10, 0b0100, asm, ZPR32, ZPR8, ZPR3b8, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD> {
+ def _D : sve2_complex_int_arith_indexed<0b11, 0b0100, asm, ZPR64, ZPR16, ZPR4b16, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv16i8 ZPR8:$Op2), (nxv16i8 ZPR8:$Op3),
+ (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR8:$Op2, ZPR8:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
+ def : Pat<(nxv2i64 (op (nxv2i64 ZPR64:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
+ (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_D") ZPR64:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Complex Multiply-Add - Indexed Group
//===----------------------------------------------------------------------===//
-multiclass sve2_cmla_by_indexed_elem<bit opc, string asm> {
- def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS> {
+multiclass sve2_cmla_by_indexed_elem<bit opc, string asm,
+ SDPatternOperator op> {
+ def _H : sve2_complex_int_arith_indexed<0b10, { 0b011, opc }, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexS32b> {
bits<2> iop;
bits<3> Zm;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD> {
+ def _S : sve2_complex_int_arith_indexed<0b11, { 0b011, opc }, asm, ZPR32, ZPR32, ZPR4b32, VectorIndexD32b> {
bit iop;
bits<4> Zm;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : Pat<(nxv8i16 (op (nxv8i16 ZPR16:$Op1), (nxv8i16 ZPR16:$Op2), (nxv8i16 ZPR16:$Op3),
+ (i32 VectorIndexS32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_H") ZPR16:$Op1, ZPR16:$Op2, ZPR16:$Op3, VectorIndexS32b_timm:$idx, complexrotateop:$imm)>;
+
+ def : Pat<(nxv4i32 (op (nxv4i32 ZPR32:$Op1), (nxv4i32 ZPR32:$Op2), (nxv4i32 ZPR32:$Op3),
+ (i32 VectorIndexD32b_timm:$idx), (i32 complexrotateop:$imm))),
+ (!cast<Instruction>(NAME # "_S") ZPR32:$Op1, ZPR32:$Op2, ZPR32:$Op3, VectorIndexD32b_timm:$idx, complexrotateop:$imm)>;
}
//===----------------------------------------------------------------------===//
@@ -2621,11 +2841,22 @@ class sve2_int_mul<bits<2> sz, bits<3> opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
-multiclass sve2_int_mul<bits<3> opc, string asm> {
+multiclass sve2_int_mul<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
def _H : sve2_int_mul<0b01, opc, asm, ZPR16>;
def _S : sve2_int_mul<0b10, opc, asm, ZPR32>;
def _D : sve2_int_mul<0b11, opc, asm, ZPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve2_int_mul_single<bits<3> opc, string asm, SDPatternOperator op> {
+ def _B : sve2_int_mul<0b00, opc, asm, ZPR8>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
}
//===----------------------------------------------------------------------===//
@@ -2648,31 +2879,37 @@ class sve2_int_mul_by_indexed_elem<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm> {
- def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH> {
+multiclass sve2_int_mul_by_indexed_elem<bits<4> opc, string asm,
+ SDPatternOperator op> {
+ def _H : sve2_int_mul_by_indexed_elem<{0, ?}, opc, asm, ZPR16, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{22} = iop{2};
let Inst{20-19} = iop{1-0};
let Inst{18-16} = Zm;
}
- def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS> {
+ def _S : sve2_int_mul_by_indexed_elem<0b10, opc, asm, ZPR32, ZPR32, ZPR3b32, VectorIndexS32b> {
bits<3> Zm;
bits<2> iop;
let Inst{20-19} = iop;
let Inst{18-16} = Zm;
}
- def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD> {
+ def _D : sve2_int_mul_by_indexed_elem<0b11, opc, asm, ZPR64, ZPR64, ZPR4b64, VectorIndexD32b> {
bits<4> Zm;
bit iop;
let Inst{20} = iop;
let Inst{19-16} = Zm;
}
+
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, VectorIndexD32b_timm, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
+multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_mul_by_indexed_elem<0b10, { opc{2-1}, ?, opc{0} }, asm,
- ZPR32, ZPR16, ZPR3b16, VectorIndexH> {
+ ZPR32, ZPR16, ZPR3b16, VectorIndexH32b> {
bits<3> Zm;
bits<3> iop;
let Inst{20-19} = iop{2-1};
@@ -2680,13 +2917,16 @@ multiclass sve2_int_mul_long_by_indexed_elem<bits<3> opc, string asm> {
let Inst{11} = iop{0};
}
def _D : sve2_int_mul_by_indexed_elem<0b11, { opc{2-1}, ?, opc{0} }, asm,
- ZPR64, ZPR32, ZPR4b32, VectorIndexS> {
+ ZPR64, ZPR32, ZPR4b32, VectorIndexS32b> {
bits<4> Zm;
bits<2> iop;
let Inst{20} = iop{1};
let Inst{19-16} = Zm;
let Inst{11} = iop{0};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv8i16, nxv8i16, i32, VectorIndexH32b_timm, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv4i32, nxv4i32, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2702,7 +2942,7 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
bits<5> Zdn;
let Inst{31-24} = 0b01000100;
let Inst{23-22} = sz;
- let Inst{21} = 0b0;
+ let Inst{21-20} = 0b01;
let Inst{20-16} = opc{5-1};
let Inst{15-14} = 0b10;
let Inst{13} = opc{0};
@@ -2711,15 +2951,20 @@ class sve2_int_arith_pred<bits<2> sz, bits<6> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve2_int_arith_pred<bits<6> opc, string asm> {
+multiclass sve2_int_arith_pred<bits<6> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_arith_pred<0b00, opc, asm, ZPR8>;
def _H : sve2_int_arith_pred<0b01, opc, asm, ZPR16>;
def _S : sve2_int_arith_pred<0b10, opc, asm, ZPR32>;
def _D : sve2_int_arith_pred<0b11, opc, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
@@ -2739,14 +2984,18 @@ class sve2_int_sadd_long_accum_pairwise<bits<2> sz, bit U, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty1.ElementSize;
}
-multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm> {
+multiclass sve2_int_sadd_long_accum_pairwise<bit U, string asm, SDPatternOperator op> {
def _H : sve2_int_sadd_long_accum_pairwise<0b01, U, asm, ZPR16, ZPR8>;
def _S : sve2_int_sadd_long_accum_pairwise<0b10, U, asm, ZPR32, ZPR16>;
def _D : sve2_int_sadd_long_accum_pairwise<0b11, U, asm, ZPR64, ZPR32>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
@@ -2770,19 +3019,26 @@ class sve2_int_un_pred_arit<bits<2> sz, bit Q, bits<2> opc,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm> {
+multiclass sve2_int_un_pred_arit_s<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
}
-multiclass sve2_int_un_pred_arit<bits<3> opc, string asm> {
+multiclass sve2_int_un_pred_arit<bits<3> opc, string asm, SDPatternOperator op> {
def _B : sve2_int_un_pred_arit<0b00, opc{2}, opc{1-0}, asm, ZPR8>;
def _H : sve2_int_un_pred_arit<0b01, opc{2}, opc{1-0}, asm, ZPR16>;
def _S : sve2_int_un_pred_arit<0b10, opc{2}, opc{1-0}, asm, ZPR32>;
def _D : sve2_int_un_pred_arit<0b11, opc{2}, opc{1-0}, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2806,21 +3062,47 @@ class sve2_wide_int_arith<bits<2> sz, bits<5> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_wide_int_arith_long<bits<5> opc, string asm> {
+multiclass sve2_wide_int_arith_long<bits<5> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, opc, asm, ZPR16, ZPR8, ZPR8>;
def _S : sve2_wide_int_arith<0b10, opc, asm, ZPR32, ZPR16, ZPR16>;
def _D : sve2_wide_int_arith<0b11, opc, asm, ZPR64, ZPR32, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm> {
+multiclass sve2_wide_int_arith_wide<bits<3> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, { 0b10, opc }, asm, ZPR16, ZPR16, ZPR8>;
def _S : sve2_wide_int_arith<0b10, { 0b10, opc }, asm, ZPR32, ZPR32, ZPR16>;
def _D : sve2_wide_int_arith<0b11, { 0b10, opc }, asm, ZPR64, ZPR64, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve2_wide_int_arith_pmul<bits<2> sz, bits<5> opc, string asm,
+ SDPatternOperator op> {
+ def NAME : sve2_wide_int_arith<sz, opc, asm, ZPR128, ZPR64, ZPR64>;
+
+  // To avoid using 128-bit elements in the IR, the pattern below uses the
+  // LLVM intrinsics carrying the _pair suffix, reflecting that the _Q form
+  // is implemented as a pair of _D results.
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
-multiclass sve2_pmul_long<bits<1> opc, string asm> {
+multiclass sve2_pmul_long<bits<1> opc, string asm, SDPatternOperator op> {
def _H : sve2_wide_int_arith<0b01, {0b1101, opc}, asm, ZPR16, ZPR8, ZPR8>;
def _D : sve2_wide_int_arith<0b11, {0b1101, opc}, asm, ZPR64, ZPR32, ZPR32>;
+
+  // To avoid using 128-bit elements in the IR, the patterns below use the
+  // LLVM intrinsics carrying the _pair suffix, reflecting that the _H form is
+  // implemented as a pair of _B and the _D form as a pair of _S results.
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
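// Annotation (not part of the patch): a hypothetical instantiation of one of
// the polynomial-multiply forms, only to illustrate the _pair convention
// described in the comment above (defm and intrinsic names assumed):
//
//   defm PMULLB_ZZZ : sve2_pmul_long<0b0, "pmullb", int_aarch64_sve_pmullb_pair>;
//
// For the _H form the instruction writes 16-bit products, but the _pair
// intrinsic keeps <vscale x 16 x i8> operands and result, matching the
// SVE_2_Op_Pat<nxv16i8, ...> pattern above.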
//===----------------------------------------------------------------------===//
@@ -2844,17 +3126,27 @@ class sve2_misc<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_misc_bitwise<bits<4> opc, string asm> {
+multiclass sve2_misc_bitwise<bits<4> opc, string asm, SDPatternOperator op> {
def _B : sve2_misc<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_misc<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_misc<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_misc<0b11, opc, asm, ZPR64, ZPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm> {
+multiclass sve2_misc_int_addsub_long_interleaved<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_misc<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_misc<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_misc<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
@@ -2874,15 +3166,21 @@ class sve2_bitwise_xor_interleaved<bits<2> sz, bits<1> opc, string asm,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_bitwise_xor_interleaved<bit opc, string asm> {
+multiclass sve2_bitwise_xor_interleaved<bit opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_bitwise_xor_interleaved<0b00, opc, asm, ZPR8, ZPR8>;
def _H : sve2_bitwise_xor_interleaved<0b01, opc, asm, ZPR16, ZPR16>;
def _S : sve2_bitwise_xor_interleaved<0b10, opc, asm, ZPR32, ZPR32>;
def _D : sve2_bitwise_xor_interleaved<0b11, opc, asm, ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
@@ -2905,7 +3203,8 @@ class sve2_bitwise_shift_left_long<bits<3> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
+multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_bitwise_shift_left_long<{0,0,1}, opc, asm,
ZPR16, ZPR8, vecshiftL8>;
def _S : sve2_bitwise_shift_left_long<{0,1,?}, opc, asm,
@@ -2916,6 +3215,9 @@ multiclass sve2_bitwise_shift_left_long<bits<2> opc, string asm> {
ZPR64, ZPR32, vecshiftL32> {
let Inst{20-19} = imm{4-3};
}
+ def : SVE_2_Op_Imm_Pat<nxv8i16, op, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Imm_Pat<nxv4i32, op, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Imm_Pat<nxv2i64, op, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -2943,7 +3245,8 @@ class sve2_int_bin_shift_imm<bits<4> tsz8_64, bit opc, string asm,
let Constraints = "$Zd = $_Zd";
}
-multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
+multiclass sve2_int_bin_shift_imm_left<bit opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
@@ -2955,9 +3258,15 @@ multiclass sve2_int_bin_shift_imm_left<bit opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
+multiclass sve2_int_bin_shift_imm_right<bit opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_int_bin_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -2969,6 +3278,11 @@ multiclass sve2_int_bin_shift_imm_right<bit opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
@@ -2990,11 +3304,12 @@ class sve2_int_bin_accum_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
+multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _B : sve2_int_bin_accum_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_bin_accum_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -3006,6 +3321,11 @@ multiclass sve2_int_bin_accum_shift_imm_right<bits<2> opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
@@ -3024,15 +3344,20 @@ class sve2_int_cadd<bits<2> sz, bit opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_cadd<bit opc, string asm> {
+multiclass sve2_int_cadd<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_cadd<0b00, opc, asm, ZPR8>;
def _H : sve2_int_cadd<0b01, opc, asm, ZPR16>;
def _S : sve2_int_cadd<0b10, opc, asm, ZPR32>;
def _D : sve2_int_cadd<0b11, opc, asm, ZPR64>;
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, complexrotateopodd, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, complexrotateopodd, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, complexrotateopodd, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, complexrotateopodd, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
@@ -3052,28 +3377,41 @@ class sve2_int_absdiff_accum<bits<2> sz, bits<4> opc, string asm,
let Inst{4-0} = Zda;
let Constraints = "$Zda = $_Zda";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_absdiff_accum<bit opc, string asm> {
+multiclass sve2_int_absdiff_accum<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_int_absdiff_accum<0b00, { 0b111, opc }, asm, ZPR8, ZPR8>;
def _H : sve2_int_absdiff_accum<0b01, { 0b111, opc }, asm, ZPR16, ZPR16>;
def _S : sve2_int_absdiff_accum<0b10, { 0b111, opc }, asm, ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<0b11, { 0b111, opc }, asm, ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm> {
+multiclass sve2_int_absdiff_accum_long<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _H : sve2_int_absdiff_accum<0b01, { 0b00, opc }, asm, ZPR16, ZPR8>;
def _S : sve2_int_absdiff_accum<0b10, { 0b00, opc }, asm, ZPR32, ZPR16>;
def _D : sve2_int_absdiff_accum<0b11, { 0b00, opc }, asm, ZPR64, ZPR32>;
+
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm> {
+multiclass sve2_int_addsub_long_carry<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve2_int_absdiff_accum<{ opc{1}, 0b0 }, { 0b010, opc{0} }, asm,
ZPR32, ZPR32>;
def _D : sve2_int_absdiff_accum<{ opc{1}, 0b1 }, { 0b010, opc{0} }, asm,
ZPR64, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -3300,7 +3638,7 @@ class sve_int_un_pred_arit<bits<2> sz8_64, bits<4> opc,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -3465,11 +3803,12 @@ class sve_int_arith_imm0<bits<2> sz8_64, bits<3> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
+multiclass sve_int_arith_imm0<bits<3> opc, string asm,
+ SDPatternOperator op, SDPatternOperator int_op> {
def _B : sve_int_arith_imm0<0b00, opc, asm, ZPR8, addsub_imm8_opt_lsl_i8>;
def _H : sve_int_arith_imm0<0b01, opc, asm, ZPR16, addsub_imm8_opt_lsl_i16>;
def _S : sve_int_arith_imm0<0b10, opc, asm, ZPR32, addsub_imm8_opt_lsl_i32>;
@@ -3479,6 +3818,12 @@ multiclass sve_int_arith_imm0<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
+
+ // Intrinsic version
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv16i8, int_op, ZPR8, i32, SVEAddSubImm8Pat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv8i16, int_op, ZPR16, i32, SVEAddSubImm16Pat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv4i32, int_op, ZPR32, i32, SVEAddSubImm32Pat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_OptLsl_Pat<nxv2i64, int_op, ZPR64, i64, SVEAddSubImm64Pat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm0_subr<bits<3> opc, string asm, SDPatternOperator op> {
@@ -3509,7 +3854,7 @@ class sve_int_arith_imm<bits<2> sz8_64, bits<6> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -3519,10 +3864,10 @@ multiclass sve_int_arith_imm1<bits<2> opc, string asm, SDPatternOperator op> {
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, simm8>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, simm8>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithSImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithSImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperator op> {
@@ -3531,10 +3876,10 @@ multiclass sve_int_arith_imm1_unsigned<bits<2> opc, string asm, SDPatternOperato
def _S : sve_int_arith_imm<0b10, { 0b1010, opc }, asm, ZPR32, imm0_255>;
def _D : sve_int_arith_imm<0b11, { 0b1010, opc }, asm, ZPR64, imm0_255>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv16i8, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv8i16, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv4i32, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
- def : SVE_1_Op_Imm_Arith_Pat<nxv2i64, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, i32, SVEArithUImmPat, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEArithUImmPat, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_arith_imm2<string asm, SDPatternOperator op> {
@@ -3604,11 +3949,11 @@ class sve2_int_bitwise_ternary_op_d<bits<3> opc, string asm>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
+multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve2_int_bitwise_ternary_op_d<opc, asm>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
@@ -3617,6 +3962,11 @@ multiclass sve2_int_bitwise_ternary_op<bits<3> opc, string asm> {
(!cast<Instruction>(NAME) ZPR16:$Zdn, ZPR16:$Zm, ZPR16:$Zk), 1>;
def : InstAlias<asm # "\t$Zdn, $Zdn, $Zm, $Zk",
(!cast<Instruction>(NAME) ZPR32:$Zdn, ZPR32:$Zm, ZPR32:$Zk), 1>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
}
class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
@@ -3638,11 +3988,11 @@ class sve2_int_rotate_right_imm<bits<4> tsz8_64, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
-multiclass sve2_int_rotate_right_imm<string asm> {
+multiclass sve2_int_rotate_right_imm<string asm, SDPatternOperator op> {
def _B : sve2_int_rotate_right_imm<{0,0,0,1}, asm, ZPR8, vecshiftR8>;
def _H : sve2_int_rotate_right_imm<{0,0,1,?}, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -3654,6 +4004,10 @@ multiclass sve2_int_rotate_right_imm<string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i8, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i16, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i64, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -3678,7 +4032,7 @@ class sve_int_dup_fpimm_pred<bits<2> sz, Operand fpimmtype,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -3713,26 +4067,34 @@ class sve_int_dup_imm_pred<bits<2> sz8_64, bit m, string asm,
let Inst{12-5} = imm{7-0}; // imm8
let Inst{4-0} = Zd;
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_dup_imm_pred_merge<string asm> {
- let Constraints = "$Zd = $_Zd" in {
- def _B : sve_int_dup_imm_pred<0b00, 1, asm, ZPR8, "/m", (ins ZPR8:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
- def _H : sve_int_dup_imm_pred<0b01, 1, asm, ZPR16, "/m", (ins ZPR16:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
- def _S : sve_int_dup_imm_pred<0b10, 1, asm, ZPR32, "/m", (ins ZPR32:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
- def _D : sve_int_dup_imm_pred<0b11, 1, asm, ZPR64, "/m", (ins ZPR64:$_Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
- }
-
- def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+multiclass sve_int_dup_imm_pred_merge_inst<
+ bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
+ ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+ let Constraints = "$Zd = $_Zd" in
+ def NAME : sve_int_dup_imm_pred<sz8_64, 1, asm, zprty, "/m",
+ (ins zprty:$_Zd, PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/m, $imm",
- (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+ (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
+ def : Pat<(intty
+ (vselect predty:$Pg,
+ (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
+ intty:$Zd)),
+ (!cast<Instruction>(NAME) zprty:$Zd, $Pg, i32:$imm, i32:$shift)>;
+}
+
+multiclass sve_int_dup_imm_pred_merge<string asm> {
+ defm _B : sve_int_dup_imm_pred_merge_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
+ i32, cpy_imm8_opt_lsl_i8>;
+ defm _H : sve_int_dup_imm_pred_merge_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
+ i32, cpy_imm8_opt_lsl_i16>;
+ defm _S : sve_int_dup_imm_pred_merge_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
+ i32, cpy_imm8_opt_lsl_i32>;
+ defm _D : sve_int_dup_imm_pred_merge_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
+ i64, cpy_imm8_opt_lsl_i64>;
def : InstAlias<"fmov $Zd, $Pg/m, #0.0",
(!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, 0, 0), 0>;
@@ -3742,20 +4104,35 @@ multiclass sve_int_dup_imm_pred_merge<string asm> {
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, 0, 0), 0>;
}
-multiclass sve_int_dup_imm_pred_zero<string asm> {
- def _B : sve_int_dup_imm_pred<0b00, 0, asm, ZPR8, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm)>;
- def _H : sve_int_dup_imm_pred<0b01, 0, asm, ZPR16, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm)>;
- def _S : sve_int_dup_imm_pred<0b10, 0, asm, ZPR32, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm)>;
- def _D : sve_int_dup_imm_pred<0b11, 0, asm, ZPR64, "/z", (ins PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm)>;
-
- def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _B) ZPR8:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i8:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _H) ZPR16:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i16:$imm), 1>;
+multiclass sve_int_dup_imm_pred_zero_inst<
+ bits<2> sz8_64, string asm, ZPRRegOp zprty, ValueType intty,
+ ValueType predty, ValueType scalarty, imm8_opt_lsl cpyimm> {
+ def NAME : sve_int_dup_imm_pred<sz8_64, 0, asm, zprty, "/z",
+ (ins PPRAny:$Pg, cpyimm:$imm)>;
def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i32:$imm), 1>;
- def : InstAlias<"mov $Zd, $Pg/z, $imm",
- (!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPRAny:$Pg, cpy_imm8_opt_lsl_i64:$imm), 1>;
+ (!cast<Instruction>(NAME) zprty:$Zd, PPRAny:$Pg, cpyimm:$imm), 1>;
+ def : Pat<(intty (zext (predty PPRAny:$Ps1))),
+ (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
+ def : Pat<(intty (sext (predty PPRAny:$Ps1))),
+ (!cast<Instruction>(NAME) PPRAny:$Ps1, -1, 0)>;
+ def : Pat<(intty (anyext (predty PPRAny:$Ps1))),
+ (!cast<Instruction>(NAME) PPRAny:$Ps1, 1, 0)>;
+ def : Pat<(intty
+ (vselect predty:$Pg,
+ (intty (AArch64dup (scalarty (SVE8BitLslImm i32:$imm, i32:$shift)))),
+ (intty (AArch64dup (scalarty 0))))),
+ (!cast<Instruction>(NAME) $Pg, i32:$imm, i32:$shift)>;
+}
+
+multiclass sve_int_dup_imm_pred_zero<string asm> {
+ defm _B : sve_int_dup_imm_pred_zero_inst<0b00, asm, ZPR8, nxv16i8, nxv16i1,
+ i32, cpy_imm8_opt_lsl_i8>;
+ defm _H : sve_int_dup_imm_pred_zero_inst<0b01, asm, ZPR16, nxv8i16, nxv8i1,
+ i32, cpy_imm8_opt_lsl_i16>;
+ defm _S : sve_int_dup_imm_pred_zero_inst<0b10, asm, ZPR32, nxv4i32, nxv4i1,
+ i32, cpy_imm8_opt_lsl_i32>;
+ defm _D : sve_int_dup_imm_pred_zero_inst<0b11, asm, ZPR64, nxv2i64, nxv2i1,
+ i64, cpy_imm8_opt_lsl_i64>;
}
//===----------------------------------------------------------------------===//
@@ -3787,17 +4164,24 @@ class sve_int_cmp<bit cmp_1, bits<2> sz8_64, bits<3> opc, string asm,
let Defs = [NZCV];
}
-multiclass sve_int_cmp_0<bits<3> opc, string asm, SDPatternOperator op,
- CondCode cc> {
+multiclass SVE_SETCC_Pat<CondCode cc, CondCode invcc, ValueType predvt,
+ ValueType intvt, sve_int_cmp cmp> {
+ def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, cc)),
+ (cmp $Op1, $Op2, $Op3)>;
+ def : Pat<(predvt (AArch64setcc_z predvt:$Op1, intvt:$Op2, intvt:$Op3, invcc)),
+ (cmp $Op1, $Op3, $Op2)>;
+}
+
+multiclass sve_int_cmp_0<bits<3> opc, string asm, CondCode cc, CondCode invcc> {
def _B : sve_int_cmp<0b0, 0b00, opc, asm, PPR8, ZPR8, ZPR8>;
def _H : sve_int_cmp<0b0, 0b01, opc, asm, PPR16, ZPR16, ZPR16>;
def _S : sve_int_cmp<0b0, 0b10, opc, asm, PPR32, ZPR32, ZPR32>;
def _D : sve_int_cmp<0b0, 0b11, opc, asm, PPR64, ZPR64, ZPR64>;
- def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4i1, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv16i1, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv8i1, nxv8i16, !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv4i1, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Pat<cc, invcc, nxv2i1, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
multiclass sve_int_cmp_0_wide<bits<3> opc, string asm, SDPatternOperator op> {
@@ -3852,67 +4236,35 @@ class sve_int_scmp_vi<bits<2> sz8_64, bits<3> opc, string asm, PPRRegOp pprty,
let ElementSize = pprty.ElementSize;
}
-multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc,
- SDPatternOperator op = null_frag,
- SDPatternOperator inv_op = null_frag> {
+multiclass SVE_SETCC_Imm_Pat<CondCode cc, CondCode commuted_cc,
+ ValueType predvt, ValueType intvt,
+ Operand immtype, Instruction cmp> {
+ def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
+ (intvt ZPR:$Zs1),
+ (intvt (AArch64dup (immtype:$imm))),
+ cc)),
+ (cmp $Pg, $Zs1, immtype:$imm)>;
+ def : Pat<(predvt (AArch64setcc_z (predvt PPR_3b:$Pg),
+ (intvt (AArch64dup (immtype:$imm))),
+ (intvt ZPR:$Zs1),
+ commuted_cc)),
+ (cmp $Pg, $Zs1, immtype:$imm)>;
+}
+
+multiclass sve_int_scmp_vi<bits<3> opc, string asm, CondCode cc, CondCode commuted_cc> {
def _B : sve_int_scmp_vi<0b00, opc, asm, PPR8, ZPR8, simm5_32b>;
def _H : sve_int_scmp_vi<0b01, opc, asm, PPR16, ZPR16, simm5_32b>;
def _S : sve_int_scmp_vi<0b10, opc, asm, PPR32, ZPR32, simm5_32b>;
def _D : sve_int_scmp_vi<0b11, opc, asm, PPR64, ZPR64, simm5_64b>;
- // IR version
- def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (simm5_32b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (simm5_32b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (simm5_32b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (simm5_64b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, simm5_64b:$imm)>;
-
- // Intrinsic version
- def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (simm5_32b:$imm))))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (simm5_32b:$imm))))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (simm5_32b:$imm))))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (simm5_64b:$imm))))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>;
-
- // Inverted intrinsic version
- def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 (AArch64dup (simm5_32b:$imm))),
- (nxv16i8 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 (AArch64dup (simm5_32b:$imm))),
- (nxv8i16 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 (AArch64dup (simm5_32b:$imm))),
- (nxv4i32 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, simm5_32b:$imm)>;
- def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 (AArch64dup (simm5_64b:$imm))),
- (nxv2i64 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, simm5_64b:$imm)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, simm5_32b,
+ !cast<Instruction>(NAME # _B)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, simm5_32b,
+ !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, simm5_32b,
+ !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, simm5_64b,
+ !cast<Instruction>(NAME # _D)>;
}
@@ -3944,66 +4296,20 @@ class sve_int_ucmp_vi<bits<2> sz8_64, bits<2> opc, string asm, PPRRegOp pprty,
}
multiclass sve_int_ucmp_vi<bits<2> opc, string asm, CondCode cc,
- SDPatternOperator op = null_frag,
- SDPatternOperator inv_op = null_frag> {
+ CondCode commuted_cc> {
def _B : sve_int_ucmp_vi<0b00, opc, asm, PPR8, ZPR8, imm0_127>;
def _H : sve_int_ucmp_vi<0b01, opc, asm, PPR16, ZPR16, imm0_127>;
def _S : sve_int_ucmp_vi<0b10, opc, asm, PPR32, ZPR32, imm0_127>;
def _D : sve_int_ucmp_vi<0b11, opc, asm, PPR64, ZPR64, imm0_127_64b>;
- // IR version
- def : Pat<(nxv16i1 (setcc (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (imm0_127:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_B") (PTRUE_B 31), ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv8i1 (setcc (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (imm0_127:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_H") (PTRUE_H 31), ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv4i1 (setcc (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (imm0_127:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_S") (PTRUE_S 31), ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv2i1 (setcc (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (imm0_127_64b:$imm))),
- cc)),
- (!cast<Instruction>(NAME # "_D") (PTRUE_D 31), ZPR:$Zs1, imm0_127_64b:$imm)>;
-
- // Intrinsic version
- def : Pat<(nxv16i1 (op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 ZPR:$Zs1),
- (nxv16i8 (AArch64dup (imm0_127:$imm))))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv8i1 (op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 ZPR:$Zs1),
- (nxv8i16 (AArch64dup (imm0_127:$imm))))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv4i1 (op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 ZPR:$Zs1),
- (nxv4i32 (AArch64dup (imm0_127:$imm))))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv2i1 (op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 ZPR:$Zs1),
- (nxv2i64 (AArch64dup (imm0_127_64b:$imm))))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>;
-
- // Inverted intrinsic version
- def : Pat<(nxv16i1 (inv_op (nxv16i1 PPR_3b:$Pg),
- (nxv16i8 (AArch64dup (imm0_127:$imm))),
- (nxv16i8 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_B") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv8i1 (inv_op (nxv8i1 PPR_3b:$Pg),
- (nxv8i16 (AArch64dup (imm0_127:$imm))),
- (nxv8i16 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_H") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv4i1 (inv_op (nxv4i1 PPR_3b:$Pg),
- (nxv4i32 (AArch64dup (imm0_127:$imm))),
- (nxv4i32 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_S") PPR_3b:$Pg, ZPR:$Zs1, imm0_127:$imm)>;
- def : Pat<(nxv2i1 (inv_op (nxv2i1 PPR_3b:$Pg),
- (nxv2i64 (AArch64dup (imm0_127_64b:$imm))),
- (nxv2i64 ZPR:$Zs1))),
- (!cast<Instruction>(NAME # "_D") PPR_3b:$Pg, ZPR:$Zs1, imm0_127_64b:$imm)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv16i1, nxv16i8, imm0_127,
+ !cast<Instruction>(NAME # _B)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv8i1, nxv8i16, imm0_127,
+ !cast<Instruction>(NAME # _H)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv4i1, nxv4i32, imm0_127,
+ !cast<Instruction>(NAME # _S)>;
+ defm : SVE_SETCC_Imm_Pat<cc, commuted_cc, nxv2i1, nxv2i64, imm0_127_64b,
+ !cast<Instruction>(NAME # _D)>;
}
@@ -4096,11 +4402,17 @@ class sve2_int_while_rr<bits<2> sz8_64, bits<1> rw, string asm,
let Defs = [NZCV];
}
-multiclass sve2_int_while_rr<bits<1> rw, string asm> {
+multiclass sve2_int_while_rr<bits<1> rw, string asm, string op> {
def _B : sve2_int_while_rr<0b00, rw, asm, PPR8>;
def _H : sve2_int_while_rr<0b01, rw, asm, PPR16>;
def _S : sve2_int_while_rr<0b10, rw, asm, PPR32>;
def _D : sve2_int_while_rr<0b11, rw, asm, PPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i1, !cast<SDPatternOperator>(op # _b), i64, i64, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i1, !cast<SDPatternOperator>(op # _h), i64, i64, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i1, !cast<SDPatternOperator>(op # _s), i64, i64, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i1, !cast<SDPatternOperator>(op # _d), i64, i64, !cast<Instruction>(NAME # _D)>;
+
}
//===----------------------------------------------------------------------===//
@@ -4108,8 +4420,8 @@ multiclass sve2_int_while_rr<bits<1> rw, string asm> {
//===----------------------------------------------------------------------===//
class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty, RegisterClass dstRegClass>
-: I<(outs dstRegClass:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
+ ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vd), (ins PPR3bAny:$Pg, zprty:$Zn),
asm, "\t$Vd, $Pg, $Zn",
"",
[]>, Sched<[]> {
@@ -4127,13 +4439,13 @@ class sve_fp_fast_red<bits<2> sz, bits<3> opc, string asm,
}
multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16>;
- def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32>;
- def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64>;
+ def _H : sve_fp_fast_red<0b01, opc, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_fp_fast_red<0b10, opc, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_fp_fast_red<0b11, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_2_Op_Pat<f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_2_Op_Pat<f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_2_Op_Pat<f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_2_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
@@ -4142,8 +4454,8 @@ multiclass sve_fp_fast_red<bits<3> opc, string asm, SDPatternOperator op> {
//===----------------------------------------------------------------------===//
class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
- ZPRRegOp zprty, RegisterClass dstRegClass>
-: I<(outs dstRegClass:$Vdn), (ins PPR3bAny:$Pg, dstRegClass:$_Vdn, zprty:$Zm),
+ ZPRRegOp zprty, FPRasZPROperand dstOpType>
+: I<(outs dstOpType:$Vdn), (ins PPR3bAny:$Pg, dstOpType:$_Vdn, zprty:$Zm),
asm, "\t$Vdn, $Pg, $_Vdn, $Zm",
"",
[]>,
@@ -4164,13 +4476,13 @@ class sve_fp_2op_p_vd<bits<2> sz, bits<3> opc, string asm,
}
multiclass sve_fp_2op_p_vd<bits<3> opc, string asm, SDPatternOperator op> {
- def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16>;
- def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32>;
- def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64>;
+ def _H : sve_fp_2op_p_vd<0b01, opc, asm, ZPR16, FPR16asZPR>;
+ def _S : sve_fp_2op_p_vd<0b10, opc, asm, ZPR32, FPR32asZPR>;
+ def _D : sve_fp_2op_p_vd<0b11, opc, asm, ZPR64, FPR64asZPR>;
- def : SVE_3_Op_Pat<f16, op, nxv8i1, f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<f32, op, nxv4i1, f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<f64, op, nxv2i1, f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -4210,6 +4522,22 @@ multiclass sve_fp_3op_p_pd<bits<3> opc, string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv2i1, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_fp_3op_p_pd_cc<bits<3> opc, string asm, SDPatternOperator op,
+ SDPatternOperator op_nopred>
+: sve_fp_3op_p_pd<opc, asm, op> {
+ def : SVE_2_Op_AllActive_Pat<nxv8i1, op_nopred, nxv8f16, nxv8f16,
+ !cast<Instruction>(NAME # _H), PTRUE_H>;
+ def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f16, nxv4f16,
+ !cast<Instruction>(NAME # _H), PTRUE_S>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f16, nxv2f16,
+ !cast<Instruction>(NAME # _H), PTRUE_D>;
+ def : SVE_2_Op_AllActive_Pat<nxv4i1, op_nopred, nxv4f32, nxv4f32,
+ !cast<Instruction>(NAME # _S), PTRUE_S>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f32, nxv2f32,
+ !cast<Instruction>(NAME # _S), PTRUE_D>;
+ def : SVE_2_Op_AllActive_Pat<nxv2i1, op_nopred, nxv2f64, nxv2f64,
+ !cast<Instruction>(NAME # _D), PTRUE_D>;
+}
//===----------------------------------------------------------------------===//
// SVE Floating Point Compare - with Zero Group
@@ -4263,11 +4591,20 @@ class sve_int_index_ii<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ii<string asm> {
- def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_32b>;
- def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_32b>;
+multiclass sve_int_index_ii<string asm, SDPatternOperator op> {
+ def _B : sve_int_index_ii<0b00, asm, ZPR8, simm5_8b>;
+ def _H : sve_int_index_ii<0b01, asm, ZPR16, simm5_16b>;
def _S : sve_int_index_ii<0b10, asm, ZPR32, simm5_32b>;
def _D : sve_int_index_ii<0b11, asm, ZPR64, simm5_64b>;
+
+ def : Pat<(nxv16i8 (op simm5_8b:$imm5, simm5_8b:$imm5b)),
+ (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, simm5_8b:$imm5b)>;
+ def : Pat<(nxv8i16 (op simm5_16b:$imm5, simm5_16b:$imm5b)),
+ (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, simm5_16b:$imm5b)>;
+ def : Pat<(nxv4i32 (op simm5_32b:$imm5, simm5_32b:$imm5b)),
+ (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, simm5_32b:$imm5b)>;
+ def : Pat<(nxv2i64 (op simm5_64b:$imm5, simm5_64b:$imm5b)),
+ (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, simm5_64b:$imm5b)>;
}
class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4287,11 +4624,20 @@ class sve_int_index_ir<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ir<string asm> {
- def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_32b>;
- def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_32b>;
+multiclass sve_int_index_ir<string asm, SDPatternOperator op> {
+ def _B : sve_int_index_ir<0b00, asm, ZPR8, GPR32, simm5_8b>;
+ def _H : sve_int_index_ir<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ir<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ir<0b11, asm, ZPR64, GPR64, simm5_64b>;
+
+ def : Pat<(nxv16i8 (op simm5_8b:$imm5, GPR32:$Rm)),
+ (!cast<Instruction>(NAME # "_B") simm5_8b:$imm5, GPR32:$Rm)>;
+ def : Pat<(nxv8i16 (op simm5_16b:$imm5, GPR32:$Rm)),
+ (!cast<Instruction>(NAME # "_H") simm5_16b:$imm5, GPR32:$Rm)>;
+ def : Pat<(nxv4i32 (op simm5_32b:$imm5, GPR32:$Rm)),
+ (!cast<Instruction>(NAME # "_S") simm5_32b:$imm5, GPR32:$Rm)>;
+ def : Pat<(nxv2i64 (op simm5_64b:$imm5, GPR64:$Rm)),
+ (!cast<Instruction>(NAME # "_D") simm5_64b:$imm5, GPR64:$Rm)>;
}
class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4311,11 +4657,20 @@ class sve_int_index_ri<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_ri<string asm> {
- def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_32b>;
- def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_32b>;
+multiclass sve_int_index_ri<string asm, SDPatternOperator op> {
+ def _B : sve_int_index_ri<0b00, asm, ZPR8, GPR32, simm5_8b>;
+ def _H : sve_int_index_ri<0b01, asm, ZPR16, GPR32, simm5_16b>;
def _S : sve_int_index_ri<0b10, asm, ZPR32, GPR32, simm5_32b>;
def _D : sve_int_index_ri<0b11, asm, ZPR64, GPR64, simm5_64b>;
+
+ def : Pat<(nxv16i8 (op GPR32:$Rm, simm5_8b:$imm5)),
+ (!cast<Instruction>(NAME # "_B") GPR32:$Rm, simm5_8b:$imm5)>;
+ def : Pat<(nxv8i16 (op GPR32:$Rm, simm5_16b:$imm5)),
+ (!cast<Instruction>(NAME # "_H") GPR32:$Rm, simm5_16b:$imm5)>;
+ def : Pat<(nxv4i32 (op GPR32:$Rm, simm5_32b:$imm5)),
+ (!cast<Instruction>(NAME # "_S") GPR32:$Rm, simm5_32b:$imm5)>;
+ def : Pat<(nxv2i64 (op GPR64:$Rm, simm5_64b:$imm5)),
+ (!cast<Instruction>(NAME # "_D") GPR64:$Rm, simm5_64b:$imm5)>;
}
class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -4335,19 +4690,23 @@ class sve_int_index_rr<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
}
-multiclass sve_int_index_rr<string asm> {
+multiclass sve_int_index_rr<string asm, SDPatternOperator op> {
def _B : sve_int_index_rr<0b00, asm, ZPR8, GPR32>;
def _H : sve_int_index_rr<0b01, asm, ZPR16, GPR32>;
def _S : sve_int_index_rr<0b10, asm, ZPR32, GPR32>;
def _D : sve_int_index_rr<0b11, asm, ZPR64, GPR64>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, i32, i32, !cast<Instruction>(NAME # _B)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, i32, i32, !cast<Instruction>(NAME # _H)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, i32, i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, i64, i64, !cast<Instruction>(NAME # _D)>;
}
//
//===----------------------------------------------------------------------===//
// SVE Bitwise Shift - Predicated Group
//===----------------------------------------------------------------------===//
class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
- ZPRRegOp zprty, Operand immtype,
- ElementSizeEnum size>
+ ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zdn), (ins PPR3bAny:$Pg, zprty:$_Zdn, immtype:$imm),
asm, "\t$Zdn, $Pg/m, $_Zdn, $imm",
"",
@@ -4366,50 +4725,99 @@ class sve_int_bin_pred_shift_imm<bits<4> tsz8_64, bits<4> opc, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
- let ElementSize = size;
+ let DestructiveInstType = DestructiveBinaryImm;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm, string psName=""> {
+ def _B : SVEPseudo2Instr<psName # _B, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : SVEPseudo2Instr<psName # _H, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
+ let Inst{8} = imm{3};
+ }
+ def _S : SVEPseudo2Instr<psName # _S, 1>,
+ sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
+ let Inst{9-8} = imm{4-3};
+ }
+ def _D : SVEPseudo2Instr<psName # _D, 1>,
+ sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
+ let Inst{22} = imm{5};
+ let Inst{9-8} = imm{4-3};
+ }
}
-multiclass sve_int_bin_pred_shift_imm_left<bits<4> opc, string asm> {
- def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8,
- ElementSizeB>;
- def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16,
- ElementSizeH> {
+multiclass sve2_int_bin_pred_shift_imm_left<bits<4> opc, string asm,
+ string psName,
+ SDPatternOperator op> {
+
+ def _B : SVEPseudo2Instr<psName # _B, 1>, sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
+ def _H : SVEPseudo2Instr<psName # _H, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{8} = imm{3};
}
- def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32,
- ElementSizeS> {
+ def _S : SVEPseudo2Instr<psName # _S, 1>,
+ sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftL32> {
let Inst{9-8} = imm{4-3};
}
- def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64,
- ElementSizeD> {
+ def _D : SVEPseudo2Instr<psName # _D, 1>,
+ sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftL64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
+
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftL64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm,
+multiclass sve_int_bin_pred_shift_imm_left_zeroing_bhsd<SDPatternOperator op> {
+ def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, tvecshiftL8, FalseLanesZero>;
+ def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, tvecshiftL16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, tvecshiftL32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, tvecshiftL64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftL8, !cast<Pseudo>(NAME # _ZERO_B)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftL16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftL32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftL64, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
+
+multiclass sve_int_bin_pred_shift_imm_right<bits<4> opc, string asm, string Ps,
SDPatternOperator op = null_frag> {
- def _B : sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8,
- ElementSizeB>;
- def _H : sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16,
- ElementSizeH> {
+ def _B : SVEPseudo2Instr<Ps # _B, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
+ def _H : SVEPseudo2Instr<Ps # _H, 1>,
+ sve_int_bin_pred_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{8} = imm{3};
}
- def _S : sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32,
- ElementSizeS> {
+ def _S : SVEPseudo2Instr<Ps # _S, 1>,
+ sve_int_bin_pred_shift_imm<{0,1,?,?}, opc, asm, ZPR32, vecshiftR32> {
let Inst{9-8} = imm{4-3};
}
- def _D : sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64,
- ElementSizeD> {
+ def _D : SVEPseudo2Instr<Ps # _D, 1>,
+ sve_int_bin_pred_shift_imm<{1,?,?,?}, opc, asm, ZPR64, vecshiftR64> {
let Inst{22} = imm{5};
let Inst{9-8} = imm{4-3};
}
- def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, vecshiftR8, !cast<Instruction>(NAME # _B)>;
- def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, vecshiftR16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, vecshiftR64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Imm_Pat<nxv16i8, op, nxv16i1, nxv16i8, i32, tvecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Imm_Pat<nxv8i16, op, nxv8i1, nxv8i16, i32, tvecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Imm_Pat<nxv4i32, op, nxv4i1, nxv4i32, i32, tvecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Imm_Pat<nxv2i64, op, nxv2i1, nxv2i64, i32, tvecshiftR64, !cast<Instruction>(NAME # _D)>;
+}
+
+multiclass sve_int_bin_pred_shift_imm_right_zeroing_bhsd<SDPatternOperator op = null_frag> {
+ def _ZERO_B : PredTwoOpImmPseudo<NAME # _B, ZPR8, vecshiftR8, FalseLanesZero>;
+ def _ZERO_H : PredTwoOpImmPseudo<NAME # _H, ZPR16, vecshiftR16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpImmPseudo<NAME # _S, ZPR32, vecshiftR32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpImmPseudo<NAME # _D, ZPR64, vecshiftR64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv16i8, op, nxv16i1, nxv16i8, tvecshiftR8, !cast<Pseudo>(NAME # _ZERO_B)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv8i16, op, nxv8i1, nxv8i16, tvecshiftR16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv4i32, op, nxv4i1, nxv4i32, tvecshiftR32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_Shift_Imm_SelZero<nxv2i64, op, nxv2i1, nxv2i64, tvecshiftR64, !cast<Pseudo>(NAME # _ZERO_D)>;
}
class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
@@ -4432,23 +4840,40 @@ class sve_int_bin_pred_shift<bits<2> sz8_64, bit wide, bits<3> opc,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_bin_pred_shift<bits<3> opc, string asm,
- SDPatternOperator op> {
- def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>;
- def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>;
- def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>;
- def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>;
-
+multiclass sve_int_bin_pred_shift<bits<3> opc, string asm, string Ps,
+ SDPatternOperator op, string revname, bit isReverseInstr = 0> {
+ let DestructiveInstType = DestructiveBinaryCommWithRev in {
+ def _B : sve_int_bin_pred_shift<0b00, 0b0, opc, asm, ZPR8, ZPR8>,
+ SVEPseudo2Instr<Ps # _B, 1>, SVEInstr2Rev<NAME # _B, revname # _B, isReverseInstr>;
+ def _H : sve_int_bin_pred_shift<0b01, 0b0, opc, asm, ZPR16, ZPR16>,
+ SVEPseudo2Instr<Ps # _H, 1>, SVEInstr2Rev<NAME # _H, revname # _H, isReverseInstr>;
+ def _S : sve_int_bin_pred_shift<0b10, 0b0, opc, asm, ZPR32, ZPR32>,
+ SVEPseudo2Instr<Ps # _S, 1>, SVEInstr2Rev<NAME # _S, revname # _S, isReverseInstr>;
+ def _D : sve_int_bin_pred_shift<0b11, 0b0, opc, asm, ZPR64, ZPR64>,
+ SVEPseudo2Instr<Ps # _D, 1>, SVEInstr2Rev<NAME # _D, revname # _D, isReverseInstr>;
+ }
def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
+multiclass sve_int_bin_pred_zeroing_bhsd<SDPatternOperator op> {
+ def _ZERO_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesZero>;
+ def _ZERO_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesZero>;
+ def _ZERO_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesZero>;
+ def _ZERO_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesZero>;
+
+ def : SVE_3_Op_Pat_SelZero<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _ZERO_B)>;
+ def : SVE_3_Op_Pat_SelZero<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _ZERO_H)>;
+ def : SVE_3_Op_Pat_SelZero<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _ZERO_S)>;
+ def : SVE_3_Op_Pat_SelZero<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _ZERO_D)>;
+}
+
multiclass sve_int_bin_pred_shift_wide<bits<3> opc, string asm,
SDPatternOperator op> {
def _B : sve_int_bin_pred_shift<0b00, 0b1, opc, asm, ZPR8, ZPR64>;
@@ -4493,7 +4918,8 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
ZPRRegOp zprty, Operand immtype>
: I<(outs zprty:$Zd), (ins zprty:$Zn, immtype:$imm),
asm, "\t$Zd, $Zn, $imm",
- "", []>, Sched<[]> {
+ "",
+ []>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<6> imm;
@@ -4508,7 +4934,8 @@ class sve_int_bin_cons_shift_imm<bits<4> tsz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
}
-multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftL8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftL16> {
let Inst{19} = imm{3};
@@ -4520,9 +4947,15 @@ multiclass sve_int_bin_cons_shift_imm_left<bits<2> opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftL8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftL16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftL32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
}
-multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
+multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm,
+ SDPatternOperator op> {
def _B : sve_int_bin_cons_shift_imm<{0,0,0,1}, opc, asm, ZPR8, vecshiftR8>;
def _H : sve_int_bin_cons_shift_imm<{0,0,1,?}, opc, asm, ZPR16, vecshiftR16> {
let Inst{19} = imm{3};
@@ -4534,6 +4967,11 @@ multiclass sve_int_bin_cons_shift_imm_right<bits<2> opc, string asm> {
let Inst{22} = imm{5};
let Inst{20-19} = imm{4-3};
}
+
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv16i8, nxv16i1, op, ZPR8, vecshiftR8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv8i16, nxv8i1, op, ZPR16, vecshiftR16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_1_Op_Imm_Shift_Pred_Pat<nxv4i32, nxv4i1, op, ZPR32, vecshiftR32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_1_Op_Imm_Arith_Pred_Pat<nxv2i64, nxv2i1, op, ZPR64, i64, SVEShiftImm64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
// SVE Memory - Store Group
@@ -4743,16 +5181,36 @@ class sve2_mem_sstnt_vs_base<bits<3> opc, string asm,
let mayStore = 1;
}
-multiclass sve2_mem_sstnt_vs<bits<3> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_sstnt_vs_base<opc, asm, listty, zprty>;
+multiclass sve2_mem_sstnt_vs_32_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_s, ZPR32>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
+ def : Pat <(op (nxv4i32 ZPR32:$Zt), (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_sstnt_vs_64_ptrs<bits<3> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_sstnt_vs_base<opc, asm, Z_d, ZPR64>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat <(op (nxv2i64 ZPR64:$Zt), (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zn), (i64 GPR64:$Rm), vt),
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm)>;
}
class sve_mem_sst_sv<bits<3> opc, bit xs, bit scaled, string asm,
@@ -5094,6 +5552,17 @@ class sve_int_rdffr_pred<bit s, string asm>
let Uses = [FFR];
}
+multiclass sve_int_rdffr_pred<bit s, string asm, SDPatternOperator op> {
+ def _REAL : sve_int_rdffr_pred<s, asm>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs PPR8:$Pd), (ins PPRAny:$Pg), [(set (nxv16i1 PPR8:$Pd), (op (nxv16i1 PPRAny:$Pg)))]>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd, PPRAny:$Pg)>;
+ }
+}
+
class sve_int_rdffr_unpred<string asm> : I<
(outs PPR8:$Pd), (ins),
asm, "\t$Pd",
@@ -5106,11 +5575,22 @@ class sve_int_rdffr_unpred<string asm> : I<
let Uses = [FFR];
}
-class sve_int_wrffr<string asm>
+multiclass sve_int_rdffr_unpred<string asm, SDPatternOperator op> {
+ def _REAL : sve_int_rdffr_unpred<asm>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs PPR8:$Pd), (ins), [(set (nxv16i1 PPR8:$Pd), (op))]>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) PPR8:$Pd)>;
+ }
+}
+
+class sve_int_wrffr<string asm, SDPatternOperator op>
: I<(outs), (ins PPR8:$Pn),
asm, "\t$Pn",
"",
- []>, Sched<[]> {
+ [(op (nxv16i1 PPR8:$Pn))]>, Sched<[]> {
bits<4> Pn;
let Inst{31-9} = 0b00100101001010001001000;
let Inst{8-5} = Pn;
@@ -5120,11 +5600,11 @@ class sve_int_wrffr<string asm>
let Defs = [FFR];
}
-class sve_int_setffr<string asm>
+class sve_int_setffr<string asm, SDPatternOperator op>
: I<(outs), (ins),
asm, "",
"",
- []>, Sched<[]> {
+ [(op)]>, Sched<[]> {
let Inst{31-0} = 0b00100101001011001001000000000000;
let hasSideEffects = 1;
@@ -5219,7 +5699,7 @@ class sve_int_perm_clast_zz<bits<2> sz8_64, bit ab, string asm,
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -5317,7 +5797,7 @@ class sve_int_perm_splice<bits<2> sz8_64, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zdn;
let Constraints = "$Zdn = $_Zdn";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = ElementSizeNone;
}
@@ -5332,9 +5812,9 @@ multiclass sve_int_perm_splice<string asm, SDPatternOperator op> {
def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
- def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
- def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
- def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Instruction>(NAME # _H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Instruction>(NAME # _D)>;
}
class sve2_int_perm_splice_cons<bits<2> sz8_64, string asm,
@@ -5380,7 +5860,7 @@ class sve_int_perm_rev<bits<2> sz8_64, bits<2> opc, string asm,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
@@ -5443,11 +5923,11 @@ class sve_int_perm_cpy_r<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_perm_cpy_r<string asm> {
+multiclass sve_int_perm_cpy_r<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_r<0b00, asm, ZPR8, GPR32sp>;
def _H : sve_int_perm_cpy_r<0b01, asm, ZPR16, GPR32sp>;
def _S : sve_int_perm_cpy_r<0b10, asm, ZPR32, GPR32sp>;
@@ -5461,6 +5941,15 @@ multiclass sve_int_perm_cpy_r<string asm> {
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, GPR32sp:$Rn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Rn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, GPR64sp:$Rn), 1>;
+
+ def : Pat<(nxv16i8 (op nxv16i1:$pg, i32:$splat, nxv16i8:$passthru)),
+ (!cast<Instruction>(NAME # _B) $passthru, $pg, $splat)>;
+ def : Pat<(nxv8i16 (op nxv8i1:$pg, i32:$splat, nxv8i16:$passthru)),
+ (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
+ def : Pat<(nxv4i32 (op nxv4i1:$pg, i32:$splat, nxv4i32:$passthru)),
+ (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
+ def : Pat<(nxv2i64 (op nxv2i1:$pg, i64:$splat, nxv2i64:$passthru)),
+ (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
}
class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
@@ -5480,11 +5969,11 @@ class sve_int_perm_cpy_v<bits<2> sz8_64, string asm, ZPRRegOp zprty,
let Inst{4-0} = Zd;
let Constraints = "$Zd = $_Zd";
- let DestructiveInstType = Destructive;
+ let DestructiveInstType = DestructiveOther;
let ElementSize = zprty.ElementSize;
}
-multiclass sve_int_perm_cpy_v<string asm> {
+multiclass sve_int_perm_cpy_v<string asm, SDPatternOperator op> {
def _B : sve_int_perm_cpy_v<0b00, asm, ZPR8, FPR8>;
def _H : sve_int_perm_cpy_v<0b01, asm, ZPR16, FPR16>;
def _S : sve_int_perm_cpy_v<0b10, asm, ZPR32, FPR32>;
@@ -5498,6 +5987,16 @@ multiclass sve_int_perm_cpy_v<string asm> {
(!cast<Instruction>(NAME # _S) ZPR32:$Zd, PPR3bAny:$Pg, FPR32:$Vn), 1>;
def : InstAlias<"mov $Zd, $Pg/m, $Vn",
(!cast<Instruction>(NAME # _D) ZPR64:$Zd, PPR3bAny:$Pg, FPR64:$Vn), 1>;
+
+
+ def : Pat<(nxv8f16 (op nxv8i1:$pg, f16:$splat, nxv8f16:$passthru)),
+ (!cast<Instruction>(NAME # _H) $passthru, $pg, $splat)>;
+ def : Pat<(nxv2f32 (op nxv2i1:$pg, f32:$splat, nxv2f32:$passthru)),
+ (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
+ def : Pat<(nxv4f32 (op nxv4i1:$pg, f32:$splat, nxv4f32:$passthru)),
+ (!cast<Instruction>(NAME # _S) $passthru, $pg, $splat)>;
+ def : Pat<(nxv2f64 (op nxv2i1:$pg, f64:$splat, nxv2f64:$passthru)),
+ (!cast<Instruction>(NAME # _D) $passthru, $pg, $splat)>;
}
class sve_int_perm_compact<bit sz, string asm, ZPRRegOp zprty>
@@ -5557,14 +6056,21 @@ class sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
multiclass sve_mem_cld_si_base<bits<4> dtype, bit nf, string asm,
RegisterOperand listty, ZPRRegOp zprty> {
- def "" : sve_mem_cld_si_base<dtype, nf, asm, listty>;
+ def _REAL : sve_mem_cld_si_base<dtype, nf, asm, listty>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4, mul vl]",
- (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
+ (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
- (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1, mayLoad = 1 in {
+ def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s1:$imm4)>;
+ }
}
multiclass sve_mem_cld_si<bits<4> dtype, string asm, RegisterOperand listty,
@@ -5773,6 +6279,13 @@ multiclass sve_mem_cldff_ss<bits<4> dtype, string asm, RegisterOperand listty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
(!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, XZR), 0>;
+
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs listty:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm)>;
+ }
}
multiclass sve_mem_cldnf_si<bits<4> dtype, string asm, RegisterOperand listty,
@@ -5878,10 +6391,19 @@ multiclass sve_mem_32b_gld_sv_32_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW_SCALED : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
@@ -5898,10 +6420,19 @@ multiclass sve_mem_32b_gld_vs_32_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR32:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_s:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv4i32 (uxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv4i32 (sxtw_op (nxv4i1 PPR:$gp), GPR64sp:$base, (nxv4i32 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
@@ -5940,8 +6471,15 @@ multiclass sve_mem_32b_gld_vi_32_ptrs<bits<4> opc, string asm, Operand imm_ty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _IMM : Pseudo<(outs Z_s:$Zt), (ins PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, imm_ty:$imm5)>;
+ }
+
def : Pat<(nxv4i32 (op (nxv4i1 PPR:$gp), (nxv4i32 ZPR:$ptrs), imm_ty:$index, vt)),
- (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+ (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
class sve_mem_prfm_si<bits<2> msz, string asm>
@@ -6022,9 +6560,17 @@ class sve_mem_32b_prfm_sv<bits<2> msz, bit xs, string asm,
multiclass sve_mem_32b_prfm_sv_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
+ RegisterOperand uxtw_opnd,
+ PatFrag op_sxtw,
+ PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_32b_prfm_sv<msz, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_32b_prfm_sv<msz, 1, asm, sxtw_opnd>;
+
+ def : Pat<(op_uxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+ def : Pat<(op_sxtw (nxv4i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv4i32 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
}
class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
@@ -6047,11 +6593,14 @@ class sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
let Inst{3-0} = prfop;
}
-multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_32b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_32b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR32:$Zn, 0), 1>;
+
+ def : Pat<(op (nxv4i1 PPR_3b:$Pg), (nxv4i32 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
}
class sve_mem_z_fill<string asm>
@@ -6130,17 +6679,38 @@ class sve2_mem_gldnt_vs_base<bits<5> opc, dag iops, string asm,
let mayLoad = 1;
}
-multiclass sve2_mem_gldnt_vs<bits<5> opc, string asm,
- RegisterOperand listty, ZPRRegOp zprty> {
- def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm),
- asm, listty>;
+multiclass sve2_mem_gldnt_vs_32_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm),
+ asm, Z_s>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, GPR64:$Rm), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) ZPR32:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
+ (!cast<Instruction>(NAME # _REAL) Z_s:$Zt, PPR3bAny:$Pg, ZPR32:$Zn, XZR), 1>;
+
+ def : Pat <(nxv4i32 (op (nxv4i1 PPR3bAny:$Pg), (nxv4i32 ZPR32:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR32:$Zd, GPR64:$Rm)>;
+}
+
+multiclass sve2_mem_gldnt_vs_64_ptrs<bits<5> opc, string asm,
+ SDPatternOperator op,
+ ValueType vt> {
+ def _REAL : sve2_mem_gldnt_vs_base<opc, (ins PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm),
+ asm, Z_d>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn, $Rm]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, GPR64:$Rm), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, GPR64:$Rm), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) zprty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 0>;
+ (!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 0>;
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
- (!cast<Instruction>(NAME # _REAL) listty:$Zt, PPR3bAny:$Pg, zprty:$Zn, XZR), 1>;
+ (!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, XZR), 1>;
+
+ def : Pat <(nxv2i64 (op (nxv2i1 PPR3bAny:$Pg), (nxv2i64 ZPR64:$Zd), (i64 GPR64:$Rm), vt)),
+ (!cast<Instruction>(NAME # _REAL) PPR3bAny:$Pg, ZPR64:$Zd, GPR64:$Rm)>;
}
//===----------------------------------------------------------------------===//
@@ -6190,10 +6760,19 @@ multiclass sve_mem_64b_gld_sv_32_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW_SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _UXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _UXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SXTW_SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SXTW_SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
@@ -6210,10 +6789,19 @@ multiclass sve_mem_64b_gld_vs_32_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SXTW_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _UXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _UXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+ def _SXTW : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SXTW_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (uxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _UXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _UXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
def : Pat<(nxv2i64 (sxtw_op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _SXTW_REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME # _SXTW) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
@@ -6224,8 +6812,15 @@ multiclass sve_mem_64b_gld_sv2_64_scaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _SCALED_REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _SCALED : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _SCALED_REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$indices), vt)),
- (!cast<Instruction>(NAME # _SCALED_REAL) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
+ (!cast<Instruction>(NAME # _SCALED) PPR:$gp, GPR64sp:$base, ZPR:$indices)>;
}
multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
@@ -6235,8 +6830,15 @@ multiclass sve_mem_64b_gld_vs2_64_unscaled<bits<4> opc, string asm,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Zm]",
(!cast<Instruction>(NAME # _REAL) ZPR64:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), 0>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def "" : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _REAL) Z_d:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, ZPR64ExtLSL8:$Zm)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), GPR64sp:$base, (nxv2i64 ZPR:$offsets), vt)),
- (!cast<Instruction>(NAME # _REAL) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
+ (!cast<Instruction>(NAME) PPR:$gp, GPR64sp:$base, ZPR:$offsets)>;
}
class sve_mem_64b_gld_vi<bits<4> opc, string asm, Operand imm_ty>
@@ -6274,8 +6876,15 @@ multiclass sve_mem_64b_gld_vi_64_ptrs<bits<4> opc, string asm, Operand imm_ty,
def : InstAlias<asm # "\t$Zt, $Pg/z, [$Zn]",
(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
+ // We need a layer of indirection because early machine code passes balk at
+ // physical register (i.e. FFR) uses that have no previous definition.
+ let hasSideEffects = 1, hasNoSchedulingInfo = 1 in {
+ def _IMM : Pseudo<(outs Z_d:$Zt), (ins PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5), []>,
+ PseudoInstExpansion<(!cast<Instruction>(NAME # _IMM_REAL) Z_d:$Zt, PPR3bAny:$Pg, ZPR64:$Zn, imm_ty:$imm5)>;
+ }
+
def : Pat<(nxv2i64 (op (nxv2i1 PPR:$gp), (nxv2i64 ZPR:$ptrs), imm_ty:$index, vt)),
- (!cast<Instruction>(NAME # _IMM_REAL) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
+ (!cast<Instruction>(NAME # _IMM) PPR:$gp, ZPR:$ptrs, imm_ty:$index)>;
}
// bit lsl is '0' if the offsets are extended (uxtw/sxtw), '1' if shifted (lsl)
@@ -6305,14 +6914,27 @@ class sve_mem_64b_prfm_sv<bits<2> msz, bit xs, bit lsl, string asm,
multiclass sve_mem_64b_prfm_sv_ext_scaled<bits<2> msz, string asm,
RegisterOperand sxtw_opnd,
- RegisterOperand uxtw_opnd> {
+ RegisterOperand uxtw_opnd,
+ PatFrag op_sxtw,
+ PatFrag op_uxtw> {
def _UXTW_SCALED : sve_mem_64b_prfm_sv<msz, 0, 0, asm, uxtw_opnd>;
def _SXTW_SCALED : sve_mem_64b_prfm_sv<msz, 1, 0, asm, sxtw_opnd>;
+
+ def : Pat<(op_uxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 uxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _UXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, uxtw_opnd:$Zm)>;
+
+ def : Pat<(op_sxtw (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 sxtw_opnd:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME # _SXTW_SCALED) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, sxtw_opnd:$Zm)>;
+
}
multiclass sve_mem_64b_prfm_sv_lsl_scaled<bits<2> msz, string asm,
- RegisterOperand zprext> {
+ RegisterOperand zprext, PatFrag frag> {
def NAME : sve_mem_64b_prfm_sv<msz, 1, 1, asm, zprext>;
+
+ def : Pat<(frag (nxv2i1 PPR3bAny:$Pg), (i64 GPR64sp:$Rn), (nxv2i64 zprext:$Zm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, GPR64sp:$Rn, zprext:$Zm)>;
+
}
@@ -6338,13 +6960,15 @@ class sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty>
let hasSideEffects = 1;
}
-multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty> {
+multiclass sve_mem_64b_prfm_vi<bits<2> msz, string asm, Operand imm_ty, SDPatternOperator op> {
def NAME : sve_mem_64b_prfm_vi<msz, asm, imm_ty>;
def : InstAlias<asm # "\t$prfop, $Pg, [$Zn]",
(!cast<Instruction>(NAME) sve_prfop:$prfop, PPR3bAny:$Pg, ZPR64:$Zn, 0), 1>;
-}
+ def : Pat<(op (nxv2i1 PPR_3b:$Pg), (nxv2i64 ZPR32:$Zn), (i64 imm_ty:$imm), (i32 sve_prfop:$prfop)),
+ (!cast<Instruction>(NAME) sve_prfop:$prfop, PPR_3b:$Pg, ZPR32:$Zn, imm_ty:$imm)>;
+}
//===----------------------------------------------------------------------===//
// SVE Compute Vector Address Group
@@ -6600,6 +7224,12 @@ class sve_int_brkp<bits<2> opc, string asm>
let Defs = !if(!eq (opc{1}, 1), [NZCV], []);
}
+multiclass sve_int_brkp<bits<2> opc, string asm, SDPatternOperator op> {
+ def NAME : sve_int_brkp<opc, asm>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
+}
+
//===----------------------------------------------------------------------===//
// SVE Partition Break Group
@@ -6626,6 +7256,12 @@ class sve_int_brkn<bit S, string asm>
let Defs = !if(!eq (S, 0b1), [NZCV], []);
}
+multiclass sve_int_brkn<bits<1> opc, string asm, SDPatternOperator op> {
+ def NAME : sve_int_brkn<opc, asm>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
+}
+
class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
: I<(outs PPR8:$Pd), iops,
asm, "\t$Pd, $Pg"#suffix#", $Pn",
@@ -6648,12 +7284,16 @@ class sve_int_break<bits<3> opc, string asm, string suffix, dag iops>
}
-multiclass sve_int_break_m<bits<3> opc, string asm> {
+multiclass sve_int_break_m<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/m", (ins PPR8:$_Pd, PPRAny:$Pg, PPR8:$Pn)>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
-multiclass sve_int_break_z<bits<3> opc, string asm> {
+multiclass sve_int_break_z<bits<3> opc, string asm, SDPatternOperator op> {
def NAME : sve_int_break<opc, asm, "/z", (ins PPRAny:$Pg, PPR8:$Pn)>;
+
+ def : SVE_2_Op_Pat<nxv16i1, op, nxv16i1, nxv16i1, !cast<Instruction>(NAME)>;
}
//===----------------------------------------------------------------------===//
@@ -6683,20 +7323,23 @@ class sve2_char_match<bit sz, bit opc, string asm,
let Defs = [NZCV];
}
-multiclass sve2_char_match<bit opc, string asm> {
+multiclass sve2_char_match<bit opc, string asm, SDPatternOperator op> {
def _B : sve2_char_match<0b0, opc, asm, PPR8, ZPR8>;
def _H : sve2_char_match<0b1, opc, asm, PPR16, ZPR16>;
+
+ def : SVE_3_Op_Pat<nxv16i1, op, nxv16i1, nxv16i8, nxv16i8, !cast<Instruction>(NAME # _B)>;
+ def : SVE_3_Op_Pat<nxv8i1, op, nxv8i1, nxv8i16, nxv8i16, !cast<Instruction>(NAME # _H)>;
}
//===----------------------------------------------------------------------===//
// SVE2 Histogram Computation - Segment Group
//===----------------------------------------------------------------------===//
-class sve2_hist_gen_segment<string asm>
+class sve2_hist_gen_segment<string asm, SDPatternOperator op>
: I<(outs ZPR8:$Zd), (ins ZPR8:$Zn, ZPR8:$Zm),
asm, "\t$Zd, $Zn, $Zm",
"",
- []>, Sched<[]> {
+ [(set nxv16i8:$Zd, (op nxv16i8:$Zn, nxv16i8:$Zm))]>, Sched<[]> {
bits<5> Zd;
bits<5> Zn;
bits<5> Zm;
@@ -6730,9 +7373,12 @@ class sve2_hist_gen_vector<bit sz, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
-multiclass sve2_hist_gen_vector<string asm> {
+multiclass sve2_hist_gen_vector<string asm, SDPatternOperator op> {
def _S : sve2_hist_gen_vector<0b0, asm, ZPR32>;
def _D : sve2_hist_gen_vector<0b1, asm, ZPR64>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Instruction>(NAME # _S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Instruction>(NAME # _D)>;
}
//===----------------------------------------------------------------------===//
@@ -6755,6 +7401,12 @@ class sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty>
let Inst{4-0} = Zd;
}
+multiclass sve2_crypto_cons_bin_op<bit opc, string asm, ZPRRegOp zprty,
+ SDPatternOperator op, ValueType vt> {
+ def NAME : sve2_crypto_cons_bin_op<opc, asm, zprty>;
+ def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
+}
+
class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
: I<(outs zprty:$Zdn), (ins zprty:$_Zdn, zprty:$Zm),
asm, "\t$Zdn, $_Zdn, $Zm",
@@ -6772,8 +7424,14 @@ class sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty>
let Constraints = "$Zdn = $_Zdn";
}
-class sve2_crypto_unary_op<bit opc, string asm>
-: I<(outs ZPR8:$Zdn), (ins ZPR8:$_Zdn),
+multiclass sve2_crypto_des_bin_op<bits<2> opc, string asm, ZPRRegOp zprty,
+ SDPatternOperator op, ValueType vt> {
+ def NAME : sve2_crypto_des_bin_op<opc, asm, zprty>;
+ def : SVE_2_Op_Pat<vt, op, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+class sve2_crypto_unary_op<bit opc, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zdn), (ins zprty:$_Zdn),
asm, "\t$Zdn, $_Zdn",
"",
[]>, Sched<[]> {
@@ -6785,3 +7443,389 @@ class sve2_crypto_unary_op<bit opc, string asm>
let Constraints = "$Zdn = $_Zdn";
}
+
+multiclass sve2_crypto_unary_op<bit opc, string asm, SDPatternOperator op> {
+ def NAME : sve2_crypto_unary_op<opc, asm, ZPR8>;
+ def : SVE_1_Op_Pat<nxv16i8, op, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE BFloat16 Group
+//===----------------------------------------------------------------------===//
+
+class sve_bfloat_dot_base<bits<2> opc, string asm, string ops, dag iops>
+: I<(outs ZPR32:$Zda), iops, asm, ops, "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-21} = 0b01100100011;
+ let Inst{15-14} = opc;
+ let Inst{13-10} = 0b0000;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ElementSizeH;
+}
+
+class sve_bfloat_dot<string asm>
+: sve_bfloat_dot_base<0b10, asm, "\t$Zda, $Zn, $Zm",
+ (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm)> {
+ bits<5> Zm;
+ let Inst{20-16} = Zm;
+}
+
+multiclass sve_bfloat_dot<string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_dot<asm>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_dot_indexed<string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+ (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexS:$iop)> {
+ bits<2> iop;
+ bits<3> Zm;
+ let Inst{20-19} = iop;
+ let Inst{18-16} = Zm;
+}
+
+multiclass sve_bfloat_dot_indexed<string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_dot_indexed<asm>;
+ def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexS_timm, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR16:$Zm),
+ asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zm;
+ bits<5> Zda;
+ bits<5> Zn;
+ let Inst{31-21} = 0b01100100011;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b111001;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ElementSizeH;
+}
+
+multiclass sve_bfloat_matmul<string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_matmul<asm>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul_longvecl<bit BT, string asm>
+: sve_bfloat_matmul<asm> {
+ let Inst{23} = 0b1;
+ let Inst{14-13} = 0b00;
+ let Inst{10} = BT;
+}
+
+multiclass sve_bfloat_matmul_longvecl<bit BT, string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_matmul_longvecl<BT, asm>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_matmul_longvecl_idx<bit BT, string asm>
+: sve_bfloat_dot_base<0b01, asm, "\t$Zda, $Zn, $Zm$iop",
+ (ins ZPR32:$_Zda, ZPR16:$Zn, ZPR3b16:$Zm, VectorIndexH:$iop)> {
+ bits<3> iop;
+ bits<3> Zm;
+ let Inst{23} = 0b1;
+ let Inst{20-19} = iop{2-1};
+ let Inst{18-16} = Zm;
+ let Inst{11} = iop{0};
+ let Inst{10} = BT;
+}
+
+multiclass sve_bfloat_matmul_longvecl_idx<bit BT, string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_matmul_longvecl_idx<BT, asm>;
+ def : SVE_4_Op_Imm_Pat<nxv4f32, op, nxv4f32, nxv8bf16, nxv8bf16, i64, VectorIndexH_timm, !cast<Instruction>(NAME)>;
+}
+
+class sve_bfloat_convert<bit N, string asm>
+: I<(outs ZPR16:$Zd), (ins ZPR16:$_Zd, PPR3bAny:$Pg, ZPR32:$Zn),
+ asm, "\t$Zd, $Pg/m, $Zn", "", []>, Sched<[]> {
+ bits<5> Zd;
+ bits<3> Pg;
+ bits<5> Zn;
+ let Inst{31-25} = 0b0110010;
+ let Inst{24} = N;
+ let Inst{23-13} = 0b10001010101;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+
+ let Constraints = "$Zd = $_Zd";
+ let DestructiveInstType = DestructiveOther;
+ let hasSideEffects = 1;
+ let ElementSize = ElementSizeS;
+}
+
+multiclass sve_bfloat_convert<bit N, string asm, SDPatternOperator op> {
+ def NAME : sve_bfloat_convert<N, asm>;
+ def : SVE_3_Op_Pat<nxv8bf16, op, nxv8bf16, nxv8i1, nxv4f32, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Matrix Multiply Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_matmul<bits<2> uns, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
+ "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-24} = 0b01000101;
+ let Inst{23-22} = uns;
+ let Inst{21} = 0;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b100110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_matmul<bits<2> uns, string asm, SDPatternOperator op> {
+ def NAME : sve_int_matmul<uns, asm>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Mixed Sign Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dot_mixed<string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR8:$Zm), asm,
+ "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-21} = 0b01000100100;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b011110;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_dot_mixed<string asm, SDPatternOperator op> {
+ def NAME : sve_int_dot_mixed<asm>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Integer Dot Product Mixed Sign - Indexed Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_dot_mixed_indexed<bit U, string asm>
+: I<(outs ZPR32:$Zda), (ins ZPR32:$_Zda, ZPR8:$Zn, ZPR3b8:$Zm, VectorIndexS32b:$idx),
+ asm, "\t$Zda, $Zn, $Zm$idx", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<3> Zm;
+ bits<2> idx;
+ let Inst{31-21} = 0b01000100101;
+ let Inst{20-19} = idx;
+ let Inst{18-16} = Zm;
+ let Inst{15-11} = 0b00011;
+ let Inst{10} = U;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = ZPR32.ElementSize;
+}
+
+multiclass sve_int_dot_mixed_indexed<bit U, string asm, SDPatternOperator op> {
+ def NAME : sve_int_dot_mixed_indexed<U, asm>;
+
+ def : SVE_4_Op_Imm_Pat<nxv4i32, op, nxv4i32, nxv16i8, nxv16i8, i32, VectorIndexS32b_timm, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Floating Point Matrix Multiply Accumulate Group
+//===----------------------------------------------------------------------===//
+
+class sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty>
+: I<(outs zprty:$Zda), (ins zprty:$_Zda, zprty:$Zn, zprty:$Zm),
+ asm, "\t$Zda, $Zn, $Zm", "", []>, Sched<[]> {
+ bits<5> Zda;
+ bits<5> Zn;
+ bits<5> Zm;
+ let Inst{31-23} = 0b011001001;
+ let Inst{22} = sz;
+ let Inst{21} = 1;
+ let Inst{20-16} = Zm;
+ let Inst{15-10} = 0b111001;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zda;
+
+ let Constraints = "$Zda = $_Zda";
+ let DestructiveInstType = DestructiveOther;
+ let ElementSize = zprty.ElementSize;
+}
+
+multiclass sve_fp_matrix_mla<bit sz, string asm, ZPRRegOp zprty, SDPatternOperator op, ValueType vt> {
+ def NAME : sve_fp_matrix_mla<sz, asm, zprty>;
+
+ def : SVE_3_Op_Pat<vt, op, vt, vt, vt, !cast<Instruction>(NAME)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Memory - Contiguous Load And Replicate 256-bit Group
+//===----------------------------------------------------------------------===//
+
+class sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand VecList>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4),
+ asm, "\t$Zt, $Pg/z, [$Rn, $imm4]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<5> Rn;
+ bits<3> Pg;
+ bits<4> imm4;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-20} = 0b010;
+ let Inst{19-16} = imm4;
+ let Inst{15-13} = 0b001;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldor_si<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, ValueType Ty, ValueType PredTy, SDNode Ld1ro> {
+ def NAME : sve_mem_ldor_si<sz, asm, listty>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) listty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 1>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, 0), 0>;
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $imm4]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, simm4s32:$imm4), 0>;
+
+ // Base addressing mode
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), GPR64sp:$base)),
+ (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, (i64 0))>;
+
+}
+
+class sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand VecList,
+ RegisterOperand gprty>
+: I<(outs VecList:$Zt), (ins PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm),
+ asm, "\t$Zt, $Pg/z, [$Rn, $Rm]", "", []>, Sched<[]> {
+ bits<5> Zt;
+ bits<3> Pg;
+ bits<5> Rn;
+ bits<5> Rm;
+ let Inst{31-25} = 0b1010010;
+ let Inst{24-23} = sz;
+ let Inst{22-21} = 0b01;
+ let Inst{20-16} = Rm;
+ let Inst{15-13} = 0;
+ let Inst{12-10} = Pg;
+ let Inst{9-5} = Rn;
+ let Inst{4-0} = Zt;
+
+ let mayLoad = 1;
+}
+
+multiclass sve_mem_ldor_ss<bits<2> sz, string asm, RegisterOperand listty,
+ ZPRRegOp zprty, RegisterOperand gprty, ValueType Ty,
+ ValueType PredTy, SDNode Ld1ro, ComplexPattern AddrCP> {
+ def NAME : sve_mem_ldor_ss<sz, asm, listty, gprty>;
+
+ def : InstAlias<asm # "\t$Zt, $Pg/z, [$Rn, $Rm]",
+ (!cast<Instruction>(NAME) zprty:$Zt, PPR3bAny:$Pg, GPR64sp:$Rn, gprty:$Rm), 0>;
+
+ def : Pat<(Ty (Ld1ro (PredTy PPR3bAny:$gp), (AddrCP GPR64sp:$base, gprty:$offset))),
+ (!cast<Instruction>(NAME) PPR3bAny:$gp, GPR64sp:$base, gprty:$offset)>;
+}
+
+//===----------------------------------------------------------------------===//
+// SVE Interleave 128-bit Elements Group
+//===----------------------------------------------------------------------===//
+
+class sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm>
+: I<(outs ZPR128:$Zd), (ins ZPR128:$Zn, ZPR128:$Zm),
+ asm, "\t$Zd, $Zn, $Zm",
+ "",
+ []>, Sched<[]> {
+ bits<5> Zd;
+ bits<5> Zm;
+ bits<5> Zn;
+ let Inst{31-21} = 0b00000101101;
+ let Inst{20-16} = Zm;
+ let Inst{15-13} = 0b000;
+ let Inst{12-11} = opc;
+ let Inst{10} = P;
+ let Inst{9-5} = Zn;
+ let Inst{4-0} = Zd;
+}
+
+multiclass sve_int_perm_bin_perm_128_zz<bits<2> opc, bit P, string asm, SDPatternOperator op> {
+ def NAME : sve_int_perm_bin_perm_128_zz<opc, P, asm>;
+
+ def : SVE_2_Op_Pat<nxv16i8, op, nxv16i8, nxv16i8, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv8i16, op, nxv8i16, nxv8i16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv8f16, op, nxv8f16, nxv8f16, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv4i32, op, nxv4i32, nxv4i32, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv4f32, op, nxv4f32, nxv4f32, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv2i64, op, nxv2i64, nxv2i64, !cast<Instruction>(NAME)>;
+ def : SVE_2_Op_Pat<nxv2f64, op, nxv2f64, nxv2f64, !cast<Instruction>(NAME)>;
+}
+
+/// Addressing modes
+def am_sve_indexed_s4 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-8,7>", [], [SDNPWantRoot]>;
+def am_sve_indexed_s6 : ComplexPattern<i64, 2, "SelectAddrModeIndexedSVE<-32,31>", [], [SDNPWantRoot]>;
+
+def am_sve_regreg_lsl0 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<0>", []>;
+def am_sve_regreg_lsl1 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<1>", []>;
+def am_sve_regreg_lsl2 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<2>", []>;
+def am_sve_regreg_lsl3 : ComplexPattern<i64, 2, "SelectSVERegRegAddrMode<3>", []>;
+
+// Predicated pseudo floating point two operand instructions.
+multiclass sve_fp_bin_pred_hfd<SDPatternOperator op> {
+ def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_3_Op_Pat<nxv8f16, op, nxv8i1, nxv8f16, nxv8f16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_3_Op_Pat<nxv4f32, op, nxv4i1, nxv4f32, nxv4f32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2f64, op, nxv2i1, nxv2f64, nxv2f64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// Predicated pseudo integer two operand instructions.
+multiclass sve_int_bin_pred_bhsd<SDPatternOperator op> {
+ def _UNDEF_B : PredTwoOpPseudo<NAME # _B, ZPR8, FalseLanesUndef>;
+ def _UNDEF_H : PredTwoOpPseudo<NAME # _H, ZPR16, FalseLanesUndef>;
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_3_Op_Pat<nxv16i8, op, nxv16i1, nxv16i8, nxv16i8, !cast<Pseudo>(NAME # _UNDEF_B)>;
+ def : SVE_3_Op_Pat<nxv8i16, op, nxv8i1, nxv8i16, nxv8i16, !cast<Pseudo>(NAME # _UNDEF_H)>;
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
+
+// As sve_int_bin_pred but when only i32 and i64 vector types are required.
+multiclass sve_int_bin_pred_sd<SDPatternOperator op> {
+ def _UNDEF_S : PredTwoOpPseudo<NAME # _S, ZPR32, FalseLanesUndef>;
+ def _UNDEF_D : PredTwoOpPseudo<NAME # _D, ZPR64, FalseLanesUndef>;
+
+ def : SVE_3_Op_Pat<nxv4i32, op, nxv4i1, nxv4i32, nxv4i32, !cast<Pseudo>(NAME # _UNDEF_S)>;
+ def : SVE_3_Op_Pat<nxv2i64, op, nxv2i1, nxv2i64, nxv2i64, !cast<Pseudo>(NAME # _UNDEF_D)>;
+}
diff --git a/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
new file mode 100644
index 000000000000..74fe0cdd1ea7
--- /dev/null
+++ b/llvm/lib/Target/AArch64/SVEIntrinsicOpts.cpp
@@ -0,0 +1,265 @@
+//===----- SVEIntrinsicOpts - SVE ACLE Intrinsics Opts --------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// Performs general IR level optimizations on SVE intrinsics.
+//
+// The main goal of this pass is to remove unnecessary reinterpret
+// intrinsics (llvm.aarch64.sve.convert.[to|from].svbool), e.g.:
+//
+// %1 = @llvm.aarch64.sve.convert.to.svbool.nxv4i1(<vscale x 4 x i1> %a)
+// %2 = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %1)
+//
+// This pass also looks for ptest intrinsics & phi instructions where the
+// operands are being needlessly converted to and from svbool_t.
+//
+//===----------------------------------------------------------------------===//
+
+#include "Utils/AArch64BaseInfo.h"
+#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
+#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "sve-intrinsic-opts"
+
+namespace llvm {
+void initializeSVEIntrinsicOptsPass(PassRegistry &);
+}
+
+namespace {
+struct SVEIntrinsicOpts : public ModulePass {
+ static char ID; // Pass identification, replacement for typeid
+ SVEIntrinsicOpts() : ModulePass(ID) {
+ initializeSVEIntrinsicOptsPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+ static IntrinsicInst *isReinterpretToSVBool(Value *V);
+
+ static bool optimizeIntrinsic(Instruction *I);
+
+ bool optimizeFunctions(SmallSetVector<Function *, 4> &Functions);
+
+ static bool optimizeConvertFromSVBool(IntrinsicInst *I);
+ static bool optimizePTest(IntrinsicInst *I);
+
+ static bool processPhiNode(IntrinsicInst *I);
+};
+} // end anonymous namespace
+
+void SVEIntrinsicOpts::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.setPreservesCFG();
+}
+
+char SVEIntrinsicOpts::ID = 0;
+static const char *name = "SVE intrinsics optimizations";
+INITIALIZE_PASS_BEGIN(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass);
+INITIALIZE_PASS_END(SVEIntrinsicOpts, DEBUG_TYPE, name, false, false)
+
+namespace llvm {
+ModulePass *createSVEIntrinsicOptsPass() { return new SVEIntrinsicOpts(); }
+} // namespace llvm
+
+/// Returns V if it is a call to llvm.aarch64.sve.convert.to.svbool, i.e. a
+/// cast to <vscale x 16 x i1> (aka svbool_t), nullptr otherwise.
+IntrinsicInst *SVEIntrinsicOpts::isReinterpretToSVBool(Value *V) {
+ IntrinsicInst *I = dyn_cast<IntrinsicInst>(V);
+ if (!I)
+ return nullptr;
+
+ if (I->getIntrinsicID() != Intrinsic::aarch64_sve_convert_to_svbool)
+ return nullptr;
+
+ return I;
+}
+
+/// Removes redundant reinterpret casts in the presence of control flow, i.e.
+/// when the operand of a convert.from.svbool is a PHI node whose incoming
+/// values are all convert.to.svbool casts of the required type.
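+///
+/// For example (illustrative IR only; the block and value names below are
+/// hypothetical):
+///
+///   %x = phi <vscale x 16 x i1> [ %a.wide, %bb1 ], [ %b.wide, %bb2 ]
+///   %p = @llvm.aarch64.sve.convert.from.svbool.nxv4i1(<vscale x 16 x i1> %x)
+///
+/// where %a.wide and %b.wide are convert.to.svbool casts of <vscale x 4 x i1>
+/// values, is rewritten to a PHI over those <vscale x 4 x i1> values directly.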
+bool SVEIntrinsicOpts::processPhiNode(IntrinsicInst *X) {
+
+ SmallVector<Instruction *, 32> Worklist;
+ auto RequiredType = X->getType();
+
+ auto *PN = dyn_cast<PHINode>(X->getArgOperand(0));
+ assert(PN && "Expected Phi Node!");
+
+ // Don't create a new Phi unless we can remove the old one.
+ if (!PN->hasOneUse())
+ return false;
+
+ for (Value *IncValPhi : PN->incoming_values()) {
+ auto *Reinterpret = isReinterpretToSVBool(IncValPhi);
+ if (!Reinterpret ||
+ RequiredType != Reinterpret->getArgOperand(0)->getType())
+ return false;
+ }
+
+ // Create the new Phi
+ LLVMContext &Ctx = PN->getContext();
+ IRBuilder<> Builder(Ctx);
+ Builder.SetInsertPoint(PN);
+ PHINode *NPN = Builder.CreatePHI(RequiredType, PN->getNumIncomingValues());
+ Worklist.push_back(PN);
+
+ for (unsigned I = 0; I < PN->getNumIncomingValues(); I++) {
+ auto *Reinterpret = cast<Instruction>(PN->getIncomingValue(I));
+ NPN->addIncoming(Reinterpret->getOperand(0), PN->getIncomingBlock(I));
+ Worklist.push_back(Reinterpret);
+ }
+
+ // Cleanup Phi Node and reinterprets
+ X->replaceAllUsesWith(NPN);
+ X->eraseFromParent();
+
+ for (auto &I : Worklist)
+ if (I->use_empty())
+ I->eraseFromParent();
+
+ return true;
+}
+
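+/// Fold a ptest intrinsic whose two operands are convert.to.svbool casts of
+/// predicates of the same type into a ptest on those predicates directly,
+/// erasing the casts once they become unused.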
+bool SVEIntrinsicOpts::optimizePTest(IntrinsicInst *I) {
+ IntrinsicInst *Op1 = dyn_cast<IntrinsicInst>(I->getArgOperand(0));
+ IntrinsicInst *Op2 = dyn_cast<IntrinsicInst>(I->getArgOperand(1));
+
+ if (Op1 && Op2 &&
+ Op1->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op2->getIntrinsicID() == Intrinsic::aarch64_sve_convert_to_svbool &&
+ Op1->getArgOperand(0)->getType() == Op2->getArgOperand(0)->getType()) {
+
+ Value *Ops[] = {Op1->getArgOperand(0), Op2->getArgOperand(0)};
+ Type *Tys[] = {Op1->getArgOperand(0)->getType()};
+ Module *M = I->getParent()->getParent()->getParent();
+
+ auto Fn = Intrinsic::getDeclaration(M, I->getIntrinsicID(), Tys);
+ auto CI = CallInst::Create(Fn, Ops, I->getName(), I);
+
+ I->replaceAllUsesWith(CI);
+ I->eraseFromParent();
+ if (Op1->use_empty())
+ Op1->eraseFromParent();
+ if (Op2->use_empty())
+ Op2->eraseFromParent();
+
+ return true;
+ }
+
+ return false;
+}
+
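+/// Simplify a convert.from.svbool whose operand is either a PHI of
+/// convert.to.svbool casts (see processPhiNode) or a convert.to.svbool of a
+/// value that already has the required type.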
+bool SVEIntrinsicOpts::optimizeConvertFromSVBool(IntrinsicInst *I) {
+ assert(I->getIntrinsicID() == Intrinsic::aarch64_sve_convert_from_svbool &&
+ "Unexpected opcode");
+
+ // If the reinterpret instruction operand is a PHI Node
+ if (isa<PHINode>(I->getArgOperand(0)))
+ return processPhiNode(I);
+
+ // If we have a reinterpret intrinsic I of type A which is converting from
+ // another reinterpret Y of type B, and the source type of Y is A, then we can
+ // elide away both reinterprets if there are no other users of Y.
+ auto *Y = isReinterpretToSVBool(I->getArgOperand(0));
+ if (!Y)
+ return false;
+
+ Value *SourceVal = Y->getArgOperand(0);
+ if (I->getType() != SourceVal->getType())
+ return false;
+
+ I->replaceAllUsesWith(SourceVal);
+ I->eraseFromParent();
+ if (Y->use_empty())
+ Y->eraseFromParent();
+
+ return true;
+}
+
+bool SVEIntrinsicOpts::optimizeIntrinsic(Instruction *I) {
+ IntrinsicInst *IntrI = dyn_cast<IntrinsicInst>(I);
+ if (!IntrI)
+ return false;
+
+ switch (IntrI->getIntrinsicID()) {
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ return optimizeConvertFromSVBool(IntrI);
+ case Intrinsic::aarch64_sve_ptest_any:
+ case Intrinsic::aarch64_sve_ptest_first:
+ case Intrinsic::aarch64_sve_ptest_last:
+ return optimizePTest(IntrI);
+ default:
+ return false;
+ }
+
+ return true;
+}
+
+bool SVEIntrinsicOpts::optimizeFunctions(
+ SmallSetVector<Function *, 4> &Functions) {
+ bool Changed = false;
+ for (auto *F : Functions) {
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>(*F).getDomTree();
+
+ // Traverse the DT with an rpo walk so we see defs before uses, allowing
+ // simplification to be done incrementally.
+ BasicBlock *Root = DT->getRoot();
+ ReversePostOrderTraversal<BasicBlock *> RPOT(Root);
+ for (auto *BB : RPOT)
+ for (Instruction &I : make_early_inc_range(*BB))
+ Changed |= optimizeIntrinsic(&I);
+ }
+ return Changed;
+}
+
+bool SVEIntrinsicOpts::runOnModule(Module &M) {
+ bool Changed = false;
+ SmallSetVector<Function *, 4> Functions;
+
+ // Check for SVE intrinsic declarations first so that we only iterate over
+ // relevant functions. Where an appropriate declaration is found, store the
+ // function(s) where it is used so we can target these only.
+ for (auto &F : M.getFunctionList()) {
+ if (!F.isDeclaration())
+ continue;
+
+ switch (F.getIntrinsicID()) {
+ case Intrinsic::aarch64_sve_convert_from_svbool:
+ case Intrinsic::aarch64_sve_ptest_any:
+ case Intrinsic::aarch64_sve_ptest_first:
+ case Intrinsic::aarch64_sve_ptest_last:
+ for (auto I = F.user_begin(), E = F.user_end(); I != E;) {
+ auto *Inst = dyn_cast<Instruction>(*I++);
+ Functions.insert(Inst->getFunction());
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!Functions.empty())
+ Changed |= optimizeFunctions(Functions);
+
+ return Changed;
+}
diff --git a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
index 87980cddb7c0..4e289fbe2325 100644
--- a/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
+++ b/llvm/lib/Target/AArch64/Utils/AArch64BaseInfo.h
@@ -658,6 +658,7 @@ namespace AArch64 {
// in index i*P of a <n x (M*P) x t> vector. The other elements of the
// <n x (M*P) x t> vector (such as index 1) are undefined.
static constexpr unsigned SVEBitsPerBlock = 128;
+static constexpr unsigned SVEMaxBitsPerVector = 2048;
const unsigned NeonBitsPerVector = 128;
} // end namespace AArch64
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
index fbed51de0ea4..88c79665be60 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -10,15 +10,16 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPU_H
-#include "llvm/Target/TargetMachine.h"
#include "llvm/IR/IntrinsicsR600.h" // TODO: Sink this.
#include "llvm/IR/IntrinsicsAMDGPU.h" // TODO: Sink this.
+#include "llvm/Support/CodeGen.h"
namespace llvm {
class AMDGPUTargetMachine;
class FunctionPass;
class GCNTargetMachine;
+class ImmutablePass;
class ModulePass;
class Pass;
class Target;
@@ -27,6 +28,14 @@ class TargetOptions;
class PassRegistry;
class Module;
+// GlobalISel passes
+void initializeAMDGPUPreLegalizerCombinerPass(PassRegistry &);
+FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone);
+void initializeAMDGPUPostLegalizerCombinerPass(PassRegistry &);
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone);
+FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone);
+void initializeAMDGPURegBankCombinerPass(PassRegistry &);
+
// R600 Passes
FunctionPass *createR600VectorRegMerger();
FunctionPass *createR600ExpandSpecialInstrsPass();
@@ -55,8 +64,9 @@ FunctionPass *createSIMemoryLegalizerPass();
FunctionPass *createSIInsertWaitcntsPass();
FunctionPass *createSIPreAllocateWWMRegsPass();
FunctionPass *createSIFormMemoryClausesPass();
-FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetOptions &,
- const TargetMachine *);
+
+FunctionPass *createSIPostRABundlerPass();
+FunctionPass *createAMDGPUSimplifyLibCallsPass(const TargetMachine *);
FunctionPass *createAMDGPUUseNativeCallsPass();
FunctionPass *createAMDGPUCodeGenPreparePass();
FunctionPass *createAMDGPUMachineCFGStructurizerPass();
@@ -159,6 +169,9 @@ extern char &SILowerControlFlowID;
void initializeSIRemoveShortExecBranchesPass(PassRegistry &);
extern char &SIRemoveShortExecBranchesID;
+void initializeSIPreEmitPeepholePass(PassRegistry &);
+extern char &SIPreEmitPeepholeID;
+
void initializeSIInsertSkipsPass(PassRegistry &);
extern char &SIInsertSkipsPassID;
@@ -185,6 +198,10 @@ FunctionPass *createAMDGPUPromoteAlloca();
void initializeAMDGPUPromoteAllocaPass(PassRegistry&);
extern char &AMDGPUPromoteAllocaID;
+FunctionPass *createAMDGPUPromoteAllocaToVector();
+void initializeAMDGPUPromoteAllocaToVectorPass(PassRegistry&);
+extern char &AMDGPUPromoteAllocaToVectorID;
+
Pass *createAMDGPUStructurizeCFGPass();
FunctionPass *createAMDGPUISelDag(
TargetMachine *TM = nullptr,
@@ -219,12 +236,18 @@ extern char &SIMemoryLegalizerID;
void initializeSIModeRegisterPass(PassRegistry&);
extern char &SIModeRegisterID;
+void initializeSIInsertHardClausesPass(PassRegistry &);
+extern char &SIInsertHardClausesID;
+
void initializeSIInsertWaitcntsPass(PassRegistry&);
extern char &SIInsertWaitcntsID;
void initializeSIFormMemoryClausesPass(PassRegistry&);
extern char &SIFormMemoryClausesID;
+void initializeSIPostRABundlerPass(PassRegistry&);
+extern char &SIPostRABundlerID;
+
void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
extern char &AMDGPUUnifyDivergentExitNodesID;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 42b477e07b3b..e32f0fcc4771 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -33,6 +33,12 @@ def FeatureFastFMAF32 : SubtargetFeature<"fast-fmaf",
"Assuming f32 fma is at least as fast as mul + add"
>;
+def FeatureFastDenormalF32 : SubtargetFeature<"fast-denormal-f32",
+ "FastDenormalF32",
+ "true",
+ "Enabling denormals does not cause f32 instructions to run at f64 rates"
+>;
+
def FeatureMIMG_R128 : SubtargetFeature<"mimg-r128",
"MIMG_R128",
"true",
@@ -254,6 +260,12 @@ def FeatureGFX10Insts : SubtargetFeature<"gfx10-insts",
"Additional instructions for GFX10+"
>;
+def FeatureGFX10_3Insts : SubtargetFeature<"gfx10-3-insts",
+ "GFX10_3Insts",
+ "true",
+ "Additional instructions for GFX10.3"
+>;
+
def FeatureGFX7GFX8GFX9Insts : SubtargetFeature<"gfx7-gfx8-gfx9-insts",
"GFX7GFX8GFX9Insts",
"true",
@@ -360,7 +372,19 @@ def FeatureDPP8 : SubtargetFeature<"dpp8",
def FeatureR128A16 : SubtargetFeature<"r128-a16",
"HasR128A16",
"true",
- "Support 16 bit coordindates/gradients/lod/clamp/mip types on gfx9"
+ "Support gfx9-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands, where a16 is aliased with r128"
+>;
+
+def FeatureGFX10A16 : SubtargetFeature<"a16",
+ "HasGFX10A16",
+ "true",
+ "Support gfx10-style A16 for 16-bit coordinates/gradients/lod/clamp/mip image operands"
+>;
+
+def FeatureG16 : SubtargetFeature<"g16",
+ "HasG16",
+ "true",
+ "Support G16 for 16-bit gradient image operands"
>;
def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
@@ -369,6 +393,12 @@ def FeatureNSAEncoding : SubtargetFeature<"nsa-encoding",
"Support NSA encoding for image instructions"
>;
+def FeatureGFX10_BEncoding : SubtargetFeature<"gfx10_b-encoding",
+ "GFX10_BEncoding",
+ "true",
+ "Encoding format GFX10_B"
+>;
+
def FeatureIntClamp : SubtargetFeature<"int-clamp-insts",
"HasIntClamp",
"true",
@@ -439,7 +469,8 @@ def FeatureAtomicFaddInsts : SubtargetFeature<"atomic-fadd-insts",
"HasAtomicFaddInsts",
"true",
"Has buffer_atomic_add_f32, buffer_atomic_pk_add_f16, global_atomic_add_f32, "
- "global_atomic_pk_add_f16 instructions"
+ "global_atomic_pk_add_f16 instructions",
+ [FeatureFlatGlobalInsts]
>;
def FeatureDoesNotSupportSRAMECC : SubtargetFeature<"no-sram-ecc-support",
@@ -466,6 +497,30 @@ def FeatureVscnt : SubtargetFeature<"vscnt",
"Has separate store vscnt counter"
>;
+def FeatureGetWaveIdInst : SubtargetFeature<"get-wave-id-inst",
+ "HasGetWaveIdInst",
+ "true",
+ "Has s_get_waveid_in_workgroup instruction"
+>;
+
+def FeatureSMemTimeInst : SubtargetFeature<"s-memtime-inst",
+ "HasSMemTimeInst",
+ "true",
+ "Has s_memtime instruction"
+>;
+
+def FeatureMadMacF32Insts : SubtargetFeature<"mad-mac-f32-insts",
+ "HasMadMacF32Insts",
+ "true",
+ "Has v_mad_f32/v_mac_f32/v_madak_f32/v_madmk_f32 instructions"
+>;
+
+def FeatureDsSrc2Insts : SubtargetFeature<"ds-src2-insts",
+ "HasDsSrc2Insts",
+ "true",
+ "Has ds_*_src2 instructions"
+>;
+
def FeatureRegisterBanking : SubtargetFeature<"register-banking",
"HasRegisterBanking",
"true",
@@ -488,36 +543,6 @@ def FeatureNoDataDepHazard : SubtargetFeature<"no-data-dep-hazard",
// Subtarget Features (options and debugging)
//===------------------------------------------------------------===//
-// Denormal handling for fp64 and fp16 is controlled by the same
-// config register when fp16 supported.
-// TODO: Do we need a separate f16 setting when not legal?
-def FeatureFP64FP16Denormals : SubtargetFeature<"fp64-fp16-denormals",
- "FP64FP16Denormals",
- "true",
- "Enable double and half precision denormal handling",
- [FeatureFP64]
->;
-
-def FeatureFP64Denormals : SubtargetFeature<"fp64-denormals",
- "FP64FP16Denormals",
- "true",
- "Enable double and half precision denormal handling",
- [FeatureFP64, FeatureFP64FP16Denormals]
->;
-
-def FeatureFP16Denormals : SubtargetFeature<"fp16-denormals",
- "FP64FP16Denormals",
- "true",
- "Enable half precision denormal handling",
- [FeatureFP64FP16Denormals]
->;
-
-def FeatureFPExceptions : SubtargetFeature<"fp-exceptions",
- "FPExceptions",
- "true",
- "Enable floating point exceptions"
->;
-
class FeatureMaxPrivateElementSize<int size> : SubtargetFeature<
"max-private-element-size-"#size,
"MaxPrivateElementSize",
@@ -628,9 +653,10 @@ class GCNSubtargetFeatureGeneration <string Value,
def FeatureSouthernIslands : GCNSubtargetFeatureGeneration<"SOUTHERN_ISLANDS",
"southern-islands",
[FeatureFP64, FeatureLocalMemorySize32768, FeatureMIMG_R128,
- FeatureWavefrontSize64,
- FeatureLDSBankCount32, FeatureMovrel, FeatureTrigReducedRange,
- FeatureDoesNotSupportSRAMECC, FeatureDoesNotSupportXNACK]
+ FeatureWavefrontSize64, FeatureSMemTimeInst, FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts, FeatureLDSBankCount32, FeatureMovrel,
+ FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
+ FeatureDoesNotSupportXNACK]
>;
def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
@@ -638,7 +664,8 @@ def FeatureSeaIslands : GCNSubtargetFeatureGeneration<"SEA_ISLANDS",
[FeatureFP64, FeatureLocalMemorySize65536, FeatureMIMG_R128,
FeatureWavefrontSize64, FeatureFlatAddressSpace,
FeatureCIInsts, FeatureMovrel, FeatureTrigReducedRange,
- FeatureGFX7GFX8GFX9Insts, FeatureDoesNotSupportSRAMECC]
+ FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC]
>;
def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
@@ -649,8 +676,9 @@ def FeatureVolcanicIslands : GCNSubtargetFeatureGeneration<"VOLCANIC_ISLANDS",
FeatureSMemRealTime, FeatureVGPRIndexMode, FeatureMovrel,
FeatureScalarStores, FeatureInv2PiInlineImm,
FeatureSDWA, FeatureSDWAOutModsVOPC, FeatureSDWAMac, FeatureDPP,
- FeatureIntClamp, FeatureTrigReducedRange, FeatureDoesNotSupportSRAMECC,
- FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts
+ FeatureIntClamp, FeatureTrigReducedRange, FeatureGFX8Insts,
+ FeatureGFX7GFX8GFX9Insts, FeatureSMemTimeInst, FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts, FeatureDoesNotSupportSRAMECC, FeatureFastDenormalF32
]
>;
@@ -665,7 +693,9 @@ def FeatureGFX9 : GCNSubtargetFeatureGeneration<"GFX9",
FeatureSDWA, FeatureSDWAOmod, FeatureSDWAScalar, FeatureSDWASdst,
FeatureFlatInstOffsets, FeatureFlatGlobalInsts, FeatureFlatScratchInsts,
FeatureAddNoCarryInsts, FeatureGFX8Insts, FeatureGFX7GFX8GFX9Insts,
- FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16
+ FeatureScalarFlatScratchInsts, FeatureScalarAtomics, FeatureR128A16,
+ FeatureSMemTimeInst, FeatureMadMacF32Insts, FeatureDsSrc2Insts,
+ FeatureFastDenormalF32
]
>;
@@ -682,7 +712,8 @@ def FeatureGFX10 : GCNSubtargetFeatureGeneration<"GFX10",
FeatureAddNoCarryInsts, FeatureFmaMixInsts, FeatureGFX8Insts,
FeatureNoSdstCMPX, FeatureVscnt, FeatureRegisterBanking,
FeatureVOP3Literal, FeatureDPP8,
- FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC
+ FeatureNoDataDepHazard, FeaturePkFmacF16Inst, FeatureDoesNotSupportSRAMECC,
+ FeatureGFX10A16, FeatureFastDenormalF32, FeatureG16
]
>;
@@ -853,6 +884,10 @@ def FeatureISAVersion10_1_0 : FeatureSet<
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3])>;
@@ -871,6 +906,10 @@ def FeatureISAVersion10_1_1 : FeatureSet<
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3])>;
@@ -888,10 +927,29 @@ def FeatureISAVersion10_1_2 : FeatureSet<
FeatureScalarStores,
FeatureScalarAtomics,
FeatureScalarFlatScratchInsts,
+ FeatureGetWaveIdInst,
+ FeatureSMemTimeInst,
+ FeatureMadMacF32Insts,
+ FeatureDsSrc2Insts,
FeatureLdsMisalignedBug,
FeatureDoesNotSupportXNACK,
FeatureCodeObjectV3])>;
+def FeatureISAVersion10_3_0 : FeatureSet<
+ [FeatureGFX10,
+ FeatureGFX10_BEncoding,
+ FeatureGFX10_3Insts,
+ FeatureLDSBankCount32,
+ FeatureDLInsts,
+ FeatureDot1Insts,
+ FeatureDot2Insts,
+ FeatureDot5Insts,
+ FeatureDot6Insts,
+ FeatureNSAEncoding,
+ FeatureWavefrontSize32,
+ FeatureDoesNotSupportXNACK,
+ FeatureCodeObjectV3]>;
+
//===----------------------------------------------------------------------===//
def AMDGPUInstrInfo : InstrInfo {
@@ -973,190 +1031,222 @@ def NullALU : InstrItinClass;
def isGFX6 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS">,
- AssemblerPredicate<"FeatureSouthernIslands">;
+ AssemblerPredicate<(all_of FeatureSouthernIslands)>;
def isGFX6GFX7 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"!FeatureGCN3Encoding,!FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), (not FeatureGFX10Insts))>;
def isGFX6GFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
- AssemblerPredicate<"!FeatureGCN3Encoding">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding))>;
def isGFX7Only :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts,!FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts, (not FeatureGFX10Insts))>;
def isGFX7GFX10 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX10">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureCIInsts">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureCIInsts)>;
def isGFX7GFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX7GFX8GFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX7GFX8GFX9Insts)>;
def isGFX6GFX7GFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::SOUTHERN_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"!FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of (not FeatureGFX10Insts))>;
def isGFX7Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::SEA_ISLANDS">,
- AssemblerPredicate<"FeatureCIInsts">;
+ AssemblerPredicate<(all_of FeatureCIInsts)>;
def isGFX8Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate<"FeatureGFX8Insts">;
+ AssemblerPredicate<(all_of FeatureGFX8Insts)>;
def isGFX8Only : Predicate<"Subtarget->getGeneration() =="
"AMDGPUSubtarget::VOLCANIC_ISLANDS">,
- AssemblerPredicate <"FeatureVolcanicIslands">;
+ AssemblerPredicate <(all_of FeatureVolcanicIslands)>;
def isGFX9Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts)>;
def isGFX9Only : Predicate <
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts)>;
def isGFX8GFX9 :
Predicate<"Subtarget->getGeneration() == AMDGPUSubtarget::VOLCANIC_ISLANDS ||"
"Subtarget->getGeneration() == AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX8Insts,FeatureGCN3Encoding">;
+ AssemblerPredicate<(all_of FeatureGFX8Insts, FeatureGCN3Encoding)>;
def isGFX10Plus :
Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX10">,
- AssemblerPredicate<"FeatureGFX10Insts">;
+ AssemblerPredicate<(all_of FeatureGFX10Insts)>;
def HasFlatAddressSpace : Predicate<"Subtarget->hasFlatAddressSpace()">,
- AssemblerPredicate<"FeatureFlatAddressSpace">;
+ AssemblerPredicate<(all_of FeatureFlatAddressSpace)>;
def HasFlatGlobalInsts : Predicate<"Subtarget->hasFlatGlobalInsts()">,
- AssemblerPredicate<"FeatureFlatGlobalInsts">;
+ AssemblerPredicate<(all_of FeatureFlatGlobalInsts)>;
def HasFlatScratchInsts : Predicate<"Subtarget->hasFlatScratchInsts()">,
- AssemblerPredicate<"FeatureFlatScratchInsts">;
+ AssemblerPredicate<(all_of FeatureFlatScratchInsts)>;
def HasScalarFlatScratchInsts : Predicate<"Subtarget->hasScalarFlatScratchInsts()">,
- AssemblerPredicate<"FeatureScalarFlatScratchInsts">;
+ AssemblerPredicate<(all_of FeatureScalarFlatScratchInsts)>;
def HasD16LoadStore : Predicate<"Subtarget->hasD16LoadStore()">,
- AssemblerPredicate<"FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts)>;
+
+def HasGFX10_BEncoding : Predicate<"Subtarget->hasGFX10_BEncoding()">,
+ AssemblerPredicate<(all_of FeatureGFX10_BEncoding)>;
def HasUnpackedD16VMem : Predicate<"Subtarget->hasUnpackedD16VMem()">,
- AssemblerPredicate<"FeatureUnpackedD16VMem">;
+ AssemblerPredicate<(all_of FeatureUnpackedD16VMem)>;
def HasPackedD16VMem : Predicate<"!Subtarget->hasUnpackedD16VMem()">,
- AssemblerPredicate<"!FeatureUnpackedD16VMem">;
+ AssemblerPredicate<(all_of (not FeatureUnpackedD16VMem))>;
def D16PreservesUnusedBits :
Predicate<"Subtarget->d16PreservesUnusedBits()">,
- AssemblerPredicate<"FeatureGFX9Insts,!FeatureSRAMECC">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts, (not FeatureSRAMECC))>;
def LDSRequiresM0Init : Predicate<"Subtarget->ldsRequiresM0Init()">;
def NotLDSRequiresM0Init : Predicate<"!Subtarget->ldsRequiresM0Init()">;
def HasDSAddTid : Predicate<"Subtarget->getGeneration() >= AMDGPUSubtarget::GFX9">,
- AssemblerPredicate<"FeatureGFX9Insts">;
+ AssemblerPredicate<(all_of FeatureGFX9Insts)>;
+
+def HasLDSFPAtomics : Predicate<"Subtarget->hasLDSFPAtomics()">,
+ AssemblerPredicate<(all_of FeatureGFX8Insts)>;
def HasAddNoCarryInsts : Predicate<"Subtarget->hasAddNoCarry()">,
- AssemblerPredicate<"FeatureAddNoCarryInsts">;
+ AssemblerPredicate<(all_of FeatureAddNoCarryInsts)>;
def NotHasAddNoCarryInsts : Predicate<"!Subtarget->hasAddNoCarry()">;
def Has16BitInsts : Predicate<"Subtarget->has16BitInsts()">,
- AssemblerPredicate<"Feature16BitInsts">;
+ AssemblerPredicate<(all_of Feature16BitInsts)>;
def HasVOP3PInsts : Predicate<"Subtarget->hasVOP3PInsts()">,
- AssemblerPredicate<"FeatureVOP3P">;
+ AssemblerPredicate<(all_of FeatureVOP3P)>;
+
+def HasMinMaxDenormModes : Predicate<"Subtarget->supportsMinMaxDenormModes()">;
+def NotHasMinMaxDenormModes : Predicate<"!Subtarget->supportsMinMaxDenormModes()">;
def HasSDWA : Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureSDWA,FeatureVolcanicIslands">;
+ AssemblerPredicate<(all_of FeatureSDWA, FeatureVolcanicIslands)>;
def HasSDWA9 :
Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"FeatureGCN3Encoding,FeatureGFX9Insts,FeatureSDWA">;
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureGFX9Insts, FeatureSDWA)>;
def HasSDWA10 :
Predicate<"Subtarget->hasSDWA()">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureSDWA">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureSDWA)>;
def HasDPP : Predicate<"Subtarget->hasDPP()">,
- AssemblerPredicate<"FeatureGCN3Encoding,FeatureDPP">;
+ AssemblerPredicate<(all_of FeatureGCN3Encoding, FeatureDPP)>;
def HasDPP8 : Predicate<"Subtarget->hasDPP8()">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP8">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP8)>;
def HasR128A16 : Predicate<"Subtarget->hasR128A16()">,
- AssemblerPredicate<"FeatureR128A16">;
+ AssemblerPredicate<(all_of FeatureR128A16)>;
+
+def HasGFX10A16 : Predicate<"Subtarget->hasGFX10A16()">,
+ AssemblerPredicate<(all_of FeatureGFX10A16)>;
+
+def HasG16 : Predicate<"Subtarget->hasG16()">,
+ AssemblerPredicate<(all_of FeatureG16)>;
def HasDPP16 : Predicate<"Subtarget->hasDPP()">,
- AssemblerPredicate<"!FeatureGCN3Encoding,FeatureGFX10Insts,FeatureDPP">;
+ AssemblerPredicate<(all_of (not FeatureGCN3Encoding), FeatureGFX10Insts, FeatureDPP)>;
def HasIntClamp : Predicate<"Subtarget->hasIntClamp()">,
- AssemblerPredicate<"FeatureIntClamp">;
+ AssemblerPredicate<(all_of FeatureIntClamp)>;
def HasMadMixInsts : Predicate<"Subtarget->hasMadMixInsts()">,
- AssemblerPredicate<"FeatureMadMixInsts">;
+ AssemblerPredicate<(all_of FeatureMadMixInsts)>;
def HasScalarStores : Predicate<"Subtarget->hasScalarStores()">,
- AssemblerPredicate<"FeatureScalarStores">;
+ AssemblerPredicate<(all_of FeatureScalarStores)>;
def HasScalarAtomics : Predicate<"Subtarget->hasScalarAtomics()">,
- AssemblerPredicate<"FeatureScalarAtomics">;
+ AssemblerPredicate<(all_of FeatureScalarAtomics)>;
def HasNoSdstCMPX : Predicate<"Subtarget->hasNoSdstCMPX()">,
- AssemblerPredicate<"FeatureNoSdstCMPX">;
+ AssemblerPredicate<(all_of FeatureNoSdstCMPX)>;
def HasSdstCMPX : Predicate<"!Subtarget->hasNoSdstCMPX()">,
- AssemblerPredicate<"!FeatureNoSdstCMPX">;
+ AssemblerPredicate<(all_of (not FeatureNoSdstCMPX))>;
def has16BankLDS : Predicate<"Subtarget->getLDSBankCount() == 16">;
def has32BankLDS : Predicate<"Subtarget->getLDSBankCount() == 32">;
def HasVGPRIndexMode : Predicate<"Subtarget->hasVGPRIndexMode()">,
- AssemblerPredicate<"FeatureVGPRIndexMode">;
+ AssemblerPredicate<(all_of FeatureVGPRIndexMode)>;
def HasMovrel : Predicate<"Subtarget->hasMovrel()">,
- AssemblerPredicate<"FeatureMovrel">;
+ AssemblerPredicate<(all_of FeatureMovrel)>;
def HasFmaMixInsts : Predicate<"Subtarget->hasFmaMixInsts()">,
- AssemblerPredicate<"FeatureFmaMixInsts">;
+ AssemblerPredicate<(all_of FeatureFmaMixInsts)>;
def HasDLInsts : Predicate<"Subtarget->hasDLInsts()">,
- AssemblerPredicate<"FeatureDLInsts">;
+ AssemblerPredicate<(all_of FeatureDLInsts)>;
def HasDot1Insts : Predicate<"Subtarget->hasDot1Insts()">,
- AssemblerPredicate<"FeatureDot1Insts">;
+ AssemblerPredicate<(all_of FeatureDot1Insts)>;
def HasDot2Insts : Predicate<"Subtarget->hasDot2Insts()">,
- AssemblerPredicate<"FeatureDot2Insts">;
+ AssemblerPredicate<(all_of FeatureDot2Insts)>;
def HasDot3Insts : Predicate<"Subtarget->hasDot3Insts()">,
- AssemblerPredicate<"FeatureDot3Insts">;
+ AssemblerPredicate<(all_of FeatureDot3Insts)>;
def HasDot4Insts : Predicate<"Subtarget->hasDot4Insts()">,
- AssemblerPredicate<"FeatureDot4Insts">;
+ AssemblerPredicate<(all_of FeatureDot4Insts)>;
def HasDot5Insts : Predicate<"Subtarget->hasDot5Insts()">,
- AssemblerPredicate<"FeatureDot5Insts">;
+ AssemblerPredicate<(all_of FeatureDot5Insts)>;
def HasDot6Insts : Predicate<"Subtarget->hasDot6Insts()">,
- AssemblerPredicate<"FeatureDot6Insts">;
+ AssemblerPredicate<(all_of FeatureDot6Insts)>;
+
+def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
+ AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
def HasMAIInsts : Predicate<"Subtarget->hasMAIInsts()">,
- AssemblerPredicate<"FeatureMAIInsts">;
+ AssemblerPredicate<(all_of FeatureMAIInsts)>;
+
+def HasSMemTimeInst : Predicate<"Subtarget->hasSMemTimeInst()">,
+ AssemblerPredicate<(all_of FeatureSMemTimeInst)>;
+
+def HasNoSMemTimeInst : Predicate<"!Subtarget->hasSMemTimeInst()">;
def HasPkFmacF16Inst : Predicate<"Subtarget->hasPkFmacF16Inst()">,
- AssemblerPredicate<"FeaturePkFmacF16Inst">;
+ AssemblerPredicate<(all_of FeaturePkFmacF16Inst)>;
+
+def HasMadMacF32Insts : Predicate<"Subtarget->hasMadMacF32Insts()">,
+ AssemblerPredicate<(all_of FeatureMadMacF32Insts)>;
def HasAtomicFaddInsts : Predicate<"Subtarget->hasAtomicFaddInsts()">,
- AssemblerPredicate<"FeatureAtomicFaddInsts">;
+ AssemblerPredicate<(all_of FeatureAtomicFaddInsts)>;
+
+def HasNoMadMacF32Insts : Predicate<"!Subtarget->hasMadMacF32Insts()">,
+ AssemblerPredicate<(all_of (not FeatureMadMacF32Insts))>;
+
+def HasDsSrc2Insts : Predicate<"!Subtarget->hasDsSrc2Insts()">,
+ AssemblerPredicate<(all_of FeatureDsSrc2Insts)>;
def HasOffset3fBug : Predicate<"!Subtarget->hasOffset3fBug()">,
- AssemblerPredicate<"FeatureOffset3fBug">;
+ AssemblerPredicate<(all_of FeatureOffset3fBug)>;
def EnableLateCFGStructurize : Predicate<
"EnableLateStructurizeCFG">;
@@ -1165,7 +1255,7 @@ def EnableLateCFGStructurize : Predicate<
include "SISchedule.td"
include "GCNProcessors.td"
include "AMDGPUInstrInfo.td"
-include "AMDGPURegisterInfo.td"
+include "SIRegisterInfo.td"
include "AMDGPURegisterBanks.td"
include "AMDGPUInstructions.td"
include "SIInstrInfo.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
index bba132c3bc46..bb2aba044974 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.cpp
@@ -91,12 +91,16 @@ AliasResult AMDGPUAAResult::alias(const MemoryLocation &LocA,
bool AMDGPUAAResult::pointsToConstantMemory(const MemoryLocation &Loc,
AAQueryInfo &AAQI, bool OrLocal) {
+ unsigned AS = Loc.Ptr->getType()->getPointerAddressSpace();
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return true;
+
const Value *Base = GetUnderlyingObject(Loc.Ptr, DL);
- unsigned AS = Base->getType()->getPointerAddressSpace();
+ AS = Base->getType()->getPointerAddressSpace();
if (AS == AMDGPUAS::CONSTANT_ADDRESS ||
- AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT) {
+ AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
return true;
- }
if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(Base)) {
if (GV->isConstant())
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
index fb722920900f..fd8889ea5c0d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAliasAnalysis.h
@@ -48,10 +48,6 @@ public:
AAQueryInfo &AAQI);
bool pointsToConstantMemory(const MemoryLocation &Loc, AAQueryInfo &AAQI,
bool OrLocal);
-
-private:
- bool Aliases(const MDNode *A, const MDNode *B) const;
- bool PathAliases(const MDNode *A, const MDNode *B) const;
};
/// Analysis pass providing a never-invalidated alias analysis result.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
index ff2bda6bed53..22947544ac07 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAlwaysInlinePass.cpp
@@ -71,6 +71,13 @@ void AMDGPUAlwaysInline::recursivelyVisitUsers(
if (Instruction *I = dyn_cast<Instruction>(U)) {
Function *F = I->getParent()->getParent();
if (!AMDGPU::isEntryFunctionCC(F->getCallingConv())) {
+ // FIXME: This is a horrible hack. We should always respect noinline,
+ // and just let us hit the error when we can't handle this.
+ //
+ // Unfortunately, clang adds noinline to all functions at -O0. We have
+ // to override this here until that's fixed.
+ F->removeFnAttr(Attribute::NoInline);
+
FuncsToAlwaysInline.insert(F);
Stack.push_back(F);
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
index e72b3f4fde63..625074569cfa 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateKernelFeatures.cpp
@@ -21,7 +21,6 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -71,7 +70,8 @@ public:
static bool visitConstantExpr(const ConstantExpr *CE);
static bool visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited);
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited, bool IsFunc,
+ bool HasApertureRegs);
};
} // end anonymous namespace
@@ -93,6 +93,14 @@ static bool castRequiresQueuePtr(const AddrSpaceCastInst *ASC) {
return castRequiresQueuePtr(ASC->getSrcAddressSpace());
}
+static bool isDSAddress(const Constant *C) {
+ const GlobalValue *GV = dyn_cast<GlobalValue>(C);
+ if (!GV)
+ return false;
+ unsigned AS = GV->getAddressSpace();
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS;
+}
+
bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
if (CE->getOpcode() == Instruction::AddrSpaceCast) {
unsigned SrcAS = CE->getOperand(0)->getType()->getPointerAddressSpace();
@@ -104,7 +112,8 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExpr(const ConstantExpr *CE) {
bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
const Constant *EntryC,
- SmallPtrSet<const Constant *, 8> &ConstantExprVisited) {
+ SmallPtrSet<const Constant *, 8> &ConstantExprVisited,
+ bool IsFunc, bool HasApertureRegs) {
if (!ConstantExprVisited.insert(EntryC).second)
return false;
@@ -115,9 +124,13 @@ bool AMDGPUAnnotateKernelFeatures::visitConstantExprsRecursively(
while (!Stack.empty()) {
const Constant *C = Stack.pop_back_val();
+ // We need to trap on DS globals in non-entry functions.
+ if (IsFunc && isDSAddress(C))
+ return true;
+
// Check this constant expression.
if (const auto *CE = dyn_cast<ConstantExpr>(C)) {
- if (visitConstantExpr(CE))
+ if (!HasApertureRegs && visitConstantExpr(CE))
return true;
}
@@ -202,7 +215,7 @@ static void copyFeaturesToFunction(Function &Parent, const Function &Callee,
"amdgpu-work-item-id-z", "amdgpu-work-group-id-x",
"amdgpu-work-group-id-y", "amdgpu-work-group-id-z",
"amdgpu-dispatch-ptr", "amdgpu-dispatch-id",
- "amdgpu-kernarg-segment-ptr", "amdgpu-implicitarg-ptr"};
+ "amdgpu-implicitarg-ptr"};
if (handleAttr(Parent, Callee, "amdgpu-queue-ptr"))
NeedQueuePtr = true;
@@ -263,10 +276,10 @@ bool AMDGPUAnnotateKernelFeatures::propagateUniformWorkGroupAttribute(
bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
- bool HasFlat = ST.hasFlatAddressSpace();
bool HasApertureRegs = ST.hasApertureRegs();
SmallPtrSet<const Constant *, 8> ConstantExprVisited;
+ bool HaveStackObjects = false;
bool Changed = false;
bool NeedQueuePtr = false;
bool HaveCall = false;
@@ -274,13 +287,18 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
- CallSite CS(&I);
- if (CS) {
- Function *Callee = CS.getCalledFunction();
+ if (isa<AllocaInst>(I)) {
+ HaveStackObjects = true;
+ continue;
+ }
+
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ const Function *Callee =
+ dyn_cast<Function>(CB->getCalledOperand()->stripPointerCasts());
// TODO: Do something with indirect calls.
if (!Callee) {
- if (!CS.isInlineAsm())
+ if (!CB->isInlineAsm())
HaveCall = true;
continue;
}
@@ -292,20 +310,25 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
Changed = true;
} else {
bool NonKernelOnly = false;
- StringRef AttrName = intrinsicToAttrName(IID,
- NonKernelOnly, NeedQueuePtr);
- if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
- F.addFnAttr(AttrName);
- Changed = true;
+
+ if (!IsFunc && IID == Intrinsic::amdgcn_kernarg_segment_ptr) {
+ F.addFnAttr("amdgpu-kernarg-segment-ptr");
+ } else {
+ StringRef AttrName = intrinsicToAttrName(IID, NonKernelOnly,
+ NeedQueuePtr);
+ if (!AttrName.empty() && (IsFunc || !NonKernelOnly)) {
+ F.addFnAttr(AttrName);
+ Changed = true;
+ }
}
}
}
- if (NeedQueuePtr || HasApertureRegs)
+ if (NeedQueuePtr || (!IsFunc && HasApertureRegs))
continue;
if (const AddrSpaceCastInst *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
- if (castRequiresQueuePtr(ASC)) {
+ if (!HasApertureRegs && castRequiresQueuePtr(ASC)) {
NeedQueuePtr = true;
continue;
}
@@ -316,7 +339,8 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
if (!OpC)
continue;
- if (visitConstantExprsRecursively(OpC, ConstantExprVisited)) {
+ if (visitConstantExprsRecursively(OpC, ConstantExprVisited, IsFunc,
+ HasApertureRegs)) {
NeedQueuePtr = true;
break;
}
@@ -332,8 +356,13 @@ bool AMDGPUAnnotateKernelFeatures::addFeatureAttributes(Function &F) {
// TODO: We could refine this to captured pointers that could possibly be
// accessed by flat instructions. For now this is mostly a poor way of
// estimating whether there are calls before argument lowering.
- if (HasFlat && !IsFunc && HaveCall) {
- F.addFnAttr("amdgpu-flat-scratch");
+ if (!IsFunc && HaveCall) {
+ F.addFnAttr("amdgpu-calls");
+ Changed = true;
+ }
+
+ if (HaveStackObjects) {
+ F.addFnAttr("amdgpu-stack-objects");
Changed = true;
}
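
For context, the "amdgpu-calls" and "amdgpu-stack-objects" string attributes added above are presumably consumed by later passes. A minimal sketch, assuming a Function &F and the standard llvm::Function API (the helper name below is illustrative, not part of this patch):

    #include "llvm/IR/Function.h"

    // Sketch only: a later pass could combine the two attributes added by
    // AMDGPUAnnotateKernelFeatures to decide whether scratch setup is needed.
    static bool mayNeedPrivateSegment(const llvm::Function &F) {
      return F.hasFnAttribute("amdgpu-calls") ||
             F.hasFnAttribute("amdgpu-stack-objects");
    }
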
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
index 6fb507083cef..b09e92c07f9b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAnnotateUniformValues.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
+#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -35,7 +36,7 @@ class AMDGPUAnnotateUniformValues : public FunctionPass,
MemoryDependenceResults *MDR;
LoopInfo *LI;
DenseMap<Value*, GetElementPtrInst*> noClobberClones;
- bool isKernelFunc;
+ bool isEntryFunc;
public:
static char ID;
@@ -127,11 +128,10 @@ void AMDGPUAnnotateUniformValues::visitLoadInst(LoadInst &I) {
auto isGlobalLoad = [&](LoadInst &Load)->bool {
return Load.getPointerAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS;
};
- // We're tracking up to the Function boundaries
- // We cannot go beyond because of FunctionPass restrictions
- // Thus we can ensure that memory not clobbered for memory
- // operations that live in kernel only.
- bool NotClobbered = isKernelFunc && !isClobberedInFunction(&I);
+ // We're tracking up to the Function boundaries, and cannot go beyond because
+ // of FunctionPass restrictions. Thus we can only ensure that memory is not
+ // clobbered for memory operations that live in entry functions.
+ bool NotClobbered = isEntryFunc && !isClobberedInFunction(&I);
Instruction *PtrI = dyn_cast<Instruction>(Ptr);
if (!PtrI && NotClobbered && isGlobalLoad(I)) {
if (isa<Argument>(Ptr) || isa<GlobalValue>(Ptr)) {
@@ -170,7 +170,7 @@ bool AMDGPUAnnotateUniformValues::runOnFunction(Function &F) {
DA = &getAnalysis<LegacyDivergenceAnalysis>();
MDR = &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- isKernelFunc = F.getCallingConv() == CallingConv::AMDGPU_KERNEL;
+ isEntryFunc = AMDGPU::isEntryFunctionCC(F.getCallingConv());
visit(F);
noClobberClones.clear();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
index 99a01ca3a2fd..d078fc147a36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.cpp
@@ -8,6 +8,8 @@
#include "AMDGPU.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUTargetMachine.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIRegisterInfo.h"
#include "llvm/Support/NativeFormatting.h"
#include "llvm/Support/raw_ostream.h"
@@ -43,6 +45,10 @@ char AMDGPUArgumentUsageInfo::ID = 0;
const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::ExternFunctionInfo{};
+// Hardcoded registers from fixed function ABI
+const AMDGPUFunctionArgInfo AMDGPUArgumentUsageInfo::FixedABIFunctionInfo
+ = AMDGPUFunctionArgInfo::fixedABILayout();
+
bool AMDGPUArgumentUsageInfo::doInitialization(Module &M) {
return false;
}
@@ -77,59 +83,102 @@ void AMDGPUArgumentUsageInfo::print(raw_ostream &OS, const Module *M) const {
}
}
-std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
AMDGPUFunctionArgInfo::getPreloadedValue(
- AMDGPUFunctionArgInfo::PreloadedValue Value) const {
+ AMDGPUFunctionArgInfo::PreloadedValue Value) const {
switch (Value) {
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER: {
- return std::make_pair(
- PrivateSegmentBuffer ? &PrivateSegmentBuffer : nullptr,
- &AMDGPU::SGPR_128RegClass);
+ return std::make_tuple(PrivateSegmentBuffer ? &PrivateSegmentBuffer
+ : nullptr,
+ &AMDGPU::SGPR_128RegClass, LLT::vector(4, 32));
}
case AMDGPUFunctionArgInfo::IMPLICIT_BUFFER_PTR:
- return std::make_pair(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(ImplicitBufferPtr ? &ImplicitBufferPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::WORKGROUP_ID_X:
- return std::make_pair(WorkGroupIDX ? &WorkGroupIDX : nullptr,
- &AMDGPU::SGPR_32RegClass);
-
+ return std::make_tuple(WorkGroupIDX ? &WorkGroupIDX : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Y:
- return std::make_pair(WorkGroupIDY ? &WorkGroupIDY : nullptr,
- &AMDGPU::SGPR_32RegClass);
+ return std::make_tuple(WorkGroupIDY ? &WorkGroupIDY : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKGROUP_ID_Z:
- return std::make_pair(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
- &AMDGPU::SGPR_32RegClass);
+ return std::make_tuple(WorkGroupIDZ ? &WorkGroupIDZ : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET:
- return std::make_pair(
- PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
- &AMDGPU::SGPR_32RegClass);
+ return std::make_tuple(
+ PrivateSegmentWaveByteOffset ? &PrivateSegmentWaveByteOffset : nullptr,
+ &AMDGPU::SGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR:
- return std::make_pair(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(KernargSegmentPtr ? &KernargSegmentPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR:
- return std::make_pair(ImplicitArgPtr ? &ImplicitArgPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(ImplicitArgPtr ? &ImplicitArgPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::DISPATCH_ID:
- return std::make_pair(DispatchID ? &DispatchID : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(DispatchID ? &DispatchID : nullptr,
+ &AMDGPU::SGPR_64RegClass, LLT::scalar(64));
case AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT:
- return std::make_pair(FlatScratchInit ? &FlatScratchInit : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(FlatScratchInit ? &FlatScratchInit : nullptr,
+ &AMDGPU::SGPR_64RegClass, LLT::scalar(64));
case AMDGPUFunctionArgInfo::DISPATCH_PTR:
- return std::make_pair(DispatchPtr ? &DispatchPtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(DispatchPtr ? &DispatchPtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::QUEUE_PTR:
- return std::make_pair(QueuePtr ? &QueuePtr : nullptr,
- &AMDGPU::SGPR_64RegClass);
+ return std::make_tuple(QueuePtr ? &QueuePtr : nullptr,
+ &AMDGPU::SGPR_64RegClass,
+ LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64));
case AMDGPUFunctionArgInfo::WORKITEM_ID_X:
- return std::make_pair(WorkItemIDX ? &WorkItemIDX : nullptr,
- &AMDGPU::VGPR_32RegClass);
+ return std::make_tuple(WorkItemIDX ? &WorkItemIDX : nullptr,
+ &AMDGPU::VGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKITEM_ID_Y:
- return std::make_pair(WorkItemIDY ? &WorkItemIDY : nullptr,
- &AMDGPU::VGPR_32RegClass);
+ return std::make_tuple(WorkItemIDY ? &WorkItemIDY : nullptr,
+ &AMDGPU::VGPR_32RegClass, LLT::scalar(32));
case AMDGPUFunctionArgInfo::WORKITEM_ID_Z:
- return std::make_pair(WorkItemIDZ ? &WorkItemIDZ : nullptr,
- &AMDGPU::VGPR_32RegClass);
+ return std::make_tuple(WorkItemIDZ ? &WorkItemIDZ : nullptr,
+ &AMDGPU::VGPR_32RegClass, LLT::scalar(32));
}
llvm_unreachable("unexpected preloaded value type");
}
+
+constexpr AMDGPUFunctionArgInfo AMDGPUFunctionArgInfo::fixedABILayout() {
+ AMDGPUFunctionArgInfo AI;
+ AI.PrivateSegmentBuffer
+ = ArgDescriptor::createRegister(AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3);
+ AI.DispatchPtr = ArgDescriptor::createRegister(AMDGPU::SGPR4_SGPR5);
+ AI.QueuePtr = ArgDescriptor::createRegister(AMDGPU::SGPR6_SGPR7);
+
+ // Do not pass the kernarg segment pointer; only pass the incremented version
+ // in its place.
+ AI.ImplicitArgPtr = ArgDescriptor::createRegister(AMDGPU::SGPR8_SGPR9);
+ AI.DispatchID = ArgDescriptor::createRegister(AMDGPU::SGPR10_SGPR11);
+
+ // Skip FlatScratchInit/PrivateSegmentSize
+ AI.WorkGroupIDX = ArgDescriptor::createRegister(AMDGPU::SGPR12);
+ AI.WorkGroupIDY = ArgDescriptor::createRegister(AMDGPU::SGPR13);
+ AI.WorkGroupIDZ = ArgDescriptor::createRegister(AMDGPU::SGPR14);
+
+ const unsigned Mask = 0x3ff;
+ AI.WorkItemIDX = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask);
+ AI.WorkItemIDY = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 10);
+ AI.WorkItemIDZ = ArgDescriptor::createRegister(AMDGPU::VGPR31, Mask << 20);
+ return AI;
+}
+
+const AMDGPUFunctionArgInfo &
+AMDGPUArgumentUsageInfo::lookupFuncArgInfo(const Function &F) const {
+ auto I = ArgInfoMap.find(&F);
+ if (I == ArgInfoMap.end()) {
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ return FixedABIFunctionInfo;
+
+ // Without the fixed ABI, we assume no function has special inputs.
+ assert(F.isDeclaration());
+ return ExternFunctionInfo;
+ }
+
+ return I->second;
+}
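
A note on the masks in fixedABILayout() above: the three workitem IDs share a single 32-bit VGPR (v31), ten bits each at offsets 0, 10 and 20. A minimal standalone sketch of the unpacking those masks imply (plain C++, illustrative only):

    #include <cstdint>

    // Sketch only: split a packed workitem-ID word into its 10-bit X/Y/Z
    // fields, matching Mask, Mask << 10 and Mask << 20 used above.
    struct WorkItemIDs { uint32_t X, Y, Z; };

    static WorkItemIDs unpackWorkItemIDs(uint32_t Packed) {
      const uint32_t Mask = 0x3ff;
      return {Packed & Mask, (Packed >> 10) & Mask, (Packed >> 20) & Mask};
    }
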
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
index f0e7ee910f95..576e6cfe929e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUArgumentUsageInfo.h
@@ -11,15 +11,13 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/CodeGen/Register.h"
-#include "llvm/IR/Function.h"
#include "llvm/Pass.h"
+#include "llvm/Support/LowLevelTypeImpl.h"
namespace llvm {
class Function;
class raw_ostream;
-class GCNSubtarget;
-class TargetMachine;
class TargetRegisterClass;
class TargetRegisterInfo;
@@ -40,19 +38,22 @@ private:
bool IsSet : 1;
public:
- ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
+ constexpr ArgDescriptor(unsigned Val = 0, unsigned Mask = ~0u,
bool IsStack = false, bool IsSet = false)
: Reg(Val), Mask(Mask), IsStack(IsStack), IsSet(IsSet) {}
- static ArgDescriptor createRegister(Register Reg, unsigned Mask = ~0u) {
+ static constexpr ArgDescriptor createRegister(Register Reg,
+ unsigned Mask = ~0u) {
return ArgDescriptor(Reg, Mask, false, true);
}
- static ArgDescriptor createStack(unsigned Offset, unsigned Mask = ~0u) {
+ static constexpr ArgDescriptor createStack(unsigned Offset,
+ unsigned Mask = ~0u) {
return ArgDescriptor(Offset, Mask, true, true);
}
- static ArgDescriptor createArg(const ArgDescriptor &Arg, unsigned Mask) {
+ static constexpr ArgDescriptor createArg(const ArgDescriptor &Arg,
+ unsigned Mask) {
return ArgDescriptor(Arg.Reg, Mask, Arg.IsStack, Arg.IsSet);
}
@@ -141,25 +142,29 @@ struct AMDGPUFunctionArgInfo {
ArgDescriptor ImplicitArgPtr;
// Input registers for non-HSA ABI
- ArgDescriptor ImplicitBufferPtr = 0;
+ ArgDescriptor ImplicitBufferPtr;
// VGPRs inputs. These are always v0, v1 and v2 for entry functions.
ArgDescriptor WorkItemIDX;
ArgDescriptor WorkItemIDY;
ArgDescriptor WorkItemIDZ;
- std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
getPreloadedValue(PreloadedValue Value) const;
+
+ static constexpr AMDGPUFunctionArgInfo fixedABILayout();
};
class AMDGPUArgumentUsageInfo : public ImmutablePass {
private:
- static const AMDGPUFunctionArgInfo ExternFunctionInfo;
DenseMap<const Function *, AMDGPUFunctionArgInfo> ArgInfoMap;
public:
static char ID;
+ static const AMDGPUFunctionArgInfo ExternFunctionInfo;
+ static const AMDGPUFunctionArgInfo FixedABIFunctionInfo;
+
AMDGPUArgumentUsageInfo() : ImmutablePass(ID) { }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -175,15 +180,7 @@ public:
ArgInfoMap[&F] = ArgInfo;
}
- const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const {
- auto I = ArgInfoMap.find(&F);
- if (I == ArgInfoMap.end()) {
- assert(F.isDeclaration());
- return ExternFunctionInfo;
- }
-
- return I->second;
- }
+ const AMDGPUFunctionArgInfo &lookupFuncArgInfo(const Function &F) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
index 9e07b4d252b7..eef8fe2fc3b7 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.cpp
@@ -49,9 +49,25 @@ using namespace llvm;
using namespace llvm::AMDGPU;
using namespace llvm::AMDGPU::HSAMD;
-// TODO: This should get the default rounding mode from the kernel. We just set
-// the default here, but this could change if the OpenCL rounding mode pragmas
-// are used.
+// We need to tell the runtime some amount ahead of time if we don't know the
+// true stack size. Assume a smaller number if this is only due to dynamic /
+// non-entry block allocas.
+static cl::opt<uint32_t> AssumedStackSizeForExternalCall(
+ "amdgpu-assume-external-call-stack-size",
+ cl::desc("Assumed stack use of any external call (in bytes)"),
+ cl::Hidden,
+ cl::init(16384));
+
+static cl::opt<uint32_t> AssumedStackSizeForDynamicSizeObjects(
+ "amdgpu-assume-dynamic-stack-object-size",
+ cl::desc("Assumed extra stack use if there are any "
+ "variable sized objects (in bytes)"),
+ cl::Hidden,
+ cl::init(4096));
+
+// This should get the default rounding mode from the kernel. We just set the
+// default here, but this could change if the OpenCL rounding mode pragmas are
+// used.
//
// The denormal mode here should match what is reported by the OpenCL runtime
// for the CL_FP_DENORM bit from CL_DEVICE_{HALF|SINGLE|DOUBLE}_FP_CONFIG, but
@@ -70,18 +86,10 @@ using namespace llvm::AMDGPU::HSAMD;
// instructions to run at the double precision rate for the device so it's
// probably best to just report no single precision denormals.
static uint32_t getFPMode(AMDGPU::SIModeRegisterDefaults Mode) {
-
- // TODO: Is there any real use for the flush in only / flush out only modes?
- uint32_t FP32Denormals =
- Mode.FP32Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
-
- uint32_t FP64Denormals =
- Mode.FP64FP16Denormals ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
-
return FP_ROUND_MODE_SP(FP_ROUND_ROUND_TO_NEAREST) |
FP_ROUND_MODE_DP(FP_ROUND_ROUND_TO_NEAREST) |
- FP_DENORM_MODE_SP(FP32Denormals) |
- FP_DENORM_MODE_DP(FP64Denormals);
+ FP_DENORM_MODE_SP(Mode.fpDenormModeSPValue()) |
+ FP_DENORM_MODE_DP(Mode.fpDenormModeDPValue());
}
static AsmPrinter *
@@ -120,7 +128,7 @@ AMDGPUTargetStreamer* AMDGPUAsmPrinter::getTargetStreamer() const {
return static_cast<AMDGPUTargetStreamer*>(OutStreamer->getTargetStreamer());
}
-void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
+void AMDGPUAsmPrinter::emitStartOfAsmFile(Module &M) {
if (IsaInfo::hasCodeObjectV3(getGlobalSTI())) {
std::string ExpectedTarget;
raw_string_ostream ExpectedTargetOS(ExpectedTarget);
@@ -152,7 +160,7 @@ void AMDGPUAsmPrinter::EmitStartOfAsmFile(Module &M) {
Version.Major, Version.Minor, Version.Stepping, "AMD", "AMDGPU");
}
-void AMDGPUAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void AMDGPUAsmPrinter::emitEndOfAsmFile(Module &M) {
// Following code requires TargetStreamer to be present.
if (!getTargetStreamer())
return;
@@ -188,7 +196,7 @@ bool AMDGPUAsmPrinter::isBlockOnlyReachableByFallthrough(
return (MBB->back().getOpcode() != AMDGPU::S_SETPC_B64);
}
-void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
+void AMDGPUAsmPrinter::emitFunctionBodyStart() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
@@ -207,7 +215,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyStart() {
HSAMetadataStream->emitKernel(*MF, CurrentProgramInfo);
}
-void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
+void AMDGPUAsmPrinter::emitFunctionBodyEnd() {
const SIMachineFunctionInfo &MFI = *MF->getInfo<SIMachineFunctionInfo>();
if (!MFI.isEntryFunction())
return;
@@ -226,7 +234,7 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
// CP microcode requires the kernel descriptor to be allocated on 64 byte
// alignment.
- Streamer.EmitValueToAlignment(64, 0, 1, 0);
+ Streamer.emitValueToAlignment(64, 0, 1, 0);
if (ReadOnlySection.getAlignment() < 64)
ReadOnlySection.setAlignment(Align(64));
@@ -247,10 +255,10 @@ void AMDGPUAsmPrinter::EmitFunctionBodyEnd() {
Streamer.PopSection();
}
-void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
+void AMDGPUAsmPrinter::emitFunctionEntryLabel() {
if (IsaInfo::hasCodeObjectV3(getGlobalSTI()) &&
TM.getTargetTriple().getOS() == Triple::AMDHSA) {
- AsmPrinter::EmitFunctionEntryLabel();
+ AsmPrinter::emitFunctionEntryLabel();
return;
}
@@ -269,10 +277,10 @@ void AMDGPUAsmPrinter::EmitFunctionEntryLabel() {
HexLines.push_back("");
}
- AsmPrinter::EmitFunctionEntryLabel();
+ AsmPrinter::emitFunctionEntryLabel();
}
-void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
+void AMDGPUAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
if (DumpCodeInstEmitter && !isBlockOnlyReachableByFallthrough(&MBB)) {
// Write a line for the basic block label if it is not only fallthrough.
DisasmLines.push_back(
@@ -281,10 +289,10 @@ void AMDGPUAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
DisasmLineMaxLen = std::max(DisasmLineMaxLen, DisasmLines.back().size());
HexLines.push_back("");
}
- AsmPrinter::EmitBasicBlockStart(MBB);
+ AsmPrinter::emitBasicBlockStart(MBB);
}
-void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void AMDGPUAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (GV->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
if (GV->hasInitializer() && !isa<UndefValue>(GV->getInitializer())) {
OutContext.reportError({},
@@ -307,18 +315,16 @@ void AMDGPUAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
const DataLayout &DL = GV->getParent()->getDataLayout();
uint64_t Size = DL.getTypeAllocSize(GV->getValueType());
- unsigned Align = GV->getAlignment();
- if (!Align)
- Align = 4;
+ Align Alignment = GV->getAlign().getValueOr(Align(4));
- EmitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
- EmitLinkage(GV, GVSym);
+ emitVisibility(GVSym, GV->getVisibility(), !GV->isDeclaration());
+ emitLinkage(GV, GVSym);
if (auto TS = getTargetStreamer())
- TS->emitAMDGPULDS(GVSym, Size, Align);
+ TS->emitAMDGPULDS(GVSym, Size, Alignment);
return;
}
- AsmPrinter::EmitGlobalVariable(GV);
+ AsmPrinter::emitGlobalVariable(GV);
}
bool AMDGPUAsmPrinter::doFinalization(Module &M) {
@@ -468,7 +474,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
HexLines.clear();
DisasmLineMaxLen = 0;
- EmitFunctionBody();
+ emitFunctionBody();
if (isVerbose()) {
MCSectionELF *CommentSection =
@@ -549,7 +555,7 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
if (DumpCodeInstEmitter) {
OutStreamer->SwitchSection(
- Context.getELFSection(".AMDGPU.disasm", ELF::SHT_NOTE, 0));
+ Context.getELFSection(".AMDGPU.disasm", ELF::SHT_PROGBITS, 0));
for (size_t i = 0; i < DisasmLines.size(); ++i) {
std::string Comment = "\n";
@@ -558,8 +564,8 @@ bool AMDGPUAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
Comment += " ; " + HexLines[i] + "\n";
}
- OutStreamer->EmitBytes(StringRef(DisasmLines[i]));
- OutStreamer->EmitBytes(StringRef(Comment));
+ OutStreamer->emitBytes(StringRef(DisasmLines[i]));
+ OutStreamer->emitBytes(StringRef(Comment));
}
}
@@ -609,6 +615,15 @@ int32_t AMDGPUAsmPrinter::SIFunctionResourceInfo::getTotalNumVGPRs(
return std::max(NumVGPR, NumAGPR);
}
+static const Function *getCalleeFunction(const MachineOperand &Op) {
+ if (Op.isImm()) {
+ assert(Op.getImm() == 0);
+ return nullptr;
+ }
+
+ return cast<Function>(Op.getGlobal());
+}
+
AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineFunction &MF) const {
SIFunctionResourceInfo Info;
@@ -636,11 +651,15 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.UsesFlatScratch = false;
}
- Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
Info.PrivateSegmentSize = FrameInfo.getStackSize();
- if (MFI->isStackRealigned())
- Info.PrivateSegmentSize += FrameInfo.getMaxAlignment();
+ // Assume a big number if there are any unknown sized objects.
+ Info.HasDynamicallySizedStack = FrameInfo.hasVarSizedObjects();
+ if (Info.HasDynamicallySizedStack)
+ Info.PrivateSegmentSize += AssumedStackSizeForDynamicSizeObjects;
+
+ if (MFI->isStackRealigned())
+ Info.PrivateSegmentSize += FrameInfo.getMaxAlign().value();
Info.UsesVCC = MRI.isPhysRegUsed(AMDGPU::VCC_LO) ||
MRI.isPhysRegUsed(AMDGPU::VCC_HI);
@@ -715,6 +734,7 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::SRC_PRIVATE_BASE:
case AMDGPU::SRC_PRIVATE_LIMIT:
case AMDGPU::SGPR_NULL:
+ case AMDGPU::MODE:
continue;
case AMDGPU::SRC_POPS_EXITING_WAVE_ID:
@@ -727,6 +747,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
case AMDGPU::VCC:
case AMDGPU::VCC_LO:
case AMDGPU::VCC_HI:
+ case AMDGPU::VCC_LO_LO16:
+ case AMDGPU::VCC_LO_HI16:
+ case AMDGPU::VCC_HI_LO16:
+ case AMDGPU::VCC_HI_HI16:
Info.UsesVCC = true;
continue;
@@ -764,15 +788,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
break;
}
- if (AMDGPU::SReg_32RegClass.contains(Reg)) {
+ if (AMDGPU::SReg_32RegClass.contains(Reg) ||
+ AMDGPU::SReg_LO16RegClass.contains(Reg) ||
+ AMDGPU::SGPR_HI16RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_32RegClass.contains(Reg) &&
"trap handler registers should not be used");
IsSGPR = true;
Width = 1;
- } else if (AMDGPU::VGPR_32RegClass.contains(Reg)) {
+ } else if (AMDGPU::VGPR_32RegClass.contains(Reg) ||
+ AMDGPU::VGPR_LO16RegClass.contains(Reg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(Reg)) {
IsSGPR = false;
Width = 1;
- } else if (AMDGPU::AGPR_32RegClass.contains(Reg)) {
+ } else if (AMDGPU::AGPR_32RegClass.contains(Reg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(Reg)) {
IsSGPR = false;
IsAGPR = true;
Width = 1;
@@ -794,6 +823,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::SReg_96RegClass.contains(Reg)) {
IsSGPR = true;
Width = 3;
+ } else if (AMDGPU::AReg_96RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 3;
} else if (AMDGPU::SReg_128RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_128RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -812,6 +845,20 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::SReg_160RegClass.contains(Reg)) {
IsSGPR = true;
Width = 5;
+ } else if (AMDGPU::AReg_160RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 5;
+ } else if (AMDGPU::VReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ Width = 6;
+ } else if (AMDGPU::SReg_192RegClass.contains(Reg)) {
+ IsSGPR = true;
+ Width = 6;
+ } else if (AMDGPU::AReg_192RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 6;
} else if (AMDGPU::SReg_256RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_256RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -820,6 +867,10 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
} else if (AMDGPU::VReg_256RegClass.contains(Reg)) {
IsSGPR = false;
Width = 8;
+ } else if (AMDGPU::AReg_256RegClass.contains(Reg)) {
+ IsSGPR = false;
+ IsAGPR = true;
+ Width = 8;
} else if (AMDGPU::SReg_512RegClass.contains(Reg)) {
assert(!AMDGPU::TTMP_512RegClass.contains(Reg) &&
"trap handler registers should not be used");
@@ -862,8 +913,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
const MachineOperand *CalleeOp
= TII->getNamedOperand(MI, AMDGPU::OpName::callee);
- const Function *Callee = cast<Function>(CalleeOp->getGlobal());
- if (Callee->isDeclaration()) {
+
+ const Function *Callee = getCalleeFunction(*CalleeOp);
+ if (!Callee || Callee->isDeclaration()) {
// If this is a call to an external function, we can't do much. Make
// conservative guesses.
@@ -874,7 +926,9 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
MaxVGPR = std::max(MaxVGPR, 23);
MaxAGPR = std::max(MaxAGPR, 23);
- CalleeFrameSize = std::max(CalleeFrameSize, UINT64_C(16384));
+ CalleeFrameSize = std::max(CalleeFrameSize,
+ static_cast<uint64_t>(AssumedStackSizeForExternalCall));
+
Info.UsesVCC = true;
Info.UsesFlatScratch = ST.hasFlatAddressSpace();
Info.HasDynamicallySizedStack = true;
@@ -906,7 +960,8 @@ AMDGPUAsmPrinter::SIFunctionResourceInfo AMDGPUAsmPrinter::analyzeResourceUsage(
Info.HasRecursion |= I->second.HasRecursion;
}
- if (!Callee->doesNotRecurse())
+ // FIXME: Call site could have norecurse on it
+ if (!Callee || !Callee->doesNotRecurse())
Info.HasRecursion = true;
}
}
@@ -1108,7 +1163,7 @@ void AMDGPUAsmPrinter::getSIProgramInfo(SIProgramInfo &ProgInfo,
S_00B84C_LDS_SIZE(STM.isAmdHsaOS() ? 0 : ProgInfo.LDSBlocks) |
S_00B84C_EXCP_EN(0);
- ProgInfo.Occupancy = STM.computeOccupancy(MF, ProgInfo.LDSSize,
+ ProgInfo.Occupancy = STM.computeOccupancy(MF.getFunction(), ProgInfo.LDSSize,
ProgInfo.NumSGPRsForWavesPerEU,
ProgInfo.NumVGPRsForWavesPerEU);
}
@@ -1132,40 +1187,41 @@ void AMDGPUAsmPrinter::EmitProgramInfoSI(const MachineFunction &MF,
unsigned RsrcReg = getRsrcReg(MF.getFunction().getCallingConv());
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_00B848_COMPUTE_PGM_RSRC1, 4);
+ OutStreamer->emitInt32(R_00B848_COMPUTE_PGM_RSRC1);
- OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc1, 4);
+ OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc1);
- OutStreamer->EmitIntValue(R_00B84C_COMPUTE_PGM_RSRC2, 4);
- OutStreamer->EmitIntValue(CurrentProgramInfo.ComputePGMRSrc2, 4);
+ OutStreamer->emitInt32(R_00B84C_COMPUTE_PGM_RSRC2);
+ OutStreamer->emitInt32(CurrentProgramInfo.ComputePGMRSrc2);
- OutStreamer->EmitIntValue(R_00B860_COMPUTE_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
+ OutStreamer->emitInt32(R_00B860_COMPUTE_TMPRING_SIZE);
+ OutStreamer->emitInt32(S_00B860_WAVESIZE(CurrentProgramInfo.ScratchBlocks));
// TODO: Should probably note flat usage somewhere. SC emits a "FlatPtr32 =
// 0" comment but I don't see a corresponding field in the register spec.
} else {
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
+ OutStreamer->emitInt32(RsrcReg);
+ OutStreamer->emitIntValue(S_00B028_VGPRS(CurrentProgramInfo.VGPRBlocks) |
S_00B028_SGPRS(CurrentProgramInfo.SGPRBlocks), 4);
- OutStreamer->EmitIntValue(R_0286E8_SPI_TMPRING_SIZE, 4);
- OutStreamer->EmitIntValue(
+ OutStreamer->emitInt32(R_0286E8_SPI_TMPRING_SIZE);
+ OutStreamer->emitIntValue(
S_0286E8_WAVESIZE(CurrentProgramInfo.ScratchBlocks), 4);
}
if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS) {
- OutStreamer->EmitIntValue(R_00B02C_SPI_SHADER_PGM_RSRC2_PS, 4);
- OutStreamer->EmitIntValue(S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks), 4);
- OutStreamer->EmitIntValue(R_0286CC_SPI_PS_INPUT_ENA, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputEnable(), 4);
- OutStreamer->EmitIntValue(R_0286D0_SPI_PS_INPUT_ADDR, 4);
- OutStreamer->EmitIntValue(MFI->getPSInputAddr(), 4);
+ OutStreamer->emitInt32(R_00B02C_SPI_SHADER_PGM_RSRC2_PS);
+ OutStreamer->emitInt32(
+ S_00B02C_EXTRA_LDS_SIZE(CurrentProgramInfo.LDSBlocks));
+ OutStreamer->emitInt32(R_0286CC_SPI_PS_INPUT_ENA);
+ OutStreamer->emitInt32(MFI->getPSInputEnable());
+ OutStreamer->emitInt32(R_0286D0_SPI_PS_INPUT_ADDR);
+ OutStreamer->emitInt32(MFI->getPSInputAddr());
}
- OutStreamer->EmitIntValue(R_SPILLED_SGPRS, 4);
- OutStreamer->EmitIntValue(MFI->getNumSpilledSGPRs(), 4);
- OutStreamer->EmitIntValue(R_SPILLED_VGPRS, 4);
- OutStreamer->EmitIntValue(MFI->getNumSpilledVGPRs(), 4);
+ OutStreamer->emitInt32(R_SPILLED_SGPRS);
+ OutStreamer->emitInt32(MFI->getNumSpilledSGPRs());
+ OutStreamer->emitInt32(R_SPILLED_VGPRS);
+ OutStreamer->emitInt32(MFI->getNumSpilledVGPRs());
}
// This is the equivalent of EmitProgramInfoSI above, but for when the OS type
@@ -1304,7 +1360,18 @@ bool AMDGPUAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
AMDGPUInstPrinter::printRegOperand(MO.getReg(), O,
*MF->getSubtarget().getRegisterInfo());
return false;
+ } else if (MO.isImm()) {
+ int64_t Val = MO.getImm();
+ if (AMDGPU::isInlinableIntLiteral(Val)) {
+ O << Val;
+ } else if (isUInt<16>(Val)) {
+ O << format("0x%" PRIx16, static_cast<uint16_t>(Val));
+ } else if (isUInt<32>(Val)) {
+ O << format("0x%" PRIx32, static_cast<uint32_t>(Val));
+ } else {
+ O << format("0x%" PRIx64, static_cast<uint64_t>(Val));
+ }
+ return false;
}
-
return true;
}
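
The PrintAsmOperand change above prints non-inlinable immediates with the narrowest hex width that fits the value. A minimal standalone sketch of that width selection (plain C++, illustrative only, outside the AsmPrinter):

    #include <cinttypes>
    #include <cstdint>
    #include <cstdio>

    // Sketch only: choose a 16-, 32- or 64-bit hex format for an immediate,
    // mirroring the isUInt<16>/isUInt<32> checks in PrintAsmOperand above.
    static void printHexImm(uint64_t Val) {
      if (Val <= UINT16_MAX)
        std::printf("0x%" PRIx16 "\n", static_cast<uint16_t>(Val));
      else if (Val <= UINT32_MAX)
        std::printf("0x%" PRIx32 "\n", static_cast<uint32_t>(Val));
      else
        std::printf("0x%" PRIx64 "\n", Val);
    }

The two cl::opt flags introduced earlier in this file should likewise be overridable on the llc command line (e.g. -amdgpu-assume-external-call-stack-size=<bytes>), since they are ordinary hidden options.
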
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
index c50c19a4609c..54e8338ab4b0 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAsmPrinter.h
@@ -121,21 +121,21 @@ public:
const MachineInstr *MI);
/// Implemented in AMDGPUMCInstLower.cpp
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
- void EmitFunctionBodyStart() override;
+ void emitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
+ void emitFunctionBodyEnd() override;
- void EmitFunctionEntryLabel() override;
+ void emitFunctionEntryLabel() override;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
+ void emitBasicBlockStart(const MachineBasicBlock &MBB) override;
- void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
- void EmitStartOfAsmFile(Module &M) override;
+ void emitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const override;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index 59aa0ea98aa7..c9d25d4250d5 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -438,7 +438,7 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
Type *const Ty = I.getType();
const unsigned TyBitWidth = DL->getTypeSizeInBits(Ty);
- Type *const VecTy = VectorType::get(B.getInt32Ty(), 2);
+ auto *const VecTy = FixedVectorType::get(B.getInt32Ty(), 2);
// This is the value in the atomic operation we need to combine in order to
// reduce the number of atomic operations.
@@ -447,9 +447,8 @@ void AMDGPUAtomicOptimizer::optimizeAtomic(Instruction &I,
// We need to know how many lanes are active within the wavefront, and we do
// this by doing a ballot of active lanes.
Type *const WaveTy = B.getIntNTy(ST->getWavefrontSize());
- CallInst *const Ballot = B.CreateIntrinsic(
- Intrinsic::amdgcn_icmp, {WaveTy, B.getInt32Ty()},
- {B.getInt32(1), B.getInt32(0), B.getInt32(CmpInst::ICMP_NE)});
+ CallInst *const Ballot =
+ B.CreateIntrinsic(Intrinsic::amdgcn_ballot, WaveTy, B.getTrue());
// We need to know how many lanes are active within the wavefront that are
// below us. If we counted each lane linearly starting from 0, a lane is
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
index c657ca71bfdf..05a4e3462a26 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUISelLowering.h"
#include "AMDGPUSubtarget.h"
+#include "AMDGPUTargetMachine.h"
#include "SIISelLowering.h"
#include "SIMachineFunctionInfo.h"
#include "SIRegisterInfo.h"
@@ -59,6 +60,18 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
} else
ExtReg = extendRegister(ValVReg, VA);
+ // If this is a scalar return, insert a readfirstlane just in case the value
+ // ends up in a VGPR.
+ // FIXME: Assert this is a shader return.
+ const SIRegisterInfo *TRI
+ = static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
+ if (TRI->isSGPRReg(MRI, PhysReg)) {
+ auto ToSGPR = MIRBuilder.buildIntrinsic(Intrinsic::amdgcn_readfirstlane,
+ {MRI.getType(ExtReg)}, false)
+ .addReg(ExtReg);
+ ExtReg = ToSGPR.getReg(0);
+ }
+
MIRBuilder.buildCopy(PhysReg, ExtReg);
MIB.addUse(PhysReg, RegState::Implicit);
}
@@ -84,11 +97,10 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
auto &MFI = MIRBuilder.getMF().getFrameInfo();
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg = MRI.createGenericVirtualRegister(
- LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
+ auto AddrReg = MIRBuilder.buildFrameIndex(
+ LLT::pointer(AMDGPUAS::PRIVATE_ADDRESS, 32), FI);
StackUsed = std::max(StackUsed, Size + Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -119,9 +131,12 @@ struct IncomingArgHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
+
// FIXME: Get alignment
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size, 1);
+ auto MMO = MF.getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
@@ -150,10 +165,26 @@ AMDGPUCallLowering::AMDGPUCallLowering(const AMDGPUTargetLowering &TLI)
: CallLowering(&TLI) {
}
+// FIXME: Compatibility shim
+static ISD::NodeType extOpcodeToISDExtOpcode(unsigned MIOpc) {
+ switch (MIOpc) {
+ case TargetOpcode::G_SEXT:
+ return ISD::SIGN_EXTEND;
+ case TargetOpcode::G_ZEXT:
+ return ISD::ZERO_EXTEND;
+ case TargetOpcode::G_ANYEXT:
+ return ISD::ANY_EXTEND;
+ default:
+ llvm_unreachable("not an extend opcode");
+ }
+}
+
void AMDGPUCallLowering::splitToValueTypes(
- const ArgInfo &OrigArg, SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI, CallingConv::ID CallConv,
- SplitArgTy PerformArgSplit) const {
+ MachineIRBuilder &B,
+ const ArgInfo &OrigArg, unsigned OrigArgIdx,
+ SmallVectorImpl<ArgInfo> &SplitArgs,
+ const DataLayout &DL, CallingConv::ID CallConv,
+ SplitArgTy PerformArgSplit) const {
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
LLVMContext &Ctx = OrigArg.Ty->getContext();
@@ -167,28 +198,46 @@ void AMDGPUCallLowering::splitToValueTypes(
int SplitIdx = 0;
for (EVT VT : SplitVTs) {
- unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+ Register Reg = OrigArg.Regs[SplitIdx];
Type *Ty = VT.getTypeForEVT(Ctx);
+ LLT LLTy = getLLTForType(*Ty, DL);
+ if (OrigArgIdx == AttributeList::ReturnIndex && VT.isScalarInteger()) {
+ unsigned ExtendOp = TargetOpcode::G_ANYEXT;
+ if (OrigArg.Flags[0].isSExt()) {
+ assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
+ ExtendOp = TargetOpcode::G_SEXT;
+ } else if (OrigArg.Flags[0].isZExt()) {
+ assert(OrigArg.Regs.size() == 1 && "expect only simple return values");
+ ExtendOp = TargetOpcode::G_ZEXT;
+ }
+ EVT ExtVT = TLI.getTypeForExtReturn(Ctx, VT,
+ extOpcodeToISDExtOpcode(ExtendOp));
+ if (ExtVT != VT) {
+ VT = ExtVT;
+ Ty = ExtVT.getTypeForEVT(Ctx);
+ LLTy = getLLTForType(*Ty, DL);
+ Reg = B.buildInstr(ExtendOp, {LLTy}, {Reg}).getReg(0);
+ }
+ }
+
+ unsigned NumParts = TLI.getNumRegistersForCallingConv(Ctx, CallConv, VT);
+ MVT RegVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
if (NumParts == 1) {
// No splitting to do, but we want to replace the original type (e.g. [1 x
// double] -> double).
- SplitArgs.emplace_back(OrigArg.Regs[SplitIdx], Ty,
- OrigArg.Flags, OrigArg.IsFixed);
+ SplitArgs.emplace_back(Reg, Ty, OrigArg.Flags, OrigArg.IsFixed);
++SplitIdx;
continue;
}
- LLT LLTy = getLLTForType(*Ty, DL);
-
SmallVector<Register, 8> SplitRegs;
-
- EVT PartVT = TLI.getRegisterTypeForCallingConv(Ctx, CallConv, VT);
- Type *PartTy = PartVT.getTypeForEVT(Ctx);
+ Type *PartTy = EVT(RegVT).getTypeForEVT(Ctx);
LLT PartLLT = getLLTForType(*PartTy, DL);
+ MachineRegisterInfo &MRI = *B.getMRI();
// FIXME: Should we be reporting all of the part registers for a single
// argument, and let handleAssignments take care of the repacking?
@@ -198,7 +247,7 @@ void AMDGPUCallLowering::splitToValueTypes(
SplitArgs.emplace_back(ArrayRef<Register>(PartReg), PartTy, OrigArg.Flags);
}
- PerformArgSplit(SplitRegs, LLTy, PartLLT, SplitIdx);
+ PerformArgSplit(SplitRegs, Reg, LLTy, PartLLT, SplitIdx);
++SplitIdx;
}
@@ -218,13 +267,11 @@ static LLT getMultipleType(LLT OrigTy, int Factor) {
static void unpackRegsToOrigType(MachineIRBuilder &B,
ArrayRef<Register> DstRegs,
Register SrcReg,
+ const CallLowering::ArgInfo &Info,
LLT SrcTy,
LLT PartTy) {
assert(DstRegs.size() > 1 && "Nothing to unpack");
- MachineFunction &MF = B.getMF();
- MachineRegisterInfo &MRI = MF.getRegInfo();
-
const unsigned SrcSize = SrcTy.getSizeInBits();
const unsigned PartSize = PartTy.getSizeInBits();
@@ -248,12 +295,11 @@ static void unpackRegsToOrigType(MachineIRBuilder &B,
LLT BigTy = getMultipleType(PartTy, NumRoundedParts);
auto ImpDef = B.buildUndef(BigTy);
- Register BigReg = MRI.createGenericVirtualRegister(BigTy);
- B.buildInsert(BigReg, ImpDef.getReg(0), SrcReg, 0).getReg(0);
+ auto Big = B.buildInsert(BigTy, ImpDef.getReg(0), SrcReg, 0).getReg(0);
int64_t Offset = 0;
for (unsigned i = 0, e = DstRegs.size(); i != e; ++i, Offset += PartSize)
- B.buildExtract(DstRegs[i], BigReg, Offset);
+ B.buildExtract(DstRegs[i], Big, Offset);
}
/// Lower the return value for the already existing \p Ret. This assumes that
@@ -267,24 +313,26 @@ bool AMDGPUCallLowering::lowerReturnVal(MachineIRBuilder &B,
auto &MF = B.getMF();
const auto &F = MF.getFunction();
const DataLayout &DL = MF.getDataLayout();
+ MachineRegisterInfo *MRI = B.getMRI();
CallingConv::ID CC = F.getCallingConv();
const SITargetLowering &TLI = *getTLI<SITargetLowering>();
- MachineRegisterInfo &MRI = MF.getRegInfo();
ArgInfo OrigRetInfo(VRegs, Val->getType());
setArgFlags(OrigRetInfo, AttributeList::ReturnIndex, DL, F);
SmallVector<ArgInfo, 4> SplitRetInfos;
splitToValueTypes(
- OrigRetInfo, SplitRetInfos, DL, MRI, CC,
- [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
- unpackRegsToOrigType(B, Regs, VRegs[VTSplitIdx], LLTy, PartLLT);
+ B, OrigRetInfo, AttributeList::ReturnIndex, SplitRetInfos, DL, CC,
+ [&](ArrayRef<Register> Regs, Register SrcReg, LLT LLTy, LLT PartLLT,
+ int VTSplitIdx) {
+ unpackRegsToOrigType(B, Regs, SrcReg,
+ SplitRetInfos[VTSplitIdx],
+ LLTy, PartLLT);
});
CCAssignFn *AssignFn = TLI.CCAssignFnForReturn(CC, F.isVarArg());
-
- OutgoingValueHandler RetHandler(B, MF.getRegInfo(), Ret, AssignFn);
+ OutgoingValueHandler RetHandler(B, *MRI, Ret, AssignFn);
return handleAssignments(B, SplitRetInfos, RetHandler);
}
@@ -309,7 +357,7 @@ bool AMDGPUCallLowering::lowerReturn(MachineIRBuilder &B,
return true;
}
- auto const &ST = B.getMF().getSubtarget<GCNSubtarget>();
+ auto const &ST = MF.getSubtarget<GCNSubtarget>();
unsigned ReturnOpc =
IsShader ? AMDGPU::SI_RETURN_TO_EPILOG : AMDGPU::S_SETPC_B64_return;
@@ -348,22 +396,17 @@ Register AMDGPUCallLowering::lowerParameterPtr(MachineIRBuilder &B,
const DataLayout &DL = F.getParent()->getDataLayout();
PointerType *PtrTy = PointerType::get(ParamTy, AMDGPUAS::CONSTANT_ADDRESS);
LLT PtrType = getLLTForType(*PtrTy, DL);
- Register DstReg = MRI.createGenericVirtualRegister(PtrType);
Register KernArgSegmentPtr =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
Register KernArgSegmentVReg = MRI.getLiveInVirtReg(KernArgSegmentPtr);
- Register OffsetReg = MRI.createGenericVirtualRegister(LLT::scalar(64));
- B.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = B.buildConstant(LLT::scalar(64), Offset);
- B.buildPtrAdd(DstReg, KernArgSegmentVReg, OffsetReg);
-
- return DstReg;
+ return B.buildPtrAdd(PtrType, KernArgSegmentVReg, OffsetReg).getReg(0);
}
-void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
- Type *ParamTy, uint64_t Offset,
- unsigned Align,
+void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B, Type *ParamTy,
+ uint64_t Offset, Align Alignment,
Register DstReg) const {
MachineFunction &MF = B.getMF();
const Function &F = MF.getFunction();
@@ -372,11 +415,11 @@ void AMDGPUCallLowering::lowerParameter(MachineIRBuilder &B,
unsigned TypeSize = DL.getTypeStoreSize(ParamTy);
Register PtrReg = lowerParameterPtr(B, ParamTy, Offset);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(PtrInfo, MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- TypeSize, Align);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ TypeSize, Alignment);
B.buildLoad(DstReg, PtrReg, *MMO);
}
@@ -389,19 +432,19 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
SIMachineFunctionInfo &Info) {
// FIXME: How should these inputs interact with inreg / custom SGPR inputs?
if (Info.hasPrivateSegmentBuffer()) {
- unsigned PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
+ Register PrivateSegmentBufferReg = Info.addPrivateSegmentBuffer(TRI);
MF.addLiveIn(PrivateSegmentBufferReg, &AMDGPU::SGPR_128RegClass);
CCInfo.AllocateReg(PrivateSegmentBufferReg);
}
if (Info.hasDispatchPtr()) {
- unsigned DispatchPtrReg = Info.addDispatchPtr(TRI);
+ Register DispatchPtrReg = Info.addDispatchPtr(TRI);
MF.addLiveIn(DispatchPtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchPtrReg);
}
if (Info.hasQueuePtr()) {
- unsigned QueuePtrReg = Info.addQueuePtr(TRI);
+ Register QueuePtrReg = Info.addQueuePtr(TRI);
MF.addLiveIn(QueuePtrReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(QueuePtrReg);
}
@@ -418,13 +461,13 @@ static void allocateHSAUserSGPRs(CCState &CCInfo,
}
if (Info.hasDispatchID()) {
- unsigned DispatchIDReg = Info.addDispatchID(TRI);
+ Register DispatchIDReg = Info.addDispatchID(TRI);
MF.addLiveIn(DispatchIDReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(DispatchIDReg);
}
if (Info.hasFlatScratchInit()) {
- unsigned FlatScratchInitReg = Info.addFlatScratchInit(TRI);
+ Register FlatScratchInitReg = Info.addFlatScratchInit(TRI);
MF.addLiveIn(FlatScratchInitReg, &AMDGPU::SGPR_64RegClass);
CCInfo.AllocateReg(FlatScratchInitReg);
}
@@ -451,7 +494,7 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
allocateHSAUserSGPRs(CCInfo, B, MF, *TRI, *Info);
unsigned i = 0;
- const unsigned KernArgBaseAlign = 16;
+ const Align KernArgBaseAlign(16);
const unsigned BaseOffset = Subtarget->getExplicitKernelArgOffset(F);
uint64_t ExplicitArgOffset = 0;
@@ -462,19 +505,24 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
if (AllocSize == 0)
continue;
- unsigned ABIAlign = DL.getABITypeAlignment(ArgTy);
+ Align ABIAlign = DL.getABITypeAlign(ArgTy);
uint64_t ArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + BaseOffset;
ExplicitArgOffset = alignTo(ExplicitArgOffset, ABIAlign) + AllocSize;
+ if (Arg.use_empty()) {
+ ++i;
+ continue;
+ }
+
ArrayRef<Register> OrigArgRegs = VRegs[i];
Register ArgReg =
OrigArgRegs.size() == 1
? OrigArgRegs[0]
: MRI.createGenericVirtualRegister(getLLTForType(*ArgTy, DL));
- unsigned Align = MinAlign(KernArgBaseAlign, ArgOffset);
- ArgOffset = alignTo(ArgOffset, DL.getABITypeAlignment(ArgTy));
- lowerParameter(B, ArgTy, ArgOffset, Align, ArgReg);
+
+ Align Alignment = commonAlignment(KernArgBaseAlign, ArgOffset);
+ lowerParameter(B, ArgTy, ArgOffset, Alignment, ArgReg);
if (OrigArgRegs.size() > 1)
unpackRegs(OrigArgRegs, ArgReg, ArgTy, B);
++i;
@@ -485,38 +533,72 @@ bool AMDGPUCallLowering::lowerFormalArgumentsKernel(
return true;
}
+/// Pack values \p SrcRegs to cover the vector type result \p DstRegs.
+static MachineInstrBuilder mergeVectorRegsToResultRegs(
+ MachineIRBuilder &B, ArrayRef<Register> DstRegs, ArrayRef<Register> SrcRegs) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ LLT LLTy = MRI.getType(DstRegs[0]);
+ LLT PartLLT = MRI.getType(SrcRegs[0]);
+
+ // Deal with v3s16 split into v2s16
+ LLT LCMTy = getLCMType(LLTy, PartLLT);
+ if (LCMTy == LLTy) {
+ // Common case where no padding is needed.
+ assert(DstRegs.size() == 1);
+ return B.buildConcatVectors(DstRegs[0], SrcRegs);
+ }
+
+ const int NumWide = LCMTy.getSizeInBits() / PartLLT.getSizeInBits();
+ Register Undef = B.buildUndef(PartLLT).getReg(0);
+
+ // Build vector of undefs.
+ SmallVector<Register, 8> WidenedSrcs(NumWide, Undef);
+
+ // Replace the first sources with the real registers.
+ std::copy(SrcRegs.begin(), SrcRegs.end(), WidenedSrcs.begin());
+
+ auto Widened = B.buildConcatVectors(LCMTy, WidenedSrcs);
+ int NumDst = LCMTy.getSizeInBits() / LLTy.getSizeInBits();
+
+ SmallVector<Register, 8> PadDstRegs(NumDst);
+ std::copy(DstRegs.begin(), DstRegs.end(), PadDstRegs.begin());
+
+ // Create the excess dead defs for the unmerge.
+ for (int I = DstRegs.size(); I != NumDst; ++I)
+ PadDstRegs[I] = MRI.createGenericVirtualRegister(LLTy);
+
+ return B.buildUnmerge(PadDstRegs, Widened);
+}
+
// TODO: Move this to generic code
static void packSplitRegsToOrigType(MachineIRBuilder &B,
ArrayRef<Register> OrigRegs,
ArrayRef<Register> Regs,
LLT LLTy,
LLT PartLLT) {
- if (!LLTy.isVector() && !PartLLT.isVector()) {
- B.buildMerge(OrigRegs[0], Regs);
- return;
- }
+ MachineRegisterInfo &MRI = *B.getMRI();
- if (LLTy.isVector() && PartLLT.isVector()) {
- assert(LLTy.getElementType() == PartLLT.getElementType());
+ if (!LLTy.isVector() && !PartLLT.isVector()) {
+ assert(OrigRegs.size() == 1);
+ LLT OrigTy = MRI.getType(OrigRegs[0]);
- int DstElts = LLTy.getNumElements();
- int PartElts = PartLLT.getNumElements();
- if (DstElts % PartElts == 0)
- B.buildConcatVectors(OrigRegs[0], Regs);
+ unsigned SrcSize = PartLLT.getSizeInBits() * Regs.size();
+ if (SrcSize == OrigTy.getSizeInBits())
+ B.buildMerge(OrigRegs[0], Regs);
else {
- // Deal with v3s16 split into v2s16
- assert(PartElts == 2 && DstElts % 2 != 0);
- int RoundedElts = PartElts * ((DstElts + PartElts - 1) / PartElts);
-
- LLT RoundedDestTy = LLT::vector(RoundedElts, PartLLT.getElementType());
- auto RoundedConcat = B.buildConcatVectors(RoundedDestTy, Regs);
- B.buildExtract(OrigRegs[0], RoundedConcat, 0);
+ auto Widened = B.buildMerge(LLT::scalar(SrcSize), Regs);
+ B.buildTrunc(OrigRegs[0], Widened);
}
return;
}
- MachineRegisterInfo &MRI = *B.getMRI();
+ if (LLTy.isVector() && PartLLT.isVector()) {
+ assert(OrigRegs.size() == 1);
+ assert(LLTy.getElementType() == PartLLT.getElementType());
+ mergeVectorRegsToResultRegs(B, OrigRegs, Regs);
+ return;
+ }
assert(LLTy.isVector() && !PartLLT.isVector());
@@ -644,13 +726,16 @@ bool AMDGPUCallLowering::lowerFormalArguments(
}
ArgInfo OrigArg(VRegs[Idx], Arg.getType());
- setArgFlags(OrigArg, Idx + AttributeList::FirstArgIndex, DL, F);
+ const unsigned OrigArgIdx = Idx + AttributeList::FirstArgIndex;
+ setArgFlags(OrigArg, OrigArgIdx, DL, F);
splitToValueTypes(
- OrigArg, SplitArgs, DL, MRI, CC,
+ B, OrigArg, OrigArgIdx, SplitArgs, DL, CC,
// FIXME: We should probably be passing multiple registers to
// handleAssignments to do this
- [&](ArrayRef<Register> Regs, LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ [&](ArrayRef<Register> Regs, Register DstReg,
+ LLT LLTy, LLT PartLLT, int VTSplitIdx) {
+ assert(DstReg == VRegs[Idx][VTSplitIdx]);
packSplitRegsToOrigType(B, VRegs[Idx][VTSplitIdx], Regs,
LLTy, PartLLT);
});
@@ -705,11 +790,17 @@ bool AMDGPUCallLowering::lowerFormalArguments(
if (!MBB.empty())
B.setInstr(*MBB.begin());
+ if (!IsEntryFunc) {
+ // For the fixed ABI, pass workitem IDs in the last argument register.
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ TLI.allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
+ }
+
FormalArgHandler Handler(B, MRI, AssignFn);
if (!handleAssignments(CCInfo, ArgLocs, B, SplitArgs, Handler))
return false;
- if (!IsEntryFunc) {
+ if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
TLI.allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -719,8 +810,6 @@ bool AMDGPUCallLowering::lowerFormalArguments(
TLI.allocateSystemSGPRs(CCInfo, MF, *Info, CC, IsShader);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
- CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
- CCInfo.AllocateReg(Info->getFrameOffsetReg());
TLI.allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
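
The mergeVectorRegsToResultRegs helper added above handles results such as <3 x s16> built from <2 x s16> pieces by widening to the least common multiple type, concatenating, and then unmerging into the real definition plus dead padding defs. The snippet below is a minimal sketch of only that LCM arithmetic, using plain integers and std::lcm instead of the LLT API; the names are illustrative and not part of the patch.

#include <cassert>
#include <numeric>

int main() {
  const int DstBits  = 3 * 16; // a <3 x s16> result register
  const int PartBits = 2 * 16; // produced as <2 x s16> pieces

  // LCM type covering both: <6 x s16>, i.e. 96 bits.
  const int LCMBits = std::lcm(DstBits, PartBits);

  const int NumWide = LCMBits / PartBits; // 3 concat sources: 2 real + 1 undef
  const int NumDst  = LCMBits / DstBits;  // 2 unmerge results: 1 real + 1 dead
  assert(LCMBits == 96 && NumWide == 3 && NumDst == 2);
  return 0;
}
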
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
index 53a562586bc0..446619d1502e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallLowering.h
@@ -27,14 +27,16 @@ class AMDGPUCallLowering: public CallLowering {
uint64_t Offset) const;
void lowerParameter(MachineIRBuilder &B, Type *ParamTy, uint64_t Offset,
- unsigned Align, Register DstReg) const;
+ Align Alignment, Register DstReg) const;
/// A function of this type is used to perform value split action.
- using SplitArgTy = std::function<void(ArrayRef<Register>, LLT, LLT, int)>;
+ using SplitArgTy = std::function<void(ArrayRef<Register>, Register, LLT, LLT, int)>;
- void splitToValueTypes(const ArgInfo &OrigArgInfo,
+ void splitToValueTypes(MachineIRBuilder &B,
+ const ArgInfo &OrigArgInfo,
+ unsigned OrigArgIdx,
SmallVectorImpl<ArgInfo> &SplitArgs,
- const DataLayout &DL, MachineRegisterInfo &MRI,
+ const DataLayout &DL,
CallingConv::ID CallConv,
SplitArgTy SplitArg) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
index f8a54a61aac2..7c83b6dcb44b 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCallingConv.td
@@ -18,7 +18,7 @@ class CCIfExtend<CCAction A>
// Calling convention for SI
def CC_SI : CallingConv<[
- CCIfInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -28,7 +28,7 @@ def CC_SI : CallingConv<[
]>>>,
// 32*4 + 4 is the minimum for a fetch shader consumer with 32 inputs.
- CCIfNotInReg<CCIfType<[f32, i32, f16, v2i16, v2f16] , CCAssignToReg<[
+ CCIfNotInReg<CCIfType<[f32, i32, f16, i16, v2i16, v2f16] , CCAssignToReg<[
VGPR0, VGPR1, VGPR2, VGPR3, VGPR4, VGPR5, VGPR6, VGPR7,
VGPR8, VGPR9, VGPR10, VGPR11, VGPR12, VGPR13, VGPR14, VGPR15,
VGPR16, VGPR17, VGPR18, VGPR19, VGPR20, VGPR21, VGPR22, VGPR23,
@@ -50,7 +50,7 @@ def CC_SI : CallingConv<[
]>;
def RetCC_SI_Shader : CallingConv<[
- CCIfType<[i32] , CCAssignToReg<[
+ CCIfType<[i32, i16] , CCAssignToReg<[
SGPR0, SGPR1, SGPR2, SGPR3, SGPR4, SGPR5, SGPR6, SGPR7,
SGPR8, SGPR9, SGPR10, SGPR11, SGPR12, SGPR13, SGPR14, SGPR15,
SGPR16, SGPR17, SGPR18, SGPR19, SGPR20, SGPR21, SGPR22, SGPR23,
@@ -89,6 +89,24 @@ def CSR_AMDGPU_VGPRs_32_255 : CalleeSavedRegs<
(sequence "VGPR%u", 32, 255)
>;
+def CSR_AMDGPU_VGPRs : CalleeSavedRegs<
+ // The CSRs and scratch registers are interleaved at a split boundary of 8.
+ (add (sequence "VGPR%u", 40, 47),
+ (sequence "VGPR%u", 56, 63),
+ (sequence "VGPR%u", 72, 79),
+ (sequence "VGPR%u", 88, 95),
+ (sequence "VGPR%u", 104, 111),
+ (sequence "VGPR%u", 120, 127),
+ (sequence "VGPR%u", 136, 143),
+ (sequence "VGPR%u", 152, 159),
+ (sequence "VGPR%u", 168, 175),
+ (sequence "VGPR%u", 184, 191),
+ (sequence "VGPR%u", 200, 207),
+ (sequence "VGPR%u", 216, 223),
+ (sequence "VGPR%u", 232, 239),
+ (sequence "VGPR%u", 248, 255))
+>;
+
def CSR_AMDGPU_SGPRs_32_105 : CalleeSavedRegs<
(sequence "SGPR%u", 32, 105)
>;
@@ -104,7 +122,7 @@ def CSR_AMDGPU_AllAllocatableSRegs : CalleeSavedRegs<
>;
def CSR_AMDGPU_HighRegs : CalleeSavedRegs<
- (add CSR_AMDGPU_VGPRs_32_255, CSR_AMDGPU_SGPRs_32_105)
+ (add CSR_AMDGPU_VGPRs, CSR_AMDGPU_SGPRs_32_105)
>;
// Calling convention for leaf functions
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index cf908766caa0..a79549301740 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -15,8 +15,10 @@
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
#include "llvm/Analysis/Loads.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -26,6 +28,7 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -41,6 +44,7 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/IntegerDivision.h"
#include <cassert>
#include <iterator>
@@ -54,7 +58,7 @@ static cl::opt<bool> WidenLoads(
"amdgpu-codegenprepare-widen-constant-loads",
cl::desc("Widen sub-dword constant address space loads in AMDGPUCodeGenPrepare"),
cl::ReallyHidden,
- cl::init(true));
+ cl::init(false));
static cl::opt<bool> UseMul24Intrin(
"amdgpu-codegenprepare-mul24",
@@ -62,10 +66,26 @@ static cl::opt<bool> UseMul24Intrin(
cl::ReallyHidden,
cl::init(true));
+// Legalize 64-bit division by using the generic IR expansion.
+static cl::opt<bool> ExpandDiv64InIR(
+ "amdgpu-codegenprepare-expand-div64",
+ cl::desc("Expand 64-bit division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
+// Leave all division operations as they are. This supersedes ExpandDiv64InIR
+// and is used for testing the legalizer.
+static cl::opt<bool> DisableIDivExpand(
+ "amdgpu-codegenprepare-disable-idiv-expansion",
+ cl::desc("Prevent expanding integer division in AMDGPUCodeGenPrepare"),
+ cl::ReallyHidden,
+ cl::init(false));
+
class AMDGPUCodeGenPrepare : public FunctionPass,
public InstVisitor<AMDGPUCodeGenPrepare, bool> {
const GCNSubtarget *ST = nullptr;
AssumptionCache *AC = nullptr;
+ DominatorTree *DT = nullptr;
LegacyDivergenceAnalysis *DA = nullptr;
Module *Mod = nullptr;
const DataLayout *DL = nullptr;
@@ -152,15 +172,33 @@ class AMDGPUCodeGenPrepare : public FunctionPass,
/// SelectionDAG has an issue where an and asserting the bits are known
bool replaceMulWithMul24(BinaryOperator &I) const;
+ /// Perform the same fold as the equivalently named function in DAGCombiner.
+ /// Since we expand some divisions here, we need to perform this before the
+ /// expansion obscures the select.
+ bool foldBinOpIntoSelect(BinaryOperator &I) const;
+
+ bool divHasSpecialOptimization(BinaryOperator &I,
+ Value *Num, Value *Den) const;
+ int getDivNumBits(BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned AtLeast, bool Signed) const;
+
/// Expands 24 bit div or rem.
Value* expandDivRem24(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den,
bool IsDiv, bool IsSigned) const;
+ Value *expandDivRem24Impl(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den, unsigned NumBits,
+ bool IsDiv, bool IsSigned) const;
+
/// Expands 32 bit div or rem.
Value* expandDivRem32(IRBuilder<> &Builder, BinaryOperator &I,
Value *Num, Value *Den) const;
+ Value *shrinkDivRem64(IRBuilder<> &Builder, BinaryOperator &I,
+ Value *Num, Value *Den) const;
+ void expandDivRem64(BinaryOperator &I) const;
+
/// Widen a scalar load.
///
/// \details \p Widen scalar load for uniform, small type loads from constant
@@ -195,7 +233,10 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<LegacyDivergenceAnalysis>();
- AU.setPreservesAll();
+
+ // FIXME: Division expansion needs to preserve the dominator tree.
+ if (!ExpandDiv64InIR)
+ AU.setPreservesAll();
}
};
@@ -214,7 +255,7 @@ Type *AMDGPUCodeGenPrepare::getI32Ty(IRBuilder<> &B, const Type *T) const {
if (T->isIntegerTy())
return B.getInt32Ty();
- return VectorType::get(B.getInt32Ty(), cast<VectorType>(T)->getNumElements());
+ return FixedVectorType::get(B.getInt32Ty(), cast<FixedVectorType>(T));
}
bool AMDGPUCodeGenPrepare::isSigned(const BinaryOperator &I) const {
@@ -276,10 +317,9 @@ bool AMDGPUCodeGenPrepare::canWidenScalarExtLoad(LoadInst &I) const {
Type *Ty = I.getType();
const DataLayout &DL = Mod->getDataLayout();
int TySize = DL.getTypeSizeInBits(Ty);
- unsigned Align = I.getAlignment() ?
- I.getAlignment() : DL.getABITypeAlignment(Ty);
+ Align Alignment = DL.getValueOrABITypeAlignment(I.getAlign(), Ty);
- return I.isSimple() && TySize < 32 && Align >= 4 && DA->isUniform(&I);
+ return I.isSimple() && TySize < 32 && Alignment >= 4 && DA->isUniform(&I);
}
bool AMDGPUCodeGenPrepare::promoteUniformOpToI32(BinaryOperator &I) const {
@@ -436,7 +476,7 @@ bool AMDGPUCodeGenPrepare::isU24(Value *V, unsigned ScalarSize) const {
static void extractValues(IRBuilder<> &Builder,
SmallVectorImpl<Value *> &Values, Value *V) {
- VectorType *VT = dyn_cast<VectorType>(V->getType());
+ auto *VT = dyn_cast<FixedVectorType>(V->getType());
if (!VT) {
Values.push_back(V);
return;
@@ -525,58 +565,218 @@ bool AMDGPUCodeGenPrepare::replaceMulWithMul24(BinaryOperator &I) const {
return true;
}
-static bool shouldKeepFDivF32(Value *Num, bool UnsafeDiv, bool HasDenormals) {
- const ConstantFP *CNum = dyn_cast<ConstantFP>(Num);
- if (!CNum)
- return HasDenormals;
+// Find a select instruction, which may have been cast. This is mostly to deal
+// with cases where i16 selects were promoted here to i32.
+static SelectInst *findSelectThroughCast(Value *V, CastInst *&Cast) {
+ Cast = nullptr;
+ if (SelectInst *Sel = dyn_cast<SelectInst>(V))
+ return Sel;
- if (UnsafeDiv)
- return true;
+ if ((Cast = dyn_cast<CastInst>(V))) {
+ if (SelectInst *Sel = dyn_cast<SelectInst>(Cast->getOperand(0)))
+ return Sel;
+ }
+
+ return nullptr;
+}
+
+bool AMDGPUCodeGenPrepare::foldBinOpIntoSelect(BinaryOperator &BO) const {
+ // Don't do this unless the old select is going away. We want to eliminate the
+ // binary operator, not replace a binop with a select.
+ int SelOpNo = 0;
+
+ CastInst *CastOp;
+
+ // TODO: Should probably try to handle some cases with multiple
+ // users. Duplicating the select may be profitable for division.
+ SelectInst *Sel = findSelectThroughCast(BO.getOperand(0), CastOp);
+ if (!Sel || !Sel->hasOneUse()) {
+ SelOpNo = 1;
+ Sel = findSelectThroughCast(BO.getOperand(1), CastOp);
+ }
+
+ if (!Sel || !Sel->hasOneUse())
+ return false;
+
+ Constant *CT = dyn_cast<Constant>(Sel->getTrueValue());
+ Constant *CF = dyn_cast<Constant>(Sel->getFalseValue());
+ Constant *CBO = dyn_cast<Constant>(BO.getOperand(SelOpNo ^ 1));
+ if (!CBO || !CT || !CF)
+ return false;
+
+ if (CastOp) {
+ if (!CastOp->hasOneUse())
+ return false;
+ CT = ConstantFoldCastOperand(CastOp->getOpcode(), CT, BO.getType(), *DL);
+ CF = ConstantFoldCastOperand(CastOp->getOpcode(), CF, BO.getType(), *DL);
+ }
+
+ // TODO: Handle the special 0/-1 cases that the DAG combine does, although we
+ // only really need to handle divisions here.
+ Constant *FoldedT = SelOpNo ?
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CT, *DL) :
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CT, CBO, *DL);
+ if (isa<ConstantExpr>(FoldedT))
+ return false;
+
+ Constant *FoldedF = SelOpNo ?
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CBO, CF, *DL) :
+ ConstantFoldBinaryOpOperands(BO.getOpcode(), CF, CBO, *DL);
+ if (isa<ConstantExpr>(FoldedF))
+ return false;
+
+ IRBuilder<> Builder(&BO);
+ Builder.SetCurrentDebugLocation(BO.getDebugLoc());
+ if (const FPMathOperator *FPOp = dyn_cast<const FPMathOperator>(&BO))
+ Builder.setFastMathFlags(FPOp->getFastMathFlags());
+
+ Value *NewSelect = Builder.CreateSelect(Sel->getCondition(),
+ FoldedT, FoldedF);
+ NewSelect->takeName(&BO);
+ BO.replaceAllUsesWith(NewSelect);
+ BO.eraseFromParent();
+ if (CastOp)
+ CastOp->eraseFromParent();
+ Sel->eraseFromParent();
+ return true;
+}
+
+// Optimize fdiv with rcp:
+//
+// 1/x -> rcp(x) when rcp is sufficiently accurate, or when an inaccurate rcp
+// is allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+static Value *optimizeWithRcp(Value *Num, Value *Den, bool AllowInaccurateRcp,
+ bool RcpIsAccurate, IRBuilder<> &Builder,
+ Module *Mod) {
+
+ if (!AllowInaccurateRcp && !RcpIsAccurate)
+ return nullptr;
+
+ Type *Ty = Den->getType();
+ if (const ConstantFP *CLHS = dyn_cast<ConstantFP>(Num)) {
+ if (AllowInaccurateRcp || RcpIsAccurate) {
+ if (CLHS->isExactlyValue(1.0)) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
+
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+ // the CI documentation have a worst-case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
+
+ // NOTE: v_sqrt and v_rcp will be combined into v_rsq later, so we don't
+ // insert the rsq intrinsic here.
+
+ // 1.0 / x -> rcp(x)
+ return Builder.CreateCall(Decl, { Den });
+ }
+
+ // Same as for 1.0, but expand the sign out of the constant.
+ if (CLHS->isExactlyValue(-1.0)) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
+
+ // -1.0 / x -> rcp (fneg x)
+ Value *FNeg = Builder.CreateFNeg(Den);
+ return Builder.CreateCall(Decl, { FNeg });
+ }
+ }
+ }
- bool IsOne = CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0);
+ if (AllowInaccurateRcp) {
+ Function *Decl = Intrinsic::getDeclaration(
+ Mod, Intrinsic::amdgcn_rcp, Ty);
- // Reciprocal f32 is handled separately without denormals.
- return HasDenormals ^ IsOne;
+ // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y)
+ Value *Recip = Builder.CreateCall(Decl, { Den });
+ return Builder.CreateFMul(Num, Recip);
+ }
+ return nullptr;
+}
+
+// Optimize with fdiv.fast:
+//
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: optimizeWithRcp should be tried first because rcp is the preference.
+static Value *optimizeWithFDivFast(Value *Num, Value *Den, float ReqdAccuracy,
+ bool HasDenormals, IRBuilder<> &Builder,
+ Module *Mod) {
+ // fdiv.fast can achieve 2.5 ULP accuracy.
+ if (ReqdAccuracy < 2.5f)
+ return nullptr;
+
+ // Only have fdiv.fast for f32.
+ Type *Ty = Den->getType();
+ if (!Ty->isFloatTy())
+ return nullptr;
+
+ bool NumIsOne = false;
+ if (const ConstantFP *CNum = dyn_cast<ConstantFP>(Num)) {
+ if (CNum->isExactlyValue(+1.0) || CNum->isExactlyValue(-1.0))
+ NumIsOne = true;
+ }
+
+ // fdiv.fast does not support denormals, but 1.0/x is always fine to use.
+ if (HasDenormals && !NumIsOne)
+ return nullptr;
+
+ Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
+ return Builder.CreateCall(Decl, { Num, Den });
}
-// Insert an intrinsic for fast fdiv for safe math situations where we can
-// reduce precision. Leave fdiv for situations where the generic node is
-// expected to be optimized.
+// Optimization is performed based on fpmath, fast-math flags, and the denormal
+// mode to lower fdiv with either rcp or fdiv.fast.
+//
+// With rcp:
+// 1/x -> rcp(x) when rcp is sufficiently accurate, or when an inaccurate rcp
+// is allowed with unsafe-fp-math or afn.
+//
+// a/b -> a*rcp(b) when inaccurate rcp is allowed with unsafe-fp-math or afn.
+//
+// With fdiv.fast:
+// a/b -> fdiv.fast(a, b) when !fpmath >= 2.5ulp with denormals flushed.
+//
+// 1/x -> fdiv.fast(1,x) when !fpmath >= 2.5ulp.
+//
+// NOTE: rcp is the preference in cases where both are legal.
bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
- Type *Ty = FDiv.getType();
- if (!Ty->getScalarType()->isFloatTy())
- return false;
+ Type *Ty = FDiv.getType()->getScalarType();
- MDNode *FPMath = FDiv.getMetadata(LLVMContext::MD_fpmath);
- if (!FPMath)
+ // No intrinsic for fdiv16 if the target does not support f16.
+ if (Ty->isHalfTy() && !ST->has16BitInsts())
return false;
const FPMathOperator *FPOp = cast<const FPMathOperator>(&FDiv);
- float ULP = FPOp->getFPAccuracy();
- if (ULP < 2.5f)
- return false;
+ const float ReqdAccuracy = FPOp->getFPAccuracy();
+ // Inaccurate rcp is allowed with unsafe-fp-math or afn.
FastMathFlags FMF = FPOp->getFastMathFlags();
- bool UnsafeDiv = HasUnsafeFPMath || FMF.isFast() ||
- FMF.allowReciprocal();
+ const bool AllowInaccurateRcp = HasUnsafeFPMath || FMF.approxFunc();
- // With UnsafeDiv node will be optimized to just rcp and mul.
- if (UnsafeDiv)
- return false;
+ // rcp_f16 is accurate for !fpmath >= 1.0ulp.
+ // rcp_f32 is accurate for !fpmath >= 1.0ulp and denormals are flushed.
+ // rcp_f64 is never accurate.
+ const bool RcpIsAccurate = (Ty->isHalfTy() && ReqdAccuracy >= 1.0f) ||
+ (Ty->isFloatTy() && !HasFP32Denormals && ReqdAccuracy >= 1.0f);
- IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()), FPMath);
+ IRBuilder<> Builder(FDiv.getParent(), std::next(FDiv.getIterator()));
Builder.setFastMathFlags(FMF);
Builder.SetCurrentDebugLocation(FDiv.getDebugLoc());
- Function *Decl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_fdiv_fast);
-
Value *Num = FDiv.getOperand(0);
Value *Den = FDiv.getOperand(1);
Value *NewFDiv = nullptr;
-
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(FDiv.getType())) {
NewFDiv = UndefValue::get(VT);
// FIXME: Doesn't do the right thing for cases where the vector is partially
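
foldBinOpIntoSelect, added in the hunk above, folds a binary operator whose other operand is a constant into a single-use select of constants, so the operator disappears before division expansion would hide it. A rough C-level analogy (an illustrative example, not taken from the patch or its tests):

#include <cassert>

// Before the fold: a divide whose divisor is a select of two constants.
static int beforeFold(bool c) { return 1024 / (c ? 16 : 4); }

// After the fold: the divide is gone; only a select of folded constants remains.
static int afterFold(bool c) { return c ? 64 : 256; }

int main() {
  assert(beforeFold(true) == afterFold(true));
  assert(beforeFold(false) == afterFold(false));
  return 0;
}
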
@@ -584,19 +784,25 @@ bool AMDGPUCodeGenPrepare::visitFDiv(BinaryOperator &FDiv) {
for (unsigned I = 0, E = VT->getNumElements(); I != E; ++I) {
Value *NumEltI = Builder.CreateExtractElement(Num, I);
Value *DenEltI = Builder.CreateExtractElement(Den, I);
- Value *NewElt;
-
- if (shouldKeepFDivF32(NumEltI, UnsafeDiv, HasFP32Denormals)) {
+ // Try rcp first.
+ Value *NewElt = optimizeWithRcp(NumEltI, DenEltI, AllowInaccurateRcp,
+ RcpIsAccurate, Builder, Mod);
+ if (!NewElt) // Try fdiv.fast.
+ NewElt = optimizeWithFDivFast(NumEltI, DenEltI, ReqdAccuracy,
+ HasFP32Denormals, Builder, Mod);
+ if (!NewElt) // Keep the original.
NewElt = Builder.CreateFDiv(NumEltI, DenEltI);
- } else {
- NewElt = Builder.CreateCall(Decl, { NumEltI, DenEltI });
- }
NewFDiv = Builder.CreateInsertElement(NewFDiv, NewElt, I);
}
- } else {
- if (!shouldKeepFDivF32(Num, UnsafeDiv, HasFP32Denormals))
- NewFDiv = Builder.CreateCall(Decl, { Num, Den });
+ } else { // Scalar FDiv.
+ // Try rcp first.
+ NewFDiv = optimizeWithRcp(Num, Den, AllowInaccurateRcp, RcpIsAccurate,
+ Builder, Mod);
+ if (!NewFDiv) { // Try fdiv.fast.
+ NewFDiv = optimizeWithFDivFast(Num, Den, ReqdAccuracy, HasFP32Denormals,
+ Builder, Mod);
+ }
}
if (NewFDiv) {
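
For a scalar f32 divide, the rewritten visitFDiv follows the strategy spelled out in the comment block before it: try rcp, then fdiv.fast, otherwise keep the IR fdiv. The helper below is a simplified model of that choice under those comments' assumptions; it ignores the f16/f64 and vector paths and is not code from the patch.

enum class FDivLowering { Rcp, FDivFast, KeepFDiv };

// allowInaccurateRcp ~ unsafe-fp-math or afn; rcpIsAccurate ~ f32 with
// denormals flushed and !fpmath >= 1.0 ulp; reqdAccuracy ~ the !fpmath value.
static FDivLowering pickF32Lowering(bool allowInaccurateRcp, bool rcpIsAccurate,
                                    float reqdAccuracy, bool hasFP32Denormals,
                                    bool numeratorIsPlusMinusOne) {
  if (allowInaccurateRcp || (rcpIsAccurate && numeratorIsPlusMinusOne))
    return FDivLowering::Rcp;           // 1/x -> rcp(x), or a/b -> a * rcp(b)
  if (reqdAccuracy >= 2.5f && (!hasFP32Denormals || numeratorIsPlusMinusOne))
    return FDivLowering::FDivFast;      // a/b -> fdiv.fast(a, b)
  return FDivLowering::KeepFDiv;        // leave the IR fdiv alone
}

int main() {
  // !fpmath of 2.5 ulp, denormals flushed, no fast-math: fdiv.fast is chosen.
  return pickF32Lowering(false, true, 2.5f, false, false) ==
                 FDivLowering::FDivFast
             ? 0
             : 1;
}
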
@@ -631,31 +837,49 @@ static Value* getMulHu(IRBuilder<> &Builder, Value *LHS, Value *RHS) {
return getMul64(Builder, LHS, RHS).second;
}
-// The fractional part of a float is enough to accurately represent up to
-// a 24-bit signed integer.
-Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den,
- bool IsDiv, bool IsSigned) const {
- assert(Num->getType()->isIntegerTy(32));
-
+/// Figure out how many bits are really needed for this division. \p AtLeast is
+/// an optimization hint to bypass the second ComputeNumSignBits call if the
+/// first one is already insufficient. Returns -1 on failure.
+int AMDGPUCodeGenPrepare::getDivNumBits(BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned AtLeast, bool IsSigned) const {
const DataLayout &DL = Mod->getDataLayout();
unsigned LHSSignBits = ComputeNumSignBits(Num, DL, 0, AC, &I);
- if (LHSSignBits < 9)
- return nullptr;
+ if (LHSSignBits < AtLeast)
+ return -1;
unsigned RHSSignBits = ComputeNumSignBits(Den, DL, 0, AC, &I);
- if (RHSSignBits < 9)
- return nullptr;
-
+ if (RHSSignBits < AtLeast)
+ return -1;
unsigned SignBits = std::min(LHSSignBits, RHSSignBits);
- unsigned DivBits = 32 - SignBits;
+ unsigned DivBits = Num->getType()->getScalarSizeInBits() - SignBits;
if (IsSigned)
++DivBits;
+ return DivBits;
+}
- Type *Ty = Num->getType();
+// The fractional part of a float is enough to accurately represent up to
+// a 24-bit signed integer.
+Value *AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ bool IsDiv, bool IsSigned) const {
+ int DivBits = getDivNumBits(I, Num, Den, 9, IsSigned);
+ if (DivBits == -1)
+ return nullptr;
+ return expandDivRem24Impl(Builder, I, Num, Den, DivBits, IsDiv, IsSigned);
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem24Impl(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den,
+ unsigned DivBits,
+ bool IsDiv, bool IsSigned) const {
Type *I32Ty = Builder.getInt32Ty();
+ Num = Builder.CreateTrunc(Num, I32Ty);
+ Den = Builder.CreateTrunc(Den, I32Ty);
+
Type *F32Ty = Builder.getFloatTy();
ConstantInt *One = Builder.getInt32(1);
Value *JQ = One;
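
getDivNumBits, introduced above, drives the narrowed division paths: the divide needs (bit width minus common sign bits) bits, plus one when signed, and expandDivRem24 requires at least 9 known sign bits. A small model of that arithmetic for 32-bit operands (an illustration with plain integers standing in for ComputeNumSignBits results):

#include <algorithm>
#include <cassert>

// Returns the number of bits the divide really needs, or -1 if fewer than
// AtLeast sign bits are known. Mirrors the shape of getDivNumBits for
// 32-bit operands only.
static int divNumBits32(int numSignBits, int denSignBits, int atLeast,
                        bool isSigned) {
  if (numSignBits < atLeast || denSignBits < atLeast)
    return -1;
  int divBits = 32 - std::min(numSignBits, denSignBits);
  return isSigned ? divBits + 1 : divBits;
}

int main() {
  // Unsigned operands known to be below 2^15 have at least 17 sign bits,
  // so the divide fits in 15 bits and the 24-bit float path applies.
  assert(divNumBits32(17, 17, 9, /*isSigned=*/false) == 15);
  // Only 8 sign bits known on one side: too wide for the 24-bit path.
  assert(divNumBits32(8, 20, 9, /*isSigned=*/false) == -1);
  return 0;
}
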
@@ -685,7 +909,9 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
: Builder.CreateUIToFP(IB,F32Ty);
- Value *RCP = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), FB);
+ Function *RcpDecl = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp,
+ Builder.getFloatTy());
+ Value *RCP = Builder.CreateCall(RcpDecl, { FB });
Value *FQM = Builder.CreateFMul(FA, RCP);
// fq = trunc(fqm);
@@ -696,7 +922,10 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Value *FQNeg = Builder.CreateFNeg(FQ);
// float fr = mad(fqneg, fb, fa);
- Value *FR = Builder.CreateIntrinsic(Intrinsic::amdgcn_fmad_ftz,
+ auto FMAD = !ST->hasMadMacF32Insts()
+ ? Intrinsic::fma
+ : (Intrinsic::ID)Intrinsic::amdgcn_fmad_ftz;
+ Value *FR = Builder.CreateIntrinsic(FMAD,
{FQNeg->getType()}, {FQNeg, FB, FA}, FQ);
// int iq = (int)fq;
@@ -725,21 +954,72 @@ Value* AMDGPUCodeGenPrepare::expandDivRem24(IRBuilder<> &Builder,
Res = Builder.CreateSub(Num, Rem);
}
- // Truncate to number of bits this divide really is.
- if (IsSigned) {
- Res = Builder.CreateTrunc(Res, Builder.getIntNTy(DivBits));
- Res = Builder.CreateSExt(Res, Ty);
- } else {
- ConstantInt *TruncMask = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
- Res = Builder.CreateAnd(Res, TruncMask);
+ if (DivBits != 0 && DivBits < 32) {
+ // Extend in register from the number of bits this divide really uses.
+ if (IsSigned) {
+ int InRegBits = 32 - DivBits;
+
+ Res = Builder.CreateShl(Res, InRegBits);
+ Res = Builder.CreateAShr(Res, InRegBits);
+ } else {
+ ConstantInt *TruncMask
+ = Builder.getInt32((UINT64_C(1) << DivBits) - 1);
+ Res = Builder.CreateAnd(Res, TruncMask);
+ }
}
return Res;
}
-Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
- BinaryOperator &I,
- Value *Num, Value *Den) const {
+// Try to recognize special cases for which the DAG will emit better expansions
+// than the general expansion we do here.
+
+// TODO: It would be better to just directly handle those optimizations here.
+bool AMDGPUCodeGenPrepare::divHasSpecialOptimization(
+ BinaryOperator &I, Value *Num, Value *Den) const {
+ if (Constant *C = dyn_cast<Constant>(Den)) {
+ // Arbitrary constants get a better expansion as long as a wider mulhi is
+ // legal.
+ if (C->getType()->getScalarSizeInBits() <= 32)
+ return true;
+
+ // TODO: Sdiv check for not exact for some reason.
+
+ // If there's no wider mulhi, there's only a better expansion for powers of
+ // two.
+ // TODO: Should really know for each vector element.
+ if (isKnownToBeAPowerOfTwo(C, *DL, true, 0, AC, &I, DT))
+ return true;
+
+ return false;
+ }
+
+ if (BinaryOperator *BinOpDen = dyn_cast<BinaryOperator>(Den)) {
+ // fold (udiv x, (shl c, y)) -> x >>u (log2(c)+y) iff c is power of 2
+ if (BinOpDen->getOpcode() == Instruction::Shl &&
+ isa<Constant>(BinOpDen->getOperand(0)) &&
+ isKnownToBeAPowerOfTwo(BinOpDen->getOperand(0), *DL, true,
+ 0, AC, &I, DT)) {
+ return true;
+ }
+ }
+
+ return false;
+}
+
+static Value *getSign32(Value *V, IRBuilder<> &Builder, const DataLayout *DL) {
+ // Check whether the sign can be determined statically.
+ KnownBits Known = computeKnownBits(V, *DL);
+ if (Known.isNegative())
+ return Constant::getAllOnesValue(V->getType());
+ if (Known.isNonNegative())
+ return Constant::getNullValue(V->getType());
+ return Builder.CreateAShr(V, Builder.getInt32(31));
+}
+
+Value *AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
+ BinaryOperator &I, Value *X,
+ Value *Y) const {
Instruction::BinaryOps Opc = I.getOpcode();
assert(Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv);
@@ -748,142 +1028,171 @@ Value* AMDGPUCodeGenPrepare::expandDivRem32(IRBuilder<> &Builder,
FMF.setFast();
Builder.setFastMathFlags(FMF);
- if (isa<Constant>(Den))
- return nullptr; // Keep it for optimization
+ if (divHasSpecialOptimization(I, X, Y))
+ return nullptr; // Keep it for later optimization.
bool IsDiv = Opc == Instruction::UDiv || Opc == Instruction::SDiv;
bool IsSigned = Opc == Instruction::SRem || Opc == Instruction::SDiv;
- Type *Ty = Num->getType();
+ Type *Ty = X->getType();
Type *I32Ty = Builder.getInt32Ty();
Type *F32Ty = Builder.getFloatTy();
if (Ty->getScalarSizeInBits() < 32) {
if (IsSigned) {
- Num = Builder.CreateSExt(Num, I32Ty);
- Den = Builder.CreateSExt(Den, I32Ty);
+ X = Builder.CreateSExt(X, I32Ty);
+ Y = Builder.CreateSExt(Y, I32Ty);
} else {
- Num = Builder.CreateZExt(Num, I32Ty);
- Den = Builder.CreateZExt(Den, I32Ty);
+ X = Builder.CreateZExt(X, I32Ty);
+ Y = Builder.CreateZExt(Y, I32Ty);
}
}
- if (Value *Res = expandDivRem24(Builder, I, Num, Den, IsDiv, IsSigned)) {
- Res = Builder.CreateTrunc(Res, Ty);
- return Res;
+ if (Value *Res = expandDivRem24(Builder, I, X, Y, IsDiv, IsSigned)) {
+ return IsSigned ? Builder.CreateSExtOrTrunc(Res, Ty) :
+ Builder.CreateZExtOrTrunc(Res, Ty);
}
ConstantInt *Zero = Builder.getInt32(0);
ConstantInt *One = Builder.getInt32(1);
- ConstantInt *MinusOne = Builder.getInt32(~0);
Value *Sign = nullptr;
if (IsSigned) {
- ConstantInt *K31 = Builder.getInt32(31);
- Value *LHSign = Builder.CreateAShr(Num, K31);
- Value *RHSign = Builder.CreateAShr(Den, K31);
+ Value *SignX = getSign32(X, Builder, DL);
+ Value *SignY = getSign32(Y, Builder, DL);
// Remainder sign is the same as LHS
- Sign = IsDiv ? Builder.CreateXor(LHSign, RHSign) : LHSign;
+ Sign = IsDiv ? Builder.CreateXor(SignX, SignY) : SignX;
- Num = Builder.CreateAdd(Num, LHSign);
- Den = Builder.CreateAdd(Den, RHSign);
+ X = Builder.CreateAdd(X, SignX);
+ Y = Builder.CreateAdd(Y, SignY);
- Num = Builder.CreateXor(Num, LHSign);
- Den = Builder.CreateXor(Den, RHSign);
+ X = Builder.CreateXor(X, SignX);
+ Y = Builder.CreateXor(Y, SignY);
}
- // RCP = URECIP(Den) = 2^32 / Den + e
- // e is rounding error.
- Value *DEN_F32 = Builder.CreateUIToFP(Den, F32Ty);
- Value *RCP_F32 = Builder.CreateFDiv(ConstantFP::get(F32Ty, 1.0), DEN_F32);
- Constant *UINT_MAX_PLUS_1 = ConstantFP::get(F32Ty, BitsToFloat(0x4f800000));
- Value *RCP_SCALE = Builder.CreateFMul(RCP_F32, UINT_MAX_PLUS_1);
- Value *RCP = Builder.CreateFPToUI(RCP_SCALE, I32Ty);
-
- // RCP_LO, RCP_HI = mul(RCP, Den) */
- Value *RCP_LO, *RCP_HI;
- std::tie(RCP_LO, RCP_HI) = getMul64(Builder, RCP, Den);
-
- // NEG_RCP_LO = -RCP_LO
- Value *NEG_RCP_LO = Builder.CreateNeg(RCP_LO);
-
- // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
- Value *RCP_HI_0_CC = Builder.CreateICmpEQ(RCP_HI, Zero);
- Value *ABS_RCP_LO = Builder.CreateSelect(RCP_HI_0_CC, NEG_RCP_LO, RCP_LO);
-
- // Calculate the rounding error from the URECIP instruction
- // E = mulhu(ABS_RCP_LO, RCP)
- Value *E = getMulHu(Builder, ABS_RCP_LO, RCP);
-
- // RCP_A_E = RCP + E
- Value *RCP_A_E = Builder.CreateAdd(RCP, E);
-
- // RCP_S_E = RCP - E
- Value *RCP_S_E = Builder.CreateSub(RCP, E);
-
- // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
- Value *Tmp0 = Builder.CreateSelect(RCP_HI_0_CC, RCP_A_E, RCP_S_E);
-
- // Quotient = mulhu(Tmp0, Num)
- Value *Quotient = getMulHu(Builder, Tmp0, Num);
-
- // Num_S_Remainder = Quotient * Den
- Value *Num_S_Remainder = Builder.CreateMul(Quotient, Den);
+ // The algorithm here is based on ideas from "Software Integer Division", Tom
+ // Rodeheffer, August 2008.
+ //
+ // unsigned udiv(unsigned x, unsigned y) {
+ // // Initial estimate of inv(y). The constant is less than 2^32 to ensure
+ // // that this is a lower bound on inv(y), even if some of the calculations
+ // // round up.
+ // unsigned z = (unsigned)((4294967296.0 - 512.0) * v_rcp_f32((float)y));
+ //
+ // // One round of UNR (Unsigned integer Newton-Raphson) to improve z.
+ // // Empirically this is guaranteed to give a "two-y" lower bound on
+ // // inv(y).
+ // z += umulh(z, -y * z);
+ //
+ // // Quotient/remainder estimate.
+ // unsigned q = umulh(x, z);
+ // unsigned r = x - q * y;
+ //
+ // // Two rounds of quotient/remainder refinement.
+ // if (r >= y) {
+ // ++q;
+ // r -= y;
+ // }
+ // if (r >= y) {
+ // ++q;
+ // r -= y;
+ // }
+ //
+ // return q;
+ // }
+
+ // Initial estimate of inv(y).
+ Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
+ Function *Rcp = Intrinsic::getDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
+ Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
+ Constant *Scale = ConstantFP::get(F32Ty, BitsToFloat(0x4F7FFFFE));
+ Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
+ Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
+
+ // One round of UNR.
+ Value *NegY = Builder.CreateSub(Zero, Y);
+ Value *NegYZ = Builder.CreateMul(NegY, Z);
+ Z = Builder.CreateAdd(Z, getMulHu(Builder, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ Value *Q = getMulHu(Builder, X, Z);
+ Value *R = Builder.CreateSub(X, Builder.CreateMul(Q, Y));
+
+ // First quotient/remainder refinement.
+ Value *Cond = Builder.CreateICmpUGE(R, Y);
+ if (IsDiv)
+ Q = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+ R = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = Builder.CreateICmpUGE(R, Y);
+ Value *Res;
+ if (IsDiv)
+ Res = Builder.CreateSelect(Cond, Builder.CreateAdd(Q, One), Q);
+ else
+ Res = Builder.CreateSelect(Cond, Builder.CreateSub(R, Y), R);
- // Remainder = Num - Num_S_Remainder
- Value *Remainder = Builder.CreateSub(Num, Num_S_Remainder);
+ if (IsSigned) {
+ Res = Builder.CreateXor(Res, Sign);
+ Res = Builder.CreateSub(Res, Sign);
+ }
- // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
- Value *Rem_GE_Den_CC = Builder.CreateICmpUGE(Remainder, Den);
- Value *Remainder_GE_Den = Builder.CreateSelect(Rem_GE_Den_CC, MinusOne, Zero);
+ Res = Builder.CreateTrunc(Res, Ty);
- // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
- Value *Num_GE_Num_S_Rem_CC = Builder.CreateICmpUGE(Num, Num_S_Remainder);
- Value *Remainder_GE_Zero = Builder.CreateSelect(Num_GE_Num_S_Rem_CC,
- MinusOne, Zero);
+ return Res;
+}
- // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
- Value *Tmp1 = Builder.CreateAnd(Remainder_GE_Den, Remainder_GE_Zero);
- Value *Tmp1_0_CC = Builder.CreateICmpEQ(Tmp1, Zero);
+Value *AMDGPUCodeGenPrepare::shrinkDivRem64(IRBuilder<> &Builder,
+ BinaryOperator &I,
+ Value *Num, Value *Den) const {
+ if (!ExpandDiv64InIR && divHasSpecialOptimization(I, Num, Den))
+ return nullptr; // Keep it for later optimization.
- Value *Res;
- if (IsDiv) {
- // Quotient_A_One = Quotient + 1
- Value *Quotient_A_One = Builder.CreateAdd(Quotient, One);
+ Instruction::BinaryOps Opc = I.getOpcode();
- // Quotient_S_One = Quotient - 1
- Value *Quotient_S_One = Builder.CreateSub(Quotient, One);
+ bool IsDiv = Opc == Instruction::SDiv || Opc == Instruction::UDiv;
+ bool IsSigned = Opc == Instruction::SDiv || Opc == Instruction::SRem;
- // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
- Value *Div = Builder.CreateSelect(Tmp1_0_CC, Quotient, Quotient_A_One);
+ int NumDivBits = getDivNumBits(I, Num, Den, 32, IsSigned);
+ if (NumDivBits == -1)
+ return nullptr;
- // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
- Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Div, Quotient_S_One);
- } else {
- // Remainder_S_Den = Remainder - Den
- Value *Remainder_S_Den = Builder.CreateSub(Remainder, Den);
+ Value *Narrowed = nullptr;
+ if (NumDivBits <= 24) {
+ Narrowed = expandDivRem24Impl(Builder, I, Num, Den, NumDivBits,
+ IsDiv, IsSigned);
+ } else if (NumDivBits <= 32) {
+ Narrowed = expandDivRem32(Builder, I, Num, Den);
+ }
- // Remainder_A_Den = Remainder + Den
- Value *Remainder_A_Den = Builder.CreateAdd(Remainder, Den);
+ if (Narrowed) {
+ return IsSigned ? Builder.CreateSExt(Narrowed, Num->getType()) :
+ Builder.CreateZExt(Narrowed, Num->getType());
+ }
- // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
- Value *Rem = Builder.CreateSelect(Tmp1_0_CC, Remainder, Remainder_S_Den);
+ return nullptr;
+}
- // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
- Res = Builder.CreateSelect(Num_GE_Num_S_Rem_CC, Rem, Remainder_A_Den);
+void AMDGPUCodeGenPrepare::expandDivRem64(BinaryOperator &I) const {
+ Instruction::BinaryOps Opc = I.getOpcode();
+ // Do the general expansion.
+ if (Opc == Instruction::UDiv || Opc == Instruction::SDiv) {
+ expandDivisionUpTo64Bits(&I);
+ return;
}
- if (IsSigned) {
- Res = Builder.CreateXor(Res, Sign);
- Res = Builder.CreateSub(Res, Sign);
+ if (Opc == Instruction::URem || Opc == Instruction::SRem) {
+ expandRemainderUpTo64Bits(&I);
+ return;
}
- Res = Builder.CreateTrunc(Res, Ty);
-
- return Res;
+ llvm_unreachable("not a division");
}
bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
+ if (foldBinOpIntoSelect(I))
+ return true;
+
if (ST->has16BitInsts() && needsPromotionToI32(I.getType()) &&
DA->isUniform(&I) && promoteUniformOpToI32(I))
return true;
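
The comment block inside the new expandDivRem32 documents the Rodeheffer-style expansion. The standalone sketch below follows that pseudocode, with v_rcp_f32 approximated by an ordinary float division; that substitution is an assumption of this illustration, and the -512.0 slack plus the two refinement rounds are what absorb the reciprocal's error.

#include <cassert>
#include <cstdint>

static uint32_t umulh(uint32_t A, uint32_t B) {
  return (uint32_t)(((uint64_t)A * B) >> 32);
}

// Software model of the expansion; Y must be non-zero.
static uint32_t modelUDiv32(uint32_t X, uint32_t Y) {
  // Initial lower-bound estimate of 2^32 / Y.
  uint32_t Z = (uint32_t)((4294967296.0 - 512.0) * (1.0f / (float)Y));

  // One round of unsigned Newton-Raphson refinement of Z.
  uint32_t NegYZ = (0u - Y) * Z;
  Z += umulh(Z, NegYZ);

  // Quotient/remainder estimate plus two refinement rounds.
  uint32_t Q = umulh(X, Z);
  uint32_t R = X - Q * Y;
  if (R >= Y) { ++Q; R -= Y; }
  if (R >= Y) { ++Q; R -= Y; }
  return Q;
}

int main() {
  const uint32_t Xs[] = {0u, 1u, 7u, 100u, 1000000007u, 0xFFFFFFFFu};
  const uint32_t Ys[] = {1u, 3u, 10u, 641u, 65536u, 0x80000000u};
  for (uint32_t X : Xs)
    for (uint32_t Y : Ys)
      assert(modelUDiv32(X, Y) == X / Y);
  return 0;
}
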
@@ -895,27 +1204,54 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
Instruction::BinaryOps Opc = I.getOpcode();
Type *Ty = I.getType();
Value *NewDiv = nullptr;
+ unsigned ScalarSize = Ty->getScalarSizeInBits();
+
+ SmallVector<BinaryOperator *, 8> Div64ToExpand;
+
if ((Opc == Instruction::URem || Opc == Instruction::UDiv ||
Opc == Instruction::SRem || Opc == Instruction::SDiv) &&
- Ty->getScalarSizeInBits() <= 32) {
+ ScalarSize <= 64 &&
+ !DisableIDivExpand) {
Value *Num = I.getOperand(0);
Value *Den = I.getOperand(1);
IRBuilder<> Builder(&I);
Builder.SetCurrentDebugLocation(I.getDebugLoc());
- if (VectorType *VT = dyn_cast<VectorType>(Ty)) {
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
NewDiv = UndefValue::get(VT);
for (unsigned N = 0, E = VT->getNumElements(); N != E; ++N) {
Value *NumEltN = Builder.CreateExtractElement(Num, N);
Value *DenEltN = Builder.CreateExtractElement(Den, N);
- Value *NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
- if (!NewElt)
- NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+
+ Value *NewElt;
+ if (ScalarSize <= 32) {
+ NewElt = expandDivRem32(Builder, I, NumEltN, DenEltN);
+ if (!NewElt)
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ } else {
+ // See if this 64-bit division can be shrunk to 32/24 bits before
+ // producing the general expansion.
+ NewElt = shrinkDivRem64(Builder, I, NumEltN, DenEltN);
+ if (!NewElt) {
+ // The general 64-bit expansion introduces control flow and doesn't
+ // return the new value. Just insert a scalar copy and defer
+ // expanding it.
+ NewElt = Builder.CreateBinOp(Opc, NumEltN, DenEltN);
+ Div64ToExpand.push_back(cast<BinaryOperator>(NewElt));
+ }
+ }
+
NewDiv = Builder.CreateInsertElement(NewDiv, NewElt, N);
}
} else {
- NewDiv = expandDivRem32(Builder, I, Num, Den);
+ if (ScalarSize <= 32)
+ NewDiv = expandDivRem32(Builder, I, Num, Den);
+ else {
+ NewDiv = shrinkDivRem64(Builder, I, Num, Den);
+ if (!NewDiv)
+ Div64ToExpand.push_back(&I);
+ }
}
if (NewDiv) {
@@ -925,6 +1261,14 @@ bool AMDGPUCodeGenPrepare::visitBinaryOperator(BinaryOperator &I) {
}
}
+ if (ExpandDiv64InIR) {
+ // TODO: We get much worse code in specially handled constant cases.
+ for (BinaryOperator *Div : Div64ToExpand) {
+ expandDivRem64(*Div);
+ Changed = true;
+ }
+ }
+
return Changed;
}
@@ -1033,16 +1377,36 @@ bool AMDGPUCodeGenPrepare::runOnFunction(Function &F) {
ST = &TM.getSubtarget<GCNSubtarget>(F);
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
DA = &getAnalysis<LegacyDivergenceAnalysis>();
+
+ auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ DT = DTWP ? &DTWP->getDomTree() : nullptr;
+
HasUnsafeFPMath = hasUnsafeFPMath(F);
- HasFP32Denormals = ST->hasFP32Denormals(F);
+
+ AMDGPU::SIModeRegisterDefaults Mode(F);
+ HasFP32Denormals = Mode.allFP32Denormals();
bool MadeChange = false;
- for (BasicBlock &BB : F) {
+ Function::iterator NextBB;
+ for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE; FI = NextBB) {
+ BasicBlock *BB = &*FI;
+ NextBB = std::next(FI);
+
BasicBlock::iterator Next;
- for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; I = Next) {
+ for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; I = Next) {
Next = std::next(I);
+
MadeChange |= visit(*I);
+
+ if (Next != E) { // Control flow changed
+ BasicBlock *NextInstBB = Next->getParent();
+ if (NextInstBB != BB) {
+ BB = NextInstBB;
+ E = BB->end();
+ FE = F.end();
+ }
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
new file mode 100644
index 000000000000..faaf9168d0dd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td
@@ -0,0 +1,69 @@
+//=- AMDGPUCombine.td - Define AMDGPU Combine Rules ----------*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+include "llvm/Target/GlobalISel/Combine.td"
+
+// TODO: This really belongs after legalization after scalarization.
+// TODO: GICombineRules should accept subtarget predicates
+
+def fmin_fmax_legacy_matchdata : GIDefMatchData<"FMinFMaxLegacyInfo">;
+
+def fcmp_select_to_fmin_fmax_legacy : GICombineRule<
+ (defs root:$select, fmin_fmax_legacy_matchdata:$matchinfo),
+ (match (wip_match_opcode G_SELECT):$select,
+ [{ return matchFMinFMaxLegacy(*${select}, MRI, *MF, ${matchinfo}); }]),
+ (apply [{ applySelectFCmpToFMinToFMaxLegacy(*${select}, ${matchinfo}); }])>;
+
+
+def uchar_to_float : GICombineRule<
+ (defs root:$itofp),
+ (match (wip_match_opcode G_UITOFP, G_SITOFP):$itofp,
+ [{ return matchUCharToFloat(*${itofp}, MRI, *MF, Helper); }]),
+ (apply [{ applyUCharToFloat(*${itofp}); }])>;
+
+def cvt_f32_ubyteN_matchdata : GIDefMatchData<"CvtF32UByteMatchInfo">;
+
+def cvt_f32_ubyteN : GICombineRule<
+ (defs root:$cvt_f32_ubyteN, cvt_f32_ubyteN_matchdata:$matchinfo),
+ (match (wip_match_opcode G_AMDGPU_CVT_F32_UBYTE0,
+ G_AMDGPU_CVT_F32_UBYTE1,
+ G_AMDGPU_CVT_F32_UBYTE2,
+ G_AMDGPU_CVT_F32_UBYTE3):$cvt_f32_ubyteN,
+ [{ return matchCvtF32UByteN(*${cvt_f32_ubyteN}, MRI, *MF, ${matchinfo}); }]),
+ (apply [{ applyCvtF32UByteN(*${cvt_f32_ubyteN}, ${matchinfo}); }])>;
+
+// Combines which should only apply on SI/VI
+def gfx6gfx7_combines : GICombineGroup<[fcmp_select_to_fmin_fmax_legacy]>;
+
+
+def AMDGPUPreLegalizerCombinerHelper: GICombinerHelper<
+ "AMDGPUGenPreLegalizerCombinerHelper", [all_combines,
+ elide_br_by_inverting_cond]> {
+ let DisableRuleOption = "amdgpuprelegalizercombiner-disable-rule";
+}
+
+
+// FIXME: combines_for_extload can introduce illegal extloads which
+// aren't re-legalized.
+// FIXME: Is there a way to remove a single item from all_combines?
+def all_combines_minus_extload : GICombineGroup<[trivial_combines,
+ ptr_add_immed_chain, combine_indexed_load_store, undef_combines,
+ identity_combines]
+>;
+
+def AMDGPUPostLegalizerCombinerHelper: GICombinerHelper<
+ "AMDGPUGenPostLegalizerCombinerHelper",
+ [all_combines_minus_extload, gfx6gfx7_combines,
+ uchar_to_float, cvt_f32_ubyteN]> {
+ let DisableRuleOption = "amdgpupostlegalizercombiner-disable-rule";
+}
+
+def AMDGPURegBankCombinerHelper : GICombinerHelper<
+ "AMDGPUGenRegBankCombinerHelper", []> {
+ let DisableRuleOption = "amdgpuregbankcombiner-disable-rule";
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
new file mode 100644
index 000000000000..25c82ed61fc2
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.cpp
@@ -0,0 +1,150 @@
+//===--- AMDGPUExportClustering.cpp - AMDGPU Export Clustering -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains a DAG scheduling mutation to cluster shader
+/// exports.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUExportClustering.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+
+using namespace llvm;
+
+namespace {
+
+class ExportClustering : public ScheduleDAGMutation {
+public:
+ ExportClustering() {}
+ void apply(ScheduleDAGInstrs *DAG) override;
+};
+
+static bool isExport(const SUnit &SU) {
+ const MachineInstr *MI = SU.getInstr();
+ return MI->getOpcode() == AMDGPU::EXP ||
+ MI->getOpcode() == AMDGPU::EXP_DONE;
+}
+
+static bool isPositionExport(const SIInstrInfo *TII, SUnit *SU) {
+ const MachineInstr *MI = SU->getInstr();
+ int Imm = TII->getNamedOperand(*MI, AMDGPU::OpName::tgt)->getImm();
+ return Imm >= 12 && Imm <= 15;
+}
+
+static void sortChain(const SIInstrInfo *TII, SmallVector<SUnit *, 8> &Chain,
+ unsigned PosCount) {
+ if (!PosCount || PosCount == Chain.size())
+ return;
+
+ // Position exports should occur as soon as possible in the shader
+ // for optimal performance. This moves position exports before
+ // other exports while preserving the order within different export
+ // types (pos or other).
+ SmallVector<SUnit *, 8> Copy(Chain);
+ unsigned PosIdx = 0;
+ unsigned OtherIdx = PosCount;
+ for (SUnit *SU : Copy) {
+ if (isPositionExport(TII, SU))
+ Chain[PosIdx++] = SU;
+ else
+ Chain[OtherIdx++] = SU;
+ }
+}
+
+static void buildCluster(ArrayRef<SUnit *> Exports, ScheduleDAGInstrs *DAG) {
+ SUnit *ChainHead = Exports.front();
+
+ // Now construct cluster from chain by adding new edges.
+ for (unsigned Idx = 0, End = Exports.size() - 1; Idx < End; ++Idx) {
+ SUnit *SUa = Exports[Idx];
+ SUnit *SUb = Exports[Idx + 1];
+
+ // Copy all dependencies to the head of the chain to avoid any
+ // computation being inserted into the chain.
+ for (const SDep &Pred : SUb->Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ if (!isExport(*PredSU) && !Pred.isWeak())
+ DAG->addEdge(ChainHead, SDep(PredSU, SDep::Artificial));
+ }
+
+ // New barrier edge ordering exports
+ DAG->addEdge(SUb, SDep(SUa, SDep::Barrier));
+ // Also add cluster edge
+ DAG->addEdge(SUb, SDep(SUa, SDep::Cluster));
+ }
+}
+
+static void removeExportDependencies(ScheduleDAGInstrs *DAG, SUnit &SU) {
+ SmallVector<SDep, 2> ToAdd, ToRemove;
+
+ for (const SDep &Pred : SU.Preds) {
+ SUnit *PredSU = Pred.getSUnit();
+ if (Pred.isBarrier() && isExport(*PredSU)) {
+ ToRemove.push_back(Pred);
+ if (isExport(SU))
+ continue;
+
+ // If we remove a barrier we need to copy dependencies
+ // from the predecessor to maintain order.
+ for (const SDep &ExportPred : PredSU->Preds) {
+ SUnit *ExportPredSU = ExportPred.getSUnit();
+ if (ExportPred.isBarrier() && !isExport(*ExportPredSU))
+ ToAdd.push_back(SDep(ExportPredSU, SDep::Barrier));
+ }
+ }
+ }
+
+ for (SDep Pred : ToRemove)
+ SU.removePred(Pred);
+ for (SDep Pred : ToAdd)
+ DAG->addEdge(&SU, Pred);
+}
+
+void ExportClustering::apply(ScheduleDAGInstrs *DAG) {
+ const SIInstrInfo *TII = static_cast<const SIInstrInfo *>(DAG->TII);
+
+ SmallVector<SUnit *, 8> Chain;
+
+ // Pass through the DAG, gathering a list of exports and removing barrier
+ // edges that create dependencies on exports. Freeing exports from successor
+ // edges allows more scheduling freedom, and nothing should be order-dependent
+ // on exports. Edges will be added later to order the exports.
+ unsigned PosCount = 0;
+ for (SUnit &SU : DAG->SUnits) {
+ if (!isExport(SU))
+ continue;
+
+ Chain.push_back(&SU);
+ if (isPositionExport(TII, &SU))
+ PosCount++;
+
+ removeExportDependencies(DAG, SU);
+
+ SmallVector<SDep, 4> Succs(SU.Succs);
+ for (SDep Succ : Succs)
+ removeExportDependencies(DAG, *Succ.getSUnit());
+ }
+
+ // Apply clustering if there are multiple exports
+ if (Chain.size() > 1) {
+ sortChain(TII, Chain, PosCount);
+ buildCluster(Chain, DAG);
+ }
+}
+
+} // end namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation() {
+ return std::make_unique<ExportClustering>();
+}
+
+} // end namespace llvm
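
sortChain in the new file above moves position exports (tgt 12..15) ahead of the other exports while preserving the relative order within each group, i.e. a stable partition of the gathered chain. A standalone model of that reordering, with tagged integers standing in for SUnits (illustration only):

#include <cassert>
#include <vector>

struct Export {
  bool IsPos; // stands in for isPositionExport()
  int Id;     // original program order
};

// Mirrors sortChain: positions first, others after, order preserved in each.
static void sortChain(std::vector<Export> &Chain, unsigned PosCount) {
  if (!PosCount || PosCount == Chain.size())
    return;
  std::vector<Export> Copy(Chain);
  unsigned PosIdx = 0, OtherIdx = PosCount;
  for (const Export &E : Copy)
    Chain[E.IsPos ? PosIdx++ : OtherIdx++] = E;
}

int main() {
  std::vector<Export> Chain = {{false, 0}, {true, 1}, {false, 2}, {true, 3}};
  sortChain(Chain, /*PosCount=*/2);
  // Position exports 1 and 3 come first; the others keep their order.
  std::vector<int> Ids;
  for (const Export &E : Chain)
    Ids.push_back(E.Id);
  assert((Ids == std::vector<int>{1, 3, 0, 2}));
  return 0;
}
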
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
new file mode 100644
index 000000000000..58491d0671e4
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUExportClustering.h
@@ -0,0 +1,15 @@
+//===- AMDGPUExportClustering.h - AMDGPU Export Clustering ------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createAMDGPUExportClusteringDAGMutation();
+
+} // namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
index ea3952c316e4..db00f8f711a3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFeatures.td
@@ -18,15 +18,6 @@ def FeatureFMA : SubtargetFeature<"fmaf",
"Enable single precision FMA (not as fast as mul+add, but fused)"
>;
-// Some instructions do not support denormals despite this flag. Using
-// fp32 denormals also causes instructions to run at the double
-// precision rate for the device.
-def FeatureFP32Denormals : SubtargetFeature<"fp32-denormals",
- "FP32Denormals",
- "true",
- "Enable single precision denormal handling"
->;
-
class SubtargetFeatureLocalMemorySize <int Value> : SubtargetFeature<
"localmemorysize"#Value,
"LocalMemorySize",
@@ -38,16 +29,16 @@ def FeatureLocalMemorySize0 : SubtargetFeatureLocalMemorySize<0>;
def FeatureLocalMemorySize32768 : SubtargetFeatureLocalMemorySize<32768>;
def FeatureLocalMemorySize65536 : SubtargetFeatureLocalMemorySize<65536>;
-class SubtargetFeatureWavefrontSize <int Value> : SubtargetFeature<
- "wavefrontsize"#Value,
- "WavefrontSize",
- !cast<string>(Value),
+class SubtargetFeatureWavefrontSize <int ValueLog2> : SubtargetFeature<
+ "wavefrontsize"#!shl(1, ValueLog2),
+ "WavefrontSizeLog2",
+ !cast<string>(ValueLog2),
"The number of threads per wavefront"
>;
-def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<16>;
-def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<32>;
-def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<64>;
+def FeatureWavefrontSize16 : SubtargetFeatureWavefrontSize<4>;
+def FeatureWavefrontSize32 : SubtargetFeatureWavefrontSize<5>;
+def FeatureWavefrontSize64 : SubtargetFeatureWavefrontSize<6>;
class SubtargetFeatureGeneration <string Value, string FeatureName,
string Subtarget,
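
The wavefront-size feature now stores the log2 of the size (WavefrontSizeLog2), so wavefrontsize64 carries the value 6 and the actual size is recovered by a shift. A trivial check of that encoding (illustration only, not code from the patch):

#include <cassert>

int main() {
  // FeatureWavefrontSize64 now carries ValueLog2 = 6; the size is 1 << 6.
  const unsigned WavefrontSizeLog2 = 6;
  assert((1u << WavefrontSizeLog2) == 64u);
  return 0;
}
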
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
index 9ba04d113c70..ea6c6d0fd212 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFixFunctionBitcasts.cpp
@@ -15,6 +15,7 @@
#include "AMDGPU.h"
#include "llvm/IR/InstVisitor.h"
+#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
using namespace llvm;
@@ -31,12 +32,13 @@ class AMDGPUFixFunctionBitcasts final
bool Modified;
public:
- void visitCallSite(CallSite CS) {
- if (CS.getCalledFunction())
+ void visitCallBase(CallBase &CB) {
+ if (CB.getCalledFunction())
return;
- auto Callee = dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
- if (Callee && isLegalToPromote(CS, Callee)) {
- promoteCall(CS, Callee);
+ auto *Callee =
+ dyn_cast<Function>(CB.getCalledOperand()->stripPointerCasts());
+ if (Callee && isLegalToPromote(CB, Callee)) {
+ promoteCall(CB, Callee);
Modified = true;
}
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
index 92e256cf2829..260a18e278cf 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUFrameLowering.h
@@ -26,7 +26,7 @@ namespace llvm {
class AMDGPUFrameLowering : public TargetFrameLowering {
public:
AMDGPUFrameLowering(StackDirection D, Align StackAl, int LAO,
- Align TransAl = Align::None());
+ Align TransAl = Align(1));
~AMDGPUFrameLowering() override;
/// \returns The number of 32-bit sub-registers that are used when storing
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
index d420aa02ac28..3f12addbcc79 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGISel.td
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
include "AMDGPU.td"
+include "AMDGPUCombine.td"
def sd_vsrc0 : ComplexPattern<i32, 1, "">;
def gi_vsrc0 :
@@ -30,6 +31,10 @@ def gi_vop3mods :
GIComplexOperandMatcher<s32, "selectVOP3Mods">,
GIComplexPatternEquiv<VOP3Mods>;
+def gi_vop3_no_mods :
+ GIComplexOperandMatcher<s32, "selectVOP3NoMods">,
+ GIComplexPatternEquiv<VOP3NoMods>;
+
def gi_vop3mods_nnan :
GIComplexOperandMatcher<s32, "selectVOP3Mods_nnan">,
GIComplexPatternEquiv<VOP3Mods_nnan>;
@@ -38,9 +43,9 @@ def gi_vop3omods :
GIComplexOperandMatcher<s32, "selectVOP3OMods">,
GIComplexPatternEquiv<VOP3OMods>;
-def gi_vop3opselmods0 :
- GIComplexOperandMatcher<s32, "selectVOP3OpSelMods0">,
- GIComplexPatternEquiv<VOP3OpSelMods0>;
+def gi_vop3pmods :
+ GIComplexOperandMatcher<s32, "selectVOP3PMods">,
+ GIComplexPatternEquiv<VOP3PMods>;
def gi_vop3opselmods :
GIComplexOperandMatcher<s32, "selectVOP3OpSelMods">,
@@ -83,6 +88,33 @@ def gi_ds_1addr_1offset :
GIComplexOperandMatcher<s32, "selectDS1Addr1Offset">,
GIComplexPatternEquiv<DS1Addr1Offset>;
+def gi_ds_64bit_4byte_aligned :
+ GIComplexOperandMatcher<s64, "selectDS64Bit4ByteAligned">,
+ GIComplexPatternEquiv<DS64Bit4ByteAligned>;
+
+def gi_mubuf_addr64 :
+ GIComplexOperandMatcher<s64, "selectMUBUFAddr64">,
+ GIComplexPatternEquiv<MUBUFAddr64>;
+
+def gi_mubuf_offset :
+ GIComplexOperandMatcher<s64, "selectMUBUFOffset">,
+ GIComplexPatternEquiv<MUBUFOffset>;
+
+def gi_mubuf_addr64_atomic :
+ GIComplexOperandMatcher<s64, "selectMUBUFAddr64Atomic">,
+ GIComplexPatternEquiv<MUBUFAddr64Atomic>;
+
+def gi_mubuf_offset_atomic :
+ GIComplexOperandMatcher<s64, "selectMUBUFOffsetAtomic">,
+ GIComplexPatternEquiv<MUBUFOffsetAtomic>;
+
+def gi_smrd_buffer_imm :
+ GIComplexOperandMatcher<s64, "selectSMRDBufferImm">,
+ GIComplexPatternEquiv<SMRDBufferImm>;
+
+def gi_smrd_buffer_imm32 :
+ GIComplexOperandMatcher<s64, "selectSMRDBufferImm32">,
+ GIComplexPatternEquiv<SMRDBufferImm32>;
// Separate load nodes are defined to glue m0 initialization in
// SelectionDAG. The GISel selector can just insert m0 initialization
@@ -116,9 +148,54 @@ def : GINodeEquiv<G_ATOMICRMW_UMIN, atomic_load_umin_glue>;
def : GINodeEquiv<G_ATOMICRMW_UMAX, atomic_load_umax_glue>;
def : GINodeEquiv<G_ATOMICRMW_FADD, atomic_load_fadd_glue>;
-def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32>;
-def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
+def : GINodeEquiv<G_AMDGPU_FFBH_U32, AMDGPUffbh_u32_impl>;
+def : GINodeEquiv<G_AMDGPU_FMIN_LEGACY, AMDGPUfmin_legacy>;
+def : GINodeEquiv<G_AMDGPU_FMAX_LEGACY, AMDGPUfmax_legacy>;
+def : GINodeEquiv<G_AMDGPU_RCP_IFLAG, AMDGPUrcp_iflag>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE0, AMDGPUcvt_f32_ubyte0>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE1, AMDGPUcvt_f32_ubyte1>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE2, AMDGPUcvt_f32_ubyte2>;
+def : GINodeEquiv<G_AMDGPU_CVT_F32_UBYTE3, AMDGPUcvt_f32_ubyte3>;
+
+def : GINodeEquiv<G_AMDGPU_ATOMIC_CMPXCHG, AMDGPUatomic_cmp_swap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD, SIbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_USHORT, SIbuffer_load_ushort>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_UBYTE, SIbuffer_load_ubyte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SSHORT, SIbuffer_load_short>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_SBYTE, SIbuffer_load_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT, SIbuffer_load_format>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_LOAD_FORMAT_D16, SIbuffer_load_format_d16>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT, SItbuffer_load>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_LOAD_FORMAT_D16, SItbuffer_load_d16>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE, SIbuffer_store>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_SHORT, SIbuffer_store_short>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_BYTE, SIbuffer_store_byte>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT, SIbuffer_store_format>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_STORE_FORMAT_D16, SIbuffer_store_format_d16>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT, SItbuffer_store>;
+def : GINodeEquiv<G_AMDGPU_TBUFFER_STORE_FORMAT_D16, SItbuffer_store_d16>;
+
+// FIXME: Check MMO is atomic
+def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, SIatomic_inc>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, SIatomic_dec>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_INC, atomic_inc_glue>;
+def : GINodeEquiv<G_AMDGPU_ATOMIC_DEC, atomic_dec_glue>;
+
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SWAP, SIbuffer_atomic_swap>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_ADD, SIbuffer_atomic_add>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SUB, SIbuffer_atomic_sub>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMIN, SIbuffer_atomic_smin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMIN, SIbuffer_atomic_umin>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_SMAX, SIbuffer_atomic_smax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_UMAX, SIbuffer_atomic_umax>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_AND, SIbuffer_atomic_and>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_OR, SIbuffer_atomic_or>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_XOR, SIbuffer_atomic_xor>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_INC, SIbuffer_atomic_inc>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_DEC, SIbuffer_atomic_dec>;
+def : GINodeEquiv<G_AMDGPU_BUFFER_ATOMIC_CMPSWAP, SIbuffer_atomic_cmpswap>;
+def : GINodeEquiv<G_AMDGPU_S_BUFFER_LOAD, SIsbuffer_load>;
class GISelSop2Pat <
SDPatternOperator node,
@@ -188,16 +265,13 @@ multiclass GISelVop2IntrPat <
def : GISelVop2Pat <node, inst, dst_vt, src_vt>;
- // FIXME: Intrinsics aren't marked as commutable, so we need to add an explcit
+ // FIXME: Intrinsics aren't marked as commutable, so we need to add an explicit
// pattern to handle commuting. This is another reason why legalizing to a
// generic machine instruction may be better than matching the intrinsic
// directly.
def : GISelVop2CommutePat <node, inst, dst_vt, src_vt>;
}
-def : GISelSop2Pat <or, S_OR_B32, i32>;
-def : GISelVop2Pat <or, V_OR_B32_e32, i32>;
-
// Since GlobalISel is more flexible than SelectionDAG, I think we can get
// away with adding patterns for integer types and not legalizing all
// loads and stores to vector types. This should help simplify the load/store
@@ -206,12 +280,18 @@ foreach Ty = [i64, p0, p1, p4] in {
defm : SMRD_Pattern <"S_LOAD_DWORDX2", Ty>;
}
-def gi_as_i32timm : GICustomOperandRenderer<"renderTruncImm32">,
+def gi_as_i32timm : GICustomOperandRenderer<"renderTruncTImm32">,
GISDNodeXFormEquiv<as_i32timm>;
-def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm">,
+def gi_as_i16timm : GICustomOperandRenderer<"renderTruncTImm16">,
GISDNodeXFormEquiv<as_i16timm>;
+def gi_as_i8timm : GICustomOperandRenderer<"renderTruncTImm8">,
+ GISDNodeXFormEquiv<as_i8timm>;
+
+def gi_as_i1timm : GICustomOperandRenderer<"renderTruncTImm1">,
+ GISDNodeXFormEquiv<as_i1timm>;
+
def gi_NegateImm : GICustomOperandRenderer<"renderNegateImm">,
GISDNodeXFormEquiv<NegateImm>;
@@ -220,3 +300,15 @@ def gi_bitcast_fpimm_to_i32 : GICustomOperandRenderer<"renderBitcastImm">,
def gi_IMMPopCount : GICustomOperandRenderer<"renderPopcntImm">,
GISDNodeXFormEquiv<IMMPopCount>;
+
+def gi_extract_glc : GICustomOperandRenderer<"renderExtractGLC">,
+ GISDNodeXFormEquiv<extract_glc>;
+
+def gi_extract_slc : GICustomOperandRenderer<"renderExtractSLC">,
+ GISDNodeXFormEquiv<extract_slc>;
+
+def gi_extract_dlc : GICustomOperandRenderer<"renderExtractDLC">,
+ GISDNodeXFormEquiv<extract_dlc>;
+
+def gi_extract_swz : GICustomOperandRenderer<"renderExtractSWZ">,
+ GISDNodeXFormEquiv<extract_swz>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
index 2e92ae51660b..600b351f9ea1 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGenRegisterBankInfo.def
@@ -132,7 +132,8 @@ const RegisterBankInfo::PartialMapping SGPROnly64BreakDown[] {
};
-// For some instructions which can operate 64-bit only for the scalar version.
+// For instructions that can operate on 64 bits only in the scalar version.
+// Otherwise, these need to be split into 2 32-bit operations.
const RegisterBankInfo::ValueMapping ValMappingsSGPR64OnlyVGPR32[] {
/*32-bit sgpr*/ {&SGPROnly64BreakDown[0], 1},
/*2 x 32-bit sgpr*/ {&SGPROnly64BreakDown[1], 2},
@@ -207,75 +208,16 @@ const RegisterBankInfo::ValueMapping *getValueMappingSGPR64Only(unsigned BankID,
return &ValMappingsSGPR64OnlyVGPR32[2];
}
-const RegisterBankInfo::PartialMapping LoadSGPROnlyBreakDown[] {
- /* 256-bit load */ {0, 256, SGPRRegBank},
- /* 512-bit load */ {0, 512, SGPRRegBank},
- /* 8 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
- {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
- {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
- {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
- /* 16 32-bit loads */ {0, 32, VGPRRegBank}, {32, 32, VGPRRegBank},
- {64, 32, VGPRRegBank}, {96, 32, VGPRRegBank},
- {128, 32, VGPRRegBank}, {160, 32, VGPRRegBank},
- {192, 32, VGPRRegBank}, {224, 32, VGPRRegBank},
- {256, 32, VGPRRegBank}, {288, 32, VGPRRegBank},
- {320, 32, VGPRRegBank}, {352, 32, VGPRRegBank},
- {384, 32, VGPRRegBank}, {416, 32, VGPRRegBank},
- {448, 32, VGPRRegBank}, {480, 32, VGPRRegBank},
- /* 4 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
- {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
- /* 8 64-bit loads */ {0, 64, VGPRRegBank}, {64, 64, VGPRRegBank},
- {128, 64, VGPRRegBank}, {192, 64, VGPRRegBank},
- {256, 64, VGPRRegBank}, {320, 64, VGPRRegBank},
- {384, 64, VGPRRegBank}, {448, 64, VGPRRegBank},
-
- /* FIXME: The generic register bank select does not support complex
- * break downs where the number of vector elements does not equal the
- * number of breakdowns.
- * FIXME: register bank select now tries to handle complex break downs,
- * but it emits an illegal instruction:
- * %1:vgpr(<8 x s32>) = G_CONCAT_VECTORS %2:vgpr(s128), %3:vgpr(s128)
- */
- /* 2 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
- /* 4 128-bit loads */ {0, 128, VGPRRegBank}, {128, 128, VGPRRegBank},
- {256, 128, VGPRRegBank}, {384, 128, VGPRRegBank}
-};
-
-const RegisterBankInfo::ValueMapping ValMappingsLoadSGPROnly[] {
- /* 256-bit load */ {&LoadSGPROnlyBreakDown[0], 1},
- /* 512-bit load */ {&LoadSGPROnlyBreakDown[1], 1},
- /* <8 x i32> load */ {&LoadSGPROnlyBreakDown[2], 8},
- /* <16 x i32> load */ {&LoadSGPROnlyBreakDown[10], 16},
- /* <4 x i64> load */ {&LoadSGPROnlyBreakDown[26], 4},
- /* <8 x i64> load */ {&LoadSGPROnlyBreakDown[30], 8}
-};
-
-const RegisterBankInfo::ValueMapping *
-getValueMappingLoadSGPROnly(unsigned BankID, LLT SizeTy) {
- unsigned Size = SizeTy.getSizeInBits();
- if (Size < 256 || BankID == AMDGPU::SGPRRegBankID)
- return getValueMapping(BankID, Size);
-
- assert((Size == 256 || Size == 512) && BankID == AMDGPU::VGPRRegBankID);
-
- // Default to using the non-split ValueMappings, we will use these if
- // the register bank is SGPR or if we don't know how to handle the vector
- // type.
- unsigned Idx = Size == 256 ? 0 : 1;
-
- // We need to split this load if it has a vgpr pointer.
- if (BankID == AMDGPU::VGPRRegBankID) {
- if (SizeTy == LLT::vector(8, 32))
- Idx = 2;
- else if (SizeTy == LLT::vector(16, 32))
- Idx = 3;
- else if (SizeTy == LLT::vector(4, 64))
- Idx = 4;
- else if (SizeTy == LLT::vector(8, 64))
- Idx = 5;
- }
+/// Split any 64-bit value into 2 32-bit pieces. Unlike
+/// getValueMappingSGPR64Only, this splits both VGPRs and SGPRs.
+const RegisterBankInfo::ValueMapping *getValueMappingSplit64(unsigned BankID,
+ unsigned Size) {
+ assert(Size == 64);
+ if (BankID == AMDGPU::VGPRRegBankID)
+ return &ValMappingsSGPR64OnlyVGPR32[4];
- return &ValMappingsLoadSGPROnly[Idx];
+ assert(BankID == AMDGPU::SGPRRegBankID);
+ return &ValMappingsSGPR64OnlyVGPR32[1];
}
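
As an illustration of the shape of these tables: a ValueMapping is a pointer to one or more PartialMapping entries (as in the SGPROnly64BreakDown table at the top of this hunk), and a split 64-bit value is simply two 32-bit pieces on the same bank. A minimal standalone sketch, using hypothetical stand-in structs rather than the real RegisterBankInfo types:

#include <cassert>

// Hypothetical minimal mirror of a partial register-bank mapping: a contiguous
// bit range [StartIdx, StartIdx + Length) assigned to a named bank.
struct PartialMapping { unsigned StartIdx, Length; const char *Bank; };
// A value mapping is just a list of such pieces.
struct ValueMapping { const PartialMapping *Parts; unsigned NumParts; };

// A 64-bit VGPR value split into two adjacent 32-bit VGPR pieces.
static const PartialMapping Vgpr64Split[] = {{0, 32, "VGPR"}, {32, 32, "VGPR"}};
static const ValueMapping SplitVgpr64 = {Vgpr64Split, 2};

int main() {
  // The two pieces tile the full 64 bits with no gap.
  assert(SplitVgpr64.NumParts == 2);
  assert(SplitVgpr64.Parts[0].StartIdx + SplitVgpr64.Parts[0].Length ==
         SplitVgpr64.Parts[1].StartIdx);
  return 0;
}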
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
index 16d7f2c4f9e5..989937a597fb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.cpp
@@ -43,3 +43,12 @@ AMDGPU::getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg) {
return std::make_tuple(Reg, 0, Def);
}
+
+bool AMDGPU::isLegalVOP3PShuffleMask(ArrayRef<int> Mask) {
+ assert(Mask.size() == 2);
+
+ // If one half is undef, the other is trivially in the same reg.
+ if (Mask[0] == -1 || Mask[1] == -1)
+ return true;
+ return (Mask[0] & 2) == (Mask[1] & 2);
+}
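
For illustration, the check above reduces to requiring that both lanes of the two-element shuffle mask read from the same source register: mask indices 0-1 select from the first v2i16 input and 2-3 from the second, so bit 1 of the index identifies the register (presumably so the shuffle can be folded into a single VOP3P source). A self-contained sketch of the same predicate, using std::vector in place of ArrayRef:

#include <cassert>
#include <vector>

// Mirrors the isLegalVOP3PShuffleMask check above for a 2-lane v2i16 shuffle.
// -1 means an undef lane; (Idx & 2) identifies which source register a lane
// reads, so both defined lanes must come from the same register.
static bool isLegalVOP3PShuffleMask(const std::vector<int> &Mask) {
  assert(Mask.size() == 2);
  if (Mask[0] == -1 || Mask[1] == -1)
    return true; // One half undef: the other is trivially in the same reg.
  return (Mask[0] & 2) == (Mask[1] & 2);
}

int main() {
  assert(isLegalVOP3PShuffleMask({0, 1}));  // identity, first source only
  assert(isLegalVOP3PShuffleMask({3, 2}));  // swapped halves of second source
  assert(isLegalVOP3PShuffleMask({-1, 2})); // undef lane is always fine
  assert(!isLegalVOP3PShuffleMask({1, 2})); // mixes the two source registers
  return 0;
}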
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
index 1507ade79547..766750758efc 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUGlobalISelUtils.h
@@ -9,6 +9,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUGLOBALISELUTILS_H
+#include "AMDGPUInstrInfo.h"
#include "llvm/CodeGen/Register.h"
#include <tuple>
@@ -23,6 +24,38 @@ namespace AMDGPU {
std::tuple<Register, unsigned, MachineInstr *>
getBaseWithConstantOffset(MachineRegisterInfo &MRI, Register Reg);
+bool isLegalVOP3PShuffleMask(ArrayRef<int> Mask);
+
+/// Return the number of address arguments and the number of gradients for an
+/// image intrinsic.
+inline std::pair<int, int>
+getImageNumVAddr(const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr,
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode) {
+ const AMDGPU::MIMGDimInfo *DimInfo
+ = AMDGPU::getMIMGDimInfo(ImageDimIntr->Dim);
+
+ int NumGradients = BaseOpcode->Gradients ? DimInfo->NumGradients : 0;
+ int NumCoords = BaseOpcode->Coordinates ? DimInfo->NumCoords : 0;
+ int NumLCM = BaseOpcode->LodOrClampOrMip ? 1 : 0;
+ int NumVAddr = BaseOpcode->NumExtraArgs + NumGradients + NumCoords + NumLCM;
+ return {NumVAddr, NumGradients};
+}
+
+/// Return the index of dmask in a gMIR image intrinsic.
+inline int getDMaskIdx(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+ int NumDefs) {
+ assert(!BaseOpcode->Atomic);
+ return NumDefs + 1 + (BaseOpcode->Store ? 1 : 0);
+}
+
+/// Return first address operand index in a gMIR image intrinsic.
+inline int getImageVAddrIdxBegin(const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode,
+ int NumDefs) {
+ if (BaseOpcode->Atomic)
+ return NumDefs + 1 + (BaseOpcode->AtomicX2 ? 2 : 1);
+ return getDMaskIdx(BaseOpcode, NumDefs) + 1;
+}
+
}
}
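
As a worked example of the operand-index arithmetic above (the extra +1 presumably accounts for the intrinsic-ID operand of the generic intrinsic instruction): a non-atomic image load with one def puts dmask at index 1 + 1 + 0 = 2 and the first address operand at 3; an image store (no defs) puts dmask at 0 + 1 + 1 = 2; a non-X2 atomic with one def starts its addresses at 1 + 1 + 1 = 3. A standalone sketch of the same formulas, with plain flags in place of MIMGBaseOpcodeInfo:

#include <cassert>

// Plain-integer mirror of getDMaskIdx / getImageVAddrIdxBegin above.
static int dmaskIdx(bool IsStore, int NumDefs) {
  // Defs, then (presumably) the intrinsic ID, then the store's data operand.
  return NumDefs + 1 + (IsStore ? 1 : 0);
}

static int vaddrBegin(bool IsAtomic, bool IsAtomicX2, bool IsStore,
                      int NumDefs) {
  if (IsAtomic)
    return NumDefs + 1 + (IsAtomicX2 ? 2 : 1); // skip the atomic data operands
  return dmaskIdx(IsStore, NumDefs) + 1;       // addresses follow dmask
}

int main() {
  assert(dmaskIdx(/*IsStore=*/false, /*NumDefs=*/1) == 2);  // image load
  assert(vaddrBegin(false, false, /*IsStore=*/false, 1) == 3);
  assert(dmaskIdx(/*IsStore=*/true, /*NumDefs=*/0) == 2);   // image store
  assert(vaddrBegin(/*IsAtomic=*/true, /*IsAtomicX2=*/false,
                    /*IsStore=*/false, /*NumDefs=*/1) == 3);
  return 0;
}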
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
index 511d62943189..c6f6a3b84e36 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.cpp
@@ -47,7 +47,7 @@ void MetadataStreamerV2::verify(StringRef HSAMetadataString) const {
errs() << "AMDGPU HSA Metadata Parser Test: ";
HSAMD::Metadata FromHSAMetadataString;
- if (fromString(HSAMetadataString, FromHSAMetadataString)) {
+ if (fromString(std::string(HSAMetadataString), FromHSAMetadataString)) {
errs() << "FAIL\n";
return;
}
@@ -127,38 +127,6 @@ ValueKind MetadataStreamerV2::getValueKind(Type *Ty, StringRef TypeQual,
ValueKind::ByValue);
}
-ValueType MetadataStreamerV2::getValueType(Type *Ty, StringRef TypeName) const {
- switch (Ty->getTypeID()) {
- case Type::IntegerTyID: {
- auto Signed = !TypeName.startswith("u");
- switch (Ty->getIntegerBitWidth()) {
- case 8:
- return Signed ? ValueType::I8 : ValueType::U8;
- case 16:
- return Signed ? ValueType::I16 : ValueType::U16;
- case 32:
- return Signed ? ValueType::I32 : ValueType::U32;
- case 64:
- return Signed ? ValueType::I64 : ValueType::U64;
- default:
- return ValueType::Struct;
- }
- }
- case Type::HalfTyID:
- return ValueType::F16;
- case Type::FloatTyID:
- return ValueType::F32;
- case Type::DoubleTyID:
- return ValueType::F64;
- case Type::PointerTyID:
- return getValueType(Ty->getPointerElementType(), TypeName);
- case Type::VectorTyID:
- return getValueType(Ty->getVectorElementType(), TypeName);
- default:
- return ValueType::Struct;
- }
-}
-
std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
@@ -185,10 +153,10 @@ std::string MetadataStreamerV2::getTypeName(Type *Ty, bool Signed) const {
return "float";
case Type::DoubleTyID:
return "double";
- case Type::VectorTyID: {
- auto VecTy = cast<VectorType>(Ty);
+ case Type::FixedVectorTyID: {
+ auto VecTy = cast<FixedVectorType>(Ty);
auto ElTy = VecTy->getElementType();
- auto NumElements = VecTy->getVectorNumElements();
+ auto NumElements = VecTy->getNumElements();
return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
}
default:
@@ -259,7 +227,8 @@ void MetadataStreamerV2::emitPrintf(const Module &Mod) {
for (auto Op : Node->operands())
if (Op->getNumOperands())
- Printf.push_back(cast<MDString>(Op->getOperand(0))->getString());
+ Printf.push_back(
+ std::string(cast<MDString>(Op->getOperand(0))->getString()));
}
void MetadataStreamerV2::emitKernelLanguage(const Function &Func) {
@@ -345,12 +314,11 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
Type *Ty = Arg.getType();
const DataLayout &DL = Func->getParent()->getDataLayout();
- unsigned PointeeAlign = 0;
+ MaybeAlign PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- PointeeAlign = Arg.getParamAlignment();
- if (PointeeAlign == 0)
- PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
+ PtrTy->getElementType());
}
}
@@ -360,20 +328,19 @@ void MetadataStreamerV2::emitKernelArg(const Argument &Arg) {
void MetadataStreamerV2::emitKernelArg(const DataLayout &DL, Type *Ty,
ValueKind ValueKind,
- unsigned PointeeAlign, StringRef Name,
+ MaybeAlign PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
HSAMetadata.mKernels.back().mArgs.push_back(Kernel::Arg::Metadata());
auto &Arg = HSAMetadata.mKernels.back().mArgs.back();
- Arg.mName = Name;
- Arg.mTypeName = TypeName;
+ Arg.mName = std::string(Name);
+ Arg.mTypeName = std::string(TypeName);
Arg.mSize = DL.getTypeAllocSize(Ty);
- Arg.mAlign = DL.getABITypeAlignment(Ty);
+ Arg.mAlign = DL.getABITypeAlign(Ty).value();
Arg.mValueKind = ValueKind;
- Arg.mValueType = getValueType(Ty, BaseTypeName);
- Arg.mPointeeAlign = PointeeAlign;
+ Arg.mPointeeAlign = PointeeAlign ? PointeeAlign->value() : 0;
if (auto PtrTy = dyn_cast<PointerType>(Ty))
Arg.mAddrSpaceQual = getAddressSpaceQualifier(PtrTy->getAddressSpace());
@@ -479,7 +446,7 @@ void MetadataStreamerV2::emitKernel(const MachineFunction &MF,
HSAMetadata.mKernels.push_back(Kernel::Metadata());
auto &Kernel = HSAMetadata.mKernels.back();
- Kernel.mName = Func.getName();
+ Kernel.mName = std::string(Func.getName());
Kernel.mSymbolName = (Twine(Func.getName()) + Twine("@kd")).str();
emitKernelLanguage(Func);
emitKernelAttrs(Func);
@@ -573,38 +540,6 @@ StringRef MetadataStreamerV3::getValueKind(Type *Ty, StringRef TypeQual,
: "by_value");
}
-StringRef MetadataStreamerV3::getValueType(Type *Ty, StringRef TypeName) const {
- switch (Ty->getTypeID()) {
- case Type::IntegerTyID: {
- auto Signed = !TypeName.startswith("u");
- switch (Ty->getIntegerBitWidth()) {
- case 8:
- return Signed ? "i8" : "u8";
- case 16:
- return Signed ? "i16" : "u16";
- case 32:
- return Signed ? "i32" : "u32";
- case 64:
- return Signed ? "i64" : "u64";
- default:
- return "struct";
- }
- }
- case Type::HalfTyID:
- return "f16";
- case Type::FloatTyID:
- return "f32";
- case Type::DoubleTyID:
- return "f64";
- case Type::PointerTyID:
- return getValueType(Ty->getPointerElementType(), TypeName);
- case Type::VectorTyID:
- return getValueType(Ty->getVectorElementType(), TypeName);
- default:
- return "struct";
- }
-}
-
std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
switch (Ty->getTypeID()) {
case Type::IntegerTyID: {
@@ -631,10 +566,10 @@ std::string MetadataStreamerV3::getTypeName(Type *Ty, bool Signed) const {
return "float";
case Type::DoubleTyID:
return "double";
- case Type::VectorTyID: {
- auto VecTy = cast<VectorType>(Ty);
+ case Type::FixedVectorTyID: {
+ auto VecTy = cast<FixedVectorType>(Ty);
auto ElTy = VecTy->getElementType();
- auto NumElements = VecTy->getVectorNumElements();
+ auto NumElements = VecTy->getNumElements();
return (Twine(getTypeName(ElTy, Signed)) + Twine(NumElements)).str();
}
default:
@@ -767,12 +702,11 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
Type *Ty = Arg.getType();
const DataLayout &DL = Func->getParent()->getDataLayout();
- unsigned PointeeAlign = 0;
+ MaybeAlign PointeeAlign;
if (auto PtrTy = dyn_cast<PointerType>(Ty)) {
if (PtrTy->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS) {
- PointeeAlign = Arg.getParamAlignment();
- if (PointeeAlign == 0)
- PointeeAlign = DL.getABITypeAlignment(PtrTy->getElementType());
+ PointeeAlign = DL.getValueOrABITypeAlignment(Arg.getParamAlign(),
+ PtrTy->getElementType());
}
}
@@ -785,7 +719,7 @@ void MetadataStreamerV3::emitKernelArg(const Argument &Arg, unsigned &Offset,
void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
StringRef ValueKind, unsigned &Offset,
msgpack::ArrayDocNode Args,
- unsigned PointeeAlign, StringRef Name,
+ MaybeAlign PointeeAlign, StringRef Name,
StringRef TypeName,
StringRef BaseTypeName,
StringRef AccQual, StringRef TypeQual) {
@@ -796,16 +730,14 @@ void MetadataStreamerV3::emitKernelArg(const DataLayout &DL, Type *Ty,
if (!TypeName.empty())
Arg[".type_name"] = Arg.getDocument()->getNode(TypeName, /*Copy=*/true);
auto Size = DL.getTypeAllocSize(Ty);
- auto Align = DL.getABITypeAlignment(Ty);
+ Align Alignment = DL.getABITypeAlign(Ty);
Arg[".size"] = Arg.getDocument()->getNode(Size);
- Offset = alignTo(Offset, Align);
+ Offset = alignTo(Offset, Alignment);
Arg[".offset"] = Arg.getDocument()->getNode(Offset);
Offset += Size;
Arg[".value_kind"] = Arg.getDocument()->getNode(ValueKind, /*Copy=*/true);
- Arg[".value_type"] =
- Arg.getDocument()->getNode(getValueType(Ty, BaseTypeName), /*Copy=*/true);
if (PointeeAlign)
- Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign);
+ Arg[".pointee_align"] = Arg.getDocument()->getNode(PointeeAlign->value());
if (auto PtrTy = dyn_cast<PointerType>(Ty))
if (auto Qualifier = getAddressSpaceQualifier(PtrTy->getAddressSpace()))
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
index 80ac8ca67bcd..9534fffd228d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUHSAMetadataStreamer.h
@@ -20,6 +20,7 @@
#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
#include "llvm/Support/AMDGPUMetadata.h"
+#include "llvm/Support/Alignment.h"
namespace llvm {
@@ -27,6 +28,7 @@ class AMDGPUTargetStreamer;
class Argument;
class DataLayout;
class Function;
+class MachineFunction;
class MDNode;
class Module;
struct SIProgramInfo;
@@ -65,8 +67,6 @@ private:
StringRef getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const;
- StringRef getValueType(Type *Ty, StringRef TypeName) const;
-
std::string getTypeName(Type *Ty, bool Signed) const;
msgpack::ArrayDocNode getWorkGroupDimensions(MDNode *Node) const;
@@ -89,7 +89,7 @@ private:
void emitKernelArg(const DataLayout &DL, Type *Ty, StringRef ValueKind,
unsigned &Offset, msgpack::ArrayDocNode Args,
- unsigned PointeeAlign = 0, StringRef Name = "",
+ MaybeAlign PointeeAlign = None, StringRef Name = "",
StringRef TypeName = "", StringRef BaseTypeName = "",
StringRef AccQual = "", StringRef TypeQual = "");
@@ -133,8 +133,6 @@ private:
ValueKind getValueKind(Type *Ty, StringRef TypeQual,
StringRef BaseTypeName) const;
- ValueType getValueType(Type *Ty, StringRef TypeName) const;
-
std::string getTypeName(Type *Ty, bool Signed) const;
std::vector<uint32_t> getWorkGroupDimensions(MDNode *Node) const;
@@ -159,10 +157,9 @@ private:
void emitKernelArg(const Argument &Arg);
void emitKernelArg(const DataLayout &DL, Type *Ty, ValueKind ValueKind,
- unsigned PointeeAlign = 0,
- StringRef Name = "", StringRef TypeName = "",
- StringRef BaseTypeName = "", StringRef AccQual = "",
- StringRef TypeQual = "");
+ MaybeAlign PointeeAlign = None, StringRef Name = "",
+ StringRef TypeName = "", StringRef BaseTypeName = "",
+ StringRef AccQual = "", StringRef TypeQual = "");
void emitHiddenKernelArgs(const Function &Func);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
index 2b6308dc1549..aaf448346b53 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelDAGToDAG.cpp
@@ -16,7 +16,6 @@
#include "AMDGPUISelLowering.h" // For AMDGPUISD
#include "AMDGPUInstrInfo.h"
#include "AMDGPUPerfHintAnalysis.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -29,6 +28,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/CodeGen/FunctionLoweringInfo.h"
#include "llvm/CodeGen/ISDOpcodes.h"
@@ -252,7 +252,6 @@ private:
bool SelectMOVRELOffset(SDValue Index, SDValue &Base, SDValue &Offset) const;
bool SelectVOP3Mods_NNaN(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3Mods_f32(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3ModsImpl(SDValue In, SDValue &Src, unsigned &SrcMods) const;
bool SelectVOP3Mods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
bool SelectVOP3NoMods(SDValue In, SDValue &Src) const;
@@ -265,16 +264,10 @@ private:
SDValue &Clamp, SDValue &Omod) const;
bool SelectVOP3PMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3PMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3OpSel(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3OpSel0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3OpSelMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
- bool SelectVOP3OpSelMods0(SDValue In, SDValue &Src, SDValue &SrcMods,
- SDValue &Clamp) const;
bool SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src, unsigned &Mods) const;
bool SelectVOP3PMadMixMods(SDValue In, SDValue &Src, SDValue &SrcMods) const;
@@ -286,7 +279,6 @@ private:
void SelectAddcSubb(SDNode *N);
void SelectUADDO_USUBO(SDNode *N);
void SelectDIV_SCALE(SDNode *N);
- void SelectDIV_FMAS(SDNode *N);
void SelectMAD_64_32(SDNode *N);
void SelectFMA_W_CHAIN(SDNode *N);
void SelectFMUL_W_CHAIN(SDNode *N);
@@ -301,6 +293,7 @@ private:
void SelectATOMIC_CMP_SWAP(SDNode *N);
void SelectDSAppendConsume(SDNode *N, unsigned IntrID);
void SelectDS_GWS(SDNode *N, unsigned IntrID);
+ void SelectInterpP1F16(SDNode *N);
void SelectINTRINSIC_W_CHAIN(SDNode *N);
void SelectINTRINSIC_WO_CHAIN(SDNode *N);
void SelectINTRINSIC_VOID(SDNode *N);
@@ -409,7 +402,7 @@ bool AMDGPUDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
}
#endif
Subtarget = &MF.getSubtarget<GCNSubtarget>();
- Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction(), *Subtarget);
+ Mode = AMDGPU::SIModeRegisterDefaults(MF.getFunction());
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -655,29 +648,6 @@ MachineSDNode *AMDGPUDAGToDAGISel::buildSMovImm64(SDLoc &DL, uint64_t Imm,
return CurDAG->getMachineNode(TargetOpcode::REG_SEQUENCE, DL, VT, Ops);
}
-static unsigned selectSGPRVectorRegClassID(unsigned NumVectorElts) {
- switch (NumVectorElts) {
- case 1:
- return AMDGPU::SReg_32RegClassID;
- case 2:
- return AMDGPU::SReg_64RegClassID;
- case 3:
- return AMDGPU::SGPR_96RegClassID;
- case 4:
- return AMDGPU::SGPR_128RegClassID;
- case 5:
- return AMDGPU::SGPR_160RegClassID;
- case 8:
- return AMDGPU::SReg_256RegClassID;
- case 16:
- return AMDGPU::SReg_512RegClassID;
- case 32:
- return AMDGPU::SReg_1024RegClassID;
- }
-
- llvm_unreachable("invalid vector size");
-}
-
void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
EVT VT = N->getValueType(0);
unsigned NumVectorElts = VT.getVectorNumElements();
@@ -698,6 +668,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
// 1 = Vector Register Class
SmallVector<SDValue, 32 * 2 + 1> RegSeqArgs(NumVectorElts * 2 + 1);
+ bool IsGCN = CurDAG->getSubtarget().getTargetTriple().getArch() ==
+ Triple::amdgcn;
RegSeqArgs[0] = CurDAG->getTargetConstant(RegClassID, DL, MVT::i32);
bool IsRegSeq = true;
unsigned NOps = N->getNumOperands();
@@ -707,7 +679,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
IsRegSeq = false;
break;
}
- unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = N->getOperand(i);
RegSeqArgs[1 + (2 * i) + 1] = CurDAG->getTargetConstant(Sub, DL, MVT::i32);
}
@@ -717,7 +690,8 @@ void AMDGPUDAGToDAGISel::SelectBuildVector(SDNode *N, unsigned RegClassID) {
MachineSDNode *ImpDef = CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF,
DL, EltVT);
for (unsigned i = NOps; i < NumVectorElts; ++i) {
- unsigned Sub = AMDGPURegisterInfo::getSubRegFromChannel(i);
+ unsigned Sub = IsGCN ? SIRegisterInfo::getSubRegFromChannel(i)
+ : R600RegisterInfo::getSubRegFromChannel(i);
RegSeqArgs[1 + (2 * i)] = SDValue(ImpDef, 0);
RegSeqArgs[1 + (2 * i) + 1] =
CurDAG->getTargetConstant(Sub, DL, MVT::i32);
@@ -742,7 +716,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
(Opc == AMDGPUISD::ATOMIC_INC || Opc == AMDGPUISD::ATOMIC_DEC ||
Opc == ISD::ATOMIC_LOAD_FADD ||
Opc == AMDGPUISD::ATOMIC_LOAD_FMIN ||
- Opc == AMDGPUISD::ATOMIC_LOAD_FMAX)) {
+ Opc == AMDGPUISD::ATOMIC_LOAD_FMAX ||
+ Opc == AMDGPUISD::ATOMIC_LOAD_CSUB)) {
N = glueCopyToM0LDSInit(N);
SelectCode(N);
return;
@@ -801,7 +776,8 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
}
assert(VT.getVectorElementType().bitsEq(MVT::i32));
- unsigned RegClassID = selectSGPRVectorRegClassID(NumVectorElts);
+ unsigned RegClassID =
+ SIRegisterInfo::getSGPRClassForBitWidth(NumVectorElts * 32)->getID();
SelectBuildVector(N, RegClassID);
return;
}
@@ -874,10 +850,6 @@ void AMDGPUDAGToDAGISel::Select(SDNode *N) {
SelectDIV_SCALE(N);
return;
}
- case AMDGPUISD::DIV_FMAS: {
- SelectDIV_FMAS(N);
- return;
- }
case AMDGPUISD::MAD_I64_I32:
case AMDGPUISD::MAD_U64_U32: {
SelectMAD_64_32(N);
@@ -1020,8 +992,14 @@ void AMDGPUDAGToDAGISel::SelectADD_SUB_I64(SDNode *N) {
SDVTList VTList = CurDAG->getVTList(MVT::i32, MVT::Glue);
- unsigned Opc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
- unsigned CarryOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
+ static const unsigned OpcMap[2][2][2] = {
+ {{AMDGPU::S_SUB_U32, AMDGPU::S_ADD_U32},
+ {AMDGPU::V_SUB_I32_e32, AMDGPU::V_ADD_I32_e32}},
+ {{AMDGPU::S_SUBB_U32, AMDGPU::S_ADDC_U32},
+ {AMDGPU::V_SUBB_U32_e32, AMDGPU::V_ADDC_U32_e32}}};
+
+ unsigned Opc = OpcMap[0][N->isDivergent()][IsAdd];
+ unsigned CarryOpc = OpcMap[1][N->isDivergent()][IsAdd];
SDNode *AddLo;
if (!ConsumeCarry) {
@@ -1063,24 +1041,51 @@ void AMDGPUDAGToDAGISel::SelectAddcSubb(SDNode *N) {
SDValue RHS = N->getOperand(1);
SDValue CI = N->getOperand(2);
- unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
- : AMDGPU::V_SUBB_U32_e64;
- CurDAG->SelectNodeTo(
- N, Opc, N->getVTList(),
- {LHS, RHS, CI, CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ if (N->isDivergent()) {
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {LHS, RHS, CI,
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ } else {
+ unsigned Opc = N->getOpcode() == ISD::ADDCARRY ? AMDGPU::S_ADD_CO_PSEUDO
+ : AMDGPU::S_SUB_CO_PSEUDO;
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(), {LHS, RHS, CI});
+ }
}
void AMDGPUDAGToDAGISel::SelectUADDO_USUBO(SDNode *N) {
// The names of the opcodes are misleading. v_add_i32/v_sub_i32 have unsigned
// carry out despite the _i32 name. These were renamed in VI to _U32.
// FIXME: We should probably rename the opcodes here.
- unsigned Opc = N->getOpcode() == ISD::UADDO ?
- AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ bool IsAdd = N->getOpcode() == ISD::UADDO;
+ bool IsVALU = N->isDivergent();
+
+ for (SDNode::use_iterator UI = N->use_begin(), E = N->use_end(); UI != E;
+ ++UI)
+ if (UI.getUse().getResNo() == 1) {
+ if ((IsAdd && (UI->getOpcode() != ISD::ADDCARRY)) ||
+ (!IsAdd && (UI->getOpcode() != ISD::SUBCARRY))) {
+ IsVALU = true;
+ break;
+ }
+ }
+
+ if (IsVALU) {
+ unsigned Opc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+
+ CurDAG->SelectNodeTo(
+ N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1),
+ CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ } else {
+ unsigned Opc = N->getOpcode() == ISD::UADDO ? AMDGPU::S_UADDO_PSEUDO
+ : AMDGPU::S_USUBO_PSEUDO;
- CurDAG->SelectNodeTo(
- N, Opc, N->getVTList(),
- {N->getOperand(0), N->getOperand(1),
- CurDAG->getTargetConstant(0, {}, MVT::i1) /*clamp bit*/});
+ CurDAG->SelectNodeTo(N, Opc, N->getVTList(),
+ {N->getOperand(0), N->getOperand(1)});
+ }
}
void AMDGPUDAGToDAGISel::SelectFMA_W_CHAIN(SDNode *N) {
@@ -1125,35 +1130,6 @@ void AMDGPUDAGToDAGISel::SelectDIV_SCALE(SDNode *N) {
CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
}
-void AMDGPUDAGToDAGISel::SelectDIV_FMAS(SDNode *N) {
- const GCNSubtarget *ST = static_cast<const GCNSubtarget *>(Subtarget);
- const SIRegisterInfo *TRI = ST->getRegisterInfo();
-
- SDLoc SL(N);
- EVT VT = N->getValueType(0);
-
- assert(VT == MVT::f32 || VT == MVT::f64);
-
- unsigned Opc
- = (VT == MVT::f64) ? AMDGPU::V_DIV_FMAS_F64 : AMDGPU::V_DIV_FMAS_F32;
-
- SDValue CarryIn = N->getOperand(3);
- // V_DIV_FMAS implicitly reads VCC.
- SDValue VCC = CurDAG->getCopyToReg(CurDAG->getEntryNode(), SL,
- TRI->getVCC(), CarryIn, SDValue());
-
- SDValue Ops[10];
-
- SelectVOP3Mods0(N->getOperand(0), Ops[1], Ops[0], Ops[6], Ops[7]);
- SelectVOP3Mods(N->getOperand(1), Ops[3], Ops[2]);
- SelectVOP3Mods(N->getOperand(2), Ops[5], Ops[4]);
-
- Ops[8] = VCC;
- Ops[9] = VCC.getValue(1);
-
- CurDAG->SelectNodeTo(N, Opc, N->getVTList(), Ops);
-}
-
// We need to handle this here because tablegen doesn't support matching
// instructions with multiple outputs.
void AMDGPUDAGToDAGISel::SelectMAD_64_32(SDNode *N) {
@@ -1343,6 +1319,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUF(SDValue Addr, SDValue &Ptr,
SDValue &TFE, SDValue &DLC,
SDValue &SWZ) const {
// Subtarget prefers to use flat instruction
+ // FIXME: This should be a pattern predicate and not reach here
if (Subtarget->useFlatForGlobal())
return false;
@@ -1438,6 +1415,7 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFAddr64(SDValue Addr, SDValue &SRsrc,
SDValue Ptr, Offen, Idxen, Addr64;
// addr64 bit was removed for volcanic islands.
+ // FIXME: This should be a pattern predicate and not reach here
if (!Subtarget->hasAddr64())
return false;
@@ -1475,6 +1453,7 @@ static bool isStackPtrRelative(const MachinePointerInfo &PtrInfo) {
}
std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const {
+ SDLoc DL(N);
const MachineFunction &MF = CurDAG->getMachineFunction();
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
@@ -1489,9 +1468,8 @@ std::pair<SDValue, SDValue> AMDGPUDAGToDAGISel::foldFrameIndex(SDValue N) const
}
// If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset register.
- return std::make_pair(N, CurDAG->getRegister(Info->getScratchWaveOffsetReg(),
- MVT::i32));
+ // be relative to the entry point's scratch wave offset.
+ return std::make_pair(N, CurDAG->getTargetConstant(0, DL, MVT::i32));
}
bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
@@ -1506,22 +1484,26 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffen(SDNode *Parent,
Rsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
if (ConstantSDNode *CAddr = dyn_cast<ConstantSDNode>(Addr)) {
- unsigned Imm = CAddr->getZExtValue();
-
- SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
- MachineSDNode *MovHighBits = CurDAG->getMachineNode(AMDGPU::V_MOV_B32_e32,
- DL, MVT::i32, HighBits);
- VAddr = SDValue(MovHighBits, 0);
-
- // In a call sequence, stores to the argument stack area are relative to the
- // stack pointer.
- const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
- unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
- Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
-
- SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
- ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
- return true;
+ int64_t Imm = CAddr->getSExtValue();
+ const int64_t NullPtr =
+ AMDGPUTargetMachine::getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS);
+ // Don't fold null pointer.
+ if (Imm != NullPtr) {
+ SDValue HighBits = CurDAG->getTargetConstant(Imm & ~4095, DL, MVT::i32);
+ MachineSDNode *MovHighBits = CurDAG->getMachineNode(
+ AMDGPU::V_MOV_B32_e32, DL, MVT::i32, HighBits);
+ VAddr = SDValue(MovHighBits, 0);
+
+ // In a call sequence, stores to the argument stack area are relative to the
+ // stack pointer.
+ const MachinePointerInfo &PtrInfo
+ = cast<MemSDNode>(Parent)->getPointerInfo();
+ SOffset = isStackPtrRelative(PtrInfo)
+ ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
+ ImmOffset = CurDAG->getTargetConstant(Imm & 4095, DL, MVT::i16);
+ return true;
+ }
}
if (CurDAG->isBaseWithConstantOffset(Addr)) {
@@ -1577,12 +1559,12 @@ bool AMDGPUDAGToDAGISel::SelectMUBUFScratchOffset(SDNode *Parent,
SRsrc = CurDAG->getRegister(Info->getScratchRSrcReg(), MVT::v4i32);
const MachinePointerInfo &PtrInfo = cast<MemSDNode>(Parent)->getPointerInfo();
- unsigned SOffsetReg = isStackPtrRelative(PtrInfo) ?
- Info->getStackPtrOffsetReg() : Info->getScratchWaveOffsetReg();
// FIXME: Get from MachinePointerInfo? We should only be using the frame
// offset if we know this is in a call sequence.
- SOffset = CurDAG->getRegister(SOffsetReg, MVT::i32);
+ SOffset = isStackPtrRelative(PtrInfo)
+ ? CurDAG->getRegister(Info->getStackPtrOffsetReg(), MVT::i32)
+ : CurDAG->getTargetConstant(0, DL, MVT::i32);
Offset = CurDAG->getTargetConstant(CAddr->getZExtValue(), DL, MVT::i16);
return true;
@@ -1646,6 +1628,37 @@ static MemSDNode* findMemSDNode(SDNode *N) {
llvm_unreachable("cannot find MemSDNode in the pattern!");
}
+static bool getBaseWithOffsetUsingSplitOR(SelectionDAG &DAG, SDValue Addr,
+ SDValue &N0, SDValue &N1) {
+ if (Addr.getValueType() == MVT::i64 && Addr.getOpcode() == ISD::BITCAST &&
+ Addr.getOperand(0).getOpcode() == ISD::BUILD_VECTOR) {
+ // As we split 64-bit `or` earlier, it is a complicated pattern to match, i.e.
+ // (i64 (bitcast (v2i32 (build_vector
+ // (or (extract_vector_elt V, 0), OFFSET),
+ // (extract_vector_elt V, 1)))))
+ SDValue Lo = Addr.getOperand(0).getOperand(0);
+ if (Lo.getOpcode() == ISD::OR && DAG.isBaseWithConstantOffset(Lo)) {
+ SDValue BaseLo = Lo.getOperand(0);
+ SDValue BaseHi = Addr.getOperand(0).getOperand(1);
+ // Check that split base (Lo and Hi) are extracted from the same one.
+ if (BaseLo.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseHi.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ BaseLo.getOperand(0) == BaseHi.getOperand(0) &&
+ // Lo is statically extracted from index 0.
+ isa<ConstantSDNode>(BaseLo.getOperand(1)) &&
+ BaseLo.getConstantOperandVal(1) == 0 &&
+ // Hi is statically extracted from index 1.
+ isa<ConstantSDNode>(BaseHi.getOperand(1)) &&
+ BaseHi.getConstantOperandVal(1) == 1) {
+ N0 = BaseLo.getOperand(0).getOperand(0);
+ N1 = Lo.getOperand(1);
+ return true;
+ }
+ }
+ }
+ return false;
+}
+
template <bool IsSigned>
bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
SDValue Addr,
@@ -1656,84 +1669,91 @@ bool AMDGPUDAGToDAGISel::SelectFlatOffset(SDNode *N,
if (Subtarget->hasFlatInstOffsets() &&
(!Subtarget->hasFlatSegmentOffsetBug() ||
- findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
- uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
-
- const SIInstrInfo *TII = Subtarget->getInstrInfo();
- unsigned AS = findMemSDNode(N)->getAddressSpace();
- if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
- Addr = N0;
- OffsetVal = COffsetVal;
- } else {
- // If the offset doesn't fit, put the low bits into the offset field and
- // add the rest.
-
- SDLoc DL(N);
- uint64_t ImmField;
- const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
- if (IsSigned) {
- ImmField = SignExtend64(COffsetVal, NumBits);
-
- // Don't use a negative offset field if the base offset is positive.
- // Since the scheduler currently relies on the offset field, doing so
- // could result in strange scheduling decisions.
-
- // TODO: Should we not do this in the opposite direction as well?
- if (static_cast<int64_t>(COffsetVal) > 0) {
- if (static_cast<int64_t>(ImmField) < 0) {
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits - 1);
- ImmField = COffsetVal & OffsetMask;
+ findMemSDNode(N)->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS)) {
+ SDValue N0, N1;
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ uint64_t COffsetVal = cast<ConstantSDNode>(N1)->getSExtValue();
+
+ const SIInstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned AS = findMemSDNode(N)->getAddressSpace();
+ if (TII->isLegalFLATOffset(COffsetVal, AS, IsSigned)) {
+ Addr = N0;
+ OffsetVal = COffsetVal;
+ } else {
+ // If the offset doesn't fit, put the low bits into the offset field and
+ // add the rest.
+
+ SDLoc DL(N);
+ uint64_t ImmField;
+ const unsigned NumBits = TII->getNumFlatOffsetBits(AS, IsSigned);
+ if (IsSigned) {
+ ImmField = SignExtend64(COffsetVal, NumBits);
+
+ // Don't use a negative offset field if the base offset is positive.
+ // Since the scheduler currently relies on the offset field, doing so
+ // could result in strange scheduling decisions.
+
+ // TODO: Should we not do this in the opposite direction as well?
+ if (static_cast<int64_t>(COffsetVal) > 0) {
+ if (static_cast<int64_t>(ImmField) < 0) {
+ const uint64_t OffsetMask =
+ maskTrailingOnes<uint64_t>(NumBits - 1);
+ ImmField = COffsetVal & OffsetMask;
+ }
}
+ } else {
+ // TODO: Should we do this for a negative offset?
+ const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
+ ImmField = COffsetVal & OffsetMask;
}
- } else {
- // TODO: Should we do this for a negative offset?
- const uint64_t OffsetMask = maskTrailingOnes<uint64_t>(NumBits);
- ImmField = COffsetVal & OffsetMask;
- }
- uint64_t RemainderOffset = COffsetVal - ImmField;
+ uint64_t RemainderOffset = COffsetVal - ImmField;
- assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
- assert(RemainderOffset + ImmField == COffsetVal);
+ assert(TII->isLegalFLATOffset(ImmField, AS, IsSigned));
+ assert(RemainderOffset + ImmField == COffsetVal);
- OffsetVal = ImmField;
+ OffsetVal = ImmField;
- // TODO: Should this try to use a scalar add pseudo if the base address is
- // uniform and saddr is usable?
- SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
- SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
+ // TODO: Should this try to use a scalar add pseudo if the base address
+ // is uniform and saddr is usable?
+ SDValue Sub0 = CurDAG->getTargetConstant(AMDGPU::sub0, DL, MVT::i32);
+ SDValue Sub1 = CurDAG->getTargetConstant(AMDGPU::sub1, DL, MVT::i32);
- SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub0);
- SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG,
- DL, MVT::i32, N0, Sub1);
+ SDNode *N0Lo = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub0);
+ SDNode *N0Hi = CurDAG->getMachineNode(TargetOpcode::EXTRACT_SUBREG, DL,
+ MVT::i32, N0, Sub1);
- SDValue AddOffsetLo
- = getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
- SDValue AddOffsetHi
- = getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
+ SDValue AddOffsetLo =
+ getMaterializedScalarImm32(Lo_32(RemainderOffset), DL);
+ SDValue AddOffsetHi =
+ getMaterializedScalarImm32(Hi_32(RemainderOffset), DL);
- SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
- SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i1);
+ SDValue Clamp = CurDAG->getTargetConstant(0, DL, MVT::i1);
- SDNode *Add = CurDAG->getMachineNode(
- AMDGPU::V_ADD_I32_e64, DL, VTs,
- {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
+ SDNode *Add =
+ CurDAG->getMachineNode(AMDGPU::V_ADD_I32_e64, DL, VTs,
+ {AddOffsetLo, SDValue(N0Lo, 0), Clamp});
- SDNode *Addc = CurDAG->getMachineNode(
- AMDGPU::V_ADDC_U32_e64, DL, VTs,
- {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
+ SDNode *Addc = CurDAG->getMachineNode(
+ AMDGPU::V_ADDC_U32_e64, DL, VTs,
+ {AddOffsetHi, SDValue(N0Hi, 0), SDValue(Add, 1), Clamp});
- SDValue RegSequenceArgs[] = {
- CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
- SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1
- };
+ SDValue RegSequenceArgs[] = {
+ CurDAG->getTargetConstant(AMDGPU::VReg_64RegClassID, DL, MVT::i32),
+ SDValue(Add, 0), Sub0, SDValue(Addc, 0), Sub1};
- Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
- MVT::i64, RegSequenceArgs), 0);
+ Addr = SDValue(CurDAG->getMachineNode(AMDGPU::REG_SEQUENCE, DL,
+ MVT::i64, RegSequenceArgs),
+ 0);
+ }
}
}
@@ -1761,35 +1781,52 @@ bool AMDGPUDAGToDAGISel::SelectFlatAtomicSigned(SDNode *N,
bool AMDGPUDAGToDAGISel::SelectSMRDOffset(SDValue ByteOffsetNode,
SDValue &Offset, bool &Imm) const {
-
- // FIXME: Handle non-constant offsets.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(ByteOffsetNode);
- if (!C)
+ if (!C) {
+ if (ByteOffsetNode.getValueType().isScalarInteger() &&
+ ByteOffsetNode.getValueType().getSizeInBits() == 32) {
+ Offset = ByteOffsetNode;
+ Imm = false;
+ return true;
+ }
+ if (ByteOffsetNode.getOpcode() == ISD::ZERO_EXTEND) {
+ if (ByteOffsetNode.getOperand(0).getValueType().getSizeInBits() == 32) {
+ Offset = ByteOffsetNode.getOperand(0);
+ Imm = false;
+ return true;
+ }
+ }
return false;
+ }
SDLoc SL(ByteOffsetNode);
- GCNSubtarget::Generation Gen = Subtarget->getGeneration();
+ // GFX9 and GFX10 have signed byte immediate offsets.
int64_t ByteOffset = C->getSExtValue();
- int64_t EncodedOffset = AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset);
-
- if (AMDGPU::isLegalSMRDImmOffset(*Subtarget, ByteOffset)) {
- Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
+ Optional<int64_t> EncodedOffset =
+ AMDGPU::getSMRDEncodedOffset(*Subtarget, ByteOffset, false);
+ if (EncodedOffset) {
+ Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
Imm = true;
return true;
}
- if (!isUInt<32>(EncodedOffset) || !isUInt<32>(ByteOffset))
+ // SGPR and literal offsets are unsigned.
+ if (ByteOffset < 0)
return false;
- if (Gen == AMDGPUSubtarget::SEA_ISLANDS && isUInt<32>(EncodedOffset)) {
- // 32-bit Immediates are supported on Sea Islands.
- Offset = CurDAG->getTargetConstant(EncodedOffset, SL, MVT::i32);
- } else {
- SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
- Offset = SDValue(CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32,
- C32Bit), 0);
+ EncodedOffset = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget, ByteOffset);
+ if (EncodedOffset) {
+ Offset = CurDAG->getTargetConstant(*EncodedOffset, SL, MVT::i32);
+ return true;
}
- Imm = false;
+
+ if (!isUInt<32>(ByteOffset) && !isInt<32>(ByteOffset))
+ return false;
+
+ SDValue C32Bit = CurDAG->getTargetConstant(ByteOffset, SL, MVT::i32);
+ Offset = SDValue(
+ CurDAG->getMachineNode(AMDGPU::S_MOV_B32, SL, MVT::i32, C32Bit), 0);
+
return true;
}
@@ -1825,14 +1862,21 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
// A 32-bit (address + offset) should not cause unsigned 32-bit integer
// wraparound, because s_load instructions perform the addition in 64 bits.
if ((Addr.getValueType() != MVT::i32 ||
- Addr->getFlags().hasNoUnsignedWrap()) &&
- CurDAG->isBaseWithConstantOffset(Addr)) {
- SDValue N0 = Addr.getOperand(0);
- SDValue N1 = Addr.getOperand(1);
-
- if (SelectSMRDOffset(N1, Offset, Imm)) {
- SBase = Expand32BitAddress(N0);
- return true;
+ Addr->getFlags().hasNoUnsignedWrap())) {
+ SDValue N0, N1;
+ // Extract the base and offset if possible.
+ if (CurDAG->isBaseWithConstantOffset(Addr) ||
+ Addr.getOpcode() == ISD::ADD) {
+ N0 = Addr.getOperand(0);
+ N1 = Addr.getOperand(1);
+ } else if (getBaseWithOffsetUsingSplitOR(*CurDAG, Addr, N0, N1)) {
+ assert(N0 && N1 && isa<ConstantSDNode>(N1));
+ }
+ if (N0 && N1) {
+ if (SelectSMRDOffset(N1, Offset, Imm)) {
+ SBase = Expand32BitAddress(N0);
+ return true;
+ }
}
}
SBase = Expand32BitAddress(Addr);
@@ -1843,17 +1887,16 @@ bool AMDGPUDAGToDAGISel::SelectSMRD(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDImm(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm;
+ bool Imm = false;
return SelectSMRD(Addr, SBase, Offset, Imm) && Imm;
}
bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- bool Imm;
+ bool Imm = false;
if (!SelectSMRD(Addr, SBase, Offset, Imm))
return false;
@@ -1862,27 +1905,38 @@ bool AMDGPUDAGToDAGISel::SelectSMRDImm32(SDValue Addr, SDValue &SBase,
bool AMDGPUDAGToDAGISel::SelectSMRDSgpr(SDValue Addr, SDValue &SBase,
SDValue &Offset) const {
- bool Imm;
+ bool Imm = false;
return SelectSMRD(Addr, SBase, Offset, Imm) && !Imm &&
!isa<ConstantSDNode>(Offset);
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm(SDValue Addr,
SDValue &Offset) const {
- bool Imm;
- return SelectSMRDOffset(Addr, Offset, Imm) && Imm;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
+ // The immediate offset for S_BUFFER instructions is unsigned.
+ if (auto Imm =
+ AMDGPU::getSMRDEncodedOffset(*Subtarget, C->getZExtValue(), true)) {
+ Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
+
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectSMRDBufferImm32(SDValue Addr,
SDValue &Offset) const {
- if (Subtarget->getGeneration() != AMDGPUSubtarget::SEA_ISLANDS)
- return false;
+ assert(Subtarget->getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
- bool Imm;
- if (!SelectSMRDOffset(Addr, Offset, Imm))
- return false;
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Addr)) {
+ if (auto Imm = AMDGPU::getSMRDEncodedLiteralOffset32(*Subtarget,
+ C->getZExtValue())) {
+ Offset = CurDAG->getTargetConstant(*Imm, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
- return !Imm && isa<ConstantSDNode>(Offset);
+ return false;
}
bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
@@ -1898,7 +1952,9 @@ bool AMDGPUDAGToDAGISel::SelectMOVRELOffset(SDValue Index,
// (add n0, c0)
// Don't peel off the offset (c0) if doing so could possibly lead
// the base (n0) to be negative.
- if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0)) {
+ // (or n0, |c0|) can never change the sign given isBaseWithConstantOffset.
+ if (C1->getSExtValue() <= 0 || CurDAG->SignBitIsZero(N0) ||
+ (Index->getOpcode() == ISD::OR && C1->getSExtValue() >= 0)) {
Base = N0;
Offset = CurDAG->getTargetConstant(C1->getZExtValue(), DL, MVT::i32);
return true;
@@ -2066,7 +2122,7 @@ void AMDGPUDAGToDAGISel::SelectBRCOND(SDNode *N) {
bool UseSCCBr = isCBranchSCC(N) && isUniformBr(N);
unsigned BrOp = UseSCCBr ? AMDGPU::S_CBRANCH_SCC1 : AMDGPU::S_CBRANCH_VCCNZ;
- unsigned CondReg = UseSCCBr ? (unsigned)AMDGPU::SCC : TRI->getVCC();
+ Register CondReg = UseSCCBr ? AMDGPU::SCC : TRI->getVCC();
SDLoc SL(N);
if (!UseSCCBr) {
@@ -2121,7 +2177,7 @@ void AMDGPUDAGToDAGISel::SelectFMAD_FMA(SDNode *N) {
bool Sel1 = SelectVOP3PMadMixModsImpl(Src1, Src1, Src1Mods);
bool Sel2 = SelectVOP3PMadMixModsImpl(Src2, Src2, Src2Mods);
- assert((IsFMA || !Mode.FP32Denormals) &&
+ assert((IsFMA || !Mode.allFP32Denormals()) &&
"fmad selected with denormals enabled");
// TODO: We can select this with f32 denormals enabled if all the sources are
// converted from f16 (in which case fmad isn't legal).
@@ -2338,6 +2394,64 @@ void AMDGPUDAGToDAGISel::SelectDS_GWS(SDNode *N, unsigned IntrID) {
CurDAG->setNodeMemRefs(cast<MachineSDNode>(Selected), {MMO});
}
+void AMDGPUDAGToDAGISel::SelectInterpP1F16(SDNode *N) {
+ if (Subtarget->getLDSBankCount() != 16) {
+ // This is a single instruction with a pattern.
+ SelectCode(N);
+ return;
+ }
+
+ SDLoc DL(N);
+
+ // This requires 2 instructions. It is possible to write a pattern to support
+ // this, but the generated isel emitter doesn't correctly deal with multiple
+ // output instructions using the same physical register input. The copy to m0
+ // is incorrectly placed before the second instruction.
+ //
+ // TODO: Match source modifiers.
+ //
+ // def : Pat <
+ // (int_amdgcn_interp_p1_f16
+ // (VOP3Mods f32:$src0, i32:$src0_modifiers),
+ // (i32 timm:$attrchan), (i32 timm:$attr),
+ // (i1 timm:$high), M0),
+ // (V_INTERP_P1LV_F16 $src0_modifiers, VGPR_32:$src0, timm:$attr,
+ // timm:$attrchan, 0,
+ // (V_INTERP_MOV_F32 2, timm:$attr, timm:$attrchan), timm:$high)> {
+ // let Predicates = [has16BankLDS];
+ // }
+
+ // 16 bank LDS
+ SDValue ToM0 = CurDAG->getCopyToReg(CurDAG->getEntryNode(), DL, AMDGPU::M0,
+ N->getOperand(5), SDValue());
+
+ SDVTList VTs = CurDAG->getVTList(MVT::f32, MVT::Other);
+
+ SDNode *InterpMov =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_MOV_F32, DL, VTs, {
+ CurDAG->getTargetConstant(2, DL, MVT::i32), // P0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ ToM0.getValue(1) // In glue
+ });
+
+ SDNode *InterpP1LV =
+ CurDAG->getMachineNode(AMDGPU::V_INTERP_P1LV_F16, DL, MVT::f32, {
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
+ N->getOperand(1), // Src0
+ N->getOperand(3), // Attr
+ N->getOperand(2), // Attrchan
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
+ SDValue(InterpMov, 0), // Src2 - holds two f16 values selected by high
+ N->getOperand(4), // high
+ CurDAG->getTargetConstant(0, DL, MVT::i1), // $clamp
+ CurDAG->getTargetConstant(0, DL, MVT::i32), // $omod
+ SDValue(InterpMov, 1)
+ });
+
+ CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), SDValue(InterpP1LV, 0));
+}
+
void AMDGPUDAGToDAGISel::SelectINTRINSIC_W_CHAIN(SDNode *N) {
unsigned IntrID = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
switch (IntrID) {
@@ -2366,6 +2480,9 @@ void AMDGPUDAGToDAGISel::SelectINTRINSIC_WO_CHAIN(SDNode *N) {
case Intrinsic::amdgcn_wwm:
Opcode = AMDGPU::WWM;
break;
+ case Intrinsic::amdgcn_interp_p1_f16:
+ SelectInterpP1F16(N);
+ return;
default:
SelectCode(N);
return;
@@ -2428,15 +2545,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3Mods_NNaN(SDValue In, SDValue &Src,
return isNoNanSrc(Src);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3Mods_f32(SDValue In, SDValue &Src,
- SDValue &SrcMods) const {
- if (In.getValueType() == MVT::f32)
- return SelectVOP3Mods(In, Src, SrcMods);
- Src = In;
- SrcMods = CurDAG->getTargetConstant(0, SDLoc(In), MVT::i32);;
- return true;
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3NoMods(SDValue In, SDValue &Src) const {
if (In.getOpcode() == ISD::FABS || In.getOpcode() == ISD::FNEG)
return false;
@@ -2520,17 +2628,6 @@ bool AMDGPUDAGToDAGISel::SelectVOP3PMods(SDValue In, SDValue &Src,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3PMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp and op_sel
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3PMods(In, Src, SrcMods);
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
Src = In;
@@ -2539,34 +2636,12 @@ bool AMDGPUDAGToDAGISel::SelectVOP3OpSel(SDValue In, SDValue &Src,
return true;
}
-bool AMDGPUDAGToDAGISel::SelectVOP3OpSel0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3OpSel(In, Src, SrcMods);
-}
-
bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods(SDValue In, SDValue &Src,
SDValue &SrcMods) const {
// FIXME: Handle op_sel
return SelectVOP3Mods(In, Src, SrcMods);
}
-bool AMDGPUDAGToDAGISel::SelectVOP3OpSelMods0(SDValue In, SDValue &Src,
- SDValue &SrcMods,
- SDValue &Clamp) const {
- SDLoc SL(In);
-
- // FIXME: Handle clamp
- Clamp = CurDAG->getTargetConstant(0, SL, MVT::i32);
-
- return SelectVOP3OpSelMods(In, Src, SrcMods);
-}
-
// The return value is not whether the match is possible (which it always is),
// but whether or not a conversion is really used.
bool AMDGPUDAGToDAGISel::SelectVOP3PMadMixModsImpl(SDValue In, SDValue &Src,
@@ -2705,7 +2780,7 @@ bool AMDGPUDAGToDAGISel::isUniformLoad(const SDNode * N) const {
(
Subtarget->getScalarizeGlobalBehavior() &&
Ld->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS &&
- !Ld->isVolatile() &&
+ Ld->isSimple() &&
!N->isDivergent() &&
static_cast<const SITargetLowering *>(
getTargetLowering())->isMemOpHasNoClobberedMemOperand(N)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
index 23cc9404532d..940ec6f31c69 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.cpp
@@ -16,7 +16,6 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
#include "AMDGPUFrameLowering.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -38,6 +37,11 @@ using namespace llvm;
#include "AMDGPUGenCallingConv.inc"
+static cl::opt<bool> AMDGPUBypassSlowDiv(
+ "amdgpu-bypass-slow-div",
+ cl::desc("Skip 64-bit divide for dynamic 32-bit values"),
+ cl::init(true));
+
// Find a larger type to do a load / store of a vector with.
EVT AMDGPUTargetLowering::getEquivalentMemType(LLVMContext &Ctx, EVT VT) {
unsigned StoreSize = VT.getStoreSizeInBits();
@@ -103,6 +107,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::LOAD, MVT::v2f64, Promote);
AddPromotedToType(ISD::LOAD, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::LOAD, MVT::v4i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4i64, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v4f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v4f64, MVT::v8i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8i64, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::v8f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v8f64, MVT::v16i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16i64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16i64, MVT::v32i32);
+
+ setOperationAction(ISD::LOAD, MVT::v16f64, Promote);
+ AddPromotedToType(ISD::LOAD, MVT::v16f64, MVT::v32i32);
+
// There are no 64-bit extloads. These should be done as a 32-bit extload and
// an extension to 64-bit.
for (MVT VT : MVT::integer_valuetypes()) {
@@ -161,11 +183,13 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f32, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f32, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v2f64, MVT::v2f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v4f64, MVT::v4f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::v8f64, MVT::v8f16, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::v16f64, MVT::v16f16, Expand);
setOperationAction(ISD::STORE, MVT::f32, Promote);
AddPromotedToType(ISD::STORE, MVT::f32, MVT::i32);
@@ -203,6 +227,24 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::STORE, MVT::v2f64, Promote);
AddPromotedToType(ISD::STORE, MVT::v2f64, MVT::v4i32);
+ setOperationAction(ISD::STORE, MVT::v4i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4i64, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v4f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v4f64, MVT::v8i32);
+
+ setOperationAction(ISD::STORE, MVT::v8i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8i64, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v8f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v8f64, MVT::v16i32);
+
+ setOperationAction(ISD::STORE, MVT::v16i64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16i64, MVT::v32i32);
+
+ setOperationAction(ISD::STORE, MVT::v16f64, Promote);
+ AddPromotedToType(ISD::STORE, MVT::v16f64, MVT::v32i32);
+
setTruncStoreAction(MVT::i64, MVT::i1, Expand);
setTruncStoreAction(MVT::i64, MVT::i8, Expand);
setTruncStoreAction(MVT::i64, MVT::i16, Expand);
@@ -227,12 +269,21 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v2f64, MVT::v2f32, Expand);
setTruncStoreAction(MVT::v2f64, MVT::v2f16, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i32, Expand);
+ setTruncStoreAction(MVT::v4i64, MVT::v4i16, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f32, Expand);
setTruncStoreAction(MVT::v4f64, MVT::v4f16, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f32, Expand);
setTruncStoreAction(MVT::v8f64, MVT::v8f16, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16f32, Expand);
+ setTruncStoreAction(MVT::v16f64, MVT::v16f16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i16, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i1, Expand);
setOperationAction(ISD::Constant, MVT::i32, Legal);
setOperationAction(ISD::Constant, MVT::i64, Legal);
@@ -297,6 +348,14 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32f32, Custom);
setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v32i32, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v2i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v4i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v8i64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16f64, Custom);
+ setOperationAction(ISD::EXTRACT_SUBVECTOR, MVT::v16i64, Custom);
setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f64, Custom);
@@ -329,6 +388,9 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SUBE, VT, Legal);
}
+ // The hardware supports 32-bit FSHR, but not FSHL.
+ setOperationAction(ISD::FSHR, MVT::i32, Legal);
+
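For reference, ISD::FSHR on i32 is the funnel shift the comment above refers to: the 64-bit concatenation of the two operands is shifted right by the shift amount modulo 32 and the low 32 bits are kept. A minimal host-side sketch (the helper name fshr32 is illustrative, not part of the code):

#include <cstdint>

// fshr(a, b, s): shift the 64-bit value a:b right by s (mod 32), keep the low 32 bits.
uint32_t fshr32(uint32_t a, uint32_t b, uint32_t s) {
  s &= 31;
  return s ? (b >> s) | (a << (32 - s)) : b;
}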
// The hardware supports 32-bit ROTR, but not ROTL.
setOperationAction(ISD::ROTL, MVT::i32, Expand);
setOperationAction(ISD::ROTL, MVT::i64, Expand);
@@ -381,7 +443,7 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SMUL_LOHI, VT, Expand);
setOperationAction(ISD::UMUL_LOHI, VT, Expand);
- setOperationAction(ISD::SDIVREM, VT, Custom);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::UDIVREM, VT, Expand);
setOperationAction(ISD::SELECT, VT, Expand);
setOperationAction(ISD::VSELECT, VT, Expand);
@@ -483,6 +545,10 @@ AMDGPUTargetLowering::AMDGPUTargetLowering(const TargetMachine &TM,
MaxStoresPerMemmove = 0xffffffff;
MaxStoresPerMemset = 0xffffffff;
+ // The expansion for 64-bit division is enormous.
+ if (AMDGPUBypassSlowDiv)
+ addBypassSlowDiv(64, 32);
+
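Conceptually, addBypassSlowDiv(64, 32) asks the generic BypassSlowDivision transform to guard each 64-bit division with a runtime check and take a 32-bit path when both operands fit, so the enormous 64-bit expansion only runs in the uncommon case. A rough sketch of the resulting control flow (illustrative C++, not the actual emitted IR):

#include <cstdint>

uint64_t udiv64_bypassed(uint64_t a, uint64_t b) {
  if (((a | b) >> 32) == 0)              // both operands fit in 32 bits
    return uint32_t(a) / uint32_t(b);    // cheap 32-bit divide
  return a / b;                          // full 64-bit expansion
}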
setTargetDAGCombine(ISD::BITCAST);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRA);
@@ -609,6 +675,17 @@ bool AMDGPUTargetLowering::allUsesHaveSourceMods(const SDNode *N,
return true;
}
+EVT AMDGPUTargetLowering::getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const {
+ assert(!VT.isVector() && "only scalar expected");
+
+ // Round up to the next multiple of 32 bits.
+ unsigned Size = VT.getSizeInBits();
+ if (Size <= 32)
+ return MVT::i32;
+ return EVT::getIntegerVT(Context, 32 * ((Size + 31) / 32));
+}
+
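The rounding in getTypeForExtReturn above widens any scalar return to the next 32-bit boundary: i1/i8/i16 become i32, i48 becomes i64, i96 stays i96. The arithmetic can be checked in isolation (helper name is illustrative):

constexpr unsigned roundToDwords(unsigned SizeInBits) {
  return SizeInBits <= 32 ? 32 : 32 * ((SizeInBits + 31) / 32);
}
static_assert(roundToDwords(1) == 32 && roundToDwords(48) == 64 &&
              roundToDwords(96) == 96, "rounds up to a multiple of 32 bits");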
MVT AMDGPUTargetLowering::getVectorIdxTy(const DataLayout &) const {
return MVT::i32;
}
@@ -641,8 +718,9 @@ bool AMDGPUTargetLowering::shouldReduceLoadWidth(SDNode *N,
unsigned NewSize = NewVT.getStoreSizeInBits();
- // If we are reducing to a 32-bit load, this is always better.
- if (NewSize == 32)
+ // If we are reducing to a 32-bit load or a smaller multi-dword load,
+ // this is always better.
+ if (NewSize >= 32)
return true;
EVT OldVT = N->getValueType(0);
@@ -733,6 +811,26 @@ bool AMDGPUTargetLowering::isSDNodeAlwaysUniform(const SDNode * N) const {
}
}
+SDValue AMDGPUTargetLowering::getNegatedExpression(
+ SDValue Op, SelectionDAG &DAG, bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost, unsigned Depth) const {
+
+ switch (Op.getOpcode()) {
+ case ISD::FMA:
+ case ISD::FMAD: {
+ // Negating an fma is not free if it has users without source mods.
+ if (!allUsesHaveSourceMods(Op.getNode()))
+ return SDValue();
+ break;
+ }
+ default:
+ break;
+ }
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
+ ForCodeSize, Cost, Depth);
+}
+
//===---------------------------------------------------------------------===//
// Target Properties
//===---------------------------------------------------------------------===//
@@ -912,7 +1010,7 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
const unsigned ExplicitOffset = ST.getExplicitKernelArgOffset(Fn);
CallingConv::ID CC = Fn.getCallingConv();
- unsigned MaxAlign = 1;
+ Align MaxAlign = Align(1);
uint64_t ExplicitArgOffset = 0;
const DataLayout &DL = Fn.getParent()->getDataLayout();
@@ -920,12 +1018,12 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
for (const Argument &Arg : Fn.args()) {
Type *BaseArgTy = Arg.getType();
- unsigned Align = DL.getABITypeAlignment(BaseArgTy);
- MaxAlign = std::max(Align, MaxAlign);
+ Align Alignment = DL.getABITypeAlign(BaseArgTy);
+ MaxAlign = std::max(Alignment, MaxAlign);
unsigned AllocSize = DL.getTypeAllocSize(BaseArgTy);
- uint64_t ArgOffset = alignTo(ExplicitArgOffset, Align) + ExplicitOffset;
- ExplicitArgOffset = alignTo(ExplicitArgOffset, Align) + AllocSize;
+ uint64_t ArgOffset = alignTo(ExplicitArgOffset, Alignment) + ExplicitOffset;
+ ExplicitArgOffset = alignTo(ExplicitArgOffset, Alignment) + AllocSize;
// We're basically throwing away everything passed into us and starting over
// to get accurate in-memory offsets. The "PartOffset" is completely useless
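The offset bookkeeping in this loop is "align, place, advance": each argument is placed at the running offset rounded up to its ABI alignment, and the running offset then advances by the argument's allocation size. A tiny worked sketch of that arithmetic (the alignToRef helper is an illustrative stand-in for alignTo):

#include <cstdint>

constexpr uint64_t alignToRef(uint64_t Offset, uint64_t A) {
  return (Offset + A - 1) / A * A;            // round up to a multiple of A
}
// e.g. a 4-byte argument with ABI alignment 8 arriving at running offset 13:
static_assert(alignToRef(13, 8) == 16, "placed at byte 16");
static_assert(alignToRef(13, 8) + 4 == 20, "running offset advances to 20");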
@@ -999,6 +1097,8 @@ void AMDGPUTargetLowering::analyzeFormalArgumentsCompute(
assert(MemVT.getVectorNumElements() == 3 ||
MemVT.getVectorNumElements() == 5);
MemVT = MemVT.getPow2VectorType(State.getContext());
+ } else if (!MemVT.isSimple() && !MemVT.isVector()) {
+ MemVT = MemVT.getRoundIntegerType(State.getContext());
}
unsigned PartOffset = 0;
@@ -1140,7 +1240,7 @@ SDValue AMDGPUTargetLowering::LowerOperation(SDValue Op,
case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FFLOOR: return LowerFFLOOR(Op, DAG);
case ISD::FLOG:
- return LowerFLOG(Op, DAG, 1.0F / numbers::log2ef);
+ return LowerFLOG(Op, DAG, numbers::ln2f);
case ISD::FLOG10:
return LowerFLOG(Op, DAG, numbers::ln2f / numbers::ln10f);
case ISD::FEXP:
@@ -1196,10 +1296,23 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
if (G->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS ||
G->getAddressSpace() == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isEntryFunction()) {
+ SDLoc DL(Op);
const Function &Fn = DAG.getMachineFunction().getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
- Fn, "local memory global used by non-kernel function", SDLoc(Op).getDebugLoc());
+ Fn, "local memory global used by non-kernel function",
+ DL.getDebugLoc(), DS_Warning);
DAG.getContext()->diagnose(BadLDSDecl);
+
+ // We currently don't have a way to correctly allocate LDS objects that
+ // aren't directly associated with a kernel. We do force inlining of
+ // functions that use local objects. However, if these dead functions are
+ // not eliminated, we don't want a compile time error. Just emit a warning
+ // and a trap, since there should be no callable path here.
+ SDValue Trap = DAG.getNode(ISD::TRAP, DL, MVT::Other, DAG.getEntryNode());
+ SDValue OutputChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
+ Trap, DAG.getRoot());
+ DAG.setRoot(OutputChain);
+ return DAG.getUNDEF(Op.getValueType());
}
// XXX: What does the value of G->getOffset() mean?
@@ -1208,7 +1321,7 @@ SDValue AMDGPUTargetLowering::LowerGlobalAddress(AMDGPUMachineFunction* MFI,
// TODO: We could emit code to handle the initialization somewhere.
if (!hasDefinedInitializer(GV)) {
- unsigned Offset = MFI->allocateLDSGlobal(DL, *GV);
+ unsigned Offset = MFI->allocateLDSGlobal(DL, *cast<GlobalVariable>(GV));
return DAG.getConstant(Offset, SDLoc(Op), Op.getValueType());
}
}
@@ -1383,12 +1496,11 @@ AMDGPUTargetLowering::splitVector(const SDValue &N, const SDLoc &DL,
(HiVT.isVector() ? HiVT.getVectorNumElements() : 1) <=
N.getValueType().getVectorNumElements() &&
"More vector elements requested than available!");
- auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
SDValue Lo = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, LoVT, N,
- DAG.getConstant(0, DL, IdxTy));
+ DAG.getVectorIdxConstant(0, DL));
SDValue Hi = DAG.getNode(
HiVT.isVector() ? ISD::EXTRACT_SUBVECTOR : ISD::EXTRACT_VECTOR_ELT, DL,
- HiVT, N, DAG.getConstant(LoVT.getVectorNumElements(), DL, IdxTy));
+ HiVT, N, DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), DL));
return std::make_pair(Lo, Hi);
}
@@ -1433,18 +1545,17 @@ SDValue AMDGPUTargetLowering::SplitVectorLoad(const SDValue Op,
HiPtr, SrcValue.getWithOffset(LoMemVT.getStoreSize()),
HiMemVT, HiAlign, Load->getMemOperand()->getFlags());
- auto IdxTy = getVectorIdxTy(DAG.getDataLayout());
SDValue Join;
if (LoVT == HiVT) {
// This is the case that the vector is power of two so was evenly split.
Join = DAG.getNode(ISD::CONCAT_VECTORS, SL, VT, LoLoad, HiLoad);
} else {
Join = DAG.getNode(ISD::INSERT_SUBVECTOR, SL, VT, DAG.getUNDEF(VT), LoLoad,
- DAG.getConstant(0, SL, IdxTy));
- Join = DAG.getNode(HiVT.isVector() ? ISD::INSERT_SUBVECTOR
- : ISD::INSERT_VECTOR_ELT,
- SL, VT, Join, HiLoad,
- DAG.getConstant(LoVT.getVectorNumElements(), SL, IdxTy));
+ DAG.getVectorIdxConstant(0, SL));
+ Join = DAG.getNode(
+ HiVT.isVector() ? ISD::INSERT_SUBVECTOR : ISD::INSERT_VECTOR_ELT, SL,
+ VT, Join, HiLoad,
+ DAG.getVectorIdxConstant(LoVT.getVectorNumElements(), SL));
}
SDValue Ops[] = {Join, DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
@@ -1474,7 +1585,7 @@ SDValue AMDGPUTargetLowering::WidenVectorLoad(SDValue Op,
WideMemVT, BaseAlign, Load->getMemOperand()->getFlags());
return DAG.getMergeValues(
{DAG.getNode(ISD::EXTRACT_SUBVECTOR, SL, VT, WideLoad,
- DAG.getConstant(0, SL, getVectorIdxTy(DAG.getDataLayout()))),
+ DAG.getVectorIdxConstant(0, SL)),
WideLoad.getValue(1)},
SL);
}
@@ -1588,9 +1699,11 @@ SDValue AMDGPUTargetLowering::LowerDIVREM24(SDValue Op, SelectionDAG &DAG,
const AMDGPUMachineFunction *MFI = MF.getInfo<AMDGPUMachineFunction>();
// float fr = mad(fqneg, fb, fa);
- unsigned OpCode = MFI->getMode().FP32Denormals ?
- (unsigned)AMDGPUISD::FMAD_FTZ :
- (unsigned)ISD::FMAD;
+ unsigned OpCode = !Subtarget->hasMadMacF32Insts() ?
+ (unsigned)ISD::FMA :
+ !MFI->getMode().allFP32Denormals() ?
+ (unsigned)ISD::FMAD :
+ (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue fr = DAG.getNode(OpCode, DL, FltVT, fqneg, fb, fa);
// int iq = (int)fq;
@@ -1673,9 +1786,11 @@ void AMDGPUTargetLowering::LowerUDIVREM64(SDValue Op,
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// Compute denominator reciprocal.
- unsigned FMAD = MFI->getMode().FP32Denormals ?
- (unsigned)AMDGPUISD::FMAD_FTZ :
- (unsigned)ISD::FMAD;
+ unsigned FMAD = !Subtarget->hasMadMacF32Insts() ?
+ (unsigned)ISD::FMA :
+ !MFI->getMode().allFP32Denormals() ?
+ (unsigned)ISD::FMAD :
+ (unsigned)AMDGPUISD::FMAD_FTZ;
SDValue Cvt_Lo = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Lo);
SDValue Cvt_Hi = DAG.getNode(ISD::UINT_TO_FP, DL, MVT::f32, RHS_Hi);
@@ -1861,103 +1976,43 @@ SDValue AMDGPUTargetLowering::LowerUDIVREM(SDValue Op,
return Res;
}
- SDValue Num = Op.getOperand(0);
- SDValue Den = Op.getOperand(1);
-
- // RCP = URECIP(Den) = 2^32 / Den + e
- // e is rounding error.
- SDValue RCP = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Den);
-
- // RCP_LO = mul(RCP, Den) */
- SDValue RCP_LO = DAG.getNode(ISD::MUL, DL, VT, RCP, Den);
-
- // RCP_HI = mulhu (RCP, Den) */
- SDValue RCP_HI = DAG.getNode(ISD::MULHU, DL, VT, RCP, Den);
-
- // NEG_RCP_LO = -RCP_LO
- SDValue NEG_RCP_LO = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT),
- RCP_LO);
-
- // ABS_RCP_LO = (RCP_HI == 0 ? NEG_RCP_LO : RCP_LO)
- SDValue ABS_RCP_LO = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
- NEG_RCP_LO, RCP_LO,
- ISD::SETEQ);
- // Calculate the rounding error from the URECIP instruction
- // E = mulhu(ABS_RCP_LO, RCP)
- SDValue E = DAG.getNode(ISD::MULHU, DL, VT, ABS_RCP_LO, RCP);
-
- // RCP_A_E = RCP + E
- SDValue RCP_A_E = DAG.getNode(ISD::ADD, DL, VT, RCP, E);
-
- // RCP_S_E = RCP - E
- SDValue RCP_S_E = DAG.getNode(ISD::SUB, DL, VT, RCP, E);
-
- // Tmp0 = (RCP_HI == 0 ? RCP_A_E : RCP_SUB_E)
- SDValue Tmp0 = DAG.getSelectCC(DL, RCP_HI, DAG.getConstant(0, DL, VT),
- RCP_A_E, RCP_S_E,
- ISD::SETEQ);
- // Quotient = mulhu(Tmp0, Num)
- SDValue Quotient = DAG.getNode(ISD::MULHU, DL, VT, Tmp0, Num);
-
- // Num_S_Remainder = Quotient * Den
- SDValue Num_S_Remainder = DAG.getNode(ISD::MUL, DL, VT, Quotient, Den);
-
- // Remainder = Num - Num_S_Remainder
- SDValue Remainder = DAG.getNode(ISD::SUB, DL, VT, Num, Num_S_Remainder);
-
- // Remainder_GE_Den = (Remainder >= Den ? -1 : 0)
- SDValue Remainder_GE_Den = DAG.getSelectCC(DL, Remainder, Den,
- DAG.getConstant(-1, DL, VT),
- DAG.getConstant(0, DL, VT),
- ISD::SETUGE);
- // Remainder_GE_Zero = (Num >= Num_S_Remainder ? -1 : 0)
- SDValue Remainder_GE_Zero = DAG.getSelectCC(DL, Num,
- Num_S_Remainder,
- DAG.getConstant(-1, DL, VT),
- DAG.getConstant(0, DL, VT),
- ISD::SETUGE);
- // Tmp1 = Remainder_GE_Den & Remainder_GE_Zero
- SDValue Tmp1 = DAG.getNode(ISD::AND, DL, VT, Remainder_GE_Den,
- Remainder_GE_Zero);
-
- // Calculate Division result:
-
- // Quotient_A_One = Quotient + 1
- SDValue Quotient_A_One = DAG.getNode(ISD::ADD, DL, VT, Quotient,
- DAG.getConstant(1, DL, VT));
-
- // Quotient_S_One = Quotient - 1
- SDValue Quotient_S_One = DAG.getNode(ISD::SUB, DL, VT, Quotient,
- DAG.getConstant(1, DL, VT));
-
- // Div = (Tmp1 == 0 ? Quotient : Quotient_A_One)
- SDValue Div = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
- Quotient, Quotient_A_One, ISD::SETEQ);
-
- // Div = (Remainder_GE_Zero == 0 ? Quotient_S_One : Div)
- Div = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
- Quotient_S_One, Div, ISD::SETEQ);
-
- // Calculate Rem result:
-
- // Remainder_S_Den = Remainder - Den
- SDValue Remainder_S_Den = DAG.getNode(ISD::SUB, DL, VT, Remainder, Den);
-
- // Remainder_A_Den = Remainder + Den
- SDValue Remainder_A_Den = DAG.getNode(ISD::ADD, DL, VT, Remainder, Den);
-
- // Rem = (Tmp1 == 0 ? Remainder : Remainder_S_Den)
- SDValue Rem = DAG.getSelectCC(DL, Tmp1, DAG.getConstant(0, DL, VT),
- Remainder, Remainder_S_Den, ISD::SETEQ);
-
- // Rem = (Remainder_GE_Zero == 0 ? Remainder_A_Den : Rem)
- Rem = DAG.getSelectCC(DL, Remainder_GE_Zero, DAG.getConstant(0, DL, VT),
- Remainder_A_Den, Rem, ISD::SETEQ);
- SDValue Ops[2] = {
- Div,
- Rem
- };
- return DAG.getMergeValues(Ops, DL);
+ SDValue X = Op.getOperand(0);
+ SDValue Y = Op.getOperand(1);
+
+ // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
+ // algorithm used here.
+
+ // Initial estimate of inv(y).
+ SDValue Z = DAG.getNode(AMDGPUISD::URECIP, DL, VT, Y);
+
+ // One round of UNR.
+ SDValue NegY = DAG.getNode(ISD::SUB, DL, VT, DAG.getConstant(0, DL, VT), Y);
+ SDValue NegYZ = DAG.getNode(ISD::MUL, DL, VT, NegY, Z);
+ Z = DAG.getNode(ISD::ADD, DL, VT, Z,
+ DAG.getNode(ISD::MULHU, DL, VT, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ SDValue Q = DAG.getNode(ISD::MULHU, DL, VT, X, Z);
+ SDValue R =
+ DAG.getNode(ISD::SUB, DL, VT, X, DAG.getNode(ISD::MUL, DL, VT, Q, Y));
+
+ // First quotient/remainder refinement.
+ EVT CCVT = getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
+ SDValue One = DAG.getConstant(1, DL, VT);
+ SDValue Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+ Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+ R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = DAG.getSetCC(DL, CCVT, R, Y, ISD::SETUGE);
+ Q = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::ADD, DL, VT, Q, One), Q);
+ R = DAG.getNode(ISD::SELECT, DL, VT, Cond,
+ DAG.getNode(ISD::SUB, DL, VT, R, Y), R);
+
+ return DAG.getMergeValues({Q, R}, DL);
}
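The new expansion above follows the same scheme as AMDGPUCodeGenPrepare::expandDivRem32: one Newton-Raphson round on a reciprocal estimate of 2^32/y, a quotient/remainder estimate, then two conditional fix-ups. A host-side sketch, assuming the URECIP estimate behaves roughly like floor(2^32 / y) (the real instruction has its own error bound; names here are illustrative):

#include <cstdint>
#include <utility>

static inline uint32_t mulhu(uint32_t a, uint32_t b) {
  return uint32_t((uint64_t(a) * b) >> 32);    // high 32 bits of the product
}

std::pair<uint32_t, uint32_t> udivrem32(uint32_t x, uint32_t y) {
  // Initial estimate of 2^32 / y, clamped to 32 bits (URECIP stand-in; y != 0).
  uint64_t Est = 0x100000000ull / y;
  uint32_t z = Est > 0xffffffffull ? 0xffffffffu : uint32_t(Est);

  // One Newton-Raphson round: z += mulhu(z, -y * z).
  z += mulhu(z, uint32_t(0u - y) * z);

  // Quotient/remainder estimate.
  uint32_t q = mulhu(x, z);
  uint32_t r = x - q * y;

  // Two refinement steps.
  if (r >= y) { ++q; r -= y; }
  if (r >= y) { ++q; r -= y; }
  return {q, r};
}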
SDValue AMDGPUTargetLowering::LowerSDIVREM(SDValue Op,
@@ -2164,8 +2219,7 @@ SDValue AMDGPUTargetLowering::LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) con
// Don't handle v2f16. The extra instructions to scalarize and repack around the
// compare and vselect end up producing worse code than scalarizing the whole
// operation.
-SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
- SelectionDAG &DAG) const {
+SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue X = Op.getOperand(0);
EVT VT = Op.getValueType();
@@ -2194,75 +2248,6 @@ SDValue AMDGPUTargetLowering::LowerFROUND_LegalFTRUNC(SDValue Op,
return DAG.getNode(ISD::FADD, SL, VT, T, Sel);
}
-SDValue AMDGPUTargetLowering::LowerFROUND64(SDValue Op, SelectionDAG &DAG) const {
- SDLoc SL(Op);
- SDValue X = Op.getOperand(0);
-
- SDValue L = DAG.getNode(ISD::BITCAST, SL, MVT::i64, X);
-
- const SDValue Zero = DAG.getConstant(0, SL, MVT::i32);
- const SDValue One = DAG.getConstant(1, SL, MVT::i32);
- const SDValue NegOne = DAG.getConstant(-1, SL, MVT::i32);
- const SDValue FiftyOne = DAG.getConstant(51, SL, MVT::i32);
- EVT SetCCVT =
- getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i32);
-
- SDValue BC = DAG.getNode(ISD::BITCAST, SL, MVT::v2i32, X);
-
- SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, MVT::i32, BC, One);
-
- SDValue Exp = extractF64Exponent(Hi, SL, DAG);
-
- const SDValue Mask = DAG.getConstant(INT64_C(0x000fffffffffffff), SL,
- MVT::i64);
-
- SDValue M = DAG.getNode(ISD::SRA, SL, MVT::i64, Mask, Exp);
- SDValue D = DAG.getNode(ISD::SRA, SL, MVT::i64,
- DAG.getConstant(INT64_C(0x0008000000000000), SL,
- MVT::i64),
- Exp);
-
- SDValue Tmp0 = DAG.getNode(ISD::AND, SL, MVT::i64, L, M);
- SDValue Tmp1 = DAG.getSetCC(SL, SetCCVT,
- DAG.getConstant(0, SL, MVT::i64), Tmp0,
- ISD::SETNE);
-
- SDValue Tmp2 = DAG.getNode(ISD::SELECT, SL, MVT::i64, Tmp1,
- D, DAG.getConstant(0, SL, MVT::i64));
- SDValue K = DAG.getNode(ISD::ADD, SL, MVT::i64, L, Tmp2);
-
- K = DAG.getNode(ISD::AND, SL, MVT::i64, K, DAG.getNOT(SL, M, MVT::i64));
- K = DAG.getNode(ISD::BITCAST, SL, MVT::f64, K);
-
- SDValue ExpLt0 = DAG.getSetCC(SL, SetCCVT, Exp, Zero, ISD::SETLT);
- SDValue ExpGt51 = DAG.getSetCC(SL, SetCCVT, Exp, FiftyOne, ISD::SETGT);
- SDValue ExpEqNegOne = DAG.getSetCC(SL, SetCCVT, NegOne, Exp, ISD::SETEQ);
-
- SDValue Mag = DAG.getNode(ISD::SELECT, SL, MVT::f64,
- ExpEqNegOne,
- DAG.getConstantFP(1.0, SL, MVT::f64),
- DAG.getConstantFP(0.0, SL, MVT::f64));
-
- SDValue S = DAG.getNode(ISD::FCOPYSIGN, SL, MVT::f64, Mag, X);
-
- K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpLt0, S, K);
- K = DAG.getNode(ISD::SELECT, SL, MVT::f64, ExpGt51, X, K);
-
- return K;
-}
-
-SDValue AMDGPUTargetLowering::LowerFROUND(SDValue Op, SelectionDAG &DAG) const {
- EVT VT = Op.getValueType();
-
- if (isOperationLegal(ISD::FTRUNC, VT))
- return LowerFROUND_LegalFTRUNC(Op, DAG);
-
- if (VT == MVT::f64)
- return LowerFROUND64(Op, DAG);
-
- llvm_unreachable("unhandled type");
-}
-
SDValue AMDGPUTargetLowering::LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Src = Op.getOperand(0);
@@ -2793,6 +2778,7 @@ static bool isI24(SDValue Op, SelectionDAG &DAG) {
static SDValue simplifyI24(SDNode *Node24,
TargetLowering::DAGCombinerInfo &DCI) {
SelectionDAG &DAG = DCI.DAG;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
bool IsIntrin = Node24->getOpcode() == ISD::INTRINSIC_WO_CHAIN;
SDValue LHS = IsIntrin ? Node24->getOperand(1) : Node24->getOperand(0);
@@ -2806,11 +2792,11 @@ static SDValue simplifyI24(SDNode *Node24,
APInt Demanded = APInt::getLowBitsSet(LHS.getValueSizeInBits(), 24);
- // First try to simplify using GetDemandedBits which allows the operands to
- // have other uses, but will only perform simplifications that involve
- // bypassing some nodes for this user.
- SDValue DemandedLHS = DAG.GetDemandedBits(LHS, Demanded);
- SDValue DemandedRHS = DAG.GetDemandedBits(RHS, Demanded);
+ // First try to simplify using SimplifyMultipleUseDemandedBits which allows
+ // the operands to have other uses, but will only perform simplifications that
+ // involve bypassing some nodes for this user.
+ SDValue DemandedLHS = TLI.SimplifyMultipleUseDemandedBits(LHS, Demanded, DAG);
+ SDValue DemandedRHS = TLI.SimplifyMultipleUseDemandedBits(RHS, Demanded, DAG);
if (DemandedLHS || DemandedRHS)
return DAG.getNode(NewOpcode, SDLoc(Node24), Node24->getVTList(),
DemandedLHS ? DemandedLHS : LHS,
@@ -2818,7 +2804,6 @@ static SDValue simplifyI24(SDNode *Node24,
// Now try SimplifyDemandedBits which can simplify the nodes used by our
// operands if this node is the only user.
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
if (TLI.SimplifyDemandedBits(LHS, Demanded, DCI))
return SDValue(Node24, 0);
if (TLI.SimplifyDemandedBits(RHS, Demanded, DCI))
@@ -2877,7 +2862,7 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
return SDValue();
LoadSDNode *LN = cast<LoadSDNode>(N);
- if (LN->isVolatile() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
+ if (!LN->isSimple() || !ISD::isNormalLoad(LN) || hasVolatileUser(LN))
return SDValue();
SDLoc SL(N);
@@ -2885,16 +2870,17 @@ SDValue AMDGPUTargetLowering::performLoadCombine(SDNode *N,
EVT VT = LN->getMemoryVT();
unsigned Size = VT.getStoreSize();
- unsigned Align = LN->getAlignment();
- if (Align < Size && isTypeLegal(VT)) {
+ Align Alignment = LN->getAlign();
+ if (Alignment < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = LN->getAddressSpace();
// Expand unaligned loads earlier than legalization. Due to visitation order
// problems during legalization, the emitted instructions to pack and unpack
// the bytes again are not eliminated in the case of an unaligned copy.
- if (!allowsMisalignedMemoryAccesses(
- VT, AS, Align, LN->getMemOperand()->getFlags(), &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+ LN->getMemOperand()->getFlags(),
+ &IsFast)) {
SDValue Ops[2];
if (VT.isVector())
@@ -2931,7 +2917,7 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
return SDValue();
StoreSDNode *SN = cast<StoreSDNode>(N);
- if (SN->isVolatile() || !ISD::isNormalStore(SN))
+ if (!SN->isSimple() || !ISD::isNormalStore(SN))
return SDValue();
EVT VT = SN->getMemoryVT();
@@ -2939,8 +2925,8 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
SDLoc SL(N);
SelectionDAG &DAG = DCI.DAG;
- unsigned Align = SN->getAlignment();
- if (Align < Size && isTypeLegal(VT)) {
+ Align Alignment = SN->getAlign();
+ if (Alignment < Size && isTypeLegal(VT)) {
bool IsFast;
unsigned AS = SN->getAddressSpace();
@@ -2948,8 +2934,9 @@ SDValue AMDGPUTargetLowering::performStoreCombine(SDNode *N,
// order problems during legalization, the emitted instructions to pack and
// unpack the bytes again are not eliminated in the case of an unaligned
// copy.
- if (!allowsMisalignedMemoryAccesses(
- VT, AS, Align, SN->getMemOperand()->getFlags(), &IsFast)) {
+ if (!allowsMisalignedMemoryAccesses(VT, AS, Alignment.value(),
+ SN->getMemOperand()->getFlags(),
+ &IsFast)) {
if (VT.isVector())
return scalarizeVectorStore(SN, DAG);
@@ -3012,6 +2999,16 @@ SDValue AMDGPUTargetLowering::performIntrinsicWOChainCombine(
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_mul_u24:
return simplifyI24(N, DCI);
+ case Intrinsic::amdgcn_fract:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_ldexp: {
+ // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted.
+ SDValue Src = N->getOperand(1);
+ return Src.isUndef() ? Src : SDValue();
+ }
default:
return SDValue();
}
@@ -3465,24 +3462,24 @@ SDValue AMDGPUTargetLowering::performCtlz_CttzCombine(const SDLoc &SL, SDValue C
ISD::CondCode CCOpcode = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
SDValue CmpLHS = Cond.getOperand(0);
- unsigned Opc = isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 :
- AMDGPUISD::FFBH_U32;
-
// select (setcc x, 0, eq), -1, (ctlz_zero_undef x) -> ffbh_u32 x
// select (setcc x, 0, eq), -1, (cttz_zero_undef x) -> ffbl_u32 x
if (CCOpcode == ISD::SETEQ &&
(isCtlzOpc(RHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- RHS.getOperand(0) == CmpLHS &&
- isNegativeOne(LHS)) {
+ RHS.getOperand(0) == CmpLHS && isNegativeOne(LHS)) {
+ unsigned Opc =
+ isCttzOpc(RHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
// select (setcc x, 0, ne), (ctlz_zero_undef x), -1 -> ffbh_u32 x
// select (setcc x, 0, ne), (cttz_zero_undef x), -1 -> ffbl_u32 x
if (CCOpcode == ISD::SETNE &&
- (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(RHS.getOpcode())) &&
- LHS.getOperand(0) == CmpLHS &&
- isNegativeOne(RHS)) {
+ (isCtlzOpc(LHS.getOpcode()) || isCttzOpc(LHS.getOpcode())) &&
+ LHS.getOperand(0) == CmpLHS && isNegativeOne(RHS)) {
+ unsigned Opc =
+ isCttzOpc(LHS.getOpcode()) ? AMDGPUISD::FFBL_B32 : AMDGPUISD::FFBH_U32;
+
return getFFBX_U32(DAG, CmpLHS, SL, Opc);
}
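Both folds rely on the -1-on-zero behaviour of the target count instructions: as I understand them, FFBH_U32/FFBL_B32 return all ones for a zero input, which is exactly the value the select supplies for the x == 0 case that ctlz_zero_undef/cttz_zero_undef leave undefined. A host-side reference sketch (names illustrative):

#include <cstdint>

// ffbh_u32: count leading zeros, -1 (all ones) when the input is zero.
uint32_t ffbh_u32_ref(uint32_t x) {
  return x == 0 ? 0xffffffffu : uint32_t(__builtin_clz(x));
}
// ffbl_b32: count trailing zeros, -1 when the input is zero.
uint32_t ffbl_b32_ref(uint32_t x) {
  return x == 0 ? 0xffffffffu : uint32_t(__builtin_ctz(x));
}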
@@ -4117,12 +4114,12 @@ SDValue AMDGPUTargetLowering::PerformDAGCombine(SDNode *N,
SDValue AMDGPUTargetLowering::CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT,
+ Register Reg, EVT VT,
const SDLoc &SL,
bool RawReg) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned VReg;
+ Register VReg;
if (!MRI.isLiveIn(Reg)) {
VReg = MRI.createVirtualRegister(RC);
@@ -4266,11 +4263,9 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(DIV_FMAS)
NODE_NAME_CASE(DIV_FIXUP)
NODE_NAME_CASE(FMAD_FTZ)
- NODE_NAME_CASE(TRIG_PREOP)
NODE_NAME_CASE(RCP)
NODE_NAME_CASE(RSQ)
NODE_NAME_CASE(RCP_LEGACY)
- NODE_NAME_CASE(RSQ_LEGACY)
NODE_NAME_CASE(RCP_IFLAG)
NODE_NAME_CASE(FMUL_LEGACY)
NODE_NAME_CASE(RSQ_CLAMP)
@@ -4298,8 +4293,6 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(MAD_U64_U32)
NODE_NAME_CASE(PERM)
NODE_NAME_CASE(TEXTURE_FETCH)
- NODE_NAME_CASE(EXPORT)
- NODE_NAME_CASE(EXPORT_DONE)
NODE_NAME_CASE(R600_EXPORT)
NODE_NAME_CASE(CONST_ADDRESS)
NODE_NAME_CASE(REGISTER_LOAD)
@@ -4323,12 +4316,8 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(CONST_DATA_PTR)
NODE_NAME_CASE(PC_ADD_REL_OFFSET)
NODE_NAME_CASE(LDS)
- NODE_NAME_CASE(KILL)
NODE_NAME_CASE(DUMMY_CHAIN)
case AMDGPUISD::FIRST_MEM_OPCODE_NUMBER: break;
- NODE_NAME_CASE(INTERP_P1LL_F16)
- NODE_NAME_CASE(INTERP_P1LV_F16)
- NODE_NAME_CASE(INTERP_P2_F16)
NODE_NAME_CASE(LOAD_D16_HI)
NODE_NAME_CASE(LOAD_D16_LO)
NODE_NAME_CASE(LOAD_D16_HI_I8)
@@ -4347,6 +4336,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(ATOMIC_DEC)
NODE_NAME_CASE(ATOMIC_LOAD_FMIN)
NODE_NAME_CASE(ATOMIC_LOAD_FMAX)
+ NODE_NAME_CASE(ATOMIC_LOAD_CSUB)
NODE_NAME_CASE(BUFFER_LOAD)
NODE_NAME_CASE(BUFFER_LOAD_UBYTE)
NODE_NAME_CASE(BUFFER_LOAD_USHORT)
@@ -4373,6 +4363,7 @@ const char* AMDGPUTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE_NAME_CASE(BUFFER_ATOMIC_INC)
NODE_NAME_CASE(BUFFER_ATOMIC_DEC)
NODE_NAME_CASE(BUFFER_ATOMIC_CMPSWAP)
+ NODE_NAME_CASE(BUFFER_ATOMIC_CSUB)
NODE_NAME_CASE(BUFFER_ATOMIC_FADD)
NODE_NAME_CASE(BUFFER_ATOMIC_PK_FADD)
NODE_NAME_CASE(ATOMIC_PK_FADD)
@@ -4539,11 +4530,10 @@ void AMDGPUTargetLowering::computeKnownBitsForTargetNode(
}
case AMDGPUISD::LDS: {
auto GA = cast<GlobalAddressSDNode>(Op.getOperand(0).getNode());
- unsigned Align = GA->getGlobal()->getAlignment();
+ Align Alignment = GA->getGlobal()->getPointerAlignment(DAG.getDataLayout());
Known.Zero.setHighBits(16);
- if (Align)
- Known.Zero.setLowBits(Log2_32(Align));
+ Known.Zero.setLowBits(Log2(Alignment));
break;
}
case ISD::INTRINSIC_WO_CHAIN: {
@@ -4607,6 +4597,29 @@ unsigned AMDGPUTargetLowering::ComputeNumSignBitsForTargetNode(
}
}
+unsigned AMDGPUTargetLowering::computeNumSignBitsForTargetInstr(
+ GISelKnownBits &Analysis, Register R,
+ const APInt &DemandedElts, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ if (!MI)
+ return 1;
+
+ // TODO: Check range metadata on MMO.
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ return 25;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ return 17;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ return 24;
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ return 16;
+ default:
+ return 1;
+ }
+}
+
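The constants above follow directly from the load widths: a value sign-extended from N bits into 32 has 32 - N + 1 known sign bits, and a zero-extended one has 32 - N (the high zeros). A compile-time check of the four cases (helper names illustrative):

constexpr unsigned signBitsOfSExt(unsigned SrcBits) { return 32 - SrcBits + 1; }
constexpr unsigned signBitsOfZExt(unsigned SrcBits) { return 32 - SrcBits; }
static_assert(signBitsOfSExt(8) == 25 && signBitsOfSExt(16) == 17,
              "sign-extending byte/short buffer loads");
static_assert(signBitsOfZExt(8) == 24 && signBitsOfZExt(16) == 16,
              "zero-extending byte/short buffer loads");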
bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN,
@@ -4648,7 +4661,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::RCP:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RSQ_CLAMP: {
if (SNaN)
return true;
@@ -4665,7 +4677,6 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
case AMDGPUISD::DIV_SCALE:
case AMDGPUISD::DIV_FMAS:
case AMDGPUISD::DIV_FIXUP:
- case AMDGPUISD::TRIG_PREOP:
// TODO: Refine on operands.
return SNaN;
case AMDGPUISD::SIN_HW:
@@ -4692,6 +4703,18 @@ bool AMDGPUTargetLowering::isKnownNeverNaNForTargetNode(SDValue Op,
return DAG.isKnownNeverNaN(Op.getOperand(1), SNaN, Depth + 1) &&
DAG.isKnownNeverNaN(Op.getOperand(2), SNaN, Depth + 1);
}
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_rsq_clamp: {
+ if (SNaN)
+ return true;
+
+ // TODO: Need an is-known-positive check.
+ return false;
+ }
+ case Intrinsic::amdgcn_trig_preop:
case Intrinsic::amdgcn_fdot2:
// TODO: Refine on operand
return SNaN;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
index a90b7f5653dc..85f23c81db17 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUISelLowering.h
@@ -18,6 +18,7 @@
#include "AMDGPU.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
namespace llvm {
@@ -52,8 +53,6 @@ protected:
SDValue LowerFRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFNEARBYINT(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFROUND_LegalFTRUNC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerFROUND64(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFFLOOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFLOG(SDValue Op, SelectionDAG &DAG,
@@ -172,8 +171,16 @@ public:
bool isZExtFree(EVT Src, EVT Dest) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const override;
+
bool isNarrowingProfitable(EVT VT1, EVT VT2) const override;
+ EVT getTypeForExtReturn(LLVMContext &Context, EVT VT,
+ ISD::NodeType ExtendKind) const override;
+
MVT getVectorIdxTy(const DataLayout &) const override;
bool isSelectSupported(SelectSupportKind) const override;
@@ -264,6 +271,12 @@ public:
const SelectionDAG &DAG,
unsigned Depth = 0) const override;
+ unsigned computeNumSignBitsForTargetInstr(GISelKnownBits &Analysis,
+ Register R,
+ const APInt &DemandedElts,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth = 0) const override;
+
bool isKnownNeverNaNForTargetNode(SDValue Op,
const SelectionDAG &DAG,
bool SNaN = false,
@@ -276,19 +289,19 @@ public:
/// a copy from the register.
SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT,
+ Register Reg, EVT VT,
const SDLoc &SL,
bool RawReg = false) const;
SDValue CreateLiveInRegister(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ Register Reg, EVT VT) const {
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()));
}
// Returns the raw live in register rather than a copy from it.
SDValue CreateLiveInRegisterRaw(SelectionDAG &DAG,
const TargetRegisterClass *RC,
- unsigned Reg, EVT VT) const {
+ Register Reg, EVT VT) const {
return CreateLiveInRegister(DAG, RC, Reg, VT, SDLoc(DAG.getEntryNode()), true);
}
@@ -398,14 +411,12 @@ enum NodeType : unsigned {
// For emitting ISD::FMAD when f32 denormals are enabled because mac/mad is
// treated as an illegal operation.
FMAD_FTZ,
- TRIG_PREOP, // 1 ULP max error for f64
// RCP, RSQ - For f32, 1 ULP max error, no denormal handling.
// For f64, max error 2^29 ULP, handles denormals.
RCP,
RSQ,
RCP_LEGACY,
- RSQ_LEGACY,
RCP_IFLAG,
FMUL_LEGACY,
RSQ_CLAMP,
@@ -433,8 +444,6 @@ enum NodeType : unsigned {
MUL_LOHI_U24,
PERM,
TEXTURE_FETCH,
- EXPORT, // exp on SI+
- EXPORT_DONE, // exp on SI+ with done bit set
R600_EXPORT,
CONST_ADDRESS,
REGISTER_LOAD,
@@ -476,12 +485,8 @@ enum NodeType : unsigned {
BUILD_VERTICAL_VECTOR,
/// Pointer to the start of the shader's constant data.
CONST_DATA_PTR,
- INTERP_P1LL_F16,
- INTERP_P1LV_F16,
- INTERP_P2_F16,
PC_ADD_REL_OFFSET,
LDS,
- KILL,
DUMMY_CHAIN,
FIRST_MEM_OPCODE_NUMBER = ISD::FIRST_TARGET_MEMORY_OPCODE,
LOAD_D16_HI,
@@ -503,6 +508,7 @@ enum NodeType : unsigned {
ATOMIC_DEC,
ATOMIC_LOAD_FMIN,
ATOMIC_LOAD_FMAX,
+ ATOMIC_LOAD_CSUB,
BUFFER_LOAD,
BUFFER_LOAD_UBYTE,
BUFFER_LOAD_USHORT,
@@ -529,6 +535,7 @@ enum NodeType : unsigned {
BUFFER_ATOMIC_INC,
BUFFER_ATOMIC_DEC,
BUFFER_ATOMIC_CMPSWAP,
+ BUFFER_ATOMIC_CSUB,
BUFFER_ATOMIC_FADD,
BUFFER_ATOMIC_PK_FADD,
ATOMIC_PK_FADD,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
index 64d761997b0c..3b5d91133a2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInline.cpp
@@ -23,7 +23,6 @@
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -67,9 +66,9 @@ public:
static char ID; // Pass identification, replacement for typeid
- unsigned getInlineThreshold(CallSite CS) const;
+ unsigned getInlineThreshold(CallBase &CB) const;
- InlineCost getInlineCost(CallSite CS) override;
+ InlineCost getInlineCost(CallBase &CB) override;
bool runOnSCC(CallGraphSCC &SCC) override;
@@ -106,13 +105,13 @@ void AMDGPUInliner::getAnalysisUsage(AnalysisUsage &AU) const {
LegacyInlinerBase::getAnalysisUsage(AU);
}
-unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
+unsigned AMDGPUInliner::getInlineThreshold(CallBase &CB) const {
int Thres = Params.DefaultThreshold;
- Function *Caller = CS.getCaller();
+ Function *Caller = CB.getCaller();
// Listen to the inlinehint attribute when it would increase the threshold
// and the caller does not need to minimize its size.
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CB.getCalledFunction();
bool InlineHint = Callee && !Callee->isDeclaration() &&
Callee->hasFnAttribute(Attribute::InlineHint);
if (InlineHint && Params.HintThreshold && Params.HintThreshold > Thres
@@ -129,7 +128,7 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
 // Increase the inline threshold to allow inlining in this case.
uint64_t AllocaSize = 0;
SmallPtrSet<const AllocaInst *, 8> AIVisited;
- for (Value *PtrArg : CS.args()) {
+ for (Value *PtrArg : CB.args()) {
PointerType *Ty = dyn_cast<PointerType>(PtrArg->getType());
if (!Ty || (Ty->getAddressSpace() != AMDGPUAS::PRIVATE_ADDRESS &&
Ty->getAddressSpace() != AMDGPUAS::FLAT_ADDRESS))
@@ -156,8 +155,8 @@ unsigned AMDGPUInliner::getInlineThreshold(CallSite CS) const {
// Check if call is just a wrapper around another call.
// In this case we only have call and ret instructions.
-static bool isWrapperOnlyCall(CallSite CS) {
- Function *Callee = CS.getCalledFunction();
+static bool isWrapperOnlyCall(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
if (!Callee || Callee->size() != 1)
return false;
const BasicBlock &BB = Callee->getEntryBlock();
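In source terms, a wrapper-only callee is a function whose entire body is a single call plus the return, roughly as below (illustrative C++, hypothetical names):

float impl(float);
// The whole body lowers to one call and one ret, so inlining it is always worthwhile.
float wrapper(float x) { return impl(x); }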
@@ -174,32 +173,32 @@ static bool isWrapperOnlyCall(CallSite CS) {
return false;
}
-InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
- Function *Callee = CS.getCalledFunction();
- Function *Caller = CS.getCaller();
+InlineCost AMDGPUInliner::getInlineCost(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
+ Function *Caller = CB.getCaller();
if (!Callee || Callee->isDeclaration())
return llvm::InlineCost::getNever("undefined callee");
- if (CS.isNoInline())
+ if (CB.isNoInline())
return llvm::InlineCost::getNever("noinline");
TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
if (!TTI.areInlineCompatible(Caller, Callee))
return llvm::InlineCost::getNever("incompatible");
- if (CS.hasFnAttr(Attribute::AlwaysInline)) {
+ if (CB.hasFnAttr(Attribute::AlwaysInline)) {
auto IsViable = isInlineViable(*Callee);
- if (IsViable)
+ if (IsViable.isSuccess())
return llvm::InlineCost::getAlways("alwaysinline viable");
- return llvm::InlineCost::getNever(IsViable.message);
+ return llvm::InlineCost::getNever(IsViable.getFailureReason());
}
- if (isWrapperOnlyCall(CS))
+ if (isWrapperOnlyCall(CB))
return llvm::InlineCost::getAlways("wrapper-only call");
InlineParams LocalParams = Params;
- LocalParams.DefaultThreshold = (int)getInlineThreshold(CS);
+ LocalParams.DefaultThreshold = (int)getInlineThreshold(CB);
bool RemarksEnabled = false;
const auto &BBs = Caller->getBasicBlockList();
if (!BBs.empty()) {
@@ -209,14 +208,13 @@ InlineCost AMDGPUInliner::getInlineCost(CallSite CS) {
}
OptimizationRemarkEmitter ORE(Caller);
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [this](Function &F) -> AssumptionCache & {
+ auto GetAssumptionCache = [this](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
- auto IC = llvm::getInlineCost(cast<CallBase>(*CS.getInstruction()), Callee,
- LocalParams, TTI, GetAssumptionCache, None, PSI,
- RemarksEnabled ? &ORE : nullptr);
+ auto IC = llvm::getInlineCost(CB, Callee, LocalParams, TTI,
+ GetAssumptionCache, GetTLI, nullptr, PSI,
+ RemarksEnabled ? &ORE : nullptr);
if (IC && !IC.isAlways() && !Callee->hasFnAttribute(Attribute::InlineHint)) {
// Single BB does not increase total BB amount, thus subtract 1
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
index 9951cbf2326e..6c13bc8599db 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPUInstrInfo.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
index 698189e14c21..61b78acad3f4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.h
@@ -55,6 +55,9 @@ struct ImageDimIntrinsicInfo {
};
const ImageDimIntrinsicInfo *getImageDimIntrinsicInfo(unsigned Intr);
+const ImageDimIntrinsicInfo *getImageDimInstrinsicByBaseOpcode(unsigned BaseOpcode,
+ unsigned Dim);
+
} // end AMDGPU namespace
} // End llvm namespace
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
index 50c451be4b86..894677ec68b6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstrInfo.td
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file contains DAG node defintions for the AMDGPU target.
+// This file contains DAG node definitions for the AMDGPU target.
//
//===----------------------------------------------------------------------===//
@@ -18,10 +18,6 @@ def AMDGPUDTIntTernaryOp : SDTypeProfile<1, 3, [
SDTCisSameAs<0, 1>, SDTCisSameAs<0, 2>, SDTCisInt<0>, SDTCisInt<3>
]>;
-def AMDGPUTrigPreOp : SDTypeProfile<1, 2,
- [SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
->;
-
def AMDGPULdExpOp : SDTypeProfile<1, 2,
[SDTCisSameAs<0, 1>, SDTCisFP<0>, SDTCisInt<2>]
>;
@@ -121,8 +117,6 @@ def AMDGPUrcp_impl : SDNode<"AMDGPUISD::RCP", SDTFPUnaryOp>;
// out = 1.0 / sqrt(a)
def AMDGPUrsq_impl : SDNode<"AMDGPUISD::RSQ", SDTFPUnaryOp>;
-// out = 1.0 / sqrt(a)
-def AMDGPUrsq_legacy_impl : SDNode<"AMDGPUISD::RSQ_LEGACY", SDTFPUnaryOp>;
def AMDGPUrcp_legacy_impl : SDNode<"AMDGPUISD::RCP_LEGACY", SDTFPUnaryOp>;
def AMDGPUrcp_iflag : SDNode<"AMDGPUISD::RCP_IFLAG", SDTFPUnaryOp>;
@@ -151,7 +145,7 @@ def AMDGPUfmax_legacy : SDNode<"AMDGPUISD::FMAX_LEGACY", SDTFPBinOp,
[]
>;
-def AMDGPUfmul_legacy : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
+def AMDGPUfmul_legacy_impl : SDNode<"AMDGPUISD::FMUL_LEGACY", SDTFPBinOp,
[SDNPCommutative, SDNPAssociative]
>;
@@ -204,13 +198,6 @@ def AMDGPUSetCCOp : SDTypeProfile<1, 3, [ // setcc
def AMDGPUsetcc : SDNode<"AMDGPUISD::SETCC", AMDGPUSetCCOp>;
-def AMDGPUSetRegOp : SDTypeProfile<0, 2, [
- SDTCisInt<0>, SDTCisInt<1>
-]>;
-
-def AMDGPUsetreg : SDNode<"AMDGPUISD::SETREG", AMDGPUSetRegOp, [
- SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]>;
-
def AMDGPUfma : SDNode<"AMDGPUISD::FMA_W_CHAIN", SDTFPTernaryOp, [
SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
@@ -238,7 +225,7 @@ def AMDGPUdiv_scale : SDNode<"AMDGPUISD::DIV_SCALE", AMDGPUDivScaleOp>;
// Special case divide FMA with scale and flags (src0 = Quotient,
// src1 = Denominator, src2 = Numerator).
-def AMDGPUdiv_fmas : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
+def AMDGPUdiv_fmas_impl : SDNode<"AMDGPUISD::DIV_FMAS", AMDGPUFmasOp,
[SDNPOptInGlue]>;
// Single or double precision division fixup.
@@ -248,9 +235,6 @@ def AMDGPUdiv_fixup_impl : SDNode<"AMDGPUISD::DIV_FIXUP", SDTFPTernaryOp>;
def AMDGPUfmad_ftz_impl : SDNode<"AMDGPUISD::FMAD_FTZ", SDTFPTernaryOp>;
-// Look Up 2.0 / pi src0 with segment select src1[4:0]
-def AMDGPUtrig_preop : SDNode<"AMDGPUISD::TRIG_PREOP", AMDGPUTrigPreOp>;
-
def AMDGPUregister_load : SDNode<"AMDGPUISD::REGISTER_LOAD",
SDTypeProfile<1, 2, [SDTCisPtrTy<1>, SDTCisInt<2>]>,
[SDNPHasChain, SDNPMayLoad]>;
@@ -278,18 +262,18 @@ def AMDGPUatomic_cmp_swap : SDNode<"AMDGPUISD::ATOMIC_CMP_SWAP",
def AMDGPUround : SDNode<"ISD::FROUND",
SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisSameAs<0,1>]>>;
-def AMDGPUbfe_u32 : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
-def AMDGPUbfe_i32 : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_u32_impl : SDNode<"AMDGPUISD::BFE_U32", AMDGPUDTIntTernaryOp>;
+def AMDGPUbfe_i32_impl : SDNode<"AMDGPUISD::BFE_I32", AMDGPUDTIntTernaryOp>;
def AMDGPUbfi : SDNode<"AMDGPUISD::BFI", AMDGPUDTIntTernaryOp>;
def AMDGPUbfm : SDNode<"AMDGPUISD::BFM", SDTIntBinOp>;
-def AMDGPUffbh_u32 : SDNode<"AMDGPUISD::FFBH_U32", SDTIntUnaryOp>;
-def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntUnaryOp>;
+def AMDGPUffbh_u32_impl : SDNode<"AMDGPUISD::FFBH_U32", SDTIntBitCountUnaryOp>;
+def AMDGPUffbh_i32_impl : SDNode<"AMDGPUISD::FFBH_I32", SDTIntBitCountUnaryOp>;
-def AMDGPUffbl_b32 : SDNode<"AMDGPUISD::FFBL_B32", SDTIntUnaryOp>;
+def AMDGPUffbl_b32_impl : SDNode<"AMDGPUISD::FFBL_B32", SDTIntBitCountUnaryOp>;
 // Signed and unsigned 24-bit multiply. The highest 8 bits are ignored
-// when performing the mulitply. The result is a 32-bit value.
+// when performing the multiply. The result is a 32-bit value.
def AMDGPUmul_u24_impl : SDNode<"AMDGPUISD::MUL_U24", SDTIntBinOp,
[SDNPCommutative, SDNPAssociative]
>;
@@ -321,7 +305,7 @@ def AMDGPUumed3 : SDNode<"AMDGPUISD::UMED3", AMDGPUDTIntTernaryOp,
def AMDGPUfmed3_impl : SDNode<"AMDGPUISD::FMED3", SDTFPTernaryOp, []>;
-def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
+def AMDGPUfdot2_impl : SDNode<"AMDGPUISD::FDOT2",
SDTypeProfile<1, 4, [SDTCisSameAs<0, 3>, SDTCisSameAs<1, 2>,
SDTCisFP<0>, SDTCisVec<1>,
SDTCisInt<4>]>,
@@ -329,21 +313,6 @@ def AMDGPUfdot2 : SDNode<"AMDGPUISD::FDOT2",
def AMDGPUperm : SDNode<"AMDGPUISD::PERM", AMDGPUDTIntTernaryOp, []>;
-def AMDGPUinterp_p1ll_f16 : SDNode<"AMDGPUISD::INTERP_P1LL_F16",
- SDTypeProfile<1, 7, [SDTCisFP<0>]>,
- [SDNPInGlue, SDNPOutGlue]>;
-
-def AMDGPUinterp_p1lv_f16 : SDNode<"AMDGPUISD::INTERP_P1LV_F16",
- SDTypeProfile<1, 9, [SDTCisFP<0>]>,
- [SDNPInGlue, SDNPOutGlue]>;
-
-def AMDGPUinterp_p2_f16 : SDNode<"AMDGPUISD::INTERP_P2_F16",
- SDTypeProfile<1, 8, [SDTCisFP<0>]>,
- [SDNPInGlue]>;
-
-def AMDGPUkill : SDNode<"AMDGPUISD::KILL", AMDGPUKillSDT,
- [SDNPHasChain, SDNPSideEffect]>;
-
// SI+ export
def AMDGPUExportOp : SDTypeProfile<0, 8, [
SDTCisInt<0>, // i8 tgt
@@ -358,12 +327,6 @@ def AMDGPUExportOp : SDTypeProfile<0, 8, [
]>;
-def AMDGPUexport: SDNode<"AMDGPUISD::EXPORT", AMDGPUExportOp,
- [SDNPHasChain, SDNPMayStore]>;
-
-def AMDGPUexport_done: SDNode<"AMDGPUISD::EXPORT_DONE", AMDGPUExportOp,
- [SDNPHasChain, SDNPMayLoad, SDNPMayStore]>;
-
def R600ExportOp : SDTypeProfile<0, 7, [SDTCisFP<0>, SDTCisInt<1>]>;
@@ -398,7 +361,7 @@ def AMDGPUret_flag : SDNode<"AMDGPUISD::RET_FLAG", SDTypeProfile<0, 1, [SDTCisPt
//===----------------------------------------------------------------------===//
-// Intrinsic/Custom node compatability PatFrags
+// Intrinsic/Custom node compatibility PatFrags
//===----------------------------------------------------------------------===//
def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src),
@@ -406,9 +369,6 @@ def AMDGPUrcp : PatFrags<(ops node:$src), [(int_amdgcn_rcp node:$src),
def AMDGPUrcp_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rcp_legacy node:$src),
(AMDGPUrcp_legacy_impl node:$src)]>;
-def AMDGPUrsq_legacy : PatFrags<(ops node:$src), [(int_amdgcn_rsq_legacy node:$src),
- (AMDGPUrsq_legacy_impl node:$src)]>;
-
def AMDGPUrsq : PatFrags<(ops node:$src), [(int_amdgcn_rsq node:$src),
(AMDGPUrsq_impl node:$src)]>;
@@ -442,6 +402,14 @@ def AMDGPUffbh_i32 : PatFrags<(ops node:$src),
[(int_amdgcn_sffbh node:$src),
(AMDGPUffbh_i32_impl node:$src)]>;
+def AMDGPUffbh_u32 : PatFrags<(ops node:$src),
+ [(ctlz_zero_undef node:$src),
+ (AMDGPUffbh_u32_impl node:$src)]>;
+
+def AMDGPUffbl_b32 : PatFrags<(ops node:$src),
+ [(cttz_zero_undef node:$src),
+ (AMDGPUffbl_b32_impl node:$src)]>;
+
def AMDGPUpkrtz_f16_f32 : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_cvt_pkrtz node:$src0, node:$src1),
(AMDGPUpkrtz_f16_f32_impl node:$src0, node:$src1)]>;
@@ -473,3 +441,23 @@ def AMDGPUmul_u24 : PatFrags<(ops node:$src0, node:$src1),
def AMDGPUmul_i24 : PatFrags<(ops node:$src0, node:$src1),
[(int_amdgcn_mul_i24 node:$src0, node:$src1),
(AMDGPUmul_i24_impl node:$src0, node:$src1)]>;
+
+def AMDGPUbfe_i32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_sbfe node:$src0, node:$src1, node:$src2),
+ (AMDGPUbfe_i32_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUbfe_u32 : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(int_amdgcn_ubfe node:$src0, node:$src1, node:$src2),
+ (AMDGPUbfe_u32_impl node:$src0, node:$src1, node:$src2)]>;
+
+def AMDGPUfmul_legacy : PatFrags<(ops node:$src0, node:$src1),
+ [(int_amdgcn_fmul_legacy node:$src0, node:$src1),
+ (AMDGPUfmul_legacy_impl node:$src0, node:$src1)]>;
+
+def AMDGPUfdot2 : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$clamp),
+ [(int_amdgcn_fdot2 node:$src0, node:$src1, node:$src2, node:$clamp),
+ (AMDGPUfdot2_impl node:$src0, node:$src1, node:$src2, node:$clamp)]>;
+
+def AMDGPUdiv_fmas : PatFrags<(ops node:$src0, node:$src1, node:$src2, node:$vcc),
+ [(int_amdgcn_div_fmas node:$src0, node:$src1, node:$src2, node:$vcc),
+ (AMDGPUdiv_fmas_impl node:$src0, node:$src1, node:$src2, node:$vcc)]>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
index c0ea35817ec8..2025c0fa5d21 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.cpp
@@ -15,7 +15,6 @@
#include "AMDGPUInstrInfo.h"
#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPURegisterBankInfo.h"
-#include "AMDGPURegisterInfo.h"
#include "AMDGPUSubtarget.h"
#include "AMDGPUTargetMachine.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -40,6 +39,12 @@
using namespace llvm;
using namespace MIPatternMatch;
+static cl::opt<bool> AllowRiskySelect(
+ "amdgpu-global-isel-risky-select",
+ cl::desc("Allow GlobalISel to select cases that are likely to not work yet"),
+ cl::init(false),
+ cl::ReallyHidden);
+
#define GET_GLOBALISEL_IMPL
#define AMDGPUSubtarget GCNSubtarget
#include "AMDGPUGenGlobalISel.inc"
@@ -88,6 +93,30 @@ bool AMDGPUInstructionSelector::isVCC(Register Reg,
return RB->getID() == AMDGPU::VCCRegBankID;
}
+bool AMDGPUInstructionSelector::constrainCopyLikeIntrin(MachineInstr &MI,
+ unsigned NewOpc) const {
+ MI.setDesc(TII.get(NewOpc));
+ MI.RemoveOperand(1); // Remove intrinsic ID.
+ MI.addOperand(*MF, MachineOperand::CreateReg(AMDGPU::EXEC, false, true));
+
+ MachineOperand &Dst = MI.getOperand(0);
+ MachineOperand &Src = MI.getOperand(1);
+
+ // TODO: This should be legalized to s32 if needed
+ if (MRI->getType(Dst.getReg()) == LLT::scalar(1))
+ return false;
+
+ const TargetRegisterClass *DstRC
+ = TRI.getConstrainedRegClassForOperand(Dst, *MRI);
+ const TargetRegisterClass *SrcRC
+ = TRI.getConstrainedRegClassForOperand(Src, *MRI);
+ if (!DstRC || DstRC != SrcRC)
+ return false;
+
+ return RBI.constrainGenericRegister(Dst.getReg(), *DstRC, *MRI) &&
+ RBI.constrainGenericRegister(Src.getReg(), *SrcRC, *MRI);
+}
+
bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock *BB = I.getParent();
@@ -173,6 +202,14 @@ bool AMDGPUInstructionSelector::selectCOPY(MachineInstr &I) const {
bool AMDGPUInstructionSelector::selectPHI(MachineInstr &I) const {
const Register DefReg = I.getOperand(0).getReg();
const LLT DefTy = MRI->getType(DefReg);
+ if (DefTy == LLT::scalar(1)) {
+ if (!AllowRiskySelect) {
+ LLVM_DEBUG(dbgs() << "Skipping risky boolean phi\n");
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "Selecting risky boolean phi\n");
+ }
// TODO: Verify this doesn't have insane operands (i.e. VGPR to SGPR copy)
@@ -261,6 +298,11 @@ bool AMDGPUInstructionSelector::selectG_AND_OR_XOR(MachineInstr &I) const {
unsigned InstOpc = getLogicalBitOpcode(I.getOpcode(),
RC == &AMDGPU::SReg_64RegClass);
I.setDesc(TII.get(InstOpc));
+ // Dead implicit-def of scc
+ I.addOperand(MachineOperand::CreateReg(AMDGPU::SCC, true, // isDef
+ true, // isImp
+ false, // isKill
+ true)); // isDead
// FIXME: Hack to avoid turning the register bank into a register class.
// The selector for G_ICMP relies on seeing the register bank for the result
@@ -295,7 +337,11 @@ bool AMDGPUInstructionSelector::selectG_ADD_SUB(MachineInstr &I) const {
MachineFunction *MF = BB->getParent();
Register DstReg = I.getOperand(0).getReg();
const DebugLoc &DL = I.getDebugLoc();
- unsigned Size = RBI.getSizeInBits(DstReg, *MRI, TRI);
+ LLT Ty = MRI->getType(DstReg);
+ if (Ty.isVector())
+ return false;
+
+ unsigned Size = Ty.getSizeInBits();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsSALU = DstRB->getID() == AMDGPU::SGPRRegBankID;
const bool Sub = I.getOpcode() == TargetOpcode::G_SUB;
@@ -445,6 +491,7 @@ bool AMDGPUInstructionSelector::selectG_UADDO_USUBO_UADDE_USUBE(
return true;
}
+// TODO: We should probably legalize these to use only 32-bit results.
bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
Register DstReg = I.getOperand(0).getReg();
@@ -452,11 +499,21 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
LLT DstTy = MRI->getType(DstReg);
LLT SrcTy = MRI->getType(SrcReg);
const unsigned SrcSize = SrcTy.getSizeInBits();
- const unsigned DstSize = DstTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
// TODO: Should handle any multiple of 32 offset.
unsigned Offset = I.getOperand(2).getImm();
- if (Offset % DstSize != 0)
+ if (Offset % 32 != 0 || DstSize > 128)
+ return false;
+
+ // 16-bit operations really use 32-bit registers.
+ // FIXME: Probably should not allow 16-bit G_EXTRACT results.
+ if (DstSize == 16)
+ DstSize = 32;
+
+ const TargetRegisterClass *DstRC =
+ TRI.getConstrainedRegClassForOperand(I.getOperand(0), *MRI);
+ if (!DstRC || !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI))
return false;
const RegisterBank *SrcBank = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -464,20 +521,18 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT(MachineInstr &I) const {
TRI.getRegClassForSizeOnBank(SrcSize, *SrcBank, *MRI);
if (!SrcRC)
return false;
+ unsigned SubReg = SIRegisterInfo::getSubRegFromChannel(Offset / 32,
+ DstSize / 32);
+ SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubReg);
+ if (!SrcRC)
+ return false;
- ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SrcRC, DstSize / 8);
-
+ SrcReg = constrainOperandRegClass(*MF, TRI, *MRI, TII, RBI, I,
+ *SrcRC, I.getOperand(1));
const DebugLoc &DL = I.getDebugLoc();
- MachineInstr *Copy = BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
- .addReg(SrcReg, 0, SubRegs[Offset / DstSize]);
+ BuildMI(*BB, &I, DL, TII.get(TargetOpcode::COPY), DstReg)
+ .addReg(SrcReg, 0, SubReg);
- for (const MachineOperand &MO : Copy->operands()) {
- const TargetRegisterClass *RC =
- TRI.getConstrainedRegClassForOperand(MO, *MRI);
- if (!RC)
- continue;
- RBI.constrainGenericRegister(MO.getReg(), *RC, *MRI);
- }
I.eraseFromParent();
return true;
}
@@ -563,6 +618,90 @@ bool AMDGPUInstructionSelector::selectG_UNMERGE_VALUES(MachineInstr &MI) const {
return true;
}
+static bool isZero(Register Reg, const MachineRegisterInfo &MRI) {
+ int64_t Val;
+ return mi_match(Reg, MRI, m_ICst(Val)) && Val == 0;
+}
+
+bool AMDGPUInstructionSelector::selectG_BUILD_VECTOR_TRUNC(
+ MachineInstr &MI) const {
+ if (selectImpl(MI, *CoverageInfo))
+ return true;
+
+ const LLT S32 = LLT::scalar(32);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ Register Dst = MI.getOperand(0).getReg();
+ if (MRI->getType(Dst) != V2S16)
+ return false;
+
+ const RegisterBank *DstBank = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstBank->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ if (MRI->getType(Src0) != S32)
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ // TODO: This should probably be a combine somewhere
+ // (build_vector_trunc $src0, undef) -> copy $src0
+ MachineInstr *Src1Def = getDefIgnoringCopies(Src1, *MRI);
+ if (Src1Def && Src1Def->getOpcode() == AMDGPU::G_IMPLICIT_DEF) {
+ MI.setDesc(TII.get(AMDGPU::COPY));
+ MI.RemoveOperand(2);
+ return RBI.constrainGenericRegister(Dst, AMDGPU::SReg_32RegClass, *MRI) &&
+ RBI.constrainGenericRegister(Src0, AMDGPU::SReg_32RegClass, *MRI);
+ }
+
+ Register ShiftSrc0;
+ Register ShiftSrc1;
+ int64_t ShiftAmt;
+
+ // With multiple uses of the shift, this will duplicate the shift and
+ // increase register pressure.
+ //
+ // (build_vector_trunc (lshr_oneuse $src0, 16), (lshr_oneuse $src1, 16))
+ // => (S_PACK_HH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, (lshr_oneuse SReg_32:$src1, 16))
+ // => (S_PACK_LH_B32_B16 $src0, $src1)
+ // (build_vector_trunc $src0, $src1)
+ // => (S_PACK_LL_B32_B16 $src0, $src1)
+
+ // FIXME: This is an inconvenient way to check a specific value
+ bool Shift0 = mi_match(
+ Src0, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc0), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ bool Shift1 = mi_match(
+ Src1, *MRI, m_OneUse(m_GLShr(m_Reg(ShiftSrc1), m_ICst(ShiftAmt)))) &&
+ ShiftAmt == 16;
+
+ unsigned Opc = AMDGPU::S_PACK_LL_B32_B16;
+ if (Shift0 && Shift1) {
+ Opc = AMDGPU::S_PACK_HH_B32_B16;
+ MI.getOperand(1).setReg(ShiftSrc0);
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift1) {
+ Opc = AMDGPU::S_PACK_LH_B32_B16;
+ MI.getOperand(2).setReg(ShiftSrc1);
+ } else if (Shift0 && isZero(Src1, *MRI)) {
+ // build_vector_trunc (lshr $src0, 16), 0 -> s_lshr_b32 $src0, 16
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_LSHR_B32), Dst)
+ .addReg(ShiftSrc0)
+ .addImm(16);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ }
+
+ MI.setDesc(TII.get(Opc));
+ return constrainSelectedInstRegOperands(MI, TII, TRI, RBI);
+}
+
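The three pack opcodes chosen above differ only in which 16-bit half of each source ends up in the result; as far as I understand the scalar pack instructions, their bit-level behaviour is as sketched here (host-side, illustrative names):

#include <cstdint>

uint32_t s_pack_ll(uint32_t a, uint32_t b) {   // low(a)  | low(b)  << 16
  return (a & 0xffffu) | (b << 16);
}
uint32_t s_pack_lh(uint32_t a, uint32_t b) {   // low(a)  | high(b) << 16
  return (a & 0xffffu) | (b & 0xffff0000u);
}
uint32_t s_pack_hh(uint32_t a, uint32_t b) {   // high(a) | high(b) << 16
  return (a >> 16) | (b & 0xffff0000u);
}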
bool AMDGPUInstructionSelector::selectG_PTR_ADD(MachineInstr &I) const {
return selectG_ADD_SUB(I);
}
@@ -594,7 +733,9 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
unsigned InsSize = Src1Ty.getSizeInBits();
int64_t Offset = I.getOperand(3).getImm();
- if (Offset % 32 != 0)
+
+ // FIXME: These cases should have been illegal and unnecessary to check here.
+ if (Offset % 32 != 0 || InsSize % 32 != 0)
return false;
unsigned SubReg = TRI.getSubRegFromChannel(Offset / 32, InsSize / 32);
@@ -617,7 +758,7 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
Src0RC = TRI.getSubClassWithSubReg(Src0RC, SubReg);
- if (!Src0RC)
+ if (!Src0RC || !Src1RC)
return false;
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
@@ -635,6 +776,85 @@ bool AMDGPUInstructionSelector::selectG_INSERT(MachineInstr &I) const {
return true;
}
+bool AMDGPUInstructionSelector::selectInterpP1F16(MachineInstr &MI) const {
+ if (STI.getLDSBankCount() != 16)
+ return selectImpl(MI, *CoverageInfo);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(2).getReg();
+ Register M0Val = MI.getOperand(6).getReg();
+ if (!RBI.constrainGenericRegister(M0Val, AMDGPU::SReg_32RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::VGPR_32RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Src0, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+
+ // This requires 2 instructions. It is possible to write a pattern to support
+ // this, but the generated isel emitter doesn't correctly deal with multiple
+ // output instructions using the same physical register input. The copy to m0
+ // is incorrectly placed before the second instruction.
+ //
+ // TODO: Match source modifiers.
+
+ Register InterpMov = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0Val);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_MOV_F32), InterpMov)
+ .addImm(2)
+ .addImm(MI.getOperand(4).getImm()) // $attr
+ .addImm(MI.getOperand(3).getImm()); // $attrchan
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::V_INTERP_P1LV_F16), Dst)
+ .addImm(0) // $src0_modifiers
+ .addReg(Src0) // $src0
+ .addImm(MI.getOperand(4).getImm()) // $attr
+ .addImm(MI.getOperand(3).getImm()) // $attrchan
+ .addImm(0) // $src2_modifiers
+ .addReg(InterpMov) // $src2 - 2 f16 values selected by high
+ .addImm(MI.getOperand(5).getImm()) // $high
+ .addImm(0) // $clamp
+ .addImm(0); // $omod
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// We need to handle this here because tablegen doesn't support matching
+// instructions with multiple outputs.
+bool AMDGPUInstructionSelector::selectDivScale(MachineInstr &MI) const {
+ Register Dst0 = MI.getOperand(0).getReg();
+ Register Dst1 = MI.getOperand(1).getReg();
+
+ LLT Ty = MRI->getType(Dst0);
+ unsigned Opc;
+ if (Ty == LLT::scalar(32))
+ Opc = AMDGPU::V_DIV_SCALE_F32;
+ else if (Ty == LLT::scalar(64))
+ Opc = AMDGPU::V_DIV_SCALE_F64;
+ else
+ return false;
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *MBB = MI.getParent();
+
+ Register Numer = MI.getOperand(3).getReg();
+ Register Denom = MI.getOperand(4).getReg();
+ unsigned ChooseDenom = MI.getOperand(5).getImm();
+
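+ // src0 must match either the numerator or the denominator operand; the
+ // intrinsic's boolean operand selects which one.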
+ Register Src0 = ChooseDenom != 0 ? Numer : Denom;
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opc), Dst0)
+ .addDef(Dst1)
+ .addUse(Src0)
+ .addUse(Denom)
+ .addUse(Numer);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
unsigned IntrinsicID = I.getIntrinsicID();
switch (IntrinsicID) {
@@ -659,6 +879,20 @@ bool AMDGPUInstructionSelector::selectG_INTRINSIC(MachineInstr &I) const {
return true;
}
+ case Intrinsic::amdgcn_interp_p1_f16:
+ return selectInterpP1F16(I);
+ case Intrinsic::amdgcn_wqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::WQM);
+ case Intrinsic::amdgcn_softwqm:
+ return constrainCopyLikeIntrin(I, AMDGPU::SOFT_WQM);
+ case Intrinsic::amdgcn_wwm:
+ return constrainCopyLikeIntrin(I, AMDGPU::WWM);
+ case Intrinsic::amdgcn_div_scale:
+ return selectDivScale(I);
+ case Intrinsic::amdgcn_icmp:
+ return selectIntrinsicIcmp(I);
+ case Intrinsic::amdgcn_ballot:
+ return selectBallot(I);
default:
return selectImpl(I, *CoverageInfo);
}
@@ -779,247 +1013,79 @@ bool AMDGPUInstructionSelector::selectG_ICMP(MachineInstr &I) const {
return Ret;
}
-static MachineInstr *
-buildEXP(const TargetInstrInfo &TII, MachineInstr *Insert, unsigned Tgt,
- unsigned Reg0, unsigned Reg1, unsigned Reg2, unsigned Reg3,
- unsigned VM, bool Compr, unsigned Enabled, bool Done) {
- const DebugLoc &DL = Insert->getDebugLoc();
- MachineBasicBlock &BB = *Insert->getParent();
- unsigned Opcode = Done ? AMDGPU::EXP_DONE : AMDGPU::EXP;
- return BuildMI(BB, Insert, DL, TII.get(Opcode))
- .addImm(Tgt)
- .addReg(Reg0)
- .addReg(Reg1)
- .addReg(Reg2)
- .addReg(Reg3)
- .addImm(VM)
- .addImm(Compr)
- .addImm(Enabled);
-}
-
-static bool isZero(Register Reg, MachineRegisterInfo &MRI) {
- int64_t C;
- if (mi_match(Reg, MRI, m_ICst(C)) && C == 0)
- return true;
-
- // FIXME: matcher should ignore copies
- return mi_match(Reg, MRI, m_Copy(m_ICst(C))) && C == 0;
-}
+bool AMDGPUInstructionSelector::selectIntrinsicIcmp(MachineInstr &I) const {
+ Register Dst = I.getOperand(0).getReg();
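+ // The result is a full wave mask in SGPRs, not a vcc-bank boolean, and its
+ // width must match the wavefront size.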
+ if (isVCC(Dst, *MRI))
+ return false;
-static unsigned extractGLC(unsigned AuxiliaryData) {
- return AuxiliaryData & 1;
-}
+ if (MRI->getType(Dst).getSizeInBits() != STI.getWavefrontSize())
+ return false;
-static unsigned extractSLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 1) & 1;
-}
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register SrcReg = I.getOperand(2).getReg();
+ unsigned Size = RBI.getSizeInBits(SrcReg, *MRI, TRI);
+ auto Pred = static_cast<CmpInst::Predicate>(I.getOperand(4).getImm());
-static unsigned extractDLC(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 2) & 1;
-}
+ int Opcode = getV_CMPOpcode(Pred, Size);
+ if (Opcode == -1)
+ return false;
-static unsigned extractSWZ(unsigned AuxiliaryData) {
- return (AuxiliaryData >> 3) & 1;
+ MachineInstr *ICmp = BuildMI(*BB, &I, DL, TII.get(Opcode), Dst)
+ .add(I.getOperand(2))
+ .add(I.getOperand(3));
+ RBI.constrainGenericRegister(ICmp->getOperand(0).getReg(), *TRI.getBoolRC(),
+ *MRI);
+ bool Ret = constrainSelectedInstRegOperands(*ICmp, TII, TRI, RBI);
+ I.eraseFromParent();
+ return Ret;
}
-static unsigned getBufferStoreOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- const int Size = Ty.getSizeInBits();
- switch (8 * MemSize) {
- case 8:
- return Offen ? AMDGPU::BUFFER_STORE_BYTE_OFFEN_exact :
- AMDGPU::BUFFER_STORE_BYTE_OFFSET_exact;
- case 16:
- return Offen ? AMDGPU::BUFFER_STORE_SHORT_OFFEN_exact :
- AMDGPU::BUFFER_STORE_SHORT_OFFSET_exact;
- default:
- unsigned Opc = Offen ? AMDGPU::BUFFER_STORE_DWORD_OFFEN_exact :
- AMDGPU::BUFFER_STORE_DWORD_OFFSET_exact;
- if (Size > 32)
- Opc = AMDGPU::getMUBUFOpcode(Opc, Size / 32);
- return Opc;
- }
-}
-
-static unsigned getBufferStoreFormatOpcode(LLT Ty,
- const unsigned MemSize,
- const bool Offen) {
- bool IsD16Packed = Ty.getScalarSizeInBits() == 16;
- bool IsD16Unpacked = 8 * MemSize < Ty.getSizeInBits();
- int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
-
- if (IsD16Packed) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- if (IsD16Unpacked) {
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XY_gfx80_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZ_gfx80_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_D16_XYZW_gfx80_OFFSET_exact;
- default:
- return -1;
- }
- }
-
- switch (NumElts) {
- case 1:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_X_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_X_OFFSET_exact;
- case 2:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XY_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XY_OFFSET_exact;
- case 3:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZ_OFFSET_exact;
- case 4:
- return Offen ? AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFEN_exact :
- AMDGPU::BUFFER_STORE_FORMAT_XYZW_OFFSET_exact;
- default:
- return -1;
- }
+bool AMDGPUInstructionSelector::selectBallot(MachineInstr &I) const {
+ MachineBasicBlock *BB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+ Register DstReg = I.getOperand(0).getReg();
+ const unsigned Size = MRI->getType(DstReg).getSizeInBits();
+ const bool Is64 = Size == 64;
- llvm_unreachable("unhandled buffer store");
-}
-
-// TODO: Move this to combiner
-// Returns base register, imm offset, total constant offset.
-std::tuple<Register, unsigned, unsigned>
-AMDGPUInstructionSelector::splitBufferOffsets(MachineIRBuilder &B,
- Register OrigOffset) const {
- const unsigned MaxImm = 4095;
- Register BaseReg;
- unsigned TotalConstOffset;
- MachineInstr *OffsetDef;
-
- std::tie(BaseReg, TotalConstOffset, OffsetDef)
- = AMDGPU::getBaseWithConstantOffset(*MRI, OrigOffset);
-
- unsigned ImmOffset = TotalConstOffset;
-
- // If the immediate value is too big for the immoffset field, put the value
- // and -4096 into the immoffset field so that the value that is copied/added
- // for the voffset field is a multiple of 4096, and it stands more chance
- // of being CSEd with the copy/add for another similar load/store.f
- // However, do not do that rounding down to a multiple of 4096 if that is a
- // negative number, as it appears to be illegal to have a negative offset
- // in the vgpr, even if adding the immediate offset makes it positive.
- unsigned Overflow = ImmOffset & ~MaxImm;
- ImmOffset -= Overflow;
- if ((int32_t)Overflow < 0) {
- Overflow += ImmOffset;
- ImmOffset = 0;
- }
-
- if (Overflow != 0) {
- // In case this is in a waterfall loop, insert offset code at the def point
- // of the offset, not inside the loop.
- MachineBasicBlock::iterator OldInsPt = B.getInsertPt();
- MachineBasicBlock &OldMBB = B.getMBB();
- B.setInstr(*OffsetDef);
-
- if (!BaseReg) {
- BaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(BaseReg)
- .addImm(Overflow);
- } else {
- Register OverflowVal = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- B.buildInstr(AMDGPU::V_MOV_B32_e32)
- .addDef(OverflowVal)
- .addImm(Overflow);
-
- Register NewBaseReg = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- TII.getAddNoCarry(B.getMBB(), B.getInsertPt(), B.getDebugLoc(), NewBaseReg)
- .addReg(BaseReg)
- .addReg(OverflowVal, RegState::Kill)
- .addImm(0);
- BaseReg = NewBaseReg;
- }
+ if (Size != STI.getWavefrontSize())
+ return false;
- B.setInsertPt(OldMBB, OldInsPt);
+ Optional<ValueAndVReg> Arg =
+ getConstantVRegValWithLookThrough(I.getOperand(2).getReg(), *MRI, true);
+
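+ // Fold constant inputs: a false (0) source becomes a zero mask and a true
+ // (-1) source becomes a copy of exec; otherwise just copy the mask register.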
+ if (Arg.hasValue()) {
+ const int64_t Value = Arg.getValue().Value;
+ if (Value == 0) {
+ unsigned Opcode = Is64 ? AMDGPU::S_MOV_B64 : AMDGPU::S_MOV_B32;
+ BuildMI(*BB, &I, DL, TII.get(Opcode), DstReg).addImm(0);
+ } else if (Value == -1) { // all ones
+ Register SrcReg = Is64 ? AMDGPU::EXEC : AMDGPU::EXEC_LO;
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
+ } else
+ return false;
+ } else {
+ Register SrcReg = I.getOperand(2).getReg();
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), DstReg).addReg(SrcReg);
}
- return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+ I.eraseFromParent();
+ return true;
}
-bool AMDGPUInstructionSelector::selectStoreIntrinsic(MachineInstr &MI,
- bool IsFormat) const {
- MachineIRBuilder B(MI);
- MachineFunction &MF = B.getMF();
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI->getType(VData);
-
- int Size = Ty.getSizeInBits();
- if (Size % 32 != 0)
- return false;
-
- // FIXME: Verifier should enforce 1 MMO for these intrinsics.
- MachineMemOperand *MMO = *MI.memoperands_begin();
- const int MemSize = MMO->getSize();
-
- Register RSrc = MI.getOperand(2).getReg();
- Register VOffset = MI.getOperand(3).getReg();
- Register SOffset = MI.getOperand(4).getReg();
- unsigned AuxiliaryData = MI.getOperand(5).getImm();
- unsigned ImmOffset;
- unsigned TotalOffset;
-
- std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
- if (TotalOffset != 0)
- MMO = MF.getMachineMemOperand(MMO, TotalOffset, MemSize);
-
- const bool Offen = !isZero(VOffset, *MRI);
-
- int Opc = IsFormat ? getBufferStoreFormatOpcode(Ty, MemSize, Offen) :
- getBufferStoreOpcode(Ty, MemSize, Offen);
- if (Opc == -1)
- return false;
-
- MachineInstrBuilder MIB = B.buildInstr(Opc)
- .addUse(VData);
-
- if (Offen)
- MIB.addUse(VOffset);
-
- MIB.addUse(RSrc)
- .addUse(SOffset)
- .addImm(ImmOffset)
- .addImm(extractGLC(AuxiliaryData))
- .addImm(extractSLC(AuxiliaryData))
- .addImm(0) // tfe: FIXME: Remove from inst
- .addImm(extractDLC(AuxiliaryData))
- .addImm(extractSWZ(AuxiliaryData))
- .addMemOperand(MMO);
+bool AMDGPUInstructionSelector::selectEndCfIntrinsic(MachineInstr &MI) const {
+ // FIXME: Manually selecting to avoid dealing with the SReg_1 trick
+ // SelectionDAG uses for wave32 vs wave64.
+ MachineBasicBlock *BB = MI.getParent();
+ BuildMI(*BB, &MI, MI.getDebugLoc(), TII.get(AMDGPU::SI_END_CF))
+ .add(MI.getOperand(1));
+ Register Reg = MI.getOperand(1).getReg();
MI.eraseFromParent();
- return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+ if (!MRI->getRegClassOrNull(Reg))
+ MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
+ return true;
}
static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
@@ -1106,70 +1172,458 @@ bool AMDGPUInstructionSelector::selectDSOrderedIntrinsic(
return Ret;
}
-bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
- MachineInstr &I) const {
- MachineBasicBlock *BB = I.getParent();
- unsigned IntrinsicID = I.getIntrinsicID();
- switch (IntrinsicID) {
- case Intrinsic::amdgcn_exp: {
- int64_t Tgt = I.getOperand(1).getImm();
- int64_t Enabled = I.getOperand(2).getImm();
- int64_t Done = I.getOperand(7).getImm();
- int64_t VM = I.getOperand(8).getImm();
-
- MachineInstr *Exp = buildEXP(TII, &I, Tgt, I.getOperand(3).getReg(),
- I.getOperand(4).getReg(),
- I.getOperand(5).getReg(),
- I.getOperand(6).getReg(),
- VM, false, Enabled, Done);
+static unsigned gwsIntrinToOpcode(unsigned IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_ds_gws_init:
+ return AMDGPU::DS_GWS_INIT;
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ return AMDGPU::DS_GWS_BARRIER;
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ return AMDGPU::DS_GWS_SEMA_V;
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ return AMDGPU::DS_GWS_SEMA_BR;
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ return AMDGPU::DS_GWS_SEMA_P;
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return AMDGPU::DS_GWS_SEMA_RELEASE_ALL;
+ default:
+ llvm_unreachable("not a gws intrinsic");
+ }
+}
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+bool AMDGPUInstructionSelector::selectDSGWSIntrinsic(MachineInstr &MI,
+ Intrinsic::ID IID) const {
+ if (IID == Intrinsic::amdgcn_ds_gws_sema_release_all &&
+ !STI.hasGWSSemaReleaseAll())
+ return false;
+
+ // intrinsic ID, vsrc, offset
+ const bool HasVSrc = MI.getNumOperands() == 3;
+ assert(HasVSrc || MI.getNumOperands() == 2);
+
+ Register BaseOffset = MI.getOperand(HasVSrc ? 2 : 1).getReg();
+ const RegisterBank *OffsetRB = RBI.getRegBank(BaseOffset, *MRI, TRI);
+ if (OffsetRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ MachineInstr *OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
+ assert(OffsetDef);
+
+ unsigned ImmOffset;
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ MachineInstr *Readfirstlane = nullptr;
+
+ // If we legalized the VGPR input, strip out the readfirstlane to analyze the
+ // incoming offset, in case there's an add of a constant. We'll have to put it
+ // back later.
+ if (OffsetDef->getOpcode() == AMDGPU::V_READFIRSTLANE_B32) {
+ Readfirstlane = OffsetDef;
+ BaseOffset = OffsetDef->getOperand(1).getReg();
+ OffsetDef = getDefIgnoringCopies(BaseOffset, *MRI);
}
- case Intrinsic::amdgcn_exp_compr: {
- const DebugLoc &DL = I.getDebugLoc();
- int64_t Tgt = I.getOperand(1).getImm();
- int64_t Enabled = I.getOperand(2).getImm();
- Register Reg0 = I.getOperand(3).getReg();
- Register Reg1 = I.getOperand(4).getReg();
- Register Undef = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
- int64_t Done = I.getOperand(5).getImm();
- int64_t VM = I.getOperand(6).getImm();
-
- BuildMI(*BB, &I, DL, TII.get(AMDGPU::IMPLICIT_DEF), Undef);
- MachineInstr *Exp = buildEXP(TII, &I, Tgt, Reg0, Reg1, Undef, Undef, VM,
- true, Enabled, Done);
- I.eraseFromParent();
- return constrainSelectedInstRegOperands(*Exp, TII, TRI, RBI);
+ if (OffsetDef->getOpcode() == AMDGPU::G_CONSTANT) {
+ // If we have a constant offset, try to use the 0 in m0 as the base.
+ // TODO: Look into changing the default m0 initialization value. If the
+ // default -1 only set the low 16-bits, we could leave it as-is and add 1 to
+ // the immediate offset.
+
+ ImmOffset = OffsetDef->getOperand(1).getCImm()->getZExtValue();
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), AMDGPU::M0)
+ .addImm(0);
+ } else {
+ std::tie(BaseOffset, ImmOffset, OffsetDef)
+ = AMDGPU::getBaseWithConstantOffset(*MRI, BaseOffset);
+
+ if (Readfirstlane) {
+ // We have the constant offset now, so put the readfirstlane back on the
+ // variable component.
+ if (!RBI.constrainGenericRegister(BaseOffset, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+
+ Readfirstlane->getOperand(1).setReg(BaseOffset);
+ BaseOffset = Readfirstlane->getOperand(0).getReg();
+ } else {
+ if (!RBI.constrainGenericRegister(BaseOffset,
+ AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+ }
+
+ Register M0Base = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::S_LSHL_B32), M0Base)
+ .addReg(BaseOffset)
+ .addImm(16);
+
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(M0Base);
}
- case Intrinsic::amdgcn_end_cf: {
- // FIXME: Manually selecting to avoid dealiing with the SReg_1 trick
- // SelectionDAG uses for wave32 vs wave64.
- BuildMI(*BB, &I, I.getDebugLoc(),
- TII.get(AMDGPU::SI_END_CF))
- .add(I.getOperand(1));
- Register Reg = I.getOperand(1).getReg();
- I.eraseFromParent();
+ // The resource id offset is computed as (<isa opaque base> + M0[21:16] +
+ // offset field) % 64. Some versions of the programming guide omit the m0
+ // part, or claim it's from offset 0.
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(gwsIntrinToOpcode(IID)));
- if (!MRI->getRegClassOrNull(Reg))
- MRI->setRegClass(Reg, TRI.getWaveMaskRegClass());
- return true;
+ if (HasVSrc) {
+ Register VSrc = MI.getOperand(1).getReg();
+ MIB.addReg(VSrc);
+ if (!RBI.constrainGenericRegister(VSrc, AMDGPU::VGPR_32RegClass, *MRI))
+ return false;
+ }
+
+ MIB.addImm(ImmOffset)
+ .addImm(-1) // $gds
+ .cloneMemRefs(MI);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectDSAppendConsume(MachineInstr &MI,
+ bool IsAppend) const {
+ Register PtrBase = MI.getOperand(2).getReg();
+ LLT PtrTy = MRI->getType(PtrBase);
+ bool IsGDS = PtrTy.getAddressSpace() == AMDGPUAS::REGION_ADDRESS;
+
+ unsigned Offset;
+ std::tie(PtrBase, Offset) = selectDS1Addr1OffsetImpl(MI.getOperand(2));
+
+ // TODO: Should this try to look through readfirstlane like GWS?
+ if (!isDSOffsetLegal(PtrBase, Offset, 16)) {
+ PtrBase = MI.getOperand(2).getReg();
+ Offset = 0;
+ }
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ const unsigned Opc = IsAppend ? AMDGPU::DS_APPEND : AMDGPU::DS_CONSUME;
+
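+ // The pointer base is passed through m0, and any constant folded out of the
+ // address goes in the instruction's offset field.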
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(PtrBase);
+ BuildMI(*MBB, &MI, DL, TII.get(Opc), MI.getOperand(0).getReg())
+ .addImm(Offset)
+ .addImm(IsGDS ? -1 : 0)
+ .cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+}
+
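+// Decode the texfailctrl immediate: bit 0 enables TFE and bit 1 enables LWE,
+// and any nonzero value marks this as a texfail case. Returns false if
+// unknown bits are set.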
+static bool parseTexFail(uint64_t TexFailCtrl, bool &TFE, bool &LWE,
+ bool &IsTexFail) {
+ if (TexFailCtrl)
+ IsTexFail = true;
+
+ TFE = (TexFailCtrl & 0x1) ? 1 : 0;
+ TexFailCtrl &= ~(uint64_t)0x1;
+ LWE = (TexFailCtrl & 0x2) ? 1 : 0;
+ TexFailCtrl &= ~(uint64_t)0x2;
+
+ return TexFailCtrl == 0;
+}
+
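+// Decode the cachepolicy immediate into its glc (bit 0), slc (bit 1) and dlc
+// (bit 2) fields. Returns false if unrequested or unknown bits are set.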
+static bool parseCachePolicy(uint64_t Value,
+ bool *GLC, bool *SLC, bool *DLC) {
+ if (GLC) {
+ *GLC = (Value & 0x1) ? 1 : 0;
+ Value &= ~(uint64_t)0x1;
+ }
+ if (SLC) {
+ *SLC = (Value & 0x2) ? 1 : 0;
+ Value &= ~(uint64_t)0x2;
+ }
+ if (DLC) {
+ *DLC = (Value & 0x4) ? 1 : 0;
+ Value &= ~(uint64_t)0x4;
+ }
+
+ return Value == 0;
+}
+
+bool AMDGPUInstructionSelector::selectImageIntrinsic(
+ MachineInstr &MI, const AMDGPU::ImageDimIntrinsicInfo *Intr) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+
+ const AMDGPU::MIMGDimInfo *DimInfo = AMDGPU::getMIMGDimInfo(Intr->Dim);
+ const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(Intr->BaseOpcode);
+ const AMDGPU::MIMGMIPMappingInfo *MIPMappingInfo =
+ AMDGPU::getMIMGMIPMappingInfo(Intr->BaseOpcode);
+ unsigned IntrOpcode = Intr->BaseOpcode;
+ const bool IsGFX10 = STI.getGeneration() >= AMDGPUSubtarget::GFX10;
+
+ const int VAddrIdx = getImageVAddrIdxBegin(BaseOpcode,
+ MI.getNumExplicitDefs());
+ int NumVAddr, NumGradients;
+ std::tie(NumVAddr, NumGradients) = getImageNumVAddr(Intr, BaseOpcode);
+
+ Register VDataIn, VDataOut;
+ LLT VDataTy;
+ int NumVDataDwords = -1;
+ bool IsD16 = false;
+
+ // XXX - Can we just get the second to last argument for ctrl?
+ unsigned CtrlIdx; // Index of texfailctrl argument
+ bool Unorm;
+ if (!BaseOpcode->Sampler) {
+ Unorm = true;
+ CtrlIdx = VAddrIdx + NumVAddr + 1;
+ } else {
+ Unorm = MI.getOperand(VAddrIdx + NumVAddr + 2).getImm() != 0;
+ CtrlIdx = VAddrIdx + NumVAddr + 3;
+ }
+
+ bool TFE;
+ bool LWE;
+ bool IsTexFail = false;
+ if (!parseTexFail(MI.getOperand(CtrlIdx).getImm(), TFE, LWE, IsTexFail))
+ return false;
+
+ const int Flags = MI.getOperand(CtrlIdx + 2).getImm();
+ const bool IsA16 = (Flags & 1) != 0;
+ const bool IsG16 = (Flags & 2) != 0;
+
+ // A16 implies 16 bit gradients
+ if (IsA16 && !IsG16)
+ return false;
+
+ unsigned DMask = 0;
+ unsigned DMaskLanes = 0;
+
+ if (BaseOpcode->Atomic) {
+ VDataOut = MI.getOperand(0).getReg();
+ VDataIn = MI.getOperand(2).getReg();
+ LLT Ty = MRI->getType(VDataIn);
+
+ // Be careful to allow atomic swap on 16-bit element vectors.
+ const bool Is64Bit = BaseOpcode->AtomicX2 ?
+ Ty.getSizeInBits() == 128 :
+ Ty.getSizeInBits() == 64;
+
+ if (BaseOpcode->AtomicX2) {
+ assert(MI.getOperand(3).getReg() == AMDGPU::NoRegister);
+
+ DMask = Is64Bit ? 0xf : 0x3;
+ NumVDataDwords = Is64Bit ? 4 : 2;
+ } else {
+ DMask = Is64Bit ? 0x3 : 0x1;
+ NumVDataDwords = Is64Bit ? 2 : 1;
+ }
+ } else {
+ const int DMaskIdx = 2; // Input/output + intrinsic ID.
+
+ DMask = MI.getOperand(DMaskIdx).getImm();
+ DMaskLanes = BaseOpcode->Gather4 ? 4 : countPopulation(DMask);
+
+ if (BaseOpcode->Store) {
+ VDataIn = MI.getOperand(1).getReg();
+ VDataTy = MRI->getType(VDataIn);
+ NumVDataDwords = (VDataTy.getSizeInBits() + 31) / 32;
+ } else {
+ VDataOut = MI.getOperand(0).getReg();
+ VDataTy = MRI->getType(VDataOut);
+ NumVDataDwords = DMaskLanes;
+
+ // One memoperand is mandatory, except for getresinfo.
+ // FIXME: Check this in verifier.
+ if (!MI.memoperands_empty()) {
+ const MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ // Infer d16 from the memory size, as the register type will be mangled by
+ // unpacked subtargets, or by TFE.
+ IsD16 = ((8 * MMO->getSize()) / DMaskLanes) < 32;
+
+ if (IsD16 && !STI.hasUnpackedD16VMem())
+ NumVDataDwords = (DMaskLanes + 1) / 2;
+ }
+ }
+ }
+
+ // Optimize _L to _LZ when _L is zero
+ if (LZMappingInfo) {
+ // The legalizer replaced the register with an immediate 0 if we need to
+ // change the opcode.
+ const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ if (Lod.isImm()) {
+ assert(Lod.getImm() == 0);
+ IntrOpcode = LZMappingInfo->LZ; // set new opcode to _lz variant of _l
+ }
+ }
+
+ // Optimize _mip away, when 'lod' is zero
+ if (MIPMappingInfo) {
+ const MachineOperand &Lod = MI.getOperand(VAddrIdx + NumVAddr - 1);
+ if (Lod.isImm()) {
+ assert(Lod.getImm() == 0);
+ IntrOpcode = MIPMappingInfo->NONMIP; // set new opcode to variant without _mip
+ }
+ }
+
+ // Set G16 opcode
+ if (IsG16 && !IsA16) {
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ assert(G16MappingInfo);
+ IntrOpcode = G16MappingInfo->G16; // set opcode to variant with _g16
+ }
+
+ // TODO: Check this in verifier.
+ assert((!IsTexFail || DMaskLanes >= 1) && "should have legalized this");
+
+ bool GLC = false;
+ bool SLC = false;
+ bool DLC = false;
+ if (BaseOpcode->Atomic) {
+ GLC = true; // TODO no-return optimization
+ if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), nullptr, &SLC,
+ IsGFX10 ? &DLC : nullptr))
+ return false;
+ } else {
+ if (!parseCachePolicy(MI.getOperand(CtrlIdx + 1).getImm(), &GLC, &SLC,
+ IsGFX10 ? &DLC : nullptr))
+ return false;
+ }
+
+ int NumVAddrRegs = 0;
+ int NumVAddrDwords = 0;
+ for (int I = 0; I < NumVAddr; ++I) {
+ // Skip the $noregs and 0s inserted during legalization.
+ MachineOperand &AddrOp = MI.getOperand(VAddrIdx + I);
+ if (!AddrOp.isReg())
+ continue; // XXX - Break?
+
+ Register Addr = AddrOp.getReg();
+ if (!Addr)
+ break;
+
+ ++NumVAddrRegs;
+ NumVAddrDwords += (MRI->getType(Addr).getSizeInBits() + 31) / 32;
+ }
+
+ // The legalizer preprocessed the intrinsic arguments. If we aren't using
+ // NSA, these should have been packed into a single value in the first
+ // address register
+ const bool UseNSA = NumVAddrRegs != 1 && NumVAddrDwords == NumVAddrRegs;
+ if (UseNSA && !STI.hasFeature(AMDGPU::FeatureNSAEncoding)) {
+ LLVM_DEBUG(dbgs() << "Trying to use NSA on non-NSA target\n");
+ return false;
+ }
+
+ if (IsTexFail)
+ ++NumVDataDwords;
+
+ int Opcode = -1;
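+ // Look up the real opcode for the target's MIMG encoding; pre-GFX10 targets
+ // try the GFX8 table first (on VI and newer) and fall back to GFX6.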
+ if (IsGFX10) {
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode,
+ UseNSA ? AMDGPU::MIMGEncGfx10NSA
+ : AMDGPU::MIMGEncGfx10Default,
+ NumVDataDwords, NumVAddrDwords);
+ } else {
+ if (STI.getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx8,
+ NumVDataDwords, NumVAddrDwords);
+ if (Opcode == -1)
+ Opcode = AMDGPU::getMIMGOpcode(IntrOpcode, AMDGPU::MIMGEncGfx6,
+ NumVDataDwords, NumVAddrDwords);
+ }
+ assert(Opcode != -1);
+
+ auto MIB = BuildMI(*MBB, &MI, DL, TII.get(Opcode))
+ .cloneMemRefs(MI);
+
+ if (VDataOut) {
+ if (BaseOpcode->AtomicX2) {
+ const bool Is64 = MRI->getType(VDataOut).getSizeInBits() == 64;
+
+ Register TmpReg = MRI->createVirtualRegister(
+ Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+ unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+
+ MIB.addDef(TmpReg);
+ BuildMI(*MBB, &MI, DL, TII.get(AMDGPU::COPY), VDataOut)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+
+ } else {
+ MIB.addDef(VDataOut); // vdata output
+ }
}
- case Intrinsic::amdgcn_raw_buffer_store:
- return selectStoreIntrinsic(I, false);
- case Intrinsic::amdgcn_raw_buffer_store_format:
- return selectStoreIntrinsic(I, true);
+
+ if (VDataIn)
+ MIB.addReg(VDataIn); // vdata input
+
+ for (int i = 0; i != NumVAddrRegs; ++i) {
+ MachineOperand &SrcOp = MI.getOperand(VAddrIdx + i);
+ if (SrcOp.isReg()) {
+ assert(SrcOp.getReg() != 0);
+ MIB.addReg(SrcOp.getReg());
+ }
+ }
+
+ MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr).getReg()); // rsrc
+ if (BaseOpcode->Sampler)
+ MIB.addReg(MI.getOperand(VAddrIdx + NumVAddr + 1).getReg()); // sampler
+
+ MIB.addImm(DMask); // dmask
+
+ if (IsGFX10)
+ MIB.addImm(DimInfo->Encoding);
+ MIB.addImm(Unorm);
+ if (IsGFX10)
+ MIB.addImm(DLC);
+
+ MIB.addImm(GLC);
+ MIB.addImm(SLC);
+ MIB.addImm(IsA16 && // a16 or r128
+ STI.hasFeature(AMDGPU::FeatureR128A16) ? -1 : 0);
+ if (IsGFX10)
+ MIB.addImm(IsA16 ? -1 : 0);
+
+ MIB.addImm(TFE); // tfe
+ MIB.addImm(LWE); // lwe
+ if (!IsGFX10)
+ MIB.addImm(DimInfo->DA ? -1 : 0);
+ if (BaseOpcode->HasD16)
+ MIB.addImm(IsD16 ? -1 : 0);
+
+ MI.eraseFromParent();
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
+bool AMDGPUInstructionSelector::selectG_INTRINSIC_W_SIDE_EFFECTS(
+ MachineInstr &I) const {
+ unsigned IntrinsicID = I.getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::amdgcn_end_cf:
+ return selectEndCfIntrinsic(I);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap:
return selectDSOrderedIntrinsic(I, IntrinsicID);
- default:
+ case Intrinsic::amdgcn_ds_gws_init:
+ case Intrinsic::amdgcn_ds_gws_barrier:
+ case Intrinsic::amdgcn_ds_gws_sema_v:
+ case Intrinsic::amdgcn_ds_gws_sema_br:
+ case Intrinsic::amdgcn_ds_gws_sema_p:
+ case Intrinsic::amdgcn_ds_gws_sema_release_all:
+ return selectDSGWSIntrinsic(I, IntrinsicID);
+ case Intrinsic::amdgcn_ds_append:
+ return selectDSAppendConsume(I, true);
+ case Intrinsic::amdgcn_ds_consume:
+ return selectDSAppendConsume(I, false);
+ default: {
return selectImpl(I, *CoverageInfo);
}
+ }
}
bool AMDGPUInstructionSelector::selectG_SELECT(MachineInstr &I) const {
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+
MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
@@ -1247,9 +1701,6 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
Register SrcReg = I.getOperand(1).getReg();
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- if (!DstTy.isScalar())
- return false;
-
const LLT S1 = LLT::scalar(1);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
@@ -1264,6 +1715,8 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
return false;
}
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+
unsigned DstSize = DstTy.getSizeInBits();
unsigned SrcSize = SrcTy.getSizeInBits();
@@ -1271,6 +1724,73 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
= TRI.getRegClassForSizeOnBank(SrcSize, *SrcRB, *MRI);
const TargetRegisterClass *DstRC
= TRI.getRegClassForSizeOnBank(DstSize, *DstRB, *MRI);
+ if (!SrcRC || !DstRC)
+ return false;
+
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
+ LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
+ return false;
+ }
+
+ if (DstTy == LLT::vector(2, 16) && SrcTy == LLT::vector(2, 32)) {
+ MachineBasicBlock *MBB = I.getParent();
+ const DebugLoc &DL = I.getDebugLoc();
+
+ Register LoReg = MRI->createVirtualRegister(DstRC);
+ Register HiReg = MRI->createVirtualRegister(DstRC);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(SrcReg, 0, AMDGPU::sub0);
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(SrcReg, 0, AMDGPU::sub1);
+
+ if (IsVALU && STI.hasSDWA()) {
+ // Write the low 16-bits of the high element into the high 16-bits of the
+ // low element.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(HiReg) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(LoReg, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
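+ // Without SDWA, pack the halves manually: shift the high element into the
+ // upper 16 bits, mask the low element to 16 bits, and OR them together.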
+ Register TmpReg0 = MRI->createVirtualRegister(DstRC);
+ Register TmpReg1 = MRI->createVirtualRegister(DstRC);
+ Register ImmReg = MRI->createVirtualRegister(DstRC);
+ if (IsVALU) {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), TmpReg0)
+ .addImm(16)
+ .addReg(HiReg);
+ } else {
+ BuildMI(*MBB, I, DL, TII.get(AMDGPU::S_LSHL_B32), TmpReg0)
+ .addReg(HiReg)
+ .addImm(16);
+ }
+
+ unsigned MovOpc = IsVALU ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
+ unsigned AndOpc = IsVALU ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
+ unsigned OrOpc = IsVALU ? AMDGPU::V_OR_B32_e64 : AMDGPU::S_OR_B32;
+
+ BuildMI(*MBB, I, DL, TII.get(MovOpc), ImmReg)
+ .addImm(0xffff);
+ BuildMI(*MBB, I, DL, TII.get(AndOpc), TmpReg1)
+ .addReg(LoReg)
+ .addReg(ImmReg);
+ BuildMI(*MBB, I, DL, TII.get(OrOpc), DstReg)
+ .addReg(TmpReg0)
+ .addReg(TmpReg1);
+ }
+
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (!DstTy.isScalar())
+ return false;
if (SrcSize > 32) {
int SubRegIdx = sizeToSubRegIndex(DstSize);
@@ -1279,17 +1799,17 @@ bool AMDGPUInstructionSelector::selectG_TRUNC(MachineInstr &I) const {
// Deal with weird cases where the class only partially supports the subreg
// index.
- SrcRC = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
- if (!SrcRC)
+ const TargetRegisterClass *SrcWithSubRC
+ = TRI.getSubClassWithSubReg(SrcRC, SubRegIdx);
+ if (!SrcWithSubRC)
return false;
- I.getOperand(1).setSubReg(SubRegIdx);
- }
+ if (SrcWithSubRC != SrcRC) {
+ if (!RBI.constrainGenericRegister(SrcReg, *SrcWithSubRC, *MRI))
+ return false;
+ }
- if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
- !RBI.constrainGenericRegister(DstReg, *DstRC, *MRI)) {
- LLVM_DEBUG(dbgs() << "Failed to constrain G_TRUNC\n");
- return false;
+ I.getOperand(1).setSubReg(SubRegIdx);
}
I.setDesc(TII.get(TargetOpcode::COPY));
@@ -1318,7 +1838,8 @@ const RegisterBank *AMDGPUInstructionSelector::getArtifactRegBank(
}
bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
- bool Signed = I.getOpcode() == AMDGPU::G_SEXT;
+ bool InReg = I.getOpcode() == AMDGPU::G_SEXT_INREG;
+ bool Signed = I.getOpcode() == AMDGPU::G_SEXT || InReg;
const DebugLoc &DL = I.getDebugLoc();
MachineBasicBlock &MBB = *I.getParent();
const Register DstReg = I.getOperand(0).getReg();
@@ -1326,7 +1847,8 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const LLT DstTy = MRI->getType(DstReg);
const LLT SrcTy = MRI->getType(SrcReg);
- const unsigned SrcSize = SrcTy.getSizeInBits();
+ const unsigned SrcSize = I.getOpcode() == AMDGPU::G_SEXT_INREG ?
+ I.getOperand(2).getImm() : SrcTy.getSizeInBits();
const unsigned DstSize = DstTy.getSizeInBits();
if (!DstTy.isScalar())
return false;
@@ -1362,7 +1884,9 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
}
if (SrcBank->getID() == AMDGPU::SGPRRegBankID && DstSize <= 64) {
- if (!RBI.constrainGenericRegister(SrcReg, AMDGPU::SReg_32RegClass, *MRI))
+ const TargetRegisterClass &SrcRC = InReg && DstSize > 32 ?
+ AMDGPU::SReg_64RegClass : AMDGPU::SReg_32RegClass;
+ if (!RBI.constrainGenericRegister(SrcReg, SrcRC, *MRI))
return false;
if (Signed && DstSize == 32 && (SrcSize == 8 || SrcSize == 16)) {
@@ -1378,13 +1902,15 @@ bool AMDGPUInstructionSelector::selectG_SZA_EXT(MachineInstr &I) const {
const unsigned BFE32 = Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32;
// Scalar BFE is encoded as S1[5:0] = offset, S1[22:16]= width.
- if (DstSize > 32 && SrcSize <= 32) {
+ if (DstSize > 32 && (SrcSize <= 32 || InReg)) {
// We need a 64-bit register source, but the high bits don't matter.
Register ExtReg = MRI->createVirtualRegister(&AMDGPU::SReg_64RegClass);
Register UndefReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ unsigned SubReg = InReg ? AMDGPU::sub0 : 0;
+
BuildMI(MBB, I, DL, TII.get(AMDGPU::IMPLICIT_DEF), UndefReg);
BuildMI(MBB, I, DL, TII.get(AMDGPU::REG_SEQUENCE), ExtReg)
- .addReg(SrcReg)
+ .addReg(SrcReg, 0, SubReg)
.addImm(AMDGPU::sub0)
.addReg(UndefReg)
.addImm(AMDGPU::sub1);
@@ -1487,6 +2013,103 @@ bool AMDGPUInstructionSelector::selectG_CONSTANT(MachineInstr &I) const {
return RBI.constrainGenericRegister(DstReg, *DstRC, *MRI);
}
+bool AMDGPUInstructionSelector::selectG_FNEG(MachineInstr &MI) const {
+ // Only manually handle the f64 SGPR case.
+ //
+ // FIXME: This is a workaround for 2.5 different tablegen problems. Because
+ // the bit ops theoretically have a second result due to the implicit def of
+ // SCC, the GlobalISelEmitter is overly conservative and rejects it. Fixing
+ // that is easy by disabling the check. The result works, but uses a
+ // nonsensical sreg32orlds_and_sreg_1 regclass.
+ //
+ // The DAG emitter is more problematic, and incorrectly adds both S_XOR_B32 to
+ // the variadic REG_SEQUENCE operands.
+
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+ MRI->getType(Dst) != LLT::scalar(64))
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+ MachineInstr *Fabs = getOpcodeDef(TargetOpcode::G_FABS, Src, *MRI);
+ if (Fabs)
+ Src = Fabs->getOperand(1).getReg();
+
+ if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+ return false;
+
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(Src, 0, AMDGPU::sub0);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(Src, 0, AMDGPU::sub1);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+ .addImm(0x80000000);
+
+ // Set or toggle sign bit.
+ unsigned Opc = Fabs ? AMDGPU::S_OR_B32 : AMDGPU::S_XOR_B32;
+ BuildMI(*BB, &MI, DL, TII.get(Opc), OpReg)
+ .addReg(HiReg)
+ .addReg(ConstReg);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(OpReg)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return true;
+}
+
+// FIXME: This is a workaround for the same tablegen problems as G_FNEG
+bool AMDGPUInstructionSelector::selectG_FABS(MachineInstr &MI) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const RegisterBank *DstRB = RBI.getRegBank(Dst, *MRI, TRI);
+ if (DstRB->getID() != AMDGPU::SGPRRegBankID ||
+ MRI->getType(Dst) != LLT::scalar(64))
+ return false;
+
+ Register Src = MI.getOperand(1).getReg();
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+ Register LoReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register HiReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register ConstReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register OpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+
+ if (!RBI.constrainGenericRegister(Src, AMDGPU::SReg_64RegClass, *MRI) ||
+ !RBI.constrainGenericRegister(Dst, AMDGPU::SReg_64RegClass, *MRI))
+ return false;
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), LoReg)
+ .addReg(Src, 0, AMDGPU::sub0);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), HiReg)
+ .addReg(Src, 0, AMDGPU::sub1);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_MOV_B32), ConstReg)
+ .addImm(0x7fffffff);
+
+ // Clear sign bit.
+ // TODO: Should this use S_BITSET0_*?
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::S_AND_B32), OpReg)
+ .addReg(HiReg)
+ .addReg(ConstReg);
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::REG_SEQUENCE), Dst)
+ .addReg(LoReg)
+ .addImm(AMDGPU::sub0)
+ .addReg(OpReg)
+ .addImm(AMDGPU::sub1);
+
+ MI.eraseFromParent();
+ return true;
+}
+
static bool isConstant(const MachineInstr &MI) {
return MI.getOpcode() == TargetOpcode::G_CONSTANT;
}
@@ -1573,6 +2196,65 @@ bool AMDGPUInstructionSelector::selectG_LOAD_ATOMICRMW(MachineInstr &I) const {
return selectImpl(I, *CoverageInfo);
}
+// TODO: No rtn optimization.
+bool AMDGPUInstructionSelector::selectG_AMDGPU_ATOMIC_CMPXCHG(
+ MachineInstr &MI) const {
+ Register PtrReg = MI.getOperand(1).getReg();
+ const LLT PtrTy = MRI->getType(PtrReg);
+ if (PtrTy.getAddressSpace() == AMDGPUAS::FLAT_ADDRESS ||
+ STI.useFlatForGlobal())
+ return selectImpl(MI, *CoverageInfo);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT Ty = MRI->getType(DstReg);
+ const bool Is64 = Ty.getSizeInBits() == 64;
+ const unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
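+ // The returning atomic cmpswap writes its result into the low half of a
+ // double-wide data register, so copy it back out through a subregister.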
+ Register TmpReg = MRI->createVirtualRegister(
+ Is64 ? &AMDGPU::VReg_128RegClass : &AMDGPU::VReg_64RegClass);
+
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineBasicBlock *BB = MI.getParent();
+
+ Register VAddr, RSrcReg, SOffset;
+ int64_t Offset = 0;
+
+ unsigned Opcode;
+ if (selectMUBUFOffsetImpl(MI.getOperand(1), RSrcReg, SOffset, Offset)) {
+ Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_OFFSET_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN;
+ } else if (selectMUBUFAddr64Impl(MI.getOperand(1), VAddr,
+ RSrcReg, SOffset, Offset)) {
+ Opcode = Is64 ? AMDGPU::BUFFER_ATOMIC_CMPSWAP_X2_ADDR64_RTN :
+ AMDGPU::BUFFER_ATOMIC_CMPSWAP_ADDR64_RTN;
+ } else
+ return selectImpl(MI, *CoverageInfo);
+
+ auto MIB = BuildMI(*BB, &MI, DL, TII.get(Opcode), TmpReg)
+ .addReg(MI.getOperand(2).getReg());
+
+ if (VAddr)
+ MIB.addReg(VAddr);
+
+ MIB.addReg(RSrcReg);
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+
+ MIB.addImm(Offset);
+ MIB.addImm(0); // slc
+ MIB.cloneMemRefs(MI);
+
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(TmpReg, RegState::Kill, SubReg);
+
+ MI.eraseFromParent();
+
+ MRI->setRegClass(
+ DstReg, Is64 ? &AMDGPU::VReg_64RegClass : &AMDGPU::VGPR_32RegClass);
+ return constrainSelectedInstRegOperands(*MIB, TII, TRI, RBI);
+}
+
bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
MachineBasicBlock *BB = I.getParent();
MachineOperand &CondOp = I.getOperand(0);
@@ -1619,7 +2301,8 @@ bool AMDGPUInstructionSelector::selectG_BRCOND(MachineInstr &I) const {
return true;
}
-bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
+bool AMDGPUInstructionSelector::selectG_FRAME_INDEX_GLOBAL_VALUE(
+ MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
@@ -1631,67 +2314,134 @@ bool AMDGPUInstructionSelector::selectG_FRAME_INDEX(MachineInstr &I) const {
DstReg, IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass, *MRI);
}
-bool AMDGPUInstructionSelector::selectG_PTR_MASK(MachineInstr &I) const {
- uint64_t Align = I.getOperand(2).getImm();
- const uint64_t Mask = ~((UINT64_C(1) << Align) - 1);
-
- MachineBasicBlock *BB = I.getParent();
-
+bool AMDGPUInstructionSelector::selectG_PTRMASK(MachineInstr &I) const {
Register DstReg = I.getOperand(0).getReg();
Register SrcReg = I.getOperand(1).getReg();
+ Register MaskReg = I.getOperand(2).getReg();
+ LLT Ty = MRI->getType(DstReg);
+ LLT MaskTy = MRI->getType(MaskReg);
const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
const RegisterBank *SrcRB = RBI.getRegBank(SrcReg, *MRI, TRI);
+ const RegisterBank *MaskRB = RBI.getRegBank(MaskReg, *MRI, TRI);
const bool IsVGPR = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ if (DstRB != SrcRB) // Should only happen for hand written MIR.
+ return false;
+
unsigned NewOpc = IsVGPR ? AMDGPU::V_AND_B32_e64 : AMDGPU::S_AND_B32;
- unsigned MovOpc = IsVGPR ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
const TargetRegisterClass &RegRC
= IsVGPR ? AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
- LLT Ty = MRI->getType(DstReg);
-
const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(Ty, *DstRB,
*MRI);
const TargetRegisterClass *SrcRC = TRI.getRegClassForTypeOnBank(Ty, *SrcRB,
*MRI);
+ const TargetRegisterClass *MaskRC =
+ TRI.getRegClassForTypeOnBank(MaskTy, *MaskRB, *MRI);
+
if (!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
- !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI))
+ !RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
+ !RBI.constrainGenericRegister(MaskReg, *MaskRC, *MRI))
return false;
+ MachineBasicBlock *BB = I.getParent();
const DebugLoc &DL = I.getDebugLoc();
- Register ImmReg = MRI->createVirtualRegister(&RegRC);
- BuildMI(*BB, &I, DL, TII.get(MovOpc), ImmReg)
- .addImm(Mask);
-
if (Ty.getSizeInBits() == 32) {
+ assert(MaskTy.getSizeInBits() == 32 &&
+ "ptrmask should have been narrowed during legalize");
+
BuildMI(*BB, &I, DL, TII.get(NewOpc), DstReg)
.addReg(SrcReg)
- .addReg(ImmReg);
+ .addReg(MaskReg);
I.eraseFromParent();
return true;
}
Register HiReg = MRI->createVirtualRegister(&RegRC);
Register LoReg = MRI->createVirtualRegister(&RegRC);
- Register MaskLo = MRI->createVirtualRegister(&RegRC);
+ // Extract the subregisters from the source pointer.
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), LoReg)
.addReg(SrcReg, 0, AMDGPU::sub0);
BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), HiReg)
.addReg(SrcReg, 0, AMDGPU::sub1);
- BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskLo)
- .addReg(LoReg)
- .addReg(ImmReg);
+ Register MaskedLo, MaskedHi;
+
+ // Try to avoid emitting a bit operation when we only need to touch half of
+ // the 64-bit pointer.
+ APInt MaskOnes = KnownBits->getKnownOnes(MaskReg).zextOrSelf(64);
+
+ const APInt MaskHi32 = APInt::getHighBitsSet(64, 32);
+ const APInt MaskLo32 = APInt::getLowBitsSet(64, 32);
+ if ((MaskOnes & MaskLo32) == MaskLo32) {
+ // If all the bits in the low half are 1, we only need a copy for it.
+ MaskedLo = LoReg;
+ } else {
+ // Extract the mask subregister and apply the and.
+ Register MaskLo = MRI->createVirtualRegister(&RegRC);
+ MaskedLo = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskLo)
+ .addReg(MaskReg, 0, AMDGPU::sub0);
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedLo)
+ .addReg(LoReg)
+ .addReg(MaskLo);
+ }
+
+ if ((MaskOnes & MaskHi32) == MaskHi32) {
+ // If all the bits in the high half are 1, we only need a copy for it.
+ MaskedHi = HiReg;
+ } else {
+ Register MaskHi = MRI->createVirtualRegister(&RegRC);
+ MaskedHi = MRI->createVirtualRegister(&RegRC);
+
+ BuildMI(*BB, &I, DL, TII.get(AMDGPU::COPY), MaskHi)
+ .addReg(MaskReg, 0, AMDGPU::sub1);
+ BuildMI(*BB, &I, DL, TII.get(NewOpc), MaskedHi)
+ .addReg(HiReg)
+ .addReg(MaskHi);
+ }
+
BuildMI(*BB, &I, DL, TII.get(AMDGPU::REG_SEQUENCE), DstReg)
- .addReg(MaskLo)
+ .addReg(MaskedLo)
.addImm(AMDGPU::sub0)
- .addReg(HiReg)
+ .addReg(MaskedHi)
.addImm(AMDGPU::sub1);
I.eraseFromParent();
return true;
}
+/// Return the register to use for the index value, and the subregister to use
+/// for the indirectly accessed register.
+static std::pair<Register, unsigned>
+computeIndirectRegIndex(MachineRegisterInfo &MRI,
+ const SIRegisterInfo &TRI,
+ const TargetRegisterClass *SuperRC,
+ Register IdxReg,
+ unsigned EltSize) {
+ Register IdxBaseReg;
+ int Offset;
+ MachineInstr *Unused;
+
+ std::tie(IdxBaseReg, Offset, Unused)
+ = AMDGPU::getBaseWithConstantOffset(MRI, IdxReg);
+ if (IdxBaseReg == AMDGPU::NoRegister) {
+ // This will happen if the index is a known constant. This should ordinarily
+ // be legalized out, but handle it as a register just in case.
+ assert(Offset == 0);
+ IdxBaseReg = IdxReg;
+ }
+
+ ArrayRef<int16_t> SubRegs = TRI.getRegSplitParts(SuperRC, EltSize);
+
+ // Skip out of bounds offsets, or else we would end up using an undefined
+ // register.
+ if (static_cast<unsigned>(Offset) >= SubRegs.size())
+ return std::make_pair(IdxReg, SubRegs[0]);
+ return std::make_pair(IdxBaseReg, SubRegs[Offset]);
+}
+
bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
MachineInstr &MI) const {
Register DstReg = MI.getOperand(0).getReg();
@@ -1714,6 +2464,8 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
*MRI);
const TargetRegisterClass *DstRC = TRI.getRegClassForTypeOnBank(DstTy, *DstRB,
*MRI);
+ if (!SrcRC || !DstRC)
+ return false;
if (!RBI.constrainGenericRegister(SrcReg, *SrcRC, *MRI) ||
!RBI.constrainGenericRegister(DstReg, *DstRC, *MRI) ||
!RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
@@ -1723,7 +2475,9 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
const DebugLoc &DL = MI.getDebugLoc();
const bool Is64 = DstTy.getSizeInBits() == 64;
- unsigned SubReg = Is64 ? AMDGPU::sub0_sub1 : AMDGPU::sub0;
+ unsigned SubReg;
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, SrcRC, IdxReg,
+ DstTy.getSizeInBits() / 8);
if (SrcRB->getID() == AMDGPU::SGPRRegBankID) {
if (DstTy.getSizeInBits() != 32 && !Is64)
@@ -1766,6 +2520,237 @@ bool AMDGPUInstructionSelector::selectG_EXTRACT_VECTOR_ELT(
return true;
}
+// TODO: Fold insert_vector_elt (extract_vector_elt) into movrelsd
+bool AMDGPUInstructionSelector::selectG_INSERT_VECTOR_ELT(
+ MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register VecReg = MI.getOperand(1).getReg();
+ Register ValReg = MI.getOperand(2).getReg();
+ Register IdxReg = MI.getOperand(3).getReg();
+
+ LLT VecTy = MRI->getType(DstReg);
+ LLT ValTy = MRI->getType(ValReg);
+ unsigned VecSize = VecTy.getSizeInBits();
+ unsigned ValSize = ValTy.getSizeInBits();
+
+ const RegisterBank *VecRB = RBI.getRegBank(VecReg, *MRI, TRI);
+ const RegisterBank *ValRB = RBI.getRegBank(ValReg, *MRI, TRI);
+ const RegisterBank *IdxRB = RBI.getRegBank(IdxReg, *MRI, TRI);
+
+ assert(VecTy.getElementType() == ValTy);
+
+ // The index must be scalar. If it wasn't RegBankSelect should have moved this
+ // into a waterfall loop.
+ if (IdxRB->getID() != AMDGPU::SGPRRegBankID)
+ return false;
+
+ const TargetRegisterClass *VecRC = TRI.getRegClassForTypeOnBank(VecTy, *VecRB,
+ *MRI);
+ const TargetRegisterClass *ValRC = TRI.getRegClassForTypeOnBank(ValTy, *ValRB,
+ *MRI);
+
+ if (!RBI.constrainGenericRegister(VecReg, *VecRC, *MRI) ||
+ !RBI.constrainGenericRegister(DstReg, *VecRC, *MRI) ||
+ !RBI.constrainGenericRegister(ValReg, *ValRC, *MRI) ||
+ !RBI.constrainGenericRegister(IdxReg, AMDGPU::SReg_32RegClass, *MRI))
+ return false;
+
+ if (VecRB->getID() == AMDGPU::VGPRRegBankID && ValSize != 32)
+ return false;
+
+ unsigned SubReg;
+ std::tie(IdxReg, SubReg) = computeIndirectRegIndex(*MRI, TRI, VecRC, IdxReg,
+ ValSize / 8);
+
+ const bool IndexMode = VecRB->getID() == AMDGPU::VGPRRegBankID &&
+ STI.useVGPRIndexMode();
+
+ MachineBasicBlock *BB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ if (IndexMode) {
+ BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_ON))
+ .addReg(IdxReg)
+ .addImm(AMDGPU::VGPRIndexMode::DST_ENABLE);
+ } else {
+ BuildMI(*BB, &MI, DL, TII.get(AMDGPU::COPY), AMDGPU::M0)
+ .addReg(IdxReg);
+ }
+
+ const MCInstrDesc &RegWriteOp
+ = TII.getIndirectRegWritePseudo(VecSize, ValSize,
+ VecRB->getID() == AMDGPU::SGPRRegBankID);
+ BuildMI(*BB, MI, DL, RegWriteOp, DstReg)
+ .addReg(VecReg)
+ .addReg(ValReg)
+ .addImm(SubReg);
+
+ if (IndexMode)
+ BuildMI(*BB, MI, DL, TII.get(AMDGPU::S_SET_GPR_IDX_OFF));
+
+ MI.eraseFromParent();
+ return true;
+}
+
+static bool isZeroOrUndef(int X) {
+ return X == 0 || X == -1;
+}
+
+static bool isOneOrUndef(int X) {
+ return X == 1 || X == -1;
+}
+
+static bool isZeroOrOneOrUndef(int X) {
+ return X == 0 || X == 1 || X == -1;
+}
+
+// Normalize a VOP3P shuffle mask to refer to the low/high half of a single
+// 32-bit register.
+static Register normalizeVOP3PMask(int NewMask[2], Register Src0, Register Src1,
+ ArrayRef<int> Mask) {
+ NewMask[0] = Mask[0];
+ NewMask[1] = Mask[1];
+ if (isZeroOrOneOrUndef(Mask[0]) && isZeroOrOneOrUndef(Mask[1]))
+ return Src0;
+
+ assert(NewMask[0] == 2 || NewMask[0] == 3 || NewMask[0] == -1);
+ assert(NewMask[1] == 2 || NewMask[1] == 3 || NewMask[1] == -1);
+
+ // Shift the mask inputs to be 0/1.
+ NewMask[0] = NewMask[0] == -1 ? -1 : NewMask[0] - 2;
+ NewMask[1] = NewMask[1] == -1 ? -1 : NewMask[1] - 2;
+ return Src1;
+}
+
+// This is only legal with VOP3P instructions as an aid to op_sel matching.
+bool AMDGPUInstructionSelector::selectG_SHUFFLE_VECTOR(
+ MachineInstr &MI) const {
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Src0Reg = MI.getOperand(1).getReg();
+ Register Src1Reg = MI.getOperand(2).getReg();
+ ArrayRef<int> ShufMask = MI.getOperand(3).getShuffleMask();
+
+ const LLT V2S16 = LLT::vector(2, 16);
+ if (MRI->getType(DstReg) != V2S16 || MRI->getType(Src0Reg) != V2S16)
+ return false;
+
+ if (!AMDGPU::isLegalVOP3PShuffleMask(ShufMask))
+ return false;
+
+ assert(ShufMask.size() == 2);
+ assert(STI.hasSDWA() && "no target has VOP3P but not SDWA");
+
+ MachineBasicBlock *MBB = MI.getParent();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ const RegisterBank *DstRB = RBI.getRegBank(DstReg, *MRI, TRI);
+ const bool IsVALU = DstRB->getID() == AMDGPU::VGPRRegBankID;
+ const TargetRegisterClass &RC = IsVALU ?
+ AMDGPU::VGPR_32RegClass : AMDGPU::SReg_32RegClass;
+
+ // Handle the degenerate case which should have folded out.
+ if (ShufMask[0] == -1 && ShufMask[1] == -1) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::IMPLICIT_DEF), DstReg);
+
+ MI.eraseFromParent();
+ return RBI.constrainGenericRegister(DstReg, RC, *MRI);
+ }
+
+ // A legal VOP3P mask only reads one of the sources.
+ int Mask[2];
+ Register SrcVec = normalizeVOP3PMask(Mask, Src0Reg, Src1Reg, ShufMask);
+
+ if (!RBI.constrainGenericRegister(DstReg, RC, *MRI) ||
+ !RBI.constrainGenericRegister(SrcVec, RC, *MRI))
+ return false;
+
+ // TODO: This also should have been folded out
+ if (isZeroOrUndef(Mask[0]) && isOneOrUndef(Mask[1])) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::COPY), DstReg)
+ .addReg(SrcVec);
+
+ MI.eraseFromParent();
+ return true;
+ }
+
+ if (Mask[0] == 1 && Mask[1] == -1) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHRREV_B32_e64), DstReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), DstReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ }
+ } else if (Mask[0] == -1 && Mask[1] == 0) {
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_LSHLREV_B32_e64), DstReg)
+ .addImm(16)
+ .addReg(SrcVec);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHL_B32), DstReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ }
+ } else if (Mask[0] == 0 && Mask[1] == 0) {
+ if (IsVALU) {
+ // Write low half of the register into the high half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_1) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_0) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ }
+ } else if (Mask[0] == 1 && Mask[1] == 1) {
+ if (IsVALU) {
+ // Write high half of the register into the low half.
+ MachineInstr *MovSDWA =
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_MOV_B32_sdwa), DstReg)
+ .addImm(0) // $src0_modifiers
+ .addReg(SrcVec) // $src0
+ .addImm(0) // $clamp
+ .addImm(AMDGPU::SDWA::WORD_0) // $dst_sel
+ .addImm(AMDGPU::SDWA::UNUSED_PRESERVE) // $dst_unused
+ .addImm(AMDGPU::SDWA::WORD_1) // $src0_sel
+ .addReg(SrcVec, RegState::Implicit);
+ MovSDWA->tieOperands(0, MovSDWA->getNumOperands() - 1);
+ } else {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_HH_B32_B16), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec);
+ }
+ } else if (Mask[0] == 1 && Mask[1] == 0) {
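+ // Swap the halves: a 16-bit rotate does it in one VALU instruction;
+ // otherwise shift the high half down and repack.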
+ if (IsVALU) {
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::V_ALIGNBIT_B32), DstReg)
+ .addReg(SrcVec)
+ .addReg(SrcVec)
+ .addImm(16);
+ } else {
+ Register TmpReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_LSHR_B32), TmpReg)
+ .addReg(SrcVec)
+ .addImm(16);
+ BuildMI(*MBB, MI, DL, TII.get(AMDGPU::S_PACK_LL_B32_B16), DstReg)
+ .addReg(TmpReg)
+ .addReg(SrcVec);
+ }
+ } else
+ llvm_unreachable("all shuffle masks should be handled");
+
+ MI.eraseFromParent();
+ return true;
+}
+
bool AMDGPUInstructionSelector::select(MachineInstr &I) {
if (I.isPHI())
return selectPHI(I);
@@ -1780,9 +2765,9 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_AND:
case TargetOpcode::G_OR:
case TargetOpcode::G_XOR:
- if (selectG_AND_OR_XOR(I))
+ if (selectImpl(I, *CoverageInfo))
return true;
- return selectImpl(I, *CoverageInfo);
+ return selectG_AND_OR_XOR(I);
case TargetOpcode::G_ADD:
case TargetOpcode::G_SUB:
if (selectImpl(I, *CoverageInfo))
@@ -1800,6 +2785,14 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_CONSTANT:
case TargetOpcode::G_FCONSTANT:
return selectG_CONSTANT(I);
+ case TargetOpcode::G_FNEG:
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return selectG_FNEG(I);
+ case TargetOpcode::G_FABS:
+ if (selectImpl(I, *CoverageInfo))
+ return true;
+ return selectG_FABS(I);
case TargetOpcode::G_EXTRACT:
return selectG_EXTRACT(I);
case TargetOpcode::G_MERGE_VALUES:
@@ -1808,6 +2801,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
return selectG_MERGE_VALUES(I);
case TargetOpcode::G_UNMERGE_VALUES:
return selectG_UNMERGE_VALUES(I);
+ case TargetOpcode::G_BUILD_VECTOR_TRUNC:
+ return selectG_BUILD_VECTOR_TRUNC(I);
case TargetOpcode::G_PTR_ADD:
return selectG_PTR_ADD(I);
case TargetOpcode::G_IMPLICIT_DEF:
@@ -1836,6 +2831,8 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_ATOMICRMW_UMAX:
case TargetOpcode::G_ATOMICRMW_FADD:
return selectG_LOAD_ATOMICRMW(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
+ return selectG_AMDGPU_ATOMIC_CMPXCHG(I);
case TargetOpcode::G_SELECT:
return selectG_SELECT(I);
case TargetOpcode::G_STORE:
@@ -1845,17 +2842,34 @@ bool AMDGPUInstructionSelector::select(MachineInstr &I) {
case TargetOpcode::G_SEXT:
case TargetOpcode::G_ZEXT:
case TargetOpcode::G_ANYEXT:
+ case TargetOpcode::G_SEXT_INREG:
if (selectImpl(I, *CoverageInfo))
return true;
return selectG_SZA_EXT(I);
case TargetOpcode::G_BRCOND:
return selectG_BRCOND(I);
case TargetOpcode::G_FRAME_INDEX:
- return selectG_FRAME_INDEX(I);
- case TargetOpcode::G_PTR_MASK:
- return selectG_PTR_MASK(I);
+ case TargetOpcode::G_GLOBAL_VALUE:
+ return selectG_FRAME_INDEX_GLOBAL_VALUE(I);
+ case TargetOpcode::G_PTRMASK:
+ return selectG_PTRMASK(I);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return selectG_EXTRACT_VECTOR_ELT(I);
+ case TargetOpcode::G_INSERT_VECTOR_ELT:
+ return selectG_INSERT_VECTOR_ELT(I);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return selectG_SHUFFLE_VECTOR(I);
+ case AMDGPU::G_AMDGPU_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC:
+ initM0(I);
+ return selectImpl(I, *CoverageInfo);
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ const AMDGPU::ImageDimIntrinsicInfo *Intr
+ = AMDGPU::getImageDimIntrinsicInfo(I.getIntrinsicID());
+ assert(Intr && "not an image intrinsic with image pseudo");
+ return selectImageIntrinsic(I, Intr);
+ }
default:
return selectImpl(I, *CoverageInfo);
}
@@ -1871,15 +2885,16 @@ AMDGPUInstructionSelector::selectVCSRC(MachineOperand &Root) const {
}
std::pair<Register, unsigned>
-AMDGPUInstructionSelector::selectVOP3ModsImpl(
- Register Src) const {
+AMDGPUInstructionSelector::selectVOP3ModsImpl(MachineOperand &Root) const {
+ Register Src = Root.getReg();
+ Register OrigSrc = Src;
unsigned Mods = 0;
- MachineInstr *MI = MRI->getVRegDef(Src);
+ MachineInstr *MI = getDefIgnoringCopies(Src, *MRI);
if (MI && MI->getOpcode() == AMDGPU::G_FNEG) {
Src = MI->getOperand(1).getReg();
Mods |= SISrcMods::NEG;
- MI = MRI->getVRegDef(Src);
+ MI = getDefIgnoringCopies(Src, *MRI);
}
if (MI && MI->getOpcode() == AMDGPU::G_FABS) {
@@ -1887,6 +2902,20 @@ AMDGPUInstructionSelector::selectVOP3ModsImpl(
Mods |= SISrcMods::ABS;
}
+ if (Mods != 0 &&
+ RBI.getRegBank(Src, *MRI, TRI)->getID() != AMDGPU::VGPRRegBankID) {
+ MachineInstr *UseMI = Root.getParent();
+
+ // If we looked through copies to find source modifiers on an SGPR operand,
+ // we now have an SGPR register source. To avoid potentially violating the
+ // constant bus restriction, we need to insert a copy to a VGPR.
+ Register VGPRSrc = MRI->cloneVirtualRegister(OrigSrc);
+ BuildMI(*UseMI->getParent(), UseMI, UseMI->getDebugLoc(),
+ TII.get(AMDGPU::COPY), VGPRSrc)
+ .addReg(Src);
+ Src = VGPRSrc;
+ }
+
return std::make_pair(Src, Mods);
}
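
For readers following the modifier folding: selectVOP3ModsImpl peels an fneg and then an fabs off the def chain, records them as NEG/ABS source-modifier bits, and returns whatever source is left underneath (copying it to a VGPR if needed, as above). Below is a toy stand-in for that walk; the node type, the bit values, and the helper name are all made up for illustration and do not reflect the real MIR or SISrcMods encoding.

#include <cassert>
#include <utility>

// Illustrative modifier bits; the real encoding lives in SISrcMods.
enum Mods : unsigned { NEG = 1u << 0, ABS = 1u << 1 };

// Toy stand-in for a def chain: a node is either a plain value or a unary
// fneg/fabs of another node.
struct Node {
  enum Kind { Value, FNeg, FAbs } K;
  const Node *Src;
};

// Peel at most one fneg and then one fabs, in the same order as
// selectVOP3ModsImpl, and report the folded modifier bits.
static std::pair<const Node *, unsigned> foldMods(const Node *N) {
  unsigned M = 0;
  if (N->K == Node::FNeg) { M |= NEG; N = N->Src; }
  if (N->K == Node::FAbs) { M |= ABS; N = N->Src; }
  return {N, M};
}

int main() {
  Node X{Node::Value, nullptr};
  Node Abs{Node::FAbs, &X};
  Node NegAbs{Node::FNeg, &Abs};
  std::pair<const Node *, unsigned> R = foldMods(&NegAbs);
  assert(R.first == &X && R.second == (NEG | ABS)); // fneg(fabs(x)) -> x, NEG|ABS
  return 0;
}
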
@@ -1904,7 +2933,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods0(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1927,7 +2956,7 @@ InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1936,12 +2965,48 @@ AMDGPUInstructionSelector::selectVOP3Mods(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
+AMDGPUInstructionSelector::selectVOP3NoMods(MachineOperand &Root) const {
+ Register Reg = Root.getReg();
+ const MachineInstr *Def = getDefIgnoringCopies(Reg, *MRI);
+ if (Def && (Def->getOpcode() == AMDGPU::G_FNEG ||
+ Def->getOpcode() == AMDGPU::G_FABS))
+ return {};
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ }};
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectVOP3PModsImpl(
+ Register Src, const MachineRegisterInfo &MRI) const {
+ unsigned Mods = 0;
+ MachineInstr *MI = MRI.getVRegDef(Src);
+
+ if (MI && MI->getOpcode() == AMDGPU::G_FNEG &&
+ // It's possible to see an f32 fneg here, but unlikely.
+ // TODO: Treat f32 fneg as only high bit.
+ MRI.getType(Src) == LLT::vector(2, 16)) {
+ Mods ^= (SISrcMods::NEG | SISrcMods::NEG_HI);
+ Src = MI->getOperand(1).getReg();
+ MI = MRI.getVRegDef(Src);
+ }
+
+ // TODO: Match op_sel through g_build_vector_trunc and g_shuffle_vector.
+
+ // Packed instructions do not have abs modifiers.
+ Mods |= SISrcMods::OP_SEL_1;
+
+ return std::make_pair(Src, Mods);
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectVOP3PMods(MachineOperand &Root) const {
+ MachineRegisterInfo &MRI
+ = Root.getParent()->getParent()->getParent()->getRegInfo();
+
Register Src;
unsigned Mods;
- std::tie(Src, Mods) = selectVOP3ModsImpl(Root.getReg());
- if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
- return None;
+ std::tie(Src, Mods) = selectVOP3PModsImpl(Root.getReg(), MRI);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
@@ -1950,12 +3015,16 @@ AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
}
InstructionSelector::ComplexRendererFns
-AMDGPUInstructionSelector::selectVOP3OpSelMods0(MachineOperand &Root) const {
- // FIXME: Handle clamp and op_sel
+AMDGPUInstructionSelector::selectVOP3Mods_nnan(MachineOperand &Root) const {
+ Register Src;
+ unsigned Mods;
+ std::tie(Src, Mods) = selectVOP3ModsImpl(Root);
+ if (!TM.Options.NoNaNsFPMath && !isKnownNeverNaN(Src, *MRI))
+ return None;
+
return {{
- [=](MachineInstrBuilder &MIB) { MIB.addReg(Root.getReg()); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }, // src_mods
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); } // clamp
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Src); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Mods); } // src_mods
}};
}
@@ -1977,15 +3046,15 @@ AMDGPUInstructionSelector::selectSmrdImm(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
-
- if (!AMDGPU::isLegalSMRDImmOffset(STI, GEPInfo.Imm))
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm, false);
+ if (!EncodedImm)
return None;
unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
}};
}
@@ -1998,14 +3067,15 @@ AMDGPUInstructionSelector::selectSmrdImm32(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
- unsigned PtrReg = GEPInfo.SgprParts[0];
- int64_t EncodedImm = AMDGPU::getSMRDEncodedOffset(STI, GEPInfo.Imm);
- if (!isUInt<32>(EncodedImm))
+ Register PtrReg = GEPInfo.SgprParts[0];
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedLiteralOffset32(STI, GEPInfo.Imm);
+ if (!EncodedImm)
return None;
return {{
[=](MachineInstrBuilder &MIB) { MIB.addReg(PtrReg); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(EncodedImm); }
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); }
}};
}
@@ -2023,14 +3093,15 @@ AMDGPUInstructionSelector::selectSmrdSgpr(MachineOperand &Root) const {
return None;
const GEPInfo &GEPInfo = AddrInfo[0];
- if (!GEPInfo.Imm || !isUInt<32>(GEPInfo.Imm))
+ // SGPR offset is unsigned.
+ if (!GEPInfo.Imm || GEPInfo.Imm < 0 || !isUInt<32>(GEPInfo.Imm))
return None;
  // If we make it this far we have a load with a 32-bit immediate offset.
// It is OK to select this using a sgpr offset, because we have already
// failed trying to select this load into one of the _IMM variants since
// the _IMM Patterns are considered before the _SGPR patterns.
- unsigned PtrReg = GEPInfo.SgprParts[0];
+ Register PtrReg = GEPInfo.SgprParts[0];
Register OffsetReg = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
BuildMI(*MBB, MI, MI->getDebugLoc(), TII.get(AMDGPU::S_MOV_B32), OffsetReg)
.addImm(GEPInfo.Imm);
@@ -2099,7 +3170,8 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const SIMachineFunctionInfo *Info = MF->getInfo<SIMachineFunctionInfo>();
int64_t Offset = 0;
- if (mi_match(Root.getReg(), *MRI, m_ICst(Offset))) {
+ if (mi_match(Root.getReg(), *MRI, m_ICst(Offset)) &&
+ Offset != TM.getNullPointerValue(AMDGPUAS::PRIVATE_ADDRESS)) {
Register HighBits = MRI->createVirtualRegister(&AMDGPU::VGPR_32RegClass);
// TODO: Should this be inside the render function? The iterator seems to
@@ -2118,17 +3190,17 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
const MachineMemOperand *MMO = *MI->memoperands_begin();
const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
- Register SOffsetReg = isStackPtrRelative(PtrInfo)
- ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
- MIB.addReg(SOffsetReg);
+ if (isStackPtrRelative(PtrInfo))
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset & 4095);
}}};
}
- assert(Offset == 0);
+ assert(Offset == 0 || Offset == -1);
// Try to fold a frame index directly into the MUBUF vaddr field, and any
// offsets.
@@ -2158,13 +3230,6 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
}
}
- // If we don't know this private access is a local stack object, it needs to
- // be relative to the entry point's scratch wave offset register.
- // TODO: Should split large offsets that don't fit like above.
- // TODO: Don't use scratch wave offset just because the offset didn't fit.
- Register SOffset = FI.hasValue() ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
-
return {{[=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
},
@@ -2175,15 +3240,22 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffen(MachineOperand &Root) const {
MIB.addReg(VAddr);
},
[=](MachineInstrBuilder &MIB) { // soffset
- MIB.addReg(SOffset);
+ // If we don't know this private access is a local stack object, it
+ // needs to be relative to the entry point's scratch wave offset.
+ // TODO: Should split large offsets that don't fit like above.
+ // TODO: Don't use scratch wave offset just because the offset
+ // didn't fit.
+ if (!Info->isEntryFunction() && FI.hasValue())
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
},
[=](MachineInstrBuilder &MIB) { // offset
MIB.addImm(Offset);
}}};
}
-bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
- const MachineOperand &Base,
+bool AMDGPUInstructionSelector::isDSOffsetLegal(Register Base,
int64_t Offset,
unsigned OffsetBits) const {
if ((OffsetBits == 16 && !isUInt<16>(Offset)) ||
@@ -2195,7 +3267,7 @@ bool AMDGPUInstructionSelector::isDSOffsetLegal(const MachineRegisterInfo &MRI,
// On Southern Islands instruction with a negative base value and an offset
// don't seem to work.
- return KnownBits->signBitIsZero(Base.getReg());
+ return KnownBits->signBitIsZero(Base);
}
InstructionSelector::ComplexRendererFns
@@ -2214,68 +3286,485 @@ AMDGPUInstructionSelector::selectMUBUFScratchOffset(
const MachineMemOperand *MMO = *MI->memoperands_begin();
const MachinePointerInfo &PtrInfo = MMO->getPointerInfo();
- Register SOffsetReg = isStackPtrRelative(PtrInfo)
- ? Info->getStackPtrOffsetReg()
- : Info->getScratchWaveOffsetReg();
return {{
- [=](MachineInstrBuilder &MIB) {
+ [=](MachineInstrBuilder &MIB) { // rsrc
MIB.addReg(Info->getScratchRSrcReg());
- }, // rsrc
- [=](MachineInstrBuilder &MIB) { MIB.addReg(SOffsetReg); }, // soffset
- [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (isStackPtrRelative(PtrInfo))
+ MIB.addReg(Info->getStackPtrOffsetReg());
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); } // offset
}};
}
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectDS1Addr1OffsetImpl(MachineOperand &Root) const {
+ const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
+ if (!RootDef)
+ return std::make_pair(Root.getReg(), 0);
+
+ int64_t ConstAddr = 0;
+
+ Register PtrBase;
+ int64_t Offset;
+ std::tie(PtrBase, Offset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+ if (Offset) {
+ if (isDSOffsetLegal(PtrBase, Offset, 16)) {
+ // (add n0, c0)
+ return std::make_pair(PtrBase, Offset);
+ }
+  } else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+    // TODO
+  } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+    // TODO
+  }
+
+ return std::make_pair(Root.getReg(), 0);
+}
+
InstructionSelector::ComplexRendererFns
AMDGPUInstructionSelector::selectDS1Addr1Offset(MachineOperand &Root) const {
+ Register Reg;
+ unsigned Offset;
+ std::tie(Reg, Offset) = selectDS1Addr1OffsetImpl(Root);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectDS64Bit4ByteAligned(MachineOperand &Root) const {
+ Register Reg;
+ unsigned Offset;
+ std::tie(Reg, Offset) = selectDS64Bit4ByteAlignedImpl(Root);
+ return {{
+ [=](MachineInstrBuilder &MIB) { MIB.addReg(Reg); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset+1); }
+ }};
+}
+
+std::pair<Register, unsigned>
+AMDGPUInstructionSelector::selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const {
const MachineInstr *RootDef = MRI->getVRegDef(Root.getReg());
- if (!RootDef) {
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
- }};
- }
+ if (!RootDef)
+ return std::make_pair(Root.getReg(), 0);
int64_t ConstAddr = 0;
- if (isBaseWithConstantOffset(Root, *MRI)) {
- const MachineOperand &LHS = RootDef->getOperand(1);
- const MachineOperand &RHS = RootDef->getOperand(2);
- const MachineInstr *LHSDef = MRI->getVRegDef(LHS.getReg());
- const MachineInstr *RHSDef = MRI->getVRegDef(RHS.getReg());
- if (LHSDef && RHSDef) {
- int64_t PossibleOffset =
- RHSDef->getOperand(1).getCImm()->getSExtValue();
- if (isDSOffsetLegal(*MRI, LHS, PossibleOffset, 16)) {
- // (add n0, c0)
- return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(LHS); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(PossibleOffset); }
- }};
- }
+
+ Register PtrBase;
+ int64_t Offset;
+ std::tie(PtrBase, Offset) =
+ getPtrBaseWithConstantOffset(Root.getReg(), *MRI);
+
+ if (Offset) {
+ int64_t DWordOffset0 = Offset / 4;
+ int64_t DWordOffset1 = DWordOffset0 + 1;
+ if (isDSOffsetLegal(PtrBase, DWordOffset1, 8)) {
+ // (add n0, c0)
+ return std::make_pair(PtrBase, DWordOffset0);
}
} else if (RootDef->getOpcode() == AMDGPU::G_SUB) {
+ // TODO
+ } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ // TODO
+ }
- } else if (mi_match(Root.getReg(), *MRI, m_ICst(ConstAddr))) {
+ return std::make_pair(Root.getReg(), 0);
+}
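
As a concrete example of the arithmetic above (assuming the usual DS_READ2/WRITE2-style encoding, where offset0 and offset1 are in 4-byte units and each must fit an 8-bit field, which is what the isDSOffsetLegal(..., 8) check enforces), here is a standalone sketch; the function name and structure are illustrative only:

#include <cassert>
#include <cstdint>

// Turn a byte offset from the base pointer into the dword-pair offsets a
// 64-bit DS access would use; reject offsets whose second dword index does
// not fit the 8-bit field, mirroring selectDS64Bit4ByteAlignedImpl.
static bool splitDS64Offset(int64_t ByteOffset, unsigned &Off0,
                            unsigned &Off1) {
  if (ByteOffset < 0)
    return false;
  int64_t DWord0 = ByteOffset / 4;
  int64_t DWord1 = DWord0 + 1; // the larger of the two indices
  if (DWord1 > 0xFF)
    return false;
  Off0 = static_cast<unsigned>(DWord0);
  Off1 = static_cast<unsigned>(DWord1);
  return true;
}

int main() {
  unsigned O0, O1;
  assert(splitDS64Offset(40, O0, O1) && O0 == 10 && O1 == 11);
  assert(!splitDS64Offset(4096, O0, O1)); // dword index 1024: too wide
  return 0;
}
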
+
+/// If \p Root is a G_PTR_ADD with a G_CONSTANT on the right hand side, return
+/// the base value with the constant offset. There may be intervening copies
+/// between \p Root and the identified constant. Returns \p Root, 0 if this does
+/// not match the pattern.
+std::pair<Register, int64_t>
+AMDGPUInstructionSelector::getPtrBaseWithConstantOffset(
+ Register Root, const MachineRegisterInfo &MRI) const {
+ MachineInstr *RootI = MRI.getVRegDef(Root);
+ if (RootI->getOpcode() != TargetOpcode::G_PTR_ADD)
+ return {Root, 0};
+
+ MachineOperand &RHS = RootI->getOperand(2);
+ Optional<ValueAndVReg> MaybeOffset
+ = getConstantVRegValWithLookThrough(RHS.getReg(), MRI, true);
+ if (!MaybeOffset)
+ return {Root, 0};
+ return {RootI->getOperand(1).getReg(), MaybeOffset->Value};
+}
+
+static void addZeroImm(MachineInstrBuilder &MIB) {
+ MIB.addImm(0);
+}
+
+/// Return a resource descriptor for use with an arbitrary 64-bit pointer. If \p
+/// BasePtr is not valid, a null base pointer will be used.
+static Register buildRSRC(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ uint32_t FormatLo, uint32_t FormatHi,
+ Register BasePtr) {
+ Register RSrc2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RSrc3 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ Register RSrcHi = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ Register RSrc = MRI.createVirtualRegister(&AMDGPU::SGPR_128RegClass);
+
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(RSrc2)
+ .addImm(FormatLo);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(RSrc3)
+ .addImm(FormatHi);
+
+ // Build the half of the subregister with the constants before building the
+ // full 128-bit register. If we are building multiple resource descriptors,
+ // this will allow CSEing of the 2-component register.
+ B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(RSrcHi)
+ .addReg(RSrc2)
+ .addImm(AMDGPU::sub0)
+ .addReg(RSrc3)
+ .addImm(AMDGPU::sub1);
+
+ Register RSrcLo = BasePtr;
+ if (!BasePtr) {
+ RSrcLo = MRI.createVirtualRegister(&AMDGPU::SReg_64RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B64)
+ .addDef(RSrcLo)
+ .addImm(0);
+ }
+
+ B.buildInstr(AMDGPU::REG_SEQUENCE)
+ .addDef(RSrc)
+ .addReg(RSrcLo)
+ .addImm(AMDGPU::sub0_sub1)
+ .addReg(RSrcHi)
+ .addImm(AMDGPU::sub2_sub3);
+
+ return RSrc;
+}
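
Layout-wise, the descriptor assembled above is just four 32-bit words: the (possibly null) 64-bit base pointer in words 0-1 (sub0_sub1) and the two format constants in words 2-3 (sub2_sub3). A standalone sketch of that packing follows; the pointer and format values in main() are arbitrary examples, not the real default resource data format.

#include <array>
#include <cassert>
#include <cstdint>

// Pack a MUBUF-style resource descriptor: the 64-bit base pointer (or zero
// for the null-base case) in the low two words, the format constants in the
// high two words.
static std::array<uint32_t, 4> packRSrc(uint64_t BasePtr, uint32_t FormatLo,
                                        uint32_t FormatHi) {
  return {static_cast<uint32_t>(BasePtr),       // sub0
          static_cast<uint32_t>(BasePtr >> 32), // sub1
          FormatLo,                             // sub2
          FormatHi};                            // sub3
}

int main() {
  // Arbitrary example values.
  std::array<uint32_t, 4> RSrc =
      packRSrc(0x123456789ABCDEF0ull, 0xFFFFFFFFu, 0x00027000u);
  assert(RSrc[0] == 0x9ABCDEF0u && RSrc[1] == 0x12345678u);
  assert(RSrc[2] == 0xFFFFFFFFu && RSrc[3] == 0x00027000u);
  return 0;
}
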
+
+static Register buildAddr64RSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, Register BasePtr) {
+ uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
+
+ // FIXME: Why are half the "default" bits ignored based on the addressing
+ // mode?
+ return buildRSRC(B, MRI, 0, Hi_32(DefaultFormat), BasePtr);
+}
+
+static Register buildOffsetSrc(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ const SIInstrInfo &TII, Register BasePtr) {
+ uint64_t DefaultFormat = TII.getDefaultRsrcDataFormat();
+
+ // FIXME: Why are half the "default" bits ignored based on the addressing
+ // mode?
+ return buildRSRC(B, MRI, -1, Hi_32(DefaultFormat), BasePtr);
+}
+
+AMDGPUInstructionSelector::MUBUFAddressData
+AMDGPUInstructionSelector::parseMUBUFAddress(Register Src) const {
+ MUBUFAddressData Data;
+ Data.N0 = Src;
+
+ Register PtrBase;
+ int64_t Offset;
+
+ std::tie(PtrBase, Offset) = getPtrBaseWithConstantOffset(Src, *MRI);
+ if (isUInt<32>(Offset)) {
+ Data.N0 = PtrBase;
+ Data.Offset = Offset;
+ }
+
+ if (MachineInstr *InputAdd
+ = getOpcodeDef(TargetOpcode::G_PTR_ADD, Data.N0, *MRI)) {
+ Data.N2 = InputAdd->getOperand(1).getReg();
+ Data.N3 = InputAdd->getOperand(2).getReg();
+
+    // FIXME: Need to fix extra SGPR->VGPR copies inserted
+    // FIXME: Don't know that this was defined by operand 0
+ //
+ // TODO: Remove this when we have copy folding optimizations after
+ // RegBankSelect.
+ Data.N2 = getDefIgnoringCopies(Data.N2, *MRI)->getOperand(0).getReg();
+ Data.N3 = getDefIgnoringCopies(Data.N3, *MRI)->getOperand(0).getReg();
+ }
+
+ return Data;
+}
+
+/// Return true if the addr64 MUBUF mode should be used for the given address.
+bool AMDGPUInstructionSelector::shouldUseAddr64(MUBUFAddressData Addr) const {
+ // (ptr_add N2, N3) -> addr64, or
+ // (ptr_add (ptr_add N2, N3), C1) -> addr64
+ if (Addr.N2)
+ return true;
+
+ const RegisterBank *N0Bank = RBI.getRegBank(Addr.N0, *MRI, TRI);
+ return N0Bank->getID() == AMDGPU::VGPRRegBankID;
+}
+/// Split an immediate offset \p ImmOffset depending on whether it fits in the
+/// immediate field. Modifies \p ImmOffset and sets \p SOffset to the variable
+/// component.
+void AMDGPUInstructionSelector::splitIllegalMUBUFOffset(
+ MachineIRBuilder &B, Register &SOffset, int64_t &ImmOffset) const {
+ if (SIInstrInfo::isLegalMUBUFImmOffset(ImmOffset))
+ return;
+
+ // Illegal offset, store it in soffset.
+ SOffset = MRI->createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ B.buildInstr(AMDGPU::S_MOV_B32)
+ .addDef(SOffset)
+ .addImm(ImmOffset);
+ ImmOffset = 0;
+}
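
In plain arithmetic, the split above does the following: the MUBUF immediate offset field only holds a small unsigned value, so an offset that does not fit is moved wholesale into the soffset register and the immediate becomes zero. A standalone sketch, assuming the unsigned 12-bit rule behind SIInstrInfo::isLegalMUBUFImmOffset; all names here are illustrative:

#include <cassert>
#include <cstdint>

struct SplitOffset {
  int64_t SOffsetValue = 0; // value that would be materialized into an SGPR
  int64_t ImmOffset = 0;    // value left in the immediate offset field
  bool UsesSOffset = false;
};

// Mirror of splitIllegalMUBUFOffset as plain arithmetic, assuming the
// immediate field holds an unsigned 12-bit value.
static SplitOffset splitMUBUFOffset(int64_t Offset) {
  SplitOffset R;
  if (Offset >= 0 && Offset < (1 << 12)) {
    R.ImmOffset = Offset; // fits: keep it in the immediate field
    return R;
  }
  R.SOffsetValue = Offset; // illegal immediate: move it all into soffset
  R.UsesSOffset = true;
  return R;
}

int main() {
  assert(splitMUBUFOffset(4095).ImmOffset == 4095); // largest legal immediate
  assert(splitMUBUFOffset(4096).UsesSOffset);       // one past the field
  assert(splitMUBUFOffset(4096).SOffsetValue == 4096);
  return 0;
}
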
+bool AMDGPUInstructionSelector::selectMUBUFAddr64Impl(
+ MachineOperand &Root, Register &VAddr, Register &RSrcReg,
+ Register &SOffset, int64_t &Offset) const {
+ // FIXME: Predicates should stop this from reaching here.
+  // The addr64 bit was removed for Volcanic Islands.
+ if (!STI.hasAddr64() || STI.useFlatForGlobal())
+ return false;
+
+ MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+ if (!shouldUseAddr64(AddrData))
+ return false;
+
+ Register N0 = AddrData.N0;
+ Register N2 = AddrData.N2;
+ Register N3 = AddrData.N3;
+ Offset = AddrData.Offset;
+
+ // Base pointer for the SRD.
+ Register SRDPtr;
+
+ if (N2) {
+ if (RBI.getRegBank(N2, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ assert(N3);
+ if (RBI.getRegBank(N3, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ // Both N2 and N3 are divergent. Use N0 (the result of the add) as the
+ // addr64, and construct the default resource from a 0 address.
+ VAddr = N0;
+ } else {
+ SRDPtr = N3;
+ VAddr = N2;
+ }
+ } else {
+ // N2 is not divergent.
+ SRDPtr = N2;
+ VAddr = N3;
+ }
+ } else if (RBI.getRegBank(N0, *MRI, TRI)->getID() == AMDGPU::VGPRRegBankID) {
+ // Use the default null pointer in the resource
+ VAddr = N0;
+ } else {
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ SRDPtr = N0;
}
+ MachineIRBuilder B(*Root.getParent());
+ RSrcReg = buildAddr64RSrc(B, *MRI, TII, SRDPtr);
+ splitIllegalMUBUFOffset(B, SOffset, Offset);
+ return true;
+}
+
+bool AMDGPUInstructionSelector::selectMUBUFOffsetImpl(
+ MachineOperand &Root, Register &RSrcReg, Register &SOffset,
+ int64_t &Offset) const {
+ MUBUFAddressData AddrData = parseMUBUFAddress(Root.getReg());
+ if (shouldUseAddr64(AddrData))
+ return false;
+
+ // N0 -> offset, or
+ // (N0 + C1) -> offset
+ Register SRDPtr = AddrData.N0;
+ Offset = AddrData.Offset;
+
+ // TODO: Look through extensions for 32-bit soffset.
+ MachineIRBuilder B(*Root.getParent());
+
+ RSrcReg = buildOffsetSrc(B, *MRI, TII, SRDPtr);
+ splitIllegalMUBUFOffset(B, SOffset, Offset);
+ return true;
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64(MachineOperand &Root) const {
+ Register VAddr;
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
+ return {};
+
+ // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+ // pattern.
return {{
- [=](MachineInstrBuilder &MIB) { MIB.add(Root); },
- [=](MachineInstrBuilder &MIB) { MIB.addImm(0); }
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ },
+ addZeroImm, // glc
+ addZeroImm, // slc
+ addZeroImm, // tfe
+ addZeroImm, // dlc
+ addZeroImm // swz
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFOffset(MachineOperand &Root) const {
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
+ addZeroImm, // glc
+ addZeroImm, // slc
+ addZeroImm, // tfe
+ addZeroImm, // dlc
+ addZeroImm // swz
+ }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFAddr64Atomic(MachineOperand &Root) const {
+ Register VAddr;
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFAddr64Impl(Root, VAddr, RSrcReg, SOffset, Offset))
+ return {};
+
+ // FIXME: Use defaulted operands for trailing 0s and remove from the complex
+ // pattern.
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // vaddr
+ MIB.addReg(VAddr);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { // offset
+ MIB.addImm(Offset);
+ },
+ addZeroImm // slc
}};
}
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectMUBUFOffsetAtomic(MachineOperand &Root) const {
+ Register RSrcReg;
+ Register SOffset;
+ int64_t Offset = 0;
+
+ if (!selectMUBUFOffsetImpl(Root, RSrcReg, SOffset, Offset))
+ return {};
+
+ return {{
+ [=](MachineInstrBuilder &MIB) { // rsrc
+ MIB.addReg(RSrcReg);
+ },
+ [=](MachineInstrBuilder &MIB) { // soffset
+ if (SOffset)
+ MIB.addReg(SOffset);
+ else
+ MIB.addImm(0);
+ },
+ [=](MachineInstrBuilder &MIB) { MIB.addImm(Offset); }, // offset
+ addZeroImm // slc
+ }};
+}
+
+/// Get an immediate that must fit in 32 bits, treated as zero extended.
+static Optional<uint64_t> getConstantZext32Val(Register Reg,
+ const MachineRegisterInfo &MRI) {
+ // getConstantVRegVal sexts any values, so see if that matters.
+ Optional<int64_t> OffsetVal = getConstantVRegVal(Reg, MRI);
+ if (!OffsetVal || !isInt<32>(*OffsetVal))
+ return None;
+ return Lo_32(*OffsetVal);
+}
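
A small standalone check of why the sign-extension handling above matters (std::optional standing in for llvm::Optional; none of this is LLVM API): a 32-bit constant 0xFFFFFFFF read through a sign-extending getter comes back as -1, and the helper recovers the zero-extended 32-bit value by bounds-checking against the signed 32-bit range and taking the low 32 bits.

#include <cassert>
#include <cstdint>
#include <limits>
#include <optional>

// A sign-extending constant getter (what getConstantVRegVal does for a
// 32-bit G_CONSTANT) reports 0xFFFFFFFF as -1. Recover the zero-extended
// 32-bit value: reject anything outside the signed 32-bit range, then take
// the low 32 bits (Lo_32 in the code above).
static std::optional<uint64_t> zext32FromSext(int64_t SextVal) {
  if (SextVal < std::numeric_limits<int32_t>::min() ||
      SextVal > std::numeric_limits<int32_t>::max())
    return std::nullopt;
  return static_cast<uint32_t>(SextVal);
}

int main() {
  assert(zext32FromSext(-1).value() == 0xFFFFFFFFull); // 32-bit all-ones
  assert(!zext32FromSext(int64_t(1) << 40));           // truly 64-bit: rejected
  return 0;
}
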
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSMRDBufferImm(MachineOperand &Root) const {
+ Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ if (!OffsetVal)
+ return {};
+
+ Optional<int64_t> EncodedImm =
+ AMDGPU::getSMRDEncodedOffset(STI, *OffsetVal, true);
+ if (!EncodedImm)
+ return {};
+
+ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
+}
+
+InstructionSelector::ComplexRendererFns
+AMDGPUInstructionSelector::selectSMRDBufferImm32(MachineOperand &Root) const {
+ assert(STI.getGeneration() == AMDGPUSubtarget::SEA_ISLANDS);
+
+ Optional<uint64_t> OffsetVal = getConstantZext32Val(Root.getReg(), *MRI);
+ if (!OffsetVal)
+ return {};
+
+ Optional<int64_t> EncodedImm
+ = AMDGPU::getSMRDEncodedLiteralOffset32(STI, *OffsetVal);
+ if (!EncodedImm)
+ return {};
+
+ return {{ [=](MachineInstrBuilder &MIB) { MIB.addImm(*EncodedImm); } }};
+}
+
void AMDGPUInstructionSelector::renderTruncImm32(MachineInstrBuilder &MIB,
const MachineInstr &MI,
int OpIdx) const {
assert(MI.getOpcode() == TargetOpcode::G_CONSTANT && OpIdx == -1 &&
"Expected G_CONSTANT");
- Optional<int64_t> CstVal = getConstantVRegVal(MI.getOperand(0).getReg(), *MRI);
- assert(CstVal && "Expected constant value");
- MIB.addImm(CstVal.getValue());
+ MIB.addImm(MI.getOperand(1).getCImm()->getSExtValue());
}
void AMDGPUInstructionSelector::renderNegateImm(MachineInstrBuilder &MIB,
@@ -2316,6 +3805,34 @@ void AMDGPUInstructionSelector::renderTruncTImm(MachineInstrBuilder &MIB,
MIB.addImm(MI.getOperand(OpIdx).getImm());
}
+void AMDGPUInstructionSelector::renderExtractGLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm(MI.getOperand(OpIdx).getImm() & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractSLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 1) & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractDLC(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 2) & 1);
+}
+
+void AMDGPUInstructionSelector::renderExtractSWZ(MachineInstrBuilder &MIB,
+ const MachineInstr &MI,
+ int OpIdx) const {
+ assert(OpIdx >= 0 && "expected to match an immediate operand");
+ MIB.addImm((MI.getOperand(OpIdx).getImm() >> 3) & 1);
+}
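
The four renderers above all decode one packed immediate. Judging from the shifts used here, glc is bit 0, slc bit 1, dlc bit 2 and swz bit 3 of the buffer intrinsic's cache-policy operand. A standalone decoder for that layout (illustrative only, not an LLVM API):

#include <cassert>
#include <cstdint>

// Decode the packed cache-policy immediate the renderers above pull apart:
// glc in bit 0, slc in bit 1, dlc in bit 2, swz in bit 3.
struct CachePolicy {
  unsigned GLC, SLC, DLC, SWZ;
};

static CachePolicy decodeAux(uint64_t Aux) {
  return {static_cast<unsigned>(Aux & 1),
          static_cast<unsigned>((Aux >> 1) & 1),
          static_cast<unsigned>((Aux >> 2) & 1),
          static_cast<unsigned>((Aux >> 3) & 1)};
}

int main() {
  CachePolicy P = decodeAux(0b1010); // slc and swz set, glc and dlc clear
  assert(P.GLC == 0 && P.SLC == 1 && P.DLC == 0 && P.SWZ == 1);
  return 0;
}
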
+
bool AMDGPUInstructionSelector::isInlineImmediate16(int64_t Imm) const {
return AMDGPU::isInlinableLiteral16(Imm, STI.hasInv2PiInlineImm());
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
index 38ca7fd4104b..1fe80958917d 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructionSelector.h
@@ -31,6 +31,10 @@ namespace {
namespace llvm {
+namespace AMDGPU {
+struct ImageDimIntrinsicInfo;
+}
+
class AMDGPUInstrInfo;
class AMDGPURegisterBankInfo;
class GCNSubtarget;
@@ -80,28 +84,39 @@ private:
MachineOperand getSubOperand64(MachineOperand &MO,
const TargetRegisterClass &SubRC,
unsigned SubIdx) const;
+
+ bool constrainCopyLikeIntrin(MachineInstr &MI, unsigned NewOpc) const;
bool selectCOPY(MachineInstr &I) const;
bool selectPHI(MachineInstr &I) const;
bool selectG_TRUNC(MachineInstr &I) const;
bool selectG_SZA_EXT(MachineInstr &I) const;
bool selectG_CONSTANT(MachineInstr &I) const;
+ bool selectG_FNEG(MachineInstr &I) const;
+ bool selectG_FABS(MachineInstr &I) const;
bool selectG_AND_OR_XOR(MachineInstr &I) const;
bool selectG_ADD_SUB(MachineInstr &I) const;
bool selectG_UADDO_USUBO_UADDE_USUBE(MachineInstr &I) const;
bool selectG_EXTRACT(MachineInstr &I) const;
bool selectG_MERGE_VALUES(MachineInstr &I) const;
bool selectG_UNMERGE_VALUES(MachineInstr &I) const;
+ bool selectG_BUILD_VECTOR_TRUNC(MachineInstr &I) const;
bool selectG_PTR_ADD(MachineInstr &I) const;
bool selectG_IMPLICIT_DEF(MachineInstr &I) const;
bool selectG_INSERT(MachineInstr &I) const;
- bool selectG_INTRINSIC(MachineInstr &I) const;
- std::tuple<Register, unsigned, unsigned>
- splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+ bool selectInterpP1F16(MachineInstr &MI) const;
+ bool selectDivScale(MachineInstr &MI) const;
+ bool selectIntrinsicIcmp(MachineInstr &MI) const;
+ bool selectBallot(MachineInstr &I) const;
+ bool selectG_INTRINSIC(MachineInstr &I) const;
- bool selectStoreIntrinsic(MachineInstr &MI, bool IsFormat) const;
+ bool selectEndCfIntrinsic(MachineInstr &MI) const;
bool selectDSOrderedIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
+ bool selectDSGWSIntrinsic(MachineInstr &MI, Intrinsic::ID IID) const;
+ bool selectDSAppendConsume(MachineInstr &MI, bool IsAppend) const;
+ bool selectImageIntrinsic(MachineInstr &MI,
+ const AMDGPU::ImageDimIntrinsicInfo *Intr) const;
bool selectG_INTRINSIC_W_SIDE_EFFECTS(MachineInstr &I) const;
int getS_CMPOpcode(CmpInst::Predicate P, unsigned Size) const;
bool selectG_ICMP(MachineInstr &I) const;
@@ -112,15 +127,18 @@ private:
void initM0(MachineInstr &I) const;
bool selectG_LOAD_ATOMICRMW(MachineInstr &I) const;
+ bool selectG_AMDGPU_ATOMIC_CMPXCHG(MachineInstr &I) const;
bool selectG_STORE(MachineInstr &I) const;
bool selectG_SELECT(MachineInstr &I) const;
bool selectG_BRCOND(MachineInstr &I) const;
- bool selectG_FRAME_INDEX(MachineInstr &I) const;
- bool selectG_PTR_MASK(MachineInstr &I) const;
+ bool selectG_FRAME_INDEX_GLOBAL_VALUE(MachineInstr &I) const;
+ bool selectG_PTRMASK(MachineInstr &I) const;
bool selectG_EXTRACT_VECTOR_ELT(MachineInstr &I) const;
+ bool selectG_INSERT_VECTOR_ELT(MachineInstr &I) const;
+ bool selectG_SHUFFLE_VECTOR(MachineInstr &I) const;
std::pair<Register, unsigned>
- selectVOP3ModsImpl(Register Src) const;
+ selectVOP3ModsImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVCSRC(MachineOperand &Root) const;
@@ -134,11 +152,18 @@ private:
selectVOP3OMods(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectVOP3Mods(MachineOperand &Root) const;
+
+ ComplexRendererFns selectVOP3NoMods(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectVOP3Mods_nnan(MachineOperand &Root) const;
+ std::pair<Register, unsigned>
+ selectVOP3PModsImpl(Register Src, const MachineRegisterInfo &MRI) const;
+
InstructionSelector::ComplexRendererFns
- selectVOP3OpSelMods0(MachineOperand &Root) const;
+ selectVOP3PMods(MachineOperand &Root) const;
+
InstructionSelector::ComplexRendererFns
selectVOP3OpSelMods(MachineOperand &Root) const;
@@ -163,19 +188,86 @@ private:
InstructionSelector::ComplexRendererFns
selectMUBUFScratchOffset(MachineOperand &Root) const;
- bool isDSOffsetLegal(const MachineRegisterInfo &MRI,
- const MachineOperand &Base,
- int64_t Offset, unsigned OffsetBits) const;
+ bool isDSOffsetLegal(Register Base, int64_t Offset,
+ unsigned OffsetBits) const;
+ std::pair<Register, unsigned>
+ selectDS1Addr1OffsetImpl(MachineOperand &Root) const;
InstructionSelector::ComplexRendererFns
selectDS1Addr1Offset(MachineOperand &Root) const;
+ std::pair<Register, unsigned>
+ selectDS64Bit4ByteAlignedImpl(MachineOperand &Root) const;
+ InstructionSelector::ComplexRendererFns
+ selectDS64Bit4ByteAligned(MachineOperand &Root) const;
+
+ std::pair<Register, int64_t>
+ getPtrBaseWithConstantOffset(Register Root,
+ const MachineRegisterInfo &MRI) const;
+
+ // Parse out a chain of up to two g_ptr_add instructions.
+ // g_ptr_add (n0, _)
+ // g_ptr_add (n0, (n1 = g_ptr_add n2, n3))
+ struct MUBUFAddressData {
+ Register N0, N2, N3;
+ int64_t Offset = 0;
+ };
+
+ bool shouldUseAddr64(MUBUFAddressData AddrData) const;
+
+ void splitIllegalMUBUFOffset(MachineIRBuilder &B,
+ Register &SOffset, int64_t &ImmOffset) const;
+
+ MUBUFAddressData parseMUBUFAddress(Register Src) const;
+
+ bool selectMUBUFAddr64Impl(MachineOperand &Root, Register &VAddr,
+ Register &RSrcReg, Register &SOffset,
+ int64_t &Offset) const;
+
+ bool selectMUBUFOffsetImpl(MachineOperand &Root, Register &RSrcReg,
+ Register &SOffset, int64_t &Offset) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFAddr64(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFOffset(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFOffsetAtomic(MachineOperand &Root) const;
+
+ InstructionSelector::ComplexRendererFns
+ selectMUBUFAddr64Atomic(MachineOperand &Root) const;
+
+ ComplexRendererFns selectSMRDBufferImm(MachineOperand &Root) const;
+ ComplexRendererFns selectSMRDBufferImm32(MachineOperand &Root) const;
+
void renderTruncImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx = -1) const;
void renderTruncTImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderTruncTImm1(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
+ void renderTruncTImm8(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
+ void renderTruncTImm16(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
+ void renderTruncTImm32(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const {
+ renderTruncTImm(MIB, MI, OpIdx);
+ }
+
void renderNegateImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
@@ -184,6 +276,14 @@ private:
void renderPopcntImm(MachineInstrBuilder &MIB, const MachineInstr &MI,
int OpIdx) const;
+ void renderExtractGLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderExtractSLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderExtractDLC(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
+ void renderExtractSWZ(MachineInstrBuilder &MIB, const MachineInstr &MI,
+ int OpIdx) const;
bool isInlineImmediate16(int64_t Imm) const;
bool isInlineImmediate32(int64_t Imm) const;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
index 7e71dbdd1240..5cb7ac320d2f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstructions.td
@@ -77,6 +77,9 @@ class ILFormat<dag outs, dag ins, string asmstr, list<dag> pattern>
def TruePredicate : Predicate<"">;
+// FIXME: TableGen should specially support this
+def FalsePredicate : Predicate<"false">;
+
// Add a predicate to the list if it does not already exist, to deduplicate it.
class PredConcat<list<Predicate> lst, Predicate pred> {
list<Predicate> ret =
@@ -101,12 +104,12 @@ class AMDGPUPat<dag pattern, dag result> : Pat<pattern, result>,
PredicateControl;
let RecomputePerFunction = 1 in {
-def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
-def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
-def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
-def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
-def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals">;
-def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().FP64FP16Denormals">;
+def FP16Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def FP32Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">;
+def FP64Denormals : Predicate<"MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def NoFP16Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
+def NoFP32Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals()">;
+def NoFP64Denormals : Predicate<"!MF->getInfo<SIMachineFunctionInfo>()->getMode().allFP64FP16Denormals()">;
def UnsafeFPMath : Predicate<"TM.Options.UnsafeFPMath">;
}
@@ -408,7 +411,12 @@ def atomic_load_64_#as : PatFrag<(ops node:$ptr), (atomic_load_64 node:$ptr)> {
let IsAtomic = 1;
let MemoryVT = i64;
}
+} // End let AddressSpaces
+} // End foreach as
+
+foreach as = [ "global", "flat", "local", "private", "region" ] in {
+let AddressSpaces = !cast<AddressSpaceList>("StoreAddress_"#as).AddrSpaces in {
def store_#as : PatFrag<(ops node:$val, node:$ptr),
(unindexedstore node:$val, node:$ptr)> {
let IsStore = 1;
@@ -444,8 +452,8 @@ def truncstorei16_hi16_#as : StoreHi16<truncstorei16>;
defm atomic_store_#as : binary_atomic_op<atomic_store>;
-} // End let AddressSpaces = ...
-} // End foreach AddrSpace
+} // End let AddressSpaces
+} // End foreach as
multiclass ret_noret_binary_atomic_op<SDNode atomic_op, bit IsInt = 1> {
@@ -520,7 +528,7 @@ class Constants {
int TWO_PI = 0x40c90fdb;
int PI = 0x40490fdb;
int TWO_PI_INV = 0x3e22f983;
-int FP_UINT_MAX_PLUS_1 = 0x4f800000; // 1 << 32 in floating point encoding
+int FP_4294966784 = 0x4f7ffffe; // 4294966784 = 4294967296 - 512 = 2^32 - 2^9
int FP16_ONE = 0x3C00;
int FP16_NEG_ONE = 0xBC00;
int FP32_ONE = 0x3f800000;
@@ -731,6 +739,12 @@ multiclass BFEPattern <Instruction UBFE, Instruction SBFE, Instruction MOV> {
>;
}
+// fshr pattern
+class FSHRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
+ (fshr i32:$src0, i32:$src1, i32:$src2),
+ (BIT_ALIGN $src0, $src1, $src2)
+>;
+
// rotr pattern
class ROTRPattern <Instruction BIT_ALIGN> : AMDGPUPat <
(rotr i32:$src0, i32:$src1),
@@ -796,3 +810,13 @@ def fmaxnum_like_oneuse : PatFrags<(ops node:$src0, node:$src1),
[(fmaxnum_ieee_oneuse node:$src0, node:$src1),
(fmaxnum_oneuse node:$src0, node:$src1)]
>;
+
+def any_fmad : PatFrags<(ops node:$src0, node:$src1, node:$src2),
+ [(fmad node:$src0, node:$src1, node:$src2),
+ (AMDGPUfmad_ftz node:$src0, node:$src1, node:$src2)]
+>;
+
+// FIXME: fsqrt should not select directly
+def any_amdgcn_sqrt : PatFrags<(ops node:$src0),
+ [(fsqrt node:$src0), (int_amdgcn_sqrt node:$src0)]
+>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
index 3f99d5cfb7f9..2976794b49c3 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.cpp
@@ -11,19 +11,16 @@
/// \todo This should be generated by TableGen.
//===----------------------------------------------------------------------===//
-#if defined(_MSC_VER) || defined(__MINGW32__)
-// According to Microsoft, one must set _USE_MATH_DEFINES in order to get M_PI
-// from the Visual C++ cmath / math.h headers:
-// https://docs.microsoft.com/en-us/cpp/c-runtime-library/math-constants?view=vs-2019
-#define _USE_MATH_DEFINES
-#endif
+#include "AMDGPULegalizerInfo.h"
#include "AMDGPU.h"
-#include "AMDGPULegalizerInfo.h"
+#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUTargetMachine.h"
#include "SIMachineFunctionInfo.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/DerivedTypes.h"
@@ -37,21 +34,30 @@ using namespace llvm;
using namespace LegalizeActions;
using namespace LegalizeMutations;
using namespace LegalityPredicates;
-
-
-static LegalityPredicate isMultiple32(unsigned TypeIdx,
- unsigned MaxSize = 1024) {
- return [=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[TypeIdx];
- const LLT EltTy = Ty.getScalarType();
- return Ty.getSizeInBits() <= MaxSize && EltTy.getSizeInBits() % 32 == 0;
- };
+using namespace MIPatternMatch;
+
+// Hack until load/store selection patterns support any tuple of legal types.
+static cl::opt<bool> EnableNewLegality(
+ "amdgpu-global-isel-new-legality",
+ cl::desc("Use GlobalISel desired legality, rather than try to use"
+ "rules compatible with selection patterns"),
+ cl::init(false),
+ cl::ReallyHidden);
+
+static constexpr unsigned MaxRegisterSize = 1024;
+
+// Round the number of elements to the next power of two elements
+static LLT getPow2VectorType(LLT Ty) {
+ unsigned NElts = Ty.getNumElements();
+ unsigned Pow2NElts = 1 << Log2_32_Ceil(NElts);
+ return Ty.changeNumElements(Pow2NElts);
}
-static LegalityPredicate sizeIs(unsigned TypeIdx, unsigned Size) {
- return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx].getSizeInBits() == Size;
- };
+// Round the number of bits to the next power of two bits
+static LLT getPow2ScalarType(LLT Ty) {
+ unsigned Bits = Ty.getSizeInBits();
+ unsigned Pow2Bits = 1 << Log2_32_Ceil(Bits);
+ return LLT::scalar(Pow2Bits);
}
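
The two helpers above both round up via Log2_32_Ceil; in plain terms, 1 << ceil(log2(N)) is the smallest power of two greater than or equal to N (for N >= 1). A tiny standalone equivalent, with an illustrative name:

#include <cassert>

// What the Log2_32_Ceil-based rounding above computes: the smallest power
// of two greater than or equal to N, for N >= 1.
static unsigned nextPow2(unsigned N) {
  unsigned P = 1;
  while (P < N)
    P <<= 1;
  return P;
}

int main() {
  assert(nextPow2(24) == 32); // e.g. an s24 scalar widens to s32
  assert(nextPow2(3) == 4);   // e.g. a 3-element vector rounds to 4 elements
  assert(nextPow2(64) == 64); // already a power of two: unchanged
  return 0;
}
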
static LegalityPredicate isSmallOddVector(unsigned TypeIdx) {
@@ -109,6 +115,23 @@ static LegalizeMutation moreEltsToNext32Bit(unsigned TypeIdx) {
};
}
+static LegalizeMutation bitcastToRegisterType(unsigned TypeIdx) {
+ return [=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[TypeIdx];
+ unsigned Size = Ty.getSizeInBits();
+
+ LLT CoercedTy;
+ if (Size <= 32) {
+ // <2 x s8> -> s16
+ // <4 x s8> -> s32
+ CoercedTy = LLT::scalar(Size);
+ } else
+ CoercedTy = LLT::scalarOrVector(Size / 32, 32);
+
+ return std::make_pair(TypeIdx, CoercedTy);
+ };
+}
+
static LegalityPredicate vectorSmallerThan(unsigned TypeIdx, unsigned Size) {
return [=](const LegalityQuery &Query) {
const LLT QueryTy = Query.Types[TypeIdx];
@@ -130,25 +153,47 @@ static LegalityPredicate numElementsNotEven(unsigned TypeIdx) {
};
}
-// Any combination of 32 or 64-bit elements up to 1024 bits, and multiples of
-// v2s16.
+static bool isRegisterSize(unsigned Size) {
+ return Size % 32 == 0 && Size <= MaxRegisterSize;
+}
+
+static bool isRegisterVectorElementType(LLT EltTy) {
+ const int EltSize = EltTy.getSizeInBits();
+ return EltSize == 16 || EltSize % 32 == 0;
+}
+
+static bool isRegisterVectorType(LLT Ty) {
+ const int EltSize = Ty.getElementType().getSizeInBits();
+ return EltSize == 32 || EltSize == 64 ||
+ (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
+ EltSize == 128 || EltSize == 256;
+}
+
+static bool isRegisterType(LLT Ty) {
+ if (!isRegisterSize(Ty.getSizeInBits()))
+ return false;
+
+ if (Ty.isVector())
+ return isRegisterVectorType(Ty);
+
+ return true;
+}
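
A rough standalone restatement of the vector half of the rule above (scalar and pointer handling omitted; MaxRegisterSize is 1024 in this patch, and the exact element-size list comes from isRegisterVectorType): the total size must fill whole 32-bit registers and stay within the register limit, and 16-bit elements are only accepted in pairs.

#include <cassert>

// Rough restatement of the vector half of isRegisterType: the total size
// must fill whole 32-bit registers and stay within MaxRegisterSize (1024),
// and 16-bit elements are only accepted in pairs.
static bool isRegisterSizedVector(unsigned EltBits, unsigned NumElts) {
  unsigned Size = EltBits * NumElts;
  if (Size % 32 != 0 || Size > 1024)
    return false;
  if (EltBits == 16)
    return NumElts % 2 == 0;
  return EltBits == 32 || EltBits == 64 || EltBits == 128 || EltBits == 256;
}

int main() {
  assert(isRegisterSizedVector(16, 2));   // v2s16: one 32-bit register
  assert(!isRegisterSizedVector(16, 3));  // v3s16: must be widened first
  assert(isRegisterSizedVector(32, 16));  // v16s32: 512 bits
  assert(!isRegisterSizedVector(64, 32)); // 2048 bits: over the limit
  return 0;
}
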
+
+// Any combination of 32 or 64-bit elements up to the maximum register size,
+// and multiples of v2s16.
static LegalityPredicate isRegisterType(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
- const LLT Ty = Query.Types[TypeIdx];
- if (Ty.isVector()) {
- const int EltSize = Ty.getElementType().getSizeInBits();
- return EltSize == 32 || EltSize == 64 ||
- (EltSize == 16 && Ty.getNumElements() % 2 == 0) ||
- EltSize == 128 || EltSize == 256;
- }
-
- return Ty.getSizeInBits() % 32 == 0 && Ty.getSizeInBits() <= 1024;
+ return isRegisterType(Query.Types[TypeIdx]);
};
}
-static LegalityPredicate elementTypeIs(unsigned TypeIdx, LLT Type) {
+static LegalityPredicate elementTypeIsLegal(unsigned TypeIdx) {
return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx].getElementType() == Type;
+ const LLT QueryTy = Query.Types[TypeIdx];
+ if (!QueryTy.isVector())
+ return false;
+ const LLT EltTy = QueryTy.getElementType();
+ return EltTy == LLT::scalar(16) || EltTy.getSizeInBits() >= 32;
};
}
@@ -160,6 +205,120 @@ static LegalityPredicate isWideScalarTruncStore(unsigned TypeIdx) {
};
}
+// TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
+// handle some operations by just promoting the register during
+// selection. There are also d16 loads on GFX9+ which preserve the high bits.
+static unsigned maxSizeForAddrSpace(const GCNSubtarget &ST, unsigned AS,
+ bool IsLoad) {
+ switch (AS) {
+ case AMDGPUAS::PRIVATE_ADDRESS:
+ // FIXME: Private element size.
+ return 32;
+ case AMDGPUAS::LOCAL_ADDRESS:
+ return ST.useDS128() ? 128 : 64;
+ case AMDGPUAS::GLOBAL_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS:
+ case AMDGPUAS::CONSTANT_ADDRESS_32BIT:
+ // Treat constant and global as identical. SMRD loads are sometimes usable for
+ // global loads (ideally constant address space should be eliminated)
+ // depending on the context. Legality cannot be context dependent, but
+ // RegBankSelect can split the load as necessary depending on the pointer
+ // register bank/uniformity and if the memory is invariant or not written in a
+ // kernel.
+ return IsLoad ? 512 : 128;
+ default:
+ // Flat addresses may contextually need to be split to 32-bit parts if they
+ // may alias scratch depending on the subtarget.
+ return 128;
+ }
+}
+
+static bool isLoadStoreSizeLegal(const GCNSubtarget &ST,
+ const LegalityQuery &Query,
+ unsigned Opcode) {
+ const LLT Ty = Query.Types[0];
+
+ // Handle G_LOAD, G_ZEXTLOAD, G_SEXTLOAD
+ const bool IsLoad = Opcode != AMDGPU::G_STORE;
+
+ unsigned RegSize = Ty.getSizeInBits();
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned AS = Query.Types[1].getAddressSpace();
+
+ // All of these need to be custom lowered to cast the pointer operand.
+ if (AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT)
+ return false;
+
+ // TODO: We should be able to widen loads if the alignment is high enough, but
+ // we also need to modify the memory access size.
+#if 0
+ // Accept widening loads based on alignment.
+ if (IsLoad && MemSize < Size)
+ MemSize = std::max(MemSize, Align);
+#endif
+
+ // Only 1-byte and 2-byte to 32-bit extloads are valid.
+ if (MemSize != RegSize && RegSize != 32)
+ return false;
+
+ if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
+ return false;
+
+ switch (MemSize) {
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ case 128:
+ break;
+ case 96:
+ if (!ST.hasDwordx3LoadStores())
+ return false;
+ break;
+ case 256:
+ case 512:
+ // These may contextually need to be broken down.
+ break;
+ default:
+ return false;
+ }
+
+ assert(RegSize >= MemSize);
+
+ if (Align < MemSize) {
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ if (!TLI->allowsMisalignedMemoryAccessesImpl(MemSize, AS, Align / 8))
+ return false;
+ }
+
+ return true;
+}
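
One rule above is easy to misread, so here it is in isolation as a standalone sketch (plain C++, illustrative names; it also folds in the allowed memory sizes from the switch): an extending load, where the in-memory size differs from the register size, is only accepted when the result register is exactly 32 bits wide.

#include <cassert>

// The extending-load shape rule from isLoadStoreSizeLegal: if the in-memory
// size differs from the register size, the register must be exactly 32 bits;
// together with the size switch above, only 8-bit and 16-bit extloads to s32
// pass this check.
static bool isLegalExtLoadShape(unsigned RegSize, unsigned MemSize) {
  if (MemSize == RegSize)
    return true; // not an extending load
  return RegSize == 32 && (MemSize == 8 || MemSize == 16);
}

int main() {
  assert(isLegalExtLoadShape(32, 8));  // zext/sext i8 -> i32
  assert(isLegalExtLoadShape(32, 16)); // zext/sext i16 -> i32
  assert(!isLegalExtLoadShape(64, 8)); // i8 -> i64 must be lowered first
  assert(isLegalExtLoadShape(64, 64)); // ordinary 64-bit load
  return 0;
}
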
+
+// The current selector can't handle <6 x s16>, <8 x s16>, s96, s128, etc., so
+// work around this. Eventually it should ignore the type for loads and only
+// care about the size. Return true in cases where we will work around this for
+// now by bitcasting.
+static bool loadStoreBitcastWorkaround(const LLT Ty) {
+ if (EnableNewLegality)
+ return false;
+
+ const unsigned Size = Ty.getSizeInBits();
+ if (Size <= 64)
+ return false;
+ if (!Ty.isVector())
+ return true;
+ unsigned EltSize = Ty.getElementType().getSizeInBits();
+ return EltSize != 32 && EltSize != 64;
+}
+
+static bool isLoadStoreLegal(const GCNSubtarget &ST, const LegalityQuery &Query,
+ unsigned Opcode) {
+ const LLT Ty = Query.Types[0];
+ return isRegisterType(Ty) && isLoadStoreSizeLegal(ST, Query, Opcode) &&
+ !loadStoreBitcastWorkaround(Ty);
+}
+
AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const GCNTargetMachine &TM)
: ST(ST_) {
@@ -170,14 +329,13 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
const LLT S1 = LLT::scalar(1);
- const LLT S8 = LLT::scalar(8);
const LLT S16 = LLT::scalar(16);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
- const LLT S96 = LLT::scalar(96);
const LLT S128 = LLT::scalar(128);
const LLT S256 = LLT::scalar(256);
- const LLT S1024 = LLT::scalar(1024);
+ const LLT S512 = LLT::scalar(512);
+ const LLT MaxScalar = LLT::scalar(MaxRegisterSize);
const LLT V2S16 = LLT::vector(2, 16);
const LLT V4S16 = LLT::vector(4, 16);
@@ -244,6 +402,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
S32, S64, S16, V2S16
};
+ const LLT MinScalarFPTy = ST.has16BitInsts() ? S16 : S32;
+
setAction({G_BRCOND, S1}, Legal); // VCC branches
setAction({G_BRCOND, S32}, Legal); // SCC branches
@@ -261,11 +421,19 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.legalIf(isPointer(0));
- if (ST.has16BitInsts()) {
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
+ .legalFor({S32, S16, V2S16})
+ .clampScalar(0, S16, S32)
+ .clampMaxNumElements(0, S16, 2)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32);
+ } else if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32, S16})
.clampScalar(0, S16, S32)
- .scalarize(0);
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32);
} else {
getActionDefinitionsBuilder({G_ADD, G_SUB, G_MUL})
.legalFor({S32})
@@ -275,7 +443,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// FIXME: Not really legal. Placeholder for custom lowering.
getActionDefinitionsBuilder({G_SDIV, G_UDIV, G_SREM, G_UREM})
- .legalFor({S32, S64})
+ .customFor({S32, S64})
.clampScalar(0, S32, S64)
.widenScalarToNextPow2(0, 32)
.scalarize(0);
@@ -298,35 +466,16 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
getActionDefinitionsBuilder({G_UADDO, G_USUBO,
G_UADDE, G_SADDE, G_USUBE, G_SSUBE})
.legalFor({{S32, S1}, {S32, S32}})
- .clampScalar(0, S32, S32)
- .scalarize(0); // TODO: Implement.
-
- getActionDefinitionsBuilder({G_SADDO, G_SSUBO})
+ .minScalar(0, S32)
+ // TODO: .scalarize(0)
.lower();
getActionDefinitionsBuilder(G_BITCAST)
// Don't worry about the size constraint.
.legalIf(all(isRegisterType(0), isRegisterType(1)))
- // FIXME: Testing hack
- .legalForCartesianProduct({S16, LLT::vector(2, 8), });
-
- getActionDefinitionsBuilder(G_FCONSTANT)
- .legalFor({S32, S64, S16})
- .clampScalar(0, S16, S64);
-
- getActionDefinitionsBuilder(G_IMPLICIT_DEF)
- .legalFor({S1, S32, S64, S16, V2S32, V4S32, V2S16, V4S16, GlobalPtr,
- ConstantPtr, LocalPtr, FlatPtr, PrivatePtr})
- .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
- .clampScalarOrElt(0, S32, S1024)
- .legalIf(isMultiple32(0))
- .widenScalarToNextPow2(0, 32)
- .clampMaxNumElements(0, S32, 16);
+ .lower();
- // FIXME: i1 operands to intrinsics should always be legal, but other i1
- // values may not be legal. We need to figure out how to distinguish
- // between these two scenarios.
getActionDefinitionsBuilder(G_CONSTANT)
.legalFor({S1, S32, S64, S16, GlobalPtr,
LocalPtr, ConstantPtr, PrivatePtr, FlatPtr })
@@ -334,10 +483,31 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0)
.legalIf(isPointer(0));
+ getActionDefinitionsBuilder(G_FCONSTANT)
+ .legalFor({S32, S64, S16})
+ .clampScalar(0, S16, S64);
+
+ getActionDefinitionsBuilder({G_IMPLICIT_DEF, G_FREEZE})
+ .legalIf(isRegisterType(0))
+ // s1 and s16 are special cases because they have legal operations on
+ // them, but don't really occupy registers in the normal way.
+ .legalFor({S1, S16})
+ .moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
+ .clampScalarOrElt(0, S32, MaxScalar)
+ .widenScalarToNextPow2(0, 32)
+ .clampMaxNumElements(0, S32, 16);
+
setAction({G_FRAME_INDEX, PrivatePtr}, Legal);
- getActionDefinitionsBuilder(G_GLOBAL_VALUE)
- .customFor({LocalPtr, GlobalPtr, ConstantPtr, Constant32Ptr});
+ // If the amount is divergent, we have to do a wave reduction to get the
+ // maximum value, so this is expanded during RegBankSelect.
+ getActionDefinitionsBuilder(G_DYN_STACKALLOC)
+ .legalFor({{PrivatePtr, S32}});
+
+ getActionDefinitionsBuilder(G_GLOBAL_VALUE)
+ .unsupportedFor({PrivatePtr})
+ .custom();
+ setAction({G_BLOCK_ADDR, CodePtr}, Legal);
auto &FPOpActions = getActionDefinitionsBuilder(
{ G_FADD, G_FMUL, G_FMA, G_FCANONICALIZE})
@@ -397,33 +567,41 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.clampScalar(0, S16, S64);
- // TODO: Implement
- getActionDefinitionsBuilder({G_FMINIMUM, G_FMAXIMUM}).lower();
-
if (ST.has16BitInsts()) {
getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
.legalFor({S32, S64, S16})
.scalarize(0)
.clampScalar(0, S16, S64);
} else {
- getActionDefinitionsBuilder({G_FSQRT, G_FFLOOR})
+ getActionDefinitionsBuilder(G_FSQRT)
.legalFor({S32, S64})
.scalarize(0)
.clampScalar(0, S32, S64);
+
+ if (ST.hasFractBug()) {
+ getActionDefinitionsBuilder(G_FFLOOR)
+ .customFor({S64})
+ .legalFor({S32, S64})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
+ } else {
+ getActionDefinitionsBuilder(G_FFLOOR)
+ .legalFor({S32, S64})
+ .scalarize(0)
+ .clampScalar(0, S32, S64);
+ }
}
getActionDefinitionsBuilder(G_FPTRUNC)
.legalFor({{S32, S64}, {S16, S32}})
- .scalarize(0);
+ .scalarize(0)
+ .lower();
getActionDefinitionsBuilder(G_FPEXT)
.legalFor({{S64, S32}, {S32, S16}})
.lowerFor({{S64, S16}}) // FIXME: Implement
.scalarize(0);
- // TODO: Verify V_BFI_B32 is generated from expanded bit ops.
- getActionDefinitionsBuilder(G_FCOPYSIGN).lower();
-
getActionDefinitionsBuilder(G_FSUB)
// Use actual fsub instruction
.legalFor({S32})
@@ -434,22 +612,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// Whether this is legal depends on the floating point mode for the function.
auto &FMad = getActionDefinitionsBuilder(G_FMAD);
- if (ST.hasMadF16())
+ if (ST.hasMadF16() && ST.hasMadMacF32Insts())
FMad.customFor({S32, S16});
- else
+ else if (ST.hasMadMacF32Insts())
FMad.customFor({S32});
+ else if (ST.hasMadF16())
+ FMad.customFor({S16});
FMad.scalarize(0)
.lower();
+ // TODO: Do we need to clamp maximum bitwidth?
+ getActionDefinitionsBuilder(G_TRUNC)
+ .legalIf(isScalar(0))
+ .legalFor({{V2S16, V2S32}})
+ .clampMaxNumElements(0, S16, 2)
+ // Avoid scalarizing in cases that should be truly illegal. In unresolvable
+ // situations (like an invalid implicit use), we don't want to infinite loop
+ // in the legalizer.
+ .fewerElementsIf(elementTypeIsLegal(0), LegalizeMutations::scalarize(0))
+ .alwaysLegal();
+
getActionDefinitionsBuilder({G_SEXT, G_ZEXT, G_ANYEXT})
.legalFor({{S64, S32}, {S32, S16}, {S64, S16},
- {S32, S1}, {S64, S1}, {S16, S1},
- {S96, S32},
- // FIXME: Hack
- {S64, LLT::scalar(33)},
- {S32, S8}, {S32, LLT::scalar(24)}})
+ {S32, S1}, {S64, S1}, {S16, S1}})
.scalarize(0)
- .clampScalar(0, S32, S64);
+ .clampScalar(0, S32, S64)
+ .widenScalarToNextPow2(1, 32);
// TODO: Split s1->s64 during regbankselect for VALU.
auto &IToFP = getActionDefinitionsBuilder({G_SITOFP, G_UITOFP})
@@ -460,17 +648,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (ST.has16BitInsts())
IToFP.legalFor({{S16, S16}});
IToFP.clampScalar(1, S32, S64)
- .scalarize(0);
+ .scalarize(0)
+ .widenScalarToNextPow2(1);
auto &FPToI = getActionDefinitionsBuilder({G_FPTOSI, G_FPTOUI})
- .legalFor({{S32, S32}, {S32, S64}, {S32, S16}});
+ .legalFor({{S32, S32}, {S32, S64}, {S32, S16}})
+ .customFor({{S64, S64}});
if (ST.has16BitInsts())
FPToI.legalFor({{S16, S16}});
else
FPToI.minScalar(1, S32);
FPToI.minScalar(0, S32)
- .scalarize(0);
+ .scalarize(0)
+ .lower();
getActionDefinitionsBuilder(G_INTRINSIC_ROUND)
.scalarize(0)
@@ -494,16 +685,17 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0);
}
+ // FIXME: Clamp offset operand.
getActionDefinitionsBuilder(G_PTR_ADD)
- .legalForCartesianProduct(AddrSpaces64, {S64})
- .legalForCartesianProduct(AddrSpaces32, {S32})
+ .legalIf(isPointer(0))
.scalarize(0);
- getActionDefinitionsBuilder(G_PTR_MASK)
- .scalarize(0)
- .alwaysLegal();
-
- setAction({G_BLOCK_ADDR, CodePtr}, Legal);
+ getActionDefinitionsBuilder(G_PTRMASK)
+ .legalIf(typeInSet(1, {S64, S32}))
+ .minScalar(1, S32)
+ .maxScalarIf(sizeIs(0, 32), 1, S32)
+ .maxScalarIf(sizeIs(0, 64), 1, S64)
+ .scalarize(0);
auto &CmpBuilder =
getActionDefinitionsBuilder(G_ICMP)
@@ -537,16 +729,45 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampScalar(1, S32, S64)
.scalarize(0);
- // FIXME: fexp, flog2, flog10 needs to be custom lowered.
- getActionDefinitionsBuilder({G_FPOW, G_FEXP, G_FEXP2,
- G_FLOG, G_FLOG2, G_FLOG10})
- .legalFor({S32})
- .scalarize(0);
+ // FIXME: fpow has a selection pattern that should move to custom lowering.
+ auto &Exp2Ops = getActionDefinitionsBuilder({G_FEXP2, G_FLOG2});
+ if (ST.has16BitInsts())
+ Exp2Ops.legalFor({S32, S16});
+ else
+ Exp2Ops.legalFor({S32});
+ Exp2Ops.clampScalar(0, MinScalarFPTy, S32);
+ Exp2Ops.scalarize(0);
+
+ auto &ExpOps = getActionDefinitionsBuilder({G_FEXP, G_FLOG, G_FLOG10, G_FPOW});
+ if (ST.has16BitInsts())
+ ExpOps.customFor({{S32}, {S16}});
+ else
+ ExpOps.customFor({S32});
+ ExpOps.clampScalar(0, MinScalarFPTy, S32)
+ .scalarize(0);
+
+ // The 64-bit versions produce 32-bit results, but only on the SALU.
+ getActionDefinitionsBuilder(G_CTPOP)
+ .legalFor({{S32, S32}, {S32, S64}})
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .scalarize(0)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32);
+
+ // The hardware instructions return a different result on 0 than the generic
+ // instructions expect. The hardware produces -1, but these produce the
+ // bitwidth.
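+  // A sketch of the intended expansion (assuming the generic LegalizerHelper
+  // lowering): only the *_ZERO_UNDEF forms are made legal below, so .lower()
+  // is expected to wrap them in a zero check, e.g. for s32:
+  //   ctlz(x) = (x == 0) ? 32 : ctlz_zero_undef(x)
+  // reconciling the hardware's -1-on-zero result with the generic
+  // bitwidth-on-zero semantics.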
+ getActionDefinitionsBuilder({G_CTLZ, G_CTTZ})
+ .scalarize(0)
+ .clampScalar(0, S32, S32)
+ .clampScalar(1, S32, S64)
+ .widenScalarToNextPow2(0, 32)
+ .widenScalarToNextPow2(1, 32)
+ .lower();
// The 64-bit versions produce 32-bit results, but only on the SALU.
- getActionDefinitionsBuilder({G_CTLZ, G_CTLZ_ZERO_UNDEF,
- G_CTTZ, G_CTTZ_ZERO_UNDEF,
- G_CTPOP})
+ getActionDefinitionsBuilder({G_CTLZ_ZERO_UNDEF, G_CTTZ_ZERO_UNDEF})
.legalFor({{S32, S32}, {S32, S64}})
.clampScalar(0, S32, S32)
.clampScalar(1, S32, S64)
@@ -554,50 +775,58 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.widenScalarToNextPow2(0, 32)
.widenScalarToNextPow2(1, 32);
- // TODO: Expand for > s32
- getActionDefinitionsBuilder({G_BSWAP, G_BITREVERSE})
+ getActionDefinitionsBuilder(G_BITREVERSE)
.legalFor({S32})
.clampScalar(0, S32, S32)
.scalarize(0);
if (ST.has16BitInsts()) {
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({S16, S32, V2S16})
+ .clampMaxNumElements(0, S16, 2)
+ // FIXME: Fixing non-power-of-2 before clamp is workaround for
+ // narrowScalar limitation.
+ .widenScalarToNextPow2(0)
+ .clampScalar(0, S16, S32)
+ .scalarize(0);
+
if (ST.hasVOP3PInsts()) {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({S32, S16, V2S16})
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.clampMaxNumElements(0, S16, 2)
- .clampScalar(0, S16, S32)
+ .minScalar(0, S16)
.widenScalarToNextPow2(0)
- .scalarize(0);
+ .scalarize(0)
+ .lower();
} else {
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({S32, S16})
.widenScalarToNextPow2(0)
- .clampScalar(0, S16, S32)
- .scalarize(0);
+ .minScalar(0, S16)
+ .scalarize(0)
+ .lower();
}
} else {
+ // TODO: Should have same legality without v_perm_b32
+ getActionDefinitionsBuilder(G_BSWAP)
+ .legalFor({S32})
+ .lowerIf(scalarNarrowerThan(0, 32))
+ // FIXME: Fixing non-power-of-2 before clamp is workaround for
+ // narrowScalar limitation.
+ .widenScalarToNextPow2(0)
+ .maxScalar(0, S32)
+ .scalarize(0)
+ .lower();
+
getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX})
.legalFor({S32})
- .clampScalar(0, S32, S32)
+ .minScalar(0, S32)
.widenScalarToNextPow2(0)
- .scalarize(0);
+ .scalarize(0)
+ .lower();
}
- auto smallerThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
- return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx0].getSizeInBits() <
- Query.Types[TypeIdx1].getSizeInBits();
- };
- };
-
- auto greaterThan = [](unsigned TypeIdx0, unsigned TypeIdx1) {
- return [=](const LegalityQuery &Query) {
- return Query.Types[TypeIdx0].getSizeInBits() >
- Query.Types[TypeIdx1].getSizeInBits();
- };
- };
-
getActionDefinitionsBuilder(G_INTTOPTR)
// List the common cases
.legalForCartesianProduct(AddrSpaces64, {S64})
@@ -609,7 +838,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
[](const LegalityQuery &Query) {
return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
})
- .narrowScalarIf(greaterThan(1, 0),
+ .narrowScalarIf(largerThan(1, 0),
[](const LegalityQuery &Query) {
return std::make_pair(1, LLT::scalar(Query.Types[0].getSizeInBits()));
});
@@ -626,7 +855,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
})
.narrowScalarIf(
- greaterThan(0, 1),
+ largerThan(0, 1),
[](const LegalityQuery &Query) {
return std::make_pair(0, LLT::scalar(Query.Types[1].getSizeInBits()));
});
@@ -635,33 +864,8 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.scalarize(0)
.custom();
- // TODO: Should load to s16 be legal? Most loads extend to 32-bits, but we
- // handle some operations by just promoting the register during
- // selection. There are also d16 loads on GFX9+ which preserve the high bits.
- auto maxSizeForAddrSpace = [this](unsigned AS) -> unsigned {
- switch (AS) {
- // FIXME: Private element size.
- case AMDGPUAS::PRIVATE_ADDRESS:
- return 32;
- // FIXME: Check subtarget
- case AMDGPUAS::LOCAL_ADDRESS:
- return ST.useDS128() ? 128 : 64;
-
- // Treat constant and global as identical. SMRD loads are sometimes usable
- // for global loads (ideally constant address space should be eliminated)
- // depending on the context. Legality cannot be context dependent, but
- // RegBankSelect can split the load as necessary depending on the pointer
- // register bank/uniformity and if the memory is invariant or not written in
- // a kernel.
- case AMDGPUAS::CONSTANT_ADDRESS:
- case AMDGPUAS::GLOBAL_ADDRESS:
- return 512;
- default:
- return 128;
- }
- };
-
- const auto needToSplitLoad = [=](const LegalityQuery &Query) -> bool {
+ const auto needToSplitMemOp = [=](const LegalityQuery &Query,
+ bool IsLoad) -> bool {
const LLT DstTy = Query.Types[0];
// Split vector extloads.
@@ -676,14 +880,20 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const LLT PtrTy = Query.Types[1];
unsigned AS = PtrTy.getAddressSpace();
- if (MemSize > maxSizeForAddrSpace(AS))
+ if (MemSize > maxSizeForAddrSpace(ST, AS, IsLoad))
return true;
// Catch weird sized loads that don't evenly divide into the access sizes
// TODO: May be able to widen depending on alignment etc.
- unsigned NumRegs = MemSize / 32;
- if (NumRegs == 3 && !ST.hasDwordx3LoadStores())
- return true;
+ unsigned NumRegs = (MemSize + 31) / 32;
+ if (NumRegs == 3) {
+ if (!ST.hasDwordx3LoadStores())
+ return true;
+ } else {
+ // If the alignment allows, these should have been widened.
+ if (!isPowerOf2_32(NumRegs))
+ return true;
+ }
if (Align < MemSize) {
const SITargetLowering *TLI = ST.getTargetLowering();
@@ -693,6 +903,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return false;
};
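+  // Helper for the widening rules below: an odd-sized load can be widened to
+  // the next power of 2 when the known alignment already covers the rounded
+  // size, on the assumption that the extra bytes then stay within the same
+  // naturally aligned block and the wider access cannot fault where the
+  // original would not.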
+ const auto shouldWidenLoadResult = [=](const LegalityQuery &Query,
+ unsigned Opc) -> bool {
+ unsigned Size = Query.Types[0].getSizeInBits();
+ if (isPowerOf2_32(Size))
+ return false;
+
+ if (Size == 96 && ST.hasDwordx3LoadStores())
+ return false;
+
+ unsigned AddrSpace = Query.Types[1].getAddressSpace();
+ if (Size >= maxSizeForAddrSpace(ST, AddrSpace, Opc))
+ return false;
+
+ unsigned Align = Query.MMODescrs[0].AlignInBits;
+ unsigned RoundedSize = NextPowerOf2(Size);
+ return (Align >= RoundedSize);
+ };
+
unsigned GlobalAlign32 = ST.hasUnalignedBufferAccess() ? 0 : 32;
unsigned GlobalAlign16 = ST.hasUnalignedBufferAccess() ? 0 : 16;
unsigned GlobalAlign8 = ST.hasUnalignedBufferAccess() ? 0 : 8;
@@ -705,17 +933,11 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
const bool IsStore = Op == G_STORE;
auto &Actions = getActionDefinitionsBuilder(Op);
- // Whitelist the common cases.
- // TODO: Pointer loads
- // TODO: Wide constant loads
- // TODO: Only CI+ has 3x loads
- // TODO: Loads to s16 on gfx9
+ // Explicitly list some common cases.
+ // TODO: Does this help compile time at all?
Actions.legalForTypesWithMemDesc({{S32, GlobalPtr, 32, GlobalAlign32},
{V2S32, GlobalPtr, 64, GlobalAlign32},
- {V3S32, GlobalPtr, 96, GlobalAlign32},
- {S96, GlobalPtr, 96, GlobalAlign32},
{V4S32, GlobalPtr, 128, GlobalAlign32},
- {S128, GlobalPtr, 128, GlobalAlign32},
{S64, GlobalPtr, 64, GlobalAlign32},
{V2S64, GlobalPtr, 128, GlobalAlign32},
{V2S16, GlobalPtr, 32, GlobalAlign32},
@@ -734,23 +956,60 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
{S32, PrivatePtr, 16, 16},
{V2S16, PrivatePtr, 32, 32},
- {S32, FlatPtr, 32, GlobalAlign32},
- {S32, FlatPtr, 16, GlobalAlign16},
- {S32, FlatPtr, 8, GlobalAlign8},
- {V2S16, FlatPtr, 32, GlobalAlign32},
-
{S32, ConstantPtr, 32, GlobalAlign32},
{V2S32, ConstantPtr, 64, GlobalAlign32},
- {V3S32, ConstantPtr, 96, GlobalAlign32},
{V4S32, ConstantPtr, 128, GlobalAlign32},
{S64, ConstantPtr, 64, GlobalAlign32},
- {S128, ConstantPtr, 128, GlobalAlign32},
{V2S32, ConstantPtr, 32, GlobalAlign32}});
+ Actions.legalIf(
+ [=](const LegalityQuery &Query) -> bool {
+ return isLoadStoreLegal(ST, Query, Op);
+ });
+
+ // Constant 32-bit is handled by addrspacecasting the 32-bit pointer to
+ // 64-bits.
+ //
+ // TODO: Should generalize bitcast action into coerce, which will also cover
+ // inserting addrspacecasts.
+ Actions.customIf(typeIs(1, Constant32Ptr));
+
+ // Turn any illegal element vectors into something easier to deal
+ // with. These will ultimately produce 32-bit scalar shifts to extract the
+ // parts anyway.
+ //
+ // For odd 16-bit element vectors, prefer to split those into pieces with
+ // 16-bit vector parts.
+ Actions.bitcastIf(
+ [=](const LegalityQuery &Query) -> bool {
+ const LLT Ty = Query.Types[0];
+ const unsigned Size = Ty.getSizeInBits();
+
+ if (Size != Query.MMODescrs[0].SizeInBits)
+ return Size <= 32 && Ty.isVector();
+
+ if (loadStoreBitcastWorkaround(Ty) && isRegisterType(Ty))
+ return true;
+ return Ty.isVector() && (Size <= 32 || isRegisterSize(Size)) &&
+ !isRegisterVectorElementType(Ty.getElementType());
+ }, bitcastToRegisterType(0));
+
Actions
.customIf(typeIs(1, Constant32Ptr))
+ // Widen suitably aligned loads by loading extra elements.
+ .moreElementsIf([=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[0];
+ return Op == G_LOAD && Ty.isVector() &&
+ shouldWidenLoadResult(Query, Op);
+ }, moreElementsToNextPow2(0))
+ .widenScalarIf([=](const LegalityQuery &Query) {
+ const LLT Ty = Query.Types[0];
+ return Op == G_LOAD && !Ty.isVector() &&
+ shouldWidenLoadResult(Query, Op);
+ }, widenScalarOrEltToNextPow2(0))
.narrowScalarIf(
[=](const LegalityQuery &Query) -> bool {
- return !Query.Types[0].isVector() && needToSplitLoad(Query);
+ return !Query.Types[0].isVector() &&
+ needToSplitMemOp(Query, Op == G_LOAD);
},
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
const LLT DstTy = Query.Types[0];
@@ -763,13 +1022,23 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
if (DstSize > MemSize)
return std::make_pair(0, LLT::scalar(MemSize));
+ if (!isPowerOf2_32(DstSize)) {
+ // We're probably decomposing an odd sized store. Try to split
+ // to the widest type. TODO: Account for alignment. As-is it
+ // should be OK, since the new parts will be further legalized.
+ unsigned FloorSize = PowerOf2Floor(DstSize);
+ return std::make_pair(0, LLT::scalar(FloorSize));
+ }
+
if (DstSize > 32 && (DstSize % 32 != 0)) {
// FIXME: Need a way to specify non-extload of larger size if
// suitably aligned.
return std::make_pair(0, LLT::scalar(32 * (DstSize / 32)));
}
- unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+ unsigned MaxSize = maxSizeForAddrSpace(ST,
+ PtrTy.getAddressSpace(),
+ Op == G_LOAD);
if (MemSize > MaxSize)
return std::make_pair(0, LLT::scalar(MaxSize));
@@ -778,18 +1047,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
})
.fewerElementsIf(
[=](const LegalityQuery &Query) -> bool {
- return Query.Types[0].isVector() && needToSplitLoad(Query);
+ return Query.Types[0].isVector() &&
+ needToSplitMemOp(Query, Op == G_LOAD);
},
[=](const LegalityQuery &Query) -> std::pair<unsigned, LLT> {
const LLT DstTy = Query.Types[0];
const LLT PtrTy = Query.Types[1];
LLT EltTy = DstTy.getElementType();
- unsigned MaxSize = maxSizeForAddrSpace(PtrTy.getAddressSpace());
+ unsigned MaxSize = maxSizeForAddrSpace(ST,
+ PtrTy.getAddressSpace(),
+ Op == G_LOAD);
+
+ // FIXME: Handle widened to power of 2 results better. This ends
+ // up scalarizing.
+ // FIXME: 3 element stores scalarized on SI
// Split if it's too large for the address space.
if (Query.MMODescrs[0].SizeInBits > MaxSize) {
unsigned NumElts = DstTy.getNumElements();
+ unsigned EltSize = EltTy.getSizeInBits();
+
+ if (MaxSize % EltSize == 0) {
+ return std::make_pair(
+ 0, LLT::scalarOrVector(MaxSize / EltSize, EltTy));
+ }
+
unsigned NumPieces = Query.MMODescrs[0].SizeInBits / MaxSize;
// FIXME: Refine when odd breakdowns handled
@@ -802,9 +1085,24 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
LLT::vector(NumElts / NumPieces, EltTy));
}
+ // FIXME: We could probably handle weird extending loads better.
+ unsigned MemSize = Query.MMODescrs[0].SizeInBits;
+ if (DstTy.getSizeInBits() > MemSize)
+ return std::make_pair(0, EltTy);
+
+ unsigned EltSize = EltTy.getSizeInBits();
+ unsigned DstSize = DstTy.getSizeInBits();
+ if (!isPowerOf2_32(DstSize)) {
+ // We're probably decomposing an odd sized store. Try to split
+ // to the widest type. TODO: Account for alignment. As-is it
+ // should be OK, since the new parts will be further legalized.
+ unsigned FloorSize = PowerOf2Floor(DstSize);
+ return std::make_pair(
+ 0, LLT::scalarOrVector(FloorSize / EltSize, EltTy));
+ }
+
// Need to split because of alignment.
unsigned Align = Query.MMODescrs[0].AlignInBits;
- unsigned EltSize = EltTy.getSizeInBits();
if (EltSize > Align &&
(EltSize / Align < DstTy.getNumElements())) {
return std::make_pair(0, LLT::vector(EltSize / Align, EltTy));
@@ -820,39 +1118,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
// TODO: Need a bitcast lower option?
Actions
- .legalIf([=](const LegalityQuery &Query) {
- const LLT Ty0 = Query.Types[0];
- unsigned Size = Ty0.getSizeInBits();
- unsigned MemSize = Query.MMODescrs[0].SizeInBits;
- unsigned Align = Query.MMODescrs[0].AlignInBits;
-
- // FIXME: Widening store from alignment not valid.
- if (MemSize < Size)
- MemSize = std::max(MemSize, Align);
-
- // No extending vector loads.
- if (Size > MemSize && Ty0.isVector())
- return false;
-
- switch (MemSize) {
- case 8:
- case 16:
- return Size == 32;
- case 32:
- case 64:
- case 128:
- return true;
- case 96:
- return ST.hasDwordx3LoadStores();
- case 256:
- case 512:
- return true;
- default:
- return false;
- }
- })
.widenScalarToNextPow2(0)
- // TODO: v3s32->v4s32 with alignment
.moreElementsIf(vectorSmallerThan(0, 32), moreEltsToNext32Bit(0));
}
@@ -886,8 +1152,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
Atomics.legalFor({{S32, FlatPtr}, {S64, FlatPtr}});
}
- getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
- .legalFor({{S32, LocalPtr}});
+ if (ST.hasLDSFPAtomics()) {
+ getActionDefinitionsBuilder(G_ATOMICRMW_FADD)
+ .legalFor({{S32, LocalPtr}});
+ }
// BUFFER/FLAT_ATOMIC_CMP_SWAP on GCN GPUs needs input marshalling, and output
// demarshalling
@@ -896,10 +1164,6 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
{S32, FlatPtr}, {S64, FlatPtr}})
.legalFor({{S32, LocalPtr}, {S64, LocalPtr},
{S32, RegionPtr}, {S64, RegionPtr}});
-
- getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG_WITH_SUCCESS)
- .lower();
-
// TODO: Pointer types, any 32-bit or 64-bit vector
// Condition should be s32 for scalar, s1 for vector.
@@ -908,9 +1172,9 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
GlobalPtr, LocalPtr, FlatPtr, PrivatePtr,
LLT::vector(2, LocalPtr), LLT::vector(2, PrivatePtr)}, {S1, S32})
.clampScalar(0, S16, S64)
+ .scalarize(1)
.moreElementsIf(isSmallOddVector(0), oneMoreElement(0))
.fewerElementsIf(numElementsNotEven(0), scalarize(0))
- .scalarize(1)
.clampMaxNumElements(0, S32, 2)
.clampMaxNumElements(0, LocalPtr, 2)
.clampMaxNumElements(0, PrivatePtr, 2)
@@ -924,12 +1188,22 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.legalFor({{S32, S32}, {S64, S32}});
if (ST.has16BitInsts()) {
if (ST.hasVOP3PInsts()) {
- Shifts.legalFor({{S16, S32}, {S16, S16}, {V2S16, V2S16}})
+ Shifts.legalFor({{S16, S16}, {V2S16, V2S16}})
.clampMaxNumElements(0, S16, 2);
} else
- Shifts.legalFor({{S16, S32}, {S16, S16}});
+ Shifts.legalFor({{S16, S16}});
- // TODO: Support 16-bit shift amounts
+ // TODO: Support 16-bit shift amounts for all types
+ Shifts.widenScalarIf(
+ [=](const LegalityQuery &Query) {
+ // Use 16-bit shift amounts for any 16-bit shift. Otherwise we want a
+ // 32-bit amount.
+ const LLT ValTy = Query.Types[0];
+ const LLT AmountTy = Query.Types[1];
+ return ValTy.getSizeInBits() <= 16 &&
+ AmountTy.getSizeInBits() < 16;
+ }, changeTo(1, S16));
+ Shifts.maxScalarIf(typeIs(0, S16), 1, S16);
Shifts.clampScalar(1, S32, S32);
Shifts.clampScalar(0, S16, S64);
Shifts.widenScalarToNextPow2(0, 16);
@@ -956,7 +1230,7 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return (EltTy.getSizeInBits() == 16 ||
EltTy.getSizeInBits() % 32 == 0) &&
VecTy.getSizeInBits() % 32 == 0 &&
- VecTy.getSizeInBits() <= 1024 &&
+ VecTy.getSizeInBits() <= MaxRegisterSize &&
IdxTy.getSizeInBits() == 32;
})
.clampScalar(EltTypeIdx, S32, S64)
@@ -1008,28 +1282,40 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
.clampNumElements(0, V2S64, V16S64)
.fewerElementsIf(isWideVec16(0), changeTo(0, V2S16));
- if (ST.hasScalarPackInsts())
- BuildVector.legalFor({V2S16, S32});
-
- BuildVector
- .minScalarSameAs(1, 0)
- .legalIf(isRegisterType(0))
- .minScalarOrElt(0, S32);
-
if (ST.hasScalarPackInsts()) {
+ BuildVector
+ // FIXME: Should probably widen s1 vectors straight to s32
+ .minScalarOrElt(0, S16)
+ // Widen source elements and produce a G_BUILD_VECTOR_TRUNC
+ .minScalar(1, S32);
+
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
.legalFor({V2S16, S32})
.lower();
+ BuildVector.minScalarOrElt(0, S32);
} else {
+ BuildVector.customFor({V2S16, S16});
+ BuildVector.minScalarOrElt(0, S32);
+
getActionDefinitionsBuilder(G_BUILD_VECTOR_TRUNC)
+ .customFor({V2S16, S32})
.lower();
}
+ BuildVector.legalIf(isRegisterType(0));
+
+ // FIXME: Clamp maximum size
getActionDefinitionsBuilder(G_CONCAT_VECTORS)
.legalIf(isRegisterType(0));
- // TODO: Don't fully scalarize v2s16 pieces
- getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
+  // TODO: Don't fully scalarize v2s16 pieces? Or combine those out
+  // pre-legalize.
+ if (ST.hasVOP3PInsts()) {
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR)
+ .customFor({V2S16, V2S16})
+ .lower();
+ } else
+ getActionDefinitionsBuilder(G_SHUFFLE_VECTOR).lower();
// Merge/Unmerge
for (unsigned Op : {G_MERGE_VALUES, G_UNMERGE_VALUES}) {
@@ -1037,10 +1323,10 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
unsigned LitTyIdx = Op == G_MERGE_VALUES ? 1 : 0;
auto notValidElt = [=](const LegalityQuery &Query, unsigned TypeIdx) {
- const LLT &Ty = Query.Types[TypeIdx];
+ const LLT Ty = Query.Types[TypeIdx];
if (Ty.isVector()) {
const LLT &EltTy = Ty.getElementType();
- if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 64)
+ if (EltTy.getSizeInBits() < 8 || EltTy.getSizeInBits() > 512)
return true;
if (!isPowerOf2_32(EltTy.getSizeInBits()))
return true;
@@ -1049,25 +1335,32 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
};
auto &Builder = getActionDefinitionsBuilder(Op)
+ .lowerFor({{S16, V2S16}})
+ .lowerIf([=](const LegalityQuery &Query) {
+ const LLT BigTy = Query.Types[BigTyIdx];
+ return BigTy.getSizeInBits() == 32;
+ })
+ // Try to widen to s16 first for small types.
+ // TODO: Only do this on targets with legal s16 shifts
+ .minScalarOrEltIf(scalarNarrowerThan(LitTyIdx, 16), LitTyIdx, S16)
.widenScalarToNextPow2(LitTyIdx, /*Min*/ 16)
- // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
- // worth considering the multiples of 64 since 2*192 and 2*384 are not
- // valid.
- .clampScalar(LitTyIdx, S16, S256)
- .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
.moreElementsIf(isSmallOddVector(BigTyIdx), oneMoreElement(BigTyIdx))
.fewerElementsIf(all(typeIs(0, S16), vectorWiderThan(1, 32),
elementTypeIs(1, S16)),
changeTo(1, V2S16))
+ // Clamp the little scalar to s8-s256 and make it a power of 2. It's not
+ // worth considering the multiples of 64 since 2*192 and 2*384 are not
+ // valid.
+ .clampScalar(LitTyIdx, S32, S512)
+ .widenScalarToNextPow2(LitTyIdx, /*Min*/ 32)
// Break up vectors with weird elements into scalars
.fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, 0); },
+ [=](const LegalityQuery &Query) { return notValidElt(Query, LitTyIdx); },
scalarize(0))
.fewerElementsIf(
- [=](const LegalityQuery &Query) { return notValidElt(Query, 1); },
+ [=](const LegalityQuery &Query) { return notValidElt(Query, BigTyIdx); },
scalarize(1))
- .clampScalar(BigTyIdx, S32, S1024)
- .lowerFor({{S16, V2S16}});
+ .clampScalar(BigTyIdx, S32, MaxScalar);
if (Op == G_MERGE_VALUES) {
Builder.widenScalarIf(
@@ -1108,22 +1401,68 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
return BigTy.getSizeInBits() % 16 == 0 &&
LitTy.getSizeInBits() % 16 == 0 &&
- BigTy.getSizeInBits() <= 1024;
+ BigTy.getSizeInBits() <= MaxRegisterSize;
})
// Any vectors left are the wrong size. Scalarize them.
.scalarize(0)
.scalarize(1);
}
- getActionDefinitionsBuilder(G_SEXT_INREG).lower();
+ // S64 is only legal on SALU, and needs to be broken into 32-bit elements in
+ // RegBankSelect.
+ auto &SextInReg = getActionDefinitionsBuilder(G_SEXT_INREG)
+ .legalFor({{S32}, {S64}});
+
+ if (ST.hasVOP3PInsts()) {
+ SextInReg.lowerFor({{V2S16}})
+ // Prefer to reduce vector widths for 16-bit vectors before lowering, to
+ // get more vector shift opportunities, since we'll get those when
+ // expanded.
+ .fewerElementsIf(elementTypeIs(0, S16), changeTo(0, V2S16));
+ } else if (ST.has16BitInsts()) {
+ SextInReg.lowerFor({{S32}, {S64}, {S16}});
+ } else {
+ // Prefer to promote to s32 before lowering if we don't have 16-bit
+    // shifts. This avoids a lot of intermediate truncate and extend operations.
+ SextInReg.lowerFor({{S32}, {S64}});
+ }
+
+ // FIXME: Placeholder rule. Really depends on whether the clamp modifier is
+ // available, and is selectively legal for s16, s32, v2s16.
+ getActionDefinitionsBuilder({G_SADDSAT, G_SSUBSAT, G_UADDSAT, G_USUBSAT})
+ .scalarize(0)
+ .clampScalar(0, S16, S32);
- getActionDefinitionsBuilder({G_READ_REGISTER, G_WRITE_REGISTER}).lower();
+ SextInReg
+ .scalarize(0)
+ .clampScalar(0, S32, S64)
+ .lower();
+
+ getActionDefinitionsBuilder(G_FSHR)
+ .legalFor({{S32, S32}})
+ .scalarize(0)
+ .lower();
getActionDefinitionsBuilder(G_READCYCLECOUNTER)
.legalFor({S64});
+ getActionDefinitionsBuilder({
+ // TODO: Verify V_BFI_B32 is generated from expanded bit ops
+ G_FCOPYSIGN,
+
+ G_ATOMIC_CMPXCHG_WITH_SUCCESS,
+ G_READ_REGISTER,
+ G_WRITE_REGISTER,
+
+ G_SADDO, G_SSUBO,
+
+ // TODO: Implement
+ G_FMINIMUM, G_FMAXIMUM,
+ G_FSHL
+ }).lower();
+
getActionDefinitionsBuilder({G_VASTART, G_VAARG, G_BRJT, G_JUMP_TABLE,
- G_DYN_STACKALLOC, G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
+ G_INDEXED_LOAD, G_INDEXED_SEXTLOAD,
G_INDEXED_ZEXTLOAD, G_INDEXED_STORE})
.unsupported();
@@ -1131,10 +1470,12 @@ AMDGPULegalizerInfo::AMDGPULegalizerInfo(const GCNSubtarget &ST_,
verify(*ST.getInstrInfo());
}
-bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- GISelChangeObserver &Observer) const {
+bool AMDGPULegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ GISelChangeObserver &Observer = Helper.Observer;
+
switch (MI.getOpcode()) {
case TargetOpcode::G_ADDRSPACE_CAST:
return legalizeAddrSpaceCast(MI, MRI, B);
@@ -1148,15 +1489,21 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
return legalizeITOFP(MI, MRI, B, true);
case TargetOpcode::G_UITOFP:
return legalizeITOFP(MI, MRI, B, false);
+ case TargetOpcode::G_FPTOSI:
+ return legalizeFPTOI(MI, MRI, B, true);
+ case TargetOpcode::G_FPTOUI:
+ return legalizeFPTOI(MI, MRI, B, false);
case TargetOpcode::G_FMINNUM:
case TargetOpcode::G_FMAXNUM:
case TargetOpcode::G_FMINNUM_IEEE:
case TargetOpcode::G_FMAXNUM_IEEE:
- return legalizeMinNumMaxNum(MI, MRI, B);
+ return legalizeMinNumMaxNum(Helper, MI);
case TargetOpcode::G_EXTRACT_VECTOR_ELT:
return legalizeExtractVectorElt(MI, MRI, B);
case TargetOpcode::G_INSERT_VECTOR_ELT:
return legalizeInsertVectorElt(MI, MRI, B);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return legalizeShuffleVector(MI, MRI, B);
case TargetOpcode::G_FSIN:
case TargetOpcode::G_FCOS:
return legalizeSinCos(MI, MRI, B);
@@ -1168,8 +1515,26 @@ bool AMDGPULegalizerInfo::legalizeCustom(MachineInstr &MI,
return legalizeFMad(MI, MRI, B);
case TargetOpcode::G_FDIV:
return legalizeFDIV(MI, MRI, B);
+ case TargetOpcode::G_UDIV:
+ case TargetOpcode::G_UREM:
+ return legalizeUDIV_UREM(MI, MRI, B);
+ case TargetOpcode::G_SDIV:
+ case TargetOpcode::G_SREM:
+ return legalizeSDIV_SREM(MI, MRI, B);
case TargetOpcode::G_ATOMIC_CMPXCHG:
return legalizeAtomicCmpXChg(MI, MRI, B);
+ case TargetOpcode::G_FLOG:
+ return legalizeFlog(MI, B, numbers::ln2f);
+ case TargetOpcode::G_FLOG10:
+ return legalizeFlog(MI, B, numbers::ln2f / numbers::ln10f);
+ case TargetOpcode::G_FEXP:
+ return legalizeFExp(MI, B);
+ case TargetOpcode::G_FPOW:
+ return legalizeFPow(MI, B);
+ case TargetOpcode::G_FFLOOR:
+ return legalizeFFloor(MI, MRI, B);
+ case TargetOpcode::G_BUILD_VECTOR:
+ return legalizeBuildVector(MI, MRI, B);
default:
return false;
}
@@ -1201,7 +1566,6 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
Offset << AMDGPU::Hwreg::OFFSET_SHIFT_ |
WidthM1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_;
- Register ApertureReg = MRI.createGenericVirtualRegister(S32);
Register GetReg = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
B.buildInstr(AMDGPU::S_GETREG_B32)
@@ -1210,12 +1574,7 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
MRI.setType(GetReg, S32);
auto ShiftAmt = B.buildConstant(S32, WidthM1 + 1);
- B.buildInstr(TargetOpcode::G_SHL)
- .addDef(ApertureReg)
- .addUse(GetReg)
- .addUse(ShiftAmt.getReg(0));
-
- return ApertureReg;
+ return B.buildShl(S32, GetReg, ShiftAmt).getReg(0);
}
Register QueuePtr = MRI.createGenericVirtualRegister(
@@ -1232,19 +1591,15 @@ Register AMDGPULegalizerInfo::getSegmentAperture(
// TODO: can we be smarter about machine pointer info?
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
MachineMemOperand *MMO = MF.getMachineMemOperand(
- PtrInfo,
- MachineMemOperand::MOLoad |
- MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- 4,
- MinAlign(64, StructOffset));
-
- Register LoadResult = MRI.createGenericVirtualRegister(S32);
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 4, commonAlignment(Align(64), StructOffset));
+
Register LoadAddr;
B.materializePtrAdd(LoadAddr, QueuePtr, LLT::scalar(64), StructOffset);
- B.buildLoad(LoadResult, LoadAddr, *MMO);
- return LoadResult;
+ return B.buildLoad(S32, LoadAddr, *MMO).getReg(0);
}
bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
@@ -1252,8 +1607,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
MachineIRBuilder &B) const {
MachineFunction &MF = B.getMF();
- B.setInstr(MI);
-
const LLT S32 = LLT::scalar(32);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -1292,7 +1645,7 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
// extra ptrtoint would be kind of pointless.
auto HighAddr = B.buildConstant(
LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS_32BIT, 32), AddrHiVal);
- B.buildMerge(Dst, {Src, HighAddr.getReg(0)});
+ B.buildMerge(Dst, {Src, HighAddr});
MI.eraseFromParent();
return true;
}
@@ -1305,13 +1658,11 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
auto SegmentNull = B.buildConstant(DstTy, NullVal);
auto FlatNull = B.buildConstant(SrcTy, 0);
- Register PtrLo32 = MRI.createGenericVirtualRegister(DstTy);
-
// Extract low 32-bits of the pointer.
- B.buildExtract(PtrLo32, Src, 0);
+ auto PtrLo32 = B.buildExtract(DstTy, Src, 0);
- Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, FlatNull.getReg(0));
+ auto CmpRes =
+ B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, FlatNull.getReg(0));
B.buildSelect(Dst, CmpRes, PtrLo32, SegmentNull.getReg(0));
MI.eraseFromParent();
@@ -1333,21 +1684,16 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
if (!ApertureReg.isValid())
return false;
- Register CmpRes = MRI.createGenericVirtualRegister(LLT::scalar(1));
- B.buildICmp(CmpInst::ICMP_NE, CmpRes, Src, SegmentNull.getReg(0));
-
- Register BuildPtr = MRI.createGenericVirtualRegister(DstTy);
+ auto CmpRes =
+ B.buildICmp(CmpInst::ICMP_NE, LLT::scalar(1), Src, SegmentNull.getReg(0));
// Coerce the type of the low half of the result so we can use merge_values.
- Register SrcAsInt = MRI.createGenericVirtualRegister(S32);
- B.buildInstr(TargetOpcode::G_PTRTOINT)
- .addDef(SrcAsInt)
- .addUse(Src);
+ Register SrcAsInt = B.buildPtrToInt(S32, Src).getReg(0);
// TODO: Should we allow mismatched types but matching sizes in merges to
// avoid the ptrtoint?
- B.buildMerge(BuildPtr, {SrcAsInt, ApertureReg});
- B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull.getReg(0));
+ auto BuildPtr = B.buildMerge(DstTy, {SrcAsInt, ApertureReg});
+ B.buildSelect(Dst, CmpRes, BuildPtr, FlatNull);
MI.eraseFromParent();
return true;
@@ -1356,8 +1702,6 @@ bool AMDGPULegalizerInfo::legalizeAddrSpaceCast(
bool AMDGPULegalizerInfo::legalizeFrint(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
-
Register Src = MI.getOperand(1).getReg();
LLT Ty = MRI.getType(Src);
assert(Ty.isScalar() && Ty.getSizeInBits() == 64);
@@ -1383,7 +1727,6 @@ bool AMDGPULegalizerInfo::legalizeFrint(
bool AMDGPULegalizerInfo::legalizeFceil(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
const LLT S1 = LLT::scalar(1);
const LLT S64 = LLT::scalar(64);
@@ -1395,7 +1738,7 @@ bool AMDGPULegalizerInfo::legalizeFceil(
// if (src > 0.0 && src != result)
// result += 1.0
- auto Trunc = B.buildInstr(TargetOpcode::G_INTRINSIC_TRUNC, {S64}, {Src});
+ auto Trunc = B.buildIntrinsicTrunc(S64, Src);
const auto Zero = B.buildFConstant(S64, 0.0);
const auto One = B.buildFConstant(S64, 1.0);
@@ -1428,8 +1771,6 @@ static MachineInstrBuilder extractF64Exponent(unsigned Hi,
bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
-
const LLT S1 = LLT::scalar(1);
const LLT S32 = LLT::scalar(32);
const LLT S64 = LLT::scalar(64);
@@ -1456,7 +1797,7 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
const auto Zero32 = B.buildConstant(S32, 0);
// Extend back to 64-bits.
- auto SignBit64 = B.buildMerge(S64, {Zero32.getReg(0), SignBit.getReg(0)});
+ auto SignBit64 = B.buildMerge(S64, {Zero32, SignBit});
auto Shr = B.buildAShr(S64, FractMask, Exp);
auto Not = B.buildNot(S64, Shr);
@@ -1474,7 +1815,6 @@ bool AMDGPULegalizerInfo::legalizeIntrinsicTrunc(
bool AMDGPULegalizerInfo::legalizeITOFP(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool Signed) const {
- B.setInstr(MI);
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
@@ -1503,10 +1843,44 @@ bool AMDGPULegalizerInfo::legalizeITOFP(
return true;
}
-bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
+// TODO: Copied from DAG implementation. Verify logic and document how this
+// actually works.
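+//
+// Rough sketch (constants decoded from their bit patterns): K0 == 2^-32 and
+// K1 == -2^32, so the sequence computes
+//   Trunc    = trunc(Src)
+//   FloorMul = floor(Trunc * 2^-32)         // high 32 bits of the result
+//   Fma      = fma(FloorMul, -2^32, Trunc)  // remaining low 32 bits
+// and merges {fptoui(Fma), fptoui/fptosi(FloorMul)} into the s64 result.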
+bool AMDGPULegalizerInfo::legalizeFPTOI(
MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
- MachineFunction &MF = B.getMF();
+ MachineIRBuilder &B, bool Signed) const {
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ assert(MRI.getType(Src) == S64 && MRI.getType(Dst) == S64);
+
+ unsigned Flags = MI.getFlags();
+
+ auto Trunc = B.buildIntrinsicTrunc(S64, Src, Flags);
+ auto K0 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0x3df0000000000000)));
+ auto K1 = B.buildFConstant(S64, BitsToDouble(UINT64_C(0xc1f0000000000000)));
+
+ auto Mul = B.buildFMul(S64, Trunc, K0, Flags);
+ auto FloorMul = B.buildFFloor(S64, Mul, Flags);
+ auto Fma = B.buildFMA(S64, FloorMul, K1, Trunc, Flags);
+
+ auto Hi = Signed ?
+ B.buildFPTOSI(S32, FloorMul) :
+ B.buildFPTOUI(S32, FloorMul);
+ auto Lo = B.buildFPTOUI(S32, Fma);
+
+ B.buildMerge(Dst, { Lo, Hi });
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineFunction &MF = Helper.MIRBuilder.getMF();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const bool IsIEEEOp = MI.getOpcode() == AMDGPU::G_FMINNUM_IEEE ||
@@ -1520,10 +1894,6 @@ bool AMDGPULegalizerInfo::legalizeMinNumMaxNum(
if (IsIEEEOp)
return true;
- MachineIRBuilder HelperBuilder(MI);
- GISelObserverWrapper DummyObserver;
- LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
- HelperBuilder.setInstr(MI);
return Helper.lowerFMinNumMaxNum(MI) == LegalizerHelper::Legalized;
}
@@ -1533,8 +1903,12 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
// TODO: Should move some of this into LegalizerHelper.
// TODO: Promote dynamic indexing of s16 to s32
- // TODO: Dynamic s64 indexing is only legal for SGPR.
- Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(2).getReg(), MRI);
+
+ // FIXME: Artifact combiner probably should have replaced the truncated
+ // constant before this, so we shouldn't need
+ // getConstantVRegValWithLookThrough.
+ Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
+ MI.getOperand(2).getReg(), MRI);
if (!IdxVal) // Dynamic case will be selected to register indexing.
return true;
@@ -1545,10 +1919,8 @@ bool AMDGPULegalizerInfo::legalizeExtractVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Dst));
- B.setInstr(MI);
-
- if (IdxVal.getValue() < VecTy.getNumElements())
- B.buildExtract(Dst, Vec, IdxVal.getValue() * EltTy.getSizeInBits());
+ if (IdxVal->Value < VecTy.getNumElements())
+ B.buildExtract(Dst, Vec, IdxVal->Value * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -1562,8 +1934,12 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
// TODO: Should move some of this into LegalizerHelper.
// TODO: Promote dynamic indexing of s16 to s32
- // TODO: Dynamic s64 indexing is only legal for SGPR.
- Optional<int64_t> IdxVal = getConstantVRegVal(MI.getOperand(3).getReg(), MRI);
+
+ // FIXME: Artifact combiner probably should have replaced the truncated
+ // constant before this, so we shouldn't need
+ // getConstantVRegValWithLookThrough.
+ Optional<ValueAndVReg> IdxVal = getConstantVRegValWithLookThrough(
+ MI.getOperand(3).getReg(), MRI);
if (!IdxVal) // Dynamic case will be selected to register indexing.
return true;
@@ -1575,10 +1951,8 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
LLT EltTy = VecTy.getElementType();
assert(EltTy == MRI.getType(Ins));
- B.setInstr(MI);
-
- if (IdxVal.getValue() < VecTy.getNumElements())
- B.buildInsert(Dst, Vec, Ins, IdxVal.getValue() * EltTy.getSizeInBits());
+ if (IdxVal->Value < VecTy.getNumElements())
+ B.buildInsert(Dst, Vec, Ins, IdxVal->Value * EltTy.getSizeInBits());
else
B.buildUndef(Dst);
@@ -1586,10 +1960,29 @@ bool AMDGPULegalizerInfo::legalizeInsertVectorElt(
return true;
}
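+// v2s16 shuffles whose mask passes isLegalVOP3PShuffleMask are assumed to be
+// directly selectable (via VOP3P op_sel-style operand modifiers) and are kept
+// as-is; everything else falls back to the generic LegalizerHelper expansion.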
+bool AMDGPULegalizerInfo::legalizeShuffleVector(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ LLT DstTy = MRI.getType(Dst);
+ LLT SrcTy = MRI.getType(Src0);
+
+ if (SrcTy == V2S16 && DstTy == V2S16 &&
+ AMDGPU::isLegalVOP3PShuffleMask(MI.getOperand(3).getShuffleMask()))
+ return true;
+
+ MachineIRBuilder HelperBuilder(MI);
+ GISelObserverWrapper DummyObserver;
+ LegalizerHelper Helper(B.getMF(), DummyObserver, HelperBuilder);
+ return Helper.lowerShuffleVector(MI) == LegalizerHelper::Legalized;
+}
+
bool AMDGPULegalizerInfo::legalizeSinCos(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
@@ -1597,7 +1990,7 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
unsigned Flags = MI.getFlags();
Register TrigVal;
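+  // The hardware sin/cos take their input scaled by 1/(2*pi), i.e. in
+  // revolutions rather than radians, which is presumably why the source is
+  // pre-multiplied by 0.5 * inv_pi; subtargets with hasTrigReducedRange()
+  // additionally reduce the argument with amdgcn.fract.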
- auto OneOver2Pi = B.buildFConstant(Ty, 0.5 / M_PI);
+ auto OneOver2Pi = B.buildFConstant(Ty, 0.5 * numbers::inv_pi);
if (ST.hasTrigReducedRange()) {
auto MulVal = B.buildFMul(Ty, SrcReg, OneOver2Pi, Flags);
TrigVal = B.buildIntrinsic(Intrinsic::amdgcn_fract, {Ty}, false)
@@ -1615,10 +2008,12 @@ bool AMDGPULegalizerInfo::legalizeSinCos(
return true;
}
-bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(
- Register DstReg, LLT PtrTy,
- MachineIRBuilder &B, const GlobalValue *GV,
- unsigned Offset, unsigned GAFlags) const {
+bool AMDGPULegalizerInfo::buildPCRelGlobalAddress(Register DstReg, LLT PtrTy,
+ MachineIRBuilder &B,
+ const GlobalValue *GV,
+ int64_t Offset,
+ unsigned GAFlags) const {
+ assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
// In order to support pc-relative addressing, SI_PC_ADD_REL_OFFSET is lowered
// to the following code sequence:
//
@@ -1681,19 +2076,37 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
const GlobalValue *GV = MI.getOperand(1).getGlobal();
MachineFunction &MF = B.getMF();
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- B.setInstr(MI);
if (AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS) {
if (!MFI->isEntryFunction()) {
const Function &Fn = MF.getFunction();
DiagnosticInfoUnsupported BadLDSDecl(
- Fn, "local memory global used by non-kernel function", MI.getDebugLoc());
+ Fn, "local memory global used by non-kernel function", MI.getDebugLoc(),
+ DS_Warning);
Fn.getContext().diagnose(BadLDSDecl);
+
+ // We currently don't have a way to correctly allocate LDS objects that
+ // aren't directly associated with a kernel. We do force inlining of
+ // functions that use local objects. However, if these dead functions are
+ // not eliminated, we don't want a compile time error. Just emit a warning
+ // and a trap, since there should be no callable path here.
+ B.buildIntrinsic(Intrinsic::trap, ArrayRef<Register>(), true);
+ B.buildUndef(DstReg);
+ MI.eraseFromParent();
+ return true;
}
// TODO: We could emit code to handle the initialization somewhere.
if (!AMDGPUTargetLowering::hasDefinedInitializer(GV)) {
- B.buildConstant(DstReg, MFI->allocateLDSGlobal(B.getDataLayout(), *GV));
+ const SITargetLowering *TLI = ST.getTargetLowering();
+ if (!TLI->shouldUseLDSConstAddress(GV)) {
+ MI.getOperand(1).setTargetFlags(SIInstrInfo::MO_ABS32_LO);
+        return true; // Leave in place.
+ }
+
+ B.buildConstant(
+ DstReg,
+ MFI->allocateLDSGlobal(B.getDataLayout(), *cast<GlobalVariable>(GV)));
MI.eraseFromParent();
return true;
}
@@ -1723,10 +2136,10 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
Register GOTAddr = MRI.createGenericVirtualRegister(PtrTy);
MachineMemOperand *GOTMMO = MF.getMachineMemOperand(
- MachinePointerInfo::getGOT(MF),
- MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant,
- 8 /*Size*/, 8 /*Align*/);
+ MachinePointerInfo::getGOT(MF),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ 8 /*Size*/, Align(8));
buildPCRelGlobalAddress(GOTAddr, PtrTy, B, GV, 0, SIInstrInfo::MO_GOTPCREL32);
@@ -1744,7 +2157,6 @@ bool AMDGPULegalizerInfo::legalizeGlobalValue(
bool AMDGPULegalizerInfo::legalizeLoad(
MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, GISelChangeObserver &Observer) const {
- B.setInstr(MI);
LLT ConstPtr = LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64);
auto Cast = B.buildAddrSpaceCast(ConstPtr, MI.getOperand(1).getReg());
Observer.changingInstr(MI);
@@ -1763,16 +2175,15 @@ bool AMDGPULegalizerInfo::legalizeFMad(
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// TODO: Always legal with future ftz flag.
- if (Ty == LLT::scalar(32) && !MFI->getMode().FP32Denormals)
+ // FIXME: Do we need just output?
+ if (Ty == LLT::scalar(32) && !MFI->getMode().allFP32Denormals())
return true;
- if (Ty == LLT::scalar(16) && !MFI->getMode().FP64FP16Denormals)
+ if (Ty == LLT::scalar(16) && !MFI->getMode().allFP64FP16Denormals())
return true;
-
MachineIRBuilder HelperBuilder(MI);
GISelObserverWrapper DummyObserver;
LegalizerHelper Helper(MF, DummyObserver, HelperBuilder);
- HelperBuilder.setMBB(*MI.getParent());
return Helper.lowerFMad(MI) == LegalizerHelper::Legalized;
}
@@ -1790,7 +2201,6 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
LLT ValTy = MRI.getType(CmpVal);
LLT VecTy = LLT::vector(2, ValTy);
- B.setInstr(MI);
Register PackedVal = B.buildBuildVector(VecTy, { NewVal, CmpVal }).getReg(0);
B.buildInstr(AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG)
@@ -1803,39 +2213,248 @@ bool AMDGPULegalizerInfo::legalizeAtomicCmpXChg(
return true;
}
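+// Expand a log of arbitrary base in terms of log2:
+//   log_b(x) = log2(x) * (1 / log2(b))
+// The caller passes the precomputed factor: ln(2) for G_FLOG and
+// ln(2)/ln(10) for G_FLOG10 (see the dispatch in legalizeCustom).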
+bool AMDGPULegalizerInfo::legalizeFlog(
+ MachineInstr &MI, MachineIRBuilder &B, double Log2BaseInverted) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ LLT Ty = B.getMRI()->getType(Dst);
+ unsigned Flags = MI.getFlags();
+
+ auto Log2Operand = B.buildFLog2(Ty, Src, Flags);
+ auto Log2BaseInvertedOperand = B.buildFConstant(Ty, Log2BaseInverted);
+
+ B.buildFMul(Dst, Log2Operand, Log2BaseInvertedOperand, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
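+// Expand exp in terms of exp2: exp(x) = exp2(x * log2(e)).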
+bool AMDGPULegalizerInfo::legalizeFExp(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+ LLT Ty = B.getMRI()->getType(Dst);
+
+ auto K = B.buildFConstant(Ty, numbers::log2e);
+ auto Mul = B.buildFMul(Ty, Src, K, Flags);
+ B.buildFExp2(Dst, Mul, Flags);
+ MI.eraseFromParent();
+ return true;
+}
+
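+// Expand pow via exp2/log2: pow(x, y) = exp2(y * log2(x)). The multiply uses
+// the amdgcn.fmul.legacy intrinsic (with a round trip through f32 for f16,
+// since there is no f16 legacy multiply), presumably to match the DAG
+// lowering's handling of the zero/infinity special cases.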
+bool AMDGPULegalizerInfo::legalizeFPow(MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ unsigned Flags = MI.getFlags();
+ LLT Ty = B.getMRI()->getType(Dst);
+ const LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+
+ if (Ty == S32) {
+ auto Log = B.buildFLog2(S32, Src0, Flags);
+ auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
+ .addUse(Log.getReg(0))
+ .addUse(Src1)
+ .setMIFlags(Flags);
+ B.buildFExp2(Dst, Mul, Flags);
+ } else if (Ty == S16) {
+ // There's no f16 fmul_legacy, so we need to convert for it.
+ auto Log = B.buildFLog2(S16, Src0, Flags);
+ auto Ext0 = B.buildFPExt(S32, Log, Flags);
+ auto Ext1 = B.buildFPExt(S32, Src1, Flags);
+ auto Mul = B.buildIntrinsic(Intrinsic::amdgcn_fmul_legacy, {S32}, false)
+ .addUse(Ext0.getReg(0))
+ .addUse(Ext1.getReg(0))
+ .setMIFlags(Flags);
+
+ B.buildFExp2(Dst, B.buildFPTrunc(S16, Mul), Flags);
+ } else
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// Find a source register, ignoring any possible source modifiers.
+static Register stripAnySourceMods(Register OrigSrc, MachineRegisterInfo &MRI) {
+ Register ModSrc = OrigSrc;
+ if (MachineInstr *SrcFNeg = getOpcodeDef(AMDGPU::G_FNEG, ModSrc, MRI)) {
+ ModSrc = SrcFNeg->getOperand(1).getReg();
+ if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+ ModSrc = SrcFAbs->getOperand(1).getReg();
+ } else if (MachineInstr *SrcFAbs = getOpcodeDef(AMDGPU::G_FABS, ModSrc, MRI))
+ ModSrc = SrcFAbs->getOperand(1).getReg();
+ return ModSrc;
+}
+
+bool AMDGPULegalizerInfo::legalizeFFloor(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+
+ const LLT S1 = LLT::scalar(1);
+ const LLT S64 = LLT::scalar(64);
+ Register Dst = MI.getOperand(0).getReg();
+ Register OrigSrc = MI.getOperand(1).getReg();
+ unsigned Flags = MI.getFlags();
+ assert(ST.hasFractBug() && MRI.getType(Dst) == S64 &&
+ "this should not have been custom lowered");
+
+ // V_FRACT is buggy on SI, so the F32 version is never used and (x-floor(x))
+ // is used instead. However, SI doesn't have V_FLOOR_F64, so the most
+ // efficient way to implement it is using V_FRACT_F64. The workaround for the
+ // V_FRACT bug is:
+ // fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
+ //
+ // Convert floor(x) to (x - fract(x))
+
+ auto Fract = B.buildIntrinsic(Intrinsic::amdgcn_fract, {S64}, false)
+ .addUse(OrigSrc)
+ .setMIFlags(Flags);
+
+ // Give source modifier matching some assistance before obscuring a foldable
+ // pattern.
+
+  // TODO: Can we avoid the neg on the fract? The input sign to fract
+  // shouldn't matter.
+ Register ModSrc = stripAnySourceMods(OrigSrc, MRI);
+
+ auto Const = B.buildFConstant(S64, BitsToDouble(0x3fefffffffffffff));
+
+ Register Min = MRI.createGenericVirtualRegister(S64);
+
+ // We don't need to concern ourselves with the snan handling difference, so
+ // use the one which will directly select.
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ if (MFI->getMode().IEEE)
+ B.buildFMinNumIEEE(Min, Fract, Const, Flags);
+ else
+ B.buildFMinNum(Min, Fract, Const, Flags);
+
+ Register CorrectedFract = Min;
+ if (!MI.getFlag(MachineInstr::FmNoNans)) {
+ auto IsNan = B.buildFCmp(CmpInst::FCMP_ORD, S1, ModSrc, ModSrc, Flags);
+ CorrectedFract = B.buildSelect(S64, IsNan, ModSrc, Min, Flags).getReg(0);
+ }
+
+ auto NegFract = B.buildFNeg(S64, CorrectedFract, Flags);
+ B.buildFAdd(Dst, OrigSrc, NegFract, Flags);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+// Turn an illegal packed v2s16 build vector into bit operations.
+// TODO: This should probably be a bitcast action in LegalizerHelper.
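+//
+// Minimal sketch of the expansion: merge the two s16 sources into an s32
+// (G_MERGE_VALUES puts the first source in the low 16 bits) and bitcast the
+// result back to v2s16, so element 0 lands in the low half.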
+bool AMDGPULegalizerInfo::legalizeBuildVector(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ Register Dst = MI.getOperand(0).getReg();
+ const LLT S32 = LLT::scalar(32);
+ assert(MRI.getType(Dst) == LLT::vector(2, 16));
+
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+ assert(MRI.getType(Src0) == LLT::scalar(16));
+
+ auto Merge = B.buildMerge(S32, {Src0, Src1});
+ B.buildBitcast(Dst, Merge);
+
+ MI.eraseFromParent();
+ return true;
+}
+
// Return the use branch instruction, otherwise null if the usage is invalid.
static MachineInstr *verifyCFIntrinsic(MachineInstr &MI,
MachineRegisterInfo &MRI,
- MachineInstr *&Br) {
+ MachineInstr *&Br,
+ MachineBasicBlock *&UncondBrTarget) {
Register CondDef = MI.getOperand(0).getReg();
if (!MRI.hasOneNonDBGUse(CondDef))
return nullptr;
+ MachineBasicBlock *Parent = MI.getParent();
MachineInstr &UseMI = *MRI.use_instr_nodbg_begin(CondDef);
- if (UseMI.getParent() != MI.getParent() ||
+ if (UseMI.getParent() != Parent ||
UseMI.getOpcode() != AMDGPU::G_BRCOND)
return nullptr;
- // Make sure the cond br is followed by a G_BR
+ // Make sure the cond br is followed by a G_BR, or is the last instruction.
MachineBasicBlock::iterator Next = std::next(UseMI.getIterator());
- if (Next != MI.getParent()->end()) {
+ if (Next == Parent->end()) {
+ MachineFunction::iterator NextMBB = std::next(Parent->getIterator());
+ if (NextMBB == Parent->getParent()->end()) // Illegal intrinsic use.
+ return nullptr;
+ UncondBrTarget = &*NextMBB;
+ } else {
if (Next->getOpcode() != AMDGPU::G_BR)
return nullptr;
Br = &*Next;
+ UncondBrTarget = Br->getOperand(0).getMBB();
}
return &UseMI;
}
-Register AMDGPULegalizerInfo::getLiveInRegister(MachineRegisterInfo &MRI,
- Register Reg, LLT Ty) const {
- Register LiveIn = MRI.getLiveInVirtReg(Reg);
- if (LiveIn)
+Register AMDGPULegalizerInfo::insertLiveInCopy(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register LiveIn,
+ Register PhyReg) const {
+ assert(PhyReg.isPhysical() && "Physical register expected");
+
+  // Insert the live-in copy, if required, by defining the destination virtual
+  // register.
+ // FIXME: It seems EmitLiveInCopies isn't called anywhere?
+ if (!MRI.getVRegDef(LiveIn)) {
+ // FIXME: Should have scoped insert pt
+ MachineBasicBlock &OrigInsBB = B.getMBB();
+ auto OrigInsPt = B.getInsertPt();
+
+ MachineBasicBlock &EntryMBB = B.getMF().front();
+ EntryMBB.addLiveIn(PhyReg);
+ B.setInsertPt(EntryMBB, EntryMBB.begin());
+ B.buildCopy(LiveIn, PhyReg);
+
+ B.setInsertPt(OrigInsBB, OrigInsPt);
+ }
+
+ return LiveIn;
+}
+
+Register AMDGPULegalizerInfo::getLiveInRegister(MachineIRBuilder &B,
+ MachineRegisterInfo &MRI,
+ Register PhyReg, LLT Ty,
+ bool InsertLiveInCopy) const {
+ assert(PhyReg.isPhysical() && "Physical register expected");
+
+  // Get or create the virtual live-in register.
+ Register LiveIn = MRI.getLiveInVirtReg(PhyReg);
+ if (!LiveIn) {
+ LiveIn = MRI.createGenericVirtualRegister(Ty);
+ MRI.addLiveIn(PhyReg, LiveIn);
+ }
+
+  // When the actual copy required is from a virtual register to a physical
+  // register (to be inserted later), inserting the live-in copy from the
+  // physical register to the virtual register is not required.
+ if (!InsertLiveInCopy)
return LiveIn;
- Register NewReg = MRI.createGenericVirtualRegister(Ty);
- MRI.addLiveIn(Reg, NewReg);
- return NewReg;
+ return insertLiveInCopy(B, MRI, LiveIn, PhyReg);
+}
+
+const ArgDescriptor *AMDGPULegalizerInfo::getArgDescriptor(
+ MachineIRBuilder &B, AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
+ const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ const ArgDescriptor *Arg;
+ const TargetRegisterClass *RC;
+ LLT ArgTy;
+ std::tie(Arg, RC, ArgTy) = MFI->getPreloadedValue(ArgType);
+ if (!Arg) {
+ LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+ return nullptr;
+ }
+ return Arg;
}
bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
@@ -1843,12 +2462,14 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
if (!Arg->isRegister() || !Arg->getRegister().isValid())
return false; // TODO: Handle these
- assert(Arg->getRegister().isPhysical());
+ Register SrcReg = Arg->getRegister();
+ assert(SrcReg.isPhysical() && "Physical register expected");
+ assert(DstReg.isVirtual() && "Virtual register expected");
MachineRegisterInfo &MRI = *B.getMRI();
LLT Ty = MRI.getType(DstReg);
- Register LiveIn = getLiveInRegister(MRI, Arg->getRegister(), Ty);
+ Register LiveIn = getLiveInRegister(B, MRI, SrcReg, Ty);
if (Arg->isMasked()) {
// TODO: Should we try to emit this once in the entry block?
@@ -1864,56 +2485,31 @@ bool AMDGPULegalizerInfo::loadInputValue(Register DstReg, MachineIRBuilder &B,
}
B.buildAnd(DstReg, AndMaskSrc, B.buildConstant(S32, Mask >> Shift));
- } else
+ } else {
B.buildCopy(DstReg, LiveIn);
-
- // Insert the argument copy if it doens't already exist.
- // FIXME: It seems EmitLiveInCopies isn't called anywhere?
- if (!MRI.getVRegDef(LiveIn)) {
- // FIXME: Should have scoped insert pt
- MachineBasicBlock &OrigInsBB = B.getMBB();
- auto OrigInsPt = B.getInsertPt();
-
- MachineBasicBlock &EntryMBB = B.getMF().front();
- EntryMBB.addLiveIn(Arg->getRegister());
- B.setInsertPt(EntryMBB, EntryMBB.begin());
- B.buildCopy(LiveIn, Arg->getRegister());
-
- B.setInsertPt(OrigInsBB, OrigInsPt);
}
return true;
}
bool AMDGPULegalizerInfo::legalizePreloadedArgIntrin(
- MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
- B.setInstr(MI);
-
- const SIMachineFunctionInfo *MFI = B.getMF().getInfo<SIMachineFunctionInfo>();
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const {
- const ArgDescriptor *Arg;
- const TargetRegisterClass *RC;
- std::tie(Arg, RC) = MFI->getPreloadedValue(ArgType);
- if (!Arg) {
- LLVM_DEBUG(dbgs() << "Required arg register missing\n");
+ const ArgDescriptor *Arg = getArgDescriptor(B, ArgType);
+ if (!Arg)
return false;
- }
- if (loadInputValue(MI.getOperand(0).getReg(), B, Arg)) {
- MI.eraseFromParent();
- return true;
- }
+ if (!loadInputValue(MI.getOperand(0).getReg(), B, Arg))
+ return false;
- return false;
+ MI.eraseFromParent();
+ return true;
}
bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Dst = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(Dst);
LLT S16 = LLT::scalar(16);
@@ -1933,6 +2529,284 @@ bool AMDGPULegalizerInfo::legalizeFDIV(MachineInstr &MI,
return false;
}
+void AMDGPULegalizerInfo::legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
+ Register DstReg,
+ Register X,
+ Register Y,
+ bool IsDiv) const {
+ const LLT S1 = LLT::scalar(1);
+ const LLT S32 = LLT::scalar(32);
+
+ // See AMDGPUCodeGenPrepare::expandDivRem32 for a description of the
+ // algorithm used here.
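+  // Roughly: estimate z ~= 2^32 / y from the hardware reciprocal, refine it
+  // with one Newton-Raphson style round using -y ("UNR" below), form
+  // q = umulh(x, z) and r = x - q * y, then apply up to two conditional
+  // corrections, since the estimate may leave the remainder off by a couple
+  // of multiples of y.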
+
+ // Initial estimate of inv(y).
+ auto FloatY = B.buildUITOFP(S32, Y);
+ auto RcpIFlag = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {FloatY});
+ auto Scale = B.buildFConstant(S32, BitsToFloat(0x4f7ffffe));
+ auto ScaledY = B.buildFMul(S32, RcpIFlag, Scale);
+ auto Z = B.buildFPTOUI(S32, ScaledY);
+
+ // One round of UNR.
+ auto NegY = B.buildSub(S32, B.buildConstant(S32, 0), Y);
+ auto NegYZ = B.buildMul(S32, NegY, Z);
+ Z = B.buildAdd(S32, Z, B.buildUMulH(S32, Z, NegYZ));
+
+ // Quotient/remainder estimate.
+ auto Q = B.buildUMulH(S32, X, Z);
+ auto R = B.buildSub(S32, X, B.buildMul(S32, Q, Y));
+
+ // First quotient/remainder refinement.
+ auto One = B.buildConstant(S32, 1);
+ auto Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ if (IsDiv)
+ Q = B.buildSelect(S32, Cond, B.buildAdd(S32, Q, One), Q);
+ R = B.buildSelect(S32, Cond, B.buildSub(S32, R, Y), R);
+
+ // Second quotient/remainder refinement.
+ Cond = B.buildICmp(CmpInst::ICMP_UGE, S1, R, Y);
+ if (IsDiv)
+ B.buildSelect(DstReg, Cond, B.buildAdd(S32, Q, One), Q);
+ else
+ B.buildSelect(DstReg, Cond, B.buildSub(S32, R, Y), R);
+}
+
+bool AMDGPULegalizerInfo::legalizeUDIV_UREM32(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Num = MI.getOperand(1).getReg();
+ Register Den = MI.getOperand(2).getReg();
+ legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
+ MI.eraseFromParent();
+ return true;
+}
+
+// Build integer reciprocal sequence around V_RCP_IFLAG_F32
+//
+// Return lo, hi of result
+//
+// %cvt.lo = G_UITOFP Val.lo
+// %cvt.hi = G_UITOFP Val.hi
+// %mad = G_FMAD %cvt.hi, 2**32, %cvt.lo
+// %rcp = G_AMDGPU_RCP_IFLAG %mad
+// %mul1 = G_FMUL %rcp, 0x5f7ffffc
+// %mul2 = G_FMUL %mul1, 2**(-32)
+// %trunc = G_INTRINSIC_TRUNC %mul2
+// %mad2 = G_FMAD %trunc, -(2**32), %mul1
+// return {G_FPTOUI %mad2, G_FPTOUI %trunc}
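+//
+// Put differently (a rough reading of the constants): this approximates
+// 2^64 / Val as a 64-bit fixed-point reciprocal and returns it split into
+// {low 32 bits, high 32 bits}. 0x5f7ffffc is just below 2^64 as a float,
+// presumably chosen so the estimate does not overshoot.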
+static std::pair<Register, Register> emitReciprocalU64(MachineIRBuilder &B,
+ Register Val) {
+ const LLT S32 = LLT::scalar(32);
+ auto Unmerge = B.buildUnmerge(S32, Val);
+
+ auto CvtLo = B.buildUITOFP(S32, Unmerge.getReg(0));
+ auto CvtHi = B.buildUITOFP(S32, Unmerge.getReg(1));
+
+ auto Mad = B.buildFMAD(S32, CvtHi, // 2**32
+ B.buildFConstant(S32, BitsToFloat(0x4f800000)), CvtLo);
+
+ auto Rcp = B.buildInstr(AMDGPU::G_AMDGPU_RCP_IFLAG, {S32}, {Mad});
+ auto Mul1 =
+ B.buildFMul(S32, Rcp, B.buildFConstant(S32, BitsToFloat(0x5f7ffffc)));
+
+ // 2**(-32)
+ auto Mul2 =
+ B.buildFMul(S32, Mul1, B.buildFConstant(S32, BitsToFloat(0x2f800000)));
+ auto Trunc = B.buildIntrinsicTrunc(S32, Mul2);
+
+ // -(2**32)
+ auto Mad2 = B.buildFMAD(S32, Trunc,
+ B.buildFConstant(S32, BitsToFloat(0xcf800000)), Mul1);
+
+ auto ResultLo = B.buildFPTOUI(S32, Mad2);
+ auto ResultHi = B.buildFPTOUI(S32, Trunc);
+
+ return {ResultLo.getReg(0), ResultHi.getReg(0)};
+}
+
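+// A rough sketch of the 64-bit expansion: start from the approximate
+// 2^64 / Denom reciprocal built above, refine it with two multiply/add rounds
+// carried out in 32-bit pieces (the UAddo/UAdde chains), take
+// umulh(Numer, refined reciprocal) as the quotient estimate, compute the
+// remainder, and apply up to two conditional corrections (the C3/C6 selects)
+// to produce the final quotient or remainder.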
+void AMDGPULegalizerInfo::legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
+ Register DstReg,
+ Register Numer,
+ Register Denom,
+ bool IsDiv) const {
+ const LLT S32 = LLT::scalar(32);
+ const LLT S64 = LLT::scalar(64);
+ const LLT S1 = LLT::scalar(1);
+ Register RcpLo, RcpHi;
+
+ std::tie(RcpLo, RcpHi) = emitReciprocalU64(B, Denom);
+
+ auto Rcp = B.buildMerge(S64, {RcpLo, RcpHi});
+
+ auto Zero64 = B.buildConstant(S64, 0);
+ auto NegDenom = B.buildSub(S64, Zero64, Denom);
+
+ auto MulLo1 = B.buildMul(S64, NegDenom, Rcp);
+ auto MulHi1 = B.buildUMulH(S64, Rcp, MulLo1);
+
+ auto UnmergeMulHi1 = B.buildUnmerge(S32, MulHi1);
+ Register MulHi1_Lo = UnmergeMulHi1.getReg(0);
+ Register MulHi1_Hi = UnmergeMulHi1.getReg(1);
+
+ auto Add1_Lo = B.buildUAddo(S32, S1, RcpLo, MulHi1_Lo);
+ auto Add1_Hi = B.buildUAdde(S32, S1, RcpHi, MulHi1_Hi, Add1_Lo.getReg(1));
+ auto Add1_HiNc = B.buildAdd(S32, RcpHi, MulHi1_Hi);
+ auto Add1 = B.buildMerge(S64, {Add1_Lo, Add1_Hi});
+
+ auto MulLo2 = B.buildMul(S64, NegDenom, Add1);
+ auto MulHi2 = B.buildUMulH(S64, Add1, MulLo2);
+ auto UnmergeMulHi2 = B.buildUnmerge(S32, MulHi2);
+ Register MulHi2_Lo = UnmergeMulHi2.getReg(0);
+ Register MulHi2_Hi = UnmergeMulHi2.getReg(1);
+
+ auto Zero32 = B.buildConstant(S32, 0);
+ auto Add2_Lo = B.buildUAddo(S32, S1, Add1_Lo, MulHi2_Lo);
+ auto Add2_HiC =
+ B.buildUAdde(S32, S1, Add1_HiNc, MulHi2_Hi, Add1_Lo.getReg(1));
+ auto Add2_Hi = B.buildUAdde(S32, S1, Add2_HiC, Zero32, Add2_Lo.getReg(1));
+ auto Add2 = B.buildMerge(S64, {Add2_Lo, Add2_Hi});
+
+ auto UnmergeNumer = B.buildUnmerge(S32, Numer);
+ Register NumerLo = UnmergeNumer.getReg(0);
+ Register NumerHi = UnmergeNumer.getReg(1);
+
+ auto MulHi3 = B.buildUMulH(S64, Numer, Add2);
+ auto Mul3 = B.buildMul(S64, Denom, MulHi3);
+ auto UnmergeMul3 = B.buildUnmerge(S32, Mul3);
+ Register Mul3_Lo = UnmergeMul3.getReg(0);
+ Register Mul3_Hi = UnmergeMul3.getReg(1);
+ auto Sub1_Lo = B.buildUSubo(S32, S1, NumerLo, Mul3_Lo);
+ auto Sub1_Hi = B.buildUSube(S32, S1, NumerHi, Mul3_Hi, Sub1_Lo.getReg(1));
+ auto Sub1_Mi = B.buildSub(S32, NumerHi, Mul3_Hi);
+ auto Sub1 = B.buildMerge(S64, {Sub1_Lo, Sub1_Hi});
+
+ auto UnmergeDenom = B.buildUnmerge(S32, Denom);
+ Register DenomLo = UnmergeDenom.getReg(0);
+ Register DenomHi = UnmergeDenom.getReg(1);
+
+ auto CmpHi = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Hi, DenomHi);
+ auto C1 = B.buildSExt(S32, CmpHi);
+
+ auto CmpLo = B.buildICmp(CmpInst::ICMP_UGE, S1, Sub1_Lo, DenomLo);
+ auto C2 = B.buildSExt(S32, CmpLo);
+
+ auto CmpEq = B.buildICmp(CmpInst::ICMP_EQ, S1, Sub1_Hi, DenomHi);
+ auto C3 = B.buildSelect(S32, CmpEq, C2, C1);
+
+ // TODO: Here and below portions of the code can be enclosed in if/endif.
+ // Currently control flow is unconditional and we have 4 selects after
+ // potential endif to substitute PHIs.
+
+ // if C3 != 0 ...
+ auto Sub2_Lo = B.buildUSubo(S32, S1, Sub1_Lo, DenomLo);
+ auto Sub2_Mi = B.buildUSube(S32, S1, Sub1_Mi, DenomHi, Sub1_Lo.getReg(1));
+ auto Sub2_Hi = B.buildUSube(S32, S1, Sub2_Mi, Zero32, Sub2_Lo.getReg(1));
+ auto Sub2 = B.buildMerge(S64, {Sub2_Lo, Sub2_Hi});
+
+ auto One64 = B.buildConstant(S64, 1);
+ auto Add3 = B.buildAdd(S64, MulHi3, One64);
+
+ auto C4 =
+ B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Hi, DenomHi));
+ auto C5 =
+ B.buildSExt(S32, B.buildICmp(CmpInst::ICMP_UGE, S1, Sub2_Lo, DenomLo));
+ auto C6 = B.buildSelect(
+ S32, B.buildICmp(CmpInst::ICMP_EQ, S1, Sub2_Hi, DenomHi), C5, C4);
+
+ // if (C6 != 0)
+ auto Add4 = B.buildAdd(S64, Add3, One64);
+ auto Sub3_Lo = B.buildUSubo(S32, S1, Sub2_Lo, DenomLo);
+
+ auto Sub3_Mi = B.buildUSube(S32, S1, Sub2_Mi, DenomHi, Sub2_Lo.getReg(1));
+ auto Sub3_Hi = B.buildUSube(S32, S1, Sub3_Mi, Zero32, Sub3_Lo.getReg(1));
+ auto Sub3 = B.buildMerge(S64, {Sub3_Lo, Sub3_Hi});
+
+ // endif C6
+ // endif C3
+
+ if (IsDiv) {
+ auto Sel1 = B.buildSelect(
+ S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Add4, Add3);
+ B.buildSelect(DstReg,
+ B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel1, MulHi3);
+ } else {
+ auto Sel2 = B.buildSelect(
+ S64, B.buildICmp(CmpInst::ICMP_NE, S1, C6, Zero32), Sub3, Sub2);
+ B.buildSelect(DstReg,
+ B.buildICmp(CmpInst::ICMP_NE, S1, C3, Zero32), Sel2, Sub1);
+ }
+}
+
+bool AMDGPULegalizerInfo::legalizeUDIV_UREM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+ const bool IsDiv = MI.getOpcode() == AMDGPU::G_UDIV;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register Num = MI.getOperand(1).getReg();
+ Register Den = MI.getOperand(2).getReg();
+ LLT Ty = MRI.getType(DstReg);
+
+ if (Ty == S32)
+ legalizeUDIV_UREM32Impl(B, DstReg, Num, Den, IsDiv);
+ else if (Ty == S64)
+ legalizeUDIV_UREM64Impl(B, DstReg, Num, Den, IsDiv);
+ else
+ return false;
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeSDIV_SREM(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ const LLT S64 = LLT::scalar(64);
+ const LLT S32 = LLT::scalar(32);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ const LLT Ty = MRI.getType(DstReg);
+ if (Ty != S32 && Ty != S64)
+ return false;
+
+ const bool IsDiv = MI.getOpcode() == AMDGPU::G_SDIV;
+
+ Register LHS = MI.getOperand(1).getReg();
+ Register RHS = MI.getOperand(2).getReg();
+
+ auto SignBitOffset = B.buildConstant(S32, Ty.getSizeInBits() - 1);
+ auto LHSign = B.buildAShr(Ty, LHS, SignBitOffset);
+ auto RHSign = B.buildAShr(Ty, RHS, SignBitOffset);
+
+ LHS = B.buildAdd(Ty, LHS, LHSign).getReg(0);
+ RHS = B.buildAdd(Ty, RHS, RHSign).getReg(0);
+
+ LHS = B.buildXor(Ty, LHS, LHSign).getReg(0);
+ RHS = B.buildXor(Ty, RHS, RHSign).getReg(0);
+
+ Register UDivRem = MRI.createGenericVirtualRegister(Ty);
+ if (Ty == S32)
+ legalizeUDIV_UREM32Impl(B, UDivRem, LHS, RHS, IsDiv);
+ else
+ legalizeUDIV_UREM64Impl(B, UDivRem, LHS, RHS, IsDiv);
+
+ Register Sign;
+ if (IsDiv)
+ Sign = B.buildXor(Ty, LHSign, RHSign).getReg(0);
+ else
+ Sign = LHSign.getReg(0); // Remainder sign is the same as LHS
+
+ UDivRem = B.buildXor(Ty, UDivRem, Sign).getReg(0);
+ B.buildSub(DstReg, UDivRem, Sign);
+
+ MI.eraseFromParent();
+ return true;
+}
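The sign fixup above follows the standard identity |x| = (x + s) ^ s with s = x >> (bits - 1), and result = (u ^ sign) - sign. A minimal 32-bit sketch, assuming a hypothetical sdiv32Sketch helper in which / and % stand in for the expanded unsigned division and RHS is non-zero:

#include <cstdint>

static int32_t sdiv32Sketch(int32_t LHS, int32_t RHS, bool IsDiv) {
  // Mirrors the G_ASHR by (bit width - 1): all-ones for negative inputs.
  uint32_t LHSign = LHS < 0 ? ~0u : 0u;
  uint32_t RHSign = RHS < 0 ? ~0u : 0u;
  uint32_t L = ((uint32_t)LHS + LHSign) ^ LHSign;  // |LHS|
  uint32_t R = ((uint32_t)RHS + RHSign) ^ RHSign;  // |RHS|
  uint32_t U = IsDiv ? L / R : L % R;              // unsigned div/rem
  uint32_t Sign = IsDiv ? (LHSign ^ RHSign)        // quotient sign
                        : LHSign;                  // remainder follows LHS
  return (int32_t)((U ^ Sign) - Sign);             // conditional negate
}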
+
bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
@@ -1954,7 +2828,7 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
return false;
if (!Unsafe && ResTy == S32 &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().FP32Denormals)
+ MF.getInfo<SIMachineFunctionInfo>()->getMode().allFP32Denormals())
return false;
if (auto CLHS = getConstantFPVRegVal(LHS, MRI)) {
@@ -1997,7 +2871,6 @@ bool AMDGPULegalizerInfo::legalizeFastUnsafeFDIV(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV16(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2035,15 +2908,13 @@ static void toggleSPDenormMode(bool Enable,
AMDGPU::SIModeRegisterDefaults Mode) {
// Set SP denorm mode to this value.
unsigned SPDenormMode =
- Enable ? FP_DENORM_FLUSH_NONE : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ Enable ? FP_DENORM_FLUSH_NONE : Mode.fpDenormModeSPValue();
if (ST.hasDenormModeInst()) {
// Preserve default FP64FP16 denorm mode while updating FP32 mode.
- unsigned DPDenormModeDefault = Mode.FP64FP16Denormals
- ? FP_DENORM_FLUSH_NONE
- : FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ uint32_t DPDenormModeDefault = Mode.fpDenormModeDPValue();
- unsigned NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
+ uint32_t NewDenormModeValue = SPDenormMode | (DPDenormModeDefault << 2);
B.buildInstr(AMDGPU::S_DENORM_MODE)
.addImm(NewDenormModeValue);
@@ -2062,7 +2933,6 @@ static void toggleSPDenormMode(bool Enable,
bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2078,15 +2948,15 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto DenominatorScaled =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
- .addUse(RHS)
.addUse(LHS)
- .addImm(1)
+ .addUse(RHS)
+ .addImm(0)
.setMIFlags(Flags);
auto NumeratorScaled =
B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S32, S1}, false)
.addUse(LHS)
.addUse(RHS)
- .addImm(0)
+ .addImm(1)
.setMIFlags(Flags);
auto ApproxRcp = B.buildIntrinsic(Intrinsic::amdgcn_rcp, {S32}, false)
@@ -2096,7 +2966,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
// FIXME: Doesn't correctly model the FP mode switch, and the FP operations
// aren't modeled as reading it.
- if (!Mode.FP32Denormals)
+ if (!Mode.allFP32Denormals())
toggleSPDenormMode(true, B, ST, Mode);
auto Fma0 = B.buildFMA(S32, NegDivScale0, ApproxRcp, One, Flags);
@@ -2106,7 +2976,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
auto Fma3 = B.buildFMA(S32, Fma2, Fma1, Mul, Flags);
auto Fma4 = B.buildFMA(S32, NegDivScale0, Fma3, NumeratorScaled, Flags);
- if (!Mode.FP32Denormals)
+ if (!Mode.allFP32Denormals())
toggleSPDenormMode(false, B, ST, Mode);
auto Fmas = B.buildIntrinsic(Intrinsic::amdgcn_div_fmas, {S32}, false)
@@ -2129,7 +2999,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV32(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(1).getReg();
Register RHS = MI.getOperand(2).getReg();
@@ -2144,7 +3013,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
auto DivScale0 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
.addUse(LHS)
.addUse(RHS)
- .addImm(1)
+ .addImm(0)
.setMIFlags(Flags);
auto NegDivScale0 = B.buildFNeg(S64, DivScale0.getReg(0), Flags);
@@ -2160,11 +3029,11 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
auto DivScale1 = B.buildIntrinsic(Intrinsic::amdgcn_div_scale, {S64, S1}, false)
.addUse(LHS)
.addUse(RHS)
- .addImm(0)
+ .addImm(1)
.setMIFlags(Flags);
auto Fma3 = B.buildFMA(S64, Fma1, Fma2, Fma1, Flags);
- auto Mul = B.buildMul(S64, DivScale1.getReg(0), Fma3, Flags);
+ auto Mul = B.buildFMul(S64, DivScale1.getReg(0), Fma3, Flags);
auto Fma4 = B.buildFMA(S64, NegDivScale0, Mul, DivScale1.getReg(0), Flags);
Register Scale;
@@ -2172,8 +3041,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
// Workaround a hardware bug on SI where the condition output from div_scale
// is not usable.
- Scale = MRI.createGenericVirtualRegister(S1);
-
LLT S32 = LLT::scalar(32);
auto NumUnmerge = B.buildUnmerge(S32, LHS);
@@ -2185,7 +3052,7 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
Scale1Unmerge.getReg(1));
auto CmpDen = B.buildICmp(ICmpInst::ICMP_EQ, S1, DenUnmerge.getReg(1),
Scale0Unmerge.getReg(1));
- B.buildXor(Scale, CmpNum, CmpDen);
+ Scale = B.buildXor(S1, CmpNum, CmpDen).getReg(0);
} else {
Scale = DivScale1.getReg(1);
}
@@ -2210,7 +3077,6 @@ bool AMDGPULegalizerInfo::legalizeFDIV64(MachineInstr &MI,
bool AMDGPULegalizerInfo::legalizeFDIVFastIntrin(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B) const {
- B.setInstr(MI);
Register Res = MI.getOperand(0).getReg();
Register LHS = MI.getOperand(2).getReg();
Register RHS = MI.getOperand(3).getReg();
@@ -2252,8 +3118,6 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
- B.setInstr(MI);
-
uint64_t Offset =
ST.getTargetLowering()->getImplicitParameterOffset(
B.getMF(), AMDGPUTargetLowering::FIRST_IMPLICIT);
@@ -2263,8 +3127,9 @@ bool AMDGPULegalizerInfo::legalizeImplicitArgPtr(MachineInstr &MI,
const ArgDescriptor *Arg;
const TargetRegisterClass *RC;
- std::tie(Arg, RC)
- = MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ LLT ArgTy;
+ std::tie(Arg, RC, ArgTy) =
+ MFI->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
if (!Arg)
return false;
@@ -2281,7 +3146,6 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
MachineRegisterInfo &MRI,
MachineIRBuilder &B,
unsigned AddrSpace) const {
- B.setInstr(MI);
Register ApertureReg = getSegmentAperture(AddrSpace, MRI, B);
auto Hi32 = B.buildExtract(LLT::scalar(32), MI.getOperand(2).getReg(), 32);
B.buildICmp(ICmpInst::ICMP_EQ, MI.getOperand(0), Hi32, ApertureReg);
@@ -2289,6 +3153,55 @@ bool AMDGPULegalizerInfo::legalizeIsAddrSpace(MachineInstr &MI,
return true;
}
+// The raw.(t)buffer and struct.(t)buffer intrinsics have two offset args:
+// offset (the offset that is included in bounds checking and swizzling, to be
+// split between the instruction's voffset and immoffset fields) and soffset
+// (the offset that is excluded from bounds checking and swizzling, to go in
+// the instruction's soffset field). This function takes the first kind of
+// offset and figures out how to split it between voffset and immoffset.
+std::tuple<Register, unsigned, unsigned>
+AMDGPULegalizerInfo::splitBufferOffsets(MachineIRBuilder &B,
+ Register OrigOffset) const {
+ const unsigned MaxImm = 4095;
+ Register BaseReg;
+ unsigned TotalConstOffset;
+ MachineInstr *OffsetDef;
+ const LLT S32 = LLT::scalar(32);
+
+ std::tie(BaseReg, TotalConstOffset, OffsetDef)
+ = AMDGPU::getBaseWithConstantOffset(*B.getMRI(), OrigOffset);
+
+ unsigned ImmOffset = TotalConstOffset;
+
+ // If the immediate value is too big for the immoffset field, put the value
+ // and -4096 into the immoffset field so that the value that is copied/added
+ // for the voffset field is a multiple of 4096, and it stands a better chance
+ // of being CSEd with the copy/add for another similar load/store.
+ // However, do not do that rounding down to a multiple of 4096 if that is a
+ // negative number, as it appears to be illegal to have a negative offset
+ // in the vgpr, even if adding the immediate offset makes it positive.
+ unsigned Overflow = ImmOffset & ~MaxImm;
+ ImmOffset -= Overflow;
+ if ((int32_t)Overflow < 0) {
+ Overflow += ImmOffset;
+ ImmOffset = 0;
+ }
+
+ if (Overflow != 0) {
+ if (!BaseReg) {
+ BaseReg = B.buildConstant(S32, Overflow).getReg(0);
+ } else {
+ auto OverflowVal = B.buildConstant(S32, Overflow);
+ BaseReg = B.buildAdd(S32, BaseReg, OverflowVal).getReg(0);
+ }
+ }
+
+ if (!BaseReg)
+ BaseReg = B.buildConstant(S32, 0).getReg(0);
+
+ return std::make_tuple(BaseReg, ImmOffset, TotalConstOffset);
+}
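Stripped of the MIR plumbing, the split above is plain integer arithmetic. A sketch for an offset already known to be constant (splitConstOffsetSketch is a hypothetical name, used only for illustration):

#include <cstdint>
#include <utility>

// Returns {voffset part, immoffset part}; e.g. 4100 -> {4096, 4}.
static std::pair<uint32_t, uint32_t> splitConstOffsetSketch(uint32_t Total) {
  const uint32_t MaxImm = 4095;
  uint32_t ImmOffset = Total;
  uint32_t Overflow = ImmOffset & ~MaxImm;  // multiple-of-4096 part
  ImmOffset -= Overflow;
  // Don't leave a value that is negative as an i32 for the VGPR part.
  if ((int32_t)Overflow < 0) {
    Overflow += ImmOffset;
    ImmOffset = 0;
  }
  return {Overflow, ImmOffset};
}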
+
/// Handle register layout difference for f16 images for some subtargets.
Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
MachineRegisterInfo &MRI,
@@ -2312,75 +3225,969 @@ Register AMDGPULegalizerInfo::handleD16VData(MachineIRBuilder &B,
return B.buildBuildVector(LLT::vector(NumElts, S32), WideRegs).getReg(0);
}
-bool AMDGPULegalizerInfo::legalizeRawBufferStore(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- bool IsFormat) const {
- // TODO: Reject f16 format on targets where unsupported.
- Register VData = MI.getOperand(1).getReg();
- LLT Ty = MRI.getType(VData);
+Register AMDGPULegalizerInfo::fixStoreSourceType(
+ MachineIRBuilder &B, Register VData, bool IsFormat) const {
+ MachineRegisterInfo *MRI = B.getMRI();
+ LLT Ty = MRI->getType(VData);
- B.setInstr(MI);
-
- const LLT S32 = LLT::scalar(32);
const LLT S16 = LLT::scalar(16);
// Fixup illegal register types for i8 stores.
if (Ty == LLT::scalar(8) || Ty == S16) {
Register AnyExt = B.buildAnyExt(LLT::scalar(32), VData).getReg(0);
- MI.getOperand(1).setReg(AnyExt);
- return true;
+ return AnyExt;
}
if (Ty.isVector()) {
if (Ty.getElementType() == S16 && Ty.getNumElements() <= 4) {
if (IsFormat)
- MI.getOperand(1).setReg(handleD16VData(B, MRI, VData));
+ return handleD16VData(B, *MRI, VData);
+ }
+ }
+
+ return VData;
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferStore(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsTyped,
+ bool IsFormat) const {
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI.getType(VData);
+ LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
+ const LLT S32 = LLT::scalar(32);
+
+ VData = fixStoreSourceType(B, VData, IsFormat);
+ Register RSrc = MI.getOperand(2).getReg();
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
+
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ // The typed intrinsics add an immediate after the registers.
+ const unsigned NumVIndexOps = IsTyped ? 8 : 7;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(3).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(3 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(4 + OpOffset).getReg();
+
+ unsigned Format = 0;
+ if (IsTyped) {
+ Format = MI.getOperand(5 + OpOffset).getImm();
+ ++OpOffset;
+ }
+
+ unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ unsigned Opc;
+ if (IsTyped) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT;
+ } else if (IsFormat) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT;
+ } else {
+ switch (MemSize) {
+ case 1:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT;
+ break;
+ default:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_STORE;
+ break;
+ }
+ }
+
+ if (!VIndex)
+ VIndex = B.buildConstant(S32, 0).getReg(0);
+
+ auto MIB = B.buildInstr(Opc)
+ .addUse(VData) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset); // offset(imm)
+
+ if (IsTyped)
+ MIB.addImm(Format);
+
+ MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferLoad(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B,
+ bool IsFormat,
+ bool IsTyped) const {
+ // FIXME: Verifier should enforce 1 MMO for these intrinsics.
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ const int MemSize = MMO->getSize();
+ const LLT S32 = LLT::scalar(32);
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register RSrc = MI.getOperand(2).getReg();
+
+ // The typed intrinsics add an immediate after the registers.
+ const unsigned NumVIndexOps = IsTyped ? 8 : 7;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ int OpOffset = 0;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(3).getReg();
+ OpOffset = 1;
+ }
+
+ Register VOffset = MI.getOperand(3 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(4 + OpOffset).getReg();
+
+ unsigned Format = 0;
+ if (IsTyped) {
+ Format = MI.getOperand(5 + OpOffset).getImm();
+ ++OpOffset;
+ }
+
+ unsigned AuxiliaryData = MI.getOperand(5 + OpOffset).getImm();
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+
+ LLT Ty = MRI.getType(Dst);
+ LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = IsFormat && (EltTy.getSizeInBits() == 16);
+ const bool Unpacked = ST.hasUnpackedD16VMem();
+
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MemSize);
+
+ unsigned Opc;
+
+ if (IsTyped) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT;
+ } else if (IsFormat) {
+ Opc = IsD16 ? AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16 :
+ AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT;
+ } else {
+ switch (MemSize) {
+ case 1:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE;
+ break;
+ case 2:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT;
+ break;
+ default:
+ Opc = AMDGPU::G_AMDGPU_BUFFER_LOAD;
+ break;
+ }
+ }
+
+ Register LoadDstReg;
+
+ bool IsExtLoad = (!IsD16 && MemSize < 4) || (IsD16 && !Ty.isVector());
+ LLT UnpackedTy = Ty.changeElementSize(32);
+
+ if (IsExtLoad)
+ LoadDstReg = B.getMRI()->createGenericVirtualRegister(S32);
+ else if (Unpacked && IsD16 && Ty.isVector())
+ LoadDstReg = B.getMRI()->createGenericVirtualRegister(UnpackedTy);
+ else
+ LoadDstReg = Dst;
+
+ if (!VIndex)
+ VIndex = B.buildConstant(S32, 0).getReg(0);
+
+ auto MIB = B.buildInstr(Opc)
+ .addDef(LoadDstReg) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset); // offset(imm)
+
+ if (IsTyped)
+ MIB.addImm(Format);
+
+ MIB.addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ if (LoadDstReg != Dst) {
+ B.setInsertPt(B.getMBB(), ++B.getInsertPt());
+
+ // For extending loads, the result was widened to s32; truncate it back.
+ if (IsExtLoad)
+ B.buildTrunc(Dst, LoadDstReg);
+ else {
+ // Repack to original 16-bit vector result
+ // FIXME: G_TRUNC should work, but legalization currently fails
+ auto Unmerge = B.buildUnmerge(S32, LoadDstReg);
+ SmallVector<Register, 4> Repack;
+ for (unsigned I = 0, N = Unmerge->getNumOperands() - 1; I != N; ++I)
+ Repack.push_back(B.buildTrunc(EltTy, Unmerge.getReg(I)).getReg(0));
+ B.buildMerge(Dst, Repack);
+ }
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeAtomicIncDec(MachineInstr &MI,
+ MachineIRBuilder &B,
+ bool IsInc) const {
+ unsigned Opc = IsInc ? AMDGPU::G_AMDGPU_ATOMIC_INC :
+ AMDGPU::G_AMDGPU_ATOMIC_DEC;
+ B.buildInstr(Opc)
+ .addDef(MI.getOperand(0).getReg())
+ .addUse(MI.getOperand(2).getReg())
+ .addUse(MI.getOperand(3).getReg())
+ .cloneMemRefs(MI);
+ MI.eraseFromParent();
+ return true;
+}
+
+static unsigned getBufferAtomicPseudo(Intrinsic::ID IntrID) {
+ switch (IntrID) {
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP;
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD;
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN;
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX;
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND;
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR;
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR;
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC;
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC;
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ return AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP;
+ default:
+ llvm_unreachable("unhandled atomic opcode");
+ }
+}
+
+bool AMDGPULegalizerInfo::legalizeBufferAtomic(MachineInstr &MI,
+ MachineIRBuilder &B,
+ Intrinsic::ID IID) const {
+ const bool IsCmpSwap = IID == Intrinsic::amdgcn_raw_buffer_atomic_cmpswap ||
+ IID == Intrinsic::amdgcn_struct_buffer_atomic_cmpswap;
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register VData = MI.getOperand(2).getReg();
+
+ Register CmpVal;
+ int OpOffset = 0;
+
+ if (IsCmpSwap) {
+ CmpVal = MI.getOperand(3 + OpOffset).getReg();
+ ++OpOffset;
+ }
+
+ Register RSrc = MI.getOperand(3 + OpOffset).getReg();
+ const unsigned NumVIndexOps = IsCmpSwap ? 9 : 8;
+
+ // The struct intrinsic variants add one additional operand over raw.
+ const bool HasVIndex = MI.getNumOperands() == NumVIndexOps;
+ Register VIndex;
+ if (HasVIndex) {
+ VIndex = MI.getOperand(4 + OpOffset).getReg();
+ ++OpOffset;
+ }
+
+ Register VOffset = MI.getOperand(4 + OpOffset).getReg();
+ Register SOffset = MI.getOperand(5 + OpOffset).getReg();
+ unsigned AuxiliaryData = MI.getOperand(6 + OpOffset).getImm();
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+
+ unsigned ImmOffset;
+ unsigned TotalOffset;
+ std::tie(VOffset, ImmOffset, TotalOffset) = splitBufferOffsets(B, VOffset);
+ if (TotalOffset != 0)
+ MMO = B.getMF().getMachineMemOperand(MMO, TotalOffset, MMO->getSize());
+
+ if (!VIndex)
+ VIndex = B.buildConstant(LLT::scalar(32), 0).getReg(0);
+
+ auto MIB = B.buildInstr(getBufferAtomicPseudo(IID))
+ .addDef(Dst)
+ .addUse(VData); // vdata
+
+ if (IsCmpSwap)
+ MIB.addReg(CmpVal);
+
+ MIB.addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset) // offset(imm)
+ .addImm(AuxiliaryData) // cachepolicy, swizzled buffer(imm)
+ .addImm(HasVIndex ? -1 : 0) // idxen(imm)
+ .addMemOperand(MMO);
+
+ MI.eraseFromParent();
+ return true;
+}
+
+/// Turn a set of s16 typed registers in \p A16AddrRegs into a dword sized
+/// vector with s16 typed elements.
+static void packImageA16AddressToDwords(MachineIRBuilder &B, MachineInstr &MI,
+ SmallVectorImpl<Register> &PackedAddrs,
+ int AddrIdx, int DimIdx, int EndIdx,
+ int NumGradients) {
+ const LLT S16 = LLT::scalar(16);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ for (int I = AddrIdx; I < EndIdx; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(I);
+ if (!SrcOp.isReg())
+ continue; // _L to _LZ may have eliminated this.
+
+ Register AddrReg = SrcOp.getReg();
+
+ if (I < DimIdx) {
+ AddrReg = B.buildBitcast(V2S16, AddrReg).getReg(0);
+ PackedAddrs.push_back(AddrReg);
+ } else {
+ // Dz/dh, dz/dv and the last odd coord are packed with undef. Also, in 1D,
+ // derivatives dx/dh and dx/dv are packed with undef.
+ if (((I + 1) >= EndIdx) ||
+ ((NumGradients / 2) % 2 == 1 &&
+ (I == DimIdx + (NumGradients / 2) - 1 ||
+ I == DimIdx + NumGradients - 1)) ||
+ // Check for _L to _LZ optimization
+ !MI.getOperand(I + 1).isReg()) {
+ PackedAddrs.push_back(
+ B.buildBuildVector(V2S16, {AddrReg, B.buildUndef(S16).getReg(0)})
+ .getReg(0));
+ } else {
+ PackedAddrs.push_back(
+ B.buildBuildVector(V2S16, {AddrReg, MI.getOperand(I + 1).getReg()})
+ .getReg(0));
+ ++I;
+ }
+ }
+ }
+}
+
+/// Convert from separate vaddr components to a single vector address register,
+/// and replace the remaining operands with $noreg.
+static void convertImageAddrToPacked(MachineIRBuilder &B, MachineInstr &MI,
+ int DimIdx, int NumVAddrs) {
+ const LLT S32 = LLT::scalar(32);
+
+ SmallVector<Register, 8> AddrRegs;
+ for (int I = 0; I != NumVAddrs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
+ if (SrcOp.isReg()) {
+ AddrRegs.push_back(SrcOp.getReg());
+ assert(B.getMRI()->getType(SrcOp.getReg()) == S32);
+ }
+ }
+
+ int NumAddrRegs = AddrRegs.size();
+ if (NumAddrRegs != 1) {
+ // Round up to 8 elements for v5-v7
+ // FIXME: Missing intermediate sized register classes and instructions.
+ if (NumAddrRegs > 4 && !isPowerOf2_32(NumAddrRegs)) {
+ const int RoundedNumRegs = NextPowerOf2(NumAddrRegs);
+ auto Undef = B.buildUndef(S32);
+ AddrRegs.append(RoundedNumRegs - NumAddrRegs, Undef.getReg(0));
+ NumAddrRegs = RoundedNumRegs;
+ }
+
+ auto VAddr = B.buildBuildVector(LLT::vector(NumAddrRegs, 32), AddrRegs);
+ MI.getOperand(DimIdx).setReg(VAddr.getReg(0));
+ }
+
+ for (int I = 1; I != NumVAddrs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(DimIdx + I);
+ if (SrcOp.isReg())
+ MI.getOperand(DimIdx + I).setReg(AMDGPU::NoRegister);
+ }
+}
+
+/// Rewrite image intrinsics to use register layouts expected by the subtarget.
+///
+/// Depending on the subtarget, load/store with 16-bit element data need to be
+/// rewritten to use the low half of 32-bit registers, or directly use a packed
+/// layout. 16-bit addresses should also sometimes be packed into 32-bit
+/// registers.
+///
+/// We don't want to directly select image instructions just yet, but also want
+/// to expose all register repacking to the legalizer/combiners. We also don't
+/// want a selected instruction entering RegBankSelect. In order to avoid
+/// defining a multitude of intermediate image instructions, directly hack on
+/// the intrinsic's arguments. In cases like a16 addresses, this requires padding
+/// now unnecessary arguments with $noreg.
+bool AMDGPULegalizerInfo::legalizeImageIntrinsic(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const {
+
+ const int NumDefs = MI.getNumExplicitDefs();
+ bool IsTFE = NumDefs == 2;
+ // We are only processing the operands of d16 image operations on subtargets
+ // that use the unpacked register layout, or need to repack the TFE result.
+
+ // TODO: Do we need to guard against already legalized intrinsics?
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(ImageDimIntr->BaseOpcode);
+
+ MachineRegisterInfo *MRI = B.getMRI();
+ const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
+ const LLT V2S16 = LLT::vector(2, 16);
+
+ // Index of first address argument
+ const int AddrIdx = getImageVAddrIdxBegin(BaseOpcode, NumDefs);
+
+ int NumVAddrs, NumGradients;
+ std::tie(NumVAddrs, NumGradients) = getImageNumVAddr(ImageDimIntr, BaseOpcode);
+ const int DMaskIdx = BaseOpcode->Atomic ? -1 :
+ getDMaskIdx(BaseOpcode, NumDefs);
+ unsigned DMask = 0;
+
+ // Check for 16-bit addresses and pack them if present.
+ int DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+ LLT GradTy = MRI->getType(MI.getOperand(DimIdx).getReg());
+ LLT AddrTy = MRI->getType(MI.getOperand(DimIdx + NumGradients).getReg());
+ const bool IsG16 = GradTy == S16;
+ const bool IsA16 = AddrTy == S16;
+
+ int DMaskLanes = 0;
+ if (!BaseOpcode->Atomic) {
+ DMask = MI.getOperand(DMaskIdx).getImm();
+ if (BaseOpcode->Gather4) {
+ DMaskLanes = 4;
+ } else if (DMask != 0) {
+ DMaskLanes = countPopulation(DMask);
+ } else if (!IsTFE && !BaseOpcode->Store) {
+ // If dmask is 0, this is a no-op load. This can be eliminated.
+ B.buildUndef(MI.getOperand(0));
+ MI.eraseFromParent();
return true;
}
+ }
+
+ Observer.changingInstr(MI);
+ auto ChangedInstr = make_scope_exit([&] { Observer.changedInstr(MI); });
+
+ unsigned NewOpcode = NumDefs == 0 ?
+ AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD;
+
+ // Track that we legalized this
+ MI.setDesc(B.getTII().get(NewOpcode));
+
+ // Expecting to get an error flag since TFE is on and dmask is 0. Force dmask
+ // to be at least 1, otherwise the instruction will fail.
+ if (IsTFE && DMask == 0) {
+ DMask = 0x1;
+ DMaskLanes = 1;
+ MI.getOperand(DMaskIdx).setImm(DMask);
+ }
+
+ if (BaseOpcode->Atomic) {
+ Register VData0 = MI.getOperand(2).getReg();
+ LLT Ty = MRI->getType(VData0);
+
+ // TODO: Allow atomic swap and bit ops for v2s16/v4s16
+ if (Ty.isVector())
+ return false;
+
+ if (BaseOpcode->AtomicX2) {
+ Register VData1 = MI.getOperand(3).getReg();
+ // The two values are packed in one register.
+ LLT PackedTy = LLT::vector(2, Ty);
+ auto Concat = B.buildBuildVector(PackedTy, {VData0, VData1});
+ MI.getOperand(2).setReg(Concat.getReg(0));
+ MI.getOperand(3).setReg(AMDGPU::NoRegister);
+ }
+ }
- return Ty.getElementType() == S32 && Ty.getNumElements() <= 4;
+ int CorrectedNumVAddrs = NumVAddrs;
+
+ // Optimize _L to _LZ when _L is zero
+ if (const AMDGPU::MIMGLZMappingInfo *LZMappingInfo =
+ AMDGPU::getMIMGLZMappingInfo(ImageDimIntr->BaseOpcode)) {
+ const ConstantFP *ConstantLod;
+ const int LodIdx = AddrIdx + NumVAddrs - 1;
+
+ if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_GFCst(ConstantLod))) {
+ if (ConstantLod->isZero() || ConstantLod->isNegative()) {
+ // Set new opcode to _lz variant of _l, and change the intrinsic ID.
+ ImageDimIntr = AMDGPU::getImageDimInstrinsicByBaseOpcode(
+ LZMappingInfo->LZ, ImageDimIntr->Dim);
+
+ // The starting indexes should remain in the same place.
+ --NumVAddrs;
+ --CorrectedNumVAddrs;
+
+ MI.getOperand(MI.getNumExplicitDefs()).setIntrinsicID(
+ static_cast<Intrinsic::ID>(ImageDimIntr->Intr));
+ MI.RemoveOperand(LodIdx);
+ }
+ }
}
- return Ty == S32;
+ // Optimize _mip away, when 'lod' is zero
+ if (AMDGPU::getMIMGMIPMappingInfo(ImageDimIntr->BaseOpcode)) {
+ int64_t ConstantLod;
+ const int LodIdx = AddrIdx + NumVAddrs - 1;
+
+ if (mi_match(MI.getOperand(LodIdx).getReg(), *MRI, m_ICst(ConstantLod))) {
+ if (ConstantLod == 0) {
+ // TODO: Change intrinsic opcode and remove operand instead of replacing
+ // it with 0, as the _L to _LZ handling is done above.
+ MI.getOperand(LodIdx).ChangeToImmediate(0);
+ --CorrectedNumVAddrs;
+ }
+ }
+ }
+
+ // Rewrite the addressing register layout before doing anything else.
+ if (IsA16 || IsG16) {
+ if (IsA16) {
+ // Target must support the feature and gradients need to be 16 bit too
+ if (!ST.hasA16() || !IsG16)
+ return false;
+ } else if (!ST.hasG16())
+ return false;
+
+ if (NumVAddrs > 1) {
+ SmallVector<Register, 4> PackedRegs;
+ // Don't compress addresses for G16
+ const int PackEndIdx =
+ IsA16 ? (AddrIdx + NumVAddrs) : (DimIdx + NumGradients);
+ packImageA16AddressToDwords(B, MI, PackedRegs, AddrIdx, DimIdx,
+ PackEndIdx, NumGradients);
+
+ if (!IsA16) {
+ // Add uncompressed address
+ for (int I = DimIdx + NumGradients; I != AddrIdx + NumVAddrs; ++I) {
+ int AddrReg = MI.getOperand(I).getReg();
+ assert(B.getMRI()->getType(AddrReg) == LLT::scalar(32));
+ PackedRegs.push_back(AddrReg);
+ }
+ }
+
+ // See also below in the non-a16 branch
+ const bool UseNSA = PackedRegs.size() >= 3 && ST.hasNSAEncoding();
+
+ if (!UseNSA && PackedRegs.size() > 1) {
+ LLT PackedAddrTy = LLT::vector(2 * PackedRegs.size(), 16);
+ auto Concat = B.buildConcatVectors(PackedAddrTy, PackedRegs);
+ PackedRegs[0] = Concat.getReg(0);
+ PackedRegs.resize(1);
+ }
+
+ const int NumPacked = PackedRegs.size();
+ for (int I = 0; I != NumVAddrs; ++I) {
+ MachineOperand &SrcOp = MI.getOperand(AddrIdx + I);
+ if (!SrcOp.isReg()) {
+ assert(SrcOp.isImm() && SrcOp.getImm() == 0);
+ continue;
+ }
+
+ assert(SrcOp.getReg() != AMDGPU::NoRegister);
+
+ if (I < NumPacked)
+ SrcOp.setReg(PackedRegs[I]);
+ else
+ SrcOp.setReg(AMDGPU::NoRegister);
+ }
+ }
+ } else {
+ // If the register allocator cannot place the address registers contiguously
+ // without introducing moves, then using the non-sequential address encoding
+ // is always preferable, since it saves VALU instructions and is usually a
+ // wash in terms of code size or even better.
+ //
+ // However, we currently have no way of hinting to the register allocator
+ // that MIMG addresses should be placed contiguously when it is possible to
+ // do so, so force non-NSA for the common 2-address case as a heuristic.
+ //
+ // SIShrinkInstructions will convert NSA encodings to non-NSA after register
+ // allocation when possible.
+ const bool UseNSA = CorrectedNumVAddrs >= 3 && ST.hasNSAEncoding();
+
+ if (!UseNSA && NumVAddrs > 1)
+ convertImageAddrToPacked(B, MI, AddrIdx, NumVAddrs);
+ }
+
+ int Flags = 0;
+ if (IsA16)
+ Flags |= 1;
+ if (IsG16)
+ Flags |= 2;
+ MI.addOperand(MachineOperand::CreateImm(Flags));
+
+ if (BaseOpcode->Store) { // No TFE for stores?
+ // TODO: Handle dmask trim
+ Register VData = MI.getOperand(1).getReg();
+ LLT Ty = MRI->getType(VData);
+ if (!Ty.isVector() || Ty.getElementType() != S16)
+ return true;
+
+ Register RepackedReg = handleD16VData(B, *MRI, VData);
+ if (RepackedReg != VData) {
+ MI.getOperand(1).setReg(RepackedReg);
+ }
+
+ return true;
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI->getType(DstReg);
+ const LLT EltTy = Ty.getScalarType();
+ const bool IsD16 = Ty.getScalarType() == S16;
+ const int NumElts = Ty.isVector() ? Ty.getNumElements() : 1;
+
+ // Confirm that the return type is large enough for the dmask specified
+ if (NumElts < DMaskLanes)
+ return false;
+
+ if (NumElts > 4 || DMaskLanes > 4)
+ return false;
+
+ const unsigned AdjustedNumElts = DMaskLanes == 0 ? 1 : DMaskLanes;
+ const LLT AdjustedTy = Ty.changeNumElements(AdjustedNumElts);
+
+ // The raw dword aligned data component of the load. The only legal cases
+ // where this matters should be when using the packed D16 format, for
+ // s16 -> <2 x s16>, and <3 x s16> -> <4 x s16>.
+ LLT RoundedTy;
+
+ // S32 vector to cover all data, plus TFE result element.
+ LLT TFETy;
+
+ // Register type to use for each loaded component. Will be S32 or V2S16.
+ LLT RegTy;
+
+ if (IsD16 && ST.hasUnpackedD16VMem()) {
+ RoundedTy = LLT::scalarOrVector(AdjustedNumElts, 32);
+ TFETy = LLT::vector(AdjustedNumElts + 1, 32);
+ RegTy = S32;
+ } else {
+ unsigned EltSize = EltTy.getSizeInBits();
+ unsigned RoundedElts = (AdjustedTy.getSizeInBits() + 31) / 32;
+ unsigned RoundedSize = 32 * RoundedElts;
+ RoundedTy = LLT::scalarOrVector(RoundedSize / EltSize, EltSize);
+ TFETy = LLT::vector(RoundedSize / 32 + 1, S32);
+ RegTy = !IsTFE && EltSize == 16 ? V2S16 : S32;
+ }
+
+ // The return type does not need adjustment.
+ // TODO: Should we change s16 case to s32 or <2 x s16>?
+ if (!IsTFE && (RoundedTy == Ty || !Ty.isVector()))
+ return true;
+
+ Register Dst1Reg;
+
+ // Insert after the instruction.
+ B.setInsertPt(*MI.getParent(), ++MI.getIterator());
+
+ // TODO: For TFE with d16, if we used a TFE type that was a multiple of <2 x
+ // s16> instead of s32, we would only need 1 bitcast instead of multiple.
+ const LLT LoadResultTy = IsTFE ? TFETy : RoundedTy;
+ const int ResultNumRegs = LoadResultTy.getSizeInBits() / 32;
+
+ Register NewResultReg = MRI->createGenericVirtualRegister(LoadResultTy);
+
+ MI.getOperand(0).setReg(NewResultReg);
+
+ // In the IR, TFE is supposed to be used with a 2 element struct return
+ // type. The instruction really returns these two values in one contiguous
+ // register, with one additional dword beyond the loaded data. Rewrite the
+ // return type to use a single register result.
+
+ if (IsTFE) {
+ Dst1Reg = MI.getOperand(1).getReg();
+ if (MRI->getType(Dst1Reg) != S32)
+ return false;
+
+ // TODO: Make sure the TFE operand bit is set.
+ MI.RemoveOperand(1);
+
+ // Handle the easy case that requires no repack instructions.
+ if (Ty == S32) {
+ B.buildUnmerge({DstReg, Dst1Reg}, NewResultReg);
+ return true;
+ }
+ }
+
+ // Now figure out how to copy the new result register back into the old
+ // result.
+ SmallVector<Register, 5> ResultRegs(ResultNumRegs, Dst1Reg);
+
+ const int NumDataRegs = IsTFE ? ResultNumRegs - 1 : ResultNumRegs;
+
+ if (ResultNumRegs == 1) {
+ assert(!IsTFE);
+ ResultRegs[0] = NewResultReg;
+ } else {
+ // We have to repack into a new vector of some kind.
+ for (int I = 0; I != NumDataRegs; ++I)
+ ResultRegs[I] = MRI->createGenericVirtualRegister(RegTy);
+ B.buildUnmerge(ResultRegs, NewResultReg);
+
+ // Drop the final TFE element to get the data part. The TFE result is
+ // directly written to the right place already.
+ if (IsTFE)
+ ResultRegs.resize(NumDataRegs);
+ }
+
+ // For an s16 scalar result, we form an s32 result with a truncate regardless
+ // of packed vs. unpacked.
+ if (IsD16 && !Ty.isVector()) {
+ B.buildTrunc(DstReg, ResultRegs[0]);
+ return true;
+ }
+
+ // Avoid a build/concat_vector of 1 entry.
+ if (Ty == V2S16 && NumDataRegs == 1 && !ST.hasUnpackedD16VMem()) {
+ B.buildBitcast(DstReg, ResultRegs[0]);
+ return true;
+ }
+
+ assert(Ty.isVector());
+
+ if (IsD16) {
+ // For packed D16 results with TFE enabled, all the data components are
+ // S32. Cast back to the expected type.
+ //
+ // TODO: We don't really need to load s32 elements. We would only need one
+ // cast for the TFE result if a multiple of v2s16 was used.
+ if (RegTy != V2S16 && !ST.hasUnpackedD16VMem()) {
+ for (Register &Reg : ResultRegs)
+ Reg = B.buildBitcast(V2S16, Reg).getReg(0);
+ } else if (ST.hasUnpackedD16VMem()) {
+ for (Register &Reg : ResultRegs)
+ Reg = B.buildTrunc(S16, Reg).getReg(0);
+ }
+ }
+
+ auto padWithUndef = [&](LLT Ty, int NumElts) {
+ if (NumElts == 0)
+ return;
+ Register Undef = B.buildUndef(Ty).getReg(0);
+ for (int I = 0; I != NumElts; ++I)
+ ResultRegs.push_back(Undef);
+ };
+
+ // Pad out any elements eliminated due to the dmask.
+ LLT ResTy = MRI->getType(ResultRegs[0]);
+ if (!ResTy.isVector()) {
+ padWithUndef(ResTy, NumElts - ResultRegs.size());
+ B.buildBuildVector(DstReg, ResultRegs);
+ return true;
+ }
+
+ assert(!ST.hasUnpackedD16VMem() && ResTy == V2S16);
+ const int RegsToCover = (Ty.getSizeInBits() + 31) / 32;
+
+ // Deal with the one annoying legal case.
+ const LLT V3S16 = LLT::vector(3, 16);
+ if (Ty == V3S16) {
+ padWithUndef(ResTy, RegsToCover - ResultRegs.size() + 1);
+ auto Concat = B.buildConcatVectors(LLT::vector(6, 16), ResultRegs);
+ B.buildUnmerge({DstReg, MRI->createGenericVirtualRegister(V3S16)}, Concat);
+ return true;
+ }
+
+ padWithUndef(ResTy, RegsToCover - ResultRegs.size());
+ B.buildConcatVectors(DstReg, ResultRegs);
+ return true;
}
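As a rough cross-check of the result-sizing logic above, the number of 32-bit result registers follows from the dmask lane count plus the optional TFE status dword. This is only a sketch for the packed layout (the unpacked-D16 path uses one dword per lane instead); numResultDwordsSketch and its use of __builtin_popcount in place of countPopulation are illustrative assumptions.

static unsigned numResultDwordsSketch(unsigned DMask, unsigned EltBits,
                                      bool IsTFE) {
  unsigned Lanes = __builtin_popcount(DMask);         // cf. DMaskLanes
  if (Lanes == 0)
    Lanes = 1;                                        // cf. AdjustedNumElts
  unsigned DataDwords = (Lanes * EltBits + 31) / 32;  // round up to dwords
  return DataDwords + (IsTFE ? 1 : 0);                // TFE appends a status dword
}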
-bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const {
+bool AMDGPULegalizerInfo::legalizeSBufferLoad(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer) const {
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = B.getMRI()->getType(Dst);
+ unsigned Size = Ty.getSizeInBits();
+ MachineFunction &MF = B.getMF();
+
+ Observer.changingInstr(MI);
+
+ // FIXME: We don't really need this intermediate instruction. The intrinsic
+ // should be fixed to have a memory operand. Since it's readnone, we're not
+ // allowed to add one.
+ MI.setDesc(B.getTII().get(AMDGPU::G_AMDGPU_S_BUFFER_LOAD));
+ MI.RemoveOperand(1); // Remove intrinsic ID
+
+ // FIXME: When intrinsic definition is fixed, this should have an MMO already.
+ // TODO: Should this use datalayout alignment?
+ const unsigned MemSize = (Size + 7) / 8;
+ const Align MemAlign(4);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ MemSize, MemAlign);
+ MI.addMemOperand(MF, MMO);
+
+ // There are no 96-bit result scalar loads, but widening to 128-bit should
+ // always be legal. We may need to restore this to a 96-bit result if it turns
+ // out this needs to be converted to a vector load during RegBankSelect.
+ if (!isPowerOf2_32(Size)) {
+ LegalizerHelper Helper(MF, *this, Observer, B);
+
+ if (Ty.isVector())
+ Helper.moreElementsVectorDst(MI, getPow2VectorType(Ty), 0);
+ else
+ Helper.widenScalarDst(MI, getPow2ScalarType(Ty), 0);
+ }
+
+ Observer.changedInstr(MI);
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeTrapIntrinsic(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const {
+ // On a non-HSA path, or when the trap handler is disabled, insert s_endpgm.
+ if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !ST.isTrapHandlerEnabled()) {
+ B.buildInstr(AMDGPU::S_ENDPGM).addImm(0);
+ } else {
+ // Pass queue pointer to trap handler as input, and insert trap instruction
+ // Reference: https://llvm.org/docs/AMDGPUUsage.html#trap-handler-abi
+ const ArgDescriptor *Arg =
+ getArgDescriptor(B, AMDGPUFunctionArgInfo::QUEUE_PTR);
+ if (!Arg)
+ return false;
+ MachineRegisterInfo &MRI = *B.getMRI();
+ Register SGPR01(AMDGPU::SGPR0_SGPR1);
+ Register LiveIn = getLiveInRegister(
+ B, MRI, SGPR01, LLT::pointer(AMDGPUAS::CONSTANT_ADDRESS, 64),
+ /*InsertLiveInCopy=*/false);
+ if (!loadInputValue(LiveIn, B, Arg))
+ return false;
+ B.buildCopy(SGPR01, LiveIn);
+ B.buildInstr(AMDGPU::S_TRAP)
+ .addImm(GCNSubtarget::TrapIDLLVMTrap)
+ .addReg(SGPR01, RegState::Implicit);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeDebugTrapIntrinsic(
+ MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B) const {
+ // On a non-HSA path, or when the trap handler is disabled, report a warning
+ // accordingly.
+ if (ST.getTrapHandlerAbi() != GCNSubtarget::TrapHandlerAbiHsa ||
+ !ST.isTrapHandlerEnabled()) {
+ DiagnosticInfoUnsupported NoTrap(B.getMF().getFunction(),
+ "debugtrap handler not supported",
+ MI.getDebugLoc(), DS_Warning);
+ LLVMContext &Ctx = B.getMF().getFunction().getContext();
+ Ctx.diagnose(NoTrap);
+ } else {
+ // Insert debug-trap instruction
+ B.buildInstr(AMDGPU::S_TRAP).addImm(GCNSubtarget::TrapIDLLVMDebugTrap);
+ }
+
+ MI.eraseFromParent();
+ return true;
+}
+
+bool AMDGPULegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &B = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *B.getMRI();
+
// Replace the use G_BRCOND with the exec manipulate and branch pseudos.
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_if:
case Intrinsic::amdgcn_else: {
MachineInstr *Br = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+ MachineBasicBlock *UncondBrTarget = nullptr;
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
- B.setInstr(*BrCond);
Register Def = MI.getOperand(1).getReg();
Register Use = MI.getOperand(3).getReg();
- MachineBasicBlock *BrTarget = BrCond->getOperand(1).getMBB();
- if (Br)
- BrTarget = Br->getOperand(0).getMBB();
-
+ MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
+ B.setInsertPt(B.getMBB(), BrCond->getIterator());
if (IntrID == Intrinsic::amdgcn_if) {
B.buildInstr(AMDGPU::SI_IF)
.addDef(Def)
.addUse(Use)
- .addMBB(BrTarget);
+ .addMBB(UncondBrTarget);
} else {
B.buildInstr(AMDGPU::SI_ELSE)
.addDef(Def)
.addUse(Use)
- .addMBB(BrTarget)
+ .addMBB(UncondBrTarget)
.addImm(0);
}
- if (Br)
- Br->getOperand(0).setMBB(BrCond->getOperand(1).getMBB());
+ if (Br) {
+ Br->getOperand(0).setMBB(CondBrTarget);
+ } else {
+ // The IRTranslator skips inserting the G_BR for fallthrough cases, but
+ // since we're swapping branch targets it needs to be reinserted.
+ // FIXME: IRTranslator should probably not do this
+ B.buildBr(*CondBrTarget);
+ }
MRI.setRegClass(Def, TRI->getWaveMaskRegClass());
MRI.setRegClass(Use, TRI->getWaveMaskRegClass());
@@ -2393,17 +4200,24 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
}
case Intrinsic::amdgcn_loop: {
MachineInstr *Br = nullptr;
- if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br)) {
+ MachineBasicBlock *UncondBrTarget = nullptr;
+ if (MachineInstr *BrCond = verifyCFIntrinsic(MI, MRI, Br, UncondBrTarget)) {
const SIRegisterInfo *TRI
= static_cast<const SIRegisterInfo *>(MRI.getTargetRegisterInfo());
- B.setInstr(*BrCond);
-
- // FIXME: Need to adjust branch targets based on unconditional branch.
+ MachineBasicBlock *CondBrTarget = BrCond->getOperand(1).getMBB();
Register Reg = MI.getOperand(2).getReg();
+
+ B.setInsertPt(B.getMBB(), BrCond->getIterator());
B.buildInstr(AMDGPU::SI_LOOP)
.addUse(Reg)
- .addMBB(BrCond->getOperand(1).getMBB());
+ .addMBB(UncondBrTarget);
+
+ if (Br)
+ Br->getOperand(0).setMBB(CondBrTarget);
+ else
+ B.buildBr(*CondBrTarget);
+
MI.eraseFromParent();
BrCond->eraseFromParent();
MRI.setRegClass(Reg, TRI->getWaveMaskRegClass());
@@ -2413,6 +4227,13 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
return false;
}
case Intrinsic::amdgcn_kernarg_segment_ptr:
+ if (!AMDGPU::isKernel(B.getMF().getFunction().getCallingConv())) {
+ // This only makes sense to call in a kernel, so just lower to null.
+ B.buildConstant(MI.getOperand(0).getReg(), 0);
+ MI.eraseFromParent();
+ return true;
+ }
+
return legalizePreloadedArgIntrin(
MI, MRI, B, AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
case Intrinsic::amdgcn_implicitarg_ptr:
@@ -2454,18 +4275,72 @@ bool AMDGPULegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
case Intrinsic::amdgcn_is_private:
return legalizeIsAddrSpace(MI, MRI, B, AMDGPUAS::PRIVATE_ADDRESS);
case Intrinsic::amdgcn_wavefrontsize: {
- B.setInstr(MI);
B.buildConstant(MI.getOperand(0), ST.getWavefrontSize());
MI.eraseFromParent();
return true;
}
+ case Intrinsic::amdgcn_s_buffer_load:
+ return legalizeSBufferLoad(MI, B, Helper.Observer);
case Intrinsic::amdgcn_raw_buffer_store:
- return legalizeRawBufferStore(MI, MRI, B, false);
+ case Intrinsic::amdgcn_struct_buffer_store:
+ return legalizeBufferStore(MI, MRI, B, false, false);
case Intrinsic::amdgcn_raw_buffer_store_format:
- return legalizeRawBufferStore(MI, MRI, B, true);
- default:
+ case Intrinsic::amdgcn_struct_buffer_store_format:
+ return legalizeBufferStore(MI, MRI, B, false, true);
+ case Intrinsic::amdgcn_raw_tbuffer_store:
+ case Intrinsic::amdgcn_struct_tbuffer_store:
+ return legalizeBufferStore(MI, MRI, B, true, true);
+ case Intrinsic::amdgcn_raw_buffer_load:
+ case Intrinsic::amdgcn_struct_buffer_load:
+ return legalizeBufferLoad(MI, MRI, B, false, false);
+ case Intrinsic::amdgcn_raw_buffer_load_format:
+ case Intrinsic::amdgcn_struct_buffer_load_format:
+ return legalizeBufferLoad(MI, MRI, B, true, false);
+ case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_struct_tbuffer_load:
+ return legalizeBufferLoad(MI, MRI, B, true, true);
+ case Intrinsic::amdgcn_raw_buffer_atomic_swap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_swap:
+ case Intrinsic::amdgcn_raw_buffer_atomic_add:
+ case Intrinsic::amdgcn_struct_buffer_atomic_add:
+ case Intrinsic::amdgcn_raw_buffer_atomic_sub:
+ case Intrinsic::amdgcn_struct_buffer_atomic_sub:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umin:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umin:
+ case Intrinsic::amdgcn_raw_buffer_atomic_smax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_smax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_umax:
+ case Intrinsic::amdgcn_struct_buffer_atomic_umax:
+ case Intrinsic::amdgcn_raw_buffer_atomic_and:
+ case Intrinsic::amdgcn_struct_buffer_atomic_and:
+ case Intrinsic::amdgcn_raw_buffer_atomic_or:
+ case Intrinsic::amdgcn_struct_buffer_atomic_or:
+ case Intrinsic::amdgcn_raw_buffer_atomic_xor:
+ case Intrinsic::amdgcn_struct_buffer_atomic_xor:
+ case Intrinsic::amdgcn_raw_buffer_atomic_inc:
+ case Intrinsic::amdgcn_struct_buffer_atomic_inc:
+ case Intrinsic::amdgcn_raw_buffer_atomic_dec:
+ case Intrinsic::amdgcn_struct_buffer_atomic_dec:
+ case Intrinsic::amdgcn_raw_buffer_atomic_cmpswap:
+ case Intrinsic::amdgcn_struct_buffer_atomic_cmpswap:
+ return legalizeBufferAtomic(MI, B, IntrID);
+ case Intrinsic::amdgcn_atomic_inc:
+ return legalizeAtomicIncDec(MI, B, true);
+ case Intrinsic::amdgcn_atomic_dec:
+ return legalizeAtomicIncDec(MI, B, false);
+ case Intrinsic::trap:
+ return legalizeTrapIntrinsic(MI, MRI, B);
+ case Intrinsic::debugtrap:
+ return legalizeDebugTrapIntrinsic(MI, MRI, B);
+ default: {
+ if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
+ AMDGPU::getImageDimIntrinsicInfo(IntrID))
+ return legalizeImageIntrinsic(MI, B, Helper.Observer, ImageDimIntr);
return true;
}
+ }
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
index 4b1405a92787..ce32bbf76b34 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULegalizerInfo.h
@@ -32,9 +32,7 @@ public:
AMDGPULegalizerInfo(const GCNSubtarget &ST,
const GCNTargetMachine &TM);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
Register getSegmentAperture(unsigned AddrSpace,
MachineRegisterInfo &MRI,
@@ -50,18 +48,22 @@ public:
MachineIRBuilder &B) const;
bool legalizeITOFP(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool Signed) const;
- bool legalizeMinNumMaxNum(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const;
+ bool legalizeFPTOI(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool Signed) const;
+ bool legalizeMinNumMaxNum(LegalizerHelper &Helper, MachineInstr &MI) const;
bool legalizeExtractVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeInsertVectorElt(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeShuffleVector(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeSinCos(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
- bool buildPCRelGlobalAddress(
- Register DstReg, LLT PtrTy, MachineIRBuilder &B, const GlobalValue *GV,
- unsigned Offset, unsigned GAFlags = SIInstrInfo::MO_NONE) const;
+ bool buildPCRelGlobalAddress(Register DstReg, LLT PtrTy, MachineIRBuilder &B,
+ const GlobalValue *GV, int64_t Offset,
+ unsigned GAFlags = SIInstrInfo::MO_NONE) const;
bool legalizeGlobalValue(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
@@ -74,16 +76,50 @@ public:
bool legalizeAtomicCmpXChg(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
+ bool legalizeFlog(MachineInstr &MI, MachineIRBuilder &B,
+ double Log2BaseInverted) const;
+ bool legalizeFExp(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFPow(MachineInstr &MI, MachineIRBuilder &B) const;
+ bool legalizeFFloor(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
- Register getLiveInRegister(MachineRegisterInfo &MRI,
- Register Reg, LLT Ty) const;
+ bool legalizeBuildVector(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ Register getLiveInRegister(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register PhyReg, LLT Ty,
+ bool InsertLiveInCopy = true) const;
+ Register insertLiveInCopy(MachineIRBuilder &B, MachineRegisterInfo &MRI,
+ Register LiveIn, Register PhyReg) const;
+ const ArgDescriptor *
+ getArgDescriptor(MachineIRBuilder &B,
+ AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
bool loadInputValue(Register DstReg, MachineIRBuilder &B,
const ArgDescriptor *Arg) const;
bool legalizePreloadedArgIntrin(
MachineInstr &MI, MachineRegisterInfo &MRI, MachineIRBuilder &B,
AMDGPUFunctionArgInfo::PreloadedValue ArgType) const;
+ bool legalizeUDIV_UREM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ void legalizeUDIV_UREM32Impl(MachineIRBuilder &B,
+ Register DstReg, Register Num, Register Den,
+ bool IsDiv) const;
+ bool legalizeUDIV_UREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeSDIV_SREM32(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
+ void legalizeUDIV_UREM64Impl(MachineIRBuilder &B,
+ Register DstReg, Register Numer, Register Denom,
+ bool IsDiv) const;
+
+ bool legalizeUDIV_UREM64(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeSDIV_SREM(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+
bool legalizeFDIV(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B) const;
bool legalizeFDIV16(MachineInstr &MI, MachineRegisterInfo &MRI,
@@ -102,13 +138,46 @@ public:
bool legalizeIsAddrSpace(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, unsigned AddrSpace) const;
+ std::tuple<Register, unsigned, unsigned>
+ splitBufferOffsets(MachineIRBuilder &B, Register OrigOffset) const;
+
Register handleD16VData(MachineIRBuilder &B, MachineRegisterInfo &MRI,
Register Reg) const;
bool legalizeRawBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
MachineIRBuilder &B, bool IsFormat) const;
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &B) const override;
+ bool legalizeRawBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsFormat) const;
+ Register fixStoreSourceType(MachineIRBuilder &B, Register VData,
+ bool IsFormat) const;
+
+ bool legalizeBufferStore(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsTyped,
+ bool IsFormat) const;
+ bool legalizeBufferLoad(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B, bool IsTyped,
+ bool IsFormat) const;
+ bool legalizeBufferAtomic(MachineInstr &MI, MachineIRBuilder &B,
+ Intrinsic::ID IID) const;
+
+ bool legalizeImageIntrinsic(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer,
+ const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr) const;
+
+ bool legalizeSBufferLoad(
+ MachineInstr &MI, MachineIRBuilder &B,
+ GISelChangeObserver &Observer) const;
+
+ bool legalizeAtomicIncDec(MachineInstr &MI, MachineIRBuilder &B,
+ bool IsInc) const;
+
+ bool legalizeTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeDebugTrapIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineIRBuilder &B) const;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
index 0c56927dea02..4a14259f1bdb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp
@@ -32,7 +32,6 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
#include <cmath>
#include <vector>
@@ -170,16 +169,13 @@ namespace {
class AMDGPUSimplifyLibCalls : public FunctionPass {
- const TargetOptions Options;
-
AMDGPULibCalls Simplifier;
public:
static char ID; // Pass identification
- AMDGPUSimplifyLibCalls(const TargetOptions &Opt = TargetOptions(),
- const TargetMachine *TM = nullptr)
- : FunctionPass(ID), Options(Opt), Simplifier(TM) {
+ AMDGPUSimplifyLibCalls(const TargetMachine *TM = nullptr)
+ : FunctionPass(ID), Simplifier(TM) {
initializeAMDGPUSimplifyLibCallsPass(*PassRegistry::getPassRegistry());
}
@@ -585,7 +581,7 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
assert(Callee->hasName() && "Invalid read_pipe/write_pipe function");
auto *M = Callee->getParent();
auto &Ctx = M->getContext();
- std::string Name = Callee->getName();
+ std::string Name = std::string(Callee->getName());
auto NumArg = CI->getNumArgOperands();
if (NumArg != 4 && NumArg != 6)
return false;
@@ -594,15 +590,15 @@ bool AMDGPULibCalls::fold_read_write_pipe(CallInst *CI, IRBuilder<> &B,
if (!isa<ConstantInt>(PacketSize) || !isa<ConstantInt>(PacketAlign))
return false;
unsigned Size = cast<ConstantInt>(PacketSize)->getZExtValue();
- unsigned Align = cast<ConstantInt>(PacketAlign)->getZExtValue();
- if (Size != Align || !isPowerOf2_32(Size))
+ Align Alignment = cast<ConstantInt>(PacketAlign)->getAlignValue();
+ if (Alignment != Size)
return false;
Type *PtrElemTy;
if (Size <= 8)
PtrElemTy = Type::getIntNTy(Ctx, Size * 8);
else
- PtrElemTy = VectorType::get(Type::getInt64Ty(Ctx), Size / 8);
+ PtrElemTy = FixedVectorType::get(Type::getInt64Ty(Ctx), Size / 8);
unsigned PtrArgLoc = CI->getNumArgOperands() - 3;
auto PtrArg = CI->getArgOperand(PtrArgLoc);
unsigned PtrArgAS = PtrArg->getType()->getPointerAddressSpace();
@@ -1130,8 +1126,8 @@ bool AMDGPULibCalls::fold_pow(CallInst *CI, IRBuilder<> &B,
Type* rTy = opr0->getType();
Type* nTyS = eltType->isDoubleTy() ? B.getInt64Ty() : B.getInt32Ty();
Type *nTy = nTyS;
- if (const VectorType *vTy = dyn_cast<VectorType>(rTy))
- nTy = VectorType::get(nTyS, vTy->getNumElements());
+ if (const auto *vTy = dyn_cast<FixedVectorType>(rTy))
+ nTy = FixedVectorType::get(nTyS, vTy);
unsigned size = nTy->getScalarSizeInBits();
opr_n = CI->getArgOperand(1);
if (opr_n->getType()->isIntegerTy())
@@ -1420,8 +1416,8 @@ AllocaInst* AMDGPULibCalls::insertAlloca(CallInst *UI, IRBuilder<> &B,
B.SetInsertPoint(&*ItNew);
AllocaInst *Alloc = B.CreateAlloca(RetType, 0,
std::string(prefix) + UI->getName());
- Alloc->setAlignment(MaybeAlign(
- UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
+ Alloc->setAlignment(
+ Align(UCallee->getParent()->getDataLayout().getTypeAllocSize(RetType)));
return Alloc;
}
@@ -1711,35 +1707,14 @@ bool AMDGPULibCalls::evaluateCall(CallInst *aCI, FuncInfo &FInfo) {
}
// Public interface to the Simplify LibCalls pass.
-FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetOptions &Opt,
- const TargetMachine *TM) {
- return new AMDGPUSimplifyLibCalls(Opt, TM);
+FunctionPass *llvm::createAMDGPUSimplifyLibCallsPass(const TargetMachine *TM) {
+ return new AMDGPUSimplifyLibCalls(TM);
}
FunctionPass *llvm::createAMDGPUUseNativeCallsPass() {
return new AMDGPUUseNativeCalls();
}
-static bool setFastFlags(Function &F, const TargetOptions &Options) {
- AttrBuilder B;
-
- if (Options.UnsafeFPMath || Options.NoInfsFPMath)
- B.addAttribute("no-infs-fp-math", "true");
- if (Options.UnsafeFPMath || Options.NoNaNsFPMath)
- B.addAttribute("no-nans-fp-math", "true");
- if (Options.UnsafeFPMath) {
- B.addAttribute("less-precise-fpmad", "true");
- B.addAttribute("unsafe-fp-math", "true");
- }
-
- if (!B.hasAttributes())
- return false;
-
- F.addAttributes(AttributeList::FunctionIndex, B);
-
- return true;
-}
-
bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
@@ -1750,15 +1725,14 @@ bool AMDGPUSimplifyLibCalls::runOnFunction(Function &F) {
LLVM_DEBUG(dbgs() << "AMDIC: process function ";
F.printAsOperand(dbgs(), false, F.getParent()); dbgs() << '\n';);
- if (!EnablePreLink)
- Changed |= setFastFlags(F, Options);
-
for (auto &BB : F) {
for (BasicBlock::iterator I = BB.begin(), E = BB.end(); I != E; ) {
// Ignore non-calls.
CallInst *CI = dyn_cast<CallInst>(I);
++I;
- if (!CI) continue;
+ // Ignore intrinsics that do not become real instructions.
+ if (!CI || isa<DbgInfoIntrinsic>(CI) || CI->isLifetimeStartOrEnd())
+ continue;
// Ignore indirect calls.
Function *Callee = CI->getCalledFunction();
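
A side note on the read_pipe/write_pipe hunk above: once the packet alignment is read back as an Align, the old isPowerOf2_32(Size) test becomes redundant, because Align can only represent powers of two. A minimal standalone sketch of that invariant, assuming only llvm/Support/Alignment.h (the values are illustrative):

#include "llvm/Support/Alignment.h"
#include <cassert>
using namespace llvm;

static void alignSketch() {
  Align A(16);                  // Align only holds powers of two;
                                // Align(24) would assert at construction.
  assert(A == 16);              // comparison against a raw byte count works
  assert(alignTo(10, A) == 16); // round 10 up to the next 16-byte boundary
}
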
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
index e1ae496d9cbc..2b5143ba7506 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.cpp
@@ -10,17 +10,18 @@
//
//===----------------------------------------------------------------------===//
-#include "AMDGPU.h"
#include "AMDGPULibFunc.h"
-#include <llvm/ADT/SmallString.h>
-#include <llvm/ADT/SmallVector.h>
-#include <llvm/ADT/StringSwitch.h>
+#include "AMDGPU.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueSymbolTable.h"
-#include <llvm/Support/raw_ostream.h>
+#include "llvm/Support/raw_ostream.h"
#include <string>
using namespace llvm;
@@ -479,8 +480,6 @@ static bool eatTerm(StringRef& mangledName, const char (&str)[N]) {
return false;
}
-static inline bool isDigit(char c) { return c >= '0' && c <= '9'; }
-
static int eatNumber(StringRef& s) {
size_t const savedSize = s.size();
int n = 0;
@@ -605,7 +604,7 @@ bool ItaniumParamParser::parseItaniumParam(StringRef& param,
// parse type
char const TC = param.front();
- if (::isDigit(TC)) {
+ if (isDigit(TC)) {
res.ArgType = StringSwitch<AMDGPULibFunc::EType>
(eatLengthPrefixedName(param))
.Case("ocl_image1darray" , AMDGPULibFunc::IMG1DA)
@@ -863,7 +862,7 @@ std::string AMDGPUMangledLibFunc::mangleNameItanium() const {
Param P;
while ((P = I.getNextParam()).ArgType != 0)
Mangler(S, P);
- return S.str();
+ return std::string(S.str());
}
///////////////////////////////////////////////////////////////////////////////
@@ -903,7 +902,7 @@ static Type* getIntrinsicParamType(
return nullptr;
}
if (P.VectorSize > 1)
- T = VectorType::get(T, P.VectorSize);
+ T = FixedVectorType::get(T, P.VectorSize);
if (P.PtrKind != AMDGPULibFunc::BYVALUE)
T = useAddrSpace ? T->getPointerTo((P.PtrKind & AMDGPULibFunc::ADDR_SPACE)
- 1)
@@ -936,7 +935,7 @@ std::string AMDGPUMangledLibFunc::getName() const {
SmallString<128> Buf;
raw_svector_ostream OS(Buf);
writeName(OS);
- return OS.str();
+ return std::string(OS.str());
}
Function *AMDGPULibFunc::getFunction(Module *M, const AMDGPULibFunc &fInfo) {
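
Two mechanical migrations recur throughout this file: strings built from StringRef or raw_svector_ostream are wrapped in an explicit std::string(...), and fixed-width vectors are created via FixedVectorType::get. A hedged sketch of both, assuming standard LLVM headers (the function and variable names are illustrative):

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <string>
using namespace llvm;

static void migrationSketch(LLVMContext &Ctx) {
  // StringRef's implicit conversion to std::string is being phased out, so
  // the conversion is spelled out explicitly.
  StringRef SR = "ocl_image1darray";
  std::string S = std::string(SR);
  (void)S;

  // Fixed-width vectors are built through FixedVectorType, leaving
  // VectorType as the common base of fixed and scalable vectors.
  auto *V4I32 = FixedVectorType::get(Type::getInt32Ty(Ctx), 4);
  (void)V4I32;
}
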
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
index 2354ed7df205..c97223b047e8 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPULibFunc.h
@@ -13,6 +13,7 @@
namespace llvm {
+class FunctionCallee;
class FunctionType;
class Function;
class Module;
@@ -341,7 +342,7 @@ public:
/// and unmangled function name for unmangled library functions.
virtual std::string mangle() const = 0;
- void setName(StringRef N) { Name = N; }
+ void setName(StringRef N) { Name = std::string(N); }
void setPrefix(ENamePrefix pfx) { FKind = pfx; }
virtual FunctionType *getFunctionType(Module &M) const = 0;
@@ -438,7 +439,7 @@ class AMDGPUUnmangledLibFunc : public AMDGPULibFuncImpl {
public:
explicit AMDGPUUnmangledLibFunc();
explicit AMDGPUUnmangledLibFunc(StringRef FName, FunctionType *FT) {
- Name = FName;
+ Name = std::string(FName);
FuncTy = FT;
}
std::string getName() const override { return Name; }
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
index 15032969890e..54c15e4e4d39 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerIntrinsics.cpp
@@ -22,7 +22,15 @@ using namespace llvm;
namespace {
-const unsigned MaxStaticSize = 1024;
+static int MaxStaticSize;
+
+static cl::opt<int, true> MemIntrinsicExpandSizeThresholdOpt(
+ "amdgpu-mem-intrinsic-expand-size",
+ cl::desc("Set minimum mem intrinsic size to expand in IR"),
+ cl::location(MaxStaticSize),
+ cl::init(1024),
+ cl::Hidden);
+
class AMDGPULowerIntrinsics : public ModulePass {
private:
@@ -57,7 +65,7 @@ INITIALIZE_PASS(AMDGPULowerIntrinsics, DEBUG_TYPE, "Lower intrinsics", false,
// require splitting based on alignment)
static bool shouldExpandOperationWithSize(Value *Size) {
ConstantInt *CI = dyn_cast<ConstantInt>(Size);
- return !CI || (CI->getZExtValue() > MaxStaticSize);
+ return !CI || (CI->getSExtValue() > MaxStaticSize);
}
bool AMDGPULowerIntrinsics::expandMemIntrinsicUses(Function &F) {
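
The hunk above replaces the hard-coded 1024-byte threshold with a hidden cl::opt backed by external storage. A minimal sketch of the cl::location pattern it uses; the option name "example-expand-size" and the helper below are hypothetical, while the in-tree flag is -amdgpu-mem-intrinsic-expand-size:

#include "llvm/Support/CommandLine.h"
using namespace llvm;

static int ExpandSizeThreshold;

static cl::opt<int, true> ExpandSizeThresholdOpt(
    "example-expand-size",
    cl::desc("Minimum mem intrinsic size to expand in IR"),
    cl::location(ExpandSizeThreshold), // writes into the plain int above
    cl::init(1024), cl::Hidden);

static bool shouldExpand(int Size) {
  // The pass only consults the int; the option machinery stays out of the
  // hot path.
  return Size > ExpandSizeThreshold;
}

Assuming the usual cl::opt behaviour, a cl::Hidden option is still accepted on the command line (e.g. llc -amdgpu-mem-intrinsic-expand-size=4096); it is only omitted from -help.
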
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
index e64542a395f0..62ab5bb55a16 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp
@@ -58,6 +58,21 @@ public:
} // end anonymous namespace
+// Skip past any static allocas at the start of the entry block.

+static BasicBlock::iterator getInsertPt(BasicBlock &BB) {
+ BasicBlock::iterator InsPt = BB.getFirstInsertionPt();
+ for (BasicBlock::iterator E = BB.end(); InsPt != E; ++InsPt) {
+ AllocaInst *AI = dyn_cast<AllocaInst>(&*InsPt);
+
+ // If this is a dynamic alloca, the value may depend on the loaded kernargs,
+ // so loads will need to be inserted before it.
+ if (!AI || !AI->isStaticAlloca())
+ break;
+ }
+
+ return InsPt;
+}
+
bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
CallingConv::ID CC = F.getCallingConv();
if (CC != CallingConv::AMDGPU_KERNEL || F.arg_empty())
@@ -70,7 +85,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
LLVMContext &Ctx = F.getParent()->getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
BasicBlock &EntryBlock = *F.begin();
- IRBuilder<> Builder(&*EntryBlock.begin());
+ IRBuilder<> Builder(&*getInsertPt(EntryBlock));
const Align KernArgBaseAlign(16); // FIXME: Increase if necessary
const uint64_t BaseOffset = ST.getExplicitKernelArgOffset(F);
@@ -94,7 +109,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
for (Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- unsigned ABITypeAlign = DL.getABITypeAlignment(ArgTy);
+ Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
unsigned Size = DL.getTypeSizeInBits(ArgTy);
unsigned AllocSize = DL.getTypeAllocSize(ArgTy);
@@ -120,7 +135,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
continue;
}
- VectorType *VT = dyn_cast<VectorType>(ArgTy);
+ auto *VT = dyn_cast<FixedVectorType>(ArgTy);
bool IsV3 = VT && VT->getNumElements() == 3;
bool DoShiftOpt = Size < 32 && !ArgTy->isAggregateType();
@@ -152,7 +167,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
}
if (IsV3 && Size >= 32) {
- V4Ty = VectorType::get(VT->getVectorElementType(), 4);
+ V4Ty = FixedVectorType::get(VT->getElementType(), 4);
// Use the hack that clang uses to avoid SelectionDAG ruining v3 loads
AdjustedArgTy = V4Ty;
}
@@ -160,7 +175,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
ArgPtr = Builder.CreateBitCast(ArgPtr, AdjustedArgTy->getPointerTo(AS),
ArgPtr->getName() + ".cast");
LoadInst *Load =
- Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign.value());
+ Builder.CreateAlignedLoad(AdjustedArgTy, ArgPtr, AdjustedAlign);
Load->setMetadata(LLVMContext::MD_invariant_load, MDNode::get(Ctx, {}));
MDBuilder MDB(Ctx);
@@ -210,7 +225,7 @@ bool AMDGPULowerKernelArguments::runOnFunction(Function &F) {
Arg.replaceAllUsesWith(NewVal);
} else if (IsV3) {
Value *Shuf = Builder.CreateShuffleVector(Load, UndefValue::get(V4Ty),
- {0, 1, 2},
+ ArrayRef<int>{0, 1, 2},
Arg.getName() + ".load");
Arg.replaceAllUsesWith(Shuf);
} else {
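
The kernel-argument lowering above moves to the Align-typed DataLayout and IRBuilder APIs. A hedged sketch of that API shape, assuming an existing builder, data layout, type and pointer from the surrounding pass (names are illustrative):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/IRBuilder.h"
using namespace llvm;

static LoadInst *loadArgSketch(IRBuilder<> &B, const DataLayout &DL,
                               Type *ArgTy, Value *ArgPtr) {
  Align ABIAlign = DL.getABITypeAlign(ArgTy); // replaces getABITypeAlignment
  // CreateAlignedLoad accepts the Align directly; no .value() round trip.
  return B.CreateAlignedLoad(ArgTy, ArgPtr, ABIAlign);
}
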
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
index ce7286dabcc8..99d229c9b74e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMCInstLower.cpp
@@ -254,7 +254,7 @@ const MCExpr *AMDGPUAsmPrinter::lowerConstant(const Constant *CV) {
return AsmPrinter::lowerConstant(CV);
}
-void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void AMDGPUAsmPrinter::emitInstruction(const MachineInstr *MI) {
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -272,7 +272,7 @@ void AMDGPUAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
while (I != MBB->instr_end() && I->isInsideBundle()) {
- EmitInstruction(&*I);
+ emitInstruction(&*I);
++I;
}
} else {
@@ -381,7 +381,7 @@ void R600MCInstLower::lower(const MachineInstr *MI, MCInst &OutMI) const {
}
}
-void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void R600AsmPrinter::emitInstruction(const MachineInstr *MI) {
const R600Subtarget &STI = MF->getSubtarget<R600Subtarget>();
R600MCInstLower MCInstLowering(OutContext, STI, *this);
@@ -396,7 +396,7 @@ void R600AsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MachineBasicBlock *MBB = MI->getParent();
MachineBasicBlock::const_instr_iterator I = ++MI->getIterator();
while (I != MBB->instr_end() && I->isInsideBundle()) {
- EmitInstruction(&*I);
+ emitInstruction(&*I);
++I;
}
} else {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
index 940ddff85d73..64acd6efe028 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.cpp
@@ -15,14 +15,9 @@ using namespace llvm;
AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
MachineFunctionInfo(),
- LocalMemoryObjects(),
- ExplicitKernArgSize(0),
- LDSSize(0),
- Mode(MF.getFunction(), MF.getSubtarget<GCNSubtarget>()),
+ Mode(MF.getFunction()),
IsEntryFunction(AMDGPU::isEntryFunctionCC(MF.getFunction().getCallingConv())),
- NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath),
- MemoryBound(false),
- WaveLimiter(false) {
+ NoSignedZerosFPMath(MF.getTarget().Options.NoSignedZerosFPMath) {
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(MF);
// FIXME: Should initialize KernArgSize based on ExplicitKernelArgOffset,
@@ -43,19 +38,18 @@ AMDGPUMachineFunction::AMDGPUMachineFunction(const MachineFunction &MF) :
}
unsigned AMDGPUMachineFunction::allocateLDSGlobal(const DataLayout &DL,
- const GlobalValue &GV) {
+ const GlobalVariable &GV) {
auto Entry = LocalMemoryObjects.insert(std::make_pair(&GV, 0));
if (!Entry.second)
return Entry.first->second;
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
/// TODO: We should sort these to minimize wasted space due to alignment
/// padding. Currently the padding is decided by the first encountered use
/// during lowering.
- unsigned Offset = LDSSize = alignTo(LDSSize, Align);
+ unsigned Offset = LDSSize = alignTo(LDSSize, Alignment);
Entry.first->second = Offset;
LDSSize += DL.getTypeAllocSize(GV.getValueType());
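
A worked example of the LDS offset computation in allocateLDSGlobal above (standalone C++; the sizes are illustrative, not taken from a real kernel): with 10 bytes already allocated and an 8-byte-aligned i64 global, the global lands at offset 16 and LDSSize advances to 24.

#include <cassert>
#include <cstdint>

// alignTo as used above: round Size up to the next multiple of Alignment.
static uint64_t alignToExample(uint64_t Size, uint64_t Alignment) {
  return (Size + Alignment - 1) / Alignment * Alignment;
}

int main() {
  uint64_t LDSSize = 10;                                   // bytes in use
  uint64_t Offset = LDSSize = alignToExample(LDSSize, 8);  // pad for an i64
  LDSSize += 8;                                            // alloc size of i64
  assert(Offset == 16 && LDSSize == 24);
  return 0;
}
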
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
index 1933e41c66f3..c504dd76bc65 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMachineFunction.h
@@ -23,26 +23,26 @@ class AMDGPUMachineFunction : public MachineFunctionInfo {
SmallDenseMap<const GlobalValue *, unsigned, 4> LocalMemoryObjects;
protected:
- uint64_t ExplicitKernArgSize; // Cache for this.
+ uint64_t ExplicitKernArgSize = 0; // Cache for this.
Align MaxKernArgAlign; // Cache for this.
/// Number of bytes in the LDS that are being used.
- unsigned LDSSize;
+ unsigned LDSSize = 0;
// State of MODE register, assumed FP mode.
AMDGPU::SIModeRegisterDefaults Mode;
// Kernels + shaders. i.e. functions called by the driver and not called
// by other functions.
- bool IsEntryFunction;
+ bool IsEntryFunction = false;
- bool NoSignedZerosFPMath;
+ bool NoSignedZerosFPMath = false;
// Function may be memory bound.
- bool MemoryBound;
+ bool MemoryBound = false;
// Kernel may need limited waves per EU for better performance.
- bool WaveLimiter;
+ bool WaveLimiter = false;
public:
AMDGPUMachineFunction(const MachineFunction &MF);
@@ -77,7 +77,7 @@ public:
return WaveLimiter;
}
- unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalValue &GV);
+ unsigned allocateLDSGlobal(const DataLayout &DL, const GlobalVariable &GV);
};
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
index 8c11230f411a..b05855d1afc6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUMacroFusion.cpp
@@ -34,6 +34,7 @@ static bool shouldScheduleAdjacent(const TargetInstrInfo &TII_,
switch (SecondMI.getOpcode()) {
case AMDGPU::V_ADDC_U32_e64:
case AMDGPU::V_SUBB_U32_e64:
+ case AMDGPU::V_SUBBREV_U32_e64:
case AMDGPU::V_CNDMASK_B32_e64: {
// Try to cluster defs of condition registers to their uses. This improves
// the chance VCC will be available which will allow shrinking to VOP2
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
index f7231471c107..4f9ffa11bc73 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUOpenCLEnqueuedBlockLowering.cpp
@@ -33,6 +33,7 @@
#include "AMDGPU.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
index 9613d5a843b3..93079738ef99 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPerfHintAnalysis.cpp
@@ -28,6 +28,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueMap.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -220,9 +221,8 @@ AMDGPUPerfHintAnalysis::FuncInfo *AMDGPUPerfHint::visit(const Function &F) {
++FI.InstCount;
continue;
}
- CallSite CS(const_cast<Instruction *>(&I));
- if (CS) {
- Function *Callee = CS.getCalledFunction();
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration()) {
++FI.InstCount;
continue;
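
The CallSite -> CallBase change above follows the tree-wide removal of the CallSite wrapper. A hedged sketch of the replacement pattern (the helper name is illustrative):

#include "llvm/IR/Function.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

static const Function *calleeOrNull(const Instruction &I) {
  // CallBase covers call, invoke and callbr; getCalledFunction() returns
  // nullptr for indirect calls, just as the old CallSite wrapper did.
  if (const auto *CB = dyn_cast<CallBase>(&I))
    return CB->getCalledFunction();
  return nullptr;
}
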
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
new file mode 100644
index 000000000000..098b0e993886
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp
@@ -0,0 +1,359 @@
+//=== lib/Target/AMDGPU/AMDGPUPostLegalizerCombiner.cpp -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after the legalizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-postlegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+struct FMinFMaxLegacyInfo {
+ Register LHS;
+ Register RHS;
+ Register True;
+ Register False;
+ CmpInst::Predicate Pred;
+};
+
+// TODO: Make sure fmin_legacy/fmax_legacy don't canonicalize
+static bool matchFMinFMaxLegacy(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF, FMinFMaxLegacyInfo &Info) {
+ // FIXME: Combines should have subtarget predicates, and we shouldn't need
+ // this here.
+ if (!MF.getSubtarget<GCNSubtarget>().hasFminFmaxLegacy())
+ return false;
+
+ // FIXME: Type predicate on pattern
+ if (MRI.getType(MI.getOperand(0).getReg()) != LLT::scalar(32))
+ return false;
+
+ Register Cond = MI.getOperand(1).getReg();
+ if (!MRI.hasOneNonDBGUse(Cond) ||
+ !mi_match(Cond, MRI,
+ m_GFCmp(m_Pred(Info.Pred), m_Reg(Info.LHS), m_Reg(Info.RHS))))
+ return false;
+
+ Info.True = MI.getOperand(2).getReg();
+ Info.False = MI.getOperand(3).getReg();
+
+ if (!(Info.LHS == Info.True && Info.RHS == Info.False) &&
+ !(Info.LHS == Info.False && Info.RHS == Info.True))
+ return false;
+
+ switch (Info.Pred) {
+ case CmpInst::FCMP_FALSE:
+ case CmpInst::FCMP_OEQ:
+ case CmpInst::FCMP_ONE:
+ case CmpInst::FCMP_ORD:
+ case CmpInst::FCMP_UNO:
+ case CmpInst::FCMP_UEQ:
+ case CmpInst::FCMP_UNE:
+ case CmpInst::FCMP_TRUE:
+ return false;
+ default:
+ return true;
+ }
+}
+
+static void applySelectFCmpToFMinToFMaxLegacy(MachineInstr &MI,
+ const FMinFMaxLegacyInfo &Info) {
+
+ auto buildNewInst = [&MI](unsigned Opc, Register X, Register Y) {
+ MachineIRBuilder MIB(MI);
+ MIB.buildInstr(Opc, {MI.getOperand(0)}, {X, Y}, MI.getFlags());
+ };
+
+ switch (Info.Pred) {
+ case CmpInst::FCMP_ULT:
+ case CmpInst::FCMP_ULE:
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+ break;
+ case CmpInst::FCMP_OLE:
+ case CmpInst::FCMP_OLT: {
+ // We need to permute the operands to get the correct NaN behavior. The
+ // selected operand is the second one based on the failing compare with NaN,
+ // so permute it based on the compare type the hardware uses.
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+ break;
+ }
+ case CmpInst::FCMP_UGE:
+ case CmpInst::FCMP_UGT: {
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.RHS, Info.LHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.LHS, Info.RHS);
+ break;
+ }
+ case CmpInst::FCMP_OGT:
+ case CmpInst::FCMP_OGE: {
+ if (Info.LHS == Info.True)
+ buildNewInst(AMDGPU::G_AMDGPU_FMAX_LEGACY, Info.LHS, Info.RHS);
+ else
+ buildNewInst(AMDGPU::G_AMDGPU_FMIN_LEGACY, Info.RHS, Info.LHS);
+ break;
+ }
+ default:
+ llvm_unreachable("predicate should not have matched");
+ }
+
+ MI.eraseFromParent();
+}
+
+static bool matchUCharToFloat(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF, CombinerHelper &Helper) {
+ Register DstReg = MI.getOperand(0).getReg();
+
+ // TODO: We could try to match extracting the higher bytes, which would be
+ // easier if i8 vectors weren't promoted to i32 vectors, particularly after
+ // types are legalized. v4i8 -> v4f32 is probably the only case to worry
+ // about in practice.
+ LLT Ty = MRI.getType(DstReg);
+ if (Ty == LLT::scalar(32) || Ty == LLT::scalar(16)) {
+ Register SrcReg = MI.getOperand(1).getReg();
+ unsigned SrcSize = MRI.getType(SrcReg).getSizeInBits();
+ assert(SrcSize == 16 || SrcSize == 32 || SrcSize == 64);
+ const APInt Mask = APInt::getHighBitsSet(SrcSize, SrcSize - 8);
+ return Helper.getKnownBits()->maskedValueIsZero(SrcReg, Mask);
+ }
+
+ return false;
+}
+
+static void applyUCharToFloat(MachineInstr &MI) {
+ MachineIRBuilder B(MI);
+
+ const LLT S32 = LLT::scalar(32);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+ LLT Ty = B.getMRI()->getType(DstReg);
+ LLT SrcTy = B.getMRI()->getType(SrcReg);
+ if (SrcTy != S32)
+ SrcReg = B.buildAnyExtOrTrunc(S32, SrcReg).getReg(0);
+
+ if (Ty == S32) {
+ B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {DstReg},
+ {SrcReg}, MI.getFlags());
+ } else {
+ auto Cvt0 = B.buildInstr(AMDGPU::G_AMDGPU_CVT_F32_UBYTE0, {S32},
+ {SrcReg}, MI.getFlags());
+ B.buildFPTrunc(DstReg, Cvt0, MI.getFlags());
+ }
+
+ MI.eraseFromParent();
+}
+
+// FIXME: Should be able to have 2 separate matchdatas rather than custom struct
+// boilerplate.
+struct CvtF32UByteMatchInfo {
+ Register CvtVal;
+ unsigned ShiftOffset;
+};
+
+static bool matchCvtF32UByteN(MachineInstr &MI, MachineRegisterInfo &MRI,
+ MachineFunction &MF,
+ CvtF32UByteMatchInfo &MatchInfo) {
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ // Look through G_ZEXT.
+ mi_match(SrcReg, MRI, m_GZExt(m_Reg(SrcReg)));
+
+ Register Src0;
+ int64_t ShiftAmt;
+ bool IsShr = mi_match(SrcReg, MRI, m_GLShr(m_Reg(Src0), m_ICst(ShiftAmt)));
+ if (IsShr || mi_match(SrcReg, MRI, m_GShl(m_Reg(Src0), m_ICst(ShiftAmt)))) {
+ const unsigned Offset = MI.getOpcode() - AMDGPU::G_AMDGPU_CVT_F32_UBYTE0;
+
+ unsigned ShiftOffset = 8 * Offset;
+ if (IsShr)
+ ShiftOffset += ShiftAmt;
+ else
+ ShiftOffset -= ShiftAmt;
+
+ MatchInfo.CvtVal = Src0;
+ MatchInfo.ShiftOffset = ShiftOffset;
+ return ShiftOffset < 32 && ShiftOffset >= 8 && (ShiftOffset % 8) == 0;
+ }
+
+ // TODO: Simplify demanded bits.
+ return false;
+}
+
+static void applyCvtF32UByteN(MachineInstr &MI,
+ const CvtF32UByteMatchInfo &MatchInfo) {
+ MachineIRBuilder B(MI);
+ unsigned NewOpc = AMDGPU::G_AMDGPU_CVT_F32_UBYTE0 + MatchInfo.ShiftOffset / 8;
+
+ const LLT S32 = LLT::scalar(32);
+ Register CvtSrc = MatchInfo.CvtVal;
+ LLT SrcTy = B.getMRI()->getType(MatchInfo.CvtVal);
+ if (SrcTy != S32) {
+ assert(SrcTy.isScalar() && SrcTy.getSizeInBits() >= 8);
+ CvtSrc = B.buildAnyExt(S32, CvtSrc).getReg(0);
+ }
+
+ assert(MI.getOpcode() != NewOpc);
+ B.buildInstr(NewOpc, {MI.getOperand(0)}, {CvtSrc}, MI.getFlags());
+ MI.eraseFromParent();
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPUPostLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenPostLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AMDGPUPostLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ const AMDGPULegalizerInfo *LI,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AMDGPUPostLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUGenPostLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_SHL:
+ case TargetOpcode::G_LSHR:
+ case TargetOpcode::G_ASHR:
+ // On some subtargets, 64-bit shift is a quarter rate instruction. In the
+ // common case, splitting this into a move and a 32-bit shift is faster and
+ // the same code size.
+ return Helper.tryCombineShiftToUnmerge(MI, 32);
+ }
+
+ return false;
+}
+
+#define AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenPostLegalizeGICombiner.inc"
+#undef AMDGPUPOSTLEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPUPostLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPostLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPUPostLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPUPostLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPUPostLegalizerCombiner::AMDGPUPostLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPUPostLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPUPostLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const AMDGPULegalizerInfo *LI
+ = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPUPostLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), LI, KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPUPostLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after legalization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPUPostLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPUPostLegalizeCombiner(bool IsOptNone) {
+ return new AMDGPUPostLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
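
A worked example of the byte-offset arithmetic in matchCvtF32UByteN/applyCvtF32UByteN above (standalone C++; the matched values are illustrative): a G_AMDGPU_CVT_F32_UBYTE0 whose source is (lshr x, 16) actually reads byte 2 of x, so the combine retargets it to ..._UBYTE2 and drops the shift.

#include <cassert>
#include <cstdint>

int main() {
  unsigned Offset = 0;     // the matched convert reads byte 0 (..._UBYTE0)
  int64_t ShiftAmt = 16;   // its source matched (lshr x, 16)
  bool IsShr = true;

  unsigned ShiftOffset = 8 * Offset;
  if (IsShr)
    ShiftOffset += ShiftAmt;
  else
    ShiftOffset -= ShiftAmt;

  // The combine only fires for whole, in-range bytes.
  assert(ShiftOffset < 32 && ShiftOffset >= 8 && ShiftOffset % 8 == 0);

  // The apply step selects UBYTE0 + ShiftOffset / 8, i.e. ..._UBYTE2 here,
  // reading byte 2 of x directly.
  assert(ShiftOffset / 8 == 2);
  return 0;
}
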
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
new file mode 100644
index 000000000000..800ad2039f0e
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp
@@ -0,0 +1,153 @@
+//=== lib/Target/AMDGPU/AMDGPUPreLegalizerCombiner.cpp --------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// before the legalizer.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-prelegalizer-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPUPreLegalizerCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenPreLegalizerCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AMDGPUPreLegalizerCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ true, /*ShouldLegalizeIllegal*/ false,
+ /*LegalizerInfo*/ nullptr, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+  bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+               MachineIRBuilder &B) const override;
+};
+
+bool AMDGPUPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUGenPreLegalizerCombinerHelper Generated(GeneratedRuleCfg);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ switch (MI.getOpcode()) {
+ case TargetOpcode::G_CONCAT_VECTORS:
+ return Helper.tryCombineConcatVectors(MI);
+ case TargetOpcode::G_SHUFFLE_VECTOR:
+ return Helper.tryCombineShuffleVector(MI);
+ }
+
+ return false;
+}
+
+#define AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenPreLegalizeGICombiner.inc"
+#undef AMDGPUPRELEGALIZERCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPUPreLegalizerCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPUPreLegalizerCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPUPreLegalizerCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPUPreLegalizerCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPUPreLegalizerCombiner::AMDGPUPreLegalizerCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPUPreLegalizerCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPUPreLegalizerCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPUPreLegalizerCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPUPreLegalizerCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs before legalization",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPUPreLegalizerCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs before legalization", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPUPreLegalizeCombiner(bool IsOptNone) {
+ return new AMDGPUPreLegalizerCombiner(IsOptNone);
+}
+} // end namespace llvm
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
index 511de96b5f7c..524a34be876f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPrintfRuntimeBinding.cpp
@@ -218,10 +218,10 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
//
if (ArgSize % DWORD_ALIGN != 0) {
llvm::Type *ResType = llvm::Type::getInt32Ty(Ctx);
- VectorType *LLVMVecType = llvm::dyn_cast<llvm::VectorType>(ArgType);
+ auto *LLVMVecType = llvm::dyn_cast<llvm::FixedVectorType>(ArgType);
int NumElem = LLVMVecType ? LLVMVecType->getNumElements() : 1;
if (LLVMVecType && NumElem > 1)
- ResType = llvm::VectorType::get(ResType, NumElem);
+ ResType = llvm::FixedVectorType::get(ResType, NumElem);
Builder.SetInsertPoint(CI);
Builder.SetCurrentDebugLocation(CI->getDebugLoc());
if (OpConvSpecifiers[ArgCount - 1] == 'x' ||
@@ -387,9 +387,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
Value *id_gep_cast =
new BitCastInst(BufferIdx, idPointer, "PrintBuffIdCast", Brnch);
- StoreInst *stbuff =
- new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast);
- stbuff->insertBefore(Brnch); // to Remove unused variable warning
+ new StoreInst(ConstantInt::get(I32Ty, UniqID), id_gep_cast, Brnch);
SmallVector<Value *, 2> FourthIdxList;
ConstantInt *fourInt =
@@ -408,8 +406,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
Value *Arg = CI->getArgOperand(ArgCount);
Type *ArgType = Arg->getType();
SmallVector<Value *, 32> WhatToStore;
- if (ArgType->isFPOrFPVectorTy() &&
- (ArgType->getTypeID() != Type::VectorTyID)) {
+ if (ArgType->isFPOrFPVectorTy() && !isa<VectorType>(ArgType)) {
Type *IType = (ArgType->isFloatTy()) ? Int32Ty : Int64Ty;
if (OpConvSpecifiers[ArgCount - 1] == 'f') {
ConstantFP *fpCons = dyn_cast<ConstantFP>(Arg);
@@ -478,18 +475,14 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
Arg = new PtrToIntInst(Arg, DstType, "PrintArgPtr", Brnch);
WhatToStore.push_back(Arg);
}
- } else if (ArgType->getTypeID() == Type::VectorTyID) {
+ } else if (isa<FixedVectorType>(ArgType)) {
Type *IType = NULL;
- uint32_t EleCount = cast<VectorType>(ArgType)->getNumElements();
+ uint32_t EleCount = cast<FixedVectorType>(ArgType)->getNumElements();
uint32_t EleSize = ArgType->getScalarSizeInBits();
uint32_t TotalSize = EleCount * EleSize;
if (EleCount == 3) {
- IntegerType *Int32Ty = Type::getInt32Ty(ArgType->getContext());
- Constant *Indices[4] = {
- ConstantInt::get(Int32Ty, 0), ConstantInt::get(Int32Ty, 1),
- ConstantInt::get(Int32Ty, 2), ConstantInt::get(Int32Ty, 2)};
- Constant *Mask = ConstantVector::get(Indices);
- ShuffleVectorInst *Shuffle = new ShuffleVectorInst(Arg, Arg, Mask);
+ ShuffleVectorInst *Shuffle =
+ new ShuffleVectorInst(Arg, Arg, ArrayRef<int>{0, 1, 2, 2});
Shuffle->insertBefore(Brnch);
Arg = Shuffle;
ArgType = Arg->getType();
@@ -523,7 +516,7 @@ bool AMDGPUPrintfRuntimeBinding::lowerPrintfForGpu(
break;
}
if (EleCount > 1) {
- IType = dyn_cast<Type>(VectorType::get(IType, EleCount));
+ IType = FixedVectorType::get(IType, EleCount);
}
Arg = new BitCastInst(Arg, IType, "PrintArgVect", Brnch);
WhatToStore.push_back(Arg);
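
The 3-element vector case above now builds its widening shuffle with the ArrayRef<int> mask form of ShuffleVectorInst instead of assembling a ConstantVector mask by hand. A hedged sketch of that form (the helper name is illustrative; it assumes Arg is an existing 3-element vector value and InsertBefore a valid insertion point):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

static Value *widenV3ToV4(Value *Arg, Instruction *InsertBefore) {
  // Mask {0, 1, 2, 2} repeats the last lane, widening the 3-element vector
  // to 4 elements; this is what the removed ConstantVector mask spelled out.
  auto *Shuffle = new ShuffleVectorInst(Arg, Arg, ArrayRef<int>{0, 1, 2, 2});
  Shuffle->insertBefore(InsertBefore);
  return Shuffle;
}
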
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 14958a180ce3..727f71b35049 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -76,6 +76,11 @@ static cl::opt<bool> DisablePromoteAllocaToLDS(
cl::desc("Disable promote alloca to LDS"),
cl::init(false));
+static cl::opt<unsigned> PromoteAllocaToVectorLimit(
+ "amdgpu-promote-alloca-to-vector-limit",
+ cl::desc("Maximum byte size to consider promote alloca to vector"),
+ cl::init(0));
+
// FIXME: This can create globals so should be a module pass.
class AMDGPUPromoteAlloca : public FunctionPass {
private:
@@ -86,6 +91,7 @@ private:
// FIXME: This should be per-kernel.
uint32_t LocalMemLimit = 0;
uint32_t CurrentLocalMemUsage = 0;
+ unsigned MaxVGPRs;
bool IsAMDGCN = false;
bool IsAMDHSA = false;
@@ -128,14 +134,42 @@ public:
}
};
+class AMDGPUPromoteAllocaToVector : public FunctionPass {
+private:
+ unsigned MaxVGPRs;
+
+public:
+ static char ID;
+
+ AMDGPUPromoteAllocaToVector() : FunctionPass(ID) {}
+
+ bool runOnFunction(Function &F) override;
+
+ StringRef getPassName() const override {
+ return "AMDGPU Promote Alloca to vector";
+ }
+
+ bool handleAlloca(AllocaInst &I);
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+};
+
} // end anonymous namespace
char AMDGPUPromoteAlloca::ID = 0;
+char AMDGPUPromoteAllocaToVector::ID = 0;
INITIALIZE_PASS(AMDGPUPromoteAlloca, DEBUG_TYPE,
"AMDGPU promote alloca to vector or LDS", false, false)
+INITIALIZE_PASS(AMDGPUPromoteAllocaToVector, DEBUG_TYPE "-to-vector",
+ "AMDGPU promote alloca to vector", false, false)
+
char &llvm::AMDGPUPromoteAllocaID = AMDGPUPromoteAlloca::ID;
+char &llvm::AMDGPUPromoteAllocaToVectorID = AMDGPUPromoteAllocaToVector::ID;
bool AMDGPUPromoteAlloca::doInitialization(Module &M) {
Mod = &M;
@@ -161,6 +195,13 @@ bool AMDGPUPromoteAlloca::runOnFunction(Function &F) {
if (!ST.isPromoteAllocaEnabled())
return false;
+ if (IsAMDGCN) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
bool SufficientLDS = hasSufficientLocalMem(F);
bool Changed = false;
BasicBlock &EntryBB = *F.begin();
@@ -251,10 +292,10 @@ AMDGPUPromoteAlloca::getLocalSizeYZ(IRBuilder<> &Builder) {
// 32-bit and extract sequence is already present, and it is probably easier
   // to CSE this. The loads should be mergeable later anyway.
Value *GEPXY = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 1);
- LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, 4);
+ LoadInst *LoadXY = Builder.CreateAlignedLoad(I32Ty, GEPXY, Align(4));
Value *GEPZU = Builder.CreateConstInBoundsGEP1_64(I32Ty, CastDispatchPtr, 2);
- LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, 4);
+ LoadInst *LoadZU = Builder.CreateAlignedLoad(I32Ty, GEPZU, Align(4));
MDNode *MD = MDNode::get(Mod->getContext(), None);
LoadXY->setMetadata(LLVMContext::MD_invariant_load, MD);
@@ -297,15 +338,26 @@ Value *AMDGPUPromoteAlloca::getWorkitemID(IRBuilder<> &Builder, unsigned N) {
return CI;
}
-static VectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
- return VectorType::get(ArrayTy->getElementType(),
- ArrayTy->getNumElements());
+static FixedVectorType *arrayTypeToVecType(ArrayType *ArrayTy) {
+ return FixedVectorType::get(ArrayTy->getElementType(),
+ ArrayTy->getNumElements());
+}
+
+static Value *stripBitcasts(Value *V) {
+ while (Instruction *I = dyn_cast<Instruction>(V)) {
+ if (I->getOpcode() != Instruction::BitCast)
+ break;
+ V = I->getOperand(0);
+ }
+ return V;
}
static Value *
calculateVectorIndex(Value *Ptr,
const std::map<GetElementPtrInst *, Value *> &GEPIdx) {
- GetElementPtrInst *GEP = cast<GetElementPtrInst>(Ptr);
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(stripBitcasts(Ptr));
+ if (!GEP)
+ return nullptr;
auto I = GEPIdx.find(GEP);
return I == GEPIdx.end() ? nullptr : I->second;
@@ -327,7 +379,8 @@ static Value* GEPToVectorIndex(GetElementPtrInst *GEP) {
//
// TODO: Check isTriviallyVectorizable for calls and handle other
// instructions.
-static bool canVectorizeInst(Instruction *Inst, User *User) {
+static bool canVectorizeInst(Instruction *Inst, User *User,
+ const DataLayout &DL) {
switch (Inst->getOpcode()) {
case Instruction::Load: {
// Currently only handle the case where the Pointer Operand is a GEP.
@@ -337,7 +390,14 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
LI->getPointerOperandType() == User->getType() &&
isa<VectorType>(LI->getType()))
return true;
- return isa<GetElementPtrInst>(LI->getPointerOperand()) && LI->isSimple();
+
+ Instruction *PtrInst = dyn_cast<Instruction>(LI->getPointerOperand());
+ if (!PtrInst)
+ return false;
+
+ return (PtrInst->getOpcode() == Instruction::GetElementPtr ||
+ PtrInst->getOpcode() == Instruction::BitCast) &&
+ LI->isSimple();
}
case Instruction::BitCast:
return true;
@@ -350,22 +410,46 @@ static bool canVectorizeInst(Instruction *Inst, User *User) {
SI->getPointerOperandType() == User->getType() &&
isa<VectorType>(SI->getValueOperand()->getType()))
return true;
- return (SI->getPointerOperand() == User) && isa<GetElementPtrInst>(User) && SI->isSimple();
+
+ Instruction *UserInst = dyn_cast<Instruction>(User);
+ if (!UserInst)
+ return false;
+
+ return (SI->getPointerOperand() == User) &&
+ (UserInst->getOpcode() == Instruction::GetElementPtr ||
+ UserInst->getOpcode() == Instruction::BitCast) &&
+ SI->isSimple();
}
default:
return false;
}
}
-static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
+static bool tryPromoteAllocaToVector(AllocaInst *Alloca, const DataLayout &DL,
+ unsigned MaxVGPRs) {
if (DisablePromoteAllocaToVector) {
     LLVM_DEBUG(dbgs() << "  Promotion of alloca to vector is disabled\n");
return false;
}
- Type *AT = Alloca->getAllocatedType();
- SequentialType *AllocaTy = dyn_cast<SequentialType>(AT);
+ Type *AllocaTy = Alloca->getAllocatedType();
+ auto *VectorTy = dyn_cast<FixedVectorType>(AllocaTy);
+ if (auto *ArrayTy = dyn_cast<ArrayType>(AllocaTy)) {
+ if (VectorType::isValidElementType(ArrayTy->getElementType()) &&
+ ArrayTy->getNumElements() > 0)
+ VectorTy = arrayTypeToVecType(ArrayTy);
+ }
+
+ // Use up to 1/4 of available register budget for vectorization.
+ unsigned Limit = PromoteAllocaToVectorLimit ? PromoteAllocaToVectorLimit * 8
+ : (MaxVGPRs * 32);
+
+ if (DL.getTypeSizeInBits(AllocaTy) * 4 > Limit) {
+ LLVM_DEBUG(dbgs() << " Alloca too big for vectorization with "
+ << MaxVGPRs << " registers available\n");
+ return false;
+ }
LLVM_DEBUG(dbgs() << "Alloca candidate for vectorization\n");
@@ -373,22 +457,44 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
// are just being conservative for now.
   // FIXME: We also reject allocas of the form [ 2 x [ 2 x i32 ]] or equivalent. Potentially these
   // could also be promoted, but we don't currently handle this case.
- if (!AllocaTy ||
- AllocaTy->getNumElements() > 16 ||
- AllocaTy->getNumElements() < 2 ||
- !VectorType::isValidElementType(AllocaTy->getElementType())) {
+ if (!VectorTy || VectorTy->getNumElements() > 16 ||
+ VectorTy->getNumElements() < 2) {
LLVM_DEBUG(dbgs() << " Cannot convert type to vector\n");
return false;
}
std::map<GetElementPtrInst*, Value*> GEPVectorIdx;
- std::vector<Value*> WorkList;
- for (User *AllocaUser : Alloca->users()) {
+ std::vector<Value *> WorkList;
+ SmallVector<User *, 8> Users(Alloca->users());
+ SmallVector<User *, 8> UseUsers(Users.size(), Alloca);
+ Type *VecEltTy = VectorTy->getElementType();
+ while (!Users.empty()) {
+ User *AllocaUser = Users.pop_back_val();
+ User *UseUser = UseUsers.pop_back_val();
+ Instruction *Inst = dyn_cast<Instruction>(AllocaUser);
+
GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(AllocaUser);
if (!GEP) {
- if (!canVectorizeInst(cast<Instruction>(AllocaUser), Alloca))
+ if (!canVectorizeInst(Inst, UseUser, DL))
return false;
+ if (Inst->getOpcode() == Instruction::BitCast) {
+ Type *FromTy = Inst->getOperand(0)->getType()->getPointerElementType();
+ Type *ToTy = Inst->getType()->getPointerElementType();
+ if (FromTy->isAggregateType() || ToTy->isAggregateType() ||
+ DL.getTypeSizeInBits(FromTy) != DL.getTypeSizeInBits(ToTy))
+ continue;
+
+ for (User *CastUser : Inst->users()) {
+ if (isAssumeLikeIntrinsic(cast<Instruction>(CastUser)))
+ continue;
+ Users.push_back(CastUser);
+ UseUsers.push_back(Inst);
+ }
+
+ continue;
+ }
+
WorkList.push_back(AllocaUser);
continue;
}
@@ -404,18 +510,10 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
}
GEPVectorIdx[GEP] = Index;
- for (User *GEPUser : AllocaUser->users()) {
- if (!canVectorizeInst(cast<Instruction>(GEPUser), AllocaUser))
- return false;
-
- WorkList.push_back(GEPUser);
- }
+ Users.append(GEP->user_begin(), GEP->user_end());
+ UseUsers.append(GEP->getNumUses(), GEP);
}
- VectorType *VectorTy = dyn_cast<VectorType>(AllocaTy);
- if (!VectorTy)
- VectorTy = arrayTypeToVecType(cast<ArrayType>(AllocaTy));
-
LLVM_DEBUG(dbgs() << " Converting alloca to vector " << *AllocaTy << " -> "
<< *VectorTy << '\n');
@@ -424,40 +522,46 @@ static bool tryPromoteAllocaToVector(AllocaInst *Alloca) {
IRBuilder<> Builder(Inst);
switch (Inst->getOpcode()) {
case Instruction::Load: {
- if (Inst->getType() == AT)
+ if (Inst->getType() == AllocaTy || Inst->getType()->isVectorTy())
break;
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = cast<LoadInst>(Inst)->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ if (!Index)
+ break;
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
Value *ExtractElement = Builder.CreateExtractElement(VecValue, Index);
+ if (Inst->getType() != VecEltTy)
+ ExtractElement = Builder.CreateBitOrPointerCast(ExtractElement, Inst->getType());
Inst->replaceAllUsesWith(ExtractElement);
Inst->eraseFromParent();
break;
}
case Instruction::Store: {
StoreInst *SI = cast<StoreInst>(Inst);
- if (SI->getValueOperand()->getType() == AT)
+ if (SI->getValueOperand()->getType() == AllocaTy ||
+ SI->getValueOperand()->getType()->isVectorTy())
break;
- Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *Ptr = SI->getPointerOperand();
Value *Index = calculateVectorIndex(Ptr, GEPVectorIdx);
+ if (!Index)
+ break;
+
+ Type *VecPtrTy = VectorTy->getPointerTo(AMDGPUAS::PRIVATE_ADDRESS);
Value *BitCast = Builder.CreateBitCast(Alloca, VecPtrTy);
Value *VecValue = Builder.CreateLoad(VectorTy, BitCast);
- Value *NewVecValue = Builder.CreateInsertElement(VecValue,
- SI->getValueOperand(),
- Index);
+ Value *Elt = SI->getValueOperand();
+ if (Elt->getType() != VecEltTy)
+ Elt = Builder.CreateBitOrPointerCast(Elt, VecEltTy);
+ Value *NewVecValue = Builder.CreateInsertElement(VecValue, Elt, Index);
Builder.CreateStore(NewVecValue, BitCast);
Inst->eraseFromParent();
break;
}
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast:
- break;
default:
llvm_unreachable("Inconsistency in instructions promotable to vector");
@@ -659,16 +763,15 @@ bool AMDGPUPromoteAlloca::hasSufficientLocalMem(const Function &F) {
continue;
if (Use->getParent()->getParent() == &F) {
- unsigned Align = GV.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(GV.getValueType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV.getAlign(), GV.getValueType());
// FIXME: Try to account for padding here. The padding is currently
// determined from the inverse order of uses in the function. I'm not
// sure if the use list order is in any way connected to this, so the
// total reported size is likely incorrect.
uint64_t AllocSize = DL.getTypeAllocSize(GV.getValueType());
- CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Align);
+ CurrentLocalMemUsage = alignTo(CurrentLocalMemUsage, Alignment);
CurrentLocalMemUsage += AllocSize;
break;
}
@@ -722,6 +825,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
if (!I.isStaticAlloca() || I.isArrayAllocation())
return false;
+ const DataLayout &DL = Mod->getDataLayout();
IRBuilder<> Builder(&I);
// First try to replace the alloca with a vector
@@ -729,7 +833,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
- if (tryPromoteAllocaToVector(&I))
+ if (tryPromoteAllocaToVector(&I, DL, MaxVGPRs))
return true; // Promoted to vector.
if (DisablePromoteAllocaToLDS)
@@ -759,11 +863,8 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, ContainingFunction);
unsigned WorkGroupSize = ST.getFlatWorkGroupSizes(ContainingFunction).second;
- const DataLayout &DL = Mod->getDataLayout();
-
- unsigned Align = I.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(I.getAllocatedType());
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(I.getAlign(), I.getAllocatedType());
// FIXME: This computed padding is likely wrong since it depends on inverse
// usage order.
@@ -771,7 +872,7 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
// FIXME: It is also possible that if we're allowed to use all of the memory
   // we could end up using more than the maximum due to alignment padding.
- uint32_t NewSize = alignTo(CurrentLocalMemUsage, Align);
+ uint32_t NewSize = alignTo(CurrentLocalMemUsage, Alignment);
uint32_t AllocSize = WorkGroupSize * DL.getTypeAllocSize(AllocaTy);
NewSize += AllocSize;
@@ -938,6 +1039,60 @@ bool AMDGPUPromoteAlloca::handleAlloca(AllocaInst &I, bool SufficientLDS) {
return true;
}
+bool AMDGPUPromoteAllocaToVector::runOnFunction(Function &F) {
+ if (skipFunction(F) || DisablePromoteAllocaToVector)
+ return false;
+
+ const TargetMachine *TM;
+ if (auto *TPC = getAnalysisIfAvailable<TargetPassConfig>())
+ TM = &TPC->getTM<TargetMachine>();
+ else
+ return false;
+
+ const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(*TM, F);
+ if (!ST.isPromoteAllocaEnabled())
+ return false;
+
+ if (TM->getTargetTriple().getArch() == Triple::amdgcn) {
+ const GCNSubtarget &ST = TM->getSubtarget<GCNSubtarget>(F);
+ MaxVGPRs = ST.getMaxNumVGPRs(ST.getWavesPerEU(F).first);
+ } else {
+ MaxVGPRs = 128;
+ }
+
+ bool Changed = false;
+ BasicBlock &EntryBB = *F.begin();
+
+ SmallVector<AllocaInst *, 16> Allocas;
+ for (Instruction &I : EntryBB) {
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(&I))
+ Allocas.push_back(AI);
+ }
+
+ for (AllocaInst *AI : Allocas) {
+ if (handleAlloca(*AI))
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+bool AMDGPUPromoteAllocaToVector::handleAlloca(AllocaInst &I) {
+ // Array allocations are probably not worth handling, since an allocation of
+ // the array type is the canonical form.
+ if (!I.isStaticAlloca() || I.isArrayAllocation())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "Trying to promote " << I << '\n');
+
+ Module *Mod = I.getParent()->getParent()->getParent();
+ return tryPromoteAllocaToVector(&I, Mod->getDataLayout(), MaxVGPRs);
+}
+
FunctionPass *llvm::createAMDGPUPromoteAlloca() {
return new AMDGPUPromoteAlloca();
}
+
+FunctionPass *llvm::createAMDGPUPromoteAllocaToVector() {
+ return new AMDGPUPromoteAllocaToVector();
+}
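
A worked example of the new VGPR-budget gate in tryPromoteAllocaToVector above (standalone C++; MaxVGPRs and the alloca type are illustrative): with 256 VGPRs of 32 bits each the budget is 8192 bits, and an alloca may occupy at most a quarter of it.

#include <cassert>
#include <cstdint>

int main() {
  unsigned MaxVGPRs = 256;       // 32-bit registers available to the kernel
  unsigned OverrideBytes = 0;    // -amdgpu-promote-alloca-to-vector-limit unset
  unsigned Limit = OverrideBytes ? OverrideBytes * 8 : MaxVGPRs * 32; // bits

  uint64_t AllocaBits = 64 * 32; // e.g. alloca [64 x i32] = 2048 bits
  bool Rejected = AllocaBits * 4 > Limit; // reject anything above 1/4 budget

  // 2048 * 4 == 8192 == Limit, so an alloca of exactly a quarter of the
  // budget is still accepted; [65 x i32] would be rejected.
  assert(!Rejected);
  return 0;
}
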
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
index 7a7addd0f5cf..982aae374884 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPropagateAttributes.cpp
@@ -48,19 +48,62 @@ extern const SubtargetFeatureKV AMDGPUFeatureKV[AMDGPU::NumSubtargetFeatures-1];
namespace {
+// Target features to propagate.
+static constexpr const FeatureBitset TargetFeatures = {
+ AMDGPU::FeatureWavefrontSize16,
+ AMDGPU::FeatureWavefrontSize32,
+ AMDGPU::FeatureWavefrontSize64
+};
+
+// Attributes to propagate.
+static constexpr const char* AttributeNames[] = {
+ "amdgpu-waves-per-eu"
+};
+
+static constexpr unsigned NumAttr =
+ sizeof(AttributeNames) / sizeof(AttributeNames[0]);
+
class AMDGPUPropagateAttributes {
- const FeatureBitset TargetFeatures = {
- AMDGPU::FeatureWavefrontSize16,
- AMDGPU::FeatureWavefrontSize32,
- AMDGPU::FeatureWavefrontSize64
+
+ class FnProperties {
+ private:
+ explicit FnProperties(const FeatureBitset &&FB) : Features(FB) {}
+
+ public:
+ explicit FnProperties(const TargetMachine &TM, const Function &F) {
+ Features = TM.getSubtargetImpl(F)->getFeatureBits();
+
+ for (unsigned I = 0; I < NumAttr; ++I)
+ if (F.hasFnAttribute(AttributeNames[I]))
+ Attributes[I] = F.getFnAttribute(AttributeNames[I]);
+ }
+
+ bool operator == (const FnProperties &Other) const {
+ if ((Features & TargetFeatures) != (Other.Features & TargetFeatures))
+ return false;
+ for (unsigned I = 0; I < NumAttr; ++I)
+ if (Attributes[I] != Other.Attributes[I])
+ return false;
+ return true;
+ }
+
+ FnProperties adjustToCaller(const FnProperties &CallerProps) const {
+ FnProperties New((Features & ~TargetFeatures) | CallerProps.Features);
+ for (unsigned I = 0; I < NumAttr; ++I)
+ New.Attributes[I] = CallerProps.Attributes[I];
+ return New;
+ }
+
+ FeatureBitset Features;
+ Optional<Attribute> Attributes[NumAttr];
};
- class Clone{
+ class Clone {
public:
- Clone(FeatureBitset FeatureMask, Function *OrigF, Function *NewF) :
- FeatureMask(FeatureMask), OrigF(OrigF), NewF(NewF) {}
+ Clone(const FnProperties &Props, Function *OrigF, Function *NewF) :
+ Properties(Props), OrigF(OrigF), NewF(NewF) {}
- FeatureBitset FeatureMask;
+ FnProperties Properties;
Function *OrigF;
Function *NewF;
};
@@ -77,17 +120,19 @@ class AMDGPUPropagateAttributes {
SmallVector<Clone, 32> Clones;
// Find a clone with required features.
- Function *findFunction(const FeatureBitset &FeaturesNeeded,
+ Function *findFunction(const FnProperties &PropsNeeded,
Function *OrigF);
- // Clone function F and set NewFeatures on the clone.
+ // Clone function \p F and set \p NewProps on the clone.
  // Clone takes the name of the original function.
- Function *cloneWithFeatures(Function &F,
- const FeatureBitset &NewFeatures);
+ Function *cloneWithProperties(Function &F, const FnProperties &NewProps);
// Set new function's features in place.
void setFeatures(Function &F, const FeatureBitset &NewFeatures);
+ // Set new function's attributes in place.
+ void setAttributes(Function &F, const ArrayRef<Optional<Attribute>> NewAttrs);
+
std::string getFeatureString(const FeatureBitset &Features) const;
// Propagate attributes from Roots.
@@ -155,11 +200,11 @@ INITIALIZE_PASS(AMDGPUPropagateAttributesLate,
false, false)
Function *
-AMDGPUPropagateAttributes::findFunction(const FeatureBitset &FeaturesNeeded,
+AMDGPUPropagateAttributes::findFunction(const FnProperties &PropsNeeded,
Function *OrigF) {
// TODO: search for clone's clones.
for (Clone &C : Clones)
- if (C.OrigF == OrigF && FeaturesNeeded == C.FeatureMask)
+ if (C.OrigF == OrigF && PropsNeeded == C.Properties)
return C.NewF;
return nullptr;
@@ -192,12 +237,12 @@ bool AMDGPUPropagateAttributes::process() {
NewRoots.clear();
for (auto &F : M.functions()) {
- if (F.isDeclaration() || Roots.count(&F) || Roots.count(&F))
+ if (F.isDeclaration())
continue;
- const FeatureBitset &CalleeBits =
- TM->getSubtargetImpl(F)->getFeatureBits();
+ const FnProperties CalleeProps(*TM, F);
SmallVector<std::pair<CallBase *, Function *>, 32> ToReplace;
+ SmallSet<CallBase *, 32> Visited;
for (User *U : F.users()) {
Instruction *I = dyn_cast<Instruction>(U);
@@ -207,36 +252,36 @@ bool AMDGPUPropagateAttributes::process() {
if (!CI)
continue;
Function *Caller = CI->getCaller();
- if (!Caller)
+ if (!Caller || !Visited.insert(CI).second)
continue;
- if (!Roots.count(Caller))
+ if (!Roots.count(Caller) && !NewRoots.count(Caller))
continue;
- const FeatureBitset &CallerBits =
- TM->getSubtargetImpl(*Caller)->getFeatureBits() & TargetFeatures;
+ const FnProperties CallerProps(*TM, *Caller);
- if (CallerBits == (CalleeBits & TargetFeatures)) {
- NewRoots.insert(&F);
+ if (CalleeProps == CallerProps) {
+ if (!Roots.count(&F))
+ NewRoots.insert(&F);
continue;
}
- Function *NewF = findFunction(CallerBits, &F);
+ Function *NewF = findFunction(CallerProps, &F);
if (!NewF) {
- FeatureBitset NewFeatures((CalleeBits & ~TargetFeatures) |
- CallerBits);
+ const FnProperties NewProps = CalleeProps.adjustToCaller(CallerProps);
if (!AllowClone) {
          // This may set different features on different iterations if
// there is a contradiction in callers' attributes. In this case
// we rely on a second pass running on Module, which is allowed
// to clone.
- setFeatures(F, NewFeatures);
+ setFeatures(F, NewProps.Features);
+ setAttributes(F, NewProps.Attributes);
NewRoots.insert(&F);
Changed = true;
break;
}
- NewF = cloneWithFeatures(F, NewFeatures);
- Clones.push_back(Clone(CallerBits, &F, NewF));
+ NewF = cloneWithProperties(F, NewProps);
+ Clones.push_back(Clone(CallerProps, &F, NewF));
NewRoots.insert(NewF);
}
@@ -258,28 +303,30 @@ bool AMDGPUPropagateAttributes::process() {
F->eraseFromParent();
}
+ Roots.clear();
+ Clones.clear();
+
return Changed;
}
Function *
-AMDGPUPropagateAttributes::cloneWithFeatures(Function &F,
- const FeatureBitset &NewFeatures) {
+AMDGPUPropagateAttributes::cloneWithProperties(Function &F,
+ const FnProperties &NewProps) {
LLVM_DEBUG(dbgs() << "Cloning " << F.getName() << '\n');
ValueToValueMapTy dummy;
Function *NewF = CloneFunction(&F, dummy);
- setFeatures(*NewF, NewFeatures);
+ setFeatures(*NewF, NewProps.Features);
+ setAttributes(*NewF, NewProps.Attributes);
+ NewF->setVisibility(GlobalValue::DefaultVisibility);
+ NewF->setLinkage(GlobalValue::InternalLinkage);
// Swap names. If that is the only clone it will retain the name of now
- // dead value.
- if (F.hasName()) {
- std::string NewName = NewF->getName();
+ // dead value. Preserve original name for externally visible functions.
+ if (F.hasName() && F.hasLocalLinkage()) {
+ std::string NewName = std::string(NewF->getName());
NewF->takeName(&F);
F.setName(NewName);
-
- // Name has changed, it does not need an external symbol.
- F.setVisibility(GlobalValue::DefaultVisibility);
- F.setLinkage(GlobalValue::InternalLinkage);
}
return NewF;
@@ -297,6 +344,18 @@ void AMDGPUPropagateAttributes::setFeatures(Function &F,
F.addFnAttr("target-features", NewFeatureStr);
}
+void AMDGPUPropagateAttributes::setAttributes(Function &F,
+ const ArrayRef<Optional<Attribute>> NewAttrs) {
+ LLVM_DEBUG(dbgs() << "Set attributes on " << F.getName() << ":\n");
+ for (unsigned I = 0; I < NumAttr; ++I) {
+ F.removeFnAttr(AttributeNames[I]);
+ if (NewAttrs[I]) {
+ LLVM_DEBUG(dbgs() << '\t' << NewAttrs[I]->getAsString() << '\n');
+ F.addFnAttr(*NewAttrs[I]);
+ }
+ }
+}
+
std::string
AMDGPUPropagateAttributes::getFeatureString(const FeatureBitset &Features) const
{
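
For readers skimming the FnProperties change above, the propagation rule it encodes is small: a callee keeps its non-propagated features, ORs in the caller's features, and simply takes the caller's values for the tracked attributes. An illustrative scalar model (plain integers standing in for FeatureBitset; not code from the patch):

#include <cstdint>
#include <optional>
#include <string>

struct Props {
  uint64_t Features = 0;                 // stand-in for FeatureBitset
  std::optional<std::string> WavesPerEU; // stand-in for "amdgpu-waves-per-eu"
};

// Mirrors FnProperties::adjustToCaller(): clear the propagated bits on the
// callee, OR in the caller's features, and copy the caller's attribute values.
static Props adjustToCaller(const Props &Callee, const Props &Caller,
                            uint64_t PropagatedMask) {
  Props New;
  New.Features = (Callee.Features & ~PropagatedMask) | Caller.Features;
  New.WavesPerEU = Caller.WavesPerEU;
  return New;
}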
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
new file mode 100644
index 000000000000..71d82679b3ff
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp
@@ -0,0 +1,154 @@
+//=== lib/Target/AMDGPU/AMDGPURegBankCombiner.cpp ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass does combining of machine instructions at the generic MI level,
+// after register banks are known.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUTargetMachine.h"
+#include "AMDGPULegalizerInfo.h"
+#include "llvm/CodeGen/GlobalISel/Combiner.h"
+#include "llvm/CodeGen/GlobalISel/CombinerHelper.h"
+#include "llvm/CodeGen/GlobalISel/CombinerInfo.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
+#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/Support/Debug.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+
+#define DEBUG_TYPE "amdgpu-regbank-combiner"
+
+using namespace llvm;
+using namespace MIPatternMatch;
+
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_DEPS
+
+namespace {
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_H
+
+class AMDGPURegBankCombinerInfo : public CombinerInfo {
+ GISelKnownBits *KB;
+ MachineDominatorTree *MDT;
+
+public:
+ AMDGPUGenRegBankCombinerHelperRuleConfig GeneratedRuleCfg;
+
+ AMDGPURegBankCombinerInfo(bool EnableOpt, bool OptSize, bool MinSize,
+ const AMDGPULegalizerInfo *LI,
+ GISelKnownBits *KB, MachineDominatorTree *MDT)
+ : CombinerInfo(/*AllowIllegalOps*/ false, /*ShouldLegalizeIllegal*/ true,
+ /*LegalizerInfo*/ LI, EnableOpt, OptSize, MinSize),
+ KB(KB), MDT(MDT) {
+ if (!GeneratedRuleCfg.parseCommandLineOption())
+ report_fatal_error("Invalid rule identifier");
+ }
+
+ bool combine(GISelChangeObserver &Observer, MachineInstr &MI,
+ MachineIRBuilder &B) const override;
+};
+
+bool AMDGPURegBankCombinerInfo::combine(GISelChangeObserver &Observer,
+ MachineInstr &MI,
+ MachineIRBuilder &B) const {
+ CombinerHelper Helper(Observer, B, KB, MDT);
+ AMDGPUGenRegBankCombinerHelper Generated(GeneratedRuleCfg);
+
+ if (Generated.tryCombineAll(Observer, MI, B, Helper))
+ return true;
+
+ return false;
+}
+
+#define AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
+#include "AMDGPUGenRegBankGICombiner.inc"
+#undef AMDGPUREGBANKCOMBINERHELPER_GENCOMBINERHELPER_CPP
+
+// Pass boilerplate
+// ================
+
+class AMDGPURegBankCombiner : public MachineFunctionPass {
+public:
+ static char ID;
+
+ AMDGPURegBankCombiner(bool IsOptNone = false);
+
+ StringRef getPassName() const override {
+ return "AMDGPURegBankCombiner";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+private:
+ bool IsOptNone;
+};
+} // end anonymous namespace
+
+void AMDGPURegBankCombiner::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addRequired<TargetPassConfig>();
+ AU.setPreservesCFG();
+ getSelectionDAGFallbackAnalysisUsage(AU);
+ AU.addRequired<GISelKnownBitsAnalysis>();
+ AU.addPreserved<GISelKnownBitsAnalysis>();
+ if (!IsOptNone) {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
+ }
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+AMDGPURegBankCombiner::AMDGPURegBankCombiner(bool IsOptNone)
+ : MachineFunctionPass(ID), IsOptNone(IsOptNone) {
+ initializeAMDGPURegBankCombinerPass(*PassRegistry::getPassRegistry());
+}
+
+bool AMDGPURegBankCombiner::runOnMachineFunction(MachineFunction &MF) {
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::FailedISel))
+ return false;
+ auto *TPC = &getAnalysis<TargetPassConfig>();
+ const Function &F = MF.getFunction();
+ bool EnableOpt =
+ MF.getTarget().getOptLevel() != CodeGenOpt::None && !skipFunction(F);
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const AMDGPULegalizerInfo *LI
+ = static_cast<const AMDGPULegalizerInfo *>(ST.getLegalizerInfo());
+
+ GISelKnownBits *KB = &getAnalysis<GISelKnownBitsAnalysis>().get(MF);
+ MachineDominatorTree *MDT =
+ IsOptNone ? nullptr : &getAnalysis<MachineDominatorTree>();
+ AMDGPURegBankCombinerInfo PCInfo(EnableOpt, F.hasOptSize(),
+ F.hasMinSize(), LI, KB, MDT);
+ Combiner C(PCInfo, TPC);
+ return C.combineMachineInstrs(MF, /*CSEInfo*/ nullptr);
+}
+
+char AMDGPURegBankCombiner::ID = 0;
+INITIALIZE_PASS_BEGIN(AMDGPURegBankCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after regbankselect",
+ false, false)
+INITIALIZE_PASS_DEPENDENCY(TargetPassConfig)
+INITIALIZE_PASS_DEPENDENCY(GISelKnownBitsAnalysis)
+INITIALIZE_PASS_END(AMDGPURegBankCombiner, DEBUG_TYPE,
+ "Combine AMDGPU machine instrs after regbankselect", false,
+ false)
+
+namespace llvm {
+FunctionPass *createAMDGPURegBankCombiner(bool IsOptNone) {
+ return new AMDGPURegBankCombiner(IsOptNone);
+}
+} // end namespace llvm
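
How the new combiner gets scheduled is not visible in this file; a GlobalISel combiner like this is typically added from the target's TargetPassConfig once register banks are assigned. The following is only a hypothetical sketch of such wiring (MyGCNPassConfig is illustrative, includes omitted; the actual AMDGPU hook lives elsewhere in the backend):

// Hypothetical pass-config wiring; not part of this patch.
bool MyGCNPassConfig::addRegBankSelect() {
  addPass(new RegBankSelect());
  // Run the generated post-regbank combines; treat -O0 as "opt none".
  addPass(createAMDGPURegBankCombiner(getOptLevel() == CodeGenOpt::None));
  return false;
}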
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
index 1bb01dc8fa11..dfaf97bfb08e 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.cpp
@@ -8,10 +8,69 @@
/// \file
/// This file implements the targeting of the RegisterBankInfo class for
/// AMDGPU.
-/// \todo This should be generated by TableGen.
+///
+/// \par
+///
+/// AMDGPU has unique register bank constraints that require special high level
+/// strategies to deal with. There are two main true physical register banks
+/// VGPR (vector), and SGPR (scalar). Additionally the VCC register bank is a
+/// sort of pseudo-register bank needed to represent SGPRs used in a vector
+/// boolean context. There is also the AGPR bank, which is a special purpose
+/// physical register bank present on some subtargets.
+///
+/// Copying from VGPR to SGPR is generally illegal, unless the value is known to
+/// be uniform. It is generally not valid to legalize operands by inserting
+/// copies as on other targets. Operations which require uniform, SGPR operands
+/// generally require scalarization by repeatedly executing the instruction,
+/// activating each set of lanes using a unique set of input values. This is
+/// referred to as a waterfall loop.
+///
+/// \par Booleans
+///
+/// Booleans (s1 values) require special consideration. A vector compare result
+/// is naturally a bitmask with one bit per lane, in a 32 or 64-bit
+/// register. These are represented with the VCC bank. During selection, we need
+/// to be able to unambiguously go back from a register class to a register
+/// bank. To distinguish whether an SGPR should use the SGPR or VCC register
+/// bank, we need to know the use context type. An SGPR s1 value always means a
+/// VCC bank value, otherwise it will be the SGPR bank. A scalar compare sets
+/// SCC, which is a 1-bit unaddressable register. This will need to be copied to
+/// a 32-bit virtual register. Taken together, this means we need to adjust the
+/// type of boolean operations to be regbank legal. All SALU booleans need to be
+/// widened to 32-bits, and all VALU booleans need to be s1 values.
+///
+/// A noteworthy exception to the s1-means-vcc rule is for legalization artifact
+/// casts. G_TRUNC s1 results, and G_SEXT/G_ZEXT/G_ANYEXT sources are never vcc
+/// bank. A non-boolean source (such as a truncate from a 1-bit load from
+/// memory) will require a copy to the VCC bank which will require clearing the
+/// high bits and inserting a compare.
+///
+/// \par Constant bus restriction
+///
+/// VALU instructions have a limitation known as the constant bus
+/// restriction. Most VALU instructions can use SGPR operands, but may read at
+/// most 1 SGPR or constant literal value (this is relaxed to 2 in gfx10 for most
+/// instructions). This is one unique SGPR, so the same SGPR may be used for
+/// multiple operands. From a register bank perspective, any combination of
+/// operands should be legal as an SGPR, but this is contextually dependent on
+/// the SGPR operands all being the same register. It is therefore optimal to
+/// choose the SGPR with the most uses to minimize the number of copies.
+///
+/// We avoid trying to solve this problem in RegBankSelect. Any VALU G_*
+/// operation should have its source operands all mapped to VGPRs (except for
+/// VCC), inserting copies from any SGPR operands. This is the most trivial legal
+/// mapping. Anything beyond the simplest 1:1 instruction selection would be too
+/// complicated to solve here. Every optimization pattern or instruction
+/// selected to multiple outputs would have to enforce this rule, and there
+/// would be additional complexity in tracking this rule for every G_*
+/// operation. By forcing all inputs to VGPRs, it also simplifies the task of
+/// picking the optimal operand combination from a post-isel optimization pass.
+///
//===----------------------------------------------------------------------===//
#include "AMDGPURegisterBankInfo.h"
+
+#include "AMDGPUGlobalISelUtils.h"
#include "AMDGPUInstrInfo.h"
#include "AMDGPUSubtarget.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -19,8 +78,8 @@
#include "SIRegisterInfo.h"
#include "llvm/CodeGen/GlobalISel/LegalizationArtifactCombiner.h"
#include "llvm/CodeGen/GlobalISel/LegalizerHelper.h"
-#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/MIPatternMatch.h"
+#include "llvm/CodeGen/GlobalISel/MachineIRBuilder.h"
#include "llvm/CodeGen/GlobalISel/RegisterBank.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
@@ -101,8 +160,9 @@ public:
if (!Op.isReg())
continue;
+ // We may see physical registers if building a real MI
Register Reg = Op.getReg();
- if (MRI.getRegClassOrRegBank(Reg))
+ if (Reg.isPhysical() || MRI.getRegClassOrRegBank(Reg))
continue;
const RegisterBank *RB = NewBank;
@@ -138,15 +198,16 @@ AMDGPURegisterBankInfo::AMDGPURegisterBankInfo(const GCNSubtarget &ST)
TII(Subtarget.getInstrInfo()) {
// HACK: Until this is fully tablegen'd.
- static bool AlreadyInit = false;
- if (AlreadyInit)
- return;
+ static llvm::once_flag InitializeRegisterBankFlag;
- AlreadyInit = true;
+ static auto InitializeRegisterBankOnce = [this]() {
+ assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
+ &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
+ &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
+ (void)this;
+ };
- assert(&getRegBank(AMDGPU::SGPRRegBankID) == &AMDGPU::SGPRRegBank &&
- &getRegBank(AMDGPU::VGPRRegBankID) == &AMDGPU::VGPRRegBank &&
- &getRegBank(AMDGPU::AGPRRegBankID) == &AMDGPU::AGPRRegBank);
+ llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
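
The change above replaces a racy "static bool AlreadyInit" guard with llvm::call_once, which (like std::call_once) guarantees the initializer runs exactly once even when several AMDGPURegisterBankInfo objects are constructed concurrently. A minimal standard-library equivalent of the pattern, for reference:

#include <mutex>

static void verifyRegisterBanksOnce() {
  static std::once_flag Flag;
  std::call_once(Flag, [] {
    // One-time verification or initialization; later calls are no-ops.
  });
}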
static bool isVectorRegisterBank(const RegisterBank &Bank) {
@@ -159,7 +220,7 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
unsigned Size) const {
// TODO: Should there be a UniformVGPRRegBank which can use readfirstlane?
if (Dst.getID() == AMDGPU::SGPRRegBankID &&
- isVectorRegisterBank(Src)) {
+ (isVectorRegisterBank(Src) || Src.getID() == AMDGPU::VCCRegBankID)) {
return std::numeric_limits<unsigned>::max();
}
@@ -177,9 +238,6 @@ unsigned AMDGPURegisterBankInfo::copyCost(const RegisterBank &Dst,
Src.getID() == AMDGPU::VCCRegBankID))
return std::numeric_limits<unsigned>::max();
- if (Src.getID() == AMDGPU::VCCRegBankID)
- return std::numeric_limits<unsigned>::max();
-
// There is no direct copy between AGPRs.
if (Dst.getID() == AMDGPU::AGPRRegBankID &&
Src.getID() == AMDGPU::AGPRRegBankID)
@@ -317,22 +375,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappingsIntrinsicWSideEffects(
const MachineInstr &MI, const MachineRegisterInfo &MRI) const {
switch (MI.getIntrinsicID()) {
- case Intrinsic::amdgcn_buffer_load: {
- static const OpRegBankEntry<3> Table[4] = {
- // Perfectly legal.
- { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
- { { AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
-
- // Waterfall loop needed for rsrc. In the worst case this will execute
- // approximately an extra 10 * wavesize + 2 instructions.
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1000 },
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1000 }
- };
-
- // rsrc, voffset, offset
- const std::array<unsigned, 3> RegSrcOpIdx = { { 2, 3, 4 } };
- return addMappingFromTable<3>(MI, MRI, RegSrcOpIdx, makeArrayRef(Table));
- }
case Intrinsic::amdgcn_s_buffer_load: {
static const OpRegBankEntry<2> Table[4] = {
// Perfectly legal.
@@ -402,15 +444,15 @@ static bool isScalarLoadLegal(const MachineInstr &MI) {
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT;
// There are no extending SMRD/SMEM loads, and they require 4-byte alignment.
- return MMO->getSize() >= 4 && MMO->getAlignment() >= 4 &&
- // Can't do a scalar atomic load.
- !MMO->isAtomic() &&
- // Don't use scalar loads for volatile accesses to non-constant address
- // spaces.
- (IsConst || !MMO->isVolatile()) &&
- // Memory must be known constant, or not written before this load.
- (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
- AMDGPUInstrInfo::isUniformMMO(MMO);
+ return MMO->getSize() >= 4 && MMO->getAlign() >= Align(4) &&
+ // Can't do a scalar atomic load.
+ !MMO->isAtomic() &&
+ // Don't use scalar loads for volatile accesses to non-constant address
+ // spaces.
+ (IsConst || !MMO->isVolatile()) &&
+ // Memory must be known constant, or not written before this load.
+ (IsConst || MMO->isInvariant() || memOpHasNoClobbered(MMO)) &&
+ AMDGPUInstrInfo::isUniformMMO(MMO);
}
RegisterBankInfo::InstructionMappings
@@ -490,24 +532,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
3); // Num Operands
AltMappings.push_back(&VVMapping);
-
- const InstructionMapping &SVMapping = getInstructionMapping(
- 3, 3, getOperandsMapping(
- {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size)}),
- 3); // Num Operands
- AltMappings.push_back(&SVMapping);
-
- // SGPR in LHS is slightly preferrable, so make it VS more expensive than
- // SV.
- const InstructionMapping &VSMapping = getInstructionMapping(
- 3, 4, getOperandsMapping(
- {AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMappingSGPR64Only(AMDGPU::SGPRRegBankID, Size)}),
- 3); // Num Operands
- AltMappings.push_back(&VSMapping);
break;
}
case TargetOpcode::G_LOAD:
@@ -517,7 +541,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
LLT PtrTy = MRI.getType(MI.getOperand(1).getReg());
unsigned PtrSize = PtrTy.getSizeInBits();
unsigned AS = PtrTy.getAddressSpace();
- LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
if ((AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
AS != AMDGPUAS::PRIVATE_ADDRESS) &&
@@ -531,9 +554,10 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
}
const InstructionMapping &VVMapping = getInstructionMapping(
- 2, 1, getOperandsMapping(
- {AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
+ 2, 1,
+ getOperandsMapping(
+ {AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
+ AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize)}),
2); // Num Operands
AltMappings.push_back(&VVMapping);
@@ -546,43 +570,6 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
return AltMappings;
}
- case TargetOpcode::G_ICMP: {
- // TODO: Should report 32-bit for scalar output type.
- unsigned Size = getSizeInBits(MI.getOperand(2).getReg(), MRI, *TRI);
- const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&SSMapping);
-
- const InstructionMapping &SVMapping = getInstructionMapping(2, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&SVMapping);
-
- const InstructionMapping &VSMapping = getInstructionMapping(3, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&VSMapping);
-
- const InstructionMapping &VVMapping = getInstructionMapping(4, 1,
- getOperandsMapping({AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1),
- nullptr, // Predicate operand.
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size),
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size)}),
- 4); // Num Operands
- AltMappings.push_back(&VVMapping);
-
- return AltMappings;
- }
case TargetOpcode::G_SELECT: {
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
const InstructionMapping &SSMapping = getInstructionMapping(1, 1,
@@ -607,10 +594,8 @@ AMDGPURegisterBankInfo::getInstrAlternativeMappings(
case TargetOpcode::G_SMAX:
case TargetOpcode::G_UMIN:
case TargetOpcode::G_UMAX: {
- static const OpRegBankEntry<3> Table[4] = {
+ static const OpRegBankEntry<3> Table[2] = {
{ { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
- { { AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID, AMDGPU::VGPRRegBankID }, 1 },
- { { AMDGPU::VGPRRegBankID, AMDGPU::VGPRRegBankID, AMDGPU::SGPRRegBankID }, 1 },
// Scalar requires cmp+select, and extends if 16-bit.
// FIXME: Should there be separate costs for 32 and 16-bit
@@ -740,6 +725,10 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
SmallVector<Register, 4> InitResultRegs;
SmallVector<Register, 4> PhiRegs;
+ // Track use registers which have already been expanded with a readfirstlane
+  // sequence. A register may have multiple uses when a whole sequence of
+  // instructions is being moved.
+ DenseMap<Register, Register> WaterfalledRegMap;
+
MachineBasicBlock &MBB = B.getMBB();
MachineFunction *MF = &B.getMF();
@@ -755,6 +744,10 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const unsigned ExecReg = Subtarget.isWave32() ?
AMDGPU::EXEC_LO : AMDGPU::EXEC;
+#ifndef NDEBUG
+ const int OrigRangeSize = std::distance(Range.begin(), Range.end());
+#endif
+
for (MachineInstr &MI : Range) {
for (MachineOperand &Def : MI.defs()) {
LLT ResTy = MRI.getType(Def.getReg());
@@ -820,13 +813,14 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
const DebugLoc &DL = B.getDL();
- // Figure out the iterator range after splicing the instructions.
- auto NewBegin = std::prev(LoopBB->end());
+ MachineInstr &FirstInst = *Range.begin();
// Move the instruction into the loop. Note we moved everything after
// Range.end() already into a new block, so Range.end() is no longer valid.
LoopBB->splice(LoopBB->end(), &MBB, Range.begin(), MBB.end());
+ // Figure out the iterator range after splicing the instructions.
+ MachineBasicBlock::iterator NewBegin = FirstInst.getIterator();
auto NewEnd = LoopBB->end();
MachineBasicBlock::iterator I = Range.begin();
@@ -834,39 +828,145 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
Register CondReg;
+ assert(std::distance(NewBegin, NewEnd) == OrigRangeSize);
+
for (MachineInstr &MI : make_range(NewBegin, NewEnd)) {
for (MachineOperand &Op : MI.uses()) {
if (!Op.isReg() || Op.isDef())
continue;
- if (SGPROperandRegs.count(Op.getReg())) {
- LLT OpTy = MRI.getType(Op.getReg());
- unsigned OpSize = OpTy.getSizeInBits();
+ Register OldReg = Op.getReg();
+ if (!SGPROperandRegs.count(OldReg))
+ continue;
+
+ // See if we already processed this register in another instruction in the
+ // sequence.
+ auto OldVal = WaterfalledRegMap.find(OldReg);
+ if (OldVal != WaterfalledRegMap.end()) {
+ Op.setReg(OldVal->second);
+ continue;
+ }
+
+ LLT OpTy = MRI.getType(Op.getReg());
+ unsigned OpSize = OpTy.getSizeInBits();
+
+ // Can only do a readlane of 32-bit pieces.
+ if (OpSize == 32) {
+ // Avoid extra copies in the simple case of one 32-bit register.
+ Register CurrentLaneOpReg
+ = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
+ MRI.setType(CurrentLaneOpReg, OpTy);
+
+ constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(Op.getReg());
+
+ Register NewCondReg = MRI.createVirtualRegister(WaveRC);
+ bool First = CondReg == AMDGPU::NoRegister;
+ if (First)
+ CondReg = NewCondReg;
+
+ // Compare the just read M0 value to all possible Idx values.
+ B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ .addDef(NewCondReg)
+ .addReg(CurrentLaneOpReg)
+ .addReg(Op.getReg());
+ Op.setReg(CurrentLaneOpReg);
+
+ if (!First) {
+ Register AndReg = MRI.createVirtualRegister(WaveRC);
+
+          // If there are multiple operands to consider, AND the conditions
+          // together.
+ B.buildInstr(WaveAndOpc)
+ .addDef(AndReg)
+ .addReg(NewCondReg)
+ .addReg(CondReg);
+ CondReg = AndReg;
+ }
+ } else {
+ LLT S32 = LLT::scalar(32);
+ SmallVector<Register, 8> ReadlanePieces;
+
+ // The compares can be done as 64-bit, but the extract needs to be done
+ // in 32-bit pieces.
+
+ bool Is64 = OpSize % 64 == 0;
+
+ LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
+ unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
+ : AMDGPU::V_CMP_EQ_U32_e64;
+
+ // Insert the unmerge before the loop.
+
+ B.setMBB(MBB);
+ auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
+ B.setInstr(*I);
+
+ unsigned NumPieces = Unmerge->getNumOperands() - 1;
+ for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
+ Register UnmergePiece = Unmerge.getReg(PieceIdx);
+
+ Register CurrentLaneOpReg;
+ if (Is64) {
+ Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
+ Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
+
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
+ MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegLo)
+ .addReg(UnmergePiece, 0, AMDGPU::sub0);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpRegHi)
+ .addReg(UnmergePiece, 0, AMDGPU::sub1);
+
+ CurrentLaneOpReg =
+ B.buildMerge(LLT::scalar(64),
+ {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
+ .getReg(0);
- // Can only do a readlane of 32-bit pieces.
- if (OpSize == 32) {
- // Avoid extra copies in the simple case of one 32-bit register.
- Register CurrentLaneOpReg
- = MRI.createVirtualRegister(&AMDGPU::SReg_32_XM0RegClass);
- MRI.setType(CurrentLaneOpReg, OpTy);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
- constrainGenericRegister(Op.getReg(), AMDGPU::VGPR_32RegClass, MRI);
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(Op.getReg());
+ if (OpTy.getScalarSizeInBits() == 64) {
+            // If we need to produce a 64-bit element vector, use the
+            // merged pieces.
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ } else {
+ // 32-bit element type.
+ ReadlanePieces.push_back(CurrentLaneOpRegLo);
+ ReadlanePieces.push_back(CurrentLaneOpRegHi);
+ }
+ } else {
+ CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
+ MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
+ MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
+
+ // Read the next variant <- also loop target.
+ BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
+ CurrentLaneOpReg)
+ .addReg(UnmergePiece);
+ ReadlanePieces.push_back(CurrentLaneOpReg);
+ }
Register NewCondReg = MRI.createVirtualRegister(WaveRC);
bool First = CondReg == AMDGPU::NoRegister;
if (First)
CondReg = NewCondReg;
- // Compare the just read M0 value to all possible Idx values.
- B.buildInstr(AMDGPU::V_CMP_EQ_U32_e64)
+ B.buildInstr(CmpOp)
.addDef(NewCondReg)
.addReg(CurrentLaneOpReg)
- .addReg(Op.getReg());
- Op.setReg(CurrentLaneOpReg);
+ .addReg(UnmergePiece);
if (!First) {
Register AndReg = MRI.createVirtualRegister(WaveRC);
@@ -878,114 +978,23 @@ bool AMDGPURegisterBankInfo::executeInWaterfallLoop(
.addReg(CondReg);
CondReg = AndReg;
}
- } else {
- LLT S32 = LLT::scalar(32);
- SmallVector<Register, 8> ReadlanePieces;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
-
- bool Is64 = OpSize % 64 == 0;
-
- LLT UnmergeTy = OpSize % 64 == 0 ? LLT::scalar(64) : LLT::scalar(32);
- unsigned CmpOp = OpSize % 64 == 0 ? AMDGPU::V_CMP_EQ_U64_e64
- : AMDGPU::V_CMP_EQ_U32_e64;
-
- // The compares can be done as 64-bit, but the extract needs to be done
- // in 32-bit pieces.
-
- // Insert the unmerge before the loop.
-
- B.setMBB(MBB);
- auto Unmerge = B.buildUnmerge(UnmergeTy, Op.getReg());
- B.setInstr(*I);
-
- unsigned NumPieces = Unmerge->getNumOperands() - 1;
- for (unsigned PieceIdx = 0; PieceIdx != NumPieces; ++PieceIdx) {
- Register UnmergePiece = Unmerge.getReg(PieceIdx);
-
- Register CurrentLaneOpReg;
- if (Is64) {
- Register CurrentLaneOpRegLo = MRI.createGenericVirtualRegister(S32);
- Register CurrentLaneOpRegHi = MRI.createGenericVirtualRegister(S32);
-
- MRI.setRegClass(UnmergePiece, &AMDGPU::VReg_64RegClass);
- MRI.setRegClass(CurrentLaneOpRegLo, &AMDGPU::SReg_32_XM0RegClass);
- MRI.setRegClass(CurrentLaneOpRegHi, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegLo)
- .addReg(UnmergePiece, 0, AMDGPU::sub0);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpRegHi)
- .addReg(UnmergePiece, 0, AMDGPU::sub1);
-
- CurrentLaneOpReg =
- B.buildMerge(LLT::scalar(64),
- {CurrentLaneOpRegLo, CurrentLaneOpRegHi})
- .getReg(0);
-
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_64_XEXECRegClass);
-
- if (OpTy.getScalarSizeInBits() == 64) {
- // If we need to produce a 64-bit element vector, so use the
- // merged pieces
- ReadlanePieces.push_back(CurrentLaneOpReg);
- } else {
- // 32-bit element type.
- ReadlanePieces.push_back(CurrentLaneOpRegLo);
- ReadlanePieces.push_back(CurrentLaneOpRegHi);
- }
- } else {
- CurrentLaneOpReg = MRI.createGenericVirtualRegister(S32);
- MRI.setRegClass(UnmergePiece, &AMDGPU::VGPR_32RegClass);
- MRI.setRegClass(CurrentLaneOpReg, &AMDGPU::SReg_32_XM0RegClass);
-
- // Read the next variant <- also loop target.
- BuildMI(*LoopBB, I, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32),
- CurrentLaneOpReg)
- .addReg(UnmergePiece);
- ReadlanePieces.push_back(CurrentLaneOpReg);
- }
-
- Register NewCondReg = MRI.createVirtualRegister(WaveRC);
- bool First = CondReg == AMDGPU::NoRegister;
- if (First)
- CondReg = NewCondReg;
-
- B.buildInstr(CmpOp)
- .addDef(NewCondReg)
- .addReg(CurrentLaneOpReg)
- .addReg(UnmergePiece);
-
- if (!First) {
- Register AndReg = MRI.createVirtualRegister(WaveRC);
-
- // If there are multiple operands to consider, and the conditions.
- B.buildInstr(WaveAndOpc)
- .addDef(AndReg)
- .addReg(NewCondReg)
- .addReg(CondReg);
- CondReg = AndReg;
- }
- }
-
- // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
- // BUILD_VECTOR
- if (OpTy.isVector()) {
- auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- } else {
- auto Merge = B.buildMerge(OpTy, ReadlanePieces);
- Op.setReg(Merge.getReg(0));
- }
+ }
- MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
+ // FIXME: Build merge seems to switch to CONCAT_VECTORS but not
+ // BUILD_VECTOR
+ if (OpTy.isVector()) {
+ auto Merge = B.buildBuildVector(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
+ } else {
+ auto Merge = B.buildMerge(OpTy, ReadlanePieces);
+ Op.setReg(Merge.getReg(0));
}
+
+ MRI.setRegBank(Op.getReg(), AMDGPU::SGPRRegBank);
}
+
+ // Make sure we don't re-process this register again.
+ WaterfalledRegMap.insert(std::make_pair(OldReg, Op.getReg()));
}
}
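
Stepping back from the mechanics above: a waterfall loop reads the value held by the first active lane, disables every lane that holds that same value, runs the instruction with the now-uniform operand, and repeats until the exec mask is empty. A plain-C++ model of that control flow (illustrative only, no LLVM APIs):

#include <cstdint>
#include <vector>

// LaneValues models a divergent VGPR operand (one value per lane); Exec models
// the execution mask. processUniform stands in for the instruction that needs
// a uniform (SGPR) operand.
template <typename Fn>
void waterfall(const std::vector<uint32_t> &LaneValues, uint64_t Exec,
               Fn processUniform) {
  while (Exec) {
    unsigned FirstLane = __builtin_ctzll(Exec);   // v_readfirstlane
    uint32_t Current = LaneValues[FirstLane];
    uint64_t Match = 0;
    for (unsigned L = 0, E = (unsigned)LaneValues.size(); L != E; ++L)
      if (((Exec >> L) & 1) && LaneValues[L] == Current)  // v_cmp_eq + s_and
        Match |= 1ull << L;
    processUniform(Current, Match); // execute with exec temporarily = Match
    Exec &= ~Match;                 // retire those lanes; loop if any remain
  }
}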
@@ -1093,53 +1102,89 @@ void AMDGPURegisterBankInfo::constrainOpWithReadfirstlane(
MI.getOperand(OpIdx).setReg(SGPR);
}
-// When regbankselect repairs registers, it will insert a repair instruction
-// which defines the repaired register. Then it calls applyMapping and expects
-// that the targets will either delete or rewrite the originally wrote to the
-// repaired registers. Beccause of this, we end up in a situation where
-// we have 2 instructions defining the same registers.
-static MachineInstr *getOtherVRegDef(const MachineRegisterInfo &MRI,
- Register Reg,
- const MachineInstr &MI) {
- // Is there some way we can assert that there are exactly 2 def instructions?
- for (MachineInstr &Other : MRI.def_instructions(Reg)) {
- if (&Other != &MI)
- return &Other;
- }
-
- return nullptr;
+/// Split \p Ty into 2 pieces. The first will have \p FirstSize bits, and the
+/// rest will be in the remainder.
+static std::pair<LLT, LLT> splitUnequalType(LLT Ty, unsigned FirstSize) {
+ unsigned TotalSize = Ty.getSizeInBits();
+ if (!Ty.isVector())
+ return {LLT::scalar(FirstSize), LLT::scalar(TotalSize - FirstSize)};
+
+ LLT EltTy = Ty.getElementType();
+ unsigned EltSize = EltTy.getSizeInBits();
+ assert(FirstSize % EltSize == 0);
+
+ unsigned FirstPartNumElts = FirstSize / EltSize;
+ unsigned RemainderElts = (TotalSize - FirstSize) / EltSize;
+
+ return {LLT::scalarOrVector(FirstPartNumElts, EltTy),
+ LLT::scalarOrVector(RemainderElts, EltTy)};
+}
+
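
A quick sanity check of splitUnequalType() with concrete numbers: splitting a <3 x s32> (96 bits) at 64 bits yields a <2 x s32> first part and an s32 remainder. The element arithmetic, modelled with plain integers (illustrative, not patch code):

#include <cassert>
#include <utility>

// {elements in the first part, elements in the remainder} for a vector of
// NumElts elements of EltSize bits, split so the first part has FirstSize bits.
static std::pair<unsigned, unsigned>
splitElementCounts(unsigned NumElts, unsigned EltSize, unsigned FirstSize) {
  assert(FirstSize % EltSize == 0 && "split must fall on an element boundary");
  unsigned FirstElts = FirstSize / EltSize;
  return {FirstElts, NumElts - FirstElts};
}
// splitElementCounts(3, 32, 64) == {2, 1}:  <3 x s32> -> <2 x s32> + s32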
+static LLT widen96To128(LLT Ty) {
+ if (!Ty.isVector())
+ return LLT::scalar(128);
+
+ LLT EltTy = Ty.getElementType();
+ assert(128 % EltTy.getSizeInBits() == 0);
+ return LLT::vector(128 / EltTy.getSizeInBits(), EltTy);
}
-bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
+bool AMDGPURegisterBankInfo::applyMappingLoad(MachineInstr &MI,
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI) const {
Register DstReg = MI.getOperand(0).getReg();
- const LLT LoadTy = MRI.getType(DstReg);
+ const LLT LoadTy = MRI.getType(DstReg);
unsigned LoadSize = LoadTy.getSizeInBits();
const unsigned MaxNonSmrdLoadSize = 128;
+
+ const RegisterBank *PtrBank =
+ OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+ if (PtrBank == &AMDGPU::SGPRRegBank) {
+ // If the pointer is an SGPR, we ordinarily have nothing to do.
+ if (LoadSize != 96)
+ return false;
+
+ MachineMemOperand *MMO = *MI.memoperands_begin();
+ Register PtrReg = MI.getOperand(1).getReg();
+ // 96-bit loads are only available for vector loads. We need to split this
+    // into a 64-bit part and a 32-bit part (unless we can widen to 128 bits).
+
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping O(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+
+ if (MMO->getAlign() < Align(16)) {
+ LLT Part64, Part32;
+ std::tie(Part64, Part32) = splitUnequalType(LoadTy, 64);
+ auto Load0 = B.buildLoadFromOffset(Part64, PtrReg, *MMO, 0);
+ auto Load1 = B.buildLoadFromOffset(Part32, PtrReg, *MMO, 8);
+
+ auto Undef = B.buildUndef(LoadTy);
+ auto Ins0 = B.buildInsert(LoadTy, Undef, Load0, 0);
+ B.buildInsert(MI.getOperand(0), Ins0, Load1, 64);
+ } else {
+ LLT WiderTy = widen96To128(LoadTy);
+ auto WideLoad = B.buildLoadFromOffset(WiderTy, PtrReg, *MMO, 0);
+ B.buildExtract(MI.getOperand(0), WideLoad, 0);
+ }
+
+ MI.eraseFromParent();
+ return true;
+ }
+
// 128-bit loads are supported for all instruction types.
if (LoadSize <= MaxNonSmrdLoadSize)
return false;
- SmallVector<unsigned, 16> DefRegs(OpdMapper.getVRegs(0));
- SmallVector<unsigned, 1> SrcRegs(OpdMapper.getVRegs(1));
+ SmallVector<Register, 16> DefRegs(OpdMapper.getVRegs(0));
+ SmallVector<Register, 1> SrcRegs(OpdMapper.getVRegs(1));
- // If the pointer is an SGPR, we have nothing to do.
- if (SrcRegs.empty()) {
- const RegisterBank *PtrBank =
- OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
- if (PtrBank == &AMDGPU::SGPRRegBank)
- return false;
+ if (SrcRegs.empty())
SrcRegs.push_back(MI.getOperand(1).getReg());
- }
assert(LoadSize % MaxNonSmrdLoadSize == 0);
- // We want to get the repair instruction now, because it will help us
- // determine which instruction the legalizer inserts that will also
- // write to DstReg.
- MachineInstr *RepairInst = getOtherVRegDef(MRI, DstReg, MI);
-
// RegBankSelect only emits scalar types, so we need to reset the pointer
// operand to a pointer type.
Register BasePtrReg = SrcRegs[0];
@@ -1148,38 +1193,72 @@ bool AMDGPURegisterBankInfo::applyMappingWideLoad(MachineInstr &MI,
MachineIRBuilder B(MI);
- unsigned SplitElts =
- MaxNonSmrdLoadSize / LoadTy.getScalarType().getSizeInBits();
- const LLT LoadSplitTy = LLT::vector(SplitElts, LoadTy.getScalarType());
+ unsigned NumSplitParts = LoadTy.getSizeInBits() / MaxNonSmrdLoadSize;
+ const LLT LoadSplitTy = LoadTy.divide(NumSplitParts);
ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
GISelObserverWrapper Observer(&O);
B.setChangeObserver(Observer);
LegalizerHelper Helper(B.getMF(), Observer, B);
- if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+
+ if (LoadTy.isVector()) {
+ if (Helper.fewerElementsVector(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+ } else {
+ if (Helper.narrowScalar(MI, 0, LoadSplitTy) != LegalizerHelper::Legalized)
+ return false;
+ }
+
+ MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ return true;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingDynStackAlloc(
+ MachineInstr &MI,
+ const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const {
+ const MachineFunction &MF = *MI.getMF();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const auto &TFI = *ST.getFrameLowering();
+
+ // Guard in case the stack growth direction ever changes with scratch
+ // instructions.
+ if (TFI.getStackGrowthDirection() == TargetFrameLowering::StackGrowsDown)
return false;
- // At this point, the legalizer has split the original load into smaller
- // loads. At the end of lowering, it inserts an instruction (LegalizedInst)
- // that combines the outputs of the lower loads and writes it to DstReg.
- // The register bank selector has also added the RepairInst which writes to
- // DstReg as well.
+ Register Dst = MI.getOperand(0).getReg();
+ Register AllocSize = MI.getOperand(1).getReg();
+ Align Alignment = assumeAligned(MI.getOperand(2).getImm());
- MachineInstr *LegalizedInst = getOtherVRegDef(MRI, DstReg, *RepairInst);
+ const RegisterBank *SizeBank = getRegBank(AllocSize, MRI, *TRI);
- // Replace the output of the LegalizedInst with a temporary register, since
- // RepairInst already defines DstReg.
- Register TmpReg = MRI.createGenericVirtualRegister(MRI.getType(DstReg));
- LegalizedInst->getOperand(0).setReg(TmpReg);
- B.setInsertPt(*RepairInst->getParent(), RepairInst);
+ // TODO: Need to emit a wave reduction to get the maximum size.
+ if (SizeBank != &AMDGPU::SGPRRegBank)
+ return false;
- for (unsigned DefIdx = 0, e = DefRegs.size(); DefIdx != e; ++DefIdx) {
- Register IdxReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- B.buildConstant(IdxReg, DefIdx);
- MRI.setRegBank(IdxReg, AMDGPU::VGPRRegBank);
- B.buildExtractVectorElement(DefRegs[DefIdx], TmpReg, IdxReg);
+ LLT PtrTy = MRI.getType(Dst);
+ LLT IntPtrTy = LLT::scalar(PtrTy.getSizeInBits());
+
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ Register SPReg = Info->getStackPtrOffsetReg();
+ ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplyBank);
+
+ MachineIRBuilder B(MI);
+ B.setChangeObserver(Observer);
+
+ auto WaveSize = B.buildConstant(LLT::scalar(32), ST.getWavefrontSizeLog2());
+ auto ScaledSize = B.buildShl(IntPtrTy, AllocSize, WaveSize);
+
+ auto SPCopy = B.buildCopy(PtrTy, SPReg);
+ if (Alignment > TFI.getStackAlign()) {
+ auto PtrAdd = B.buildPtrAdd(PtrTy, SPCopy, ScaledSize);
+ B.buildMaskLowPtrBits(Dst, PtrAdd,
+ Log2(Alignment) + ST.getWavefrontSizeLog2());
+ } else {
+ B.buildPtrAdd(Dst, SPCopy, ScaledSize);
}
- MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ MI.eraseFromParent();
return true;
}
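
The dynamic-alloca lowering above scales the IR-level size by the wavefront size because the scratch (private) stack pointer is shared by the whole wave: every lane needs its own copy of the allocation. Ignoring the over-alignment path, the pointer arithmetic reduces to the following (illustrative model, not patch code):

#include <cstdint>

// SP and the result are wave-level scratch offsets; AllocSizePerLane is the
// byte size requested by the IR for a single lane.
static uint64_t dynAllocaPointer(uint64_t SP, uint64_t AllocSizePerLane,
                                 unsigned WavefrontSizeLog2) {
  return SP + (AllocSizePerLane << WavefrontSizeLog2);
}
// On a wave64 target (WavefrontSizeLog2 == 6), an alloca of 16 bytes per lane
// corresponds to 16 << 6 == 1024 bytes of wave-level scratch.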
@@ -1210,6 +1289,281 @@ bool AMDGPURegisterBankInfo::applyMappingImage(
return true;
}
+static Register getSrcRegIgnoringCopies(const MachineRegisterInfo &MRI,
+ Register Reg) {
+ MachineInstr *Def = getDefIgnoringCopies(Reg, MRI);
+ if (!Def)
+ return Reg;
+
+ // TODO: Guard against this being an implicit def
+ return Def->getOperand(0).getReg();
+}
+
+// Analyze a combined offset from an llvm.amdgcn.s.buffer.load intrinsic and store
+// the three offsets (voffset, soffset and instoffset)
+static unsigned setBufferOffsets(MachineIRBuilder &B,
+ const AMDGPURegisterBankInfo &RBI,
+ Register CombinedOffset, Register &VOffsetReg,
+ Register &SOffsetReg, int64_t &InstOffsetVal,
+ Align Alignment) {
+ const LLT S32 = LLT::scalar(32);
+ MachineRegisterInfo *MRI = B.getMRI();
+
+ if (Optional<int64_t> Imm = getConstantVRegVal(CombinedOffset, *MRI)) {
+ uint32_t SOffset, ImmOffset;
+ if (AMDGPU::splitMUBUFOffset(*Imm, SOffset, ImmOffset, &RBI.Subtarget,
+ Alignment)) {
+ VOffsetReg = B.buildConstant(S32, 0).getReg(0);
+ SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
+ InstOffsetVal = ImmOffset;
+
+ B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+ B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+ return SOffset + ImmOffset;
+ }
+ }
+
+ Register Base;
+ unsigned Offset;
+ MachineInstr *Unused;
+
+ std::tie(Base, Offset, Unused)
+ = AMDGPU::getBaseWithConstantOffset(*MRI, CombinedOffset);
+
+ uint32_t SOffset, ImmOffset;
+ if (Offset > 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
+ &RBI.Subtarget, Alignment)) {
+ if (RBI.getRegBank(Base, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+ VOffsetReg = Base;
+ SOffsetReg = B.buildConstant(S32, SOffset).getReg(0);
+ B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+ InstOffsetVal = ImmOffset;
+ return 0; // XXX - Why is this 0?
+ }
+
+ // If we have SGPR base, we can use it for soffset.
+ if (SOffset == 0) {
+ VOffsetReg = B.buildConstant(S32, 0).getReg(0);
+ B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+ SOffsetReg = Base;
+ InstOffsetVal = ImmOffset;
+ return 0; // XXX - Why is this 0?
+ }
+ }
+
+ // Handle the variable sgpr + vgpr case.
+ if (MachineInstr *Add = getOpcodeDef(AMDGPU::G_ADD, CombinedOffset, *MRI)) {
+ Register Src0 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(1).getReg());
+ Register Src1 = getSrcRegIgnoringCopies(*MRI, Add->getOperand(2).getReg());
+
+ const RegisterBank *Src0Bank = RBI.getRegBank(Src0, *MRI, *RBI.TRI);
+ const RegisterBank *Src1Bank = RBI.getRegBank(Src1, *MRI, *RBI.TRI);
+
+ if (Src0Bank == &AMDGPU::VGPRRegBank && Src1Bank == &AMDGPU::SGPRRegBank) {
+ VOffsetReg = Src0;
+ SOffsetReg = Src1;
+ return 0;
+ }
+
+ if (Src0Bank == &AMDGPU::SGPRRegBank && Src1Bank == &AMDGPU::VGPRRegBank) {
+ VOffsetReg = Src1;
+ SOffsetReg = Src0;
+ return 0;
+ }
+ }
+
+ // Ensure we have a VGPR for the combined offset. This could be an issue if we
+ // have an SGPR offset and a VGPR resource.
+ if (RBI.getRegBank(CombinedOffset, *MRI, *RBI.TRI) == &AMDGPU::VGPRRegBank) {
+ VOffsetReg = CombinedOffset;
+ } else {
+ VOffsetReg = B.buildCopy(S32, CombinedOffset).getReg(0);
+ B.getMRI()->setRegBank(VOffsetReg, AMDGPU::VGPRRegBank);
+ }
+
+ SOffsetReg = B.buildConstant(S32, 0).getReg(0);
+ B.getMRI()->setRegBank(SOffsetReg, AMDGPU::SGPRRegBank);
+ return 0;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingSBufferLoad(
+ const OperandsMapper &OpdMapper) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ const LLT S32 = LLT::scalar(32);
+ Register Dst = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(Dst);
+
+ const RegisterBank *RSrcBank =
+ OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+ const RegisterBank *OffsetBank =
+ OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+ if (RSrcBank == &AMDGPU::SGPRRegBank &&
+ OffsetBank == &AMDGPU::SGPRRegBank)
+ return true; // Legal mapping
+
+  // FIXME: 96-bit case was widened during legalize. We need to narrow it back
+ // here but don't have an MMO.
+
+ unsigned LoadSize = Ty.getSizeInBits();
+ int NumLoads = 1;
+ if (LoadSize == 256 || LoadSize == 512) {
+ NumLoads = LoadSize / 128;
+ Ty = Ty.divide(NumLoads);
+ }
+
+ // Use the alignment to ensure that the required offsets will fit into the
+ // immediate offsets.
+ const Align Alignment = NumLoads > 1 ? Align(16 * NumLoads) : Align(1);
+
+ MachineIRBuilder B(MI);
+ MachineFunction &MF = B.getMF();
+
+ Register SOffset;
+ Register VOffset;
+ int64_t ImmOffset = 0;
+
+ unsigned MMOOffset = setBufferOffsets(B, *this, MI.getOperand(2).getReg(),
+ VOffset, SOffset, ImmOffset, Alignment);
+
+ // TODO: 96-bit loads were widened to 128-bit results. Shrink the result if we
+  // can, but we need to track an MMO for that.
+ const unsigned MemSize = (Ty.getSizeInBits() + 7) / 8;
+ const Align MemAlign(4); // FIXME: ABI type alignment?
+ MachineMemOperand *BaseMMO = MF.getMachineMemOperand(
+ MachinePointerInfo(),
+ MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOInvariant,
+ MemSize, MemAlign);
+ if (MMOOffset != 0)
+ BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset, MemSize);
+
+ // If only the offset is divergent, emit a MUBUF buffer load instead. We can
+ // assume that the buffer is unswizzled.
+
+ Register RSrc = MI.getOperand(1).getReg();
+ Register VIndex = B.buildConstant(S32, 0).getReg(0);
+ B.getMRI()->setRegBank(VIndex, AMDGPU::VGPRRegBank);
+
+ SmallVector<Register, 4> LoadParts(NumLoads);
+
+ MachineBasicBlock::iterator MII = MI.getIterator();
+ MachineInstrSpan Span(MII, &B.getMBB());
+
+ for (int i = 0; i < NumLoads; ++i) {
+ if (NumLoads == 1) {
+ LoadParts[i] = Dst;
+ } else {
+ LoadParts[i] = MRI.createGenericVirtualRegister(Ty);
+ MRI.setRegBank(LoadParts[i], AMDGPU::VGPRRegBank);
+ }
+
+ MachineMemOperand *MMO = BaseMMO;
+ if (i != 0)
+ BaseMMO = MF.getMachineMemOperand(BaseMMO, MMOOffset + 16 * i, MemSize);
+
+ B.buildInstr(AMDGPU::G_AMDGPU_BUFFER_LOAD)
+ .addDef(LoadParts[i]) // vdata
+ .addUse(RSrc) // rsrc
+ .addUse(VIndex) // vindex
+ .addUse(VOffset) // voffset
+ .addUse(SOffset) // soffset
+ .addImm(ImmOffset + 16 * i) // offset(imm)
+ .addImm(0) // cachepolicy, swizzled buffer(imm)
+ .addImm(0) // idxen(imm)
+ .addMemOperand(MMO);
+ }
+
+ // TODO: If only the resource is a VGPR, it may be better to execute the
+ // scalar load in the waterfall loop if the resource is expected to frequently
+ // be dynamically uniform.
+ if (RSrcBank != &AMDGPU::SGPRRegBank) {
+ // Remove the original instruction to avoid potentially confusing the
+ // waterfall loop logic.
+ B.setInstr(*Span.begin());
+ MI.eraseFromParent();
+
+ SmallSet<Register, 4> OpsToWaterfall;
+
+ OpsToWaterfall.insert(RSrc);
+ executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
+ OpsToWaterfall, MRI);
+ }
+
+ if (NumLoads != 1) {
+ if (Ty.isVector())
+ B.buildConcatVectors(Dst, LoadParts);
+ else
+ B.buildMerge(Dst, LoadParts);
+ }
+
+ // We removed the instruction earlier with a waterfall loop.
+ if (RSrcBank == &AMDGPU::SGPRRegBank)
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPURegisterBankInfo::applyMappingBFEIntrinsic(
+ const OperandsMapper &OpdMapper, bool Signed) const {
+ MachineInstr &MI = OpdMapper.getMI();
+ MachineRegisterInfo &MRI = OpdMapper.getMRI();
+
+ // Insert basic copies
+ applyDefaultMapping(OpdMapper);
+
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT Ty = MRI.getType(DstReg);
+
+ const LLT S32 = LLT::scalar(32);
+
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::VGPRRegBank) {
+ if (Ty == S32)
+ return true;
+
+ // TODO: 64-bit version is scalar only, so we need to expand this.
+ return false;
+ }
+
+ Register SrcReg = MI.getOperand(2).getReg();
+ Register OffsetReg = MI.getOperand(3).getReg();
+ Register WidthReg = MI.getOperand(4).getReg();
+
+ // The scalar form packs the offset and width in a single operand.
+
+ ApplyRegBankMapping ApplyBank(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplyBank);
+ MachineIRBuilder B(MI);
+ B.setChangeObserver(Observer);
+
+ // Ensure the high bits are clear to insert the offset.
+ auto OffsetMask = B.buildConstant(S32, maskTrailingOnes<unsigned>(6));
+ auto ClampOffset = B.buildAnd(S32, OffsetReg, OffsetMask);
+
+ // Zeros out the low bits, so don't bother clamping the input value.
+ auto ShiftWidth = B.buildShl(S32, WidthReg, B.buildConstant(S32, 16));
+
+ // Transformation function, pack the offset and width of a BFE into
+ // the format expected by the S_BFE_I32 / S_BFE_U32. In the second
+ // source, bits [5:0] contain the offset and bits [22:16] the width.
+ auto MergedInputs = B.buildOr(S32, ClampOffset, ShiftWidth);
+
+ // TODO: It might be worth using a pseudo here to avoid scc clobber and
+ // register class constraints.
+ unsigned Opc = Ty == S32 ? (Signed ? AMDGPU::S_BFE_I32 : AMDGPU::S_BFE_U32) :
+ (Signed ? AMDGPU::S_BFE_I64 : AMDGPU::S_BFE_U64);
+
+ auto MIB = B.buildInstr(Opc, {DstReg}, {SrcReg, MergedInputs});
+ if (!constrainSelectedInstRegOperands(*MIB, *TII, *TRI, *this))
+ llvm_unreachable("failed to constrain BFE");
+
+ MI.eraseFromParent();
+ return true;
+}
+
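
The S_BFE operand packing done just above is easy to verify with plain arithmetic: the offset occupies bits [5:0] of the second source and the width bits [22:16]. A tiny illustrative helper (not patch code):

#include <cstdint>

// Pack an offset/width pair the way S_BFE_{I,U}32 / S_BFE_{I,U}64 expect it:
// bits [5:0] = offset, bits [22:16] = width (assumed already in range).
static uint32_t packBFEOperand(uint32_t Offset, uint32_t Width) {
  return (Offset & 0x3f) | (Width << 16);
}
// packBFEOperand(8, 4) == 0x00040008: extract 4 bits starting at bit 8.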
// FIXME: Duplicated from LegalizerHelper
static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
switch (Opc) {
@@ -1226,6 +1580,51 @@ static CmpInst::Predicate minMaxToCompare(unsigned Opc) {
}
}
+static unsigned minMaxToExtend(unsigned Opc) {
+ switch (Opc) {
+ case TargetOpcode::G_SMIN:
+ case TargetOpcode::G_SMAX:
+ return TargetOpcode::G_SEXT;
+ case TargetOpcode::G_UMIN:
+ case TargetOpcode::G_UMAX:
+ return TargetOpcode::G_ZEXT;
+ default:
+ llvm_unreachable("not in integer min/max");
+ }
+}
+
+// Emit a legalized extension from <2 x s16> to 2 32-bit components, avoiding
+// any illegal vector extend or unmerge operations.
+static std::pair<Register, Register>
+unpackV2S16ToS32(MachineIRBuilder &B, Register Src, unsigned ExtOpcode) {
+ const LLT S32 = LLT::scalar(32);
+ auto Bitcast = B.buildBitcast(S32, Src);
+
+ if (ExtOpcode == TargetOpcode::G_SEXT) {
+ auto ExtLo = B.buildSExtInReg(S32, Bitcast, 16);
+ auto ShiftHi = B.buildAShr(S32, Bitcast, B.buildConstant(S32, 16));
+ return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
+ }
+
+ auto ShiftHi = B.buildLShr(S32, Bitcast, B.buildConstant(S32, 16));
+ if (ExtOpcode == TargetOpcode::G_ZEXT) {
+ auto ExtLo = B.buildAnd(S32, Bitcast, B.buildConstant(S32, 0xffff));
+ return std::make_pair(ExtLo.getReg(0), ShiftHi.getReg(0));
+ }
+
+ assert(ExtOpcode == TargetOpcode::G_ANYEXT);
+ return std::make_pair(Bitcast.getReg(0), ShiftHi.getReg(0));
+}
+
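
unpackV2S16ToS32() above avoids illegal vector extends by treating the <2 x s16> as one 32-bit word and extracting the halves with shifts and masks. The scalar arithmetic it emits corresponds to (illustrative model, not patch code; the G_ANYEXT case simply reuses the word as the low half):

#include <cstdint>

// The low element is bits [15:0] of the packed word, the high element bits
// [31:16]; signedness selects sign- versus zero-extension to 32 bits.
static void unpackV2S16(uint32_t Packed, bool IsSigned,
                        int32_t &Lo, int32_t &Hi) {
  if (IsSigned) {
    Lo = (int32_t)(Packed << 16) >> 16;   // G_SEXT_INREG by 16 bits
    Hi = (int32_t)Packed >> 16;           // arithmetic shift right (G_ASHR)
  } else {
    Lo = (int32_t)(Packed & 0xffff);      // mask low half (G_AND)
    Hi = (int32_t)(Packed >> 16);         // logical shift right (G_LSHR)
  }
}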
+static MachineInstr *buildExpandedScalarMinMax(MachineIRBuilder &B,
+ CmpInst::Predicate Pred,
+ Register Dst, Register Src0,
+ Register Src1) {
+ const LLT CmpType = LLT::scalar(32);
+ auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
+ return B.buildSelect(Dst, Cmp, Src0, Src1);
+}
+
// FIXME: Duplicated from LegalizerHelper, except changing the boolean type.
void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
MachineInstr &MI) const {
@@ -1234,24 +1633,25 @@ void AMDGPURegisterBankInfo::lowerScalarMinMax(MachineIRBuilder &B,
Register Src1 = MI.getOperand(2).getReg();
const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
- LLT CmpType = LLT::scalar(32);
-
- auto Cmp = B.buildICmp(Pred, CmpType, Src0, Src1);
- B.buildSelect(Dst, Cmp, Src0, Src1);
+ MachineInstr *Sel = buildExpandedScalarMinMax(B, Pred, Dst, Src0, Src1);
- B.getMRI()->setRegBank(Cmp.getReg(0), AMDGPU::SGPRRegBank);
+ Register CmpReg = Sel->getOperand(1).getReg();
+ B.getMRI()->setRegBank(CmpReg, AMDGPU::SGPRRegBank);
MI.eraseFromParent();
}
// For cases where only a single copy is inserted for matching register banks.
// Replace the register in the instruction operand
-static void substituteSimpleCopyRegs(
+static bool substituteSimpleCopyRegs(
const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper, unsigned OpIdx) {
SmallVector<unsigned, 1> SrcReg(OpdMapper.getVRegs(OpIdx));
if (!SrcReg.empty()) {
assert(SrcReg.size() == 1);
OpdMapper.getMI().getOperand(OpIdx).setReg(SrcReg[0]);
+ return true;
}
+
+ return false;
}
/// Handle register layout difference for f16 images for some subtargets.
@@ -1465,6 +1865,223 @@ bool AMDGPURegisterBankInfo::buildVCopy(MachineIRBuilder &B, Register DstReg,
constrainGenericRegister(DstReg, AMDGPU::VReg_64RegClass, MRI);
}
+/// Utility function for pushing dynamic vector indexes with a constant offset
+/// into waterfall loops.
+static void reinsertVectorIndexAdd(MachineIRBuilder &B,
+ MachineInstr &IdxUseInstr,
+ unsigned OpIdx,
+ unsigned ConstOffset) {
+ MachineRegisterInfo &MRI = *B.getMRI();
+ const LLT S32 = LLT::scalar(32);
+ Register WaterfallIdx = IdxUseInstr.getOperand(OpIdx).getReg();
+ B.setInsertPt(*IdxUseInstr.getParent(), IdxUseInstr.getIterator());
+
+ auto MaterializedOffset = B.buildConstant(S32, ConstOffset);
+
+ auto Add = B.buildAdd(S32, WaterfallIdx, MaterializedOffset);
+ MRI.setRegBank(MaterializedOffset.getReg(0), AMDGPU::SGPRRegBank);
+ MRI.setRegBank(Add.getReg(0), AMDGPU::SGPRRegBank);
+ IdxUseInstr.getOperand(OpIdx).setReg(Add.getReg(0));
+}
+
+/// Implement extending a 32-bit value to a 64-bit value. \p Lo32Reg is the
+/// original 32-bit source value (to be inserted in the low part of the combined
+/// 64-bit result), and \p Hi32Reg is the high half of the combined 64-bit
+/// value.
+static void extendLow32IntoHigh32(MachineIRBuilder &B,
+ Register Hi32Reg, Register Lo32Reg,
+ unsigned ExtOpc,
+ const RegisterBank &RegBank,
+ bool IsBooleanSrc = false) {
+ if (ExtOpc == AMDGPU::G_ZEXT) {
+ B.buildConstant(Hi32Reg, 0);
+ } else if (ExtOpc == AMDGPU::G_SEXT) {
+ if (IsBooleanSrc) {
+ // If we know the original source was an s1, the high half is the same as
+ // the low.
+ B.buildCopy(Hi32Reg, Lo32Reg);
+ } else {
+ // Replicate sign bit from 32-bit extended part.
+ auto ShiftAmt = B.buildConstant(LLT::scalar(32), 31);
+ B.getMRI()->setRegBank(ShiftAmt.getReg(0), RegBank);
+ B.buildAShr(Hi32Reg, Lo32Reg, ShiftAmt);
+ }
+ } else {
+ assert(ExtOpc == AMDGPU::G_ANYEXT && "not an integer extension");
+ B.buildUndef(Hi32Reg);
+ }
+}
+
+bool AMDGPURegisterBankInfo::foldExtractEltToCmpSelect(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const {
+
+ Register VecReg = MI.getOperand(1).getReg();
+ Register Idx = MI.getOperand(2).getReg();
+
+ const RegisterBank &IdxBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+ bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+ LLT VecTy = MRI.getType(VecReg);
+ unsigned EltSize = VecTy.getScalarSizeInBits();
+ unsigned NumElem = VecTy.getNumElements();
+
+ if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+ IsDivergentIdx))
+ return false;
+
+ MachineIRBuilder B(MI);
+ LLT S32 = LLT::scalar(32);
+
+ const RegisterBank &DstBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ const RegisterBank &SrcBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+
+ const RegisterBank &CCBank =
+ (DstBank == AMDGPU::SGPRRegBank &&
+ SrcBank == AMDGPU::SGPRRegBank &&
+ IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+ : AMDGPU::VCCRegBank;
+ LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+ if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+ Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+ MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+ }
+
+ LLT EltTy = VecTy.getScalarType();
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+ unsigned NumLanes = DstRegs.size();
+ if (!NumLanes)
+ NumLanes = 1;
+ else
+ EltTy = MRI.getType(DstRegs[0]);
+
+ auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+ SmallVector<Register, 2> Res(NumLanes);
+ for (unsigned L = 0; L < NumLanes; ++L)
+ Res[L] = UnmergeToEltTy.getReg(L);
+
+ for (unsigned I = 1; I < NumElem; ++I) {
+ auto IC = B.buildConstant(S32, I);
+ MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+ auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+ MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+ for (unsigned L = 0; L < NumLanes; ++L) {
+ auto S = B.buildSelect(EltTy, Cmp,
+ UnmergeToEltTy.getReg(I * NumLanes + L), Res[L]);
+
+ for (unsigned N : { 0, 2, 3 })
+ MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+ Res[L] = S->getOperand(0).getReg();
+ }
+ }
+
+ for (unsigned L = 0; L < NumLanes; ++L) {
+ Register DstReg = (NumLanes == 1) ? MI.getOperand(0).getReg() : DstRegs[L];
+ B.buildCopy(DstReg, Res[L]);
+ MRI.setRegBank(DstReg, DstBank);
+ }
+
+ MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+ MI.eraseFromParent();
+
+ return true;
+}
+
+bool AMDGPURegisterBankInfo::foldInsertEltToCmpSelect(
+ MachineInstr &MI, MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const {
+
+ Register VecReg = MI.getOperand(1).getReg();
+ Register Idx = MI.getOperand(3).getReg();
+
+ const RegisterBank &IdxBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
+ bool IsDivergentIdx = IdxBank == AMDGPU::VGPRRegBank;
+
+ LLT VecTy = MRI.getType(VecReg);
+ unsigned EltSize = VecTy.getScalarSizeInBits();
+ unsigned NumElem = VecTy.getNumElements();
+
+ if (!SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+ IsDivergentIdx))
+ return false;
+
+ MachineIRBuilder B(MI);
+ LLT S32 = LLT::scalar(32);
+
+ const RegisterBank &DstBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ const RegisterBank &SrcBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
+ const RegisterBank &InsBank =
+ *OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+ const RegisterBank &CCBank =
+ (DstBank == AMDGPU::SGPRRegBank &&
+ SrcBank == AMDGPU::SGPRRegBank &&
+ InsBank == AMDGPU::SGPRRegBank &&
+ IdxBank == AMDGPU::SGPRRegBank) ? AMDGPU::SGPRRegBank
+ : AMDGPU::VCCRegBank;
+ LLT CCTy = (CCBank == AMDGPU::SGPRRegBank) ? S32 : LLT::scalar(1);
+
+ if (CCBank == AMDGPU::VCCRegBank && IdxBank == AMDGPU::SGPRRegBank) {
+ Idx = B.buildCopy(S32, Idx)->getOperand(0).getReg();
+ MRI.setRegBank(Idx, AMDGPU::VGPRRegBank);
+ }
+
+ LLT EltTy = VecTy.getScalarType();
+ SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
+ unsigned NumLanes = InsRegs.size();
+ if (!NumLanes) {
+ NumLanes = 1;
+ InsRegs.push_back(MI.getOperand(2).getReg());
+ } else {
+ EltTy = MRI.getType(InsRegs[0]);
+ }
+
+ auto UnmergeToEltTy = B.buildUnmerge(EltTy, VecReg);
+ SmallVector<Register, 16> Ops(NumElem * NumLanes);
+
+ for (unsigned I = 0; I < NumElem; ++I) {
+ auto IC = B.buildConstant(S32, I);
+ MRI.setRegBank(IC->getOperand(0).getReg(), AMDGPU::SGPRRegBank);
+ auto Cmp = B.buildICmp(CmpInst::ICMP_EQ, CCTy, Idx, IC);
+ MRI.setRegBank(Cmp->getOperand(0).getReg(), CCBank);
+
+ for (unsigned L = 0; L < NumLanes; ++L) {
+ auto S = B.buildSelect(EltTy, Cmp, InsRegs[L],
+ UnmergeToEltTy.getReg(I * NumLanes + L));
+
+ for (unsigned N : { 0, 2, 3 })
+ MRI.setRegBank(S->getOperand(N).getReg(), DstBank);
+
+ Ops[I * NumLanes + L] = S->getOperand(0).getReg();
+ }
+ }
+
+ LLT MergeTy = LLT::vector(Ops.size(), EltTy);
+ if (MergeTy == MRI.getType(MI.getOperand(0).getReg())) {
+ B.buildBuildVector(MI.getOperand(0), Ops);
+ } else {
+ auto Vec = B.buildBuildVector(MergeTy, Ops);
+ MRI.setRegBank(Vec->getOperand(0).getReg(), DstBank);
+ B.buildBitcast(MI.getOperand(0).getReg(), Vec);
+ }
+
+ MRI.setRegBank(MI.getOperand(0).getReg(), DstBank);
+ MI.eraseFromParent();
+
+ return true;
+}
+
void AMDGPURegisterBankInfo::applyMappingImpl(
const OperandsMapper &OpdMapper) const {
MachineInstr &MI = OpdMapper.getMI();
@@ -1555,7 +2172,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineBasicBlock *MBB = MI.getParent();
B.setInsertPt(*MBB, std::next(MI.getIterator()));
- B.buildTrunc(DstReg, NewDstReg);
+
+ // If we had a constrained VCC result register, a copy was inserted to VCC
+ // from SGPR.
+ SmallVector<Register, 1> DefRegs(OpdMapper.getVRegs(0));
+ if (DefRegs.empty())
+ DefRegs.push_back(DstReg);
+ B.buildTrunc(DefRegs[0], NewDstReg);
return;
}
case AMDGPU::G_SELECT: {
@@ -1712,10 +2335,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
}
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
- case AMDGPU::G_MUL: {
+ case AMDGPU::G_MUL:
+ case AMDGPU::G_SHL:
+ case AMDGPU::G_LSHR:
+ case AMDGPU::G_ASHR: {
Register DstReg = MI.getOperand(0).getReg();
LLT DstTy = MRI.getType(DstReg);
- if (DstTy != LLT::scalar(16))
+
+ // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
+ // Packed 16-bit operations need to be scalarized and promoted.
+ if (DstTy != LLT::scalar(16) && DstTy != LLT::vector(2, 16))
break;
const RegisterBank *DstBank =
@@ -1723,16 +2352,42 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstBank == &AMDGPU::VGPRRegBank)
break;
- // 16-bit operations are VALU only, but can be promoted to 32-bit SALU.
- MachineFunction *MF = MI.getParent()->getParent();
+ const LLT S32 = LLT::scalar(32);
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction *MF = MBB->getParent();
MachineIRBuilder B(MI);
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
- LegalizerHelper Helper(*MF, Observer, B);
- if (Helper.widenScalar(MI, 0, LLT::scalar(32)) !=
- LegalizerHelper::Legalized)
- llvm_unreachable("widen scalar should have succeeded");
+ if (DstTy.isVector()) {
+ B.setChangeObserver(Observer);
+
+ Register WideSrc0Lo, WideSrc0Hi;
+ Register WideSrc1Lo, WideSrc1Hi;
+
+ std::tie(WideSrc0Lo, WideSrc0Hi)
+ = unpackV2S16ToS32(B, MI.getOperand(1).getReg(), AMDGPU::G_ANYEXT);
+ std::tie(WideSrc1Lo, WideSrc1Hi)
+ = unpackV2S16ToS32(B, MI.getOperand(2).getReg(), AMDGPU::G_ANYEXT);
+ auto Lo = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Lo, WideSrc1Lo});
+ auto Hi = B.buildInstr(MI.getOpcode(), {S32}, {WideSrc0Hi, WideSrc1Hi});
+ B.buildBuildVectorTrunc(DstReg, {Lo.getReg(0), Hi.getReg(0)});
+ MI.eraseFromParent();
+ } else {
+ LegalizerHelper Helper(*MF, Observer, B);
+
+ if (Helper.widenScalar(MI, 0, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+
+ // FIXME: s16 shift amounts should be legal.
+ if (Opc == AMDGPU::G_SHL || Opc == AMDGPU::G_LSHR ||
+ Opc == AMDGPU::G_ASHR) {
+ B.setInsertPt(*MBB, MI.getIterator());
+ if (Helper.widenScalar(MI, 1, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("widen scalar should have succeeded");
+ }
+ }
+
return;
}
case AMDGPU::G_SMIN:
@@ -1750,10 +2405,44 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// Turn scalar min/max into a compare and select.
LLT Ty = MRI.getType(DstReg);
- LLT S32 = LLT::scalar(32);
- LLT S16 = LLT::scalar(16);
+ const LLT S32 = LLT::scalar(32);
+ const LLT S16 = LLT::scalar(16);
+ const LLT V2S16 = LLT::vector(2, 16);
- if (Ty == S16) {
+ if (Ty == V2S16) {
+ ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
+ GISelObserverWrapper Observer(&ApplySALU);
+ B.setChangeObserver(Observer);
+
+ // Need to widen to s32, and expand as cmp + select, and avoid producing
+ // illegal vector extends or unmerges that would need further
+ // legalization.
+ //
+ // TODO: Should we just readfirstlane? That should probably be handled
+ // with a UniformVGPR register bank that wouldn't need special
+ // consideration here.
+
+ Register Dst = MI.getOperand(0).getReg();
+ Register Src0 = MI.getOperand(1).getReg();
+ Register Src1 = MI.getOperand(2).getReg();
+
+ Register WideSrc0Lo, WideSrc0Hi;
+ Register WideSrc1Lo, WideSrc1Hi;
+
+ unsigned ExtendOp = minMaxToExtend(MI.getOpcode());
+
+ std::tie(WideSrc0Lo, WideSrc0Hi) = unpackV2S16ToS32(B, Src0, ExtendOp);
+ std::tie(WideSrc1Lo, WideSrc1Hi) = unpackV2S16ToS32(B, Src1, ExtendOp);
+
+ Register Lo = MRI.createGenericVirtualRegister(S32);
+ Register Hi = MRI.createGenericVirtualRegister(S32);
+ const CmpInst::Predicate Pred = minMaxToCompare(MI.getOpcode());
+ buildExpandedScalarMinMax(B, Pred, Lo, WideSrc0Lo, WideSrc1Lo);
+ buildExpandedScalarMinMax(B, Pred, Hi, WideSrc0Hi, WideSrc1Hi);
+
+ B.buildBuildVectorTrunc(Dst, {Lo, Hi});
+ MI.eraseFromParent();
+ } else if (Ty == S16) {
ApplyRegBankMapping ApplySALU(*this, MRI, &AMDGPU::SGPRRegBank);
GISelObserverWrapper Observer(&ApplySALU);
LegalizerHelper Helper(*MF, Observer, B);
@@ -1769,11 +2458,77 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
+ case AMDGPU::G_SEXT_INREG: {
+ SmallVector<Register, 2> SrcRegs(OpdMapper.getVRegs(1));
+ if (SrcRegs.empty())
+ break; // Nothing to repair
+
+ const LLT S32 = LLT::scalar(32);
+ MachineIRBuilder B(MI);
+ ApplyRegBankMapping O(*this, MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&O);
+ B.setChangeObserver(Observer);
+
+ // Don't use LegalizerHelper's narrowScalar. It produces unwanted G_SEXTs
+ // we would need to further expand, and doesn't let us directly set the
+ // result registers.
+ SmallVector<Register, 2> DstRegs(OpdMapper.getVRegs(0));
+
+ int Amt = MI.getOperand(2).getImm();
+ if (Amt <= 32) {
+ if (Amt == 32) {
+ // The low bits are unchanged.
+ B.buildCopy(DstRegs[0], SrcRegs[0]);
+ } else {
+ // Extend in the low bits and propagate the sign bit to the high half.
+ B.buildSExtInReg(DstRegs[0], SrcRegs[0], Amt);
+ }
+
+ B.buildAShr(DstRegs[1], DstRegs[0], B.buildConstant(S32, 31));
+ } else {
+ // The low bits are unchanged, and extend in the high bits.
+ B.buildCopy(DstRegs[0], SrcRegs[0]);
+ B.buildSExtInReg(DstRegs[1], DstRegs[0], Amt - 32);
+ }
+
+ Register DstReg = MI.getOperand(0).getReg();
+ MRI.setRegBank(DstReg, AMDGPU::VGPRRegBank);
+ MI.eraseFromParent();
+ return;
+ }
+ case AMDGPU::G_CTPOP:
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF: {
+ MachineIRBuilder B(MI);
+ MachineFunction &MF = B.getMF();
+
+ const RegisterBank *DstBank =
+ OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
+ if (DstBank == &AMDGPU::SGPRRegBank)
+ break;
+
+ Register SrcReg = MI.getOperand(1).getReg();
+ const LLT S32 = LLT::scalar(32);
+ LLT Ty = MRI.getType(SrcReg);
+ if (Ty == S32)
+ break;
+
+ ApplyRegBankMapping ApplyVALU(*this, MRI, &AMDGPU::VGPRRegBank);
+ GISelObserverWrapper Observer(&ApplyVALU);
+ LegalizerHelper Helper(MF, Observer, B);
+
+ if (Helper.narrowScalar(MI, 1, S32) != LegalizerHelper::Legalized)
+ llvm_unreachable("narrowScalar should have succeeded");
+ return;
+ }
case AMDGPU::G_SEXT:
- case AMDGPU::G_ZEXT: {
+ case AMDGPU::G_ZEXT:
+ case AMDGPU::G_ANYEXT: {
Register SrcReg = MI.getOperand(1).getReg();
LLT SrcTy = MRI.getType(SrcReg);
- bool Signed = Opc == AMDGPU::G_SEXT;
+ const bool Signed = Opc == AMDGPU::G_SEXT;
+
+ assert(empty(OpdMapper.getVRegs(1)));
MachineIRBuilder B(MI);
const RegisterBank *SrcBank =
@@ -1788,23 +2543,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
// breakdowns supported.
DstTy.getSizeInBits() == 64 &&
SrcTy.getSizeInBits() <= 32) {
- const LLT S32 = LLT::scalar(32);
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
// Extend to 32-bit, and then extend the low half.
if (Signed) {
// TODO: Should really be buildSExtOrCopy
B.buildSExtOrTrunc(DefRegs[0], SrcReg);
-
- // Replicate sign bit from 32-bit extended part.
- auto ShiftAmt = B.buildConstant(S32, 31);
- MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
- B.buildAShr(DefRegs[1], DefRegs[0], ShiftAmt);
- } else {
+ } else if (Opc == AMDGPU::G_ZEXT) {
B.buildZExtOrTrunc(DefRegs[0], SrcReg);
- B.buildConstant(DefRegs[1], 0);
+ } else {
+ B.buildAnyExtOrTrunc(DefRegs[0], SrcReg);
}
+ extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank);
MRI.setRegBank(DstReg, *SrcBank);
MI.eraseFromParent();
return;
@@ -1813,6 +2564,9 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (SrcTy != LLT::scalar(1))
return;
+ // It is not legal to have a legalization artifact with a VCC source. Rather
+ // than introducing a copy, directly insert the select that the copy would
+ // have been lowered to.
if (SrcBank == &AMDGPU::VCCRegBank) {
SmallVector<Register, 2> DefRegs(OpdMapper.getVRegs(0));
@@ -1834,7 +2588,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
if (DstSize > 32) {
B.buildSelect(DefRegs[0], SrcReg, True, False);
- B.buildCopy(DefRegs[1], DefRegs[0]);
+ extendLow32IntoHigh32(B, DefRegs[1], DefRegs[0], Opc, *SrcBank, true);
} else if (DstSize < 32) {
auto Sel = B.buildSelect(SelType, SrcReg, True, False);
MRI.setRegBank(Sel.getReg(0), *DstBank);
@@ -1847,24 +2601,7 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
return;
}
- // Fixup the case with an s1 src that isn't a condition register. Use shifts
- // instead of introducing a compare to avoid an unnecessary condition
- // register (and since there's no scalar 16-bit compares).
- auto Ext = B.buildAnyExt(DstTy, SrcReg);
- auto ShiftAmt = B.buildConstant(LLT::scalar(32), DstTy.getSizeInBits() - 1);
- auto Shl = B.buildShl(DstTy, Ext, ShiftAmt);
-
- if (MI.getOpcode() == AMDGPU::G_SEXT)
- B.buildAShr(DstReg, Shl, ShiftAmt);
- else
- B.buildLShr(DstReg, Shl, ShiftAmt);
-
- MRI.setRegBank(DstReg, *SrcBank);
- MRI.setRegBank(Ext.getReg(0), *SrcBank);
- MRI.setRegBank(ShiftAmt.getReg(0), *SrcBank);
- MRI.setRegBank(Shl.getReg(0), *SrcBank);
- MI.eraseFromParent();
- return;
+ break;
}
case AMDGPU::G_BUILD_VECTOR:
case AMDGPU::G_BUILD_VECTOR_TRUNC: {
@@ -1934,7 +2671,16 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
assert(OpdMapper.getVRegs(1).empty() && OpdMapper.getVRegs(2).empty());
- LLT DstTy = MRI.getType(MI.getOperand(0).getReg());
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
+
+ const LLT S32 = LLT::scalar(32);
+ LLT DstTy = MRI.getType(DstReg);
+ LLT SrcTy = MRI.getType(SrcReg);
+
+ if (foldExtractEltToCmpSelect(MI, MRI, OpdMapper))
+ return;
+
MachineIRBuilder B(MI);
const ValueMapping &DstMapping
@@ -1942,10 +2688,26 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
const RegisterBank *DstBank = DstMapping.BreakDown[0].RegBank;
const RegisterBank *SrcBank =
OpdMapper.getInstrMapping().getOperandMapping(1).BreakDown[0].RegBank;
-
- Register DstReg = MI.getOperand(0).getReg();
- Register SrcReg = MI.getOperand(1).getReg();
- Register IdxReg = MI.getOperand(2).getReg();
+ const RegisterBank *IdxBank =
+ OpdMapper.getInstrMapping().getOperandMapping(2).BreakDown[0].RegBank;
+
+ Register BaseIdxReg;
+ unsigned ConstOffset;
+ MachineInstr *OffsetDef;
+ std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+ AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(2).getReg());
+
+ // See if the index is an add of a constant which will be foldable by moving
+ // the base register of the index later if this is going to be executed in a
+ // waterfall loop. This is essentially to reassociate the add of a constant
+ // with the readfirstlane.
+ bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
+ ConstOffset > 0 &&
+ ConstOffset < SrcTy.getNumElements();
+
+ // Move the base register. We'll re-insert the add later.
+ if (ShouldMoveIndexIntoLoop)
+ MI.getOperand(2).setReg(BaseIdxReg);
// If this is a VGPR result only because the index was a VGPR result, the
// actual indexing will be done on the SGPR source vector, which will
@@ -1969,26 +2731,30 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstReg, TmpReg);
}
+ // Re-insert the constant offset add inside the waterfall loop.
+ if (ShouldMoveIndexIntoLoop)
+ reinsertVectorIndexAdd(B, MI, 2, ConstOffset);
+
return;
}
assert(DstTy.getSizeInBits() == 64);
- LLT SrcTy = MRI.getType(SrcReg);
- const LLT S32 = LLT::scalar(32);
LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
auto One = B.buildConstant(S32, 1);
+ MachineBasicBlock::iterator MII = MI.getIterator();
+
// Split the vector index into 32-bit pieces. Prepare to move all of the
// new instructions into a waterfall loop if necessary.
//
// Don't put the bitcast or constant in the loop.
- MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
+ MachineInstrSpan Span(MII, &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
- auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto Extract0 = B.buildExtractVectorElement(DstRegs[0], CastSrc, IdxLo);
@@ -2029,33 +2795,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
buildVCopy(B, DstRegs[1], TmpReg1);
}
+ if (ShouldMoveIndexIntoLoop)
+ reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
+
return;
}
case AMDGPU::G_INSERT_VECTOR_ELT: {
SmallVector<Register, 2> InsRegs(OpdMapper.getVRegs(2));
+ Register DstReg = MI.getOperand(0).getReg();
+ LLT VecTy = MRI.getType(DstReg);
+
assert(OpdMapper.getVRegs(0).empty());
- assert(OpdMapper.getVRegs(1).empty());
assert(OpdMapper.getVRegs(3).empty());
- if (InsRegs.empty()) {
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, { 3 });
+ if (substituteSimpleCopyRegs(OpdMapper, 1))
+ MRI.setType(MI.getOperand(1).getReg(), VecTy);
+
+ if (foldInsertEltToCmpSelect(MI, MRI, OpdMapper))
return;
- }
- Register DstReg = MI.getOperand(0).getReg();
+ const RegisterBank *IdxBank =
+ OpdMapper.getInstrMapping().getOperandMapping(3).BreakDown[0].RegBank;
+
Register SrcReg = MI.getOperand(1).getReg();
Register InsReg = MI.getOperand(2).getReg();
- Register IdxReg = MI.getOperand(3).getReg();
- LLT SrcTy = MRI.getType(SrcReg);
LLT InsTy = MRI.getType(InsReg);
(void)InsTy;
+ Register BaseIdxReg;
+ unsigned ConstOffset;
+ MachineInstr *OffsetDef;
+ std::tie(BaseIdxReg, ConstOffset, OffsetDef) =
+ AMDGPU::getBaseWithConstantOffset(MRI, MI.getOperand(3).getReg());
+
+ // See if the index is an add of a constant which will be foldable by moving
+ // the base register of the index later if this is going to be executed in a
+ // waterfall loop. This is essentially to reassociate the add of a constant
+ // with the readfirstlane.
+ bool ShouldMoveIndexIntoLoop = IdxBank != &AMDGPU::SGPRRegBank &&
+ ConstOffset > 0 &&
+ ConstOffset < VecTy.getNumElements();
+
+ // Move the base register. We'll re-insert the add later.
+ if (ShouldMoveIndexIntoLoop)
+ MI.getOperand(3).setReg(BaseIdxReg);
+
+
+ if (InsRegs.empty()) {
+ executeInWaterfallLoop(MI, MRI, { 3 });
+
+ // Re-insert the constant offset add inside the waterfall loop.
+ if (ShouldMoveIndexIntoLoop) {
+ MachineIRBuilder B(MI);
+ reinsertVectorIndexAdd(B, MI, 3, ConstOffset);
+ }
+
+ return;
+ }
+
+
assert(InsTy.getSizeInBits() == 64);
const LLT S32 = LLT::scalar(32);
- LLT Vec32 = LLT::vector(2 * SrcTy.getNumElements(), 32);
+ LLT Vec32 = LLT::vector(2 * VecTy.getNumElements(), 32);
MachineIRBuilder B(MI);
auto CastSrc = B.buildBitcast(Vec32, SrcReg);
@@ -2068,12 +2871,11 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
MachineInstrSpan Span(MachineBasicBlock::iterator(&MI), &B.getMBB());
// Compute 32-bit element indices, (2 * OrigIdx, 2 * OrigIdx + 1).
- auto IdxLo = B.buildShl(S32, IdxReg, One);
+ auto IdxLo = B.buildShl(S32, BaseIdxReg, One);
auto IdxHi = B.buildAdd(S32, IdxLo, One);
auto InsLo = B.buildInsertVectorElement(Vec32, CastSrc, InsRegs[0], IdxLo);
auto InsHi = B.buildInsertVectorElement(Vec32, InsLo, InsRegs[1], IdxHi);
- B.buildBitcast(DstReg, InsHi);
const RegisterBank *DstBank =
OpdMapper.getInstrMapping().getOperandMapping(0).BreakDown[0].RegBank;
@@ -2093,6 +2895,8 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
SmallSet<Register, 4> OpsToWaterfall;
if (!collectWaterfallOperands(OpsToWaterfall, MI, MRI, { 3 })) {
+ B.setInsertPt(B.getMBB(), MI);
+ B.buildBitcast(DstReg, InsHi);
MI.eraseFromParent();
return;
}
@@ -2100,17 +2904,70 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
B.setInstr(*Span.begin());
MI.eraseFromParent();
+ // Figure out the point after the waterfall loop before mangling the control
+ // flow.
executeInWaterfallLoop(B, make_range(Span.begin(), Span.end()),
OpsToWaterfall, MRI);
+
+ // The insertion point is now right after the original instruction.
+ //
+ // Keep the bitcast to the original vector type out of the loop. Doing this
+ // saves an extra phi we don't need inside the loop.
+ B.buildBitcast(DstReg, InsHi);
+
+ // Re-insert the constant offset add inside the waterfall loop.
+ if (ShouldMoveIndexIntoLoop)
+ reinsertVectorIndexAdd(B, *IdxLo, 1, ConstOffset);
+
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {1, 4});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {2, 5});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
+ applyDefaultMapping(OpdMapper);
+ executeInWaterfallLoop(MI, MRI, {3, 6});
+ return;
+ }
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ applyMappingSBufferLoad(OpdMapper);
return;
}
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
- case Intrinsic::amdgcn_s_buffer_load: {
- // FIXME: Move to G_INTRINSIC_W_SIDE_EFFECTS
- executeInWaterfallLoop(MI, MRI, { 2, 3 });
- return;
- }
case Intrinsic::amdgcn_readlane: {
substituteSimpleCopyRegs(OpdMapper, 2);
@@ -2132,18 +2989,51 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 3); // Index
return;
}
- default:
- break;
+ case Intrinsic::amdgcn_ballot:
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16: {
+ applyDefaultMapping(OpdMapper);
+
+ // Readlane for m0 value, which is always the last operand.
+ // FIXME: Should this be a waterfall loop instead?
+ constrainOpWithReadfirstlane(MI, MRI, MI.getNumOperands() - 1); // M0
+ return;
+ }
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ // Doing a waterfall loop over these wouldn't make any sense.
+ substituteSimpleCopyRegs(OpdMapper, 2);
+ substituteSimpleCopyRegs(OpdMapper, 3);
+ constrainOpWithReadfirstlane(MI, MRI, 4);
+ constrainOpWithReadfirstlane(MI, MRI, 5);
+ return;
+ }
+ case Intrinsic::amdgcn_sbfe:
+ applyMappingBFEIntrinsic(OpdMapper, true);
+ return;
+ case Intrinsic::amdgcn_ubfe:
+ applyMappingBFEIntrinsic(OpdMapper, false);
+ return;
}
break;
}
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ const AMDGPU::RsrcIntrinsic *RSrcIntrin
+ = AMDGPU::lookupRsrcIntrinsic(MI.getIntrinsicID());
+ assert(RSrcIntrin && RSrcIntrin->IsImage);
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ applyMappingImage(MI, OpdMapper, MRI, RSrcIntrin->RsrcArg);
+ return;
+ }
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
- case Intrinsic::amdgcn_buffer_load: {
- executeInWaterfallLoop(MI, MRI, { 2 });
- return;
- }
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
// This is only allowed to execute with 1 lane, so readfirstlane is safe.
@@ -2167,28 +3057,19 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
constrainOpWithReadfirstlane(MI, MRI, 1); // M0
return;
}
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ constrainOpWithReadfirstlane(MI, MRI, 2); // M0
+ return;
+ }
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// FIXME: Should this use a waterfall loop?
constrainOpWithReadfirstlane(MI, MRI, 2); // M0
return;
}
- case Intrinsic::amdgcn_raw_buffer_load:
- case Intrinsic::amdgcn_raw_buffer_load_format:
- case Intrinsic::amdgcn_raw_tbuffer_load:
- case Intrinsic::amdgcn_raw_buffer_store:
- case Intrinsic::amdgcn_raw_buffer_store_format:
- case Intrinsic::amdgcn_raw_tbuffer_store: {
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {2, 4});
- return;
- }
- case Intrinsic::amdgcn_struct_buffer_load:
- case Intrinsic::amdgcn_struct_buffer_store:
- case Intrinsic::amdgcn_struct_tbuffer_load:
- case Intrinsic::amdgcn_struct_tbuffer_store: {
- applyDefaultMapping(OpdMapper);
- executeInWaterfallLoop(MI, MRI, {2, 5});
+ case Intrinsic::amdgcn_s_setreg: {
+ constrainOpWithReadfirstlane(MI, MRI, 2);
return;
}
default: {
@@ -2211,10 +3092,13 @@ void AMDGPURegisterBankInfo::applyMappingImpl(
case AMDGPU::G_LOAD:
case AMDGPU::G_ZEXTLOAD:
case AMDGPU::G_SEXTLOAD: {
- if (applyMappingWideLoad(MI, OpdMapper, MRI))
+ if (applyMappingLoad(MI, OpdMapper, MRI))
return;
break;
}
+ case AMDGPU::G_DYN_STACKALLOC:
+ applyMappingDynStackAlloc(MI, OpdMapper, MRI);
+ return;
default:
break;
}
@@ -2244,7 +3128,11 @@ AMDGPURegisterBankInfo::getDefaultMappingSOP(const MachineInstr &MI) const {
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
- unsigned Size = getSizeInBits(MI.getOperand(i).getReg(), MRI, *TRI);
+ const MachineOperand &SrcOp = MI.getOperand(i);
+ if (!SrcOp.isReg())
+ continue;
+
+ unsigned Size = getSizeInBits(SrcOp.getReg(), MRI, *TRI);
OpdsMapping[i] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -2256,31 +3144,19 @@ AMDGPURegisterBankInfo::getDefaultMappingVOP(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 8> OpdsMapping(MI.getNumOperands());
- unsigned OpdIdx = 0;
-
- unsigned Size0 = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
- OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
-
- if (MI.getOperand(OpdIdx).isIntrinsicID())
- OpdsMapping[OpdIdx++] = nullptr;
- Register Reg1 = MI.getOperand(OpdIdx).getReg();
- unsigned Size1 = getSizeInBits(Reg1, MRI, *TRI);
-
- unsigned DefaultBankID = Size1 == 1 ?
- AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
- unsigned Bank1 = getRegBankID(Reg1, MRI, *TRI, DefaultBankID);
-
- OpdsMapping[OpdIdx++] = AMDGPU::getValueMapping(Bank1, Size1);
-
- for (unsigned e = MI.getNumOperands(); OpdIdx != e; ++OpdIdx) {
- const MachineOperand &MO = MI.getOperand(OpdIdx);
- if (!MO.isReg())
+ // Even though we technically could use SGPRs, this would require knowledge of
+ // the constant bus restriction. Force all sources to VGPR (except for VCC).
+ //
+ // TODO: Unary ops are trivially OK, so accept SGPRs?
+ for (unsigned i = 0, e = MI.getNumOperands(); i != e; ++i) {
+ const MachineOperand &Src = MI.getOperand(i);
+ if (!Src.isReg())
continue;
- unsigned Size = getSizeInBits(MO.getReg(), MRI, *TRI);
+ unsigned Size = getSizeInBits(Src.getReg(), MRI, *TRI);
unsigned BankID = Size == 1 ? AMDGPU::VCCRegBankID : AMDGPU::VGPRRegBankID;
- OpdsMapping[OpdIdx] = AMDGPU::getValueMapping(BankID, Size);
+ OpdsMapping[i] = AMDGPU::getValueMapping(BankID, Size);
}
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping),
@@ -2324,6 +3200,10 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
continue;
Register OpReg = MI.getOperand(I).getReg();
+ // We replace some dead address operands with $noreg
+ if (!OpReg)
+ continue;
+
unsigned Size = getSizeInBits(OpReg, MRI, *TRI);
// FIXME: Probably need a new intrinsic register bank searchable table to
@@ -2345,6 +3225,22 @@ AMDGPURegisterBankInfo::getImageMapping(const MachineRegisterInfo &MRI,
return getInstructionMapping(1, 1, getOperandsMapping(OpdsMapping), NumOps);
}
+/// Return the mapping for a pointer argument.
+const RegisterBankInfo::ValueMapping *
+AMDGPURegisterBankInfo::getValueMappingForPtr(const MachineRegisterInfo &MRI,
+ Register PtrReg) const {
+ LLT PtrTy = MRI.getType(PtrReg);
+ unsigned Size = PtrTy.getSizeInBits();
+ if (Subtarget.useFlatForGlobal() ||
+ !SITargetLowering::isFlatGlobalAddrSpace(PtrTy.getAddressSpace()))
+ return AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+
+ // If we're using MUBUF instructions for global memory, an SGPR base register
+ // is possible. Otherwise this needs to be a VGPR.
+ const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
+ return AMDGPU::getValueMapping(PtrBank->getID(), Size);
+}
+
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
@@ -2352,7 +3248,6 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
SmallVector<const ValueMapping*, 2> OpdsMapping(2);
unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
- LLT LoadTy = MRI.getType(MI.getOperand(0).getReg());
Register PtrReg = MI.getOperand(1).getReg();
LLT PtrTy = MRI.getType(PtrReg);
unsigned AS = PtrTy.getAddressSpace();
@@ -2364,14 +3259,23 @@ AMDGPURegisterBankInfo::getInstrMappingForLoad(const MachineInstr &MI) const {
const RegisterBank *PtrBank = getRegBank(PtrReg, MRI, *TRI);
if (PtrBank == &AMDGPU::SGPRRegBank &&
- (AS != AMDGPUAS::LOCAL_ADDRESS && AS != AMDGPUAS::REGION_ADDRESS &&
- AS != AMDGPUAS::PRIVATE_ADDRESS) &&
- isScalarLoadLegal(MI)) {
- // We have a uniform instruction so we want to use an SMRD load
- ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
- PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
+ SITargetLowering::isFlatGlobalAddrSpace(AS)) {
+ if (isScalarLoadLegal(MI)) {
+ // We have a uniform instruction so we want to use an SMRD load
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
+ PtrMapping = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, PtrSize);
+ } else {
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+
+ // If we're using MUBUF instructions for global memory, an SGPR base
+ // register is possible. Otherwise this needs to be a VGPR.
+ unsigned PtrBankID = Subtarget.useFlatForGlobal() ?
+ AMDGPU::VGPRRegBankID : AMDGPU::SGPRRegBankID;
+
+ PtrMapping = AMDGPU::getValueMapping(PtrBankID, PtrSize);
+ }
} else {
- ValMapping = AMDGPU::getValueMappingLoadSGPROnly(AMDGPU::VGPRRegBankID, LoadTy);
+ ValMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
PtrMapping = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, PtrSize);
}
@@ -2449,11 +3353,35 @@ AMDGPURegisterBankInfo::getAGPROpMapping(Register Reg,
/// in RegBankSelect::Mode::Fast. Any mapping that would cause a
/// VGPR to SGPR copy to be generated is illegal.
///
+// Operands that must be SGPRs must accept potentially divergent VGPRs as
+// legal. These will be dealt with in applyMappingImpl.
+//
const RegisterBankInfo::InstructionMapping &
AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const MachineRegisterInfo &MRI = MF.getRegInfo();
+ if (MI.isCopy()) {
+ // The default logic bothers to analyze impossible alternative mappings. We
+ // want the most straightforward mapping, so just directly handle this.
+ const RegisterBank *DstBank = getRegBank(MI.getOperand(0).getReg(), MRI,
+ *TRI);
+ const RegisterBank *SrcBank = getRegBank(MI.getOperand(1).getReg(), MRI,
+ *TRI);
+ assert(SrcBank && "src bank should have been assigned already");
+ if (!DstBank)
+ DstBank = SrcBank;
+
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ if (cannotCopy(*DstBank, *SrcBank, Size))
+ return getInvalidInstructionMapping();
+
+ const ValueMapping &ValMap = getValueMapping(0, Size, *DstBank);
+ return getInstructionMapping(
+ 1, /*Cost*/ 1,
+ /*OperandsMapping*/ getOperandsMapping({&ValMap}), 1);
+ }
+
if (MI.isRegSequence()) {
// If any input is a VGPR, the result must be a VGPR. The default handling
// assumes any copy between banks is legal.
@@ -2592,6 +3520,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
LLVM_FALLTHROUGH;
}
case AMDGPU::G_PTR_ADD:
+ case AMDGPU::G_PTRMASK:
case AMDGPU::G_ADD:
case AMDGPU::G_SUB:
case AMDGPU::G_MUL:
@@ -2608,6 +3537,7 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_SMAX:
case AMDGPU::G_UMIN:
case AMDGPU::G_UMAX:
+ case AMDGPU::G_SHUFFLE_VECTOR:
if (isSALUMapping(MI))
return getDefaultMappingSOP(MI);
LLVM_FALLTHROUGH;
@@ -2635,7 +3565,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_FMAXNUM_IEEE:
case AMDGPU::G_FCANONICALIZE:
case AMDGPU::G_INTRINSIC_TRUNC:
+ case AMDGPU::G_BSWAP: // TODO: Somehow expand for scalar?
+ case AMDGPU::G_FSHR: // TODO: Expand for scalar
case AMDGPU::G_AMDGPU_FFBH_U32:
+ case AMDGPU::G_AMDGPU_FMIN_LEGACY:
+ case AMDGPU::G_AMDGPU_FMAX_LEGACY:
+ case AMDGPU::G_AMDGPU_RCP_IFLAG:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE0:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE1:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE2:
+ case AMDGPU::G_AMDGPU_CVT_F32_UBYTE3:
return getDefaultMappingVOP(MI);
case AMDGPU::G_UMULH:
case AMDGPU::G_SMULH: {
@@ -2664,6 +3603,13 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
break;
}
+ case AMDGPU::G_DYN_STACKALLOC: {
+ // Result is always uniform, and a wave reduction is needed for the source.
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
+ unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, 32);
+ break;
+ }
case AMDGPU::G_INSERT: {
unsigned BankID = isSALUMapping(MI) ? AMDGPU::SGPRRegBankID :
AMDGPU::VGPRRegBankID;
@@ -2719,12 +3665,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_BITCAST:
case AMDGPU::G_INTTOPTR:
case AMDGPU::G_PTRTOINT:
- case AMDGPU::G_CTLZ:
- case AMDGPU::G_CTLZ_ZERO_UNDEF:
- case AMDGPU::G_CTTZ:
- case AMDGPU::G_CTTZ_ZERO_UNDEF:
- case AMDGPU::G_CTPOP:
- case AMDGPU::G_BSWAP:
case AMDGPU::G_BITREVERSE:
case AMDGPU::G_FABS:
case AMDGPU::G_FNEG: {
@@ -2733,21 +3673,33 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
break;
}
+ case AMDGPU::G_CTLZ_ZERO_UNDEF:
+ case AMDGPU::G_CTTZ_ZERO_UNDEF:
+ case AMDGPU::G_CTPOP: {
+ unsigned Size = MRI.getType(MI.getOperand(1).getReg()).getSizeInBits();
+ unsigned BankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(BankID, 32);
+
+ // This should really be getValueMappingSGPR64Only, but allowing the generic
+ // code to handle the register split just makes using LegalizerHelper more
+ // difficult.
+ OpdsMapping[1] = AMDGPU::getValueMapping(BankID, Size);
+ break;
+ }
case AMDGPU::G_TRUNC: {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned Bank = getRegBankID(Src, MRI, *TRI);
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
unsigned SrcSize = getSizeInBits(Src, MRI, *TRI);
- OpdsMapping[0] = DstSize == 1 && Bank != AMDGPU::SGPRRegBankID ?
- AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize) :
- AMDGPU::getValueMapping(Bank, DstSize);
+ OpdsMapping[0] = AMDGPU::getValueMapping(Bank, DstSize);
OpdsMapping[1] = AMDGPU::getValueMapping(Bank, SrcSize);
break;
}
case AMDGPU::G_ZEXT:
case AMDGPU::G_SEXT:
- case AMDGPU::G_ANYEXT: {
+ case AMDGPU::G_ANYEXT:
+ case AMDGPU::G_SEXT_INREG: {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
unsigned DstSize = getSizeInBits(Dst, MRI, *TRI);
@@ -2765,17 +3717,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
- // TODO: Should anyext be split into 32-bit part as well?
- if (MI.getOpcode() == AMDGPU::G_ANYEXT) {
- OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, DstSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(SrcBank->getID(), SrcSize);
- } else {
- // Scalar extend can use 64-bit BFE, but VGPRs require extending to
- // 32-bits, and then to 64.
- OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
- OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
- SrcSize);
- }
+ // Scalar extend can use 64-bit BFE, but VGPRs require extending to
+ // 32-bits, and then to 64.
+ OpdsMapping[0] = AMDGPU::getValueMappingSGPR64Only(DstBank, DstSize);
+ OpdsMapping[1] = AMDGPU::getValueMappingSGPR64Only(SrcBank->getID(),
+ SrcSize);
break;
}
case AMDGPU::G_FCMP: {
@@ -2790,43 +3736,43 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_STORE: {
assert(MI.getOperand(0).isReg());
unsigned Size = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- // FIXME: We need to specify a different reg bank once scalar stores
- // are supported.
+
+ // FIXME: We need to specify a different reg bank once scalar stores are
+ // supported.
const ValueMapping *ValMapping =
AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
- // FIXME: Depending on the type of store, the pointer could be in
- // the SGPR Reg bank.
- // FIXME: Pointer size should be based on the address space.
- const ValueMapping *PtrMapping =
- AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 64);
-
OpdsMapping[0] = ValMapping;
- OpdsMapping[1] = PtrMapping;
+ OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
break;
}
-
case AMDGPU::G_ICMP: {
auto Pred = static_cast<CmpInst::Predicate>(MI.getOperand(1).getPredicate());
unsigned Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+
+ // See if the result register has already been constrained to vcc, which may
+ // happen due to control flow intrinsic lowering.
+ unsigned DstBank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
unsigned Op2Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
unsigned Op3Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- bool CanUseSCC = Op2Bank == AMDGPU::SGPRRegBankID &&
+ bool CanUseSCC = DstBank == AMDGPU::SGPRRegBankID &&
+ Op2Bank == AMDGPU::SGPRRegBankID &&
Op3Bank == AMDGPU::SGPRRegBankID &&
(Size == 32 || (Size == 64 &&
(Pred == CmpInst::ICMP_EQ || Pred == CmpInst::ICMP_NE) &&
Subtarget.hasScalarCompareEq64()));
- unsigned Op0Bank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
+ DstBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VCCRegBankID;
+ unsigned SrcBank = CanUseSCC ? AMDGPU::SGPRRegBankID : AMDGPU::VGPRRegBankID;
// TODO: Use 32-bit for scalar output size.
// SCC results will need to be copied to a 32-bit SGPR virtual register.
const unsigned ResultSize = 1;
- OpdsMapping[0] = AMDGPU::getValueMapping(Op0Bank, ResultSize);
- OpdsMapping[1] = nullptr; // Predicate Operand.
- OpdsMapping[2] = AMDGPU::getValueMapping(Op2Bank, Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(Op3Bank, Size);
+ OpdsMapping[0] = AMDGPU::getValueMapping(DstBank, ResultSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(SrcBank, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(SrcBank, Size);
break;
}
case AMDGPU::G_EXTRACT_VECTOR_ELT: {
@@ -2852,15 +3798,22 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned VecSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
unsigned InsertSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
unsigned IdxSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- unsigned SrcBankID = getRegBankID(MI.getOperand(1).getReg(), MRI, *TRI);
unsigned InsertEltBankID = getRegBankID(MI.getOperand(2).getReg(),
MRI, *TRI);
unsigned IdxBankID = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
OpdsMapping[0] = AMDGPU::getValueMapping(OutputBankID, VecSize);
- OpdsMapping[1] = AMDGPU::getValueMapping(SrcBankID, VecSize);
- OpdsMapping[2] = AMDGPU::getValueMappingSGPR64Only(InsertEltBankID,
- InsertSize);
+ OpdsMapping[1] = AMDGPU::getValueMapping(OutputBankID, VecSize);
+
+ // This is a weird case, because we need to break down the mapping based on
+ // the register bank of a different operand.
+ if (InsertSize == 64 && OutputBankID == AMDGPU::VGPRRegBankID) {
+ OpdsMapping[2] = AMDGPU::getValueMappingSplit64(InsertEltBankID,
+ InsertSize);
+ } else {
+ assert(InsertSize == 32 || InsertSize == 64);
+ OpdsMapping[2] = AMDGPU::getValueMapping(InsertEltBankID, InsertSize);
+ }
// The index can be either bank if the source vector is VGPR.
OpdsMapping[3] = AMDGPU::getValueMapping(IdxBankID, IdxSize);
@@ -2878,6 +3831,116 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_UBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SBYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_USHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_SSHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_LOAD_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_TBUFFER_STORE_FORMAT_D16:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_BYTE:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_SHORT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT:
+ case AMDGPU::G_AMDGPU_BUFFER_STORE_FORMAT_D16: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SWAP:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_ADD:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SUB:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMIN:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_SMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_UMAX:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_AND:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_OR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_XOR:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_DEC: {
+ // vdata_out
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // vdata_in
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
+ case AMDGPU::G_AMDGPU_BUFFER_ATOMIC_CMPSWAP: {
+ // vdata_out
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+
+ // vdata_in
+ OpdsMapping[1] = getVGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+
+ // cmp
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // rsrc
+ OpdsMapping[3] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+
+ // vindex
+ OpdsMapping[4] = getVGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+
+ // voffset
+ OpdsMapping[5] = getVGPROpMapping(MI.getOperand(5).getReg(), MRI, *TRI);
+
+ // soffset
+ OpdsMapping[6] = getSGPROpMapping(MI.getOperand(6).getReg(), MRI, *TRI);
+
+ // Any remaining operands are immediates and were correctly null
+ // initialized.
+ break;
+ }
+ case AMDGPU::G_AMDGPU_S_BUFFER_LOAD: {
+ // Lie and claim everything is legal, even though some need to be
+ // SGPRs. applyMapping will have to deal with it as a waterfall loop.
+ OpdsMapping[1] = getSGPROpMapping(MI.getOperand(1).getReg(), MRI, *TRI);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+
+ // We need to convert this to a MUBUF if either the resource or offset is
+ // VGPR.
+ unsigned RSrcBank = OpdsMapping[1]->BreakDown[0].RegBank->getID();
+ unsigned OffsetBank = OpdsMapping[2]->BreakDown[0].RegBank->getID();
+ unsigned ResultBank = regBankUnion(RSrcBank, OffsetBank);
+
+ unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(ResultBank, Size0);
+ break;
+ }
case AMDGPU::G_INTRINSIC: {
switch (MI.getIntrinsicID()) {
default:
@@ -2890,9 +3953,11 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_log_clamp:
case Intrinsic::amdgcn_rcp:
case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_sqrt:
case Intrinsic::amdgcn_rsq:
case Intrinsic::amdgcn_rsq_legacy:
case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_fmul_legacy:
case Intrinsic::amdgcn_ldexp:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_frexp_exp:
@@ -2911,8 +3976,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_fmad_ftz:
case Intrinsic::amdgcn_mbcnt_lo:
case Intrinsic::amdgcn_mbcnt_hi:
- case Intrinsic::amdgcn_ubfe:
- case Intrinsic::amdgcn_sbfe:
case Intrinsic::amdgcn_mul_u24:
case Intrinsic::amdgcn_mul_i24:
case Intrinsic::amdgcn_lerp:
@@ -2933,13 +3996,21 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_udot4:
case Intrinsic::amdgcn_sdot8:
case Intrinsic::amdgcn_udot8:
- case Intrinsic::amdgcn_wwm:
- case Intrinsic::amdgcn_wqm:
+ return getDefaultMappingVOP(MI);
+ case Intrinsic::amdgcn_sbfe:
+ case Intrinsic::amdgcn_ubfe:
+ if (isSALUMapping(MI))
+ return getDefaultMappingSOP(MI);
return getDefaultMappingVOP(MI);
case Intrinsic::amdgcn_ds_swizzle:
case Intrinsic::amdgcn_ds_permute:
case Intrinsic::amdgcn_ds_bpermute:
case Intrinsic::amdgcn_update_dpp:
+ case Intrinsic::amdgcn_mov_dpp8:
+ case Intrinsic::amdgcn_mov_dpp:
+ case Intrinsic::amdgcn_wwm:
+ case Intrinsic::amdgcn_wqm:
+ case Intrinsic::amdgcn_softwqm:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_kernarg_segment_ptr:
case Intrinsic::amdgcn_s_getpc:
@@ -2954,26 +4025,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
= AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_s_buffer_load: {
- // FIXME: This should be moved to G_INTRINSIC_W_SIDE_EFFECTS
- Register RSrc = MI.getOperand(2).getReg(); // SGPR
- Register Offset = MI.getOperand(3).getReg(); // SGPR/imm
-
- unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
- unsigned Size3 = MRI.getType(Offset).getSizeInBits();
-
- unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
- unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
-
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size0);
- OpdsMapping[1] = nullptr; // intrinsic id
-
- // Lie and claim everything is legal, even though some need to be
- // SGPRs. applyMapping will have to deal with it as a waterfall loop.
- OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
- OpdsMapping[3] = AMDGPU::getValueMapping(OffsetBank, Size3);
- OpdsMapping[4] = nullptr;
+ case Intrinsic::amdgcn_ps_live: {
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, 1);
break;
}
case Intrinsic::amdgcn_div_scale: {
@@ -2983,11 +4036,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, Dst1Size);
unsigned SrcSize = MRI.getType(MI.getOperand(3).getReg()).getSizeInBits();
- OpdsMapping[3] = AMDGPU::getValueMapping(
- getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI), SrcSize);
- OpdsMapping[4] = AMDGPU::getValueMapping(
- getRegBankID(MI.getOperand(4).getReg(), MRI, *TRI), SrcSize);
-
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
+ OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, SrcSize);
break;
}
case Intrinsic::amdgcn_class: {
@@ -2997,10 +4047,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
unsigned Src1Size = MRI.getType(Src1Reg).getSizeInBits();
unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, DstSize);
- OpdsMapping[2] = AMDGPU::getValueMapping(getRegBankID(Src0Reg, MRI, *TRI),
- Src0Size);
- OpdsMapping[3] = AMDGPU::getValueMapping(getRegBankID(Src1Reg, MRI, *TRI),
- Src1Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src0Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Src1Size);
break;
}
case Intrinsic::amdgcn_icmp:
@@ -3009,10 +4057,8 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
// This is not VCCRegBank because this is not used in boolean contexts.
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
unsigned OpSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
- unsigned Op1Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI);
- unsigned Op2Bank = getRegBankID(MI.getOperand(3).getReg(), MRI, *TRI);
- OpdsMapping[2] = AMDGPU::getValueMapping(Op1Bank, OpSize);
- OpdsMapping[3] = AMDGPU::getValueMapping(Op2Bank, OpSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, OpSize);
break;
}
case Intrinsic::amdgcn_readlane: {
@@ -3054,6 +4100,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ unsigned Size = getSizeInBits(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size);
+ OpdsMapping[4] = getSGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ OpdsMapping[5] = getSGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_mfma_f32_4x4x1f32:
case Intrinsic::amdgcn_mfma_f32_4x4x4f16:
case Intrinsic::amdgcn_mfma_i32_4x4x4i8:
@@ -3086,9 +4142,46 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[4] = getAGPROpMapping(MI.getOperand(4).getReg(), MRI, *TRI);
break;
}
+ case Intrinsic::amdgcn_interp_p1:
+ case Intrinsic::amdgcn_interp_p2:
+ case Intrinsic::amdgcn_interp_mov:
+ case Intrinsic::amdgcn_interp_p1_f16:
+ case Intrinsic::amdgcn_interp_p2_f16: {
+ const int M0Idx = MI.getNumOperands() - 1;
+ Register M0Reg = MI.getOperand(M0Idx).getReg();
+ unsigned M0Bank = getRegBankID(M0Reg, MRI, *TRI, AMDGPU::SGPRRegBankID);
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ for (int I = 2; I != M0Idx && MI.getOperand(I).isReg(); ++I)
+ OpdsMapping[I] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
+
+ // Must be SGPR, but we must take whatever the original bank is and fix it
+ // later.
+ OpdsMapping[M0Idx] = AMDGPU::getValueMapping(M0Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_ballot: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ unsigned SrcSize = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, DstSize);
+ OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::VCCRegBankID, SrcSize);
+ break;
+ }
}
break;
}
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_LOAD:
+ case AMDGPU::G_AMDGPU_INTRIN_IMAGE_STORE: {
+ auto IntrID = MI.getIntrinsicID();
+ const AMDGPU::RsrcIntrinsic *RSrcIntrin = AMDGPU::lookupRsrcIntrinsic(IntrID);
+ assert(RSrcIntrin && "missing RsrcIntrinsic for image intrinsic");
+ // Non-images can have complications from operands that allow both SGPR
+ // and VGPR. For now it's too complicated to figure out the final opcode
+ // to derive the register bank from the MCInstrDesc.
+ assert(RSrcIntrin->IsImage);
+ return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
+ }
case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
auto IntrID = MI.getIntrinsicID();
switch (IntrID) {
@@ -3100,13 +4193,9 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
- case Intrinsic::amdgcn_ds_append:
- case Intrinsic::amdgcn_ds_consume:
case Intrinsic::amdgcn_ds_fadd:
case Intrinsic::amdgcn_ds_fmin:
case Intrinsic::amdgcn_ds_fmax:
- case Intrinsic::amdgcn_atomic_inc:
- case Intrinsic::amdgcn_atomic_dec:
return getDefaultMappingAllVGPR(MI);
case Intrinsic::amdgcn_ds_ordered_add:
case Intrinsic::amdgcn_ds_ordered_swap: {
@@ -3118,17 +4207,16 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
break;
}
+ case Intrinsic::amdgcn_ds_append:
+ case Intrinsic::amdgcn_ds_consume: {
+ unsigned DstSize = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
+ OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, DstSize);
+ OpdsMapping[2] = getSGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
case Intrinsic::amdgcn_exp_compr:
- OpdsMapping[0] = nullptr; // IntrinsicID
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- // FIXME: Could we support packed types here?
OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[4] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
- // FIXME: These are immediate values which can't be read from registers.
- OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
- OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, 32);
break;
case Intrinsic::amdgcn_exp:
// FIXME: Could we support packed types here?
@@ -3137,31 +4225,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[5] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
OpdsMapping[6] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, 32);
break;
- case Intrinsic::amdgcn_buffer_load: {
- Register RSrc = MI.getOperand(2).getReg(); // SGPR
- Register VIndex = MI.getOperand(3).getReg(); // VGPR
- Register Offset = MI.getOperand(4).getReg(); // SGPR/VGPR/imm
-
- unsigned Size0 = MRI.getType(MI.getOperand(0).getReg()).getSizeInBits();
- unsigned Size2 = MRI.getType(RSrc).getSizeInBits();
- unsigned Size3 = MRI.getType(VIndex).getSizeInBits();
- unsigned Size4 = MRI.getType(Offset).getSizeInBits();
-
- unsigned RSrcBank = getRegBankID(RSrc, MRI, *TRI);
- unsigned OffsetBank = getRegBankID(Offset, MRI, *TRI);
-
- OpdsMapping[0] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size0);
- OpdsMapping[1] = nullptr; // intrinsic id
-
- // Lie and claim everything is legal, even though some need to be
- // SGPRs. applyMapping will have to deal with it as a waterfall loop.
- OpdsMapping[2] = AMDGPU::getValueMapping(RSrcBank, Size2); // rsrc
- OpdsMapping[3] = AMDGPU::getValueMapping(AMDGPU::VGPRRegBankID, Size3);
- OpdsMapping[4] = AMDGPU::getValueMapping(OffsetBank, Size4);
- OpdsMapping[5] = nullptr;
- OpdsMapping[6] = nullptr;
- break;
- }
case Intrinsic::amdgcn_s_sendmsg:
case Intrinsic::amdgcn_s_sendmsghalt: {
// This must be an SGPR, but accept a VGPR.
@@ -3170,8 +4233,14 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
break;
}
- case Intrinsic::amdgcn_end_cf:
- case Intrinsic::amdgcn_init_exec: {
+ case Intrinsic::amdgcn_s_setreg: {
+ // This must be an SGPR, but accept a VGPR.
+ unsigned Bank = getRegBankID(MI.getOperand(2).getReg(), MRI, *TRI,
+ AMDGPU::SGPRRegBankID);
+ OpdsMapping[2] = AMDGPU::getValueMapping(Bank, 32);
+ break;
+ }
+ case Intrinsic::amdgcn_end_cf: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
@@ -3227,7 +4296,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case Intrinsic::amdgcn_init_exec_from_input: {
unsigned Size = getSizeInBits(MI.getOperand(1).getReg(), MRI, *TRI);
OpdsMapping[1] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
- OpdsMapping[2] = AMDGPU::getValueMapping(AMDGPU::SGPRRegBankID, Size);
break;
}
case Intrinsic::amdgcn_ds_gws_init:
@@ -3251,15 +4319,6 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
break;
}
default:
- if (const AMDGPU::RsrcIntrinsic *RSrcIntrin =
- AMDGPU::lookupRsrcIntrinsic(IntrID)) {
- // Non-images can have complications from operands that allow both SGPR
- // and VGPR. For now it's too complicated to figure out the final opcode
- // to derive the register bank from the MCInstrDesc.
- if (RSrcIntrin->IsImage)
- return getImageMapping(MRI, MI, RSrcIntrin->RsrcArg);
- }
-
return getInvalidInstructionMapping();
}
break;
@@ -3319,9 +4378,20 @@ AMDGPURegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case AMDGPU::G_ATOMICRMW_UMAX:
case AMDGPU::G_ATOMICRMW_UMIN:
case AMDGPU::G_ATOMICRMW_FADD:
- case AMDGPU::G_ATOMIC_CMPXCHG:
- case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG: {
- return getDefaultMappingAllVGPR(MI);
+ case AMDGPU::G_AMDGPU_ATOMIC_CMPXCHG:
+ case AMDGPU::G_AMDGPU_ATOMIC_INC:
+ case AMDGPU::G_AMDGPU_ATOMIC_DEC: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ break;
+ }
+ case AMDGPU::G_ATOMIC_CMPXCHG: {
+ OpdsMapping[0] = getVGPROpMapping(MI.getOperand(0).getReg(), MRI, *TRI);
+ OpdsMapping[1] = getValueMappingForPtr(MRI, MI.getOperand(1).getReg());
+ OpdsMapping[2] = getVGPROpMapping(MI.getOperand(2).getReg(), MRI, *TRI);
+ OpdsMapping[3] = getVGPROpMapping(MI.getOperand(3).getReg(), MRI, *TRI);
+ break;
}
case AMDGPU::G_BRCOND: {
unsigned Bank = getRegBankID(MI.getOperand(0).getReg(), MRI, *TRI,
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
index 1ac7d3652a8b..8f38ec4eeb3a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBankInfo.h
@@ -69,13 +69,20 @@ public:
void constrainOpWithReadfirstlane(MachineInstr &MI, MachineRegisterInfo &MRI,
unsigned OpIdx) const;
- bool applyMappingWideLoad(MachineInstr &MI,
- const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
- MachineRegisterInfo &MRI) const;
+ bool applyMappingDynStackAlloc(MachineInstr &MI,
+ const OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
+ bool applyMappingLoad(MachineInstr &MI,
+ const OperandsMapper &OpdMapper,
+ MachineRegisterInfo &MRI) const;
bool
applyMappingImage(MachineInstr &MI,
- const AMDGPURegisterBankInfo::OperandsMapper &OpdMapper,
+ const OperandsMapper &OpdMapper,
MachineRegisterInfo &MRI, int RSrcIdx) const;
+ bool applyMappingSBufferLoad(const OperandsMapper &OpdMapper) const;
+
+ bool applyMappingBFEIntrinsic(const OperandsMapper &OpdMapper,
+ bool Signed) const;
void lowerScalarMinMax(MachineIRBuilder &B, MachineInstr &MI) const;
@@ -91,6 +98,9 @@ public:
/// See RegisterBankInfo::applyMapping.
void applyMappingImpl(const OperandsMapper &OpdMapper) const override;
+ const ValueMapping *getValueMappingForPtr(const MachineRegisterInfo &MRI,
+ Register Ptr) const;
+
const RegisterBankInfo::InstructionMapping &
getInstrMappingForLoad(const MachineInstr &MI) const;
@@ -168,6 +178,15 @@ public:
const InstructionMapping &
getInstrMapping(const MachineInstr &MI) const override;
+
+private:
+
+ bool foldExtractEltToCmpSelect(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const;
+ bool foldInsertEltToCmpSelect(MachineInstr &MI,
+ MachineRegisterInfo &MRI,
+ const OperandsMapper &OpdMapper) const;
};
} // End llvm namespace.
#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
index c495316c5bce..9f6ebd00cd97 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPURegisterBanks.td
@@ -7,16 +7,16 @@
//===----------------------------------------------------------------------===//
def SGPRRegBank : RegisterBank<"SGPR",
- [SReg_32, SReg_64, SReg_128, SReg_256, SReg_512, SReg_1024]
+ [SReg_LO16, SReg_32, SReg_64, SReg_128, SReg_160, SReg_192, SReg_256, SReg_512, SReg_1024]
>;
def VGPRRegBank : RegisterBank<"VGPR",
- [VGPR_32, VReg_64, VReg_96, VReg_128, VReg_256, VReg_512, VReg_1024]
+ [VGPR_LO16, VGPR_HI16, VGPR_32, VReg_64, VReg_96, VReg_128, VReg_160, VReg_192, VReg_256, VReg_512, VReg_1024]
>;
// It is helpful to distinguish conditions from ordinary SGPRs.
def VCCRegBank : RegisterBank <"VCC", [SReg_1]>;
def AGPRRegBank : RegisterBank <"AGPR",
- [AGPR_32, AReg_64, AReg_128, AReg_512, AReg_1024]
+ [AGPR_LO16, AGPR_32, AReg_64, AReg_96, AReg_128, AReg_160, AReg_192, AReg_256, AReg_512, AReg_1024]
>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
deleted file mode 100644
index 9806e6b0714f..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.cpp
+++ /dev/null
@@ -1,142 +0,0 @@
-//===-- AMDGPURegisterInfo.cpp - AMDGPU Register Information -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// Parent TargetRegisterInfo class common to all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-#include "AMDGPURegisterInfo.h"
-#include "AMDGPUTargetMachine.h"
-#include "SIMachineFunctionInfo.h"
-#include "SIRegisterInfo.h"
-#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
-
-using namespace llvm;
-
-AMDGPURegisterInfo::AMDGPURegisterInfo() : AMDGPUGenRegisterInfo(0) {}
-
-//===----------------------------------------------------------------------===//
-// Function handling callbacks - Functions are a seldom used feature of GPUS, so
-// they are not supported at this time.
-//===----------------------------------------------------------------------===//
-
-// Table of NumRegs sized pieces at every 32-bit offset.
-static const uint16_t SubRegFromChannelTable[][32] = {
- { AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
- AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
- AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31
- },
- {
- AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3, AMDGPU::sub3_sub4,
- AMDGPU::sub4_sub5, AMDGPU::sub5_sub6, AMDGPU::sub6_sub7, AMDGPU::sub7_sub8,
- AMDGPU::sub8_sub9, AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
- AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15, AMDGPU::sub15_sub16,
- AMDGPU::sub16_sub17, AMDGPU::sub17_sub18, AMDGPU::sub18_sub19, AMDGPU::sub19_sub20,
- AMDGPU::sub20_sub21, AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
- AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27, AMDGPU::sub27_sub28,
- AMDGPU::sub28_sub29, AMDGPU::sub29_sub30, AMDGPU::sub30_sub31, AMDGPU::NoSubRegister
- },
- {
- AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3, AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
- AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7, AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
- AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11, AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
- AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15, AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
- AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19, AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
- AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23, AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
- AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27, AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
- AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
- },
- {
- AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4, AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
- AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8, AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
- AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12, AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
- AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16, AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
- AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20, AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
- AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24, AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
- AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28, AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
- AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister, AMDGPU::NoSubRegister
- }
-};
-
-// FIXME: TableGen should generate something to make this manageable for all
-// register classes. At a minimum we could use the opposite of
-// composeSubRegIndices and go up from the base 32-bit subreg.
-unsigned AMDGPURegisterInfo::getSubRegFromChannel(unsigned Channel, unsigned NumRegs) {
- const unsigned NumRegIndex = NumRegs - 1;
-
- assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
- "Not implemented");
- assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
- return SubRegFromChannelTable[NumRegIndex][Channel];
-}
-
-void AMDGPURegisterInfo::reserveRegisterTuples(BitVector &Reserved, unsigned Reg) const {
- MCRegAliasIterator R(Reg, this, true);
-
- for (; R.isValid(); ++R)
- Reserved.set(*R);
-}
-
-#define GET_REGINFO_TARGET_DESC
-#include "AMDGPUGenRegisterInfo.inc"
-
-// Forced to be here by one .inc
-const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
- const MachineFunction *MF) const {
- CallingConv::ID CC = MF->getFunction().getCallingConv();
- switch (CC) {
- case CallingConv::C:
- case CallingConv::Fast:
- case CallingConv::Cold:
- return CSR_AMDGPU_HighRegs_SaveList;
- default: {
- // Dummy to not crash RegisterClassInfo.
- static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
- return &NoCalleeSavedReg;
- }
- }
-}
-
-const MCPhysReg *
-SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
- return nullptr;
-}
-
-const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
- CallingConv::ID CC) const {
- switch (CC) {
- case CallingConv::C:
- case CallingConv::Fast:
- case CallingConv::Cold:
- return CSR_AMDGPU_HighRegs_RegMask;
- default:
- return nullptr;
- }
-}
-
-Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
- const SIFrameLowering *TFI =
- MF.getSubtarget<GCNSubtarget>().getFrameLowering();
- const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
- : FuncInfo->getStackPtrOffsetReg();
-}
-
-const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
- return CSR_AMDGPU_AllVGPRs_RegMask;
-}
-
-const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
- return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
-}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
deleted file mode 100644
index 9e713ca804a1..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.h
+++ /dev/null
@@ -1,38 +0,0 @@
-//===-- AMDGPURegisterInfo.h - AMDGPURegisterInfo Interface -*- C++ -*-----===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-/// \file
-/// TargetRegisterInfo interface that is implemented by all hw codegen
-/// targets.
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-#define LLVM_LIB_TARGET_AMDGPU_AMDGPUREGISTERINFO_H
-
-#define GET_REGINFO_HEADER
-#include "AMDGPUGenRegisterInfo.inc"
-
-namespace llvm {
-
-class GCNSubtarget;
-class TargetInstrInfo;
-
-struct AMDGPURegisterInfo : public AMDGPUGenRegisterInfo {
- AMDGPURegisterInfo();
-
- /// \returns the sub reg enum value for the given \p Channel
- /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
- static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
-
- void reserveRegisterTuples(BitVector &, unsigned Reg) const;
-};
-
-} // End namespace llvm
-
-#endif
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td b/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
deleted file mode 100644
index ab71b7aa8a57..000000000000
--- a/llvm/lib/Target/AMDGPU/AMDGPURegisterInfo.td
+++ /dev/null
@@ -1,21 +0,0 @@
-//===-- AMDGPURegisterInfo.td - AMDGPU register info -------*- tablegen -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Tablegen register definitions common to all hw codegen targets.
-//
-//===----------------------------------------------------------------------===//
-
-let Namespace = "AMDGPU" in {
-
-foreach Index = 0-31 in {
- def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
-}
-
-}
-
-include "SIRegisterInfo.td"
diff --git a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
index 9a1e2fc42ed5..9c3d96de6d68 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPURewriteOutArguments.cpp
@@ -208,8 +208,8 @@ bool AMDGPURewriteOutArguments::doInitialization(Module &M) {
#ifndef NDEBUG
bool AMDGPURewriteOutArguments::isVec3ToVec4Shuffle(Type *Ty0, Type* Ty1) const {
- VectorType *VT0 = dyn_cast<VectorType>(Ty0);
- VectorType *VT1 = dyn_cast<VectorType>(Ty1);
+ auto *VT0 = dyn_cast<FixedVectorType>(Ty0);
+ auto *VT1 = dyn_cast<FixedVectorType>(Ty1);
if (!VT0 || !VT1)
return false;
@@ -409,7 +409,7 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
DL->getTypeSizeInBits(Val->getType())) {
assert(isVec3ToVec4Shuffle(EffectiveEltTy, Val->getType()));
Val = B.CreateShuffleVector(Val, UndefValue::get(Val->getType()),
- { 0, 1, 2 });
+ ArrayRef<int>{0, 1, 2});
}
Val = B.CreateBitCast(Val, EffectiveEltTy);
@@ -453,9 +453,8 @@ bool AMDGPURewriteOutArguments::runOnFunction(Function &F) {
PointerType *ArgType = cast<PointerType>(Arg.getType());
auto *EltTy = ArgType->getElementType();
- unsigned Align = Arg.getParamAlignment();
- if (Align == 0)
- Align = DL->getABITypeAlignment(EltTy);
+ const auto Align =
+ DL->getValueOrABITypeAlignment(Arg.getParamAlign(), EltTy);
Value *Val = B.CreateExtractValue(StubCall, RetIdx++);
Type *PtrTy = Val->getType()->getPointerTo(ArgType->getAddressSpace());
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
index 8d70536ec21c..bc68310b2f5c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSearchableTables.td
@@ -198,6 +198,7 @@ def : SourceOfDivergence<int_r600_read_tidig_y>;
def : SourceOfDivergence<int_r600_read_tidig_z>;
def : SourceOfDivergence<int_amdgcn_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_atomic_dec>;
+def : SourceOfDivergence<int_amdgcn_global_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ds_fadd>;
def : SourceOfDivergence<int_amdgcn_ds_fmin>;
def : SourceOfDivergence<int_amdgcn_ds_fmax>;
@@ -238,6 +239,7 @@ def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_xor>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_inc>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_dec>;
def : SourceOfDivergence<int_amdgcn_struct_buffer_atomic_cmpswap>;
+def : SourceOfDivergence<int_amdgcn_buffer_atomic_csub>;
def : SourceOfDivergence<int_amdgcn_ps_live>;
def : SourceOfDivergence<int_amdgcn_ds_swizzle>;
def : SourceOfDivergence<int_amdgcn_ds_ordered_add>;
@@ -247,6 +249,7 @@ def : SourceOfDivergence<int_amdgcn_permlanex16>;
def : SourceOfDivergence<int_amdgcn_mov_dpp>;
def : SourceOfDivergence<int_amdgcn_mov_dpp8>;
def : SourceOfDivergence<int_amdgcn_update_dpp>;
+def : SourceOfDivergence<int_amdgcn_writelane>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_4x4x1f32>;
@@ -270,5 +273,13 @@ def : SourceOfDivergence<int_amdgcn_mfma_i32_32x32x8i8>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x2bf16>;
def : SourceOfDivergence<int_amdgcn_mfma_f32_32x32x4bf16>;
+// The dummy boolean output is divergent from the IR's perspective,
+// but the mask results are uniform. These produce a divergent and
+// uniform result, so the returned struct is collectively divergent.
+// isAlwaysUniform can override the extract of the uniform component.
+def : SourceOfDivergence<int_amdgcn_if>;
+def : SourceOfDivergence<int_amdgcn_else>;
+def : SourceOfDivergence<int_amdgcn_loop>;
+
foreach intr = AMDGPUImageDimAtomicIntrinsics in
def : SourceOfDivergence<intr>;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
index 445e91092499..213788ae0f67 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.cpp
@@ -59,13 +59,6 @@ R600Subtarget::initializeSubtargetDependencies(const Triple &TT,
FullFS += FS;
ParseSubtargetFeatures(GPU, FullFS);
- // FIXME: I don't think think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() <= AMDGPUSubtarget::NORTHERN_ISLANDS) {
- FP32Denormals = false;
- }
-
HasMulU24 = getGeneration() >= EVERGREEN;
HasMulI24 = hasCaymanISA();
@@ -76,9 +69,6 @@ GCNSubtarget &
GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
StringRef GPU, StringRef FS) {
// Determine default and user-specified characteristics
- // On SI+, we want FP64 denormals to be on by default. FP32 denormals can be
- // enabled, but some instructions do not respect them and they run at the
- // double precision rate, so don't enable by default.
//
// We want to be able to turn these off, but making this a subtarget feature
// for SI has the unhelpful behavior that it unsets everything else if you
@@ -88,20 +78,11 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
// unset everything else if it is disabled
// Assuming ECC is enabled is the conservative default.
- SmallString<256> FullFS("+promote-alloca,+load-store-opt,+sram-ecc,+xnack,");
+ SmallString<256> FullFS("+promote-alloca,+load-store-opt,+enable-ds128,+sram-ecc,+xnack,");
if (isAmdHsaOS()) // Turn on FlatForGlobal for HSA.
FullFS += "+flat-for-global,+unaligned-buffer-access,+trap-handler,";
- // FIXME: I don't think think Evergreen has any useful support for
- // denormals, but should be checked. Should we issue a warning somewhere
- // if someone tries to enable these?
- if (getGeneration() >= AMDGPUSubtarget::SOUTHERN_ISLANDS) {
- FullFS += "+fp64-fp16-denormals,";
- } else {
- FullFS += "-fp32-denormals,";
- }
-
FullFS += "+enable-prt-strict-null,"; // This is overridden by a disable in FS
// Disable mutually exclusive bits.
@@ -145,12 +126,14 @@ GCNSubtarget::initializeSubtargetDependencies(const Triple &TT,
}
// Don't crash on invalid devices.
- if (WavefrontSize == 0)
- WavefrontSize = 64;
+ if (WavefrontSizeLog2 == 0)
+ WavefrontSizeLog2 = 5;
HasFminFmaxLegacy = getGeneration() < AMDGPUSubtarget::VOLCANIC_ISLANDS;
- if (DoesNotSupportXNACK && EnableXNACK) {
+ // Disable XNACK on targets where it is not enabled by default unless it is
+ // explicitly requested.
+ if (!FS.contains("+xnack") && DoesNotSupportXNACK && EnableXNACK) {
ToggleFeature(AMDGPU::FeatureXNACK);
EnableXNACK = false;
}
@@ -170,8 +153,8 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
TargetTriple(TT),
Has16BitInsts(false),
HasMadMixInsts(false),
- FP32Denormals(false),
- FPExceptions(false),
+ HasMadMacF32Insts(false),
+ HasDsSrc2Insts(false),
HasSDWA(false),
HasVOP3PInsts(false),
HasMulI24(true),
@@ -182,7 +165,7 @@ AMDGPUSubtarget::AMDGPUSubtarget(const Triple &TT) :
HasTrigReducedRange(false),
MaxWavesPerEU(10),
LocalMemorySize(0),
- WavefrontSize(0)
+ WavefrontSizeLog2(0)
{ }
GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
@@ -196,9 +179,9 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
MaxPrivateElementSize(0),
FastFMAF32(false),
+ FastDenormalF32(false),
HalfRate64Ops(false),
- FP64FP16Denormals(false),
FlatForGlobal(false),
AutoWaitcntBeforeBarrier(false),
CodeObjectV3(false),
@@ -224,6 +207,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
GFX8Insts(false),
GFX9Insts(false),
GFX10Insts(false),
+ GFX10_3Insts(false),
GFX7GFX8GFX9Insts(false),
SGPRInitBug(false),
HasSMemRealTime(false),
@@ -241,7 +225,10 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
HasDPP(false),
HasDPP8(false),
HasR128A16(false),
+ HasGFX10A16(false),
+ HasG16(false),
HasNSAEncoding(false),
+ GFX10_BEncoding(false),
HasDLInsts(false),
HasDot1Insts(false),
HasDot2Insts(false),
@@ -256,6 +243,8 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
DoesNotSupportSRAMECC(false),
HasNoSdstCMPX(false),
HasVscnt(false),
+ HasGetWaveIdInst(false),
+ HasSMemTimeInst(false),
HasRegisterBanking(false),
HasVOP3Literal(false),
HasNoDataDepHazard(false),
@@ -287,6 +276,7 @@ GCNSubtarget::GCNSubtarget(const Triple &TT, StringRef GPU, StringRef FS,
FrameLowering(TargetFrameLowering::StackGrowsUp, getStackAlignment(), 0) {
MaxWavesPerEU = AMDGPU::IsaInfo::getMaxWavesPerEU(this);
CallLoweringInfo.reset(new AMDGPUCallLowering(*getTargetLowering()));
+ InlineAsmLoweringInfo.reset(new InlineAsmLowering(getTargetLowering()));
Legalizer.reset(new AMDGPULegalizerInfo(*this, TM));
RegBankInfo.reset(new AMDGPURegisterBankInfo(*this));
InstSelector.reset(new AMDGPUInstructionSelector(
@@ -325,18 +315,41 @@ unsigned AMDGPUSubtarget::getMaxLocalMemSizeWithWaveCount(unsigned NWaves,
return getLocalMemorySize() * MaxWaves / WorkGroupsPerCu / NWaves;
}
+// FIXME: Should return min,max range.
unsigned AMDGPUSubtarget::getOccupancyWithLocalMemSize(uint32_t Bytes,
const Function &F) const {
- unsigned WorkGroupSize = getFlatWorkGroupSizes(F).second;
- unsigned WorkGroupsPerCu = getMaxWorkGroupsPerCU(WorkGroupSize);
- if (!WorkGroupsPerCu)
+ const unsigned MaxWorkGroupSize = getFlatWorkGroupSizes(F).second;
+ const unsigned MaxWorkGroupsPerCu = getMaxWorkGroupsPerCU(MaxWorkGroupSize);
+ if (!MaxWorkGroupsPerCu)
return 0;
- unsigned MaxWaves = getMaxWavesPerEU();
- unsigned Limit = getLocalMemorySize() * MaxWaves / WorkGroupsPerCu;
- unsigned NumWaves = Limit / (Bytes ? Bytes : 1u);
- NumWaves = std::min(NumWaves, MaxWaves);
- NumWaves = std::max(NumWaves, 1u);
- return NumWaves;
+
+ const unsigned WaveSize = getWavefrontSize();
+
+ // FIXME: Do we need to account for alignment requirement of LDS rounding the
+ // size up?
+ // Compute restriction based on LDS usage
+ unsigned NumGroups = getLocalMemorySize() / (Bytes ? Bytes : 1u);
+
+ // This can be queried with more LDS than is possible, so just assume the
+ // worst.
+ if (NumGroups == 0)
+ return 1;
+
+ NumGroups = std::min(MaxWorkGroupsPerCu, NumGroups);
+
+ // Round to the number of waves.
+ const unsigned MaxGroupNumWaves = (MaxWorkGroupSize + WaveSize - 1) / WaveSize;
+ unsigned MaxWaves = NumGroups * MaxGroupNumWaves;
+
+ // Clamp to the maximum possible number of waves.
+ MaxWaves = std::min(MaxWaves, getMaxWavesPerEU());
+
+ // FIXME: Needs to be a multiple of the group size?
+ //MaxWaves = MaxGroupNumWaves * (MaxWaves / MaxGroupNumWaves);
+
+ assert(MaxWaves > 0 && MaxWaves <= getMaxWavesPerEU() &&
+ "computed invalid occupancy");
+ return MaxWaves;
}
unsigned
@@ -396,13 +409,10 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// number of waves per execution unit to values implied by requested
// minimum/maximum flat work group sizes.
unsigned MinImpliedByFlatWorkGroupSize =
- getMaxWavesPerEU(FlatWorkGroupSizes.second);
- bool RequestedFlatWorkGroupSize = false;
-
- if (F.hasFnAttribute("amdgpu-flat-work-group-size")) {
- Default.first = MinImpliedByFlatWorkGroupSize;
- RequestedFlatWorkGroupSize = true;
- }
+ getWavesPerEUForWorkGroup(FlatWorkGroupSizes.second);
+ Default.first = MinImpliedByFlatWorkGroupSize;
+ bool RequestedFlatWorkGroupSize =
+ F.hasFnAttribute("amdgpu-flat-work-group-size");
// Requested minimum/maximum number of waves per execution unit.
std::pair<unsigned, unsigned> Requested = AMDGPU::getIntegerPairAttribute(
@@ -414,9 +424,7 @@ std::pair<unsigned, unsigned> AMDGPUSubtarget::getWavesPerEU(
// Make sure requested values do not violate subtarget's specifications.
if (Requested.first < getMinWavesPerEU() ||
- Requested.first > getMaxWavesPerEU())
- return Default;
- if (Requested.second > getMaxWavesPerEU())
+ Requested.second > getMaxWavesPerEU())
return Default;
// Make sure requested values are compatible with values implied by requested
@@ -497,12 +505,12 @@ uint64_t AMDGPUSubtarget::getExplicitKernArgSize(const Function &F,
const DataLayout &DL = F.getParent()->getDataLayout();
uint64_t ExplicitArgBytes = 0;
- MaxAlign = Align::None();
+ MaxAlign = Align(1);
for (const Argument &Arg : F.args()) {
Type *ArgTy = Arg.getType();
- const Align Alignment(DL.getABITypeAlignment(ArgTy));
+ const Align Alignment = DL.getABITypeAlign(ArgTy);
uint64_t AllocSize = DL.getTypeAllocSize(ArgTy);
ExplicitArgBytes = alignTo(ExplicitArgBytes, Alignment) + AllocSize;
MaxAlign = std::max(MaxAlign, Alignment);
@@ -622,13 +630,12 @@ unsigned GCNSubtarget::getReservedNumSGPRs(const MachineFunction &MF) const {
return 2; // VCC.
}
-unsigned GCNSubtarget::computeOccupancy(const MachineFunction &MF,
- unsigned LDSSize,
+unsigned GCNSubtarget::computeOccupancy(const Function &F, unsigned LDSSize,
unsigned NumSGPRs,
unsigned NumVGPRs) const {
unsigned Occupancy =
std::min(getMaxWavesPerEU(),
- getOccupancyWithLocalMemSize(LDSSize, MF.getFunction()));
+ getOccupancyWithLocalMemSize(LDSSize, F));
if (NumSGPRs)
Occupancy = std::min(Occupancy, getOccupancyWithNumSGPRs(NumSGPRs));
if (NumVGPRs)
@@ -716,20 +723,20 @@ unsigned GCNSubtarget::getMaxNumVGPRs(const MachineFunction &MF) const {
return MaxNumVGPRs;
}
-void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
- SDep &Dep) const {
+void GCNSubtarget::adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use,
+ int UseOpIdx, SDep &Dep) const {
if (Dep.getKind() != SDep::Kind::Data || !Dep.getReg() ||
- !Src->isInstr() || !Dst->isInstr())
+ !Def->isInstr() || !Use->isInstr())
return;
- MachineInstr *SrcI = Src->getInstr();
- MachineInstr *DstI = Dst->getInstr();
+ MachineInstr *DefI = Def->getInstr();
+ MachineInstr *UseI = Use->getInstr();
- if (SrcI->isBundle()) {
+ if (DefI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
- MachineBasicBlock::const_instr_iterator I(SrcI->getIterator());
- MachineBasicBlock::const_instr_iterator E(SrcI->getParent()->instr_end());
+ MachineBasicBlock::const_instr_iterator I(DefI->getIterator());
+ MachineBasicBlock::const_instr_iterator E(DefI->getParent()->instr_end());
unsigned Lat = 0;
for (++I; I != E && I->isBundledWithPred(); ++I) {
if (I->modifiesRegister(Reg, TRI))
@@ -738,12 +745,12 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
--Lat;
}
Dep.setLatency(Lat);
- } else if (DstI->isBundle()) {
+ } else if (UseI->isBundle()) {
const SIRegisterInfo *TRI = getRegisterInfo();
auto Reg = Dep.getReg();
- MachineBasicBlock::const_instr_iterator I(DstI->getIterator());
- MachineBasicBlock::const_instr_iterator E(DstI->getParent()->instr_end());
- unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *SrcI);
+ MachineBasicBlock::const_instr_iterator I(UseI->getIterator());
+ MachineBasicBlock::const_instr_iterator E(UseI->getParent()->instr_end());
+ unsigned Lat = InstrInfo.getInstrLatency(getInstrItineraryData(), *DefI);
for (++I; I != E && I->isBundledWithPred() && Lat; ++I) {
if (I->readsRegister(Reg, TRI))
break;
@@ -754,53 +761,6 @@ void GCNSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
}
namespace {
-struct MemOpClusterMutation : ScheduleDAGMutation {
- const SIInstrInfo *TII;
-
- MemOpClusterMutation(const SIInstrInfo *tii) : TII(tii) {}
-
- void apply(ScheduleDAGInstrs *DAG) override {
- SUnit *SUa = nullptr;
- // Search for two consequent memory operations and link them
- // to prevent scheduler from moving them apart.
- // In DAG pre-process SUnits are in the original order of
- // the instructions before scheduling.
- for (SUnit &SU : DAG->SUnits) {
- MachineInstr &MI2 = *SU.getInstr();
- if (!MI2.mayLoad() && !MI2.mayStore()) {
- SUa = nullptr;
- continue;
- }
- if (!SUa) {
- SUa = &SU;
- continue;
- }
-
- MachineInstr &MI1 = *SUa->getInstr();
- if ((TII->isVMEM(MI1) && TII->isVMEM(MI2)) ||
- (TII->isFLAT(MI1) && TII->isFLAT(MI2)) ||
- (TII->isSMRD(MI1) && TII->isSMRD(MI2)) ||
- (TII->isDS(MI1) && TII->isDS(MI2))) {
- SU.addPredBarrier(SUa);
-
- for (const SDep &SI : SU.Preds) {
- if (SI.getSUnit() != SUa)
- SUa->addPred(SDep(SI.getSUnit(), SDep::Artificial));
- }
-
- if (&SU != &DAG->ExitSU) {
- for (const SDep &SI : SUa->Succs) {
- if (SI.getSUnit() != &SU)
- SI.getSUnit()->addPred(SDep(&SU, SDep::Artificial));
- }
- }
- }
-
- SUa = &SU;
- }
- }
-};
-
struct FillMFMAShadowMutation : ScheduleDAGMutation {
const SIInstrInfo *TII;
@@ -927,7 +887,6 @@ struct FillMFMAShadowMutation : ScheduleDAGMutation {
void GCNSubtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
- Mutations.push_back(std::make_unique<MemOpClusterMutation>(&InstrInfo));
Mutations.push_back(std::make_unique<FillMFMAShadowMutation>(&InstrInfo));
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
index 19a240800ba1..c833bfbcf936 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSubtarget.h
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUCallLowering.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600FrameLowering.h"
#include "R600ISelLowering.h"
#include "R600InstrInfo.h"
@@ -24,6 +25,7 @@
#include "SIInstrInfo.h"
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/Triple.h"
+#include "llvm/CodeGen/GlobalISel/InlineAsmLowering.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
@@ -65,8 +67,8 @@ private:
protected:
bool Has16BitInsts;
bool HasMadMixInsts;
- bool FP32Denormals;
- bool FPExceptions;
+ bool HasMadMacF32Insts;
+ bool HasDsSrc2Insts;
bool HasSDWA;
bool HasVOP3PInsts;
bool HasMulI24;
@@ -77,7 +79,7 @@ protected:
bool HasTrigReducedRange;
unsigned MaxWavesPerEU;
int LocalMemorySize;
- unsigned WavefrontSize;
+ char WavefrontSizeLog2;
public:
AMDGPUSubtarget(const Triple &TT);
@@ -140,6 +142,10 @@ public:
return isAmdHsaOS() || isMesaKernel(F);
}
+ bool isGCN() const {
+ return TargetTriple.getArch() == Triple::amdgcn;
+ }
+
bool has16BitInsts() const {
return Has16BitInsts;
}
@@ -148,17 +154,12 @@ public:
return HasMadMixInsts;
}
- bool hasFP32Denormals(const Function &F) const {
- // FIXME: This should not be a property of the subtarget. This should be a
- // property with a default set by the calling convention which can be
- // overridden by attributes. For now, use the subtarget feature as a
- // placeholder attribute. The function arguments only purpose is to
- // discourage use without a function context until this is removed.
- return FP32Denormals;
+ bool hasMadMacF32Insts() const {
+ return HasMadMacF32Insts || !isGCN();
}
- bool hasFPExceptions() const {
- return FPExceptions;
+ bool hasDsSrc2Insts() const {
+ return HasDsSrc2Insts;
}
bool hasSDWA() const {
@@ -194,7 +195,11 @@ public:
}
unsigned getWavefrontSize() const {
- return WavefrontSize;
+ return 1 << WavefrontSizeLog2;
+ }
+
+ unsigned getWavefrontSizeLog2() const {
+ return WavefrontSizeLog2;
}
int getLocalMemorySize() const {
@@ -221,9 +226,10 @@ public:
/// \returns Maximum flat work group size supported by the subtarget.
virtual unsigned getMaxFlatWorkGroupSize() const = 0;
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- virtual unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const = 0;
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ virtual unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const = 0;
/// \returns Minimum number of waves per execution unit supported by the
/// subtarget.
@@ -246,6 +252,13 @@ public:
uint64_t getExplicitKernArgSize(const Function &F, Align &MaxAlign) const;
unsigned getKernArgSegmentSize(const Function &F, Align &MaxAlign) const;
+ /// \returns Corresponding DWARF register number mapping flavour for the
+ /// \p WavefrontSize.
+ AMDGPUDwarfFlavour getAMDGPUDwarfFlavour() const {
+ return getWavefrontSize() == 32 ? AMDGPUDwarfFlavour::Wave32
+ : AMDGPUDwarfFlavour::Wave64;
+ }
+
virtual ~AMDGPUSubtarget() {}
};
@@ -278,6 +291,7 @@ public:
private:
/// GlobalISel related APIs.
std::unique_ptr<AMDGPUCallLowering> CallLoweringInfo;
+ std::unique_ptr<InlineAsmLowering> InlineAsmLoweringInfo;
std::unique_ptr<InstructionSelector> InstSelector;
std::unique_ptr<LegalizerInfo> Legalizer;
std::unique_ptr<RegisterBankInfo> RegBankInfo;
@@ -292,10 +306,10 @@ protected:
// Possibly statically set by tablegen, but may want to be overridden.
bool FastFMAF32;
+ bool FastDenormalF32;
bool HalfRate64Ops;
// Dynamically set bits that enable features.
- bool FP64FP16Denormals;
bool FlatForGlobal;
bool AutoWaitcntBeforeBarrier;
bool CodeObjectV3;
@@ -325,6 +339,7 @@ protected:
bool GFX8Insts;
bool GFX9Insts;
bool GFX10Insts;
+ bool GFX10_3Insts;
bool GFX7GFX8GFX9Insts;
bool SGPRInitBug;
bool HasSMemRealTime;
@@ -342,7 +357,10 @@ protected:
bool HasDPP;
bool HasDPP8;
bool HasR128A16;
+ bool HasGFX10A16;
+ bool HasG16;
bool HasNSAEncoding;
+ bool GFX10_BEncoding;
bool HasDLInsts;
bool HasDot1Insts;
bool HasDot2Insts;
@@ -357,6 +375,8 @@ protected:
bool DoesNotSupportSRAMECC;
bool HasNoSdstCMPX;
bool HasVscnt;
+ bool HasGetWaveIdInst;
+ bool HasSMemTimeInst;
bool HasRegisterBanking;
bool HasVOP3Literal;
bool HasNoDataDepHazard;
@@ -426,6 +446,10 @@ public:
return CallLoweringInfo.get();
}
+ const InlineAsmLowering *getInlineAsmLowering() const override {
+ return InlineAsmLoweringInfo.get();
+ }
+
InstructionSelector *getInstructionSelector() const override {
return InstSelector.get();
}
@@ -453,10 +477,6 @@ public:
return (Generation)Gen;
}
- unsigned getWavefrontSizeLog2() const {
- return Log2_32(WavefrontSize);
- }
-
/// Return the number of high bits known to be zero for a frame index.
unsigned getKnownHighZeroBitsForFrameIndex() const {
return countLeadingZeros(MaxWaveScratchSize) + getWavefrontSizeLog2();
@@ -506,6 +526,10 @@ public:
return getGeneration() >= VOLCANIC_ISLANDS;
}
+ bool hasFractBug() const {
+ return getGeneration() == SOUTHERN_ISLANDS;
+ }
+
bool hasBFE() const {
return true;
}
@@ -587,6 +611,11 @@ public:
return getGeneration() <= SEA_ISLANDS;
}
+ /// Writes to VCC_LO/VCC_HI update the VCCZ flag.
+ bool partialVCCWritesUpdateVCCZ() const {
+ return getGeneration() >= GFX10;
+ }
+
/// A read of an SGPR by SMRD instruction requires 4 wait states when the SGPR
/// was written by a VALU instruction.
bool hasSMRDReadVALUDefHazard() const {
@@ -617,20 +646,6 @@ public:
unsigned getMaxLocalMemSizeWithWaveCount(unsigned WaveCount,
const Function &) const;
- /// Alias for hasFP64FP16Denormals
- bool hasFP16Denormals(const Function &F) const {
- return FP64FP16Denormals;
- }
-
- /// Alias for hasFP64FP16Denormals
- bool hasFP64Denormals(const Function &F) const {
- return FP64FP16Denormals;
- }
-
- bool hasFP64FP16Denormals(const Function &F) const {
- return FP64FP16Denormals;
- }
-
bool supportsMinMaxDenormModes() const {
return getGeneration() >= AMDGPUSubtarget::GFX9;
}
@@ -724,6 +739,18 @@ public:
return ScalarFlatScratchInsts;
}
+ bool hasGlobalAddTidInsts() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasAtomicCSub() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasMultiDwordFlatScratchAddressing() const {
+ return getGeneration() >= GFX9;
+ }
+
bool hasFlatSegmentOffsetBug() const {
return HasFlatSegmentOffsetBug;
}
@@ -853,6 +880,14 @@ public:
return HasVscnt;
}
+ bool hasGetWaveIdInst() const {
+ return HasGetWaveIdInst;
+ }
+
+ bool hasSMemTimeInst() const {
+ return HasSMemTimeInst;
+ }
+
bool hasRegisterBanking() const {
return HasRegisterBanking;
}
@@ -890,30 +925,6 @@ public:
void setScalarizeGlobalBehavior(bool b) { ScalarizeGlobal = b; }
bool getScalarizeGlobalBehavior() const { return ScalarizeGlobal; }
- /// \returns Number of execution units per compute unit supported by the
- /// subtarget.
- unsigned getEUsPerCU() const {
- return AMDGPU::IsaInfo::getEUsPerCU(this);
- }
-
- /// \returns Maximum number of waves per compute unit supported by the
- /// subtarget without any kind of limitation.
- unsigned getMaxWavesPerCU() const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(this);
- }
-
- /// \returns Maximum number of waves per compute unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerCU(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getMaxWavesPerCU(this, FlatWorkGroupSize);
- }
-
- /// \returns Number of waves per work group supported by the subtarget and
- /// limited by given \p FlatWorkGroupSize.
- unsigned getWavesPerWorkGroup(unsigned FlatWorkGroupSize) const {
- return AMDGPU::IsaInfo::getWavesPerWorkGroup(this, FlatWorkGroupSize);
- }
-
// static wrappers
static bool hasHalfRate64Ops(const TargetSubtargetInfo &STI);
@@ -979,6 +990,14 @@ public:
return HasR128A16;
}
+ bool hasGFX10A16() const {
+ return HasGFX10A16;
+ }
+
+ bool hasA16() const { return hasR128A16() || hasGFX10A16(); }
+
+ bool hasG16() const { return HasG16; }
+
bool hasOffset3fBug() const {
return HasOffset3fBug;
}
@@ -987,6 +1006,14 @@ public:
return HasNSAEncoding;
}
+ bool hasGFX10_BEncoding() const {
+ return GFX10_BEncoding;
+ }
+
+ bool hasGFX10_3Insts() const {
+ return GFX10_3Insts;
+ }
+
bool hasMadF16() const;
bool enableSIScheduler() const {
@@ -1059,6 +1086,8 @@ public:
return HasNSAtoVMEMBug;
}
+ bool hasHardClauses() const { return getGeneration() >= GFX10; }
+
/// Return the maximum number of waves per SIMD for kernels using the given
/// number of \p SGPRs.
unsigned getOccupancyWithNumSGPRs(unsigned SGPRs) const;
@@ -1071,7 +1100,7 @@ public:
/// registers if provided.
/// Note, occupancy can be affected by the scratch allocation as well, but
/// we do not have enough information to compute it.
- unsigned computeOccupancy(const MachineFunction &MF, unsigned LDSSize = 0,
+ unsigned computeOccupancy(const Function &F, unsigned LDSSize = 0,
unsigned NumSGPRs = 0, unsigned NumVGPRs = 0) const;
/// \returns true if the flat_scratch register should be initialized with the
@@ -1178,7 +1207,7 @@ public:
const override;
bool isWave32() const {
- return WavefrontSize == 32;
+ return getWavefrontSize() == 32;
}
const TargetRegisterClass *getBoolRC() const {
@@ -1201,10 +1230,11 @@ public:
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
}
/// \returns Minimum number of waves per execution unit supported by the
@@ -1213,7 +1243,8 @@ public:
return AMDGPU::IsaInfo::getMinWavesPerEU(this);
}
- void adjustSchedDependency(SUnit *Src, SUnit *Dst, SDep &Dep) const override;
+ void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
+ SDep &Dep) const override;
};
class R600Subtarget final : public R600GenSubtargetInfo,
@@ -1338,10 +1369,11 @@ public:
return AMDGPU::IsaInfo::getMaxFlatWorkGroupSize(this);
}
- /// \returns Maximum number of waves per execution unit supported by the
- /// subtarget and limited by given \p FlatWorkGroupSize.
- unsigned getMaxWavesPerEU(unsigned FlatWorkGroupSize) const override {
- return AMDGPU::IsaInfo::getMaxWavesPerEU(this, FlatWorkGroupSize);
+ /// \returns Number of waves per execution unit required to support the given
+ /// \p FlatWorkGroupSize.
+ unsigned
+ getWavesPerEUForWorkGroup(unsigned FlatWorkGroupSize) const override {
+ return AMDGPU::IsaInfo::getWavesPerEUForWorkGroup(this, FlatWorkGroupSize);
}
/// \returns Minimum number of waves per execution unit supported by the
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index eb30d659bf0b..b4b10835837c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -16,6 +16,7 @@
#include "AMDGPU.h"
#include "AMDGPUAliasAnalysis.h"
#include "AMDGPUCallLowering.h"
+#include "AMDGPUExportClustering.h"
#include "AMDGPUInstructionSelector.h"
#include "AMDGPULegalizerInfo.h"
#include "AMDGPUMacroFusion.h"
@@ -23,6 +24,7 @@
#include "AMDGPUTargetTransformInfo.h"
#include "GCNIterativeScheduler.h"
#include "GCNSchedStrategy.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "R600MachineScheduler.h"
#include "SIMachineFunctionInfo.h"
#include "SIMachineScheduler.h"
@@ -30,6 +32,7 @@
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelect.h"
#include "llvm/CodeGen/GlobalISel/Legalizer.h"
+#include "llvm/CodeGen/GlobalISel/Localizer.h"
#include "llvm/CodeGen/GlobalISel/RegBankSelect.h"
#include "llvm/CodeGen/MIRParser/MIParser.h"
#include "llvm/CodeGen/Passes.h"
@@ -138,6 +141,13 @@ static cl::opt<bool, true> EnableAMDGPUFunctionCallsOpt(
cl::init(true),
cl::Hidden);
+static cl::opt<bool, true> EnableAMDGPUFixedFunctionABIOpt(
+ "amdgpu-fixed-function-abi",
+ cl::desc("Enable all implicit function arguments"),
+ cl::location(AMDGPUTargetMachine::EnableFixedFunctionABI),
+ cl::init(false),
+ cl::Hidden);
+
// Enable lib calls simplifications
static cl::opt<bool> EnableLibCallSimplify(
"amdgpu-simplify-libcall",
@@ -183,6 +193,11 @@ static cl::opt<bool> EnableScalarIRPasses(
cl::init(true),
cl::Hidden);
+static cl::opt<bool> EnableStructurizerWorkarounds(
+ "amdgpu-enable-structurizer-workarounds",
+ cl::desc("Enable workarounds for the StructurizeCFG pass"), cl::init(true),
+ cl::Hidden);
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
// Register the target
RegisterTargetMachine<R600TargetMachine> X(getTheAMDGPUTarget());
@@ -217,23 +232,29 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPULowerKernelAttributesPass(*PR);
initializeAMDGPULowerIntrinsicsPass(*PR);
initializeAMDGPUOpenCLEnqueuedBlockLoweringPass(*PR);
+ initializeAMDGPUPostLegalizerCombinerPass(*PR);
+ initializeAMDGPUPreLegalizerCombinerPass(*PR);
initializeAMDGPUPromoteAllocaPass(*PR);
+ initializeAMDGPUPromoteAllocaToVectorPass(*PR);
initializeAMDGPUCodeGenPreparePass(*PR);
initializeAMDGPUPropagateAttributesEarlyPass(*PR);
initializeAMDGPUPropagateAttributesLatePass(*PR);
initializeAMDGPURewriteOutArgumentsPass(*PR);
initializeAMDGPUUnifyMetadataPass(*PR);
initializeSIAnnotateControlFlowPass(*PR);
+ initializeSIInsertHardClausesPass(*PR);
initializeSIInsertWaitcntsPass(*PR);
initializeSIModeRegisterPass(*PR);
initializeSIWholeQuadModePass(*PR);
initializeSILowerControlFlowPass(*PR);
initializeSIRemoveShortExecBranchesPass(*PR);
+ initializeSIPreEmitPeepholePass(*PR);
initializeSIInsertSkipsPass(*PR);
initializeSIMemoryLegalizerPass(*PR);
initializeSIOptimizeExecMaskingPass(*PR);
initializeSIPreAllocateWWMRegsPass(*PR);
initializeSIFormMemoryClausesPass(*PR);
+ initializeSIPostRABundlerPass(*PR);
initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
initializeAMDGPUAAWrapperPassPass(*PR);
initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -243,6 +264,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAMDGPUTarget() {
initializeAMDGPUPrintfRuntimeBindingPass(*PR);
initializeGCNRegBankReassignPass(*PR);
initializeGCNNSAReassignPass(*PR);
+ initializeSIAddIMGInitPass(*PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -264,6 +286,7 @@ createGCNMaxOccupancyMachineScheduler(MachineSchedContext *C) {
DAG->addMutation(createLoadClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createStoreClusterDAGMutation(DAG->TII, DAG->TRI));
DAG->addMutation(createAMDGPUMacroFusionDAGMutation());
+ DAG->addMutation(createAMDGPUExportClusteringDAGMutation());
return DAG;
}
@@ -363,10 +386,17 @@ AMDGPUTargetMachine::AMDGPUTargetMachine(const Target &T, const Triple &TT,
getEffectiveCodeModel(CM, CodeModel::Small), OptLevel),
TLOF(createTLOF(getTargetTriple())) {
initAsmInfo();
+ if (TT.getArch() == Triple::amdgcn) {
+ if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize64"))
+ MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave64));
+ else if (getMCSubtargetInfo()->checkFeatures("+wavefrontsize32"))
+ MRI.reset(llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour::Wave32));
+ }
}
bool AMDGPUTargetMachine::EnableLateStructurizeCFG = false;
bool AMDGPUTargetMachine::EnableFunctionCalls = false;
+bool AMDGPUTargetMachine::EnableFixedFunctionABI = false;
AMDGPUTargetMachine::~AMDGPUTargetMachine() = default;
@@ -416,20 +446,19 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
}
PM.add(createAMDGPUUnifyMetadataPass());
PM.add(createAMDGPUPrintfRuntimeBinding());
- PM.add(createAMDGPUPropagateAttributesLatePass(this));
- if (Internalize) {
+ if (Internalize)
PM.add(createInternalizePass(mustPreserveGV));
+ PM.add(createAMDGPUPropagateAttributesLatePass(this));
+ if (Internalize)
PM.add(createGlobalDCEPass());
- }
if (EarlyInline)
PM.add(createAMDGPUAlwaysInlinePass(false));
});
- const auto &Opt = Options;
Builder.addExtension(
PassManagerBuilder::EP_EarlyAsPossible,
- [AMDGPUAA, LibCallSimplify, &Opt, this](const PassManagerBuilder &,
- legacy::PassManagerBase &PM) {
+ [AMDGPUAA, LibCallSimplify, this](const PassManagerBuilder &,
+ legacy::PassManagerBase &PM) {
if (AMDGPUAA) {
PM.add(createAMDGPUAAWrapperPass());
PM.add(createAMDGPUExternalAAWrapperPass());
@@ -437,12 +466,12 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
PM.add(llvm::createAMDGPUPropagateAttributesEarlyPass(this));
PM.add(llvm::createAMDGPUUseNativeCallsPass());
if (LibCallSimplify)
- PM.add(llvm::createAMDGPUSimplifyLibCallsPass(Opt, this));
+ PM.add(llvm::createAMDGPUSimplifyLibCallsPass(this));
});
Builder.addExtension(
PassManagerBuilder::EP_CGSCCOptimizerLate,
- [](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
+ [EnableOpt](const PassManagerBuilder &, legacy::PassManagerBase &PM) {
// Add infer address spaces pass to the opt pipeline after inlining
// but before SROA to increase SROA opportunities.
PM.add(createInferAddressSpacesPass());
@@ -450,6 +479,11 @@ void AMDGPUTargetMachine::adjustPassManager(PassManagerBuilder &Builder) {
// This should run after inlining to have any chance of doing anything,
// and before other cleanup optimizations.
PM.add(createAMDGPULowerKernelAttributesPass());
+
+ // Promote alloca to vector before SROA and loop unroll. If we manage
+ // to eliminate allocas before unroll we may choose to unroll less.
+ if (EnableOpt)
+ PM.add(createAMDGPUPromoteAllocaToVector());
});
}
@@ -617,7 +651,9 @@ public:
bool addILPOpts() override;
bool addInstSelector() override;
bool addIRTranslator() override;
+ void addPreLegalizeMachineIR() override;
bool addLegalizeMachineIR() override;
+ void addPreRegBankSelect() override;
bool addRegBankSelect() override;
bool addGlobalInstructionSelect() override;
void addFastRegAlloc() override;
@@ -751,10 +787,15 @@ void AMDGPUPassConfig::addCodeGenPrepare() {
if (EnableLoadStoreVectorizer)
addPass(createLoadStoreVectorizerPass());
+
+ // LowerSwitch pass may introduce unreachable blocks that can
+ // cause unexpected behavior for subsequent passes. Placing it
+ // here seems better, as these blocks will get cleaned up by
+ // UnreachableBlockElim, inserted next in the pass flow.
+ addPass(createLowerSwitchPass());
}
bool AMDGPUPassConfig::addPreISel() {
- addPass(createLowerSwitchPass());
addPass(createFlattenCFGPass());
return false;
}
@@ -836,7 +877,11 @@ bool GCNPassConfig::addPreISel() {
// regions formed by them.
addPass(&AMDGPUUnifyDivergentExitNodesID);
if (!LateCFGStructurize) {
- addPass(createStructurizeCFGPass(true)); // true -> SkipUniformRegions
+ if (EnableStructurizerWorkarounds) {
+ addPass(createFixIrreduciblePass());
+ addPass(createUnifyLoopExitsPass());
+ }
+ addPass(createStructurizeCFGPass(false)); // true -> SkipUniformRegions
}
addPass(createSinkingPass());
addPass(createAMDGPUAnnotateUniformValues());
@@ -885,6 +930,12 @@ bool GCNPassConfig::addInstSelector() {
AMDGPUPassConfig::addInstSelector();
addPass(&SIFixSGPRCopiesID);
addPass(createSILowerI1CopiesPass());
+ // TODO: We have to add FinalizeISel
+ // to expand V_ADD/SUB_U64_PSEUDO before SIFixupVectorISel
+ // that expects V_ADD/SUB -> A_ADDC/SUBB pairs expanded.
+ // Will be removed as soon as SIFixupVectorISel is changed
+ // to work with V_ADD/SUB_U64_PSEUDO instead.
+ addPass(&FinalizeISelID);
addPass(createSIFixupVectorISelPass());
addPass(createSIAddIMGInitPass());
return false;
@@ -895,11 +946,22 @@ bool GCNPassConfig::addIRTranslator() {
return false;
}
+void GCNPassConfig::addPreLegalizeMachineIR() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPreLegalizeCombiner(IsOptNone));
+ addPass(new Localizer());
+}
+
bool GCNPassConfig::addLegalizeMachineIR() {
addPass(new Legalizer());
return false;
}
+void GCNPassConfig::addPreRegBankSelect() {
+ bool IsOptNone = getOptLevel() == CodeGenOpt::None;
+ addPass(createAMDGPUPostLegalizeCombiner(IsOptNone));
+}
+
bool GCNPassConfig::addRegBankSelect() {
addPass(new RegBankSelect());
return false;
@@ -933,12 +995,9 @@ void GCNPassConfig::addFastRegAlloc() {
}
void GCNPassConfig::addOptimizedRegAlloc() {
- if (OptExecMaskPreRA) {
+ if (OptExecMaskPreRA)
insertPass(&MachineSchedulerID, &SIOptimizeExecMaskingPreRAID);
- insertPass(&SIOptimizeExecMaskingPreRAID, &SIFormMemoryClausesID);
- } else {
- insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
- }
+ insertPass(&MachineSchedulerID, &SIFormMemoryClausesID);
// This must be run immediately after phi elimination and before
// TwoAddressInstructions, otherwise the processing of the tied operand of
@@ -973,6 +1032,7 @@ void GCNPassConfig::addPostRegAlloc() {
}
void GCNPassConfig::addPreSched2() {
+ addPass(&SIPostRABundlerID);
}
void GCNPassConfig::addPreEmitPass() {
@@ -993,9 +1053,12 @@ void GCNPassConfig::addPreEmitPass() {
// FIXME: This stand-alone pass will emit indiv. S_NOP 0, as needed. It would
// be better for it to emit S_NOP <N> when possible.
addPass(&PostRAHazardRecognizerID);
+ if (getOptLevel() > CodeGenOpt::None)
+ addPass(&SIInsertHardClausesID);
addPass(&SIRemoveShortExecBranchesID);
addPass(&SIInsertSkipsPassID);
+ addPass(&SIPreEmitPeepholeID);
addPass(&BranchRelaxationPassID);
}
@@ -1024,11 +1087,13 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->initializeBaseYamlFields(YamlMFI);
- auto parseRegister = [&](const yaml::StringValue &RegName, unsigned &RegVal) {
- if (parseNamedRegisterReference(PFS, RegVal, RegName.Value, Error)) {
+ auto parseRegister = [&](const yaml::StringValue &RegName, Register &RegVal) {
+ Register TempReg;
+ if (parseNamedRegisterReference(PFS, TempReg, RegName.Value, Error)) {
SourceRange = RegName.SourceRange;
return true;
}
+ RegVal = TempReg;
return false;
};
@@ -1046,7 +1111,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
};
if (parseRegister(YamlMFI.ScratchRSrcReg, MFI->ScratchRSrcReg) ||
- parseRegister(YamlMFI.ScratchWaveOffsetReg, MFI->ScratchWaveOffsetReg) ||
parseRegister(YamlMFI.FrameOffsetReg, MFI->FrameOffsetReg) ||
parseRegister(YamlMFI.StackPtrOffsetReg, MFI->StackPtrOffsetReg))
return true;
@@ -1056,11 +1120,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return diagnoseRegisterClass(YamlMFI.ScratchRSrcReg);
}
- if (MFI->ScratchWaveOffsetReg != AMDGPU::SCRATCH_WAVE_OFFSET_REG &&
- !AMDGPU::SGPR_32RegClass.contains(MFI->ScratchWaveOffsetReg)) {
- return diagnoseRegisterClass(YamlMFI.ScratchWaveOffsetReg);
- }
-
if (MFI->FrameOffsetReg != AMDGPU::FP_REG &&
!AMDGPU::SGPR_32RegClass.contains(MFI->FrameOffsetReg)) {
return diagnoseRegisterClass(YamlMFI.FrameOffsetReg);
@@ -1080,7 +1139,7 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
return false;
if (A->IsRegister) {
- unsigned Reg;
+ Register Reg;
if (parseNamedRegisterReference(PFS, Reg, A->RegisterName.Value, Error)) {
SourceRange = A->RegisterName.SourceRange;
return true;
@@ -1154,8 +1213,10 @@ bool GCNTargetMachine::parseMachineFunctionInfo(
MFI->Mode.IEEE = YamlMFI.Mode.IEEE;
MFI->Mode.DX10Clamp = YamlMFI.Mode.DX10Clamp;
- MFI->Mode.FP32Denormals = YamlMFI.Mode.FP32Denormals;
- MFI->Mode.FP64FP16Denormals = YamlMFI.Mode.FP64FP16Denormals;
+ MFI->Mode.FP32InputDenormals = YamlMFI.Mode.FP32InputDenormals;
+ MFI->Mode.FP32OutputDenormals = YamlMFI.Mode.FP32OutputDenormals;
+ MFI->Mode.FP64FP16InputDenormals = YamlMFI.Mode.FP64FP16InputDenormals;
+ MFI->Mode.FP64FP16OutputDenormals = YamlMFI.Mode.FP64FP16OutputDenormals;
return false;
}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index 70fa3961236f..e223fecc8819 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -39,6 +39,7 @@ protected:
public:
static bool EnableLateStructurizeCFG;
static bool EnableFunctionCalls;
+ static bool EnableFixedFunctionABI;
AMDGPUTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
StringRef FS, TargetOptions Options,
@@ -56,8 +57,9 @@ public:
void adjustPassManager(PassManagerBuilder &) override;
/// Get the integer value of a null pointer in the given address space.
- uint64_t getNullPointerValue(unsigned AddrSpace) const {
+ static int64_t getNullPointerValue(unsigned AddrSpace) {
return (AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ AddrSpace == AMDGPUAS::PRIVATE_ADDRESS ||
AddrSpace == AMDGPUAS::REGION_ADDRESS) ? -1 : 0;
}
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
index 819bebb7932d..ed564ec1ad54 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetObjectFile.h
@@ -15,9 +15,7 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUTARGETOBJECTFILE_H
-#include "AMDGPU.h"
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetMachine.h"
namespace llvm {
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index c4eeb81c5133..542a5f006c0f 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -69,6 +69,21 @@ static cl::opt<unsigned> UnrollThresholdIf(
cl::desc("Unroll threshold increment for AMDGPU for each if statement inside loop"),
cl::init(150), cl::Hidden);
+static cl::opt<bool> UnrollRuntimeLocal(
+ "amdgpu-unroll-runtime-local",
+ cl::desc("Allow runtime unroll for AMDGPU if local memory used in a loop"),
+ cl::init(true), cl::Hidden);
+
+static cl::opt<bool> UseLegacyDA(
+ "amdgpu-use-legacy-divergence-analysis",
+ cl::desc("Enable legacy divergence analysis for AMDGPU"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<unsigned> UnrollMaxBlockToAnalyze(
+ "amdgpu-unroll-max-block-to-analyze",
+ cl::desc("Inner loop block size threshold to analyze in unroll for AMDGPU"),
+ cl::init(20), cl::Hidden);
+
static bool dependsOnLocalPhi(const Loop *L, const Value *Cond,
unsigned Depth = 0) {
const Instruction *I = dyn_cast<Instruction>(Cond);
@@ -172,6 +187,9 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
(!isa<GlobalVariable>(GEP->getPointerOperand()) &&
!isa<Argument>(GEP->getPointerOperand())))
continue;
+ LLVM_DEBUG(dbgs() << "Allow unroll runtime for loop:\n"
+ << *L << " due to LDS use.\n");
+ UP.Runtime = UnrollRuntimeLocal;
}
// Check if GEP depends on a value defined by this loop itself.
@@ -210,13 +228,22 @@ void AMDGPUTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
if (UP.Threshold >= MaxBoost)
return;
}
+
+ // If we found a GEP in a small BB of an innermost loop, increase the max
+ // trip count to analyze so unrolling gets a better cost estimate.
+ if (L->empty() && BB->size() < UnrollMaxBlockToAnalyze)
+ UP.MaxIterationsCountToAnalyze = 32;
}
}
+void AMDGPUTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
unsigned GCNTTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
// The concept of vector registers doesn't really exist. Some packed vector
// operations operate on the normal 32-bit registers.
- return 256;
+ return MaxVGPRs;
}
unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
@@ -225,6 +252,13 @@ unsigned GCNTTIImpl::getNumberOfRegisters(bool Vec) const {
return getHardwareNumberOfRegisters(Vec) >> 3;
}
+unsigned GCNTTIImpl::getNumberOfRegisters(unsigned RCID) const {
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ const TargetRegisterClass *RC = TRI->getRegClass(RCID);
+ unsigned NumVGPRs = (TRI->getRegSizeInBits(*RC) + 31) / 32;
+ return getHardwareNumberOfRegisters(false) / NumVGPRs;
+}
+
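As an aside on the arithmetic above: the register-class overload divides the VGPR budget by the number of 32-bit registers a class occupies. A minimal sketch, not part of the patch; regsFor is a hypothetical helper and 256 is only an example budget:

  // Sketch of the arithmetic in getNumberOfRegisters(unsigned RCID).
  unsigned regsFor(unsigned MaxVGPRs, unsigned RegSizeInBits) {
    unsigned NumVGPRs = (RegSizeInBits + 31) / 32; // round up to 32-bit units
    return MaxVGPRs / NumVGPRs;
  }
  // regsFor(256, 32)  == 256  (a 32-bit class uses one VGPR per register)
  // regsFor(256, 64)  == 128  (64-bit tuples occupy two VGPRs)
  // regsFor(256, 128) == 64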
unsigned GCNTTIImpl::getRegisterBitWidth(bool Vector) const {
return 32;
}
@@ -234,8 +268,8 @@ unsigned GCNTTIImpl::getMinVectorRegisterBitWidth() const {
}
unsigned GCNTTIImpl::getLoadVectorFactor(unsigned VF, unsigned LoadSize,
- unsigned ChainSizeInBytes,
- VectorType *VecTy) const {
+ unsigned ChainSizeInBytes,
+ VectorType *VecTy) const {
unsigned VecRegBitWidth = VF * LoadSize;
if (VecRegBitWidth > 128 && VecTy->getScalarSizeInBits() < 32)
// TODO: Support element-size less than 32bit?
@@ -262,20 +296,16 @@ unsigned GCNTTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
return 512;
}
- if (AddrSpace == AMDGPUAS::FLAT_ADDRESS ||
- AddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
- AddrSpace == AMDGPUAS::REGION_ADDRESS)
- return 128;
-
if (AddrSpace == AMDGPUAS::PRIVATE_ADDRESS)
return 8 * ST->getMaxPrivateElementSize();
- llvm_unreachable("unhandled address space");
+ // Common to flat, global, local and region. Also assumed for unknown address spaces.
+ return 128;
}
bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
- unsigned AddrSpace) const {
+ Align Alignment,
+ unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
// here, and legalization can handle it.
@@ -287,17 +317,87 @@ bool GCNTTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
}
bool GCNTTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
- unsigned AddrSpace) const {
+ Align Alignment,
+ unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
bool GCNTTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
- unsigned AddrSpace) const {
+ Align Alignment,
+ unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
+// FIXME: Really we would like to issue multiple 128-bit loads and stores per
+// iteration. Should we report a larger size and let it legalize?
+//
+// FIXME: Should we use narrower types for local/region, or account for when
+// unaligned access is legal?
+//
+// FIXME: This could use fine tuning and microbenchmarks.
+Type *GCNTTIImpl::getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+ unsigned SrcAddrSpace,
+ unsigned DestAddrSpace,
+ unsigned SrcAlign,
+ unsigned DestAlign) const {
+ unsigned MinAlign = std::min(SrcAlign, DestAlign);
+
+ // A (multi-)dword access at an address == 2 (mod 4) will be decomposed by the
+ // hardware into byte accesses. If you assume all alignments are equally
+ // probable, it's more efficient on average to use short accesses for this
+ // case.
+ if (MinAlign == 2)
+ return Type::getInt16Ty(Context);
+
+ // Not all subtargets have 128-bit DS instructions, and we currently don't
+ // form them by default.
+ if (SrcAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ SrcAddrSpace == AMDGPUAS::REGION_ADDRESS ||
+ DestAddrSpace == AMDGPUAS::LOCAL_ADDRESS ||
+ DestAddrSpace == AMDGPUAS::REGION_ADDRESS) {
+ return FixedVectorType::get(Type::getInt32Ty(Context), 2);
+ }
+
+ // Global memory works best with 16-byte accesses. Private memory will also
+ // hit this path, although such accesses will be decomposed later.
+ return FixedVectorType::get(Type::getInt32Ty(Context), 4);
+}
+
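To summarize the selection above (an illustrative note, not part of the patch; GLOBAL and LOCAL stand for the AMDGPU address-space enumerators used elsewhere in this file):

  // getMemcpyLoopLoweringType(Ctx, Len, GLOBAL, GLOBAL, /*SrcAlign=*/2, /*DestAlign=*/4)
  //   -> i16        (min alignment 2: avoid the hardware byte-wise split)
  // getMemcpyLoopLoweringType(Ctx, Len, LOCAL, GLOBAL, 4, 4)
  //   -> <2 x i32>  (LDS/region involved: 128-bit DS forms are not used by default)
  // getMemcpyLoopLoweringType(Ctx, Len, GLOBAL, GLOBAL, 4, 4)
  //   -> <4 x i32>  (16-byte accesses; private also lands here and is split later)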
+void GCNTTIImpl::getMemcpyLoopResidualLoweringType(
+ SmallVectorImpl<Type *> &OpsOut, LLVMContext &Context,
+ unsigned RemainingBytes, unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign) const {
+ assert(RemainingBytes < 16);
+
+ unsigned MinAlign = std::min(SrcAlign, DestAlign);
+
+ if (MinAlign != 2) {
+ Type *I64Ty = Type::getInt64Ty(Context);
+ while (RemainingBytes >= 8) {
+ OpsOut.push_back(I64Ty);
+ RemainingBytes -= 8;
+ }
+
+ Type *I32Ty = Type::getInt32Ty(Context);
+ while (RemainingBytes >= 4) {
+ OpsOut.push_back(I32Ty);
+ RemainingBytes -= 4;
+ }
+ }
+
+ Type *I16Ty = Type::getInt16Ty(Context);
+ while (RemainingBytes >= 2) {
+ OpsOut.push_back(I16Ty);
+ RemainingBytes -= 2;
+ }
+
+ Type *I8Ty = Type::getInt8Ty(Context);
+ while (RemainingBytes) {
+ OpsOut.push_back(I8Ty);
+ --RemainingBytes;
+ }
+}
+
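A worked example of the residual decomposition above, illustrative only:

  // RemainingBytes = 13, MinAlign = 4 -> { i64, i32, i8 }                       (8 + 4 + 1)
  // RemainingBytes = 13, MinAlign = 2 -> { i16, i16, i16, i16, i16, i16, i8 }   (2 * 6 + 1)
  // RemainingBytes = 7,  MinAlign = 4 -> { i32, i16, i8 }                       (4 + 2 + 1)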
unsigned GCNTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// Disable unrolling if the loop is not vectorized.
// TODO: Enable this again.
@@ -339,6 +439,7 @@ bool GCNTTIImpl::getTgtMemIntrinsic(IntrinsicInst *Inst,
}
int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info,
TTI::OperandValueProperties Opd1PropInfo,
@@ -347,7 +448,11 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
const Instruction *CxtI) {
EVT OrigTy = TLI->getValueType(DL, Ty);
if (!OrigTy.isSimple()) {
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ // FIXME: We have to query the throughput cost so that the base
+ // implementation generates legalization and scalarization costs. Maybe
+ // we could hoist the scalarization code here?
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, TTI::TCK_RecipThroughput,
+ Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo);
}
@@ -455,24 +560,44 @@ int GCNTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * NElts * Cost;
}
break;
+ case ISD::FNEG:
+ // Use the backend's estimation. If fneg is not free, each element will cost
+ // one additional instruction.
+ return TLI->isFNegFree(SLT) ? 0 : NElts;
default:
break;
}
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
}
-template <typename T>
-int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<T *> Args,
- FastMathFlags FMF, unsigned VF) {
- if (ID != Intrinsic::fma)
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+// Return true if there's a potential benefit from using v2f16 instructions for
+// an intrinsic, even if it requires nontrivial legalization.
+static bool intrinsicHasPackedVectorBenefit(Intrinsic::ID ID) {
+ switch (ID) {
+ case Intrinsic::fma: // TODO: fmuladd
+ // There's a small benefit to using vector ops in the legalized code.
+ case Intrinsic::round:
+ return true;
+ default:
+ return false;
+ }
+}
+
+int GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (ICA.getID() == Intrinsic::fabs)
+ return 0;
+ if (!intrinsicHasPackedVectorBenefit(ICA.getID()))
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ Type *RetTy = ICA.getReturnType();
EVT OrigTy = TLI->getValueType(DL, RetTy);
if (!OrigTy.isSimple()) {
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
// Legalize the type.
@@ -489,36 +614,34 @@ int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
if (ST->has16BitInsts() && SLT == MVT::f16)
NElts = (NElts + 1) / 2;
- return LT.first * NElts * (ST->hasFastFMAF32() ? getHalfRateInstrCost()
- : getQuarterRateInstrCost());
-}
+ // TODO: Get more refined intrinsic costs?
+ unsigned InstRate = getQuarterRateInstrCost();
+ if (ICA.getID() == Intrinsic::fma) {
+ InstRate = ST->hasFastFMAF32() ? getHalfRateInstrCost()
+ : getQuarterRateInstrCost();
+ }
-int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF,
- unsigned VF) {
- return getIntrinsicInstrCost<Value>(ID, RetTy, Args, FMF, VF);
+ return LT.first * NElts * InstRate;
}
-int GCNTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- return getIntrinsicInstrCost<Type>(ID, RetTy, Tys, FMF,
- ScalarizationCostPassed);
-}
+unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+ return Opcode == Instruction::PHI ? 0 : 1;
-unsigned GCNTTIImpl::getCFInstrCost(unsigned Opcode) {
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
case Instruction::Ret:
return 10;
default:
- return BaseT::getCFInstrCost(Opcode);
+ return BaseT::getCFInstrCost(Opcode, CostKind);
}
}
-int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwise) {
+int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
@@ -526,15 +649,15 @@ int GCNTTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *Ty,
if (IsPairwise ||
!ST->hasVOP3PInsts() ||
OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise);
+ return BaseT::getArithmeticReductionCost(Opcode, Ty, IsPairwise, CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getFullRateInstrCost();
}
-int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
- bool IsPairwise,
- bool IsUnsigned) {
+int GCNTTIImpl::getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
EVT OrigTy = TLI->getValueType(DL, Ty);
// Computes cost on targets that have packed math instructions (which support
@@ -542,7 +665,8 @@ int GCNTTIImpl::getMinMaxReductionCost(Type *Ty, Type *CondTy,
if (IsPairwise ||
!ST->hasVOP3PInsts() ||
OrigTy.getScalarSizeInBits() != 16)
- return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned);
+ return BaseT::getMinMaxReductionCost(Ty, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
return LT.first * getHalfRateInstrCost();
@@ -573,8 +697,6 @@ int GCNTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
}
}
-
-
static bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -601,6 +723,58 @@ static bool isArgPassedInSGPR(const Argument *A) {
}
}
+/// Analyze if the results of inline asm are divergent. If \p Indices is empty,
+/// this is analyzing the collective result of all output registers. Otherwise,
+/// this queries only the given result index, for asm that returns multiple
+/// registers in a struct.
+bool GCNTTIImpl::isInlineAsmSourceOfDivergence(
+ const CallInst *CI, ArrayRef<unsigned> Indices) const {
+ // TODO: Handle complex extract indices
+ if (Indices.size() > 1)
+ return true;
+
+ const DataLayout &DL = CI->getModule()->getDataLayout();
+ const SIRegisterInfo *TRI = ST->getRegisterInfo();
+ TargetLowering::AsmOperandInfoVector TargetConstraints =
+ TLI->ParseConstraints(DL, ST->getRegisterInfo(), *CI);
+
+ const int TargetOutputIdx = Indices.empty() ? -1 : Indices[0];
+
+ int OutputIdx = 0;
+ for (auto &TC : TargetConstraints) {
+ if (TC.Type != InlineAsm::isOutput)
+ continue;
+
+ // Skip outputs we don't care about.
+ if (TargetOutputIdx != -1 && TargetOutputIdx != OutputIdx++)
+ continue;
+
+ TLI->ComputeConstraintToUse(TC, SDValue());
+
+ Register AssignedReg;
+ const TargetRegisterClass *RC;
+ std::tie(AssignedReg, RC) = TLI->getRegForInlineAsmConstraint(
+ TRI, TC.ConstraintCode, TC.ConstraintVT);
+ if (AssignedReg) {
+ // FIXME: This is a workaround for getRegForInlineAsmConstraint
+ // returning VS_32
+ RC = TRI->getPhysRegClass(AssignedReg);
+ }
+
+ // For AGPR constraints, null is returned on subtargets without AGPRs, so
+ // conservatively assume divergence when the class is null.
+ if (!RC || !TRI->isSGPRClass(RC))
+ return true;
+ }
+
+ return false;
+}
+
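To make the constraint handling concrete, the expected outcomes look roughly like this; the constraint letters below are the usual AMDGPU inline-asm constraints and are an assumption of this note, not taken from the patch:

  // "=s" output -> SGPR class                              -> uniform, not a source of divergence
  // "=v" output -> VGPR class                              -> divergent
  // "=a" output on a subtarget without AGPRs -> null class -> treated as divergent
  // ExtractValue with more than one index                  -> conservatively divergent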
+/// \returns true if the new GPU divergence analysis is enabled.
+bool GCNTTIImpl::useGPUDivergenceAnalysis() const {
+ return !UseLegacyDA;
+}
+
/// \returns true if the result of the value could potentially be
/// different across workitems in a wavefront.
bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
@@ -628,7 +802,14 @@ bool GCNTTIImpl::isSourceOfDivergence(const Value *V) const {
return AMDGPU::isIntrinsicSourceOfDivergence(Intrinsic->getIntrinsicID());
// Assume all function calls are a source of divergence.
- if (isa<CallInst>(V) || isa<InvokeInst>(V))
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (CI->isInlineAsm())
+ return isInlineAsmSourceOfDivergence(CI);
+ return true;
+ }
+
+ // Assume all function calls are a source of divergence.
+ if (isa<InvokeInst>(V))
return true;
return false;
@@ -643,9 +824,44 @@ bool GCNTTIImpl::isAlwaysUniform(const Value *V) const {
case Intrinsic::amdgcn_readlane:
case Intrinsic::amdgcn_icmp:
case Intrinsic::amdgcn_fcmp:
+ case Intrinsic::amdgcn_ballot:
+ case Intrinsic::amdgcn_if_break:
return true;
}
}
+
+ if (const CallInst *CI = dyn_cast<CallInst>(V)) {
+ if (CI->isInlineAsm())
+ return !isInlineAsmSourceOfDivergence(CI);
+ return false;
+ }
+
+ const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V);
+ if (!ExtValue)
+ return false;
+
+ const CallInst *CI = dyn_cast<CallInst>(ExtValue->getOperand(0));
+ if (!CI)
+ return false;
+
+ if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(CI)) {
+ switch (Intrinsic->getIntrinsicID()) {
+ default:
+ return false;
+ case Intrinsic::amdgcn_if:
+ case Intrinsic::amdgcn_else: {
+ ArrayRef<unsigned> Indices = ExtValue->getIndices();
+ return Indices.size() == 1 && Indices[0] == 1;
+ }
+ }
+ }
+
+ // If we have inline asm returning mixed SGPR and VGPR results, we inferred
+ // the overall struct return to be divergent. Override that here for the
+ // case where the component being extracted is an SGPR.
+ if (CI->isInlineAsm())
+ return !isInlineAsmSourceOfDivergence(CI, ExtValue->getIndices());
+
return false;
}
@@ -666,8 +882,9 @@ bool GCNTTIImpl::collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
}
}
-bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
- IntrinsicInst *II, Value *OldV, Value *NewV) const {
+Value *GCNTTIImpl::rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
+ Value *OldV,
+ Value *NewV) const {
auto IntrID = II->getIntrinsicID();
switch (IntrID) {
case Intrinsic::amdgcn_atomic_inc:
@@ -677,7 +894,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
case Intrinsic::amdgcn_ds_fmax: {
const ConstantInt *IsVolatile = cast<ConstantInt>(II->getArgOperand(4));
if (!IsVolatile->isZero())
- return false;
+ return nullptr;
Module *M = II->getParent()->getParent()->getParent();
Type *DestTy = II->getType();
Type *SrcTy = NewV->getType();
@@ -685,7 +902,7 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
Intrinsic::getDeclaration(M, II->getIntrinsicID(), {DestTy, SrcTy});
II->setArgOperand(0, NewV);
II->setCalledFunction(NewDecl);
- return true;
+ return II;
}
case Intrinsic::amdgcn_is_shared:
case Intrinsic::amdgcn_is_private: {
@@ -695,20 +912,49 @@ bool GCNTTIImpl::rewriteIntrinsicWithAddressSpace(
LLVMContext &Ctx = NewV->getType()->getContext();
ConstantInt *NewVal = (TrueAS == NewAS) ?
ConstantInt::getTrue(Ctx) : ConstantInt::getFalse(Ctx);
- II->replaceAllUsesWith(NewVal);
- II->eraseFromParent();
- return true;
+ return NewVal;
+ }
+ case Intrinsic::ptrmask: {
+ unsigned OldAS = OldV->getType()->getPointerAddressSpace();
+ unsigned NewAS = NewV->getType()->getPointerAddressSpace();
+ Value *MaskOp = II->getArgOperand(1);
+ Type *MaskTy = MaskOp->getType();
+
+ bool DoTruncate = false;
+ if (!getTLI()->isNoopAddrSpaceCast(OldAS, NewAS)) {
+ // All valid 64-bit to 32-bit casts work by chopping off the high
+ // bits. Any mask that only clears the low bits will apply equally in the new
+ // address space.
+ if (DL.getPointerSizeInBits(OldAS) != 64 ||
+ DL.getPointerSizeInBits(NewAS) != 32)
+ return nullptr;
+
+ // TODO: Do we need to thread more context in here?
+ KnownBits Known = computeKnownBits(MaskOp, DL, 0, nullptr, II);
+ if (Known.countMinLeadingOnes() < 32)
+ return nullptr;
+
+ DoTruncate = true;
+ }
+
+ IRBuilder<> B(II);
+ if (DoTruncate) {
+ MaskTy = B.getInt32Ty();
+ MaskOp = B.CreateTrunc(MaskOp, MaskTy);
+ }
+
+ return B.CreateIntrinsic(Intrinsic::ptrmask, {NewV->getType(), MaskTy},
+ {NewV, MaskOp});
}
default:
- return false;
+ return nullptr;
}
}
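A worked illustration of the ptrmask case above, assuming a 64-bit flat pointer being rewritten to a 32-bit LDS pointer (the concrete mask values are examples, not from the patch):

  // Mask = 0xFFFFFFFFFFFFFFF0: countMinLeadingOnes() == 60 >= 32, so the mask is
  //   truncated to i32 0xFFFFFFF0 and a 32-bit llvm.ptrmask is emitted on NewV.
  // Mask = 0x00000000FFFFFFF0: fewer than 32 leading ones, so the rewrite bails
  //   out (returns nullptr); the mask touches the high half, which the 64-to-32
  //   bit cast would otherwise simply drop.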
-unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *VT,
+ int Index, VectorType *SubTp) {
if (ST->hasVOP3PInsts()) {
- VectorType *VT = cast<VectorType>(Tp);
- if (VT->getNumElements() == 2 &&
+ if (cast<FixedVectorType>(VT)->getNumElements() == 2 &&
DL.getTypeSizeInBits(VT->getElementType()) == 16) {
// With op_sel, VOP3P instructions can freely access the low half or high
// half of a register, so any swizzle is free.
@@ -724,7 +970,7 @@ unsigned GCNTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, VT, Index, SubTp);
}
bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
@@ -745,8 +991,8 @@ bool GCNTTIImpl::areInlineCompatible(const Function *Caller,
// FIXME: dx10_clamp can just take the caller setting, but there seems to be
// no way to support merging for backend-defined attributes.
- AMDGPU::SIModeRegisterDefaults CallerMode(*Caller, *CallerST);
- AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee, *CalleeST);
+ AMDGPU::SIModeRegisterDefaults CallerMode(*Caller);
+ AMDGPU::SIModeRegisterDefaults CalleeMode(*Callee);
return CallerMode.isInlineCompatible(CalleeMode);
}
@@ -755,117 +1001,9 @@ void GCNTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
CommonTTI.getUnrollingPreferences(L, SE, UP);
}
-unsigned GCNTTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
- const Instruction *I = dyn_cast<Instruction>(U);
- if (!I)
- return BaseT::getUserCost(U, Operands);
-
- // Estimate different operations to be optimized out
- switch (I->getOpcode()) {
- case Instruction::ExtractElement: {
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(1));
- unsigned Idx = -1;
- if (CI)
- Idx = CI->getZExtValue();
- return getVectorInstrCost(I->getOpcode(), I->getOperand(0)->getType(), Idx);
- }
- case Instruction::InsertElement: {
- ConstantInt *CI = dyn_cast<ConstantInt>(I->getOperand(2));
- unsigned Idx = -1;
- if (CI)
- Idx = CI->getZExtValue();
- return getVectorInstrCost(I->getOpcode(), I->getType(), Idx);
- }
- case Instruction::Call: {
- if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(U)) {
- SmallVector<Value *, 4> Args(II->arg_operands());
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(II))
- FMF = FPMO->getFastMathFlags();
- return getIntrinsicInstrCost(II->getIntrinsicID(), II->getType(), Args,
- FMF);
- } else {
- return BaseT::getUserCost(U, Operands);
- }
- }
- case Instruction::ShuffleVector: {
- const ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(I);
- Type *Ty = Shuffle->getType();
- Type *SrcTy = Shuffle->getOperand(0)->getType();
-
- // TODO: Identify and add costs for insert subvector, etc.
- int SubIndex;
- if (Shuffle->isExtractSubvectorMask(SubIndex))
- return getShuffleCost(TTI::SK_ExtractSubvector, SrcTy, SubIndex, Ty);
-
- if (Shuffle->changesLength())
- return BaseT::getUserCost(U, Operands);
-
- if (Shuffle->isIdentity())
- return 0;
-
- if (Shuffle->isReverse())
- return getShuffleCost(TTI::SK_Reverse, Ty, 0, nullptr);
-
- if (Shuffle->isSelect())
- return getShuffleCost(TTI::SK_Select, Ty, 0, nullptr);
-
- if (Shuffle->isTranspose())
- return getShuffleCost(TTI::SK_Transpose, Ty, 0, nullptr);
-
- if (Shuffle->isZeroEltSplat())
- return getShuffleCost(TTI::SK_Broadcast, Ty, 0, nullptr);
-
- if (Shuffle->isSingleSource())
- return getShuffleCost(TTI::SK_PermuteSingleSrc, Ty, 0, nullptr);
-
- return getShuffleCost(TTI::SK_PermuteTwoSrc, Ty, 0, nullptr);
- }
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast:
- case Instruction::AddrSpaceCast: {
- return getCastInstrCost(I->getOpcode(), I->getType(),
- I->getOperand(0)->getType(), I);
- }
- case Instruction::Add:
- case Instruction::FAdd:
- case Instruction::Sub:
- case Instruction::FSub:
- case Instruction::Mul:
- case Instruction::FMul:
- case Instruction::UDiv:
- case Instruction::SDiv:
- case Instruction::FDiv:
- case Instruction::URem:
- case Instruction::SRem:
- case Instruction::FRem:
- case Instruction::Shl:
- case Instruction::LShr:
- case Instruction::AShr:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::FNeg: {
- return getArithmeticInstrCost(I->getOpcode(), I->getType(),
- TTI::OK_AnyValue, TTI::OK_AnyValue,
- TTI::OP_None, TTI::OP_None, Operands, I);
- }
- default:
- break;
- }
-
- return BaseT::getUserCost(U, Operands);
+void GCNTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ CommonTTI.getPeelingPreferences(L, SE, PP);
}
unsigned R600TTIImpl::getHardwareNumberOfRegisters(bool Vec) const {
@@ -903,7 +1041,7 @@ unsigned R600TTIImpl::getLoadStoreVecRegBitWidth(unsigned AddrSpace) const {
}
bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ Align Alignment,
unsigned AddrSpace) const {
// We allow vectorization of flat stores, even though we may need to decompose
// them later if they may access private memory. We don't have enough context
@@ -912,13 +1050,13 @@ bool R600TTIImpl::isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
}
bool R600TTIImpl::isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
bool R600TTIImpl::isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeMemChain(ChainSizeInBytes, Alignment, AddrSpace);
}
@@ -932,14 +1070,18 @@ unsigned R600TTIImpl::getMaxInterleaveFactor(unsigned VF) {
return 8;
}
-unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode) {
+unsigned R600TTIImpl::getCFInstrCost(unsigned Opcode,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency)
+ return Opcode == Instruction::PHI ? 0 : 1;
+
// XXX - For some reason this isn't called for switch.
switch (Opcode) {
case Instruction::Br:
case Instruction::Ret:
return 10;
default:
- return BaseT::getCFInstrCost(Opcode);
+ return BaseT::getCFInstrCost(Opcode, CostKind);
}
}
@@ -970,3 +1112,8 @@ void R600TTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
CommonTTI.getUnrollingPreferences(L, SE, UP);
}
+
+void R600TTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ CommonTTI.getPeelingPreferences(L, SE, PP);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
index 0b48f9f602b7..3364a9bcaccb 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.h
@@ -61,6 +61,9 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
};
class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
@@ -70,10 +73,11 @@ class GCNTTIImpl final : public BasicTTIImplBase<GCNTTIImpl> {
friend BaseT;
const GCNSubtarget *ST;
- const AMDGPUTargetLowering *TLI;
+ const SITargetLowering *TLI;
AMDGPUTTIImpl CommonTTI;
bool IsGraphicsShader;
bool HasFP32Denormals;
+ unsigned MaxVGPRs;
const FeatureBitset InlineFeatureIgnoreList = {
// Codegen control options which don't matter.
@@ -133,13 +137,21 @@ public:
TLI(ST->getTargetLowering()),
CommonTTI(TM, F),
IsGraphicsShader(AMDGPU::isShader(F.getCallingConv())),
- HasFP32Denormals(ST->hasFP32Denormals(F)) { }
+ HasFP32Denormals(AMDGPU::SIModeRegisterDefaults(F).allFP32Denormals()),
+ MaxVGPRs(ST->getMaxNumVGPRs(
+ std::max(ST->getWavesPerEU(F).first,
+ ST->getWavesPerEUForWorkGroup(
+ ST->getFlatWorkGroupSizes(F).second)))) {}
bool hasBranchDivergence() { return true; }
+ bool useGPUDivergenceAnalysis() const;
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth) {
assert(isPowerOf2_32(TyWidth) && "Ty width must be power of 2");
return TTI::PSK_FastHardware;
@@ -147,6 +159,7 @@ public:
unsigned getHardwareNumberOfRegisters(bool Vector) const;
unsigned getNumberOfRegisters(bool Vector) const;
+ unsigned getNumberOfRegisters(unsigned RCID) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadVectorFactor(unsigned VF, unsigned LoadSize,
@@ -157,22 +170,30 @@ public:
VectorType *VecTy) const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
- bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
-
+ Type *getMemcpyLoopLoweringType(LLVMContext &Context, Value *Length,
+ unsigned SrcAddrSpace, unsigned DestAddrSpace,
+ unsigned SrcAlign, unsigned DestAlign) const;
+
+ void getMemcpyLoopResidualLoweringType(SmallVectorImpl<Type *> &OpsOut,
+ LLVMContext &Context,
+ unsigned RemainingBytes,
+ unsigned SrcAddrSpace,
+ unsigned DestAddrSpace,
+ unsigned SrcAlign,
+ unsigned DestAlign) const;
unsigned getMaxInterleaveFactor(unsigned VF);
bool getTgtMemIntrinsic(IntrinsicInst *Inst, MemIntrinsicInfo &Info) const;
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -180,7 +201,10 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- unsigned getCFInstrCost(unsigned Opcode);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
+
+ bool isInlineAsmSourceOfDivergence(const CallInst *CI,
+ ArrayRef<unsigned> Indices = {}) const;
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
bool isSourceOfDivergence(const Value *V) const;
@@ -196,13 +220,13 @@ public:
bool collectFlatAddressOperands(SmallVectorImpl<int> &OpIndexes,
Intrinsic::ID IID) const;
- bool rewriteIntrinsicWithAddressSpace(IntrinsicInst *II,
- Value *OldV, Value *NewV) const;
+ Value *rewriteIntrinsicWithAddressSpace(IntrinsicInst *II, Value *OldV,
+ Value *NewV) const;
unsigned getVectorSplitCost() { return 0; }
- unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp);
+ unsigned getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
bool areInlineCompatible(const Function *Caller,
const Function *Callee) const;
@@ -211,23 +235,17 @@ public:
int getInlinerVectorBonusPercent() { return 0; }
- int getArithmeticReductionCost(unsigned Opcode,
- Type *Ty,
- bool IsPairwise);
- template <typename T>
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<T *> Args, FastMathFlags FMF,
- unsigned VF);
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF = 1);
- int getMinMaxReductionCost(Type *Ty, Type *CondTy,
- bool IsPairwiseForm,
- bool IsUnsigned);
- unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+ int getArithmeticReductionCost(
+ unsigned Opcode,
+ VectorType *Ty,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
+
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ int getMinMaxReductionCost(
+ VectorType *Ty, VectorType *CondTy, bool IsPairwiseForm, bool IsUnsigned,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput);
};
class R600TTIImpl final : public BasicTTIImplBase<R600TTIImpl> {
@@ -245,28 +263,28 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()),
ST(static_cast<const R600Subtarget*>(TM->getSubtargetImpl(F))),
TLI(ST->getTargetLowering()),
- CommonTTI(TM, F) {}
+ CommonTTI(TM, F) {}
const R600Subtarget *getST() const { return ST; }
const AMDGPUTargetLowering *getTLI() const { return TLI; }
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
unsigned getHardwareNumberOfRegisters(bool Vec) const;
unsigned getNumberOfRegisters(bool Vec) const;
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getMinVectorRegisterBitWidth() const;
unsigned getLoadStoreVecRegBitWidth(unsigned AddrSpace) const;
- bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, unsigned Alignment,
+ bool isLegalToVectorizeMemChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
- bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const;
unsigned getMaxInterleaveFactor(unsigned VF);
- unsigned getCFInstrCost(unsigned Opcode);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
int getVectorInstrCost(unsigned Opcode, Type *ValTy, unsigned Index);
};
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
index 191f603a66d6..418296684d76 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUUnifyDivergentExitNodes.cpp
@@ -34,6 +34,7 @@
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Intrinsics.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
@@ -117,24 +118,58 @@ static bool isUniformlyReached(const LegacyDivergenceAnalysis &DA,
return true;
}
+static void removeDoneExport(Function &F) {
+ ConstantInt *BoolFalse = ConstantInt::getFalse(F.getContext());
+ for (BasicBlock &BB : F) {
+ for (Instruction &I : BB) {
+ if (IntrinsicInst *Intrin = llvm::dyn_cast<IntrinsicInst>(&I)) {
+ if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp) {
+ Intrin->setArgOperand(6, BoolFalse); // done
+ } else if (Intrin->getIntrinsicID() == Intrinsic::amdgcn_exp_compr) {
+ Intrin->setArgOperand(4, BoolFalse); // done
+ }
+ }
+ }
+ }
+}
+
static BasicBlock *unifyReturnBlockSet(Function &F,
ArrayRef<BasicBlock *> ReturningBlocks,
+ bool InsertExport,
const TargetTransformInfo &TTI,
StringRef Name) {
// Otherwise, we need to insert a new basic block into the function, add a PHI
// node (if the function returns values), and convert all of the return
// instructions into unconditional branches.
BasicBlock *NewRetBlock = BasicBlock::Create(F.getContext(), Name, &F);
+ IRBuilder<> B(NewRetBlock);
+
+ if (InsertExport) {
+ // Ensure that there's only one "done" export in the shader by removing the
+ // "done" bit set on the original final export. More than one "done" export
+ // can lead to undefined behavior.
+ removeDoneExport(F);
+
+ Value *Undef = UndefValue::get(B.getFloatTy());
+ B.CreateIntrinsic(Intrinsic::amdgcn_exp, { B.getFloatTy() },
+ {
+ B.getInt32(9), // target, SQ_EXP_NULL
+ B.getInt32(0), // enabled channels
+ Undef, Undef, Undef, Undef, // values
+ B.getTrue(), // done
+ B.getTrue(), // valid mask
+ });
+ }
PHINode *PN = nullptr;
if (F.getReturnType()->isVoidTy()) {
- ReturnInst::Create(F.getContext(), nullptr, NewRetBlock);
+ B.CreateRetVoid();
} else {
// If the function doesn't return void... add a PHI node to the block...
- PN = PHINode::Create(F.getReturnType(), ReturningBlocks.size(),
- "UnifiedRetVal");
- NewRetBlock->getInstList().push_back(PN);
- ReturnInst::Create(F.getContext(), PN, NewRetBlock);
+ PN = B.CreatePHI(F.getReturnType(), ReturningBlocks.size(),
+ "UnifiedRetVal");
+ assert(!InsertExport);
+ B.CreateRet(PN);
}
// Loop over all of the blocks, replacing the return instruction with an
@@ -160,7 +195,11 @@ static BasicBlock *unifyReturnBlockSet(Function &F,
bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
auto &PDT = getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
- if (PDT.getRoots().size() <= 1)
+
+ // If there's only one exit, we don't need to do anything, unless this is a
+ // pixel shader and that exit is an infinite loop, since we still have to
+ // insert an export in that case.
+ if (PDT.root_size() <= 1 && F.getCallingConv() != CallingConv::AMDGPU_PS)
return false;
LegacyDivergenceAnalysis &DA = getAnalysis<LegacyDivergenceAnalysis>();
@@ -168,15 +207,21 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// Loop over all of the blocks in a function, tracking all of the blocks that
// return.
SmallVector<BasicBlock *, 4> ReturningBlocks;
+ SmallVector<BasicBlock *, 4> UniformlyReachedRetBlocks;
SmallVector<BasicBlock *, 4> UnreachableBlocks;
// Dummy return block for infinite loop.
BasicBlock *DummyReturnBB = nullptr;
- for (BasicBlock *BB : PDT.getRoots()) {
+ bool InsertExport = false;
+
+ bool Changed = false;
+ for (BasicBlock *BB : PDT.roots()) {
if (isa<ReturnInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
ReturningBlocks.push_back(BB);
+ else
+ UniformlyReachedRetBlocks.push_back(BB);
} else if (isa<UnreachableInst>(BB->getTerminator())) {
if (!isUniformlyReached(DA, *BB))
UnreachableBlocks.push_back(BB);
@@ -188,6 +233,36 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
"DummyReturnBlock", &F);
Type *RetTy = F.getReturnType();
Value *RetVal = RetTy->isVoidTy() ? nullptr : UndefValue::get(RetTy);
+
+ // For pixel shaders, the producer guarantees that an export is
+ // executed before each return instruction. However, if there is an
+ // infinite loop and we insert a return ourselves, we need to uphold
+ // that guarantee by inserting a null export. This can happen e.g. in
+ // an infinite loop with kill instructions, which is supposed to
+ // terminate. However, we don't need to do this if there is a non-void
+ // return value, since then there is an epilog afterwards which will
+ // still export.
+ //
+ // Note: In the case where only some threads enter the infinite loop,
+ // this can result in the null export happening redundantly after the
+ // original exports. However, the last "real" export happens after all
+ // the threads that didn't enter an infinite loop have converged, which
+ // means that the only extra threads to execute the null export are
+ // threads that entered the infinite loop, and they could only have
+ // exited by being killed, which sets their exec bit to 0.
+ // Therefore, unless there's an actual infinite loop, which can have
+ // invalid results, or there's a kill after the last export, which we
+ // assume the frontend won't do, this export will have the same exec
+ // mask as the last "real" export, and therefore the valid mask will be
+ // overwritten with the same value and will still be correct. Also,
+ // even though this forces an extra unnecessary export wait, we assume
+ // that this happens rarely enough in practice that we don't have to
+ // worry about performance.
+ if (F.getCallingConv() == CallingConv::AMDGPU_PS &&
+ RetTy->isVoidTy()) {
+ InsertExport = true;
+ }
+
ReturnInst::Create(F.getContext(), RetVal, DummyReturnBB);
ReturningBlocks.push_back(DummyReturnBB);
}
@@ -206,6 +281,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BB->getTerminator()->eraseFromParent();
BranchInst::Create(TransitionBB, DummyReturnBB, BoolTrue, BB);
}
+ Changed = true;
}
}
@@ -224,6 +300,7 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
BB->getTerminator()->eraseFromParent();
BranchInst::Create(UnreachableBlock, BB);
}
+ Changed = true;
}
if (!ReturningBlocks.empty()) {
@@ -247,19 +324,32 @@ bool AMDGPUUnifyDivergentExitNodes::runOnFunction(Function &F) {
// actually reached here.
ReturnInst::Create(F.getContext(), RetVal, UnreachableBlock);
ReturningBlocks.push_back(UnreachableBlock);
+ Changed = true;
}
}
// Now handle return blocks.
if (ReturningBlocks.empty())
- return false; // No blocks return
+ return Changed; // No blocks return
- if (ReturningBlocks.size() == 1)
- return false; // Already has a single return block
+ if (ReturningBlocks.size() == 1 && !InsertExport)
+ return Changed; // Already has a single return block
const TargetTransformInfo &TTI
= getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- unifyReturnBlockSet(F, ReturningBlocks, TTI, "UnifiedReturnBlock");
+ // Unify returning blocks. If we are going to insert the export, it is also
+ // necessary to include blocks that are uniformly reached, because in addition
+ // to inserting the export, the "done" bits on existing exports will be cleared,
+ // and we do not want to end up with the normal export in a non-unified,
+ // uniformly reached block with the "done" bit cleared.
+ auto BlocksToUnify = std::move(ReturningBlocks);
+ if (InsertExport) {
+ BlocksToUnify.insert(BlocksToUnify.end(), UniformlyReachedRetBlocks.begin(),
+ UniformlyReachedRetBlocks.end());
+ }
+
+ unifyReturnBlockSet(F, BlocksToUnify, InsertExport, TTI,
+ "UnifiedReturnBlock");
return true;
}
diff --git a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
index f3aa1a582368..013b7a0cf25d 100644
--- a/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
+++ b/llvm/lib/Target/AMDGPU/AsmParser/AMDGPUAsmParser.cpp
@@ -163,6 +163,7 @@ public:
ImmTyUNorm,
ImmTyDA,
ImmTyR128A16,
+ ImmTyA16,
ImmTyLWE,
ImmTyExpTgt,
ImmTyExpCompr,
@@ -277,6 +278,7 @@ public:
isRegClass(AMDGPU::VReg_96RegClassID) ||
isRegClass(AMDGPU::VReg_128RegClassID) ||
isRegClass(AMDGPU::VReg_160RegClassID) ||
+ isRegClass(AMDGPU::VReg_192RegClassID) ||
isRegClass(AMDGPU::VReg_256RegClassID) ||
isRegClass(AMDGPU::VReg_512RegClassID) ||
isRegClass(AMDGPU::VReg_1024RegClassID);
@@ -315,6 +317,7 @@ public:
bool isUNorm() const { return isImmTy(ImmTyUNorm); }
bool isDA() const { return isImmTy(ImmTyDA); }
bool isR128A16() const { return isImmTy(ImmTyR128A16); }
+ bool isGFX10A16() const { return isImmTy(ImmTyA16); }
bool isLWE() const { return isImmTy(ImmTyLWE); }
bool isOff() const { return isImmTy(ImmTyOff); }
bool isExpTgt() const { return isImmTy(ImmTyExpTgt); }
@@ -486,7 +489,7 @@ public:
}
bool isVSrcB16() const {
- return isVCSrcF16() || isLiteralImm(MVT::i16);
+ return isVCSrcB16() || isLiteralImm(MVT::i16);
}
bool isVSrcV2B16() const {
@@ -654,7 +657,7 @@ public:
bool isSendMsg() const;
bool isSwizzle() const;
bool isSMRDOffset8() const;
- bool isSMRDOffset20() const;
+ bool isSMEMOffset() const;
bool isSMRDLiteralOffset() const;
bool isDPP8() const;
bool isDPPCtrl() const;
@@ -847,6 +850,7 @@ public:
case ImmTyUNorm: OS << "UNorm"; break;
case ImmTyDA: OS << "DA"; break;
case ImmTyR128A16: OS << "R128A16"; break;
+ case ImmTyA16: OS << "A16"; break;
case ImmTyLWE: OS << "LWE"; break;
case ImmTyOff: OS << "Off"; break;
case ImmTyExpTgt: OS << "ExpTgt"; break;
@@ -1062,17 +1066,20 @@ private:
bool AddNextRegisterToList(unsigned& Reg, unsigned& RegWidth,
RegisterKind RegKind, unsigned Reg1);
- bool ParseAMDGPURegister(RegisterKind& RegKind, unsigned& Reg,
- unsigned& RegNum, unsigned& RegWidth);
- unsigned ParseRegularReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth);
- unsigned ParseSpecialReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth);
- unsigned ParseRegList(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth);
+ bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ bool RestoreOnFailure = false);
+ bool ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens);
+ unsigned ParseRegularReg(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens);
+ unsigned ParseSpecialReg(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens);
+ unsigned ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth, SmallVectorImpl<AsmToken> &Tokens);
bool ParseRegRange(unsigned& Num, unsigned& Width);
unsigned getRegularReg(RegisterKind RegKind,
unsigned RegNum,
@@ -1157,6 +1164,10 @@ public:
return AMDGPU::hasPackedD16(getSTI());
}
+ bool hasGFX10A16() const {
+ return AMDGPU::hasGFX10A16(getSTI());
+ }
+
bool isSI() const {
return AMDGPU::isSI(getSTI());
}
@@ -1177,6 +1188,10 @@ public:
return AMDGPU::isGFX10(getSTI());
}
+ bool isGFX10_BEncoding() const {
+ return AMDGPU::isGFX10_BEncoding(getSTI());
+ }
+
bool hasInv2PiInlineImm() const {
return getFeatureBits()[AMDGPU::FeatureInv2PiInlineImm];
}
@@ -1226,8 +1241,12 @@ public:
bool isForcedSDWA() const { return ForcedSDWA; }
ArrayRef<unsigned> getMatchedVariants() const;
- std::unique_ptr<AMDGPUOperand> parseRegister();
+ std::unique_ptr<AMDGPUOperand> parseRegister(bool RestoreOnFailure = false);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
+ bool RestoreOnFailure);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
unsigned checkTargetMatchPredicate(MCInst &Inst) override;
unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
unsigned Kind) override;
@@ -1311,9 +1330,11 @@ private:
void errorExpTgt();
OperandMatchResultTy parseExpTgtImpl(StringRef Str, uint8_t &Val);
SMLoc getFlatOffsetLoc(const OperandVector &Operands) const;
+ SMLoc getSMEMOffsetLoc(const OperandVector &Operands) const;
bool validateInstruction(const MCInst &Inst, const SMLoc &IDLoc, const OperandVector &Operands);
bool validateFlatOffset(const MCInst &Inst, const OperandVector &Operands);
+ bool validateSMEMOffset(const MCInst &Inst, const OperandVector &Operands);
bool validateSOPLiteral(const MCInst &Inst) const;
bool validateConstantBusLimitations(const MCInst &Inst);
bool validateEarlyClobberLimitations(const MCInst &Inst);
@@ -1329,6 +1350,7 @@ private:
bool validateOpSel(const MCInst &Inst);
bool validateVccOperand(unsigned Reg) const;
bool validateVOP3Literal(const MCInst &Inst) const;
+ bool validateMAIAccWrite(const MCInst &Inst);
unsigned getConstantBusLimit(unsigned Opcode) const;
bool usesConstantBus(const MCInst &Inst, unsigned OpIdx);
bool isInlineConstant(const MCInst &Inst, unsigned OpIdx) const;
@@ -1390,7 +1412,7 @@ public:
AMDGPUOperand::Ptr defaultSLC() const;
AMDGPUOperand::Ptr defaultSMRDOffset8() const;
- AMDGPUOperand::Ptr defaultSMRDOffset20() const;
+ AMDGPUOperand::Ptr defaultSMEMOffset() const;
AMDGPUOperand::Ptr defaultSMRDLiteralOffset() const;
AMDGPUOperand::Ptr defaultFlatOffset() const;
@@ -1524,6 +1546,16 @@ static bool isSafeTruncation(int64_t Val, unsigned Size) {
return isUIntN(Size, Val) || isIntN(Size, Val);
}
+static bool isInlineableLiteralOp16(int64_t Val, MVT VT, bool HasInv2Pi) {
+ if (VT.getScalarType() == MVT::i16) {
+ // FP immediate values are broken.
+ return isInlinableIntLiteral(Val);
+ }
+
+ // f16/v2f16 operands work correctly for all values.
+ return AMDGPU::isInlinableLiteral16(Val, HasInv2Pi);
+}
+
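For illustration (not part of the patch): with this split, i16 operands accept only the integer inline constants, while f16 keeps the full inline-constant set. Roughly:

  // isInlineableLiteralOp16(64,     MVT::i16, HasInv2Pi) -> true   (integer inline range -16..64)
  // isInlineableLiteralOp16(0x3800, MVT::i16, HasInv2Pi) -> false  (0.5 as f16 bits; FP immediates
  //                                                                 are not accepted for i16 here)
  // isInlineableLiteralOp16(0x3800, MVT::f16, HasInv2Pi) -> true   (0.5 is an f16 inline constant)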
bool AMDGPUOperand::isInlinableImm(MVT type) const {
// This is a hack to enable named inline values like
@@ -1555,9 +1587,9 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
return false;
if (type.getScalarSizeInBits() == 16) {
- return AMDGPU::isInlinableLiteral16(
+ return isInlineableLiteralOp16(
static_cast<int16_t>(FPLiteral.bitcastToAPInt().getZExtValue()),
- AsmParser->hasInv2PiInlineImm());
+ type, AsmParser->hasInv2PiInlineImm());
}
// Check if single precision literal is inlinable
@@ -1577,9 +1609,9 @@ bool AMDGPUOperand::isInlinableImm(MVT type) const {
}
if (type.getScalarSizeInBits() == 16) {
- return AMDGPU::isInlinableLiteral16(
+ return isInlineableLiteralOp16(
static_cast<int16_t>(Literal.getLoBits(16).getSExtValue()),
- AsmParser->hasInv2PiInlineImm());
+ type, AsmParser->hasInv2PiInlineImm());
}
return AMDGPU::isInlinableLiteral32(
@@ -1901,6 +1933,7 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
case 3: return AMDGPU::VReg_96RegClassID;
case 4: return AMDGPU::VReg_128RegClassID;
case 5: return AMDGPU::VReg_160RegClassID;
+ case 6: return AMDGPU::VReg_192RegClassID;
case 8: return AMDGPU::VReg_256RegClassID;
case 16: return AMDGPU::VReg_512RegClassID;
case 32: return AMDGPU::VReg_1024RegClassID;
@@ -1919,7 +1952,10 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
default: return -1;
case 1: return AMDGPU::SGPR_32RegClassID;
case 2: return AMDGPU::SGPR_64RegClassID;
+ case 3: return AMDGPU::SGPR_96RegClassID;
case 4: return AMDGPU::SGPR_128RegClassID;
+ case 5: return AMDGPU::SGPR_160RegClassID;
+ case 6: return AMDGPU::SGPR_192RegClassID;
case 8: return AMDGPU::SGPR_256RegClassID;
case 16: return AMDGPU::SGPR_512RegClassID;
}
@@ -1928,7 +1964,11 @@ static int getRegClass(RegisterKind Is, unsigned RegWidth) {
default: return -1;
case 1: return AMDGPU::AGPR_32RegClassID;
case 2: return AMDGPU::AReg_64RegClassID;
+ case 3: return AMDGPU::AReg_96RegClassID;
case 4: return AMDGPU::AReg_128RegClassID;
+ case 5: return AMDGPU::AReg_160RegClassID;
+ case 6: return AMDGPU::AReg_192RegClassID;
+ case 8: return AMDGPU::AReg_256RegClassID;
case 16: return AMDGPU::AReg_512RegClassID;
case 32: return AMDGPU::AReg_1024RegClassID;
}
@@ -1975,12 +2015,13 @@ static unsigned getSpecialRegForName(StringRef RegName) {
.Case("tma_hi", AMDGPU::TMA_HI)
.Case("tba_lo", AMDGPU::TBA_LO)
.Case("tba_hi", AMDGPU::TBA_HI)
+ .Case("pc", AMDGPU::PC_REG)
.Case("null", AMDGPU::SGPR_NULL)
.Default(AMDGPU::NoRegister);
}
bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) {
+ SMLoc &EndLoc, bool RestoreOnFailure) {
auto R = parseRegister();
if (!R) return true;
assert(R->isReg());
@@ -1990,6 +2031,25 @@ bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
return false;
}
+bool AMDGPUAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+}
+
+OperandMatchResultTy AMDGPUAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result =
+ ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ bool PendingErrors = getParser().hasPendingError();
+ getParser().clearPendingErrors();
+ if (PendingErrors)
+ return MatchOperand_ParseFail;
+ if (Result)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
bool AMDGPUAsmParser::AddNextRegisterToList(unsigned &Reg, unsigned &RegWidth,
RegisterKind RegKind, unsigned Reg1) {
switch (RegKind) {
@@ -2166,31 +2226,31 @@ AMDGPUAsmParser::ParseRegRange(unsigned& Num, unsigned& Width) {
return true;
}
-unsigned
-AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth) {
+unsigned AMDGPUAsmParser::ParseSpecialReg(RegisterKind &RegKind,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
assert(isToken(AsmToken::Identifier));
unsigned Reg = getSpecialRegForName(getTokenStr());
if (Reg) {
RegNum = 0;
RegWidth = 1;
RegKind = IS_SPECIAL;
+ Tokens.push_back(getToken());
lex(); // skip register name
}
return Reg;
}
-unsigned
-AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth) {
+unsigned AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
assert(isToken(AsmToken::Identifier));
StringRef RegName = getTokenStr();
const RegInfo *RI = getRegularRegInfo(RegName);
if (!RI)
return AMDGPU::NoRegister;
+ Tokens.push_back(getToken());
lex(); // skip register name
RegKind = RI->Kind;
@@ -2209,10 +2269,9 @@ AMDGPUAsmParser::ParseRegularReg(RegisterKind &RegKind,
return getRegularReg(RegKind, RegNum, RegWidth);
}
-unsigned
-AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
- unsigned &RegNum,
- unsigned &RegWidth) {
+unsigned AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind, unsigned &RegNum,
+ unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
unsigned Reg = AMDGPU::NoRegister;
if (!trySkipToken(AsmToken::LBrac))
@@ -2229,7 +2288,8 @@ AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
RegisterKind NextRegKind;
unsigned NextReg, NextRegNum, NextRegWidth;
- if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth))
+ if (!ParseAMDGPURegister(NextRegKind, NextReg, NextRegNum, NextRegWidth,
+ Tokens))
return AMDGPU::NoRegister;
if (NextRegWidth != 1)
return AMDGPU::NoRegister;
@@ -2248,24 +2308,40 @@ AMDGPUAsmParser::ParseRegList(RegisterKind &RegKind,
return Reg;
}
-bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind,
- unsigned &Reg,
- unsigned &RegNum,
- unsigned &RegWidth) {
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ SmallVectorImpl<AsmToken> &Tokens) {
Reg = AMDGPU::NoRegister;
if (isToken(AsmToken::Identifier)) {
- Reg = ParseSpecialReg(RegKind, RegNum, RegWidth);
+ Reg = ParseSpecialReg(RegKind, RegNum, RegWidth, Tokens);
if (Reg == AMDGPU::NoRegister)
- Reg = ParseRegularReg(RegKind, RegNum, RegWidth);
+ Reg = ParseRegularReg(RegKind, RegNum, RegWidth, Tokens);
} else {
- Reg = ParseRegList(RegKind, RegNum, RegWidth);
+ Reg = ParseRegList(RegKind, RegNum, RegWidth, Tokens);
}
const MCRegisterInfo *TRI = getContext().getRegisterInfo();
return Reg != AMDGPU::NoRegister && subtargetHasRegister(*TRI, Reg);
}
+bool AMDGPUAsmParser::ParseAMDGPURegister(RegisterKind &RegKind, unsigned &Reg,
+ unsigned &RegNum, unsigned &RegWidth,
+ bool RestoreOnFailure) {
+ Reg = AMDGPU::NoRegister;
+
+ SmallVector<AsmToken, 1> Tokens;
+ if (ParseAMDGPURegister(RegKind, Reg, RegNum, RegWidth, Tokens)) {
+ if (RestoreOnFailure) {
+ while (!Tokens.empty()) {
+ getLexer().UnLex(Tokens.pop_back_val());
+ }
+ }
+ return true;
+ }
+ return false;
+}
+
Optional<StringRef>
AMDGPUAsmParser::getGprCountSymbolName(RegisterKind RegKind) {
switch (RegKind) {
@@ -2314,7 +2390,8 @@ bool AMDGPUAsmParser::updateGprCountSymbols(RegisterKind RegKind,
return true;
}
-std::unique_ptr<AMDGPUOperand> AMDGPUAsmParser::parseRegister() {
+std::unique_ptr<AMDGPUOperand>
+AMDGPUAsmParser::parseRegister(bool RestoreOnFailure) {
const auto &Tok = Parser.getTok();
SMLoc StartLoc = Tok.getLoc();
SMLoc EndLoc = Tok.getEndLoc();
@@ -2758,16 +2835,22 @@ bool AMDGPUAsmParser::isInlineConstant(const MCInst &Inst,
return AMDGPU::isInlinableLiteral32(Val, hasInv2PiInlineImm());
case 2: {
const unsigned OperandType = Desc.OpInfo[OpIdx].OperandType;
+ if (OperandType == AMDGPU::OPERAND_REG_IMM_INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_C_INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_INLINE_AC_INT16)
+ return AMDGPU::isInlinableIntLiteral(Val);
+
if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2INT16 ||
- OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2INT16 ||
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16)
+ return AMDGPU::isInlinableIntLiteralV216(Val);
+
+ if (OperandType == AMDGPU::OPERAND_REG_INLINE_C_V2FP16 ||
OperandType == AMDGPU::OPERAND_REG_INLINE_AC_V2FP16 ||
- OperandType == AMDGPU::OPERAND_REG_IMM_V2INT16 ||
- OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16) {
+ OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
return AMDGPU::isInlinableLiteralV216(Val, hasInv2PiInlineImm());
- } else {
- return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
- }
+
+ return AMDGPU::isInlinableLiteral16(Val, hasInv2PiInlineImm());
}
default:
llvm_unreachable("invalid operand size");
@@ -3085,6 +3168,30 @@ bool AMDGPUAsmParser::validateMovrels(const MCInst &Inst) {
return !isSGPR(mc2PseudoReg(Reg), TRI);
}
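+// v_accvgpr_write_b32 only accepts a VGPR or an inline constant as its
+// source, e.g. "v_accvgpr_write_b32 a0, v2" or "v_accvgpr_write_b32 a0, 1";
+// an SGPR source such as "v_accvgpr_write_b32 a0, s2" is diagnosed below.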
+bool AMDGPUAsmParser::validateMAIAccWrite(const MCInst &Inst) {
+
+ const unsigned Opc = Inst.getOpcode();
+
+ if (Opc != AMDGPU::V_ACCVGPR_WRITE_B32_vi)
+ return true;
+
+ const int Src0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::src0);
+ assert(Src0Idx != -1);
+
+ const MCOperand &Src0 = Inst.getOperand(Src0Idx);
+ if (!Src0.isReg())
+ return true;
+
+ auto Reg = Src0.getReg();
+ const MCRegisterInfo *TRI = getContext().getRegisterInfo();
+ if (isSGPR(mc2PseudoReg(Reg), TRI)) {
+ Error(getLoc(), "source operand must be either a VGPR or an inline constant");
+ return false;
+ }
+
+ return true;
+}
+
bool AMDGPUAsmParser::validateMIMGD16(const MCInst &Inst) {
const unsigned Opc = Inst.getOpcode();
@@ -3335,6 +3442,46 @@ bool AMDGPUAsmParser::validateFlatOffset(const MCInst &Inst,
return true;
}
+SMLoc AMDGPUAsmParser::getSMEMOffsetLoc(const OperandVector &Operands) const {
+ for (unsigned i = 1, e = Operands.size(); i != e; ++i) {
+ AMDGPUOperand &Op = ((AMDGPUOperand &)*Operands[i]);
+ if (Op.isSMEMOffset())
+ return Op.getStartLoc();
+ }
+ return getLoc();
+}
+
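+// e.g. a negative offset such as -8 is encodable on gfx9+ (21-bit signed),
+// while VI and the buffer SMEM forms only take a 20-bit unsigned offset.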
+bool AMDGPUAsmParser::validateSMEMOffset(const MCInst &Inst,
+ const OperandVector &Operands) {
+ if (isCI() || isSI())
+ return true;
+
+ uint64_t TSFlags = MII.get(Inst.getOpcode()).TSFlags;
+ if ((TSFlags & SIInstrFlags::SMRD) == 0)
+ return true;
+
+ auto Opcode = Inst.getOpcode();
+ auto OpNum = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::offset);
+ if (OpNum == -1)
+ return true;
+
+ const auto &Op = Inst.getOperand(OpNum);
+ if (!Op.isImm())
+ return true;
+
+ uint64_t Offset = Op.getImm();
+ bool IsBuffer = AMDGPU::getSMEMIsBuffer(Opcode);
+ if (AMDGPU::isLegalSMRDEncodedUnsignedOffset(getSTI(), Offset) ||
+ AMDGPU::isLegalSMRDEncodedSignedOffset(getSTI(), Offset, IsBuffer))
+ return true;
+
+ Error(getSMEMOffsetLoc(Operands),
+ (isVI() || IsBuffer) ? "expected a 20-bit unsigned offset" :
+ "expected a 21-bit signed offset");
+
+ return false;
+}
+
bool AMDGPUAsmParser::validateSOPLiteral(const MCInst &Inst) const {
unsigned Opcode = Inst.getOpcode();
const MCInstrDesc &Desc = MII.get(Opcode);
@@ -3512,6 +3659,12 @@ bool AMDGPUAsmParser::validateInstruction(const MCInst &Inst,
if (!validateFlatOffset(Inst, Operands)) {
return false;
}
+ if (!validateSMEMOffset(Inst, Operands)) {
+ return false;
+ }
+ if (!validateMAIAccWrite(Inst)) {
+ return false;
+ }
return true;
}
@@ -3556,7 +3709,7 @@ bool AMDGPUAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
return true;
}
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
@@ -4307,19 +4460,19 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
if (Size > LocalMemorySize)
return Error(SizeLoc, "size is too large");
- int64_t Align = 4;
+ int64_t Alignment = 4;
if (getLexer().is(AsmToken::Comma)) {
Lex();
SMLoc AlignLoc = getLexer().getLoc();
- if (getParser().parseAbsoluteExpression(Align))
+ if (getParser().parseAbsoluteExpression(Alignment))
return true;
- if (Align < 0 || !isPowerOf2_64(Align))
+ if (Alignment < 0 || !isPowerOf2_64(Alignment))
return Error(AlignLoc, "alignment must be a power of two");
// Alignment larger than the size of LDS is possible in theory, as long
// as the linker manages to place the symbol at address 0, but we do want
// to make sure the alignment fits nicely into a 32-bit integer.
- if (Align >= 1u << 31)
+ if (Alignment >= 1u << 31)
return Error(AlignLoc, "alignment is too large");
}
@@ -4331,7 +4484,7 @@ bool AMDGPUAsmParser::ParseDirectiveAMDGPULDS() {
if (!Symbol->isUndefined())
return Error(NameLoc, "invalid symbol redefinition");
- getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align);
+ getTargetStreamer().emitAMDGPULDS(Symbol, Size, Align(Alignment));
return false;
}
@@ -4650,9 +4803,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
case AsmToken::Identifier: {
StringRef Tok = Parser.getTok().getString();
if (Tok == Name) {
- if (Tok == "r128" && isGFX9())
+ if (Tok == "r128" && !hasMIMG_R128())
Error(S, "r128 modifier is not supported on this GPU");
- if (Tok == "a16" && !isGFX9() && !isGFX10())
+ if (Tok == "a16" && !isGFX9() && !hasGFX10A16())
Error(S, "a16 modifier is not supported on this GPU");
Bit = 1;
Parser.Lex();
@@ -4672,6 +4825,9 @@ AMDGPUAsmParser::parseNamedBit(const char *Name, OperandVector &Operands,
if (!isGFX10() && ImmTy == AMDGPUOperand::ImmTyDLC)
return MatchOperand_ParseFail;
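+  // gfx9 has no separate a16 operand: the modifier reuses the r128/a16
+  // operand, so remap it here; gfx10 keeps a distinct ImmTyA16 (see cvtMIMG).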
+ if (isGFX9() && ImmTy == AMDGPUOperand::ImmTyA16)
+ ImmTy = AMDGPUOperand::ImmTyR128A16;
+
Operands.push_back(AMDGPUOperand::CreateImm(this, Bit, S, ImmTy));
return MatchOperand_Success;
}
@@ -5987,6 +6143,8 @@ void AMDGPUAsmParser::cvtMIMG(MCInst &Inst, const OperandVector &Operands,
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyGLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTySLC);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyR128A16);
+ if (IsGFX10)
+ addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyA16);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyTFE);
addOptionalImmOperand(Inst, Operands, OptionalIdx, AMDGPUOperand::ImmTyLWE);
if (!IsGFX10)
@@ -6006,8 +6164,8 @@ bool AMDGPUOperand::isSMRDOffset8() const {
return isImm() && isUInt<8>(getImm());
}
-bool AMDGPUOperand::isSMRDOffset20() const {
- return isImm() && isUInt<20>(getImm());
+bool AMDGPUOperand::isSMEMOffset() const {
+ return isImm(); // Offset range is checked later by validator.
}
bool AMDGPUOperand::isSMRDLiteralOffset() const {
@@ -6020,7 +6178,7 @@ AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset8() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
-AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMRDOffset20() const {
+AMDGPUOperand::Ptr AMDGPUAsmParser::defaultSMEMOffset() const {
return AMDGPUOperand::CreateImm(this, 0, SMLoc(), AMDGPUOperand::ImmTyOffset);
}
@@ -6096,7 +6254,7 @@ static const OptionalOperand AMDGPUOptionalOperandTable[] = {
{"unorm", AMDGPUOperand::ImmTyUNorm, true, nullptr},
{"da", AMDGPUOperand::ImmTyDA, true, nullptr},
{"r128", AMDGPUOperand::ImmTyR128A16, true, nullptr},
- {"a16", AMDGPUOperand::ImmTyR128A16, true, nullptr},
+ {"a16", AMDGPUOperand::ImmTyA16, true, nullptr},
{"lwe", AMDGPUOperand::ImmTyLWE, true, nullptr},
{"d16", AMDGPUOperand::ImmTyD16, true, nullptr},
{"dmask", AMDGPUOperand::ImmTyDMask, false, nullptr},
@@ -6499,7 +6657,7 @@ OperandMatchResultTy AMDGPUAsmParser::parseDim(OperandVector &Operands) {
std::string Token;
if (getLexer().is(AsmToken::Integer)) {
SMLoc Loc = getLexer().getTok().getEndLoc();
- Token = getLexer().getTok().getString();
+ Token = std::string(getLexer().getTok().getString());
Parser.Lex();
if (getLexer().getTok().getLoc() != Loc)
return MatchOperand_ParseFail;
@@ -7032,6 +7190,8 @@ unsigned AMDGPUAsmParser::validateTargetOperandClass(MCParsedAsmOperand &Op,
return Operand.isInterpAttr() ? Match_Success : Match_InvalidOperand;
case MCK_AttrChan:
return Operand.isAttrChan() ? Match_Success : Match_InvalidOperand;
+ case MCK_ImmSMEMOffset:
+ return Operand.isSMEMOffset() ? Match_Success : Match_InvalidOperand;
case MCK_SReg_64:
case MCK_SReg_64_XEXEC:
// Null is defined as a 32-bit register but
diff --git a/llvm/lib/Target/AMDGPU/BUFInstructions.td b/llvm/lib/Target/AMDGPU/BUFInstructions.td
index 691aff4ecbb8..fa42ddc54b56 100644
--- a/llvm/lib/Target/AMDGPU/BUFInstructions.td
+++ b/llvm/lib/Target/AMDGPU/BUFInstructions.td
@@ -1,4 +1,4 @@
-//===-- BUFInstructions.td - Buffer Instruction Defintions ----------------===//
+//===-- BUFInstructions.td - Buffer Instruction Definitions ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -374,7 +374,8 @@ class MUBUF_Invalidate <string opName, SDPatternOperator node = null_frag> :
let AsmMatchConverter = "";
let hasSideEffects = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
// Set everything to 0.
let offen = 0;
@@ -1003,6 +1004,11 @@ defm BUFFER_ATOMIC_DEC_X2 : MUBUF_Pseudo_Atomics <
"buffer_atomic_dec_x2", VReg_64, i64, atomic_dec_global_64
>;
+let SubtargetPredicate = HasGFX10_BEncoding in
+defm BUFFER_ATOMIC_CSUB : MUBUF_Pseudo_Atomics_RTN <
+ "buffer_atomic_csub", VGPR_32, i32, atomic_csub_global_32
+>;
+
let SubtargetPredicate = isGFX8GFX9 in {
def BUFFER_STORE_LDS_DWORD : MUBUF_Pseudo_Store_Lds <"buffer_store_lds_dword">;
}
@@ -1152,22 +1158,6 @@ let SubtargetPredicate = isGFX10Plus in {
// MUBUF Patterns
//===----------------------------------------------------------------------===//
-def extract_glc : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_slc : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_dlc : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
-}]>;
-
-def extract_swz : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
-}]>;
-
//===----------------------------------------------------------------------===//
// buffer_load/store_format patterns
//===----------------------------------------------------------------------===//
@@ -1177,24 +1167,24 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1202,9 +1192,9 @@ multiclass MUBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN)
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
}
@@ -1221,6 +1211,7 @@ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format, v4i32, "BUFFER_LOAD_FORMAT_X
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i32, "BUFFER_LOAD_FORMAT_D16_XY_gfx80">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4i32, "BUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1228,6 +1219,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, f16, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i16, "BUFFER_LOAD_FORMAT_D16_X">;
+ defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, i32, "BUFFER_LOAD_FORMAT_D16_X">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2f16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v2i16, "BUFFER_LOAD_FORMAT_D16_XY">;
defm : MUBUF_LoadIntrinsicPat<SIbuffer_load_format_d16, v4f16, "BUFFER_LOAD_FORMAT_D16_XYZW">;
@@ -1256,7 +1248,7 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset, (as_i16imm $offset),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1264,8 +1256,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, 0),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1273,8 +1265,8 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_glc $auxiliary),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1283,9 +1275,9 @@ multiclass MUBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$auxiliary, timm),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_exact)
- $vdata,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_glc $auxiliary),
+ getVregSrcForVT<vt>.ret:$vdata,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_glc $auxiliary),
(extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1303,6 +1295,7 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format, v4i32, "BUFFER_STORE_FORMA
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i32, "BUFFER_STORE_FORMAT_D16_XY_gfx80">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4i32, "BUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
@@ -1310,6 +1303,7 @@ let SubtargetPredicate = HasUnpackedD16VMem in {
let SubtargetPredicate = HasPackedD16VMem in {
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, f16, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i16, "BUFFER_STORE_FORMAT_D16_X">;
+ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, i32, "BUFFER_STORE_FORMAT_D16_X">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2f16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v2i16, "BUFFER_STORE_FORMAT_D16_XY">;
defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_format_d16, v4f16, "BUFFER_STORE_FORMAT_D16_XYZW">;
@@ -1338,37 +1332,37 @@ defm : MUBUF_StoreIntrinsicPat<SIbuffer_store_short, i32, "BUFFER_STORE_SHORT">;
multiclass BufferAtomicPatterns<SDPatternOperator name, ValueType vt,
string opcode> {
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN) $vdata_in, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFSET_RTN)
+ getVregSrcForVT<vt>.ret:$vdata_in, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm)),
- (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) $vdata_in, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, timm)),
+ (!cast<MUBUF_Pseudo>(opcode # _IDXEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, 0,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0)),
- (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) $vdata_in, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (extract_slc $cachepolicy))
+ (vt (name vt:$vdata_in, v4i32:$rsrc, 0, i32:$voffset,
+ i32:$soffset, timm:$offset, timm:$cachepolicy, 0)),
+ (!cast<MUBUF_Pseudo>(opcode # _OFFEN_RTN) getVregSrcForVT<vt>.ret:$vdata_in,
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (extract_slc $cachepolicy))
>;
def : GCNPat<
- (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex,
- i32:$voffset, i32:$soffset, timm:$offset,
- timm:$cachepolicy, timm)),
+ (vt (name vt:$vdata_in, v4i32:$rsrc, i32:$vindex, i32:$voffset,
+ i32:$soffset, timm:$offset, timm:$cachepolicy, timm)),
(!cast<MUBUF_Pseudo>(opcode # _BOTHEN_RTN)
- $vdata_in,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy))
+ getVregSrcForVT<vt>.ret:$vdata_in,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_slc $cachepolicy))
>;
}
@@ -1384,6 +1378,7 @@ defm : BufferAtomicPatterns<SIbuffer_atomic_or, i32, "BUFFER_ATOMIC_OR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_xor, i32, "BUFFER_ATOMIC_XOR">;
defm : BufferAtomicPatterns<SIbuffer_atomic_inc, i32, "BUFFER_ATOMIC_INC">;
defm : BufferAtomicPatterns<SIbuffer_atomic_dec, i32, "BUFFER_ATOMIC_DEC">;
+defm : BufferAtomicPatterns<SIbuffer_atomic_csub, i32, "BUFFER_ATOMIC_CSUB">;
defm : BufferAtomicPatterns<SIbuffer_atomic_swap, i64, "BUFFER_ATOMIC_SWAP_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_add, i64, "BUFFER_ATOMIC_ADD_X2">;
defm : BufferAtomicPatterns<SIbuffer_atomic_sub, i64, "BUFFER_ATOMIC_SUB_X2">;
@@ -1434,19 +1429,20 @@ multiclass BufferAtomicPatterns_NO_RTN<SDPatternOperator name, ValueType vt,
>;
}
+let SubtargetPredicate = HasAtomicFaddInsts in {
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_fadd, f32, "BUFFER_ATOMIC_ADD_F32">;
defm : BufferAtomicPatterns_NO_RTN<SIbuffer_atomic_pk_fadd, v2f16, "BUFFER_ATOMIC_PK_ADD_F16">;
+}
def : GCNPat<
(SIbuffer_atomic_cmpswap
- i32:$data, i32:$cmp, v4i32:$rsrc, 0,
- 0, i32:$soffset, timm:$offset,
- timm:$cachepolicy, 0),
+ i32:$data, i32:$cmp, v4i32:$rsrc, 0, 0, i32:$soffset,
+ timm:$offset, timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFSET_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
- sub0)
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (extract_slc $cachepolicy)), sub0)
>;
def : GCNPat<
@@ -1456,8 +1452,8 @@ def : GCNPat<
timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_IDXEN_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $vindex, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1468,8 +1464,8 @@ def : GCNPat<
timm:$cachepolicy, 0),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_OFFEN_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- $voffset, $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1480,9 +1476,9 @@ def : GCNPat<
timm:$cachepolicy, timm),
(EXTRACT_SUBREG
(BUFFER_ATOMIC_CMPSWAP_BOTHEN_RTN
- (REG_SEQUENCE VReg_64, $data, sub0, $cmp, sub1),
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (extract_slc $cachepolicy)),
+ (REG_SEQUENCE VReg_64, VGPR_32:$data, sub0, VGPR_32:$cmp, sub1),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (extract_slc $cachepolicy)),
sub0)
>;
@@ -1584,7 +1580,7 @@ defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET,
defm : MUBUFScratchLoadPat <BUFFER_LOAD_USHORT_OFFEN, BUFFER_LOAD_USHORT_OFFSET, i16, load_private>;
foreach vt = Reg32Types.types in {
-defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, i32, load_private>;
+defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORD_OFFEN, BUFFER_LOAD_DWORD_OFFSET, vt, load_private>;
}
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX2_OFFEN, BUFFER_LOAD_DWORDX2_OFFSET, v2i32, load_private>;
defm : MUBUFScratchLoadPat <BUFFER_LOAD_DWORDX3_OFFEN, BUFFER_LOAD_DWORDX3_OFFSET, v3i32, load_private>;
@@ -1692,8 +1688,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
- (!cast<MTBUF_Pseudo>(opcode # _OFFSET) $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET) SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1701,8 +1697,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
- (!cast<MTBUF_Pseudo>(opcode # _IDXEN) $vindex, $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN) VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1710,8 +1706,8 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(vt (name v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0)),
- (!cast<MTBUF_Pseudo>(opcode # _OFFEN) $voffset, $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN) VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1720,9 +1716,9 @@ multiclass MTBUF_LoadIntrinsicPat<SDPatternOperator name, ValueType vt,
(vt (name v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm)),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN)
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset),
- (as_i8imm $format),
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset),
+ (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1739,12 +1735,14 @@ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load, v4f32, "TBUFFER_LOAD_FORMAT_XYZW">
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2i32, "TBUFFER_LOAD_FORMAT_D16_XY_gfx80">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4i32, "TBUFFER_LOAD_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, f16, "TBUFFER_LOAD_FORMAT_D16_X">;
+ defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, i32, "TBUFFER_LOAD_FORMAT_D16_X">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v2f16, "TBUFFER_LOAD_FORMAT_D16_XY">;
defm : MTBUF_LoadIntrinsicPat<SItbuffer_load_d16, v4f16, "TBUFFER_LOAD_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1754,8 +1752,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
- (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) $vdata, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFSET_exact) getVregSrcForVT<vt>.ret:$vdata, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1763,8 +1761,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, 0, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, timm),
- (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) $vdata, $vindex, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _IDXEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$vindex, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1772,8 +1770,8 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
def : GCNPat<
(name vt:$vdata, v4i32:$rsrc, 0, i32:$voffset, i32:$soffset, timm:$offset,
timm:$format, timm:$auxiliary, 0),
- (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) $vdata, $voffset, $rsrc, $soffset,
- (as_i16imm $offset), (as_i8imm $format),
+ (!cast<MTBUF_Pseudo>(opcode # _OFFEN_exact) getVregSrcForVT<vt>.ret:$vdata, VGPR_32:$voffset, SReg_128:$rsrc, SCSrc_b32:$soffset,
+ (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1782,9 +1780,9 @@ multiclass MTBUF_StoreIntrinsicPat<SDPatternOperator name, ValueType vt,
(name vt:$vdata, v4i32:$rsrc, i32:$vindex, i32:$voffset, i32:$soffset,
timm:$offset, timm:$format, timm:$auxiliary, timm),
(!cast<MTBUF_Pseudo>(opcode # _BOTHEN_exact)
- $vdata,
- (REG_SEQUENCE VReg_64, $vindex, sub0, $voffset, sub1),
- $rsrc, $soffset, (as_i16imm $offset), (as_i8imm $format),
+ getVregSrcForVT<vt>.ret:$vdata,
+ (REG_SEQUENCE VReg_64, VGPR_32:$vindex, sub0, VGPR_32:$voffset, sub1),
+ SReg_128:$rsrc, SCSrc_b32:$soffset, (as_i16timm $offset), (as_i8timm $format),
(extract_glc $auxiliary), (extract_slc $auxiliary), 0, (extract_dlc $auxiliary),
(extract_swz $auxiliary))
>;
@@ -1801,12 +1799,14 @@ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store, v4f32, "TBUFFER_STORE_FORMAT_XYZ
let SubtargetPredicate = HasUnpackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2i32, "TBUFFER_STORE_FORMAT_D16_XY_gfx80">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4i32, "TBUFFER_STORE_FORMAT_D16_XYZW_gfx80">;
} // End HasUnpackedD16VMem.
let SubtargetPredicate = HasPackedD16VMem in {
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, f16, "TBUFFER_STORE_FORMAT_D16_X">;
+ defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, i32, "TBUFFER_STORE_FORMAT_D16_X">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v2f16, "TBUFFER_STORE_FORMAT_D16_XY">;
defm : MTBUF_StoreIntrinsicPat<SItbuffer_store_d16, v4f16, "TBUFFER_STORE_FORMAT_D16_XYZW">;
} // End HasPackedD16VMem.
@@ -1888,8 +1888,7 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
def _LDS_BOTHEN_gfx10 : MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_LDS_BOTHEN")>,
MUBUFLdsTable<1, NAME # "_BOTHEN_gfx10">;
}
- multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
- MUBUF_Real_AllAddr_gfx10<op> {
+ multiclass MUBUF_Real_Atomics_RTN_gfx10<bits<8> op> {
def _BOTHEN_RTN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_BOTHEN_RTN")>;
def _IDXEN_RTN_gfx10 :
@@ -1899,6 +1898,8 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
def _OFFSET_RTN_gfx10 :
MUBUF_Real_gfx10<op, !cast<MUBUF_Pseudo>(NAME#"_OFFSET_RTN")>;
}
+ multiclass MUBUF_Real_Atomics_gfx10<bits<8> op> :
+ MUBUF_Real_AllAddr_gfx10<op>, MUBUF_Real_Atomics_RTN_gfx10<op>;
} // End AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10"
defm BUFFER_STORE_BYTE_D16_HI : MUBUF_Real_AllAddr_gfx10<0x019>;
@@ -2063,6 +2064,8 @@ defm BUFFER_ATOMIC_FCMPSWAP_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05e>;
defm BUFFER_ATOMIC_FMIN_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x05f>;
defm BUFFER_ATOMIC_FMAX_X2 : MUBUF_Real_Atomics_gfx6_gfx7_gfx10<0x060>;
+defm BUFFER_ATOMIC_CSUB : MUBUF_Real_Atomics_RTN_gfx10<0x034>;
+
defm BUFFER_WBINVL1_SC : MUBUF_Real_gfx6<0x070>;
defm BUFFER_WBINVL1_VOL : MUBUF_Real_gfx7<0x070>;
def BUFFER_WBINVL1_gfx6_gfx7 : MUBUF_Real_gfx6_gfx7<0x071, BUFFER_WBINVL1>;
diff --git a/llvm/lib/Target/AMDGPU/CaymanInstructions.td b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
index 1a526675164a..f4ddbf1131c3 100644
--- a/llvm/lib/Target/AMDGPU/CaymanInstructions.td
+++ b/llvm/lib/Target/AMDGPU/CaymanInstructions.td
@@ -50,16 +50,19 @@ def COS_cm : COS_Common<0x8E>;
def : RsqPat<RECIPSQRT_IEEE_cm, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_cm, RECIP_IEEE_cm>;
+
def : POW_Common <LOG_IEEE_cm, EXP_IEEE_cm, MUL>;
defm DIV_cm : DIV_Common<RECIP_IEEE_cm>;
// RECIP_UINT emulation for Cayman
-// The multiplication scales from [0,1] to the unsigned integer range
+// The multiplication scales from [0,1) to the unsigned integer range,
+// rounding down a bit to avoid unwanted overflow.
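+// (4294966784 = 2^32 - 512; the largest f32 below 2^32 is 2^32 - 256, so this
+// sits one ULP lower and the scaled product stays safely below 2^32.)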
def : R600Pat <
(AMDGPUurecip i32:$src0),
(FLT_TO_UINT_eg (MUL_IEEE (RECIP_IEEE_cm (UINT_TO_FLT_eg $src0)),
- (MOV_IMM_I32 CONST.FP_UINT_MAX_PLUS_1)))
+ (MOV_IMM_I32 CONST.FP_4294966784)))
>;
def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
@@ -70,8 +73,6 @@ def CF_END_CM : CF_CLAUSE_EG<32, (ins), "CF_END"> {
-def : R600Pat<(fsqrt f32:$src), (MUL R600_Reg32:$src, (RECIPSQRT_CLAMPED_cm $src))>;
-
class RAT_STORE_DWORD <RegisterClass rc, ValueType vt, bits<4> mask> :
CF_MEM_RAT_CACHELESS <0x14, 0, mask,
(ins rc:$rw_gpr, R600_TReg32_X:$index_gpr),
diff --git a/llvm/lib/Target/AMDGPU/DSInstructions.td b/llvm/lib/Target/AMDGPU/DSInstructions.td
index fe7faca8b157..beb01b1abf0f 100644
--- a/llvm/lib/Target/AMDGPU/DSInstructions.td
+++ b/llvm/lib/Target/AMDGPU/DSInstructions.td
@@ -1,4 +1,4 @@
-//===-- DSInstructions.td - DS Instruction Defintions ---------------------===//
+//===-- DSInstructions.td - DS Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -388,7 +388,12 @@ defm DS_MAX_U32 : DS_1A1D_NORET_mc<"ds_max_u32">;
defm DS_AND_B32 : DS_1A1D_NORET_mc<"ds_and_b32">;
defm DS_OR_B32 : DS_1A1D_NORET_mc<"ds_or_b32">;
defm DS_XOR_B32 : DS_1A1D_NORET_mc<"ds_xor_b32">;
+
+let SubtargetPredicate = HasLDSFPAtomics in {
defm DS_ADD_F32 : DS_1A1D_NORET_mc<"ds_add_f32">;
+}
+
+// FIXME: Are these really present pre-gfx8?
defm DS_MIN_F32 : DS_1A1D_NORET_mc<"ds_min_f32">;
defm DS_MAX_F32 : DS_1A1D_NORET_mc<"ds_max_f32">;
@@ -443,7 +448,10 @@ defm DS_MIN_F64 : DS_1A1D_NORET_mc<"ds_min_f64", VReg_64>;
defm DS_MAX_F64 : DS_1A1D_NORET_mc<"ds_max_f64", VReg_64>;
defm DS_ADD_RTN_U32 : DS_1A1D_RET_mc<"ds_add_rtn_u32", VGPR_32, "ds_add_u32">;
+
+let SubtargetPredicate = HasLDSFPAtomics in {
defm DS_ADD_RTN_F32 : DS_1A1D_RET_mc<"ds_add_rtn_f32", VGPR_32, "ds_add_f32">;
+}
defm DS_SUB_RTN_U32 : DS_1A1D_RET_mc<"ds_sub_rtn_u32", VGPR_32, "ds_sub_u32">;
defm DS_RSUB_RTN_U32 : DS_1A1D_RET_mc<"ds_rsub_rtn_u32", VGPR_32, "ds_rsub_u32">;
defm DS_INC_RTN_U32 : DS_1A1D_RET_mc<"ds_inc_rtn_u32", VGPR_32, "ds_inc_u32">;
@@ -497,6 +505,7 @@ def DS_GWS_SEMA_P : DS_GWS_0D<"ds_gws_sema_p">;
def DS_GWS_BARRIER : DS_GWS_1D<"ds_gws_barrier">;
}
+let SubtargetPredicate = HasDsSrc2Insts in {
def DS_ADD_SRC2_U32 : DS_1A<"ds_add_src2_u32">;
def DS_SUB_SRC2_U32 : DS_1A<"ds_sub_src2_u32">;
def DS_RSUB_SRC2_U32 : DS_1A<"ds_rsub_src2_u32">;
@@ -529,6 +538,7 @@ def DS_MAX_SRC2_F64 : DS_1A<"ds_max_src2_f64">;
def DS_WRITE_SRC2_B32 : DS_1A<"ds_write_src2_b32">;
def DS_WRITE_SRC2_B64 : DS_1A<"ds_write_src2_b64">;
+} // End SubtargetPredicate = HasDsSrc2Insts
let Uses = [EXEC], mayLoad = 0, mayStore = 0, isConvergent = 1 in {
def DS_SWIZZLE_B32 : DS_1A_RET <"ds_swizzle_b32", VGPR_32, 0, SwizzleImm>;
@@ -609,10 +619,12 @@ def DS_BPERMUTE_B32 : DS_1A1D_PERMUTE <"ds_bpermute_b32",
int_amdgcn_ds_bpermute>;
}
-def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
-
} // let SubtargetPredicate = isGFX8Plus
+let SubtargetPredicate = HasLDSFPAtomics, OtherPredicates = [HasDsSrc2Insts] in {
+def DS_ADD_SRC2_F32 : DS_1A<"ds_add_src2_f32">;
+}
+
//===----------------------------------------------------------------------===//
// DS Patterns
//===----------------------------------------------------------------------===//
@@ -725,7 +737,7 @@ defm : DSWritePat_mc <DS_WRITE_B16, i32, "truncstorei16_local">;
defm : DSWritePat_mc <DS_WRITE_B8, i16, "truncstorei8_local">;
defm : DSWritePat_mc <DS_WRITE_B16, i16, "store_local">;
-foreach vt = VGPR_32.RegTypes in {
+foreach vt = Reg32Types.types in {
defm : DSWritePat_mc <DS_WRITE_B32, vt, "store_local">;
}
@@ -737,31 +749,35 @@ def : DSWritePat <DS_WRITE_B16_D16_HI, i32, store_hi16_local>;
def : DSWritePat <DS_WRITE_B8_D16_HI, i32, truncstorei8_hi16_local>;
}
-
-class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, PatFrag frag> : GCNPat <
- (v2i32 (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
+class DS64Bit4ByteAlignedReadPat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat <
+ (vt:$value (frag (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1))),
(inst $ptr, $offset0, $offset1, (i1 0))
>;
-class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, PatFrag frag> : GCNPat<
- (frag v2i32:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
- (inst $ptr, (i32 (EXTRACT_SUBREG $value, sub0)),
- (i32 (EXTRACT_SUBREG $value, sub1)), $offset0, $offset1,
+class DS64Bit4ByteAlignedWritePat<DS_Pseudo inst, ValueType vt, PatFrag frag> : GCNPat<
+ (frag vt:$value, (DS64Bit4ByteAligned i32:$ptr, i8:$offset0, i8:$offset1)),
+ (inst $ptr, (i32 (EXTRACT_SUBREG VReg_64:$value, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$value, sub1)), $offset0, $offset1,
(i1 0))
>;
-// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
-// related to bounds checking.
-let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
-def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, load_local_m0>;
-def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, store_local_m0>;
-}
+multiclass DS64Bit4ByteAlignedPat_mc<ValueType vt> {
+ let OtherPredicates = [LDSRequiresM0Init, isGFX7Plus] in {
+ def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32, vt, load_local_m0>;
+ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32, vt, store_local_m0>;
+ }
-let OtherPredicates = [NotLDSRequiresM0Init] in {
-def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, load_local>;
-def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, store_local>;
+ let OtherPredicates = [NotLDSRequiresM0Init] in {
+ def : DS64Bit4ByteAlignedReadPat<DS_READ2_B32_gfx9, vt, load_local>;
+ def : DS64Bit4ByteAlignedWritePat<DS_WRITE2_B32_gfx9, vt, store_local>;
+ }
}
+// v2i32 loads are split into i32 loads on SI during lowering, due to a bug
+// related to bounds checking.
+foreach vt = VReg_64.RegTypes in {
+defm : DS64Bit4ByteAlignedPat_mc<vt>;
+}
let AddedComplexity = 100 in {
@@ -826,9 +842,12 @@ defm : DSAtomicRetPat_mc<DS_MAX_RTN_I32, i32, "atomic_load_max">;
defm : DSAtomicRetPat_mc<DS_MIN_RTN_U32, i32, "atomic_load_umin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_U32, i32, "atomic_load_umax">;
defm : DSAtomicCmpXChg_mc<DS_CMPST_RTN_B32, i32, "atomic_cmp_swap">;
+
+let SubtargetPredicate = HasLDSFPAtomics in {
defm : DSAtomicRetPat_mc<DS_MIN_RTN_F32, f32, "atomic_load_fmin">;
defm : DSAtomicRetPat_mc<DS_MAX_RTN_F32, f32, "atomic_load_fmax">;
defm : DSAtomicRetPat_mc<DS_ADD_RTN_F32, f32, "atomic_load_fadd">;
+}
// 64-bit atomics.
defm : DSAtomicRetPat_mc<DS_WRXCHG_RTN_B64, i64, "atomic_swap">;
diff --git a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
index 419513bdc248..9c2f2e7eecd1 100644
--- a/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
+++ b/llvm/lib/Target/AMDGPU/Disassembler/AMDGPUDisassembler.cpp
@@ -18,7 +18,6 @@
#include "Disassembler/AMDGPUDisassembler.h"
#include "AMDGPU.h"
-#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
#include "SIDefines.h"
#include "TargetInfo/AMDGPUTargetInfo.h"
@@ -101,6 +100,18 @@ static DecodeStatus decodeSoppBrTarget(MCInst &Inst, unsigned Imm,
return addOperand(Inst, MCOperand::createImm(Imm));
}
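+// e.g. an encoded offset of 0x1FFFFF decodes to -1 on gfx9+ (21-bit signed)
+// but to 0xFFFFF on VI (20-bit unsigned).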
+static DecodeStatus decodeSMEMOffset(MCInst &Inst, unsigned Imm,
+ uint64_t Addr, const void *Decoder) {
+ auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
+ int64_t Offset;
+ if (DAsm->isVI()) { // VI supports 20-bit unsigned offsets.
+ Offset = Imm & 0xFFFFF;
+ } else { // GFX9+ supports 21-bit signed offsets.
+ Offset = SignExtend64<21>(Imm);
+ }
+ return addOperand(Inst, MCOperand::createImm(Offset));
+}
+
static DecodeStatus decodeBoolReg(MCInst &Inst, unsigned Val,
uint64_t Addr, const void *Decoder) {
auto DAsm = static_cast<const AMDGPUDisassembler*>(Decoder);
@@ -285,6 +296,18 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
if (Bytes.size() >= 8) {
const uint64_t QW = eatBytes<uint64_t>(Bytes);
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
+ Res = tryDecodeInst(DecoderTableGFX10_B64, MI, QW, Address);
+ if (Res) {
+ if (AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dpp8)
+ == -1)
+ break;
+ if (convertDPP8Inst(MI) == MCDisassembler::Success)
+ break;
+ MI = MCInst(); // clear
+ }
+ }
+
Res = tryDecodeInst(DecoderTableDPP864, MI, QW, Address);
if (Res && convertDPP8Inst(MI) == MCDisassembler::Success)
break;
@@ -334,6 +357,11 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX932, MI, DW, Address);
if (Res) break;
+ if (STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding]) {
+ Res = tryDecodeInst(DecoderTableGFX10_B32, MI, DW, Address);
+ if (Res) break;
+ }
+
Res = tryDecodeInst(DecoderTableGFX1032, MI, DW, Address);
if (Res) break;
@@ -351,13 +379,6 @@ DecodeStatus AMDGPUDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
Res = tryDecodeInst(DecoderTableGFX1064, MI, QW, Address);
} while (false);
- if (Res && (MaxInstBytesNum - Bytes.size()) == 12 && (!HasLiteral ||
- !(MCII->get(MI.getOpcode()).TSFlags & SIInstrFlags::VOP3))) {
- MaxInstBytesNum = 8;
- Bytes = Bytes_.slice(0, MaxInstBytesNum);
- eatBytes<uint64_t>(Bytes);
- }
-
if (Res && (MI.getOpcode() == AMDGPU::V_MAC_F32_e64_vi ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx6_gfx7 ||
MI.getOpcode() == AMDGPU::V_MAC_F32_e64_gfx10 ||
@@ -931,6 +952,7 @@ unsigned AMDGPUDisassembler::getAgprClassId(const OpWidthTy Width) const {
return AGPR_32RegClassID;
case OPW64: return AReg_64RegClassID;
case OPW128: return AReg_128RegClassID;
+ case OPW256: return AReg_256RegClassID;
case OPW512: return AReg_512RegClassID;
case OPW1024: return AReg_1024RegClassID;
}
@@ -1202,8 +1224,6 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
raw_ostream &/*cStream*/, int64_t Value,
uint64_t /*Address*/, bool IsBranch,
uint64_t /*Offset*/, uint64_t /*InstSize*/) {
- using SymbolInfoTy = std::tuple<uint64_t, StringRef, uint8_t>;
- using SectionSymbolsTy = std::vector<SymbolInfoTy>;
if (!IsBranch) {
return false;
@@ -1215,11 +1235,11 @@ bool AMDGPUSymbolizer::tryAddingSymbolicOperand(MCInst &Inst,
auto Result = std::find_if(Symbols->begin(), Symbols->end(),
[Value](const SymbolInfoTy& Val) {
- return std::get<0>(Val) == static_cast<uint64_t>(Value)
- && std::get<2>(Val) == ELF::STT_NOTYPE;
+ return Val.Addr == static_cast<uint64_t>(Value)
+ && Val.Type == ELF::STT_NOTYPE;
});
if (Result != Symbols->end()) {
- auto *Sym = Ctx.getOrCreateSymbol(std::get<1>(*Result));
+ auto *Sym = Ctx.getOrCreateSymbol(Result->Name);
const auto *Add = MCSymbolRefExpr::create(Sym, Ctx);
Inst.addOperand(MCOperand::createExpr(Add));
return true;
diff --git a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
index 792e26d21f98..97104a242d8c 100644
--- a/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
+++ b/llvm/lib/Target/AMDGPU/EvergreenInstructions.td
@@ -69,11 +69,11 @@ multiclass RAT_ATOMIC<bits<6> op_ret, bits<6> op_noret, string name> {
def _RTN: CF_MEM_RAT <op_ret, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
(outs R600_Reg128:$out_gpr),
- name ## "_RTN" ## " $rw_gpr, $index_gpr", [] >;
+ name # "_RTN" # " $rw_gpr, $index_gpr", [] >;
def _NORET: CF_MEM_RAT <op_noret, 0, 0xf,
(ins R600_Reg128:$rw_gpr, R600_TReg32_X:$index_gpr),
(outs R600_Reg128:$out_gpr),
- name ## " $rw_gpr, $index_gpr", [] >;
+ name # " $rw_gpr, $index_gpr", [] >;
}
}
@@ -118,11 +118,12 @@ def LOG_IEEE_eg : LOG_IEEE_Common<0x83>;
def RECIP_CLAMPED_eg : RECIP_CLAMPED_Common<0x84>;
def RECIPSQRT_IEEE_eg : RECIPSQRT_IEEE_Common<0x89>;
def : RsqPat<RECIPSQRT_IEEE_eg, f32>;
+def : SqrtPat<RECIPSQRT_IEEE_eg, RECIP_IEEE_eg>;
+
def SIN_eg : SIN_Common<0x8D>;
def COS_eg : COS_Common<0x8E>;
def : POW_Common <LOG_IEEE_eg, EXP_IEEE_eg, MUL>;
-def : EGPat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_eg $src))>;
} // End SubtargetPredicate = isEG
//===----------------------------------------------------------------------===//
@@ -421,6 +422,7 @@ def MULADD_UINT24_eg : R600_3OP <0x10, "MULADD_UINT24",
def : UMad24Pat<MULADD_UINT24_eg>;
def BIT_ALIGN_INT_eg : R600_3OP <0xC, "BIT_ALIGN_INT", [], VecALU>;
+def : FSHRPattern <BIT_ALIGN_INT_eg>;
def : ROTRPattern <BIT_ALIGN_INT_eg>;
def MULADD_eg : MULADD_Common<0x14>;
def MULADD_IEEE_eg : MULADD_IEEE_Common<0x18>;
@@ -570,7 +572,7 @@ class R600_LDS_1A1D_NORET <bits<6> lds_op, string name, list<dag> pattern> :
}
class R600_LDS_1A1D_RET <bits<6> lds_op, string name, list<dag> pattern> :
- R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name##"_RET", pattern, "OQAP, "> {
+ R600_LDS_1A1D <lds_op, (outs R600_Reg32:$dst), name#"_RET", pattern, "OQAP, "> {
let BaseOp = name;
let usesCustomInserter = 1;
diff --git a/llvm/lib/Target/AMDGPU/FLATInstructions.td b/llvm/lib/Target/AMDGPU/FLATInstructions.td
index 2057cac346d4..69facada2e96 100644
--- a/llvm/lib/Target/AMDGPU/FLATInstructions.td
+++ b/llvm/lib/Target/AMDGPU/FLATInstructions.td
@@ -1,4 +1,4 @@
-//===-- FLATInstructions.td - FLAT Instruction Defintions -----------------===//
+//===-- FLATInstructions.td - FLAT Instruction Definitions ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -100,7 +100,7 @@ class FLAT_Real <bits<7> op, FLAT_Pseudo ps> :
!if(ps.is_flat_scratch, 0b01, 0));
// Signed offset. Highest bit ignored for flat and treated as 12-bit
- // unsigned for flat acceses.
+ // unsigned for flat accesses.
bits<13> offset;
bits<1> nv = 0; // XXX - What does this actually do?
@@ -175,7 +175,7 @@ class FLAT_Store_Pseudo <string opName, RegisterClass vdataClass,
}
multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit HasTiedInput = 0> {
- let is_flat_global = 1 in {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Load_Pseudo<opName, regClass, HasTiedInput, 1, 1>,
@@ -183,8 +183,27 @@ multiclass FLAT_Global_Load_Pseudo<string opName, RegisterClass regClass, bit Ha
}
}
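+// ADDTID accesses have no vaddr operand; the per-thread id is added to the
+// address implicitly, e.g. "global_load_dword_addtid v1, s[2:3] offset:16".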
+class FLAT_Global_Load_AddTid_Pseudo <string opName, RegisterClass regClass,
+ bit HasTiedOutput = 0, bit HasSignedOffset = 0> : FLAT_Pseudo<
+ opName,
+ (outs regClass:$vdst),
+ !con((ins SReg_64:$saddr, flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc),
+ !if(HasTiedOutput, (ins regClass:$vdst_in), (ins))),
+ " $vdst, $saddr$offset$glc$slc$dlc"> {
+ let is_flat_global = 1;
+ let has_data = 0;
+ let mayLoad = 1;
+ let has_vaddr = 0;
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let maybeAtomic = 1;
+
+ let Constraints = !if(HasTiedOutput, "$vdst = $vdst_in", "");
+ let DisableEncoding = !if(HasTiedOutput, "$vdst_in", "");
+}
+
multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
- let is_flat_global = 1 in {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
def "" : FLAT_Store_Pseudo<opName, regClass, 1>,
GlobalSaddrTable<0, opName>;
def _SADDR : FLAT_Store_Pseudo<opName, regClass, 1, 1>,
@@ -192,6 +211,24 @@ multiclass FLAT_Global_Store_Pseudo<string opName, RegisterClass regClass> {
}
}
+class FLAT_Global_Store_AddTid_Pseudo <string opName, RegisterClass vdataClass,
+ bit HasSignedOffset = 0> : FLAT_Pseudo<
+ opName,
+ (outs),
+ !con(
+ (ins vdataClass:$vdata, SReg_64:$saddr),
+ (ins flat_offset:$offset, GLC:$glc, SLC:$slc, DLC:$dlc)),
+ " $vdata, $saddr$offset$glc$slc$dlc"> {
+ let is_flat_global = 1;
+ let mayLoad = 0;
+ let mayStore = 1;
+ let has_vdst = 0;
+ let has_vaddr = 0;
+ let has_saddr = 1;
+ let enabled_saddr = 1;
+ let maybeAtomic = 1;
+}
+
class FLAT_Scratch_Load_Pseudo <string opName, RegisterClass regClass,
bit EnableSaddr = 0>: FLAT_Pseudo<
opName,
@@ -279,6 +316,7 @@ multiclass FLAT_Atomic_Pseudo<
AtomicNoRet <opName, 0> {
let PseudoInstr = NAME;
let FPAtomic = isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
}
def _RTN : FLAT_AtomicRet_Pseudo <opName,
@@ -290,6 +328,7 @@ multiclass FLAT_Atomic_Pseudo<
GlobalSaddrTable<0, opName#"_rtn">,
AtomicNoRet <opName, 1>{
let FPAtomic = isFP;
+ let AddedComplexity = -1; // Prefer global atomics if available
}
}
@@ -367,10 +406,12 @@ multiclass FLAT_Global_Atomic_Pseudo<
SDPatternOperator atomic_rtn = null_frag,
SDPatternOperator atomic_no_rtn = null_frag,
ValueType data_vt = vt,
- RegisterClass data_rc = vdst_rc> :
- FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>,
- FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
-
+ RegisterClass data_rc = vdst_rc> {
+ let is_flat_global = 1, SubtargetPredicate = HasFlatGlobalInsts in {
+ defm "" : FLAT_Global_Atomic_Pseudo_NO_RTN<opName, vdst_rc, vt, atomic_no_rtn, data_vt, data_rc>;
+ defm "" : FLAT_Global_Atomic_Pseudo_RTN<opName, vdst_rc, vt, atomic_rtn, data_vt, data_rc>;
+ }
+}
//===----------------------------------------------------------------------===//
// Flat Instructions
@@ -507,7 +548,6 @@ defm FLAT_ATOMIC_FMAX_X2 : FLAT_Atomic_Pseudo <"flat_atomic_fmax_x2",
} // End SubtargetPredicate = isGFX7GFX10
-let SubtargetPredicate = HasFlatGlobalInsts in {
defm GLOBAL_LOAD_UBYTE : FLAT_Global_Load_Pseudo <"global_load_ubyte", VGPR_32>;
defm GLOBAL_LOAD_SBYTE : FLAT_Global_Load_Pseudo <"global_load_sbyte", VGPR_32>;
defm GLOBAL_LOAD_USHORT : FLAT_Global_Load_Pseudo <"global_load_ushort", VGPR_32>;
@@ -523,6 +563,8 @@ defm GLOBAL_LOAD_SBYTE_D16 : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16"
defm GLOBAL_LOAD_SBYTE_D16_HI : FLAT_Global_Load_Pseudo <"global_load_sbyte_d16_hi", VGPR_32, 1>;
defm GLOBAL_LOAD_SHORT_D16 : FLAT_Global_Load_Pseudo <"global_load_short_d16", VGPR_32, 1>;
defm GLOBAL_LOAD_SHORT_D16_HI : FLAT_Global_Load_Pseudo <"global_load_short_d16_hi", VGPR_32, 1>;
+let OtherPredicates = [HasGFX10_BEncoding] in
+def GLOBAL_LOAD_DWORD_ADDTID : FLAT_Global_Load_AddTid_Pseudo <"global_load_dword_addtid", VGPR_32>;
defm GLOBAL_STORE_BYTE : FLAT_Global_Store_Pseudo <"global_store_byte", VGPR_32>;
defm GLOBAL_STORE_SHORT : FLAT_Global_Store_Pseudo <"global_store_short", VGPR_32>;
@@ -530,6 +572,8 @@ defm GLOBAL_STORE_DWORD : FLAT_Global_Store_Pseudo <"global_store_dword", VGPR
defm GLOBAL_STORE_DWORDX2 : FLAT_Global_Store_Pseudo <"global_store_dwordx2", VReg_64>;
defm GLOBAL_STORE_DWORDX3 : FLAT_Global_Store_Pseudo <"global_store_dwordx3", VReg_96>;
defm GLOBAL_STORE_DWORDX4 : FLAT_Global_Store_Pseudo <"global_store_dwordx4", VReg_128>;
+let OtherPredicates = [HasGFX10_BEncoding] in
+def GLOBAL_STORE_DWORD_ADDTID : FLAT_Global_Store_AddTid_Pseudo <"global_store_dword_addtid", VGPR_32>;
defm GLOBAL_STORE_BYTE_D16_HI : FLAT_Global_Store_Pseudo <"global_store_byte_d16_hi", VGPR_32>;
defm GLOBAL_STORE_SHORT_D16_HI : FLAT_Global_Store_Pseudo <"global_store_short_d16_hi", VGPR_32>;
@@ -615,9 +659,12 @@ defm GLOBAL_ATOMIC_INC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_inc_x2",
defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Global_Atomic_Pseudo <"global_atomic_dec_x2",
VReg_64, i64, atomic_dec_global_64>;
+
+let SubtargetPredicate = HasGFX10_BEncoding in
+defm GLOBAL_ATOMIC_CSUB : FLAT_Global_Atomic_Pseudo_RTN <"global_atomic_csub",
+ VGPR_32, i32, atomic_csub_global_32>;
} // End is_flat_global = 1
-} // End SubtargetPredicate = HasFlatGlobalInsts
let SubtargetPredicate = HasFlatScratchInsts in {
@@ -912,6 +959,7 @@ def : FlatSignedAtomicPat <GLOBAL_ATOMIC_OR_RTN, atomic_load_or_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SWAP_RTN, atomic_swap_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CMPSWAP_RTN, AMDGPUatomic_cmp_swap_global_32, i32, v2i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_XOR_RTN, atomic_load_xor_global_32, i32>;
+def : FlatSignedAtomicPat <GLOBAL_ATOMIC_CSUB_RTN, atomic_csub_global_32, i32>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_ADD_X2_RTN, atomic_load_add_global_64, i64>;
def : FlatSignedAtomicPat <GLOBAL_ATOMIC_SUB_X2_RTN, atomic_load_sub_global_64, i64>;
@@ -1212,6 +1260,9 @@ multiclass FLAT_Real_GlblAtomics_gfx10<bits<7> op> :
FLAT_Real_RTN_gfx10<op>,
FLAT_Real_SADDR_RTN_gfx10<op>;
+multiclass FLAT_Real_GlblAtomics_RTN_gfx10<bits<7> op> :
+ FLAT_Real_RTN_gfx10<op>,
+ FLAT_Real_SADDR_RTN_gfx10<op>;
// ENC_FLAT.
defm FLAT_LOAD_UBYTE : FLAT_Real_Base_gfx10<0x008>;
@@ -1297,6 +1348,7 @@ defm GLOBAL_ATOMIC_SWAP : FLAT_Real_GlblAtomics_gfx10<0x030>;
defm GLOBAL_ATOMIC_CMPSWAP : FLAT_Real_GlblAtomics_gfx10<0x031>;
defm GLOBAL_ATOMIC_ADD : FLAT_Real_GlblAtomics_gfx10<0x032>;
defm GLOBAL_ATOMIC_SUB : FLAT_Real_GlblAtomics_gfx10<0x033>;
+defm GLOBAL_ATOMIC_CSUB : FLAT_Real_GlblAtomics_RTN_gfx10<0x034>;
defm GLOBAL_ATOMIC_SMIN : FLAT_Real_GlblAtomics_gfx10<0x035>;
defm GLOBAL_ATOMIC_UMIN : FLAT_Real_GlblAtomics_gfx10<0x036>;
defm GLOBAL_ATOMIC_SMAX : FLAT_Real_GlblAtomics_gfx10<0x037>;
@@ -1325,7 +1377,8 @@ defm GLOBAL_ATOMIC_DEC_X2 : FLAT_Real_GlblAtomics_gfx10<0x05d>;
defm GLOBAL_ATOMIC_FCMPSWAP_X2 : FLAT_Real_GlblAtomics_gfx10<0x05e>;
defm GLOBAL_ATOMIC_FMIN_X2 : FLAT_Real_GlblAtomics_gfx10<0x05f>;
defm GLOBAL_ATOMIC_FMAX_X2 : FLAT_Real_GlblAtomics_gfx10<0x060>;
-
+defm GLOBAL_LOAD_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x016>;
+defm GLOBAL_STORE_DWORD_ADDTID : FLAT_Real_Base_gfx10<0x017>;
// ENC_FLAT_SCRATCH.
defm SCRATCH_LOAD_UBYTE : FLAT_Real_AllAddr_gfx10<0x008>;
diff --git a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
index 10e2c3a263f1..719a968b8314 100644
--- a/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNDPPCombine.cpp
@@ -105,6 +105,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
private:
int getDPPOp(unsigned Op) const;
};
@@ -168,7 +173,9 @@ MachineInstr *GCNDPPCombine::createDPPInst(MachineInstr &OrigMI,
}
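+  // Build the DPP form of the instruction, carrying over the source MI flags
+  // (e.g. fast-math / no-exception flags) so the combine does not drop them.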
auto DPPInst = BuildMI(*OrigMI.getParent(), OrigMI,
- OrigMI.getDebugLoc(), TII->get(DPPOp));
+ OrigMI.getDebugLoc(), TII->get(DPPOp))
+ .setMIFlags(OrigMI.getFlags());
+
bool Fail = false;
do {
auto *Dst = TII->getNamedOperand(OrigMI, AMDGPU::OpName::vdst);
@@ -506,15 +513,32 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
break;
}
+ auto *Src0 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0);
+ auto *Src1 = TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1);
+ if (Use != Src0 && !(Use == Src1 && OrigMI.isCommutable())) { // [1]
+ LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
+ break;
+ }
+
+ assert(Src0 && "Src1 without Src0?");
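+  // A DPP value feeding both sources, e.g. %2 = V_ADD_F32 %1, %1 with %1 the
+  // dpp mov result, cannot be combined: the DPP modifier applies only to
+  // src0, so src1 would read the unswizzled value.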
+ if (Src1 && Src1->isIdenticalTo(*Src0)) {
+ assert(Src1->isReg());
+ LLVM_DEBUG(
+ dbgs()
+ << " " << OrigMI
+ << " failed: DPP register is used more than once per instruction\n");
+ break;
+ }
+
LLVM_DEBUG(dbgs() << " combining: " << OrigMI);
- if (Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src0)) {
+ if (Use == Src0) {
if (auto *DPPInst = createDPPInst(OrigMI, MovMI, CombOldVGPR,
OldOpndValue, CombBCZ)) {
DPPMIs.push_back(DPPInst);
Rollback = false;
}
- } else if (OrigMI.isCommutable() &&
- Use == TII->getNamedOperand(OrigMI, AMDGPU::OpName::src1)) {
+ } else {
+ assert(Use == Src1 && OrigMI.isCommutable()); // by check [1]
auto *BB = OrigMI.getParent();
auto *NewMI = BB->getParent()->CloneMachineInstr(&OrigMI);
BB->insert(OrigMI, NewMI);
@@ -528,8 +552,7 @@ bool GCNDPPCombine::combineDPPMov(MachineInstr &MovMI) const {
} else
LLVM_DEBUG(dbgs() << " failed: cannot be commuted\n");
NewMI->eraseFromParent();
- } else
- LLVM_DEBUG(dbgs() << " failed: no suitable operands\n");
+ }
if (Rollback)
break;
OrigMIs.push_back(&OrigMI);
@@ -562,8 +585,6 @@ bool GCNDPPCombine::runOnMachineFunction(MachineFunction &MF) {
MRI = &MF.getRegInfo();
TII = ST.getInstrInfo();
- assert(MRI->isSSA() && "Must be run on SSA");
-
bool Changed = false;
for (auto &MBB : MF) {
for (auto I = MBB.rbegin(), E = MBB.rend(); I != E;) {
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
index 3ef5a77af45e..8482dbfec250 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.cpp
@@ -228,11 +228,6 @@ void GCNHazardRecognizer::processBundle() {
CurrCycleInstr = nullptr;
}
-unsigned GCNHazardRecognizer::PreEmitNoops(SUnit *SU) {
- IsHazardRecognizerMode = false;
- return PreEmitNoopsCommon(SU->getInstr());
-}
-
unsigned GCNHazardRecognizer::PreEmitNoops(MachineInstr *MI) {
IsHazardRecognizerMode = true;
CurrCycleInstr = MI;
@@ -486,6 +481,14 @@ void GCNHazardRecognizer::addClauseInst(const MachineInstr &MI) {
addRegsToSet(TRI, MI.uses(), ClauseUses);
}
+static bool breaksSMEMSoftClause(MachineInstr *MI) {
+ return !SIInstrInfo::isSMRD(*MI);
+}
+
+static bool breaksVMEMSoftClause(MachineInstr *MI) {
+ return !SIInstrInfo::isVMEM(*MI) && !SIInstrInfo::isFLAT(*MI);
+}
+
int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
// SMEM soft clause are only present on VI+, and only matter if xnack is
// enabled.
@@ -512,7 +515,7 @@ int GCNHazardRecognizer::checkSoftClauseHazards(MachineInstr *MEM) {
if (!MI)
break;
- if (IsSMRD != SIInstrInfo::isSMRD(*MI))
+ if (IsSMRD ? breaksSMEMSoftClause(MI) : breaksVMEMSoftClause(MI))
break;
addClauseInst(*MI);
diff --git a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
index 6aa2e70dfbfb..cd17f2755bd1 100644
--- a/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
+++ b/llvm/lib/Target/AMDGPU/GCNHazardRecognizer.h
@@ -105,7 +105,6 @@ public:
void EmitInstruction(MachineInstr *MI) override;
HazardType getHazardType(SUnit *SU, int Stalls) override;
void EmitNoop() override;
- unsigned PreEmitNoops(SUnit *SU) override;
unsigned PreEmitNoops(MachineInstr *) override;
unsigned PreEmitNoopsCommon(MachineInstr *);
void AdvanceCycle() override;
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
index 90ab6a14ce20..75a02c839034 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.cpp
@@ -5,6 +5,11 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the class GCNIterativeScheduler.
+///
+//===----------------------------------------------------------------------===//
#include "GCNIterativeScheduler.h"
#include "AMDGPUSubtarget.h"
diff --git a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
index e6f83914af5b..a0d4f432aa48 100644
--- a/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
+++ b/llvm/lib/Target/AMDGPU/GCNIterativeScheduler.h
@@ -5,6 +5,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the class GCNIterativeScheduler, which uses an iterative
+/// approach to find the best schedule for the GCN architecture. It basically
+/// builds several lightweight schedules, scores them, chooses the best one
+/// based on those scores, and finally implements the chosen schedule.
+///
+//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
#define LLVM_LIB_TARGET_AMDGPU_GCNITERATIVESCHEDULER_H
diff --git a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
index c469cf290e26..884b2e17289c 100644
--- a/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNMinRegStrategy.cpp
@@ -5,6 +5,13 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines and implements the class GCNMinRegScheduler, which
+/// implements an experimental, simple scheduler whose main goal is to use
+/// as few registers as possible for a region.
+///
+//===----------------------------------------------------------------------===//
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -207,9 +214,8 @@ void GCNMinRegScheduler::bumpPredsPriority(const SUnit *SchedSU, int Priority) {
LLVM_DEBUG(dbgs() << "Make the predecessors of SU(" << SchedSU->NodeNum
<< ")'s non-ready successors of " << Priority
<< " priority in ready queue: ");
- const auto SetEnd = Set.end();
for (auto &C : RQ) {
- if (Set.find(C.SU) != SetEnd) {
+ if (Set.count(C.SU)) {
C.Priority = Priority;
LLVM_DEBUG(dbgs() << " SU(" << C.SU->NodeNum << ')');
}
diff --git a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
index f6023f3a40a2..57346087d017 100644
--- a/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNNSAReassign.cpp
@@ -286,8 +286,15 @@ bool GCNNSAReassign::runOnMachineFunction(MachineFunction &MF) {
}
Intervals.push_back(LI);
OrigRegs.push_back(VRM->getPhys(Reg));
- MinInd = I ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
- MaxInd = I ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
+ if (LI->empty()) {
+ // The address input is undef, so it doesn't contribute to the relevant
+ // range. Seed a reasonable index range if required.
+ if (I == 0)
+ MinInd = MaxInd = LIS->getInstructionIndex(*MI);
+ continue;
+ }
+ MinInd = I != 0 ? std::min(MinInd, LI->beginIndex()) : LI->beginIndex();
+ MaxInd = I != 0 ? std::max(MaxInd, LI->endIndex()) : LI->endIndex();
}
if (Intervals.empty())
diff --git a/llvm/lib/Target/AMDGPU/GCNProcessors.td b/llvm/lib/Target/AMDGPU/GCNProcessors.td
index b926041afb2f..17e6098d880d 100644
--- a/llvm/lib/Target/AMDGPU/GCNProcessors.td
+++ b/llvm/lib/Target/AMDGPU/GCNProcessors.td
@@ -183,3 +183,7 @@ def : ProcessorModel<"gfx1011", GFX10SpeedModel,
def : ProcessorModel<"gfx1012", GFX10SpeedModel,
FeatureISAVersion10_1_2.Features
>;
+
+def : ProcessorModel<"gfx1030", GFX10SpeedModel,
+ FeatureISAVersion10_3_0.Features
+>;
diff --git a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
index 76593bc0e5ac..98d971630ca4 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegBankReassign.cpp
@@ -168,13 +168,15 @@ private:
// 8 banks for SGPRs.
// Registers already processed and recorded in RegsUsed are excluded.
// If Bank is not -1 assume Reg:SubReg to belong to that Bank.
- unsigned getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
+ uint32_t getRegBankMask(unsigned Reg, unsigned SubReg, int Bank);
- // Return number of stalls in the instructions.
- // UsedBanks has bits set for the banks used by all operands.
- // If Reg and Bank provided substitute the Reg with the Bank.
- unsigned analyzeInst(const MachineInstr& MI, unsigned& UsedBanks,
- unsigned Reg = AMDGPU::NoRegister, int Bank = -1);
+ // Analyze one instruction returning the number of stalls and a mask of the
+ // banks used by all operands.
+ // If Reg and Bank are provided, assume all uses of Reg will be replaced with
+ // a register chosen from Bank.
+ std::pair<unsigned, unsigned> analyzeInst(const MachineInstr &MI,
+ unsigned Reg = AMDGPU::NoRegister,
+ int Bank = -1);
// Return true if register is regular VGPR or SGPR or their tuples.
// Returns false for special registers like m0, vcc etc.
@@ -280,7 +282,9 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
unsigned Size = TRI->getRegSizeInBits(*RC);
- if (Size > 32)
+ if (Size == 16)
+ Reg = TRI->get32BitRegister(Reg);
+ else if (Size > 32)
Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
if (TRI->hasVGPRs(RC)) {
@@ -292,7 +296,7 @@ unsigned GCNRegBankReassign::getPhysRegBank(unsigned Reg) const {
return Reg % NUM_SGPR_BANKS + SGPR_BANK_OFFSET;
}
-unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
+uint32_t GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
int Bank) {
if (Register::isVirtualRegister(Reg)) {
if (!VRM->isAssignedReg(Reg))
@@ -306,14 +310,21 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
}
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- unsigned Size = TRI->getRegSizeInBits(*RC) / 32;
- if (Size > 1)
- Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+
+ if (Size == 16) {
+ Reg = TRI->get32BitRegister(Reg);
+ Size = 1;
+ } else {
+ Size /= 32;
+ if (Size > 1)
+ Reg = TRI->getSubReg(Reg, AMDGPU::sub0);
+ }
if (TRI->hasVGPRs(RC)) {
// VGPRs have 4 banks assigned in a round-robin fashion.
Reg -= AMDGPU::VGPR0;
- unsigned Mask = (1 << Size) - 1;
+ uint32_t Mask = maskTrailingOnes<uint32_t>(Size);
unsigned Used = 0;
// Bitmask lacks an extract method
for (unsigned I = 0; I < Size; ++I)
@@ -321,7 +332,7 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
Used |= 1 << I;
RegsUsed.set(Reg, Reg + Size);
Mask &= ~Used;
- Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : unsigned(Bank);
+ Mask <<= (Bank == -1) ? Reg % NUM_VGPR_BANKS : uint32_t(Bank);
return (Mask | (Mask >> NUM_VGPR_BANKS)) & VGPR_BANK_MASK;
}
@@ -347,15 +358,14 @@ unsigned GCNRegBankReassign::getRegBankMask(unsigned Reg, unsigned SubReg,
return Mask << SGPR_BANK_OFFSET;
}
-unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
- unsigned& UsedBanks,
- unsigned Reg,
- int Bank) {
+std::pair<unsigned, unsigned>
+GCNRegBankReassign::analyzeInst(const MachineInstr &MI, unsigned Reg,
+ int Bank) {
unsigned StallCycles = 0;
- UsedBanks = 0;
+ unsigned UsedBanks = 0;
if (MI.isDebugValue())
- return 0;
+ return std::make_pair(StallCycles, UsedBanks);
RegsUsed.reset();
OperandMasks.clear();
@@ -372,30 +382,30 @@ unsigned GCNRegBankReassign::analyzeInst(const MachineInstr& MI,
unsigned ShiftedBank = Bank;
if (Bank != -1 && R == Reg && Op.getSubReg()) {
- unsigned LM = TRI->getSubRegIndexLaneMask(Op.getSubReg()).getAsInteger();
- if (!(LM & 1) && (Bank < NUM_VGPR_BANKS)) {
+ unsigned Offset = TRI->getChannelFromSubReg(Op.getSubReg());
+ LaneBitmask LM = TRI->getSubRegIndexLaneMask(Op.getSubReg());
+ if (Offset && Bank < NUM_VGPR_BANKS) {
// If a register spans all banks we cannot shift it to avoid conflict.
- if (countPopulation(LM) >= NUM_VGPR_BANKS)
+ if (TRI->getNumCoveredRegs(LM) >= NUM_VGPR_BANKS)
continue;
- ShiftedBank = (Bank + countTrailingZeros(LM)) % NUM_VGPR_BANKS;
- } else if (!(LM & 3) && (Bank >= SGPR_BANK_OFFSET)) {
+ ShiftedBank = (Bank + Offset) % NUM_VGPR_BANKS;
+ } else if (Offset > 1 && Bank >= SGPR_BANK_OFFSET) {
// If a register spans all banks we cannot shift it to avoid conflict.
- if (countPopulation(LM) / 2 >= NUM_SGPR_BANKS)
+ if (TRI->getNumCoveredRegs(LM) / 2 >= NUM_SGPR_BANKS)
continue;
- ShiftedBank = SGPR_BANK_OFFSET + (Bank - SGPR_BANK_OFFSET +
- (countTrailingZeros(LM) >> 1)) %
- NUM_SGPR_BANKS;
+ ShiftedBank = SGPR_BANK_OFFSET +
+ (Bank - SGPR_BANK_OFFSET + (Offset >> 1)) % NUM_SGPR_BANKS;
}
}
- unsigned Mask = getRegBankMask(R, Op.getSubReg(),
+ uint32_t Mask = getRegBankMask(R, Op.getSubReg(),
(Reg == R) ? ShiftedBank : -1);
StallCycles += countPopulation(UsedBanks & Mask);
UsedBanks |= Mask;
OperandMasks.push_back(OperandMask(Op.getReg(), Op.getSubReg(), Mask));
}
- return StallCycles;
+ return std::make_pair(StallCycles, UsedBanks);
}
unsigned GCNRegBankReassign::getOperandGatherWeight(const MachineInstr& MI,
@@ -440,10 +450,19 @@ bool GCNRegBankReassign::isReassignable(unsigned Reg) const {
}
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(PhysReg);
+ unsigned Size = TRI->getRegSizeInBits(*RC);
+
+ // TODO: Support 16-bit registers. These need to be moved with their
+ // parent VGPR_32 and potentially a sibling 16-bit sub-register.
+ if (Size < 32)
+ return false;
+
if (TRI->hasVGPRs(RC))
return true;
- unsigned Size = TRI->getRegSizeInBits(*RC);
+ if (Size == 16)
+ return AMDGPU::SGPR_LO16RegClass.contains(PhysReg);
+
if (Size > 32)
PhysReg = TRI->getSubReg(PhysReg, AMDGPU::sub0);
@@ -496,16 +515,16 @@ unsigned GCNRegBankReassign::getFreeBanks(unsigned Reg,
unsigned FreeBanks = getFreeBanks(Mask, UsedBanks);
- unsigned LM = TRI->getSubRegIndexLaneMask(SubReg).getAsInteger();
- if (!(LM & 1) && (Mask & VGPR_BANK_MASK)) {
- unsigned Shift = countTrailingZeros(LM);
+ unsigned Offset = TRI->getChannelFromSubReg(SubReg);
+ if (Offset && (Mask & VGPR_BANK_MASK)) {
+ unsigned Shift = Offset;
if (Shift >= NUM_VGPR_BANKS)
return 0;
unsigned VB = FreeBanks & VGPR_BANK_MASK;
FreeBanks = ((VB >> Shift) | (VB << (NUM_VGPR_BANKS - Shift))) &
VGPR_BANK_MASK;
- } else if (!(LM & 3) && (Mask & SGPR_BANK_MASK)) {
- unsigned Shift = countTrailingZeros(LM) >> 1;
+ } else if (Offset > 1 && (Mask & SGPR_BANK_MASK)) {
+ unsigned Shift = Offset >> 1;
if (Shift >= NUM_SGPR_BANKS)
return 0;
unsigned SB = FreeBanks >> SGPR_BANK_OFFSET;
@@ -570,7 +589,6 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
unsigned Reg, int Bank,
bool Collect) {
unsigned TotalStallCycles = 0;
- unsigned UsedBanks = 0;
SmallSet<const MachineInstr *, 16> Visited;
for (auto &MI : MRI->use_nodbg_instructions(SrcReg)) {
@@ -578,7 +596,9 @@ unsigned GCNRegBankReassign::computeStallCycles(unsigned SrcReg,
continue;
if (!Visited.insert(&MI).second)
continue;
- unsigned StallCycles = analyzeInst(MI, UsedBanks, Reg, Bank);
+ unsigned StallCycles;
+ unsigned UsedBanks;
+ std::tie(StallCycles, UsedBanks) = analyzeInst(MI, Reg, Bank);
TotalStallCycles += StallCycles;
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
@@ -636,7 +656,11 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
struct BankStall {
BankStall(unsigned b, unsigned s) : Bank(b), Stalls(s) {};
- bool operator< (const BankStall &RHS) const { return Stalls > RHS.Stalls; }
+ bool operator<(const BankStall &RHS) const {
+ if (Stalls == RHS.Stalls)
+ return Bank < RHS.Bank;
+ return Stalls > RHS.Stalls;
+ }
unsigned Bank;
unsigned Stalls;
};
@@ -653,7 +677,7 @@ unsigned GCNRegBankReassign::tryReassign(Candidate &C) {
}
}
}
- std::sort(BankStalls.begin(), BankStalls.end());
+ llvm::sort(BankStalls);
Register OrigReg = VRM->getPhys(C.Reg);
LRM->unassign(LI);
@@ -695,8 +719,9 @@ unsigned GCNRegBankReassign::collectCandidates(MachineFunction &MF,
if (MI.isBundle())
continue; // we analyze the instructions inside the bundle individually
- unsigned UsedBanks = 0;
- unsigned StallCycles = analyzeInst(MI, UsedBanks);
+ unsigned StallCycles;
+ unsigned UsedBanks;
+ std::tie(StallCycles, UsedBanks) = analyzeInst(MI);
if (Collect)
collectCandidates(MI, UsedBanks, StallCycles);
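The analyzeInst refactor above replaces the UsedBanks out-parameter with a std::pair return value that call sites unpack with std::tie. A small, self-contained illustration of that calling convention (toy cost model only, not the actual bank analysis):

    #include <cstdio>
    #include <tuple>
    #include <utility>

    // Returns (stall cycles, used-bank mask) instead of writing the mask
    // through a reference out-parameter, so a call site cannot forget to
    // reset it before the call.
    static std::pair<unsigned, unsigned> analyzeOne(unsigned NumBanks) {
      unsigned StallCycles = NumBanks > 2 ? NumBanks - 2 : 0; // toy model
      unsigned UsedBanks = (1u << NumBanks) - 1;
      return std::make_pair(StallCycles, UsedBanks);
    }

    int main() {
      unsigned StallCycles, UsedBanks;
      // Pre-C++17 unpacking, matching the call sites in the patch.
      std::tie(StallCycles, UsedBanks) = analyzeOne(4);
      std::printf("stalls=%u banks=0x%x\n", StallCycles, UsedBanks);
      return 0;
    }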
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
index d593204cba05..86a3cb9af32f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.cpp
@@ -5,6 +5,11 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file implements the GCNRegPressure class.
+///
+//===----------------------------------------------------------------------===//
#include "GCNRegPressure.h"
#include "AMDGPUSubtarget.h"
@@ -98,7 +103,8 @@ void GCNRegPressure::inc(unsigned Reg,
LaneBitmask PrevMask,
LaneBitmask NewMask,
const MachineRegisterInfo &MRI) {
- if (NewMask == PrevMask)
+ if (SIRegisterInfo::getNumCoveredRegs(NewMask) ==
+ SIRegisterInfo::getNumCoveredRegs(PrevMask))
return;
int Sign = 1;
@@ -106,25 +112,21 @@ void GCNRegPressure::inc(unsigned Reg,
std::swap(NewMask, PrevMask);
Sign = -1;
}
-#ifndef NDEBUG
- const auto MaxMask = MRI.getMaxLaneMaskForVReg(Reg);
-#endif
+
switch (auto Kind = getRegKind(Reg, MRI)) {
case SGPR32:
case VGPR32:
case AGPR32:
- assert(PrevMask.none() && NewMask == MaxMask);
Value[Kind] += Sign;
break;
case SGPR_TUPLE:
case VGPR_TUPLE:
case AGPR_TUPLE:
- assert(NewMask < MaxMask || NewMask == MaxMask);
assert(PrevMask < NewMask);
Value[Kind == SGPR_TUPLE ? SGPR32 : Kind == AGPR_TUPLE ? AGPR32 : VGPR32] +=
- Sign * (~PrevMask & NewMask).getNumLanes();
+ Sign * SIRegisterInfo::getNumCoveredRegs(~PrevMask & NewMask);
if (PrevMask.none()) {
assert(NewMask.any());
@@ -216,7 +218,7 @@ static LaneBitmask getUsedRegMask(const MachineOperand &MO,
return MRI.getTargetRegisterInfo()->getSubRegIndexLaneMask(SubReg);
auto MaxMask = MRI.getMaxLaneMaskForVReg(MO.getReg());
- if (MaxMask == LaneBitmask::getLane(0)) // cannot have subregs
+ if (SIRegisterInfo::getNumCoveredRegs(MaxMask) > 1) // cannot have subregs
return MaxMask;
// For a tentative schedule LIS isn't updated yet but livemask should remain
@@ -327,8 +329,9 @@ void GCNUpwardRPTracker::recede(const MachineInstr &MI) {
// update max pressure
MaxPressure = max(AtMIPressure, MaxPressure);
- for (const auto &MO : MI.defs()) {
- if (!MO.isReg() || !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
+ for (const auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.isDef() ||
+ !Register::isVirtualRegister(MO.getReg()) || MO.isDead())
continue;
auto Reg = MO.getReg();
@@ -403,8 +406,8 @@ void GCNDownwardRPTracker::advanceToNext() {
LastTrackedMI = &*NextMI++;
// Add new registers or mask bits.
- for (const auto &MO : LastTrackedMI->defs()) {
- if (!MO.isReg())
+ for (const auto &MO : LastTrackedMI->operands()) {
+ if (!MO.isReg() || !MO.isDef())
continue;
Register Reg = MO.getReg();
if (!Register::isVirtualRegister(Reg))
diff --git a/llvm/lib/Target/AMDGPU/GCNRegPressure.h b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
index 5862cdb04166..2ef79410719f 100644
--- a/llvm/lib/Target/AMDGPU/GCNRegPressure.h
+++ b/llvm/lib/Target/AMDGPU/GCNRegPressure.h
@@ -5,6 +5,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
+///
+/// \file
+/// This file defines the GCNRegPressure class, which tracks register pressure
+/// by bookkeeping the number of SGPRs/VGPRs used and the weights of large
+/// SGPR/VGPR tuples. It also implements a compare function, which compares
+/// different register pressures and declares the one with the higher
+/// occupancy the winner.
+///
+//===----------------------------------------------------------------------===//
#ifndef LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
#define LLVM_LIB_TARGET_AMDGPU_GCNREGPRESSURE_H
@@ -208,7 +216,7 @@ getLiveRegMap(Range &&R, bool After, LiveIntervals &LIS) {
auto SI = SII.getInstructionIndex(*I);
Indexes.push_back(After ? SI.getDeadSlot() : SI.getBaseIndex());
}
- std::sort(Indexes.begin(), Indexes.end());
+ llvm::sort(Indexes);
auto &MRI = (*R.begin())->getParent()->getParent()->getRegInfo();
DenseMap<MachineInstr *, GCNRPTracker::LiveRegSet> LiveRegMap;
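Both this file and GCNRegBankReassign.cpp above move from std::sort to llvm::sort; when LLVM is built with expensive checks, llvm::sort shuffles the range before sorting to flush out comparators that leave ties unordered, which is presumably also why BankStall::operator< gained a Bank tie-breaker. A standalone sketch of such a fully deterministic comparator (plain C++, invented values):

    #include <algorithm>
    #include <cstdio>
    #include <vector>

    struct BankStall {
      unsigned Bank;
      unsigned Stalls;
    };

    // Sort by descending stall count, breaking ties on the bank number so
    // the final order does not depend on the input order of equal elements.
    static bool moreStallsFirst(const BankStall &L, const BankStall &R) {
      if (L.Stalls == R.Stalls)
        return L.Bank < R.Bank;
      return L.Stalls > R.Stalls;
    }

    int main() {
      std::vector<BankStall> Stalls = {{2, 5}, {0, 5}, {1, 7}};
      std::sort(Stalls.begin(), Stalls.end(), moreStallsFirst);
      for (const BankStall &BS : Stalls)
        std::printf("bank %u: %u stalls\n", BS.Bank, BS.Stalls);
      return 0;
    }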
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
index e109eed5f607..deed50b6db7d 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.cpp
@@ -50,9 +50,9 @@ void GCNMaxOccupancySchedStrategy::initialize(ScheduleDAGMI *DAG) {
VGPRCriticalLimit = ST.getMaxNumVGPRs(TargetOccupancy);
} else {
SGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
- SRI->getSGPRPressureSet());
+ AMDGPU::RegisterPressureSets::SReg_32);
VGPRCriticalLimit = SRI->getRegPressureSetLimit(DAG->MF,
- SRI->getVGPRPressureSet());
+ AMDGPU::RegisterPressureSets::VGPR_32);
}
SGPRCriticalLimit -= ErrorMargin;
@@ -83,8 +83,8 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
TempTracker.getUpwardPressure(SU->getInstr(), Pressure, MaxPressure);
}
- unsigned NewSGPRPressure = Pressure[SRI->getSGPRPressureSet()];
- unsigned NewVGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned NewSGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ unsigned NewVGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
// If two instructions increase the pressure of different register sets
// by the same amount, the generic scheduler will prefer to schedule the
@@ -109,12 +109,12 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
// marked as RegExcess in tryCandidate() when they are compared with
// instructions that increase the register pressure.
if (ShouldTrackVGPRs && NewVGPRPressure >= VGPRExcessLimit) {
- Cand.RPDelta.Excess = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.Excess.setUnitInc(NewVGPRPressure - VGPRExcessLimit);
}
if (ShouldTrackSGPRs && NewSGPRPressure >= SGPRExcessLimit) {
- Cand.RPDelta.Excess = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.Excess = PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.Excess.setUnitInc(NewSGPRPressure - SGPRExcessLimit);
}
@@ -128,10 +128,12 @@ void GCNMaxOccupancySchedStrategy::initCandidate(SchedCandidate &Cand, SUnit *SU
if (SGPRDelta >= 0 || VGPRDelta >= 0) {
if (SGPRDelta > VGPRDelta) {
- Cand.RPDelta.CriticalMax = PressureChange(SRI->getSGPRPressureSet());
+ Cand.RPDelta.CriticalMax =
+ PressureChange(AMDGPU::RegisterPressureSets::SReg_32);
Cand.RPDelta.CriticalMax.setUnitInc(SGPRDelta);
} else {
- Cand.RPDelta.CriticalMax = PressureChange(SRI->getVGPRPressureSet());
+ Cand.RPDelta.CriticalMax =
+ PressureChange(AMDGPU::RegisterPressureSets::VGPR_32);
Cand.RPDelta.CriticalMax.setUnitInc(VGPRDelta);
}
}
@@ -145,8 +147,8 @@ void GCNMaxOccupancySchedStrategy::pickNodeFromQueue(SchedBoundary &Zone,
SchedCandidate &Cand) {
const SIRegisterInfo *SRI = static_cast<const SIRegisterInfo*>(TRI);
ArrayRef<unsigned> Pressure = RPTracker.getRegSetPressureAtPos();
- unsigned SGPRPressure = Pressure[SRI->getSGPRPressureSet()];
- unsigned VGPRPressure = Pressure[SRI->getVGPRPressureSet()];
+ unsigned SGPRPressure = Pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ unsigned VGPRPressure = Pressure[AMDGPU::RegisterPressureSets::VGPR_32];
ReadyQueue &Q = Zone.Available;
for (SUnit *SU : Q) {
@@ -231,33 +233,11 @@ SUnit *GCNMaxOccupancySchedStrategy::pickNodeBidirectional(bool &IsTopNode) {
// Pick best from BotCand and TopCand.
LLVM_DEBUG(dbgs() << "Top Cand: "; traceCandidate(TopCand);
dbgs() << "Bot Cand: "; traceCandidate(BotCand););
- SchedCandidate Cand;
- if (TopCand.Reason == BotCand.Reason) {
- Cand = BotCand;
- GenericSchedulerBase::CandReason TopReason = TopCand.Reason;
- TopCand.Reason = NoCand;
- GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
- if (TopCand.Reason != NoCand) {
- Cand.setBest(TopCand);
- } else {
- TopCand.Reason = TopReason;
- }
- } else {
- if (TopCand.Reason == RegExcess && TopCand.RPDelta.Excess.getUnitInc() <= 0) {
- Cand = TopCand;
- } else if (BotCand.Reason == RegExcess && BotCand.RPDelta.Excess.getUnitInc() <= 0) {
- Cand = BotCand;
- } else if (TopCand.Reason == RegCritical && TopCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
- Cand = TopCand;
- } else if (BotCand.Reason == RegCritical && BotCand.RPDelta.CriticalMax.getUnitInc() <= 0) {
- Cand = BotCand;
- } else {
- if (BotCand.Reason > TopCand.Reason) {
- Cand = TopCand;
- } else {
- Cand = BotCand;
- }
- }
+ SchedCandidate Cand = BotCand;
+ TopCand.Reason = NoCand;
+ GenericScheduler::tryCandidate(Cand, TopCand, nullptr);
+ if (TopCand.Reason != NoCand) {
+ Cand.setBest(TopCand);
}
LLVM_DEBUG(dbgs() << "Picking: "; traceCandidate(Cand););
@@ -316,13 +296,13 @@ GCNScheduleDAGMILive::GCNScheduleDAGMILive(MachineSchedContext *C,
ST(MF.getSubtarget<GCNSubtarget>()),
MFI(*MF.getInfo<SIMachineFunctionInfo>()),
StartingOccupancy(MFI.getOccupancy()),
- MinOccupancy(StartingOccupancy), Stage(0), RegionIdx(0) {
+ MinOccupancy(StartingOccupancy), Stage(Collect), RegionIdx(0) {
LLVM_DEBUG(dbgs() << "Starting occupancy is " << StartingOccupancy << ".\n");
}
void GCNScheduleDAGMILive::schedule() {
- if (Stage == 0) {
+ if (Stage == Collect) {
// Just record regions at the first pass.
Regions.push_back(std::make_pair(RegionBegin, RegionEnd));
return;
@@ -348,6 +328,7 @@ void GCNScheduleDAGMILive::schedule() {
ScheduleDAGMILive::schedule();
Regions[RegionIdx] = std::make_pair(RegionBegin, RegionEnd);
+ RescheduleRegions[RegionIdx] = false;
if (!LIS)
return;
@@ -389,20 +370,28 @@ void GCNScheduleDAGMILive::schedule() {
<< MinOccupancy << ".\n");
}
+ unsigned MaxVGPRs = ST.getMaxNumVGPRs(MF);
+ unsigned MaxSGPRs = ST.getMaxNumSGPRs(MF);
+ if (PressureAfter.getVGPRNum() > MaxVGPRs ||
+ PressureAfter.getSGPRNum() > MaxSGPRs)
+ RescheduleRegions[RegionIdx] = true;
+
if (WavesAfter >= MinOccupancy) {
- unsigned TotalVGPRs = AMDGPU::IsaInfo::getAddressableNumVGPRs(&ST);
- unsigned TotalSGPRs = AMDGPU::IsaInfo::getAddressableNumSGPRs(&ST);
- if (WavesAfter > MFI.getMinWavesPerEU() ||
+ if (Stage == UnclusteredReschedule &&
+ !PressureAfter.less(ST, PressureBefore)) {
+ LLVM_DEBUG(dbgs() << "Unclustered reschedule did not help.\n");
+ } else if (WavesAfter > MFI.getMinWavesPerEU() ||
PressureAfter.less(ST, PressureBefore) ||
- (TotalVGPRs >= PressureAfter.getVGPRNum() &&
- TotalSGPRs >= PressureAfter.getSGPRNum())) {
+ !RescheduleRegions[RegionIdx]) {
Pressure[RegionIdx] = PressureAfter;
return;
+ } else {
+ LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
- LLVM_DEBUG(dbgs() << "New pressure will result in more spilling.\n");
}
LLVM_DEBUG(dbgs() << "Attempting to revert scheduling.\n");
+ RescheduleRegions[RegionIdx] = true;
RegionEnd = RegionBegin;
for (MachineInstr *MI : Unsched) {
if (MI->isDebugInstr())
@@ -532,33 +521,55 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
LiveIns.resize(Regions.size());
Pressure.resize(Regions.size());
+ RescheduleRegions.resize(Regions.size());
+ RescheduleRegions.set();
if (!Regions.empty())
BBLiveInMap = getBBLiveInMap();
+ std::vector<std::unique_ptr<ScheduleDAGMutation>> SavedMutations;
+
do {
Stage++;
RegionIdx = 0;
MachineBasicBlock *MBB = nullptr;
- if (Stage > 1) {
+ if (Stage > InitialSchedule) {
+ if (!LIS)
+ break;
+
// Retry function scheduling if we found resulting occupancy and it is
// lower than used for first pass scheduling. This will give more freedom
// to schedule low register pressure blocks.
// Code is partially copied from MachineSchedulerBase::scheduleRegions().
- if (!LIS || StartingOccupancy <= MinOccupancy)
- break;
+ if (Stage == UnclusteredReschedule) {
+ if (RescheduleRegions.none())
+ continue;
+ LLVM_DEBUG(dbgs() <<
+ "Retrying function scheduling without clustering.\n");
+ }
+
+ if (Stage == ClusteredLowOccupancyReschedule) {
+ if (StartingOccupancy <= MinOccupancy)
+ break;
- LLVM_DEBUG(
- dbgs()
- << "Retrying function scheduling with lowest recorded occupancy "
- << MinOccupancy << ".\n");
+ LLVM_DEBUG(
+ dbgs()
+ << "Retrying function scheduling with lowest recorded occupancy "
+ << MinOccupancy << ".\n");
- S.setTargetOccupancy(MinOccupancy);
+ S.setTargetOccupancy(MinOccupancy);
+ }
}
+ if (Stage == UnclusteredReschedule)
+ SavedMutations.swap(Mutations);
+
for (auto Region : Regions) {
+ if (Stage == UnclusteredReschedule && !RescheduleRegions[RegionIdx])
+ continue;
+
RegionBegin = Region.first;
RegionEnd = Region.second;
@@ -566,7 +577,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
if (MBB) finishBlock();
MBB = RegionBegin->getParent();
startBlock(MBB);
- if (Stage == 1)
+ if (Stage == InitialSchedule)
computeBlockPressure(MBB);
}
@@ -594,5 +605,7 @@ void GCNScheduleDAGMILive::finalizeSchedule() {
}
finishBlock();
- } while (Stage < 2);
+ if (Stage == UnclusteredReschedule)
+ SavedMutations.swap(Mutations);
+ } while (Stage != LastStage);
}
diff --git a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
index dd687a930c79..2d81d9977c31 100644
--- a/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
+++ b/llvm/lib/Target/AMDGPU/GCNSchedStrategy.h
@@ -64,6 +64,14 @@ public:
class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
+ enum : unsigned {
+ Collect,
+ InitialSchedule,
+ UnclusteredReschedule,
+ ClusteredLowOccupancyReschedule,
+ LastStage = ClusteredLowOccupancyReschedule
+ };
+
const GCNSubtarget &ST;
SIMachineFunctionInfo &MFI;
@@ -84,6 +92,10 @@ class GCNScheduleDAGMILive final : public ScheduleDAGMILive {
SmallVector<std::pair<MachineBasicBlock::iterator,
MachineBasicBlock::iterator>, 32> Regions;
+ // Records whether a region is not yet scheduled, its schedule has been
+ // reverted, or we otherwise want to reschedule it.
+ BitVector RescheduleRegions;
+
// Region live-in cache.
SmallVector<GCNRPTracker::LiveRegSet, 32> LiveIns;
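The scheduler driver now iterates over named stages and keeps a per-region bit marking regions still worth rescheduling. A compact sketch of that driver shape in plain C++ (the stage and region handling is illustrative only, not the actual GCNScheduleDAGMILive logic):

    #include <cstdio>
    #include <vector>

    enum Stage : unsigned {
      Collect,
      InitialSchedule,
      UnclusteredReschedule,
      ClusteredLowOccupancyReschedule,
      LastStage = ClusteredLowOccupancyReschedule
    };

    int main() {
      const unsigned NumRegions = 4;
      std::vector<bool> Reschedule(NumRegions, true); // all regions start dirty

      for (unsigned S = Collect; ; ++S) {
        for (unsigned R = 0; R < NumRegions; ++R) {
          // The unclustered pass only revisits regions still marked dirty.
          if (S == UnclusteredReschedule && !Reschedule[R])
            continue;
          std::printf("stage %u schedules region %u\n", S, R);
          Reschedule[R] = false; // a kept schedule clears the bit
        }
        if (S == LastStage)
          break;
      }
      return 0;
    }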
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
index 1f94ab799122..ea6e9038fd1e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUAsmBackend.cpp
@@ -17,6 +17,7 @@
#include "llvm/MC/MCFixupKindInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/TargetRegistry.h"
#include "Utils/AMDGPUBaseInfo.h"
@@ -39,8 +40,8 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const override;
@@ -53,12 +54,13 @@ public:
} //End anonymous namespace
-void AMDGPUAsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void AMDGPUAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
+ MCInst Res;
unsigned RelaxedOpcode = AMDGPU::getSOPPWithRelaxation(Inst.getOpcode());
Res.setOpcode(RelaxedOpcode);
Res.addOperand(Inst.getOperand(0));
+ Inst = std::move(Res);
return;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
index d352219a7a98..619fde74e88d 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUELFObjectWriter.cpp
@@ -6,8 +6,10 @@
//
//===----------------------------------------------------------------------===//
+#include "AMDGPUFixupKinds.h"
#include "AMDGPUMCTargetDesc.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCELFObjectWriter.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCFixup.h"
@@ -80,6 +82,15 @@ unsigned AMDGPUELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_AMDGPU_ABS64;
}
+ if (Fixup.getTargetKind() == AMDGPU::fixup_si_sopp_br) {
+ const auto *SymA = Target.getSymA();
+ assert(SymA);
+
+ Ctx.reportError(Fixup.getLoc(),
+ Twine("undefined label '") + SymA->getSymbol().getName() + "'");
+ return ELF::R_AMDGPU_NONE;
+ }
+
llvm_unreachable("unhandled relocation type");
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
index f65dc25d7eec..fe063d33ea3e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
@@ -26,6 +27,28 @@
using namespace llvm;
using namespace llvm::AMDGPU;
+static cl::opt<bool> Keep16BitSuffixes(
+ "amdgpu-keep-16-bit-reg-suffixes",
+ cl::desc("Keep .l and .h suffixes in asm for debugging purposes"),
+ cl::init(false),
+ cl::ReallyHidden);
+
+void AMDGPUInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+ // FIXME: The current implementation of
+ // AsmParser::parseRegisterOrRegisterNumber in MC implies we either emit this
+ // as an integer or we provide a name which represents a physical register.
+ // For CFI instructions we really want to emit a name for the DWARF register
+ // instead, because there may be multiple DWARF registers corresponding to a
+ // single physical register. One case where this problem manifests is with
+ // wave32/wave64 where using the physical register name is ambiguous: if we
+ // write e.g. `.cfi_undefined v0` we lose information about the wavefront
+ // size which we need to encode the register in the final DWARF. Ideally we
+ // would extend MC to support parsing DWARF register names so we could do
+ // something like `.cfi_undefined dwarf_wave32_v0`. For now we just live with
+ // non-pretty DWARF register names in assembly text.
+ OS << RegNo;
+}
+
void AMDGPUInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &OS) {
@@ -164,10 +187,10 @@ void AMDGPUInstPrinter::printSMRDOffset8(const MCInst *MI, unsigned OpNo,
printU32ImmOperand(MI, OpNo, STI, O);
}
-void AMDGPUInstPrinter::printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+void AMDGPUInstPrinter::printSMEMOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI,
raw_ostream &O) {
- printU32ImmOperand(MI, OpNo, STI, O);
+ O << formatHex(MI->getOperand(OpNo).getImm());
}
void AMDGPUInstPrinter::printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
@@ -244,6 +267,11 @@ void AMDGPUInstPrinter::printR128A16(const MCInst *MI, unsigned OpNo,
printNamedBit(MI, OpNo, O, "r128");
}
+void AMDGPUInstPrinter::printGFX10A16(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printNamedBit(MI, OpNo, O, "a16");
+}
+
void AMDGPUInstPrinter::printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O) {
printNamedBit(MI, OpNo, O, "lwe");
@@ -287,7 +315,6 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
switch (RegNo) {
case AMDGPU::FP_REG:
case AMDGPU::SP_REG:
- case AMDGPU::SCRATCH_WAVE_OFFSET_REG:
case AMDGPU::PRIVATE_RSRC_REG:
llvm_unreachable("pseudo-register should not ever be emitted");
case AMDGPU::SCC:
@@ -297,7 +324,12 @@ void AMDGPUInstPrinter::printRegOperand(unsigned RegNo, raw_ostream &O,
}
#endif
- O << getRegisterName(RegNo);
+ StringRef RegName(getRegisterName(RegNo));
+ if (!Keep16BitSuffixes)
+ if (!RegName.consume_back(".l"))
+ RegName.consume_back(".h");
+
+ O << RegName;
}
void AMDGPUInstPrinter::printVOPDst(const MCInst *MI, unsigned OpNo,
@@ -346,11 +378,21 @@ void AMDGPUInstPrinter::printVINTRPDst(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, STI, O);
}
+void AMDGPUInstPrinter::printImmediateInt16(uint32_t Imm,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ int16_t SImm = static_cast<int16_t>(Imm);
+ if (isInlinableIntLiteral(SImm))
+ O << SImm;
+ else
+ O << formatHex(static_cast<uint64_t>(Imm));
+}
+
void AMDGPUInstPrinter::printImmediate16(uint32_t Imm,
const MCSubtargetInfo &STI,
raw_ostream &O) {
int16_t SImm = static_cast<int16_t>(Imm);
- if (SImm >= -16 && SImm <= 64) {
+ if (isInlinableIntLiteral(SImm)) {
O << SImm;
return;
}
@@ -518,7 +560,8 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
if (Op.isReg()) {
printRegOperand(Op.getReg(), O, MRI);
} else if (Op.isImm()) {
- switch (Desc.OpInfo[OpNo].OperandType) {
+ const uint8_t OpTy = Desc.OpInfo[OpNo].OperandType;
+ switch (OpTy) {
case AMDGPU::OPERAND_REG_IMM_INT32:
case AMDGPU::OPERAND_REG_IMM_FP32:
case AMDGPU::OPERAND_REG_INLINE_C_INT32:
@@ -535,10 +578,12 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
printImmediate64(Op.getImm(), STI, O);
break;
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_INT16:
+ printImmediateInt16(Op.getImm(), STI, O);
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
case AMDGPU::OPERAND_REG_IMM_FP16:
printImmediate16(Op.getImm(), STI, O);
break;
@@ -549,11 +594,19 @@ void AMDGPUInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
printImmediate32(Op.getImm(), STI, O);
break;
}
+
+ // Deal with 16-bit FP inline immediates not working.
+ if (OpTy == AMDGPU::OPERAND_REG_IMM_V2FP16) {
+ printImmediate16(static_cast<uint16_t>(Op.getImm()), STI, O);
+ break;
+ }
LLVM_FALLTHROUGH;
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ printImmediateInt16(static_cast<uint16_t>(Op.getImm()), STI, O);
+ break;
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16:
printImmediateV216(Op.getImm(), STI, O);
break;
case MCOI::OPERAND_UNKNOWN:
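printRegOperand above strips the '.l'/'.h' suffix that 16-bit subregisters carry in their generated names, unless the hidden -amdgpu-keep-16-bit-reg-suffixes flag is set, and it drops at most one suffix by only trying '.h' when '.l' was not consumed. The same logic in portable C++, with a small stand-in for StringRef::consume_back (the helper and sample names are illustrative):

    #include <cstdio>
    #include <string_view>

    // Remove Suffix from S if present and report whether it was removed,
    // mimicking llvm::StringRef::consume_back.
    static bool consumeBack(std::string_view &S, std::string_view Suffix) {
      if (S.size() < Suffix.size() ||
          S.substr(S.size() - Suffix.size()) != Suffix)
        return false;
      S.remove_suffix(Suffix.size());
      return true;
    }

    int main() {
      for (std::string_view Name : {"v1.l", "v1.h", "v1"}) {
        std::string_view Cleaned = Name;
        // Only try ".h" if ".l" was not found, so at most one suffix goes.
        if (!consumeBack(Cleaned, ".l"))
          consumeBack(Cleaned, ".h");
        std::printf("%.*s -> %.*s\n", (int)Name.size(), Name.data(),
                    (int)Cleaned.size(), Cleaned.data());
      }
      return 0;
    }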
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
index ba53003e9041..6dfd23ea72e6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUInstPrinter.h
@@ -23,6 +23,7 @@ public:
: MCInstPrinter(MAI, MII, MRI) {}
//Autogenerated by tblgen
+ void printRegName(raw_ostream &OS, unsigned RegNo) const override;
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
@@ -60,7 +61,7 @@ private:
raw_ostream &O);
void printSMRDOffset8(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printSMRDOffset20(const MCInst *MI, unsigned OpNo,
+ void printSMEMOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printSMRDLiteralOffset(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -86,6 +87,8 @@ private:
raw_ostream &O);
void printR128A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printGFX10A16(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printLWE(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printD16(const MCInst *MI, unsigned OpNo,
@@ -102,8 +105,12 @@ private:
raw_ostream &O);
void printVINTRPDst(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printImmediateInt16(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediate16(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printImmediateIntV216(uint32_t Imm, const MCSubtargetInfo &STI,
+ raw_ostream &O);
void printImmediateV216(uint32_t Imm, const MCSubtargetInfo &STI,
raw_ostream &O);
void printImmediate32(uint32_t Imm, const MCSubtargetInfo &STI,
@@ -112,6 +119,10 @@ private:
raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printOperand(MI, OpNum, STI, O);
+ }
void printOperandAndFPInputMods(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
void printOperandAndIntInputMods(const MCInst *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
index 9644e66fda4e..687cfef4559f 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCAsmInfo.cpp
@@ -43,6 +43,9 @@ AMDGPUMCAsmInfo::AMDGPUMCAsmInfo(const Triple &TT,
WeakRefDirective = ".weakref\t";
//===--- Dwarf Emission Directives -----------------------------------===//
SupportsDebugInformation = true;
+ DwarfRegNumForCFI = true;
+
+ UseIntegratedAssembler = false;
}
bool AMDGPUMCAsmInfo::shouldOmitSectionDirective(StringRef SectionName) const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
index 62757a707890..d7d8c8181b02 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCCodeEmitter.h
@@ -51,6 +51,12 @@ public:
return 0;
}
+ virtual unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ return 0;
+ }
+
virtual unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
index 9507836c64c2..7d3235efc59e 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.cpp
@@ -61,7 +61,13 @@ static MCRegisterInfo *createAMDGPUMCRegisterInfo(const Triple &TT) {
if (TT.getArch() == Triple::r600)
InitR600MCRegisterInfo(X, 0);
else
- InitAMDGPUMCRegisterInfo(X, 0);
+ InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG);
+ return X;
+}
+
+MCRegisterInfo *llvm::createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour) {
+ MCRegisterInfo *X = new MCRegisterInfo();
+ InitAMDGPUMCRegisterInfo(X, AMDGPU::PC_REG, DwarfFlavour);
return X;
}
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
index 9754d31fee60..b9cdbc6502e5 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUMCTargetDesc.h
@@ -33,6 +33,10 @@ class Target;
class Triple;
class raw_pwrite_stream;
+enum AMDGPUDwarfFlavour { Wave64 = 0, Wave32 = 1 };
+
+MCRegisterInfo *createGCNMCRegisterInfo(AMDGPUDwarfFlavour DwarfFlavour);
+
MCCodeEmitter *createR600MCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
MCContext &Ctx);
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
index fef665c2900e..3d202d7960d6 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.cpp
@@ -43,7 +43,7 @@ using namespace llvm::AMDGPU::HSAMD;
bool AMDGPUTargetStreamer::EmitHSAMetadataV2(StringRef HSAMetadataString) {
HSAMD::Metadata HSAMetadata;
- if (HSAMD::fromString(HSAMetadataString, HSAMetadata))
+ if (HSAMD::fromString(std::string(HSAMetadataString), HSAMetadata))
return false;
return EmitHSAMetadata(HSAMetadata);
@@ -97,6 +97,7 @@ StringRef AMDGPUTargetStreamer::getArchNameFromElfMach(unsigned ElfMach) {
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010: AK = GK_GFX1010; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011: AK = GK_GFX1011; break;
case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012: AK = GK_GFX1012; break;
+ case ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030: AK = GK_GFX1030; break;
case ELF::EF_AMDGPU_MACH_NONE: AK = GK_NONE; break;
}
@@ -148,6 +149,7 @@ unsigned AMDGPUTargetStreamer::getElfMach(StringRef GPU) {
case GK_GFX1010: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1010;
case GK_GFX1011: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1011;
case GK_GFX1012: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1012;
+ case GK_GFX1030: return ELF::EF_AMDGPU_MACH_AMDGCN_GFX1030;
case GK_NONE: return ELF::EF_AMDGPU_MACH_NONE;
}
@@ -210,9 +212,9 @@ void AMDGPUTargetAsmStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
void AMDGPUTargetAsmStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
- unsigned Align) {
- OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", " << Align
- << '\n';
+ Align Alignment) {
+ OS << "\t.amdgpu_lds " << Symbol->getName() << ", " << Size << ", "
+ << Alignment.value() << '\n';
}
bool AMDGPUTargetAsmStreamer::EmitISAVersion(StringRef IsaVersionString) {
@@ -393,9 +395,9 @@ void AMDGPUTargetAsmStreamer::EmitAmdhsaKernelDescriptor(
// AMDGPUTargetELFStreamer
//===----------------------------------------------------------------------===//
-AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(
- MCStreamer &S, const MCSubtargetInfo &STI)
- : AMDGPUTargetStreamer(S), Streamer(S) {
+AMDGPUTargetELFStreamer::AMDGPUTargetELFStreamer(MCStreamer &S,
+ const MCSubtargetInfo &STI)
+ : AMDGPUTargetStreamer(S), Streamer(S), Os(STI.getTargetTriple().getOS()) {
MCAssembler &MCA = getStreamer().getAssembler();
unsigned EFlags = MCA.getELFHeaderEFlags();
@@ -427,7 +429,7 @@ void AMDGPUTargetELFStreamer::finish() {
if (Blob.empty())
return;
EmitNote(Vendor, MCConstantExpr::create(Blob.size(), getContext()), Type,
- [&](MCELFStreamer &OS) { OS.EmitBytes(Blob); });
+ [&](MCELFStreamer &OS) { OS.emitBytes(Blob); });
}
void AMDGPUTargetELFStreamer::EmitNote(
@@ -438,16 +440,22 @@ void AMDGPUTargetELFStreamer::EmitNote(
auto NameSZ = Name.size() + 1;
+ unsigned NoteFlags = 0;
+ // TODO Apparently, this is currently needed for OpenCL as mentioned in
+ // https://reviews.llvm.org/D74995
+ if (Os == Triple::AMDHSA)
+ NoteFlags = ELF::SHF_ALLOC;
+
S.PushSection();
- S.SwitchSection(Context.getELFSection(
- ElfNote::SectionName, ELF::SHT_NOTE, ELF::SHF_ALLOC));
- S.EmitIntValue(NameSZ, 4); // namesz
- S.EmitValue(DescSZ, 4); // descz
- S.EmitIntValue(NoteType, 4); // type
- S.EmitBytes(Name); // name
- S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ S.SwitchSection(
+ Context.getELFSection(ElfNote::SectionName, ELF::SHT_NOTE, NoteFlags));
+ S.emitInt32(NameSZ); // namesz
+ S.emitValue(DescSZ, 4); // descsz
+ S.emitInt32(NoteType); // type
+ S.emitBytes(Name); // name
+ S.emitValueToAlignment(4, 0, 1, 0); // padding 0
EmitDesc(S); // desc
- S.EmitValueToAlignment(4, 0, 1, 0); // padding 0
+ S.emitValueToAlignment(4, 0, 1, 0); // padding 0
S.PopSection();
}
@@ -458,8 +466,8 @@ void AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectVersion(
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(8, getContext()),
ElfNote::NT_AMDGPU_HSA_CODE_OBJECT_VERSION, [&](MCELFStreamer &OS) {
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
+ OS.emitInt32(Major);
+ OS.emitInt32(Minor);
});
}
@@ -478,15 +486,15 @@ AMDGPUTargetELFStreamer::EmitDirectiveHSACodeObjectISA(uint32_t Major,
EmitNote(ElfNote::NoteNameV2, MCConstantExpr::create(DescSZ, getContext()),
ElfNote::NT_AMDGPU_HSA_ISA, [&](MCELFStreamer &OS) {
- OS.EmitIntValue(VendorNameSize, 2);
- OS.EmitIntValue(ArchNameSize, 2);
- OS.EmitIntValue(Major, 4);
- OS.EmitIntValue(Minor, 4);
- OS.EmitIntValue(Stepping, 4);
- OS.EmitBytes(VendorName);
- OS.EmitIntValue(0, 1); // NULL terminate VendorName
- OS.EmitBytes(ArchName);
- OS.EmitIntValue(0, 1); // NULL terminte ArchName
+ OS.emitInt16(VendorNameSize);
+ OS.emitInt16(ArchNameSize);
+ OS.emitInt32(Major);
+ OS.emitInt32(Minor);
+ OS.emitInt32(Stepping);
+ OS.emitBytes(VendorName);
+ OS.emitInt8(0); // NULL terminate VendorName
+ OS.emitBytes(ArchName);
+ OS.emitInt8(0); // NULL terminate ArchName
});
}
@@ -495,7 +503,7 @@ AMDGPUTargetELFStreamer::EmitAMDKernelCodeT(const amd_kernel_code_t &Header) {
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.EmitBytes(StringRef((const char*)&Header, sizeof(Header)));
+ OS.emitBytes(StringRef((const char*)&Header, sizeof(Header)));
OS.PopSection();
}
@@ -507,9 +515,7 @@ void AMDGPUTargetELFStreamer::EmitAMDGPUSymbolType(StringRef SymbolName,
}
void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
- unsigned Align) {
- assert(isPowerOf2_32(Align));
-
+ Align Alignment) {
MCSymbolELF *SymbolELF = cast<MCSymbolELF>(Symbol);
SymbolELF->setType(ELF::STT_OBJECT);
@@ -518,7 +524,7 @@ void AMDGPUTargetELFStreamer::emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
SymbolELF->setExternal(true);
}
- if (SymbolELF->declareCommon(Size, Align, true)) {
+ if (SymbolELF->declareCommon(Size, Alignment.value(), true)) {
report_fatal_error("Symbol: " + Symbol->getName() +
" redeclared as different type");
}
@@ -539,9 +545,9 @@ bool AMDGPUTargetELFStreamer::EmitISAVersion(StringRef IsaVersionString) {
EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_ISA,
[&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(IsaVersionString);
- OS.EmitLabel(DescEnd);
+ OS.emitLabel(DescBegin);
+ OS.emitBytes(IsaVersionString);
+ OS.emitLabel(DescEnd);
});
return true;
}
@@ -566,9 +572,9 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(msgpack::Document &HSAMetadataDoc,
EmitNote(ElfNote::NoteNameV3, DescSZ, ELF::NT_AMDGPU_METADATA,
[&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(HSAMetadataString);
- OS.EmitLabel(DescEnd);
+ OS.emitLabel(DescBegin);
+ OS.emitBytes(HSAMetadataString);
+ OS.emitLabel(DescEnd);
});
return true;
}
@@ -590,9 +596,9 @@ bool AMDGPUTargetELFStreamer::EmitHSAMetadata(
EmitNote(ElfNote::NoteNameV2, DescSZ, ELF::NT_AMD_AMDGPU_HSA_METADATA,
[&](MCELFStreamer &OS) {
- OS.EmitLabel(DescBegin);
- OS.EmitBytes(HSAMetadataString);
- OS.EmitLabel(DescEnd);
+ OS.emitLabel(DescBegin);
+ OS.emitBytes(HSAMetadataString);
+ OS.emitLabel(DescEnd);
});
return true;
}
@@ -602,9 +608,9 @@ bool AMDGPUTargetELFStreamer::EmitCodeEnd() {
MCStreamer &OS = getStreamer();
OS.PushSection();
- OS.EmitValueToAlignment(64, Encoded_s_code_end, 4);
+ OS.emitValueToAlignment(64, Encoded_s_code_end, 4);
for (unsigned I = 0; I < 48; ++I)
- OS.EmitIntValue(Encoded_s_code_end, 4);
+ OS.emitInt32(Encoded_s_code_end);
OS.PopSection();
return true;
}
@@ -637,22 +643,22 @@ void AMDGPUTargetELFStreamer::EmitAmdhsaKernelDescriptor(
if (KernelCodeSymbol->getVisibility() == ELF::STV_DEFAULT)
KernelCodeSymbol->setVisibility(ELF::STV_PROTECTED);
- Streamer.EmitLabel(KernelDescriptorSymbol);
- Streamer.EmitBytes(StringRef(
+ Streamer.emitLabel(KernelDescriptorSymbol);
+ Streamer.emitBytes(StringRef(
(const char*)&(KernelDescriptor),
offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset)));
// FIXME: Remove the use of VK_AMDGPU_REL64 in the expression below. The
// expression being created is:
// (start of kernel code) - (start of kernel descriptor)
// It implies R_AMDGPU_REL64, but ends up being R_AMDGPU_ABS64.
- Streamer.EmitValue(MCBinaryExpr::createSub(
+ Streamer.emitValue(MCBinaryExpr::createSub(
MCSymbolRefExpr::create(
KernelCodeSymbol, MCSymbolRefExpr::VK_AMDGPU_REL64, Context),
MCSymbolRefExpr::create(
KernelDescriptorSymbol, MCSymbolRefExpr::VK_None, Context),
Context),
sizeof(KernelDescriptor.kernel_code_entry_byte_offset));
- Streamer.EmitBytes(StringRef(
+ Streamer.emitBytes(StringRef(
(const char*)&(KernelDescriptor) +
offsetof(amdhsa::kernel_descriptor_t, kernel_code_entry_byte_offset) +
sizeof(KernelDescriptor.kernel_code_entry_byte_offset),
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
index 683b3e363b9a..a19d4646deb2 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/AMDGPUTargetStreamer.h
@@ -54,7 +54,7 @@ public:
virtual void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) = 0;
virtual void emitAMDGPULDS(MCSymbol *Symbol, unsigned Size,
- unsigned Align) = 0;
+ Align Alignment) = 0;
/// \returns True on success, false on failure.
virtual bool EmitISAVersion(StringRef IsaVersionString) = 0;
@@ -110,7 +110,7 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
- void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
@@ -133,6 +133,7 @@ public:
class AMDGPUTargetELFStreamer final : public AMDGPUTargetStreamer {
MCStreamer &Streamer;
+ Triple::OSType Os;
void EmitNote(StringRef Name, const MCExpr *DescSize, unsigned NoteType,
function_ref<void(MCELFStreamer &)> EmitDesc);
@@ -157,7 +158,7 @@ public:
void EmitAMDGPUSymbolType(StringRef SymbolName, unsigned Type) override;
- void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, unsigned Align) override;
+ void emitAMDGPULDS(MCSymbol *Sym, unsigned Size, Align Alignment) override;
/// \returns True on success, false on failure.
bool EmitISAVersion(StringRef IsaVersionString) override;
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
index 2f1f4e7a0392..f61470573050 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/R600MCCodeEmitter.cpp
@@ -47,7 +47,7 @@ public:
/// Encode the instruction and write it to the OS.
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI) const override;
/// \returns the encoding for an MCOperand.
uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
diff --git a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
index f8ec3c36f019..2cd6c3a81d2b 100644
--- a/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
+++ b/llvm/lib/Target/AMDGPU/MCTargetDesc/SIMCCodeEmitter.cpp
@@ -13,7 +13,6 @@
//===----------------------------------------------------------------------===//
#include "AMDGPU.h"
-#include "AMDGPURegisterInfo.h"
#include "MCTargetDesc/AMDGPUFixupKinds.h"
#include "MCTargetDesc/AMDGPUMCCodeEmitter.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -71,6 +70,10 @@ public:
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
+ unsigned getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
unsigned getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const override;
@@ -105,6 +108,11 @@ static uint32_t getIntInlineImmEncoding(IntTy Imm) {
return 0;
}
+static uint32_t getLit16IntEncoding(uint16_t Val, const MCSubtargetInfo &STI) {
+ uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
+ return IntImm == 0 ? 255 : IntImm;
+}
+
static uint32_t getLit16Encoding(uint16_t Val, const MCSubtargetInfo &STI) {
uint16_t IntImm = getIntInlineImmEncoding(static_cast<int16_t>(Val));
if (IntImm != 0)
@@ -249,23 +257,27 @@ uint32_t SIMCCodeEmitter::getLitEncoding(const MCOperand &MO,
return getLit64Encoding(static_cast<uint64_t>(Imm), STI);
case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16:
// FIXME Is this correct? What do inline immediates do on SI for f16 src
// which does not have f16 support?
return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
-
case AMDGPU::OPERAND_REG_IMM_V2INT16:
- case AMDGPU::OPERAND_REG_IMM_V2FP16:
+ case AMDGPU::OPERAND_REG_IMM_V2FP16: {
if (!isUInt<16>(Imm) && STI.getFeatureBits()[AMDGPU::FeatureVOP3Literal])
return getLit32Encoding(static_cast<uint32_t>(Imm), STI);
+ if (OpInfo.OperandType == AMDGPU::OPERAND_REG_IMM_V2FP16)
+ return getLit16Encoding(static_cast<uint16_t>(Imm), STI);
LLVM_FALLTHROUGH;
+ }
case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ return getLit16IntEncoding(static_cast<uint16_t>(Imm), STI);
+ case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint16_t Lo16 = static_cast<uint16_t>(Imm);
uint32_t Encoding = getLit16Encoding(Lo16, STI);
@@ -359,6 +371,15 @@ unsigned SIMCCodeEmitter::getSOPPBrEncoding(const MCInst &MI, unsigned OpNo,
return getMachineOpValue(MI, MO, Fixups, STI);
}
+unsigned SIMCCodeEmitter::getSMEMOffsetEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ auto Offset = MI.getOperand(OpNo).getImm();
+ // VI only supports 20-bit unsigned offsets.
+ assert(!AMDGPU::isVI(STI) || isUInt<20>(Offset));
+ return Offset;
+}
+
unsigned
SIMCCodeEmitter::getSDWASrcEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
@@ -419,7 +440,13 @@ SIMCCodeEmitter::getAVOperandEncoding(const MCInst &MI, unsigned OpNo,
// instructions use acc[0:1] modifier bits to distinguish. These bits are
// encoded as a virtual 9th bit of the register for these operands.
if (MRI.getRegClass(AMDGPU::AGPR_32RegClassID).contains(Reg) ||
- MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg))
+ MRI.getRegClass(AMDGPU::AReg_64RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_96RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_128RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_160RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_192RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AReg_256RegClassID).contains(Reg) ||
+ MRI.getRegClass(AMDGPU::AGPR_LO16RegClassID).contains(Reg))
Enc |= 512;
return Enc;
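getLit16IntEncoding above returns the inline-immediate operand encoding when the 16-bit integer has one, and otherwise 255, the operand value that makes the hardware fetch a trailing 32-bit literal. A simplified, standalone version of that selection (integer immediates only; the FP16 path and hardware float constants are handled separately in the patch):

    #include <cstdint>
    #include <cstdio>
    #include <cstdlib>

    // Integer inline immediates: 0..64 encode as 128..192 and -1..-16 as
    // 193..208; everything else returns 0, meaning "no inline form exists".
    static uint32_t intInlineImmEncoding(int16_t Imm) {
      if (Imm >= 0 && Imm <= 64)
        return 128 + Imm;
      if (Imm >= -16 && Imm <= -1)
        return 192 + std::abs(Imm);
      return 0;
    }

    // Mirrors getLit16IntEncoding: fall back to operand encoding 255, which
    // tells the hardware to read the value from a trailing literal dword.
    static uint32_t lit16IntEncoding(uint16_t Val) {
      uint32_t IntImm = intInlineImmEncoding(static_cast<int16_t>(Val));
      return IntImm == 0 ? 255 : IntImm;
    }

    int main() {
      std::printf("enc(7)      = %u\n", lit16IntEncoding(7));            // 135, inline
      std::printf("enc(-3)     = %u\n", lit16IntEncoding((uint16_t)-3)); // 195, inline
      std::printf("enc(0x1234) = %u\n", lit16IntEncoding(0x1234));       // 255, literal
      return 0;
    }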
diff --git a/llvm/lib/Target/AMDGPU/MIMGInstructions.td b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
index 4006a6205fb8..2bfc2d579533 100644
--- a/llvm/lib/Target/AMDGPU/MIMGInstructions.td
+++ b/llvm/lib/Target/AMDGPU/MIMGInstructions.td
@@ -1,4 +1,4 @@
-//===-- MIMGInstructions.td - MIMG Instruction Defintions -----------------===//
+//===-- MIMGInstructions.td - MIMG Instruction Definitions ----------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -35,6 +35,7 @@ class MIMGBaseOpcode : PredicateControl {
bit Gather4 = 0;
bits<8> NumExtraArgs = 0;
bit Gradients = 0;
+ bit G16 = 0;
bit Coordinates = 1;
bit LodOrClampOrMip = 0;
bit HasD16 = 0;
@@ -47,9 +48,9 @@ def MIMGBaseOpcode : GenericEnum {
def MIMGBaseOpcodesTable : GenericTable {
let FilterClass = "MIMGBaseOpcode";
let CppTypeName = "MIMGBaseOpcodeInfo";
- let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler", "Gather4",
- "NumExtraArgs", "Gradients", "Coordinates", "LodOrClampOrMip",
- "HasD16"];
+ let Fields = ["BaseOpcode", "Store", "Atomic", "AtomicX2", "Sampler",
+ "Gather4", "NumExtraArgs", "Gradients", "G16", "Coordinates",
+ "LodOrClampOrMip", "HasD16"];
GenericEnum TypeOf_BaseOpcode = MIMGBaseOpcode;
let PrimaryKey = ["BaseOpcode"];
@@ -117,6 +118,22 @@ def MIMGMIPMappingTable : GenericTable {
let PrimaryKeyName = "getMIMGMIPMappingInfo";
}
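+// Maps a sample instruction taking 32-bit gradients to its *_G16 variant,
+// which takes 16-bit gradients (see the "G to G16 Optimization Mapping"
+// entries below). Looked up via getMIMGG16MappingInfo.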
+class MIMGG16Mapping<MIMGBaseOpcode g, MIMGBaseOpcode g16> {
+ MIMGBaseOpcode G = g;
+ MIMGBaseOpcode G16 = g16;
+}
+
+def MIMGG16MappingTable : GenericTable {
+ let FilterClass = "MIMGG16Mapping";
+ let CppTypeName = "MIMGG16MappingInfo";
+ let Fields = ["G", "G16"];
+ GenericEnum TypeOf_G = MIMGBaseOpcode;
+ GenericEnum TypeOf_G16 = MIMGBaseOpcode;
+
+ let PrimaryKey = ["G"];
+ let PrimaryKeyName = "getMIMGG16MappingInfo";
+}
+
class MIMG_Base <dag outs, string dns = "">
: InstSI <outs, (ins), "", []> {
@@ -132,7 +149,6 @@ class MIMG_Base <dag outs, string dns = "">
let DecoderNamespace = dns;
let isAsmParserOnly = !if(!eq(dns,""), 1, 0);
- let usesCustomInserter = 1;
}
class MIMG <dag outs, string dns = "">
@@ -238,9 +254,9 @@ class MIMG_NoSampler_gfx10<int op, string opcode,
: MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -251,9 +267,9 @@ class MIMG_NoSampler_nsa_gfx10<int op, string opcode,
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -331,9 +347,9 @@ class MIMG_Store_gfx10<int op, string opcode,
: MIMG_gfx10<op, (outs), dns> {
let InOperandList = !con((ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -345,9 +361,9 @@ class MIMG_Store_nsa_gfx10<int op, string opcode,
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe"
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -436,8 +452,8 @@ class MIMG_Atomic_gfx10<mimg op, string opcode,
let InOperandList = (ins DataRC:$vdata, AddrRC:$vaddr0, SReg_256:$srsrc,
DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe);
- let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
+ GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe);
+ let AsmString = opcode#" $vdst, $vaddr0, $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
}
class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
@@ -452,8 +468,8 @@ class MIMG_Atomic_nsa_gfx10<mimg op, string opcode,
AddrIns,
(ins SReg_256:$srsrc, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe));
- let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$tfe$lwe";
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe));
+ let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc$dmask$dim$unorm$dlc$glc$slc$r128$a16$tfe$lwe";
}
multiclass MIMG_Atomic_Addr_Helper_m <mimg op, string asm,
@@ -522,10 +538,10 @@ class MIMG_Sampler_gfx10<int op, string opcode,
: MIMG_gfx10<op, (outs DataRC:$vdata), dns> {
let InOperandList = !con((ins AddrRC:$vaddr0, SReg_256:$srsrc, SReg_128:$ssamp,
DMask:$dmask, Dim:$dim, UNorm:$unorm, DLC:$dlc,
- GLC:$glc, SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ GLC:$glc, SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, $vaddr0, $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$tfe$lwe"
+ #"$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -536,10 +552,10 @@ class MIMG_Sampler_nsa_gfx10<int op, string opcode,
let InOperandList = !con(AddrIns,
(ins SReg_256:$srsrc, SReg_128:$ssamp, DMask:$dmask,
Dim:$dim, UNorm:$unorm, DLC:$dlc, GLC:$glc,
- SLC:$slc, R128A16:$r128, TFE:$tfe, LWE:$lwe),
+ SLC:$slc, R128A16:$r128, GFX10A16:$a16, TFE:$tfe, LWE:$lwe),
!if(BaseOpcode.HasD16, (ins D16:$d16), (ins)));
let AsmString = opcode#" $vdata, "#AddrAsm#", $srsrc, $ssamp$dmask$dim$unorm"
- #"$dlc$glc$slc$r128$tfe$lwe"
+ #"$dlc$glc$slc$r128$a16$tfe$lwe"
#!if(BaseOpcode.HasD16, "$d16", "");
}
@@ -646,10 +662,11 @@ class MIMG_Sampler_BaseOpcode<AMDGPUSampleVariant sample>
}
multiclass MIMG_Sampler <bits<8> op, AMDGPUSampleVariant sample, bit wqm = 0,
- bit isGetLod = 0,
- string asm = "image_sample"#sample.LowerCaseMod> {
+ bit isG16 = 0, bit isGetLod = 0,
+ string asm = "image_sample"#sample.LowerCaseMod#!if(isG16, "_g16", "")> {
def "" : MIMG_Sampler_BaseOpcode<sample> {
let HasD16 = !if(isGetLod, 0, 1);
+ let G16 = isG16;
}
let BaseOpcode = !cast<MIMGBaseOpcode>(NAME), WQM = wqm,
@@ -726,76 +743,95 @@ defm IMAGE_ATOMIC_DEC : MIMG_Atomic <mimg<0x1c>, "image_atomic_dec">;
//def IMAGE_ATOMIC_FMIN : MIMG_NoPattern_ <"image_atomic_fmin", 0x0000001e>; -- not on VI
//def IMAGE_ATOMIC_FMAX : MIMG_NoPattern_ <"image_atomic_fmax", 0x0000001f>; -- not on VI
//} // End let FPAtomic = 1
-defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
-defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
-defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
-defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
-defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
-defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
-defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
-defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
-defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
-defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
-defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
-defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
-defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
-defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
-defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
-defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
-defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
-defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
-defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
-defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
-defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
-defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
-defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
-defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
-defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
-defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
-defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
-defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
-defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
-defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
-defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
-defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
-defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
-defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
-defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
-defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
-defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
-defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
-defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
-defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
-defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
-defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
-defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
-defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
-defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
-defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
-defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
-defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
-defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
-defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
-defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
-defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
-defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
-defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
-defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
-
-defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 1, "image_get_lod">;
-
-defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
-defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
-defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
-defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
-defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
-defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
-defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
-defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_SAMPLE : MIMG_Sampler_WQM <0x00000020, AMDGPUSample>;
+defm IMAGE_SAMPLE_CL : MIMG_Sampler_WQM <0x00000021, AMDGPUSample_cl>;
+defm IMAGE_SAMPLE_D : MIMG_Sampler <0x00000022, AMDGPUSample_d>;
+defm IMAGE_SAMPLE_D_CL : MIMG_Sampler <0x00000023, AMDGPUSample_d_cl>;
+defm IMAGE_SAMPLE_D_G16 : MIMG_Sampler <0x000000a2, AMDGPUSample_d, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_G16 : MIMG_Sampler <0x000000a3, AMDGPUSample_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_L : MIMG_Sampler <0x00000024, AMDGPUSample_l>;
+defm IMAGE_SAMPLE_B : MIMG_Sampler_WQM <0x00000025, AMDGPUSample_b>;
+defm IMAGE_SAMPLE_B_CL : MIMG_Sampler_WQM <0x00000026, AMDGPUSample_b_cl>;
+defm IMAGE_SAMPLE_LZ : MIMG_Sampler <0x00000027, AMDGPUSample_lz>;
+defm IMAGE_SAMPLE_C : MIMG_Sampler_WQM <0x00000028, AMDGPUSample_c>;
+defm IMAGE_SAMPLE_C_CL : MIMG_Sampler_WQM <0x00000029, AMDGPUSample_c_cl>;
+defm IMAGE_SAMPLE_C_D : MIMG_Sampler <0x0000002a, AMDGPUSample_c_d>;
+defm IMAGE_SAMPLE_C_D_CL : MIMG_Sampler <0x0000002b, AMDGPUSample_c_d_cl>;
+defm IMAGE_SAMPLE_C_D_G16 : MIMG_Sampler <0x000000aa, AMDGPUSample_c_d, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_G16 : MIMG_Sampler <0x000000ab, AMDGPUSample_c_d_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_L : MIMG_Sampler <0x0000002c, AMDGPUSample_c_l>;
+defm IMAGE_SAMPLE_C_B : MIMG_Sampler_WQM <0x0000002d, AMDGPUSample_c_b>;
+defm IMAGE_SAMPLE_C_B_CL : MIMG_Sampler_WQM <0x0000002e, AMDGPUSample_c_b_cl>;
+defm IMAGE_SAMPLE_C_LZ : MIMG_Sampler <0x0000002f, AMDGPUSample_c_lz>;
+defm IMAGE_SAMPLE_O : MIMG_Sampler_WQM <0x00000030, AMDGPUSample_o>;
+defm IMAGE_SAMPLE_CL_O : MIMG_Sampler_WQM <0x00000031, AMDGPUSample_cl_o>;
+defm IMAGE_SAMPLE_D_O : MIMG_Sampler <0x00000032, AMDGPUSample_d_o>;
+defm IMAGE_SAMPLE_D_CL_O : MIMG_Sampler <0x00000033, AMDGPUSample_d_cl_o>;
+defm IMAGE_SAMPLE_D_O_G16 : MIMG_Sampler <0x000000b2, AMDGPUSample_d_o, 0, 1>;
+defm IMAGE_SAMPLE_D_CL_O_G16 : MIMG_Sampler <0x000000b3, AMDGPUSample_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_L_O : MIMG_Sampler <0x00000034, AMDGPUSample_l_o>;
+defm IMAGE_SAMPLE_B_O : MIMG_Sampler_WQM <0x00000035, AMDGPUSample_b_o>;
+defm IMAGE_SAMPLE_B_CL_O : MIMG_Sampler_WQM <0x00000036, AMDGPUSample_b_cl_o>;
+defm IMAGE_SAMPLE_LZ_O : MIMG_Sampler <0x00000037, AMDGPUSample_lz_o>;
+defm IMAGE_SAMPLE_C_O : MIMG_Sampler_WQM <0x00000038, AMDGPUSample_c_o>;
+defm IMAGE_SAMPLE_C_CL_O : MIMG_Sampler_WQM <0x00000039, AMDGPUSample_c_cl_o>;
+defm IMAGE_SAMPLE_C_D_O : MIMG_Sampler <0x0000003a, AMDGPUSample_c_d_o>;
+defm IMAGE_SAMPLE_C_D_CL_O : MIMG_Sampler <0x0000003b, AMDGPUSample_c_d_cl_o>;
+defm IMAGE_SAMPLE_C_D_O_G16 : MIMG_Sampler <0x000000ba, AMDGPUSample_c_d_o, 0, 1>;
+defm IMAGE_SAMPLE_C_D_CL_O_G16 : MIMG_Sampler <0x000000bb, AMDGPUSample_c_d_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_L_O : MIMG_Sampler <0x0000003c, AMDGPUSample_c_l_o>;
+defm IMAGE_SAMPLE_C_B_CL_O : MIMG_Sampler_WQM <0x0000003e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_SAMPLE_C_B_O : MIMG_Sampler_WQM <0x0000003d, AMDGPUSample_c_b_o>;
+defm IMAGE_SAMPLE_C_LZ_O : MIMG_Sampler <0x0000003f, AMDGPUSample_c_lz_o>;
+defm IMAGE_GATHER4 : MIMG_Gather_WQM <0x00000040, AMDGPUSample>;
+defm IMAGE_GATHER4_CL : MIMG_Gather_WQM <0x00000041, AMDGPUSample_cl>;
+defm IMAGE_GATHER4_L : MIMG_Gather <0x00000044, AMDGPUSample_l>;
+defm IMAGE_GATHER4_B : MIMG_Gather_WQM <0x00000045, AMDGPUSample_b>;
+defm IMAGE_GATHER4_B_CL : MIMG_Gather_WQM <0x00000046, AMDGPUSample_b_cl>;
+defm IMAGE_GATHER4_LZ : MIMG_Gather <0x00000047, AMDGPUSample_lz>;
+defm IMAGE_GATHER4_C : MIMG_Gather_WQM <0x00000048, AMDGPUSample_c>;
+defm IMAGE_GATHER4_C_CL : MIMG_Gather_WQM <0x00000049, AMDGPUSample_c_cl>;
+defm IMAGE_GATHER4_C_L : MIMG_Gather <0x0000004c, AMDGPUSample_c_l>;
+defm IMAGE_GATHER4_C_B : MIMG_Gather_WQM <0x0000004d, AMDGPUSample_c_b>;
+defm IMAGE_GATHER4_C_B_CL : MIMG_Gather_WQM <0x0000004e, AMDGPUSample_c_b_cl>;
+defm IMAGE_GATHER4_C_LZ : MIMG_Gather <0x0000004f, AMDGPUSample_c_lz>;
+defm IMAGE_GATHER4_O : MIMG_Gather_WQM <0x00000050, AMDGPUSample_o>;
+defm IMAGE_GATHER4_CL_O : MIMG_Gather_WQM <0x00000051, AMDGPUSample_cl_o>;
+defm IMAGE_GATHER4_L_O : MIMG_Gather <0x00000054, AMDGPUSample_l_o>;
+defm IMAGE_GATHER4_B_O : MIMG_Gather_WQM <0x00000055, AMDGPUSample_b_o>;
+defm IMAGE_GATHER4_B_CL_O : MIMG_Gather <0x00000056, AMDGPUSample_b_cl_o>;
+defm IMAGE_GATHER4_LZ_O : MIMG_Gather <0x00000057, AMDGPUSample_lz_o>;
+defm IMAGE_GATHER4_C_O : MIMG_Gather_WQM <0x00000058, AMDGPUSample_c_o>;
+defm IMAGE_GATHER4_C_CL_O : MIMG_Gather_WQM <0x00000059, AMDGPUSample_c_cl_o>;
+defm IMAGE_GATHER4_C_L_O : MIMG_Gather <0x0000005c, AMDGPUSample_c_l_o>;
+defm IMAGE_GATHER4_C_B_O : MIMG_Gather_WQM <0x0000005d, AMDGPUSample_c_b_o>;
+defm IMAGE_GATHER4_C_B_CL_O : MIMG_Gather_WQM <0x0000005e, AMDGPUSample_c_b_cl_o>;
+defm IMAGE_GATHER4_C_LZ_O : MIMG_Gather <0x0000005f, AMDGPUSample_c_lz_o>;
+
+defm IMAGE_GET_LOD : MIMG_Sampler <0x00000060, AMDGPUSample, 1, 0, 1, "image_get_lod">;
+
+defm IMAGE_SAMPLE_CD : MIMG_Sampler <0x00000068, AMDGPUSample_cd>;
+defm IMAGE_SAMPLE_CD_CL : MIMG_Sampler <0x00000069, AMDGPUSample_cd_cl>;
+defm IMAGE_SAMPLE_C_CD : MIMG_Sampler <0x0000006a, AMDGPUSample_c_cd>;
+defm IMAGE_SAMPLE_C_CD_CL : MIMG_Sampler <0x0000006b, AMDGPUSample_c_cd_cl>;
+defm IMAGE_SAMPLE_CD_O : MIMG_Sampler <0x0000006c, AMDGPUSample_cd_o>;
+defm IMAGE_SAMPLE_CD_CL_O : MIMG_Sampler <0x0000006d, AMDGPUSample_cd_cl_o>;
+defm IMAGE_SAMPLE_C_CD_O : MIMG_Sampler <0x0000006e, AMDGPUSample_c_cd_o>;
+defm IMAGE_SAMPLE_C_CD_CL_O : MIMG_Sampler <0x0000006f, AMDGPUSample_c_cd_cl_o>;
+defm IMAGE_SAMPLE_CD_G16 : MIMG_Sampler <0x000000e8, AMDGPUSample_cd, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_G16 : MIMG_Sampler <0x000000e9, AMDGPUSample_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_G16 : MIMG_Sampler <0x000000ea, AMDGPUSample_c_cd, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_G16 : MIMG_Sampler <0x000000eb, AMDGPUSample_c_cd_cl, 0, 1>;
+defm IMAGE_SAMPLE_CD_O_G16 : MIMG_Sampler <0x000000ec, AMDGPUSample_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_CD_CL_O_G16 : MIMG_Sampler <0x000000ed, AMDGPUSample_cd_cl_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_O_G16 : MIMG_Sampler <0x000000ee, AMDGPUSample_c_cd_o, 0, 1>;
+defm IMAGE_SAMPLE_C_CD_CL_O_G16 : MIMG_Sampler <0x000000ef, AMDGPUSample_c_cd_cl_o, 0, 1>;
//def IMAGE_RSRC256 : MIMG_NoPattern_RSRC256 <"image_rsrc256", 0x0000007e>;
//def IMAGE_SAMPLER : MIMG_NoPattern_ <"image_sampler", 0x0000007f>;
+let SubtargetPredicate = HasGFX10_BEncoding in
+defm IMAGE_MSAA_LOAD : MIMG_NoSampler <0x00000080, "image_msaa_load", 1>;
+
/********** ========================================= **********/
/********** Table of dimension-aware image intrinsics **********/
/********** ========================================= **********/
@@ -817,6 +853,11 @@ def ImageDimIntrinsicTable : GenericTable {
let PrimaryKeyEarlyOut = 1;
}
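+// Secondary index: look up a dimension-aware image intrinsic by its
+// (BaseOpcode, Dim) pair.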
+def getImageDimInstrinsicByBaseOpcode : SearchIndex {
+ let Table = ImageDimIntrinsicTable;
+ let Key = ["BaseOpcode", "Dim"];
+}
+
foreach intr = !listconcat(AMDGPUImageDimIntrinsics,
AMDGPUImageDimAtomicIntrinsics) in {
def : ImageDimIntrinsicInfo<intr>;
@@ -835,3 +876,21 @@ def : MIMGLZMapping<IMAGE_GATHER4_C_L_O, IMAGE_GATHER4_C_LZ_O>;
// MIP to NONMIP Optimization Mapping
def : MIMGMIPMapping<IMAGE_LOAD_MIP, IMAGE_LOAD>;
def : MIMGMIPMapping<IMAGE_STORE_MIP, IMAGE_STORE>;
+
+// G to G16 Optimization Mapping
+def : MIMGG16Mapping<IMAGE_SAMPLE_D, IMAGE_SAMPLE_D_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL, IMAGE_SAMPLE_D_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D, IMAGE_SAMPLE_C_D_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL, IMAGE_SAMPLE_C_D_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_O, IMAGE_SAMPLE_D_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_D_CL_O, IMAGE_SAMPLE_D_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_O, IMAGE_SAMPLE_C_D_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_D_CL_O, IMAGE_SAMPLE_C_D_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD, IMAGE_SAMPLE_CD_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL, IMAGE_SAMPLE_CD_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD, IMAGE_SAMPLE_C_CD_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL, IMAGE_SAMPLE_C_CD_CL_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_O, IMAGE_SAMPLE_CD_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_CD_CL_O, IMAGE_SAMPLE_CD_CL_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_O, IMAGE_SAMPLE_C_CD_O_G16>;
+def : MIMGG16Mapping<IMAGE_SAMPLE_C_CD_CL_O, IMAGE_SAMPLE_C_CD_CL_O_G16>;
diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
index ed23c8ea814b..d363baa15507 100644
--- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
+++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.cpp
@@ -88,15 +88,15 @@ void R600AsmPrinter::EmitProgramInfoR600(const MachineFunction &MF) {
}
}
- OutStreamer->EmitIntValue(RsrcReg, 4);
- OutStreamer->EmitIntValue(S_NUM_GPRS(MaxGPR + 1) |
+ OutStreamer->emitInt32(RsrcReg);
+ OutStreamer->emitIntValue(S_NUM_GPRS(MaxGPR + 1) |
S_STACK_SIZE(MFI->CFStackSize), 4);
- OutStreamer->EmitIntValue(R_02880C_DB_SHADER_CONTROL, 4);
- OutStreamer->EmitIntValue(S_02880C_KILL_ENABLE(killPixel), 4);
+ OutStreamer->emitInt32(R_02880C_DB_SHADER_CONTROL);
+ OutStreamer->emitInt32(S_02880C_KILL_ENABLE(killPixel));
if (AMDGPU::isCompute(MF.getFunction().getCallingConv())) {
- OutStreamer->EmitIntValue(R_0288E8_SQ_LDS_ALLOC, 4);
- OutStreamer->EmitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
+ OutStreamer->emitInt32(R_0288E8_SQ_LDS_ALLOC);
+ OutStreamer->emitIntValue(alignTo(MFI->getLDSSize(), 4) >> 2, 4);
}
}
@@ -115,7 +115,7 @@ bool R600AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
EmitProgramInfoR600(MF);
- EmitFunctionBody();
+ emitFunctionBody();
if (isVerbose()) {
MCSectionELF *CommentSection =
diff --git a/llvm/lib/Target/AMDGPU/R600AsmPrinter.h b/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
index 0da9526d716e..552d01f81b66 100644
--- a/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
+++ b/llvm/lib/Target/AMDGPU/R600AsmPrinter.h
@@ -26,7 +26,7 @@ public:
StringRef getPassName() const override;
bool runOnMachineFunction(MachineFunction &MF) override;
/// Implemented in AMDGPUMCInstLower.cpp
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
/// Lower the specified LLVM Constant to an MCExpr.
/// AsmPrinter::lowerConstant does not know how to lower addrspacecast,
/// so such constants are lowered by this function instead.
diff --git a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
index e4160ac11c86..8124df68f688 100644
--- a/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ControlFlowFinalizer.cpp
@@ -159,8 +159,7 @@ unsigned CFStack::getSubEntrySize(CFStack::StackItem Item) {
}
void CFStack::updateMaxStackSize() {
- unsigned CurrentStackSize =
- CurrentEntries + (alignTo(CurrentSubEntries, 4) / 4);
+ unsigned CurrentStackSize = CurrentEntries + divideCeil(CurrentSubEntries, 4);
MaxStackSize = std::max(CurrentStackSize, MaxStackSize);
}
@@ -308,7 +307,7 @@ private:
DstMI = Reg;
else
DstMI = TRI->getMatchingSuperReg(Reg,
- AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
&R600::R600_Reg128RegClass);
}
if (MO.isUse()) {
@@ -317,7 +316,7 @@ private:
SrcMI = Reg;
else
SrcMI = TRI->getMatchingSuperReg(Reg,
- AMDGPURegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
+ R600RegisterInfo::getSubRegFromChannel(TRI->getHWRegChan(Reg)),
&R600::R600_Reg128RegClass);
}
}
diff --git a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
index fd75c41040e1..5f682d86d26e 100644
--- a/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ExpandSpecialInstrs.cpp
@@ -219,13 +219,13 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
}
}
if (IsReduction) {
- unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(Chan);
Src0 = TRI.getSubReg(Src0, SubRegIndex);
Src1 = TRI.getSubReg(Src1, SubRegIndex);
} else if (IsCube) {
static const int CubeSrcSwz[] = {2, 2, 0, 1};
- unsigned SubRegIndex0 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
- unsigned SubRegIndex1 = AMDGPURegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
+ unsigned SubRegIndex0 = R600RegisterInfo::getSubRegFromChannel(CubeSrcSwz[Chan]);
+ unsigned SubRegIndex1 = R600RegisterInfo::getSubRegFromChannel(CubeSrcSwz[3 - Chan]);
Src1 = TRI.getSubReg(Src0, SubRegIndex1);
Src0 = TRI.getSubReg(Src0, SubRegIndex0);
}
@@ -234,7 +234,7 @@ bool R600ExpandSpecialInstrsPass::runOnMachineFunction(MachineFunction &MF) {
bool Mask = false;
bool NotLast = true;
if (IsCube) {
- unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(Chan);
+ unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(Chan);
DstReg = TRI.getSubReg(DstReg, SubRegIndex);
} else {
// Mask the write if the original instruction does not write to
diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
index d9aa9ebe878d..c568a4aa61c3 100644
--- a/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.cpp
@@ -18,9 +18,8 @@ using namespace llvm;
R600FrameLowering::~R600FrameLowering() = default;
/// \returns The number of registers allocated for \p FI.
-int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI,
- unsigned &FrameReg) const {
+int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const R600RegisterInfo *RI
= MF.getSubtarget<R600Subtarget>().getRegisterInfo();
@@ -35,15 +34,15 @@ int R600FrameLowering::getFrameIndexReference(const MachineFunction &MF,
int UpperBound = FI == -1 ? MFI.getNumObjects() : FI;
for (int i = MFI.getObjectIndexBegin(); i < UpperBound; ++i) {
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(i));
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(i));
OffsetBytes += MFI.getObjectSize(i);
// Each register holds 4 bytes, so we must always align the offset to at
// least 4 bytes, so that 2 frame objects won't share the same register.
- OffsetBytes = alignTo(OffsetBytes, 4);
+ OffsetBytes = alignTo(OffsetBytes, Align(4));
}
if (FI != -1)
- OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlignment(FI));
+ OffsetBytes = alignTo(OffsetBytes, MFI.getObjectAlign(FI));
return OffsetBytes / (getStackWidth(MF) * 4);
}
diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/llvm/lib/Target/AMDGPU/R600FrameLowering.h
index 283e4d1935ea..b877ecd29829 100644
--- a/llvm/lib/Target/AMDGPU/R600FrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.h
@@ -16,7 +16,7 @@ namespace llvm {
class R600FrameLowering : public AMDGPUFrameLowering {
public:
R600FrameLowering(StackDirection D, Align StackAl, int LAO,
- Align TransAl = Align::None())
+ Align TransAl = Align(1))
: AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~R600FrameLowering() override;
@@ -25,7 +25,7 @@ public:
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override {}
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override {
return false;
diff --git a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
index 1b1f5f9a404a..dc2e73e1f94e 100644
--- a/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/R600ISelLowering.cpp
@@ -615,21 +615,27 @@ SDValue R600TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const
return LowerImplicitParameter(DAG, VT, DL, 8);
case Intrinsic::r600_read_tgid_x:
+ case Intrinsic::amdgcn_workgroup_id_x:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T1_X, VT);
case Intrinsic::r600_read_tgid_y:
+ case Intrinsic::amdgcn_workgroup_id_y:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T1_Y, VT);
case Intrinsic::r600_read_tgid_z:
+ case Intrinsic::amdgcn_workgroup_id_z:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T1_Z, VT);
case Intrinsic::r600_read_tidig_x:
+ case Intrinsic::amdgcn_workitem_id_x:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T0_X, VT);
case Intrinsic::r600_read_tidig_y:
+ case Intrinsic::amdgcn_workitem_id_y:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T0_Y, VT);
case Intrinsic::r600_read_tidig_z:
+ case Intrinsic::amdgcn_workitem_id_z:
return CreateLiveInRegisterRaw(DAG, &R600::R600_TReg32RegClass,
R600::T0_Z, VT);
@@ -699,9 +705,8 @@ SDValue R600TargetLowering::vectorToVerticalVector(SelectionDAG &DAG,
SmallVector<SDValue, 8> Args;
for (unsigned i = 0, e = VecVT.getVectorNumElements(); i != e; ++i) {
- Args.push_back(DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
- DAG.getConstant(i, DL, getVectorIdxTy(DAG.getDataLayout()))));
+ Args.push_back(DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, EltVT, Vector,
+ DAG.getVectorIdxConstant(i, DL)));
}
return DAG.getNode(AMDGPUISD::BUILD_VERTICAL_VECTOR, DL, VecVT, Args);
@@ -1260,10 +1265,11 @@ SDValue R600TargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
return scalarizeVectorStore(StoreNode, DAG);
}
- unsigned Align = StoreNode->getAlignment();
- if (Align < MemVT.getStoreSize() &&
- !allowsMisalignedMemoryAccesses(
- MemVT, AS, Align, StoreNode->getMemOperand()->getFlags(), nullptr)) {
+ Align Alignment = StoreNode->getAlign();
+ if (Alignment < MemVT.getStoreSize() &&
+ !allowsMisalignedMemoryAccesses(MemVT, AS, Alignment.value(),
+ StoreNode->getMemOperand()->getFlags(),
+ nullptr)) {
return expandUnalignedStore(StoreNode, DAG);
}
@@ -1543,7 +1549,7 @@ SDValue R600TargetLowering::lowerFrameIndex(SDValue Op,
FrameIndexSDNode *FIN = cast<FrameIndexSDNode>(Op);
unsigned FrameIndex = FIN->getIndex();
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
unsigned Offset =
TFL->getFrameIndexReference(MF, FrameIndex, IgnoredFrameReg);
return DAG.getConstant(Offset * 4 * TFL->getStackWidth(MF), SDLoc(Op),
diff --git a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
index 346296c77377..088cf16d8ed2 100644
--- a/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600InstrInfo.cpp
@@ -77,7 +77,7 @@ void R600InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
if (VectorComponents > 0) {
for (unsigned I = 0; I < VectorComponents; I++) {
- unsigned SubRegIndex = AMDGPURegisterInfo::getSubRegFromChannel(I);
+ unsigned SubRegIndex = R600RegisterInfo::getSubRegFromChannel(I);
buildDefaultInstruction(MBB, MI, R600::MOV,
RI.getSubReg(DestReg, SubRegIndex),
RI.getSubReg(SrcReg, SubRegIndex))
@@ -541,7 +541,7 @@ R600InstrInfo::fitsReadPortLimitations(const std::vector<MachineInstr *> &IG,
std::vector<std::vector<std::pair<int, unsigned>>> IGSrcs;
ValidSwizzle.clear();
- unsigned ConstCount = 0;
+ unsigned ConstCount;
BankSwizzle TransBS = ALU_VEC_012_SCL_210;
for (unsigned i = 0, e = IG.size(); i < e; ++i) {
IGSrcs.push_back(ExtractSrcs(*IG[i], PV, ConstCount));
@@ -676,7 +676,7 @@ bool R600InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const {
- // Most of the following comes from the ARM implementation of AnalyzeBranch
+ // Most of the following comes from the ARM implementation of analyzeBranch
// If the block has no terminators, it just falls into the block after it.
MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
@@ -1224,7 +1224,7 @@ int R600InstrInfo::getIndirectIndexEnd(const MachineFunction &MF) const {
const R600Subtarget &ST = MF.getSubtarget<R600Subtarget>();
const R600FrameLowering *TFL = ST.getFrameLowering();
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
Offset = TFL->getFrameIndexReference(MF, -1, IgnoredFrameReg);
return getIndirectIndexBegin(MF) + Offset;
diff --git a/llvm/lib/Target/AMDGPU/R600Instructions.td b/llvm/lib/Target/AMDGPU/R600Instructions.td
index cbdf0de44f87..2cc21364c439 100644
--- a/llvm/lib/Target/AMDGPU/R600Instructions.td
+++ b/llvm/lib/Target/AMDGPU/R600Instructions.td
@@ -1006,7 +1006,7 @@ class MULADD_Common <bits<5> inst> : R600_3OP <
class MULADD_IEEE_Common <bits<5> inst> : R600_3OP <
inst, "MULADD_IEEE",
- [(set f32:$dst, (fmad f32:$src0, f32:$src1, f32:$src2))]
+ [(set f32:$dst, (any_fmad f32:$src0, f32:$src1, f32:$src2))]
>;
class FMA_Common <bits<5> inst> : R600_3OP <
@@ -1233,6 +1233,11 @@ def : R600Pat<
def : RcpPat<recip_ieee, f32>;
}
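+// Expand fsqrt as the reciprocal of the reciprocal square root:
+// sqrt(x) = rcp(rsq(x)).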
+class SqrtPat<Instruction RsqInst, Instruction RecipInst> : R600Pat <
+ (fsqrt f32:$src),
+ (RecipInst (RsqInst $src))
+>;
+
//===----------------------------------------------------------------------===//
// R600 / R700 Instructions
//===----------------------------------------------------------------------===//
@@ -1272,8 +1277,8 @@ let Predicates = [isR600] in {
defm DIV_r600 : DIV_Common<RECIP_IEEE_r600>;
def : POW_Common <LOG_IEEE_r600, EXP_IEEE_r600, MUL>;
- def : R600Pat<(fsqrt f32:$src), (MUL $src, (RECIPSQRT_CLAMPED_r600 $src))>;
def : RsqPat<RECIPSQRT_IEEE_r600, f32>;
+ def : SqrtPat<RECIPSQRT_IEEE_r600, RECIP_IEEE_r600>;
def R600_ExportSwz : ExportSwzInst {
let Word1{20-17} = 0; // BURST_COUNT
diff --git a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
index cec7f563f480..b0620663a230 100644
--- a/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
+++ b/llvm/lib/Target/AMDGPU/R600OptimizeVectorRegisters.cpp
@@ -56,9 +56,8 @@ using namespace llvm;
#define DEBUG_TYPE "vec-merger"
-static bool isImplicitlyDef(MachineRegisterInfo &MRI, unsigned Reg) {
- assert(MRI.isSSA());
- if (Register::isPhysicalRegister(Reg))
+static bool isImplicitlyDef(MachineRegisterInfo &MRI, Register Reg) {
+ if (Reg.isPhysical())
return false;
const MachineInstr *MI = MRI.getUniqueVRegDef(Reg);
return MI && MI->isImplicitDef();
@@ -69,8 +68,8 @@ namespace {
class RegSeqInfo {
public:
MachineInstr *Instr;
- DenseMap<unsigned, unsigned> RegToChan;
- std::vector<unsigned> UndefReg;
+ DenseMap<Register, unsigned> RegToChan;
+ std::vector<Register> UndefReg;
RegSeqInfo(MachineRegisterInfo &MRI, MachineInstr *MI) : Instr(MI) {
assert(MI->getOpcode() == R600::REG_SEQUENCE);
@@ -102,7 +101,7 @@ private:
InstructionSetMap PreviousRegSeqByUndefCount;
bool canSwizzle(const MachineInstr &MI) const;
- bool areAllUsesSwizzeable(unsigned Reg) const;
+ bool areAllUsesSwizzeable(Register Reg) const;
void SwizzleInput(MachineInstr &,
const std::vector<std::pair<unsigned, unsigned>> &RemapChan) const;
bool tryMergeVector(const RegSeqInfo *Untouched, RegSeqInfo *ToMerge,
@@ -130,6 +129,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
+
StringRef getPassName() const override {
return "R600 Vector Registers Merge Pass";
}
@@ -165,9 +169,9 @@ bool R600VectorRegMerger::tryMergeVector(const RegSeqInfo *Untouched,
RegSeqInfo *ToMerge, std::vector< std::pair<unsigned, unsigned>> &Remap)
const {
unsigned CurrentUndexIdx = 0;
- for (DenseMap<unsigned, unsigned>::iterator It = ToMerge->RegToChan.begin(),
+ for (DenseMap<Register, unsigned>::iterator It = ToMerge->RegToChan.begin(),
E = ToMerge->RegToChan.end(); It != E; ++It) {
- DenseMap<unsigned, unsigned>::const_iterator PosInUntouched =
+ DenseMap<Register, unsigned>::const_iterator PosInUntouched =
Untouched->RegToChan.find((*It).first);
if (PosInUntouched != Untouched->RegToChan.end()) {
Remap.push_back(std::pair<unsigned, unsigned>
@@ -203,9 +207,9 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
DebugLoc DL = Pos->getDebugLoc();
Register SrcVec = BaseRSI->Instr->getOperand(0).getReg();
- DenseMap<unsigned, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
- std::vector<unsigned> UpdatedUndef = BaseRSI->UndefReg;
- for (DenseMap<unsigned, unsigned>::iterator It = RSI->RegToChan.begin(),
+ DenseMap<Register, unsigned> UpdatedRegToChan = BaseRSI->RegToChan;
+ std::vector<Register> UpdatedUndef = BaseRSI->UndefReg;
+ for (DenseMap<Register, unsigned>::iterator It = RSI->RegToChan.begin(),
E = RSI->RegToChan.end(); It != E; ++It) {
Register DstReg = MRI->createVirtualRegister(&R600::R600_Reg128RegClass);
unsigned SubReg = (*It).first;
@@ -218,7 +222,7 @@ MachineInstr *R600VectorRegMerger::RebuildVector(
.addReg(SubReg)
.addImm(Chan);
UpdatedRegToChan[SubReg] = Chan;
- std::vector<unsigned>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
+ std::vector<Register>::iterator ChanPos = llvm::find(UpdatedUndef, Chan);
if (ChanPos != UpdatedUndef.end())
UpdatedUndef.erase(ChanPos);
assert(!is_contained(UpdatedUndef, Chan) &&
@@ -279,7 +283,7 @@ void R600VectorRegMerger::SwizzleInput(MachineInstr &MI,
}
}
-bool R600VectorRegMerger::areAllUsesSwizzeable(unsigned Reg) const {
+bool R600VectorRegMerger::areAllUsesSwizzeable(Register Reg) const {
for (MachineRegisterInfo::use_instr_iterator It = MRI->use_instr_begin(Reg),
E = MRI->use_instr_end(); It != E; ++It) {
if (!canSwizzle(*It))
@@ -322,7 +326,7 @@ bool R600VectorRegMerger::tryMergeUsingFreeSlot(RegSeqInfo &RSI,
}
void R600VectorRegMerger::trackRSI(const RegSeqInfo &RSI) {
- for (DenseMap<unsigned, unsigned>::const_iterator
+ for (DenseMap<Register, unsigned>::const_iterator
It = RSI.RegToChan.begin(), E = RSI.RegToChan.end(); It != E; ++It) {
PreviousRegSeqByReg[(*It).first].push_back(RSI.Instr);
}
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
index ef12c1d24594..78ef71cdf8e3 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.cpp
@@ -20,14 +20,21 @@
using namespace llvm;
-R600RegisterInfo::R600RegisterInfo() : R600GenRegisterInfo(0) {
- RCW.RegWeight = 0;
- RCW.WeightLimit = 0;
-}
-
#define GET_REGINFO_TARGET_DESC
#include "R600GenRegisterInfo.inc"
+unsigned R600RegisterInfo::getSubRegFromChannel(unsigned Channel) {
+ static const uint16_t SubRegFromChannelTable[] = {
+ R600::sub0, R600::sub1, R600::sub2, R600::sub3,
+ R600::sub4, R600::sub5, R600::sub6, R600::sub7,
+ R600::sub8, R600::sub9, R600::sub10, R600::sub11,
+ R600::sub12, R600::sub13, R600::sub14, R600::sub15
+ };
+
+ assert(Channel < array_lengthof(SubRegFromChannelTable));
+ return SubRegFromChannelTable[Channel];
+}
+
BitVector R600RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
@@ -87,11 +94,6 @@ const TargetRegisterClass * R600RegisterInfo::getCFGStructurizerRegClass(
}
}
-const RegClassWeight &R600RegisterInfo::getRegClassWeight(
- const TargetRegisterClass *RC) const {
- return RCW;
-}
-
bool R600RegisterInfo::isPhysRegLiveAcrossClauses(unsigned Reg) const {
assert(!Register::isVirtualRegister(Reg));
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.h b/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
index 9378b70ca580..06981c4cf9c5 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.h
@@ -20,9 +20,11 @@
namespace llvm {
struct R600RegisterInfo final : public R600GenRegisterInfo {
- RegClassWeight RCW;
+ R600RegisterInfo() : R600GenRegisterInfo(0) {}
- R600RegisterInfo();
+ /// \returns the sub reg enum value for the given \p Channel
+ /// (e.g. getSubRegFromChannel(0) -> R600::sub0)
+ static unsigned getSubRegFromChannel(unsigned Channel);
BitVector getReservedRegs(const MachineFunction &MF) const override;
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
@@ -37,8 +39,9 @@ struct R600RegisterInfo final : public R600GenRegisterInfo {
/// CFGStructurizer
const TargetRegisterClass *getCFGStructurizerRegClass(MVT VT) const;
- const RegClassWeight &
- getRegClassWeight(const TargetRegisterClass *RC) const override;
+ bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
+ return false;
+ }
// \returns true if \p Reg can be defined in one ALU clause and used in
// another.
diff --git a/llvm/lib/Target/AMDGPU/R600RegisterInfo.td b/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
index 02164b74a01b..fdff7541edec 100644
--- a/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/R600RegisterInfo.td
@@ -150,13 +150,16 @@ def AR_X : R600Reg<"AR.x", 0>;
def INDIRECT_BASE_ADDR : R600Reg <"INDIRECT_BASE_ADDR", 0>;
def R600_ArrayBase : RegisterClass <"AMDGPU", [f32, i32], 32,
- (add (sequence "ArrayBase%u", 448, 480))>;
+ (add (sequence "ArrayBase%u", 448, 480))> {
+ let Weight = 0;
+}
// special registers for ALU src operands
// const buffer reference, SRCx_SEL contains index
def ALU_CONST : R600Reg<"CBuf", 0>;
// interpolation param reference, SRCx_SEL contains index
def ALU_PARAM : R600Reg<"Param", 0>;
+let Weight = 0 in {
let isAllocatable = 0 in {
def R600_Addr : RegisterClass <"AMDGPU", [i32], 32, (add (sequence "Addr%u_X", 0, 127))>;
@@ -251,3 +254,4 @@ def R600_Reg64 : RegisterClass<"AMDGPU", [v2f32, v2i32, i64, f64], 64,
def R600_Reg64Vertical : RegisterClass<"AMDGPU", [v2f32, v2i32], 64,
(add V01_X, V01_Y, V01_Z, V01_W,
V23_X, V23_Y, V23_Z, V23_W)>;
+} // End let Weight = 0
diff --git a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
index ee011286b8ff..90e48c63b5dc 100644
--- a/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAddIMGInit.cpp
@@ -111,10 +111,6 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
unsigned ActiveLanes =
TII->isGather4(Opcode) ? 4 : countPopulation(dmask);
- // Subreg indices are counted from 1
- // When D16 then we want next whole VGPR after write data.
- static_assert(AMDGPU::sub0 == 1 && AMDGPU::sub4 == 5, "Subreg indices different from expected");
-
bool Packed = !ST.hasUnpackedD16VMem();
unsigned InitIdx =
@@ -137,7 +133,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
// all the result registers to 0, otherwise just the error indication
// register (VGPRn+1)
unsigned SizeLeft = ST.usePRTStrictNull() ? InitIdx : 1;
- unsigned CurrIdx = ST.usePRTStrictNull() ? 1 : InitIdx;
+ unsigned CurrIdx = ST.usePRTStrictNull() ? 0 : (InitIdx - 1);
if (DstSize == 1) {
// In this case we can just initialize the result directly
@@ -158,7 +154,7 @@ bool SIAddIMGInit::runOnMachineFunction(MachineFunction &MF) {
BuildMI(MBB, I, DL, TII->get(TargetOpcode::INSERT_SUBREG), NewDst)
.addReg(PrevDst)
.addReg(SubReg)
- .addImm(CurrIdx);
+ .addImm(SIRegisterInfo::getSubRegFromChannel(CurrIdx));
PrevDst = NewDst;
}
diff --git a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
index 27320472cacb..3c41bf1fef5e 100644
--- a/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SIAnnotateControlFlow.cpp
@@ -153,7 +153,7 @@ void SIAnnotateControlFlow::initialize(Module &M, const GCNSubtarget &ST) {
Else = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_else,
{ IntMask, IntMask });
IfBreak = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_if_break,
- { IntMask, IntMask });
+ { IntMask });
Loop = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_loop, { IntMask });
EndCf = Intrinsic::getDeclaration(&M, Intrinsic::amdgcn_end_cf, { IntMask });
}
diff --git a/llvm/lib/Target/AMDGPU/SIDefines.h b/llvm/lib/Target/AMDGPU/SIDefines.h
index 23ef56afc39c..4f7d255eb450 100644
--- a/llvm/lib/Target/AMDGPU/SIDefines.h
+++ b/llvm/lib/Target/AMDGPU/SIDefines.h
@@ -333,7 +333,9 @@ enum Id { // HwRegCode, (6) [5:0]
ID_FLAT_SCR_HI = 21,
ID_XNACK_MASK = 22,
ID_POPS_PACKER = 25,
- ID_SYMBOLIC_LAST_ = 26,
+ ID_SHADER_CYCLES = 29,
+ ID_SYMBOLIC_FIRST_GFX1030_ = ID_SHADER_CYCLES,
+ ID_SYMBOLIC_LAST_ = 30,
ID_SHIFT_ = 0,
ID_WIDTH_ = 6,
ID_MASK_ = (((1 << ID_WIDTH_) - 1) << ID_SHIFT_)
@@ -366,6 +368,28 @@ enum Width : unsigned {
WIDTH_DEFAULT_ = WIDTH_M1_DEFAULT_ + 1,
};
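+// Bit masks for the fields of the hardware MODE register.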
+enum ModeRegisterMasks : uint32_t {
+ FP_ROUND_MASK = 0xf << 0, // Bits 0..3
+ FP_DENORM_MASK = 0xf << 4, // Bits 4..7
+ DX10_CLAMP_MASK = 1 << 8,
+ IEEE_MODE_MASK = 1 << 9,
+ LOD_CLAMP_MASK = 1 << 10,
+ DEBUG_MASK = 1 << 11,
+
+ // EXCP_EN fields.
+ EXCP_EN_INVALID_MASK = 1 << 12,
+ EXCP_EN_INPUT_DENORMAL_MASK = 1 << 13,
+ EXCP_EN_FLOAT_DIV0_MASK = 1 << 14,
+ EXCP_EN_OVERFLOW_MASK = 1 << 15,
+ EXCP_EN_UNDERFLOW_MASK = 1 << 16,
+ EXCP_EN_INEXACT_MASK = 1 << 17,
+ EXCP_EN_INT_DIV0_MASK = 1 << 18,
+
+ GPR_IDX_EN_MASK = 1 << 27,
+ VSKIP_MASK = 1 << 28,
+ CSP_MASK = 0x7u << 29 // Bits 29..31
+};
+
} // namespace Hwreg
namespace Swizzle { // Encoding of swizzle macro used in ds_swizzle_b32.
diff --git a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
index 914d2a5ef148..ef64c5674bd1 100644
--- a/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixSGPRCopies.cpp
@@ -587,6 +587,11 @@ static bool hoistAndMergeSGPRInits(unsigned Reg,
}
bool SIFixSGPRCopies::runOnMachineFunction(MachineFunction &MF) {
+ // Only need to run this in the SelectionDAG path.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
MRI = &MF.getRegInfo();
TRI = ST.getRegisterInfo();
@@ -761,6 +766,7 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
bool AllAGPRUses = true;
SetVector<const MachineInstr *> worklist;
SmallSet<const MachineInstr *, 4> Visited;
+ SetVector<MachineInstr *> PHIOperands;
worklist.insert(&MI);
Visited.insert(&MI);
while (!worklist.empty()) {
@@ -805,6 +811,11 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
if (AllAGPRUses && numVGPRUses && !TRI->hasAGPRs(RC0)) {
LLVM_DEBUG(dbgs() << "Moving PHI to AGPR: " << MI);
MRI->setRegClass(PHIRes, TRI->getEquivalentAGPRClass(RC0));
+ for (unsigned I = 1, N = MI.getNumOperands(); I != N; I += 2) {
+ MachineInstr *DefMI = MRI->getVRegDef(MI.getOperand(I).getReg());
+ if (DefMI && DefMI->isPHI())
+ PHIOperands.insert(DefMI);
+ }
}
bool hasVGPRInput = false;
@@ -824,8 +835,22 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
}
else if (Def->isCopy() &&
TRI->isVectorRegister(*MRI, Def->getOperand(1).getReg())) {
- hasVGPRInput = true;
- break;
+ Register SrcReg = Def->getOperand(1).getReg();
+ MachineInstr *SrcDef = MRI->getVRegDef(SrcReg);
+ unsigned SMovOp;
+ int64_t Imm;
+ if (!isSafeToFoldImmIntoCopy(Def, SrcDef, TII, SMovOp, Imm)) {
+ hasVGPRInput = true;
+ break;
+ } else {
+ // Strictly speaking, this fold would also happen on the next iteration
+ // of the runOnMachineFunction main loop, but doing it now avoids an
+ // extra pass over the function.
+ MachineFunction *MF = MI.getParent()->getParent();
+ Def->getOperand(1).ChangeToImmediate(Imm);
+ Def->addImplicitDefUseOperands(*MF);
+ Def->setDesc(TII->get(SMovOp));
+ }
}
}
@@ -840,4 +865,8 @@ void SIFixSGPRCopies::processPHINode(MachineInstr &MI) {
TII->legalizeOperands(MI, MDT);
}
+ // Propagate register class back to PHI operands which are PHI themselves.
+ while (!PHIOperands.empty()) {
+ processPHINode(*PHIOperands.pop_back_val());
+ }
}
diff --git a/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp b/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
index a0119297b112..8e3402b537b3 100644
--- a/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFixupVectorISel.cpp
@@ -217,6 +217,11 @@ static bool fixupGlobalSaddr(MachineBasicBlock &MBB,
}
bool SIFixupVectorISel::runOnMachineFunction(MachineFunction &MF) {
+ // Only need to run this in the SelectionDAG path.
+ if (MF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+
if (skipFunction(MF.getFunction()))
return false;
diff --git a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
index 2ff8baf29394..ffcf4c30bc70 100644
--- a/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFoldOperands.cpp
@@ -282,6 +282,9 @@ static bool updateOperand(FoldCandidate &Fold,
assert(!Fold.needsShrink() && "not handled");
if (Fold.isImm()) {
+ // FIXME: ChangeToImmediate should probably clear the subreg flags. It's
+ // reinterpreted as TargetFlags.
+ Old.setSubReg(0);
Old.ChangeToImmediate(Fold.ImmToFold);
return true;
}
@@ -612,19 +615,26 @@ void SIFoldOperands::foldOperand(
if (frameIndexMayFold(TII, *UseMI, UseOpIdx, OpToFold)) {
// Sanity check that this is a stack access.
// FIXME: Should probably use stack pseudos before frame lowering.
- MachineOperand *SOff = TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
- if (!SOff->isReg() || (SOff->getReg() != MFI->getScratchWaveOffsetReg() &&
- SOff->getReg() != MFI->getStackPtrOffsetReg()))
- return;
if (TII->getNamedOperand(*UseMI, AMDGPU::OpName::srsrc)->getReg() !=
MFI->getScratchRSrcReg())
return;
+ // Ensure this is either relative to the current frame or the current wave.
+ MachineOperand &SOff =
+ *TII->getNamedOperand(*UseMI, AMDGPU::OpName::soffset);
+ if ((!SOff.isReg() || SOff.getReg() != MFI->getStackPtrOffsetReg()) &&
+ (!SOff.isImm() || SOff.getImm() != 0))
+ return;
+
// A frame index will resolve to a positive constant, so it should always be
// safe to fold the addressing mode, even pre-GFX9.
UseMI->getOperand(UseOpIdx).ChangeToFrameIndex(OpToFold.getIndex());
- SOff->setReg(MFI->getStackPtrOffsetReg());
+
+ // If this is relative to the current wave, update it to be relative to the
+ // current frame.
+ if (SOff.isImm())
+ SOff.ChangeToRegister(MFI->getStackPtrOffsetReg(), false);
return;
}
@@ -907,6 +917,21 @@ static bool evalBinaryInstruction(unsigned Opcode, int32_t &Result,
case AMDGPU::S_XOR_B32:
Result = LHS ^ RHS;
return true;
+ case AMDGPU::S_XNOR_B32:
+ Result = ~(LHS ^ RHS);
+ return true;
+ case AMDGPU::S_NAND_B32:
+ Result = ~(LHS & RHS);
+ return true;
+ case AMDGPU::S_NOR_B32:
+ Result = ~(LHS | RHS);
+ return true;
+ case AMDGPU::S_ANDN2_B32:
+ Result = LHS & ~RHS;
+ return true;
+ case AMDGPU::S_ORN2_B32:
+ Result = LHS | ~RHS;
+ return true;
case AMDGPU::V_LSHL_B32_e64:
case AMDGPU::V_LSHL_B32_e32:
case AMDGPU::S_LSHL_B32:
@@ -1007,10 +1032,16 @@ static bool tryConstantFoldOp(MachineRegisterInfo &MRI,
if (!Src0->isImm() && !Src1->isImm())
return false;
- if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32) {
+ if (MI->getOpcode() == AMDGPU::V_LSHL_OR_B32 ||
+ MI->getOpcode() == AMDGPU::V_LSHL_ADD_U32 ||
+ MI->getOpcode() == AMDGPU::V_AND_OR_B32) {
if (Src0->isImm() && Src0->getImm() == 0) {
// v_lshl_or_b32 0, X, Y -> copy Y
// v_lshl_or_b32 0, X, K -> v_mov_b32 K
+ // v_lshl_add_u32 0, X, Y -> copy Y
+ // v_lshl_add_u32 0, X, K -> v_mov_b32 K
+ // v_and_or_b32 0, X, Y -> copy Y
+ // v_and_or_b32 0, X, K -> v_mov_b32 K
bool UseCopy = TII->getNamedOperand(*MI, AMDGPU::OpName::src2)->isReg();
MI->RemoveOperand(Src1Idx);
MI->RemoveOperand(Src0Idx);
@@ -1381,8 +1412,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_MUL_F32_e64:
case AMDGPU::V_MUL_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32Denormals) ||
- (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16Denormals))
+ if ((Op == AMDGPU::V_MUL_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
+ (Op == AMDGPU::V_MUL_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
const MachineOperand *RegOp = nullptr;
@@ -1411,8 +1442,8 @@ SIFoldOperands::isOMod(const MachineInstr &MI) const {
case AMDGPU::V_ADD_F32_e64:
case AMDGPU::V_ADD_F16_e64: {
// If output denormals are enabled, omod is ignored.
- if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32Denormals) ||
- (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16Denormals))
+ if ((Op == AMDGPU::V_ADD_F32_e64 && MFI->getMode().FP32OutputDenormals) ||
+ (Op == AMDGPU::V_ADD_F16_e64 && MFI->getMode().FP64FP16OutputDenormals))
return std::make_pair(nullptr, SIOutMods::NONE);
// Look through the DAGCombiner canonicalization fmul x, 2 -> fadd x, x
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
index 8364665dda04..a2e802009d09 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp
@@ -24,18 +24,6 @@ using namespace llvm;
#define DEBUG_TYPE "frame-info"
-static ArrayRef<MCPhysReg> getAllSGPR128(const GCNSubtarget &ST,
- const MachineFunction &MF) {
- return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
- ST.getMaxNumSGPRs(MF) / 4);
-}
-
-static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
- const MachineFunction &MF) {
- return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(),
- ST.getMaxNumSGPRs(MF));
-}
-
// Find a scratch register that we can use at the start of the prologue to
// re-align the stack pointer. We avoid using callee-save registers since they
// may appear to be free when this is called from canUseAsPrologue (during
@@ -47,10 +35,10 @@ static ArrayRef<MCPhysReg> getAllSGPRs(const GCNSubtarget &ST,
// but we would then have to make sure that we were in fact saving at least one
// callee-save register in the prologue, which is additional complexity that
// doesn't seem worth the benefit.
-static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
- LivePhysRegs &LiveRegs,
- const TargetRegisterClass &RC,
- bool Unused = false) {
+static MCRegister findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
+ LivePhysRegs &LiveRegs,
+ const TargetRegisterClass &RC,
+ bool Unused = false) {
// Mark callee saved registers as used so we will not choose them.
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
for (unsigned i = 0; CSRegs[i]; ++i)
@@ -59,12 +47,12 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
if (Unused) {
// We are looking for a register that can be used throughout the entire
// function, so any use is unacceptable.
- for (unsigned Reg : RC) {
+ for (MCRegister Reg : RC) {
if (!MRI.isPhysRegUsed(Reg) && LiveRegs.available(MRI, Reg))
return Reg;
}
} else {
- for (unsigned Reg : RC) {
+ for (MCRegister Reg : RC) {
if (LiveRegs.available(MRI, Reg))
return Reg;
}
@@ -76,14 +64,67 @@ static unsigned findScratchNonCalleeSaveRegister(MachineRegisterInfo &MRI,
if (!Unused)
report_fatal_error("failed to find free scratch register");
- return AMDGPU::NoRegister;
+ return MCRegister();
}
-static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
- LivePhysRegs LiveRegs;
- LiveRegs.init(*MRI.getTargetRegisterInfo());
- return findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+static void getVGPRSpillLaneOrTempRegister(MachineFunction &MF,
+ LivePhysRegs &LiveRegs,
+ Register &TempSGPR,
+ Optional<int> &FrameIndex,
+ bool IsFP) {
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+
+#ifndef NDEBUG
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+#endif
+
+ // We need to save and restore the current FP/BP.
+
+ // 1: If there is already a VGPR with free lanes, use it. We
+ // may already have to pay the penalty for spilling a CSR VGPR.
+ if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
+ int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
+ llvm_unreachable("allocate SGPR spill should have worked");
+
+ FrameIndex = NewFI;
+
+ LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << "Spilling " << (IsFP ? "FP" : "BP") << " to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n');
+ return;
+ }
+
+ // 2: Next, try to save the FP/BP in an unused SGPR.
+ TempSGPR = findScratchNonCalleeSaveRegister(
+ MF.getRegInfo(), LiveRegs, AMDGPU::SReg_32_XM0_XEXECRegClass, true);
+
+ if (!TempSGPR) {
+ int NewFI = FrameInfo.CreateStackObject(4, Align(4), true, nullptr,
+ TargetStackID::SGPRSpill);
+
+ if (MFI->allocateSGPRSpillToVGPR(MF, NewFI)) {
+ // 3: There's no free lane to spill, and no free register to save FP/BP,
+ // so we're forced to spill another VGPR to use for the spill.
+ FrameIndex = NewFI;
+ } else {
+ // 4: If all else fails, spill the FP/BP to memory.
+ FrameIndex = FrameInfo.CreateSpillStackObject(4, Align(4));
+ }
+
+ LLVM_DEBUG(auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
+ dbgs() << (IsFP ? "FP" : "BP") << " requires fallback spill to "
+ << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane
+ << '\n';);
+ } else {
+ LLVM_DEBUG(dbgs() << "Saving " << (IsFP ? "FP" : "BP") << " with copy to "
+ << printReg(TempSGPR, TRI) << '\n');
+ }
}
// We need to specially emit stack operations here because a different frame
@@ -91,8 +132,8 @@ static MCPhysReg findUnusedSGPRNonCalleeSaved(MachineRegisterInfo &MRI) {
// use.
static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, unsigned SpillReg,
- unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ const SIInstrInfo *TII, Register SpillReg,
+ Register ScratchRsrcReg, Register SPReg, int FI) {
MachineFunction *MF = MBB.getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
@@ -100,7 +141,7 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore, 4,
- MFI.getObjectAlignment(FI));
+ MFI.getObjectAlign(FI));
if (isUInt<12>(Offset)) {
BuildMI(MBB, I, DebugLoc(), TII->get(AMDGPU::BUFFER_STORE_DWORD_OFFSET))
@@ -139,15 +180,15 @@ static void buildPrologSpill(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const SIInstrInfo *TII, unsigned SpillReg,
- unsigned ScratchRsrcReg, unsigned SPReg, int FI) {
+ const SIInstrInfo *TII, Register SpillReg,
+ Register ScratchRsrcReg, Register SPReg, int FI) {
MachineFunction *MF = MBB.getParent();
MachineFrameInfo &MFI = MF->getFrameInfo();
int64_t Offset = MFI.getObjectOffset(FI);
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad, 4,
- MFI.getObjectAlignment(FI));
+ MFI.getObjectAlign(FI));
if (isUInt<12>(Offset)) {
BuildMI(MBB, I, DebugLoc(),
@@ -184,11 +225,13 @@ static void buildEpilogReload(LivePhysRegs &LiveRegs, MachineBasicBlock &MBB,
.addMemOperand(MMO);
}
-void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
- MachineFunction &MF,
- MachineBasicBlock &MBB) const {
+// Emit flat scratch setup code, assuming `MFI->hasFlatScratchInit()`
+void SIFrameLowering::emitEntryFunctionFlatScratchInit(
+ MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register ScratchWaveOffsetReg) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo* TRI = &TII->getRegisterInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// We don't need this if we only have spills since there is no user facing
@@ -201,11 +244,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
// pointer. Because we only detect if flat instructions are used at all,
// this will be used more often than necessary on VI.
- // Debug location must be unknown since the first debug location is used to
- // determine the end of the prologue.
- DebugLoc DL;
- MachineBasicBlock::iterator I = MBB.begin();
-
Register FlatScratchInitReg =
MFI->getPreloadedReg(AMDGPUFunctionArgInfo::FLAT_SCRATCH_INIT);
@@ -216,8 +254,6 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
Register FlatScrInitLo = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub0);
Register FlatScrInitHi = TRI->getSubReg(FlatScratchInitReg, AMDGPU::sub1);
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
// Do a 64-bit pointer add.
if (ST.flatScratchIsPointer()) {
if (ST.getGeneration() >= AMDGPUSubtarget::GFX10) {
@@ -266,19 +302,22 @@ void SIFrameLowering::emitFlatScratchInit(const GCNSubtarget &ST,
.addImm(8);
}
-unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const {
+// Shift down registers reserved for the scratch RSRC.
+Register SIFrameLowering::getEntryFunctionReservedScratchRsrcReg(
+ MachineFunction &MF) const {
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+
+ assert(MFI->isEntryFunction());
+
+ Register ScratchRsrcReg = MFI->getScratchRSrcReg();
- // We need to insert initialization of the scratch resource descriptor.
- unsigned ScratchRsrcReg = MFI->getScratchRSrcReg();
- if (ScratchRsrcReg == AMDGPU::NoRegister ||
- !MRI.isPhysRegUsed(ScratchRsrcReg))
- return AMDGPU::NoRegister;
+ if (!ScratchRsrcReg || !MRI.isPhysRegUsed(ScratchRsrcReg))
+ return Register();
if (ST.hasSGPRInitBug() ||
ScratchRsrcReg != TRI->reservedPrivateSegmentBufferReg(MF))
@@ -293,18 +332,19 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
// cannot do this for the resources required for scratch access. For now we
// skip over user SGPRs and may leave unused holes.
- // We find the resource first because it has an alignment requirement.
-
unsigned NumPreloaded = (MFI->getNumPreloadedSGPRs() + 3) / 4;
- ArrayRef<MCPhysReg> AllSGPR128s = getAllSGPR128(ST, MF);
+ ArrayRef<MCPhysReg> AllSGPR128s = TRI->getAllSGPR128(MF);
AllSGPR128s = AllSGPR128s.slice(std::min(static_cast<unsigned>(AllSGPR128s.size()), NumPreloaded));
// Skip the last N reserved elements because they should have already been
// reserved for VCC etc.
+ Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
for (MCPhysReg Reg : AllSGPR128s) {
// Pick the first unallocated one. Make sure we don't clobber the other
- // reserved input we needed.
- if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
+ // reserved input we needed. Also for PAL, make sure we don't clobber
+ // the GIT pointer passed in SGPR0 or SGPR8.
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(Reg, GITPtrLoReg)) {
MRI.replaceRegWith(ScratchRsrcReg, Reg);
MFI->setScratchRSrcReg(Reg);
return Reg;
@@ -314,231 +354,138 @@ unsigned SIFrameLowering::getReservedPrivateSegmentBufferReg(
return ScratchRsrcReg;
}
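
The loop above "shifts down" the scratch RSRC by scanning the SGPR128 tuples that follow the preloaded user SGPRs and taking the first one that is unused, allocatable, and does not overlap the GIT pointer. A minimal standalone model of that scan, with plain integers standing in for register tuples (this is not the SIRegisterInfo interface):

#include <cstdio>
#include <vector>

struct Candidate {
  int Reg;              // stand-in for the first register of an SGPR128 tuple
  bool Used;
  bool Allocatable;
  bool AliasesGITPtr;
};

// Returns the first acceptable candidate, or the current reservation if none.
int pickScratchRsrcReg(const std::vector<Candidate> &Cands, int CurrentReg) {
  for (const Candidate &C : Cands)
    if (!C.Used && C.Allocatable && !C.AliasesGITPtr)
      return C.Reg;    // the real code also rewrites all uses to this register
  return CurrentReg;
}

int main() {
  std::vector<Candidate> Cands = {{4, true, true, false},   // already used
                                  {8, false, true, true},   // overlaps GIT ptr
                                  {12, false, true, false}};
  std::printf("picked tuple starting at s%d\n", pickScratchRsrcReg(Cands, 96));
  return 0;
}
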
-// Shift down registers reserved for the scratch wave offset.
-std::pair<unsigned, bool>
-SIFrameLowering::getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI, MachineFunction &MF) const {
- MachineRegisterInfo &MRI = MF.getRegInfo();
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
-
- assert(MFI->isEntryFunction());
-
- // No replacement necessary.
- if (ScratchWaveOffsetReg == AMDGPU::NoRegister ||
- (!hasFP(MF) && !MRI.isPhysRegUsed(ScratchWaveOffsetReg))) {
- return std::make_pair(AMDGPU::NoRegister, false);
- }
-
- if (ST.hasSGPRInitBug())
- return std::make_pair(ScratchWaveOffsetReg, false);
-
- unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
-
- ArrayRef<MCPhysReg> AllSGPRs = getAllSGPRs(ST, MF);
- if (NumPreloaded > AllSGPRs.size())
- return std::make_pair(ScratchWaveOffsetReg, false);
-
- AllSGPRs = AllSGPRs.slice(NumPreloaded);
-
- // We need to drop register from the end of the list that we cannot use
- // for the scratch wave offset.
- // + 2 s102 and s103 do not exist on VI.
- // + 2 for vcc
- // + 2 for xnack_mask
- // + 2 for flat_scratch
- // + 4 for registers reserved for scratch resource register
- // + 1 for register reserved for scratch wave offset. (By excluding this
- //        register from the list to consider, it means that when this
- //        register is being used for the scratch wave offset and there
- //        are no other free SGPRs, then the value will stay in this register.)
- // + 1 if stack pointer is used.
- // ----
- // 13 (+1)
- unsigned ReservedRegCount = 13;
-
- if (AllSGPRs.size() < ReservedRegCount)
- return std::make_pair(ScratchWaveOffsetReg, false);
-
- bool HandledScratchWaveOffsetReg =
- ScratchWaveOffsetReg != TRI->reservedPrivateSegmentWaveByteOffsetReg(MF);
- bool FPAdjusted = false;
-
- for (MCPhysReg Reg : AllSGPRs.drop_back(ReservedRegCount)) {
- // Pick the first unallocated SGPR. Be careful not to pick an alias of the
- // scratch descriptor, since we haven't added its uses yet.
- if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg)) {
- if (!HandledScratchWaveOffsetReg) {
- HandledScratchWaveOffsetReg = true;
-
- MRI.replaceRegWith(ScratchWaveOffsetReg, Reg);
- if (MFI->getScratchWaveOffsetReg() == MFI->getStackPtrOffsetReg()) {
- assert(!hasFP(MF));
- MFI->setStackPtrOffsetReg(Reg);
- }
-
- MFI->setScratchWaveOffsetReg(Reg);
- MFI->setFrameOffsetReg(Reg);
- ScratchWaveOffsetReg = Reg;
- FPAdjusted = true;
- break;
- }
- }
- }
-
- return std::make_pair(ScratchWaveOffsetReg, FPAdjusted);
-}
-
void SIFrameLowering::emitEntryFunctionPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
- SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
-
- // If we only have SGPR spills, we won't actually be using scratch memory
- // since these spill to VGPRs.
- //
- // FIXME: We should be cleaning up these unused SGPR spill frame indices
- // somewhere.
-
- const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
- const SIInstrInfo *TII = ST.getInstrInfo();
- const SIRegisterInfo *TRI = &TII->getRegisterInfo();
- MachineRegisterInfo &MRI = MF.getRegInfo();
- const Function &F = MF.getFunction();
-
- // We need to do the replacement of the private segment buffer and wave offset
- // register even if there are no stack objects. There could be stores to undef
- // or a constant without an associated object.
+ // FIXME: If we only have SGPR spills, we won't actually be using scratch
+ // memory since these spill to VGPRs. We should be cleaning up these unused
+ // SGPR spill frame indices somewhere.
// FIXME: We still have implicit uses on SGPR spill instructions in case they
// need to spill to vector memory. It's likely that will not happen, but at
// this point it appears we need the setup. This part of the prolog should be
// emitted after frame indices are eliminated.
- if (MFI->hasFlatScratchInit())
- emitFlatScratchInit(ST, MF, MBB);
+ // FIXME: Remove all of the isPhysRegUsed checks
- unsigned ScratchRsrcReg
- = getReservedPrivateSegmentBufferReg(ST, TII, TRI, MFI, MF);
+ SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const Function &F = MF.getFunction();
- unsigned ScratchWaveOffsetReg;
- bool FPAdjusted;
- std::tie(ScratchWaveOffsetReg, FPAdjusted) =
- getReservedPrivateSegmentWaveByteOffsetReg(ST, TII, TRI, MFI, MF);
+ assert(MFI->isEntryFunction());
- // We need to insert initialization of the scratch resource descriptor.
Register PreloadedScratchWaveOffsetReg = MFI->getPreloadedReg(
AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
- unsigned PreloadedPrivateBufferReg = AMDGPU::NoRegister;
- if (ST.isAmdHsaOrMesa(F)) {
- PreloadedPrivateBufferReg = MFI->getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
- }
-
- bool OffsetRegUsed = ScratchWaveOffsetReg != AMDGPU::NoRegister &&
- MRI.isPhysRegUsed(ScratchWaveOffsetReg);
- bool ResourceRegUsed = ScratchRsrcReg != AMDGPU::NoRegister &&
- MRI.isPhysRegUsed(ScratchRsrcReg);
-
// FIXME: Hack to not crash in situations which emitted an error.
- if (PreloadedScratchWaveOffsetReg == AMDGPU::NoRegister)
+ if (!PreloadedScratchWaveOffsetReg)
return;
- // We added live-ins during argument lowering, but since they were not used
- // they were deleted. We're adding the uses now, so add them back.
- MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
- MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
-
- if (ResourceRegUsed && PreloadedPrivateBufferReg != AMDGPU::NoRegister) {
- assert(ST.isAmdHsaOrMesa(F) || ST.isMesaGfxShader(F));
- MRI.addLiveIn(PreloadedPrivateBufferReg);
- MBB.addLiveIn(PreloadedPrivateBufferReg);
+ // We need to do the replacement of the private segment buffer register even
+ // if there are no stack objects. There could be stores to undef or a
+ // constant without an associated object.
+ //
+ // This will return `Register()` in cases where there are no actual
+ // uses of the SRSRC.
+ Register ScratchRsrcReg = getEntryFunctionReservedScratchRsrcReg(MF);
+
+ // Make the selected register live throughout the function.
+ if (ScratchRsrcReg) {
+ for (MachineBasicBlock &OtherBB : MF) {
+ if (&OtherBB != &MBB) {
+ OtherBB.addLiveIn(ScratchRsrcReg);
+ }
+ }
}
- // Make the register selected live throughout the function.
- for (MachineBasicBlock &OtherBB : MF) {
- if (&OtherBB == &MBB)
- continue;
-
- if (OffsetRegUsed || FPAdjusted)
- OtherBB.addLiveIn(ScratchWaveOffsetReg);
-
- if (ResourceRegUsed)
- OtherBB.addLiveIn(ScratchRsrcReg);
+ // Now that we have fixed the reserved SRSRC, we need to locate the
+ // (potentially) preloaded SRSRC.
+ Register PreloadedScratchRsrcReg;
+ if (ST.isAmdHsaOrMesa(F)) {
+ PreloadedScratchRsrcReg =
+ MFI->getPreloadedReg(AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_BUFFER);
+ if (ScratchRsrcReg && PreloadedScratchRsrcReg) {
+ // We added live-ins during argument lowering, but since they were not
+ // used they were deleted. We're adding the uses now, so add them back.
+ MRI.addLiveIn(PreloadedScratchRsrcReg);
+ MBB.addLiveIn(PreloadedScratchRsrcReg);
+ }
}
+ // Debug location must be unknown since the first debug location is used to
+ // determine the end of the prologue.
DebugLoc DL;
MachineBasicBlock::iterator I = MBB.begin();
- // If we reserved the original input registers, we don't need to copy to the
- // reserved registers.
-
- bool CopyBuffer = ResourceRegUsed &&
- PreloadedPrivateBufferReg != AMDGPU::NoRegister &&
- ST.isAmdHsaOrMesa(F) &&
- ScratchRsrcReg != PreloadedPrivateBufferReg;
-
- // This needs to be careful of the copying order to avoid overwriting one of
- // the input registers before it's been copied to its final
- // destination. Usually the offset should be copied first.
- bool CopyBufferFirst = TRI->isSubRegisterEq(PreloadedPrivateBufferReg,
- ScratchWaveOffsetReg);
- if (CopyBuffer && CopyBufferFirst) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ // We found the SRSRC first because it needs four registers and has an
+ // alignment requirement. If the SRSRC that we found clobbers the scratch
+ // wave offset, which may be in a fixed SGPR or a free SGPR
+ // chosen by SITargetLowering::allocateSystemSGPRs, COPY the scratch
+ // wave offset to a free SGPR.
+ Register ScratchWaveOffsetReg;
+ if (TRI->isSubRegisterEq(ScratchRsrcReg, PreloadedScratchWaveOffsetReg)) {
+ ArrayRef<MCPhysReg> AllSGPRs = TRI->getAllSGPR32(MF);
+ unsigned NumPreloaded = MFI->getNumPreloadedSGPRs();
+ AllSGPRs = AllSGPRs.slice(
+ std::min(static_cast<unsigned>(AllSGPRs.size()), NumPreloaded));
+ Register GITPtrLoReg = MFI->getGITPtrLoReg(MF);
+ for (MCPhysReg Reg : AllSGPRs) {
+ if (!MRI.isPhysRegUsed(Reg) && MRI.isAllocatable(Reg) &&
+ !TRI->isSubRegisterEq(ScratchRsrcReg, Reg) && GITPtrLoReg != Reg) {
+ ScratchWaveOffsetReg = Reg;
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
+ .addReg(PreloadedScratchWaveOffsetReg, RegState::Kill);
+ break;
+ }
+ }
+ } else {
+ ScratchWaveOffsetReg = PreloadedScratchWaveOffsetReg;
}
+ assert(ScratchWaveOffsetReg);
- unsigned SPReg = MFI->getStackPtrOffsetReg();
- assert(SPReg != AMDGPU::SP_REG);
-
- // FIXME: Remove the isPhysRegUsed checks
- const bool HasFP = hasFP(MF);
-
- if (HasFP || OffsetRegUsed) {
- assert(ScratchWaveOffsetReg);
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchWaveOffsetReg)
- .addReg(PreloadedScratchWaveOffsetReg, HasFP ? RegState::Kill : 0);
+ if (MF.getFrameInfo().hasCalls()) {
+ Register SPReg = MFI->getStackPtrOffsetReg();
+ assert(SPReg != AMDGPU::SP_REG);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), SPReg)
+ .addImm(MF.getFrameInfo().getStackSize() * ST.getWavefrontSize());
}
- if (CopyBuffer && !CopyBufferFirst) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
- .addReg(PreloadedPrivateBufferReg, RegState::Kill);
+ if (hasFP(MF)) {
+ Register FPReg = MFI->getFrameOffsetReg();
+ assert(FPReg != AMDGPU::FP_REG);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_MOV_B32), FPReg).addImm(0);
}
- if (ResourceRegUsed) {
- emitEntryFunctionScratchSetup(ST, MF, MBB, MFI, I,
- PreloadedPrivateBufferReg, ScratchRsrcReg);
+ if (MFI->hasFlatScratchInit() || ScratchRsrcReg) {
+ MRI.addLiveIn(PreloadedScratchWaveOffsetReg);
+ MBB.addLiveIn(PreloadedScratchWaveOffsetReg);
}
- if (HasFP) {
- DebugLoc DL;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- int64_t StackSize = FrameInfo.getStackSize();
+ if (MFI->hasFlatScratchInit()) {
+ emitEntryFunctionFlatScratchInit(MF, MBB, I, DL, ScratchWaveOffsetReg);
+ }
- // On kernel entry, the private scratch wave offset is the SP value.
- if (StackSize == 0) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg());
- } else {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), SPReg)
- .addReg(MFI->getScratchWaveOffsetReg())
- .addImm(StackSize * ST.getWavefrontSize());
- }
+ if (ScratchRsrcReg) {
+ emitEntryFunctionScratchRsrcRegSetup(MF, MBB, I, DL,
+ PreloadedScratchRsrcReg,
+ ScratchRsrcReg, ScratchWaveOffsetReg);
}
}
-// Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
-void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
- MachineFunction &MF, MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
- MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
- unsigned ScratchRsrcReg) const {
+// Emit scratch RSRC setup code, assuming `ScratchRsrcReg != AMDGPU::NoRegister`
+void SIFrameLowering::emitEntryFunctionScratchRsrcRegSetup(
+ MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, Register PreloadedScratchRsrcReg,
+ Register ScratchRsrcReg, Register ScratchWaveOffsetReg) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo *TRI = &TII->getRegisterInfo();
+ const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
const Function &Fn = MF.getFunction();
- DebugLoc DL;
if (ST.isAmdPalOS()) {
// The pointer to the GIT is formed from the offset passed in and either
@@ -557,19 +504,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
const MCInstrDesc &GetPC64 = TII->get(AMDGPU::S_GETPC_B64);
BuildMI(MBB, I, DL, GetPC64, Rsrc01);
}
- auto GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
- if (ST.hasMergedShaders()) {
- switch (MF.getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_HS:
- case CallingConv::AMDGPU_GS:
- // Low GIT address is passed in s8 rather than s0 for an LS+HS or
- // ES+GS merged shader on gfx9+.
- GitPtrLo = AMDGPU::SGPR8;
- break;
- default:
- break;
- }
- }
+ Register GitPtrLo = MFI->getGITPtrLoReg(MF);
MF.getRegInfo().addLiveIn(GitPtrLo);
MBB.addLiveIn(GitPtrLo);
BuildMI(MBB, I, DL, SMovB32, RsrcLo)
@@ -582,12 +517,12 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
const MCInstrDesc &LoadDwordX4 = TII->get(AMDGPU::S_LOAD_DWORDX4_IMM);
auto MMO = MF.getMachineMemOperand(PtrInfo,
MachineMemOperand::MOLoad |
- MachineMemOperand::MOInvariant |
- MachineMemOperand::MODereferenceable,
- 16, 4);
+ MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 16, Align(4));
unsigned Offset = Fn.getCallingConv() == CallingConv::AMDGPU_CS ? 16 : 0;
const GCNSubtarget &Subtarget = MF.getSubtarget<GCNSubtarget>();
- unsigned EncodedOffset = AMDGPU::getSMRDEncodedOffset(Subtarget, Offset);
+ unsigned EncodedOffset = AMDGPU::convertSMRDOffsetUnits(Subtarget, Offset);
BuildMI(MBB, I, DL, LoadDwordX4, ScratchRsrcReg)
.addReg(Rsrc01)
.addImm(EncodedOffset) // offset
@@ -595,10 +530,7 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
.addImm(0) // dlc
.addReg(ScratchRsrcReg, RegState::ImplicitDefine)
.addMemOperand(MMO);
- return;
- }
- if (ST.isMesaGfxShader(Fn)
- || (PreloadedPrivateBufferReg == AMDGPU::NoRegister)) {
+ } else if (ST.isMesaGfxShader(Fn) || !PreloadedScratchRsrcReg) {
assert(!ST.isAmdHsaOrMesa(Fn));
const MCInstrDesc &SMovB32 = TII->get(AMDGPU::S_MOV_B32);
@@ -621,11 +553,11 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
const MCInstrDesc &LoadDwordX2 = TII->get(AMDGPU::S_LOAD_DWORDX2_IMM);
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
- auto MMO = MF.getMachineMemOperand(PtrInfo,
- MachineMemOperand::MOLoad |
- MachineMemOperand::MOInvariant |
- MachineMemOperand::MODereferenceable,
- 8, 4);
+ auto MMO = MF.getMachineMemOperand(
+ PtrInfo,
+ MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant |
+ MachineMemOperand::MODereferenceable,
+ 8, Align(4));
BuildMI(MBB, I, DL, LoadDwordX2, Rsrc01)
.addReg(MFI->getImplicitBufferPtrUserSGPR())
.addImm(0) // offset
@@ -658,7 +590,37 @@ void SIFrameLowering::emitEntryFunctionScratchSetup(const GCNSubtarget &ST,
BuildMI(MBB, I, DL, SMovB32, Rsrc3)
.addImm(Rsrc23 >> 32)
.addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ } else if (ST.isAmdHsaOrMesa(Fn)) {
+ assert(PreloadedScratchRsrcReg);
+
+ if (ScratchRsrcReg != PreloadedScratchRsrcReg) {
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), ScratchRsrcReg)
+ .addReg(PreloadedScratchRsrcReg, RegState::Kill);
+ }
}
+
+ // Add the scratch wave offset into the scratch RSRC.
+ //
+ // We only want to update the first 48 bits, which is the base address
+ // pointer, without touching the adjacent 16 bits of flags. We know this add
+ // cannot carry-out from bit 47, otherwise the scratch allocation would be
+ // impossible to fit in the 48-bit global address space.
+ //
+ // TODO: Evaluate if it is better to just construct an SRD using the flat
+ // scratch init and some constants rather than update the one we are passed.
+ Register ScratchRsrcSub0 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub0);
+ Register ScratchRsrcSub1 = TRI->getSubReg(ScratchRsrcReg, AMDGPU::sub1);
+
+ // We cannot Kill ScratchWaveOffsetReg here because we allow it to be used in
+ // the kernel body via inreg arguments.
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADD_U32), ScratchRsrcSub0)
+ .addReg(ScratchRsrcSub0)
+ .addReg(ScratchWaveOffsetReg)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ADDC_U32), ScratchRsrcSub1)
+ .addReg(ScratchRsrcSub1)
+ .addImm(0)
+ .addReg(ScratchRsrcReg, RegState::ImplicitDefine);
}
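
The closing S_ADD_U32/S_ADDC_U32 pair relies on the fact that the base address occupies only the low 48 bits of the first two SRD dwords, so the carry from the 32-bit add can never reach the 16 flag bits above it. A standalone arithmetic model of that update (plain integers for illustration, not target code):

#include <cassert>
#include <cstdint>
#include <cstdio>

void addWaveOffset(uint32_t &Sub0, uint32_t &Sub1, uint32_t WaveOffset) {
  uint64_t Lo = uint64_t(Sub0) + WaveOffset; // S_ADD_U32: may carry out
  uint32_t Carry = uint32_t(Lo >> 32);       // carry lands in SCC
  Sub0 = uint32_t(Lo);
  Sub1 = Sub1 + Carry;                       // S_ADDC_U32: add the carry only
}

int main() {
  uint32_t Sub0 = 0xFFFFF000u; // low dword of the 48-bit base
  uint32_t Sub1 = 0x80000123u; // bits 32..47 of the base plus 16 flag bits
  addWaveOffset(Sub0, Sub1, 0x2000);
  // The carry only reaches bits 32..47; the flags in bits 48..63 stay intact
  // because a valid scratch base plus offset cannot overflow 48 bits.
  assert((Sub1 >> 16) == (0x80000123u >> 16));
  std::printf("sub0=0x%08x sub1=0x%08x\n", Sub0, Sub1);
  return 0;
}
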
bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
@@ -673,6 +635,50 @@ bool SIFrameLowering::isSupportedStackID(TargetStackID::Value ID) const {
llvm_unreachable("Invalid TargetStackID::Value");
}
+// Activate all lanes; returns the saved exec mask.
+static Register buildScratchExecCopy(LivePhysRegs &LiveRegs,
+ MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ bool IsProlog) {
+ Register ScratchExecCopy;
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ DebugLoc DL;
+
+ if (LiveRegs.empty()) {
+ if (IsProlog) {
+ LiveRegs.init(TRI);
+ LiveRegs.addLiveIns(MBB);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ LiveRegs.removeReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ } else {
+ // In epilog.
+ LiveRegs.init(*ST.getRegisterInfo());
+ LiveRegs.addLiveOuts(MBB);
+ LiveRegs.stepBackward(*MBBI);
+ }
+ }
+
+ ScratchExecCopy = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, *TRI.getWaveMaskRegClass());
+
+ if (!IsProlog)
+ LiveRegs.removeReg(ScratchExecCopy);
+
+ const unsigned OrSaveExec =
+ ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
+ BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy).addImm(-1);
+
+ return ScratchExecCopy;
+}
+
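
buildScratchExecCopy pairs with the S_MOV back into exec later in the prologue and epilogue: S_OR_SAVEEXEC copies the current mask into its destination and then ORs the all-ones immediate into exec, so every lane participates in the VGPR spill or reload. A conceptual model using plain 64-bit integers (not the real instructions or wave-size handling):

#include <cassert>
#include <cstdint>

uint64_t Exec = 0x00000000000000FFull; // only 8 lanes active on entry

uint64_t orSaveExec(uint64_t Src) {    // ~ S_OR_SAVEEXEC_B64 dst, src
  uint64_t Saved = Exec;
  Exec |= Src;
  return Saved;
}

int main() {
  uint64_t ScratchExecCopy = orSaveExec(~0ull); // addImm(-1): all lanes on
  assert(Exec == ~0ull);                        // spill/reload happens here
  Exec = ScratchExecCopy;                       // ~ S_MOV_B64 exec, copy
  assert(Exec == 0x00000000000000FFull);
  return 0;
}
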
void SIFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
@@ -687,51 +693,81 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
const SIInstrInfo *TII = ST.getInstrInfo();
const SIRegisterInfo &TRI = TII->getRegisterInfo();
- unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
- unsigned FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register BasePtrReg =
+ TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
LivePhysRegs LiveRegs;
MachineBasicBlock::iterator MBBI = MBB.begin();
DebugLoc DL;
bool HasFP = false;
+ bool HasBP = false;
uint32_t NumBytes = MFI.getStackSize();
uint32_t RoundedSize = NumBytes;
// To avoid clobbering VGPRs in lanes that weren't active on function entry,
// turn on all lanes before doing the spill to memory.
- unsigned ScratchExecCopy = AMDGPU::NoRegister;
+ Register ScratchExecCopy;
+
+ bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
+ bool SpillFPToMemory = false;
+ // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+ // Otherwise we are spilling the FP to memory.
+ if (HasFPSaveIndex) {
+ SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
+
+ bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+ bool SpillBPToMemory = false;
+ // A StackID of SGPRSpill implies that this is a spill from SGPR to VGPR.
+ // Otherwise we are spilling the BP to memory.
+ if (HasBPSaveIndex) {
+ SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
// Emit the copy if we need an FP, and are using a free SGPR to save it.
- if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
+ if (FuncInfo->SGPRForFPSaveRestoreCopy) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->SGPRForFPSaveRestoreCopy)
.addReg(FramePtrReg)
.setMIFlag(MachineInstr::FrameSetup);
}
+ // Emit the copy if we need a BP, and are using a free SGPR to save it.
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY),
+ FuncInfo->SGPRForBPSaveRestoreCopy)
+ .addReg(BasePtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // If a copy has been emitted for FP and/or BP, make the SGPRs
+ // used in the copy instructions live throughout the function.
+ SmallVector<MCPhysReg, 2> TempSGPRs;
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForFPSaveRestoreCopy);
+
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ TempSGPRs.push_back(FuncInfo->SGPRForBPSaveRestoreCopy);
+
+ if (!TempSGPRs.empty()) {
+ for (MachineBasicBlock &MBB : MF) {
+ for (MCPhysReg Reg : TempSGPRs)
+ MBB.addLiveIn(Reg);
+
+ MBB.sortUniqueLiveIns();
+ }
+ }
+
for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
: FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
- if (ScratchExecCopy == AMDGPU::NoRegister) {
- if (LiveRegs.empty()) {
- LiveRegs.init(TRI);
- LiveRegs.addLiveIns(MBB);
- if (FuncInfo->SGPRForFPSaveRestoreCopy)
- LiveRegs.removeReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- }
-
- ScratchExecCopy
- = findScratchNonCalleeSaveRegister(MRI, LiveRegs,
- *TRI.getWaveMaskRegClass());
- assert(FuncInfo->SGPRForFPSaveRestoreCopy != ScratchExecCopy);
-
- const unsigned OrSaveExec = ST.isWave32() ?
- AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec),
- ScratchExecCopy)
- .addImm(-1);
- }
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
buildPrologSpill(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
FuncInfo->getScratchRSrcReg(),
@@ -739,84 +775,153 @@ void SIFrameLowering::emitPrologue(MachineFunction &MF,
Reg.FI.getValue());
}
- if (ScratchExecCopy != AMDGPU::NoRegister) {
+ if (HasFPSaveIndex && SpillFPToMemory) {
+ assert(!MFI.isDeadObjectIndex(FuncInfo->FramePointerSaveIndex.getValue()));
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(FramePtrReg);
+
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ FuncInfo->FramePointerSaveIndex.getValue());
+ }
+
+ if (HasBPSaveIndex && SpillBPToMemory) {
+ assert(!MFI.isDeadObjectIndex(*FuncInfo->BasePointerSaveIndex));
+
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, true);
+
+ MCPhysReg TmpVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
+ .addReg(BasePtrReg);
+
+ buildPrologSpill(LiveRegs, MBB, MBBI, TII, TmpVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ *FuncInfo->BasePointerSaveIndex);
+ }
+
+ if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy, RegState::Kill);
+ .addReg(ScratchExecCopy, RegState::Kill);
LiveRegs.addReg(ScratchExecCopy);
}
-
- if (FuncInfo->FramePointerSaveIndex) {
+ // In this case, spill the FP to a reserved VGPR.
+ if (HasFPSaveIndex && !SpillFPToMemory) {
const int FI = FuncInfo->FramePointerSaveIndex.getValue();
- assert(!MFI.isDeadObjectIndex(FI) &&
- MFI.getStackID(FI) == TargetStackID::SGPRSpill);
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
- = FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(!MFI.isDeadObjectIndex(FI));
+
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FI);
assert(Spill.size() == 1);
// Save FP before setting it up.
// FIXME: This should respect spillSGPRToVGPR;
BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
Spill[0].VGPR)
- .addReg(FramePtrReg)
- .addImm(Spill[0].Lane)
- .addReg(Spill[0].VGPR, RegState::Undef);
+ .addReg(FramePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
+ }
+
+ // In this case, spill the BP to a reserved VGPR.
+ if (HasBPSaveIndex && !SpillBPToMemory) {
+ const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+
+ // Save BP before setting it up.
+ // FIXME: This should respect spillSGPRToVGPR;
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ Spill[0].VGPR)
+ .addReg(BasePtrReg)
+ .addImm(Spill[0].Lane)
+ .addReg(Spill[0].VGPR, RegState::Undef);
}
if (TRI.needsStackRealignment(MF)) {
HasFP = true;
- const unsigned Alignment = MFI.getMaxAlignment();
+ const unsigned Alignment = MFI.getMaxAlign().value();
RoundedSize += Alignment;
if (LiveRegs.empty()) {
LiveRegs.init(TRI);
LiveRegs.addLiveIns(MBB);
LiveRegs.addReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ LiveRegs.addReg(FuncInfo->SGPRForBPSaveRestoreCopy);
}
- unsigned ScratchSPReg = findScratchNonCalleeSaveRegister(
+ Register ScratchSPReg = findScratchNonCalleeSaveRegister(
MRI, LiveRegs, AMDGPU::SReg_32_XM0RegClass);
- assert(ScratchSPReg != AMDGPU::NoRegister &&
- ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy);
+ assert(ScratchSPReg && ScratchSPReg != FuncInfo->SGPRForFPSaveRestoreCopy &&
+ ScratchSPReg != FuncInfo->SGPRForBPSaveRestoreCopy);
// s_add_u32 tmp_reg, s32, NumBytes
// s_and_b32 s32, tmp_reg, 0b111...0000
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), ScratchSPReg)
- .addReg(StackPtrReg)
- .addImm((Alignment - 1) * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(StackPtrReg)
+ .addImm((Alignment - 1) * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_AND_B32), FramePtrReg)
- .addReg(ScratchSPReg, RegState::Kill)
- .addImm(-Alignment * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(ScratchSPReg, RegState::Kill)
+ .addImm(-Alignment * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
FuncInfo->setIsStackRealigned(true);
} else if ((HasFP = hasFP(MF))) {
- // If we need a base pointer, set it up here. It's whatever the value of
- // the stack pointer is at this point. Any variable size objects will be
- // allocated after this, so we can still use the base pointer to reference
- // locals.
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
- .addReg(StackPtrReg)
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // If we need a base pointer, set it up here. It's whatever the value of
+ // the stack pointer is at this point. Any variable size objects will be
+ // allocated after this, so we can still use the base pointer to reference
+ // the incoming arguments.
+ if ((HasBP = TRI.hasBasePointer(MF))) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+ .addReg(StackPtrReg)
+ .setMIFlag(MachineInstr::FrameSetup);
}
if (HasFP && RoundedSize != 0) {
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_ADD_U32), StackPtrReg)
- .addReg(StackPtrReg)
- .addImm(RoundedSize * ST.getWavefrontSize())
- .setMIFlag(MachineInstr::FrameSetup);
+ .addReg(StackPtrReg)
+ .addImm(RoundedSize * ST.getWavefrontSize())
+ .setMIFlag(MachineInstr::FrameSetup);
}
- assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister ||
+ assert((!HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy ||
FuncInfo->FramePointerSaveIndex)) &&
"Needed to save FP but didn't save it anywhere");
- assert((HasFP || (FuncInfo->SGPRForFPSaveRestoreCopy == AMDGPU::NoRegister &&
+ assert((HasFP || (!FuncInfo->SGPRForFPSaveRestoreCopy &&
!FuncInfo->FramePointerSaveIndex)) &&
"Saved FP but didn't need it");
+
+ assert((!HasBP || (FuncInfo->SGPRForBPSaveRestoreCopy ||
+ FuncInfo->BasePointerSaveIndex)) &&
+ "Needed to save BP but didn't save it anywhere");
+
+ assert((HasBP || (!FuncInfo->SGPRForBPSaveRestoreCopy &&
+ !FuncInfo->BasePointerSaveIndex)) &&
+ "Saved BP but didn't need it");
}
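
The realignment sequence in the prologue above (s_add_u32 tmp, s32, (Align-1)*WaveSize followed by s_and_b32 fp, tmp, -Align*WaveSize) is a standard round-up-to-power-of-two: it bumps the stack pointer past the next aligned boundary and then masks off the low bits. A small standalone check of that arithmetic, assuming a 64-lane wave and power-of-two alignments:

#include <cassert>
#include <cstdint>

uint32_t realignFP(uint32_t SP, uint32_t Alignment, uint32_t WaveSize) {
  uint32_t Tmp = SP + (Alignment - 1) * WaveSize; // s_add_u32
  return Tmp & ~(Alignment * WaveSize - 1);       // s_and_b32 with -Align*Wave
}

int main() {
  // With a 64-lane wave and 16-byte alignment the FP lands on a 1024-byte
  // (16 * 64) boundary at or above the incoming SP.
  assert(realignFP(1000, 16, 64) == 1024);
  assert(realignFP(1024, 16, 64) == 1024);
  return 0;
}
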
void SIFrameLowering::emitEpilogue(MachineFunction &MF,
@@ -828,81 +933,126 @@ void SIFrameLowering::emitEpilogue(MachineFunction &MF,
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIInstrInfo *TII = ST.getInstrInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ const SIRegisterInfo &TRI = TII->getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
LivePhysRegs LiveRegs;
DebugLoc DL;
const MachineFrameInfo &MFI = MF.getFrameInfo();
uint32_t NumBytes = MFI.getStackSize();
- uint32_t RoundedSize = FuncInfo->isStackRealigned() ?
- NumBytes + MFI.getMaxAlignment() : NumBytes;
+ uint32_t RoundedSize = FuncInfo->isStackRealigned()
+ ? NumBytes + MFI.getMaxAlign().value()
+ : NumBytes;
+ const Register StackPtrReg = FuncInfo->getStackPtrOffsetReg();
+ const Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ const Register BasePtrReg =
+ TRI.hasBasePointer(MF) ? TRI.getBaseRegister() : Register();
+
+ bool HasFPSaveIndex = FuncInfo->FramePointerSaveIndex.hasValue();
+ bool SpillFPToMemory = false;
+ if (HasFPSaveIndex) {
+ SpillFPToMemory = MFI.getStackID(*FuncInfo->FramePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
+
+ bool HasBPSaveIndex = FuncInfo->BasePointerSaveIndex.hasValue();
+ bool SpillBPToMemory = false;
+ if (HasBPSaveIndex) {
+ SpillBPToMemory = MFI.getStackID(*FuncInfo->BasePointerSaveIndex) !=
+ TargetStackID::SGPRSpill;
+ }
if (RoundedSize != 0 && hasFP(MF)) {
- const unsigned StackPtrReg = FuncInfo->getStackPtrOffsetReg();
BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::S_SUB_U32), StackPtrReg)
.addReg(StackPtrReg)
.addImm(RoundedSize * ST.getWavefrontSize())
.setMIFlag(MachineInstr::FrameDestroy);
}
- if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister) {
- BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FuncInfo->getFrameOffsetReg())
- .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
- .setMIFlag(MachineInstr::FrameSetup);
+ if (FuncInfo->SGPRForFPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), FramePtrReg)
+ .addReg(FuncInfo->SGPRForFPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
}
- if (FuncInfo->FramePointerSaveIndex) {
- const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ if (FuncInfo->SGPRForBPSaveRestoreCopy) {
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::COPY), BasePtrReg)
+ .addReg(FuncInfo->SGPRForBPSaveRestoreCopy)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
- assert(!MF.getFrameInfo().isDeadObjectIndex(FI) &&
- MF.getFrameInfo().getStackID(FI) == TargetStackID::SGPRSpill);
+ Register ScratchExecCopy;
+ if (HasFPSaveIndex) {
+ const int FI = FuncInfo->FramePointerSaveIndex.getValue();
+ assert(!MFI.isDeadObjectIndex(FI));
+ if (SpillFPToMemory) {
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg, FI);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), FramePtrReg)
+ .addReg(TempVGPR, RegState::Kill);
+ } else {
+ // Reload from VGPR spill.
+ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(FI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ FramePtrReg)
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
+ }
- ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill
- = FuncInfo->getSGPRToVGPRSpills(FI);
- assert(Spill.size() == 1);
- BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
- FuncInfo->getFrameOffsetReg())
- .addReg(Spill[0].VGPR)
- .addImm(Spill[0].Lane);
+ if (HasBPSaveIndex) {
+ const int BasePtrFI = *FuncInfo->BasePointerSaveIndex;
+ assert(!MFI.isDeadObjectIndex(BasePtrFI));
+ if (SpillBPToMemory) {
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
+
+ MCPhysReg TempVGPR = findScratchNonCalleeSaveRegister(
+ MRI, LiveRegs, AMDGPU::VGPR_32RegClass);
+ buildEpilogReload(LiveRegs, MBB, MBBI, TII, TempVGPR,
+ FuncInfo->getScratchRSrcReg(), StackPtrReg, BasePtrFI);
+ BuildMI(MBB, MBBI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), BasePtrReg)
+ .addReg(TempVGPR, RegState::Kill);
+ } else {
+ // Reload from VGPR spill.
+ assert(MFI.getStackID(BasePtrFI) == TargetStackID::SGPRSpill);
+ ArrayRef<SIMachineFunctionInfo::SpilledReg> Spill =
+ FuncInfo->getSGPRToVGPRSpills(BasePtrFI);
+ assert(Spill.size() == 1);
+ BuildMI(MBB, MBBI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ BasePtrReg)
+ .addReg(Spill[0].VGPR)
+ .addImm(Spill[0].Lane);
+ }
}
- unsigned ScratchExecCopy = AMDGPU::NoRegister;
- for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg
- : FuncInfo->getSGPRSpillVGPRs()) {
+ for (const SIMachineFunctionInfo::SGPRSpillVGPRCSR &Reg :
+ FuncInfo->getSGPRSpillVGPRs()) {
if (!Reg.FI.hasValue())
continue;
- const SIRegisterInfo &TRI = TII->getRegisterInfo();
- if (ScratchExecCopy == AMDGPU::NoRegister) {
- // See emitPrologue
- if (LiveRegs.empty()) {
- LiveRegs.init(*ST.getRegisterInfo());
- LiveRegs.addLiveOuts(MBB);
- LiveRegs.stepBackward(*MBBI);
- }
-
- ScratchExecCopy = findScratchNonCalleeSaveRegister(
- MRI, LiveRegs, *TRI.getWaveMaskRegClass());
- LiveRegs.removeReg(ScratchExecCopy);
-
- const unsigned OrSaveExec =
- ST.isWave32() ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64;
-
- BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), ScratchExecCopy)
- .addImm(-1);
- }
+ if (!ScratchExecCopy)
+ ScratchExecCopy = buildScratchExecCopy(LiveRegs, MF, MBB, MBBI, false);
buildEpilogReload(LiveRegs, MBB, MBBI, TII, Reg.VGPR,
- FuncInfo->getScratchRSrcReg(),
- FuncInfo->getStackPtrOffsetReg(), Reg.FI.getValue());
+ FuncInfo->getScratchRSrcReg(), StackPtrReg,
+ Reg.FI.getValue());
}
- if (ScratchExecCopy != AMDGPU::NoRegister) {
+ if (ScratchExecCopy) {
// FIXME: Split block and make terminator.
unsigned ExecMov = ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ MCRegister Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
BuildMI(MBB, MBBI, DL, TII->get(ExecMov), Exec)
- .addReg(ScratchExecCopy, RegState::Kill);
+ .addReg(ScratchExecCopy, RegState::Kill);
}
}
@@ -920,12 +1070,14 @@ static bool allStackObjectsAreDead(const MachineFrameInfo &MFI) {
#ifndef NDEBUG
static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
- Optional<int> FramePointerSaveIndex) {
+ Optional<int> FramePointerSaveIndex,
+ Optional<int> BasePointerSaveIndex) {
for (int I = MFI.getObjectIndexBegin(), E = MFI.getObjectIndexEnd();
I != E; ++I) {
if (!MFI.isDeadObjectIndex(I) &&
MFI.getStackID(I) == TargetStackID::SGPRSpill &&
- FramePointerSaveIndex && I != FramePointerSaveIndex) {
+ ((FramePointerSaveIndex && I != FramePointerSaveIndex) ||
+ (BasePointerSaveIndex && I != BasePointerSaveIndex))) {
return false;
}
}
@@ -935,7 +1087,7 @@ static bool allSGPRSpillsAreDead(const MachineFrameInfo &MFI,
#endif
int SIFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const SIRegisterInfo *RI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
FrameReg = RI->getFrameRegister(MF);
@@ -952,7 +1104,7 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
FuncInfo->removeDeadFrameIndices(MFI);
- assert(allSGPRSpillsAreDead(MFI, None) &&
+ assert(allSGPRSpillsAreDead(MFI, None, None) &&
"SGPR spill should have been removed in SILowerSGPRSpills");
// FIXME: The other checks should be redundant with allStackObjectsAreDead,
@@ -967,9 +1119,8 @@ void SIFrameLowering::processFunctionBeforeFrameFinalized(
RS->addScavengingFrameIndex(ScavengeFI);
} else {
int ScavengeFI = MFI.CreateStackObject(
- TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
- TRI->getSpillAlignment(AMDGPU::SGPR_32RegClass),
- false);
+ TRI->getSpillSize(AMDGPU::SGPR_32RegClass),
+ TRI->getSpillAlign(AMDGPU::SGPR_32RegClass), false);
RS->addScavengingFrameIndex(ScavengeFI);
}
}
@@ -984,7 +1135,7 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (MFI->isEntryFunction())
return;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
const SIRegisterInfo *TRI = ST.getRegisterInfo();
@@ -1008,46 +1159,19 @@ void SIFrameLowering::determineCalleeSaves(MachineFunction &MF,
for (auto SSpill : MFI->getSGPRSpillVGPRs())
SavedVGPRs.reset(SSpill.VGPR);
- const bool HasFP = WillHaveFP || hasFP(MF);
- if (!HasFP)
- return;
-
- if (MFI->haveFreeLanesForSGPRSpill(MF, 1)) {
- int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
- TargetStackID::SGPRSpill);
-
- // If there is already a VGPR with free lanes, use it. We may already have
- // to pay the penalty for spilling a CSR VGPR.
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
- llvm_unreachable("allocate SGPR spill should have worked");
-
- MFI->FramePointerSaveIndex = NewFI;
+ LivePhysRegs LiveRegs;
+ LiveRegs.init(*TRI);
- LLVM_DEBUG(
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "Spilling FP to " << printReg(Spill.VGPR, TRI)
- << ':' << Spill.Lane << '\n');
- return;
+ if (WillHaveFP || hasFP(MF)) {
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForFPSaveRestoreCopy,
+ MFI->FramePointerSaveIndex, true);
}
- MFI->SGPRForFPSaveRestoreCopy = findUnusedSGPRNonCalleeSaved(MF.getRegInfo());
-
- if (!MFI->SGPRForFPSaveRestoreCopy) {
- // There's no free lane to spill, and no free register to save FP, so we're
- // forced to spill another VGPR to use for the spill.
- int NewFI = MF.getFrameInfo().CreateStackObject(4, 4, true, nullptr,
- TargetStackID::SGPRSpill);
- if (!MFI->allocateSGPRSpillToVGPR(MF, NewFI))
- llvm_unreachable("allocate SGPR spill should have worked");
- MFI->FramePointerSaveIndex = NewFI;
-
- LLVM_DEBUG(
- auto Spill = MFI->getSGPRToVGPRSpills(NewFI).front();
- dbgs() << "FP requires fallback spill to " << printReg(Spill.VGPR, TRI)
- << ':' << Spill.Lane << '\n';);
- } else {
- LLVM_DEBUG(dbgs() << "Saving FP with copy to " <<
- printReg(MFI->SGPRForFPSaveRestoreCopy, TRI) << '\n');
+ if (TRI->hasBasePointer(MF)) {
+ if (MFI->SGPRForFPSaveRestoreCopy)
+ LiveRegs.addReg(MFI->SGPRForFPSaveRestoreCopy);
+ getVGPRSpillLaneOrTempRegister(MF, LiveRegs, MFI->SGPRForBPSaveRestoreCopy,
+ MFI->BasePointerSaveIndex, false);
}
}
@@ -1074,14 +1198,31 @@ bool SIFrameLowering::assignCalleeSavedSpillSlots(
return true; // Early exit if no callee saved registers are modified!
const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
- if (!FuncInfo->SGPRForFPSaveRestoreCopy)
+ if (!FuncInfo->SGPRForFPSaveRestoreCopy &&
+ !FuncInfo->SGPRForBPSaveRestoreCopy)
return false;
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *RI = ST.getRegisterInfo();
+ Register FramePtrReg = FuncInfo->getFrameOffsetReg();
+ Register BasePtrReg = RI->getBaseRegister();
+ unsigned NumModifiedRegs = 0;
+
+ if (FuncInfo->SGPRForFPSaveRestoreCopy)
+ NumModifiedRegs++;
+ if (FuncInfo->SGPRForBPSaveRestoreCopy)
+ NumModifiedRegs++;
+
for (auto &CS : CSI) {
- if (CS.getReg() == FuncInfo->getFrameOffsetReg()) {
- if (FuncInfo->SGPRForFPSaveRestoreCopy != AMDGPU::NoRegister)
- CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
- break;
+ if (CS.getReg() == FramePtrReg && FuncInfo->SGPRForFPSaveRestoreCopy) {
+ CS.setDstReg(FuncInfo->SGPRForFPSaveRestoreCopy);
+ if (--NumModifiedRegs)
+ break;
+ } else if (CS.getReg() == BasePtrReg &&
+ FuncInfo->SGPRForBPSaveRestoreCopy) {
+ CS.setDstReg(FuncInfo->SGPRForBPSaveRestoreCopy);
+ if (--NumModifiedRegs)
+ break;
}
}
@@ -1104,12 +1245,10 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
uint64_t CalleePopAmount = IsDestroy ? I->getOperand(1).getImm() : 0;
if (!hasReservedCallFrame(MF)) {
- unsigned Align = getStackAlignment();
-
- Amount = alignTo(Amount, Align);
+ Amount = alignTo(Amount, getStackAlign());
assert(isUInt<32>(Amount) && "exceeded stack address space size");
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned SPReg = MFI->getStackPtrOffsetReg();
+ Register SPReg = MFI->getStackPtrOffsetReg();
unsigned Op = IsDestroy ? AMDGPU::S_SUB_U32 : AMDGPU::S_ADD_U32;
BuildMI(MBB, I, DL, TII->get(Op), SPReg)
@@ -1124,19 +1263,17 @@ MachineBasicBlock::iterator SIFrameLowering::eliminateCallFramePseudoInstr(
bool SIFrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- if (MFI.hasCalls()) {
+
+ // For entry functions we can use an immediate offset in most cases, so the
+ // presence of calls doesn't imply we need a distinct frame pointer.
+ if (MFI.hasCalls() &&
+ !MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction()) {
// All offsets are unsigned, so need to be addressed in the same direction
// as stack growth.
// FIXME: This function is pretty broken, since it can be called before the
// frame layout is determined or CSR spills are inserted.
- if (MFI.getStackSize() != 0)
- return true;
-
- // For the entry point, the input wave scratch offset must be copied to the
- // API SP if there are calls.
- if (MF.getInfo<SIMachineFunctionInfo>()->isEntryFunction())
- return true;
+ return MFI.getStackSize() != 0;
}
return MFI.hasVarSizedObjects() || MFI.isFrameAddressTaken() ||
diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
index d9970fd6b4b8..e89432040661 100644
--- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h
@@ -21,7 +21,7 @@ class GCNSubtarget;
class SIFrameLowering final : public AMDGPUFrameLowering {
public:
SIFrameLowering(StackDirection D, Align StackAl, int LAO,
- Align TransAl = Align::None())
+ Align TransAl = Align(1))
: AMDGPUFrameLowering(D, StackAl, LAO, TransAl) {}
~SIFrameLowering() override = default;
@@ -32,7 +32,7 @@ public:
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
@@ -55,26 +55,19 @@ public:
MachineBasicBlock::iterator MI) const override;
private:
- void emitFlatScratchInit(const GCNSubtarget &ST,
- MachineFunction &MF,
- MachineBasicBlock &MBB) const;
-
- unsigned getReservedPrivateSegmentBufferReg(
- const GCNSubtarget &ST,
- const SIInstrInfo *TII,
- const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI,
- MachineFunction &MF) const;
-
- std::pair<unsigned, bool> getReservedPrivateSegmentWaveByteOffsetReg(
- const GCNSubtarget &ST, const SIInstrInfo *TII, const SIRegisterInfo *TRI,
- SIMachineFunctionInfo *MFI, MachineFunction &MF) const;
-
- // Emit scratch setup code for AMDPAL or Mesa, assuming ResourceRegUsed is set.
- void emitEntryFunctionScratchSetup(const GCNSubtarget &ST, MachineFunction &MF,
- MachineBasicBlock &MBB, SIMachineFunctionInfo *MFI,
- MachineBasicBlock::iterator I, unsigned PreloadedPrivateBufferReg,
- unsigned ScratchRsrcReg) const;
+ void emitEntryFunctionFlatScratchInit(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ const DebugLoc &DL,
+ Register ScratchWaveOffsetReg) const;
+
+ Register getEntryFunctionReservedScratchRsrcReg(MachineFunction &MF) const;
+
+ void emitEntryFunctionScratchRsrcRegSetup(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ Register PreloadedPrivateBufferReg, Register ScratchRsrcReg,
+ Register ScratchWaveOffsetReg) const;
public:
bool hasFP(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index e73d87cd66af..d035aa8f72bd 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -11,11 +11,6 @@
//
//===----------------------------------------------------------------------===//
-#if defined(_MSC_VER) || defined(__MINGW32__)
-// Provide M_PI.
-#define _USE_MATH_DEFINES
-#endif
-
#include "SIISelLowering.h"
#include "AMDGPU.h"
#include "AMDGPUSubtarget.h"
@@ -40,6 +35,7 @@
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/DAGCombine.h"
#include "llvm/CodeGen/ISDOpcodes.h"
+#include "llvm/CodeGen/GlobalISel/GISelKnownBits.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -95,14 +91,24 @@ static cl::opt<bool> DisableLoopAlignment(
cl::desc("Do not align and prefetch loops"),
cl::init(false));
+static cl::opt<bool> VGPRReserveforSGPRSpill(
+ "amdgpu-reserve-vgpr-for-sgpr-spill",
+ cl::desc("Allocates one VGPR for future SGPR Spill"), cl::init(true));
+
+static cl::opt<bool> UseDivergentRegisterIndexing(
+ "amdgpu-use-divergent-register-indexing",
+ cl::Hidden,
+ cl::desc("Use indirect register addressing for divergent indexes"),
+ cl::init(false));
+
static bool hasFP32Denormals(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return Info->getMode().FP32Denormals;
+ return Info->getMode().allFP32Denormals();
}
static bool hasFP64FP16Denormals(const MachineFunction &MF) {
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- return Info->getMode().FP64FP16Denormals;
+ return Info->getMode().allFP64FP16Denormals();
}
static unsigned findFirstFreeSGPR(CCState &CCInfo) {
@@ -141,12 +147,21 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v5i32, &AMDGPU::SGPR_160RegClass);
addRegisterClass(MVT::v5f32, &AMDGPU::VReg_160RegClass);
- addRegisterClass(MVT::v8i32, &AMDGPU::SReg_256RegClass);
+ addRegisterClass(MVT::v8i32, &AMDGPU::SGPR_256RegClass);
addRegisterClass(MVT::v8f32, &AMDGPU::VReg_256RegClass);
- addRegisterClass(MVT::v16i32, &AMDGPU::SReg_512RegClass);
+ addRegisterClass(MVT::v4i64, &AMDGPU::SGPR_256RegClass);
+ addRegisterClass(MVT::v4f64, &AMDGPU::VReg_256RegClass);
+
+ addRegisterClass(MVT::v16i32, &AMDGPU::SGPR_512RegClass);
addRegisterClass(MVT::v16f32, &AMDGPU::VReg_512RegClass);
+ addRegisterClass(MVT::v8i64, &AMDGPU::SGPR_512RegClass);
+ addRegisterClass(MVT::v8f64, &AMDGPU::VReg_512RegClass);
+
+ addRegisterClass(MVT::v16i64, &AMDGPU::SGPR_1024RegClass);
+ addRegisterClass(MVT::v16f64, &AMDGPU::VReg_1024RegClass);
+
if (Subtarget->has16BitInsts()) {
addRegisterClass(MVT::i16, &AMDGPU::SReg_32RegClass);
addRegisterClass(MVT::f16, &AMDGPU::SReg_32RegClass);
@@ -158,10 +173,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
addRegisterClass(MVT::v4f16, &AMDGPU::SReg_64RegClass);
}
- if (Subtarget->hasMAIInsts()) {
- addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
- addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
- }
+ addRegisterClass(MVT::v32i32, &AMDGPU::VReg_1024RegClass);
+ addRegisterClass(MVT::v32f32, &AMDGPU::VReg_1024RegClass);
computeRegisterProperties(Subtarget->getRegisterInfo());
@@ -202,6 +215,17 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::v8i32, MVT::v8i8, Expand);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Expand);
setTruncStoreAction(MVT::v32i32, MVT::v32i8, Expand);
+ setTruncStoreAction(MVT::v2i16, MVT::v2i8, Expand);
+ setTruncStoreAction(MVT::v4i16, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i16, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v16i16, MVT::v16i8, Expand);
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Expand);
+
+ setTruncStoreAction(MVT::v4i64, MVT::v4i8, Expand);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i8, Expand);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i16, Expand);
+ setTruncStoreAction(MVT::v8i64, MVT::v8i32, Expand);
+ setTruncStoreAction(MVT::v16i64, MVT::v16i32, Expand);
setOperationAction(ISD::GlobalAddress, MVT::i32, Custom);
setOperationAction(ISD::GlobalAddress, MVT::i64, Custom);
@@ -224,6 +248,12 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRUNCATE, MVT::v2i32, Expand);
setOperationAction(ISD::FP_ROUND, MVT::v2f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v4i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v4f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v8f32, Expand);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i32, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::v16f32, Expand);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v2i1, Custom);
setOperationAction(ISD::SIGN_EXTEND_INREG, MVT::v4i1, Custom);
@@ -260,7 +290,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// with > 4 elements.
for (MVT VT : { MVT::v8i32, MVT::v8f32, MVT::v16i32, MVT::v16f32,
MVT::v2i64, MVT::v2f64, MVT::v4i16, MVT::v4f16,
- MVT::v32i32, MVT::v32f32 }) {
+ MVT::v4i64, MVT::v4f64, MVT::v8i64, MVT::v8f64,
+ MVT::v16i64, MVT::v16f64, MVT::v32i32, MVT::v32f32 }) {
for (unsigned Op = 0; Op < ISD::BUILTIN_OP_END; ++Op) {
switch (Op) {
case ISD::LOAD:
@@ -304,6 +335,48 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v4i32);
}
+ for (MVT Vec64 : { MVT::v4i64, MVT::v4f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v8i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v8i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v8i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v8i32);
+ }
+
+ for (MVT Vec64 : { MVT::v8i64, MVT::v8f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v16i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v16i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v16i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v16i32);
+ }
+
+ for (MVT Vec64 : { MVT::v16i64, MVT::v16f64 }) {
+ setOperationAction(ISD::BUILD_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::BUILD_VECTOR, Vec64, MVT::v32i32);
+
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::EXTRACT_VECTOR_ELT, Vec64, MVT::v32i32);
+
+ setOperationAction(ISD::INSERT_VECTOR_ELT, Vec64, Promote);
+ AddPromotedToType(ISD::INSERT_VECTOR_ELT, Vec64, MVT::v32i32);
+
+ setOperationAction(ISD::SCALAR_TO_VECTOR, Vec64, Promote);
+ AddPromotedToType(ISD::SCALAR_TO_VECTOR, Vec64, MVT::v32i32);
+ }
+
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8i32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v8f32, Expand);
setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v16i32, Expand);
@@ -361,9 +434,14 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::ADDRSPACECAST, MVT::i64, Custom);
}
- setOperationAction(ISD::BSWAP, MVT::i32, Legal);
setOperationAction(ISD::BITREVERSE, MVT::i32, Legal);
+ // FIXME: This should be narrowed to i32, but that only happens if i64 is
+ // illegal.
+ // FIXME: Should lower sub-i32 bswaps to bit-ops without v_perm_b32.
+ setOperationAction(ISD::BSWAP, MVT::i64, Legal);
+ setOperationAction(ISD::BSWAP, MVT::i32, Legal);
+
// On SI this is s_memtime and s_memrealtime on VI.
setOperationAction(ISD::READCYCLECOUNTER, MVT::i64, Legal);
setOperationAction(ISD::TRAP, MVT::Other, Custom);
@@ -376,10 +454,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FLOG10, MVT::f16, Custom);
}
- // v_mad_f32 does not support denormals. We report it as unconditionally
- // legal, and the context where it is formed will disallow it when fp32
- // denormals are enabled.
- setOperationAction(ISD::FMAD, MVT::f32, Legal);
+ if (Subtarget->hasMadMacF32Insts())
+ setOperationAction(ISD::FMAD, MVT::f32, Legal);
if (!Subtarget->hasBFI()) {
// fcopysign can be done in a single instruction with BFI.
@@ -463,7 +539,6 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SREM, MVT::i16, Promote);
setOperationAction(ISD::UREM, MVT::i16, Promote);
- setOperationAction(ISD::BSWAP, MVT::i16, Promote);
setOperationAction(ISD::BITREVERSE, MVT::i16, Promote);
setOperationAction(ISD::CTTZ, MVT::i16, Promote);
@@ -499,8 +574,8 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
// F16 - VOP1 Actions.
setOperationAction(ISD::FP_ROUND, MVT::f16, Custom);
- setOperationAction(ISD::FCOS, MVT::f16, Promote);
- setOperationAction(ISD::FSIN, MVT::f16, Promote);
+ setOperationAction(ISD::FCOS, MVT::f16, Custom);
+ setOperationAction(ISD::FSIN, MVT::f16, Custom);
setOperationAction(ISD::SINT_TO_FP, MVT::i16, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i16, Custom);
@@ -545,6 +620,11 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
}
}
+ // v_perm_b32 can handle either of these.
+ setOperationAction(ISD::BSWAP, MVT::i16, Legal);
+ setOperationAction(ISD::BSWAP, MVT::v2i16, Legal);
+ setOperationAction(ISD::BSWAP, MVT::v4i16, Custom);
+
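The i16 and v2i16 swaps above are Legal because a single v_perm_b32 can rearrange the bytes. As a standalone illustration (plain C++, not backend code; the function name is not from the patch and the actual byte selector is left to the ISel patterns), this is the value a v2i16 byte swap must produce:

#include <cassert>
#include <cstdint>

// Reference result for a v2i16 byte swap: each 16-bit half has its two bytes
// exchanged. One v_perm_b32 with a suitable byte selector computes the same
// thing in a single instruction.
static uint32_t bswapV2i16(uint32_t X) {
  auto Swap16 = [](uint32_t V) { return ((V & 0xffu) << 8) | ((V >> 8) & 0xffu); };
  return (Swap16(X >> 16) << 16) | Swap16(X & 0xffffu);
}

int main() {
  assert(bswapV2i16(0x12345678u) == 0x34127856u);
  return 0;
}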
// XXX - Do these do anything? Vector constants turn into build_vector.
setOperationAction(ISD::Constant, MVT::v2i16, Legal);
setOperationAction(ISD::ConstantFP, MVT::v2f16, Legal);
@@ -686,6 +766,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
}
+ setOperationAction(ISD::SMULO, MVT::i64, Custom);
+ setOperationAction(ISD::UMULO, MVT::i64, Custom);
+
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::f32, Custom);
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::v4f32, Custom);
@@ -762,6 +845,9 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::ATOMIC_LOAD_UMAX);
setTargetDAGCombine(ISD::ATOMIC_LOAD_FADD);
+ // FIXME: In other contexts we pretend this is a per-function property.
+ setStackPointerRegisterToSaveRestore(AMDGPU::SGPR32);
+
setSchedulingPreference(Sched::RegPressure);
}
@@ -783,6 +869,7 @@ bool SITargetLowering::isFPExtFoldable(const SelectionDAG &DAG, unsigned Opcode,
(Opcode == ISD::FMA && Subtarget->hasFmaMixInsts())) &&
DestVT.getScalarType() == MVT::f32 &&
SrcVT.getScalarType() == MVT::f16 &&
+ // TODO: This probably only requires no input flushing?
!hasFP32Denormals(DAG.getMachineFunction());
}
@@ -877,45 +964,33 @@ unsigned SITargetLowering::getVectorTypeBreakdownForCallingConv(
Context, CC, VT, IntermediateVT, NumIntermediates, RegisterVT);
}
-static MVT memVTFromAggregate(Type *Ty) {
- // Only limited forms of aggregate type currently expected.
- assert(Ty->isStructTy() && "Expected struct type");
-
+static EVT memVTFromImageData(Type *Ty, unsigned DMaskLanes) {
+ assert(DMaskLanes != 0);
- Type *ElementType = nullptr;
- unsigned NumElts;
- if (Ty->getContainedType(0)->isVectorTy()) {
- VectorType *VecComponent = cast<VectorType>(Ty->getContainedType(0));
- ElementType = VecComponent->getElementType();
- NumElts = VecComponent->getNumElements();
- } else {
- ElementType = Ty->getContainedType(0);
- NumElts = 1;
+ if (auto *VT = dyn_cast<FixedVectorType>(Ty)) {
+ unsigned NumElts = std::min(DMaskLanes, VT->getNumElements());
+ return EVT::getVectorVT(Ty->getContext(),
+ EVT::getEVT(VT->getElementType()),
+ NumElts);
}
- assert((Ty->getContainedType(1) && Ty->getContainedType(1)->isIntegerTy(32)) && "Expected int32 type");
+ return EVT::getEVT(Ty);
+}
- // Calculate the size of the memVT type from the aggregate
- unsigned Pow2Elts = 0;
- unsigned ElementSize;
- switch (ElementType->getTypeID()) {
- default:
- llvm_unreachable("Unknown type!");
- case Type::IntegerTyID:
- ElementSize = cast<IntegerType>(ElementType)->getBitWidth();
- break;
- case Type::HalfTyID:
- ElementSize = 16;
- break;
- case Type::FloatTyID:
- ElementSize = 32;
- break;
- }
- unsigned AdditionalElts = ElementSize == 16 ? 2 : 1;
- Pow2Elts = 1 << Log2_32_Ceil(NumElts + AdditionalElts);
+// Peek through TFE struct returns to only use the data size.
+static EVT memVTFromImageReturn(Type *Ty, unsigned DMaskLanes) {
+ auto *ST = dyn_cast<StructType>(Ty);
+ if (!ST)
+ return memVTFromImageData(Ty, DMaskLanes);
- return MVT::getVectorVT(MVT::getVT(ElementType, false),
- Pow2Elts);
+ // Some intrinsics return an aggregate type - special case to work out the
+ // correct memVT.
+ //
+ // Only limited forms of aggregate type currently expected.
+ if (ST->getNumContainedTypes() != 2 ||
+ !ST->getContainedType(1)->isIntegerTy(32))
+ return EVT();
+ return memVTFromImageData(ST->getContainedType(0), DMaskLanes);
}
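A standalone sketch of the lane-count logic above (illustrative C++, not the LLVM API; the helper name is not from the patch): the in-memory type keeps only the lanes enabled by the dmask, never more than the IR vector provides, and a dmask of zero is treated as one lane, while a trailing i32 TFE member in a struct return is peeked through and ignored.

#include <cassert>

static unsigned memLanes(unsigned IRVectorElts, unsigned DMask) {
  // Mirrors the std::min(DMaskLanes, NumElements) above.
  unsigned DMaskLanes = DMask == 0 ? 1 : __builtin_popcount(DMask);
  return DMaskLanes < IRVectorElts ? DMaskLanes : IRVectorElts;
}

int main() {
  assert(memLanes(4, 0x3) == 2); // <4 x float> return, dmask 0b0011: two lanes in memory
  assert(memLanes(4, 0x0) == 1); // dmask 0 still reads one lane
  assert(memLanes(2, 0xf) == 2); // never more lanes than the IR vector has
  return 0;
}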
bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -944,17 +1019,40 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MODereferenceable;
if (Attr.hasFnAttribute(Attribute::ReadOnly)) {
+ unsigned DMaskLanes = 4;
+
+ if (RsrcIntr->IsImage) {
+ const AMDGPU::ImageDimIntrinsicInfo *Intr
+ = AMDGPU::getImageDimIntrinsicInfo(IntrID);
+ const AMDGPU::MIMGBaseOpcodeInfo *BaseOpcode =
+ AMDGPU::getMIMGBaseOpcodeInfo(Intr->BaseOpcode);
+
+ if (!BaseOpcode->Gather4) {
+ // If this isn't a gather, we may have excess loaded elements in the
+ // IR type. Check the dmask for the real number of elements loaded.
+ unsigned DMask
+ = cast<ConstantInt>(CI.getArgOperand(0))->getZExtValue();
+ DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
+ }
+
+ Info.memVT = memVTFromImageReturn(CI.getType(), DMaskLanes);
+ } else
+ Info.memVT = EVT::getEVT(CI.getType());
+
+ // FIXME: What does alignment mean for an image?
Info.opc = ISD::INTRINSIC_W_CHAIN;
- Info.memVT = MVT::getVT(CI.getType(), true);
- if (Info.memVT == MVT::Other) {
- // Some intrinsics return an aggregate type - special case to work out
- // the correct memVT
- Info.memVT = memVTFromAggregate(CI.getType());
- }
Info.flags |= MachineMemOperand::MOLoad;
} else if (Attr.hasFnAttribute(Attribute::WriteOnly)) {
Info.opc = ISD::INTRINSIC_VOID;
- Info.memVT = MVT::getVT(CI.getArgOperand(0)->getType());
+
+ Type *DataTy = CI.getArgOperand(0)->getType();
+ if (RsrcIntr->IsImage) {
+ unsigned DMask = cast<ConstantInt>(CI.getArgOperand(1))->getZExtValue();
+ unsigned DMaskLanes = DMask == 0 ? 1 : countPopulation(DMask);
+ Info.memVT = memVTFromImageData(DataTy, DMaskLanes);
+ } else
+ Info.memVT = EVT::getEVT(DataTy);
+
Info.flags |= MachineMemOperand::MOStore;
} else {
// Atomic
@@ -1031,6 +1129,17 @@ bool SITargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return true;
}
+ case Intrinsic::amdgcn_global_atomic_csub: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ Info.memVT = MVT::getVT(CI.getType());
+ Info.ptrVal = CI.getOperand(0);
+ Info.align.reset();
+ Info.flags = MachineMemOperand::MOLoad |
+ MachineMemOperand::MOStore |
+ MachineMemOperand::MODereferenceable |
+ MachineMemOperand::MOVolatile;
+ return true;
+ }
case Intrinsic::amdgcn_ds_gws_init:
case Intrinsic::amdgcn_ds_gws_barrier:
case Intrinsic::amdgcn_ds_gws_sema_v:
@@ -1226,9 +1335,10 @@ bool SITargetLowering::isLegalAddressingMode(const DataLayout &DL,
// addressing modes, so treat them as having no offset like flat
// instructions.
return isLegalFlatAddressingMode(AM);
- } else {
- llvm_unreachable("unhandled address space");
}
+
+ // Assume a user alias of global for unknown address spaces.
+ return isLegalGlobalAddressingMode(AM);
}
bool SITargetLowering::canMergeStoresTo(unsigned AS, EVT MemVT,
@@ -1279,9 +1389,11 @@ bool SITargetLowering::allowsMisalignedMemoryAccessesImpl(
// If we have an uniform constant load, it still requires using a slow
// buffer instruction if unaligned.
if (IsFast) {
+ // Accesses can really be issued as 1-byte aligned or 4-byte aligned, so
+ // 2-byte alignment is worse than 1 unless doing a 2-byte access.
*IsFast = (AddrSpace == AMDGPUAS::CONSTANT_ADDRESS ||
AddrSpace == AMDGPUAS::CONSTANT_ADDRESS_32BIT) ?
- (Align % 4 == 0) : true;
+ Align >= 4 : Align != 2;
}
return true;
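Restated as a small standalone predicate (illustrative only; the helper name is not from the patch), the IsFast rule above is: constant-address accesses are fast only when at least 4-byte aligned, and all other address spaces are fast unless the alignment is exactly 2.

#include <cassert>

// IsConstant covers CONSTANT_ADDRESS and CONSTANT_ADDRESS_32BIT.
static bool isFastMisaligned(bool IsConstant, unsigned Align) {
  return IsConstant ? Align >= 4 : Align != 2;
}

int main() {
  assert(!isFastMisaligned(true, 2));  // constant space takes the slow buffer path
  assert(isFastMisaligned(false, 1));  // issued as byte accesses anyway
  assert(!isFastMisaligned(false, 2)); // 2-byte alignment is the one slow case
  return 0;
}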
@@ -1320,18 +1432,17 @@ bool SITargetLowering::allowsMisalignedMemoryAccesses(
}
EVT SITargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
// FIXME: Should account for address space here.
// The default fallback uses the private pointer size as a guess for a type to
// use. Make sure we switch these to 64-bit accesses.
- if (Size >= 16 && DstAlign >= 4) // XXX: Should only do for global
+ if (Op.size() >= 16 &&
+ Op.isDstAligned(Align(4))) // XXX: Should only do for global
return MVT::v4i32;
- if (Size >= 8 && DstAlign >= 4)
+ if (Op.size() >= 8 && Op.isDstAligned(Align(4)))
return MVT::v2i32;
// Use the default.
@@ -1416,9 +1527,10 @@ SDValue SITargetLowering::lowerKernArgParameterPtr(SelectionDAG &DAG,
const ArgDescriptor *InputPtrReg;
const TargetRegisterClass *RC;
+ LLT ArgTy;
- std::tie(InputPtrReg, RC)
- = Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
+ std::tie(InputPtrReg, RC, ArgTy) =
+ Info->getPreloadedValue(AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
MachineRegisterInfo &MRI = DAG.getMachineFunction().getRegInfo();
MVT PtrVT = getPointerTy(DL, AMDGPUAS::CONSTANT_ADDRESS);
@@ -1457,7 +1569,7 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
}
if (MemVT.isFloatingPoint())
- Val = getFPExtOrFPTrunc(DAG, Val, SL, VT);
+ Val = getFPExtOrFPRound(DAG, Val, SL, VT);
else if (Signed)
Val = DAG.getSExtOrTrunc(Val, SL, VT);
else
@@ -1467,16 +1579,15 @@ SDValue SITargetLowering::convertArgType(SelectionDAG &DAG, EVT VT, EVT MemVT,
}
SDValue SITargetLowering::lowerKernargMemParameter(
- SelectionDAG &DAG, EVT VT, EVT MemVT,
- const SDLoc &SL, SDValue Chain,
- uint64_t Offset, unsigned Align, bool Signed,
- const ISD::InputArg *Arg) const {
+ SelectionDAG &DAG, EVT VT, EVT MemVT, const SDLoc &SL, SDValue Chain,
+ uint64_t Offset, Align Alignment, bool Signed,
+ const ISD::InputArg *Arg) const {
MachinePointerInfo PtrInfo(AMDGPUAS::CONSTANT_ADDRESS);
// Try to avoid using an extload by loading earlier than the argument address,
// and extracting the relevant bits. The load should hopefully be merged with
// the previous argument.
- if (MemVT.getStoreSize() < 4 && Align < 4) {
+ if (MemVT.getStoreSize() < 4 && Alignment < 4) {
// TODO: Handle align < 4 and size >= 4 (can happen with packed structs).
int64_t AlignDownOffset = alignDown(Offset, 4);
int64_t OffsetDiff = Offset - AlignDownOffset;
@@ -1502,9 +1613,9 @@ SDValue SITargetLowering::lowerKernargMemParameter(
}
SDValue Ptr = lowerKernArgParameterPtr(DAG, SL, Chain, Offset);
- SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Align,
+ SDValue Load = DAG.getLoad(MemVT, SL, Chain, Ptr, PtrInfo, Alignment,
MachineMemOperand::MODereferenceable |
- MachineMemOperand::MOInvariant);
+ MachineMemOperand::MOInvariant);
SDValue Val = convertArgType(DAG, VT, MemVT, SL, Load, Signed, Arg);
return DAG.getMergeValues({ Val, Load.getValue(1) }, SL);
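A standalone sketch of the align-down trick described in the comment above (plain C++; little-endian layout assumed as on amdgcn, and the helper name and buffer are illustrative): a sub-dword argument at an unaligned offset is read with one aligned 4-byte load at alignDown(Offset, 4) and the wanted bytes shifted out.

#include <cassert>
#include <cstdint>
#include <cstring>

static uint32_t loadNarrowArg(const uint8_t *KernArg, uint64_t Offset,
                              unsigned SizeInBytes) {
  uint64_t AlignDownOffset = Offset & ~uint64_t(3);
  uint64_t OffsetDiff = Offset - AlignDownOffset;
  uint32_t Word;
  std::memcpy(&Word, KernArg + AlignDownOffset, 4); // one aligned dword load
  // Shift the wanted bytes down and mask to the argument size.
  uint32_t Shifted = Word >> (8 * OffsetDiff);
  return SizeInBytes == 4 ? Shifted : Shifted & ((1u << (8 * SizeInBytes)) - 1);
}

int main() {
  const uint8_t Buf[8] = {0, 0, 0, 0, 0, 0xAB, 0xCD, 0};
  assert(loadNarrowArg(Buf, 5, 2) == 0xCDAB); // i16 argument at offset 5
  return 0;
}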
@@ -1565,8 +1676,9 @@ SDValue SITargetLowering::getPreloadedValue(SelectionDAG &DAG,
AMDGPUFunctionArgInfo::PreloadedValue PVID) const {
const ArgDescriptor *Reg;
const TargetRegisterClass *RC;
+ LLT Ty;
- std::tie(Reg, RC) = MFI.getPreloadedValue(PVID);
+ std::tie(Reg, RC, Ty) = MFI.getPreloadedValue(PVID);
return CreateLiveInRegister(DAG, RC, Reg->getRegister(), VT);
}
@@ -1666,7 +1778,7 @@ static ArgDescriptor allocateVGPR32Input(CCState &CCInfo, unsigned Mask = ~0u,
unsigned RegIdx = CCInfo.getFirstUnallocated(ArgVGPRs);
if (RegIdx == ArgVGPRs.size()) {
// Spill to stack required.
- int64_t Offset = CCInfo.AllocateStack(4, 4);
+ int64_t Offset = CCInfo.AllocateStack(4, Align(4));
return ArgDescriptor::createStack(Offset, Mask);
}
@@ -1706,10 +1818,11 @@ static ArgDescriptor allocateSGPR64Input(CCState &CCInfo) {
return allocateSGPR32InputImpl(CCInfo, &AMDGPU::SGPR_64RegClass, 16);
}
-void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
- MachineFunction &MF,
- const SIRegisterInfo &TRI,
- SIMachineFunctionInfo &Info) const {
+/// Allocate implicit function VGPR arguments at the end of allocated user
+/// arguments.
+void SITargetLowering::allocateSpecialInputVGPRs(
+ CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
const unsigned Mask = 0x3ff;
ArgDescriptor Arg;
@@ -1727,6 +1840,20 @@ void SITargetLowering::allocateSpecialInputVGPRs(CCState &CCInfo,
Info.setWorkItemIDZ(allocateVGPR32Input(CCInfo, Mask << 20, Arg));
}
+/// Allocate implicit function VGPR arguments in fixed registers.
+void SITargetLowering::allocateSpecialInputVGPRsFixed(
+ CCState &CCInfo, MachineFunction &MF,
+ const SIRegisterInfo &TRI, SIMachineFunctionInfo &Info) const {
+ Register Reg = CCInfo.AllocateReg(AMDGPU::VGPR31);
+ if (!Reg)
+ report_fatal_error("failed to allocated VGPR for implicit arguments");
+
+ const unsigned Mask = 0x3ff;
+ Info.setWorkItemIDX(ArgDescriptor::createRegister(Reg, Mask));
+ Info.setWorkItemIDY(ArgDescriptor::createRegister(Reg, Mask << 10));
+ Info.setWorkItemIDZ(ArgDescriptor::createRegister(Reg, Mask << 20));
+}
+
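The fixed-ABI allocation above packs all three workitem IDs into a single VGPR using 10-bit fields (mask 0x3ff, shifted by 0, 10 and 20). A standalone sketch of that layout (illustrative C++; the helper name is not from the patch):

#include <cassert>
#include <cstdint>

static uint32_t packWorkItemIDs(uint32_t X, uint32_t Y, uint32_t Z) {
  const uint32_t Mask = 0x3ff; // 10 bits per ID, matching the masks above
  return (X & Mask) | ((Y & Mask) << 10) | ((Z & Mask) << 20);
}

int main() {
  uint32_t Packed = packWorkItemIDs(5, 7, 9);
  assert((Packed & 0x3ff) == 5);
  assert(((Packed >> 10) & 0x3ff) == 7);
  assert(((Packed >> 20) & 0x3ff) == 9);
  return 0;
}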
void SITargetLowering::allocateSpecialInputSGPRs(
CCState &CCInfo,
MachineFunction &MF,
@@ -1742,8 +1869,10 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasQueuePtr())
ArgInfo.QueuePtr = allocateSGPR64Input(CCInfo);
- if (Info.hasKernargSegmentPtr())
- ArgInfo.KernargSegmentPtr = allocateSGPR64Input(CCInfo);
+ // Implicit arg ptr takes the place of the kernarg segment pointer. This is a
+ // constant offset from the kernarg segment.
+ if (Info.hasImplicitArgPtr())
+ ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
if (Info.hasDispatchID())
ArgInfo.DispatchID = allocateSGPR64Input(CCInfo);
@@ -1758,9 +1887,6 @@ void SITargetLowering::allocateSpecialInputSGPRs(
if (Info.hasWorkGroupIDZ())
ArgInfo.WorkGroupIDZ = allocateSGPR32Input(CCInfo);
-
- if (Info.hasImplicitArgPtr())
- ArgInfo.ImplicitArgPtr = allocateSGPR64Input(CCInfo);
}
// Allocate special inputs passed in user SGPRs.
@@ -1916,67 +2042,45 @@ static void reservePrivateMemoryRegs(const TargetMachine &TM,
Info.setScratchRSrcReg(ReservedBufferReg);
}
- // hasFP should be accurate for kernels even before the frame is finalized.
- if (ST.getFrameLowering()->hasFP(MF)) {
- MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
- // Try to use s32 as the SP, but move it if it would interfere with input
- // arguments. This won't work with calls though.
- //
- // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
- // registers.
- if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
- Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
- } else {
- assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
+ // For entry functions we have to set up the stack pointer if we use it,
+ // whereas non-entry functions get this "for free". This means there is no
+ // intrinsic advantage to using S32 over S34 in cases where we do not have
+ // calls but do need a frame pointer (i.e. if we are requested to have one
+ // because frame pointer elimination is disabled). To keep things simple we
+ // only ever use S32 as the call ABI stack pointer, and so using it does not
+ // imply we need a separate frame pointer.
+ //
+ // Try to use s32 as the SP, but move it if it would interfere with input
+ // arguments. This won't work with calls though.
+ //
+ // FIXME: Move SP to avoid any possible inputs, or find a way to spill input
+ // registers.
+ if (!MRI.isLiveIn(AMDGPU::SGPR32)) {
+ Info.setStackPtrOffsetReg(AMDGPU::SGPR32);
+ } else {
+ assert(AMDGPU::isShader(MF.getFunction().getCallingConv()));
- if (MFI.hasCalls())
- report_fatal_error("call in graphics shader with too many input SGPRs");
+ if (MFI.hasCalls())
+ report_fatal_error("call in graphics shader with too many input SGPRs");
- for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
- if (!MRI.isLiveIn(Reg)) {
- Info.setStackPtrOffsetReg(Reg);
- break;
- }
+ for (unsigned Reg : AMDGPU::SGPR_32RegClass) {
+ if (!MRI.isLiveIn(Reg)) {
+ Info.setStackPtrOffsetReg(Reg);
+ break;
}
-
- if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
- report_fatal_error("failed to find register for SP");
}
- if (MFI.hasCalls()) {
- Info.setScratchWaveOffsetReg(AMDGPU::SGPR33);
- Info.setFrameOffsetReg(AMDGPU::SGPR33);
- } else {
- unsigned ReservedOffsetReg =
- TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- Info.setFrameOffsetReg(ReservedOffsetReg);
- }
- } else if (RequiresStackAccess) {
- assert(!MFI.hasCalls());
- // We know there are accesses and they will be done relative to SP, so just
- // pin it to the input.
- //
- // FIXME: Should not do this if inline asm is reading/writing these
- // registers.
- Register PreloadedSP = Info.getPreloadedReg(
- AMDGPUFunctionArgInfo::PRIVATE_SEGMENT_WAVE_BYTE_OFFSET);
-
- Info.setStackPtrOffsetReg(PreloadedSP);
- Info.setScratchWaveOffsetReg(PreloadedSP);
- Info.setFrameOffsetReg(PreloadedSP);
- } else {
- assert(!MFI.hasCalls());
+ if (Info.getStackPtrOffsetReg() == AMDGPU::SP_REG)
+ report_fatal_error("failed to find register for SP");
+ }
- // There may not be stack access at all. There may still be spills, or
- // access of a constant pointer (in which cases an extra copy will be
- // emitted in the prolog).
- unsigned ReservedOffsetReg
- = TRI.reservedPrivateSegmentWaveByteOffsetReg(MF);
- Info.setStackPtrOffsetReg(ReservedOffsetReg);
- Info.setScratchWaveOffsetReg(ReservedOffsetReg);
- Info.setFrameOffsetReg(ReservedOffsetReg);
+ // hasFP should be accurate for entry functions even before the frame is
+ // finalized, because it does not rely on the known stack size, only on
+ // properties like whether variable sized objects are present.
+ if (ST.getFrameLowering()->hasFP(MF)) {
+ Info.setFrameOffsetReg(AMDGPU::SGPR33);
}
}
@@ -2110,6 +2214,10 @@ SDValue SITargetLowering::LowerFormalArguments(
if (IsEntryFunc) {
allocateSpecialEntryInputVGPRs(CCInfo, MF, *TRI, *Info);
allocateHSAUserSGPRs(CCInfo, MF, *TRI, *Info);
+ } else {
+ // For the fixed ABI, pass workitem IDs in the last argument register.
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI)
+ allocateSpecialInputVGPRsFixed(CCInfo, MF, *TRI, *Info);
}
if (IsKernel) {
@@ -2126,9 +2234,9 @@ SDValue SITargetLowering::LowerFormalArguments(
//
// FIXME: Alignment of explicit arguments totally broken with non-0 explicit
// kern arg offset.
- const unsigned KernelArgBaseAlign = 16;
+ const Align KernelArgBaseAlign = Align(16);
- for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
+ for (unsigned i = 0, e = Ins.size(), ArgIdx = 0; i != e; ++i) {
const ISD::InputArg &Arg = Ins[i];
if (Arg.isOrigArg() && Skipped[Arg.getOrigArgIndex()]) {
InVals.push_back(DAG.getUNDEF(Arg.VT));
@@ -2143,10 +2251,11 @@ SDValue SITargetLowering::LowerFormalArguments(
EVT MemVT = VA.getLocVT();
const uint64_t Offset = VA.getLocMemOffset();
- unsigned Align = MinAlign(KernelArgBaseAlign, Offset);
+ Align Alignment = commonAlignment(KernelArgBaseAlign, Offset);
- SDValue Arg = lowerKernargMemParameter(
- DAG, VT, MemVT, DL, Chain, Offset, Align, Ins[i].Flags.isSExt(), &Ins[i]);
+ SDValue Arg =
+ lowerKernargMemParameter(DAG, VT, MemVT, DL, Chain, Offset, Alignment,
+ Ins[i].Flags.isSExt(), &Ins[i]);
Chains.push_back(Arg.getValue(1));
auto *ParamTy =
@@ -2221,7 +2330,7 @@ SDValue SITargetLowering::LowerFormalArguments(
InVals.push_back(Val);
}
- if (!IsEntryFunc) {
+ if (!IsEntryFunc && !AMDGPUTargetMachine::EnableFixedFunctionABI) {
// Special inputs come after user arguments.
allocateSpecialInputVGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -2231,8 +2340,6 @@ SDValue SITargetLowering::LowerFormalArguments(
allocateSystemSGPRs(CCInfo, MF, *Info, CallConv, IsShader);
} else {
CCInfo.AllocateReg(Info->getScratchRSrcReg());
- CCInfo.AllocateReg(Info->getScratchWaveOffsetReg());
- CCInfo.AllocateReg(Info->getFrameOffsetReg());
allocateSpecialInputSGPRs(CCInfo, MF, *TRI, *Info);
}
@@ -2442,50 +2549,51 @@ void SITargetLowering::passSpecialInputs(
SDValue Chain) const {
// If we don't have a call site, this was a call inserted by
// legalization. These can never use special inputs.
- if (!CLI.CS)
+ if (!CLI.CB)
return;
- const Function *CalleeFunc = CLI.CS.getCalledFunction();
- assert(CalleeFunc);
-
SelectionDAG &DAG = CLI.DAG;
const SDLoc &DL = CLI.DL;
const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
-
- auto &ArgUsageInfo =
- DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
- const AMDGPUFunctionArgInfo &CalleeArgInfo
- = ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
-
const AMDGPUFunctionArgInfo &CallerArgInfo = Info.getArgInfo();
+ const AMDGPUFunctionArgInfo *CalleeArgInfo
+ = &AMDGPUArgumentUsageInfo::FixedABIFunctionInfo;
+ if (const Function *CalleeFunc = CLI.CB->getCalledFunction()) {
+ auto &ArgUsageInfo =
+ DAG.getPass()->getAnalysis<AMDGPUArgumentUsageInfo>();
+ CalleeArgInfo = &ArgUsageInfo.lookupFuncArgInfo(*CalleeFunc);
+ }
+
// TODO: Unify with private memory register handling. This is complicated by
// the fact that at least in kernels, the input argument is not necessarily
// in the same location as the input.
AMDGPUFunctionArgInfo::PreloadedValue InputRegs[] = {
AMDGPUFunctionArgInfo::DISPATCH_PTR,
AMDGPUFunctionArgInfo::QUEUE_PTR,
- AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR,
+ AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR,
AMDGPUFunctionArgInfo::DISPATCH_ID,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y,
- AMDGPUFunctionArgInfo::WORKGROUP_ID_Z,
- AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR
+ AMDGPUFunctionArgInfo::WORKGROUP_ID_Z
};
for (auto InputID : InputRegs) {
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
+ LLT ArgTy;
- std::tie(OutgoingArg, ArgRC) = CalleeArgInfo.getPreloadedValue(InputID);
+ std::tie(OutgoingArg, ArgRC, ArgTy) =
+ CalleeArgInfo->getPreloadedValue(InputID);
if (!OutgoingArg)
continue;
const ArgDescriptor *IncomingArg;
const TargetRegisterClass *IncomingArgRC;
- std::tie(IncomingArg, IncomingArgRC)
- = CallerArgInfo.getPreloadedValue(InputID);
+ LLT Ty;
+ std::tie(IncomingArg, IncomingArgRC, Ty) =
+ CallerArgInfo.getPreloadedValue(InputID);
assert(IncomingArgRC == ArgRC);
// All special arguments are ints for now.
@@ -2503,8 +2611,11 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ if (!CCInfo.AllocateReg(OutgoingArg->getRegister()))
+ report_fatal_error("failed to allocate implicit input argument");
} else {
- unsigned SpecialArgOffset = CCInfo.AllocateStack(ArgVT.getStoreSize(), 4);
+ unsigned SpecialArgOffset =
+ CCInfo.AllocateStack(ArgVT.getStoreSize(), Align(4));
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
SpecialArgOffset);
MemOpChains.push_back(ArgStore);
@@ -2515,33 +2626,34 @@ void SITargetLowering::passSpecialInputs(
// packed.
const ArgDescriptor *OutgoingArg;
const TargetRegisterClass *ArgRC;
+ LLT Ty;
- std::tie(OutgoingArg, ArgRC) =
- CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
+ std::tie(OutgoingArg, ArgRC, Ty) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X);
if (!OutgoingArg)
- std::tie(OutgoingArg, ArgRC) =
- CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
+ std::tie(OutgoingArg, ArgRC, Ty) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y);
if (!OutgoingArg)
- std::tie(OutgoingArg, ArgRC) =
- CalleeArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
+ std::tie(OutgoingArg, ArgRC, Ty) =
+ CalleeArgInfo->getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z);
if (!OutgoingArg)
return;
- const ArgDescriptor *IncomingArgX
- = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X).first;
- const ArgDescriptor *IncomingArgY
- = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y).first;
- const ArgDescriptor *IncomingArgZ
- = CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z).first;
+ const ArgDescriptor *IncomingArgX = std::get<0>(
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_X));
+ const ArgDescriptor *IncomingArgY = std::get<0>(
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Y));
+ const ArgDescriptor *IncomingArgZ = std::get<0>(
+ CallerArgInfo.getPreloadedValue(AMDGPUFunctionArgInfo::WORKITEM_ID_Z));
SDValue InputReg;
SDLoc SL;
// If incoming ids are not packed we need to pack them.
- if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo.WorkItemIDX)
+ if (IncomingArgX && !IncomingArgX->isMasked() && CalleeArgInfo->WorkItemIDX)
InputReg = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgX);
- if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo.WorkItemIDY) {
+ if (IncomingArgY && !IncomingArgY->isMasked() && CalleeArgInfo->WorkItemIDY) {
SDValue Y = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgY);
Y = DAG.getNode(ISD::SHL, SL, MVT::i32, Y,
DAG.getShiftAmountConstant(10, MVT::i32, SL));
@@ -2549,7 +2661,7 @@ void SITargetLowering::passSpecialInputs(
DAG.getNode(ISD::OR, SL, MVT::i32, InputReg, Y) : Y;
}
- if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo.WorkItemIDZ) {
+ if (IncomingArgZ && !IncomingArgZ->isMasked() && CalleeArgInfo->WorkItemIDZ) {
SDValue Z = loadInputValue(DAG, ArgRC, MVT::i32, DL, *IncomingArgZ);
Z = DAG.getNode(ISD::SHL, SL, MVT::i32, Z,
DAG.getShiftAmountConstant(20, MVT::i32, SL));
@@ -2569,8 +2681,9 @@ void SITargetLowering::passSpecialInputs(
if (OutgoingArg->isRegister()) {
RegsToPass.emplace_back(OutgoingArg->getRegister(), InputReg);
+ CCInfo.AllocateReg(OutgoingArg->getRegister());
} else {
- unsigned SpecialArgOffset = CCInfo.AllocateStack(4, 4);
+ unsigned SpecialArgOffset = CCInfo.AllocateStack(4, Align(4));
SDValue ArgStore = storeStackInputValue(DAG, DL, Chain, InputReg,
SpecialArgOffset);
MemOpChains.push_back(ArgStore);
@@ -2703,10 +2816,11 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
"unsupported call to variadic function ");
}
- if (!CLI.CS.getInstruction())
+ if (!CLI.CB)
report_fatal_error("unsupported libcall legalization");
- if (!CLI.CS.getCalledFunction()) {
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ !CLI.CB->getCalledFunction()) {
return lowerUnhandledCall(CLI, InVals,
"unsupported indirect call to function ");
}
@@ -2726,7 +2840,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall) {
IsTailCall = isEligibleForTailCallOptimization(
Callee, CallConv, IsVarArg, Outs, OutVals, Ins, DAG);
- if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall()) {
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall()) {
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
}
@@ -2743,12 +2857,19 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
CCAssignFn *AssignFn = CCAssignFnForCall(CallConv, IsVarArg);
+ if (AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // With a fixed ABI, allocate fixed registers before user arguments.
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ }
+
CCInfo.AnalyzeCallOperands(Outs, AssignFn);
// Get a count of how many bytes are to be pushed on the stack.
@@ -2767,7 +2888,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// arguments to begin at SP+0. Completely unused for non-tail calls.
int32_t FPDiff = 0;
MachineFrameInfo &MFI = MF.getFrameInfo();
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -2784,7 +2904,6 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Chain = DAG.getTokenFactor(DL, CopyFromChains);
}
- SmallVector<SDValue, 8> MemOpChains;
MVT PtrVT = MVT::i32;
// Walk the register/memloc assignments, inserting copies/loads.
@@ -2837,7 +2956,7 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
// FIXME: We can have better than the minimum byval required alignment.
Alignment =
Flags.isByVal()
- ? MaybeAlign(Flags.getByValAlign())
+ ? Flags.getNonZeroByValAlign()
: commonAlignment(Subtarget->getStackAlignment(), Offset);
Offset = Offset + FPDiff;
@@ -2864,11 +2983,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
if (Outs[i].Flags.isByVal()) {
SDValue SizeNode =
DAG.getConstant(Outs[i].Flags.getByValSize(), DL, MVT::i32);
- SDValue Cpy = DAG.getMemcpy(
- Chain, DL, DstAddr, Arg, SizeNode, Outs[i].Flags.getByValAlign(),
- /*isVol = */ false, /*AlwaysInline = */ true,
- /*isTailCall = */ false, DstInfo,
- MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
+ SDValue Cpy =
+ DAG.getMemcpy(Chain, DL, DstAddr, Arg, SizeNode,
+ Outs[i].Flags.getNonZeroByValAlign(),
+ /*isVol = */ false, /*AlwaysInline = */ true,
+ /*isTailCall = */ false, DstInfo,
+ MachinePointerInfo(AMDGPUAS::PRIVATE_ADDRESS));
MemOpChains.push_back(Cpy);
} else {
@@ -2879,8 +2999,10 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
}
}
- // Copy special input registers after user input arguments.
- passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ if (!AMDGPUTargetMachine::EnableFixedFunctionABI) {
+ // Copy special input registers after user input arguments.
+ passSpecialInputs(CLI, CCInfo, *Info, RegsToPass, MemOpChains, Chain);
+ }
if (!MemOpChains.empty())
Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
@@ -2927,9 +3049,12 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
Ops.push_back(Callee);
// Add a redundant copy of the callee global which will not be legalized, as
// we need direct access to the callee later.
- GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Callee);
- const GlobalValue *GV = GSD->getGlobal();
- Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
+ if (GlobalAddressSDNode *GSD = dyn_cast<GlobalAddressSDNode>(Callee)) {
+ const GlobalValue *GV = GSD->getGlobal();
+ Ops.push_back(DAG.getTargetGlobalAddress(GV, DL, MVT::i64));
+ } else {
+ Ops.push_back(DAG.getTargetConstant(0, DL, MVT::i64));
+ }
if (IsTailCall) {
// Each tail call may have to adjust the stack by a different amount, so
@@ -2985,6 +3110,71 @@ SDValue SITargetLowering::LowerCall(CallLoweringInfo &CLI,
IsThisReturn ? OutVals[0] : SDValue());
}
+// This is identical to the default implementation in ExpandDYNAMIC_STACKALLOC,
+// except for applying the wave size scale to the increment amount.
+SDValue SITargetLowering::lowerDYNAMIC_STACKALLOCImpl(
+ SDValue Op, SelectionDAG &DAG) const {
+ const MachineFunction &MF = DAG.getMachineFunction();
+ const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
+
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Tmp1 = Op;
+ SDValue Tmp2 = Op.getValue(1);
+ SDValue Tmp3 = Op.getOperand(2);
+ SDValue Chain = Tmp1.getOperand(0);
+
+ Register SPReg = Info->getStackPtrOffsetReg();
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, dl);
+
+ SDValue Size = Tmp2.getOperand(1);
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ MaybeAlign Alignment = cast<ConstantSDNode>(Tmp3)->getMaybeAlignValue();
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const TargetFrameLowering *TFL = ST.getFrameLowering();
+ unsigned Opc =
+ TFL->getStackGrowthDirection() == TargetFrameLowering::StackGrowsUp ?
+ ISD::ADD : ISD::SUB;
+
+ SDValue ScaledSize = DAG.getNode(
+ ISD::SHL, dl, VT, Size,
+ DAG.getConstant(ST.getWavefrontSizeLog2(), dl, MVT::i32));
+
+ Align StackAlign = TFL->getStackAlign();
+ Tmp1 = DAG.getNode(Opc, dl, VT, SP, ScaledSize); // Value
+ if (Alignment && *Alignment > StackAlign) {
+ Tmp1 = DAG.getNode(ISD::AND, dl, VT, Tmp1,
+ DAG.getConstant(-(uint64_t)Alignment->value()
+ << ST.getWavefrontSizeLog2(),
+ dl, VT));
+ }
+
+ Chain = DAG.getCopyToReg(Chain, dl, SPReg, Tmp1); // Output chain
+ Tmp2 = DAG.getCALLSEQ_END(
+ Chain, DAG.getIntPtrConstant(0, dl, true),
+ DAG.getIntPtrConstant(0, dl, true), SDValue(), dl);
+
+ return DAG.getMergeValues({Tmp1, Tmp2}, dl);
+}
+
+SDValue SITargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ // We only handle constant sizes here to allow non-entry block, static sized
+ // allocas. A truly dynamic value is more difficult to support because we
+ // don't know if the size value is uniform or not. If the size isn't uniform,
+ // we would need to do a wave reduction to get the maximum size to know how
+ // much to increment the uniform stack pointer.
+ SDValue Size = Op.getOperand(1);
+ if (isa<ConstantSDNode>(Size))
+ return lowerDYNAMIC_STACKALLOCImpl(Op, DAG); // Use "generic" expansion.
+
+ return AMDGPUTargetLowering::LowerDYNAMIC_STACKALLOC(Op, DAG);
+}
+
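A standalone model of the wave-size scaling applied above (illustrative C++, not DAG code; the helper name is not from the patch): the stack pointer is shared by the whole wave, so allocating Size bytes per lane moves SP by Size << log2(wave size), and any requested per-lane alignment is applied through a similarly scaled mask, mirroring the ISD::SHL and ISD::AND emitted above.

#include <cassert>
#include <cstdint>

static uint64_t allocOnWaveStack(uint64_t SP, uint64_t SizePerLane,
                                 uint64_t AlignPerLane, unsigned WaveSizeLog2,
                                 bool StackGrowsUp) {
  uint64_t ScaledSize = SizePerLane << WaveSizeLog2;
  uint64_t NewSP = StackGrowsUp ? SP + ScaledSize : SP - ScaledSize;
  if (AlignPerLane > 1)
    NewSP &= ~((AlignPerLane << WaveSizeLog2) - 1); // mirrors the AND above
  return NewSP;
}

int main() {
  // wave64 (log2 = 6), 16 bytes per lane: SP moves by 1024 bytes.
  assert(allocOnWaveStack(4096, 16, 1, 6, /*StackGrowsUp=*/true) == 4096 + 1024);
  return 0;
}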
Register SITargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
Register Reg = StringSwitch<Register>(RegName)
@@ -3310,9 +3500,15 @@ static MachineBasicBlock::iterator loadM0FromVGPR(const SIInstrInfo *TII,
auto InsPt = emitLoadM0FromVGPRLoop(TII, MRI, MBB, *LoopBB, DL, *Idx,
InitResultReg, DstReg, PhiReg, TmpExec,
Offset, UseGPRIdxMode, IsIndirectSrc);
-
- MachineBasicBlock::iterator First = RemainderBB->begin();
- BuildMI(*RemainderBB, First, DL, TII->get(MovExecOpc), Exec)
+ MachineBasicBlock* LandingPad = MF->CreateMachineBasicBlock();
+ MachineFunction::iterator MBBI(LoopBB);
+ ++MBBI;
+ MF->insert(MBBI, LandingPad);
+ LoopBB->removeSuccessor(RemainderBB);
+ LandingPad->addSuccessor(RemainderBB);
+ LoopBB->addSuccessor(LandingPad);
+ MachineBasicBlock::iterator First = LandingPad->begin();
+ BuildMI(*LandingPad, First, DL, TII->get(MovExecOpc), Exec)
.addReg(SaveExec);
return InsPt;
@@ -3331,7 +3527,7 @@ computeIndirectRegAndOffset(const SIRegisterInfo &TRI,
if (Offset >= NumElts || Offset < 0)
return std::make_pair(AMDGPU::sub0, Offset);
- return std::make_pair(AMDGPU::sub0 + Offset, 0);
+ return std::make_pair(SIRegisterInfo::getSubRegFromChannel(Offset), 0);
}
// Return true if the index is an SGPR and was set.
@@ -3465,24 +3661,6 @@ static MachineBasicBlock *emitIndirectSrc(MachineInstr &MI,
return LoopBB;
}
-static unsigned getMOVRELDPseudo(const SIRegisterInfo &TRI,
- const TargetRegisterClass *VecRC) {
- switch (TRI.getRegSizeInBits(*VecRC)) {
- case 32: // 4 bytes
- return AMDGPU::V_MOVRELD_B32_V1;
- case 64: // 8 bytes
- return AMDGPU::V_MOVRELD_B32_V2;
- case 128: // 16 bytes
- return AMDGPU::V_MOVRELD_B32_V4;
- case 256: // 32 bytes
- return AMDGPU::V_MOVRELD_B32_V8;
- case 512: // 64 bytes
- return AMDGPU::V_MOVRELD_B32_V16;
- default:
- llvm_unreachable("unsupported size for MOVRELD pseudos");
- }
-}
-
static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
MachineBasicBlock &MBB,
const GCNSubtarget &ST) {
@@ -3522,28 +3700,18 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
return &MBB;
}
+ const MCInstrDesc &MovRelDesc
+ = TII->getIndirectRegWritePseudo(TRI.getRegSizeInBits(*VecRC), 32, false);
+
if (setM0ToIndexFromSGPR(TII, MRI, MI, Offset, UseGPRIdxMode, false)) {
MachineBasicBlock::iterator I(&MI);
const DebugLoc &DL = MI.getDebugLoc();
-
- if (UseGPRIdxMode) {
- BuildMI(MBB, I, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(SrcVec->getReg(), RegState::Undef, SubReg) // vdst
- .add(*Val)
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(SrcVec->getReg(), RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
-
+ BuildMI(MBB, I, DL, MovRelDesc, Dst)
+ .addReg(SrcVec->getReg())
+ .add(*Val)
+ .addImm(SubReg);
+ if (UseGPRIdxMode)
BuildMI(MBB, I, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- } else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
-
- BuildMI(MBB, I, DL, MovRelDesc)
- .addReg(Dst, RegState::Define)
- .addReg(SrcVec->getReg())
- .add(*Val)
- .addImm(SubReg - AMDGPU::sub0);
- }
MI.eraseFromParent();
return &MBB;
@@ -3560,26 +3728,14 @@ static MachineBasicBlock *emitIndirectDst(MachineInstr &MI,
Offset, UseGPRIdxMode, false);
MachineBasicBlock *LoopBB = InsPt->getParent();
- if (UseGPRIdxMode) {
- BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::V_MOV_B32_indirect))
- .addReg(PhiReg, RegState::Undef, SubReg) // vdst
- .add(*Val) // src0
- .addReg(Dst, RegState::ImplicitDefine)
- .addReg(PhiReg, RegState::Implicit)
- .addReg(AMDGPU::M0, RegState::Implicit);
+ BuildMI(*LoopBB, InsPt, DL, MovRelDesc, Dst)
+ .addReg(PhiReg)
+ .add(*Val)
+ .addImm(AMDGPU::sub0);
+ if (UseGPRIdxMode)
BuildMI(*LoopBB, InsPt, DL, TII->get(AMDGPU::S_SET_GPR_IDX_OFF));
- } else {
- const MCInstrDesc &MovRelDesc = TII->get(getMOVRELDPseudo(TRI, VecRC));
-
- BuildMI(*LoopBB, InsPt, DL, MovRelDesc)
- .addReg(Dst, RegState::Define)
- .addReg(PhiReg)
- .add(*Val)
- .addImm(SubReg - AMDGPU::sub0);
- }
MI.eraseFromParent();
-
return LoopBB;
}
@@ -3590,17 +3746,27 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
MachineFunction *MF = BB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- if (TII->isMIMG(MI)) {
- if (MI.memoperands_empty() && MI.mayLoadOrStore()) {
- report_fatal_error("missing mem operand from MIMG instruction");
- }
- // Add a memoperand for mimg instructions so that they aren't assumed to
- // be ordered memory instuctions.
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineOperand &Dest0 = MI.getOperand(0);
+ MachineOperand &Dest1 = MI.getOperand(1);
+ MachineOperand &Src0 = MI.getOperand(2);
+ MachineOperand &Src1 = MI.getOperand(3);
+
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::S_ADD_I32
+ : AMDGPU::S_SUB_I32;
+ BuildMI(*BB, MI, DL, TII->get(Opc), Dest0.getReg()).add(Src0).add(Src1);
+
+ BuildMI(*BB, MI, DL, TII->get(AMDGPU::S_CSELECT_B64), Dest1.getReg())
+ .addImm(1)
+ .addImm(0);
+ MI.eraseFromParent();
return BB;
}
-
- switch (MI.getOpcode()) {
case AMDGPU::S_ADD_U64_PSEUDO:
case AMDGPU::S_SUB_U64_PSEUDO: {
MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
@@ -3616,35 +3782,150 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32RegClass);
- MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src0, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, BoolRC, AMDGPU::sub0,
- &AMDGPU::SReg_32RegClass);
- MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(MI, MRI,
- Src1, BoolRC, AMDGPU::sub1,
- &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub0, &AMDGPU::SReg_32RegClass);
+ MachineOperand Src1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, BoolRC, AMDGPU::sub1, &AMDGPU::SReg_32RegClass);
bool IsAdd = (MI.getOpcode() == AMDGPU::S_ADD_U64_PSEUDO);
unsigned LoOpc = IsAdd ? AMDGPU::S_ADD_U32 : AMDGPU::S_SUB_U32;
unsigned HiOpc = IsAdd ? AMDGPU::S_ADDC_U32 : AMDGPU::S_SUBB_U32;
- BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
- .add(Src0Sub0)
- .add(Src1Sub0);
- BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
- .add(Src0Sub1)
- .add(Src1Sub1);
+ BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0).add(Src0Sub0).add(Src1Sub0);
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1).add(Src0Sub1).add(Src1Sub1);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
- .addReg(DestSub0)
- .addImm(AMDGPU::sub0)
- .addReg(DestSub1)
- .addImm(AMDGPU::sub1);
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::V_ADD_U64_PSEUDO:
+ case AMDGPU::V_SUB_U64_PSEUDO: {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ const DebugLoc &DL = MI.getDebugLoc();
+
+ bool IsAdd = (MI.getOpcode() == AMDGPU::V_ADD_U64_PSEUDO);
+
+ const auto *CarryRC = TRI->getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+ Register DestSub0 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+ Register DestSub1 = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ Register CarryReg = MRI.createVirtualRegister(CarryRC);
+ Register DeadCarryReg = MRI.createVirtualRegister(CarryRC);
+
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &Src0 = MI.getOperand(1);
+ MachineOperand &Src1 = MI.getOperand(2);
+
+ const TargetRegisterClass *Src0RC = Src0.isReg()
+ ? MRI.getRegClass(Src0.getReg())
+ : &AMDGPU::VReg_64RegClass;
+ const TargetRegisterClass *Src1RC = Src1.isReg()
+ ? MRI.getRegClass(Src1.getReg())
+ : &AMDGPU::VReg_64RegClass;
+
+ const TargetRegisterClass *Src0SubRC =
+ TRI->getSubRegClass(Src0RC, AMDGPU::sub0);
+ const TargetRegisterClass *Src1SubRC =
+ TRI->getSubRegClass(Src1RC, AMDGPU::sub1);
+
+ MachineOperand SrcReg0Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub0, Src0SubRC);
+ MachineOperand SrcReg1Sub0 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub0, Src1SubRC);
+
+ MachineOperand SrcReg0Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src0, Src0RC, AMDGPU::sub1, Src0SubRC);
+ MachineOperand SrcReg1Sub1 = TII->buildExtractSubRegOrImm(
+ MI, MRI, Src1, Src1RC, AMDGPU::sub1, Src1SubRC);
+
+ unsigned LoOpc = IsAdd ? AMDGPU::V_ADD_I32_e64 : AMDGPU::V_SUB_I32_e64;
+ MachineInstr *LoHalf = BuildMI(*BB, MI, DL, TII->get(LoOpc), DestSub0)
+ .addReg(CarryReg, RegState::Define)
+ .add(SrcReg0Sub0)
+ .add(SrcReg1Sub0)
+ .addImm(0); // clamp bit
+
+ unsigned HiOpc = IsAdd ? AMDGPU::V_ADDC_U32_e64 : AMDGPU::V_SUBB_U32_e64;
+ MachineInstr *HiHalf =
+ BuildMI(*BB, MI, DL, TII->get(HiOpc), DestSub1)
+ .addReg(DeadCarryReg, RegState::Define | RegState::Dead)
+ .add(SrcReg0Sub1)
+ .add(SrcReg1Sub1)
+ .addReg(CarryReg, RegState::Kill)
+ .addImm(0); // clamp bit
+
+ BuildMI(*BB, MI, DL, TII->get(TargetOpcode::REG_SEQUENCE), Dest.getReg())
+ .addReg(DestSub0)
+ .addImm(AMDGPU::sub0)
+ .addReg(DestSub1)
+ .addImm(AMDGPU::sub1);
+ TII->legalizeOperands(*LoHalf);
+ TII->legalizeOperands(*HiHalf);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+ // This pseudo can only be selected from a uniform add/subcarry node.
+ // All the VGPR operands are therefore assumed to be splat vectors.
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ MachineBasicBlock::iterator MII = MI;
+ const DebugLoc &DL = MI.getDebugLoc();
+ MachineOperand &Dest = MI.getOperand(0);
+ MachineOperand &CarryDest = MI.getOperand(1);
+ MachineOperand &Src0 = MI.getOperand(2);
+ MachineOperand &Src1 = MI.getOperand(3);
+ MachineOperand &Src2 = MI.getOperand(4);
+ unsigned Opc = (MI.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::S_ADDC_U32
+ : AMDGPU::S_SUBB_U32;
+ if (Src0.isReg() && TRI->isVectorRegister(MRI, Src0.getReg())) {
+ Register RegOp0 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp0)
+ .addReg(Src0.getReg());
+ Src0.setReg(RegOp0);
+ }
+ if (Src1.isReg() && TRI->isVectorRegister(MRI, Src1.getReg())) {
+ Register RegOp1 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp1)
+ .addReg(Src1.getReg());
+ Src1.setReg(RegOp1);
+ }
+ Register RegOp2 = MRI.createVirtualRegister(&AMDGPU::SReg_32RegClass);
+ if (TRI->isVectorRegister(MRI, Src2.getReg())) {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), RegOp2)
+ .addReg(Src2.getReg());
+ Src2.setReg(RegOp2);
+ }
+
+ if (TRI->getRegSizeInBits(*MRI.getRegClass(Src2.getReg())) == 64) {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMP_LG_U64))
+ .addReg(Src2.getReg())
+ .addImm(0);
+ } else {
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::S_CMPK_LG_U32))
+ .addReg(Src2.getReg())
+ .addImm(0);
+ }
+
+ BuildMI(*BB, MII, DL, TII->get(Opc), Dest.getReg()).add(Src0).add(Src1);
+
+ BuildMI(*BB, MII, DL, TII->get(AMDGPU::COPY), CarryDest.getReg())
+ .addReg(AMDGPU::SCC);
MI.eraseFromParent();
return BB;
}
@@ -3741,12 +4022,14 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
case AMDGPU::SI_INDIRECT_SRC_V4:
case AMDGPU::SI_INDIRECT_SRC_V8:
case AMDGPU::SI_INDIRECT_SRC_V16:
+ case AMDGPU::SI_INDIRECT_SRC_V32:
return emitIndirectSrc(MI, *BB, *getSubtarget());
case AMDGPU::SI_INDIRECT_DST_V1:
case AMDGPU::SI_INDIRECT_DST_V2:
case AMDGPU::SI_INDIRECT_DST_V4:
case AMDGPU::SI_INDIRECT_DST_V8:
case AMDGPU::SI_INDIRECT_DST_V16:
+ case AMDGPU::SI_INDIRECT_DST_V32:
return emitIndirectDst(MI, *BB, *getSubtarget());
case AMDGPU::SI_KILL_F32_COND_IMM_PSEUDO:
case AMDGPU::SI_KILL_I1_PSEUDO:
@@ -3870,6 +4153,75 @@ MachineBasicBlock *SITargetLowering::EmitInstrWithCustomInserter(
}
return emitGWSMemViolTestLoop(MI, BB);
+ case AMDGPU::S_SETREG_B32: {
+ if (!getSubtarget()->hasDenormModeInst())
+ return BB;
+
+ // Try to optimize cases that only set the denormal mode or rounding mode.
+ //
+ // If the s_setreg_b32 fully sets all of the bits in the rounding mode or
+ // denormal mode to a constant, we can use s_round_mode or s_denorm_mode
+ // instead.
+ //
+ // FIXME: This could be predicates on the immediate, but tablegen doesn't
+ // allow you to have a no side effect instruction in the output of a
+ // sideeffecting pattern.
+
+ // TODO: Should also emit a no side effects pseudo if only FP bits are
+ // touched, even if not all of them or to a variable.
+ unsigned ID, Offset, Width;
+ AMDGPU::Hwreg::decodeHwreg(MI.getOperand(1).getImm(), ID, Offset, Width);
+ if (ID != AMDGPU::Hwreg::ID_MODE)
+ return BB;
+
+ const unsigned WidthMask = maskTrailingOnes<unsigned>(Width);
+ const unsigned SetMask = WidthMask << Offset;
+ unsigned SetDenormOp = 0;
+ unsigned SetRoundOp = 0;
+
+ // The dedicated instructions can only set the whole denorm or round mode at
+ // once, not a subset of bits in either.
+ if (Width == 8 && (SetMask & (AMDGPU::Hwreg::FP_ROUND_MASK |
+ AMDGPU::Hwreg::FP_DENORM_MASK)) == SetMask) {
+ // If this fully sets both the round and denorm mode, emit the two
+ // dedicated instructions for these.
+ assert(Offset == 0);
+ SetRoundOp = AMDGPU::S_ROUND_MODE;
+ SetDenormOp = AMDGPU::S_DENORM_MODE;
+ } else if (Width == 4) {
+ if ((SetMask & AMDGPU::Hwreg::FP_ROUND_MASK) == SetMask) {
+ SetRoundOp = AMDGPU::S_ROUND_MODE;
+ assert(Offset == 0);
+ } else if ((SetMask & AMDGPU::Hwreg::FP_DENORM_MASK) == SetMask) {
+ SetDenormOp = AMDGPU::S_DENORM_MODE;
+ assert(Offset == 4);
+ }
+ }
+
+ if (SetRoundOp || SetDenormOp) {
+ MachineRegisterInfo &MRI = BB->getParent()->getRegInfo();
+ MachineInstr *Def = MRI.getVRegDef(MI.getOperand(0).getReg());
+ if (Def && Def->isMoveImmediate() && Def->getOperand(1).isImm()) {
+ unsigned ImmVal = Def->getOperand(1).getImm();
+ if (SetRoundOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetRoundOp))
+ .addImm(ImmVal & 0xf);
+
+ // If we also have the denorm mode, get just the denorm mode bits.
+ ImmVal >>= 4;
+ }
+
+ if (SetDenormOp) {
+ BuildMI(*BB, MI, MI.getDebugLoc(), TII->get(SetDenormOp))
+ .addImm(ImmVal & 0xf);
+ }
+
+ MI.eraseFromParent();
+ }
+ }
+
+ return BB;
+ }
default:
return AMDGPUTargetLowering::EmitInstrWithCustomInserter(MI, BB);
}
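The S_SETREG_B32 rewrite in the case above is only applied when the written field covers exactly the round-mode bits, exactly the denorm-mode bits, or both as one 8-bit field. A standalone sketch of that coverage check (illustrative C++; the MODE field layout of round mode in bits 3:0 and denorm mode in bits 7:4 follows from the offsets asserted above, the helper names are not from the patch, and the backend decodes the fields via AMDGPU::Hwreg::decodeHwreg):

#include <cassert>

enum { RoundMask = 0xf << 0, DenormMask = 0xf << 4 };

static bool coversRoundMode(unsigned Offset, unsigned Width) {
  unsigned SetMask = ((1u << Width) - 1) << Offset;
  return Width == 4 && (SetMask & RoundMask) == SetMask;
}

static bool coversDenormMode(unsigned Offset, unsigned Width) {
  unsigned SetMask = ((1u << Width) - 1) << Offset;
  return Width == 4 && (SetMask & DenormMask) == SetMask;
}

static bool coversBothModes(unsigned Offset, unsigned Width) {
  unsigned SetMask = ((1u << Width) - 1) << Offset;
  return Width == 8 && (SetMask & (RoundMask | DenormMask)) == SetMask;
}

int main() {
  assert(coversRoundMode(0, 4));  // s_round_mode applies
  assert(coversDenormMode(4, 4)); // s_denorm_mode applies
  assert(coversBothModes(0, 8));  // both dedicated instructions apply
  assert(!coversRoundMode(2, 4)); // straddles both fields: keep s_setreg
  return 0;
}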
@@ -3925,10 +4277,13 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
switch (VT.getSimpleVT().SimpleTy) {
case MVT::f32: {
- // This is as fast on some subtargets. However, we always have full rate f32
- // mad available which returns the same result as the separate operations
- // which we should prefer over fma. We can't use this if we want to support
- // denormals, so only report this in these cases.
+ // If mad is not available this depends only on whether f32 fma is full rate.
+ if (!Subtarget->hasMadMacF32Insts())
+ return Subtarget->hasFastFMAF32();
+
+ // Otherwise f32 mad is always full rate and returns the same result as
+ // the separate operations so should be preferred over fma.
+ // However, it does not support denormals.
if (hasFP32Denormals(MF))
return Subtarget->hasFastFMAF32() || Subtarget->hasDLInsts();
@@ -3946,13 +4301,14 @@ bool SITargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
return false;
}
-bool SITargetLowering::isFMADLegalForFAddFSub(const SelectionDAG &DAG,
- const SDNode *N) const {
+bool SITargetLowering::isFMADLegal(const SelectionDAG &DAG,
+ const SDNode *N) const {
// TODO: Check future ftz flag
// v_mad_f32/v_mac_f32 do not support denormals.
EVT VT = N->getValueType(0);
if (VT == MVT::f32)
- return !hasFP32Denormals(DAG.getMachineFunction());
+ return Subtarget->hasMadMacF32Insts() &&
+ !hasFP32Denormals(DAG.getMachineFunction());
if (VT == MVT::f16) {
return Subtarget->hasMadF16() &&
!hasFP64FP16Denormals(DAG.getMachineFunction());
@@ -3971,7 +4327,7 @@ SDValue SITargetLowering::splitUnaryVectorOp(SDValue Op,
SelectionDAG &DAG) const {
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
- assert(VT == MVT::v4f16);
+ assert(VT == MVT::v4f16 || VT == MVT::v4i16);
SDValue Lo, Hi;
std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
@@ -4080,6 +4436,7 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FABS:
case ISD::FNEG:
case ISD::FCANONICALIZE:
+ case ISD::BSWAP:
return splitUnaryVectorOp(Op, DAG);
case ISD::FMINNUM:
case ISD::FMAXNUM:
@@ -4101,6 +4458,11 @@ SDValue SITargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FMINNUM_IEEE:
case ISD::FMAXNUM_IEEE:
return splitBinaryVectorOp(Op, DAG);
+ case ISD::SMULO:
+ case ISD::UMULO:
+ return lowerXMULO(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return LowerDYNAMIC_STACKALLOC(Op, DAG);
}
return SDValue();
}
@@ -4204,9 +4566,8 @@ static SDValue lowerICMPIntrinsic(const SITargetLowering &TLI,
SDNode *N, SelectionDAG &DAG) {
EVT VT = N->getValueType(0);
const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
- int CondCode = CD->getSExtValue();
- if (CondCode < ICmpInst::Predicate::FIRST_ICMP_PREDICATE ||
- CondCode > ICmpInst::Predicate::LAST_ICMP_PREDICATE)
+ unsigned CondCode = CD->getZExtValue();
+ if (!ICmpInst::isIntPredicate(static_cast<ICmpInst::Predicate>(CondCode)))
return DAG.getUNDEF(VT);
ICmpInst::Predicate IcInput = static_cast<ICmpInst::Predicate>(CondCode);
@@ -4241,11 +4602,9 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
EVT VT = N->getValueType(0);
const auto *CD = cast<ConstantSDNode>(N->getOperand(3));
- int CondCode = CD->getSExtValue();
- if (CondCode < FCmpInst::Predicate::FIRST_FCMP_PREDICATE ||
- CondCode > FCmpInst::Predicate::LAST_FCMP_PREDICATE) {
+ unsigned CondCode = CD->getZExtValue();
+ if (!FCmpInst::isFPPredicate(static_cast<FCmpInst::Predicate>(CondCode)))
return DAG.getUNDEF(VT);
- }
SDValue Src0 = N->getOperand(1);
SDValue Src1 = N->getOperand(2);
@@ -4268,6 +4627,43 @@ static SDValue lowerFCMPIntrinsic(const SITargetLowering &TLI,
return DAG.getZExtOrTrunc(SetCC, SL, VT);
}
+static SDValue lowerBALLOTIntrinsic(const SITargetLowering &TLI, SDNode *N,
+ SelectionDAG &DAG) {
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(1);
+ SDLoc SL(N);
+
+ if (Src.getOpcode() == ISD::SETCC) {
+ // (ballot (ISD::SETCC ...)) -> (AMDGPUISD::SETCC ...)
+ return DAG.getNode(AMDGPUISD::SETCC, SL, VT, Src.getOperand(0),
+ Src.getOperand(1), Src.getOperand(2));
+ }
+ if (const ConstantSDNode *Arg = dyn_cast<ConstantSDNode>(Src)) {
+ // (ballot 0) -> 0
+ if (Arg->isNullValue())
+ return DAG.getConstant(0, SL, VT);
+
+ // (ballot 1) -> EXEC/EXEC_LO
+ if (Arg->isOne()) {
+ Register Exec;
+ if (VT.getScalarSizeInBits() == 32)
+ Exec = AMDGPU::EXEC_LO;
+ else if (VT.getScalarSizeInBits() == 64)
+ Exec = AMDGPU::EXEC;
+ else
+ return SDValue();
+
+ return DAG.getCopyFromReg(DAG.getEntryNode(), SL, Exec, VT);
+ }
+ }
+
+ // (ballot (i1 $src)) -> (AMDGPUISD::SETCC (i32 (zext $src)) (i32 0)
+ // ISD::SETNE)
+ return DAG.getNode(
+ AMDGPUISD::SETCC, SL, VT, DAG.getZExtOrTrunc(Src, SL, MVT::i32),
+ DAG.getConstant(0, SL, MVT::i32), DAG.getCondCode(ISD::SETNE));
+}
+
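A standalone semantic model of what the ballot lowering above computes (illustrative C++, not DAG code; the helper name is not from the patch): ballot returns a bitmask with one bit per active lane, set where that lane's condition is true, which is why a constant-true source folds to EXEC and a constant-false source folds to 0.

#include <cassert>
#include <cstdint>
#include <vector>

static uint64_t ballot(const std::vector<bool> &LaneCond, uint64_t ExecMask) {
  uint64_t Result = 0;
  for (unsigned Lane = 0; Lane < LaneCond.size(); ++Lane)
    if (((ExecMask >> Lane) & 1) && LaneCond[Lane])
      Result |= uint64_t(1) << Lane;
  return Result;
}

int main() {
  uint64_t Exec = 0b1011; // lanes 0, 1 and 3 active
  assert(ballot({true, true, true, true}, Exec) == Exec); // (ballot 1) == EXEC
  assert(ballot({false, false, false, false}, Exec) == 0); // (ballot 0) == 0
  return 0;
}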
void SITargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
@@ -4440,9 +4836,7 @@ bool SITargetLowering::shouldEmitGOTReloc(const GlobalValue *GV) const {
// FIXME: Either avoid relying on address space here or change the default
// address space for functions to avoid the explicit check.
return (GV->getValueType()->isFunctionTy() ||
- GV->getAddressSpace() == AMDGPUAS::GLOBAL_ADDRESS ||
- GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS ||
- GV->getAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT) &&
+ !isNonGlobalAddrSpace(GV->getAddressSpace())) &&
!shouldEmitFixup(GV) &&
!getTargetMachine().shouldAssumeDSOLocal(*GV->getParent(), GV);
}
@@ -4451,6 +4845,14 @@ bool SITargetLowering::shouldEmitPCReloc(const GlobalValue *GV) const {
return !shouldEmitFixup(GV) && !shouldEmitGOTReloc(GV);
}
+bool SITargetLowering::shouldUseLDSConstAddress(const GlobalValue *GV) const {
+ if (!GV->hasExternalLinkage())
+ return true;
+
+ const auto OS = getTargetMachine().getTargetTriple().getOS();
+ return OS == Triple::AMDHSA || OS == Triple::AMDPAL;
+}
+
/// This transforms the control flow intrinsics to get the branch destination as
/// the last parameter, and also switches the branch target with BR if the need arises
SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
@@ -4470,16 +4872,10 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
} else {
// Get the target from BR if we don't negate the condition
BR = findUser(BRCOND, ISD::BR);
+ assert(BR && "brcond missing unconditional branch user");
Target = BR->getOperand(1);
}
- // FIXME: This changes the types of the intrinsics instead of introducing new
- // nodes with the correct types.
- // e.g. llvm.amdgcn.loop
-
- // eg: i1,ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3
- // => t9: ch = llvm.amdgcn.loop t0, TargetConstant:i32<6271>, t3, BasicBlock:ch<bb1 0x7fee5286d088>
-
unsigned CFNode = isCFIntrinsic(Intr);
if (CFNode == 0) {
// This is a uniform branch so we don't need to legalize.
@@ -4524,7 +4920,6 @@ SDValue SITargetLowering::LowerBRCOND(SDValue BRCOND,
};
SDValue NewBR = DAG.getNode(ISD::BR, DL, BR->getVTList(), Ops);
DAG.ReplaceAllUsesWith(BR, NewBR.getNode());
- BR = NewBR.getNode();
}
SDValue Chain = SDValue(Result, Result->getNumValues() - 1);
@@ -4577,13 +4972,14 @@ SDValue SITargetLowering::LowerRETURNADDR(SDValue Op,
return DAG.getCopyFromReg(DAG.getEntryNode(), DL, Reg, VT);
}
-SDValue SITargetLowering::getFPExtOrFPTrunc(SelectionDAG &DAG,
+SDValue SITargetLowering::getFPExtOrFPRound(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
EVT VT) const {
return Op.getValueType().bitsLE(VT) ?
DAG.getNode(ISD::FP_EXTEND, DL, VT, Op) :
- DAG.getNode(ISD::FTRUNC, DL, VT, Op);
+ DAG.getNode(ISD::FP_ROUND, DL, VT, Op,
+ DAG.getTargetConstant(0, DL, MVT::i32));
}
SDValue SITargetLowering::lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
@@ -4609,7 +5005,7 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
const SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
bool IsIEEEMode = Info->getMode().IEEE;
- // FIXME: Assert during eslection that this is only selected for
+ // FIXME: Assert during selection that this is only selected for
// ieee_mode. Currently a combine can produce the ieee version for non-ieee
// mode functions, but this happens to be OK since it's only done in cases
// where there is known no sNaN.
@@ -4621,6 +5017,42 @@ SDValue SITargetLowering::lowerFMINNUM_FMAXNUM(SDValue Op,
return Op;
}
+SDValue SITargetLowering::lowerXMULO(SDValue Op, SelectionDAG &DAG) const {
+ EVT VT = Op.getValueType();
+ SDLoc SL(Op);
+ SDValue LHS = Op.getOperand(0);
+ SDValue RHS = Op.getOperand(1);
+ bool isSigned = Op.getOpcode() == ISD::SMULO;
+
+ if (ConstantSDNode *RHSC = isConstOrConstSplat(RHS)) {
+ const APInt &C = RHSC->getAPIntValue();
+ // mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
+ if (C.isPowerOf2()) {
+ // smulo(x, signed_min) is the same as umulo(x, signed_min).
+ bool UseArithShift = isSigned && !C.isMinSignedValue();
+ SDValue ShiftAmt = DAG.getConstant(C.logBase2(), SL, MVT::i32);
+ SDValue Result = DAG.getNode(ISD::SHL, SL, VT, LHS, ShiftAmt);
+ SDValue Overflow = DAG.getSetCC(SL, MVT::i1,
+ DAG.getNode(UseArithShift ? ISD::SRA : ISD::SRL,
+ SL, VT, Result, ShiftAmt),
+ LHS, ISD::SETNE);
+ return DAG.getMergeValues({ Result, Overflow }, SL);
+ }
+ }
+
+ SDValue Result = DAG.getNode(ISD::MUL, SL, VT, LHS, RHS);
+ SDValue Top = DAG.getNode(isSigned ? ISD::MULHS : ISD::MULHU,
+ SL, VT, LHS, RHS);
+
+ SDValue Sign = isSigned
+ ? DAG.getNode(ISD::SRA, SL, VT, Result,
+ DAG.getConstant(VT.getScalarSizeInBits() - 1, SL, MVT::i32))
+ : DAG.getConstant(0, SL, VT);
+ SDValue Overflow = DAG.getSetCC(SL, MVT::i1, Top, Sign, ISD::SETNE);
+
+ return DAG.getMergeValues({ Result, Overflow }, SL);
+}
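A minimal scalar sketch of the power-of-two shortcut used above (illustrative only, not part of this patch): for a constant RHS C == 1 << S, the product is LHS << S, and it overflowed exactly when shifting the product back fails to recover LHS. UseArithShift mirrors the isSigned case, except that C == signed_min falls back to a logical shift.

#include <cstdint>

// Mirrors: mulo(X, 1 << S) -> { X << S, (X << S) >> S != X }
// Assumes arithmetic >> for negative int32_t, as on all common compilers.
static bool mulByPow2Overflows(uint32_t X, unsigned S, bool UseArithShift) {
  uint32_t Result = X << S;
  uint32_t Back = UseArithShift ? uint32_t(int32_t(Result) >> S) : (Result >> S);
  return Back != X; // overflow iff the shift back does not recover X
}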
+
SDValue SITargetLowering::lowerTRAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc SL(Op);
SDValue Chain = Op.getOperand(0);
@@ -4694,7 +5126,7 @@ SDValue SITargetLowering::getSegmentAperture(unsigned AS, const SDLoc &DL,
MachineFunction &MF = DAG.getMachineFunction();
SIMachineFunctionInfo *Info = MF.getInfo<SIMachineFunctionInfo>();
- unsigned UserSGPR = Info->getQueuePtrUserSGPR();
+ Register UserSGPR = Info->getQueuePtrUserSGPR();
assert(UserSGPR != AMDGPU::NoRegister);
SDValue QueuePtr = CreateLiveInRegister(
@@ -4765,6 +5197,10 @@ SDValue SITargetLowering::lowerADDRSPACECAST(SDValue Op,
}
}
+ if (ASC->getDestAddressSpace() == AMDGPUAS::CONSTANT_ADDRESS_32BIT &&
+ Src.getValueType() == MVT::i64)
+ return DAG.getNode(ISD::TRUNCATE, SL, MVT::i32, Src);
+
// global <-> flat are no-ops and never emitted.
const MachineFunction &MF = DAG.getMachineFunction();
@@ -5036,8 +5472,9 @@ SITargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
static SDValue
buildPCRelGlobalAddress(SelectionDAG &DAG, const GlobalValue *GV,
- const SDLoc &DL, unsigned Offset, EVT PtrVT,
+ const SDLoc &DL, int64_t Offset, EVT PtrVT,
unsigned GAFlags = SIInstrInfo::MO_NONE) {
+ assert(isInt<32>(Offset + 4) && "32-bit offset is expected!");
// In order to support pc-relative addressing, the PC_ADD_REL_OFFSET SDNode is
// lowered to the following code sequence:
//
@@ -5086,9 +5523,7 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
GlobalAddressSDNode *GSD = cast<GlobalAddressSDNode>(Op);
const GlobalValue *GV = GSD->getGlobal();
if ((GSD->getAddressSpace() == AMDGPUAS::LOCAL_ADDRESS &&
- (!GV->hasExternalLinkage() ||
- getTargetMachine().getTargetTriple().getOS() == Triple::AMDHSA ||
- getTargetMachine().getTargetTriple().getOS() == Triple::AMDPAL)) ||
+ shouldUseLDSConstAddress(GV)) ||
GSD->getAddressSpace() == AMDGPUAS::REGION_ADDRESS ||
GSD->getAddressSpace() == AMDGPUAS::PRIVATE_ADDRESS)
return AMDGPUTargetLowering::LowerGlobalAddress(MFI, Op, DAG);
@@ -5114,11 +5549,11 @@ SDValue SITargetLowering::LowerGlobalAddress(AMDGPUMachineFunction *MFI,
Type *Ty = PtrVT.getTypeForEVT(*DAG.getContext());
PointerType *PtrTy = PointerType::get(Ty, AMDGPUAS::CONSTANT_ADDRESS);
const DataLayout &DataLayout = DAG.getDataLayout();
- unsigned Align = DataLayout.getABITypeAlignment(PtrTy);
+ Align Alignment = DataLayout.getABITypeAlign(PtrTy);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getGOT(DAG.getMachineFunction());
- return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Align,
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), GOTAddr, PtrInfo, Alignment,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant);
}
@@ -5144,8 +5579,8 @@ SDValue SITargetLowering::lowerImplicitZextParam(SelectionDAG &DAG,
MVT VT,
unsigned Offset) const {
SDLoc SL(Op);
- SDValue Param = lowerKernargMemParameter(DAG, MVT::i32, MVT::i32, SL,
- DAG.getEntryNode(), Offset, 4, false);
+ SDValue Param = lowerKernargMemParameter(
+ DAG, MVT::i32, MVT::i32, SL, DAG.getEntryNode(), Offset, Align(4), false);
// The local size values will have the hi 16-bits as zero.
return DAG.getNode(ISD::AssertZext, SL, MVT::i32, Param,
DAG.getValueType(VT));
@@ -5181,6 +5616,9 @@ static SDValue getBuildDwordsVector(SelectionDAG &DAG, SDLoc DL,
} else if (Elts.size() == 2) {
Type = MVT::v2f32;
NumElts = 2;
+ } else if (Elts.size() == 3) {
+ Type = MVT::v3f32;
+ NumElts = 3;
} else if (Elts.size() <= 4) {
Type = MVT::v4f32;
NumElts = 4;
@@ -5230,6 +5668,24 @@ static bool parseCachePolicy(SDValue CachePolicy, SelectionDAG &DAG,
return Value == 0;
}
+static SDValue padEltsToUndef(SelectionDAG &DAG, const SDLoc &DL, EVT CastVT,
+ SDValue Src, int ExtraElts) {
+ EVT SrcVT = Src.getValueType();
+
+ SmallVector<SDValue, 8> Elts;
+
+ if (SrcVT.isVector())
+ DAG.ExtractVectorElements(Src, Elts);
+ else
+ Elts.push_back(Src);
+
+ SDValue Undef = DAG.getUNDEF(SrcVT.getScalarType());
+ while (ExtraElts--)
+ Elts.push_back(Undef);
+
+ return DAG.getBuildVector(CastVT, DL, Elts);
+}
+
// Re-construct the required return value for a image load intrinsic.
// This is more complicated due to the optional use TexFailCtrl which means the required
// return type is an aggregate
@@ -5241,76 +5697,56 @@ static SDValue constructRetValue(SelectionDAG &DAG,
const SDLoc &DL, LLVMContext &Context) {
// Determine the required return type. This is the same regardless of IsTexFail flag
EVT ReqRetVT = ResultTypes[0];
- EVT ReqRetEltVT = ReqRetVT.isVector() ? ReqRetVT.getVectorElementType() : ReqRetVT;
int ReqRetNumElts = ReqRetVT.isVector() ? ReqRetVT.getVectorNumElements() : 1;
- EVT AdjEltVT = Unpacked && IsD16 ? MVT::i32 : ReqRetEltVT;
- EVT AdjVT = Unpacked ? ReqRetNumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, ReqRetNumElts)
- : AdjEltVT
- : ReqRetVT;
-
- // Extract data part of the result
- // Bitcast the result to the same type as the required return type
- int NumElts;
- if (IsD16 && !Unpacked)
- NumElts = NumVDataDwords << 1;
- else
- NumElts = NumVDataDwords;
+ int NumDataDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+ ReqRetNumElts : (ReqRetNumElts + 1) / 2;
- EVT CastVT = NumElts > 1 ? EVT::getVectorVT(Context, AdjEltVT, NumElts)
- : AdjEltVT;
+ int MaskPopDwords = (!IsD16 || (IsD16 && Unpacked)) ?
+ DMaskPop : (DMaskPop + 1) / 2;
- // Special case for v6f16. Rather than add support for this, use v3i32 to
- // extract the data elements
- bool V6F16Special = false;
- if (NumElts == 6) {
- CastVT = EVT::getVectorVT(Context, MVT::i32, NumElts / 2);
- DMaskPop >>= 1;
- ReqRetNumElts >>= 1;
- V6F16Special = true;
- AdjVT = MVT::v2i32;
- }
+ MVT DataDwordVT = NumDataDwords == 1 ?
+ MVT::i32 : MVT::getVectorVT(MVT::i32, NumDataDwords);
- SDValue N = SDValue(Result, 0);
- SDValue CastRes = DAG.getNode(ISD::BITCAST, DL, CastVT, N);
+ MVT MaskPopVT = MaskPopDwords == 1 ?
+ MVT::i32 : MVT::getVectorVT(MVT::i32, MaskPopDwords);
- // Iterate over the result
- SmallVector<SDValue, 4> BVElts;
+ SDValue Data(Result, 0);
+ SDValue TexFail;
- if (CastVT.isVector()) {
- DAG.ExtractVectorElements(CastRes, BVElts, 0, DMaskPop);
- } else {
- BVElts.push_back(CastRes);
- }
- int ExtraElts = ReqRetNumElts - DMaskPop;
- while(ExtraElts--)
- BVElts.push_back(DAG.getUNDEF(AdjEltVT));
+ if (IsTexFail) {
+ SDValue ZeroIdx = DAG.getConstant(0, DL, MVT::i32);
+ if (MaskPopVT.isVector()) {
+ Data = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, MaskPopVT,
+ SDValue(Result, 0), ZeroIdx);
+ } else {
+ Data = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MaskPopVT,
+ SDValue(Result, 0), ZeroIdx);
+ }
- SDValue PreTFCRes;
- if (ReqRetNumElts > 1) {
- SDValue NewVec = DAG.getBuildVector(AdjVT, DL, BVElts);
- if (IsD16 && Unpacked)
- PreTFCRes = adjustLoadValueTypeImpl(NewVec, ReqRetVT, DL, DAG, Unpacked);
- else
- PreTFCRes = NewVec;
- } else {
- PreTFCRes = BVElts[0];
+ TexFail = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i32,
+ SDValue(Result, 0),
+ DAG.getConstant(MaskPopDwords, DL, MVT::i32));
}
- if (V6F16Special)
- PreTFCRes = DAG.getNode(ISD::BITCAST, DL, MVT::v4f16, PreTFCRes);
+ if (DataDwordVT.isVector())
+ Data = padEltsToUndef(DAG, DL, DataDwordVT, Data,
+ NumDataDwords - MaskPopDwords);
- if (!IsTexFail) {
- if (Result->getNumValues() > 1)
- return DAG.getMergeValues({PreTFCRes, SDValue(Result, 1)}, DL);
- else
- return PreTFCRes;
- }
+ if (IsD16)
+ Data = adjustLoadValueTypeImpl(Data, ReqRetVT, DL, DAG, Unpacked);
+
+ if (!ReqRetVT.isVector())
+ Data = DAG.getNode(ISD::TRUNCATE, DL, ReqRetVT.changeTypeToInteger(), Data);
+
+ Data = DAG.getNode(ISD::BITCAST, DL, ReqRetVT, Data);
- // Extract the TexFail result and insert into aggregate return
- SmallVector<SDValue, 1> TFCElt;
- DAG.ExtractVectorElements(N, TFCElt, DMaskPop, 1);
- SDValue TFCRes = DAG.getNode(ISD::BITCAST, DL, ResultTypes[1], TFCElt[0]);
- return DAG.getMergeValues({PreTFCRes, TFCRes, SDValue(Result, 1)}, DL);
+ if (TexFail)
+ return DAG.getMergeValues({Data, TexFail, SDValue(Result, 1)}, DL);
+
+ if (Result->getNumValues() == 1)
+ return Data;
+
+ return DAG.getMergeValues({Data, SDValue(Result, 1)}, DL);
}
static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
@@ -5331,6 +5767,35 @@ static bool parseTexFail(SDValue TexFailCtrl, SelectionDAG &DAG, SDValue *TFE,
return Value == 0;
}
+static void packImageA16AddressToDwords(SelectionDAG &DAG, SDValue Op,
+ MVT PackVectorVT,
+ SmallVectorImpl<SDValue> &PackedAddrs,
+ unsigned DimIdx, unsigned EndIdx,
+ unsigned NumGradients) {
+ SDLoc DL(Op);
+ for (unsigned I = DimIdx; I < EndIdx; I++) {
+ SDValue Addr = Op.getOperand(I);
+
+ // Gradients are packed with undef for each coordinate.
+ // In <hi 16 bit>,<lo 16 bit> notation, the registers look like this:
+ // 1D: undef,dx/dh; undef,dx/dv
+ // 2D: dy/dh,dx/dh; dy/dv,dx/dv
+ // 3D: dy/dh,dx/dh; undef,dz/dh; dy/dv,dx/dv; undef,dz/dv
+ if (((I + 1) >= EndIdx) ||
+ ((NumGradients / 2) % 2 == 1 && (I == DimIdx + (NumGradients / 2) - 1 ||
+ I == DimIdx + NumGradients - 1))) {
+ if (Addr.getValueType() != MVT::i16)
+ Addr = DAG.getBitcast(MVT::i16, Addr);
+ Addr = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::i32, Addr);
+ } else {
+ Addr = DAG.getBuildVector(PackVectorVT, DL, {Addr, Op.getOperand(I + 1)});
+ I++;
+ }
+ Addr = DAG.getBitcast(MVT::f32, Addr);
+ PackedAddrs.push_back(Addr);
+ }
+}
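To see how the odd-gradient condition above fires (worked directly from the code): for a 3D sample NumGradients is 6, so NumGradients / 2 == 3 is odd and the check matches I == DimIdx + 2 (dz/dh) and I == DimIdx + 5 (dz/dv); those two operands are extended alone, producing the "undef,dz/dh" and "undef,dz/dv" registers in the layout comment, while the x/y derivatives are packed in pairs. In 1D both derivative operands match, so each is paired with undef.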
+
SDValue SITargetLowering::lowerImage(SDValue Op,
const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const {
@@ -5350,6 +5815,7 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
SmallVector<EVT, 3> ResultTypes(Op->value_begin(), Op->value_end());
SmallVector<EVT, 3> OrigResultTypes(Op->value_begin(), Op->value_end());
bool IsD16 = false;
+ bool IsG16 = false;
bool IsA16 = false;
SDValue VData;
int NumVDataDwords;
@@ -5456,41 +5922,67 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
}
- // Check for 16 bit addresses and pack if true.
+ // Push back extra arguments.
+ for (unsigned I = 0; I < BaseOpcode->NumExtraArgs; I++)
+ VAddrs.push_back(Op.getOperand(AddrIdx + I));
+
+ // Check for 16 bit addresses or derivatives and pack if true.
unsigned DimIdx = AddrIdx + BaseOpcode->NumExtraArgs;
+ unsigned CoordIdx = DimIdx + NumGradients;
+ unsigned CoordsEnd = AddrIdx + NumMIVAddrs;
+
MVT VAddrVT = Op.getOperand(DimIdx).getSimpleValueType();
- const MVT VAddrScalarVT = VAddrVT.getScalarType();
- if (((VAddrScalarVT == MVT::f16) || (VAddrScalarVT == MVT::i16)) &&
- ST->hasFeature(AMDGPU::FeatureR128A16)) {
- IsA16 = true;
- const MVT VectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
- for (unsigned i = AddrIdx; i < (AddrIdx + NumMIVAddrs); ++i) {
- SDValue AddrLo, AddrHi;
- // Push back extra arguments.
- if (i < DimIdx) {
- AddrLo = Op.getOperand(i);
- } else {
- AddrLo = Op.getOperand(i);
- // Dz/dh, dz/dv and the last odd coord are packed with undef. Also,
- // in 1D, derivatives dx/dh and dx/dv are packed with undef.
- if (((i + 1) >= (AddrIdx + NumMIVAddrs)) ||
- ((NumGradients / 2) % 2 == 1 &&
- (i == DimIdx + (NumGradients / 2) - 1 ||
- i == DimIdx + NumGradients - 1))) {
- AddrHi = DAG.getUNDEF(MVT::f16);
- } else {
- AddrHi = Op.getOperand(i + 1);
- i++;
- }
- AddrLo = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VectorVT,
- {AddrLo, AddrHi});
- AddrLo = DAG.getBitcast(MVT::i32, AddrLo);
+ MVT VAddrScalarVT = VAddrVT.getScalarType();
+ MVT PackVectorVT = VAddrScalarVT == MVT::f16 ? MVT::v2f16 : MVT::v2i16;
+ IsG16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
+
+ VAddrVT = Op.getOperand(CoordIdx).getSimpleValueType();
+ VAddrScalarVT = VAddrVT.getScalarType();
+ IsA16 = VAddrScalarVT == MVT::f16 || VAddrScalarVT == MVT::i16;
+ if (IsA16 || IsG16) {
+ if (IsA16) {
+ if (!ST->hasA16()) {
+ LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
+ "support 16 bit addresses\n");
+ return Op;
+ }
+ if (!IsG16) {
+ LLVM_DEBUG(
+ dbgs() << "Failed to lower image intrinsic: 16 bit addresses "
+ "need 16 bit derivatives but got 32 bit derivatives\n");
+ return Op;
}
- VAddrs.push_back(AddrLo);
+ } else if (!ST->hasG16()) {
+ LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
+ "support 16 bit derivatives\n");
+ return Op;
+ }
+
+ if (BaseOpcode->Gradients && !IsA16) {
+ if (!ST->hasG16()) {
+ LLVM_DEBUG(dbgs() << "Failed to lower image intrinsic: Target does not "
+ "support 16 bit derivatives\n");
+ return Op;
+ }
+ // Activate g16
+ const AMDGPU::MIMGG16MappingInfo *G16MappingInfo =
+ AMDGPU::getMIMGG16MappingInfo(Intr->BaseOpcode);
+ IntrOpcode = G16MappingInfo->G16; // set new opcode to variant with _g16
+ }
+
+ // Don't compress addresses for G16
+ const int PackEndIdx = IsA16 ? CoordsEnd : CoordIdx;
+ packImageA16AddressToDwords(DAG, Op, PackVectorVT, VAddrs, DimIdx,
+ PackEndIdx, NumGradients);
+
+ if (!IsA16) {
+ // Add uncompressed address
+ for (unsigned I = CoordIdx; I < CoordsEnd; I++)
+ VAddrs.push_back(Op.getOperand(I));
}
} else {
- for (unsigned i = 0; i < NumMIVAddrs; ++i)
- VAddrs.push_back(Op.getOperand(AddrIdx + i));
+ for (unsigned I = DimIdx; I < CoordsEnd; I++)
+ VAddrs.push_back(Op.getOperand(I));
}
// If the register allocator cannot place the address registers contiguously
@@ -5557,8 +6049,8 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
EVT NewVT = NumVDataDwords > 1 ?
- EVT::getVectorVT(*DAG.getContext(), MVT::f32, NumVDataDwords)
- : MVT::f32;
+ EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumVDataDwords)
+ : MVT::i32;
ResultTypes[0] = NewVT;
if (ResultTypes.size() == 3) {
@@ -5603,10 +6095,12 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
Ops.push_back(DLC);
Ops.push_back(GLC);
Ops.push_back(SLC);
- Ops.push_back(IsA16 && // a16 or r128
+ Ops.push_back(IsA16 && // r128, a16 for gfx9
ST->hasFeature(AMDGPU::FeatureR128A16) ? True : False);
- Ops.push_back(TFE); // tfe
- Ops.push_back(LWE); // lwe
+ if (IsGFX10)
+ Ops.push_back(IsA16 ? True : False);
+ Ops.push_back(TFE);
+ Ops.push_back(LWE);
if (!IsGFX10)
Ops.push_back(DimInfo->DA ? True : False);
if (BaseOpcode->HasD16)
@@ -5655,26 +6149,25 @@ SDValue SITargetLowering::lowerImage(SDValue Op,
}
SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
- SDValue Offset, SDValue GLC, SDValue DLC,
+ SDValue Offset, SDValue CachePolicy,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const DataLayout &DataLayout = DAG.getDataLayout();
- unsigned Align =
- DataLayout.getABITypeAlignment(VT.getTypeForEVT(*DAG.getContext()));
+ Align Alignment =
+ DataLayout.getABITypeAlign(VT.getTypeForEVT(*DAG.getContext()));
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(),
MachineMemOperand::MOLoad | MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant,
- VT.getStoreSize(), Align);
+ VT.getStoreSize(), Alignment);
if (!Offset->isDivergent()) {
SDValue Ops[] = {
Rsrc,
Offset, // Offset
- GLC,
- DLC,
+ CachePolicy
};
// Widen vec3 load to vec4.
@@ -5684,9 +6177,8 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
auto WidenedOp = DAG.getMemIntrinsicNode(
AMDGPUISD::SBUFFER_LOAD, DL, DAG.getVTList(WidenedVT), Ops, WidenedVT,
MF.getMachineMemOperand(MMO, 0, WidenedVT.getStoreSize()));
- auto Subvector = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
- DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ auto Subvector = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, WidenedOp,
+ DAG.getVectorIdxConstant(0, DL));
return Subvector;
}
@@ -5705,11 +6197,10 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
if (NumElts == 8 || NumElts == 16) {
NumLoads = NumElts / 4;
- LoadVT = MVT::v4i32;
+ LoadVT = MVT::getVectorVT(LoadVT.getScalarType(), 4);
}
SDVTList VTList = DAG.getVTList({LoadVT, MVT::Glue});
- unsigned CachePolicy = cast<ConstantSDNode>(GLC)->getZExtValue();
SDValue Ops[] = {
DAG.getEntryNode(), // Chain
Rsrc, // rsrc
@@ -5717,13 +6208,14 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
{}, // voffset
{}, // soffset
{}, // offset
- DAG.getTargetConstant(CachePolicy, DL, MVT::i32), // cachepolicy
+ CachePolicy, // cachepolicy
DAG.getTargetConstant(0, DL, MVT::i1), // idxen
};
// Use the alignment to ensure that the required offsets will fit into the
// immediate offsets.
- setBufferOffsets(Offset, DAG, &Ops[3], NumLoads > 1 ? 16 * NumLoads : 4);
+ setBufferOffsets(Offset, DAG, &Ops[3],
+ NumLoads > 1 ? Align(16 * NumLoads) : Align(4));
uint64_t InstOffset = cast<ConstantSDNode>(Ops[5])->getZExtValue();
for (unsigned i = 0; i < NumLoads; ++i) {
@@ -5732,7 +6224,7 @@ SDValue SITargetLowering::lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc,
LoadVT, MMO, DAG));
}
- if (VT == MVT::v8i32 || VT == MVT::v16i32)
+ if (NumElts == 8 || NumElts == 16)
return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Loads);
return Loads[0];
@@ -5777,6 +6269,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
AMDGPUFunctionArgInfo::IMPLICIT_ARG_PTR);
}
case Intrinsic::amdgcn_kernarg_segment_ptr: {
+ if (!AMDGPU::isKernel(MF.getFunction().getCallingConv())) {
+ // This only makes sense to call in a kernel, so just lower to null.
+ return DAG.getConstant(0, DL, VT);
+ }
+
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::KERNARG_SEGMENT_PTR);
}
@@ -5790,8 +6287,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_rsq_legacy:
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
-
- return DAG.getNode(AMDGPUISD::RSQ_LEGACY, DL, VT, Op.getOperand(1));
+ return SDValue();
case Intrinsic::amdgcn_rcp_legacy:
if (Subtarget->getGeneration() >= AMDGPUSubtarget::VOLCANIC_ISLANDS)
return emitRemovedIntrinsicError(DAG, DL, VT);
@@ -5815,37 +6311,43 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_X, 4, false);
+ SI::KernelInputOffsets::NGROUPS_X, Align(4),
+ false);
case Intrinsic::r600_read_ngroups_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Y, 4, false);
+ SI::KernelInputOffsets::NGROUPS_Y, Align(4),
+ false);
case Intrinsic::r600_read_ngroups_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::NGROUPS_Z, 4, false);
+ SI::KernelInputOffsets::NGROUPS_Z, Align(4),
+ false);
case Intrinsic::r600_read_global_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_X, 4, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_X,
+ Align(4), false);
case Intrinsic::r600_read_global_size_y:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Y, 4, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Y,
+ Align(4), false);
case Intrinsic::r600_read_global_size_z:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
return lowerKernargMemParameter(DAG, VT, VT, DL, DAG.getEntryNode(),
- SI::KernelInputOffsets::GLOBAL_SIZE_Z, 4, false);
+ SI::KernelInputOffsets::GLOBAL_SIZE_Z,
+ Align(4), false);
case Intrinsic::r600_read_local_size_x:
if (Subtarget->isAmdHsaOS())
return emitNonHSAIntrinsicError(DAG, DL, VT);
@@ -5865,29 +6367,23 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return lowerImplicitZextParam(DAG, Op, MVT::i16,
SI::KernelInputOffsets::LOCAL_SIZE_Z);
case Intrinsic::amdgcn_workgroup_id_x:
- case Intrinsic::r600_read_tgid_x:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_X);
case Intrinsic::amdgcn_workgroup_id_y:
- case Intrinsic::r600_read_tgid_y:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Y);
case Intrinsic::amdgcn_workgroup_id_z:
- case Intrinsic::r600_read_tgid_z:
return getPreloadedValue(DAG, *MFI, VT,
AMDGPUFunctionArgInfo::WORKGROUP_ID_Z);
case Intrinsic::amdgcn_workitem_id_x:
- case Intrinsic::r600_read_tidig_x:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDX);
case Intrinsic::amdgcn_workitem_id_y:
- case Intrinsic::r600_read_tidig_y:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDY);
case Intrinsic::amdgcn_workitem_id_z:
- case Intrinsic::r600_read_tidig_z:
return loadInputValue(DAG, &AMDGPU::VGPR_32RegClass, MVT::i32,
SDLoc(DAG.getEntryNode()),
MFI->getArgInfo().WorkItemIDZ);
@@ -5901,53 +6397,11 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
if (!parseCachePolicy(Op.getOperand(3), DAG, &GLC, nullptr,
IsGFX10 ? &DLC : nullptr))
return Op;
- return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), GLC, DLC,
+ return lowerSBuffer(VT, DL, Op.getOperand(1), Op.getOperand(2), Op.getOperand(3),
DAG);
}
case Intrinsic::amdgcn_fdiv_fast:
return lowerFDIV_FAST(Op, DAG);
- case Intrinsic::amdgcn_interp_p1_f16: {
- SDValue ToM0 = DAG.getCopyToReg(DAG.getEntryNode(), DL, AMDGPU::M0,
- Op.getOperand(5), SDValue());
- if (getSubtarget()->getLDSBankCount() == 16) {
- // 16 bank LDS
-
- // FIXME: This implicitly will insert a second CopyToReg to M0.
- SDValue S = DAG.getNode(
- ISD::INTRINSIC_WO_CHAIN, DL, MVT::f32,
- DAG.getTargetConstant(Intrinsic::amdgcn_interp_mov, DL, MVT::i32),
- DAG.getConstant(2, DL, MVT::i32), // P0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- Op.getOperand(5)); // m0
-
- SDValue Ops[] = {
- Op.getOperand(1), // Src0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- S, // Src2 - holds two f16 values selected by high
- DAG.getTargetConstant(0, DL, MVT::i32), // $src2_modifiers
- Op.getOperand(4), // high
- DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
- DAG.getTargetConstant(0, DL, MVT::i32) // $omod
- };
- return DAG.getNode(AMDGPUISD::INTERP_P1LV_F16, DL, MVT::f32, Ops);
- } else {
- // 32 bank LDS
- SDValue Ops[] = {
- Op.getOperand(1), // Src0
- Op.getOperand(2), // Attrchan
- Op.getOperand(3), // Attr
- DAG.getTargetConstant(0, DL, MVT::i32), // $src0_modifiers
- Op.getOperand(4), // high
- DAG.getTargetConstant(0, DL, MVT::i1), // $clamp
- DAG.getTargetConstant(0, DL, MVT::i32), // $omod
- ToM0.getValue(1)
- };
- return DAG.getNode(AMDGPUISD::INTERP_P1LL_F16, DL, MVT::f32, Ops);
- }
- }
case Intrinsic::amdgcn_sin:
return DAG.getNode(AMDGPUISD::SIN_HW, DL, VT, Op.getOperand(1));
@@ -5988,9 +6442,6 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
return DAG.getNode(AMDGPUISD::DIV_FIXUP, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
- case Intrinsic::amdgcn_trig_preop:
- return DAG.getNode(AMDGPUISD::TRIG_PREOP, DL, VT,
- Op.getOperand(1), Op.getOperand(2));
case Intrinsic::amdgcn_div_scale: {
const ConstantSDNode *Param = cast<ConstantSDNode>(Op.getOperand(3));
@@ -6020,6 +6471,8 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
case Intrinsic::amdgcn_fcmp: {
return lowerFCMPIntrinsic(*this, Op.getNode(), DAG);
}
+ case Intrinsic::amdgcn_ballot:
+ return lowerBALLOTIntrinsic(*this, Op.getNode(), DAG);
case Intrinsic::amdgcn_fmed3:
return DAG.getNode(AMDGPUISD::FMED3, DL, VT,
Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
@@ -6098,6 +6551,19 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
DAG.getConstant(1, SL, MVT::i32));
return DAG.getSetCC(SL, MVT::i1, SrcHi, Aperture, ISD::SETEQ);
}
+ case Intrinsic::amdgcn_alignbit:
+ return DAG.getNode(ISD::FSHR, DL, VT,
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::amdgcn_reloc_constant: {
+ Module *M = const_cast<Module *>(MF.getFunction().getParent());
+ const MDNode *Metadata = cast<MDNodeSDNode>(Op.getOperand(1))->getMD();
+ auto SymbolName = cast<MDString>(Metadata->getOperand(0))->getString();
+ auto RelocSymbol = cast<GlobalVariable>(
+ M->getOrInsertGlobal(SymbolName, Type::getInt32Ty(M->getContext())));
+ SDValue GA = DAG.getTargetGlobalAddress(RelocSymbol, DL, MVT::i32, 0,
+ SIInstrInfo::MO_ABS32_LO);
+ return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0};
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
AMDGPU::getImageDimIntrinsicInfo(IntrinsicID))
@@ -6131,6 +6597,28 @@ static unsigned getBufferOffsetForMMO(SDValue VOffset,
cast<ConstantSDNode>(Offset)->getSExtValue();
}
+static unsigned getDSShaderTypeValue(const MachineFunction &MF) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_PS:
+ return 1;
+ case CallingConv::AMDGPU_VS:
+ return 2;
+ case CallingConv::AMDGPU_GS:
+ return 3;
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_LS:
+ case CallingConv::AMDGPU_ES:
+ report_fatal_error("ds_ordered_count unsupported for this calling conv");
+ case CallingConv::AMDGPU_CS:
+ case CallingConv::AMDGPU_KERNEL:
+ case CallingConv::C:
+ case CallingConv::Fast:
+ default:
+ // Assume other calling conventions are various compute callable functions
+ return 0;
+ }
+}
+
SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
SelectionDAG &DAG) const {
unsigned IntrID = cast<ConstantSDNode>(Op.getOperand(1))->getZExtValue();
@@ -6146,8 +6634,6 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
unsigned IndexOperand = M->getConstantOperandVal(7);
unsigned WaveRelease = M->getConstantOperandVal(8);
unsigned WaveDone = M->getConstantOperandVal(9);
- unsigned ShaderType;
- unsigned Instruction;
unsigned OrderedCountIndex = IndexOperand & 0x3f;
IndexOperand &= ~0x3f;
@@ -6166,36 +6652,11 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
if (IndexOperand)
report_fatal_error("ds_ordered_count: bad index operand");
- switch (IntrID) {
- case Intrinsic::amdgcn_ds_ordered_add:
- Instruction = 0;
- break;
- case Intrinsic::amdgcn_ds_ordered_swap:
- Instruction = 1;
- break;
- }
-
if (WaveDone && !WaveRelease)
report_fatal_error("ds_ordered_count: wave_done requires wave_release");
- switch (DAG.getMachineFunction().getFunction().getCallingConv()) {
- case CallingConv::AMDGPU_CS:
- case CallingConv::AMDGPU_KERNEL:
- ShaderType = 0;
- break;
- case CallingConv::AMDGPU_PS:
- ShaderType = 1;
- break;
- case CallingConv::AMDGPU_VS:
- ShaderType = 2;
- break;
- case CallingConv::AMDGPU_GS:
- ShaderType = 3;
- break;
- default:
- report_fatal_error("ds_ordered_count unsupported for this calling conv");
- }
-
+ unsigned Instruction = IntrID == Intrinsic::amdgcn_ds_ordered_add ? 0 : 1;
+ unsigned ShaderType = getDSShaderTypeValue(DAG.getMachineFunction());
unsigned Offset0 = OrderedCountIndex << 2;
unsigned Offset1 = WaveRelease | (WaveDone << 1) | (ShaderType << 2) |
(Instruction << 4);
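The two immediate offsets assembled above encode the ds_ordered_count operands as bit fields; a small helper restating the layout (field names here are descriptive only, not from the source) would be:

// offset0: ordered-count index in bytes (dword index << 2).
// offset1: bit 0 = wave_release, bit 1 = wave_done,
//          bits 2-3 = shader type (0 = compute/default, 1 = PS, 2 = VS, 3 = GS),
//          bit 4 = instruction (0 = ds_ordered_add, 1 = ds_ordered_swap).
static unsigned packOrderedCountOffset1(unsigned WaveRelease, unsigned WaveDone,
                                        unsigned ShaderType,
                                        unsigned Instruction) {
  return WaveRelease | (WaveDone << 1) | (ShaderType << 2) | (Instruction << 4);
}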
@@ -6425,6 +6886,7 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_swap:
case Intrinsic::amdgcn_buffer_atomic_add:
case Intrinsic::amdgcn_buffer_atomic_sub:
+ case Intrinsic::amdgcn_buffer_atomic_csub:
case Intrinsic::amdgcn_buffer_atomic_smin:
case Intrinsic::amdgcn_buffer_atomic_umin:
case Intrinsic::amdgcn_buffer_atomic_smax:
@@ -6467,6 +6929,9 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
case Intrinsic::amdgcn_buffer_atomic_sub:
Opcode = AMDGPUISD::BUFFER_ATOMIC_SUB;
break;
+ case Intrinsic::amdgcn_buffer_atomic_csub:
+ Opcode = AMDGPUISD::BUFFER_ATOMIC_CSUB;
+ break;
case Intrinsic::amdgcn_buffer_atomic_smin:
Opcode = AMDGPUISD::BUFFER_ATOMIC_SMIN;
break;
@@ -6715,6 +7180,18 @@ SDValue SITargetLowering::LowerINTRINSIC_W_CHAIN(SDValue Op,
return DAG.getMemIntrinsicNode(AMDGPUISD::BUFFER_ATOMIC_CMPSWAP, DL,
Op->getVTList(), Ops, VT, M->getMemOperand());
}
+ case Intrinsic::amdgcn_global_atomic_csub: {
+ MemSDNode *M = cast<MemSDNode>(Op);
+ SDValue Ops[] = {
+ M->getOperand(0), // Chain
+ M->getOperand(2), // Ptr
+ M->getOperand(3) // Value
+ };
+
+ return DAG.getMemIntrinsicNode(AMDGPUISD::ATOMIC_LOAD_CSUB, SDLoc(Op),
+ M->getVTList(), Ops, M->getMemoryVT(),
+ M->getMemOperand());
+ }
default:
if (const AMDGPU::ImageDimIntrinsicInfo *ImageDimIntr =
@@ -6750,9 +7227,8 @@ SDValue SITargetLowering::getMemIntrinsicNode(unsigned Opcode, const SDLoc &DL,
auto NewOp = DAG.getMemIntrinsicNode(Opcode, DL, WidenedVTList, Ops,
WidenedMemVT, MMO);
if (WidenedVT != VT) {
- auto Extract = DAG.getNode(
- ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
- DAG.getConstant(0, DL, getVectorIdxTy(DAG.getDataLayout())));
+ auto Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, NewOp,
+ DAG.getVectorIdxConstant(0, DL));
NewOp = DAG.getMergeValues({ Extract, SDValue(NewOp.getNode(), 1) }, DL);
}
return NewOp;
@@ -6792,52 +7268,29 @@ SDValue SITargetLowering::LowerINTRINSIC_VOID(SDValue Op,
MachineFunction &MF = DAG.getMachineFunction();
switch (IntrinsicID) {
- case Intrinsic::amdgcn_exp: {
- const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
- const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
- const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(8));
- const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(9));
-
- const SDValue Ops[] = {
- Chain,
- DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
- DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
- Op.getOperand(4), // src0
- Op.getOperand(5), // src1
- Op.getOperand(6), // src2
- Op.getOperand(7), // src3
- DAG.getTargetConstant(0, DL, MVT::i1), // compr
- DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
- };
-
- unsigned Opc = Done->isNullValue() ?
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
- return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
- }
case Intrinsic::amdgcn_exp_compr: {
- const ConstantSDNode *Tgt = cast<ConstantSDNode>(Op.getOperand(2));
- const ConstantSDNode *En = cast<ConstantSDNode>(Op.getOperand(3));
SDValue Src0 = Op.getOperand(4);
SDValue Src1 = Op.getOperand(5);
- const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
- const ConstantSDNode *VM = cast<ConstantSDNode>(Op.getOperand(7));
+ // Hack around illegal type on SI by directly selecting it.
+ if (isTypeLegal(Src0.getValueType()))
+ return SDValue();
+ const ConstantSDNode *Done = cast<ConstantSDNode>(Op.getOperand(6));
SDValue Undef = DAG.getUNDEF(MVT::f32);
const SDValue Ops[] = {
- Chain,
- DAG.getTargetConstant(Tgt->getZExtValue(), DL, MVT::i8), // tgt
- DAG.getTargetConstant(En->getZExtValue(), DL, MVT::i8), // en
- DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0),
- DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1),
+ Op.getOperand(2), // tgt
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src0), // src0
+ DAG.getNode(ISD::BITCAST, DL, MVT::f32, Src1), // src1
Undef, // src2
Undef, // src3
+ Op.getOperand(7), // vm
DAG.getTargetConstant(1, DL, MVT::i1), // compr
- DAG.getTargetConstant(VM->getZExtValue(), DL, MVT::i1)
+ Op.getOperand(3), // en
+ Op.getOperand(0) // Chain
};
- unsigned Opc = Done->isNullValue() ?
- AMDGPUISD::EXPORT : AMDGPUISD::EXPORT_DONE;
- return DAG.getNode(Opc, DL, Op->getVTList(), Ops);
+ unsigned Opc = Done->isNullValue() ? AMDGPU::EXP : AMDGPU::EXP_DONE;
+ return SDValue(DAG.getMachineNode(Opc, DL, Op->getVTList(), Ops), 0);
}
case Intrinsic::amdgcn_s_barrier: {
if (getTargetMachine().getOptLevel() > CodeGenOpt::None) {
@@ -7183,13 +7636,14 @@ std::pair<SDValue, SDValue> SITargetLowering::splitBufferOffsets(
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
// pointed to by Offsets.
unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
- SelectionDAG &DAG, SDValue *Offsets,
- unsigned Align) const {
+ SelectionDAG &DAG, SDValue *Offsets,
+ Align Alignment) const {
SDLoc DL(CombinedOffset);
if (auto C = dyn_cast<ConstantSDNode>(CombinedOffset)) {
uint32_t Imm = C->getZExtValue();
uint32_t SOffset, ImmOffset;
- if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget, Align)) {
+ if (AMDGPU::splitMUBUFOffset(Imm, SOffset, ImmOffset, Subtarget,
+ Alignment)) {
Offsets[0] = DAG.getConstant(0, DL, MVT::i32);
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -7202,7 +7656,7 @@ unsigned SITargetLowering::setBufferOffsets(SDValue CombinedOffset,
uint32_t SOffset, ImmOffset;
int Offset = cast<ConstantSDNode>(N1)->getSExtValue();
if (Offset >= 0 && AMDGPU::splitMUBUFOffset(Offset, SOffset, ImmOffset,
- Subtarget, Align)) {
+ Subtarget, Alignment)) {
Offsets[0] = N0;
Offsets[1] = DAG.getConstant(SOffset, DL, MVT::i32);
Offsets[2] = DAG.getTargetConstant(ImmOffset, DL, MVT::i32);
@@ -7413,7 +7867,8 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
+ !Subtarget->hasMultiDwordFlatScratchAddressing())
AS = MFI->hasFlatScratchInit() ?
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
@@ -7438,7 +7893,7 @@ SDValue SITargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
AS == AMDGPUAS::CONSTANT_ADDRESS_32BIT ||
AS == AMDGPUAS::GLOBAL_ADDRESS) {
if (Subtarget->getScalarizeGlobalBehavior() && !Op->isDivergent() &&
- !Load->isVolatile() && isMemOpHasNoClobberedMemOperand(Load) &&
+ Load->isSimple() && isMemOpHasNoClobberedMemOperand(Load) &&
Alignment >= 4 && NumElements < 32) {
if (MemVT.isPow2VectorType())
return SDValue();
@@ -7547,55 +8002,54 @@ SDValue SITargetLowering::lowerFastUnsafeFDIV(SDValue Op,
SDValue RHS = Op.getOperand(1);
EVT VT = Op.getValueType();
const SDNodeFlags Flags = Op->getFlags();
- bool Unsafe = DAG.getTarget().Options.UnsafeFPMath || Flags.hasAllowReciprocal();
- if (!Unsafe && VT == MVT::f32 && hasFP32Denormals(DAG.getMachineFunction()))
+ bool AllowInaccurateRcp = DAG.getTarget().Options.UnsafeFPMath ||
+ Flags.hasApproximateFuncs();
+
+ // Without !fpmath accuracy information, we can't do more because we don't
+ // know exactly whether rcp is accurate enough to meet the !fpmath requirement.
+ if (!AllowInaccurateRcp)
return SDValue();
if (const ConstantFPSDNode *CLHS = dyn_cast<ConstantFPSDNode>(LHS)) {
- if (Unsafe || VT == MVT::f32 || VT == MVT::f16) {
- if (CLHS->isExactlyValue(1.0)) {
- // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
- // the CI documentation has a worst case error of 1 ulp.
- // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
- // use it as long as we aren't trying to use denormals.
- //
- // v_rcp_f16 and v_rsq_f16 DO support denormals.
-
- // 1.0 / sqrt(x) -> rsq(x)
-
- // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
- // error seems really high at 2^29 ULP.
- if (RHS.getOpcode() == ISD::FSQRT)
- return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
-
- // 1.0 / x -> rcp(x)
- return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- }
+ if (CLHS->isExactlyValue(1.0)) {
+ // v_rcp_f32 and v_rsq_f32 do not support denormals, and according to
+ // the CI documentation has a worst case error of 1 ulp.
+ // OpenCL requires <= 2.5 ulp for 1.0 / x, so it should always be OK to
+ // use it as long as we aren't trying to use denormals.
+ //
+ // v_rcp_f16 and v_rsq_f16 DO support denormals.
- // Same as for 1.0, but expand the sign out of the constant.
- if (CLHS->isExactlyValue(-1.0)) {
- // -1.0 / x -> rcp (fneg x)
- SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
- return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
- }
+ // 1.0 / sqrt(x) -> rsq(x)
+
+ // XXX - Is UnsafeFPMath sufficient to do this for f64? The maximum ULP
+ // error seems really high at 2^29 ULP.
+ if (RHS.getOpcode() == ISD::FSQRT)
+ return DAG.getNode(AMDGPUISD::RSQ, SL, VT, RHS.getOperand(0));
+
+ // 1.0 / x -> rcp(x)
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
}
- }
- if (Unsafe) {
- // Turn into multiply by the reciprocal.
- // x / y -> x * (1.0 / y)
- SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
- return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
+ // Same as for 1.0, but expand the sign out of the constant.
+ if (CLHS->isExactlyValue(-1.0)) {
+ // -1.0 / x -> rcp (fneg x)
+ SDValue FNegRHS = DAG.getNode(ISD::FNEG, SL, VT, RHS);
+ return DAG.getNode(AMDGPUISD::RCP, SL, VT, FNegRHS);
+ }
}
- return SDValue();
+ // Turn into multiply by the reciprocal.
+ // x / y -> x * (1.0 / y)
+ SDValue Recip = DAG.getNode(AMDGPUISD::RCP, SL, VT, RHS);
+ return DAG.getNode(ISD::FMUL, SL, VT, LHS, Recip, Flags);
}
static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
- EVT VT, SDValue A, SDValue B, SDValue GlueChain) {
+ EVT VT, SDValue A, SDValue B, SDValue GlueChain,
+ SDNodeFlags Flags) {
if (GlueChain->getNumValues() <= 1) {
- return DAG.getNode(Opcode, SL, VT, A, B);
+ return DAG.getNode(Opcode, SL, VT, A, B, Flags);
}
assert(GlueChain->getNumValues() == 3);
@@ -7608,15 +8062,16 @@ static SDValue getFPBinOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
break;
}
- return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B,
- GlueChain.getValue(2));
+ return DAG.getNode(Opcode, SL, VTList,
+ {GlueChain.getValue(1), A, B, GlueChain.getValue(2)},
+ Flags);
}
static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
EVT VT, SDValue A, SDValue B, SDValue C,
- SDValue GlueChain) {
+ SDValue GlueChain, SDNodeFlags Flags) {
if (GlueChain->getNumValues() <= 1) {
- return DAG.getNode(Opcode, SL, VT, A, B, C);
+ return DAG.getNode(Opcode, SL, VT, {A, B, C}, Flags);
}
assert(GlueChain->getNumValues() == 3);
@@ -7629,8 +8084,9 @@ static SDValue getFPTernOp(SelectionDAG &DAG, unsigned Opcode, const SDLoc &SL,
break;
}
- return DAG.getNode(Opcode, SL, VTList, GlueChain.getValue(1), A, B, C,
- GlueChain.getValue(2));
+ return DAG.getNode(Opcode, SL, VTList,
+ {GlueChain.getValue(1), A, B, C, GlueChain.getValue(2)},
+ Flags);
}
SDValue SITargetLowering::LowerFDIV16(SDValue Op, SelectionDAG &DAG) const {
@@ -7704,6 +8160,13 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
if (SDValue FastLowered = lowerFastUnsafeFDIV(Op, DAG))
return FastLowered;
+ // The selection matcher assumes anything with a chain selects to a
+ // mayRaiseFPException machine instruction. Since we're introducing a chain
+ // here, we need to explicitly report nofpexcept for the regular fdiv
+ // lowering.
+ SDNodeFlags Flags = Op->getFlags();
+ Flags.setNoFPExcept(true);
+
SDLoc SL(Op);
SDValue LHS = Op.getOperand(0);
SDValue RHS = Op.getOperand(1);
@@ -7713,95 +8176,100 @@ SDValue SITargetLowering::LowerFDIV32(SDValue Op, SelectionDAG &DAG) const {
SDVTList ScaleVT = DAG.getVTList(MVT::f32, MVT::i1);
SDValue DenominatorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
- RHS, RHS, LHS);
+ {RHS, RHS, LHS}, Flags);
SDValue NumeratorScaled = DAG.getNode(AMDGPUISD::DIV_SCALE, SL, ScaleVT,
- LHS, RHS, LHS);
+ {LHS, RHS, LHS}, Flags);
// Denominator is scaled to not be denormal, so using rcp is ok.
SDValue ApproxRcp = DAG.getNode(AMDGPUISD::RCP, SL, MVT::f32,
- DenominatorScaled);
+ DenominatorScaled, Flags);
SDValue NegDivScale0 = DAG.getNode(ISD::FNEG, SL, MVT::f32,
- DenominatorScaled);
+ DenominatorScaled, Flags);
const unsigned Denorm32Reg = AMDGPU::Hwreg::ID_MODE |
(4 << AMDGPU::Hwreg::OFFSET_SHIFT_) |
(1 << AMDGPU::Hwreg::WIDTH_M1_SHIFT_);
- const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i16);
+ const SDValue BitField = DAG.getTargetConstant(Denorm32Reg, SL, MVT::i32);
const bool HasFP32Denormals = hasFP32Denormals(DAG.getMachineFunction());
if (!HasFP32Denormals) {
+ // Note we can't use the STRICT_FMA/STRICT_FMUL for the non-strict FDIV
+ // lowering. The chain dependence is insufficient, and we need glue. We do
+ // not need the glue variants in a strictfp function.
+
SDVTList BindParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue EnableDenorm;
+ SDNode *EnableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue EnableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_NONE, DAG, SL, Subtarget);
EnableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue);
+ DAG.getEntryNode(), EnableDenormValue).getNode();
} else {
const SDValue EnableDenormValue = DAG.getConstant(FP_DENORM_FLUSH_NONE,
SL, MVT::i32);
- EnableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, BindParamVTs,
- DAG.getEntryNode(), EnableDenormValue,
- BitField);
+ EnableDenorm =
+ DAG.getMachineNode(AMDGPU::S_SETREG_B32, SL, BindParamVTs,
+ {EnableDenormValue, BitField, DAG.getEntryNode()});
}
SDValue Ops[3] = {
NegDivScale0,
- EnableDenorm.getValue(0),
- EnableDenorm.getValue(1)
+ SDValue(EnableDenorm, 0),
+ SDValue(EnableDenorm, 1)
};
NegDivScale0 = DAG.getMergeValues(Ops, SL);
}
SDValue Fma0 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0,
- ApproxRcp, One, NegDivScale0);
+ ApproxRcp, One, NegDivScale0, Flags);
SDValue Fma1 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma0, ApproxRcp,
- ApproxRcp, Fma0);
+ ApproxRcp, Fma0, Flags);
SDValue Mul = getFPBinOp(DAG, ISD::FMUL, SL, MVT::f32, NumeratorScaled,
- Fma1, Fma1);
+ Fma1, Fma1, Flags);
SDValue Fma2 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Mul,
- NumeratorScaled, Mul);
+ NumeratorScaled, Mul, Flags);
- SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, Fma2, Fma1, Mul, Fma2);
+ SDValue Fma3 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32,
+ Fma2, Fma1, Mul, Fma2, Flags);
SDValue Fma4 = getFPTernOp(DAG, ISD::FMA, SL, MVT::f32, NegDivScale0, Fma3,
- NumeratorScaled, Fma3);
+ NumeratorScaled, Fma3, Flags);
if (!HasFP32Denormals) {
- SDValue DisableDenorm;
+ SDNode *DisableDenorm;
if (Subtarget->hasDenormModeInst()) {
const SDValue DisableDenormValue =
getSPDenormModeValue(FP_DENORM_FLUSH_IN_FLUSH_OUT, DAG, SL, Subtarget);
DisableDenorm = DAG.getNode(AMDGPUISD::DENORM_MODE, SL, MVT::Other,
Fma4.getValue(1), DisableDenormValue,
- Fma4.getValue(2));
+ Fma4.getValue(2)).getNode();
} else {
const SDValue DisableDenormValue =
DAG.getConstant(FP_DENORM_FLUSH_IN_FLUSH_OUT, SL, MVT::i32);
- DisableDenorm = DAG.getNode(AMDGPUISD::SETREG, SL, MVT::Other,
- Fma4.getValue(1), DisableDenormValue,
- BitField, Fma4.getValue(2));
+ DisableDenorm = DAG.getMachineNode(
+ AMDGPU::S_SETREG_B32, SL, MVT::Other,
+ {DisableDenormValue, BitField, Fma4.getValue(1), Fma4.getValue(2)});
}
SDValue OutputChain = DAG.getNode(ISD::TokenFactor, SL, MVT::Other,
- DisableDenorm, DAG.getRoot());
+ SDValue(DisableDenorm, 0), DAG.getRoot());
DAG.setRoot(OutputChain);
}
SDValue Scale = NumeratorScaled.getValue(1);
SDValue Fmas = DAG.getNode(AMDGPUISD::DIV_FMAS, SL, MVT::f32,
- Fma4, Fma1, Fma3, Scale);
+ {Fma4, Fma1, Fma3, Scale}, Flags);
- return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS);
+ return DAG.getNode(AMDGPUISD::DIV_FIXUP, SL, MVT::f32, Fmas, RHS, LHS, Flags);
}
SDValue SITargetLowering::LowerFDIV64(SDValue Op, SelectionDAG &DAG) const {
@@ -7916,7 +8384,8 @@ SDValue SITargetLowering::LowerSTORE(SDValue Op, SelectionDAG &DAG) const {
SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
// If there is a possibility that flat instructions access scratch memory
// then we need to use the same legalization rules we use for private.
- if (AS == AMDGPUAS::FLAT_ADDRESS)
+ if (AS == AMDGPUAS::FLAT_ADDRESS &&
+ !Subtarget->hasMultiDwordFlatScratchAddressing())
AS = MFI->hasFlatScratchInit() ?
AMDGPUAS::PRIVATE_ADDRESS : AMDGPUAS::GLOBAL_ADDRESS;
@@ -7976,22 +8445,24 @@ SDValue SITargetLowering::LowerTrig(SDValue Op, SelectionDAG &DAG) const {
SDValue Arg = Op.getOperand(0);
SDValue TrigVal;
- // TODO: Should this propagate fast-math-flags?
+ // Propagate fast-math flags so that the multiply we introduce can be folded
+ // if Arg is already the result of a multiply by constant.
+ auto Flags = Op->getFlags();
- SDValue OneOver2Pi = DAG.getConstantFP(0.5 / M_PI, DL, VT);
+ SDValue OneOver2Pi = DAG.getConstantFP(0.5 * numbers::inv_pi, DL, VT);
if (Subtarget->hasTrigReducedRange()) {
- SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
- TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal);
+ SDValue MulVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
+ TrigVal = DAG.getNode(AMDGPUISD::FRACT, DL, VT, MulVal, Flags);
} else {
- TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi);
+ TrigVal = DAG.getNode(ISD::FMUL, DL, VT, Arg, OneOver2Pi, Flags);
}
switch (Op.getOpcode()) {
case ISD::FCOS:
- return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal);
+ return DAG.getNode(AMDGPUISD::COS_HW, SDLoc(Op), VT, TrigVal, Flags);
case ISD::FSIN:
- return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal);
+ return DAG.getNode(AMDGPUISD::SIN_HW, SDLoc(Op), VT, TrigVal, Flags);
default:
llvm_unreachable("Wrong trig opcode");
}
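As a quick sanity check on the scaling above (stated informally; not taken from the source comments): the hardware SIN_HW/COS_HW nodes consume the angle in units of full turns rather than radians, so multiplying Arg by 0.5 * (1/pi) = 1/(2*pi) converts radians to turns, i.e. sin(x) becomes roughly SIN_HW(x / (2*pi)), with the extra FRACT only needed on subtargets that require the argument reduced into [0, 1).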
@@ -8032,7 +8503,7 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
EVT VT = N->getValueType(0);
EVT ScalarVT = VT.getScalarType();
- if (ScalarVT != MVT::f32)
+ if (ScalarVT != MVT::f32 && ScalarVT != MVT::f16)
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -8047,8 +8518,14 @@ SDValue SITargetLowering::performUCharToFloatCombine(SDNode *N,
// about in practice.
if (DCI.isAfterLegalizeDAG() && SrcVT == MVT::i32) {
if (DAG.MaskedValueIsZero(Src, APInt::getHighBitsSet(32, 24))) {
- SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, VT, Src);
+ SDValue Cvt = DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0, DL, MVT::f32, Src);
DCI.AddToWorklist(Cvt.getNode());
+
+ // For the f16 case, fold to a cast to f32 and then cast back to f16.
+ if (ScalarVT != MVT::f32) {
+ Cvt = DAG.getNode(ISD::FP_ROUND, DL, VT, Cvt,
+ DAG.getTargetConstant(0, DL, MVT::i32));
+ }
return Cvt;
}
}
@@ -8525,7 +9002,7 @@ SDValue SITargetLowering::performOrCombine(SDNode *N,
}
}
- if (VT != MVT::i64)
+ if (VT != MVT::i64 || DCI.isBeforeLegalizeOps())
return SDValue();
// TODO: This could be a generic combine with a predicate for extracting the
@@ -8735,6 +9212,11 @@ SDValue SITargetLowering::performRcpCombine(SDNode *N,
N->getFlags());
}
+ if ((VT == MVT::f32 || VT == MVT::f16) && N0.getOpcode() == ISD::FSQRT) {
+ return DCI.DAG.getNode(AMDGPUISD::RSQ, SDLoc(N), VT,
+ N0.getOperand(0), N->getFlags());
+ }
+
return AMDGPUTargetLowering::performRcpCombine(N, DCI);
}
@@ -8776,9 +9258,7 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case AMDGPUISD::RSQ:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RCP_IFLAG:
- case AMDGPUISD::TRIG_PREOP:
case AMDGPUISD::DIV_SCALE:
case AMDGPUISD::DIV_FMAS:
case AMDGPUISD::DIV_FIXUP:
@@ -8881,6 +9361,12 @@ bool SITargetLowering::isCanonicalized(SelectionDAG &DAG, SDValue Op,
case Intrinsic::amdgcn_cubeid:
case Intrinsic::amdgcn_frexp_mant:
case Intrinsic::amdgcn_fdot2:
+ case Intrinsic::amdgcn_rcp:
+ case Intrinsic::amdgcn_rsq:
+ case Intrinsic::amdgcn_rsq_clamp:
+ case Intrinsic::amdgcn_rcp_legacy:
+ case Intrinsic::amdgcn_rsq_legacy:
+ case Intrinsic::amdgcn_trig_preop:
return true;
default:
break;
@@ -9099,8 +9585,7 @@ SDValue SITargetLowering::performFPMed3ImmCombine(SelectionDAG &DAG,
return SDValue();
// Ordered >= (although NaN inputs should have folded away by now).
- APFloat::cmpResult Cmp = K0->getValueAPF().compare(K1->getValueAPF());
- if (Cmp == APFloat::cmpGreaterThan)
+ if (K0->getValueAPF() > K1->getValueAPF())
return SDValue();
const MachineFunction &MF = DAG.getMachineFunction();
@@ -9275,6 +9760,50 @@ SDValue SITargetLowering::performCvtPkRTZCombine(SDNode *N,
return SDValue();
}
+// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+// expanded into a set of cmp/select instructions.
+bool SITargetLowering::shouldExpandVectorDynExt(unsigned EltSize,
+ unsigned NumElem,
+ bool IsDivergentIdx) {
+ if (UseDivergentRegisterIndexing)
+ return false;
+
+ unsigned VecSize = EltSize * NumElem;
+
+ // Sub-dword vectors of 2 dwords or less have a better implementation.
+ if (VecSize <= 64 && EltSize < 32)
+ return false;
+
+ // Always expand the rest of sub-dword instructions, otherwise it will be
+ // lowered via memory.
+ if (EltSize < 32)
+ return true;
+
+ // Always do this if var-idx is divergent, otherwise it will become a loop.
+ if (IsDivergentIdx)
+ return true;
+
+ // Large vectors would yield too many compares and v_cndmask_b32 instructions.
+ unsigned NumInsts = NumElem /* Number of compares */ +
+ ((EltSize + 31) / 32) * NumElem /* Number of cndmasks */;
+ return NumInsts <= 16;
+}
+
+static bool shouldExpandVectorDynExt(SDNode *N) {
+ SDValue Idx = N->getOperand(N->getNumOperands() - 1);
+ if (isa<ConstantSDNode>(Idx))
+ return false;
+
+ SDValue Vec = N->getOperand(0);
+ EVT VecVT = Vec.getValueType();
+ EVT EltVT = VecVT.getVectorElementType();
+ unsigned EltSize = EltVT.getSizeInBits();
+ unsigned NumElem = VecVT.getVectorNumElements();
+
+ return SITargetLowering::shouldExpandVectorDynExt(EltSize, NumElem,
+ Idx->isDivergent());
+}
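Two boundary cases worked through the cost model above may help illustrate the 16-instruction cutoff (the numbers follow directly from the formula): extracting from <8 x i32> with a uniform, non-constant index costs 8 compares + 8 v_cndmask_b32 = 16, so it is expanded into cmp/select; <8 x i64> costs 8 + 2*8 = 24, so it is left to the existing indexed lowering (movrel or scratch access, per the comment this replaces). A divergent index is always expanded regardless of size, since the alternative is a waterfall loop.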
+
SDValue SITargetLowering::performExtractVectorEltCombine(
SDNode *N, DAGCombinerInfo &DCI) const {
SDValue Vec = N->getOperand(0);
@@ -9336,18 +9865,12 @@ SDValue SITargetLowering::performExtractVectorEltCombine(
unsigned EltSize = EltVT.getSizeInBits();
// EXTRACT_VECTOR_ELT (<n x e>, var-idx) => n x select (e, const-idx)
- // This elminates non-constant index and subsequent movrel or scratch access.
- // Sub-dword vectors of size 2 dword or less have better implementation.
- // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
- // instructions.
- if (VecSize <= 256 && (VecSize > 64 || EltSize >= 32) &&
- !isa<ConstantSDNode>(N->getOperand(1))) {
+ if (::shouldExpandVectorDynExt(N)) {
SDLoc SL(N);
SDValue Idx = N->getOperand(1);
- EVT IdxVT = Idx.getValueType();
SDValue V;
for (unsigned I = 0, E = VecVT.getVectorNumElements(); I < E; ++I) {
- SDValue IC = DAG.getConstant(I, SL, IdxVT);
+ SDValue IC = DAG.getVectorIdxConstant(I, SL);
SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SL, EltVT, Vec, IC);
if (I == 0)
V = Elt;
@@ -9402,17 +9925,10 @@ SITargetLowering::performInsertVectorEltCombine(SDNode *N,
SDValue Idx = N->getOperand(2);
EVT VecVT = Vec.getValueType();
EVT EltVT = VecVT.getVectorElementType();
- unsigned VecSize = VecVT.getSizeInBits();
- unsigned EltSize = EltVT.getSizeInBits();
// INSERT_VECTOR_ELT (<n x e>, var-idx)
// => BUILD_VECTOR n x select (e, const-idx)
- // This elminates non-constant index and subsequent movrel or scratch access.
- // Sub-dword vectors of size 2 dword or less have better implementation.
- // Vectors of size bigger than 8 dwords would yield too many v_cndmask_b32
- // instructions.
- if (isa<ConstantSDNode>(Idx) ||
- VecSize > 256 || (VecSize <= 64 && EltSize < 32))
+ if (!::shouldExpandVectorDynExt(N))
return SDValue();
SelectionDAG &DAG = DCI.DAG;
@@ -9919,39 +10435,50 @@ SDValue SITargetLowering::performCvtF32UByteNCombine(SDNode *N,
unsigned Offset = N->getOpcode() - AMDGPUISD::CVT_F32_UBYTE0;
SDValue Src = N->getOperand(0);
- SDValue Srl = N->getOperand(0);
- if (Srl.getOpcode() == ISD::ZERO_EXTEND)
- Srl = Srl.getOperand(0);
+ SDValue Shift = N->getOperand(0);
- // TODO: Handle (or x, (srl y, 8)) pattern when known bits are zero.
- if (Srl.getOpcode() == ISD::SRL) {
+ // TODO: Extend type shouldn't matter (assuming legal types).
+ if (Shift.getOpcode() == ISD::ZERO_EXTEND)
+ Shift = Shift.getOperand(0);
+
+ if (Shift.getOpcode() == ISD::SRL || Shift.getOpcode() == ISD::SHL) {
+ // cvt_f32_ubyte1 (shl x, 8) -> cvt_f32_ubyte0 x
+ // cvt_f32_ubyte3 (shl x, 16) -> cvt_f32_ubyte1 x
// cvt_f32_ubyte0 (srl x, 16) -> cvt_f32_ubyte2 x
// cvt_f32_ubyte1 (srl x, 16) -> cvt_f32_ubyte3 x
- // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
-
- if (const ConstantSDNode *C =
- dyn_cast<ConstantSDNode>(Srl.getOperand(1))) {
- Srl = DAG.getZExtOrTrunc(Srl.getOperand(0), SDLoc(Srl.getOperand(0)),
- EVT(MVT::i32));
+ // cvt_f32_ubyte0 (srl x, 8) -> cvt_f32_ubyte1 x
+ if (auto *C = dyn_cast<ConstantSDNode>(Shift.getOperand(1))) {
+ Shift = DAG.getZExtOrTrunc(Shift.getOperand(0),
+ SDLoc(Shift.getOperand(0)), MVT::i32);
+
+ unsigned ShiftOffset = 8 * Offset;
+ if (Shift.getOpcode() == ISD::SHL)
+ ShiftOffset -= C->getZExtValue();
+ else
+ ShiftOffset += C->getZExtValue();
- unsigned SrcOffset = C->getZExtValue() + 8 * Offset;
- if (SrcOffset < 32 && SrcOffset % 8 == 0) {
- return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + SrcOffset / 8, SL,
- MVT::f32, Srl);
+ if (ShiftOffset < 32 && (ShiftOffset % 8) == 0) {
+ return DAG.getNode(AMDGPUISD::CVT_F32_UBYTE0 + ShiftOffset / 8, SL,
+ MVT::f32, Shift);
}
}
}
- APInt Demanded = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
-
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(Src, Demanded, Known, TLO)) {
- DCI.CommitTargetLoweringOpt(TLO);
+ APInt DemandedBits = APInt::getBitsSet(32, 8 * Offset, 8 * Offset + 8);
+ if (TLI.SimplifyDemandedBits(Src, DemandedBits, DCI)) {
+ // We simplified Src. If this node is not dead, visit it again so it is
+ // folded properly.
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
}
+ // Handle (or x, (srl y, 8)) pattern when known bits are zero.
+ if (SDValue DemandedSrc =
+ TLI.SimplifyMultipleUseDemandedBits(Src, DemandedBits, DAG))
+ return DAG.getNode(N->getOpcode(), SL, MVT::f32, DemandedSrc);
+
return SDValue();
}
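
The byte-offset folding above reduces to plain integer arithmetic: the source byte index is 8 * Offset plus the shift amount for srl, or minus it for shl, and the fold only applies when the result is a whole byte inside the 32-bit source. A minimal standalone sketch of just that arithmetic (illustrative only; the real code operates on SDNodes):

// Returns the folded CVT_F32_UBYTE index, or -1 if no fold applies.
static int foldCvtUByteThroughShift(int ByteIdx, bool IsShl, int ShiftAmt) {
  int ShiftOffset = 8 * ByteIdx + (IsShl ? -ShiftAmt : ShiftAmt);
  if (ShiftOffset < 0 || ShiftOffset >= 32 || ShiftOffset % 8 != 0)
    return -1; // no fold
  // e.g. ubyte1 (shl x, 8) -> ubyte0 x;  ubyte0 (srl x, 16) -> ubyte2 x
  return ShiftOffset / 8;
}
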
@@ -9964,16 +10491,13 @@ SDValue SITargetLowering::performClampCombine(SDNode *N,
const MachineFunction &MF = DCI.DAG.getMachineFunction();
const APFloat &F = CSrc->getValueAPF();
APFloat Zero = APFloat::getZero(F.getSemantics());
- APFloat::cmpResult Cmp0 = F.compare(Zero);
- if (Cmp0 == APFloat::cmpLessThan ||
- (Cmp0 == APFloat::cmpUnordered &&
- MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
+ if (F < Zero ||
+ (F.isNaN() && MF.getInfo<SIMachineFunctionInfo>()->getMode().DX10Clamp)) {
return DCI.DAG.getConstantFP(Zero, SDLoc(N), N->getValueType(0));
}
APFloat One(F.getSemantics(), "1.0");
- APFloat::cmpResult Cmp1 = F.compare(One);
- if (Cmp1 == APFloat::cmpGreaterThan)
+ if (F > One)
return DCI.DAG.getConstantFP(One, SDLoc(N), N->getValueType(0));
return SDValue(CSrc, 0);
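
The rewritten comparisons keep the same constant-folding semantics as before. As a scalar model (a sketch, not the DAG code itself): values clamp to [0, 1], NaN folds to 0 only in DX10 clamp mode, and otherwise the NaN source is returned unchanged.

#include <cmath>

static float foldClampConstant(float X, bool DX10Clamp) {
  if (X < 0.0f || (std::isnan(X) && DX10Clamp))
    return 0.0f;
  if (X > 1.0f)
    return 1.0f;
  return X; // includes returning NaN as-is when DX10Clamp is false
}
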
@@ -10061,10 +10585,10 @@ SDValue SITargetLowering::PerformDAGCombine(SDNode *N,
case AMDGPUISD::FRACT:
case AMDGPUISD::RSQ:
case AMDGPUISD::RCP_LEGACY:
- case AMDGPUISD::RSQ_LEGACY:
case AMDGPUISD::RCP_IFLAG:
case AMDGPUISD::RSQ_CLAMP:
case AMDGPUISD::LDEXP: {
+ // FIXME: This is probably wrong. If src is an sNaN, it won't be quieted
SDValue Src = N->getOperand(0);
if (Src.isUndef())
return Src;
@@ -10406,24 +10930,6 @@ SDNode *SITargetLowering::PostISelFolding(MachineSDNode *Node,
Ops.push_back(ImpDef.getValue(1));
return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
}
- case AMDGPU::V_PERMLANE16_B32:
- case AMDGPU::V_PERMLANEX16_B32: {
- ConstantSDNode *FI = cast<ConstantSDNode>(Node->getOperand(0));
- ConstantSDNode *BC = cast<ConstantSDNode>(Node->getOperand(2));
- if (!FI->getZExtValue() && !BC->getZExtValue())
- break;
- SDValue VDstIn = Node->getOperand(6);
- if (VDstIn.isMachineOpcode()
- && VDstIn.getMachineOpcode() == AMDGPU::IMPLICIT_DEF)
- break;
- MachineSDNode *ImpDef = DAG.getMachineNode(TargetOpcode::IMPLICIT_DEF,
- SDLoc(Node), MVT::i32);
- SmallVector<SDValue, 8> Ops = { SDValue(FI, 0), Node->getOperand(1),
- SDValue(BC, 0), Node->getOperand(3),
- Node->getOperand(4), Node->getOperand(5),
- SDValue(ImpDef, 0), Node->getOperand(7) };
- return DAG.getMachineNode(Opcode, SDLoc(Node), Node->getVTList(), Ops);
- }
default:
break;
}
@@ -10592,89 +11098,50 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
MVT VT) const {
const TargetRegisterClass *RC = nullptr;
if (Constraint.size() == 1) {
+ const unsigned BitWidth = VT.getSizeInBits();
switch (Constraint[0]) {
default:
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
case 's':
case 'r':
- switch (VT.getSizeInBits()) {
- default:
- return std::make_pair(0U, nullptr);
- case 32:
+ switch (BitWidth) {
case 16:
RC = &AMDGPU::SReg_32RegClass;
break;
case 64:
RC = &AMDGPU::SGPR_64RegClass;
break;
- case 96:
- RC = &AMDGPU::SReg_96RegClass;
- break;
- case 128:
- RC = &AMDGPU::SGPR_128RegClass;
- break;
- case 160:
- RC = &AMDGPU::SReg_160RegClass;
- break;
- case 256:
- RC = &AMDGPU::SReg_256RegClass;
- break;
- case 512:
- RC = &AMDGPU::SReg_512RegClass;
+ default:
+ RC = SIRegisterInfo::getSGPRClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::make_pair(0U, nullptr);
break;
}
break;
case 'v':
- switch (VT.getSizeInBits()) {
- default:
- return std::make_pair(0U, nullptr);
- case 32:
+ switch (BitWidth) {
case 16:
RC = &AMDGPU::VGPR_32RegClass;
break;
- case 64:
- RC = &AMDGPU::VReg_64RegClass;
- break;
- case 96:
- RC = &AMDGPU::VReg_96RegClass;
- break;
- case 128:
- RC = &AMDGPU::VReg_128RegClass;
- break;
- case 160:
- RC = &AMDGPU::VReg_160RegClass;
- break;
- case 256:
- RC = &AMDGPU::VReg_256RegClass;
- break;
- case 512:
- RC = &AMDGPU::VReg_512RegClass;
+ default:
+ RC = SIRegisterInfo::getVGPRClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::make_pair(0U, nullptr);
break;
}
break;
case 'a':
if (!Subtarget->hasMAIInsts())
break;
- switch (VT.getSizeInBits()) {
- default:
- return std::make_pair(0U, nullptr);
- case 32:
+ switch (BitWidth) {
case 16:
RC = &AMDGPU::AGPR_32RegClass;
break;
- case 64:
- RC = &AMDGPU::AReg_64RegClass;
- break;
- case 128:
- RC = &AMDGPU::AReg_128RegClass;
- break;
- case 512:
- RC = &AMDGPU::AReg_512RegClass;
+ default:
+ RC = SIRegisterInfo::getAGPRClassForBitWidth(BitWidth);
+ if (!RC)
+ return std::make_pair(0U, nullptr);
break;
- case 1024:
- RC = &AMDGPU::AReg_1024RegClass;
- // v32 types are not legal but we support them here.
- return std::make_pair(0U, RC);
}
break;
}
@@ -10701,9 +11168,29 @@ SITargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(RC->getRegister(Idx), RC);
}
}
+
+ // FIXME: Returns VS_32 for physical SGPR constraints
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+static bool isImmConstraint(StringRef Constraint) {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ default: break;
+ case 'I':
+ case 'J':
+ case 'A':
+ case 'B':
+ case 'C':
+ return true;
+ }
+ } else if (Constraint == "DA" ||
+ Constraint == "DB") {
+ return true;
+ }
+ return false;
+}
+
SITargetLowering::ConstraintType
SITargetLowering::getConstraintType(StringRef Constraint) const {
if (Constraint.size() == 1) {
@@ -10715,9 +11202,115 @@ SITargetLowering::getConstraintType(StringRef Constraint) const {
return C_RegisterClass;
}
}
+ if (isImmConstraint(Constraint)) {
+ return C_Other;
+ }
return TargetLowering::getConstraintType(Constraint);
}
+static uint64_t clearUnusedBits(uint64_t Val, unsigned Size) {
+ if (!AMDGPU::isInlinableIntLiteral(Val)) {
+ Val = Val & maskTrailingOnes<uint64_t>(Size);
+ }
+ return Val;
+}
+
+void SITargetLowering::LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const {
+ if (isImmConstraint(Constraint)) {
+ uint64_t Val;
+ if (getAsmOperandConstVal(Op, Val) &&
+ checkAsmConstraintVal(Op, Constraint, Val)) {
+ Val = clearUnusedBits(Val, Op.getScalarValueSizeInBits());
+ Ops.push_back(DAG.getTargetConstant(Val, SDLoc(Op), MVT::i64));
+ }
+ } else {
+ TargetLowering::LowerAsmOperandForConstraint(Op, Constraint, Ops, DAG);
+ }
+}
+
+bool SITargetLowering::getAsmOperandConstVal(SDValue Op, uint64_t &Val) const {
+ unsigned Size = Op.getScalarValueSizeInBits();
+ if (Size > 64)
+ return false;
+
+ if (Size == 16 && !Subtarget->has16BitInsts())
+ return false;
+
+ if (ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op)) {
+ Val = C->getSExtValue();
+ return true;
+ }
+ if (ConstantFPSDNode *C = dyn_cast<ConstantFPSDNode>(Op)) {
+ Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+ if (BuildVectorSDNode *V = dyn_cast<BuildVectorSDNode>(Op)) {
+ if (Size != 16 || Op.getNumOperands() != 2)
+ return false;
+ if (Op.getOperand(0).isUndef() || Op.getOperand(1).isUndef())
+ return false;
+ if (ConstantSDNode *C = V->getConstantSplatNode()) {
+ Val = C->getSExtValue();
+ return true;
+ }
+ if (ConstantFPSDNode *C = V->getConstantFPSplatNode()) {
+ Val = C->getValueAPF().bitcastToAPInt().getSExtValue();
+ return true;
+ }
+ }
+
+ return false;
+}
+
+bool SITargetLowering::checkAsmConstraintVal(SDValue Op,
+ const std::string &Constraint,
+ uint64_t Val) const {
+ if (Constraint.size() == 1) {
+ switch (Constraint[0]) {
+ case 'I':
+ return AMDGPU::isInlinableIntLiteral(Val);
+ case 'J':
+ return isInt<16>(Val);
+ case 'A':
+ return checkAsmConstraintValA(Op, Val);
+ case 'B':
+ return isInt<32>(Val);
+ case 'C':
+ return isUInt<32>(clearUnusedBits(Val, Op.getScalarValueSizeInBits())) ||
+ AMDGPU::isInlinableIntLiteral(Val);
+ default:
+ break;
+ }
+ } else if (Constraint.size() == 2) {
+ if (Constraint == "DA") {
+ int64_t HiBits = static_cast<int32_t>(Val >> 32);
+ int64_t LoBits = static_cast<int32_t>(Val);
+ return checkAsmConstraintValA(Op, HiBits, 32) &&
+ checkAsmConstraintValA(Op, LoBits, 32);
+ }
+ if (Constraint == "DB") {
+ return true;
+ }
+ }
+ llvm_unreachable("Invalid asm constraint");
+}
+
+bool SITargetLowering::checkAsmConstraintValA(SDValue Op,
+ uint64_t Val,
+ unsigned MaxSize) const {
+ unsigned Size = std::min<unsigned>(Op.getScalarValueSizeInBits(), MaxSize);
+ bool HasInv2Pi = Subtarget->hasInv2PiInlineImm();
+ if ((Size == 16 && AMDGPU::isInlinableLiteral16(Val, HasInv2Pi)) ||
+ (Size == 32 && AMDGPU::isInlinableLiteral32(Val, HasInv2Pi)) ||
+ (Size == 64 && AMDGPU::isInlinableLiteral64(Val, HasInv2Pi))) {
+ return true;
+ }
+ return false;
+}
+
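
Taken together, these hooks let inline asm pass small immediates directly as instruction operands. A hedged usage sketch, assuming a frontend exposes the constraints through the usual GCC extended-asm syntax (the source-level spelling is an assumption, not something this patch defines):

int readConstant() {
  int Result;
  // 'I' only accepts an inlinable integer literal (-16..64 per
  // isInlinableIntLiteral), so the 42 below is encoded for free.
  __asm__("s_mov_b32 %0, %1" : "=s"(Result) : "I"(42));
  return Result;
}
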
// Figure out which registers should be reserved for stack access. Only after
// the function is legalized do we know all of the non-spill stack objects or if
// calls are present.
@@ -10745,11 +11338,6 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
if (Info->getFrameOffsetReg() != AMDGPU::FP_REG)
MRI.replaceRegWith(AMDGPU::FP_REG, Info->getFrameOffsetReg());
- if (Info->getScratchWaveOffsetReg() != AMDGPU::SCRATCH_WAVE_OFFSET_REG) {
- MRI.replaceRegWith(AMDGPU::SCRATCH_WAVE_OFFSET_REG,
- Info->getScratchWaveOffsetReg());
- }
-
Info->limitOccupancy(MF);
if (ST.isWave32() && !MF.empty()) {
@@ -10772,15 +11360,18 @@ void SITargetLowering::finalizeLowering(MachineFunction &MF) const {
}
TargetLoweringBase::finalizeLowering(MF);
+
+ // Allocate a VGPR for future SGPR Spill if
+ // "amdgpu-reserve-vgpr-for-sgpr-spill" option is used
+ // FIXME: We won't need this hack if we split SGPR allocation from VGPR
+ if (VGPRReserveforSGPRSpill && !Info->VGPRReservedForSGPRSpill &&
+ !Info->isEntryFunction() && MF.getFrameInfo().hasStackObjects())
+ Info->reserveVGPRforSGPRSpills(MF);
}
-void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
- KnownBits &Known,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth) const {
- TargetLowering::computeKnownBitsForFrameIndex(Op, Known, DemandedElts,
- DAG, Depth);
+void SITargetLowering::computeKnownBitsForFrameIndex(
+ const int FI, KnownBits &Known, const MachineFunction &MF) const {
+ TargetLowering::computeKnownBitsForFrameIndex(FI, Known, MF);
// Set the high bits to zero based on the maximum allowed scratch size per
// wave. We can't use vaddr in MUBUF instructions if we don't know the address
@@ -10788,6 +11379,27 @@ void SITargetLowering::computeKnownBitsForFrameIndex(const SDValue Op,
Known.Zero.setHighBits(getSubtarget()->getKnownHighZeroBitsForFrameIndex());
}
+Align SITargetLowering::computeKnownAlignForTargetInstr(
+ GISelKnownBits &KB, Register R, const MachineRegisterInfo &MRI,
+ unsigned Depth) const {
+ const MachineInstr *MI = MRI.getVRegDef(R);
+ switch (MI->getOpcode()) {
+ case AMDGPU::G_INTRINSIC:
+ case AMDGPU::G_INTRINSIC_W_SIDE_EFFECTS: {
+ // FIXME: Can this move to generic code? What about the case where the call
+ // site specifies a lower alignment?
+ Intrinsic::ID IID = MI->getIntrinsicID();
+ LLVMContext &Ctx = KB.getMachineFunction().getFunction().getContext();
+ AttributeList Attrs = Intrinsic::getAttributes(Ctx, IID);
+ if (MaybeAlign RetAlign = Attrs.getRetAlignment())
+ return *RetAlign;
+ return Align(1);
+ }
+ default:
+ return Align(1);
+ }
+}
+
Align SITargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
const Align PrefAlign = TargetLowering::getPrefLoopAlignment(ML);
const Align CacheLineAlign = Align(64);
@@ -10879,30 +11491,19 @@ bool SITargetLowering::isSDNodeSourceOfDivergence(const SDNode * N,
case ISD::CopyFromReg:
{
const RegisterSDNode *R = cast<RegisterSDNode>(N->getOperand(1));
- const MachineFunction * MF = FLI->MF;
- const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
- const MachineRegisterInfo &MRI = MF->getRegInfo();
- const SIRegisterInfo &TRI = ST.getInstrInfo()->getRegisterInfo();
- unsigned Reg = R->getReg();
- if (Register::isPhysicalRegister(Reg))
- return !TRI.isSGPRReg(MRI, Reg);
-
- if (MRI.isLiveIn(Reg)) {
- // workitem.id.x workitem.id.y workitem.id.z
- // Any VGPR formal argument is also considered divergent
- if (!TRI.isSGPRReg(MRI, Reg))
- return true;
- // Formal arguments of non-entry functions
- // are conservatively considered divergent
- else if (!AMDGPU::isEntryFunctionCC(FLI->Fn->getCallingConv()))
- return true;
- return false;
- }
- const Value *V = FLI->getValueFromVirtualReg(Reg);
- if (V)
+ const MachineRegisterInfo &MRI = FLI->MF->getRegInfo();
+ const SIRegisterInfo *TRI = Subtarget->getRegisterInfo();
+ Register Reg = R->getReg();
+
+ // FIXME: Why does this need to consider isLiveIn?
+ if (Reg.isPhysical() || MRI.isLiveIn(Reg))
+ return !TRI->isSGPRReg(MRI, Reg);
+
+ if (const Value *V = FLI->getValueFromVirtualReg(R->getReg()))
return KDA->isDivergent(V);
+
assert(Reg == FLI->DemoteRegister || isCopyFromRegOfInlineAsm(N));
- return !TRI.isSGPRReg(MRI, Reg);
+ return !TRI->isSGPRReg(MRI, Reg);
}
break;
case ISD::LOAD: {
@@ -11004,7 +11605,19 @@ SITargetLowering::getRegClassFor(MVT VT, bool isDivergent) const {
return RC;
}
-static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
+// FIXME: This is a workaround for DivergenceAnalysis not understanding always
+// uniform values (as produced by the mask results of control flow intrinsics)
+// used outside of divergent blocks. The phi users need to also be treated as
+// always uniform.
+static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited,
+ unsigned WaveSize) {
+ // FIXME: We assume we never cast the mask results of a control flow
+ // intrinsic.
+ // As a compile-time hack, exit early if the type won't be consistent.

+ IntegerType *IT = dyn_cast<IntegerType>(V->getType());
+ if (!IT || IT->getBitWidth() != WaveSize)
+ return false;
+
if (!isa<Instruction>(V))
return false;
if (!Visited.insert(V).second)
@@ -11036,7 +11649,7 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
}
}
} else {
- Result = hasCFUser(U, Visited);
+ Result = hasCFUser(U, Visited, WaveSize);
}
if (Result)
break;
@@ -11046,36 +11659,16 @@ static bool hasCFUser(const Value *V, SmallPtrSet<const Value *, 16> &Visited) {
bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
const Value *V) const {
- if (const IntrinsicInst *Intrinsic = dyn_cast<IntrinsicInst>(V)) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::amdgcn_if_break:
- return true;
- }
- }
- if (const ExtractValueInst *ExtValue = dyn_cast<ExtractValueInst>(V)) {
- if (const IntrinsicInst *Intrinsic =
- dyn_cast<IntrinsicInst>(ExtValue->getOperand(0))) {
- switch (Intrinsic->getIntrinsicID()) {
- default:
- return false;
- case Intrinsic::amdgcn_if:
- case Intrinsic::amdgcn_else: {
- ArrayRef<unsigned> Indices = ExtValue->getIndices();
- if (Indices.size() == 1 && Indices[0] == 1) {
- return true;
- }
- }
- }
- }
- }
if (const CallInst *CI = dyn_cast<CallInst>(V)) {
- if (isa<InlineAsm>(CI->getCalledValue())) {
+ if (CI->isInlineAsm()) {
+ // FIXME: This cannot give a correct answer. This should only trigger in
+ // the case where inline asm returns mixed SGPR and VGPR results, used
+ // outside the defining block. We don't have a specific result to
+ // consider, so this assumes if any value is SGPR, the overall register
+ // also needs to be SGPR.
const SIRegisterInfo *SIRI = Subtarget->getRegisterInfo();
- ImmutableCallSite CS(CI);
TargetLowering::AsmOperandInfoVector TargetConstraints = ParseConstraints(
- MF.getDataLayout(), Subtarget->getRegisterInfo(), CS);
+ MF.getDataLayout(), Subtarget->getRegisterInfo(), *CI);
for (auto &TC : TargetConstraints) {
if (TC.Type == InlineAsm::isOutput) {
ComputeConstraintToUse(TC, SDValue());
@@ -11095,5 +11688,20 @@ bool SITargetLowering::requiresUniformRegister(MachineFunction &MF,
}
}
SmallPtrSet<const Value *, 16> Visited;
- return hasCFUser(V, Visited);
+ return hasCFUser(V, Visited, Subtarget->getWavefrontSize());
+}
+
+std::pair<int, MVT>
+SITargetLowering::getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const {
+ auto Cost = TargetLoweringBase::getTypeLegalizationCost(DL, Ty);
+ auto Size = DL.getTypeSizeInBits(Ty);
+ // Maximum load or store can handle 8 dwords for scalar and 4 for
+ // vector ALU. Let's assume anything above 8 dwords is expensive
+ // even if legal.
+ if (Size <= 256)
+ return Cost;
+
+ Cost.first = (Size + 255) / 256;
+ return Cost;
}
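
The rounding in the new cost override is easiest to see with concrete sizes; a quick sketch of just the arithmetic (the cost for the small case comes from the generic query and is left untouched):

#include <cassert>

// Legalization cost applied above for types wider than 8 dwords (256 bits).
static unsigned wideTypeCost(unsigned SizeInBits) {
  return (SizeInBits + 255) / 256;
}

int main() {
  assert(wideTypeCost(512) == 2);  // e.g. <16 x i32>
  assert(wideTypeCost(1024) == 4); // e.g. <32 x i32>
}
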
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.h b/llvm/lib/Target/AMDGPU/SIISelLowering.h
index d59495b052a4..f4c076464057 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.h
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.h
@@ -42,7 +42,8 @@ private:
SDValue getImplicitArgPtr(SelectionDAG &DAG, const SDLoc &SL) const;
SDValue lowerKernargMemParameter(SelectionDAG &DAG, EVT VT, EVT MemVT,
const SDLoc &SL, SDValue Chain,
- uint64_t Offset, unsigned Align, bool Signed,
+ uint64_t Offset, Align Alignment,
+ bool Signed,
const ISD::InputArg *Arg = nullptr) const;
SDValue lowerStackParameter(SelectionDAG &DAG, CCValAssign &VA,
@@ -60,7 +61,7 @@ private:
SDValue lowerImage(SDValue Op, const AMDGPU::ImageDimIntrinsicInfo *Intr,
SelectionDAG &DAG) const;
SDValue lowerSBuffer(EVT VT, SDLoc DL, SDValue Rsrc, SDValue Offset,
- SDValue GLC, SDValue DLC, SelectionDAG &DAG) const;
+ SDValue CachePolicy, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, SelectionDAG &DAG) const;
@@ -107,7 +108,7 @@ private:
/// Converts \p Op, which must be of floating point type, to the
/// floating point type \p VT, by either extending or truncating it.
- SDValue getFPExtOrFPTrunc(SelectionDAG &DAG,
+ SDValue getFPExtOrFPRound(SelectionDAG &DAG,
SDValue Op,
const SDLoc &DL,
EVT VT) const;
@@ -119,6 +120,7 @@ private:
/// Custom lowering for ISD::FP_ROUND for MVT::f16.
SDValue lowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerFMINNUM_FMAXNUM(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerXMULO(SDValue Op, SelectionDAG &DAG) const;
SDValue getSegmentAperture(unsigned AS, const SDLoc &DL,
SelectionDAG &DAG) const;
@@ -199,6 +201,15 @@ public:
/// global value \p GV, false otherwise.
bool shouldEmitPCReloc(const GlobalValue *GV) const;
+ /// \returns true if this should use a literal constant for an LDS address,
+ /// and not emit a relocation for an LDS global.
+ bool shouldUseLDSConstAddress(const GlobalValue *GV) const;
+
+ /// Check if EXTRACT_VECTOR_ELT/INSERT_VECTOR_ELT (<n x e>, var-idx) should be
+ /// expanded into a set of cmp/select instructions.
+ static bool shouldExpandVectorDynExt(unsigned EltSize, unsigned NumElem,
+ bool IsDivergentIdx);
+
private:
// Analyze a combined offset from an amdgcn_buffer_ intrinsic and store the
// three offsets (voffset, soffset and instoffset) into the SDValue[3] array
@@ -206,7 +217,7 @@ private:
/// \returns 0 If there is a non-constant offset or if the offset is 0.
/// Otherwise returns the constant offset.
unsigned setBufferOffsets(SDValue CombinedOffset, SelectionDAG &DAG,
- SDValue *Offsets, unsigned Align = 4) const;
+ SDValue *Offsets, Align Alignment = Align(4)) const;
// Handle 8 bit and 16 bit buffer loads
SDValue handleByteShortBufferLoads(SelectionDAG &DAG, EVT LoadVT, SDLoc DL,
@@ -253,15 +264,18 @@ public:
MachineMemOperand::Flags Flags = MachineMemOperand::MONone,
bool *IsFast = nullptr) const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset,
- bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
bool isMemOpUniform(const SDNode *N) const;
bool isMemOpHasNoClobberedMemOperand(const SDNode *N) const;
+ static bool isNonGlobalAddrSpace(unsigned AS) {
+ return AS == AMDGPUAS::LOCAL_ADDRESS || AS == AMDGPUAS::REGION_ADDRESS ||
+ AS == AMDGPUAS::PRIVATE_ADDRESS;
+ }
+
+ // FIXME: Missing constant_32bit
static bool isFlatGlobalAddrSpace(unsigned AS) {
return AS == AMDGPUAS::GLOBAL_ADDRESS ||
AS == AMDGPUAS::FLAT_ADDRESS ||
@@ -330,6 +344,9 @@ public:
SDValue LowerCall(CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue lowerDYNAMIC_STACKALLOCImpl(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
@@ -351,8 +368,7 @@ public:
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
- bool isFMADLegalForFAddFSub(const SelectionDAG &DAG,
- const SDNode *N) const override;
+ bool isFMADLegal(const SelectionDAG &DAG, const SDNode *N) const override;
SDValue splitUnaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
SDValue splitBinaryVectorOp(SDValue Op, SelectionDAG &DAG) const;
@@ -377,17 +393,29 @@ public:
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint, MVT VT) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
+ void LowerAsmOperandForConstraint(SDValue Op,
+ std::string &Constraint,
+ std::vector<SDValue> &Ops,
+ SelectionDAG &DAG) const override;
+ bool getAsmOperandConstVal(SDValue Op, uint64_t &Val) const;
+ bool checkAsmConstraintVal(SDValue Op,
+ const std::string &Constraint,
+ uint64_t Val) const;
+ bool checkAsmConstraintValA(SDValue Op,
+ uint64_t Val,
+ unsigned MaxSize = 64) const;
SDValue copyToM0(SelectionDAG &DAG, SDValue Chain, const SDLoc &DL,
SDValue V) const;
void finalizeLowering(MachineFunction &MF) const override;
- void computeKnownBitsForFrameIndex(const SDValue Op,
+ void computeKnownBitsForFrameIndex(int FrameIdx,
KnownBits &Known,
- const APInt &DemandedElts,
- const SelectionDAG &DAG,
- unsigned Depth = 0) const override;
+ const MachineFunction &MF) const override;
+ Align computeKnownAlignForTargetInstr(GISelKnownBits &Analysis, Register R,
+ const MachineRegisterInfo &MRI,
+ unsigned Depth = 0) const override;
bool isSDNodeSourceOfDivergence(const SDNode *N,
FunctionLoweringInfo *FLI, LegacyDivergenceAnalysis *DA) const override;
@@ -432,6 +460,13 @@ public:
MachineFunction &MF,
const SIRegisterInfo &TRI,
SIMachineFunctionInfo &Info) const;
+ void allocateSpecialInputVGPRsFixed(CCState &CCInfo,
+ MachineFunction &MF,
+ const SIRegisterInfo &TRI,
+ SIMachineFunctionInfo &Info) const;
+
+ std::pair<int, MVT> getTypeLegalizationCost(const DataLayout &DL,
+ Type *Ty) const;
};
} // End namespace llvm
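
The new shouldExpandVectorDynExt hook gates the variable-index expansion used by the extract/insert combines earlier in this patch. A scalar model of what that expansion amounts to (a sketch; the real code builds the equivalent compare/select chain on SelectionDAG nodes):

// A variable-index extract over a small vector becomes one compare + select
// per lane, which lowers to v_cndmask_b32 instead of movrel or scratch access.
static float extractDynamic(const float Vec[4], int Idx) {
  float V = Vec[0];
  for (int I = 1; I < 4; ++I)
    V = (Idx == I) ? Vec[I] : V;
  return V;
}
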
diff --git a/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
new file mode 100644
index 000000000000..35c49ae8c0dd
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIInsertHardClauses.cpp
@@ -0,0 +1,203 @@
+//===- SIInsertHardClauses.cpp - Insert Hard Clauses ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Insert s_clause instructions to form hard clauses.
+///
+/// Clausing load instructions can give cache coherency benefits. Before gfx10,
+/// the hardware automatically detected "soft clauses", which were sequences of
+/// memory instructions of the same type. In gfx10 this detection was removed,
+/// and the s_clause instruction was introduced to explicitly mark "hard
+/// clauses".
+///
+/// It's the scheduler's job to form the clauses by putting similar memory
+/// instructions next to each other. Our job is just to insert an s_clause
+/// instruction to mark the start of each clause.
+///
+/// Note that hard clauses are very similar to, but logically distinct from, the
+/// groups of instructions that have to be restartable when XNACK is enabled.
+/// The rules are slightly different in each case. For example an s_nop
+/// instruction breaks a restartable group, but can appear in the middle of a
+/// hard clause. (Before gfx10 there wasn't a distinction, and both were called
+/// "soft clauses" or just "clauses".)
+///
+/// The SIFormMemoryClauses pass and GCNHazardRecognizer deal with restartable
+/// groups, not hard clauses.
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPUSubtarget.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-insert-hard-clauses"
+
+namespace {
+
+enum HardClauseType {
+ // Texture, buffer, global or scratch memory instructions.
+ HARDCLAUSE_VMEM,
+ // Flat (not global or scratch) memory instructions.
+ HARDCLAUSE_FLAT,
+ // Instructions that access LDS.
+ HARDCLAUSE_LDS,
+ // Scalar memory instructions.
+ HARDCLAUSE_SMEM,
+ // VALU instructions.
+ HARDCLAUSE_VALU,
+ LAST_REAL_HARDCLAUSE_TYPE = HARDCLAUSE_VALU,
+
+ // Internal instructions, which are allowed in the middle of a hard clause,
+ // except for s_waitcnt.
+ HARDCLAUSE_INTERNAL,
+ // Instructions that are not allowed in a hard clause: SALU, export, branch,
+ // message, GDS, s_waitcnt and anything else not mentioned above.
+ HARDCLAUSE_ILLEGAL,
+};
+
+HardClauseType getHardClauseType(const MachineInstr &MI) {
+ // On current architectures we only get a benefit from clausing loads.
+ if (MI.mayLoad()) {
+ if (SIInstrInfo::isVMEM(MI) || SIInstrInfo::isSegmentSpecificFLAT(MI))
+ return HARDCLAUSE_VMEM;
+ if (SIInstrInfo::isFLAT(MI))
+ return HARDCLAUSE_FLAT;
+ // TODO: LDS
+ if (SIInstrInfo::isSMRD(MI))
+ return HARDCLAUSE_SMEM;
+ }
+
+ // Don't form VALU clauses. It's not clear what benefit they give, if any.
+
+ // In practice s_nop is the only internal instruction we're likely to see.
+ // It's safe to treat the rest as illegal.
+ if (MI.getOpcode() == AMDGPU::S_NOP)
+ return HARDCLAUSE_INTERNAL;
+ return HARDCLAUSE_ILLEGAL;
+}
+
+class SIInsertHardClauses : public MachineFunctionPass {
+public:
+ static char ID;
+
+ SIInsertHardClauses() : MachineFunctionPass(ID) {}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ // Track information about a clause as we discover it.
+ struct ClauseInfo {
+ // The type of all (non-internal) instructions in the clause.
+ HardClauseType Type = HARDCLAUSE_ILLEGAL;
+ // The first (necessarily non-internal) instruction in the clause.
+ MachineInstr *First = nullptr;
+ // The last non-internal instruction in the clause.
+ MachineInstr *Last = nullptr;
+ // The length of the clause including any internal instructions in the
+ // middle or after the end of the clause.
+ unsigned Length = 0;
+ // The base operands of *Last.
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ };
+
+ bool emitClause(const ClauseInfo &CI, const SIInstrInfo *SII) {
+ // Get the size of the clause excluding any internal instructions at the
+ // end.
+ unsigned Size =
+ std::distance(CI.First->getIterator(), CI.Last->getIterator()) + 1;
+ if (Size < 2)
+ return false;
+ assert(Size <= 64 && "Hard clause is too long!");
+
+ auto &MBB = *CI.First->getParent();
+ auto ClauseMI =
+ BuildMI(MBB, *CI.First, DebugLoc(), SII->get(AMDGPU::S_CLAUSE))
+ .addImm(Size - 1);
+ finalizeBundle(MBB, ClauseMI->getIterator(),
+ std::next(CI.Last->getIterator()));
+ return true;
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.hasHardClauses())
+ return false;
+
+ const SIInstrInfo *SII = ST.getInstrInfo();
+ const TargetRegisterInfo *TRI = ST.getRegisterInfo();
+
+ bool Changed = false;
+ for (auto &MBB : MF) {
+ ClauseInfo CI;
+ for (auto &MI : MBB) {
+ HardClauseType Type = getHardClauseType(MI);
+
+ int64_t Dummy1;
+ bool Dummy2;
+ unsigned Dummy3;
+ SmallVector<const MachineOperand *, 4> BaseOps;
+ if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
+ if (!SII->getMemOperandsWithOffsetWidth(MI, BaseOps, Dummy1, Dummy2,
+ Dummy3, TRI)) {
+ // We failed to get the base operands, so we'll never clause this
+ // instruction with any other, so pretend it's illegal.
+ Type = HARDCLAUSE_ILLEGAL;
+ }
+ }
+
+ if (CI.Length == 64 ||
+ (CI.Length && Type != HARDCLAUSE_INTERNAL &&
+ (Type != CI.Type ||
+ // Note that we lie to shouldClusterMemOps about the size of the
+ // cluster. When shouldClusterMemOps is called from the machine
+ // scheduler it limits the size of the cluster to avoid increasing
+ // register pressure too much, but this pass runs after register
+ // allocation so there is no need for that kind of limit.
+ !SII->shouldClusterMemOps(CI.BaseOps, BaseOps, 2, 2)))) {
+ // Finish the current clause.
+ Changed |= emitClause(CI, SII);
+ CI = ClauseInfo();
+ }
+
+ if (CI.Length) {
+ // Extend the current clause.
+ ++CI.Length;
+ if (Type != HARDCLAUSE_INTERNAL) {
+ CI.Last = &MI;
+ CI.BaseOps = std::move(BaseOps);
+ }
+ } else if (Type <= LAST_REAL_HARDCLAUSE_TYPE) {
+ // Start a new clause.
+ CI = ClauseInfo{Type, &MI, &MI, 1, std::move(BaseOps)};
+ }
+ }
+
+ // Finish the last clause in the basic block if any.
+ if (CI.Length)
+ Changed |= emitClause(CI, SII);
+ }
+
+ return Changed;
+ }
+};
+
+} // namespace
+
+char SIInsertHardClauses::ID = 0;
+
+char &llvm::SIInsertHardClausesID = SIInsertHardClauses::ID;
+
+INITIALIZE_PASS(SIInsertHardClauses, DEBUG_TYPE, "SI Insert Hard Clauses",
+ false, false)
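
The core of the pass is the greedy grouping loop in runOnMachineFunction. A much-simplified standalone model of that loop (it ignores shouldClusterMemOps, trailing internal instructions, and the other details handled above) shows the shape of the output: one s_clause with immediate length-1 in front of each run of two or more clauseable instructions of the same type.

#include <cstdio>
#include <vector>

enum Kind { VMEM, SMEM, INTERNAL, ILLEGAL };

static void formClauses(const std::vector<Kind> &Insts) {
  size_t Start = 0, Len = 0;
  Kind Cur = ILLEGAL;
  auto Flush = [&] {
    if (Len >= 2)
      std::printf("s_clause %zu before instruction %zu\n", Len - 1, Start);
    Len = 0;
  };
  for (size_t I = 0; I < Insts.size(); ++I) {
    Kind K = Insts[I];
    if (Len && K != INTERNAL && (K != Cur || Len == 64))
      Flush();   // close the current clause
    if (Len) {
      ++Len;     // extend it (internal instructions count toward the length)
    } else if (K == VMEM || K == SMEM) {
      Cur = K;   // start a new clause
      Start = I;
      Len = 1;
    }
  }
  Flush();
}

int main() {
  formClauses({VMEM, VMEM, INTERNAL, VMEM, ILLEGAL, SMEM, SMEM});
  // Prints: s_clause 3 before instruction 0
  //         s_clause 1 before instruction 5
}
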
diff --git a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
index 80c044ec00cb..052db5f6ea71 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertSkips.cpp
@@ -18,9 +18,11 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -28,6 +30,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugLoc.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
@@ -52,21 +55,22 @@ private:
const SIRegisterInfo *TRI = nullptr;
const SIInstrInfo *TII = nullptr;
unsigned SkipThreshold = 0;
+ MachineDominatorTree *MDT = nullptr;
+
+ MachineBasicBlock *EarlyExitBlock = nullptr;
bool shouldSkip(const MachineBasicBlock &From,
const MachineBasicBlock &To) const;
- bool skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB);
-
- void kill(MachineInstr &MI);
+ bool dominatesAllReachable(MachineBasicBlock &MBB);
+ void createEarlyExitBlock(MachineBasicBlock &MBB);
+ void skipIfDead(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ DebugLoc DL);
- MachineBasicBlock *insertSkipBlock(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I) const;
+ bool kill(MachineInstr &MI);
bool skipMaskBranch(MachineInstr &MI, MachineBasicBlock &MBB);
- bool optimizeVccBranch(MachineInstr &MI) const;
-
public:
static char ID;
@@ -79,6 +83,8 @@ public:
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
};
@@ -87,8 +93,11 @@ public:
char SIInsertSkips::ID = 0;
-INITIALIZE_PASS(SIInsertSkips, DEBUG_TYPE,
- "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_BEGIN(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(SIInsertSkips, DEBUG_TYPE,
+ "SI insert s_cbranch_execz instructions", false, false)
char &llvm::SIInsertSkipsPassID = SIInsertSkips::ID;
@@ -146,42 +155,110 @@ bool SIInsertSkips::shouldSkip(const MachineBasicBlock &From,
return false;
}
-bool SIInsertSkips::skipIfDead(MachineInstr &MI, MachineBasicBlock &NextBB) {
- MachineBasicBlock &MBB = *MI.getParent();
- MachineFunction *MF = MBB.getParent();
-
- if (MF->getFunction().getCallingConv() != CallingConv::AMDGPU_PS ||
- !shouldSkip(MBB, MBB.getParent()->back()))
- return false;
+/// Check whether \p MBB dominates all blocks that are reachable from it.
+bool SIInsertSkips::dominatesAllReachable(MachineBasicBlock &MBB) {
+ for (MachineBasicBlock *Other : depth_first(&MBB)) {
+ if (!MDT->dominates(&MBB, Other))
+ return false;
+ }
+ return true;
+}
- MachineBasicBlock *SkipBB = insertSkipBlock(MBB, MI.getIterator());
+static void generatePsEndPgm(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL,
+ const SIInstrInfo *TII) {
+ // Generate "null export; s_endpgm".
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::EXP_DONE))
+ .addImm(0x09) // V_008DFC_SQ_EXP_NULL
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addReg(AMDGPU::VGPR0, RegState::Undef)
+ .addImm(1) // vm
+ .addImm(0) // compr
+ .addImm(0); // en
+ BuildMI(MBB, I, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+}
- const DebugLoc &DL = MI.getDebugLoc();
+void SIInsertSkips::createEarlyExitBlock(MachineBasicBlock &MBB) {
+ MachineFunction *MF = MBB.getParent();
+ DebugLoc DL;
- // If the exec mask is non-zero, skip the next two instructions
- BuildMI(&MBB, DL, TII->get(AMDGPU::S_CBRANCH_EXECNZ))
- .addMBB(&NextBB);
+ assert(!EarlyExitBlock);
+ EarlyExitBlock = MF->CreateMachineBasicBlock();
+ MF->insert(MF->end(), EarlyExitBlock);
- MachineBasicBlock::iterator Insert = SkipBB->begin();
+ generatePsEndPgm(*EarlyExitBlock, EarlyExitBlock->end(), DL, TII);
+}
- // Exec mask is zero: Export to NULL target...
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::EXP_DONE))
- .addImm(0x09) // V_008DFC_SQ_EXP_NULL
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addReg(AMDGPU::VGPR0, RegState::Undef)
- .addImm(1) // vm
- .addImm(0) // compr
- .addImm(0); // en
+/// Insert an "if exec=0 { null export; s_endpgm }" sequence before the given
+/// iterator. Only applies to pixel shaders.
+void SIInsertSkips::skipIfDead(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, DebugLoc DL) {
+ MachineFunction *MF = MBB.getParent();
+ assert(MF->getFunction().getCallingConv() == CallingConv::AMDGPU_PS);
+
+ // It is possible for an SI_KILL_*_TERMINATOR to sit at the bottom of a
+ // basic block that has no further successors (e.g., there was an
+ // `unreachable` there in IR). This can happen with original source of the
+ // form:
+ //
+ // if (uniform_condition) {
+ // write_to_memory();
+ // discard;
+ // }
+ //
+ // In this case, we write the "null_export; s_endpgm" skip code in the
+ // already-existing basic block.
+ auto NextBBI = std::next(MBB.getIterator());
+ bool NoSuccessor = I == MBB.end() &&
+ llvm::find(MBB.successors(), &*NextBBI) == MBB.succ_end();
+
+ if (NoSuccessor) {
+ generatePsEndPgm(MBB, I, DL, TII);
+ } else {
+ if (!EarlyExitBlock) {
+ createEarlyExitBlock(MBB);
+ // Update next block pointer to reflect any new blocks
+ NextBBI = std::next(MBB.getIterator());
+ }
- // ... and terminate wavefront.
- BuildMI(*SkipBB, Insert, DL, TII->get(AMDGPU::S_ENDPGM)).addImm(0);
+ auto BranchMI = BuildMI(MBB, I, DL, TII->get(AMDGPU::S_CBRANCH_EXECZ))
+ .addMBB(EarlyExitBlock);
+
+ // Split the block if the branch will not come at the end.
+ auto Next = std::next(BranchMI->getIterator());
+ if (Next != MBB.end() && !Next->isTerminator()) {
+ MachineBasicBlock *SplitBB =
+ MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ MF->insert(NextBBI, SplitBB);
+ SplitBB->splice(SplitBB->begin(), &MBB, I, MBB.end());
+ SplitBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ // FIXME: the expectation is that this will be used near the beginning
+ // of a block so just assume all registers are still live.
+ for (auto LiveIn : MBB.liveins())
+ SplitBB->addLiveIn(LiveIn);
+ MBB.addSuccessor(SplitBB);
+
+ // Update dominator tree
+ using DomTreeT = DomTreeBase<MachineBasicBlock>;
+ SmallVector<DomTreeT::UpdateType, 16> DTUpdates;
+ for (MachineBasicBlock *Succ : SplitBB->successors()) {
+ DTUpdates.push_back({DomTreeT::Insert, SplitBB, Succ});
+ DTUpdates.push_back({DomTreeT::Delete, &MBB, Succ});
+ }
+ DTUpdates.push_back({DomTreeT::Insert, &MBB, SplitBB});
+ MDT->getBase().applyUpdates(DTUpdates);
+ }
- return true;
+ MBB.addSuccessor(EarlyExitBlock);
+ MDT->getBase().insertEdge(&MBB, EarlyExitBlock);
+ }
}
-void SIInsertSkips::kill(MachineInstr &MI) {
+/// Translate a SI_KILL_*_TERMINATOR into exec-manipulating instructions.
+/// Return true unless the terminator is a no-op.
+bool SIInsertSkips::kill(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
DebugLoc DL = MI.getDebugLoc();
@@ -268,7 +345,7 @@ void SIInsertSkips::kill(MachineInstr &MI) {
I.addImm(0); // omod
}
- break;
+ return true;
}
case AMDGPU::SI_KILL_I1_TERMINATOR: {
const MachineFunction *MF = MI.getParent()->getParent();
@@ -283,11 +360,13 @@ void SIInsertSkips::kill(MachineInstr &MI) {
int64_t Imm = Op.getImm();
assert(Imm == 0 || Imm == -1);
- if (Imm == KillVal)
+ if (Imm == KillVal) {
BuildMI(MBB, &MI, DL, TII->get(ST.isWave32() ? AMDGPU::S_MOV_B32
: AMDGPU::S_MOV_B64), Exec)
.addImm(0);
- break;
+ return true;
+ }
+ return false;
}
unsigned Opcode = KillVal ? AMDGPU::S_ANDN2_B64 : AMDGPU::S_AND_B64;
@@ -296,27 +375,13 @@ void SIInsertSkips::kill(MachineInstr &MI) {
BuildMI(MBB, &MI, DL, TII->get(Opcode), Exec)
.addReg(Exec)
.add(Op);
- break;
+ return true;
}
default:
llvm_unreachable("invalid opcode, expected SI_KILL_*_TERMINATOR");
}
}
-MachineBasicBlock *SIInsertSkips::insertSkipBlock(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const {
- MachineFunction *MF = MBB.getParent();
-
- MachineBasicBlock *SkipBB = MF->CreateMachineBasicBlock();
- MachineFunction::iterator MBBI(MBB);
- ++MBBI;
-
- MF->insert(MBBI, SkipBB);
- MBB.addSuccessor(SkipBB);
-
- return SkipBB;
-}
-
// Returns true if a branch over the block was inserted.
bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
MachineBasicBlock &SrcMBB) {
@@ -334,143 +399,24 @@ bool SIInsertSkips::skipMaskBranch(MachineInstr &MI,
return true;
}
-bool SIInsertSkips::optimizeVccBranch(MachineInstr &MI) const {
- // Match:
- // sreg = -1
- // vcc = S_AND_B64 exec, sreg
- // S_CBRANCH_VCC[N]Z
- // =>
- // S_CBRANCH_EXEC[N]Z
- bool Changed = false;
- MachineBasicBlock &MBB = *MI.getParent();
- const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
- const bool IsWave32 = ST.isWave32();
- const unsigned CondReg = TRI->getVCC();
- const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
-
- MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
- E = MBB.rend();
- bool ReadsCond = false;
- unsigned Threshold = 5;
- for (++A ; A != E ; ++A) {
- if (!--Threshold)
- return false;
- if (A->modifiesRegister(ExecReg, TRI))
- return false;
- if (A->modifiesRegister(CondReg, TRI)) {
- if (!A->definesRegister(CondReg, TRI) || A->getOpcode() != And)
- return false;
- break;
- }
- ReadsCond |= A->readsRegister(CondReg, TRI);
- }
- if (A == E)
- return false;
-
- MachineOperand &Op1 = A->getOperand(1);
- MachineOperand &Op2 = A->getOperand(2);
- if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
- TII->commuteInstruction(*A);
- Changed = true;
- }
- if (Op1.getReg() != ExecReg)
- return Changed;
- if (Op2.isImm() && Op2.getImm() != -1)
- return Changed;
-
- unsigned SReg = AMDGPU::NoRegister;
- if (Op2.isReg()) {
- SReg = Op2.getReg();
- auto M = std::next(A);
- bool ReadsSreg = false;
- for ( ; M != E ; ++M) {
- if (M->definesRegister(SReg, TRI))
- break;
- if (M->modifiesRegister(SReg, TRI))
- return Changed;
- ReadsSreg |= M->readsRegister(SReg, TRI);
- }
- if (M == E ||
- !M->isMoveImmediate() ||
- !M->getOperand(1).isImm() ||
- M->getOperand(1).getImm() != -1)
- return Changed;
- // First if sreg is only used in and instruction fold the immediate
- // into that and.
- if (!ReadsSreg && Op2.isKill()) {
- A->getOperand(2).ChangeToImmediate(-1);
- M->eraseFromParent();
- }
- }
-
- if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
- MI.killsRegister(CondReg, TRI))
- A->eraseFromParent();
-
- bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
- if (SReg == ExecReg) {
- if (IsVCCZ) {
- MI.eraseFromParent();
- return true;
- }
- MI.setDesc(TII->get(AMDGPU::S_BRANCH));
- } else {
- MI.setDesc(TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ
- : AMDGPU::S_CBRANCH_EXECNZ));
- }
-
- MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
- MI.addImplicitDefUseOperands(*MBB.getParent());
-
- return true;
-}
-
bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
TRI = &TII->getRegisterInfo();
+ MDT = &getAnalysis<MachineDominatorTree>();
SkipThreshold = SkipThresholdFlag;
- bool HaveKill = false;
+ SmallVector<MachineInstr *, 4> KillInstrs;
bool MadeChange = false;
- // Track depth of exec mask, divergent branches.
- SmallVector<MachineBasicBlock *, 16> ExecBranchStack;
-
- MachineFunction::iterator NextBB;
-
- MachineBasicBlock *EmptyMBBAtEnd = nullptr;
-
- for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
- BI != BE; BI = NextBB) {
- NextBB = std::next(BI);
- MachineBasicBlock &MBB = *BI;
- bool HaveSkipBlock = false;
-
- if (!ExecBranchStack.empty() && ExecBranchStack.back() == &MBB) {
- // Reached convergence point for last divergent branch.
- ExecBranchStack.pop_back();
- }
-
- if (HaveKill && ExecBranchStack.empty()) {
- HaveKill = false;
-
- // TODO: Insert skip if exec is 0?
- }
-
+ for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator I, Next;
for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
-
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
- case AMDGPU::S_CBRANCH_EXECZ:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
- break;
case AMDGPU::SI_MASK_BRANCH:
- ExecBranchStack.push_back(MI.getOperand(0).getMBB());
MadeChange |= skipMaskBranch(MI, MBB);
break;
@@ -478,64 +424,60 @@ bool SIInsertSkips::runOnMachineFunction(MachineFunction &MF) {
// Optimize out branches to the next block.
// FIXME: Shouldn't this be handled by BranchFolding?
if (MBB.isLayoutSuccessor(MI.getOperand(0).getMBB())) {
+ assert(&MI == &MBB.back());
MI.eraseFromParent();
- } else if (HaveSkipBlock) {
- // Remove the given unconditional branch when a skip block has been
- // inserted after the current one and let skip the two instructions
- // performing the kill if the exec mask is non-zero.
- MI.eraseFromParent();
+ MadeChange = true;
}
break;
case AMDGPU::SI_KILL_F32_COND_IMM_TERMINATOR:
- case AMDGPU::SI_KILL_I1_TERMINATOR:
+ case AMDGPU::SI_KILL_I1_TERMINATOR: {
MadeChange = true;
- kill(MI);
-
- if (ExecBranchStack.empty()) {
- if (NextBB != BE && skipIfDead(MI, *NextBB)) {
- HaveSkipBlock = true;
- NextBB = std::next(BI);
- BE = MF.end();
- }
+ bool CanKill = kill(MI);
+
+ // Check if we can add an early "if exec=0 { end shader }".
+ //
+ // Note that we _always_ do this if it is correct, even if the kill
+ // happens fairly late in the shader, because the null export should
+ // generally still be cheaper than normal export(s).
+ //
+ // TODO: The dominatesAllReachable check is conservative: if the
+ // dominance is only missing due to _uniform_ branches, we could
+ // in fact insert the early-exit as well.
+ if (CanKill &&
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+ dominatesAllReachable(MBB)) {
+ // Mark the instruction for kill-if-dead insertion. We delay this
+ // change because it modifies the CFG.
+ KillInstrs.push_back(&MI);
} else {
- HaveKill = true;
+ MI.eraseFromParent();
}
-
- MI.eraseFromParent();
break;
+ }
- case AMDGPU::SI_RETURN_TO_EPILOG:
- // FIXME: Should move somewhere else
- assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
-
- // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
- // because external bytecode will be appended at the end.
- if (BI != --MF.end() || I != MBB.getFirstTerminator()) {
- // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block at
- // the end and jump there.
- if (!EmptyMBBAtEnd) {
- EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
- MF.insert(MF.end(), EmptyMBBAtEnd);
- }
-
- MBB.addSuccessor(EmptyMBBAtEnd);
- BuildMI(*BI, I, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
- .addMBB(EmptyMBBAtEnd);
- I->eraseFromParent();
+ case AMDGPU::SI_KILL_CLEANUP:
+ if (MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS &&
+ dominatesAllReachable(MBB)) {
+ KillInstrs.push_back(&MI);
+ } else {
+ MI.eraseFromParent();
}
break;
- case AMDGPU::S_CBRANCH_VCCZ:
- case AMDGPU::S_CBRANCH_VCCNZ:
- MadeChange |= optimizeVccBranch(MI);
- break;
-
default:
break;
}
}
}
+ for (MachineInstr *Kill : KillInstrs) {
+ skipIfDead(*Kill->getParent(), std::next(Kill->getIterator()),
+ Kill->getDebugLoc());
+ Kill->eraseFromParent();
+ }
+ KillInstrs.clear();
+ EarlyExitBlock = nullptr;
+
return MadeChange;
}
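
The exec updates produced by kill() above reduce to simple lane-mask arithmetic. A minimal model, assuming a 64-wide wave mask (wave32 is analogous with 32-bit masks):

#include <cstdint>

// Lanes where the condition equals the kill value stop executing; the rest
// survive. This mirrors the S_ANDN2_B64 / S_AND_B64 selection above.
static uint64_t applyKill(uint64_t Exec, uint64_t Cond, bool KillIfTrue) {
  return KillIfTrue ? (Exec & ~Cond) : (Exec & Cond);
}
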
diff --git a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
index ef662d55cb0a..2a157eb20ab4 100644
--- a/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInsertWaitcnts.cpp
@@ -32,6 +32,7 @@
#include "Utils/AMDGPUBaseInfo.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -57,7 +58,6 @@
#include <cstring>
#include <memory>
#include <utility>
-#include <vector>
using namespace llvm;
@@ -109,15 +109,13 @@ iterator_range<enum_iterator<InstCounterType>> inst_counter_types() {
enum_iterator<InstCounterType>(NUM_INST_CNTS));
}
-using RegInterval = std::pair<signed, signed>;
+using RegInterval = std::pair<int, int>;
struct {
- uint32_t VmcntMax;
- uint32_t ExpcntMax;
- uint32_t LgkmcntMax;
- uint32_t VscntMax;
- int32_t NumVGPRsMax;
- int32_t NumSGPRsMax;
+ unsigned VmcntMax;
+ unsigned ExpcntMax;
+ unsigned LgkmcntMax;
+ unsigned VscntMax;
} HardwareLimits;
struct {
@@ -143,7 +141,7 @@ enum WaitEventType {
NUM_WAIT_EVENTS,
};
-static const uint32_t WaitEventMaskForInst[NUM_INST_CNTS] = {
+static const unsigned WaitEventMaskForInst[NUM_INST_CNTS] = {
(1 << VMEM_ACCESS) | (1 << VMEM_READ_ACCESS),
(1 << SMEM_ACCESS) | (1 << LDS_ACCESS) | (1 << GDS_ACCESS) |
(1 << SQ_MESSAGE),
@@ -166,6 +164,28 @@ enum RegisterMapping {
NUM_ALL_VGPRS = SQ_MAX_PGM_VGPRS + NUM_EXTRA_VGPRS, // Where SGPR starts.
};
+// Enumerate different types of result-returning VMEM operations. Although
+// s_waitcnt orders them all with a single vmcnt counter, in the absence of
+// s_waitcnt only instructions of the same VmemType are guaranteed to write
+// their results in order -- so there is no need to insert an s_waitcnt between
+// two instructions of the same type that write the same vgpr.
+enum VmemType {
+ // BUF instructions and MIMG instructions without a sampler.
+ VMEM_NOSAMPLER,
+ // MIMG instructions with a sampler.
+ VMEM_SAMPLER,
+};
+
+VmemType getVmemType(const MachineInstr &Inst) {
+ assert(SIInstrInfo::isVMEM(Inst));
+ if (!SIInstrInfo::isMIMG(Inst))
+ return VMEM_NOSAMPLER;
+ const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Inst.getOpcode());
+ return AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler
+ ? VMEM_SAMPLER
+ : VMEM_NOSAMPLER;
+}
+
void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
switch (T) {
case VM_CNT:
@@ -195,12 +215,9 @@ void addWait(AMDGPU::Waitcnt &Wait, InstCounterType T, unsigned Count) {
// "s_waitcnt 0" before use.
class WaitcntBrackets {
public:
- WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {
- for (auto T : inst_counter_types())
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- }
+ WaitcntBrackets(const GCNSubtarget *SubTarget) : ST(SubTarget) {}
- static uint32_t getWaitCountMax(InstCounterType T) {
+ static unsigned getWaitCountMax(InstCounterType T) {
switch (T) {
case VM_CNT:
return HardwareLimits.VmcntMax;
@@ -216,17 +233,13 @@ public:
return 0;
}
- uint32_t getScoreLB(InstCounterType T) const {
+ unsigned getScoreLB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return 0;
return ScoreLBs[T];
}
- uint32_t getScoreUB(InstCounterType T) const {
+ unsigned getScoreUB(InstCounterType T) const {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return 0;
return ScoreUBs[T];
}
@@ -242,7 +255,7 @@ public:
return EXP_CNT;
}
- uint32_t getRegScore(int GprNo, InstCounterType T) {
+ unsigned getRegScore(int GprNo, InstCounterType T) {
if (GprNo < NUM_ALL_VGPRS) {
return VgprScores[T][GprNo];
}
@@ -250,30 +263,16 @@ public:
return SgprScores[GprNo - NUM_ALL_VGPRS];
}
- void clear() {
- memset(ScoreLBs, 0, sizeof(ScoreLBs));
- memset(ScoreUBs, 0, sizeof(ScoreUBs));
- PendingEvents = 0;
- memset(MixedPendingEvents, 0, sizeof(MixedPendingEvents));
- for (auto T : inst_counter_types())
- memset(VgprScores[T], 0, sizeof(VgprScores[T]));
- memset(SgprScores, 0, sizeof(SgprScores));
- }
-
bool merge(const WaitcntBrackets &Other);
RegInterval getRegInterval(const MachineInstr *MI, const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
- const SIRegisterInfo *TRI, unsigned OpNo,
- bool Def) const;
-
- int32_t getMaxVGPR() const { return VgprUB; }
- int32_t getMaxSGPR() const { return SgprUB; }
+ const SIRegisterInfo *TRI, unsigned OpNo) const;
bool counterOutOfOrder(InstCounterType T) const;
bool simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const;
bool simplifyWaitcnt(InstCounterType T, unsigned &Count) const;
- void determineWait(InstCounterType T, uint32_t ScoreToWait,
+ void determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const;
void applyWaitcnt(const AMDGPU::Waitcnt &Wait);
void applyWaitcnt(InstCounterType T, unsigned Count);
@@ -286,6 +285,12 @@ public:
return PendingEvents & (1 << E);
}
+ bool hasMixedPendingEvents(InstCounterType T) const {
+ unsigned Events = PendingEvents & WaitEventMaskForInst[T];
+ // Return true if more than one bit is set in Events.
+ return Events & (Events - 1);
+ }
+
bool hasPendingFlat() const {
return ((LastFlat[LGKM_CNT] > ScoreLBs[LGKM_CNT] &&
LastFlat[LGKM_CNT] <= ScoreUBs[LGKM_CNT]) ||
@@ -298,71 +303,77 @@ public:
LastFlat[LGKM_CNT] = ScoreUBs[LGKM_CNT];
}
+ // Return true if there might be pending writes to the specified vgpr by VMEM
+ // instructions with types different from V.
+ bool hasOtherPendingVmemTypes(int GprNo, VmemType V) const {
+ assert(GprNo < NUM_ALL_VGPRS);
+ return VgprVmemTypes[GprNo] & ~(1 << V);
+ }
+
+ void clearVgprVmemTypes(int GprNo) {
+ assert(GprNo < NUM_ALL_VGPRS);
+ VgprVmemTypes[GprNo] = 0;
+ }
+
void print(raw_ostream &);
void dump() { print(dbgs()); }
private:
struct MergeInfo {
- uint32_t OldLB;
- uint32_t OtherLB;
- uint32_t MyShift;
- uint32_t OtherShift;
+ unsigned OldLB;
+ unsigned OtherLB;
+ unsigned MyShift;
+ unsigned OtherShift;
};
- static bool mergeScore(const MergeInfo &M, uint32_t &Score,
- uint32_t OtherScore);
+ static bool mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore);
- void setScoreLB(InstCounterType T, uint32_t Val) {
+ void setScoreLB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
ScoreLBs[T] = Val;
}
- void setScoreUB(InstCounterType T, uint32_t Val) {
+ void setScoreUB(InstCounterType T, unsigned Val) {
assert(T < NUM_INST_CNTS);
- if (T >= NUM_INST_CNTS)
- return;
ScoreUBs[T] = Val;
if (T == EXP_CNT) {
- uint32_t UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
+ unsigned UB = ScoreUBs[T] - getWaitCountMax(EXP_CNT);
if (ScoreLBs[T] < UB && UB < ScoreUBs[T])
ScoreLBs[T] = UB;
}
}
- void setRegScore(int GprNo, InstCounterType T, uint32_t Val) {
+ void setRegScore(int GprNo, InstCounterType T, unsigned Val) {
if (GprNo < NUM_ALL_VGPRS) {
- if (GprNo > VgprUB) {
- VgprUB = GprNo;
- }
+ VgprUB = std::max(VgprUB, GprNo);
VgprScores[T][GprNo] = Val;
} else {
assert(T == LGKM_CNT);
- if (GprNo - NUM_ALL_VGPRS > SgprUB) {
- SgprUB = GprNo - NUM_ALL_VGPRS;
- }
+ SgprUB = std::max(SgprUB, GprNo - NUM_ALL_VGPRS);
SgprScores[GprNo - NUM_ALL_VGPRS] = Val;
}
}
void setExpScore(const MachineInstr *MI, const SIInstrInfo *TII,
const SIRegisterInfo *TRI, const MachineRegisterInfo *MRI,
- unsigned OpNo, uint32_t Val);
+ unsigned OpNo, unsigned Val);
const GCNSubtarget *ST = nullptr;
- uint32_t ScoreLBs[NUM_INST_CNTS] = {0};
- uint32_t ScoreUBs[NUM_INST_CNTS] = {0};
- uint32_t PendingEvents = 0;
- bool MixedPendingEvents[NUM_INST_CNTS] = {false};
+ unsigned ScoreLBs[NUM_INST_CNTS] = {0};
+ unsigned ScoreUBs[NUM_INST_CNTS] = {0};
+ unsigned PendingEvents = 0;
// Remember the last flat memory operation.
- uint32_t LastFlat[NUM_INST_CNTS] = {0};
+ unsigned LastFlat[NUM_INST_CNTS] = {0};
// wait_cnt scores for every vgpr.
// Keep track of the VgprUB and SgprUB to make merge at join efficient.
- int32_t VgprUB = 0;
- int32_t SgprUB = 0;
- uint32_t VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS];
+ int VgprUB = -1;
+ int SgprUB = -1;
+ unsigned VgprScores[NUM_INST_CNTS][NUM_ALL_VGPRS] = {{0}};
// Wait cnt scores for every sgpr, only lgkmcnt is relevant.
- uint32_t SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ unsigned SgprScores[SQ_MAX_PGM_SGPRS] = {0};
+ // Bitmask of the VmemTypes of VMEM instructions that might have a pending
+ // write to each vgpr.
+ unsigned char VgprVmemTypes[NUM_ALL_VGPRS] = {0};
};
class SIInsertWaitcnts : public MachineFunctionPass {
@@ -385,8 +396,7 @@ private:
explicit BlockInfo(MachineBasicBlock *MBB) : MBB(MBB) {}
};
- std::vector<BlockInfo> BlockInfos; // by reverse post-order traversal index
- DenseMap<MachineBasicBlock *, unsigned> RpotIdxMap;
+ MapVector<MachineBasicBlock *, BlockInfo> BlockInfos;
// ForceEmitZeroWaitcnts: force all waitcnts insts to be s_waitcnt 0
// because of amdgpu-waitcnt-forcezero flag
@@ -464,10 +474,10 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
const SIInstrInfo *TII,
const MachineRegisterInfo *MRI,
const SIRegisterInfo *TRI,
- unsigned OpNo, bool Def) const {
+ unsigned OpNo) const {
const MachineOperand &Op = MI->getOperand(OpNo);
- if (!Op.isReg() || !TRI->isInAllocatableClass(Op.getReg()) ||
- (Def && !Op.isDef()) || TRI->isAGPR(*MRI, Op.getReg()))
+ assert(Op.isReg());
+ if (!TRI->isInAllocatableClass(Op.getReg()) || TRI->isAGPR(*MRI, Op.getReg()))
return {-1, -1};
// A use via a PW operand does not need a waitcnt.
@@ -475,29 +485,27 @@ RegInterval WaitcntBrackets::getRegInterval(const MachineInstr *MI,
assert(!Op.getSubReg() || !Op.isUndef());
RegInterval Result;
- const MachineRegisterInfo &MRIA = *MRI;
unsigned Reg = TRI->getEncodingValue(Op.getReg());
- if (TRI->isVGPR(MRIA, Op.getReg())) {
+ if (TRI->isVGPR(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.VGPR0 && Reg <= RegisterEncoding.VGPRL);
Result.first = Reg - RegisterEncoding.VGPR0;
assert(Result.first >= 0 && Result.first < SQ_MAX_PGM_VGPRS);
- } else if (TRI->isSGPRReg(MRIA, Op.getReg())) {
+ } else if (TRI->isSGPRReg(*MRI, Op.getReg())) {
assert(Reg >= RegisterEncoding.SGPR0 && Reg < SQ_MAX_PGM_SGPRS);
Result.first = Reg - RegisterEncoding.SGPR0 + NUM_ALL_VGPRS;
assert(Result.first >= NUM_ALL_VGPRS &&
Result.first < SQ_MAX_PGM_SGPRS + NUM_ALL_VGPRS);
}
// TODO: Handle TTMP
- // else if (TRI->isTTMP(MRIA, Reg.getReg())) ...
+ // else if (TRI->isTTMP(*MRI, Reg.getReg())) ...
else
return {-1, -1};
- const MachineInstr &MIA = *MI;
- const TargetRegisterClass *RC = TII->getOpRegClass(MIA, OpNo);
+ const TargetRegisterClass *RC = TII->getOpRegClass(*MI, OpNo);
unsigned Size = TRI->getRegSizeInBits(*RC);
- Result.second = Result.first + (Size / 32);
+ Result.second = Result.first + ((Size + 16) / 32);
return Result;
}
@@ -506,13 +514,10 @@ void WaitcntBrackets::setExpScore(const MachineInstr *MI,
const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI, unsigned OpNo,
- uint32_t Val) {
- RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo, false);
- LLVM_DEBUG({
- const MachineOperand &Opnd = MI->getOperand(OpNo);
- assert(TRI->isVGPR(*MRI, Opnd.getReg()));
- });
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ unsigned Val) {
+ RegInterval Interval = getRegInterval(MI, TII, MRI, TRI, OpNo);
+ assert(TRI->isVGPR(*MRI, MI->getOperand(OpNo).getReg()));
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, EXP_CNT, Val);
}
}
@@ -521,19 +526,14 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
const SIRegisterInfo *TRI,
const MachineRegisterInfo *MRI,
WaitEventType E, MachineInstr &Inst) {
- const MachineRegisterInfo &MRIA = *MRI;
InstCounterType T = eventCounter(E);
- uint32_t CurrScore = getScoreUB(T) + 1;
+ unsigned CurrScore = getScoreUB(T) + 1;
if (CurrScore == 0)
report_fatal_error("InsertWaitcnt score wraparound");
// PendingEvents and ScoreUB need to be updated regardless of whether this
// event changes the score of a register or not.
// Examples include vm_cnt when buffer-store or lgkm_cnt when send-message.
- if (!hasPendingEvent(E)) {
- if (PendingEvents & WaitEventMaskForInst[T])
- MixedPendingEvents[T] = true;
- PendingEvents |= 1 << E;
- }
+ PendingEvents |= 1 << E;
setScoreUB(T, CurrScore);
if (T == EXP_CNT) {
@@ -574,7 +574,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() != AMDGPU::DS_ORDERED_COUNT) {
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
const MachineOperand &Op = Inst.getOperand(I);
- if (Op.isReg() && !Op.isDef() && TRI->isVGPR(MRIA, Op.getReg())) {
+ if (Op.isReg() && !Op.isDef() && TRI->isVGPR(*MRI, Op.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -622,7 +622,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &DefMO = Inst.getOperand(I);
if (DefMO.isReg() && DefMO.isDef() &&
- TRI->isVGPR(MRIA, DefMO.getReg())) {
+ TRI->isVGPR(*MRI, DefMO.getReg())) {
setRegScore(TRI->getEncodingValue(DefMO.getReg()), EXP_CNT,
CurrScore);
}
@@ -630,7 +630,7 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
}
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
MachineOperand &MO = Inst.getOperand(I);
- if (MO.isReg() && !MO.isDef() && TRI->isVGPR(MRIA, MO.getReg())) {
+ if (MO.isReg() && !MO.isDef() && TRI->isVGPR(*MRI, MO.getReg())) {
setExpScore(&Inst, TII, TRI, MRI, I, CurrScore);
}
}
@@ -641,8 +641,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
Inst.getOpcode() == AMDGPU::BUFFER_STORE_DWORDX4) {
MachineOperand *MO = TII->getNamedOperand(Inst, AMDGPU::OpName::data);
unsigned OpNo;//TODO: find the OpNo for this operand;
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo, false);
- for (signed RegNo = Interval.first; RegNo < Interval.second;
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, OpNo);
+ for (int RegNo = Interval.first; RegNo < Interval.second;
++RegNo) {
setRegScore(RegNo + NUM_ALL_VGPRS, t, CurrScore);
}
@@ -650,10 +650,20 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
} else {
// Match the score to the destination registers.
for (unsigned I = 0, E = Inst.getNumOperands(); I != E; ++I) {
- RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I, true);
- if (T == VM_CNT && Interval.first >= NUM_ALL_VGPRS)
+ auto &Op = Inst.getOperand(I);
+ if (!Op.isReg() || !Op.isDef())
continue;
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ RegInterval Interval = getRegInterval(&Inst, TII, MRI, TRI, I);
+ if (T == VM_CNT) {
+ if (Interval.first >= NUM_ALL_VGPRS)
+ continue;
+ if (SIInstrInfo::isVMEM(Inst)) {
+ VmemType V = getVmemType(Inst);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo)
+ VgprVmemTypes[RegNo] |= 1 << V;
+ }
+ }
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
setRegScore(RegNo, T, CurrScore);
}
}
@@ -666,8 +676,8 @@ void WaitcntBrackets::updateByEvent(const SIInstrInfo *TII,
void WaitcntBrackets::print(raw_ostream &OS) {
OS << '\n';
for (auto T : inst_counter_types()) {
- uint32_t LB = getScoreLB(T);
- uint32_t UB = getScoreUB(T);
+ unsigned LB = getScoreLB(T);
+ unsigned UB = getScoreUB(T);
switch (T) {
case VM_CNT:
@@ -689,11 +699,11 @@ void WaitcntBrackets::print(raw_ostream &OS) {
if (LB < UB) {
// Print vgpr scores.
- for (int J = 0; J <= getMaxVGPR(); J++) {
- uint32_t RegScore = getRegScore(J, T);
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned RegScore = getRegScore(J, T);
if (RegScore <= LB)
continue;
- uint32_t RelScore = RegScore - LB - 1;
+ unsigned RelScore = RegScore - LB - 1;
if (J < SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS) {
OS << RelScore << ":v" << J << " ";
} else {
@@ -702,11 +712,11 @@ void WaitcntBrackets::print(raw_ostream &OS) {
}
// Also need to print sgpr scores for lgkm_cnt.
if (T == LGKM_CNT) {
- for (int J = 0; J <= getMaxSGPR(); J++) {
- uint32_t RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
+ for (int J = 0; J <= SgprUB; J++) {
+ unsigned RegScore = getRegScore(J + NUM_ALL_VGPRS, LGKM_CNT);
if (RegScore <= LB)
continue;
- uint32_t RelScore = RegScore - LB - 1;
+ unsigned RelScore = RegScore - LB - 1;
OS << RelScore << ":s" << J << " ";
}
}
@@ -727,8 +737,8 @@ bool WaitcntBrackets::simplifyWaitcnt(AMDGPU::Waitcnt &Wait) const {
bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
unsigned &Count) const {
- const uint32_t LB = getScoreLB(T);
- const uint32_t UB = getScoreUB(T);
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
if (Count < UB && UB - Count > LB)
return true;
@@ -736,12 +746,12 @@ bool WaitcntBrackets::simplifyWaitcnt(InstCounterType T,
return false;
}
-void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
+void WaitcntBrackets::determineWait(InstCounterType T, unsigned ScoreToWait,
AMDGPU::Waitcnt &Wait) const {
// If the score of src_operand falls within the bracket, we need an
// s_waitcnt instruction.
- const uint32_t LB = getScoreLB(T);
- const uint32_t UB = getScoreUB(T);
+ const unsigned LB = getScoreLB(T);
+ const unsigned UB = getScoreUB(T);
if ((UB >= ScoreToWait) && (ScoreToWait > LB)) {
if ((T == VM_CNT || T == LGKM_CNT) &&
hasPendingFlat() &&
@@ -758,7 +768,7 @@ void WaitcntBrackets::determineWait(InstCounterType T, uint32_t ScoreToWait,
} else {
// If a counter has been maxed out avoid overflow by waiting for
// MAX(CounterType) - 1 instead.
- uint32_t NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
+ unsigned NeededWait = std::min(UB - ScoreToWait, getWaitCountMax(T) - 1);
addWait(Wait, T, NeededWait);
}
}
@@ -772,7 +782,7 @@ void WaitcntBrackets::applyWaitcnt(const AMDGPU::Waitcnt &Wait) {
}
void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
- const uint32_t UB = getScoreUB(T);
+ const unsigned UB = getScoreUB(T);
if (Count >= UB)
return;
if (Count != 0) {
@@ -781,7 +791,6 @@ void WaitcntBrackets::applyWaitcnt(InstCounterType T, unsigned Count) {
setScoreLB(T, std::max(getScoreLB(T), UB - Count));
} else {
setScoreLB(T, UB);
- MixedPendingEvents[T] = false;
PendingEvents &= ~WaitEventMaskForInst[T];
}
}
@@ -792,7 +801,7 @@ bool WaitcntBrackets::counterOutOfOrder(InstCounterType T) const {
// Scalar memory read always can go out of order.
if (T == LGKM_CNT && hasPendingEvent(SMEM_ACCESS))
return true;
- return MixedPendingEvents[T];
+ return hasMixedPendingEvents(T);
}
INITIALIZE_PASS_BEGIN(SIInsertWaitcnts, DEBUG_TYPE, "SI Insert Waitcnts", false,
@@ -954,10 +963,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int CallAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::src0);
- RegInterval CallAddrOpInterval = ScoreBrackets.getRegInterval(
- &MI, TII, MRI, TRI, CallAddrOpIdx, false);
+ RegInterval CallAddrOpInterval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, CallAddrOpIdx);
- for (signed RegNo = CallAddrOpInterval.first;
+ for (int RegNo = CallAddrOpInterval.first;
RegNo < CallAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
@@ -965,10 +974,10 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
int RtnAddrOpIdx =
AMDGPU::getNamedOperandIdx(MI.getOpcode(), AMDGPU::OpName::dst);
if (RtnAddrOpIdx != -1) {
- RegInterval RtnAddrOpInterval = ScoreBrackets.getRegInterval(
- &MI, TII, MRI, TRI, RtnAddrOpIdx, false);
+ RegInterval RtnAddrOpInterval =
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, RtnAddrOpIdx);
- for (signed RegNo = RtnAddrOpInterval.first;
+ for (int RegNo = RtnAddrOpInterval.first;
RegNo < RtnAddrOpInterval.second; ++RegNo)
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
@@ -982,7 +991,19 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// emitted.
// If the source operand was defined by a load, add the s_waitcnt
// instruction.
+ //
+ // Two cases are handled for destination operands:
+ // 1) If the destination operand was defined by a load, add the s_waitcnt
+ // instruction to guarantee the right WAW order.
+ // 2) If a destination operand was used by a recent export/store
+ // instruction, add s_waitcnt on exp_cnt to guarantee the WAR order.
for (const MachineMemOperand *Memop : MI.memoperands()) {
+ const Value *Ptr = Memop->getValue();
+ if (Memop->isStore() && SLoadAddresses.count(Ptr)) {
+ addWait(Wait, LGKM_CNT, 0);
+ if (PDT->dominates(MI.getParent(), SLoadAddresses.find(Ptr)->second))
+ SLoadAddresses.erase(Ptr);
+ }
unsigned AS = Memop->getAddrSpace();
if (AS != AMDGPUAS::LOCAL_ADDRESS)
continue;
@@ -990,67 +1011,41 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
// VM_CNT is only relevant to vgpr or LDS.
ScoreBrackets.determineWait(
VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
-
- for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- const MachineOperand &Op = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
- RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, false);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Op.getReg())) {
- // VM_CNT is only relevant to vgpr or LDS.
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- }
- ScoreBrackets.determineWait(
- LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
- }
- }
- // End of for loop that looks at all source operands to decide vm_wait_cnt
- // and lgk_wait_cnt.
-
- // Two cases are handled for destination operands:
- // 1) If the destination operand was defined by a load, add the s_waitcnt
- // instruction to guarantee the right WAW order.
- // 2) If a destination operand that was used by a recent export/store ins,
- // add s_waitcnt on exp_cnt to guarantee the WAR order.
- if (MI.mayStore()) {
- // FIXME: Should not be relying on memoperands.
- for (const MachineMemOperand *Memop : MI.memoperands()) {
- const Value *Ptr = Memop->getValue();
- if (SLoadAddresses.count(Ptr)) {
- addWait(Wait, LGKM_CNT, 0);
- if (PDT->dominates(MI.getParent(),
- SLoadAddresses.find(Ptr)->second))
- SLoadAddresses.erase(Ptr);
- }
- unsigned AS = Memop->getAddrSpace();
- if (AS != AMDGPUAS::LOCAL_ADDRESS)
- continue;
- unsigned RegNo = SQ_MAX_PGM_VGPRS + EXTRA_VGPR_LDS;
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ if (Memop->isStore()) {
ScoreBrackets.determineWait(
EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
}
}
+
+ // Loop over use and def operands.
for (unsigned I = 0, E = MI.getNumOperands(); I != E; ++I) {
- MachineOperand &Def = MI.getOperand(I);
- const MachineRegisterInfo &MRIA = *MRI;
+ MachineOperand &Op = MI.getOperand(I);
+ if (!Op.isReg())
+ continue;
RegInterval Interval =
- ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I, true);
- for (signed RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
- if (TRI->isVGPR(MRIA, Def.getReg())) {
- ScoreBrackets.determineWait(
- VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
- ScoreBrackets.determineWait(
- EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ ScoreBrackets.getRegInterval(&MI, TII, MRI, TRI, I);
+ for (int RegNo = Interval.first; RegNo < Interval.second; ++RegNo) {
+ if (TRI->isVGPR(*MRI, Op.getReg())) {
+ // RAW always needs an s_waitcnt. WAW needs an s_waitcnt unless the
+ // previous write and this write are the same type of VMEM
+ // instruction, in which case they're guaranteed to write their
+ // results in order anyway.
+ if (Op.isUse() || !SIInstrInfo::isVMEM(MI) ||
+ ScoreBrackets.hasOtherPendingVmemTypes(RegNo,
+ getVmemType(MI))) {
+ ScoreBrackets.determineWait(
+ VM_CNT, ScoreBrackets.getRegScore(RegNo, VM_CNT), Wait);
+ ScoreBrackets.clearVgprVmemTypes(RegNo);
+ }
+ if (Op.isDef()) {
+ ScoreBrackets.determineWait(
+ EXP_CNT, ScoreBrackets.getRegScore(RegNo, EXP_CNT), Wait);
+ }
}
ScoreBrackets.determineWait(
LGKM_CNT, ScoreBrackets.getRegScore(RegNo, LGKM_CNT), Wait);
}
- } // End of for loop that looks at all dest operands.
+ }
}
}
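A simplified sketch (hypothetical helper, not from the patch) of the RAW/WAW decision the operand loop above implements for VM_CNT: reads always wait, and writes only wait when a different kind of VMEM write to the same VGPR is still pending.

// Illustration only; names and signature are invented for this sketch.
static bool needsVmcntWait(bool IsUse, bool IsVmemInst,
                           unsigned PendingVmemTypeMask,
                           unsigned ThisVmemType) {
  if (IsUse)
    return true; // RAW always needs the wait
  if (!IsVmemInst)
    return true; // a non-VMEM def overwriting a pending VMEM result
  // WAW: wait only if some other VMEM type still has a pending write.
  return (PendingVmemTypeMask & ~(1u << ThisVmemType)) != 0;
}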
@@ -1154,7 +1149,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
}
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *II << '\n');
if (!Wait.hasWait())
@@ -1171,7 +1166,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *SWaitInst << '\n');
}
@@ -1187,7 +1182,7 @@ bool SIInsertWaitcnts::generateWaitcntInstBefore(
Modified = true;
LLVM_DEBUG(dbgs() << "generateWaitcntInstBefore\n"
- << "Old Instr: " << MI << '\n'
+ << "Old Instr: " << MI
<< "New Instr: " << *SWaitInst << '\n');
}
@@ -1303,10 +1298,10 @@ void SIInsertWaitcnts::updateEventWaitcntAfter(MachineInstr &Inst,
}
}
-bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
- uint32_t OtherScore) {
- uint32_t MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
- uint32_t OtherShifted =
+bool WaitcntBrackets::mergeScore(const MergeInfo &M, unsigned &Score,
+ unsigned OtherScore) {
+ unsigned MyShifted = Score <= M.OldLB ? 0 : Score + M.MyShift;
+ unsigned OtherShifted =
OtherScore <= M.OtherLB ? 0 : OtherScore + M.OtherShift;
Score = std::max(MyShifted, OtherShifted);
return OtherShifted > MyShifted;
@@ -1320,44 +1315,50 @@ bool WaitcntBrackets::mergeScore(const MergeInfo &M, uint32_t &Score,
bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
bool StrictDom = false;
+ VgprUB = std::max(VgprUB, Other.VgprUB);
+ SgprUB = std::max(SgprUB, Other.SgprUB);
+
for (auto T : inst_counter_types()) {
// Merge event flags for this counter
const bool OldOutOfOrder = counterOutOfOrder(T);
- const uint32_t OldEvents = PendingEvents & WaitEventMaskForInst[T];
- const uint32_t OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OldEvents = PendingEvents & WaitEventMaskForInst[T];
+ const unsigned OtherEvents = Other.PendingEvents & WaitEventMaskForInst[T];
if (OtherEvents & ~OldEvents)
StrictDom = true;
- if (Other.MixedPendingEvents[T] ||
- (OldEvents && OtherEvents && OldEvents != OtherEvents))
- MixedPendingEvents[T] = true;
PendingEvents |= OtherEvents;
// Merge scores for this counter
- const uint32_t MyPending = ScoreUBs[T] - ScoreLBs[T];
- const uint32_t OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned MyPending = ScoreUBs[T] - ScoreLBs[T];
+ const unsigned OtherPending = Other.ScoreUBs[T] - Other.ScoreLBs[T];
+ const unsigned NewUB = ScoreLBs[T] + std::max(MyPending, OtherPending);
+ if (NewUB < ScoreLBs[T])
+ report_fatal_error("waitcnt score overflow");
+
MergeInfo M;
M.OldLB = ScoreLBs[T];
M.OtherLB = Other.ScoreLBs[T];
- M.MyShift = OtherPending > MyPending ? OtherPending - MyPending : 0;
- M.OtherShift = ScoreUBs[T] - Other.ScoreUBs[T] + M.MyShift;
+ M.MyShift = NewUB - ScoreUBs[T];
+ M.OtherShift = NewUB - Other.ScoreUBs[T];
- const uint32_t NewUB = ScoreUBs[T] + M.MyShift;
- if (NewUB < ScoreUBs[T])
- report_fatal_error("waitcnt score overflow");
ScoreUBs[T] = NewUB;
- ScoreLBs[T] = std::min(M.OldLB + M.MyShift, M.OtherLB + M.OtherShift);
StrictDom |= mergeScore(M, LastFlat[T], Other.LastFlat[T]);
bool RegStrictDom = false;
- for (int J = 0, E = std::max(getMaxVGPR(), Other.getMaxVGPR()) + 1; J != E;
- J++) {
+ for (int J = 0; J <= VgprUB; J++) {
RegStrictDom |= mergeScore(M, VgprScores[T][J], Other.VgprScores[T][J]);
}
+ if (T == VM_CNT) {
+ for (int J = 0; J <= VgprUB; J++) {
+ unsigned char NewVmemTypes = VgprVmemTypes[J] | Other.VgprVmemTypes[J];
+ RegStrictDom |= NewVmemTypes != VgprVmemTypes[J];
+ VgprVmemTypes[J] = NewVmemTypes;
+ }
+ }
+
if (T == LGKM_CNT) {
- for (int J = 0, E = std::max(getMaxSGPR(), Other.getMaxSGPR()) + 1;
- J != E; J++) {
+ for (int J = 0; J <= SgprUB; J++) {
RegStrictDom |= mergeScore(M, SgprScores[J], Other.SgprScores[J]);
}
}
@@ -1366,9 +1367,6 @@ bool WaitcntBrackets::merge(const WaitcntBrackets &Other) {
StrictDom = true;
}
- VgprUB = std::max(getMaxVGPR(), Other.getMaxVGPR());
- SgprUB = std::max(getMaxSGPR(), Other.getMaxSGPR());
-
return StrictDom;
}
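A small numeric example (illustration only, not from the patch) of the reworked shift computation in merge():

// Self:  LB = 10, UB = 13  -> 3 pending
// Other: LB =  5, UB =  9  -> 4 pending
// NewUB      = 10 + max(3, 4) = 14   (checked for overflow against the LB)
// MyShift    = 14 - 13 = 1,  OtherShift = 14 - 9 = 5
// A register score of 12 on this side shifts to 13, a score of 7 on the
// other side shifts to 12, and the merged score is max(13, 12) = 13.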
@@ -1383,6 +1381,10 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
ScoreBrackets.dump();
});
+ // Assume VCCZ is correct at basic block boundaries, unless and until we need
+ // to handle cases where that is not true.
+ bool VCCZCorrect = true;
+
// Walk over the instructions.
MachineInstr *OldWaitcntInstr = nullptr;
@@ -1402,13 +1404,26 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
continue;
}
- bool VCCZBugWorkAround = false;
+ // We might need to restore vccz to its correct value for either of two
+ // different reasons; see ST->hasReadVCCZBug() and
+ // ST->partialVCCWritesUpdateVCCZ().
+ bool RestoreVCCZ = false;
if (readsVCCZ(Inst)) {
- if (ScoreBrackets.getScoreLB(LGKM_CNT) <
- ScoreBrackets.getScoreUB(LGKM_CNT) &&
- ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
- if (ST->hasReadVCCZBug())
- VCCZBugWorkAround = true;
+ if (!VCCZCorrect)
+ RestoreVCCZ = true;
+ else if (ST->hasReadVCCZBug()) {
+ // There is a hardware bug on CI/SI where an SMRD instruction may corrupt
+ // the vccz bit, so when we detect that an instruction may read a
+ // corrupted vccz bit, we need to:
+ // 1. Insert s_waitcnt lgkm(0) to wait for all outstanding SMRD
+ // operations to complete.
+ // 2. Restore the correct value of vccz by writing the current value
+ // of vcc back to vcc.
+ if (ScoreBrackets.getScoreLB(LGKM_CNT) <
+ ScoreBrackets.getScoreUB(LGKM_CNT) &&
+ ScoreBrackets.hasPendingEvent(SMEM_ACCESS)) {
+ RestoreVCCZ = true;
+ }
}
}
@@ -1419,6 +1434,16 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
}
}
+ if (!ST->partialVCCWritesUpdateVCCZ()) {
+ // Up to gfx9, writes to vcc_lo and vcc_hi don't update vccz.
+ // Writes to vcc will fix it.
+ if (Inst.definesRegister(AMDGPU::VCC_LO) ||
+ Inst.definesRegister(AMDGPU::VCC_HI))
+ VCCZCorrect = false;
+ else if (Inst.definesRegister(AMDGPU::VCC))
+ VCCZCorrect = true;
+ }
+
// Generate an s_waitcnt instruction to be placed before
// cur_Inst, if needed.
Modified |= generateWaitcntInstBefore(Inst, ScoreBrackets, OldWaitcntInstr);
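A condensed sketch (hypothetical free function, not the pass itself) of the vccz bookkeeping introduced above, assuming the definesRegister() checks behave as written in the hunk:

static void trackVccz(bool PartialWritesUpdateVccz, bool DefsVccLoOrHi,
                      bool DefsFullVcc, bool &VCCZCorrect) {
  if (PartialWritesUpdateVccz)
    return;                  // nothing to track on newer targets
  if (DefsVccLoOrHi)
    VCCZCorrect = false;     // a vcc_lo/vcc_hi write leaves vccz stale
  else if (DefsFullVcc)
    VCCZCorrect = true;      // a full vcc write refreshes vccz
}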
@@ -1444,7 +1469,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
// TODO: Remove this work-around after fixing the scheduler and enable the
// assert above.
- if (VCCZBugWorkAround) {
+ if (RestoreVCCZ) {
// Restore the vccz bit. Any time a value is written to vcc, the vcc
// bit is updated, so we can restore the bit by reading the value of
// vcc and then writing it back to the register.
@@ -1452,6 +1477,7 @@ bool SIInsertWaitcnts::insertWaitcntInBlock(MachineFunction &MF,
TII->get(ST->isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64),
TRI->getVCC())
.addReg(TRI->getVCC());
+ VCCZCorrect = true;
Modified = true;
}
@@ -1479,29 +1505,23 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
HardwareLimits.LgkmcntMax = AMDGPU::getLgkmcntBitMask(IV);
HardwareLimits.VscntMax = ST->hasVscnt() ? 63 : 0;
- HardwareLimits.NumVGPRsMax = ST->getAddressableNumVGPRs();
- HardwareLimits.NumSGPRsMax = ST->getAddressableNumSGPRs();
- assert(HardwareLimits.NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
- assert(HardwareLimits.NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
+ unsigned NumVGPRsMax = ST->getAddressableNumVGPRs();
+ unsigned NumSGPRsMax = ST->getAddressableNumSGPRs();
+ assert(NumVGPRsMax <= SQ_MAX_PGM_VGPRS);
+ assert(NumSGPRsMax <= SQ_MAX_PGM_SGPRS);
RegisterEncoding.VGPR0 = TRI->getEncodingValue(AMDGPU::VGPR0);
- RegisterEncoding.VGPRL =
- RegisterEncoding.VGPR0 + HardwareLimits.NumVGPRsMax - 1;
+ RegisterEncoding.VGPRL = RegisterEncoding.VGPR0 + NumVGPRsMax - 1;
RegisterEncoding.SGPR0 = TRI->getEncodingValue(AMDGPU::SGPR0);
- RegisterEncoding.SGPRL =
- RegisterEncoding.SGPR0 + HardwareLimits.NumSGPRsMax - 1;
+ RegisterEncoding.SGPRL = RegisterEncoding.SGPR0 + NumSGPRsMax - 1;
TrackedWaitcntSet.clear();
- RpotIdxMap.clear();
BlockInfos.clear();
// Keep iterating over the blocks in reverse post order, inserting and
// updating s_waitcnt where needed, until a fix point is reached.
- for (MachineBasicBlock *MBB :
- ReversePostOrderTraversal<MachineFunction *>(&MF)) {
- RpotIdxMap[MBB] = BlockInfos.size();
- BlockInfos.emplace_back(MBB);
- }
+ for (auto *MBB : ReversePostOrderTraversal<MachineFunction *>(&MF))
+ BlockInfos.insert({MBB, BlockInfo(MBB)});
std::unique_ptr<WaitcntBrackets> Brackets;
bool Modified = false;
@@ -1509,12 +1529,12 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
do {
Repeat = false;
- for (BlockInfo &BI : BlockInfos) {
+ for (auto BII = BlockInfos.begin(), BIE = BlockInfos.end(); BII != BIE;
+ ++BII) {
+ BlockInfo &BI = BII->second;
if (!BI.Dirty)
continue;
- unsigned Idx = std::distance(&*BlockInfos.begin(), &BI);
-
if (BI.Incoming) {
if (!Brackets)
Brackets = std::make_unique<WaitcntBrackets>(*BI.Incoming);
@@ -1524,7 +1544,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (!Brackets)
Brackets = std::make_unique<WaitcntBrackets>(ST);
else
- Brackets->clear();
+ *Brackets = WaitcntBrackets(ST);
}
Modified |= insertWaitcntInBlock(MF, *BI.MBB, *Brackets);
@@ -1533,11 +1553,11 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
if (Brackets->hasPending()) {
BlockInfo *MoveBracketsToSucc = nullptr;
for (MachineBasicBlock *Succ : BI.MBB->successors()) {
- unsigned SuccIdx = RpotIdxMap[Succ];
- BlockInfo &SuccBI = BlockInfos[SuccIdx];
+ auto SuccBII = BlockInfos.find(Succ);
+ BlockInfo &SuccBI = SuccBII->second;
if (!SuccBI.Incoming) {
SuccBI.Dirty = true;
- if (SuccIdx <= Idx)
+ if (SuccBII <= BII)
Repeat = true;
if (!MoveBracketsToSucc) {
MoveBracketsToSucc = &SuccBI;
@@ -1546,7 +1566,7 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
}
} else if (SuccBI.Incoming->merge(*Brackets)) {
SuccBI.Dirty = true;
- if (SuccIdx <= Idx)
+ if (SuccBII <= BII)
Repeat = true;
}
}
@@ -1612,13 +1632,15 @@ bool SIInsertWaitcnts::runOnMachineFunction(MachineFunction &MF) {
// TODO: Could insert earlier and schedule more liberally with operations
// that only use caller preserved registers.
MachineBasicBlock &EntryBB = MF.front();
+ MachineBasicBlock::iterator I = EntryBB.begin();
+ for (MachineBasicBlock::iterator E = EntryBB.end();
+ I != E && (I->isPHI() || I->isMetaInstruction()); ++I)
+ ;
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT)).addImm(0);
if (ST->hasVscnt())
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(),
- TII->get(AMDGPU::S_WAITCNT_VSCNT))
- .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
- .addImm(0);
- BuildMI(EntryBB, EntryBB.getFirstNonPHI(), DebugLoc(), TII->get(AMDGPU::S_WAITCNT))
- .addImm(0);
+ BuildMI(EntryBB, I, DebugLoc(), TII->get(AMDGPU::S_WAITCNT_VSCNT))
+ .addReg(AMDGPU::SGPR_NULL, RegState::Undef)
+ .addImm(0);
Modified = true;
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrFormats.td b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
index 4dcbe92861f2..428c21c896d5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrFormats.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrFormats.td
@@ -114,6 +114,9 @@ class InstSI <dag outs, dag ins, string asm = "",
// FLAT_SCRATCH segment. Must be 0 for non-FLAT instructions.
field bit IsNonFlatSeg = 0;
+ // Reads the mode register, usually for FP environment.
+ field bit ReadsModeReg = 0;
+
// This bit indicates that this uses the floating point double precision
// rounding mode flags
field bit FPDPRounding = 0;
@@ -303,7 +306,7 @@ class MIMGe_gfx10 <bits<8> op> : MIMGe {
bits<3> dim;
bits<2> nsa;
bits<1> dlc;
- bits<1> a16 = 0; // TODO: this should be an operand
+ bits<1> a16;
let Inst{0} = op{7};
let Inst{2-1} = nsa;
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d53950ca4465..9af8ffedce0f 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -63,6 +63,8 @@
using namespace llvm;
+#define DEBUG_TYPE "si-instr-info"
+
#define GET_INSTRINFO_CTOR_DTOR
#include "AMDGPUGenInstrInfo.inc"
@@ -83,6 +85,12 @@ static cl::opt<unsigned>
BranchOffsetBits("amdgpu-s-branch-bits", cl::ReallyHidden, cl::init(16),
cl::desc("Restrict range of branch instructions (DEBUG)"));
+static cl::opt<bool> Fix16BitCopies(
+ "amdgpu-fix-16-bit-physreg-copies",
+ cl::desc("Fix copies between 32 and 16 bit registers by extending to 32 bit"),
+ cl::init(true),
+ cl::ReallyHidden);
+
SIInstrInfo::SIInstrInfo(const GCNSubtarget &ST)
: AMDGPUGenInstrInfo(AMDGPU::ADJCALLSTACKUP, AMDGPU::ADJCALLSTACKDOWN),
RI(ST), ST(ST) {
@@ -136,6 +144,8 @@ bool SIInstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
case AMDGPU::V_MOV_B32_e32:
case AMDGPU::V_MOV_B32_e64:
case AMDGPU::V_MOV_B64_PSEUDO:
+ case AMDGPU::V_ACCVGPR_READ_B32:
+ case AMDGPU::V_ACCVGPR_WRITE_B32:
// No implicit operands.
return MI.getNumOperands() == MI.getDesc().getNumOperands();
default:
@@ -258,43 +268,49 @@ static bool isStride64(unsigned Opc) {
}
}
-bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool SIInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
if (!LdSt.mayLoadOrStore())
return false;
unsigned Opc = LdSt.getOpcode();
+ OffsetIsScalable = false;
+ const MachineOperand *BaseOp, *OffsetOp;
+ int DataOpIdx;
if (isDS(LdSt)) {
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- if (OffsetImm) {
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
+ OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ if (OffsetOp) {
// Normal, single offset LDS instruction.
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
- // TODO: ds_consume/ds_append use M0 for the base address. Is it safe to
- // report that here?
- if (!BaseOp || !BaseOp->isReg())
+ if (!BaseOp) {
+ // DS_CONSUME/DS_APPEND use M0 for the base address.
+ // TODO: find the implicit use operand for M0 and use that as BaseOp?
+ return false;
+ }
+ BaseOps.push_back(BaseOp);
+ Offset = OffsetOp->getImm();
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ Width = getOpSize(LdSt, DataOpIdx);
+ } else {
+ // The 2 offset instructions use offset0 and offset1 instead. We can treat
+ // these as a load with a single offset if the 2 offsets are consecutive.
+ // We will use this for some partially aligned loads.
+ const MachineOperand *Offset0Op =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset0);
+ const MachineOperand *Offset1Op =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset1);
+
+ unsigned Offset0 = Offset0Op->getImm();
+ unsigned Offset1 = Offset1Op->getImm();
+ if (Offset0 + 1 != Offset1)
return false;
- Offset = OffsetImm->getImm();
-
- return true;
- }
-
- // The 2 offset instructions use offset0 and offset1 instead. We can treat
- // these as a load with a single offset if the 2 offsets are consecutive. We
- // will use this for some partially aligned loads.
- const MachineOperand *Offset0Imm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset0);
- const MachineOperand *Offset1Imm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset1);
-
- uint8_t Offset0 = Offset0Imm->getImm();
- uint8_t Offset1 = Offset1Imm->getImm();
-
- if (Offset1 > Offset0 && Offset1 - Offset0 == 1) {
// Each of these offsets is in element sized units, so we need to convert
// to bytes of the individual reads.
@@ -310,16 +326,20 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
if (isStride64(Opc))
EltSize *= 64;
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::addr);
- if (!BaseOp->isReg())
- return false;
-
+ BaseOps.push_back(BaseOp);
Offset = EltSize * Offset0;
-
- return true;
+ // Get appropriate operand(s), and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1) {
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data0);
+ Width = getOpSize(LdSt, DataOpIdx);
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::data1);
+ Width += getOpSize(LdSt, DataOpIdx);
+ } else {
+ Width = getOpSize(LdSt, DataOpIdx);
+ }
}
-
- return false;
+ return true;
}
if (isMUBUF(LdSt) || isMTBUF(LdSt)) {
@@ -339,59 +359,78 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
const MachineOperand *OffsetImm =
getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOp = SOffset;
+ BaseOps.push_back(RSrc);
+ BaseOps.push_back(SOffset);
Offset = OffsetImm->getImm();
- return true;
- }
-
- const MachineOperand *AddrReg = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (!AddrReg)
- return false;
+ } else {
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::srsrc);
+ if (!BaseOp) // e.g. BUFFER_WBINVL1_VOL
+ return false;
+ BaseOps.push_back(BaseOp);
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- BaseOp = AddrReg;
- Offset = OffsetImm->getImm();
- if (SOffset) // soffset can be an inline immediate.
- Offset += SOffset->getImm();
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
- if (!BaseOp->isReg())
- return false;
+ const MachineOperand *OffsetImm =
+ getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetImm->getImm();
+ if (SOffset) // soffset can be an inline immediate.
+ Offset += SOffset->getImm();
+ }
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
+ return true;
+ }
+ if (isMIMG(LdSt)) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ BaseOps.push_back(&LdSt.getOperand(SRsrcIdx));
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx >= 0) {
+ // GFX10 possible NSA encoding.
+ for (int I = VAddr0Idx; I < SRsrcIdx; ++I)
+ BaseOps.push_back(&LdSt.getOperand(I));
+ } else {
+ BaseOps.push_back(getNamedOperand(LdSt, AMDGPU::OpName::vaddr));
+ }
+ Offset = 0;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isSMRD(LdSt)) {
- const MachineOperand *OffsetImm =
- getNamedOperand(LdSt, AMDGPU::OpName::offset);
- if (!OffsetImm)
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
+ if (!BaseOp) // e.g. S_MEMTIME
return false;
-
- const MachineOperand *SBaseReg = getNamedOperand(LdSt, AMDGPU::OpName::sbase);
- BaseOp = SBaseReg;
- Offset = OffsetImm->getImm();
- if (!BaseOp->isReg())
- return false;
-
+ BaseOps.push_back(BaseOp);
+ OffsetOp = getNamedOperand(LdSt, AMDGPU::OpName::offset);
+ Offset = OffsetOp ? OffsetOp->getImm() : 0;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sdst);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
if (isFLAT(LdSt)) {
- const MachineOperand *VAddr = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
- if (VAddr) {
- // Can't analyze 2 offsets.
- if (getNamedOperand(LdSt, AMDGPU::OpName::saddr))
- return false;
-
- BaseOp = VAddr;
- } else {
- // scratch instructions have either vaddr or saddr.
- BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
- }
-
+ // Instructions have either vaddr or saddr or both.
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::vaddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
+ BaseOp = getNamedOperand(LdSt, AMDGPU::OpName::saddr);
+ if (BaseOp)
+ BaseOps.push_back(BaseOp);
Offset = getNamedOperand(LdSt, AMDGPU::OpName::offset)->getImm();
- if (!BaseOp->isReg())
- return false;
+ // Get appropriate operand, and compute width accordingly.
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdst);
+ if (DataOpIdx == -1)
+ DataOpIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vdata);
+ Width = getOpSize(LdSt, DataOpIdx);
return true;
}
@@ -399,15 +438,13 @@ bool SIInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
}
static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
- const MachineOperand &BaseOp1,
+ ArrayRef<const MachineOperand *> BaseOps1,
const MachineInstr &MI2,
- const MachineOperand &BaseOp2) {
- // Support only base operands with base registers.
- // Note: this could be extended to support FI operands.
- if (!BaseOp1.isReg() || !BaseOp2.isReg())
- return false;
-
- if (BaseOp1.isIdenticalTo(BaseOp2))
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ // Only examine the first "base" operand of each instruction, on the
+ // assumption that it represents the real base address of the memory access.
+ // Other operands are typically offsets or indices from this base address.
+ if (BaseOps1.front()->isIdenticalTo(*BaseOps2.front()))
return true;
if (!MI1.hasOneMemOperand() || !MI2.hasOneMemOperand())
@@ -433,62 +470,31 @@ static bool memOpsHaveSameBasePtr(const MachineInstr &MI1,
return Base1 == Base2;
}
-bool SIInstrInfo::shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const {
- const MachineInstr &FirstLdSt = *BaseOp1.getParent();
- const MachineInstr &SecondLdSt = *BaseOp2.getParent();
-
- if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOp1, SecondLdSt, BaseOp2))
- return false;
-
- const MachineOperand *FirstDst = nullptr;
- const MachineOperand *SecondDst = nullptr;
-
- if ((isMUBUF(FirstLdSt) && isMUBUF(SecondLdSt)) ||
- (isMTBUF(FirstLdSt) && isMTBUF(SecondLdSt)) ||
- (isFLAT(FirstLdSt) && isFLAT(SecondLdSt))) {
- const unsigned MaxGlobalLoadCluster = 6;
- if (NumLoads > MaxGlobalLoadCluster)
- return false;
-
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdata);
- if (!FirstDst)
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdata);
- if (!SecondDst)
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- } else if (isSMRD(FirstLdSt) && isSMRD(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::sdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::sdst);
- } else if (isDS(FirstLdSt) && isDS(SecondLdSt)) {
- FirstDst = getNamedOperand(FirstLdSt, AMDGPU::OpName::vdst);
- SecondDst = getNamedOperand(SecondLdSt, AMDGPU::OpName::vdst);
- }
-
- if (!FirstDst || !SecondDst)
+bool SIInstrInfo::shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads,
+ unsigned NumBytes) const {
+ // If the current mem ops do not share the same base pointer, they cannot
+ // be clustered.
+ assert(!BaseOps1.empty() && !BaseOps2.empty());
+ const MachineInstr &FirstLdSt = *BaseOps1.front()->getParent();
+ const MachineInstr &SecondLdSt = *BaseOps2.front()->getParent();
+ if (!memOpsHaveSameBasePtr(FirstLdSt, BaseOps1, SecondLdSt, BaseOps2))
return false;
- // Try to limit clustering based on the total number of bytes loaded
- // rather than the number of instructions. This is done to help reduce
- // register pressure. The method used is somewhat inexact, though,
- // because it assumes that all loads in the cluster will load the
- // same number of bytes as FirstLdSt.
-
- // The unit of this value is bytes.
- // FIXME: This needs finer tuning.
- unsigned LoadClusterThreshold = 16;
-
- const MachineRegisterInfo &MRI =
- FirstLdSt.getParent()->getParent()->getRegInfo();
-
- const Register Reg = FirstDst->getReg();
-
- const TargetRegisterClass *DstRC = Register::isVirtualRegister(Reg)
- ? MRI.getRegClass(Reg)
- : RI.getPhysRegClass(Reg);
-
- return (NumLoads * (RI.getRegSizeInBits(*DstRC) / 8)) <= LoadClusterThreshold;
+ // Compute the max cluster size from the average number of bytes clustered
+ // so far, and use it to decide whether the current mem ops can be clustered.
+ assert((NumLoads > 0) && (NumBytes > 0) && (NumBytes >= NumLoads) &&
+ "Invalid NumLoads/NumBytes values");
+ unsigned MaxNumLoads;
+ if (NumBytes <= 4 * NumLoads) {
+ // Loads are dword or smaller (on average).
+ MaxNumLoads = 5;
+ } else {
+ // Loads are bigger than a dword (on average).
+ MaxNumLoads = 4;
+ }
+ return NumLoads <= MaxNumLoads;
}
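A worked example (illustration only) of the new clustering threshold above:

// Mirrors the heuristic with invented names, for illustration.
static bool wouldCluster(unsigned NumLoads, unsigned NumBytes) {
  unsigned MaxNumLoads = (NumBytes <= 4 * NumLoads) ? 5 : 4;
  return NumLoads <= MaxNumLoads;
}
// wouldCluster(4, 16) -> true  (dword loads on average, limit is 5)
// wouldCluster(5, 40) -> false (8-byte loads on average, limit is 4)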
// FIXME: This behaves strangely. If, for example, you have 32 load + stores,
@@ -516,11 +522,10 @@ bool SIInstrInfo::shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1,
static void reportIllegalCopy(const SIInstrInfo *TII, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg,
- MCRegister SrcReg, bool KillSrc) {
+ MCRegister SrcReg, bool KillSrc,
+ const char *Msg = "illegal SGPR to VGPR copy") {
MachineFunction *MF = MBB.getParent();
- DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(),
- "illegal SGPR to VGPR copy",
- DL, DS_Error);
+ DiagnosticInfoUnsupported IllegalCopy(MF->getFunction(), Msg, DL, DS_Error);
LLVMContext &C = MF->getFunction().getContext();
C.diagnose(IllegalCopy);
@@ -534,6 +539,25 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MCRegister SrcReg, bool KillSrc) const {
const TargetRegisterClass *RC = RI.getPhysRegClass(DestReg);
+ // FIXME: This is hack to resolve copies between 16 bit and 32 bit
+ // registers until all patterns are fixed.
+ if (Fix16BitCopies &&
+ ((RI.getRegSizeInBits(*RC) == 16) ^
+ (RI.getRegSizeInBits(*RI.getPhysRegClass(SrcReg)) == 16))) {
+ MCRegister &RegToFix = (RI.getRegSizeInBits(*RC) == 16) ? DestReg : SrcReg;
+ MCRegister Super = RI.get32BitRegister(RegToFix);
+ assert(RI.getSubReg(Super, AMDGPU::lo16) == RegToFix);
+ RegToFix = Super;
+
+ if (DestReg == SrcReg) {
+ // Insert empty bundle since ExpandPostRA expects an instruction here.
+ BuildMI(MBB, MI, DL, get(AMDGPU::BUNDLE));
+ return;
+ }
+
+ RC = RI.getPhysRegClass(DestReg);
+ }
+
if (RC == &AMDGPU::VGPR_32RegClass) {
assert(AMDGPU::VGPR_32RegClass.contains(SrcReg) ||
AMDGPU::SReg_32RegClass.contains(SrcReg) ||
@@ -580,6 +604,13 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (RC == &AMDGPU::SReg_64RegClass) {
+ if (SrcReg == AMDGPU::SCC) {
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_CSELECT_B64), DestReg)
+ .addImm(1)
+ .addImm(0);
+ return;
+ }
+
if (DestReg == AMDGPU::VCC) {
if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B64), AMDGPU::VCC)
@@ -606,10 +637,18 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
if (DestReg == AMDGPU::SCC) {
+ // Copying 64-bit or 32-bit sources to SCC barely makes sense,
+ // but SelectionDAG emits such copies for i1 sources.
+ // TODO: Use S_BITCMP0_B32 instead and only consider the 0th bit.
+ if (AMDGPU::SReg_64RegClass.contains(SrcReg)) {
+ SrcReg = RI.getSubReg(SrcReg, AMDGPU::sub0);
+ }
assert(AMDGPU::SReg_32RegClass.contains(SrcReg));
+
BuildMI(MBB, MI, DL, get(AMDGPU::S_CMP_LG_U32))
- .addReg(SrcReg, getKillRegState(KillSrc))
- .addImm(0);
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+
return;
}
@@ -660,7 +699,7 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Registers in the sequence are allocated contiguously so we can just
// use register number to pick one of three round-robin temps.
unsigned RegNo = DestReg % 3;
- unsigned Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
+ Register Tmp = RS.scavengeRegister(&AMDGPU::VGPR_32RegClass, 0);
if (!Tmp)
report_fatal_error("Cannot scavenge VGPR to copy to AGPR");
RS.setRegUsed(Tmp);
@@ -685,6 +724,72 @@ void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
+ if (RI.getRegSizeInBits(*RC) == 16) {
+ assert(AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::VGPR_HI16RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(SrcReg));
+
+ bool IsSGPRDst = AMDGPU::SReg_LO16RegClass.contains(DestReg);
+ bool IsSGPRSrc = AMDGPU::SReg_LO16RegClass.contains(SrcReg);
+ bool IsAGPRDst = AMDGPU::AGPR_LO16RegClass.contains(DestReg);
+ bool IsAGPRSrc = AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ bool DstLow = AMDGPU::VGPR_LO16RegClass.contains(DestReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(DestReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(DestReg);
+ bool SrcLow = AMDGPU::VGPR_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::SReg_LO16RegClass.contains(SrcReg) ||
+ AMDGPU::AGPR_LO16RegClass.contains(SrcReg);
+ MCRegister NewDestReg = RI.get32BitRegister(DestReg);
+ MCRegister NewSrcReg = RI.get32BitRegister(SrcReg);
+
+ if (IsSGPRDst) {
+ if (!IsSGPRSrc) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc);
+ return;
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), NewDestReg)
+ .addReg(NewSrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ if (IsAGPRDst || IsAGPRSrc) {
+ if (!DstLow || !SrcLow) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
+ "Cannot use hi16 subreg with an AGPR!");
+ }
+
+ copyPhysReg(MBB, MI, DL, NewDestReg, NewSrcReg, KillSrc);
+ return;
+ }
+
+ if (IsSGPRSrc && !ST.hasSDWAScalar()) {
+ if (!DstLow || !SrcLow) {
+ reportIllegalCopy(this, MBB, MI, DL, DestReg, SrcReg, KillSrc,
+ "Cannot use hi16 subreg on VI!");
+ }
+
+ BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), NewDestReg)
+ .addReg(NewSrcReg, getKillRegState(KillSrc));
+ return;
+ }
+
+ auto MIB = BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_sdwa), NewDestReg)
+ .addImm(0) // src0_modifiers
+ .addReg(NewSrcReg)
+ .addImm(0) // clamp
+ .addImm(DstLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+ : AMDGPU::SDWA::SdwaSel::WORD_1)
+ .addImm(AMDGPU::SDWA::DstUnused::UNUSED_PRESERVE)
+ .addImm(SrcLow ? AMDGPU::SDWA::SdwaSel::WORD_0
+ : AMDGPU::SDWA::SdwaSel::WORD_1)
+ .addReg(NewDestReg, RegState::Implicit | RegState::Undef);
+ // First implicit operand is $exec.
+ MIB->tieOperands(0, MIB->getNumOperands() - 1);
+ return;
+ }
+
unsigned EltSize = 4;
unsigned Opcode = AMDGPU::V_MOV_B32_e32;
if (RI.isSGPRClass(RC)) {
@@ -806,7 +911,7 @@ void SIInstrInfo::materializeImmediate(MachineBasicBlock &MBB,
int64_t IdxValue = Idx == 0 ? Value : 0;
MachineInstrBuilder Builder = BuildMI(MBB, MI, DL,
- get(Opcode), RI.getSubReg(DestReg, Idx));
+ get(Opcode), RI.getSubReg(DestReg, SubIndices[Idx]));
Builder.addImm(IdxValue);
}
}
@@ -818,10 +923,10 @@ SIInstrInfo::getPreferredSelectRegClass(unsigned Size) const {
void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
+ const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg,
- unsigned FalseReg) const {
+ Register TrueReg,
+ Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
MachineFunction *MF = MBB.getParent();
const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
@@ -944,10 +1049,10 @@ void SIInstrInfo::insertVectorSelect(MachineBasicBlock &MBB,
}
}
-unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
+Register SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned SrcReg, int Value) const {
+ Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_EQ_I32_e64), Reg)
@@ -957,10 +1062,10 @@ unsigned SIInstrInfo::insertEQ(MachineBasicBlock *MBB,
return Reg;
}
-unsigned SIInstrInfo::insertNE(MachineBasicBlock *MBB,
+Register SIInstrInfo::insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned SrcReg, int Value) const {
+ Register SrcReg, int Value) const {
MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
Register Reg = MRI.createVirtualRegister(RI.getBoolRC());
BuildMI(*MBB, I, DL, get(AMDGPU::V_CMP_NE_I32_e64), Reg)
@@ -984,6 +1089,80 @@ unsigned SIInstrInfo::getMovOpcode(const TargetRegisterClass *DstRC) const {
return AMDGPU::COPY;
}
+static unsigned getIndirectVGPRWritePseudoOpc(unsigned VecSize) {
+ if (VecSize <= 32) // 4 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V1;
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V2;
+ if (VecSize <= 96) // 12 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V3;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V4;
+ if (VecSize <= 160) // 20 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V5;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V8;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V16;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::V_INDIRECT_REG_WRITE_B32_V32;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+static unsigned getIndirectSGPRWritePseudo32(unsigned VecSize) {
+ if (VecSize <= 32) // 4 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V1;
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V2;
+ if (VecSize <= 96) // 12 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V3;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V4;
+ if (VecSize <= 160) // 20 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V5;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V8;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V16;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B32_V32;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+static unsigned getIndirectSGPRWritePseudo64(unsigned VecSize) {
+ if (VecSize <= 64) // 8 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V1;
+ if (VecSize <= 128) // 16 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V2;
+ if (VecSize <= 256) // 32 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V4;
+ if (VecSize <= 512) // 64 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V8;
+ if (VecSize <= 1024) // 128 bytes
+ return AMDGPU::S_INDIRECT_REG_WRITE_B64_V16;
+
+ llvm_unreachable("unsupported size for IndirectRegWrite pseudos");
+}
+
+const MCInstrDesc &SIInstrInfo::getIndirectRegWritePseudo(
+ unsigned VecSize, unsigned EltSize, bool IsSGPR) const {
+ if (IsSGPR) {
+ switch (EltSize) {
+ case 32:
+ return get(getIndirectSGPRWritePseudo32(VecSize));
+ case 64:
+ return get(getIndirectSGPRWritePseudo64(VecSize));
+ default:
+ llvm_unreachable("invalid reg indexing elt size");
+ }
+ }
+
+ assert(EltSize == 32 && "invalid reg indexing elt size");
+ return get(getIndirectVGPRWritePseudoOpc(VecSize));
+}
+
static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
switch (Size) {
case 4:
@@ -996,6 +1175,8 @@ static unsigned getSGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S128_SAVE;
case 20:
return AMDGPU::SI_SPILL_S160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_S192_SAVE;
case 32:
return AMDGPU::SI_SPILL_S256_SAVE;
case 64:
@@ -1019,6 +1200,8 @@ static unsigned getVGPRSpillSaveOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V128_SAVE;
case 20:
return AMDGPU::SI_SPILL_V160_SAVE;
+ case 24:
+ return AMDGPU::SI_SPILL_V192_SAVE;
case 32:
return AMDGPU::SI_SPILL_V256_SAVE;
case 64:
@@ -1049,7 +1232,7 @@ static unsigned getAGPRSpillSaveOpcode(unsigned Size) {
void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
@@ -1058,18 +1241,18 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned Size = FrameInfo.getObjectSize(FrameIndex);
- unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- Size, Align);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, MachineMemOperand::MOStore, FrameInfo.getObjectSize(FrameIndex),
+ FrameInfo.getObjectAlign(FrameIndex));
unsigned SpillSize = TRI->getSpillSize(*RC);
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(SrcReg != AMDGPU::M0 && "m0 should not be spilled");
+ assert(SrcReg != AMDGPU::EXEC_LO && SrcReg != AMDGPU::EXEC_HI &&
+ SrcReg != AMDGPU::EXEC && "exec should not be spilled");
// We are only allowed to create one new instruction when spilling
// registers, so we need to use pseudo instruction for spilling SGPRs.
@@ -1079,7 +1262,7 @@ void SIInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
// to make sure we are using the correct register class.
if (Register::isVirtualRegister(SrcReg) && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
BuildMI(MBB, MI, DL, OpDesc)
@@ -1126,6 +1309,8 @@ static unsigned getSGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_S128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_S160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_S192_RESTORE;
case 32:
return AMDGPU::SI_SPILL_S256_RESTORE;
case 64:
@@ -1149,6 +1334,8 @@ static unsigned getVGPRSpillRestoreOpcode(unsigned Size) {
return AMDGPU::SI_SPILL_V128_RESTORE;
case 20:
return AMDGPU::SI_SPILL_V160_RESTORE;
+ case 24:
+ return AMDGPU::SI_SPILL_V192_RESTORE;
case 32:
return AMDGPU::SI_SPILL_V256_RESTORE;
case 64:
@@ -1179,33 +1366,34 @@ static unsigned getAGPRSpillRestoreOpcode(unsigned Size) {
void SIInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const DebugLoc &DL = MBB.findDebugLoc(MI);
- unsigned Align = FrameInfo.getObjectAlignment(FrameIndex);
- unsigned Size = FrameInfo.getObjectSize(FrameIndex);
unsigned SpillSize = TRI->getSpillSize(*RC);
MachinePointerInfo PtrInfo
= MachinePointerInfo::getFixedStack(*MF, FrameIndex);
MachineMemOperand *MMO = MF->getMachineMemOperand(
- PtrInfo, MachineMemOperand::MOLoad, Size, Align);
+ PtrInfo, MachineMemOperand::MOLoad, FrameInfo.getObjectSize(FrameIndex),
+ FrameInfo.getObjectAlign(FrameIndex));
if (RI.isSGPRClass(RC)) {
MFI->setHasSpilledSGPRs();
assert(DestReg != AMDGPU::M0 && "m0 should not be reloaded into");
+ assert(DestReg != AMDGPU::EXEC_LO && DestReg != AMDGPU::EXEC_HI &&
+ DestReg != AMDGPU::EXEC && "exec should not be spilled");
// FIXME: Maybe this should not include a memoperand because it will be
// lowered to non-memory instructions.
const MCInstrDesc &OpDesc = get(getSGPRSpillRestoreOpcode(SpillSize));
- if (Register::isVirtualRegister(DestReg) && SpillSize == 4) {
+ if (DestReg.isVirtual() && SpillSize == 4) {
MachineRegisterInfo &MRI = MF->getRegInfo();
- MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0RegClass);
+ MRI.constrainRegClass(DestReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
}
if (RI.spillSGPRToVGPR())
@@ -1244,7 +1432,7 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
unsigned WorkGroupSize = MFI->getMaxFlatWorkGroupSize();
unsigned WavefrontSize = ST.getWavefrontSize();
- unsigned TIDReg = MFI->getTIDReg();
+ Register TIDReg = MFI->getTIDReg();
if (!MFI->hasCalculatedTID()) {
MachineBasicBlock &Entry = MBB.getParent()->front();
MachineBasicBlock::iterator Insert = Entry.front();
@@ -1272,8 +1460,8 @@ unsigned SIInstrInfo::calculateLDSSpillAddress(
RS->enterBasicBlock(Entry);
// FIXME: Can we scavenge an SReg_64 and access the subregs?
- unsigned STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
- unsigned STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ Register STmp0 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
+ Register STmp1 = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, 0);
BuildMI(Entry, Insert, DL, get(AMDGPU::S_LOAD_DWORD_IMM), STmp0)
.addReg(InputPtrReg)
.addImm(SI::KernelInputOffsets::NGROUPS_Z);
@@ -1482,30 +1670,55 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent();
break;
}
- case AMDGPU::V_MOVRELD_B32_V1:
- case AMDGPU::V_MOVRELD_B32_V2:
- case AMDGPU::V_MOVRELD_B32_V4:
- case AMDGPU::V_MOVRELD_B32_V8:
- case AMDGPU::V_MOVRELD_B32_V16: {
- const MCInstrDesc &MovRelDesc = get(AMDGPU::V_MOVRELD_B32_e32);
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V1:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V2:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V3:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V4:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V5:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V8:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V16:
+ case AMDGPU::V_INDIRECT_REG_WRITE_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V3:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V5:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V16:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B32_V32:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V1:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V2:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V4:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V8:
+ case AMDGPU::S_INDIRECT_REG_WRITE_B64_V16: {
+ const TargetRegisterClass *EltRC = getOpRegClass(MI, 2);
+
+ unsigned Opc;
+ if (RI.hasVGPRs(EltRC)) {
+ Opc = ST.useVGPRIndexMode() ?
+ AMDGPU::V_MOV_B32_indirect : AMDGPU::V_MOVRELD_B32_e32;
+ } else {
+ Opc = RI.getRegSizeInBits(*EltRC) == 64 ?
+ AMDGPU::S_MOVRELD_B64 : AMDGPU::S_MOVRELD_B32;
+ }
+
+ const MCInstrDesc &OpDesc = get(Opc);
Register VecReg = MI.getOperand(0).getReg();
bool IsUndef = MI.getOperand(1).isUndef();
- unsigned SubReg = AMDGPU::sub0 + MI.getOperand(3).getImm();
+ unsigned SubReg = MI.getOperand(3).getImm();
assert(VecReg == MI.getOperand(1).getReg());
- MachineInstr *MovRel =
- BuildMI(MBB, MI, DL, MovRelDesc)
- .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
- .add(MI.getOperand(2))
- .addReg(VecReg, RegState::ImplicitDefine)
- .addReg(VecReg,
- RegState::Implicit | (IsUndef ? RegState::Undef : 0));
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MI, DL, OpDesc)
+ .addReg(RI.getSubReg(VecReg, SubReg), RegState::Undef)
+ .add(MI.getOperand(2))
+ .addReg(VecReg, RegState::ImplicitDefine)
+ .addReg(VecReg, RegState::Implicit | (IsUndef ? RegState::Undef : 0));
const int ImpDefIdx =
- MovRelDesc.getNumOperands() + MovRelDesc.getNumImplicitUses();
+ OpDesc.getNumOperands() + OpDesc.getNumImplicitUses();
const int ImpUseIdx = ImpDefIdx + 1;
- MovRel->tieOperands(ImpDefIdx, ImpUseIdx);
-
+ MIB->tieOperands(ImpDefIdx, ImpUseIdx);
MI.eraseFromParent();
break;
}
@@ -1549,22 +1762,6 @@ bool SIInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.setDesc(get(ST.isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64));
break;
}
- case TargetOpcode::BUNDLE: {
- if (!MI.mayLoad() || MI.hasUnmodeledSideEffects())
- return false;
-
- // If it is a load it must be a memory clause
- for (MachineBasicBlock::instr_iterator I = MI.getIterator();
- I->isBundledWithSucc(); ++I) {
- I->unbundleFromSucc();
- for (MachineOperand &MO : I->operands())
- if (MO.isReg())
- MO.setIsInternalRead(false);
- }
-
- MI.eraseFromParent();
- break;
- }
}
return true;
}
@@ -1662,9 +1859,15 @@ static MachineInstr *swapRegAndNonRegOperand(MachineInstr &MI,
RegOp.ChangeToImmediate(NonRegOp.getImm());
else if (NonRegOp.isFI())
RegOp.ChangeToFrameIndex(NonRegOp.getIndex());
- else
+ else if (NonRegOp.isGlobal()) {
+ RegOp.ChangeToGA(NonRegOp.getGlobal(), NonRegOp.getOffset(),
+ NonRegOp.getTargetFlags());
+ } else
return nullptr;
+ // Make sure we don't reinterpret a subreg index in the target flags.
+ RegOp.setTargetFlags(NonRegOp.getTargetFlags());
+
NonRegOp.ChangeToRegister(Reg, false, false, IsKill, IsDead, IsUndef, IsDebug);
NonRegOp.setSubReg(SubReg);
@@ -2085,6 +2288,7 @@ unsigned SIInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Copy the flags onto the implicit condition register operand.
preserveCondRegFlags(CondBr->getOperand(1), Cond[1]);
+ fixImplicitOperands(*CondBr);
if (BytesAdded)
*BytesAdded = 4;
@@ -2125,8 +2329,8 @@ bool SIInstrInfo::reverseBranchCondition(
bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
int &TrueCycles, int &FalseCycles) const {
switch (Cond[0].getImm()) {
case VCCNZ:
@@ -2165,8 +2369,8 @@ bool SIInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const {
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const {
BranchPredicate Pred = static_cast<BranchPredicate>(Cond[0].getImm());
if (Pred == VCCZ || Pred == SCC_FALSE) {
Pred = static_cast<BranchPredicate>(-Pred);
@@ -2178,14 +2382,17 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
unsigned DstSize = RI.getRegSizeInBits(*DstRC);
if (DstSize == 32) {
- unsigned SelOp = Pred == SCC_TRUE ?
- AMDGPU::S_CSELECT_B32 : AMDGPU::V_CNDMASK_B32_e32;
-
- // Instruction's operands are backwards from what is expected.
- MachineInstr *Select =
- BuildMI(MBB, I, DL, get(SelOp), DstReg)
- .addReg(FalseReg)
- .addReg(TrueReg);
+ MachineInstr *Select;
+ if (Pred == SCC_TRUE) {
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B32), DstReg)
+ .addReg(TrueReg)
+ .addReg(FalseReg);
+ } else {
+ // Instruction's operands are backwards from what is expected.
+ Select = BuildMI(MBB, I, DL, get(AMDGPU::V_CNDMASK_B32_e32), DstReg)
+ .addReg(FalseReg)
+ .addReg(TrueReg);
+ }
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
return;
@@ -2194,8 +2401,8 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
if (DstSize == 64 && Pred == SCC_TRUE) {
MachineInstr *Select =
BuildMI(MBB, I, DL, get(AMDGPU::S_CSELECT_B64), DstReg)
- .addReg(FalseReg)
- .addReg(TrueReg);
+ .addReg(TrueReg)
+ .addReg(FalseReg);
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
return;
@@ -2239,17 +2446,26 @@ void SIInstrInfo::insertSelect(MachineBasicBlock &MBB,
I = MIB->getIterator();
- SmallVector<unsigned, 8> Regs;
+ SmallVector<Register, 8> Regs;
for (int Idx = 0; Idx != NElts; ++Idx) {
Register DstElt = MRI.createVirtualRegister(EltRC);
Regs.push_back(DstElt);
unsigned SubIdx = SubIndices[Idx];
- MachineInstr *Select =
- BuildMI(MBB, I, DL, get(SelOp), DstElt)
- .addReg(FalseReg, 0, SubIdx)
- .addReg(TrueReg, 0, SubIdx);
+ MachineInstr *Select;
+ if (SelOp == AMDGPU::V_CNDMASK_B32_e32) {
+ Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(FalseReg, 0, SubIdx)
+ .addReg(TrueReg, 0, SubIdx);
+ } else {
+ Select =
+ BuildMI(MBB, I, DL, get(SelOp), DstElt)
+ .addReg(TrueReg, 0, SubIdx)
+ .addReg(FalseReg, 0, SubIdx);
+ }
+
preserveCondRegFlags(Select->getOperand(3), Cond[1]);
fixImplicitOperands(*Select);
@@ -2313,7 +2529,7 @@ static void removeModOperands(MachineInstr &MI) {
}
bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const {
+ Register Reg, MachineRegisterInfo *MRI) const {
if (!MRI->hasOneNonDBGUse(Reg))
return false;
@@ -2339,15 +2555,40 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
unsigned Opc = UseMI.getOpcode();
if (Opc == AMDGPU::COPY) {
- bool isVGPRCopy = RI.isVGPR(*MRI, UseMI.getOperand(0).getReg());
+ Register DstReg = UseMI.getOperand(0).getReg();
+ bool Is16Bit = getOpSize(UseMI, 0) == 2;
+ bool isVGPRCopy = RI.isVGPR(*MRI, DstReg);
unsigned NewOpc = isVGPRCopy ? AMDGPU::V_MOV_B32_e32 : AMDGPU::S_MOV_B32;
- if (RI.isAGPR(*MRI, UseMI.getOperand(0).getReg())) {
- if (!isInlineConstant(*ImmOp, AMDGPU::OPERAND_REG_INLINE_AC_INT32))
+ APInt Imm(32, ImmOp->getImm());
+
+ if (UseMI.getOperand(1).getSubReg() == AMDGPU::hi16)
+ Imm = Imm.ashr(16);
+
+ if (RI.isAGPR(*MRI, DstReg)) {
+ if (!isInlineConstant(Imm))
return false;
NewOpc = AMDGPU::V_ACCVGPR_WRITE_B32;
}
+
+ if (Is16Bit) {
+ if (isVGPRCopy)
+ return false; // Do not clobber vgpr_hi16
+
+ if (DstReg.isVirtual() &&
+ UseMI.getOperand(0).getSubReg() != AMDGPU::lo16)
+ return false;
+
+ UseMI.getOperand(0).setSubReg(0);
+ if (DstReg.isPhysical()) {
+ DstReg = RI.get32BitRegister(DstReg);
+ UseMI.getOperand(0).setReg(DstReg);
+ }
+ assert(UseMI.getOperand(1).getReg().isVirtual());
+ }
+
UseMI.setDesc(get(NewOpc));
- UseMI.getOperand(1).ChangeToImmediate(ImmOp->getImm());
+ UseMI.getOperand(1).ChangeToImmediate(Imm.getSExtValue());
+ UseMI.getOperand(1).setTargetFlags(0);
UseMI.addImplicitDefUseOperands(*UseMI.getParent()->getParent());
return true;
}
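
As an aside on the hi16 sub-register case above: the folded immediate is the upper half of the original 32-bit value, obtained with an arithmetic shift (Imm.ashr(16)) before the COPY is rewritten into a move-immediate. A minimal standalone sketch of that extraction using plain integers instead of APInt (the helper name is invented for illustration):

#include <cassert>
#include <cstdint>

// Models Imm.ashr(16) on a 32-bit value: an arithmetic (sign-preserving)
// shift that selects the high 16 bits, as done for the hi16 subreg case.
// (Right shift of a signed operand is guaranteed arithmetic since C++20.)
static int32_t hi16Half(int32_t Imm) {
  return Imm >> 16;
}

int main() {
  assert(hi16Half(0x7abc0000) == 0x7abc);
  assert(hi16Half(static_cast<int32_t>(0xffff0000u)) == -1); // sign-extends
  return 0;
}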
@@ -2517,6 +2758,18 @@ bool SIInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
return false;
}
+static bool
+memOpsHaveSameBaseOperands(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2) {
+ if (BaseOps1.size() != BaseOps2.size())
+ return false;
+ for (size_t I = 0, E = BaseOps1.size(); I < E; ++I) {
+ if (!BaseOps1[I]->isIdenticalTo(*BaseOps2[I]))
+ return false;
+ }
+ return true;
+}
+
static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
int WidthB, int OffsetB) {
int LowOffset = OffsetA < OffsetB ? OffsetA : OffsetB;
@@ -2527,26 +2780,26 @@ static bool offsetsDoNotOverlap(int WidthA, int OffsetA,
bool SIInstrInfo::checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
const MachineInstr &MIb) const {
- const MachineOperand *BaseOp0, *BaseOp1;
+ SmallVector<const MachineOperand *, 4> BaseOps0, BaseOps1;
int64_t Offset0, Offset1;
+ unsigned Dummy0, Dummy1;
+ bool Offset0IsScalable, Offset1IsScalable;
+ if (!getMemOperandsWithOffsetWidth(MIa, BaseOps0, Offset0, Offset0IsScalable,
+ Dummy0, &RI) ||
+ !getMemOperandsWithOffsetWidth(MIb, BaseOps1, Offset1, Offset1IsScalable,
+ Dummy1, &RI))
+ return false;
- if (getMemOperandWithOffset(MIa, BaseOp0, Offset0, &RI) &&
- getMemOperandWithOffset(MIb, BaseOp1, Offset1, &RI)) {
- if (!BaseOp0->isIdenticalTo(*BaseOp1))
- return false;
+ if (!memOpsHaveSameBaseOperands(BaseOps0, BaseOps1))
+ return false;
- if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
- // FIXME: Handle ds_read2 / ds_write2.
- return false;
- }
- unsigned Width0 = (*MIa.memoperands_begin())->getSize();
- unsigned Width1 = (*MIb.memoperands_begin())->getSize();
- if (offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1)) {
- return true;
- }
+ if (!MIa.hasOneMemOperand() || !MIb.hasOneMemOperand()) {
+ // FIXME: Handle ds_read2 / ds_write2.
+ return false;
}
-
- return false;
+ unsigned Width0 = MIa.memoperands().front()->getSize();
+ unsigned Width1 = MIb.memoperands().front()->getSize();
+ return offsetsDoNotOverlap(Width0, Offset0, Width1, Offset1);
}
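
For reference, the disjointness test used above boils down to: the access that starts at the lower offset must end at or before the other one begins. A self-contained sketch of that arithmetic, assumed to mirror offsetsDoNotOverlap (whose full body lies outside this hunk), with widths in bytes:

#include <algorithm>
#include <cassert>

// Two byte ranges [Offset, Offset + Width) are disjoint iff the lower one
// ends at or before the start of the higher one.
static bool rangesAreDisjoint(int WidthA, int OffsetA, int WidthB, int OffsetB) {
  int LowOffset = std::min(OffsetA, OffsetB);
  int HighOffset = std::max(OffsetA, OffsetB);
  int LowWidth = (LowOffset == OffsetA) ? WidthA : WidthB;
  return LowOffset + LowWidth <= HighOffset;
}

int main() {
  assert(rangesAreDisjoint(4, 0, 4, 4));  // [0,4) vs [4,8): disjoint
  assert(!rangesAreDisjoint(8, 0, 4, 4)); // [0,8) vs [4,8): overlap
  return 0;
}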
bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
@@ -2586,7 +2839,7 @@ bool SIInstrInfo::areMemAccessesTriviallyDisjoint(const MachineInstr &MIa,
if (isSMRD(MIb))
return checkInstOffsetsDoNotOverlap(MIa, MIb);
- return !isFLAT(MIb) && !isMUBUF(MIa) && !isMTBUF(MIa);
+ return !isFLAT(MIb) && !isMUBUF(MIb) && !isMTBUF(MIb);
}
if (isFLAT(MIa)) {
@@ -2732,16 +2985,30 @@ static bool changesVGPRIndexingMode(const MachineInstr &MI) {
bool SIInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
const MachineBasicBlock *MBB,
const MachineFunction &MF) const {
- // XXX - Do we want the SP check in the base implementation?
+  // Skip the base implementation's check for SP writes; it was apparently
+  // added due to compile-time concerns.
+ //
+ // TODO: Do we really want this barrier? It triggers unnecessary hazard nops
+ // but is probably avoidable.
+
+ // Copied from base implementation.
+ // Terminators and labels can't be scheduled around.
+ if (MI.isTerminator() || MI.isPosition())
+ return true;
+
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
// Target-independent instructions do not have an implicit-use of EXEC, even
// when they operate on VGPRs. Treating EXEC modifications as scheduling
// boundaries prevents incorrect movements of such instructions.
- return TargetInstrInfo::isSchedulingBoundary(MI, MBB, MF) ||
- MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
+
+ // TODO: Don't treat setreg with known constant that only changes MODE as
+ // barrier.
+ return MI.modifiesRegister(AMDGPU::EXEC, &RI) ||
MI.getOpcode() == AMDGPU::S_SETREG_IMM32_B32 ||
MI.getOpcode() == AMDGPU::S_SETREG_B32 ||
- MI.getOpcode() == AMDGPU::S_DENORM_MODE ||
changesVGPRIndexingMode(MI);
}
@@ -2755,6 +3022,20 @@ bool SIInstrInfo::isAlwaysGDS(uint16_t Opcode) const {
Opcode == AMDGPU::DS_GWS_BARRIER;
}
+bool SIInstrInfo::modifiesModeRegister(const MachineInstr &MI) {
+ // Skip the full operand and register alias search modifiesRegister
+ // does. There's only a handful of instructions that touch this, it's only an
+ // implicit def, and doesn't alias any other registers.
+ if (const MCPhysReg *ImpDef = MI.getDesc().getImplicitDefs()) {
+ for (; ImpDef && *ImpDef; ++ImpDef) {
+ if (*ImpDef == AMDGPU::MODE)
+ return true;
+ }
+ }
+
+ return false;
+}
+
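The shortcut in modifiesModeRegister relies on getImplicitDefs() returning a null-terminated list of physical registers, so a plain linear scan is enough. A stripped-down sketch of that idiom with placeholder register values (not the real AMDGPU encodings):

#include <cassert>
#include <cstdint>

using PhysReg = uint16_t;
constexpr PhysReg ModeReg = 3; // placeholder, not the actual AMDGPU::MODE value

// Scan a zero-terminated implicit-def list for a particular register.
static bool listContains(const PhysReg *ImpDefs, PhysReg Reg) {
  for (; ImpDefs && *ImpDefs; ++ImpDefs)
    if (*ImpDefs == Reg)
      return true;
  return false;
}

int main() {
  const PhysReg Defs[] = {7, ModeReg, 0}; // trailing 0 terminates the list
  assert(listContains(Defs, ModeReg));
  assert(!listContains(Defs, 9));
  assert(!listContains(nullptr, ModeReg));
  return 0;
}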
bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const {
unsigned Opcode = MI.getOpcode();
@@ -2780,6 +3061,10 @@ bool SIInstrInfo::hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const
if (MI.isCall() || MI.isInlineAsm())
return true; // conservative assumption
+ // A mode change is a scalar operation that influences vector instructions.
+ if (modifiesModeRegister(MI))
+ return true;
+
// These are like SALU instructions in terms of effects, so it's questionable
// whether we should return true for those.
//
@@ -2866,10 +3151,26 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return AMDGPU::isInlinableLiteral64(MO.getImm(),
ST.hasInv2PiInlineImm());
case AMDGPU::OPERAND_REG_IMM_INT16:
- case AMDGPU::OPERAND_REG_IMM_FP16:
case AMDGPU::OPERAND_REG_INLINE_C_INT16:
- case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_INT16:
+ // We would expect inline immediates to not be concerned with an integer/fp
+ // distinction. However, in the case of 16-bit integer operations, the
+ // "floating point" values appear to not work. It seems read the low 16-bits
+ // of 32-bit immediates, which happens to always work for the integer
+ // values.
+ //
+ // See llvm bugzilla 46302.
+ //
+  // TODO: Theoretically we could use op-sel to select the high bits of the
+  // 32-bit FP values.
+ return AMDGPU::isInlinableIntLiteral(Imm);
+ case AMDGPU::OPERAND_REG_IMM_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
+ case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
+ // This suffers the same problem as the scalar 16-bit cases.
+ return AMDGPU::isInlinableIntLiteralV216(Imm);
+ case AMDGPU::OPERAND_REG_IMM_FP16:
+ case AMDGPU::OPERAND_REG_INLINE_C_FP16:
case AMDGPU::OPERAND_REG_INLINE_AC_FP16: {
if (isInt<16>(Imm) || isUInt<16>(Imm)) {
// A few special case instructions have 16-bit operands on subtargets
@@ -2883,11 +3184,8 @@ bool SIInstrInfo::isInlineConstant(const MachineOperand &MO,
return false;
}
- case AMDGPU::OPERAND_REG_IMM_V2INT16:
case AMDGPU::OPERAND_REG_IMM_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_C_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_C_V2FP16:
- case AMDGPU::OPERAND_REG_INLINE_AC_V2INT16:
case AMDGPU::OPERAND_REG_INLINE_AC_V2FP16: {
uint32_t Trunc = static_cast<uint32_t>(Imm);
return AMDGPU::isInlinableLiteralV216(Trunc, ST.hasInv2PiInlineImm());
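
To make the 16-bit inline-immediate comment above concrete: the AMDGPU inline integer constants are the values -16..64, all of which survive truncation to 16 bits, whereas an inline 32-bit FP encoding such as 0.5 (0x3f000000) has all-zero low 16 bits. A standalone sketch of that observation (the -16..64 range matches isInlinableIntLiteral; everything else is illustrative):

#include <cassert>
#include <cstdint>

// Inline integer constants on AMDGPU are the values -16..64 inclusive.
static bool isInlinableInt(int64_t Imm) {
  return Imm >= -16 && Imm <= 64;
}

int main() {
  // Every inlinable integer fits in 16 bits, so reading only the low 16 bits
  // of the encoded value still yields the intended constant.
  for (int64_t Imm = -16; Imm <= 64; ++Imm) {
    assert(isInlinableInt(Imm));
    assert(static_cast<int16_t>(Imm) == Imm);
  }
  // 0.5f encodes as 0x3f000000; its low 16 bits are 0, which is why the FP
  // inline values cannot be trusted for the 16-bit integer operand types.
  assert(!isInlinableInt(0x3f000000));
  return 0;
}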
@@ -3056,7 +3354,8 @@ static void copyFlagsToImplicitVCC(MachineInstr &MI,
const MachineOperand &Orig) {
for (MachineOperand &Use : MI.implicit_operands()) {
- if (Use.isUse() && Use.getReg() == AMDGPU::VCC) {
+ if (Use.isUse() &&
+ (Use.getReg() == AMDGPU::VCC || Use.getReg() == AMDGPU::VCC_LO)) {
Use.setIsUndef(Orig.isUndef());
Use.setIsKill(Orig.isKill());
return;
@@ -3068,7 +3367,8 @@ MachineInstr *SIInstrInfo::buildShrunkInst(MachineInstr &MI,
unsigned Op32) const {
MachineBasicBlock *MBB = MI.getParent();;
MachineInstrBuilder Inst32 =
- BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32));
+ BuildMI(*MBB, MI, MI.getDebugLoc(), get(Op32))
+ .setMIFlags(MI.getFlags());
// Add the dst operand if the 32-bit encoding also has an explicit $vdst.
// For VOPC instructions, this is replaced by an implicit def of vcc.
@@ -3138,7 +3438,7 @@ bool SIInstrInfo::usesConstantBus(const MachineRegisterInfo &MRI,
}
}
-static unsigned findImplicitSGPRRead(const MachineInstr &MI) {
+static Register findImplicitSGPRRead(const MachineInstr &MI) {
for (const MachineOperand &MO : MI.implicit_operands()) {
// We only care about reads.
if (MO.isDef())
@@ -3239,6 +3539,11 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return true;
}
+ if (isMIMG(MI) && MI.memoperands_empty() && MI.mayLoadOrStore()) {
+ ErrInfo = "missing memory operand from MIMG instruction.";
+ return false;
+ }
+
// Make sure the register classes are correct.
for (int i = 0, e = Desc.getNumOperands(); i != e; ++i) {
if (MI.getOperand(i).isFPImm()) {
@@ -3446,8 +3751,8 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
if (AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::imm) != -1)
++ConstantBusCount;
- SmallVector<unsigned, 2> SGPRsUsed;
- unsigned SGPRUsed = findImplicitSGPRRead(MI);
+ SmallVector<Register, 2> SGPRsUsed;
+ Register SGPRUsed = findImplicitSGPRRead(MI);
if (SGPRUsed != AMDGPU::NoRegister) {
++ConstantBusCount;
SGPRsUsed.push_back(SGPRUsed);
@@ -3482,7 +3787,7 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
}
if (isVOP3(MI) && LiteralCount) {
- if (LiteralCount && !ST.hasVOP3Literal()) {
+ if (!ST.hasVOP3Literal()) {
ErrInfo = "VOP3 instruction uses literal";
return false;
}
@@ -3665,11 +3970,34 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
return false;
}
+ bool IsA16 = false;
+ if (ST.hasR128A16()) {
+ const MachineOperand *R128A16 = getNamedOperand(MI, AMDGPU::OpName::r128);
+ IsA16 = R128A16->getImm() != 0;
+ } else if (ST.hasGFX10A16()) {
+ const MachineOperand *A16 = getNamedOperand(MI, AMDGPU::OpName::a16);
+ IsA16 = A16->getImm() != 0;
+ }
+
+ bool PackDerivatives = IsA16 || BaseOpcode->G16;
bool IsNSA = SRsrcIdx - VAddr0Idx > 1;
- unsigned AddrWords = BaseOpcode->NumExtraArgs +
- (BaseOpcode->Gradients ? Dim->NumGradients : 0) +
- (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
- (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+
+ unsigned AddrWords = BaseOpcode->NumExtraArgs;
+ unsigned AddrComponents = (BaseOpcode->Coordinates ? Dim->NumCoords : 0) +
+ (BaseOpcode->LodOrClampOrMip ? 1 : 0);
+ if (IsA16)
+ AddrWords += (AddrComponents + 1) / 2;
+ else
+ AddrWords += AddrComponents;
+
+ if (BaseOpcode->Gradients) {
+ if (PackDerivatives)
+      // There are two gradients per coordinate; we pack them separately.
+ // For the 3d case, we get (dy/du, dx/du) (-, dz/du) (dy/dv, dx/dv) (-, dz/dv)
+ AddrWords += (Dim->NumGradients / 2 + 1) / 2 * 2;
+ else
+ AddrWords += Dim->NumGradients;
+ }
unsigned VAddrWords;
if (IsNSA) {
@@ -3681,14 +4009,15 @@ bool SIInstrInfo::verifyInstruction(const MachineInstr &MI,
AddrWords = 16;
else if (AddrWords > 4)
AddrWords = 8;
- else if (AddrWords == 3 && VAddrWords == 4) {
- // CodeGen uses the V4 variant of instructions for three addresses,
- // because the selection DAG does not support non-power-of-two types.
+ else if (AddrWords == 4)
AddrWords = 4;
- }
+ else if (AddrWords == 3)
+ AddrWords = 3;
}
if (VAddrWords != AddrWords) {
+ LLVM_DEBUG(dbgs() << "bad vaddr size, expected " << AddrWords
+ << " but got " << VAddrWords << "\n");
ErrInfo = "bad vaddr size";
return false;
}
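
To make the address-word arithmetic above concrete, here is a standalone version of the count under assumed inputs (a 2-D sample with four gradient components); the scenario and names are illustrative, while the formulas mirror the verifier code in this hunk:

#include <cassert>

// Count vaddr dwords: coordinates (plus LOD/clamp/mip) pack two 16-bit values
// per dword when A16 is set; gradients pack du and dv separately, hence the
// per-pair rounding "(NumGradients / 2 + 1) / 2 * 2".
static unsigned countAddrWords(unsigned ExtraArgs, unsigned Coords,
                               bool HasLodClampMip, unsigned NumGradients,
                               bool IsA16, bool PackDerivatives) {
  unsigned AddrWords = ExtraArgs;
  unsigned AddrComponents = Coords + (HasLodClampMip ? 1 : 0);
  AddrWords += IsA16 ? (AddrComponents + 1) / 2 : AddrComponents;
  if (NumGradients != 0) {
    if (PackDerivatives)
      AddrWords += (NumGradients / 2 + 1) / 2 * 2;
    else
      AddrWords += NumGradients;
  }
  return AddrWords;
}

int main() {
  // 2-D sample_d with packed (A16/G16) operands: 2 coords -> 1 dword,
  // 4 gradient components -> 2 dwords.
  assert(countAddrWords(0, 2, false, 4, true, true) == 3);
  // The same operation unpacked: 2 + 4 = 6 dwords.
  assert(countAddrWords(0, 2, false, 4, false, false) == 6);
  return 0;
}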
@@ -4217,7 +4546,7 @@ void SIInstrInfo::legalizeOperandsVOP3(MachineRegisterInfo &MRI,
}
}
-unsigned SIInstrInfo::readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+Register SIInstrInfo::readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const {
const TargetRegisterClass *VRC = MRI.getRegClass(SrcReg);
const TargetRegisterClass *SRC = RI.getEquivalentSGPRClass(VRC);
@@ -5002,6 +5331,76 @@ void SIInstrInfo::moveToVALU(MachineInstr &TopInst,
splitScalarBinOpN2(Worklist, Inst, AMDGPU::S_OR_B32);
Inst.eraseFromParent();
continue;
+
+ // TODO: remove as soon as everything is ready
+ // to replace VGPR to SGPR copy with V_READFIRSTLANEs.
+ // S_ADD/SUB_CO_PSEUDO as well as S_UADDO/USUBO_PSEUDO
+ // can only be selected from the uniform SDNode.
+ case AMDGPU::S_ADD_CO_PSEUDO:
+ case AMDGPU::S_SUB_CO_PSEUDO: {
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO)
+ ? AMDGPU::V_ADDC_U32_e64
+ : AMDGPU::V_SUBB_U32_e64;
+ const auto *CarryRC = RI.getRegClass(AMDGPU::SReg_1_XEXECRegClassID);
+
+ Register CarryInReg = Inst.getOperand(4).getReg();
+ if (!MRI.constrainRegClass(CarryInReg, CarryRC)) {
+ Register NewCarryReg = MRI.createVirtualRegister(CarryRC);
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(AMDGPU::COPY), NewCarryReg)
+ .addReg(CarryInReg);
+ }
+
+ Register CarryOutReg = Inst.getOperand(1).getReg();
+
+ Register DestReg = MRI.createVirtualRegister(RI.getEquivalentVGPRClass(
+ MRI.getRegClass(Inst.getOperand(0).getReg())));
+ MachineInstr *CarryOp =
+ BuildMI(*MBB, &Inst, Inst.getDebugLoc(), get(Opc), DestReg)
+ .addReg(CarryOutReg, RegState::Define)
+ .add(Inst.getOperand(2))
+ .add(Inst.getOperand(3))
+ .addReg(CarryInReg)
+ .addImm(0);
+ legalizeOperands(*CarryOp);
+ MRI.replaceRegWith(Inst.getOperand(0).getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(DestReg, MRI, Worklist);
+ Inst.eraseFromParent();
+ }
+ continue;
+ case AMDGPU::S_UADDO_PSEUDO:
+ case AMDGPU::S_USUBO_PSEUDO: {
+ const DebugLoc &DL = Inst.getDebugLoc();
+ MachineOperand &Dest0 = Inst.getOperand(0);
+ MachineOperand &Dest1 = Inst.getOperand(1);
+ MachineOperand &Src0 = Inst.getOperand(2);
+ MachineOperand &Src1 = Inst.getOperand(3);
+
+ unsigned Opc = (Inst.getOpcode() == AMDGPU::S_UADDO_PSEUDO)
+ ? AMDGPU::V_ADD_I32_e64
+ : AMDGPU::V_SUB_I32_e64;
+ const TargetRegisterClass *NewRC =
+ RI.getEquivalentVGPRClass(MRI.getRegClass(Dest0.getReg()));
+ Register DestReg = MRI.createVirtualRegister(NewRC);
+ MachineInstr *NewInstr = BuildMI(*MBB, &Inst, DL, get(Opc), DestReg)
+ .addReg(Dest1.getReg(), RegState::Define)
+ .add(Src0)
+ .add(Src1)
+ .addImm(0); // clamp bit
+
+ legalizeOperands(*NewInstr, MDT);
+
+ MRI.replaceRegWith(Dest0.getReg(), DestReg);
+ addUsersToMoveToVALUWorklist(NewInstr->getOperand(0).getReg(), MRI,
+ Worklist);
+ Inst.eraseFromParent();
+ }
+ continue;
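
For context, the pseudos rewritten above compute a 32-bit unsigned add or subtract plus a carry/borrow-out bit; the VALU replacements (V_ADD_I32_e64 / V_SUB_I32_e64 with a carry output) produce the same pair per lane. A minimal scalar model of those semantics, not the codegen itself:

#include <cassert>
#include <cstdint>
#include <utility>

// uaddo: result = (a + b) mod 2^32, second element is the carry-out.
static std::pair<uint32_t, bool> uaddo(uint32_t A, uint32_t B) {
  uint32_t Sum = A + B;
  return {Sum, Sum < A}; // unsigned wrap-around means a carry was produced
}

// usubo: result = (a - b) mod 2^32, second element is the borrow-out.
static std::pair<uint32_t, bool> usubo(uint32_t A, uint32_t B) {
  return {A - B, A < B};
}

int main() {
  auto Add = uaddo(0xffffffffu, 1u);
  assert(Add.first == 0u && Add.second);          // wraps, carry set
  auto Sub = usubo(0u, 1u);
  assert(Sub.first == 0xffffffffu && Sub.second); // wraps, borrow set
  return 0;
}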
+
+ case AMDGPU::S_CSELECT_B32:
+ case AMDGPU::S_CSELECT_B64:
+ lowerSelect(Worklist, Inst, MDT);
+ Inst.eraseFromParent();
+ continue;
}
if (NewOpcode == AMDGPU::INSTRUCTION_LIST_END) {
@@ -5142,6 +5541,78 @@ bool SIInstrInfo::moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
return false;
}
+void SIInstrInfo::lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT) const {
+
+ MachineBasicBlock &MBB = *Inst.getParent();
+ MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
+ MachineBasicBlock::iterator MII = Inst;
+ DebugLoc DL = Inst.getDebugLoc();
+
+ MachineOperand &Dest = Inst.getOperand(0);
+ MachineOperand &Src0 = Inst.getOperand(1);
+ MachineOperand &Src1 = Inst.getOperand(2);
+ MachineOperand &Cond = Inst.getOperand(3);
+
+ Register SCCSource = Cond.getReg();
+ // Find SCC def, and if that is a copy (SCC = COPY reg) then use reg instead.
+ if (!Cond.isUndef()) {
+ for (MachineInstr &CandI :
+ make_range(std::next(MachineBasicBlock::reverse_iterator(Inst)),
+ Inst.getParent()->rend())) {
+ if (CandI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) !=
+ -1) {
+ if (CandI.isCopy() && CandI.getOperand(0).getReg() == AMDGPU::SCC) {
+ SCCSource = CandI.getOperand(1).getReg();
+ }
+ break;
+ }
+ }
+ }
+
+  // If this is a trivial select where the condition is effectively not SCC
+  // (SCCSource is the source of a copy to SCC), then the select is semantically
+  // equivalent to copying SCCSource. Hence, there is no need to create a
+  // V_CNDMASK; we can just use SCCSource and bail out.
+ if ((SCCSource != AMDGPU::SCC) && Src0.isImm() && (Src0.getImm() == -1) &&
+ Src1.isImm() && (Src1.getImm() == 0)) {
+ MRI.replaceRegWith(Dest.getReg(), SCCSource);
+ return;
+ }
+
+ const TargetRegisterClass *TC = ST.getWavefrontSize() == 64
+ ? &AMDGPU::SReg_64_XEXECRegClass
+ : &AMDGPU::SReg_32_XM0_XEXECRegClass;
+ Register CopySCC = MRI.createVirtualRegister(TC);
+
+ if (SCCSource == AMDGPU::SCC) {
+ // Insert a trivial select instead of creating a copy, because a copy from
+ // SCC would semantically mean just copying a single bit, but we may need
+ // the result to be a vector condition mask that needs preserving.
+ unsigned Opcode = (ST.getWavefrontSize() == 64) ? AMDGPU::S_CSELECT_B64
+ : AMDGPU::S_CSELECT_B32;
+ auto NewSelect =
+ BuildMI(MBB, MII, DL, get(Opcode), CopySCC).addImm(-1).addImm(0);
+ NewSelect->getOperand(3).setIsUndef(Cond.isUndef());
+ } else {
+ BuildMI(MBB, MII, DL, get(AMDGPU::COPY), CopySCC).addReg(SCCSource);
+ }
+
+ Register ResultReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
+
+ auto UpdatedInst =
+ BuildMI(MBB, MII, DL, get(AMDGPU::V_CNDMASK_B32_e64), ResultReg)
+ .addImm(0)
+ .add(Src1) // False
+ .addImm(0)
+ .add(Src0) // True
+ .addReg(CopySCC);
+
+ MRI.replaceRegWith(Dest.getReg(), ResultReg);
+ legalizeOperands(*UpdatedInst, MDT);
+ addUsersToMoveToVALUWorklist(ResultReg, MRI, Worklist);
+}
+
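The trivial-select shortcut in lowerSelect works because, at the lane-mask level, select(mask, -1, 0) is just the mask itself. A one-bit-per-lane model of that selection (the real V_CNDMASK_B32 chooses a full 32-bit value per lane; this only sketches the mask semantics):

#include <cassert>
#include <cstdint>

// One bit per lane: take the "true" bit where the condition mask is set and
// the "false" bit elsewhere. With true = all ones and false = 0 the result is
// the condition mask itself, which is the case lowerSelect short-circuits.
static uint64_t selectLanes(uint64_t Mask, uint64_t TrueLanes, uint64_t FalseLanes) {
  return (Mask & TrueLanes) | (~Mask & FalseLanes);
}

int main() {
  uint64_t Mask = 0x00f0f0f0f0f0f00fULL;
  assert(selectLanes(Mask, ~0ULL, 0ULL) == Mask); // select(mask, -1, 0) == mask
  assert(selectLanes(0ULL, ~0ULL, 0xffULL) == 0xffULL);
  return 0;
}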
void SIInstrInfo::lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const {
MachineBasicBlock &MBB = *Inst.getParent();
@@ -5623,7 +6094,7 @@ void SIInstrInfo::splitScalar64BitBFE(SetVectorType &Worklist,
}
void SIInstrInfo::addUsersToMoveToVALUWorklist(
- unsigned DstReg,
+ Register DstReg,
MachineRegisterInfo &MRI,
SetVectorType &Worklist) const {
for (MachineRegisterInfo::use_iterator I = MRI.use_begin(DstReg),
@@ -5723,20 +6194,60 @@ void SIInstrInfo::movePackToVALU(SetVectorType &Worklist,
void SIInstrInfo::addSCCDefUsersToVALUWorklist(MachineOperand &Op,
MachineInstr &SCCDefInst,
SetVectorType &Worklist) const {
+ bool SCCUsedImplicitly = false;
+
// Ensure that def inst defines SCC, which is still live.
assert(Op.isReg() && Op.getReg() == AMDGPU::SCC && Op.isDef() &&
!Op.isDead() && Op.getParent() == &SCCDefInst);
+ SmallVector<MachineInstr *, 4> CopyToDelete;
// This assumes that all the users of SCC are in the same block
// as the SCC def.
for (MachineInstr &MI : // Skip the def inst itself.
make_range(std::next(MachineBasicBlock::iterator(SCCDefInst)),
SCCDefInst.getParent()->end())) {
// Check if SCC is used first.
- if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1)
- Worklist.insert(&MI);
+ if (MI.findRegisterUseOperandIdx(AMDGPU::SCC, false, &RI) != -1) {
+ if (MI.isCopy()) {
+ MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
+ unsigned DestReg = MI.getOperand(0).getReg();
+
+ for (auto &User : MRI.use_nodbg_instructions(DestReg)) {
+ if ((User.getOpcode() == AMDGPU::S_ADD_CO_PSEUDO) ||
+ (User.getOpcode() == AMDGPU::S_SUB_CO_PSEUDO)) {
+ User.getOperand(4).setReg(RI.getVCC());
+ Worklist.insert(&User);
+ } else if (User.getOpcode() == AMDGPU::V_CNDMASK_B32_e64) {
+ User.getOperand(5).setReg(RI.getVCC());
+ // No need to add to Worklist.
+ }
+ }
+ CopyToDelete.push_back(&MI);
+ } else {
+ if (MI.getOpcode() == AMDGPU::S_CSELECT_B32 ||
+ MI.getOpcode() == AMDGPU::S_CSELECT_B64) {
+          // This is an implicit use of SCC, and the SCC users are expected
+          // to handle it.
+          // We cannot preserve the edge to the user, so add the explicit
+          // copy: SCC = COPY VCC.
+ // The copy will be cleaned up during the processing of the user
+ // in lowerSelect.
+ SCCUsedImplicitly = true;
+ }
+
+ Worklist.insert(&MI);
+ }
+ }
// Exit if we find another SCC def.
if (MI.findRegisterDefOperandIdx(AMDGPU::SCC, false, false, &RI) != -1)
- return;
+ break;
+ }
+ for (auto &Copy : CopyToDelete)
+ Copy->eraseFromParent();
+
+ if (SCCUsedImplicitly) {
+ BuildMI(*SCCDefInst.getParent(), std::next(SCCDefInst.getIterator()),
+ SCCDefInst.getDebugLoc(), get(AMDGPU::COPY), AMDGPU::SCC)
+ .addReg(RI.getVCC());
}
}
@@ -5789,7 +6300,7 @@ const TargetRegisterClass *SIInstrInfo::getDestEquivalentVGPRClass(
}
// Find the one SGPR operand we are allowed to use.
-unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
+Register SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
int OpIndices[3]) const {
const MCInstrDesc &Desc = MI.getDesc();
@@ -5802,11 +6313,11 @@ unsigned SIInstrInfo::findUsedSGPR(const MachineInstr &MI,
//
// If the operand's class is an SGPR, we can never move it.
- unsigned SGPRReg = findImplicitSGPRRead(MI);
+ Register SGPRReg = findImplicitSGPRRead(MI);
if (SGPRReg != AMDGPU::NoRegister)
return SGPRReg;
- unsigned UsedSGPRs[3] = { AMDGPU::NoRegister };
+ Register UsedSGPRs[3] = { AMDGPU::NoRegister };
const MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
for (unsigned i = 0; i < 3; ++i) {
@@ -5919,10 +6430,9 @@ bool SIInstrInfo::isLowLatencyInstruction(const MachineInstr &MI) const {
return isSMRD(Opc);
}
-bool SIInstrInfo::isHighLatencyInstruction(const MachineInstr &MI) const {
- unsigned Opc = MI.getOpcode();
-
- return isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc);
+bool SIInstrInfo::isHighLatencyDef(int Opc) const {
+ return get(Opc).mayLoad() &&
+ (isMUBUF(Opc) || isMTBUF(Opc) || isMIMG(Opc) || isFLAT(Opc));
}
unsigned SIInstrInfo::isStackAccess(const MachineInstr &MI,
@@ -6198,7 +6708,7 @@ MachineInstrBuilder
SIInstrInfo::getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned DestReg) const {
+ Register DestReg) const {
if (ST.hasAddNoCarry())
return BuildMI(MBB, I, DL, get(AMDGPU::V_ADD_U32_e64), DestReg);
@@ -6608,20 +7118,24 @@ MachineInstr *SIInstrInfo::foldMemoryOperandImpl(
// %0 may even spill. We can't spill $m0 normally (it would require copying to
// a numbered SGPR anyway), and since it is in the SReg_32 register class,
// TargetInstrInfo::foldMemoryOperand() is going to try.
+ // A similar issue also exists with spilling and reloading $exec registers.
//
// To prevent that, constrain the %0 register class here.
if (MI.isFullCopy()) {
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(1).getReg();
-
- if (DstReg == AMDGPU::M0 && SrcReg.isVirtual()) {
- MF.getRegInfo().constrainRegClass(SrcReg, &AMDGPU::SReg_32_XM0RegClass);
- return nullptr;
- }
-
- if (SrcReg == AMDGPU::M0 && DstReg.isVirtual()) {
- MF.getRegInfo().constrainRegClass(DstReg, &AMDGPU::SReg_32_XM0RegClass);
- return nullptr;
+ if ((DstReg.isVirtual() || SrcReg.isVirtual()) &&
+ (DstReg.isVirtual() != SrcReg.isVirtual())) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ Register VirtReg = DstReg.isVirtual() ? DstReg : SrcReg;
+ const TargetRegisterClass *RC = MRI.getRegClass(VirtReg);
+ if (RC->hasSuperClassEq(&AMDGPU::SReg_32RegClass)) {
+ MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_32_XM0_XEXECRegClass);
+ return nullptr;
+ } else if (RC->hasSuperClassEq(&AMDGPU::SReg_64RegClass)) {
+ MRI.constrainRegClass(VirtReg, &AMDGPU::SReg_64_XEXECRegClass);
+ return nullptr;
+ }
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
index b151a94b0d11..53e2ffba0f65 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h
@@ -84,6 +84,9 @@ private:
bool moveScalarAddSub(SetVectorType &Worklist, MachineInstr &Inst,
MachineDominatorTree *MDT = nullptr) const;
+ void lowerSelect(SetVectorType &Worklist, MachineInstr &Inst,
+ MachineDominatorTree *MDT = nullptr) const;
+
void lowerScalarAbs(SetVectorType &Worklist,
MachineInstr &Inst) const;
@@ -119,7 +122,7 @@ private:
MachineRegisterInfo &MRI,
MachineInstr &Inst) const;
- void addUsersToMoveToVALUWorklist(unsigned Reg, MachineRegisterInfo &MRI,
+ void addUsersToMoveToVALUWorklist(Register Reg, MachineRegisterInfo &MRI,
SetVectorType &Worklist) const;
void addSCCDefUsersToVALUWorklist(MachineOperand &Op,
@@ -132,7 +135,7 @@ private:
bool checkInstOffsetsDoNotOverlap(const MachineInstr &MIa,
const MachineInstr &MIb) const;
- unsigned findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
+ Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const;
protected:
bool swapSourceModifiers(MachineInstr &MI,
@@ -181,14 +184,15 @@ public:
int64_t &Offset1,
int64_t &Offset2) const override;
- bool getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const final;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const final;
- bool shouldClusterMemOps(const MachineOperand &BaseOp1,
- const MachineOperand &BaseOp2,
- unsigned NumLoads) const override;
+ bool shouldClusterMemOps(ArrayRef<const MachineOperand *> BaseOps1,
+ ArrayRef<const MachineOperand *> BaseOps2,
+ unsigned NumLoads, unsigned NumBytes) const override;
bool shouldScheduleLoadsNear(SDNode *Load0, SDNode *Load1, int64_t Offset0,
int64_t Offset1, unsigned NumLoads) const override;
@@ -210,22 +214,22 @@ public:
const TargetRegisterClass *getPreferredSelectRegClass(
unsigned Size) const;
- unsigned insertNE(MachineBasicBlock *MBB,
+ Register insertNE(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned SrcReg, int Value) const;
+ Register SrcReg, int Value) const;
- unsigned insertEQ(MachineBasicBlock *MBB,
+ Register insertEQ(MachineBasicBlock *MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned SrcReg, int Value) const;
+ Register SrcReg, int Value) const;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned SrcReg,
+ MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned DestReg,
+ MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -244,6 +248,9 @@ public:
// DstRC, then AMDGPU::COPY is returned.
unsigned getMovOpcode(const TargetRegisterClass *DstRC) const;
+ const MCInstrDesc &getIndirectRegWritePseudo(
+ unsigned VecSize, unsigned EltSize, bool IsSGPR) const;
+
LLVM_READONLY
int commuteOpcode(unsigned Opc) const;
@@ -293,20 +300,19 @@ public:
SmallVectorImpl<MachineOperand> &Cond) const override;
bool canInsertSelect(const MachineBasicBlock &MBB,
- ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles,
+ ArrayRef<MachineOperand> Cond, Register DstReg,
+ Register TrueReg, Register FalseReg, int &CondCycles,
int &TrueCycles, int &FalseCycles) const override;
void insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const override;
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const override;
void insertVectorSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I, const DebugLoc &DL,
- unsigned DstReg, ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg) const;
+ Register DstReg, ArrayRef<MachineOperand> Cond,
+ Register TrueReg, Register FalseReg) const;
unsigned getAddressSpaceForPseudoSourceKind(
unsigned Kind) const override;
@@ -317,7 +323,7 @@ public:
bool isFoldableCopy(const MachineInstr &MI) const;
- bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const final;
unsigned getMachineCSELookAheadLimit() const override { return 500; }
@@ -685,6 +691,9 @@ public:
return MO.isReg() && RI.isVGPR(MRI, MO.getReg());});
}
+  /// Return true if the instruction modifies the mode register.
+ static bool modifiesModeRegister(const MachineInstr &MI);
+
/// Whether we must prevent this instruction from executing with EXEC = 0.
bool hasUnwantedEffectsWhenEXECEmpty(const MachineInstr &MI) const;
@@ -824,11 +833,7 @@ public:
const MachineOperand &MO = MI.getOperand(OpNo);
if (MO.isReg()) {
if (unsigned SubReg = MO.getSubReg()) {
- assert(RI.getRegSizeInBits(*RI.getSubClassWithSubReg(
- MI.getParent()->getParent()->getRegInfo().
- getRegClass(MO.getReg()), SubReg)) >= 32 &&
- "Sub-dword subregs are not supported");
- return RI.getSubRegIndexLaneMask(SubReg).getNumLanes() * 4;
+ return RI.getSubRegIdxSize(SubReg) / 8;
}
}
return RI.getRegSizeInBits(*getOpRegClass(MI, OpNo)) / 8;
@@ -874,7 +879,7 @@ public:
  /// be used when it is known that the value in SrcReg is the same across all
/// threads in the wave.
/// \returns The SGPR register that \p SrcReg was copied to.
- unsigned readlaneVGPRToSGPR(unsigned SrcReg, MachineInstr &UseMI,
+ Register readlaneVGPRToSGPR(Register SrcReg, MachineInstr &UseMI,
MachineRegisterInfo &MRI) const;
void legalizeOperandsSMRD(MachineRegisterInfo &MRI, MachineInstr &MI) const;
@@ -928,7 +933,7 @@ public:
uint64_t getScratchRsrcWords23() const;
bool isLowLatencyInstruction(const MachineInstr &MI) const;
- bool isHighLatencyInstruction(const MachineInstr &MI) const;
+ bool isHighLatencyDef(int Opc) const override;
/// Return the descriptor of the target-specific machine instruction
/// that corresponds to the specified pseudo or native opcode.
@@ -995,7 +1000,7 @@ public:
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL,
- unsigned DestReg) const;
+ Register DestReg) const;
MachineInstrBuilder getAddNoCarry(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
index 85e8d0582dcd..7aee52f91360 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -7,11 +7,9 @@
//===----------------------------------------------------------------------===//
def isWave32 : Predicate<"Subtarget->getWavefrontSize() == 32">,
- AssemblerPredicate <"FeatureWavefrontSize32">;
+ AssemblerPredicate <(all_of FeatureWavefrontSize32)>;
def isWave64 : Predicate<"Subtarget->getWavefrontSize() == 64">,
- AssemblerPredicate <"FeatureWavefrontSize64">;
-
-def DisableInst : Predicate <"false">, AssemblerPredicate<"FeatureDisable">;
+ AssemblerPredicate <(all_of FeatureWavefrontSize64)>;
class GCNPredicateControl : PredicateControl {
Predicate SIAssemblerPredicate = isGFX6GFX7;
@@ -30,6 +28,7 @@ def SIEncodingFamily {
int GFX9 = 5;
int GFX10 = 6;
int SDWA10 = 7;
+ int GFX10_B = 8;
}
//===----------------------------------------------------------------------===//
@@ -39,8 +38,7 @@ def SIEncodingFamily {
def AMDGPUclamp : SDNode<"AMDGPUISD::CLAMP", SDTFPUnaryOp>;
def SIsbuffer_load : SDNode<"AMDGPUISD::SBUFFER_LOAD",
- SDTypeProfile<1, 4, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i1>,
- SDTCisVT<4, i1>]>,
+ SDTypeProfile<1, 3, [SDTCisVT<1, v4i32>, SDTCisVT<2, i32>, SDTCisVT<3, i32>]>,
[SDNPMayLoad, SDNPMemOperand]
>;
@@ -57,6 +55,10 @@ def SIatomic_dec : SDNode<"AMDGPUISD::ATOMIC_DEC", SDTAtomic2,
[SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
>;
+def SIatomic_csub : SDNode<"AMDGPUISD::ATOMIC_LOAD_CSUB", SDTAtomic2,
+ [SDNPMayLoad, SDNPMayStore, SDNPMemOperand, SDNPHasChain]
+>;
+
def SDTAtomic2_f32 : SDTypeProfile<1, 2, [
SDTCisSameAs<0,2>, SDTCisFP<0>, SDTCisPtrTy<1>
]>;
@@ -200,6 +202,7 @@ def SIbuffer_atomic_or : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_OR">;
def SIbuffer_atomic_xor : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_XOR">;
def SIbuffer_atomic_inc : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_INC">;
def SIbuffer_atomic_dec : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_DEC">;
+def SIbuffer_atomic_csub : SDBufferAtomic <"AMDGPUISD::BUFFER_ATOMIC_CSUB">;
def SIbuffer_atomic_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_FADD", f32>;
def SIbuffer_atomic_pk_fadd : SDBufferAtomicNoRtn <"AMDGPUISD::BUFFER_ATOMIC_PK_FADD", v2f16>;
@@ -267,7 +270,7 @@ def SIload_d16_hi_i8 : SDNode<"AMDGPUISD::LOAD_D16_HI_I8",
def SIdenorm_mode : SDNode<"AMDGPUISD::DENORM_MODE",
SDTypeProfile<0 ,1, [SDTCisInt<0>]>,
- [SDNPHasChain, SDNPSideEffect, SDNPOptInGlue, SDNPOutGlue]
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]
>;
//===----------------------------------------------------------------------===//
@@ -308,6 +311,10 @@ class isPackedType<ValueType SrcVT> {
// PatFrags for global memory operations
//===----------------------------------------------------------------------===//
+let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_global").AddrSpaces in {
+defm atomic_csub_global : binary_atomic_op<SIatomic_csub>;
+}
+
foreach as = [ "global", "flat", "constant", "local", "private", "region" ] in {
let AddressSpaces = !cast<AddressSpaceList>("LoadAddress_"#as).AddrSpaces in {
@@ -631,6 +638,16 @@ def add_ctpop : PatFrag <
(add (ctpop $src0), $src1)
>;
+foreach I = 1-4 in {
+def shl#I#_add : PatFrag <
+ (ops node:$src0, node:$src1),
+ (add (shl_oneuse $src0, (i32 I)), $src1)> {
+ // FIXME: Poor substitute for disabling pattern in SelectionDAG
+ let PredicateCode = [{return false;}];
+ let GISelPredicateCode = [{return true;}];
+}
+}
+
multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
SDTypeProfile tc = SDTAtomic2,
bit IsInt = 1> {
@@ -651,6 +668,7 @@ multiclass SIAtomicM0Glue2 <string op_name, bit is_amdgpu = 0,
defm atomic_load_add : SIAtomicM0Glue2 <"LOAD_ADD">;
defm atomic_load_sub : SIAtomicM0Glue2 <"LOAD_SUB">;
+defm atomic_load_csub : SIAtomicM0Glue2 <"LOAD_CSUB", 1>;
defm atomic_inc : SIAtomicM0Glue2 <"INC", 1>;
defm atomic_dec : SIAtomicM0Glue2 <"DEC", 1>;
defm atomic_load_and : SIAtomicM0Glue2 <"LOAD_AND">;
@@ -665,7 +683,7 @@ defm atomic_load_fadd : SIAtomicM0Glue2 <"LOAD_FADD", 0, SDTAtomic2_f32, 0>;
defm atomic_load_fmin : SIAtomicM0Glue2 <"LOAD_FMIN", 1, SDTAtomic2_f32, 0>;
defm atomic_load_fmax : SIAtomicM0Glue2 <"LOAD_FMAX", 1, SDTAtomic2_f32, 0>;
-def as_i1imm : SDNodeXForm<imm, [{
+def as_i1timm : SDNodeXForm<timm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i1);
}]>;
@@ -673,6 +691,10 @@ def as_i8imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getZExtValue(), SDLoc(N), MVT::i8);
}]>;
+def as_i8timm : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
+}]>;
+
def as_i16imm : SDNodeXForm<imm, [{
return CurDAG->getTargetConstant(N->getSExtValue(), SDLoc(N), MVT::i16);
}]>;
@@ -766,7 +788,7 @@ def NegSubInlineConst32 : ImmLeaf<i32, [{
return Imm < -16 && Imm >= -64;
}], NegateImm>;
-def NegSubInlineConst16 : ImmLeaf<i16, [{
+def NegSubInlineIntConst16 : ImmLeaf<i16, [{
return Imm < -16 && Imm >= -64;
}], NegateImm>;
@@ -791,6 +813,26 @@ def NegSubInlineConstV216 : PatLeaf<(build_vector), [{
}], getNegV2I16Imm>;
//===----------------------------------------------------------------------===//
+// MUBUF/SMEM Patterns
+//===----------------------------------------------------------------------===//
+
+def extract_glc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_slc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 1) & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_dlc : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 2) & 1, SDLoc(N), MVT::i8);
+}]>;
+
+def extract_swz : SDNodeXForm<timm, [{
+ return CurDAG->getTargetConstant((N->getZExtValue() >> 3) & 1, SDLoc(N), MVT::i8);
+}]>;
+
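These SDNodeXForms each peel one bit out of the packed cache-policy immediate (glc in bit 0, slc in bit 1, dlc in bit 2, swz in bit 3, following the shift amounts above). The equivalent extraction in plain C++, for reference:

#include <cassert>
#include <cstdint>

struct CachePolicyBits {
  unsigned GLC, SLC, DLC, SWZ;
};

// Mirror of extract_glc/slc/dlc/swz: each field is a single bit of the
// combined cachepolicy operand.
static CachePolicyBits decodeCachePolicy(uint64_t Imm) {
  return {static_cast<unsigned>(Imm & 1),
          static_cast<unsigned>((Imm >> 1) & 1),
          static_cast<unsigned>((Imm >> 2) & 1),
          static_cast<unsigned>((Imm >> 3) & 1)};
}

int main() {
  CachePolicyBits B = decodeCachePolicy(0b1010); // slc and swz set
  assert(B.GLC == 0 && B.SLC == 1 && B.DLC == 0 && B.SWZ == 1);
  return 0;
}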
+//===----------------------------------------------------------------------===//
// Custom Operands
//===----------------------------------------------------------------------===//
@@ -935,7 +977,7 @@ def VOPDstS64orS32 : BoolRC {
}
// SCSrc_i1 is the operand for pseudo instructions only.
-// Boolean immeadiates shall not be exposed to codegen instructions.
+// Boolean immediates shall not be exposed to codegen instructions.
def SCSrc_i1 : RegisterOperand<SReg_1_XEXEC> {
let OperandNamespace = "AMDGPU";
let OperandType = "OPERAND_REG_IMM_INT32";
@@ -1067,6 +1109,7 @@ def SWZ : NamedOperandBit<"SWZ", NamedMatchClass<"SWZ">>;
def UNorm : NamedOperandBit<"UNorm", NamedMatchClass<"UNorm">>;
def DA : NamedOperandBit<"DA", NamedMatchClass<"DA">>;
def R128A16 : NamedOperandBit<"R128A16", NamedMatchClass<"R128A16">>;
+def GFX10A16 : NamedOperandBit<"GFX10A16", NamedMatchClass<"GFX10A16">>;
def D16 : NamedOperandBit<"D16", NamedMatchClass<"D16">>;
def LWE : NamedOperandBit<"LWE", NamedMatchClass<"LWE">>;
def exp_compr : NamedOperandBit<"ExpCompr", NamedMatchClass<"ExpCompr">>;
@@ -1099,9 +1142,9 @@ def blgp : NamedOperandU32<"BLGP", NamedMatchClass<"BLGP">>;
def cbsz : NamedOperandU32<"CBSZ", NamedMatchClass<"CBSZ">>;
def abid : NamedOperandU32<"ABID", NamedMatchClass<"ABID">>;
-def hwreg : NamedOperandU16<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
+def hwreg : NamedOperandU32<"Hwreg", NamedMatchClass<"Hwreg", 0>>;
-def exp_tgt : NamedOperandU8<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
+def exp_tgt : NamedOperandU32<"ExpTgt", NamedMatchClass<"ExpTgt", 0>> {
}
@@ -1274,19 +1317,14 @@ def VOP3Mods : ComplexPattern<untyped, 2, "SelectVOP3Mods">;
def VOP3NoMods : ComplexPattern<untyped, 1, "SelectVOP3NoMods">;
// VOP3Mods, but the input source is known to never be NaN.
def VOP3Mods_nnan : ComplexPattern<fAny, 2, "SelectVOP3Mods_NNaN">;
-// VOP3Mods, but only allowed for f32 operands.
-def VOP3Mods_f32 : ComplexPattern<fAny, 2, "SelectVOP3Mods_f32">;
def VOP3OMods : ComplexPattern<untyped, 3, "SelectVOP3OMods">;
def VOP3PMods : ComplexPattern<untyped, 2, "SelectVOP3PMods">;
-def VOP3PMods0 : ComplexPattern<untyped, 3, "SelectVOP3PMods0">;
def VOP3OpSel : ComplexPattern<untyped, 2, "SelectVOP3OpSel">;
-def VOP3OpSel0 : ComplexPattern<untyped, 3, "SelectVOP3OpSel0">;
def VOP3OpSelMods : ComplexPattern<untyped, 2, "SelectVOP3OpSelMods">;
-def VOP3OpSelMods0 : ComplexPattern<untyped, 3, "SelectVOP3OpSelMods0">;
def VOP3PMadMixMods : ComplexPattern<untyped, 2, "SelectVOP3PMadMixMods">;
@@ -1347,6 +1385,7 @@ def HWREG {
int FLAT_SCR_HI = 21;
int XNACK_MASK = 22;
int POPS_PACKER = 25;
+ int SHADER_CYCLES = 29;
}
class getHwRegImm<int Reg, int Offset = 0, int Size = 32> {
@@ -1380,24 +1419,21 @@ class SIMCInstr <string pseudo, int subtarget> {
// EXP classes
//===----------------------------------------------------------------------===//
-class EXP_Helper<bit done, SDPatternOperator node = null_frag> : EXPCommon<
+class EXP_Helper<bit done> : EXPCommon<
(outs),
(ins exp_tgt:$tgt,
ExpSrc0:$src0, ExpSrc1:$src1, ExpSrc2:$src2, ExpSrc3:$src3,
- exp_vm:$vm, exp_compr:$compr, i8imm:$en),
- "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm",
- [(node (i8 timm:$tgt), (i8 timm:$en),
- f32:$src0, f32:$src1, f32:$src2, f32:$src3,
- (i1 timm:$compr), (i1 timm:$vm))]> {
+ exp_vm:$vm, exp_compr:$compr, i32imm:$en),
+ "exp$tgt $src0, $src1, $src2, $src3"#!if(done, " done", "")#"$compr$vm", []> {
let AsmMatchConverter = "cvtExp";
}
// Split EXP instruction into EXP and EXP_DONE so we can set
// mayLoad for done=1.
-multiclass EXP_m<bit done, SDPatternOperator node> {
+multiclass EXP_m<bit done> {
let mayLoad = done, DisableWQM = 1 in {
let isPseudo = 1, isCodeGenOnly = 1 in {
- def "" : EXP_Helper<done, node>,
+ def "" : EXP_Helper<done>,
SIMCInstr <"exp"#!if(done, "_done", ""), SIEncodingFamily.NONE>;
}
@@ -1685,7 +1721,7 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
!if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel, op_sel_hi:$op_sel_hi,
neg_lo:$neg_lo, neg_hi:$neg_hi),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
@@ -1697,7 +1733,7 @@ class getInsVOP3P <RegisterOperand Src0RC, RegisterOperand Src1RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel, op_sel_hi:$op_sel_hi,
neg_lo:$neg_lo, neg_hi:$neg_hi),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
@@ -1720,7 +1756,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC,
!if (HasClamp,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
@@ -1730,7 +1766,7 @@ class getInsVOP3OpSel <RegisterOperand Src0RC,
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
Src2Mod:$src2_modifiers, Src2RC:$src2,
- clampmod:$clamp,
+ clampmod0:$clamp,
op_sel:$op_sel),
(ins Src0Mod:$src0_modifiers, Src0RC:$src0,
Src1Mod:$src1_modifiers, Src1RC:$src1,
@@ -2242,6 +2278,7 @@ def VOP_F16_F16_F16 : VOPProfile <[f16, f16, f16, untyped]>;
def VOP_F16_F16_I16 : VOPProfile <[f16, f16, i16, untyped]>;
def VOP_F16_F16_I32 : VOPProfile <[f16, f16, i32, untyped]>;
def VOP_I16_I16_I16 : VOPProfile <[i16, i16, i16, untyped]>;
+def VOP_I16_I16_I16_ARITH : VOPProfile <[i16, i16, i16, untyped], 0, /*EnableClamp=*/1>;
def VOP_I16_I16_I16_I16 : VOPProfile <[i16, i16, i16, i16, untyped]>;
def VOP_F16_F16_F16_F16 : VOPProfile <[f16, f16, f16, f16, untyped]>;
@@ -2455,7 +2492,8 @@ def getMCOpcodeGen : InstrMapping {
[!cast<string>(SIEncodingFamily.GFX80)],
[!cast<string>(SIEncodingFamily.GFX9)],
[!cast<string>(SIEncodingFamily.GFX10)],
- [!cast<string>(SIEncodingFamily.SDWA10)]];
+ [!cast<string>(SIEncodingFamily.SDWA10)],
+ [!cast<string>(SIEncodingFamily.GFX10_B)]];
}
// Get equivalent SOPK instruction.
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index d84720f820ee..0c4c9e0e9df2 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1,4 +1,4 @@
-//===-- SIInstructions.td - SI Instruction Defintions ---------------------===//
+//===-- SIInstructions.td - SI Instruction Definitions --------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -24,8 +24,38 @@ include "BUFInstructions.td"
// EXP Instructions
//===----------------------------------------------------------------------===//
-defm EXP : EXP_m<0, AMDGPUexport>;
-defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
+defm EXP : EXP_m<0>;
+defm EXP_DONE : EXP_m<1>;
+
+class ExpPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ (vt ExpSrc2:$src2), (vt ExpSrc3:$src3),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ ExpSrc2:$src2, ExpSrc3:$src3, timm:$vm, 0, timm:$en)
+>;
+
+class ExpComprPattern<ValueType vt, Instruction Inst, int done_val> : GCNPat<
+ (int_amdgcn_exp_compr timm:$tgt, timm:$en,
+ (vt ExpSrc0:$src0), (vt ExpSrc1:$src1),
+ done_val, timm:$vm),
+ (Inst timm:$tgt, ExpSrc0:$src0, ExpSrc1:$src1,
+ (IMPLICIT_DEF), (IMPLICIT_DEF), timm:$vm, 1, timm:$en)
+>;
+
+// FIXME: The generated DAG matcher seems to have strange behavior when
+// matching a 1-bit literal, so use -1 to check for a true 1-bit value.
+def : ExpPattern<i32, EXP, 0>;
+def : ExpPattern<i32, EXP_DONE, -1>;
+def : ExpPattern<f32, EXP, 0>;
+def : ExpPattern<f32, EXP_DONE, -1>;
+
+def : ExpComprPattern<v2i16, EXP, 0>;
+def : ExpComprPattern<v2i16, EXP_DONE, -1>;
+def : ExpComprPattern<v2f16, EXP, 0>;
+def : ExpComprPattern<v2f16, EXP_DONE, -1>;
//===----------------------------------------------------------------------===//
// VINTRP Instructions
@@ -34,9 +64,9 @@ defm EXP_DONE : EXP_m<1, AMDGPUexport_done>;
// Used to inject printing of "_e32" suffix for VI (there are "_e64" variants for VI)
def VINTRPDst : VINTRPDstOperand <VGPR_32>;
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
-// FIXME: Specify SchedRW for VINTRP insturctions.
+// FIXME: Specify SchedRW for VINTRP instructions.
multiclass V_INTERP_P1_F32_m : VINTRP_m <
0x00000000,
@@ -76,10 +106,10 @@ defm V_INTERP_MOV_F32 : VINTRP_m <
(outs VINTRPDst:$vdst),
(ins InterpSlot:$vsrc, Attr:$attr, AttrChan:$attrchan),
"v_interp_mov_f32$vdst, $vsrc, $attr$attrchan",
- [(set f32:$vdst, (int_amdgcn_interp_mov (i32 imm:$vsrc),
+ [(set f32:$vdst, (int_amdgcn_interp_mov (i32 timm:$vsrc),
(i32 timm:$attrchan), (i32 timm:$attr), M0))]>;
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
//===----------------------------------------------------------------------===//
// Pseudo Instructions
@@ -136,7 +166,8 @@ def WWM : PseudoInstSI <(outs unknown:$vdst), (ins unknown:$src0)>;
} // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC]
def ENTER_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> {
- let Defs = [EXEC];
+ let Uses = [EXEC];
+ let Defs = [EXEC, SCC];
let hasSideEffects = 0;
let mayLoad = 0;
let mayStore = 0;
@@ -162,16 +193,27 @@ def V_SET_INACTIVE_B64 : VPseudoInstSI <(outs VReg_64:$vdst),
let Constraints = "$src = $vdst";
}
+let usesCustomInserter = 1, Defs = [VCC, EXEC] in {
+def V_ADD_U64_PSEUDO : VPseudoInstSI <
+ (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64:$vdst, (getDivergentFrag<add>.ret i64:$src0, i64:$src1))]
+>;
+
+def V_SUB_U64_PSEUDO : VPseudoInstSI <
+ (outs VReg_64:$vdst), (ins VSrc_b64:$src0, VSrc_b64:$src1),
+ [(set VReg_64:$vdst, (getDivergentFrag<sub>.ret i64:$src0, i64:$src1))]
+>;
+} // End usesCustomInserter = 1, Defs = [VCC, EXEC]
let usesCustomInserter = 1, Defs = [SCC] in {
def S_ADD_U64_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
- [(set SReg_64:$vdst, (add i64:$src0, i64:$src1))]
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$sdst, (UniformBinFrag<add> i64:$src0, i64:$src1))]
>;
def S_SUB_U64_PSEUDO : SPseudoInstSI <
- (outs SReg_64:$vdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
- [(set SReg_64:$vdst, (sub i64:$src0, i64:$src1))]
+ (outs SReg_64:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1),
+ [(set SReg_64:$sdst, (UniformBinFrag<sub> i64:$src0, i64:$src1))]
>;
def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
@@ -181,6 +223,23 @@ def S_ADD_U64_CO_PSEUDO : SPseudoInstSI <
def S_SUB_U64_CO_PSEUDO : SPseudoInstSI <
(outs SReg_64:$vdst, VOPDstS64orS32:$sdst), (ins SSrc_b64:$src0, SSrc_b64:$src1)
>;
+
+def S_ADD_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_SUB_CO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1, SSrc_i1:$scc_in)
+>;
+
+def S_UADDO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
+def S_USUBO_PSEUDO : SPseudoInstSI <
+ (outs SReg_32:$sdst, SSrc_i1:$scc_out), (ins SSrc_b32:$src0, SSrc_b32:$src1)
+>;
+
} // End usesCustomInserter = 1, Defs = [SCC]
let usesCustomInserter = 1 in {
@@ -199,6 +258,7 @@ class WrapTerminatorInst<SOP_Pseudo base_inst> : SPseudoInstSI<
let hasSideEffects = base_inst.hasSideEffects;
let UseNamedOperandTable = base_inst.UseNamedOperandTable;
let CodeSize = base_inst.CodeSize;
+ let SchedRW = base_inst.SchedRW;
}
let WaveSizePredicate = isWave64 in {
@@ -214,13 +274,14 @@ def S_OR_B32_term : WrapTerminatorInst<S_OR_B32>;
def S_ANDN2_B32_term : WrapTerminatorInst<S_ANDN2_B32>;
}
+
def WAVE_BARRIER : SPseudoInstSI<(outs), (ins),
[(int_amdgcn_wave_barrier)]> {
let SchedRW = [];
let hasNoSchedulingInfo = 1;
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
let isConvergent = 1;
let FixedSize = 1;
let Size = 0;
@@ -318,6 +379,9 @@ multiclass PseudoInstKill <dag ins> {
defm SI_KILL_I1 : PseudoInstKill <(ins SCSrc_i1:$src, i1imm:$killvalue)>;
defm SI_KILL_F32_COND_IMM : PseudoInstKill <(ins VSrc_b32:$src0, i32imm:$src1, i32imm:$cond)>;
+let Defs = [EXEC] in
+def SI_KILL_CLEANUP : SPseudoInstSI <(outs), (ins)>;
+
let Defs = [EXEC,VCC] in
def SI_ILLEGAL_COPY : SPseudoInstSI <
(outs unknown:$dst), (ins unknown:$src),
@@ -386,7 +450,7 @@ def SI_INIT_EXEC_FROM_INPUT : SPseudoInstSI <
def : GCNPat <
(int_amdgcn_init_exec timm:$src),
- (SI_INIT_EXEC_LO (as_i32imm imm:$src))> {
+ (SI_INIT_EXEC_LO (as_i32timm timm:$src))> {
let WaveSizePredicate = isWave32;
}
@@ -413,8 +477,8 @@ def SI_RETURN : SPseudoInstSI <
// Return for returning function calls without output register.
//
-// This version is only needed so we can fill in the output regiter in
-// the custom inserter.
+// This version is only needed so we can fill in the output register
+// in the custom inserter.
def SI_CALL_ISEL : SPseudoInstSI <
(outs), (ins SSrc_b64:$src0, unknown:$callee),
[(AMDGPUcall i64:$src0, tglobaladdr:$callee)]> {
@@ -426,6 +490,11 @@ def SI_CALL_ISEL : SPseudoInstSI <
let isConvergent = 1;
}
+def : GCNPat<
+ (AMDGPUcall i64:$src0, (i64 0)),
+ (SI_CALL_ISEL $src0, (i64 0))
+>;
+
// Wrapper around s_swappc_b64 with extra $callee parameter to track
// the called function after regalloc.
def SI_CALL : SPseudoInstSI <
@@ -480,6 +549,8 @@ def ADJCALLSTACKDOWN : SPseudoInstSI<
let Defs = [M0, EXEC, SCC],
UseNamedOperandTable = 1 in {
+// SI_INDIRECT_SRC/DST are only used by legacy SelectionDAG indirect
+// addressing implementation.
class SI_INDIRECT_SRC<RegisterClass rc> : VPseudoInstSI <
(outs VGPR_32:$vdst),
(ins rc:$src, VS_32:$idx, i32imm:$offset)> {
@@ -493,21 +564,81 @@ class SI_INDIRECT_DST<RegisterClass rc> : VPseudoInstSI <
let usesCustomInserter = 1;
}
-// TODO: We can support indirect SGPR access.
def SI_INDIRECT_SRC_V1 : SI_INDIRECT_SRC<VGPR_32>;
def SI_INDIRECT_SRC_V2 : SI_INDIRECT_SRC<VReg_64>;
def SI_INDIRECT_SRC_V4 : SI_INDIRECT_SRC<VReg_128>;
def SI_INDIRECT_SRC_V8 : SI_INDIRECT_SRC<VReg_256>;
def SI_INDIRECT_SRC_V16 : SI_INDIRECT_SRC<VReg_512>;
+def SI_INDIRECT_SRC_V32 : SI_INDIRECT_SRC<VReg_1024>;
def SI_INDIRECT_DST_V1 : SI_INDIRECT_DST<VGPR_32>;
def SI_INDIRECT_DST_V2 : SI_INDIRECT_DST<VReg_64>;
def SI_INDIRECT_DST_V4 : SI_INDIRECT_DST<VReg_128>;
def SI_INDIRECT_DST_V8 : SI_INDIRECT_DST<VReg_256>;
def SI_INDIRECT_DST_V16 : SI_INDIRECT_DST<VReg_512>;
+def SI_INDIRECT_DST_V32 : SI_INDIRECT_DST<VReg_1024>;
} // End Uses = [EXEC], Defs = [M0, EXEC]
+
+// This is a pseudo variant of the v_movreld_b32 instruction (or of v_mov_b32
+// expected to be executed with gpr indexing mode enabled) in which the vector
+// operand appears only twice, once as def and once as use. Using this pseudo
+// avoids problems with the two-address instruction pass.
+class INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+ RegisterOperand val_ty> : PseudoInstSI <
+ (outs rc:$vdst), (ins rc:$vsrc, val_ty:$val, i32imm:$subreg)> {
+ let Constraints = "$vsrc = $vdst";
+ let Uses = [M0];
+}
+
+class V_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+ INDIRECT_REG_WRITE_pseudo<rc, VSrc_b32> {
+ let VALU = 1;
+ let VOP1 = 1;
+ let Uses = [M0, EXEC];
+}
+
+class S_INDIRECT_REG_WRITE_pseudo<RegisterClass rc,
+ RegisterOperand val_ty> :
+ INDIRECT_REG_WRITE_pseudo<rc, val_ty> {
+ let SALU = 1;
+ let SOP1 = 1;
+ let Uses = [M0];
+}
+
+class S_INDIRECT_REG_WRITE_B32_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b32>;
+class S_INDIRECT_REG_WRITE_B64_pseudo<RegisterClass rc> :
+ S_INDIRECT_REG_WRITE_pseudo<rc, SSrc_b64>;
+
+
+def V_INDIRECT_REG_WRITE_B32_V1 : V_INDIRECT_REG_WRITE_B32_pseudo<VGPR_32>;
+def V_INDIRECT_REG_WRITE_B32_V2 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_64>;
+def V_INDIRECT_REG_WRITE_B32_V3 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_96>;
+def V_INDIRECT_REG_WRITE_B32_V4 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_128>;
+def V_INDIRECT_REG_WRITE_B32_V5 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_160>;
+def V_INDIRECT_REG_WRITE_B32_V8 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_256>;
+def V_INDIRECT_REG_WRITE_B32_V16 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_512>;
+def V_INDIRECT_REG_WRITE_B32_V32 : V_INDIRECT_REG_WRITE_B32_pseudo<VReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B32_V1 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_32>;
+def S_INDIRECT_REG_WRITE_B32_V2 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B32_V3 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_96>;
+def S_INDIRECT_REG_WRITE_B32_V4 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B32_V5 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_160>;
+def S_INDIRECT_REG_WRITE_B32_V8 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B32_V16 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B32_V32 : S_INDIRECT_REG_WRITE_B32_pseudo<SReg_1024>;
+
+def S_INDIRECT_REG_WRITE_B64_V1 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_64>;
+def S_INDIRECT_REG_WRITE_B64_V2 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_128>;
+def S_INDIRECT_REG_WRITE_B64_V4 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_256>;
+def S_INDIRECT_REG_WRITE_B64_V8 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_512>;
+def S_INDIRECT_REG_WRITE_B64_V16 : S_INDIRECT_REG_WRITE_B64_pseudo<SReg_1024>;
+
+
multiclass SI_SPILL_SGPR <RegisterClass sgpr_class> {
let UseNamedOperandTable = 1, SGPRSpill = 1, Uses = [EXEC] in {
def _SAVE : PseudoInstSI <
@@ -535,6 +666,7 @@ defm SI_SPILL_S64 : SI_SPILL_SGPR <SReg_64>;
defm SI_SPILL_S96 : SI_SPILL_SGPR <SReg_96>;
defm SI_SPILL_S128 : SI_SPILL_SGPR <SReg_128>;
defm SI_SPILL_S160 : SI_SPILL_SGPR <SReg_160>;
+defm SI_SPILL_S192 : SI_SPILL_SGPR <SReg_192>;
defm SI_SPILL_S256 : SI_SPILL_SGPR <SReg_256>;
defm SI_SPILL_S512 : SI_SPILL_SGPR <SReg_512>;
defm SI_SPILL_S1024 : SI_SPILL_SGPR <SReg_1024>;
@@ -574,6 +706,7 @@ defm SI_SPILL_V64 : SI_SPILL_VGPR <VReg_64>;
defm SI_SPILL_V96 : SI_SPILL_VGPR <VReg_96>;
defm SI_SPILL_V128 : SI_SPILL_VGPR <VReg_128>;
defm SI_SPILL_V160 : SI_SPILL_VGPR <VReg_160>;
+defm SI_SPILL_V192 : SI_SPILL_VGPR <VReg_192>;
defm SI_SPILL_V256 : SI_SPILL_VGPR <VReg_256>;
defm SI_SPILL_V512 : SI_SPILL_VGPR <VReg_512>;
defm SI_SPILL_V1024 : SI_SPILL_VGPR <VReg_1024>;
@@ -639,12 +772,6 @@ def : GCNPat<
>;
def : Pat <
- // -1.0 as i32 (LowerINTRINSIC_VOID converts all other constants to -1.0)
- (AMDGPUkill (i32 -1082130432)),
- (SI_KILL_I1_PSEUDO (i1 0), 0)
->;
-
-def : Pat <
(int_amdgcn_kill i1:$src),
(SI_KILL_I1_PSEUDO SCSrc_i1:$src, 0)
>;
@@ -655,11 +782,6 @@ def : Pat <
>;
def : Pat <
- (AMDGPUkill i32:$src),
- (SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, 0, 3) // 3 means SETOGE
->;
-
-def : Pat <
(int_amdgcn_kill (i1 (setcc f32:$src, InlineImmFP32:$imm, cond:$cond))),
(SI_KILL_F32_COND_IMM_PSEUDO VSrc_b32:$src, (bitcast_fpimm_to_i32 $imm), (cond_as_i32imm $cond))
>;
@@ -693,14 +815,14 @@ def : RsqPat<V_RSQ_F64_e32, f64>;
def : GCNPat <
(f32 (fsub (f32 (VOP3Mods f32:$x, i32:$mods)),
(f32 (ffloor (f32 (VOP3Mods f32:$x, i32:$mods)))))),
- (V_FRACT_F32_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_FRACT_F32_e64 $mods, $x)
>;
// Convert (x + (-floor(x))) to fract(x)
def : GCNPat <
(f64 (fadd (f64 (VOP3Mods f64:$x, i32:$mods)),
(f64 (fneg (f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))))))),
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_FRACT_F64_e64 $mods, $x)
>;
} // End OtherPredicates = [UnsafeFPMath]
@@ -709,27 +831,27 @@ def : GCNPat <
// f16_to_fp patterns
def : GCNPat <
(f32 (f16_to_fp i32:$src0)),
- (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NONE, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (and_oneuse i32:$src0, 0x7fff))),
- (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (i32 (srl_oneuse (and_oneuse i32:$src0, 0x7fff0000), (i32 16))))),
- (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)), DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.ABS, (i32 (V_LSHRREV_B32_e64 (i32 16), i32:$src0)))
>;
def : GCNPat <
(f32 (f16_to_fp (or_oneuse i32:$src0, 0x8000))),
- (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NEG_ABS, $src0)
>;
def : GCNPat <
(f32 (f16_to_fp (xor_oneuse i32:$src0, 0x8000))),
- (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F32_F16_e64 SRCMODS.NEG, $src0)
>;
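
The folds above rely on the IEEE binary16 layout: bit 15 is the sign bit, so masking with 0x7fff yields |x|, OR-ing 0x8000 yields -|x|, and XOR-ing 0x8000 flips the sign. A minimal host-side sketch of the same bit manipulation, assuming a plain binary16 value held in the low 16 bits of a 32-bit integer:

#include <cstdint>

// Integer ops being folded into source modifiers by the patterns above.
uint32_t f16_abs(uint32_t bits)     { return bits & 0x7fff; } // SRCMODS.ABS
uint32_t f16_neg(uint32_t bits)     { return bits ^ 0x8000; } // SRCMODS.NEG
uint32_t f16_neg_abs(uint32_t bits) { return bits | 0x8000; } // SRCMODS.NEG_ABS
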
def : GCNPat <
@@ -740,7 +862,7 @@ def : GCNPat <
// fp_to_fp16 patterns
def : GCNPat <
(i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
- (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0, DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CVT_F16_F32_e64 $src0_modifiers, f32:$src0)
>;
def : GCNPat <
@@ -767,20 +889,29 @@ def : GCNPat <
// VOP2 Patterns
//===----------------------------------------------------------------------===//
-multiclass FMADPat <ValueType vt, Instruction inst> {
- def : GCNPat <
- (vt (fmad (VOP3NoMods vt:$src0),
- (VOP3NoMods vt:$src1),
- (VOP3NoMods vt:$src2))),
+// TODO: Check only no src2 mods?
+class FMADPat <ValueType vt, Instruction inst, SDPatternOperator node>
+ : GCNPat <(vt (node (vt (VOP3NoMods vt:$src0)),
+ (vt (VOP3NoMods vt:$src1)),
+ (vt (VOP3NoMods vt:$src2)))),
(inst SRCMODS.NONE, $src0, SRCMODS.NONE, $src1,
SRCMODS.NONE, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
- >;
+>;
+
+
+// Prefer mac form when there are no modifiers.
+let AddedComplexity = 9 in {
+def : FMADPat <f32, V_MAC_F32_e64, fmad>;
+def : FMADPat <f32, V_MAC_F32_e64, AMDGPUfmad_ftz>;
+
+let SubtargetPredicate = Has16BitInsts in {
+def : FMADPat <f16, V_MAC_F16_e64, fmad>;
+def : FMADPat <f16, V_MAC_F16_e64, AMDGPUfmad_ftz>;
}
-defm : FMADPat <f16, V_MAC_F16_e64>;
-defm : FMADPat <f32, V_MAC_F32_e64>;
+}
-class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
+class FMADModsPat<ValueType Ty, Instruction inst, SDPatternOperator mad_opr>
: GCNPat<
(Ty (mad_opr (Ty (VOP3Mods Ty:$src0, i32:$src0_mod)),
(Ty (VOP3Mods Ty:$src1, i32:$src1_mod)),
@@ -789,24 +920,28 @@ class FMADModsPat<Instruction inst, SDPatternOperator mad_opr, ValueType Ty>
$src2_mod, $src2, DSTCLAMP.NONE, DSTOMOD.NONE)
>;
-// FIXME: This should select to V_MAC_F32
-def : FMADModsPat<V_MAD_F32, AMDGPUfmad_ftz, f32>;
-def : FMADModsPat<V_MAD_F16, AMDGPUfmad_ftz, f16> {
+let SubtargetPredicate = HasMadMacF32Insts in
+def : FMADModsPat<f32, V_MAD_F32, AMDGPUfmad_ftz>;
+def : FMADModsPat<f16, V_MAD_F16, AMDGPUfmad_ftz> {
let SubtargetPredicate = Has16BitInsts;
}
-multiclass SelectPat <ValueType vt> {
- def : GCNPat <
- (vt (select i1:$src0, (VOP3Mods_f32 vt:$src1, i32:$src1_mods),
- (VOP3Mods_f32 vt:$src2, i32:$src2_mods))),
- (V_CNDMASK_B32_e64 $src2_mods, $src2, $src1_mods, $src1, $src0)
- >;
-}
+class VOPSelectModsPat <ValueType vt> : GCNPat <
+ (vt (select i1:$src0, (VOP3Mods vt:$src1, i32:$src1_mods),
+ (VOP3Mods vt:$src2, i32:$src2_mods))),
+ (V_CNDMASK_B32_e64 FP32InputMods:$src2_mods, VSrc_b32:$src2,
+ FP32InputMods:$src1_mods, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
+
+class VOPSelectPat <ValueType vt> : GCNPat <
+ (vt (select i1:$src0, vt:$src1, vt:$src2)),
+ (V_CNDMASK_B32_e64 0, VSrc_b32:$src2, 0, VSrc_b32:$src1, SSrc_i1:$src0)
+>;
-defm : SelectPat <i16>;
-defm : SelectPat <i32>;
-defm : SelectPat <f16>;
-defm : SelectPat <f32>;
+def : VOPSelectModsPat <i32>;
+def : VOPSelectModsPat <f32>;
+def : VOPSelectPat <f16>;
+def : VOPSelectPat <i16>;
let AddedComplexity = 1 in {
def : GCNPat <
@@ -1039,6 +1174,8 @@ def : BitConvert <v4f32, v2f64, VReg_128>;
def : BitConvert <v4i32, v2f64, VReg_128>;
def : BitConvert <v2i64, v2f64, VReg_128>;
def : BitConvert <v2f64, v2i64, VReg_128>;
+def : BitConvert <v4f32, v2i64, VReg_128>;
+def : BitConvert <v2i64, v4f32, VReg_128>;
// 160-bit bitcast
def : BitConvert <v5i32, v5f32, SGPR_160>;
@@ -1049,14 +1186,46 @@ def : BitConvert <v8i32, v8f32, SReg_256>;
def : BitConvert <v8f32, v8i32, SReg_256>;
def : BitConvert <v8i32, v8f32, VReg_256>;
def : BitConvert <v8f32, v8i32, VReg_256>;
+def : BitConvert <v4i64, v4f64, VReg_256>;
+def : BitConvert <v4f64, v4i64, VReg_256>;
+def : BitConvert <v4i64, v8i32, VReg_256>;
+def : BitConvert <v4i64, v8f32, VReg_256>;
+def : BitConvert <v4f64, v8i32, VReg_256>;
+def : BitConvert <v4f64, v8f32, VReg_256>;
+def : BitConvert <v8i32, v4i64, VReg_256>;
+def : BitConvert <v8f32, v4i64, VReg_256>;
+def : BitConvert <v8i32, v4f64, VReg_256>;
+def : BitConvert <v8f32, v4f64, VReg_256>;
+
// 512-bit bitcast
def : BitConvert <v16i32, v16f32, VReg_512>;
def : BitConvert <v16f32, v16i32, VReg_512>;
+def : BitConvert <v8i64, v8f64, VReg_512>;
+def : BitConvert <v8f64, v8i64, VReg_512>;
+def : BitConvert <v8i64, v16i32, VReg_512>;
+def : BitConvert <v8f64, v16i32, VReg_512>;
+def : BitConvert <v16i32, v8i64, VReg_512>;
+def : BitConvert <v16i32, v8f64, VReg_512>;
+def : BitConvert <v8i64, v16f32, VReg_512>;
+def : BitConvert <v8f64, v16f32, VReg_512>;
+def : BitConvert <v16f32, v8i64, VReg_512>;
+def : BitConvert <v16f32, v8f64, VReg_512>;
// 1024-bit bitcast
def : BitConvert <v32i32, v32f32, VReg_1024>;
def : BitConvert <v32f32, v32i32, VReg_1024>;
+def : BitConvert <v16i64, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v16i64, VReg_1024>;
+def : BitConvert <v16i64, v32i32, VReg_1024>;
+def : BitConvert <v32i32, v16i64, VReg_1024>;
+def : BitConvert <v16f64, v32f32, VReg_1024>;
+def : BitConvert <v32f32, v16f64, VReg_1024>;
+def : BitConvert <v16i64, v32f32, VReg_1024>;
+def : BitConvert <v32i32, v16f64, VReg_1024>;
+def : BitConvert <v16f64, v32i32, VReg_1024>;
+def : BitConvert <v32f32, v16i64, VReg_1024>;
+
/********** =================== **********/
/********** Src & Dst modifiers **********/
@@ -1155,7 +1324,7 @@ def : GCNPat <
(S_OR_B32 SReg_32:$src, (S_MOV_B32 (i32 0x80008000))) // Set sign bit
>;
-// FIXME: The implicit-def of scc from S_[X]OR_B32 is mishandled
+// FIXME: The implicit-def of scc from S_[X]OR/AND_B32 is mishandled
// def : GCNPat <
// (fneg (f64 SReg_64:$src)),
// (REG_SEQUENCE SReg_64,
@@ -1176,6 +1345,17 @@ def : GCNPat <
// sub1)
// >;
+// FIXME: Use S_BITSET0_B32/B64?
+// def : GCNPat <
+// (fabs (f64 SReg_64:$src)),
+// (REG_SEQUENCE SReg_64,
+// (i32 (EXTRACT_SUBREG SReg_64:$src, sub0)),
+// sub0,
+// (S_AND_B32 (i32 (EXTRACT_SUBREG SReg_64:$src, sub1)),
+// (i32 (S_MOV_B32 (i32 0x7fffffff)))),
+// sub1)
+// >;
+
} // End let AddedComplexity = 1
def : GCNPat <
@@ -1372,11 +1552,12 @@ class Ext32Pat <SDNode ext> : GCNPat <
def : Ext32Pat <zext>;
def : Ext32Pat <anyext>;
-// The multiplication scales from [0,1] to the unsigned integer range
+// The multiplication scales from [0,1) to the unsigned integer range,
+// rounding down a bit to avoid unwanted overflow.
def : GCNPat <
(AMDGPUurecip i32:$src0),
(V_CVT_U32_F32_e32
- (V_MUL_F32_e32 (i32 CONST.FP_UINT_MAX_PLUS_1),
+ (V_MUL_F32_e32 (i32 CONST.FP_4294966784),
(V_RCP_IFLAG_F32_e32 (V_CVT_F32_U32_e32 $src0))))
>;
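
A rough host-side model of the expansion above (an assumption for illustration only: the real v_rcp_iflag is an approximation, not an exact IEEE divide). The constant matters because 4294966784.0f == 2^32 - 512 is exactly representable in f32, so even when the reciprocal comes out as 1.0 (src0 == 1) the product stays below 2^32 and the conversion to u32 does not overflow:

#include <cstdint>

uint32_t urecip_model(uint32_t x) {
  float r = 1.0f / (float)x;          // ~ V_RCP_IFLAG_F32 (V_CVT_F32_U32 x)
  float scaled = r * 4294966784.0f;   // ~ V_MUL_F32 with CONST.FP_4294966784
  return (uint32_t)scaled;            // ~ V_CVT_U32_F32, never reaches 2^32
}
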
@@ -1421,11 +1602,13 @@ defm : SI_INDIRECT_Pattern <v2f32, f32, "V2">;
defm : SI_INDIRECT_Pattern <v4f32, f32, "V4">;
defm : SI_INDIRECT_Pattern <v8f32, f32, "V8">;
defm : SI_INDIRECT_Pattern <v16f32, f32, "V16">;
+defm : SI_INDIRECT_Pattern <v32f32, f32, "V32">;
defm : SI_INDIRECT_Pattern <v2i32, i32, "V2">;
defm : SI_INDIRECT_Pattern <v4i32, i32, "V4">;
defm : SI_INDIRECT_Pattern <v8i32, i32, "V8">;
defm : SI_INDIRECT_Pattern <v16i32, i32, "V16">;
+defm : SI_INDIRECT_Pattern <v32i32, i32, "V32">;
//===----------------------------------------------------------------------===//
// SAD Patterns
@@ -1695,102 +1878,187 @@ def : GCNPat <
def : GCNPat <
(i32 (bswap i32:$a)),
(V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
- (V_ALIGNBIT_B32 $a, $a, (i32 24)),
- (V_ALIGNBIT_B32 $a, $a, (i32 8)))
+ (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 24)),
+ (V_ALIGNBIT_B32 VSrc_b32:$a, VSrc_b32:$a, (i32 8)))
>;
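
A small host-side check of the expansion above, under my reading of the instructions used: V_ALIGNBIT_B32 a, a, n acts as a rotate-right by n bits, and V_BFI_B32 mask, x, y computes (x & mask) | (y & ~mask). The 0x00ff00ff mask then takes bytes 0 and 2 from the rotate-by-24 and bytes 1 and 3 from the rotate-by-8, which is exactly a byte swap:

#include <cstdint>

static uint32_t rotr32(uint32_t a, unsigned n) { return (a >> n) | (a << (32 - n)); }
static uint32_t bfi(uint32_t mask, uint32_t x, uint32_t y) { return (x & mask) | (y & ~mask); }

uint32_t bswap32_model(uint32_t a) {
  return bfi(0x00ff00ff, rotr32(a, 24), rotr32(a, 8));
}
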
-let OtherPredicates = [NoFP16Denormals] in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+ (i64 (bswap i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 24)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub1)),
+ (i32 8))),
+ sub0,
+ (V_BFI_B32 (S_MOV_B32 (i32 0x00ff00ff)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 24)),
+ (V_ALIGNBIT_B32 (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 (EXTRACT_SUBREG VReg_64:$a, sub0)),
+ (i32 8))),
+ sub1)
+>;
+
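The REG_SEQUENCE above implements the 64-bit swap by byte-swapping each 32-bit half and exchanging the halves: the swapped high half lands in sub0 and the swapped low half in sub1. The same decomposition in host code, as an illustration only (using the compiler builtin for the 32-bit swap):

#include <cstdint>

uint64_t bswap64_model(uint64_t v) {
  uint32_t lo = (uint32_t)v;
  uint32_t hi = (uint32_t)(v >> 32);
  return ((uint64_t)__builtin_bswap32(lo) << 32) | __builtin_bswap32(hi);
}
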
+// FIXME: The AddedComplexity should not be needed, but in GlobalISel
+// the BFI pattern ends up taking precedence without it.
+let SubtargetPredicate = isGFX8Plus, AddedComplexity = 1 in {
+// Magic number: 3 | (2 << 8) | (1 << 16) | (0 << 24)
+//
+// My reading of the manual suggests we should be using src0 for the
+// register value, but this is what seems to work.
+def : GCNPat <
+ (i32 (bswap i32:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x00010203)))
>;
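
A host-side model of how the selector drives V_PERM_B32 in the patterns above and below (an assumption based on the magic-number comments: selector bytes 0-3 pick bytes of src1, 4-7 pick bytes of src0, and 12 emits 0x00; other selector values exist in the ISA but are not used here):

#include <cstdint>

static uint32_t perm_model(uint32_t src0, uint32_t src1, uint32_t sel) {
  uint32_t result = 0;
  for (int i = 0; i < 4; ++i) {
    uint32_t s = (sel >> (8 * i)) & 0xff;
    uint32_t byte = 0;                          // selector 12 emits 0x00
    if (s < 4)
      byte = (src1 >> (8 * s)) & 0xff;          // bytes of src1
    else if (s < 8)
      byte = (src0 >> (8 * (s - 4))) & 0xff;    // bytes of src0
    result |= byte << (8 * i);
  }
  return result;
}

// perm_model(0, x, 0x00010203) byte-swaps x, and perm_model(0, x, 0x0c0c0001)
// byte-swaps the low 16 bits and zero-extends, matching the i16 pattern below.
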
-def : GCNPat<
- (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
- (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src, 0, 0)
+// FIXME: This should have been narrowed to i32 during legalization.
+// This pattern should also be skipped for GlobalISel
+def : GCNPat <
+ (i64 (bswap i64:$a)),
+ (REG_SEQUENCE VReg_64,
+ (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub1),
+ (S_MOV_B32 (i32 0x00010203))),
+ sub0,
+ (V_PERM_B32 (i32 0), (EXTRACT_SUBREG VReg_64:$a, sub0),
+ (S_MOV_B32 (i32 0x00010203))),
+ sub1)
>;
-def : GCNPat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (12 << 16) | (12 << 24)
+// The 12s emit 0s.
+def : GCNPat <
+ (i16 (bswap i16:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
-}
-let OtherPredicates = [FP16Denormals] in {
-def : GCNPat<
- (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
- (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+def : GCNPat <
+ (i32 (zext (bswap i16:$a))),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x0c0c0001)))
>;
-let SubtargetPredicate = HasVOP3PInsts in {
-def : GCNPat<
- (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
- (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)
+// Magic number: 1 | (0 << 8) | (3 << 16) | (2 << 24)
+def : GCNPat <
+ (v2i16 (bswap v2i16:$a)),
+ (V_PERM_B32 (i32 0), VSrc_b32:$a, (S_MOV_B32 (i32 0x02030001)))
>;
+
}
-}
-let OtherPredicates = [NoFP32Denormals] in {
+
+// Prefer selecting to max when legal, but using mul is always valid.
+let AddedComplexity = -5 in {
def : GCNPat<
- (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src, 0, 0)
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_ONE), $src_mods, $src)
>;
def : GCNPat<
- (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
- (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src, 0, 0)
+ (fcanonicalize (f16 (fneg (VOP3Mods f16:$src, i32:$src_mods)))),
+ (V_MUL_F16_e64 0, (i32 CONST.FP16_NEG_ONE), $src_mods, $src)
+>;
+
+def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MUL_F16 0, (i32 CONST.FP16_ONE), $src_mods, $src, DSTCLAMP.NONE)
>;
-}
-let OtherPredicates = [FP32Denormals] in {
def : GCNPat<
(fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
- (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src, 0, 0)
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_ONE), $src_mods, $src)
>;
-}
-let OtherPredicates = [NoFP64Denormals] in {
def : GCNPat<
- (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src, 0, 0)
+ (fcanonicalize (f32 (fneg (VOP3Mods f32:$src, i32:$src_mods)))),
+ (V_MUL_F32_e64 0, (i32 CONST.FP32_NEG_ONE), $src_mods, $src)
>;
-}
-let OtherPredicates = [FP64Denormals] in {
+// TODO: Handle fneg like other types.
def : GCNPat<
(fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
- (V_MAX_F64 $src_mods, $src, $src_mods, $src, 0, 0)
+ (V_MUL_F64 0, CONST.FP64_ONE, $src_mods, $src)
>;
+} // End AddedComplexity = -5
+
+multiclass SelectCanonicalizeAsMax<
+ list<Predicate> f32_preds = [],
+ list<Predicate> f64_preds = [],
+ list<Predicate> f16_preds = []> {
+ def : GCNPat<
+ (fcanonicalize (f32 (VOP3Mods f32:$src, i32:$src_mods))),
+ (V_MAX_F32_e64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = f32_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f64 (VOP3Mods f64:$src, i32:$src_mods))),
+ (V_MAX_F64 $src_mods, $src, $src_mods, $src)> {
+ let OtherPredicates = f64_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (f16 (VOP3Mods f16:$src, i32:$src_mods))),
+ (V_MAX_F16_e64 $src_mods, $src, $src_mods, $src, 0, 0)> {
+ // FIXME: Should have 16-bit inst subtarget predicate
+ let OtherPredicates = f16_preds;
+ }
+
+ def : GCNPat<
+ (fcanonicalize (v2f16 (VOP3PMods v2f16:$src, i32:$src_mods))),
+ (V_PK_MAX_F16 $src_mods, $src, $src_mods, $src, DSTCLAMP.NONE)> {
+ // FIXME: Should have VOP3P subtarget predicate
+ let OtherPredicates = f16_preds;
+ }
}
+// On pre-gfx9 targets, v_max_*/v_min_* did not respect the denormal
+// mode, and would never flush. For f64, it's faster to implement
+// this with a max. For f16/f32 it's a wash, but prefer max when
+// valid.
+//
+// FIXME: Lowering f32/f16 with max is worse since we can use a
+// smaller encoding if the input is fneg'd. It also adds an extra
+// register use.
+let SubtargetPredicate = HasMinMaxDenormModes in {
+ defm : SelectCanonicalizeAsMax<[], [], []>;
+} // End SubtargetPredicate = HasMinMaxDenormModes
+
+let SubtargetPredicate = NotHasMinMaxDenormModes in {
+ // Use the max lowering if we don't need to flush.
+
+ // FIXME: We don't use this for f32 as a workaround for the
+ // library being compiled with the default ieee mode, but
+ // potentially being called from flushing kernels. Really we should
+ // not be mixing code expecting different default FP modes, but mul
+ // works in any FP environment.
+ defm : SelectCanonicalizeAsMax<[FalsePredicate], [FP64Denormals], [FP16Denormals]>;
+} // End SubtargetPredicate = NotHasMinMaxDenormModes
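
The predicates above encode a small decision: on subtargets where max/min honor the denormal mode, always canonicalize with max; otherwise use max only for f64/f16 when the corresponding denormals are enabled, keeping the mul form for f32 because of the library FP-mode mismatch described in the FIXME. A simplified sketch of that choice (names are illustrative, not actual APIs):

enum class CanonLowering { Mul, Max };

CanonLowering pickCanonLowering(bool HasMinMaxDenormModes,
                                bool TypeDenormalsEnabled, bool IsF32) {
  if (HasMinMaxDenormModes)
    return CanonLowering::Max;   // max always respects the denormal mode
  if (TypeDenormalsEnabled && !IsF32)
    return CanonLowering::Max;   // f64/f16 with denormals enabled
  return CanonLowering::Mul;     // mul by 1.0 is always a valid canonicalize
}
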
+
+
let OtherPredicates = [HasDLInsts] in {
def : GCNPat <
- (fma (f32 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (fma (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f32 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f32 (VOP3NoMods f32:$src2))),
(V_FMAC_F32_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
+ SRCMODS.NONE, $src2)
>;
} // End OtherPredicates = [HasDLInsts]
let SubtargetPredicate = isGFX10Plus in
def : GCNPat <
- (fma (f16 (VOP3Mods0 f32:$src0, i32:$src0_modifiers, i1:$clamp, i32:$omod)),
+ (fma (f16 (VOP3Mods f32:$src0, i32:$src0_modifiers)),
(f16 (VOP3Mods f32:$src1, i32:$src1_modifiers)),
(f16 (VOP3NoMods f32:$src2))),
(V_FMAC_F16_e64 $src0_modifiers, $src0, $src1_modifiers, $src1,
- SRCMODS.NONE, $src2, $clamp, $omod)
->;
-
-// Allow integer inputs
-class ExpPattern<SDPatternOperator node, ValueType vt, Instruction Inst> : GCNPat<
- (node (i8 timm:$tgt), (i8 timm:$en), vt:$src0, vt:$src1, vt:$src2, vt:$src3, (i1 timm:$compr), (i1 timm:$vm)),
- (Inst i8:$tgt, vt:$src0, vt:$src1, vt:$src2, vt:$src3, i1:$vm, i1:$compr, i8:$en)
+ SRCMODS.NONE, $src2)
>;
-def : ExpPattern<AMDGPUexport, i32, EXP>;
-def : ExpPattern<AMDGPUexport_done, i32, EXP_DONE>;
-
// COPY is a workaround for a tablegen bug with the multiple outputs
// from S_LSHL_B32's implicit scc def.
def : GCNPat <
@@ -1873,19 +2141,20 @@ def : GCNPat <
>;
def : GCNPat <
- (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
- timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i64 (int_amdgcn_mov_dpp i64:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B64_DPP_PSEUDO VReg_64:$src, VReg_64:$src,
+ (as_i32timm $dpp_ctrl), (as_i32timm $row_mask),
+ (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
def : GCNPat <
(i64 (int_amdgcn_update_dpp i64:$old, i64:$src, timm:$dpp_ctrl, timm:$row_mask,
timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B64_DPP_PSEUDO $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (V_MOV_B64_DPP_PSEUDO VReg_64:$old, VReg_64:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
//===----------------------------------------------------------------------===//
@@ -1901,6 +2170,11 @@ let SubtargetPredicate = isGFX6 in {
// fract(x) = isnan(x) ? x : min(V_FRACT(x), 0.99999999999999999)
// Convert floor(x) to (x - fract(x))
+
+// Don't bother handling this for GlobalISel; it's handled during
+// lowering.
+//
+// FIXME: DAG should also custom lower this.
def : GCNPat <
(f64 (ffloor (f64 (VOP3Mods f64:$x, i32:$mods)))),
(V_ADD_F64
@@ -1910,13 +2184,11 @@ def : GCNPat <
(V_CNDMASK_B64_PSEUDO
(V_MIN_F64
SRCMODS.NONE,
- (V_FRACT_F64_e64 $mods, $x, DSTCLAMP.NONE, DSTOMOD.NONE),
+ (V_FRACT_F64_e64 $mods, $x),
SRCMODS.NONE,
- (V_MOV_B64_PSEUDO 0x3fefffffffffffff),
- DSTCLAMP.NONE, DSTOMOD.NONE),
+ (V_MOV_B64_PSEUDO 0x3fefffffffffffff)),
$x,
- (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))),
- DSTCLAMP.NONE, DSTOMOD.NONE)
+ (V_CMP_CLASS_F64_e64 SRCMODS.NONE, $x, (i32 3 /*NaN*/))))
>;
} // End SubtargetPredicates = isGFX6
@@ -2061,13 +2333,164 @@ def G_AMDGPU_FFBH_U32 : AMDGPUGenericInstruction {
let hasSideEffects = 0;
}
+def G_AMDGPU_RCP_IFLAG : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$src);
+ let hasSideEffects = 0;
+}
+
+class BufferLoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+class TBufferLoadGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset, untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+}
+
+def G_AMDGPU_BUFFER_LOAD_UBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SBYTE : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_USHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_SSHORT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT : BufferLoadGenericInstruction;
+def G_AMDGPU_BUFFER_LOAD_FORMAT_D16 : BufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT : TBufferLoadGenericInstruction;
+def G_AMDGPU_TBUFFER_LOAD_FORMAT_D16 : TBufferLoadGenericInstruction;
+
+class BufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+class TBufferStoreGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$format,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
+
+def G_AMDGPU_BUFFER_STORE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_BYTE : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_SHORT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT : BufferStoreGenericInstruction;
+def G_AMDGPU_BUFFER_STORE_FORMAT_D16 : BufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT : TBufferStoreGenericInstruction;
+def G_AMDGPU_TBUFFER_STORE_FORMAT_D16 : TBufferStoreGenericInstruction;
+
+def G_AMDGPU_FMIN_LEGACY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+def G_AMDGPU_FMAX_LEGACY : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0, type0:$src1);
+ let hasSideEffects = 0;
+}
+
+foreach N = 0-3 in {
+def G_AMDGPU_CVT_F32_UBYTE#N : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$src0);
+ let hasSideEffects = 0;
+}
+}
+
// Atomic cmpxchg. $cmpval and $newval are packed in a single vector
// operand. Expects a MachineMemOperand in addition to the explicit
// operands.
def G_AMDGPU_ATOMIC_CMPXCHG : AMDGPUGenericInstruction {
let OutOperandList = (outs type0:$oldval);
- let InOperandList = (ins ptype1:$addr, type0:$cmpval_nnenwval);
+ let InOperandList = (ins ptype1:$addr, type0:$cmpval_newval);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+let Namespace = "AMDGPU" in {
+def G_AMDGPU_ATOMIC_INC : G_ATOMICRMW_OP;
+def G_AMDGPU_ATOMIC_DEC : G_ATOMICRMW_OP;
+}
+
+class BufferAtomicGenericInstruction : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type1:$rsrc, type2:$vindex, type2:$voffset,
+ type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
let hasSideEffects = 0;
let mayLoad = 1;
let mayStore = 1;
}
+
+def G_AMDGPU_BUFFER_ATOMIC_SWAP : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_ADD : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SUB : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMIN : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_SMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_UMAX : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_AND : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_OR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_XOR : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_INC : BufferAtomicGenericInstruction;
+def G_AMDGPU_BUFFER_ATOMIC_DEC : BufferAtomicGenericInstruction;
+
+def G_AMDGPU_BUFFER_ATOMIC_CMPSWAP : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type0:$vdata, type0:$cmp, type1:$rsrc, type2:$vindex,
+ type2:$voffset, type2:$soffset, untyped_imm_0:$offset,
+ untyped_imm_0:$cachepolicy, untyped_imm_0:$idxen);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 1;
+}
+
+// Wrapper around llvm.amdgcn.s.buffer.load. This is mostly needed as
+// a workaround for the intrinsic being defined as readnone even though
+// it really needs a memory operand.
+def G_AMDGPU_S_BUFFER_LOAD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins type1:$rsrc, type2:$offset, untyped_imm_0:$cachepolicy);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+ let mayStore = 0;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_LOAD : AMDGPUGenericInstruction {
+ let OutOperandList = (outs type0:$dst);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayLoad = 1;
+
+ // FIXME: Use separate opcode for atomics.
+ let mayStore = 1;
+}
+
+// This is equivalent to the G_INTRINSIC*, but the operands may have
+// been legalized depending on the subtarget requirements.
+def G_AMDGPU_INTRIN_IMAGE_STORE : AMDGPUGenericInstruction {
+ let OutOperandList = (outs);
+ let InOperandList = (ins unknown:$intrin, variable_ops);
+ let hasSideEffects = 0;
+ let mayStore = 1;
+}
diff --git a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
index d2b1abc8a9fb..2eb1c52f1b59 100644
--- a/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SILoadStoreOptimizer.cpp
@@ -103,15 +103,19 @@ enum InstClassEnum {
TBUFFER_STORE,
};
-enum RegisterEnum {
- SBASE = 0x1,
- SRSRC = 0x2,
- SOFFSET = 0x4,
- VADDR = 0x8,
- ADDR = 0x10,
- SSAMP = 0x20,
+struct AddressRegs {
+ unsigned char NumVAddrs = 0;
+ bool SBase = false;
+ bool SRsrc = false;
+ bool SOffset = false;
+ bool VAddr = false;
+ bool Addr = false;
+ bool SSamp = false;
};
+// GFX10 image_sample instructions can have 12 vaddrs + srsrc + ssamp.
+const unsigned MaxAddressRegs = 12 + 1 + 1;
+
class SILoadStoreOptimizer : public MachineFunctionPass {
struct CombineInfo {
MachineBasicBlock::iterator I;
@@ -126,10 +130,10 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
bool SLC;
bool DLC;
bool UseST64;
- SmallVector<MachineInstr *, 8> InstsToMove;
- int AddrIdx[5];
- const MachineOperand *AddrReg[5];
+ int AddrIdx[MaxAddressRegs];
+ const MachineOperand *AddrReg[MaxAddressRegs];
unsigned NumAddresses;
+ unsigned Order;
bool hasSameBaseAddress(const MachineInstr &MI) {
for (unsigned i = 0; i < NumAddresses; i++) {
@@ -183,8 +187,8 @@ class SILoadStoreOptimizer : public MachineFunctionPass {
};
struct BaseRegisters {
- unsigned LoReg = 0;
- unsigned HiReg = 0;
+ Register LoReg;
+ Register HiReg;
unsigned LoSubReg = 0;
unsigned HiSubReg = 0;
@@ -201,7 +205,6 @@ private:
const GCNSubtarget *STM = nullptr;
const SIInstrInfo *TII = nullptr;
const SIRegisterInfo *TRI = nullptr;
- const MCSubtargetInfo *STI = nullptr;
MachineRegisterInfo *MRI = nullptr;
AliasAnalysis *AA = nullptr;
bool OptimizeAgain;
@@ -209,9 +212,9 @@ private:
static bool dmasksCanBeCombined(const CombineInfo &CI,
const SIInstrInfo &TII,
const CombineInfo &Paired);
- static bool offsetsCanBeCombined(CombineInfo &CI, const MCSubtargetInfo &STI,
- CombineInfo &Paired);
- static bool widthsFit(const GCNSubtarget &STM, const CombineInfo &CI,
+ static bool offsetsCanBeCombined(CombineInfo &CI, const GCNSubtarget &STI,
+ CombineInfo &Paired, bool Modify = false);
+ static bool widthsFit(const GCNSubtarget &STI, const CombineInfo &CI,
const CombineInfo &Paired);
static unsigned getNewOpcode(const CombineInfo &CI, const CombineInfo &Paired);
static std::pair<unsigned, unsigned> getSubRegIdxs(const CombineInfo &CI,
@@ -219,25 +222,42 @@ private:
const TargetRegisterClass *getTargetRegisterClass(const CombineInfo &CI,
const CombineInfo &Paired);
- bool findMatchingInst(CombineInfo &CI, CombineInfo &Paired);
+ bool checkAndPrepareMerge(CombineInfo &CI, CombineInfo &Paired,
+ SmallVectorImpl<MachineInstr *> &InstsToMove);
unsigned read2Opcode(unsigned EltSize) const;
unsigned read2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired);
+ MachineBasicBlock::iterator mergeRead2Pair(CombineInfo &CI,
+ CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
unsigned write2Opcode(unsigned EltSize) const;
unsigned write2ST64Opcode(unsigned EltSize) const;
- MachineBasicBlock::iterator mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeImagePair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired);
- MachineBasicBlock::iterator mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired);
-
- void updateBaseAndOffset(MachineInstr &I, unsigned NewBase,
+ MachineBasicBlock::iterator
+ mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+ MachineBasicBlock::iterator
+ mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove);
+
+ void updateBaseAndOffset(MachineInstr &I, Register NewBase,
int32_t NewOffset) const;
- unsigned computeBase(MachineInstr &MI, const MemAddress &Addr) const;
+ Register computeBase(MachineInstr &MI, const MemAddress &Addr) const;
MachineOperand createRegOrImm(int32_t Val, MachineInstr &MI) const;
Optional<int32_t> extractConstOffset(const MachineOperand &Op) const;
void processBaseWithConstOffset(const MachineOperand &Base, MemAddress &Addr) const;
@@ -249,8 +269,11 @@ private:
SmallPtrSet<MachineInstr *, 4> &Promoted) const;
void addInstToMergeableList(const CombineInfo &CI,
std::list<std::list<CombineInfo> > &MergeableInsts) const;
- bool collectMergeableInsts(MachineBasicBlock &MBB,
- std::list<std::list<CombineInfo> > &MergeableInsts) const;
+
+ std::pair<MachineBasicBlock::iterator, bool> collectMergeableInsts(
+ MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+ MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+ std::list<std::list<CombineInfo>> &MergeableInsts) const;
public:
static char ID;
@@ -259,8 +282,6 @@ public:
initializeSILoadStoreOptimizerPass(*PassRegistry::getPassRegistry());
}
- void removeCombinedInst(std::list<CombineInfo> &MergeList,
- const MachineInstr &MI);
bool optimizeInstsWithSameBaseAddr(std::list<CombineInfo> &MergeList,
bool &OptimizeListAgain);
bool optimizeBlock(std::list<std::list<CombineInfo> > &MergeableInsts);
@@ -275,6 +296,11 @@ public:
MachineFunctionPass::getAnalysisUsage(AU);
}
+
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties()
+ .set(MachineFunctionProperties::Property::IsSSA);
+ }
};
static unsigned getOpcodeWidth(const MachineInstr &MI, const SIInstrInfo &TII) {
@@ -327,7 +353,8 @@ static InstClassEnum getInstClass(unsigned Opc, const SIInstrInfo &TII) {
}
if (TII.isMIMG(Opc)) {
// Ignore instructions encoded without vaddr.
- if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1)
+ if (AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr) == -1 &&
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) == -1)
return UNKNOWN;
// TODO: Support IMAGE_GET_RESINFO and IMAGE_GET_LOD.
if (TII.get(Opc).mayStore() || !TII.get(Opc).mayLoad() ||
@@ -400,58 +427,54 @@ static unsigned getInstSubclass(unsigned Opc, const SIInstrInfo &TII) {
}
}
-static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
- if (TII.isMUBUF(Opc)) {
- unsigned result = 0;
+static AddressRegs getRegs(unsigned Opc, const SIInstrInfo &TII) {
+ AddressRegs Result;
- if (AMDGPU::getMUBUFHasVAddr(Opc)) {
- result |= VADDR;
- }
-
- if (AMDGPU::getMUBUFHasSrsrc(Opc)) {
- result |= SRSRC;
- }
-
- if (AMDGPU::getMUBUFHasSoffset(Opc)) {
- result |= SOFFSET;
- }
-
- return result;
+ if (TII.isMUBUF(Opc)) {
+ if (AMDGPU::getMUBUFHasVAddr(Opc))
+ Result.VAddr = true;
+ if (AMDGPU::getMUBUFHasSrsrc(Opc))
+ Result.SRsrc = true;
+ if (AMDGPU::getMUBUFHasSoffset(Opc))
+ Result.SOffset = true;
+
+ return Result;
}
if (TII.isMIMG(Opc)) {
- unsigned result = VADDR | SRSRC;
+ int VAddr0Idx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0);
+ if (VAddr0Idx >= 0) {
+ int SRsrcIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ Result.NumVAddrs = SRsrcIdx - VAddr0Idx;
+ } else {
+ Result.VAddr = true;
+ }
+ Result.SRsrc = true;
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(Opc);
if (Info && AMDGPU::getMIMGBaseOpcodeInfo(Info->BaseOpcode)->Sampler)
- result |= SSAMP;
+ Result.SSamp = true;
- return result;
+ return Result;
}
if (TII.isMTBUF(Opc)) {
- unsigned result = 0;
-
- if (AMDGPU::getMTBUFHasVAddr(Opc)) {
- result |= VADDR;
- }
-
- if (AMDGPU::getMTBUFHasSrsrc(Opc)) {
- result |= SRSRC;
- }
-
- if (AMDGPU::getMTBUFHasSoffset(Opc)) {
- result |= SOFFSET;
- }
-
- return result;
+ if (AMDGPU::getMTBUFHasVAddr(Opc))
+ Result.VAddr = true;
+ if (AMDGPU::getMTBUFHasSrsrc(Opc))
+ Result.SRsrc = true;
+ if (AMDGPU::getMTBUFHasSoffset(Opc))
+ Result.SOffset = true;
+
+ return Result;
}
switch (Opc) {
default:
- return 0;
+ return Result;
case AMDGPU::S_BUFFER_LOAD_DWORD_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX2_IMM:
case AMDGPU::S_BUFFER_LOAD_DWORDX4_IMM:
- return SBASE;
+ Result.SBase = true;
+ return Result;
case AMDGPU::DS_READ_B32:
case AMDGPU::DS_READ_B64:
case AMDGPU::DS_READ_B32_gfx9:
@@ -460,7 +483,8 @@ static unsigned getRegs(unsigned Opc, const SIInstrInfo &TII) {
case AMDGPU::DS_WRITE_B64:
case AMDGPU::DS_WRITE_B32_gfx9:
case AMDGPU::DS_WRITE_B64_gfx9:
- return ADDR;
+ Result.Addr = true;
+ return Result;
}
}
@@ -486,7 +510,7 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
: 4;
break;
case S_BUFFER_LOAD_IMM:
- EltSize = AMDGPU::getSMRDEncodedOffset(STM, 4);
+ EltSize = AMDGPU::convertSMRDOffsetUnits(STM, 4);
break;
default:
EltSize = 4;
@@ -495,6 +519,8 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
if (InstClass == MIMG) {
DMask = TII.getNamedOperand(*I, AMDGPU::OpName::dmask)->getImm();
+ // Offset is not considered for MIMG instructions.
+ Offset = 0;
} else {
int OffsetIdx = AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::offset);
Offset = I->getOperand(OffsetIdx).getImm();
@@ -515,40 +541,34 @@ void SILoadStoreOptimizer::CombineInfo::setMI(MachineBasicBlock::iterator MI,
DLC = TII.getNamedOperand(*I, AMDGPU::OpName::dlc)->getImm();
}
- unsigned AddrOpName[5] = {0};
- NumAddresses = 0;
- const unsigned Regs = getRegs(I->getOpcode(), TII);
-
- if (Regs & ADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::addr;
- }
+ AddressRegs Regs = getRegs(Opc, TII);
- if (Regs & SBASE) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::sbase;
- }
-
- if (Regs & SRSRC) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::srsrc;
- }
-
- if (Regs & SOFFSET) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::soffset;
- }
-
- if (Regs & VADDR) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::vaddr;
- }
-
- if (Regs & SSAMP) {
- AddrOpName[NumAddresses++] = AMDGPU::OpName::ssamp;
- }
-
- for (unsigned i = 0; i < NumAddresses; i++) {
- AddrIdx[i] = AMDGPU::getNamedOperandIdx(I->getOpcode(), AddrOpName[i]);
- AddrReg[i] = &I->getOperand(AddrIdx[i]);
- }
-
- InstsToMove.clear();
+ NumAddresses = 0;
+ for (unsigned J = 0; J < Regs.NumVAddrs; J++)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr0) + J;
+ if (Regs.Addr)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::addr);
+ if (Regs.SBase)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::sbase);
+ if (Regs.SRsrc)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::srsrc);
+ if (Regs.SOffset)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::soffset);
+ if (Regs.VAddr)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::vaddr);
+ if (Regs.SSamp)
+ AddrIdx[NumAddresses++] =
+ AMDGPU::getNamedOperandIdx(Opc, AMDGPU::OpName::ssamp);
+ assert(NumAddresses <= MaxAddressRegs);
+
+ for (unsigned J = 0; J < NumAddresses; J++)
+ AddrReg[J] = &I->getOperand(AddrIdx[J]);
}
} // end anonymous namespace.
@@ -578,8 +598,8 @@ static void moveInstsAfter(MachineBasicBlock::iterator I,
}
static void addDefsUsesToList(const MachineInstr &MI,
- DenseSet<unsigned> &RegDefs,
- DenseSet<unsigned> &PhysRegUses) {
+ DenseSet<Register> &RegDefs,
+ DenseSet<Register> &PhysRegUses) {
for (const MachineOperand &Op : MI.operands()) {
if (Op.isReg()) {
if (Op.isDef())
@@ -601,8 +621,8 @@ static bool memAccessesCanBeReordered(MachineBasicBlock::iterator A,
// Add MI and its defs to the lists if MI reads one of the defs that are
// already in the list. Returns true in that case.
-static bool addToListsIfDependent(MachineInstr &MI, DenseSet<unsigned> &RegDefs,
- DenseSet<unsigned> &PhysRegUses,
+static bool addToListsIfDependent(MachineInstr &MI, DenseSet<Register> &RegDefs,
+ DenseSet<Register> &PhysRegUses,
SmallVectorImpl<MachineInstr *> &Insts) {
for (MachineOperand &Use : MI.operands()) {
// If one of the defs is read, then there is a use of Def between I and the
@@ -671,7 +691,8 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
// Check other optional immediate operands for equality.
unsigned OperandsToMatch[] = {AMDGPU::OpName::glc, AMDGPU::OpName::slc,
AMDGPU::OpName::d16, AMDGPU::OpName::unorm,
- AMDGPU::OpName::da, AMDGPU::OpName::r128};
+ AMDGPU::OpName::da, AMDGPU::OpName::r128,
+ AMDGPU::OpName::a16, AMDGPU::OpName::dlc};
for (auto op : OperandsToMatch) {
int Idx = AMDGPU::getNamedOperandIdx(CI.I->getOpcode(), op);
@@ -695,7 +716,7 @@ bool SILoadStoreOptimizer::dmasksCanBeCombined(const CombineInfo &CI,
static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
unsigned ComponentCount,
- const MCSubtargetInfo &STI) {
+ const GCNSubtarget &STI) {
if (ComponentCount > 4)
return 0;
@@ -719,8 +740,9 @@ static unsigned getBufferFormatWithCompCount(unsigned OldFormat,
}
bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
- const MCSubtargetInfo &STI,
- CombineInfo &Paired) {
+ const GCNSubtarget &STI,
+ CombineInfo &Paired,
+ bool Modify) {
assert(CI.InstClass != MIMG);
// XXX - Would the same offset be OK? Is there any reason this would happen or
@@ -761,7 +783,7 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
CI.UseST64 = false;
CI.BaseOff = 0;
- // Handle SMEM and VMEM instructions.
+ // Handle DS instructions.
if ((CI.InstClass != DS_READ) && (CI.InstClass != DS_WRITE)) {
return (EltOffset0 + CI.Width == EltOffset1 ||
EltOffset1 + Paired.Width == EltOffset0) &&
@@ -769,20 +791,25 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
(CI.InstClass == S_BUFFER_LOAD_IMM || CI.SLC == Paired.SLC);
}
+ // Handle SMEM and VMEM instructions.
// If the offset in elements doesn't fit in 8-bits, we might be able to use
// the stride 64 versions.
if ((EltOffset0 % 64 == 0) && (EltOffset1 % 64) == 0 &&
isUInt<8>(EltOffset0 / 64) && isUInt<8>(EltOffset1 / 64)) {
- CI.Offset = EltOffset0 / 64;
- Paired.Offset = EltOffset1 / 64;
- CI.UseST64 = true;
+ if (Modify) {
+ CI.Offset = EltOffset0 / 64;
+ Paired.Offset = EltOffset1 / 64;
+ CI.UseST64 = true;
+ }
return true;
}
// Check if the new offsets fit in the reduced 8-bit range.
if (isUInt<8>(EltOffset0) && isUInt<8>(EltOffset1)) {
- CI.Offset = EltOffset0;
- Paired.Offset = EltOffset1;
+ if (Modify) {
+ CI.Offset = EltOffset0;
+ Paired.Offset = EltOffset1;
+ }
return true;
}
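
As a worked example of the offset checks above for a DS_READ_B32 pair (EltSize = 4): hypothetical byte offsets 0 and 1024 give element offsets 0 and 256; 256 does not fit the 8-bit read2 field, but both are multiples of 64 and 0/64 = 0, 256/64 = 4 do fit, so the stride-64 form can be used. A minimal sketch of the two checks, taking element offsets as inputs:

#include <cstdint>

bool fitsRead2(uint32_t EltOffset0, uint32_t EltOffset1) {
  return EltOffset0 <= 255 && EltOffset1 <= 255;          // isUInt<8> on each
}

bool fitsRead2ST64(uint32_t EltOffset0, uint32_t EltOffset1) {
  return EltOffset0 % 64 == 0 && EltOffset1 % 64 == 0 &&
         EltOffset0 / 64 <= 255 && EltOffset1 / 64 <= 255;
}
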
@@ -791,15 +818,19 @@ bool SILoadStoreOptimizer::offsetsCanBeCombined(CombineInfo &CI,
CI.BaseOff = std::min(CI.Offset, Paired.Offset);
if ((OffsetDiff % 64 == 0) && isUInt<8>(OffsetDiff / 64)) {
- CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
- Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
- CI.UseST64 = true;
+ if (Modify) {
+ CI.Offset = (EltOffset0 - CI.BaseOff / CI.EltSize) / 64;
+ Paired.Offset = (EltOffset1 - CI.BaseOff / CI.EltSize) / 64;
+ CI.UseST64 = true;
+ }
return true;
}
if (isUInt<8>(OffsetDiff)) {
- CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
- Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ if (Modify) {
+ CI.Offset = EltOffset0 - CI.BaseOff / CI.EltSize;
+ Paired.Offset = EltOffset1 - CI.BaseOff / CI.EltSize;
+ }
return true;
}
@@ -824,11 +855,19 @@ bool SILoadStoreOptimizer::widthsFit(const GCNSubtarget &STM,
}
}
-bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
- CombineInfo &Paired) {
- MachineBasicBlock *MBB = CI.I->getParent();
- MachineBasicBlock::iterator E = MBB->end();
- MachineBasicBlock::iterator MBBI = CI.I;
+/// This function assumes that CI comes before Paired in a basic block.
+bool SILoadStoreOptimizer::checkAndPrepareMerge(
+ CombineInfo &CI, CombineInfo &Paired,
+ SmallVectorImpl<MachineInstr *> &InstsToMove) {
+
+ // Check both offsets (or masks for MIMG) can be combined and fit in the
+ // reduced range.
+ if (CI.InstClass == MIMG && !dmasksCanBeCombined(CI, *TII, Paired))
+ return false;
+
+ if (CI.InstClass != MIMG &&
+ (!widthsFit(*STM, CI, Paired) || !offsetsCanBeCombined(CI, *STM, Paired)))
+ return false;
const unsigned Opc = CI.I->getOpcode();
const InstClassEnum InstClass = getInstClass(Opc, *TII);
@@ -844,14 +883,25 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
if (Swizzled != -1 && CI.I->getOperand(Swizzled).getImm())
return false;
- ++MBBI;
-
- DenseSet<unsigned> RegDefsToMove;
- DenseSet<unsigned> PhysRegUsesToMove;
+ DenseSet<Register> RegDefsToMove;
+ DenseSet<Register> PhysRegUsesToMove;
addDefsUsesToList(*CI.I, RegDefsToMove, PhysRegUsesToMove);
+ MachineBasicBlock::iterator E = std::next(Paired.I);
+ MachineBasicBlock::iterator MBBI = std::next(CI.I);
+ MachineBasicBlock::iterator MBBE = CI.I->getParent()->end();
for (; MBBI != E; ++MBBI) {
+ if (MBBI == MBBE) {
+ // CombineInfo::Order is a hint on the instruction ordering within the
+ // basic block. This hint suggests that CI precedes Paired, which is
+ // true most of the time. However, moveInstsAfter() processing a
+ // previous list may have changed this order when it moved an
+ // instruction that also exists in some other merge list.
+ // In this case it must be dependent.
+ return false;
+ }
+
if ((getInstClass(MBBI->getOpcode(), *TII) != InstClass) ||
(getInstSubclass(MBBI->getOpcode(), *TII) != InstSubclass)) {
// This is not a matching instruction, but we can keep looking as
@@ -868,11 +918,11 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
if (MBBI->mayLoadOrStore() &&
(!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))) {
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))) {
// We fail condition #1, but we may still be able to satisfy condition
// #2. Add this instruction to the move list and then we will check
// if condition #2 holds once we have selected the matching instruction.
- CI.InstsToMove.push_back(&*MBBI);
+ InstsToMove.push_back(&*MBBI);
addDefsUsesToList(*MBBI, RegDefsToMove, PhysRegUsesToMove);
continue;
}
@@ -881,7 +931,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
// to the location of the matched instruction any uses of I will need to
// be moved down as well.
addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- CI.InstsToMove);
+ InstsToMove);
continue;
}
@@ -901,26 +951,24 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
// where the DS_READ_B32 ends up in InstsToMove and therefore prevents
// merging of the two writes.
if (addToListsIfDependent(*MBBI, RegDefsToMove, PhysRegUsesToMove,
- CI.InstsToMove))
+ InstsToMove))
continue;
- bool Match = CI.hasSameBaseAddress(*MBBI);
-
- if (Match) {
- Paired.setMI(MBBI, *TII, *STM);
-
- // Check both offsets (or masks for MIMG) can be combined and fit in the
- // reduced range.
- bool canBeCombined =
- CI.InstClass == MIMG
- ? dmasksCanBeCombined(CI, *TII, Paired)
- : widthsFit(*STM, CI, Paired) && offsetsCanBeCombined(CI, *STI, Paired);
-
- // We also need to go through the list of instructions that we plan to
+ if (&*MBBI == &*Paired.I) {
+ // We need to go through the list of instructions that we plan to
// move and make sure they are all safe to move down past the merged
// instruction.
- if (canBeCombined && canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+ if (canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA)) {
+
+ // Call offsetsCanBeCombined with modify = true so that the offsets are
+ // correct for the new instruction. This should return true, because
+ // this function should only be called on CombineInfo objects that
+ // have already been confirmed to be mergeable.
+ if (CI.InstClass != MIMG)
+ offsetsCanBeCombined(CI, *STM, Paired, true);
return true;
+ }
+ return false;
}
// We've found a load/store that we couldn't merge for some reason.
@@ -929,7 +977,7 @@ bool SILoadStoreOptimizer::findMatchingInst(CombineInfo &CI,
// down past this instruction.
// check if we can move I across MBBI and if we can move all I's users
if (!memAccessesCanBeReordered(*CI.I, *MBBI, AA) ||
- !canMoveInstsAcrossMemOp(*MBBI, CI.InstsToMove, AA))
+ !canMoveInstsAcrossMemOp(*MBBI, InstsToMove, AA))
break;
}
return false;
@@ -950,7 +998,8 @@ unsigned SILoadStoreOptimizer::read2ST64Opcode(unsigned EltSize) const {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be careful, since the addresses could be subregisters themselves in weird
@@ -1023,7 +1072,7 @@ SILoadStoreOptimizer::mergeRead2Pair(CombineInfo &CI, CombineInfo &Paired) {
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1049,7 +1098,8 @@ unsigned SILoadStoreOptimizer::write2ST64Opcode(unsigned EltSize) const {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
// Be sure to use .addOperand(), and not .addReg() with these. We want to be
@@ -1106,7 +1156,7 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
.addImm(0) // gds
.cloneMergedMemRefs({&*CI.I, &*Paired.I});
- moveInstsAfter(Write2, CI.InstsToMove);
+ moveInstsAfter(Write2, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1116,7 +1166,8 @@ SILoadStoreOptimizer::mergeWrite2Pair(CombineInfo &CI, CombineInfo &Paired) {
}
MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
+SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1161,15 +1212,16 @@ SILoadStoreOptimizer::mergeImagePair(CombineInfo &CI, CombineInfo &Paired) {
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeSBufferLoadImmPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
const unsigned Opcode = getNewOpcode(CI, Paired);
@@ -1211,15 +1263,16 @@ SILoadStoreOptimizer::mergeSBufferLoadImmPair(CombineInfo &CI, CombineInfo &Pair
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferLoadPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1233,9 +1286,9 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
// It shouldn't be possible to get this far if the two instructions
@@ -1273,15 +1326,16 @@ SILoadStoreOptimizer::mergeBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferLoadPair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1295,13 +1349,13 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode), DestReg);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
unsigned JoinedFormat =
- getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);
+ getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
@@ -1340,15 +1394,16 @@ SILoadStoreOptimizer::mergeTBufferLoadPair(CombineInfo &CI, CombineInfo &Paired)
.add(*Dest1)
.addReg(DestReg, RegState::Kill, SubRegIdx1);
- moveInstsAfter(Copy1, CI.InstsToMove);
+ moveInstsAfter(Copy1, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
return New;
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeTBufferStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1374,13 +1429,13 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
unsigned JoinedFormat =
- getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STI);
+ getBufferFormatWithCompCount(CI.Format, CI.Width + Paired.Width, *STM);
// It shouldn't be possible to get this far if the two instructions
// don't have a single memoperand, because MachineInstr::mayAlias()
@@ -1403,7 +1458,7 @@ SILoadStoreOptimizer::mergeTBufferStorePair(CombineInfo &CI, CombineInfo &Paired
.addMemOperand(
combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- moveInstsAfter(MIB, CI.InstsToMove);
+ moveInstsAfter(MIB, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1491,9 +1546,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
case 4:
return &AMDGPU::SGPR_128RegClass;
case 8:
- return &AMDGPU::SReg_256RegClass;
+ return &AMDGPU::SGPR_256RegClass;
case 16:
- return &AMDGPU::SReg_512RegClass;
+ return &AMDGPU::SGPR_512RegClass;
}
} else {
switch (CI.Width + Paired.Width) {
@@ -1509,8 +1564,9 @@ SILoadStoreOptimizer::getTargetRegisterClass(const CombineInfo &CI,
}
}
-MachineBasicBlock::iterator
-SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired) {
+MachineBasicBlock::iterator SILoadStoreOptimizer::mergeBufferStorePair(
+ CombineInfo &CI, CombineInfo &Paired,
+ const SmallVectorImpl<MachineInstr *> &InstsToMove) {
MachineBasicBlock *MBB = CI.I->getParent();
DebugLoc DL = CI.I->getDebugLoc();
@@ -1536,9 +1592,9 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired)
auto MIB = BuildMI(*MBB, Paired.I, DL, TII->get(Opcode))
.addReg(SrcReg, RegState::Kill);
- const unsigned Regs = getRegs(Opcode, *TII);
+ AddressRegs Regs = getRegs(Opcode, *TII);
- if (Regs & VADDR)
+ if (Regs.VAddr)
MIB.add(*TII->getNamedOperand(*CI.I, AMDGPU::OpName::vaddr));
@@ -1561,7 +1617,7 @@ SILoadStoreOptimizer::mergeBufferStorePair(CombineInfo &CI, CombineInfo &Paired)
.addImm(0) // swz
.addMemOperand(combineKnownAdjacentMMOs(*MBB->getParent(), MMOa, MMOb));
- moveInstsAfter(MIB, CI.InstsToMove);
+ moveInstsAfter(MIB, InstsToMove);
CI.I->eraseFromParent();
Paired.I->eraseFromParent();
@@ -1585,7 +1641,7 @@ SILoadStoreOptimizer::createRegOrImm(int32_t Val, MachineInstr &MI) const {
}
// Compute base address using Addr and return the final register.
-unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
+Register SILoadStoreOptimizer::computeBase(MachineInstr &MI,
const MemAddress &Addr) const {
MachineBasicBlock *MBB = MI.getParent();
MachineBasicBlock::iterator MBBI = MI.getIterator();
@@ -1644,7 +1700,7 @@ unsigned SILoadStoreOptimizer::computeBase(MachineInstr &MI,
// Update base and offset with the NewBase and NewOffset in MI.
void SILoadStoreOptimizer::updateBaseAndOffset(MachineInstr &MI,
- unsigned NewBase,
+ Register NewBase,
int32_t NewOffset) const {
auto Base = TII->getNamedOperand(MI, AMDGPU::OpName::vaddr);
Base->setReg(NewBase);
@@ -1856,7 +1912,7 @@ bool SILoadStoreOptimizer::promoteConstantOffsetToImm(
<< AnchorAddr.Offset << "\n\n");
// Instead of moving up, just re-compute anchor-instruction's base address.
- unsigned Base = computeBase(MI, AnchorAddr);
+ Register Base = computeBase(MI, AnchorAddr);
updateBaseAndOffset(MI, Base, MAddr.Offset - AnchorAddr.Offset);
LLVM_DEBUG(dbgs() << " After promotion: "; MI.dump(););
@@ -1894,39 +1950,80 @@ void SILoadStoreOptimizer::addInstToMergeableList(const CombineInfo &CI,
MergeableInsts.emplace_back(1, CI);
}
-bool SILoadStoreOptimizer::collectMergeableInsts(MachineBasicBlock &MBB,
- std::list<std::list<CombineInfo> > &MergeableInsts) const {
+std::pair<MachineBasicBlock::iterator, bool>
+SILoadStoreOptimizer::collectMergeableInsts(
+ MachineBasicBlock::iterator Begin, MachineBasicBlock::iterator End,
+ MemInfoMap &Visited, SmallPtrSet<MachineInstr *, 4> &AnchorList,
+ std::list<std::list<CombineInfo>> &MergeableInsts) const {
bool Modified = false;
- // Contain the list
- MemInfoMap Visited;
- // Contains the list of instructions for which constant offsets are being
- // promoted to the IMM.
- SmallPtrSet<MachineInstr *, 4> AnchorList;
// Sort potential mergeable instructions into lists. One list per base address.
- for (MachineInstr &MI : MBB.instrs()) {
+ unsigned Order = 0;
+ MachineBasicBlock::iterator BlockI = Begin;
+ for (; BlockI != End; ++BlockI) {
+ MachineInstr &MI = *BlockI;
+
// We run this before checking if an address is mergeable, because it can produce
// better code even if the instructions aren't mergeable.
if (promoteConstantOffsetToImm(MI, Visited, AnchorList))
Modified = true;
+    // Don't combine if volatile. We also won't be able to merge across this,
+    // so break the search. We can still look for merges after this barrier,
+    // in a separate list.
+ if (MI.hasOrderedMemoryRef()) {
+ LLVM_DEBUG(dbgs() << "Breaking search on memory fence: " << MI);
+
+ // Search will resume after this instruction in a separate merge list.
+ ++BlockI;
+ break;
+ }
+
const InstClassEnum InstClass = getInstClass(MI.getOpcode(), *TII);
if (InstClass == UNKNOWN)
continue;
- // Don't combine if volatile.
- if (MI.hasOrderedMemoryRef())
- continue;
-
CombineInfo CI;
CI.setMI(MI, *TII, *STM);
+ CI.Order = Order++;
if (!CI.hasMergeableAddress(*MRI))
continue;
+ LLVM_DEBUG(dbgs() << "Mergeable: " << MI);
+
addInstToMergeableList(CI, MergeableInsts);
}
- return Modified;
+
+ // At this point we have lists of Mergeable instructions.
+ //
+ // Part 2: Sort lists by offset and then for each CombineInfo object in the
+ // list try to find an instruction that can be merged with I. If an instruction
+ // is found, it is stored in the Paired field. If no instructions are found, then
+ // the CombineInfo object is deleted from the list.
+
+ for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+ E = MergeableInsts.end(); I != E;) {
+
+ std::list<CombineInfo> &MergeList = *I;
+ if (MergeList.size() <= 1) {
+ // This means we have found only one instruction with a given address
+ // that can be merged, and we need at least 2 instructions to do a merge,
+ // so this list can be discarded.
+ I = MergeableInsts.erase(I);
+ continue;
+ }
+
+    // Sort the lists by offset so that mergeable instructions are adjacent to
+    // each other in the list, which makes it easier to find matches.
+ MergeList.sort(
+ [] (const CombineInfo &A, CombineInfo &B) {
+ return A.Offset < B.Offset;
+ });
+ ++I;
+ }
+
+ return std::make_pair(BlockI, Modified);
}
// Scan through looking for adjacent LDS operations with constant offsets from
@@ -1936,117 +2033,126 @@ bool SILoadStoreOptimizer::optimizeBlock(
std::list<std::list<CombineInfo> > &MergeableInsts) {
bool Modified = false;
- for (std::list<CombineInfo> &MergeList : MergeableInsts) {
- if (MergeList.size() < 2)
- continue;
+ for (std::list<std::list<CombineInfo>>::iterator I = MergeableInsts.begin(),
+ E = MergeableInsts.end(); I != E;) {
+ std::list<CombineInfo> &MergeList = *I;
bool OptimizeListAgain = false;
if (!optimizeInstsWithSameBaseAddr(MergeList, OptimizeListAgain)) {
- // We weren't able to make any changes, so clear the list so we don't
+ // We weren't able to make any changes, so delete the list so we don't
// process the same instructions the next time we try to optimize this
// block.
- MergeList.clear();
+ I = MergeableInsts.erase(I);
continue;
}
- // We made changes, but also determined that there were no more optimization
- // opportunities, so we don't need to reprocess the list
- if (!OptimizeListAgain)
- MergeList.clear();
-
- OptimizeAgain |= OptimizeListAgain;
Modified = true;
- }
- return Modified;
-}
-void
-SILoadStoreOptimizer::removeCombinedInst(std::list<CombineInfo> &MergeList,
- const MachineInstr &MI) {
-
- for (auto CI = MergeList.begin(), E = MergeList.end(); CI != E; ++CI) {
- if (&*CI->I == &MI) {
- MergeList.erase(CI);
- return;
+ // We made changes, but also determined that there were no more optimization
+ // opportunities, so we don't need to reprocess the list
+ if (!OptimizeListAgain) {
+ I = MergeableInsts.erase(I);
+ continue;
}
+ OptimizeAgain = true;
}
+ return Modified;
}
bool
SILoadStoreOptimizer::optimizeInstsWithSameBaseAddr(
std::list<CombineInfo> &MergeList,
bool &OptimizeListAgain) {
+ if (MergeList.empty())
+ return false;
+
bool Modified = false;
- for (auto I = MergeList.begin(); I != MergeList.end(); ++I) {
- CombineInfo &CI = *I;
- CombineInfo Paired;
- if (CI.InstClass == UNKNOWN)
- continue;
+ for (auto I = MergeList.begin(), Next = std::next(I); Next != MergeList.end();
+ Next = std::next(I)) {
+
+ auto First = I;
+ auto Second = Next;
+
+ if ((*First).Order > (*Second).Order)
+ std::swap(First, Second);
+ CombineInfo &CI = *First;
+ CombineInfo &Paired = *Second;
- if (!findMatchingInst(CI, Paired))
- goto done;
+ SmallVector<MachineInstr *, 8> InstsToMove;
+ if (!checkAndPrepareMerge(CI, Paired, InstsToMove)) {
+ ++I;
+ continue;
+ }
Modified = true;
- removeCombinedInst(MergeList, *Paired.I);
+
+ LLVM_DEBUG(dbgs() << "Merging: " << *CI.I << " with: " << *Paired.I);
switch (CI.InstClass) {
default:
llvm_unreachable("unknown InstClass");
break;
case DS_READ: {
- MachineBasicBlock::iterator NewMI = mergeRead2Pair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeRead2Pair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
break;
}
case DS_WRITE: {
- MachineBasicBlock::iterator NewMI = mergeWrite2Pair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeWrite2Pair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
break;
}
case S_BUFFER_LOAD_IMM: {
- MachineBasicBlock::iterator NewMI = mergeSBufferLoadImmPair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeSBufferLoadImmPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 16;
break;
}
case BUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI = mergeBufferLoadPair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeBufferLoadPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case BUFFER_STORE: {
- MachineBasicBlock::iterator NewMI = mergeBufferStorePair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeBufferStorePair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case MIMG: {
- MachineBasicBlock::iterator NewMI = mergeImagePair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeImagePair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_LOAD: {
- MachineBasicBlock::iterator NewMI = mergeTBufferLoadPair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeTBufferLoadPair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
case TBUFFER_STORE: {
- MachineBasicBlock::iterator NewMI = mergeTBufferStorePair(CI, Paired);
+ MachineBasicBlock::iterator NewMI =
+ mergeTBufferStorePair(CI, Paired, InstsToMove);
CI.setMI(NewMI, *TII, *STM);
OptimizeListAgain |= (CI.Width + Paired.Width) < 4;
break;
}
}
+ CI.Order = Paired.Order;
+ if (I == Second)
+ I = Next;
-done:
- // Clear the InstsToMove after we have finished searching so we don't have
- // stale values left over if we search for this CI again in another pass
- // over the block.
- CI.InstsToMove.clear();
+ MergeList.erase(Second);
}
return Modified;
@@ -2062,26 +2168,41 @@ bool SILoadStoreOptimizer::runOnMachineFunction(MachineFunction &MF) {
TII = STM->getInstrInfo();
TRI = &TII->getRegisterInfo();
- STI = &MF.getSubtarget<MCSubtargetInfo>();
MRI = &MF.getRegInfo();
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- assert(MRI->isSSA() && "Must be run on SSA");
-
LLVM_DEBUG(dbgs() << "Running SILoadStoreOptimizer\n");
bool Modified = false;
+ // Contains the list of instructions for which constant offsets are being
+  // promoted to the IMM. This is tracked for an entire block at a time.
+ SmallPtrSet<MachineInstr *, 4> AnchorList;
+ MemInfoMap Visited;
for (MachineBasicBlock &MBB : MF) {
- std::list<std::list<CombineInfo> > MergeableInsts;
- // First pass: Collect list of all instructions we know how to merge.
- Modified |= collectMergeableInsts(MBB, MergeableInsts);
- do {
- OptimizeAgain = false;
- Modified |= optimizeBlock(MergeableInsts);
- } while (OptimizeAgain);
+ MachineBasicBlock::iterator SectionEnd;
+ for (MachineBasicBlock::iterator I = MBB.begin(), E = MBB.end(); I != E;
+ I = SectionEnd) {
+ bool CollectModified;
+ std::list<std::list<CombineInfo>> MergeableInsts;
+
+ // First pass: Collect list of all instructions we know how to merge in a
+ // subset of the block.
+ std::tie(SectionEnd, CollectModified) =
+ collectMergeableInsts(I, E, Visited, AnchorList, MergeableInsts);
+
+ Modified |= CollectModified;
+
+ do {
+ OptimizeAgain = false;
+ Modified |= optimizeBlock(MergeableInsts);
+ } while (OptimizeAgain);
+ }
+
+ Visited.clear();
+ AnchorList.clear();
}
return Modified;
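
The SILoadStoreOptimizer changes above split each basic block into sections that end at memory fences: collectMergeableInsts now reports both where the scan stopped and whether it changed anything, and the driver loop resumes from that point. A rough, self-contained model of that control flow (plain standard C++; collectSection and the integer "instructions" are invented stand-ins, not the LLVM types):

#include <iostream>
#include <tuple>
#include <utility>
#include <vector>

using Iter = std::vector<int>::const_iterator;

// Collect one section of merge candidates: scan until End, or until just past
// a "fence" (modelled as a negative value), and report where to resume.
static std::pair<Iter, bool> collectSection(Iter Begin, Iter End,
                                            std::vector<int> &Mergeable) {
  bool Modified = false;
  Iter I = Begin;
  for (; I != End; ++I) {
    if (*I < 0) { // fence: end this section, resume after it next time
      ++I;
      break;
    }
    Mergeable.push_back(*I); // stand-in for addInstToMergeableList
    Modified = true;         // stand-in for "promoted a constant offset"
  }
  return {I, Modified};
}

int main() {
  std::vector<int> Block = {4, 8, -1, 16, 32, 48};
  for (Iter I = Block.begin(), E = Block.end(); I != E;) {
    std::vector<int> Mergeable;
    Iter SectionEnd;
    bool Changed;
    std::tie(SectionEnd, Changed) = collectSection(I, E, Mergeable);
    std::cout << "section with " << Mergeable.size() << " candidates"
              << (Changed ? " (modified)" : "") << "\n";
    I = SectionEnd; // the next section starts after the fence
  }
}
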
diff --git a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
index 61d2719a3aad..36d52ac3ee89 100644
--- a/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerControlFlow.cpp
@@ -38,8 +38,8 @@
/// %vgpr0 = V_ADD_F32 %vgpr0, %vgpr0 // Do the IF block of the branch
///
/// label0:
-/// %sgpr0 = S_OR_SAVEEXEC_B64 %exec // Restore the exec mask for the Then block
-/// %exec = S_XOR_B64 %sgpr0, %exec // Clear live bits from saved exec mask
+/// %sgpr0 = S_OR_SAVEEXEC_B64 %sgpr0 // Restore the exec mask for the Then block
+/// %exec = S_XOR_B64 %sgpr0, %exec // Update the exec mask
/// S_BRANCH_EXECZ label1 // Use our branch optimization
/// // instruction again.
/// %vgpr0 = V_SUB_F32 %vgpr0, %vgpr // Do the THEN block
@@ -51,6 +51,8 @@
#include "AMDGPUSubtarget.h"
#include "SIInstrInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/CodeGen/LiveIntervals.h"
@@ -73,6 +75,10 @@ using namespace llvm;
#define DEBUG_TYPE "si-lower-control-flow"
+static cl::opt<bool>
+RemoveRedundantEndcf("amdgpu-remove-redundant-endcf",
+ cl::init(true), cl::ReallyHidden);
+
namespace {
class SILowerControlFlow : public MachineFunctionPass {
@@ -81,8 +87,12 @@ private:
const SIInstrInfo *TII = nullptr;
LiveIntervals *LIS = nullptr;
MachineRegisterInfo *MRI = nullptr;
+ SetVector<MachineInstr*> LoweredEndCf;
+ DenseSet<Register> LoweredIf;
+ SmallSet<MachineInstr *, 16> NeedsKillCleanup;
const TargetRegisterClass *BoolRC = nullptr;
+ bool InsertKillCleanups;
unsigned AndOpc;
unsigned OrOpc;
unsigned XorOpc;
@@ -98,13 +108,23 @@ private:
void emitLoop(MachineInstr &MI);
void emitEndCf(MachineInstr &MI);
- Register getSaveExec(MachineInstr* MI);
-
void findMaskOperands(MachineInstr &MI, unsigned OpNo,
SmallVectorImpl<MachineOperand> &Src) const;
void combineMasks(MachineInstr &MI);
+ void process(MachineInstr &MI);
+
+ // Skip to the next instruction, ignoring debug instructions, and trivial
+ // block boundaries (blocks that have one (typically fallthrough) successor,
+  // and the successor has one predecessor).
+ MachineBasicBlock::iterator
+ skipIgnoreExecInstsTrivialSucc(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator It) const;
+
+ // Remove redundant SI_END_CF instructions.
+ void optimizeEndCf();
+
public:
static char ID;
@@ -144,62 +164,44 @@ static void setImpSCCDefDead(MachineInstr &MI, bool IsDead) {
char &llvm::SILowerControlFlowID = SILowerControlFlow::ID;
-static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI,
- const SIInstrInfo *TII) {
- Register SaveExecReg = MI.getOperand(0).getReg();
- auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
-
- if (U == MRI->use_instr_nodbg_end() ||
- std::next(U) != MRI->use_instr_nodbg_end() ||
- U->getOpcode() != AMDGPU::SI_END_CF)
- return false;
-
- // Check for SI_KILL_*_TERMINATOR on path from if to endif.
- // if there is any such terminator simplififcations are not safe.
- auto SMBB = MI.getParent();
- auto EMBB = U->getParent();
+static bool hasKill(const MachineBasicBlock *Begin,
+ const MachineBasicBlock *End, const SIInstrInfo *TII) {
DenseSet<const MachineBasicBlock*> Visited;
- SmallVector<MachineBasicBlock*, 4> Worklist(SMBB->succ_begin(),
- SMBB->succ_end());
+ SmallVector<MachineBasicBlock *, 4> Worklist(Begin->succ_begin(),
+ Begin->succ_end());
while (!Worklist.empty()) {
MachineBasicBlock *MBB = Worklist.pop_back_val();
- if (MBB == EMBB || !Visited.insert(MBB).second)
+ if (MBB == End || !Visited.insert(MBB).second)
continue;
- for(auto &Term : MBB->terminators())
+ for (auto &Term : MBB->terminators())
if (TII->isKillTerminator(Term.getOpcode()))
- return false;
+ return true;
Worklist.append(MBB->succ_begin(), MBB->succ_end());
}
- return true;
+ return false;
}
-Register SILowerControlFlow::getSaveExec(MachineInstr *MI) {
- MachineBasicBlock *MBB = MI->getParent();
- MachineOperand &SaveExec = MI->getOperand(0);
- assert(SaveExec.getSubReg() == AMDGPU::NoSubRegister);
-
- Register SaveExecReg = SaveExec.getReg();
- unsigned FalseTermOpc =
- TII->isWave32() ? AMDGPU::S_MOV_B32_term : AMDGPU::S_MOV_B64_term;
- MachineBasicBlock::iterator I = (MI);
- MachineBasicBlock::iterator J = std::next(I);
- if (J != MBB->end() && J->getOpcode() == FalseTermOpc &&
- J->getOperand(1).isReg() && J->getOperand(1).getReg() == SaveExecReg) {
- SaveExecReg = J->getOperand(0).getReg();
- J->eraseFromParent();
- }
- return SaveExecReg;
+static bool isSimpleIf(const MachineInstr &MI, const MachineRegisterInfo *MRI) {
+ Register SaveExecReg = MI.getOperand(0).getReg();
+ auto U = MRI->use_instr_nodbg_begin(SaveExecReg);
+
+ if (U == MRI->use_instr_nodbg_end() ||
+ std::next(U) != MRI->use_instr_nodbg_end() ||
+ U->getOpcode() != AMDGPU::SI_END_CF)
+ return false;
+
+ return true;
}
void SILowerControlFlow::emitIf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
MachineBasicBlock::iterator I(&MI);
- Register SaveExecReg = getSaveExec(&MI);
+ Register SaveExecReg = MI.getOperand(0).getReg();
MachineOperand& Cond = MI.getOperand(1);
assert(Cond.getSubReg() == AMDGPU::NoSubRegister);
@@ -209,7 +211,35 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
// If there is only one use of save exec register and that use is SI_END_CF,
// we can optimize SI_IF by returning the full saved exec mask instead of
// just cleared bits.
- bool SimpleIf = isSimpleIf(MI, MRI, TII);
+ bool SimpleIf = isSimpleIf(MI, MRI);
+
+ if (InsertKillCleanups) {
+ // Check for SI_KILL_*_TERMINATOR on full path of control flow and
+ // flag the associated SI_END_CF for insertion of a kill cleanup.
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+ while (UseMI->getOpcode() != AMDGPU::SI_END_CF) {
+ assert(std::next(UseMI) == MRI->use_instr_nodbg_end());
+ assert(UseMI->getOpcode() == AMDGPU::SI_ELSE);
+ MachineOperand &NextExec = UseMI->getOperand(0);
+ Register NextExecReg = NextExec.getReg();
+ if (NextExec.isDead()) {
+ assert(!SimpleIf);
+ break;
+ }
+ UseMI = MRI->use_instr_nodbg_begin(NextExecReg);
+ }
+ if (UseMI->getOpcode() == AMDGPU::SI_END_CF) {
+ if (hasKill(MI.getParent(), UseMI->getParent(), TII)) {
+ NeedsKillCleanup.insert(&*UseMI);
+ SimpleIf = false;
+ }
+ }
+ } else if (SimpleIf) {
+ // Check for SI_KILL_*_TERMINATOR on path from if to endif.
+    // If there is any such terminator, simplifications are not safe.
+ auto UseMI = MRI->use_instr_nodbg_begin(SaveExecReg);
+ SimpleIf = !hasKill(MI.getParent(), UseMI->getParent(), TII);
+ }
// Add an implicit def of exec to discourage scheduling VALU after this which
// will interfere with trying to form s_and_saveexec_b64 later.
@@ -219,6 +249,7 @@ void SILowerControlFlow::emitIf(MachineInstr &MI) {
BuildMI(MBB, I, DL, TII->get(AMDGPU::COPY), CopyReg)
.addReg(Exec)
.addReg(Exec, RegState::ImplicitDefine);
+ LoweredIf.insert(CopyReg);
Register Tmp = MRI->createVirtualRegister(BoolRC);
@@ -282,7 +313,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- Register DstReg = getSaveExec(&MI);
+ Register DstReg = MI.getOperand(0).getReg();
bool ExecModified = MI.getOperand(3).getImm() != 0;
MachineBasicBlock::iterator Start = MBB.begin();
@@ -354,7 +385,7 @@ void SILowerControlFlow::emitElse(MachineInstr &MI) {
void SILowerControlFlow::emitIfBreak(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
const DebugLoc &DL = MI.getDebugLoc();
- auto Dst = getSaveExec(&MI);
+ auto Dst = MI.getOperand(0).getReg();
// Skip ANDing with exec if the break condition is already masked by exec
// because it is a V_CMP in the same basic block. (We know the break
@@ -416,6 +447,38 @@ void SILowerControlFlow::emitLoop(MachineInstr &MI) {
MI.eraseFromParent();
}
+MachineBasicBlock::iterator
+SILowerControlFlow::skipIgnoreExecInstsTrivialSucc(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator It) const {
+
+ SmallSet<const MachineBasicBlock *, 4> Visited;
+ MachineBasicBlock *B = &MBB;
+ do {
+ if (!Visited.insert(B).second)
+ return MBB.end();
+
+ auto E = B->end();
+ for ( ; It != E; ++It) {
+ if (It->getOpcode() == AMDGPU::SI_KILL_CLEANUP)
+ continue;
+ if (TII->mayReadEXEC(*MRI, *It))
+ break;
+ }
+
+ if (It != E)
+ return It;
+
+ if (B->succ_size() != 1)
+ return MBB.end();
+
+ // If there is one trivial successor, advance to the next block.
+ MachineBasicBlock *Succ = *B->succ_begin();
+
+ It = Succ->begin();
+ B = Succ;
+ } while (true);
+}
+
void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
@@ -430,8 +493,20 @@ void SILowerControlFlow::emitEndCf(MachineInstr &MI) {
.addReg(Exec)
.add(MI.getOperand(0));
- if (LIS)
+ LoweredEndCf.insert(NewMI);
+
+ // If this ends control flow which contains kills (as flagged in emitIf)
+ // then insert an SI_KILL_CLEANUP immediately following the exec mask
+ // manipulation. This can be lowered to early termination if appropriate.
+ MachineInstr *CleanUpMI = nullptr;
+ if (NeedsKillCleanup.count(&MI))
+ CleanUpMI = BuildMI(MBB, InsPt, DL, TII->get(AMDGPU::SI_KILL_CLEANUP));
+
+ if (LIS) {
LIS->ReplaceMachineInstrInMaps(MI, *NewMI);
+ if (CleanUpMI)
+ LIS->InsertMachineInstrInMaps(*CleanUpMI);
+ }
MI.eraseFromParent();
@@ -494,6 +569,84 @@ void SILowerControlFlow::combineMasks(MachineInstr &MI) {
MRI->getUniqueVRegDef(Reg)->eraseFromParent();
}
+void SILowerControlFlow::optimizeEndCf() {
+  // If the only instruction immediately following this END_CF is another
+  // END_CF in the only successor, we can avoid emitting the exec mask restore
+  // here.
+ if (!RemoveRedundantEndcf)
+ return;
+
+ for (MachineInstr *MI : LoweredEndCf) {
+ MachineBasicBlock &MBB = *MI->getParent();
+ auto Next =
+ skipIgnoreExecInstsTrivialSucc(MBB, std::next(MI->getIterator()));
+ if (Next == MBB.end() || !LoweredEndCf.count(&*Next))
+ continue;
+    // Only skip the inner END_CF if the outer END_CF belongs to an SI_IF.
+    // If it belongs to an SI_ELSE then the saved mask has an inverted value.
+ Register SavedExec
+ = TII->getNamedOperand(*Next, AMDGPU::OpName::src1)->getReg();
+ assert(SavedExec.isVirtual() && "Expected saved exec to be src1!");
+
+ const MachineInstr *Def = MRI->getUniqueVRegDef(SavedExec);
+ if (Def && LoweredIf.count(SavedExec)) {
+ LLVM_DEBUG(dbgs() << "Skip redundant "; MI->dump());
+ if (LIS)
+ LIS->RemoveMachineInstrFromMaps(*MI);
+ MI->eraseFromParent();
+ }
+ }
+}
+
+void SILowerControlFlow::process(MachineInstr &MI) {
+ MachineBasicBlock &MBB = *MI.getParent();
+ MachineBasicBlock::iterator I(MI);
+ MachineInstr *Prev = (I != MBB.begin()) ? &*(std::prev(I)) : nullptr;
+
+ switch (MI.getOpcode()) {
+ case AMDGPU::SI_IF:
+ emitIf(MI);
+ break;
+
+ case AMDGPU::SI_ELSE:
+ emitElse(MI);
+ break;
+
+ case AMDGPU::SI_IF_BREAK:
+ emitIfBreak(MI);
+ break;
+
+ case AMDGPU::SI_LOOP:
+ emitLoop(MI);
+ break;
+
+ case AMDGPU::SI_END_CF:
+ emitEndCf(MI);
+ break;
+
+ default:
+ assert(false && "Attempt to process unsupported instruction");
+ break;
+ }
+
+ MachineBasicBlock::iterator Next;
+ for (I = Prev ? Prev->getIterator() : MBB.begin(); I != MBB.end(); I = Next) {
+ Next = std::next(I);
+ MachineInstr &MaskMI = *I;
+ switch (MaskMI.getOpcode()) {
+ case AMDGPU::S_AND_B64:
+ case AMDGPU::S_OR_B64:
+ case AMDGPU::S_AND_B32:
+ case AMDGPU::S_OR_B32:
+ // Cleanup bit manipulations on exec mask
+ combineMasks(MaskMI);
+ break;
+ default:
+ I = MBB.end();
+ break;
+ }
+ }
+}
+
bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -503,6 +656,8 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
LIS = getAnalysisIfAvailable<LiveIntervals>();
MRI = &MF.getRegInfo();
BoolRC = TRI->getBoolRC();
+ InsertKillCleanups =
+ MF.getFunction().getCallingConv() == CallingConv::AMDGPU_PS;
if (ST.isWave32()) {
AndOpc = AMDGPU::S_AND_B32;
@@ -524,57 +679,49 @@ bool SILowerControlFlow::runOnMachineFunction(MachineFunction &MF) {
Exec = AMDGPU::EXEC;
}
+ SmallVector<MachineInstr *, 32> Worklist;
+
MachineFunction::iterator NextBB;
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end();
BI != BE; BI = NextBB) {
NextBB = std::next(BI);
MachineBasicBlock &MBB = *BI;
- MachineBasicBlock::iterator I, Next, Last;
-
- for (I = MBB.begin(), Last = MBB.end(); I != MBB.end(); I = Next) {
+ MachineBasicBlock::iterator I, Next;
+ for (I = MBB.begin(); I != MBB.end(); I = Next) {
Next = std::next(I);
MachineInstr &MI = *I;
switch (MI.getOpcode()) {
case AMDGPU::SI_IF:
- emitIf(MI);
+ process(MI);
break;
case AMDGPU::SI_ELSE:
- emitElse(MI);
- break;
-
case AMDGPU::SI_IF_BREAK:
- emitIfBreak(MI);
- break;
-
case AMDGPU::SI_LOOP:
- emitLoop(MI);
- break;
-
case AMDGPU::SI_END_CF:
- emitEndCf(MI);
+ // Only build worklist if SI_IF instructions must be processed first.
+ if (InsertKillCleanups)
+ Worklist.push_back(&MI);
+ else
+ process(MI);
break;
- case AMDGPU::S_AND_B64:
- case AMDGPU::S_OR_B64:
- case AMDGPU::S_AND_B32:
- case AMDGPU::S_OR_B32:
- // Cleanup bit manipulations on exec mask
- combineMasks(MI);
- Last = I;
- continue;
-
default:
- Last = I;
- continue;
+ break;
}
-
- // Replay newly inserted code to combine masks
- Next = (Last == MBB.end()) ? MBB.begin() : Last;
}
}
+ for (MachineInstr *MI : Worklist)
+ process(*MI);
+
+ optimizeEndCf();
+
+ LoweredEndCf.clear();
+ LoweredIf.clear();
+ NeedsKillCleanup.clear();
+
return true;
}
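
The new optimizeEndCf above walks each lowered SI_END_CF, skips instructions that do not read exec (crossing trivially connected blocks), and drops the earlier END_CF when the next relevant one restores a mask saved by a lowered SI_IF. A simplified, self-contained sketch of that mark-and-erase idea (the Inst and Kind types are invented for illustration, not the MachineInstr API):

#include <cstddef>
#include <iostream>
#include <vector>

enum class Kind { Other, EndCf };

struct Inst {
  Kind K;
  bool SavedExecFromIf = false; // analogue of LoweredIf.count(SavedExec)
};

// Return the instruction list with redundant "end_cf"s removed: an end_cf is
// redundant when the next interesting instruction is another end_cf whose
// saved mask came from an "if" (an "else" would carry an inverted mask).
static std::vector<Inst> optimizeEndCf(const std::vector<Inst> &Insts) {
  std::vector<bool> Redundant(Insts.size(), false);
  for (std::size_t I = 0; I < Insts.size(); ++I) {
    if (Insts[I].K != Kind::EndCf)
      continue;
    std::size_t Next = I + 1;
    while (Next < Insts.size() && Insts[Next].K == Kind::Other)
      ++Next; // skip instructions that do not "read exec" in this toy model
    if (Next < Insts.size() && Insts[Next].K == Kind::EndCf &&
        Insts[Next].SavedExecFromIf)
      Redundant[I] = true;
  }
  std::vector<Inst> Out;
  for (std::size_t I = 0; I < Insts.size(); ++I)
    if (!Redundant[I])
      Out.push_back(Insts[I]);
  return Out;
}

int main() {
  std::vector<Inst> Insts = {
      {Kind::EndCf, false}, {Kind::Other}, {Kind::EndCf, true}};
  std::cout << optimizeEndCf(Insts).size() << " instructions remain\n"; // 2
}
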
diff --git a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
index 1d45e6241d22..236a24a02ece 100644
--- a/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerI1Copies.cpp
@@ -452,6 +452,11 @@ static unsigned insertUndefLaneMask(MachineBasicBlock &MBB) {
/// all others, because phi lowering looks through copies and can therefore
/// often make copy lowering unnecessary.
bool SILowerI1Copies::runOnMachineFunction(MachineFunction &TheMF) {
+ // Only need to run this in SelectionDAG path.
+ if (TheMF.getProperties().hasProperty(
+ MachineFunctionProperties::Property::Selected))
+ return false;
+
MF = &TheMF;
MRI = &MF->getRegInfo();
DT = &getAnalysis<MachineDominatorTree>();
diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
index 57ccf7641666..1349d3b6bf3f 100644
--- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
+++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp
@@ -100,7 +100,8 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
unsigned Reg = CS.getReg();
MachineInstrSpan MIS(I, &SaveBlock);
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
TII.storeRegToStackSlot(SaveBlock, I, Reg, true, CS.getFrameIdx(), RC,
TRI);
@@ -118,7 +119,7 @@ static void insertCSRSaves(MachineBasicBlock &SaveBlock,
/// Insert restore code for the callee-saved registers used in the function.
static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
- std::vector<CalleeSavedInfo> &CSI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
LiveIntervals *LIS) {
MachineFunction &MF = *RestoreBlock.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -133,7 +134,8 @@ static void insertCSRRestores(MachineBasicBlock &RestoreBlock,
if (!TFI->restoreCalleeSavedRegisters(RestoreBlock, I, CSI, TRI)) {
for (const CalleeSavedInfo &CI : reverse(CSI)) {
unsigned Reg = CI.getReg();
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
TII.loadRegFromStackSlot(RestoreBlock, I, Reg, CI.getFrameIdx(), RC, TRI);
assert(I != RestoreBlock.begin() &&
@@ -206,10 +208,10 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
for (unsigned I = 0; CSRegs[I]; ++I) {
unsigned Reg = CSRegs[I];
if (SavedRegs.test(Reg)) {
- const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ const TargetRegisterClass *RC =
+ TRI->getMinimalPhysRegClass(Reg, MVT::i32);
int JunkFI = MFI.CreateStackObject(TRI->getSpillSize(*RC),
- TRI->getSpillAlignment(*RC),
- true);
+ TRI->getSpillAlign(*RC), true);
CSI.push_back(CalleeSavedInfo(Reg, JunkFI));
}
@@ -228,6 +230,47 @@ bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) {
return false;
}
+// Find lowest available VGPR and use it as VGPR reserved for SGPR spills.
+static bool lowerShiftReservedVGPR(MachineFunction &MF,
+ const GCNSubtarget &ST) {
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineFrameInfo &FrameInfo = MF.getFrameInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ Register LowestAvailableVGPR, ReservedVGPR;
+ ArrayRef<MCPhysReg> AllVGPR32s = ST.getRegisterInfo()->getAllVGPR32(MF);
+ for (MCPhysReg Reg : AllVGPR32s) {
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg)) {
+ LowestAvailableVGPR = Reg;
+ break;
+ }
+ }
+
+ if (!LowestAvailableVGPR)
+ return false;
+
+ ReservedVGPR = FuncInfo->VGPRReservedForSGPRSpill;
+ const MCPhysReg *CSRegs = MF.getRegInfo().getCalleeSavedRegs();
+ int i = 0;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) {
+ if (Reg.VGPR == ReservedVGPR) {
+ MBB.removeLiveIn(ReservedVGPR);
+ MBB.addLiveIn(LowestAvailableVGPR);
+ Optional<int> FI;
+ if (FuncInfo->isCalleeSavedReg(CSRegs, LowestAvailableVGPR))
+ FI = FrameInfo.CreateSpillStackObject(4, Align(4));
+
+ FuncInfo->setSGPRSpillVGPRs(LowestAvailableVGPR, FI, i);
+ }
+ ++i;
+ }
+ MBB.sortUniqueLiveIns();
+ }
+
+ return true;
+}
+
bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
TII = ST.getInstrInfo();
@@ -267,6 +310,9 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
//
// This operates under the assumption that only other SGPR spills are users
// of the frame index.
+
+ lowerShiftReservedVGPR(MF, ST);
+
for (MachineBasicBlock &MBB : MF) {
MachineBasicBlock::iterator Next;
for (auto I = MBB.begin(), E = MBB.end(); I != E; I = Next) {
@@ -315,6 +361,8 @@ bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) {
}
MadeChange = true;
+ } else if (FuncInfo->VGPRReservedForSGPRSpill) {
+ FuncInfo->removeVGPRForSGPRSpill(FuncInfo->VGPRReservedForSGPRSpill, MF);
}
SaveBlocks.clear();
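
lowerShiftReservedVGPR above retargets the VGPR reserved for SGPR spills to the lowest register that is still allocatable and unused. A minimal stand-alone sketch of that selection and retargeting step, under the assumption that per-register availability is already known (Spill, shiftReservedVGPR and the boolean availability vector are invented for this sketch):

#include <iostream>
#include <optional>
#include <vector>

struct Spill {
  unsigned VGPR;
  std::optional<int> FrameIndex;
};

// Pick the lowest register that is neither used nor unallocatable and
// retarget every spill entry that referenced the old reserved register.
// (The real pass also fixes up block live-ins and may create a CSR slot.)
static bool shiftReservedVGPR(std::vector<Spill> &Spills, unsigned Reserved,
                              const std::vector<bool> &UsedOrUnallocatable,
                              unsigned &NewReserved) {
  unsigned Lowest = 0;
  for (; Lowest < UsedOrUnallocatable.size(); ++Lowest)
    if (!UsedOrUnallocatable[Lowest])
      break;
  if (Lowest == UsedOrUnallocatable.size())
    return false; // nothing lower is free; keep the original reservation

  for (Spill &S : Spills)
    if (S.VGPR == Reserved)
      S.VGPR = Lowest;
  NewReserved = Lowest;
  return true;
}

int main() {
  std::vector<Spill> Spills = {{255, std::nullopt}}; // reserved high register
  std::vector<bool> Used = {true, false, true}; // v0 used, v1 free, v2 used
  unsigned NewReserved = 0;
  if (shiftReservedVGPR(Spills, /*Reserved=*/255, Used, NewReserved))
    std::cout << "reserved spill VGPR moved to v" << NewReserved << "\n";
}
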
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
index 0c67b1467a5d..788e9873f780 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp
@@ -8,6 +8,7 @@
#include "SIMachineFunctionInfo.h"
#include "AMDGPUArgumentUsageInfo.h"
+#include "AMDGPUTargetMachine.h"
#include "AMDGPUSubtarget.h"
#include "SIRegisterInfo.h"
#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
@@ -52,9 +53,18 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
FlatWorkGroupSizes = ST.getFlatWorkGroupSizes(F);
WavesPerEU = ST.getWavesPerEU(F);
- Occupancy = ST.computeOccupancy(MF, getLDSSize());
+ Occupancy = ST.computeOccupancy(F, getLDSSize());
CallingConv::ID CC = F.getCallingConv();
+ // FIXME: Should have analysis or something rather than attribute to detect
+ // calls.
+ const bool HasCalls = F.hasFnAttribute("amdgpu-calls");
+
+ // Enable all kernel inputs if we have the fixed ABI. Don't bother if we don't
+ // have any calls.
+ const bool UseFixedABI = AMDGPUTargetMachine::EnableFixedFunctionABI &&
+ (!isEntryFunction() || HasCalls);
+
if (CC == CallingConv::AMDGPU_KERNEL || CC == CallingConv::SPIR_KERNEL) {
if (!F.arg_empty())
KernargSegmentPtr = true;
@@ -68,16 +78,13 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
// Non-entry functions have no special inputs for now, other registers
// required for scratch access.
ScratchRSrcReg = AMDGPU::SGPR0_SGPR1_SGPR2_SGPR3;
- ScratchWaveOffsetReg = AMDGPU::SGPR33;
// TODO: Pick a high register, and shift down, similar to a kernel.
- FrameOffsetReg = AMDGPU::SGPR34;
+ FrameOffsetReg = AMDGPU::SGPR33;
StackPtrOffsetReg = AMDGPU::SGPR32;
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(ScratchRSrcReg);
- ArgInfo.PrivateSegmentWaveByteOffset =
- ArgDescriptor::createRegister(ScratchWaveOffsetReg);
if (F.hasFnAttribute("amdgpu-implicitarg-ptr"))
ImplicitArgPtr = true;
@@ -89,27 +96,35 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
}
}
- if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+ if (UseFixedABI) {
WorkGroupIDX = true;
-
- if (F.hasFnAttribute("amdgpu-work-group-id-y"))
WorkGroupIDY = true;
-
- if (F.hasFnAttribute("amdgpu-work-group-id-z"))
WorkGroupIDZ = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-x"))
WorkItemIDX = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-y"))
WorkItemIDY = true;
-
- if (F.hasFnAttribute("amdgpu-work-item-id-z"))
WorkItemIDZ = true;
+ ImplicitArgPtr = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-work-group-id-x"))
+ WorkGroupIDX = true;
+
+ if (F.hasFnAttribute("amdgpu-work-group-id-y"))
+ WorkGroupIDY = true;
+
+ if (F.hasFnAttribute("amdgpu-work-group-id-z"))
+ WorkGroupIDZ = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-x"))
+ WorkItemIDX = true;
- const MachineFrameInfo &FrameInfo = MF.getFrameInfo();
- bool HasStackObjects = FrameInfo.hasStackObjects();
+ if (F.hasFnAttribute("amdgpu-work-item-id-y"))
+ WorkItemIDY = true;
+
+ if (F.hasFnAttribute("amdgpu-work-item-id-z"))
+ WorkItemIDZ = true;
+ }
+ bool HasStackObjects = F.hasFnAttribute("amdgpu-stack-objects");
if (isEntryFunction()) {
// X, XY, and XYZ are the only supported combinations, so make sure Y is
// enabled if Z is.
@@ -129,36 +144,34 @@ SIMachineFunctionInfo::SIMachineFunctionInfo(const MachineFunction &MF)
if (isAmdHsaOrMesa) {
PrivateSegmentBuffer = true;
- if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
+ if (UseFixedABI) {
DispatchPtr = true;
-
- if (F.hasFnAttribute("amdgpu-queue-ptr"))
QueuePtr = true;
- if (F.hasFnAttribute("amdgpu-dispatch-id"))
+ // FIXME: We don't need this?
DispatchID = true;
+ } else {
+ if (F.hasFnAttribute("amdgpu-dispatch-ptr"))
+ DispatchPtr = true;
+
+ if (F.hasFnAttribute("amdgpu-queue-ptr"))
+ QueuePtr = true;
+
+ if (F.hasFnAttribute("amdgpu-dispatch-id"))
+ DispatchID = true;
+ }
} else if (ST.isMesaGfxShader(F)) {
ImplicitBufferPtr = true;
}
- if (F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
+ if (UseFixedABI || F.hasFnAttribute("amdgpu-kernarg-segment-ptr"))
KernargSegmentPtr = true;
if (ST.hasFlatAddressSpace() && isEntryFunction() && isAmdHsaOrMesa) {
- auto hasNonSpillStackObjects = [&]() {
- // Avoid expensive checking if there's no stack objects.
- if (!HasStackObjects)
- return false;
- for (auto OI = FrameInfo.getObjectIndexBegin(),
- OE = FrameInfo.getObjectIndexEnd(); OI != OE; ++OI)
- if (!FrameInfo.isSpillSlotObjectIndex(OI))
- return true;
- // All stack objects are spill slots.
- return false;
- };
// TODO: This could be refined a lot. The attribute is a poor way of
- // detecting calls that may require it before argument lowering.
- if (hasNonSpillStackObjects() || F.hasFnAttribute("amdgpu-flat-scratch"))
+ // detecting calls or stack objects that may require it before argument
+ // lowering.
+ if (HasCalls || HasStackObjects)
FlatScratchInit = true;
}
@@ -184,7 +197,7 @@ void SIMachineFunctionInfo::limitOccupancy(const MachineFunction &MF) {
MF.getFunction()));
}
-unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
+Register SIMachineFunctionInfo::addPrivateSegmentBuffer(
const SIRegisterInfo &TRI) {
ArgInfo.PrivateSegmentBuffer =
ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
@@ -193,21 +206,21 @@ unsigned SIMachineFunctionInfo::addPrivateSegmentBuffer(
return ArgInfo.PrivateSegmentBuffer.getRegister();
}
-unsigned SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addDispatchPtr(const SIRegisterInfo &TRI) {
ArgInfo.DispatchPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.DispatchPtr.getRegister();
}
-unsigned SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addQueuePtr(const SIRegisterInfo &TRI) {
ArgInfo.QueuePtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.QueuePtr.getRegister();
}
-unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI) {
ArgInfo.KernargSegmentPtr
= ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
@@ -215,28 +228,29 @@ unsigned SIMachineFunctionInfo::addKernargSegmentPtr(const SIRegisterInfo &TRI)
return ArgInfo.KernargSegmentPtr.getRegister();
}
-unsigned SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addDispatchID(const SIRegisterInfo &TRI) {
ArgInfo.DispatchID = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.DispatchID.getRegister();
}
-unsigned SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addFlatScratchInit(const SIRegisterInfo &TRI) {
ArgInfo.FlatScratchInit = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.FlatScratchInit.getRegister();
}
-unsigned SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
+Register SIMachineFunctionInfo::addImplicitBufferPtr(const SIRegisterInfo &TRI) {
ArgInfo.ImplicitBufferPtr = ArgDescriptor::createRegister(TRI.getMatchingSuperReg(
getNextUserSGPR(), AMDGPU::sub0, &AMDGPU::SReg_64RegClass));
NumUserSGPRs += 2;
return ArgInfo.ImplicitBufferPtr.getRegister();
}
-static bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg) {
+bool SIMachineFunctionInfo::isCalleeSavedReg(const MCPhysReg *CSRegs,
+ MCPhysReg Reg) {
for (unsigned I = 0; CSRegs[I]; ++I) {
if (CSRegs[I] == Reg)
return true;
@@ -270,22 +284,35 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
MachineFrameInfo &FrameInfo = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
unsigned WaveSize = ST.getWavefrontSize();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
unsigned Size = FrameInfo.getObjectSize(FI);
- assert(Size >= 4 && Size <= 64 && "invalid sgpr spill size");
- assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
+ unsigned NumLanes = Size / 4;
+
+ if (NumLanes > WaveSize)
+ return false;
- int NumLanes = Size / 4;
+ assert(Size >= 4 && "invalid sgpr spill size");
+ assert(TRI->spillSGPRToVGPR() && "not spilling SGPRs to VGPRs");
const MCPhysReg *CSRegs = MRI.getCalleeSavedRegs();
// Make sure to handle the case where a wide SGPR spill may span between two
// VGPRs.
- for (int I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
- unsigned LaneVGPR;
+ for (unsigned I = 0; I < NumLanes; ++I, ++NumVGPRSpillLanes) {
+ Register LaneVGPR;
unsigned VGPRIndex = (NumVGPRSpillLanes % WaveSize);
- if (VGPRIndex == 0) {
+    // Reserve a VGPR (when NumVGPRSpillLanes = 0, WaveSize, 2*WaveSize, ..)
+    // when one of the two conditions is true:
+ // 1. One reserved VGPR being tracked by VGPRReservedForSGPRSpill is not yet
+ // reserved.
+ // 2. All spill lanes of reserved VGPR(s) are full and another spill lane is
+ // required.
+ if (FuncInfo->VGPRReservedForSGPRSpill && NumVGPRSpillLanes < WaveSize) {
+ assert(FuncInfo->VGPRReservedForSGPRSpill == SpillVGPRs.back().VGPR);
+ LaneVGPR = FuncInfo->VGPRReservedForSGPRSpill;
+ } else if (VGPRIndex == 0) {
LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF);
if (LaneVGPR == AMDGPU::NoRegister) {
// We have no VGPRs left for spilling SGPRs. Reset because we will not
@@ -298,7 +325,7 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
Optional<int> CSRSpillFI;
if ((FrameInfo.hasCalls() || !isEntryFunction()) && CSRegs &&
isCalleeSavedReg(CSRegs, LaneVGPR)) {
- CSRSpillFI = FrameInfo.CreateSpillStackObject(4, 4);
+ CSRSpillFI = FrameInfo.CreateSpillStackObject(4, Align(4));
}
SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, CSRSpillFI));
@@ -317,6 +344,19 @@ bool SIMachineFunctionInfo::allocateSGPRSpillToVGPR(MachineFunction &MF,
return true;
}
+/// Reserve a VGPR for spilling of SGPRs
+bool SIMachineFunctionInfo::reserveVGPRforSGPRSpills(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ const SIRegisterInfo *TRI = ST.getRegisterInfo();
+ SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+
+ Register LaneVGPR = TRI->findUnusedRegister(
+ MF.getRegInfo(), &AMDGPU::VGPR_32RegClass, MF, true);
+ SpillVGPRs.push_back(SGPRSpillVGPRCSR(LaneVGPR, None));
+ FuncInfo->VGPRReservedForSGPRSpill = LaneVGPR;
+ return true;
+}
+
/// Reserve AGPRs or VGPRs to support spilling for FrameIndex \p FI.
/// Either AGPR is spilled to VGPR to vice versa.
/// Returns true if a \p FI can be eliminated completely.
@@ -386,9 +426,9 @@ bool SIMachineFunctionInfo::allocateVGPRSpillToAGPR(MachineFunction &MF,
}
void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
- // The FP spill hasn't been inserted yet, so keep it around.
+ // The FP & BP spills haven't been inserted yet, so keep them around.
for (auto &R : SGPRToVGPRSpills) {
- if (R.first != FramePointerSaveIndex)
+ if (R.first != FramePointerSaveIndex && R.first != BasePointerSaveIndex)
MFI.RemoveStackObject(R.first);
}
@@ -396,7 +436,7 @@ void SIMachineFunctionInfo::removeDeadFrameIndices(MachineFrameInfo &MFI) {
// ID.
for (int i = MFI.getObjectIndexBegin(), e = MFI.getObjectIndexEnd(); i != e;
++i)
- if (i != FramePointerSaveIndex)
+ if (i != FramePointerSaveIndex && i != BasePointerSaveIndex)
MFI.setStackID(i, TargetStackID::Default);
for (auto &R : VGPRToAGPRSpills) {
@@ -414,7 +454,28 @@ MCPhysReg SIMachineFunctionInfo::getNextSystemSGPR() const {
return AMDGPU::SGPR0 + NumUserSGPRs + NumSystemSGPRs;
}
-static yaml::StringValue regToString(unsigned Reg,
+Register
+SIMachineFunctionInfo::getGITPtrLoReg(const MachineFunction &MF) const {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ if (!ST.isAmdPalOS())
+ return Register();
+ Register GitPtrLo = AMDGPU::SGPR0; // Low GIT address passed in
+ if (ST.hasMergedShaders()) {
+ switch (MF.getFunction().getCallingConv()) {
+ case CallingConv::AMDGPU_HS:
+ case CallingConv::AMDGPU_GS:
+ // Low GIT address is passed in s8 rather than s0 for an LS+HS or
+ // ES+GS merged shader on gfx9+.
+ GitPtrLo = AMDGPU::SGPR8;
+ return GitPtrLo;
+ default:
+ return GitPtrLo;
+ }
+ }
+ return GitPtrLo;
+}
+
+static yaml::StringValue regToString(Register Reg,
const TargetRegisterInfo &TRI) {
yaml::StringValue Dest;
{
@@ -487,7 +548,6 @@ yaml::SIMachineFunctionInfo::SIMachineFunctionInfo(
WaveLimiter(MFI.needsWaveLimiter()),
HighBitsOf32BitAddress(MFI.get32BitAddressHighBits()),
ScratchRSrcReg(regToString(MFI.getScratchRSrcReg(), TRI)),
- ScratchWaveOffsetReg(regToString(MFI.getScratchWaveOffsetReg(), TRI)),
FrameOffsetReg(regToString(MFI.getFrameOffsetReg(), TRI)),
StackPtrOffsetReg(regToString(MFI.getStackPtrOffsetReg(), TRI)),
ArgInfo(convertArgumentInfo(MFI.getArgInfo(), TRI)),
@@ -509,3 +569,21 @@ bool SIMachineFunctionInfo::initializeBaseYamlFields(
WaveLimiter = YamlMFI.WaveLimiter;
return false;
}
+
+// Remove VGPR which was reserved for SGPR spills if there are no spilled SGPRs
+bool SIMachineFunctionInfo::removeVGPRForSGPRSpill(Register ReservedVGPR,
+ MachineFunction &MF) {
+ for (auto *i = SpillVGPRs.begin(); i < SpillVGPRs.end(); i++) {
+ if (i->VGPR == ReservedVGPR) {
+ SpillVGPRs.erase(i);
+
+ for (MachineBasicBlock &MBB : MF) {
+ MBB.removeLiveIn(ReservedVGPR);
+ MBB.sortUniqueLiveIns();
+ }
+ this->VGPRReservedForSGPRSpill = AMDGPU::NoRegister;
+ return true;
+ }
+ }
+ return false;
+}
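
The SIMachineFunctionInfo constructor changes above gate the implicit kernel inputs on a fixed-ABI flag instead of the per-input function attributes. A small, self-contained model of that gating (ImplicitInputs and selectInputs are illustrative names, not the LLVM API):

#include <functional>
#include <iostream>

struct ImplicitInputs {
  bool WorkGroupIDX = false, WorkGroupIDY = false, WorkGroupIDZ = false;
  bool WorkItemIDX = false, WorkItemIDY = false, WorkItemIDZ = false;
  bool ImplicitArgPtr = false;
};

// With the fixed ABI (and either a non-entry function or an entry function
// that makes calls), enable every implicit input; otherwise fall back to the
// per-input attribute queries.
static ImplicitInputs
selectInputs(bool FixedABI, bool IsEntry, bool HasCalls,
             const std::function<bool(const char *)> &HasAttr) {
  ImplicitInputs In;
  const bool UseFixedABI = FixedABI && (!IsEntry || HasCalls);
  if (UseFixedABI) {
    In.WorkGroupIDX = In.WorkGroupIDY = In.WorkGroupIDZ = true;
    In.WorkItemIDX = In.WorkItemIDY = In.WorkItemIDZ = true;
    In.ImplicitArgPtr = true;
  } else {
    In.WorkGroupIDX = HasAttr("amdgpu-work-group-id-x");
    In.WorkGroupIDY = HasAttr("amdgpu-work-group-id-y");
    In.WorkGroupIDZ = HasAttr("amdgpu-work-group-id-z");
    In.WorkItemIDX = HasAttr("amdgpu-work-item-id-x");
    In.WorkItemIDY = HasAttr("amdgpu-work-item-id-y");
    In.WorkItemIDZ = HasAttr("amdgpu-work-item-id-z");
  }
  return In;
}

int main() {
  auto NoAttrs = [](const char *) { return false; };
  ImplicitInputs In = selectInputs(/*FixedABI=*/true, /*IsEntry=*/true,
                                   /*HasCalls=*/true, NoAttrs);
  std::cout << "work-group-id-x enabled: " << In.WorkGroupIDX << "\n"; // 1
}
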
diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
index ef0186f7d57f..cf1629fda0af 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h
@@ -236,23 +236,29 @@ template <> struct MappingTraits<SIArgumentInfo> {
struct SIMode {
bool IEEE = true;
bool DX10Clamp = true;
- bool FP32Denormals = true;
- bool FP64FP16Denormals = true;
+ bool FP32InputDenormals = true;
+ bool FP32OutputDenormals = true;
+ bool FP64FP16InputDenormals = true;
+ bool FP64FP16OutputDenormals = true;
SIMode() = default;
SIMode(const AMDGPU::SIModeRegisterDefaults &Mode) {
IEEE = Mode.IEEE;
DX10Clamp = Mode.DX10Clamp;
- FP32Denormals = Mode.FP32Denormals;
- FP64FP16Denormals = Mode.FP64FP16Denormals;
+ FP32InputDenormals = Mode.FP32InputDenormals;
+ FP32OutputDenormals = Mode.FP32OutputDenormals;
+ FP64FP16InputDenormals = Mode.FP64FP16InputDenormals;
+ FP64FP16OutputDenormals = Mode.FP64FP16OutputDenormals;
}
bool operator ==(const SIMode Other) const {
return IEEE == Other.IEEE &&
DX10Clamp == Other.DX10Clamp &&
- FP32Denormals == Other.FP32Denormals &&
- FP64FP16Denormals == Other.FP64FP16Denormals;
+ FP32InputDenormals == Other.FP32InputDenormals &&
+ FP32OutputDenormals == Other.FP32OutputDenormals &&
+ FP64FP16InputDenormals == Other.FP64FP16InputDenormals &&
+ FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals;
}
};
@@ -260,8 +266,10 @@ template <> struct MappingTraits<SIMode> {
static void mapping(IO &YamlIO, SIMode &Mode) {
YamlIO.mapOptional("ieee", Mode.IEEE, true);
YamlIO.mapOptional("dx10-clamp", Mode.DX10Clamp, true);
- YamlIO.mapOptional("fp32-denormals", Mode.FP32Denormals, true);
- YamlIO.mapOptional("fp64-fp16-denormals", Mode.FP64FP16Denormals, true);
+ YamlIO.mapOptional("fp32-input-denormals", Mode.FP32InputDenormals, true);
+ YamlIO.mapOptional("fp32-output-denormals", Mode.FP32OutputDenormals, true);
+ YamlIO.mapOptional("fp64-fp16-input-denormals", Mode.FP64FP16InputDenormals, true);
+ YamlIO.mapOptional("fp64-fp16-output-denormals", Mode.FP64FP16OutputDenormals, true);
}
};
@@ -276,7 +284,6 @@ struct SIMachineFunctionInfo final : public yaml::MachineFunctionInfo {
uint32_t HighBitsOf32BitAddress = 0;
StringValue ScratchRSrcReg = "$private_rsrc_reg";
- StringValue ScratchWaveOffsetReg = "$scratch_wave_offset_reg";
StringValue FrameOffsetReg = "$fp_reg";
StringValue StackPtrOffsetReg = "$sp_reg";
@@ -303,8 +310,6 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
YamlIO.mapOptional("waveLimiter", MFI.WaveLimiter, false);
YamlIO.mapOptional("scratchRSrcReg", MFI.ScratchRSrcReg,
StringValue("$private_rsrc_reg"));
- YamlIO.mapOptional("scratchWaveOffsetReg", MFI.ScratchWaveOffsetReg,
- StringValue("$scratch_wave_offset_reg"));
YamlIO.mapOptional("frameOffsetReg", MFI.FrameOffsetReg,
StringValue("$fp_reg"));
YamlIO.mapOptional("stackPtrOffsetReg", MFI.StackPtrOffsetReg,
@@ -323,20 +328,20 @@ template <> struct MappingTraits<SIMachineFunctionInfo> {
class SIMachineFunctionInfo final : public AMDGPUMachineFunction {
friend class GCNTargetMachine;
- unsigned TIDReg = AMDGPU::NoRegister;
+ Register TIDReg = AMDGPU::NoRegister;
// Registers that may be reserved for spilling purposes. These may be the same
// as the input registers.
- unsigned ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
- unsigned ScratchWaveOffsetReg = AMDGPU::SCRATCH_WAVE_OFFSET_REG;
+ Register ScratchRSrcReg = AMDGPU::PRIVATE_RSRC_REG;
- // This is the current function's incremented size from the kernel's scratch
- // wave offset register. For an entry function, this is exactly the same as
- // the ScratchWaveOffsetReg.
- unsigned FrameOffsetReg = AMDGPU::FP_REG;
+  // This is the unswizzled offset from the current dispatch's scratch wave
+ // base to the beginning of the current function's frame.
+ Register FrameOffsetReg = AMDGPU::FP_REG;
- // Top of the stack SGPR offset derived from the ScratchWaveOffsetReg.
- unsigned StackPtrOffsetReg = AMDGPU::SP_REG;
+ // This is an ABI register used in the non-entry calling convention to
+ // communicate the unswizzled offset from the current dispatch's scratch wave
+ // base to the beginning of the new function's frame.
+ Register StackPtrOffsetReg = AMDGPU::SP_REG;
AMDGPUFunctionArgInfo ArgInfo;
@@ -429,11 +434,11 @@ private:
public:
struct SpilledReg {
- unsigned VGPR = 0;
+ Register VGPR;
int Lane = -1;
SpilledReg() = default;
- SpilledReg(unsigned R, int L) : VGPR (R), Lane (L) {}
+ SpilledReg(Register R, int L) : VGPR (R), Lane (L) {}
bool hasLane() { return Lane != -1;}
bool hasReg() { return VGPR != 0;}
@@ -441,13 +446,13 @@ public:
struct SGPRSpillVGPRCSR {
// VGPR used for SGPR spills
- unsigned VGPR;
+ Register VGPR;
// If the VGPR is a CSR, the stack slot used to save/restore it in the
// prolog/epilog.
Optional<int> FI;
- SGPRSpillVGPRCSR(unsigned V, Optional<int> F) : VGPR(V), FI(F) {}
+ SGPRSpillVGPRCSR(Register V, Optional<int> F) : VGPR(V), FI(F) {}
};
struct VGPRSpillToAGPR {
@@ -457,12 +462,9 @@ public:
SparseBitVector<> WWMReservedRegs;
- void ReserveWWMRegister(unsigned reg) { WWMReservedRegs.set(reg); }
+ void ReserveWWMRegister(Register Reg) { WWMReservedRegs.set(Reg); }
private:
- // SGPR->VGPR spilling support.
- using SpillRegMask = std::pair<unsigned, unsigned>;
-
// Track VGPR + wave index for each subregister of the SGPR spilled to
// frameindex key.
DenseMap<int, std::vector<SpilledReg>> SGPRToVGPRSpills;
@@ -480,9 +482,17 @@ private:
public: // FIXME
/// If this is set, an SGPR used for save/restore of the register used for the
/// frame pointer.
- unsigned SGPRForFPSaveRestoreCopy = 0;
+ Register SGPRForFPSaveRestoreCopy;
Optional<int> FramePointerSaveIndex;
+ /// If this is set, an SGPR used for save/restore of the register used for the
+ /// base pointer.
+ Register SGPRForBPSaveRestoreCopy;
+ Optional<int> BasePointerSaveIndex;
+
+ Register VGPRReservedForSGPRSpill;
+ bool isCalleeSavedReg(const MCPhysReg *CSRegs, MCPhysReg Reg);
+
public:
SIMachineFunctionInfo(const MachineFunction &MF);
@@ -498,6 +508,14 @@ public:
return SpillVGPRs;
}
+ void setSGPRSpillVGPRs(Register NewVGPR, Optional<int> newFI, int Index) {
+ SpillVGPRs[Index].VGPR = NewVGPR;
+ SpillVGPRs[Index].FI = newFI;
+ VGPRReservedForSGPRSpill = NewVGPR;
+ }
+
+ bool removeVGPRForSGPRSpill(Register ReservedVGPR, MachineFunction &MF);
+
ArrayRef<MCPhysReg> getAGPRSpillVGPRs() const {
return SpillAGPR;
}
@@ -515,12 +533,13 @@ public:
bool haveFreeLanesForSGPRSpill(const MachineFunction &MF,
unsigned NumLane) const;
bool allocateSGPRSpillToVGPR(MachineFunction &MF, int FI);
+ bool reserveVGPRforSGPRSpills(MachineFunction &MF);
bool allocateVGPRSpillToAGPR(MachineFunction &MF, int FI, bool isAGPRtoVGPR);
void removeDeadFrameIndices(MachineFrameInfo &MFI);
bool hasCalculatedTID() const { return TIDReg != 0; };
- unsigned getTIDReg() const { return TIDReg; };
- void setTIDReg(unsigned Reg) { TIDReg = Reg; }
+ Register getTIDReg() const { return TIDReg; };
+ void setTIDReg(Register Reg) { TIDReg = Reg; }
unsigned getBytesInStackArgArea() const {
return BytesInStackArgArea;
@@ -531,34 +550,34 @@ public:
}
// Add user SGPRs.
- unsigned addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
- unsigned addDispatchPtr(const SIRegisterInfo &TRI);
- unsigned addQueuePtr(const SIRegisterInfo &TRI);
- unsigned addKernargSegmentPtr(const SIRegisterInfo &TRI);
- unsigned addDispatchID(const SIRegisterInfo &TRI);
- unsigned addFlatScratchInit(const SIRegisterInfo &TRI);
- unsigned addImplicitBufferPtr(const SIRegisterInfo &TRI);
+ Register addPrivateSegmentBuffer(const SIRegisterInfo &TRI);
+ Register addDispatchPtr(const SIRegisterInfo &TRI);
+ Register addQueuePtr(const SIRegisterInfo &TRI);
+ Register addKernargSegmentPtr(const SIRegisterInfo &TRI);
+ Register addDispatchID(const SIRegisterInfo &TRI);
+ Register addFlatScratchInit(const SIRegisterInfo &TRI);
+ Register addImplicitBufferPtr(const SIRegisterInfo &TRI);
// Add system SGPRs.
- unsigned addWorkGroupIDX() {
+ Register addWorkGroupIDX() {
ArgInfo.WorkGroupIDX = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDX.getRegister();
}
- unsigned addWorkGroupIDY() {
+ Register addWorkGroupIDY() {
ArgInfo.WorkGroupIDY = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDY.getRegister();
}
- unsigned addWorkGroupIDZ() {
+ Register addWorkGroupIDZ() {
ArgInfo.WorkGroupIDZ = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupIDZ.getRegister();
}
- unsigned addWorkGroupInfo() {
+ Register addWorkGroupInfo() {
ArgInfo.WorkGroupInfo = ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.WorkGroupInfo.getRegister();
@@ -577,14 +596,14 @@ public:
ArgInfo.WorkItemIDZ = Arg;
}
- unsigned addPrivateSegmentWaveByteOffset() {
+ Register addPrivateSegmentWaveByteOffset() {
ArgInfo.PrivateSegmentWaveByteOffset
= ArgDescriptor::createRegister(getNextSystemSGPR());
NumSystemSGPRs += 1;
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
- void setPrivateSegmentWaveByteOffset(unsigned Reg) {
+ void setPrivateSegmentWaveByteOffset(Register Reg) {
ArgInfo.PrivateSegmentWaveByteOffset = ArgDescriptor::createRegister(Reg);
}
@@ -660,13 +679,13 @@ public:
return ArgInfo;
}
- std::pair<const ArgDescriptor *, const TargetRegisterClass *>
+ std::tuple<const ArgDescriptor *, const TargetRegisterClass *, LLT>
getPreloadedValue(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
return ArgInfo.getPreloadedValue(Value);
}
Register getPreloadedReg(AMDGPUFunctionArgInfo::PreloadedValue Value) const {
- auto Arg = ArgInfo.getPreloadedValue(Value).first;
+ auto Arg = std::get<0>(ArgInfo.getPreloadedValue(Value));
return Arg ? Arg->getRegister() : Register();
}
@@ -674,6 +693,8 @@ public:
return GITPtrHigh;
}
+ Register getGITPtrLoReg(const MachineFunction &MF) const;
+
uint32_t get32BitAddressHighBits() const {
return HighBitsOf32BitAddress;
}
@@ -690,35 +711,31 @@ public:
return NumUserSGPRs + NumSystemSGPRs;
}
- unsigned getPrivateSegmentWaveByteOffsetSystemSGPR() const {
+ Register getPrivateSegmentWaveByteOffsetSystemSGPR() const {
return ArgInfo.PrivateSegmentWaveByteOffset.getRegister();
}
/// Returns the physical register reserved for use as the resource
/// descriptor for scratch accesses.
- unsigned getScratchRSrcReg() const {
+ Register getScratchRSrcReg() const {
return ScratchRSrcReg;
}
- void setScratchRSrcReg(unsigned Reg) {
+ void setScratchRSrcReg(Register Reg) {
assert(Reg != 0 && "Should never be unset");
ScratchRSrcReg = Reg;
}
- unsigned getScratchWaveOffsetReg() const {
- return ScratchWaveOffsetReg;
- }
-
- unsigned getFrameOffsetReg() const {
+ Register getFrameOffsetReg() const {
return FrameOffsetReg;
}
- void setFrameOffsetReg(unsigned Reg) {
+ void setFrameOffsetReg(Register Reg) {
assert(Reg != 0 && "Should never be unset");
FrameOffsetReg = Reg;
}
- void setStackPtrOffsetReg(unsigned Reg) {
+ void setStackPtrOffsetReg(Register Reg) {
assert(Reg != 0 && "Should never be unset");
StackPtrOffsetReg = Reg;
}
@@ -727,20 +744,15 @@ public:
// NoRegister. This is mostly a workaround for MIR tests where state that
// can't be directly computed from the function is not preserved in serialized
// MIR.
- unsigned getStackPtrOffsetReg() const {
+ Register getStackPtrOffsetReg() const {
return StackPtrOffsetReg;
}
- void setScratchWaveOffsetReg(unsigned Reg) {
- assert(Reg != 0 && "Should never be unset");
- ScratchWaveOffsetReg = Reg;
- }
-
- unsigned getQueuePtrUserSGPR() const {
+ Register getQueuePtrUserSGPR() const {
return ArgInfo.QueuePtr.getRegister();
}
- unsigned getImplicitBufferPtrUserSGPR() const {
+ Register getImplicitBufferPtrUserSGPR() const {
return ArgInfo.ImplicitBufferPtr.getRegister();
}
@@ -853,7 +865,7 @@ public:
}
/// \returns SGPR used for \p Dim's work group ID.
- unsigned getWorkGroupIDSGPR(unsigned Dim) const {
+ Register getWorkGroupIDSGPR(unsigned Dim) const {
switch (Dim) {
case 0:
assert(hasWorkGroupIDX());
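
Among the header changes above, getPreloadedValue now returns a std::tuple carrying an extra LLT, so callers switch from .first to std::get<0>. A tiny stand-alone example of that migration (Descriptor, RegClass and LLTType are placeholder types, not the AMDGPU ones):

#include <iostream>
#include <tuple>

struct Descriptor { int Reg; };
struct RegClass {};
struct LLTType { unsigned Bits; };

// Returning a tuple leaves room for the additional type element while keeping
// the descriptor in slot 0.
static std::tuple<const Descriptor *, const RegClass *, LLTType>
getPreloadedValue(const Descriptor &D, const RegClass &RC) {
  return std::make_tuple(&D, &RC, LLTType{32});
}

int main() {
  Descriptor D{42};
  RegClass RC;
  auto Value = getPreloadedValue(D, RC);
  const Descriptor *Arg = std::get<0>(Value); // was: Value.first
  std::cout << "preloaded reg " << (Arg ? Arg->Reg : 0) << ", type width "
            << std::get<2>(Value).Bits << "\n";
}
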
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
index 004a3cb185d6..3ba05aadbbbe 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.cpp
@@ -269,8 +269,8 @@ SUnit* SIScheduleBlock::pickNode() {
// Predict register usage after this instruction.
TryCand.SU = SU;
TopRPTracker.getDownwardPressure(SU->getInstr(), pressure, MaxPressure);
- TryCand.SGPRUsage = pressure[DAG->getSGPRSetID()];
- TryCand.VGPRUsage = pressure[DAG->getVGPRSetID()];
+ TryCand.SGPRUsage = pressure[AMDGPU::RegisterPressureSets::SReg_32];
+ TryCand.VGPRUsage = pressure[AMDGPU::RegisterPressureSets::VGPR_32];
TryCand.IsLowLatency = DAG->IsLowLatencySU[SU->NodeNum];
TryCand.LowLatencyOffset = DAG->LowLatencyOffset[SU->NodeNum];
TryCand.HasLowLatencyNonWaitedParent =
@@ -595,10 +595,12 @@ void SIScheduleBlock::printDebug(bool full) {
}
if (Scheduled) {
- dbgs() << "LiveInPressure " << LiveInPressure[DAG->getSGPRSetID()] << ' '
- << LiveInPressure[DAG->getVGPRSetID()] << '\n';
- dbgs() << "LiveOutPressure " << LiveOutPressure[DAG->getSGPRSetID()] << ' '
- << LiveOutPressure[DAG->getVGPRSetID()] << "\n\n";
+ dbgs() << "LiveInPressure "
+ << LiveInPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
+ << LiveInPressure[AMDGPU::RegisterPressureSets::VGPR_32] << '\n';
+ dbgs() << "LiveOutPressure "
+ << LiveOutPressure[AMDGPU::RegisterPressureSets::SReg_32] << ' '
+ << LiveOutPressure[AMDGPU::RegisterPressureSets::VGPR_32] << "\n\n";
dbgs() << "LiveIns:\n";
for (unsigned Reg : LiveInRegs)
dbgs() << printVRegOrUnit(Reg, DAG->getTRI()) << ' ';
@@ -1637,7 +1639,7 @@ SIScheduleBlock *SIScheduleBlockScheduler::pickBlock() {
TryCand.IsHighLatency = TryCand.Block->isHighLatencyBlock();
TryCand.VGPRUsageDiff =
checkRegUsageImpact(TryCand.Block->getInRegs(),
- TryCand.Block->getOutRegs())[DAG->getVGPRSetID()];
+ TryCand.Block->getOutRegs())[AMDGPU::RegisterPressureSets::VGPR_32];
TryCand.NumSuccessors = TryCand.Block->getSuccs().size();
TryCand.NumHighLatencySuccessors =
TryCand.Block->getNumHighLatencySuccessors();
@@ -1796,9 +1798,6 @@ SIScheduleDAGMI::SIScheduleDAGMI(MachineSchedContext *C) :
ScheduleDAGMILive(C, std::make_unique<GenericScheduler>(C)) {
SITII = static_cast<const SIInstrInfo*>(TII);
SITRI = static_cast<const SIRegisterInfo*>(TRI);
-
- VGPRSetID = SITRI->getVGPRPressureSet();
- SGPRSetID = SITRI->getSGPRPressureSet();
}
SIScheduleDAGMI::~SIScheduleDAGMI() = default;
@@ -1909,9 +1908,9 @@ SIScheduleDAGMI::fillVgprSgprCost(_Iterator First, _Iterator End,
continue;
PSetIterator PSetI = MRI.getPressureSets(Reg);
for (; PSetI.isValid(); ++PSetI) {
- if (*PSetI == VGPRSetID)
+ if (*PSetI == AMDGPU::RegisterPressureSets::VGPR_32)
VgprUsage += PSetI.getWeight();
- else if (*PSetI == SGPRSetID)
+ else if (*PSetI == AMDGPU::RegisterPressureSets::SReg_32)
SgprUsage += PSetI.getWeight();
}
}
@@ -1952,10 +1951,11 @@ void SIScheduleDAGMI::schedule()
int64_t OffLatReg;
if (SITII->isLowLatencyInstruction(*SU->getInstr())) {
IsLowLatencySU[i] = 1;
+ bool OffsetIsScalable;
if (SITII->getMemOperandWithOffset(*SU->getInstr(), BaseLatOp, OffLatReg,
- TRI))
+ OffsetIsScalable, TRI))
LowLatencyOffset[i] = OffLatReg;
- } else if (SITII->isHighLatencyInstruction(*SU->getInstr()))
+ } else if (SITII->isHighLatencyDef(SU->getInstr()->getOpcode()))
IsHighLatencySU[i] = 1;
}
diff --git a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
index ec450a316467..02e0a3fe1b61 100644
--- a/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
+++ b/llvm/lib/Target/AMDGPU/SIMachineScheduler.h
@@ -435,9 +435,6 @@ class SIScheduleDAGMI final : public ScheduleDAGMILive {
std::vector<unsigned> ScheduledSUnits;
std::vector<unsigned> ScheduledSUnitsInv;
- unsigned VGPRSetID;
- unsigned SGPRSetID;
-
public:
SIScheduleDAGMI(MachineSchedContext *C);
@@ -484,9 +481,6 @@ public:
return OutRegs;
};
- unsigned getVGPRSetID() const { return VGPRSetID; }
- unsigned getSGPRSetID() const { return SGPRSetID; }
-
private:
void topologicalSort();
// After scheduling is done, improve low latency placements.
diff --git a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
index e914573306ae..4e6c72ca20e2 100644
--- a/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
+++ b/llvm/lib/Target/AMDGPU/SIMemoryLegalizer.cpp
@@ -254,6 +254,9 @@ protected:
IsaVersion IV;
+ /// Whether to insert cache invalidation instructions.
+ bool InsertCacheInv;
+
SICacheControl(const GCNSubtarget &ST);
public:
@@ -650,6 +653,7 @@ Optional<SIMemOpInfo> SIMemOpAccess::getAtomicCmpxchgOrRmwInfo(
SICacheControl::SICacheControl(const GCNSubtarget &ST) {
TII = ST.getInstrInfo();
IV = getIsaVersion(ST.getCPU());
+ InsertCacheInv = !ST.isAmdPalOS();
}
/* static */
@@ -714,6 +718,9 @@ bool SIGfx6CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -852,6 +859,9 @@ bool SIGfx7CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -954,6 +964,9 @@ bool SIGfx10CacheControl::insertCacheInvalidate(MachineBasicBlock::iterator &MI,
SIAtomicScope Scope,
SIAtomicAddrSpace AddrSpace,
Position Pos) const {
+ if (!InsertCacheInv)
+ return false;
+
bool Changed = false;
MachineBasicBlock &MBB = *MI->getParent();
@@ -1289,6 +1302,21 @@ bool SIMemoryLegalizer::runOnMachineFunction(MachineFunction &MF) {
for (auto &MBB : MF) {
for (auto MI = MBB.begin(); MI != MBB.end(); ++MI) {
+
+ if (MI->getOpcode() == TargetOpcode::BUNDLE && MI->mayLoadOrStore()) {
+ MachineBasicBlock::instr_iterator II(MI->getIterator());
+ for (MachineBasicBlock::instr_iterator I = ++II, E = MBB.instr_end();
+ I != E && I->isBundledWithPred(); ++I) {
+ I->unbundleFromPred();
+ for (MachineOperand &MO : I->operands())
+ if (MO.isReg())
+ MO.setIsInternalRead(false);
+ }
+
+ MI->eraseFromParent();
+ MI = II->getIterator();
+ }
+
if (!(MI->getDesc().TSFlags & SIInstrFlags::maybeAtomic))
continue;
diff --git a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
index 52989a280e80..0e162ac42c11 100644
--- a/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
+++ b/llvm/lib/Target/AMDGPU/SIModeRegister.cpp
@@ -83,9 +83,7 @@ struct Status {
return ((Mask & S.Mask) == S.Mask) && ((Mode & S.Mask) == S.Mode);
}
- bool isCombinable(Status &S) {
- return !(Mask & S.Mask) || isCompatible(S);
- }
+ bool isCombinable(Status &S) { return !(Mask & S.Mask) || isCompatible(S); }
};
class BlockData {
@@ -110,7 +108,11 @@ public:
// which is used in Phase 3 if we need to insert a mode change.
MachineInstr *FirstInsertionPoint;
- BlockData() : FirstInsertionPoint(nullptr) {};
+ // A flag to indicate whether an Exit value has been set (we can't tell by
+ // examining the Exit value itself as all values may be valid results).
+ bool ExitSet;
+
+ BlockData() : FirstInsertionPoint(nullptr), ExitSet(false) {}
};
namespace {
@@ -131,6 +133,8 @@ public:
Status DefaultStatus =
Status(FP_ROUND_MODE_DP(0x3), FP_ROUND_MODE_DP(DefaultMode));
+ bool Changed = false;
+
public:
SIModeRegister() : MachineFunctionPass(ID) {}
@@ -201,6 +205,7 @@ void SIModeRegister::insertSetreg(MachineBasicBlock &MBB, MachineInstr *MI,
(Offset << AMDGPU::Hwreg::OFFSET_SHIFT_) |
(AMDGPU::Hwreg::ID_MODE << AMDGPU::Hwreg::ID_SHIFT_));
++NumSetregInserted;
+ Changed = true;
InstrMode.Mask &= ~(((1 << Width) - 1) << Offset);
}
}
@@ -325,24 +330,53 @@ void SIModeRegister::processBlockPhase1(MachineBasicBlock &MBB,
// exit value is propagated.
void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
const SIInstrInfo *TII) {
-// BlockData *BI = BlockInfo[MBB.getNumber()];
+ bool RevisitRequired = false;
+ bool ExitSet = false;
unsigned ThisBlock = MBB.getNumber();
if (MBB.pred_empty()) {
// There are no predecessors, so use the default starting status.
BlockInfo[ThisBlock]->Pred = DefaultStatus;
+ ExitSet = true;
} else {
// Build a status that is common to all the predecessors by intersecting
// all the predecessor exit status values.
+ // Mask bits (which represent the Mode bits with a known value) can only be
+ // added by explicit SETREG instructions or the initial default value -
+ // the intersection process may remove Mask bits.
+ // If we find a predecessor that has not yet had an exit value determined
+ // (this can happen, for example, if a block is its own predecessor) we defer
+ // using that value, as its Mask will be all zero, and we will revisit this
+ // block later (unless the only predecessor without an exit value is this
+ // block).
MachineBasicBlock::pred_iterator P = MBB.pred_begin(), E = MBB.pred_end();
MachineBasicBlock &PB = *(*P);
- BlockInfo[ThisBlock]->Pred = BlockInfo[PB.getNumber()]->Exit;
+ unsigned PredBlock = PB.getNumber();
+ if ((ThisBlock == PredBlock) && (std::next(P) == E)) {
+ BlockInfo[ThisBlock]->Pred = DefaultStatus;
+ ExitSet = true;
+ } else if (BlockInfo[PredBlock]->ExitSet) {
+ BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit;
+ ExitSet = true;
+ } else if (PredBlock != ThisBlock)
+ RevisitRequired = true;
for (P = std::next(P); P != E; P = std::next(P)) {
MachineBasicBlock *Pred = *P;
- BlockInfo[ThisBlock]->Pred = BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[Pred->getNumber()]->Exit);
+ unsigned PredBlock = Pred->getNumber();
+ if (BlockInfo[PredBlock]->ExitSet) {
+ if (BlockInfo[ThisBlock]->ExitSet) {
+ BlockInfo[ThisBlock]->Pred =
+ BlockInfo[ThisBlock]->Pred.intersect(BlockInfo[PredBlock]->Exit);
+ } else {
+ BlockInfo[ThisBlock]->Pred = BlockInfo[PredBlock]->Exit;
+ }
+ ExitSet = true;
+ } else if (PredBlock != ThisBlock)
+ RevisitRequired = true;
}
}
- Status TmpStatus = BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
+ Status TmpStatus =
+ BlockInfo[ThisBlock]->Pred.merge(BlockInfo[ThisBlock]->Change);
if (BlockInfo[ThisBlock]->Exit != TmpStatus) {
BlockInfo[ThisBlock]->Exit = TmpStatus;
// Add the successors to the work list so we can propagate the changed exit
@@ -354,6 +388,9 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
Phase2List.push(&B);
}
}
+ BlockInfo[ThisBlock]->ExitSet = ExitSet;
+ if (RevisitRequired)
+ Phase2List.push(&MBB);
}
// In Phase 3 we revisit each block and if it has an insertion point defined we
@@ -361,10 +398,10 @@ void SIModeRegister::processBlockPhase2(MachineBasicBlock &MBB,
// not we insert an appropriate setreg instruction to modify the Mode register.
void SIModeRegister::processBlockPhase3(MachineBasicBlock &MBB,
const SIInstrInfo *TII) {
-// BlockData *BI = BlockInfo[MBB.getNumber()];
unsigned ThisBlock = MBB.getNumber();
if (!BlockInfo[ThisBlock]->Pred.isCompatible(BlockInfo[ThisBlock]->Require)) {
- Status Delta = BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
+ Status Delta =
+ BlockInfo[ThisBlock]->Pred.delta(BlockInfo[ThisBlock]->Require);
if (BlockInfo[ThisBlock]->FirstInsertionPoint)
insertSetreg(MBB, BlockInfo[ThisBlock]->FirstInsertionPoint, TII, Delta);
else
@@ -401,5 +438,5 @@ bool SIModeRegister::runOnMachineFunction(MachineFunction &MF) {
BlockInfo.clear();
- return NumSetregInserted > 0;
+ return Changed;
}
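
The Phase 2 change above leans on the intersection semantics of the pass's Status type: Mask bits (mode bits with a known value) can only be removed by intersecting predecessor exits, never added, which is why a predecessor without a computed exit value has to be deferred rather than intersected. Below is a minimal stand-alone model of that behaviour, with invented names (ModeBits) and plain integers rather than the pass's real definitions:

// Stand-alone model of the "known mode bits" lattice used by Phase 2 above.
// Not the pass's real definitions; just the semantics the comments describe.
#include <cassert>
#include <cstdint>

struct ModeBits {
  uint32_t Mask; // set bits: MODE bits whose value is known
  uint32_t Mode; // the known values for those bits
  ModeBits(uint32_t M = 0, uint32_t V = 0) : Mask(M), Mode(V) {}

  // Keep only the bits both sides know and agree on. Disagreeing or unknown
  // bits become unknown again, so intersection can only shrink Mask; this is
  // why a predecessor with no computed exit value (all-zero Mask) must be
  // deferred rather than intersected.
  ModeBits intersect(const ModeBits &O) const {
    uint32_t NewMask = Mask & O.Mask & ~(Mode ^ O.Mode);
    return ModeBits(NewMask, Mode & NewMask);
  }
};

int main() {
  ModeBits A(0xF, 0x5); // knows four bits
  ModeBits B(0x3, 0x1); // knows two bits and agrees on both
  ModeBits C = A.intersect(B);
  assert(C.Mask == 0x3 && C.Mode == 0x1);
  ModeBits Unvisited;   // unknown exit: intersecting it would wipe everything
  assert(A.intersect(Unvisited).Mask == 0);
  return 0;
}
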
diff --git a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
index 34199d3e425c..8af00fcf62a8 100644
--- a/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIOptimizeExecMaskingPreRA.cpp
@@ -7,15 +7,8 @@
//===----------------------------------------------------------------------===//
//
/// \file
-/// This pass removes redundant S_OR_B64 instructions enabling lanes in
-/// the exec. If two SI_END_CF (lowered as S_OR_B64) come together without any
-/// vector instructions between them we can only keep outer SI_END_CF, given
-/// that CFG is structured and exec bits of the outer end statement are always
-/// not less than exec bit of the inner one.
-///
-/// This needs to be done before the RA to eliminate saved exec bits registers
-/// but after register coalescer to have no vector registers copies in between
-/// of different end cf statements.
+/// This pass performs exec mask handling peephole optimizations which need
+/// to be done before register allocation to reduce register pressure.
///
//===----------------------------------------------------------------------===//
@@ -40,14 +33,6 @@ private:
MachineRegisterInfo *MRI;
public:
- MachineBasicBlock::iterator skipIgnoreExecInsts(
- MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const;
-
- MachineBasicBlock::iterator skipIgnoreExecInstsTrivialSucc(
- MachineBasicBlock *&MBB,
- MachineBasicBlock::iterator It) const;
-
-public:
static char ID;
SIOptimizeExecMaskingPreRA() : MachineFunctionPass(ID) {
@@ -83,93 +68,15 @@ FunctionPass *llvm::createSIOptimizeExecMaskingPreRAPass() {
return new SIOptimizeExecMaskingPreRA();
}
-static bool isEndCF(const MachineInstr &MI, const SIRegisterInfo *TRI,
- const GCNSubtarget &ST) {
- if (ST.isWave32()) {
- return MI.getOpcode() == AMDGPU::S_OR_B32 &&
- MI.modifiesRegister(AMDGPU::EXEC_LO, TRI);
- }
-
- return MI.getOpcode() == AMDGPU::S_OR_B64 &&
- MI.modifiesRegister(AMDGPU::EXEC, TRI);
-}
-
static bool isFullExecCopy(const MachineInstr& MI, const GCNSubtarget& ST) {
unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- if (MI.isCopy() && MI.getOperand(1).getReg() == Exec) {
- assert(MI.isFullCopy());
+ if (MI.isFullCopy() && MI.getOperand(1).getReg() == Exec)
return true;
- }
return false;
}
-static unsigned getOrNonExecReg(const MachineInstr &MI,
- const SIInstrInfo &TII,
- const GCNSubtarget& ST) {
- unsigned Exec = ST.isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
- auto Op = TII.getNamedOperand(MI, AMDGPU::OpName::src1);
- if (Op->isReg() && Op->getReg() != Exec)
- return Op->getReg();
- Op = TII.getNamedOperand(MI, AMDGPU::OpName::src0);
- if (Op->isReg() && Op->getReg() != Exec)
- return Op->getReg();
- return AMDGPU::NoRegister;
-}
-
-static MachineInstr* getOrExecSource(const MachineInstr &MI,
- const SIInstrInfo &TII,
- const MachineRegisterInfo &MRI,
- const GCNSubtarget& ST) {
- auto SavedExec = getOrNonExecReg(MI, TII, ST);
- if (SavedExec == AMDGPU::NoRegister)
- return nullptr;
- auto SaveExecInst = MRI.getUniqueVRegDef(SavedExec);
- if (!SaveExecInst || !isFullExecCopy(*SaveExecInst, ST))
- return nullptr;
- return SaveExecInst;
-}
-
-/// Skip over instructions that don't care about the exec mask.
-MachineBasicBlock::iterator SIOptimizeExecMaskingPreRA::skipIgnoreExecInsts(
- MachineBasicBlock::iterator I, MachineBasicBlock::iterator E) const {
- for ( ; I != E; ++I) {
- if (TII->mayReadEXEC(*MRI, *I))
- break;
- }
-
- return I;
-}
-
-// Skip to the next instruction, ignoring debug instructions, and trivial block
-// boundaries (blocks that have one (typically fallthrough) successor, and the
-// successor has one predecessor.
-MachineBasicBlock::iterator
-SIOptimizeExecMaskingPreRA::skipIgnoreExecInstsTrivialSucc(
- MachineBasicBlock *&MBB,
- MachineBasicBlock::iterator It) const {
-
- do {
- It = skipIgnoreExecInsts(It, MBB->end());
- if (It != MBB->end() || MBB->succ_size() != 1)
- break;
-
- // If there is one trivial successor, advance to the next block.
- MachineBasicBlock *Succ = *MBB->succ_begin();
-
- // TODO: Is this really necessary?
- if (!MBB->isLayoutSuccessor(Succ))
- break;
-
- It = Succ->begin();
- MBB = Succ;
- } while (true);
-
- return It;
-}
-
-
// Optimize sequence
// %sel = V_CNDMASK_B32_e64 0, 1, %cc
// %cmp = V_CMP_NE_U32 1, %1
@@ -261,6 +168,11 @@ static unsigned optimizeVcndVcmpPair(MachineBasicBlock &MBB,
And->getOperand(0).getReg())
.addReg(ExecReg)
.addReg(CCReg, getUndefRegState(CC->isUndef()), CC->getSubReg());
+ MachineOperand &AndSCC = And->getOperand(3);
+ assert(AndSCC.getReg() == AMDGPU::SCC);
+ MachineOperand &Andn2SCC = Andn2->getOperand(3);
+ assert(Andn2SCC.getReg() == AMDGPU::SCC);
+ Andn2SCC.setIsDead(AndSCC.isDead());
And->eraseFromParent();
LIS->InsertMachineInstrInMaps(*Andn2);
@@ -379,57 +291,30 @@ bool SIOptimizeExecMaskingPreRA::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- // Try to collapse adjacent endifs.
- auto E = MBB.end();
- auto Lead = skipDebugInstructionsForward(MBB.begin(), E);
- if (MBB.succ_size() != 1 || Lead == E || !isEndCF(*Lead, TRI, ST))
- continue;
-
- MachineBasicBlock *TmpMBB = &MBB;
- auto NextLead = skipIgnoreExecInstsTrivialSucc(TmpMBB, std::next(Lead));
- if (NextLead == TmpMBB->end() || !isEndCF(*NextLead, TRI, ST) ||
- !getOrExecSource(*NextLead, *TII, MRI, ST))
- continue;
-
- LLVM_DEBUG(dbgs() << "Redundant EXEC = S_OR_B64 found: " << *Lead << '\n');
-
- auto SaveExec = getOrExecSource(*Lead, *TII, MRI, ST);
- unsigned SaveExecReg = getOrNonExecReg(*Lead, *TII, ST);
- for (auto &Op : Lead->operands()) {
- if (Op.isReg())
- RecalcRegs.insert(Op.getReg());
- }
-
- LIS->RemoveMachineInstrFromMaps(*Lead);
- Lead->eraseFromParent();
- if (SaveExecReg) {
- LIS->removeInterval(SaveExecReg);
- LIS->createAndComputeVirtRegInterval(SaveExecReg);
- }
-
- Changed = true;
-
- // If the only use of saved exec in the removed instruction is S_AND_B64
- // fold the copy now.
- if (!SaveExec || !SaveExec->isFullCopy())
- continue;
+ // If the only user of a logical operation is a move to exec, fold it now
+ // to prevent forming a saveexec. I.e.:
+ //
+ // %0:sreg_64 = COPY $exec
+ // %1:sreg_64 = S_AND_B64 %0:sreg_64, %2:sreg_64
+ // =>
+ // %1 = S_AND_B64 $exec, %2:sreg_64
+ unsigned ScanThreshold = 10;
+ for (auto I = MBB.rbegin(), E = MBB.rend(); I != E
+ && ScanThreshold--; ++I) {
+ if (!isFullExecCopy(*I, ST))
+ continue;
- Register SavedExec = SaveExec->getOperand(0).getReg();
- bool SafeToReplace = true;
- for (auto& U : MRI.use_nodbg_instructions(SavedExec)) {
- if (U.getParent() != SaveExec->getParent()) {
- SafeToReplace = false;
- break;
+ Register SavedExec = I->getOperand(0).getReg();
+ if (SavedExec.isVirtual() && MRI.hasOneNonDBGUse(SavedExec) &&
+ MRI.use_instr_nodbg_begin(SavedExec)->getParent() == I->getParent()) {
+ LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *I << '\n');
+ LIS->RemoveMachineInstrFromMaps(*I);
+ I->eraseFromParent();
+ MRI.replaceRegWith(SavedExec, Exec);
+ LIS->removeInterval(SavedExec);
+ Changed = true;
}
-
- LLVM_DEBUG(dbgs() << "Redundant EXEC COPY: " << *SaveExec << '\n');
- }
-
- if (SafeToReplace) {
- LIS->RemoveMachineInstrFromMaps(*SaveExec);
- SaveExec->eraseFromParent();
- MRI.replaceRegWith(SavedExec, Exec);
- LIS->removeInterval(SavedExec);
+ break;
}
}
diff --git a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
index 05c81feb23ec..9a1855c3458b 100644
--- a/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
+++ b/llvm/lib/Target/AMDGPU/SIPeepholeSDWA.cpp
@@ -244,11 +244,6 @@ static raw_ostream& operator<<(raw_ostream &OS, const DstUnused &Un) {
return OS;
}
-static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
- Operand.print(OS);
- return OS;
-}
-
LLVM_DUMP_METHOD
void SDWASrcOperand::print(raw_ostream& OS) const {
OS << "SDWA src: " << *getTargetOperand()
@@ -850,6 +845,13 @@ SIPeepholeSDWA::matchSDWAOperand(MachineInstr &MI) {
return std::unique_ptr<SDWAOperand>(nullptr);
}
+#if !defined(NDEBUG)
+static raw_ostream& operator<<(raw_ostream &OS, const SDWAOperand &Operand) {
+ Operand.print(OS);
+ return OS;
+}
+#endif
+
void SIPeepholeSDWA::matchSDWAOperands(MachineBasicBlock &MBB) {
for (MachineInstr &MI : MBB) {
if (auto Operand = matchSDWAOperand(MI)) {
@@ -920,18 +922,24 @@ void SIPeepholeSDWA::pseudoOpConvertToVOP2(MachineInstr &MI,
if (I->modifiesRegister(AMDGPU::VCC, TRI))
return;
}
+
// Make the two new e32 instruction variants.
// Replace MI with V_{SUB|ADD}_I32_e32
- auto NewMI = BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc));
- NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst));
- NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0));
- NewMI.add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1));
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(Opc))
+ .add(*TII->getNamedOperand(MI, AMDGPU::OpName::vdst))
+ .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src0))
+ .add(*TII->getNamedOperand(MI, AMDGPU::OpName::src1))
+ .setMIFlags(MI.getFlags());
+
MI.eraseFromParent();
+
// Replace MISucc with V_{SUBB|ADDC}_U32_e32
- auto NewInst = BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc));
- NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst));
- NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0));
- NewInst.add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1));
+ BuildMI(MBB, MISucc, MISucc.getDebugLoc(), TII->get(SuccOpc))
+ .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::vdst))
+ .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src0))
+ .add(*TII->getNamedOperand(MISucc, AMDGPU::OpName::src1))
+ .setMIFlags(MISucc.getFlags());
+
MISucc.eraseFromParent();
}
@@ -1008,7 +1016,8 @@ bool SIPeepholeSDWA::convertToSDWA(MachineInstr &MI,
// Create SDWA version of instruction MI and initialize its operands
MachineInstrBuilder SDWAInst =
- BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc);
+ BuildMI(*MI.getParent(), MI, MI.getDebugLoc(), SDWADesc)
+ .setMIFlags(MI.getFlags());
// Copy dst, if it is present in original then should also be present in SDWA
MachineOperand *Dst = TII->getNamedOperand(MI, AMDGPU::OpName::vdst);
diff --git a/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
new file mode 100644
index 000000000000..4c72fa235975
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPostRABundler.cpp
@@ -0,0 +1,139 @@
+//===-- SIPostRABundler.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass creates bundles of memory instructions to protect adjacent loads
+/// and stores from being rescheduled apart from each other post-RA.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "SIDefines.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBundle.h"
+#include "llvm/InitializePasses.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-post-ra-bundler"
+
+namespace {
+
+class SIPostRABundler : public MachineFunctionPass {
+public:
+ static char ID;
+
+public:
+ SIPostRABundler() : MachineFunctionPass(ID) {
+ initializeSIPostRABundlerPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "SI post-RA bundler";
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesAll();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+private:
+ const SIRegisterInfo *TRI;
+
+ SmallSet<Register, 16> Defs;
+
+ bool isDependentLoad(const MachineInstr &MI) const;
+
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPostRABundler, DEBUG_TYPE, "SI post-RA bundler", false, false)
+
+char SIPostRABundler::ID = 0;
+
+char &llvm::SIPostRABundlerID = SIPostRABundler::ID;
+
+FunctionPass *llvm::createSIPostRABundlerPass() {
+ return new SIPostRABundler();
+}
+
+bool SIPostRABundler::isDependentLoad(const MachineInstr &MI) const {
+ if (!MI.mayLoad())
+ return false;
+
+ for (const MachineOperand &Op : MI.explicit_operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ for (Register Def : Defs)
+ if (TRI->regsOverlap(Reg, Def))
+ return true;
+ }
+
+ return false;
+}
+
+bool SIPostRABundler::runOnMachineFunction(MachineFunction &MF) {
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ TRI = MF.getSubtarget<GCNSubtarget>().getRegisterInfo();
+ bool Changed = false;
+ const uint64_t MemFlags = SIInstrFlags::MTBUF | SIInstrFlags::MUBUF |
+ SIInstrFlags::SMRD | SIInstrFlags::DS |
+ SIInstrFlags::FLAT | SIInstrFlags::MIMG;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::instr_iterator Next;
+ MachineBasicBlock::instr_iterator B = MBB.instr_begin();
+ MachineBasicBlock::instr_iterator E = MBB.instr_end();
+ for (auto I = B; I != E; I = Next) {
+ Next = std::next(I);
+
+ const uint64_t IMemFlags = I->getDesc().TSFlags & MemFlags;
+
+ if (IMemFlags == 0 || I->isBundled() || !I->mayLoadOrStore() ||
+ B->mayLoad() != I->mayLoad() || B->mayStore() != I->mayStore() ||
+ ((B->getDesc().TSFlags & MemFlags) != IMemFlags) ||
+ isDependentLoad(*I)) {
+
+ if (B != I) {
+ if (std::next(B) != I) {
+ finalizeBundle(MBB, B, I);
+ Changed = true;
+ }
+ Next = I;
+ }
+
+ B = Next;
+ Defs.clear();
+ continue;
+ }
+
+ if (I->getNumExplicitDefs() == 0)
+ continue;
+
+ Defs.insert(I->defs().begin()->getReg());
+ }
+
+ if (B != E && std::next(B) != E) {
+ finalizeBundle(MBB, B, E);
+ Changed = true;
+ }
+
+ Defs.clear();
+ }
+
+ return Changed;
+}
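
The bundling loop above is written in terms of when the current bundle has to be closed. Restated as a positive predicate over a small made-up MemInstr model (the real pass queries MachineInstr and the SIInstrFlags bits directly), the join criterion looks like the sketch below; the dependent-load check (isDependentLoad) is left out because it needs the running Defs set:

// Stand-alone restatement of the bundling criterion from the loop above. The
// MemInstr struct and all names here are invented for this sketch.
#include <cassert>
#include <cstdint>

struct MemInstr {
  uint64_t TSFlags; // instruction flag bits (stands in for SIInstrFlags)
  bool IsBundled;   // already part of a bundle
  bool MayLoad;
  bool MayStore;
};

static bool mayJoinBundle(const MemInstr &Head, const MemInstr &Cand,
                          uint64_t MemFlags) {
  const uint64_t HeadClass = Head.TSFlags & MemFlags;
  const uint64_t CandClass = Cand.TSFlags & MemFlags;
  return CandClass != 0 &&                  // one of the tracked memory classes
         !Cand.IsBundled &&                 // not already inside a bundle
         (Cand.MayLoad || Cand.MayStore) && // actually touches memory
         CandClass == HeadClass &&          // same memory class as the head
         Cand.MayLoad == Head.MayLoad &&    // don't mix loads with stores
         Cand.MayStore == Head.MayStore;
}

int main() {
  const uint64_t DS = 1u << 3, FLAT = 1u << 4; // made-up flag values
  MemInstr A{DS, false, true, false}, B{DS, false, true, false};
  MemInstr C{FLAT, false, true, false};
  assert(mayJoinBundle(A, B, DS | FLAT));  // same class, both loads: bundle
  assert(!mayJoinBundle(A, C, DS | FLAT)); // different memory class: new bundle
  return 0;
}
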
diff --git a/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
new file mode 100644
index 000000000000..f31c722db1b2
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/SIPreEmitPeephole.cpp
@@ -0,0 +1,326 @@
+//===-- SIPreEmitPeephole.cpp ------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// This pass performs the peephole optimizations before code emission.
+///
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "AMDGPUSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "SIMachineFunctionInfo.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/Support/CommandLine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "si-pre-emit-peephole"
+
+namespace {
+
+class SIPreEmitPeephole : public MachineFunctionPass {
+private:
+ const SIInstrInfo *TII = nullptr;
+ const SIRegisterInfo *TRI = nullptr;
+
+ bool optimizeVccBranch(MachineInstr &MI) const;
+ bool optimizeSetGPR(MachineInstr &First, MachineInstr &MI) const;
+
+public:
+ static char ID;
+
+ SIPreEmitPeephole() : MachineFunctionPass(ID) {
+ initializeSIPreEmitPeepholePass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+
+} // End anonymous namespace.
+
+INITIALIZE_PASS(SIPreEmitPeephole, DEBUG_TYPE,
+ "SI peephole optimizations", false, false)
+
+char SIPreEmitPeephole::ID = 0;
+
+char &llvm::SIPreEmitPeepholeID = SIPreEmitPeephole::ID;
+
+bool SIPreEmitPeephole::optimizeVccBranch(MachineInstr &MI) const {
+ // Match:
+ // sreg = -1 or 0
+ // vcc = S_AND_B64 exec, sreg or S_ANDN2_B64 exec, sreg
+ // S_CBRANCH_VCC[N]Z
+ // =>
+ // S_CBRANCH_EXEC[N]Z
+ // We end up with this pattern sometimes after basic block placement.
+ // It happens when a block which assigns -1 or 0 to a saved mask is combined
+ // with another block which consumes that saved mask and then branches.
+ bool Changed = false;
+ MachineBasicBlock &MBB = *MI.getParent();
+ const GCNSubtarget &ST = MBB.getParent()->getSubtarget<GCNSubtarget>();
+ const bool IsWave32 = ST.isWave32();
+ const unsigned CondReg = TRI->getVCC();
+ const unsigned ExecReg = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ const unsigned And = IsWave32 ? AMDGPU::S_AND_B32 : AMDGPU::S_AND_B64;
+ const unsigned AndN2 = IsWave32 ? AMDGPU::S_ANDN2_B32 : AMDGPU::S_ANDN2_B64;
+
+ MachineBasicBlock::reverse_iterator A = MI.getReverseIterator(),
+ E = MBB.rend();
+ bool ReadsCond = false;
+ unsigned Threshold = 5;
+ for (++A; A != E; ++A) {
+ if (!--Threshold)
+ return false;
+ if (A->modifiesRegister(ExecReg, TRI))
+ return false;
+ if (A->modifiesRegister(CondReg, TRI)) {
+ if (!A->definesRegister(CondReg, TRI) ||
+ (A->getOpcode() != And && A->getOpcode() != AndN2))
+ return false;
+ break;
+ }
+ ReadsCond |= A->readsRegister(CondReg, TRI);
+ }
+ if (A == E)
+ return false;
+
+ MachineOperand &Op1 = A->getOperand(1);
+ MachineOperand &Op2 = A->getOperand(2);
+ if (Op1.getReg() != ExecReg && Op2.isReg() && Op2.getReg() == ExecReg) {
+ TII->commuteInstruction(*A);
+ Changed = true;
+ }
+ if (Op1.getReg() != ExecReg)
+ return Changed;
+ if (Op2.isImm() && !(Op2.getImm() == -1 || Op2.getImm() == 0))
+ return Changed;
+
+ int64_t MaskValue = 0;
+ Register SReg;
+ if (Op2.isReg()) {
+ SReg = Op2.getReg();
+ auto M = std::next(A);
+ bool ReadsSreg = false;
+ for (; M != E; ++M) {
+ if (M->definesRegister(SReg, TRI))
+ break;
+ if (M->modifiesRegister(SReg, TRI))
+ return Changed;
+ ReadsSreg |= M->readsRegister(SReg, TRI);
+ }
+ if (M == E || !M->isMoveImmediate() || !M->getOperand(1).isImm() ||
+ (M->getOperand(1).getImm() != -1 && M->getOperand(1).getImm() != 0))
+ return Changed;
+ MaskValue = M->getOperand(1).getImm();
+ // First, if sreg is only used in the AND instruction, fold the immediate
+ // into the AND.
+ if (!ReadsSreg && Op2.isKill()) {
+ A->getOperand(2).ChangeToImmediate(MaskValue);
+ M->eraseFromParent();
+ }
+ } else if (Op2.isImm()) {
+ MaskValue = Op2.getImm();
+ } else {
+ llvm_unreachable("Op2 must be register or immediate");
+ }
+
+ // Invert mask for s_andn2
+ assert(MaskValue == 0 || MaskValue == -1);
+ if (A->getOpcode() == AndN2)
+ MaskValue = ~MaskValue;
+
+ if (!ReadsCond && A->registerDefIsDead(AMDGPU::SCC) &&
+ MI.killsRegister(CondReg, TRI))
+ A->eraseFromParent();
+
+ bool IsVCCZ = MI.getOpcode() == AMDGPU::S_CBRANCH_VCCZ;
+ if (SReg == ExecReg) {
+ // EXEC is updated directly
+ if (IsVCCZ) {
+ MI.eraseFromParent();
+ return true;
+ }
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (IsVCCZ && MaskValue == 0) {
+ // Will always branch
+ // Remove all successors shadowed by the new unconditional branch
+ MachineBasicBlock *Parent = MI.getParent();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool Found = false;
+ for (MachineInstr &Term : Parent->terminators()) {
+ if (Found) {
+ if (Term.isBranch())
+ ToRemove.push_back(&Term);
+ } else {
+ Found = Term.isIdenticalTo(MI);
+ }
+ }
+ assert(Found && "conditional branch is not terminator");
+ for (auto BranchMI : ToRemove) {
+ MachineOperand &Dst = BranchMI->getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ Parent->removeSuccessor(Dst.getMBB());
+ BranchMI->eraseFromParent();
+ }
+
+ if (MachineBasicBlock *Succ = Parent->getFallThrough()) {
+ Parent->removeSuccessor(Succ);
+ }
+
+ // Rewrite to unconditional branch
+ MI.setDesc(TII->get(AMDGPU::S_BRANCH));
+ } else if (!IsVCCZ && MaskValue == 0) {
+ // Will never branch
+ MachineOperand &Dst = MI.getOperand(0);
+ assert(Dst.isMBB() && "destination is not basic block");
+ MI.getParent()->removeSuccessor(Dst.getMBB());
+ MI.eraseFromParent();
+ return true;
+ } else if (MaskValue == -1) {
+ // Depends only on EXEC
+ MI.setDesc(
+ TII->get(IsVCCZ ? AMDGPU::S_CBRANCH_EXECZ : AMDGPU::S_CBRANCH_EXECNZ));
+ }
+
+ MI.RemoveOperand(MI.findRegisterUseOperandIdx(CondReg, false /*Kill*/, TRI));
+ MI.addImplicitDefUseOperands(*MBB.getParent());
+
+ return true;
+}
+
+bool SIPreEmitPeephole::optimizeSetGPR(MachineInstr &First,
+ MachineInstr &MI) const {
+ MachineBasicBlock &MBB = *MI.getParent();
+ const MachineFunction &MF = *MBB.getParent();
+ const MachineRegisterInfo &MRI = MF.getRegInfo();
+ MachineOperand *Idx = TII->getNamedOperand(MI, AMDGPU::OpName::src0);
+ Register IdxReg = Idx->isReg() ? Idx->getReg() : Register();
+ SmallVector<MachineInstr *, 4> ToRemove;
+ bool IdxOn = true;
+
+ if (!MI.isIdenticalTo(First))
+ return false;
+
+ // Scan back to find an identical S_SET_GPR_IDX_ON
+ for (MachineBasicBlock::iterator I = std::next(First.getIterator()),
+ E = MI.getIterator(); I != E; ++I) {
+ switch (I->getOpcode()) {
+ case AMDGPU::S_SET_GPR_IDX_MODE:
+ return false;
+ case AMDGPU::S_SET_GPR_IDX_OFF:
+ IdxOn = false;
+ ToRemove.push_back(&*I);
+ break;
+ default:
+ if (I->modifiesRegister(AMDGPU::M0, TRI))
+ return false;
+ if (IdxReg && I->modifiesRegister(IdxReg, TRI))
+ return false;
+ if (llvm::any_of(I->operands(),
+ [&MRI, this](const MachineOperand &MO) {
+ return MO.isReg() &&
+ TRI->isVectorRegister(MRI, MO.getReg());
+ })) {
+ // The only exception allowed here is another indirect vector move
+ // with the same mode.
+ if (!IdxOn ||
+ !((I->getOpcode() == AMDGPU::V_MOV_B32_e32 &&
+ I->hasRegisterImplicitUseOperand(AMDGPU::M0)) ||
+ I->getOpcode() == AMDGPU::V_MOV_B32_indirect))
+ return false;
+ }
+ }
+ }
+
+ MI.eraseFromParent();
+ for (MachineInstr *RI : ToRemove)
+ RI->eraseFromParent();
+ return true;
+}
+
+bool SIPreEmitPeephole::runOnMachineFunction(MachineFunction &MF) {
+ const GCNSubtarget &ST = MF.getSubtarget<GCNSubtarget>();
+ TII = ST.getInstrInfo();
+ TRI = &TII->getRegisterInfo();
+ MachineBasicBlock *EmptyMBBAtEnd = nullptr;
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ MachineBasicBlock::iterator MBBE = MBB.getFirstTerminator();
+ if (MBBE != MBB.end()) {
+ MachineInstr &MI = *MBBE;
+ switch (MI.getOpcode()) {
+ case AMDGPU::S_CBRANCH_VCCZ:
+ case AMDGPU::S_CBRANCH_VCCNZ:
+ Changed |= optimizeVccBranch(MI);
+ continue;
+ case AMDGPU::SI_RETURN_TO_EPILOG:
+ // FIXME: This is not an optimization and should be
+ // moved somewhere else.
+ assert(!MF.getInfo<SIMachineFunctionInfo>()->returnsVoid());
+
+ // Graphics shaders returning non-void shouldn't contain S_ENDPGM,
+ // because external bytecode will be appended at the end.
+ if (&MBB != &MF.back() || &MI != &MBB.back()) {
+ // SI_RETURN_TO_EPILOG is not the last instruction. Add an empty block
+ // at the end and jump there.
+ if (!EmptyMBBAtEnd) {
+ EmptyMBBAtEnd = MF.CreateMachineBasicBlock();
+ MF.insert(MF.end(), EmptyMBBAtEnd);
+ }
+
+ MBB.addSuccessor(EmptyMBBAtEnd);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(AMDGPU::S_BRANCH))
+ .addMBB(EmptyMBBAtEnd);
+ MI.eraseFromParent();
+ MBBE = MBB.getFirstTerminator();
+ }
+ break;
+ default:
+ break;
+ }
+ }
+
+ if (!ST.hasVGPRIndexMode())
+ continue;
+
+ MachineInstr *SetGPRMI = nullptr;
+ const unsigned Threshold = 20;
+ unsigned Count = 0;
+ // Scan the block for pairs of S_SET_GPR_IDX_ON instructions to see if the
+ // second one is redundant. The expensive checks are done in optimizeSetGPR()
+ // and the scan distance is limited to 20 instructions for compile-time
+ // reasons.
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(); MBBI != MBBE; ) {
+ MachineInstr &MI = *MBBI;
+ ++MBBI;
+
+ if (Count == Threshold)
+ SetGPRMI = nullptr;
+ else
+ ++Count;
+
+ if (MI.getOpcode() != AMDGPU::S_SET_GPR_IDX_ON)
+ continue;
+
+ Count = 0;
+ if (!SetGPRMI) {
+ SetGPRMI = &MI;
+ continue;
+ }
+
+ if (optimizeSetGPR(*SetGPRMI, MI))
+ Changed = true;
+ else
+ SetGPRMI = &MI;
+ }
+ }
+
+ return Changed;
+}
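
Once optimizeVccBranch above has normalised MaskValue (the S_ANDN2 case inverts it first), the rewrite it applies to the branch reduces to a small decision table. The sketch below restates that table with invented names (Rewrite, classifyBranch); the real pass rewrites the S_CBRANCH instruction in place rather than returning a tag, and in the always-taken VCCZ case it also prunes the successors shadowed by the new unconditional branch:

// Sketch of the branch rewrite chosen by optimizeVccBranch once it knows what
// vcc = exec & MaskValue evaluates to. Names are invented for this sketch.
#include <cassert>
#include <cstdint>

enum class Rewrite { Erase, Unconditional, ExecZ, ExecNZ, Keep };

static Rewrite classifyBranch(bool IsVCCZ, bool MaskIsExec, int64_t MaskValue) {
  if (MaskIsExec)      // vcc = exec & exec: drop VCCZ, make VCCNZ unconditional
    return IsVCCZ ? Rewrite::Erase : Rewrite::Unconditional;
  if (MaskValue == 0)  // vcc = exec & 0 == 0: VCCZ always taken, VCCNZ never
    return IsVCCZ ? Rewrite::Unconditional : Rewrite::Erase;
  if (MaskValue == -1) // vcc = exec & -1 == exec: depends only on exec
    return IsVCCZ ? Rewrite::ExecZ : Rewrite::ExecNZ;
  return Rewrite::Keep;
}

int main() {
  assert(classifyBranch(true, false, 0) == Rewrite::Unconditional);
  assert(classifyBranch(false, false, -1) == Rewrite::ExecNZ);
  return 0;
}
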
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index fbadad3c84ad..5d6009ebf384 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -26,27 +26,12 @@
#include "llvm/CodeGen/SlotIndexes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/LLVMContext.h"
+#include <vector>
using namespace llvm;
-static bool hasPressureSet(const int *PSets, unsigned PSetID) {
- for (unsigned i = 0; PSets[i] != -1; ++i) {
- if (PSets[i] == (int)PSetID)
- return true;
- }
- return false;
-}
-
-void SIRegisterInfo::classifyPressureSet(unsigned PSetID, unsigned Reg,
- BitVector &PressureSets) const {
- for (MCRegUnitIterator U(Reg, this); U.isValid(); ++U) {
- const int *PSets = getRegUnitPressureSets(*U);
- if (hasPressureSet(PSets, PSetID)) {
- PressureSets.set(PSetID);
- break;
- }
- }
-}
+#define GET_REGINFO_TARGET_DESC
+#include "AMDGPUGenRegisterInfo.inc"
static cl::opt<bool> EnableSpillSGPRToVGPR(
"amdgpu-spill-sgpr-to-vgpr",
@@ -54,90 +39,200 @@ static cl::opt<bool> EnableSpillSGPRToVGPR(
cl::ReallyHidden,
cl::init(true));
-SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST) :
- AMDGPURegisterInfo(),
- ST(ST),
- SGPRPressureSets(getNumRegPressureSets()),
- VGPRPressureSets(getNumRegPressureSets()),
- AGPRPressureSets(getNumRegPressureSets()),
- SpillSGPRToVGPR(EnableSpillSGPRToVGPR),
- isWave32(ST.isWave32()) {
- unsigned NumRegPressureSets = getNumRegPressureSets();
-
- SGPRSetID = NumRegPressureSets;
- VGPRSetID = NumRegPressureSets;
- AGPRSetID = NumRegPressureSets;
-
- for (unsigned i = 0; i < NumRegPressureSets; ++i) {
- classifyPressureSet(i, AMDGPU::SGPR0, SGPRPressureSets);
- classifyPressureSet(i, AMDGPU::VGPR0, VGPRPressureSets);
- classifyPressureSet(i, AMDGPU::AGPR0, AGPRPressureSets);
- }
-
- // Determine the number of reg units for each pressure set.
- std::vector<unsigned> PressureSetRegUnits(NumRegPressureSets, 0);
- for (unsigned i = 0, e = getNumRegUnits(); i != e; ++i) {
- const int *PSets = getRegUnitPressureSets(i);
- for (unsigned j = 0; PSets[j] != -1; ++j) {
- ++PressureSetRegUnits[PSets[j]];
+std::array<std::vector<int16_t>, 16> SIRegisterInfo::RegSplitParts;
+
+SIRegisterInfo::SIRegisterInfo(const GCNSubtarget &ST)
+ : AMDGPUGenRegisterInfo(AMDGPU::PC_REG, ST.getAMDGPUDwarfFlavour()), ST(ST),
+ SpillSGPRToVGPR(EnableSpillSGPRToVGPR), isWave32(ST.isWave32()) {
+
+ assert(getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() == 3 &&
+ getSubRegIndexLaneMask(AMDGPU::sub31).getAsInteger() == (3ULL << 62) &&
+ (getSubRegIndexLaneMask(AMDGPU::lo16) |
+ getSubRegIndexLaneMask(AMDGPU::hi16)).getAsInteger() ==
+ getSubRegIndexLaneMask(AMDGPU::sub0).getAsInteger() &&
+ "getNumCoveredRegs() will not work with generated subreg masks!");
+
+ RegPressureIgnoredUnits.resize(getNumRegUnits());
+ RegPressureIgnoredUnits.set(*MCRegUnitIterator(AMDGPU::M0, this));
+ for (auto Reg : AMDGPU::VGPR_HI16RegClass)
+ RegPressureIgnoredUnits.set(*MCRegUnitIterator(Reg, this));
+
+ // HACK: Until this is fully tablegen'd.
+ static llvm::once_flag InitializeRegSplitPartsFlag;
+
+ static auto InitializeRegSplitPartsOnce = [this]() {
+ for (unsigned Idx = 1, E = getNumSubRegIndices() - 1; Idx < E; ++Idx) {
+ unsigned Size = getSubRegIdxSize(Idx);
+ if (Size & 31)
+ continue;
+ std::vector<int16_t> &Vec = RegSplitParts[Size / 32 - 1];
+ unsigned Pos = getSubRegIdxOffset(Idx);
+ if (Pos % Size)
+ continue;
+ Pos /= Size;
+ if (Vec.empty()) {
+ unsigned MaxNumParts = 1024 / Size; // Maximum register is 1024 bits.
+ Vec.resize(MaxNumParts);
+ }
+ Vec[Pos] = Idx;
}
+ };
+
+ llvm::call_once(InitializeRegSplitPartsFlag, InitializeRegSplitPartsOnce);
+}
+
+void SIRegisterInfo::reserveRegisterTuples(BitVector &Reserved,
+ MCRegister Reg) const {
+ MCRegAliasIterator R(Reg, this, true);
+
+ for (; R.isValid(); ++R)
+ Reserved.set(*R);
+}
+
+// Forced to be here by one .inc
+const MCPhysReg *SIRegisterInfo::getCalleeSavedRegs(
+ const MachineFunction *MF) const {
+ CallingConv::ID CC = MF->getFunction().getCallingConv();
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ return CSR_AMDGPU_HighRegs_SaveList;
+ default: {
+ // Dummy to not crash RegisterClassInfo.
+ static const MCPhysReg NoCalleeSavedReg = AMDGPU::NoRegister;
+ return &NoCalleeSavedReg;
}
+ }
+}
- unsigned VGPRMax = 0, SGPRMax = 0, AGPRMax = 0;
- for (unsigned i = 0; i < NumRegPressureSets; ++i) {
- if (isVGPRPressureSet(i) && PressureSetRegUnits[i] > VGPRMax) {
- VGPRSetID = i;
- VGPRMax = PressureSetRegUnits[i];
- continue;
- }
- if (isSGPRPressureSet(i) && PressureSetRegUnits[i] > SGPRMax) {
- SGPRSetID = i;
- SGPRMax = PressureSetRegUnits[i];
- }
- if (isAGPRPressureSet(i) && PressureSetRegUnits[i] > AGPRMax) {
- AGPRSetID = i;
- AGPRMax = PressureSetRegUnits[i];
- continue;
- }
+const MCPhysReg *
+SIRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
+ return nullptr;
+}
+
+const uint32_t *SIRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
+ CallingConv::ID CC) const {
+ switch (CC) {
+ case CallingConv::C:
+ case CallingConv::Fast:
+ case CallingConv::Cold:
+ return CSR_AMDGPU_HighRegs_RegMask;
+ default:
+ return nullptr;
}
+}
- assert(SGPRSetID < NumRegPressureSets &&
- VGPRSetID < NumRegPressureSets &&
- AGPRSetID < NumRegPressureSets);
+Register SIRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ const SIFrameLowering *TFI =
+ MF.getSubtarget<GCNSubtarget>().getFrameLowering();
+ const SIMachineFunctionInfo *FuncInfo = MF.getInfo<SIMachineFunctionInfo>();
+ // During ISel lowering we always reserve the stack pointer in entry
+ // functions, but never actually want to reference it when accessing our own
+ // frame. If we need a frame pointer we use it, but otherwise we can just use
+ // an immediate "0" which we represent by returning NoRegister.
+ if (FuncInfo->isEntryFunction()) {
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg() : Register();
+ }
+ return TFI->hasFP(MF) ? FuncInfo->getFrameOffsetReg()
+ : FuncInfo->getStackPtrOffsetReg();
}
-unsigned SIRegisterInfo::reservedPrivateSegmentBufferReg(
- const MachineFunction &MF) const {
- unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
- unsigned BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
- return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
+bool SIRegisterInfo::hasBasePointer(const MachineFunction &MF) const {
+ // When we need stack realignment, we can't reference off of the
+ // stack pointer, so we reserve a base pointer.
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ return MFI.getNumFixedObjects() && needsStackRealignment(MF);
}
-static unsigned findPrivateSegmentWaveByteOffsetRegIndex(unsigned RegCount) {
- unsigned Reg;
+Register SIRegisterInfo::getBaseRegister() const { return AMDGPU::SGPR34; }
- // Try to place it in a hole after PrivateSegmentBufferReg.
- if (RegCount & 3) {
- // We cannot put the segment buffer in (Idx - 4) ... (Idx - 1) due to
- // alignment constraints, so we have a hole where can put the wave offset.
- Reg = RegCount - 1;
- } else {
- // We can put the segment buffer in (Idx - 4) ... (Idx - 1) and put the
- // wave offset before it.
- Reg = RegCount - 5;
- }
+const uint32_t *SIRegisterInfo::getAllVGPRRegMask() const {
+ return CSR_AMDGPU_AllVGPRs_RegMask;
+}
- return Reg;
+const uint32_t *SIRegisterInfo::getAllAllocatableSRegMask() const {
+ return CSR_AMDGPU_AllAllocatableSRegs_RegMask;
}
-unsigned SIRegisterInfo::reservedPrivateSegmentWaveByteOffsetReg(
+// FIXME: TableGen should generate something to make this manageable for all
+// register classes. At a minimum we could use the opposite of
+// composeSubRegIndices and go up from the base 32-bit subreg.
+unsigned SIRegisterInfo::getSubRegFromChannel(unsigned Channel,
+ unsigned NumRegs) {
+ // Table of NumRegs sized pieces at every 32-bit offset.
+ static const uint16_t SubRegFromChannelTable[][32] = {
+ {AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
+ AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
+ AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
+ AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
+ AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
+ AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
+ AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
+ AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31},
+ {AMDGPU::sub0_sub1, AMDGPU::sub1_sub2, AMDGPU::sub2_sub3,
+ AMDGPU::sub3_sub4, AMDGPU::sub4_sub5, AMDGPU::sub5_sub6,
+ AMDGPU::sub6_sub7, AMDGPU::sub7_sub8, AMDGPU::sub8_sub9,
+ AMDGPU::sub9_sub10, AMDGPU::sub10_sub11, AMDGPU::sub11_sub12,
+ AMDGPU::sub12_sub13, AMDGPU::sub13_sub14, AMDGPU::sub14_sub15,
+ AMDGPU::sub15_sub16, AMDGPU::sub16_sub17, AMDGPU::sub17_sub18,
+ AMDGPU::sub18_sub19, AMDGPU::sub19_sub20, AMDGPU::sub20_sub21,
+ AMDGPU::sub21_sub22, AMDGPU::sub22_sub23, AMDGPU::sub23_sub24,
+ AMDGPU::sub24_sub25, AMDGPU::sub25_sub26, AMDGPU::sub26_sub27,
+ AMDGPU::sub27_sub28, AMDGPU::sub28_sub29, AMDGPU::sub29_sub30,
+ AMDGPU::sub30_sub31, AMDGPU::NoSubRegister},
+ {AMDGPU::sub0_sub1_sub2, AMDGPU::sub1_sub2_sub3,
+ AMDGPU::sub2_sub3_sub4, AMDGPU::sub3_sub4_sub5,
+ AMDGPU::sub4_sub5_sub6, AMDGPU::sub5_sub6_sub7,
+ AMDGPU::sub6_sub7_sub8, AMDGPU::sub7_sub8_sub9,
+ AMDGPU::sub8_sub9_sub10, AMDGPU::sub9_sub10_sub11,
+ AMDGPU::sub10_sub11_sub12, AMDGPU::sub11_sub12_sub13,
+ AMDGPU::sub12_sub13_sub14, AMDGPU::sub13_sub14_sub15,
+ AMDGPU::sub14_sub15_sub16, AMDGPU::sub15_sub16_sub17,
+ AMDGPU::sub16_sub17_sub18, AMDGPU::sub17_sub18_sub19,
+ AMDGPU::sub18_sub19_sub20, AMDGPU::sub19_sub20_sub21,
+ AMDGPU::sub20_sub21_sub22, AMDGPU::sub21_sub22_sub23,
+ AMDGPU::sub22_sub23_sub24, AMDGPU::sub23_sub24_sub25,
+ AMDGPU::sub24_sub25_sub26, AMDGPU::sub25_sub26_sub27,
+ AMDGPU::sub26_sub27_sub28, AMDGPU::sub27_sub28_sub29,
+ AMDGPU::sub28_sub29_sub30, AMDGPU::sub29_sub30_sub31,
+ AMDGPU::NoSubRegister, AMDGPU::NoSubRegister},
+ {AMDGPU::sub0_sub1_sub2_sub3, AMDGPU::sub1_sub2_sub3_sub4,
+ AMDGPU::sub2_sub3_sub4_sub5, AMDGPU::sub3_sub4_sub5_sub6,
+ AMDGPU::sub4_sub5_sub6_sub7, AMDGPU::sub5_sub6_sub7_sub8,
+ AMDGPU::sub6_sub7_sub8_sub9, AMDGPU::sub7_sub8_sub9_sub10,
+ AMDGPU::sub8_sub9_sub10_sub11, AMDGPU::sub9_sub10_sub11_sub12,
+ AMDGPU::sub10_sub11_sub12_sub13, AMDGPU::sub11_sub12_sub13_sub14,
+ AMDGPU::sub12_sub13_sub14_sub15, AMDGPU::sub13_sub14_sub15_sub16,
+ AMDGPU::sub14_sub15_sub16_sub17, AMDGPU::sub15_sub16_sub17_sub18,
+ AMDGPU::sub16_sub17_sub18_sub19, AMDGPU::sub17_sub18_sub19_sub20,
+ AMDGPU::sub18_sub19_sub20_sub21, AMDGPU::sub19_sub20_sub21_sub22,
+ AMDGPU::sub20_sub21_sub22_sub23, AMDGPU::sub21_sub22_sub23_sub24,
+ AMDGPU::sub22_sub23_sub24_sub25, AMDGPU::sub23_sub24_sub25_sub26,
+ AMDGPU::sub24_sub25_sub26_sub27, AMDGPU::sub25_sub26_sub27_sub28,
+ AMDGPU::sub26_sub27_sub28_sub29, AMDGPU::sub27_sub28_sub29_sub30,
+ AMDGPU::sub28_sub29_sub30_sub31, AMDGPU::NoSubRegister,
+ AMDGPU::NoSubRegister, AMDGPU::NoSubRegister}};
+
+ const unsigned NumRegIndex = NumRegs - 1;
+
+ assert(NumRegIndex < array_lengthof(SubRegFromChannelTable) &&
+ "Not implemented");
+ assert(Channel < array_lengthof(SubRegFromChannelTable[0]));
+ return SubRegFromChannelTable[NumRegIndex][Channel];
+}
+
+MCRegister SIRegisterInfo::reservedPrivateSegmentBufferReg(
const MachineFunction &MF) const {
- unsigned Reg = findPrivateSegmentWaveByteOffsetRegIndex(ST.getMaxNumSGPRs(MF));
- return AMDGPU::SGPR_32RegClass.getRegister(Reg);
+ unsigned BaseIdx = alignDown(ST.getMaxNumSGPRs(MF), 4) - 4;
+ MCRegister BaseReg(AMDGPU::SGPR_32RegClass.getRegister(BaseIdx));
+ return getMatchingSuperReg(BaseReg, AMDGPU::sub0, &AMDGPU::SGPR_128RegClass);
}
BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
+ Reserved.set(AMDGPU::MODE);
// EXEC_LO and EXEC_HI could be allocated and used as regular register, but
// this seems likely to result in bugs, so I'm marking them as reserved.
@@ -205,6 +300,18 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
reserveRegisterTuples(Reserved, Reg);
}
+ for (auto Reg : AMDGPU::SReg_32RegClass) {
+ Reserved.set(getSubReg(Reg, AMDGPU::hi16));
+ Register Low = getSubReg(Reg, AMDGPU::lo16);
+ // This is to prevent BB vcc liveness errors.
+ if (!AMDGPU::SGPR_LO16RegClass.contains(Low))
+ Reserved.set(Low);
+ }
+
+ for (auto Reg : AMDGPU::AGPR_32RegClass) {
+ Reserved.set(getSubReg(Reg, AMDGPU::hi16));
+ }
+
// Reserve all the rest AGPRs if there are no instructions to use it.
if (!ST.hasMAIInsts()) {
for (unsigned i = 0; i < MaxNumVGPRs; ++i) {
@@ -215,38 +322,37 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
const SIMachineFunctionInfo *MFI = MF.getInfo<SIMachineFunctionInfo>();
- unsigned ScratchWaveOffsetReg = MFI->getScratchWaveOffsetReg();
- if (ScratchWaveOffsetReg != AMDGPU::NoRegister) {
- // Reserve 1 SGPR for scratch wave offset in case we need to spill.
- reserveRegisterTuples(Reserved, ScratchWaveOffsetReg);
- }
-
unsigned ScratchRSrcReg = MFI->getScratchRSrcReg();
if (ScratchRSrcReg != AMDGPU::NoRegister) {
// Reserve 4 SGPRs for the scratch buffer resource descriptor in case we need
// to spill.
// TODO: May need to reserve a VGPR if doing LDS spilling.
reserveRegisterTuples(Reserved, ScratchRSrcReg);
- assert(!isSubRegister(ScratchRSrcReg, ScratchWaveOffsetReg));
}
// We have to assume the SP is needed in case there are calls in the function,
// which is detected after the function is lowered. If we aren't really going
// to need SP, don't bother reserving it.
- unsigned StackPtrReg = MFI->getStackPtrOffsetReg();
+ MCRegister StackPtrReg = MFI->getStackPtrOffsetReg();
- if (StackPtrReg != AMDGPU::NoRegister) {
+ if (StackPtrReg) {
reserveRegisterTuples(Reserved, StackPtrReg);
assert(!isSubRegister(ScratchRSrcReg, StackPtrReg));
}
- unsigned FrameReg = MFI->getFrameOffsetReg();
- if (FrameReg != AMDGPU::NoRegister) {
+ MCRegister FrameReg = MFI->getFrameOffsetReg();
+ if (FrameReg) {
reserveRegisterTuples(Reserved, FrameReg);
assert(!isSubRegister(ScratchRSrcReg, FrameReg));
}
- for (unsigned Reg : MFI->WWMReservedRegs) {
+ if (hasBasePointer(MF)) {
+ MCRegister BasePtrReg = getBaseRegister();
+ reserveRegisterTuples(Reserved, BasePtrReg);
+ assert(!isSubRegister(ScratchRSrcReg, BasePtrReg));
+ }
+
+ for (MCRegister Reg : MFI->WWMReservedRegs) {
reserveRegisterTuples(Reserved, Reg);
}
@@ -257,6 +363,10 @@ BitVector SIRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs())
reserveRegisterTuples(Reserved, Reg);
+ if (MFI->VGPRReservedForSGPRSpill)
+ for (auto SSpill : MFI->getSGPRSpillVGPRs())
+ reserveRegisterTuples(Reserved, SSpill.VGPR);
+
return Reserved;
}
@@ -305,11 +415,6 @@ bool SIRegisterInfo::requiresVirtualBaseRegisters(
return true;
}
-bool SIRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // This helps catch bugs as verifier errors.
- return true;
-}
-
int64_t SIRegisterInfo::getMUBUFInstrOffset(const MachineInstr *MI) const {
assert(SIInstrInfo::isMUBUF(*MI));
@@ -340,7 +445,7 @@ bool SIRegisterInfo::needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
}
void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg,
+ Register BaseReg,
int FrameIdx,
int64_t Offset) const {
MachineBasicBlock::iterator Ins = MBB->begin();
@@ -374,7 +479,7 @@ void SIRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addImm(0); // clamp bit
}
-void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
const SIInstrInfo *TII = ST.getInstrInfo();
@@ -411,7 +516,7 @@ void SIRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
}
bool SIRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
- unsigned BaseReg,
+ Register BaseReg,
int64_t Offset) const {
if (!SIInstrInfo::isMUBUF(*MI))
return false;
@@ -451,6 +556,11 @@ static unsigned getNumSubRegsForSpillOp(unsigned Op) {
case AMDGPU::SI_SPILL_V256_SAVE:
case AMDGPU::SI_SPILL_V256_RESTORE:
return 8;
+ case AMDGPU::SI_SPILL_S192_SAVE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
+ case AMDGPU::SI_SPILL_V192_SAVE:
+ case AMDGPU::SI_SPILL_V192_RESTORE:
+ return 6;
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_V160_SAVE:
@@ -614,10 +724,10 @@ static bool buildMUBUFOffsetLoadStore(const GCNSubtarget &ST,
void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
int Index,
- unsigned ValueReg,
+ Register ValueReg,
bool IsKill,
- unsigned ScratchRsrcReg,
- unsigned ScratchOffsetReg,
+ MCRegister ScratchRsrcReg,
+ MCRegister ScratchOffsetReg,
int64_t InstOffset,
MachineMemOperand *MMO,
RegScavenger *RS) const {
@@ -625,13 +735,14 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
MachineFunction *MF = MI->getParent()->getParent();
const SIInstrInfo *TII = ST.getInstrInfo();
const MachineFrameInfo &MFI = MF->getFrameInfo();
+ const SIMachineFunctionInfo *FuncInfo = MF->getInfo<SIMachineFunctionInfo>();
const MCInstrDesc &Desc = TII->get(LoadStoreOp);
const DebugLoc &DL = MI->getDebugLoc();
bool IsStore = Desc.mayStore();
bool Scavenged = false;
- unsigned SOffset = ScratchOffsetReg;
+ MCRegister SOffset = ScratchOffsetReg;
const unsigned EltSize = 4;
const TargetRegisterClass *RC = getRegClassForReg(MF->getRegInfo(), ValueReg);
@@ -640,7 +751,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
int64_t Offset = InstOffset + MFI.getObjectOffset(Index);
int64_t ScratchOffsetRegDelta = 0;
- unsigned Align = MFI.getObjectAlignment(Index);
+ Align Alignment = MFI.getObjectAlign(Index);
const MachinePointerInfo &BasePtrInfo = MMO->getPointerInfo();
Register TmpReg =
@@ -650,7 +761,7 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
assert((Offset % EltSize) == 0 && "unexpected VGPR spill offset");
if (!isUInt<12>(Offset + Size - EltSize)) {
- SOffset = AMDGPU::NoRegister;
+ SOffset = MCRegister();
// We currently only support spilling VGPRs to EltSize boundaries, meaning
// we can simplify the adjustment of Offset here to just scale with
@@ -662,23 +773,33 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (RS)
SOffset = RS->scavengeRegister(&AMDGPU::SGPR_32RegClass, MI, 0, false);
- if (SOffset == AMDGPU::NoRegister) {
+ if (!SOffset) {
// There are no free SGPRs, and we are in the process of spilling
// VGPRs too. Since we need a VGPR in order to spill SGPRs (this is true
// on SI/CI, and on VI it is true until we implement spilling using scalar
// stores), we have no way to free up an SGPR. Our solution here is to
- // add the offset directly to the ScratchOffset register, and then
- // subtract the offset after the spill to return ScratchOffset to it's
- // original value.
+ // add the offset directly to the ScratchOffset or StackPtrOffset
+ // register, and then subtract the offset after the spill to return the
+ // register to its original value.
+ if (!ScratchOffsetReg)
+ ScratchOffsetReg = FuncInfo->getStackPtrOffsetReg();
SOffset = ScratchOffsetReg;
ScratchOffsetRegDelta = Offset;
} else {
Scavenged = true;
}
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
- .addReg(ScratchOffsetReg)
- .addImm(Offset);
+ if (!SOffset)
+ report_fatal_error("could not scavenge SGPR to spill in entry function");
+
+ if (ScratchOffsetReg == AMDGPU::NoRegister) {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_MOV_B32), SOffset)
+ .addImm(Offset);
+ } else {
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), SOffset)
+ .addReg(ScratchOffsetReg)
+ .addImm(Offset);
+ }
Offset = 0;
}
@@ -708,21 +829,26 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
}
MachinePointerInfo PInfo = BasePtrInfo.getWithOffset(EltSize * i);
- MachineMemOperand *NewMMO
- = MF->getMachineMemOperand(PInfo, MMO->getFlags(),
- EltSize, MinAlign(Align, EltSize * i));
+ MachineMemOperand *NewMMO =
+ MF->getMachineMemOperand(PInfo, MMO->getFlags(), EltSize,
+ commonAlignment(Alignment, EltSize * i));
MIB = BuildMI(*MBB, MI, DL, Desc)
- .addReg(SubReg, getDefRegState(!IsStore) | getKillRegState(IsKill))
- .addReg(ScratchRsrcReg)
- .addReg(SOffset, SOffsetRegState)
- .addImm(Offset)
- .addImm(0) // glc
- .addImm(0) // slc
- .addImm(0) // tfe
- .addImm(0) // dlc
- .addImm(0) // swz
- .addMemOperand(NewMMO);
+ .addReg(SubReg,
+ getDefRegState(!IsStore) | getKillRegState(IsKill))
+ .addReg(ScratchRsrcReg);
+ if (SOffset == AMDGPU::NoRegister) {
+ MIB.addImm(0);
+ } else {
+ MIB.addReg(SOffset, SOffsetRegState);
+ }
+ MIB.addImm(Offset)
+ .addImm(0) // glc
+ .addImm(0) // slc
+ .addImm(0) // tfe
+ .addImm(0) // dlc
+ .addImm(0) // swz
+ .addMemOperand(NewMMO);
if (!IsStore && TmpReg != AMDGPU::NoRegister)
MIB = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_ACCVGPR_WRITE_B32),
@@ -736,12 +862,124 @@ void SIRegisterInfo::buildSpillLoadStore(MachineBasicBlock::iterator MI,
if (ScratchOffsetRegDelta != 0) {
// Subtract the offset we added to the ScratchOffset register.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), ScratchOffsetReg)
- .addReg(ScratchOffsetReg)
+ BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), SOffset)
+ .addReg(SOffset)
.addImm(ScratchOffsetRegDelta);
}
}
+// Generate a VMEM access which loads or stores the VGPR containing an SGPR
+// spill such that all the lanes set in VGPRLanes are loaded or stored.
+// This manipulates the exec mask, using either SGPRs available in MI or
+// lanes of the VGPR itself to save and restore the exec mask.
+void SIRegisterInfo::buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI,
+ int Index, int Offset,
+ unsigned EltSize, Register VGPR,
+ int64_t VGPRLanes,
+ RegScavenger *RS,
+ bool IsLoad) const {
+ MachineBasicBlock *MBB = MI->getParent();
+ MachineFunction *MF = MBB->getParent();
+ SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
+ const SIInstrInfo *TII = ST.getInstrInfo();
+
+ Register SuperReg = MI->getOperand(0).getReg();
+ const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
+ ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
+ unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
+ unsigned FirstPart = Offset * 32;
+ unsigned ExecLane = 0;
+
+ bool IsKill = MI->getOperand(0).isKill();
+ const DebugLoc &DL = MI->getDebugLoc();
+
+ // Cannot handle load/store to EXEC
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
+
+ // On Wave32 only handle EXEC_LO.
+ // On Wave64 only update EXEC_HI if there is sufficient space for a copy.
+ bool OnlyExecLo = isWave32 || NumSubRegs == 1 || SuperReg == AMDGPU::EXEC_HI;
+
+ unsigned ExecMovOpc = OnlyExecLo ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64;
+ Register ExecReg = OnlyExecLo ? AMDGPU::EXEC_LO : AMDGPU::EXEC;
+ Register SavedExecReg;
+
+ // Backup EXEC
+ if (OnlyExecLo) {
+ SavedExecReg = NumSubRegs == 1
+ ? SuperReg
+ : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]);
+ } else {
+ // If src/dst is an odd size it is possible subreg0 is not aligned.
+ for (; ExecLane < (NumSubRegs - 1); ++ExecLane) {
+ SavedExecReg = getMatchingSuperReg(
+ getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]), AMDGPU::sub0,
+ &AMDGPU::SReg_64_XEXECRegClass);
+ if (SavedExecReg)
+ break;
+ }
+ }
+ assert(SavedExecReg);
+ BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), SavedExecReg).addReg(ExecReg);
+
+ // Setup EXEC
+ BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg).addImm(VGPRLanes);
+
+ // Load/store VGPR
+ MachineFrameInfo &FrameInfo = MF->getFrameInfo();
+ assert(FrameInfo.getStackID(Index) != TargetStackID::SGPRSpill);
+
+ Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+ ? getBaseRegister()
+ : getFrameRegister(*MF);
+
+ Align Alignment = FrameInfo.getObjectAlign(Index);
+ MachinePointerInfo PtrInfo =
+ MachinePointerInfo::getFixedStack(*MF, Index);
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ PtrInfo, IsLoad ? MachineMemOperand::MOLoad : MachineMemOperand::MOStore,
+ EltSize, Alignment);
+
+ if (IsLoad) {
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_LOAD_DWORD_OFFSET,
+ Index,
+ VGPR, false,
+ MFI->getScratchRSrcReg(), FrameReg,
+ Offset * EltSize, MMO,
+ RS);
+ } else {
+ buildSpillLoadStore(MI, AMDGPU::BUFFER_STORE_DWORD_OFFSET, Index, VGPR,
+ IsKill, MFI->getScratchRSrcReg(), FrameReg,
+ Offset * EltSize, MMO, RS);
+ // This only ever adds one VGPR spill
+ MFI->addToSpilledVGPRs(1);
+ }
+
+ // Restore EXEC
+ BuildMI(*MBB, MI, DL, TII->get(ExecMovOpc), ExecReg)
+ .addReg(SavedExecReg, getKillRegState(IsLoad || IsKill));
+
+ // Restore clobbered SGPRs
+ if (IsLoad) {
+ // Nothing to do; register will be overwritten
+ } else if (!IsKill) {
+ // Restore SGPRs from appropriate VGPR lanes
+ if (!OnlyExecLo) {
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ getSubReg(SuperReg, SplitParts[FirstPart + ExecLane + 1]))
+ .addReg(VGPR)
+ .addImm(ExecLane + 1);
+ }
+ BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
+ NumSubRegs == 1
+ ? SavedExecReg
+ : getSubReg(SuperReg, SplitParts[FirstPart + ExecLane]))
+ .addReg(VGPR, RegState::Kill)
+ .addImm(ExecLane);
+ }
+}
+
bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
int Index,
RegScavenger *RS,
@@ -749,7 +987,7 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
MachineBasicBlock *MBB = MI->getParent();
MachineFunction *MF = MBB->getParent();
SIMachineFunctionInfo *MFI = MF->getInfo<SIMachineFunctionInfo>();
- DenseSet<unsigned> SGPRSpillVGPRDefinedSet;
+ DenseSet<Register> SGPRSpillVGPRDefinedSet;
ArrayRef<SIMachineFunctionInfo::SpilledReg> VGPRSpills
= MFI->getSGPRToVGPRSpills(Index);
@@ -763,13 +1001,12 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
bool IsKill = MI->getOperand(0).isKill();
const DebugLoc &DL = MI->getDebugLoc();
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
-
assert(SpillToVGPR || (SuperReg != MFI->getStackPtrOffsetReg() &&
- SuperReg != MFI->getFrameOffsetReg() &&
- SuperReg != MFI->getScratchWaveOffsetReg()));
+ SuperReg != MFI->getFrameOffsetReg()));
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
unsigned EltSize = 4;
const TargetRegisterClass *RC = getPhysRegClass(SuperReg);
@@ -777,17 +1014,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- // Scavenged temporary VGPR to use. It must be scavenged once for any number
- // of spilled subregs.
- Register TmpVGPR;
-
- // SubReg carries the "Kill" flag when SubReg == SuperReg.
- unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
-
- if (SpillToVGPR) {
+ if (SpillToVGPR) {
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
// During SGPR spilling to VGPR, determine if the VGPR is defined. The
@@ -809,42 +1039,53 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI,
// FIXME: Since this spills to another register instead of an actual
// frame index, we should delete the frame index when all references to
// it are fixed.
- } else {
- // XXX - Can to VGPR spill fail for some subregisters but not others?
- if (OnlyToVGPR)
- return false;
-
- // Spill SGPR to a frame index.
- if (!TmpVGPR.isValid())
- TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
- MachineInstrBuilder Mov
- = BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_MOV_B32_e32), TmpVGPR)
- .addReg(SubReg, SubKillState);
-
- // There could be undef components of a spilled super register.
- // TODO: Can we detect this and skip the spill?
- if (NumSubRegs > 1) {
- // The last implicit use of the SuperReg carries the "Kill" flag.
- unsigned SuperKillState = 0;
- if (i + 1 == e)
- SuperKillState |= getKillRegState(IsKill);
- Mov.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
+ } else {
+ // Scavenged temporary VGPR to use. It must be scavenged once for any number
+ // of spilled subregs.
+ Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ RS->setRegUsed(TmpVGPR);
+
+ // SubReg carries the "Kill" flag when SubReg == SuperReg.
+ unsigned SubKillState = getKillRegState((NumSubRegs == 1) && IsKill);
+
+ unsigned PerVGPR = 32;
+ unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
+ int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
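+    // E.g. a 128-bit tuple has NumSubRegs == 4, so a single VGPR suffices and
+    // VGPRLanes == 0xf; a 1024-bit tuple fills all 32 lanes of one VGPR.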
+
+ for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ unsigned TmpVGPRFlags = RegState::Undef;
+
+ // Write sub registers into the VGPR
+ for (unsigned i = Offset * PerVGPR,
+ e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ MachineInstrBuilder WriteLane =
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_WRITELANE_B32),
+ TmpVGPR)
+ .addReg(SubReg, SubKillState)
+ .addImm(i % PerVGPR)
+ .addReg(TmpVGPR, TmpVGPRFlags);
+ TmpVGPRFlags = 0;
+
+ // There could be undef components of a spilled super register.
+ // TODO: Can we detect this and skip the spill?
+ if (NumSubRegs > 1) {
+ // The last implicit use of the SuperReg carries the "Kill" flag.
+ unsigned SuperKillState = 0;
+ if (i + 1 == NumSubRegs)
+ SuperKillState |= getKillRegState(IsKill);
+ WriteLane.addReg(SuperReg, RegState::Implicit | SuperKillState);
+ }
}
- unsigned Align = FrameInfo.getObjectAlignment(Index);
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
- MachineMemOperand *MMO
- = MF->getMachineMemOperand(PtrInfo, MachineMemOperand::MOStore,
- EltSize, MinAlign(Align, EltSize * i));
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_SAVE))
- .addReg(TmpVGPR, RegState::Kill) // src
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srrsrc
- .addReg(MFI->getStackPtrOffsetReg()) // soffset
- .addImm(i * 4) // offset
- .addMemOperand(MMO);
+ // Write out VGPR
+ buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
+ RS, false);
}
}
@@ -867,13 +1108,14 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
if (OnlyToVGPR && !SpillToVGPR)
return false;
- MachineFrameInfo &FrameInfo = MF->getFrameInfo();
const SIInstrInfo *TII = ST.getInstrInfo();
const DebugLoc &DL = MI->getDebugLoc();
Register SuperReg = MI->getOperand(0).getReg();
assert(SuperReg != AMDGPU::M0 && "m0 should never spill");
+ assert(SuperReg != AMDGPU::EXEC_LO && SuperReg != AMDGPU::EXEC_HI &&
+ SuperReg != AMDGPU::EXEC && "exec should never spill");
unsigned EltSize = 4;
@@ -882,52 +1124,49 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI,
ArrayRef<int16_t> SplitParts = getRegSplitParts(RC, EltSize);
unsigned NumSubRegs = SplitParts.empty() ? 1 : SplitParts.size();
- Register TmpVGPR;
-
- for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
- Register SubReg =
- NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+ if (SpillToVGPR) {
+ for (unsigned i = 0, e = NumSubRegs; i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
- if (SpillToVGPR) {
SIMachineFunctionInfo::SpilledReg Spill = VGPRSpills[i];
auto MIB =
BuildMI(*MBB, MI, DL, TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32),
SubReg)
.addReg(Spill.VGPR)
.addImm(Spill.Lane);
-
if (NumSubRegs > 1 && i == 0)
MIB.addReg(SuperReg, RegState::ImplicitDefine);
- } else {
- if (OnlyToVGPR)
- return false;
-
- // Restore SGPR from a stack slot.
- // FIXME: We should use S_LOAD_DWORD here for VI.
- if (!TmpVGPR.isValid())
- TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
- unsigned Align = FrameInfo.getObjectAlignment(Index);
-
- MachinePointerInfo PtrInfo
- = MachinePointerInfo::getFixedStack(*MF, Index, EltSize * i);
-
- MachineMemOperand *MMO = MF->getMachineMemOperand(PtrInfo,
- MachineMemOperand::MOLoad, EltSize,
- MinAlign(Align, EltSize * i));
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::SI_SPILL_V32_RESTORE), TmpVGPR)
- .addFrameIndex(Index) // vaddr
- .addReg(MFI->getScratchRSrcReg()) // srsrc
- .addReg(MFI->getStackPtrOffsetReg()) // soffset
- .addImm(i * 4) // offset
- .addMemOperand(MMO);
-
- auto MIB =
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_READFIRSTLANE_B32), SubReg)
- .addReg(TmpVGPR, RegState::Kill);
-
- if (NumSubRegs > 1)
- MIB.addReg(MI->getOperand(0).getReg(), RegState::ImplicitDefine);
+ }
+ } else {
+ Register TmpVGPR = RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
+ RS->setRegUsed(TmpVGPR);
+
+ unsigned PerVGPR = 32;
+ unsigned NumVGPRs = (NumSubRegs + (PerVGPR - 1)) / PerVGPR;
+ int64_t VGPRLanes = (1LL << std::min(PerVGPR, NumSubRegs)) - 1LL;
+
+ for (unsigned Offset = 0; Offset < NumVGPRs; ++Offset) {
+ // Load in VGPR data
+ buildSGPRSpillLoadStore(MI, Index, Offset, EltSize, TmpVGPR, VGPRLanes,
+ RS, true);
+
+ // Unpack lanes
+ for (unsigned i = Offset * PerVGPR,
+ e = std::min((Offset + 1) * PerVGPR, NumSubRegs);
+ i < e; ++i) {
+ Register SubReg =
+ NumSubRegs == 1 ? SuperReg : getSubReg(SuperReg, SplitParts[i]);
+
+ bool LastSubReg = (i + 1 == e);
+ auto MIB =
+ BuildMI(*MBB, MI, DL,
+ TII->getMCOpcodeFromPseudo(AMDGPU::V_READLANE_B32), SubReg)
+ .addReg(TmpVGPR, getKillRegState(LastSubReg))
+ .addImm(i);
+ if (NumSubRegs > 1 && i == 0)
+ MIB.addReg(SuperReg, RegState::ImplicitDefine);
+ }
}
}
@@ -946,6 +1185,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S96_SAVE:
@@ -955,6 +1195,7 @@ bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex(
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -981,13 +1222,16 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
MachineOperand &FIOp = MI->getOperand(FIOperandNum);
int Index = MI->getOperand(FIOperandNum).getIndex();
- Register FrameReg = getFrameRegister(*MF);
+ Register FrameReg = FrameInfo.isFixedObjectIndex(Index) && hasBasePointer(*MF)
+ ? getBaseRegister()
+ : getFrameRegister(*MF);
switch (MI->getOpcode()) {
// SGPR register spill
case AMDGPU::SI_SPILL_S1024_SAVE:
case AMDGPU::SI_SPILL_S512_SAVE:
case AMDGPU::SI_SPILL_S256_SAVE:
+ case AMDGPU::SI_SPILL_S192_SAVE:
case AMDGPU::SI_SPILL_S160_SAVE:
case AMDGPU::SI_SPILL_S128_SAVE:
case AMDGPU::SI_SPILL_S96_SAVE:
@@ -1001,6 +1245,7 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
case AMDGPU::SI_SPILL_S1024_RESTORE:
case AMDGPU::SI_SPILL_S512_RESTORE:
case AMDGPU::SI_SPILL_S256_RESTORE:
+ case AMDGPU::SI_SPILL_S192_RESTORE:
case AMDGPU::SI_SPILL_S160_RESTORE:
case AMDGPU::SI_SPILL_S128_RESTORE:
case AMDGPU::SI_SPILL_S96_RESTORE:
@@ -1076,42 +1321,30 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
bool IsMUBUF = TII->isMUBUF(*MI);
if (!IsMUBUF && !MFI->isEntryFunction()) {
- // Convert to an absolute stack address by finding the offset from the
- // scratch wave base and scaling by the wave size.
+ // Convert to a swizzled stack address by scaling by the wave size.
//
- // In an entry function/kernel the offset is already the absolute
- // address relative to the frame register.
-
- Register TmpDiffReg =
- RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
-
- // If there's no free SGPR, in-place modify the FP
- Register DiffReg = TmpDiffReg.isValid() ? TmpDiffReg : FrameReg;
+ // In an entry function/kernel the offset is already swizzled.
bool IsCopy = MI->getOpcode() == AMDGPU::V_MOV_B32_e32;
- Register ResultReg = IsCopy ?
- MI->getOperand(0).getReg() :
- RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
-
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_SUB_U32), DiffReg)
- .addReg(FrameReg)
- .addReg(MFI->getScratchWaveOffsetReg());
+ Register ResultReg =
+ IsCopy ? MI->getOperand(0).getReg()
+ : RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MI, 0);
int64_t Offset = FrameInfo.getObjectOffset(Index);
if (Offset == 0) {
// XXX - This never happens because of emergency scavenging slot at 0?
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64), ResultReg)
.addImm(ST.getWavefrontSizeLog2())
- .addReg(DiffReg);
+ .addReg(FrameReg);
} else {
if (auto MIB = TII->getAddNoCarry(*MBB, MI, DL, ResultReg, *RS)) {
- Register ScaledReg =
- RS->scavengeRegister(&AMDGPU::VGPR_32RegClass, MIB, 0);
+ // Reuse ResultReg in intermediate step.
+ Register ScaledReg = ResultReg;
BuildMI(*MBB, *MIB, DL, TII->get(AMDGPU::V_LSHRREV_B32_e64),
ScaledReg)
.addImm(ST.getWavefrontSizeLog2())
- .addReg(DiffReg, RegState::Kill);
+ .addReg(FrameReg);
const bool IsVOP2 = MIB->getOpcode() == AMDGPU::V_ADD_U32_e32;
@@ -1148,10 +1381,10 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// unavailable. Only one additional mov is needed.
Register TmpScaledReg =
RS->scavengeRegister(&AMDGPU::SReg_32_XM0RegClass, MI, 0, false);
- Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : DiffReg;
+ Register ScaledReg = TmpScaledReg.isValid() ? TmpScaledReg : FrameReg;
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHR_B32), ScaledReg)
- .addReg(DiffReg, RegState::Kill)
+ .addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), ScaledReg)
.addReg(ScaledReg, RegState::Kill)
@@ -1165,19 +1398,12 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
.addReg(ScaledReg, RegState::Kill)
.addImm(Offset);
BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_LSHL_B32), ScaledReg)
- .addReg(DiffReg, RegState::Kill)
+ .addReg(FrameReg)
.addImm(ST.getWavefrontSizeLog2());
}
}
}
- if (!TmpDiffReg.isValid()) {
- // Restore the FP.
- BuildMI(*MBB, MI, DL, TII->get(AMDGPU::S_ADD_U32), FrameReg)
- .addReg(FrameReg)
- .addReg(MFI->getScratchWaveOffsetReg());
- }
-
// Don't introduce an extra copy if we're just materializing in a mov.
if (IsCopy)
MI->eraseFromParent();
@@ -1192,10 +1418,17 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
AMDGPU::getNamedOperandIdx(MI->getOpcode(),
AMDGPU::OpName::vaddr));
- assert(TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->getReg() ==
- MFI->getStackPtrOffsetReg());
-
- TII->getNamedOperand(*MI, AMDGPU::OpName::soffset)->setReg(FrameReg);
+ auto &SOffset = *TII->getNamedOperand(*MI, AMDGPU::OpName::soffset);
+ assert((SOffset.isReg() &&
+ SOffset.getReg() == MFI->getStackPtrOffsetReg()) ||
+ (SOffset.isImm() && SOffset.getImm() == 0));
+ if (SOffset.isReg()) {
+ if (FrameReg == AMDGPU::NoRegister) {
+ SOffset.ChangeToImmediate(0);
+ } else {
+ SOffset.setReg(FrameReg);
+ }
+ }
int64_t Offset = FrameInfo.getObjectOffset(Index);
int64_t OldImm
@@ -1224,16 +1457,99 @@ void SIRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
}
}
-StringRef SIRegisterInfo::getRegAsmName(unsigned Reg) const {
+StringRef SIRegisterInfo::getRegAsmName(MCRegister Reg) const {
return AMDGPUInstPrinter::getRegisterName(Reg);
}
+const TargetRegisterClass *
+SIRegisterInfo::getVGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth == 1)
+ return &AMDGPU::VReg_1RegClass;
+ if (BitWidth <= 16)
+ return &AMDGPU::VGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::VGPR_32RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::VReg_64RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::VReg_96RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::VReg_128RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::VReg_160RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::VReg_192RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::VReg_256RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::VReg_512RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::VReg_1024RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getAGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 16)
+ return &AMDGPU::AGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::AGPR_32RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::AReg_64RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::AReg_96RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::AReg_128RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::AReg_160RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::AReg_192RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::AReg_256RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::AReg_512RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::AReg_1024RegClass;
+
+ return nullptr;
+}
+
+const TargetRegisterClass *
+SIRegisterInfo::getSGPRClassForBitWidth(unsigned BitWidth) {
+ if (BitWidth <= 16)
+ return &AMDGPU::SGPR_LO16RegClass;
+ if (BitWidth <= 32)
+ return &AMDGPU::SReg_32RegClass;
+ if (BitWidth <= 64)
+ return &AMDGPU::SReg_64RegClass;
+ if (BitWidth <= 96)
+ return &AMDGPU::SGPR_96RegClass;
+ if (BitWidth <= 128)
+ return &AMDGPU::SGPR_128RegClass;
+ if (BitWidth <= 160)
+ return &AMDGPU::SGPR_160RegClass;
+ if (BitWidth <= 192)
+ return &AMDGPU::SGPR_192RegClass;
+ if (BitWidth <= 256)
+ return &AMDGPU::SGPR_256RegClass;
+ if (BitWidth <= 512)
+ return &AMDGPU::SGPR_512RegClass;
+ if (BitWidth <= 1024)
+ return &AMDGPU::SGPR_1024RegClass;
+
+ return nullptr;
+}
+
// FIXME: This is very slow. It might be worth creating a map from physreg to
// register class.
-const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
- assert(!Register::isVirtualRegister(Reg));
-
+const TargetRegisterClass *
+SIRegisterInfo::getPhysRegClass(MCRegister Reg) const {
static const TargetRegisterClass *const BaseClasses[] = {
+ &AMDGPU::VGPR_LO16RegClass,
+ &AMDGPU::VGPR_HI16RegClass,
+ &AMDGPU::SReg_LO16RegClass,
+ &AMDGPU::AGPR_LO16RegClass,
&AMDGPU::VGPR_32RegClass,
&AMDGPU::SReg_32RegClass,
&AMDGPU::AGPR_32RegClass,
@@ -1242,13 +1558,19 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
&AMDGPU::AReg_64RegClass,
&AMDGPU::VReg_96RegClass,
&AMDGPU::SReg_96RegClass,
+ &AMDGPU::AReg_96RegClass,
&AMDGPU::VReg_128RegClass,
&AMDGPU::SReg_128RegClass,
&AMDGPU::AReg_128RegClass,
&AMDGPU::VReg_160RegClass,
&AMDGPU::SReg_160RegClass,
+ &AMDGPU::AReg_160RegClass,
+ &AMDGPU::VReg_192RegClass,
+ &AMDGPU::SReg_192RegClass,
+ &AMDGPU::AReg_192RegClass,
&AMDGPU::VReg_256RegClass,
&AMDGPU::SReg_256RegClass,
+ &AMDGPU::AReg_256RegClass,
&AMDGPU::VReg_512RegClass,
&AMDGPU::SReg_512RegClass,
&AMDGPU::AReg_512RegClass,
@@ -1272,122 +1594,54 @@ const TargetRegisterClass *SIRegisterInfo::getPhysRegClass(unsigned Reg) const {
// TargetRegisterClass to mark which classes are VGPRs to make this trivial.
bool SIRegisterInfo::hasVGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- switch (Size) {
- case 32:
- return getCommonSubClass(&AMDGPU::VGPR_32RegClass, RC) != nullptr;
- case 64:
- return getCommonSubClass(&AMDGPU::VReg_64RegClass, RC) != nullptr;
- case 96:
- return getCommonSubClass(&AMDGPU::VReg_96RegClass, RC) != nullptr;
- case 128:
- return getCommonSubClass(&AMDGPU::VReg_128RegClass, RC) != nullptr;
- case 160:
- return getCommonSubClass(&AMDGPU::VReg_160RegClass, RC) != nullptr;
- case 256:
- return getCommonSubClass(&AMDGPU::VReg_256RegClass, RC) != nullptr;
- case 512:
- return getCommonSubClass(&AMDGPU::VReg_512RegClass, RC) != nullptr;
- case 1024:
- return getCommonSubClass(&AMDGPU::VReg_1024RegClass, RC) != nullptr;
- case 1:
- return getCommonSubClass(&AMDGPU::VReg_1RegClass, RC) != nullptr;
- default:
+ if (Size == 16) {
+ return getCommonSubClass(&AMDGPU::VGPR_LO16RegClass, RC) != nullptr ||
+ getCommonSubClass(&AMDGPU::VGPR_HI16RegClass, RC) != nullptr;
+ }
+ const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+ if (!VRC) {
assert(Size < 32 && "Invalid register class size");
return false;
}
+ return getCommonSubClass(VRC, RC) != nullptr;
}
bool SIRegisterInfo::hasAGPRs(const TargetRegisterClass *RC) const {
unsigned Size = getRegSizeInBits(*RC);
- if (Size < 32)
+ if (Size < 16)
return false;
- switch (Size) {
- case 32:
- return getCommonSubClass(&AMDGPU::AGPR_32RegClass, RC) != nullptr;
- case 64:
- return getCommonSubClass(&AMDGPU::AReg_64RegClass, RC) != nullptr;
- case 96:
+ const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
+ if (!ARC) {
+ assert(getVGPRClassForBitWidth(Size) && "Invalid register class size");
return false;
- case 128:
- return getCommonSubClass(&AMDGPU::AReg_128RegClass, RC) != nullptr;
- case 160:
- case 256:
- return false;
- case 512:
- return getCommonSubClass(&AMDGPU::AReg_512RegClass, RC) != nullptr;
- case 1024:
- return getCommonSubClass(&AMDGPU::AReg_1024RegClass, RC) != nullptr;
- default:
- llvm_unreachable("Invalid register class size");
}
+ return getCommonSubClass(ARC, RC) != nullptr;
}
-const TargetRegisterClass *SIRegisterInfo::getEquivalentVGPRClass(
- const TargetRegisterClass *SRC) const {
- switch (getRegSizeInBits(*SRC)) {
- case 32:
- return &AMDGPU::VGPR_32RegClass;
- case 64:
- return &AMDGPU::VReg_64RegClass;
- case 96:
- return &AMDGPU::VReg_96RegClass;
- case 128:
- return &AMDGPU::VReg_128RegClass;
- case 160:
- return &AMDGPU::VReg_160RegClass;
- case 256:
- return &AMDGPU::VReg_256RegClass;
- case 512:
- return &AMDGPU::VReg_512RegClass;
- case 1024:
- return &AMDGPU::VReg_1024RegClass;
- case 1:
- return &AMDGPU::VReg_1RegClass;
- default:
- llvm_unreachable("Invalid register class size");
- }
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentVGPRClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *VRC = getVGPRClassForBitWidth(Size);
+ assert(VRC && "Invalid register class size");
+ return VRC;
}
-const TargetRegisterClass *SIRegisterInfo::getEquivalentAGPRClass(
- const TargetRegisterClass *SRC) const {
- switch (getRegSizeInBits(*SRC)) {
- case 32:
- return &AMDGPU::AGPR_32RegClass;
- case 64:
- return &AMDGPU::AReg_64RegClass;
- case 128:
- return &AMDGPU::AReg_128RegClass;
- case 512:
- return &AMDGPU::AReg_512RegClass;
- case 1024:
- return &AMDGPU::AReg_1024RegClass;
- default:
- llvm_unreachable("Invalid register class size");
- }
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentAGPRClass(const TargetRegisterClass *SRC) const {
+ unsigned Size = getRegSizeInBits(*SRC);
+ const TargetRegisterClass *ARC = getAGPRClassForBitWidth(Size);
+ assert(ARC && "Invalid register class size");
+ return ARC;
}
-const TargetRegisterClass *SIRegisterInfo::getEquivalentSGPRClass(
- const TargetRegisterClass *VRC) const {
- switch (getRegSizeInBits(*VRC)) {
- case 32:
+const TargetRegisterClass *
+SIRegisterInfo::getEquivalentSGPRClass(const TargetRegisterClass *VRC) const {
+ unsigned Size = getRegSizeInBits(*VRC);
+ if (Size == 32)
return &AMDGPU::SGPR_32RegClass;
- case 64:
- return &AMDGPU::SReg_64RegClass;
- case 96:
- return &AMDGPU::SReg_96RegClass;
- case 128:
- return &AMDGPU::SGPR_128RegClass;
- case 160:
- return &AMDGPU::SReg_160RegClass;
- case 256:
- return &AMDGPU::SReg_256RegClass;
- case 512:
- return &AMDGPU::SReg_512RegClass;
- case 1024:
- return &AMDGPU::SReg_1024RegClass;
- default:
- llvm_unreachable("Invalid register class size");
- }
+ const TargetRegisterClass *SRC = getSGPRClassForBitWidth(Size);
+ assert(SRC && "Invalid register class size");
+ return SRC;
}
const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
@@ -1396,62 +1650,19 @@ const TargetRegisterClass *SIRegisterInfo::getSubRegClass(
return RC;
// We can assume that each lane corresponds to one 32-bit register.
- unsigned Count = getSubRegIndexLaneMask(SubIdx).getNumLanes();
+ unsigned Size = getNumChannelsFromSubReg(SubIdx) * 32;
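+  // E.g. SubIdx == sub0_sub1 covers two 32-bit channels, so Size == 64.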
if (isSGPRClass(RC)) {
- switch (Count) {
- case 1:
- return &AMDGPU::SGPR_32RegClass;
- case 2:
- return &AMDGPU::SReg_64RegClass;
- case 3:
- return &AMDGPU::SReg_96RegClass;
- case 4:
- return &AMDGPU::SGPR_128RegClass;
- case 5:
- return &AMDGPU::SReg_160RegClass;
- case 8:
- return &AMDGPU::SReg_256RegClass;
- case 16:
- return &AMDGPU::SReg_512RegClass;
- case 32: /* fall-through */
- default:
- llvm_unreachable("Invalid sub-register class size");
- }
+ if (Size == 32)
+ RC = &AMDGPU::SGPR_32RegClass;
+ else
+ RC = getSGPRClassForBitWidth(Size);
} else if (hasAGPRs(RC)) {
- switch (Count) {
- case 1:
- return &AMDGPU::AGPR_32RegClass;
- case 2:
- return &AMDGPU::AReg_64RegClass;
- case 4:
- return &AMDGPU::AReg_128RegClass;
- case 16:
- return &AMDGPU::AReg_512RegClass;
- case 32: /* fall-through */
- default:
- llvm_unreachable("Invalid sub-register class size");
- }
+ RC = getAGPRClassForBitWidth(Size);
} else {
- switch (Count) {
- case 1:
- return &AMDGPU::VGPR_32RegClass;
- case 2:
- return &AMDGPU::VReg_64RegClass;
- case 3:
- return &AMDGPU::VReg_96RegClass;
- case 4:
- return &AMDGPU::VReg_128RegClass;
- case 5:
- return &AMDGPU::VReg_160RegClass;
- case 8:
- return &AMDGPU::VReg_256RegClass;
- case 16:
- return &AMDGPU::VReg_512RegClass;
- case 32: /* fall-through */
- default:
- llvm_unreachable("Invalid sub-register class size");
- }
+ RC = getVGPRClassForBitWidth(Size);
}
+ assert(RC && "Invalid sub-register class size");
+ return RC;
}
bool SIRegisterInfo::opCanUseInlineConstant(unsigned OpType) const {
@@ -1487,215 +1698,60 @@ bool SIRegisterInfo::shouldRewriteCopySrc(
return getCommonSubClass(DefRC, SrcRC) != nullptr;
}
-/// Returns a register that is not used at any point in the function.
+/// Returns the lowest register that is not used at any point in the function.
/// If all registers are used, then this function will return
-// AMDGPU::NoRegister.
-unsigned
-SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineFunction &MF) const {
-
- for (unsigned Reg : *RC)
- if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
- return Reg;
- return AMDGPU::NoRegister;
+/// AMDGPU::NoRegister. If \p ReserveHighestVGPR = true, then return the
+/// highest unused register.
+MCRegister SIRegisterInfo::findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF,
+ bool ReserveHighestVGPR) const {
+ if (ReserveHighestVGPR) {
+ for (MCRegister Reg : reverse(*RC))
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
+ return Reg;
+ } else {
+ for (MCRegister Reg : *RC)
+ if (MRI.isAllocatable(Reg) && !MRI.isPhysRegUsed(Reg))
+ return Reg;
+ }
+ return MCRegister();
}
ArrayRef<int16_t> SIRegisterInfo::getRegSplitParts(const TargetRegisterClass *RC,
unsigned EltSize) const {
- if (EltSize == 4) {
- static const int16_t Sub0_31[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- AMDGPU::sub16, AMDGPU::sub17, AMDGPU::sub18, AMDGPU::sub19,
- AMDGPU::sub20, AMDGPU::sub21, AMDGPU::sub22, AMDGPU::sub23,
- AMDGPU::sub24, AMDGPU::sub25, AMDGPU::sub26, AMDGPU::sub27,
- AMDGPU::sub28, AMDGPU::sub29, AMDGPU::sub30, AMDGPU::sub31,
- };
-
- static const int16_t Sub0_15[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- AMDGPU::sub8, AMDGPU::sub9, AMDGPU::sub10, AMDGPU::sub11,
- AMDGPU::sub12, AMDGPU::sub13, AMDGPU::sub14, AMDGPU::sub15,
- };
-
- static const int16_t Sub0_7[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- AMDGPU::sub4, AMDGPU::sub5, AMDGPU::sub6, AMDGPU::sub7,
- };
-
- static const int16_t Sub0_4[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3, AMDGPU::sub4,
- };
-
- static const int16_t Sub0_3[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2, AMDGPU::sub3,
- };
-
- static const int16_t Sub0_2[] = {
- AMDGPU::sub0, AMDGPU::sub1, AMDGPU::sub2,
- };
-
- static const int16_t Sub0_1[] = {
- AMDGPU::sub0, AMDGPU::sub1,
- };
-
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 32:
- return {};
- case 64:
- return makeArrayRef(Sub0_1);
- case 96:
- return makeArrayRef(Sub0_2);
- case 128:
- return makeArrayRef(Sub0_3);
- case 160:
- return makeArrayRef(Sub0_4);
- case 256:
- return makeArrayRef(Sub0_7);
- case 512:
- return makeArrayRef(Sub0_15);
- case 1024:
- return makeArrayRef(Sub0_31);
- default:
- llvm_unreachable("unhandled register size");
- }
- }
-
- if (EltSize == 8) {
- static const int16_t Sub0_31_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
- AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
- AMDGPU::sub12_sub13, AMDGPU::sub14_sub15,
- AMDGPU::sub16_sub17, AMDGPU::sub18_sub19,
- AMDGPU::sub20_sub21, AMDGPU::sub22_sub23,
- AMDGPU::sub24_sub25, AMDGPU::sub26_sub27,
- AMDGPU::sub28_sub29, AMDGPU::sub30_sub31
- };
-
- static const int16_t Sub0_15_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7,
- AMDGPU::sub8_sub9, AMDGPU::sub10_sub11,
- AMDGPU::sub12_sub13, AMDGPU::sub14_sub15
- };
-
- static const int16_t Sub0_7_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3,
- AMDGPU::sub4_sub5, AMDGPU::sub6_sub7
- };
-
-
- static const int16_t Sub0_3_64[] = {
- AMDGPU::sub0_sub1, AMDGPU::sub2_sub3
- };
-
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 64:
- return {};
- case 128:
- return makeArrayRef(Sub0_3_64);
- case 256:
- return makeArrayRef(Sub0_7_64);
- case 512:
- return makeArrayRef(Sub0_15_64);
- case 1024:
- return makeArrayRef(Sub0_31_64);
- default:
- llvm_unreachable("unhandled register size");
- }
- }
+ const unsigned RegBitWidth = AMDGPU::getRegBitWidth(*RC->MC);
+ assert(RegBitWidth >= 32 && RegBitWidth <= 1024);
- if (EltSize == 16) {
-
- static const int16_t Sub0_31_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11,
- AMDGPU::sub12_sub13_sub14_sub15,
- AMDGPU::sub16_sub17_sub18_sub19,
- AMDGPU::sub20_sub21_sub22_sub23,
- AMDGPU::sub24_sub25_sub26_sub27,
- AMDGPU::sub28_sub29_sub30_sub31
- };
-
- static const int16_t Sub0_15_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11,
- AMDGPU::sub12_sub13_sub14_sub15
- };
-
- static const int16_t Sub0_7_128[] = {
- AMDGPU::sub0_sub1_sub2_sub3,
- AMDGPU::sub4_sub5_sub6_sub7
- };
-
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 128:
- return {};
- case 256:
- return makeArrayRef(Sub0_7_128);
- case 512:
- return makeArrayRef(Sub0_15_128);
- case 1024:
- return makeArrayRef(Sub0_31_128);
- default:
- llvm_unreachable("unhandled register size");
- }
- }
-
- assert(EltSize == 32 && "unhandled elt size");
-
- static const int16_t Sub0_31_256[] = {
- AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15,
- AMDGPU::sub16_sub17_sub18_sub19_sub20_sub21_sub22_sub23,
- AMDGPU::sub24_sub25_sub26_sub27_sub28_sub29_sub30_sub31
- };
+ const unsigned RegDWORDs = RegBitWidth / 32;
+ const unsigned EltDWORDs = EltSize / 4;
+ assert(RegSplitParts.size() + 1 >= EltDWORDs);
- static const int16_t Sub0_15_256[] = {
- AMDGPU::sub0_sub1_sub2_sub3_sub4_sub5_sub6_sub7,
- AMDGPU::sub8_sub9_sub10_sub11_sub12_sub13_sub14_sub15
- };
+ const std::vector<int16_t> &Parts = RegSplitParts[EltDWORDs - 1];
+ const unsigned NumParts = RegDWORDs / EltDWORDs;
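+  // E.g. splitting a 128-bit class into 8-byte elements gives NumParts == 2
+  // and returns { sub0_sub1, sub2_sub3 }.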
- switch (AMDGPU::getRegBitWidth(*RC->MC)) {
- case 256:
- return {};
- case 512:
- return makeArrayRef(Sub0_15_256);
- case 1024:
- return makeArrayRef(Sub0_31_256);
- default:
- llvm_unreachable("unhandled register size");
- }
+ return makeArrayRef(Parts.data(), NumParts);
}
const TargetRegisterClass*
SIRegisterInfo::getRegClassForReg(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- if (Register::isVirtualRegister(Reg))
- return MRI.getRegClass(Reg);
-
- return getPhysRegClass(Reg);
+ Register Reg) const {
+ return Reg.isVirtual() ? MRI.getRegClass(Reg) : getPhysRegClass(Reg);
}
bool SIRegisterInfo::isVGPR(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
- assert(RC && "Register class for the reg not found");
- return hasVGPRs(RC);
+ Register Reg) const {
+ const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
+ // Registers without classes are unaddressable, SGPR-like registers.
+ return RC && hasVGPRs(RC);
}
bool SIRegisterInfo::isAGPR(const MachineRegisterInfo &MRI,
- unsigned Reg) const {
- const TargetRegisterClass * RC = getRegClassForReg(MRI, Reg);
- assert(RC && "Register class for the reg not found");
- return hasAGPRs(RC);
+ Register Reg) const {
+ const TargetRegisterClass *RC = getRegClassForReg(MRI, Reg);
+
+ // Registers without classes are unaddressable, SGPR-like registers.
+ return RC && hasAGPRs(RC);
}
bool SIRegisterInfo::shouldCoalesce(MachineInstr *MI,
@@ -1727,36 +1783,41 @@ unsigned SIRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
MF.getFunction());
switch (RC->getID()) {
default:
- return AMDGPURegisterInfo::getRegPressureLimit(RC, MF);
+ return AMDGPUGenRegisterInfo::getRegPressureLimit(RC, MF);
case AMDGPU::VGPR_32RegClassID:
+ case AMDGPU::VGPR_LO16RegClassID:
+ case AMDGPU::VGPR_HI16RegClassID:
return std::min(ST.getMaxNumVGPRs(Occupancy), ST.getMaxNumVGPRs(MF));
case AMDGPU::SGPR_32RegClassID:
+ case AMDGPU::SGPR_LO16RegClassID:
return std::min(ST.getMaxNumSGPRs(Occupancy, true), ST.getMaxNumSGPRs(MF));
}
}
unsigned SIRegisterInfo::getRegPressureSetLimit(const MachineFunction &MF,
unsigned Idx) const {
- if (Idx == getVGPRPressureSet() || Idx == getAGPRPressureSet())
+ if (Idx == AMDGPU::RegisterPressureSets::VGPR_32 ||
+ Idx == AMDGPU::RegisterPressureSets::AGPR_32)
return getRegPressureLimit(&AMDGPU::VGPR_32RegClass,
const_cast<MachineFunction &>(MF));
- if (Idx == getSGPRPressureSet())
+ if (Idx == AMDGPU::RegisterPressureSets::SReg_32)
return getRegPressureLimit(&AMDGPU::SGPR_32RegClass,
const_cast<MachineFunction &>(MF));
- return AMDGPURegisterInfo::getRegPressureSetLimit(MF, Idx);
+ llvm_unreachable("Unexpected register pressure set!");
}
const int *SIRegisterInfo::getRegUnitPressureSets(unsigned RegUnit) const {
static const int Empty[] = { -1 };
- if (hasRegUnit(AMDGPU::M0, RegUnit))
+ if (RegPressureIgnoredUnits[RegUnit])
return Empty;
- return AMDGPURegisterInfo::getRegUnitPressureSets(RegUnit);
+
+ return AMDGPUGenRegisterInfo::getRegUnitPressureSets(RegUnit);
}
-unsigned SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
+MCRegister SIRegisterInfo::getReturnAddressReg(const MachineFunction &MF) const {
// Not a callee saved register.
return AMDGPU::SGPR30_SGPR31;
}
@@ -1765,49 +1826,19 @@ const TargetRegisterClass *
SIRegisterInfo::getRegClassForSizeOnBank(unsigned Size,
const RegisterBank &RB,
const MachineRegisterInfo &MRI) const {
- switch (Size) {
- case 1: {
- switch (RB.getID()) {
- case AMDGPU::VGPRRegBankID:
- return &AMDGPU::VGPR_32RegClass;
- case AMDGPU::VCCRegBankID:
- return isWave32 ?
- &AMDGPU::SReg_32_XM0_XEXECRegClass : &AMDGPU::SReg_64_XEXECRegClass;
- case AMDGPU::SGPRRegBankID:
- return &AMDGPU::SReg_32RegClass;
- default:
- llvm_unreachable("unknown register bank");
- }
- }
- case 32:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32RegClass;
- case 64:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_64RegClass :
- &AMDGPU::SReg_64RegClass;
- case 96:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_96RegClass :
- &AMDGPU::SReg_96RegClass;
- case 128:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_128RegClass :
- &AMDGPU::SGPR_128RegClass;
- case 160:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_160RegClass :
- &AMDGPU::SReg_160RegClass;
- case 256:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_256RegClass :
- &AMDGPU::SReg_256RegClass;
- case 512:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_512RegClass :
- &AMDGPU::SReg_512RegClass;
- case 1024:
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VReg_1024RegClass :
- &AMDGPU::SReg_1024RegClass;
+ switch (RB.getID()) {
+ case AMDGPU::VGPRRegBankID:
+ return getVGPRClassForBitWidth(std::max(32u, Size));
+ case AMDGPU::VCCRegBankID:
+ assert(Size == 1);
+ return isWave32 ? &AMDGPU::SReg_32_XM0_XEXECRegClass
+ : &AMDGPU::SReg_64_XEXECRegClass;
+ case AMDGPU::SGPRRegBankID:
+ return getSGPRClassForBitWidth(std::max(32u, Size));
+ case AMDGPU::AGPRRegBankID:
+ return getAGPRClassForBitWidth(std::max(32u, Size));
default:
- if (Size < 32)
- return RB.getID() == AMDGPU::VGPRRegBankID ? &AMDGPU::VGPR_32RegClass :
- &AMDGPU::SReg_32RegClass;
- return nullptr;
+ llvm_unreachable("unknown register bank");
}
}
@@ -1822,7 +1853,7 @@ SIRegisterInfo::getConstrainedRegClassForOperand(const MachineOperand &MO,
return getAllocatableClass(RC);
}
-unsigned SIRegisterInfo::getVCC() const {
+MCRegister SIRegisterInfo::getVCC() const {
return isWave32 ? AMDGPU::VCC_LO : AMDGPU::VCC;
}
@@ -1837,12 +1868,12 @@ SIRegisterInfo::getRegClass(unsigned RCID) const {
case -1:
return nullptr;
default:
- return AMDGPURegisterInfo::getRegClass(RCID);
+ return AMDGPUGenRegisterInfo::getRegClass(RCID);
}
}
// Find reaching register definition
-MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
+MachineInstr *SIRegisterInfo::findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const {
@@ -1850,7 +1881,7 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
SlotIndex UseIdx = LIS->getInstructionIndex(Use);
SlotIndex DefIdx;
- if (Register::isVirtualRegister(Reg)) {
+ if (Reg.isVirtual()) {
if (!LIS->hasInterval(Reg))
return nullptr;
LiveInterval &LI = LIS->getInterval(Reg);
@@ -1894,3 +1925,49 @@ MachineInstr *SIRegisterInfo::findReachingDef(unsigned Reg, unsigned SubReg,
return Def;
}
+
+MCPhysReg SIRegisterInfo::get32BitRegister(MCPhysReg Reg) const {
+ assert(getRegSizeInBits(*getPhysRegClass(Reg)) <= 32);
+
+ for (const TargetRegisterClass &RC : { AMDGPU::VGPR_32RegClass,
+ AMDGPU::SReg_32RegClass,
+ AMDGPU::AGPR_32RegClass } ) {
+ if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::lo16, &RC))
+ return Super;
+ }
+ if (MCPhysReg Super = getMatchingSuperReg(Reg, AMDGPU::hi16,
+ &AMDGPU::VGPR_32RegClass)) {
+ return Super;
+ }
+
+ return AMDGPU::NoRegister;
+}
+
+bool SIRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
+ switch (PhysReg) {
+ case AMDGPU::SGPR_NULL:
+ case AMDGPU::SRC_SHARED_BASE:
+ case AMDGPU::SRC_PRIVATE_BASE:
+ case AMDGPU::SRC_SHARED_LIMIT:
+ case AMDGPU::SRC_PRIVATE_LIMIT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllSGPR128(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_128RegClass.begin(),
+ ST.getMaxNumSGPRs(MF) / 4);
+}
+
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllSGPR32(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::SGPR_32RegClass.begin(), ST.getMaxNumSGPRs(MF));
+}
+
+ArrayRef<MCPhysReg>
+SIRegisterInfo::getAllVGPR32(const MachineFunction &MF) const {
+ return makeArrayRef(AMDGPU::VGPR_32RegClass.begin(), ST.getMaxNumVGPRs(MF));
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index ac8c56fa3a03..62d9f1174337 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -14,7 +14,9 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
#define LLVM_LIB_TARGET_AMDGPU_SIREGISTERINFO_H
-#include "AMDGPURegisterInfo.h"
+#define GET_REGINFO_HEADER
+#include "AMDGPUGenRegisterInfo.inc"
+
#include "SIDefines.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
@@ -22,38 +24,38 @@ namespace llvm {
class GCNSubtarget;
class LiveIntervals;
-class MachineRegisterInfo;
class SIMachineFunctionInfo;
-class SIRegisterInfo final : public AMDGPURegisterInfo {
+class SIRegisterInfo final : public AMDGPUGenRegisterInfo {
private:
const GCNSubtarget &ST;
- unsigned SGPRSetID;
- unsigned VGPRSetID;
- unsigned AGPRSetID;
- BitVector SGPRPressureSets;
- BitVector VGPRPressureSets;
- BitVector AGPRPressureSets;
bool SpillSGPRToVGPR;
bool isWave32;
+ BitVector RegPressureIgnoredUnits;
+
+ /// Sub reg indexes for getRegSplitParts.
+ /// First index represents subreg size from 1 to 16 DWORDs.
+ /// The inner vector is sorted by bit offset.
+ /// Provided a register can be fully split with given subregs,
+ /// all elements of the inner vector combined give a full lane mask.
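+  /// E.g. the entry for 2-DWORD subregs is
+  /// { sub0_sub1, sub2_sub3, ..., sub30_sub31 }.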
+ static std::array<std::vector<int16_t>, 16> RegSplitParts;
+
+ void reserveRegisterTuples(BitVector &, MCRegister Reg) const;
- void classifyPressureSet(unsigned PSetID, unsigned Reg,
- BitVector &PressureSets) const;
public:
SIRegisterInfo(const GCNSubtarget &ST);
+ /// \returns the sub reg enum value for the given \p Channel
+ /// (e.g. getSubRegFromChannel(0) -> AMDGPU::sub0)
+ static unsigned getSubRegFromChannel(unsigned Channel, unsigned NumRegs = 1);
+
bool spillSGPRToVGPR() const {
return SpillSGPRToVGPR;
}
/// Return the end register initially reserved for the scratch buffer in case
/// spilling is needed.
- unsigned reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
-
- /// Return the end register initially reserved for the scratch wave offset in
- /// case spilling is needed.
- unsigned reservedPrivateSegmentWaveByteOffsetReg(
- const MachineFunction &MF) const;
+ MCRegister reservedPrivateSegmentBufferReg(const MachineFunction &MF) const;
BitVector getReservedRegs(const MachineFunction &MF) const override;
@@ -70,6 +72,9 @@ public:
Register getFrameRegister(const MachineFunction &MF) const override;
+ bool hasBasePointer(const MachineFunction &MF) const;
+ Register getBaseRegister() const;
+
bool canRealignStack(const MachineFunction &MF) const override;
bool requiresRegisterScavenging(const MachineFunction &Fn) const override;
@@ -77,7 +82,6 @@ public:
bool requiresFrameIndexReplacementScavenging(
const MachineFunction &MF) const override;
bool requiresVirtualBaseRegisters(const MachineFunction &Fn) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
int64_t getMUBUFInstrOffset(const MachineInstr *MI) const;
@@ -86,19 +90,24 @@ public:
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
+ int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
const TargetRegisterClass *getPointerRegClass(
const MachineFunction &MF, unsigned Kind = 0) const override;
+ void buildSGPRSpillLoadStore(MachineBasicBlock::iterator MI, int Index,
+ int Offset, unsigned EltSize, Register VGPR,
+ int64_t VGPRLanes, RegScavenger *RS,
+ bool IsLoad) const;
+
/// If \p OnlyToVGPR is true, this will only succeed if this
bool spillSGPR(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS,
@@ -115,15 +124,19 @@ public:
bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI,
int FI, RegScavenger *RS) const;
- StringRef getRegAsmName(unsigned Reg) const override;
+ StringRef getRegAsmName(MCRegister Reg) const override;
- unsigned getHWRegIndex(unsigned Reg) const {
+ unsigned getHWRegIndex(MCRegister Reg) const {
return getEncodingValue(Reg) & 0xff;
}
+ static const TargetRegisterClass *getVGPRClassForBitWidth(unsigned BitWidth);
+ static const TargetRegisterClass *getAGPRClassForBitWidth(unsigned BitWidth);
+ static const TargetRegisterClass *getSGPRClassForBitWidth(unsigned BitWidth);
+
/// Return the 'base' register class for this register.
/// e.g. SGPR0 => SReg_32, VGPR => VGPR_32 SGPR0_SGPR1 -> SReg_32, etc.
- const TargetRegisterClass *getPhysRegClass(unsigned Reg) const;
+ const TargetRegisterClass *getPhysRegClass(MCRegister Reg) const;
/// \returns true if this class contains only SGPR registers
bool isSGPRClass(const TargetRegisterClass *RC) const {
@@ -135,9 +148,9 @@ public:
return isSGPRClass(getRegClass(RCID));
}
- bool isSGPRReg(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ bool isSGPRReg(const MachineRegisterInfo &MRI, Register Reg) const {
const TargetRegisterClass *RC;
- if (Register::isVirtualRegister(Reg))
+ if (Reg.isVirtual())
RC = MRI.getRegClass(Reg);
else
RC = getPhysRegClass(Reg);
@@ -161,16 +174,16 @@ public:
}
/// \returns A VGPR reg class with the same width as \p SRC
- const TargetRegisterClass *getEquivalentVGPRClass(
- const TargetRegisterClass *SRC) const;
+ const TargetRegisterClass *
+ getEquivalentVGPRClass(const TargetRegisterClass *SRC) const;
/// \returns An AGPR reg class with the same width as \p SRC
- const TargetRegisterClass *getEquivalentAGPRClass(
- const TargetRegisterClass *SRC) const;
+ const TargetRegisterClass *
+ getEquivalentAGPRClass(const TargetRegisterClass *SRC) const;
/// \returns A SGPR reg class with the same width as \p SRC
- const TargetRegisterClass *getEquivalentSGPRClass(
- const TargetRegisterClass *VRC) const;
+ const TargetRegisterClass *
+ getEquivalentSGPRClass(const TargetRegisterClass *VRC) const;
/// \returns The register class that is used for a sub-register of \p RC for
/// the given \p SubIdx. If \p SubIdx equals NoSubRegister, \p RC will
@@ -196,38 +209,23 @@ public:
/// -4.0f, -2.0f, -1.0f, -0.5f, 0.0f, 0.5f, 1.0f, 2.0f, 4.0f.
bool opCanUseInlineConstant(unsigned OpType) const;
- unsigned findUnusedRegister(const MachineRegisterInfo &MRI,
- const TargetRegisterClass *RC,
- const MachineFunction &MF) const;
-
- unsigned getSGPRPressureSet() const { return SGPRSetID; };
- unsigned getVGPRPressureSet() const { return VGPRSetID; };
- unsigned getAGPRPressureSet() const { return AGPRSetID; };
+ MCRegister findUnusedRegister(const MachineRegisterInfo &MRI,
+ const TargetRegisterClass *RC,
+ const MachineFunction &MF,
+ bool ReserveHighestVGPR = false) const;
const TargetRegisterClass *getRegClassForReg(const MachineRegisterInfo &MRI,
- unsigned Reg) const;
- bool isVGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
- bool isAGPR(const MachineRegisterInfo &MRI, unsigned Reg) const;
- bool isVectorRegister(const MachineRegisterInfo &MRI, unsigned Reg) const {
+ Register Reg) const;
+ bool isVGPR(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isAGPR(const MachineRegisterInfo &MRI, Register Reg) const;
+ bool isVectorRegister(const MachineRegisterInfo &MRI, Register Reg) const {
return isVGPR(MRI, Reg) || isAGPR(MRI, Reg);
}
- virtual bool
- isDivergentRegClass(const TargetRegisterClass *RC) const override {
- return !isSGPRClass(RC);
- }
+ bool isConstantPhysReg(MCRegister PhysReg) const override;
- bool isSGPRPressureSet(unsigned SetID) const {
- return SGPRPressureSets.test(SetID) && !VGPRPressureSets.test(SetID) &&
- !AGPRPressureSets.test(SetID);
- }
- bool isVGPRPressureSet(unsigned SetID) const {
- return VGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
- !AGPRPressureSets.test(SetID);
- }
- bool isAGPRPressureSet(unsigned SetID) const {
- return AGPRPressureSets.test(SetID) && !SGPRPressureSets.test(SetID) &&
- !VGPRPressureSets.test(SetID);
+ bool isDivergentRegClass(const TargetRegisterClass *RC) const override {
+ return !isSGPRClass(RC);
}
ArrayRef<int16_t> getRegSplitParts(const TargetRegisterClass *RC,
@@ -249,7 +247,7 @@ public:
const int *getRegUnitPressureSets(unsigned RegUnit) const override;
- unsigned getReturnAddressReg(const MachineFunction &MF) const;
+ MCRegister getReturnAddressReg(const MachineFunction &MF) const;
const TargetRegisterClass *
getRegClassForSizeOnBank(unsigned Size,
@@ -277,12 +275,12 @@ public:
: &AMDGPU::SReg_64_XEXECRegClass;
}
- unsigned getVCC() const;
+ MCRegister getVCC() const;
const TargetRegisterClass *getRegClass(unsigned RCID) const;
// Find reaching register definition
- MachineInstr *findReachingDef(unsigned Reg, unsigned SubReg,
+ MachineInstr *findReachingDef(Register Reg, unsigned SubReg,
MachineInstr &Use,
MachineRegisterInfo &MRI,
LiveIntervals *LIS) const;
@@ -290,14 +288,51 @@ public:
const uint32_t *getAllVGPRRegMask() const;
const uint32_t *getAllAllocatableSRegMask() const;
+  // \returns the number of 32 bit registers covered by \p LM
+ static unsigned getNumCoveredRegs(LaneBitmask LM) {
+ // The assumption is that every lo16 subreg is an even bit and every hi16
+ // is an adjacent odd bit or vice versa.
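+    // E.g. a mask with only a single hi16 bit set still counts its register
+    // once: the OR below folds each odd bit onto its even neighbour before
+    // the popcount, so every lo16/hi16 pair contributes at most one bit.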
+ uint64_t Mask = LM.getAsInteger();
+ uint64_t Even = Mask & 0xAAAAAAAAAAAAAAAAULL;
+ Mask = (Even >> 1) | Mask;
+ uint64_t Odd = Mask & 0x5555555555555555ULL;
+ return countPopulation(Odd);
+ }
+
+  // \returns the DWORD offset of \p SubReg
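+  // (e.g. sub1 starts at bit offset 32 and therefore maps to DWORD 1).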
+ unsigned getChannelFromSubReg(unsigned SubReg) const {
+ return SubReg ? (getSubRegIdxOffset(SubReg) + 31) / 32 : 0;
+ }
+
+  // \returns the DWORD size of \p SubReg
+ unsigned getNumChannelsFromSubReg(unsigned SubReg) const {
+ return getNumCoveredRegs(getSubRegIndexLaneMask(SubReg));
+ }
+
+  // For a given 16 bit \p Reg, \returns a 32 bit register holding it.
+ // \returns \p Reg otherwise.
+ MCPhysReg get32BitRegister(MCPhysReg Reg) const;
+
+ /// Return all SGPR128 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllSGPR128(const MachineFunction &MF) const;
+
+ /// Return all SGPR32 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllSGPR32(const MachineFunction &MF) const;
+
+ /// Return all VGPR32 which satisfy the waves per execution unit requirement
+ /// of the subtarget.
+ ArrayRef<MCPhysReg> getAllVGPR32(const MachineFunction &MF) const;
+
private:
void buildSpillLoadStore(MachineBasicBlock::iterator MI,
unsigned LoadStoreOp,
int Index,
- unsigned ValueReg,
+ Register ValueReg,
bool ValueIsKill,
- unsigned ScratchRsrcReg,
- unsigned ScratchOffsetReg,
+ MCRegister ScratchRsrcReg,
+ MCRegister ScratchOffsetReg,
int64_t InstrOffset,
MachineMemOperand *MMO,
RegScavenger *RS) const;
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
index 6ea6ec00e742..ff1f5c4bc49b 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.td
@@ -7,6 +7,50 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
+// Subregister declarations
+//===----------------------------------------------------------------------===//
+
+class Indexes<int N> {
+ list<int> all = [0, 1, 2, 3, 4, 5, 6 , 7,
+ 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31];
+
+ // Returns list of indexes [0..N)
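+  // e.g. Indexes<3>.slice = [0, 1, 2].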
+ list<int> slice =
+ !foldl([]<int>, all, acc, cur,
+ !listconcat(acc, !if(!lt(cur, N), [cur], [])));
+}
+
+let Namespace = "AMDGPU" in {
+
+def lo16 : SubRegIndex<16, 0>;
+def hi16 : SubRegIndex<16, 16>;
+
+foreach Index = 0-31 in {
+ def sub#Index : SubRegIndex<32, !shl(Index, 5)>;
+}
+
+foreach Index = 1-31 in {
+ def sub#Index#_lo16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), lo16>;
+ def sub#Index#_hi16 : ComposedSubRegIndex<!cast<SubRegIndex>(sub#Index), hi16>;
+}
+
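+// Define tuple subreg indexes of 2-6, 8, and 16 DWORDs at every DWORD offset
+// that fits, e.g. sub0_sub1 (64 bits at offset 0) or sub4_sub5_sub6_sub7
+// (128 bits at bit offset 128).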
+foreach Size = {2-6,8,16} in {
+ foreach Index = Indexes<!add(33, !mul(Size, -1))>.slice in {
+ def !foldl("", Indexes<Size>.slice, acc, cur,
+ !strconcat(acc#!if(!eq(acc,""),"","_"), "sub"#!add(cur, Index))) :
+ SubRegIndex<!mul(Size, 32), !shl(Index, 5)> {
+ let CoveringSubRegIndices =
+ !foldl([]<SubRegIndex>, Indexes<Size>.slice, acc, cur,
+ !listconcat(acc, [!cast<SubRegIndex>(sub#!add(cur, Index))]));
+ }
+ }
+}
+
+}
+
+//===----------------------------------------------------------------------===//
// Helpers
//===----------------------------------------------------------------------===//
@@ -15,6 +59,7 @@ class getSubRegs<int size> {
list<SubRegIndex> ret3 = [sub0, sub1, sub2];
list<SubRegIndex> ret4 = [sub0, sub1, sub2, sub3];
list<SubRegIndex> ret5 = [sub0, sub1, sub2, sub3, sub4];
+ list<SubRegIndex> ret6 = [sub0, sub1, sub2, sub3, sub4, sub5];
list<SubRegIndex> ret8 = [sub0, sub1, sub2, sub3, sub4, sub5, sub6, sub7];
list<SubRegIndex> ret16 = [sub0, sub1, sub2, sub3,
sub4, sub5, sub6, sub7,
@@ -33,8 +78,10 @@ class getSubRegs<int size> {
!if(!eq(size, 3), ret3,
!if(!eq(size, 4), ret4,
!if(!eq(size, 5), ret5,
- !if(!eq(size, 8), ret8,
- !if(!eq(size, 16), ret16, ret32))))));
+ !if(!eq(size, 6), ret6,
+ !if(!eq(size, 8), ret8,
+ !if(!eq(size, 16), ret16,
+ ret32)))))));
}
// Generates list of sequential register tuple names.
@@ -74,39 +121,69 @@ class SIRegisterTuples<list<SubRegIndex> Indices, RegisterClass RC,
// Declarations that describe the SI registers
//===----------------------------------------------------------------------===//
class SIReg <string n, bits<16> regIdx = 0> :
- Register<n>,
- DwarfRegNum<[!cast<int>(HWEncoding)]> {
+ Register<n> {
let Namespace = "AMDGPU";
-
- // This is the not yet the complete register encoding. An additional
- // bit is set for VGPRs.
let HWEncoding = regIdx;
}
+class SIRegWithSubRegs <string n, list<Register> subregs, bits<16> regIdx> :
+ RegisterWithSubRegs<n, subregs> {
+}
+
+multiclass SIRegLoHi16 <string n, bits<16> regIdx, bit ArtificialHigh = 1,
+ bit HWEncodingHigh = 0> {
+ // There is no special encoding for 16 bit subregs, these are not real
+ // registers but rather operands for instructions preserving other 16 bits
+ // of the result or reading just 16 bits of a 32 bit VGPR.
+ // It is encoded as a corresponding 32 bit register.
+ // Non-VGPR register classes use it as we need to have matching subregisters
+ // to move instructions and data between ALUs.
+ def _LO16 : SIReg<n#".l", regIdx> {
+ let HWEncoding{8} = HWEncodingHigh;
+ }
+ def _HI16 : SIReg<!if(ArtificialHigh, "", n#".h"), regIdx> {
+ let isArtificial = ArtificialHigh;
+ let HWEncoding{8} = HWEncodingHigh;
+ }
+ def "" : RegisterWithSubRegs<n, [!cast<Register>(NAME#"_LO16"),
+ !cast<Register>(NAME#"_HI16")]> {
+ let Namespace = "AMDGPU";
+ let SubRegIndices = [lo16, hi16];
+ let CoveredBySubRegs = !if(ArtificialHigh,0,1);
+ let HWEncoding = regIdx;
+ let HWEncoding{8} = HWEncodingHigh;
+ }
+}
+
// Special Registers
-def VCC_LO : SIReg<"vcc_lo", 106>;
-def VCC_HI : SIReg<"vcc_hi", 107>;
+defm VCC_LO : SIRegLoHi16<"vcc_lo", 106>;
+defm VCC_HI : SIRegLoHi16<"vcc_hi", 107>;
// Pseudo-registers: Used as placeholders during isel and immediately
// replaced, never seeing the verifier.
def PRIVATE_RSRC_REG : SIReg<"private_rsrc", 0>;
def FP_REG : SIReg<"fp", 0>;
def SP_REG : SIReg<"sp", 0>;
-def SCRATCH_WAVE_OFFSET_REG : SIReg<"scratch_wave_offset", 0>;
+
+// Pseudo-register to represent the program-counter DWARF register.
+def PC_REG : SIReg<"pc", 0>, DwarfRegNum<[16, 16]> {
+ // There is no physical register corresponding to a "program counter", but
+ // we need to encode the concept in debug information in order to represent
+ // things like the return value in unwind information.
+ let isArtificial = 1;
+}
// VCC for 64-bit instructions
-def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]>,
- DwarfRegAlias<VCC_LO> {
+def VCC : RegisterWithSubRegs<"vcc", [VCC_LO, VCC_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 106;
}
-def EXEC_LO : SIReg<"exec_lo", 126>;
-def EXEC_HI : SIReg<"exec_hi", 127>;
+defm EXEC_LO : SIRegLoHi16<"exec_lo", 126>, DwarfRegNum<[1, 1]>;
+defm EXEC_HI : SIRegLoHi16<"exec_hi", 127>;
-def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
- DwarfRegAlias<EXEC_LO> {
+def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>, DwarfRegNum<[17, 1]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 126;
@@ -114,71 +191,76 @@ def EXEC : RegisterWithSubRegs<"exec", [EXEC_LO, EXEC_HI]>,
// 32-bit real registers, for MC only.
// May be used with both 32-bit and 64-bit operands.
-def SRC_VCCZ : SIReg<"src_vccz", 251>;
-def SRC_EXECZ : SIReg<"src_execz", 252>;
-def SRC_SCC : SIReg<"src_scc", 253>;
+defm SRC_VCCZ : SIRegLoHi16<"src_vccz", 251>;
+defm SRC_EXECZ : SIRegLoHi16<"src_execz", 252>;
+defm SRC_SCC : SIRegLoHi16<"src_scc", 253>;
// 1-bit pseudo register, for codegen only.
// Should never be emitted.
def SCC : SIReg<"scc">;
-def M0 : SIReg <"m0", 124>;
-def SGPR_NULL : SIReg<"null", 125>;
+defm M0 : SIRegLoHi16 <"m0", 124>;
+defm SGPR_NULL : SIRegLoHi16 <"null", 125>;
-def SRC_SHARED_BASE : SIReg<"src_shared_base", 235>;
-def SRC_SHARED_LIMIT : SIReg<"src_shared_limit", 236>;
-def SRC_PRIVATE_BASE : SIReg<"src_private_base", 237>;
-def SRC_PRIVATE_LIMIT : SIReg<"src_private_limit", 238>;
-def SRC_POPS_EXITING_WAVE_ID : SIReg<"src_pops_exiting_wave_id", 239>;
+defm SRC_SHARED_BASE : SIRegLoHi16<"src_shared_base", 235>;
+defm SRC_SHARED_LIMIT : SIRegLoHi16<"src_shared_limit", 236>;
+defm SRC_PRIVATE_BASE : SIRegLoHi16<"src_private_base", 237>;
+defm SRC_PRIVATE_LIMIT : SIRegLoHi16<"src_private_limit", 238>;
+defm SRC_POPS_EXITING_WAVE_ID : SIRegLoHi16<"src_pops_exiting_wave_id", 239>;
-def LDS_DIRECT : SIReg <"src_lds_direct", 254>;
+// Not addressable
+def MODE : SIReg <"mode", 0>;
-def XNACK_MASK_LO : SIReg<"xnack_mask_lo", 104>;
-def XNACK_MASK_HI : SIReg<"xnack_mask_hi", 105>;
+def LDS_DIRECT : SIReg <"src_lds_direct", 254> {
+ // There is no physical register corresponding to this. This is an
+ // encoding value in a source field, which will ultimately trigger a
+ // read from m0.
+ let isArtificial = 1;
+}
-def XNACK_MASK : RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]>,
- DwarfRegAlias<XNACK_MASK_LO> {
+defm XNACK_MASK_LO : SIRegLoHi16<"xnack_mask_lo", 104>;
+defm XNACK_MASK_HI : SIRegLoHi16<"xnack_mask_hi", 105>;
+
+def XNACK_MASK :
+ RegisterWithSubRegs<"xnack_mask", [XNACK_MASK_LO, XNACK_MASK_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 104;
}
// Trap handler registers
-def TBA_LO : SIReg<"tba_lo", 108>;
-def TBA_HI : SIReg<"tba_hi", 109>;
+defm TBA_LO : SIRegLoHi16<"tba_lo", 108>;
+defm TBA_HI : SIRegLoHi16<"tba_hi", 109>;
-def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]>,
- DwarfRegAlias<TBA_LO> {
+def TBA : RegisterWithSubRegs<"tba", [TBA_LO, TBA_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 108;
}
-def TMA_LO : SIReg<"tma_lo", 110>;
-def TMA_HI : SIReg<"tma_hi", 111>;
+defm TMA_LO : SIRegLoHi16<"tma_lo", 110>;
+defm TMA_HI : SIRegLoHi16<"tma_hi", 111>;
-def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]>,
- DwarfRegAlias<TMA_LO> {
+def TMA : RegisterWithSubRegs<"tma", [TMA_LO, TMA_HI]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = 110;
}
foreach Index = 0-15 in {
- def TTMP#Index#_vi : SIReg<"ttmp"#Index, !add(112, Index)>;
- def TTMP#Index#_gfx9_gfx10 : SIReg<"ttmp"#Index, !add(108, Index)>;
- def TTMP#Index : SIReg<"ttmp"#Index, 0>;
+ defm TTMP#Index#_vi : SIRegLoHi16<"ttmp"#Index, !add(112, Index)>;
+ defm TTMP#Index#_gfx9_gfx10 : SIRegLoHi16<"ttmp"#Index, !add(108, Index)>;
+ defm TTMP#Index : SIRegLoHi16<"ttmp"#Index, 0>;
}
multiclass FLAT_SCR_LOHI_m <string n, bits<16> ci_e, bits<16> vi_e> {
- def _ci : SIReg<n, ci_e>;
- def _vi : SIReg<n, vi_e>;
- def "" : SIReg<n, 0>;
+ defm _ci : SIRegLoHi16<n, ci_e>;
+ defm _vi : SIRegLoHi16<n, vi_e>;
+ defm "" : SIRegLoHi16<n, 0>;
}
class FlatReg <Register lo, Register hi, bits<16> encoding> :
- RegisterWithSubRegs<"flat_scratch", [lo, hi]>,
- DwarfRegAlias<lo> {
+ RegisterWithSubRegs<"flat_scratch", [lo, hi]> {
let Namespace = "AMDGPU";
let SubRegIndices = [sub0, sub1];
let HWEncoding = encoding;
@@ -193,21 +275,24 @@ def FLAT_SCR : FlatReg<FLAT_SCR_LO, FLAT_SCR_HI, 0>;
// SGPR registers
foreach Index = 0-105 in {
- def SGPR#Index : SIReg <"s"#Index, Index>;
+ defm SGPR#Index :
+ SIRegLoHi16 <"s"#Index, Index>,
+ DwarfRegNum<[!if(!le(Index, 63), !add(Index, 32), !add(Index, 1024)),
+ !if(!le(Index, 63), !add(Index, 32), !add(Index, 1024))]>;
}
// VGPR registers
foreach Index = 0-255 in {
- def VGPR#Index : SIReg <"v"#Index, Index> {
- let HWEncoding{8} = 1;
- }
+ defm VGPR#Index :
+ SIRegLoHi16 <"v"#Index, Index, 0, 1>,
+ DwarfRegNum<[!add(Index, 2560), !add(Index, 1536)]>;
}
// AccVGPR registers
foreach Index = 0-255 in {
- def AGPR#Index : SIReg <"a"#Index, Index> {
- let HWEncoding{8} = 1;
- }
+ defm AGPR#Index :
+ SIRegLoHi16 <"a"#Index, Index, 1, 1>,
+ DwarfRegNum<[!add(Index, 3072), !add(Index, 2048)]>;
}
//===----------------------------------------------------------------------===//
@@ -224,14 +309,35 @@ def M0_CLASS : RegisterClass<"AMDGPU", [i32], 32, (add M0)> {
let isAllocatable = 0;
}
+def M0_CLASS_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16, (add M0_LO16)> {
+ let CopyCost = 1;
+ let Size = 16;
+ let isAllocatable = 0;
+}
+
// TODO: Do we need to set DwarfRegAlias on register tuples?
+def SGPR_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add (sequence "SGPR%u_LO16", 0, 105))> {
+ let AllocationPriority = 9;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
+def SGPR_HI16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add (sequence "SGPR%u_HI16", 0, 105))> {
+ let isAllocatable = 0;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
// SGPR 32-bit registers
def SGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "SGPR%u", 0, 105))> {
// Give all SGPR classes higher priority than VGPR classes, because
// we want to spill SGPRs to VGPRs.
let AllocationPriority = 9;
+ let GeneratePressureSet = 0;
}
// SGPR 64-bit registers
@@ -246,6 +352,9 @@ def SGPR_128Regs : SIRegisterTuples<getSubRegs<4>.ret, SGPR_32, 105, 4, 4, "s">;
// SGPR 160-bit registers. No operations use these, but for symmetry with 160-bit VGPRs.
def SGPR_160Regs : SIRegisterTuples<getSubRegs<5>.ret, SGPR_32, 105, 4, 5, "s">;
+// SGPR 192-bit registers
+def SGPR_192Regs : SIRegisterTuples<getSubRegs<6>.ret, SGPR_32, 105, 4, 6, "s">;
+
// SGPR 256-bit registers
def SGPR_256Regs : SIRegisterTuples<getSubRegs<8>.ret, SGPR_32, 105, 4, 8, "s">;
@@ -261,6 +370,12 @@ def TTMP_32 : RegisterClass<"AMDGPU", [i32, f32, v2i16, v2f16], 32,
let isAllocatable = 0;
}
+def TTMP_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add (sequence "TTMP%u_LO16", 0, 15))> {
+ let Size = 16;
+ let isAllocatable = 0;
+}
+
// Trap handler TMP 64-bit registers
def TTMP_64Regs : SIRegisterTuples<getSubRegs<2>.ret, TTMP_32, 15, 2, 2, "ttmp">;
@@ -357,6 +472,19 @@ class RegisterTypes<list<ValueType> reg_types> {
def Reg16Types : RegisterTypes<[i16, f16]>;
def Reg32Types : RegisterTypes<[i32, f32, v2i16, v2f16, p2, p3, p5, p6]>;
+def VGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (sequence "VGPR%u_LO16", 0, 255))> {
+ let AllocationPriority = 1;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
+def VGPR_HI16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (sequence "VGPR%u_HI16", 0, 255))> {
+ let AllocationPriority = 1;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
// VGPR 32-bit registers
// i16/f16 only on VI+
@@ -364,6 +492,7 @@ def VGPR_32 : RegisterClass<"AMDGPU", !listconcat(Reg32Types.types, Reg16Types.t
(add (sequence "VGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
+ let Weight = 1;
}
// VGPR 64-bit registers
@@ -378,6 +507,9 @@ def VGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, VGPR_32, 255, 1, 4, "v">;
// VGPR 160-bit registers
def VGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, VGPR_32, 255, 1, 5, "v">;
+// VGPR 192-bit registers
+def VGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, VGPR_32, 255, 1, 6, "v">;
+
// VGPR 256-bit registers
def VGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, VGPR_32, 255, 1, 8, "v">;
@@ -387,19 +519,39 @@ def VGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, VGPR_32, 255, 1, 16, "v">;
// VGPR 1024-bit registers
def VGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, VGPR_32, 255, 1, 32, "v">;
+def AGPR_LO16 : RegisterClass<"AMDGPU", Reg16Types.types, 16,
+ (add (sequence "AGPR%u_LO16", 0, 255))> {
+ let isAllocatable = 0;
+ let Size = 16;
+ let GeneratePressureSet = 0;
+}
+
// AccVGPR 32-bit registers
def AGPR_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
(add (sequence "AGPR%u", 0, 255))> {
let AllocationPriority = 1;
let Size = 32;
+ let Weight = 1;
}
// AGPR 64-bit registers
def AGPR_64 : SIRegisterTuples<getSubRegs<2>.ret, AGPR_32, 255, 1, 2, "a">;
+// AGPR 96-bit registers
+def AGPR_96 : SIRegisterTuples<getSubRegs<3>.ret, AGPR_32, 255, 1, 3, "a">;
+
// AGPR 128-bit registers
def AGPR_128 : SIRegisterTuples<getSubRegs<4>.ret, AGPR_32, 255, 1, 4, "a">;
+// AGPR 160-bit registers
+def AGPR_160 : SIRegisterTuples<getSubRegs<5>.ret, AGPR_32, 255, 1, 5, "a">;
+
+// AGPR 192-bit registers
+def AGPR_192 : SIRegisterTuples<getSubRegs<6>.ret, AGPR_32, 255, 1, 6, "a">;
+
+// AGPR 256-bit registers
+def AGPR_256 : SIRegisterTuples<getSubRegs<8>.ret, AGPR_32, 255, 1, 8, "a">;
+
// AGPR 512-bit registers
def AGPR_512 : SIRegisterTuples<getSubRegs<16>.ret, AGPR_32, 255, 1, 16, "a">;
@@ -411,7 +563,7 @@ def AGPR_1024 : SIRegisterTuples<getSubRegs<32>.ret, AGPR_32, 255, 1, 32, "a">;
//===----------------------------------------------------------------------===//
def Pseudo_SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
- (add FP_REG, SP_REG, SCRATCH_WAVE_OFFSET_REG)> {
+ (add FP_REG, SP_REG)> {
let isAllocatable = 0;
let CopyCost = -1;
}
@@ -422,12 +574,13 @@ def Pseudo_SReg_128 : RegisterClass<"AMDGPU", [v4i32, v2i64, v2f64], 32,
let CopyCost = -1;
}
-def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+def LDS_DIRECT_CLASS : RegisterClass<"AMDGPU", [i32], 32,
(add LDS_DIRECT)> {
let isAllocatable = 0;
let CopyCost = -1;
}
+let GeneratePressureSet = 0 in {
// Subset of SReg_32 without M0 for SMRD instructions and alike.
// See comments in SIInstructions.td for more info.
def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
@@ -438,24 +591,54 @@ def SReg_32_XM0_XEXEC : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f1
let AllocationPriority = 10;
}
+def SReg_LO16_XM0_XEXEC : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SGPR_LO16, VCC_LO_LO16, VCC_HI_LO16, FLAT_SCR_LO_LO16, FLAT_SCR_HI_LO16,
+ XNACK_MASK_LO_LO16, XNACK_MASK_HI_LO16, SGPR_NULL_LO16, TTMP_LO16, TMA_LO_LO16,
+ TMA_HI_LO16, TBA_LO_LO16, TBA_HI_LO16, SRC_SHARED_BASE_LO16,
+ SRC_SHARED_LIMIT_LO16, SRC_PRIVATE_BASE_LO16, SRC_PRIVATE_LIMIT_LO16,
+ SRC_POPS_EXITING_WAVE_ID_LO16, SRC_VCCZ_LO16, SRC_EXECZ_LO16, SRC_SCC_LO16)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+
def SReg_32_XEXEC_HI : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, M0_CLASS)> {
let AllocationPriority = 10;
}
+def SReg_LO16_XEXEC_HI : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, M0_CLASS_LO16)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+
def SReg_32_XM0 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0_XEXEC, EXEC_LO, EXEC_HI)> {
let AllocationPriority = 10;
}
+def SReg_LO16_XM0 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SReg_LO16_XM0_XEXEC, EXEC_LO_LO16, EXEC_HI_LO16)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+
+def SReg_LO16 : RegisterClass<"AMDGPU", [i16, f16], 16,
+ (add SGPR_LO16, SReg_LO16_XM0, M0_CLASS_LO16, EXEC_LO_LO16, EXEC_HI_LO16, SReg_LO16_XEXEC_HI)> {
+ let Size = 16;
+ let AllocationPriority = 10;
+}
+} // End GeneratePressureSet = 0
+
// Register class for all scalar registers (SGPRs + Special Registers)
def SReg_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
(add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI)> {
let AllocationPriority = 10;
}
-def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16, i1], 32,
- (add SReg_32_XM0, M0_CLASS, EXEC_LO, EXEC_HI, SReg_32_XEXEC_HI, LDS_DIRECT_CLASS)> {
+let GeneratePressureSet = 0 in {
+def SRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
+ (add SReg_32, LDS_DIRECT_CLASS)> {
let isAllocatable = 0;
}
@@ -528,7 +711,6 @@ def TTMP_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64], 32,
def SReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
(add SGPR_128, TTMP_128)> {
- let AllocationPriority = 15;
let isAllocatable = 0;
}
@@ -543,39 +725,50 @@ def SGPR_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
def SReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
(add SGPR_160)> {
- let AllocationPriority = 16;
+ // FIXME: Should be isAllocatable = 0, but that causes all TableGen-generated
+ // subclasses of SGPR_160 to be marked unallocatable too.
}
-def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add SGPR_256Regs)> {
+def SGPR_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192Regs)> {
+ let Size = 192;
let AllocationPriority = 17;
}
-def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32, (add TTMP_256Regs)> {
+def SReg_192 : RegisterClass<"AMDGPU", [untyped], 32, (add SGPR_192)> {
+ let Size = 192;
+ let isAllocatable = 0;
+}
+
+def SGPR_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add SGPR_256Regs)> {
+ let AllocationPriority = 18;
+}
+
+def TTMP_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32, (add TTMP_256Regs)> {
let isAllocatable = 0;
}
-def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
+def SReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32, v4i64, v4f64], 32,
(add SGPR_256, TTMP_256)> {
// Requires 4 s_mov_b64 to copy
let CopyCost = 4;
- let AllocationPriority = 17;
+ let isAllocatable = 0;
}
-def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+def SGPR_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add SGPR_512Regs)> {
- let AllocationPriority = 18;
+ let AllocationPriority = 19;
}
-def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+def TTMP_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add TTMP_512Regs)> {
let isAllocatable = 0;
}
-def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
+def SReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32, v8i64, v8f64], 32,
(add SGPR_512, TTMP_512)> {
// Requires 8 s_mov_b64 to copy
let CopyCost = 8;
- let AllocationPriority = 18;
+ let isAllocatable = 0;
}
def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 32,
@@ -583,105 +776,55 @@ def VRegOrLds_32 : RegisterClass<"AMDGPU", [i32, f32, i16, f16, v2i16, v2f16], 3
let isAllocatable = 0;
}
-def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+def SGPR_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
(add SGPR_1024Regs)> {
- let AllocationPriority = 19;
+ let AllocationPriority = 20;
}
-def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
+def SReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32, v16i64, v16f64], 32,
(add SGPR_1024)> {
let CopyCost = 16;
- let AllocationPriority = 19;
-}
-
-// Register class for all vector registers (VGPRs + Interploation Registers)
-def VReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4], 32,
- (add VGPR_64)> {
- let Size = 64;
-
- // Requires 2 v_mov_b32 to copy
- let CopyCost = 2;
- let AllocationPriority = 2;
-}
-
-def VReg_96 : RegisterClass<"AMDGPU", [v3i32, v3f32], 32, (add VGPR_96)> {
- let Size = 96;
-
- // Requires 3 v_mov_b32 to copy
- let CopyCost = 3;
- let AllocationPriority = 3;
-}
-
-def VReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add VGPR_128)> {
- let Size = 128;
-
- // Requires 4 v_mov_b32 to copy
- let CopyCost = 4;
- let AllocationPriority = 4;
-}
-
-def VReg_160 : RegisterClass<"AMDGPU", [v5i32, v5f32], 32,
- (add VGPR_160)> {
- let Size = 160;
-
- // Requires 5 v_mov_b32 to copy
- let CopyCost = 5;
- let AllocationPriority = 5;
-}
-
-def VReg_256 : RegisterClass<"AMDGPU", [v8i32, v8f32], 32,
- (add VGPR_256)> {
- let Size = 256;
- let CopyCost = 8;
- let AllocationPriority = 6;
-}
-
-def VReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add VGPR_512)> {
- let Size = 512;
- let CopyCost = 16;
- let AllocationPriority = 7;
-}
-
-def VReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add VGPR_1024)> {
- let Size = 1024;
- let CopyCost = 32;
- let AllocationPriority = 8;
+ let isAllocatable = 0;
}
-def AReg_64 : RegisterClass<"AMDGPU", [i64, f64, v2i32, v2f32, v4f16, v4i16], 32,
- (add AGPR_64)> {
- let Size = 64;
+// Register class for all vector registers (VGPRs + Interpolation Registers)
+class VRegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+ RegisterClass<"AMDGPU", regTypes, 32, regList> {
+ let Size = !mul(numRegs, 32);
- let CopyCost = 5;
- let AllocationPriority = 2;
+ // Requires n v_mov_b32 to copy
+ let CopyCost = numRegs;
+ let AllocationPriority = numRegs;
+ let Weight = numRegs;
}
-def AReg_128 : RegisterClass<"AMDGPU", [v4i32, v4f32, v2i64, v2f64], 32,
- (add AGPR_128)> {
- let Size = 128;
+def VReg_64 : VRegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16, p0, p1, p4],
+ (add VGPR_64)>;
+def VReg_96 : VRegClass<3, [v3i32, v3f32], (add VGPR_96)>;
+def VReg_128 : VRegClass<4, [v4i32, v4f32, v2i64, v2f64, i128], (add VGPR_128)>;
+def VReg_160 : VRegClass<5, [v5i32, v5f32], (add VGPR_160)>;
+def VReg_192 : VRegClass<6, [untyped], (add VGPR_192)>;
+def VReg_256 : VRegClass<8, [v8i32, v8f32, v4i64, v4f64], (add VGPR_256)>;
+def VReg_512 : VRegClass<16, [v16i32, v16f32, v8i64, v8f64], (add VGPR_512)>;
+def VReg_1024 : VRegClass<32, [v32i32, v32f32, v16i64, v16f64], (add VGPR_1024)>;
- // Requires 4 v_accvgpr_write and 4 v_accvgpr_read to copy + burn 1 vgpr
- let CopyCost = 9;
- let AllocationPriority = 4;
+class ARegClass<int numRegs, list<ValueType> regTypes, dag regList> :
+ VRegClass<numRegs, regTypes, regList> {
+ // Requires n v_accvgpr_write and n v_accvgpr_read to copy + burn 1 vgpr
+ let CopyCost = !add(numRegs, numRegs, 1);
}
-def AReg_512 : RegisterClass<"AMDGPU", [v16i32, v16f32], 32,
- (add AGPR_512)> {
- let Size = 512;
- let CopyCost = 33;
- let AllocationPriority = 7;
-}
-
-def AReg_1024 : RegisterClass<"AMDGPU", [v32i32, v32f32], 32,
- (add AGPR_1024)> {
- let Size = 1024;
- let CopyCost = 65;
- let AllocationPriority = 8;
-}
+def AReg_64 : ARegClass<2, [i64, f64, v2i32, v2f32, v4f16, v4i16],
+ (add AGPR_64)>;
+def AReg_96 : ARegClass<3, [v3i32, v3f32], (add AGPR_96)>;
+def AReg_128 : ARegClass<4, [v4i32, v4f32, v2i64, v2f64], (add AGPR_128)>;
+def AReg_160 : ARegClass<5, [v5i32, v5f32], (add AGPR_160)>;
+def AReg_192 : ARegClass<6, [untyped], (add AGPR_192)>;
+def AReg_256 : ARegClass<8, [v8i32, v8f32, v4i64, v4f64], (add AGPR_256)>;
+def AReg_512 : ARegClass<16, [v16i32, v16f32, v8i64, v8f64], (add AGPR_512)>;
+def AReg_1024 : ARegClass<32, [v32i32, v32f32, v16i64, v16f64], (add AGPR_1024)>;
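// Worked example of the formulas above (illustrative only): AReg_128 uses
// numRegs = 4, so Size = !mul(4, 32) = 128 and CopyCost = !add(4, 4, 1) = 9,
// matching the hand-written AReg_128 definition this replaces.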
+} // End GeneratePressureSet = 0
// This is not a real register. This is just to have a register to add
// to VReg_1 that does not alias any real register that would
@@ -690,6 +833,7 @@ def ARTIFICIAL_VGPR : SIReg <"invalid vgpr", 0> {
let isArtificial = 1;
}
+let GeneratePressureSet = 0 in {
// FIXME: Should specify an empty set for this. No register should
// ever be allocated using VReg_1. This is a hack for SelectionDAG
// that should always be lowered by SILowerI1Copies. TableGen crashes
@@ -718,6 +862,7 @@ def AV_64 : RegisterClass<"AMDGPU", [i64, f64, v4f16], 32,
(add AReg_64, VReg_64)> {
let isAllocatable = 0;
}
+} // End GeneratePressureSet = 0
//===----------------------------------------------------------------------===//
// Register operands
diff --git a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
index 51779e97ac62..64fca0b46797 100644
--- a/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRemoveShortExecBranches.cpp
@@ -88,15 +88,17 @@ bool SIRemoveShortExecBranches::mustRetainExeczBranch(
for (MachineBasicBlock::const_iterator I = MBB.begin(), E = MBB.end();
I != E; ++I) {
// When a uniform loop is inside non-uniform control flow, the branch
- // leaving the loop might be an S_CBRANCH_VCCNZ, which is never taken
- // when EXEC = 0. We should skip the loop lest it becomes infinite.
- if (I->getOpcode() == AMDGPU::S_CBRANCH_VCCNZ ||
- I->getOpcode() == AMDGPU::S_CBRANCH_VCCZ)
+ // leaving the loop might never be taken when EXEC = 0.
+    // Hence we should retain the cbranch out of the loop lest it become infinite.
+ if (I->isConditionalBranch())
return true;
if (TII->hasUnwantedEffectsWhenEXECEmpty(*I))
return true;
+ if (TII->isKillTerminator(I->getOpcode()))
+ return true;
+
// These instructions are potentially expensive even if EXEC = 0.
if (TII->isSMRD(*I) || TII->isVMEM(*I) || TII->isFLAT(*I) ||
I->getOpcode() == AMDGPU::S_WAITCNT)
diff --git a/llvm/lib/Target/AMDGPU/SISchedule.td b/llvm/lib/Target/AMDGPU/SISchedule.td
index 824d1aeb0df9..932381c99e0b 100644
--- a/llvm/lib/Target/AMDGPU/SISchedule.td
+++ b/llvm/lib/Target/AMDGPU/SISchedule.td
@@ -1,4 +1,4 @@
-//===-- SISchedule.td - SI Scheduling definitons -------------------------===//
+//===-- SISchedule.td - SI Scheduling definitions -------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -27,10 +27,14 @@ def WriteBarrier : SchedWrite;
def MIVGPRRead : SchedRead;
def MIMFMARead : SchedRead;
-// Vector ALU instructions
+// Normal 16 or 32 bit VALU instructions
def Write32Bit : SchedWrite;
+// Conversion to or from F32 (but not converting F64 to or from F32)
+def WriteFloatCvt : SchedWrite;
+// F16 or F32 transcendental instructions (these are quarter rate)
+def WriteTrans32 : SchedWrite;
+// Other quarter rate VALU instructions
def WriteQuarterRate32 : SchedWrite;
-def WriteFullOrQuarterRate32 : SchedWrite;
def WriteFloatFMA : SchedWrite;
@@ -43,6 +47,10 @@ def WriteDoubleAdd : SchedWrite;
// Conversion to or from f64 instruction
def WriteDoubleCvt : SchedWrite;
+// F64 "transcendental" (actually only reciprocal and/or square root)
+// instructions
+def WriteTrans64 : SchedWrite;
+
// Half rate 64-bit instructions.
def Write64Bit : SchedWrite;
@@ -56,7 +64,7 @@ def Write16PassMAI : SchedWrite;
// instructions)
class SISchedMachineModel : SchedMachineModel {
- let CompleteModel = 0;
+ let CompleteModel = 1;
// MicroOpBufferSize = 1 means that instructions will always be added
// the ready queue when they become available. This exposes them
// to the register pressure analysis.
@@ -127,6 +135,8 @@ multiclass SICommonWriteRes {
def : HWVALUWriteRes<Write32Bit, 1>;
def : HWVALUWriteRes<Write64Bit, 2>;
+ def : HWVALUWriteRes<WriteFloatCvt, 4>;
+ def : HWVALUWriteRes<WriteTrans32, 4>;
def : HWVALUWriteRes<WriteQuarterRate32, 4>;
def : HWVALUWriteRes<Write2PassMAI, 2>;
def : HWVALUWriteRes<Write8PassMAI, 8>;
@@ -135,9 +145,9 @@ multiclass SICommonWriteRes {
def : ReadAdvance<MIVGPRRead, -2>;
def : InstRW<[Write64Bit, MIReadVGPR], (instregex "^V_ACCVGPR_WRITE_B32$")>;
- // Technicaly mfma reads can be from 0 to 4 cycles but that does not make
+ // Technically mfma reads can be from 0 to 4 cycles but that does not make
// sense to model because its register setup is huge. In particular if we
- // properly model read advanice as -2 for a vgpr read it will result in a
+ // properly model read advance as -2 for a vgpr read it will result in a
// bad scheduling of acc writes before that mfma. To avoid it we would
// need to consume 2 or 4 more vgprs to be initialized before the acc
// write sequence. Just assume worst case here.
@@ -163,6 +173,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 1>;
def : HWVALUWriteRes<WriteDouble, 4>;
def : HWVALUWriteRes<WriteDoubleAdd, 2>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 4>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -176,6 +187,7 @@ def : HWVALUWriteRes<WriteFloatFMA, 16>;
def : HWVALUWriteRes<WriteDouble, 16>;
def : HWVALUWriteRes<WriteDoubleAdd, 8>;
def : HWVALUWriteRes<WriteDoubleCvt, 4>;
+def : HWVALUWriteRes<WriteTrans64, 16>;
def : InstRW<[WriteCopy], (instrs COPY)>;
@@ -186,17 +198,20 @@ let SchedModel = GFX10SpeedModel in {
// The latency values are 1 / (operations / cycle).
// Add 1 stall cycle for VGPR read.
def : HWWriteRes<Write32Bit, [HWVALU, HWRC], 5>;
-def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 9>;
-def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteFloatCvt, [HWVALU, HWRC], 5>;
+def : HWWriteRes<Write64Bit, [HWVALU, HWRC], 6>;
+def : HWWriteRes<WriteTrans32, [HWVALU, HWRC], 10>;
+def : HWWriteRes<WriteQuarterRate32, [HWVALU, HWRC], 8>;
def : HWWriteRes<WriteFloatFMA, [HWVALU, HWRC], 5>;
-def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 17>;
-def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 17>;
-def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 17>;
+def : HWWriteRes<WriteDouble, [HWVALU, HWRC], 22>;
+def : HWWriteRes<WriteDoubleAdd, [HWVALU, HWRC], 22>;
+def : HWWriteRes<WriteDoubleCvt, [HWVALU, HWRC], 22>;
+def : HWWriteRes<WriteTrans64, [HWVALU, HWRC], 24>;
def : HWWriteRes<WriteBranch, [HWBranch], 32>;
def : HWWriteRes<WriteExport, [HWExport, HWRC], 16>;
def : HWWriteRes<WriteLDS, [HWLGKM, HWRC], 20>;
-def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 5>;
+def : HWWriteRes<WriteSALU, [HWSALU, HWRC], 2>;
def : HWWriteRes<WriteSMEM, [HWLGKM, HWRC], 20>;
def : HWWriteRes<WriteVMEM, [HWVMEM, HWRC], 320>;
def : HWWriteRes<WriteBarrier, [HWBranch], 2000>;
diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
index 3986ca6dfa81..9c6833a7dab6 100644
--- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
+++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp
@@ -185,6 +185,11 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
if (!MI.getOperand(0).isReg())
TII->commuteInstruction(MI, false, 0, 1);
+ // cmpk requires src0 to be a register
+ const MachineOperand &Src0 = MI.getOperand(0);
+ if (!Src0.isReg())
+ return;
+
const MachineOperand &Src1 = MI.getOperand(1);
if (!Src1.isImm())
return;
@@ -220,7 +225,7 @@ static void shrinkScalarCompare(const SIInstrInfo *TII, MachineInstr &MI) {
// Shrink NSA encoded instructions with contiguous VGPRs to non-NSA encoding.
void SIShrinkInstructions::shrinkMIMG(MachineInstr &MI) {
const AMDGPU::MIMGInfo *Info = AMDGPU::getMIMGInfo(MI.getOpcode());
- if (Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
+ if (!Info || Info->MIMGEncoding != AMDGPU::MIMGEncGfx10NSA)
return;
MachineFunction *MF = MI.getParent()->getParent();
@@ -323,60 +328,61 @@ static bool shrinkScalarLogicOp(const GCNSubtarget &ST,
MachineOperand *SrcReg = Src0;
MachineOperand *SrcImm = Src1;
- if (SrcImm->isImm() &&
- !AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm())) {
- uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
- uint32_t NewImm = 0;
-
- if (Opc == AMDGPU::S_AND_B32) {
- if (isPowerOf2_32(~Imm)) {
- NewImm = countTrailingOnes(Imm);
- Opc = AMDGPU::S_BITSET0_B32;
- } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
- NewImm = ~Imm;
- Opc = AMDGPU::S_ANDN2_B32;
- }
- } else if (Opc == AMDGPU::S_OR_B32) {
- if (isPowerOf2_32(Imm)) {
- NewImm = countTrailingZeros(Imm);
- Opc = AMDGPU::S_BITSET1_B32;
- } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
- NewImm = ~Imm;
- Opc = AMDGPU::S_ORN2_B32;
- }
- } else if (Opc == AMDGPU::S_XOR_B32) {
- if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
- NewImm = ~Imm;
- Opc = AMDGPU::S_XNOR_B32;
- }
- } else {
- llvm_unreachable("unexpected opcode");
- }
+ if (!SrcImm->isImm() ||
+ AMDGPU::isInlinableLiteral32(SrcImm->getImm(), ST.hasInv2PiInlineImm()))
+ return false;
+
+ uint32_t Imm = static_cast<uint32_t>(SrcImm->getImm());
+ uint32_t NewImm = 0;
- if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
- SrcImm == Src0) {
- if (!TII->commuteInstruction(MI, false, 1, 2))
- NewImm = 0;
+ if (Opc == AMDGPU::S_AND_B32) {
+ if (isPowerOf2_32(~Imm)) {
+ NewImm = countTrailingOnes(Imm);
+ Opc = AMDGPU::S_BITSET0_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ANDN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_OR_B32) {
+ if (isPowerOf2_32(Imm)) {
+ NewImm = countTrailingZeros(Imm);
+ Opc = AMDGPU::S_BITSET1_B32;
+ } else if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_ORN2_B32;
+ }
+ } else if (Opc == AMDGPU::S_XOR_B32) {
+ if (AMDGPU::isInlinableLiteral32(~Imm, ST.hasInv2PiInlineImm())) {
+ NewImm = ~Imm;
+ Opc = AMDGPU::S_XNOR_B32;
}
+ } else {
+ llvm_unreachable("unexpected opcode");
+ }
- if (NewImm != 0) {
- if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
- MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
- MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
- return true;
- }
+ if ((Opc == AMDGPU::S_ANDN2_B32 || Opc == AMDGPU::S_ORN2_B32) &&
+ SrcImm == Src0) {
+ if (!TII->commuteInstruction(MI, false, 1, 2))
+ NewImm = 0;
+ }
- if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
- MI.setDesc(TII->get(Opc));
- if (Opc == AMDGPU::S_BITSET0_B32 ||
- Opc == AMDGPU::S_BITSET1_B32) {
- Src0->ChangeToImmediate(NewImm);
- // Remove the immediate and add the tied input.
- MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
- MI.tieOperands(0, 2);
- } else {
- SrcImm->setImm(NewImm);
- }
+ if (NewImm != 0) {
+ if (Register::isVirtualRegister(Dest->getReg()) && SrcReg->isReg()) {
+ MRI.setRegAllocationHint(Dest->getReg(), 0, SrcReg->getReg());
+ MRI.setRegAllocationHint(SrcReg->getReg(), 0, Dest->getReg());
+ return true;
+ }
+
+ if (SrcReg->isReg() && SrcReg->getReg() == Dest->getReg()) {
+ MI.setDesc(TII->get(Opc));
+ if (Opc == AMDGPU::S_BITSET0_B32 ||
+ Opc == AMDGPU::S_BITSET1_B32) {
+ Src0->ChangeToImmediate(NewImm);
+ // Remove the immediate and add the tied input.
+ MI.getOperand(2).ChangeToRegister(Dest->getReg(), false);
+ MI.tieOperands(0, 2);
+ } else {
+ SrcImm->setImm(NewImm);
}
}
}
@@ -426,8 +432,7 @@ getSubRegForIndex(unsigned Reg, unsigned Sub, unsigned I,
if (Register::isPhysicalRegister(Reg)) {
Reg = TRI.getSubReg(Reg, TRI.getSubRegFromChannel(I));
} else {
- LaneBitmask LM = TRI.getSubRegIndexLaneMask(Sub);
- Sub = TRI.getSubRegFromChannel(I + countTrailingZeros(LM.getAsInteger()));
+ Sub = TRI.getSubRegFromChannel(I + TRI.getChannelFromSubReg(Sub));
}
}
return TargetInstrInfo::RegSubRegPair(Reg, Sub);
@@ -472,26 +477,30 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
if (!TRI.isVGPR(MRI, X))
return nullptr;
- for (MachineOperand &YTop : MRI.use_nodbg_operands(T)) {
- if (YTop.getSubReg() != Tsub)
- continue;
-
- MachineInstr &MovY = *YTop.getParent();
- if ((MovY.getOpcode() != AMDGPU::V_MOV_B32_e32 &&
- MovY.getOpcode() != AMDGPU::COPY) ||
- MovY.getOperand(1).getSubReg() != Tsub)
+ const unsigned SearchLimit = 16;
+ unsigned Count = 0;
+ for (auto Iter = std::next(MovT.getIterator()),
+ E = MovT.getParent()->instr_end();
+ Iter != E && Count < SearchLimit; ++Iter, ++Count) {
+
+ MachineInstr *MovY = &*Iter;
+ if ((MovY->getOpcode() != AMDGPU::V_MOV_B32_e32 &&
+ MovY->getOpcode() != AMDGPU::COPY) ||
+ !MovY->getOperand(1).isReg() ||
+ MovY->getOperand(1).getReg() != T ||
+ MovY->getOperand(1).getSubReg() != Tsub)
continue;
- Register Y = MovY.getOperand(0).getReg();
- unsigned Ysub = MovY.getOperand(0).getSubReg();
+ Register Y = MovY->getOperand(0).getReg();
+ unsigned Ysub = MovY->getOperand(0).getSubReg();
- if (!TRI.isVGPR(MRI, Y) || MovT.getParent() != MovY.getParent())
+ if (!TRI.isVGPR(MRI, Y))
continue;
MachineInstr *MovX = nullptr;
- auto I = std::next(MovT.getIterator()), E = MovT.getParent()->instr_end();
- for (auto IY = MovY.getIterator(); I != E && I != IY; ++I) {
- if (instReadsReg(&*I, X, Xsub, TRI) ||
+ for (auto IY = MovY->getIterator(), I = std::next(MovT.getIterator());
+ I != IY; ++I) {
+ if (instReadsReg(&*I, X, Xsub, TRI) ||
instModifiesReg(&*I, Y, Ysub, TRI) ||
instModifiesReg(&*I, T, Tsub, TRI) ||
(MovX && instModifiesReg(&*I, X, Xsub, TRI))) {
@@ -516,7 +525,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
MovX = &*I;
}
- if (!MovX || I == E)
+ if (!MovX)
continue;
LLVM_DEBUG(dbgs() << "Matched v_swap_b32:\n" << MovT << *MovX << MovY);
@@ -533,7 +542,7 @@ static MachineInstr* matchSwap(MachineInstr &MovT, MachineRegisterInfo &MRI,
.addReg(X1.Reg, 0, X1.SubReg).getInstr();
}
MovX->eraseFromParent();
- MovY.eraseFromParent();
+ MovY->eraseFromParent();
MachineInstr *Next = &*std::next(MovT.getIterator());
if (MRI.use_nodbg_empty(T))
MovT.eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
index 39f5df767977..b1c73df269fb 100644
--- a/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
+++ b/llvm/lib/Target/AMDGPU/SIWholeQuadMode.cpp
@@ -61,6 +61,7 @@
#include "SIInstrInfo.h"
#include "SIMachineFunctionInfo.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -154,7 +155,7 @@ private:
LiveIntervals *LIS;
DenseMap<const MachineInstr *, InstrInfo> Instructions;
- DenseMap<MachineBasicBlock *, BlockInfo> Blocks;
+ MapVector<MachineBasicBlock *, BlockInfo> Blocks;
SmallVector<MachineInstr *, 1> LiveMaskQueries;
SmallVector<MachineInstr *, 4> LowerToMovInstrs;
SmallVector<MachineInstr *, 4> LowerToCopyInstrs;
@@ -170,8 +171,6 @@ private:
void propagateBlock(MachineBasicBlock &MBB, std::vector<WorkItem> &Worklist);
char analyzeFunction(MachineFunction &MF);
- bool requiresCorrectState(const MachineInstr &MI) const;
-
MachineBasicBlock::iterator saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before);
MachineBasicBlock::iterator
@@ -525,36 +524,6 @@ char SIWholeQuadMode::analyzeFunction(MachineFunction &MF) {
return GlobalFlags;
}
-/// Whether \p MI really requires the exec state computed during analysis.
-///
-/// Scalar instructions must occasionally be marked WQM for correct propagation
-/// (e.g. thread masks leading up to branches), but when it comes to actual
-/// execution, they don't care about EXEC.
-bool SIWholeQuadMode::requiresCorrectState(const MachineInstr &MI) const {
- if (MI.isTerminator())
- return true;
-
- // Skip instructions that are not affected by EXEC
- if (TII->isScalarUnit(MI))
- return false;
-
- // Generic instructions such as COPY will either disappear by register
- // coalescing or be lowered to SALU or VALU instructions.
- if (MI.isTransient()) {
- if (MI.getNumExplicitOperands() >= 1) {
- const MachineOperand &Op = MI.getOperand(0);
- if (Op.isReg()) {
- if (TRI->isSGPRReg(*MRI, Op.getReg())) {
- // SGPR instructions are not affected by EXEC
- return false;
- }
- }
- }
- }
-
- return true;
-}
-
MachineBasicBlock::iterator
SIWholeQuadMode::saveSCC(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Before) {
@@ -741,7 +710,7 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (II != IE) {
MachineInstr &MI = *II;
- if (requiresCorrectState(MI)) {
+ if (MI.isTerminator() || TII->mayReadEXEC(*MRI, MI)) {
auto III = Instructions.find(&MI);
if (III != Instructions.end()) {
if (III->second.Needs & StateWWM)
@@ -793,18 +762,23 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
if (State == StateWWM) {
assert(SavedNonWWMReg);
fromWWM(MBB, Before, SavedNonWWMReg);
+ LIS->createAndComputeVirtRegInterval(SavedNonWWMReg);
+ SavedNonWWMReg = 0;
State = NonWWMState;
}
if (Needs == StateWWM) {
NonWWMState = State;
+ assert(!SavedNonWWMReg);
SavedNonWWMReg = MRI->createVirtualRegister(BoolRC);
toWWM(MBB, Before, SavedNonWWMReg);
State = StateWWM;
} else {
if (State == StateWQM && (Needs & StateExact) && !(Needs & StateWQM)) {
- if (!WQMFromExec && (OutNeeds & StateWQM))
+ if (!WQMFromExec && (OutNeeds & StateWQM)) {
+ assert(!SavedWQMReg);
SavedWQMReg = MRI->createVirtualRegister(BoolRC);
+ }
toExact(MBB, Before, SavedWQMReg, LiveMaskReg);
State = StateExact;
@@ -837,6 +811,8 @@ void SIWholeQuadMode::processBlock(MachineBasicBlock &MBB, unsigned LiveMaskReg,
break;
II = Next;
}
+ assert(!SavedWQMReg);
+ assert(!SavedNonWWMReg);
}
void SIWholeQuadMode::lowerLiveMaskQueries(unsigned LiveMaskReg) {
@@ -929,10 +905,12 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
if (GlobalFlags == StateWQM) {
// For a shader that needs only WQM, we can just set it once.
- BuildMI(Entry, EntryMI, DebugLoc(), TII->get(ST->isWave32() ?
- AMDGPU::S_WQM_B32 : AMDGPU::S_WQM_B64),
- Exec)
- .addReg(Exec);
+ auto MI = BuildMI(Entry, EntryMI, DebugLoc(),
+ TII->get(ST->isWave32() ? AMDGPU::S_WQM_B32
+ : AMDGPU::S_WQM_B64),
+ Exec)
+ .addReg(Exec);
+ LIS->InsertMachineInstrInMaps(*MI);
lowerCopyInstrs();
// EntryMI may become invalid here
@@ -948,6 +926,9 @@ bool SIWholeQuadMode::runOnMachineFunction(MachineFunction &MF) {
for (auto BII : Blocks)
processBlock(*BII.first, LiveMaskReg, BII.first == &*MF.begin());
+ if (LiveMaskReg)
+ LIS->createAndComputeVirtRegInterval(LiveMaskReg);
+
// Physical registers like SCC aren't tracked by default anyway, so just
// removing the ranges we computed is the simplest option for maintaining
// the analysis results.
diff --git a/llvm/lib/Target/AMDGPU/SMInstructions.td b/llvm/lib/Target/AMDGPU/SMInstructions.td
index 79982d96c2c8..70bf215c03f3 100644
--- a/llvm/lib/Target/AMDGPU/SMInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SMInstructions.td
@@ -1,4 +1,4 @@
-//===---- SMInstructions.td - Scalar Memory Instruction Defintions --------===//
+//===---- SMInstructions.td - Scalar Memory Instruction Definitions -------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -11,9 +11,11 @@ def smrd_offset_8 : NamedOperandU32<"SMRDOffset8",
let OperandType = "OPERAND_IMMEDIATE";
}
-def smrd_offset_20 : NamedOperandU32<"SMRDOffset20",
- NamedMatchClass<"SMRDOffset20">> {
+def smem_offset : NamedOperandU32<"SMEMOffset",
+ NamedMatchClass<"SMEMOffset">> {
let OperandType = "OPERAND_IMMEDIATE";
+ let EncoderMethod = "getSMEMOffsetEncoding";
+ let DecoderMethod = "decodeSMEMOffset";
}
//===----------------------------------------------------------------------===//
@@ -43,6 +45,7 @@ class SM_Pseudo <string opName, dag outs, dag ins, string asmOps, list<dag> patt
bit has_dlc = 0;
bits<1> has_offset = 1;
bits<1> offset_is_imm = 0;
+ bit is_buffer = 0;
}
class SM_Real <SM_Pseudo ps>
@@ -51,9 +54,15 @@ class SM_Real <SM_Pseudo ps>
let isPseudo = 0;
let isCodeGenOnly = 0;
+ Instruction Opcode = !cast<Instruction>(NAME);
+
// copy relevant pseudo op flags
let SubtargetPredicate = ps.SubtargetPredicate;
let AsmMatchConverter = ps.AsmMatchConverter;
+ let UseNamedOperandTable = ps.UseNamedOperandTable;
+ let SMRD = ps.SMRD;
+
+ bit is_buffer = ps.is_buffer;
// encoding
bits<7> sbase;
@@ -153,7 +162,7 @@ multiclass SM_Pseudo_Stores<string opName,
}
multiclass SM_Pseudo_Discards<string opName> {
- def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smrd_offset_20:$offset), 1>;
+ def _IMM : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, smem_offset:$offset), 1>;
def _SGPR : SM_Discard_Pseudo <opName, (ins SReg_64:$sbase, SReg_32:$offset), 0>;
}
@@ -178,14 +187,14 @@ class SM_Time_Pseudo<string opName, SDPatternOperator node = null_frag> : SM_Pse
class SM_Inval_Pseudo <string opName, SDPatternOperator node = null_frag> : SM_Pseudo<
opName, (outs), (ins), "", [(node)]> {
let hasSideEffects = 1;
- let mayStore = 1;
+ let mayStore = 0;
let has_sdst = 0;
let has_sbase = 0;
let has_offset = 0;
}
multiclass SM_Pseudo_Probe<string opName, RegisterClass baseClass> {
- def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smrd_offset_20:$offset), 1>;
+ def _IMM : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, smem_offset:$offset), 1>;
def _SGPR : SM_Probe_Pseudo <opName, (ins i8imm:$sdata, baseClass:$sbase, SReg_32:$offset), 0>;
}
@@ -228,7 +237,7 @@ class SM_Pseudo_Atomic<string opName,
SM_Atomic_Pseudo<opName,
!if(isRet, (outs dataClass:$sdst), (outs)),
!if(isImm,
- (ins dataClass:$sdata, baseClass:$sbase, smrd_offset_20:$offset, DLC:$dlc),
+ (ins dataClass:$sdata, baseClass:$sbase, smem_offset:$offset, DLC:$dlc),
(ins dataClass:$sdata, baseClass:$sbase, SReg_32:$offset, DLC:$dlc)),
!if(isRet, " $sdst", " $sdata") # ", $sbase, $offset" # !if(isRet, " glc", "") # "$dlc",
isRet> {
@@ -266,6 +275,7 @@ defm S_LOAD_DWORDX4 : SM_Pseudo_Loads <"s_load_dwordx4", SReg_64, SReg_128>;
defm S_LOAD_DWORDX8 : SM_Pseudo_Loads <"s_load_dwordx8", SReg_64, SReg_256>;
defm S_LOAD_DWORDX16 : SM_Pseudo_Loads <"s_load_dwordx16", SReg_64, SReg_512>;
+let is_buffer = 1 in {
defm S_BUFFER_LOAD_DWORD : SM_Pseudo_Loads <
"s_buffer_load_dword", SReg_128, SReg_32_XM0_XEXEC
>;
@@ -287,12 +297,14 @@ defm S_BUFFER_LOAD_DWORDX8 : SM_Pseudo_Loads <
defm S_BUFFER_LOAD_DWORDX16 : SM_Pseudo_Loads <
"s_buffer_load_dwordx16", SReg_128, SReg_512
>;
+}
let SubtargetPredicate = HasScalarStores in {
defm S_STORE_DWORD : SM_Pseudo_Stores <"s_store_dword", SReg_64, SReg_32_XM0_XEXEC>;
defm S_STORE_DWORDX2 : SM_Pseudo_Stores <"s_store_dwordx2", SReg_64, SReg_64_XEXEC>;
defm S_STORE_DWORDX4 : SM_Pseudo_Stores <"s_store_dwordx4", SReg_64, SReg_128>;
+let is_buffer = 1 in {
defm S_BUFFER_STORE_DWORD : SM_Pseudo_Stores <
"s_buffer_store_dword", SReg_128, SReg_32_XM0_XEXEC
>;
@@ -304,8 +316,10 @@ defm S_BUFFER_STORE_DWORDX2 : SM_Pseudo_Stores <
defm S_BUFFER_STORE_DWORDX4 : SM_Pseudo_Stores <
"s_buffer_store_dwordx4", SReg_128, SReg_128
>;
+}
} // End SubtargetPredicate = HasScalarStores
+let SubtargetPredicate = HasSMemTimeInst in
def S_MEMTIME : SM_Time_Pseudo <"s_memtime", int_amdgcn_s_memtime>;
def S_DCACHE_INV : SM_Inval_Pseudo <"s_dcache_inv", int_amdgcn_s_dcache_inv>;
@@ -321,13 +335,16 @@ def S_DCACHE_WB_VOL : SM_Inval_Pseudo <"s_dcache_wb_vol", int_amdgcn_s_dcache_wb
def S_MEMREALTIME : SM_Time_Pseudo <"s_memrealtime", int_amdgcn_s_memrealtime>;
defm S_ATC_PROBE : SM_Pseudo_Probe <"s_atc_probe", SReg_64>;
+let is_buffer = 1 in {
defm S_ATC_PROBE_BUFFER : SM_Pseudo_Probe <"s_atc_probe_buffer", SReg_128>;
+}
} // SubtargetPredicate = isGFX8Plus
-let SubtargetPredicate = isGFX10Plus in {
+let SubtargetPredicate = isGFX10Plus in
def S_GL1_INV : SM_Inval_Pseudo<"s_gl1_inv">;
+let SubtargetPredicate = HasGetWaveIdInst in
def S_GET_WAVEID_IN_WORKGROUP : SM_WaveId_Pseudo <"s_get_waveid_in_workgroup", int_amdgcn_s_get_waveid_in_workgroup>;
-} // End SubtargetPredicate = isGFX10Plus
+
let SubtargetPredicate = HasScalarFlatScratchInsts, Uses = [FLAT_SCR] in {
defm S_SCRATCH_LOAD_DWORD : SM_Pseudo_Loads <"s_scratch_load_dword", SReg_64, SReg_32_XM0_XEXEC>;
@@ -341,6 +358,7 @@ defm S_SCRATCH_STORE_DWORDX4 : SM_Pseudo_Stores <"s_scratch_store_dwordx4", SReg
let SubtargetPredicate = HasScalarAtomics in {
+let is_buffer = 1 in {
defm S_BUFFER_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_buffer_atomic_swap", SReg_128, SReg_32_XM0_XEXEC>;
defm S_BUFFER_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_buffer_atomic_cmpswap", SReg_128, SReg_64_XEXEC>;
defm S_BUFFER_ATOMIC_ADD : SM_Pseudo_Atomics <"s_buffer_atomic_add", SReg_128, SReg_32_XM0_XEXEC>;
@@ -368,6 +386,7 @@ defm S_BUFFER_ATOMIC_OR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_or_x2",
defm S_BUFFER_ATOMIC_XOR_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_xor_x2", SReg_128, SReg_64_XEXEC>;
defm S_BUFFER_ATOMIC_INC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_inc_x2", SReg_128, SReg_64_XEXEC>;
defm S_BUFFER_ATOMIC_DEC_X2 : SM_Pseudo_Atomics <"s_buffer_atomic_dec_x2", SReg_128, SReg_64_XEXEC>;
+}
defm S_ATOMIC_SWAP : SM_Pseudo_Atomics <"s_atomic_swap", SReg_64, SReg_32_XM0_XEXEC>;
defm S_ATOMIC_CMPSWAP : SM_Pseudo_Atomics <"s_atomic_cmpswap", SReg_64, SReg_64_XEXEC>;
@@ -481,14 +500,17 @@ class SMEM_Real_vi <bits<8> op, SM_Pseudo ps>
let Inst{17} = imm;
let Inst{25-18} = op;
let Inst{31-26} = 0x30; //encoding
- let Inst{51-32} = !if(ps.has_offset, offset{19-0}, ?);
+
+ // VI supports 20-bit unsigned offsets while GFX9+ supports 21-bit signed.
+ // Offset value is corrected accordingly when offset is encoded/decoded.
+ let Inst{52-32} = !if(ps.has_offset, offset{20-0}, ?);
}
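// A small worked example, assuming the 21-bit field is interpreted as signed on
// GFX9+: an immediate offset of -4 would be encoded as 0x1FFFFC in Inst{52-32},
// whereas VI treats the same field as a 20-bit unsigned offset.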
multiclass SM_Real_Loads_vi<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_vi : SMEM_Real_vi <op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_vi <op, sgprPs> {
let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
@@ -509,7 +531,7 @@ multiclass SM_Real_Stores_vi<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_vi : SMEM_Real_Store_vi <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_vi : SMEM_Real_Store_vi <op, sgprPs> {
@@ -665,12 +687,10 @@ class SMRD_Real_Load_IMM_ci <bits<5> op, SM_Load_Pseudo ps> :
let InOperandList = (ins ps.BaseClass:$sbase, smrd_literal_offset:$offset, GLC:$glc, DLC:$dlc);
let LGKM_CNT = ps.LGKM_CNT;
- let SMRD = ps.SMRD;
let mayLoad = ps.mayLoad;
let mayStore = ps.mayStore;
let hasSideEffects = ps.hasSideEffects;
let SchedRW = ps.SchedRW;
- let UseNamedOperandTable = ps.UseNamedOperandTable;
let Inst{7-0} = 0xff;
let Inst{8} = 0;
@@ -768,23 +788,26 @@ multiclass SMRD_Pattern <string Instr, ValueType vt> {
multiclass SMLoad_Pattern <string Instr, ValueType vt> {
// 1. Offset as an immediate
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), i1:$glc, i1:$dlc),
- (vt (!cast<SM_Pseudo>(Instr#"_IMM") $sbase, $offset, (as_i1imm $glc),
- (as_i1imm $dlc)))
- >;
+ (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm i32:$offset), timm:$cachepolicy),
+ (vt (!cast<SM_Pseudo>(Instr#"_IMM") SReg_128:$sbase, i32imm:$offset, (extract_glc $cachepolicy),
+ (extract_dlc $cachepolicy)))> {
+ let AddedComplexity = 2;
+ }
// 2. 32-bit IMM offset on CI
def : GCNPat <
- (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), i1:$glc, i1:$dlc)),
- (!cast<InstSI>(Instr#"_IMM_ci") $sbase, $offset, (as_i1imm $glc), (as_i1imm $dlc))> {
+ (vt (SIsbuffer_load v4i32:$sbase, (SMRDBufferImm32 i32:$offset), timm:$cachepolicy)),
+ (!cast<InstSI>(Instr#"_IMM_ci") SReg_128:$sbase, smrd_literal_offset:$offset,
+ (extract_glc $cachepolicy), (extract_dlc $cachepolicy))> {
let OtherPredicates = [isGFX7Only];
+ let AddedComplexity = 1;
}
// 3. Offset loaded in an 32bit SGPR
def : GCNPat <
- (SIsbuffer_load v4i32:$sbase, i32:$offset, i1:$glc, i1:$dlc),
- (vt (!cast<SM_Pseudo>(Instr#"_SGPR") $sbase, $offset, (as_i1imm $glc),
- (as_i1imm $dlc)))
+ (SIsbuffer_load v4i32:$sbase, i32:$offset, timm:$cachepolicy),
+ (vt (!cast<SM_Pseudo>(Instr#"_SGPR") SReg_128:$sbase, SReg_32:$offset, (extract_glc $cachepolicy),
+ (extract_dlc $cachepolicy)))
>;
}
@@ -805,8 +828,13 @@ foreach vt = SReg_128.RegTypes in {
defm : SMRD_Pattern <"S_LOAD_DWORDX4", vt>;
}
-defm : SMRD_Pattern <"S_LOAD_DWORDX8", v8i32>;
-defm : SMRD_Pattern <"S_LOAD_DWORDX16", v16i32>;
+foreach vt = SReg_256.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX8", vt>;
+}
+
+foreach vt = SReg_512.RegTypes in {
+defm : SMRD_Pattern <"S_LOAD_DWORDX16", vt>;
+}
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORD", i32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX2", v2i32>;
@@ -821,10 +849,21 @@ defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX8", v8f32>;
defm : SMLoad_Pattern <"S_BUFFER_LOAD_DWORDX16", v16f32>;
} // End let AddedComplexity = 100
+let OtherPredicates = [HasSMemTimeInst] in {
def : GCNPat <
(i64 (readcyclecounter)),
(S_MEMTIME)
>;
+} // let OtherPredicates = [HasSMemTimeInst]
+
+let OtherPredicates = [HasNoSMemTimeInst] in {
+def : GCNPat <
+ (i64 (readcyclecounter)),
+ (REG_SEQUENCE SReg_64,
+ (S_GETREG_B32 getHwRegImm<HWREG.SHADER_CYCLES, 0, -12>.ret), sub0,
+ (S_MOV_B32 (i32 0)), sub1)
+>;
+} // let OtherPredicates = [HasNoSMemTimeInst]
//===----------------------------------------------------------------------===//
// GFX10.
@@ -844,7 +883,7 @@ class SMEM_Real_gfx10<bits<8> op, SM_Pseudo ps> :
let Inst{16} = !if(ps.has_glc, glc, ?);
let Inst{25-18} = op;
let Inst{31-26} = 0x3d;
- let Inst{51-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{19-0}, ?), ?);
+ let Inst{52-32} = !if(ps.offset_is_imm, !if(ps.has_offset, offset{20-0}, ?), ?);
let Inst{63-57} = !if(ps.offset_is_imm, !cast<int>(SGPR_NULL.HWEncoding),
!if(ps.has_offset, offset{6-0}, ?));
}
@@ -853,7 +892,7 @@ multiclass SM_Real_Loads_gfx10<bits<8> op, string ps,
SM_Load_Pseudo immPs = !cast<SM_Load_Pseudo>(ps#_IMM),
SM_Load_Pseudo sgprPs = !cast<SM_Load_Pseudo>(ps#_SGPR)> {
def _IMM_gfx10 : SMEM_Real_gfx10<op, immPs> {
- let InOperandList = (ins immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_gfx10 : SMEM_Real_gfx10<op, sgprPs> {
let InOperandList = (ins sgprPs.BaseClass:$sbase, SReg_32:$offset, GLC:$glc, DLC:$dlc);
@@ -873,7 +912,7 @@ multiclass SM_Real_Stores_gfx10<bits<8> op, string ps,
// FIXME: The operand name $offset is inconsistent with $soff used
// in the pseudo
def _IMM_gfx10 : SMEM_Real_Store_gfx10 <op, immPs> {
- let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smrd_offset_20:$offset, GLC:$glc, DLC:$dlc);
+ let InOperandList = (ins immPs.SrcClass:$sdata, immPs.BaseClass:$sbase, smem_offset:$offset, GLC:$glc, DLC:$dlc);
}
def _SGPR_gfx10 : SMEM_Real_Store_gfx10 <op, sgprPs> {
@@ -1020,3 +1059,12 @@ defm S_DCACHE_DISCARD : SM_Real_Discard_gfx10 <0x28, "S_DCACHE_DISCARD">;
defm S_DCACHE_DISCARD_X2 : SM_Real_Discard_gfx10 <0x29, "S_DCACHE_DISCARD_X2">;
} // End SubtargetPredicate = HasScalarAtomics
+
+def SMInfoTable : GenericTable {
+ let FilterClass = "SM_Real";
+ let CppTypeName = "SMInfo";
+ let Fields = ["Opcode", "is_buffer"];
+
+ let PrimaryKey = ["Opcode"];
+ let PrimaryKeyName = "getSMEMOpcodeHelper";
+}
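// Presumably queried through the TableGen-emitted getSMEMOpcodeHelper lookup
// (named by PrimaryKeyName above) to test the is_buffer flag of a real SMEM
// opcode.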
diff --git a/llvm/lib/Target/AMDGPU/SOPInstructions.td b/llvm/lib/Target/AMDGPU/SOPInstructions.td
index 73ba2ae367f7..9d7b25d55217 100644
--- a/llvm/lib/Target/AMDGPU/SOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SOPInstructions.td
@@ -1,4 +1,4 @@
-//===-- SOPInstructions.td - SOP Instruction Defintions -------------------===//
+//===-- SOPInstructions.td - SOP Instruction Definitions ------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -97,6 +97,17 @@ class SOP1_0_32 <string opName, list<dag> pattern = []> : SOP1_Pseudo <
let has_sdst = 0;
}
+// Special case for movreld where sdst is treated as a use operand.
+class SOP1_32_movreld <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_32:$sdst, SSrc_b32:$src0),
+ "$sdst, $src0", pattern>;
+
+// Special case for movreld where sdst is treated as a use operand.
+class SOP1_64_movreld <string opName, list<dag> pattern=[]> : SOP1_Pseudo <
+ opName, (outs), (ins SReg_64:$sdst, SSrc_b64:$src0),
+ "$sdst, $src0", pattern
+>;
+
class SOP1_0_32R <string opName, list<dag> pattern = []> : SOP1_Pseudo <
opName, (outs), (ins SReg_32:$src0),
"$src0", pattern> {
@@ -199,7 +210,9 @@ def S_BCNT1_I32_B64 : SOP1_32_64 <"s_bcnt1_i32_b64",
def S_FF0_I32_B32 : SOP1_32 <"s_ff0_i32_b32">;
def S_FF0_I32_B64 : SOP1_32_64 <"s_ff0_i32_b64">;
-def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64">;
+def S_FF1_I32_B64 : SOP1_32_64 <"s_ff1_i32_b64",
+ [(set i32:$sdst, (AMDGPUffbl_b32 i64:$src0))]
+>;
def S_FF1_I32_B32 : SOP1_32 <"s_ff1_i32_b32",
[(set i32:$sdst, (AMDGPUffbl_b32 i32:$src0))]
@@ -209,7 +222,9 @@ def S_FLBIT_I32_B32 : SOP1_32 <"s_flbit_i32_b32",
[(set i32:$sdst, (AMDGPUffbh_u32 i32:$src0))]
>;
-def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64">;
+def S_FLBIT_I32_B64 : SOP1_32_64 <"s_flbit_i32_b64",
+ [(set i32:$sdst, (AMDGPUffbh_u32 i64:$src0))]
+>;
def S_FLBIT_I32 : SOP1_32 <"s_flbit_i32",
[(set i32:$sdst, (AMDGPUffbh_i32 i32:$src0))]
>;
@@ -267,8 +282,8 @@ def S_QUADMASK_B64 : SOP1_64 <"s_quadmask_b64">;
let Uses = [M0] in {
def S_MOVRELS_B32 : SOP1_32R <"s_movrels_b32">;
def S_MOVRELS_B64 : SOP1_64R <"s_movrels_b64">;
-def S_MOVRELD_B32 : SOP1_32 <"s_movreld_b32">;
-def S_MOVRELD_B64 : SOP1_64 <"s_movreld_b64">;
+def S_MOVRELD_B32 : SOP1_32_movreld <"s_movreld_b32">;
+def S_MOVRELD_B64 : SOP1_64_movreld <"s_movreld_b64">;
} // End Uses = [M0]
let SubtargetPredicate = isGFX6GFX7GFX8GFX9 in {
@@ -283,8 +298,8 @@ def S_MOV_FED_B32 : SOP1_32 <"s_mov_fed_b32">;
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_IDX : SOP1_0_32<"s_set_gpr_idx_idx"> {
- let Uses = [M0];
- let Defs = [M0];
+ let Uses = [M0, MODE];
+ let Defs = [M0, MODE];
}
}
@@ -401,8 +416,14 @@ class UniformUnaryFrag<SDPatternOperator Op> : PatFrag <
class UniformBinFrag<SDPatternOperator Op> : PatFrag <
(ops node:$src0, node:$src1),
(Op $src0, $src1),
- [{ return !N->isDivergent(); }]
->;
+ [{ return !N->isDivergent(); }]> {
+ // This check is unnecessary as it's captured by the result register
+ // bank constraint.
+ //
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
+}
let Defs = [SCC] in { // Carry out goes to SCC
let isCommutable = 1 in {
@@ -444,9 +465,19 @@ def S_MAX_U32 : SOP2_32 <"s_max_u32",
} // End isCommutable = 1
} // End Defs = [SCC]
+class SelectPat<SDPatternOperator select> : PatFrag <
+ (ops node:$src1, node:$src2),
+ (select SCC, $src1, $src2),
+ [{ return N->getOperand(0)->hasOneUse() && !N->isDivergent(); }]
+>;
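// Illustrative intent (assumed): for a uniform
//   %r = select i1 %cc, i32 %a, i32 %b
// where %cc is the only use of an SCC-defining compare, this lets the compare
// plus select be lowered to s_cmp_* followed by s_cselect_b32 instead of a
// VALU v_cndmask_b32.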
let Uses = [SCC] in {
- def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32">;
+ let AddedComplexity = 20 in {
+ def S_CSELECT_B32 : SOP2_32 <"s_cselect_b32",
+ [(set i32:$sdst, (SelectPat<select> i32:$src0, i32:$src1))]
+ >;
+ }
+
def S_CSELECT_B64 : SOP2_64 <"s_cselect_b64">;
} // End Uses = [SCC]
@@ -524,22 +555,22 @@ let AddedComplexity = 1 in {
let Defs = [SCC] in {
// TODO: b64 versions require VOP3 change since v_lshlrev_b64 is VOP3
def S_LSHL_B32 : SOP2_32 <"s_lshl_b32",
- [(set SReg_32:$sdst, (shl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<shl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHL_B64 : SOP2_64_32 <"s_lshl_b64",
- [(set SReg_64:$sdst, (shl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<shl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B32 : SOP2_32 <"s_lshr_b32",
- [(set SReg_32:$sdst, (srl (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<srl> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_LSHR_B64 : SOP2_64_32 <"s_lshr_b64",
- [(set SReg_64:$sdst, (srl (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<srl> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I32 : SOP2_32 <"s_ashr_i32",
- [(set SReg_32:$sdst, (sra (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_32:$sdst, (UniformBinFrag<sra> (i32 SSrc_b32:$src0), (i32 SSrc_b32:$src1)))]
>;
def S_ASHR_I64 : SOP2_64_32 <"s_ashr_i64",
- [(set SReg_64:$sdst, (sra (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
+ [(set SReg_64:$sdst, (UniformBinFrag<sra> (i64 SSrc_b64:$src0), (i32 SSrc_b32:$src1)))]
>;
} // End Defs = [SCC]
@@ -592,14 +623,26 @@ let SubtargetPredicate = isGFX9Plus in {
def S_PACK_HH_B32_B16 : SOP2_32<"s_pack_hh_b32_b16">;
let Defs = [SCC] in {
- def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32">;
- def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32">;
- def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32">;
- def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32">;
+ def S_LSHL1_ADD_U32 : SOP2_32<"s_lshl1_add_u32",
+ [(set i32:$sdst, (shl1_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
+ def S_LSHL2_ADD_U32 : SOP2_32<"s_lshl2_add_u32",
+ [(set i32:$sdst, (shl2_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
+ def S_LSHL3_ADD_U32 : SOP2_32<"s_lshl3_add_u32",
+ [(set i32:$sdst, (shl3_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
+ def S_LSHL4_ADD_U32 : SOP2_32<"s_lshl4_add_u32",
+ [(set i32:$sdst, (shl4_add SSrc_b32:$src0, SSrc_b32:$src1))]
+ >;
} // End Defs = [SCC]
- def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32">;
- def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32">;
+ let isCommutable = 1 in {
+ def S_MUL_HI_U32 : SOP2_32<"s_mul_hi_u32",
+ [(set i32:$sdst, (UniformBinFrag<mulhu> SSrc_b32:$src0, SSrc_b32:$src1))]>;
+ def S_MUL_HI_I32 : SOP2_32<"s_mul_hi_i32",
+ [(set i32:$sdst, (UniformBinFrag<mulhs> SSrc_b32:$src0, SSrc_b32:$src1))]>;
+ }
} // End SubtargetPredicate = isGFX9Plus
//===----------------------------------------------------------------------===//
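The four S_LSHLn_ADD_U32 patterns added above select a fused shift-and-add. A minimal reference model of the scalar semantics they are meant to cover, assuming the shl1_add..shl4_add fragments (defined outside this hunk) match (add (shl s0, n), s1):

#include <cstdint>

// Sketch only: S_LSHL{n}_ADD_U32 computes (s0 << n) + s1 for n = 1..4.
static inline uint32_t s_lshlN_add_u32(uint32_t s0, uint32_t s1, unsigned n) {
  return (s0 << n) + s1;
}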
@@ -760,7 +803,11 @@ def S_CBRANCH_I_FORK : SOPK_Pseudo <
"$sdst, $simm16"
>;
+let hasSideEffects = 1 in {
+
let mayLoad = 1 in {
+// s_getreg_b32 should use hasSideEffects = 1 for tablegen to allow
+// its use in the readcyclecounter selection.
def S_GETREG_B32 : SOPK_Pseudo <
"s_getreg_b32",
(outs SReg_32:$sdst), (ins hwreg:$simm16),
@@ -768,14 +815,20 @@ def S_GETREG_B32 : SOPK_Pseudo <
>;
}
-let hasSideEffects = 1 in {
+let mayLoad = 0, mayStore = 0 in {
def S_SETREG_B32 : SOPK_Pseudo <
"s_setreg_b32",
(outs), (ins SReg_32:$sdst, hwreg:$simm16),
"$simm16, $sdst",
- [(AMDGPUsetreg i32:$sdst, (i16 timm:$simm16))]
->;
+ [(int_amdgcn_s_setreg (i32 timm:$simm16), i32:$sdst)]> {
+
+ // Use custom inserter to optimize some cases to
+ // S_DENORM_MODE/S_ROUND_MODE.
+ let usesCustomInserter = 1;
+ let Defs = [MODE];
+ let Uses = [MODE];
+}
// FIXME: Not on SI?
//def S_GETREG_REGRD_B32 : SOPK_32 <sopk<0x14, 0x13>, "s_getreg_regrd_b32">;
@@ -786,8 +839,11 @@ def S_SETREG_IMM32_B32 : SOPK_Pseudo <
"$simm16, $imm"> {
let Size = 8; // Unlike every other SOPK instruction.
let has_sdst = 0;
+ let Defs = [MODE];
+ let Uses = [MODE];
}
+}
} // End hasSideEffects = 1
class SOPK_WAITCNT<string opName, list<dag> pat=[]> :
@@ -920,12 +976,16 @@ def S_CMP_LG_U64 : SOPC_CMP_64 <0x13, "s_cmp_lg_u64", COND_NE>;
} // End SubtargetPredicate = isGFX8Plus
let SubtargetPredicate = HasVGPRIndexMode in {
+// Setting the GPR index mode is really writing the fields in the mode
+// register. We don't want to add mode register uses to every
+// instruction, and it's too complicated to deal with anyway. This is
+// modeled just as a side effect.
def S_SET_GPR_IDX_ON : SOPC <0x11,
(outs),
(ins SSrc_b32:$src0, GPRIdxMode:$src1),
"s_set_gpr_idx_on $src0,$src1"> {
- let Defs = [M0]; // No scc def
- let Uses = [M0]; // Other bits of m0 unmodified.
+ let Defs = [M0, MODE]; // No scc def
+ let Uses = [M0, MODE]; // Other bits of mode, m0 unmodified.
let hasSideEffects = 1; // Sets mode.gpr_idx_en
let FixedSize = 1;
}
@@ -1099,7 +1159,7 @@ def S_WAKEUP : SOPP <0x00000003, (ins), "s_wakeup"> {
let mayStore = 1;
}
-let mayLoad = 1, mayStore = 1, hasSideEffects = 1 in
+let mayLoad = 0, mayStore = 0, hasSideEffects = 1 in
def S_WAITCNT : SOPP <0x0000000c, (ins WAIT_FLAG:$simm16), "s_waitcnt $simm16",
[(int_amdgcn_s_waitcnt timm:$simm16)]>;
def S_SETHALT : SOPP <0x0000000d, (ins i16imm:$simm16), "s_sethalt $simm16">;
@@ -1112,8 +1172,8 @@ def S_SETKILL : SOPP <0x0000000b, (ins i16imm:$simm16), "s_setkill $simm16">;
def S_SLEEP : SOPP <0x0000000e, (ins i32imm:$simm16),
"s_sleep $simm16", [(int_amdgcn_s_sleep timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
}
def S_SETPRIO : SOPP <0x0000000f, (ins i16imm:$simm16), "s_setprio $simm16">;
@@ -1138,14 +1198,14 @@ def S_ICACHE_INV : SOPP <0x00000013, (ins), "s_icache_inv"> {
def S_INCPERFLEVEL : SOPP <0x00000014, (ins i32imm:$simm16), "s_incperflevel $simm16",
[(int_amdgcn_s_incperflevel timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
}
def S_DECPERFLEVEL : SOPP <0x00000015, (ins i32imm:$simm16), "s_decperflevel $simm16",
[(int_amdgcn_s_decperflevel timm:$simm16)]> {
let hasSideEffects = 1;
- let mayLoad = 1;
- let mayStore = 1;
+ let mayLoad = 0;
+ let mayStore = 0;
}
def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
let simm16 = 0;
@@ -1154,6 +1214,8 @@ def S_TTRACEDATA : SOPP <0x00000016, (ins), "s_ttracedata"> {
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
let simm16 = 0;
+ let Defs = [MODE];
+ let Uses = [MODE];
}
}
} // End hasSideEffects
@@ -1161,7 +1223,8 @@ def S_SET_GPR_IDX_OFF : SOPP<0x1c, (ins), "s_set_gpr_idx_off"> {
let SubtargetPredicate = HasVGPRIndexMode in {
def S_SET_GPR_IDX_MODE : SOPP<0x1d, (ins GPRIdxMode:$simm16),
"s_set_gpr_idx_mode$simm16"> {
- let Defs = [M0];
+ let Defs = [M0, MODE];
+ let Uses = [MODE];
}
}
@@ -1176,13 +1239,15 @@ let SubtargetPredicate = isGFX10Plus in {
}
def S_WAITCNT_DEPCTR :
SOPP <0x023, (ins s16imm:$simm16), "s_waitcnt_depctr $simm16">;
- def S_ROUND_MODE :
- SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
- def S_DENORM_MODE :
- SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
- [(SIdenorm_mode (i32 timm:$simm16))]> {
- let hasSideEffects = 1;
- }
+
+ let hasSideEffects = 0, Uses = [MODE], Defs = [MODE] in {
+ def S_ROUND_MODE :
+ SOPP<0x024, (ins s16imm:$simm16), "s_round_mode $simm16">;
+ def S_DENORM_MODE :
+ SOPP<0x025, (ins i32imm:$simm16), "s_denorm_mode $simm16",
+ [(SIdenorm_mode (i32 timm:$simm16))]>;
+ }
+
def S_TTRACEDATA_IMM :
SOPP<0x028, (ins s16imm:$simm16), "s_ttracedata_imm $simm16">;
} // End SubtargetPredicate = isGFX10Plus
@@ -1223,7 +1288,7 @@ def : GCNPat <
// Same as a 32-bit inreg
def : GCNPat<
- (i32 (sext i16:$src)),
+ (i32 (UniformUnaryFrag<sext> i16:$src)),
(S_SEXT_I32_I16 $src)
>;
@@ -1250,7 +1315,7 @@ def : GCNPat<
>;
def : GCNPat <
- (i64 (sext i16:$src)),
+ (i64 (UniformUnaryFrag<sext> i16:$src)),
(REG_SEQUENCE SReg_64, (i32 (S_SEXT_I32_I16 $src)), sub0,
(i32 (COPY_TO_REGCLASS (S_ASHR_I32 (i32 (S_SEXT_I32_I16 $src)), (S_MOV_B32 (i32 31))), SGPR_32)), sub1)
>;
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
index 075e08986c0c..5819a621f55d 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUAsmUtils.cpp
@@ -78,7 +78,11 @@ const char* const IdSymbolic[] = {
"HW_REG_XNACK_MASK",
nullptr, // HW_ID1, no predictable values
nullptr, // HW_ID2, no predictable values
- "HW_REG_POPS_PACKER"
+ "HW_REG_POPS_PACKER",
+ nullptr,
+ nullptr,
+ nullptr,
+ "HW_REG_SHADER_CYCLES"
};
} // namespace Hwreg
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
index 5271bc3aacc6..00e6d517bde5 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -108,6 +108,7 @@ namespace AMDGPU {
#define GET_MIMGInfoTable_IMPL
#define GET_MIMGLZMappingTable_IMPL
#define GET_MIMGMIPMappingTable_IMPL
+#define GET_MIMGG16MappingTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -148,10 +149,17 @@ struct MTBUFInfo {
bool has_soffset;
};
+struct SMInfo {
+ uint16_t Opcode;
+ bool IsBuffer;
+};
+
#define GET_MTBUFInfoTable_DECL
#define GET_MTBUFInfoTable_IMPL
#define GET_MUBUFInfoTable_DECL
#define GET_MUBUFInfoTable_IMPL
+#define GET_SMInfoTable_DECL
+#define GET_SMInfoTable_IMPL
#include "AMDGPUGenSearchableTables.inc"
int getMTBUFBaseOpcode(unsigned Opc) {
@@ -214,6 +222,11 @@ bool getMUBUFHasSoffset(unsigned Opc) {
return Info ? Info->has_soffset : false;
}
+bool getSMEMIsBuffer(unsigned Opc) {
+ const SMInfo *Info = getSMEMOpcodeHelper(Opc);
+ return Info ? Info->IsBuffer : false;
+}
+
// Wrapper for Tablegen'd function. enum Subtarget is not defined in any
// header files, so we need to wrap it in a function that takes unsigned
// instead.
@@ -268,6 +281,13 @@ unsigned getLocalMemorySize(const MCSubtargetInfo *STI) {
}
unsigned getEUsPerCU(const MCSubtargetInfo *STI) {
+ // "Per CU" really means "per whatever functional block the waves of a
+ // workgroup must share". For gfx10 in CU mode this is the CU, which contains
+ // two SIMDs.
+ if (isGFX10(*STI) && STI->getFeatureBits().test(FeatureCuMode))
+ return 2;
+ // Pre-gfx10 a CU contains four SIMDs. For gfx10 in WGP mode the WGP contains
+ // two CUs, so a total of four SIMDs.
return 4;
}
@@ -283,15 +303,6 @@ unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
return std::min(N, 16u);
}
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI) {
- return getMaxWavesPerEU(STI) * getEUsPerCU(STI);
-}
-
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize) {
- return getWavesPerWorkGroup(STI, FlatWorkGroupSize);
-}
-
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI) {
return 1;
}
@@ -300,13 +311,13 @@ unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI) {
// FIXME: Need to take scratch memory into account.
if (!isGFX10(*STI))
return 10;
- return 20;
+ return hasGFX10_3Insts(*STI) ? 16 : 20;
}
-unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize) {
- return alignTo(getMaxWavesPerCU(STI, FlatWorkGroupSize),
- getEUsPerCU(STI)) / getEUsPerCU(STI);
+unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
+ unsigned FlatWorkGroupSize) {
+ return divideCeil(getWavesPerWorkGroup(STI, FlatWorkGroupSize),
+ getEUsPerCU(STI));
}
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI) {
@@ -320,8 +331,7 @@ unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI) {
unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize) {
- return alignTo(FlatWorkGroupSize, getWavefrontSize(STI)) /
- getWavefrontSize(STI);
+ return divideCeil(FlatWorkGroupSize, getWavefrontSize(STI));
}
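A worked example of the divideCeil-based occupancy math introduced above, using a wave64 subtarget and a flat workgroup size of 300 (the concrete numbers are illustrative only):

// getWavesPerWorkGroup(STI, 300)      = divideCeil(300, 64) = 5 waves
// getWavesPerEUForWorkGroup(STI, 300) = divideCeil(5, getEUsPerCU(STI))
//   pre-gfx10, or gfx10 in WGP mode (4 SIMDs): divideCeil(5, 4) = 2
//   gfx10 in CU mode (2 SIMDs):                divideCeil(5, 2) = 3
// alignTo(x, n) / n equals divideCeil(x, n) for positive n, so the rewrite of
// these two helpers keeps the same rounding; the gfx10 CU-mode value returned
// by getEUsPerCU is what actually changes the result.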
unsigned getSGPRAllocGranule(const MCSubtargetInfo *STI) {
@@ -431,12 +441,21 @@ unsigned getVGPRAllocGranule(const MCSubtargetInfo *STI,
bool IsWave32 = EnableWavefrontSize32 ?
*EnableWavefrontSize32 :
STI->getFeatureBits().test(FeatureWavefrontSize32);
+
+ if (hasGFX10_3Insts(*STI))
+ return IsWave32 ? 16 : 8;
+
return IsWave32 ? 8 : 4;
}
unsigned getVGPREncodingGranule(const MCSubtargetInfo *STI,
Optional<bool> EnableWavefrontSize32) {
- return getVGPRAllocGranule(STI, EnableWavefrontSize32);
+
+ bool IsWave32 = EnableWavefrontSize32 ?
+ *EnableWavefrontSize32 :
+ STI->getFeatureBits().test(FeatureWavefrontSize32);
+
+ return IsWave32 ? 8 : 4;
}
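Summarizing the paths visible in this hunk, the allocation and encoding granules now differ on gfx10.3:

// getVGPRAllocGranule:    gfx10.3 subtargets -> 16 (wave32) / 8 (wave64)
//                         otherwise          ->  8 (wave32) / 4 (wave64)
// getVGPREncodingGranule: all subtargets     ->  8 (wave32) / 4 (wave64)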
unsigned getTotalNumVGPRs(const MCSubtargetInfo *STI) {
@@ -559,7 +578,7 @@ bool isReadOnlySegment(const GlobalValue *GV) {
}
bool shouldEmitConstantsToTextSection(const Triple &TT) {
- return TT.getOS() == Triple::AMDPAL;
+ return TT.getOS() == Triple::AMDPAL || TT.getArch() == Triple::r600;
}
int getIntegerAttribute(const Function &F, StringRef Name, int Default) {
@@ -722,13 +741,16 @@ static unsigned getLastSymbolicHwreg(const MCSubtargetInfo &STI) {
return ID_SYMBOLIC_FIRST_GFX9_;
else if (isGFX9(STI))
return ID_SYMBOLIC_FIRST_GFX10_;
+ else if (isGFX10(STI) && !isGFX10_BEncoding(STI))
+ return ID_SYMBOLIC_FIRST_GFX1030_;
else
return ID_SYMBOLIC_LAST_;
}
bool isValidHwreg(int64_t Id, const MCSubtargetInfo &STI) {
- return ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
- IdSymbolic[Id];
+ return
+ ID_SYMBOLIC_FIRST_ <= Id && Id < getLastSymbolicHwreg(STI) &&
+ IdSymbolic[Id] && (Id != ID_XNACK_MASK || !AMDGPU::isGFX10_BEncoding(STI));
}
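Combined with the getLastSymbolicHwreg change above, the assembler-visible effect is roughly the following (the enumerator for the new register is an assumption based on the HW_REG_SHADER_CYCLES string added in AMDGPUAsmUtils.cpp):

// isValidHwreg(ID_XNACK_MASK, STI) -> false on GFX10_B-encoding subtargets,
//                                     rejected by the new clause above.
// The shader-cycles hwreg id       -> accepted only where getLastSymbolicHwreg
//                                     returns ID_SYMBOLIC_LAST_, i.e. not on
//                                     plain gfx10 parts, which now stop at
//                                     ID_SYMBOLIC_FIRST_GFX1030_.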
bool isValidHwreg(int64_t Id) {
@@ -927,7 +949,15 @@ bool hasSRAMECC(const MCSubtargetInfo &STI) {
}
bool hasMIMG_R128(const MCSubtargetInfo &STI) {
- return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128];
+ return STI.getFeatureBits()[AMDGPU::FeatureMIMG_R128] && !STI.getFeatureBits()[AMDGPU::FeatureR128A16];
+}
+
+bool hasGFX10A16(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10A16];
+}
+
+bool hasG16(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureG16];
}
bool hasPackedD16(const MCSubtargetInfo &STI) {
@@ -958,9 +988,17 @@ bool isGCN3Encoding(const MCSubtargetInfo &STI) {
return STI.getFeatureBits()[AMDGPU::FeatureGCN3Encoding];
}
+bool isGFX10_BEncoding(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_BEncoding];
+}
+
+bool hasGFX10_3Insts(const MCSubtargetInfo &STI) {
+ return STI.getFeatureBits()[AMDGPU::FeatureGFX10_3Insts];
+}
+
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI) {
const MCRegisterClass SGPRClass = TRI->getRegClass(AMDGPU::SReg_32RegClassID);
- const unsigned FirstSubReg = TRI->getSubReg(Reg, 1);
+ const unsigned FirstSubReg = TRI->getSubReg(Reg, AMDGPU::sub0);
return SGPRClass.contains(FirstSubReg != 0 ? FirstSubReg : Reg) ||
Reg == AMDGPU::SCC;
}
@@ -1082,6 +1120,11 @@ bool isSISrcInlinableOperand(const MCInstrDesc &Desc, unsigned OpNo) {
// (move from MC* level to Target* level). Return size in bits.
unsigned getRegBitWidth(unsigned RCID) {
switch (RCID) {
+ case AMDGPU::VGPR_LO16RegClassID:
+ case AMDGPU::VGPR_HI16RegClassID:
+ case AMDGPU::SGPR_LO16RegClassID:
+ case AMDGPU::AGPR_LO16RegClassID:
+ return 16;
case AMDGPU::SGPR_32RegClassID:
case AMDGPU::VGPR_32RegClassID:
case AMDGPU::VRegOrLds_32RegClassID:
@@ -1103,6 +1146,7 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::SGPR_96RegClassID:
case AMDGPU::SReg_96RegClassID:
case AMDGPU::VReg_96RegClassID:
+ case AMDGPU::AReg_96RegClassID:
return 96;
case AMDGPU::SGPR_128RegClassID:
case AMDGPU::SReg_128RegClassID:
@@ -1112,14 +1156,24 @@ unsigned getRegBitWidth(unsigned RCID) {
case AMDGPU::SGPR_160RegClassID:
case AMDGPU::SReg_160RegClassID:
case AMDGPU::VReg_160RegClassID:
+ case AMDGPU::AReg_160RegClassID:
return 160;
+ case AMDGPU::SGPR_192RegClassID:
+ case AMDGPU::SReg_192RegClassID:
+ case AMDGPU::VReg_192RegClassID:
+ case AMDGPU::AReg_192RegClassID:
+ return 192;
+ case AMDGPU::SGPR_256RegClassID:
case AMDGPU::SReg_256RegClassID:
case AMDGPU::VReg_256RegClassID:
+ case AMDGPU::AReg_256RegClassID:
return 256;
+ case AMDGPU::SGPR_512RegClassID:
case AMDGPU::SReg_512RegClassID:
case AMDGPU::VReg_512RegClassID:
case AMDGPU::AReg_512RegClassID:
return 512;
+ case AMDGPU::SGPR_1024RegClassID:
case AMDGPU::SReg_1024RegClassID:
case AMDGPU::VReg_1024RegClassID:
case AMDGPU::AReg_1024RegClassID:
@@ -1141,7 +1195,7 @@ unsigned getRegOperandSize(const MCRegisterInfo *MRI, const MCInstrDesc &Desc,
}
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
- if (Literal >= -16 && Literal <= 64)
+ if (isInlinableIntLiteral(Literal))
return true;
uint64_t Val = static_cast<uint64_t>(Literal);
@@ -1158,7 +1212,7 @@ bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi) {
}
bool isInlinableLiteral32(int32_t Literal, bool HasInv2Pi) {
- if (Literal >= -16 && Literal <= 64)
+ if (isInlinableIntLiteral(Literal))
return true;
// The actual type of the operand does not seem to matter as long
@@ -1187,7 +1241,7 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi) {
if (!HasInv2Pi)
return false;
- if (Literal >= -16 && Literal <= 64)
+ if (isInlinableIntLiteral(Literal))
return true;
uint16_t Val = static_cast<uint16_t>(Literal);
@@ -1217,6 +1271,17 @@ bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi) {
return Lo16 == Hi16 && isInlinableLiteral16(Lo16, HasInv2Pi);
}
+bool isInlinableIntLiteralV216(int32_t Literal) {
+ int16_t Lo16 = static_cast<int16_t>(Literal);
+ if (isInt<16>(Literal) || isUInt<16>(Literal))
+ return isInlinableIntLiteral(Lo16);
+
+ int16_t Hi16 = static_cast<int16_t>(Literal >> 16);
+ if (!(Literal & 0xffff))
+ return isInlinableIntLiteral(Hi16);
+ return Lo16 == Hi16 && isInlinableIntLiteral(Lo16);
+}
+
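A few worked inputs for the new isInlinableIntLiteralV216 helper above (bit patterns chosen for illustration):

// isInlinableIntLiteralV216(64)                  -> true  (fits in 16 bits, Lo16 = 64)
// isInlinableIntLiteralV216(65)                  -> false (65 is outside -16..64)
// isInlinableIntLiteralV216(0x00400000)          -> true  (low half zero, Hi16 = 64)
// isInlinableIntLiteralV216((int32_t)0xFFF0FFF0) -> true  (Lo16 == Hi16 == -16)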
bool isArgPassedInSGPR(const Argument *A) {
const Function *F = A->getParent();
@@ -1247,16 +1312,61 @@ static bool hasSMEMByteOffset(const MCSubtargetInfo &ST) {
return isGCN3Encoding(ST) || isGFX10(ST);
}
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
+static bool hasSMRDSignedImmOffset(const MCSubtargetInfo &ST) {
+ return isGFX9(ST) || isGFX10(ST);
+}
+
+bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset) {
+ return hasSMEMByteOffset(ST) ? isUInt<20>(EncodedOffset)
+ : isUInt<8>(EncodedOffset);
+}
+
+bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset,
+ bool IsBuffer) {
+ return !IsBuffer &&
+ hasSMRDSignedImmOffset(ST) &&
+ isInt<21>(EncodedOffset);
+}
+
+static bool isDwordAligned(uint64_t ByteOffset) {
+ return (ByteOffset & 3) == 0;
+}
+
+uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST,
+ uint64_t ByteOffset) {
if (hasSMEMByteOffset(ST))
return ByteOffset;
+
+ assert(isDwordAligned(ByteOffset));
return ByteOffset >> 2;
}
-bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
- int64_t EncodedOffset = getSMRDEncodedOffset(ST, ByteOffset);
- return (hasSMEMByteOffset(ST)) ?
- isUInt<20>(EncodedOffset) : isUInt<8>(EncodedOffset);
+Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
+ int64_t ByteOffset, bool IsBuffer) {
+ // The signed version is always a byte offset.
+ if (!IsBuffer && hasSMRDSignedImmOffset(ST)) {
+ assert(hasSMEMByteOffset(ST));
+ return isInt<20>(ByteOffset) ? Optional<int64_t>(ByteOffset) : None;
+ }
+
+ if (!isDwordAligned(ByteOffset) && !hasSMEMByteOffset(ST))
+ return None;
+
+ int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset);
+ return isLegalSMRDEncodedUnsignedOffset(ST, EncodedOffset)
+ ? Optional<int64_t>(EncodedOffset)
+ : None;
+}
+
+Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
+ int64_t ByteOffset) {
+ if (!isCI(ST) || !isDwordAligned(ByteOffset))
+ return None;
+
+ int64_t EncodedOffset = convertSMRDOffsetUnits(ST, ByteOffset);
+ return isUInt<32>(EncodedOffset) ? Optional<int64_t>(EncodedOffset) : None;
}
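A hedged usage sketch for the two new offset helpers above; the surrounding control flow is illustrative, not code from this patch:

// Try the regular immediate field first, then the CI-only 32-bit literal form,
// otherwise fall back to materializing the offset in a register.
if (Optional<int64_t> Enc =
        AMDGPU::getSMRDEncodedOffset(ST, ByteOffset, /*IsBuffer=*/false)) {
  // emit S_LOAD with *Enc in the offset field
} else if (Optional<int64_t> Enc32 =
               AMDGPU::getSMRDEncodedLiteralOffset32(ST, ByteOffset)) {
  // CI only: use the 32-bit literal offset encoding with *Enc32
} else {
  // add ByteOffset into an SGPR and use a register offset instead
}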
// Given Imm, split it into the values to put into the SOffset and ImmOffset
@@ -1267,8 +1377,8 @@ bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset) {
// aligned if they are aligned to begin with. It also ensures that additional
// offsets within the given alignment can be added to the resulting ImmOffset.
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, uint32_t Align) {
- const uint32_t MaxImm = alignDown(4095, Align);
+ const GCNSubtarget *Subtarget, Align Alignment) {
+ const uint32_t MaxImm = alignDown(4095, Alignment.value());
uint32_t Overflow = 0;
if (Imm > MaxImm) {
@@ -1286,10 +1396,10 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
//
// Atomic operations fail to work correctly when individual address
// components are unaligned, even if their sum is aligned.
- uint32_t High = (Imm + Align) & ~4095;
- uint32_t Low = (Imm + Align) & 4095;
+ uint32_t High = (Imm + Alignment.value()) & ~4095;
+ uint32_t Low = (Imm + Alignment.value()) & 4095;
Imm = Low;
- Overflow = High - Align;
+ Overflow = High - Alignment.value();
}
}
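Worked numbers for the overflow branch shown in this hunk, using the new Align(4) default (branches of splitMUBUFOffset outside the hunk are not traced here):

// Imm = 8192, Alignment = Align(4):
//   MaxImm   = alignDown(4095, 4) = 4092      (8192 > 4092, so we split)
//   High     = (8192 + 4) & ~4095 = 8192
//   Low      = (8192 + 4) &  4095 = 4
//   Imm      = Low                = 4         (still 4-aligned)
//   Overflow = High - 4           = 8188      (also 4-aligned)
// so the caller should end up with SOffset = 8188 and ImmOffset = 4,
// and 8188 + 4 == 8192 as required.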
@@ -1305,8 +1415,7 @@ bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
return true;
}
-SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
- const GCNSubtarget &ST) {
+SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F) {
*this = getDefaultForCallingConv(F.getCallingConv());
StringRef IEEEAttr = F.getFnAttribute("amdgpu-ieee").getValueAsString();
@@ -1318,8 +1427,25 @@ SIModeRegisterDefaults::SIModeRegisterDefaults(const Function &F,
if (!DX10ClampAttr.empty())
DX10Clamp = DX10ClampAttr == "true";
- FP32Denormals = ST.hasFP32Denormals(F);
- FP64FP16Denormals = ST.hasFP64FP16Denormals(F);
+  StringRef DenormF32Attr =
+      F.getFnAttribute("denormal-fp-math-f32").getValueAsString();
+ if (!DenormF32Attr.empty()) {
+ DenormalMode DenormMode = parseDenormalFPAttribute(DenormF32Attr);
+ FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+ FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+ }
+
+ StringRef DenormAttr = F.getFnAttribute("denormal-fp-math").getValueAsString();
+ if (!DenormAttr.empty()) {
+ DenormalMode DenormMode = parseDenormalFPAttribute(DenormAttr);
+
+ if (DenormF32Attr.empty()) {
+ FP32InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+ FP32OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+ }
+
+ FP64FP16InputDenormals = DenormMode.Input == DenormalMode::IEEE;
+ FP64FP16OutputDenormals = DenormMode.Output == DenormalMode::IEEE;
+ }
}
namespace {
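To make the attribute-driven initialization above concrete, a minimal mapping sketch (attribute spellings follow the generic denormal-fp-math syntax; both components are kept equal so output/input ordering does not matter):

// "denormal-fp-math-f32"="ieee,ieee"
//     -> FP32InputDenormals = FP32OutputDenormals = true
// "denormal-fp-math"="preserve-sign,preserve-sign", no f32 override present
//     -> all four {FP32,FP64FP16}{Input,Output}Denormals flags = false
// neither attribute present
//     -> values chosen by getDefaultForCallingConv(F.getCallingConv())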
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
index a5bada2890d2..e71554575f6a 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -12,10 +12,10 @@
#include "AMDGPU.h"
#include "AMDKernelCodeT.h"
#include "SIDefines.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Support/AMDHSAKernelDescriptor.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetParser.h"
@@ -26,17 +26,13 @@
namespace llvm {
class Argument;
-class AMDGPUSubtarget;
-class FeatureBitset;
class Function;
class GCNSubtarget;
class GlobalValue;
-class MCContext;
class MCRegisterClass;
class MCRegisterInfo;
-class MCSection;
class MCSubtargetInfo;
-class MachineMemOperand;
+class StringRef;
class Triple;
namespace AMDGPU {
@@ -87,15 +83,6 @@ unsigned getEUsPerCU(const MCSubtargetInfo *STI);
unsigned getMaxWorkGroupsPerCU(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
-/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// STI without any kind of limitation.
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI);
-
-/// \returns Maximum number of waves per compute unit for given subtarget \p
-/// STI and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerCU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize);
-
/// \returns Minimum number of waves per execution unit for given subtarget \p
/// STI.
unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
@@ -104,10 +91,10 @@ unsigned getMinWavesPerEU(const MCSubtargetInfo *STI);
/// STI without any kind of limitation.
unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI);
-/// \returns Maximum number of waves per execution unit for given subtarget \p
-/// STI and limited by given \p FlatWorkGroupSize.
-unsigned getMaxWavesPerEU(const MCSubtargetInfo *STI,
- unsigned FlatWorkGroupSize);
+/// \returns Number of waves per execution unit required to support the given \p
+/// FlatWorkGroupSize.
+unsigned getWavesPerEUForWorkGroup(const MCSubtargetInfo *STI,
+ unsigned FlatWorkGroupSize);
/// \returns Minimum flat work group size for given subtarget \p STI.
unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
@@ -116,7 +103,7 @@ unsigned getMinFlatWorkGroupSize(const MCSubtargetInfo *STI);
unsigned getMaxFlatWorkGroupSize(const MCSubtargetInfo *STI);
/// \returns Number of waves per work group for given subtarget \p STI and
-/// limited by given \p FlatWorkGroupSize.
+/// \p FlatWorkGroupSize.
unsigned getWavesPerWorkGroup(const MCSubtargetInfo *STI,
unsigned FlatWorkGroupSize);
@@ -211,6 +198,7 @@ struct MIMGBaseOpcodeInfo {
uint8_t NumExtraArgs;
bool Gradients;
+ bool G16;
bool Coordinates;
bool LodOrClampOrMip;
bool HasD16;
@@ -247,11 +235,19 @@ struct MIMGMIPMappingInfo {
MIMGBaseOpcode NONMIP;
};
+struct MIMGG16MappingInfo {
+ MIMGBaseOpcode G;
+ MIMGBaseOpcode G16;
+};
+
LLVM_READONLY
const MIMGLZMappingInfo *getMIMGLZMappingInfo(unsigned L);
LLVM_READONLY
-const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned L);
+const MIMGMIPMappingInfo *getMIMGMIPMappingInfo(unsigned MIP);
+
+LLVM_READONLY
+const MIMGG16MappingInfo *getMIMGG16MappingInfo(unsigned G);
LLVM_READONLY
int getMIMGOpcode(unsigned BaseOpcode, unsigned MIMGEncoding,
@@ -308,6 +304,9 @@ LLVM_READONLY
bool getMUBUFHasSoffset(unsigned Opc);
LLVM_READONLY
+bool getSMEMIsBuffer(unsigned Opc);
+
+LLVM_READONLY
const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
uint8_t NumComponents,
uint8_t NumFormat,
@@ -551,6 +550,8 @@ inline bool isKernel(CallingConv::ID CC) {
bool hasXNACK(const MCSubtargetInfo &STI);
bool hasSRAMECC(const MCSubtargetInfo &STI);
bool hasMIMG_R128(const MCSubtargetInfo &STI);
+bool hasGFX10A16(const MCSubtargetInfo &STI);
+bool hasG16(const MCSubtargetInfo &STI);
bool hasPackedD16(const MCSubtargetInfo &STI);
bool isSI(const MCSubtargetInfo &STI);
@@ -558,6 +559,9 @@ bool isCI(const MCSubtargetInfo &STI);
bool isVI(const MCSubtargetInfo &STI);
bool isGFX9(const MCSubtargetInfo &STI);
bool isGFX10(const MCSubtargetInfo &STI);
+bool isGCN3Encoding(const MCSubtargetInfo &STI);
+bool isGFX10_BEncoding(const MCSubtargetInfo &STI);
+bool hasGFX10_3Insts(const MCSubtargetInfo &STI);
/// Is Reg - scalar register
bool isSGPR(unsigned Reg, const MCRegisterInfo* TRI);
@@ -633,6 +637,13 @@ inline unsigned getOperandSize(const MCInstrDesc &Desc, unsigned OpNo) {
return getOperandSize(Desc.OpInfo[OpNo]);
}
+/// Is this literal inlinable, and not one of the inline constants intended for
+/// floating-point operands.
+LLVM_READNONE
+inline bool isInlinableIntLiteral(int64_t Literal) {
+ return Literal >= -16 && Literal <= 64;
+}
+
/// Is this literal inlinable
LLVM_READNONE
bool isInlinableLiteral64(int64_t Literal, bool HasInv2Pi);
@@ -646,11 +657,35 @@ bool isInlinableLiteral16(int16_t Literal, bool HasInv2Pi);
LLVM_READNONE
bool isInlinableLiteralV216(int32_t Literal, bool HasInv2Pi);
+LLVM_READNONE
+bool isInlinableIntLiteralV216(int32_t Literal);
+
bool isArgPassedInSGPR(const Argument *Arg);
-/// \returns The encoding that will be used for \p ByteOffset in the SMRD
-/// offset field.
-int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
+LLVM_READONLY
+bool isLegalSMRDEncodedUnsignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset);
+
+LLVM_READONLY
+bool isLegalSMRDEncodedSignedOffset(const MCSubtargetInfo &ST,
+ int64_t EncodedOffset,
+ bool IsBuffer);
+
+/// Convert \p ByteOffset to dwords if the subtarget uses dword SMRD immediate
+/// offsets.
+uint64_t convertSMRDOffsetUnits(const MCSubtargetInfo &ST, uint64_t ByteOffset);
+
+/// \returns The encoding that will be used for \p ByteOffset in the
+/// SMRD offset field, or None if it won't fit. On GFX9 and GFX10
+/// S_LOAD instructions have a signed offset; on other subtargets it is
+/// unsigned. S_BUFFER has an unsigned offset for all subtargets.
+Optional<int64_t> getSMRDEncodedOffset(const MCSubtargetInfo &ST,
+ int64_t ByteOffset, bool IsBuffer);
+
+/// \returns The encoding that can be used for a 32-bit literal offset in an
+/// SMRD instruction. This is only useful on CI.
+Optional<int64_t> getSMRDEncodedLiteralOffset32(const MCSubtargetInfo &ST,
+ int64_t ByteOffset);
/// \returns true if this offset is small enough to fit in the SMRD
/// offset field. \p ByteOffset should be the offset in bytes and
@@ -658,7 +693,8 @@ int64_t getSMRDEncodedOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
bool isLegalSMRDImmOffset(const MCSubtargetInfo &ST, int64_t ByteOffset);
bool splitMUBUFOffset(uint32_t Imm, uint32_t &SOffset, uint32_t &ImmOffset,
- const GCNSubtarget *Subtarget, uint32_t Align = 4);
+ const GCNSubtarget *Subtarget,
+ Align Alignment = Align(4));
/// \returns true if the intrinsic is divergent
bool isIntrinsicSourceOfDivergence(unsigned IntrID);
@@ -677,45 +713,76 @@ struct SIModeRegisterDefaults {
/// If this is set, neither input or output denormals are flushed for most f32
/// instructions.
- ///
- /// TODO: Split into separate input and output fields if necessary like the
- /// control bits really provide?
- bool FP32Denormals : 1;
+ bool FP32InputDenormals : 1;
+ bool FP32OutputDenormals : 1;
/// If this is set, neither input or output denormals are flushed for both f64
/// and f16/v2f16 instructions.
- bool FP64FP16Denormals : 1;
+ bool FP64FP16InputDenormals : 1;
+ bool FP64FP16OutputDenormals : 1;
SIModeRegisterDefaults() :
IEEE(true),
DX10Clamp(true),
- FP32Denormals(true),
- FP64FP16Denormals(true) {}
+ FP32InputDenormals(true),
+ FP32OutputDenormals(true),
+ FP64FP16InputDenormals(true),
+ FP64FP16OutputDenormals(true) {}
- // FIXME: Should not depend on the subtarget
- SIModeRegisterDefaults(const Function &F, const GCNSubtarget &ST);
+ SIModeRegisterDefaults(const Function &F);
static SIModeRegisterDefaults getDefaultForCallingConv(CallingConv::ID CC) {
const bool IsCompute = AMDGPU::isCompute(CC);
SIModeRegisterDefaults Mode;
- Mode.DX10Clamp = true;
Mode.IEEE = IsCompute;
- Mode.FP32Denormals = false; // FIXME: Should be on by default.
- Mode.FP64FP16Denormals = true;
return Mode;
}
bool operator ==(const SIModeRegisterDefaults Other) const {
return IEEE == Other.IEEE && DX10Clamp == Other.DX10Clamp &&
- FP32Denormals == Other.FP32Denormals &&
- FP64FP16Denormals == Other.FP64FP16Denormals;
+ FP32InputDenormals == Other.FP32InputDenormals &&
+ FP32OutputDenormals == Other.FP32OutputDenormals &&
+ FP64FP16InputDenormals == Other.FP64FP16InputDenormals &&
+ FP64FP16OutputDenormals == Other.FP64FP16OutputDenormals;
+ }
+
+ bool allFP32Denormals() const {
+ return FP32InputDenormals && FP32OutputDenormals;
+ }
+
+ bool allFP64FP16Denormals() const {
+ return FP64FP16InputDenormals && FP64FP16OutputDenormals;
+ }
+
+ /// Get the encoding value for the FP_DENORM bits of the mode register for the
+ /// FP32 denormal mode.
+ uint32_t fpDenormModeSPValue() const {
+ if (FP32InputDenormals && FP32OutputDenormals)
+ return FP_DENORM_FLUSH_NONE;
+ if (FP32InputDenormals)
+ return FP_DENORM_FLUSH_OUT;
+ if (FP32OutputDenormals)
+ return FP_DENORM_FLUSH_IN;
+ return FP_DENORM_FLUSH_IN_FLUSH_OUT;
+ }
+
+ /// Get the encoding value for the FP_DENORM bits of the mode register for the
+ /// FP64/FP16 denormal mode.
+ uint32_t fpDenormModeDPValue() const {
+ if (FP64FP16InputDenormals && FP64FP16OutputDenormals)
+ return FP_DENORM_FLUSH_NONE;
+ if (FP64FP16InputDenormals)
+ return FP_DENORM_FLUSH_OUT;
+ if (FP64FP16OutputDenormals)
+ return FP_DENORM_FLUSH_IN;
+ return FP_DENORM_FLUSH_IN_FLUSH_OUT;
}
/// Returns true if a flag is compatible if it's enabled in the callee, but
/// disabled in the caller.
static bool oneWayCompatible(bool CallerMode, bool CalleeMode) {
- return CallerMode == CalleeMode || (CallerMode && !CalleeMode);
+ return CallerMode == CalleeMode || (!CallerMode && CalleeMode);
}
// FIXME: Inlining should be OK for dx10-clamp, since the caller's mode should
@@ -727,8 +794,10 @@ struct SIModeRegisterDefaults {
return false;
// Allow inlining denormals enabled into denormals flushed functions.
- return oneWayCompatible(FP64FP16Denormals, CalleeMode.FP64FP16Denormals) &&
- oneWayCompatible(FP32Denormals, CalleeMode.FP32Denormals);
+ return oneWayCompatible(FP64FP16InputDenormals, CalleeMode.FP64FP16InputDenormals) &&
+ oneWayCompatible(FP64FP16OutputDenormals, CalleeMode.FP64FP16OutputDenormals) &&
+ oneWayCompatible(FP32InputDenormals, CalleeMode.FP32InputDenormals) &&
+ oneWayCompatible(FP32OutputDenormals, CalleeMode.FP32OutputDenormals);
}
};
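The flipped condition in oneWayCompatible amounts to this truth table, which is what allows a callee built with denormals enabled to be inlined into a caller that flushes them:

// oneWayCompatible(CallerMode, CalleeMode):
//   caller = false, callee = false -> true   (same mode)
//   caller = true,  callee = true  -> true   (same mode)
//   caller = false, callee = true  -> true   (callee keeps denormals, caller flushes)
//   caller = true,  callee = false -> false  (callee expects flushing that the
//                                             caller would not provide)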
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
index 207e4232e829..ef010a7ac157 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.cpp
@@ -397,6 +397,39 @@ static const char *getRegisterName(unsigned RegNum) {
{0x2c6a, "SPI_SHADER_USER_DATA_VS_30"},
{0x2c6b, "SPI_SHADER_USER_DATA_VS_31"},
+ {0x2c8c, "SPI_SHADER_USER_DATA_GS_0"},
+ {0x2c8d, "SPI_SHADER_USER_DATA_GS_1"},
+ {0x2c8e, "SPI_SHADER_USER_DATA_GS_2"},
+ {0x2c8f, "SPI_SHADER_USER_DATA_GS_3"},
+ {0x2c90, "SPI_SHADER_USER_DATA_GS_4"},
+ {0x2c91, "SPI_SHADER_USER_DATA_GS_5"},
+ {0x2c92, "SPI_SHADER_USER_DATA_GS_6"},
+ {0x2c93, "SPI_SHADER_USER_DATA_GS_7"},
+ {0x2c94, "SPI_SHADER_USER_DATA_GS_8"},
+ {0x2c95, "SPI_SHADER_USER_DATA_GS_9"},
+ {0x2c96, "SPI_SHADER_USER_DATA_GS_10"},
+ {0x2c97, "SPI_SHADER_USER_DATA_GS_11"},
+ {0x2c98, "SPI_SHADER_USER_DATA_GS_12"},
+ {0x2c99, "SPI_SHADER_USER_DATA_GS_13"},
+ {0x2c9a, "SPI_SHADER_USER_DATA_GS_14"},
+ {0x2c9b, "SPI_SHADER_USER_DATA_GS_15"},
+ {0x2c9c, "SPI_SHADER_USER_DATA_GS_16"},
+ {0x2c9d, "SPI_SHADER_USER_DATA_GS_17"},
+ {0x2c9e, "SPI_SHADER_USER_DATA_GS_18"},
+ {0x2c9f, "SPI_SHADER_USER_DATA_GS_19"},
+ {0x2ca0, "SPI_SHADER_USER_DATA_GS_20"},
+ {0x2ca1, "SPI_SHADER_USER_DATA_GS_21"},
+ {0x2ca2, "SPI_SHADER_USER_DATA_GS_22"},
+ {0x2ca3, "SPI_SHADER_USER_DATA_GS_23"},
+ {0x2ca4, "SPI_SHADER_USER_DATA_GS_24"},
+ {0x2ca5, "SPI_SHADER_USER_DATA_GS_25"},
+ {0x2ca6, "SPI_SHADER_USER_DATA_GS_26"},
+ {0x2ca7, "SPI_SHADER_USER_DATA_GS_27"},
+ {0x2ca8, "SPI_SHADER_USER_DATA_GS_28"},
+ {0x2ca9, "SPI_SHADER_USER_DATA_GS_29"},
+ {0x2caa, "SPI_SHADER_USER_DATA_GS_30"},
+ {0x2cab, "SPI_SHADER_USER_DATA_GS_31"},
+
{0x2ccc, "SPI_SHADER_USER_DATA_ES_0"},
{0x2ccd, "SPI_SHADER_USER_DATA_ES_1"},
{0x2cce, "SPI_SHADER_USER_DATA_ES_2"},
@@ -491,38 +524,55 @@ static const char *getRegisterName(unsigned RegNum) {
{0xa310, "PA_SC_SHADER_CONTROL"},
{0xa313, "PA_SC_CONSERVATIVE_RASTERIZATION_CNTL"},
- {0x2d0c, "SPI_SHADER_USER_DATA_LS_0"},
- {0x2d0d, "SPI_SHADER_USER_DATA_LS_1"},
- {0x2d0e, "SPI_SHADER_USER_DATA_LS_2"},
- {0x2d0f, "SPI_SHADER_USER_DATA_LS_3"},
- {0x2d10, "SPI_SHADER_USER_DATA_LS_4"},
- {0x2d11, "SPI_SHADER_USER_DATA_LS_5"},
- {0x2d12, "SPI_SHADER_USER_DATA_LS_6"},
- {0x2d13, "SPI_SHADER_USER_DATA_LS_7"},
- {0x2d14, "SPI_SHADER_USER_DATA_LS_8"},
- {0x2d15, "SPI_SHADER_USER_DATA_LS_9"},
- {0x2d16, "SPI_SHADER_USER_DATA_LS_10"},
- {0x2d17, "SPI_SHADER_USER_DATA_LS_11"},
- {0x2d18, "SPI_SHADER_USER_DATA_LS_12"},
- {0x2d19, "SPI_SHADER_USER_DATA_LS_13"},
- {0x2d1a, "SPI_SHADER_USER_DATA_LS_14"},
- {0x2d1b, "SPI_SHADER_USER_DATA_LS_15"},
- {0x2d1c, "SPI_SHADER_USER_DATA_LS_16"},
- {0x2d1d, "SPI_SHADER_USER_DATA_LS_17"},
- {0x2d1e, "SPI_SHADER_USER_DATA_LS_18"},
- {0x2d1f, "SPI_SHADER_USER_DATA_LS_19"},
- {0x2d20, "SPI_SHADER_USER_DATA_LS_20"},
- {0x2d21, "SPI_SHADER_USER_DATA_LS_21"},
- {0x2d22, "SPI_SHADER_USER_DATA_LS_22"},
- {0x2d23, "SPI_SHADER_USER_DATA_LS_23"},
- {0x2d24, "SPI_SHADER_USER_DATA_LS_24"},
- {0x2d25, "SPI_SHADER_USER_DATA_LS_25"},
- {0x2d26, "SPI_SHADER_USER_DATA_LS_26"},
- {0x2d27, "SPI_SHADER_USER_DATA_LS_27"},
- {0x2d28, "SPI_SHADER_USER_DATA_LS_28"},
- {0x2d29, "SPI_SHADER_USER_DATA_LS_29"},
- {0x2d2a, "SPI_SHADER_USER_DATA_LS_30"},
- {0x2d2b, "SPI_SHADER_USER_DATA_LS_31"},
+ {0x2d0c, "SPI_SHADER_USER_DATA_HS_0"},
+ {0x2d0d, "SPI_SHADER_USER_DATA_HS_1"},
+ {0x2d0e, "SPI_SHADER_USER_DATA_HS_2"},
+ {0x2d0f, "SPI_SHADER_USER_DATA_HS_3"},
+ {0x2d10, "SPI_SHADER_USER_DATA_HS_4"},
+ {0x2d11, "SPI_SHADER_USER_DATA_HS_5"},
+ {0x2d12, "SPI_SHADER_USER_DATA_HS_6"},
+ {0x2d13, "SPI_SHADER_USER_DATA_HS_7"},
+ {0x2d14, "SPI_SHADER_USER_DATA_HS_8"},
+ {0x2d15, "SPI_SHADER_USER_DATA_HS_9"},
+ {0x2d16, "SPI_SHADER_USER_DATA_HS_10"},
+ {0x2d17, "SPI_SHADER_USER_DATA_HS_11"},
+ {0x2d18, "SPI_SHADER_USER_DATA_HS_12"},
+ {0x2d19, "SPI_SHADER_USER_DATA_HS_13"},
+ {0x2d1a, "SPI_SHADER_USER_DATA_HS_14"},
+ {0x2d1b, "SPI_SHADER_USER_DATA_HS_15"},
+ {0x2d1c, "SPI_SHADER_USER_DATA_HS_16"},
+ {0x2d1d, "SPI_SHADER_USER_DATA_HS_17"},
+ {0x2d1e, "SPI_SHADER_USER_DATA_HS_18"},
+ {0x2d1f, "SPI_SHADER_USER_DATA_HS_19"},
+ {0x2d20, "SPI_SHADER_USER_DATA_HS_20"},
+ {0x2d21, "SPI_SHADER_USER_DATA_HS_21"},
+ {0x2d22, "SPI_SHADER_USER_DATA_HS_22"},
+ {0x2d23, "SPI_SHADER_USER_DATA_HS_23"},
+ {0x2d24, "SPI_SHADER_USER_DATA_HS_24"},
+ {0x2d25, "SPI_SHADER_USER_DATA_HS_25"},
+ {0x2d26, "SPI_SHADER_USER_DATA_HS_26"},
+ {0x2d27, "SPI_SHADER_USER_DATA_HS_27"},
+ {0x2d28, "SPI_SHADER_USER_DATA_HS_28"},
+ {0x2d29, "SPI_SHADER_USER_DATA_HS_29"},
+ {0x2d2a, "SPI_SHADER_USER_DATA_HS_30"},
+ {0x2d2b, "SPI_SHADER_USER_DATA_HS_31"},
+
+ {0x2d4c, "SPI_SHADER_USER_DATA_LS_0"},
+ {0x2d4d, "SPI_SHADER_USER_DATA_LS_1"},
+ {0x2d4e, "SPI_SHADER_USER_DATA_LS_2"},
+ {0x2d4f, "SPI_SHADER_USER_DATA_LS_3"},
+ {0x2d50, "SPI_SHADER_USER_DATA_LS_4"},
+ {0x2d51, "SPI_SHADER_USER_DATA_LS_5"},
+ {0x2d52, "SPI_SHADER_USER_DATA_LS_6"},
+ {0x2d53, "SPI_SHADER_USER_DATA_LS_7"},
+ {0x2d54, "SPI_SHADER_USER_DATA_LS_8"},
+ {0x2d55, "SPI_SHADER_USER_DATA_LS_9"},
+ {0x2d56, "SPI_SHADER_USER_DATA_LS_10"},
+ {0x2d57, "SPI_SHADER_USER_DATA_LS_11"},
+ {0x2d58, "SPI_SHADER_USER_DATA_LS_12"},
+ {0x2d59, "SPI_SHADER_USER_DATA_LS_13"},
+ {0x2d5a, "SPI_SHADER_USER_DATA_LS_14"},
+ {0x2d5b, "SPI_SHADER_USER_DATA_LS_15"},
{0xa2aa, "IA_MULTI_VGT_PARAM"},
{0xa2a5, "VGT_GS_MAX_PRIMS_PER_SUBGROUP"},
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
index 0f17c157b206..544ab669d9ae 100644
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUPALMetadata.h
@@ -14,16 +14,12 @@
#ifndef LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
#define LLVM_LIB_TARGET_AMDGPU_AMDGPUPALMETADATA_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/BinaryFormat/MsgPackDocument.h"
-#include <map>
namespace llvm {
-class AMDGPUTargetStreamer;
-class formatted_raw_ostream;
-class MCStreamer;
class Module;
+class StringRef;
class AMDGPUPALMetadata {
unsigned BlobType = 0;
diff --git a/llvm/lib/Target/AMDGPU/VIInstructions.td b/llvm/lib/Target/AMDGPU/VIInstructions.td
deleted file mode 100644
index ec7d8875a746..000000000000
--- a/llvm/lib/Target/AMDGPU/VIInstructions.td
+++ /dev/null
@@ -1,13 +0,0 @@
-//===-- VIInstructions.td - VI Instruction Defintions ---------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-// Instruction definitions for VI and newer.
-//===----------------------------------------------------------------------===//
-
-FIXME: Deleting this file broke buildbots that don't do full rebuilds. This
-file is no longer used by the backend, so it can be deleted once all
-the buildbots update there dependencies.
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index c7aed0985540..17f334f62a30 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -1,4 +1,4 @@
-//===-- VOP1Instructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOP1Instructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -48,9 +48,13 @@ class VOP1_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], bit VOP1On
let mayStore = 0;
let hasSideEffects = 0;
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+
let VOP1 = 1;
let VALU = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let AsmVariantName = AMDGPUAsmVariants.Default;
}
@@ -89,9 +93,7 @@ class VOP1_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
list<dag> ret =
!if(P.HasModifiers,
- [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods0 P.Src0VT:$src0,
- i32:$src0_modifiers,
- i1:$clamp, i32:$omod))))],
+ [(set P.DstVT:$vdst, (node (P.Src0VT (VOP3Mods P.Src0VT:$src0, i32:$src0_modifiers))))],
!if(P.HasOMod,
[(set P.DstVT:$vdst, (node (P.Src0VT (VOP3OMods P.Src0VT:$src0,
i1:$clamp, i32:$omod))))],
@@ -102,8 +104,13 @@ class getVOP1Pat64 <SDPatternOperator node, VOPProfile P> : LetDummies {
multiclass VOP1Inst <string opName, VOPProfile P,
SDPatternOperator node = null_frag> {
- def _e32 : VOP1_Pseudo <opName, P>;
- def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+  // We only want to set this on the basic forms, not the SDWA or DPP variants.
+ defvar should_mov_imm = !eq(opName, "v_mov_b32");
+
+ let isMoveImm = should_mov_imm in {
+ def _e32 : VOP1_Pseudo <opName, P>;
+ def _e64 : VOP3_Pseudo <opName, P, getVOP1Pat64<node, P>.ret>;
+ }
foreach _ = BoolToList<P.HasExtSDWA>.ret in
def _sdwa : VOP1_SDWA_Pseudo <opName, P>;
@@ -146,7 +153,7 @@ let VOPAsmPrefer32Bit = 1 in {
defm V_NOP : VOP1Inst <"v_nop", VOP_NONE>;
}
-let isMoveImm = 1, isReMaterializable = 1, isAsCheapAsAMove = 1 in {
+let isReMaterializable = 1, isAsCheapAsAMove = 1 in {
defm V_MOV_B32 : VOP1Inst <"v_mov_b32", VOP_I32_I32>;
} // End isMoveImm = 1
@@ -183,31 +190,51 @@ def V_READFIRSTLANE_B32 :
let SchedRW = [WriteDoubleCvt] in {
defm V_CVT_I32_F64 : VOP1Inst <"v_cvt_i32_f64", VOP_I32_F64, fp_to_sint>;
+
+let mayRaiseFPException = 0 in {
defm V_CVT_F64_I32 : VOP1Inst <"v_cvt_f64_i32", VOP1_F64_I32, sint_to_fp>;
+}
+
defm V_CVT_F32_F64 : VOP1Inst <"v_cvt_f32_f64", VOP_F32_F64, fpround>;
defm V_CVT_F64_F32 : VOP1Inst <"v_cvt_f64_f32", VOP_F64_F32, fpextend>;
defm V_CVT_U32_F64 : VOP1Inst <"v_cvt_u32_f64", VOP_I32_F64, fp_to_uint>;
+
+let mayRaiseFPException = 0 in {
defm V_CVT_F64_U32 : VOP1Inst <"v_cvt_f64_u32", VOP1_F64_I32, uint_to_fp>;
+}
+
} // End SchedRW = [WriteDoubleCvt]
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteFloatCvt] in {
+
+// XXX: Does this really not raise exceptions? The manual claims the
+// 16-bit ones can.
+let mayRaiseFPException = 0 in {
defm V_CVT_F32_I32 : VOP1Inst <"v_cvt_f32_i32", VOP1_F32_I32, sint_to_fp>;
defm V_CVT_F32_U32 : VOP1Inst <"v_cvt_f32_u32", VOP1_F32_I32, uint_to_fp>;
+}
+
defm V_CVT_U32_F32 : VOP1Inst <"v_cvt_u32_f32", VOP_I32_F32, fp_to_uint>;
defm V_CVT_I32_F32 : VOP1Inst <"v_cvt_i32_f32", VOP_I32_F32, fp_to_sint>;
let FPDPRounding = 1 in {
defm V_CVT_F16_F32 : VOP1Inst <"v_cvt_f16_f32", VOP_F16_F32, fpround>;
} // End FPDPRounding = 1
+
defm V_CVT_F32_F16 : VOP1Inst <"v_cvt_f32_f16", VOP_F32_F16, fpextend>;
+
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_RPI_I32_F32 : VOP1Inst <"v_cvt_rpi_i32_f32", VOP_I32_F32, cvt_rpi_i32_f32>;
defm V_CVT_FLR_I32_F32 : VOP1Inst <"v_cvt_flr_i32_f32", VOP_I32_F32, cvt_flr_i32_f32>;
defm V_CVT_OFF_F32_I4 : VOP1Inst <"v_cvt_off_f32_i4", VOP1_F32_I32>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
+} // End SchedRW = [WriteFloatCvt]
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_F32_UBYTE0 : VOP1Inst <"v_cvt_f32_ubyte0", VOP1_F32_I32, AMDGPUcvt_f32_ubyte0>;
defm V_CVT_F32_UBYTE1 : VOP1Inst <"v_cvt_f32_ubyte1", VOP1_F32_I32, AMDGPUcvt_f32_ubyte1>;
defm V_CVT_F32_UBYTE2 : VOP1Inst <"v_cvt_f32_ubyte2", VOP1_F32_I32, AMDGPUcvt_f32_ubyte2>;
defm V_CVT_F32_UBYTE3 : VOP1Inst <"v_cvt_f32_ubyte3", VOP1_F32_I32, AMDGPUcvt_f32_ubyte3>;
+} // ReadsModeReg = 0, mayRaiseFPException = 0
defm V_FRACT_F32 : VOP1Inst <"v_fract_f32", VOP_F32_F32, AMDGPUfract>;
defm V_TRUNC_F32 : VOP1Inst <"v_trunc_f32", VOP_F32_F32, ftrunc>;
@@ -215,33 +242,30 @@ defm V_CEIL_F32 : VOP1Inst <"v_ceil_f32", VOP_F32_F32, fceil>;
defm V_RNDNE_F32 : VOP1Inst <"v_rndne_f32", VOP_F32_F32, frint>;
defm V_FLOOR_F32 : VOP1Inst <"v_floor_f32", VOP_F32_F32, ffloor>;
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteTrans32] in {
defm V_EXP_F32 : VOP1Inst <"v_exp_f32", VOP_F32_F32, fexp2>;
defm V_LOG_F32 : VOP1Inst <"v_log_f32", VOP_F32_F32, flog2>;
defm V_RCP_F32 : VOP1Inst <"v_rcp_f32", VOP_F32_F32, AMDGPUrcp>;
defm V_RCP_IFLAG_F32 : VOP1Inst <"v_rcp_iflag_f32", VOP_F32_F32, AMDGPUrcp_iflag>;
defm V_RSQ_F32 : VOP1Inst <"v_rsq_f32", VOP_F32_F32, AMDGPUrsq>;
-defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, fsqrt>;
-} // End SchedRW = [WriteQuarterRate32]
+defm V_SQRT_F32 : VOP1Inst <"v_sqrt_f32", VOP_F32_F32, any_amdgcn_sqrt>;
+} // End SchedRW = [WriteTrans32]
-let SchedRW = [WriteDouble] in {
+let SchedRW = [WriteTrans64] in {
defm V_RCP_F64 : VOP1Inst <"v_rcp_f64", VOP_F64_F64, AMDGPUrcp>;
defm V_RSQ_F64 : VOP1Inst <"v_rsq_f64", VOP_F64_F64, AMDGPUrsq>;
-} // End SchedRW = [WriteDouble];
-
-let SchedRW = [WriteDouble] in {
-defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, fsqrt>;
-} // End SchedRW = [WriteDouble]
+defm V_SQRT_F64 : VOP1Inst <"v_sqrt_f64", VOP_F64_F64, any_amdgcn_sqrt>;
+} // End SchedRW = [WriteTrans64]
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteTrans32] in {
defm V_SIN_F32 : VOP1Inst <"v_sin_f32", VOP_F32_F32, AMDGPUsin>;
defm V_COS_F32 : VOP1Inst <"v_cos_f32", VOP_F32_F32, AMDGPUcos>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteTrans32]
defm V_NOT_B32 : VOP1Inst <"v_not_b32", VOP_I32_I32>;
defm V_BFREV_B32 : VOP1Inst <"v_bfrev_b32", VOP_I32_I32, bitreverse>;
defm V_FFBH_U32 : VOP1Inst <"v_ffbh_u32", VOP_I32_I32, AMDGPUffbh_u32>;
-defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32>;
+defm V_FFBL_B32 : VOP1Inst <"v_ffbl_b32", VOP_I32_I32, AMDGPUffbl_b32>;
defm V_FFBH_I32 : VOP1Inst <"v_ffbh_i32", VOP_I32_I32, AMDGPUffbh_i32>;
let SchedRW = [WriteDoubleAdd] in {
@@ -317,7 +341,7 @@ defm V_MOVRELSD_B32 : VOP1Inst <"v_movrelsd_b32", VOP_MOVRELSD>;
defm V_MOV_FED_B32 : VOP1Inst <"v_mov_fed_b32", VOP_I32_I32>;
let SubtargetPredicate = isGFX6GFX7 in {
- let SchedRW = [WriteQuarterRate32] in {
+ let SchedRW = [WriteTrans32] in {
defm V_LOG_CLAMP_F32 :
VOP1Inst<"v_log_clamp_f32", VOP_F32_F32, int_amdgcn_log_clamp>;
defm V_RCP_CLAMP_F32 :
@@ -327,8 +351,8 @@ let SubtargetPredicate = isGFX6GFX7 in {
defm V_RSQ_CLAMP_F32 :
VOP1Inst<"v_rsq_clamp_f32", VOP_F32_F32, AMDGPUrsq_clamp>;
defm V_RSQ_LEGACY_F32 :
- VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, AMDGPUrsq_legacy>;
- } // End SchedRW = [WriteQuarterRate32]
+ VOP1Inst<"v_rsq_legacy_f32", VOP_F32_F32, int_amdgcn_rsq_legacy>;
+ } // End SchedRW = [WriteTrans32]
let SchedRW = [WriteDouble] in {
defm V_RCP_CLAMP_F64 :
@@ -339,10 +363,10 @@ let SubtargetPredicate = isGFX6GFX7 in {
} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX7GFX8GFX9 in {
- let SchedRW = [WriteQuarterRate32] in {
+ let SchedRW = [WriteTrans32] in {
defm V_LOG_LEGACY_F32 : VOP1Inst<"v_log_legacy_f32", VOP_F32_F32>;
defm V_EXP_LEGACY_F32 : VOP1Inst<"v_exp_legacy_f32", VOP_F32_F32>;
- } // End SchedRW = [WriteQuarterRate32]
+ } // End SchedRW = [WriteTrans32]
} // End SubtargetPredicate = isGFX7GFX8GFX9
let SubtargetPredicate = isGFX7Plus in {
@@ -362,15 +386,15 @@ defm V_CVT_F16_I16 : VOP1Inst <"v_cvt_f16_i16", VOP1_F16_I16, sint_to_fp>;
} // End FPDPRounding = 1
defm V_CVT_U16_F16 : VOP1Inst <"v_cvt_u16_f16", VOP_I16_F16, fp_to_uint>;
defm V_CVT_I16_F16 : VOP1Inst <"v_cvt_i16_f16", VOP_I16_F16, fp_to_sint>;
-let SchedRW = [WriteQuarterRate32] in {
+let SchedRW = [WriteTrans32] in {
defm V_RCP_F16 : VOP1Inst <"v_rcp_f16", VOP_F16_F16, AMDGPUrcp>;
-defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, fsqrt>;
+defm V_SQRT_F16 : VOP1Inst <"v_sqrt_f16", VOP_F16_F16, any_amdgcn_sqrt>;
defm V_RSQ_F16 : VOP1Inst <"v_rsq_f16", VOP_F16_F16, AMDGPUrsq>;
defm V_LOG_F16 : VOP1Inst <"v_log_f16", VOP_F16_F16, flog2>;
defm V_EXP_F16 : VOP1Inst <"v_exp_f16", VOP_F16_F16, fexp2>;
defm V_SIN_F16 : VOP1Inst <"v_sin_f16", VOP_F16_F16, AMDGPUsin>;
defm V_COS_F16 : VOP1Inst <"v_cos_f16", VOP_F16_F16, AMDGPUcos>;
-} // End SchedRW = [WriteQuarterRate32]
+} // End SchedRW = [WriteTrans32]
defm V_FREXP_MANT_F16 : VOP1Inst <"v_frexp_mant_f16", VOP_F16_F16, int_amdgcn_frexp_mant>;
defm V_FREXP_EXP_I16_F16 : VOP1Inst <"v_frexp_exp_i16_f16", VOP_I16_F16, int_amdgcn_frexp_exp>;
defm V_FLOOR_F16 : VOP1Inst <"v_floor_f16", VOP_F16_F16, ffloor>;
@@ -414,8 +438,11 @@ let SubtargetPredicate = isGFX9Plus in {
}
defm V_SAT_PK_U8_I16 : VOP1Inst<"v_sat_pk_u8_i16", VOP_I32_I32>;
- defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
- defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+
+ let mayRaiseFPException = 0 in {
+ defm V_CVT_NORM_I16_F16 : VOP1Inst<"v_cvt_norm_i16_f16", VOP_I16_F16>;
+ defm V_CVT_NORM_U16_F16 : VOP1Inst<"v_cvt_norm_u16_f16", VOP_I16_F16>;
+ } // End mayRaiseFPException = 0
} // End SubtargetPredicate = isGFX9Plus
let SubtargetPredicate = isGFX9Only in {
@@ -458,7 +485,7 @@ class VOP1_DPP<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl, bit isDPP1
class VOP1_DPP16<bits<8> op, VOP1_DPP_Pseudo ps, VOPProfile p = ps.Pfl> :
VOP1_DPP<op, ps, p, 1>,
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.GFX10> {
- let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
}
@@ -475,7 +502,7 @@ class VOP1_DPP8<bits<8> op, VOP1_Pseudo ps, VOPProfile p = ps.Pfl> :
let Inst{24-17} = !if(p.EmitDst, vdst{7-0}, 0);
let Inst{31-25} = 0x3f;
- let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let AssemblerPredicate = HasDPP8;
let SubtargetPredicate = HasDPP8;
}
@@ -812,42 +839,23 @@ def V_MOV_B32_indirect : VPseudoInstSI<(outs),
let SubtargetPredicate = isGFX8GFX9;
}
-// This is a pseudo variant of the v_movreld_b32 instruction in which the
-// vector operand appears only twice, once as def and once as use. Using this
-// pseudo avoids problems with the Two Address instructions pass.
-class V_MOVRELD_B32_pseudo<RegisterClass rc> : VPseudoInstSI <
- (outs rc:$vdst),
- (ins rc:$vsrc, VSrc_b32:$val, i32imm:$offset)> {
- let VOP1 = 1;
-
- let Constraints = "$vsrc = $vdst";
- let Uses = [M0, EXEC];
-
- let SubtargetPredicate = HasMovrel;
-}
-
-def V_MOVRELD_B32_V1 : V_MOVRELD_B32_pseudo<VGPR_32>;
-def V_MOVRELD_B32_V2 : V_MOVRELD_B32_pseudo<VReg_64>;
-def V_MOVRELD_B32_V4 : V_MOVRELD_B32_pseudo<VReg_128>;
-def V_MOVRELD_B32_V8 : V_MOVRELD_B32_pseudo<VReg_256>;
-def V_MOVRELD_B32_V16 : V_MOVRELD_B32_pseudo<VReg_512>;
-
let OtherPredicates = [isGFX8Plus] in {
def : GCNPat <
- (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask, timm:$bank_mask,
- timm:$bound_ctrl)),
- (V_MOV_B32_dpp $src, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i32 (int_amdgcn_mov_dpp i32:$src, timm:$dpp_ctrl, timm:$row_mask,
+ timm:$bank_mask, timm:$bound_ctrl)),
+ (V_MOV_B32_dpp VGPR_32:$src, VGPR_32:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
def : GCNPat <
- (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl, timm:$row_mask,
- timm:$bank_mask, timm:$bound_ctrl)),
- (V_MOV_B32_dpp $old, $src, (as_i32imm $dpp_ctrl),
- (as_i32imm $row_mask), (as_i32imm $bank_mask),
- (as_i1imm $bound_ctrl))
+ (i32 (int_amdgcn_update_dpp i32:$old, i32:$src, timm:$dpp_ctrl,
+ timm:$row_mask, timm:$bank_mask,
+ timm:$bound_ctrl)),
+ (V_MOV_B32_dpp VGPR_32:$old, VGPR_32:$src, (as_i32timm $dpp_ctrl),
+ (as_i32timm $row_mask), (as_i32timm $bank_mask),
+ (as_i1timm $bound_ctrl))
>;
} // End OtherPredicates = [isGFX8Plus]
@@ -907,6 +915,7 @@ defm V_SCREEN_PARTITION_4SE_B32 : VOP1_Real_gfx9 <0x37>;
let OtherPredicates = [isGFX10Plus] in {
def : GCNPat <
(i32 (int_amdgcn_mov_dpp8 i32:$src, timm:$dpp8)),
- (V_MOV_B32_dpp8_gfx10 $src, $src, (as_i32imm $dpp8), (i32 DPP8Mode.FI_0))
+ (V_MOV_B32_dpp8_gfx10 VGPR_32:$src, VGPR_32:$src,
+ (as_i32timm $dpp8), (i32 DPP8Mode.FI_0))
>;
} // End OtherPredicates = [isGFX10Plus]
diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
index aaadc3dbc721..aa37dbf1418f 100644
--- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td
@@ -1,4 +1,4 @@
-//===-- VOP2Instructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOP2Instructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -69,9 +69,13 @@ class VOP2_Pseudo <string opName, VOPProfile P, list<dag> pattern=[], string suf
let mayStore = 0;
let hasSideEffects = 0;
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+
let VOP2 = 1;
let VALU = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let AsmVariantName = AMDGPUAsmVariants.Default;
}
@@ -459,17 +463,18 @@ def VOP_WRITELANE : VOPProfile<[i32, i32, i32, i32]> {
//===----------------------------------------------------------------------===//
defm V_CNDMASK_B32 : VOP2eInst <"v_cndmask_b32", VOP2e_I32_I32_I32_I1>;
+let SubtargetPredicate = HasMadMacF32Insts in
def V_MADMK_F32 : VOP2_Pseudo <"v_madmk_f32", VOP_MADMK_F32, []>;
let isCommutable = 1 in {
-defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, fadd>;
+defm V_ADD_F32 : VOP2Inst <"v_add_f32", VOP_F32_F32_F32, any_fadd>;
defm V_SUB_F32 : VOP2Inst <"v_sub_f32", VOP_F32_F32_F32, fsub>;
defm V_SUBREV_F32 : VOP2Inst <"v_subrev_f32", VOP_F32_F32_F32, null_frag, "v_sub_f32">;
defm V_MUL_LEGACY_F32 : VOP2Inst <"v_mul_legacy_f32", VOP_F32_F32_F32, AMDGPUfmul_legacy>;
-defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, fmul>;
-defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32, AMDGPUmul_i24>;
+defm V_MUL_F32 : VOP2Inst <"v_mul_f32", VOP_F32_F32_F32, any_fmul>;
+defm V_MUL_I32_I24 : VOP2Inst <"v_mul_i32_i24", VOP_I32_I32_I32_ARITH, AMDGPUmul_i24>;
defm V_MUL_HI_I32_I24 : VOP2Inst <"v_mul_hi_i32_i24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_i24>;
-defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32, AMDGPUmul_u24>;
+defm V_MUL_U32_U24 : VOP2Inst <"v_mul_u32_u24", VOP_I32_I32_I32_ARITH, AMDGPUmul_u24>;
defm V_MUL_HI_U32_U24 : VOP2Inst <"v_mul_hi_u32_u24", VOP_PAT_GEN<VOP_I32_I32_I32, 2>, AMDGPUmulhi_u24>;
defm V_MIN_F32 : VOP2Inst <"v_min_f32", VOP_F32_F32_F32, fminnum_like>;
defm V_MAX_F32 : VOP2Inst <"v_max_f32", VOP_F32_F32_F32, fmaxnum_like>;
@@ -484,12 +489,16 @@ defm V_AND_B32 : VOP2Inst <"v_and_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, and>;
defm V_OR_B32 : VOP2Inst <"v_or_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, or>;
defm V_XOR_B32 : VOP2Inst <"v_xor_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, xor>;
+let mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasMadMacF32Insts in {
let Constraints = "$vdst = $src2", DisableEncoding="$src2",
isConvertibleToThreeAddress = 1 in {
defm V_MAC_F32 : VOP2Inst <"v_mac_f32", VOP_MAC_F32>;
}
def V_MADAK_F32 : VOP2_Pseudo <"v_madak_f32", VOP_MADAK_F32, []>;
+} // End SubtargetPredicate = HasMadMacF32Insts
+}
// No patterns so that the scalar instructions are always selected.
// The scalar versions will be replaced with vector when needed later.
@@ -529,8 +538,12 @@ defm V_MBCNT_LO_U32_B32 : VOP2Inst <"v_mbcnt_lo_u32_b32", VOP_NO_EXT<VOP_I32_I32
defm V_MBCNT_HI_U32_B32 : VOP2Inst <"v_mbcnt_hi_u32_b32", VOP_NO_EXT<VOP_I32_I32_I32>, int_amdgcn_mbcnt_hi>;
defm V_LDEXP_F32 : VOP2Inst <"v_ldexp_f32", VOP_NO_EXT<VOP_F32_F32_I32>, AMDGPUldexp>;
defm V_CVT_PKACCUM_U8_F32 : VOP2Inst <"v_cvt_pkaccum_u8_f32", VOP_NO_EXT<VOP_I32_F32_I32>>; // TODO: set "Uses = dst"
+
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CVT_PKNORM_I16_F32 : VOP2Inst <"v_cvt_pknorm_i16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_i16_f32>;
defm V_CVT_PKNORM_U16_F32 : VOP2Inst <"v_cvt_pknorm_u16_f32", VOP_NO_EXT<VOP_V2I16_F32_F32>, AMDGPUpknorm_u16_f32>;
+}
+
defm V_CVT_PKRTZ_F16_F32 : VOP2Inst <"v_cvt_pkrtz_f16_f32", VOP_NO_EXT<VOP_V2F16_F32_F32>, AMDGPUpkrtz_f16_f32>;
defm V_CVT_PK_U16_U32 : VOP2Inst <"v_cvt_pk_u16_u32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_u16_u32>;
defm V_CVT_PK_I16_I32 : VOP2Inst <"v_cvt_pk_i16_i32", VOP_NO_EXT<VOP_V2I16_I32_I32>, AMDGPUpk_i16_i32>;
@@ -541,14 +554,18 @@ defm V_MIN_LEGACY_F32 : VOP2Inst <"v_min_legacy_f32", VOP_F32_F32_F32, AMDGPUfmi
defm V_MAX_LEGACY_F32 : VOP2Inst <"v_max_legacy_f32", VOP_F32_F32_F32, AMDGPUfmax_legacy>;
} // End SubtargetPredicate = isGFX6GFX7
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
let isCommutable = 1 in {
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let OtherPredicates = [HasMadMacF32Insts] in
defm V_MAC_LEGACY_F32 : VOP2Inst <"v_mac_legacy_f32", VOP_F32_F32_F32>;
-defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_I32_I32_I32, srl>;
-defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_I32_I32_I32, sra>;
-defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_I32_I32_I32, shl>;
-} // End isCommutable = 1
} // End SubtargetPredicate = isGFX6GFX7GFX10
+let SubtargetPredicate = isGFX6GFX7 in {
+defm V_LSHR_B32 : VOP2Inst <"v_lshr_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, srl>;
+defm V_ASHR_I32 : VOP2Inst <"v_ashr_i32", VOP_PAT_GEN<VOP_I32_I32_I32>, sra>;
+defm V_LSHL_B32 : VOP2Inst <"v_lshl_b32", VOP_PAT_GEN<VOP_I32_I32_I32>, shl>;
+} // End SubtargetPredicate = isGFX6GFX7
+} // End isCommutable = 1
+
class DivergentBinOp<SDPatternOperator Op, VOP_Pseudo Inst> :
GCNPat<
@@ -617,15 +634,19 @@ defm V_ASHRREV_I16 : VOP2Inst <"v_ashrrev_i16", VOP_I16_I16_I16, ashr_rev>;
let isCommutable = 1 in {
let FPDPRounding = 1 in {
-defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, fadd>;
+defm V_ADD_F16 : VOP2Inst <"v_add_f16", VOP_F16_F16_F16, any_fadd>;
defm V_SUB_F16 : VOP2Inst <"v_sub_f16", VOP_F16_F16_F16, fsub>;
defm V_SUBREV_F16 : VOP2Inst <"v_subrev_f16", VOP_F16_F16_F16, null_frag, "v_sub_f16">;
-defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, fmul>;
+defm V_MUL_F16 : VOP2Inst <"v_mul_f16", VOP_F16_F16_F16, any_fmul>;
+
+let mayRaiseFPException = 0 in {
def V_MADAK_F16 : VOP2_Pseudo <"v_madak_f16", VOP_MADAK_F16, [], "">;
+}
+
} // End FPDPRounding = 1
-defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16, add>;
-defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16, sub>;
-defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16, null_frag, "v_sub_u16">;
+defm V_ADD_U16 : VOP2Inst <"v_add_u16", VOP_I16_I16_I16_ARITH, add>;
+defm V_SUB_U16 : VOP2Inst <"v_sub_u16" , VOP_I16_I16_I16_ARITH, sub>;
+defm V_SUBREV_U16 : VOP2Inst <"v_subrev_u16", VOP_I16_I16_I16_ARITH, null_frag, "v_sub_u16">;
defm V_MUL_LO_U16 : VOP2Inst <"v_mul_lo_u16", VOP_I16_I16_I16, mul>;
defm V_MAX_F16 : VOP2Inst <"v_max_f16", VOP_F16_F16_F16, fmaxnum_like>;
defm V_MIN_F16 : VOP2Inst <"v_min_f16", VOP_F16_F16_F16, fminnum_like>;
@@ -770,16 +791,16 @@ let Predicates = [Has16BitInsts] in {
// an inline immediate than -c.
// TODO: Also do for 64-bit.
def : GCNPat<
- (add i16:$src0, (i16 NegSubInlineConst16:$src1)),
- (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1)
+ (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)),
+ (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
let Predicates = [Has16BitInsts, isGFX7GFX8GFX9] in {
def : GCNPat<
- (i32 (zext (add i16:$src0, (i16 NegSubInlineConst16:$src1)))),
- (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineConst16:$src1)
+ (i32 (zext (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)))),
+ (V_SUB_U16_e64 VSrc_b16:$src0, NegSubInlineIntConst16:$src1)
>;
defm : Arithmetic_i16_0Hi_Pats<add, V_ADD_U16_e64>;
@@ -831,7 +852,7 @@ class VOP2_DPP<bits<6> op, VOP2_DPP_Pseudo ps,
class Base_VOP2_DPP16<bits<6> op, VOP2_DPP_Pseudo ps,
string opName = ps.OpName, VOPProfile p = ps.Pfl> :
VOP2_DPP<op, ps, opName, p, 1> {
- let AssemblerPredicate = !if(p.HasExt, HasDPP16, DisableInst);
+ let AssemblerPredicate = HasDPP16;
let SubtargetPredicate = HasDPP16;
}
@@ -857,7 +878,7 @@ class VOP2_DPP8<bits<6> op, VOP2_Pseudo ps,
let Inst{30-25} = op;
let Inst{31} = 0x0;
- let AssemblerPredicate = !if(p.HasExt, HasDPP8, DisableInst);
+ let AssemblerPredicate = HasDPP8;
let SubtargetPredicate = HasDPP8;
}
@@ -1250,9 +1271,9 @@ defm V_SUBBREV_U32 : VOP2be_Real_gfx6_gfx7<0x02a>;
defm V_READLANE_B32 : VOP2Only_Real_gfx6_gfx7<0x001>;
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP2Only_Real_gfx6_gfx7<0x002>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
let SubtargetPredicate = isGFX6GFX7 in {
defm : VOP2eInstAliases<V_CNDMASK_B32_e32, V_CNDMASK_B32_e32_gfx6_gfx7>;
@@ -1261,6 +1282,7 @@ let SubtargetPredicate = isGFX6GFX7 in {
defm V_ADD_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x003>;
defm V_SUB_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x004>;
defm V_SUBREV_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x005>;
+let OtherPredicates = [HasMadMacF32Insts] in
defm V_MAC_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x006>;
defm V_MUL_LEGACY_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x007>;
defm V_MUL_F32 : VOP2_Real_gfx6_gfx7_gfx10<0x008>;
@@ -1593,3 +1615,9 @@ let SubtargetPredicate = HasDot3Insts in {
let SubtargetPredicate = HasPkFmacF16Inst in {
defm V_PK_FMAC_F16 : VOP2_Real_e32_vi<0x3c>;
} // End SubtargetPredicate = HasPkFmacF16Inst
+
+let SubtargetPredicate = HasDot3Insts in {
+ // NB: Opcode conflicts with V_DOT2C_F32_F16
+ let DecoderNamespace = "GFX10_B" in
+ defm V_DOT8C_I32_I4 : VOP2_Real_DOT_ACC_gfx10<0x02>;
+}
diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
index 67c8b926302d..169949f2171a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td
@@ -1,4 +1,4 @@
-//===-- VOP3Instructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOP3Instructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -32,20 +32,26 @@ class getVOP3ModPat<VOPProfile P, SDPatternOperator node> {
ret1));
}
-class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
+class getVOP3PModPat<VOPProfile P, SDPatternOperator node, bit HasExplicitClamp> {
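+  // When HasExplicitClamp is set, the clamp immediate is passed as a trailing
+  // operand of the node itself (as the dot-product intrinsics expect) instead
+  // of being left to the profile's default clamp handling.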
+ dag src0_dag = (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers));
+ dag src1_dag = (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers));
+ dag src2_dag = (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers));
+ dag clamp_dag = (i1 timm:$clamp);
+
list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
- (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers)),
- (P.Src2VT (VOP3PMods P.Src2VT:$src2, i32:$src2_modifiers))))];
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, src2_dag)))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
- (P.Src0VT (VOP3PMods P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3PMods P.Src1VT:$src1, i32:$src1_modifiers))))];
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag, src1_dag)))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3PMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ !if(HasExplicitClamp,
+ (DivergentFragOrOp<node, P>.ret src0_dag, clamp_dag),
+ (DivergentFragOrOp<node, P>.ret src0_dag)))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -54,18 +60,16 @@ class getVOP3PModPat<VOPProfile P, SDPatternOperator node> {
class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
- (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
(P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3OpSel P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
- (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))),
- (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers)),
+ (P.Src1VT (VOP3OpSel P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSel P.Src0VT:$src0, i32:$src0_modifiers))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -74,18 +78,18 @@ class getVOP3OpSelPat<VOPProfile P, SDPatternOperator node> {
class getVOP3OpSelModPat<VOPProfile P, SDPatternOperator node> {
list<dag> ret3 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp),
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT !if(P.HasClamp, (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers),
(VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers)),
(P.Src2VT (VOP3OpSelMods P.Src2VT:$src2, i32:$src2_modifiers))))];
list<dag> ret2 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp)),
+ (DivergentFragOrOp<node, P>.ret !if(P.HasClamp, (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers)),
(P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))),
(P.Src1VT (VOP3OpSelMods P.Src1VT:$src1, i32:$src1_modifiers))))];
list<dag> ret1 = [(set P.DstVT:$vdst,
- (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods0 P.Src0VT:$src0, i32:$src0_modifiers, i1:$clamp))))];
+ (DivergentFragOrOp<node, P>.ret (P.Src0VT (VOP3OpSelMods P.Src0VT:$src0, i32:$src0_modifiers))))];
list<dag> ret = !if(!eq(P.NumSrcArgs, 3), ret3,
!if(!eq(P.NumSrcArgs, 2), ret2,
@@ -224,12 +228,13 @@ def VOP3b_I64_I1_I32_I32_I64 : VOPProfile<[i64, i32, i32, i64]> {
class VOP3Interp<string OpName, VOPProfile P, list<dag> pattern = []> :
VOP3_Pseudo<OpName, P, pattern> {
let AsmMatchConverter = "cvtVOP3Interp";
+ let mayRaiseFPException = 0;
}
def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
let Ins64 = (ins Src0Mod:$src0_modifiers, VRegSrc_32:$src0,
Attr:$attr, AttrChan:$attrchan,
- clampmod:$clamp, omod:$omod);
+ clampmod0:$clamp, omod0:$omod);
let Asm64 = "$vdst, $src0_modifiers, $attr$attrchan$clamp$omod";
}
@@ -237,7 +242,7 @@ def VOP3_INTERP : VOPProfile<[f32, f32, i32, untyped]> {
def VOP3_INTERP_MOV : VOPProfile<[f32, i32, i32, untyped]> {
let Ins64 = (ins InterpSlot:$src0,
Attr:$attr, AttrChan:$attrchan,
- clampmod:$clamp, omod:$omod);
+ clampmod0:$clamp, omod0:$omod);
let Asm64 = "$vdst, $src0, $attr$attrchan$clamp$omod";
@@ -286,17 +291,25 @@ class VOP3_INTERP16 <list<ValueType> ArgVT> : VOPProfile<ArgVT> {
let isCommutable = 1 in {
+let mayRaiseFPException = 0 in {
+let SubtargetPredicate = HasMadMacF32Insts in {
def V_MAD_LEGACY_F32 : VOP3Inst <"v_mad_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
def V_MAD_F32 : VOP3Inst <"v_mad_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fmad>;
+} // End SubtargetPredicate = HasMadMacF32Insts
+
+let SubtargetPredicate = HasNoMadMacF32Insts in
+def V_FMA_LEGACY_F32 : VOP3Inst <"v_fma_legacy_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+}
+
def V_MAD_I32_I24 : VOP3Inst <"v_mad_i32_i24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_MAD_U32_U24 : VOP3Inst <"v_mad_u32_u24", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
-def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, fma>;
+def V_FMA_F32 : VOP3Inst <"v_fma_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, any_fma>;
def V_LERP_U8 : VOP3Inst <"v_lerp_u8", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_lerp>;
let SchedRW = [WriteDoubleAdd] in {
let FPDPRounding = 1 in {
-def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, fma>;
-def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, fadd, 1>;
+def V_FMA_F64 : VOP3Inst <"v_fma_f64", VOP3_Profile<VOP_F64_F64_F64_F64>, any_fma>;
+def V_ADD_F64 : VOP3Inst <"v_add_f64", VOP3_Profile<VOP_F64_F64_F64>, any_fadd, 1>;
def V_MUL_F64 : VOP3Inst <"v_mul_f64", VOP3_Profile<VOP_F64_F64_F64>, fmul, 1>;
} // End FPDPRounding = 1
def V_MIN_F64 : VOP3Inst <"v_min_f64", VOP3_Profile<VOP_F64_F64_F64>, fminnum_like, 1>;
@@ -310,7 +323,7 @@ def V_MUL_LO_I32 : VOP3Inst <"v_mul_lo_i32", VOP3_Profile<VOP_I32_I32_I32>>;
def V_MUL_HI_I32 : VOP3Inst <"v_mul_hi_i32", VOP3_Profile<VOP_I32_I32_I32>, mulhs>;
} // End SchedRW = [WriteQuarterRate32]
-let Uses = [VCC, EXEC] in {
+let Uses = [MODE, VCC, EXEC] in {
// v_div_fmas_f32:
// result = src0 * src1 + src2
// if (vcc)
@@ -332,15 +345,20 @@ def V_DIV_FMAS_F64 : VOP3_Pseudo <"v_div_fmas_f64", VOP_F64_F64_F64_F64_VCC, []>
} // End isCommutable = 1
+let mayRaiseFPException = 0 in {
def V_CUBEID_F32 : VOP3Inst <"v_cubeid_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubeid>;
def V_CUBESC_F32 : VOP3Inst <"v_cubesc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubesc>;
def V_CUBETC_F32 : VOP3Inst <"v_cubetc_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubetc>;
def V_CUBEMA_F32 : VOP3Inst <"v_cubema_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, int_amdgcn_cubema>;
+} // End mayRaiseFPException = 0
+
def V_BFE_U32 : VOP3Inst <"v_bfe_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_u32>;
def V_BFE_I32 : VOP3Inst <"v_bfe_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfe_i32>;
def V_BFI_B32 : VOP3Inst <"v_bfi_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUbfi>;
-def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbit>;
+def V_ALIGNBIT_B32 : VOP3Inst <"v_alignbit_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, fshr>;
def V_ALIGNBYTE_B32 : VOP3Inst <"v_alignbyte_b32", VOP3_Profile<VOP_I32_I32_I32_I32>, int_amdgcn_alignbyte>;
+
+let mayRaiseFPException = 0 in { // XXX - Seems suspect but manual doesn't say it does
def V_MIN3_F32 : VOP3Inst <"v_min3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmin3>;
def V_MIN3_I32 : VOP3Inst <"v_min3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmin3>;
def V_MIN3_U32 : VOP3Inst <"v_min3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumin3>;
@@ -350,6 +368,8 @@ def V_MAX3_U32 : VOP3Inst <"v_max3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDG
def V_MED3_F32 : VOP3Inst <"v_med3_f32", VOP3_Profile<VOP_F32_F32_F32_F32>, AMDGPUfmed3>;
def V_MED3_I32 : VOP3Inst <"v_med3_i32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUsmed3>;
def V_MED3_U32 : VOP3Inst <"v_med3_u32", VOP3_Profile<VOP_I32_I32_I32_I32>, AMDGPUumed3>;
+} // End mayRaiseFPException = 0
+
def V_SAD_U8 : VOP3Inst <"v_sad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_SAD_HI_U8 : VOP3Inst <"v_sad_hi_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
def V_SAD_U16 : VOP3Inst <"v_sad_u16", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -362,6 +382,8 @@ def V_DIV_FIXUP_F64 : VOP3Inst <"v_div_fixup_f64", VOP3_Profile<VOP_F64_F64_F64_
def V_LDEXP_F64 : VOP3Inst <"v_ldexp_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUldexp, 1>;
} // End SchedRW = [WriteDoubleAdd], FPDPRounding = 1
+
+let mayRaiseFPException = 0 in { // Seems suspicious but manual doesn't say it does.
def V_DIV_SCALE_F32 : VOP3_Pseudo <"v_div_scale_f32", VOP3b_F32_I1_F32_F32_F32, [], 1> {
let SchedRW = [WriteFloatFMA, WriteSALU];
let AsmMatchConverter = "";
@@ -373,6 +395,7 @@ def V_DIV_SCALE_F64 : VOP3_Pseudo <"v_div_scale_f64", VOP3b_F64_I1_F64_F64_F64,
let AsmMatchConverter = "";
let FPDPRounding = 1;
}
+} // End mayRaiseFPException = 0
def V_MSAD_U8 : VOP3Inst <"v_msad_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_CLAMP>>;
@@ -380,17 +403,16 @@ let Constraints = "@earlyclobber $vdst" in {
def V_MQSAD_PK_U16_U8 : VOP3Inst <"v_mqsad_pk_u16_u8", VOP3_Profile<VOP_I64_I64_I32_I64, VOP3_CLAMP>>;
} // End Constraints = "@earlyclobber $vdst"
-def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, AMDGPUtrig_preop> {
+def V_TRIG_PREOP_F64 : VOP3Inst <"v_trig_preop_f64", VOP3_Profile<VOP_F64_F64_I32>, int_amdgcn_trig_preop> {
let SchedRW = [WriteDouble];
}
let SchedRW = [Write64Bit] in {
-let SubtargetPredicate = isGFX6GFX7GFX10 in {
+let SubtargetPredicate = isGFX6GFX7 in {
def V_LSHL_B64 : VOP3Inst <"v_lshl_b64", VOP3_Profile<VOP_I64_I64_I32>, shl>;
def V_LSHR_B64 : VOP3Inst <"v_lshr_b64", VOP3_Profile<VOP_I64_I64_I32>, srl>;
def V_ASHR_I64 : VOP3Inst <"v_ashr_i64", VOP3_Profile<VOP_I64_I64_I32>, sra>;
-def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
-} // End SubtargetPredicate = isGFX6GFX7GFX10
+} // End SubtargetPredicate = isGFX6GFX7
let SubtargetPredicate = isGFX8Plus in {
def V_LSHLREV_B64 : VOP3Inst <"v_lshlrev_b64", VOP3_Profile<VOP_I64_I32_I64>, lshl_rev>;
@@ -399,6 +421,23 @@ def V_ASHRREV_I64 : VOP3Inst <"v_ashrrev_i64", VOP3_Profile<VOP_I64_I32_I64>, as
} // End SubtargetPredicate = isGFX8Plus
} // End SchedRW = [Write64Bit]
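+// Expand divergent sign extension of i16 with v_bfe_i32 (offset 0, width 16);
+// for the 64-bit result the high half is filled with the sign bits via an
+// arithmetic shift right by 31.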
+def : GCNPat<
+ (i64 (getDivergentFrag<sext>.ret i16:$src)),
+ (REG_SEQUENCE VReg_64,
+ (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10)))), sub0,
+ (i32 (COPY_TO_REGCLASS
+ (V_ASHRREV_I32_e32 (S_MOV_B32 (i32 0x1f)), (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+ ), VGPR_32)), sub1)
+>;
+
+def : GCNPat<
+ (i32 (getDivergentFrag<sext>.ret i16:$src)),
+ (i32 (V_BFE_I32 $src, (S_MOV_B32 (i32 0)), (S_MOV_B32 (i32 0x10))))
+>;
+
+let SubtargetPredicate = isGFX6GFX7GFX10 in {
+def V_MULLIT_F32 : VOP3Inst <"v_mullit_f32", VOP3_Profile<VOP_F32_F32_F32_F32>>;
+} // End SubtargetPredicate = isGFX6GFX7GFX10
let SchedRW = [Write32Bit] in {
let SubtargetPredicate = isGFX8Plus in {
@@ -417,7 +456,7 @@ let isCommutable = 1 in {
let SchedRW = [WriteQuarterRate32, WriteSALU] in {
def V_MAD_U64_U32 : VOP3Inst <"v_mad_u64_u32", VOP3b_I64_I1_I32_I32_I64>;
def V_MAD_I64_I32 : VOP3Inst <"v_mad_i64_i32", VOP3b_I64_I1_I32_I32_I64>;
-} // End SchedRW = [WriteDouble, WriteSALU]
+} // End SchedRW = [WriteQuarterRate32, WriteSALU]
} // End isCommutable = 1
} // End SubtargetPredicate = isGFX7Plus
@@ -434,11 +473,11 @@ def V_DIV_FIXUP_F16_gfx9 : VOP3Inst <"v_div_fixup_f16_gfx9",
let FPDPRounding = 1;
}
-def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fma> {
+def V_FMA_F16 : VOP3Inst <"v_fma_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, any_fma> {
let Predicates = [Has16BitInsts, isGFX8Only];
let FPDPRounding = 1;
}
-def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, fma> {
+def V_FMA_F16_gfx9 : VOP3Inst <"v_fma_f16_gfx9", VOP3_Profile<VOP_F16_F16_F16_F16, VOP3_OPSEL>, any_fma> {
let renamedInGFX9 = 1;
let Predicates = [Has16BitInsts, isGFX9Plus];
let FPDPRounding = 1;
@@ -451,7 +490,7 @@ def V_MAD_U16 : VOP3Inst <"v_mad_u16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CL
def V_MAD_I16 : VOP3Inst <"v_mad_i16", VOP3_Profile<VOP_I16_I16_I16_I16, VOP3_CLAMP>>;
let FPDPRounding = 1 in {
def V_MAD_F16 : VOP3Inst <"v_mad_f16", VOP3_Profile<VOP_F16_F16_F16_F16>, fmad>;
-let Uses = [M0, EXEC] in {
+let Uses = [MODE, M0, EXEC] in {
// For some reason the intrinsic operands are in a different order
// from the instruction operands.
def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i32, f32]>,
@@ -462,7 +501,7 @@ def V_INTERP_P2_F16 : VOP3Interp <"v_interp_p2_f16", VOP3_INTERP16<[f16, f32, i3
(i32 timm:$attr),
(i1 timm:$high),
M0))]>;
-} // End Uses = [M0, EXEC]
+} // End Uses = [MODE, M0, EXEC]
} // End FPDPRounding = 1
} // End renamedInGFX9 = 1
@@ -478,32 +517,29 @@ def V_MAD_I16_gfx9 : VOP3Inst <"v_mad_i16_gfx9", VOP3_Profile<VOP_I16_I16_I16_
def V_INTERP_P2_F16_gfx9 : VOP3Interp <"v_interp_p2_f16_gfx9", VOP3_INTERP16<[f16, f32, i32, f32]>>;
} // End SubtargetPredicate = isGFX9Plus
-let Uses = [M0, EXEC], FPDPRounding = 1 in {
+let Uses = [MODE, M0, EXEC], FPDPRounding = 1 in {
def V_INTERP_P1LL_F16 : VOP3Interp <"v_interp_p1ll_f16", VOP3_INTERP16<[f32, f32, i32, untyped]>,
- [(set f32:$vdst, (AMDGPUinterp_p1ll_f16 f32:$src0, (i32 timm:$attrchan),
- (i32 timm:$attr),
- (i32 timm:$src0_modifiers),
- (i1 timm:$high),
- (i1 timm:$clamp),
- (i32 timm:$omod)))]>;
-def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>,
- [(set f32:$vdst, (AMDGPUinterp_p1lv_f16 f32:$src0, (i32 timm:$attrchan),
- (i32 timm:$attr),
- (i32 timm:$src0_modifiers),
- (f32 VRegSrc_32:$src2),
- (i32 timm:$src2_modifiers),
- (i1 timm:$high),
- (i1 timm:$clamp),
- (i32 timm:$omod)))]>;
-} // End Uses = [M0, EXEC], FPDPRounding = 1
+ [(set f32:$vdst, (int_amdgcn_interp_p1_f16 (VOP3Mods f32:$src0, i32:$src0_modifiers),
+ (i32 timm:$attrchan),
+ (i32 timm:$attr),
+ (i1 timm:$high), M0))]> {
+ // This predicate should only apply to the selection pattern. The
+ // instruction still exists and should decode on subtargets with
+ // other bank counts.
+ let OtherPredicates = [has32BankLDS];
+}
+
+
+def V_INTERP_P1LV_F16 : VOP3Interp <"v_interp_p1lv_f16", VOP3_INTERP16<[f32, f32, i32, f16]>>;
+} // End Uses = [MODE, M0, EXEC], FPDPRounding = 1
} // End SubtargetPredicate = Has16BitInsts, isCommutable = 1
-let SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC] in {
+let SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC] in {
def V_INTERP_P1_F32_e64 : VOP3Interp <"v_interp_p1_f32", VOP3_INTERP>;
def V_INTERP_P2_F32_e64 : VOP3Interp <"v_interp_p2_f32", VOP3_INTERP>;
def V_INTERP_MOV_F32_e64 : VOP3Interp <"v_interp_mov_f32", VOP3_INTERP_MOV>;
-} // End SubtargetPredicate = isGFX8Plus, Uses = [M0, EXEC]
+} // End SubtargetPredicate = isGFX8Plus, Uses = [MODE, M0, EXEC]
let Predicates = [Has16BitInsts, isGFX6GFX7GFX8GFX9] in {
@@ -565,9 +601,20 @@ class ThreeOpFrag<SDPatternOperator op1, SDPatternOperator op2> : PatFrag<
}
return true;
- }]
-> {
+ }]> {
let PredicateCodeUsesOperands = 1;
+
+ // The divergence predicate is irrelevant in GlobalISel, as we have
+ // proper register bank checks. We also force all VOP instruction
+ // operands to VGPR, so we should not need to check the constant bus
+ // restriction.
+ //
+ // FIXME: With unlucky SGPR operands, we could penalize code by
+ // blocking folding SGPR->VGPR copies later.
+ // FIXME: There's no register bank verifier
+ // FIXME: Should add a way for the emitter to recognize this is a
+ // trivially true predicate to eliminate the check.
+ let GISelPredicateCode = [{return true;}];
}
let SubtargetPredicate = isGFX9Plus in {
@@ -602,14 +649,14 @@ def V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile<VOP_I32_I16_I16_I32,
def V_CVT_PKNORM_I16_F16 : VOP3Inst <"v_cvt_pknorm_i16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
def V_CVT_PKNORM_U16_F16 : VOP3Inst <"v_cvt_pknorm_u16_f16", VOP3_Profile<VOP_B32_F16_F16, VOP3_OPSEL>>;
-def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
-def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32>>;
+def V_ADD_I32_gfx9 : VOP3Inst <"v_add_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
+def V_SUB_I32_gfx9 : VOP3Inst <"v_sub_i32_gfx9", VOP3_Profile<VOP_I32_I32_I32_ARITH>>;
class ThreeOp_i32_Pats <SDPatternOperator op1, SDPatternOperator op2, Instruction inst> : GCNPat <
// This matches (op2 (op1 i32:$src0, i32:$src1), i32:$src2) with conditions.
(ThreeOpFrag<op1, op2> i32:$src0, i32:$src1, i32:$src2),
- (inst i32:$src0, i32:$src1, i32:$src2)
+ (inst VSrc_b32:$src0, VSrc_b32:$src1, VSrc_b32:$src2)
>;
def : ThreeOp_i32_Pats<shl, add, V_LSHL_ADD_U32>;
@@ -634,6 +681,40 @@ def VOP3_PERMLANE_Profile : VOP3_Profile<VOPProfile <[i32, i32, i32, i32]>, VOP3
let HasOMod = 0;
}
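+// Lower the permlane16/permlanex16 intrinsics: fi and bc become immediate
+// operands of the instruction and vdst_in is tied to the destination.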
+class PermlanePat<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc),
+ (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+ SCSrc_b32:$src1, 0, SCSrc_b32:$src2, VGPR_32:$vdst_in)
+>;
+
+// Permlane intrinsic that has either fetch invalid or bound control
+// fields enabled.
+class BoundControlOrFetchInvalidPermlane<SDPatternOperator permlane> :
+ PatFrag<(ops node:$vdst_in, node:$src0, node:$src1, node:$src2,
+ node:$fi, node:$bc),
+          (permlane node:$vdst_in, node:$src0, node:$src1, node:$src2,
+           node:$fi, node:$bc)> {
+ let PredicateCode = [{ return N->getConstantOperandVal(5) != 0 ||
+ N->getConstantOperandVal(6) != 0; }];
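+  // In the MachineInstr form the result register is an explicit operand, so
+  // $fi/$bc shift from SDNode operands 5/6 to MI operands 6/7.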
+ let GISelPredicateCode = [{
+ return MI.getOperand(6).getImm() != 0 ||
+ MI.getOperand(7).getImm() != 0;
+ }];
+}
+
+// Drop the input value if it won't be read.
+class PermlaneDiscardVDstIn<SDPatternOperator permlane,
+ Instruction inst> : GCNPat<
+ (permlane srcvalue, i32:$src0, i32:$src1, i32:$src2,
+ timm:$fi, timm:$bc),
+ (inst (as_i1timm $fi), VGPR_32:$src0, (as_i1timm $bc),
+ SCSrc_b32:$src1, 0, SCSrc_b32:$src2,
+ (IMPLICIT_DEF))
+>;
+
+
let SubtargetPredicate = isGFX10Plus in {
def V_XOR3_B32 : VOP3Inst <"v_xor3_b32", VOP3_Profile<VOP_I32_I32_I32_I32>>;
def : ThreeOp_i32_Pats<xor, xor, V_XOR3_B32>;
@@ -643,16 +724,35 @@ let SubtargetPredicate = isGFX10Plus in {
def V_PERMLANEX16_B32 : VOP3Inst <"v_permlanex16_b32", VOP3_PERMLANE_Profile>;
} // End $vdst = $vdst_in, DisableEncoding $vdst_in
- def : GCNPat<
- (int_amdgcn_permlane16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
- (V_PERMLANE16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
- >;
- def : GCNPat<
- (int_amdgcn_permlanex16 i32:$vdst_in, i32:$src0, i32:$src1, i32:$src2, timm:$fi, timm:$bc),
- (V_PERMLANEX16_B32 (as_i1imm $fi), $src0, (as_i1imm $bc), $src1, 0, $src2, $vdst_in)
- >;
+ def : PermlanePat<int_amdgcn_permlane16, V_PERMLANE16_B32>;
+ def : PermlanePat<int_amdgcn_permlanex16, V_PERMLANEX16_B32>;
+
+ def : PermlaneDiscardVDstIn<
+ BoundControlOrFetchInvalidPermlane<int_amdgcn_permlane16>,
+ V_PERMLANE16_B32>;
+ def : PermlaneDiscardVDstIn<
+ BoundControlOrFetchInvalidPermlane<int_amdgcn_permlanex16>,
+ V_PERMLANEX16_B32>;
} // End SubtargetPredicate = isGFX10Plus
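+// Select div_fmas with the implicit condition input taken from VCC on wave64
+// and from VCC_LO on wave32.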
+class DivFmasPat<ValueType vt, Instruction inst, Register CondReg> : GCNPat<
+ (AMDGPUdiv_fmas (vt (VOP3Mods vt:$src0, i32:$src0_modifiers)),
+ (vt (VOP3Mods vt:$src1, i32:$src1_modifiers)),
+ (vt (VOP3Mods vt:$src2, i32:$src2_modifiers)),
+ (i1 CondReg)),
+ (inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2)
+>;
+
+let WaveSizePredicate = isWave64 in {
+def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC>;
+def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC>;
+}
+
+let WaveSizePredicate = isWave32 in {
+def : DivFmasPat<f32, V_DIV_FMAS_F32, VCC_LO>;
+def : DivFmasPat<f64, V_DIV_FMAS_F64, VCC_LO>;
+}
+
//===----------------------------------------------------------------------===//
// Integer Clamp Patterns
//===----------------------------------------------------------------------===//
@@ -745,9 +845,9 @@ let AssemblerPredicate = isGFX10Plus, DecoderNamespace = "GFX10" in {
defm V_READLANE_B32 : VOP3_Real_gfx10<0x360>;
-let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in) in {
+let InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in) in {
defm V_WRITELANE_B32 : VOP3_Real_gfx10<0x361>;
-} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VSrc_b32:$vdst_in)
+} // End InOperandList = (ins SSrcOrLds_b32:$src0, SCSrc_b32:$src1, VGPR_32:$vdst_in)
defm V_XOR3_B32 : VOP3_Real_gfx10<0x178>;
defm V_LSHLREV_B64 : VOP3_Real_gfx10<0x2ff>;
@@ -925,6 +1025,10 @@ defm V_TRIG_PREOP_F64 : VOP3_Real_gfx6_gfx7_gfx10<0x174>;
defm V_DIV_SCALE_F32 : VOP3be_Real_gfx6_gfx7_gfx10<0x16d>;
defm V_DIV_SCALE_F64 : VOP3be_Real_gfx6_gfx7_gfx10<0x16e>;
+// NB: Same opcode as v_mad_legacy_f32
+let DecoderNamespace = "GFX10_B" in
+defm V_FMA_LEGACY_F32 : VOP3_Real_gfx10<0x140>;
+
//===----------------------------------------------------------------------===//
// GFX8, GFX9 (VI).
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 933acc2278fd..fc457ad212d4 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -1,4 +1,4 @@
-//===-- VOP3PInstructions.td - Vector Instruction Defintions --------------===//
+//===-- VOP3PInstructions.td - Vector Instruction Definitions -------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -10,9 +10,11 @@
// VOP3P Classes
//===----------------------------------------------------------------------===//
-class VOP3PInst<string OpName, VOPProfile P, SDPatternOperator node = null_frag> :
+class VOP3PInst<string OpName, VOPProfile P,
+ SDPatternOperator node = null_frag,
+ bit HasExplicitClamp = 0> :
VOP3P_Pseudo<OpName, P,
- !if(P.HasModifiers, getVOP3PModPat<P, node>.ret, getVOP3Pat<P, node>.ret)
+ !if(P.HasModifiers, getVOP3PModPat<P, node, HasExplicitClamp>.ret, getVOP3Pat<P, node>.ret)
>;
// Non-packed instructions that use the VOP3P encoding.
@@ -29,9 +31,14 @@ class VOP3_VOP3PInst<string OpName, VOPProfile P, bit UseTiedOutput = 0,
!con(
(ins FP16InputMods:$src0_modifiers, VCSrc_f16:$src0,
FP16InputMods:$src1_modifiers, VCSrc_f16:$src1,
- FP16InputMods:$src2_modifiers, VCSrc_f16:$src2,
- clampmod:$clamp),
- !if(UseTiedOutput, (ins VGPR_32:$vdst_in), (ins))),
+ FP16InputMods:$src2_modifiers, VCSrc_f16:$src2),
+ // FIXME: clampmod0 misbehaves with the non-default vdst_in
+ // following it. For now workaround this by requiring clamp
+ // in tied patterns. This should use undef_tied_input, but it
+ // seems underdeveloped and doesn't apply the right register
+ // class constraints.
+ !if(UseTiedOutput, (ins clampmod:$clamp, VGPR_32:$vdst_in),
+ (ins clampmod0:$clamp))),
(ins op_sel:$op_sel, op_sel_hi:$op_sel_hi));
let Constraints = !if(UseTiedOutput, "$vdst = $vdst_in", "");
@@ -45,9 +52,9 @@ def V_PK_MAD_I16 : VOP3PInst<"v_pk_mad_i16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_
def V_PK_MAD_U16 : VOP3PInst<"v_pk_mad_u16", VOP3_Profile<VOP_V2I16_V2I16_V2I16_V2I16>>;
let FPDPRounding = 1 in {
-def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, fma>;
-def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fadd>;
-def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmul>;
+def V_PK_FMA_F16 : VOP3PInst<"v_pk_fma_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16_V2F16>, any_fma>;
+def V_PK_ADD_F16 : VOP3PInst<"v_pk_add_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fadd>;
+def V_PK_MUL_F16 : VOP3PInst<"v_pk_mul_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, any_fmul>;
} // End FPDPRounding = 1
def V_PK_MAX_F16 : VOP3PInst<"v_pk_max_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fmaxnum_like>;
def V_PK_MIN_F16 : VOP3PInst<"v_pk_min_f16", VOP3_Profile<VOP_V2F16_V2F16_V2F16>, fminnum_like>;
@@ -75,8 +82,8 @@ def V_PK_LSHRREV_B16 : VOP3PInst<"v_pk_lshrrev_b16", VOP3_Profile<VOP_V2I16_V2I1
// The constant will be emitted as a mov, and folded later.
// TODO: We could directly encode the immediate now
def : GCNPat<
- (add (v2i16 (VOP3PMods0 v2i16:$src0, i32:$src0_modifiers, i1:$clamp)), NegSubInlineConstV216:$src1),
- (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1, $clamp)
+ (add (v2i16 (VOP3PMods v2i16:$src0, i32:$src0_modifiers)), NegSubInlineConstV216:$src1),
+ (V_PK_SUB_U16 $src0_modifiers, $src0, SRCMODS.OP_SEL_1, NegSubInlineConstV216:$src1)
>;
multiclass MadFmaMixPats<SDPatternOperator fma_like,
@@ -142,10 +149,11 @@ multiclass MadFmaMixPats<SDPatternOperator fma_like,
}
let SubtargetPredicate = HasMadMixInsts in {
+
// These are VOP3a-like opcodes which accept no omod.
// Size of src arguments (16/32) is controlled by op_sel.
// For 16-bit src arguments their location (hi/lo) are controlled by op_sel_hi.
-let isCommutable = 1 in {
+let isCommutable = 1, mayRaiseFPException = 0 in {
def V_MAD_MIX_F32 : VOP3_VOP3PInst<"v_mad_mix_f32", VOP3_Profile<VOP_F32_F16_F16_F16, VOP3_OPSEL>>;
let FPDPRounding = 1 in {
@@ -203,7 +211,7 @@ foreach Type = ["I", "U"] in
foreach Index = 0-3 in {
// Defines patterns that extract each Index'ed 8bit from an unsigned
// 32bit scalar value;
- def #Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
+ def Type#Index#"_8bit" : Extract<!shl(Index, 3), 255, !if (!eq (Type, "U"), 1, 0)>;
// Defines multiplication patterns where the multiplication is happening on each
// Index'ed 8bit of a 32bit scalar value.
@@ -211,8 +219,8 @@ foreach Type = ["I", "U"] in
def Mul#Type#_Elt#Index : PatFrag<
(ops node:$src0, node:$src1),
(!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), AMDGPUmul_i24_oneuse, AMDGPUmul_u24_oneuse))
- (!cast<Extract>(#Type#Index#"_8bit") node:$src0),
- (!cast<Extract>(#Type#Index#"_8bit") node:$src1))>;
+ (!cast<Extract>(Type#Index#"_8bit") node:$src0),
+ (!cast<Extract>(Type#Index#"_8bit") node:$src1))>;
}
// Different variants of dot8 patterns cause a huge increase in the compile time.
@@ -231,15 +239,15 @@ foreach Type = ["I", "U"] in
foreach Index = 0-7 in {
// Defines patterns that extract each Index'ed 4bit from an unsigned
// 32bit scalar value;
- def #Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
+ def Type#Index#"_4bit" : Extract<!shl(Index, 2), 15, !if (!eq (Type, "U"), 1, 0)>;
// Defines multiplication patterns where the multiplication is happening on each
// Index'ed 8bit of a 32bit scalar value.
def Mul#Type#Index#"_4bit" : PatFrag<
(ops node:$src0, node:$src1),
(!cast<HasOneUseBinOp>(!if (!eq (Type, "I"), NonACAMDGPUmul_i24_oneuse, NonACAMDGPUmul_u24_oneuse))
- (!cast<Extract>(#Type#Index#"_4bit") node:$src0),
- (!cast<Extract>(#Type#Index#"_4bit") node:$src1))>;
+ (!cast<Extract>(Type#Index#"_4bit") node:$src0),
+ (!cast<Extract>(Type#Index#"_4bit") node:$src1))>;
}
class UDot2Pat<Instruction Inst> : GCNPat <
@@ -264,40 +272,30 @@ class SDot2Pat<Instruction Inst> : GCNPat <
let IsDOT = 1 in {
let SubtargetPredicate = HasDot2Insts in {
-def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16", VOP3_Profile<VOP_F32_V2F16_V2F16_F32>>;
-def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16", VOP3_Profile<VOP_I32_V2I16_V2I16_I32>>;
-def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT2_F32_F16 : VOP3PInst<"v_dot2_f32_f16",
+ VOP3_Profile<VOP_F32_V2F16_V2F16_F32>,
+ AMDGPUfdot2, 1/*ExplicitClamp*/>;
+def V_DOT2_I32_I16 : VOP3PInst<"v_dot2_i32_i16",
+ VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_sdot2, 1>;
+def V_DOT2_U32_U16 : VOP3PInst<"v_dot2_u32_u16",
+ VOP3_Profile<VOP_I32_V2I16_V2I16_I32>, int_amdgcn_udot2, 1>;
+def V_DOT4_U32_U8 : VOP3PInst<"v_dot4_u32_u8",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot4, 1>;
+def V_DOT8_U32_U4 : VOP3PInst<"v_dot8_u32_u4",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_udot8, 1>;
} // End SubtargetPredicate = HasDot2Insts
let SubtargetPredicate = HasDot1Insts in {
-def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
-def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4", VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>>;
+def V_DOT4_I32_I8 : VOP3PInst<"v_dot4_i32_i8",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot4, 1>;
+def V_DOT8_I32_I4 : VOP3PInst<"v_dot8_i32_i4",
+ VOP3_Profile<VOP_I32_I32_I32_I32, VOP3_PACKED>, int_amdgcn_sdot8, 1>;
} // End SubtargetPredicate = HasDot1Insts
} // End let IsDOT = 1
-multiclass DotPats<SDPatternOperator dot_op,
- VOP3PInst dot_inst> {
- let SubtargetPredicate = dot_inst.SubtargetPredicate in
- def : GCNPat <
- (dot_op (dot_inst.Pfl.Src0VT (VOP3PMods0 dot_inst.Pfl.Src0VT:$src0, i32:$src0_modifiers)),
- (dot_inst.Pfl.Src1VT (VOP3PMods dot_inst.Pfl.Src1VT:$src1, i32:$src1_modifiers)),
- (dot_inst.Pfl.Src2VT (VOP3PMods dot_inst.Pfl.Src2VT:$src2, i32:$src2_modifiers)), i1:$clamp),
- (dot_inst $src0_modifiers, $src0, $src1_modifiers, $src1, $src2_modifiers, $src2, (as_i1imm $clamp))>;
-}
-
-defm : DotPats<AMDGPUfdot2, V_DOT2_F32_F16>;
-defm : DotPats<int_amdgcn_sdot2, V_DOT2_I32_I16>;
-defm : DotPats<int_amdgcn_udot2, V_DOT2_U32_U16>;
-defm : DotPats<int_amdgcn_sdot4, V_DOT4_I32_I8>;
-defm : DotPats<int_amdgcn_udot4, V_DOT4_U32_U8>;
-defm : DotPats<int_amdgcn_sdot8, V_DOT8_I32_I4>;
-defm : DotPats<int_amdgcn_udot8, V_DOT8_U32_U4>;
-
def : UDot2Pat<V_DOT2_U32_U16>;
def : SDot2Pat<V_DOT2_I32_I16>;
@@ -368,12 +366,16 @@ def VOPProfileMAI_F32_V4F16_X16 : VOPProfileMAI<VOP_V16F32_V4F16_V4F16_V16F32, A
def VOPProfileMAI_F32_V4F16_X32 : VOPProfileMAI<VOP_V32F32_V4F16_V4F16_V32F32, AISrc_1024_b32, ADst_1024, AVSrc_64>;
let Predicates = [HasMAIInsts] in {
+
+let isAsCheapAsAMove = 1, isReMaterializable = 1 in {
def V_ACCVGPR_READ_B32 : VOP3Inst<"v_accvgpr_read_b32", VOPProfileAccRead>;
def V_ACCVGPR_WRITE_B32 : VOP3Inst<"v_accvgpr_write_b32", VOPProfileAccWrite> {
let isMoveImm = 1;
}
+}
-let isConvergent = 1 in {
+// FP32 denorm mode is respected, rounding mode is not. Exceptions are not supported.
+let isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1 in {
def V_MFMA_F32_4X4X1F32 : VOP3Inst<"v_mfma_f32_4x4x1f32", VOPProfileMAI_F32_F32_X4, int_amdgcn_mfma_f32_4x4x1f32>;
def V_MFMA_F32_4X4X4F16 : VOP3Inst<"v_mfma_f32_4x4x4f16", VOPProfileMAI_F32_V4F16_X4, int_amdgcn_mfma_f32_4x4x4f16>;
def V_MFMA_I32_4X4X4I8 : VOP3Inst<"v_mfma_i32_4x4x4i8", VOPProfileMAI_I32_I32_X4, int_amdgcn_mfma_i32_4x4x4i8>;
@@ -394,7 +396,7 @@ def V_MFMA_I32_32X32X4I8 : VOP3Inst<"v_mfma_i32_32x32x4i8", VOPProfileMAI_I3
def V_MFMA_I32_32X32X8I8 : VOP3Inst<"v_mfma_i32_32x32x8i8", VOPProfileMAI_I32_I32_X16, int_amdgcn_mfma_i32_32x32x8i8>;
def V_MFMA_F32_32X32X2BF16 : VOP3Inst<"v_mfma_f32_32x32x2bf16", VOPProfileMAI_F32_V2I16_X32, int_amdgcn_mfma_f32_32x32x2bf16>;
def V_MFMA_F32_32X32X4BF16 : VOP3Inst<"v_mfma_f32_32x32x4bf16", VOPProfileMAI_F32_V2I16_X16, int_amdgcn_mfma_f32_32x32x4bf16>;
-} // End isConvergent = 1
+} // End isConvergent = 1, mayRaiseFPException = 0, ReadsModeReg = 1
} // End SubtargetPredicate = HasMAIInsts
diff --git a/llvm/lib/Target/AMDGPU/VOPCInstructions.td b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
index 39d18794f947..aa2fa260e7b5 100644
--- a/llvm/lib/Target/AMDGPU/VOPCInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPCInstructions.td
@@ -1,4 +1,4 @@
-//===-- VOPCInstructions.td - Vector Instruction Defintions ---------------===//
+//===-- VOPCInstructions.td - Vector Instruction Definitions --------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -92,9 +92,11 @@ class VOPC_Pseudo <string opName, VOPC_Profile P, list<dag> pattern=[],
let mayStore = 0;
let hasSideEffects = 0;
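+  // Floating-point compares depend on the dynamic FP mode (e.g. denormal
+  // handling), so model them as reading MODE; integer compares only use EXEC.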
+ let ReadsModeReg = isFloatType<P.Src0VT>.ret;
+
let VALU = 1;
let VOPC = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let Defs = !if(DefVcc, [VCC], []);
VOPProfile Pfl = P;
@@ -738,6 +740,9 @@ multiclass VOPC_CLASS_F64 <string opName> :
multiclass VOPCX_CLASS_F64 <string opName> :
VOPCX_Class_Pseudos <opName, VOPC_I1_F64_I32, VOPC_F64_I32>;
+// cmp_class ignores the FP mode and faithfully reports the unmodified
+// source value.
+let ReadsModeReg = 0, mayRaiseFPException = 0 in {
defm V_CMP_CLASS_F32 : VOPC_CLASS_F32 <"v_cmp_class_f32">;
defm V_CMPX_CLASS_F32 : VOPCX_CLASS_F32 <"v_cmpx_class_f32">;
defm V_CMP_CLASS_F64 : VOPC_CLASS_F64 <"v_cmp_class_f64">;
@@ -747,6 +752,7 @@ let SubtargetPredicate = Has16BitInsts in {
defm V_CMP_CLASS_F16 : VOPC_CLASS_F16 <"v_cmp_class_f16">;
defm V_CMPX_CLASS_F16 : VOPCX_CLASS_F16 <"v_cmpx_class_f16">;
}
+} // End ReadsModeReg = 0, mayRaiseFPException = 0
//===----------------------------------------------------------------------===//
// V_ICMPIntrinsic Pattern.
diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td
index f208a1134a5a..f8a83e5f74c0 100644
--- a/llvm/lib/Target/AMDGPU/VOPInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td
@@ -1,4 +1,4 @@
-//===-- VOPInstructions.td - Vector Instruction Defintions ----------------===//
+//===-- VOPInstructions.td - Vector Instruction Definitions ---------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -8,6 +8,8 @@
// dummies for outer let
class LetDummies {
+ bit ReadsModeReg;
+ bit mayRaiseFPException;
bit isCommutable;
bit isConvertibleToThreeAddress;
bit isMoveImm;
@@ -35,7 +37,7 @@ class VOPAnyCommon <dag outs, dag ins, string asm, list<dag> pattern> :
let hasSideEffects = 0;
let UseNamedOperandTable = 1;
let VALU = 1;
- let Uses = [EXEC];
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
}
class VOP_Pseudo <string opName, string suffix, VOPProfile P, dag outs, dag ins,
@@ -118,7 +120,10 @@ class VOP3_Pseudo <string opName, VOPProfile P, list<dag> pattern = [],
let ClampLo = P.HasClampLo;
let ClampHi = P.HasClampHi;
- let Uses = [EXEC];
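+  // Conservatively treat any VOP3 with an FP result or FP source as reading
+  // the FP mode register and as potentially raising FP exceptions.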
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let AsmVariantName = AMDGPUAsmVariants.VOP3;
let AsmMatchConverter =
@@ -160,7 +165,7 @@ class VOP3_Real <VOP_Pseudo ps, int EncodingFamily> :
VOPProfile Pfl = ps.Pfl;
}
-// XXX - Is there any reason to distingusih this from regular VOP3
+// XXX - Is there any reason to distinguish this from regular VOP3
// here?
class VOP3P_Real<VOP_Pseudo ps, int EncodingFamily> :
VOP3_Real<ps, EncodingFamily>;
@@ -490,10 +495,14 @@ class VOP_SDWA_Pseudo <string opName, VOPProfile P, list<dag> pattern=[]> :
let VALU = 1;
let SDWA = 1;
- let Uses = [EXEC];
- let SubtargetPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
- let AssemblerPredicate = !if(P.HasExtSDWA, HasSDWA, DisableInst);
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
+
+ let SubtargetPredicate = HasSDWA;
+ let AssemblerPredicate = HasSDWA;
let AsmVariantName = !if(P.HasExtSDWA, AMDGPUAsmVariants.SDWA,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA";
@@ -542,8 +551,8 @@ class Base_VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
let Constraints = ps.Constraints;
let DisableEncoding = ps.DisableEncoding;
- let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
- let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA9, DisableInst);
+ let SubtargetPredicate = HasSDWA9;
+ let AssemblerPredicate = HasSDWA9;
let AsmVariantName = !if(ps.Pfl.HasExtSDWA9, AMDGPUAsmVariants.SDWA9,
AMDGPUAsmVariants.Disable);
let DecoderNamespace = "SDWA9";
@@ -561,8 +570,8 @@ class VOP_SDWA9_Real <VOP_SDWA_Pseudo ps> :
SIMCInstr <ps.PseudoInstr, SIEncodingFamily.SDWA9>;
class Base_VOP_SDWA10_Real<VOP_SDWA_Pseudo ps> : Base_VOP_SDWA9_Real<ps> {
- let SubtargetPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
- let AssemblerPredicate = !if(ps.Pfl.HasExtSDWA9, HasSDWA10, DisableInst);
+ let SubtargetPredicate = HasSDWA10;
+ let AssemblerPredicate = HasSDWA10;
let DecoderNamespace = "SDWA10";
}
@@ -607,7 +616,11 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let VALU = 1;
let DPP = 1;
let Size = 8;
- let Uses = [EXEC];
+
+ let ReadsModeReg = !or(isFloatType<P.DstVT>.ret, isFloatType<P.Src0VT>.ret);
+
+ let mayRaiseFPException = ReadsModeReg;
+ let Uses = !if(ReadsModeReg, [MODE, EXEC], [EXEC]);
let isConvergent = 1;
string Mnemonic = OpName;
@@ -615,7 +628,7 @@ class VOP_DPP_Pseudo <string OpName, VOPProfile P, list<dag> pattern=[]> :
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AssemblerPredicate = HasDPP;
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -670,7 +683,7 @@ class VOP_DPP <string OpName, VOPProfile P, bit IsDPP16,
let AsmMatchConverter = !if(!eq(P.HasModifiers,1), "cvtDPP", "");
let SubtargetPredicate = HasDPP;
- let AssemblerPredicate = !if(P.HasExtDPP, HasDPP, DisableInst);
+ let AssemblerPredicate = HasDPP;
let AsmVariantName = !if(P.HasExtDPP, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
@@ -702,7 +715,7 @@ class VOP_DPP8<string OpName, VOPProfile P> :
let AsmMatchConverter = "cvtDPP8";
let SubtargetPredicate = HasDPP8;
- let AssemblerPredicate = !if(P.HasExt, HasDPP8, DisableInst);
+ let AssemblerPredicate = HasDPP8;
let AsmVariantName = !if(P.HasExt, AMDGPUAsmVariants.DPP,
AMDGPUAsmVariants.Disable);
let Constraints = !if(P.NumSrcArgs, P.TieRegDPP # " = $vdst", "");
diff --git a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
index 7915ca003316..025b920ff7b4 100644
--- a/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
+++ b/llvm/lib/Target/ARC/ARCAsmPrinter.cpp
@@ -41,12 +41,14 @@ public:
MCInstLowering(&OutContext, *this) {}
StringRef getPassName() const override { return "ARC Assembly Printer"; }
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
};
} // end anonymous namespace
-void ARCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void ARCAsmPrinter::emitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream O(Str);
@@ -61,6 +63,12 @@ void ARCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, TmpInst);
}
+bool ARCAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ // Functions are 4-byte aligned.
+ MF.ensureAlignment(Align(4));
+ return AsmPrinter::runOnMachineFunction(MF);
+}
+
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARCAsmPrinter() {
RegisterAsmPrinter<ARCAsmPrinter> X(getTheARCTarget());
diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/llvm/lib/Target/ARC/ARCFrameLowering.cpp
index d8946d97deff..ead593106262 100644
--- a/llvm/lib/Target/ARC/ARCFrameLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCFrameLowering.cpp
@@ -74,8 +74,7 @@ static void generateStackAdjustment(MachineBasicBlock &MBB,
.addImm(AbsAmount);
}
-static unsigned
-determineLastCalleeSave(const std::vector<CalleeSavedInfo> &CSI) {
+static unsigned determineLastCalleeSave(ArrayRef<CalleeSavedInfo> CSI) {
unsigned Last = 0;
for (auto Reg : CSI) {
assert(Reg.getReg() >= ARC::R13 && Reg.getReg() <= ARC::R25 &&
@@ -197,7 +196,7 @@ void ARCFrameLowering::emitPrologue(MachineFunction &MF,
// .cfi_offset fp, -StackSize
// .cfi_offset blink, -StackSize+4
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, dl, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -401,8 +400,7 @@ bool ARCFrameLowering::assignCalleeSavedSpillSlots(
bool ARCFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
LLVM_DEBUG(dbgs() << "Spill callee saved registers: "
<< MBB.getParent()->getName() << "\n");
// There are routines for saving at least 3 registers (r13 to r15, etc.)
@@ -419,7 +417,7 @@ bool ARCFrameLowering::spillCalleeSavedRegisters(
bool ARCFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI, const TargetRegisterInfo *TRI) const {
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
LLVM_DEBUG(dbgs() << "Restore callee saved registers: "
<< MBB.getParent()->getName() << "\n");
// There are routines for saving at least 3 registers (r13 to r15, etc.)
@@ -441,8 +439,8 @@ void ARCFrameLowering::processFunctionBeforeFrameFinalized(
LLVM_DEBUG(dbgs() << "Current stack size: " << MFI.getStackSize() << "\n");
const TargetRegisterClass *RC = &ARC::GPR32RegClass;
if (MFI.hasStackObjects()) {
- int RegScavFI = MFI.CreateStackObject(
- RegInfo->getSpillSize(*RC), RegInfo->getSpillAlignment(*RC), false);
+ int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC),
+ RegInfo->getSpillAlign(*RC), false);
RS->addScavengingFrameIndex(RegScavFI);
LLVM_DEBUG(dbgs() << "Created scavenging index RegScavFI=" << RegScavFI
<< "\n");
diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.h b/llvm/lib/Target/ARC/ARCFrameLowering.h
index 9242400fb28d..9951a09842c5 100644
--- a/llvm/lib/Target/ARC/ARCFrameLowering.h
+++ b/llvm/lib/Target/ARC/ARCFrameLowering.h
@@ -42,13 +42,13 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
bool
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
diff --git a/llvm/lib/Target/ARC/ARCISelLowering.cpp b/llvm/lib/Target/ARC/ARCISelLowering.cpp
index 8df2b5d2b6a7..4a6510f10eeb 100644
--- a/llvm/lib/Target/ARC/ARCISelLowering.cpp
+++ b/llvm/lib/Target/ARC/ARCISelLowering.cpp
@@ -245,7 +245,7 @@ SDValue ARCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Analyze return values to determine the number of bytes of stack required.
CCState RetCCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
+ RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), Align(4));
RetCCInfo.AnalyzeCallResult(Ins, RetCC_ARC);
// Get a count of how many bytes are to be pushed on the stack.
@@ -563,14 +563,16 @@ SDValue ARCTargetLowering::LowerCallArguments(
for (const auto &ArgDI : ArgData) {
if (ArgDI.Flags.isByVal() && ArgDI.Flags.getByValSize()) {
unsigned Size = ArgDI.Flags.getByValSize();
- unsigned Align = std::max(StackSlotSize, ArgDI.Flags.getByValAlign());
+ Align Alignment =
+ std::max(Align(StackSlotSize), ArgDI.Flags.getNonZeroByValAlign());
// Create a new object on the stack and copy the pointee into it.
- int FI = MFI.CreateStackObject(Size, Align, false);
+ int FI = MFI.CreateStackObject(Size, Alignment, false);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
InVals.push_back(FIN);
MemOps.push_back(DAG.getMemcpy(
- Chain, dl, FIN, ArgDI.SDV, DAG.getConstant(Size, dl, MVT::i32), Align,
- false, false, false, MachinePointerInfo(), MachinePointerInfo()));
+ Chain, dl, FIN, ArgDI.SDV, DAG.getConstant(Size, dl, MVT::i32),
+ Alignment, false, false, false, MachinePointerInfo(),
+ MachinePointerInfo()));
} else {
InVals.push_back(ArgDI.SDV);
}
@@ -620,7 +622,7 @@ ARCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Analyze return values.
if (!IsVarArg)
- CCInfo.AllocateStack(AFI->getReturnStackOffset(), 4);
+ CCInfo.AllocateStack(AFI->getReturnStackOffset(), Align(4));
CCInfo.AnalyzeReturn(Outs, RetCC_ARC);
diff --git a/llvm/lib/Target/ARC/ARCInstrFormats.td b/llvm/lib/Target/ARC/ARCInstrFormats.td
index e4902a73ed49..584844d49553 100644
--- a/llvm/lib/Target/ARC/ARCInstrFormats.td
+++ b/llvm/lib/Target/ARC/ARCInstrFormats.td
@@ -127,16 +127,16 @@ class PseudoInstARC<dag outs, dag ins, string asmstr, list<dag> pattern>
//===----------------------------------------------------------------------===//
// All 32-bit ARC instructions have a 5-bit "major" opcode class designator
-// in bits 27-31.
-//
+// in bits 27-31.
+//
// Some general naming conventions:
// N - Delay Slot bit. ARC v2 branch instructions have an optional delay slot
// which is encoded with this bit. When set, a delay slot exists.
// cc - Condition code.
// SX - Signed X-bit immediate.
// UX - Unsigned X-bit immediate.
-//
-// [ABC] - 32-bit register operand. These are 6-bit fields. This encodes the
+//
+// [ABC] - 32-bit register operand. These are 6-bit fields. This encodes the
// standard 32 general purpose registers, and allows use of additional
// (extension) registers. This also encodes an instruction that uses
// a 32-bit Long Immediate (LImm), using 0x3e==62 as the field value.
@@ -166,7 +166,7 @@ class F32_BR_COND<bits<5> major, dag outs, dag ins, bit b16, string asmstr,
list<dag> pattern> :
F32_BR<major, outs, ins, b16, asmstr, pattern> {
bits<21> S21; // 2-byte aligned 21-bit byte-offset.
- bits<5> cc;
+ bits<5> cc;
let Inst{26-18} = S21{10-2};
let Inst{15-6} = S21{20-11};
let Inst{4-0} = cc;
@@ -328,7 +328,7 @@ class F32_DOP_RU6<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
}
// 2-register, signed 12-bit immediate Dual Operand instruction.
-// This instruction uses B as the first 2 operands (i.e., add B, B, -128).
+// This instruction uses B as the first 2 operands (i.e., add B, B, -128).
// |26|25|24|23|22|21|20|19|18|17|16|15|14|13|12|11|10|9|8|7|6|5|4|3|2|1|0|
// |B[2-0] | 1| 0| subop| F|B[5-3] |S12[5-0] |S12[11-6] |
class F32_DOP_RS12<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
@@ -336,7 +336,7 @@ class F32_DOP_RS12<bits<5> major, bits<6> subop, bit F, dag outs, dag ins,
InstARC<4, outs, ins, asmstr, pattern> {
bits<6> B;
bits<12> S12;
-
+
let Inst{31-27} = major;
let Inst{26-24} = B{2-0};
let Inst{23-22} = 0b10;
@@ -547,14 +547,14 @@ class F16_COMPACT<bits<1> i, dag outs, dag ins,
let Inst{15-11} = 0b01000;
let Inst{7-5} = h{2-0};
let Inst{2} = i;
- let Inst{1-0} = h{4-3};
+ let Inst{1-0} = h{4-3};
}
// Compact Load/Add/Sub.
class F16_LD_ADD_SUB<dag outs, dag ins, string asmstr> :
InstARC<2, outs, ins, asmstr, []> {
- bits<3> b;
+ bits<3> b;
let Inst{15-11} = 0b01001;
let Inst{10-8} = b;
}
@@ -575,10 +575,10 @@ class F16_LD_SUB<bit i, string asmstr> :
class F16_ADD :
F16_LD_ADD_SUB<(outs GPR32:$r), (ins GPR32:$b, immU<6>:$u6),
"add_s\t$r, $b, $u6"> {
-
+
bit r;
bits<6> u6;
-
+
let Inst{7} = r;
let Inst{6-4} = u6{5-3};
let Inst{3} = 1;
@@ -610,7 +610,7 @@ class F16_LDI_u7 :
bits<3> b;
bits<7> u7;
-
+
let Inst{10-8} = b;
let Inst{7-4} = u7{6-3};
let Inst{3} = 1;
@@ -623,7 +623,7 @@ class F16_JLI_EI<bit i, string asmstr> :
!strconcat(asmstr, "\t$u10"), []> {
bits<10> u10;
-
+
let Inst{15-11} = 0b01011;
let Inst{10} = i;
let Inst{9-0} = u10;
@@ -635,9 +635,9 @@ class F16_LD_ADD_RR<bits<2> i, string asmstr> :
asmstr, []> {
bits<3> a;
- bits<3> b;
+ bits<3> b;
bits<3> c;
-
+
let Inst{15-11} = 0b01100;
let Inst{10-8} = b;
let Inst{7-5} = c;
@@ -648,7 +648,7 @@ class F16_LD_ADD_RR<bits<2> i, string asmstr> :
// Load/Add GP-Relative.
class F16_GP_LD_ADD<bits<2> i, dag ins, string asmstr> :
InstARC<2, (outs), ins, asmstr, []> {
-
+
let Inst{15-11} = 0b11001;
let Inst{10-9} = i;
}
@@ -663,7 +663,7 @@ class F16_ADD_IMM<bits<2> i, string asmstr> :
bits<3> b;
bits<3> c;
bits<3> u3;
-
+
let Inst{15-11} = 0b01101;
let Inst{10-8} = b;
let Inst{7-5} = c;
@@ -689,8 +689,8 @@ class F16_OP_HREG<bits<3> i, dag outs, dag ins, string asmstr> :
class F16_OP_HREG30<bits<3> i, dag outs, dag ins, string asmstr> :
F16_OP_HREG<i, outs, ins, asmstr> {
-
- bits<5> LImmReg = 0b11110;
+
+ bits<5> LImmReg = 0b11110;
let Inst{7-5} = LImmReg{2-0};
let Inst{1-0} = LImmReg{4-3};
}
@@ -784,7 +784,7 @@ class F16_SH_SUB_BIT<bits<3> i, string asmstr> :
bits<3> b;
bits<5> u5;
-
+
let Inst{15-11} = 0b10111;
let Inst{10-8} = b;
let Inst{7-5} = i;
@@ -816,7 +816,7 @@ class F16_SP_OPS_u7_aligned<bits<3> i,
bits<3> b3;
bits<7> u7;
-
+
let fieldB = b3;
let fieldU = u7{6-2};
let u7{1-0} = 0b00;
@@ -826,7 +826,7 @@ class F16_SP_OPS_bconst<bits<3> b, string asmop> :
F16_SP_OPS_u7_aligned<0b101,
(outs), (ins immU<7>:$u7),
!strconcat(asmop, "\t%sp, %sp, $u7")> {
-
+
let fieldB = b;
}
@@ -834,14 +834,14 @@ class F16_SP_OPS_uconst<bits<3> i,
dag outs, dag ins, string asmop> :
F16_SP_OPS_u7_aligned<i, outs, ins,
!strconcat(asmop, "\t$b3")> {
-
+
let fieldU = 0b00001;
}
class F16_SP_OPS_buconst<bits<3> i, string asmop> :
F16_SP_OPS_u7_aligned<i, (outs), (ins),
!strconcat(asmop, "\t%blink")> {
-
+
let fieldB = 0x000;
let fieldU = 0b10001;
}
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.cpp b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
index 2bf2c1f6bbc5..527f239c2643 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.cpp
@@ -161,7 +161,7 @@ static bool isJumpOpcode(int Opc) { return Opc == ARC::J; }
/// condition. These operands can be passed to other TargetInstrInfo
/// methods to create new branches.
///
-/// Note that RemoveBranch and InsertBranch must be implemented to support
+/// Note that RemoveBranch and insertBranch must be implemented to support
/// cases where this method returns success.
///
/// If AllowModify is true, then this routine is allowed to modify the basic
@@ -292,18 +292,18 @@ void ARCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void ARCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc dl = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FrameIndex);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIndex),
- MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex), Align);
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
assert(MMO && "Couldn't get MachineMemOperand for store to stack.");
assert(TRI->getSpillSize(*RC) == 4 &&
@@ -321,16 +321,16 @@ void ARCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void ARCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc dl = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FrameIndex);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIndex),
- MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex), Align);
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
+ MFI.getObjectAlign(FrameIndex));
assert(MMO && "Couldn't get MachineMemOperand for store to stack.");
assert(TRI->getSpillSize(*RC) == 4 &&
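In the two spill/reload hunks above, the temporary "unsigned Align = MFI.getObjectAlignment(FrameIndex)" disappears because getMachineMemOperand now accepts the typed Align returned by getObjectAlign. A minimal standalone sketch of the new pattern (the helper name is made up for illustration):

    #include "llvm/CodeGen/MachineFrameInfo.h"
    #include "llvm/CodeGen/MachineFunction.h"
    #include "llvm/CodeGen/MachineMemOperand.h"
    using namespace llvm;

    // Hypothetical helper: build the memory operand for a spill to FrameIndex,
    // forwarding the frame object's Align rather than a raw byte count.
    static MachineMemOperand *buildSpillMMO(MachineFunction &MF, int FrameIndex) {
      MachineFrameInfo &MFI = MF.getFrameInfo();
      return MF.getMachineMemOperand(
          MachinePointerInfo::getFixedStack(MF, FrameIndex),
          MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
          MFI.getObjectAlign(FrameIndex));
    }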
@@ -375,7 +375,7 @@ unsigned ARCInstrInfo::insertBranch(MachineBasicBlock &MBB,
assert(!BytesAdded && "Code size not handled.");
// Shouldn't be a fall through.
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 3 || Cond.size() == 0) &&
"ARC branch conditions have two components!");
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.h b/llvm/lib/Target/ARC/ARCInstrInfo.h
index 6f894478d3d3..4f6122daf91f 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.h
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.h
@@ -68,13 +68,13 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned SrcReg,
+ MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned DestReg,
+ MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
diff --git a/llvm/lib/Target/ARC/ARCInstrInfo.td b/llvm/lib/Target/ARC/ARCInstrInfo.td
index 311d998f3d86..8fe393dfaf5b 100644
--- a/llvm/lib/Target/ARC/ARCInstrInfo.td
+++ b/llvm/lib/Target/ARC/ARCInstrInfo.td
@@ -34,7 +34,7 @@ def ARCGAWrapper : SDNode<"ARCISD::GAWRAPPER", SDT_ARCmov, []>;
// Comparison
def ARCcmp : SDNode<"ARCISD::CMP", SDT_ARCcmptst, [SDNPOutGlue]>;
-// Conditionanal mov
+// Conditional mov
def ARCcmov : SDNode<"ARCISD::CMOV", SDT_ARCcmov, [SDNPInGlue]>;
// Conditional Branch
@@ -206,7 +206,7 @@ multiclass ArcBinaryEXT5Inst<bits<6> mincode, string opasm> :
multiclass ArcUnaryGEN4Inst<bits<6> mincode, string opasm> :
ArcUnaryInst<0b00100, mincode, opasm>;
-// Pattern generation for differnt instruction variants.
+// Pattern generation for different instruction variants.
multiclass MultiPat<SDPatternOperator InFrag,
Instruction RRR, Instruction RRU6, Instruction RRLImm> {
def _rrr : Pat<(InFrag i32:$B, i32:$C), (RRR i32:$B, i32:$C)>;
@@ -215,7 +215,7 @@ multiclass MultiPat<SDPatternOperator InFrag,
}
// ---------------------------------------------------------------------------
-// Instruction defintions and patterns for 3 operand binary instructions.
+// Instruction definitions and patterns for 3 operand binary instructions.
// ---------------------------------------------------------------------------
// Definitions for 3 operand binary instructions.
@@ -344,7 +344,7 @@ let isBranch = 1, isTerminator = 1 in {
// At worst, this expands into 2 4-byte instructions.
def BRcc_rr_p : PseudoInstARC<(outs),
(ins btarget:$T, GPR32:$B, GPR32:$C, ccond:$cc),
- "pbr$cc\t$B, $C, $T",
+ "pbr$cc\t$B, $C, $T",
[(ARCbrcc bb:$T, i32:$B, i32:$C, imm32:$cc)]>
{ let Size = 8; }
@@ -430,7 +430,7 @@ def LEAVE_S : F16_SP_OPS<0b110,
(outs), (ins immU<7>:$u7), "leave_s\t$u7"> {
bits<7> u7;
-
+
let fieldB = u7{6-4};
let fieldU{4-1} = u7{3-0};
let fieldU{0} = 0b0;
@@ -440,7 +440,7 @@ def ENTER_S : F16_SP_OPS<0b111,
(outs), (ins immU<6>:$u6), "enter_s\t$u6"> {
bits<6> u6;
-
+
let fieldB{2} = 0;
let fieldB{1-0} = u6{5-4};
let fieldU{4-1} = u6{3-0};
@@ -452,19 +452,19 @@ def ENTER_S : F16_SP_OPS<0b111,
//----------------------------------------------------------------------------
class COMPACT_MOV_S :
F16_COMPACT<0b0, (outs GPR32:$g), (ins GPR32:$h),
- "mov_s\t$g, $h"> {
+ "mov_s\t$g, $h"> {
let DecoderMethod = "DecodeMoveHRegInstruction";
}
def COMPACT_MOV_S_limm : COMPACT_MOV_S {
- bits<32> LImm;
+ bits<32> LImm;
let Inst{47-16} = LImm;
- bits<5> LImmReg = 0b11110;
+ bits<5> LImmReg = 0b11110;
let Inst{7-5} = LImmReg{2-0};
let Inst{1-0} = LImmReg{4-3};
- let Size = 6;
+ let Size = 6;
}
def COMPACT_MOV_S_hreg : COMPACT_MOV_S;
@@ -548,9 +548,9 @@ def GP_ADD_S : F16_GP_LD_ADD<0b11, (ins immS<11>:$s),
//----------------------------------------------------------------------------
def PCL_LD : InstARC<2, (outs GPR32:$b), (ins immU<10>:$u10),
"ld_s\t$b, [%pcl, $u10]", []> {
-
- bits<3> b;
- bits<10> u10;
+
+ bits<3> b;
+ bits<10> u10;
let Inst{15-11} = 0b11010;
let Inst{10-8} = b;
@@ -587,11 +587,11 @@ def BL_S :
InstARC<2, (outs), (ins btargetS13:$s13), "bl_s\t$s13", []> {
let Inst{15-11} = 0b11111;
-
+
bits<13> s13;
let Inst{10-0} = s13{12-2};
let s13{1-0} = 0b00;
-
+
let isCall = 1;
let isBarrier = 1;
}
diff --git a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h
index d4dcf9bf285c..968c6b63f423 100644
--- a/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARC/ARCMachineFunctionInfo.h
@@ -33,10 +33,7 @@ public:
explicit ARCFunctionInfo(MachineFunction &MF)
: ReturnStackOffsetSet(false), VarArgsFrameIndex(0),
- ReturnStackOffset(-1U), MaxCallStackReq(0) {
- // Functions are 4-byte aligned.
- MF.setAlignment(Align(4));
- }
+ ReturnStackOffset(-1U), MaxCallStackReq(0) {}
~ARCFunctionInfo() {}
diff --git a/llvm/lib/Target/ARC/ARCRegisterInfo.cpp b/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
index 490f08930091..c49af8a45236 100644
--- a/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
+++ b/llvm/lib/Target/ARC/ARCRegisterInfo.cpp
@@ -153,11 +153,6 @@ bool ARCRegisterInfo::requiresRegisterScavenging(
return true;
}
-bool ARCRegisterInfo::trackLivenessAfterRegAlloc(
- const MachineFunction &MF) const {
- return true;
-}
-
bool ARCRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
return true;
}
diff --git a/llvm/lib/Target/ARC/ARCRegisterInfo.h b/llvm/lib/Target/ARC/ARCRegisterInfo.h
index af41234e9dda..f8bca11fdbc8 100644
--- a/llvm/lib/Target/ARC/ARCRegisterInfo.h
+++ b/llvm/lib/Target/ARC/ARCRegisterInfo.h
@@ -34,8 +34,6 @@ public:
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
diff --git a/llvm/lib/Target/ARC/ARCRegisterInfo.td b/llvm/lib/Target/ARC/ARCRegisterInfo.td
index 4b6744ad73da..82fdccc51466 100644
--- a/llvm/lib/Target/ARC/ARCRegisterInfo.td
+++ b/llvm/lib/Target/ARC/ARCRegisterInfo.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Declarations that describe the ARC register file
+// Declarations that describe the ARC register file
//===----------------------------------------------------------------------===//
class ARCReg<string n, list<string> altNames> : Register<n, altNames> {
@@ -27,35 +27,35 @@ class Status<string n> : ARCReg<n, []> {
// Integer registers
def R0 : Core< 0, "%r0">, DwarfRegNum<[0]>;
def R1 : Core< 1, "%r1">, DwarfRegNum<[1]>;
-def R2 : Core< 2, "%r2">, DwarfRegNum<[2]>;
+def R2 : Core< 2, "%r2">, DwarfRegNum<[2]>;
def R3 : Core< 3, "%r3">, DwarfRegNum<[3]>;
let CostPerUse=1 in {
def R4 : Core< 4, "%r4">, DwarfRegNum<[4]>;
-def R5 : Core< 5, "%r5">, DwarfRegNum<[5]>;
+def R5 : Core< 5, "%r5">, DwarfRegNum<[5]>;
def R6 : Core< 6, "%r6">, DwarfRegNum<[6]>;
def R7 : Core< 7, "%r7">, DwarfRegNum<[7]>;
def R8 : Core< 8, "%r8">, DwarfRegNum<[8]>;
def R9 : Core< 9, "%r9">, DwarfRegNum<[9]>;
-def R10 : Core<10, "%r10">, DwarfRegNum<[10]>;
+def R10 : Core<10, "%r10">, DwarfRegNum<[10]>;
def R11 : Core<11, "%r11">, DwarfRegNum<[11]>;
}
def R12 : Core<12, "%r12">, DwarfRegNum<[12]>;
-def R13 : Core<13, "%r13">, DwarfRegNum<[13]>;
+def R13 : Core<13, "%r13">, DwarfRegNum<[13]>;
def R14 : Core<14, "%r14">, DwarfRegNum<[14]>;
def R15 : Core<15, "%r15">, DwarfRegNum<[15]>;
let CostPerUse=1 in {
def R16 : Core<16, "%r16">, DwarfRegNum<[16]>;
def R17 : Core<17, "%r17">, DwarfRegNum<[17]>;
-def R18 : Core<18, "%r18">, DwarfRegNum<[18]>;
+def R18 : Core<18, "%r18">, DwarfRegNum<[18]>;
def R19 : Core<19, "%r19">, DwarfRegNum<[19]>;
def R20 : Core<20, "%r20">, DwarfRegNum<[20]>;
-def R21 : Core<21, "%r21">, DwarfRegNum<[21]>;
+def R21 : Core<21, "%r21">, DwarfRegNum<[21]>;
def R22 : Core<22, "%r22">, DwarfRegNum<[22]>;
def R23 : Core<23, "%r23">, DwarfRegNum<[23]>;
def R24 : Core<24, "%r24">, DwarfRegNum<[24]>;
def R25 : Core<25, "%r25">, DwarfRegNum<[25]>;
-def GP : Core<26, "%gp",["%r26"]>, DwarfRegNum<[26]>;
+def GP : Core<26, "%gp",["%r26"]>, DwarfRegNum<[26]>;
def FP : Core<27, "%fp", ["%r27"]>, DwarfRegNum<[27]>;
def SP : Core<28, "%sp", ["%r28"]>, DwarfRegNum<[28]>;
def ILINK : Core<29, "%ilink">, DwarfRegNum<[29]>;
diff --git a/llvm/lib/Target/ARC/ARCTargetMachine.cpp b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
index ab74fecb7804..4a5b6fd4d5bf 100644
--- a/llvm/lib/Target/ARC/ARCTargetMachine.cpp
+++ b/llvm/lib/Target/ARC/ARCTargetMachine.cpp
@@ -39,7 +39,7 @@ ARCTargetMachine::ARCTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
- Subtarget(TT, CPU, FS, *this) {
+ Subtarget(TT, std::string(CPU), std::string(FS), *this) {
initAsmInfo();
}
diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h b/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
index 53ca4066c02d..266f2de08772 100644
--- a/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
+++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCInstPrinter.h
@@ -36,6 +36,10 @@ public:
private:
void printMemOperandRI(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ raw_ostream &O) {
+ printOperand(MI, OpNum, O);
+ }
void printPredicateOperand(const MCInst *MI, unsigned OpNum, raw_ostream &O);
void printBRCCPredicateOperand(const MCInst *MI, unsigned OpNum,
raw_ostream &O);
diff --git a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
index 997e95e1a35f..3e3613ccb90f 100644
--- a/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARC/MCTargetDesc/ARCMCTargetDesc.cpp
@@ -57,7 +57,7 @@ static MCAsmInfo *createARCMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI = new ARCMCAsmInfo(TT);
// Initial state of the frame pointer is SP.
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, ARC::SP, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, ARC::SP, 0);
MAI->addInitialFrameState(Inst);
return MAI;
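The hunk above swaps MCCFIInstruction::createDefCfa for cfiDefCfa. As I understand the MC API of this era (worth confirming against MCDwarf.h), cfiDefCfa takes the CFA offset as written while the older helper negated it internally, so with an offset of 0 this is a pure rename. A hedged sketch of the new spelling:

    #include "llvm/MC/MCAsmInfo.h"
    #include "llvm/MC/MCDwarf.h"
    using namespace llvm;

    // Sketch: record "CFA = <stack pointer> + 0" as the initial frame state.
    // SPReg is whatever register number the target uses for its stack pointer.
    static void recordInitialCFA(MCAsmInfo &MAI, unsigned SPReg) {
      MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SPReg, 0);
      MAI.addInitialFrameState(Inst);
    }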
diff --git a/llvm/lib/Target/ARM/ARM.h b/llvm/lib/Target/ARM/ARM.h
index 3412813a3ef2..7398968bb24a 100644
--- a/llvm/lib/Target/ARM/ARM.h
+++ b/llvm/lib/Target/ARM/ARM.h
@@ -47,6 +47,7 @@ FunctionPass *createARMConstantIslandPass();
FunctionPass *createMLxExpansionPass();
FunctionPass *createThumb2ITBlockPass();
FunctionPass *createMVEVPTBlockPass();
+FunctionPass *createMVEVPTOptimisationsPass();
FunctionPass *createARMOptimizeBarriersPass();
FunctionPass *createThumb2SizeReductionPass(
std::function<bool(const Function &)> Ftor = nullptr);
@@ -66,6 +67,7 @@ void initializeARMExpandPseudoPass(PassRegistry &);
void initializeThumb2SizeReducePass(PassRegistry &);
void initializeThumb2ITBlockPass(PassRegistry &);
void initializeMVEVPTBlockPass(PassRegistry &);
+void initializeMVEVPTOptimisationsPass(PassRegistry &);
void initializeARMLowOverheadLoopsPass(PassRegistry &);
void initializeMVETailPredicationPass(PassRegistry &);
void initializeMVEGatherScatterLoweringPass(PassRegistry &);
diff --git a/llvm/lib/Target/ARM/ARM.td b/llvm/lib/Target/ARM/ARM.td
index 380eaa863689..0468f7f1cf8e 100644
--- a/llvm/lib/Target/ARM/ARM.td
+++ b/llvm/lib/Target/ARM/ARM.td
@@ -424,6 +424,13 @@ def FeatureNoPostRASched : SubtargetFeature<"disable-postra-scheduler",
def FeatureSB : SubtargetFeature<"sb", "HasSB", "true",
"Enable v8.5a Speculation Barrier" >;
+// Armv8.6-A extensions
+def FeatureBF16 : SubtargetFeature<"bf16", "HasBF16", "true",
+ "Enable support for BFloat16 instructions", [FeatureNEON]>;
+
+def FeatureMatMulInt8 : SubtargetFeature<"i8mm", "HasMatMulInt8",
+ "true", "Enable Matrix Multiply Int8 Extension", [FeatureNEON]>;
+
// Armv8.1-M extensions
def FeatureLOB : SubtargetFeature<"lob", "HasLOB", "true",
@@ -523,6 +530,11 @@ def HasV8_5aOps : SubtargetFeature<"v8.5a", "HasV8_5aOps", "true",
"Support ARM v8.5a instructions",
[HasV8_4aOps, FeatureSB]>;
+def HasV8_6aOps : SubtargetFeature<"v8.6a", "HasV8_6aOps", "true",
+ "Support ARM v8.6a instructions",
+ [HasV8_5aOps, FeatureBF16,
+ FeatureMatMulInt8]>;
+
def HasV8_1MMainlineOps : SubtargetFeature<
"v8.1m.main", "HasV8_1MMainlineOps", "true",
"Support ARM v8-1M Mainline instructions",
@@ -536,6 +548,16 @@ def HasMVEFloatOps : SubtargetFeature<
"Support M-Class Vector Extension with integer and floating ops",
[HasMVEIntegerOps, FeatureFPARMv8_D16_SP, FeatureFullFP16]>;
+def HasCDEOps : SubtargetFeature<"cde", "HasCDEOps", "true",
+ "Support CDE instructions",
+ [HasV8MMainlineOps]>;
+
+foreach i = {0-7} in
+ def FeatureCoprocCDE#i : SubtargetFeature<"cdecp"#i,
+ "CoprocCDE["#i#"]", "true",
+ "Coprocessor "#i#" ISA is CDEv1",
+ [HasCDEOps]>;
+
//===----------------------------------------------------------------------===//
// ARM Processor subtarget features.
//
@@ -572,6 +594,12 @@ def ProcA75 : SubtargetFeature<"a75", "ARMProcFamily", "CortexA75",
"Cortex-A75 ARM processors", []>;
def ProcA76 : SubtargetFeature<"a76", "ARMProcFamily", "CortexA76",
"Cortex-A76 ARM processors", []>;
+def ProcA77 : SubtargetFeature<"a77", "ARMProcFamily", "CortexA77",
+ "Cortex-A77 ARM processors", []>;
+def ProcA78 : SubtargetFeature<"cortex-a78", "ARMProcFamily", "CortexA78",
+ "Cortex-A78 ARM processors", []>;
+def ProcX1 : SubtargetFeature<"cortex-x1", "ARMProcFamily", "CortexX1",
+ "Cortex-X1 ARM processors", []>;
def ProcKrait : SubtargetFeature<"krait", "ARMProcFamily", "Krait",
"Qualcomm Krait processors", []>;
@@ -787,6 +815,19 @@ def ARMv85a : Architecture<"armv8.5-a", "ARMv85a", [HasV8_5aOps,
FeatureCRC,
FeatureRAS,
FeatureDotProd]>;
+def ARMv86a : Architecture<"armv8.6-a", "ARMv86a", [HasV8_6aOps,
+ FeatureAClass,
+ FeatureDB,
+ FeatureFPARMv8,
+ FeatureNEON,
+ FeatureDSP,
+ FeatureTrustZone,
+ FeatureMP,
+ FeatureVirtualization,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureRAS,
+ FeatureDotProd]>;
def ARMv8r : Architecture<"armv8-r", "ARMv8r", [HasV8Ops,
FeatureRClass,
@@ -1114,6 +1155,14 @@ def : ProcessorModel<"cortex-m35p", CortexM4Model, [ARMv8mMainline,
FeatureUseMISched,
FeatureHasNoBranchPredictor]>;
+def : ProcessorModel<"cortex-m55", CortexM4Model, [ARMv81mMainline,
+ FeatureDSP,
+ FeatureFPARMv8_D16,
+ FeatureUseMISched,
+ FeatureHasNoBranchPredictor,
+ FeaturePrefLoopAlign32,
+ FeatureHasSlowFPVMLx,
+ HasMVEFloatOps]>;
def : ProcNoItin<"cortex-a32", [ARMv8a,
FeatureHWDivThumb,
@@ -1181,6 +1230,30 @@ def : ProcNoItin<"cortex-a76ae", [ARMv82a, ProcA76,
FeatureFullFP16,
FeatureDotProd]>;
+def : ProcNoItin<"cortex-a77", [ARMv82a, ProcA77,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def : ProcNoItin<"cortex-a78", [ARMv82a, ProcA78,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
+def : ProcNoItin<"cortex-x1", [ARMv82a, ProcX1,
+ FeatureHWDivThumb,
+ FeatureHWDivARM,
+ FeatureCrypto,
+ FeatureCRC,
+ FeatureFullFP16,
+ FeatureDotProd]>;
+
def : ProcNoItin<"neoverse-n1", [ARMv82a,
FeatureHWDivThumb,
FeatureHWDivARM,
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
index 6f26ca127f94..d6c1efa6327c 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.cpp
@@ -57,26 +57,36 @@ ARMAsmPrinter::ARMAsmPrinter(TargetMachine &TM,
: AsmPrinter(TM, std::move(Streamer)), Subtarget(nullptr), AFI(nullptr),
MCP(nullptr), InConstantPool(false), OptimizationGoals(-1) {}
-void ARMAsmPrinter::EmitFunctionBodyEnd() {
+void ARMAsmPrinter::emitFunctionBodyEnd() {
// Make sure to terminate any constant pools that were at the end
// of the function.
if (!InConstantPool)
return;
InConstantPool = false;
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
}
-void ARMAsmPrinter::EmitFunctionEntryLabel() {
+void ARMAsmPrinter::emitFunctionEntryLabel() {
if (AFI->isThumbFunction()) {
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
- OutStreamer->EmitThumbFunc(CurrentFnSym);
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
+ OutStreamer->emitThumbFunc(CurrentFnSym);
} else {
- OutStreamer->EmitAssemblerFlag(MCAF_Code32);
+ OutStreamer->emitAssemblerFlag(MCAF_Code32);
}
- OutStreamer->EmitLabel(CurrentFnSym);
+
+ // Emit symbol for CMSE non-secure entry point
+ if (AFI->isCmseNSEntryFunction()) {
+ MCSymbol *S =
+ OutContext.getOrCreateSymbol("__acle_se_" + CurrentFnSym->getName());
+ emitLinkage(&MF->getFunction(), S);
+ OutStreamer->emitSymbolAttribute(S, MCSA_ELF_TypeFunction);
+ OutStreamer->emitLabel(S);
+ }
+
+ OutStreamer->emitLabel(CurrentFnSym);
}
-void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
+void ARMAsmPrinter::emitXXStructor(const DataLayout &DL, const Constant *CV) {
uint64_t Size = getDataLayout().getTypeAllocSize(CV->getType());
assert(Size && "C++ constructor pointer had zero size!");
@@ -90,17 +100,17 @@ void ARMAsmPrinter::EmitXXStructor(const DataLayout &DL, const Constant *CV) {
: MCSymbolRefExpr::VK_None),
OutContext);
- OutStreamer->EmitValue(E, Size);
+ OutStreamer->emitValue(E, Size);
}
-void ARMAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void ARMAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
if (PromotedGlobals.count(GV))
// The global was promoted into a constant pool. It should not be emitted.
return;
- AsmPrinter::EmitGlobalVariable(GV);
+ AsmPrinter::emitGlobalVariable(GV);
}
-/// runOnMachineFunction - This uses the EmitInstruction()
+/// runOnMachineFunction - This uses the emitInstruction()
/// method to print assembly for each instruction.
///
bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
@@ -158,7 +168,7 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
// Emit the rest of the function body.
- EmitFunctionBody();
+ emitFunctionBody();
// Emit the XRay table for this function.
emitXRayTable();
@@ -167,10 +177,10 @@ bool ARMAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
// These are created per function, rather than per TU, since it's
// relatively easy to exceed the thumb branch range within a TU.
if (! ThumbIndirectPads.empty()) {
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
- EmitAlignment(Align(2));
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
+ emitAlignment(Align(2));
for (std::pair<unsigned, MCSymbol *> &TIP : ThumbIndirectPads) {
- OutStreamer->EmitLabel(TIP.second);
+ OutStreamer->emitLabel(TIP.second);
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tBX)
.addReg(TIP.first)
// Add predicate operands.
@@ -467,14 +477,14 @@ void ARMAsmPrinter::emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
// the start mode, then restore the start mode.
const bool WasThumb = isThumb(StartInfo);
if (!EndInfo || WasThumb != isThumb(*EndInfo)) {
- OutStreamer->EmitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
+ OutStreamer->emitAssemblerFlag(WasThumb ? MCAF_Code16 : MCAF_Code32);
}
}
-void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
+void ARMAsmPrinter::emitStartOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
// Use unified assembler syntax.
- OutStreamer->EmitAssemblerFlag(MCAF_SyntaxUnified);
+ OutStreamer->emitAssemblerFlag(MCAF_SyntaxUnified);
// Emit ARM Build Attributes
if (TT.isOSBinFormatELF())
@@ -484,20 +494,20 @@ void ARMAsmPrinter::EmitStartOfAsmFile(Module &M) {
// if we're thumb for the purposes of the top level code16 assembler
// flag.
if (!M.getModuleInlineAsm().empty() && TT.isThumb())
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
}
static void
emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
MachineModuleInfoImpl::StubValueTy &MCSym) {
// L_foo$stub:
- OutStreamer.EmitLabel(StubLabel);
+ OutStreamer.emitLabel(StubLabel);
// .indirect_symbol _foo
- OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+ OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
if (MCSym.getInt())
// External to current translation unit.
- OutStreamer.EmitIntValue(0, 4/*size*/);
+ OutStreamer.emitIntValue(0, 4/*size*/);
else
// Internal to current translation unit.
//
@@ -505,13 +515,13 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
// pointers need to be indirect and pc-rel. We accomplish this by
// using NLPs; however, sometimes the types are local to the file.
// We need to fill in the value for the NLP in those cases.
- OutStreamer.EmitValue(
+ OutStreamer.emitValue(
MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
4 /*size*/);
}
-void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void ARMAsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
// All darwin targets use mach-o.
@@ -526,7 +536,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getNonLazySymbolPointerSection());
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -539,7 +549,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (!Stubs.empty()) {
// Switch with ".non_lazy_symbol_pointer" directive.
OutStreamer->SwitchSection(TLOFMacho.getThreadLocalPointerSection());
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
for (auto &Stub : Stubs)
emitNonLazySymbolPointer(*OutStreamer, Stub.first, Stub.second);
@@ -553,7 +563,7 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
// implementation of multiple entry points). If this doesn't occur, the
// linker can safely perform dead code stripping. Since LLVM never
// generates code that does this, it is always safe to set.
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
}
// The last attribute to be emitted is ABI_optimization_goals
@@ -570,18 +580,28 @@ void ARMAsmPrinter::EmitEndOfAsmFile(Module &M) {
}
//===----------------------------------------------------------------------===//
-// Helper routines for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+// Helper routines for emitStartOfAsmFile() and emitEndOfAsmFile()
// FIXME:
// The following seem like one-off assembler flags, but they actually need
// to appear in the .ARM.attributes section in ELF.
// Instead of subclassing the MCELFStreamer, we do the work here.
-// Returns true if all functions have the same function attribute value.
-// It also returns true when the module has no functions.
+ // Returns true if all functions have the same function attribute value.
+ // It also returns true when the module has no functions.
static bool checkFunctionsAttributeConsistency(const Module &M, StringRef Attr,
StringRef Value) {
+ return !any_of(M, [&](const Function &F) {
+ return F.getFnAttribute(Attr).getValueAsString() != Value;
+ });
+}
+// Returns true if all functions have the same denormal mode.
+// It also returns true when the module has no functions.
+static bool checkDenormalAttributeConsistency(const Module &M,
+ StringRef Attr,
+ DenormalMode Value) {
return !any_of(M, [&](const Function &F) {
- return F.getFnAttribute(Attr).getValueAsString() != Value;
+ StringRef AttrVal = F.getFnAttribute(Attr).getValueAsString();
+ return parseDenormalFPAttribute(AttrVal) != Value;
});
}
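The new checkDenormalAttributeConsistency compares a parsed DenormalMode instead of matching the attribute string directly, which also copes with the two-component "output,input" spelling of "denormal-fp-math". A small sketch of that parsing step in isolation (the function name is illustrative):

    #include "llvm/ADT/FloatingPointMode.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/IR/Function.h"
    using namespace llvm;

    // Illustrative: does this function's "denormal-fp-math" attribute request
    // flushing denormals to positive zero?
    static bool flushesToPositiveZero(const Function &F) {
      StringRef Val = F.getFnAttribute("denormal-fp-math").getValueAsString();
      // parseDenormalFPAttribute accepts both "positive-zero" and the
      // comma-separated output,input form of the attribute.
      return parseDenormalFPAttribute(Val) == DenormalMode::getPositiveZero();
    }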
@@ -606,11 +626,12 @@ void ARMAsmPrinter::emitAttributes() {
if (!ArchFS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
else
- ArchFS = FS;
+ ArchFS = std::string(FS);
}
const ARMBaseTargetMachine &ATM =
static_cast<const ARMBaseTargetMachine &>(TM);
- const ARMSubtarget STI(TT, CPU, ArchFS, ATM, ATM.isLittleEndian());
+ const ARMSubtarget STI(TT, std::string(CPU), ArchFS, ATM,
+ ATM.isLittleEndian());
// Emit build attributes for the available hardware.
ATS.emitTargetAttributes(STI);
@@ -641,16 +662,13 @@ void ARMAsmPrinter::emitAttributes() {
}
// Set FP Denormals.
- if (checkFunctionsAttributeConsistency(*MMI->getModule(),
- "denormal-fp-math",
- "preserve-sign") ||
- TM.Options.FPDenormalMode == FPDenormal::PreserveSign)
+ if (checkDenormalAttributeConsistency(*MMI->getModule(), "denormal-fp-math",
+ DenormalMode::getPreserveSign()))
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::PreserveFPSign);
- else if (checkFunctionsAttributeConsistency(*MMI->getModule(),
- "denormal-fp-math",
- "positive-zero") ||
- TM.Options.FPDenormalMode == FPDenormal::PositiveZero)
+ else if (checkDenormalAttributeConsistency(*MMI->getModule(),
+ "denormal-fp-math",
+ DenormalMode::getPositiveZero()))
ATS.emitAttribute(ARMBuildAttrs::ABI_FP_denormal,
ARMBuildAttrs::PositiveZero);
else if (!TM.Options.UnsafeFPMath)
@@ -855,8 +873,8 @@ MCSymbol *ARMAsmPrinter::GetARMGVSymbol(const GlobalValue *GV,
llvm_unreachable("unexpected target");
}
-void ARMAsmPrinter::
-EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+void ARMAsmPrinter::emitMachineConstantPoolValue(
+ MachineConstantPoolValue *MCPV) {
const DataLayout &DL = getDataLayout();
int Size = DL.getTypeAllocSize(MCPV->getType());
@@ -876,11 +894,11 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
for (const auto *GV : ACPC->promotedGlobals()) {
if (!EmittedPromotedGlobalLabels.count(GV)) {
MCSymbol *GVSym = getSymbol(GV);
- OutStreamer->EmitLabel(GVSym);
+ OutStreamer->emitLabel(GVSym);
EmittedPromotedGlobalLabels.insert(GV);
}
}
- return EmitGlobalConstant(DL, ACPC->getPromotedGlobalInit());
+ return emitGlobalConstant(DL, ACPC->getPromotedGlobalInit());
}
MCSymbol *MCSym;
@@ -925,29 +943,29 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
// We want "(<expr> - .)", but MC doesn't have a concept of the '.'
// label, so just emit a local label end reference that instead.
MCSymbol *DotSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(DotSym);
+ OutStreamer->emitLabel(DotSym);
const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
PCRelExpr = MCBinaryExpr::createSub(PCRelExpr, DotExpr, OutContext);
}
Expr = MCBinaryExpr::createSub(Expr, PCRelExpr, OutContext);
}
- OutStreamer->EmitValue(Expr, Size);
+ OutStreamer->emitValue(Expr, Size);
}
-void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
+void ARMAsmPrinter::emitJumpTableAddrs(const MachineInstr *MI) {
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
// Mark the jump table as data-in-code.
- OutStreamer->EmitDataRegion(MCDR_DataRegionJT32);
+ OutStreamer->emitDataRegion(MCDR_DataRegionJT32);
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
@@ -974,23 +992,23 @@ void ARMAsmPrinter::EmitJumpTableAddrs(const MachineInstr *MI) {
else if (AFI->isThumbFunction())
Expr = MCBinaryExpr::createAdd(Expr, MCConstantExpr::create(1,OutContext),
OutContext);
- OutStreamer->EmitValue(Expr, 4);
+ OutStreamer->emitValue(Expr, 4);
}
// Mark the end of jump table data-in-code region.
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
}
-void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
+void ARMAsmPrinter::emitJumpTableInsts(const MachineInstr *MI) {
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
// Make sure the Thumb jump table is 4-byte aligned. This will be a nop for
// ARM mode tables.
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
// Emit a label for the jump table.
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
@@ -1008,17 +1026,17 @@ void ARMAsmPrinter::EmitJumpTableInsts(const MachineInstr *MI) {
}
}
-void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
+void ARMAsmPrinter::emitJumpTableTBInst(const MachineInstr *MI,
unsigned OffsetWidth) {
assert((OffsetWidth == 1 || OffsetWidth == 2) && "invalid tbb/tbh width");
const MachineOperand &MO1 = MI->getOperand(1);
unsigned JTI = MO1.getIndex();
if (Subtarget->isThumb1Only())
- EmitAlignment(Align(4));
+ emitAlignment(Align(4));
MCSymbol *JTISymbol = GetARMJTIPICJumpTableLabel(JTI);
- OutStreamer->EmitLabel(JTISymbol);
+ OutStreamer->emitLabel(JTISymbol);
// Emit each entry of the table.
const MachineJumpTableInfo *MJTI = MF->getJumpTableInfo();
@@ -1026,7 +1044,7 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
const std::vector<MachineBasicBlock*> &JTBBs = JT[JTI].MBBs;
// Mark the jump table as data-in-code.
- OutStreamer->EmitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
+ OutStreamer->emitDataRegion(OffsetWidth == 1 ? MCDR_DataRegionJT8
: MCDR_DataRegionJT16);
for (auto MBB : JTBBs) {
@@ -1050,15 +1068,15 @@ void ARMAsmPrinter::EmitJumpTableTBInst(const MachineInstr *MI,
Expr = MCBinaryExpr::createSub(MBBSymbolExpr, Expr, OutContext);
Expr = MCBinaryExpr::createDiv(Expr, MCConstantExpr::create(2, OutContext),
OutContext);
- OutStreamer->EmitValue(Expr, OffsetWidth);
+ OutStreamer->emitValue(Expr, OffsetWidth);
}
// Mark the end of jump table data-in-code region. 32-bit offsets use
// actual branch instructions here, so we don't mark those as a data-region
// at all.
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
// Make sure the next instruction is 2-byte aligned.
- EmitAlignment(Align(2));
+ emitAlignment(Align(2));
}
void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
@@ -1076,16 +1094,26 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
unsigned Opc = MI->getOpcode();
unsigned SrcReg, DstReg;
- if (Opc == ARM::tPUSH || Opc == ARM::tLDRpci) {
- // Two special cases:
- // 1) tPUSH does not have src/dst regs.
- // 2) for Thumb1 code we sometimes materialize the constant via constpool
- // load. Yes, this is pretty fragile, but for now I don't see better
- // way... :(
+ switch (Opc) {
+ case ARM::tPUSH:
+ // special case: tPUSH does not have src/dst regs.
SrcReg = DstReg = ARM::SP;
- } else {
+ break;
+ case ARM::tLDRpci:
+ case ARM::t2MOVi16:
+ case ARM::t2MOVTi16:
+ // special cases:
+ // 1) for Thumb1 code we sometimes materialize the constant via constpool
+ // load.
+ // 2) for Thumb2 execute only code we materialize the constant via
+ // immediate constants in 2 separate instructions (MOVW/MOVT).
+ SrcReg = ~0U;
+ DstReg = MI->getOperand(0).getReg();
+ break;
+ default:
SrcReg = MI->getOperand(1).getReg();
DstReg = MI->getOperand(0).getReg();
+ break;
}
// Try to figure out the unwinding opcode out of src / dst regs.
@@ -1189,23 +1217,11 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
case ARM::tADDrSPi:
Offset = -MI->getOperand(2).getImm()*4;
break;
- case ARM::tLDRpci: {
- // Grab the constpool index and check, whether it corresponds to
- // original or cloned constpool entry.
- unsigned CPI = MI->getOperand(1).getIndex();
- const MachineConstantPool *MCP = MF.getConstantPool();
- if (CPI >= MCP->getConstants().size())
- CPI = AFI->getOriginalCPIdx(CPI);
- assert(CPI != -1U && "Invalid constpool index");
-
- // Derive the actual offset.
- const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
- assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
- // FIXME: Check for user, it should be "add" instruction!
- Offset = -cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+ case ARM::tADDhirr:
+ Offset =
+ -AFI->EHPrologueOffsetInRegs.lookup(MI->getOperand(2).getReg());
break;
}
- }
if (MAI->getExceptionHandlingType() == ExceptionHandling::ARM) {
if (DstReg == FramePtr && FramePtr != ARM::SP)
@@ -1225,14 +1241,43 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
} else if (DstReg == ARM::SP) {
MI->print(errs());
llvm_unreachable("Unsupported opcode for unwinding information");
- } else if (Opc == ARM::tMOVr) {
- // If a Thumb1 function spills r8-r11, we copy the values to low
- // registers before pushing them. Record the copy so we can emit the
- // correct ".save" later.
- AFI->EHPrologueRemappedRegs[DstReg] = SrcReg;
} else {
- MI->print(errs());
- llvm_unreachable("Unsupported opcode for unwinding information");
+ int64_t Offset = 0;
+ switch (Opc) {
+ case ARM::tMOVr:
+ // If a Thumb1 function spills r8-r11, we copy the values to low
+ // registers before pushing them. Record the copy so we can emit the
+ // correct ".save" later.
+ AFI->EHPrologueRemappedRegs[DstReg] = SrcReg;
+ break;
+ case ARM::tLDRpci: {
+ // Grab the constpool index and check, whether it corresponds to
+ // original or cloned constpool entry.
+ unsigned CPI = MI->getOperand(1).getIndex();
+ const MachineConstantPool *MCP = MF.getConstantPool();
+ if (CPI >= MCP->getConstants().size())
+ CPI = AFI->getOriginalCPIdx(CPI);
+ assert(CPI != -1U && "Invalid constpool index");
+
+ // Derive the actual offset.
+ const MachineConstantPoolEntry &CPE = MCP->getConstants()[CPI];
+ assert(!CPE.isMachineConstantPoolEntry() && "Invalid constpool entry");
+ Offset = cast<ConstantInt>(CPE.Val.ConstVal)->getSExtValue();
+ AFI->EHPrologueOffsetInRegs[DstReg] = Offset;
+ break;
+ }
+ case ARM::t2MOVi16:
+ Offset = MI->getOperand(1).getImm();
+ AFI->EHPrologueOffsetInRegs[DstReg] = Offset;
+ break;
+ case ARM::t2MOVTi16:
+ Offset = MI->getOperand(2).getImm();
+ AFI->EHPrologueOffsetInRegs[DstReg] |= (Offset << 16);
+ break;
+ default:
+ MI->print(errs());
+ llvm_unreachable("Unsupported opcode for unwinding information");
+ }
}
}
}
@@ -1241,7 +1286,7 @@ void ARMAsmPrinter::EmitUnwindingInstruction(const MachineInstr *MI) {
// instructions) auto-generated.
#include "ARMGenMCPseudoLowering.inc"
-void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void ARMAsmPrinter::emitInstruction(const MachineInstr *MI) {
const DataLayout &DL = getDataLayout();
MCTargetStreamer &TS = *OutStreamer->getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
@@ -1252,7 +1297,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// If we just ended a constant pool, mark it as such.
if (InConstantPool && MI->getOpcode() != ARM::CONSTPOOL_ENTRY) {
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
InConstantPool = false;
}
@@ -1513,7 +1558,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This is a pseudo op for a label used by a branch future instruction
// Emit the label.
- OutStreamer->EmitLabel(getBFLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getBFLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(0).getIndex(), OutContext));
return;
@@ -1525,7 +1570,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(2).getImm(), OutContext));
@@ -1546,7 +1591,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// This adds the address of LPC0 to r0.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(2).getImm(), OutContext));
@@ -1577,7 +1622,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// a PC-relative address at the ldr instruction.
// Emit the label.
- OutStreamer->EmitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
+ OutStreamer->emitLabel(getPICLabel(DL.getPrivateGlobalPrefix(),
getFunctionNumber(),
MI->getOperand(2).getImm(), OutContext));
@@ -1620,28 +1665,28 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// If this is the first entry of the pool, mark it.
if (!InConstantPool) {
- OutStreamer->EmitDataRegion(MCDR_DataRegion);
+ OutStreamer->emitDataRegion(MCDR_DataRegion);
InConstantPool = true;
}
- OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+ OutStreamer->emitLabel(GetCPISymbol(LabelId));
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
if (MCPE.isMachineConstantPoolEntry())
- EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ emitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
else
- EmitGlobalConstant(DL, MCPE.Val.ConstVal);
+ emitGlobalConstant(DL, MCPE.Val.ConstVal);
return;
}
case ARM::JUMPTABLE_ADDRS:
- EmitJumpTableAddrs(MI);
+ emitJumpTableAddrs(MI);
return;
case ARM::JUMPTABLE_INSTS:
- EmitJumpTableInsts(MI);
+ emitJumpTableInsts(MI);
return;
case ARM::JUMPTABLE_TBB:
case ARM::JUMPTABLE_TBH:
- EmitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
+ emitJumpTableTBInst(MI, MI->getOpcode() == ARM::JUMPTABLE_TBB ? 1 : 2);
return;
case ARM::t2BR_JT: {
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tMOVr)
@@ -1656,7 +1701,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case ARM::t2TBH_JT: {
unsigned Opc = MI->getOpcode() == ARM::t2TBB_JT ? ARM::t2TBB : ARM::t2TBH;
// Lower and emit the PC label, then the instruction itself.
- OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+ OutStreamer->emitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
EmitToStreamer(*OutStreamer, MCInstBuilder(Opc)
.addReg(MI->getOperand(0).getReg())
.addReg(MI->getOperand(1).getReg())
@@ -1698,7 +1743,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// FIXME: Ideally we could vary the LDRB index based on the padding
// between the sequence and jump table, however that relies on MCExprs
// for load indexes which are currently not supported.
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
.addReg(Idx)
.addReg(Idx)
@@ -1740,7 +1785,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer->EmitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
+ OutStreamer->emitLabel(GetCPISymbol(MI->getOperand(3).getImm()));
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::tADDhirr)
.addReg(ARM::PC)
.addReg(ARM::PC)
@@ -1809,7 +1854,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case ARM::SPACE:
- OutStreamer->EmitZeros(MI->getOperand(1).getImm());
+ OutStreamer->emitZeros(MI->getOperand(1).getImm());
return;
case ARM::TRAP: {
// Non-Darwin binutils don't yet support the "trap" mnemonic.
@@ -1904,7 +1949,7 @@ void ARMAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addImm(ARMCC::AL)
.addReg(0));
- OutStreamer->EmitLabel(Label);
+ OutStreamer->emitLabel(Label);
return;
}
diff --git a/llvm/lib/Target/ARM/ARMAsmPrinter.h b/llvm/lib/Target/ARM/ARMAsmPrinter.h
index a4b37fa2331f..f8ff047a1d06 100644
--- a/llvm/lib/Target/ARM/ARMAsmPrinter.h
+++ b/llvm/lib/Target/ARM/ARMAsmPrinter.h
@@ -84,21 +84,21 @@ public:
void emitInlineAsmEnd(const MCSubtargetInfo &StartInfo,
const MCSubtargetInfo *EndInfo) const override;
- void EmitJumpTableAddrs(const MachineInstr *MI);
- void EmitJumpTableInsts(const MachineInstr *MI);
- void EmitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitJumpTableAddrs(const MachineInstr *MI);
+ void emitJumpTableInsts(const MachineInstr *MI);
+ void emitJumpTableTBInst(const MachineInstr *MI, unsigned OffsetWidth);
+ void emitInstruction(const MachineInstr *MI) override;
bool runOnMachineFunction(MachineFunction &F) override;
- void EmitConstantPool() override {
+ void emitConstantPool() override {
// we emit constant pools customly!
}
- void EmitFunctionBodyEnd() override;
- void EmitFunctionEntryLabel() override;
- void EmitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
- void EmitXXStructor(const DataLayout &DL, const Constant *CV) override;
- void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void emitFunctionBodyEnd() override;
+ void emitFunctionEntryLabel() override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
+ void emitXXStructor(const DataLayout &DL, const Constant *CV) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
MCSymbol *GetCPISymbol(unsigned CPID) const override;
@@ -117,7 +117,7 @@ public:
private:
void EmitSled(const MachineInstr &MI, SledKind Kind);
- // Helpers for EmitStartOfAsmFile() and EmitEndOfAsmFile()
+ // Helpers for emitStartOfAsmFile() and emitEndOfAsmFile()
void emitAttributes();
// Generic helper used to emit e.g. ARMv5 mul pseudos
@@ -150,7 +150,7 @@ private:
public:
/// EmitMachineConstantPoolValue - Print a machine constantpool value to
/// the .s file.
- void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
};
} // end namespace llvm
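The header hunk above is one piece of a wider rename of AsmPrinter's virtual hooks from Emit* to emit*. A sketch of what a minimal printer override looks like after the rename (the class is invented for illustration and deliberately does nothing):

    #include "llvm/CodeGen/AsmPrinter.h"
    using namespace llvm;

    namespace {
    // Only the lower-camel-case spellings override anything now; the old
    // EmitInstruction/EmitStartOfAsmFile names would no longer override the
    // base class (the override keyword below flags that at compile time).
    class ToyAsmPrinter : public AsmPrinter {
    public:
      using AsmPrinter::AsmPrinter;
      void emitStartOfAsmFile(Module &) override {}
      void emitInstruction(const MachineInstr *) override {
        // A real target lowers the MachineInstr to an MCInst and streams it.
      }
    };
    } // end anonymous namespace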
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
index 48f781510254..4cc2b6bf7e7e 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.cpp
@@ -32,6 +32,7 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/ScoreboardHazardRecognizer.h"
@@ -495,6 +496,31 @@ bool ARMBaseInstrInfo::isPredicated(const MachineInstr &MI) const {
return PIdx != -1 && MI.getOperand(PIdx).getImm() != ARMCC::AL;
}
+std::string ARMBaseInstrInfo::createMIROperandComment(
+ const MachineInstr &MI, const MachineOperand &Op, unsigned OpIdx,
+ const TargetRegisterInfo *TRI) const {
+
+ // First, let's see if there is a generic comment for this operand
+ std::string GenericComment =
+ TargetInstrInfo::createMIROperandComment(MI, Op, OpIdx, TRI);
+ if (!GenericComment.empty())
+ return GenericComment;
+
+ // If not, check if we have an immediate operand.
+ if (Op.getType() != MachineOperand::MO_Immediate)
+ return std::string();
+
+ // And print its corresponding condition code if the immediate is a
+ // predicate.
+ int FirstPredOp = MI.findFirstPredOperandIdx();
+ if (FirstPredOp != (int) OpIdx)
+ return std::string();
+
+ std::string CC = "CC::";
+ CC += ARMCondCodeToString((ARMCC::CondCodes)Op.getImm());
+ return CC;
+}
+
bool ARMBaseInstrInfo::PredicateInstruction(
MachineInstr &MI, ArrayRef<MachineOperand> Pred) const {
unsigned Opc = MI.getOpcode();
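createMIROperandComment above lets the MIR printer annotate the raw ARM predicate immediate (for example 14, i.e. ARMCC::AL) with a readable "CC::al" note. Roughly how a printer-side caller consumes the hook (this is paraphrased, not the actual MIRPrinter code):

    #include "llvm/CodeGen/MachineInstr.h"
    #include "llvm/CodeGen/TargetInstrInfo.h"
    #include "llvm/Support/raw_ostream.h"
    #include <string>
    using namespace llvm;

    // Illustrative only: print operand OpIdx of MI, followed by the target's
    // comment in block-comment form when one is provided.
    static void printOperandWithComment(raw_ostream &OS, const MachineInstr &MI,
                                        unsigned OpIdx,
                                        const TargetInstrInfo &TII,
                                        const TargetRegisterInfo *TRI) {
      const MachineOperand &Op = MI.getOperand(OpIdx);
      std::string Comment = TII.createMIROperandComment(MI, Op, OpIdx, TRI);
      OS << Op; // MachineOperand has a raw_ostream stream operator.
      if (!Comment.empty())
        OS << " /* " << Comment << " */";
    }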
@@ -811,7 +837,7 @@ void llvm::addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB) {
}
void llvm::addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB,
- unsigned DestReg) {
+ Register DestReg) {
addUnpredicatedMveVpredNOp(MIB);
MIB.addReg(DestReg, RegState::Undef);
}
@@ -1009,6 +1035,36 @@ ARMBaseInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
return DestSourcePair{MI.getOperand(0), MI.getOperand(1)};
}
+Optional<ParamLoadedValue>
+ARMBaseInstrInfo::describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const {
+ if (auto DstSrcPair = isCopyInstrImpl(MI)) {
+ Register DstReg = DstSrcPair->Destination->getReg();
+
+ // TODO: We don't handle cases where the forwarding reg is narrower/wider
+ // than the copy registers. Consider for example:
+ //
+ // s16 = VMOVS s0
+ // s17 = VMOVS s1
+ // call @callee(d0)
+ //
+ // We'd like to describe the call site value of d0 as d8, but this requires
+ // gathering and merging the descriptions for the two VMOVS instructions.
+ //
+ // We also don't handle the reverse situation, where the forwarding reg is
+ // narrower than the copy destination:
+ //
+ // d8 = VMOVD d0
+ // call @callee(s1)
+ //
+ // We need to produce a fragment description (the call site value of s1 is
+ // /not/ just d8).
+ if (DstReg != Reg)
+ return None;
+ }
+ return TargetInstrInfo::describeLoadedValue(MI, Reg);
+}
+
const MachineInstrBuilder &
ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
unsigned SubIdx, unsigned State,
@@ -1023,16 +1079,16 @@ ARMBaseInstrInfo::AddDReg(MachineInstrBuilder &MIB, unsigned Reg,
void ARMBaseInstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
+ Align Alignment = MFI.getObjectAlign(FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), Align);
+ MFI.getObjectSize(FI), Alignment);
switch (TRI->getSpillSize(*RC)) {
case 2:
@@ -1102,7 +1158,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 16:
if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
// Use aligned spills if the stack can be realigned.
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF)) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1q64))
.addFrameIndex(FI)
.addImm(16)
@@ -1130,7 +1186,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
// Use aligned spills if the stack can be realigned.
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
BuildMI(MBB, I, DebugLoc(), get(ARM::VST1d64TPseudo))
.addFrameIndex(FI)
@@ -1153,7 +1209,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
// FIXME: It's possible to only store part of the QQ register if the
// spilled def has a sub-register index.
@@ -1264,17 +1320,17 @@ unsigned ARMBaseInstrInfo::isStoreToStackSlotPostFE(const MachineInstr &MI,
void ARMBaseInstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
+ const Align Alignment = MFI.getObjectAlign(FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), Align);
+ MFI.getObjectSize(FI), Alignment);
switch (TRI->getSpillSize(*RC)) {
case 2:
@@ -1343,7 +1399,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 16:
if (ARM::DPairRegClass.hasSubClassEq(RC) && Subtarget.hasNEON()) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF)) {
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF)) {
BuildMI(MBB, I, DL, get(ARM::VLD1q64), DestReg)
.addFrameIndex(FI)
.addImm(16)
@@ -1367,7 +1423,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 24:
if (ARM::DTripleRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64TPseudo), DestReg)
.addFrameIndex(FI)
@@ -1390,7 +1446,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
break;
case 32:
if (ARM::QQPRRegClass.hasSubClassEq(RC) || ARM::DQuadRegClass.hasSubClassEq(RC)) {
- if (Align >= 16 && getRegisterInfo().canRealignStack(MF) &&
+ if (Alignment >= 16 && getRegisterInfo().canRealignStack(MF) &&
Subtarget.hasNEON()) {
BuildMI(MBB, I, DL, get(ARM::VLD1d64QPseudo), DestReg)
.addFrameIndex(FI)
@@ -1682,13 +1738,13 @@ static unsigned duplicateCPV(MachineFunction &MF, unsigned &CPI) {
cast<ARMConstantPoolMBB>(ACPV)->getMBB(), PCLabelId, 4);
else
llvm_unreachable("Unexpected ARM constantpool value type!!");
- CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlignment());
+ CPI = MCP->getConstantPoolIndex(NewCPV, MCPE.getAlign());
return PCLabelId;
}
void ARMBaseInstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
unsigned Opcode = Orig.getOpcode();
@@ -1959,6 +2015,10 @@ bool ARMBaseInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
if (MI.isTerminator() || MI.isPosition())
return true;
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
+
// Treat the start of the IT block as a scheduling boundary, but schedule
// t2IT along with all instructions following it.
// FIXME: This is a big hammer. But the alternative is to add all potential
@@ -2120,7 +2180,7 @@ ARMBaseInstrInfo::isProfitableToUnpredicate(MachineBasicBlock &TMBB,
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
ARMCC::CondCodes llvm::getInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg) {
+ Register &PredReg) {
int PIdx = MI.findFirstPredOperandIdx();
if (PIdx == -1) {
PredReg = 0;
@@ -2150,7 +2210,7 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
case ARM::MOVCCr:
case ARM::t2MOVCCr: {
// MOVCC can be commuted by inverting the condition.
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getInstrPredicate(MI, PredReg);
// MOVCC AL can't be inverted. Shouldn't happen.
if (CC == ARMCC::AL || PredReg != ARM::CPSR)
@@ -2171,9 +2231,9 @@ MachineInstr *ARMBaseInstrInfo::commuteInstructionImpl(MachineInstr &MI,
/// Identify instructions that can be folded into a MOVCC instruction, and
/// return the defining instruction.
MachineInstr *
-ARMBaseInstrInfo::canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
+ARMBaseInstrInfo::canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const {
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
return nullptr;
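Several signatures in this file move from plain unsigned register numbers to the Register wrapper, and the hunk above shows the classification check becoming a member call. A tiny sketch of the equivalence (the helper name is invented):

    #include "llvm/CodeGen/Register.h"
    using namespace llvm;

    // Register converts implicitly to and from the old raw unsigned encoding,
    // so the member form is equivalent to Register::isVirtualRegister(Reg).
    static bool isFoldCandidate(Register Reg) {
      return Reg.isVirtual();
    }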
@@ -2353,9 +2413,9 @@ unsigned llvm::convertAddSubFlagsOpcode(unsigned OldOpc) {
void llvm::emitARMRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII,
unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
@@ -2515,7 +2575,7 @@ bool llvm::tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
}
bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MI.getDesc();
@@ -2671,8 +2731,8 @@ bool llvm::rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
-bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
@@ -2708,7 +2768,7 @@ bool ARMBaseInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
/// operates on the given source register and applies the same mask
/// as a 'tst' instruction. Provide a limited look-through for copies.
/// When successful, MI will hold the found instruction.
-static bool isSuitableForMask(MachineInstr *&MI, unsigned SrcReg,
+static bool isSuitableForMask(MachineInstr *&MI, Register SrcReg,
int CmpMask, bool CommonUse) {
switch (MI->getOpcode()) {
case ARM::ANDri:
@@ -2743,7 +2803,7 @@ inline static ARMCC::CondCodes getCmpToAddCondition(ARMCC::CondCodes CC) {
/// CMPrr(r0, r1) can be made redundant by ADDr[ri](r0, r1, X).
/// This function can be extended later on.
inline static bool isRedundantFlagInstr(const MachineInstr *CmpI,
- unsigned SrcReg, unsigned SrcReg2,
+ Register SrcReg, Register SrcReg2,
int ImmValue, const MachineInstr *OI,
bool &IsThumb1) {
if ((CmpI->getOpcode() == ARM::CMPrr || CmpI->getOpcode() == ARM::t2CMPrr) &&
@@ -2879,7 +2939,7 @@ static bool isOptimizeCompareCandidate(MachineInstr *MI, bool &IsThumb1) {
/// operands are swapped: SUBrr(r1,r2) and CMPrr(r2,r1), by updating the
/// condition code of instructions which use the flags.
bool ARMBaseInstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int CmpMask,
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int CmpMask,
int CmpValue, const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
@@ -3166,7 +3226,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
return true;
MachineBasicBlock::const_iterator Next = &MI;
++Next;
- unsigned SrcReg, SrcReg2;
+ Register SrcReg, SrcReg2;
int CmpMask, CmpValue;
bool IsThumb1;
if (Next != MI.getParent()->end() &&
@@ -3177,7 +3237,7 @@ bool ARMBaseInstrInfo::shouldSink(const MachineInstr &MI) const {
}
bool ARMBaseInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg,
+ Register Reg,
MachineRegisterInfo *MRI) const {
// Fold large immediates into add, sub, or, xor.
unsigned DefOpc = DefMI.getOpcode();
@@ -3729,7 +3789,7 @@ unsigned ARMBaseInstrInfo::getNumMicroOps(const InstrItineraryData *ItinData,
// If there are odd number of registers or if it's not 64-bit aligned,
// then it takes an extra AGU (Address Generation Unit) cycle.
if ((NumRegs % 2) || !MI.hasOneMemOperand() ||
- (*MI.memoperands_begin())->getAlignment() < 8)
+ (*MI.memoperands_begin())->getAlign() < Align(8))
++UOps;
return UOps;
}
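getNumMicroOps above now compares a typed Align, and the latency hunks that follow use getAlign().value() wherever a plain integer is still needed. A small sketch of the two spellings side by side (the predicate itself is illustrative):

    #include "llvm/CodeGen/MachineMemOperand.h"
    #include "llvm/Support/Alignment.h"
    using namespace llvm;

    // Illustrative: is this memory access known to be at least 8-byte aligned?
    static bool isAtLeast8ByteAligned(const MachineMemOperand &MMO) {
      // Equivalent integer form: MMO.getAlign().value() >= 8.
      return !(MMO.getAlign() < Align(8));
    }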
@@ -4316,10 +4376,10 @@ int ARMBaseInstrInfo::getOperandLatencyImpl(
return -1;
unsigned DefAlign = DefMI.hasOneMemOperand()
- ? (*DefMI.memoperands_begin())->getAlignment()
+ ? (*DefMI.memoperands_begin())->getAlign().value()
: 0;
unsigned UseAlign = UseMI.hasOneMemOperand()
- ? (*UseMI.memoperands_begin())->getAlignment()
+ ? (*UseMI.memoperands_begin())->getAlign().value()
: 0;
// Get the itinerary's latency if possible, and handle variable_ops.
@@ -4366,10 +4426,12 @@ ARMBaseInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
const MCInstrDesc &UseMCID = get(UseNode->getMachineOpcode());
auto *DefMN = cast<MachineSDNode>(DefNode);
unsigned DefAlign = !DefMN->memoperands_empty()
- ? (*DefMN->memoperands_begin())->getAlignment() : 0;
+ ? (*DefMN->memoperands_begin())->getAlign().value()
+ : 0;
auto *UseMN = cast<MachineSDNode>(UseNode);
unsigned UseAlign = !UseMN->memoperands_empty()
- ? (*UseMN->memoperands_begin())->getAlignment() : 0;
+ ? (*UseMN->memoperands_begin())->getAlign().value()
+ : 0;
int Latency = getOperandLatency(ItinData, DefMCID, DefIdx, DefAlign,
UseMCID, UseIdx, UseAlign);
@@ -4660,7 +4722,7 @@ unsigned ARMBaseInstrInfo::getInstrLatency(const InstrItineraryData *ItinData,
// Adjust for dynamic def-side opcode variants not captured by the itinerary.
unsigned DefAlign =
- MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlignment() : 0;
+ MI.hasOneMemOperand() ? (*MI.memoperands_begin())->getAlign().value() : 0;
int Adj = adjustDefLatency(Subtarget, MI, MCID, DefAlign);
if (Adj >= 0 || (int)Latency > -Adj) {
return Latency + Adj;
@@ -4782,7 +4844,7 @@ void ARMBaseInstrInfo::expandLoadStackGuardBase(MachineBasicBlock::iterator MI,
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4));
MIB.addMemOperand(MMO).add(predOps(ARMCC::AL));
}
@@ -5353,7 +5415,8 @@ Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI,
// TODO: Handle cases where Reg is a super- or sub-register of the
// destination register.
- if (Reg != MI.getOperand(0).getReg())
+ const MachineOperand &Op0 = MI.getOperand(0);
+ if (!Op0.isReg() || Reg != Op0.getReg())
return None;
// We describe SUBri or ADDri instructions.
@@ -5365,8 +5428,7 @@ Optional<RegImmPair> ARMBaseInstrInfo::isAddImmediate(const MachineInstr &MI,
// TODO: Third operand can be global address (usually some string). Since
// strings can be relocated we cannot calculate their offsets for
// now.
- if (!MI.getOperand(0).isReg() || !MI.getOperand(1).isReg() ||
- !MI.getOperand(2).isImm())
+ if (!MI.getOperand(1).isReg() || !MI.getOperand(2).isImm())
return None;
Offset = MI.getOperand(2).getImm() * Sign;
@@ -5402,7 +5464,7 @@ MachineInstr *llvm::findCMPToFoldIntoCBZ(MachineInstr *Br,
if (CmpMI->getOpcode() != ARM::tCMPi8 && CmpMI->getOpcode() != ARM::t2CMPri)
return nullptr;
Register Reg = CmpMI->getOperand(0).getReg();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*CmpMI, PredReg);
if (Pred != ARMCC::AL || CmpMI->getOperand(1).getImm() != 0)
return nullptr;
@@ -5460,3 +5522,521 @@ bool llvm::HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
return ConstantMaterializationCost(Val1, Subtarget, !ForCodesize) <
ConstantMaterializationCost(Val2, Subtarget, !ForCodesize);
}
+
+/// Constants defining how certain sequences should be outlined.
+/// This encompasses how an outlined function should be called, and what kind of
+/// frame should be emitted for that outlined function.
+///
+/// \p MachineOutlinerTailCall implies that the function is being created from
+/// a sequence of instructions ending in a return.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> B OUTLINED_FUNCTION I1
+/// BX LR I2
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 4 | 4 |
+/// | Frame overhead in Bytes | 0 | 0 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerThunk implies that the function is being created from
+/// a sequence of instructions ending in a call. The outlined function is
+/// called with a BL instruction, and the outlined function tail-calls the
+/// original call destination.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// BL f I2
+/// B f
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 4 | 4 |
+/// | Frame overhead in Bytes | 0 | 0 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerNoLRSave implies that the function should be called using
+/// a BL instruction, but doesn't require LR to be saved and restored. This
+/// happens when LR is known to be dead.
+///
+/// That is,
+///
+/// I1 OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// I3 I2
+/// I3
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 4 | 4 |
+/// | Frame overhead in Bytes | 4 | 4 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+///
+/// \p MachineOutlinerRegSave implies that the function should be called with a
+/// save and restore of LR to an available register. This allows us to avoid
+/// stack fixups. Note that this outlining variant is compatible with the
+/// NoLRSave case.
+///
+/// That is,
+///
+/// I1 Save LR OUTLINED_FUNCTION:
+/// I2 --> BL OUTLINED_FUNCTION I1
+/// I3 Restore LR I2
+/// I3
+/// BX LR
+///
+/// +-------------------------+--------+-----+
+/// | | Thumb2 | ARM |
+/// +-------------------------+--------+-----+
+/// | Call overhead in Bytes | 8 | 12 |
+/// | Frame overhead in Bytes | 2 | 4 |
+/// | Stack fixup required | No | No |
+/// +-------------------------+--------+-----+
+
+enum MachineOutlinerClass {
+ MachineOutlinerTailCall,
+ MachineOutlinerThunk,
+ MachineOutlinerNoLRSave,
+ MachineOutlinerRegSave
+};
+
+enum MachineOutlinerMBBFlags {
+ LRUnavailableSomewhere = 0x2,
+ HasCalls = 0x4,
+ UnsafeRegsDead = 0x8
+};
+
+struct OutlinerCosts {
+ const int CallTailCall;
+ const int FrameTailCall;
+ const int CallThunk;
+ const int FrameThunk;
+ const int CallNoLRSave;
+ const int FrameNoLRSave;
+ const int CallRegSave;
+ const int FrameRegSave;
+
+ OutlinerCosts(const ARMSubtarget &target)
+ : CallTailCall(target.isThumb() ? 4 : 4),
+ FrameTailCall(target.isThumb() ? 0 : 0),
+ CallThunk(target.isThumb() ? 4 : 4),
+ FrameThunk(target.isThumb() ? 0 : 0),
+ CallNoLRSave(target.isThumb() ? 4 : 4),
+ FrameNoLRSave(target.isThumb() ? 4 : 4),
+ CallRegSave(target.isThumb() ? 8 : 12),
+ FrameRegSave(target.isThumb() ? 2 : 4) {}
+};
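To make the overhead tables above concrete, here is a rough back-of-the-envelope reading of the Thumb2 RegSave costs. The 20-byte sequence and the four call sites are invented numbers for illustration; the real profitability decision is made later by the generic outliner from the per-candidate call info set up in getOutliningCandidateInfo.

    original size : 4 sites x 20 bytes                            = 80 bytes
    outlined size : 4 x 8 (CallRegSave) + 20 + 2 (FrameRegSave)   = 54 bytes
    net saving    : 80 - 54                                       = 26 bytes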
+
+unsigned
+ARMBaseInstrInfo::findRegisterToSaveLRTo(const outliner::Candidate &C) const {
+ assert(C.LRUWasSet && "LRU wasn't set?");
+ MachineFunction *MF = C.getMF();
+ const ARMBaseRegisterInfo *ARI = static_cast<const ARMBaseRegisterInfo *>(
+ MF->getSubtarget().getRegisterInfo());
+
+ BitVector regsReserved = ARI->getReservedRegs(*MF);
+ // Check if there is an available register across the sequence that we can
+ // use.
+ for (unsigned Reg : ARM::rGPRRegClass) {
+ if (!(Reg < regsReserved.size() && regsReserved.test(Reg)) &&
+ Reg != ARM::LR && // LR is not reserved, but don't use it.
+ Reg != ARM::R12 && // R12 is not guaranteed to be preserved.
+ C.LRU.available(Reg) && C.UsedInSequence.available(Reg))
+ return Reg;
+ }
+
+ // No suitable register. Return 0.
+ return 0u;
+}
+
+outliner::OutlinedFunction ARMBaseInstrInfo::getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const {
+ outliner::Candidate &FirstCand = RepeatedSequenceLocs[0];
+ unsigned SequenceSize =
+ std::accumulate(FirstCand.front(), std::next(FirstCand.back()), 0,
+ [this](unsigned Sum, const MachineInstr &MI) {
+ return Sum + getInstSizeInBytes(MI);
+ });
+
+ // Properties about candidate MBBs that hold for all of them.
+ unsigned FlagsSetInAll = 0xF;
+
+ // Compute liveness information for each candidate, and set FlagsSetInAll.
+ const TargetRegisterInfo &TRI = getRegisterInfo();
+ std::for_each(
+ RepeatedSequenceLocs.begin(), RepeatedSequenceLocs.end(),
+ [&FlagsSetInAll](outliner::Candidate &C) { FlagsSetInAll &= C.Flags; });
+
+ // According to the ARM Procedure Call Standard, the following are
+ // undefined on entry/exit from a function call:
+ //
+ // * Register R12(IP),
+ // * Condition codes (and thus the CPSR register)
+ //
+ // Since we control the instructions which are part of the outlined regions
+ // we don't need to be fully compliant with the AAPCS, but we have to
+ // guarantee that if a veneer is inserted at link time the code is still
+ // correct. Because of this, we can't outline any sequence of instructions
+ // where one of these registers is live into/across it. Thus, we need to
+ // delete those candidates.
+ auto CantGuaranteeValueAcrossCall = [&TRI](outliner::Candidate &C) {
+ // If the unsafe registers in this block are all dead, then we don't need
+ // to compute liveness here.
+ if (C.Flags & UnsafeRegsDead)
+ return false;
+ C.initLRU(TRI);
+ LiveRegUnits LRU = C.LRU;
+ return (!LRU.available(ARM::R12) || !LRU.available(ARM::CPSR));
+ };
+
+ // Are there any candidates where those registers are live?
+ if (!(FlagsSetInAll & UnsafeRegsDead)) {
+ // Erase every candidate that violates the restrictions above. (It could be
+ // true that we have viable candidates, so it's not worth bailing out in
+    // the case that, say, 1 out of 20 candidates violates the restrictions.)
+ RepeatedSequenceLocs.erase(std::remove_if(RepeatedSequenceLocs.begin(),
+ RepeatedSequenceLocs.end(),
+ CantGuaranteeValueAcrossCall),
+ RepeatedSequenceLocs.end());
+
+ // If the sequence doesn't have enough candidates left, then we're done.
+ if (RepeatedSequenceLocs.size() < 2)
+ return outliner::OutlinedFunction();
+ }
+
+ // At this point, we have only "safe" candidates to outline. Figure out
+ // frame + call instruction information.
+
+ unsigned LastInstrOpcode = RepeatedSequenceLocs[0].back()->getOpcode();
+
+ // Helper lambda which sets call information for every candidate.
+ auto SetCandidateCallInfo =
+ [&RepeatedSequenceLocs](unsigned CallID, unsigned NumBytesForCall) {
+ for (outliner::Candidate &C : RepeatedSequenceLocs)
+ C.setCallInfo(CallID, NumBytesForCall);
+ };
+
+ OutlinerCosts Costs(Subtarget);
+ unsigned FrameID = 0;
+ unsigned NumBytesToCreateFrame = 0;
+
+ // If the last instruction in any candidate is a terminator, then we should
+ // tail call all of the candidates.
+ if (RepeatedSequenceLocs[0].back()->isTerminator()) {
+ FrameID = MachineOutlinerTailCall;
+ NumBytesToCreateFrame = Costs.FrameTailCall;
+ SetCandidateCallInfo(MachineOutlinerTailCall, Costs.CallTailCall);
+ } else if (LastInstrOpcode == ARM::BL || LastInstrOpcode == ARM::BLX ||
+ LastInstrOpcode == ARM::tBL || LastInstrOpcode == ARM::tBLXr ||
+ LastInstrOpcode == ARM::tBLXi) {
+ FrameID = MachineOutlinerThunk;
+ NumBytesToCreateFrame = Costs.FrameThunk;
+ SetCandidateCallInfo(MachineOutlinerThunk, Costs.CallThunk);
+ } else {
+ // We need to decide how to emit calls + frames. We can always emit the same
+ // frame if we don't need to save to the stack.
+ unsigned NumBytesNoStackCalls = 0;
+ std::vector<outliner::Candidate> CandidatesWithoutStackFixups;
+
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ C.initLRU(TRI);
+
+ // Is LR available? If so, we don't need a save.
+ if (C.LRU.available(ARM::LR)) {
+ FrameID = MachineOutlinerNoLRSave;
+ NumBytesNoStackCalls += Costs.CallNoLRSave;
+ C.setCallInfo(MachineOutlinerNoLRSave, Costs.CallNoLRSave);
+ CandidatesWithoutStackFixups.push_back(C);
+ }
+
+ // Is an unused register available? If so, we won't modify the stack, so
+ // we can outline with the same frame type as those that don't save LR.
+ else if (findRegisterToSaveLRTo(C)) {
+ FrameID = MachineOutlinerRegSave;
+ NumBytesNoStackCalls += Costs.CallRegSave;
+ C.setCallInfo(MachineOutlinerRegSave, Costs.CallRegSave);
+ CandidatesWithoutStackFixups.push_back(C);
+ }
+ }
+
+ if (!CandidatesWithoutStackFixups.empty()) {
+ RepeatedSequenceLocs = CandidatesWithoutStackFixups;
+ } else
+ return outliner::OutlinedFunction();
+ }
+
+ return outliner::OutlinedFunction(RepeatedSequenceLocs, SequenceSize,
+ NumBytesToCreateFrame, FrameID);
+}
+
+bool ARMBaseInstrInfo::isFunctionSafeToOutlineFrom(
+ MachineFunction &MF, bool OutlineFromLinkOnceODRs) const {
+ const Function &F = MF.getFunction();
+
+ // Can F be deduplicated by the linker? If it can, don't outline from it.
+ if (!OutlineFromLinkOnceODRs && F.hasLinkOnceODRLinkage())
+ return false;
+
+ // Don't outline from functions with section markings; the program could
+ // expect that all the code is in the named section.
+ // FIXME: Allow outlining from multiple functions with the same section
+ // marking.
+ if (F.hasSection())
+ return false;
+
+ // FIXME: Thumb1 outlining is not handled
+ if (MF.getInfo<ARMFunctionInfo>()->isThumb1OnlyFunction())
+ return false;
+
+ // It's safe to outline from MF.
+ return true;
+}
+
+bool ARMBaseInstrInfo::isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+ unsigned &Flags) const {
+ // Check if LR is available through all of the MBB. If it's not, then set
+ // a flag.
+ assert(MBB.getParent()->getRegInfo().tracksLiveness() &&
+ "Suitable Machine Function for outlining must track liveness");
+
+ LiveRegUnits LRU(getRegisterInfo());
+
+ std::for_each(MBB.rbegin(), MBB.rend(),
+ [&LRU](MachineInstr &MI) { LRU.accumulate(MI); });
+
+  // Check if each of the unsafe registers is available...
+ bool R12AvailableInBlock = LRU.available(ARM::R12);
+ bool CPSRAvailableInBlock = LRU.available(ARM::CPSR);
+
+ // If all of these are dead (and not live out), we know we don't have to check
+ // them later.
+ if (R12AvailableInBlock && CPSRAvailableInBlock)
+ Flags |= MachineOutlinerMBBFlags::UnsafeRegsDead;
+
+ // Now, add the live outs to the set.
+ LRU.addLiveOuts(MBB);
+
+  // If any of these registers is available in the MBB but is also live out of
+  // the block, then we know outlining is unsafe.
+ if (R12AvailableInBlock && !LRU.available(ARM::R12))
+ return false;
+ if (CPSRAvailableInBlock && !LRU.available(ARM::CPSR))
+ return false;
+
+ // Check if there's a call inside this MachineBasicBlock. If there is, then
+ // set a flag.
+ if (any_of(MBB, [](MachineInstr &MI) { return MI.isCall(); }))
+ Flags |= MachineOutlinerMBBFlags::HasCalls;
+
+ if (!LRU.available(ARM::LR))
+ Flags |= MachineOutlinerMBBFlags::LRUnavailableSomewhere;
+
+ return true;
+}
+
+outliner::InstrType
+ARMBaseInstrInfo::getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const {
+ MachineInstr &MI = *MIT;
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+
+ // Be conservative with inline ASM
+ if (MI.isInlineAsm())
+ return outliner::InstrType::Illegal;
+
+ // Don't allow debug values to impact outlining type.
+ if (MI.isDebugInstr() || MI.isIndirectDebugValue())
+ return outliner::InstrType::Invisible;
+
+  // At this point, KILL or IMPLICIT_DEF instructions don't really tell us
+  // much, so we can go ahead and skip over them.
+ if (MI.isKill() || MI.isImplicitDef())
+ return outliner::InstrType::Invisible;
+
+  // PIC instructions contain labels; outlining them would break offset
+  // computation.
+ unsigned Opc = MI.getOpcode();
+ if (Opc == ARM::tPICADD || Opc == ARM::PICADD || Opc == ARM::PICSTR ||
+ Opc == ARM::PICSTRB || Opc == ARM::PICSTRH || Opc == ARM::PICLDR ||
+ Opc == ARM::PICLDRB || Opc == ARM::PICLDRH || Opc == ARM::PICLDRSB ||
+ Opc == ARM::PICLDRSH || Opc == ARM::t2LDRpci_pic ||
+ Opc == ARM::t2MOVi16_ga_pcrel || Opc == ARM::t2MOVTi16_ga_pcrel ||
+ Opc == ARM::t2MOV_ga_pcrel)
+ return outliner::InstrType::Illegal;
+
+ // Be conservative with ARMv8.1 MVE instructions.
+ if (Opc == ARM::t2BF_LabelPseudo || Opc == ARM::t2DoLoopStart ||
+ Opc == ARM::t2WhileLoopStart || Opc == ARM::t2LoopDec ||
+ Opc == ARM::t2LoopEnd)
+ return outliner::InstrType::Illegal;
+
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t MIFlags = MCID.TSFlags;
+ if ((MIFlags & ARMII::DomainMask) == ARMII::DomainMVE)
+ return outliner::InstrType::Illegal;
+
+ // Is this a terminator for a basic block?
+ if (MI.isTerminator()) {
+ // Don't outline if the branch is not unconditional.
+ if (isPredicated(MI))
+ return outliner::InstrType::Illegal;
+
+ // Is this the end of a function?
+ if (MI.getParent()->succ_empty())
+ return outliner::InstrType::Legal;
+
+ // It's not, so don't outline it.
+ return outliner::InstrType::Illegal;
+ }
+
+ // Make sure none of the operands are un-outlinable.
+ for (const MachineOperand &MOP : MI.operands()) {
+ if (MOP.isCPI() || MOP.isJTI() || MOP.isCFIIndex() || MOP.isFI() ||
+ MOP.isTargetIndex())
+ return outliner::InstrType::Illegal;
+ }
+
+  // Don't outline if the link register or program counter values are used.
+ if (MI.readsRegister(ARM::LR, TRI) || MI.readsRegister(ARM::PC, TRI))
+ return outliner::InstrType::Illegal;
+
+ if (MI.isCall()) {
+ // If we don't know anything about the callee, assume it depends on the
+ // stack layout of the caller. In that case, it's only legal to outline
+ // as a tail-call. Explicitly list the call instructions we know about so
+ // we don't get unexpected results with call pseudo-instructions.
+ auto UnknownCallOutlineType = outliner::InstrType::Illegal;
+ if (Opc == ARM::BL || Opc == ARM::tBL || Opc == ARM::BLX ||
+ Opc == ARM::tBLXr || Opc == ARM::tBLXi)
+ UnknownCallOutlineType = outliner::InstrType::LegalTerminator;
+
+ return UnknownCallOutlineType;
+ }
+
+ // Since calls are handled, don't touch LR or PC
+ if (MI.modifiesRegister(ARM::LR, TRI) || MI.modifiesRegister(ARM::PC, TRI))
+ return outliner::InstrType::Illegal;
+
+ // Does this use the stack?
+ if (MI.modifiesRegister(ARM::SP, TRI) || MI.readsRegister(ARM::SP, TRI)) {
+ // True if there is no chance that any outlined candidate from this range
+ // could require stack fixups. That is, both
+ // * LR is available in the range (No save/restore around call)
+ // * The range doesn't include calls (No save/restore in outlined frame)
+ // are true.
+ // FIXME: This is very restrictive; the flags check the whole block,
+ // not just the bit we will try to outline.
+ bool MightNeedStackFixUp =
+ (Flags & (MachineOutlinerMBBFlags::LRUnavailableSomewhere |
+ MachineOutlinerMBBFlags::HasCalls));
+
+ if (!MightNeedStackFixUp)
+ return outliner::InstrType::Legal;
+
+ return outliner::InstrType::Illegal;
+ }
+
+ // Be conservative with IT blocks.
+ if (MI.readsRegister(ARM::ITSTATE, TRI) ||
+ MI.modifiesRegister(ARM::ITSTATE, TRI))
+ return outliner::InstrType::Illegal;
+
+ // Don't outline positions.
+ if (MI.isPosition())
+ return outliner::InstrType::Illegal;
+
+ return outliner::InstrType::Legal;
+}
+
+void ARMBaseInstrInfo::buildOutlinedFrame(
+ MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const {
+ // Nothing is needed for tail-calls.
+ if (OF.FrameConstructionID == MachineOutlinerTailCall)
+ return;
+
+ // For thunk outlining, rewrite the last instruction from a call to a
+ // tail-call.
+ if (OF.FrameConstructionID == MachineOutlinerThunk) {
+ MachineInstr *Call = &*--MBB.instr_end();
+ bool isThumb = Subtarget.isThumb();
+ unsigned FuncOp = isThumb ? 2 : 0;
+ unsigned Opc = Call->getOperand(FuncOp).isReg()
+ ? isThumb ? ARM::tTAILJMPr : ARM::TAILJMPr
+ : isThumb ? Subtarget.isTargetMachO() ? ARM::tTAILJMPd
+ : ARM::tTAILJMPdND
+ : ARM::TAILJMPd;
+ MachineInstrBuilder MIB = BuildMI(MBB, MBB.end(), DebugLoc(), get(Opc))
+ .add(Call->getOperand(FuncOp));
+ if (isThumb && !Call->getOperand(FuncOp).isReg())
+ MIB.add(predOps(ARMCC::AL));
+ Call->eraseFromParent();
+ return;
+ }
+
+  // Here we have to insert the return ourselves. Get the correct opcode from
+  // the current feature set.
+ BuildMI(MBB, MBB.end(), DebugLoc(), get(Subtarget.getReturnOpcode()))
+ .add(predOps(ARMCC::AL));
+}
+
+MachineBasicBlock::iterator ARMBaseInstrInfo::insertOutlinedCall(
+ Module &M, MachineBasicBlock &MBB, MachineBasicBlock::iterator &It,
+ MachineFunction &MF, const outliner::Candidate &C) const {
+ MachineInstrBuilder MIB;
+ MachineBasicBlock::iterator CallPt;
+ unsigned Opc;
+ bool isThumb = Subtarget.isThumb();
+
+ // Are we tail calling?
+ if (C.CallConstructionID == MachineOutlinerTailCall) {
+ // If yes, then we can just branch to the label.
+ Opc = isThumb
+ ? Subtarget.isTargetMachO() ? ARM::tTAILJMPd : ARM::tTAILJMPdND
+ : ARM::TAILJMPd;
+ MIB = BuildMI(MF, DebugLoc(), get(Opc))
+ .addGlobalAddress(M.getNamedValue(MF.getName()));
+ if (isThumb)
+ MIB.add(predOps(ARMCC::AL));
+ It = MBB.insert(It, MIB);
+ return It;
+ }
+
+ // Create the call instruction.
+ Opc = isThumb ? ARM::tBL : ARM::BL;
+ MachineInstrBuilder CallMIB = BuildMI(MF, DebugLoc(), get(Opc));
+ if (isThumb)
+ CallMIB.add(predOps(ARMCC::AL));
+ CallMIB.addGlobalAddress(M.getNamedValue(MF.getName()));
+
+ // Can we save to a register?
+ if (C.CallConstructionID == MachineOutlinerRegSave) {
+ unsigned Reg = findRegisterToSaveLRTo(C);
+ assert(Reg != 0 && "No callee-saved register available?");
+
+ // Save and restore LR from that register.
+ if (!MBB.isLiveIn(ARM::LR))
+ MBB.addLiveIn(ARM::LR);
+ copyPhysReg(MBB, It, DebugLoc(), Reg, ARM::LR, true);
+ CallPt = MBB.insert(It, CallMIB);
+ copyPhysReg(MBB, It, DebugLoc(), ARM::LR, Reg, true);
+ It--;
+ return CallPt;
+ }
+ // Insert the call.
+ It = MBB.insert(It, CallMIB);
+ return It;
+}
diff --git a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
index f6d4ebe3a090..1a75b011ca59 100644
--- a/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseInstrInfo.h
@@ -21,6 +21,8 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include <array>
#include <cstdint>
@@ -105,6 +107,11 @@ protected:
Optional<DestSourcePair>
isCopyInstrImpl(const MachineInstr &MI) const override;
+ /// Specialization of \ref TargetInstrInfo::describeLoadedValue, used to
+ /// enhance debug entry value descriptions for ARM targets.
+ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const override;
+
public:
// Return whether the target has an explicit NOP encoding.
bool hasNOP() const;
@@ -146,6 +153,12 @@ public:
// Predication support.
bool isPredicated(const MachineInstr &MI) const override;
+ // MIR printer helper function to annotate Operands with a comment.
+ std::string
+ createMIROperandComment(const MachineInstr &MI, const MachineOperand &Op,
+ unsigned OpIdx,
+ const TargetRegisterInfo *TRI) const override;
+
ARMCC::CondCodes getPredicate(const MachineInstr &MI) const {
int PIdx = MI.findFirstPredOperandIdx();
return PIdx != -1 ? (ARMCC::CondCodes)MI.getOperand(PIdx).getImm()
@@ -207,13 +220,13 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -222,7 +235,7 @@ public:
bool shouldSink(const MachineInstr &MI) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
@@ -286,16 +299,16 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Convert the instruction to set the zero flag so
/// that we can remove a "comparison with zero"; Remove a redundant CMP
/// instruction if the flags can be updated in the same way by an earlier
/// instruction such as SUB.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
bool analyzeSelect(const MachineInstr &MI,
@@ -308,7 +321,7 @@ public:
/// FoldImmediate - 'Reg' is known to be defined by a move immediate
/// instruction, try to fold the immediate into the use instruction.
- bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;
unsigned getNumMicroOps(const InstrItineraryData *ItinData,
@@ -343,7 +356,27 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableBitmaskMachineOperandTargetFlags() const override;
+ /// ARM supports the MachineOutliner.
+ bool isFunctionSafeToOutlineFrom(MachineFunction &MF,
+ bool OutlineFromLinkOnceODRs) const override;
+ outliner::OutlinedFunction getOutliningCandidateInfo(
+ std::vector<outliner::Candidate> &RepeatedSequenceLocs) const override;
+ outliner::InstrType getOutliningType(MachineBasicBlock::iterator &MIT,
+ unsigned Flags) const override;
+ bool isMBBSafeToOutlineFrom(MachineBasicBlock &MBB,
+ unsigned &Flags) const override;
+ void buildOutlinedFrame(MachineBasicBlock &MBB, MachineFunction &MF,
+ const outliner::OutlinedFunction &OF) const override;
+ MachineBasicBlock::iterator
+ insertOutlinedCall(Module &M, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator &It, MachineFunction &MF,
+ const outliner::Candidate &C) const override;
+
private:
+  /// Returns an unused general-purpose register which can be used for
+  /// constructing an outlined call, if one exists; returns 0 otherwise.
+ unsigned findRegisterToSaveLRTo(const outliner::Candidate &C) const;
+
unsigned getInstBundleLength(const MachineInstr &MI) const;
int getVLDMDefCycle(const InstrItineraryData *ItinData,
@@ -403,7 +436,7 @@ private:
/// Identify instructions that can be folded into a MOVCC instruction, and
/// return the defining instruction.
- MachineInstr *canFoldIntoMOVCC(unsigned Reg, const MachineRegisterInfo &MRI,
+ MachineInstr *canFoldIntoMOVCC(Register Reg, const MachineRegisterInfo &MRI,
const TargetInstrInfo *TII) const;
private:
@@ -491,24 +524,6 @@ bool isUncondBranchOpcode(int Opc) {
// This table shows the VPT instruction variants, i.e. the different
// mask field encodings, see also B5.6. Predication/conditional execution in
// the ArmARM.
-enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
-};
-
static inline bool isVPTOpcode(int Opc) {
return Opc == ARM::MVE_VPTv16i8 || Opc == ARM::MVE_VPTv16u8 ||
Opc == ARM::MVE_VPTv16s8 || Opc == ARM::MVE_VPTv8i16 ||
@@ -595,6 +610,18 @@ unsigned VCTPOpcodeToLSTP(unsigned Opcode, bool IsDoLoop) {
return 0;
}
+static inline unsigned getTailPredVectorWidth(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ llvm_unreachable("unhandled vctp opcode");
+ case ARM::MVE_VCTP8: return 16;
+ case ARM::MVE_VCTP16: return 8;
+ case ARM::MVE_VCTP32: return 4;
+ case ARM::MVE_VCTP64: return 2;
+ }
+ return 0;
+}
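Reading aid, not part of the patch: the values returned above are simply the lane counts of a 128-bit MVE vector for each VCTP element size.

    128 / 8  = 16 lanes (MVE_VCTP8)
    128 / 16 =  8 lanes (MVE_VCTP16)
    128 / 32 =  4 lanes (MVE_VCTP32)
    128 / 64 =  2 lanes (MVE_VCTP64)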
+
static inline
bool isVCTP(MachineInstr *MI) {
switch (MI->getOpcode()) {
@@ -642,20 +669,31 @@ static inline bool isPushOpcode(int Opc) {
Opc == ARM::STMDB_UPD || Opc == ARM::VSTMDDB_UPD;
}
+static inline bool isSubImmOpcode(int Opc) {
+ return Opc == ARM::SUBri ||
+ Opc == ARM::tSUBi3 || Opc == ARM::tSUBi8 ||
+ Opc == ARM::tSUBSi3 || Opc == ARM::tSUBSi8 ||
+ Opc == ARM::t2SUBri || Opc == ARM::t2SUBri12 || Opc == ARM::t2SUBSri;
+}
+
+static inline bool isMovRegOpcode(int Opc) {
+ return Opc == ARM::MOVr || Opc == ARM::tMOVr || Opc == ARM::t2MOVr;
+}
/// isValidCoprocessorNumber - decide whether an explicit coprocessor
/// number is legal in generic instructions like CDP. The answer can
/// vary with the subtarget.
static inline bool isValidCoprocessorNumber(unsigned Num,
const FeatureBitset& featureBits) {
+  // In Armv7 and Armv8-M, CP10 and CP11 clash with VFP/NEON; however, these
+  // coprocessor numbers are still valid for CDP/MCR/MRC and friends. Allowing
+  // them is useful for code shared with older architectures which do not know
+  // the new VFP/NEON mnemonics.
+
// Armv8-A disallows everything *other* than 111x (CP14 and CP15).
if (featureBits[ARM::HasV8Ops] && (Num & 0xE) != 0xE)
return false;
- // Armv7 disallows 101x (CP10 and CP11), which clash with VFP/NEON.
- if (featureBits[ARM::HasV7Ops] && (Num & 0xE) == 0xA)
- return false;
-
- // Armv8.1-M also disallows 100x (CP8,CP9) and 111x (CP14,CP15)
+ // Armv8.1-M disallows 100x (CP8,CP9) and 111x (CP14,CP15)
// which clash with MVE.
if (featureBits[ARM::HasV8_1MMainlineOps] &&
((Num & 0xE) == 0x8 || (Num & 0xE) == 0xE))
@@ -667,7 +705,7 @@ static inline bool isValidCoprocessorNumber(unsigned Num,
/// getInstrPredicate - If instruction is predicated, returns its predicate
/// condition, otherwise returns AL. It also returns the condition code
/// register by reference.
-ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+ARMCC::CondCodes getInstrPredicate(const MachineInstr &MI, Register &PredReg);
unsigned getMatchingCondBranchOpcode(unsigned Opc);
@@ -681,21 +719,21 @@ unsigned convertAddSubFlagsOpcode(unsigned OldOpc);
/// code.
void emitARMRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitT2RegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII, unsigned MIFlags = 0);
void emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI,
unsigned MIFlags = 0);
@@ -714,11 +752,11 @@ bool tryFoldSPUpdateIntoPushPop(const ARMSubtarget &Subtarget,
/// offset could not be handled directly in MI, and return the left-over
/// portion by reference.
bool rewriteARMFrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII);
bool rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII,
const TargetRegisterInfo *TRI);
@@ -733,7 +771,7 @@ MachineInstr *findCMPToFoldIntoCBZ(MachineInstr *Br,
const TargetRegisterInfo *TRI);
void addUnpredicatedMveVpredNOp(MachineInstrBuilder &MIB);
-void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned DestReg);
+void addUnpredicatedMveVpredROp(MachineInstrBuilder &MIB, Register DestReg);
void addPredicatedMveVpredNOp(MachineInstrBuilder &MIB, unsigned Cond);
void addPredicatedMveVpredROp(MachineInstrBuilder &MIB, unsigned Cond,
@@ -753,6 +791,70 @@ bool HasLowerConstantMaterializationCost(unsigned Val1, unsigned Val2,
const ARMSubtarget *Subtarget,
bool ForCodesize = false);
+// Return the immediate if this is ADDri or SUBri, scaled as appropriate.
+// Returns 0 for unknown instructions.
+inline int getAddSubImmediate(MachineInstr &MI) {
+ int Scale = 1;
+ unsigned ImmOp;
+ switch (MI.getOpcode()) {
+ case ARM::t2ADDri:
+ ImmOp = 2;
+ break;
+ case ARM::t2SUBri:
+ case ARM::t2SUBri12:
+ ImmOp = 2;
+ Scale = -1;
+ break;
+ case ARM::tSUBi3:
+ case ARM::tSUBi8:
+ ImmOp = 3;
+ Scale = -1;
+ break;
+ default:
+ return 0;
+ }
+ return Scale * MI.getOperand(ImmOp).getImm();
+}
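A minimal usage sketch for getAddSubImmediate; DefMI and Offset are hypothetical names, not taken from this patch. Because unknown opcodes return 0, the result can be folded into a signed offset unconditionally.

    // Hypothetical caller: DefMI is assumed to be a MachineInstr that may or
    // may not be one of the handled add/sub-immediate forms.
    int Offset = 0;
    Offset += getAddSubImmediate(DefMI); // e.g. -8 for "t2SUBri ..., #8"; 0 otherwise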
+
+// Given a memory access Opcode, check that the given Imm would be a valid
+// Offset for this instruction using its addressing mode.
+inline bool isLegalAddressImm(unsigned Opcode, int Imm,
+ const TargetInstrInfo *TII) {
+ const MCInstrDesc &Desc = TII->get(Opcode);
+ unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
+ switch (AddrMode) {
+ case ARMII::AddrModeT2_i7:
+ return std::abs(Imm) < (((1 << 7) * 1) - 1);
+ case ARMII::AddrModeT2_i7s2:
+ return std::abs(Imm) < (((1 << 7) * 2) - 1) && Imm % 2 == 0;
+ case ARMII::AddrModeT2_i7s4:
+ return std::abs(Imm) < (((1 << 7) * 4) - 1) && Imm % 4 == 0;
+ default:
+ llvm_unreachable("Unhandled Addressing mode");
+ }
+}
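A hedged example of the AddrModeT2_i7s4 case; Opc and TII are assumed to be in scope, and Opc must name an instruction that really uses that addressing mode (other modes hit the llvm_unreachable default). Legal offsets are multiples of 4 with magnitude below 511, so the largest accepted value is 508.

    // Hypothetical values, for illustration only.
    bool Ok  = isLegalAddressImm(Opc, 508, TII); // true: multiple of 4, |508| < 511
    bool Bad = isLegalAddressImm(Opc, 510, TII); // false: 510 is not a multiple of 4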
+
+// Return true if the given intrinsic is a gather or scatter
+inline bool isGatherScatter(IntrinsicInst *IntInst) {
+ if (IntInst == nullptr)
+ return false;
+ unsigned IntrinsicID = IntInst->getIntrinsicID();
+ return (IntrinsicID == Intrinsic::masked_gather ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_base_wb_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset ||
+ IntrinsicID == Intrinsic::arm_mve_vldr_gather_offset_predicated ||
+ IntrinsicID == Intrinsic::masked_scatter ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_base_wb_predicated ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset ||
+ IntrinsicID == Intrinsic::arm_mve_vstr_scatter_offset_predicated);
+}
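A small sketch of how this predicate might be wrapped; the helper name is invented for illustration. Since isGatherScatter already tolerates a null pointer, a failed dyn_cast can be passed straight through.

    // Hypothetical helper: true when I is one of the masked or MVE
    // gather/scatter intrinsics listed above.
    static bool isMVEGatherScatter(llvm::Instruction &I) {
      return isGatherScatter(llvm::dyn_cast<llvm::IntrinsicInst>(&I));
    }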
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_ARM_ARMBASEINSTRINFO_H
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
index 52e6d05c3155..3579635f83b5 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp
@@ -220,10 +220,25 @@ getReservedRegs(const MachineFunction &MF) const {
}
bool ARMBaseRegisterInfo::
-isAsmClobberable(const MachineFunction &MF, unsigned PhysReg) const {
+isAsmClobberable(const MachineFunction &MF, MCRegister PhysReg) const {
return !getReservedRegs(MF).test(PhysReg);
}
+bool ARMBaseRegisterInfo::isInlineAsmReadOnlyReg(const MachineFunction &MF,
+ unsigned PhysReg) const {
+ const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
+ const ARMFrameLowering *TFI = getFrameLowering(MF);
+
+ BitVector Reserved(getNumRegs());
+ markSuperRegs(Reserved, ARM::PC);
+ if (TFI->hasFP(MF))
+ markSuperRegs(Reserved, getFramePointerReg(STI));
+ if (hasBasePointer(MF))
+ markSuperRegs(Reserved, BasePtr);
+ assert(checkAllSuperRegsMarked(Reserved));
+ return Reserved.test(PhysReg);
+}
+
const TargetRegisterClass *
ARMBaseRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
const MachineFunction &MF) const {
@@ -289,7 +304,8 @@ ARMBaseRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC,
}
// Get the other register in a GPRPair.
-static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
+static MCPhysReg getPairedGPR(MCPhysReg Reg, bool Odd,
+ const MCRegisterInfo *RI) {
for (MCSuperRegIterator Supers(Reg, RI); Supers.isValid(); ++Supers)
if (ARM::GPRPairRegClass.contains(*Supers))
return RI->getSubReg(*Supers, Odd ? ARM::gsub_1 : ARM::gsub_0);
@@ -297,15 +313,12 @@ static unsigned getPairedGPR(unsigned Reg, bool Odd, const MCRegisterInfo *RI) {
}
// Resolve the RegPairEven / RegPairOdd register allocator hints.
-bool
-ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
- SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
- const LiveRegMatrix *Matrix) const {
+bool ARMBaseRegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo &MRI = MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI.getRegAllocationHint(VirtReg);
+ std::pair<Register, Register> Hint = MRI.getRegAllocationHint(VirtReg);
unsigned Odd;
switch (Hint.first) {
@@ -323,12 +336,12 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
// This register should preferably be even (Odd == 0) or odd (Odd == 1).
// Check if the other part of the pair has already been assigned, and provide
// the paired register as the first hint.
- unsigned Paired = Hint.second;
- if (Paired == 0)
+ Register Paired = Hint.second;
+ if (!Paired)
return false;
- unsigned PairedPhys = 0;
- if (Register::isPhysicalRegister(Paired)) {
+ Register PairedPhys;
+ if (Paired.isPhysical()) {
PairedPhys = Paired;
} else if (VRM && VRM->hasPhys(Paired)) {
PairedPhys = getPairedGPR(VRM->getPhys(Paired), Odd, this);
@@ -339,11 +352,11 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
Hints.push_back(PairedPhys);
// Then prefer even or odd registers.
- for (unsigned Reg : Order) {
+ for (MCPhysReg Reg : Order) {
if (Reg == PairedPhys || (getEncodingValue(Reg) & 1) != Odd)
continue;
// Don't provide hints that are paired to a reserved register.
- unsigned Paired = getPairedGPR(Reg, !Odd, this);
+ MCPhysReg Paired = getPairedGPR(Reg, !Odd, this);
if (!Paired || MRI.isReserved(Paired))
continue;
Hints.push_back(Reg);
@@ -351,27 +364,27 @@ ARMBaseRegisterInfo::getRegAllocationHints(unsigned VirtReg,
return false;
}
-void
-ARMBaseRegisterInfo::updateRegAllocHint(unsigned Reg, unsigned NewReg,
- MachineFunction &MF) const {
+void ARMBaseRegisterInfo::updateRegAllocHint(Register Reg, Register NewReg,
+ MachineFunction &MF) const {
MachineRegisterInfo *MRI = &MF.getRegInfo();
- std::pair<unsigned, unsigned> Hint = MRI->getRegAllocationHint(Reg);
- if ((Hint.first == (unsigned)ARMRI::RegPairOdd ||
- Hint.first == (unsigned)ARMRI::RegPairEven) &&
- Register::isVirtualRegister(Hint.second)) {
+ std::pair<Register, Register> Hint = MRI->getRegAllocationHint(Reg);
+ if ((Hint.first == ARMRI::RegPairOdd || Hint.first == ARMRI::RegPairEven) &&
+ Hint.second.isVirtual()) {
// If 'Reg' is one of the even / odd register pair and it's now changed
// (e.g. coalesced) into a different register. The other register of the
// pair allocation hint must be updated to reflect the relationship
// change.
- unsigned OtherReg = Hint.second;
+ Register OtherReg = Hint.second;
Hint = MRI->getRegAllocationHint(OtherReg);
// Make sure the pair has not already divorced.
if (Hint.second == Reg) {
MRI->setRegAllocationHint(OtherReg, Hint.first, NewReg);
if (Register::isVirtualRegister(NewReg))
MRI->setRegAllocationHint(NewReg,
- Hint.first == (unsigned)ARMRI::RegPairOdd ? ARMRI::RegPairEven
- : ARMRI::RegPairOdd, OtherReg);
+ Hint.first == ARMRI::RegPairOdd
+ ? ARMRI::RegPairEven
+ : ARMRI::RegPairOdd,
+ OtherReg);
}
}
}
@@ -457,14 +470,14 @@ ARMBaseRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
/// specified immediate.
void ARMBaseRegisterInfo::emitLoadConstPool(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, Register PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C =
ConstantInt::get(Type::getInt32Ty(MF.getFunction().getContext()), Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
BuildMI(MBB, MBBI, dl, TII.get(ARM::LDRcp))
.addReg(DestReg, getDefRegState(true), SubIdx)
@@ -480,11 +493,6 @@ requiresRegisterScavenging(const MachineFunction &MF) const {
}
bool ARMBaseRegisterInfo::
-trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- return true;
-}
-
-bool ARMBaseRegisterInfo::
requiresFrameIndexScavenging(const MachineFunction &MF) const {
return true;
}
@@ -606,9 +614,9 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
// The FP is only available if there is no dynamic realignment. We
// don't know for sure yet whether we'll need that, so we guess based
// on whether there are any local variables that would trigger it.
- unsigned StackAlign = TFI->getStackAlignment();
if (TFI->hasFP(MF) &&
- !((MFI.getLocalFrameMaxAlign() > StackAlign) && canRealignStack(MF))) {
+ !((MFI.getLocalFrameMaxAlign() > TFI->getStackAlign()) &&
+ canRealignStack(MF))) {
if (isFrameOffsetLegal(MI, getFrameRegister(MF), FPOffset))
return false;
}
@@ -626,10 +634,10 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
/// materializeFrameBaseRegister - Insert defining instruction(s) for BaseReg to
/// be a pointer to FrameIdx at the beginning of the basic block.
-void ARMBaseRegisterInfo::
-materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
- int64_t Offset) const {
+void ARMBaseRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ Register BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
ARMFunctionInfo *AFI = MBB->getParent()->getInfo<ARMFunctionInfo>();
unsigned ADDriOpc = !AFI->isThumbFunction() ? ARM::ADDri :
(AFI->isThumb1OnlyFunction() ? ARM::tADDframe : ARM::t2ADDri);
@@ -652,7 +660,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
MIB.add(predOps(ARMCC::AL)).add(condCodeOp());
}
-void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
@@ -680,7 +688,8 @@ void ARMBaseRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
(void)Done;
}
-bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+bool ARMBaseRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
+ Register BaseReg,
int64_t Offset) const {
const MCInstrDesc &Desc = MI->getDesc();
unsigned AddrMode = (Desc.TSFlags & ARMII::AddrModeMask);
@@ -759,7 +768,7 @@ ARMBaseRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
assert(!AFI->isThumb1OnlyFunction() &&
"This eliminateFrameIndex does not support Thumb1!");
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- unsigned FrameReg;
+ Register FrameReg;
int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
index 477f3ad0a9a7..0a0907af2141 100644
--- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.h
@@ -134,7 +134,9 @@ public:
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const override;
+ MCRegister PhysReg) const override;
+ bool isInlineAsmReadOnlyReg(const MachineFunction &MF,
+ unsigned PhysReg) const override;
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
@@ -149,14 +151,12 @@ public:
unsigned getRegPressureLimit(const TargetRegisterClass *RC,
MachineFunction &MF) const override;
- bool getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
+ const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
- void updateRegAllocHint(unsigned Reg, unsigned NewReg,
+ void updateRegAllocHint(Register Reg, Register NewReg,
MachineFunction &MF) const override;
bool hasBasePointer(const MachineFunction &MF) const;
@@ -165,35 +165,32 @@ public:
int64_t getFrameIndexInstrOffset(const MachineInstr *MI,
int Idx) const override;
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
+ int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
bool cannotEliminateFrame(const MachineFunction &MF) const;
// Debug information queries.
Register getFrameRegister(const MachineFunction &MF) const override;
- unsigned getBaseRegister() const { return BasePtr; }
-
+ Register getBaseRegister() const { return BasePtr; }
/// emitLoadConstPool - Emits a load from constpool to materialize the
/// specified immediate.
virtual void
emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx,
int Val, ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
+ Register PredReg = Register(),
unsigned MIFlags = MachineInstr::NoFlags) const;
/// Code Generation virtual methods...
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp b/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp
index 00a2231f59e3..6d389cc82730 100644
--- a/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMBasicBlockInfo.cpp
@@ -49,7 +49,7 @@ void ARMBasicBlockUtils::computeBlockSize(MachineBasicBlock *MBB) {
BasicBlockInfo &BBI = BBInfo[MBB->getNumber()];
BBI.Size = 0;
BBI.Unalign = 0;
- BBI.PostAlign = Align::None();
+ BBI.PostAlign = Align(1);
for (MachineInstr &I : *MBB) {
BBI.Size += TII->getInstSizeInBytes(I);
diff --git a/llvm/lib/Target/ARM/ARMBasicBlockInfo.h b/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
index 13df399ed995..47d9a4049fa0 100644
--- a/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
+++ b/llvm/lib/Target/ARM/ARMBasicBlockInfo.h
@@ -87,10 +87,10 @@ struct BasicBlockInfo {
/// Compute the offset immediately following this block. If Align is
/// specified, return the offset the successor block will get if it has
/// this alignment.
- unsigned postOffset(Align Alignment = Align::None()) const {
+ unsigned postOffset(Align Alignment = Align(1)) const {
unsigned PO = Offset + Size;
const Align PA = std::max(PostAlign, Alignment);
- if (PA == Align::None())
+ if (PA == Align(1))
return PO;
// Add alignment padding from the terminator.
return PO + UnknownPadding(PA, internalKnownBits());
@@ -101,7 +101,7 @@ struct BasicBlockInfo {
/// instruction alignment. An aligned terminator may increase the number
  /// of known bits.
/// If LogAlign is given, also consider the alignment of the next block.
- unsigned postKnownBits(Align Align = Align::None()) const {
+ unsigned postKnownBits(Align Align = llvm::Align(1)) const {
return std::max(Log2(std::max(PostAlign, Align)), internalKnownBits());
}
};
diff --git a/llvm/lib/Target/ARM/ARMCallLowering.cpp b/llvm/lib/Target/ARM/ARMCallLowering.cpp
index ce260a9ba145..d860473011e7 100644
--- a/llvm/lib/Target/ARM/ARMCallLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMCallLowering.cpp
@@ -99,17 +99,14 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
LLT p0 = LLT::pointer(0, 32);
LLT s32 = LLT::scalar(32);
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, Register(ARM::SP));
+ auto SPReg = MIRBuilder.buildCopy(p0, Register(ARM::SP));
- Register OffsetReg = MRI.createGenericVirtualRegister(s32);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(s32, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -133,7 +130,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
Register ExtReg = extendRegister(ValVReg, VA);
auto MMO = MIRBuilder.getMF().getMachineMemOperand(
MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 1);
+ Align(1));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -143,7 +140,10 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
CCValAssign VA = VAs[0];
assert(VA.needsCustom() && "Value doesn't need custom handling");
- assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ // Custom lowering for other types, such as f16, is currently not supported
+ if (VA.getValVT() != MVT::f64)
+ return 0;
CCValAssign NextVA = VAs[1];
assert(NextVA.needsCustom() && "Value doesn't need custom handling");
@@ -203,7 +203,7 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
// Even if there is no splitting to do, we still want to replace the
// original type (e.g. pointer type -> integer).
auto Flags = OrigArg.Flags[0];
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(OrigArg.Ty)));
+ Flags.setOrigAlign(DL.getABITypeAlign(OrigArg.Ty));
SplitArgs.emplace_back(OrigArg.Regs[0], SplitVTs[0].getTypeForEVT(Ctx),
Flags, OrigArg.IsFixed);
return;
@@ -215,7 +215,7 @@ void ARMCallLowering::splitToValueTypes(const ArgInfo &OrigArg,
Type *SplitTy = SplitVT.getTypeForEVT(Ctx);
auto Flags = OrigArg.Flags[0];
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(SplitTy)));
+ Flags.setOrigAlign(DL.getABITypeAlign(SplitTy));
bool NeedsConsecutiveRegisters =
TLI.functionArgumentNeedsConsecutiveRegisters(
@@ -299,11 +299,8 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg =
- MRI.createGenericVirtualRegister(LLT::pointer(MPO.getAddrSpace(), 32));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
-
- return AddrReg;
+ return MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI)
+ .getReg(0);
}
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
@@ -318,20 +315,21 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
Size = 4;
assert(MRI.getType(ValVReg).isScalar() && "Only scalars supported atm");
- auto LoadVReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- buildLoad(LoadVReg, Addr, Size, /* Alignment */ 1, MPO);
+ auto LoadVReg = buildLoad(LLT::scalar(32), Addr, Size, MPO);
MIRBuilder.buildTrunc(ValVReg, LoadVReg);
} else {
// If the value is not extended, a simple load will suffice.
- buildLoad(ValVReg, Addr, Size, /* Alignment */ 1, MPO);
+ buildLoad(ValVReg, Addr, Size, MPO);
}
}
- void buildLoad(Register Val, Register Addr, uint64_t Size, unsigned Alignment,
- MachinePointerInfo &MPO) {
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad, Size, Alignment);
- MIRBuilder.buildLoad(Val, Addr, *MMO);
+ MachineInstrBuilder buildLoad(const DstOp &Res, Register Addr, uint64_t Size,
+ MachinePointerInfo &MPO) {
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size,
+ inferAlignFromPtrInfo(MF, MPO));
+ return MIRBuilder.buildLoad(Res, Addr, *MMO);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -354,9 +352,7 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
// We cannot create a truncating copy, nor a trunc of a physical register.
// Therefore, we need to copy the content of the physical register into a
// virtual one and then truncate that.
- auto PhysRegToVReg =
- MRI.createGenericVirtualRegister(LLT::scalar(LocSize));
- MIRBuilder.buildCopy(PhysRegToVReg, PhysReg);
+ auto PhysRegToVReg = MIRBuilder.buildCopy(LLT::scalar(LocSize), PhysReg);
MIRBuilder.buildTrunc(ValVReg, PhysRegToVReg);
}
}
@@ -367,7 +363,10 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
CCValAssign VA = VAs[0];
assert(VA.needsCustom() && "Value doesn't need custom handling");
- assert(VA.getValVT() == MVT::f64 && "Unsupported type");
+
+ // Custom lowering for other types, such as f16, is currently not supported
+ if (VA.getValVT() != MVT::f64)
+ return 0;
CCValAssign NextVA = VAs[1];
assert(NextVA.needsCustom() && "Value doesn't need custom handling");
@@ -436,7 +435,7 @@ bool ARMCallLowering::lowerFormalArguments(
for (auto &Arg : F.args()) {
if (!isSupportedType(DL, TLI, Arg.getType()))
return false;
- if (Arg.hasByValOrInAllocaAttr())
+ if (Arg.hasPassPointeeByValueAttr())
return false;
}
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.cpp b/llvm/lib/Target/ARM/ARMCallingConv.cpp
index a47c59512592..67c822a5b6ef 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.cpp
+++ b/llvm/lib/Target/ARM/ARMCallingConv.cpp
@@ -32,9 +32,8 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
// Put the whole thing on the stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8, 4),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(8, Align(4)), LocVT, LocInfo));
return true;
}
@@ -42,9 +41,8 @@ static bool f64AssignAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
if (unsigned Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(4, 4),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(4, Align(4)), LocVT, LocInfo));
return true;
}
@@ -81,9 +79,8 @@ static bool f64AssignAAPCS(unsigned ValNo, MVT ValVT, MVT LocVT,
return false;
// Put the whole thing on the stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8, 8),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(8, Align(8)), LocVT, LocInfo));
return true;
}
@@ -184,8 +181,8 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// aggregate. Store the type's required alignment as extra info for later: in
// the [N x i64] case all trace has been removed by the time we actually get
// to do allocation.
- PendingMembers.push_back(CCValAssign::getPending(ValNo, ValVT, LocVT, LocInfo,
- ArgFlags.getOrigAlign()));
+ PendingMembers.push_back(CCValAssign::getPending(
+ ValNo, ValVT, LocVT, LocInfo, ArgFlags.getNonZeroOrigAlign().value()));
if (!ArgFlags.isInConsecutiveRegsLast())
return true;
@@ -193,8 +190,9 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// Try to allocate a contiguous block of registers, each of the correct
// size to hold one member.
auto &DL = State.getMachineFunction().getDataLayout();
- unsigned StackAlign = DL.getStackAlignment().value();
- unsigned Align = std::min(PendingMembers[0].getExtraInfo(), StackAlign);
+ const Align StackAlign = DL.getStackAlignment();
+ const Align FirstMemberAlign(PendingMembers[0].getExtraInfo());
+ Align Alignment = std::min(FirstMemberAlign, StackAlign);
ArrayRef<MCPhysReg> RegList;
switch (LocVT.SimpleTy) {
@@ -204,21 +202,24 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// First consume all registers that would give an unaligned object. Whether
// we go on stack or in regs, no-one will be using them in future.
- unsigned RegAlign = alignTo(Align, 4) / 4;
+ unsigned RegAlign = alignTo(Alignment.value(), 4) / 4;
while (RegIdx % RegAlign != 0 && RegIdx < RegList.size())
State.AllocateReg(RegList[RegIdx++]);
break;
}
case MVT::f16:
+ case MVT::bf16:
case MVT::f32:
RegList = SRegList;
break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::f64:
RegList = DRegList;
break;
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v2f64:
RegList = QRegList;
break;
@@ -247,7 +248,7 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
unsigned RegIdx = State.getFirstUnallocated(RegList);
for (auto &It : PendingMembers) {
if (RegIdx >= RegList.size())
- It.convertToMem(State.AllocateStack(Size, Size));
+ It.convertToMem(State.AllocateStack(Size, Align(Size)));
else
It.convertToReg(State.AllocateReg(RegList[RegIdx++]));
@@ -265,12 +266,12 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
// After the first item has been allocated, the rest are packed as tightly as
// possible. (E.g. an incoming i64 would have starting Align of 8, but we'll
// be allocating a bunch of i32 slots).
- unsigned RestAlign = std::min(Align, Size);
+ const Align RestAlign = std::min(Alignment, Align(Size));
for (auto &It : PendingMembers) {
- It.convertToMem(State.AllocateStack(Size, Align));
+ It.convertToMem(State.AllocateStack(Size, Alignment));
State.addLoc(It);
- Align = RestAlign;
+ Alignment = RestAlign;
}
// All pending members have now been allocated
@@ -280,5 +281,33 @@ static bool CC_ARM_AAPCS_Custom_Aggregate(unsigned ValNo, MVT ValVT,
return true;
}
+static bool CustomAssignInRegList(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo, CCState &State,
+ ArrayRef<MCPhysReg> RegList) {
+ unsigned Reg = State.AllocateReg(RegList);
+ if (Reg) {
+ State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
+ return true;
+ }
+ return false;
+}
+
+static bool CC_ARM_AAPCS_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ // f16 arguments are extended to i32 and assigned to a register in [r0, r3]
+ return CustomAssignInRegList(ValNo, ValVT, MVT::i32, LocInfo, State,
+ RRegList);
+}
+
+static bool CC_ARM_AAPCS_VFP_Custom_f16(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags,
+ CCState &State) {
+ // f16 arguments are extended to f32 and assigned to a register in [s0, s15]
+ return CustomAssignInRegList(ValNo, ValVT, MVT::f32, LocInfo, State,
+ SRegList);
+}
+
// Include the table generated calling convention implementations.
#include "ARMGenCallingConv.inc"
diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td
index 5df5b56f5afa..3517274e4c5c 100644
--- a/llvm/lib/Target/ARM/ARMCallingConv.td
+++ b/llvm/lib/Target/ARM/ARMCallingConv.td
@@ -10,7 +10,7 @@
/// CCIfAlign - Match of the original alignment of the arg
class CCIfAlign<string Align, CCAction A>:
- CCIf<!strconcat("ArgFlags.getOrigAlign() == ", Align), A>;
+ CCIf<!strconcat("ArgFlags.getNonZeroOrigAlign() == ", Align), A>;
//===----------------------------------------------------------------------===//
// ARM APCS Calling Convention
@@ -30,8 +30,8 @@ def CC_ARM_APCS : CallingConv<[
CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// f64 and v2f64 are passed in adjacent GPRs, possibly split onto the stack
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_APCS_Custom_f64">>,
@@ -56,8 +56,8 @@ def RetCC_ARM_APCS : CallingConv<[
CCIfSwiftError<CCIfType<[i32], CCAssignToReg<[R8]>>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_APCS_Custom_f64">>,
@@ -71,8 +71,8 @@ def RetCC_ARM_APCS : CallingConv<[
let Entry = 1 in
def FastCC_ARM_APCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -91,8 +91,8 @@ def FastCC_ARM_APCS : CallingConv<[
let Entry = 1 in
def RetFastCC_ARM_APCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
@@ -108,8 +108,8 @@ def RetFastCC_ARM_APCS : CallingConv<[
let Entry = 1 in
def CC_ARM_APCS_GHC : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
CCIfType<[v2f64], CCAssignToReg<[Q4, Q5]>>,
CCIfType<[f64], CCAssignToReg<[D8, D9, D10, D11]>>,
@@ -134,12 +134,12 @@ def CC_ARM_AAPCS_Common : CallingConv<[
// i64 is 8-aligned i32 here, so we may need to eat R1 as a pad register
// (and the same is true for f64 if VFP is not enabled)
CCIfType<[i32], CCIfAlign<"8", CCAssignToRegWithShadow<[R0, R2], [R0, R1]>>>,
- CCIfType<[i32], CCIf<"ArgFlags.getOrigAlign() != 8",
+ CCIfType<[i32], CCIf<"ArgFlags.getNonZeroOrigAlign() != Align(8)",
CCAssignToReg<[R0, R1, R2, R3]>>>,
CCIfType<[i32], CCIfAlign<"8", CCAssignToStackWithShadow<4, 8, [R0, R1, R2, R3]>>>,
CCIfType<[i32], CCAssignToStackWithShadow<4, 4, [R0, R1, R2, R3]>>,
- CCIfType<[f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
+ CCIfType<[f16, bf16, f32], CCAssignToStackWithShadow<4, 4, [Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToStackWithShadow<8, 8, [Q0, Q1, Q2, Q3]>>,
CCIfType<[v2f64], CCIfAlign<"16",
CCAssignToStackWithShadow<16, 16, [Q0, Q1, Q2, Q3]>>>,
@@ -165,8 +165,8 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfNest<CCAssignToReg<[R12]>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -176,14 +176,15 @@ def CC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"CC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
let Entry = 1 in
def RetCC_ARM_AAPCS : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -193,6 +194,7 @@ def RetCC_ARM_AAPCS : CallingConv<[
CCIfType<[f64, v2f64], CCCustom<"RetCC_ARM_AAPCS_Custom_f64">>,
CCIfType<[f32], CCBitConvertToType<i32>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_Custom_f16">>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
@@ -208,8 +210,8 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfByVal<CCPassByVal<4, 4>>,
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -224,14 +226,15 @@ def CC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
S9, S10, S11, S12, S13, S14, S15]>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>,
CCDelegateTo<CC_ARM_AAPCS_Common>
]>;
let Entry = 1 in
def RetCC_ARM_AAPCS_VFP : CallingConv<[
// Handle all vector types as either f64 or v2f64.
- CCIfType<[v1i64, v2i32, v4i16, v4f16, v8i8, v2f32], CCBitConvertToType<f64>>,
- CCIfType<[v2i64, v4i32, v8i16, v8f16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
+ CCIfType<[v1i64, v2i32, v4i16, v4f16, v4bf16, v8i8, v2f32], CCBitConvertToType<f64>>,
+ CCIfType<[v2i64, v4i32, v8i16, v8f16, v8bf16, v16i8, v4f32], CCBitConvertToType<v2f64>>,
// Pass SwiftSelf in a callee saved register.
CCIfSwiftSelf<CCIfType<[i32], CCAssignToReg<[R10]>>>,
@@ -242,7 +245,8 @@ def RetCC_ARM_AAPCS_VFP : CallingConv<[
CCIfType<[v2f64], CCAssignToReg<[Q0, Q1, Q2, Q3]>>,
CCIfType<[f64], CCAssignToReg<[D0, D1, D2, D3, D4, D5, D6, D7]>>,
CCIfType<[f32], CCAssignToReg<[S0, S1, S2, S3, S4, S5, S6, S7, S8,
- S9, S10, S11, S12, S13, S14, S15]>>,
+ S9, S10, S11, S12, S13, S14, S15]>>,
+ CCIfType<[f16, bf16], CCCustom<"CC_ARM_AAPCS_VFP_Custom_f16">>,
CCDelegateTo<RetCC_ARM_AAPCS_Common>
]>;
diff --git a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
index 634fb89b8e89..195d0a89291b 100644
--- a/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantIslandPass.cpp
@@ -206,10 +206,6 @@ namespace {
/// T2JumpTables - Keep track of all the Thumb2 jumptable instructions.
SmallVector<MachineInstr*, 4> T2JumpTables;
- /// HasFarJump - True if any far jump instruction has been emitted during
- /// the branch fix up pass.
- bool HasFarJump;
-
MachineFunction *MF;
MachineConstantPool *MCP;
const ARMBaseInstrInfo *TII;
@@ -270,7 +266,6 @@ namespace {
bool fixupImmediateBr(ImmBranch &Br);
bool fixupConditionalBr(ImmBranch &Br);
bool fixupUnconditionalBr(ImmBranch &Br);
- bool undoLRSpillRestore();
bool optimizeThumb2Instructions();
bool optimizeThumb2Branches();
bool reorderThumb2JumpTables();
@@ -330,8 +325,8 @@ void ARMConstantIslands::verify() {
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
/// print block size and offset information - debugging
LLVM_DUMP_METHOD void ARMConstantIslands::dumpBBs() {
- BBInfoVector &BBInfo = BBUtils->getBBInfo();
LLVM_DEBUG({
+ BBInfoVector &BBInfo = BBUtils->getBBInfo();
for (unsigned J = 0, E = BBInfo.size(); J !=E; ++J) {
const BasicBlockInfo &BBI = BBInfo[J];
dbgs() << format("%08x %bb.%u\t", BBI.Offset, J)
@@ -350,7 +345,7 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
LLVM_DEBUG(dbgs() << "***** ARMConstantIslands: "
<< MCP->getConstants().size() << " CP entries, aligned to "
- << MCP->getConstantPoolAlignment() << " bytes *****\n");
+ << MCP->getConstantPoolAlign().value() << " bytes *****\n");
STI = &static_cast<const ARMSubtarget &>(MF->getSubtarget());
TII = STI->getInstrInfo();
@@ -363,7 +358,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
isThumb1 = AFI->isThumb1OnlyFunction();
isThumb2 = AFI->isThumb2Function();
- HasFarJump = false;
bool GenerateTBB = isThumb2 || (isThumb1 && SynthesizeThumb1TBB);
// Renumber all of the machine basic blocks in the function, guaranteeing that
@@ -456,11 +450,6 @@ bool ARMConstantIslands::runOnMachineFunction(MachineFunction &mf) {
// After a while, this might be made debug-only, but it is not expensive.
verify();
- // If LR has been forced spilled and no far jump (i.e. BL) has been issued,
- // undo the spill / restore of LR if possible.
- if (isThumb && !HasFarJump && AFI->isLRSpilledForFarJump())
- MadeChange |= undoLRSpillRestore();
-
// Save the mapping between original and cloned constpool entries.
for (unsigned i = 0, e = CPEntries.size(); i != e; ++i) {
for (unsigned j = 0, je = CPEntries[i].size(); j != je; ++j) {
@@ -494,7 +483,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
MF->push_back(BB);
// MachineConstantPool measures alignment in bytes.
- const Align MaxAlign(MCP->getConstantPoolAlignment());
+ const Align MaxAlign = MCP->getConstantPoolAlign();
const unsigned MaxLogAlign = Log2(MaxAlign);
// Mark the basic block as required by the const-pool.
@@ -518,14 +507,13 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
const DataLayout &TD = MF->getDataLayout();
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
- unsigned Align = CPs[i].getAlignment();
- assert(isPowerOf2_32(Align) && "Invalid alignment");
+ Align Alignment = CPs[i].getAlign();
// Verify that all constant pool entries are a multiple of their alignment.
// If not, we would have to pad them out so that instructions stay aligned.
- assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+ assert(isAligned(Alignment, Size) && "CP Entry not multiple of 4 bytes!");
// Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
- unsigned LogAlign = Log2_32(Align);
+ unsigned LogAlign = Log2(Alignment);
MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
MachineInstr *CPEMI =
BuildMI(*BB, InsAt, DebugLoc(), TII->get(ARM::CONSTPOOL_ENTRY))
@@ -542,7 +530,7 @@ ARMConstantIslands::doInitialConstPlacement(std::vector<MachineInstr*> &CPEMIs)
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align << '\n');
+ << Size << ", align = " << Alignment.value() << '\n');
}
LLVM_DEBUG(BB->dump());
}
@@ -668,7 +656,7 @@ Align ARMConstantIslands::getCPEAlign(const MachineInstr *CPEMI) {
unsigned CPI = getCombinedIndex(CPEMI);
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
- return Align(MCP->getConstants()[CPI].getAlignment());
+ return MCP->getConstants()[CPI].getAlign();
}
/// scanFunctionJumpTables - Do a scan of the function, building up
@@ -1364,8 +1352,8 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// displacement.
MachineBasicBlock::iterator I = UserMI;
++I;
- for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI),
- PredReg = 0;
+ Register PredReg;
+ for (unsigned Offset = UserOffset + TII->getInstSizeInBytes(*UserMI);
I->getOpcode() != ARM::t2IT &&
getITInstrPredicate(*I, PredReg) != ARMCC::AL;
Offset += TII->getInstSizeInBytes(*I), I = std::next(I)) {
@@ -1410,7 +1398,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// Avoid splitting an IT block.
if (LastIT) {
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC != ARMCC::AL)
MI = LastIT;
@@ -1434,7 +1422,7 @@ void ARMConstantIslands::createNewWater(unsigned CPUserIndex,
// We really must not split an IT block.
#ifndef NDEBUG
- unsigned PredReg;
+ Register PredReg;
assert(!isThumb || getITInstrPredicate(*MI, PredReg) == ARMCC::AL);
#endif
NewMBB = splitBlockBeforeInstr(&*MI);
@@ -1566,7 +1554,7 @@ void ARMConstantIslands::removeDeadCPEMI(MachineInstr *CPEMI) {
BBInfo[CPEBB->getNumber()].Size = 0;
// This block no longer needs to be aligned.
- CPEBB->setAlignment(Align::None());
+ CPEBB->setAlignment(Align(1));
} else {
// Entries are sorted by descending alignment, so realign from the front.
CPEBB->setAlignment(getCPEAlign(&*CPEBB->begin()));
@@ -1633,7 +1621,6 @@ ARMConstantIslands::fixupUnconditionalBr(ImmBranch &Br) {
BBInfoVector &BBInfo = BBUtils->getBBInfo();
BBInfo[MBB->getNumber()].Size += 2;
BBUtils->adjustBBOffsetsAfter(MBB);
- HasFarJump = true;
++NumUBrFixed;
LLVM_DEBUG(dbgs() << " Changed B to long jump " << *MI);
@@ -1735,34 +1722,6 @@ ARMConstantIslands::fixupConditionalBr(ImmBranch &Br) {
return true;
}
-/// undoLRSpillRestore - Remove Thumb push / pop instructions that only spills
-/// LR / restores LR to pc. FIXME: This is done here because it's only possible
-/// to do this if tBfar is not used.
-bool ARMConstantIslands::undoLRSpillRestore() {
- bool MadeChange = false;
- for (unsigned i = 0, e = PushPopMIs.size(); i != e; ++i) {
- MachineInstr *MI = PushPopMIs[i];
- // First two operands are predicates.
- if (MI->getOpcode() == ARM::tPOP_RET &&
- MI->getOperand(2).getReg() == ARM::PC &&
- MI->getNumExplicitOperands() == 3) {
- // Create the new insn and copy the predicate from the old.
- BuildMI(MI->getParent(), MI->getDebugLoc(), TII->get(ARM::tBX_RET))
- .add(MI->getOperand(0))
- .add(MI->getOperand(1));
- MI->eraseFromParent();
- MadeChange = true;
- } else if (MI->getOpcode() == ARM::tPUSH &&
- MI->getOperand(2).getReg() == ARM::LR &&
- MI->getNumExplicitOperands() == 3) {
- // Just remove the push.
- MI->eraseFromParent();
- MadeChange = true;
- }
- }
- return MadeChange;
-}
-
bool ARMConstantIslands::optimizeThumb2Instructions() {
bool MadeChange = false;
@@ -1868,7 +1827,7 @@ bool ARMConstantIslands::optimizeThumb2Branches() {
if (!Br.MI->killsRegister(ARM::CPSR))
return false;
- unsigned PredReg = 0;
+ Register PredReg;
unsigned NewOpc = 0;
ARMCC::CondCodes Pred = getInstrPredicate(*Br.MI, PredReg);
if (Pred == ARMCC::EQ)
@@ -2402,6 +2361,7 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
SmallVector<MachineOperand, 4> CondPrior;
MachineFunction::iterator BBi = BB->getIterator();
MachineFunction::iterator OldPrior = std::prev(BBi);
+ MachineFunction::iterator OldNext = std::next(BBi);
// If the block terminator isn't analyzable, don't try to move the block
bool B = TII->analyzeBranch(*BB, TBB, FBB, Cond);
@@ -2412,8 +2372,8 @@ adjustJTTargetBlockForward(MachineBasicBlock *BB, MachineBasicBlock *JTBB) {
if (!B && Cond.empty() && BB != &MF->front() &&
!TII->analyzeBranch(*OldPrior, TBB, FBB, CondPrior)) {
BB->moveAfter(JTBB);
- OldPrior->updateTerminator();
- BB->updateTerminator();
+ OldPrior->updateTerminator(BB);
+ BB->updateTerminator(OldNext != MF->end() ? &*OldNext : nullptr);
// Update numbering to account for the block being moved.
MF->RenumberBlocks();
++NumJTMoved;
diff --git a/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp b/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
index 72c95f441265..c1df7ef43cad 100644
--- a/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
+++ b/llvm/lib/Target/ARM/ARMConstantPoolValue.cpp
@@ -73,7 +73,7 @@ StringRef ARMConstantPoolValue::getModifierText() const {
}
int ARMConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
llvm_unreachable("Shouldn't be calling this directly!");
}
@@ -189,7 +189,7 @@ const BlockAddress *ARMConstantPoolConstant::getBlockAddress() const {
}
int ARMConstantPoolConstant::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
int index =
getExistingMachineCPValueImpl<ARMConstantPoolConstant>(CP, Alignment);
if (index != -1) {
@@ -228,7 +228,7 @@ ARMConstantPoolSymbol::ARMConstantPoolSymbol(LLVMContext &C, StringRef s,
bool AddCurrentAddress)
: ARMConstantPoolValue(C, id, ARMCP::CPExtSymbol, PCAdj, Modifier,
AddCurrentAddress),
- S(s) {}
+ S(std::string(s)) {}
ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C,
StringRef s, unsigned ID,
@@ -237,7 +237,7 @@ ARMConstantPoolSymbol *ARMConstantPoolSymbol::Create(LLVMContext &C,
}
int ARMConstantPoolSymbol::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
return getExistingMachineCPValueImpl<ARMConstantPoolSymbol>(CP, Alignment);
}
@@ -277,7 +277,7 @@ ARMConstantPoolMBB *ARMConstantPoolMBB::Create(LLVMContext &C,
}
int ARMConstantPoolMBB::getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) {
+ Align Alignment) {
return getExistingMachineCPValueImpl<ARMConstantPoolMBB>(CP, Alignment);
}
diff --git a/llvm/lib/Target/ARM/ARMConstantPoolValue.h b/llvm/lib/Target/ARM/ARMConstantPoolValue.h
index 660b7fc88d82..261070a74ba3 100644
--- a/llvm/lib/Target/ARM/ARMConstantPoolValue.h
+++ b/llvm/lib/Target/ARM/ARMConstantPoolValue.h
@@ -76,13 +76,11 @@ protected:
bool AddCurrentAddress);
template <typename Derived>
- int getExistingMachineCPValueImpl(MachineConstantPool *CP,
- unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
+ int getExistingMachineCPValueImpl(MachineConstantPool *CP, Align Alignment) {
const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
for (unsigned i = 0, e = Constants.size(); i != e; ++i) {
if (Constants[i].isMachineConstantPoolEntry() &&
- (Constants[i].getAlignment() & AlignMask) == 0) {
+ Constants[i].getAlign() >= Alignment) {
auto *CPV =
static_cast<ARMConstantPoolValue*>(Constants[i].Val.MachineCPVal);
if (Derived *APC = dyn_cast<Derived>(CPV))
@@ -114,7 +112,7 @@ public:
bool isPromotedGlobal() const{ return Kind == ARMCP::CPPromotedGlobal; }
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
@@ -187,7 +185,7 @@ public:
}
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
/// hasSameValue - Return true if this ARM constpool value can share the same
/// constantpool entry as another ARM constpool value.
@@ -223,7 +221,7 @@ public:
StringRef getSymbol() const { return S; }
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
@@ -259,7 +257,7 @@ public:
const MachineBasicBlock *getMBB() const { return MBB; }
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
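The ARMConstantPoolValue change above swaps the old mask test, (getAlignment() & (Alignment - 1)) == 0, for a direct comparison, getAlign() >= Alignment. For power-of-two alignments the two accept exactly the same entries; the throwaway check below, illustrative only and not part of the patch, demonstrates the equivalence over a range of values.
// Illustrative sketch only, not part of this patch.
#include <cassert>
#include <cstdint>
int main() {
  for (unsigned i = 0; i < 16; ++i) {
    for (unsigned j = 0; j < 16; ++j) {
      uint64_t Existing = 1ULL << i;   // alignment of an existing CP entry
      uint64_t Requested = 1ULL << j;  // alignment requested by the lookup
      bool OldTest = (Existing & (Requested - 1)) == 0; // pre-patch mask test
      bool NewTest = Existing >= Requested;             // post-patch comparison
      assert(OldTest == NewTest);
    }
  }
  return 0;
}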
diff --git a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
index 2c3ac816219f..48622aae3cb4 100644
--- a/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/ARM/ARMExpandPseudoInsts.cpp
@@ -71,6 +71,38 @@ namespace {
unsigned Opc, bool IsExt);
void ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI);
+ void CMSEClearGPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ const SmallVectorImpl<unsigned> &ClearRegs,
+ unsigned ClobberReg);
+ MachineBasicBlock &CMSEClearFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI);
+ MachineBasicBlock &CMSEClearFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs);
+ MachineBasicBlock &CMSEClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs);
+ void CMSESaveClearFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs,
+ SmallVectorImpl<unsigned> &AvailableRegs);
+ void CMSESaveClearFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs,
+ SmallVectorImpl<unsigned> &ScratchRegs);
+ void CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs);
+ void CMSERestoreFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs);
+ void CMSERestoreFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs);
+ void CMSERestoreFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs);
bool ExpandCMP_SWAP(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI, unsigned LdrexOp,
unsigned StrexOp, unsigned UxtOp,
@@ -417,8 +449,7 @@ static const NEONLdStTableEntry *LookupNEONLdSt(unsigned Opcode) {
// Make sure the table is sorted.
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(NEONLdStTable), std::end(NEONLdStTable)) &&
- "NEONLdStTable is not sorted!");
+ assert(llvm::is_sorted(NEONLdStTable) && "NEONLdStTable is not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
#endif
@@ -827,7 +858,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI) {
MachineInstr &MI = *MBBI;
unsigned Opcode = MI.getOpcode();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
@@ -852,10 +883,13 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned ImmVal = (unsigned)MO.getImm();
unsigned SOImmValV1 = ARM_AM::getSOImmTwoPartFirst(ImmVal);
unsigned SOImmValV2 = ARM_AM::getSOImmTwoPartSecond(ImmVal);
+ unsigned MIFlags = MI.getFlags();
LO16 = LO16.addImm(SOImmValV1);
HI16 = HI16.addImm(SOImmValV2);
LO16.cloneMemRefs(MI);
HI16.cloneMemRefs(MI);
+ LO16.setMIFlags(MIFlags);
+ HI16.setMIFlags(MIFlags);
LO16.addImm(Pred).addReg(PredReg).add(condCodeOp());
HI16.addImm(Pred).addReg(PredReg).add(condCodeOp());
if (isCC)
@@ -867,6 +901,7 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
unsigned LO16Opc = 0;
unsigned HI16Opc = 0;
+ unsigned MIFlags = MI.getFlags();
if (Opcode == ARM::t2MOVi32imm || Opcode == ARM::t2MOVCCi32imm) {
LO16Opc = ARM::t2MOVi16;
HI16Opc = ARM::t2MOVTi16;
@@ -880,6 +915,9 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
.addReg(DstReg, RegState::Define | getDeadRegState(DstIsDead))
.addReg(DstReg);
+ LO16.setMIFlags(MIFlags);
+ HI16.setMIFlags(MIFlags);
+
switch (MO.getType()) {
case MachineOperand::MO_Immediate: {
unsigned Imm = MO.getImm();
@@ -921,6 +959,582 @@ void ARMExpandPseudo::ExpandMOV32BitImm(MachineBasicBlock &MBB,
LLVM_DEBUG(dbgs() << "And: "; HI16.getInstr()->dump(););
}
+// The size of the area accessed by a VLSTM/VLLDM:
+// S0-S31 + FPSCR + 8 more bytes (VPR + pad, or just pad)
+static const int CMSE_FP_SAVE_SIZE = 136;
+
+static void determineGPRegsToClear(const MachineInstr &MI,
+ const std::initializer_list<unsigned> &Regs,
+ SmallVectorImpl<unsigned> &ClearRegs) {
+ SmallVector<unsigned, 4> OpRegs;
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg() || !Op.isUse())
+ continue;
+ OpRegs.push_back(Op.getReg());
+ }
+ llvm::sort(OpRegs);
+
+ std::set_difference(Regs.begin(), Regs.end(), OpRegs.begin(), OpRegs.end(),
+ std::back_inserter(ClearRegs));
+}
+
+void ARMExpandPseudo::CMSEClearGPRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, const SmallVectorImpl<unsigned> &ClearRegs,
+ unsigned ClobberReg) {
+
+ if (STI->hasV8_1MMainlineOps()) {
+ // Clear the registers using the CLRM instruction.
+ MachineInstrBuilder CLRM =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2CLRM)).add(predOps(ARMCC::AL));
+ for (unsigned R : ClearRegs)
+ CLRM.addReg(R, RegState::Define);
+ CLRM.addReg(ARM::APSR, RegState::Define);
+ CLRM.addReg(ARM::CPSR, RegState::Define | RegState::Implicit);
+ } else {
+ // Clear the registers and flags by copying ClobberReg into them.
+ // (Baseline can't do a high register clear in one instruction).
+ for (unsigned Reg : ClearRegs) {
+ if (Reg == ClobberReg)
+ continue;
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVr), Reg)
+ .addReg(ClobberReg)
+ .add(predOps(ARMCC::AL));
+ }
+
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2MSR_M))
+ .addImm(STI->hasDSP() ? 0xc00 : 0x800)
+ .addReg(ClobberReg)
+ .add(predOps(ARMCC::AL));
+ }
+}
+
+// Find which FP registers need to be cleared. The parameter `ClearRegs` is
+// initialised with all elements set to true, and this function resets the
+// bits that correspond to register uses. Returns true if any floating point
+// register is defined, false otherwise.
+static bool determineFPRegsToClear(const MachineInstr &MI,
+ BitVector &ClearRegs) {
+ bool DefFP = false;
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+
+ unsigned Reg = Op.getReg();
+ if (Op.isDef()) {
+ if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) ||
+ (Reg >= ARM::D0 && Reg <= ARM::D15) ||
+ (Reg >= ARM::S0 && Reg <= ARM::S31))
+ DefFP = true;
+ continue;
+ }
+
+ if (Reg >= ARM::Q0 && Reg <= ARM::Q7) {
+ int R = Reg - ARM::Q0;
+ ClearRegs.reset(R * 4, (R + 1) * 4);
+ } else if (Reg >= ARM::D0 && Reg <= ARM::D15) {
+ int R = Reg - ARM::D0;
+ ClearRegs.reset(R * 2, (R + 1) * 2);
+ } else if (Reg >= ARM::S0 && Reg <= ARM::S31) {
+ ClearRegs[Reg - ARM::S0] = false;
+ }
+ }
+ return DefFP;
+}
+
+MachineBasicBlock &
+ARMExpandPseudo::CMSEClearFPRegs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) {
+ BitVector ClearRegs(16, true);
+ (void)determineFPRegsToClear(*MBBI, ClearRegs);
+
+ if (STI->hasV8_1MMainlineOps())
+ return CMSEClearFPRegsV81(MBB, MBBI, ClearRegs);
+ else
+ return CMSEClearFPRegsV8(MBB, MBBI, ClearRegs);
+}
+
+// Clear the FP registers for v8.0-M by copying the contents of LR over them.
+// Uses R12 as a scratch register.
+MachineBasicBlock &
+ARMExpandPseudo::CMSEClearFPRegsV8(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs) {
+ if (!STI->hasFPRegs())
+ return MBB;
+
+ auto &RetI = *MBBI;
+ const DebugLoc &DL = RetI.getDebugLoc();
+
+ // If optimising for minimum size, clear FP registers unconditionally.
+ // Otherwise, check the CONTROL.SFPA (Secure Floating-Point Active) bit and
+ // don't clear them if they belong to the non-secure state.
+ MachineBasicBlock *ClearBB, *DoneBB;
+ if (STI->hasMinSize()) {
+ ClearBB = DoneBB = &MBB;
+ } else {
+ MachineFunction *MF = MBB.getParent();
+ ClearBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ DoneBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ MF->insert(++MBB.getIterator(), ClearBB);
+ MF->insert(++ClearBB->getIterator(), DoneBB);
+
+ DoneBB->splice(DoneBB->end(), &MBB, MBBI, MBB.end());
+ DoneBB->transferSuccessors(&MBB);
+ MBB.addSuccessor(ClearBB);
+ MBB.addSuccessor(DoneBB);
+ ClearBB->addSuccessor(DoneBB);
+
+ // The registers used for the return value, as well as LR (which is used
+ // to clear registers), need to be live-in at the new basic blocks.
+ for (const MachineOperand &Op : RetI.operands()) {
+ if (!Op.isReg())
+ continue;
+ Register Reg = Op.getReg();
+ if (Reg == ARM::NoRegister || Reg == ARM::LR)
+ continue;
+ assert(Register::isPhysicalRegister(Reg) && "Unallocated register");
+ ClearBB->addLiveIn(Reg);
+ DoneBB->addLiveIn(Reg);
+ }
+ ClearBB->addLiveIn(ARM::LR);
+ DoneBB->addLiveIn(ARM::LR);
+
+ // Read the CONTROL register.
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2MRS_M), ARM::R12)
+ .addImm(20)
+ .add(predOps(ARMCC::AL));
+ // Check bit 3 (SFPA).
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::t2TSTri))
+ .addReg(ARM::R12)
+ .addImm(8)
+ .add(predOps(ARMCC::AL));
+ // If SFPA is clear, jump over ClearBB to DoneBB.
+ BuildMI(MBB, MBB.end(), DL, TII->get(ARM::tBcc))
+ .addMBB(DoneBB)
+ .addImm(ARMCC::EQ)
+ .addReg(ARM::CPSR, RegState::Kill);
+ }
+
+ // Emit the clearing sequence
+ for (unsigned D = 0; D < 8; D++) {
+ // Attempt to clear as double
+ if (ClearRegs[D * 2 + 0] && ClearRegs[D * 2 + 1]) {
+ unsigned Reg = ARM::D0 + D;
+ BuildMI(ClearBB, DL, TII->get(ARM::VMOVDRR), Reg)
+ .addReg(ARM::LR)
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ } else {
+ // Clear first part as single
+ if (ClearRegs[D * 2 + 0]) {
+ unsigned Reg = ARM::S0 + D * 2;
+ BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ }
+ // Clear second part as single
+ if (ClearRegs[D * 2 + 1]) {
+ unsigned Reg = ARM::S0 + D * 2 + 1;
+ BuildMI(ClearBB, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ }
+ }
+ }
+
+ // Clear FPSCR bits 0-4, 7, 28-31
+ // The other bits are program global according to the AAPCS
+ BuildMI(ClearBB, DL, TII->get(ARM::VMRS), ARM::R12)
+ .add(predOps(ARMCC::AL));
+ BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12)
+ .addReg(ARM::R12)
+ .addImm(0x0000009F)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(ClearBB, DL, TII->get(ARM::t2BICri), ARM::R12)
+ .addReg(ARM::R12)
+ .addImm(0xF0000000)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(ClearBB, DL, TII->get(ARM::VMSR))
+ .addReg(ARM::R12)
+ .add(predOps(ARMCC::AL));
+
+ return *DoneBB;
+}
+
+MachineBasicBlock &
+ARMExpandPseudo::CMSEClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const BitVector &ClearRegs) {
+ auto &RetI = *MBBI;
+
+ // Emit a sequence of VSCCLRM <sreglist> instructions, one instruction for
+ // each contiguous sequence of S-registers.
+ int Start = -1, End = -1;
+ for (int S = 0, E = ClearRegs.size(); S != E; ++S) {
+ if (ClearRegs[S] && S == End + 1) {
+ End = S; // extend range
+ continue;
+ }
+ // Emit current range.
+ if (Start < End) {
+ MachineInstrBuilder VSCCLRM =
+ BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS))
+ .add(predOps(ARMCC::AL));
+ while (++Start <= End)
+ VSCCLRM.addReg(ARM::S0 + Start, RegState::Define);
+ VSCCLRM.addReg(ARM::VPR, RegState::Define);
+ }
+ Start = End = S;
+ }
+ // Emit last range.
+ if (Start < End) {
+ MachineInstrBuilder VSCCLRM =
+ BuildMI(MBB, MBBI, RetI.getDebugLoc(), TII->get(ARM::VSCCLRMS))
+ .add(predOps(ARMCC::AL));
+ while (++Start <= End)
+ VSCCLRM.addReg(ARM::S0 + Start, RegState::Define);
+ VSCCLRM.addReg(ARM::VPR, RegState::Define);
+ }
+
+ return MBB;
+}
+
+void ARMExpandPseudo::CMSESaveClearFPRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) {
+ if (STI->hasV8_1MMainlineOps())
+ CMSESaveClearFPRegsV81(MBB, MBBI, DL, LiveRegs);
+ else
+ CMSESaveClearFPRegsV8(MBB, MBBI, DL, LiveRegs, ScratchRegs);
+}
+
+// Save and clear FP registers if present
+void ARMExpandPseudo::CMSESaveClearFPRegsV8(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ const LivePhysRegs &LiveRegs, SmallVectorImpl<unsigned> &ScratchRegs) {
+ if (!STI->hasFPRegs())
+ return;
+
+ // Store an available register for FPSCR clearing
+ assert(!ScratchRegs.empty());
+ unsigned SpareReg = ScratchRegs.front();
+
+ // save space on stack for VLSTM
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+
+ // Use ScratchRegs to store the fp regs
+ std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs;
+ std::vector<unsigned> NonclearedFPRegs;
+ for (const MachineOperand &Op : MBBI->operands()) {
+ if (Op.isReg() && Op.isUse()) {
+ unsigned Reg = Op.getReg();
+ assert(!ARM::DPRRegClass.contains(Reg) ||
+ ARM::DPR_VFP2RegClass.contains(Reg));
+ assert(!ARM::QPRRegClass.contains(Reg));
+ if (ARM::DPR_VFP2RegClass.contains(Reg)) {
+ if (ScratchRegs.size() >= 2) {
+ unsigned SaveReg2 = ScratchRegs.pop_back_val();
+ unsigned SaveReg1 = ScratchRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD))
+ .addReg(SaveReg1, RegState::Define)
+ .addReg(SaveReg2, RegState::Define)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ if (ScratchRegs.size() >= 1) {
+ unsigned SaveReg = ScratchRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg, 0);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ }
+ }
+ }
+
+ bool passesFPReg = (!NonclearedFPRegs.empty() || !ClearedFPRegs.empty());
+
+ // Lazy store all fp registers to the stack
+ MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
+ ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
+ VLSTM.addReg(R, RegState::Implicit |
+ (LiveRegs.contains(R) ? 0 : RegState::Undef));
+
+ // Restore all arguments
+ for (const auto &Regs : ClearedFPRegs) {
+ unsigned Reg, SaveReg1, SaveReg2;
+ std::tie(Reg, SaveReg1, SaveReg2) = Regs;
+ if (ARM::DPR_VFP2RegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg)
+ .addReg(SaveReg1)
+ .addReg(SaveReg2)
+ .add(predOps(ARMCC::AL));
+ else if (ARM::SPRRegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(SaveReg1)
+ .add(predOps(ARMCC::AL));
+ }
+
+ for (unsigned Reg : NonclearedFPRegs) {
+ if (ARM::DPR_VFP2RegClass.contains(Reg)) {
+ if (STI->isLittle()) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRD), Reg)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2)
+ .add(predOps(ARMCC::AL));
+ } else {
+ // For big-endian targets we need to load the two subregisters of Reg
+ // manually, because VLDRD would load them in the wrong order.
+ unsigned SReg0 = TRI->getSubReg(Reg, ARM::ssub_0);
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), SReg0 + 1)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2 + 1)
+ .add(predOps(ARMCC::AL));
+ }
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDRS), Reg)
+ .addReg(ARM::SP)
+ .addImm(Reg - ARM::S0)
+ .add(predOps(ARMCC::AL));
+ }
+ }
+ // restore FPSCR from stack and clear bits 0-4, 7, 28-31
+ // The other bits are program global according to the AAPCS
+ if (passesFPReg) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2LDRi8), SpareReg)
+ .addReg(ARM::SP)
+ .addImm(0x40)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg)
+ .addReg(SpareReg)
+ .addImm(0x0000009F)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), SpareReg)
+ .addReg(SpareReg)
+ .addImm(0xF0000000)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMSR))
+ .addReg(SpareReg)
+ .add(predOps(ARMCC::AL));
+ // The ldr must happen after a floating point instruction. To prevent the
+ // post-ra scheduler from breaking that order, we create a bundle.
+ finalizeBundle(MBB, VLSTM->getIterator(), MBBI->getIterator());
+ }
+}
+
+void ARMExpandPseudo::CMSESaveClearFPRegsV81(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ DebugLoc &DL,
+ const LivePhysRegs &LiveRegs) {
+ BitVector ClearRegs(32, true);
+ bool DefFP = determineFPRegsToClear(*MBBI, ClearRegs);
+
+ // If the instruction does not write to a FP register and no elements were
+ // removed from the set, then no FP registers were used to pass
+ // arguments/returns.
+ if (!DefFP && ClearRegs.count() == ClearRegs.size()) {
+ // save space on stack for VLSTM
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tSUBspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+
+ // Lazy store all FP registers to the stack
+ MachineInstrBuilder VLSTM = BuildMI(MBB, MBBI, DL, TII->get(ARM::VLSTM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (auto R : {ARM::VPR, ARM::FPSCR, ARM::FPSCR_NZCV, ARM::Q0, ARM::Q1,
+ ARM::Q2, ARM::Q3, ARM::Q4, ARM::Q5, ARM::Q6, ARM::Q7})
+ VLSTM.addReg(R, RegState::Implicit |
+ (LiveRegs.contains(R) ? 0 : RegState::Undef));
+ } else {
+ // Push all the callee-saved registers (s16-s31).
+ MachineInstrBuilder VPUSH =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTMSDB_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg)
+ VPUSH.addReg(Reg);
+
+ // Clear FP registers with a VSCCLRM.
+ (void)CMSEClearFPRegsV81(MBB, MBBI, ClearRegs);
+
+ // Save floating-point context.
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTR_FPCXTS_pre), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(-8)
+ .add(predOps(ARMCC::AL));
+ }
+}
+
+// Restore FP registers if present
+void ARMExpandPseudo::CMSERestoreFPRegs(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs) {
+ if (STI->hasV8_1MMainlineOps())
+ CMSERestoreFPRegsV81(MBB, MBBI, DL, AvailableRegs);
+ else
+ CMSERestoreFPRegsV8(MBB, MBBI, DL, AvailableRegs);
+}
+
+void ARMExpandPseudo::CMSERestoreFPRegsV8(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs) {
+ if (!STI->hasFPRegs())
+ return;
+
+ // Use AvailableRegs to store the fp regs
+ std::vector<std::tuple<unsigned, unsigned, unsigned>> ClearedFPRegs;
+ std::vector<unsigned> NonclearedFPRegs;
+ for (const MachineOperand &Op : MBBI->operands()) {
+ if (Op.isReg() && Op.isDef()) {
+ unsigned Reg = Op.getReg();
+ assert(!ARM::DPRRegClass.contains(Reg) ||
+ ARM::DPR_VFP2RegClass.contains(Reg));
+ assert(!ARM::QPRRegClass.contains(Reg));
+ if (ARM::DPR_VFP2RegClass.contains(Reg)) {
+ if (AvailableRegs.size() >= 2) {
+ unsigned SaveReg2 = AvailableRegs.pop_back_val();
+ unsigned SaveReg1 = AvailableRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg1, SaveReg2);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRRD))
+ .addReg(SaveReg1, RegState::Define)
+ .addReg(SaveReg2, RegState::Define)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ } else if (ARM::SPRRegClass.contains(Reg)) {
+ if (AvailableRegs.size() >= 1) {
+ unsigned SaveReg = AvailableRegs.pop_back_val();
+ ClearedFPRegs.emplace_back(Reg, SaveReg, 0);
+
+ // Save the fp register to the normal registers
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVRS), SaveReg)
+ .addReg(Reg)
+ .add(predOps(ARMCC::AL));
+ } else {
+ NonclearedFPRegs.push_back(Reg);
+ }
+ }
+ }
+ }
+
+ // Push FP regs that cannot be restored via normal registers on the stack
+ for (unsigned Reg : NonclearedFPRegs) {
+ if (ARM::DPR_VFP2RegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRD), Reg)
+ .addReg(ARM::SP)
+ .addImm((Reg - ARM::D0) * 2)
+ .add(predOps(ARMCC::AL));
+ else if (ARM::SPRRegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VSTRS), Reg)
+ .addReg(ARM::SP)
+ .addImm(Reg - ARM::S0)
+ .add(predOps(ARMCC::AL));
+ }
+
+ // Lazy load fp regs from stack
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+
+ // Restore all FP registers via normal registers
+ for (const auto &Regs : ClearedFPRegs) {
+ unsigned Reg, SaveReg1, SaveReg2;
+ std::tie(Reg, SaveReg1, SaveReg2) = Regs;
+ if (ARM::DPR_VFP2RegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVDRR), Reg)
+ .addReg(SaveReg1)
+ .addReg(SaveReg2)
+ .add(predOps(ARMCC::AL));
+ else if (ARM::SPRRegClass.contains(Reg))
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VMOVSR), Reg)
+ .addReg(SaveReg1)
+ .add(predOps(ARMCC::AL));
+ }
+
+ // Pop the stack space
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+}
+
+static bool definesOrUsesFPReg(const MachineInstr &MI) {
+ for (const MachineOperand &Op : MI.operands()) {
+ if (!Op.isReg())
+ continue;
+ unsigned Reg = Op.getReg();
+ if ((Reg >= ARM::Q0 && Reg <= ARM::Q7) ||
+ (Reg >= ARM::D0 && Reg <= ARM::D15) ||
+ (Reg >= ARM::S0 && Reg <= ARM::S31))
+ return true;
+ }
+ return false;
+}
+
+void ARMExpandPseudo::CMSERestoreFPRegsV81(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, DebugLoc &DL,
+ SmallVectorImpl<unsigned> &AvailableRegs) {
+ if (!definesOrUsesFPReg(*MBBI)) {
+ // Load FP registers from stack.
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLLDM))
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+
+ // Pop the stack space
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tADDspi), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(CMSE_FP_SAVE_SIZE >> 2)
+ .add(predOps(ARMCC::AL));
+ } else {
+ // Restore the floating point context.
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(ARM::VLDR_FPCXTS_post),
+ ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(8)
+ .add(predOps(ARMCC::AL));
+
+ // Pop all the callee-saved registers (s16-s31).
+ MachineInstrBuilder VPOP =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::VLDMSIA_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::S16; Reg <= ARM::S31; ++Reg)
+ VPOP.addReg(Reg, RegState::Define);
+ }
+}
+
/// Expand a CMP_SWAP pseudo-inst to an ldrex/strex loop as simply as
/// possible. This only gets used at -O0 so we don't care about efficiency of
/// the generated code.
@@ -1149,6 +1763,93 @@ bool ARMExpandPseudo::ExpandCMP_SWAP_64(MachineBasicBlock &MBB,
return true;
}
+static void CMSEPushCalleeSaves(const TargetInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, int JumpReg,
+ const LivePhysRegs &LiveRegs, bool Thumb1Only) {
+ const DebugLoc &DL = MBBI->getDebugLoc();
+ if (Thumb1Only) { // push Lo and Hi regs separately
+ MachineInstrBuilder PushMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) {
+ PushMIB.addReg(
+ Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef);
+ }
+
+ // Thumb1 can only tPUSH low regs, so we copy the high regs to the low
+ // regs that we just saved and push the low regs again, taking care to
+ // not clobber JumpReg. If JumpReg is one of the low registers, push first
+ // the values of r9-r11, and then r8. That would leave them ordered in
+ // memory, and allow us to later pop them with a single instruction.
+ // FIXME: Could also use any of r0-r3 that are free (including in the
+ // first PUSH above).
+ for (int LoReg = ARM::R7, HiReg = ARM::R11; LoReg >= ARM::R4; --LoReg) {
+ if (JumpReg == LoReg)
+ continue;
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg)
+ .addReg(HiReg, LiveRegs.contains(HiReg) ? 0 : RegState::Undef)
+ .add(predOps(ARMCC::AL));
+ --HiReg;
+ }
+ MachineInstrBuilder PushMIB2 =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH)).add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R8; ++Reg) {
+ if (Reg == JumpReg)
+ continue;
+ PushMIB2.addReg(Reg, RegState::Kill);
+ }
+
+ // If we couldn't use a low register for temporary storage (because it was
+ // the JumpReg), use r4 or r5, whichever is not JumpReg. It has already been
+ // saved.
+ if (JumpReg >= ARM::R4 && JumpReg <= ARM::R7) {
+ int LoReg = JumpReg == ARM::R4 ? ARM::R5 : ARM::R4;
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), LoReg)
+ .addReg(ARM::R8, LiveRegs.contains(ARM::R8) ? 0 : RegState::Undef)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPUSH))
+ .add(predOps(ARMCC::AL))
+ .addReg(LoReg, RegState::Kill);
+ }
+ } else { // push Lo and Hi registers with a single instruction
+ MachineInstrBuilder PushMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::t2STMDB_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg) {
+ PushMIB.addReg(
+ Reg, Reg == JumpReg || LiveRegs.contains(Reg) ? 0 : RegState::Undef);
+ }
+ }
+}
+
+static void CMSEPopCalleeSaves(const TargetInstrInfo &TII,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, int JumpReg,
+ bool Thumb1Only) {
+ const DebugLoc &DL = MBBI->getDebugLoc();
+ if (Thumb1Only) {
+ MachineInstrBuilder PopMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
+ for (int R = 0; R < 4; ++R) {
+ PopMIB.addReg(ARM::R4 + R, RegState::Define);
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tMOVr), ARM::R8 + R)
+ .addReg(ARM::R4 + R, RegState::Kill)
+ .add(predOps(ARMCC::AL));
+ }
+ MachineInstrBuilder PopMIB2 =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::tPOP)).add(predOps(ARMCC::AL));
+ for (int R = 0; R < 4; ++R)
+ PopMIB2.addReg(ARM::R4 + R, RegState::Define);
+ } else { // pop Lo and Hi registers with a single instruction
+ MachineInstrBuilder PopMIB =
+ BuildMI(MBB, MBBI, DL, TII.get(ARM::t2LDMIA_UPD), ARM::SP)
+ .addReg(ARM::SP)
+ .add(predOps(ARMCC::AL));
+ for (int Reg = ARM::R4; Reg < ARM::R12; ++Reg)
+ PopMIB.addReg(Reg, RegState::Define);
+ }
+}
bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
@@ -1207,12 +1908,117 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// Update call site info and delete the pseudo instruction TCRETURN.
- MBB.getParent()->moveCallSiteInfo(&MI, &*NewMI);
+ if (MI.isCandidateForCallSiteEntry())
+ MI.getMF()->moveCallSiteInfo(&MI, &*NewMI);
MBB.erase(MBBI);
MBBI = NewMI;
return true;
}
+ case ARM::tBXNS_RET: {
+ MachineBasicBlock &AfterBB = CMSEClearFPRegs(MBB, MBBI);
+
+ if (STI->hasV8_1MMainlineOps()) {
+ // Restore the non-secure floating point context.
+ BuildMI(MBB, MBBI, MBBI->getDebugLoc(),
+ TII->get(ARM::VLDR_FPCXTNS_post), ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(4)
+ .add(predOps(ARMCC::AL));
+ }
+
+ // Clear all GPR that are not a use of the return instruction.
+ assert(llvm::all_of(MBBI->operands(), [](const MachineOperand &Op) {
+ return !Op.isReg() || Op.getReg() != ARM::R12;
+ }));
+ SmallVector<unsigned, 5> ClearRegs;
+ determineGPRegsToClear(
+ *MBBI, {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R12}, ClearRegs);
+ CMSEClearGPRegs(AfterBB, AfterBB.end(), MBBI->getDebugLoc(), ClearRegs,
+ ARM::LR);
+
+ MachineInstrBuilder NewMI =
+ BuildMI(AfterBB, AfterBB.end(), MBBI->getDebugLoc(),
+ TII->get(ARM::tBXNS))
+ .addReg(ARM::LR)
+ .add(predOps(ARMCC::AL));
+ for (const MachineOperand &Op : MI.operands())
+ NewMI->addOperand(Op);
+ MI.eraseFromParent();
+ return true;
+ }
+ case ARM::tBLXNS_CALL: {
+ DebugLoc DL = MBBI->getDebugLoc();
+ unsigned JumpReg = MBBI->getOperand(0).getReg();
+
+ // Figure out which registers are live at the point immediately before the
+ // call. When we indiscriminately push a set of registers, the live
+ // registers are added as ordinary use operands, whereas dead registers
+ // are "undef".
+ LivePhysRegs LiveRegs(*TRI);
+ LiveRegs.addLiveOuts(MBB);
+ for (const MachineInstr &MI : make_range(MBB.rbegin(), MBBI.getReverse()))
+ LiveRegs.stepBackward(MI);
+ LiveRegs.stepBackward(*MBBI);
+
+ CMSEPushCalleeSaves(*TII, MBB, MBBI, JumpReg, LiveRegs,
+ AFI->isThumb1OnlyFunction());
+
+ SmallVector<unsigned, 16> ClearRegs;
+ determineGPRegsToClear(*MBBI,
+ {ARM::R0, ARM::R1, ARM::R2, ARM::R3, ARM::R4,
+ ARM::R5, ARM::R6, ARM::R7, ARM::R8, ARM::R9,
+ ARM::R10, ARM::R11, ARM::R12},
+ ClearRegs);
+ auto OriginalClearRegs = ClearRegs;
+
+ // Get the first cleared register as a scratch (to use later with tBIC).
+ // We need to use the first so we can ensure it is a low register.
+ unsigned ScratchReg = ClearRegs.front();
+
+ // Clear LSB of JumpReg
+ if (AFI->isThumb2Function()) {
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::t2BICri), JumpReg)
+ .addReg(JumpReg)
+ .addImm(1)
+ .add(predOps(ARMCC::AL))
+ .add(condCodeOp());
+ } else {
+ // We need to use an extra register to cope with 8M Baseline;
+ // since we have saved all of the registers, we are ok to trash a
+ // non-argument register here.
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tMOVi8), ScratchReg)
+ .add(condCodeOp())
+ .addImm(1)
+ .add(predOps(ARMCC::AL));
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tBIC), JumpReg)
+ .addReg(ARM::CPSR, RegState::Define)
+ .addReg(JumpReg)
+ .addReg(ScratchReg)
+ .add(predOps(ARMCC::AL));
+ }
+
+ CMSESaveClearFPRegs(MBB, MBBI, DL, LiveRegs,
+ ClearRegs); // save+clear FP regs with ClearRegs
+ CMSEClearGPRegs(MBB, MBBI, DL, ClearRegs, JumpReg);
+
+ const MachineInstrBuilder NewCall =
+ BuildMI(MBB, MBBI, DL, TII->get(ARM::tBLXNSr))
+ .add(predOps(ARMCC::AL))
+ .addReg(JumpReg, RegState::Kill);
+
+ for (int I = 1, E = MI.getNumOperands(); I != E; ++I)
+ NewCall->addOperand(MI.getOperand(I));
+ if (MI.isCandidateForCallSiteEntry())
+ MI.getMF()->moveCallSiteInfo(&MI, NewCall.getInstr());
+
+ CMSERestoreFPRegs(MBB, MBBI, DL, OriginalClearRegs); // restore FP registers
+
+ CMSEPopCalleeSaves(*TII, MBB, MBBI, JumpReg, AFI->isThumb1OnlyFunction());
+
+ MI.eraseFromParent();
+ return true;
+ }
case ARM::VMOVHcc:
case ARM::VMOVScc:
case ARM::VMOVDcc: {
@@ -1359,17 +2165,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
// If there's dynamic realignment, adjust for it.
if (RI.needsStackRealignment(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
assert (!AFI->isThumb1OnlyFunction());
// Emit bic r6, r6, MaxAlign
- assert(MaxAlign <= 256 && "The BIC instruction cannot encode "
- "immediates larger than 256 with all lower "
- "bits set.");
+ assert(MaxAlign <= Align(256) &&
+ "The BIC instruction cannot encode "
+ "immediates larger than 256 with all lower "
+ "bits set.");
unsigned bicOpc = AFI->isThumbFunction() ?
ARM::t2BICri : ARM::BICri;
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(bicOpc), ARM::R6)
.addReg(ARM::R6, RegState::Kill)
- .addImm(MaxAlign - 1)
+ .addImm(MaxAlign.value() - 1)
.add(predOps(ARMCC::AL))
.add(condCodeOp());
}
@@ -1410,17 +2217,18 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
const bool Thumb = Opcode == ARM::tTPsoft;
MachineInstrBuilder MIB;
+ MachineFunction *MF = MBB.getParent();
if (STI->genLongCalls()) {
- MachineFunction *MF = MBB.getParent();
MachineConstantPool *MCP = MF->getConstantPool();
unsigned PCLabelID = AFI->createPICLabelUId();
MachineConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(MF->getFunction().getContext(),
"__aeabi_read_tp", PCLabelID, 0);
Register Reg = MI.getOperand(0).getReg();
- MIB = BuildMI(MBB, MBBI, MI.getDebugLoc(),
- TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
- .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+ MIB =
+ BuildMI(MBB, MBBI, MI.getDebugLoc(),
+ TII->get(Thumb ? ARM::tLDRpci : ARM::LDRi12), Reg)
+ .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4)));
if (!Thumb)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
@@ -1440,7 +2248,9 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MIB.cloneMemRefs(MI);
TransferImpOps(MI, MIB, MIB);
- MI.getMF()->moveCallSiteInfo(&MI, &*MIB);
+ // Update the call site info.
+ if (MI.isCandidateForCallSiteEntry())
+ MF->moveCallSiteInfo(&MI, &*MIB);
MI.eraseFromParent();
return true;
}
@@ -1504,7 +2314,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MBBI, MI.getDebugLoc(), TII->get(LDRLITOpc), DstReg)
- .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, 4));
+ .addConstantPoolIndex(MCP->getConstantPoolIndex(CPV, Align(4)));
if (IsARM)
MIB.addImm(0);
MIB.add(predOps(ARMCC::AL));
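The bulk of the ARMExpandPseudoInsts.cpp additions above lower the CMSE pseudo-instructions tBXNS_RET and tBLXNS_CALL, which are emitted for Armv8-M security-state transitions. As a rough orientation only, and not part of the patch, the source-level constructs that give rise to them look like the sketch below; it assumes a Clang or GCC toolchain targeting Armv8-M with -mcmse, and the function names are made up for illustration.
// Illustrative sketch only, not part of this patch; function names are
// hypothetical. Compile with e.g. --target=arm-none-eabi -march=armv8-m.main -mcmse.

// Returning from a secure entry function becomes a tBXNS_RET pseudo: before
// the BXNS to non-secure state, the expansion above clears argument and
// scratch GPRs and any FP state that is not part of the return value.
extern "C" __attribute__((cmse_nonsecure_entry)) int secure_entry(int x) {
  return x + 1;
}

// Calling through a non-secure function pointer becomes a tBLXNS_CALL pseudo:
// callee-saved registers are pushed, registers not carrying arguments are
// cleared, and FP state is saved lazily (VLSTM) or via FPCXT on v8.1-M, then
// restored after the BLXNS returns.
typedef void __attribute__((cmse_nonsecure_call)) nonsecure_fn(int);

extern "C" void call_nonsecure(nonsecure_fn *fp) { fp(42); }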
diff --git a/llvm/lib/Target/ARM/ARMFastISel.cpp b/llvm/lib/Target/ARM/ARMFastISel.cpp
index 6e19db3c7e22..4bfca8a803ca 100644
--- a/llvm/lib/Target/ARM/ARMFastISel.cpp
+++ b/llvm/lib/Target/ARM/ARMFastISel.cpp
@@ -48,7 +48,6 @@
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -209,7 +208,7 @@ class ARMFastISel final : public FastISel {
unsigned ARMMoveToFPReg(MVT VT, unsigned SrcReg);
unsigned ARMMoveToIntReg(MVT VT, unsigned SrcReg);
unsigned ARMSelectCallOp(bool UseReg);
- unsigned ARMLowerPICELF(const GlobalValue *GV, unsigned Align, MVT VT);
+ unsigned ARMLowerPICELF(const GlobalValue *GV, MVT VT);
const TargetLowering *getTargetLowering() { return &TLI; }
@@ -444,12 +443,8 @@ unsigned ARMFastISel::ARMMaterializeFP(const ConstantFP *CFP, MVT VT) {
if (!Subtarget->hasVFP2Base()) return false;
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0) {
- // TODO: Figure out if this is correct.
- Align = DL.getTypeAllocSize(CFP->getType());
- }
- unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
+ unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
unsigned DestReg = createResultReg(TLI.getRegClassFor(VT));
unsigned Opc = is64bit ? ARM::VLDRD : ARM::VLDRS;
@@ -508,12 +503,8 @@ unsigned ARMFastISel::ARMMaterializeInt(const Constant *C, MVT VT) {
return 0;
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(C->getType());
- if (Align == 0) {
- // TODO: Figure out if this is correct.
- Align = DL.getTypeAllocSize(C->getType());
- }
- unsigned Idx = MCP.getConstantPoolIndex(C, Align);
+ Align Alignment = DL.getPrefTypeAlign(C->getType());
+ unsigned Idx = MCP.getConstantPoolIndex(C, Alignment);
ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (isThumb2)
AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -570,14 +561,10 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
TII.get(Opc), DestReg).addGlobalAddress(GV, 0, TF));
} else {
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(GV->getType());
- if (Align == 0) {
- // TODO: Figure out if this is correct.
- Align = DL.getTypeAllocSize(GV->getType());
- }
+ Align Alignment = DL.getPrefTypeAlign(GV->getType());
if (Subtarget->isTargetELF() && IsPositionIndependent)
- return ARMLowerPICELF(GV, Align, VT);
+ return ARMLowerPICELF(GV, VT);
// Grab index.
unsigned PCAdj = IsPositionIndependent ? (Subtarget->isThumb() ? 4 : 8) : 0;
@@ -585,7 +572,7 @@ unsigned ARMFastISel::ARMMaterializeGV(const GlobalValue *GV, MVT VT) {
ARMConstantPoolValue *CPV = ARMConstantPoolConstant::Create(GV, Id,
ARMCP::CPValue,
PCAdj);
- unsigned Idx = MCP.getConstantPoolIndex(CPV, Align);
+ unsigned Idx = MCP.getConstantPoolIndex(CPV, Alignment);
// Load value.
MachineInstrBuilder MIB;
@@ -882,7 +869,7 @@ void ARMFastISel::AddLoadStoreOperands(MVT VT, Address &Addr,
int Offset = Addr.Offset;
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*FuncInfo.MF, FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// Now add the rest of the operands.
MIB.addFrameIndex(FI);
@@ -2090,6 +2077,7 @@ bool ARMFastISel::FinishCall(MVT RetVT, SmallVectorImpl<Register> &UsedRegs,
bool ARMFastISel::SelectRet(const Instruction *I) {
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
+ const bool IsCmseNSEntry = F.hasFnAttribute("cmse_nonsecure_entry");
if (!FuncInfo.CanLowerReturn)
return false;
@@ -2166,8 +2154,17 @@ bool ARMFastISel::SelectRet(const Instruction *I) {
RetRegs.push_back(VA.getLocReg());
}
+ unsigned RetOpc;
+ if (IsCmseNSEntry)
+ if (isThumb2)
+ RetOpc = ARM::tBXNS_RET;
+ else
+ llvm_unreachable("CMSE not valid for non-Thumb targets");
+ else
+ RetOpc = Subtarget->getReturnOpcode();
+
MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(Subtarget->getReturnOpcode()));
+ TII.get(RetOpc));
AddOptionalDefs(MIB);
for (unsigned R : RetRegs)
MIB.addReg(R, RegState::Implicit);
@@ -2239,7 +2236,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
if (!isTypeLegal(ArgTy, ArgVT)) return false;
ISD::ArgFlagsTy Flags;
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
+ Flags.setOrigAlign(DL.getABITypeAlign(ArgTy));
Args.push_back(Op);
ArgRegs.push_back(Arg);
@@ -2293,7 +2290,7 @@ bool ARMFastISel::ARMEmitLibcall(const Instruction *I, RTLIB::Libcall Call) {
bool ARMFastISel::SelectCall(const Instruction *I,
const char *IntrMemName = nullptr) {
const CallInst *CI = cast<CallInst>(I);
- const Value *Callee = CI->getCalledValue();
+ const Value *Callee = CI->getCalledOperand();
// Can't handle inline asm.
if (isa<InlineAsm>(Callee)) return false;
@@ -2302,12 +2299,11 @@ bool ARMFastISel::SelectCall(const Instruction *I,
if (CI->isTailCall()) return false;
// Check the calling convention.
- ImmutableCallSite CS(CI);
- CallingConv::ID CC = CS.getCallingConv();
+ CallingConv::ID CC = CI->getCallingConv();
// TODO: Avoid some calling conventions?
- FunctionType *FTy = CS.getFunctionType();
+ FunctionType *FTy = CI->getFunctionType();
bool isVarArg = FTy->isVarArg();
// Handle *simple* calls for now.
@@ -2334,47 +2330,46 @@ bool ARMFastISel::SelectCall(const Instruction *I,
SmallVector<Register, 8> ArgRegs;
SmallVector<MVT, 8> ArgVTs;
SmallVector<ISD::ArgFlagsTy, 8> ArgFlags;
- unsigned arg_size = CS.arg_size();
+ unsigned arg_size = CI->arg_size();
Args.reserve(arg_size);
ArgRegs.reserve(arg_size);
ArgVTs.reserve(arg_size);
ArgFlags.reserve(arg_size);
- for (ImmutableCallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
- i != e; ++i) {
+ for (auto ArgI = CI->arg_begin(), ArgE = CI->arg_end(); ArgI != ArgE; ++ArgI) {
// If we're lowering a memory intrinsic instead of a regular call, skip the
// last argument, which shouldn't be passed to the underlying function.
- if (IntrMemName && e - i <= 1)
+ if (IntrMemName && ArgE - ArgI <= 1)
break;
ISD::ArgFlagsTy Flags;
- unsigned ArgIdx = i - CS.arg_begin();
- if (CS.paramHasAttr(ArgIdx, Attribute::SExt))
+ unsigned ArgIdx = ArgI - CI->arg_begin();
+ if (CI->paramHasAttr(ArgIdx, Attribute::SExt))
Flags.setSExt();
- if (CS.paramHasAttr(ArgIdx, Attribute::ZExt))
+ if (CI->paramHasAttr(ArgIdx, Attribute::ZExt))
Flags.setZExt();
// FIXME: Only handle *easy* calls for now.
- if (CS.paramHasAttr(ArgIdx, Attribute::InReg) ||
- CS.paramHasAttr(ArgIdx, Attribute::StructRet) ||
- CS.paramHasAttr(ArgIdx, Attribute::SwiftSelf) ||
- CS.paramHasAttr(ArgIdx, Attribute::SwiftError) ||
- CS.paramHasAttr(ArgIdx, Attribute::Nest) ||
- CS.paramHasAttr(ArgIdx, Attribute::ByVal))
+ if (CI->paramHasAttr(ArgIdx, Attribute::InReg) ||
+ CI->paramHasAttr(ArgIdx, Attribute::StructRet) ||
+ CI->paramHasAttr(ArgIdx, Attribute::SwiftSelf) ||
+ CI->paramHasAttr(ArgIdx, Attribute::SwiftError) ||
+ CI->paramHasAttr(ArgIdx, Attribute::Nest) ||
+ CI->paramHasAttr(ArgIdx, Attribute::ByVal))
return false;
- Type *ArgTy = (*i)->getType();
+ Type *ArgTy = (*ArgI)->getType();
MVT ArgVT;
if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 &&
ArgVT != MVT::i1)
return false;
- Register Arg = getRegForValue(*i);
+ Register Arg = getRegForValue(*ArgI);
if (!Arg.isValid())
return false;
- Flags.setOrigAlign(Align(DL.getABITypeAlignment(ArgTy)));
+ Flags.setOrigAlign(DL.getABITypeAlign(ArgTy));
- Args.push_back(*i);
+ Args.push_back(*ArgI);
ArgRegs.push_back(Arg);
ArgVTs.push_back(ArgVT);
ArgFlags.push_back(Flags);
@@ -2949,8 +2944,7 @@ bool ARMFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
return true;
}
-unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
- unsigned Align, MVT VT) {
+unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV, MVT VT) {
bool UseGOT_PREL = !TM.shouldAssumeDSOLocal(*GV->getParent(), GV);
LLVMContext *Context = &MF->getFunction().getContext();
@@ -2961,12 +2955,12 @@ unsigned ARMFastISel::ARMLowerPICELF(const GlobalValue *GV,
UseGOT_PREL ? ARMCP::GOT_PREL : ARMCP::no_modifier,
/*AddCurrentAddress=*/UseGOT_PREL);
- unsigned ConstAlign =
- MF->getDataLayout().getPrefTypeAlignment(Type::getInt32PtrTy(*Context));
+ Align ConstAlign =
+ MF->getDataLayout().getPrefTypeAlign(Type::getInt32PtrTy(*Context));
unsigned Idx = MF->getConstantPool()->getConstantPoolIndex(CPV, ConstAlign);
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
Register TempReg = MF->getRegInfo().createVirtualRegister(&ARM::rGPRRegClass);
unsigned Opc = isThumb2 ? ARM::t2LDRpci : ARM::LDRcp;
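
The ARMFastISel hunks above drop ImmutableCallSite in favour of querying the call instruction directly (getCalledOperand, getCallingConv, arg_size, paramHasAttr). A short sketch of the new-style argument walk; the helper itself is hypothetical, only the CallBase accessors come from the patch:

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Walk call arguments by index, as the rewritten loop above does, and
    // query per-argument attributes directly on the call instruction.
    static bool hasSignExtendedArg(const CallInst *CI) {
      for (unsigned ArgIdx = 0, E = CI->arg_size(); ArgIdx != E; ++ArgIdx)
        if (CI->paramHasAttr(ArgIdx, Attribute::SExt))
          return true;
      return false;
    }
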
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index cb98b2b34efd..8a8f3237bb6f 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -142,27 +142,6 @@ ARMFrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) || MF.getFrameInfo().hasVarSizedObjects();
}
-static bool isCSRestore(MachineInstr &MI, const ARMBaseInstrInfo &TII,
- const MCPhysReg *CSRegs) {
- // Integer spill area is handled with "pop".
- if (isPopOpcode(MI.getOpcode())) {
- // The first two operands are predicates. The last two are
- // imp-def and imp-use of SP. Check everything in between.
- for (int i = 5, e = MI.getNumOperands(); i != e; ++i)
- if (!isCalleeSavedRegister(MI.getOperand(i).getReg(), CSRegs))
- return false;
- return true;
- }
- if ((MI.getOpcode() == ARM::LDR_POST_IMM ||
- MI.getOpcode() == ARM::LDR_POST_REG ||
- MI.getOpcode() == ARM::t2LDR_POST) &&
- isCalleeSavedRegister(MI.getOperand(0).getReg(), CSRegs) &&
- MI.getOperand(1).getReg() == ARM::SP)
- return true;
-
- return false;
-}
-
static void emitRegPlusImmediate(
bool isARM, MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
const DebugLoc &dl, const ARMBaseInstrInfo &TII, unsigned DestReg,
@@ -256,9 +235,9 @@ struct StackAdjustingInsts {
if (HasFP && !Info.BeforeFPSet)
return;
- CFAOffset -= Info.SPAdjust;
+ CFAOffset += Info.SPAdjust;
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, std::next(Info.I), dl,
TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
@@ -281,13 +260,13 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, const unsigned Reg,
- const unsigned Alignment,
+ const Align Alignment,
const bool MustBeSingleInstruction) {
const ARMSubtarget &AST =
static_cast<const ARMSubtarget &>(MF.getSubtarget());
const bool CanUseBFC = AST.hasV6T2Ops() || AST.hasV7Ops();
- const unsigned AlignMask = Alignment - 1;
- const unsigned NrBitsToZero = countTrailingZeros(Alignment);
+ const unsigned AlignMask = Alignment.value() - 1U;
+ const unsigned NrBitsToZero = Log2(Alignment);
assert(!AFI->isThumb1OnlyFunction() && "Thumb1 not supported");
if (!AFI->isThumbFunction()) {
// if the BFC instruction is available, use that to zero the lower
@@ -343,14 +322,15 @@ static void emitAligningInstructions(MachineFunction &MF, ARMFunctionInfo *AFI,
/// Unfortunately we cannot determine this value in determineCalleeSaves() yet
/// as assignCalleeSavedSpillSlots() hasn't run at this point. Instead we use
/// this to produce a conservative estimate that we check in an assert() later.
-static int getMaxFPOffset(const Function &F, const ARMFunctionInfo &AFI) {
+static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI) {
// For Thumb1, push.w isn't available, so the first push will always push
// r7 and lr onto the stack first.
if (AFI.isThumb1OnlyFunction())
return -AFI.getArgRegsSaveSize() - (2 * 4);
// This is a conservative estimation: Assume the frame pointer being r7 and
// pc("r15") up to r8 getting spilled before (= 8 registers).
- return -AFI.getArgRegsSaveSize() - (8 * 4);
+ int FPCXTSaveSize = (STI.hasV8_1MMainlineOps() && AFI.isCmseNSEntryFunction()) ? 4 : 0;
+ return - FPCXTSaveSize - AFI.getArgRegsSaveSize() - (8 * 4);
}
void ARMFrameLowering::emitPrologue(MachineFunction &MF,
@@ -367,10 +347,11 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
assert(!AFI->isThumb1OnlyFunction() &&
"This emitPrologue does not support Thumb1!");
bool isARM = !AFI->isThumbFunction();
- unsigned Align = STI.getFrameLowering()->getStackAlignment();
+ Align Alignment = STI.getFrameLowering()->getStackAlign();
unsigned ArgRegsSaveSize = AFI->getArgRegsSaveSize();
unsigned NumBytes = MFI.getStackSize();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ int FPCXTSaveSize = 0;
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
@@ -439,6 +420,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
FramePtrSpillFI = FI;
GPRCS1Size += 4;
break;
+ case ARM::FPCXTNS:
+ FPCXTSaveSize = 4;
+ break;
default:
// This is a DPR. Exclude the aligned DPRCS2 spills.
if (Reg == ARM::D8)
@@ -448,25 +432,35 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
}
}
- // Move past area 1.
+ // Move past FPCXT area.
MachineBasicBlock::iterator LastPush = MBB.end(), GPRCS1Push, GPRCS2Push;
+ if (FPCXTSaveSize > 0) {
+ LastPush = MBBI++;
+ DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true);
+ }
+
+ // Move past area 1.
if (GPRCS1Size > 0) {
GPRCS1Push = LastPush = MBBI++;
DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true);
}
// Determine starting offsets of spill areas.
- unsigned GPRCS1Offset = NumBytes - ArgRegsSaveSize - GPRCS1Size;
+ unsigned FPCXTOffset = NumBytes - ArgRegsSaveSize - FPCXTSaveSize;
+ unsigned GPRCS1Offset = FPCXTOffset - GPRCS1Size;
unsigned GPRCS2Offset = GPRCS1Offset - GPRCS2Size;
- unsigned DPRAlign = DPRCSSize ? std::min(8U, Align) : 4U;
- unsigned DPRGapSize = (GPRCS1Size + GPRCS2Size + ArgRegsSaveSize) % DPRAlign;
+ Align DPRAlign = DPRCSSize ? std::min(Align(8), Alignment) : Align(4);
+ unsigned DPRGapSize =
+ (GPRCS1Size + GPRCS2Size + FPCXTSaveSize + ArgRegsSaveSize) %
+ DPRAlign.value();
+
unsigned DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize;
int FramePtrOffsetInPush = 0;
if (HasFP) {
int FPOffset = MFI.getObjectOffset(FramePtrSpillFI);
- assert(getMaxFPOffset(MF.getFunction(), *AFI) <= FPOffset &&
+ assert(getMaxFPOffset(STI, *AFI) <= FPOffset &&
"Max FP estimation is wrong");
- FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize;
+ FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize;
AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) +
NumBytes);
}
@@ -599,9 +593,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
PushSize + FramePtrOffsetInPush,
MachineInstr::FrameSetup);
if (FramePtrOffsetInPush + PushSize != 0) {
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, MRI->getDwarfRegNum(FramePtr, true),
- -(ArgRegsSaveSize - FramePtrOffsetInPush)));
+ FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush));
BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -707,6 +701,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
MFI.setOffsetAdjustment(MFI.getOffsetAdjustment() -
AFI->getFramePtrSpillOffset());
+ AFI->setFPCXTSaveAreaSize(FPCXTSaveSize);
AFI->setGPRCalleeSavedArea1Size(GPRCS1Size);
AFI->setGPRCalleeSavedArea2Size(GPRCS2Size);
AFI->setDPRCalleeSavedGapSize(DPRGapSize);
@@ -717,7 +712,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF,
// If aligned NEON registers were spilled, the stack has already been
// realigned.
if (!AFI->getNumAlignedDPRCS2Regs() && RegInfo->needsStackRealignment(MF)) {
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
assert(!AFI->isThumb1OnlyFunction());
if (!AFI->isThumbFunction()) {
emitAligningInstructions(MF, AFI, TII, MBB, MBBI, dl, ARM::SP, MaxAlign,
@@ -793,20 +788,22 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (!AFI->hasStackFrame()) {
if (NumBytes - ArgRegsSaveSize != 0)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes - ArgRegsSaveSize,
+ MachineInstr::FrameDestroy);
} else {
// Unwind MBBI to point to first LDR / VLDRD.
- const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
if (MBBI != MBB.begin()) {
do {
--MBBI;
- } while (MBBI != MBB.begin() && isCSRestore(*MBBI, TII, CSRegs));
- if (!isCSRestore(*MBBI, TII, CSRegs))
+ } while (MBBI != MBB.begin() &&
+ MBBI->getFlag(MachineInstr::FrameDestroy));
+ if (!MBBI->getFlag(MachineInstr::FrameDestroy))
++MBBI;
}
// Move SP to start of FP callee save spill area.
NumBytes -= (ArgRegsSaveSize +
+ AFI->getFPCXTSaveAreaSize() +
AFI->getGPRCalleeSavedArea1Size() +
AFI->getGPRCalleeSavedArea2Size() +
AFI->getDPRCalleeSavedGapSize() +
@@ -819,7 +816,8 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (NumBytes) {
if (isARM)
emitARMRegPlusImmediate(MBB, MBBI, dl, ARM::SP, FramePtr, -NumBytes,
- ARMCC::AL, 0, TII);
+ ARMCC::AL, 0, TII,
+ MachineInstr::FrameDestroy);
else {
// It's not possible to restore SP from FP in a single instruction.
// For iOS, this looks like:
@@ -831,10 +829,11 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
assert(!MFI.getPristineRegs(MF).test(ARM::R4) &&
"No scratch register to restore SP from FP!");
emitT2RegPlusImmediate(MBB, MBBI, dl, ARM::R4, FramePtr, -NumBytes,
- ARMCC::AL, 0, TII);
+ ARMCC::AL, 0, TII, MachineInstr::FrameDestroy);
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
.addReg(ARM::R4)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlag(MachineInstr::FrameDestroy);
}
} else {
// Thumb2 or ARM.
@@ -842,15 +841,18 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(ARM::MOVr), ARM::SP)
.addReg(FramePtr)
.add(predOps(ARMCC::AL))
- .add(condCodeOp());
+ .add(condCodeOp())
+ .setMIFlag(MachineInstr::FrameDestroy);
else
BuildMI(MBB, MBBI, dl, TII.get(ARM::tMOVr), ARM::SP)
.addReg(FramePtr)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlag(MachineInstr::FrameDestroy);
}
} else if (NumBytes &&
!tryFoldSPUpdateIntoPushPop(STI, MF, &*MBBI, NumBytes))
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, NumBytes,
+ MachineInstr::FrameDestroy);
// Increment past our save areas.
if (MBBI != MBB.end() && AFI->getDPRCalleeSavedAreaSize()) {
@@ -863,31 +865,32 @@ void ARMFrameLowering::emitEpilogue(MachineFunction &MF,
if (AFI->getDPRCalleeSavedGapSize()) {
assert(AFI->getDPRCalleeSavedGapSize() == 4 &&
"unexpected DPR alignment gap");
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize());
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, AFI->getDPRCalleeSavedGapSize(),
+ MachineInstr::FrameDestroy);
}
if (AFI->getGPRCalleeSavedArea2Size()) MBBI++;
if (AFI->getGPRCalleeSavedArea1Size()) MBBI++;
+ if (AFI->getFPCXTSaveAreaSize()) MBBI++;
}
if (ArgRegsSaveSize)
- emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize);
+ emitSPUpdate(isARM, MBB, MBBI, dl, TII, ArgRegsSaveSize,
+ MachineInstr::FrameDestroy);
}
/// getFrameIndexReference - Provide a base+offset reference to an FI slot for
/// debug info. It's the same as what we use for resolving the code-gen
/// references for now. FIXME: This can go wrong when references are
/// SP-relative and simple call frames aren't used.
-int
-ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+int ARMFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
return ResolveFrameIndexReference(MF, FI, FrameReg, 0);
}
-int
-ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- int SPAdj) const {
+int ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
+ int FI, Register &FrameReg,
+ int SPAdj) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const ARMBaseRegisterInfo *RegInfo = static_cast<const ARMBaseRegisterInfo *>(
MF.getSubtarget().getRegisterInfo());
@@ -969,10 +972,9 @@ ARMFrameLowering::ResolveFrameIndexReference(const MachineFunction &MF,
void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
unsigned StmOpc, unsigned StrOpc,
- bool NoGap,
- bool(*Func)(unsigned, bool),
+ bool NoGap, bool (*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs,
unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
@@ -1047,10 +1049,10 @@ void ARMFrameLowering::emitPushInst(MachineBasicBlock &MBB,
void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
unsigned LdmOpc, unsigned LdrOpc,
bool isVarArg, bool NoGap,
- bool(*Func)(unsigned, bool),
+ bool (*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const {
MachineFunction &MF = *MBB.getParent();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
@@ -1060,6 +1062,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
bool isTailCall = false;
bool isInterrupt = false;
bool isTrap = false;
+ bool isCmseEntry = false;
if (MBB.end() != MI) {
DL = MI->getDebugLoc();
unsigned RetOpcode = MI->getOpcode();
@@ -1069,6 +1072,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
isTrap =
RetOpcode == ARM::TRAP || RetOpcode == ARM::TRAPNaCl ||
RetOpcode == ARM::tTRAP;
+ isCmseEntry = (RetOpcode == ARM::tBXNS || RetOpcode == ARM::tBXNS_RET);
}
SmallVector<unsigned, 4> Regs;
@@ -1086,7 +1090,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
continue;
if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt &&
- !isTrap && STI.hasV5TOps()) {
+ !isCmseEntry && !isTrap && STI.hasV5TOps()) {
if (MBB.succ_empty()) {
Reg = ARM::PC;
// Fold the return instruction into the LDM.
@@ -1119,7 +1123,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
if (Regs.size() > 1 || LdrOpc == 0) {
MachineInstrBuilder MIB = BuildMI(MBB, MI, DL, TII.get(LdmOpc), ARM::SP)
.addReg(ARM::SP)
- .add(predOps(ARMCC::AL));
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MachineInstr::FrameDestroy);
for (unsigned i = 0, e = Regs.size(); i < e; ++i)
MIB.addReg(Regs[i], getDefRegState(true));
if (DeleteRet) {
@@ -1137,7 +1142,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
MachineInstrBuilder MIB =
BuildMI(MBB, MI, DL, TII.get(LdrOpc), Regs[0])
.addReg(ARM::SP, RegState::Define)
- .addReg(ARM::SP);
+ .addReg(ARM::SP)
+ .setMIFlags(MachineInstr::FrameDestroy);
// ARM mode needs an extra reg0 here due to addrmode2. Will go away once
// that refactoring is complete (eventually).
if (LdrOpc == ARM::LDR_POST_REG || LdrOpc == ARM::LDR_POST_IMM) {
@@ -1162,7 +1168,7 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB,
static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -1180,7 +1186,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
int FI = CSI[i].getFrameIdx();
// The even-numbered registers will be 16-byte aligned, the odd-numbered
// registers will be 8-byte aligned.
- MFI.setObjectAlignment(FI, DNum % 2 ? 8 : 16);
+ MFI.setObjectAlignment(FI, DNum % 2 ? Align(8) : Align(16));
// The stack slot for D8 needs to be maximally aligned because this is
// actually the point where we align the stack pointer. MachineFrameInfo
@@ -1189,7 +1195,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
// over-alignment is not realized because the code inserted below adjusts
// the stack pointer by numregs * 8 before aligning the stack pointer.
if (DNum == 0)
- MFI.setObjectAlignment(FI, MFI.getMaxAlignment());
+ MFI.setObjectAlignment(FI, MFI.getMaxAlign());
}
// Move the stack pointer to the d8 spill slot, and align it at the same
@@ -1212,7 +1218,7 @@ static void emitAlignedDPRCS2Spills(MachineBasicBlock &MBB,
.add(predOps(ARMCC::AL))
.add(condCodeOp());
- unsigned MaxAlign = MF.getFrameInfo().getMaxAlignment();
+ Align MaxAlign = MF.getFrameInfo().getMaxAlign();
// We must set parameter MustBeSingleInstruction to true, since
// skipAlignedDPRCS2Spills expects exactly 3 instructions to perform
// stack alignment. Luckily, this can always be done since all ARM
@@ -1335,7 +1341,7 @@ skipAlignedDPRCS2Spills(MachineBasicBlock::iterator MI,
static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
unsigned NumAlignedDPRCS2Regs,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) {
MachineFunction &MF = *MBB.getParent();
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
@@ -1422,10 +1428,9 @@ static void emitAlignedDPRCS2Restores(MachineBasicBlock &MBB,
std::prev(MI)->addRegisterKilled(ARM::R4, TRI);
}
-bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool ARMFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -1437,6 +1442,16 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
ARM::t2STR_PRE : ARM::STR_PRE_IMM;
unsigned FltOpc = ARM::VSTMDDB_UPD;
unsigned NumAlignedDPRCS2Regs = AFI->getNumAlignedDPRCS2Regs();
+ // Save the non-secure floating point context.
+ if (llvm::any_of(CSI, [](const CalleeSavedInfo &C) {
+ return C.getReg() == ARM::FPCXTNS;
+ })) {
+ BuildMI(MBB, MI, DebugLoc(), STI.getInstrInfo()->get(ARM::VSTR_FPCXTNS_pre),
+ ARM::SP)
+ .addReg(ARM::SP)
+ .addImm(-4)
+ .add(predOps(ARMCC::AL));
+ }
emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea1Register, 0,
MachineInstr::FrameSetup);
emitPushInst(MBB, MI, CSI, PushOpc, PushOneOpc, false, &isARMArea2Register, 0,
@@ -1453,10 +1468,9 @@ bool ARMFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool ARMFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool ARMFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -1601,7 +1615,7 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
return;
// Don't bother if the default stack alignment is sufficiently high.
- if (MF.getSubtarget().getFrameLowering()->getStackAlignment() >= 8)
+ if (MF.getSubtarget().getFrameLowering()->getStackAlign() >= Align(8))
return;
// Aligned spills require stack realignment.
@@ -1630,6 +1644,16 @@ checkNumAlignedDPRCS2Regs(MachineFunction &MF, BitVector &SavedRegs) {
SavedRegs.set(ARM::R4);
}
+bool ARMFrameLowering::enableShrinkWrapping(const MachineFunction &MF) const {
+ // For CMSE entry functions, we want to save the FPCXT_NS immediately
+ // upon function entry (resp. restore it immediately before return)
+ if (STI.hasV8_1MMainlineOps() &&
+ MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction())
+ return false;
+
+ return true;
+}
+
void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
BitVector &SavedRegs,
RegScavenger *RS) const {
@@ -1699,6 +1723,10 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (RegInfo->hasBasePointer(MF))
SavedRegs.set(RegInfo->getBaseRegister());
+ // On v8.1-M.Main CMSE entry functions save/restore FPCXT.
+ if (STI.hasV8_1MMainlineOps() && AFI->isCmseNSEntryFunction())
+ CanEliminateFrame = false;
+
// Don't spill FP if the frame can be eliminated. This is determined
// by scanning the callee-save registers to see if any is modified.
const MCPhysReg *CSRegs = RegInfo->getCalleeSavedRegs(&MF);
@@ -1771,8 +1799,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (!LRSpilled && AFI->isThumb1OnlyFunction()) {
unsigned FnSize = EstimateFunctionSizeInBytes(MF, TII);
// Force LR to be spilled if the Thumb function size is > 2048. This enables
- // use of BL to implement far jump. If it turns out that it's not needed
- // then the branch fix up path will undo it.
+ // use of BL to implement far jump.
if (FnSize >= (1 << 11)) {
CanEliminateFrame = false;
ForceLRSpill = true;
@@ -1858,7 +1885,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
//
// We could do slightly better on Thumb1; in some cases, an sp-relative
// offset would be legal even though an fp-relative offset is not.
- int MaxFPOffset = getMaxFPOffset(MF.getFunction(), *AFI);
+ int MaxFPOffset = getMaxFPOffset(STI, *AFI);
bool HasLargeArgumentList =
HasFP && (MaxFixedOffset - MaxFPOffset) > (int)EstimatedRSFixedSizeLimit;
@@ -2045,8 +2072,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
// of GPRs, spill one extra callee save GPR so we won't have to pad between
// the integer and double callee save areas.
LLVM_DEBUG(dbgs() << "NumGPRSpills = " << NumGPRSpills << "\n");
- unsigned TargetAlign = getStackAlignment();
- if (TargetAlign >= 8 && (NumGPRSpills & 1)) {
+ const Align TargetAlign = getStackAlign();
+ if (TargetAlign >= Align(8) && (NumGPRSpills & 1)) {
if (CS1Spilled && !UnspilledCS1GPRs.empty()) {
for (unsigned i = 0, e = UnspilledCS1GPRs.size(); i != e; ++i) {
unsigned Reg = UnspilledCS1GPRs[i];
@@ -2083,7 +2110,7 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
if (BigFrameOffsets && !ExtraCSSpill) {
// If any non-reserved CS register isn't spilled, just spill one or two
// extra. That should take care of it!
- unsigned NumExtras = TargetAlign / 4;
+ unsigned NumExtras = TargetAlign.value() / 4;
SmallVector<unsigned, 2> Extras;
while (NumExtras && !UnspilledCS1GPRs.empty()) {
unsigned Reg = UnspilledCS1GPRs.back();
@@ -2117,16 +2144,15 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
LLVM_DEBUG(dbgs() << "Reserving emergency spill slot\n");
const TargetRegisterClass &RC = ARM::GPRRegClass;
unsigned Size = TRI->getSpillSize(RC);
- unsigned Align = TRI->getSpillAlignment(RC);
- RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
+ Align Alignment = TRI->getSpillAlign(RC);
+ RS->addScavengingFrameIndex(
+ MFI.CreateStackObject(Size, Alignment, false));
}
}
}
- if (ForceLRSpill) {
+ if (ForceLRSpill)
SavedRegs.set(ARM::LR);
- AFI->setLRIsSpilledForFarJump(true);
- }
AFI->setLRIsSpilled(SavedRegs.test(ARM::LR));
}
@@ -2142,6 +2168,27 @@ void ARMFrameLowering::getCalleeSaves(const MachineFunction &MF,
SavedRegs.set(ARM::R0);
}
+bool ARMFrameLowering::assignCalleeSavedSpillSlots(
+ MachineFunction &MF, const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const {
+ // For CMSE entry functions, handle floating-point context as if it was a
+ // callee-saved register.
+ if (STI.hasV8_1MMainlineOps() &&
+ MF.getInfo<ARMFunctionInfo>()->isCmseNSEntryFunction()) {
+ CSI.emplace_back(ARM::FPCXTNS);
+ CSI.back().setRestored(false);
+ }
+
+ return false;
+}
+
+const TargetFrameLowering::SpillSlot *
+ARMFrameLowering::getCalleeSavedSpillSlots(unsigned &NumEntries) const {
+ static const SpillSlot FixedSpillOffsets[] = {{ARM::FPCXTNS, -4}};
+ NumEntries = array_lengthof(FixedSpillOffsets);
+ return FixedSpillOffsets;
+}
+
MachineBasicBlock::iterator ARMFrameLowering::eliminateCallFramePseudoInstr(
MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const {
@@ -2364,8 +2411,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Emit the relevant DWARF information about the change in stack pointer as
// well as where to find both r4 and r5 (the callee-save registers)
- CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -8));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 8));
BuildMI(PrevStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
@@ -2409,7 +2455,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
ARMConstantPoolValue *NewCPV = ARMConstantPoolSymbol::Create(
MF.getFunction().getContext(), "__STACK_LIMIT", PCLabelId, 0);
MachineConstantPool *MCP = MF.getConstantPool();
- unsigned CPI = MCP->getConstantPoolIndex(NewCPV, 4);
+ unsigned CPI = MCP->getConstantPoolIndex(NewCPV, Align(4));
// ldr SR0, [pc, offset(STACK_LIMIT)]
BuildMI(GetMBB, DL, TII.get(ARM::tLDRpci), ScratchReg0)
@@ -2507,8 +2553,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
// Emit the DWARF info about the change in stack as well as where to find the
// previous link register
- CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -12));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 12));
BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
@@ -2570,7 +2615,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
}
// Update the CFA offset now that we've popped
- CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0));
BuildMI(AllocMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
@@ -2594,7 +2639,7 @@ void ARMFrameLowering::adjustForSegmentedStacks(
}
// Update the CFA offset now that we've popped
- CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, 0));
+ CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 0));
BuildMI(PostStackMBB, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
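
Several ARMFrameLowering.cpp hunks above switch from MCCFIInstruction::createDefCfaOffset to cfiDefCfaOffset, which also flips the sign convention: the new factory takes the positive offset of the CFA from SP, so createDefCfaOffset(nullptr, -8) becomes cfiDefCfaOffset(nullptr, 8). A small sketch, assuming the MCDwarf.h factory signature of this period:

    #include "llvm/MC/MCDwarf.h"

    using namespace llvm;

    // Record "the CFA is 8 bytes above SP" with the positive-offset factory;
    // the equivalent pre-patch call was createDefCfaOffset(nullptr, -8).
    static MCCFIInstruction cfaEightAboveSP() {
      return MCCFIInstruction::cfiDefCfaOffset(/*L=*/nullptr, /*Offset=*/8);
    }
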
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h
index 0462b01af707..4c2c07d64f57 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.h
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.h
@@ -9,9 +9,7 @@
#ifndef LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
#define LLVM_LIB_TARGET_ARM_ARMFRAMELOWERING_H
-#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/TargetFrameLowering.h"
-#include <vector>
namespace llvm {
@@ -33,13 +31,14 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool keepFramePointer(const MachineFunction &MF) const override;
@@ -49,9 +48,9 @@ public:
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
int ResolveFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg, int SPAdj) const;
+ Register &FrameReg, int SPAdj) const;
void getCalleeSaves(const MachineFunction &MF,
BitVector &SavedRegs) const override;
@@ -62,25 +61,31 @@ public:
MachineBasicBlock &MBB) const override;
/// Returns true if the target will correctly handle shrink wrapping.
- bool enableShrinkWrapping(const MachineFunction &MF) const override {
- return true;
- }
+ bool enableShrinkWrapping(const MachineFunction &MF) const override;
+
bool isProfitableForNoCSROpt(const Function &F) const override {
// The no-CSR optimisation is bad for code size on ARM, because we can save
// many registers with a single PUSH/POP pair.
return false;
}
+ bool
+ assignCalleeSavedSpillSlots(MachineFunction &MF,
+ const TargetRegisterInfo *TRI,
+ std::vector<CalleeSavedInfo> &CSI) const override;
+
+ const SpillSlot *
+ getCalleeSavedSpillSlots(unsigned &NumEntries) const override;
+
private:
void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI, unsigned StmOpc,
- unsigned StrOpc, bool NoGap,
- bool(*Func)(unsigned, bool), unsigned NumAlignedDPRCS2Regs,
- unsigned MIFlags = 0) const;
+ ArrayRef<CalleeSavedInfo> CSI, unsigned StmOpc,
+ unsigned StrOpc, bool NoGap, bool (*Func)(unsigned, bool),
+ unsigned NumAlignedDPRCS2Regs, unsigned MIFlags = 0) const;
void emitPopInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI, unsigned LdmOpc,
+ MutableArrayRef<CalleeSavedInfo> CSI, unsigned LdmOpc,
unsigned LdrOpc, bool isVarArg, bool NoGap,
- bool(*Func)(unsigned, bool),
+ bool (*Func)(unsigned, bool),
unsigned NumAlignedDPRCS2Regs) const;
MachineBasicBlock::iterator
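
The header above now takes callee-saved info as ArrayRef (read-only) or MutableArrayRef rather than std::vector references. A tiny illustration of the idea, with int standing in for CalleeSavedInfo (nothing below is from the patch itself):

    #include "llvm/ADT/ArrayRef.h"
    #include <vector>

    // An ArrayRef parameter is a non-owning view: callers keep their
    // std::vector, and the callee no longer names the container type.
    static size_t numSpillSlots(llvm::ArrayRef<int> CSI) { return CSI.size(); }

    static size_t demo() {
      std::vector<int> CSI = {1, 2, 3};
      return numSpillSlots(CSI); // implicit, copy-free conversion to ArrayRef
    }
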
diff --git a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
index 76a9ac12062d..2a9a31dab74f 100644
--- a/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
+++ b/llvm/lib/Target/ARM/ARMISelDAGToDAG.cpp
@@ -239,6 +239,10 @@ private:
void SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
uint16_t OpcodeWithNoCarry, bool Add, bool Predicated);
+ /// SelectMVE_VSHLC - Select MVE intrinsics for a shift that carries between
+ /// vector lanes.
+ void SelectMVE_VSHLC(SDNode *N, bool Predicated);
+
/// Select long MVE vector reductions with two vector operands
/// Stride is the number of vector element widths the instruction can operate
/// on:
@@ -266,7 +270,21 @@ private:
/// pointer points to a set of NumVecs sub-opcodes used for the
/// different stages (e.g. VLD20 versus VLD21) of each load family.
void SelectMVE_VLD(SDNode *N, unsigned NumVecs,
- const uint16_t *const *Opcodes);
+ const uint16_t *const *Opcodes, bool HasWriteback);
+
+ /// SelectMVE_VxDUP - Select MVE incrementing-dup instructions. Opcodes is an
+ /// array of 3 elements for the 8, 16 and 32-bit lane sizes.
+ void SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
+ bool Wrapping, bool Predicated);
+
+ /// SelectCDE_CXxD - Select CDE dual-GPR instruction (one of CX1D,
+ /// CX1DA, CX2D, CX2DA, CX3D, CX3DA).
+ /// \arg \c NumExtraOps number of extra operands besides the coprocessor,
+ /// the accumulator and the immediate operand, i.e. 0
+ /// for CX1*, 1 for CX2*, 2 for CX3*
+ /// \arg \c HasAccum whether the instruction has an accumulator operand
+ void SelectCDE_CXxD(SDNode *N, uint16_t Opcode, size_t NumExtraOps,
+ bool HasAccum);
/// SelectVLDDup - Select NEON load-duplicate intrinsics. NumVecs
/// should be 1, 2, 3 or 4. The opcode array specifies the instructions used
@@ -1173,8 +1191,8 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
// Only multiples of 4 are allowed for the offset, so the frame object
// alignment must be at least 4.
MachineFrameInfo &MFI = MF->getFrameInfo();
- if (MFI.getObjectAlignment(FI) < 4)
- MFI.setObjectAlignment(FI, 4);
+ if (MFI.getObjectAlign(FI) < Align(4))
+ MFI.setObjectAlignment(FI, Align(4));
Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(0, SDLoc(N), MVT::i32);
@@ -1197,9 +1215,9 @@ bool ARMDAGToDAGISel::SelectThumbAddrModeSP(SDValue N,
if (RHSC * 4 < MFI.getObjectSize(FI)) {
// For LHS+RHS to result in an offset that's a multiple of 4 the object
// indexed by the LHS must be 4-byte aligned.
- if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlignment(FI) < 4)
- MFI.setObjectAlignment(FI, 4);
- if (MFI.getObjectAlignment(FI) >= 4) {
+ if (!MFI.isFixedObjectIndex(FI) && MFI.getObjectAlign(FI) < Align(4))
+ MFI.setObjectAlignment(FI, Align(4));
+ if (MFI.getObjectAlign(FI) >= Align(4)) {
Base = CurDAG->getTargetFrameIndex(
FI, TLI->getPointerTy(CurDAG->getDataLayout()));
OffImm = CurDAG->getTargetConstant(RHSC, SDLoc(N), MVT::i32);
@@ -1708,7 +1726,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
EVT LoadedVT;
unsigned Opcode = 0;
bool isSExtLd, isPre;
- unsigned Align;
+ Align Alignment;
ARMVCC::VPTCodes Pred;
SDValue PredReg;
SDValue Chain, Base, Offset;
@@ -1724,7 +1742,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
Chain = LD->getChain();
Base = LD->getBasePtr();
Offset = LD->getOffset();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
Pred = ARMVCC::None;
@@ -1740,7 +1758,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
Chain = LD->getChain();
Base = LD->getBasePtr();
Offset = LD->getOffset();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSExtLd = LD->getExtensionType() == ISD::SEXTLOAD;
isPre = (AM == ISD::PRE_INC) || (AM == ISD::PRE_DEC);
Pred = ARMVCC::Then;
@@ -1754,7 +1772,7 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
bool CanChangeType = Subtarget->isLittle() && !isa<MaskedLoadSDNode>(N);
SDValue NewOffset;
- if (Align >= 2 && LoadedVT == MVT::v4i16 &&
+ if (Alignment >= Align(2) && LoadedVT == MVT::v4i16 &&
SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1)) {
if (isSExtLd)
Opcode = isPre ? ARM::MVE_VLDRHS32_pre : ARM::MVE_VLDRHS32_post;
@@ -1772,12 +1790,12 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
Opcode = isPre ? ARM::MVE_VLDRBS32_pre : ARM::MVE_VLDRBS32_post;
else
Opcode = isPre ? ARM::MVE_VLDRBU32_pre : ARM::MVE_VLDRBU32_post;
- } else if (Align >= 4 &&
+ } else if (Alignment >= Align(4) &&
(CanChangeType || LoadedVT == MVT::v4i32 ||
LoadedVT == MVT::v4f32) &&
SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 2))
Opcode = isPre ? ARM::MVE_VLDRWU32_pre : ARM::MVE_VLDRWU32_post;
- else if (Align >= 2 &&
+ else if (Alignment >= Align(2) &&
(CanChangeType || LoadedVT == MVT::v8i16 ||
LoadedVT == MVT::v8f16) &&
SelectT2AddrModeImm7Offset(N, Offset, NewOffset, 1))
@@ -1791,8 +1809,8 @@ bool ARMDAGToDAGISel::tryMVEIndexedLoad(SDNode *N) {
SDValue Ops[] = {Base, NewOffset,
CurDAG->getTargetConstant(Pred, SDLoc(N), MVT::i32), PredReg,
Chain};
- SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), N->getValueType(0),
- MVT::i32, MVT::Other, Ops);
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), MVT::i32,
+ N->getValueType(0), MVT::Other, Ops);
transferMemOperands(N, New);
ReplaceUses(SDValue(N, 0), SDValue(New, 1));
ReplaceUses(SDValue(N, 1), SDValue(New, 0));
@@ -2038,6 +2056,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >= 1 && NumVecs <= 4 && "VLD NumVecs out-of-range");
SDLoc dl(N);
@@ -2059,6 +2078,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
@@ -2066,6 +2086,7 @@ void ARMDAGToDAGISel::SelectVLD(SDNode *N, bool isUpdating, unsigned NumVecs,
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
@@ -2177,6 +2198,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >= 1 && NumVecs <= 4 && "VST NumVecs out-of-range");
SDLoc dl(N);
@@ -2201,6 +2223,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
@@ -2208,6 +2231,7 @@ void ARMDAGToDAGISel::SelectVST(SDNode *N, bool isUpdating, unsigned NumVecs,
// Quad-register operations:
case MVT::v16i8: OpcodeIndex = 0; break;
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v8i16: OpcodeIndex = 1; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 2; break;
@@ -2328,6 +2352,7 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
unsigned NumVecs,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >=2 && NumVecs <= 4 && "VLDSTLane NumVecs out-of-range");
SDLoc dl(N);
@@ -2368,11 +2393,13 @@ void ARMDAGToDAGISel::SelectVLDSTLane(SDNode *N, bool IsLoad, bool isUpdating,
// Double-register operations:
case MVT::v8i8: OpcodeIndex = 0; break;
case MVT::v4f16:
+ case MVT::v4bf16:
case MVT::v4i16: OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32: OpcodeIndex = 2; break;
// Quad-register operations:
case MVT::v8f16:
+ case MVT::v8bf16:
case MVT::v8i16: OpcodeIndex = 0; break;
case MVT::v4f32:
case MVT::v4i32: OpcodeIndex = 1; break;
@@ -2511,7 +2538,16 @@ void ARMDAGToDAGISel::SelectMVE_WB(SDNode *N, const uint16_t *Opcodes,
Ops.push_back(N->getOperand(0)); // chain
- CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+ SmallVector<EVT, 8> VTs;
+ VTs.push_back(N->getValueType(1));
+ VTs.push_back(N->getValueType(0));
+ VTs.push_back(N->getValueType(2));
+
+ SDNode *New = CurDAG->getMachineNode(Opcode, SDLoc(N), VTs, Ops);
+ ReplaceUses(SDValue(N, 0), SDValue(New, 1));
+ ReplaceUses(SDValue(N, 1), SDValue(New, 0));
+ ReplaceUses(SDValue(N, 2), SDValue(New, 2));
+ CurDAG->RemoveDeadNode(N);
}
void ARMDAGToDAGISel::SelectMVE_LongShift(SDNode *N, uint16_t Opcode,
@@ -2581,6 +2617,25 @@ void ARMDAGToDAGISel::SelectMVE_VADCSBC(SDNode *N, uint16_t OpcodeWithCarry,
CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
}
+void ARMDAGToDAGISel::SelectMVE_VSHLC(SDNode *N, bool Predicated) {
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ // One vector input, followed by a 32-bit word of bits to shift in
+ // and then an immediate shift count
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(2));
+ int32_t ImmValue = cast<ConstantSDNode>(N->getOperand(3))->getZExtValue();
+ Ops.push_back(getI32Imm(ImmValue, Loc)); // immediate shift count
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(4));
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc);
+
+ CurDAG->SelectNodeTo(N, ARM::MVE_VSHLC, N->getVTList(), makeArrayRef(Ops));
+}
+
static bool SDValueToConstBool(SDValue SDVal) {
assert(isa<ConstantSDNode>(SDVal) && "expected a compile-time constant");
ConstantSDNode *SDValConstant = dyn_cast<ConstantSDNode>(SDVal);
@@ -2673,7 +2728,8 @@ void ARMDAGToDAGISel::SelectMVE_VRMLLDAVH(SDNode *N, bool Predicated,
}
void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
- const uint16_t *const *Opcodes) {
+ const uint16_t *const *Opcodes,
+ bool HasWriteback) {
EVT VT = N->getValueType(0);
SDLoc Loc(N);
@@ -2693,23 +2749,141 @@ void ARMDAGToDAGISel::SelectMVE_VLD(SDNode *N, unsigned NumVecs,
}
EVT DataTy = EVT::getVectorVT(*CurDAG->getContext(), MVT::i64, NumVecs * 2);
- EVT ResultTys[] = {DataTy, MVT::Other};
+ SmallVector<EVT, 4> ResultTys = {DataTy, MVT::Other};
+ unsigned PtrOperand = HasWriteback ? 1 : 2;
auto Data = SDValue(
CurDAG->getMachineNode(TargetOpcode::IMPLICIT_DEF, Loc, DataTy), 0);
SDValue Chain = N->getOperand(0);
- for (unsigned Stage = 0; Stage < NumVecs; ++Stage) {
- SDValue Ops[] = {Data, N->getOperand(2), Chain};
+ // Add an MVE_VLDn instruction for each Vec, except the last
+ for (unsigned Stage = 0; Stage < NumVecs - 1; ++Stage) {
+ SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
auto LoadInst =
CurDAG->getMachineNode(OurOpcodes[Stage], Loc, ResultTys, Ops);
Data = SDValue(LoadInst, 0);
Chain = SDValue(LoadInst, 1);
}
+ // The last may need a writeback on it
+ if (HasWriteback)
+ ResultTys = {DataTy, MVT::i32, MVT::Other};
+ SDValue Ops[] = {Data, N->getOperand(PtrOperand), Chain};
+ auto LoadInst =
+ CurDAG->getMachineNode(OurOpcodes[NumVecs - 1], Loc, ResultTys, Ops);
- for (unsigned i = 0; i < NumVecs; i++)
+ unsigned i;
+ for (i = 0; i < NumVecs; i++)
ReplaceUses(SDValue(N, i),
- CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT, Data));
- ReplaceUses(SDValue(N, NumVecs), Chain);
+ CurDAG->getTargetExtractSubreg(ARM::qsub_0 + i, Loc, VT,
+ SDValue(LoadInst, 0)));
+ if (HasWriteback)
+ ReplaceUses(SDValue(N, i++), SDValue(LoadInst, 1));
+ ReplaceUses(SDValue(N, i), SDValue(LoadInst, HasWriteback ? 2 : 1));
+ CurDAG->RemoveDeadNode(N);
+}
+
+void ARMDAGToDAGISel::SelectMVE_VxDUP(SDNode *N, const uint16_t *Opcodes,
+ bool Wrapping, bool Predicated) {
+ EVT VT = N->getValueType(0);
+ SDLoc Loc(N);
+
+ uint16_t Opcode;
+ switch (VT.getScalarSizeInBits()) {
+ case 8:
+ Opcode = Opcodes[0];
+ break;
+ case 16:
+ Opcode = Opcodes[1];
+ break;
+ case 32:
+ Opcode = Opcodes[2];
+ break;
+ default:
+ llvm_unreachable("bad vector element size in SelectMVE_VxDUP");
+ }
+
+ SmallVector<SDValue, 8> Ops;
+ unsigned OpIdx = 1;
+
+ SDValue Inactive;
+ if (Predicated)
+ Inactive = N->getOperand(OpIdx++);
+
+ Ops.push_back(N->getOperand(OpIdx++)); // base
+ if (Wrapping)
+ Ops.push_back(N->getOperand(OpIdx++)); // limit
+
+ SDValue ImmOp = N->getOperand(OpIdx++); // step
+ int ImmValue = cast<ConstantSDNode>(ImmOp)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmValue, Loc));
+
+ if (Predicated)
+ AddMVEPredicateToOps(Ops, Loc, N->getOperand(OpIdx), Inactive);
+ else
+ AddEmptyMVEPredicateToOps(Ops, Loc, N->getValueType(0));
+
+ CurDAG->SelectNodeTo(N, Opcode, N->getVTList(), makeArrayRef(Ops));
+}
+
+void ARMDAGToDAGISel::SelectCDE_CXxD(SDNode *N, uint16_t Opcode,
+ size_t NumExtraOps, bool HasAccum) {
+ bool IsBigEndian = CurDAG->getDataLayout().isBigEndian();
+ SDLoc Loc(N);
+ SmallVector<SDValue, 8> Ops;
+
+ unsigned OpIdx = 1;
+
+ // Convert and append the immediate operand designating the coprocessor.
+ SDValue ImmCorpoc = N->getOperand(OpIdx++);
+ uint32_t ImmCoprocVal = cast<ConstantSDNode>(ImmCorpoc)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmCoprocVal, Loc));
+
+ // For accumulating variants copy the low and high order parts of the
+ // accumulator into a register pair and add it to the operand vector.
+ if (HasAccum) {
+ SDValue AccLo = N->getOperand(OpIdx++);
+ SDValue AccHi = N->getOperand(OpIdx++);
+ if (IsBigEndian)
+ std::swap(AccLo, AccHi);
+ Ops.push_back(SDValue(createGPRPairNode(MVT::Untyped, AccLo, AccHi), 0));
+ }
+
+ // Copy extra operands as-is.
+ for (size_t I = 0; I < NumExtraOps; I++)
+ Ops.push_back(N->getOperand(OpIdx++));
+
+ // Convert and append the immediate operand
+ SDValue Imm = N->getOperand(OpIdx);
+ uint32_t ImmVal = cast<ConstantSDNode>(Imm)->getZExtValue();
+ Ops.push_back(getI32Imm(ImmVal, Loc));
+
+ // Accumulating variants are IT-predicable, add predicate operands.
+ if (HasAccum) {
+ SDValue Pred = getAL(CurDAG, Loc);
+ SDValue PredReg = CurDAG->getRegister(0, MVT::i32);
+ Ops.push_back(Pred);
+ Ops.push_back(PredReg);
+ }
+
+ // Create the CDE instruction
+ SDNode *InstrNode = CurDAG->getMachineNode(Opcode, Loc, MVT::Untyped, Ops);
+ SDValue ResultPair = SDValue(InstrNode, 0);
+
+ // The original intrinsic had two outputs, and the output of the dual-register
+ // CDE instruction is a register pair. We need to extract the two subregisters
+ // and replace all uses of the original outputs with the extracted
+ // subregisters.
+ uint16_t SubRegs[2] = {ARM::gsub_0, ARM::gsub_1};
+ if (IsBigEndian)
+ std::swap(SubRegs[0], SubRegs[1]);
+
+ for (size_t ResIdx = 0; ResIdx < 2; ResIdx++) {
+ if (SDValue(N, ResIdx).use_empty())
+ continue;
+ SDValue SubReg = CurDAG->getTargetExtractSubreg(SubRegs[ResIdx], Loc,
+ MVT::i32, ResultPair);
+ ReplaceUses(SDValue(N, ResIdx), SubReg);
+ }
+
CurDAG->RemoveDeadNode(N);
}
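
The std::swap calls in SelectCDE_CXxD above follow the usual GPR-pair rule: on big-endian targets the low and high 32-bit halves trade places, both for the accumulator inputs and for the gsub_0/gsub_1 results. A stand-alone sketch of that convention (the helper and its types are purely illustrative):

    #include <cstdint>
    #include <utility>

    // Split a 64-bit value into (lo, hi) register-pair halves, swapping them
    // on big-endian targets, mirroring the AccLo/AccHi and SubRegs swaps above.
    static std::pair<uint32_t, uint32_t> splitGPRPair(uint64_t V,
                                                      bool IsBigEndian) {
      uint32_t Lo = static_cast<uint32_t>(V);
      uint32_t Hi = static_cast<uint32_t>(V >> 32);
      if (IsBigEndian)
        std::swap(Lo, Hi);
      return {Lo, Hi};
    }
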
@@ -2718,6 +2892,7 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
const uint16_t *DOpcodes,
const uint16_t *QOpcodes0,
const uint16_t *QOpcodes1) {
+ assert(Subtarget->hasNEON());
assert(NumVecs >= 1 && NumVecs <= 4 && "VLDDup NumVecs out-of-range");
SDLoc dl(N);
@@ -2754,6 +2929,8 @@ void ARMDAGToDAGISel::SelectVLDDup(SDNode *N, bool IsIntrinsic,
case MVT::v8i16:
case MVT::v4f16:
case MVT::v8f16:
+ case MVT::v4bf16:
+ case MVT::v8bf16:
OpcodeIndex = 1; break;
case MVT::v2f32:
case MVT::v2i32:
@@ -3231,7 +3408,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
MachineFunction& MF = CurDAG->getMachineFunction();
MachineMemOperand *MemOp =
MF.getMachineMemOperand(MachinePointerInfo::getConstantPool(MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
CurDAG->setNodeMemRefs(cast<MachineSDNode>(ResNode), {MemOp});
@@ -3251,8 +3428,8 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
// Set the alignment of the frame object to 4, to avoid having to generate
// more than one ADD
MachineFrameInfo &MFI = MF->getFrameInfo();
- if (MFI.getObjectAlignment(FI) < 4)
- MFI.setObjectAlignment(FI, 4);
+ if (MFI.getObjectAlign(FI) < Align(4))
+ MFI.setObjectAlignment(FI, Align(4));
CurDAG->SelectNodeTo(N, ARM::tADDframe, MVT::i32, TFI,
CurDAG->getTargetConstant(0, dl, MVT::i32));
return;
@@ -3522,6 +3699,14 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
const SDValue &Chain = N->getOperand(0);
const SDValue &Addr = N->getOperand(1);
SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+ if (RegOffset != CurDAG->getRegister(0, MVT::i32)) {
+ // The register-offset variant of LDRD mandates that the register
+ // allocated to RegOffset is not reused in any of the remaining operands.
+ // This restriction is currently not enforced. Therefore emitting this
+ // variant is explicitly avoided.
+ Base = Addr;
+ RegOffset = CurDAG->getRegister(0, MVT::i32);
+ }
SDValue Ops[] = {Base, RegOffset, ImmOffset, Chain};
SDNode *New = CurDAG->getMachineNode(ARM::LOADDUAL, dl,
{MVT::Untyped, MVT::Other}, Ops);
@@ -3529,12 +3714,37 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
SDValue(New, 0));
SDValue Hi = CurDAG->getTargetExtractSubreg(ARM::gsub_1, dl, MVT::i32,
SDValue(New, 0));
+ transferMemOperands(N, New);
ReplaceUses(SDValue(N, 0), Lo);
ReplaceUses(SDValue(N, 1), Hi);
ReplaceUses(SDValue(N, 2), SDValue(New, 1));
CurDAG->RemoveDeadNode(N);
return;
}
+ case ARMISD::STRD: {
+ if (Subtarget->isThumb2())
+ break; // TableGen handles isel in this case.
+ SDValue Base, RegOffset, ImmOffset;
+ const SDValue &Chain = N->getOperand(0);
+ const SDValue &Addr = N->getOperand(3);
+ SelectAddrMode3(Addr, Base, RegOffset, ImmOffset);
+ if (RegOffset != CurDAG->getRegister(0, MVT::i32)) {
+ // The register-offset variant of STRD mandates that the register
+ // allocated to RegOffset is not reused in any of the remaining operands.
+ // This restriction is currently not enforced. Therefore emitting this
+ // variant is explicitly avoided.
+ Base = Addr;
+ RegOffset = CurDAG->getRegister(0, MVT::i32);
+ }
+ SDNode *RegPair =
+ createGPRPairNode(MVT::Untyped, N->getOperand(1), N->getOperand(2));
+ SDValue Ops[] = {SDValue(RegPair, 0), Base, RegOffset, ImmOffset, Chain};
+ SDNode *New = CurDAG->getMachineNode(ARM::STOREDUAL, dl, MVT::Other, Ops);
+ transferMemOperands(N, New);
+ ReplaceUses(SDValue(N, 0), SDValue(New, 0));
+ CurDAG->RemoveDeadNode(N);
+ return;
+ }
case ARMISD::LOOP_DEC: {
SDValue Ops[] = { N->getOperand(1),
N->getOperand(2),
@@ -3877,14 +4087,24 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VLD2_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VLD2d8wb_fixed,
- ARM::VLD2d16wb_fixed,
- ARM::VLD2d32wb_fixed,
- ARM::VLD1q64wb_fixed};
- static const uint16_t QOpcodes[] = { ARM::VLD2q8PseudoWB_fixed,
- ARM::VLD2q16PseudoWB_fixed,
- ARM::VLD2q32PseudoWB_fixed };
- SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD2d8wb_fixed, ARM::VLD2d16wb_fixed, ARM::VLD2d32wb_fixed,
+ ARM::VLD1q64wb_fixed};
+ static const uint16_t QOpcodes[] = {ARM::VLD2q8PseudoWB_fixed,
+ ARM::VLD2q16PseudoWB_fixed,
+ ARM::VLD2q32PseudoWB_fixed};
+ SelectVLD(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ } else {
+ static const uint16_t Opcodes8[] = {ARM::MVE_VLD20_8,
+ ARM::MVE_VLD21_8_wb};
+ static const uint16_t Opcodes16[] = {ARM::MVE_VLD20_16,
+ ARM::MVE_VLD21_16_wb};
+ static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
+ ARM::MVE_VLD21_32_wb};
+ static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+ SelectMVE_VLD(N, 2, Opcodes, true);
+ }
return;
}
@@ -3904,17 +4124,30 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VLD4_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VLD4d8Pseudo_UPD,
- ARM::VLD4d16Pseudo_UPD,
- ARM::VLD4d32Pseudo_UPD,
- ARM::VLD1d64QPseudoWB_fixed};
- static const uint16_t QOpcodes0[] = { ARM::VLD4q8Pseudo_UPD,
- ARM::VLD4q16Pseudo_UPD,
- ARM::VLD4q32Pseudo_UPD };
- static const uint16_t QOpcodes1[] = { ARM::VLD4q8oddPseudo_UPD,
- ARM::VLD4q16oddPseudo_UPD,
- ARM::VLD4q32oddPseudo_UPD };
- SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VLD4d8Pseudo_UPD, ARM::VLD4d16Pseudo_UPD, ARM::VLD4d32Pseudo_UPD,
+ ARM::VLD1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = {ARM::VLD4q8Pseudo_UPD,
+ ARM::VLD4q16Pseudo_UPD,
+ ARM::VLD4q32Pseudo_UPD};
+ static const uint16_t QOpcodes1[] = {ARM::VLD4q8oddPseudo_UPD,
+ ARM::VLD4q16oddPseudo_UPD,
+ ARM::VLD4q32oddPseudo_UPD};
+ SelectVLD(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ } else {
+ static const uint16_t Opcodes8[] = {ARM::MVE_VLD40_8, ARM::MVE_VLD41_8,
+ ARM::MVE_VLD42_8,
+ ARM::MVE_VLD43_8_wb};
+ static const uint16_t Opcodes16[] = {ARM::MVE_VLD40_16, ARM::MVE_VLD41_16,
+ ARM::MVE_VLD42_16,
+ ARM::MVE_VLD43_16_wb};
+ static const uint16_t Opcodes32[] = {ARM::MVE_VLD40_32, ARM::MVE_VLD41_32,
+ ARM::MVE_VLD42_32,
+ ARM::MVE_VLD43_32_wb};
+ static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
+ SelectMVE_VLD(N, 4, Opcodes, true);
+ }
return;
}
@@ -3962,15 +4195,17 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VST2_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VST2d8wb_fixed,
- ARM::VST2d16wb_fixed,
- ARM::VST2d32wb_fixed,
- ARM::VST1q64wb_fixed};
- static const uint16_t QOpcodes[] = { ARM::VST2q8PseudoWB_fixed,
- ARM::VST2q16PseudoWB_fixed,
- ARM::VST2q32PseudoWB_fixed };
- SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
- return;
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VST2d8wb_fixed, ARM::VST2d16wb_fixed, ARM::VST2d32wb_fixed,
+ ARM::VST1q64wb_fixed};
+ static const uint16_t QOpcodes[] = {ARM::VST2q8PseudoWB_fixed,
+ ARM::VST2q16PseudoWB_fixed,
+ ARM::VST2q32PseudoWB_fixed};
+ SelectVST(N, true, 2, DOpcodes, QOpcodes, nullptr);
+ return;
+ }
+ break;
}
case ARMISD::VST3_UPD: {
@@ -3989,18 +4224,20 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
}
case ARMISD::VST4_UPD: {
- static const uint16_t DOpcodes[] = { ARM::VST4d8Pseudo_UPD,
- ARM::VST4d16Pseudo_UPD,
- ARM::VST4d32Pseudo_UPD,
- ARM::VST1d64QPseudoWB_fixed};
- static const uint16_t QOpcodes0[] = { ARM::VST4q8Pseudo_UPD,
- ARM::VST4q16Pseudo_UPD,
- ARM::VST4q32Pseudo_UPD };
- static const uint16_t QOpcodes1[] = { ARM::VST4q8oddPseudo_UPD,
- ARM::VST4q16oddPseudo_UPD,
- ARM::VST4q32oddPseudo_UPD };
- SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
- return;
+ if (Subtarget->hasNEON()) {
+ static const uint16_t DOpcodes[] = {
+ ARM::VST4d8Pseudo_UPD, ARM::VST4d16Pseudo_UPD, ARM::VST4d32Pseudo_UPD,
+ ARM::VST1d64QPseudoWB_fixed};
+ static const uint16_t QOpcodes0[] = {ARM::VST4q8Pseudo_UPD,
+ ARM::VST4q16Pseudo_UPD,
+ ARM::VST4q32Pseudo_UPD};
+ static const uint16_t QOpcodes1[] = {ARM::VST4q8oddPseudo_UPD,
+ ARM::VST4q16oddPseudo_UPD,
+ ARM::VST4q32oddPseudo_UPD};
+ SelectVST(N, true, 4, DOpcodes, QOpcodes0, QOpcodes1);
+ return;
+ }
+ break;
}
case ARMISD::VST2LN_UPD: {
@@ -4479,7 +4716,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
static const uint16_t Opcodes32[] = {ARM::MVE_VLD20_32,
ARM::MVE_VLD21_32};
static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
- SelectMVE_VLD(N, 2, Opcodes);
+ SelectMVE_VLD(N, 2, Opcodes, false);
return;
}
@@ -4493,7 +4730,7 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
ARM::MVE_VLD42_32,
ARM::MVE_VLD43_32};
static const uint16_t *const Opcodes[] = {Opcodes8, Opcodes16, Opcodes32};
- SelectMVE_VLD(N, 4, Opcodes);
+ SelectMVE_VLD(N, 4, Opcodes, false);
return;
}
}
@@ -4506,6 +4743,29 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
default:
break;
+ // Scalar f32 -> bf16
+ case Intrinsic::arm_neon_vcvtbfp2bf: {
+ SDLoc dl(N);
+ const SDValue &Src = N->getOperand(1);
+ llvm::EVT DestTy = N->getValueType(0);
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { Src, Src, Pred, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::BF16_VCVTB, DestTy, Ops);
+ return;
+ }
+
+ // Vector v4f32 -> v4bf16
+ case Intrinsic::arm_neon_vcvtfp2bf: {
+ SDLoc dl(N);
+ const SDValue &Src = N->getOperand(1);
+ SDValue Pred = getAL(CurDAG, dl);
+ SDValue Reg0 = CurDAG->getRegister(0, MVT::i32);
+ SDValue Ops[] = { Src, Pred, Reg0 };
+ CurDAG->SelectNodeTo(N, ARM::BF16_VCVT, MVT::v4bf16, Ops);
+ return;
+ }
+
case Intrinsic::arm_mve_urshrl:
SelectMVE_LongShift(N, ARM::MVE_URSHRL, true, false);
return;
@@ -4524,18 +4784,21 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
case Intrinsic::arm_mve_sqrshrl:
SelectMVE_LongShift(N, ARM::MVE_SQRSHRL, false, true);
return;
- case Intrinsic::arm_mve_lsll:
- SelectMVE_LongShift(N, ARM::MVE_LSLLr, false, false);
- return;
- case Intrinsic::arm_mve_asrl:
- SelectMVE_LongShift(N, ARM::MVE_ASRLr, false, false);
- return;
case Intrinsic::arm_mve_vadc:
case Intrinsic::arm_mve_vadc_predicated:
SelectMVE_VADCSBC(N, ARM::MVE_VADC, ARM::MVE_VADCI, true,
IntNo == Intrinsic::arm_mve_vadc_predicated);
return;
+ case Intrinsic::arm_mve_vsbc:
+ case Intrinsic::arm_mve_vsbc_predicated:
+ SelectMVE_VADCSBC(N, ARM::MVE_VSBC, ARM::MVE_VSBCI, true,
+ IntNo == Intrinsic::arm_mve_vsbc_predicated);
+ return;
+ case Intrinsic::arm_mve_vshlc:
+ case Intrinsic::arm_mve_vshlc_predicated:
+ SelectMVE_VSHLC(N, IntNo == Intrinsic::arm_mve_vshlc_predicated);
+ return;
case Intrinsic::arm_mve_vmlldava:
case Intrinsic::arm_mve_vmlldava_predicated: {
@@ -4573,6 +4836,80 @@ void ARMDAGToDAGISel::Select(SDNode *N) {
OpcodesS, OpcodesU);
return;
}
+
+ case Intrinsic::arm_mve_vidup:
+ case Intrinsic::arm_mve_vidup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VIDUPu8, ARM::MVE_VIDUPu16, ARM::MVE_VIDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, false,
+ IntNo == Intrinsic::arm_mve_vidup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vddup:
+ case Intrinsic::arm_mve_vddup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VDDUPu8, ARM::MVE_VDDUPu16, ARM::MVE_VDDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, false,
+ IntNo == Intrinsic::arm_mve_vddup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_viwdup:
+ case Intrinsic::arm_mve_viwdup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VIWDUPu8, ARM::MVE_VIWDUPu16, ARM::MVE_VIWDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, true,
+ IntNo == Intrinsic::arm_mve_viwdup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_mve_vdwdup:
+ case Intrinsic::arm_mve_vdwdup_predicated: {
+ static const uint16_t Opcodes[] = {
+ ARM::MVE_VDWDUPu8, ARM::MVE_VDWDUPu16, ARM::MVE_VDWDUPu32,
+ };
+ SelectMVE_VxDUP(N, Opcodes, true,
+ IntNo == Intrinsic::arm_mve_vdwdup_predicated);
+ return;
+ }
+
+ case Intrinsic::arm_cde_cx1d:
+ case Intrinsic::arm_cde_cx1da:
+ case Intrinsic::arm_cde_cx2d:
+ case Intrinsic::arm_cde_cx2da:
+ case Intrinsic::arm_cde_cx3d:
+ case Intrinsic::arm_cde_cx3da: {
+ bool HasAccum = IntNo == Intrinsic::arm_cde_cx1da ||
+ IntNo == Intrinsic::arm_cde_cx2da ||
+ IntNo == Intrinsic::arm_cde_cx3da;
+ size_t NumExtraOps;
+ uint16_t Opcode;
+ switch (IntNo) {
+ case Intrinsic::arm_cde_cx1d:
+ case Intrinsic::arm_cde_cx1da:
+ NumExtraOps = 0;
+ Opcode = HasAccum ? ARM::CDE_CX1DA : ARM::CDE_CX1D;
+ break;
+ case Intrinsic::arm_cde_cx2d:
+ case Intrinsic::arm_cde_cx2da:
+ NumExtraOps = 1;
+ Opcode = HasAccum ? ARM::CDE_CX2DA : ARM::CDE_CX2D;
+ break;
+ case Intrinsic::arm_cde_cx3d:
+ case Intrinsic::arm_cde_cx3da:
+ NumExtraOps = 2;
+ Opcode = HasAccum ? ARM::CDE_CX3DA : ARM::CDE_CX3D;
+ break;
+ default:
+ llvm_unreachable("Unexpected opcode");
+ }
+ SelectCDE_CXxD(N, Opcode, NumExtraOps, HasAccum);
+ return;
+ }
}
break;
}
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index cf738cd66434..287e2e60e572 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -210,6 +210,8 @@ void ARMTargetLowering::addTypeForNEON(MVT VT, MVT PromotedLdStVT,
setOperationAction(ISD::SREM, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::FREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
if (!VT.isFloatingPoint() &&
VT != MVT::v2i64 && VT != MVT::v1i64)
@@ -284,6 +286,8 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::SDIV, VT, Expand);
setOperationAction(ISD::UREM, VT, Expand);
setOperationAction(ISD::SREM, VT, Expand);
+ setOperationAction(ISD::UDIVREM, VT, Expand);
+ setOperationAction(ISD::SDIVREM, VT, Expand);
setOperationAction(ISD::CTPOP, VT, Expand);
// Vector reductions
@@ -292,6 +296,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::VECREDUCE_UMAX, VT, Legal);
setOperationAction(ISD::VECREDUCE_SMIN, VT, Legal);
setOperationAction(ISD::VECREDUCE_UMIN, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_MUL, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_AND, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_OR, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_XOR, VT, Custom);
if (!HasMVEFP) {
setOperationAction(ISD::SINT_TO_FP, VT, Expand);
@@ -341,6 +349,10 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
setOperationAction(ISD::FMINNUM, VT, Legal);
setOperationAction(ISD::FMAXNUM, VT, Legal);
setOperationAction(ISD::FROUND, VT, Legal);
+ setOperationAction(ISD::VECREDUCE_FADD, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, VT, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, VT, Custom);
// No native support for these.
setOperationAction(ISD::FDIV, VT, Expand);
@@ -358,6 +370,17 @@ void ARMTargetLowering::addMVEVectorTypes(bool HasMVEFP) {
}
}
+  // Custom-expand vector reductions on types narrower than a legal vector to
+  // prevent spurious zero elements from being added when the type is widened.
+ setOperationAction(ISD::VECREDUCE_FADD, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, MVT::v4f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FADD, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMUL, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMIN, MVT::v2f16, Custom);
+ setOperationAction(ISD::VECREDUCE_FMAX, MVT::v2f16, Custom);
+
// We 'support' these types up to bitcast/load/store level, regardless of
// MVE integer-only / float support. Only doing FP data processing on the FP
// vector types is inhibited at integer-only level.
@@ -717,13 +740,19 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
if (Subtarget->hasFullFP16()) {
addRegisterClass(MVT::f16, &ARM::HPRRegClass);
setOperationAction(ISD::BITCAST, MVT::i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::i32, Custom);
setOperationAction(ISD::BITCAST, MVT::f16, Custom);
setOperationAction(ISD::FMINNUM, MVT::f16, Legal);
setOperationAction(ISD::FMAXNUM, MVT::f16, Legal);
}
+ if (Subtarget->hasBF16()) {
+ addRegisterClass(MVT::bf16, &ARM::HPRRegClass);
+ setAllExpand(MVT::bf16);
+ if (!Subtarget->hasFullFP16())
+ setOperationAction(ISD::BITCAST, MVT::bf16, Custom);
+ }
+
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
for (MVT InnerVT : MVT::fixedlen_vector_valuetypes()) {
setTruncStoreAction(VT, InnerVT, Expand);
@@ -771,6 +800,11 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
addQRTypeForNEON(MVT::v8f16);
addDRTypeForNEON(MVT::v4f16);
}
+
+ if (Subtarget->hasBF16()) {
+ addQRTypeForNEON(MVT::v8bf16);
+ addDRTypeForNEON(MVT::v4bf16);
+ }
}
if (Subtarget->hasMVEIntegerOps() || Subtarget->hasNEON()) {
@@ -912,9 +946,6 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::v4f32, Expand);
}
- setTargetDAGCombine(ISD::INTRINSIC_VOID);
- setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
- setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
setTargetDAGCombine(ISD::SHL);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::SRA);
@@ -938,10 +969,24 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::BUILD_VECTOR);
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
+ setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::STORE);
setTargetDAGCombine(ISD::SIGN_EXTEND);
setTargetDAGCombine(ISD::ZERO_EXTEND);
setTargetDAGCombine(ISD::ANY_EXTEND);
+ setTargetDAGCombine(ISD::INTRINSIC_WO_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::VECREDUCE_ADD);
+ setTargetDAGCombine(ISD::ADD);
+ setTargetDAGCombine(ISD::BITCAST);
+ }
+ if (Subtarget->hasMVEIntegerOps()) {
+ setTargetDAGCombine(ISD::SMIN);
+ setTargetDAGCombine(ISD::UMIN);
+ setTargetDAGCombine(ISD::SMAX);
+ setTargetDAGCombine(ISD::UMAX);
+ setTargetDAGCombine(ISD::FP_EXTEND);
}
if (!Subtarget->hasFP64()) {
@@ -1356,6 +1401,14 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
}
+
+ // Strict floating-point comparisons need custom lowering.
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f16, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f32, Custom);
+ setOperationAction(ISD::STRICT_FSETCC, MVT::f64, Custom);
+ setOperationAction(ISD::STRICT_FSETCCS, MVT::f64, Custom);
}
// Use __sincos_stret if available.
@@ -1413,12 +1466,16 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
}
if (Subtarget->hasNEON()) {
- // vmin and vmax aren't available in a scalar form, so we use
- // a NEON instruction with an undef lane instead.
- setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
- setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
- setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ // vmin and vmax aren't available in a scalar form, so we can use
+ // a NEON instruction with an undef lane instead. This has a performance
+ // penalty on some cores, so we don't do this unless we have been
+ // asked to by the core tuning model.
+ if (Subtarget->useNEONForSinglePrecisionFP()) {
+ setOperationAction(ISD::FMINIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f32, Legal);
+ setOperationAction(ISD::FMINIMUM, MVT::f16, Legal);
+ setOperationAction(ISD::FMAXIMUM, MVT::f16, Legal);
+ }
setOperationAction(ISD::FMINIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMAXIMUM, MVT::v2f32, Legal);
setOperationAction(ISD::FMINIMUM, MVT::v4f32, Legal);
@@ -1446,6 +1503,9 @@ ARMTargetLowering::ARMTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::OR);
setTargetDAGCombine(ISD::XOR);
+ if (Subtarget->hasMVEIntegerOps())
+ setTargetDAGCombine(ISD::VSELECT);
+
if (Subtarget->hasV6Ops())
setTargetDAGCombine(ISD::SRL);
if (Subtarget->isThumb1Only())
@@ -1544,17 +1604,21 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::CALL: return "ARMISD::CALL";
case ARMISD::CALL_PRED: return "ARMISD::CALL_PRED";
case ARMISD::CALL_NOLINK: return "ARMISD::CALL_NOLINK";
+ case ARMISD::tSECALL: return "ARMISD::tSECALL";
case ARMISD::BRCOND: return "ARMISD::BRCOND";
case ARMISD::BR_JT: return "ARMISD::BR_JT";
case ARMISD::BR2_JT: return "ARMISD::BR2_JT";
case ARMISD::RET_FLAG: return "ARMISD::RET_FLAG";
+ case ARMISD::SERET_FLAG: return "ARMISD::SERET_FLAG";
case ARMISD::INTRET_FLAG: return "ARMISD::INTRET_FLAG";
case ARMISD::PIC_ADD: return "ARMISD::PIC_ADD";
case ARMISD::CMP: return "ARMISD::CMP";
case ARMISD::CMN: return "ARMISD::CMN";
case ARMISD::CMPZ: return "ARMISD::CMPZ";
case ARMISD::CMPFP: return "ARMISD::CMPFP";
+ case ARMISD::CMPFPE: return "ARMISD::CMPFPE";
case ARMISD::CMPFPw0: return "ARMISD::CMPFPw0";
+ case ARMISD::CMPFPEw0: return "ARMISD::CMPFPEw0";
case ARMISD::BCC_i64: return "ARMISD::BCC_i64";
case ARMISD::FMSTAT: return "ARMISD::FMSTAT";
@@ -1605,6 +1669,7 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::WIN__DBZCHK: return "ARMISD::WIN__DBZCHK";
case ARMISD::PREDICATE_CAST: return "ARMISD::PREDICATE_CAST";
+ case ARMISD::VECTOR_REG_CAST: return "ARMISD::VECTOR_REG_CAST";
case ARMISD::VCMP: return "ARMISD::VCMP";
case ARMISD::VCMPZ: return "ARMISD::VCMPZ";
case ARMISD::VTST: return "ARMISD::VTST";
@@ -1645,8 +1710,28 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const {
case ARMISD::VTBL1: return "ARMISD::VTBL1";
case ARMISD::VTBL2: return "ARMISD::VTBL2";
case ARMISD::VMOVN: return "ARMISD::VMOVN";
+ case ARMISD::VQMOVNs: return "ARMISD::VQMOVNs";
+ case ARMISD::VQMOVNu: return "ARMISD::VQMOVNu";
+ case ARMISD::VCVTN: return "ARMISD::VCVTN";
+ case ARMISD::VCVTL: return "ARMISD::VCVTL";
case ARMISD::VMULLs: return "ARMISD::VMULLs";
case ARMISD::VMULLu: return "ARMISD::VMULLu";
+ case ARMISD::VADDVs: return "ARMISD::VADDVs";
+ case ARMISD::VADDVu: return "ARMISD::VADDVu";
+ case ARMISD::VADDLVs: return "ARMISD::VADDLVs";
+ case ARMISD::VADDLVu: return "ARMISD::VADDLVu";
+ case ARMISD::VADDLVAs: return "ARMISD::VADDLVAs";
+ case ARMISD::VADDLVAu: return "ARMISD::VADDLVAu";
+ case ARMISD::VADDLVps: return "ARMISD::VADDLVps";
+ case ARMISD::VADDLVpu: return "ARMISD::VADDLVpu";
+ case ARMISD::VADDLVAps: return "ARMISD::VADDLVAps";
+ case ARMISD::VADDLVApu: return "ARMISD::VADDLVApu";
+ case ARMISD::VMLAVs: return "ARMISD::VMLAVs";
+ case ARMISD::VMLAVu: return "ARMISD::VMLAVu";
+ case ARMISD::VMLALVs: return "ARMISD::VMLALVs";
+ case ARMISD::VMLALVu: return "ARMISD::VMLALVu";
+ case ARMISD::VMLALVAs: return "ARMISD::VMLALVAs";
+ case ARMISD::VMLALVAu: return "ARMISD::VMLALVAu";
case ARMISD::UMAAL: return "ARMISD::UMAAL";
case ARMISD::UMLAL: return "ARMISD::UMLAL";
case ARMISD::SMLAL: return "ARMISD::SMLAL";
@@ -1950,6 +2035,35 @@ CCAssignFn *ARMTargetLowering::CCAssignFnForNode(CallingConv::ID CC,
}
}
+SDValue ARMTargetLowering::MoveToHPR(const SDLoc &dl, SelectionDAG &DAG,
+ MVT LocVT, MVT ValVT, SDValue Val) const {
+ Val = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocVT.getSizeInBits()),
+ Val);
+ if (Subtarget->hasFullFP16()) {
+ Val = DAG.getNode(ARMISD::VMOVhr, dl, ValVT, Val);
+ } else {
+ Val = DAG.getNode(ISD::TRUNCATE, dl,
+ MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
+ Val = DAG.getNode(ISD::BITCAST, dl, ValVT, Val);
+ }
+ return Val;
+}
+
+SDValue ARMTargetLowering::MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG,
+ MVT LocVT, MVT ValVT,
+ SDValue Val) const {
+ if (Subtarget->hasFullFP16()) {
+ Val = DAG.getNode(ARMISD::VMOVrh, dl,
+ MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
+ } else {
+ Val = DAG.getNode(ISD::BITCAST, dl,
+ MVT::getIntegerVT(ValVT.getSizeInBits()), Val);
+ Val = DAG.getNode(ISD::ZERO_EXTEND, dl,
+ MVT::getIntegerVT(LocVT.getSizeInBits()), Val);
+ }
+ return DAG.getNode(ISD::BITCAST, dl, LocVT, Val);
+}
+
/// LowerCallResult - Lower the result values of a call into the
/// appropriate copies out of appropriate physical registers.
SDValue ARMTargetLowering::LowerCallResult(
@@ -1977,7 +2091,8 @@ SDValue ARMTargetLowering::LowerCallResult(
}
SDValue Val;
- if (VA.needsCustom()) {
+ if (VA.needsCustom() &&
+ (VA.getLocVT() == MVT::f64 || VA.getLocVT() == MVT::v2f64)) {
// Handle f64 or half of a v2f64.
SDValue Lo = DAG.getCopyFromReg(Chain, dl, VA.getLocReg(), MVT::i32,
InFlag);
@@ -2026,6 +2141,13 @@ SDValue ARMTargetLowering::LowerCallResult(
break;
}
+ // f16 arguments have their size extended to 4 bytes and passed as if they
+ // had been copied to the LSBs of a 32-bit register.
+    // For that, they're passed extended to i32 (soft ABI) or to f32 (hard ABI).
+ if (VA.needsCustom() &&
+ (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
+ Val = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Val);
+
InVals.push_back(Val);
}
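
As context for the f16/bf16 handling added in this hunk: when full FP16 is not
available, the half value simply travels in the low 16 bits of its 32-bit
location, and MoveToHPR/MoveFromHPR reduce to a truncate/zero-extend plus
bitcasts. A minimal standalone sketch of that bit-level convention follows
(illustrative only; the helper names are invented for the sketch and are not
part of the patch):

#include <cstdint>
#include <cstdio>
#include <cstring>

// Model of the f16-in-32-bit argument containers: the half's raw bits occupy
// the low 16 bits of the 32-bit location; the upper bits are irrelevant and
// are dropped when moving back to a value.
static float packHalfBitsIntoF32(uint16_t HalfBits) {
  uint32_t Loc = HalfBits;          // zero-extend i16 -> i32
  float F;
  std::memcpy(&F, &Loc, sizeof(F)); // bitcast i32 -> f32 container
  return F;
}

static uint16_t unpackHalfBitsFromF32(float Container) {
  uint32_t Loc;
  std::memcpy(&Loc, &Container, sizeof(Loc)); // bitcast f32 -> i32
  return static_cast<uint16_t>(Loc);          // truncate to the f16 payload
}

int main() {
  uint16_t OneHalf = 0x3C00; // IEEE-754 binary16 encoding of 1.0
  float Container = packHalfBitsIntoF32(OneHalf);
  printf("round-trips: %s\n",
         unpackHalfBitsFromF32(Container) == OneHalf ? "yes" : "no");
}
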
@@ -2092,22 +2214,34 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isVarArg = CLI.IsVarArg;
MachineFunction &MF = DAG.getMachineFunction();
+ ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
MachineFunction::CallSiteInfo CSInfo;
bool isStructRet = (Outs.empty()) ? false : Outs[0].Flags.isSRet();
bool isThisReturn = false;
+ bool isCmseNSCall = false;
bool PreferIndirect = false;
+ // Determine whether this is a non-secure function call.
+ if (CLI.CB && CLI.CB->getAttributes().hasFnAttribute("cmse_nonsecure_call"))
+ isCmseNSCall = true;
+
// Disable tail calls if they're not supported.
if (!Subtarget->supportsTailCall())
isTailCall = false;
+ // For both the non-secure calls and the returns from a CMSE entry function,
+  // the function needs to do some extra work after the call, or before the
+  // return, respectively; thus it cannot end with a tail call.
+ if (isCmseNSCall || AFI->isCmseNSEntryFunction())
+ isTailCall = false;
+
if (isa<GlobalAddressSDNode>(Callee)) {
// If we're optimizing for minimum size and the function is called three or
// more times in this block, we can improve codesize by calling indirectly
// as BLXr has a 16-bit encoding.
auto *GV = cast<GlobalAddressSDNode>(Callee)->getGlobal();
- if (CLI.CS) {
- auto *BB = CLI.CS.getParent();
+ if (CLI.CB) {
+ auto *BB = CLI.CB->getParent();
PreferIndirect = Subtarget->isThumb() && Subtarget->hasMinSize() &&
count_if(GV->users(), [&BB](const User *U) {
return isa<Instruction>(U) &&
@@ -2121,7 +2255,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Callee, CallConv, isVarArg, isStructRet,
MF.getFunction().hasStructRetAttr(), Outs, OutVals, Ins, DAG,
PreferIndirect);
- if (!isTailCall && CLI.CS && CLI.CS.isMustTailCall())
+ if (!isTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
// We don't support GuaranteedTailCallOpt for ARM, only automatically
@@ -2182,31 +2316,50 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
break;
}
- // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
- if (VA.needsCustom()) {
- if (VA.getLocVT() == MVT::v2f64) {
- SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
- DAG.getConstant(0, dl, MVT::i32));
- SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
- DAG.getConstant(1, dl, MVT::i32));
-
- PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass,
- VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
-
- VA = ArgLocs[++i]; // skip ahead to next loc
- if (VA.isRegLoc()) {
- PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass,
- VA, ArgLocs[++i], StackPtr, MemOpChains, Flags);
- } else {
- assert(VA.isMemLoc());
+ // f16 arguments have their size extended to 4 bytes and passed as if they
+ // had been copied to the LSBs of a 32-bit register.
+    // For that, they're passed extended to i32 (soft ABI) or to f32 (hard ABI).
+ if (VA.needsCustom() &&
+ (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16)) {
+ Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
+ } else {
+ // f16 arguments could have been extended prior to argument lowering.
+      // Mask these arguments if this is a CMSE nonsecure call.
+ auto ArgVT = Outs[realArgIdx].ArgVT;
+ if (isCmseNSCall && (ArgVT == MVT::f16)) {
+ auto LocBits = VA.getLocVT().getSizeInBits();
+ auto MaskValue = APInt::getLowBitsSet(LocBits, ArgVT.getSizeInBits());
+ SDValue Mask =
+ DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
+ Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ }
+ }
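
The masking just above exists so that a CMSE non-secure call only exposes the
16 payload bits of an f16 argument; everything else in the 32-bit location is
cleared before it crosses the security boundary. A small standalone sketch of
the mask computation, using plain 32-bit arithmetic in place of
APInt::getLowBitsSet (illustrative only):

#include <cstdio>

// Equivalent of APInt::getLowBitsSet(LocBits, PayloadBits) for the widths used
// here (LocBits == 32, PayloadBits == 16): a mask covering only the bits that
// actually carry the f16 payload.
static unsigned lowBitsSet(unsigned LocBits, unsigned PayloadBits) {
  return PayloadBits >= LocBits ? ~0u : ((1u << PayloadBits) - 1u);
}

int main() {
  // A 32-bit location whose top half holds stale data and whose bottom half
  // holds the f16 payload; the top half is cleared before the call.
  unsigned Loc = 0xDEAD3C00u;
  unsigned Cleared = Loc & lowBitsSet(32, 16);
  printf("0x%08x -> 0x%08x\n", Loc, Cleared); // 0xdead3c00 -> 0x00003c00
}
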
- MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Op1,
- dl, DAG, VA, Flags));
- }
- } else {
- PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ // f64 and v2f64 might be passed in i32 pairs and must be split into pieces
+ if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
+ SDValue Op0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Op1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
+ DAG.getConstant(1, dl, MVT::i32));
+
+ PassF64ArgInRegs(dl, DAG, Chain, Op0, RegsToPass, VA, ArgLocs[++i],
+ StackPtr, MemOpChains, Flags);
+
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ if (VA.isRegLoc()) {
+ PassF64ArgInRegs(dl, DAG, Chain, Op1, RegsToPass, VA, ArgLocs[++i],
StackPtr, MemOpChains, Flags);
+ } else {
+ assert(VA.isMemLoc());
+
+ MemOpChains.push_back(
+ LowerMemOpCallTo(Chain, StackPtr, Op1, dl, DAG, VA, Flags));
}
+ } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
+ PassF64ArgInRegs(dl, DAG, Chain, Arg, RegsToPass, VA, ArgLocs[++i],
+ StackPtr, MemOpChains, Flags);
} else if (VA.isRegLoc()) {
if (realArgIdx == 0 && Flags.isReturned() && !Flags.isSwiftSelf() &&
Outs[0].VT == MVT::i32) {
@@ -2217,7 +2370,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isThisReturn = true;
}
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), i);
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
} else if (isByVal) {
@@ -2240,9 +2393,9 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (i = 0, j = RegBegin; j < RegEnd; i++, j++) {
SDValue Const = DAG.getConstant(4*i, dl, MVT::i32);
SDValue AddArg = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, Const);
- SDValue Load = DAG.getLoad(PtrVT, dl, Chain, AddArg,
- MachinePointerInfo(),
- DAG.InferPtrAlignment(AddArg));
+ SDValue Load =
+ DAG.getLoad(PtrVT, dl, Chain, AddArg, MachinePointerInfo(),
+ DAG.InferPtrAlign(AddArg));
MemOpChains.push_back(Load.getValue(1));
RegsToPass.push_back(std::make_pair(j, Load));
}
@@ -2263,8 +2416,8 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SDValue Src = DAG.getNode(ISD::ADD, dl, PtrVT, Arg, SrcOffset);
SDValue SizeNode = DAG.getConstant(Flags.getByValSize() - 4*offset, dl,
MVT::i32);
- SDValue AlignNode = DAG.getConstant(Flags.getByValAlign(), dl,
- MVT::i32);
+ SDValue AlignNode =
+ DAG.getConstant(Flags.getNonZeroByValAlign().value(), dl, MVT::i32);
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, Dst, Src, SizeNode, AlignNode};
@@ -2306,7 +2459,6 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool isARMFunc = !Subtarget->isThumb() || (isStub && !Subtarget->isMClass());
bool isLocalARMFunc = false;
- ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
auto PtrVt = getPointerTy(DAG.getDataLayout());
if (Subtarget->genLongCalls()) {
@@ -2322,7 +2474,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolConstant::Create(GV, ARMPCLabelIndex, ARMCP::CPValue, 0);
// Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
@@ -2336,7 +2488,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 0);
// Get the address of the callee into a register
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
@@ -2388,7 +2540,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ARMConstantPoolValue *CPV =
ARMConstantPoolSymbol::Create(*DAG.getContext(), Sym,
ARMPCLabelIndex, 4);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVt, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
Callee = DAG.getLoad(
PtrVt, dl, DAG.getEntryNode(), CPAddr,
@@ -2400,10 +2552,31 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
}
+ if (isCmseNSCall) {
+ assert(!isARMFunc && !isDirect &&
+ "Cannot handle call to ARM function or direct call");
+ if (NumBytes > 0) {
+ DiagnosticInfoUnsupported Diag(DAG.getMachineFunction().getFunction(),
+ "call to non-secure function would "
+ "require passing arguments on stack",
+ dl.getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+ if (isStructRet) {
+ DiagnosticInfoUnsupported Diag(
+ DAG.getMachineFunction().getFunction(),
+ "call to non-secure function would return value through pointer",
+ dl.getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+ }
+
// FIXME: handle tail calls differently.
unsigned CallOpc;
if (Subtarget->isThumb()) {
- if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
+ if (isCmseNSCall)
+ CallOpc = ARMISD::tSECALL;
+ else if ((!isDirect || isARMFunc) && !Subtarget->hasV5TOps())
CallOpc = ARMISD::CALL_NOLINK;
else
CallOpc = ARMISD::CALL;
@@ -2463,6 +2636,7 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Returns a chain and a flag for retval copy to use.
Chain = DAG.getNode(CallOpc, dl, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
InFlag = Chain.getValue(1);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
@@ -2483,15 +2657,15 @@ ARMTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
/// and then confiscate the rest of the parameter registers to ensure
/// this.
void ARMTargetLowering::HandleByVal(CCState *State, unsigned &Size,
- unsigned Align) const {
+ Align Alignment) const {
// Byval (as with any stack) slots are always at least 4 byte aligned.
- Align = std::max(Align, 4U);
+ Alignment = std::max(Alignment, Align(4));
unsigned Reg = State->AllocateReg(GPRArgRegs);
if (!Reg)
return;
- unsigned AlignInRegs = Align / 4;
+ unsigned AlignInRegs = Alignment.value() / 4;
unsigned Waste = (ARM::R4 - Reg) % AlignInRegs;
for (unsigned i = 0; i < Waste; ++i)
Reg = State->AllocateReg(GPRArgRegs);
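
To make the register-waste arithmetic above concrete, here is a standalone
sketch; r0-r3 are modelled as the indices 0-3 and r4 as index 4, which is an
assumption of the sketch rather than the real ARM::R* enum values:

#include <cstdio>

// How many argument registers are skipped so that a byval argument of the
// given alignment starts on a suitably aligned register.
static unsigned wastedRegs(unsigned FirstFreeReg, unsigned ByValAlign) {
  unsigned AlignInRegs = (ByValAlign < 4 ? 4 : ByValAlign) / 4;
  return (4 - FirstFreeReg) % AlignInRegs;
}

int main() {
  // An 8-byte-aligned byval must start in an even register pair, so if r1 is
  // the next free argument register it is skipped (one register wasted).
  printf("free=r1, align=8 -> waste=%u\n", wastedRegs(1, 8)); // 1
  printf("free=r2, align=8 -> waste=%u\n", wastedRegs(2, 8)); // 0
  printf("free=r1, align=4 -> waste=%u\n", wastedRegs(1, 4)); // 0
}
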
@@ -2630,9 +2804,11 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
// Check that the call results are passed in the same way.
LLVMContext &C = *DAG.getContext();
- if (!CCState::resultsCompatible(CalleeCC, CallerCC, MF, C, Ins,
- CCAssignFnForReturn(CalleeCC, isVarArg),
- CCAssignFnForReturn(CallerCC, isVarArg)))
+ if (!CCState::resultsCompatible(
+ getEffectiveCallingConv(CalleeCC, isVarArg),
+ getEffectiveCallingConv(CallerCC, CallerF.isVarArg()), MF, C, Ins,
+ CCAssignFnForReturn(CalleeCC, isVarArg),
+ CCAssignFnForReturn(CallerCC, CallerF.isVarArg())))
return false;
// The callee has to preserve all registers the caller needs to preserve.
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
@@ -2673,7 +2849,7 @@ bool ARMTargetLowering::IsEligibleForTailCallOptimization(
ISD::ArgFlagsTy Flags = Outs[realArgIdx].Flags;
if (VA.getLocInfo() == CCValAssign::Indirect)
return false;
- if (VA.needsCustom()) {
+ if (VA.needsCustom() && (RegVT == MVT::f64 || RegVT == MVT::v2f64)) {
// f64 and vector types are split into multiple registers or
// register/stack-slot combinations. The types will not match
// the registers; give up on memory f64 refs until we figure
@@ -2772,6 +2948,17 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>();
AFI->setReturnRegsCount(RVLocs.size());
+ // Report error if cmse entry function returns structure through first ptr arg.
+ if (AFI->isCmseNSEntryFunction() && MF.getFunction().hasStructRetAttr()) {
+ // Note: using an empty SDLoc(), as the first line of the function is a
+ // better place to report than the last line.
+ DiagnosticInfoUnsupported Diag(
+ DAG.getMachineFunction().getFunction(),
+ "secure entry function would return value through pointer",
+ SDLoc().getDebugLoc());
+ DAG.getContext()->diagnose(Diag);
+ }
+
// Copy the result values into the output registers.
for (unsigned i = 0, realRVLocIdx = 0;
i != RVLocs.size();
@@ -2814,7 +3001,24 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
break;
}
- if (VA.needsCustom()) {
+ // Mask f16 arguments if this is a CMSE nonsecure entry.
+ auto RetVT = Outs[realRVLocIdx].ArgVT;
+ if (AFI->isCmseNSEntryFunction() && (RetVT == MVT::f16)) {
+ if (VA.needsCustom() && VA.getValVT() == MVT::f16) {
+ Arg = MoveFromHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), Arg);
+ } else {
+ auto LocBits = VA.getLocVT().getSizeInBits();
+ auto MaskValue = APInt::getLowBitsSet(LocBits, RetVT.getSizeInBits());
+ SDValue Mask =
+ DAG.getConstant(MaskValue, dl, MVT::getIntegerVT(LocBits));
+ Arg = DAG.getNode(ISD::BITCAST, dl, MVT::getIntegerVT(LocBits), Arg);
+ Arg = DAG.getNode(ISD::AND, dl, MVT::getIntegerVT(LocBits), Arg, Mask);
+ Arg = DAG.getNode(ISD::BITCAST, dl, VA.getLocVT(), Arg);
+ }
+ }
+
+ if (VA.needsCustom() &&
+ (VA.getLocVT() == MVT::v2f64 || VA.getLocVT() == MVT::f64)) {
if (VA.getLocVT() == MVT::v2f64) {
// Extract the first half and return it in two registers.
SDValue Half = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64, Arg,
@@ -2822,15 +3026,15 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue HalfGPRs = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Half);
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- HalfGPRs.getValue(isLittleEndian ? 0 : 1),
- Flag);
+ Chain =
+ DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(isLittleEndian ? 0 : 1), Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
- Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- HalfGPRs.getValue(isLittleEndian ? 1 : 0),
- Flag);
+ Chain =
+ DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
+ HalfGPRs.getValue(isLittleEndian ? 1 : 0), Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
@@ -2844,22 +3048,20 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue fmrrd = DAG.getNode(ARMISD::VMOVRRD, dl,
DAG.getVTList(MVT::i32, MVT::i32), Arg);
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- fmrrd.getValue(isLittleEndian ? 0 : 1),
- Flag);
+ fmrrd.getValue(isLittleEndian ? 0 : 1), Flag);
Flag = Chain.getValue(1);
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
VA = RVLocs[++i]; // skip ahead to next loc
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(),
- fmrrd.getValue(isLittleEndian ? 1 : 0),
- Flag);
+ fmrrd.getValue(isLittleEndian ? 1 : 0), Flag);
} else
Chain = DAG.getCopyToReg(Chain, dl, VA.getLocReg(), Arg, Flag);
// Guarantee that all emitted copies are
// stuck together, so that nothing else can be scheduled between them.
Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(VA.getLocReg(),
- ReturnF16 ? MVT::f16 : VA.getLocVT()));
+ RetOps.push_back(DAG.getRegister(
+ VA.getLocReg(), ReturnF16 ? Arg.getValueType() : VA.getLocVT()));
}
const ARMBaseRegisterInfo *TRI = Subtarget->getRegisterInfo();
const MCPhysReg *I =
@@ -2893,7 +3095,9 @@ ARMTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return LowerInterruptReturn(RetOps, dl, DAG);
}
- return DAG.getNode(ARMISD::RET_FLAG, dl, MVT::Other, RetOps);
+ ARMISD::NodeType RetNode = AFI->isCmseNSEntryFunction() ? ARMISD::SERET_FLAG :
+ ARMISD::RET_FLAG;
+ return DAG.getNode(RetNode, dl, MVT::Other, RetOps);
}
bool ARMTargetLowering::isUsedByReturnOnly(SDNode *N, SDValue &Chain) const {
@@ -3035,11 +3239,10 @@ SDValue ARMTargetLowering::LowerConstantPool(SDValue Op,
}
if (CP->isMachineConstantPoolEntry())
- Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment());
+ Res =
+ DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
else
- Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment());
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign());
return DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Res);
}
@@ -3058,14 +3261,14 @@ SDValue ARMTargetLowering::LowerBlockAddress(SDValue Op,
SDValue CPAddr;
bool IsPositionIndependent = isPositionIndependent() || Subtarget->isROPI();
if (!IsPositionIndependent) {
- CPAddr = DAG.getTargetConstantPool(BA, PtrVT, 4);
+ CPAddr = DAG.getTargetConstantPool(BA, PtrVT, Align(4));
} else {
unsigned PCAdj = Subtarget->isThumb() ? 4 : 8;
ARMPCLabelIndex = AFI->createPICLabelUId();
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(BA, ARMPCLabelIndex,
ARMCP::CPBlockAddress, PCAdj);
- CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
}
CPAddr = DAG.getNode(ARMISD::Wrapper, DL, PtrVT, CPAddr);
SDValue Result = DAG.getLoad(
@@ -3194,8 +3397,9 @@ ARMTargetLowering::LowerGlobalTLSAddressWindows(SDValue Op,
const auto *GA = cast<GlobalAddressSDNode>(Op);
auto *CPV = ARMConstantPoolConstant::Create(GA->getGlobal(), ARMCP::SECREL);
SDValue Offset = DAG.getLoad(
- PtrVT, DL, Chain, DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
- DAG.getTargetConstantPool(CPV, PtrVT, 4)),
+ PtrVT, DL, Chain,
+ DAG.getNode(ARMISD::Wrapper, DL, MVT::i32,
+ DAG.getTargetConstantPool(CPV, PtrVT, Align(4))),
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
return DAG.getNode(ISD::ADD, DL, PtrVT, TLS, Offset);
@@ -3214,7 +3418,7 @@ ARMTargetLowering::LowerToTLSGeneralDynamicModel(GlobalAddressSDNode *GA,
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
ARMCP::CPValue, PCAdj, ARMCP::TLSGD, true);
- SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ SDValue Argument = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
Argument = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Argument);
Argument = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), Argument,
@@ -3265,7 +3469,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
ARMConstantPoolConstant::Create(GA->getGlobal(), ARMPCLabelIndex,
ARMCP::CPValue, PCAdj, ARMCP::GOTTPOFF,
true);
- Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
@@ -3283,7 +3487,7 @@ ARMTargetLowering::LowerToTLSExecModels(GlobalAddressSDNode *GA,
assert(model == TLSModel::LocalExec);
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GV, ARMCP::TPOFF);
- Offset = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ Offset = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
Offset = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, Offset);
Offset = DAG.getLoad(
PtrVT, dl, Chain, Offset,
@@ -3386,11 +3590,11 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
// that are strings for simplicity.
auto *CDAInit = dyn_cast<ConstantDataArray>(Init);
unsigned Size = DAG.getDataLayout().getTypeAllocSize(Init->getType());
- unsigned Align = DAG.getDataLayout().getPreferredAlignment(GVar);
+ Align PrefAlign = DAG.getDataLayout().getPreferredAlign(GVar);
unsigned RequiredPadding = 4 - (Size % 4);
bool PaddingPossible =
RequiredPadding == 4 || (CDAInit && CDAInit->isString());
- if (!PaddingPossible || Align > 4 || Size > ConstpoolPromotionMaxSize ||
+ if (!PaddingPossible || PrefAlign > 4 || Size > ConstpoolPromotionMaxSize ||
Size == 0)
return SDValue();
@@ -3429,8 +3633,7 @@ static SDValue promoteToConstantPool(const ARMTargetLowering *TLI,
}
auto CPVal = ARMConstantPoolConstant::Create(GVar, Init);
- SDValue CPAddr =
- DAG.getTargetConstantPool(CPVal, PtrVT, /*Align=*/4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPVal, PtrVT, Align(4));
if (!AFI->getGlobalsPromotedToConstantPool().count(GVar)) {
AFI->markGlobalAsPromotedToConstantPool(GVar);
AFI->setPromotedConstpoolIncrease(AFI->getPromotedConstpoolIncrease() +
@@ -3500,7 +3703,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
} else { // use literal pool for address constant
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(GV, ARMCP::SBREL);
- SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
RelAddr = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -3520,7 +3723,7 @@ SDValue ARMTargetLowering::LowerGlobalAddressELF(SDValue Op,
return DAG.getNode(ARMISD::Wrapper, dl, PtrVT,
DAG.getTargetGlobalAddress(GV, dl, PtrVT));
} else {
- SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, 4);
+ SDValue CPAddr = DAG.getTargetConstantPool(GV, PtrVT, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
return DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -3631,7 +3834,7 @@ SDValue ARMTargetLowering::LowerINTRINSIC_VOID(
unsigned Reg = MF.addLiveIn(ARM::LR, getRegClassFor(MVT::i32));
SDValue ReturnAddress =
DAG.getCopyFromReg(DAG.getEntryNode(), dl, Reg, PtrVT);
- std::vector<EVT> ResultTys = {MVT::Other, MVT::Glue};
+ constexpr EVT ResultTys[] = {MVT::Other, MVT::Glue};
SDValue Callee =
DAG.getTargetExternalSymbol("\01__gnu_mcount_nc", PtrVT, 0);
SDValue RegisterMask = DAG.getRegisterMask(Mask);
@@ -3715,7 +3918,7 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
ARMConstantPoolValue *CPV =
ARMConstantPoolConstant::Create(&MF.getFunction(), ARMPCLabelIndex,
ARMCP::CPLSDA, PCAdj);
- CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, 4);
+ CPAddr = DAG.getTargetConstantPool(CPV, PtrVT, Align(4));
CPAddr = DAG.getNode(ARMISD::Wrapper, dl, MVT::i32, CPAddr);
SDValue Result = DAG.getLoad(
PtrVT, dl, DAG.getEntryNode(), CPAddr,
@@ -3777,6 +3980,15 @@ ARMTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG,
case Intrinsic::arm_mve_pred_v2i:
return DAG.getNode(ARMISD::PREDICATE_CAST, SDLoc(Op), Op.getValueType(),
Op.getOperand(1));
+ case Intrinsic::arm_mve_vreinterpretq:
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(Op), Op.getValueType(),
+ Op.getOperand(1));
+ case Intrinsic::arm_mve_lsll:
+ return DAG.getNode(ARMISD::LSLL, SDLoc(Op), Op->getVTList(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
+ case Intrinsic::arm_mve_asrl:
+ return DAG.getNode(ARMISD::ASRL, SDLoc(Op), Op->getVTList(),
+ Op.getOperand(1), Op.getOperand(2), Op.getOperand(3));
}
}
@@ -3977,6 +4189,42 @@ void ARMTargetLowering::VarArgStyleRegisters(CCState &CCInfo, SelectionDAG &DAG,
AFI->setVarArgsFrameIndex(FrameIndex);
}
+bool ARMTargetLowering::splitValueIntoRegisterParts(
+ SelectionDAG &DAG, const SDLoc &DL, SDValue Val, SDValue *Parts,
+ unsigned NumParts, MVT PartVT, Optional<CallingConv::ID> CC) const {
+ bool IsABIRegCopy = CC.hasValue();
+ EVT ValueVT = Val.getValueType();
+ if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+ PartVT == MVT::f32) {
+ unsigned ValueBits = ValueVT.getSizeInBits();
+ unsigned PartBits = PartVT.getSizeInBits();
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(ValueBits), Val);
+ Val = DAG.getNode(ISD::ANY_EXTEND, DL, MVT::getIntegerVT(PartBits), Val);
+ Val = DAG.getNode(ISD::BITCAST, DL, PartVT, Val);
+ Parts[0] = Val;
+ return true;
+ }
+ return false;
+}
+
+SDValue ARMTargetLowering::joinRegisterPartsIntoValue(
+ SelectionDAG &DAG, const SDLoc &DL, const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT, Optional<CallingConv::ID> CC) const {
+ bool IsABIRegCopy = CC.hasValue();
+ if (IsABIRegCopy && (ValueVT == MVT::f16 || ValueVT == MVT::bf16) &&
+ PartVT == MVT::f32) {
+ unsigned ValueBits = ValueVT.getSizeInBits();
+ unsigned PartBits = PartVT.getSizeInBits();
+ SDValue Val = Parts[0];
+
+ Val = DAG.getNode(ISD::BITCAST, DL, MVT::getIntegerVT(PartBits), Val);
+ Val = DAG.getNode(ISD::TRUNCATE, DL, MVT::getIntegerVT(ValueBits), Val);
+ Val = DAG.getNode(ISD::BITCAST, DL, ValueVT, Val);
+ return Val;
+ }
+ return SDValue();
+}
+
SDValue ARMTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -4049,44 +4297,41 @@ SDValue ARMTargetLowering::LowerFormalArguments(
if (VA.isRegLoc()) {
EVT RegVT = VA.getLocVT();
- if (VA.needsCustom()) {
+ if (VA.needsCustom() && VA.getLocVT() == MVT::v2f64) {
// f64 and vector types are split up into multiple registers or
// combinations of registers and stack slots.
- if (VA.getLocVT() == MVT::v2f64) {
- SDValue ArgValue1 = GetF64FormalArgument(VA, ArgLocs[++i],
- Chain, DAG, dl);
- VA = ArgLocs[++i]; // skip ahead to next loc
- SDValue ArgValue2;
- if (VA.isMemLoc()) {
- int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
- SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
- ArgValue2 = DAG.getLoad(MVT::f64, dl, Chain, FIN,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), FI));
- } else {
- ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i],
- Chain, DAG, dl);
- }
- ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
- ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
- ArgValue, ArgValue1,
- DAG.getIntPtrConstant(0, dl));
- ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64,
- ArgValue, ArgValue2,
- DAG.getIntPtrConstant(1, dl));
- } else
- ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+ SDValue ArgValue1 =
+ GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+ VA = ArgLocs[++i]; // skip ahead to next loc
+ SDValue ArgValue2;
+ if (VA.isMemLoc()) {
+ int FI = MFI.CreateFixedObject(8, VA.getLocMemOffset(), true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ ArgValue2 = DAG.getLoad(
+ MVT::f64, dl, Chain, FIN,
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FI));
+ } else {
+ ArgValue2 = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
+ }
+ ArgValue = DAG.getNode(ISD::UNDEF, dl, MVT::v2f64);
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
+ ArgValue1, DAG.getIntPtrConstant(0, dl));
+ ArgValue = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v2f64, ArgValue,
+ ArgValue2, DAG.getIntPtrConstant(1, dl));
+ } else if (VA.needsCustom() && VA.getLocVT() == MVT::f64) {
+ ArgValue = GetF64FormalArgument(VA, ArgLocs[++i], Chain, DAG, dl);
} else {
const TargetRegisterClass *RC;
-
- if (RegVT == MVT::f16)
+ if (RegVT == MVT::f16 || RegVT == MVT::bf16)
RC = &ARM::HPRRegClass;
else if (RegVT == MVT::f32)
RC = &ARM::SPRRegClass;
- else if (RegVT == MVT::f64 || RegVT == MVT::v4f16)
+ else if (RegVT == MVT::f64 || RegVT == MVT::v4f16 ||
+ RegVT == MVT::v4bf16)
RC = &ARM::DPRRegClass;
- else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16)
+ else if (RegVT == MVT::v2f64 || RegVT == MVT::v8f16 ||
+ RegVT == MVT::v8bf16)
RC = &ARM::QPRRegClass;
else if (RegVT == MVT::i32)
RC = AFI->isThumb1OnlyFunction() ? &ARM::tGPRRegClass
@@ -4126,6 +4371,13 @@ SDValue ARMTargetLowering::LowerFormalArguments(
break;
}
+ // f16 arguments have their size extended to 4 bytes and passed as if they
+ // had been copied to the LSBs of a 32-bit register.
+      // For that, they're passed extended to i32 (soft ABI) or to f32 (hard ABI).
+ if (VA.needsCustom() &&
+ (VA.getValVT() == MVT::f16 || VA.getValVT() == MVT::bf16))
+ ArgValue = MoveToHPR(dl, DAG, VA.getLocVT(), VA.getValVT(), ArgValue);
+
InVals.push_back(ArgValue);
} else { // VA.isRegLoc()
// sanity check
@@ -4349,13 +4601,16 @@ SDValue ARMTargetLowering::getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
/// Returns a appropriate VFP CMP (fcmp{s|d}+fmstat) for the given operands.
SDValue ARMTargetLowering::getVFPCmp(SDValue LHS, SDValue RHS,
- SelectionDAG &DAG, const SDLoc &dl) const {
+ SelectionDAG &DAG, const SDLoc &dl,
+ bool Signaling) const {
assert(Subtarget->hasFP64() || RHS.getValueType() != MVT::f64);
SDValue Cmp;
if (!isFloatingPointZero(RHS))
- Cmp = DAG.getNode(ARMISD::CMPFP, dl, MVT::Glue, LHS, RHS);
+ Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPE : ARMISD::CMPFP,
+ dl, MVT::Glue, LHS, RHS);
else
- Cmp = DAG.getNode(ARMISD::CMPFPw0, dl, MVT::Glue, LHS);
+ Cmp = DAG.getNode(Signaling ? ARMISD::CMPFPEw0 : ARMISD::CMPFPw0,
+ dl, MVT::Glue, LHS);
return DAG.getNode(ARMISD::FMSTAT, dl, MVT::Glue, Cmp);
}
@@ -4541,7 +4796,7 @@ SDValue ARMTargetLowering::LowerUnsignedALUO(SDValue Op,
static SDValue LowerSADDSUBSAT(SDValue Op, SelectionDAG &DAG,
const ARMSubtarget *Subtarget) {
EVT VT = Op.getValueType();
- if (!Subtarget->hasDSP())
+ if (!Subtarget->hasV6Ops() || !Subtarget->hasDSP())
return SDValue();
if (!VT.isSimple())
return SDValue();
@@ -5413,7 +5668,12 @@ SDValue ARMTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
// FIXME: Remove this when we have strict fp instruction selection patterns
if (IsStrict) {
- DAG.mutateStrictFPToFP(Op.getNode());
+ SDLoc Loc(Op);
+ SDValue Result =
+ DAG.getNode(Op.getOpcode() == ISD::STRICT_FP_TO_SINT ? ISD::FP_TO_SINT
+ : ISD::FP_TO_UINT,
+ Loc, Op.getValueType(), SrcVal);
+ return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
}
return Op;
@@ -5696,85 +5956,27 @@ static SDValue CombineVMOVDRRCandidateWithVecOp(const SDNode *BC,
/// use a VMOVDRR or VMOVRRD node. This should not be done when the non-i64
/// operand type is illegal (e.g., v2f32 for a target that doesn't support
/// vectors), since the legalizer won't know what to do with that.
-static SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
- const ARMSubtarget *Subtarget) {
+SDValue ARMTargetLowering::ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDLoc dl(N);
SDValue Op = N->getOperand(0);
- // This function is only supposed to be called for i64 types, either as the
- // source or destination of the bit convert.
+ // This function is only supposed to be called for i16 and i64 types, either
+ // as the source or destination of the bit convert.
EVT SrcVT = Op.getValueType();
EVT DstVT = N->getValueType(0);
- const bool HasFullFP16 = Subtarget->hasFullFP16();
-
- if (SrcVT == MVT::f32 && DstVT == MVT::i32) {
- // FullFP16: half values are passed in S-registers, and we don't
- // need any of the bitcast and moves:
- //
- // t2: f32,ch = CopyFromReg t0, Register:f32 %0
- // t5: i32 = bitcast t2
- // t18: f16 = ARMISD::VMOVhr t5
- if (Op.getOpcode() != ISD::CopyFromReg ||
- Op.getValueType() != MVT::f32)
- return SDValue();
-
- auto Move = N->use_begin();
- if (Move->getOpcode() != ARMISD::VMOVhr)
- return SDValue();
-
- SDValue Ops[] = { Op.getOperand(0), Op.getOperand(1) };
- SDValue Copy = DAG.getNode(ISD::CopyFromReg, SDLoc(Op), MVT::f16, Ops);
- DAG.ReplaceAllUsesWith(*Move, &Copy);
- return Copy;
- }
-
- if (SrcVT == MVT::i16 && DstVT == MVT::f16) {
- if (!HasFullFP16)
- return SDValue();
- // SoftFP: read half-precision arguments:
- //
- // t2: i32,ch = ...
- // t7: i16 = truncate t2 <~~~~ Op
- // t8: f16 = bitcast t7 <~~~~ N
- //
- if (Op.getOperand(0).getValueType() == MVT::i32)
- return DAG.getNode(ARMISD::VMOVhr, SDLoc(Op),
- MVT::f16, Op.getOperand(0));
- return SDValue();
- }
+ if ((SrcVT == MVT::i16 || SrcVT == MVT::i32) &&
+ (DstVT == MVT::f16 || DstVT == MVT::bf16))
+ return MoveToHPR(SDLoc(N), DAG, MVT::i32, DstVT.getSimpleVT(),
+ DAG.getNode(ISD::ZERO_EXTEND, SDLoc(N), MVT::i32, Op));
- // Half-precision return values
- if (SrcVT == MVT::f16 && DstVT == MVT::i16) {
- if (!HasFullFP16)
- return SDValue();
- //
- // t11: f16 = fadd t8, t10
- // t12: i16 = bitcast t11 <~~~ SDNode N
- // t13: i32 = zero_extend t12
- // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t13
- // t17: ch = ARMISD::RET_FLAG t16, Register:i32 %r0, t16:1
- //
- // transform this into:
- //
- // t20: i32 = ARMISD::VMOVrh t11
- // t16: ch,glue = CopyToReg t0, Register:i32 %r0, t20
- //
- auto ZeroExtend = N->use_begin();
- if (N->use_size() != 1 || ZeroExtend->getOpcode() != ISD::ZERO_EXTEND ||
- ZeroExtend->getValueType(0) != MVT::i32)
- return SDValue();
-
- auto Copy = ZeroExtend->use_begin();
- if (Copy->getOpcode() == ISD::CopyToReg &&
- Copy->use_begin()->getOpcode() == ARMISD::RET_FLAG) {
- SDValue Cvt = DAG.getNode(ARMISD::VMOVrh, SDLoc(Op), MVT::i32, Op);
- DAG.ReplaceAllUsesWith(*ZeroExtend, &Cvt);
- return Cvt;
- }
- return SDValue();
- }
+ if ((DstVT == MVT::i16 || DstVT == MVT::i32) &&
+ (SrcVT == MVT::f16 || SrcVT == MVT::bf16))
+ return DAG.getNode(
+ ISD::TRUNCATE, SDLoc(N), DstVT,
+ MoveFromHPR(SDLoc(N), DAG, MVT::i32, SrcVT.getSimpleVT(), Op));
if (!(SrcVT == MVT::i64 || DstVT == MVT::i64))
return SDValue();
@@ -5917,16 +6119,20 @@ SDValue ARMTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
// The formula we use to implement this is (((FPSCR + (1 << 22)) >> 22) & 3)
// so that the shift and the AND get folded into a bitfield extract.
SDLoc dl(Op);
- SDValue Ops[] = { DAG.getEntryNode(),
- DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32) };
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ops[] = {Chain,
+ DAG.getConstant(Intrinsic::arm_get_fpscr, dl, MVT::i32)};
- SDValue FPSCR = DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, MVT::i32, Ops);
+ SDValue FPSCR =
+ DAG.getNode(ISD::INTRINSIC_W_CHAIN, dl, {MVT::i32, MVT::Other}, Ops);
+ Chain = FPSCR.getValue(1);
SDValue FltRounds = DAG.getNode(ISD::ADD, dl, MVT::i32, FPSCR,
DAG.getConstant(1U << 22, dl, MVT::i32));
SDValue RMODE = DAG.getNode(ISD::SRL, dl, MVT::i32, FltRounds,
DAG.getConstant(22, dl, MVT::i32));
- return DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
- DAG.getConstant(3, dl, MVT::i32));
+ SDValue And = DAG.getNode(ISD::AND, dl, MVT::i32, RMODE,
+ DAG.getConstant(3, dl, MVT::i32));
+ return DAG.getMergeValues({And, Chain}, dl);
}
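
To see why the formula works: FPSCR.RMode (bits 23:22) encodes
nearest/+inf/-inf/zero as 0-3, while FLT_ROUNDS wants zero/nearest/+inf/-inf
as 0-3, i.e. RMode + 1 modulo 4; adding 1 << 22 performs that increment
directly inside the field. A standalone sketch, assuming the standard
FPSCR.RMode encoding:

#include <cstdio>

// Map an FPSCR-like word to the C FLT_ROUNDS value using the same expression
// the lowering above emits.
static unsigned fltRounds(unsigned FPSCR) {
  return ((FPSCR + (1u << 22)) >> 22) & 3u;
}

int main() {
  // RMode 0..3 -> FLT_ROUNDS 1, 2, 3, 0.
  for (unsigned RMode = 0; RMode < 4; ++RMode)
    printf("RMode=%u -> FLT_ROUNDS=%u\n", RMode, fltRounds(RMode << 22));
}
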
static SDValue LowerCTTZ(SDNode *N, SelectionDAG &DAG,
@@ -6411,9 +6617,10 @@ static SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) {
/// immediate" operand (e.g., VMOV). If so, return the encoded value.
static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
unsigned SplatBitSize, SelectionDAG &DAG,
- const SDLoc &dl, EVT &VT, bool is128Bits,
+ const SDLoc &dl, EVT &VT, EVT VectorVT,
VMOVModImmType type) {
unsigned OpCmode, Imm;
+ bool is128Bits = VectorVT.is128BitVector();
// SplatBitSize is set to the smallest size that splats the vector, so a
// zero vector will always have SplatBitSize == 8. However, NEON modified
@@ -6531,9 +6738,18 @@ static SDValue isVMOVModifiedImm(uint64_t SplatBits, uint64_t SplatUndef,
ImmMask <<= 1;
}
- if (DAG.getDataLayout().isBigEndian())
- // swap higher and lower 32 bit word
- Imm = ((Imm & 0xf) << 4) | ((Imm & 0xf0) >> 4);
+ if (DAG.getDataLayout().isBigEndian()) {
+ // Reverse the order of elements within the vector.
+ unsigned BytesPerElem = VectorVT.getScalarSizeInBits() / 8;
+ unsigned Mask = (1 << BytesPerElem) - 1;
+ unsigned NumElems = 8 / BytesPerElem;
+ unsigned NewImm = 0;
+ for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
+ unsigned Elem = ((Imm >> ElemNum * BytesPerElem) & Mask);
+ NewImm |= Elem << (NumElems - ElemNum - 1) * BytesPerElem;
+ }
+ Imm = NewImm;
+ }
// Op=1, Cmode=1110.
OpCmode = 0x1e;
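
A standalone sketch of the element-group reversal above, operating on the
8-bit byte-enable mask of a 64-bit VMOV immediate (illustrative only):

#include <cstdio>

// Reverse the per-element groups of bits in the byte-enable mask so it
// describes the same lanes once the element order is flipped for big-endian.
static unsigned reverseElemGroups(unsigned Imm, unsigned BytesPerElem) {
  unsigned Mask = (1u << BytesPerElem) - 1;
  unsigned NumElems = 8 / BytesPerElem;
  unsigned NewImm = 0;
  for (unsigned ElemNum = 0; ElemNum < NumElems; ++ElemNum) {
    unsigned Elem = (Imm >> (ElemNum * BytesPerElem)) & Mask;
    NewImm |= Elem << ((NumElems - ElemNum - 1) * BytesPerElem);
  }
  return NewImm;
}

int main() {
  // v4i16 (2 bytes per element): the bits for element 0 move to element 3.
  printf("0x%02x -> 0x%02x\n", 0x03u, reverseElemGroups(0x03u, 2)); // 0xc0
  // v2i32 (4 bytes per element): the two 4-bit halves swap, matching the old
  // "swap higher and lower 32 bit word" behaviour as a special case.
  printf("0x%02x -> 0x%02x\n", 0x0fu, reverseElemGroups(0x0fu, 4)); // 0xf0
}
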
@@ -6572,8 +6788,6 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
case MVT::f64: {
SDValue Lo = DAG.getConstant(INTVal.trunc(32), DL, MVT::i32);
SDValue Hi = DAG.getConstant(INTVal.lshr(32).trunc(32), DL, MVT::i32);
- if (!ST->isLittle())
- std::swap(Lo, Hi);
return DAG.getNode(ARMISD::VMOVDRR, DL, MVT::f64, Lo, Hi);
}
case MVT::f32:
@@ -6626,7 +6840,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
// Try a VMOV.i32 (FIXME: i8, i16, or i64 could work too).
SDValue NewVal = isVMOVModifiedImm(iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op),
- VMovVT, false, VMOVModImm);
+ VMovVT, VT, VMOVModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMOVIMM, DL, VMovVT,
@@ -6643,7 +6857,7 @@ SDValue ARMTargetLowering::LowerConstantFP(SDValue Op, SelectionDAG &DAG,
// Finally, try a VMVN.i32
NewVal = isVMOVModifiedImm(~iVal & 0xffffffffU, 0, 32, DAG, SDLoc(Op), VMovVT,
- false, VMVNModImm);
+ VT, VMVNModImm);
if (NewVal != SDValue()) {
SDLoc DL(Op);
SDValue VecConstant = DAG.getNode(ARMISD::VMVNIMM, DL, VMovVT, NewVal);
@@ -7051,6 +7265,104 @@ static bool isVMOVNMask(ArrayRef<int> M, EVT VT, bool Top) {
return true;
}
+// Reconstruct an MVE VCVT from a BuildVector of scalar fptrunc, all extracted
+// from a pair of inputs. For example:
+// BUILDVECTOR(FP_ROUND(EXTRACT_ELT(X, 0),
+// FP_ROUND(EXTRACT_ELT(Y, 0),
+// FP_ROUND(EXTRACT_ELT(X, 1),
+// FP_ROUND(EXTRACT_ELT(Y, 1), ...)
+static SDValue LowerBuildVectorOfFPTrunc(SDValue BV, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+
+ SDLoc dl(BV);
+ EVT VT = BV.getValueType();
+ if (VT != MVT::v8f16)
+ return SDValue();
+
+ // We are looking for a buildvector of fptrunc elements, where all the
+ // elements are interleavingly extracted from two sources. Check the first two
+  // elements are extracted alternately from two sources. Check the first two
+ // properly in the loop below).
+ if (BV.getOperand(0).getOpcode() != ISD::FP_ROUND ||
+ BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ BV.getOperand(0).getOperand(0).getConstantOperandVal(1) != 0)
+ return SDValue();
+ if (BV.getOperand(1).getOpcode() != ISD::FP_ROUND ||
+ BV.getOperand(1).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
+ BV.getOperand(1).getOperand(0).getConstantOperandVal(1) != 0)
+ return SDValue();
+ SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
+ SDValue Op1 = BV.getOperand(1).getOperand(0).getOperand(0);
+ if (Op0.getValueType() != MVT::v4f32 || Op1.getValueType() != MVT::v4f32)
+ return SDValue();
+
+ // Check all the values in the BuildVector line up with our expectations.
+ for (unsigned i = 1; i < 4; i++) {
+ auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
+ return Trunc.getOpcode() == ISD::FP_ROUND &&
+ Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Trunc.getOperand(0).getOperand(0) == Op &&
+ Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
+ };
+ if (!Check(BV.getOperand(i * 2 + 0), Op0, i))
+ return SDValue();
+ if (!Check(BV.getOperand(i * 2 + 1), Op1, i))
+ return SDValue();
+ }
+
+ SDValue N1 = DAG.getNode(ARMISD::VCVTN, dl, VT, DAG.getUNDEF(VT), Op0,
+ DAG.getConstant(0, dl, MVT::i32));
+ return DAG.getNode(ARMISD::VCVTN, dl, VT, N1, Op1,
+ DAG.getConstant(1, dl, MVT::i32));
+}
+
+// Reconstruct an MVE VCVT from a BuildVector of scalar fpext, all extracted
+// from a single input on alternating lanes. For example:
+// BUILDVECTOR(FP_EXTEND(EXTRACT_ELT(X, 0),
+// FP_EXTEND(EXTRACT_ELT(X, 2),
+// FP_EXTEND(EXTRACT_ELT(X, 4), ...)
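+// This is lowered below to a single ARMISD::VCVTL node, which extends either
+// the even lanes (Offset == 0) or the odd lanes (Offset == 1) of the v8f16
+// input into a v4f32 result.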
+static SDValue LowerBuildVectorOfFPExt(SDValue BV, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ assert(BV.getOpcode() == ISD::BUILD_VECTOR && "Unknown opcode!");
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+
+ SDLoc dl(BV);
+ EVT VT = BV.getValueType();
+ if (VT != MVT::v4f32)
+ return SDValue();
+
+ // We are looking for a buildvector of fpext elements, where all the
+ // elements are alternating lanes from a single source. For example <0,2,4,6>
+ // or <1,3,5,7>. Check that the first item looks valid and extract some info
+ // from it (the elements are checked properly in the loop below).
+ if (BV.getOperand(0).getOpcode() != ISD::FP_EXTEND ||
+ BV.getOperand(0).getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+ SDValue Op0 = BV.getOperand(0).getOperand(0).getOperand(0);
+ int Offset = BV.getOperand(0).getOperand(0).getConstantOperandVal(1);
+ if (Op0.getValueType() != MVT::v8f16 || (Offset != 0 && Offset != 1))
+ return SDValue();
+
+ // Check all the values in the BuildVector line up with our expectations.
+ for (unsigned i = 1; i < 4; i++) {
+ auto Check = [](SDValue Trunc, SDValue Op, unsigned Idx) {
+ return Trunc.getOpcode() == ISD::FP_EXTEND &&
+ Trunc.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ Trunc.getOperand(0).getOperand(0) == Op &&
+ Trunc.getOperand(0).getConstantOperandVal(1) == Idx;
+ };
+ if (!Check(BV.getOperand(i), Op0, 2 * i + Offset))
+ return SDValue();
+ }
+
+ return DAG.getNode(ARMISD::VCVTL, dl, VT, Op0,
+ DAG.getConstant(Offset, dl, MVT::i32));
+}
+
// If N is an integer constant that can be moved into a register in one
// instruction, return an SDValue of such a constant (will become a MOV
// instruction). Otherwise return null.
@@ -7150,13 +7462,12 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
return DAG.getUNDEF(VT);
if ((ST->hasNEON() && SplatBitSize <= 64) ||
- (ST->hasMVEIntegerOps() && SplatBitSize <= 32)) {
+ (ST->hasMVEIntegerOps() && SplatBitSize <= 64)) {
// Check if an immediate VMOV works.
EVT VmovVT;
- SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
- SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VmovVT, VT.is128BitVector(),
- VMOVModImm);
+ SDValue Val =
+ isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+ SplatBitSize, DAG, dl, VmovVT, VT, VMOVModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMOVIMM, dl, VmovVT, Val);
@@ -7166,9 +7477,8 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
// Try an immediate VMVN.
uint64_t NegatedImm = (~SplatBits).getZExtValue();
Val = isVMOVModifiedImm(
- NegatedImm, SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VmovVT, VT.is128BitVector(),
- ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
+ NegatedImm, SplatUndef.getZExtValue(), SplatBitSize, DAG, dl, VmovVT,
+ VT, ST->hasMVEIntegerOps() ? MVEVMVNModImm : VMVNModImm);
if (Val.getNode()) {
SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val);
return DAG.getNode(ISD::BITCAST, dl, VT, Vmov);
@@ -7308,12 +7618,19 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG,
if (isConstant)
return SDValue();
- // Empirical tests suggest this is rarely worth it for vectors of length <= 2.
- if (NumElts >= 4) {
- SDValue shuffle = ReconstructShuffle(Op, DAG);
- if (shuffle != SDValue())
+ // Reconstruct the BUILDVECTOR into one of the legal shuffles (such as vext and
+ // vmovn). Empirical tests suggest this is rarely worth it for vectors of
+ // length <= 2.
+ if (NumElts >= 4)
+ if (SDValue shuffle = ReconstructShuffle(Op, DAG))
return shuffle;
- }
+
+ // Attempt to turn a buildvector of scalar fptruncs or fpexts back into
+ // VCVTs.
+ if (SDValue VCVT = LowerBuildVectorOfFPTrunc(Op, DAG, Subtarget))
+ return VCVT;
+ if (SDValue VCVT = LowerBuildVectorOfFPExt(Op, DAG, Subtarget))
+ return VCVT;
if (ST->hasNEON() && VT.is128BitVector() && VT != MVT::v2f64 && VT != MVT::v4f32) {
// If we haven't found an efficient lowering, try splitting a 128-bit vector
@@ -7514,7 +7831,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
if (SrcEltTy == SmallestEltTy)
continue;
assert(ShuffleVT.getVectorElementType() == SmallestEltTy);
- Src.ShuffleVec = DAG.getNode(ISD::BITCAST, dl, ShuffleVT, Src.ShuffleVec);
+ Src.ShuffleVec = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, ShuffleVT, Src.ShuffleVec);
Src.WindowScale = SrcEltTy.getSizeInBits() / SmallestEltTy.getSizeInBits();
Src.WindowBase *= Src.WindowScale;
}
@@ -7566,7 +7883,7 @@ SDValue ARMTargetLowering::ReconstructShuffle(SDValue Op,
ShuffleOps[1], Mask, DAG);
if (!Shuffle)
return SDValue();
- return DAG.getNode(ISD::BITCAST, dl, VT, Shuffle);
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Shuffle);
}
enum ShuffleOpCodes {
@@ -8879,7 +9196,7 @@ SDValue ARMTargetLowering::LowerFSINCOS(SDValue Op, SelectionDAG &DAG) const {
if (ShouldUseSRet) {
// Create stack object for sret.
const uint64_t ByteSize = DL.getTypeAllocSize(RetTy);
- const unsigned StackAlign = DL.getPrefTypeAlignment(RetTy);
+ const Align StackAlign = DL.getPrefTypeAlign(RetTy);
int FrameIdx = MFI.CreateStackObject(ByteSize, StackAlign, false);
SRet = DAG.getFrameIndex(FrameIdx, TLI.getPointerTy(DL));
@@ -9054,8 +9371,7 @@ void ARMTargetLowering::ExpandDIV_Windows(
DAG.getConstant(32, dl, TLI.getPointerTy(DL)));
Upper = DAG.getNode(ISD::TRUNCATE, dl, MVT::i32, Upper);
- Results.push_back(Lower);
- Results.push_back(Upper);
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lower, Upper));
}
static SDValue LowerPredicateLoad(SDValue Op, SelectionDAG &DAG) {
@@ -9100,8 +9416,9 @@ void ARMTargetLowering::LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
SDValue Result = DAG.getMemIntrinsicNode(
ARMISD::LDRD, dl, DAG.getVTList({MVT::i32, MVT::i32, MVT::Other}),
{LD->getChain(), LD->getBasePtr()}, MemVT, LD->getMemOperand());
- SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
- Result.getValue(0), Result.getValue(1));
+ SDValue Lo = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 0 : 1);
+ SDValue Hi = Result.getValue(DAG.getDataLayout().isLittleEndian() ? 1 : 0);
+ SDValue Pair = DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi);
Results.append({Pair, Result.getValue(2)});
}
}
@@ -9146,10 +9463,14 @@ static SDValue LowerSTORE(SDValue Op, SelectionDAG &DAG,
SDNode *N = Op.getNode();
SDLoc dl(N);
- SDValue Lo = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
- DAG.getTargetConstant(0, dl, MVT::i32));
- SDValue Hi = DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
- DAG.getTargetConstant(1, dl, MVT::i32));
+ SDValue Lo = DAG.getNode(
+ ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 0 : 1, dl,
+ MVT::i32));
+ SDValue Hi = DAG.getNode(
+ ISD::EXTRACT_ELEMENT, dl, MVT::i32, ST->getValue(),
+ DAG.getTargetConstant(DAG.getDataLayout().isLittleEndian() ? 1 : 0, dl,
+ MVT::i32));
return DAG.getMemIntrinsicNode(ARMISD::STRD, dl, DAG.getVTList(MVT::Other),
{ST->getChain(), Lo, Hi, ST->getBasePtr()},
@@ -9188,13 +9509,87 @@ static SDValue LowerMLOAD(SDValue Op, SelectionDAG &DAG) {
N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
SDValue Combo = NewLoad;
- if (!PassThru.isUndef() &&
- (PassThru.getOpcode() != ISD::BITCAST ||
- !isZeroVector(PassThru->getOperand(0))))
+ bool PassThruIsCastZero = (PassThru.getOpcode() == ISD::BITCAST ||
+ PassThru.getOpcode() == ARMISD::VECTOR_REG_CAST) &&
+ isZeroVector(PassThru->getOperand(0));
+ if (!PassThru.isUndef() && !PassThruIsCastZero)
Combo = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({Combo, NewLoad.getValue(1)}, dl);
}
+static SDValue LowerVecReduce(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ SDLoc dl(Op);
+ unsigned BaseOpcode = 0;
+ switch (Op->getOpcode()) {
+ default: llvm_unreachable("Expected VECREDUCE opcode");
+ case ISD::VECREDUCE_FADD: BaseOpcode = ISD::FADD; break;
+ case ISD::VECREDUCE_FMUL: BaseOpcode = ISD::FMUL; break;
+ case ISD::VECREDUCE_MUL: BaseOpcode = ISD::MUL; break;
+ case ISD::VECREDUCE_AND: BaseOpcode = ISD::AND; break;
+ case ISD::VECREDUCE_OR: BaseOpcode = ISD::OR; break;
+ case ISD::VECREDUCE_XOR: BaseOpcode = ISD::XOR; break;
+ case ISD::VECREDUCE_FMAX: BaseOpcode = ISD::FMAXNUM; break;
+ case ISD::VECREDUCE_FMIN: BaseOpcode = ISD::FMINNUM; break;
+ }
+
+ SDValue Op0 = Op->getOperand(0);
+ EVT VT = Op0.getValueType();
+ EVT EltVT = VT.getVectorElementType();
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned NumActiveLanes = NumElts;
+
+ assert((NumActiveLanes == 16 || NumActiveLanes == 8 || NumActiveLanes == 4 ||
+ NumActiveLanes == 2) &&
+ "Only expected a power 2 vector size");
+
+ // Use BaseOpcode(X, Rev(X)) until 4 active lanes remain. Going down to 4
+ // vector elements allows us to easily extract vector elements from the lanes.
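+ // For example, for a v8i16 AND reduction a single VREV32+AND step leaves
+ // both halves of every 32-bit pair holding the same value, so only lanes
+ // 0, 2, 4 and 6 remain to be extracted and combined as scalars below.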
+ while (NumActiveLanes > 4) {
+ unsigned RevOpcode = NumActiveLanes == 16 ? ARMISD::VREV16 : ARMISD::VREV32;
+ SDValue Rev = DAG.getNode(RevOpcode, dl, VT, Op0);
+ Op0 = DAG.getNode(BaseOpcode, dl, VT, Op0, Rev);
+ NumActiveLanes /= 2;
+ }
+
+ SDValue Res;
+ if (NumActiveLanes == 4) {
+ // The remaining 4 elements are combined pairwise using the base opcode
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(0 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(1 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext2 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(2 * NumElts / 4, dl, MVT::i32));
+ SDValue Ext3 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(3 * NumElts / 4, dl, MVT::i32));
+ SDValue Res0 = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+ SDValue Res1 = DAG.getNode(BaseOpcode, dl, EltVT, Ext2, Ext3, Op->getFlags());
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Res0, Res1, Op->getFlags());
+ } else {
+ SDValue Ext0 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(0, dl, MVT::i32));
+ SDValue Ext1 = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, EltVT, Op0,
+ DAG.getConstant(1, dl, MVT::i32));
+ Res = DAG.getNode(BaseOpcode, dl, EltVT, Ext0, Ext1, Op->getFlags());
+ }
+
+ // Result type may be wider than element type.
+ if (EltVT != Op->getValueType(0))
+ Res = DAG.getNode(ISD::ANY_EXTEND, dl, Op->getValueType(0), Res);
+ return Res;
+}
+
+static SDValue LowerVecReduceF(SDValue Op, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEFloatOps())
+ return SDValue();
+ return LowerVecReduce(Op, DAG, ST);
+}
+
static SDValue LowerAtomicLoadStore(SDValue Op, SelectionDAG &DAG) {
if (isStrongerThanMonotonic(cast<AtomicSDNode>(Op)->getOrdering()))
// Acquire/Release load/store is not legal for targets without a dmb or
@@ -9264,15 +9659,61 @@ static void ReplaceCMP_SWAP_64Results(SDNode *N,
bool isBigEndian = DAG.getDataLayout().isBigEndian();
- Results.push_back(
+ SDValue Lo =
DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_1 : ARM::gsub_0,
- SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
- Results.push_back(
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
+ SDValue Hi =
DAG.getTargetExtractSubreg(isBigEndian ? ARM::gsub_0 : ARM::gsub_1,
- SDLoc(N), MVT::i32, SDValue(CmpSwap, 0)));
+ SDLoc(N), MVT::i32, SDValue(CmpSwap, 0));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, SDLoc(N), MVT::i64, Lo, Hi));
Results.push_back(SDValue(CmpSwap, 2));
}
+SDValue ARMTargetLowering::LowerFSETCC(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Chain = Op.getOperand(0);
+ SDValue LHS = Op.getOperand(1);
+ SDValue RHS = Op.getOperand(2);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(3))->get();
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+
+ // If we don't have instructions of this float type then soften to a libcall
+ // and use SETCC instead.
+ if (isUnsupportedFloatingType(LHS.getValueType())) {
+ DAG.getTargetLoweringInfo().softenSetCCOperands(
+ DAG, LHS.getValueType(), LHS, RHS, CC, dl, LHS, RHS, Chain, IsSignaling);
+ if (!RHS.getNode()) {
+ RHS = DAG.getConstant(0, dl, LHS.getValueType());
+ CC = ISD::SETNE;
+ }
+ SDValue Result = DAG.getNode(ISD::SETCC, dl, VT, LHS, RHS,
+ DAG.getCondCode(CC));
+ return DAG.getMergeValues({Result, Chain}, dl);
+ }
+
+ ARMCC::CondCodes CondCode, CondCode2;
+ FPCCToARMCC(CC, CondCode, CondCode2);
+
+ // FIXME: Chain is not handled correctly here. Currently the FPSCR is implicit
+ // in CMPFP and CMPFPE, but instead it should be made explicit by these
+ // instructions using a chain instead of glue. This would also fix the problem
+ // here (and also in LowerSELECT_CC) where we generate two comparisons when
+ // CondCode2 != AL.
+ SDValue True = DAG.getConstant(1, dl, VT);
+ SDValue False = DAG.getConstant(0, dl, VT);
+ SDValue ARMcc = DAG.getConstant(CondCode, dl, MVT::i32);
+ SDValue CCR = DAG.getRegister(ARM::CPSR, MVT::i32);
+ SDValue Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
+ SDValue Result = getCMOV(dl, VT, False, True, ARMcc, CCR, Cmp, DAG);
+ if (CondCode2 != ARMCC::AL) {
+ ARMcc = DAG.getConstant(CondCode2, dl, MVT::i32);
+ Cmp = getVFPCmp(LHS, RHS, DAG, dl, IsSignaling);
+ Result = getCMOV(dl, VT, Result, True, ARMcc, CCR, Cmp, DAG);
+ }
+ return DAG.getMergeValues({Result, Chain}, dl);
+}
+
SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
LLVM_DEBUG(dbgs() << "Lowering node: "; Op.dump());
switch (Op.getOpcode()) {
@@ -9353,6 +9794,16 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerSTORE(Op, DAG, Subtarget);
case ISD::MLOAD:
return LowerMLOAD(Op, DAG);
+ case ISD::VECREDUCE_MUL:
+ case ISD::VECREDUCE_AND:
+ case ISD::VECREDUCE_OR:
+ case ISD::VECREDUCE_XOR:
+ return LowerVecReduce(Op, DAG, Subtarget);
+ case ISD::VECREDUCE_FADD:
+ case ISD::VECREDUCE_FMUL:
+ case ISD::VECREDUCE_FMIN:
+ case ISD::VECREDUCE_FMAX:
+ return LowerVecReduceF(Op, DAG, Subtarget);
case ISD::ATOMIC_LOAD:
case ISD::ATOMIC_STORE: return LowerAtomicLoadStore(Op, DAG);
case ISD::FSINCOS: return LowerFSINCOS(Op, DAG);
@@ -9366,6 +9817,8 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::FP_ROUND: return LowerFP_ROUND(Op, DAG);
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::STRICT_FSETCC:
+ case ISD::STRICT_FSETCCS: return LowerFSETCC(Op, DAG);
case ARMISD::WIN__DBZCHK: return SDValue();
}
}
@@ -9397,8 +9850,8 @@ static void ReplaceLongIntrinsic(SDNode *N, SmallVectorImpl<SDValue> &Results,
DAG.getVTList(MVT::i32, MVT::i32),
N->getOperand(1), N->getOperand(2),
Lo, Hi);
- Results.push_back(LongMul.getValue(0));
- Results.push_back(LongMul.getValue(1));
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64,
+ LongMul.getValue(0), LongMul.getValue(1)));
}
/// ReplaceNodeResults - Replace the results of node with an illegal result
@@ -9487,7 +9940,7 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
unsigned PCAdj = (isThumb || isThumb2) ? 4 : 8;
ARMConstantPoolValue *CPV =
ARMConstantPoolMBB::Create(F.getContext(), DispatchBB, PCLabelId, PCAdj);
- unsigned CPI = MCP->getConstantPoolIndex(CPV, 4);
+ unsigned CPI = MCP->getConstantPoolIndex(CPV, Align(4));
const TargetRegisterClass *TRC = isThumb ? &ARM::tGPRRegClass
: &ARM::GPRRegClass;
@@ -9495,11 +9948,11 @@ void ARMTargetLowering::SetupEntryBlockForSjLj(MachineInstr &MI,
// Grab constant pool and fixed stack memory operands.
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
MachineMemOperand *FIMMOSt =
MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(*MF, FI),
- MachineMemOperand::MOStore, 4, 4);
+ MachineMemOperand::MOStore, 4, Align(4));
// Load the address of the dispatch MBB into the jump buffer.
if (isThumb2) {
@@ -9685,7 +10138,7 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
MachineMemOperand *FIMMOLd = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI),
- MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4);
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, Align(4));
MachineInstrBuilder MIB;
MIB = BuildMI(DispatchBB, dl, TII->get(ARM::Int_eh_sjlj_dispatchsetup));
@@ -9776,10 +10229,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
- if (Align == 0)
- Align = MF->getDataLayout().getTypeAllocSize(C->getType());
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::tLDRpci))
@@ -9816,8 +10267,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addReg(NewVReg3)
.add(predOps(ARMCC::AL));
- MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
- MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *JTMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
+ MachineMemOperand::MOLoad, 4, Align(4));
Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::tLDRi), NewVReg5)
@@ -9877,10 +10329,8 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
const Constant *C = ConstantInt::get(Int32Ty, NumLPads);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
- if (Align == 0)
- Align = MF->getDataLayout().getTypeAllocSize(C->getType());
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
Register VReg1 = MRI->createVirtualRegister(TRC);
BuildMI(DispatchBB, dl, TII->get(ARM::LDRcp))
@@ -9910,8 +10360,9 @@ void ARMTargetLowering::EmitSjLjDispatchBlock(MachineInstr &MI,
.addJumpTableIndex(MJTI)
.add(predOps(ARMCC::AL));
- MachineMemOperand *JTMMOLd = MF->getMachineMemOperand(
- MachinePointerInfo::getJumpTable(*MF), MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand *JTMMOLd =
+ MF->getMachineMemOperand(MachinePointerInfo::getJumpTable(*MF),
+ MachineMemOperand::MOLoad, 4, Align(4));
Register NewVReg5 = MRI->createVirtualRegister(TRC);
BuildMI(DispContBB, dl, TII->get(ARM::LDRrs), NewVReg5)
.addReg(NewVReg3, RegState::Kill)
@@ -10150,7 +10601,7 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
Register dest = MI.getOperand(0).getReg();
Register src = MI.getOperand(1).getReg();
unsigned SizeVal = MI.getOperand(2).getImm();
- unsigned Align = MI.getOperand(3).getImm();
+ unsigned Alignment = MI.getOperand(3).getImm();
DebugLoc dl = MI.getDebugLoc();
MachineFunction *MF = BB->getParent();
@@ -10163,17 +10614,17 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
bool IsThumb2 = Subtarget->isThumb2();
bool IsThumb = Subtarget->isThumb();
- if (Align & 1) {
+ if (Alignment & 1) {
UnitSize = 1;
- } else if (Align & 2) {
+ } else if (Alignment & 2) {
UnitSize = 2;
} else {
// Check whether we can use NEON instructions.
if (!MF->getFunction().hasFnAttribute(Attribute::NoImplicitFloat) &&
Subtarget->hasNEON()) {
- if ((Align % 16 == 0) && SizeVal >= 16)
+ if ((Alignment % 16 == 0) && SizeVal >= 16)
UnitSize = 16;
- else if ((Align % 8 == 0) && SizeVal >= 8)
+ else if ((Alignment % 8 == 0) && SizeVal >= 8)
UnitSize = 8;
}
// Can't use NEON instructions.
@@ -10279,13 +10730,11 @@ ARMTargetLowering::EmitStructByval(MachineInstr &MI,
const Constant *C = ConstantInt::get(Int32Ty, LoopSize);
// MachineConstantPool wants an explicit alignment.
- unsigned Align = MF->getDataLayout().getPrefTypeAlignment(Int32Ty);
- if (Align == 0)
- Align = MF->getDataLayout().getTypeAllocSize(C->getType());
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align);
+ Align Alignment = MF->getDataLayout().getPrefTypeAlign(Int32Ty);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Alignment);
MachineMemOperand *CPMMO =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
if (IsThumb)
BuildMI(*BB, MI, dl, TII->get(ARM::tLDRpci))
@@ -11655,6 +12104,42 @@ static SDValue PerformAddeSubeCombine(SDNode *N,
return SDValue();
}
+static SDValue PerformVSELECTCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ // Transforms vselect(not(cond), lhs, rhs) into vselect(cond, rhs, lhs).
+ //
+ // We need to re-implement this optimization here as the implementation in the
+ // Target-Independent DAGCombiner does not handle the kind of constant we make
+ // (it calls isConstOrConstSplat with AllowTruncation set to false - and for
+ // good reason, allowing truncation there would break other targets).
+ //
+ // Currently, this is only done for MVE, as it's the only target that benefits
+ // from this transformation (e.g. VPNOT+VPSEL becomes a single VPSEL).
+ if (!Subtarget->hasMVEIntegerOps())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::XOR)
+ return SDValue();
+ SDValue XOR = N->getOperand(0);
+
+ // Check if the XOR's RHS is either a 1, or a BUILD_VECTOR of 1s.
+ // It is important to check with truncation allowed as the BUILD_VECTORs we
+ // generate in those situations will truncate their operands.
+ ConstantSDNode *Const =
+ isConstOrConstSplat(XOR->getOperand(1), /*AllowUndefs*/ false,
+ /*AllowTruncation*/ true);
+ if (!Const || !Const->isOne())
+ return SDValue();
+
+ // Rewrite into vselect(cond, rhs, lhs).
+ SDValue Cond = XOR->getOperand(0);
+ SDValue LHS = N->getOperand(1);
+ SDValue RHS = N->getOperand(2);
+ EVT Type = N->getValueType(0);
+ return DCI.DAG.getNode(ISD::VSELECT, SDLoc(N), Type, Cond, RHS, LHS);
+}
+
static SDValue PerformABSCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
@@ -11712,6 +12197,71 @@ static SDValue PerformADDCombineWithOperands(SDNode *N, SDValue N0, SDValue N1,
return SDValue();
}
+static SDValue PerformADDVecReduce(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
+ if (!Subtarget->hasMVEIntegerOps() || N->getValueType(0) != MVT::i64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // We are looking for an i64 add of a VADDLVx. Due to these being i64's, this
+ // will look like:
+ // t1: i32,i32 = ARMISD::VADDLVs x
+ // t2: i64 = build_pair t1, t1:1
+ // t3: i64 = add t2, y
+ // We also need to check for sext / zext and commutative adds.
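+ // For example, add(build_pair(VADDLVs(x)), y) becomes
+ // build_pair(VADDLVAs(lo(y), hi(y), x)), folding the scalar accumulator y
+ // into the accumulating reduction.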
+ auto MakeVecReduce = [&](unsigned Opcode, unsigned OpcodeA, SDValue NA,
+ SDValue NB) {
+ if (NB->getOpcode() != ISD::BUILD_PAIR)
+ return SDValue();
+ SDValue VecRed = NB->getOperand(0);
+ if (VecRed->getOpcode() != Opcode || VecRed.getResNo() != 0 ||
+ NB->getOperand(1) != SDValue(VecRed.getNode(), 1))
+ return SDValue();
+
+ SDLoc dl(N);
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DCI.DAG.getConstant(0, dl, MVT::i32)));
+ Ops.push_back(DCI.DAG.getNode(ISD::EXTRACT_ELEMENT, dl, MVT::i32, NA,
+ DCI.DAG.getConstant(1, dl, MVT::i32)));
+ for (unsigned i = 0, e = VecRed.getNumOperands(); i < e; i++)
+ Ops.push_back(VecRed->getOperand(i));
+ SDValue Red = DCI.DAG.getNode(OpcodeA, dl,
+ DCI.DAG.getVTList({MVT::i32, MVT::i32}), Ops);
+ return DCI.DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Red,
+ SDValue(Red.getNode(), 1));
+ };
+
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVs, ARMISD::VADDLVAs, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVu, ARMISD::VADDLVAu, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVps, ARMISD::VADDLVAps, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VADDLVpu, ARMISD::VADDLVApu, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N0, N1))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVs, ARMISD::VMLALVAs, N1, N0))
+ return M;
+ if (SDValue M = MakeVecReduce(ARMISD::VMLALVu, ARMISD::VMLALVAu, N1, N0))
+ return M;
+ return SDValue();
+}
+
bool
ARMTargetLowering::isDesirableToCommuteWithShift(const SDNode *N,
CombineLevel Level) const {
@@ -11883,6 +12433,9 @@ static SDValue PerformADDCombine(SDNode *N,
if (SDValue Result = PerformSHLSimplify(N, DCI, Subtarget))
return Result;
+ if (SDValue Result = PerformADDVecReduce(N, DCI, Subtarget))
+ return Result;
+
// First try with the default operand order.
if (SDValue Result = PerformADDCombineWithOperands(N, N0, N1, DCI, Subtarget))
return Result;
@@ -11974,18 +12527,86 @@ static SDValue PerformVMULCombine(SDNode *N,
DAG.getNode(ISD::MUL, DL, VT, N01, N1));
}
+static SDValue PerformMVEVMULLCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v2i64)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ auto IsSignExt = [&](SDValue Op) {
+ if (Op->getOpcode() != ISD::SIGN_EXTEND_INREG)
+ return SDValue();
+ EVT VT = cast<VTSDNode>(Op->getOperand(1))->getVT();
+ if (VT.getScalarSizeInBits() == 32)
+ return Op->getOperand(0);
+ return SDValue();
+ };
+ auto IsZeroExt = [&](SDValue Op) {
+ // Zero extends are a little more awkward. At the point we are matching
+ // this, we are looking for an AND with a (-1, 0, -1, 0) buildvector mask.
+ // That might be before or after a bitcast depending on how the AND is
+ // placed. Because this has to look through bitcasts, it is currently only
+ // supported on LE.
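+ // For example, on LE a per-lane zero extend from 32 to 64 bits typically
+ // appears as (v2i64 and X, (bitcast (v4i32 build_vector -1, 0, -1, 0))),
+ // which is the shape matched here.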
+ if (!Subtarget->isLittle())
+ return SDValue();
+
+ SDValue And = Op;
+ if (And->getOpcode() == ISD::BITCAST)
+ And = And->getOperand(0);
+ if (And->getOpcode() != ISD::AND)
+ return SDValue();
+ SDValue Mask = And->getOperand(1);
+ if (Mask->getOpcode() == ISD::BITCAST)
+ Mask = Mask->getOperand(0);
+
+ if (Mask->getOpcode() != ISD::BUILD_VECTOR ||
+ Mask.getValueType() != MVT::v4i32)
+ return SDValue();
+ if (isAllOnesConstant(Mask->getOperand(0)) &&
+ isNullConstant(Mask->getOperand(1)) &&
+ isAllOnesConstant(Mask->getOperand(2)) &&
+ isNullConstant(Mask->getOperand(3)))
+ return And->getOperand(0);
+ return SDValue();
+ };
+
+ SDLoc dl(N);
+ if (SDValue Op0 = IsSignExt(N0)) {
+ if (SDValue Op1 = IsSignExt(N1)) {
+ SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+ SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+ return DAG.getNode(ARMISD::VMULLs, dl, VT, New0a, New1a);
+ }
+ }
+ if (SDValue Op0 = IsZeroExt(N0)) {
+ if (SDValue Op1 = IsZeroExt(N1)) {
+ SDValue New0a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op0);
+ SDValue New1a = DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, MVT::v4i32, Op1);
+ return DAG.getNode(ARMISD::VMULLu, dl, VT, New0a, New1a);
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue PerformMULCombine(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
+ EVT VT = N->getValueType(0);
+ if (Subtarget->hasMVEIntegerOps() && VT == MVT::v2i64)
+ return PerformMVEVMULLCombine(N, DAG, Subtarget);
+
if (Subtarget->isThumb1Only())
return SDValue();
if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
return SDValue();
- EVT VT = N->getValueType(0);
if (VT.is64BitVector() || VT.is128BitVector())
return PerformVMULCombine(N, DCI, Subtarget);
if (VT != MVT::i32)
@@ -12170,20 +12791,21 @@ static SDValue PerformANDCombine(SDNode *N,
EVT VT = N->getValueType(0);
SelectionDAG &DAG = DCI.DAG;
- if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(VT) || VT == MVT::v4i1 ||
+ VT == MVT::v8i1 || VT == MVT::v16i1)
return SDValue();
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (BVN && Subtarget->hasNEON() &&
+ if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- if (SplatBitSize <= 64) {
+ if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+ SplatBitSize == 64) {
EVT VbicVT;
SDValue Val = isVMOVModifiedImm((~SplatBits).getZExtValue(),
SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VbicVT, VT.is128BitVector(),
- OtherModImm);
+ DAG, dl, VbicVT, VT, OtherModImm);
if (Val.getNode()) {
SDValue Input =
DAG.getNode(ISD::BITCAST, dl, VbicVT, N->getOperand(0));
@@ -12413,58 +13035,44 @@ static bool isValidMVECond(unsigned CC, bool IsFloat) {
};
}
+static ARMCC::CondCodes getVCMPCondCode(SDValue N) {
+ if (N->getOpcode() == ARMISD::VCMP)
+ return (ARMCC::CondCodes)N->getConstantOperandVal(2);
+ else if (N->getOpcode() == ARMISD::VCMPZ)
+ return (ARMCC::CondCodes)N->getConstantOperandVal(1);
+ else
+ llvm_unreachable("Not a VCMP/VCMPZ!");
+}
+
+static bool CanInvertMVEVCMP(SDValue N) {
+ ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N));
+ return isValidMVECond(CC, N->getOperand(0).getValueType().isFloatingPoint());
+}
+
static SDValue PerformORCombine_i1(SDNode *N,
TargetLowering::DAGCombinerInfo &DCI,
const ARMSubtarget *Subtarget) {
// Try to invert "or A, B" -> "and ~A, ~B", as the "and" is easier to chain
// together with predicates
EVT VT = N->getValueType(0);
+ SDLoc DL(N);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- ARMCC::CondCodes CondCode0 = ARMCC::AL;
- ARMCC::CondCodes CondCode1 = ARMCC::AL;
- if (N0->getOpcode() == ARMISD::VCMP)
- CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(2))
- ->getZExtValue();
- else if (N0->getOpcode() == ARMISD::VCMPZ)
- CondCode0 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N0->getOperand(1))
- ->getZExtValue();
- if (N1->getOpcode() == ARMISD::VCMP)
- CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(2))
- ->getZExtValue();
- else if (N1->getOpcode() == ARMISD::VCMPZ)
- CondCode1 = (ARMCC::CondCodes)cast<const ConstantSDNode>(N1->getOperand(1))
- ->getZExtValue();
-
- if (CondCode0 == ARMCC::AL || CondCode1 == ARMCC::AL)
- return SDValue();
-
- unsigned Opposite0 = ARMCC::getOppositeCondition(CondCode0);
- unsigned Opposite1 = ARMCC::getOppositeCondition(CondCode1);
+ auto IsFreelyInvertable = [&](SDValue V) {
+ if (V->getOpcode() == ARMISD::VCMP || V->getOpcode() == ARMISD::VCMPZ)
+ return CanInvertMVEVCMP(V);
+ return false;
+ };
- if (!isValidMVECond(Opposite0,
- N0->getOperand(0)->getValueType(0).isFloatingPoint()) ||
- !isValidMVECond(Opposite1,
- N1->getOperand(0)->getValueType(0).isFloatingPoint()))
+ // At least one operand must be freely invertible.
+ if (!(IsFreelyInvertable(N0) || IsFreelyInvertable(N1)))
return SDValue();
- SmallVector<SDValue, 4> Ops0;
- Ops0.push_back(N0->getOperand(0));
- if (N0->getOpcode() == ARMISD::VCMP)
- Ops0.push_back(N0->getOperand(1));
- Ops0.push_back(DCI.DAG.getConstant(Opposite0, SDLoc(N0), MVT::i32));
- SmallVector<SDValue, 4> Ops1;
- Ops1.push_back(N1->getOperand(0));
- if (N1->getOpcode() == ARMISD::VCMP)
- Ops1.push_back(N1->getOperand(1));
- Ops1.push_back(DCI.DAG.getConstant(Opposite1, SDLoc(N1), MVT::i32));
-
- SDValue NewN0 = DCI.DAG.getNode(N0->getOpcode(), SDLoc(N0), VT, Ops0);
- SDValue NewN1 = DCI.DAG.getNode(N1->getOpcode(), SDLoc(N1), VT, Ops1);
- SDValue And = DCI.DAG.getNode(ISD::AND, SDLoc(N), VT, NewN0, NewN1);
- return DCI.DAG.getNode(ISD::XOR, SDLoc(N), VT, And,
- DCI.DAG.getAllOnesConstant(SDLoc(N), VT));
+ SDValue NewN0 = DCI.DAG.getLogicalNOT(DL, N0, VT);
+ SDValue NewN1 = DCI.DAG.getLogicalNOT(DL, N1, VT);
+ SDValue And = DCI.DAG.getNode(ISD::AND, DL, VT, NewN0, NewN1);
+ return DCI.DAG.getLogicalNOT(DL, And, VT);
}
/// PerformORCombine - Target-specific dag combine xforms for ISD::OR
@@ -12480,17 +13088,21 @@ static SDValue PerformORCombine(SDNode *N,
if(!DAG.getTargetLoweringInfo().isTypeLegal(VT))
return SDValue();
+ if (Subtarget->hasMVEIntegerOps() &&
+ (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
+ return PerformORCombine_i1(N, DCI, Subtarget);
+
APInt SplatBits, SplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (BVN && Subtarget->hasNEON() &&
+ if (BVN && (Subtarget->hasNEON() || Subtarget->hasMVEIntegerOps()) &&
BVN->isConstantSplat(SplatBits, SplatUndef, SplatBitSize, HasAnyUndefs)) {
- if (SplatBitSize <= 64) {
+ if (SplatBitSize == 8 || SplatBitSize == 16 || SplatBitSize == 32 ||
+ SplatBitSize == 64) {
EVT VorrVT;
- SDValue Val = isVMOVModifiedImm(SplatBits.getZExtValue(),
- SplatUndef.getZExtValue(), SplatBitSize,
- DAG, dl, VorrVT, VT.is128BitVector(),
- OtherModImm);
+ SDValue Val =
+ isVMOVModifiedImm(SplatBits.getZExtValue(), SplatUndef.getZExtValue(),
+ SplatBitSize, DAG, dl, VorrVT, VT, OtherModImm);
if (Val.getNode()) {
SDValue Input =
DAG.getNode(ISD::BITCAST, dl, VorrVT, N->getOperand(0));
@@ -12551,10 +13163,6 @@ static SDValue PerformORCombine(SDNode *N,
}
}
- if (Subtarget->hasMVEIntegerOps() &&
- (VT == MVT::v4i1 || VT == MVT::v8i1 || VT == MVT::v16i1))
- return PerformORCombine_i1(N, DCI, Subtarget);
-
// Try to use the ARM/Thumb2 BFI (bitfield insert) instruction when
// reasonable.
if (N0.getOpcode() == ISD::AND && N0.hasOneUse()) {
@@ -12586,6 +13194,27 @@ static SDValue PerformXORCombine(SDNode *N,
return Result;
}
+ if (Subtarget->hasMVEIntegerOps()) {
+ // fold (xor(vcmp/z, 1)) into a vcmp with the opposite condition.
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ const TargetLowering *TLI = Subtarget->getTargetLowering();
+ if (TLI->isConstTrueVal(N1.getNode()) &&
+ (N0->getOpcode() == ARMISD::VCMP || N0->getOpcode() == ARMISD::VCMPZ)) {
+ if (CanInvertMVEVCMP(N0)) {
+ SDLoc DL(N0);
+ ARMCC::CondCodes CC = ARMCC::getOppositeCondition(getVCMPCondCode(N0));
+
+ SmallVector<SDValue, 4> Ops;
+ Ops.push_back(N0->getOperand(0));
+ if (N0->getOpcode() == ARMISD::VCMP)
+ Ops.push_back(N0->getOperand(1));
+ Ops.push_back(DCI.DAG.getConstant(CC, DL, MVT::i32));
+ return DCI.DAG.getNode(N0->getOpcode(), DL, N0->getValueType(0), Ops);
+ }
+ }
+ }
+
return SDValue();
}
@@ -12784,6 +13413,78 @@ static SDValue PerformVMOVDRRCombine(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
+static SDValue PerformVMOVhrCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+
+ // VMOVhr (VMOVrh (X)) -> X
+ if (Op0->getOpcode() == ARMISD::VMOVrh)
+ return Op0->getOperand(0);
+
+ // FullFP16: half values are passed in S-registers, and we don't
+ // need any of the bitcasts and moves:
+ //
+ // t2: f32,ch = CopyFromReg t0, Register:f32 %0
+ // t5: i32 = bitcast t2
+ // t18: f16 = ARMISD::VMOVhr t5
+ if (Op0->getOpcode() == ISD::BITCAST) {
+ SDValue Copy = Op0->getOperand(0);
+ if (Copy.getValueType() == MVT::f32 &&
+ Copy->getOpcode() == ISD::CopyFromReg) {
+ SDValue Ops[] = {Copy->getOperand(0), Copy->getOperand(1)};
+ SDValue NewCopy =
+ DCI.DAG.getNode(ISD::CopyFromReg, SDLoc(N), N->getValueType(0), Ops);
+ return NewCopy;
+ }
+ }
+
+ // fold (VMOVhr (load x)) -> (load (f16*)x)
+ if (LoadSDNode *LN0 = dyn_cast<LoadSDNode>(Op0)) {
+ if (LN0->hasOneUse() && LN0->isUnindexed() &&
+ LN0->getMemoryVT() == MVT::i16) {
+ SDValue Load =
+ DCI.DAG.getLoad(N->getValueType(0), SDLoc(N), LN0->getChain(),
+ LN0->getBasePtr(), LN0->getMemOperand());
+ DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
+ DCI.DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Load.getValue(1));
+ return Load;
+ }
+ }
+
+ // Only the bottom 16 bits of the source register are used.
+ APInt DemandedMask = APInt::getLowBitsSet(32, 16);
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(Op0, DemandedMask, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue PerformVMOVrhCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue N0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // fold (VMOVrh (load x)) -> (zextload (i16*)x)
+ if (ISD::isNormalLoad(N0.getNode()) && N0.hasOneUse()) {
+ LoadSDNode *LN0 = cast<LoadSDNode>(N0);
+
+ SDValue Load =
+ DCI.DAG.getExtLoad(ISD::ZEXTLOAD, SDLoc(N), VT, LN0->getChain(),
+ LN0->getBasePtr(), MVT::i16, LN0->getMemOperand());
+ DCI.DAG.ReplaceAllUsesOfValueWith(SDValue(N, 0), Load.getValue(0));
+ DCI.DAG.ReplaceAllUsesOfValueWith(N0.getValue(1), Load.getValue(1));
+ return Load;
+ }
+
+ // Fold VMOVrh(extract(x, n)) -> vgetlaneu(x, n)
+ if (N0->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
+ isa<ConstantSDNode>(N0->getOperand(1)))
+ return DCI.DAG.getNode(ARMISD::VGETLANEu, SDLoc(N), VT, N0->getOperand(0),
+ N0->getOperand(1));
+
+ return SDValue();
+}
+
/// hasNormalLoadOperand - Check if any of the operands of a BUILD_VECTOR node
/// are normal, non-volatile loads. If so, it is profitable to bitcast an
/// i64 vector to have f64 elements, since the value can then be loaded
@@ -12934,8 +13635,29 @@ PerformPREDICATE_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
// If the valuetypes are the same, we can remove the cast entirely.
if (Op->getOperand(0).getValueType() == VT)
return Op->getOperand(0);
- return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl,
- Op->getOperand(0).getValueType(), Op->getOperand(0));
+ return DCI.DAG.getNode(ARMISD::PREDICATE_CAST, dl, VT, Op->getOperand(0));
+ }
+
+ return SDValue();
+}
+
+static SDValue
+PerformVECTOR_REG_CASTCombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+ SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ // Under little-endian, a VECTOR_REG_CAST is equivalent to a BITCAST
+ if (ST->isLittle())
+ return DCI.DAG.getNode(ISD::BITCAST, dl, VT, Op);
+
+ // VECTOR_REG_CAST(VECTOR_REG_CAST(x)) == VECTOR_REG_CAST(x)
+ if (Op->getOpcode() == ARMISD::VECTOR_REG_CAST) {
+ // If the valuetypes are the same, we can remove the cast entirely.
+ if (Op->getOperand(0).getValueType() == VT)
+ return Op->getOperand(0);
+ return DCI.DAG.getNode(ARMISD::VECTOR_REG_CAST, dl, VT, Op->getOperand(0));
}
return SDValue();
@@ -13000,6 +13722,29 @@ static SDValue PerformInsertEltCombine(SDNode *N,
return DAG.getNode(ISD::BITCAST, dl, VT, InsElt);
}
+static SDValue PerformExtractEltCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+ SDLoc dl(N);
+
+ // extract (vdup x) -> x
+ if (Op0->getOpcode() == ARMISD::VDUP) {
+ SDValue X = Op0->getOperand(0);
+ if (VT == MVT::f16 && X.getValueType() == MVT::i32)
+ return DCI.DAG.getNode(ARMISD::VMOVhr, dl, VT, X);
+ if (VT == MVT::i32 && X.getValueType() == MVT::f16)
+ return DCI.DAG.getNode(ARMISD::VMOVrh, dl, VT, X);
+
+ while (X.getValueType() != VT && X->getOpcode() == ISD::BITCAST)
+ X = X->getOperand(0);
+ if (X.getValueType() == VT)
+ return X;
+ }
+
+ return SDValue();
+}
+
/// PerformVECTOR_SHUFFLECombine - Target-specific dag combine xforms for
/// ISD::VECTOR_SHUFFLE.
static SDValue PerformVECTOR_SHUFFLECombine(SDNode *N, SelectionDAG &DAG) {
@@ -13281,6 +14026,128 @@ static SDValue PerformVLDCombine(SDNode *N,
return CombineBaseUpdate(N, DCI);
}
+static SDValue PerformMVEVLDCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ if (DCI.isBeforeLegalize() || DCI.isCalledByLegalizer())
+ return SDValue();
+
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Addr = N->getOperand(2);
+ MemSDNode *MemN = cast<MemSDNode>(N);
+ SDLoc dl(N);
+
+ // For the stores, where there are multiple intrinsics we only actually want
+ // to post-inc the last of them.
+ unsigned IntNo = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ if (IntNo == Intrinsic::arm_mve_vst2q &&
+ cast<ConstantSDNode>(N->getOperand(5))->getZExtValue() != 1)
+ return SDValue();
+ if (IntNo == Intrinsic::arm_mve_vst4q &&
+ cast<ConstantSDNode>(N->getOperand(7))->getZExtValue() != 3)
+ return SDValue();
+
+ // Search for a use of the address operand that is an increment.
+ for (SDNode::use_iterator UI = Addr.getNode()->use_begin(),
+ UE = Addr.getNode()->use_end();
+ UI != UE; ++UI) {
+ SDNode *User = *UI;
+ if (User->getOpcode() != ISD::ADD ||
+ UI.getUse().getResNo() != Addr.getResNo())
+ continue;
+
+ // Check that the add is independent of the load/store. Otherwise, folding
+ // it would create a cycle. We can avoid searching through Addr as it's a
+ // predecessor to both.
+ SmallPtrSet<const SDNode *, 32> Visited;
+ SmallVector<const SDNode *, 16> Worklist;
+ Visited.insert(Addr.getNode());
+ Worklist.push_back(N);
+ Worklist.push_back(User);
+ if (SDNode::hasPredecessorHelper(N, Visited, Worklist) ||
+ SDNode::hasPredecessorHelper(User, Visited, Worklist))
+ continue;
+
+ // Find the new opcode for the updating load/store.
+ bool isLoadOp = true;
+ unsigned NewOpc = 0;
+ unsigned NumVecs = 0;
+ switch (IntNo) {
+ default:
+ llvm_unreachable("unexpected intrinsic for MVE VLDn combine");
+ case Intrinsic::arm_mve_vld2q:
+ NewOpc = ARMISD::VLD2_UPD;
+ NumVecs = 2;
+ break;
+ case Intrinsic::arm_mve_vld4q:
+ NewOpc = ARMISD::VLD4_UPD;
+ NumVecs = 4;
+ break;
+ case Intrinsic::arm_mve_vst2q:
+ NewOpc = ARMISD::VST2_UPD;
+ NumVecs = 2;
+ isLoadOp = false;
+ break;
+ case Intrinsic::arm_mve_vst4q:
+ NewOpc = ARMISD::VST4_UPD;
+ NumVecs = 4;
+ isLoadOp = false;
+ break;
+ }
+
+ // Find the size of memory referenced by the load/store.
+ EVT VecTy;
+ if (isLoadOp) {
+ VecTy = N->getValueType(0);
+ } else {
+ VecTy = N->getOperand(3).getValueType();
+ }
+
+ unsigned NumBytes = NumVecs * VecTy.getSizeInBits() / 8;
+
+ // If the increment is a constant, it must match the memory ref size.
+ SDValue Inc = User->getOperand(User->getOperand(0) == Addr ? 1 : 0);
+ ConstantSDNode *CInc = dyn_cast<ConstantSDNode>(Inc.getNode());
+ if (!CInc || CInc->getZExtValue() != NumBytes)
+ continue;
+
+ // Create the new updating load/store node.
+ // First, create an SDVTList for the new updating node's results.
+ EVT Tys[6];
+ unsigned NumResultVecs = (isLoadOp ? NumVecs : 0);
+ unsigned n;
+ for (n = 0; n < NumResultVecs; ++n)
+ Tys[n] = VecTy;
+ Tys[n++] = MVT::i32;
+ Tys[n] = MVT::Other;
+ SDVTList SDTys = DAG.getVTList(makeArrayRef(Tys, NumResultVecs + 2));
+
+ // Then, gather the new node's operands.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(N->getOperand(0)); // incoming chain
+ Ops.push_back(N->getOperand(2)); // ptr
+ Ops.push_back(Inc);
+
+ for (unsigned i = 3; i < N->getNumOperands(); ++i)
+ Ops.push_back(N->getOperand(i));
+
+ SDValue UpdN = DAG.getMemIntrinsicNode(NewOpc, dl, SDTys, Ops, VecTy,
+ MemN->getMemOperand());
+
+ // Update the uses.
+ SmallVector<SDValue, 5> NewResults;
+ for (unsigned i = 0; i < NumResultVecs; ++i)
+ NewResults.push_back(SDValue(UpdN.getNode(), i));
+
+ NewResults.push_back(SDValue(UpdN.getNode(), NumResultVecs + 1)); // chain
+ DCI.CombineTo(N, NewResults);
+ DCI.CombineTo(User, SDValue(UpdN.getNode(), NumResultVecs));
+
+ break;
+ }
+
+ return SDValue();
+}
+
/// CombineVLDDUP - For a VDUPLANE node N, check if its source operand is a
/// vldN-lane (N > 1) intrinsic, and if all the other uses of that intrinsic
/// are also VDUPLANEs. If so, combine them to a vldN-dup operation and
@@ -13365,8 +14232,21 @@ static bool CombineVLDDUP(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
/// PerformVDUPLANECombine - Target-specific dag combine xforms for
/// ARMISD::VDUPLANE.
static SDValue PerformVDUPLANECombine(SDNode *N,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const ARMSubtarget *Subtarget) {
SDValue Op = N->getOperand(0);
+ EVT VT = N->getValueType(0);
+
+ // On MVE, we just convert the VDUPLANE to a VDUP with an extract.
+ if (Subtarget->hasMVEIntegerOps()) {
+ EVT ExtractVT = VT.getVectorElementType();
+ // We need to ensure we are creating a legal type.
+ if (!DCI.DAG.getTargetLoweringInfo().isTypeLegal(ExtractVT))
+ ExtractVT = MVT::i32;
+ SDValue Extract = DCI.DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(N), ExtractVT,
+ N->getOperand(0), N->getOperand(1));
+ return DCI.DAG.getNode(ARMISD::VDUP, SDLoc(N), VT, Extract);
+ }
// If the source is a vldN-lane (N > 1) intrinsic, and all the other uses
// of that intrinsic are also VDUPLANEs, combine them to a vldN-dup operation.
@@ -13387,7 +14267,6 @@ static SDValue PerformVDUPLANECombine(SDNode *N,
unsigned EltBits;
if (ARM_AM::decodeVMOVModImm(Imm, EltBits) == 0)
EltSize = 8;
- EVT VT = N->getValueType(0);
if (EltSize > VT.getScalarSizeInBits())
return SDValue();
@@ -13400,6 +14279,18 @@ static SDValue PerformVDUPCombine(SDNode *N,
const ARMSubtarget *Subtarget) {
SelectionDAG &DAG = DCI.DAG;
SDValue Op = N->getOperand(0);
+ SDLoc dl(N);
+
+ if (Subtarget->hasMVEIntegerOps()) {
+ // Convert VDUP f32 -> VDUP BITCAST i32 under MVE, as we know the value will
+ // need to come from a GPR.
+ if (Op.getValueType() == MVT::f32)
+ return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+ DAG.getNode(ISD::BITCAST, dl, MVT::i32, Op));
+ else if (Op.getValueType() == MVT::f16)
+ return DCI.DAG.getNode(ARMISD::VDUP, dl, N->getValueType(0),
+ DAG.getNode(ARMISD::VMOVrh, dl, MVT::i32, Op));
+ }
if (!Subtarget->hasNEON())
return SDValue();
@@ -13528,7 +14419,7 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
if (!St->isSimple() || St->isTruncatingStore() || !St->isUnindexed())
return SDValue();
SDValue Trunc = St->getValue();
- if (Trunc->getOpcode() != ISD::TRUNCATE)
+ if (Trunc->getOpcode() != ISD::TRUNCATE && Trunc->getOpcode() != ISD::FP_ROUND)
return SDValue();
EVT FromVT = Trunc->getOperand(0).getValueType();
EVT ToVT = Trunc.getValueType();
@@ -13543,20 +14434,54 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
NumElements = 4;
if (FromEltVT == MVT::i16 && ToEltVT == MVT::i8)
NumElements = 8;
- if (NumElements == 0 || FromVT.getVectorNumElements() == NumElements ||
+ if (FromEltVT == MVT::f32 && ToEltVT == MVT::f16)
+ NumElements = 4;
+ if (NumElements == 0 ||
+ (FromEltVT != MVT::f32 && FromVT.getVectorNumElements() == NumElements) ||
FromVT.getVectorNumElements() % NumElements != 0)
return SDValue();
+ // Test if the Trunc will be convertible to a VMOVN with a shuffle, and if so
+ // use the VMOVN over splitting the store. We are looking for patterns of:
+ // !rev: 0 N 1 N+1 2 N+2 ...
+ // rev: N 0 N+1 1 N+2 2 ...
+ auto isVMOVNOriginalMask = [&](ArrayRef<int> M, bool rev) {
+ unsigned NumElts = ToVT.getVectorNumElements();
+ if (NumElts != M.size())
+ return false;
+
+ unsigned Off0 = rev ? NumElts : 0;
+ unsigned Off1 = rev ? 0 : NumElts;
+
+ for (unsigned i = 0; i < NumElts; i += 2) {
+ if (M[i] >= 0 && M[i] != (int)(Off0 + i / 2))
+ return false;
+ if (M[i + 1] >= 0 && M[i + 1] != (int)(Off1 + i / 2))
+ return false;
+ }
+
+ return true;
+ };
+
+ if (auto *Shuffle = dyn_cast<ShuffleVectorSDNode>(Trunc->getOperand(0)))
+ if (isVMOVNOriginalMask(Shuffle->getMask(), false) ||
+ isVMOVNOriginalMask(Shuffle->getMask(), true))
+ return SDValue();
+
+ LLVMContext &C = *DAG.getContext();
SDLoc DL(St);
// Details about the old store
SDValue Ch = St->getChain();
SDValue BasePtr = St->getBasePtr();
- unsigned Alignment = St->getOriginalAlignment();
+ Align Alignment = St->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = St->getMemOperand()->getFlags();
AAMDNodes AAInfo = St->getAAInfo();
- EVT NewFromVT = EVT::getVectorVT(*DAG.getContext(), FromEltVT, NumElements);
- EVT NewToVT = EVT::getVectorVT(*DAG.getContext(), ToEltVT, NumElements);
+ // We split the store into slices of NumElements. fp16 trunc stores are first
+ // converted with a VCVT and then stored as truncating integer stores.
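+ // For example, a v8f32 value stored as v8f16 is split into two v4f32 slices;
+ // each slice is narrowed to f16 lanes with a VCVTN, reg-cast to v4i32, and
+ // stored with a v4i32 -> v4i16 truncating store.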
+ EVT NewFromVT = EVT::getVectorVT(C, FromEltVT, NumElements);
+ EVT NewToVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, ToEltVT.getSizeInBits()), NumElements);
SmallVector<SDValue, 4> Stores;
for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
@@ -13566,9 +14491,17 @@ static SDValue PerformSplittingToNarrowingStores(StoreSDNode *St,
SDValue Extract =
DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, NewFromVT, Trunc.getOperand(0),
DAG.getConstant(i * NumElements, DL, MVT::i32));
+
+ if (ToEltVT == MVT::f16) {
+ SDValue FPTrunc =
+ DAG.getNode(ARMISD::VCVTN, DL, MVT::v8f16, DAG.getUNDEF(MVT::v8f16),
+ Extract, DAG.getConstant(0, DL, MVT::i32));
+ Extract = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v4i32, FPTrunc);
+ }
+
SDValue Store = DAG.getTruncStore(
Ch, DL, Extract, NewPtr, St->getPointerInfo().getWithOffset(NewOffset),
- NewToVT, Alignment, MMOFlags, AAInfo);
+ NewToVT, Alignment.value(), MMOFlags, AAInfo);
Stores.push_back(Store);
}
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Stores);
@@ -13766,8 +14699,163 @@ static SDValue PerformVDIVCombine(SDNode *N, SelectionDAG &DAG,
ConvInput, DAG.getConstant(C, dl, MVT::i32));
}
+static SDValue PerformVECREDUCE_ADDCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ assert(N->getOpcode() == ISD::VECREDUCE_ADD);
+ EVT ResVT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDLoc dl(N);
+
+ // We are looking for something that will have illegal types if left alone,
+ // but that we can convert to a single instruction under MVE. For example
+ // vecreduce_add(sext(A, v8i32)) => VADDV.s16 A
+ // or
+ // vecreduce_add(mul(zext(A, v16i32), zext(B, v16i32))) => VMLADAV.u8 A, B
+
+ // Cases:
+ // VADDV u/s 8/16/32
+ // VMLAV u/s 8/16/32
+ // VADDLV u/s 32
+ // VMLALV u/s 16/32
+
+ auto IsVADDV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes) {
+ if (ResVT != RetTy || N0->getOpcode() != ExtendCode)
+ return SDValue();
+ SDValue A = N0->getOperand(0);
+ if (llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ return A;
+ return SDValue();
+ };
+ auto IsVMLAV = [&](MVT RetTy, unsigned ExtendCode, ArrayRef<MVT> ExtTypes,
+ SDValue &A, SDValue &B) {
+ if (ResVT != RetTy || N0->getOpcode() != ISD::MUL)
+ return false;
+ SDValue ExtA = N0->getOperand(0);
+ SDValue ExtB = N0->getOperand(1);
+ if (ExtA->getOpcode() != ExtendCode && ExtB->getOpcode() != ExtendCode)
+ return false;
+ A = ExtA->getOperand(0);
+ B = ExtB->getOperand(0);
+ if (A.getValueType() == B.getValueType() &&
+ llvm::any_of(ExtTypes, [&A](MVT Ty) { return A.getValueType() == Ty; }))
+ return true;
+ return false;
+ };
+ auto Create64bitNode = [&](unsigned Opcode, ArrayRef<SDValue> Ops) {
+ SDValue Node = DAG.getNode(Opcode, dl, {MVT::i32, MVT::i32}, Ops);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Node,
+ SDValue(Node.getNode(), 1));
+ };
+
+ if (SDValue A = IsVADDV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}))
+ return DAG.getNode(ARMISD::VADDVs, dl, ResVT, A);
+ if (SDValue A = IsVADDV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}))
+ return DAG.getNode(ARMISD::VADDVu, dl, ResVT, A);
+ if (SDValue A = IsVADDV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v4i32}))
+ return Create64bitNode(ARMISD::VADDLVs, {A});
+ if (SDValue A = IsVADDV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v4i32}))
+ return Create64bitNode(ARMISD::VADDLVu, {A});
+
+ SDValue A, B;
+ if (IsVMLAV(MVT::i32, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
+ return DAG.getNode(ARMISD::VMLAVs, dl, ResVT, A, B);
+ if (IsVMLAV(MVT::i32, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v16i8}, A, B))
+ return DAG.getNode(ARMISD::VMLAVu, dl, ResVT, A, B);
+ if (IsVMLAV(MVT::i64, ISD::SIGN_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+ return Create64bitNode(ARMISD::VMLALVs, {A, B});
+ if (IsVMLAV(MVT::i64, ISD::ZERO_EXTEND, {MVT::v8i16, MVT::v4i32}, A, B))
+ return Create64bitNode(ARMISD::VMLALVu, {A, B});
+ return SDValue();
+}
+
+static SDValue PerformVMOVNCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+ unsigned IsTop = N->getConstantOperandVal(2);
+
+ // VMOVNt(c, VQMOVNb(a, b)) => VQMOVNt(c, b)
+ // VMOVNb(c, VQMOVNb(a, b)) => VQMOVNb(c, b)
+ if ((Op1->getOpcode() == ARMISD::VQMOVNs ||
+ Op1->getOpcode() == ARMISD::VQMOVNu) &&
+ Op1->getConstantOperandVal(2) == 0)
+ return DCI.DAG.getNode(Op1->getOpcode(), SDLoc(Op1), N->getValueType(0),
+ Op0, Op1->getOperand(1), N->getOperand(2));
+
+ // Only the bottom lanes from Qm (Op1), and either the top or bottom lanes
+ // from Qd (Op0), are demanded by a VMOVN, depending on whether we are
+ // inserting into the top or bottom lanes.
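+ // For example, for a v8i16 VMOVNt only the even lanes of Qm are demanded
+ // (mask 0b01010101), and the same even lanes of Qd, since the odd lanes of
+ // Qd are about to be overwritten by the insert.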
+ unsigned NumElts = N->getValueType(0).getVectorNumElements();
+ APInt Op1DemandedElts = APInt::getSplat(NumElts, APInt::getLowBitsSet(2, 1));
+ APInt Op0DemandedElts =
+ IsTop ? Op1DemandedElts
+ : APInt::getSplat(NumElts, APInt::getHighBitsSet(2, 1));
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+ if (TLI.SimplifyDemandedVectorElts(Op1, Op1DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+
+ return SDValue();
+}
+
+static SDValue PerformVQMOVNCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ SDValue Op0 = N->getOperand(0);
+ unsigned IsTop = N->getConstantOperandVal(2);
+
+ unsigned NumElts = N->getValueType(0).getVectorNumElements();
+ APInt Op0DemandedElts =
+ APInt::getSplat(NumElts, IsTop ? APInt::getLowBitsSet(2, 1)
+ : APInt::getHighBitsSet(2, 1));
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(Op0, Op0DemandedElts, KnownUndef,
+ KnownZero, DCI))
+ return SDValue(N, 0);
+ return SDValue();
+}
+
+static SDValue PerformLongShiftCombine(SDNode *N, SelectionDAG &DAG) {
+ SDLoc DL(N);
+ SDValue Op0 = N->getOperand(0);
+ SDValue Op1 = N->getOperand(1);
+
+ // Turn X << -C -> X >> C and vice versa. The negative shifts can come up from
+ // uses of the intrinsics.
+ if (auto C = dyn_cast<ConstantSDNode>(N->getOperand(2))) {
+ int ShiftAmt = C->getSExtValue();
+ if (ShiftAmt == 0) {
+ SDValue Merge = DAG.getMergeValues({Op0, Op1}, DL);
+ DAG.ReplaceAllUsesWith(N, Merge.getNode());
+ return SDValue();
+ }
+
+ if (ShiftAmt >= -32 && ShiftAmt < 0) {
+ unsigned NewOpcode =
+ N->getOpcode() == ARMISD::LSLL ? ARMISD::LSRL : ARMISD::LSLL;
+ SDValue NewShift = DAG.getNode(NewOpcode, DL, N->getVTList(), Op0, Op1,
+ DAG.getConstant(-ShiftAmt, DL, MVT::i32));
+ DAG.ReplaceAllUsesWith(N, NewShift.getNode());
+ return NewShift;
+ }
+ }
+
+ return SDValue();
+}
+
/// PerformIntrinsicCombine - ARM-specific DAG combining for intrinsics.
-static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
+SDValue ARMTargetLowering::PerformIntrinsicCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
unsigned IntNo = cast<ConstantSDNode>(N->getOperand(0))->getZExtValue();
switch (IntNo) {
default:
@@ -13916,6 +15004,72 @@ static SDValue PerformIntrinsicCombine(SDNode *N, SelectionDAG &DAG) {
case Intrinsic::arm_neon_vqrshiftu:
// No immediate versions of these to check for.
break;
+
+ case Intrinsic::arm_mve_vqdmlah:
+ case Intrinsic::arm_mve_vqdmlash:
+ case Intrinsic::arm_mve_vqrdmlah:
+ case Intrinsic::arm_mve_vqrdmlash:
+ case Intrinsic::arm_mve_vmla_n_predicated:
+ case Intrinsic::arm_mve_vmlas_n_predicated:
+ case Intrinsic::arm_mve_vqdmlah_predicated:
+ case Intrinsic::arm_mve_vqdmlash_predicated:
+ case Intrinsic::arm_mve_vqrdmlah_predicated:
+ case Intrinsic::arm_mve_vqrdmlash_predicated: {
+ // These intrinsics all take an i32 scalar operand which is narrowed to the
+ // size of a single lane of the vector type they return. So we don't need
+ // any bits of that operand above that point, which allows us to eliminate
+ // uxth/sxth.
+ unsigned BitWidth = N->getValueType(0).getScalarSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+ if (SimplifyDemandedBits(N->getOperand(3), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+
+ case Intrinsic::arm_mve_minv:
+ case Intrinsic::arm_mve_maxv:
+ case Intrinsic::arm_mve_minav:
+ case Intrinsic::arm_mve_maxav:
+ case Intrinsic::arm_mve_minv_predicated:
+ case Intrinsic::arm_mve_maxv_predicated:
+ case Intrinsic::arm_mve_minav_predicated:
+ case Intrinsic::arm_mve_maxav_predicated: {
+ // These intrinsics all take an i32 scalar operand which is narrowed to the
+ // size of a single lane of the vector type they take as the other input.
+ unsigned BitWidth = N->getOperand(2)->getValueType(0).getScalarSizeInBits();
+ APInt DemandedMask = APInt::getLowBitsSet(32, BitWidth);
+ if (SimplifyDemandedBits(N->getOperand(1), DemandedMask, DCI))
+ return SDValue();
+ break;
+ }
+
+ case Intrinsic::arm_mve_addv: {
+ // Turn this intrinsic straight into the appropriate ARMISD::VADDV node,
+ // which allows PerformADDVecReduce to turn it into VADDLV when possible.
+ bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Opc = Unsigned ? ARMISD::VADDVu : ARMISD::VADDVs;
+ return DAG.getNode(Opc, SDLoc(N), N->getVTList(), N->getOperand(1));
+ }
+
+ case Intrinsic::arm_mve_addlv:
+ case Intrinsic::arm_mve_addlv_predicated: {
+ // Same for these, but ARMISD::VADDLV has to be followed by a BUILD_PAIR
+ // which recombines the two outputs into an i64.
+ bool Unsigned = cast<ConstantSDNode>(N->getOperand(2))->getZExtValue();
+ unsigned Opc = IntNo == Intrinsic::arm_mve_addlv ?
+ (Unsigned ? ARMISD::VADDLVu : ARMISD::VADDLVs) :
+ (Unsigned ? ARMISD::VADDLVpu : ARMISD::VADDLVps);
+
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 1, e = N->getNumOperands(); i < e; i++)
+ if (i != 2) // skip the unsigned flag
+ Ops.push_back(N->getOperand(i));
+
+ SDLoc dl(N);
+ SDValue val = DAG.getNode(Opc, dl, {MVT::i32, MVT::i32}, Ops);
+ return DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, val.getValue(0),
+ val.getValue(1));
+ }
}
return SDValue();
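
For the addlv cases above, result 0 of the VADDLV node carries the low 32 bits of the sum and result 1 the high 32 bits; the BUILD_PAIR simply glues them back into an i64. A scalar sketch of that recombination (assuming that result ordering, which is what the operand order of the BUILD_PAIR implies):

    #include <cstdint>

    // Sketch: what the ISD::BUILD_PAIR above computes from the two i32 results.
    static uint64_t recombineHalves(uint32_t Lo, uint32_t Hi) {
      return (uint64_t)Hi << 32 | Lo;
    }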
@@ -14011,9 +15165,10 @@ static SDValue PerformShiftCombine(SDNode *N,
return SDValue();
}
-// Look for a sign/zero extend of a larger than legal load. This can be split
-// into two extending loads, which are simpler to deal with than an arbitrary
-// sign extend.
+// Look for a sign/zero/fp extend of a larger-than-legal load. This can be
+// split into multiple extending loads, which are simpler to deal with than an
+// arbitrary extend. For fp extends we use an integer extending load and a VCVTL
+// to convert the type to an f32.
static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
if (N0.getOpcode() != ISD::LOAD)
@@ -14035,45 +15190,63 @@ static SDValue PerformSplittingToWideningLoad(SDNode *N, SelectionDAG &DAG) {
NumElements = 4;
if (ToEltVT == MVT::i16 && FromEltVT == MVT::i8)
NumElements = 8;
+ if (ToEltVT == MVT::f32 && FromEltVT == MVT::f16)
+ NumElements = 4;
if (NumElements == 0 ||
- FromVT.getVectorNumElements() == NumElements ||
+ (FromEltVT != MVT::f16 && FromVT.getVectorNumElements() == NumElements) ||
FromVT.getVectorNumElements() % NumElements != 0 ||
!isPowerOf2_32(NumElements))
return SDValue();
+ LLVMContext &C = *DAG.getContext();
SDLoc DL(LD);
// Details about the old load
SDValue Ch = LD->getChain();
SDValue BasePtr = LD->getBasePtr();
- unsigned Alignment = LD->getOriginalAlignment();
+ Align Alignment = LD->getOriginalAlign();
MachineMemOperand::Flags MMOFlags = LD->getMemOperand()->getFlags();
AAMDNodes AAInfo = LD->getAAInfo();
ISD::LoadExtType NewExtType =
N->getOpcode() == ISD::SIGN_EXTEND ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
SDValue Offset = DAG.getUNDEF(BasePtr.getValueType());
- EVT NewFromVT = FromVT.getHalfNumVectorElementsVT(*DAG.getContext());
- EVT NewToVT = ToVT.getHalfNumVectorElementsVT(*DAG.getContext());
- unsigned NewOffset = NewFromVT.getSizeInBits() / 8;
- SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
-
- // Split the load in half, each side of which is extended separately. This
- // is good enough, as legalisation will take it from there. They are either
- // already legal or they will be split further into something that is
- // legal.
- SDValue NewLoad1 =
- DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, BasePtr, Offset,
- LD->getPointerInfo(), NewFromVT, Alignment, MMOFlags, AAInfo);
- SDValue NewLoad2 =
- DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
- LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
- Alignment, MMOFlags, AAInfo);
-
- SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other,
- SDValue(NewLoad1.getNode(), 1),
- SDValue(NewLoad2.getNode(), 1));
+ EVT NewFromVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, FromEltVT.getScalarSizeInBits()), NumElements);
+ EVT NewToVT = EVT::getVectorVT(
+ C, EVT::getIntegerVT(C, ToEltVT.getScalarSizeInBits()), NumElements);
+
+ SmallVector<SDValue, 4> Loads;
+ SmallVector<SDValue, 4> Chains;
+ for (unsigned i = 0; i < FromVT.getVectorNumElements() / NumElements; i++) {
+ unsigned NewOffset = (i * NewFromVT.getSizeInBits()) / 8;
+ SDValue NewPtr = DAG.getObjectPtrOffset(DL, BasePtr, NewOffset);
+
+ SDValue NewLoad =
+ DAG.getLoad(ISD::UNINDEXED, NewExtType, NewToVT, DL, Ch, NewPtr, Offset,
+ LD->getPointerInfo().getWithOffset(NewOffset), NewFromVT,
+ Alignment.value(), MMOFlags, AAInfo);
+ Loads.push_back(NewLoad);
+ Chains.push_back(SDValue(NewLoad.getNode(), 1));
+ }
+
+ // Float truncs need to be extended with VCVTBs into their floating point types.
+ if (FromEltVT == MVT::f16) {
+ SmallVector<SDValue, 4> Extends;
+
+ for (unsigned i = 0; i < Loads.size(); i++) {
+ SDValue LoadBC =
+ DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, MVT::v8f16, Loads[i]);
+ SDValue FPExt = DAG.getNode(ARMISD::VCVTL, DL, MVT::v4f32, LoadBC,
+ DAG.getConstant(0, DL, MVT::i32));
+ Extends.push_back(FPExt);
+ }
+
+ Loads = Extends;
+ }
+
+ SDValue NewChain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chains);
DAG.ReplaceAllUsesOfValueWith(SDValue(LD, 1), NewChain);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, NewLoad1, NewLoad2);
+ return DAG.getNode(ISD::CONCAT_VECTORS, DL, ToVT, Loads);
}
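
To make the splitting concrete: a zero-extended v16i8 -> v16i16 load, for example, becomes two v8i8 -> v8i16 extending loads at byte offsets 0 and 8 whose results are concatenated, and for f16 sources each piece is additionally widened with a VCVTL. A small sketch of the per-piece offset computed in the loop above (hypothetical helper, not part of this patch):

    // Sketch: byte offset of the i-th split load; each piece covers NumElements
    // lanes of FromEltBits-wide elements, matching (i * NewFromVT bits) / 8.
    static unsigned splitLoadByteOffset(unsigned i, unsigned NumElements,
                                        unsigned FromEltBits) {
      return i * NumElements * FromEltBits / 8; // e.g. i = 1, 8 x i8 -> 8 bytes
    }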
/// PerformExtendCombine - Target-specific DAG combining for ISD::SIGN_EXTEND,
@@ -14121,6 +15294,116 @@ static SDValue PerformExtendCombine(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue PerformFPExtendCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ if (ST->hasMVEFloatOps())
+ if (SDValue NewLoad = PerformSplittingToWideningLoad(N, DAG))
+ return NewLoad;
+
+ return SDValue();
+}
+
+/// PerformMinMaxCombine - Target-specific DAG combining for creating truncating
+/// saturates.
+static SDValue PerformMinMaxCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ if (!ST->hasMVEIntegerOps())
+ return SDValue();
+
+ if (VT != MVT::v4i32 && VT != MVT::v8i16)
+ return SDValue();
+
+ auto IsSignedSaturate = [&](SDNode *Min, SDNode *Max) {
+ // Check one is a smin and the other is a smax
+ if (Min->getOpcode() != ISD::SMIN)
+ std::swap(Min, Max);
+ if (Min->getOpcode() != ISD::SMIN || Max->getOpcode() != ISD::SMAX)
+ return false;
+
+ APInt SaturateC;
+ if (VT == MVT::v4i32)
+ SaturateC = APInt(32, (1 << 15) - 1, true);
+ else //if (VT == MVT::v8i16)
+ SaturateC = APInt(16, (1 << 7) - 1, true);
+
+ APInt MinC, MaxC;
+ if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+ MinC != SaturateC)
+ return false;
+ if (!ISD::isConstantSplatVector(Max->getOperand(1).getNode(), MaxC) ||
+ MaxC != ~SaturateC)
+ return false;
+ return true;
+ };
+
+ if (IsSignedSaturate(N, N0.getNode())) {
+ SDLoc DL(N);
+ MVT ExtVT, HalfVT;
+ if (VT == MVT::v4i32) {
+ HalfVT = MVT::v8i16;
+ ExtVT = MVT::v4i16;
+ } else { // if (VT == MVT::v8i16)
+ HalfVT = MVT::v16i8;
+ ExtVT = MVT::v8i8;
+ }
+
+ // Create a VQMOVNB with undef top lanes, then sign extend into the top
+ // half. That extend will hopefully be removed if only the bottom bits are
+ // demanded (through a truncating store, for example).
+ SDValue VQMOVN =
+ DAG.getNode(ARMISD::VQMOVNs, DL, HalfVT, DAG.getUNDEF(HalfVT),
+ N0->getOperand(0), DAG.getConstant(0, DL, MVT::i32));
+ SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, Bitcast,
+ DAG.getValueType(ExtVT));
+ }
+
+ auto IsUnsignedSaturate = [&](SDNode *Min) {
+ // For unsigned, we just need to check for <= 0xffff
+ if (Min->getOpcode() != ISD::UMIN)
+ return false;
+
+ APInt SaturateC;
+ if (VT == MVT::v4i32)
+ SaturateC = APInt(32, (1 << 16) - 1, true);
+ else //if (VT == MVT::v8i16)
+ SaturateC = APInt(16, (1 << 8) - 1, true);
+
+ APInt MinC;
+ if (!ISD::isConstantSplatVector(Min->getOperand(1).getNode(), MinC) ||
+ MinC != SaturateC)
+ return false;
+ return true;
+ };
+
+ if (IsUnsignedSaturate(N)) {
+ SDLoc DL(N);
+ MVT HalfVT;
+ unsigned ExtConst;
+ if (VT == MVT::v4i32) {
+ HalfVT = MVT::v8i16;
+ ExtConst = 0x0000FFFF;
+ } else { //if (VT == MVT::v8i16)
+ HalfVT = MVT::v16i8;
+ ExtConst = 0x00FF;
+ }
+
+ // Create a VQMOVNB with undef top lanes, then ZExt into the top half with
+ // an AND. That extend will hopefully be removed if only the bottom bits are
+ // demanded (through a truncating store, for example).
+ SDValue VQMOVN =
+ DAG.getNode(ARMISD::VQMOVNu, DL, HalfVT, DAG.getUNDEF(HalfVT), N0,
+ DAG.getConstant(0, DL, MVT::i32));
+ SDValue Bitcast = DAG.getNode(ARMISD::VECTOR_REG_CAST, DL, VT, VQMOVN);
+ return DAG.getNode(ISD::AND, DL, VT, Bitcast,
+ DAG.getConstant(ExtConst, DL, VT));
+ }
+
+ return SDValue();
+}
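
The two patterns matched above are the ordinary lane-wise clamp idioms: a signed clamp to the half-width range maps onto the VQMOVNs node, and an unsigned min against the half-width maximum onto VQMOVNu. A scalar model of what one v8i16 lane computes (illustrative only, not part of this patch):

    #include <algorithm>
    #include <cstdint>

    // Signed: smax(smin(x, 127), -128) per i16 lane narrows saturating to i8.
    static int16_t signedSatNarrowLane(int16_t X) {
      return std::clamp<int16_t>(X, -128, 127);
    }

    // Unsigned: umin(x, 0xff) per i16 lane narrows saturating to u8.
    static uint16_t unsignedSatNarrowLane(uint16_t X) {
      return std::min<uint16_t>(X, 0xff);
    }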
+
static const APInt *isPowerOf2Constant(SDValue V) {
ConstantSDNode *C = dyn_cast<ConstantSDNode>(V);
if (!C)
@@ -14602,10 +15885,41 @@ ARMTargetLowering::PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const {
return Res;
}
+static SDValue PerformBITCASTCombine(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *ST) {
+ SDValue Src = N->getOperand(0);
+ EVT DstVT = N->getValueType(0);
+
+ // Convert v4f32 bitcast (v4i32 vdup (i32)) -> v4f32 vdup (i32) under MVE.
+ if (ST->hasMVEIntegerOps() && Src.getOpcode() == ARMISD::VDUP) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.getScalarSizeInBits() == DstVT.getScalarSizeInBits())
+ return DAG.getNode(ARMISD::VDUP, SDLoc(N), DstVT, Src.getOperand(0));
+ }
+
+ // We may have a bitcast of something that has already had this bitcast
+ // combine performed on it, so skip past any VECTOR_REG_CASTs.
+ while (Src.getOpcode() == ARMISD::VECTOR_REG_CAST)
+ Src = Src.getOperand(0);
+
+ // Bitcast from element-wise VMOV or VMVN doesn't need VREV if the VREV that
+ // would be generated is at least the width of the element type.
+ EVT SrcVT = Src.getValueType();
+ if ((Src.getOpcode() == ARMISD::VMOVIMM ||
+ Src.getOpcode() == ARMISD::VMVNIMM ||
+ Src.getOpcode() == ARMISD::VMOVFPIMM) &&
+ SrcVT.getScalarSizeInBits() <= DstVT.getScalarSizeInBits() &&
+ DAG.getDataLayout().isBigEndian())
+ return DAG.getNode(ARMISD::VECTOR_REG_CAST, SDLoc(N), DstVT, Src);
+
+ return SDValue();
+}
+
SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
switch (N->getOpcode()) {
default: break;
+ case ISD::VSELECT: return PerformVSELECTCombine(N, DCI, Subtarget);
case ISD::ABS: return PerformABSCombine(N, DCI, Subtarget);
case ARMISD::ADDE: return PerformADDECombine(N, DCI, Subtarget);
case ARMISD::UMLAL: return PerformUMLALCombine(N, DCI.DAG, Subtarget);
@@ -14623,25 +15937,37 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case ARMISD::BFI: return PerformBFICombine(N, DCI);
case ARMISD::VMOVRRD: return PerformVMOVRRDCombine(N, DCI, Subtarget);
case ARMISD::VMOVDRR: return PerformVMOVDRRCombine(N, DCI.DAG);
+ case ARMISD::VMOVhr: return PerformVMOVhrCombine(N, DCI);
+ case ARMISD::VMOVrh: return PerformVMOVrhCombine(N, DCI);
case ISD::STORE: return PerformSTORECombine(N, DCI, Subtarget);
case ISD::BUILD_VECTOR: return PerformBUILD_VECTORCombine(N, DCI, Subtarget);
case ISD::INSERT_VECTOR_ELT: return PerformInsertEltCombine(N, DCI);
+ case ISD::EXTRACT_VECTOR_ELT: return PerformExtractEltCombine(N, DCI);
case ISD::VECTOR_SHUFFLE: return PerformVECTOR_SHUFFLECombine(N, DCI.DAG);
- case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI);
+ case ARMISD::VDUPLANE: return PerformVDUPLANECombine(N, DCI, Subtarget);
case ARMISD::VDUP: return PerformVDUPCombine(N, DCI, Subtarget);
case ISD::FP_TO_SINT:
case ISD::FP_TO_UINT:
return PerformVCVTCombine(N, DCI.DAG, Subtarget);
case ISD::FDIV:
return PerformVDIVCombine(N, DCI.DAG, Subtarget);
- case ISD::INTRINSIC_WO_CHAIN: return PerformIntrinsicCombine(N, DCI.DAG);
+ case ISD::INTRINSIC_WO_CHAIN:
+ return PerformIntrinsicCombine(N, DCI);
case ISD::SHL:
case ISD::SRA:
case ISD::SRL:
return PerformShiftCombine(N, DCI, Subtarget);
case ISD::SIGN_EXTEND:
case ISD::ZERO_EXTEND:
- case ISD::ANY_EXTEND: return PerformExtendCombine(N, DCI.DAG, Subtarget);
+ case ISD::ANY_EXTEND:
+ return PerformExtendCombine(N, DCI.DAG, Subtarget);
+ case ISD::FP_EXTEND:
+ return PerformFPExtendCombine(N, DCI.DAG, Subtarget);
+ case ISD::SMIN:
+ case ISD::UMIN:
+ case ISD::SMAX:
+ case ISD::UMAX:
+ return PerformMinMaxCombine(N, DCI.DAG, Subtarget);
case ARMISD::CMOV: return PerformCMOVCombine(N, DCI.DAG);
case ARMISD::BRCOND: return PerformBRCONDCombine(N, DCI.DAG);
case ISD::LOAD: return PerformLOADCombine(N, DCI);
@@ -14652,10 +15978,25 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
return PerformVLDCombine(N, DCI);
case ARMISD::BUILD_VECTOR:
return PerformARMBUILD_VECTORCombine(N, DCI);
+ case ISD::BITCAST:
+ return PerformBITCASTCombine(N, DCI.DAG, Subtarget);
case ARMISD::PREDICATE_CAST:
return PerformPREDICATE_CASTCombine(N, DCI);
+ case ARMISD::VECTOR_REG_CAST:
+ return PerformVECTOR_REG_CASTCombine(N, DCI, Subtarget);
case ARMISD::VCMP:
return PerformVCMPCombine(N, DCI, Subtarget);
+ case ISD::VECREDUCE_ADD:
+ return PerformVECREDUCE_ADDCombine(N, DCI.DAG, Subtarget);
+ case ARMISD::VMOVN:
+ return PerformVMOVNCombine(N, DCI);
+ case ARMISD::VQMOVNs:
+ case ARMISD::VQMOVNu:
+ return PerformVQMOVNCombine(N, DCI);
+ case ARMISD::ASRL:
+ case ARMISD::LSRL:
+ case ARMISD::LSLL:
+ return PerformLongShiftCombine(N, DCI.DAG);
case ARMISD::SMULWB: {
unsigned BitWidth = N->getValueType(0).getSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, 16);
@@ -14744,6 +16085,11 @@ SDValue ARMTargetLowering::PerformDAGCombine(SDNode *N,
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane:
return PerformVLDCombine(N, DCI);
+ case Intrinsic::arm_mve_vld2q:
+ case Intrinsic::arm_mve_vld4q:
+ case Intrinsic::arm_mve_vst2q:
+ case Intrinsic::arm_mve_vst4q:
+ return PerformMVEVLDCombine(N, DCI);
default: break;
}
break;
@@ -14827,28 +16173,21 @@ bool ARMTargetLowering::allowsMisalignedMemoryAccesses(EVT VT, unsigned,
return false;
}
-static bool memOpAlign(unsigned DstAlign, unsigned SrcAlign,
- unsigned AlignCheck) {
- return ((SrcAlign == 0 || SrcAlign % AlignCheck == 0) &&
- (DstAlign == 0 || DstAlign % AlignCheck == 0));
-}
EVT ARMTargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
// See if we can use NEON instructions for this...
- if ((!IsMemset || ZeroMemset) && Subtarget->hasNEON() &&
+ if ((Op.isMemcpy() || Op.isZeroMemset()) && Subtarget->hasNEON() &&
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
bool Fast;
- if (Size >= 16 &&
- (memOpAlign(SrcAlign, DstAlign, 16) ||
+ if (Op.size() >= 16 &&
+ (Op.isAligned(Align(16)) ||
(allowsMisalignedMemoryAccesses(MVT::v2f64, 0, 1,
MachineMemOperand::MONone, &Fast) &&
Fast))) {
return MVT::v2f64;
- } else if (Size >= 8 &&
- (memOpAlign(SrcAlign, DstAlign, 8) ||
+ } else if (Op.size() >= 8 &&
+ (Op.isAligned(Align(8)) ||
(allowsMisalignedMemoryAccesses(
MVT::f64, 0, 1, MachineMemOperand::MONone, &Fast) &&
Fast))) {
@@ -14962,45 +16301,97 @@ bool ARMTargetLowering::shouldSinkOperands(Instruction *I,
if (!Subtarget->hasMVEIntegerOps())
return false;
- auto IsSinker = [](Instruction *I, int Operand) {
+ auto IsFMSMul = [&](Instruction *I) {
+ if (!I->hasOneUse())
+ return false;
+ auto *Sub = cast<Instruction>(*I->users().begin());
+ return Sub->getOpcode() == Instruction::FSub && Sub->getOperand(1) == I;
+ };
+ auto IsFMS = [&](Instruction *I) {
+ if (match(I->getOperand(0), m_FNeg(m_Value())) ||
+ match(I->getOperand(1), m_FNeg(m_Value())))
+ return true;
+ return false;
+ };
+
+ auto IsSinker = [&](Instruction *I, int Operand) {
switch (I->getOpcode()) {
case Instruction::Add:
case Instruction::Mul:
+ case Instruction::FAdd:
case Instruction::ICmp:
+ case Instruction::FCmp:
return true;
+ case Instruction::FMul:
+ return !IsFMSMul(I);
case Instruction::Sub:
+ case Instruction::FSub:
case Instruction::Shl:
case Instruction::LShr:
case Instruction::AShr:
return Operand == 1;
+ case Instruction::Call:
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::fma:
+ return !IsFMS(I);
+ default:
+ return false;
+ }
+ }
+ return false;
default:
return false;
}
};
- int Op = 0;
- if (!isa<ShuffleVectorInst>(I->getOperand(Op)))
- Op = 1;
- if (!IsSinker(I, Op))
- return false;
- if (!match(I->getOperand(Op),
- m_ShuffleVector(m_InsertElement(m_Undef(), m_Value(), m_ZeroInt()),
- m_Undef(), m_Zero()))) {
- return false;
- }
- Instruction *Shuffle = cast<Instruction>(I->getOperand(Op));
- // All uses of the shuffle should be sunk to avoid duplicating it across gpr
- // and vector registers
- for (Use &U : Shuffle->uses()) {
- Instruction *Insn = cast<Instruction>(U.getUser());
- if (!IsSinker(Insn, U.getOperandNo()))
- return false;
+ for (auto OpIdx : enumerate(I->operands())) {
+ Instruction *Op = dyn_cast<Instruction>(OpIdx.value().get());
+ // Make sure we are not already sinking this operand
+ if (!Op || any_of(Ops, [&](Use *U) { return U->get() == Op; }))
+ continue;
+
+ Instruction *Shuffle = Op;
+ if (Shuffle->getOpcode() == Instruction::BitCast)
+ Shuffle = dyn_cast<Instruction>(Shuffle->getOperand(0));
+ // We are looking for a splat that can be sunk.
+ if (!Shuffle ||
+ !match(Shuffle, m_Shuffle(
+ m_InsertElt(m_Undef(), m_Value(), m_ZeroInt()),
+ m_Undef(), m_ZeroMask())))
+ continue;
+ if (!IsSinker(I, OpIdx.index()))
+ continue;
+
+ // All uses of the shuffle should be sunk to avoid duplicating it across gpr
+ // and vector registers
+ for (Use &U : Op->uses()) {
+ Instruction *Insn = cast<Instruction>(U.getUser());
+ if (!IsSinker(Insn, U.getOperandNo()))
+ return false;
+ }
+
+ Ops.push_back(&Shuffle->getOperandUse(0));
+ if (Shuffle != Op)
+ Ops.push_back(&Op->getOperandUse(0));
+ Ops.push_back(&OpIdx.value());
}
- Ops.push_back(&Shuffle->getOperandUse(0));
- Ops.push_back(&I->getOperandUse(Op));
return true;
}
+Type *ARMTargetLowering::shouldConvertSplatType(ShuffleVectorInst *SVI) const {
+ if (!Subtarget->hasMVEIntegerOps())
+ return nullptr;
+ Type *SVIType = SVI->getType();
+ Type *ScalarType = SVIType->getScalarType();
+
+ if (ScalarType->isFloatTy())
+ return Type::getInt32Ty(SVIType->getContext());
+ if (ScalarType->isHalfTy())
+ return Type::getInt16Ty(SVIType->getContext());
+ return nullptr;
+}
+
bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
EVT VT = ExtVal.getValueType();
@@ -15012,6 +16403,9 @@ bool ARMTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
return false;
}
+ if (Subtarget->hasMVEIntegerOps())
+ return true;
+
// Don't create a loadext if we can fold the extension into a wide/long
// instruction.
// If there's more than one user instruction, the loadext is desirable no
@@ -15433,7 +16827,7 @@ static bool getT2IndexedAddressParts(SDNode *Ptr, EVT VT,
return false;
}
-static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
+static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, Align Alignment,
bool isSEXTLoad, bool IsMasked, bool isLE,
SDValue &Base, SDValue &Offset,
bool &isInc, SelectionDAG &DAG) {
@@ -15468,16 +16862,16 @@ static bool getMVEIndexedAddressParts(SDNode *Ptr, EVT VT, unsigned Align,
// (in BE/masked) type.
Base = Ptr->getOperand(0);
if (VT == MVT::v4i16) {
- if (Align >= 2 && IsInRange(RHSC, 0x80, 2))
+ if (Alignment >= 2 && IsInRange(RHSC, 0x80, 2))
return true;
} else if (VT == MVT::v4i8 || VT == MVT::v8i8) {
if (IsInRange(RHSC, 0x80, 1))
return true;
- } else if (Align >= 4 &&
+ } else if (Alignment >= 4 &&
(CanChangeType || VT == MVT::v4i32 || VT == MVT::v4f32) &&
IsInRange(RHSC, 0x80, 4))
return true;
- else if (Align >= 2 &&
+ else if (Alignment >= 2 &&
(CanChangeType || VT == MVT::v8i16 || VT == MVT::v8f16) &&
IsInRange(RHSC, 0x80, 2))
return true;
@@ -15499,28 +16893,28 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
EVT VT;
SDValue Ptr;
- unsigned Align;
+ Align Alignment;
bool isSEXTLoad = false;
bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
Ptr = LD->getBasePtr();
VT = LD->getMemoryVT();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
IsMasked = true;
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
Ptr = ST->getBasePtr();
VT = ST->getMemoryVT();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
IsMasked = true;
} else
return false;
@@ -15529,9 +16923,9 @@ ARMTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Ptr.getNode(), VT, Align, isSEXTLoad,
- IsMasked, Subtarget->isLittle(), Base,
- Offset, isInc, DAG);
+ getMVEIndexedAddressParts(
+ Ptr.getNode(), VT, Alignment, isSEXTLoad, IsMasked,
+ Subtarget->isLittle(), Base, Offset, isInc, DAG);
else {
if (Subtarget->isThumb2())
isLegal = getT2IndexedAddressParts(Ptr.getNode(), VT, isSEXTLoad, Base,
@@ -15557,31 +16951,31 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
SelectionDAG &DAG) const {
EVT VT;
SDValue Ptr;
- unsigned Align;
+ Align Alignment;
bool isSEXTLoad = false, isNonExt;
bool IsMasked = false;
if (LoadSDNode *LD = dyn_cast<LoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
} else if (StoreSDNode *ST = dyn_cast<StoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
isNonExt = !ST->isTruncatingStore();
} else if (MaskedLoadSDNode *LD = dyn_cast<MaskedLoadSDNode>(N)) {
VT = LD->getMemoryVT();
Ptr = LD->getBasePtr();
- Align = LD->getAlignment();
+ Alignment = LD->getAlign();
isSEXTLoad = LD->getExtensionType() == ISD::SEXTLOAD;
isNonExt = LD->getExtensionType() == ISD::NON_EXTLOAD;
IsMasked = true;
} else if (MaskedStoreSDNode *ST = dyn_cast<MaskedStoreSDNode>(N)) {
VT = ST->getMemoryVT();
Ptr = ST->getBasePtr();
- Align = ST->getAlignment();
+ Alignment = ST->getAlign();
isNonExt = !ST->isTruncatingStore();
IsMasked = true;
} else
@@ -15607,7 +17001,7 @@ bool ARMTargetLowering::getPostIndexedAddressParts(SDNode *N, SDNode *Op,
bool isLegal = false;
if (VT.isVector())
isLegal = Subtarget->hasMVEIntegerOps() &&
- getMVEIndexedAddressParts(Op, VT, Align, isSEXTLoad, IsMasked,
+ getMVEIndexedAddressParts(Op, VT, Alignment, isSEXTLoad, IsMasked,
Subtarget->isLittle(), Base, Offset,
isInc, DAG);
else {
@@ -15722,18 +17116,23 @@ void ARMTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
if (Op.getOpcode() == ARMISD::VGETLANEs)
Known = Known.sext(DstSz);
else {
- Known = Known.zext(DstSz, true /* extended bits are known zero */);
+ Known = Known.zext(DstSz);
}
assert(DstSz == Known.getBitWidth());
break;
}
+ case ARMISD::VMOVrh: {
+ KnownBits KnownOp = DAG.computeKnownBits(Op->getOperand(0), Depth + 1);
+ assert(KnownOp.getBitWidth() == 16);
+ Known = KnownOp.zext(32);
+ break;
+ }
}
}
-bool
-ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &DemandedAPInt,
- TargetLoweringOpt &TLO) const {
+bool ARMTargetLowering::targetShrinkDemandedConstant(
+ SDValue Op, const APInt &DemandedBits, const APInt &DemandedElts,
+ TargetLoweringOpt &TLO) const {
// Delay optimization, so we don't have to deal with illegal types, or block
// optimizations.
if (!TLO.LegalOps)
@@ -15758,7 +17157,7 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
unsigned Mask = C->getZExtValue();
- unsigned Demanded = DemandedAPInt.getZExtValue();
+ unsigned Demanded = DemandedBits.getZExtValue();
unsigned ShrunkMask = Mask & Demanded;
unsigned ExpandedMask = Mask | ~Demanded;
@@ -15813,6 +17212,35 @@ ARMTargetLowering::targetShrinkDemandedConstant(SDValue Op,
return false;
}
+bool ARMTargetLowering::SimplifyDemandedBitsForTargetNode(
+ SDValue Op, const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts, KnownBits &Known, TargetLoweringOpt &TLO,
+ unsigned Depth) const {
+ unsigned Opc = Op.getOpcode();
+
+ switch (Opc) {
+ case ARMISD::ASRL:
+ case ARMISD::LSRL: {
+ // If this is result 0 and the other result is unused, see if the demand
+ // bits allow us to shrink this long shift into a standard small shift in
+ // the opposite direction.
+ if (Op.getResNo() == 0 && !Op->hasAnyUseOfValue(1) &&
+ isa<ConstantSDNode>(Op->getOperand(2))) {
+ unsigned ShAmt = Op->getConstantOperandVal(2);
+ if (ShAmt < 32 && OriginalDemandedBits.isSubsetOf(
+ APInt::getAllOnesValue(32) << (32 - ShAmt)))
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(
+ ISD::SHL, SDLoc(Op), MVT::i32, Op.getOperand(1),
+ TLO.DAG.getConstant(32 - ShAmt, SDLoc(Op), MVT::i32)));
+ }
+ break;
+ }
+ }
+
+ return TargetLowering::SimplifyDemandedBitsForTargetNode(
+ Op, OriginalDemandedBits, OriginalDemandedElts, Known, TLO, Depth);
+}
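
The ASRL/LSRL case above rests on a simple identity: the low 32-bit result of shifting the 64-bit value (Hi:Lo) right by ShAmt is (Lo >> ShAmt) | (Hi << (32 - ShAmt)), so its top ShAmt bits depend only on Hi. When those are the only demanded bits, the long shift collapses to the plain SHL built above. A scalar sketch of the identity (assuming 0 < ShAmt < 32 and that operand 1 is the high half, as the replacement implies):

    #include <cassert>
    #include <cstdint>

    // Low 32-bit result of a 64-bit logical shift right of (Hi:Lo) by ShAmt.
    static uint32_t lsrlLow(uint32_t Lo, uint32_t Hi, unsigned ShAmt) {
      return (Lo >> ShAmt) | (Hi << (32 - ShAmt));
    }

    // Its top ShAmt bits agree with a plain left shift of Hi by 32 - ShAmt.
    static void checkTopBits(uint32_t Lo, uint32_t Hi, unsigned ShAmt) {
      uint32_t TopMask = ~0u << (32 - ShAmt);
      assert((lsrlLow(Lo, Hi, ShAmt) & TopMask) ==
             ((Hi << (32 - ShAmt)) & TopMask));
    }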
//===----------------------------------------------------------------------===//
// ARM Inline Assembly Support
@@ -15823,7 +17251,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
if (!Subtarget->hasV6Ops())
return false;
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
std::string AsmStr = IA->getAsmString();
SmallVector<StringRef, 4> AsmPieces;
SplitString(AsmStr, AsmPieces, ";\n");
@@ -15831,7 +17259,7 @@ bool ARMTargetLowering::ExpandInlineAsm(CallInst *CI) const {
switch (AsmPieces.size()) {
default: return false;
case 1:
- AsmStr = AsmPieces[0];
+ AsmStr = std::string(AsmPieces[0]);
AsmPieces.clear();
SplitString(AsmStr, AsmPieces, " \t,");
@@ -16330,13 +17758,15 @@ ARMTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const
if (DAG.getMachineFunction().getFunction().hasFnAttribute(
"no-stack-arg-probe")) {
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
+ MaybeAlign Align =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
SDValue SP = DAG.getCopyFromReg(Chain, DL, ARM::SP, MVT::i32);
Chain = SP.getValue(1);
SP = DAG.getNode(ISD::SUB, DL, MVT::i32, SP, Size);
if (Align)
- SP = DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
- DAG.getConstant(-(uint64_t)Align, DL, MVT::i32));
+ SP =
+ DAG.getNode(ISD::AND, DL, MVT::i32, SP.getValue(0),
+ DAG.getConstant(-(uint64_t)Align->value(), DL, MVT::i32));
Chain = DAG.getCopyToReg(Chain, DL, ARM::SP, SP);
SDValue Ops[2] = { SP, Chain };
return DAG.getMergeValues(Ops, DL);
@@ -16373,6 +17803,18 @@ SDValue ARMTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(!(DstSz == 32 && Subtarget->hasFP16()) &&
"With FP16, 16 to 32 conversion is legal!");
+ // Converting from 32 -> 64 is valid if we have FP64.
+ if (SrcSz == 32 && DstSz == 64 && Subtarget->hasFP64()) {
+ // FIXME: Remove this when we have strict fp instruction selection patterns
+ if (IsStrict) {
+ SDLoc Loc(Op);
+ SDValue Result = DAG.getNode(ISD::FP_EXTEND,
+ Loc, Op.getValueType(), SrcVal);
+ return DAG.getMergeValues({Result, Op.getOperand(0)}, Loc);
+ }
+ return Op;
+ }
+
// Either we are converting from 16 -> 64, without FP16 and/or
// FP.double-precision or without Armv8-fp. So we must do it in two
// steps.
@@ -16528,7 +17970,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
+ Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
// volatile loads with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOLoad;
return true;
@@ -16569,7 +18011,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Value *AlignArg = I.getArgOperand(I.getNumArgOperands() - 1);
- Info.align = MaybeAlign(cast<ConstantInt>(AlignArg)->getZExtValue());
+ Info.align = cast<ConstantInt>(AlignArg)->getMaybeAlignValue();
// volatile stores with NEON intrinsics not supported
Info.flags = MachineMemOperand::MOStore;
return true;
@@ -16595,6 +18037,34 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.flags = MachineMemOperand::MOStore;
return true;
}
+ case Intrinsic::arm_mve_vld2q:
+ case Intrinsic::arm_mve_vld4q: {
+ Info.opc = ISD::INTRINSIC_W_CHAIN;
+ // Conservatively set memVT to the entire set of vectors loaded.
+ Type *VecTy = cast<StructType>(I.getType())->getElementType(1);
+ unsigned Factor = Intrinsic == Intrinsic::arm_mve_vld2q ? 2 : 4;
+ Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(VecTy->getScalarSizeInBits() / 8);
+ // volatile loads with MVE intrinsics not supported
+ Info.flags = MachineMemOperand::MOLoad;
+ return true;
+ }
+ case Intrinsic::arm_mve_vst2q:
+ case Intrinsic::arm_mve_vst4q: {
+ Info.opc = ISD::INTRINSIC_VOID;
+ // Conservatively set memVT to the entire set of vectors stored.
+ Type *VecTy = I.getArgOperand(1)->getType();
+ unsigned Factor = Intrinsic == Intrinsic::arm_mve_vst2q ? 2 : 4;
+ Info.memVT = EVT::getVectorVT(VecTy->getContext(), MVT::i64, Factor * 2);
+ Info.ptrVal = I.getArgOperand(0);
+ Info.offset = 0;
+ Info.align = Align(VecTy->getScalarSizeInBits() / 8);
+ // volatile stores with MVE intrinsics not supported
+ Info.flags = MachineMemOperand::MOStore;
+ return true;
+ }
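
For the two MVE cases above, the conservative memVT simply covers all of the q-registers an interleaving load or store may touch: Factor * 2 lanes of i64, i.e. Factor * 128 bits (512 bits for vld4q/vst4q). A one-line sketch of that size (illustrative only):

    // Sketch: total memory size implied by memVT for an MVE vldNq/vstNq.
    static unsigned mveInterleavedMemBits(unsigned Factor) {
      return Factor * 2 * 64; // e.g. Factor = 4 -> 512 bits, four 128-bit vectors
    }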
case Intrinsic::arm_ldaex:
case Intrinsic::arm_ldrex: {
auto &DL = I.getCalledFunction()->getParent()->getDataLayout();
@@ -16603,7 +18073,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile;
return true;
}
@@ -16615,7 +18085,7 @@ bool ARMTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.memVT = MVT::getVT(PtrTy->getElementType());
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
- Info.align = MaybeAlign(DL.getABITypeAlignment(PtrTy->getElementType()));
+ Info.align = DL.getABITypeAlign(PtrTy->getElementType());
Info.flags = MachineMemOperand::MOStore | MachineMemOperand::MOVolatile;
return true;
}
@@ -16849,7 +18319,7 @@ bool ARMTargetLowering::canCombineStoreAndExtract(Type *VectorTy, Value *Idx,
return false;
assert(VectorTy->isVectorTy() && "VectorTy is not a vector type");
- unsigned BitWidth = cast<VectorType>(VectorTy)->getBitWidth();
+ unsigned BitWidth = VectorTy->getPrimitiveSizeInBits().getFixedSize();
// We can do a store + vector extract on any vector that fits perfectly in a D
// or Q register.
if (BitWidth == 64 || BitWidth == 128) {
@@ -16868,7 +18338,7 @@ bool ARMTargetLowering::isCheapToSpeculateCtlz() const {
}
bool ARMTargetLowering::shouldExpandShift(SelectionDAG &DAG, SDNode *N) const {
- return !Subtarget->hasMinSize();
+ return !Subtarget->hasMinSize() || Subtarget->isTargetWindows();
}
Value *ARMTargetLowering::emitLoadLinked(IRBuilder<> &Builder, Value *Addr,
@@ -16962,7 +18432,7 @@ ARMTargetLowering::getNumInterleavedAccesses(VectorType *VecTy,
}
bool ARMTargetLowering::isLegalInterleavedAccessType(
- unsigned Factor, VectorType *VecTy, const DataLayout &DL) const {
+ unsigned Factor, FixedVectorType *VecTy, const DataLayout &DL) const {
unsigned VecSize = DL.getTypeSizeInBits(VecTy);
unsigned ElSize = DL.getTypeSizeInBits(VecTy->getElementType());
@@ -17021,8 +18491,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
assert(Shuffles.size() == Indices.size() &&
"Unmatched number of shufflevectors and indices");
- VectorType *VecTy = Shuffles[0]->getType();
- Type *EltTy = VecTy->getVectorElementType();
+ auto *VecTy = cast<FixedVectorType>(Shuffles[0]->getType());
+ Type *EltTy = VecTy->getElementType();
const DataLayout &DL = LI->getModule()->getDataLayout();
@@ -17037,8 +18507,7 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// A pointer vector can not be the return type of the ldN intrinsics. Need to
// load integer vectors first and then convert to pointer vectors.
if (EltTy->isPointerTy())
- VecTy =
- VectorType::get(DL.getIntPtrType(EltTy), VecTy->getVectorNumElements());
+ VecTy = FixedVectorType::get(DL.getIntPtrType(EltTy), VecTy);
IRBuilder<> Builder(LI);
@@ -17048,15 +18517,15 @@ bool ARMTargetLowering::lowerInterleavedLoad(
if (NumLoads > 1) {
// If we're going to generate more than one load, reset the sub-vector type
// to something legal.
- VecTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / NumLoads);
+ VecTy = FixedVectorType::get(VecTy->getElementType(),
+ VecTy->getNumElements() / NumLoads);
// We will compute the pointer operand of each load from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace()));
+ BaseAddr,
+ VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace()));
}
assert(isTypeLegal(EVT::getEVT(VecTy)) && "Illegal vldN vector type!");
@@ -17081,8 +18550,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
"expected interleave factor of 2 or 4 for MVE");
Intrinsic::ID LoadInts =
Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
- Type *VecEltTy = VecTy->getVectorElementType()->getPointerTo(
- LI->getPointerAddressSpace());
+ Type *VecEltTy =
+ VecTy->getElementType()->getPointerTo(LI->getPointerAddressSpace());
Type *Tys[] = {VecTy, VecEltTy};
Function *VldnFunc =
Intrinsic::getDeclaration(LI->getModule(), LoadInts, Tys);
@@ -17102,9 +18571,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// If we're generating more than one load, compute the base address of
// subsequent loads as an offset from the previous.
if (LoadCount > 0)
- BaseAddr =
- Builder.CreateConstGEP1_32(VecTy->getVectorElementType(), BaseAddr,
- VecTy->getVectorNumElements() * Factor);
+ BaseAddr = Builder.CreateConstGEP1_32(VecTy->getElementType(), BaseAddr,
+ VecTy->getNumElements() * Factor);
CallInst *VldN = createLoadIntrinsic(BaseAddr);
@@ -17119,8 +18587,8 @@ bool ARMTargetLowering::lowerInterleavedLoad(
// Convert the integer vector to pointer vector if the element is pointer.
if (EltTy->isPointerTy())
SubVec = Builder.CreateIntToPtr(
- SubVec, VectorType::get(SV->getType()->getVectorElementType(),
- VecTy->getVectorNumElements()));
+ SubVec,
+ FixedVectorType::get(SV->getType()->getElementType(), VecTy));
SubVecs[SV].push_back(SubVec);
}
@@ -17172,13 +18640,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- VectorType *VecTy = SVI->getType();
- assert(VecTy->getVectorNumElements() % Factor == 0 &&
- "Invalid interleaved store");
+ auto *VecTy = cast<FixedVectorType>(SVI->getType());
+ assert(VecTy->getNumElements() % Factor == 0 && "Invalid interleaved store");
- unsigned LaneLen = VecTy->getVectorNumElements() / Factor;
- Type *EltTy = VecTy->getVectorElementType();
- VectorType *SubVecTy = VectorType::get(EltTy, LaneLen);
+ unsigned LaneLen = VecTy->getNumElements() / Factor;
+ Type *EltTy = VecTy->getElementType();
+ auto *SubVecTy = FixedVectorType::get(EltTy, LaneLen);
const DataLayout &DL = SI->getModule()->getDataLayout();
@@ -17200,12 +18667,12 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
Type *IntTy = DL.getIntPtrType(EltTy);
// Convert to the corresponding integer vector.
- Type *IntVecTy =
- VectorType::get(IntTy, Op0->getType()->getVectorNumElements());
+ auto *IntVecTy =
+ FixedVectorType::get(IntTy, cast<FixedVectorType>(Op0->getType()));
Op0 = Builder.CreatePtrToInt(Op0, IntVecTy);
Op1 = Builder.CreatePtrToInt(Op1, IntVecTy);
- SubVecTy = VectorType::get(IntTy, LaneLen);
+ SubVecTy = FixedVectorType::get(IntTy, LaneLen);
}
// The base address of the store.
@@ -17215,14 +18682,14 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we're going to generate more than one store, reset the lane length
// and sub-vector type to something legal.
LaneLen /= NumStores;
- SubVecTy = VectorType::get(SubVecTy->getVectorElementType(), LaneLen);
+ SubVecTy = FixedVectorType::get(SubVecTy->getElementType(), LaneLen);
// We will compute the pointer operand of each store from the original base
// address using GEPs. Cast the base address to a pointer to the scalar
// element type.
BaseAddr = Builder.CreateBitCast(
- BaseAddr, SubVecTy->getVectorElementType()->getPointerTo(
- SI->getPointerAddressSpace()));
+ BaseAddr,
+ SubVecTy->getElementType()->getPointerTo(SI->getPointerAddressSpace()));
}
assert(isTypeLegal(EVT::getEVT(SubVecTy)) && "Illegal vstN vector type!");
@@ -17252,7 +18719,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
"expected interleave factor of 2 or 4 for MVE");
Intrinsic::ID StoreInts =
Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
- Type *EltPtrTy = SubVecTy->getVectorElementType()->getPointerTo(
+ Type *EltPtrTy = SubVecTy->getElementType()->getPointerTo(
SI->getPointerAddressSpace());
Type *Tys[] = {EltPtrTy, SubVecTy};
Function *VstNFunc =
@@ -17274,7 +18741,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// If we generating more than one store, we compute the base address of
// subsequent stores as an offset from the previous.
if (StoreCount > 0)
- BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getVectorElementType(),
+ BaseAddr = Builder.CreateConstGEP1_32(SubVecTy->getElementType(),
BaseAddr, LaneLen * Factor);
SmallVector<Value *, 4> Shuffles;
@@ -17284,7 +18751,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
unsigned IdxI = StoreCount * LaneLen * Factor + i;
if (Mask[IdxI] >= 0) {
Shuffles.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, Mask[IdxI], LaneLen, 0)));
+ Op0, Op1, createSequentialMask(Mask[IdxI], LaneLen, 0)));
} else {
unsigned StartMask = 0;
for (unsigned j = 1; j < LaneLen; j++) {
@@ -17301,7 +18768,7 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
// Note: StartMask cannot be negative, it's checked in
// isReInterleaveMask
Shuffles.push_back(Builder.CreateShuffleVector(
- Op0, Op1, createSequentialMask(Builder, StartMask, LaneLen, 0)));
+ Op0, Op1, createSequentialMask(StartMask, LaneLen, 0)));
}
}
@@ -17349,11 +18816,11 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
case HA_DOUBLE:
return false;
case HA_VECT64:
- return VT->getBitWidth() == 64;
+ return VT->getPrimitiveSizeInBits().getFixedSize() == 64;
case HA_VECT128:
- return VT->getBitWidth() == 128;
+ return VT->getPrimitiveSizeInBits().getFixedSize() == 128;
case HA_UNKNOWN:
- switch (VT->getBitWidth()) {
+ switch (VT->getPrimitiveSizeInBits().getFixedSize()) {
case 64:
Base = HA_VECT64;
return true;
@@ -17372,7 +18839,7 @@ static bool isHomogeneousAggregate(Type *Ty, HABaseType &Base,
/// Return the correct alignment for the current calling convention.
Align ARMTargetLowering::getABIAlignmentForCallingConv(Type *ArgTy,
DataLayout DL) const {
- const Align ABITypeAlign(DL.getABITypeAlignment(ArgTy));
+ const Align ABITypeAlign = DL.getABITypeAlign(ArgTy);
if (!ArgTy->isVectorTy())
return ABITypeAlign;
@@ -17399,18 +18866,18 @@ bool ARMTargetLowering::functionArgumentNeedsConsecutiveRegisters(
return IsHA || IsIntArray;
}
-unsigned ARMTargetLowering::getExceptionPointerRegister(
+Register ARMTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
// Platforms which do not use SjLj EH may return values in these registers
// via the personality function.
- return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R0;
+ return Subtarget->useSjLjEH() ? Register() : ARM::R0;
}
-unsigned ARMTargetLowering::getExceptionSelectorRegister(
+Register ARMTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Platforms which do not use SjLj EH may return values in these registers
// via the personality function.
- return Subtarget->useSjLjEH() ? ARM::NoRegister : ARM::R1;
+ return Subtarget->useSjLjEH() ? Register() : ARM::R1;
}
void ARMTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h
index 1baa22a4fa56..8b1f4183032e 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.h
+++ b/llvm/lib/Target/ARM/ARMISelLowering.h
@@ -68,10 +68,12 @@ class VectorType;
CALL, // Function call.
CALL_PRED, // Function call that's predicable.
CALL_NOLINK, // Function call with branch not branch-and-link.
+ tSECALL, // CMSE non-secure function call.
BRCOND, // Conditional branch.
BR_JT, // Jumptable branch.
BR2_JT, // Jumptable branch (2 level - jumptable entry is a jump).
RET_FLAG, // Return with a flag operand.
+ SERET_FLAG, // CMSE Entry function return with a flag operand.
INTRET_FLAG, // Interrupt return with an LR-offset and a flag operand.
PIC_ADD, // Add with a PC operand and a PIC label.
@@ -84,7 +86,9 @@ class VectorType;
CMN, // ARM CMN instructions.
CMPZ, // ARM compare that sets only Z flag.
CMPFP, // ARM VFP compare instruction, sets FPSCR.
+ CMPFPE, // ARM VFP signalling compare instruction, sets FPSCR.
CMPFPw0, // ARM VFP compare against zero instruction, sets FPSCR.
+ CMPFPEw0, // ARM VFP signalling compare against zero instruction, sets FPSCR.
FMSTAT, // ARM fmstat instruction.
CMOV, // ARM conditional move instructions.
@@ -131,6 +135,7 @@ class VectorType;
LE, // Low-overhead loops, Loop End
PREDICATE_CAST, // Predicate cast for MVE i1 types
+ VECTOR_REG_CAST, // Reinterpret the current contents of a vector register
VCMP, // Vector compare.
VCMPZ, // Vector compare to zero.
@@ -199,10 +204,36 @@ class VectorType;
VTBL2, // 2-register shuffle with mask
VMOVN, // MVE vmovn
+ // MVE Saturating truncates
+ VQMOVNs, // Vector (V) Saturating (Q) Move and Narrow (N), signed (s)
+ VQMOVNu, // Vector (V) Saturating (Q) Move and Narrow (N), unsigned (u)
+
+ // MVE float <> half converts
+ VCVTN, // MVE vcvt f32 -> f16, truncating into either the bottom or top lanes
+ VCVTL, // MVE vcvt f16 -> f32, extending from either the bottom or top lanes
+
// Vector multiply long:
VMULLs, // ...signed
VMULLu, // ...unsigned
+ // MVE reductions
+ VADDVs, // sign- or zero-extend the elements of a vector to i32,
+ VADDVu, // add them all together, and return an i32 of their sum
+ VADDLVs, // sign- or zero-extend elements to i64 and sum, returning
+ VADDLVu, // the low and high 32-bit halves of the sum
+ VADDLVAs, // same as VADDLV[su] but also add an input accumulator
+ VADDLVAu, // provided as low and high halves
+ VADDLVps, // same as VADDLVs but with a v4i1 predicate mask
+ VADDLVpu, // same as VADDLVu but with a v4i1 predicate mask
+ VADDLVAps, // same as VADDLVps but with a v4i1 predicate mask
+ VADDLVApu, // same as VADDLVpu but with a v4i1 predicate mask
+ VMLAVs,
+ VMLAVu,
+ VMLALVs,
+ VMLALVu,
+ VMLALVAs,
+ VMLALVAu,
+
SMULWB, // Signed multiply word by half word, bottom
SMULWT, // Signed multiply word by half word, top
UMLAL, // 64bit Unsigned Accumulate Multiply
@@ -335,8 +366,16 @@ class VectorType;
SDValue PerformCMOVCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformBRCONDCombine(SDNode *N, SelectionDAG &DAG) const;
SDValue PerformCMOVToBFICombine(SDNode *N, SelectionDAG &DAG) const;
+ SDValue PerformIntrinsicCombine(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
+ bool SimplifyDemandedBitsForTargetNode(SDValue Op,
+ const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts,
+ KnownBits &Known,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const override;
+
bool isDesirableToTransformToIntegerOp(unsigned Opc, EVT VT) const override;
/// allowsMisalignedMemoryAccesses - Returns true if the target allows
@@ -347,10 +386,7 @@ class VectorType;
MachineMemOperand::Flags Flags,
bool *Fast) const override;
- EVT getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
bool isTruncateFree(Type *SrcTy, Type *DstTy) const override;
@@ -358,6 +394,7 @@ class VectorType;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool shouldSinkOperands(Instruction *I,
SmallVectorImpl<Use *> &Ops) const override;
+ Type* shouldConvertSplatType(ShuffleVectorInst* SVI) const override;
bool isFNegFree(EVT VT) const override;
@@ -416,10 +453,10 @@ class VectorType;
const SelectionDAG &DAG,
unsigned Depth) const override;
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
-
bool ExpandInlineAsm(CallInst *CI) const override;
ConstraintType getConstraintType(StringRef Constraint) const override;
@@ -524,6 +561,12 @@ class VectorType;
bool isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
unsigned Index) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ // Using overflow ops for overflow checks only should be beneficial on ARM.
+ return TargetLowering::shouldFormOverflowOp(Opcode, VT, true);
+ }
+
/// Returns true if an argument of type Ty needs to be passed in a
/// contiguous block of registers in calling convention CallConv.
bool functionArgumentNeedsConsecutiveRegisters(
@@ -531,12 +574,12 @@ class VectorType;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
Instruction *makeDMB(IRBuilder<> &Builder, ARM_MB::MemBOpt Domain) const;
@@ -608,7 +651,7 @@ class VectorType;
/// Returns true if \p VecTy is a legal interleaved access type. This
/// function checks the vector element type and the overall width of the
/// vector.
- bool isLegalInterleavedAccessType(unsigned Factor, VectorType *VecTy,
+ bool isLegalInterleavedAccessType(unsigned Factor, FixedVectorType *VecTy,
const DataLayout &DL) const;
bool alignLoopsWithOptSize() const override;
@@ -725,6 +768,8 @@ class VectorType;
SDValue LowerDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed) const;
void ExpandDIV_Windows(SDValue Op, SelectionDAG &DAG, bool Signed,
SmallVectorImpl<SDValue> &Results) const;
+ SDValue ExpandBITCAST(SDNode *N, SelectionDAG &DAG,
+ const ARMSubtarget *Subtarget) const;
SDValue LowerWindowsDIVLibCall(SDValue Op, SelectionDAG &DAG, bool Signed,
SDValue &Chain) const;
SDValue LowerREM(SDNode *N, SelectionDAG &DAG) const;
@@ -733,6 +778,7 @@ class VectorType;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerFSETCC(SDValue Op, SelectionDAG &DAG) const;
void lowerABS(SDNode *N, SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const;
void LowerLOAD(SDNode *N, SmallVectorImpl<SDValue> &Results,
@@ -747,6 +793,11 @@ class VectorType;
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
+ SDValue MoveToHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT, MVT ValVT,
+ SDValue Val) const;
+ SDValue MoveFromHPR(const SDLoc &dl, SelectionDAG &DAG, MVT LocVT,
+ MVT ValVT, SDValue Val) const;
+
SDValue ReconstructShuffle(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerCallResult(SDValue Chain, SDValue InFlag,
@@ -766,6 +817,17 @@ class VectorType;
MachineBasicBlock *Entry,
const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ bool
+ splitValueIntoRegisterParts(SelectionDAG &DAG, const SDLoc &DL, SDValue Val,
+ SDValue *Parts, unsigned NumParts, MVT PartVT,
+ Optional<CallingConv::ID> CC) const override;
+
+ SDValue
+ joinRegisterPartsIntoValue(SelectionDAG &DAG, const SDLoc &DL,
+ const SDValue *Parts, unsigned NumParts,
+ MVT PartVT, EVT ValueVT,
+ Optional<CallingConv::ID> CC) const override;
+
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins,
@@ -786,7 +848,7 @@ class VectorType;
SmallVectorImpl<SDValue> &InVals) const override;
/// HandleByVal - Target-specific cleanup for ByVal support.
- void HandleByVal(CCState *, unsigned &, unsigned) const override;
+ void HandleByVal(CCState *, unsigned &, Align) const override;
/// IsEligibleForTailCallOptimization - Check whether the call is eligible
/// for tail call optimization. Targets which want to do tail call
@@ -823,7 +885,7 @@ class VectorType;
SDValue getARMCmp(SDValue LHS, SDValue RHS, ISD::CondCode CC,
SDValue &ARMcc, SelectionDAG &DAG, const SDLoc &dl) const;
SDValue getVFPCmp(SDValue LHS, SDValue RHS, SelectionDAG &DAG,
- const SDLoc &dl) const;
+ const SDLoc &dl, bool Signaling = false) const;
SDValue duplicateCmp(SDValue Cmp, SelectionDAG &DAG) const;
SDValue OptimizeVFPBrcond(SDValue Op, SelectionDAG &DAG) const;
diff --git a/llvm/lib/Target/ARM/ARMInstrCDE.td b/llvm/lib/Target/ARM/ARMInstrCDE.td
new file mode 100644
index 000000000000..0e97668e2e01
--- /dev/null
+++ b/llvm/lib/Target/ARM/ARMInstrCDE.td
@@ -0,0 +1,666 @@
+//===-- ARMInstrCDE.td - CDE support for ARM ---------------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the Arm CDE (Custom Datapath Extension) instruction set.
+//
+//===----------------------------------------------------------------------===//
+
+// Immediate operand of arbitrary bit width
+class BitWidthImmOperand<int width>
+ : ImmAsmOperand<0, !add(!shl(1, width), -1)> {
+ let Name = "Imm"#width#"b";
+}
+
+class BitWidthImm<int width>
+ : Operand<i32>,
+ ImmLeaf<i32, "{ return Imm >= 0 && Imm < (1 << "#width#"); }"> {
+ let ParserMatchClass = BitWidthImmOperand<width>;
+}
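
The TableGen expression !add(!shl(1, width), -1) in BitWidthImmOperand is just (1 << width) - 1, so each imm_Nb operand below accepts the range [0, 2^N). A trivial check of the widest case (illustrative only, not part of this patch):

    // imm_13b, the widest CDE immediate below, accepts 0..8191.
    static_assert((1 << 13) - 1 == 8191, "13-bit immediate upper bound");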
+
+def CDEDualRegOp : RegisterOperand<GPRPairnosp, "printGPRPairOperand">;
+
+// Used by VCX3 FP
+def imm_3b : BitWidthImm<3>;
+
+// Used by VCX3 vector
+def imm_4b : BitWidthImm<4>;
+
+// Used by VCX2 FP and CX3
+def imm_6b : BitWidthImm<6>;
+
+// Used by VCX2 vector
+def imm_7b : BitWidthImm<7>;
+
+// Used by CX2
+def imm_9b : BitWidthImm<9>;
+
+// Used by VCX1 FP
+def imm_11b : BitWidthImm<11>;
+
+// Used by VCX1 vector
+def imm_12b : BitWidthImm<12>;
+
+// Used by CX1
+def imm_13b : BitWidthImm<13>;
+
+// Base class for all CDE instructions
+class CDE_Instr<bit acc, dag oops, dag iops, string asm, string cstr>
+ : Thumb2XI<oops, !con((ins p_imm:$coproc), iops),
+ AddrModeNone, /*sz=*/4, NoItinerary,
+ asm, cstr, /*pattern=*/[]>,
+ Sched<[]> {
+ bits<3> coproc;
+
+ let Inst{31-29} = 0b111; // 15:13
+ let Inst{28} = acc;
+ let Inst{27-26} = 0b11;
+ let Inst{11} = 0b0;
+ let Inst{10-8} = coproc{2-0};
+
+ let isPredicable = 0;
+ let DecoderNamespace = "Thumb2CDE";
+}
+
+// Base class for CX* CDE instructions
+class CDE_GPR_Instr<bit dual, bit acc, dag oops, dag iops,
+ string asm, string cstr>
+ : CDE_Instr<acc, oops, iops, asm, cstr>,
+ Requires<[HasCDE]> {
+
+ let Inst{25-24} = 0b10;
+ let Inst{6} = dual;
+ let isPredicable = acc;
+}
+
+// Set of registers used by the CDE instructions.
+class CDE_RegisterOperands {
+ dag Rd;
+ dag Rd_src;
+ dag Rn;
+ dag Rm;
+}
+
+// CX* CDE instruction parameter set
+class CX_Params {
+ dag Oops; // Output operands for CX* instructions
+ dag Iops1; // Input operands for CX1* instructions
+ dag Iops2; // Input operands for CX2* instructions
+ dag Iops3; // Input operands for CX3* instructions
+ dag PredOp; // Input predicate operand
+ string PAsm; // Predicate assembly string
+ string Cstr; // asm constraint string
+ bit Dual; // "dual" field for encoding
+ bit Acc; // "acc" field for encoding
+}
+
+// VCX* CDE instruction parameter set
+class VCX_Params {
+ dag Oops; // Output operands for VCX* instructions
+ dag Iops1; // Input operands for VCX1* instructions
+ dag Iops2; // Input operands for VCX2* instructions
+ dag Iops3; // Input operands for VCX3* instructions
+ string Cstr; // asm constraint string
+ bit Acc; // "acc" field for encoding
+ vpred_ops Vpred; // Predication type for VCX* vector instructions
+}
+
+// CX1, CX1A, CX1D, CX1DA
+class CDE_CX1_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops1, (ins imm_13b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $imm"),
+ params.Cstr> {
+ bits<13> imm;
+ bits<4> Rd;
+
+ let Inst{23-22} = 0b00;
+ let Inst{21-16} = imm{12-7};
+ let Inst{15-12} = Rd{3-0};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// CX2, CX2A, CX2D, CX2DA
+class CDE_CX2_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops2, (ins imm_9b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $imm"),
+ params.Cstr> {
+ bits<9> imm;
+ bits<4> Rd;
+ bits<4> Rn;
+
+ let Inst{23-22} = 0b01;
+ let Inst{21-20} = imm{8-7};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = Rd{3-0};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// CX3, CX3A, CX3D, CX3DA
+class CDE_CX3_Instr<string iname, CX_Params params>
+ : CDE_GPR_Instr<params.Dual, params.Acc, params.Oops,
+ !con(params.Iops3, (ins imm_6b:$imm), params.PredOp),
+ !strconcat(iname, params.PAsm, "\t$coproc, $Rd, $Rn, $Rm, $imm"),
+ params.Cstr> {
+ bits<6> imm;
+ bits<4> Rd;
+ bits<4> Rn;
+ bits<4> Rm;
+
+ let Inst{23} = 0b1;
+ let Inst{22-20} = imm{5-3};
+ let Inst{19-16} = Rn{3-0};
+ let Inst{15-12} = Rm{3-0};
+ let Inst{7} = imm{2};
+ let Inst{5-4} = imm{1-0};
+ let Inst{3-0} = Rd{3-0};
+}
+
+// Registers for single-register variants of CX* instructions
+def cde_cx_single_regs : CDE_RegisterOperands {
+ let Rd = (outs GPRwithAPSR_NZCVnosp:$Rd);
+ let Rd_src = (ins GPRwithAPSR_NZCVnosp:$Rd_src);
+ let Rn = (ins GPRwithAPSR_NZCVnosp:$Rn);
+ let Rm = (ins GPRwithAPSR_NZCVnosp:$Rm);
+}
+
+// Registers for dual-register variants of CX* instructions
+def cde_cx_dual_regs : CDE_RegisterOperands {
+ let Rd = (outs CDEDualRegOp:$Rd);
+ let Rd_src = (ins CDEDualRegOp:$Rd_src);
+ let Rn = (ins GPRwithAPSR_NZCVnosp:$Rn);
+ let Rm = (ins GPRwithAPSR_NZCVnosp:$Rm);
+}
+
+class CDE_CX_ParamsTemplate<bit dual, bit acc, CDE_RegisterOperands ops>
+ : CX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rn);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let PredOp = !if(acc, (ins pred:$p), (ins));
+ let PAsm = !if(acc, "${p}", "");
+ let Cstr = !if(acc, "$Rd = $Rd_src", "");
+ let Dual = dual;
+ let Acc = acc;
+}
+
+def cde_cx_params_single_noacc : CDE_CX_ParamsTemplate<0b0, 0b0, cde_cx_single_regs>;
+def cde_cx_params_single_acc : CDE_CX_ParamsTemplate<0b0, 0b1, cde_cx_single_regs>;
+def cde_cx_params_dual_noacc : CDE_CX_ParamsTemplate<0b1, 0b0, cde_cx_dual_regs>;
+def cde_cx_params_dual_acc : CDE_CX_ParamsTemplate<0b1, 0b1, cde_cx_dual_regs>;
+
+def CDE_CX1 : CDE_CX1_Instr<"cx1", cde_cx_params_single_noacc>;
+def CDE_CX1A : CDE_CX1_Instr<"cx1a", cde_cx_params_single_acc>;
+def CDE_CX1D : CDE_CX1_Instr<"cx1d", cde_cx_params_dual_noacc>;
+def CDE_CX1DA : CDE_CX1_Instr<"cx1da", cde_cx_params_dual_acc>;
+
+def CDE_CX2 : CDE_CX2_Instr<"cx2", cde_cx_params_single_noacc>;
+def CDE_CX2A : CDE_CX2_Instr<"cx2a", cde_cx_params_single_acc>;
+def CDE_CX2D : CDE_CX2_Instr<"cx2d", cde_cx_params_dual_noacc>;
+def CDE_CX2DA : CDE_CX2_Instr<"cx2da", cde_cx_params_dual_acc>;
+
+def CDE_CX3 : CDE_CX3_Instr<"cx3", cde_cx_params_single_noacc>;
+def CDE_CX3A : CDE_CX3_Instr<"cx3a", cde_cx_params_single_acc>;
+def CDE_CX3D : CDE_CX3_Instr<"cx3d", cde_cx_params_dual_noacc>;
+def CDE_CX3DA : CDE_CX3_Instr<"cx3da", cde_cx_params_dual_acc>;
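For orientation, here is a hand-flattened sketch of the accumulating single-register variant CDE_CX1A, derived from the classes and parameter templates above. It is an illustration only, not part of the patch.

// Approximate flattened form of CDE_CX1A (derived by hand from
// CDE_CX1_Instr, CDE_GPR_Instr and cde_cx_params_single_acc):
//   Outputs:    (outs GPRwithAPSR_NZCVnosp:$Rd)
//   Inputs:     (ins p_imm:$coproc, GPRwithAPSR_NZCVnosp:$Rd_src,
//                    imm_13b:$imm, pred:$p)
//   Asm string: "cx1a${p}\t$coproc, $Rd, $imm"
//   Constraint: "$Rd = $Rd_src"  (the destination doubles as the accumulator,
//                                 which is why only accumulating forms are
//                                 predicable)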
+
+let Predicates = [HasCDE] in {
+ def : Pat<(i32 (int_arm_cde_cx1 timm:$coproc, timm:$imm)),
+ (i32 (CDE_CX1 p_imm:$coproc, imm_13b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx1a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ timm:$imm)),
+ (i32 (CDE_CX1A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ imm_13b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx2 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ timm:$imm)),
+ (i32 (CDE_CX2 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ imm_9b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx2a timm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n, timm:$imm)),
+ (i32 (CDE_CX2A p_imm:$coproc, GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n, imm_9b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx3 timm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+ (i32 (CDE_CX3 p_imm:$coproc, GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+ def : Pat<(i32 (int_arm_cde_cx3a timm:$coproc,
+ GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, timm:$imm)),
+ (i32 (CDE_CX3A p_imm:$coproc,
+ GPRwithAPSR_NZCVnosp:$acc,
+ GPRwithAPSR_NZCVnosp:$n,
+ GPRwithAPSR_NZCVnosp:$m, imm_6b:$imm))>;
+}
+
+class CDE_RequiresSReg : Requires<[HasCDE, HasFPRegs]>;
+class CDE_RequiresDReg : Requires<[HasCDE, HasFPRegs]>;
+class CDE_RequiresQReg : Requires<[HasCDE, HasMVEInt]>;
+
+// Base class for CDE VCX* instructions
+class CDE_FP_Vec_Instr<bit vec, bit acc, dag oops, dag iops, string asm, string cstr>
+ : CDE_Instr<acc, oops, iops, asm, cstr> {
+ let Inst{25} = 0b0;
+ let Inst{6} = vec;
+}
+
+// Base class for floating-point variants of CDE VCX* instructions
+class CDE_FP_Instr<bit acc, bit sz, dag oops, dag iops, string asm, string cstr>
+ : CDE_FP_Vec_Instr<0b0, acc, oops, iops, asm, cstr> {
+ let Inst{24} = sz;
+}
+
+// Base class for vector variants of CDE VCX* instructions
+class CDE_Vec_Instr<bit acc, dag oops, dag iops, string asm, string cstr,
+ vpred_ops vpred>
+ : CDE_FP_Vec_Instr<0b1, acc, oops,
+ !con(iops, (ins vpred:$vp)), asm,
+ !strconcat(cstr, vpred.vpred_constraint)>,
+ CDE_RequiresQReg {
+}
+
+
+// VCX1/VCX1A, vector variant
+class CDE_VCX1_Vec_Instr<string iname, VCX_Params params>
+ : CDE_Vec_Instr<params.Acc, params.Oops,
+ !con(params.Iops1, (ins imm_12b:$imm)),
+ iname#"${vp}\t$coproc, $Qd, $imm", params.Cstr, params.Vpred> {
+ bits<12> imm;
+ bits<3> Qd;
+
+ let Inst{24} = imm{11};
+ let Inst{23} = 0b0;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = imm{10-7};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+
+ let Unpredictable{22} = 0b1;
+}
+
+// VCX1/VCX1A, base class for FP variants
+class CDE_VCX1_FP_Instr<bit sz, string iname, VCX_Params params>
+ : CDE_FP_Instr<params.Acc, sz, params.Oops,
+ !con(params.Iops1, (ins imm_11b:$imm)),
+ iname#"\t$coproc, $Vd, $imm", params.Cstr> {
+ bits<11> imm;
+
+ let Inst{23} = 0b0;
+ let Inst{21-20} = 0b10;
+ let Inst{19-16} = imm{10-7};
+ let Inst{7} = imm{6};
+ let Inst{5-0} = imm{5-0};
+}
+
+// VCX1/VCX1A, S registers
+class CDE_VCX1_FP_Instr_S<string iname, VCX_Params params>
+ : CDE_VCX1_FP_Instr<0b0, iname, params>,
+ CDE_RequiresSReg {
+ bits<5> Vd;
+
+ let Inst{22} = Vd{0};
+ let Inst{15-12} = Vd{4-1};
+}
+
+// VCX1/VCX1A, D registers
+class CDE_VCX1_FP_Instr_D<string iname, VCX_Params params>
+ : CDE_VCX1_FP_Instr<0b1, iname, params>,
+ CDE_RequiresDReg {
+ bits<5> Vd;
+
+ let Inst{22} = Vd{4};
+ let Inst{15-12} = Vd{3-0};
+}
+
+// VCX2/VCX2A, vector variant
+class CDE_VCX2_Vec_Instr<string iname, VCX_Params params>
+ : CDE_Vec_Instr<params.Acc, params.Oops,
+ !con(params.Iops2, (ins imm_7b:$imm)),
+ iname#"${vp}\t$coproc, $Qd, $Qm, $imm", params.Cstr,
+ params.Vpred> {
+ bits<7> imm;
+ bits<3> Qd;
+ bits<3> Qm;
+
+ let Inst{24} = imm{6};
+ let Inst{23} = 0b0;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{5-2};
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{7} = imm{1};
+ let Inst{5} = 0b0;
+ let Inst{4} = imm{0};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+
+ let Unpredictable{22} = 0b1;
+ let Unpredictable{5} = 0b1;
+}
+
+// VCX2/VCX2A, base class for FP variants
+class CDE_VCX2_FP_Instr<bit sz, string iname, VCX_Params params>
+ : CDE_FP_Instr<params.Acc, sz, params.Oops,
+ !con(params.Iops2, (ins imm_6b:$imm)),
+ iname#"\t$coproc, $Vd, $Vm, $imm", params.Cstr> {
+ bits<6> imm;
+
+ let Inst{23} = 0b0;
+ let Inst{21-20} = 0b11;
+ let Inst{19-16} = imm{5-2};
+ let Inst{7} = imm{1};
+ let Inst{4} = imm{0};
+}
+
+// VCX2/VCX2A, S registers
+class CDE_VCX2_FP_Instr_S<string iname, VCX_Params params>
+ : CDE_VCX2_FP_Instr<0b0, iname, params>,
+ CDE_RequiresSReg {
+ bits<5> Vd;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{4-1};
+ let Inst{22} = Vd{0};
+ let Inst{3-0} = Vm{4-1};
+ let Inst{5} = Vm{0};
+}
+
+// VCX2/VCX2A, D registers
+class CDE_VCX2_FP_Instr_D<string iname, VCX_Params params>
+ : CDE_VCX2_FP_Instr<0b1, iname, params>,
+ CDE_RequiresDReg {
+ bits<5> Vd;
+ bits<5> Vm;
+
+ let Inst{15-12} = Vd{3-0};
+ let Inst{22} = Vd{4};
+ let Inst{3-0} = Vm{3-0};
+ let Inst{5} = Vm{4};
+}
+
+// VCX3/VCX3A, vector variant
+class CDE_VCX3_Vec_Instr<string iname, VCX_Params params>
+ : CDE_Vec_Instr<params.Acc, params.Oops,
+ !con(params.Iops3, (ins imm_4b:$imm)),
+ iname#"${vp}\t$coproc, $Qd, $Qn, $Qm, $imm", params.Cstr,
+ params.Vpred> {
+ bits<4> imm;
+ bits<3> Qd;
+ bits<3> Qm;
+ bits<3> Qn;
+
+ let Inst{24} = imm{3};
+ let Inst{23} = 0b1;
+ let Inst{22} = 0b0;
+ let Inst{21-20} = imm{2-1};
+ let Inst{19-17} = Qn{2-0};
+ let Inst{16} = 0b0;
+ let Inst{15-13} = Qd{2-0};
+ let Inst{12} = 0b0;
+ let Inst{7} = 0b0;
+ let Inst{5} = 0b0;
+ let Inst{4} = imm{0};
+ let Inst{3-1} = Qm{2-0};
+ let Inst{0} = 0b0;
+
+ let Unpredictable{22} = 0b1;
+ let Unpredictable{7} = 0b1;
+ let Unpredictable{5} = 0b1;
+}
+
+// VCX3/VCX3A, base class for FP variants
+class CDE_VCX3_FP_Instr<bit sz, string iname, VCX_Params params>
+ : CDE_FP_Instr<params.Acc, sz, params.Oops,
+ !con(params.Iops3, (ins imm_3b:$imm)),
+ iname#"\t$coproc, $Vd, $Vn, $Vm, $imm", params.Cstr> {
+ bits<3> imm;
+
+ let Inst{23} = 0b1;
+ let Inst{21-20} = imm{2-1};
+ let Inst{4} = imm{0};
+}
+
+// VCX3/VCX3A, S registers
+class CDE_VCX3_FP_Instr_S<string iname, VCX_Params params>
+ : CDE_VCX3_FP_Instr<0b0, iname, params>,
+ CDE_RequiresSReg {
+ bits<5> Vd;
+ bits<5> Vm;
+ bits<5> Vn;
+
+ let Inst{22} = Vd{0};
+ let Inst{19-16} = Vn{4-1};
+ let Inst{15-12} = Vd{4-1};
+ let Inst{7} = Vn{0};
+ let Inst{5} = Vm{0};
+ let Inst{3-0} = Vm{4-1};
+}
+
+// VCX3/VCX3A, D registers
+class CDE_VCX3_FP_Instr_D<string iname, VCX_Params params>
+ : CDE_VCX3_FP_Instr<0b1, iname, params>,
+ CDE_RequiresDReg {
+ bits<5> Vd;
+ bits<5> Vm;
+ bits<5> Vn;
+
+ let Inst{22} = Vd{4};
+ let Inst{19-16} = Vn{3-0};
+ let Inst{15-12} = Vd{3-0};
+ let Inst{7} = Vn{4};
+ let Inst{5} = Vm{4};
+ let Inst{3-0} = Vm{3-0};
+}
+
+// Register operands for VCX* instructions
+class CDE_VCX_RegisterOperandsTemplate<RegisterClass regclass>
+ : CDE_RegisterOperands {
+ let Rd = (outs regclass:$Vd);
+ let Rd_src = (ins regclass:$Vd_src);
+ let Rn = (ins regclass:$Vn);
+ let Rm = (ins regclass:$Vm);
+}
+
+class CDE_VCXQ_RegisterOperandsTemplate<RegisterClass regclass>
+ : CDE_RegisterOperands {
+ let Rd = (outs regclass:$Qd);
+ let Rd_src = (ins regclass:$Qd_src);
+ let Rn = (ins regclass:$Qn);
+ let Rm = (ins regclass:$Qm);
+}
+
+def cde_vcx_s_regs : CDE_VCX_RegisterOperandsTemplate<SPR>;
+def cde_vcx_d_regs : CDE_VCX_RegisterOperandsTemplate<DPR_VFP2>;
+def cde_vcx_q_regs : CDE_VCXQ_RegisterOperandsTemplate<MQPR>;
+
+class CDE_VCX_ParamsTemplate<bit acc, CDE_RegisterOperands ops>
+ : VCX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rm);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let Cstr = !if(acc, "$Vd = $Vd_src", "");
+ let Acc = acc;
+}
+
+class CDE_VCXQ_ParamsTemplate<bit acc, CDE_RegisterOperands ops>
+ : VCX_Params {
+
+ dag IOpsPrefix = !if(acc, ops.Rd_src, (ins));
+
+ let Oops = ops.Rd;
+ let Iops1 = IOpsPrefix;
+ let Iops2 = !con(IOpsPrefix, ops.Rm);
+ let Iops3 = !con(IOpsPrefix, ops.Rn, ops.Rm);
+ let Cstr = !if(acc, "$Qd = $Qd_src", "");
+ let Acc = acc;
+ let Vpred = !if(acc, vpred_n, vpred_r);
+}
+
+def cde_vcx_params_s_noacc : CDE_VCX_ParamsTemplate<0b0, cde_vcx_s_regs>;
+def cde_vcx_params_s_acc : CDE_VCX_ParamsTemplate<0b1, cde_vcx_s_regs>;
+def cde_vcx_params_d_noacc : CDE_VCX_ParamsTemplate<0b0, cde_vcx_d_regs>;
+def cde_vcx_params_d_acc : CDE_VCX_ParamsTemplate<0b1, cde_vcx_d_regs>;
+def cde_vcx_params_q_noacc : CDE_VCXQ_ParamsTemplate<0b0, cde_vcx_q_regs>;
+def cde_vcx_params_q_acc : CDE_VCXQ_ParamsTemplate<0b1, cde_vcx_q_regs>;
+
+def CDE_VCX1_fpsp : CDE_VCX1_FP_Instr_S<"vcx1", cde_vcx_params_s_noacc>;
+def CDE_VCX1A_fpsp : CDE_VCX1_FP_Instr_S<"vcx1a", cde_vcx_params_s_acc>;
+def CDE_VCX1_fpdp : CDE_VCX1_FP_Instr_D<"vcx1", cde_vcx_params_d_noacc>;
+def CDE_VCX1A_fpdp : CDE_VCX1_FP_Instr_D<"vcx1a", cde_vcx_params_d_acc>;
+def CDE_VCX1_vec : CDE_VCX1_Vec_Instr<"vcx1", cde_vcx_params_q_noacc>;
+def CDE_VCX1A_vec : CDE_VCX1_Vec_Instr<"vcx1a", cde_vcx_params_q_acc>;
+
+def CDE_VCX2_fpsp : CDE_VCX2_FP_Instr_S<"vcx2", cde_vcx_params_s_noacc>;
+def CDE_VCX2A_fpsp : CDE_VCX2_FP_Instr_S<"vcx2a", cde_vcx_params_s_acc>;
+def CDE_VCX2_fpdp : CDE_VCX2_FP_Instr_D<"vcx2", cde_vcx_params_d_noacc>;
+def CDE_VCX2A_fpdp : CDE_VCX2_FP_Instr_D<"vcx2a", cde_vcx_params_d_acc>;
+def CDE_VCX2_vec : CDE_VCX2_Vec_Instr<"vcx2", cde_vcx_params_q_noacc>;
+def CDE_VCX2A_vec : CDE_VCX2_Vec_Instr<"vcx2a", cde_vcx_params_q_acc>;
+
+def CDE_VCX3_fpsp : CDE_VCX3_FP_Instr_S<"vcx3", cde_vcx_params_s_noacc>;
+def CDE_VCX3A_fpsp : CDE_VCX3_FP_Instr_S<"vcx3a", cde_vcx_params_s_acc>;
+def CDE_VCX3_fpdp : CDE_VCX3_FP_Instr_D<"vcx3", cde_vcx_params_d_noacc>;
+def CDE_VCX3A_fpdp : CDE_VCX3_FP_Instr_D<"vcx3a", cde_vcx_params_d_acc>;
+def CDE_VCX3_vec : CDE_VCX3_Vec_Instr<"vcx3", cde_vcx_params_q_noacc>;
+def CDE_VCX3A_vec : CDE_VCX3_Vec_Instr<"vcx3a", cde_vcx_params_q_acc>;
+
+
+let Predicates = [HasCDE, HasFPRegs] in {
+ def : Pat<(f32 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)),
+ (f32 (CDE_VCX1_fpsp p_imm:$coproc, imm_11b:$imm))>;
+ def : Pat<(f32 (int_arm_cde_vcx1a timm:$coproc, (f32 SPR:$acc), timm:$imm)),
+ (f32 (CDE_VCX1A_fpsp p_imm:$coproc, SPR:$acc, imm_11b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx1 timm:$coproc, timm:$imm)),
+ (f64 (CDE_VCX1_fpdp p_imm:$coproc, imm_11b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx1a timm:$coproc, (f64 DPR:$acc), timm:$imm)),
+ (f64 (CDE_VCX1A_fpdp p_imm:$coproc, DPR:$acc, imm_11b:$imm))>;
+
+ def : Pat<(f32 (int_arm_cde_vcx2 timm:$coproc, (f32 SPR:$n), timm:$imm)),
+ (f32 (CDE_VCX2_fpsp p_imm:$coproc, SPR:$n, imm_6b:$imm))>;
+ def : Pat<(f32 (int_arm_cde_vcx2a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n),
+ timm:$imm)),
+ (f32 (CDE_VCX2A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, imm_6b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx2 timm:$coproc, (f64 DPR:$n), timm:$imm)),
+ (f64 (CDE_VCX2_fpdp p_imm:$coproc, DPR:$n, imm_6b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx2a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n),
+ timm:$imm)),
+ (f64 (CDE_VCX2A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, imm_6b:$imm))>;
+
+ def : Pat<(f32 (int_arm_cde_vcx3 timm:$coproc, (f32 SPR:$n), (f32 SPR:$m),
+ timm:$imm)),
+ (f32 (CDE_VCX3_fpsp p_imm:$coproc, (f32 SPR:$n), (f32 SPR:$m),
+ imm_3b:$imm))>;
+ def : Pat<(f32 (int_arm_cde_vcx3a timm:$coproc, (f32 SPR:$acc), (f32 SPR:$n),
+ (f32 SPR:$m), timm:$imm)),
+ (f32 (CDE_VCX3A_fpsp p_imm:$coproc, SPR:$acc, SPR:$n, SPR:$m,
+ imm_3b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx3 timm:$coproc, (f64 DPR:$n), (f64 DPR:$m),
+ timm:$imm)),
+ (f64 (CDE_VCX3_fpdp p_imm:$coproc, DPR:$n, DPR:$m, imm_3b:$imm))>;
+ def : Pat<(f64 (int_arm_cde_vcx3a timm:$coproc, (f64 DPR:$acc), (f64 DPR:$n),
+ (f64 DPR:$m), timm:$imm)),
+ (f64 (CDE_VCX3A_fpdp p_imm:$coproc, DPR:$acc, DPR:$n, DPR:$m,
+ imm_3b:$imm))>;
+}
+
+let Predicates = [HasCDE, HasMVEInt] in {
+ def : Pat<(v16i8 (int_arm_cde_vcx1q timm:$coproc, timm:$imm)),
+ (v16i8 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx1qa timm:$coproc, (v16i8 MQPR:$acc),
+ timm:$imm)),
+ (v16i8 (CDE_VCX1A_vec p_imm:$coproc, MQPR:$acc, imm_12b:$imm))>;
+
+ def : Pat<(v16i8 (int_arm_cde_vcx2q timm:$coproc, (v16i8 MQPR:$n), timm:$imm)),
+ (v16i8 (CDE_VCX2_vec p_imm:$coproc, MQPR:$n, imm_7b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx2qa timm:$coproc, (v16i8 MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm)),
+ (v16i8 (CDE_VCX2A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n,
+ imm_7b:$imm))>;
+
+ def : Pat<(v16i8 (int_arm_cde_vcx3q timm:$coproc, (v16i8 MQPR:$n),
+ (v16i8 MQPR:$m), timm:$imm)),
+ (v16i8 (CDE_VCX3_vec p_imm:$coproc, MQPR:$n, MQPR:$m,
+ imm_4b:$imm))>;
+ def : Pat<(v16i8 (int_arm_cde_vcx3qa timm:$coproc, (v16i8 MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ timm:$imm)),
+ (v16i8 (CDE_VCX3A_vec p_imm:$coproc, MQPR:$acc, MQPR:$n, MQPR:$m,
+ imm_4b:$imm))>;
+}
+
+multiclass VCXPredicatedPat_m<MVEVectorVTInfo VTI> {
+ def : Pat<(VTI.Vec (int_arm_cde_vcx1q_predicated timm:$coproc,
+ (VTI.Vec MQPR:$inactive), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive)))>;
+ def : Pat<(VTI.Vec (int_arm_cde_vcx1qa_predicated timm:$coproc,
+ (VTI.Vec MQPR:$acc), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX1A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
+ imm_12b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+
+ def : Pat<(VTI.Vec (int_arm_cde_vcx2q_predicated timm:$coproc,
+ (VTI.Vec MQPR:$inactive),
+ (v16i8 MQPR:$n), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX2_vec p_imm:$coproc, (v16i8 MQPR:$n),
+ imm_7b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive)))>;
+ def : Pat<(VTI.Vec (int_arm_cde_vcx2qa_predicated timm:$coproc,
+ (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX2A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), timm:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+
+ def : Pat<(VTI.Vec (int_arm_cde_vcx3q_predicated timm:$coproc,
+ (VTI.Vec MQPR:$inactive),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX3_vec p_imm:$coproc, (v16i8 MQPR:$n),
+ (v16i8 MQPR:$m),
+ imm_4b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive)))>;
+ def : Pat<(VTI.Vec (int_arm_cde_vcx3qa_predicated timm:$coproc,
+ (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m), timm:$imm,
+ (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (CDE_VCX3A_vec p_imm:$coproc, (VTI.Vec MQPR:$acc),
+ (v16i8 MQPR:$n), (v16i8 MQPR:$m),
+ imm_4b:$imm, ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+}
+
+let Predicates = [HasCDE, HasMVEInt] in
+ foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in
+ defm : VCXPredicatedPat_m<VTI>;
+
+let Predicates = [HasCDE, HasMVEFloat] in
+ foreach VTI = [ MVE_v8f16, MVE_v4f32 ] in
+ defm : VCXPredicatedPat_m<VTI>;
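As a reading aid (not part of the patch), the first rule of VCXPredicatedPat_m instantiated for MVE_v4i32 comes out roughly as below, assuming the usual field values Vec = v4i32 and Pred = v4i1:

// Hand-expanded sketch of the first pattern for VTI = MVE_v4i32.
def : Pat<(v4i32 (int_arm_cde_vcx1q_predicated timm:$coproc,
                                               (v4i32 MQPR:$inactive), timm:$imm,
                                               (v4i1 VCCR:$pred))),
          (v4i32 (CDE_VCX1_vec p_imm:$coproc, imm_12b:$imm, ARMVCCThen,
                               (v4i1 VCCR:$pred), (v4i32 MQPR:$inactive)))>;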
diff --git a/llvm/lib/Target/ARM/ARMInstrFormats.td b/llvm/lib/Target/ARM/ARMInstrFormats.td
index 1da32ad2af6c..e13f3437cc7b 100644
--- a/llvm/lib/Target/ARM/ARMInstrFormats.td
+++ b/llvm/lib/Target/ARM/ARMInstrFormats.td
@@ -205,7 +205,6 @@ def VPTPredROperand : AsmOperandClass {
let Name = "VPTPredR";
let PredicateMethod = "isVPTPred";
}
-def undef_tied_input;
// Operand classes for the cluster of MC operands describing a
// VPT-predicated MVE instruction.
@@ -409,6 +408,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
bit thumbArithFlagSetting = 0;
bit validForTailPredication = 0;
+ bit retainsPreviousHalfElement = 0;
+ bit horizontalReduction = 0;
+ bit doubleWidthResult = 0;
// If this is a pseudo instruction, mark it isCodeGenOnly.
let isCodeGenOnly = !eq(!cast<string>(f), "Pseudo");
@@ -422,6 +424,9 @@ class InstTemplate<AddrMode am, int sz, IndexMode im,
let TSFlags{18-15} = D.Value;
let TSFlags{19} = thumbArithFlagSetting;
let TSFlags{20} = validForTailPredication;
+ let TSFlags{21} = retainsPreviousHalfElement;
+ let TSFlags{22} = horizontalReduction;
+ let TSFlags{23} = doubleWidthResult;
let Constraints = cstr;
let Itinerary = itin;
@@ -1123,6 +1128,9 @@ class Thumb2DSPPat<dag pattern, dag result> : Pat<pattern, result> {
class Thumb2DSPMulPat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [IsThumb2, UseMulOps, HasDSP];
}
+class FPRegs16Pat<dag pattern, dag result> : Pat<pattern, result> {
+ list<Predicate> Predicates = [HasFPRegs16];
+}
class FP16Pat<dag pattern, dag result> : Pat<pattern, result> {
list<Predicate> Predicates = [HasFP16];
}
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.cpp b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
index a802d5a06f07..2790ac215f86 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.cpp
@@ -126,7 +126,7 @@ void ARMInstrInfo::expandLoadStackGuard(MachineBasicBlock::iterator MI) const {
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, 4);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 4, Align(4));
MIB.addMemOperand(MMO);
BuildMI(MBB, MI, DL, get(ARM::LDRi12), Reg)
.addReg(Reg, RegState::Kill)
diff --git a/llvm/lib/Target/ARM/ARMInstrInfo.td b/llvm/lib/Target/ARM/ARMInstrInfo.td
index ce67af6f1b49..da0a836c8f95 100644
--- a/llvm/lib/Target/ARM/ARMInstrInfo.td
+++ b/llvm/lib/Target/ARM/ARMInstrInfo.td
@@ -159,6 +159,8 @@ def ARMcall_nolink : SDNode<"ARMISD::CALL_NOLINK", SDT_ARMcall,
def ARMretflag : SDNode<"ARMISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+def ARMseretflag : SDNode<"ARMISD::SERET_FLAG", SDTNone,
+ [SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMintretflag : SDNode<"ARMISD::INTRET_FLAG", SDT_ARMcall,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
def ARMcmov : SDNode<"ARMISD::CMOV", SDT_ARMCMov,
@@ -264,7 +266,7 @@ def ARMvrev64 : SDNode<"ARMISD::VREV64", SDTARMVSHUF>;
def ARMvrev32 : SDNode<"ARMISD::VREV32", SDTARMVSHUF>;
def ARMvrev16 : SDNode<"ARMISD::VREV16", SDTARMVSHUF>;
-def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+def SDTARMVGETLN : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisVec<1>,
SDTCisVT<2, i32>]>;
def ARMvgetlaneu : SDNode<"ARMISD::VGETLANEu", SDTARMVGETLN>;
def ARMvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>;
@@ -274,6 +276,10 @@ def ARMvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>;
def ARMvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>;
def ARMvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>;
+def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVT<2, i32>]>;
+def ARMvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
+def ARMvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
def SDTARMVSHIMM : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisSameAs<0, 1>,
SDTCisVT<2, i32>]>;
@@ -285,6 +291,11 @@ def ARMvshruImm : SDNode<"ARMISD::VSHRuIMM", SDTARMVSHIMM>;
def ARMvshls : SDNode<"ARMISD::VSHLs", SDTARMVSH>;
def ARMvshlu : SDNode<"ARMISD::VSHLu", SDTARMVSH>;
+def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def ARMvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
+def ARMvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
+
def SDTARMVCMP : SDTypeProfile<1, 3, [SDTCisInt<0>, SDTCisSameAs<1, 2>,
SDTCisInt<3>]>;
def SDTARMVCMPZ : SDTypeProfile<1, 2, [SDTCisInt<2>]>;
@@ -296,6 +307,36 @@ def ARMWLS : SDNode<"ARMISD::WLS", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLE : SDNode<"ARMISD::LE", SDT_ARMLoLoop, [SDNPHasChain]>;
def ARMLoopDec : SDNode<"ARMISD::LOOP_DEC", SDTIntBinOp, [SDNPHasChain]>;
+// 'VECTOR_REG_CAST' is an operation that reinterprets the contents of a
+// vector register as a different vector type, without changing the contents of
+// the register. It differs from 'bitconvert' in that bitconvert reinterprets
+// the _memory_ storage format of the vector, whereas VECTOR_REG_CAST
+// reinterprets the _register_ format - and in big-endian, the memory and
+// register formats are different, so they are different operations.
+//
+// For example, 'VECTOR_REG_CAST' between v8i16 and v16i8 will map the LSB of
+// the zeroth i16 lane to the zeroth i8 lane, regardless of system endianness,
+// whereas 'bitconvert' will map it to the high byte in big-endian mode,
+// because that's what (MVE) VSTRH.16 followed by VLDRB.8 would do. So the
+// bitconvert would have to emit a VREV16.8 instruction, whereas the
+// VECTOR_REG_CAST emits no code at all if the vector is already in a register.
+def ARMVectorRegCastImpl : SDNode<"ARMISD::VECTOR_REG_CAST", SDTUnaryOp>;
+
+// In little-endian, VECTOR_REG_CAST is often turned into bitconvert during
+// lowering (because in that situation they're identical). So an isel pattern
+// that needs to match something that's _logically_ a VECTOR_REG_CAST must
+// _physically_ match a different node type depending on endianness.
+//
+// This 'PatFrags' instance is a centralized facility to make that easy. It
+// matches VECTOR_REG_CAST in either endianness, and also bitconvert in the
+// endianness where it's equivalent.
+def ARMVectorRegCast: PatFrags<
+ (ops node:$x), [(ARMVectorRegCastImpl node:$x), (bitconvert node:$x)], [{
+ // Reject a match against bitconvert (aka ISD::BITCAST) if big-endian
+ return !(CurDAG->getDataLayout().isBigEndian() &&
+ N->getOpcode() == ISD::BITCAST);
+ }]>;
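To illustrate what the PatFrags buys (a sketch, not taken from this patch): a single rule written against ARMVectorRegCast matches whichever node isel actually sees, so one register-reuse pattern covers both endiannesses. A hypothetical example:

// Illustrative only: reinterpret a v8i16 register as v16i8 without emitting
// any code, whether the node is VECTOR_REG_CAST or (little-endian) bitconvert.
def : Pat<(v16i8 (ARMVectorRegCast (v8i16 MQPR:$src))),
          (v16i8 MQPR:$src)>;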
+
//===----------------------------------------------------------------------===//
// ARM Flag Definitions.
@@ -402,6 +443,62 @@ def fsub_mlx : PatFrag<(ops node:$lhs, node:$rhs),(fsub node:$lhs, node:$rhs),[{
return hasNoVMLxHazardUse(N);
}]>;
+def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
+def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
+
+//===----------------------------------------------------------------------===//
+// NEON/MVE pattern fragments
+//
+
+// Extract D sub-registers of Q registers.
+def DSubReg_i8_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
+ MVT::i32);
+}]>;
+def DSubReg_i16_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
+ MVT::i32);
+}]>;
+def DSubReg_i32_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
+ MVT::i32);
+}]>;
+def DSubReg_f64_reg : SDNodeXForm<imm, [{
+ assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers.
+def SSubReg_f32_reg : SDNodeXForm<imm, [{
+ assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Extract S sub-registers of Q/D registers containing a given f16/bf16 lane.
+def SSubReg_f16_reg : SDNodeXForm<imm, [{
+ assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
+ return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
+ MVT::i32);
+}]>;
+
+// Translate lane numbers from Q registers to D subregs.
+def SubReg_i8_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i16_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
+}]>;
+def SubReg_i32_lane : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
+}]>;
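As a concrete illustration of how the two families of transforms combine (a sketch, not part of the patch): extracting lane 11 of a v16i8 Q register resolves the register transform to dsub_0 + 11/8 = dsub_1 and the lane transform to 11 & 7 = 3. A NEON-style lane-extract pattern would use them together roughly like this:

// Sketch in the style of ARMInstrNEON.td: select the D subregister that
// contains the requested lane, then extract within that D register.
def : Pat<(ARMvgetlaneu (v16i8 QPR:$src), imm:$lane),
          (VGETLNu8 (v8i8 (EXTRACT_SUBREG QPR:$src,
                                          (DSubReg_i8_reg imm:$lane))),
                    (SubReg_i8_lane imm:$lane))>;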
+
+
+
//===----------------------------------------------------------------------===//
// Operand Definitions.
//
@@ -2780,7 +2877,7 @@ multiclass AI2_ldridx<bit isByte, string opc,
}
let mayLoad = 1, hasSideEffects = 0 in {
-// FIXME: for LDR_PRE_REG etc. the itineray should be either IIC_iLoad_ru or
+// FIXME: for LDR_PRE_REG etc. the itinerary should be either IIC_iLoad_ru or
 // IIC_iLoad_siu depending on whether the offset register is shifted.
defm LDR : AI2_ldridx<0, "ldr", IIC_iLoad_iu, IIC_iLoad_ru>;
defm LDRB : AI2_ldridx<1, "ldrb", IIC_iLoad_bh_iu, IIC_iLoad_bh_ru>;
@@ -2947,6 +3044,9 @@ multiclass AI3ldrT<bits<4> op, string opc> {
let Inst{3-0} = Rm{3-0};
let DecoderMethod = "DecodeLDR";
}
+
+ def ii : ARMAsmPseudo<!strconcat(opc, "${p} $Rt, $addr"),
+ (ins addr_offset_none:$addr, pred:$p), (outs GPR:$Rt)>;
}
defm LDRSBT : AI3ldrT<0b1101, "ldrsbt">;
@@ -2992,11 +3092,6 @@ def STOREDUAL : ARMPseudoInst<(outs), (ins GPRPairOp:$Rt, addrmode3:$addr),
}
}
-let Predicates = [IsARM, HasV5TE] in {
-def : Pat<(ARMstrd GPR:$Rt, GPR:$Rt2, addrmode3:$addr),
- (STOREDUAL (REG_SEQUENCE GPRPair, GPR:$Rt, gsub_0, GPR:$Rt2, gsub_1), addrmode3:$addr)>;
-}
-
// Indexed stores
multiclass AI2_stridx<bit isByte, string opc,
InstrItinClass iii, InstrItinClass iir> {
@@ -3063,7 +3158,7 @@ multiclass AI2_stridx<bit isByte, string opc,
}
let mayStore = 1, hasSideEffects = 0 in {
-// FIXME: for STR_PRE_REG etc. the itineray should be either IIC_iStore_ru or
+// FIXME: for STR_PRE_REG etc. the itinerary should be either IIC_iStore_ru or
 // IIC_iStore_siu depending on whether the offset register is shifted.
defm STR : AI2_stridx<0, "str", IIC_iStore_iu, IIC_iStore_ru>;
defm STRB : AI2_stridx<1, "strb", IIC_iStore_bh_iu, IIC_iStore_bh_ru>;
@@ -3797,9 +3892,8 @@ def QSUB16 : AAIIntrinsic<0b01100010, 0b11110111, "qsub16", int_arm_qsub16>;
def QSUB8 : AAIIntrinsic<0b01100010, 0b11111111, "qsub8", int_arm_qsub8>;
def QDADD : AAIRevOpr<0b00010100, 0b00000101, "qdadd",
- [(set GPRnopc:$Rd, (int_arm_qadd (int_arm_qadd GPRnopc:$Rm,
- GPRnopc:$Rm),
- GPRnopc:$Rn))]>;
+ [(set GPRnopc:$Rd, (int_arm_qadd GPRnopc:$Rm,
+ (int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>;
def QDSUB : AAIRevOpr<0b00010110, 0b00000101, "qdsub",
[(set GPRnopc:$Rd, (int_arm_qsub GPRnopc:$Rm,
(int_arm_qadd GPRnopc:$Rn, GPRnopc:$Rn)))]>;
@@ -3814,7 +3908,7 @@ def : ARMV5TEPat<(saddsat GPR:$a, GPR:$b),
(QADD GPR:$a, GPR:$b)>;
def : ARMV5TEPat<(ssubsat GPR:$a, GPR:$b),
(QSUB GPR:$a, GPR:$b)>;
-def : ARMV5TEPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+def : ARMV5TEPat<(saddsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(QDADD rGPR:$Rm, rGPR:$Rn)>;
def : ARMV5TEPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(QDSUB rGPR:$Rm, rGPR:$Rn)>;
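The two pattern corrections above follow the architectural definition of the saturating double-and-add/subtract: it is the second source operand (Rn) that is doubled, not Rm. For reference (a note, not part of the patch), with sat32 denoting signed 32-bit saturation:

// QDADD: Rd = sat32(Rm + sat32(2 * Rn))
// QDSUB: Rd = sat32(Rm - sat32(2 * Rn))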
@@ -5441,7 +5535,8 @@ def : ARMInstAlias<"mcr${p} $cop, $opc1, $Rt, $CRn, $CRm",
def MRC : MovRCopro<"mrc", 1 /* from coprocessor to ARM core register */,
(outs GPRwithAPSR:$Rt),
(ins p_imm:$cop, imm0_7:$opc1, c_imm:$CRn, c_imm:$CRm,
- imm0_7:$opc2), []>;
+ imm0_7:$opc2), []>,
+ ComplexDeprecationPredicate<"MRC">;
def : ARMInstAlias<"mrc${p} $cop, $opc1, $Rt, $CRn, $CRm",
(MRC GPRwithAPSR:$Rt, p_imm:$cop, imm0_7:$opc1, c_imm:$CRn,
c_imm:$CRm, 0, pred:$p)>;
@@ -5718,7 +5813,7 @@ def : ARMPat<(ARMthread_pointer), (MRC 15, 0, 13, 0, 3)>,
// when we get here from a longjmp(). We force everything out of registers
// except for our own input by listing the relevant registers in Defs. By
// doing so, we also cause the prologue/epilogue code to actively preserve
-// all of the callee-saved resgisters, which is exactly what we want.
+// all of the callee-saved registers, which is exactly what we want.
// A constant value is passed in $val, and we use the location as a scratch.
//
// These are pseudo-instructions and are lowered to individual MC-insts, so
@@ -6003,6 +6098,12 @@ include "ARMInstrNEON.td"
include "ARMInstrMVE.td"
//===----------------------------------------------------------------------===//
+// CDE (Custom Datapath Extension)
+//
+
+include "ARMInstrCDE.td"
+
+//===----------------------------------------------------------------------===//
// Assembler aliases
//
diff --git a/llvm/lib/Target/ARM/ARMInstrMVE.td b/llvm/lib/Target/ARM/ARMInstrMVE.td
index 604291be822c..2a1f50d97e3b 100644
--- a/llvm/lib/Target/ARM/ARMInstrMVE.td
+++ b/llvm/lib/Target/ARM/ARMInstrMVE.td
@@ -10,44 +10,6 @@
//
//===----------------------------------------------------------------------===//
-class ExpandImmAsmOp<string shift> : AsmOperandClass {
- let Name = !strconcat("ExpandImm", shift);
- let PredicateMethod = !strconcat("isExpImm<", shift, ">");
- let RenderMethod = "addImmOperands";
-}
-class InvertedExpandImmAsmOp<string shift, string size> : AsmOperandClass {
- let Name = !strconcat("InvertedExpandImm", shift, "_", size);
- let PredicateMethod = !strconcat("isInvertedExpImm<", shift, ",", size, ">");
- let RenderMethod = "addImmOperands";
-}
-
-class ExpandImm<string shift> : Operand<i32> {
- let ParserMatchClass = ExpandImmAsmOp<shift>;
- let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",false>");
- let DecoderMethod = !strconcat("DecodeExpandedImmOperand<",shift,">");
- let PrintMethod = "printExpandedImmOperand";
-}
-class InvertedExpandImm<string shift, string size> : Operand<i32> {
- let ParserMatchClass = InvertedExpandImmAsmOp<shift, size>;
- let EncoderMethod = !strconcat("getExpandedImmOpValue<",shift,",true>");
- let PrintMethod = "printExpandedImmOperand";
- // No decoder method needed, because this operand type is only used
- // by aliases (VAND and VORN)
-}
-
-def expzero00 : ExpandImm<"0">;
-def expzero08 : ExpandImm<"8">;
-def expzero16 : ExpandImm<"16">;
-def expzero24 : ExpandImm<"24">;
-
-def expzero00inv16 : InvertedExpandImm<"0", "16">;
-def expzero08inv16 : InvertedExpandImm<"8", "16">;
-
-def expzero00inv32 : InvertedExpandImm<"0", "32">;
-def expzero08inv32 : InvertedExpandImm<"8", "32">;
-def expzero16inv32 : InvertedExpandImm<"16", "32">;
-def expzero24inv32 : InvertedExpandImm<"24", "32">;
-
// VPT condition mask
def vpt_mask : Operand<i32> {
let PrintMethod = "printVPTMask";
@@ -277,7 +239,8 @@ class mve_addr_q_shift<int shift> : MemOperand {
// A family of classes wrapping up information about the vector types
// used by MVE.
-class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
+class MVEVectorVTInfo<ValueType vec, ValueType dblvec,
+ ValueType pred, ValueType dblpred,
bits<2> size, string suffixletter, bit unsigned> {
// The LLVM ValueType representing the vector, so we can use it in
// ISel patterns.
@@ -300,6 +263,9 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
// directly.
ValueType Pred = pred;
+ // Same as Pred but for DblVec rather than Vec.
+ ValueType DblPred = dblpred;
+
// The most common representation of the vector element size in MVE
// instruction encodings: a 2-bit value V representing an (8<<V)-bit
// vector element.
@@ -319,38 +285,38 @@ class MVEVectorVTInfo<ValueType vec, ValueType dblvec, ValueType pred,
!cast<string>(LaneBits));
// The suffix used on an instruction that mentions the whole type.
- string Suffix = suffixletter ## BitsSuffix;
+ string Suffix = suffixletter # BitsSuffix;
// The letter part of the suffix only.
string SuffixLetter = suffixletter;
}
// Integer vector types that don't treat signed and unsigned differently.
-def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "i", ?>;
-def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "i", ?>;
-def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "i", ?>;
-def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "i", ?>;
+def MVE_v16i8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "i", ?>;
+def MVE_v8i16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "i", ?>;
+def MVE_v4i32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "i", ?>;
+def MVE_v2i64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "i", ?>;
// Explicitly signed and unsigned integer vectors. They map to the
// same set of LLVM ValueTypes as above, but are represented
// differently in assembly and instruction encodings.
-def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "s", 0b0>;
-def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "s", 0b0>;
-def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "s", 0b0>;
-def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "s", 0b0>;
-def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b00, "u", 0b1>;
-def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b01, "u", 0b1>;
-def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, 0b10, "u", 0b1>;
-def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, 0b11, "u", 0b1>;
+def MVE_v16s8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "s", 0b0>;
+def MVE_v8s16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "s", 0b0>;
+def MVE_v4s32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "s", 0b0>;
+def MVE_v2s64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "s", 0b0>;
+def MVE_v16u8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b00, "u", 0b1>;
+def MVE_v8u16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b01, "u", 0b1>;
+def MVE_v4u32 : MVEVectorVTInfo<v4i32, v2i64, v4i1, v4i1, 0b10, "u", 0b1>;
+def MVE_v2u64 : MVEVectorVTInfo<v2i64, ?, v4i1, ?, 0b11, "u", 0b1>;
// FP vector types.
-def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, 0b01, "f", ?>;
-def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, 0b10, "f", ?>;
-def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, 0b11, "f", ?>;
+def MVE_v8f16 : MVEVectorVTInfo<v8f16, v4f32, v8i1, v4i1, 0b01, "f", ?>;
+def MVE_v4f32 : MVEVectorVTInfo<v4f32, v2f64, v4i1, v4i1, 0b10, "f", ?>;
+def MVE_v2f64 : MVEVectorVTInfo<v2f64, ?, v4i1, ?, 0b11, "f", ?>;
// Polynomial vector types.
-def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, 0b11, "p", 0b0>;
-def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, 0b11, "p", 0b1>;
+def MVE_v16p8 : MVEVectorVTInfo<v16i8, v8i16, v16i1, v8i1, 0b11, "p", 0b0>;
+def MVE_v8p16 : MVEVectorVTInfo<v8i16, v4i32, v8i1, v4i1, 0b11, "p", 0b1>;
// --------- Start of base classes for the instructions themselves
@@ -473,6 +439,8 @@ class MVE_ScalarShiftDoubleReg<string iname, dag iops, string asm,
let Inst{19-17} = RdaLo{3-1};
let Inst{11-9} = RdaHi{3-1};
+
+ let hasSideEffects = 0;
}
class MVE_ScalarShiftDRegImm<string iname, bits<2> op5_4, bit op16,
@@ -590,6 +558,7 @@ class MVE_VABAV<string suffix, bit U, bits<2> size>
let Inst{5} = Qm{3};
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b1;
+ let horizontalReduction = 1;
}
multiclass MVE_VABAV_m<MVEVectorVTInfo VTI> {
@@ -639,38 +608,63 @@ class MVE_VADDV<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let horizontalReduction = 1;
+ let validForTailPredication = 1;
}
-multiclass MVE_VADDV_A<string suffix, bit U, bits<2> size,
- list<dag> pattern=[]> {
- def acc : MVE_VADDV<"vaddva", suffix,
+def ARMVADDVs : SDNode<"ARMISD::VADDVs", SDTVecReduce>;
+def ARMVADDVu : SDNode<"ARMISD::VADDVu", SDTVecReduce>;
+
+multiclass MVE_VADDV_A<MVEVectorVTInfo VTI> {
+ def acc : MVE_VADDV<"vaddva", VTI.Suffix,
(ins tGPREven:$Rda_src, MQPR:$Qm), "$Rda = $Rda_src",
- 0b1, U, size, pattern>;
- def no_acc : MVE_VADDV<"vaddv", suffix,
+ 0b1, VTI.Unsigned, VTI.Size>;
+ def no_acc : MVE_VADDV<"vaddv", VTI.Suffix,
(ins MQPR:$Qm), "",
- 0b0, U, size, pattern>;
-}
+ 0b0, VTI.Unsigned, VTI.Size>;
-defm MVE_VADDVs8 : MVE_VADDV_A<"s8", 0b0, 0b00>;
-defm MVE_VADDVs16 : MVE_VADDV_A<"s16", 0b0, 0b01>;
-defm MVE_VADDVs32 : MVE_VADDV_A<"s32", 0b0, 0b10>;
-defm MVE_VADDVu8 : MVE_VADDV_A<"u8", 0b1, 0b00>;
-defm MVE_VADDVu16 : MVE_VADDV_A<"u16", 0b1, 0b01>;
-defm MVE_VADDVu32 : MVE_VADDV_A<"u32", 0b1, 0b10>;
+ defvar InstA = !cast<Instruction>(NAME # "acc");
+ defvar InstN = !cast<Instruction>(NAME # "no_acc");
-let Predicates = [HasMVEInt] in {
- def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$src))), (i32 (MVE_VADDVu32no_acc $src))>;
- def : Pat<(i32 (vecreduce_add (v8i16 MQPR:$src))), (i32 (MVE_VADDVu16no_acc $src))>;
- def : Pat<(i32 (vecreduce_add (v16i8 MQPR:$src))), (i32 (MVE_VADDVu8no_acc $src))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v4i32 MQPR:$src1))), (i32 tGPR:$src2))),
- (i32 (MVE_VADDVu32acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v8i16 MQPR:$src1))), (i32 tGPR:$src2))),
- (i32 (MVE_VADDVu16acc $src2, $src1))>;
- def : Pat<(i32 (add (i32 (vecreduce_add (v16i8 MQPR:$src1))), (i32 tGPR:$src2))),
- (i32 (MVE_VADDVu8acc $src2, $src1))>;
+ let Predicates = [HasMVEInt] in {
+ if VTI.Unsigned then {
+ def : Pat<(i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVu (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ } else {
+ def : Pat<(i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+ (i32 (InstN $vec))>;
+ def : Pat<(i32 (add (i32 (ARMVADDVs (VTI.Vec MQPR:$vec))),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec))>;
+ }
+ def : Pat<(i32 (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+ (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$pred))),
+ (i32 (InstN $vec, ARMVCCThen, $pred))>;
+ def : Pat<(i32 (add (int_arm_mve_addv_predicated (VTI.Vec MQPR:$vec),
+ (i32 VTI.Unsigned),
+ (VTI.Pred VCCR:$pred)),
+ (i32 tGPREven:$acc))),
+ (i32 (InstA $acc, $vec, ARMVCCThen, $pred))>;
+ }
}
+defm MVE_VADDVs8 : MVE_VADDV_A<MVE_v16s8>;
+defm MVE_VADDVs16 : MVE_VADDV_A<MVE_v8s16>;
+defm MVE_VADDVs32 : MVE_VADDV_A<MVE_v4s32>;
+defm MVE_VADDVu8 : MVE_VADDV_A<MVE_v16u8>;
+defm MVE_VADDVu16 : MVE_VADDV_A<MVE_v8u16>;
+defm MVE_VADDVu32 : MVE_VADDV_A<MVE_v4u32>;
+
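For reference (a hand-expanded sketch, not in the patch), one of the rules that MVE_VADDV_A<MVE_v4u32> generates is the plain unsigned reduction, mapping the generic vecreduce_add node onto the non-accumulating instruction:

// Expansion of the vecreduce_add pattern for VTI = MVE_v4u32.
def : Pat<(i32 (vecreduce_add (v4i32 MQPR:$vec))),
          (i32 (MVE_VADDVu32no_acc $vec))>;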
class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
bit A, bit U, list<dag> pattern=[]>
: MVE_rDest<(outs tGPREven:$RdaLo, tGPROdd:$RdaHi), iops, NoItinerary, iname,
@@ -689,21 +683,58 @@ class MVE_VADDLV<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
-}
-
-multiclass MVE_VADDLV_A<string suffix, bit U, list<dag> pattern=[]> {
- def acc : MVE_VADDLV<"vaddlva", suffix,
+ let horizontalReduction = 1;
+}
+
+def SDTVecReduceL : SDTypeProfile<2, 1, [ // VADDLV
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>
+]>;
+def SDTVecReduceLA : SDTypeProfile<2, 3, [ // VADDLVA
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>
+]>;
+def SDTVecReduceLP : SDTypeProfile<2, 2, [ // VADDLVp
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<2>
+]>;
+def SDTVecReduceLPA : SDTypeProfile<2, 4, [ // VADDLVAp
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>, SDTCisVec<5>
+]>;
+
+multiclass MVE_VADDLV_A<MVEVectorVTInfo VTI> {
+ def acc : MVE_VADDLV<"vaddlva", VTI.Suffix,
(ins tGPREven:$RdaLo_src, tGPROdd:$RdaHi_src, MQPR:$Qm),
"$RdaLo = $RdaLo_src,$RdaHi = $RdaHi_src",
- 0b1, U, pattern>;
- def no_acc : MVE_VADDLV<"vaddlv", suffix,
+ 0b1, VTI.Unsigned>;
+ def no_acc : MVE_VADDLV<"vaddlv", VTI.Suffix,
(ins MQPR:$Qm), "",
- 0b0, U, pattern>;
-}
+ 0b0, VTI.Unsigned>;
+
+ defvar InstA = !cast<Instruction>(NAME # "acc");
+ defvar InstN = !cast<Instruction>(NAME # "no_acc");
+ defvar letter = VTI.SuffixLetter;
+ defvar ARMVADDLV = SDNode<"ARMISD::VADDLV" # letter, SDTVecReduceL>;
+ defvar ARMVADDLVA = SDNode<"ARMISD::VADDLVA" # letter, SDTVecReduceLA>;
+ defvar ARMVADDLVp = SDNode<"ARMISD::VADDLVp" # letter, SDTVecReduceLP>;
+ defvar ARMVADDLVAp = SDNode<"ARMISD::VADDLVAp" # letter, SDTVecReduceLPA>;
-defm MVE_VADDLVs32 : MVE_VADDLV_A<"s32", 0b0>;
-defm MVE_VADDLVu32 : MVE_VADDLV_A<"u32", 0b1>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(ARMVADDLV (v4i32 MQPR:$vec)),
+ (InstN (v4i32 MQPR:$vec))>;
+ def : Pat<(ARMVADDLVA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec)),
+ (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec))>;
+ def : Pat<(ARMVADDLVp (v4i32 MQPR:$vec), (VTI.Pred VCCR:$pred)),
+ (InstN (v4i32 MQPR:$vec), ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ def : Pat<(ARMVADDLVAp tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
+ (VTI.Pred VCCR:$pred)),
+ (InstA tGPREven:$acclo, tGPROdd:$acchi, (v4i32 MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred))>;
+ }
+}
+
+defm MVE_VADDLVs32 : MVE_VADDLV_A<MVE_v4s32>;
+defm MVE_VADDLVu32 : MVE_VADDLV_A<MVE_v4u32>;
class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
bit bit_17, bit bit_7, list<dag> pattern=[]>
@@ -724,25 +755,48 @@ class MVE_VMINMAXNMV<string iname, string suffix, bit sz,
let Inst{6-5} = 0b00;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let horizontalReduction = 1;
let Predicates = [HasMVEFloat];
+ let hasSideEffects = 0;
}
-multiclass MVE_VMINMAXNMV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
- def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b1, bit_7, pattern>;
- def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b1, bit_7, pattern>;
-}
+multiclass MVE_VMINMAXNMV_p<string iname, bit notAbs, bit isMin,
+ MVEVectorVTInfo VTI, string intrBaseName,
+ ValueType Scalar, RegisterClass ScalarReg> {
+ def "": MVE_VMINMAXNMV<iname, VTI.Suffix, VTI.Size{0}, notAbs, isMin>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+ defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated");
-defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 0b1>;
-defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 0b0>;
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(Scalar (unpred_intr (Scalar ScalarReg:$prev),
+ (VTI.Vec MQPR:$vec))),
+ (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+ (VTI.Vec MQPR:$vec)),
+ ScalarReg)>;
+ def : Pat<(Scalar (pred_intr (Scalar ScalarReg:$prev),
+ (VTI.Vec MQPR:$vec),
+ (VTI.Pred VCCR:$pred))),
+ (COPY_TO_REGCLASS (Inst (COPY_TO_REGCLASS ScalarReg:$prev, rGPR),
+ (VTI.Vec MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred)),
+ ScalarReg)>;
+ }
+}
-multiclass MVE_VMINMAXNMAV_fty<string iname, bit bit_7, list<dag> pattern=[]> {
- def f32 : MVE_VMINMAXNMV<iname, "f32", 0b0, 0b0, bit_7, pattern>;
- def f16 : MVE_VMINMAXNMV<iname, "f16", 0b1, 0b0, bit_7, pattern>;
+multiclass MVE_VMINMAXNMV_fty<string iname, bit notAbs, bit isMin,
+ string intrBase> {
+ defm f32 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v4f32, intrBase,
+ f32, SPR>;
+ defm f16 : MVE_VMINMAXNMV_p<iname, notAbs, isMin, MVE_v8f16, intrBase,
+ f16, HPR>;
}
-defm MVE_VMINNMAV : MVE_VMINMAXNMAV_fty<"vminnmav", 0b1>;
-defm MVE_VMAXNMAV : MVE_VMINMAXNMAV_fty<"vmaxnmav", 0b0>;
+defm MVE_VMINNMV : MVE_VMINMAXNMV_fty<"vminnmv", 1, 1, "int_arm_mve_minnmv">;
+defm MVE_VMAXNMV : MVE_VMINMAXNMV_fty<"vmaxnmv", 1, 0, "int_arm_mve_maxnmv">;
+defm MVE_VMINNMAV: MVE_VMINMAXNMV_fty<"vminnmav", 0, 1, "int_arm_mve_minnmav">;
+defm MVE_VMAXNMAV: MVE_VMINMAXNMV_fty<"vmaxnmav", 0, 0, "int_arm_mve_maxnmav">;
class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
bit bit_17, bit bit_7, list<dag> pattern=[]>
@@ -762,33 +816,40 @@ class MVE_VMINMAXV<string iname, string suffix, bit U, bits<2> size,
let Inst{6-5} = 0b00;
let Inst{3-1} = Qm{2-0};
let Inst{0} = 0b0;
+ let horizontalReduction = 1;
}
-multiclass MVE_VMINMAXV_p<string iname, bit bit_17, bit bit_7,
- MVEVectorVTInfo VTI, Intrinsic intr> {
+multiclass MVE_VMINMAXV_p<string iname, bit notAbs, bit isMin,
+ MVEVectorVTInfo VTI, string intrBaseName> {
def "": MVE_VMINMAXV<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
- bit_17, bit_7>;
- defvar Inst = !cast<Instruction>(NAME);
+ notAbs, isMin>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar unpred_intr = !cast<Intrinsic>(intrBaseName);
+ defvar pred_intr = !cast<Intrinsic>(intrBaseName#"_predicated");
+ defvar base_args = (? (i32 rGPR:$prev), (VTI.Vec MQPR:$vec));
+ defvar args = !if(notAbs, !con(base_args, (? (i32 VTI.Unsigned))),
+ base_args);
- let Predicates = [HasMVEInt] in
- def _pat : Pat<(i32 (intr (i32 rGPR:$prev), (VTI.Vec MQPR:$vec))),
- (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 !con(args, (unpred_intr))),
+ (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec)))>;
+ def : Pat<(i32 !con(args, (pred_intr (VTI.Pred VCCR:$pred)))),
+ (i32 (Inst (i32 rGPR:$prev), (VTI.Vec MQPR:$vec),
+ ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ }
}
-multiclass MVE_VMINMAXV_ty<string iname, bit bit_7,
- Intrinsic intr_s, Intrinsic intr_u> {
- defm s8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16s8, intr_s>;
- defm s16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8s16, intr_s>;
- defm s32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4s32, intr_s>;
- defm u8 : MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v16u8, intr_u>;
- defm u16: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v8u16, intr_u>;
- defm u32: MVE_VMINMAXV_p<iname, 1, bit_7, MVE_v4u32, intr_u>;
+multiclass MVE_VMINMAXV_ty<string iname, bit isMin, string intrBaseName> {
+ defm s8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16s8, intrBaseName>;
+ defm s16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8s16, intrBaseName>;
+ defm s32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4s32, intrBaseName>;
+ defm u8 : MVE_VMINMAXV_p<iname, 1, isMin, MVE_v16u8, intrBaseName>;
+ defm u16: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v8u16, intrBaseName>;
+ defm u32: MVE_VMINMAXV_p<iname, 1, isMin, MVE_v4u32, intrBaseName>;
}
-defm MVE_VMINV : MVE_VMINMAXV_ty<
- "vminv", 0b1, int_arm_mve_minv_s, int_arm_mve_minv_u>;
-defm MVE_VMAXV : MVE_VMINMAXV_ty<
- "vmaxv", 0b0, int_arm_mve_maxv_s, int_arm_mve_maxv_u>;
+defm MVE_VMINV : MVE_VMINMAXV_ty<"vminv", 1, "int_arm_mve_minv">;
+defm MVE_VMAXV : MVE_VMINMAXV_ty<"vmaxv", 0, "int_arm_mve_maxv">;
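To make the !con construction above concrete (a hand-expanded sketch, not from the patch): base_args uses the '?' wildcard operator, so concatenating it onto the intrinsic node gives, for the s8 instantiation of vminv (where VTI.Unsigned is 0), roughly:

// Approximate unpredicated pattern generated for defm MVE_VMINV, s8 size.
def : Pat<(i32 (int_arm_mve_minv (i32 rGPR:$prev), (v16i8 MQPR:$vec), (i32 0))),
          (i32 (MVE_VMINVs8 (i32 rGPR:$prev), (v16i8 MQPR:$vec)))>;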
let Predicates = [HasMVEInt] in {
def : Pat<(i32 (vecreduce_smax (v16i8 MQPR:$src))),
@@ -819,14 +880,14 @@ let Predicates = [HasMVEInt] in {
}
-multiclass MVE_VMINMAXAV_ty<string iname, bit bit_7, list<dag> pattern=[]> {
- def s8 : MVE_VMINMAXV<iname, "s8", 0b0, 0b00, 0b0, bit_7>;
- def s16 : MVE_VMINMAXV<iname, "s16", 0b0, 0b01, 0b0, bit_7>;
- def s32 : MVE_VMINMAXV<iname, "s32", 0b0, 0b10, 0b0, bit_7>;
+multiclass MVE_VMINMAXAV_ty<string iname, bit isMin, string intrBaseName> {
+ defm s8 : MVE_VMINMAXV_p<iname, 0, isMin, MVE_v16s8, intrBaseName>;
+ defm s16: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v8s16, intrBaseName>;
+ defm s32: MVE_VMINMAXV_p<iname, 0, isMin, MVE_v4s32, intrBaseName>;
}
-defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 0b1>;
-defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0b0>;
+defm MVE_VMINAV : MVE_VMINMAXAV_ty<"vminav", 1, "int_arm_mve_minav">;
+defm MVE_VMAXAV : MVE_VMINMAXAV_ty<"vmaxav", 0, "int_arm_mve_maxav">;
class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
bit sz, bit bit_28, bit A, bit X, bit bit_8, bit bit_0>
@@ -847,6 +908,12 @@ class MVE_VMLAMLSDAV<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = bit_0;
+ let horizontalReduction = 1;
+ // Allow tail predication for non-exchanging versions. As this is also a
+ // horizontalReduction, ARMLowOverheadLoops will also have to check that
+ // the vector operands contain zeros in their false lanes for the instruction
+ // to be properly valid.
+ let validForTailPredication = !eq(X, 0);
}
multiclass MVE_VMLAMLSDAV_A<string iname, string x, MVEVectorVTInfo VTI,
@@ -932,6 +999,58 @@ defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v16s8, 0b0, 0b1>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v8s16, 0b0, 0b0>;
defm MVE_VMLSDAV : MVE_VMLSDAV_multi<MVE_v4s32, 0b1, 0b0>;
+def SDTVecReduce2 : SDTypeProfile<1, 2, [ // VMLAV
+ SDTCisInt<0>, SDTCisVec<1>, SDTCisVec<2>
+]>;
+def SDTVecReduce2L : SDTypeProfile<2, 2, [ // VMLALV
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisVec<2>, SDTCisVec<3>
+]>;
+def SDTVecReduce2LA : SDTypeProfile<2, 4, [ // VMLALVA
+ SDTCisInt<0>, SDTCisInt<1>, SDTCisInt<2>, SDTCisInt<3>,
+ SDTCisVec<4>, SDTCisVec<5>
+]>;
+def ARMVMLAVs : SDNode<"ARMISD::VMLAVs", SDTVecReduce2>;
+def ARMVMLAVu : SDNode<"ARMISD::VMLAVu", SDTVecReduce2>;
+def ARMVMLALVs : SDNode<"ARMISD::VMLALVs", SDTVecReduce2L>;
+def ARMVMLALVu : SDNode<"ARMISD::VMLALVu", SDTVecReduce2L>;
+def ARMVMLALVAs : SDNode<"ARMISD::VMLALVAs", SDTVecReduce2LA>;
+def ARMVMLALVAu : SDNode<"ARMISD::VMLALVAu", SDTVecReduce2LA>;
+
+let Predicates = [HasMVEInt] in {
+ def : Pat<(i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
+ (i32 (MVE_VMLADAVu32 $src1, $src2))>;
+ def : Pat<(i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
+ (i32 (MVE_VMLADAVu16 $src1, $src2))>;
+ def : Pat<(i32 (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (i32 (MVE_VMLADAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))),
+ (i32 (MVE_VMLADAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
+ (i32 (MVE_VMLADAVu8 $src1, $src2))>;
+ def : Pat<(i32 (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (i32 (MVE_VMLADAVs8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(i32 (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2))),
+ (i32 (MVE_VMLADAVu8 (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+
+ def : Pat<(i32 (add (i32 (vecreduce_add (mul (v4i32 MQPR:$src1), (v4i32 MQPR:$src2)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau32 $src3, $src1, $src2))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (mul (v8i16 MQPR:$src1), (v8i16 MQPR:$src2)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau16 $src3, $src1, $src2))>;
+ def : Pat<(i32 (add (ARMVMLAVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVas16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (add (ARMVMLAVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVau16 tGPREven:$Rd, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)))>;
+ def : Pat<(i32 (add (i32 (vecreduce_add (mul (v16i8 MQPR:$src1), (v16i8 MQPR:$src2)))),
+ (i32 tGPREven:$src3))),
+ (i32 (MVE_VMLADAVau8 $src3, $src1, $src2))>;
+ def : Pat<(i32 (add (ARMVMLAVs (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVas8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+ def : Pat<(i32 (add (ARMVMLAVu (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)), tGPREven:$Rd)),
+ (i32 (MVE_VMLADAVau8 tGPREven:$Rd, (v16i8 MQPR:$val1), (v16i8 MQPR:$val2)))>;
+}
+
// vmlav aliases vmladav
foreach acc = ["", "a"] in {
foreach suffix = ["s8", "s16", "s32", "u8", "u16", "u32"] in {
@@ -963,6 +1082,14 @@ class MVE_VMLALDAVBase<string iname, string suffix, dag iops, string cstr,
let Inst{5} = A;
let Inst{3-1} = Qm{2-0};
let Inst{0} = bit_0;
+ let horizontalReduction = 1;
+ // Allow tail predication for non-exchanging versions. As this is also a
+ // horizontalReduction, ARMLowOverheadLoops will also have to check that
+ // the vector operands contain zeros in their false lanes for the instruction
+ // to be properly valid.
+ let validForTailPredication = !eq(X, 0);
+
+ let hasSideEffects = 0;
}
multiclass MVE_VMLALDAVBase_A<string iname, string x, string suffix,
@@ -1023,6 +1150,26 @@ multiclass MVE_VMLALDAV_multi<string suffix, bit sz, list<dag> pattern=[]> {
defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"16", 0b0>;
defm MVE_VMLALDAV : MVE_VMLALDAV_multi<"32", 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(ARMVMLALVs (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVs32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVu (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVu32 (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVs (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVs16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVu (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVu16 (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+
+ def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVas32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2)),
+ (MVE_VMLALDAVau32 tGPREven:$Rda, tGPROdd:$Rdb, (v4i32 MQPR:$val1), (v4i32 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVAs tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVas16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+ def : Pat<(ARMVMLALVAu tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2)),
+ (MVE_VMLALDAVau16 tGPREven:$Rda, tGPROdd:$Rdb, (v8i16 MQPR:$val1), (v8i16 MQPR:$val2))>;
+}
+
// vmlalv aliases vmlaldav
foreach acc = ["", "a"] in {
foreach suffix = ["s16", "s32", "u16", "u32"] in {
@@ -1244,28 +1391,29 @@ let Predicates = [HasMVEInt] in {
(v4i32 (MVE_VREV32_8 (v4i32 MQPR:$src)))>;
}
-let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (ARMvrev64 (v4i32 MQPR:$src))),
- (v4i32 (MVE_VREV64_32 (v4i32 MQPR:$src)))>;
- def : Pat<(v8i16 (ARMvrev64 (v8i16 MQPR:$src))),
- (v8i16 (MVE_VREV64_16 (v8i16 MQPR:$src)))>;
- def : Pat<(v16i8 (ARMvrev64 (v16i8 MQPR:$src))),
- (v16i8 (MVE_VREV64_8 (v16i8 MQPR:$src)))>;
+multiclass MVE_VREV_basic_patterns<int revbits, list<MVEVectorVTInfo> VTIs,
+ Instruction Inst> {
+ defvar unpred_op = !cast<SDNode>("ARMvrev" # revbits);
- def : Pat<(v8i16 (ARMvrev32 (v8i16 MQPR:$src))),
- (v8i16 (MVE_VREV32_16 (v8i16 MQPR:$src)))>;
- def : Pat<(v16i8 (ARMvrev32 (v16i8 MQPR:$src))),
- (v16i8 (MVE_VREV32_8 (v16i8 MQPR:$src)))>;
+ foreach VTI = VTIs in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$src))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src)))>;
+ def : Pat<(VTI.Vec (int_arm_mve_vrev_predicated (VTI.Vec MQPR:$src),
+ revbits, (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+let Predicates = [HasMVEInt] in {
+ defm: MVE_VREV_basic_patterns<64, [MVE_v4i32, MVE_v4f32], MVE_VREV64_32>;
+ defm: MVE_VREV_basic_patterns<64, [MVE_v8i16, MVE_v8f16], MVE_VREV64_16>;
+ defm: MVE_VREV_basic_patterns<64, [MVE_v16i8 ], MVE_VREV64_8>;
- def : Pat<(v16i8 (ARMvrev16 (v16i8 MQPR:$src))),
- (v16i8 (MVE_VREV16_8 (v16i8 MQPR:$src)))>;
+ defm: MVE_VREV_basic_patterns<32, [MVE_v8i16, MVE_v8f16], MVE_VREV32_16>;
+ defm: MVE_VREV_basic_patterns<32, [MVE_v16i8 ], MVE_VREV32_8>;
- def : Pat<(v4f32 (ARMvrev64 (v4f32 MQPR:$src))),
- (v4f32 (MVE_VREV64_32 (v4f32 MQPR:$src)))>;
- def : Pat<(v8f16 (ARMvrev64 (v8f16 MQPR:$src))),
- (v8f16 (MVE_VREV64_16 (v8f16 MQPR:$src)))>;
- def : Pat<(v8f16 (ARMvrev32 (v8f16 MQPR:$src))),
- (v8f16 (MVE_VREV32_16 (v8f16 MQPR:$src)))>;
+ defm: MVE_VREV_basic_patterns<16, [MVE_v16i8 ], MVE_VREV16_8>;
}
def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
@@ -1280,14 +1428,14 @@ def MVE_VMVN : MVE_bit_arith<(outs MQPR:$Qd), (ins MQPR:$Qm),
}
let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (vnotq (v16i8 MQPR:$val1))),
- (v16i8 (MVE_VMVN (v16i8 MQPR:$val1)))>;
- def : Pat<(v8i16 (vnotq (v8i16 MQPR:$val1))),
- (v8i16 (MVE_VMVN (v8i16 MQPR:$val1)))>;
- def : Pat<(v4i32 (vnotq (v4i32 MQPR:$val1))),
- (v4i32 (MVE_VMVN (v4i32 MQPR:$val1)))>;
- def : Pat<(v2i64 (vnotq (v2i64 MQPR:$val1))),
- (v2i64 (MVE_VMVN (v2i64 MQPR:$val1)))>;
+ foreach VTI = [ MVE_v16i8, MVE_v8i16, MVE_v4i32, MVE_v2i64 ] in {
+ def : Pat<(VTI.Vec (vnotq (VTI.Vec MQPR:$val1))),
+ (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1)))>;
+ def : Pat<(VTI.Vec (int_arm_mve_mvn_predicated (VTI.Vec MQPR:$val1),
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (MVE_VMVN (VTI.Vec MQPR:$val1), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
}
class MVE_bit_ops<string iname, bits<2> bit_21_20, bit bit_28>
@@ -1383,10 +1531,10 @@ defm : MVE_bit_op_with_inv<MVE_v8i16, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v4i32, or, int_arm_mve_orn_predicated, MVE_VORN>;
defm : MVE_bit_op_with_inv<MVE_v2i64, or, int_arm_mve_orn_predicated, MVE_VORN>;
-class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
+class MVE_bit_cmode<string iname, string suffix, bit halfword, dag inOps>
: MVE_p<(outs MQPR:$Qd), inOps, NoItinerary,
iname, suffix, "$Qd, $imm", vpred_n, "$Qd = $Qd_src"> {
- bits<8> imm;
+ bits<12> imm;
bits<4> Qd;
let Inst{28} = imm{7};
@@ -1396,66 +1544,59 @@ class MVE_bit_cmode<string iname, string suffix, bits<4> cmode, dag inOps>
let Inst{18-16} = imm{6-4};
let Inst{15-13} = Qd{2-0};
let Inst{12} = 0b0;
- let Inst{11-8} = cmode;
+ let Inst{11} = halfword;
+ let Inst{10} = !if(halfword, 0, imm{10});
+ let Inst{9} = imm{9};
+ let Inst{8} = 0b1;
let Inst{7-6} = 0b01;
let Inst{4} = 0b1;
let Inst{3-0} = imm{3-0};
}
-class MVE_VORR<string suffix, bits<4> cmode, ExpandImm imm_type>
- : MVE_bit_cmode<"vorr", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
- let Inst{5} = 0b0;
- let validForTailPredication = 1;
-}
+multiclass MVE_bit_cmode_p<string iname, bit opcode,
+ MVEVectorVTInfo VTI, Operand imm_type, SDNode op> {
+ def "" : MVE_bit_cmode<iname, VTI.Suffix, VTI.Size{0},
+ (ins MQPR:$Qd_src, imm_type:$imm)> {
+ let Inst{5} = opcode;
+ let validForTailPredication = 1;
+ }
-def MVE_VORRIZ0v4i32 : MVE_VORR<"i32", 0b0001, expzero00>;
-def MVE_VORRIZ0v8i16 : MVE_VORR<"i16", 0b1001, expzero00>;
-def MVE_VORRIZ8v4i32 : MVE_VORR<"i32", 0b0011, expzero08>;
-def MVE_VORRIZ8v8i16 : MVE_VORR<"i16", 0b1011, expzero08>;
-def MVE_VORRIZ16v4i32 : MVE_VORR<"i32", 0b0101, expzero16>;
-def MVE_VORRIZ24v4i32 : MVE_VORR<"i32", 0b0111, expzero24>;
-
-def MVE_VORNIZ0v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ0v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ8v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ8v8i16 : MVEAsmPseudo<"vorn${vp}.i16\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ16v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
-def MVE_VORNIZ24v4i32 : MVEAsmPseudo<"vorn${vp}.i32\t$Qd, $imm",
- (ins MQPR:$Qd_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qd)>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar UnpredPat = (VTI.Vec (op (VTI.Vec MQPR:$src), timm:$simm));
-def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
- (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<UnpredPat, (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm))>;
+ def : Pat<(VTI.Vec (vselect (VTI.Pred VCCR:$pred),
+ UnpredPat, (VTI.Vec MQPR:$src))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$src), imm_type:$simm,
+ ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+ }
+}
-class MVE_VBIC<string suffix, bits<4> cmode, ExpandImm imm_type>
- : MVE_bit_cmode<"vbic", suffix, cmode, (ins MQPR:$Qd_src, imm_type:$imm)> {
- let Inst{5} = 0b1;
- let validForTailPredication = 1;
+multiclass MVE_VORRimm<MVEVectorVTInfo VTI, Operand imm_type> {
+ defm "": MVE_bit_cmode_p<"vorr", 0, VTI, imm_type, ARMvorrImm>;
+}
+multiclass MVE_VBICimm<MVEVectorVTInfo VTI, Operand imm_type> {
+ defm "": MVE_bit_cmode_p<"vbic", 1, VTI, imm_type, ARMvbicImm>;
}
-def MVE_VBICIZ0v4i32 : MVE_VBIC<"i32", 0b0001, expzero00>;
-def MVE_VBICIZ0v8i16 : MVE_VBIC<"i16", 0b1001, expzero00>;
-def MVE_VBICIZ8v4i32 : MVE_VBIC<"i32", 0b0011, expzero08>;
-def MVE_VBICIZ8v8i16 : MVE_VBIC<"i16", 0b1011, expzero08>;
-def MVE_VBICIZ16v4i32 : MVE_VBIC<"i32", 0b0101, expzero16>;
-def MVE_VBICIZ24v4i32 : MVE_VBIC<"i32", 0b0111, expzero24>;
-
-def MVE_VANDIZ0v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero00inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ0v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero00inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ8v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero08inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ8v8i16 : MVEAsmPseudo<"vand${vp}.i16\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero08inv16:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ16v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero16inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
-def MVE_VANDIZ24v4i32 : MVEAsmPseudo<"vand${vp}.i32\t$Qda, $imm",
- (ins MQPR:$Qda_src, expzero24inv32:$imm, vpred_n:$vp), (outs MQPR:$Qda)>;
+defm MVE_VORRimmi16 : MVE_VORRimm<MVE_v8i16, nImmSplatI16>;
+defm MVE_VORRimmi32 : MVE_VORRimm<MVE_v4i32, nImmSplatI32>;
+defm MVE_VBICimmi16 : MVE_VBICimm<MVE_v8i16, nImmSplatI16>;
+defm MVE_VBICimmi32 : MVE_VBICimm<MVE_v4i32, nImmSplatI32>;
+
+def MVE_VORNimmi16 : MVEInstAlias<"vorn${vp}.i16\t$Qd, $imm",
+ (MVE_VORRimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>;
+def MVE_VORNimmi32 : MVEInstAlias<"vorn${vp}.i32\t$Qd, $imm",
+ (MVE_VORRimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>;
+
+def MVE_VANDimmi16 : MVEInstAlias<"vand${vp}.i16\t$Qd, $imm",
+ (MVE_VBICimmi16 MQPR:$Qd, nImmSplatNotI16:$imm, vpred_n:$vp), 0>;
+def MVE_VANDimmi32 : MVEInstAlias<"vand${vp}.i32\t$Qd, $imm",
+ (MVE_VBICimmi32 MQPR:$Qd, nImmSplatNotI32:$imm, vpred_n:$vp), 0>;
+
+def MVE_VMOV : MVEInstAlias<"vmov${vp}\t$Qd, $Qm",
+ (MVE_VORR MQPR:$Qd, MQPR:$Qm, MQPR:$Qm, vpred_r:$vp)>;
class MVE_VMOV_lane_direction {
bit bit_20;
@@ -1494,6 +1635,8 @@ class MVE_VMOV_lane<string suffix, bit U, dag indexop,
let Inst{11-8} = 0b1011;
let Inst{7} = Qd{3};
let Inst{4-0} = 0b10000;
+
+ let hasSideEffects = 0;
}
class MVE_VMOV_lane_32<MVE_VMOV_lane_direction dir>
@@ -1557,10 +1700,14 @@ let Predicates = [HasMVEInt] in {
(MVE_VMOV_from_lane_s8 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlanes (v8i16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlanes (v8f16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_s16 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v16i8 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane)>;
def : Pat<(ARMvgetlaneu (v8i16 MQPR:$src), imm:$lane),
(MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
+ def : Pat<(ARMvgetlaneu (v8f16 MQPR:$src), imm:$lane),
+ (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane)>;
def : Pat<(v16i8 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_8 (v16i8 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
@@ -1575,8 +1722,8 @@ let Predicates = [HasMVEInt] in {
def : Pat<(insertelt (v4f32 MQPR:$src1), (f32 SPR:$src2), imm:$lane),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS MQPR:$src1, MQPR)), SPR:$src2, (SSubReg_f32_reg imm:$lane))>;
- def : Pat<(insertelt (v8f16 MQPR:$src1), HPR:$src2, imm:$lane),
- (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS HPR:$src2, rGPR), imm:$lane)>;
+ def : Pat<(insertelt (v8f16 MQPR:$src1), (f16 HPR:$src2), imm:$lane),
+ (MVE_VMOV_to_lane_16 MQPR:$src1, (COPY_TO_REGCLASS (f16 HPR:$src2), rGPR), imm:$lane)>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_even:$lane),
(EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_even:$lane))>;
def : Pat<(extractelt (v8f16 MQPR:$src), imm_odd:$lane),
@@ -1588,8 +1735,8 @@ let Predicates = [HasMVEInt] in {
(INSERT_SUBREG (v4f32 (IMPLICIT_DEF)), SPR:$src, ssub_0)>;
def : Pat<(v4f32 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_32 (v4f32 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
- def : Pat<(v8f16 (scalar_to_vector HPR:$src)),
- (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), HPR:$src, ssub_0)>;
+ def : Pat<(v8f16 (scalar_to_vector (f16 HPR:$src))),
+ (INSERT_SUBREG (v8f16 (IMPLICIT_DEF)), (f16 HPR:$src), ssub_0)>;
def : Pat<(v8f16 (scalar_to_vector GPR:$src)),
(MVE_VMOV_to_lane_16 (v8f16 (IMPLICIT_DEF)), rGPR:$src, (i32 0))>;
}
@@ -1882,6 +2029,26 @@ class MVE_VRHADD_Base<string suffix, bit U, bits<2> size, list<dag> pattern=[]>
let validForTailPredication = 1;
}
+def addnuw : PatFrag<(ops node:$lhs, node:$rhs),
+ (add node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def addnsw : PatFrag<(ops node:$lhs, node:$rhs),
+ (add node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoSignedWrap();
+}]>;
+
+def subnuw : PatFrag<(ops node:$lhs, node:$rhs),
+ (sub node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoUnsignedWrap();
+}]>;
+
+def subnsw : PatFrag<(ops node:$lhs, node:$rhs),
+ (sub node:$lhs, node:$rhs), [{
+ return N->getFlags().hasNoSignedWrap();
+}]>;
+
multiclass MVE_VRHADD_m<MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VRHADD_Base<VTI.Suffix, VTI.Unsigned, VTI.Size>;
@@ -1913,6 +2080,37 @@ defm MVE_VRHADDu8 : MVE_VRHADD<MVE_v16u8>;
defm MVE_VRHADDu16 : MVE_VRHADD<MVE_v8u16>;
defm MVE_VRHADDu32 : MVE_VRHADD<MVE_v4u32>;
+// Rounding Halving Add performs the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no-wrap forms of
+// add to ensure that the extra bit of information is not needed for the
+// arithmetic or the rounding.
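+//
+// A rough scalar sketch of the semantics described above, for the signed i8
+// case (the function name is illustrative only):
+//
+//   int8_t vrhadd_s8(int8_t a, int8_t b) {
+//     return (int8_t)(((int16_t)a + (int16_t)b + 1) >> 1); // widened, so no overflow
+//   }
+//
+// The patterns below instead match ((a + b) + rounding-bias splat) >> 1
+// computed in the element width, which only agrees with the widened form when
+// the adds are known not to wrap - hence the addnsw/addnuw forms.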
+def : Pat<(v16i8 (ARMvshrsImm (addnsw (addnsw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+ (v16i8 (ARMvmovImm (i32 3585)))),
+ (i32 1))),
+ (MVE_VRHADDs8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshrsImm (addnsw (addnsw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+ (v8i16 (ARMvmovImm (i32 2049)))),
+ (i32 1))),
+ (MVE_VRHADDs16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshrsImm (addnsw (addnsw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+ (v4i32 (ARMvmovImm (i32 1)))),
+ (i32 1))),
+ (MVE_VRHADDs32 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v16i8 (ARMvshruImm (addnuw (addnuw (v16i8 MQPR:$Qm), (v16i8 MQPR:$Qn)),
+ (v16i8 (ARMvmovImm (i32 3585)))),
+ (i32 1))),
+ (MVE_VRHADDu8 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v8i16 (ARMvshruImm (addnuw (addnuw (v8i16 MQPR:$Qm), (v8i16 MQPR:$Qn)),
+ (v8i16 (ARMvmovImm (i32 2049)))),
+ (i32 1))),
+ (MVE_VRHADDu16 MQPR:$Qm, MQPR:$Qn)>;
+def : Pat<(v4i32 (ARMvshruImm (addnuw (addnuw (v4i32 MQPR:$Qm), (v4i32 MQPR:$Qn)),
+ (v4i32 (ARMvmovImm (i32 1)))),
+ (i32 1))),
+ (MVE_VRHADDu32 MQPR:$Qm, MQPR:$Qn)>;
+
+
class MVE_VHADDSUB<string iname, string suffix, bit U, bit subtract,
bits<2> size, list<dag> pattern=[]>
: MVE_int<iname, suffix, size, pattern> {
@@ -1936,7 +2134,8 @@ class MVE_VHSUB_<string suffix, bit U, bits<2> size,
: MVE_VHADDSUB<"vhsub", suffix, U, 0b1, size, pattern>;
multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode unpred_op, Intrinsic pred_int, PatFrag add_op,
+ SDNode shift_op> {
def "" : MVE_VHADD_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
@@ -1945,6 +2144,9 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (shift_op (add_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
+ (Inst MQPR:$Qm, MQPR:$Qn)>;
+
// Predicated add-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn), (i32 VTI.Unsigned),
(VTI.Pred VCCR:$mask), (VTI.Vec MQPR:$inactive))),
@@ -1954,18 +2156,24 @@ multiclass MVE_VHADD_m<MVEVectorVTInfo VTI,
}
}
-multiclass MVE_VHADD<MVEVectorVTInfo VTI>
- : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated>;
+multiclass MVE_VHADD<MVEVectorVTInfo VTI, PatFrag add_op, SDNode shift_op>
+ : MVE_VHADD_m<VTI, int_arm_mve_vhadd, int_arm_mve_hadd_predicated, add_op,
+ shift_op>;
-defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8>;
-defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16>;
-defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32>;
-defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8>;
-defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16>;
-defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32>;
+// Halving add/sub perform the arithmetic operation with an extra bit of
+// precision, before performing the shift, to avoid clipping errors. We're not
+// modelling that here with these patterns, but we're using no-wrap forms of
+// add/sub to ensure that the extra bit of information is not needed.
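+// As a scalar sketch (signed i8 case, illustrative names):
+//   int8_t vhadd_s8(int8_t a, int8_t b) { return ((int16_t)a + (int16_t)b) >> 1; }
+//   int8_t vhsub_s8(int8_t a, int8_t b) { return ((int16_t)a - (int16_t)b) >> 1; }
+// The (shift_op (add_op/sub_op ...), 1) patterns can only stand in for this
+// when the narrow add/sub is known not to wrap, which is what the no-wrap
+// PatFrags above assert.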
+defm MVE_VHADDs8 : MVE_VHADD<MVE_v16s8, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs16 : MVE_VHADD<MVE_v8s16, addnsw, ARMvshrsImm>;
+defm MVE_VHADDs32 : MVE_VHADD<MVE_v4s32, addnsw, ARMvshrsImm>;
+defm MVE_VHADDu8 : MVE_VHADD<MVE_v16u8, addnuw, ARMvshruImm>;
+defm MVE_VHADDu16 : MVE_VHADD<MVE_v8u16, addnuw, ARMvshruImm>;
+defm MVE_VHADDu32 : MVE_VHADD<MVE_v4u32, addnuw, ARMvshruImm>;
multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
- SDNode unpred_op, Intrinsic pred_int> {
+ SDNode unpred_op, Intrinsic pred_int, PatFrag sub_op,
+ SDNode shift_op> {
def "" : MVE_VHSUB_<VTI.Suffix, VTI.Unsigned, VTI.Size>;
defvar Inst = !cast<Instruction>(NAME);
@@ -1975,6 +2183,10 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
(i32 VTI.Unsigned))),
(VTI.Vec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ def : Pat<(VTI.Vec (shift_op (sub_op (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)), (i32 1))),
+ (Inst MQPR:$Qm, MQPR:$Qn)>;
+
+
// Predicated subtract-and-divide-by-two
def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
(i32 VTI.Unsigned), (VTI.Pred VCCR:$mask),
@@ -1985,15 +2197,16 @@ multiclass MVE_VHSUB_m<MVEVectorVTInfo VTI,
}
}
-multiclass MVE_VHSUB<MVEVectorVTInfo VTI>
- : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated>;
+multiclass MVE_VHSUB<MVEVectorVTInfo VTI, PatFrag sub_op, SDNode shift_op>
+ : MVE_VHSUB_m<VTI, int_arm_mve_vhsub, int_arm_mve_hsub_predicated, sub_op,
+ shift_op>;
-defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8>;
-defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16>;
-defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32>;
-defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8>;
-defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16>;
-defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32>;
+defm MVE_VHSUBs8 : MVE_VHSUB<MVE_v16s8, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBs16 : MVE_VHSUB<MVE_v8s16, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBs32 : MVE_VHSUB<MVE_v4s32, subnsw, ARMvshrsImm>;
+defm MVE_VHSUBu8 : MVE_VHSUB<MVE_v16u8, subnuw, ARMvshruImm>;
+defm MVE_VHSUBu16 : MVE_VHSUB<MVE_v8u16, subnuw, ARMvshruImm>;
+defm MVE_VHSUBu32 : MVE_VHSUB<MVE_v4u32, subnuw, ARMvshruImm>;
class MVE_VDUP<string suffix, bit B, bit E, list<dag> pattern=[]>
: MVE_p<(outs MQPR:$Qd), (ins rGPR:$Rt), NoItinerary,
@@ -2028,24 +2241,37 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v4i32 (ARMvdup (i32 rGPR:$elem))),
(MVE_VDUP32 rGPR:$elem)>;
- def : Pat<(v4i32 (ARMvduplane (v4i32 MQPR:$src), imm:$lane)),
- (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
- // For the 16-bit and 8-bit vduplanes we don't care about the signedness
- // of the lane move operation as we only want the lowest 8/16 bits anyway.
- def : Pat<(v8i16 (ARMvduplane (v8i16 MQPR:$src), imm:$lane)),
- (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
- def : Pat<(v16i8 (ARMvduplane (v16i8 MQPR:$src), imm:$lane)),
- (MVE_VDUP8 (MVE_VMOV_from_lane_u8 MQPR:$src, imm:$lane))>;
-
- def : Pat<(v4f32 (ARMvdup (f32 SPR:$elem))),
- (v4f32 (MVE_VDUP32 (i32 (COPY_TO_REGCLASS (f32 SPR:$elem), rGPR))))>;
- def : Pat<(v8f16 (ARMvdup (f16 HPR:$elem))),
- (v8f16 (MVE_VDUP16 (i32 (COPY_TO_REGCLASS (f16 HPR:$elem), rGPR))))>;
+ def : Pat<(v8f16 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP16 rGPR:$elem)>;
+ def : Pat<(v4f32 (ARMvdup (i32 rGPR:$elem))),
+ (MVE_VDUP32 rGPR:$elem)>;
- def : Pat<(v4f32 (ARMvduplane (v4f32 MQPR:$src), imm:$lane)),
- (MVE_VDUP32 (MVE_VMOV_from_lane_32 MQPR:$src, imm:$lane))>;
- def : Pat<(v8f16 (ARMvduplane (v8f16 MQPR:$src), imm:$lane)),
- (MVE_VDUP16 (MVE_VMOV_from_lane_u16 MQPR:$src, imm:$lane))>;
+ // Match a vselect with an ARMvdup as a predicated MVE_VDUP
+ def : Pat<(v16i8 (vselect (v16i1 VCCR:$pred),
+ (v16i8 (ARMvdup (i32 rGPR:$elem))),
+ (v16i8 MQPR:$inactive))),
+ (MVE_VDUP8 rGPR:$elem, ARMVCCThen, (v16i1 VCCR:$pred),
+ (v16i8 MQPR:$inactive))>;
+ def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred),
+ (v8i16 (ARMvdup (i32 rGPR:$elem))),
+ (v8i16 MQPR:$inactive))),
+ (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+ (v8i16 MQPR:$inactive))>;
+ def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred),
+ (v4i32 (ARMvdup (i32 rGPR:$elem))),
+ (v4i32 MQPR:$inactive))),
+ (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+ (v4i32 MQPR:$inactive))>;
+ def : Pat<(v4f32 (vselect (v4i1 VCCR:$pred),
+ (v4f32 (ARMvdup (i32 rGPR:$elem))),
+ (v4f32 MQPR:$inactive))),
+ (MVE_VDUP32 rGPR:$elem, ARMVCCThen, (v4i1 VCCR:$pred),
+ (v4f32 MQPR:$inactive))>;
+ def : Pat<(v8f16 (vselect (v8i1 VCCR:$pred),
+ (v8f16 (ARMvdup (i32 rGPR:$elem))),
+ (v8f16 MQPR:$inactive))),
+ (MVE_VDUP16 rGPR:$elem, ARMVCCThen, (v8i1 VCCR:$pred),
+ (v8f16 MQPR:$inactive))>;
}
@@ -2079,32 +2305,43 @@ class MVE_VCLSCLZ<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-def MVE_VCLSs8 : MVE_VCLSCLZ<"vcls", "s8", 0b00, 0b0>;
-def MVE_VCLSs16 : MVE_VCLSCLZ<"vcls", "s16", 0b01, 0b0>;
-def MVE_VCLSs32 : MVE_VCLSCLZ<"vcls", "s32", 0b10, 0b0>;
+multiclass MVE_VCLSCLZ_p<string opname, bit opcode, MVEVectorVTInfo VTI,
+ SDNode unpred_op> {
+ def "": MVE_VCLSCLZ<"v"#opname, VTI.Suffix, VTI.Size, opcode>;
-def MVE_VCLZs8 : MVE_VCLSCLZ<"vclz", "i8", 0b00, 0b1>;
-def MVE_VCLZs16 : MVE_VCLSCLZ<"vclz", "i16", 0b01, 0b1>;
-def MVE_VCLZs32 : MVE_VCLSCLZ<"vclz", "i32", 0b10, 0b1>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_"#opname#"_predicated");
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 ( ctlz (v16i8 MQPR:$val1))),
- (v16i8 ( MVE_VCLZs8 (v16i8 MQPR:$val1)))>;
- def : Pat<(v4i32 ( ctlz (v4i32 MQPR:$val1))),
- (v4i32 ( MVE_VCLZs32 (v4i32 MQPR:$val1)))>;
- def : Pat<(v8i16 ( ctlz (v8i16 MQPR:$val1))),
- (v8i16 ( MVE_VCLZs16 (v8i16 MQPR:$val1)))>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
}
+defm MVE_VCLSs8 : MVE_VCLSCLZ_p<"cls", 0, MVE_v16s8, int_arm_mve_vcls>;
+defm MVE_VCLSs16 : MVE_VCLSCLZ_p<"cls", 0, MVE_v8s16, int_arm_mve_vcls>;
+defm MVE_VCLSs32 : MVE_VCLSCLZ_p<"cls", 0, MVE_v4s32, int_arm_mve_vcls>;
+
+defm MVE_VCLZs8 : MVE_VCLSCLZ_p<"clz", 1, MVE_v16i8, ctlz>;
+defm MVE_VCLZs16 : MVE_VCLSCLZ_p<"clz", 1, MVE_v8i16, ctlz>;
+defm MVE_VCLZs32 : MVE_VCLSCLZ_p<"clz", 1, MVE_v4i32, ctlz>;
+
class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
- list<dag> pattern=[]>
+ bit saturate, list<dag> pattern=[]>
: MVEIntSingleSrc<iname, suffix, size, pattern> {
let Inst{28} = 0b1;
let Inst{25-23} = 0b111;
let Inst{21-20} = 0b11;
- let Inst{17-16} = 0b01;
- let Inst{12-8} = 0b00011;
+ let Inst{17} = 0b0;
+ let Inst{16} = !eq(saturate, 0);
+ let Inst{12-11} = 0b00;
+ let Inst{10} = saturate;
+ let Inst{9-8} = 0b11;
let Inst{7} = negate;
let Inst{6} = 0b1;
let Inst{4} = 0b0;
@@ -2112,61 +2349,40 @@ class MVE_VABSNEG_int<string iname, string suffix, bits<2> size, bit negate,
let validForTailPredication = 1;
}
-def MVE_VABSs8 : MVE_VABSNEG_int<"vabs", "s8", 0b00, 0b0>;
-def MVE_VABSs16 : MVE_VABSNEG_int<"vabs", "s16", 0b01, 0b0>;
-def MVE_VABSs32 : MVE_VABSNEG_int<"vabs", "s32", 0b10, 0b0>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (abs (v16i8 MQPR:$v))),
- (v16i8 (MVE_VABSs8 $v))>;
- def : Pat<(v8i16 (abs (v8i16 MQPR:$v))),
- (v8i16 (MVE_VABSs16 $v))>;
- def : Pat<(v4i32 (abs (v4i32 MQPR:$v))),
- (v4i32 (MVE_VABSs32 $v))>;
-}
+multiclass MVE_VABSNEG_int_m<string iname, bit negate, bit saturate,
+ SDNode unpred_op, Intrinsic pred_int,
+ MVEVectorVTInfo VTI> {
+ def "" : MVE_VABSNEG_int<iname, VTI.Suffix, VTI.Size, negate, saturate>;
+ defvar Inst = !cast<Instruction>(NAME);
-def MVE_VNEGs8 : MVE_VABSNEG_int<"vneg", "s8", 0b00, 0b1>;
-def MVE_VNEGs16 : MVE_VABSNEG_int<"vneg", "s16", 0b01, 0b1>;
-def MVE_VNEGs32 : MVE_VABSNEG_int<"vneg", "s32", 0b10, 0b1>;
+ let Predicates = [HasMVEInt] in {
+ // VQABS and VQNEG have more difficult isel patterns defined elsewhere
+ if !eq(saturate, 0) then {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>;
+ }
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (vnegq (v16i8 MQPR:$v))),
- (v16i8 (MVE_VNEGs8 $v))>;
- def : Pat<(v8i16 (vnegq (v8i16 MQPR:$v))),
- (v8i16 (MVE_VNEGs16 $v))>;
- def : Pat<(v4i32 (vnegq (v4i32 MQPR:$v))),
- (v4i32 (MVE_VNEGs32 $v))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
+ }
}
-class MVE_VQABSNEG<string iname, string suffix, bits<2> size,
- bit negate, list<dag> pattern=[]>
- : MVEIntSingleSrc<iname, suffix, size, pattern> {
-
- let Inst{28} = 0b1;
- let Inst{25-23} = 0b111;
- let Inst{21-20} = 0b11;
- let Inst{17-16} = 0b00;
- let Inst{12-8} = 0b00111;
- let Inst{7} = negate;
- let Inst{6} = 0b1;
- let Inst{4} = 0b0;
- let Inst{0} = 0b0;
- let validForTailPredication = 1;
+foreach VTI = [ MVE_v16s8, MVE_v8s16, MVE_v4s32 ] in {
+ defm "MVE_VABS" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vabs", 0, 0, abs, int_arm_mve_abs_predicated, VTI>;
+ defm "MVE_VQABS" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vqabs", 0, 1, ?, int_arm_mve_qabs_predicated, VTI>;
+ defm "MVE_VNEG" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vneg", 1, 0, vnegq, int_arm_mve_neg_predicated, VTI>;
+ defm "MVE_VQNEG" # VTI.Suffix : MVE_VABSNEG_int_m<
+ "vqneg", 1, 1, ?, int_arm_mve_qneg_predicated, VTI>;
}
-def MVE_VQABSs8 : MVE_VQABSNEG<"vqabs", "s8", 0b00, 0b0>;
-def MVE_VQABSs16 : MVE_VQABSNEG<"vqabs", "s16", 0b01, 0b0>;
-def MVE_VQABSs32 : MVE_VQABSNEG<"vqabs", "s32", 0b10, 0b0>;
-
-def MVE_VQNEGs8 : MVE_VQABSNEG<"vqneg", "s8", 0b00, 0b1>;
-def MVE_VQNEGs16 : MVE_VQABSNEG<"vqneg", "s16", 0b01, 0b1>;
-def MVE_VQNEGs32 : MVE_VQABSNEG<"vqneg", "s32", 0b10, 0b1>;
-
// int_min/int_max: vector containing INT_MIN/INT_MAX VTI.Size times
// zero_vec: v4i32-initialized zero vector, potentially wrapped in a bitconvert
multiclass vqabsneg_pattern<MVEVectorVTInfo VTI, dag int_min, dag int_max,
- dag zero_vec, MVE_VQABSNEG vqabs_instruction,
- MVE_VQABSNEG vqneg_instruction> {
+ dag zero_vec, MVE_VABSNEG_int vqabs_instruction,
+ MVE_VABSNEG_int vqneg_instruction> {
let Predicates = [HasMVEInt] in {
// The below tree can be replaced by a vqabs instruction, as it represents
// the following vectorized expression (r being the value in $reg):
@@ -2257,6 +2473,8 @@ let Predicates = [HasMVEInt] in {
(v8i16 (MVE_VMOVimmi16 nImmSplatI16:$simm))>;
def : Pat<(v4i32 (ARMvmovImm timm:$simm)),
(v4i32 (MVE_VMOVimmi32 nImmVMOVI32:$simm))>;
+ def : Pat<(v2i64 (ARMvmovImm timm:$simm)),
+ (v2i64 (MVE_VMOVimmi64 nImmSplatI64:$simm))>;
def : Pat<(v8i16 (ARMvmvnImm timm:$simm)),
(v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm))>;
@@ -2265,6 +2483,15 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v4f32 (ARMvmovFPImm timm:$simm)),
(v4f32 (MVE_VMOVimmf32 nImmVMOVF32:$simm))>;
+
+ def : Pat<(v8i16 (vselect (v8i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
+ MQPR:$inactive)),
+ (v8i16 (MVE_VMVNimmi16 nImmSplatI16:$simm,
+ ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
+ def : Pat<(v4i32 (vselect (v4i1 VCCR:$pred), (ARMvmvnImm timm:$simm),
+ MQPR:$inactive)),
+ (v4i32 (MVE_VMVNimmi32 nImmSplatI32:$simm,
+ ARMVCCThen, VCCR:$pred, MQPR:$inactive))>;
}
class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
@@ -2291,13 +2518,37 @@ class MVE_VMINMAXA<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-def MVE_VMAXAs8 : MVE_VMINMAXA<"vmaxa", "s8", 0b00, 0b0>;
-def MVE_VMAXAs16 : MVE_VMINMAXA<"vmaxa", "s16", 0b01, 0b0>;
-def MVE_VMAXAs32 : MVE_VMINMAXA<"vmaxa", "s32", 0b10, 0b0>;
+multiclass MVE_VMINMAXA_m<string iname, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int, bit bit_12> {
+ def "" : MVE_VMINMAXA<iname, VTI.Suffix, VTI.Size, bit_12>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated v(min|max)a
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$Qd), (abs (VTI.Vec MQPR:$Qm)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>;
+
+ // Predicated v(min|max)a
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
+}
+
+multiclass MVE_VMINA<MVEVectorVTInfo VTI>
+ : MVE_VMINMAXA_m<"vmina", VTI, umin, int_arm_mve_vmina_predicated, 0b1>;
+
+defm MVE_VMINAs8 : MVE_VMINA<MVE_v16s8>;
+defm MVE_VMINAs16 : MVE_VMINA<MVE_v8s16>;
+defm MVE_VMINAs32 : MVE_VMINA<MVE_v4s32>;
-def MVE_VMINAs8 : MVE_VMINMAXA<"vmina", "s8", 0b00, 0b1>;
-def MVE_VMINAs16 : MVE_VMINMAXA<"vmina", "s16", 0b01, 0b1>;
-def MVE_VMINAs32 : MVE_VMINMAXA<"vmina", "s32", 0b10, 0b1>;
+multiclass MVE_VMAXA<MVEVectorVTInfo VTI>
+ : MVE_VMINMAXA_m<"vmaxa", VTI, umax, int_arm_mve_vmaxa_predicated, 0b0>;
+
+defm MVE_VMAXAs8 : MVE_VMAXA<MVE_v16s8>;
+defm MVE_VMAXAs16 : MVE_VMAXA<MVE_v8s16>;
+defm MVE_VMAXAs32 : MVE_VMAXA<MVE_v4s32>;
// end of MVE Integer instructions
@@ -2334,7 +2585,7 @@ class MVE_shift_imm<dag oops, dag iops, string iname, string suffix,
let Inst{3-1} = Qm{2-0};
}
-class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U,
+class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U, bit top,
list<dag> pattern=[]>
: MVE_shift_imm<(outs MQPR:$Qd), (ins MQPR:$Qm),
iname, suffix, "$Qd, $Qm", vpred_r, "",
@@ -2344,25 +2595,36 @@ class MVE_VMOVL<string iname, string suffix, bits<2> sz, bit U,
let Inst{21} = 0b1;
let Inst{20-19} = sz{1-0};
let Inst{18-16} = 0b000;
+ let Inst{12} = top;
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let doubleWidthResult = 1;
}
-multiclass MVE_VMOVL_shift_half<string iname, string suffix, bits<2> sz, bit U,
- list<dag> pattern=[]> {
- def bh : MVE_VMOVL<!strconcat(iname, "b"), suffix, sz, U, pattern> {
- let Inst{12} = 0b0;
- }
- def th : MVE_VMOVL<!strconcat(iname, "t"), suffix, sz, U, pattern> {
- let Inst{12} = 0b1;
- }
+multiclass MVE_VMOVL_m<bit top, string chr, MVEVectorVTInfo OutVTI,
+ MVEVectorVTInfo InVTI> {
+ def "": MVE_VMOVL<"vmovl" # chr, InVTI.Suffix, OutVTI.Size,
+ InVTI.Unsigned, top>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ def : Pat<(OutVTI.Vec (int_arm_mve_vmovl_predicated (InVTI.Vec MQPR:$src),
+ (i32 InVTI.Unsigned), (i32 top),
+ (OutVTI.Pred VCCR:$pred),
+ (OutVTI.Vec MQPR:$inactive))),
+ (OutVTI.Vec (Inst (InVTI.Vec MQPR:$src), ARMVCCThen,
+ (OutVTI.Pred VCCR:$pred),
+ (OutVTI.Vec MQPR:$inactive)))>;
}
-defm MVE_VMOVLs8 : MVE_VMOVL_shift_half<"vmovl", "s8", 0b01, 0b0>;
-defm MVE_VMOVLu8 : MVE_VMOVL_shift_half<"vmovl", "u8", 0b01, 0b1>;
-defm MVE_VMOVLs16 : MVE_VMOVL_shift_half<"vmovl", "s16", 0b10, 0b0>;
-defm MVE_VMOVLu16 : MVE_VMOVL_shift_half<"vmovl", "u16", 0b10, 0b1>;
+defm MVE_VMOVLs8bh : MVE_VMOVL_m<0, "b", MVE_v8s16, MVE_v16s8>;
+defm MVE_VMOVLs8th : MVE_VMOVL_m<1, "t", MVE_v8s16, MVE_v16s8>;
+defm MVE_VMOVLu8bh : MVE_VMOVL_m<0, "b", MVE_v8u16, MVE_v16u8>;
+defm MVE_VMOVLu8th : MVE_VMOVL_m<1, "t", MVE_v8u16, MVE_v16u8>;
+defm MVE_VMOVLs16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8s16>;
+defm MVE_VMOVLs16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8s16>;
+defm MVE_VMOVLu16bh : MVE_VMOVL_m<0, "b", MVE_v4s32, MVE_v8u16>;
+defm MVE_VMOVLu16th : MVE_VMOVL_m<1, "t", MVE_v4s32, MVE_v8u16>;
let Predicates = [HasMVEInt] in {
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i16),
@@ -2372,12 +2634,23 @@ let Predicates = [HasMVEInt] in {
def : Pat<(sext_inreg (v4i32 MQPR:$src), v4i8),
(MVE_VMOVLs16bh (MVE_VMOVLs8bh MQPR:$src))>;
+ def : Pat<(sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), v8i8),
+ (MVE_VMOVLs8th MQPR:$src)>;
+ def : Pat<(sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))), v4i16),
+ (MVE_VMOVLs16th MQPR:$src)>;
+
+ // zext_inreg 8 -> 16
+ def : Pat<(ARMvbicImm (v8i16 MQPR:$src), (i32 0xAFF)),
+ (MVE_VMOVLu8bh MQPR:$src)>;
// zext_inreg 16 -> 32
def : Pat<(and (v4i32 MQPR:$src), (v4i32 (ARMvmovImm (i32 0xCFF)))),
(MVE_VMOVLu16bh MQPR:$src)>;
- // zext_inreg 8 -> 16
- def : Pat<(and (v8i16 MQPR:$src), (v8i16 (ARMvmovImm (i32 0x8FF)))),
- (MVE_VMOVLu8bh MQPR:$src)>;
+ // Same zext_inreg with vrevs, picking the top half
+ def : Pat<(ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src)))), (i32 0xAFF)),
+ (MVE_VMOVLu8th MQPR:$src)>;
+ def : Pat<(and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (MVE_VMOVLu16th MQPR:$src)>;
}
@@ -2395,6 +2668,8 @@ class MVE_VSHLL_imm<string iname, string suffix, bit U, bit th,
// For the MVE_VSHLL_patterns multiclass to refer to
Operand immediateType = immtype;
+
+ let doubleWidthResult = 1;
}
// The immediate VSHLL instructions accept shift counts from 1 up to
@@ -2438,6 +2713,7 @@ class MVE_VSHLL_by_lane_width<string iname, string suffix, bits<2> size,
let Inst{11-6} = 0b111000;
let Inst{4} = 0b0;
let Inst{0} = 0b1;
+ let doubleWidthResult = 1;
}
multiclass MVE_VSHLL_lw<string iname, string suffix, bits<2> sz, bit U,
@@ -2472,17 +2748,17 @@ multiclass MVE_VSHLL_patterns<MVEVectorVTInfo VTI, int top> {
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), imm:$imm,
(i32 VTI.Unsigned), (i32 top),
- (VTI.Pred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_imm (VTI.Vec MQPR:$src), imm:$imm,
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
def : Pat<(VTI.DblVec (pred_int (VTI.Vec MQPR:$src), (i32 VTI.LaneBits),
(i32 VTI.Unsigned), (i32 top),
- (VTI.Pred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive))),
(VTI.DblVec (inst_lw (VTI.Vec MQPR:$src), ARMVCCThen,
- (VTI.Pred VCCR:$mask),
+ (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
}
@@ -2509,6 +2785,8 @@ class MVE_VxSHRN<string iname, string suffix, bit bit_12, bit bit_28,
let Inst{11-6} = 0b111111;
let Inst{4} = 0b0;
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
def MVE_VRSHRNi16bh : MVE_VxSHRN<"vrshrnb", "i16", 0b0, 0b1, shr_imm8> {
@@ -2550,6 +2828,8 @@ class MVE_VxQRSHRUN<string iname, string suffix, bit bit_28, bit bit_12,
let Inst{11-6} = 0b111111;
let Inst{4} = 0b0;
let Inst{0} = 0b0;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
def MVE_VQRSHRUNs16bh : MVE_VxQRSHRUN<
@@ -2598,6 +2878,8 @@ class MVE_VxQRSHRN<string iname, string suffix, bit bit_0, bit bit_12,
let Inst{11-6} = 0b111101;
let Inst{4} = 0b0;
let Inst{0} = bit_0;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
multiclass MVE_VxQRSHRN_types<string iname, bit bit_0, bit bit_12> {
@@ -3131,41 +3413,34 @@ class MVE_VRINT<string rmode, bits<3> op, string suffix, bits<2> size,
}
-multiclass MVE_VRINT_ops<string suffix, bits<2> size, list<dag> pattern=[]> {
- def N : MVE_VRINT<"n", 0b000, suffix, size, pattern>;
- def X : MVE_VRINT<"x", 0b001, suffix, size, pattern>;
- def A : MVE_VRINT<"a", 0b010, suffix, size, pattern>;
- def Z : MVE_VRINT<"z", 0b011, suffix, size, pattern>;
- def M : MVE_VRINT<"m", 0b101, suffix, size, pattern>;
- def P : MVE_VRINT<"p", 0b111, suffix, size, pattern>;
-}
+multiclass MVE_VRINT_m<MVEVectorVTInfo VTI, string suffix, bits<3> opcode,
+ SDNode unpred_op> {
+ def "": MVE_VRINT<suffix, opcode, VTI.Suffix, VTI.Size>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_vrint"#suffix#"_predicated");
-defm MVE_VRINTf16 : MVE_VRINT_ops<"f16", 0b01>;
-defm MVE_VRINTf32 : MVE_VRINT_ops<"f32", 0b10>;
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$val))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$val), (VTI.Pred VCCR:$pred),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$val), ARMVCCThen,
+ (VTI.Pred VCCR:$pred), (VTI.Vec MQPR:$inactive)))>;
+ }
+}
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4f32 (frint (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32X (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (frint (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16X (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (fround (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32A (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (fround (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16A (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (ftrunc (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32Z (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (ftrunc (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16Z (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (ffloor (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32M (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (ffloor (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16M (v8f16 MQPR:$val1)))>;
- def : Pat<(v4f32 (fceil (v4f32 MQPR:$val1))),
- (v4f32 (MVE_VRINTf32P (v4f32 MQPR:$val1)))>;
- def : Pat<(v8f16 (fceil (v8f16 MQPR:$val1))),
- (v8f16 (MVE_VRINTf16P (v8f16 MQPR:$val1)))>;
+multiclass MVE_VRINT_ops<MVEVectorVTInfo VTI> {
+ defm N : MVE_VRINT_m<VTI, "n", 0b000, int_arm_mve_vrintn>;
+ defm X : MVE_VRINT_m<VTI, "x", 0b001, frint>;
+ defm A : MVE_VRINT_m<VTI, "a", 0b010, fround>;
+ defm Z : MVE_VRINT_m<VTI, "z", 0b011, ftrunc>;
+ defm M : MVE_VRINT_m<VTI, "m", 0b101, ffloor>;
+ defm P : MVE_VRINT_m<VTI, "p", 0b111, fceil>;
}
+defm MVE_VRINTf16 : MVE_VRINT_ops<MVE_v8f16>;
+defm MVE_VRINTf32 : MVE_VRINT_ops<MVE_v4f32>;
+
class MVEFloatArithNeon<string iname, string suffix, bit size,
dag oops, dag iops, string ops,
vpred_ops vpred, string cstr, list<dag> pattern=[]>
@@ -3281,29 +3556,40 @@ class MVE_VADDSUBFMA_fp<string iname, string suffix, bit size, bit bit_4,
let Inst{8} = bit_8;
let Inst{7} = Qn{3};
let Inst{4} = bit_4;
+ let validForTailPredication = 1;
}
-def MVE_VFMAf32 : MVE_VADDSUBFMA_fp<"vfma", "f32", 0b0, 0b1, 0b0, 0b0,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VFMAf16 : MVE_VADDSUBFMA_fp<"vfma", "f16", 0b1, 0b1, 0b0, 0b0,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-
-def MVE_VFMSf32 : MVE_VADDSUBFMA_fp<"vfms", "f32", 0b0, 0b1, 0b0, 0b1,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
-def MVE_VFMSf16 : MVE_VADDSUBFMA_fp<"vfms", "f16", 0b1, 0b1, 0b0, 0b1,
- (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+multiclass MVE_VFMA_fp_multi<string iname, bit fms, MVEVectorVTInfo VTI> {
+ def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0b1, 0b0, fms,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = int_arm_mve_fma_predicated;
+ defvar m1 = (VTI.Vec MQPR:$m1);
+ defvar m2 = (VTI.Vec MQPR:$m2);
+ defvar add = (VTI.Vec MQPR:$add);
+ defvar pred = (VTI.Pred VCCR:$pred);
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v8f16 (fma (v8f16 MQPR:$src1), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
- (v8f16 (MVE_VFMAf16 $src3, $src1, $src2))>;
- def : Pat<(v4f32 (fma (v4f32 MQPR:$src1), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
- (v4f32 (MVE_VFMAf32 $src3, $src1, $src2))>;
- def : Pat<(v8f16 (fma (fneg (v8f16 MQPR:$src1)), (v8f16 MQPR:$src2), (v8f16 MQPR:$src3))),
- (v8f16 (MVE_VFMSf16 $src3, $src1, $src2))>;
- def : Pat<(v4f32 (fma (fneg (v4f32 MQPR:$src1)), (v4f32 MQPR:$src2), (v4f32 MQPR:$src3))),
- (v4f32 (MVE_VFMSf32 $src3, $src1, $src2))>;
+ let Predicates = [HasMVEFloat] in {
+ if fms then {
+ def : Pat<(VTI.Vec (fma (fneg m1), m2, add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (fma m1, (fneg m2), add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (pred_int (fneg m1), m2, add, pred)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ def : Pat<(VTI.Vec (pred_int m1, (fneg m2), add, pred)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ } else {
+ def : Pat<(VTI.Vec (fma m1, m2, add)), (Inst $add, $m1, $m2)>;
+ def : Pat<(VTI.Vec (pred_int m1, m2, add, pred)),
+ (Inst $add, $m1, $m2, ARMVCCThen, $pred)>;
+ }
+ }
}
+defm MVE_VFMAf32 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v4f32>;
+defm MVE_VFMAf16 : MVE_VFMA_fp_multi<"vfma", 0, MVE_v8f16>;
+defm MVE_VFMSf32 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v4f32>;
+defm MVE_VFMSf16 : MVE_VFMA_fp_multi<"vfms", 1, MVE_v8f16>;
+
multiclass MVE_VADDSUB_fp_m<string iname, bit bit_21, MVEVectorVTInfo VTI,
SDNode unpred_op, Intrinsic pred_int> {
def "" : MVE_VADDSUBFMA_fp<iname, VTI.Suffix, VTI.Size{0}, 0, 1, bit_21> {
@@ -3423,10 +3709,10 @@ defm MVE_VABDf32 : MVE_VABD_fp_m<MVE_v4f32>;
defm MVE_VABDf16 : MVE_VABD_fp_m<MVE_v8f16>;
class MVE_VCVT_fix<string suffix, bit fsi, bit U, bit op,
- Operand imm_operand_type, list<dag> pattern=[]>
+ Operand imm_operand_type>
: MVE_float<"vcvt", suffix,
(outs MQPR:$Qd), (ins MQPR:$Qm, imm_operand_type:$imm6),
- "$Qd, $Qm, $imm6", vpred_r, "", pattern> {
+ "$Qd, $Qm, $imm6", vpred_r, "", []> {
bits<4> Qd;
bits<6> imm6;
@@ -3468,14 +3754,43 @@ class MVE_VCVT_fix_f16<string suffix, bit U, bit op>
let Inst{20} = 0b1;
}
-def MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16<"f16.s16", 0b0, 0b0>;
-def MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16<"s16.f16", 0b0, 0b1>;
-def MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16<"f16.u16", 0b1, 0b0>;
-def MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16<"u16.f16", 0b1, 0b1>;
-def MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32<"f32.s32", 0b0, 0b0>;
-def MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32<"s32.f32", 0b0, 0b1>;
-def MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32<"f32.u32", 0b1, 0b0>;
-def MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32<"u32.f32", 0b1, 0b1>;
+multiclass MVE_VCVT_fix_patterns<Instruction Inst, bit U, MVEVectorVTInfo DestVTI,
+ MVEVectorVTInfo SrcVTI> {
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix
+ (i32 U), (SrcVTI.Vec MQPR:$Qm), imm:$scale)),
+ (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale))>;
+ def : Pat<(DestVTI.Vec (int_arm_mve_vcvt_fix_predicated (i32 U),
+ (DestVTI.Vec MQPR:$inactive),
+ (SrcVTI.Vec MQPR:$Qm),
+ imm:$scale,
+ (DestVTI.Pred VCCR:$mask))),
+ (DestVTI.Vec (Inst (SrcVTI.Vec MQPR:$Qm), imm:$scale,
+ ARMVCCThen, (DestVTI.Pred VCCR:$mask),
+ (DestVTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VCVT_fix_f32_m<bit U, bit op,
+ MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> {
+ def "" : MVE_VCVT_fix_f32<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>;
+ defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>;
+}
+
+multiclass MVE_VCVT_fix_f16_m<bit U, bit op,
+ MVEVectorVTInfo DestVTI, MVEVectorVTInfo SrcVTI> {
+ def "" : MVE_VCVT_fix_f16<DestVTI.Suffix#"."#SrcVTI.Suffix, U, op>;
+ defm : MVE_VCVT_fix_patterns<!cast<Instruction>(NAME), U, DestVTI, SrcVTI>;
+}
+
+defm MVE_VCVTf16s16_fix : MVE_VCVT_fix_f16_m<0b0, 0b0, MVE_v8f16, MVE_v8s16>;
+defm MVE_VCVTs16f16_fix : MVE_VCVT_fix_f16_m<0b0, 0b1, MVE_v8s16, MVE_v8f16>;
+defm MVE_VCVTf16u16_fix : MVE_VCVT_fix_f16_m<0b1, 0b0, MVE_v8f16, MVE_v8u16>;
+defm MVE_VCVTu16f16_fix : MVE_VCVT_fix_f16_m<0b1, 0b1, MVE_v8u16, MVE_v8f16>;
+defm MVE_VCVTf32s32_fix : MVE_VCVT_fix_f32_m<0b0, 0b0, MVE_v4f32, MVE_v4s32>;
+defm MVE_VCVTs32f32_fix : MVE_VCVT_fix_f32_m<0b0, 0b1, MVE_v4s32, MVE_v4f32>;
+defm MVE_VCVTf32u32_fix : MVE_VCVT_fix_f32_m<0b1, 0b0, MVE_v4f32, MVE_v4u32>;
+defm MVE_VCVTu32f32_fix : MVE_VCVT_fix_f32_m<0b1, 0b1, MVE_v4u32, MVE_v4f32>;
class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
bits<2> rm, list<dag> pattern=[]>
@@ -3497,23 +3812,44 @@ class MVE_VCVT_fp_int_anpm<string suffix, bits<2> size, bit op, string anpm,
let validForTailPredication = 1;
}
-multiclass MVE_VCVT_fp_int_anpm_multi<string suffix, bits<2> size, bit op,
- list<dag> pattern=[]> {
- def a : MVE_VCVT_fp_int_anpm<suffix, size, op, "a", 0b00>;
- def n : MVE_VCVT_fp_int_anpm<suffix, size, op, "n", 0b01>;
- def p : MVE_VCVT_fp_int_anpm<suffix, size, op, "p", 0b10>;
- def m : MVE_VCVT_fp_int_anpm<suffix, size, op, "m", 0b11>;
+multiclass MVE_VCVT_fp_int_anpm_inner<MVEVectorVTInfo Int, MVEVectorVTInfo Flt,
+ string anpm, bits<2> rm> {
+ def "": MVE_VCVT_fp_int_anpm<Int.Suffix # "." # Flt.Suffix, Int.Size,
+ Int.Unsigned, anpm, rm>;
+
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar IntrBaseName = "int_arm_mve_vcvt" # anpm;
+ defvar UnpredIntr = !cast<Intrinsic>(IntrBaseName);
+ defvar PredIntr = !cast<Intrinsic>(IntrBaseName # "_predicated");
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(Int.Vec (UnpredIntr (i32 Int.Unsigned), (Flt.Vec MQPR:$in))),
+ (Int.Vec (Inst (Flt.Vec MQPR:$in)))>;
+
+ def : Pat<(Int.Vec (PredIntr (i32 Int.Unsigned), (Int.Vec MQPR:$inactive),
+ (Flt.Vec MQPR:$in), (Flt.Pred VCCR:$pred))),
+ (Int.Vec (Inst (Flt.Vec MQPR:$in), ARMVCCThen,
+ (Flt.Pred VCCR:$pred), (Int.Vec MQPR:$inactive)))>;
+ }
+}
+
+multiclass MVE_VCVT_fp_int_anpm_outer<MVEVectorVTInfo Int,
+ MVEVectorVTInfo Flt> {
+ defm a : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "a", 0b00>;
+ defm n : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "n", 0b01>;
+ defm p : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "p", 0b10>;
+ defm m : MVE_VCVT_fp_int_anpm_inner<Int, Flt, "m", 0b11>;
}
// This defines instructions such as MVE_VCVTu16f16a, with an explicit
// rounding-mode suffix on the mnemonic. The class below will define
// the bare MVE_VCVTu16f16 (with implied rounding toward zero).
-defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_multi<"s16.f16", 0b01, 0b0>;
-defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_multi<"u16.f16", 0b01, 0b1>;
-defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_multi<"s32.f32", 0b10, 0b0>;
-defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_multi<"u32.f32", 0b10, 0b1>;
+defm MVE_VCVTs16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8s16, MVE_v8f16>;
+defm MVE_VCVTu16f16 : MVE_VCVT_fp_int_anpm_outer<MVE_v8u16, MVE_v8f16>;
+defm MVE_VCVTs32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4s32, MVE_v4f32>;
+defm MVE_VCVTu32f32 : MVE_VCVT_fp_int_anpm_outer<MVE_v4u32, MVE_v4f32>;
-class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
+class MVE_VCVT_fp_int<string suffix, bits<2> size, bit toint, bit unsigned,
list<dag> pattern=[]>
: MVE_float<"vcvt", suffix, (outs MQPR:$Qd),
(ins MQPR:$Qm), "$Qd, $Qm", vpred_r, "", pattern> {
@@ -3527,41 +3863,43 @@ class MVE_VCVT_fp_int<string suffix, bits<2> size, bits<2> op,
let Inst{17-16} = 0b11;
let Inst{15-13} = Qd{2-0};
let Inst{12-9} = 0b0011;
- let Inst{8-7} = op;
+ let Inst{8} = toint;
+ let Inst{7} = unsigned;
let Inst{4} = 0b0;
let validForTailPredication = 1;
}
+multiclass MVE_VCVT_fp_int_m<MVEVectorVTInfo Dest, MVEVectorVTInfo Src,
+ SDNode unpred_op> {
+ defvar Unsigned = !or(!eq(Dest.SuffixLetter,"u"), !eq(Src.SuffixLetter,"u"));
+ defvar ToInt = !eq(Src.SuffixLetter,"f");
+
+ def "" : MVE_VCVT_fp_int<Dest.Suffix # "." # Src.Suffix, Dest.Size,
+ ToInt, Unsigned>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(Dest.Vec (unpred_op (Src.Vec MQPR:$src))),
+ (Dest.Vec (Inst (Src.Vec MQPR:$src)))>;
+ def : Pat<(Dest.Vec (int_arm_mve_vcvt_fp_int_predicated
+ (Src.Vec MQPR:$src), (i32 Unsigned),
+ (Src.Pred VCCR:$mask), (Dest.Vec MQPR:$inactive))),
+ (Dest.Vec (Inst (Src.Vec MQPR:$src), ARMVCCThen,
+ (Src.Pred VCCR:$mask),
+ (Dest.Vec MQPR:$inactive)))>;
+ }
+}
// The unsuffixed VCVT for float->int implicitly rounds toward zero,
// which I reflect here in the llvm instruction names
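// As a C analogy (illustrative only, mirroring the naming convention here):
//   int   i = (int)3.7f;       // 3: truncates toward zero, like the 'z' forms below
//   float f = (float)16777217; // 16777216.0f: rounds to nearest (default IEEE
//                              // mode), like the 'n' int->float forms further down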
-def MVE_VCVTs16f16z : MVE_VCVT_fp_int<"s16.f16", 0b01, 0b10>;
-def MVE_VCVTu16f16z : MVE_VCVT_fp_int<"u16.f16", 0b01, 0b11>;
-def MVE_VCVTs32f32z : MVE_VCVT_fp_int<"s32.f32", 0b10, 0b10>;
-def MVE_VCVTu32f32z : MVE_VCVT_fp_int<"u32.f32", 0b10, 0b11>;
+defm MVE_VCVTs16f16z : MVE_VCVT_fp_int_m<MVE_v8s16, MVE_v8f16, fp_to_sint>;
+defm MVE_VCVTu16f16z : MVE_VCVT_fp_int_m<MVE_v8u16, MVE_v8f16, fp_to_uint>;
+defm MVE_VCVTs32f32z : MVE_VCVT_fp_int_m<MVE_v4s32, MVE_v4f32, fp_to_sint>;
+defm MVE_VCVTu32f32z : MVE_VCVT_fp_int_m<MVE_v4u32, MVE_v4f32, fp_to_uint>;
// Whereas VCVT for int->float rounds to nearest
-def MVE_VCVTf16s16n : MVE_VCVT_fp_int<"f16.s16", 0b01, 0b00>;
-def MVE_VCVTf16u16n : MVE_VCVT_fp_int<"f16.u16", 0b01, 0b01>;
-def MVE_VCVTf32s32n : MVE_VCVT_fp_int<"f32.s32", 0b10, 0b00>;
-def MVE_VCVTf32u32n : MVE_VCVT_fp_int<"f32.u32", 0b10, 0b01>;
-
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v4i32 (fp_to_sint (v4f32 MQPR:$src))),
- (v4i32 (MVE_VCVTs32f32z (v4f32 MQPR:$src)))>;
- def : Pat<(v4i32 (fp_to_uint (v4f32 MQPR:$src))),
- (v4i32 (MVE_VCVTu32f32z (v4f32 MQPR:$src)))>;
- def : Pat<(v8i16 (fp_to_sint (v8f16 MQPR:$src))),
- (v8i16 (MVE_VCVTs16f16z (v8f16 MQPR:$src)))>;
- def : Pat<(v8i16 (fp_to_uint (v8f16 MQPR:$src))),
- (v8i16 (MVE_VCVTu16f16z (v8f16 MQPR:$src)))>;
- def : Pat<(v4f32 (sint_to_fp (v4i32 MQPR:$src))),
- (v4f32 (MVE_VCVTf32s32n (v4i32 MQPR:$src)))>;
- def : Pat<(v4f32 (uint_to_fp (v4i32 MQPR:$src))),
- (v4f32 (MVE_VCVTf32u32n (v4i32 MQPR:$src)))>;
- def : Pat<(v8f16 (sint_to_fp (v8i16 MQPR:$src))),
- (v8f16 (MVE_VCVTf16s16n (v8i16 MQPR:$src)))>;
- def : Pat<(v8f16 (uint_to_fp (v8i16 MQPR:$src))),
- (v8f16 (MVE_VCVTf16u16n (v8i16 MQPR:$src)))>;
-}
+defm MVE_VCVTf16s16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8s16, sint_to_fp>;
+defm MVE_VCVTf16u16n : MVE_VCVT_fp_int_m<MVE_v8f16, MVE_v8u16, uint_to_fp>;
+defm MVE_VCVTf32s32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4s32, sint_to_fp>;
+defm MVE_VCVTf32u32n : MVE_VCVT_fp_int_m<MVE_v4f32, MVE_v4u32, uint_to_fp>;
class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
list<dag> pattern=[]>
@@ -3582,26 +3920,29 @@ class MVE_VABSNEG_fp<string iname, string suffix, bits<2> size, bit negate,
let validForTailPredication = 1;
}
-def MVE_VABSf16 : MVE_VABSNEG_fp<"vabs", "f16", 0b01, 0b0>;
-def MVE_VABSf32 : MVE_VABSNEG_fp<"vabs", "f32", 0b10, 0b0>;
-
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v8f16 (fabs MQPR:$src)),
- (MVE_VABSf16 MQPR:$src)>;
- def : Pat<(v4f32 (fabs MQPR:$src)),
- (MVE_VABSf32 MQPR:$src)>;
-}
+multiclass MVE_VABSNEG_fp_m<string iname, SDNode unpred_op, Intrinsic pred_int,
+ MVEVectorVTInfo VTI, bit opcode> {
+ def "" : MVE_VABSNEG_fp<iname, VTI.Suffix, VTI.Size, opcode>;
+ defvar Inst = !cast<Instruction>(NAME);
-def MVE_VNEGf16 : MVE_VABSNEG_fp<"vneg", "f16", 0b01, 0b1>;
-def MVE_VNEGf32 : MVE_VABSNEG_fp<"vneg", "f32", 0b10, 0b1>;
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (unpred_op (VTI.Vec MQPR:$v))), (VTI.Vec (Inst $v))>;
-let Predicates = [HasMVEFloat] in {
- def : Pat<(v8f16 (fneg MQPR:$src)),
- (MVE_VNEGf16 MQPR:$src)>;
- def : Pat<(v4f32 (fneg MQPR:$src)),
- (MVE_VNEGf32 MQPR:$src)>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v), (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive))),
+ (VTI.Vec (Inst $v, ARMVCCThen, $mask, $inactive))>;
+ }
}
+defm MVE_VABSf16 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated,
+ MVE_v8f16, 0>;
+defm MVE_VABSf32 : MVE_VABSNEG_fp_m<"vabs", fabs, int_arm_mve_abs_predicated,
+ MVE_v4f32, 0>;
+defm MVE_VNEGf16 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated,
+ MVE_v8f16, 1>;
+defm MVE_VNEGf32 : MVE_VABSNEG_fp_m<"vneg", fneg, int_arm_mve_neg_predicated,
+ MVE_v4f32, 1>;
+
class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
list<dag> pattern=[]>
: MVE_f<(outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
@@ -3623,11 +3964,37 @@ class MVE_VMAXMINNMA<string iname, string suffix, bit size, bit bit_12,
let Inst{0} = 0b1;
}
-def MVE_VMAXNMAf32 : MVE_VMAXMINNMA<"vmaxnma", "f32", 0b0, 0b0>;
-def MVE_VMAXNMAf16 : MVE_VMAXMINNMA<"vmaxnma", "f16", 0b1, 0b0>;
+multiclass MVE_VMAXMINNMA_m<string iname, MVEVectorVTInfo VTI,
+ SDNode unpred_op, Intrinsic pred_int,
+ bit bit_12> {
+ def "" : MVE_VMAXMINNMA<iname, VTI.Suffix, VTI.Size{0}, bit_12>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated v(max|min)nma
+ def : Pat<(VTI.Vec (unpred_op (fabs (VTI.Vec MQPR:$Qd)),
+ (fabs (VTI.Vec MQPR:$Qm)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm)))>;
+
+ // Predicated v(max|min)nma
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd), (VTI.Vec MQPR:$Qm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask)))>;
+ }
+}
+
+multiclass MVE_VMAXNMA<MVEVectorVTInfo VTI, bit bit_12>
+ : MVE_VMAXMINNMA_m<"vmaxnma", VTI, fmaxnum, int_arm_mve_vmaxnma_predicated, bit_12>;
+
+defm MVE_VMAXNMAf32 : MVE_VMAXNMA<MVE_v4f32, 0b0>;
+defm MVE_VMAXNMAf16 : MVE_VMAXNMA<MVE_v8f16, 0b0>;
-def MVE_VMINNMAf32 : MVE_VMAXMINNMA<"vminnma", "f32", 0b0, 0b1>;
-def MVE_VMINNMAf16 : MVE_VMAXMINNMA<"vminnma", "f16", 0b1, 0b1>;
+multiclass MVE_VMINNMA<MVEVectorVTInfo VTI, bit bit_12>
+ : MVE_VMAXMINNMA_m<"vminnma", VTI, fminnum, int_arm_mve_vminnma_predicated, bit_12>;
+
+defm MVE_VMINNMAf32 : MVE_VMINNMA<MVE_v4f32, 0b1>;
+defm MVE_VMINNMAf16 : MVE_VMINNMA<MVE_v8f16, 0b1>;
// end of MVE Floating Point instructions
@@ -3796,12 +4163,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def i32 : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc))>;
- def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc))>;
- def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc))>;
+ def i8r : Pat<(v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+ def i16r : Pat<(v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+ def i32r : Pat<(v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc)))),
(v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8") (v16i8 MQPR:$v1), (v16i8 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
@@ -3810,12 +4177,12 @@ multiclass unpred_vcmp_r<string suffix, PatLeaf fc> {
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc)))),
(v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32") (v4i32 MQPR:$v1), (v4i32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup GPR:$v2)), fc)))),
- (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup GPR:$v2)), fc)))),
- (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup GPR:$v2)), fc)))),
- (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 GPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v16i1 (and (v16i1 VCCR:$p1), (v16i1 (ARMvcmp (v16i8 MQPR:$v1), (v16i8 (ARMvdup rGPR:$v2)), fc)))),
+ (v16i1 (!cast<Instruction>("MVE_VCMP"#suffix#"8r") (v16i8 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8i16 MQPR:$v1), (v8i16 (ARMvdup rGPR:$v2)), fc)))),
+ (v8i1 (!cast<Instruction>("MVE_VCMP"#suffix#"16r") (v8i16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4i32 MQPR:$v1), (v4i32 (ARMvdup rGPR:$v2)), fc)))),
+ (v4i1 (!cast<Instruction>("MVE_VCMP"#suffix#"32r") (v4i32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmpf_z<PatLeaf fc> {
@@ -3825,31 +4192,31 @@ multiclass unpred_vcmpf_z<PatLeaf fc> {
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmpz (v8f16 MQPR:$v1), fc)))),
- (v8i1 (MVE_VCMPf32r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmpz (v4f32 MQPR:$v1), fc)))),
(v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), ZR, fc, ARMVCCThen, VCCR:$p1))>;
}
multiclass unpred_vcmpf_r<int fc> {
- def f16 : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
- (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
- def f32 : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
- (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
+ def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)),
+ (v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc))>;
+ def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)),
+ (v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc))>;
- def f16r : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc))>;
- def f32r : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc))>;
+ def : Pat<(v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc))>;
+ def : Pat<(v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc))>;
def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc)))),
(v8i1 (MVE_VCMPf16 (v8f16 MQPR:$v1), (v8f16 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc)))),
(v4i1 (MVE_VCMPf32 (v4f32 MQPR:$v1), (v4f32 MQPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup HPR:$v2)), fc)))),
- (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f16 HPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
- def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup SPR:$v2)), fc)))),
- (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 (COPY_TO_REGCLASS (f32 SPR:$v2), rGPR)), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v8i1 (and (v8i1 VCCR:$p1), (v8i1 (ARMvcmp (v8f16 MQPR:$v1), (v8f16 (ARMvdup rGPR:$v2)), fc)))),
+ (v8i1 (MVE_VCMPf16r (v8f16 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
+ def : Pat<(v4i1 (and (v4i1 VCCR:$p1), (v4i1 (ARMvcmp (v4f32 MQPR:$v1), (v4f32 (ARMvdup rGPR:$v2)), fc)))),
+ (v4i1 (MVE_VCMPf32r (v4f32 MQPR:$v1), (i32 rGPR:$v2), fc, ARMVCCThen, VCCR:$p1))>;
}
let Predicates = [HasMVEInt] in {
@@ -3889,7 +4256,7 @@ let Predicates = [HasMVEFloat] in {
}
-// Extra "worst case" and/or/xor partterns, going into and out of GRP
+// Extra "worst case" and/or/xor patterns, going into and out of GRP
multiclass two_predops<SDPatternOperator opnode, Instruction insn> {
def v16i1 : Pat<(v16i1 (opnode (v16i1 VCCR:$p1), (v16i1 VCCR:$p2))),
(v16i1 (COPY_TO_REGCLASS
@@ -3918,7 +4285,6 @@ let Predicates = [HasMVEInt] in {
// example when moving between rGPR and VPR.P0 as part of predicate vector
// shuffles. We also sometimes need to cast between different predicate
// vector types (v4i1<>v8i1, etc.) also as part of lowering vector shuffles.
-
def predicate_cast : SDNode<"ARMISD::PREDICATE_CAST", SDTUnaryOp>;
let Predicates = [HasMVEInt] in {
@@ -3932,6 +4298,16 @@ let Predicates = [HasMVEInt] in {
def : Pat<(VT (predicate_cast (VT2 VCCR:$src))),
(VT (COPY_TO_REGCLASS (VT2 VCCR:$src), VCCR))>;
}
+
+ // Here we match the specific SDNode type 'ARMVectorRegCastImpl'
+ // rather than the more general 'ARMVectorRegCast' which would also
+ // match some bitconverts. If we use the latter in cases where the
+ // input and output types are the same, the bitconvert gets elided
+ // and we end up generating a nonsense match of nothing.
+
+ foreach VT = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+ foreach VT2 = [ v16i8, v8i16, v8f16, v4i32, v4f32, v2i64, v2f64 ] in
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 MQPR:$src))), (VT MQPR:$src)>;
}
// end of MVE compares
@@ -3973,11 +4349,32 @@ class MVE_VQxDMLxDH<string iname, bit exch, bit round, bit subtract,
let Inst{0} = round;
}
+multiclass MVE_VQxDMLxDH_p<string iname, bit exch, bit round, bit subtract,
+ MVEVectorVTInfo VTI> {
+ def "": MVE_VQxDMLxDH<iname, exch, round, subtract, VTI.Suffix, VTI.Size,
+ !if(!eq(VTI.LaneBits, 32), ",@earlyclobber $Qd", "")>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar ConstParams = (? (i32 exch), (i32 round), (i32 subtract));
+ defvar unpred_intr = int_arm_mve_vqdmlad;
+ defvar pred_intr = int_arm_mve_vqdmlad_predicated;
+
+ def : Pat<(VTI.Vec !con((unpred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c)), ConstParams)),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c)))>;
+ def : Pat<(VTI.Vec !con((pred_intr (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c)), ConstParams,
+ (? (VTI.Pred VCCR:$pred)))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$a), (VTI.Vec MQPR:$b),
+ (VTI.Vec MQPR:$c),
+ ARMVCCThen, (VTI.Pred VCCR:$pred)))>;
+}
+
multiclass MVE_VQxDMLxDH_multi<string iname, bit exch,
bit round, bit subtract> {
- def s8 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s8", 0b00>;
- def s16 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s16", 0b01>;
- def s32 : MVE_VQxDMLxDH<iname, exch, round, subtract, "s32", 0b10, ",@earlyclobber $Qd">;
+ defm s8 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v16s8>;
+ defm s16 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v8s16>;
+ defm s32 : MVE_VQxDMLxDH_p<iname, exch, round, subtract, MVE_v4s32>;
}
defm MVE_VQDMLADH : MVE_VQxDMLxDH_multi<"vqdmladh", 0b0, 0b0, 0b0>;
@@ -4051,6 +4448,7 @@ class MVE_VMULL<string iname, string suffix, bit bit_28, bits<2> bits_21_20,
let Inst{7} = Qn{3};
let Inst{0} = 0b0;
let validForTailPredication = 1;
+ let doubleWidthResult = 1;
}
multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
@@ -4072,10 +4470,10 @@ multiclass MVE_VMULL_m<MVEVectorVTInfo VTI,
// Predicated multiply
def : Pat<(VTI.DblVec !con((pred_int (VTI.Vec MQPR:$Qm),
(VTI.Vec MQPR:$Qn)),
- uflag, (? (i32 Top), (VTI.Pred VCCR:$mask),
+ uflag, (? (i32 Top), (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))),
(VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
- ARMVCCThen, (VTI.Pred VCCR:$mask),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
(VTI.DblVec MQPR:$inactive)))>;
}
}
@@ -4122,6 +4520,50 @@ defm MVE_VMULLBp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
defm MVE_VMULLTp16 : MVE_VMULL_m<MVE_v8p16, int_arm_mve_vmull_poly,
int_arm_mve_mull_poly_predicated, 0b1>;
+let Predicates = [HasMVEInt] in {
+ def : Pat<(v2i64 (ARMvmulls (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+ (MVE_VMULLBs32 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(v2i64 (ARMvmulls (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+ (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+ (MVE_VMULLTs32 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (sext_inreg (v4i32 MQPR:$src1), v4i16),
+ (sext_inreg (v4i32 MQPR:$src2), v4i16)),
+ (MVE_VMULLBs16 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))), v4i16),
+ (sext_inreg (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))), v4i16)),
+ (MVE_VMULLTs16 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (sext_inreg (v8i16 MQPR:$src1), v8i8),
+ (sext_inreg (v8i16 MQPR:$src2), v8i8)),
+ (MVE_VMULLBs8 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), v8i8),
+ (sext_inreg (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), v8i8)),
+ (MVE_VMULLTs8 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(v2i64 (ARMvmullu (v4i32 MQPR:$src1), (v4i32 MQPR:$src2))),
+ (MVE_VMULLBu32 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(v2i64 (ARMvmullu (v4i32 (ARMvrev64 (v4i32 MQPR:$src1))),
+ (v4i32 (ARMvrev64 (v4i32 MQPR:$src2))))),
+ (MVE_VMULLTu32 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (and (v4i32 MQPR:$src1), (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (and (v4i32 MQPR:$src2), (v4i32 (ARMvmovImm (i32 0xCFF))))),
+ (MVE_VMULLBu16 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src1)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF)))),
+ (and (v4i32 (ARMVectorRegCast (ARMvrev32 (v8i16 MQPR:$src2)))),
+ (v4i32 (ARMvmovImm (i32 0xCFF))))),
+ (MVE_VMULLTu16 MQPR:$src1, MQPR:$src2)>;
+
+ def : Pat<(mul (ARMvbicImm (v8i16 MQPR:$src1), (i32 0xAFF)),
+ (ARMvbicImm (v8i16 MQPR:$src2), (i32 0xAFF))),
+ (MVE_VMULLBu8 MQPR:$src1, MQPR:$src2)>;
+ def : Pat<(mul (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src1)))), (i32 0xAFF)),
+ (ARMvbicImm (v8i16 (ARMVectorRegCast (ARMvrev16 (v16i8 MQPR:$src2)))), (i32 0xAFF))),
+ (MVE_VMULLTu8 MQPR:$src1, MQPR:$src2)>;
+}
+
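The block above lets a plain mul whose operands are sign- or zero-extended even (or vrev'd odd) lanes select VMULLB/VMULLT directly. As a point of reference only (not generated code), here is a scalar C sketch of what the bottom signed s16 form computes; the helper name is illustrative, not part of any API:

#include <stdint.h>

/* Reference semantics of MVE_VMULLBs16: the even 16-bit lanes of each
   source are sign-extended and multiplied into full-width 32-bit result
   lanes.  The "top" (T) form does the same with the odd lanes. */
static void vmullb_s16_ref(const int16_t a[8], const int16_t b[8],
                           int32_t out[4]) {
  for (int i = 0; i < 4; i++)
    out[i] = (int32_t)a[2 * i] * (int32_t)b[2 * i];
}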
class MVE_VxMULH<string iname, string suffix, bit U, bits<2> size, bit round,
list<dag> pattern=[]>
: MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
@@ -4195,6 +4637,8 @@ class MVE_VxMOVxN<string iname, string suffix, bit bit_28, bit bit_17,
let Inst{8} = 0b0;
let Inst{7} = !if(!eq(bit_17, 0), 1, 0);
let Inst{0} = 0b1;
+ let validForTailPredication = 1;
+ let retainsPreviousHalfElement = 1;
}
multiclass MVE_VxMOVxN_halves<string iname, string suffix,
@@ -4213,21 +4657,121 @@ defm MVE_VQMOVUNs16 : MVE_VxMOVxN_halves<"vqmovun", "s16", 0b0, 0b0, 0b00>;
defm MVE_VQMOVUNs32 : MVE_VxMOVxN_halves<"vqmovun", "s32", 0b0, 0b0, 0b01>;
def MVEvmovn : SDNode<"ARMISD::VMOVN", SDTARMVEXT>;
+
+multiclass MVE_VMOVN_p<Instruction Inst, bit top,
+ MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> {
+ // Match the most obvious MVEvmovn(a,b,t), which overwrites the odd or even
+ // lanes of a (depending on t) with the even lanes of b.
+ def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qd_src),
+ (VTI.Vec MQPR:$Qm), (i32 top))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>;
+
+ if !eq(top, 0) then {
+ // If we see MVEvmovn(a,ARMvrev(b),1), that wants to overwrite the odd
+ // lanes of a with the odd lanes of b. In other words, the lanes we're
+ // _keeping_ from a are the even ones. So we can flip it round and say that
+ // this is the same as overwriting the even lanes of b with the even lanes
+ // of a, i.e. it's a VMOVNB with the operands reversed.
+ defvar vrev = !cast<SDNode>("ARMvrev" # InVTI.LaneBits);
+ def : Pat<(VTI.Vec (MVEvmovn (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (vrev MQPR:$Qd_src)), (i32 1))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src), (VTI.Vec MQPR:$Qm)))>;
+ }
+
+ // Match the IR intrinsic for a predicated VMOVN. This regards the Qm input
+ // as having wider lanes that we're narrowing, instead of already-narrow
+ // lanes that we're taking every other one of.
+ def : Pat<(VTI.Vec (int_arm_mve_vmovn_predicated (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm), (i32 top),
+ (InVTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ ARMVCCThen, (InVTI.Pred VCCR:$pred)))>;
+}
+
+defm : MVE_VMOVN_p<MVE_VMOVNi32bh, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VMOVN_p<MVE_VMOVNi32th, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VMOVN_p<MVE_VMOVNi16bh, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VMOVN_p<MVE_VMOVNi16th, 1, MVE_v16i8, MVE_v8i16>;
+
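The comments in MVE_VMOVN_p above describe VMOVNB/VMOVNT in terms of even/odd lane overwrites. A scalar C reference of that reading, for the usual .i32 -> .i16 narrowing (illustrative helpers only):

#include <stdint.h>

/* VMOVNB.I32: write the low half of each 32-bit lane of b into the even
   16-bit lanes of a, leaving the odd lanes unchanged.
   VMOVNT.I32: same, but write the odd lanes instead. */
static void vmovnb_i32_ref(int16_t a[8], const int32_t b[4]) {
  for (int i = 0; i < 4; i++)
    a[2 * i] = (int16_t)b[i];
}
static void vmovnt_i32_ref(int16_t a[8], const int32_t b[4]) {
  for (int i = 0; i < 4; i++)
    a[2 * i + 1] = (int16_t)b[i];
}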
+multiclass MVE_VQMOVN_p<Instruction Inst, bit outU, bit inU, bit top,
+ MVEVectorVTInfo VTI, MVEVectorVTInfo InVTI> {
+ def : Pat<(VTI.Vec (int_arm_mve_vqmovn (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ (i32 outU), (i32 inU), (i32 top))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm)))>;
+
+ def : Pat<(VTI.Vec (int_arm_mve_vqmovn_predicated (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ (i32 outU), (i32 inU), (i32 top),
+ (InVTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qd_src),
+ (InVTI.Vec MQPR:$Qm),
+ ARMVCCThen, (InVTI.Pred VCCR:$pred)))>;
+}
+
+defm : MVE_VQMOVN_p<MVE_VQMOVNs32bh, 0, 0, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNs32th, 0, 0, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNs16bh, 0, 0, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNs16th, 0, 0, 1, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu32bh, 1, 1, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu32th, 1, 1, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu16bh, 1, 1, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVNu16th, 1, 1, 1, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs32bh, 1, 0, 0, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs32th, 1, 0, 1, MVE_v8i16, MVE_v4i32>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs16bh, 1, 0, 0, MVE_v16i8, MVE_v8i16>;
+defm : MVE_VQMOVN_p<MVE_VQMOVUNs16th, 1, 0, 1, MVE_v16i8, MVE_v8i16>;
+
+def SDTARMVMOVNQ : SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
+ SDTCisVec<2>, SDTCisVT<3, i32>]>;
+def MVEvqmovns : SDNode<"ARMISD::VQMOVNs", SDTARMVMOVNQ>;
+def MVEvqmovnu : SDNode<"ARMISD::VQMOVNu", SDTARMVMOVNQ>;
+
let Predicates = [HasMVEInt] in {
- def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
- (v8i16 (MVE_VMOVNi32bh (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
- def : Pat<(v8i16 (MVEvmovn (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
- (v8i16 (MVE_VMOVNi32th (v8i16 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
- def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 0))),
- (v16i8 (MVE_VMOVNi16bh (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
- def : Pat<(v16i8 (MVEvmovn (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm), (i32 1))),
- (v16i8 (MVE_VMOVNi16th (v16i8 MQPR:$Qd_src), (v16i8 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
+ (v8i16 (MVE_VQMOVNs32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
+ (v8i16 (MVE_VQMOVNs32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+ (v16i8 (MVE_VQMOVNs16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+ (v16i8 (MVE_VQMOVNs16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 0))),
+ (v8i16 (MVE_VQMOVNu32bh (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), (i32 1))),
+ (v8i16 (MVE_VQMOVNu32th (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 0))),
+ (v16i8 (MVE_VQMOVNu16bh (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), (i32 1))),
+ (v16i8 (MVE_VQMOVNu16th (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm)))>;
+
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))),
+ (v8i16 (MVE_VQSHRNbhs32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))),
+ (v16i8 (MVE_VQSHRNbhs16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
+ def : Pat<(v8i16 (MVEvqmovns (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshrsImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))),
+ (v8i16 (MVE_VQSHRNths32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovns (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshrsImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))),
+ (v16i8 (MVE_VQSHRNths16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
+
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 0))),
+ (v8i16 (MVE_VQSHRNbhu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 0))),
+ (v16i8 (MVE_VQSHRNbhu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
+ def : Pat<(v8i16 (MVEvqmovnu (v8i16 MQPR:$Qd_src), (v4i32 (ARMvshruImm (v4i32 MQPR:$Qm), imm0_31:$imm)), (i32 1))),
+ (v8i16 (MVE_VQSHRNthu32 (v8i16 MQPR:$Qd_src), (v4i32 MQPR:$Qm), imm0_31:$imm))>;
+ def : Pat<(v16i8 (MVEvqmovnu (v16i8 MQPR:$Qd_src), (v8i16 (ARMvshruImm (v8i16 MQPR:$Qm), imm0_15:$imm)), (i32 1))),
+ (v16i8 (MVE_VQSHRNthu16 (v16i8 MQPR:$Qd_src), (v8i16 MQPR:$Qm), imm0_15:$imm))>;
}
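The last group of patterns above folds an immediate right shift feeding a saturating narrow into a single VQSHRN. A scalar C reference of the combined signed bottom form (illustration only; the helper names are not from any library):

#include <stdint.h>

static int16_t ssat16(int32_t x) {
  if (x > INT16_MAX) return INT16_MAX;
  if (x < INT16_MIN) return INT16_MIN;
  return (int16_t)x;
}

/* VQSHRNB.S32 #n: shift each 32-bit lane of b right by n, saturate to
   16 bits, and store into the even 16-bit lanes of a (odd lanes kept). */
static void vqshrnb_s32_ref(int16_t a[8], const int32_t b[4], int n) {
  for (int i = 0; i < 4; i++)
    a[2 * i] = ssat16(b[i] >> n);
}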
class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
- list<dag> pattern=[]>
- : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd), (ins MQPR:$Qd_src, MQPR:$Qm),
- "$Qd, $Qm", vpred_n, "$Qd = $Qd_src", pattern> {
+ dag iops_extra, vpred_ops vpred, string cstr>
+ : MVE_qDest_qSrc<iname, suffix, (outs MQPR:$Qd),
+ !con(iops_extra, (ins MQPR:$Qm)), "$Qd, $Qm",
+ vpred, cstr, []> {
let Inst{28} = op;
let Inst{21-16} = 0b111111;
let Inst{12} = T;
@@ -4235,10 +4779,17 @@ class MVE_VCVT_ff<string iname, string suffix, bit op, bit T,
let Inst{0} = 0b1;
let Predicates = [HasMVEFloat];
+ let retainsPreviousHalfElement = 1;
}
+def SDTARMVCVTL : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTCisVT<2, i32>]>;
+def MVEvcvtn : SDNode<"ARMISD::VCVTN", SDTARMVMOVNQ>;
+def MVEvcvtl : SDNode<"ARMISD::VCVTL", SDTARMVCVTL>;
+
multiclass MVE_VCVT_f2h_m<string iname, int half> {
- def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half>;
+ def "": MVE_VCVT_ff<iname, "f16.f32", 0b0, half,
+ (ins MQPR:$Qd_src), vpred_n, "$Qd = $Qd_src">;
defvar Inst = !cast<Instruction>(NAME);
let Predicates = [HasMVEFloat] in {
@@ -4250,11 +4801,28 @@ multiclass MVE_VCVT_f2h_m<string iname, int half> {
(v4i1 VCCR:$mask))),
(v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm),
ARMVCCThen, (v4i1 VCCR:$mask)))>;
+
+ def : Pat<(v8f16 (MVEvcvtn (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm), (i32 half))),
+ (v8f16 (Inst (v8f16 MQPR:$Qd_src), (v4f32 MQPR:$Qm)))>;
}
}
multiclass MVE_VCVT_h2f_m<string iname, int half> {
- def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half>;
+ def "": MVE_VCVT_ff<iname, "f32.f16", 0b1, half, (ins), vpred_r, "">;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEFloat] in {
+ def : Pat<(v4f32 (int_arm_mve_vcvt_widen (v8f16 MQPR:$Qm), (i32 half))),
+ (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
+ def : Pat<(v4f32 (int_arm_mve_vcvt_widen_predicated
+ (v4f32 MQPR:$inactive), (v8f16 MQPR:$Qm), (i32 half),
+ (v4i1 VCCR:$mask))),
+ (v4f32 (Inst (v8f16 MQPR:$Qm), ARMVCCThen,
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive)))>;
+
+ def : Pat<(v4f32 (MVEvcvtl (v8f16 MQPR:$Qm), (i32 half))),
+ (v4f32 (Inst (v8f16 MQPR:$Qm)))>;
+ }
}
defm MVE_VCVTf16f32bh : MVE_VCVT_f2h_m<"vcvtb", 0b0>;
@@ -4353,15 +4921,37 @@ class MVE_VQDMULL<string iname, string suffix, bit size, bit T,
let Inst{7} = Qn{3};
let Inst{0} = 0b1;
let validForTailPredication = 1;
+ let doubleWidthResult = 1;
+}
+
+multiclass MVE_VQDMULL_m<string iname, MVEVectorVTInfo VTI, bit size, bit T,
+ string cstr> {
+ def "" : MVE_VQDMULL<iname, VTI.Suffix, size, T, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm),
+ (VTI.Vec MQPR:$Qn), (i32 T))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn)))>;
+ // Predicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated
+ (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ (i32 T), (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (VTI.Vec MQPR:$Qn),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+ }
}
-multiclass MVE_VQDMULL_halves<string suffix, bit size, string cstr=""> {
- def bh : MVE_VQDMULL<"vqdmullb", suffix, size, 0b0, cstr>;
- def th : MVE_VQDMULL<"vqdmullt", suffix, size, 0b1, cstr>;
+multiclass MVE_VQDMULL_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+ defm bh : MVE_VQDMULL_m<"vqdmullb", VTI, size, 0b0, cstr>;
+ defm th : MVE_VQDMULL_m<"vqdmullt", VTI, size, 0b1, cstr>;
}
-defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<"s16", 0b0>;
-defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<"s32", 0b1, "@earlyclobber $Qd">;
+defm MVE_VQDMULLs16 : MVE_VQDMULL_halves<MVE_v8s16, 0b0>;
+defm MVE_VQDMULLs32 : MVE_VQDMULL_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
// end of mve_qDest_qSrc
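For reference, the lane operation that MVE_VQDMULLs16 (and the _qr variant further down) implements, sketched as scalar C. Only INT16_MIN * INT16_MIN can overflow upwards, so that is the only case needing saturation; the helper name is illustrative only:

#include <stdint.h>

/* vqdmullb.s16: multiply even 16-bit lanes, double the product, and
   saturate into 32-bit result lanes ("t" forms use the odd lanes). */
static int32_t sqdmull16(int16_t a, int16_t b) {
  int64_t p = 2 * (int64_t)a * (int64_t)b;
  return p > INT32_MAX ? INT32_MAX : (int32_t)p;
}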
@@ -4407,10 +4997,61 @@ class MVE_qDest_single_rSrc<string iname, string suffix, list<dag> pattern=[]>
let Inst{3-0} = Rm{3-0};
}
+// Patterns for vector-scalar instructions with integer operands
+multiclass MVE_vec_scalar_int_pat_m<Instruction inst, MVEVectorVTInfo VTI,
+ SDNode unpred_op, SDNode pred_op,
+ bit unpred_has_sign = 0,
+ bit pred_has_sign = 0> {
+ defvar UnpredSign = !if(unpred_has_sign, (? (i32 VTI.Unsigned)), (?));
+ defvar PredSign = !if(pred_has_sign, (? (i32 VTI.Unsigned)), (?));
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated version
+ def : Pat<(VTI.Vec !con((unpred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ UnpredSign)),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated version
+ def : Pat<(VTI.Vec !con((pred_op (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val))),
+ PredSign,
+ (pred_op (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))),
+ (VTI.Vec (inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+ }
+}
+
+// Patterns for vector-scalar instructions with FP operands
+multiclass MVE_vec_scalar_fp_pat_m<SDNode unpred_op, Intrinsic pred_int,
+ Instruction instr_f16,
+ Instruction instr_f32> {
+ let Predicates = [HasMVEFloat] in {
+ // Unpredicated F16
+ def : Pat<(v8f16 (unpred_op (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Unpredicated F32
+ def : Pat<(v4f32 (unpred_op (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated F16
+ def : Pat<(v8f16 (pred_int (v8f16 MQPR:$Qm), (v8f16 (ARMvdup rGPR:$val)),
+ (v8i1 VCCR:$mask), (v8f16 MQPR:$inactive))),
+ (v8f16 (instr_f16 (v8f16 MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (v8i1 VCCR:$mask),
+ (v8f16 MQPR:$inactive)))>;
+ // Predicated F32
+ def : Pat<(v4f32 (pred_int (v4f32 MQPR:$Qm), (v4f32 (ARMvdup rGPR:$val)),
+ (v4i1 VCCR:$mask), (v4f32 MQPR:$inactive))),
+ (v4f32 (instr_f32 (v4f32 MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (v4i1 VCCR:$mask),
+ (v4f32 MQPR:$inactive)))>;
+ }
+}
+
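These two helper multiclasses are what let a plain "vector op splat(scalar)" DAG select the register-scalar (_qr) encodings without an explicit VDUP. A minimal C sketch of source that produces that shape, assuming <arm_mve.h> and an MVE-enabled compile (which instruction is actually chosen is up to the compiler):

#include <arm_mve.h>

int32x4_t add_scalar(int32x4_t v, int32_t s) {
  /* vector + splat(scalar): the ARMvdup-based patterns above can select
     this as VADD.i32 Qd, Qn, Rm rather than VDUP followed by VADD. */
  return vaddq_s32(v, vdupq_n_s32(s));
}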
class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+ bit bit_5, bit bit_12, bit bit_16, bit bit_28>
+ : MVE_qDest_rSrc<iname, suffix, ""> {
let Inst{28} = bit_28;
let Inst{21-20} = size;
@@ -4421,42 +5062,60 @@ class MVE_VADDSUB_qr<string iname, string suffix, bits<2> size,
let validForTailPredication = 1;
}
-multiclass MVE_VADDSUB_qr_sizes<string iname, string suffix,
- bit bit_5, bit bit_12, bit bit_16,
- bit bit_28, list<dag> pattern=[]> {
- def "8" : MVE_VADDSUB_qr<iname, suffix#"8", 0b00,
- bit_5, bit_12, bit_16, bit_28>;
- def "16" : MVE_VADDSUB_qr<iname, suffix#"16", 0b01,
- bit_5, bit_12, bit_16, bit_28>;
- def "32" : MVE_VADDSUB_qr<iname, suffix#"32", 0b10,
- bit_5, bit_12, bit_16, bit_28>;
-}
-
-defm MVE_VADD_qr_i : MVE_VADDSUB_qr_sizes<"vadd", "i", 0b0, 0b0, 0b1, 0b0>;
-defm MVE_VQADD_qr_s : MVE_VADDSUB_qr_sizes<"vqadd", "s", 0b1, 0b0, 0b0, 0b0>;
-defm MVE_VQADD_qr_u : MVE_VADDSUB_qr_sizes<"vqadd", "u", 0b1, 0b0, 0b0, 0b1>;
-
-defm MVE_VSUB_qr_i : MVE_VADDSUB_qr_sizes<"vsub", "i", 0b0, 0b1, 0b1, 0b0>;
-defm MVE_VQSUB_qr_s : MVE_VADDSUB_qr_sizes<"vqsub", "s", 0b1, 0b1, 0b0, 0b0>;
-defm MVE_VQSUB_qr_u : MVE_VADDSUB_qr_sizes<"vqsub", "u", 0b1, 0b1, 0b0, 0b1>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (add (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VADD_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VADD_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (add (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VADD_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (sub (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VSUB_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (sub (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VSUB_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (sub (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VSUB_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
-}
+// Vector-scalar add/sub
+multiclass MVE_VADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op, Intrinsic pred_int> {
+ def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b0, subtract, 0b1, 0b0>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ unpred_op, pred_int>;
+}
+
+multiclass MVE_VADD_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_qr_m<"vadd", VTI, 0b0, add, int_arm_mve_add_predicated>;
+
+multiclass MVE_VSUB_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VADDSUB_qr_m<"vsub", VTI, 0b1, sub, int_arm_mve_sub_predicated>;
+
+defm MVE_VADD_qr_i8 : MVE_VADD_qr_m<MVE_v16i8>;
+defm MVE_VADD_qr_i16 : MVE_VADD_qr_m<MVE_v8i16>;
+defm MVE_VADD_qr_i32 : MVE_VADD_qr_m<MVE_v4i32>;
+
+defm MVE_VSUB_qr_i8 : MVE_VSUB_qr_m<MVE_v16i8>;
+defm MVE_VSUB_qr_i16 : MVE_VSUB_qr_m<MVE_v8i16>;
+defm MVE_VSUB_qr_i32 : MVE_VSUB_qr_m<MVE_v4i32>;
+
+// Vector-scalar saturating add/sub
+multiclass MVE_VQADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ SDNode unpred_op_s, SDNode unpred_op_u,
+ Intrinsic pred_int> {
+ def "" : MVE_VADDSUB_qr<iname, VTI.Suffix, VTI.Size, 0b1, subtract,
+ 0b0, VTI.Unsigned>;
+ defvar unpred_op = !if(VTI.Unsigned, unpred_op_u, unpred_op_s);
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ unpred_op, pred_int, 0, 1>;
+}
+
+multiclass MVE_VQADD_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VQADDSUB_qr_m<"vqadd", VTI, 0b0, saddsat, uaddsat,
+ int_arm_mve_qadd_predicated>;
+
+multiclass MVE_VQSUB_qr_m<MVEVectorVTInfo VTI>
+ : MVE_VQADDSUB_qr_m<"vqsub", VTI, 0b1, ssubsat, usubsat,
+ int_arm_mve_qsub_predicated>;
+
+defm MVE_VQADD_qr_s8 : MVE_VQADD_qr_m<MVE_v16s8>;
+defm MVE_VQADD_qr_s16 : MVE_VQADD_qr_m<MVE_v8s16>;
+defm MVE_VQADD_qr_s32 : MVE_VQADD_qr_m<MVE_v4s32>;
+defm MVE_VQADD_qr_u8 : MVE_VQADD_qr_m<MVE_v16u8>;
+defm MVE_VQADD_qr_u16 : MVE_VQADD_qr_m<MVE_v8u16>;
+defm MVE_VQADD_qr_u32 : MVE_VQADD_qr_m<MVE_v4u32>;
+
+defm MVE_VQSUB_qr_s8 : MVE_VQSUB_qr_m<MVE_v16s8>;
+defm MVE_VQSUB_qr_s16 : MVE_VQSUB_qr_m<MVE_v8s16>;
+defm MVE_VQSUB_qr_s32 : MVE_VQSUB_qr_m<MVE_v4s32>;
+defm MVE_VQSUB_qr_u8 : MVE_VQSUB_qr_m<MVE_v16u8>;
+defm MVE_VQSUB_qr_u16 : MVE_VQSUB_qr_m<MVE_v8u16>;
+defm MVE_VQSUB_qr_u32 : MVE_VQSUB_qr_m<MVE_v4u32>;
class MVE_VQDMULL_qr<string iname, string suffix, bit size,
bit T, string cstr="", list<dag> pattern=[]>
@@ -4469,15 +5128,40 @@ class MVE_VQDMULL_qr<string iname, string suffix, bit size,
let Inst{8} = 0b1;
let Inst{5} = 0b1;
let validForTailPredication = 1;
+ let doubleWidthResult = 1;
}
-multiclass MVE_VQDMULL_qr_halves<string suffix, bit size, string cstr=""> {
- def bh : MVE_VQDMULL_qr<"vqdmullb", suffix, size, 0b0, cstr>;
- def th : MVE_VQDMULL_qr<"vqdmullt", suffix, size, 0b1, cstr>;
+multiclass MVE_VQDMULL_qr_m<string iname, MVEVectorVTInfo VTI, bit size,
+ bit T, string cstr> {
+ def "" : MVE_VQDMULL_qr<iname, VTI.Suffix, size, T, cstr>;
+ defvar Inst = !cast<Instruction>(NAME);
+
+ let Predicates = [HasMVEInt] in {
+ // Unpredicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val)),
+ (i32 T))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val)))>;
+ // Predicated saturating multiply
+ def : Pat<(VTI.DblVec (int_arm_mve_vqdmull_predicated
+ (VTI.Vec MQPR:$Qm),
+ (VTI.Vec (ARMvdup rGPR:$val)),
+ (i32 T),
+ (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive))),
+ (VTI.DblVec (Inst (VTI.Vec MQPR:$Qm), (i32 rGPR:$val),
+ ARMVCCThen, (VTI.DblPred VCCR:$mask),
+ (VTI.DblVec MQPR:$inactive)))>;
+ }
}
-defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<"s16", 0b0>;
-defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<"s32", 0b1, "@earlyclobber $Qd">;
+multiclass MVE_VQDMULL_qr_halves<MVEVectorVTInfo VTI, bit size, string cstr=""> {
+ defm bh : MVE_VQDMULL_qr_m<"vqdmullb", VTI, size, 0b0, cstr>;
+ defm th : MVE_VQDMULL_qr_m<"vqdmullt", VTI, size, 0b1, cstr>;
+}
+
+defm MVE_VQDMULL_qr_s16 : MVE_VQDMULL_qr_halves<MVE_v8s16, 0b0>;
+defm MVE_VQDMULL_qr_s32 : MVE_VQDMULL_qr_halves<MVE_v4s32, 0b1, "@earlyclobber $Qd">;
class MVE_VxADDSUB_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit subtract,
@@ -4493,19 +5177,34 @@ class MVE_VxADDSUB_qr<string iname, string suffix,
let validForTailPredication = 1;
}
-def MVE_VHADD_qr_s8 : MVE_VxADDSUB_qr<"vhadd", "s8", 0b0, 0b00, 0b0>;
-def MVE_VHADD_qr_s16 : MVE_VxADDSUB_qr<"vhadd", "s16", 0b0, 0b01, 0b0>;
-def MVE_VHADD_qr_s32 : MVE_VxADDSUB_qr<"vhadd", "s32", 0b0, 0b10, 0b0>;
-def MVE_VHADD_qr_u8 : MVE_VxADDSUB_qr<"vhadd", "u8", 0b1, 0b00, 0b0>;
-def MVE_VHADD_qr_u16 : MVE_VxADDSUB_qr<"vhadd", "u16", 0b1, 0b01, 0b0>;
-def MVE_VHADD_qr_u32 : MVE_VxADDSUB_qr<"vhadd", "u32", 0b1, 0b10, 0b0>;
+multiclass MVE_VHADDSUB_qr_m<string iname, MVEVectorVTInfo VTI, bit subtract,
+ Intrinsic unpred_int, Intrinsic pred_int> {
+ def "" : MVE_VxADDSUB_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size, subtract>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME),
+ VTI, unpred_int, pred_int, 1, 1>;
+}
+
+multiclass MVE_VHADD_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VHADDSUB_qr_m<"vhadd", VTI, 0b0, int_arm_mve_vhadd,
+ int_arm_mve_hadd_predicated>;
+
+multiclass MVE_VHSUB_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VHADDSUB_qr_m<"vhsub", VTI, 0b1, int_arm_mve_vhsub,
+ int_arm_mve_hsub_predicated>;
-def MVE_VHSUB_qr_s8 : MVE_VxADDSUB_qr<"vhsub", "s8", 0b0, 0b00, 0b1>;
-def MVE_VHSUB_qr_s16 : MVE_VxADDSUB_qr<"vhsub", "s16", 0b0, 0b01, 0b1>;
-def MVE_VHSUB_qr_s32 : MVE_VxADDSUB_qr<"vhsub", "s32", 0b0, 0b10, 0b1>;
-def MVE_VHSUB_qr_u8 : MVE_VxADDSUB_qr<"vhsub", "u8", 0b1, 0b00, 0b1>;
-def MVE_VHSUB_qr_u16 : MVE_VxADDSUB_qr<"vhsub", "u16", 0b1, 0b01, 0b1>;
-def MVE_VHSUB_qr_u32 : MVE_VxADDSUB_qr<"vhsub", "u32", 0b1, 0b10, 0b1>;
+defm MVE_VHADD_qr_s8 : MVE_VHADD_qr_m<MVE_v16s8>;
+defm MVE_VHADD_qr_s16 : MVE_VHADD_qr_m<MVE_v8s16>;
+defm MVE_VHADD_qr_s32 : MVE_VHADD_qr_m<MVE_v4s32>;
+defm MVE_VHADD_qr_u8 : MVE_VHADD_qr_m<MVE_v16u8>;
+defm MVE_VHADD_qr_u16 : MVE_VHADD_qr_m<MVE_v8u16>;
+defm MVE_VHADD_qr_u32 : MVE_VHADD_qr_m<MVE_v4u32>;
+
+defm MVE_VHSUB_qr_s8 : MVE_VHSUB_qr_m<MVE_v16s8>;
+defm MVE_VHSUB_qr_s16 : MVE_VHSUB_qr_m<MVE_v8s16>;
+defm MVE_VHSUB_qr_s32 : MVE_VHSUB_qr_m<MVE_v4s32>;
+defm MVE_VHSUB_qr_u8 : MVE_VHSUB_qr_m<MVE_v16u8>;
+defm MVE_VHSUB_qr_u16 : MVE_VHSUB_qr_m<MVE_v8u16>;
+defm MVE_VHSUB_qr_u32 : MVE_VHSUB_qr_m<MVE_v4u32>;
let Predicates = [HasMVEFloat] in {
def MVE_VADD_qr_f32 : MVE_VxADDSUB_qr<"vadd", "f32", 0b0, 0b11, 0b0>;
@@ -4515,6 +5214,11 @@ let Predicates = [HasMVEFloat] in {
def MVE_VSUB_qr_f16 : MVE_VxADDSUB_qr<"vsub", "f16", 0b1, 0b11, 0b1>;
}
+defm : MVE_vec_scalar_fp_pat_m<fadd, int_arm_mve_add_predicated,
+ MVE_VADD_qr_f16, MVE_VADD_qr_f32>;
+defm : MVE_vec_scalar_fp_pat_m<fsub, int_arm_mve_sub_predicated,
+ MVE_VSUB_qr_f16, MVE_VSUB_qr_f32>;
+
class MVE_VxSHL_qr<string iname, string suffix, bit U, bits<2> size,
bit bit_7, bit bit_17, list<dag> pattern=[]>
: MVE_qDest_single_rSrc<iname, suffix, pattern> {
@@ -4563,19 +5267,19 @@ defm MVE_VQSHL_qr : MVE_VxSHL_qr_types<"vqshl", 0b1, 0b0>;
defm MVE_VQRSHL_qr : MVE_VxSHL_qr_types<"vqrshl", 0b1, 0b1>;
let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
- (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
- (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
- (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v4i32 (ARMvshlu (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
+ (v4i32 (MVE_VSHL_qru32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v8i16 (ARMvshlu (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
+ (v8i16 (MVE_VSHL_qru16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v16i8 (ARMvshlu (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
+ (v16i8 (MVE_VSHL_qru8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
- def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup GPR:$Rm)))),
- (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup GPR:$Rm)))),
- (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), GPR:$Rm))>;
- def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup GPR:$Rm)))),
- (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), GPR:$Rm))>;
+ def : Pat<(v4i32 (ARMvshls (v4i32 MQPR:$Qm), (v4i32 (ARMvdup rGPR:$Rm)))),
+ (v4i32 (MVE_VSHL_qrs32 (v4i32 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v8i16 (ARMvshls (v8i16 MQPR:$Qm), (v8i16 (ARMvdup rGPR:$Rm)))),
+ (v8i16 (MVE_VSHL_qrs16 (v8i16 MQPR:$Qm), rGPR:$Rm))>;
+ def : Pat<(v16i8 (ARMvshls (v16i8 MQPR:$Qm), (v16i8 (ARMvdup rGPR:$Rm)))),
+ (v16i8 (MVE_VSHL_qrs8 (v16i8 MQPR:$Qm), rGPR:$Rm))>;
}
class MVE_VBRSR<string iname, string suffix, bits<2> size, list<dag> pattern=[]>
@@ -4594,6 +5298,20 @@ def MVE_VBRSR8 : MVE_VBRSR<"vbrsr", "8", 0b00>;
def MVE_VBRSR16 : MVE_VBRSR<"vbrsr", "16", 0b01>;
def MVE_VBRSR32 : MVE_VBRSR<"vbrsr", "32", 0b10>;
+multiclass MVE_VBRSR_pat_m<MVEVectorVTInfo VTI, Instruction Inst> {
+ // Unpredicated
+ def : Pat<(VTI.Vec (int_arm_mve_vbrsr (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm)))>;
+ // Predicated
+ def : Pat<(VTI.Vec (int_arm_mve_vbrsr_predicated
+ (VTI.Vec MQPR:$inactive),
+ (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm),
+ (VTI.Pred VCCR:$mask))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$Qn), (i32 rGPR:$Rm),
+ ARMVCCThen, (VTI.Pred VCCR:$mask),
+ (VTI.Vec MQPR:$inactive)))>;
+}
+
let Predicates = [HasMVEInt] in {
def : Pat<(v16i8 ( bitreverse (v16i8 MQPR:$val1))),
(v16i8 ( MVE_VBRSR8 (v16i8 MQPR:$val1), (t2MOVi (i32 8)) ))>;
@@ -4603,11 +5321,19 @@ let Predicates = [HasMVEInt] in {
def : Pat<(v8i16 ( bitreverse (v8i16 MQPR:$val1))),
(v8i16 ( MVE_VBRSR16 (v8i16 MQPR:$val1), (t2MOVi (i32 16)) ))>;
+
+ defm : MVE_VBRSR_pat_m<MVE_v16i8, MVE_VBRSR8>;
+ defm : MVE_VBRSR_pat_m<MVE_v8i16, MVE_VBRSR16>;
+ defm : MVE_VBRSR_pat_m<MVE_v4i32, MVE_VBRSR32>;
}
-class MVE_VMUL_qr_int<string iname, string suffix,
- bits<2> size, list<dag> pattern=[]>
- : MVE_qDest_rSrc<iname, suffix, "", pattern> {
+let Predicates = [HasMVEFloat] in {
+ defm : MVE_VBRSR_pat_m<MVE_v8f16, MVE_VBRSR16>;
+ defm : MVE_VBRSR_pat_m<MVE_v4f32, MVE_VBRSR32>;
+}
+
+class MVE_VMUL_qr_int<string iname, string suffix, bits<2> size>
+ : MVE_qDest_rSrc<iname, suffix, ""> {
let Inst{28} = 0b0;
let Inst{21-20} = size;
@@ -4618,19 +5344,16 @@ class MVE_VMUL_qr_int<string iname, string suffix,
let validForTailPredication = 1;
}
-def MVE_VMUL_qr_i8 : MVE_VMUL_qr_int<"vmul", "i8", 0b00>;
-def MVE_VMUL_qr_i16 : MVE_VMUL_qr_int<"vmul", "i16", 0b01>;
-def MVE_VMUL_qr_i32 : MVE_VMUL_qr_int<"vmul", "i32", 0b10>;
-
-let Predicates = [HasMVEInt] in {
- def : Pat<(v16i8 (mul (v16i8 MQPR:$val1), (v16i8 (ARMvdup GPR:$val2)))),
- (v16i8 (MVE_VMUL_qr_i8 (v16i8 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v8i16 (mul (v8i16 MQPR:$val1), (v8i16 (ARMvdup GPR:$val2)))),
- (v8i16 (MVE_VMUL_qr_i16 (v8i16 MQPR:$val1), (i32 GPR:$val2)))>;
- def : Pat<(v4i32 (mul (v4i32 MQPR:$val1), (v4i32 (ARMvdup GPR:$val2)))),
- (v4i32 (MVE_VMUL_qr_i32 (v4i32 MQPR:$val1), (i32 GPR:$val2)))>;
+multiclass MVE_VMUL_qr_int_m<MVEVectorVTInfo VTI> {
+ def "" : MVE_VMUL_qr_int<"vmul", VTI.Suffix, VTI.Size>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ mul, int_arm_mve_mul_predicated>;
}
+defm MVE_VMUL_qr_i8 : MVE_VMUL_qr_int_m<MVE_v16i8>;
+defm MVE_VMUL_qr_i16 : MVE_VMUL_qr_int_m<MVE_v8i16>;
+defm MVE_VMUL_qr_i32 : MVE_VMUL_qr_int_m<MVE_v4i32>;
+
class MVE_VxxMUL_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, list<dag> pattern=[]>
: MVE_qDest_rSrc<iname, suffix, "", pattern> {
@@ -4643,19 +5366,37 @@ class MVE_VxxMUL_qr<string iname, string suffix,
let Inst{5} = 0b1;
}
-def MVE_VQDMULH_qr_s8 : MVE_VxxMUL_qr<"vqdmulh", "s8", 0b0, 0b00>;
-def MVE_VQDMULH_qr_s16 : MVE_VxxMUL_qr<"vqdmulh", "s16", 0b0, 0b01>;
-def MVE_VQDMULH_qr_s32 : MVE_VxxMUL_qr<"vqdmulh", "s32", 0b0, 0b10>;
+multiclass MVE_VxxMUL_qr_m<string iname, MVEVectorVTInfo VTI, bit bit_28,
+ Intrinsic int_unpred, Intrinsic int_pred> {
+ def "" : MVE_VxxMUL_qr<iname, VTI.Suffix, bit_28, VTI.Size>;
+ defm : MVE_vec_scalar_int_pat_m<!cast<Instruction>(NAME), VTI,
+ int_unpred, int_pred>;
+}
+
+multiclass MVE_VQDMULH_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VxxMUL_qr_m<"vqdmulh", VTI, 0b0,
+ int_arm_mve_vqdmulh, int_arm_mve_qdmulh_predicated>;
+
+multiclass MVE_VQRDMULH_qr_m<MVEVectorVTInfo VTI> :
+ MVE_VxxMUL_qr_m<"vqrdmulh", VTI, 0b1,
+ int_arm_mve_vqrdmulh, int_arm_mve_qrdmulh_predicated>;
-def MVE_VQRDMULH_qr_s8 : MVE_VxxMUL_qr<"vqrdmulh", "s8", 0b1, 0b00>;
-def MVE_VQRDMULH_qr_s16 : MVE_VxxMUL_qr<"vqrdmulh", "s16", 0b1, 0b01>;
-def MVE_VQRDMULH_qr_s32 : MVE_VxxMUL_qr<"vqrdmulh", "s32", 0b1, 0b10>;
+defm MVE_VQDMULH_qr_s8 : MVE_VQDMULH_qr_m<MVE_v16s8>;
+defm MVE_VQDMULH_qr_s16 : MVE_VQDMULH_qr_m<MVE_v8s16>;
+defm MVE_VQDMULH_qr_s32 : MVE_VQDMULH_qr_m<MVE_v4s32>;
+
+defm MVE_VQRDMULH_qr_s8 : MVE_VQRDMULH_qr_m<MVE_v16s8>;
+defm MVE_VQRDMULH_qr_s16 : MVE_VQRDMULH_qr_m<MVE_v8s16>;
+defm MVE_VQRDMULH_qr_s32 : MVE_VQRDMULH_qr_m<MVE_v4s32>;
let Predicates = [HasMVEFloat], validForTailPredication = 1 in {
def MVE_VMUL_qr_f16 : MVE_VxxMUL_qr<"vmul", "f16", 0b1, 0b11>;
def MVE_VMUL_qr_f32 : MVE_VxxMUL_qr<"vmul", "f32", 0b0, 0b11>;
}
+defm : MVE_vec_scalar_fp_pat_m<fmul, int_arm_mve_mul_predicated,
+ MVE_VMUL_qr_f16, MVE_VMUL_qr_f32>;
+
class MVE_VFMAMLA_qr<string iname, string suffix,
bit bit_28, bits<2> bits_21_20, bit S,
list<dag> pattern=[]>
@@ -4668,42 +5409,87 @@ class MVE_VFMAMLA_qr<string iname, string suffix,
let Inst{8} = 0b0;
let Inst{5} = 0b0;
let validForTailPredication = 1;
+ let hasSideEffects = 0;
}
-def MVE_VMLA_qr_s8 : MVE_VFMAMLA_qr<"vmla", "s8", 0b0, 0b00, 0b0>;
-def MVE_VMLA_qr_s16 : MVE_VFMAMLA_qr<"vmla", "s16", 0b0, 0b01, 0b0>;
-def MVE_VMLA_qr_s32 : MVE_VFMAMLA_qr<"vmla", "s32", 0b0, 0b10, 0b0>;
-def MVE_VMLA_qr_u8 : MVE_VFMAMLA_qr<"vmla", "u8", 0b1, 0b00, 0b0>;
-def MVE_VMLA_qr_u16 : MVE_VFMAMLA_qr<"vmla", "u16", 0b1, 0b01, 0b0>;
-def MVE_VMLA_qr_u32 : MVE_VFMAMLA_qr<"vmla", "u32", 0b1, 0b10, 0b0>;
+multiclass MVE_VMLA_qr_multi<string iname, MVEVectorVTInfo VTI,
+ bit scalar_addend> {
+ def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Unsigned, VTI.Size,
+ scalar_addend>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_n_predicated");
+ defvar v1 = (VTI.Vec MQPR:$v1);
+ defvar v2 = (VTI.Vec MQPR:$v2);
+ defvar vs = (VTI.Vec (ARMvdup rGPR:$s));
+ defvar s = (i32 rGPR:$s);
+ defvar pred = (VTI.Pred VCCR:$pred);
+
+ // The signed and unsigned variants of this instruction have different
+ // encodings, but they're functionally identical. For the sake of
+ // determinism, we generate only the unsigned variant.
+ if VTI.Unsigned then let Predicates = [HasMVEInt] in {
+ if scalar_addend then {
+ def : Pat<(VTI.Vec (add (mul v1, v2), vs)),
+ (VTI.Vec (Inst v1, v2, s))>;
+ } else {
+ def : Pat<(VTI.Vec (add (mul v2, vs), v1)),
+ (VTI.Vec (Inst v1, v2, s))>;
+ }
-def MVE_VMLAS_qr_s8 : MVE_VFMAMLA_qr<"vmlas", "s8", 0b0, 0b00, 0b1>;
-def MVE_VMLAS_qr_s16 : MVE_VFMAMLA_qr<"vmlas", "s16", 0b0, 0b01, 0b1>;
-def MVE_VMLAS_qr_s32 : MVE_VFMAMLA_qr<"vmlas", "s32", 0b0, 0b10, 0b1>;
-def MVE_VMLAS_qr_u8 : MVE_VFMAMLA_qr<"vmlas", "u8", 0b1, 0b00, 0b1>;
-def MVE_VMLAS_qr_u16 : MVE_VFMAMLA_qr<"vmlas", "u16", 0b1, 0b01, 0b1>;
-def MVE_VMLAS_qr_u32 : MVE_VFMAMLA_qr<"vmlas", "u32", 0b1, 0b10, 0b1>;
+ def : Pat<(VTI.Vec (pred_int v1, v2, s, pred)),
+ (VTI.Vec (Inst v1, v2, s, ARMVCCThen, pred))>;
+ }
+}
-let Predicates = [HasMVEInt] in {
- def : Pat<(v4i32 (add (v4i32 MQPR:$src1),
- (v4i32 (mul (v4i32 MQPR:$src2),
- (v4i32 (ARMvdup (i32 rGPR:$x))))))),
- (v4i32 (MVE_VMLA_qr_u32 $src1, $src2, $x))>;
- def : Pat<(v8i16 (add (v8i16 MQPR:$src1),
- (v8i16 (mul (v8i16 MQPR:$src2),
- (v8i16 (ARMvdup (i32 rGPR:$x))))))),
- (v8i16 (MVE_VMLA_qr_u16 $src1, $src2, $x))>;
- def : Pat<(v16i8 (add (v16i8 MQPR:$src1),
- (v16i8 (mul (v16i8 MQPR:$src2),
- (v16i8 (ARMvdup (i32 rGPR:$x))))))),
- (v16i8 (MVE_VMLA_qr_u8 $src1, $src2, $x))>;
+defm MVE_VMLA_qr_s8 : MVE_VMLA_qr_multi<"vmla", MVE_v16s8, 0b0>;
+defm MVE_VMLA_qr_s16 : MVE_VMLA_qr_multi<"vmla", MVE_v8s16, 0b0>;
+defm MVE_VMLA_qr_s32 : MVE_VMLA_qr_multi<"vmla", MVE_v4s32, 0b0>;
+defm MVE_VMLA_qr_u8 : MVE_VMLA_qr_multi<"vmla", MVE_v16u8, 0b0>;
+defm MVE_VMLA_qr_u16 : MVE_VMLA_qr_multi<"vmla", MVE_v8u16, 0b0>;
+defm MVE_VMLA_qr_u32 : MVE_VMLA_qr_multi<"vmla", MVE_v4u32, 0b0>;
+
+defm MVE_VMLAS_qr_s8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16s8, 0b1>;
+defm MVE_VMLAS_qr_s16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8s16, 0b1>;
+defm MVE_VMLAS_qr_s32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4s32, 0b1>;
+defm MVE_VMLAS_qr_u8 : MVE_VMLA_qr_multi<"vmlas", MVE_v16u8, 0b1>;
+defm MVE_VMLAS_qr_u16 : MVE_VMLA_qr_multi<"vmlas", MVE_v8u16, 0b1>;
+defm MVE_VMLAS_qr_u32 : MVE_VMLA_qr_multi<"vmlas", MVE_v4u32, 0b1>;
+
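A scalar C reference of the two accumulate shapes matched above: vmla adds the vector-times-scalar product onto the accumulator, while vmlas multiplies the two vectors and adds the scalar. Illustration only; as the comment in the multiclass notes, the signed and unsigned encodings compute the same thing:

#include <stdint.h>

static void vmla_u32_ref(uint32_t acc[4], const uint32_t v[4], uint32_t s) {
  for (int i = 0; i < 4; i++)
    acc[i] += v[i] * s;            /* add(mul(v, dup(s)), acc) */
}
static void vmlas_u32_ref(uint32_t v1[4], const uint32_t v2[4], uint32_t s) {
  for (int i = 0; i < 4; i++)
    v1[i] = v1[i] * v2[i] + s;     /* add(mul(v1, v2), dup(s)) */
}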
+multiclass MVE_VFMA_qr_multi<string iname, MVEVectorVTInfo VTI,
+ bit scalar_addend> {
+ def "": MVE_VFMAMLA_qr<iname, VTI.Suffix, VTI.Size{0}, 0b11, scalar_addend>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar pred_int = int_arm_mve_fma_predicated;
+ defvar v1 = (VTI.Vec MQPR:$v1);
+ defvar v2 = (VTI.Vec MQPR:$v2);
+ defvar vs = (VTI.Vec (ARMvdup (i32 rGPR:$s)));
+ defvar is = (i32 rGPR:$s);
+ defvar pred = (VTI.Pred VCCR:$pred);
+
+ let Predicates = [HasMVEFloat] in {
+ if scalar_addend then {
+ def : Pat<(VTI.Vec (fma v1, v2, vs)),
+ (VTI.Vec (Inst v1, v2, is))>;
+ def : Pat<(VTI.Vec (pred_int v1, v2, vs, pred)),
+ (VTI.Vec (Inst v1, v2, is, ARMVCCThen, pred))>;
+ } else {
+ def : Pat<(VTI.Vec (fma v1, vs, v2)),
+ (VTI.Vec (Inst v2, v1, is))>;
+ def : Pat<(VTI.Vec (fma vs, v1, v2)),
+ (VTI.Vec (Inst v2, v1, is))>;
+ def : Pat<(VTI.Vec (pred_int v1, vs, v2, pred)),
+ (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+ def : Pat<(VTI.Vec (pred_int vs, v1, v2, pred)),
+ (VTI.Vec (Inst v2, v1, is, ARMVCCThen, pred))>;
+ }
+ }
}
let Predicates = [HasMVEFloat] in {
- def MVE_VFMA_qr_f16 : MVE_VFMAMLA_qr<"vfma", "f16", 0b1, 0b11, 0b0>;
- def MVE_VFMA_qr_f32 : MVE_VFMAMLA_qr<"vfma", "f32", 0b0, 0b11, 0b0>;
- def MVE_VFMA_qr_Sf16 : MVE_VFMAMLA_qr<"vfmas", "f16", 0b1, 0b11, 0b1>;
- def MVE_VFMA_qr_Sf32 : MVE_VFMAMLA_qr<"vfmas", "f32", 0b0, 0b11, 0b1>;
+ defm MVE_VFMA_qr_f16 : MVE_VFMA_qr_multi<"vfma", MVE_v8f16, 0>;
+ defm MVE_VFMA_qr_f32 : MVE_VFMA_qr_multi<"vfma", MVE_v4f32, 0>;
+ defm MVE_VFMA_qr_Sf16 : MVE_VFMA_qr_multi<"vfmas", MVE_v8f16, 1>;
+ defm MVE_VFMA_qr_Sf32 : MVE_VFMA_qr_multi<"vfmas", MVE_v4f32, 1>;
}
class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
@@ -4718,10 +5504,30 @@ class MVE_VQDMLAH_qr<string iname, string suffix, bit U, bits<2> size,
let Inst{5} = bit_5;
}
+multiclass MVE_VQDMLAH_qr_multi<string iname, MVEVectorVTInfo VTI,
+ bit bit_5, bit bit_12> {
+ def "": MVE_VQDMLAH_qr<iname, VTI.Suffix, 0b0, VTI.Size, bit_5, bit_12>;
+ defvar Inst = !cast<Instruction>(NAME);
+ defvar unpred_int = !cast<Intrinsic>("int_arm_mve_" # iname);
+ defvar pred_int = !cast<Intrinsic>("int_arm_mve_" # iname # "_predicated");
+
+ let Predicates = [HasMVEInt] in {
+ def : Pat<(VTI.Vec (unpred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s)))>;
+ def : Pat<(VTI.Vec (pred_int (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s), (VTI.Pred VCCR:$pred))),
+ (VTI.Vec (Inst (VTI.Vec MQPR:$v1), (VTI.Vec MQPR:$v2),
+ (i32 rGPR:$s), ARMVCCThen,
+ (VTI.Pred VCCR:$pred)))>;
+ }
+}
+
multiclass MVE_VQDMLAH_qr_types<string iname, bit bit_5, bit bit_12> {
- def s8 : MVE_VQDMLAH_qr<iname, "s8", 0b0, 0b00, bit_5, bit_12>;
- def s16 : MVE_VQDMLAH_qr<iname, "s16", 0b0, 0b01, bit_5, bit_12>;
- def s32 : MVE_VQDMLAH_qr<iname, "s32", 0b0, 0b10, bit_5, bit_12>;
+ defm s8 : MVE_VQDMLAH_qr_multi<iname, MVE_v16s8, bit_5, bit_12>;
+ defm s16 : MVE_VQDMLAH_qr_multi<iname, MVE_v8s16, bit_5, bit_12>;
+ defm s32 : MVE_VQDMLAH_qr_multi<iname, MVE_v4s32, bit_5, bit_12>;
}
defm MVE_VQDMLAH_qr : MVE_VQDMLAH_qr_types<"vqdmlah", 0b1, 0b0>;
@@ -4752,6 +5558,7 @@ class MVE_VxDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{6-1} = 0b110111;
let Inst{0} = imm{0};
let validForTailPredication = 1;
+ let hasSideEffects = 0;
}
def MVE_VIDUPu8 : MVE_VxDUP<"vidup", "u8", 0b00, 0b0>;
@@ -4787,6 +5594,7 @@ class MVE_VxWDUP<string iname, string suffix, bits<2> size, bit bit_12,
let Inst{3-1} = Rm{3-1};
let Inst{0} = imm{0};
let validForTailPredication = 1;
+ let hasSideEffects = 0;
}
def MVE_VIWDUPu8 : MVE_VxWDUP<"viwdup", "u8", 0b00, 0b0>;
@@ -4855,6 +5663,8 @@ class MVE_VMOV_64bit<dag oops, dag iops, bit to_qreg, string ops, string cstr>
let Inst{12-5} = 0b01111000;
let Inst{4} = idx2;
let Inst{3-0} = Rt{3-0};
+
+ let hasSideEffects = 0;
}
// The assembly syntax for these instructions mentions the vector
@@ -4924,6 +5734,7 @@ class MVE_vldst24_base<bit writeback, bit fourregs, bits<2> stage, bits<2> size,
let mayLoad = load;
let mayStore = !eq(load,0);
+ let hasSideEffects = 0;
}
// A parameter class used to encapsulate all the ways the writeback
@@ -5004,22 +5815,44 @@ foreach wb = [MVE_vldst24_writeback<
"vst" # n.nvecs # stage # "." # s.lanesize>;
}
+def SDTARMVST2 : SDTypeProfile<1, 5, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
+ SDTCisSameAs<3, 4>, SDTCisVT<5, i32>]>;
+def SDTARMVST4 : SDTypeProfile<1, 7, [SDTCisPtrTy<0>, SDTCisPtrTy<1>, SDTCisVT<2, i32>, SDTCisVec<3>,
+ SDTCisSameAs<3, 4>, SDTCisSameAs<3, 5>,
+ SDTCisSameAs<3, 6>, SDTCisVT<7, i32>]>;
+def MVEVST2UPD : SDNode<"ARMISD::VST2_UPD", SDTARMVST2, [SDNPHasChain]>;
+def MVEVST4UPD : SDNode<"ARMISD::VST4_UPD", SDTARMVST4, [SDNPHasChain]>;
+
multiclass MVE_vst24_patterns<int lanesize, ValueType VT> {
foreach stage = [0,1] in
def : Pat<(int_arm_mve_vst2q i32:$addr,
- (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
+ (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage)),
(!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize)
- (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
- t2_addr_offset_none:$addr)>;
+ (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ t2_addr_offset_none:$addr)>;
+ foreach stage = [0,1] in
+ def : Pat<(i32 (MVEVST2UPD i32:$addr, (i32 32),
+ (VT MQPR:$v0), (VT MQPR:$v1), (i32 stage))),
+ (i32 (!cast<Instruction>("MVE_VST2"#stage#"_"#lanesize#_wb)
+ (REG_SEQUENCE QQPR, VT:$v0, qsub_0, VT:$v1, qsub_1),
+ t2_addr_offset_none:$addr))>;
foreach stage = [0,1,2,3] in
def : Pat<(int_arm_mve_vst4q i32:$addr,
- (VT MQPR:$v0), (VT MQPR:$v1),
- (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
+ (VT MQPR:$v0), (VT MQPR:$v1),
+ (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage)),
(!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize)
- (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
- VT:$v2, qsub_2, VT:$v3, qsub_3),
- t2_addr_offset_none:$addr)>;
+ (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
+ t2_addr_offset_none:$addr)>;
+ foreach stage = [0,1,2,3] in
+ def : Pat<(i32 (MVEVST4UPD i32:$addr, (i32 64),
+ (VT MQPR:$v0), (VT MQPR:$v1),
+ (VT MQPR:$v2), (VT MQPR:$v3), (i32 stage))),
+ (i32 (!cast<Instruction>("MVE_VST4"#stage#"_"#lanesize#_wb)
+ (REG_SEQUENCE QQQQPR, VT:$v0, qsub_0, VT:$v1, qsub_1,
+ VT:$v2, qsub_2, VT:$v3, qsub_3),
+ t2_addr_offset_none:$addr))>;
}
defm : MVE_vst24_patterns<8, v16i8>;
defm : MVE_vst24_patterns<16, v8i16>;
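The MVEVST2UPD/MVEVST4UPD patterns above select the writeback forms of VST2/VST4, which advance the base pointer by the full 32- or 64-byte structure just stored. A minimal C sketch of the kind of loop those forms serve, assuming <arm_mve.h> and MVE; whether the post-increment encoding is actually used is up to the compiler:

#include <arm_mve.h>

void store_interleaved_pairs(int32_t *dst, int32x4x2_t v, int n) {
  for (int i = 0; i < n; i++) {
    vst2q_s32(dst, v);   /* interleaved store of 2 x 4 int32 lanes */
    dst += 8;            /* advance by 32 bytes, the VST2 writeback step */
  }
}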
@@ -5097,6 +5930,7 @@ class MVE_VLDRSTR_base<MVE_ldst_direction dir, bit U, bit P, bit W, bit opc,
let mayLoad = dir.load;
let mayStore = !eq(dir.load,0);
+ let hasSideEffects = 0;
let validForTailPredication = 1;
}
diff --git a/llvm/lib/Target/ARM/ARMInstrNEON.td b/llvm/lib/Target/ARM/ARMInstrNEON.td
index 6244d8d9e27e..1b3f6075c0e9 100644
--- a/llvm/lib/Target/ARM/ARMInstrNEON.td
+++ b/llvm/lib/Target/ARM/ARMInstrNEON.td
@@ -509,11 +509,6 @@ def NEONvqrshrnsuImm : SDNode<"ARMISD::VQRSHRNsuIMM", SDTARMVSHXIMM>;
def NEONvsliImm : SDNode<"ARMISD::VSLIIMM", SDTARMVSHINSIMM>;
def NEONvsriImm : SDNode<"ARMISD::VSRIIMM", SDTARMVSHINSIMM>;
-def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>,
- SDTCisVT<2, i32>]>;
-def NEONvorrImm : SDNode<"ARMISD::VORRIMM", SDTARMVORRIMM>;
-def NEONvbicImm : SDNode<"ARMISD::VBICIMM", SDTARMVORRIMM>;
-
def NEONvbsl : SDNode<"ARMISD::VBSL",
SDTypeProfile<1, 3, [SDTCisVec<0>,
SDTCisSameAs<0, 1>,
@@ -531,11 +526,6 @@ def NEONzip : SDNode<"ARMISD::VZIP", SDTARMVSHUF2>;
def NEONuzp : SDNode<"ARMISD::VUZP", SDTARMVSHUF2>;
def NEONtrn : SDNode<"ARMISD::VTRN", SDTARMVSHUF2>;
-def SDTARMVMULL : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisInt<1>,
- SDTCisSameAs<1, 2>]>;
-def NEONvmulls : SDNode<"ARMISD::VMULLs", SDTARMVMULL>;
-def NEONvmullu : SDNode<"ARMISD::VMULLu", SDTARMVMULL>;
-
def SDTARMVTBL1 : SDTypeProfile<1, 2, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
SDTCisVT<2, v8i8>]>;
def SDTARMVTBL2 : SDTypeProfile<1, 3, [SDTCisVT<0, v8i8>, SDTCisVT<1, v8i8>,
@@ -1084,6 +1074,12 @@ def : Pat<(vector_insert (v4f16 DPR:$src),
def : Pat<(vector_insert (v8f16 QPR:$src),
(f16 (load addrmode6:$addr)), imm:$lane),
(VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v4bf16 DPR:$src),
+ (bf16 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNd16 addrmode6:$addr, DPR:$src, imm:$lane)>;
+def : Pat<(vector_insert (v8bf16 QPR:$src),
+ (bf16 (load addrmode6:$addr)), imm:$lane),
+ (VLD1LNq16Pseudo addrmode6:$addr, QPR:$src, imm:$lane)>;
def : Pat<(vector_insert (v2f32 DPR:$src),
(f32 (load addrmode6:$addr)), imm:$lane),
(VLD1LNd32 addrmode6:$addr, DPR:$src, imm:$lane)>;
@@ -2459,57 +2455,6 @@ def : Pat<(byte_alignedstore (v2f64 QPR:$value), addrmode6:$addr),
}
//===----------------------------------------------------------------------===//
-// NEON pattern fragments
-//===----------------------------------------------------------------------===//
-
-// Extract D sub-registers of Q registers.
-def DSubReg_i8_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/8, SDLoc(N),
- MVT::i32);
-}]>;
-def DSubReg_i16_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/4, SDLoc(N),
- MVT::i32);
-}]>;
-def DSubReg_i32_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue()/2, SDLoc(N),
- MVT::i32);
-}]>;
-def DSubReg_f64_reg : SDNodeXForm<imm, [{
- assert(ARM::dsub_7 == ARM::dsub_0+7 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::dsub_0 + N->getZExtValue(), SDLoc(N),
- MVT::i32);
-}]>;
-
-// Extract S sub-registers of Q/D registers.
-def SSubReg_f32_reg : SDNodeXForm<imm, [{
- assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue(), SDLoc(N),
- MVT::i32);
-}]>;
-
-// Extract S sub-registers of Q/D registers containing a given f16 lane.
-def SSubReg_f16_reg : SDNodeXForm<imm, [{
- assert(ARM::ssub_3 == ARM::ssub_0+3 && "Unexpected subreg numbering");
- return CurDAG->getTargetConstant(ARM::ssub_0 + N->getZExtValue()/2, SDLoc(N),
- MVT::i32);
-}]>;
-
-// Translate lane numbers from Q registers to D subregs.
-def SubReg_i8_lane : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 7, SDLoc(N), MVT::i32);
-}]>;
-def SubReg_i16_lane : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 3, SDLoc(N), MVT::i32);
-}]>;
-def SubReg_i32_lane : SDNodeXForm<imm, [{
- return CurDAG->getTargetConstant(N->getZExtValue() & 1, SDLoc(N), MVT::i32);
-}]>;
-
-//===----------------------------------------------------------------------===//
// Instruction Classes
//===----------------------------------------------------------------------===//
@@ -4367,7 +4312,7 @@ def : Pat<(v2f32 (fmul DPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(i32 0))>;
def : Pat<(v4f16 (fmul DPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhd DPR:$Rn,
- (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), (f16 HPR:$Rm), ssub_0),
(i32 0))>;
def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(VMULslfq QPR:$Rn,
@@ -4375,7 +4320,7 @@ def : Pat<(v4f32 (fmul QPR:$Rn, (ARMvdup (f32 SPR:$Rm)))),
(i32 0))>;
def : Pat<(v8f16 (fmul QPR:$Rn, (ARMvdup (f16 HPR:$Rm)))),
(VMULslhq QPR:$Rn,
- (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), HPR:$Rm, ssub_0),
+ (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)), (f16 HPR:$Rm), ssub_0),
(i32 0))>;
}
@@ -4433,17 +4378,17 @@ def : Pat<(v4i32 (int_arm_neon_vqrdmulh (v4i32 QPR:$src1),
let PostEncoderMethod = "NEONThumb2DataIPostEncoder",
DecoderNamespace = "NEONData" in {
defm VMULLs : N3VL_QHS<0,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "s", NEONvmulls, 1>;
+ "vmull", "s", ARMvmulls, 1>;
defm VMULLu : N3VL_QHS<1,1,0b1100,0, IIC_VMULi16D, IIC_VMULi32D,
- "vmull", "u", NEONvmullu, 1>;
+ "vmull", "u", ARMvmullu, 1>;
def VMULLp8 : N3VLInt<0, 1, 0b00, 0b1110, 0, IIC_VMULi16D, "vmull", "p8",
v8i16, v8i8, int_arm_neon_vmullp, 1>;
def VMULLp64 : N3VLIntnp<0b00101, 0b10, 0b1110, 0, 0, NoItinerary,
"vmull", "p64", v2i64, v1i64, int_arm_neon_vmullp, 1>,
Requires<[HasV8, HasCrypto]>;
}
-defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", NEONvmulls>;
-defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", NEONvmullu>;
+defm VMULLsls : N3VLSL_HS<0, 0b1010, IIC_VMULi16D, "vmull", "s", ARMvmulls>;
+defm VMULLslu : N3VLSL_HS<1, 0b1010, IIC_VMULi16D, "vmull", "u", ARMvmullu>;
// VQDMULL : Vector Saturating Doubling Multiply Long (Q = D * D)
defm VQDMULL : N3VLInt_HS<0,1,0b1101,0, IIC_VMULi16D, IIC_VMULi32D,
@@ -4513,12 +4458,12 @@ def : Pat<(v4f32 (fadd_mlx (v4f32 QPR:$src1),
// VMLAL : Vector Multiply Accumulate Long (Q += D * D)
defm VMLALs : N3VLMulOp_QHS<0,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "s", NEONvmulls, add>;
+ "vmlal", "s", ARMvmulls, add>;
defm VMLALu : N3VLMulOp_QHS<1,1,0b1000,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlal", "u", NEONvmullu, add>;
+ "vmlal", "u", ARMvmullu, add>;
-defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", NEONvmulls, add>;
-defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", NEONvmullu, add>;
+defm VMLALsls : N3VLMulOpSL_HS<0, 0b0010, "vmlal", "s", ARMvmulls, add>;
+defm VMLALslu : N3VLMulOpSL_HS<1, 0b0010, "vmlal", "u", ARMvmullu, add>;
let Predicates = [HasNEON, HasV8_1a] in {
// v8.1a Neon Rounding Double Multiply-Op vector operations,
@@ -4746,12 +4691,12 @@ def : Pat<(v4f32 (fsub_mlx (v4f32 QPR:$src1),
// VMLSL : Vector Multiply Subtract Long (Q -= D * D)
defm VMLSLs : N3VLMulOp_QHS<0,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "s", NEONvmulls, sub>;
+ "vmlsl", "s", ARMvmulls, sub>;
defm VMLSLu : N3VLMulOp_QHS<1,1,0b1010,0, IIC_VMACi16D, IIC_VMACi32D,
- "vmlsl", "u", NEONvmullu, sub>;
+ "vmlsl", "u", ARMvmullu, sub>;
-defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", NEONvmulls, sub>;
-defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", NEONvmullu, sub>;
+defm VMLSLsls : N3VLMulOpSL_HS<0, 0b0110, "vmlsl", "s", ARMvmulls, sub>;
+defm VMLSLslu : N3VLMulOpSL_HS<1, 0b0110, "vmlsl", "u", ARMvmullu, sub>;
// VQDMLSL : Vector Saturating Doubling Multiply Subtract Long (Q -= D * D)
defm VQDMLSL : N3VLInt3_HS<0, 1, 0b1011, 0, IIC_VMACi16D, IIC_VMACi32D,
@@ -4833,10 +4778,10 @@ def : Pat<(v4f32 (fma (fneg QPR:$Vn), QPR:$Vm, QPR:$src1)),
// We put them in the VFPV8 decoder namespace because the ARM and Thumb
// encodings are the same and thus no further bit twiddling is necessary
// in the disassembler.
-class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
- ValueType AccumTy, ValueType InputTy,
+class VDOT<bit op6, bit op4, bit op23, RegisterClass RegTy, string Asm,
+ string AsmTy, ValueType AccumTy, ValueType InputTy,
SDPatternOperator OpNode> :
- N3Vnp<0b11000, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
+ N3Vnp<{0b1100, op23}, 0b10, 0b1101, op6, op4, (outs RegTy:$dst),
(ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm), N3RegFrm, IIC_VDOTPROD,
Asm, AsmTy,
[(set (AccumTy RegTy:$dst),
@@ -4848,10 +4793,10 @@ class VDOT<bit op6, bit op4, RegisterClass RegTy, string Asm, string AsmTy,
let Constraints = "$dst = $Vd";
}
-def VUDOTD : VDOT<0, 1, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
-def VSDOTD : VDOT<0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
-def VUDOTQ : VDOT<1, 1, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
-def VSDOTQ : VDOT<1, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
+def VUDOTD : VDOT<0, 1, 0, DPR, "vudot", "u8", v2i32, v8i8, int_arm_neon_udot>;
+def VSDOTD : VDOT<0, 0, 0, DPR, "vsdot", "s8", v2i32, v8i8, int_arm_neon_sdot>;
+def VUDOTQ : VDOT<1, 1, 0, QPR, "vudot", "u8", v4i32, v16i8, int_arm_neon_udot>;
+def VSDOTQ : VDOT<1, 0, 0, QPR, "vsdot", "s8", v4i32, v16i8, int_arm_neon_sdot>;
// Indexed dot product instructions:
multiclass DOTI<string opc, string dt, bit Q, bit U, RegisterClass Ty,
@@ -4886,6 +4831,68 @@ defm VUDOTQI : DOTI<"vudot", "u8", 0b1, 0b1, QPR, v4i32, v16i8,
defm VSDOTQI : DOTI<"vsdot", "s8", 0b1, 0b0, QPR, v4i32, v16i8,
int_arm_neon_sdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+// v8.6A matrix multiplication extension
+let Predicates = [HasMatMulInt8] in {
+ class N3VMatMul<bit B, bit U, string Asm, string AsmTy,
+ SDPatternOperator OpNode>
+ : N3Vnp<{0b1100, B}, 0b10, 0b1100, 1, U, (outs QPR:$dst),
+ (ins QPR:$Vd, QPR:$Vn, QPR:$Vm), N3RegFrm, NoItinerary,
+ Asm, AsmTy,
+ [(set (v4i32 QPR:$dst), (OpNode (v4i32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+ }
+
+ multiclass N3VMixedDotLane<bit Q, bit U, string Asm, string AsmTy, RegisterClass RegTy,
+ ValueType AccumTy, ValueType InputTy, SDPatternOperator OpNode,
+ dag RHS> {
+
+ def "" : N3Vnp<0b11101, 0b00, 0b1101, Q, U, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, DPR_VFP2:$Vm, VectorIndex32:$lane), N3RegFrm,
+ NoItinerary, Asm, AsmTy, []> {
+ bit lane;
+ let Inst{5} = lane;
+ let AsmString = !strconcat(Asm, ".", AsmTy, "\t$Vd, $Vn, $Vm$lane");
+ let DecoderNamespace = "VFPV8";
+ let Constraints = "$dst = $Vd";
+ }
+
+ def : Pat<
+ (AccumTy (OpNode (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
+
+ }
+
+ multiclass SUDOTLane<bit Q, RegisterClass RegTy, ValueType AccumTy, ValueType InputTy, dag RHS>
+ : N3VMixedDotLane<Q, 1, "vsudot", "u8", RegTy, AccumTy, InputTy, null_frag, null_frag> {
+ def : Pat<
+ (AccumTy (int_arm_neon_usdot (AccumTy RegTy:$Vd),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))),
+ (InputTy RegTy:$Vn))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
+ }
+
+ def VSMMLA : N3VMatMul<0, 0, "vsmmla", "s8", int_arm_neon_smmla>;
+ def VUMMLA : N3VMatMul<0, 1, "vummla", "u8", int_arm_neon_ummla>;
+ def VUSMMLA : N3VMatMul<1, 0, "vusmmla", "s8", int_arm_neon_usmmla>;
+ def VUSDOTD : VDOT<0, 0, 1, DPR, "vusdot", "s8", v2i32, v8i8, int_arm_neon_usdot>;
+ def VUSDOTQ : VDOT<1, 0, 1, QPR, "vusdot", "s8", v4i32, v16i8, int_arm_neon_usdot>;
+
+ defm VUSDOTDI : N3VMixedDotLane<0, 0, "vusdot", "s8", DPR, v2i32, v8i8,
+ int_arm_neon_usdot, (v2i32 DPR_VFP2:$Vm)>;
+ defm VUSDOTQI : N3VMixedDotLane<1, 0, "vusdot", "s8", QPR, v4i32, v16i8,
+ int_arm_neon_usdot, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+ defm VSUDOTDI : SUDOTLane<0, DPR, v2i32, v8i8, (v2i32 DPR_VFP2:$Vm)>;
+ defm VSUDOTQI : SUDOTLane<1, QPR, v4i32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+}
// ARMv8.3 complex operations
class BaseN3VCP8ComplexTied<bit op21, bit op4, bit s, bit q,
@@ -5232,7 +5239,6 @@ class VFMQ<string opc, string type, bits<2> S>
let Inst{3} = idx{0};
}
-let hasNoSchedulingInfo = 1 in {
// op1 op2 op3
def VFMALD : N3VCP8F16Q0<"vfmal", DPR, SPR, SPR, 0b00, 0b10, 1>;
def VFMSLD : N3VCP8F16Q0<"vfmsl", DPR, SPR, SPR, 0b01, 0b10, 1>;
@@ -5242,7 +5248,6 @@ def VFMALDI : VFMD<"vfmal", "f16", 0b00>;
def VFMSLDI : VFMD<"vfmsl", "f16", 0b01>;
def VFMALQI : VFMQ<"vfmal", "f16", 0b00>;
def VFMSLQI : VFMQ<"vfmsl", "f16", 0b01>;
-}
} // HasNEON, HasFP16FML
@@ -5296,7 +5301,7 @@ def VORRiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 0, 1,
IIC_VMOVImm,
"vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v4i16 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+ (v4i16 (ARMvorrImm DPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5305,7 +5310,7 @@ def VORRiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 0, 1,
IIC_VMOVImm,
"vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v2i32 (NEONvorrImm DPR:$src, timm:$SIMM)))]> {
+ (v2i32 (ARMvorrImm DPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -5314,7 +5319,7 @@ def VORRiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 0, 1,
IIC_VMOVImm,
"vorr", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v8i16 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+ (v8i16 (ARMvorrImm QPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5323,7 +5328,7 @@ def VORRiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 0, 1,
IIC_VMOVImm,
"vorr", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v4i32 (NEONvorrImm QPR:$src, timm:$SIMM)))]> {
+ (v4i32 (ARMvorrImm QPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -5347,7 +5352,7 @@ def VBICiv4i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 0, 1, 1,
IIC_VMOVImm,
"vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v4i16 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+ (v4i16 (ARMvbicImm DPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5356,7 +5361,7 @@ def VBICiv2i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 0, 1, 1,
IIC_VMOVImm,
"vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set DPR:$Vd,
- (v2i32 (NEONvbicImm DPR:$src, timm:$SIMM)))]> {
+ (v2i32 (ARMvbicImm DPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -5365,7 +5370,7 @@ def VBICiv8i16 : N1ModImm<1, 0b000, {1,0,?,1}, 0, 1, 1, 1,
IIC_VMOVImm,
"vbic", "i16", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v8i16 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+ (v8i16 (ARMvbicImm QPR:$src, timm:$SIMM)))]> {
let Inst{9} = SIMM{9};
}
@@ -5374,7 +5379,7 @@ def VBICiv4i32 : N1ModImm<1, 0b000, {0,?,?,1}, 0, 1, 1, 1,
IIC_VMOVImm,
"vbic", "i32", "$Vd, $SIMM", "$src = $Vd",
[(set QPR:$Vd,
- (v4i32 (NEONvbicImm QPR:$src, timm:$SIMM)))]> {
+ (v4i32 (ARMvbicImm QPR:$src, timm:$SIMM)))]> {
let Inst{10-9} = SIMM{10-9};
}
@@ -6354,32 +6359,57 @@ def : Pat<(extractelt (v2f64 QPR:$src1), imm:$src2),
(EXTRACT_SUBREG QPR:$src1, (DSubReg_f64_reg imm:$src2))>;
}
-def imm_even : ImmLeaf<i32, [{ return (Imm & 1) == 0; }]>;
-def imm_odd : ImmLeaf<i32, [{ return (Imm & 1) == 1; }]>;
-
-let Predicates = [HasNEON] in {
-def : Pat<(extractelt (v4f16 DPR:$src), imm_even:$lane),
- (EXTRACT_SUBREG
- (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
- (SSubReg_f16_reg imm_even:$lane))>;
+multiclass ExtractEltEvenF16<ValueType VT4, ValueType VT8> {
+ def : Pat<(extractelt (VT4 DPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (VT4 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_even:$lane))>;
+ def : Pat<(extractelt (VT8 QPR:$src), imm_even:$lane),
+ (EXTRACT_SUBREG
+ (v4f32 (COPY_TO_REGCLASS (VT8 QPR:$src), QPR_VFP2)),
+ (SSubReg_f16_reg imm_even:$lane))>;
+}
-def : Pat<(extractelt (v4f16 DPR:$src), imm_odd:$lane),
+multiclass ExtractEltOddF16VMOVH<ValueType VT4, ValueType VT8> {
+ def : Pat<(extractelt (VT4 DPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
(VMOVH (EXTRACT_SUBREG
- (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
- (SSubReg_f16_reg imm_odd:$lane))),
+ (v2f32 (COPY_TO_REGCLASS (VT4 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane))),
HPR)>;
+ def : Pat<(extractelt (VT8 QPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VMOVH (EXTRACT_SUBREG
+ (v4f32 (COPY_TO_REGCLASS (VT8 QPR:$src), QPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane))),
+ HPR)>;
+}
+
+let Predicates = [HasNEON] in {
+ defm : ExtractEltEvenF16<v4f16, v8f16>;
+ defm : ExtractEltOddF16VMOVH<v4f16, v8f16>;
+}
+
+let AddedComplexity = 1, Predicates = [HasNEON, HasBF16, HasFullFP16] in {
+ // If VMOVH (vmovx.f16) is available, use it to extract BF16 from the odd lanes
+ defm : ExtractEltOddF16VMOVH<v4bf16, v8bf16>;
+}
-def : Pat<(extractelt (v8f16 QPR:$src), imm_even:$lane),
- (EXTRACT_SUBREG
- (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
- (SSubReg_f16_reg imm_even:$lane))>;
+let Predicates = [HasBF16, HasNEON] in {
+ defm : ExtractEltEvenF16<v4bf16, v8bf16>;
-def : Pat<(extractelt (v8f16 QPR:$src), imm_odd:$lane),
+ // Otherwise, if VMOVH is not available, resort to extracting the odd lane
+ // into a GPR and then moving it to HPR
+ def : Pat<(extractelt (v4bf16 DPR:$src), imm_odd:$lane),
(COPY_TO_REGCLASS
- (VMOVH (EXTRACT_SUBREG
- (v4f32 (COPY_TO_REGCLASS (v8f16 QPR:$src), QPR_VFP2)),
- (SSubReg_f16_reg imm_odd:$lane))),
+ (VGETLNu16 (v4bf16 DPR:$src), imm:$lane),
+ HPR)>;
+
+ def : Pat<(extractelt (v8bf16 QPR:$src), imm_odd:$lane),
+ (COPY_TO_REGCLASS
+ (VGETLNu16 (v4i16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)),
HPR)>;
}
@@ -6415,6 +6445,21 @@ def VSETLNi32 : NVSetLane<{1,1,1,0,0,0,?,0}, 0b1011, 0b00, (outs DPR:$V),
}
}
+// TODO: for odd lanes we could optimize this a bit by using the VINS
+// FullFP16 instruction when it is available
+multiclass InsertEltF16<ValueType VTScalar, ValueType VT4, ValueType VT8> {
+ def : Pat<(insertelt (VT4 DPR:$src1), (VTScalar HPR:$src2), imm:$lane),
+ (VT4 (VSETLNi16 DPR:$src1,
+ (COPY_TO_REGCLASS HPR:$src2, GPR), imm:$lane))>;
+ def : Pat<(insertelt (VT8 QPR:$src1), (VTScalar HPR:$src2), imm:$lane),
+ (VT8 (INSERT_SUBREG QPR:$src1,
+ (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
+ (DSubReg_i16_reg imm:$lane))),
+ (COPY_TO_REGCLASS HPR:$src2, GPR),
+ (SubReg_i16_lane imm:$lane))),
+ (DSubReg_i16_reg imm:$lane)))>;
+}
+
let Predicates = [HasNEON] in {
def : Pat<(vector_insert (v16i8 QPR:$src1), GPR:$src2, imm:$lane),
(v16i8 (INSERT_SUBREG QPR:$src1,
@@ -6442,14 +6487,7 @@ def : Pat<(v4f32 (insertelt QPR:$src1, SPR:$src2, imm:$src3)),
(INSERT_SUBREG (v4f32 (COPY_TO_REGCLASS QPR:$src1, QPR_VFP2)),
SPR:$src2, (SSubReg_f32_reg imm:$src3))>;
-def : Pat<(insertelt (v4f16 DPR:$src1), HPR:$src2, imm:$lane),
- (v4f16 (VSETLNi16 DPR:$src1, (VMOVRH $src2), imm:$lane))>;
-def : Pat<(insertelt (v8f16 QPR:$src1), HPR:$src2, imm:$lane),
- (v8f16 (INSERT_SUBREG QPR:$src1,
- (v4i16 (VSETLNi16 (v4i16 (EXTRACT_SUBREG QPR:$src1,
- (DSubReg_i16_reg imm:$lane))),
- (VMOVRH $src2), (SubReg_i16_lane imm:$lane))),
- (DSubReg_i16_reg imm:$lane)))>;
+defm : InsertEltF16<f16, v4f16, v8f16>;
//def : Pat<(v2i64 (insertelt QPR:$src1, DPR:$src2, imm:$src3)),
// (INSERT_SUBREG QPR:$src1, DPR:$src2, (DSubReg_f64_reg imm:$src3))>;
@@ -6484,6 +6522,9 @@ def : Pat<(v4i32 (scalar_to_vector GPR:$src)),
dsub_0)>;
}
+let Predicates = [HasNEON, HasBF16] in
+defm : InsertEltF16<bf16, v4bf16, v8bf16>;
+
// VDUP : Vector Duplicate (from ARM core register to all elements)
class VDUPD<bits<8> opcod1, bits<2> opcod3, string Dt, ValueType Ty>
@@ -6588,18 +6629,35 @@ def : Pat<(v4f32 (ARMvduplane (v4f32 QPR:$src), imm:$lane)),
(DSubReg_i32_reg imm:$lane))),
(SubReg_i32_lane imm:$lane)))>;
-def : Pat<(v4f16 (ARMvdup HPR:$src)),
+def : Pat<(v4f16 (ARMvdup (f16 HPR:$src))),
(v4f16 (VDUPLN16d (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
- HPR:$src, ssub_0), (i32 0)))>;
+ (f16 HPR:$src), ssub_0), (i32 0)))>;
def : Pat<(v2f32 (ARMvdup (f32 SPR:$src))),
(v2f32 (VDUPLN32d (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
def : Pat<(v4f32 (ARMvdup (f32 SPR:$src))),
(v4f32 (VDUPLN32q (INSERT_SUBREG (v2f32 (IMPLICIT_DEF)),
SPR:$src, ssub_0), (i32 0)))>;
-def : Pat<(v8f16 (ARMvdup HPR:$src)),
+def : Pat<(v8f16 (ARMvdup (f16 HPR:$src))),
(v8f16 (VDUPLN16q (INSERT_SUBREG (v4f16 (IMPLICIT_DEF)),
- HPR:$src, ssub_0), (i32 0)))>;
+ (f16 HPR:$src), ssub_0), (i32 0)))>;
+}
+
+let Predicates = [HasNEON, HasBF16] in {
+def : Pat<(v4bf16 (ARMvduplane (v4bf16 DPR:$Vm), imm:$lane)),
+ (VDUPLN16d DPR:$Vm, imm:$lane)>;
+
+def : Pat<(v8bf16 (ARMvduplane (v8bf16 QPR:$src), imm:$lane)),
+ (v8bf16 (VDUPLN16q (v4bf16 (EXTRACT_SUBREG QPR:$src,
+ (DSubReg_i16_reg imm:$lane))),
+ (SubReg_i16_lane imm:$lane)))>;
+
+def : Pat<(v4bf16 (ARMvdup (bf16 HPR:$src))),
+ (v4bf16 (VDUPLN16d (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)),
+ (bf16 HPR:$src), ssub_0), (i32 0)))>;
+def : Pat<(v8bf16 (ARMvdup (bf16 HPR:$src))),
+ (v8bf16 (VDUPLN16q (INSERT_SUBREG (v4bf16 (IMPLICIT_DEF)),
+ (bf16 HPR:$src), ssub_0), (i32 0)))>;
}
// VMOVN : Vector Narrowing Move
@@ -7330,7 +7388,7 @@ def : Pat<(arm_vmovsr GPR:$a),
Requires<[HasNEON, DontUseVMOVSR]>;
//===----------------------------------------------------------------------===//
-// Non-Instruction Patterns or Endiness - Revert Patterns
+// Non-Instruction Patterns or Endianness - Revert Patterns
//===----------------------------------------------------------------------===//
// bit_convert
@@ -7345,6 +7403,9 @@ def : Pat<(v2i32 (bitconvert (v2f32 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v4f16 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4f16 (bitconvert (v4i16 DPR:$src))), (v4f16 DPR:$src)>;
+def : Pat<(v4i16 (bitconvert (v4bf16 DPR:$src))), (v4i16 DPR:$src)>;
+def : Pat<(v4bf16 (bitconvert (v4i16 DPR:$src))), (v4bf16 DPR:$src)>;
+
// 128 bit conversions
def : Pat<(v2f64 (bitconvert (v2i64 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v2f64 QPR:$src))), (v2i64 QPR:$src)>;
@@ -7354,6 +7415,9 @@ def : Pat<(v4f32 (bitconvert (v4i32 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v8f16 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8f16 (bitconvert (v8i16 QPR:$src))), (v8f16 QPR:$src)>;
+
+def : Pat<(v8i16 (bitconvert (v8bf16 QPR:$src))), (v8i16 QPR:$src)>;
+def : Pat<(v8bf16 (bitconvert (v8i16 QPR:$src))), (v8bf16 QPR:$src)>;
}
let Predicates = [IsLE,HasNEON] in {
@@ -7361,24 +7425,28 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (f64 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4bf16 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (f64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (v1i64 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4bf16 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (v1i64 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (v2f32 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4bf16 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (v2f32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (v2i32 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4bf16 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (v2i32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (v2i32 DPR:$src)>;
@@ -7388,6 +7456,12 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (v4f16 DPR:$src)>;
def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (v4f16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (f64 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v1i64 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2f32 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2i32 DPR:$src))), (v4bf16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v8i8 DPR:$src))), (v4bf16 DPR:$src)>;
+
def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (v4i16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (v4i16 DPR:$src)>;
@@ -7399,30 +7473,35 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (v8i8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4bf16 DPR:$src))), (v8i8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (v8i8 DPR:$src)>;
// 128 bit conversions
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (v2f64 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8bf16 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (v2f64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (v2i64 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8bf16 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (v2i64 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (v4f32 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8bf16 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (v4f32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (v4i32 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8bf16 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (v4i32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (v4i32 QPR:$src)>;
@@ -7432,6 +7511,12 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (v8f16 QPR:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (v8f16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2f64 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2i64 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4f32 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4i32 QPR:$src))), (v8bf16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v16i8 QPR:$src))), (v8bf16 QPR:$src)>;
+
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (v8i16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (v8i16 QPR:$src)>;
@@ -7443,6 +7528,7 @@ let Predicates = [IsLE,HasNEON] in {
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (v16i8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8bf16 QPR:$src))), (v16i8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (v16i8 QPR:$src)>;
}
@@ -7451,24 +7537,28 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(f64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(f64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(f64 (bitconvert (v4bf16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(f64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2f32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v2i32 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4f16 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v1i64 (bitconvert (v4bf16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v4i16 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v1i64 (bitconvert (v8i8 DPR:$src))), (VREV64d8 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2f32 (bitconvert (v4bf16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2f32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (f64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v1i64 DPR:$src))), (VREV64d32 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4f16 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v2i32 (bitconvert (v4bf16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v4i16 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v2i32 (bitconvert (v8i8 DPR:$src))), (VREV32d8 DPR:$src)>;
@@ -7478,6 +7568,12 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v4f16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
def : Pat<(v4f16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v2i32 DPR:$src))), (VREV32d16 DPR:$src)>;
+ def : Pat<(v4bf16 (bitconvert (v8i8 DPR:$src))), (VREV16d8 DPR:$src)>;
+
def : Pat<(v4i16 (bitconvert (f64 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v1i64 DPR:$src))), (VREV64d16 DPR:$src)>;
def : Pat<(v4i16 (bitconvert (v2f32 DPR:$src))), (VREV32d16 DPR:$src)>;
@@ -7489,30 +7585,35 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v8i8 (bitconvert (v2f32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v2i32 DPR:$src))), (VREV32d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4f16 DPR:$src))), (VREV16d8 DPR:$src)>;
+ def : Pat<(v8i8 (bitconvert (v4bf16 DPR:$src))), (VREV16d8 DPR:$src)>;
def : Pat<(v8i8 (bitconvert (v4i16 DPR:$src))), (VREV16d8 DPR:$src)>;
// 128 bit conversions
def : Pat<(v2f64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2f64 (bitconvert (v8bf16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2f64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4f32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v4i32 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8f16 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v2i64 (bitconvert (v8bf16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v8i16 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v2i64 (bitconvert (v16i8 QPR:$src))), (VREV64q8 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4f32 (bitconvert (v8bf16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4f32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2f64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v2i64 QPR:$src))), (VREV64q32 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8f16 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v4i32 (bitconvert (v8bf16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v8i16 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v4i32 (bitconvert (v16i8 QPR:$src))), (VREV32q8 QPR:$src)>;
@@ -7522,6 +7623,12 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v8f16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
def : Pat<(v8f16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v4i32 QPR:$src))), (VREV32q16 QPR:$src)>;
+ def : Pat<(v8bf16 (bitconvert (v16i8 QPR:$src))), (VREV16q8 QPR:$src)>;
+
def : Pat<(v8i16 (bitconvert (v2f64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v2i64 QPR:$src))), (VREV64q16 QPR:$src)>;
def : Pat<(v8i16 (bitconvert (v4f32 QPR:$src))), (VREV32q16 QPR:$src)>;
@@ -7533,9 +7640,26 @@ let Predicates = [IsBE,HasNEON] in {
def : Pat<(v16i8 (bitconvert (v4f32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v4i32 QPR:$src))), (VREV32q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8f16 QPR:$src))), (VREV16q8 QPR:$src)>;
+ def : Pat<(v16i8 (bitconvert (v8bf16 QPR:$src))), (VREV16q8 QPR:$src)>;
def : Pat<(v16i8 (bitconvert (v8i16 QPR:$src))), (VREV16q8 QPR:$src)>;
}
+let Predicates = [HasNEON] in {
+ // Here we match the specific SDNode type 'ARMVectorRegCastImpl'
+ // rather than the more general 'ARMVectorRegCast', which would also
+ // match some bitconverts. If we used the latter in cases where the
+ // input and output types are the same, the bitconvert would be elided
+ // and we would end up with a degenerate pattern that matches nothing.
+
+ foreach VT = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in
+ foreach VT2 = [ v16i8, v8i16, v8f16, v8bf16, v4i32, v4f32, v2i64, v2f64 ] in
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 QPR:$src))), (VT QPR:$src)>;
+
+ foreach VT = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, f64 ] in
+ foreach VT2 = [ v8i8, v4i16, v4f16, v4bf16, v2i32, v2f32, v1i64, f64 ] in
+ def : Pat<(VT (ARMVectorRegCastImpl (VT2 DPR:$src))), (VT DPR:$src)>;
+}
+
// Use VLD1/VST1 + VREV for non-word-aligned v2f64 load/store on Big Endian
let Predicates = [IsBE,HasNEON] in {
def : Pat<(v2f64 (byte_alignedload addrmode6:$addr)),
@@ -7863,6 +7987,8 @@ def : Pat<(v4f32 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
def : Pat<(v8f16 (concat_vectors DPR:$Dn, DPR:$Dm)),
(REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
+def : Pat<(v8bf16 (concat_vectors DPR:$Dn, DPR:$Dm)),
+ (REG_SEQUENCE QPR, DPR:$Dn, dsub_0, DPR:$Dm, dsub_1)>;
}
//===----------------------------------------------------------------------===//
@@ -8915,3 +9041,115 @@ def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
(VMOVv4i32 QPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
def : NEONInstAlias<"vmov${p}.f32 $Vd, $imm",
(VMOVv2i32 DPR:$Vd, nImmVMOVI32:$imm, pred:$p)>;
+
+// ARMv8.6a BFloat16 instructions.
+let Predicates = [HasBF16, HasNEON] in {
+class BF16VDOT<bits<5> op27_23, bits<2> op21_20, bit op6,
+ dag oops, dag iops, list<dag> pattern>
+ : N3Vnp<op27_23, op21_20, 0b1101, op6, 0, oops, iops,
+ N3RegFrm, IIC_VDOTPROD, "", "", pattern>
+{
+ let DecoderNamespace = "VFPV8";
+}
+
+class BF16VDOTS<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy, ValueType InputTy>
+ : BF16VDOT<0b11000, 0b00, Q, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+ [(set (AccumTy RegTy:$dst),
+ (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy RegTy:$Vm)))]> {
+ let Constraints = "$dst = $Vd";
+ let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
+ let DecoderNamespace = "VFPV8";
+}
+
+multiclass BF16VDOTI<bit Q, RegisterClass RegTy, string opc, ValueType AccumTy,
+ ValueType InputTy, dag RHS> {
+
+ def "" : BF16VDOT<0b11100, 0b00, Q, (outs RegTy:$dst),
+ (ins RegTy:$Vd, RegTy:$Vn,
+ DPR_VFP2:$Vm, VectorIndex32:$lane), []> {
+ bit lane;
+ let Inst{5} = lane;
+ let Constraints = "$dst = $Vd";
+ let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm$lane");
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (AccumTy (int_arm_neon_bfdot (AccumTy RegTy:$Vd),
+ (InputTy RegTy:$Vn),
+ (InputTy (bitconvert (AccumTy
+ (ARMvduplane (AccumTy RegTy:$Vm),
+ VectorIndex32:$lane)))))),
+ (!cast<Instruction>(NAME) RegTy:$Vd, RegTy:$Vn, RHS, VectorIndex32:$lane)>;
+}
+
+def BF16VDOTS_VDOTD : BF16VDOTS<0, DPR, "vdot", v2f32, v8i8>;
+def BF16VDOTS_VDOTQ : BF16VDOTS<1, QPR, "vdot", v4f32, v16i8>;
+
+defm BF16VDOTI_VDOTD : BF16VDOTI<0, DPR, "vdot", v2f32, v8i8, (v2f32 DPR_VFP2:$Vm)>;
+defm BF16VDOTI_VDOTQ : BF16VDOTI<1, QPR, "vdot", v4f32, v16i8, (EXTRACT_SUBREG QPR:$Vm, dsub_0)>;
+
+class BF16MM<bit Q, RegisterClass RegTy,
+ string opc>
+ : N3Vnp<0b11000, 0b00, 0b1100, Q, 0,
+ (outs RegTy:$dst), (ins RegTy:$Vd, RegTy:$Vn, RegTy:$Vm),
+ N3RegFrm, IIC_VDOTPROD, "", "",
+ [(set (v4f32 QPR:$dst), (int_arm_neon_bfmmla (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
+ let Constraints = "$dst = $Vd";
+ let AsmString = !strconcat(opc, ".bf16", "\t$Vd, $Vn, $Vm");
+ let DecoderNamespace = "VFPV8";
+}
+
+def VMMLA : BF16MM<1, QPR, "vmmla">;
+
+class VBF16MALQ<bit T, string suffix, SDPatternOperator OpNode>
+ : N3VCP8<0b00, 0b11, T, 1,
+ (outs QPR:$dst), (ins QPR:$Vd, QPR:$Vn, QPR:$Vm),
+ NoItinerary, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm", "",
+ [(set (v4f32 QPR:$dst),
+ (OpNode (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 QPR:$Vm)))]> {
+ let Constraints = "$dst = $Vd";
+ let DecoderNamespace = "VFPV8";
+}
+
+def VBF16MALTQ: VBF16MALQ<1, "t", int_arm_neon_bfmlalt>;
+def VBF16MALBQ: VBF16MALQ<0, "b", int_arm_neon_bfmlalb>;
+
+multiclass VBF16MALQI<bit T, string suffix, SDPatternOperator OpNode> {
+ def "" : N3VLaneCP8<0, 0b11, T, 1, (outs QPR:$dst),
+ (ins QPR:$Vd, QPR:$Vn, DPR_8:$Vm, VectorIndex16:$idx),
+ IIC_VMACD, "vfma" # suffix, "bf16", "$Vd, $Vn, $Vm$idx", "", []> {
+ bits<2> idx;
+ let Inst{5} = idx{1};
+ let Inst{3} = idx{0};
+ let Constraints = "$dst = $Vd";
+ let DecoderNamespace = "VFPV8";
+ }
+
+ def : Pat<
+ (v4f32 (OpNode (v4f32 QPR:$Vd),
+ (v16i8 QPR:$Vn),
+ (v16i8 (bitconvert (v8bf16 (ARMvduplane (v8bf16 QPR:$Vm),
+ VectorIndex16:$lane)))))),
+ (!cast<Instruction>(NAME) QPR:$Vd,
+ QPR:$Vn,
+ (EXTRACT_SUBREG QPR:$Vm,
+ (DSubReg_i16_reg VectorIndex16:$lane)),
+ (SubReg_i16_lane VectorIndex16:$lane))>;
+}
+
+defm VBF16MALTQI: VBF16MALQI<1, "t", int_arm_neon_bfmlalt>;
+defm VBF16MALBQI: VBF16MALQI<0, "b", int_arm_neon_bfmlalb>;
+
+def BF16_VCVT : N2V<0b11, 0b11, 0b01, 0b10, 0b01100, 1, 0,
+ (outs DPR:$Vd), (ins QPR:$Vm),
+ NoItinerary, "vcvt", "bf16.f32", "$Vd, $Vm", "", []>;
+}
+// End of BFloat16 instructions
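For reference, a hedged C sketch of how the BF16 dot-product, matrix-multiply and convert instructions defined above are reached from source. The intrinsic names (vbfdotq_f32, vbfmmlaq_f32, vbfmlaltq_f32, vcvt_bf16_f32) come from the ACLE BF16 extension and are assumed to be available in the toolchain's arm_neon.h rather than defined by this patch.

// Assumes arm_neon.h provides the BF16 ACLE intrinsics when built with
// something like: clang --target=armv8.6a-none-eabi -march=armv8.6-a+neon+bf16 -O2
#include <arm_neon.h>

// BF16 dot product: expected to lower through int_arm_neon_bfdot to VDOT.BF16.
float32x4_t bf16_dot(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfdotq_f32(acc, a, b);
}

// BF16 matrix multiply-accumulate: expected to lower to VMMLA.BF16.
float32x4_t bf16_mmla(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfmmlaq_f32(acc, a, b);
}

// Widening multiply-accumulate of the top BF16 lanes: expected to lower to VFMAT.BF16.
float32x4_t bf16_mlal_top(float32x4_t acc, bfloat16x8_t a, bfloat16x8_t b) {
  return vbfmlaltq_f32(acc, a, b);
}

// Narrowing convert f32 -> bf16: expected to lower to VCVT.BF16.F32.
bfloat16x4_t bf16_narrow(float32x4_t v) {
  return vcvt_bf16_f32(v);
}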
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb.td b/llvm/lib/Target/ARM/ARMInstrThumb.td
index 18bcbda44580..7fae32117243 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb.td
@@ -14,6 +14,10 @@
// Thumb specific DAG Nodes.
//
+def ARMtsecall : SDNode<"ARMISD::tSECALL", SDT_ARMcall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
def imm_sr_XFORM: SDNodeXForm<imm, [{
unsigned Imm = N->getZExtValue();
return CurDAG->getTargetConstant((Imm == 32 ? 0 : Imm), SDLoc(N), MVT::i32);
@@ -499,6 +503,10 @@ let isReturn = 1, isTerminator = 1, isBarrier = 1 in {
def tBX_RET : tPseudoExpand<(outs), (ins pred:$p), 2, IIC_Br,
[(ARMretflag)], (tBX LR, pred:$p)>, Sched<[WriteBr]>;
+ // alternative return for CMSE entry functions
+ def tBXNS_RET : tPseudoInst<(outs), (ins), 2, IIC_Br,
+ [(ARMseretflag)]>, Sched<[WriteBr]>;
+
// Alternative return instruction used by vararg functions.
def tBX_RET_vararg : tPseudoExpand<(outs), (ins tGPR:$Rm, pred:$p),
2, IIC_Br, [],
@@ -560,6 +568,10 @@ let isCall = 1,
let Unpredictable{1-0} = 0b11;
}
+ def tBLXNS_CALL : PseudoInst<(outs), (ins GPRnopc:$func), IIC_Br,
+ [(ARMtsecall GPRnopc:$func)]>,
+ Requires<[IsThumb, Has8MSecExt]>, Sched<[WriteBr]>;
+
// ARMv4T
def tBX_CALL : tPseudoInst<(outs), (ins tGPR:$func),
4, IIC_Br,
@@ -1513,7 +1525,7 @@ def tTPsoft : tPseudoInst<(outs), (ins), 4, IIC_Br,
// tromped upon when we get here from a longjmp(). We force everything out of
// registers except for our own input by listing the relevant registers in
// Defs. By doing so, we also cause the prologue/epilogue code to actively
-// preserve all of the callee-saved resgisters, which is exactly what we want.
+// preserve all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
let Defs = [ R0, R1, R2, R3, R4, R5, R6, R7, R12, CPSR ],
hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
diff --git a/llvm/lib/Target/ARM/ARMInstrThumb2.td b/llvm/lib/Target/ARM/ARMInstrThumb2.td
index 4193e8147f47..7137e8ee66b8 100644
--- a/llvm/lib/Target/ARM/ARMInstrThumb2.td
+++ b/llvm/lib/Target/ARM/ARMInstrThumb2.td
@@ -1748,7 +1748,7 @@ def t2STRHT : T2IstT<0b01, "strht", IIC_iStore_bh_i>;
// ldrd / strd pre / post variants
-let mayLoad = 1 in
+let mayLoad = 1, hasSideEffects = 0 in
def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins t2addrmode_imm8s4_pre:$addr), IIC_iLoad_d_ru,
"ldrd", "\t$Rt, $Rt2, $addr!", "$addr.base = $wb", []>,
@@ -1756,13 +1756,13 @@ def t2LDRD_PRE : T2Ii8s4<1, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
let DecoderMethod = "DecodeT2LDRDPreInstruction";
}
-let mayLoad = 1 in
+let mayLoad = 1, hasSideEffects = 0 in
def t2LDRD_POST : T2Ii8s4post<0, 1, 1, (outs rGPR:$Rt, rGPR:$Rt2, GPR:$wb),
(ins addr_offset_none:$addr, t2am_imm8s4_offset:$imm),
IIC_iLoad_d_ru, "ldrd", "\t$Rt, $Rt2, $addr$imm",
"$addr.base = $wb", []>, Sched<[WriteLd]>;
-let mayStore = 1 in
+let mayStore = 1, hasSideEffects = 0 in
def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, t2addrmode_imm8s4_pre:$addr),
IIC_iStore_d_ru, "strd", "\t$Rt, $Rt2, $addr!",
@@ -1770,7 +1770,7 @@ def t2STRD_PRE : T2Ii8s4<1, 1, 0, (outs GPR:$wb),
let DecoderMethod = "DecodeT2STRDPreInstruction";
}
-let mayStore = 1 in
+let mayStore = 1, hasSideEffects = 0 in
def t2STRD_POST : T2Ii8s4post<0, 1, 0, (outs GPR:$wb),
(ins rGPR:$Rt, rGPR:$Rt2, addr_offset_none:$addr,
t2am_imm8s4_offset:$imm),
@@ -1874,6 +1874,34 @@ defm t2PLD : T2Ipl<0, 0, "pld">, Requires<[IsThumb2]>;
defm t2PLDW : T2Ipl<1, 0, "pldw">, Requires<[IsThumb2,HasV7,HasMP]>;
defm t2PLI : T2Ipl<0, 1, "pli">, Requires<[IsThumb2,HasV7]>;
+// PLD/PLDW/PLI aliases w/ the optional .w suffix
+def : t2InstAlias<"pld${p}.w\t$addr",
+ (t2PLDi12 t2addrmode_imm12:$addr, pred:$p)>;
+def : t2InstAlias<"pld${p}.w\t$addr",
+ (t2PLDi8 t2addrmode_negimm8:$addr, pred:$p)>;
+def : t2InstAlias<"pld${p}.w\t$addr",
+ (t2PLDs t2addrmode_so_reg:$addr, pred:$p)>;
+
+def : InstAlias<"pldw${p}.w\t$addr",
+ (t2PLDWi12 t2addrmode_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7,HasMP]>;
+def : InstAlias<"pldw${p}.w\t$addr",
+ (t2PLDWi8 t2addrmode_negimm8:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7,HasMP]>;
+def : InstAlias<"pldw${p}.w\t$addr",
+ (t2PLDWs t2addrmode_so_reg:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7,HasMP]>;
+
+def : InstAlias<"pli${p}.w\t$addr",
+ (t2PLIi12 t2addrmode_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+def : InstAlias<"pli${p}.w\t$addr",
+ (t2PLIi8 t2addrmode_negimm8:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+def : InstAlias<"pli${p}.w\t$addr",
+ (t2PLIs t2addrmode_so_reg:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+
// pci variant is very similar to i12, but supports negative offsets
// from the PC. Only PLD and PLI have pci variants (not PLDW)
class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
@@ -1896,6 +1924,24 @@ class T2Iplpci<bits<1> inst, string opc> : T2Iso<(outs), (ins t2ldrlabel:$addr),
def t2PLDpci : T2Iplpci<0, "pld">, Requires<[IsThumb2]>;
def t2PLIpci : T2Iplpci<1, "pli">, Requires<[IsThumb2,HasV7]>;
+def : t2InstAlias<"pld${p}.w $addr",
+ (t2PLDpci t2ldrlabel:$addr, pred:$p)>;
+def : InstAlias<"pli${p}.w $addr",
+ (t2PLIpci t2ldrlabel:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+
+// PLD/PLI with alternate literal form.
+def : t2InstAlias<"pld${p} $addr",
+ (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p} $addr",
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+def : t2InstAlias<"pld${p}.w $addr",
+ (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
+def : InstAlias<"pli${p}.w $addr",
+ (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
+ Requires<[IsThumb2,HasV7]>;
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -2439,7 +2485,7 @@ def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, rGPR:$Rn),
(t2QADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, rGPR:$Rn),
(t2QSUB rGPR:$Rm, rGPR:$Rn)>;
-def : Thumb2DSPPat<(int_arm_qadd(int_arm_qadd rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+def : Thumb2DSPPat<(int_arm_qadd rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
(t2QDADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(int_arm_qsub rGPR:$Rm, (int_arm_qadd rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
@@ -2448,7 +2494,7 @@ def : Thumb2DSPPat<(saddsat rGPR:$Rm, rGPR:$Rn),
(t2QADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(ssubsat rGPR:$Rm, rGPR:$Rn),
(t2QSUB rGPR:$Rm, rGPR:$Rn)>;
-def : Thumb2DSPPat<(saddsat(saddsat rGPR:$Rm, rGPR:$Rm), rGPR:$Rn),
+def : Thumb2DSPPat<(saddsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(t2QDADD rGPR:$Rm, rGPR:$Rn)>;
def : Thumb2DSPPat<(ssubsat rGPR:$Rm, (saddsat rGPR:$Rn, rGPR:$Rn)),
(t2QDSUB rGPR:$Rm, rGPR:$Rn)>;
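The operand swaps in the two patterns above matter because QDADD doubles its second source: under the usual ARM semantics, QDADD Rd, Rm, Rn computes sat(Rm + sat(2*Rn)), so the DAG form that doubles the second operand is the one that should select t2QDADD; the old form, (qadd (qadd a, a), b), computes sat(sat(2*a) + b) instead. A small C model of that saturating arithmetic, as a sketch of the assumed semantics rather than of the backend itself:

#include <stdint.h>

// Saturating 32-bit add, mirroring what the qadd/saddsat nodes compute.
static int32_t qadd32(int32_t a, int32_t b) {
  int64_t sum = (int64_t)a + (int64_t)b;
  if (sum > INT32_MAX) return INT32_MAX;
  if (sum < INT32_MIN) return INT32_MIN;
  return (int32_t)sum;
}

// Assumed QDADD semantics: the second source operand is doubled (and
// saturated) before the final saturating add.
static int32_t qdadd32(int32_t rm, int32_t rn) {
  return qadd32(rm, qadd32(rn, rn));   // sat(rm + sat(2*rn))
}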
@@ -2719,6 +2765,8 @@ def t2SBFX: T2TwoRegBitFI<
let Inst{25} = 1;
let Inst{24-20} = 0b10100;
let Inst{15} = 0;
+
+ let hasSideEffects = 0;
}
def t2UBFX: T2TwoRegBitFI<
@@ -2728,6 +2776,8 @@ def t2UBFX: T2TwoRegBitFI<
let Inst{25} = 1;
let Inst{24-20} = 0b11100;
let Inst{15} = 0;
+
+ let hasSideEffects = 0;
}
// A8.8.247 UDF - Undefined (Encoding T2)
@@ -3711,7 +3761,7 @@ def : T2Pat<(stlex_2 (and GPR:$Rt, 0xffff), addr_offset_none:$addr),
// when we get here from a longjmp(). We force everything out of registers
// except for our own input by listing the relevant registers in Defs. By
// doing so, we also cause the prologue/epilogue code to actively preserve
-// all of the callee-saved resgisters, which is exactly what we want.
+// all of the callee-saved registers, which is exactly what we want.
// $val is a scratch register for our use.
let Defs =
[ R0, R1, R2, R3, R4, R5, R6, R7, R8, R9, R10, R11, R12, LR, CPSR,
@@ -4150,7 +4200,7 @@ def t2LDRpci_pic : PseudoInst<(outs rGPR:$dst), (ins i32imm:$addr, pclabel:$cp),
imm:$cp))]>,
Requires<[IsThumb2]>;
-// Pseudo isntruction that combines movs + predicated rsbmi
+// Pseudo instruction that combines movs + predicated rsbmi
// to implement integer ABS
let usesCustomInserter = 1, Defs = [CPSR], hasNoSchedulingInfo = 1 in {
def t2ABS : PseudoInst<(outs rGPR:$dst), (ins rGPR:$src),
@@ -4851,9 +4901,15 @@ def : t2InstAlias<"tst${p} $Rn, $Rm",
(t2TSTrr rGPR:$Rn, rGPR:$Rm, pred:$p)>;
// Memory barriers
+def : InstAlias<"dmb${p}.w\t$opt", (t2DMB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"dmb${p}", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dmb${p}.w", (t2DMB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}.w\t$opt", (t2DSB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"dsb${p}", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"dsb${p}.w", (t2DSB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}.w\t$opt", (t2ISB memb_opt:$opt, pred:$p), 0>, Requires<[HasDB]>;
def : InstAlias<"isb${p}", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
+def : InstAlias<"isb${p}.w", (t2ISB 0xf, pred:$p), 0>, Requires<[HasDB]>;
// Non-predicable aliases of a predicable DSB: the predicate is (14, 0) where
// 14 = AL (always execute) and 0 = "instruction doesn't read the CPSR".
@@ -5187,14 +5243,6 @@ def : t2InstAlias<"ldr${p}.w $Rt, $immediate",
(t2LDRConstPool GPRnopc:$Rt,
const_pool_asm_imm:$immediate, pred:$p)>;
-// PLD/PLDW/PLI with alternate literal form.
-def : t2InstAlias<"pld${p} $addr",
- (t2PLDpci t2ldr_pcrel_imm12:$addr, pred:$p)>;
-def : InstAlias<"pli${p} $addr",
- (t2PLIpci t2ldr_pcrel_imm12:$addr, pred:$p), 0>,
- Requires<[IsThumb2,HasV7]>;
-
-
//===----------------------------------------------------------------------===//
// ARMv8.1m instructions
//
@@ -5207,7 +5255,7 @@ class V8_1MI<dag oops, dag iops, AddrMode am, InstrItinClass itin, string asm,
def t2CLRM : V8_1MI<(outs),
(ins pred:$p, reglist_with_apsr:$regs, variable_ops),
- AddrModeNone, NoItinerary, "clrm", "${p}\t$regs", "", []> {
+ AddrModeNone, NoItinerary, "clrm${p}", "$regs", "", []> {
bits<16> regs;
let Inst{31-16} = 0b1110100010011111;
@@ -5360,6 +5408,7 @@ def t2DoLoopStart :
t2PseudoInst<(outs), (ins rGPR:$elts), 4, IIC_Br,
[(int_set_loop_iterations rGPR:$elts)]>, Sched<[WriteBr]>;
+let hasSideEffects = 0 in
def t2LoopDec :
t2PseudoInst<(outs GPRlr:$Rm), (ins GPRlr:$Rn, imm0_7:$size),
4, IIC_Br, []>, Sched<[WriteBr]>;
diff --git a/llvm/lib/Target/ARM/ARMInstrVFP.td b/llvm/lib/Target/ARM/ARMInstrVFP.td
index a41a483d1a4c..8a652c1d90f6 100644
--- a/llvm/lib/Target/ARM/ARMInstrVFP.td
+++ b/llvm/lib/Target/ARM/ARMInstrVFP.td
@@ -21,6 +21,8 @@ def SDT_VMOVSR : SDTypeProfile<1, 1, [SDTCisVT<0, f32>, SDTCisVT<1, i32>]>;
def arm_fmstat : SDNode<"ARMISD::FMSTAT", SDTNone, [SDNPInGlue, SDNPOutGlue]>;
def arm_cmpfp : SDNode<"ARMISD::CMPFP", SDT_ARMCmp, [SDNPOutGlue]>;
def arm_cmpfp0 : SDNode<"ARMISD::CMPFPw0", SDT_CMPFP0, [SDNPOutGlue]>;
+def arm_cmpfpe : SDNode<"ARMISD::CMPFPE", SDT_ARMCmp, [SDNPOutGlue]>;
+def arm_cmpfpe0: SDNode<"ARMISD::CMPFPEw0",SDT_CMPFP0, [SDNPOutGlue]>;
def arm_fmdrr : SDNode<"ARMISD::VMOVDRR", SDT_VMOVDRR>;
def arm_fmrrd : SDNode<"ARMISD::VMOVRRD", SDT_VMOVRRD>;
def arm_vmovsr : SDNode<"ARMISD::VMOVSR", SDT_VMOVSR>;
@@ -156,11 +158,24 @@ def VLDRS : ASI5<0b1101, 0b01, (outs SPR:$Sd), (ins addrmode5:$addr),
let isUnpredicable = 1 in
def VLDRH : AHI5<0b1101, 0b01, (outs HPR:$Sd), (ins addrmode5fp16:$addr),
IIC_fpLoad16, "vldr", ".16\t$Sd, $addr",
- [(set HPR:$Sd, (alignedload16 addrmode5fp16:$addr))]>,
+ [(set HPR:$Sd, (f16 (alignedload16 addrmode5fp16:$addr)))]>,
Requires<[HasFPRegs16]>;
} // End of 'let canFoldAsLoad = 1, isReMaterializable = 1 in'
+def : Pat<(bf16 (alignedload16 addrmode5fp16:$addr)),
+ (VLDRH addrmode5fp16:$addr)> {
+ let Predicates = [HasFPRegs16];
+}
+def : Pat<(bf16 (alignedload16 addrmode3:$addr)),
+ (COPY_TO_REGCLASS (LDRH addrmode3:$addr), HPR)> {
+ let Predicates = [HasNoFPRegs16, IsARM];
+}
+def : Pat<(bf16 (alignedload16 t2addrmode_imm12:$addr)),
+ (COPY_TO_REGCLASS (t2LDRHi12 t2addrmode_imm12:$addr), HPR)> {
+ let Predicates = [HasNoFPRegs16, IsThumb];
+}
+
def VSTRD : ADI5<0b1101, 0b00, (outs), (ins DPR:$Dd, addrmode5:$addr),
IIC_fpStore64, "vstr", "\t$Dd, $addr",
[(alignedstore32 (f64 DPR:$Dd), addrmode5:$addr)]>,
@@ -178,9 +193,22 @@ def VSTRS : ASI5<0b1101, 0b00, (outs), (ins SPR:$Sd, addrmode5:$addr),
let isUnpredicable = 1 in
def VSTRH : AHI5<0b1101, 0b00, (outs), (ins HPR:$Sd, addrmode5fp16:$addr),
IIC_fpStore16, "vstr", ".16\t$Sd, $addr",
- [(alignedstore16 HPR:$Sd, addrmode5fp16:$addr)]>,
+ [(alignedstore16 (f16 HPR:$Sd), addrmode5fp16:$addr)]>,
Requires<[HasFPRegs16]>;
+def : Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode5fp16:$addr),
+ (VSTRH (bf16 HPR:$Sd), addrmode5fp16:$addr)> {
+ let Predicates = [HasFPRegs16];
+}
+def : Pat<(alignedstore16 (bf16 HPR:$Sd), addrmode3:$addr),
+ (STRH (COPY_TO_REGCLASS $Sd, GPR), addrmode3:$addr)> {
+ let Predicates = [HasNoFPRegs16, IsARM];
+}
+def : Pat<(alignedstore16 (bf16 HPR:$Sd), t2addrmode_imm12:$addr),
+ (t2STRHi12 (COPY_TO_REGCLASS $Sd, GPR), t2addrmode_imm12:$addr)> {
+ let Predicates = [HasNoFPRegs16, IsThumb];
+}
+
//===----------------------------------------------------------------------===//
// Load / store multiple Instructions.
//
@@ -275,7 +303,6 @@ def : MnemonicAlias<"vstm", "vstmia">;
//===----------------------------------------------------------------------===//
// Lazy load / store multiple Instructions
//
-let mayLoad = 1 in
def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
NoItinerary, "vlldm${p}\t$Rn", "", []>,
Requires<[HasV8MMainline, Has8MSecExt]> {
@@ -286,9 +313,9 @@ def VLLDM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
let Inst{15-12} = 0;
let Inst{7-0} = 0;
let mayLoad = 1;
+ let Defs = [Q0, Q1, Q2, Q3, Q4, Q5, Q6, Q7, VPR, FPSCR, FPSCR_NZCV];
}
-let mayStore = 1 in
def VLSTM : AXSI4<(outs), (ins GPRnopc:$Rn, pred:$p), IndexModeNone,
NoItinerary, "vlstm${p}\t$Rn", "", []>,
Requires<[HasV8MMainline, Has8MSecExt]> {
@@ -385,7 +412,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VADDH : AHbI<0b11100, 0b11, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vadd", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fadd (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -410,7 +437,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VSUBH : AHbI<0b11100, 0b11, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpALU16, "vsub", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fsub (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPALU32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -431,7 +458,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VDIVH : AHbI<0b11101, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpDIV16, "vdiv", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fdiv HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fdiv (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPDIV32]>;
let TwoOperandAliasConstraint = "$Dn = $Dd" in
@@ -456,7 +483,7 @@ let TwoOperandAliasConstraint = "$Sn = $Sd" in
def VMULH : AHbI<0b11100, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vmul", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fmul HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (fmul (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
def VNMULD : ADbI<0b11100, 0b10, 1, 0,
@@ -478,7 +505,7 @@ def VNMULS : ASbI<0b11100, 0b10, 1, 0,
def VNMULH : AHbI<0b11100, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
IIC_fpMUL16, "vnmul", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fneg (fmul HPR:$Sn, HPR:$Sm)))]>,
+ [(set (f16 HPR:$Sd), (fneg (fmul (f16 HPR:$Sn), (f16 HPR:$Sm))))]>,
Sched<[WriteFPMUL32, ReadFPMUL, ReadFPMUL]>;
multiclass vsel_inst<string op, bits<2> opc, int CC> {
@@ -487,7 +514,7 @@ multiclass vsel_inst<string op, bits<2> opc, int CC> {
def H : AHbInp<0b11100, opc, 0,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat("vsel", op, ".f16\t$Sd, $Sn, $Sm"),
- [(set HPR:$Sd, (ARMcmov HPR:$Sm, HPR:$Sn, CC))]>,
+ [(set (f16 HPR:$Sd), (ARMcmov (f16 HPR:$Sm), (f16 HPR:$Sn), CC))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11100, opc, 0,
@@ -516,7 +543,7 @@ multiclass vmaxmin_inst<string op, bit opc, SDNode SD> {
def H : AHbInp<0b11101, 0b00, opc,
(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm),
NoItinerary, !strconcat(op, ".f16\t$Sd, $Sn, $Sm"),
- [(set HPR:$Sd, (SD HPR:$Sn, HPR:$Sm))]>,
+ [(set (f16 HPR:$Sd), (SD (f16 HPR:$Sn), (f16 HPR:$Sm)))]>,
Requires<[HasFullFP16]>;
def S : ASbInp<0b11101, 0b00, opc,
@@ -548,12 +575,12 @@ let Defs = [FPSCR_NZCV] in {
def VCMPED : ADuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, $Dm",
- [/* For disassembly only; pattern left blank */]>;
+ [(arm_cmpfpe DPR:$Dd, (f64 DPR:$Dm))]>;
def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins SPR:$Sd, SPR:$Sm),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfpe SPR:$Sd, SPR:$Sm)]> {
// Some single precision VFP instructions may be executed on both NEON and
// VFP pipelines on A8.
let D = VFPNeonA8Domain;
@@ -562,7 +589,7 @@ def VCMPES : ASuI<0b11101, 0b11, 0b0100, 0b11, 0,
def VCMPEH : AHuI<0b11101, 0b11, 0b0100, 0b11, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>;
+ [(arm_cmpfpe (f16 HPR:$Sd), (f16 HPR:$Sm))]>;
def VCMPD : ADuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins DPR:$Dd, DPR:$Dm),
@@ -581,7 +608,7 @@ def VCMPS : ASuI<0b11101, 0b11, 0b0100, 0b01, 0,
def VCMPH : AHuI<0b11101, 0b11, 0b0100, 0b01, 0,
(outs), (ins HPR:$Sd, HPR:$Sm),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, $Sm",
- [(arm_cmpfp HPR:$Sd, HPR:$Sm)]>;
+ [(arm_cmpfp (f16 HPR:$Sd), (f16 HPR:$Sm))]>;
} // Defs = [FPSCR_NZCV]
//===----------------------------------------------------------------------===//
@@ -605,13 +632,13 @@ def VABSS : ASuIn<0b11101, 0b11, 0b0000, 0b11, 0,
def VABSH : AHuI<0b11101, 0b11, 0b0000, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vabs", ".f16\t$Sd, $Sm",
- [(set HPR:$Sd, (fabs (f16 HPR:$Sm)))]>;
+ [(set (f16 HPR:$Sd), (fabs (f16 HPR:$Sm)))]>;
let Defs = [FPSCR_NZCV] in {
def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins DPR:$Dd),
IIC_fpCMP64, "vcmpe", ".f64\t$Dd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfpe0 (f64 DPR:$Dd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -619,7 +646,7 @@ def VCMPEZD : ADuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins SPR:$Sd),
IIC_fpCMP32, "vcmpe", ".f32\t$Sd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfpe0 SPR:$Sd)]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
@@ -631,7 +658,7 @@ def VCMPEZS : ASuI<0b11101, 0b11, 0b0101, 0b11, 0,
def VCMPEZH : AHuI<0b11101, 0b11, 0b0101, 0b11, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmpe", ".f16\t$Sd, #0",
- [/* For disassembly only; pattern left blank */]> {
+ [(arm_cmpfpe0 (f16 HPR:$Sd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -659,7 +686,7 @@ def VCMPZS : ASuI<0b11101, 0b11, 0b0101, 0b01, 0,
def VCMPZH : AHuI<0b11101, 0b11, 0b0101, 0b01, 0,
(outs), (ins HPR:$Sd),
IIC_fpCMP16, "vcmp", ".f16\t$Sd, #0",
- [(arm_cmpfp0 HPR:$Sd)]> {
+ [(arm_cmpfp0 (f16 HPR:$Sd))]> {
let Inst{3-0} = 0b0000;
let Inst{5} = 0;
}
@@ -681,6 +708,7 @@ def VCVTDS : ASuI<0b11101, 0b11, 0b0111, 0b11, 0,
let Inst{22} = Dd{4};
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
// Special case encoding: bits 11-8 is 0b1011.
@@ -705,20 +733,23 @@ def VCVTSD : VFPAI<(outs SPR:$Sd), (ins DPR:$Dm), VFPUnaryFrm,
let Inst{4} = 0;
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
// Between half, single and double-precision.
+let hasSideEffects = 0 in
def VCVTBHS: ASuI<0b11101, 0b11, 0b0010, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtb", ".f32.f16\t$Sd, $Sm",
[/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
-def : FP16Pat<(f32 (fpextend HPR:$Sm)),
- (VCVTBHS (COPY_TO_REGCLASS HPR:$Sm, SPR))>;
+def : FP16Pat<(f32 (fpextend (f16 HPR:$Sm))),
+ (VCVTBHS (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>;
def : FP16Pat<(f16_to_fp GPR:$a),
(VCVTBHS (COPY_TO_REGCLASS GPR:$a, SPR))>;
+let hasSideEffects = 0 in
def VCVTBSH: ASuI<0b11101, 0b11, 0b0011, 0b01, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtb", ".f16.f32\t$Sd, $Sm",
[/* Intentionally left blank, see patterns below */]>,
@@ -729,19 +760,41 @@ def : FP16Pat<(f16 (fpround SPR:$Sm)),
(COPY_TO_REGCLASS (VCVTBSH SPR:$Sm), HPR)>;
def : FP16Pat<(fp_to_f16 SPR:$a),
(i32 (COPY_TO_REGCLASS (VCVTBSH SPR:$a), GPR))>;
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+ (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTBSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_even:$lane),
+ (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTBSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+let hasSideEffects = 0 in
def VCVTTHS: ASuI<0b11101, 0b11, 0b0010, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTSH, "vcvtt", ".f32.f16\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FP16Pat<(f32 (fpextend (extractelt (v8f16 MQPR:$src), imm_odd:$lane))),
+ (VCVTTHS (EXTRACT_SUBREG MQPR:$src, (SSubReg_f16_reg imm_odd:$lane)))>;
+def : FP16Pat<(f32 (fpextend (extractelt (v4f16 DPR:$src), imm_odd:$lane))),
+ (VCVTTHS (EXTRACT_SUBREG
+ (v2f32 (COPY_TO_REGCLASS (v4f16 DPR:$src), DPR_VFP2)),
+ (SSubReg_f16_reg imm_odd:$lane)))>;
+
+let hasSideEffects = 0 in
def VCVTTSH: ASuI<0b11101, 0b11, 0b0011, 0b11, 0, (outs SPR:$Sd), (ins SPR:$Sm),
/* FIXME */ IIC_fpCVTHS, "vcvtt", ".f16.f32\t$Sd, $Sm",
- [/* For disassembly only; pattern left blank */]>,
+ [/* Intentionally left blank, see patterns below */]>,
Requires<[HasFP16]>,
Sched<[WriteFPCVT]>;
+def : FP16Pat<(insertelt (v8f16 MQPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+ (v8f16 (INSERT_SUBREG (v8f16 MQPR:$src1), (VCVTTSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+def : FP16Pat<(insertelt (v4f16 DPR:$src1), (f16 (fpround (f32 SPR:$src2))), imm_odd:$lane),
+ (v4f16 (INSERT_SUBREG (v4f16 DPR:$src1), (VCVTTSH SPR:$src2),
+ (SSubReg_f16_reg imm:$lane)))>;
+
def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
(outs DPR:$Dd), (ins SPR:$Sm),
NoItinerary, "vcvtb", ".f64.f16\t$Dd, $Sm",
@@ -754,10 +807,12 @@ def VCVTBHD : ADuI<0b11101, 0b11, 0b0010, 0b01, 0,
// Encode instruction operands.
let Inst{3-0} = Sm{4-1};
let Inst{5} = Sm{0};
+
+ let hasSideEffects = 0;
}
-def : FullFP16Pat<(f64 (fpextend HPR:$Sm)),
- (VCVTBHD (COPY_TO_REGCLASS HPR:$Sm, SPR))>,
+def : FullFP16Pat<(f64 (fpextend (f16 HPR:$Sm))),
+ (VCVTBHD (COPY_TO_REGCLASS (f16 HPR:$Sm), SPR))>,
Requires<[HasFPARMv8, HasDPVFP]>;
def : FP16Pat<(f64 (f16_to_fp GPR:$a)),
(VCVTBHD (COPY_TO_REGCLASS GPR:$a, SPR))>,
@@ -777,6 +832,8 @@ def VCVTBDH : ADuI<0b11101, 0b11, 0b0011, 0b01, 0,
let Inst{5} = Dm{4};
let Inst{15-12} = Sd{4-1};
let Inst{22} = Sd{0};
+
+ let hasSideEffects = 0;
}
def : FullFP16Pat<(f16 (fpround DPR:$Dm)),
@@ -796,6 +853,8 @@ def VCVTTHD : ADuI<0b11101, 0b11, 0b0010, 0b11, 0,
// Encode instruction operands.
let Inst{3-0} = Sm{4-1};
let Inst{5} = Sm{0};
+
+ let hasSideEffects = 0;
}
def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
@@ -811,11 +870,13 @@ def VCVTTDH : ADuI<0b11101, 0b11, 0b0011, 0b11, 0,
let Inst{22} = Sd{0};
let Inst{3-0} = Dm{3-0};
let Inst{5} = Dm{4};
+
+ let hasSideEffects = 0;
}
multiclass vcvt_inst<string opc, bits<2> rm,
SDPatternOperator node = null_frag> {
- let PostEncoderMethod = "", DecoderNamespace = "VFPV8" in {
+ let PostEncoderMethod = "", DecoderNamespace = "VFPV8", hasSideEffects = 0 in {
def SH : AHuInp<0b11101, 0b11, 0b1100, 0b11, 0,
(outs SPR:$Sd), (ins HPR:$Sm),
NoItinerary, !strconcat("vcvt", opc, ".s32.f16\t$Sd, $Sm"),
@@ -881,14 +942,14 @@ multiclass vcvt_inst<string opc, bits<2> rm,
let Predicates = [HasFPARMv8] in {
let Predicates = [HasFullFP16] in {
- def : Pat<(i32 (fp_to_sint (node HPR:$a))),
+ def : Pat<(i32 (fp_to_sint (node (f16 HPR:$a)))),
(COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"SH") HPR:$a),
+ (!cast<Instruction>(NAME#"SH") (f16 HPR:$a)),
GPR)>;
- def : Pat<(i32 (fp_to_uint (node HPR:$a))),
+ def : Pat<(i32 (fp_to_uint (node (f16 HPR:$a)))),
(COPY_TO_REGCLASS
- (!cast<Instruction>(NAME#"UH") HPR:$a),
+ (!cast<Instruction>(NAME#"UH") (f16 HPR:$a)),
GPR)>;
}
def : Pat<(i32 (fp_to_sint (node SPR:$a))),
@@ -934,7 +995,7 @@ def VNEGS : ASuIn<0b11101, 0b11, 0b0001, 0b01, 0,
def VNEGH : AHuI<0b11101, 0b11, 0b0001, 0b01, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpUNA16, "vneg", ".f16\t$Sd, $Sm",
- [(set HPR:$Sd, (fneg HPR:$Sm))]>;
+ [(set (f16 HPR:$Sd), (fneg (f16 HPR:$Sm)))]>;
multiclass vrint_inst_zrx<string opc, bit op, bit op2, SDPatternOperator node> {
def H : AHuI<0b11101, 0b11, 0b0110, 0b11, 0,
@@ -1033,7 +1094,7 @@ def VSQRTS : ASuI<0b11101, 0b11, 0b0001, 0b11, 0,
def VSQRTH : AHuI<0b11101, 0b11, 0b0001, 0b11, 0,
(outs HPR:$Sd), (ins HPR:$Sm),
IIC_fpSQRT16, "vsqrt", ".f16\t$Sd, $Sm",
- [(set HPR:$Sd, (fsqrt (f16 HPR:$Sm)))]>;
+ [(set (f16 HPR:$Sd), (fsqrt (f16 HPR:$Sm)))]>;
let hasSideEffects = 0 in {
let isMoveReg = 1 in {
@@ -1248,7 +1309,7 @@ def VMOVSRR : AVConv5I<0b11000100, 0b1010,
def VMOVRH : AVConv2I<0b11100001, 0b1001,
(outs rGPR:$Rt), (ins HPR:$Sn),
IIC_fpMOVSI, "vmov", ".f16\t$Rt, $Sn",
- [(set rGPR:$Rt, (arm_vmovrh HPR:$Sn))]>,
+ []>,
Requires<[HasFPRegs16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1270,7 +1331,7 @@ def VMOVRH : AVConv2I<0b11100001, 0b1001,
def VMOVHR : AVConv4I<0b11100000, 0b1001,
(outs HPR:$Sn), (ins rGPR:$Rt),
IIC_fpMOVIS, "vmov", ".f16\t$Sn, $Rt",
- [(set HPR:$Sn, (arm_vmovhr rGPR:$Rt))]>,
+ []>,
Requires<[HasFPRegs16]>,
Sched<[WriteFPMOV]> {
// Instruction operands.
@@ -1288,6 +1349,11 @@ def VMOVHR : AVConv4I<0b11100000, 0b1001,
let isUnpredicable = 1;
}
+def : FPRegs16Pat<(arm_vmovrh (f16 HPR:$Sn)), (VMOVRH (f16 HPR:$Sn))>;
+def : FPRegs16Pat<(arm_vmovrh (bf16 HPR:$Sn)), (VMOVRH (bf16 HPR:$Sn))>;
+def : FPRegs16Pat<(f16 (arm_vmovhr rGPR:$Rt)), (VMOVHR rGPR:$Rt)>;
+def : FPRegs16Pat<(bf16 (arm_vmovhr rGPR:$Rt)), (VMOVHR rGPR:$Rt)>;
+
// FMRDH: SPR -> GPR
// FMRDL: SPR -> GPR
// FMRRS: SPR -> GPR
@@ -1315,6 +1381,7 @@ class AVConv1IDs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Dd{4};
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1331,6 +1398,8 @@ class AVConv1InSs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{5} = Sm{0};
let Inst{15-12} = Sd{4-1};
let Inst{22} = Sd{0};
+
+ let hasSideEffects = 0;
}
class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1350,6 +1419,7 @@ class AVConv1IHs_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
let Predicates = [HasFullFP16];
+ let hasSideEffects = 0;
}
def VSITOD : AVConv1IDs_Encode<0b11101, 0b11, 0b1000, 0b1011,
@@ -1463,6 +1533,7 @@ class AVConv1IsD_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
let Predicates = [HasVFP2, HasDPVFP];
+ let hasSideEffects = 0;
}
class AVConv1InsS_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
@@ -1499,6 +1570,7 @@ class AVConv1IsH_Encode<bits<5> opcod1, bits<2> opcod2, bits<4> opcod3,
let Inst{22} = Sd{0};
let Predicates = [HasFullFP16];
+ let hasSideEffects = 0;
}
// Always set Z bit in the instruction, i.e. "round towards zero" variants.
@@ -1546,8 +1618,8 @@ def VTOSIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1101, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(i32 (fp_to_sint HPR:$a)),
- (COPY_TO_REGCLASS (VTOSIZH HPR:$a), GPR)>;
+def : VFPNoNEONPat<(i32 (fp_to_sint (f16 HPR:$a))),
+ (COPY_TO_REGCLASS (VTOSIZH (f16 HPR:$a)), GPR)>;
def VTOUIZD : AVConv1IsD_Encode<0b11101, 0b11, 0b1100, 0b1011,
(outs SPR:$Sd), (ins DPR:$Dm),
@@ -1593,8 +1665,8 @@ def VTOUIZH : AVConv1IsH_Encode<0b11101, 0b11, 0b1100, 0b1001,
let isUnpredicable = 1;
}
-def : VFPNoNEONPat<(i32 (fp_to_uint HPR:$a)),
- (COPY_TO_REGCLASS (VTOUIZH HPR:$a), GPR)>;
+def : VFPNoNEONPat<(i32 (fp_to_uint (f16 HPR:$a))),
+ (COPY_TO_REGCLASS (VTOUIZH (f16 HPR:$a)), GPR)>;
// And the Z bit '0' variants, i.e. use the rounding mode specified by FPSCR.
let Uses = [FPSCR] in {
@@ -1678,6 +1750,8 @@ class AVConv1XInsS_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
// if dp_operation then UInt(D:Vd) else UInt(Vd:D);
let Inst{22} = dst{0};
let Inst{15-12} = dst{4-1};
+
+ let hasSideEffects = 0;
}
// Double Precision register
@@ -1690,6 +1764,7 @@ class AVConv1XInsD_Encode<bits<5> op1, bits<2> op2, bits<4> op3, bits<4> op4,
let Inst{22} = dst{4};
let Inst{15-12} = dst{3-0};
+ let hasSideEffects = 0;
let Predicates = [HasVFP2, HasDPVFP];
}
@@ -1865,6 +1940,37 @@ def VULTOD : AVConv1XInsD_Encode<0b11101, 0b11, 0b1011, 0b1011, 1,
} // End of 'let Constraints = "$a = $dst" in'
+// BFloat16 - Single precision, unary, predicated
+class BF16_VCVT<string opc, bits<2> op7_6>
+ : VFPAI<(outs SPR:$Sd), (ins SPR:$dst, SPR:$Sm),
+ VFPUnaryFrm, NoItinerary,
+ opc, ".bf16.f32\t$Sd, $Sm", []>,
+ RegConstraint<"$dst = $Sd">,
+ Requires<[HasBF16]>,
+ Sched<[]> {
+ bits<5> Sd;
+ bits<5> Sm;
+
+ // Encode instruction operands.
+ let Inst{3-0} = Sm{4-1};
+ let Inst{5} = Sm{0};
+ let Inst{15-12} = Sd{4-1};
+ let Inst{22} = Sd{0};
+
+ let Inst{27-23} = 0b11101; // opcode1
+ let Inst{21-20} = 0b11; // opcode2
+ let Inst{19-16} = 0b0011; // opcode3
+ let Inst{11-8} = 0b1001;
+ let Inst{7-6} = op7_6;
+ let Inst{4} = 0;
+
+ let DecoderNamespace = "VFPV8";
+ let hasSideEffects = 0;
+}
+
+def BF16_VCVTB : BF16_VCVT<"vcvtb", 0b01>;
+def BF16_VCVTT : BF16_VCVT<"vcvtt", 0b11>;
+
//===----------------------------------------------------------------------===//
// FP Multiply-Accumulate Operations.
//
@@ -1894,8 +2000,8 @@ def VMLAS : ASbIn<0b11100, 0b00, 0, 0,
def VMLAH : AHbI<0b11100, 0b00, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmla", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -1905,8 +2011,8 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP, UseFPVMLx]>;
-def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP, UseFPVMLx]>;
@@ -1935,8 +2041,8 @@ def VMLSS : ASbIn<0b11100, 0b00, 1, 0,
def VMLSH : AHbI<0b11100, 0b00, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vmls", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -1946,8 +2052,8 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
def VNMLAD : ADbI<0b11100, 0b01, 1, 0,
@@ -1975,8 +2081,8 @@ def VNMLAS : ASbI<0b11100, 0b01, 1, 0,
def VNMLAH : AHbI<0b11100, 0b01, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmla", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -1987,8 +2093,8 @@ def : Pat<(fsub_mlx (fneg (fmul_su DPR:$a, (f64 DPR:$b))), DPR:$dstin),
def : Pat<(fsub_mlx (fneg (fmul_su SPR:$a, SPR:$b)), SPR:$dstin),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx (fneg (fmul_su HPR:$a, HPR:$b)), HPR:$dstin),
- (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx (fneg (fmul_su (f16 HPR:$a), HPR:$b)), HPR:$dstin),
+ (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
// (-dst - (a * b)) -> -(dst + (a * b))
@@ -1998,8 +2104,8 @@ def : Pat<(fsub_mlx (fneg DPR:$dstin), (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx (fneg SPR:$dstin), (fmul_su SPR:$a, SPR:$b)),
(VNMLAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su HPR:$a, HPR:$b)),
- (VNMLAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx (fneg HPR:$dstin), (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VNMLAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
def VNMLSD : ADbI<0b11100, 0b01, 0, 0,
@@ -2026,7 +2132,7 @@ def VNMLSS : ASbI<0b11100, 0b01, 0, 0,
def VNMLSH : AHbI<0b11100, 0b01, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpMAC16, "vnmls", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFPVMLx]>;
@@ -2036,8 +2142,8 @@ def : Pat<(fsub_mlx (fmul_su DPR:$a, (f64 DPR:$b)), DPR:$dstin),
def : Pat<(fsub_mlx (fmul_su SPR:$a, SPR:$b), SPR:$dstin),
(VNMLSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP2,DontUseNEONForFP,UseFPVMLx]>;
-def : Pat<(fsub_mlx (fmul_su HPR:$a, HPR:$b), HPR:$dstin),
- (VNMLSH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx (fmul_su (f16 HPR:$a), HPR:$b), HPR:$dstin),
+ (VNMLSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFPVMLx]>;
//===----------------------------------------------------------------------===//
@@ -2067,8 +2173,8 @@ def VFMAS : ASbIn<0b11101, 0b10, 0, 0,
def VFMAH : AHbI<0b11101, 0b10, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfma", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fmul_su HPR:$Sn, HPR:$Sm),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2079,8 +2185,8 @@ def : Pat<(fadd_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fadd_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMAS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
-def : Pat<(fadd_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VFMAH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fadd_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VFMAH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
@@ -2091,8 +2197,8 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, DPR:$Ddin)),
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, SPR:$Sdin)),
(VFMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, HPR:$Sdin)),
- (VFMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (f16 HPR:$Sdin))),
+ (VFMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
def VFMSD : ADbI<0b11101, 0b10, 1, 0,
@@ -2119,8 +2225,8 @@ def VFMSS : ASbIn<0b11101, 0b10, 1, 0,
def VFMSH : AHbI<0b11101, 0b10, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfms", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fadd_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fadd_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2131,8 +2237,8 @@ def : Pat<(fsub_mlx DPR:$dstin, (fmul_su DPR:$a, (f64 DPR:$b))),
def : Pat<(fsub_mlx SPR:$dstin, (fmul_su SPR:$a, SPR:$b)),
(VFMSS SPR:$dstin, SPR:$a, SPR:$b)>,
Requires<[HasVFP4,DontUseNEONForFP,UseFusedMAC]>;
-def : Pat<(fsub_mlx HPR:$dstin, (fmul_su HPR:$a, HPR:$b)),
- (VFMSH HPR:$dstin, HPR:$a, HPR:$b)>,
+def : Pat<(fsub_mlx HPR:$dstin, (fmul_su (f16 HPR:$a), HPR:$b)),
+ (VFMSH HPR:$dstin, (f16 HPR:$a), HPR:$b)>,
Requires<[HasFullFP16,DontUseNEONForFP,UseFusedMAC]>;
// Match @llvm.fma.* intrinsics
@@ -2143,8 +2249,8 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin)),
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin)),
- (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
+ (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fma x, (fneg y), z) -> (vfms z, x, y)
def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
@@ -2153,8 +2259,8 @@ def : Pat<(f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin)),
def : Pat<(f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin)),
(VFMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin)),
- (VFMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin))),
+ (VFMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
def VFNMAD : ADbI<0b11101, 0b01, 1, 0,
@@ -2181,8 +2287,8 @@ def VFNMAS : ASbI<0b11101, 0b01, 1, 0,
def VFNMAH : AHbI<0b11101, 0b01, 1, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnma", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fneg (fmul_su HPR:$Sn, HPR:$Sm)),
- HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fneg (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm))),
+ (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2202,8 +2308,8 @@ def : Pat<(fneg (fma (f64 DPR:$Dn), (f64 DPR:$Dm), (f64 DPR:$Ddin))),
def : Pat<(fneg (fma (f32 SPR:$Sn), (f32 SPR:$Sm), (f32 SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 HPR:$Sdin))),
- (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(fneg (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (f16 (f16 HPR:$Sdin)))),
+ (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fma (fneg x), y, (fneg z)) -> (vfnma z, x, y)
def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
@@ -2212,8 +2318,8 @@ def : Pat<(f64 (fma (fneg DPR:$Dn), DPR:$Dm, (fneg DPR:$Ddin))),
def : Pat<(f32 (fma (fneg SPR:$Sn), SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMAS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma (fneg HPR:$Sn), HPR:$Sm, (fneg HPR:$Sdin))),
- (VFNMAH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+ (VFNMAH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
def VFNMSD : ADbI<0b11101, 0b01, 0, 0,
@@ -2239,7 +2345,7 @@ def VFNMSS : ASbI<0b11101, 0b01, 0, 0,
def VFNMSH : AHbI<0b11101, 0b01, 0, 0,
(outs HPR:$Sd), (ins HPR:$Sdin, HPR:$Sn, HPR:$Sm),
IIC_fpFMAC16, "vfnms", ".f16\t$Sd, $Sn, $Sm",
- [(set HPR:$Sd, (fsub_mlx (fmul_su HPR:$Sn, HPR:$Sm), HPR:$Sdin))]>,
+ [(set (f16 HPR:$Sd), (fsub_mlx (fmul_su (f16 HPR:$Sn), (f16 HPR:$Sm)), (f16 HPR:$Sdin)))]>,
RegConstraint<"$Sdin = $Sd">,
Requires<[HasFullFP16,UseFusedMAC]>,
Sched<[WriteFPMAC32, ReadFPMAC, ReadFPMUL, ReadFPMUL]>;
@@ -2260,8 +2366,8 @@ def : Pat<(f64 (fma DPR:$Dn, DPR:$Dm, (fneg DPR:$Ddin))),
def : Pat<(f32 (fma SPR:$Sn, SPR:$Sm, (fneg SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(f16 (fma HPR:$Sn, HPR:$Sm, (fneg HPR:$Sdin))),
- (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(f16 (fma (f16 HPR:$Sn), (f16 HPR:$Sm), (fneg (f16 HPR:$Sdin)))),
+ (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fneg (fma (fneg x), y, z)) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
@@ -2270,8 +2376,8 @@ def : Pat<(fneg (f64 (fma (fneg DPR:$Dn), DPR:$Dm, DPR:$Ddin))),
def : Pat<(fneg (f32 (fma (fneg SPR:$Sn), SPR:$Sm, SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma (fneg HPR:$Sn), HPR:$Sm, HPR:$Sdin))),
- (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(fneg (f16 (fma (fneg (f16 HPR:$Sn)), (f16 HPR:$Sm), (f16 HPR:$Sdin)))),
+ (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
// (fneg (fma x, (fneg y), z) -> (vfnms z, x, y)
def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
@@ -2280,8 +2386,8 @@ def : Pat<(fneg (f64 (fma DPR:$Dn, (fneg DPR:$Dm), DPR:$Ddin))),
def : Pat<(fneg (f32 (fma SPR:$Sn, (fneg SPR:$Sm), SPR:$Sdin))),
(VFNMSS SPR:$Sdin, SPR:$Sn, SPR:$Sm)>,
Requires<[HasVFP4]>;
-def : Pat<(fneg (f16 (fma HPR:$Sn, (fneg HPR:$Sm), HPR:$Sdin))),
- (VFNMSH HPR:$Sdin, HPR:$Sn, HPR:$Sm)>,
+def : Pat<(fneg (f16 (fma (f16 HPR:$Sn), (fneg (f16 HPR:$Sm)), (f16 HPR:$Sdin)))),
+ (VFNMSH (f16 HPR:$Sdin), (f16 HPR:$Sn), (f16 HPR:$Sm))>,
Requires<[HasFullFP16]>;
//===----------------------------------------------------------------------===//
@@ -2304,7 +2410,7 @@ def VMOVScc : PseudoInst<(outs SPR:$Sd), (ins SPR:$Sn, SPR:$Sm, cmovpred:$p),
def VMOVHcc : PseudoInst<(outs HPR:$Sd), (ins HPR:$Sn, HPR:$Sm, cmovpred:$p),
IIC_fpUNA16,
[(set (f16 HPR:$Sd),
- (ARMcmov HPR:$Sn, HPR:$Sm, cmovpred:$p))]>,
+ (ARMcmov (f16 HPR:$Sn), (f16 HPR:$Sm), cmovpred:$p))]>,
RegConstraint<"$Sd = $Sn">, Requires<[HasFPRegs]>;
} // hasSideEffects
@@ -2510,7 +2616,7 @@ def FCONSTS : VFPAI<(outs SPR:$Sd), (ins vfp_f32imm:$imm),
def FCONSTH : VFPAI<(outs HPR:$Sd), (ins vfp_f16imm:$imm),
VFPMiscFrm, IIC_fpUNA16,
"vmov", ".f16\t$Sd, $imm",
- [(set HPR:$Sd, vfp_f16imm:$imm)]>,
+ [(set (f16 HPR:$Sd), vfp_f16imm:$imm)]>,
Requires<[HasFullFP16]> {
bits<5> Sd;
bits<8> imm;
diff --git a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
index 67816bc2103f..c8a894fb11a8 100644
--- a/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
+++ b/llvm/lib/Target/ARM/ARMInstructionSelector.cpp
@@ -239,17 +239,17 @@ static bool selectMergeValues(MachineInstrBuilder &MIB,
// We only support G_MERGE_VALUES as a way to stick together two scalar GPRs
// into one DPR.
- Register VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB.getReg(0);
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 64 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::FPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- Register VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB.getReg(1);
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_MERGE_VALUES");
- Register VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB.getReg(2);
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 32 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::GPRRegBankID &&
@@ -271,17 +271,17 @@ static bool selectUnmergeValues(MachineInstrBuilder &MIB,
// We only support G_UNMERGE_VALUES as a way to break up one DPR into two
// GPRs.
- Register VReg0 = MIB->getOperand(0).getReg();
+ Register VReg0 = MIB.getReg(0);
(void)VReg0;
assert(MRI.getType(VReg0).getSizeInBits() == 32 &&
RBI.getRegBank(VReg0, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- Register VReg1 = MIB->getOperand(1).getReg();
+ Register VReg1 = MIB.getReg(1);
(void)VReg1;
assert(MRI.getType(VReg1).getSizeInBits() == 32 &&
RBI.getRegBank(VReg1, MRI, TRI)->getID() == ARM::GPRRegBankID &&
"Unsupported operand for G_UNMERGE_VALUES");
- Register VReg2 = MIB->getOperand(2).getReg();
+ Register VReg2 = MIB.getReg(2);
(void)VReg2;
assert(MRI.getType(VReg2).getSizeInBits() == 64 &&
RBI.getRegBank(VReg2, MRI, TRI)->getID() == ARM::FPRRegBankID &&
@@ -530,7 +530,7 @@ bool ARMInstructionSelector::selectCmp(CmpConstants Helper,
MachineRegisterInfo &MRI) const {
const InsertInfo I(MIB);
- auto ResReg = MIB->getOperand(0).getReg();
+ auto ResReg = MIB.getReg(0);
if (!validReg(MRI, ResReg, 1, ARM::GPRRegBankID))
return false;
@@ -542,8 +542,8 @@ bool ARMInstructionSelector::selectCmp(CmpConstants Helper,
return true;
}
- auto LHSReg = MIB->getOperand(2).getReg();
- auto RHSReg = MIB->getOperand(3).getReg();
+ auto LHSReg = MIB.getReg(2);
+ auto RHSReg = MIB.getReg(3);
if (!validOpRegPair(MRI, LHSReg, RHSReg, Helper.OperandSize,
Helper.OperandRegBankID))
return false;
@@ -627,7 +627,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
bool UseMovt = STI.useMovt();
unsigned Size = TM.getPointerSize(0);
- unsigned Alignment = 4;
+ const Align Alignment(4);
auto addOpsForConstantPoolLoad = [&MF, Alignment,
Size](MachineInstrBuilder &MIB,
@@ -687,7 +687,7 @@ bool ARMInstructionSelector::selectGlobal(MachineInstrBuilder &MIB,
if (Indirect) {
if (!UseOpcodeThatLoads) {
- auto ResultReg = MIB->getOperand(0).getReg();
+ auto ResultReg = MIB.getReg(0);
auto AddressReg = MRI.createVirtualRegister(&ARM::GPRRegClass);
MIB->getOperand(0).setReg(AddressReg);
@@ -773,7 +773,7 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
auto &DbgLoc = MIB->getDebugLoc();
// Compare the condition to 1.
- auto CondReg = MIB->getOperand(1).getReg();
+ auto CondReg = MIB.getReg(1);
assert(validReg(MRI, CondReg, 1, ARM::GPRRegBankID) &&
"Unsupported types for select operation");
auto CmpI = BuildMI(MBB, InsertBefore, DbgLoc, TII.get(Opcodes.TSTri))
@@ -785,9 +785,9 @@ bool ARMInstructionSelector::selectSelect(MachineInstrBuilder &MIB,
// Move a value into the result register based on the result of the
// comparison.
- auto ResReg = MIB->getOperand(0).getReg();
- auto TrueReg = MIB->getOperand(2).getReg();
- auto FalseReg = MIB->getOperand(3).getReg();
+ auto ResReg = MIB.getReg(0);
+ auto TrueReg = MIB.getReg(2);
+ auto FalseReg = MIB.getReg(3);
assert(validOpRegPair(MRI, ResReg, TrueReg, 32, ARM::GPRRegBankID) &&
validOpRegPair(MRI, TrueReg, FalseReg, 32, ARM::GPRRegBankID) &&
"Unsupported types for select operation");
@@ -990,7 +990,7 @@ bool ARMInstructionSelector::select(MachineInstr &I) {
case G_FCONSTANT: {
// Load from constant pool
unsigned Size = MRI.getType(I.getOperand(0).getReg()).getSizeInBits() / 8;
- unsigned Alignment = Size;
+ Align Alignment(Size);
assert((Size == 4 || Size == 8) && "Unsupported FP constant type");
auto LoadOpcode = Size == 4 ? ARM::VLDRS : ARM::VLDRD;
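A note on the repeated mechanical change in this file: MachineInstrBuilder provides a getReg(Idx) accessor that reads the register of operand Idx, so the two spellings below should be equivalent. A minimal sketch of that equivalence, assuming MIB wraps an instruction whose operand Idx is a register operand:

#include "llvm/CodeGen/MachineInstrBuilder.h"
using namespace llvm;

// Old spelling: go through the wrapped MachineInstr's operand list.
static Register regViaOperand(const MachineInstrBuilder &MIB, unsigned Idx) {
  return MIB->getOperand(Idx).getReg();
}
// New spelling: the convenience accessor used throughout this patch.
static Register regViaHelper(const MachineInstrBuilder &MIB, unsigned Idx) {
  return MIB.getReg(Idx);
}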
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
index e2dff51ea61c..f3657155f47e 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.cpp
@@ -357,13 +357,12 @@ ARMLegalizerInfo::getFCmpLibcalls(CmpInst::Predicate Predicate,
llvm_unreachable("Unsupported size for FCmp predicate");
}
-bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
+bool ARMLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
using namespace TargetOpcode;
- MIRBuilder.setInstr(MI);
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
LLVMContext &Ctx = MIRBuilder.getMF().getFunction().getContext();
switch (MI.getOpcode()) {
@@ -445,8 +444,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
} else {
// We need to compare against 0.
assert(CmpInst::isIntPredicate(ResultPred) && "Unsupported predicate");
- auto Zero = MRI.createGenericVirtualRegister(LLT::scalar(32));
- MIRBuilder.buildConstant(Zero, 0);
+ auto Zero = MIRBuilder.buildConstant(LLT::scalar(32), 0);
MIRBuilder.buildICmp(ResultPred, ProcessedResult, LibcallResult, Zero);
}
Results.push_back(ProcessedResult);
@@ -462,7 +460,7 @@ bool ARMLegalizerInfo::legalizeCustom(MachineInstr &MI,
// Convert to integer constants, while preserving the binary representation.
auto AsInteger =
MI.getOperand(1).getFPImm()->getValueAPF().bitcastToAPInt();
- MIRBuilder.buildConstant(MI.getOperand(0).getReg(),
+ MIRBuilder.buildConstant(MI.getOperand(0),
*ConstantInt::get(Ctx, AsInteger));
break;
}
diff --git a/llvm/lib/Target/ARM/ARMLegalizerInfo.h b/llvm/lib/Target/ARM/ARMLegalizerInfo.h
index e95f8cf76103..f1c2e9c94336 100644
--- a/llvm/lib/Target/ARM/ARMLegalizerInfo.h
+++ b/llvm/lib/Target/ARM/ARMLegalizerInfo.h
@@ -28,9 +28,7 @@ class ARMLegalizerInfo : public LegalizerInfo {
public:
ARMLegalizerInfo(const ARMSubtarget &ST);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
private:
void setFCmpLibcallsGNU();
diff --git a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
index 12dddd29ca84..a84d23d3bb96 100644
--- a/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
+++ b/llvm/lib/Target/ARM/ARMLoadStoreOptimizer.cpp
@@ -32,6 +32,7 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstr.h"
@@ -50,6 +51,7 @@
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Type.h"
+#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Pass.h"
#include "llvm/Support/Allocator.h"
@@ -900,7 +902,7 @@ MachineInstr *ARMLoadStoreOpt::MergeOpsUpdate(const MergeCandidate &Cand) {
unsigned Offset = getMemoryOpOffset(*First);
Register Base = getLoadStoreBaseOp(*First).getReg();
bool BaseKill = LatestMI->killsRegister(Base);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*First, PredReg);
DebugLoc DL = First->getDebugLoc();
MachineInstr *Merged = nullptr;
@@ -991,7 +993,7 @@ static bool mayCombineMisaligned(const TargetSubtargetInfo &STI,
// Stack pointer alignment is out of the programmers control so we can trust
// SP-relative loads/stores.
if (getLoadStoreBaseOp(MI).getReg() == ARM::SP &&
- STI.getFrameLowering()->getTransientStackAlignment() >= 4)
+ STI.getFrameLowering()->getTransientStackAlign() >= Align(4))
return true;
return false;
}
@@ -1183,8 +1185,8 @@ static unsigned getUpdatingLSMultipleOpcode(unsigned Opc,
/// Check if the given instruction increments or decrements a register and
/// return the amount it is incremented/decremented. Returns 0 if the CPSR flags
/// generated by the instruction are possibly read as well.
-static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg) {
+static int isIncrementOrDecrement(const MachineInstr &MI, Register Reg,
+ ARMCC::CondCodes Pred, Register PredReg) {
bool CheckCPSRDef;
int Scale;
switch (MI.getOpcode()) {
@@ -1201,7 +1203,7 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
default: return 0;
}
- unsigned MIPredReg;
+ Register MIPredReg;
if (MI.getOperand(0).getReg() != Reg ||
MI.getOperand(1).getReg() != Reg ||
getInstrPredicate(MI, MIPredReg) != Pred ||
@@ -1215,8 +1217,8 @@ static int isIncrementOrDecrement(const MachineInstr &MI, unsigned Reg,
/// Searches for an increment or decrement of \p Reg before \p MBBI.
static MachineBasicBlock::iterator
-findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+findIncDecBefore(MachineBasicBlock::iterator MBBI, Register Reg,
+ ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator BeginMBBI = MBB.begin();
@@ -1235,8 +1237,8 @@ findIncDecBefore(MachineBasicBlock::iterator MBBI, unsigned Reg,
/// Searches for a increment or decrement of \p Reg after \p MBBI.
static MachineBasicBlock::iterator
-findIncDecAfter(MachineBasicBlock::iterator MBBI, unsigned Reg,
- ARMCC::CondCodes Pred, unsigned PredReg, int &Offset) {
+findIncDecAfter(MachineBasicBlock::iterator MBBI, Register Reg,
+ ARMCC::CondCodes Pred, Register PredReg, int &Offset) {
Offset = 0;
MachineBasicBlock &MBB = *MBBI->getParent();
MachineBasicBlock::iterator EndMBBI = MBB.end();
@@ -1270,7 +1272,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSMultiple(MachineInstr *MI) {
const MachineOperand &BaseOP = MI->getOperand(0);
Register Base = BaseOP.getReg();
bool BaseKill = BaseOP.isKill();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
unsigned Opcode = MI->getOpcode();
DebugLoc DL = MI->getDebugLoc();
@@ -1383,6 +1385,38 @@ static unsigned getPostIndexedLoadStoreOpcode(unsigned Opc,
case ARM::t2STRi8:
case ARM::t2STRi12:
return ARM::t2STR_POST;
+
+ case ARM::MVE_VLDRBS16:
+ return ARM::MVE_VLDRBS16_post;
+ case ARM::MVE_VLDRBS32:
+ return ARM::MVE_VLDRBS32_post;
+ case ARM::MVE_VLDRBU16:
+ return ARM::MVE_VLDRBU16_post;
+ case ARM::MVE_VLDRBU32:
+ return ARM::MVE_VLDRBU32_post;
+ case ARM::MVE_VLDRHS32:
+ return ARM::MVE_VLDRHS32_post;
+ case ARM::MVE_VLDRHU32:
+ return ARM::MVE_VLDRHU32_post;
+ case ARM::MVE_VLDRBU8:
+ return ARM::MVE_VLDRBU8_post;
+ case ARM::MVE_VLDRHU16:
+ return ARM::MVE_VLDRHU16_post;
+ case ARM::MVE_VLDRWU32:
+ return ARM::MVE_VLDRWU32_post;
+ case ARM::MVE_VSTRB16:
+ return ARM::MVE_VSTRB16_post;
+ case ARM::MVE_VSTRB32:
+ return ARM::MVE_VSTRB32_post;
+ case ARM::MVE_VSTRH32:
+ return ARM::MVE_VSTRH32_post;
+ case ARM::MVE_VSTRBU8:
+ return ARM::MVE_VSTRBU8_post;
+ case ARM::MVE_VSTRHU16:
+ return ARM::MVE_VSTRHU16_post;
+ case ARM::MVE_VSTRWU32:
+ return ARM::MVE_VSTRWU32_post;
+
default: llvm_unreachable("Unhandled opcode!");
}
}
@@ -1412,7 +1446,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLoadStore(MachineInstr *MI) {
if (MI->getOperand(0).getReg() == Base)
return false;
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
int Bytes = getLSMultipleTransferSize(MI);
MachineBasicBlock &MBB = *MI->getParent();
@@ -1525,7 +1559,7 @@ bool ARMLoadStoreOpt::MergeBaseUpdateLSDouble(MachineInstr &MI) const {
if (Reg0Op.getReg() == Base || Reg1Op.getReg() == Base)
return false;
- unsigned PredReg;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(MI, PredReg);
MachineBasicBlock::iterator MBBI(MI);
MachineBasicBlock &MBB = *MI.getParent();
@@ -1602,13 +1636,13 @@ static bool isMemoryOp(const MachineInstr &MI) {
// Don't touch volatile memory accesses - we may be changing their order.
// TODO: We could allow unordered and monotonic atomics here, but we need to
- // make sure the resulting ldm/stm is correctly marked as atomic.
+ // make sure the resulting ldm/stm is correctly marked as atomic.
if (MMO.isVolatile() || MMO.isAtomic())
return false;
// Unaligned ldr/str is emulated by some kernels, but unaligned ldm/stm is
// not.
- if (MMO.getAlignment() < 4)
+ if (MMO.getAlign() < Align(4))
return false;
// str <undef> could probably be eliminated entirely, but for now we just want
@@ -1692,7 +1726,7 @@ bool ARMLoadStoreOpt::FixInvalidRegPairOp(MachineBasicBlock &MBB,
assert((isT2 || MI->getOperand(3).getReg() == ARM::NoRegister) &&
"register offset not handled below");
int OffImm = getMemoryOpOffset(*MI);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
if (OddRegNum > EvenRegNum && OffImm == 0) {
@@ -1792,7 +1826,7 @@ bool ARMLoadStoreOpt::LoadStoreMultipleOpti(MachineBasicBlock &MBB) {
const MachineOperand &MO = MBBI->getOperand(0);
Register Reg = MO.getReg();
Register Base = getLoadStoreBaseOp(*MBBI).getReg();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MBBI, PredReg);
int Offset = getMemoryOpOffset(*MBBI);
if (CurrBase == 0) {
@@ -2046,6 +2080,7 @@ namespace {
const TargetRegisterInfo *TRI;
const ARMSubtarget *STI;
MachineRegisterInfo *MRI;
+ MachineDominatorTree *DT;
MachineFunction *MF;
ARMPreAllocLoadStoreOpt() : MachineFunctionPass(ID) {}
@@ -2058,29 +2093,34 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addPreserved<MachineDominatorTree>();
MachineFunctionPass::getAnalysisUsage(AU);
}
private:
bool CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl,
- unsigned &NewOpc, unsigned &EvenReg,
- unsigned &OddReg, unsigned &BaseReg,
- int &Offset,
- unsigned &PredReg, ARMCC::CondCodes &Pred,
- bool &isT2);
+ unsigned &NewOpc, Register &EvenReg, Register &OddReg,
+ Register &BaseReg, int &Offset, Register &PredReg,
+ ARMCC::CondCodes &Pred, bool &isT2);
bool RescheduleOps(MachineBasicBlock *MBB,
SmallVectorImpl<MachineInstr *> &Ops,
unsigned Base, bool isLd,
DenseMap<MachineInstr*, unsigned> &MI2LocMap);
bool RescheduleLoadStoreInstrs(MachineBasicBlock *MBB);
+ bool DistributeIncrements();
+ bool DistributeIncrements(Register Base);
};
} // end anonymous namespace
char ARMPreAllocLoadStoreOpt::ID = 0;
-INITIALIZE_PASS(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
- ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_BEGIN(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+ ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_END(ARMPreAllocLoadStoreOpt, "arm-prera-ldst-opt",
+ ARM_PREALLOC_LOAD_STORE_OPT_NAME, false, false)
// Limit the number of instructions to be rescheduled.
// FIXME: tune this limit, and/or come up with some better heuristics.
@@ -2096,10 +2136,11 @@ bool ARMPreAllocLoadStoreOpt::runOnMachineFunction(MachineFunction &Fn) {
TII = STI->getInstrInfo();
TRI = STI->getRegisterInfo();
MRI = &Fn.getRegInfo();
+ DT = &getAnalysis<MachineDominatorTree>();
MF = &Fn;
AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- bool Modified = false;
+ bool Modified = DistributeIncrements();
for (MachineBasicBlock &MFI : Fn)
Modified |= RescheduleLoadStoreInstrs(&MFI);
@@ -2143,15 +2184,10 @@ static bool IsSafeAndProfitableToMove(bool isLd, unsigned Base,
return AddedRegPressure.size() <= MemRegs.size() * 2;
}
-bool
-ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
- DebugLoc &dl, unsigned &NewOpc,
- unsigned &FirstReg,
- unsigned &SecondReg,
- unsigned &BaseReg, int &Offset,
- unsigned &PredReg,
- ARMCC::CondCodes &Pred,
- bool &isT2) {
+bool ARMPreAllocLoadStoreOpt::CanFormLdStDWord(
+ MachineInstr *Op0, MachineInstr *Op1, DebugLoc &dl, unsigned &NewOpc,
+ Register &FirstReg, Register &SecondReg, Register &BaseReg, int &Offset,
+ Register &PredReg, ARMCC::CondCodes &Pred, bool &isT2) {
// Make sure we're allowed to generate LDRD/STRD.
if (!STI->hasV5TEOps())
return false;
@@ -2183,12 +2219,12 @@ ARMPreAllocLoadStoreOpt::CanFormLdStDWord(MachineInstr *Op0, MachineInstr *Op1,
(*Op0->memoperands_begin())->isAtomic())
return false;
- unsigned Align = (*Op0->memoperands_begin())->getAlignment();
+ Align Alignment = (*Op0->memoperands_begin())->getAlign();
const Function &Func = MF->getFunction();
- unsigned ReqAlign = STI->hasV6Ops()
- ? TD->getABITypeAlignment(Type::getInt64Ty(Func.getContext()))
- : 8; // Pre-v6 need 8-byte align
- if (Align < ReqAlign)
+ Align ReqAlign =
+ STI->hasV6Ops() ? TD->getABITypeAlign(Type::getInt64Ty(Func.getContext()))
+ : Align(8); // Pre-v6 need 8-byte align
+ if (Alignment < ReqAlign)
return false;
// Then make sure the immediate offset fits.
@@ -2313,8 +2349,8 @@ bool ARMPreAllocLoadStoreOpt::RescheduleOps(MachineBasicBlock *MBB,
// to try to allocate a pair of registers that can form register pairs.
MachineInstr *Op0 = Ops.back();
MachineInstr *Op1 = Ops[Ops.size()-2];
- unsigned FirstReg = 0, SecondReg = 0;
- unsigned BaseReg = 0, PredReg = 0;
+ Register FirstReg, SecondReg;
+ Register BaseReg, PredReg;
ARMCC::CondCodes Pred = ARMCC::AL;
bool isT2 = false;
unsigned NewOpc = 0;
@@ -2416,7 +2452,7 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
if (!isMemoryOp(MI))
continue;
- unsigned PredReg = 0;
+ Register PredReg;
if (getInstrPredicate(MI, PredReg) != ARMCC::AL)
continue;
@@ -2482,6 +2518,199 @@ ARMPreAllocLoadStoreOpt::RescheduleLoadStoreInstrs(MachineBasicBlock *MBB) {
return RetVal;
}
+// Get the Base register operand index from the memory access MachineInstr if we
+// should attempt to distribute postinc on it. Return -1 if not of a valid
+// instruction type. If it returns an index, it is assumed that the instruction
+// uses an r+i indexing mode, and getBaseOperandIndex() + 1 is the Offset index.
+static int getBaseOperandIndex(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case ARM::MVE_VLDRBS16:
+ case ARM::MVE_VLDRBS32:
+ case ARM::MVE_VLDRBU16:
+ case ARM::MVE_VLDRBU32:
+ case ARM::MVE_VLDRHS32:
+ case ARM::MVE_VLDRHU32:
+ case ARM::MVE_VLDRBU8:
+ case ARM::MVE_VLDRHU16:
+ case ARM::MVE_VLDRWU32:
+ case ARM::MVE_VSTRB16:
+ case ARM::MVE_VSTRB32:
+ case ARM::MVE_VSTRH32:
+ case ARM::MVE_VSTRBU8:
+ case ARM::MVE_VSTRHU16:
+ case ARM::MVE_VSTRWU32:
+ return 1;
+ }
+ return -1;
+}
+
+static MachineInstr *createPostIncLoadStore(MachineInstr *MI, int Offset,
+ Register NewReg,
+ const TargetInstrInfo *TII,
+ const TargetRegisterInfo *TRI) {
+ MachineFunction *MF = MI->getMF();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+
+ unsigned NewOpcode = getPostIndexedLoadStoreOpcode(
+ MI->getOpcode(), Offset > 0 ? ARM_AM::add : ARM_AM::sub);
+
+ const MCInstrDesc &MCID = TII->get(NewOpcode);
+ // Constrain the def register class
+ const TargetRegisterClass *TRC = TII->getRegClass(MCID, 0, TRI, *MF);
+ MRI.constrainRegClass(NewReg, TRC);
+ // And do the same for the base operand
+ TRC = TII->getRegClass(MCID, 2, TRI, *MF);
+ MRI.constrainRegClass(MI->getOperand(1).getReg(), TRC);
+
+ return BuildMI(*MI->getParent(), MI, MI->getDebugLoc(), MCID)
+ .addReg(NewReg, RegState::Define)
+ .add(MI->getOperand(0))
+ .add(MI->getOperand(1))
+ .addImm(Offset)
+ .add(MI->getOperand(3))
+ .add(MI->getOperand(4))
+ .cloneMemRefs(*MI);
+}
+
+// Given a Base Register, optimise the load/store uses to attempt to create more
+// post-inc accesses. We do this by taking zero offset loads/stores with an add,
+// and converting them to a postinc load/store of the same type. Any subsequent
+// accesses will be adjusted to use and account for the post-inc value.
+// For example:
+// LDR #0 LDR_POSTINC #16
+// LDR #4 LDR #-12
+// LDR #8 LDR #-8
+// LDR #12 LDR #-4
+// ADD #16
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements(Register Base) {
+ // We are looking for:
+ // One zero offset load/store that can become postinc
+ MachineInstr *BaseAccess = nullptr;
+ // An increment that can be folded in
+ MachineInstr *Increment = nullptr;
+ // Other accesses after BaseAccess that will need to be updated to use the
+ // postinc value
+ SmallPtrSet<MachineInstr *, 8> OtherAccesses;
+ for (auto &Use : MRI->use_nodbg_instructions(Base)) {
+ if (!Increment && getAddSubImmediate(Use) != 0) {
+ Increment = &Use;
+ continue;
+ }
+
+ int BaseOp = getBaseOperandIndex(Use);
+ if (BaseOp == -1)
+ return false;
+
+ if (!Use.getOperand(BaseOp).isReg() ||
+ Use.getOperand(BaseOp).getReg() != Base)
+ return false;
+ if (Use.getOperand(BaseOp + 1).getImm() == 0)
+ BaseAccess = &Use;
+ else
+ OtherAccesses.insert(&Use);
+ }
+
+ if (!BaseAccess || !Increment ||
+ BaseAccess->getParent() != Increment->getParent())
+ return false;
+ Register PredReg;
+ if (Increment->definesRegister(ARM::CPSR) ||
+ getInstrPredicate(*Increment, PredReg) != ARMCC::AL)
+ return false;
+
+ LLVM_DEBUG(dbgs() << "\nAttempting to distribute increments on VirtualReg "
+ << Base.virtRegIndex() << "\n");
+
+ // Make sure that Increment has no uses before BaseAccess.
+ for (MachineInstr &Use :
+ MRI->use_nodbg_instructions(Increment->getOperand(0).getReg())) {
+ if (!DT->dominates(BaseAccess, &Use) || &Use == BaseAccess) {
+ LLVM_DEBUG(dbgs() << " BaseAccess doesn't dominate use of increment\n");
+ return false;
+ }
+ }
+
+ // Make sure that Increment can be folded into Base
+ int IncrementOffset = getAddSubImmediate(*Increment);
+ unsigned NewPostIncOpcode = getPostIndexedLoadStoreOpcode(
+ BaseAccess->getOpcode(), IncrementOffset > 0 ? ARM_AM::add : ARM_AM::sub);
+ if (!isLegalAddressImm(NewPostIncOpcode, IncrementOffset, TII)) {
+ LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on postinc\n");
+ return false;
+ }
+
+ // And make sure that the negative value of increment can be added to all
+ // other offsets after the BaseAccess. We rely on either
+ // dominates(BaseAccess, OtherAccess) or dominates(OtherAccess, BaseAccess)
+ // to keep things simple.
+ SmallPtrSet<MachineInstr *, 4> SuccessorAccesses;
+ for (auto *Use : OtherAccesses) {
+ if (DT->dominates(BaseAccess, Use)) {
+ SuccessorAccesses.insert(Use);
+ unsigned BaseOp = getBaseOperandIndex(*Use);
+ if (!isLegalAddressImm(
+ Use->getOpcode(),
+ Use->getOperand(BaseOp + 1).getImm() - IncrementOffset, TII)) {
+ LLVM_DEBUG(dbgs() << " Illegal addressing mode immediate on use\n");
+ return false;
+ }
+ } else if (!DT->dominates(Use, BaseAccess)) {
+ LLVM_DEBUG(
+ dbgs() << " Unknown dominance relation between Base and Use\n");
+ return false;
+ }
+ }
+
+ // Replace BaseAccess with a post inc
+ LLVM_DEBUG(dbgs() << "Changing: "; BaseAccess->dump());
+ LLVM_DEBUG(dbgs() << " And : "; Increment->dump());
+ Register NewBaseReg = Increment->getOperand(0).getReg();
+ MachineInstr *BaseAccessPost =
+ createPostIncLoadStore(BaseAccess, IncrementOffset, NewBaseReg, TII, TRI);
+ BaseAccess->eraseFromParent();
+ Increment->eraseFromParent();
+ (void)BaseAccessPost;
+ LLVM_DEBUG(dbgs() << " To : "; BaseAccessPost->dump());
+
+ for (auto *Use : SuccessorAccesses) {
+ LLVM_DEBUG(dbgs() << "Changing: "; Use->dump());
+ unsigned BaseOp = getBaseOperandIndex(*Use);
+ Use->getOperand(BaseOp).setReg(NewBaseReg);
+ int OldOffset = Use->getOperand(BaseOp + 1).getImm();
+ Use->getOperand(BaseOp + 1).setImm(OldOffset - IncrementOffset);
+ LLVM_DEBUG(dbgs() << " To : "; Use->dump());
+ }
+
+ // Remove the kill flag from all uses of NewBaseReg, in case any old uses
+ // remain.
+ for (MachineOperand &Op : MRI->use_nodbg_operands(NewBaseReg))
+ Op.setIsKill(false);
+ return true;
+}
+
+bool ARMPreAllocLoadStoreOpt::DistributeIncrements() {
+ bool Changed = false;
+ SmallSetVector<Register, 4> Visited;
+ for (auto &MBB : *MF) {
+ for (auto &MI : MBB) {
+ int BaseOp = getBaseOperandIndex(MI);
+ if (BaseOp == -1 || !MI.getOperand(BaseOp).isReg())
+ continue;
+
+ Register Base = MI.getOperand(BaseOp).getReg();
+ if (!Base.isVirtual() || Visited.count(Base))
+ continue;
+
+ Visited.insert(Base);
+ }
+ }
+
+ for (auto Base : Visited)
+ Changed |= DistributeIncrements(Base);
+
+ return Changed;
+}
+
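For readers following the arithmetic in DistributeIncrements: only the zero-offset access changes form (it becomes the post-inc and absorbs the increment), while every access it dominates is rebased by subtracting the increment, matching the LDR example in the comment above. A standalone sketch of that rebasing, not LLVM code:

#include <vector>

struct Access { int Offset; }; // immediate offset of an r+i load/store

// Rebase the offsets of accesses that follow the new post-inc access.
// With IncrementOffset = 16, offsets {4, 8, 12} become {-12, -8, -4}.
static void rebaseAfterPostInc(std::vector<Access> &Later, int IncrementOffset) {
  for (Access &A : Later)
    A.Offset -= IncrementOffset;
}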
/// Returns an instance of the load / store optimization pass.
FunctionPass *llvm::createARMLoadStoreOptimizationPass(bool PreAlloc) {
if (PreAlloc)
diff --git a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
index 6717d4706aef..be75d6bef08c 100644
--- a/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
+++ b/llvm/lib/Target/ARM/ARMLowOverheadLoops.cpp
@@ -35,6 +35,20 @@
/// are defined to be as large as this maximum sequence of replacement
/// instructions.
///
+/// A note on VPR.P0 (the lane mask):
+/// VPT, VCMP, VPNOT and VCTP won't overwrite VPR.P0 when they update it in a
+/// "VPT Active" context (which includes low-overhead loops and vpt blocks).
+/// They will simply "and" the result of their calculation with the current
+/// value of VPR.P0. You can think of it like this:
+/// \verbatim
+/// if VPT active: ; Between a DLSTP/LETP, or for predicated instrs
+/// VPR.P0 &= Value
+/// else
+/// VPR.P0 = Value
+/// \endverbatim
+/// When we're inside the low-overhead loop (between DLSTP and LETP), we always
+/// fall into the "VPT active" case, so we can consider that all VPR writes by
+/// one of those instructions are actually an "and".
//===----------------------------------------------------------------------===//
#include "ARM.h"
@@ -45,6 +59,7 @@
#include "Thumb2InstrInfo.h"
#include "llvm/ADT/SetOperations.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/CodeGen/LivePhysRegs.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/CodeGen/MachineLoopUtils.h"
@@ -60,34 +75,93 @@ using namespace llvm;
namespace {
+ using InstSet = SmallPtrSetImpl<MachineInstr *>;
+
+ class PostOrderLoopTraversal {
+ MachineLoop &ML;
+ MachineLoopInfo &MLI;
+ SmallPtrSet<MachineBasicBlock*, 4> Visited;
+ SmallVector<MachineBasicBlock*, 4> Order;
+
+ public:
+ PostOrderLoopTraversal(MachineLoop &ML, MachineLoopInfo &MLI)
+ : ML(ML), MLI(MLI) { }
+
+ const SmallVectorImpl<MachineBasicBlock*> &getOrder() const {
+ return Order;
+ }
+
+ // Visit all the blocks within the loop, as well as exit blocks and any
+ // blocks properly dominating the header.
+ void ProcessLoop() {
+ std::function<void(MachineBasicBlock*)> Search = [this, &Search]
+ (MachineBasicBlock *MBB) -> void {
+ if (Visited.count(MBB))
+ return;
+
+ Visited.insert(MBB);
+ for (auto *Succ : MBB->successors()) {
+ if (!ML.contains(Succ))
+ continue;
+ Search(Succ);
+ }
+ Order.push_back(MBB);
+ };
+
+ // Insert exit blocks.
+ SmallVector<MachineBasicBlock*, 2> ExitBlocks;
+ ML.getExitBlocks(ExitBlocks);
+ for (auto *MBB : ExitBlocks)
+ Order.push_back(MBB);
+
+ // Then add the loop body.
+ Search(ML.getHeader());
+
+ // Then try the preheader and its predecessors.
+ std::function<void(MachineBasicBlock*)> GetPredecessor =
+ [this, &GetPredecessor] (MachineBasicBlock *MBB) -> void {
+ Order.push_back(MBB);
+ if (MBB->pred_size() == 1)
+ GetPredecessor(*MBB->pred_begin());
+ };
+
+ if (auto *Preheader = ML.getLoopPreheader())
+ GetPredecessor(Preheader);
+ else if (auto *Preheader = MLI.findLoopPreheader(&ML, true))
+ GetPredecessor(Preheader);
+ }
+ };
+
struct PredicatedMI {
MachineInstr *MI = nullptr;
SetVector<MachineInstr*> Predicates;
public:
- PredicatedMI(MachineInstr *I, SetVector<MachineInstr*> &Preds) :
- MI(I) {
+ PredicatedMI(MachineInstr *I, SetVector<MachineInstr *> &Preds) : MI(I) {
+ assert(I && "Instruction must not be null!");
Predicates.insert(Preds.begin(), Preds.end());
}
};
- // Represent a VPT block, a list of instructions that begins with a VPST and
- // has a maximum of four proceeding instructions. All instructions within the
- // block are predicated upon the vpr and we allow instructions to define the
- // vpr within in the block too.
+ // Represent a VPT block, a list of instructions that begins with a VPT/VPST
+ // and has a maximum of four following instructions. All instructions within
+ // the block are predicated upon the vpr and we allow instructions to define
+ // the vpr within the block too.
class VPTBlock {
- std::unique_ptr<PredicatedMI> VPST;
+ // The predicate then instruction, which is either a VPT, or a VPST
+ // instruction.
+ std::unique_ptr<PredicatedMI> PredicateThen;
PredicatedMI *Divergent = nullptr;
SmallVector<PredicatedMI, 4> Insts;
public:
VPTBlock(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
- VPST = std::make_unique<PredicatedMI>(MI, Preds);
+ PredicateThen = std::make_unique<PredicatedMI>(MI, Preds);
}
void addInst(MachineInstr *MI, SetVector<MachineInstr*> &Preds) {
LLVM_DEBUG(dbgs() << "ARM Loops: Adding predicated MI: " << *MI);
- if (!Divergent && !set_difference(Preds, VPST->Predicates).empty()) {
+ if (!Divergent && !set_difference(Preds, PredicateThen->Predicates).empty()) {
Divergent = &Insts.back();
LLVM_DEBUG(dbgs() << " - has divergent predicate: " << *Divergent->MI);
}
@@ -104,38 +178,73 @@ namespace {
// Is the given instruction part of the predicate set controlling the entry
// to the block.
bool IsPredicatedOn(MachineInstr *MI) const {
- return VPST->Predicates.count(MI);
+ return PredicateThen->Predicates.count(MI);
+ }
+
+ // Returns true if this is a VPT instruction.
+ bool isVPT() const { return !isVPST(); }
+
+ // Returns true if this is a VPST instruction.
+ bool isVPST() const {
+ return PredicateThen->MI->getOpcode() == ARM::MVE_VPST;
}
// Is the given instruction the only predicate which controls the entry to
// the block.
bool IsOnlyPredicatedOn(MachineInstr *MI) const {
- return IsPredicatedOn(MI) && VPST->Predicates.size() == 1;
+ return IsPredicatedOn(MI) && PredicateThen->Predicates.size() == 1;
}
unsigned size() const { return Insts.size(); }
SmallVectorImpl<PredicatedMI> &getInsts() { return Insts; }
- MachineInstr *getVPST() const { return VPST->MI; }
+ MachineInstr *getPredicateThen() const { return PredicateThen->MI; }
PredicatedMI *getDivergent() const { return Divergent; }
};
+ struct Reduction {
+ MachineInstr *Init;
+ MachineInstr &Copy;
+ MachineInstr &Reduce;
+ MachineInstr &VPSEL;
+
+ Reduction(MachineInstr *Init, MachineInstr *Mov, MachineInstr *Add,
+ MachineInstr *Sel)
+ : Init(Init), Copy(*Mov), Reduce(*Add), VPSEL(*Sel) { }
+ };
+
struct LowOverheadLoop {
- MachineLoop *ML = nullptr;
+ MachineLoop &ML;
+ MachineBasicBlock *Preheader = nullptr;
+ MachineLoopInfo &MLI;
+ ReachingDefAnalysis &RDA;
+ const TargetRegisterInfo &TRI;
+ const ARMBaseInstrInfo &TII;
MachineFunction *MF = nullptr;
MachineInstr *InsertPt = nullptr;
MachineInstr *Start = nullptr;
MachineInstr *Dec = nullptr;
MachineInstr *End = nullptr;
MachineInstr *VCTP = nullptr;
+ SmallPtrSet<MachineInstr*, 4> SecondaryVCTPs;
VPTBlock *CurrentBlock = nullptr;
SetVector<MachineInstr*> CurrentPredicate;
SmallVector<VPTBlock, 4> VPTBlocks;
+ SmallPtrSet<MachineInstr*, 4> ToRemove;
+ SmallVector<std::unique_ptr<Reduction>, 1> Reductions;
+ SmallPtrSet<MachineInstr*, 4> BlockMasksToRecompute;
bool Revert = false;
bool CannotTailPredicate = false;
- LowOverheadLoop(MachineLoop *ML) : ML(ML) {
- MF = ML->getHeader()->getParent();
+ LowOverheadLoop(MachineLoop &ML, MachineLoopInfo &MLI,
+ ReachingDefAnalysis &RDA, const TargetRegisterInfo &TRI,
+ const ARMBaseInstrInfo &TII)
+ : ML(ML), MLI(MLI), RDA(RDA), TRI(TRI), TII(TII) {
+ MF = ML.getHeader()->getParent();
+ if (auto *MBB = ML.getLoopPreheader())
+ Preheader = MBB;
+ else if (auto *MBB = MLI.findLoopPreheader(&ML, true))
+ Preheader = MBB;
}
// If this is an MVE instruction, check that we know how to use tail
@@ -151,22 +260,30 @@ namespace {
// For now, let's keep things really simple and only support a single
// block for tail predication.
return !Revert && FoundAllComponents() && VCTP &&
- !CannotTailPredicate && ML->getNumBlocks() == 1;
+ !CannotTailPredicate && ML.getNumBlocks() == 1;
}
- bool ValidateTailPredicate(MachineInstr *StartInsertPt,
- ReachingDefAnalysis *RDA,
- MachineLoopInfo *MLI);
+ // Check that the predication in the loop will be equivalent once we
+ // perform the conversion. Also ensure that we can provide the number
+ // of elements to the loop start instruction.
+ bool ValidateTailPredicate(MachineInstr *StartInsertPt);
+
+ // See whether the live-out instructions are a reduction that we can fixup
+ // later.
+ bool FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers);
+
+ // Check that any values available outside of the loop will be the same
+ // after tail predication conversion.
+ bool ValidateLiveOuts();
// Is it safe to define LR with DLS/WLS?
// LR can be defined if it is the operand to start, because it's the same
// value, or if it's going to be equivalent to the operand to Start.
- MachineInstr *IsSafeToDefineLR(ReachingDefAnalysis *RDA);
+ MachineInstr *isSafeToDefineLR();
// Check the branch targets are within range and we satisfy our
// restrictions.
- void CheckLegality(ARMBasicBlockUtils *BBUtils, ReachingDefAnalysis *RDA,
- MachineLoopInfo *MLI);
+ void CheckLegality(ARMBasicBlockUtils *BBUtils);
bool FoundAllComponents() const {
return Start && Dec && End;
@@ -241,18 +358,19 @@ namespace {
void RevertWhile(MachineInstr *MI) const;
- bool RevertLoopDec(MachineInstr *MI, bool AllowFlags = false) const;
+ bool RevertLoopDec(MachineInstr *MI) const;
void RevertLoopEnd(MachineInstr *MI, bool SkipCmp = false) const;
- void RemoveLoopUpdate(LowOverheadLoop &LoLoop);
-
void ConvertVPTBlocks(LowOverheadLoop &LoLoop);
+ void FixupReductions(LowOverheadLoop &LoLoop) const;
+
MachineInstr *ExpandLoopStart(LowOverheadLoop &LoLoop);
void Expand(LowOverheadLoop &LoLoop);
+ void IterationCountDCE(LowOverheadLoop &LoLoop);
};
}
@@ -261,7 +379,7 @@ char ARMLowOverheadLoops::ID = 0;
INITIALIZE_PASS(ARMLowOverheadLoops, DEBUG_TYPE, ARM_LOW_OVERHEAD_LOOPS_NAME,
false, false)
-MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
+MachineInstr *LowOverheadLoop::isSafeToDefineLR() {
// We can define LR because LR already contains the same value.
if (Start->getOperand(0).getReg() == ARM::LR)
return Start;
@@ -279,52 +397,22 @@ MachineInstr *LowOverheadLoop::IsSafeToDefineLR(ReachingDefAnalysis *RDA) {
// Find an insertion point:
// - Is there a (mov lr, Count) before Start? If so, and nothing else writes
// to Count before Start, we can insert at that mov.
- if (auto *LRDef = RDA->getReachingMIDef(Start, ARM::LR))
- if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+ if (auto *LRDef = RDA.getUniqueReachingMIDef(Start, ARM::LR))
+ if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg))
return LRDef;
// - Is there a (mov lr, Count) after Start? If so, and nothing else writes
// to Count after Start, we can insert at that mov.
- if (auto *LRDef = RDA->getLocalLiveOutMIDef(MBB, ARM::LR))
- if (IsMoveLR(LRDef) && RDA->hasSameReachingDef(Start, LRDef, CountReg))
+ if (auto *LRDef = RDA.getLocalLiveOutMIDef(MBB, ARM::LR))
+ if (IsMoveLR(LRDef) && RDA.hasSameReachingDef(Start, LRDef, CountReg))
return LRDef;
// We've found no suitable LR def and Start doesn't use LR directly. Can we
// just define LR anyway?
- if (!RDA->isRegUsedAfter(Start, ARM::LR))
- return Start;
-
- return nullptr;
-}
-
-// Can we safely move 'From' to just before 'To'? To satisfy this, 'From' must
-// not define a register that is used by any instructions, after and including,
-// 'To'. These instructions also must not redefine any of Froms operands.
-template<typename Iterator>
-static bool IsSafeToMove(MachineInstr *From, MachineInstr *To, ReachingDefAnalysis *RDA) {
- SmallSet<int, 2> Defs;
- // First check that From would compute the same value if moved.
- for (auto &MO : From->operands()) {
- if (!MO.isReg() || MO.isUndef() || !MO.getReg())
- continue;
- if (MO.isDef())
- Defs.insert(MO.getReg());
- else if (!RDA->hasSameReachingDef(From, To, MO.getReg()))
- return false;
- }
-
- // Now walk checking that the rest of the instructions will compute the same
- // value.
- for (auto I = ++Iterator(From), E = Iterator(To); I != E; ++I) {
- for (auto &MO : I->operands())
- if (MO.isReg() && MO.getReg() && MO.isUse() && Defs.count(MO.getReg()))
- return false;
- }
- return true;
+ return RDA.isSafeToDefRegAt(Start, ARM::LR) ? Start : nullptr;
}
-bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
- ReachingDefAnalysis *RDA, MachineLoopInfo *MLI) {
+bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt) {
assert(VCTP && "VCTP instruction expected but is not set");
// All predication within the loop should be based on vctp. If the block
// isn't predicated on entry, check whether the vctp is within the block
@@ -332,24 +420,35 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
for (auto &Block : VPTBlocks) {
if (Block.IsPredicatedOn(VCTP))
continue;
- if (!Block.HasNonUniformPredicate() || !isVCTP(Block.getDivergent()->MI)) {
+ if (Block.HasNonUniformPredicate() && !isVCTP(Block.getDivergent()->MI)) {
LLVM_DEBUG(dbgs() << "ARM Loops: Found unsupported diverging predicate: "
- << *Block.getDivergent()->MI);
+ << *Block.getDivergent()->MI);
return false;
}
SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
for (auto &PredMI : Insts) {
- if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI))
+ // Check the instructions in the block and only allow:
+ // - VCTPs
+ // - Instructions predicated on the main VCTP
+ // - Any VCMP
+ // - VCMPs just "and" their result with VPR.P0. Whether they are
+ // located before/after the VCTP is irrelevant - the end result will
+ // be the same in both cases, so there's no point in requiring them
+ // to be located after the VCTP!
+ if (PredMI.Predicates.count(VCTP) || isVCTP(PredMI.MI) ||
+ VCMPOpcodeToVPT(PredMI.MI->getOpcode()) != 0)
continue;
LLVM_DEBUG(dbgs() << "ARM Loops: Can't convert: " << *PredMI.MI
- << " - which is predicated on:\n";
- for (auto *MI : PredMI.Predicates)
- dbgs() << " - " << *MI;
- );
+ << " - which is predicated on:\n";
+ for (auto *MI : PredMI.Predicates)
+ dbgs() << " - " << *MI);
return false;
}
}
+ if (!ValidateLiveOuts())
+ return false;
+
// For tail predication, we need to provide the number of elements, instead
// of the iteration count, to the loop start instruction. The number of
// elements is provided to the vctp instruction, so we need to check that
@@ -359,7 +458,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
// If the register is defined within loop, then we can't perform TP.
// TODO: Check whether this is just a mov of a register that would be
// available.
- if (RDA->getReachingDef(VCTP, NumElements) >= 0) {
+ if (RDA.hasLocalDefBefore(VCTP, NumElements)) {
LLVM_DEBUG(dbgs() << "ARM Loops: VCTP operand is defined in the loop.\n");
return false;
}
@@ -367,17 +466,20 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
// The element count register maybe defined after InsertPt, in which case we
// need to try to move either InsertPt or the def so that the [w|d]lstp can
// use the value.
- MachineBasicBlock *InsertBB = InsertPt->getParent();
- if (!RDA->isReachingDefLiveOut(InsertPt, NumElements)) {
- if (auto *ElemDef = RDA->getLocalLiveOutMIDef(InsertBB, NumElements)) {
- if (IsSafeToMove<MachineBasicBlock::reverse_iterator>(ElemDef, InsertPt, RDA)) {
+ // TODO: On failing to move an instruction, check if the count is provided by
+ // a mov and whether we can use the mov operand directly.
+ MachineBasicBlock *InsertBB = StartInsertPt->getParent();
+ if (!RDA.isReachingDefLiveOut(StartInsertPt, NumElements)) {
+ if (auto *ElemDef = RDA.getLocalLiveOutMIDef(InsertBB, NumElements)) {
+ if (RDA.isSafeToMoveForwards(ElemDef, StartInsertPt)) {
ElemDef->removeFromParent();
- InsertBB->insert(MachineBasicBlock::iterator(InsertPt), ElemDef);
+ InsertBB->insert(MachineBasicBlock::iterator(StartInsertPt), ElemDef);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved element count def: "
<< *ElemDef);
- } else if (IsSafeToMove<MachineBasicBlock::iterator>(InsertPt, ElemDef, RDA)) {
- InsertPt->removeFromParent();
- InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef), InsertPt);
+ } else if (RDA.isSafeToMoveBackwards(StartInsertPt, ElemDef)) {
+ StartInsertPt->removeFromParent();
+ InsertBB->insertAfter(MachineBasicBlock::iterator(ElemDef),
+ StartInsertPt);
LLVM_DEBUG(dbgs() << "ARM Loops: Moved start past: " << *ElemDef);
} else {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to move element count to loop "
@@ -390,10 +492,10 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
// Especially in the case of while loops, InsertBB may not be the
// preheader, so we need to check that the register isn't redefined
// before entering the loop.
- auto CannotProvideElements = [&RDA](MachineBasicBlock *MBB,
+ auto CannotProvideElements = [this](MachineBasicBlock *MBB,
Register NumElements) {
// NumElements is redefined in this block.
- if (RDA->getReachingDef(&MBB->back(), NumElements) >= 0)
+ if (RDA.hasLocalDefBefore(&MBB->back(), NumElements))
return true;
// Don't continue searching up through multiple predecessors.
@@ -404,7 +506,7 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
};
// First, find the block that looks like the preheader.
- MachineBasicBlock *MBB = MLI->findLoopPreheader(ML, true);
+ MachineBasicBlock *MBB = Preheader;
if (!MBB) {
LLVM_DEBUG(dbgs() << "ARM Loops: Didn't find preheader.\n");
return false;
@@ -419,13 +521,372 @@ bool LowOverheadLoop::ValidateTailPredicate(MachineInstr *StartInsertPt,
MBB = *MBB->pred_begin();
}
- LLVM_DEBUG(dbgs() << "ARM Loops: Will use tail predication.\n");
+ // Check that the value change of the element count is what we expect and
+ // that the predication will be equivalent. For this we need:
+ // NumElements = NumElements - VectorWidth. The sub will be a sub immediate
+ // and we also allow register copies within the chain.
+ auto IsValidSub = [](MachineInstr *MI, int ExpectedVecWidth) {
+ return -getAddSubImmediate(*MI) == ExpectedVecWidth;
+ };
+
+ MBB = VCTP->getParent();
+ if (auto *Def = RDA.getUniqueReachingMIDef(&MBB->back(), NumElements)) {
+ SmallPtrSet<MachineInstr*, 2> ElementChain;
+ SmallPtrSet<MachineInstr*, 2> Ignore = { VCTP };
+ unsigned ExpectedVectorWidth = getTailPredVectorWidth(VCTP->getOpcode());
+
+ Ignore.insert(SecondaryVCTPs.begin(), SecondaryVCTPs.end());
+
+ if (RDA.isSafeToRemove(Def, ElementChain, Ignore)) {
+ bool FoundSub = false;
+
+ for (auto *MI : ElementChain) {
+ if (isMovRegOpcode(MI->getOpcode()))
+ continue;
+
+ if (isSubImmOpcode(MI->getOpcode())) {
+ if (FoundSub || !IsValidSub(MI, ExpectedVectorWidth))
+ return false;
+ FoundSub = true;
+ } else
+ return false;
+ }
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will remove element count chain:\n";
+ for (auto *MI : ElementChain)
+ dbgs() << " - " << *MI);
+ ToRemove.insert(ElementChain.begin(), ElementChain.end());
+ }
+ }
+ return true;
+}
+
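To make the element-count check above concrete: the chain ValidateTailPredicate accepts amounts to the count dropping by the vector width each iteration, with VCTP enabling only the lanes that remain. A standalone model follows (an assumption-level sketch, not LLVM code; VectorWidth would be, e.g., 4 for VCTP32, 8 for VCTP16, 16 for VCTP8):

#include <algorithm>
#include <cstdint>

// Lanes [0, min(Remaining, VectorWidth)) are active on each iteration.
static uint32_t vctpMask(int Remaining, int VectorWidth) {
  int Active = std::max(0, std::min(Remaining, VectorWidth));
  return (1u << Active) - 1u;
}

static void tailPredicatedLoop(int NumElements, int VectorWidth) {
  for (int N = NumElements; N > 0; N -= VectorWidth) { // the sub-immediate chain
    uint32_t P0 = vctpMask(N, VectorWidth);
    (void)P0; // the predicated vector body would execute under P0 here
  }
}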
+static bool isVectorPredicated(MachineInstr *MI) {
+ int PIdx = llvm::findFirstVPTPredOperandIdx(*MI);
+ return PIdx != -1 && MI->getOperand(PIdx + 1).getReg() == ARM::VPR;
+}
+
+static bool isRegInClass(const MachineOperand &MO,
+ const TargetRegisterClass *Class) {
+ return MO.isReg() && MO.getReg() && Class->contains(MO.getReg());
+}
+
+// MVE 'narrowing' operations operate on half a lane, reading from half and
+// writing to half, which are referred to as the top and bottom half. The
+// other half retains its previous value.
+static bool retainsPreviousHalfElement(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::RetainsPreviousHalfElement) != 0;
+}
+
+// Some MVE instructions read from the top/bottom halves of their operand(s)
+// and generate a vector result with result elements that are double the
+// width of the input.
+static bool producesDoubleWidthResult(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::DoubleWidthResult) != 0;
+}
+
+static bool isHorizontalReduction(const MachineInstr &MI) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ return (Flags & ARMII::HorizontalReduction) != 0;
+}
+
+// Can this instruction generate a non-zero result when given only zeroed
+// operands? This allows us to know that, given operands with false bytes
+// zeroed by masked loads, the result will also contain zeros in those
+// bytes.
+static bool canGenerateNonZeros(const MachineInstr &MI) {
+
+ // Check for instructions which can write into a larger element size,
+ // possibly writing into a previous zero'd lane.
+ if (producesDoubleWidthResult(MI))
+ return true;
+
+ switch (MI.getOpcode()) {
+ default:
+ break;
+ // FIXME: VNEG FP and -0? I think we'll need to handle this once we allow
+ // fp16 -> fp32 vector conversions.
+ // Instructions that perform a NOT will generate 1s from 0s.
+ case ARM::MVE_VMVN:
+ case ARM::MVE_VORN:
+ // Count leading zeros will do just that!
+ case ARM::MVE_VCLZs8:
+ case ARM::MVE_VCLZs16:
+ case ARM::MVE_VCLZs32:
+ return true;
+ }
+ return false;
+}
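
Editorial illustration, not part of the patch, of why NOT-like and count-leading-zeros instructions are singled out above: they turn a zeroed lane into a non-zero value, breaking the "zeros in, zeros out" assumption.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t ZeroLane = 0;
  // A bitwise NOT (the VMVN/VORN case) of a zeroed lane is all ones.
  assert(static_cast<uint32_t>(~ZeroLane) == 0xFFFFFFFFu);
  // Counting leading zeros of a zeroed 32-bit lane yields 32, also non-zero.
  // __builtin_clz(0) is undefined in C++, so treat zero explicitly here.
  unsigned LeadingZeros = (ZeroLane == 0) ? 32 : __builtin_clz(ZeroLane);
  assert(LeadingZeros == 32);
  return 0;
}
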
+
+
+// Look at its register uses to see if it can only receive zeros into its
+// false lanes, which would then produce zeros. Also check that the output
+// register is also defined by a FalseLanesZero instruction so that, if
+// tail-predication happens, the lanes that aren't updated will still be
+// zeros.
+static bool producesFalseLanesZero(MachineInstr &MI,
+ const TargetRegisterClass *QPRs,
+ const ReachingDefAnalysis &RDA,
+ InstSet &FalseLanesZero) {
+ if (canGenerateNonZeros(MI))
+ return false;
+
+ bool AllowScalars = isHorizontalReduction(MI);
+ for (auto &MO : MI.operands()) {
+ if (!MO.isReg() || !MO.getReg())
+ continue;
+ if (!isRegInClass(MO, QPRs) && AllowScalars)
+ continue;
+ if (auto *OpDef = RDA.getMIOperand(&MI, MO))
+ if (FalseLanesZero.count(OpDef))
+ continue;
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "ARM Loops: Always False Zeros: " << MI);
+ return true;
+}
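
Editorial sketch of the classification this helper feeds into: starting from the predicated loads, an instruction's false lanes are known to be zero when it cannot generate non-zeros and every vector operand is itself already known-zero. The names below are illustrative only and this is not part of the patch.

#include <iostream>
#include <set>
#include <string>
#include <utility>
#include <vector>

struct Node {
  std::vector<std::string> Ops;  // producers of the vector operands
  bool CanGenerateNonZeros;      // e.g. a VMVN or a widening operation
};

int main() {
  // Tiny single-block "loop body" in topological order: two masked loads feed
  // a lane-wise multiply, whose result feeds a bitwise NOT.
  std::vector<std::pair<std::string, Node>> Body = {
      {"load0", {{}, false}},
      {"load1", {{}, false}},
      {"mul", {{"load0", "load1"}, false}},
      {"mvn", {{"mul"}, true}},
  };

  std::set<std::string> FalseLanesZero = {"load0", "load1"}; // masked loads
  for (const auto &[Name, N] : Body) {
    if (FalseLanesZero.count(Name))
      continue;
    bool AllZero = !N.CanGenerateNonZeros;
    for (const std::string &Op : N.Ops)
      AllZero = AllZero && FalseLanesZero.count(Op) != 0;
    if (AllZero)
      FalseLanesZero.insert(Name);
  }

  for (const auto &[Name, N] : Body)
    std::cout << Name << ": "
              << (FalseLanesZero.count(Name) ? "false lanes zero"
                                             : "false lanes unknown")
              << '\n';
  return 0;
}
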
+
+bool
+LowOverheadLoop::FindValidReduction(InstSet &LiveMIs, InstSet &LiveOutUsers) {
+  // Also check for reductions where the operation needs to merge values
+  // from the last and previous loop iterations. This means an instruction
+ // producing a value and a vmov storing the value calculated in the previous
+ // iteration. So we can have two live-out regs, one produced by a vmov and
+ // both being consumed by a vpsel.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Looking for reduction live-outs:\n";
+ for (auto *MI : LiveMIs)
+ dbgs() << " - " << *MI);
+
+ if (!Preheader)
+ return false;
+
+ // Expect a vmov, a vadd and a single vpsel user.
+ // TODO: This means we can't currently support multiple reductions in the
+ // loop.
+ if (LiveMIs.size() != 2 || LiveOutUsers.size() != 1)
+ return false;
+
+ MachineInstr *VPSEL = *LiveOutUsers.begin();
+ if (VPSEL->getOpcode() != ARM::MVE_VPSEL)
+ return false;
+
+ unsigned VPRIdx = llvm::findFirstVPTPredOperandIdx(*VPSEL) + 1;
+ MachineInstr *Pred = RDA.getMIOperand(VPSEL, VPRIdx);
+ if (!Pred || Pred != VCTP) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Not using equivalent predicate.\n");
+ return false;
+ }
+
+ MachineInstr *Reduce = RDA.getMIOperand(VPSEL, 1);
+ if (!Reduce)
+ return false;
+
+ assert(LiveMIs.count(Reduce) && "Expected MI to be live-out");
+
+ // TODO: Support more operations than VADD.
+ switch (VCTP->getOpcode()) {
+ default:
+ return false;
+ case ARM::MVE_VCTP8:
+ if (Reduce->getOpcode() != ARM::MVE_VADDi8)
+ return false;
+ break;
+ case ARM::MVE_VCTP16:
+ if (Reduce->getOpcode() != ARM::MVE_VADDi16)
+ return false;
+ break;
+ case ARM::MVE_VCTP32:
+ if (Reduce->getOpcode() != ARM::MVE_VADDi32)
+ return false;
+ break;
+ }
+
+  // Test that the reduce op is overwriting one of its operands.
+ if (Reduce->getOperand(0).getReg() != Reduce->getOperand(1).getReg() &&
+ Reduce->getOperand(0).getReg() != Reduce->getOperand(2).getReg()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reducing op isn't overwriting itself.\n");
+ return false;
+ }
+
+ // Check that the VORR is actually a VMOV.
+ MachineInstr *Copy = RDA.getMIOperand(VPSEL, 2);
+ if (!Copy || Copy->getOpcode() != ARM::MVE_VORR ||
+ !Copy->getOperand(1).isReg() || !Copy->getOperand(2).isReg() ||
+ Copy->getOperand(1).getReg() != Copy->getOperand(2).getReg())
+ return false;
+
+ assert(LiveMIs.count(Copy) && "Expected MI to be live-out");
+
+ // Check that the vadd and vmov are only used by each other and the vpsel.
+ SmallPtrSet<MachineInstr*, 2> CopyUsers;
+ RDA.getGlobalUses(Copy, Copy->getOperand(0).getReg(), CopyUsers);
+ if (CopyUsers.size() > 2 || !CopyUsers.count(Reduce)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Copy users unsupported.\n");
+ return false;
+ }
+
+ SmallPtrSet<MachineInstr*, 2> ReduceUsers;
+ RDA.getGlobalUses(Reduce, Reduce->getOperand(0).getReg(), ReduceUsers);
+ if (ReduceUsers.size() > 2 || !ReduceUsers.count(Copy)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Reduce users unsupported.\n");
+ return false;
+ }
+
+ // Then find whether there's an instruction initialising the register that
+ // is storing the reduction.
+ SmallPtrSet<MachineInstr*, 2> Incoming;
+ RDA.getLiveOuts(Preheader, Copy->getOperand(1).getReg(), Incoming);
+ if (Incoming.size() > 1)
+ return false;
+
+ MachineInstr *Init = Incoming.empty() ? nullptr : *Incoming.begin();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found a reduction:\n"
+ << " - " << *Copy
+ << " - " << *Reduce
+ << " - " << *VPSEL);
+ Reductions.push_back(std::make_unique<Reduction>(Init, Copy, Reduce, VPSEL));
return true;
}
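
Editorial sketch of the pattern matched here, in plain C++ and not part of the patch: a vector accumulator updated by an unpredicated vadd, a vmov keeping the previous iteration's value, and a vpsel in the exit block choosing per lane between the two. Because the masked loads feed zeros into the inactive lanes, both choices agree, which is what makes the later fix-up legal. All names and values are illustrative.

#include <array>
#include <iostream>

int main() {
  constexpr int Lanes = 4;
  const std::array<int, 10> Data{1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
  std::array<int, Lanes> Acc{}, PrevAcc{};

  int Remaining = static_cast<int>(Data.size());
  int Base = 0;
  while (Remaining > 0) {
    PrevAcc = Acc;                                    // the vmov (VORR q, q, q)
    for (int L = 0; L < Lanes; ++L) {
      int Elt = (L < Remaining) ? Data[Base + L] : 0; // masked load: zeros
      Acc[L] += Elt;                                  // unpredicated vadd
    }
    Base += Lanes;
    Remaining -= Lanes;
  }

  // The vpsel selects Acc in the lanes active during the final iteration and
  // PrevAcc elsewhere; the masked loads made both values equal lane by lane,
  // so the select can be replaced by a plain register copy.
  const int FinalActive = static_cast<int>(Data.size()) % Lanes;
  for (int L = 0; L < Lanes; ++L) {
    int Selected = (L < FinalActive) ? Acc[L] : PrevAcc[L];
    std::cout << "lane " << L << ": " << Selected
              << (Selected == Acc[L] ? " (matches Acc)\n" : " (differs!)\n");
  }
  return 0;
}
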
-void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
- ReachingDefAnalysis *RDA,
- MachineLoopInfo *MLI) {
+bool LowOverheadLoop::ValidateLiveOuts() {
+ // We want to find out if the tail-predicated version of this loop will
+ // produce the same values as the loop in its original form. For this to
+  // be true, the newly inserted implicit predication must not change
+  // the (observable) results.
+  // We're doing this because many instructions in the loop will not be
+  // predicated, and so the conversion from VPT predication to tail-predication
+  // can result in different values being produced, since the tail-predication
+  // prevents many instructions from updating their falsely predicated
+ // lanes. This analysis assumes that all the instructions perform lane-wise
+ // operations and don't perform any exchanges.
+ // A masked load, whether through VPT or tail predication, will write zeros
+ // to any of the falsely predicated bytes. So, from the loads, we know that
+ // the false lanes are zeroed and here we're trying to track that those false
+ // lanes remain zero, or where they change, the differences are masked away
+ // by their user(s).
+ // All MVE loads and stores have to be predicated, so we know that any load
+ // operands, or stored results are equivalent already. Other explicitly
+ // predicated instructions will perform the same operation in the original
+ // loop and the tail-predicated form too. Because of this, we can insert
+ // loads, stores and other predicated instructions into our Predicated
+ // set and build from there.
+ const TargetRegisterClass *QPRs = TRI.getRegClass(ARM::MQPRRegClassID);
+ SetVector<MachineInstr *> FalseLanesUnknown;
+ SmallPtrSet<MachineInstr *, 4> FalseLanesZero;
+ SmallPtrSet<MachineInstr *, 4> Predicated;
+ MachineBasicBlock *Header = ML.getHeader();
+
+ for (auto &MI : *Header) {
+ const MCInstrDesc &MCID = MI.getDesc();
+ uint64_t Flags = MCID.TSFlags;
+ if ((Flags & ARMII::DomainMask) != ARMII::DomainMVE)
+ continue;
+
+ if (isVCTP(&MI) || isVPTOpcode(MI.getOpcode()))
+ continue;
+
+ // Predicated loads will write zeros to the falsely predicated bytes of the
+ // destination register.
+ if (isVectorPredicated(&MI)) {
+ if (MI.mayLoad())
+ FalseLanesZero.insert(&MI);
+ Predicated.insert(&MI);
+ continue;
+ }
+
+ if (MI.getNumDefs() == 0)
+ continue;
+
+ if (!producesFalseLanesZero(MI, QPRs, RDA, FalseLanesZero)) {
+ // We require retaining and horizontal operations to operate upon zero'd
+ // false lanes to ensure the conversion doesn't change the output.
+ if (retainsPreviousHalfElement(MI) || isHorizontalReduction(MI))
+ return false;
+ // Otherwise we need to evaluate this instruction later to see whether
+ // unknown false lanes will get masked away by their user(s).
+ FalseLanesUnknown.insert(&MI);
+ } else if (!isHorizontalReduction(MI))
+ FalseLanesZero.insert(&MI);
+ }
+
+ auto HasPredicatedUsers = [this](MachineInstr *MI, const MachineOperand &MO,
+ SmallPtrSetImpl<MachineInstr *> &Predicated) {
+ SmallPtrSet<MachineInstr *, 2> Uses;
+ RDA.getGlobalUses(MI, MO.getReg(), Uses);
+ for (auto *Use : Uses) {
+ if (Use != MI && !Predicated.count(Use))
+ return false;
+ }
+ return true;
+ };
+
+ // Visit the unknowns in reverse so that we can start at the values being
+ // stored and then we can work towards the leaves, hopefully adding more
+  // instructions to Predicated. Successfully terminating the loop means that
+  // all the unknown values have been found to be masked by predicated user(s).
+ // For any unpredicated values, we store them in NonPredicated so that we
+ // can later check whether these form a reduction.
+ SmallPtrSet<MachineInstr*, 2> NonPredicated;
+ for (auto *MI : reverse(FalseLanesUnknown)) {
+ for (auto &MO : MI->operands()) {
+ if (!isRegInClass(MO, QPRs) || !MO.isDef())
+ continue;
+ if (!HasPredicatedUsers(MI, MO, Predicated)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found an unknown def of : "
+ << TRI.getRegAsmName(MO.getReg()) << " at " << *MI);
+ NonPredicated.insert(MI);
+ continue;
+ }
+ }
+ // Any unknown false lanes have been masked away by the user(s).
+ Predicated.insert(MI);
+ }
+
+ SmallPtrSet<MachineInstr *, 2> LiveOutMIs;
+ SmallPtrSet<MachineInstr*, 2> LiveOutUsers;
+ SmallVector<MachineBasicBlock *, 2> ExitBlocks;
+ ML.getExitBlocks(ExitBlocks);
+ assert(ML.getNumBlocks() == 1 && "Expected single block loop!");
+ assert(ExitBlocks.size() == 1 && "Expected a single exit block");
+ MachineBasicBlock *ExitBB = ExitBlocks.front();
+ for (const MachineBasicBlock::RegisterMaskPair &RegMask : ExitBB->liveins()) {
+ // Check Q-regs that are live in the exit blocks. We don't collect scalars
+ // because they won't be affected by lane predication.
+ if (QPRs->contains(RegMask.PhysReg)) {
+ if (auto *MI = RDA.getLocalLiveOutMIDef(Header, RegMask.PhysReg))
+ LiveOutMIs.insert(MI);
+ RDA.getLiveInUses(ExitBB, RegMask.PhysReg, LiveOutUsers);
+ }
+ }
+
+ // If we have any non-predicated live-outs, they need to be part of a
+  // reduction that we can fix up later. The reduction takes the form of an
+  // operation that uses its previous value through a vmov, with a vpsel
+  // residing in the exit block to select the final bytes from the n and n-1
+  // iterations.
+ if (!NonPredicated.empty() &&
+ !FindValidReduction(NonPredicated, LiveOutUsers))
+ return false;
+
+  // We've already validated that any VPT predication within the loop will be
+  // equivalent when we perform the predication transformation, so we know that
+  // any VPT predicated instruction is predicated upon VCTP. Any live-out
+  // instruction needs to be predicated, so check this here. The instructions
+  // in NonPredicated have already been found to form a reduction whose
+  // legality we can ensure.
+ for (auto *MI : LiveOutMIs)
+ if (!isVectorPredicated(MI) && !NonPredicated.count(MI))
+ return false;
+
+ return true;
+}
+
+void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils) {
if (Revert)
return;
@@ -434,7 +895,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
   // TODO Maybe there are cases where the target doesn't have to be the header,
// but for now be safe and revert.
- if (End->getOperand(1).getMBB() != ML->getHeader()) {
+ if (End->getOperand(1).getMBB() != ML.getHeader()) {
     LLVM_DEBUG(dbgs() << "ARM Loops: LoopEnd is not targeting header.\n");
Revert = true;
return;
@@ -442,8 +903,8 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
// The WLS and LE instructions have 12-bits for the label offset. WLS
// requires a positive offset, while LE uses negative.
- if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML->getHeader()) ||
- !BBUtils->isBBInRange(End, ML->getHeader(), 4094)) {
+ if (BBUtils->getOffsetOf(End) < BBUtils->getOffsetOf(ML.getHeader()) ||
+ !BBUtils->isBBInRange(End, ML.getHeader(), 4094)) {
LLVM_DEBUG(dbgs() << "ARM Loops: LE offset is out-of-range\n");
Revert = true;
return;
@@ -458,7 +919,7 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
return;
}
- InsertPt = Revert ? nullptr : IsSafeToDefineLR(RDA);
+ InsertPt = Revert ? nullptr : isSafeToDefineLR();
if (!InsertPt) {
LLVM_DEBUG(dbgs() << "ARM Loops: Unable to find safe insertion point.\n");
Revert = true;
@@ -473,9 +934,9 @@ void LowOverheadLoop::CheckLegality(ARMBasicBlockUtils *BBUtils,
return;
}
- assert(ML->getBlocks().size() == 1 &&
+ assert(ML.getBlocks().size() == 1 &&
"Shouldn't be processing a loop with more than one block");
- CannotTailPredicate = !ValidateTailPredicate(InsertPt, RDA, MLI);
+ CannotTailPredicate = !ValidateTailPredicate(InsertPt);
LLVM_DEBUG(if (CannotTailPredicate)
dbgs() << "ARM Loops: Couldn't validate tail predicate.\n");
}
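
Editorial sketch of the offset check above, with made-up byte offsets and not part of the patch: LE must branch backwards to the header and the distance has to fit the 12-bit encoding range quoted in the comment (4094 bytes).

#include <iostream>

// Is a backwards LE branch from EndOffset to HeaderOffset encodable?
static bool leOffsetInRange(unsigned EndOffset, unsigned HeaderOffset,
                            unsigned MaxRange = 4094) {
  if (EndOffset < HeaderOffset) // LE only branches backwards to the header
    return false;
  return EndOffset - HeaderOffset <= MaxRange;
}

int main() {
  std::cout << std::boolalpha
            << leOffsetInRange(1024, 64) << '\n'  // true: 960 bytes back
            << leOffsetInRange(6000, 64) << '\n'; // false: out of range
  return 0;
}
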
@@ -484,29 +945,44 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
if (CannotTailPredicate)
return false;
- // Only support a single vctp.
- if (isVCTP(MI) && VCTP)
- return false;
+ if (isVCTP(MI)) {
+    // If we find another VCTP, check whether it uses the same value as the
+    // main VCTP. If it does, store it in the SecondaryVCTPs set, else refuse
+    // it.
+ if (VCTP) {
+ if (!VCTP->getOperand(1).isIdenticalTo(MI->getOperand(1)) ||
+ !RDA.hasSameReachingDef(VCTP, MI, MI->getOperand(1).getReg())) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found VCTP with a different reaching "
+ "definition from the main VCTP");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found secondary VCTP: " << *MI);
+ SecondaryVCTPs.insert(MI);
+ } else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Found 'main' VCTP: " << *MI);
+ VCTP = MI;
+ }
+ } else if (isVPTOpcode(MI->getOpcode())) {
+ if (MI->getOpcode() != ARM::MVE_VPST) {
+ assert(MI->findRegisterDefOperandIdx(ARM::VPR) != -1 &&
+ "VPT does not implicitly define VPR?!");
+ CurrentPredicate.insert(MI);
+ }
- // Start a new vpt block when we discover a vpt.
- if (MI->getOpcode() == ARM::MVE_VPST) {
VPTBlocks.emplace_back(MI, CurrentPredicate);
CurrentBlock = &VPTBlocks.back();
return true;
- } else if (isVCTP(MI))
- VCTP = MI;
- else if (MI->getOpcode() == ARM::MVE_VPSEL ||
- MI->getOpcode() == ARM::MVE_VPNOT)
+ } else if (MI->getOpcode() == ARM::MVE_VPSEL ||
+ MI->getOpcode() == ARM::MVE_VPNOT) {
+ // TODO: Allow VPSEL and VPNOT, we currently cannot because:
+    // 1) It will use the VPR as a predicate operand, but doesn't have to be
+    //    inside a VPT block, which means we can assert while building up
+    //    the VPT block because we don't find another VPT or VPST to begin a
+    //    new one.
+ // 2) VPSEL still requires a VPR operand even after tail predicating,
+ // which means we can't remove it unless there is another
+ // instruction, such as vcmp, that can provide the VPR def.
return false;
-
- // TODO: Allow VPSEL and VPNOT, we currently cannot because:
- // 1) It will use the VPR as a predicate operand, but doesn't have to be
- // instead a VPT block, which means we can assert while building up
- // the VPT block because we don't find another VPST to being a new
- // one.
- // 2) VPSEL still requires a VPR operand even after tail predicating,
- // which means we can't remove it unless there is another
- // instruction, such as vcmp, that can provide the VPR def.
+ }
bool IsUse = false;
bool IsDef = false;
@@ -548,7 +1024,9 @@ bool LowOverheadLoop::ValidateMVEInst(MachineInstr* MI) {
return false;
}
- return true;
+ // If the instruction is already explicitly predicated, then the conversion
+ // will be fine, but ensure that all memory operations are predicated.
+ return !IsUse && MI->mayLoadOrStore() ? false : true;
}
bool ARMLowOverheadLoops::runOnMachineFunction(MachineFunction &mf) {
@@ -591,6 +1069,8 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
dbgs() << " - " << Preheader->getName() << "\n";
else if (auto *Preheader = MLI->findLoopPreheader(ML))
dbgs() << " - " << Preheader->getName() << "\n";
+ else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
+ dbgs() << " - " << Preheader->getName() << "\n";
for (auto *MBB : ML->getBlocks())
dbgs() << " - " << MBB->getName() << "\n";
);
@@ -608,14 +1088,12 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return nullptr;
};
- LowOverheadLoop LoLoop(ML);
+ LowOverheadLoop LoLoop(*ML, *MLI, *RDA, *TRI, *TII);
// Search the preheader for the start intrinsic.
// FIXME: I don't see why we shouldn't be supporting multiple predecessors
// with potentially multiple set.loop.iterations, so we need to enable this.
- if (auto *Preheader = ML->getLoopPreheader())
- LoLoop.Start = SearchForStart(Preheader);
- else if (auto *Preheader = MLI->findLoopPreheader(ML, true))
- LoLoop.Start = SearchForStart(Preheader);
+ if (LoLoop.Preheader)
+ LoLoop.Start = SearchForStart(LoLoop.Preheader);
else
return false;
@@ -624,7 +1102,9 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// whether we can convert that predicate using tail predication.
for (auto *MBB : reverse(ML->getBlocks())) {
for (auto &MI : *MBB) {
- if (MI.getOpcode() == ARM::t2LoopDec)
+ if (MI.isDebugValue())
+ continue;
+ else if (MI.getOpcode() == ARM::t2LoopDec)
LoLoop.Dec = &MI;
else if (MI.getOpcode() == ARM::t2LoopEnd)
LoLoop.End = &MI;
@@ -641,28 +1121,6 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
// Check we know how to tail predicate any mve instructions.
LoLoop.AnalyseMVEInst(&MI);
}
-
- // We need to ensure that LR is not used or defined inbetween LoopDec and
- // LoopEnd.
- if (!LoLoop.Dec || LoLoop.End || LoLoop.Revert)
- continue;
-
- // If we find that LR has been written or read between LoopDec and
- // LoopEnd, expect that the decremented value is being used else where.
- // Because this value isn't actually going to be produced until the
- // latch, by LE, we would need to generate a real sub. The value is also
- // likely to be copied/reloaded for use of LoopEnd - in which in case
- // we'd need to perform an add because it gets subtracted again by LE!
- // The other option is to then generate the other form of LE which doesn't
- // perform the sub.
- for (auto &MO : MI.operands()) {
- if (MI.getOpcode() != ARM::t2LoopDec && MO.isReg() &&
- MO.getReg() == ARM::LR) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Found LR Use/Def: " << MI);
- LoLoop.Revert = true;
- break;
- }
- }
}
}
@@ -672,7 +1130,15 @@ bool ARMLowOverheadLoops::ProcessLoop(MachineLoop *ML) {
return false;
}
- LoLoop.CheckLegality(BBUtils.get(), RDA, MLI);
+ // Check that the only instruction using LoopDec is LoopEnd.
+ // TODO: Check for copy chains that really have no effect.
+ SmallPtrSet<MachineInstr*, 2> Uses;
+ RDA->getReachingLocalUses(LoLoop.Dec, ARM::LR, Uses);
+ if (Uses.size() > 1 || !Uses.count(LoLoop.End)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unable to remove LoopDec.\n");
+ LoLoop.Revert = true;
+ }
+ LoLoop.CheckLegality(BBUtils.get());
Expand(LoLoop);
return true;
}
@@ -702,16 +1168,19 @@ void ARMLowOverheadLoops::RevertWhile(MachineInstr *MI) const {
MI->eraseFromParent();
}
-bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI,
- bool SetFlags) const {
+bool ARMLowOverheadLoops::RevertLoopDec(MachineInstr *MI) const {
LLVM_DEBUG(dbgs() << "ARM Loops: Reverting to sub: " << *MI);
MachineBasicBlock *MBB = MI->getParent();
+ SmallPtrSet<MachineInstr*, 1> Ignore;
+ for (auto I = MachineBasicBlock::iterator(MI), E = MBB->end(); I != E; ++I) {
+ if (I->getOpcode() == ARM::t2LoopEnd) {
+ Ignore.insert(&*I);
+ break;
+ }
+ }
// If nothing defines CPSR between LoopDec and LoopEnd, use a t2SUBS.
- if (SetFlags &&
- (RDA->isRegUsedAfter(MI, ARM::CPSR) ||
- !RDA->hasSameReachingDef(MI, &MBB->back(), ARM::CPSR)))
- SetFlags = false;
+ bool SetFlags = RDA->isSafeToDefRegAt(MI, ARM::CPSR, Ignore);
MachineInstrBuilder MIB = BuildMI(*MBB, MI, MI->getDebugLoc(),
TII->get(ARM::t2SUBri));
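
Editorial sketch of the flag-safety idea behind the isSafeToDefRegAt call above, not part of the patch: defining CPSR at the sub is only allowed if nothing after it in the block, other than the ignored t2LoopEnd, reads or redefines the flags. The struct and values are illustrative.

#include <iostream>
#include <vector>

struct Inst {
  bool ReadsFlags;
  bool WritesFlags;
  bool Ignored; // e.g. the t2LoopEnd we are about to replace
};

// Can we safely turn the decrement into a flag-setting SUBS here?
static bool safeToSetFlags(const std::vector<Inst> &RestOfBlock) {
  for (const Inst &I : RestOfBlock) {
    if (I.Ignored)
      continue;
    if (I.ReadsFlags || I.WritesFlags)
      return false;
  }
  return true;
}

int main() {
  std::vector<Inst> Tail = {{false, false, false},  // flag-neutral instruction
                            {true, false, true}};   // t2LoopEnd, ignored
  std::cout << std::boolalpha << safeToSetFlags(Tail) << '\n'; // true
  Tail.push_back({false, true, false});             // something clobbers CPSR
  std::cout << safeToSetFlags(Tail) << '\n';        // false
  return 0;
}
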
@@ -759,7 +1228,102 @@ void ARMLowOverheadLoops::RevertLoopEnd(MachineInstr *MI, bool SkipCmp) const {
MI->eraseFromParent();
}
+// Perform dead code elimination on the loop iteration count setup expression.
+// If we are tail-predicating, the number of elements to be processed is the
+// operand of the VCTP instruction in the vector body, see getCount(), which is
+// register $r3 in this example:
+//
+// $lr = big-itercount-expression
+// ..
+// t2DoLoopStart renamable $lr
+// vector.body:
+// ..
+// $vpr = MVE_VCTP32 renamable $r3
+// renamable $lr = t2LoopDec killed renamable $lr, 1
+// t2LoopEnd renamable $lr, %vector.body
+// tB %end
+//
+// What we would like to achieve here is to replace the do-loop start pseudo
+// instruction t2DoLoopStart with:
+//
+// $lr = MVE_DLSTP_32 killed renamable $r3
+//
+// Thus, $r3, which defines the number of elements, is written to $lr,
+// and then we want to delete the whole chain that used to define $lr;
+// see the comment below for how this chain could look.
+//
+void ARMLowOverheadLoops::IterationCountDCE(LowOverheadLoop &LoLoop) {
+ if (!LoLoop.IsTailPredicationLegal())
+ return;
+
+ LLVM_DEBUG(dbgs() << "ARM Loops: Trying DCE on loop iteration count.\n");
+
+ MachineInstr *Def = RDA->getMIOperand(LoLoop.Start, 0);
+ if (!Def) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Couldn't find iteration count.\n");
+ return;
+ }
+
+ // Collect and remove the users of iteration count.
+ SmallPtrSet<MachineInstr*, 4> Killed = { LoLoop.Start, LoLoop.Dec,
+ LoLoop.End, LoLoop.InsertPt };
+ SmallPtrSet<MachineInstr*, 2> Remove;
+ if (RDA->isSafeToRemove(Def, Remove, Killed))
+ LoLoop.ToRemove.insert(Remove.begin(), Remove.end());
+ else {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Unsafe to remove loop iteration count.\n");
+ return;
+ }
+
+ // Collect the dead code and the MBBs in which they reside.
+ RDA->collectKilledOperands(Def, Killed);
+ SmallPtrSet<MachineBasicBlock*, 2> BasicBlocks;
+ for (auto *MI : Killed)
+ BasicBlocks.insert(MI->getParent());
+
+ // Collect IT blocks in all affected basic blocks.
+ std::map<MachineInstr *, SmallPtrSet<MachineInstr *, 2>> ITBlocks;
+ for (auto *MBB : BasicBlocks) {
+ for (auto &MI : *MBB) {
+ if (MI.getOpcode() != ARM::t2IT)
+ continue;
+ RDA->getReachingLocalUses(&MI, ARM::ITSTATE, ITBlocks[&MI]);
+ }
+ }
+
+ // If we're removing all of the instructions within an IT block, then
+ // also remove the IT instruction.
+ SmallPtrSet<MachineInstr*, 2> ModifiedITs;
+ for (auto *MI : Killed) {
+ if (MachineOperand *MO = MI->findRegisterUseOperand(ARM::ITSTATE)) {
+ MachineInstr *IT = RDA->getMIOperand(MI, *MO);
+ auto &CurrentBlock = ITBlocks[IT];
+ CurrentBlock.erase(MI);
+ if (CurrentBlock.empty())
+ ModifiedITs.erase(IT);
+ else
+ ModifiedITs.insert(IT);
+ }
+ }
+
+ // Delete the killed instructions only if we don't have any IT blocks that
+  // need to be modified, because we would need to fix up the mask.
+ // TODO: Handle cases where IT blocks are modified.
+ if (ModifiedITs.empty()) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Will remove iteration count:\n";
+ for (auto *MI : Killed)
+ dbgs() << " - " << *MI);
+ LoLoop.ToRemove.insert(Killed.begin(), Killed.end());
+ } else
+ LLVM_DEBUG(dbgs() << "ARM Loops: Would need to modify IT block(s).\n");
+}
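
Editorial sketch (hypothetical instruction names, not part of the patch) of the chain removal performed here: once the do-loop start has been rewritten, walk backwards from the instruction defining the iteration count and mark each definition dead when all of its users are already dead.

#include <iostream>
#include <map>
#include <set>
#include <string>
#include <vector>

struct Def {
  std::vector<std::string> Operands; // definitions this instruction reads
  std::vector<std::string> Users;    // instructions reading this definition
};

int main() {
  // $lr = big-itercount-expression, only ever consumed by the loop start.
  std::map<std::string, Def> Chain = {
      {"add",  {{"mov1", "mov2"}, {"DoLoopStart"}}},
      {"mov1", {{}, {"add"}}},
      {"mov2", {{}, {"add"}}},
  };

  std::set<std::string> Dead = {"DoLoopStart"}; // already replaced by DLSTP
  std::vector<std::string> Worklist = {"add"};
  while (!Worklist.empty()) {
    std::string Name = Worklist.back();
    Worklist.pop_back();
    const Def &D = Chain.at(Name);
    bool AllUsersDead = true;
    for (const std::string &U : D.Users)
      AllUsersDead = AllUsersDead && Dead.count(U) != 0;
    if (!AllUsersDead)
      continue;                        // a live user remains, keep this def
    Dead.insert(Name);
    for (const std::string &Op : D.Operands)
      Worklist.push_back(Op);          // its operands may now be dead too
  }

  for (const std::string &Name : Dead)
    std::cout << "dead: " << Name << '\n';
  return 0;
}
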
+
MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Expanding LoopStart.\n");
+ // When using tail-predication, try to delete the dead code that was used to
+ // calculate the number of loop iterations.
+ IterationCountDCE(LoLoop);
+
MachineInstr *InsertPt = LoLoop.InsertPt;
MachineInstr *Start = LoLoop.Start;
MachineBasicBlock *MBB = InsertPt->getParent();
@@ -775,109 +1339,67 @@ MachineInstr* ARMLowOverheadLoops::ExpandLoopStart(LowOverheadLoop &LoLoop) {
if (!IsDo)
MIB.add(Start->getOperand(1));
- // When using tail-predication, try to delete the dead code that was used to
- // calculate the number of loop iterations.
- if (LoLoop.IsTailPredicationLegal()) {
- SmallVector<MachineInstr*, 4> Killed;
- SmallVector<MachineInstr*, 4> Dead;
- if (auto *Def = RDA->getReachingMIDef(Start,
- Start->getOperand(0).getReg())) {
- Killed.push_back(Def);
-
- while (!Killed.empty()) {
- MachineInstr *Def = Killed.back();
- Killed.pop_back();
- Dead.push_back(Def);
- for (auto &MO : Def->operands()) {
- if (!MO.isReg() || !MO.isKill())
- continue;
-
- MachineInstr *Kill = RDA->getReachingMIDef(Def, MO.getReg());
- if (Kill && RDA->getNumUses(Kill, MO.getReg()) == 1)
- Killed.push_back(Kill);
- }
- }
- for (auto *MI : Dead)
- MI->eraseFromParent();
- }
- }
-
// If we're inserting at a mov lr, then remove it as it's redundant.
if (InsertPt != Start)
- InsertPt->eraseFromParent();
- Start->eraseFromParent();
+ LoLoop.ToRemove.insert(InsertPt);
+ LoLoop.ToRemove.insert(Start);
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted start: " << *MIB);
return &*MIB;
}
-// Goal is to optimise and clean-up these loops:
-//
-// vector.body:
-// renamable $vpr = MVE_VCTP32 renamable $r3, 0, $noreg
-// renamable $r3, dead $cpsr = tSUBi8 killed renamable $r3(tied-def 0), 4
-// ..
-// $lr = MVE_DLSTP_32 renamable $r3
-//
-// The SUB is the old update of the loop iteration count expression, which
-// is no longer needed. This sub is removed when the element count, which is in
-// r3 in this example, is defined by an instruction in the loop, and it has
-// no uses.
-//
-void ARMLowOverheadLoops::RemoveLoopUpdate(LowOverheadLoop &LoLoop) {
- Register ElemCount = LoLoop.VCTP->getOperand(1).getReg();
- MachineInstr *LastInstrInBlock = &LoLoop.VCTP->getParent()->back();
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Trying to remove loop update stmt\n");
-
- if (LoLoop.ML->getNumBlocks() != 1) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Single block loop expected\n");
- return;
- }
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Analyzing elemcount in operand: ";
- LoLoop.VCTP->getOperand(1).dump());
-
- // Find the definition we are interested in removing, if there is one.
- MachineInstr *Def = RDA->getReachingMIDef(LastInstrInBlock, ElemCount);
- if (!Def) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't find a def, nothing to do.\n");
- return;
- }
-
- // Bail if we define CPSR and it is not dead
- if (!Def->registerDefIsDead(ARM::CPSR, TRI)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: CPSR is not dead\n");
- return;
- }
-
- // Bail if elemcount is used in exit blocks, i.e. if it is live-in.
- if (isRegLiveInExitBlocks(LoLoop.ML, ElemCount)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Elemcount is live-out, can't remove stmt\n");
- return;
- }
+void ARMLowOverheadLoops::FixupReductions(LowOverheadLoop &LoLoop) const {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Fixing up reduction(s).\n");
+ auto BuildMov = [this](MachineInstr &InsertPt, Register To, Register From) {
+ MachineBasicBlock *MBB = InsertPt.getParent();
+ MachineInstrBuilder MIB =
+ BuildMI(*MBB, &InsertPt, InsertPt.getDebugLoc(), TII->get(ARM::MVE_VORR));
+ MIB.addDef(To);
+ MIB.addReg(From);
+ MIB.addReg(From);
+ MIB.addImm(0);
+ MIB.addReg(0);
+ MIB.addReg(To);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Inserted VMOV: " << *MIB);
+ };
- // Bail if there are uses after this Def in the block.
- SmallVector<MachineInstr*, 4> Uses;
- RDA->getReachingLocalUses(Def, ElemCount, Uses);
- if (Uses.size()) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Local uses in block, can't remove stmt\n");
- return;
- }
+ for (auto &Reduction : LoLoop.Reductions) {
+ MachineInstr &Copy = Reduction->Copy;
+ MachineInstr &Reduce = Reduction->Reduce;
+ Register DestReg = Copy.getOperand(0).getReg();
- Uses.clear();
- RDA->getAllInstWithUseBefore(Def, ElemCount, Uses);
+ // Change the initialiser if present
+ if (Reduction->Init) {
+ MachineInstr *Init = Reduction->Init;
- // Remove Def if there are no uses, or if the only use is the VCTP
- // instruction.
- if (!Uses.size() || (Uses.size() == 1 && Uses[0] == LoLoop.VCTP)) {
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing loop update instruction: ";
- Def->dump());
- Def->eraseFromParent();
- return;
+ for (unsigned i = 0; i < Init->getNumOperands(); ++i) {
+ MachineOperand &MO = Init->getOperand(i);
+ if (MO.isReg() && MO.isUse() && MO.isTied() &&
+ Init->findTiedOperandIdx(i) == 0)
+ Init->getOperand(i).setReg(DestReg);
+ }
+ Init->getOperand(0).setReg(DestReg);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Changed init regs: " << *Init);
+ } else
+ BuildMov(LoLoop.Preheader->instr_back(), DestReg, Copy.getOperand(1).getReg());
+
+ // Change the reducing op to write to the register that is used to copy
+ // its value on the next iteration. Also update the tied-def operand.
+ Reduce.getOperand(0).setReg(DestReg);
+ Reduce.getOperand(5).setReg(DestReg);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Changed reduction regs: " << Reduce);
+
+ // Instead of a vpsel, just copy the register into the necessary one.
+ MachineInstr &VPSEL = Reduction->VPSEL;
+ if (VPSEL.getOperand(0).getReg() != DestReg)
+ BuildMov(VPSEL, VPSEL.getOperand(0).getReg(), DestReg);
+
+ // Remove the unnecessary instructions.
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing:\n"
+ << " - " << Copy
+ << " - " << VPSEL << "\n");
+ Copy.eraseFromParent();
+ VPSEL.eraseFromParent();
}
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Can't remove loop update, it's used by:\n";
- for (auto U : Uses) U->dump());
}
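
Editorial aside, not part of the patch: the VMOV built in BuildMov above is expressed as a VORR of a register with itself, which is a value-preserving identity; the trivial check below illustrates the underlying bitwise fact.

#include <cassert>
#include <cstdint>

int main() {
  uint32_t Lane = 0xDEADBEEFu;
  assert((Lane | Lane) == Lane); // OR-ing a value with itself copies it
  return 0;
}
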
void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
@@ -893,28 +1415,24 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
};
// There are a few scenarios which we have to fix up:
- // 1) A VPT block with is only predicated by the vctp and has no internal vpr
- // defs.
- // 2) A VPT block which is only predicated by the vctp but has an internal
- // vpr def.
- // 3) A VPT block which is predicated upon the vctp as well as another vpr
- // def.
- // 4) A VPT block which is not predicated upon a vctp, but contains it and
- // all instructions within the block are predicated upon in.
-
+ // 1. VPT Blocks with non-uniform predicates:
+ // - a. When the divergent instruction is a vctp
+ // - b. When the block uses a vpst, and is only predicated on the vctp
+ // - c. When the block uses a vpt and (optionally) contains one or more
+ // vctp.
+ // 2. VPT Blocks with uniform predicates:
+ // - a. The block uses a vpst, and is only predicated on the vctp
for (auto &Block : LoLoop.getVPTBlocks()) {
SmallVectorImpl<PredicatedMI> &Insts = Block.getInsts();
if (Block.HasNonUniformPredicate()) {
PredicatedMI *Divergent = Block.getDivergent();
if (isVCTP(Divergent->MI)) {
- // The vctp will be removed, so the size of the vpt block needs to be
- // modified.
- uint64_t Size = getARMVPTBlockMask(Block.size() - 1);
- Block.getVPST()->getOperand(0).setImm(Size);
- LLVM_DEBUG(dbgs() << "ARM Loops: Modified VPT block mask.\n");
- } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
- // The VPT block has a non-uniform predicate but it's entry is guarded
- // only by a vctp, which means we:
+ // The vctp will be removed, so the block mask of the vp(s)t will need
+ // to be recomputed.
+ LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
+ } else if (Block.isVPST() && Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
+ // The VPT block has a non-uniform predicate but it uses a vpst and its
+ // entry is guarded only by a vctp, which means we:
// - Need to remove the original vpst.
// - Then need to unpredicate any following instructions, until
// we come across the divergent vpr def.
@@ -922,7 +1440,7 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
// the divergent vpr def.
// TODO: We could be producing more VPT blocks than necessary and could
   //       fold the newly created one into a preceding one.
- for (auto I = ++MachineBasicBlock::iterator(Block.getVPST()),
+ for (auto I = ++MachineBasicBlock::iterator(Block.getPredicateThen()),
E = ++MachineBasicBlock::iterator(Divergent->MI); I != E; ++I)
RemovePredicate(&*I);
@@ -935,28 +1453,58 @@ void ARMLowOverheadLoops::ConvertVPTBlocks(LowOverheadLoop &LoLoop) {
++Size;
++I;
}
+ // Create a VPST (with a null mask for now, we'll recompute it later).
MachineInstrBuilder MIB = BuildMI(*InsertAt->getParent(), InsertAt,
InsertAt->getDebugLoc(),
TII->get(ARM::MVE_VPST));
- MIB.addImm(getARMVPTBlockMask(Size));
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
+ MIB.addImm(0);
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
LLVM_DEBUG(dbgs() << "ARM Loops: Created VPST: " << *MIB);
- Block.getVPST()->eraseFromParent();
+ LoLoop.ToRemove.insert(Block.getPredicateThen());
+ LoLoop.BlockMasksToRecompute.insert(MIB.getInstr());
+ }
+ // Else, if the block uses a vpt, iterate over the block, removing the
+ // extra VCTPs it may contain.
+ else if (Block.isVPT()) {
+ bool RemovedVCTP = false;
+ for (PredicatedMI &Elt : Block.getInsts()) {
+ MachineInstr *MI = Elt.MI;
+ if (isVCTP(MI)) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *MI);
+ LoLoop.ToRemove.insert(MI);
+ RemovedVCTP = true;
+ continue;
+ }
+ }
+ if (RemovedVCTP)
+ LoLoop.BlockMasksToRecompute.insert(Block.getPredicateThen());
}
- } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP)) {
- // A vpt block which is only predicated upon vctp and has no internal vpr
- // defs:
+ } else if (Block.IsOnlyPredicatedOn(LoLoop.VCTP) && Block.isVPST()) {
+ // A vpt block starting with VPST, is only predicated upon vctp and has no
+ // internal vpr defs:
// - Remove vpst.
// - Unpredicate the remaining instructions.
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getVPST());
- Block.getVPST()->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing VPST: " << *Block.getPredicateThen());
+ LoLoop.ToRemove.insert(Block.getPredicateThen());
for (auto &PredMI : Insts)
RemovePredicate(PredMI.MI);
}
}
-
- LLVM_DEBUG(dbgs() << "ARM Loops: Removing VCTP: " << *LoLoop.VCTP);
- LoLoop.VCTP->eraseFromParent();
+ LLVM_DEBUG(dbgs() << "ARM Loops: Removing remaining VCTPs...\n");
+ // Remove the "main" VCTP
+ LoLoop.ToRemove.insert(LoLoop.VCTP);
+ LLVM_DEBUG(dbgs() << " " << *LoLoop.VCTP);
+ // Remove remaining secondary VCTPs
+ for (MachineInstr *VCTP : LoLoop.SecondaryVCTPs) {
+ // All VCTPs that aren't marked for removal yet should be unpredicated ones.
+ // The predicated ones should have already been marked for removal when
+ // visiting the VPT blocks.
+ if (LoLoop.ToRemove.insert(VCTP).second) {
+ assert(getVPTInstrPredicate(*VCTP) == ARMVCC::None &&
+ "Removing Predicated VCTP without updating the block mask!");
+ LLVM_DEBUG(dbgs() << " " << *VCTP);
+ }
+ }
}
void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
@@ -973,9 +1521,8 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
MIB.add(End->getOperand(0));
MIB.add(End->getOperand(1));
LLVM_DEBUG(dbgs() << "ARM Loops: Inserted LE: " << *MIB);
-
- LoLoop.End->eraseFromParent();
- LoLoop.Dec->eraseFromParent();
+ LoLoop.ToRemove.insert(LoLoop.Dec);
+ LoLoop.ToRemove.insert(End);
return &*MIB;
};
@@ -1001,7 +1548,7 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
RevertWhile(LoLoop.Start);
else
LoLoop.Start->eraseFromParent();
- bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec, true);
+ bool FlagsAlreadySet = RevertLoopDec(LoLoop.Dec);
RevertLoopEnd(LoLoop.End, FlagsAlreadySet);
} else {
LoLoop.Start = ExpandLoopStart(LoLoop);
@@ -1009,10 +1556,35 @@ void ARMLowOverheadLoops::Expand(LowOverheadLoop &LoLoop) {
LoLoop.End = ExpandLoopEnd(LoLoop);
RemoveDeadBranch(LoLoop.End);
if (LoLoop.IsTailPredicationLegal()) {
- RemoveLoopUpdate(LoLoop);
ConvertVPTBlocks(LoLoop);
+ FixupReductions(LoLoop);
+ }
+ for (auto *I : LoLoop.ToRemove) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Erasing " << *I);
+ I->eraseFromParent();
+ }
+ for (auto *I : LoLoop.BlockMasksToRecompute) {
+ LLVM_DEBUG(dbgs() << "ARM Loops: Recomputing VPT/VPST Block Mask: " << *I);
+ recomputeVPTBlockMask(*I);
+ LLVM_DEBUG(dbgs() << " ... done: " << *I);
}
}
+
+ PostOrderLoopTraversal DFS(LoLoop.ML, *MLI);
+ DFS.ProcessLoop();
+ const SmallVectorImpl<MachineBasicBlock*> &PostOrder = DFS.getOrder();
+ for (auto *MBB : PostOrder) {
+ recomputeLiveIns(*MBB);
+ // FIXME: For some reason, the live-in print order is non-deterministic for
+    // our tests and I can't work out why... So just sort them.
+ MBB->sortUniqueLiveIns();
+ }
+
+ for (auto *MBB : reverse(PostOrder))
+ recomputeLivenessFlags(*MBB);
+
+ // We've moved, removed and inserted new instructions, so update RDA.
+ RDA->reset();
}
bool ARMLowOverheadLoops::RevertNonLoops() {
diff --git a/llvm/lib/Target/ARM/ARMMCInstLower.cpp b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
index c92689f4942e..f893faa4cf97 100644
--- a/llvm/lib/Target/ARM/ARMMCInstLower.cpp
+++ b/llvm/lib/Target/ARM/ARMMCInstLower.cpp
@@ -194,9 +194,9 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
// BLX ip
// POP{ r0, lr }
//
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Emit "B #20" instruction, which jumps over the next 24 bytes (because
@@ -207,13 +207,10 @@ void ARMAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind)
EmitToStreamer(*OutStreamer, MCInstBuilder(ARM::Bcc).addImm(20)
.addImm(ARMCC::AL).addReg(0));
- MCInst Noop;
- Subtarget->getInstrInfo()->getNoop(Noop);
- for (int8_t I = 0; I < NoopsInSledCount; I++)
- OutStreamer->EmitInstruction(Noop, getSubtargetInfo());
+ emitNops(NoopsInSledCount);
- OutStreamer->EmitLabel(Target);
- recordSled(CurSled, MI, Kind);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, Kind, 2);
}
void ARMAsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI)
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
index 3b676ca4c883..507c3e69b3a4 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.cpp
@@ -15,4 +15,6 @@ void ARMFunctionInfo::anchor() {}
ARMFunctionInfo::ARMFunctionInfo(MachineFunction &MF)
: isThumb(MF.getSubtarget<ARMSubtarget>().isThumb()),
- hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()) {}
+ hasThumb2(MF.getSubtarget<ARMSubtarget>().hasThumb2()),
+ IsCmseNSEntry(MF.getFunction().hasFnAttribute("cmse_nonsecure_entry")),
+ IsCmseNSCall(MF.getFunction().hasFnAttribute("cmse_nonsecure_call")) {}
diff --git a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
index bb136e92329b..298c8a238987 100644
--- a/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
+++ b/llvm/lib/Target/ARM/ARMMachineFunctionInfo.h
@@ -58,10 +58,6 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// emitPrologue.
bool RestoreSPFromFP = false;
- /// LRSpilledForFarJump - True if the LR register has been for spilled to
- /// enable far jump.
- bool LRSpilledForFarJump = false;
-
/// LRSpilled - True if the LR register has been for spilled for
/// any reason, so it's legal to emit an ARM::tBfar (i.e. "bl").
bool LRSpilled = false;
@@ -87,6 +83,7 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// GPRCS1Size, GPRCS2Size, DPRCSSize - Sizes of callee saved register spills
/// areas.
+ unsigned FPCXTSaveSize = 0;
unsigned GPRCS1Size = 0;
unsigned GPRCS2Size = 0;
unsigned DPRCSAlignGapSize = 0;
@@ -109,6 +106,10 @@ class ARMFunctionInfo : public MachineFunctionInfo {
/// HasITBlocks - True if IT blocks have been inserted.
bool HasITBlocks = false;
+ // Security Extensions
+ bool IsCmseNSEntry;
+ bool IsCmseNSCall;
+
/// CPEClones - Track constant pool entries clones created by Constant Island
/// pass.
DenseMap<unsigned, unsigned> CPEClones;
@@ -144,6 +145,9 @@ public:
bool isThumb1OnlyFunction() const { return isThumb && !hasThumb2; }
bool isThumb2Function() const { return isThumb && hasThumb2; }
+ bool isCmseNSEntryFunction() const { return IsCmseNSEntry; }
+ bool isCmseNSCallFunction() const { return IsCmseNSCall; }
+
unsigned getStoredByValParamsPadding() const { return StByValParamsPadding; }
void setStoredByValParamsPadding(unsigned p) { StByValParamsPadding = p; }
@@ -162,9 +166,6 @@ public:
bool isLRSpilled() const { return LRSpilled; }
void setLRIsSpilled(bool s) { LRSpilled = s; }
- bool isLRSpilledForFarJump() const { return LRSpilledForFarJump; }
- void setLRIsSpilledForFarJump(bool s) { LRSpilledForFarJump = s; }
-
unsigned getFramePtrSpillOffset() const { return FramePtrSpillOffset; }
void setFramePtrSpillOffset(unsigned o) { FramePtrSpillOffset = o; }
@@ -179,11 +180,13 @@ public:
void setGPRCalleeSavedArea2Offset(unsigned o) { GPRCS2Offset = o; }
void setDPRCalleeSavedAreaOffset(unsigned o) { DPRCSOffset = o; }
+ unsigned getFPCXTSaveAreaSize() const { return FPCXTSaveSize; }
unsigned getGPRCalleeSavedArea1Size() const { return GPRCS1Size; }
unsigned getGPRCalleeSavedArea2Size() const { return GPRCS2Size; }
unsigned getDPRCalleeSavedGapSize() const { return DPRCSAlignGapSize; }
unsigned getDPRCalleeSavedAreaSize() const { return DPRCSSize; }
+ void setFPCXTSaveAreaSize(unsigned s) { FPCXTSaveSize = s; }
void setGPRCalleeSavedArea1Size(unsigned s) { GPRCS1Size = s; }
void setGPRCalleeSavedArea2Size(unsigned s) { GPRCS2Size = s; }
void setDPRCalleeSavedGapSize(unsigned s) { DPRCSAlignGapSize = s; }
@@ -252,6 +255,7 @@ public:
}
DenseMap<unsigned, unsigned> EHPrologueRemappedRegs;
+ DenseMap<unsigned, unsigned> EHPrologueOffsetInRegs;
void setPreservesR0() { PreservesR0 = true; }
bool getPreservesR0() const { return PreservesR0; }
diff --git a/llvm/lib/Target/ARM/ARMParallelDSP.cpp b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
index e2c9335db419..e750649ce86c 100644
--- a/llvm/lib/Target/ARM/ARMParallelDSP.cpp
+++ b/llvm/lib/Target/ARM/ARMParallelDSP.cpp
@@ -19,8 +19,9 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
@@ -28,7 +29,6 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -352,7 +352,6 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
SmallVector<Instruction*, 8> Writes;
LoadPairs.clear();
WideLoads.clear();
- OrderedBasicBlock OrderedBB(BB);
// Collect loads and instruction that may write to memory. For now we only
// record loads which are simple, sign-extended and have a single user.
@@ -384,7 +383,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
if (!isModOrRefSet(intersectModRef(AA->getModRefInfo(Write, ReadLoc),
ModRefInfo::ModRef)))
continue;
- if (OrderedBB.dominates(Write, Read))
+ if (Write->comesBefore(Read))
RAWDeps[Read].insert(Write);
}
}
@@ -392,8 +391,9 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
// Check whether there's not a write between the two loads which would
// prevent them from being safely merged.
auto SafeToPair = [&](LoadInst *Base, LoadInst *Offset) {
- LoadInst *Dominator = OrderedBB.dominates(Base, Offset) ? Base : Offset;
- LoadInst *Dominated = OrderedBB.dominates(Base, Offset) ? Offset : Base;
+ bool BaseFirst = Base->comesBefore(Offset);
+ LoadInst *Dominator = BaseFirst ? Base : Offset;
+ LoadInst *Dominated = BaseFirst ? Offset : Base;
if (RAWDeps.count(Dominated)) {
InstSet &WritesBefore = RAWDeps[Dominated];
@@ -401,7 +401,7 @@ bool ARMParallelDSP::RecordMemoryOps(BasicBlock *BB) {
for (auto Before : WritesBefore) {
// We can't move the second load backward, past a write, to merge
// with the first load.
- if (OrderedBB.dominates(Dominator, Before))
+ if (Dominator->comesBefore(Before))
return false;
}
}
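
Editorial sketch of the pairing rule in SafeToPair, using plain program-order positions instead of Instruction::comesBefore and not part of the patch: the later load may only be merged up to the earlier one if none of the writes it depends on sits between the two loads. Names are illustrative.

#include <iostream>
#include <vector>

struct Inst {
  int Pos; // position in program order within the basic block
};

static bool comesBefore(const Inst &A, const Inst &B) { return A.Pos < B.Pos; }

// Mirror of SafeToPair: reject the merge if any write the later load depends
// on comes after the earlier ("dominating") load, i.e. lies between the loads.
static bool safeToPair(const Inst &Base, const Inst &Offset,
                       const std::vector<Inst> &WritesBeforeLater) {
  const Inst &First = comesBefore(Base, Offset) ? Base : Offset;
  for (const Inst &W : WritesBeforeLater)
    if (comesBefore(First, W))
      return false;
  return true;
}

int main() {
  Inst Load0{2}, Load1{6};
  std::cout << std::boolalpha
            << safeToPair(Load0, Load1, {Inst{4}}) << '\n'  // false: store between
            << safeToPair(Load0, Load1, {Inst{1}}) << '\n'; // true: store before both
  return 0;
}
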
@@ -571,6 +571,10 @@ bool ARMParallelDSP::CreateParallelPairs(Reduction &R) {
auto Ld2 = static_cast<LoadInst*>(PMul0->RHS);
auto Ld3 = static_cast<LoadInst*>(PMul1->RHS);
+ // Check that each mul is operating on two different loads.
+ if (Ld0 == Ld2 || Ld1 == Ld3)
+ return false;
+
if (AreSequentialLoads(Ld0, Ld1, PMul0->VecLd)) {
if (AreSequentialLoads(Ld2, Ld3, PMul1->VecLd)) {
LLVM_DEBUG(dbgs() << "OK: found two pairs of parallel loads!\n");
@@ -705,12 +709,11 @@ void ARMParallelDSP::InsertParallelMACs(Reduction &R) {
}
// Roughly sort the mul pairs in their program order.
- OrderedBasicBlock OrderedBB(R.getRoot()->getParent());
- llvm::sort(R.getMulPairs(), [&OrderedBB](auto &PairA, auto &PairB) {
- const Instruction *A = PairA.first->Root;
- const Instruction *B = PairB.first->Root;
- return OrderedBB.dominates(A, B);
- });
+ llvm::sort(R.getMulPairs(), [](auto &PairA, auto &PairB) {
+ const Instruction *A = PairA.first->Root;
+ const Instruction *B = PairB.first->Root;
+ return A->comesBefore(B);
+ });
IntegerType *Ty = IntegerType::get(M->getContext(), 32);
for (auto &Pair : R.getMulPairs()) {
@@ -772,8 +775,7 @@ LoadInst* ARMParallelDSP::CreateWideLoad(MemInstList &Loads,
const unsigned AddrSpace = DomLoad->getPointerAddressSpace();
Value *VecPtr = IRB.CreateBitCast(Base->getPointerOperand(),
LoadTy->getPointerTo(AddrSpace));
- LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr,
- Base->getAlignment());
+ LoadInst *WideLoad = IRB.CreateAlignedLoad(LoadTy, VecPtr, Base->getAlign());
// Make sure everything is in the correct order in the basic block.
MoveBefore(Base->getPointerOperand(), VecPtr);
diff --git a/llvm/lib/Target/ARM/ARMPredicates.td b/llvm/lib/Target/ARM/ARMPredicates.td
index dea1d767beb4..1ae71be9f760 100644
--- a/llvm/lib/Target/ARM/ARMPredicates.td
+++ b/llvm/lib/Target/ARM/ARMPredicates.td
@@ -7,148 +7,160 @@
//===----------------------------------------------------------------------===//
def HasV4T : Predicate<"Subtarget->hasV4TOps()">,
- AssemblerPredicate<"HasV4TOps", "armv4t">;
+ AssemblerPredicate<(all_of HasV4TOps), "armv4t">;
def NoV4T : Predicate<"!Subtarget->hasV4TOps()">;
def HasV5T : Predicate<"Subtarget->hasV5TOps()">,
- AssemblerPredicate<"HasV5TOps", "armv5t">;
+ AssemblerPredicate<(all_of HasV5TOps), "armv5t">;
def NoV5T : Predicate<"!Subtarget->hasV5TOps()">;
def HasV5TE : Predicate<"Subtarget->hasV5TEOps()">,
- AssemblerPredicate<"HasV5TEOps", "armv5te">;
+ AssemblerPredicate<(all_of HasV5TEOps), "armv5te">;
def HasV6 : Predicate<"Subtarget->hasV6Ops()">,
- AssemblerPredicate<"HasV6Ops", "armv6">;
+ AssemblerPredicate<(all_of HasV6Ops), "armv6">;
def NoV6 : Predicate<"!Subtarget->hasV6Ops()">;
def HasV6M : Predicate<"Subtarget->hasV6MOps()">,
- AssemblerPredicate<"HasV6MOps",
+ AssemblerPredicate<(all_of HasV6MOps),
"armv6m or armv6t2">;
def HasV8MBaseline : Predicate<"Subtarget->hasV8MBaselineOps()">,
- AssemblerPredicate<"HasV8MBaselineOps",
+ AssemblerPredicate<(all_of HasV8MBaselineOps),
"armv8m.base">;
def HasV8MMainline : Predicate<"Subtarget->hasV8MMainlineOps()">,
- AssemblerPredicate<"HasV8MMainlineOps",
+ AssemblerPredicate<(all_of HasV8MMainlineOps),
"armv8m.main">;
def HasV8_1MMainline : Predicate<"Subtarget->hasV8_1MMainlineOps()">,
- AssemblerPredicate<"HasV8_1MMainlineOps",
+ AssemblerPredicate<(all_of HasV8_1MMainlineOps),
"armv8.1m.main">;
def HasMVEInt : Predicate<"Subtarget->hasMVEIntegerOps()">,
- AssemblerPredicate<"HasMVEIntegerOps",
+ AssemblerPredicate<(all_of HasMVEIntegerOps),
"mve">;
def HasMVEFloat : Predicate<"Subtarget->hasMVEFloatOps()">,
- AssemblerPredicate<"HasMVEFloatOps",
+ AssemblerPredicate<(all_of HasMVEFloatOps),
"mve.fp">;
+def HasCDE : Predicate<"Subtarget->hasCDEOps()">,
+ AssemblerPredicate<(all_of HasCDEOps),
+ "cde">;
def HasFPRegs : Predicate<"Subtarget->hasFPRegs()">,
- AssemblerPredicate<"FeatureFPRegs",
+ AssemblerPredicate<(all_of FeatureFPRegs),
"fp registers">;
def HasFPRegs16 : Predicate<"Subtarget->hasFPRegs16()">,
- AssemblerPredicate<"FeatureFPRegs16",
+ AssemblerPredicate<(all_of FeatureFPRegs16),
+ "16-bit fp registers">;
+def HasNoFPRegs16 : Predicate<"!Subtarget->hasFPRegs16()">,
+ AssemblerPredicate<(all_of (not FeatureFPRegs16)),
"16-bit fp registers">;
def HasFPRegs64 : Predicate<"Subtarget->hasFPRegs64()">,
- AssemblerPredicate<"FeatureFPRegs64",
+ AssemblerPredicate<(all_of FeatureFPRegs64),
"64-bit fp registers">;
def HasFPRegsV8_1M : Predicate<"Subtarget->hasFPRegs() && Subtarget->hasV8_1MMainlineOps()">,
- AssemblerPredicate<"FeatureFPRegs,HasV8_1MMainlineOps",
+ AssemblerPredicate<(all_of FeatureFPRegs, HasV8_1MMainlineOps),
"armv8.1m.main with FP or MVE">;
def HasV6T2 : Predicate<"Subtarget->hasV6T2Ops()">,
- AssemblerPredicate<"HasV6T2Ops", "armv6t2">;
+ AssemblerPredicate<(all_of HasV6T2Ops), "armv6t2">;
def NoV6T2 : Predicate<"!Subtarget->hasV6T2Ops()">;
def HasV6K : Predicate<"Subtarget->hasV6KOps()">,
- AssemblerPredicate<"HasV6KOps", "armv6k">;
+ AssemblerPredicate<(all_of HasV6KOps), "armv6k">;
def NoV6K : Predicate<"!Subtarget->hasV6KOps()">;
def HasV7 : Predicate<"Subtarget->hasV7Ops()">,
- AssemblerPredicate<"HasV7Ops", "armv7">;
+ AssemblerPredicate<(all_of HasV7Ops), "armv7">;
def HasV8 : Predicate<"Subtarget->hasV8Ops()">,
- AssemblerPredicate<"HasV8Ops", "armv8">;
+ AssemblerPredicate<(all_of HasV8Ops), "armv8">;
def PreV8 : Predicate<"!Subtarget->hasV8Ops()">,
- AssemblerPredicate<"!HasV8Ops", "armv7 or earlier">;
+ AssemblerPredicate<(all_of (not HasV8Ops)), "armv7 or earlier">;
def HasV8_1a : Predicate<"Subtarget->hasV8_1aOps()">,
- AssemblerPredicate<"HasV8_1aOps", "armv8.1a">;
+ AssemblerPredicate<(all_of HasV8_1aOps), "armv8.1a">;
def HasV8_2a : Predicate<"Subtarget->hasV8_2aOps()">,
- AssemblerPredicate<"HasV8_2aOps", "armv8.2a">;
+ AssemblerPredicate<(all_of HasV8_2aOps), "armv8.2a">;
def HasV8_3a : Predicate<"Subtarget->hasV8_3aOps()">,
- AssemblerPredicate<"HasV8_3aOps", "armv8.3a">;
+ AssemblerPredicate<(all_of HasV8_3aOps), "armv8.3a">;
def HasV8_4a : Predicate<"Subtarget->hasV8_4aOps()">,
- AssemblerPredicate<"HasV8_4aOps", "armv8.4a">;
+ AssemblerPredicate<(all_of HasV8_4aOps), "armv8.4a">;
def HasV8_5a : Predicate<"Subtarget->hasV8_5aOps()">,
- AssemblerPredicate<"HasV8_5aOps", "armv8.5a">;
+ AssemblerPredicate<(all_of HasV8_5aOps), "armv8.5a">;
+def HasV8_6a : Predicate<"Subtarget->hasV8_6aOps()">,
+ AssemblerPredicate<(all_of HasV8_6aOps), "armv8.6a">;
def NoVFP : Predicate<"!Subtarget->hasVFP2Base()">;
def HasVFP2 : Predicate<"Subtarget->hasVFP2Base()">,
- AssemblerPredicate<"FeatureVFP2_SP", "VFP2">;
+ AssemblerPredicate<(all_of FeatureVFP2_SP), "VFP2">;
def HasVFP3 : Predicate<"Subtarget->hasVFP3Base()">,
- AssemblerPredicate<"FeatureVFP3_D16_SP", "VFP3">;
+ AssemblerPredicate<(all_of FeatureVFP3_D16_SP), "VFP3">;
def HasVFP4 : Predicate<"Subtarget->hasVFP4Base()">,
- AssemblerPredicate<"FeatureVFP4_D16_SP", "VFP4">;
+ AssemblerPredicate<(all_of FeatureVFP4_D16_SP), "VFP4">;
def HasDPVFP : Predicate<"Subtarget->hasFP64()">,
- AssemblerPredicate<"FeatureFP64",
+ AssemblerPredicate<(all_of FeatureFP64),
"double precision VFP">;
def HasFPARMv8 : Predicate<"Subtarget->hasFPARMv8Base()">,
- AssemblerPredicate<"FeatureFPARMv8_D16_SP", "FPARMv8">;
+ AssemblerPredicate<(all_of FeatureFPARMv8_D16_SP), "FPARMv8">;
def HasNEON : Predicate<"Subtarget->hasNEON()">,
- AssemblerPredicate<"FeatureNEON", "NEON">;
+ AssemblerPredicate<(all_of FeatureNEON), "NEON">;
def HasSHA2 : Predicate<"Subtarget->hasSHA2()">,
- AssemblerPredicate<"FeatureSHA2", "sha2">;
+ AssemblerPredicate<(all_of FeatureSHA2), "sha2">;
def HasAES : Predicate<"Subtarget->hasAES()">,
- AssemblerPredicate<"FeatureAES", "aes">;
+ AssemblerPredicate<(all_of FeatureAES), "aes">;
def HasCrypto : Predicate<"Subtarget->hasCrypto()">,
- AssemblerPredicate<"FeatureCrypto", "crypto">;
+ AssemblerPredicate<(all_of FeatureCrypto), "crypto">;
def HasDotProd : Predicate<"Subtarget->hasDotProd()">,
- AssemblerPredicate<"FeatureDotProd", "dotprod">;
+ AssemblerPredicate<(all_of FeatureDotProd), "dotprod">;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<"FeatureCRC", "crc">;
+ AssemblerPredicate<(all_of FeatureCRC), "crc">;
def HasRAS : Predicate<"Subtarget->hasRAS()">,
- AssemblerPredicate<"FeatureRAS", "ras">;
+ AssemblerPredicate<(all_of FeatureRAS), "ras">;
def HasLOB : Predicate<"Subtarget->hasLOB()">,
- AssemblerPredicate<"FeatureLOB", "lob">;
+ AssemblerPredicate<(all_of FeatureLOB), "lob">;
def HasFP16 : Predicate<"Subtarget->hasFP16()">,
- AssemblerPredicate<"FeatureFP16","half-float conversions">;
+ AssemblerPredicate<(all_of FeatureFP16),"half-float conversions">;
def HasFullFP16 : Predicate<"Subtarget->hasFullFP16()">,
- AssemblerPredicate<"FeatureFullFP16","full half-float">;
+ AssemblerPredicate<(all_of FeatureFullFP16),"full half-float">;
def HasFP16FML : Predicate<"Subtarget->hasFP16FML()">,
- AssemblerPredicate<"FeatureFP16FML","full half-float fml">;
+ AssemblerPredicate<(all_of FeatureFP16FML),"full half-float fml">;
+def HasBF16 : Predicate<"Subtarget->hasBF16()">,
+ AssemblerPredicate<(all_of FeatureBF16),"BFloat16 floating point extension">;
+def HasMatMulInt8 : Predicate<"Subtarget->hasMatMulInt8()">,
+ AssemblerPredicate<(all_of FeatureMatMulInt8),"8-bit integer matrix multiply">;
def HasDivideInThumb : Predicate<"Subtarget->hasDivideInThumbMode()">,
- AssemblerPredicate<"FeatureHWDivThumb", "divide in THUMB">;
+ AssemblerPredicate<(all_of FeatureHWDivThumb), "divide in THUMB">;
def HasDivideInARM : Predicate<"Subtarget->hasDivideInARMMode()">,
- AssemblerPredicate<"FeatureHWDivARM", "divide in ARM">;
+ AssemblerPredicate<(all_of FeatureHWDivARM), "divide in ARM">;
def HasDSP : Predicate<"Subtarget->hasDSP()">,
- AssemblerPredicate<"FeatureDSP", "dsp">;
+ AssemblerPredicate<(all_of FeatureDSP), "dsp">;
def HasDB : Predicate<"Subtarget->hasDataBarrier()">,
- AssemblerPredicate<"FeatureDB",
+ AssemblerPredicate<(all_of FeatureDB),
"data-barriers">;
def HasDFB : Predicate<"Subtarget->hasFullDataBarrier()">,
- AssemblerPredicate<"FeatureDFB",
+ AssemblerPredicate<(all_of FeatureDFB),
"full-data-barrier">;
def HasV7Clrex : Predicate<"Subtarget->hasV7Clrex()">,
- AssemblerPredicate<"FeatureV7Clrex",
+ AssemblerPredicate<(all_of FeatureV7Clrex),
"v7 clrex">;
def HasAcquireRelease : Predicate<"Subtarget->hasAcquireRelease()">,
- AssemblerPredicate<"FeatureAcquireRelease",
+ AssemblerPredicate<(all_of FeatureAcquireRelease),
"acquire/release">;
def HasMP : Predicate<"Subtarget->hasMPExtension()">,
- AssemblerPredicate<"FeatureMP",
+ AssemblerPredicate<(all_of FeatureMP),
"mp-extensions">;
def HasVirtualization: Predicate<"false">,
- AssemblerPredicate<"FeatureVirtualization",
+ AssemblerPredicate<(all_of FeatureVirtualization),
"virtualization-extensions">;
def HasTrustZone : Predicate<"Subtarget->hasTrustZone()">,
- AssemblerPredicate<"FeatureTrustZone",
+ AssemblerPredicate<(all_of FeatureTrustZone),
"TrustZone">;
def Has8MSecExt : Predicate<"Subtarget->has8MSecExt()">,
- AssemblerPredicate<"Feature8MSecExt",
+ AssemblerPredicate<(all_of Feature8MSecExt),
"ARMv8-M Security Extensions">;
def HasZCZ : Predicate<"Subtarget->hasZeroCycleZeroing()">;
def UseNEONForFP : Predicate<"Subtarget->useNEONForSinglePrecisionFP()">;
def DontUseNEONForFP : Predicate<"!Subtarget->useNEONForSinglePrecisionFP()">;
def IsThumb : Predicate<"Subtarget->isThumb()">,
- AssemblerPredicate<"ModeThumb", "thumb">;
+ AssemblerPredicate<(all_of ModeThumb), "thumb">;
def IsThumb1Only : Predicate<"Subtarget->isThumb1Only()">;
def IsThumb2 : Predicate<"Subtarget->isThumb2()">,
- AssemblerPredicate<"ModeThumb,FeatureThumb2",
+ AssemblerPredicate<(all_of ModeThumb, FeatureThumb2),
"thumb2">;
def IsMClass : Predicate<"Subtarget->isMClass()">,
- AssemblerPredicate<"FeatureMClass", "armv*m">;
+ AssemblerPredicate<(all_of FeatureMClass), "armv*m">;
def IsNotMClass : Predicate<"!Subtarget->isMClass()">,
- AssemblerPredicate<"!FeatureMClass",
+ AssemblerPredicate<(all_of (not FeatureMClass)),
"!armv*m">;
def IsARM : Predicate<"!Subtarget->isThumb()">,
- AssemblerPredicate<"!ModeThumb", "arm-mode">;
+ AssemblerPredicate<(all_of (not ModeThumb)), "arm-mode">;
def IsMachO : Predicate<"Subtarget->isTargetMachO()">;
def IsNotMachO : Predicate<"!Subtarget->isTargetMachO()">;
def IsNaCl : Predicate<"Subtarget->isTargetNaCl()">;
@@ -157,12 +169,12 @@ def IsNotWindows : Predicate<"!Subtarget->isTargetWindows()">;
def IsReadTPHard : Predicate<"Subtarget->isReadTPHard()">;
def IsReadTPSoft : Predicate<"!Subtarget->isReadTPHard()">;
def UseNaClTrap : Predicate<"Subtarget->useNaClTrap()">,
- AssemblerPredicate<"FeatureNaClTrap", "NaCl">;
+ AssemblerPredicate<(all_of FeatureNaClTrap), "NaCl">;
def DontUseNaClTrap : Predicate<"!Subtarget->useNaClTrap()">;
def UseNegativeImmediates :
Predicate<"false">,
- AssemblerPredicate<"!FeatureNoNegativeImmediates",
+ AssemblerPredicate<(all_of (not FeatureNoNegativeImmediates)),
"NegativeImmediates">;
// FIXME: Eventually this will be just "hasV6T2Ops".
@@ -206,4 +218,4 @@ def GenExecuteOnly : Predicate<"Subtarget->genExecuteOnly()">;
// Armv8.5-A extensions
def HasSB : Predicate<"Subtarget->hasSB()">,
- AssemblerPredicate<"FeatureSB", "sb">;
+ AssemblerPredicate<(all_of FeatureSB), "sb">;
diff --git a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
index 43c8cd5a89be..f9dbfef4c113 100644
--- a/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMRegisterBankInfo.cpp
@@ -131,45 +131,47 @@ static void checkValueMappings() {
ARMRegisterBankInfo::ARMRegisterBankInfo(const TargetRegisterInfo &TRI)
: ARMGenRegisterBankInfo() {
- static bool AlreadyInit = false;
// We have only one set of register banks, whatever the subtarget
// is. Therefore, the initialization of the RegBanks table should be
// done only once. Indeed the table of all register banks
// (ARM::RegBanks) is unique in the compiler. At some point, it
// will get tablegen'ed and the whole constructor becomes empty.
- if (AlreadyInit)
- return;
- AlreadyInit = true;
+ static llvm::once_flag InitializeRegisterBankFlag;
- const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
- (void)RBGPR;
- assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
+ static auto InitializeRegisterBankOnce = [&]() {
+ const RegisterBank &RBGPR = getRegBank(ARM::GPRRegBankID);
+ (void)RBGPR;
+ assert(&ARM::GPRRegBank == &RBGPR && "The order in RegBanks is messed up");
- // Initialize the GPR bank.
- assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
- "Subclass not added?");
- assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
+ // Initialize the GPR bank.
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRwithAPSRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::GPRnopcRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::rGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPR_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(
+ *TRI.getRegClass(ARM::tGPREven_and_tGPR_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.covers(*TRI.getRegClass(ARM::tGPROdd_and_tcGPRRegClassID)) &&
+ "Subclass not added?");
+ assert(RBGPR.getSize() == 32 && "GPRs should hold up to 32-bit");
#ifndef NDEBUG
- ARM::checkPartialMappings();
- ARM::checkValueMappings();
+ ARM::checkPartialMappings();
+ ARM::checkValueMappings();
#endif
+ };
+
+ llvm::call_once(InitializeRegisterBankFlag, InitializeRegisterBankOnce);
}
const RegisterBank &
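The constructor above swaps a hand-rolled "static bool AlreadyInit" guard for llvm::call_once, which stays correct if two threads construct the register bank info concurrently. A minimal sketch of the same pattern using the standard library equivalent; the class and helper names below are illustrative, not part of the patch:

    #include <mutex>

    class BankTable {
    public:
      BankTable() {
        // The underlying table is shared global state, so run the expensive
        // verification exactly once, even under concurrent construction.
        static std::once_flag InitFlag;
        std::call_once(InitFlag, [] { verifyLayout(); });
      }

    private:
      static void verifyLayout() {
        // layout assertions over the shared table would live here
      }
    };

llvm::call_once (llvm/Support/Threading.h) follows the same shape, which is why the asserts simply move into the once-executed lambda.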
diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td
index 56055a15483a..a384b0dc757c 100644
--- a/llvm/lib/Target/ARM/ARMRegisterInfo.td
+++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td
@@ -305,6 +305,17 @@ def rGPR : RegisterClass<"ARM", [i32], 32, (sub GPR, SP, PC)> {
let DiagnosticType = "rGPR";
}
+// GPRs without the PC and SP but with APSR_NZCV. Some instructions allow
+// accessing the APSR_NZCV, while actually encoding PC in the register field.
+// This is useful for assembly and disassembly only.
+// Currently used by the CDE extension.
+def GPRwithAPSR_NZCVnosp
+ : RegisterClass<"ARM", [i32], 32, (add (sequence "R%u", 0, 12), LR, APSR_NZCV)> {
+ let isAllocatable = 0;
+ let DiagnosticString =
+ "operand must be a register in the range [r0, r12], r14 or apsr_nzcv";
+}
+
// Thumb registers are R0-R7 normally. Some instructions can still use
// the general GPR register class above (MOV, e.g.)
def tGPR : RegisterClass<"ARM", [i32], 32, (trunc GPR, 8)> {
@@ -379,7 +390,7 @@ def SPR : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 31)> {
let DiagnosticString = "operand must be a register in range [s0, s31]";
}
-def HPR : RegisterClass<"ARM", [f16], 32, (sequence "S%u", 0, 31)> {
+def HPR : RegisterClass<"ARM", [f16, bf16], 32, (sequence "S%u", 0, 31)> {
let AltOrders = [(add (decimate HPR, 2), SPR),
(add (decimate HPR, 4),
(decimate HPR, 2),
@@ -401,7 +412,7 @@ def SPR_8 : RegisterClass<"ARM", [f32], 32, (sequence "S%u", 0, 15)> {
// class.
// ARM requires only word alignment for double. It's more performant if it
// is double-word alignment though.
-def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+def DPR : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64,
(sequence "D%u", 0, 31)> {
// Allocate non-VFP2 registers D16-D31 first, and prefer even registers on
// Darwin platforms.
@@ -422,20 +433,20 @@ def FPWithVPR : RegisterClass<"ARM", [f32], 32, (add SPR, DPR, VPR)> {
// Subset of DPR that are accessible with VFP2 (and so that also have
// 32-bit SPR subregs).
-def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+def DPR_VFP2 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64,
(trunc DPR, 16)> {
let DiagnosticString = "operand must be a register in range [d0, d15]";
}
// Subset of DPR which can be used as a source of NEON scalars for 16-bit
// operations
-def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16], 64,
+def DPR_8 : RegisterClass<"ARM", [f64, v8i8, v4i16, v2i32, v1i64, v2f32, v4f16, v4bf16], 64,
(trunc DPR, 8)> {
let DiagnosticString = "operand must be a register in range [d0, d7]";
}
// Generic 128-bit vector register class.
-def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16], 128,
+def QPR : RegisterClass<"ARM", [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64, v8f16, v8bf16], 128,
(sequence "Q%u", 0, 15)> {
// Allocate non-VFP2 aliases Q8-Q15 first.
let AltOrders = [(rotl QPR, 8), (trunc QPR, 8)];
@@ -577,3 +588,6 @@ def Tuples4DSpc : RegisterTuples<[dsub_0, dsub_2, dsub_4, dsub_6],
// Spaced quads of D registers.
def DQuadSpc : RegisterClass<"ARM", [v4i64], 64, (add Tuples3DSpc)>;
+
+// FP context payload
+def FPCXTRegs : RegisterClass<"ARM", [i32], 32, (add FPCXTNS)>;
diff --git a/llvm/lib/Target/ARM/ARMScheduleA57.td b/llvm/lib/Target/ARM/ARMScheduleA57.td
index a79f3348f338..d9a8d304c41f 100644
--- a/llvm/lib/Target/ARM/ARMScheduleA57.td
+++ b/llvm/lib/Target/ARM/ARMScheduleA57.td
@@ -96,7 +96,7 @@ def CortexA57Model : SchedMachineModel {
let FullInstRWOverlapCheck = 0;
let UnsupportedFeatures = [HasV8_1MMainline, HasMVEInt, HasMVEFloat,
- HasFPRegsV8_1M];
+ HasFPRegsV8_1M, HasFP16FML, HasMatMulInt8, HasBF16];
}
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/ARM/ARMScheduleSwift.td b/llvm/lib/Target/ARM/ARMScheduleSwift.td
index 00a44599b1b2..e0e98bfa0e9b 100644
--- a/llvm/lib/Target/ARM/ARMScheduleSwift.td
+++ b/llvm/lib/Target/ARM/ARMScheduleSwift.td
@@ -744,7 +744,7 @@ let SchedModel = SwiftModel in {
SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
SwiftWriteLM14CyNo, SwiftWriteLM14CyNo,
SwiftWriteP01OneCycle, SwiftVLDMPerm5]>,
- // Inaccurate: reuse describtion from 9 S registers.
+ // Inaccurate: reuse description from 9 S registers.
SchedVar<SwiftLMAddr11Pred,[SwiftWriteLM9Cy, SwiftWriteLM10Cy,
SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
@@ -760,7 +760,7 @@ let SchedModel = SwiftModel in {
SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
SwiftWriteLM11CyNo, SwiftWriteLM11CyNo,
SwiftWriteP01OneCycle, SwiftVLDMPerm3]>,
- // Inaccurate: reuse describtion from 9 S registers.
+ // Inaccurate: reuse description from 9 S registers.
SchedVar<SwiftLMAddr13Pred, [SwiftWriteLM9Cy, SwiftWriteLM10Cy,
SwiftWriteLM13Cy, SwiftWriteLM14CyNo,
SwiftWriteLM17CyNo, SwiftWriteLM18CyNo,
@@ -958,7 +958,7 @@ let SchedModel = SwiftModel in {
def : InstRW<[SwiftWriteLM7Cy, SwiftWriteP01OneCycle, SwiftWriteLM8Cy,
SwiftWriteLM8Cy, SwiftExt1xP0, SwiftVLDMPerm3],
(instregex "VLD3(LN|DUP)(d|q)(8|16|32)Pseudo_UPD")>;
- // Four element struture.
+ // Four element structure.
def : InstRW<[SwiftWriteLM8Cy, SwiftWriteLM9Cy, SwiftWriteLM10CyNo,
SwiftWriteLM10CyNo, SwiftExt1xP0, SwiftVLDMPerm5],
(instregex "VLD4(LN|DUP)(d|q)(8|16|32)$",
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
index cade06e8c109..7e06229b60c3 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.cpp
@@ -126,24 +126,24 @@ SDValue ARMSelectionDAGInfo::EmitSpecializedLibcall(
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
const ARMSubtarget &Subtarget =
DAG.getMachineFunction().getSubtarget<ARMSubtarget>();
// Do repeated 4-byte loads and stores. To be improved.
// This requires 4-byte alignment.
- if ((Align & 3) != 0)
+ if (Alignment < Align(4))
return SDValue();
// This requires the copy size to be a constant, preferably
// within a subtarget-specific limit.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
if (!ConstantSize)
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMCPY);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMCPY);
uint64_t SizeVal = ConstantSize->getZExtValue();
if (!AlwaysInline && SizeVal > Subtarget.getMaxInlineSizeThreshold())
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMCPY);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMCPY);
unsigned BytesLeft = SizeVal & 3;
unsigned NumMemOps = SizeVal >> 2;
@@ -240,16 +240,16 @@ SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemcpy(
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemmove(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMMOVE);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMMOVE);
}
SDValue ARMSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
- return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size, Align,
- RTLIB::MEMSET);
+ return EmitSpecializedLibcall(DAG, dl, Chain, Dst, Src, Size,
+ Alignment.value(), RTLIB::MEMSET);
}
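These hooks now take the llvm::Align type instead of a raw unsigned, so the 4-byte check becomes a type-safe comparison and the raw byte count is only recovered where the legacy libcall helper still expects an unsigned. A small sketch of how Align behaves here, with illustrative values:

    #include "llvm/Support/Alignment.h"
    using llvm::Align;

    // Align always holds a power-of-two byte alignment; comparisons are by
    // value, and value() returns the raw byte count for older interfaces
    // such as EmitSpecializedLibcall above.
    static bool isWordAligned(Align A) { return A >= Align(4); }
    // isWordAligned(Align(8)) -> true, isWordAligned(Align(2)) -> false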
diff --git a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
index b8a86ae7310f..7aa831c09248 100644
--- a/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
+++ b/llvm/lib/Target/ARM/ARMSelectionDAGInfo.h
@@ -39,22 +39,22 @@ class ARMSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
SDValue
EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
SDValue Dst, SDValue Src, SDValue Size,
- unsigned Align, bool isVolatile,
+ Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
// Adjust parameters for memset, see RTABI section 4.3.4
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool isVolatile,
+ SDValue Op3, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitSpecializedLibcall(SelectionDAG &DAG, const SDLoc &dl,
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp
index eb4d39b01cbb..46802037c2aa 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.cpp
+++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp
@@ -183,7 +183,7 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (!ArchFS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
else
- ArchFS = FS;
+ ArchFS = std::string(FS);
}
ParseSubtargetFeatures(CPUString, ArchFS);
@@ -292,12 +292,15 @@ void ARMSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
case CortexA73:
case CortexA75:
case CortexA76:
+ case CortexA77:
+ case CortexA78:
case CortexR4:
case CortexR4F:
case CortexR5:
case CortexR7:
case CortexM3:
case CortexR52:
+ case CortexX1:
break;
case Exynos:
LdStMultipleTiming = SingleIssuePlusExtras;
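The ArchFS change appears to track StringRef's std::string conversion becoming explicit, so the copy has to be spelled out. A minimal sketch under that assumption; the function name is made up for illustration:

    #include "llvm/ADT/StringRef.h"
    #include <string>

    // Either form makes the copy explicit; the patch uses the constructor.
    static std::string copyFeatureString(llvm::StringRef FS) {
      return std::string(FS); // equivalently: FS.str()
    }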
diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h
index 6bdd021970ef..2703e385dd81 100644
--- a/llvm/lib/Target/ARM/ARMSubtarget.h
+++ b/llvm/lib/Target/ARM/ARMSubtarget.h
@@ -28,6 +28,7 @@
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/MC/MCInstrItineraries.h"
#include "llvm/MC/MCSchedule.h"
+#include "llvm/Target/TargetMachine.h"
#include "llvm/Target/TargetOptions.h"
#include <memory>
#include <string>
@@ -60,6 +61,8 @@ protected:
CortexA73,
CortexA75,
CortexA76,
+ CortexA77,
+ CortexA78,
CortexA8,
CortexA9,
CortexM3,
@@ -68,6 +71,7 @@ protected:
CortexR5,
CortexR52,
CortexR7,
+ CortexX1,
Exynos,
Krait,
Kryo,
@@ -108,6 +112,7 @@ protected:
ARMv83a,
ARMv84a,
ARMv85a,
+ ARMv86a,
ARMv8a,
ARMv8mBaseline,
ARMv8mMainline,
@@ -157,11 +162,13 @@ protected:
bool HasV8_3aOps = false;
bool HasV8_4aOps = false;
bool HasV8_5aOps = false;
+ bool HasV8_6aOps = false;
bool HasV8MBaselineOps = false;
bool HasV8MMainlineOps = false;
bool HasV8_1MMainlineOps = false;
bool HasMVEIntegerOps = false;
bool HasMVEFloatOps = false;
+ bool HasCDEOps = false;
/// HasVFPv2, HasVFPv3, HasVFPv4, HasFPARMv8, HasNEON - Specify what
/// floating point ISAs are supported.
@@ -254,6 +261,12 @@ protected:
/// HasFP16FML - True if subtarget supports half-precision FP fml operations
bool HasFP16FML = false;
+ /// HasBF16 - True if subtarget supports BFloat16 floating point operations
+ bool HasBF16 = false;
+
+ /// HasMatMulInt8 - True if subtarget supports 8-bit integer matrix multiply
+ bool HasMatMulInt8 = false;
+
/// HasD32 - True if subtarget has the full 32 double precision
/// FP registers for VFPv3.
bool HasD32 = false;
@@ -562,6 +575,7 @@ private:
void initSubtargetFeatures(StringRef CPU, StringRef FS);
ARMFrameLowering *initializeFrameLowering(StringRef CPU, StringRef FS);
+ std::bitset<8> CoprocCDE = {};
public:
void computeIssueWidth();
@@ -579,11 +593,13 @@ public:
bool hasV8_3aOps() const { return HasV8_3aOps; }
bool hasV8_4aOps() const { return HasV8_4aOps; }
bool hasV8_5aOps() const { return HasV8_5aOps; }
+ bool hasV8_6aOps() const { return HasV8_6aOps; }
bool hasV8MBaselineOps() const { return HasV8MBaselineOps; }
bool hasV8MMainlineOps() const { return HasV8MMainlineOps; }
bool hasV8_1MMainlineOps() const { return HasV8_1MMainlineOps; }
bool hasMVEIntegerOps() const { return HasMVEIntegerOps; }
bool hasMVEFloatOps() const { return HasMVEFloatOps; }
+ bool hasCDEOps() const { return HasCDEOps; }
bool hasFPRegs() const { return HasFPRegs; }
bool hasFPRegs16() const { return HasFPRegs16; }
bool hasFPRegs64() const { return HasFPRegs64; }
@@ -689,12 +705,15 @@ public:
bool hasD32() const { return HasD32; }
bool hasFullFP16() const { return HasFullFP16; }
bool hasFP16FML() const { return HasFP16FML; }
+ bool hasBF16() const { return HasBF16; }
bool hasFuseAES() const { return HasFuseAES; }
bool hasFuseLiterals() const { return HasFuseLiterals; }
/// Return true if the CPU supports any kind of instruction fusion.
bool hasFusion() const { return hasFuseAES() || hasFuseLiterals(); }
+ bool hasMatMulInt8() const { return HasMatMulInt8; }
+
const Triple &getTargetTriple() const { return TargetTriple; }
bool isTargetDarwin() const { return TargetTriple.isOSDarwin(); }
diff --git a/llvm/lib/Target/ARM/ARMTargetMachine.cpp b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
index 84876eda33a6..9ead5fa4308c 100644
--- a/llvm/lib/Target/ARM/ARMTargetMachine.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetMachine.cpp
@@ -96,6 +96,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTarget() {
initializeARMExpandPseudoPass(Registry);
initializeThumb2SizeReducePass(Registry);
initializeMVEVPTBlockPass(Registry);
+ initializeMVEVPTOptimisationsPass(Registry);
initializeMVETailPredicationPass(Registry);
initializeARMLowOverheadLoopsPass(Registry);
initializeMVEGatherScatterLoweringPass(Registry);
@@ -243,7 +244,14 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, const Triple &TT,
this->Options.NoTrapAfterNoreturn = true;
}
+ // ARM supports the debug entry values.
+ setSupportsDebugEntryValues(true);
+
initAsmInfo();
+
+ // ARM supports the MachineOutliner.
+ setMachineOutliner(true);
+ setSupportsDefaultOutlining(false);
}
ARMBaseTargetMachine::~ARMBaseTargetMachine() = default;
@@ -359,6 +367,7 @@ public:
void addPreRegAlloc() override;
void addPreSched2() override;
void addPreEmitPass() override;
+ void addPreEmitPass2() override;
std::unique_ptr<CSEConfigBase> getCSEConfig() const override;
};
@@ -483,6 +492,8 @@ bool ARMPassConfig::addGlobalInstructionSelect() {
void ARMPassConfig::addPreRegAlloc() {
if (getOptLevel() != CodeGenOpt::None) {
+ addPass(createMVEVPTOptimisationsPass());
+
addPass(createMLxExpansionPass());
if (EnableARMLoadStoreOpt)
@@ -507,9 +518,12 @@ void ARMPassConfig::addPreSched2() {
addPass(createARMExpandPseudoPass());
if (getOptLevel() != CodeGenOpt::None) {
- // in v8, IfConversion depends on Thumb instruction widths
+ // When optimising for size, always run the Thumb2SizeReduction pass before
+ // IfConversion. Otherwise, check whether IT blocks are restricted
+ // (e.g. in v8, IfConversion depends on Thumb instruction widths)
addPass(createThumb2SizeReductionPass([this](const Function &F) {
- return this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
+ return this->TM->getSubtarget<ARMSubtarget>(F).hasMinSize() ||
+ this->TM->getSubtarget<ARMSubtarget>(F).restrictIT();
}));
addPass(createIfConverter([](const MachineFunction &MF) {
@@ -538,7 +552,9 @@ void ARMPassConfig::addPreEmitPass() {
// Don't optimize barriers at -O0.
if (getOptLevel() != CodeGenOpt::None)
addPass(createARMOptimizeBarriersPass());
+}
+void ARMPassConfig::addPreEmitPass2() {
addPass(createARMConstantIslandPass());
addPass(createARMLowOverheadLoopsPass());
diff --git a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
index 891329d3f297..3f0e3360632d 100644
--- a/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetObjectFile.cpp
@@ -49,7 +49,7 @@ void ARMElfTargetObjectFile::Initialize(MCContext &Ctx,
// Since we cannot modify flags for an existing section, we create a new
// section with the right flags, and use 0 as the unique ID for
// execute-only text
- TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U);
+ TextSection = Ctx.getELFSection(".text", Type, Flags, 0, "", 0U, nullptr);
}
}
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
index 7ff05034c1f2..bea4e157a131 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp
@@ -16,18 +16,19 @@
#include "llvm/CodeGen/ISDOpcodes.h"
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/SubtargetFeature.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Target/TargetMachine.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -45,7 +46,7 @@ static cl::opt<bool> DisableLowOverheadLoops(
"disable-arm-loloops", cl::Hidden, cl::init(false),
cl::desc("Disable the generation of low-overhead loops"));
-extern cl::opt<bool> DisableTailPredication;
+extern cl::opt<TailPredication::Mode> EnableTailPredication;
extern cl::opt<bool> EnableMaskedGatherScatters;
@@ -57,17 +58,32 @@ bool ARMTTIImpl::areInlineCompatible(const Function *Caller,
const FeatureBitset &CalleeBits =
TM.getSubtargetImpl(*Callee)->getFeatureBits();
- // To inline a callee, all features not in the whitelist must match exactly.
- bool MatchExact = (CallerBits & ~InlineFeatureWhitelist) ==
- (CalleeBits & ~InlineFeatureWhitelist);
- // For features in the whitelist, the callee's features must be a subset of
+ // To inline a callee, all features not in the allowed list must match exactly.
+ bool MatchExact = (CallerBits & ~InlineFeaturesAllowed) ==
+ (CalleeBits & ~InlineFeaturesAllowed);
+ // For features in the allowed list, the callee's features must be a subset of
// the callers'.
- bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeatureWhitelist) ==
- (CalleeBits & InlineFeatureWhitelist);
+ bool MatchSubset = ((CallerBits & CalleeBits) & InlineFeaturesAllowed) ==
+ (CalleeBits & InlineFeaturesAllowed);
return MatchExact && MatchSubset;
}
-int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+bool ARMTTIImpl::shouldFavorBackedgeIndex(const Loop *L) const {
+ if (L->getHeader()->getParent()->hasOptSize())
+ return false;
+ if (ST->hasMVEIntegerOps())
+ return false;
+ return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
+}
+
+bool ARMTTIImpl::shouldFavorPostInc() const {
+ if (ST->hasMVEIntegerOps())
+ return true;
+ return false;
+}
+
+int ARMTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned Bits = Ty->getPrimitiveSizeInBits();
@@ -110,7 +126,7 @@ int ARMTTIImpl::getIntImmCodeSizeCost(unsigned Opcode, unsigned Idx,
}
int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+ Type *Ty, TTI::TargetCostKind CostKind) {
// Division by a constant can be turned into multiplication, but only if we
// know it's constant. So it's not so much that the immediate is cheap (it's
// not), but that the alternative is worse.
@@ -125,12 +141,14 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Imm == 255 || Imm == 65535)
return 0;
// Conversion to BIC is free, and means we can use ~Imm instead.
- return std::min(getIntImmCost(Imm, Ty), getIntImmCost(~Imm, Ty));
+ return std::min(getIntImmCost(Imm, Ty, CostKind),
+ getIntImmCost(~Imm, Ty, CostKind));
}
if (Opcode == Instruction::Add)
// Conversion to SUB is free, and means we can use -Imm instead.
- return std::min(getIntImmCost(Imm, Ty), getIntImmCost(-Imm, Ty));
+ return std::min(getIntImmCost(Imm, Ty, CostKind),
+ getIntImmCost(-Imm, Ty, CostKind));
if (Opcode == Instruction::ICmp && Imm.isNegative() &&
Ty->getIntegerBitWidth() == 32) {
@@ -147,34 +165,27 @@ int ARMTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Opcode == Instruction::Xor && Imm.isAllOnesValue())
return 0;
- return getIntImmCost(Imm, Ty);
+ return getIntImmCost(Imm, Ty, CostKind);
}
int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
- // Single to/from double precision conversions.
- static const CostTblEntry NEONFltDblTbl[] = {
- // Vector fptrunc/fpext conversions.
- { ISD::FP_ROUND, MVT::v2f64, 2 },
- { ISD::FP_EXTEND, MVT::v2f32, 2 },
- { ISD::FP_EXTEND, MVT::v4f32, 4 }
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
};
- if (Src->isVectorTy() && ST->hasNEON() && (ISD == ISD::FP_ROUND ||
- ISD == ISD::FP_EXTEND)) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
- if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
- return LT.first * Entry->Cost;
- }
-
EVT SrcTy = TLI->getValueType(DL, Src);
EVT DstTy = TLI->getValueType(DL, Dst);
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
// The extend of a load is free
if (I && isa<LoadInst>(I->getOperand(0))) {
@@ -194,7 +205,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
};
if (const auto *Entry = ConvertCostTableLookup(
LoadConversionTbl, ISD, DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
{ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0},
@@ -203,27 +214,129 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 0},
{ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 0},
{ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 0},
+ // The following extend from a legal type to an illegal type, so the load
+ // needs to be split. This introduces an extra load operation, but the
+ // extend is still "free".
+ {ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 3},
+ {ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 3},
+ {ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1},
+ {ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1},
};
if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
if (const auto *Entry =
ConvertCostTableLookup(MVELoadConversionTbl, ISD,
DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+
+ static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ // FPExtends are similar but also require the VCVT instructions.
+ {ISD::FP_EXTEND, MVT::v4f32, MVT::v4f16, 1},
+ {ISD::FP_EXTEND, MVT::v8f32, MVT::v8f16, 3},
+ };
+ if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVEFLoadConversionTbl, ISD,
+ DstTy.getSimpleVT(), SrcTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+ }
+
+ // The truncate of a store is free. This is the mirror of extends above.
+ if (I && I->hasOneUse() && isa<StoreInst>(*I->user_begin())) {
+ static const TypeConversionCostTblEntry MVELoadConversionTbl[] = {
+ {ISD::TRUNCATE, MVT::v4i32, MVT::v4i16, 0},
+ {ISD::TRUNCATE, MVT::v4i32, MVT::v4i8, 0},
+ {ISD::TRUNCATE, MVT::v8i16, MVT::v8i8, 0},
+ {ISD::TRUNCATE, MVT::v8i32, MVT::v8i16, 1},
+ {ISD::TRUNCATE, MVT::v16i32, MVT::v16i8, 3},
+ {ISD::TRUNCATE, MVT::v16i16, MVT::v16i8, 1},
+ };
+ if (SrcTy.isVector() && ST->hasMVEIntegerOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVELoadConversionTbl, ISD, SrcTy.getSimpleVT(),
+ DstTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+
+ static const TypeConversionCostTblEntry MVEFLoadConversionTbl[] = {
+ {ISD::FP_ROUND, MVT::v4f32, MVT::v4f16, 1},
+ {ISD::FP_ROUND, MVT::v8f32, MVT::v8f16, 3},
+ };
+ if (SrcTy.isVector() && ST->hasMVEFloatOps()) {
+ if (const auto *Entry =
+ ConvertCostTableLookup(MVEFLoadConversionTbl, ISD, SrcTy.getSimpleVT(),
+ DstTy.getSimpleVT()))
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
}
}
+ // NEON vector operations that can extend their inputs.
+ if ((ISD == ISD::SIGN_EXTEND || ISD == ISD::ZERO_EXTEND) &&
+ I && I->hasOneUse() && ST->hasNEON() && SrcTy.isVector()) {
+ static const TypeConversionCostTblEntry NEONDoubleWidthTbl[] = {
+ // vaddl
+ { ISD::ADD, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::ADD, MVT::v8i16, MVT::v8i8, 0 },
+ // vsubl
+ { ISD::SUB, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SUB, MVT::v8i16, MVT::v8i8, 0 },
+ // vmull
+ { ISD::MUL, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::MUL, MVT::v8i16, MVT::v8i8, 0 },
+ // vshll
+ { ISD::SHL, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SHL, MVT::v8i16, MVT::v8i8, 0 },
+ };
+
+ auto *User = cast<Instruction>(*I->user_begin());
+ int UserISD = TLI->InstructionOpcodeToISD(User->getOpcode());
+ if (auto *Entry = ConvertCostTableLookup(NEONDoubleWidthTbl, UserISD,
+ DstTy.getSimpleVT(),
+ SrcTy.getSimpleVT())) {
+ return AdjustCost(Entry->Cost);
+ }
+ }
+
+ // Single to/from double precision conversions.
+ if (Src->isVectorTy() && ST->hasNEON() &&
+ ((ISD == ISD::FP_ROUND && SrcTy.getScalarType() == MVT::f64 &&
+ DstTy.getScalarType() == MVT::f32) ||
+ (ISD == ISD::FP_EXTEND && SrcTy.getScalarType() == MVT::f32 &&
+ DstTy.getScalarType() == MVT::f64))) {
+ static const CostTblEntry NEONFltDblTbl[] = {
+ // Vector fptrunc/fpext conversions.
+ {ISD::FP_ROUND, MVT::v2f64, 2},
+ {ISD::FP_EXTEND, MVT::v2f32, 2},
+ {ISD::FP_EXTEND, MVT::v4f32, 4}};
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ if (const auto *Entry = CostTableLookup(NEONFltDblTbl, ISD, LT.second))
+ return AdjustCost(LT.first * Entry->Cost);
+ }
+
// Some arithmetic, load and store operations have specific instructions
// to cast up/down their types automatically at no extra cost.
// TODO: Get these tables to know at least what the related operations are.
static const TypeConversionCostTblEntry NEONVectorConversionTbl[] = {
- { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
- { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 0 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 0 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
// The number of vmovl instructions for the extension.
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i8, 2 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i8, 3 },
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i16, 2 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i16, 3 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 3 },
@@ -294,7 +407,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(NEONVectorConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
// Scalar float to integer conversions.
@@ -324,7 +437,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(NEONFloatConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
// Scalar integer to float conversions.
@@ -355,7 +468,7 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(NEONIntegerConversionTbl,
ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
// MVE extend costs, taken from codegen tests. i8->i16 or i16->i32 is one
@@ -380,7 +493,28 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(MVEVectorConversionTbl,
ISD, DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost * ST->getMVEVectorCostFactor();
+ return AdjustCost(Entry->Cost * ST->getMVEVectorCostFactor());
+ }
+
+ if (ISD == ISD::FP_ROUND || ISD == ISD::FP_EXTEND) {
+ // As a general rule, fp converts that were not matched above are scalarized
+ // and cost 1 vcvt for each lane, so long as the instruction is available.
+ // If not, it becomes a series of function calls.
+ const int CallCost = getCallInstrCost(nullptr, Dst, {Src}, CostKind);
+ int Lanes = 1;
+ if (SrcTy.isFixedLengthVector())
+ Lanes = SrcTy.getVectorNumElements();
+ auto IsLegal = [this](EVT VT) {
+ EVT EltVT = VT.getScalarType();
+ return (EltVT == MVT::f32 && ST->hasVFP2Base()) ||
+ (EltVT == MVT::f64 && ST->hasFP64()) ||
+ (EltVT == MVT::f16 && ST->hasFullFP16());
+ };
+
+ if (IsLegal(SrcTy) && IsLegal(DstTy))
+ return Lanes;
+ return Lanes * CallCost;
}
// Scalar integer conversion costs.
@@ -399,13 +533,14 @@ int ARMTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (const auto *Entry = ConvertCostTableLookup(ARMIntegerConversionTbl, ISD,
DstTy.getSimpleVT(),
SrcTy.getSimpleVT()))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
- return BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(
+ BaseCost * BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
@@ -420,7 +555,7 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
Opcode == Instruction::ExtractElement)) {
// Cross-class copies are expensive on many microarchitectures,
// so assume they are expensive by default.
- if (ValTy->getVectorElementType()->isIntegerTy())
+ if (cast<VectorType>(ValTy)->getElementType()->isIntegerTy())
return 3;
// Even if it's not a cross class copy, this likely leads to mixing
@@ -438,14 +573,19 @@ int ARMTTIImpl::getVectorInstrCost(unsigned Opcode, Type *ValTy,
// result anyway.
return std::max(BaseT::getVectorInstrCost(Opcode, ValTy, Index),
ST->getMVEVectorCostFactor()) *
- ValTy->getVectorNumElements() / 2;
+ cast<FixedVectorType>(ValTy)->getNumElements() / 2;
}
return BaseT::getVectorInstrCost(Opcode, ValTy, Index);
}
int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+
int ISD = TLI->InstructionOpcodeToISD(Opcode);
// On NEON a vector select gets lowered to vbsl.
if (ST->hasNEON() && ValTy->isVectorTy() && ISD == ISD::SELECT) {
@@ -472,7 +612,8 @@ int ARMTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
int BaseCost = ST->hasMVEIntegerOps() && ValTy->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
- return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseCost * BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind,
+ I);
}
int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
@@ -496,11 +637,28 @@ int ARMTTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+bool ARMTTIImpl::isProfitableLSRChainElement(Instruction *I) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // If a VCTP is part of a chain, it's already profitable and shouldn't be
+ // optimized, else LSR may block tail-predication.
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::arm_mve_vctp8:
+ case Intrinsic::arm_mve_vctp16:
+ case Intrinsic::arm_mve_vctp32:
+ case Intrinsic::arm_mve_vctp64:
+ return true;
+ default:
+ break;
+ }
+ }
+ return false;
+}
+
+bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (!EnableMaskedLoadStores || !ST->hasMVEIntegerOps())
return false;
- if (auto *VecTy = dyn_cast<VectorType>(DataTy)) {
+ if (auto *VecTy = dyn_cast<FixedVectorType>(DataTy)) {
// Don't support v2i1 yet.
if (VecTy->getNumElements() == 2)
return false;
@@ -512,12 +670,11 @@ bool ARMTTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
}
unsigned EltWidth = DataTy->getScalarSizeInBits();
- return (EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
- (EltWidth == 16 && (!Alignment || Alignment >= 2)) ||
- (EltWidth == 8);
+ return (EltWidth == 32 && Alignment >= 4) ||
+ (EltWidth == 16 && Alignment >= 2) || (EltWidth == 8);
}
-bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
+bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, Align Alignment) {
if (!EnableMaskedGatherScatters || !ST->hasMVEIntegerOps())
return false;
@@ -534,8 +691,8 @@ bool ARMTTIImpl::isLegalMaskedGather(Type *Ty, MaybeAlign Alignment) {
return false;
unsigned EltWidth = Ty->getScalarSizeInBits();
- return ((EltWidth == 32 && (!Alignment || Alignment >= 4)) ||
- (EltWidth == 16 && (!Alignment || Alignment >= 2)) || EltWidth == 8);
+ return ((EltWidth == 32 && Alignment >= 4) ||
+ (EltWidth == 16 && Alignment >= 2) || EltWidth == 8);
}
int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
@@ -552,8 +709,8 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
return LibCallCost;
const unsigned Size = C->getValue().getZExtValue();
- const unsigned DstAlign = MI->getDestAlignment();
- const unsigned SrcAlign = MI->getSourceAlignment();
+ const Align DstAlign = *MI->getDestAlign();
+ const Align SrcAlign = *MI->getSourceAlign();
const Function *F = I->getParent()->getParent();
const unsigned Limit = TLI->getMaxStoresPerMemmove(F->hasMinSize());
std::vector<EVT> MemOps;
@@ -562,8 +719,9 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
// loaded and stored. That's why we multiply the number of elements by 2 to
// get the cost for this memcpy.
if (getTLI()->findOptimalMemOpLowering(
- MemOps, Limit, Size, DstAlign, SrcAlign, false /*IsMemset*/,
- false /*ZeroMemset*/, false /*MemcpyStrSrc*/, false /*AllowOverlap*/,
+ MemOps, Limit,
+ MemOp::Copy(Size, /*DstAlignCanChange*/ false, DstAlign, SrcAlign,
+ /*IsVolatile*/ true),
MI->getDestAddressSpace(), MI->getSourceAddressSpace(),
F->getAttributes()))
return MemOps.size() * 2;
@@ -572,8 +730,8 @@ int ARMTTIImpl::getMemcpyCost(const Instruction *I) {
return LibCallCost;
}
-int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ int Index, VectorType *SubTp) {
if (ST->hasNEON()) {
if (Kind == TTI::SK_Broadcast) {
static const CostTblEntry NEONDupTbl[] = {
@@ -667,12 +825,19 @@ int ARMTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
int ISDOpcode = TLI->InstructionOpcodeToISD(Opcode);
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -723,7 +888,8 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (const auto *Entry = CostTableLookup(CostTbl, ISDOpcode, LT.second))
return LT.first * Entry->Cost;
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info,
Opd1PropInfo, Opd2PropInfo);
// This is somewhat of a hack. The problem that we are facing is that SROA
@@ -779,12 +945,13 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * BaseCost;
// Else this is expand, assume that we need to scalarize this op.
- if (Ty->isVectorTy()) {
- unsigned Num = Ty->getVectorNumElements();
- unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType());
+ if (auto *VTy = dyn_cast<FixedVectorType>(Ty)) {
+ unsigned Num = VTy->getNumElements();
+ unsigned Cost = getArithmeticInstrCost(Opcode, Ty->getScalarType(),
+ CostKind);
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
- return BaseT::getScalarizationOverhead(Ty, Args) + Num * Cost;
+ return BaseT::getScalarizationOverhead(VTy, Args) + Num * Cost;
}
return BaseCost;
@@ -792,26 +959,53 @@ int ARMTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
int ARMTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
if (ST->hasNEON() && Src->isVectorTy() &&
(Alignment && *Alignment != Align(16)) &&
- Src->getVectorElementType()->isDoubleTy()) {
+ cast<VectorType>(Src)->getElementType()->isDoubleTy()) {
// Unaligned loads/stores are extremely inefficient.
// We need 4 uops for vst.1/vld.1 vs 1uop for vldr/vstr.
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
return LT.first * 4;
}
+
+ // MVE can optimize a fpext(load(4xhalf)) using an extending integer load.
+ // Same for stores.
+ if (ST->hasMVEFloatOps() && isa<FixedVectorType>(Src) && I &&
+ ((Opcode == Instruction::Load && I->hasOneUse() &&
+ isa<FPExtInst>(*I->user_begin())) ||
+ (Opcode == Instruction::Store && isa<FPTruncInst>(I->getOperand(0))))) {
+ FixedVectorType *SrcVTy = cast<FixedVectorType>(Src);
+ Type *DstTy =
+ Opcode == Instruction::Load
+ ? (*I->user_begin())->getType()
+ : cast<Instruction>(I->getOperand(0))->getOperand(0)->getType();
+ if (SrcVTy->getNumElements() == 4 && SrcVTy->getScalarType()->isHalfTy() &&
+ DstTy->getScalarType()->isFloatTy())
+ return ST->getMVEVectorCostFactor();
+ }
+
int BaseCost = ST->hasMVEIntegerOps() && Src->isVectorTy()
? ST->getMVEVectorCostFactor()
: 1;
- return BaseCost * LT.first;
+ return BaseCost * BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind, I);
}
int ARMTTIImpl::getInterleavedMemoryOpCost(
unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
- bool UseMaskForGaps) {
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
assert(Factor >= 2 && "Invalid interleave factor");
assert(isa<VectorType>(VecTy) && "Expect a vector type");
@@ -820,8 +1014,9 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
if (Factor <= TLI->getMaxSupportedInterleaveFactor() && !EltIs64Bits &&
!UseMaskForCond && !UseMaskForGaps) {
- unsigned NumElts = VecTy->getVectorNumElements();
- auto *SubVecTy = VectorType::get(VecTy->getScalarType(), NumElts / Factor);
+ unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
+ auto *SubVecTy =
+ FixedVectorType::get(VecTy->getScalarType(), NumElts / Factor);
// vldN/vstN only support legal vector types of size 64 or 128 in bits.
// Accesses having vector types that are a multiple of 128 bits can be
@@ -842,10 +1037,109 @@ int ARMTTIImpl::getInterleavedMemoryOpCost(
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
+unsigned ARMTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ using namespace PatternMatch;
+ if (!ST->hasMVEIntegerOps() || !EnableMaskedGatherScatters)
+ return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
+ Alignment, CostKind, I);
+
+ assert(DataTy->isVectorTy() && "Can't do gather/scatters on scalar!");
+ auto *VTy = cast<FixedVectorType>(DataTy);
+
+ // TODO: Splitting, once we do that.
+
+ unsigned NumElems = VTy->getNumElements();
+ unsigned EltSize = VTy->getScalarSizeInBits();
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, DataTy);
+
+ // For now, it is assumed that for the MVE gather instructions the loads are
+ // all effectively serialised. This means the cost is the scalar cost
+ // multiplied by the number of elements being loaded. This is possibly very
+ // conservative, but even so we still end up vectorising loops because the
+ // cost per iteration for many loops is lower than for scalar loops.
+ unsigned VectorCost = NumElems * LT.first;
+ // The scalarization cost should be a lot higher. We use the number of vector
+ // elements plus the scalarization overhead.
+ unsigned ScalarCost =
+ NumElems * LT.first + BaseT::getScalarizationOverhead(VTy, {});
+
+ if (Alignment < EltSize / 8)
+ return ScalarCost;
+
+ unsigned ExtSize = EltSize;
+ // Check whether there's a single user that asks for an extended type
+ if (I != nullptr) {
+ // Depending on the caller of this function, a gather instruction will
+ // either have opcode Instruction::Load or be a call to the masked_gather
+ // intrinsic.
+ if ((I->getOpcode() == Instruction::Load ||
+ match(I, m_Intrinsic<Intrinsic::masked_gather>())) &&
+ I->hasOneUse()) {
+ const User *Us = *I->users().begin();
+ if (isa<ZExtInst>(Us) || isa<SExtInst>(Us)) {
+ // Only allow valid type combinations
+ unsigned TypeSize =
+ cast<Instruction>(Us)->getType()->getScalarSizeInBits();
+ if (((TypeSize == 32 && (EltSize == 8 || EltSize == 16)) ||
+ (TypeSize == 16 && EltSize == 8)) &&
+ TypeSize * NumElems == 128) {
+ ExtSize = TypeSize;
+ }
+ }
+ }
+ // Check whether the input data needs to be truncated
+ TruncInst *T;
+ if ((I->getOpcode() == Instruction::Store ||
+ match(I, m_Intrinsic<Intrinsic::masked_scatter>())) &&
+ (T = dyn_cast<TruncInst>(I->getOperand(0)))) {
+ // Only allow valid type combinations
+ unsigned TypeSize = T->getOperand(0)->getType()->getScalarSizeInBits();
+ if (((EltSize == 16 && TypeSize == 32) ||
+ (EltSize == 8 && (TypeSize == 32 || TypeSize == 16))) &&
+ TypeSize * NumElems == 128)
+ ExtSize = TypeSize;
+ }
+ }
+
+ if (ExtSize * NumElems != 128 || NumElems < 4)
+ return ScalarCost;
+
+ // Any (aligned) i32 gather will not need to be scalarised.
+ if (ExtSize == 32)
+ return VectorCost;
+ // For smaller types, we need to ensure that the gep's inputs are correctly
+ // extended from a small enough value. Other sizes (including i64) are
+ // scalarized for now.
+ if (ExtSize != 8 && ExtSize != 16)
+ return ScalarCost;
+
+ if (const auto *BC = dyn_cast<BitCastInst>(Ptr))
+ Ptr = BC->getOperand(0);
+ if (const auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (GEP->getNumOperands() != 2)
+ return ScalarCost;
+ unsigned Scale = DL.getTypeAllocSize(GEP->getResultElementType());
+ // Scale needs to be correct (which is only relevant for i16s).
+ if (Scale != 1 && Scale * 8 != ExtSize)
+ return ScalarCost;
+ // And we need to zext (not sext) the indexes from a small enough type.
+ if (const auto *ZExt = dyn_cast<ZExtInst>(GEP->getOperand(1))) {
+ if (ZExt->getOperand(0)->getType()->getScalarSizeInBits() <= ExtSize)
+ return VectorCost;
+ }
+ return ScalarCost;
+ }
+ return ScalarCost;
+}
+
bool ARMTTIImpl::isLoweredToCall(const Function *F) {
if (!F->isIntrinsic())
BaseT::isLoweredToCall(F);
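To make the new getGatherScatterOpCost heuristic above concrete, here is a worked example with illustrative numbers, not taken from the patch: a masked gather of a <4 x i32> vector with 4-byte alignment.

    // LT.first == 1 (v4i32 is already legal), NumElems == 4, EltSize == 32:
    //   VectorCost = NumElems * LT.first                        = 4
    //   ScalarCost = VectorCost + getScalarizationOverhead(...) > 4
    // The 4-byte alignment covers the 32-bit elements, ExtSize stays 32,
    // and ExtSize * NumElems == 128, so the function returns VectorCost.
    // By contrast, an <8 x i16> gather whose GEP index is not zero-extended
    // from a narrow enough type falls through to ScalarCost.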
@@ -913,23 +1207,31 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
HardwareLoopInfo &HWLoopInfo) {
// Low-overhead branches are only supported in the 'low-overhead branch'
// extension of v8.1-m.
- if (!ST->hasLOB() || DisableLowOverheadLoops)
+ if (!ST->hasLOB() || DisableLowOverheadLoops) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Disabled\n");
return false;
+ }
- if (!SE.hasLoopInvariantBackedgeTakenCount(L))
+ if (!SE.hasLoopInvariantBackedgeTakenCount(L)) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: No BETC\n");
return false;
+ }
const SCEV *BackedgeTakenCount = SE.getBackedgeTakenCount(L);
- if (isa<SCEVCouldNotCompute>(BackedgeTakenCount))
+ if (isa<SCEVCouldNotCompute>(BackedgeTakenCount)) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Uncomputable BETC\n");
return false;
+ }
const SCEV *TripCountSCEV =
SE.getAddExpr(BackedgeTakenCount,
SE.getOne(BackedgeTakenCount->getType()));
// We need to store the trip count in LR, a 32-bit register.
- if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32)
+ if (SE.getUnsignedRangeMax(TripCountSCEV).getBitWidth() > 32) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Trip count does not fit into 32bits\n");
return false;
+ }
// Making a call will trash LR and clear LO_BRANCH_INFO, so there's little
// point in generating a hardware loop if that's going to happen.
@@ -1034,8 +1336,10 @@ bool ARMTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
auto ScanLoop = [&](Loop *L) {
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
- if (MaybeCall(I) || IsHardwareLoopIntrinsic(I))
+ if (MaybeCall(I) || IsHardwareLoopIntrinsic(I)) {
+ LLVM_DEBUG(dbgs() << "ARMHWLoops: Bad instruction: " << I << "\n");
return false;
+ }
}
}
return true;
@@ -1102,12 +1406,47 @@ static bool canTailPredicateInstruction(Instruction &I, int &ICmpCount) {
static bool canTailPredicateLoop(Loop *L, LoopInfo *LI, ScalarEvolution &SE,
const DataLayout &DL,
const LoopAccessInfo *LAI) {
+ LLVM_DEBUG(dbgs() << "Tail-predication: checking allowed instructions\n");
+
+ // If there are live-out values, it is probably a reduction, which needs a
+ // final reduction step after the loop. MVE has a VADDV instruction to reduce
+ // integer vectors, but doesn't have an equivalent one for float vectors. A
+ // live-out value that is not recognised as a reduction will result in the
+ // tail-predicated loop being reverted to a non-predicated loop, and this is
+ // very expensive, i.e. it has a significant performance impact. So, in this
+ // case it's better not to tail-predicate the loop, which is what we check
+ // here. Thus, we allow only 1 live-out value, which has to be an integer
+ // reduction, which matches the loops supported by ARMLowOverheadLoops.
+ // It is important to keep ARMLowOverheadLoops and canTailPredicateLoop in
+ // sync with each other.
+ SmallVector<Instruction *, 8> LiveOuts =
+ llvm::findDefsUsedOutsideOfLoop(L);
+ bool IntReductionsDisabled =
+ EnableTailPredication == TailPredication::EnabledNoReductions ||
+ EnableTailPredication == TailPredication::ForceEnabledNoReductions;
+
+ for (auto *I : LiveOuts) {
+ if (!I->getType()->isIntegerTy()) {
+ LLVM_DEBUG(dbgs() << "Don't tail-predicate loop with non-integer "
+ "live-out value\n");
+ return false;
+ }
+ if (I->getOpcode() != Instruction::Add) {
+ LLVM_DEBUG(dbgs() << "Only add reductions supported\n");
+ return false;
+ }
+ if (IntReductionsDisabled) {
+ LLVM_DEBUG(dbgs() << "Integer add reductions not enabled\n");
+ return false;
+ }
+ }
+
+ // Next, check that all instructions can be tail-predicated.
PredicatedScalarEvolution PSE = LAI->getPSE();
+ SmallVector<Instruction *, 16> LoadStores;
int ICmpCount = 0;
int Stride = 0;
- LLVM_DEBUG(dbgs() << "tail-predication: checking allowed instructions\n");
- SmallVector<Instruction *, 16> LoadStores;
for (BasicBlock *BB : L->blocks()) {
for (Instruction &I : BB->instructionsWithoutDebug()) {
if (isa<PHINode>(&I))
@@ -1155,8 +1494,10 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
TargetLibraryInfo *TLI,
DominatorTree *DT,
const LoopAccessInfo *LAI) {
- if (DisableTailPredication)
+ if (!EnableTailPredication) {
+ LLVM_DEBUG(dbgs() << "Tail-predication not enabled.\n");
return false;
+ }
// Creating a predicated vector loop is the first step for generating a
// tail-predicated hardware loop, for which we need the MVE masked
@@ -1197,7 +1538,16 @@ bool ARMTTIImpl::preferPredicateOverEpilogue(Loop *L, LoopInfo *LI,
return canTailPredicateLoop(L, LI, SE, DL, LAI);
}
+bool ARMTTIImpl::emitGetActiveLaneMask() const {
+ if (!ST->hasMVEIntegerOps() || !EnableTailPredication)
+ return false;
+ // Intrinsic @llvm.get.active.lane.mask is supported.
+ // It is used in the MVETailPredication pass, which requires the number of
+ // elements processed by this vector loop to set up the tail-predicated
+ // loop.
+ return true;
+}
void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
// Only currently enable these preferences for M-Class cores.
@@ -1241,8 +1591,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
return;
if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- ImmutableCallSite CS(&I);
- if (const Function *F = CS.getCalledFunction()) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (!isLoweredToCall(F))
continue;
}
@@ -1251,7 +1600,7 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
SmallVector<const Value*, 4> Operands(I.value_op_begin(),
I.value_op_end());
- Cost += getUserCost(&I, Operands);
+ Cost += getUserCost(&I, Operands, TargetTransformInfo::TCK_CodeSize);
}
}
@@ -1271,27 +1620,12 @@ void ARMTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Force = true;
}
+void ARMTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
+
bool ARMTTIImpl::useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const {
- assert(isa<VectorType>(Ty) && "Expected Ty to be a vector type");
- unsigned ScalarBits = Ty->getScalarSizeInBits();
- if (!ST->hasMVEIntegerOps())
- return false;
-
- switch (Opcode) {
- case Instruction::FAdd:
- case Instruction::FMul:
- case Instruction::And:
- case Instruction::Or:
- case Instruction::Xor:
- case Instruction::Mul:
- case Instruction::FCmp:
- return false;
- case Instruction::ICmp:
- case Instruction::Add:
- return ScalarBits < 64 && ScalarBits * Ty->getVectorNumElements() == 128;
- default:
- llvm_unreachable("Unhandled reduction opcode");
- }
- return false;
+ return ST->hasMVEIntegerOps();
}
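The live-out check added to canTailPredicateLoop above boils down to allowing a single integer add reduction and nothing else. A hedged source-level illustration of the distinction; the loops are hypothetical examples, not test cases from the patch:

    // Accepted (when integer reductions are enabled): the only value live
    // out of the loop is an integer add reduction, which MVE can finish
    // with VADDV after the loop.
    int sum_i32(const int *A, int N) {
      int Acc = 0;
      for (int I = 0; I < N; ++I)
        Acc += A[I];
      return Acc;
    }

    // Rejected: the live-out is a float reduction, MVE has no float
    // counterpart of VADDV, and reverting a tail-predicated loop later
    // would be expensive.
    float sum_f32(const float *A, int N) {
      float Acc = 0.0f;
      for (int I = 0; I < N; ++I)
        Acc += A[I];
      return Acc;
    }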
diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
index 880588adfdfd..7bf6de4bffe0 100644
--- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
+++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h
@@ -38,6 +38,16 @@ class ScalarEvolution;
class Type;
class Value;
+namespace TailPredication {
+ enum Mode {
+ Disabled = 0,
+ EnabledNoReductions,
+ Enabled,
+ ForceEnabledNoReductions,
+ ForceEnabled
+ };
+}
+
class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
using BaseT = BasicTTIImplBase<ARMTTIImpl>;
using TTI = TargetTransformInfo;
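The new TailPredication::Mode enum pairs with the EnableTailPredication option referenced in the .cpp diff above; its defining cl::opt lives in another file of the patch. The sketch below only shows the usual shape of an enum-valued option, and the option name, value strings, descriptions and default are assumptions rather than quotes from the patch:

    #include "llvm/Support/CommandLine.h"
    using namespace llvm;

    // Hypothetical declaration for illustration only.
    cl::opt<TailPredication::Mode> EnableTailPredication(
        "tail-predication", cl::desc("MVE tail-predication mode"),
        cl::init(TailPredication::Disabled),
        cl::values(
            clEnumValN(TailPredication::Disabled, "disabled",
                       "Don't tail-predicate loops"),
            clEnumValN(TailPredication::EnabledNoReductions,
                       "enabled-no-reductions",
                       "Enable tail-predication, but not for reduction loops"),
            clEnumValN(TailPredication::Enabled, "enabled",
                       "Enable tail-predication, including reduction loops"),
            clEnumValN(TailPredication::ForceEnabledNoReductions,
                       "force-enabled-no-reductions",
                       "Force tail-predication, but not for reduction loops"),
            clEnumValN(TailPredication::ForceEnabled, "force-enabled",
                       "Force tail-predication, including reduction loops")));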
@@ -47,13 +57,13 @@ class ARMTTIImpl : public BasicTTIImplBase<ARMTTIImpl> {
const ARMSubtarget *ST;
const ARMTargetLowering *TLI;
- // Currently the following features are excluded from InlineFeatureWhitelist.
+ // Currently the following features are excluded from InlineFeaturesAllowed.
// ModeThumb, FeatureNoARM, ModeSoftFloat, FeatureFP64, FeatureD32
// Depending on whether they are set or unset, different
// instructions/registers are available. For example, inlining a callee with
// -thumb-mode in a caller with +thumb-mode, may cause the assembler to
// fail if the callee uses ARM only instructions, e.g. in inline asm.
- const FeatureBitset InlineFeatureWhitelist = {
+ const FeatureBitset InlineFeaturesAllowed = {
ARM::FeatureVFP2, ARM::FeatureVFP3, ARM::FeatureNEON, ARM::FeatureThumb2,
ARM::FeatureFP16, ARM::FeatureVFP4, ARM::FeatureFPARMv8,
ARM::FeatureFullFP16, ARM::FeatureFP16FML, ARM::FeatureHWDivThumb,
@@ -93,11 +103,8 @@ public:
bool enableInterleavedAccessVectorization() { return true; }
- bool shouldFavorBackedgeIndex(const Loop *L) const {
- if (L->getHeader()->getParent()->hasOptSize())
- return false;
- return ST->isMClass() && ST->isThumb2() && L->getNumBlocks() == 1;
- }
+ bool shouldFavorBackedgeIndex(const Loop *L) const;
+ bool shouldFavorPostInc() const;
/// Floating-point computation using ARMv8 AArch32 Advanced
/// SIMD instructions remains unchanged from ARMv7. Only AArch64 SIMD
@@ -113,9 +120,10 @@ public:
Type *Ty);
using BaseT::getIntImmCost;
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind);
/// @}
@@ -153,31 +161,57 @@ public:
return ST->getMaxInterleaveFactor();
}
- bool isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment);
+ bool isProfitableLSRChainElement(Instruction *I);
+
+ bool isLegalMaskedLoad(Type *DataTy, Align Alignment);
- bool isLegalMaskedStore(Type *DataTy, MaybeAlign Alignment) {
+ bool isLegalMaskedStore(Type *DataTy, Align Alignment) {
return isLegalMaskedLoad(DataTy, Alignment);
}
- bool isLegalMaskedGather(Type *Ty, MaybeAlign Alignment);
+ bool isLegalMaskedGather(Type *Ty, Align Alignment);
- bool isLegalMaskedScatter(Type *Ty, MaybeAlign Alignment) { return false; }
+ bool isLegalMaskedScatter(Type *Ty, Align Alignment) {
+ return isLegalMaskedGather(Ty, Alignment);
+ }
int getMemcpyCost(const Instruction *I);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
TTI::ReductionFlags Flags) const;
bool shouldExpandReduction(const IntrinsicInst *II) const {
- return false;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::experimental_vector_reduce_v2_fadd:
+ case Intrinsic::experimental_vector_reduce_v2_fmul:
+ // We don't have legalization support for ordered FP reductions.
+ if (!II->getFastMathFlags().allowReassoc())
+ return true;
+ // Can't legalize reductions with soft floats.
+ return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs();
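+ // Rough example of the effect (assumed IR spelling): a call like
+ //   @llvm.experimental.vector.reduce.v2.fadd.f32.v4f32(float %acc, <4 x float> %v)
+ // without the 'reassoc' fast-math flag is expanded by the generic
+ // ExpandReductions pass, while the reassoc form is left for normal
+ // legalization.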
+
+ case Intrinsic::experimental_vector_reduce_fmin:
+ case Intrinsic::experimental_vector_reduce_fmax:
+ // Can't legalize reductions with soft floats, and without the NoNaNs
+ // flag the reduction would need fminimum/fmaximum semantics, which we
+ // do not know how to lower.
+ return TLI->useSoftFloat() || !TLI->getSubtarget()->hasFPRegs() ||
+ !II->getFastMathFlags().noNaNs();
+
+ default:
+ // Don't expand anything else, let legalization deal with it.
+ return false;
+ }
}
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
@@ -187,6 +221,7 @@ public:
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Op1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Op2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -195,13 +230,20 @@ public:
const Instruction *CxtI = nullptr);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy, unsigned Factor,
- ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
bool isLoweredToCall(const Function *F);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
@@ -217,6 +259,10 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ bool emitGetActiveLaneMask() const;
+
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
bool shouldBuildLookupTablesForConstant(Constant *C) const {
// In the ROPI and RWPI relocation models we can't have pointers to global
// variables or functions in constant data, so don't convert switches to
diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
index f6d76ee09534..05f870b90ecd 100644
--- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
+++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp
@@ -22,6 +22,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringMap.h"
+#include "llvm/ADT/StringSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/StringSwitch.h"
#include "llvm/ADT/Triple.h"
@@ -180,10 +181,68 @@ public:
}
};
+// Various sets of ARM instruction mnemonics, used by the asm parser.
+class ARMMnemonicSets {
+ StringSet<> CDE;
+ StringSet<> CDEWithVPTSuffix;
+public:
+ ARMMnemonicSets(const MCSubtargetInfo &STI);
+
+ /// Returns true iff a given mnemonic is a CDE instruction
+ bool isCDEInstr(StringRef Mnemonic) {
+ // Quick check before searching the set
+ if (!Mnemonic.startswith("cx") && !Mnemonic.startswith("vcx"))
+ return false;
+ return CDE.count(Mnemonic);
+ }
+
+ /// Returns true iff a given mnemonic is a VPT-predicable CDE instruction
+ /// (possibly with a predication suffix "e" or "t")
+ bool isVPTPredicableCDEInstr(StringRef Mnemonic) {
+ if (!Mnemonic.startswith("vcx"))
+ return false;
+ return CDEWithVPTSuffix.count(Mnemonic);
+ }
+
+ /// Returns true iff a given mnemonic is an IT-predicable CDE instruction
+ /// (possibly with a condition suffix)
+ bool isITPredicableCDEInstr(StringRef Mnemonic) {
+ if (!Mnemonic.startswith("cx"))
+ return false;
+ return Mnemonic.startswith("cx1a") || Mnemonic.startswith("cx1da") ||
+ Mnemonic.startswith("cx2a") || Mnemonic.startswith("cx2da") ||
+ Mnemonic.startswith("cx3a") || Mnemonic.startswith("cx3da");
+ }
+
+ /// Return true iff a given mnemonic is an integer CDE instruction with
+ /// dual-register destination
+ bool isCDEDualRegInstr(StringRef Mnemonic) {
+ if (!Mnemonic.startswith("cx"))
+ return false;
+ return Mnemonic == "cx1d" || Mnemonic == "cx1da" ||
+ Mnemonic == "cx2d" || Mnemonic == "cx2da" ||
+ Mnemonic == "cx3d" || Mnemonic == "cx3da";
+ }
+};
+
+ARMMnemonicSets::ARMMnemonicSets(const MCSubtargetInfo &STI) {
+ for (StringRef Mnemonic: { "cx1", "cx1a", "cx1d", "cx1da",
+ "cx2", "cx2a", "cx2d", "cx2da",
+ "cx3", "cx3a", "cx3d", "cx3da", })
+ CDE.insert(Mnemonic);
+ for (StringRef Mnemonic :
+ {"vcx1", "vcx1a", "vcx2", "vcx2a", "vcx3", "vcx3a"}) {
+ CDE.insert(Mnemonic);
+ CDEWithVPTSuffix.insert(Mnemonic);
+ CDEWithVPTSuffix.insert(std::string(Mnemonic) + "t");
+ CDEWithVPTSuffix.insert(std::string(Mnemonic) + "e");
+ }
+}
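+// Worked example of how these sets compose: the constructor inserts "vcx1a"
+// into both sets and "vcx1at"/"vcx1ae" into CDEWithVPTSuffix only, so a later
+// query like MS.isVPTPredicableCDEInstr("vcx1at") returns true, while
+// MS.isCDEInstr("vcx1at") does not (only the unsuffixed "vcx1a" is in CDE).
+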
class ARMAsmParser : public MCTargetAsmParser {
const MCRegisterInfo *MRI;
UnwindContext UC;
+ ARMMnemonicSets MS;
ARMTargetStreamer &getTargetStreamer() {
assert(getParser().getStreamer().getTargetStreamer() &&
@@ -245,12 +304,12 @@ class ARMAsmParser : public MCTargetAsmParser {
ITInst.setOpcode(ARM::t2IT);
ITInst.addOperand(MCOperand::createImm(ITState.Cond));
ITInst.addOperand(MCOperand::createImm(ITState.Mask));
- Out.EmitInstruction(ITInst, getSTI());
+ Out.emitInstruction(ITInst, getSTI());
// Emit the conditional instructions
assert(PendingConditionalInsts.size() <= 4);
for (const MCInst &Inst : PendingConditionalInsts) {
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
}
PendingConditionalInsts.clear();
@@ -444,6 +503,8 @@ class ARMAsmParser : public MCTargetAsmParser {
void tryConvertingToTwoOperandForm(StringRef Mnemonic, bool CarrySetting,
OperandVector &Operands);
+ bool CDEConvertDualRegOperand(StringRef Mnemonic, OperandVector &Operands);
+
bool isThumb() const {
// FIXME: Can tablegen auto-generate this?
return getSTI().getFeatureBits()[ARM::ModeThumb];
@@ -501,6 +562,9 @@ class ARMAsmParser : public MCTargetAsmParser {
bool hasMVEFloat() const {
return getSTI().getFeatureBits()[ARM::HasMVEFloatOps];
}
+ bool hasCDE() const {
+ return getSTI().getFeatureBits()[ARM::HasCDEOps];
+ }
bool has8MSecExt() const {
return getSTI().getFeatureBits()[ARM::Feature8MSecExt];
}
@@ -605,7 +669,7 @@ public:
ARMAsmParser(const MCSubtargetInfo &STI, MCAsmParser &Parser,
const MCInstrInfo &MII, const MCTargetOptions &Options)
- : MCTargetAsmParser(Options, STI, MII), UC(Parser) {
+ : MCTargetAsmParser(Options, STI, MII), UC(Parser), MS(STI) {
MCAsmParserExtension::Initialize(Parser);
// Cache the MCRegisterInfo.
@@ -628,6 +692,8 @@ public:
// Implementation of the MCTargetAsmParser interface:
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
@@ -3553,8 +3619,7 @@ public:
if (Kind == k_RegisterList && Regs.back().second == ARM::APSR)
Kind = k_RegisterListWithAPSR;
- assert(std::is_sorted(Regs.begin(), Regs.end()) &&
- "Register list must be sorted by encoding");
+ assert(llvm::is_sorted(Regs) && "Register list must be sorted by encoding");
auto Op = std::make_unique<ARMOperand>(Kind);
for (const auto &P : Regs)
@@ -3885,6 +3950,14 @@ bool ARMAsmParser::ParseRegister(unsigned &RegNo,
return (RegNo == (unsigned)-1);
}
+OperandMatchResultTy ARMAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ if (ParseRegister(RegNo, StartLoc, EndLoc))
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
/// Try to parse a register name. The token must be an Identifier when called,
/// and if it is a register name the token is eaten and the register number is
/// returned. Otherwise return -1.
@@ -6045,20 +6118,35 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
case AsmToken::LCurly:
return parseRegisterList(Operands, !Mnemonic.startswith("clr"));
case AsmToken::Dollar:
- case AsmToken::Hash:
- // #42 -> immediate.
+ case AsmToken::Hash: {
+ // #42 -> immediate
+ // $ 42 -> immediate
+ // $foo -> symbol name
+ // $42 -> symbol name
S = Parser.getTok().getLoc();
- Parser.Lex();
+
+ // Favor the interpretation of $-prefixed operands as symbol names.
+ // Cases where immediates are explicitly expected are handled by their
+ // specific ParseMethod implementations.
+ auto AdjacentToken = getLexer().peekTok(/*ShouldSkipSpace=*/false);
+ bool ExpectIdentifier = Parser.getTok().is(AsmToken::Dollar) &&
+ (AdjacentToken.is(AsmToken::Identifier) ||
+ AdjacentToken.is(AsmToken::Integer));
+ if (!ExpectIdentifier) {
+ // Token is not part of identifier. Drop leading $ or # before parsing
+ // expression.
+ Parser.Lex();
+ }
if (Parser.getTok().isNot(AsmToken::Colon)) {
- bool isNegative = Parser.getTok().is(AsmToken::Minus);
+ bool IsNegative = Parser.getTok().is(AsmToken::Minus);
const MCExpr *ImmVal;
if (getParser().parseExpression(ImmVal))
return true;
const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ImmVal);
if (CE) {
int32_t Val = CE->getValue();
- if (isNegative && Val == 0)
+ if (IsNegative && Val == 0)
ImmVal = MCConstantExpr::create(std::numeric_limits<int32_t>::min(),
getContext());
}
@@ -6077,7 +6165,7 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
}
// w/ a ':' after the '#', it's just like a plain ':'.
LLVM_FALLTHROUGH;
-
+ }
case AsmToken::Colon: {
S = Parser.getTok().getLoc();
// ":lower16:" and ":upper16:" expression prefixes
@@ -6233,6 +6321,7 @@ StringRef ARMAsmParser::splitMnemonic(StringRef Mnemonic,
Mnemonic == "vrintp" || Mnemonic == "vrintm" || Mnemonic == "hvc" ||
Mnemonic.startswith("vsel") || Mnemonic == "vins" || Mnemonic == "vmovx" ||
Mnemonic == "bxns" || Mnemonic == "blxns" ||
+ Mnemonic == "vdot" || Mnemonic == "vmmla" ||
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
@@ -6373,14 +6462,20 @@ void ARMAsmParser::getMnemonicAcceptInfo(StringRef Mnemonic,
Mnemonic == "vudot" || Mnemonic == "vsdot" ||
Mnemonic == "vcmla" || Mnemonic == "vcadd" ||
Mnemonic == "vfmal" || Mnemonic == "vfmsl" ||
+ Mnemonic == "vfmat" || Mnemonic == "vfmab" ||
+ Mnemonic == "vdot" || Mnemonic == "vmmla" ||
Mnemonic == "sb" || Mnemonic == "ssbb" ||
- Mnemonic == "pssbb" ||
+ Mnemonic == "pssbb" || Mnemonic == "vsmmla" ||
+ Mnemonic == "vummla" || Mnemonic == "vusmmla" ||
+ Mnemonic == "vusdot" || Mnemonic == "vsudot" ||
Mnemonic == "bfcsel" || Mnemonic == "wls" ||
Mnemonic == "dls" || Mnemonic == "le" || Mnemonic == "csel" ||
Mnemonic == "csinc" || Mnemonic == "csinv" || Mnemonic == "csneg" ||
Mnemonic == "cinc" || Mnemonic == "cinv" || Mnemonic == "cneg" ||
Mnemonic == "cset" || Mnemonic == "csetm" ||
Mnemonic.startswith("vpt") || Mnemonic.startswith("vpst") ||
+ (hasCDE() && MS.isCDEInstr(Mnemonic) &&
+ !MS.isITPredicableCDEInstr(Mnemonic)) ||
(hasMVE() &&
(Mnemonic.startswith("vst2") || Mnemonic.startswith("vld2") ||
Mnemonic.startswith("vst4") || Mnemonic.startswith("vld4") ||
@@ -6770,6 +6865,69 @@ void ARMAsmParser::fixupGNULDRDAlias(StringRef Mnemonic,
ARMOperand::CreateReg(PairedReg, Op2.getStartLoc(), Op2.getEndLoc()));
}
+// Dual-register instructions have the following syntax:
+// <mnemonic> <predicate>? <coproc>, <Rdest>, <Rdest+1>, <Rsrc>, ..., #imm
+// This function tries to remove <Rdest+1> and replace <Rdest> with a pair
+// operand. If the conversion fails, an error is diagnosed and the function
+// returns true.
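+// As an illustrative (hypothetical) example, an instruction written as
+//   cx1d p0, r0, r1, #1234
+// has its r0/r1 operands folded into the single register-pair operand R0_R1,
+// so the matcher only ever sees the pair form.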
+bool ARMAsmParser::CDEConvertDualRegOperand(StringRef Mnemonic,
+ OperandVector &Operands) {
+ assert(MS.isCDEDualRegInstr(Mnemonic));
+ bool isPredicable =
+ Mnemonic == "cx1da" || Mnemonic == "cx2da" || Mnemonic == "cx3da";
+ size_t NumPredOps = isPredicable ? 1 : 0;
+
+ if (Operands.size() <= 3 + NumPredOps)
+ return false;
+
+ StringRef Op2Diag(
+ "operand must be an even-numbered register in the range [r0, r10]");
+
+ const MCParsedAsmOperand &Op2 = *Operands[2 + NumPredOps];
+ if (!Op2.isReg())
+ return Error(Op2.getStartLoc(), Op2Diag);
+
+ unsigned RNext;
+ unsigned RPair;
+ switch (Op2.getReg()) {
+ default:
+ return Error(Op2.getStartLoc(), Op2Diag);
+ case ARM::R0:
+ RNext = ARM::R1;
+ RPair = ARM::R0_R1;
+ break;
+ case ARM::R2:
+ RNext = ARM::R3;
+ RPair = ARM::R2_R3;
+ break;
+ case ARM::R4:
+ RNext = ARM::R5;
+ RPair = ARM::R4_R5;
+ break;
+ case ARM::R6:
+ RNext = ARM::R7;
+ RPair = ARM::R6_R7;
+ break;
+ case ARM::R8:
+ RNext = ARM::R9;
+ RPair = ARM::R8_R9;
+ break;
+ case ARM::R10:
+ RNext = ARM::R11;
+ RPair = ARM::R10_R11;
+ break;
+ }
+
+ const MCParsedAsmOperand &Op3 = *Operands[3 + NumPredOps];
+ if (!Op3.isReg() || Op3.getReg() != RNext)
+ return Error(Op3.getStartLoc(), "operand must be a consecutive register");
+
+ Operands.erase(Operands.begin() + 3 + NumPredOps);
+ Operands[2 + NumPredOps] =
+ ARMOperand::CreateReg(RPair, Op2.getStartLoc(), Op2.getEndLoc());
+ return false;
+}
+
/// Parse an arm instruction mnemonic followed by its operands.
bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) {
@@ -6786,7 +6944,7 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// First check for the ARM-specific .req directive.
if (Parser.getTok().is(AsmToken::Identifier) &&
- Parser.getTok().getIdentifier() == ".req") {
+ Parser.getTok().getIdentifier().lower() == ".req") {
parseDirectiveReq(Name, NameLoc);
// We always return 'error' for this, as we're done with this
// statement and don't need to match the 'instruction."
@@ -6823,6 +6981,8 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// ITx -> x100 (ITT -> 0100, ITE -> 1100)
// ITxy -> xy10 (e.g. ITET -> 1010)
// ITxyz -> xyz1 (e.g. ITEET -> 1101)
+ // Note: See the ARM::PredBlockMask enum in
+ // /lib/Target/ARM/Utils/ARMBaseInfo.h
if (Mnemonic == "it" || Mnemonic.startswith("vpt") ||
Mnemonic.startswith("vpst")) {
SMLoc Loc = Mnemonic == "it" ? SMLoc::getFromPointer(NameLoc.getPointer() + 2) :
@@ -6969,6 +7129,21 @@ bool ARMAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
tryConvertingToTwoOperandForm(Mnemonic, CarrySetting, Operands);
+ if (hasCDE() && MS.isCDEInstr(Mnemonic)) {
+ // Dual-register instructions use even-odd register pairs as their
+ // destination operand; in assembly such a pair is spelled as two
+ // consecutive registers, without any special syntax.
+ // CDEConvertDualRegOperand tries to convert such an operand into a
+ // register pair, e.g. r2, r3 -> r2_r3. It returns true if an error
+ // message has been emitted. If it returns false, the conversion either
+ // succeeded or an error (e.g. a missing operand) will be diagnosed
+ // elsewhere.
+ if (MS.isCDEDualRegInstr(Mnemonic)) {
+ bool GotError = CDEConvertDualRegOperand(Mnemonic, Operands);
+ if (GotError)
+ return GotError;
+ }
+ }
+
// Some instructions, mostly Thumb, have forms for the same mnemonic that
// do and don't have a cc_out optional-def operand. With some spot-checks
// of the operand list, we can figure out which variant we're trying to
@@ -7947,6 +8122,142 @@ bool ARMAsmParser::validateInstruction(MCInst &Inst,
return Error (Operands[3]->getStartLoc(), "Q-register indexes must be 2 and 0 or 3 and 1");
break;
}
+ case ARM::UMAAL:
+ case ARM::UMLAL:
+ case ARM::UMULL:
+ case ARM::t2UMAAL:
+ case ARM::t2UMLAL:
+ case ARM::t2UMULL:
+ case ARM::SMLAL:
+ case ARM::SMLALBB:
+ case ARM::SMLALBT:
+ case ARM::SMLALD:
+ case ARM::SMLALDX:
+ case ARM::SMLALTB:
+ case ARM::SMLALTT:
+ case ARM::SMLSLD:
+ case ARM::SMLSLDX:
+ case ARM::SMULL:
+ case ARM::t2SMLAL:
+ case ARM::t2SMLALBB:
+ case ARM::t2SMLALBT:
+ case ARM::t2SMLALD:
+ case ARM::t2SMLALDX:
+ case ARM::t2SMLALTB:
+ case ARM::t2SMLALTT:
+ case ARM::t2SMLSLD:
+ case ARM::t2SMLSLDX:
+ case ARM::t2SMULL: {
+ unsigned RdHi = Inst.getOperand(0).getReg();
+ unsigned RdLo = Inst.getOperand(1).getReg();
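+ // For example, "umull r0, r0, r1, r2" writes both halves of the result
+ // to r0 and is rejected below.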
+ if (RdHi == RdLo) {
+ return Error(Loc,
+ "unpredictable instruction, RdHi and RdLo must be different");
+ }
+ break;
+ }
+
+ case ARM::CDE_CX1:
+ case ARM::CDE_CX1A:
+ case ARM::CDE_CX1D:
+ case ARM::CDE_CX1DA:
+ case ARM::CDE_CX2:
+ case ARM::CDE_CX2A:
+ case ARM::CDE_CX2D:
+ case ARM::CDE_CX2DA:
+ case ARM::CDE_CX3:
+ case ARM::CDE_CX3A:
+ case ARM::CDE_CX3D:
+ case ARM::CDE_CX3DA:
+ case ARM::CDE_VCX1_vec:
+ case ARM::CDE_VCX1_fpsp:
+ case ARM::CDE_VCX1_fpdp:
+ case ARM::CDE_VCX1A_vec:
+ case ARM::CDE_VCX1A_fpsp:
+ case ARM::CDE_VCX1A_fpdp:
+ case ARM::CDE_VCX2_vec:
+ case ARM::CDE_VCX2_fpsp:
+ case ARM::CDE_VCX2_fpdp:
+ case ARM::CDE_VCX2A_vec:
+ case ARM::CDE_VCX2A_fpsp:
+ case ARM::CDE_VCX2A_fpdp:
+ case ARM::CDE_VCX3_vec:
+ case ARM::CDE_VCX3_fpsp:
+ case ARM::CDE_VCX3_fpdp:
+ case ARM::CDE_VCX3A_vec:
+ case ARM::CDE_VCX3A_fpsp:
+ case ARM::CDE_VCX3A_fpdp: {
+ assert(Inst.getOperand(1).isImm() &&
+ "CDE operand 1 must be a coprocessor ID");
+ int64_t Coproc = Inst.getOperand(1).getImm();
+ if (Coproc < 8 && !ARM::isCDECoproc(Coproc, *STI))
+ return Error(Operands[1]->getStartLoc(),
+ "coprocessor must be configured as CDE");
+ else if (Coproc >= 8)
+ return Error(Operands[1]->getStartLoc(),
+ "coprocessor must be in the range [p0, p7]");
+ break;
+ }
+
+ case ARM::t2CDP:
+ case ARM::t2CDP2:
+ case ARM::t2LDC2L_OFFSET:
+ case ARM::t2LDC2L_OPTION:
+ case ARM::t2LDC2L_POST:
+ case ARM::t2LDC2L_PRE:
+ case ARM::t2LDC2_OFFSET:
+ case ARM::t2LDC2_OPTION:
+ case ARM::t2LDC2_POST:
+ case ARM::t2LDC2_PRE:
+ case ARM::t2LDCL_OFFSET:
+ case ARM::t2LDCL_OPTION:
+ case ARM::t2LDCL_POST:
+ case ARM::t2LDCL_PRE:
+ case ARM::t2LDC_OFFSET:
+ case ARM::t2LDC_OPTION:
+ case ARM::t2LDC_POST:
+ case ARM::t2LDC_PRE:
+ case ARM::t2MCR:
+ case ARM::t2MCR2:
+ case ARM::t2MCRR:
+ case ARM::t2MCRR2:
+ case ARM::t2MRC:
+ case ARM::t2MRC2:
+ case ARM::t2MRRC:
+ case ARM::t2MRRC2:
+ case ARM::t2STC2L_OFFSET:
+ case ARM::t2STC2L_OPTION:
+ case ARM::t2STC2L_POST:
+ case ARM::t2STC2L_PRE:
+ case ARM::t2STC2_OFFSET:
+ case ARM::t2STC2_OPTION:
+ case ARM::t2STC2_POST:
+ case ARM::t2STC2_PRE:
+ case ARM::t2STCL_OFFSET:
+ case ARM::t2STCL_OPTION:
+ case ARM::t2STCL_POST:
+ case ARM::t2STCL_PRE:
+ case ARM::t2STC_OFFSET:
+ case ARM::t2STC_OPTION:
+ case ARM::t2STC_POST:
+ case ARM::t2STC_PRE: {
+ unsigned Opcode = Inst.getOpcode();
+ // Inst.getOperand indexes operands in the (outs ...) and (ins ...) dags;
+ // CopInd is the index of the coprocessor operand.
+ size_t CopInd = 0;
+ if (Opcode == ARM::t2MRRC || Opcode == ARM::t2MRRC2)
+ CopInd = 2;
+ else if (Opcode == ARM::t2MRC || Opcode == ARM::t2MRC2)
+ CopInd = 1;
+ assert(Inst.getOperand(CopInd).isImm() &&
+ "Operand must be a coprocessor ID");
+ int64_t Coproc = Inst.getOperand(CopInd).getImm();
+ // Operands[2] is the coprocessor operand at the syntactic level.
+ if (ARM::isCDECoproc(Coproc, *STI))
+ return Error(Operands[2]->getStartLoc(),
+ "coprocessor must be configured as GCP");
+ break;
+ }
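+ // For instance, a generic coprocessor access such as
+ //   mrc p1, 0, r0, c0, c0, 0
+ // is rejected here if p1 has been configured as a CDE coprocessor
+ // (illustrative example; the other opcodes listed above behave the same).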
}
return false;
@@ -8223,50 +8534,6 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
}
switch (Inst.getOpcode()) {
- case ARM::MVE_VORNIZ0v4i32:
- case ARM::MVE_VORNIZ0v8i16:
- case ARM::MVE_VORNIZ8v4i32:
- case ARM::MVE_VORNIZ8v8i16:
- case ARM::MVE_VORNIZ16v4i32:
- case ARM::MVE_VORNIZ24v4i32:
- case ARM::MVE_VANDIZ0v4i32:
- case ARM::MVE_VANDIZ0v8i16:
- case ARM::MVE_VANDIZ8v4i32:
- case ARM::MVE_VANDIZ8v8i16:
- case ARM::MVE_VANDIZ16v4i32:
- case ARM::MVE_VANDIZ24v4i32: {
- unsigned Opcode;
- bool imm16 = false;
- switch(Inst.getOpcode()) {
- case ARM::MVE_VORNIZ0v4i32: Opcode = ARM::MVE_VORRIZ0v4i32; break;
- case ARM::MVE_VORNIZ0v8i16: Opcode = ARM::MVE_VORRIZ0v8i16; imm16 = true; break;
- case ARM::MVE_VORNIZ8v4i32: Opcode = ARM::MVE_VORRIZ8v4i32; break;
- case ARM::MVE_VORNIZ8v8i16: Opcode = ARM::MVE_VORRIZ8v8i16; imm16 = true; break;
- case ARM::MVE_VORNIZ16v4i32: Opcode = ARM::MVE_VORRIZ16v4i32; break;
- case ARM::MVE_VORNIZ24v4i32: Opcode = ARM::MVE_VORRIZ24v4i32; break;
- case ARM::MVE_VANDIZ0v4i32: Opcode = ARM::MVE_VBICIZ0v4i32; break;
- case ARM::MVE_VANDIZ0v8i16: Opcode = ARM::MVE_VBICIZ0v8i16; imm16 = true; break;
- case ARM::MVE_VANDIZ8v4i32: Opcode = ARM::MVE_VBICIZ8v4i32; break;
- case ARM::MVE_VANDIZ8v8i16: Opcode = ARM::MVE_VBICIZ8v8i16; imm16 = true; break;
- case ARM::MVE_VANDIZ16v4i32: Opcode = ARM::MVE_VBICIZ16v4i32; break;
- case ARM::MVE_VANDIZ24v4i32: Opcode = ARM::MVE_VBICIZ24v4i32; break;
- default: llvm_unreachable("unexpected opcode");
- }
-
- MCInst TmpInst;
- TmpInst.setOpcode(Opcode);
- TmpInst.addOperand(Inst.getOperand(0));
- TmpInst.addOperand(Inst.getOperand(1));
-
- // invert immediate
- unsigned imm = ~Inst.getOperand(2).getImm() & (imm16 ? 0xffff : 0xffffffff);
- TmpInst.addOperand(MCOperand::createImm(imm));
-
- TmpInst.addOperand(Inst.getOperand(3));
- TmpInst.addOperand(Inst.getOperand(4));
- Inst = TmpInst;
- return true;
- }
// Alias for alternate form of 'ldr{,b}t Rt, [Rn], #imm' instruction.
case ARM::LDRT_POST:
case ARM::LDRBT_POST: {
@@ -8285,6 +8552,26 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
Inst = TmpInst;
return true;
}
+ // Alias for 'ldr{sb,h,sh}t Rt, [Rn] {, #imm}' when the immediate is omitted.
+ case ARM::LDRSBTii:
+ case ARM::LDRHTii:
+ case ARM::LDRSHTii: {
+ MCInst TmpInst;
+
+ if (Inst.getOpcode() == ARM::LDRSBTii)
+ TmpInst.setOpcode(ARM::LDRSBTi);
+ else if (Inst.getOpcode() == ARM::LDRHTii)
+ TmpInst.setOpcode(ARM::LDRHTi);
+ else if (Inst.getOpcode() == ARM::LDRSHTii)
+ TmpInst.setOpcode(ARM::LDRSHTi);
+ TmpInst.addOperand(Inst.getOperand(0));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(Inst.getOperand(1));
+ TmpInst.addOperand(MCOperand::createImm(256));
+ TmpInst.addOperand(Inst.getOperand(2));
+ Inst = TmpInst;
+ return true;
+ }
// Alias for alternate form of 'str{,b}t Rt, [Rn], #imm' instruction.
case ARM::STRT_POST:
case ARM::STRBT_POST: {
@@ -8323,7 +8610,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst,
// Reading PC provides the start of the current instruction + 8 and
// the transform to adr is biased by that.
MCSymbol *Dot = getContext().createTempSymbol();
- Out.EmitLabel(Dot);
+ Out.emitLabel(Dot);
const MCExpr *OpExpr = Inst.getOperand(2).getExpr();
const MCExpr *InstPC = MCSymbolRefExpr::create(Dot,
MCSymbolRefExpr::VK_None,
@@ -10521,7 +10808,7 @@ bool ARMAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (isITBlockFull() || isITBlockTerminator(Inst))
flushPendingInstructions(Out);
} else {
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
}
return false;
case Match_NearMisses:
@@ -10546,7 +10833,7 @@ bool ARMAsmParser::ParseDirective(AsmToken DirectiveID) {
bool IsMachO = Format == MCObjectFileInfo::IsMachO;
bool IsCOFF = Format == MCObjectFileInfo::IsCOFF;
- StringRef IDVal = DirectiveID.getIdentifier();
+ std::string IDVal = DirectiveID.getIdentifier().lower();
if (IDVal == ".word")
parseLiteralValues(4, DirectiveID.getLoc());
else if (IDVal == ".short" || IDVal == ".hword")
@@ -10632,7 +10919,7 @@ bool ARMAsmParser::parseLiteralValues(unsigned Size, SMLoc L) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitValue(Value, Size, L);
+ getParser().getStreamer().emitValue(Value, Size, L);
return false;
};
return (parseMany(parseOne));
@@ -10648,7 +10935,7 @@ bool ARMAsmParser::parseDirectiveThumb(SMLoc L) {
if (!isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
return false;
}
@@ -10661,7 +10948,7 @@ bool ARMAsmParser::parseDirectiveARM(SMLoc L) {
if (isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
return false;
}
@@ -10673,7 +10960,7 @@ void ARMAsmParser::doBeforeLabelEmit(MCSymbol *Symbol) {
void ARMAsmParser::onLabelParsed(MCSymbol *Symbol) {
if (NextSymbolIsThumb) {
- getParser().getStreamer().EmitThumbFunc(Symbol);
+ getParser().getStreamer().emitThumbFunc(Symbol);
NextSymbolIsThumb = false;
}
}
@@ -10693,7 +10980,7 @@ bool ARMAsmParser::parseDirectiveThumbFunc(SMLoc L) {
Parser.getTok().is(AsmToken::String)) {
MCSymbol *Func = getParser().getContext().getOrCreateSymbol(
Parser.getTok().getIdentifier());
- getParser().getStreamer().EmitThumbFunc(Func);
+ getParser().getStreamer().emitThumbFunc(Func);
Parser.Lex();
if (parseToken(AsmToken::EndOfStatement,
"unexpected token in '.thumb_func' directive"))
@@ -10757,14 +11044,14 @@ bool ARMAsmParser::parseDirectiveCode(SMLoc L) {
if (!isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
} else {
if (!hasARM())
return Error(L, "target does not support ARM mode");
if (isThumb())
SwitchMode();
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
}
return false;
@@ -10817,7 +11104,7 @@ void ARMAsmParser::FixModeAfterArchChange(bool WasThumb, SMLoc Loc) {
SwitchMode();
} else {
// Mode switch forced, because the new arch doesn't support the old mode.
- getParser().getStreamer().EmitAssemblerFlag(isThumb() ? MCAF_Code16
+ getParser().getStreamer().emitAssemblerFlag(isThumb() ? MCAF_Code16
: MCAF_Code32);
// Warn about the implicit mode switch. GAS does not switch modes here,
// but instead stays in the old mode, reporting an error on any following
@@ -10859,11 +11146,13 @@ bool ARMAsmParser::parseDirectiveEabiAttr(SMLoc L) {
TagLoc = Parser.getTok().getLoc();
if (Parser.getTok().is(AsmToken::Identifier)) {
StringRef Name = Parser.getTok().getIdentifier();
- Tag = ARMBuildAttrs::AttrTypeFromString(Name);
- if (Tag == -1) {
+ Optional<unsigned> Ret =
+ ELFAttrs::attrTypeFromString(Name, ARMBuildAttrs::ARMAttributeTags);
+ if (!Ret.hasValue()) {
Error(TagLoc, "attribute name not recognised: " + Name);
return false;
}
+ Tag = Ret.getValue();
Parser.Lex();
} else {
const MCExpr *AttrExpr;
@@ -11314,9 +11603,9 @@ bool ARMAsmParser::parseDirectiveEven(SMLoc L) {
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign())
- getStreamer().EmitCodeAlignment(2);
+ getStreamer().emitCodeAlignment(2);
else
- getStreamer().EmitValueToAlignment(2);
+ getStreamer().emitValueToAlignment(2);
return false;
}
@@ -11516,9 +11805,9 @@ bool ARMAsmParser::parseDirectiveAlign(SMLoc L) {
const MCSection *Section = getStreamer().getCurrentSectionOnly();
assert(Section && "must have section to emit alignment");
if (Section->UseCodeAlign())
- getStreamer().EmitCodeAlignment(4, 0);
+ getStreamer().emitCodeAlignment(4, 0);
else
- getStreamer().EmitValueToAlignment(4, 0, 1, 0);
+ getStreamer().emitValueToAlignment(4, 0, 1, 0);
return false;
}
return true;
@@ -11770,7 +12059,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
// when we start to table-generate them, and we can use the ARM
// flags below, that were generated by table-gen.
static const struct {
- const unsigned Kind;
+ const uint64_t Kind;
const FeatureBitset ArchCheck;
const FeatureBitset Features;
} Extensions[] = {
@@ -11819,7 +12108,7 @@ bool ARMAsmParser::parseDirectiveArchExtension(SMLoc L) {
EnableFeature = false;
Name = Name.substr(2);
}
- unsigned FeatureKind = ARM::parseArchExt(Name);
+ uint64_t FeatureKind = ARM::parseArchExt(Name);
if (FeatureKind == ARM::AEK_INVALID)
return Error(ExtLoc, "unknown architectural extension: " + Name);
@@ -11969,6 +12258,7 @@ bool ARMAsmParser::isMnemonicVPTPredicable(StringRef Mnemonic,
Mnemonic.startswith("vpnot") || Mnemonic.startswith("vbic") ||
Mnemonic.startswith("vrmlsldavh") || Mnemonic.startswith("vmlsldav") ||
Mnemonic.startswith("vcvt") ||
+ MS.isVPTPredicableCDEInstr(Mnemonic) ||
(Mnemonic.startswith("vmov") &&
!(ExtraToken == ".f16" || ExtraToken == ".32" ||
ExtraToken == ".16" || ExtraToken == ".8"));
diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
index d26b04556abb..54ff0d9966cb 100644
--- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
+++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp
@@ -182,6 +182,9 @@ static DecodeStatus DecodetGPROddRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus
+DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRnopcRegisterClass(MCInst &Inst,
unsigned RegNo, uint64_t Address,
const void *Decoder);
@@ -201,6 +204,8 @@ static DecodeStatus DecoderGPRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder);
static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder);
@@ -538,10 +543,6 @@ template<unsigned MinLog, unsigned MaxLog>
static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
const void *Decoder);
-template <int shift>
-static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder);
template<unsigned start>
static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
@@ -1087,8 +1088,12 @@ DecodeStatus ARMDisassembler::getThumbInstruction(MCInst &MI, uint64_t &Size,
}
}
+ uint32_t Coproc = fieldFromInstruction(Insn32, 8, 4);
+ const uint8_t *DecoderTable = ARM::isCDECoproc(Coproc, STI)
+ ? DecoderTableThumb2CDE32
+ : DecoderTableThumb2CoProc32;
Result =
- decodeInstruction(DecoderTableThumb2CoProc32, MI, Insn32, Address, this, STI);
+ decodeInstruction(DecoderTable, MI, Insn32, Address, this, STI);
if (Result != MCDisassembler::Fail) {
Size = 4;
Check(Result, AddThumbPredicate(MI));
@@ -1220,10 +1225,12 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
+ // According to the Arm ARM, RegNo = 14 is undefined, but we return Fail
+ // rather than SoftFail as there is no GPRPair table entry for index 7.
if (RegNo > 13)
return MCDisassembler::Fail;
- if ((RegNo & 1) || RegNo == 0xe)
+ if (RegNo & 1)
S = MCDisassembler::SoftFail;
unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
@@ -1231,6 +1238,19 @@ static DecodeStatus DecodeGPRPairRegisterClass(MCInst &Inst, unsigned RegNo,
return S;
}
+static DecodeStatus DecodeGPRPairnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
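+ // Sketch of the decode behaviour: RegNo = 4 yields the R4_R5 pair, while an
+ // odd RegNo or one above 10 still decodes to a pair but is reported as
+ // SoftFail, matching the "even-numbered register in [r0, r10]" restriction
+ // used by the assembler.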
+ if (RegNo > 13)
+ return MCDisassembler::Fail;
+
+ unsigned RegisterPair = GPRPairDecoderTable[RegNo/2];
+ Inst.addOperand(MCOperand::createReg(RegisterPair));
+
+ if ((RegNo & 1) || RegNo > 10)
+ return MCDisassembler::SoftFail;
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeGPRspRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address,
const void *Decoder) {
@@ -6068,6 +6088,23 @@ static DecodeStatus DecodetGPREvenRegisterClass(MCInst &Inst, unsigned RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus
+DecodeGPRwithAPSR_NZCVnospRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address, const void *Decoder) {
+ if (RegNo == 15) {
+ Inst.addOperand(MCOperand::createReg(ARM::APSR_NZCV));
+ return MCDisassembler::Success;
+ }
+
+ unsigned Register = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
+
+ if (RegNo == 13)
+ return MCDisassembler::SoftFail;
+
+ return MCDisassembler::Success;
+}
+
static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address,
const void *Decoder) {
DecodeStatus S = MCDisassembler::Success;
@@ -6395,16 +6432,6 @@ static DecodeStatus DecodePowerTwoOperand(MCInst &Inst, unsigned Val,
return S;
}
-template <int shift>
-static DecodeStatus DecodeExpandedImmOperand(MCInst &Inst, unsigned Val,
- uint64_t Address,
- const void *Decoder) {
- Val <<= shift;
-
- Inst.addOperand(MCOperand::createImm(Val));
- return MCDisassembler::Success;
-}
-
template<unsigned start>
static DecodeStatus DecodeMVEPairVectorIndexOperand(MCInst &Inst, unsigned Val,
uint64_t Address,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
index 6196881a9b8f..9ad595c016c4 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp
@@ -48,38 +48,43 @@ public:
} // end anonymous namespace
Optional<MCFixupKind> ARMAsmBackend::getFixupKind(StringRef Name) const {
- if (STI.getTargetTriple().isOSBinFormatELF() && Name == "R_ARM_NONE")
- return FK_NONE;
-
- return MCAsmBackend::getFixupKind(Name);
+ if (!STI.getTargetTriple().isOSBinFormatELF())
+ return None;
+
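+ // This lets a .reloc directive name any ARM relocation directly, e.g.
+ // (illustrative):
+ //   .reloc ., R_ARM_NONE, foo
+ //   .reloc ., R_ARM_TLS_DESCSEQ, bar
+ // The name is mapped onto FirstLiteralRelocationKind + <ELF reloc type>.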
+ unsigned Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/ARM.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ unsigned IsPCRelConstant =
+ MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_Constant;
const static MCFixupKindInfo InfosLE[ARM::NumTargetFixupKinds] = {
// This table *must* be in the order that the fixup_* kinds are defined in
// ARMFixupKinds.h.
//
// Name Offset (bits) Size (bits) Flags
- {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_ldst_pcrel_12", 0, 32, IsPCRelConstant},
{"fixup_t2_ldst_pcrel_12", 0, 32,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
- {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_10_unscaled", 0, 32, IsPCRelConstant},
+ {"fixup_arm_pcrel_10", 0, 32, IsPCRelConstant},
{"fixup_t2_pcrel_10", 0, 32,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_t2_pcrel_9", 0, 32,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_thumb_adr_pcrel_10", 0, 8,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
- {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_adr_pcrel_12", 0, 32, IsPCRelConstant},
{"fixup_t2_adr_pcrel_12", 0, 32,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_uncondbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
@@ -118,26 +123,22 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
// ARMFixupKinds.h.
//
// Name Offset (bits) Size (bits) Flags
- {"fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_arm_ldst_pcrel_12", 0, 32, IsPCRelConstant},
{"fixup_t2_ldst_pcrel_12", 0, 32,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
- {"fixup_arm_pcrel_10_unscaled", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
- {"fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_pcrel_10_unscaled", 0, 32, IsPCRelConstant},
+ {"fixup_arm_pcrel_10", 0, 32, IsPCRelConstant},
{"fixup_t2_pcrel_10", 0, 32,
MCFixupKindInfo::FKF_IsPCRel |
MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_pcrel_9", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_t2_pcrel_9", 0, 32,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_thumb_adr_pcrel_10", 8, 8,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
- {"fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ {"fixup_arm_adr_pcrel_12", 0, 32, IsPCRelConstant},
{"fixup_t2_adr_pcrel_12", 0, 32,
- MCFixupKindInfo::FKF_IsPCRel |
- MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
+ IsPCRelConstant | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits},
{"fixup_arm_condbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_arm_uncondbranch", 8, 24, MCFixupKindInfo::FKF_IsPCRel},
{"fixup_t2_condbranch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
@@ -172,6 +173,11 @@ const MCFixupKindInfo &ARMAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_le", 0, 32, MCFixupKindInfo::FKF_IsPCRel}
};
+ // Fixup kinds from the .reloc directive are like R_ARM_NONE. They do not
+ // require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -316,9 +322,8 @@ bool ARMAsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
return reasonForFixupRelaxation(Fixup, Value);
}
-void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void ARMAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
unsigned RelaxedOp = getRelaxedOpcode(Inst.getOpcode(), STI);
// Sanity check w/ diagnostic if we get here w/ a bogus instruction.
@@ -334,17 +339,18 @@ void ARMAsmBackend::relaxInstruction(const MCInst &Inst,
// have to change the operands too.
if ((Inst.getOpcode() == ARM::tCBZ || Inst.getOpcode() == ARM::tCBNZ) &&
RelaxedOp == ARM::tHINT) {
+ MCInst Res;
Res.setOpcode(RelaxedOp);
Res.addOperand(MCOperand::createImm(0));
Res.addOperand(MCOperand::createImm(14));
Res.addOperand(MCOperand::createReg(0));
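+ // (The operands built here correspond to "hint #0", i.e. a NOP, with an
+ // AL predicate (imm 14) and no CPSR register operand.)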
+ Inst = std::move(Res);
return;
}
// The rest of instructions we're relaxing have the same operands.
// We just need to update to the proper opcode.
- Res = Inst;
- Res.setOpcode(RelaxedOp);
+ Inst.setOpcode(RelaxedOp);
}
bool ARMAsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
@@ -438,7 +444,6 @@ unsigned ARMAsmBackend::adjustFixupValue(const MCAssembler &Asm,
default:
Ctx.reportError(Fixup.getLoc(), "bad relocation fixup type");
return 0;
- case FK_NONE:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -871,7 +876,7 @@ bool ARMAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCSymbolRefExpr *A = Target.getSymA();
const MCSymbol *Sym = A ? &A->getSymbol() : nullptr;
const unsigned FixupKind = Fixup.getKind();
- if (FixupKind == FK_NONE)
+ if (FixupKind >= FirstLiteralRelocationKind)
return true;
if (FixupKind == ARM::fixup_arm_thumb_bl) {
assert(Sym && "How did we resolve this?");
@@ -915,9 +920,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
- return 0;
-
case FK_Data_1:
case ARM::fixup_arm_thumb_bcc:
case ARM::fixup_arm_thumb_cp:
@@ -979,9 +981,6 @@ static unsigned getFixupKindContainerSizeBytes(unsigned Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
- return 0;
-
case FK_Data_1:
return 1;
case FK_Data_2:
@@ -1037,7 +1036,10 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo* STI) const {
- unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned NumBytes = getFixupKindNumBytes(Kind);
MCContext &Ctx = Asm.getContext();
Value = adjustFixupValue(Asm, Fixup, Target, Value, IsResolved, Ctx, STI);
if (!Value)
@@ -1049,7 +1051,7 @@ void ARMAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
// Used to point to big endian bytes.
unsigned FullSizeBytes;
if (Endian == support::big) {
- FullSizeBytes = getFixupKindContainerSizeBytes(Fixup.getKind());
+ FullSizeBytes = getFixupKindContainerSizeBytes(Kind);
assert((Offset + FullSizeBytes) <= Data.size() && "Invalid fixup size!");
assert(NumBytes <= FullSizeBytes && "Invalid fixup size!");
}
@@ -1116,11 +1118,11 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
const MCCFIInstruction &Inst = Instrs[i];
switch (Inst.getOperation()) {
case MCCFIInstruction::OpDefCfa: // DW_CFA_def_cfa
- CFARegisterOffset = -Inst.getOffset();
+ CFARegisterOffset = Inst.getOffset();
CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
break;
case MCCFIInstruction::OpDefCfaOffset: // DW_CFA_def_cfa_offset
- CFARegisterOffset = -Inst.getOffset();
+ CFARegisterOffset = Inst.getOffset();
break;
case MCCFIInstruction::OpDefCfaRegister: // DW_CFA_def_cfa_register
CFARegister = *MRI.getLLVMRegNum(Inst.getRegister(), true);
@@ -1277,35 +1279,6 @@ uint32_t ARMAsmBackendDarwin::generateCompactUnwindEncoding(
return CompactUnwindEncoding | ((FloatRegCount - 1) << 8);
}
-static MachO::CPUSubTypeARM getMachOSubTypeFromArch(StringRef Arch) {
- ARM::ArchKind AK = ARM::parseArch(Arch);
- switch (AK) {
- default:
- return MachO::CPU_SUBTYPE_ARM_V7;
- case ARM::ArchKind::ARMV4T:
- return MachO::CPU_SUBTYPE_ARM_V4T;
- case ARM::ArchKind::ARMV5T:
- case ARM::ArchKind::ARMV5TE:
- case ARM::ArchKind::ARMV5TEJ:
- return MachO::CPU_SUBTYPE_ARM_V5;
- case ARM::ArchKind::ARMV6:
- case ARM::ArchKind::ARMV6K:
- return MachO::CPU_SUBTYPE_ARM_V6;
- case ARM::ArchKind::ARMV7A:
- return MachO::CPU_SUBTYPE_ARM_V7;
- case ARM::ArchKind::ARMV7S:
- return MachO::CPU_SUBTYPE_ARM_V7S;
- case ARM::ArchKind::ARMV7K:
- return MachO::CPU_SUBTYPE_ARM_V7K;
- case ARM::ArchKind::ARMV6M:
- return MachO::CPU_SUBTYPE_ARM_V6M;
- case ARM::ArchKind::ARMV7M:
- return MachO::CPU_SUBTYPE_ARM_V7M;
- case ARM::ArchKind::ARMV7EM:
- return MachO::CPU_SUBTYPE_ARM_V7EM;
- }
-}
-
static MCAsmBackend *createARMAsmBackend(const Target &T,
const MCSubtargetInfo &STI,
const MCRegisterInfo &MRI,
@@ -1315,10 +1288,8 @@ static MCAsmBackend *createARMAsmBackend(const Target &T,
switch (TheTriple.getObjectFormat()) {
default:
llvm_unreachable("unsupported object format");
- case Triple::MachO: {
- MachO::CPUSubTypeARM CS = getMachOSubTypeFromArch(TheTriple.getArchName());
- return new ARMAsmBackendDarwin(T, STI, MRI, CS);
- }
+ case Triple::MachO:
+ return new ARMAsmBackendDarwin(T, STI, MRI);
case Triple::COFF:
assert(TheTriple.isOSWindows() && "non-Windows ARM COFF is not supported");
return new ARMAsmBackendWinCOFF(T, STI);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
index 67722a5e5b64..38c7b30769b3 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.h
@@ -66,8 +66,8 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
index 87e56940f46d..e27bb134670f 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMAsmBackendDarwin.h
@@ -16,16 +16,20 @@
namespace llvm {
class ARMAsmBackendDarwin : public ARMAsmBackend {
const MCRegisterInfo &MRI;
+ Triple TT;
public:
const MachO::CPUSubTypeARM Subtype;
ARMAsmBackendDarwin(const Target &T, const MCSubtargetInfo &STI,
- const MCRegisterInfo &MRI, MachO::CPUSubTypeARM st)
- : ARMAsmBackend(T, STI, support::little), MRI(MRI), Subtype(st) {}
+ const MCRegisterInfo &MRI)
+ : ARMAsmBackend(T, STI, support::little), MRI(MRI),
+ TT(STI.getTargetTriple()),
+ Subtype((MachO::CPUSubTypeARM)cantFail(
+ MachO::getCPUSubType(STI.getTargetTriple()))) {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
- return createARMMachObjectWriter(/*Is64Bit=*/false, MachO::CPU_TYPE_ARM,
- Subtype);
+ return createARMMachObjectWriter(
+ /*Is64Bit=*/false, cantFail(MachO::getCPUType(TT)), Subtype);
}
uint32_t generateCompactUnwindEncoding(
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
index 6293a2462306..74cd2e681ded 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMBaseInfo.h
@@ -393,9 +393,21 @@ namespace ARMII {
// in an IT block).
ThumbArithFlagSetting = 1 << 19,
- // Whether an instruction can be included in an MVE tail-predicated loop.
+ // Whether an instruction can be included in an MVE tail-predicated loop,
+ // though extra validity checks may need to be performed too.
ValidForTailPredication = 1 << 20,
+ // Whether an instruction writes to the top/bottom half of a vector element
+ // and leaves the other half untouched.
+ RetainsPreviousHalfElement = 1 << 21,
+
+ // Whether the instruction produces a scalar result from vector operands.
+ HorizontalReduction = 1 << 22,
+
+ // Whether this instruction produces a vector result that is larger than
+ // its input, typically reading from the top/bottom halves of the input(s).
+ DoubleWidthResult = 1 << 23,
+
//===------------------------------------------------------------------===//
// Code domain.
DomainShift = 15,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
index 2c26dd388c05..37d81e4b0af1 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFObjectWriter.cpp
@@ -53,8 +53,8 @@ ARMELFObjectWriter::ARMELFObjectWriter(uint8_t OSABI)
bool ARMELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
unsigned Type) const {
- // FIXME: This is extremely conservative. This really needs to use a
- // whitelist with a clear explanation for why each realocation needs to
+ // FIXME: This is extremely conservative. This really needs to use an
+ // explicit list with a clear explanation for why each relocation needs to
// point to the symbol, not to the section.
switch (Type) {
default:
@@ -79,6 +79,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel,
MCContext &Ctx) const {
+ unsigned Kind = Fixup.getTargetKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
if (IsPCRel) {
@@ -89,9 +92,18 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case FK_Data_4:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
- case MCSymbolRefExpr::VK_None:
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 4-byte pc-relative data relocation");
+ return ELF::R_ARM_NONE;
+ case MCSymbolRefExpr::VK_None: {
+ if (const MCSymbolRefExpr *SymRef = Target.getSymA()) {
+ // For GNU as compatibility, expressions such as
+ // _GLOBAL_OFFSET_TABLE_ - label emit an R_ARM_BASE_PREL relocation.
+ if (SymRef->getSymbol().getName() == "_GLOBAL_OFFSET_TABLE_")
+ return ELF::R_ARM_BASE_PREL;
+ }
return ELF::R_ARM_REL32;
+ }
case MCSymbolRefExpr::VK_GOTTPOFF:
return ELF::R_ARM_TLS_IE32;
case MCSymbolRefExpr::VK_ARM_GOT_PREL:
@@ -145,30 +157,34 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
return ELF::R_ARM_THM_BF18;
}
}
- switch (Fixup.getTargetKind()) {
+ switch (Kind) {
default:
Ctx.reportFatalError(Fixup.getLoc(), "unsupported relocation on symbol");
return ELF::R_ARM_NONE;
- case FK_NONE:
- return ELF::R_ARM_NONE;
case FK_Data_1:
switch (Modifier) {
default:
- llvm_unreachable("unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 1-byte data relocation");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_ABS8;
}
case FK_Data_2:
switch (Modifier) {
default:
- llvm_unreachable("unsupported modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 2-byte data relocation");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_ABS16;
}
case FK_Data_4:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for 4-byte data relocation");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_ARM_NONE:
return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_GOT:
@@ -210,7 +226,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_movt_hi16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(), "invalid fixup for ARM MOVT instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_MOVT_ABS;
case MCSymbolRefExpr::VK_ARM_SBREL:
@@ -219,7 +236,8 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_arm_movw_lo16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(), "invalid fixup for ARM MOVW instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_MOVW_ABS_NC;
case MCSymbolRefExpr::VK_ARM_SBREL:
@@ -228,7 +246,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_t2_movt_hi16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for Thumb MOVT instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_THM_MOVT_ABS;
case MCSymbolRefExpr::VK_ARM_SBREL:
@@ -237,7 +257,9 @@ unsigned ARMELFObjectWriter::GetRelocTypeInner(const MCValue &Target,
case ARM::fixup_t2_movw_lo16:
switch (Modifier) {
default:
- llvm_unreachable("Unsupported Modifier");
+ Ctx.reportError(Fixup.getLoc(),
+ "invalid fixup for Thumb MOVW instruction");
+ return ELF::R_ARM_NONE;
case MCSymbolRefExpr::VK_None:
return ELF::R_ARM_THM_MOVW_ABS_NC;
case MCSymbolRefExpr::VK_ARM_SBREL:
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
index f558ca8d2d9f..876741d6c343 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMELFStreamer.cpp
@@ -93,7 +93,7 @@ class ARMTargetAsmStreamer : public ARMTargetStreamer {
void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
StringRef StringValue) override;
void emitArch(ARM::ArchKind Arch) override;
- void emitArchExtension(unsigned ArchExt) override;
+ void emitArchExtension(uint64_t ArchExt) override;
void emitObjectArch(ARM::ArchKind Arch) override;
void emitFPU(unsigned FPU) override;
void emitInst(uint32_t Inst, char Suffix = '\0') override;
@@ -177,7 +177,8 @@ void ARMTargetAsmStreamer::switchVendor(StringRef Vendor) {}
void ARMTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
OS << "\t.eabi_attribute\t" << Attribute << ", " << Twine(Value);
if (IsVerboseAsm) {
- StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ StringRef Name =
+ ELFAttrs::attrTypeAsString(Attribute, ARMBuildAttrs::ARMAttributeTags);
if (!Name.empty())
OS << "\t@ " << Name;
}
@@ -193,7 +194,8 @@ void ARMTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
default:
OS << "\t.eabi_attribute\t" << Attribute << ", \"" << String << "\"";
if (IsVerboseAsm) {
- StringRef Name = ARMBuildAttrs::AttrTypeAsString(Attribute);
+ StringRef Name = ELFAttrs::attrTypeAsString(
+ Attribute, ARMBuildAttrs::ARMAttributeTags);
if (!Name.empty())
OS << "\t@ " << Name;
}
@@ -212,7 +214,9 @@ void ARMTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
if (!StringValue.empty())
OS << ", \"" << StringValue << "\"";
if (IsVerboseAsm)
- OS << "\t@ " << ARMBuildAttrs::AttrTypeAsString(Attribute);
+ OS << "\t@ "
+ << ELFAttrs::attrTypeAsString(Attribute,
+ ARMBuildAttrs::ARMAttributeTags);
break;
}
OS << "\n";
@@ -222,7 +226,7 @@ void ARMTargetAsmStreamer::emitArch(ARM::ArchKind Arch) {
OS << "\t.arch\t" << ARM::getArchName(Arch) << "\n";
}
-void ARMTargetAsmStreamer::emitArchExtension(unsigned ArchExt) {
+void ARMTargetAsmStreamer::emitArchExtension(uint64_t ArchExt) {
OS << "\t.arch_extension\t" << ARM::getArchExtName(ArchExt) << "\n";
}
@@ -238,7 +242,7 @@ void ARMTargetAsmStreamer::finishAttributeSection() {}
void
ARMTargetAsmStreamer::AnnotateTLSDescriptorSequence(const MCSymbolRefExpr *S) {
- OS << "\t.tlsdescseq\t" << S->getSymbol().getName();
+ OS << "\t.tlsdescseq\t" << S->getSymbol().getName() << "\n";
}
void ARMTargetAsmStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
@@ -328,12 +332,8 @@ private:
}
// Create new attribute item
- AttributeItem Item = {
- AttributeItem::NumericAttribute,
- Attribute,
- Value,
- StringRef("")
- };
+ AttributeItem Item = {AttributeItem::NumericAttribute, Attribute, Value,
+ std::string(StringRef(""))};
Contents.push_back(Item);
}
@@ -344,17 +344,13 @@ private:
if (!OverwriteExisting)
return;
Item->Type = AttributeItem::TextAttribute;
- Item->StringValue = Value;
+ Item->StringValue = std::string(Value);
return;
}
// Create new attribute item
- AttributeItem Item = {
- AttributeItem::TextAttribute,
- Attribute,
- 0,
- Value
- };
+ AttributeItem Item = {AttributeItem::TextAttribute, Attribute, 0,
+ std::string(Value)};
Contents.push_back(Item);
}
@@ -366,17 +362,13 @@ private:
return;
Item->Type = AttributeItem::NumericAndTextAttributes;
Item->IntValue = IntValue;
- Item->StringValue = StringValue;
+ Item->StringValue = std::string(StringValue);
return;
}
// Create new attribute item
- AttributeItem Item = {
- AttributeItem::NumericAndTextAttributes,
- Attribute,
- IntValue,
- StringValue
- };
+ AttributeItem Item = {AttributeItem::NumericAndTextAttributes, Attribute,
+ IntValue, std::string(StringValue)};
Contents.push_back(Item);
}
@@ -452,7 +444,7 @@ public:
~ARMELFStreamer() override = default;
- void FinishImpl() override;
+ void finishImpl() override;
// ARM exception handling directives
void emitFnStart();
@@ -468,13 +460,13 @@ public:
void emitUnwindRaw(int64_t Offset, const SmallVectorImpl<uint8_t> &Opcodes);
void emitFill(const MCExpr &NumBytes, uint64_t FillValue,
SMLoc Loc) override {
- EmitDataMappingSymbol();
+ emitDataMappingSymbol();
MCObjectStreamer::emitFill(NumBytes, FillValue, Loc);
}
- void ChangeSection(MCSection *Section, const MCExpr *Subsection) override {
+ void changeSection(MCSection *Section, const MCExpr *Subsection) override {
LastMappingSymbols[getCurrentSection().first] = std::move(LastEMSInfo);
- MCELFStreamer::ChangeSection(Section, Subsection);
+ MCELFStreamer::changeSection(Section, Subsection);
auto LastMappingSymbol = LastMappingSymbols.find(Section);
if (LastMappingSymbol != LastMappingSymbols.end()) {
LastEMSInfo = std::move(LastMappingSymbol->second);
@@ -486,14 +478,14 @@ public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to add the appropriate mapping symbol if
/// necessary.
- void EmitInstruction(const MCInst &Inst,
+ void emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) override {
if (IsThumb)
EmitThumbMappingSymbol();
else
EmitARMMappingSymbol();
- MCELFStreamer::EmitInstruction(Inst, STI);
+ MCELFStreamer::emitInstruction(Inst, STI);
}
void emitInst(uint32_t Inst, char Suffix) {
@@ -533,15 +525,15 @@ public:
llvm_unreachable("Invalid Suffix");
}
- MCELFStreamer::EmitBytes(StringRef(Buffer, Size));
+ MCELFStreamer::emitBytes(StringRef(Buffer, Size));
}
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- void EmitBytes(StringRef Data) override {
- EmitDataMappingSymbol();
- MCELFStreamer::EmitBytes(Data);
+ void emitBytes(StringRef Data) override {
+ emitDataMappingSymbol();
+ MCELFStreamer::emitBytes(Data);
}
void FlushPendingMappingSymbol() {
@@ -555,7 +547,7 @@ public:
/// This is one of the functions used to emit data into an ELF section, so the
/// ARM streamer overrides it to add the appropriate mapping symbol ($d) if
/// necessary.
- void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
+ void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override {
if (const MCSymbolRefExpr *SRE = dyn_cast_or_null<MCSymbolRefExpr>(Value)) {
if (SRE->getKind() == MCSymbolRefExpr::VK_ARM_SBREL && !(Size == 4)) {
getContext().reportError(Loc, "relocated expression must be 32-bit");
@@ -564,12 +556,12 @@ public:
getOrCreateDataFragment();
}
- EmitDataMappingSymbol();
- MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ emitDataMappingSymbol();
+ MCELFStreamer::emitValueImpl(Value, Size, Loc);
}
- void EmitAssemblerFlag(MCAssemblerFlag Flag) override {
- MCELFStreamer::EmitAssemblerFlag(Flag);
+ void emitAssemblerFlag(MCAssemblerFlag Flag) override {
+ MCELFStreamer::emitAssemblerFlag(Flag);
switch (Flag) {
case MCAF_SyntaxUnified:
@@ -609,7 +601,7 @@ private:
ElfMappingSymbol State;
};
- void EmitDataMappingSymbol() {
+ void emitDataMappingSymbol() {
if (LastEMSInfo->State == EMS_Data)
return;
else if (LastEMSInfo->State == EMS_None) {
@@ -648,7 +640,7 @@ private:
void EmitMappingSymbol(StringRef Name) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabel(Symbol);
+ emitLabel(Symbol);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
@@ -659,15 +651,15 @@ private:
uint64_t Offset) {
auto *Symbol = cast<MCSymbolELF>(getContext().getOrCreateSymbol(
Name + "." + Twine(MappingSymbolCounter++)));
- EmitLabelAtPos(Symbol, Loc, F, Offset);
+ emitLabelAtPos(Symbol, Loc, F, Offset);
Symbol->setType(ELF::STT_NOTYPE);
Symbol->setBinding(ELF::STB_LOCAL);
Symbol->setExternal(false);
}
- void EmitThumbFunc(MCSymbol *Func) override {
+ void emitThumbFunc(MCSymbol *Func) override {
getAssembler().setIsThumbFunc(Func);
- EmitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
+ emitSymbolAttribute(Func, MCSA_ELF_TypeFunction);
}
// Helper functions for ARM exception handling directives
@@ -868,6 +860,7 @@ void ARMTargetELFStreamer::emitArchDefaultAttributes() {
case ARM::ArchKind::ARMV8_3A:
case ARM::ArchKind::ARMV8_4A:
case ARM::ArchKind::ARMV8_5A:
+ case ARM::ArchKind::ARMV8_6A:
setAttributeItem(CPU_arch_profile, ApplicationProfile, false);
setAttributeItem(ARM_ISA_use, Allowed, false);
setAttributeItem(THUMB_ISA_use, AllowThumb32, false);
@@ -1091,7 +1084,7 @@ void ARMTargetELFStreamer::finishAttributeSection() {
Streamer.SwitchSection(AttributeSection);
// Format version
- Streamer.EmitIntValue(0x41, 1);
+ Streamer.emitInt8(0x41);
}
// Vendor size + Vendor name + '\0'
@@ -1102,31 +1095,31 @@ void ARMTargetELFStreamer::finishAttributeSection() {
const size_t ContentsSize = calculateContentSize();
- Streamer.EmitIntValue(VendorHeaderSize + TagHeaderSize + ContentsSize, 4);
- Streamer.EmitBytes(CurrentVendor);
- Streamer.EmitIntValue(0, 1); // '\0'
+ Streamer.emitInt32(VendorHeaderSize + TagHeaderSize + ContentsSize);
+ Streamer.emitBytes(CurrentVendor);
+ Streamer.emitInt8(0); // '\0'
- Streamer.EmitIntValue(ARMBuildAttrs::File, 1);
- Streamer.EmitIntValue(TagHeaderSize + ContentsSize, 4);
+ Streamer.emitInt8(ARMBuildAttrs::File);
+ Streamer.emitInt32(TagHeaderSize + ContentsSize);
// Size should have been accounted for already, now
// emit each field as its type (ULEB or String)
for (size_t i = 0; i < Contents.size(); ++i) {
AttributeItem item = Contents[i];
- Streamer.EmitULEB128IntValue(item.Tag);
+ Streamer.emitULEB128IntValue(item.Tag);
switch (item.Type) {
default: llvm_unreachable("Invalid attribute type");
case AttributeItem::NumericAttribute:
- Streamer.EmitULEB128IntValue(item.IntValue);
+ Streamer.emitULEB128IntValue(item.IntValue);
break;
case AttributeItem::TextAttribute:
- Streamer.EmitBytes(item.StringValue);
- Streamer.EmitIntValue(0, 1); // '\0'
+ Streamer.emitBytes(item.StringValue);
+ Streamer.emitInt8(0); // '\0'
break;
case AttributeItem::NumericAndTextAttributes:
- Streamer.EmitULEB128IntValue(item.IntValue);
- Streamer.EmitBytes(item.StringValue);
- Streamer.EmitIntValue(0, 1); // '\0'
+ Streamer.emitULEB128IntValue(item.IntValue);
+ Streamer.emitBytes(item.StringValue);
+ Streamer.emitInt8(0); // '\0'
break;
}
}
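// Editorial sketch (not part of the patch): the attribute loop above writes each
// tag and numeric value with the streamer's ULEB128 helper. As a minimal,
// standalone illustration of that encoding (names invented for illustration,
// not taken from this patch), each output byte carries 7 value bits and the
// top bit marks continuation:
#include <cstdint>
#include <vector>

static std::vector<uint8_t> encodeULEB128Sketch(uint64_t Value) {
  std::vector<uint8_t> Out;
  do {
    uint8_t Byte = Value & 0x7f; // low 7 bits of the value
    Value >>= 7;
    if (Value != 0)
      Byte |= 0x80;              // more bytes follow
    Out.push_back(Byte);
  } while (Value != 0);
  return Out;                    // e.g. 624485 -> {0xE5, 0x8E, 0x26}
}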
@@ -1143,7 +1136,7 @@ void ARMTargetELFStreamer::emitLabel(MCSymbol *Symbol) {
Streamer.getAssembler().registerSymbol(*Symbol);
unsigned Type = cast<MCSymbolELF>(Symbol)->getType();
if (Type == ELF::STT_FUNC || Type == ELF::STT_GNU_IFUNC)
- Streamer.EmitThumbFunc(Symbol);
+ Streamer.emitThumbFunc(Symbol);
}
void
@@ -1155,13 +1148,13 @@ void ARMTargetELFStreamer::emitThumbSet(MCSymbol *Symbol, const MCExpr *Value) {
if (const MCSymbolRefExpr *SRE = dyn_cast<MCSymbolRefExpr>(Value)) {
const MCSymbol &Sym = SRE->getSymbol();
if (!Sym.isDefined()) {
- getStreamer().EmitAssignment(Symbol, Value);
+ getStreamer().emitAssignment(Symbol, Value);
return;
}
}
- getStreamer().EmitThumbFunc(Symbol);
- getStreamer().EmitAssignment(Symbol, Value);
+ getStreamer().emitThumbFunc(Symbol);
+ getStreamer().emitAssignment(Symbol, Value);
}
void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
@@ -1170,12 +1163,12 @@ void ARMTargetELFStreamer::emitInst(uint32_t Inst, char Suffix) {
void ARMTargetELFStreamer::reset() { AttributeSection = nullptr; }
-void ARMELFStreamer::FinishImpl() {
+void ARMELFStreamer::finishImpl() {
MCTargetStreamer &TS = *getTargetStreamer();
ARMTargetStreamer &ATS = static_cast<ARMTargetStreamer &>(TS);
ATS.finishAttributeSection();
- MCELFStreamer::FinishImpl();
+ MCELFStreamer::finishImpl();
}
void ARMELFStreamer::reset() {
@@ -1201,7 +1194,7 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
static_cast<const MCSectionELF &>(Fn.getSection());
// Create the name for new section
- StringRef FnSecName(FnSection.getSectionName());
+ StringRef FnSecName(FnSection.getName());
SmallString<128> EHSecName(Prefix);
if (FnSecName != ".text") {
EHSecName += FnSecName;
@@ -1213,13 +1206,13 @@ inline void ARMELFStreamer::SwitchToEHSection(StringRef Prefix,
Flags |= ELF::SHF_GROUP;
MCSectionELF *EHSection = getContext().getELFSection(
EHSecName, Type, Flags, 0, Group, FnSection.getUniqueID(),
- static_cast<const MCSymbolELF *>(&Fn));
+ static_cast<const MCSymbolELF *>(FnSection.getBeginSymbol()));
assert(EHSection && "Failed to get the required EH section");
// Switch to .ARM.extab or .ARM.exidx section
SwitchSection(EHSection);
- EmitCodeAlignment(4);
+ emitCodeAlignment(4);
}
inline void ARMELFStreamer::SwitchToExTabSection(const MCSymbol &FnStart) {
@@ -1258,7 +1251,7 @@ void ARMELFStreamer::EHReset() {
void ARMELFStreamer::emitFnStart() {
assert(FnStart == nullptr);
FnStart = getContext().createTempSymbol();
- EmitLabel(FnStart);
+ emitLabel(FnStart);
}
void ARMELFStreamer::emitFnEnd() {
@@ -1284,17 +1277,17 @@ void ARMELFStreamer::emitFnEnd() {
MCSymbolRefExpr::VK_ARM_PREL31,
getContext());
- EmitValue(FnStartRef, 4);
+ emitValue(FnStartRef, 4);
if (CantUnwind) {
- EmitIntValue(ARM::EHABI::EXIDX_CANTUNWIND, 4);
+ emitInt32(ARM::EHABI::EXIDX_CANTUNWIND);
} else if (ExTab) {
// Emit a reference to the unwind opcodes in the ".ARM.extab" section.
const MCSymbolRefExpr *ExTabEntryRef =
MCSymbolRefExpr::create(ExTab,
MCSymbolRefExpr::VK_ARM_PREL31,
getContext());
- EmitValue(ExTabEntryRef, 4);
+ emitValue(ExTabEntryRef, 4);
} else {
// For the __aeabi_unwind_cpp_pr0, we have to emit the unwind opcodes in
// the second word of exception index table entry. The size of the unwind
@@ -1307,7 +1300,7 @@ void ARMELFStreamer::emitFnEnd() {
Opcodes[1] << 8 |
Opcodes[2] << 16 |
Opcodes[3] << 24;
- EmitIntValue(Intval, Opcodes.size());
+ emitIntValue(Intval, Opcodes.size());
}
// Switch to the section containing FnStart
@@ -1366,7 +1359,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
// Create .ARM.extab label for offset in .ARM.exidx
assert(!ExTab);
ExTab = getContext().createTempSymbol();
- EmitLabel(ExTab);
+ emitLabel(ExTab);
// Emit personality
if (Personality) {
@@ -1375,7 +1368,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
MCSymbolRefExpr::VK_ARM_PREL31,
getContext());
- EmitValue(PersonalityRef, 4);
+ emitValue(PersonalityRef, 4);
}
// Emit unwind opcodes
@@ -1386,7 +1379,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
Opcodes[I + 1] << 8 |
Opcodes[I + 2] << 16 |
Opcodes[I + 3] << 24;
- EmitIntValue(Intval, 4);
+ emitInt32(Intval);
}
// According to ARM EHABI section 9.2, if the __aeabi_unwind_cpp_pr1() or
@@ -1397,7 +1390,7 @@ void ARMELFStreamer::FlushUnwindOpcodes(bool NoHandlerData) {
// In case that the .handlerdata directive is not specified by the
// programmer, we should emit zero to terminate the handler data.
if (NoHandlerData && !Personality)
- EmitIntValue(0, 4);
+ emitInt32(0);
}
void ARMELFStreamer::emitHandlerData() { FlushUnwindOpcodes(false); }
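// Editorial sketch (not part of the patch): the unwind-table hunks above pack
// four opcode bytes into one 32-bit word before handing it to emitInt32 /
// emitIntValue. A minimal standalone illustration of that packing, assuming
// the caller provides at least four bytes (helper name invented here):
#include <cstdint>

static uint32_t packUnwindWordSketch(const uint8_t Opcodes[4]) {
  // Byte 0 lands in the least significant position, matching the
  // Opcodes[0] | Opcodes[1] << 8 | Opcodes[2] << 16 | Opcodes[3] << 24
  // expression used by the streamer.
  return uint32_t(Opcodes[0]) | uint32_t(Opcodes[1]) << 8 |
         uint32_t(Opcodes[2]) << 16 | uint32_t(Opcodes[3]) << 24;
}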
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
index b36106a78b71..744d919f2fd4 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp
@@ -288,7 +288,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
case ARM::t2DSB:
switch (MI->getOperand(0).getImm()) {
default:
- if (!printAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
break;
case 0:
@@ -302,7 +302,7 @@ void ARMInstPrinter::printInst(const MCInst *MI, uint64_t Address,
return;
}
- if (!printAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O))
printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
@@ -1669,15 +1669,6 @@ void ARMInstPrinter::printVPTMask(const MCInst *MI, unsigned OpNum,
}
}
-void ARMInstPrinter::printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O) {
- uint32_t Val = MI->getOperand(OpNum).getImm();
- O << markup("<imm:") << "#0x";
- O.write_hex(Val);
- O << markup(">");
-}
-
void ARMInstPrinter::printMveSaturateOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI,
raw_ostream &O) {
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
index 20f901033395..37cb731ff001 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.h
@@ -32,10 +32,10 @@ public:
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
- virtual bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
- virtual void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ virtual bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ virtual void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI,
raw_ostream &O);
static const char *getRegisterName(unsigned RegNo,
@@ -43,6 +43,10 @@ public:
void printOperand(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ printOperand(MI, OpNum, STI, O);
+ }
void printSORegRegOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
@@ -109,6 +113,12 @@ public:
template <unsigned scale>
void printAdrLabelOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ template <unsigned scale>
+ void printAdrLabelOperand(const MCInst *MI, uint64_t /*Address*/,
+ unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printAdrLabelOperand<scale>(MI, OpNum, STI, O);
+ }
void printThumbS4ImmOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printThumbSRImm(const MCInst *MI, unsigned OpNum,
@@ -206,6 +216,11 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printThumbLdrLabelOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printThumbLdrLabelOperand(const MCInst *MI, uint64_t /*Address*/,
+ unsigned OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ printThumbLdrLabelOperand(MI, OpNum, STI, O);
+ }
void printFBits16(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
void printFBits32(const MCInst *MI, unsigned OpNum,
@@ -260,8 +275,6 @@ public:
const MCSubtargetInfo &STI, raw_ostream &O);
void printMveAddrModeQOperand(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
- void printExpandedImmOperand(const MCInst *MI, unsigned OpNum,
- const MCSubtargetInfo &STI, raw_ostream &O);
void printMveSaturateOp(const MCInst *MI, unsigned OpNum,
const MCSubtargetInfo &STI, raw_ostream &O);
private:
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
index d30d15df3d00..765613cf347d 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCAsmInfo.cpp
@@ -37,8 +37,6 @@ ARMMCAsmInfoDarwin::ARMMCAsmInfoDarwin(const Triple &TheTriple) {
ExceptionsType = (TheTriple.isOSDarwin() && !TheTriple.isWatchABI())
? ExceptionHandling::SjLj
: ExceptionHandling::DwarfCFI;
-
- UseIntegratedAssembler = true;
}
void ARMELFMCAsmInfo::anchor() { }
@@ -73,8 +71,6 @@ ARMELFMCAsmInfo::ARMELFMCAsmInfo(const Triple &TheTriple) {
// foo(plt) instead of foo@plt
UseParensForSymbolVariant = true;
-
- UseIntegratedAssembler = true;
}
void ARMELFMCAsmInfo::setUseIntegratedAssembler(bool Value) {
@@ -116,7 +112,6 @@ ARMCOFFMCAsmInfoGNU::ARMCOFFMCAsmInfoGNU() {
ExceptionsType = ExceptionHandling::DwarfCFI;
UseParensForSymbolVariant = true;
- UseIntegratedAssembler = true;
DwarfRegNumForCFI = false;
// Conditional Thumb 4-byte instructions can have an implicit IT.
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
index 268fe7efd9ce..1cb99534f146 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp
@@ -413,14 +413,6 @@ public:
unsigned getThumbSRImmOpValue(const MCInst &MI, unsigned Op,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
- template <uint8_t shift, bool invert>
- unsigned getExpandedImmOpValue(const MCInst &MI, unsigned Op,
- SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
- static_assert(shift <= 32, "Shift count must be less than or equal to 32.");
- const MCOperand MO = MI.getOperand(Op);
- return (invert ? (MO.getImm() ^ 0xff) : MO.getImm()) >> shift;
- }
unsigned NEONThumb2DataIPostEncoder(const MCInst &MI,
unsigned EncodedValue,
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
index 9f60e70e0e02..05d73ccf6ff2 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp
@@ -63,6 +63,25 @@ static bool getMCRDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
return true;
}
}
+ if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] &&
+ ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) ||
+ (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) {
+ Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating "
+ "point instructions";
+ return true;
+ }
+ return false;
+}
+
+static bool getMRCDeprecationInfo(MCInst &MI, const MCSubtargetInfo &STI,
+ std::string &Info) {
+ if (STI.getFeatureBits()[llvm::ARM::HasV7Ops] &&
+ ((MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 10) ||
+ (MI.getOperand(0).isImm() && MI.getOperand(0).getImm() == 11))) {
+ Info = "since v7, cp10 and cp11 are reserved for advanced SIMD or floating "
+ "point instructions";
+ return true;
+ }
return false;
}
@@ -168,7 +187,7 @@ MCSubtargetInfo *ARM_MC::createARMMCSubtargetInfo(const Triple &TT,
if (!ArchFS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
else
- ArchFS = FS;
+ ArchFS = std::string(FS);
}
return createARMMCSubtargetInfoImpl(TT, CPU, ArchFS);
@@ -200,7 +219,7 @@ static MCAsmInfo *createARMMCAsmInfo(const MCRegisterInfo &MRI,
MAI = new ARMELFMCAsmInfo(TheTriple);
unsigned Reg = MRI.getDwarfRegNum(ARM::SP, true);
- MAI->addInitialFrameState(MCCFIInstruction::createDefCfa(nullptr, Reg, 0));
+ MAI->addInitialFrameState(MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0));
return MAI;
}
@@ -266,7 +285,9 @@ public:
bool evaluateBranch(const MCInst &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const override {
// We only handle PCRel branches for now.
- if (Info->get(Inst.getOpcode()).OpInfo[0].OperandType!=MCOI::OPERAND_PCREL)
+ if (Inst.getNumOperands() == 0 ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType !=
+ MCOI::OPERAND_PCREL)
return false;
int64_t Imm = Inst.getOperand(0).getImm();
@@ -285,8 +306,15 @@ public:
switch (Inst.getOpcode()) {
default:
OpId = 0;
+ if (Inst.getNumOperands() == 0)
+ return false;
break;
+ case ARM::MVE_WLSTP_8:
+ case ARM::MVE_WLSTP_16:
+ case ARM::MVE_WLSTP_32:
+ case ARM::MVE_WLSTP_64:
case ARM::t2WLS:
+ case ARM::MVE_LETP:
case ARM::t2LEUpdate:
OpId = 2;
break;
@@ -316,6 +344,14 @@ static MCInstrAnalysis *createThumbMCInstrAnalysis(const MCInstrInfo *Info) {
return new ThumbMCInstrAnalysis(Info);
}
+bool ARM::isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI) {
+ // Unfortunately we don't have ARMTargetInfo in the disassembler, so we have
+ // to rely on feature bits.
+ if (Coproc >= 8)
+ return false;
+ return STI.getFeatureBits()[ARM::FeatureCoprocCDE0 + Coproc];
+}
+
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeARMTargetMC() {
for (Target *T : {&getTheARMLETarget(), &getTheARMBETarget(),
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
index 9cbbd56225ef..7cfe6881b456 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.h
@@ -107,6 +107,9 @@ inline bool isVpred(OperandType op) {
inline bool isVpred(uint8_t op) {
return isVpred(static_cast<OperandType>(op));
}
+
+bool isCDECoproc(size_t Coproc, const MCSubtargetInfo &STI);
+
} // end namespace ARM
} // End llvm namespace
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
index 7b30a61e8ccb..1fee354cad93 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMTargetStreamer.cpp
@@ -80,7 +80,7 @@ void ARMTargetStreamer::emitInst(uint32_t Inst, char Suffix) {
default:
llvm_unreachable("Invalid Suffix");
}
- getStreamer().EmitBytes(StringRef(Buffer, Size));
+ getStreamer().emitBytes(StringRef(Buffer, Size));
}
// The remaining callbacks should be handled separately by each
@@ -108,7 +108,7 @@ void ARMTargetStreamer::emitIntTextAttribute(unsigned Attribute,
unsigned IntValue,
StringRef StringValue) {}
void ARMTargetStreamer::emitArch(ARM::ArchKind Arch) {}
-void ARMTargetStreamer::emitArchExtension(unsigned ArchExt) {}
+void ARMTargetStreamer::emitArchExtension(uint64_t ArchExt) {}
void ARMTargetStreamer::emitObjectArch(ARM::ArchKind Arch) {}
void ARMTargetStreamer::emitFPU(unsigned FPU) {}
void ARMTargetStreamer::finishAttributeSection() {}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
index a9460b70da56..781627c3c425 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.cpp
@@ -134,7 +134,7 @@ void UnwindOpcodeAssembler::EmitSPOffset(int64_t Offset) {
uint8_t Buff[16];
Buff[0] = ARM::EHABI::UNWIND_OPCODE_INC_VSP_ULEB128;
size_t ULEBSize = encodeULEB128((Offset - 0x204) >> 2, Buff + 1);
- EmitBytes(Buff, ULEBSize + 1);
+ emitBytes(Buff, ULEBSize + 1);
} else if (Offset > 0) {
if (Offset > 0x100) {
EmitInt8(ARM::EHABI::UNWIND_OPCODE_INC_VSP | 0x3fu);
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
index 5fb7307159d1..ec11a78f8a7a 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMUnwindOpAsm.h
@@ -64,7 +64,7 @@ public:
OpBegins.push_back(OpBegins.back() + Opcodes.size());
}
- /// Finalize the unwind opcode sequence for EmitBytes()
+ /// Finalize the unwind opcode sequence for emitBytes()
void Finalize(unsigned &PersonalityIndex,
SmallVectorImpl<uint8_t> &Result);
@@ -80,7 +80,7 @@ private:
OpBegins.push_back(OpBegins.back() + 2);
}
- void EmitBytes(const uint8_t *Opcode, size_t Size) {
+ void emitBytes(const uint8_t *Opcode, size_t Size) {
Ops.insert(Ops.end(), Opcode, Opcode + Size);
OpBegins.push_back(OpBegins.back() + Size);
}
diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
index b3c8146a9bde..e6f649164a29 100644
--- a/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
+++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMWinCOFFStreamer.cpp
@@ -22,18 +22,18 @@ public:
std::unique_ptr<MCObjectWriter> OW)
: MCWinCOFFStreamer(C, std::move(AB), std::move(CE), std::move(OW)) {}
- void EmitThumbFunc(MCSymbol *Symbol) override;
- void FinishImpl() override;
+ void emitThumbFunc(MCSymbol *Symbol) override;
+ void finishImpl() override;
};
-void ARMWinCOFFStreamer::EmitThumbFunc(MCSymbol *Symbol) {
+void ARMWinCOFFStreamer::emitThumbFunc(MCSymbol *Symbol) {
getAssembler().setIsThumbFunc(Symbol);
}
-void ARMWinCOFFStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void ARMWinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
- MCWinCOFFStreamer::FinishImpl();
+ MCWinCOFFStreamer::finishImpl();
}
}
diff --git a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
index 9f64af02e698..4d7ad6cd60cb 100644
--- a/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
+++ b/llvm/lib/Target/ARM/MVEGatherScatterLowering.cpp
@@ -15,6 +15,7 @@
#include "ARM.h"
#include "ARMBaseInstrInfo.h"
#include "ARMSubtarget.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/TargetPassConfig.h"
@@ -37,6 +38,7 @@
#include "llvm/IR/Value.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -67,27 +69,77 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
AU.addRequired<TargetPassConfig>();
+ AU.addRequired<LoopInfoWrapperPass>();
FunctionPass::getAnalysisUsage(AU);
}
private:
+ LoopInfo *LI = nullptr;
+
// Check this is a valid gather with correct alignment
bool isLegalTypeAndAlignment(unsigned NumElements, unsigned ElemSize,
- unsigned Alignment);
+ Align Alignment);
// Check whether Ptr is hidden behind a bitcast and look through it
void lookThroughBitcast(Value *&Ptr);
// Check for a getelementptr and deduce base and offsets from it, on success
// returning the base directly and the offsets indirectly using the Offsets
// argument
- Value *checkGEP(Value *&Offsets, Type *Ty, Value *Ptr, IRBuilder<> Builder);
+ Value *checkGEP(Value *&Offsets, Type *Ty, GetElementPtrInst *GEP,
+ IRBuilder<> &Builder);
+ // Compute the scale of this gather/scatter instruction
+ int computeScale(unsigned GEPElemSize, unsigned MemoryElemSize);
+ // If the value is a constant, or derived from constants via additions
+ // and multiplications, return its numeric value
+ Optional<int64_t> getIfConst(const Value *V);
+ // If Inst is an add instruction, check whether one summand is a
+ // constant. If so, scale this constant and return it together with
+ // the other summand.
+ std::pair<Value *, int64_t> getVarAndConst(Value *Inst, int TypeScale);
- bool lowerGather(IntrinsicInst *I);
+ Value *lowerGather(IntrinsicInst *I);
// Create a gather from a base + vector of offsets
Value *tryCreateMaskedGatherOffset(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> Builder);
+ Instruction *&Root, IRBuilder<> &Builder);
// Create a gather from a vector of pointers
Value *tryCreateMaskedGatherBase(IntrinsicInst *I, Value *Ptr,
- IRBuilder<> Builder);
+ IRBuilder<> &Builder, int64_t Increment = 0);
+ // Create an incrementing gather from a vector of pointers
+ Value *tryCreateMaskedGatherBaseWB(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
+
+ Value *lowerScatter(IntrinsicInst *I);
+ // Create a scatter to a base + vector of offsets
+ Value *tryCreateMaskedScatterOffset(IntrinsicInst *I, Value *Offsets,
+ IRBuilder<> &Builder);
+ // Create a scatter to a vector of pointers
+ Value *tryCreateMaskedScatterBase(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
+ // Create an incrementing scatter from a vector of pointers
+ Value *tryCreateMaskedScatterBaseWB(IntrinsicInst *I, Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment = 0);
+
+ // QI gathers and scatters can increment their offsets on their own if
+ // the increment is a constant value (digit)
+ Value *tryCreateIncrementingGatScat(IntrinsicInst *I, Value *BasePtr,
+ Value *Ptr, GetElementPtrInst *GEP,
+ IRBuilder<> &Builder);
+ // QI gathers/scatters can increment their offsets on their own if the
+ // increment is a constant value (digit) - this creates a writeback QI
+ // gather/scatter
+ Value *tryCreateIncrementingWBGatScat(IntrinsicInst *I, Value *BasePtr,
+ Value *Ptr, unsigned TypeScale,
+ IRBuilder<> &Builder);
+ // Check whether these offsets could be moved out of the loop they're in
+ bool optimiseOffsets(Value *Offsets, BasicBlock *BB, LoopInfo *LI);
+ // Pushes the given add out of the loop
+ void pushOutAdd(PHINode *&Phi, Value *OffsSecondOperand, unsigned StartIndex);
+ // Pushes the given mul out of the loop
+ void pushOutMul(PHINode *&Phi, Value *IncrementPerRound,
+ Value *OffsSecondOperand, unsigned LoopIncrement,
+ IRBuilder<> &Builder);
};
} // end anonymous namespace
@@ -103,102 +155,177 @@ Pass *llvm::createMVEGatherScatterLoweringPass() {
bool MVEGatherScatterLowering::isLegalTypeAndAlignment(unsigned NumElements,
unsigned ElemSize,
- unsigned Alignment) {
- // Do only allow non-extending gathers for now
- if (((NumElements == 4 && ElemSize == 32) ||
- (NumElements == 8 && ElemSize == 16) ||
+ Align Alignment) {
+ if (((NumElements == 4 &&
+ (ElemSize == 32 || ElemSize == 16 || ElemSize == 8)) ||
+ (NumElements == 8 && (ElemSize == 16 || ElemSize == 8)) ||
(NumElements == 16 && ElemSize == 8)) &&
- ElemSize / 8 <= Alignment)
+ Alignment >= ElemSize / 8)
return true;
- LLVM_DEBUG(dbgs() << "masked gathers: instruction does not have valid "
- << "alignment or vector type \n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: instruction does not have "
+ << "valid alignment or vector type \n");
return false;
}
-Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty, Value *Ptr,
- IRBuilder<> Builder) {
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+Value *MVEGatherScatterLowering::checkGEP(Value *&Offsets, Type *Ty,
+ GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
if (!GEP) {
- LLVM_DEBUG(dbgs() << "masked gathers: no getelementpointer found\n");
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: no getelementpointer found\n");
return nullptr;
}
- LLVM_DEBUG(dbgs() << "masked gathers: getelementpointer found. Loading"
- << " from base + vector of offsets\n");
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementpointer found."
+ << " Looking at intrinsic for base + vector of offsets\n");
Value *GEPPtr = GEP->getPointerOperand();
if (GEPPtr->getType()->isVectorTy()) {
- LLVM_DEBUG(dbgs() << "masked gathers: gather from a vector of pointers"
- << " hidden behind a getelementptr currently not"
- << " supported. Expanding.\n");
return nullptr;
}
if (GEP->getNumOperands() != 2) {
- LLVM_DEBUG(dbgs() << "masked gathers: getelementptr with too many"
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: getelementptr with too many"
<< " operands. Expanding.\n");
return nullptr;
}
Offsets = GEP->getOperand(1);
- // SExt offsets inside masked gathers are not permitted by the architecture;
- // we therefore can't fold them
+ // Paranoid check whether the number of parallel lanes is the same
+ assert(cast<FixedVectorType>(Ty)->getNumElements() ==
+ cast<FixedVectorType>(Offsets->getType())->getNumElements());
+ // Only <N x i32> offsets can be integrated into an arm gather, any smaller
+ // type would have to be sign extended by the gep - and arm gathers can only
+ // zero extend. Additionally, the offsets do have to originate from a zext of
+ // a vector with element types smaller than or equal to the type of the
+ // gather we're looking at
+ if (Offsets->getType()->getScalarSizeInBits() != 32)
+ return nullptr;
if (ZExtInst *ZextOffs = dyn_cast<ZExtInst>(Offsets))
Offsets = ZextOffs->getOperand(0);
- Type *OffsType = VectorType::getInteger(cast<VectorType>(Ty));
- // If the offset we found does not have the type the intrinsic expects,
- // i.e., the same type as the gather itself, we need to convert it (only i
- // types) or fall back to expanding the gather
- if (OffsType != Offsets->getType()) {
- if (OffsType->getScalarSizeInBits() >
- Offsets->getType()->getScalarSizeInBits()) {
- LLVM_DEBUG(dbgs() << "masked gathers: extending offsets\n");
- Offsets = Builder.CreateZExt(Offsets, OffsType, "");
- } else {
- LLVM_DEBUG(dbgs() << "masked gathers: no correct offset type. Can't"
- << " create masked gather\n");
+ else if (!(cast<FixedVectorType>(Offsets->getType())->getNumElements() == 4 &&
+ Offsets->getType()->getScalarSizeInBits() == 32))
+ return nullptr;
+
+ if (Ty != Offsets->getType()) {
+ if ((Ty->getScalarSizeInBits() <
+ Offsets->getType()->getScalarSizeInBits())) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: no correct offset type."
+ << " Can't create intrinsic.\n");
return nullptr;
+ } else {
+ Offsets = Builder.CreateZExt(
+ Offsets, VectorType::getInteger(cast<VectorType>(Ty)));
}
}
// If none of the checks failed, return the gep's base pointer
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: found correct offsets\n");
return GEPPtr;
}
void MVEGatherScatterLowering::lookThroughBitcast(Value *&Ptr) {
// Look through bitcast instruction if #elements is the same
if (auto *BitCast = dyn_cast<BitCastInst>(Ptr)) {
- Type *BCTy = BitCast->getType();
- Type *BCSrcTy = BitCast->getOperand(0)->getType();
- if (BCTy->getVectorNumElements() == BCSrcTy->getVectorNumElements()) {
- LLVM_DEBUG(dbgs() << "masked gathers: looking through bitcast\n");
+ auto *BCTy = cast<FixedVectorType>(BitCast->getType());
+ auto *BCSrcTy = cast<FixedVectorType>(BitCast->getOperand(0)->getType());
+ if (BCTy->getNumElements() == BCSrcTy->getNumElements()) {
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: looking through bitcast\n");
Ptr = BitCast->getOperand(0);
}
}
}
-bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
+int MVEGatherScatterLowering::computeScale(unsigned GEPElemSize,
+ unsigned MemoryElemSize) {
+ // This can be a 32bit load/store scaled by 4, a 16bit load/store scaled by 2,
+ // or a 8bit, 16bit or 32bit load/store scaled by 1
+ if (GEPElemSize == 32 && MemoryElemSize == 32)
+ return 2;
+ else if (GEPElemSize == 16 && MemoryElemSize == 16)
+ return 1;
+ else if (GEPElemSize == 8)
+ return 0;
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: incorrect scale. Can't "
+ << "create intrinsic\n");
+ return -1;
+}
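// Editorial sketch (not part of the patch): computeScale returns the shift that
// turns an element index into a byte offset for the MVE intrinsic, or -1 for
// unsupported width combinations. Worked values:
//   computeScale(32, 32) == 2   // i32 gep, i32 access: offset = index << 2
//   computeScale(16, 16) == 1   // i16 gep, i16 access: offset = index << 1
//   computeScale( 8, 32) == 0   // i8 gep: indices are already byte offsets
//   computeScale(32, 16) == -1  // any other mix is rejected
// A tiny illustrative helper (name invented for illustration):
#include <cassert>
#include <cstdint>

static int64_t byteOffsetSketch(int64_t Index, int Scale) {
  assert(Scale >= 0 && "unsupported gep/access width combination");
  return Index << Scale; // Scale 2 -> *4, 1 -> *2, 0 -> *1
}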
+
+Optional<int64_t> MVEGatherScatterLowering::getIfConst(const Value *V) {
+ const Constant *C = dyn_cast<Constant>(V);
+ if (C != nullptr)
+ return Optional<int64_t>{C->getUniqueInteger().getSExtValue()};
+ if (!isa<Instruction>(V))
+ return Optional<int64_t>{};
+
+ const Instruction *I = cast<Instruction>(V);
+ if (I->getOpcode() == Instruction::Add ||
+ I->getOpcode() == Instruction::Mul) {
+ Optional<int64_t> Op0 = getIfConst(I->getOperand(0));
+ Optional<int64_t> Op1 = getIfConst(I->getOperand(1));
+ if (!Op0 || !Op1)
+ return Optional<int64_t>{};
+ if (I->getOpcode() == Instruction::Add)
+ return Optional<int64_t>{Op0.getValue() + Op1.getValue()};
+ if (I->getOpcode() == Instruction::Mul)
+ return Optional<int64_t>{Op0.getValue() * Op1.getValue()};
+ }
+ return Optional<int64_t>{};
+}
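// Editorial sketch (not part of the patch): getIfConst above folds a Value that
// is built purely from integer constants combined by adds and muls. The same
// recursion on a hypothetical toy expression type (all names invented for
// illustration; the real code walks llvm::Value/Instruction instead):
#include <cstdint>
#include <optional>

struct ToyExpr {
  enum Kind { Const, Add, Mul, Other } K;
  int64_t Val = 0;                       // used when K == Const
  const ToyExpr *LHS = nullptr, *RHS = nullptr;
};

static std::optional<int64_t> getIfConstSketch(const ToyExpr *E) {
  if (E->K == ToyExpr::Const)
    return E->Val;
  if (E->K != ToyExpr::Add && E->K != ToyExpr::Mul)
    return std::nullopt;                 // anything else is not foldable
  auto L = getIfConstSketch(E->LHS), R = getIfConstSketch(E->RHS);
  if (!L || !R)
    return std::nullopt;
  return E->K == ToyExpr::Add ? *L + *R : *L * *R;
}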
+
+std::pair<Value *, int64_t>
+MVEGatherScatterLowering::getVarAndConst(Value *Inst, int TypeScale) {
+ std::pair<Value *, int64_t> ReturnFalse =
+ std::pair<Value *, int64_t>(nullptr, 0);
+ // At this point, the instruction we're looking at must be an add or we
+ // bail out
+ Instruction *Add = dyn_cast<Instruction>(Inst);
+ if (Add == nullptr || Add->getOpcode() != Instruction::Add)
+ return ReturnFalse;
+
+ Value *Summand;
+ Optional<int64_t> Const;
+ // Find out which operand the value that is increased is
+ if ((Const = getIfConst(Add->getOperand(0))))
+ Summand = Add->getOperand(1);
+ else if ((Const = getIfConst(Add->getOperand(1))))
+ Summand = Add->getOperand(0);
+ else
+ return ReturnFalse;
+
+ // Check that the constant is small enough for an incrementing gather
+ int64_t Immediate = Const.getValue() << TypeScale;
+ if (Immediate > 512 || Immediate < -512 || Immediate % 4 != 0)
+ return ReturnFalse;
+
+ return std::pair<Value *, int64_t>(Summand, Immediate);
+}
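// Editorial sketch (not part of the patch): the constant split off by
// getVarAndConst has to survive the shift by TypeScale and still fit the
// encoding of an incrementing gather/scatter. A standalone check mirroring the
// bounds used above (helper name invented for illustration):
#include <cstdint>

static bool isValidIncrementSketch(int64_t Const, int TypeScale) {
  int64_t Immediate = Const << TypeScale; // scale to a byte increment
  return Immediate <= 512 && Immediate >= -512 && Immediate % 4 == 0;
}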
+
+Value *MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
using namespace PatternMatch;
LLVM_DEBUG(dbgs() << "masked gathers: checking transform preconditions\n");
// @llvm.masked.gather.*(Ptrs, alignment, Mask, Src0)
// Attempt to turn the masked gather in I into a MVE intrinsic
// Potentially optimising the addressing modes as we do so.
- Type *Ty = I->getType();
+ auto *Ty = cast<FixedVectorType>(I->getType());
Value *Ptr = I->getArgOperand(0);
- unsigned Alignment = cast<ConstantInt>(I->getArgOperand(1))->getZExtValue();
+ Align Alignment = cast<ConstantInt>(I->getArgOperand(1))->getAlignValue();
Value *Mask = I->getArgOperand(2);
Value *PassThru = I->getArgOperand(3);
- if (!isLegalTypeAndAlignment(Ty->getVectorNumElements(),
- Ty->getScalarSizeInBits(), Alignment))
- return false;
+ if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
+ Alignment))
+ return nullptr;
lookThroughBitcast(Ptr);
assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
IRBuilder<> Builder(I->getContext());
Builder.SetInsertPoint(I);
Builder.SetCurrentDebugLocation(I->getDebugLoc());
- Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Builder);
+
+ Instruction *Root = I;
+ Value *Load = tryCreateMaskedGatherOffset(I, Ptr, Root, Builder);
if (!Load)
Load = tryCreateMaskedGatherBase(I, Ptr, Builder);
if (!Load)
- return false;
+ return nullptr;
if (!isa<UndefValue>(PassThru) && !match(PassThru, m_Zero())) {
LLVM_DEBUG(dbgs() << "masked gathers: found non-trivial passthru - "
@@ -206,72 +333,649 @@ bool MVEGatherScatterLowering::lowerGather(IntrinsicInst *I) {
Load = Builder.CreateSelect(Mask, Load, PassThru);
}
+ Root->replaceAllUsesWith(Load);
+ Root->eraseFromParent();
+ if (Root != I)
+ // If this was an extending gather, we need to get rid of the sext/zext
+ // sext/zext as well as of the gather itself
+ I->eraseFromParent();
+
LLVM_DEBUG(dbgs() << "masked gathers: successfully built masked gather\n");
- I->replaceAllUsesWith(Load);
- I->eraseFromParent();
- return true;
+ return Load;
}
-Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(
- IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBase(IntrinsicInst *I,
+ Value *Ptr,
+ IRBuilder<> &Builder,
+ int64_t Increment) {
using namespace PatternMatch;
+ auto *Ty = cast<FixedVectorType>(I->getType());
LLVM_DEBUG(dbgs() << "masked gathers: loading from vector of pointers\n");
- Type *Ty = I->getType();
- if (Ty->getVectorNumElements() != 4)
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
// Can't build an intrinsic for this
return nullptr;
Value *Mask = I->getArgOperand(2);
if (match(Mask, m_One()))
return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base,
{Ty, Ptr->getType()},
- {Ptr, Builder.getInt32(0)});
+ {Ptr, Builder.getInt32(Increment)});
else
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_base_predicated,
{Ty, Ptr->getType(), Mask->getType()},
- {Ptr, Builder.getInt32(0), Mask});
+ {Ptr, Builder.getInt32(Increment), Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedGatherBaseWB(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+ using namespace PatternMatch;
+ auto *Ty = cast<FixedVectorType>(I->getType());
+ LLVM_DEBUG(
+ dbgs()
+ << "masked gathers: loading from vector of pointers with writeback\n");
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+ // Can't build an intrinsic for this
+ return nullptr;
+ Value *Mask = I->getArgOperand(2);
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vldr_gather_base_wb,
+ {Ty, Ptr->getType()},
+ {Ptr, Builder.getInt32(Increment)});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vldr_gather_base_wb_predicated,
+ {Ty, Ptr->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(Increment), Mask});
}
Value *MVEGatherScatterLowering::tryCreateMaskedGatherOffset(
- IntrinsicInst *I, Value *Ptr, IRBuilder<> Builder) {
+ IntrinsicInst *I, Value *Ptr, Instruction *&Root, IRBuilder<> &Builder) {
using namespace PatternMatch;
- Type *Ty = I->getType();
+
+ Type *OriginalTy = I->getType();
+ Type *ResultTy = OriginalTy;
+
+ unsigned Unsigned = 1;
+ // The size of the gather was already checked in isLegalTypeAndAlignment;
+ // if it was not a full vector width an appropriate extend should follow.
+ auto *Extend = Root;
+ if (OriginalTy->getPrimitiveSizeInBits() < 128) {
+ // Only transform gathers with exactly one use
+ if (!I->hasOneUse())
+ return nullptr;
+
+ // The correct root to replace is not the CallInst itself, but the
+ // instruction which extends it
+ Extend = cast<Instruction>(*I->users().begin());
+ if (isa<SExtInst>(Extend)) {
+ Unsigned = 0;
+ } else if (!isa<ZExtInst>(Extend)) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extend needed but not provided. "
+ << "Expanding\n");
+ return nullptr;
+ }
+ LLVM_DEBUG(dbgs() << "masked gathers: found an extending gather\n");
+ ResultTy = Extend->getType();
+ // The final size of the gather must be a full vector width
+ if (ResultTy->getPrimitiveSizeInBits() != 128) {
+ LLVM_DEBUG(dbgs() << "masked gathers: extending from the wrong type. "
+ << "Expanding\n");
+ return nullptr;
+ }
+ }
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
Value *Offsets;
- Value *BasePtr = checkGEP(Offsets, Ty, Ptr, Builder);
+ Value *BasePtr = checkGEP(Offsets, ResultTy, GEP, Builder);
if (!BasePtr)
return nullptr;
+ // Check whether the offset is a constant increment that could be merged into
+ // a QI gather
+ Value *Load = tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
+ if (Load)
+ return Load;
- unsigned Scale;
- int GEPElemSize =
- BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits();
- int ResultElemSize = Ty->getScalarSizeInBits();
- // This can be a 32bit load scaled by 4, a 16bit load scaled by 2, or a
- // 8bit, 16bit or 32bit load scaled by 1
- if (GEPElemSize == 32 && ResultElemSize == 32) {
- Scale = 2;
- } else if (GEPElemSize == 16 && ResultElemSize == 16) {
- Scale = 1;
- } else if (GEPElemSize == 8) {
- Scale = 0;
- } else {
- LLVM_DEBUG(dbgs() << "masked gathers: incorrect scale for load. Can't"
- << " create masked gather\n");
+ int Scale = computeScale(
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ OriginalTy->getScalarSizeInBits());
+ if (Scale == -1)
return nullptr;
- }
+ Root = Extend;
Value *Mask = I->getArgOperand(2);
if (!match(Mask, m_One()))
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset_predicated,
- {Ty, BasePtr->getType(), Offsets->getType(), Mask->getType()},
- {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
- Builder.getInt32(Scale), Builder.getInt32(1), Mask});
+ {ResultTy, BasePtr->getType(), Offsets->getType(), Mask->getType()},
+ {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(Unsigned), Mask});
else
return Builder.CreateIntrinsic(
Intrinsic::arm_mve_vldr_gather_offset,
- {Ty, BasePtr->getType(), Offsets->getType()},
- {BasePtr, Offsets, Builder.getInt32(Ty->getScalarSizeInBits()),
- Builder.getInt32(Scale), Builder.getInt32(1)});
+ {ResultTy, BasePtr->getType(), Offsets->getType()},
+ {BasePtr, Offsets, Builder.getInt32(OriginalTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Builder.getInt32(Unsigned)});
+}
+
+Value *MVEGatherScatterLowering::lowerScatter(IntrinsicInst *I) {
+ using namespace PatternMatch;
+ LLVM_DEBUG(dbgs() << "masked scatters: checking transform preconditions\n");
+
+ // @llvm.masked.scatter.*(data, ptrs, alignment, mask)
+ // Attempt to turn the masked scatter in I into a MVE intrinsic
+ // Potentially optimising the addressing modes as we do so.
+ Value *Input = I->getArgOperand(0);
+ Value *Ptr = I->getArgOperand(1);
+ Align Alignment = cast<ConstantInt>(I->getArgOperand(2))->getAlignValue();
+ auto *Ty = cast<FixedVectorType>(Input->getType());
+
+ if (!isLegalTypeAndAlignment(Ty->getNumElements(), Ty->getScalarSizeInBits(),
+ Alignment))
+ return nullptr;
+
+ lookThroughBitcast(Ptr);
+ assert(Ptr->getType()->isVectorTy() && "Unexpected pointer type");
+
+ IRBuilder<> Builder(I->getContext());
+ Builder.SetInsertPoint(I);
+ Builder.SetCurrentDebugLocation(I->getDebugLoc());
+
+ Value *Store = tryCreateMaskedScatterOffset(I, Ptr, Builder);
+ if (!Store)
+ Store = tryCreateMaskedScatterBase(I, Ptr, Builder);
+ if (!Store)
+ return nullptr;
+
+ LLVM_DEBUG(dbgs() << "masked scatters: successfully built masked scatter\n");
+ I->eraseFromParent();
+ return Store;
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBase(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ auto *Ty = cast<FixedVectorType>(Input->getType());
+ // Only QR variants allow truncating
+ if (!(Ty->getNumElements() == 4 && Ty->getScalarSizeInBits() == 32)) {
+ // Can't build an intrinsic for this
+ return nullptr;
+ }
+ Value *Mask = I->getArgOperand(3);
+ // int_arm_mve_vstr_scatter_base(_predicated) addr, offset, data(, mask)
+ LLVM_DEBUG(dbgs() << "masked scatters: storing to a vector of pointers\n");
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base,
+ {Ptr->getType(), Input->getType()},
+ {Ptr, Builder.getInt32(Increment), Input});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_base_predicated,
+ {Ptr->getType(), Input->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterBaseWB(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder, int64_t Increment) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ auto *Ty = cast<FixedVectorType>(Input->getType());
+ LLVM_DEBUG(
+ dbgs()
+ << "masked scatters: storing to a vector of pointers with writeback\n");
+ if (Ty->getNumElements() != 4 || Ty->getScalarSizeInBits() != 32)
+ // Can't build an intrinsic for this
+ return nullptr;
+ Value *Mask = I->getArgOperand(3);
+ if (match(Mask, m_One()))
+ return Builder.CreateIntrinsic(Intrinsic::arm_mve_vstr_scatter_base_wb,
+ {Ptr->getType(), Input->getType()},
+ {Ptr, Builder.getInt32(Increment), Input});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_base_wb_predicated,
+ {Ptr->getType(), Input->getType(), Mask->getType()},
+ {Ptr, Builder.getInt32(Increment), Input, Mask});
+}
+
+Value *MVEGatherScatterLowering::tryCreateMaskedScatterOffset(
+ IntrinsicInst *I, Value *Ptr, IRBuilder<> &Builder) {
+ using namespace PatternMatch;
+ Value *Input = I->getArgOperand(0);
+ Value *Mask = I->getArgOperand(3);
+ Type *InputTy = Input->getType();
+ Type *MemoryTy = InputTy;
+ LLVM_DEBUG(dbgs() << "masked scatters: getelementpointer found. Storing"
+ << " to base + vector of offsets\n");
+ // If the input has been truncated, try to integrate that trunc into the
+ // scatter instruction (we don't care about alignment here)
+ if (TruncInst *Trunc = dyn_cast<TruncInst>(Input)) {
+ Value *PreTrunc = Trunc->getOperand(0);
+ Type *PreTruncTy = PreTrunc->getType();
+ if (PreTruncTy->getPrimitiveSizeInBits() == 128) {
+ Input = PreTrunc;
+ InputTy = PreTruncTy;
+ }
+ }
+ if (InputTy->getPrimitiveSizeInBits() != 128) {
+ LLVM_DEBUG(
+ dbgs() << "masked scatters: cannot create scatters for non-standard"
+ << " input types. Expanding.\n");
+ return nullptr;
+ }
+
+ GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ Value *Offsets;
+ Value *BasePtr = checkGEP(Offsets, InputTy, GEP, Builder);
+ if (!BasePtr)
+ return nullptr;
+ // Check whether the offset is a constant increment that could be merged into
+ // a QI gather
+ Value *Store =
+ tryCreateIncrementingGatScat(I, BasePtr, Offsets, GEP, Builder);
+ if (Store)
+ return Store;
+ int Scale = computeScale(
+ BasePtr->getType()->getPointerElementType()->getPrimitiveSizeInBits(),
+ MemoryTy->getScalarSizeInBits());
+ if (Scale == -1)
+ return nullptr;
+
+ if (!match(Mask, m_One()))
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_offset_predicated,
+ {BasePtr->getType(), Offsets->getType(), Input->getType(),
+ Mask->getType()},
+ {BasePtr, Offsets, Input,
+ Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale), Mask});
+ else
+ return Builder.CreateIntrinsic(
+ Intrinsic::arm_mve_vstr_scatter_offset,
+ {BasePtr->getType(), Offsets->getType(), Input->getType()},
+ {BasePtr, Offsets, Input,
+ Builder.getInt32(MemoryTy->getScalarSizeInBits()),
+ Builder.getInt32(Scale)});
+}
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingGatScat(
+ IntrinsicInst *I, Value *BasePtr, Value *Offsets, GetElementPtrInst *GEP,
+ IRBuilder<> &Builder) {
+ FixedVectorType *Ty;
+ if (I->getIntrinsicID() == Intrinsic::masked_gather)
+ Ty = cast<FixedVectorType>(I->getType());
+ else
+ Ty = cast<FixedVectorType>(I->getArgOperand(0)->getType());
+ // Incrementing gathers only exist for v4i32
+ if (Ty->getNumElements() != 4 ||
+ Ty->getScalarSizeInBits() != 32)
+ return nullptr;
+ Loop *L = LI->getLoopFor(I->getParent());
+ if (L == nullptr)
+ // Incrementing gathers are not beneficial outside of a loop
+ return nullptr;
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+ "wb gather/scatter\n");
+
+ // The gep was in charge of making sure the offsets are scaled correctly
+ // - calculate that factor so it can be applied by hand
+ DataLayout DT = I->getParent()->getParent()->getParent()->getDataLayout();
+ int TypeScale =
+ computeScale(DT.getTypeSizeInBits(GEP->getOperand(0)->getType()),
+ DT.getTypeSizeInBits(GEP->getType()) /
+ cast<FixedVectorType>(GEP->getType())->getNumElements());
+ if (TypeScale == -1)
+ return nullptr;
+
+ if (GEP->hasOneUse()) {
+ // Only in this case do we want to build a wb gather, because the wb will
+ // change the phi which does affect other users of the gep (which will still
+ // be using the phi in the old way)
+ Value *Load =
+ tryCreateIncrementingWBGatScat(I, BasePtr, Offsets, TypeScale, Builder);
+ if (Load != nullptr)
+ return Load;
+ }
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to build incrementing "
+ "non-wb gather/scatter\n");
+
+ std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
+ if (Add.first == nullptr)
+ return nullptr;
+ Value *OffsetsIncoming = Add.first;
+ int64_t Immediate = Add.second;
+
+ // Make sure the offsets are scaled correctly
+ Instruction *ScaledOffsets = BinaryOperator::Create(
+ Instruction::Shl, OffsetsIncoming,
+ Builder.CreateVectorSplat(Ty->getNumElements(), Builder.getInt32(TypeScale)),
+ "ScaledIndex", I);
+ // Add the base to the offsets
+ OffsetsIncoming = BinaryOperator::Create(
+ Instruction::Add, ScaledOffsets,
+ Builder.CreateVectorSplat(
+ Ty->getNumElements(),
+ Builder.CreatePtrToInt(
+ BasePtr,
+ cast<VectorType>(ScaledOffsets->getType())->getElementType())),
+ "StartIndex", I);
+
+ if (I->getIntrinsicID() == Intrinsic::masked_gather)
+ return cast<IntrinsicInst>(
+ tryCreateMaskedGatherBase(I, OffsetsIncoming, Builder, Immediate));
+ else
+ return cast<IntrinsicInst>(
+ tryCreateMaskedScatterBase(I, OffsetsIncoming, Builder, Immediate));
+}
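// Editorial sketch (not part of the patch): for the non-writeback form above,
// the base pointer is folded into the offsets once, so each lane's start index
// is base + (variable_offset << TypeScale), and the constant part (Immediate)
// is carried by the QI gather/scatter itself as a per-lane byte offset; the
// writeback variant additionally updates the pointer vector each iteration.
// Scalar picture, 4 lanes and all names assumed for illustration:
#include <cstdint>

static void buildStartIndexSketch(uint32_t Base, const int32_t Offsets[4],
                                  int TypeScale, uint32_t StartIndex[4]) {
  for (int Lane = 0; Lane < 4; ++Lane)
    StartIndex[Lane] = Base + (uint32_t(Offsets[Lane]) << TypeScale);
  // The intrinsic then applies Immediate on top of every lane, so the loop no
  // longer has to redo this address arithmetic each round.
}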
+
+Value *MVEGatherScatterLowering::tryCreateIncrementingWBGatScat(
+ IntrinsicInst *I, Value *BasePtr, Value *Offsets, unsigned TypeScale,
+ IRBuilder<> &Builder) {
+ // Check whether this gather's offset is incremented by a constant - if so,
+ // and the load is of the right type, we can merge this into a QI gather
+ Loop *L = LI->getLoopFor(I->getParent());
+ // Offsets that are worth merging into this instruction will be incremented
+ // by a constant, thus we're looking for an add of a phi and a constant
+ PHINode *Phi = dyn_cast<PHINode>(Offsets);
+ if (Phi == nullptr || Phi->getNumIncomingValues() != 2 ||
+ Phi->getParent() != L->getHeader() || Phi->getNumUses() != 2)
+ // No phi means no IV to write back to; if there is a phi, we expect it
+ // to have exactly two incoming values; the only phis we are interested in
+ // will be loop IV's and have exactly two uses, one in their increment and
+ // one in the gather's gep
+ return nullptr;
+
+ unsigned IncrementIndex =
+ Phi->getIncomingBlock(0) == L->getLoopLatch() ? 0 : 1;
+ // Look through the phi to the phi increment
+ Offsets = Phi->getIncomingValue(IncrementIndex);
+
+ std::pair<Value *, int64_t> Add = getVarAndConst(Offsets, TypeScale);
+ if (Add.first == nullptr)
+ return nullptr;
+ Value *OffsetsIncoming = Add.first;
+ int64_t Immediate = Add.second;
+ if (OffsetsIncoming != Phi)
+ // Then the increment we are looking at is not an increment of the
+ // induction variable, and we don't want to do a writeback
+ return nullptr;
+
+ Builder.SetInsertPoint(&Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ unsigned NumElems =
+ cast<FixedVectorType>(OffsetsIncoming->getType())->getNumElements();
+
+ // Make sure the offsets are scaled correctly
+ Instruction *ScaledOffsets = BinaryOperator::Create(
+ Instruction::Shl, Phi->getIncomingValue(1 - IncrementIndex),
+ Builder.CreateVectorSplat(NumElems, Builder.getInt32(TypeScale)),
+ "ScaledIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ // Add the base to the offsets
+ OffsetsIncoming = BinaryOperator::Create(
+ Instruction::Add, ScaledOffsets,
+ Builder.CreateVectorSplat(
+ NumElems,
+ Builder.CreatePtrToInt(
+ BasePtr,
+ cast<VectorType>(ScaledOffsets->getType())->getElementType())),
+ "StartIndex", &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ // The gather is pre-incrementing
+ OffsetsIncoming = BinaryOperator::Create(
+ Instruction::Sub, OffsetsIncoming,
+ Builder.CreateVectorSplat(NumElems, Builder.getInt32(Immediate)),
+ "PreIncrementStartIndex",
+ &Phi->getIncomingBlock(1 - IncrementIndex)->back());
+ Phi->setIncomingValue(1 - IncrementIndex, OffsetsIncoming);
+
+ Builder.SetInsertPoint(I);
+
+ Value *EndResult;
+ Value *NewInduction;
+ if (I->getIntrinsicID() == Intrinsic::masked_gather) {
+ // Build the incrementing gather
+ Value *Load = tryCreateMaskedGatherBaseWB(I, Phi, Builder, Immediate);
+ // One value to be handed to whoever uses the gather, one is the loop
+ // increment
+ EndResult = Builder.CreateExtractValue(Load, 0, "Gather");
+ NewInduction = Builder.CreateExtractValue(Load, 1, "GatherIncrement");
+ } else {
+ // Build the incrementing scatter
+ NewInduction = tryCreateMaskedScatterBaseWB(I, Phi, Builder, Immediate);
+ EndResult = NewInduction;
+ }
+ Instruction *AddInst = cast<Instruction>(Offsets);
+ AddInst->replaceAllUsesWith(NewInduction);
+ AddInst->eraseFromParent();
+ Phi->setIncomingValue(IncrementIndex, NewInduction);
+
+ return EndResult;
+}
+
+void MVEGatherScatterLowering::pushOutAdd(PHINode *&Phi,
+ Value *OffsSecondOperand,
+ unsigned StartIndex) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising add instruction\n");
+ Instruction *InsertionPoint =
+ &cast<Instruction>(Phi->getIncomingBlock(StartIndex)->back());
+ // Initialize the phi with a vector that contains a sum of the constants
+ Instruction *NewIndex = BinaryOperator::Create(
+ Instruction::Add, Phi->getIncomingValue(StartIndex), OffsSecondOperand,
+ "PushedOutAdd", InsertionPoint);
+ unsigned IncrementIndex = StartIndex == 0 ? 1 : 0;
+
+ // Order such that start index comes first (this reduces mov's)
+ Phi->addIncoming(NewIndex, Phi->getIncomingBlock(StartIndex));
+ Phi->addIncoming(Phi->getIncomingValue(IncrementIndex),
+ Phi->getIncomingBlock(IncrementIndex));
+ Phi->removeIncomingValue(IncrementIndex);
+ Phi->removeIncomingValue(StartIndex);
+}
+
+void MVEGatherScatterLowering::pushOutMul(PHINode *&Phi,
+ Value *IncrementPerRound,
+ Value *OffsSecondOperand,
+ unsigned LoopIncrement,
+ IRBuilder<> &Builder) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: optimising mul instruction\n");
+
+ // Create a new scalar add outside of the loop and transform it to a splat
+ // by which loop variable can be incremented
+ Instruction *InsertionPoint = &cast<Instruction>(
+ Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1)->back());
+
+ // Create a new index
+ Value *StartIndex = BinaryOperator::Create(
+ Instruction::Mul, Phi->getIncomingValue(LoopIncrement == 1 ? 0 : 1),
+ OffsSecondOperand, "PushedOutMul", InsertionPoint);
+
+ Instruction *Product =
+ BinaryOperator::Create(Instruction::Mul, IncrementPerRound,
+ OffsSecondOperand, "Product", InsertionPoint);
+ // Increment NewIndex by Product instead of the multiplication
+ Instruction *NewIncrement = BinaryOperator::Create(
+ Instruction::Add, Phi, Product, "IncrementPushedOutMul",
+ cast<Instruction>(Phi->getIncomingBlock(LoopIncrement)->back())
+ .getPrevNode());
+
+ Phi->addIncoming(StartIndex,
+ Phi->getIncomingBlock(LoopIncrement == 1 ? 0 : 1));
+ Phi->addIncoming(NewIncrement, Phi->getIncomingBlock(LoopIncrement));
+ Phi->removeIncomingValue((unsigned)0);
+ Phi->removeIncomingValue((unsigned)0);
+ return;
+}
+
+// Check whether all uses of this instruction are as offsets of
+// gathers/scatters, or simple arithmetic that is only used by
+// gathers/scatters
+static bool hasAllGatScatUsers(Instruction *I) {
+ if (I->hasNUses(0)) {
+ return false;
+ }
+ bool Gatscat = true;
+ for (User *U : I->users()) {
+ if (!isa<Instruction>(U))
+ return false;
+ if (isa<GetElementPtrInst>(U) ||
+ isGatherScatter(dyn_cast<IntrinsicInst>(U))) {
+ return Gatscat;
+ } else {
+ unsigned OpCode = cast<Instruction>(U)->getOpcode();
+ if ((OpCode == Instruction::Add || OpCode == Instruction::Mul) &&
+ hasAllGatScatUsers(cast<Instruction>(U))) {
+ continue;
+ }
+ return false;
+ }
+ }
+ return Gatscat;
+}
+
+bool MVEGatherScatterLowering::optimiseOffsets(Value *Offsets, BasicBlock *BB,
+ LoopInfo *LI) {
+ LLVM_DEBUG(dbgs() << "masked gathers/scatters: trying to optimize\n");
+ // Optimise the addresses of gathers/scatters by moving invariant
+ // calculations out of the loop
+ if (!isa<Instruction>(Offsets))
+ return false;
+ Instruction *Offs = cast<Instruction>(Offsets);
+ if (Offs->getOpcode() != Instruction::Add &&
+ Offs->getOpcode() != Instruction::Mul)
+ return false;
+ Loop *L = LI->getLoopFor(BB);
+ if (L == nullptr)
+ return false;
+ if (!Offs->hasOneUse()) {
+ if (!hasAllGatScatUsers(Offs))
+ return false;
+ }
+
+ // Find out which, if any, operand of the instruction
+ // is a phi node
+ PHINode *Phi;
+ int OffsSecondOp;
+ if (isa<PHINode>(Offs->getOperand(0))) {
+ Phi = cast<PHINode>(Offs->getOperand(0));
+ OffsSecondOp = 1;
+ } else if (isa<PHINode>(Offs->getOperand(1))) {
+ Phi = cast<PHINode>(Offs->getOperand(1));
+ OffsSecondOp = 0;
+ } else {
+ bool Changed = true;
+ if (isa<Instruction>(Offs->getOperand(0)) &&
+ L->contains(cast<Instruction>(Offs->getOperand(0))))
+ Changed |= optimiseOffsets(Offs->getOperand(0), BB, LI);
+ if (isa<Instruction>(Offs->getOperand(1)) &&
+ L->contains(cast<Instruction>(Offs->getOperand(1))))
+ Changed |= optimiseOffsets(Offs->getOperand(1), BB, LI);
+ if (!Changed) {
+ return false;
+ } else {
+ if (isa<PHINode>(Offs->getOperand(0))) {
+ Phi = cast<PHINode>(Offs->getOperand(0));
+ OffsSecondOp = 1;
+ } else if (isa<PHINode>(Offs->getOperand(1))) {
+ Phi = cast<PHINode>(Offs->getOperand(1));
+ OffsSecondOp = 0;
+ } else {
+ return false;
+ }
+ }
+ }
+  // The phi node we want to operate on must be in the loop header and must
+  // have exactly two incoming values
+ if (Phi->getParent() != L->getHeader() ||
+ Phi->getNumIncomingValues() != 2)
+ return false;
+
+ // The phi must be an induction variable
+ Instruction *Op;
+ int IncrementingBlock = -1;
+
+ for (int i = 0; i < 2; i++)
+ if ((Op = dyn_cast<Instruction>(Phi->getIncomingValue(i))) != nullptr)
+ if (Op->getOpcode() == Instruction::Add &&
+ (Op->getOperand(0) == Phi || Op->getOperand(1) == Phi))
+ IncrementingBlock = i;
+ if (IncrementingBlock == -1)
+ return false;
+
+ Instruction *IncInstruction =
+ cast<Instruction>(Phi->getIncomingValue(IncrementingBlock));
+
+ // If the phi is not used by anything else, we can just adapt it when
+ // replacing the instruction; if it is, we'll have to duplicate it
+ PHINode *NewPhi;
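+  // The increment per round is whichever operand of the incrementing
+  // instruction is not the phi itself.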
+ Value *IncrementPerRound = IncInstruction->getOperand(
+ (IncInstruction->getOperand(0) == Phi) ? 1 : 0);
+
+ // Get the value that is added to/multiplied with the phi
+ Value *OffsSecondOperand = Offs->getOperand(OffsSecondOp);
+
+ if (IncrementPerRound->getType() != OffsSecondOperand->getType())
+ // Something has gone wrong, abort
+ return false;
+
+ // Only proceed if the increment per round is a constant or an instruction
+ // which does not originate from within the loop
+ if (!isa<Constant>(IncrementPerRound) &&
+ !(isa<Instruction>(IncrementPerRound) &&
+ !L->contains(cast<Instruction>(IncrementPerRound))))
+ return false;
+
+ if (Phi->getNumUses() == 2) {
+ // No other users -> reuse existing phi (One user is the instruction
+ // we're looking at, the other is the phi increment)
+ if (IncInstruction->getNumUses() != 1) {
+ // If the incrementing instruction does have more users than
+ // our phi, we need to copy it
+ IncInstruction = BinaryOperator::Create(
+ Instruction::BinaryOps(IncInstruction->getOpcode()), Phi,
+ IncrementPerRound, "LoopIncrement", IncInstruction);
+ Phi->setIncomingValue(IncrementingBlock, IncInstruction);
+ }
+ NewPhi = Phi;
+ } else {
+ // There are other users -> create a new phi
+ NewPhi = PHINode::Create(Phi->getType(), 0, "NewPhi", Phi);
+ std::vector<Value *> Increases;
+ // Copy the incoming values of the old phi
+ NewPhi->addIncoming(Phi->getIncomingValue(IncrementingBlock == 1 ? 0 : 1),
+ Phi->getIncomingBlock(IncrementingBlock == 1 ? 0 : 1));
+ IncInstruction = BinaryOperator::Create(
+ Instruction::BinaryOps(IncInstruction->getOpcode()), NewPhi,
+ IncrementPerRound, "LoopIncrement", IncInstruction);
+ NewPhi->addIncoming(IncInstruction,
+ Phi->getIncomingBlock(IncrementingBlock));
+ IncrementingBlock = 1;
+ }
+
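+  // Fold the loop-invariant operand into the phi: for an add only the start
+  // value changes, for a mul both the start value and the increment per round
+  // are scaled.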
+ IRBuilder<> Builder(BB->getContext());
+ Builder.SetInsertPoint(Phi);
+ Builder.SetCurrentDebugLocation(Offs->getDebugLoc());
+
+ switch (Offs->getOpcode()) {
+ case Instruction::Add:
+ pushOutAdd(NewPhi, OffsSecondOperand, IncrementingBlock == 1 ? 0 : 1);
+ break;
+ case Instruction::Mul:
+ pushOutMul(NewPhi, IncrementPerRound, OffsSecondOperand, IncrementingBlock,
+ Builder);
+ break;
+ default:
+ return false;
+ }
+ LLVM_DEBUG(
+ dbgs() << "masked gathers/scatters: simplified loop variable add/mul\n");
+
+ // The instruction has now been "absorbed" into the phi value
+ Offs->replaceAllUsesWith(NewPhi);
+ if (Offs->hasNUses(0))
+ Offs->eraseFromParent();
+ // Clean up the old increment in case it's unused because we built a new
+ // one
+ if (IncInstruction->hasNUses(0))
+ IncInstruction->eraseFromParent();
+
+ return true;
}
bool MVEGatherScatterLowering::runOnFunction(Function &F) {
@@ -282,20 +986,51 @@ bool MVEGatherScatterLowering::runOnFunction(Function &F) {
auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
if (!ST->hasMVEIntegerOps())
return false;
+ LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
SmallVector<IntrinsicInst *, 4> Gathers;
+ SmallVector<IntrinsicInst *, 4> Scatters;
+
+ bool Changed = false;
+
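+  // Collect the gather/scatter intrinsics and, where they address memory
+  // through a getelementptr, first try to optimise that GEP's offset
+  // computation.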
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&I);
- if (II && II->getIntrinsicID() == Intrinsic::masked_gather)
+ if (II && II->getIntrinsicID() == Intrinsic::masked_gather) {
Gathers.push_back(II);
+ if (isa<GetElementPtrInst>(II->getArgOperand(0)))
+ Changed |= optimiseOffsets(
+ cast<Instruction>(II->getArgOperand(0))->getOperand(1),
+ II->getParent(), LI);
+ } else if (II && II->getIntrinsicID() == Intrinsic::masked_scatter) {
+ Scatters.push_back(II);
+ if (isa<GetElementPtrInst>(II->getArgOperand(1)))
+ Changed |= optimiseOffsets(
+ cast<Instruction>(II->getArgOperand(1))->getOperand(1),
+ II->getParent(), LI);
+ }
}
}
- if (Gathers.empty())
- return false;
+ for (unsigned i = 0; i < Gathers.size(); i++) {
+ IntrinsicInst *I = Gathers[i];
+ Value *L = lowerGather(I);
+ if (L == nullptr)
+ continue;
- for (IntrinsicInst *I : Gathers)
- lowerGather(I);
+ // Get rid of any now dead instructions
+ SimplifyInstructionsInBlock(cast<Instruction>(L)->getParent());
+ Changed = true;
+ }
- return true;
+ for (unsigned i = 0; i < Scatters.size(); i++) {
+ IntrinsicInst *I = Scatters[i];
+ Value *S = lowerScatter(I);
+ if (S == nullptr)
+ continue;
+
+ // Get rid of any now dead instructions
+ SimplifyInstructionsInBlock(cast<Instruction>(S)->getParent());
+ Changed = true;
+ }
+ return Changed;
}
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 038c68739cdf..5bf3522ab2e6 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -1,4 +1,4 @@
-//===- MVETailPredication.cpp - MVE Tail Predication ----------------------===//
+//===- MVETailPredication.cpp - MVE Tail Predication ------------*- C++ -*-===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -8,8 +8,17 @@
//
/// \file
/// Armv8.1m introduced MVE, M-Profile Vector Extension, and low-overhead
-/// branches to help accelerate DSP applications. These two extensions can be
-/// combined to provide implicit vector predication within a low-overhead loop.
+/// branches to help accelerate DSP applications. These two extensions,
+/// combined with a new form of predication called tail-predication, can be used
+/// to provide implicit vector predication within a low-overhead loop.
+/// This is implicit because the predicate of active/inactive lanes is
+/// calculated by hardware, and thus does not need to be explicitly passed
+/// to vector instructions. The instructions responsible for this are the
+/// DLSTP and WLSTP instructions, which set up a tail-predicated loop and the
+/// total number of data elements processed by the loop. The loop-end LETP
+/// instruction is responsible for decrementing the number of remaining
+/// elements to be processed and for generating the mask of active lanes.
+///
/// The HardwareLoops pass inserts intrinsics identifying loops that the
/// backend will attempt to convert into a low-overhead loop. The vectorizer is
/// responsible for generating a vectorized loop in which the lanes are
@@ -21,36 +30,62 @@
/// - A loop containing multiple VCPT instructions, predicating multiple VPT
/// blocks of instructions operating on different vector types.
///
-/// This pass inserts the inserts the VCTP intrinsic to represent the effect of
-/// tail predication. This will be picked up by the ARM Low-overhead loop pass,
-/// which performs the final transformation to a DLSTP or WLSTP tail-predicated
-/// loop.
+/// This pass:
+/// 1) Checks if the predicates of the masked load/store instructions are
+/// generated by the @llvm.get.active.lane.mask() intrinsic. This intrinsic
+/// consumes the Backedge Taken Count (BTC) of the scalar loop as its second
+/// argument, which we extract to set up the number of elements processed by
+/// the loop.
+/// 2) The @llvm.get.active.lane.mask() intrinsic is then replaced by the MVE
+/// target-specific VCTP intrinsic to represent the effect of tail
+/// predication.
+/// This will be picked up by the ARM Low-overhead loop pass, which performs
+/// the final transformation to a DLSTP or WLSTP tail-predicated loop.
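+///
+/// As an illustrative sketch (simplified IR, not from an actual test), a
+/// predicate of the form
+///
+///   %pred = call <4 x i1>
+///     @llvm.get.active.lane.mask.v4i1.i32(i32 %index, i32 %btc)
+///
+/// that feeds the masked loads/stores is replaced by
+///
+///   %pred = call <4 x i1> @llvm.arm.mve.vctp32(i32 %elements.remaining)
+///
+/// where %elements.remaining is a phi counting down the number of elements
+/// still to be processed, decremented by the vector width every iteration.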
#include "ARM.h"
#include "ARMSubtarget.h"
+#include "ARMTargetTransformInfo.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
#define DEBUG_TYPE "mve-tail-predication"
#define DESC "Transform predicated vector loops to use MVE tail predication"
-cl::opt<bool>
-DisableTailPredication("disable-mve-tail-predication", cl::Hidden,
- cl::init(true),
- cl::desc("Disable MVE Tail Predication"));
+cl::opt<TailPredication::Mode> EnableTailPredication(
+ "tail-predication", cl::desc("MVE tail-predication options"),
+ cl::init(TailPredication::Disabled),
+ cl::values(clEnumValN(TailPredication::Disabled, "disabled",
+ "Don't tail-predicate loops"),
+ clEnumValN(TailPredication::EnabledNoReductions,
+ "enabled-no-reductions",
+ "Enable tail-predication, but not for reduction loops"),
+ clEnumValN(TailPredication::Enabled,
+ "enabled",
+ "Enable tail-predication, including reduction loops"),
+ clEnumValN(TailPredication::ForceEnabledNoReductions,
+ "force-enabled-no-reductions",
+ "Enable tail-predication, but not for reduction loops, "
+ "and force this which might be unsafe"),
+ clEnumValN(TailPredication::ForceEnabled,
+ "force-enabled",
+ "Enable tail-predication, including reduction loops, "
+ "and force this which might be unsafe")));
+
+
namespace {
class MVETailPredication : public LoopPass {
@@ -58,6 +93,7 @@ class MVETailPredication : public LoopPass {
Loop *L = nullptr;
ScalarEvolution *SE = nullptr;
TargetTransformInfo *TTI = nullptr;
+ const ARMSubtarget *ST = nullptr;
public:
static char ID;
@@ -76,7 +112,6 @@ public:
bool runOnLoop(Loop *L, LPPassManager&) override;
private:
-
/// Perform the relevant checks on the loop and convert if possible.
bool TryConvert(Value *TripCount);
@@ -84,19 +119,21 @@ private:
/// load/stores.
bool IsPredicatedVectorLoop();
- /// Compute a value for the total number of elements that the predicated
- /// loop will process.
- Value *ComputeElements(Value *TripCount, VectorType *VecTy);
-
- /// Is the icmp that generates an i1 vector, based upon a loop counter
- /// and a limit that is defined outside the loop.
- bool isTailPredicate(Instruction *Predicate, Value *NumElements);
+ /// Perform checks on the arguments of @llvm.get.active.lane.mask
+  /// intrinsic: check that the first is a loop induction variable, and that
+  /// no overflow can occur in the expression that uses the second, the
+  /// backedge-taken count.
+ bool IsSafeActiveMask(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+ FixedVectorType *VecTy);
/// Insert the intrinsic to represent the effect of tail predication.
- void InsertVCTPIntrinsic(Instruction *Predicate,
- DenseMap<Instruction*, Instruction*> &NewPredicates,
- VectorType *VecTy,
- Value *NumElements);
+ void InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask, Value *TripCount,
+ FixedVectorType *VecTy);
+
+ /// Rematerialize the iteration count in exit blocks, which enables
+ /// ARMLowOverheadLoops to better optimise away loop update statements inside
+ /// hardware-loops.
+ void RematerializeIterCount();
};
} // end namespace
@@ -121,13 +158,14 @@ static bool IsMasked(Instruction *I) {
}
bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
- if (skipLoop(L) || DisableTailPredication)
+ if (skipLoop(L) || !EnableTailPredication)
return false;
+ MaskedInsts.clear();
Function &F = *L->getHeader()->getParent();
auto &TPC = getAnalysis<TargetPassConfig>();
auto &TM = TPC.getTM<TargetMachine>();
- auto *ST = &TM.getSubtarget<ARMSubtarget>(F);
+ ST = &TM.getSubtarget<ARMSubtarget>(F);
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
this->L = L;
@@ -185,125 +223,59 @@ bool MVETailPredication::runOnLoop(Loop *L, LPPassManager&) {
LLVM_DEBUG(dbgs() << "ARM TP: Running on Loop: " << *L << *Setup << "\n"
<< *Decrement << "\n");
- return TryConvert(Setup->getArgOperand(0));
-}
-bool MVETailPredication::isTailPredicate(Instruction *I, Value *NumElements) {
- // Look for the following:
-
- // %trip.count.minus.1 = add i32 %N, -1
- // %broadcast.splatinsert10 = insertelement <4 x i32> undef,
- // i32 %trip.count.minus.1, i32 0
- // %broadcast.splat11 = shufflevector <4 x i32> %broadcast.splatinsert10,
- // <4 x i32> undef,
- // <4 x i32> zeroinitializer
- // ...
- // ...
- // %index = phi i32
- // %broadcast.splatinsert = insertelement <4 x i32> undef, i32 %index, i32 0
- // %broadcast.splat = shufflevector <4 x i32> %broadcast.splatinsert,
- // <4 x i32> undef,
- // <4 x i32> zeroinitializer
- // %induction = add <4 x i32> %broadcast.splat, <i32 0, i32 1, i32 2, i32 3>
- // %pred = icmp ule <4 x i32> %induction, %broadcast.splat11
-
- // And return whether V == %pred.
-
- using namespace PatternMatch;
-
- CmpInst::Predicate Pred;
- Instruction *Shuffle = nullptr;
- Instruction *Induction = nullptr;
-
- // The vector icmp
- if (!match(I, m_ICmp(Pred, m_Instruction(Induction),
- m_Instruction(Shuffle))) ||
- Pred != ICmpInst::ICMP_ULE)
- return false;
-
- // First find the stuff outside the loop which is setting up the limit
- // vector....
- // The invariant shuffle that broadcast the limit into a vector.
- Instruction *Insert = nullptr;
- if (!match(Shuffle, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
- m_Zero())))
- return false;
-
- // Insert the limit into a vector.
- Instruction *BECount = nullptr;
- if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(BECount),
- m_Zero())))
- return false;
-
- // The limit calculation, backedge count.
- Value *TripCount = nullptr;
- if (!match(BECount, m_Add(m_Value(TripCount), m_AllOnes())))
- return false;
-
- if (TripCount != NumElements || !L->isLoopInvariant(BECount))
- return false;
-
- // Now back to searching inside the loop body...
- // Find the add with takes the index iv and adds a constant vector to it.
- Instruction *BroadcastSplat = nullptr;
- Constant *Const = nullptr;
- if (!match(Induction, m_Add(m_Instruction(BroadcastSplat),
- m_Constant(Const))))
- return false;
-
- // Check that we're adding <0, 1, 2, 3...
- if (auto *CDS = dyn_cast<ConstantDataSequential>(Const)) {
- for (unsigned i = 0; i < CDS->getNumElements(); ++i) {
- if (CDS->getElementAsInteger(i) != i)
- return false;
- }
- } else
- return false;
-
- // The shuffle which broadcasts the index iv into a vector.
- if (!match(BroadcastSplat, m_ShuffleVector(m_Instruction(Insert), m_Undef(),
- m_Zero())))
- return false;
-
- // The insert element which initialises a vector with the index iv.
- Instruction *IV = nullptr;
- if (!match(Insert, m_InsertElement(m_Undef(), m_Instruction(IV), m_Zero())))
- return false;
-
- // The index iv.
- auto *Phi = dyn_cast<PHINode>(IV);
- if (!Phi)
- return false;
-
- // TODO: Don't think we need to check the entry value.
- Value *OnEntry = Phi->getIncomingValueForBlock(L->getLoopPreheader());
- if (!match(OnEntry, m_Zero()))
- return false;
-
- Value *InLoop = Phi->getIncomingValueForBlock(L->getLoopLatch());
- unsigned Lanes = cast<VectorType>(Insert->getType())->getNumElements();
-
- Instruction *LHS = nullptr;
- if (!match(InLoop, m_Add(m_Instruction(LHS), m_SpecificInt(Lanes))))
+ if (!TryConvert(Setup->getArgOperand(0))) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Can't tail-predicate this loop.\n");
return false;
+ }
- return LHS == Phi;
+ return true;
}
-static VectorType* getVectorType(IntrinsicInst *I) {
+static FixedVectorType *getVectorType(IntrinsicInst *I) {
unsigned TypeOp = I->getIntrinsicID() == Intrinsic::masked_load ? 0 : 1;
auto *PtrTy = cast<PointerType>(I->getOperand(TypeOp)->getType());
- return cast<VectorType>(PtrTy->getElementType());
+ auto *VecTy = cast<FixedVectorType>(PtrTy->getElementType());
+ assert(VecTy && "No scalable vectors expected here");
+ return VecTy;
}
bool MVETailPredication::IsPredicatedVectorLoop() {
// Check that the loop contains at least one masked load/store intrinsic.
// We only support 'normal' vector instructions - other than masked
// load/stores.
+ bool ActiveLaneMask = false;
for (auto *BB : L->getBlocks()) {
for (auto &I : *BB) {
+ auto *Int = dyn_cast<IntrinsicInst>(&I);
+ if (!Int)
+ continue;
+
+ switch (Int->getIntrinsicID()) {
+ case Intrinsic::get_active_lane_mask:
+ ActiveLaneMask = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::sadd_sat:
+ case Intrinsic::uadd_sat:
+ case Intrinsic::ssub_sat:
+ case Intrinsic::usub_sat:
+ continue;
+ case Intrinsic::fma:
+ case Intrinsic::trunc:
+ case Intrinsic::rint:
+ case Intrinsic::round:
+ case Intrinsic::floor:
+ case Intrinsic::ceil:
+ case Intrinsic::fabs:
+ if (ST->hasMVEFloatOps())
+ continue;
+ LLVM_FALLTHROUGH;
+ default:
+ break;
+ }
+
if (IsMasked(&I)) {
- VectorType *VecTy = getVectorType(cast<IntrinsicInst>(&I));
+ auto *VecTy = getVectorType(Int);
unsigned Lanes = VecTy->getNumElements();
unsigned ElementWidth = VecTy->getScalarSizeInBits();
// MVE vectors are 128-bit, but don't support 128 x i1.
@@ -312,94 +284,23 @@ bool MVETailPredication::IsPredicatedVectorLoop() {
if (Lanes * ElementWidth > MaxWidth || Lanes == MaxWidth)
return false;
MaskedInsts.push_back(cast<IntrinsicInst>(&I));
- } else if (auto *Int = dyn_cast<IntrinsicInst>(&I)) {
- for (auto &U : Int->args()) {
- if (isa<VectorType>(U->getType()))
- return false;
- }
+ continue;
+ }
+
+ for (const Use &U : Int->args()) {
+ if (isa<VectorType>(U->getType()))
+ return false;
}
}
}
+ if (!ActiveLaneMask) {
+ LLVM_DEBUG(dbgs() << "ARM TP: No get.active.lane.mask intrinsic found.\n");
+ return false;
+ }
return !MaskedInsts.empty();
}
-Value* MVETailPredication::ComputeElements(Value *TripCount,
- VectorType *VecTy) {
- const SCEV *TripCountSE = SE->getSCEV(TripCount);
- ConstantInt *VF = ConstantInt::get(cast<IntegerType>(TripCount->getType()),
- VecTy->getNumElements());
-
- if (VF->equalsInt(1))
- return nullptr;
-
- // TODO: Support constant trip counts.
- auto VisitAdd = [&](const SCEVAddExpr *S) -> const SCEVMulExpr* {
- if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
- if (Const->getAPInt() != -VF->getValue())
- return nullptr;
- } else
- return nullptr;
- return dyn_cast<SCEVMulExpr>(S->getOperand(1));
- };
-
- auto VisitMul = [&](const SCEVMulExpr *S) -> const SCEVUDivExpr* {
- if (auto *Const = dyn_cast<SCEVConstant>(S->getOperand(0))) {
- if (Const->getValue() != VF)
- return nullptr;
- } else
- return nullptr;
- return dyn_cast<SCEVUDivExpr>(S->getOperand(1));
- };
-
- auto VisitDiv = [&](const SCEVUDivExpr *S) -> const SCEV* {
- if (auto *Const = dyn_cast<SCEVConstant>(S->getRHS())) {
- if (Const->getValue() != VF)
- return nullptr;
- } else
- return nullptr;
-
- if (auto *RoundUp = dyn_cast<SCEVAddExpr>(S->getLHS())) {
- if (auto *Const = dyn_cast<SCEVConstant>(RoundUp->getOperand(0))) {
- if (Const->getAPInt() != (VF->getValue() - 1))
- return nullptr;
- } else
- return nullptr;
-
- return RoundUp->getOperand(1);
- }
- return nullptr;
- };
-
- // TODO: Can we use SCEV helpers, such as findArrayDimensions, and friends to
- // determine the numbers of elements instead? Looks like this is what is used
- // for delinearization, but I'm not sure if it can be applied to the
- // vectorized form - at least not without a bit more work than I feel
- // comfortable with.
-
- // Search for Elems in the following SCEV:
- // (1 + ((-VF + (VF * (((VF - 1) + %Elems) /u VF))<nuw>) /u VF))<nuw><nsw>
- const SCEV *Elems = nullptr;
- if (auto *TC = dyn_cast<SCEVAddExpr>(TripCountSE))
- if (auto *Div = dyn_cast<SCEVUDivExpr>(TC->getOperand(1)))
- if (auto *Add = dyn_cast<SCEVAddExpr>(Div->getLHS()))
- if (auto *Mul = VisitAdd(Add))
- if (auto *Div = VisitMul(Mul))
- if (auto *Res = VisitDiv(Div))
- Elems = Res;
-
- if (!Elems)
- return nullptr;
-
- Instruction *InsertPt = L->getLoopPreheader()->getTerminator();
- if (!isSafeToExpandAt(Elems, InsertPt, *SE))
- return nullptr;
-
- auto DL = L->getHeader()->getModule()->getDataLayout();
- SCEVExpander Expander(*SE, DL, "elements");
- return Expander.expandCodeFor(Elems, Elems->getType(), InsertPt);
-}
-
// Look through the exit block to see whether there's a duplicate predicate
// instruction. This can happen when we need to perform a select on values
// from the last and previous iteration. Instead of doing a straight
@@ -407,31 +308,13 @@ Value* MVETailPredication::ComputeElements(Value *TripCount,
// in the block. This means that the VPR doesn't have to be live into the
// exit block which should make it easier to convert this loop into a proper
// tail predicated loop.
-static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
- SetVector<Instruction*> &MaybeDead, Loop *L) {
+static void Cleanup(SetVector<Instruction*> &MaybeDead, Loop *L) {
BasicBlock *Exit = L->getUniqueExitBlock();
if (!Exit) {
LLVM_DEBUG(dbgs() << "ARM TP: can't find loop exit block\n");
return;
}
- for (auto &Pair : NewPredicates) {
- Instruction *OldPred = Pair.first;
- Instruction *NewPred = Pair.second;
-
- for (auto &I : *Exit) {
- if (I.isSameOperationAs(OldPred)) {
- Instruction *PredClone = NewPred->clone();
- PredClone->insertBefore(&I);
- I.replaceAllUsesWith(PredClone);
- MaybeDead.insert(&I);
- LLVM_DEBUG(dbgs() << "ARM TP: replacing: "; I.dump();
- dbgs() << "ARM TP: with: "; PredClone->dump());
- break;
- }
- }
- }
-
// Drop references and add operands to check for dead.
SmallPtrSet<Instruction*, 4> Dead;
while (!MaybeDead.empty()) {
@@ -440,11 +323,10 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
if (I->hasNUsesOrMore(1))
continue;
- for (auto &U : I->operands()) {
+ for (auto &U : I->operands())
if (auto *OpI = dyn_cast<Instruction>(U))
MaybeDead.insert(OpI);
- }
- I->dropAllReferences();
+
Dead.insert(I);
}
@@ -457,24 +339,211 @@ static void Cleanup(DenseMap<Instruction*, Instruction*> &NewPredicates,
DeleteDeadPHIs(I);
}
-void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
- DenseMap<Instruction*, Instruction*> &NewPredicates,
- VectorType *VecTy, Value *NumElements) {
- IRBuilder<> Builder(L->getHeader()->getFirstNonPHI());
+// The active lane intrinsic has this form:
+//
+// @llvm.get.active.lane.mask(IV, BTC)
+//
+// Here we perform checks that this intrinsic behaves as expected,
+// which means:
+//
+// 1) The element count, which is calculated with BTC + 1, cannot overflow.
+// 2) The element count needs to be sufficiently large that the decrement of
+//    the element counter doesn't overflow, which means that we need to prove:
+//        ceil(ElementCount / VectorWidth) >= TripCount
+//    by rounding ElementCount up:
+//        (ElementCount + (VectorWidth - 1)) / VectorWidth
+//    and evaluating whether the following expression isKnownNonNegative:
+//        ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+// 3) The IV must be an induction phi with an increment equal to the
+// vector width.
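+//
+// As an illustrative example for 2): with VectorWidth = 4 and ElementCount =
+// 10, the element counter is decremented over ceil(10 / 4) = 3 iterations, so
+// the check requires TripCount <= 3.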
+bool MVETailPredication::IsSafeActiveMask(IntrinsicInst *ActiveLaneMask,
+ Value *TripCount, FixedVectorType *VecTy) {
+ bool ForceTailPredication =
+ EnableTailPredication == TailPredication::ForceEnabledNoReductions ||
+ EnableTailPredication == TailPredication::ForceEnabled;
+ // 1) Test whether entry to the loop is protected by a conditional
+  // BTC + 1 < 0. In other words, if the scalar trip count overflows and
+  // becomes negative, we shouldn't enter the loop, and creating the
+  // trip-count expression BTC + 1 is not safe. So, check that BTC
+  // isn't max. This is evaluated in unsigned, because the semantics
+  // of @get.active.lane.mask is a ULE comparison.
+
+ int VectorWidth = VecTy->getNumElements();
+ auto *BackedgeTakenCount = ActiveLaneMask->getOperand(1);
+ auto *BTC = SE->getSCEV(BackedgeTakenCount);
+
+ if (!llvm::cannotBeMaxInLoop(BTC, L, *SE, false /*Signed*/) &&
+ !ForceTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible, BTC can be max: ";
+ BTC->dump());
+ return false;
+ }
+
+ // 2) Prove that the sub expression is non-negative, i.e. it doesn't overflow:
+ //
+  //   ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount
+ //
+ // 2.1) First prove overflow can't happen in:
+ //
+ // ElementCount + (VectorWidth - 1)
+ //
+  // Because of a lack of context, it is difficult to get a useful bound on
+ // this expression. But since ElementCount uses the same variables as the
+ // TripCount (TC), for which we can find meaningful value ranges, we use that
+ // instead and assert that:
+ //
+ // upperbound(TC) <= UINT_MAX - VectorWidth
+ //
+ auto *TC = SE->getSCEV(TripCount);
+ unsigned SizeInBits = TripCount->getType()->getScalarSizeInBits();
+ auto Diff = APInt(SizeInBits, ~0) - APInt(SizeInBits, VectorWidth);
+ uint64_t MaxMinusVW = Diff.getZExtValue();
+ uint64_t UpperboundTC = SE->getSignedRange(TC).getUpper().getZExtValue();
+
+ if (UpperboundTC > MaxMinusVW && !ForceTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in tripcount rounding:\n";
+ dbgs() << "upperbound(TC) <= UINT_MAX - VectorWidth\n";
+               dbgs() << UpperboundTC << " <= " << MaxMinusVW << " == false\n";);
+ return false;
+ }
+
+ // 2.2) Make sure overflow doesn't happen in final expression:
+  //      ((ElementCount + (VectorWidth - 1)) / VectorWidth) - TripCount,
+ // To do this, compare the full ranges of these subexpressions:
+ //
+ // Range(Ceil) <= Range(TC)
+ //
+  // where Ceil = (ElementCount + (VW-1)) / VW. If Ceil and TC are runtime
+  // values (and not constants), we have to compensate for the lower bound of
+  // the value range being off by 1. The reason is that BTC lives in the
+  // preheader in this form:
+ //
+ // %trip.count.minus = add nsw nuw i32 %N, -1
+ //
+ // For the loop to be executed, %N has to be >= 1 and as a result the value
+ // range of %trip.count.minus has a lower bound of 0. Value %TC has this form:
+ //
+ // %5 = add nuw nsw i32 %4, 1
+ // call void @llvm.set.loop.iterations.i32(i32 %5)
+ //
+ // where %5 is some expression using %N, which needs to have a lower bound of
+ // 1. Thus, if the ranges of Ceil and TC are not a single constant but a set,
+ // we first add 0 to TC such that we can do the <= comparison on both sets.
+ //
+ auto *One = SE->getOne(TripCount->getType());
+ // ElementCount = BTC + 1
+ auto *ElementCount = SE->getAddExpr(BTC, One);
+ // Tmp = ElementCount + (VW-1)
+ auto *ECPlusVWMinus1 = SE->getAddExpr(ElementCount,
+ SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth - 1)));
+  // Ceil = (ElementCount + (VW-1)) / VW
+ auto *Ceil = SE->getUDivExpr(ECPlusVWMinus1,
+ SE->getSCEV(ConstantInt::get(TripCount->getType(), VectorWidth)));
+
+  ConstantRange RangeCeil = SE->getSignedRange(Ceil);
+  ConstantRange RangeTC = SE->getSignedRange(TC);
+ if (!RangeTC.isSingleElement()) {
+ auto ZeroRange =
+ ConstantRange(APInt(TripCount->getType()->getScalarSizeInBits(), 0));
+ RangeTC = RangeTC.unionWith(ZeroRange);
+ }
+ if (!RangeTC.contains(RangeCeil) && !ForceTailPredication) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Overflow possible in sub\n");
+ return false;
+ }
+
+  // 3) Find out if IV is an induction phi. Note that we can't use Loop
+  // helpers here to get the induction variable, because the hardware loop is
+  // no longer in loop-simplify form, and also the hwloop intrinsic uses a
+  // different counter. Using SCEV, we check that the induction is of the
+ // form i = i + 4, where the increment must be equal to the VectorWidth.
+ auto *IV = ActiveLaneMask->getOperand(0);
+ auto *IVExpr = SE->getSCEV(IV);
+ auto *AddExpr = dyn_cast<SCEVAddRecExpr>(IVExpr);
+ if (!AddExpr) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction not an add expr: "; IVExpr->dump());
+ return false;
+ }
+ // Check that this AddRec is associated with this loop.
+ if (AddExpr->getLoop() != L) {
+ LLVM_DEBUG(dbgs() << "ARM TP: phi not part of this loop\n");
+ return false;
+ }
+ auto *Step = dyn_cast<SCEVConstant>(AddExpr->getOperand(1));
+ if (!Step) {
+ LLVM_DEBUG(dbgs() << "ARM TP: induction step is not a constant: ";
+ AddExpr->getOperand(1)->dump());
+ return false;
+ }
+ auto StepValue = Step->getValue()->getSExtValue();
+ if (VectorWidth == StepValue)
+ return true;
+
+ LLVM_DEBUG(dbgs() << "ARM TP: Step value " << StepValue << " doesn't match "
+ "vector width " << VectorWidth << "\n");
+
+ return false;
+}
+
+// Materialize NumElements in the preheader block.
+static Value *getNumElements(BasicBlock *Preheader, Value *BTC) {
+  // First, check whether the preheader already contains %BTC in this form:
+ //
+ // preheader:
+ // %BTC = add i32 %N, -1
+ // ..
+ // vector.body:
+ //
+  // If %BTC is already there, we don't need to emit %NumElems = %BTC + 1;
+  // we can simply return %N instead.
+ for (auto &I : *Preheader) {
+ if (I.getOpcode() != Instruction::Add || &I != BTC)
+ continue;
+ ConstantInt *MinusOne = nullptr;
+ if (!(MinusOne = dyn_cast<ConstantInt>(I.getOperand(1))))
+ continue;
+ if (MinusOne->getSExtValue() == -1) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Found num elems: " << I << "\n");
+ return I.getOperand(0);
+ }
+ }
+
+  // Otherwise, e.g. if BTC is a constant, we do need to materialise the
+  // element count BTC + 1 here.
+ IRBuilder<> Builder(Preheader->getTerminator());
+ Value *NumElements = Builder.CreateAdd(BTC,
+ ConstantInt::get(BTC->getType(), 1), "num.elements");
+ LLVM_DEBUG(dbgs() << "ARM TP: Created num elems: " << *NumElements << "\n");
+ return NumElements;
+}
+
+void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
+ Value *TripCount, FixedVectorType *VecTy) {
+ IRBuilder<> Builder(L->getLoopPreheader()->getTerminator());
Module *M = L->getHeader()->getModule();
Type *Ty = IntegerType::get(M->getContext(), 32);
+ unsigned VectorWidth = VecTy->getNumElements();
+
+ // The backedge-taken count in @llvm.get.active.lane.mask, its 2nd operand,
+ // is one less than the trip count. So we need to find or create
+ // %num.elements = %BTC + 1 in the preheader.
+ Value *BTC = ActiveLaneMask->getOperand(1);
+ Builder.SetInsertPoint(L->getLoopPreheader()->getTerminator());
+ Value *NumElements = getNumElements(L->getLoopPreheader(), BTC);
// Insert a phi to count the number of elements processed by the loop.
+  Builder.SetInsertPoint(L->getHeader()->getFirstNonPHI());
PHINode *Processed = Builder.CreatePHI(Ty, 2);
Processed->addIncoming(NumElements, L->getLoopPreheader());
- // Insert the intrinsic to represent the effect of tail predication.
- Builder.SetInsertPoint(cast<Instruction>(Predicate));
+  // Replace @llvm.get.active.lane.mask() with the ARM-specific VCTP
+  // intrinsic, and thus represent the effect of tail predication.
+ Builder.SetInsertPoint(ActiveLaneMask);
ConstantInt *Factor =
- ConstantInt::get(cast<IntegerType>(Ty), VecTy->getNumElements());
+ ConstantInt::get(cast<IntegerType>(Ty), VectorWidth);
Intrinsic::ID VCTPID;
- switch (VecTy->getNumElements()) {
+ switch (VectorWidth) {
default:
llvm_unreachable("unexpected number of lanes");
case 4: VCTPID = Intrinsic::arm_mve_vctp32; break;
@@ -488,9 +557,8 @@ void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
// purposes, but takes a v4i1 instead of a v2i1.
}
Function *VCTP = Intrinsic::getDeclaration(M, VCTPID);
- Value *TailPredicate = Builder.CreateCall(VCTP, Processed);
- Predicate->replaceAllUsesWith(TailPredicate);
- NewPredicates[Predicate] = cast<Instruction>(TailPredicate);
+ Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
+ ActiveLaneMask->replaceAllUsesWith(VCTPCall);
// Add the incoming value to the new phi.
// TODO: This add likely already exists in the loop.
@@ -498,47 +566,45 @@ void MVETailPredication::InsertVCTPIntrinsic(Instruction *Predicate,
Processed->addIncoming(Remaining, L->getLoopLatch());
LLVM_DEBUG(dbgs() << "ARM TP: Insert processed elements phi: "
<< *Processed << "\n"
- << "ARM TP: Inserted VCTP: " << *TailPredicate << "\n");
+ << "ARM TP: Inserted VCTP: " << *VCTPCall << "\n");
}
bool MVETailPredication::TryConvert(Value *TripCount) {
if (!IsPredicatedVectorLoop()) {
- LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop");
+ LLVM_DEBUG(dbgs() << "ARM TP: no masked instructions in loop.\n");
return false;
}
LLVM_DEBUG(dbgs() << "ARM TP: Found predicated vector loop.\n");
-
- // Walk through the masked intrinsics and try to find whether the predicate
- // operand is generated from an induction variable.
SetVector<Instruction*> Predicates;
- DenseMap<Instruction*, Instruction*> NewPredicates;
+ // Walk through the masked intrinsics and try to find whether the predicate
+ // operand is generated by intrinsic @llvm.get.active.lane.mask().
for (auto *I : MaskedInsts) {
- Intrinsic::ID ID = I->getIntrinsicID();
- unsigned PredOp = ID == Intrinsic::masked_load ? 2 : 3;
+ unsigned PredOp = I->getIntrinsicID() == Intrinsic::masked_load ? 2 : 3;
auto *Predicate = dyn_cast<Instruction>(I->getArgOperand(PredOp));
if (!Predicate || Predicates.count(Predicate))
continue;
- VectorType *VecTy = getVectorType(I);
- Value *NumElements = ComputeElements(TripCount, VecTy);
- if (!NumElements)
- continue;
-
- if (!isTailPredicate(Predicate, NumElements)) {
- LLVM_DEBUG(dbgs() << "ARM TP: Not tail predicate: " << *Predicate << "\n");
+ auto *ActiveLaneMask = dyn_cast<IntrinsicInst>(Predicate);
+ if (!ActiveLaneMask ||
+ ActiveLaneMask->getIntrinsicID() != Intrinsic::get_active_lane_mask)
continue;
- }
- LLVM_DEBUG(dbgs() << "ARM TP: Found tail predicate: " << *Predicate << "\n");
Predicates.insert(Predicate);
+ LLVM_DEBUG(dbgs() << "ARM TP: Found active lane mask: "
+ << *ActiveLaneMask << "\n");
- InsertVCTPIntrinsic(Predicate, NewPredicates, VecTy, NumElements);
+ auto *VecTy = getVectorType(I);
+ if (!IsSafeActiveMask(ActiveLaneMask, TripCount, VecTy)) {
+ LLVM_DEBUG(dbgs() << "ARM TP: Not safe to insert VCTP.\n");
+ return false;
+ }
+ LLVM_DEBUG(dbgs() << "ARM TP: Safe to insert VCTP.\n");
+ InsertVCTPIntrinsic(ActiveLaneMask, TripCount, VecTy);
}
- // Now clean up.
- Cleanup(NewPredicates, Predicates, L);
+ Cleanup(Predicates, L);
return true;
}
diff --git a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
index a5df46c94f42..dc769ae526bc 100644
--- a/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
+++ b/llvm/lib/Target/ARM/MVEVPTBlockPass.cpp
@@ -22,9 +22,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineInstrBundle.h"
#include "llvm/CodeGen/MachineOperand.h"
-#include "llvm/CodeGen/ReachingDefAnalysis.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Debug.h"
#include <cassert>
#include <new>
@@ -34,83 +34,220 @@ using namespace llvm;
#define DEBUG_TYPE "arm-mve-vpt"
namespace {
- class MVEVPTBlock : public MachineFunctionPass {
- public:
- static char ID;
+class MVEVPTBlock : public MachineFunctionPass {
+public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ const TargetRegisterInfo *TRI;
- MVEVPTBlock() : MachineFunctionPass(ID) {}
+ MVEVPTBlock() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &Fn) override;
+ bool runOnMachineFunction(MachineFunction &Fn) override;
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- AU.setPreservesCFG();
- AU.addRequired<ReachingDefAnalysis>();
- MachineFunctionPass::getAnalysisUsage(AU);
- }
-
- MachineFunctionProperties getRequiredProperties() const override {
- return MachineFunctionProperties().set(
- MachineFunctionProperties::Property::NoVRegs).set(
- MachineFunctionProperties::Property::TracksLiveness);
- }
-
- StringRef getPassName() const override {
- return "MVE VPT block insertion pass";
- }
+ MachineFunctionProperties getRequiredProperties() const override {
+ return MachineFunctionProperties().set(
+ MachineFunctionProperties::Property::NoVRegs);
+ }
- private:
- bool InsertVPTBlocks(MachineBasicBlock &MBB);
+ StringRef getPassName() const override {
+ return "MVE VPT block insertion pass";
+ }
- const Thumb2InstrInfo *TII = nullptr;
- ReachingDefAnalysis *RDA = nullptr;
- };
+private:
+ bool InsertVPTBlocks(MachineBasicBlock &MBB);
+};
- char MVEVPTBlock::ID = 0;
+char MVEVPTBlock::ID = 0;
} // end anonymous namespace
INITIALIZE_PASS(MVEVPTBlock, DEBUG_TYPE, "ARM MVE VPT block pass", false, false)
-static MachineInstr *findVCMPToFoldIntoVPST(MachineInstr *MI,
- ReachingDefAnalysis *RDA,
+static MachineInstr *findVCMPToFoldIntoVPST(MachineBasicBlock::iterator MI,
+ const TargetRegisterInfo *TRI,
unsigned &NewOpcode) {
- // First, search backwards to the instruction that defines VPR
- auto *Def = RDA->getReachingMIDef(MI, ARM::VPR);
- if (!Def)
- return nullptr;
+  // Search backwards to the instruction that defines VPR. This may or may not
+  // be a VCMP; we check that after this loop. If we find another instruction
+  // that reads VPR first, we return nullptr.
+ MachineBasicBlock::iterator CmpMI = MI;
+ while (CmpMI != MI->getParent()->begin()) {
+ --CmpMI;
+ if (CmpMI->modifiesRegister(ARM::VPR, TRI))
+ break;
+ if (CmpMI->readsRegister(ARM::VPR, TRI))
+ break;
+ }
- // Now check that Def is a VCMP
- if (!(NewOpcode = VCMPOpcodeToVPT(Def->getOpcode())))
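+  // Bail out if we reached the start of the block without finding an
+  // instruction that writes VPR, or if the candidate is not a VCMP that can
+  // be folded into a VPT.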
+ if (CmpMI == MI)
+ return nullptr;
+ NewOpcode = VCMPOpcodeToVPT(CmpMI->getOpcode());
+ if (NewOpcode == 0)
return nullptr;
- // Check that Def's operands are not defined between the VCMP and MI, i.e.
- // check that they have the same reaching def.
- if (!RDA->hasSameReachingDef(Def, MI, Def->getOperand(1).getReg()) ||
- !RDA->hasSameReachingDef(Def, MI, Def->getOperand(2).getReg()))
+ // Search forward from CmpMI to MI, checking if either register was def'd
+ if (registerDefinedBetween(CmpMI->getOperand(1).getReg(), std::next(CmpMI),
+ MI, TRI))
+ return nullptr;
+ if (registerDefinedBetween(CmpMI->getOperand(2).getReg(), std::next(CmpMI),
+ MI, TRI))
return nullptr;
+ return &*CmpMI;
+}
+
+// Advances Iter past a block of predicated instructions.
+// Returns true if it successfully skipped the whole block of predicated
+// instructions. Returns false when it stopped early (due to MaxSteps), or if
+// Iter didn't point to a predicated instruction.
+static bool StepOverPredicatedInstrs(MachineBasicBlock::instr_iterator &Iter,
+ MachineBasicBlock::instr_iterator EndIter,
+ unsigned MaxSteps,
+ unsigned &NumInstrsSteppedOver) {
+ ARMVCC::VPTCodes NextPred = ARMVCC::None;
+ Register PredReg;
+ NumInstrsSteppedOver = 0;
+
+ while (Iter != EndIter) {
+ NextPred = getVPTInstrPredicate(*Iter, PredReg);
+ assert(NextPred != ARMVCC::Else &&
+ "VPT block pass does not expect Else preds");
+ if (NextPred == ARMVCC::None || MaxSteps == 0)
+ break;
+ --MaxSteps;
+ ++Iter;
+ ++NumInstrsSteppedOver;
+ };
+
+ return NumInstrsSteppedOver != 0 &&
+ (NextPred == ARMVCC::None || Iter == EndIter);
+}
+
+// Returns true if at least one instruction in the range [Iter, End) defines
+// or kills VPR.
+static bool IsVPRDefinedOrKilledByBlock(MachineBasicBlock::iterator Iter,
+ MachineBasicBlock::iterator End) {
+ for (; Iter != End; ++Iter)
+ if (Iter->definesRegister(ARM::VPR) || Iter->killsRegister(ARM::VPR))
+ return true;
+ return false;
+}
+
+// Creates a T, TT, TTT or TTTT BlockMask depending on BlockSize.
+static ARM::PredBlockMask GetInitialBlockMask(unsigned BlockSize) {
+ switch (BlockSize) {
+ case 1:
+ return ARM::PredBlockMask::T;
+ case 2:
+ return ARM::PredBlockMask::TT;
+ case 3:
+ return ARM::PredBlockMask::TTT;
+ case 4:
+ return ARM::PredBlockMask::TTTT;
+ default:
+ llvm_unreachable("Invalid BlockSize!");
+ }
+}
+
+// Given an iterator (Iter) that points at an instruction with a "Then"
+// predicate, tries to create the largest block of contiguous predicated
+// instructions possible, and returns the VPT Block Mask of that block.
+//
+// This will try to perform some minor optimization in order to maximize the
+// size of the block.
+static ARM::PredBlockMask
+CreateVPTBlock(MachineBasicBlock::instr_iterator &Iter,
+ MachineBasicBlock::instr_iterator EndIter,
+ SmallVectorImpl<MachineInstr *> &DeadInstructions) {
+ MachineBasicBlock::instr_iterator BlockBeg = Iter;
+ (void)BlockBeg;
+ assert(getVPTInstrPredicate(*Iter) == ARMVCC::Then &&
+ "Expected a Predicated Instruction");
+
+ LLVM_DEBUG(dbgs() << "VPT block created for: "; Iter->dump());
+
+ unsigned BlockSize;
+ StepOverPredicatedInstrs(Iter, EndIter, 4, BlockSize);
+
+ LLVM_DEBUG(for (MachineBasicBlock::instr_iterator AddedInstIter =
+ std::next(BlockBeg);
+ AddedInstIter != Iter; ++AddedInstIter) {
+ dbgs() << " adding: ";
+ AddedInstIter->dump();
+ });
+
+ // Generate the initial BlockMask
+ ARM::PredBlockMask BlockMask = GetInitialBlockMask(BlockSize);
+
+ // Remove VPNOTs while there's still room in the block, so we can make the
+ // largest block possible.
+ ARMVCC::VPTCodes CurrentPredicate = ARMVCC::Else;
+ while (BlockSize < 4 && Iter != EndIter &&
+ Iter->getOpcode() == ARM::MVE_VPNOT) {
+
+ // Try to skip all of the predicated instructions after the VPNOT, stopping
+ // after (4 - BlockSize). If we can't skip them all, stop.
+ unsigned ElseInstCnt = 0;
+ MachineBasicBlock::instr_iterator VPNOTBlockEndIter = std::next(Iter);
+ if (!StepOverPredicatedInstrs(VPNOTBlockEndIter, EndIter, (4 - BlockSize),
+ ElseInstCnt))
+ break;
+
+    // Check if this VPNOT can be removed or not: it can only be removed if at
+    // least one of the predicated instructions that follow it kills or sets
+ // VPR.
+ if (!IsVPRDefinedOrKilledByBlock(Iter, VPNOTBlockEndIter))
+ break;
+
+ LLVM_DEBUG(dbgs() << " removing VPNOT: "; Iter->dump(););
+
+ // Record the new size of the block
+ BlockSize += ElseInstCnt;
+ assert(BlockSize <= 4 && "Block is too large!");
+
+ // Record the VPNot to remove it later.
+ DeadInstructions.push_back(&*Iter);
+ ++Iter;
+
+ // Replace the predicates of the instructions we're adding.
+ // Note that we are using "Iter" to iterate over the block so we can update
+ // it at the same time.
+ for (; Iter != VPNOTBlockEndIter; ++Iter) {
+ // Find the register in which the predicate is
+ int OpIdx = findFirstVPTPredOperandIdx(*Iter);
+ assert(OpIdx != -1);
+
+ // Change the predicate and update the mask
+ Iter->getOperand(OpIdx).setImm(CurrentPredicate);
+ BlockMask = expandPredBlockMask(BlockMask, CurrentPredicate);
+
+ LLVM_DEBUG(dbgs() << " adding : "; Iter->dump());
+ }
- return Def;
+ CurrentPredicate =
+ (CurrentPredicate == ARMVCC::Then ? ARMVCC::Else : ARMVCC::Then);
+ }
+ return BlockMask;
}
bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
bool Modified = false;
MachineBasicBlock::instr_iterator MBIter = Block.instr_begin();
MachineBasicBlock::instr_iterator EndIter = Block.instr_end();
- SmallSet<MachineInstr *, 4> RemovedVCMPs;
+
+ SmallVector<MachineInstr *, 4> DeadInstructions;
while (MBIter != EndIter) {
MachineInstr *MI = &*MBIter;
- unsigned PredReg = 0;
- DebugLoc dl = MI->getDebugLoc();
+ Register PredReg;
+ DebugLoc DL = MI->getDebugLoc();
ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*MI, PredReg);
// The idea of the predicate is that None, Then and Else are for use when
// handling assembly language: they correspond to the three possible
// suffixes "", "t" and "e" on the mnemonic. So when instructions are read
- // from assembly source or disassembled from object code, you expect to see
- // a mixture whenever there's a long VPT block. But in code generation, we
- // hope we'll never generate an Else as input to this pass.
+ // from assembly source or disassembled from object code, you expect to
+ // see a mixture whenever there's a long VPT block. But in code
+ // generation, we hope we'll never generate an Else as input to this pass.
assert(Pred != ARMVCC::Else && "VPT block pass does not expect Else preds");
if (Pred == ARMVCC::None) {
@@ -118,46 +255,25 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
continue;
}
- LLVM_DEBUG(dbgs() << "VPT block created for: "; MI->dump());
- int VPTInstCnt = 1;
- ARMVCC::VPTCodes NextPred;
-
- // Look at subsequent instructions, checking if they can be in the same VPT
- // block.
- ++MBIter;
- while (MBIter != EndIter && VPTInstCnt < 4) {
- NextPred = getVPTInstrPredicate(*MBIter, PredReg);
- assert(NextPred != ARMVCC::Else &&
- "VPT block pass does not expect Else preds");
- if (NextPred != Pred)
- break;
- LLVM_DEBUG(dbgs() << " adding : "; MBIter->dump());
- ++VPTInstCnt;
- ++MBIter;
- };
-
- unsigned BlockMask = getARMVPTBlockMask(VPTInstCnt);
+ ARM::PredBlockMask BlockMask =
+ CreateVPTBlock(MBIter, EndIter, DeadInstructions);
- // Search back for a VCMP that can be folded to create a VPT, or else create
- // a VPST directly
+ // Search back for a VCMP that can be folded to create a VPT, or else
+ // create a VPST directly
MachineInstrBuilder MIBuilder;
unsigned NewOpcode;
- MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, RDA, NewOpcode);
- if (VCMP) {
+ LLVM_DEBUG(dbgs() << " final block mask: " << (unsigned)BlockMask << "\n");
+ if (MachineInstr *VCMP = findVCMPToFoldIntoVPST(MI, TRI, NewOpcode)) {
LLVM_DEBUG(dbgs() << " folding VCMP into VPST: "; VCMP->dump());
- MIBuilder = BuildMI(Block, MI, dl, TII->get(NewOpcode));
- MIBuilder.addImm(BlockMask);
+ MIBuilder = BuildMI(Block, MI, DL, TII->get(NewOpcode));
+ MIBuilder.addImm((uint64_t)BlockMask);
MIBuilder.add(VCMP->getOperand(1));
MIBuilder.add(VCMP->getOperand(2));
MIBuilder.add(VCMP->getOperand(3));
- // We delay removing the actual VCMP instruction by saving it to a list
- // and deleting all instructions in this list in one go after we have
- // created the VPT blocks. We do this in order not to invalidate the
- // ReachingDefAnalysis that is queried by 'findVCMPToFoldIntoVPST'.
- RemovedVCMPs.insert(VCMP);
+ VCMP->eraseFromParent();
} else {
- MIBuilder = BuildMI(Block, MI, dl, TII->get(ARM::MVE_VPST));
- MIBuilder.addImm(BlockMask);
+ MIBuilder = BuildMI(Block, MI, DL, TII->get(ARM::MVE_VPST));
+ MIBuilder.addImm((uint64_t)BlockMask);
}
finalizeBundle(
@@ -166,16 +282,18 @@ bool MVEVPTBlock::InsertVPTBlocks(MachineBasicBlock &Block) {
Modified = true;
}
- for (auto *I : RemovedVCMPs)
- I->eraseFromParent();
+ // Erase all dead instructions
+ for (MachineInstr *DeadMI : DeadInstructions) {
+ if (DeadMI->isInsideBundle())
+ DeadMI->eraseFromBundle();
+ else
+ DeadMI->eraseFromParent();
+ }
return Modified;
}
bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
- if (skipFunction(Fn.getFunction()))
- return false;
-
const ARMSubtarget &STI =
static_cast<const ARMSubtarget &>(Fn.getSubtarget());
@@ -183,7 +301,7 @@ bool MVEVPTBlock::runOnMachineFunction(MachineFunction &Fn) {
return false;
TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
- RDA = &getAnalysis<ReachingDefAnalysis>();
+ TRI = STI.getRegisterInfo();
LLVM_DEBUG(dbgs() << "********** ARM MVE VPT BLOCKS **********\n"
<< "********** Function: " << Fn.getName() << '\n');
diff --git a/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
new file mode 100644
index 000000000000..382ddd4572c7
--- /dev/null
+++ b/llvm/lib/Target/ARM/MVEVPTOptimisationsPass.cpp
@@ -0,0 +1,464 @@
+//===-- MVEVPTOptimisationsPass.cpp ---------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This pass does a few optimisations related to MVE VPT blocks before
+/// register allocation is performed. The goal is to maximize the sizes of the
+/// blocks that will be created by the MVE VPT Block Insertion pass (which runs
+/// after register allocation). The first optimisation done by this pass is the
+/// replacement of "opposite" VCMPs with VPNOTs, so the Block Insertion pass
+/// can delete them later to create larger VPT blocks.
+/// The second optimisation replaces re-uses of old VCCR values with VPNOTs when
+/// inside a block of predicated instructions. This is done to avoid
+/// spill/reloads of VPR in the middle of a block, which prevents the Block
+/// Insertion pass from creating large blocks.
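+///
+/// As an illustrative sketch (operands and predication simplified), a pair of
+/// opposite compares
+///
+///   %a:vccr = VCMP q0, q1, ge
+///   %b:vccr = VCMP q0, q1, lt
+///
+/// is rewritten to
+///
+///   %a:vccr = VCMP q0, q1, ge
+///   %b:vccr = VPNOT %a
+///
+/// so that the Block Insertion pass can later fold the VPNOT away.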
+//
+//===----------------------------------------------------------------------===//
+
+#include "ARM.h"
+#include "ARMSubtarget.h"
+#include "MCTargetDesc/ARMBaseInfo.h"
+#include "Thumb2InstrInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/Support/Debug.h"
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "arm-mve-vpt-opts"
+
+namespace {
+class MVEVPTOptimisations : public MachineFunctionPass {
+public:
+ static char ID;
+ const Thumb2InstrInfo *TII;
+ MachineRegisterInfo *MRI;
+
+ MVEVPTOptimisations() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &Fn) override;
+
+ StringRef getPassName() const override {
+ return "ARM MVE VPT Optimisation Pass";
+ }
+
+private:
+ MachineInstr &ReplaceRegisterUseWithVPNOT(MachineBasicBlock &MBB,
+ MachineInstr &Instr,
+ MachineOperand &User,
+ Register Target);
+ bool ReduceOldVCCRValueUses(MachineBasicBlock &MBB);
+ bool ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB);
+};
+
+char MVEVPTOptimisations::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(MVEVPTOptimisations, DEBUG_TYPE,
+ "ARM MVE VPT Optimisations pass", false, false)
+
+// Returns true if Opcode is any VCMP Opcode.
+static bool IsVCMP(unsigned Opcode) { return VCMPOpcodeToVPT(Opcode) != 0; }
+
+// Returns true if a VCMP with this Opcode can have its operands swapped.
+// There are two kinds of VCMP that can't have their operands swapped: float
+// VCMPs, and VCMPr instructions (since the r operand is always on the right).
+static bool CanHaveSwappedOperands(unsigned Opcode) {
+ switch (Opcode) {
+ default:
+ return true;
+ case ARM::MVE_VCMPf32:
+ case ARM::MVE_VCMPf16:
+ case ARM::MVE_VCMPf32r:
+ case ARM::MVE_VCMPf16r:
+ case ARM::MVE_VCMPi8r:
+ case ARM::MVE_VCMPi16r:
+ case ARM::MVE_VCMPi32r:
+ case ARM::MVE_VCMPu8r:
+ case ARM::MVE_VCMPu16r:
+ case ARM::MVE_VCMPu32r:
+ case ARM::MVE_VCMPs8r:
+ case ARM::MVE_VCMPs16r:
+ case ARM::MVE_VCMPs32r:
+ return false;
+ }
+}
+
+// Returns the CondCode of a VCMP Instruction.
+static ARMCC::CondCodes GetCondCode(MachineInstr &Instr) {
+ assert(IsVCMP(Instr.getOpcode()) && "Inst must be a VCMP");
+ return ARMCC::CondCodes(Instr.getOperand(3).getImm());
+}
+
+// Returns true if Cond is equivalent to a VPNOT instruction on the result of
+// Prev. Cond and Prev must be VCMPs.
+static bool IsVPNOTEquivalent(MachineInstr &Cond, MachineInstr &Prev) {
+ assert(IsVCMP(Cond.getOpcode()) && IsVCMP(Prev.getOpcode()));
+
+ // Opcodes must match.
+ if (Cond.getOpcode() != Prev.getOpcode())
+ return false;
+
+ MachineOperand &CondOP1 = Cond.getOperand(1), &CondOP2 = Cond.getOperand(2);
+ MachineOperand &PrevOP1 = Prev.getOperand(1), &PrevOP2 = Prev.getOperand(2);
+
+ // If the VCMP has the opposite condition with the same operands, we can
+ // replace it with a VPNOT
+ ARMCC::CondCodes ExpectedCode = GetCondCode(Cond);
+ ExpectedCode = ARMCC::getOppositeCondition(ExpectedCode);
+ if (ExpectedCode == GetCondCode(Prev))
+ if (CondOP1.isIdenticalTo(PrevOP1) && CondOP2.isIdenticalTo(PrevOP2))
+ return true;
+ // Check again with operands swapped if possible
+ if (!CanHaveSwappedOperands(Cond.getOpcode()))
+ return false;
+ ExpectedCode = ARMCC::getSwappedCondition(ExpectedCode);
+ return ExpectedCode == GetCondCode(Prev) && CondOP1.isIdenticalTo(PrevOP2) &&
+ CondOP2.isIdenticalTo(PrevOP1);
+}
+
+// Returns true if Instr writes to VCCR.
+static bool IsWritingToVCCR(MachineInstr &Instr) {
+ if (Instr.getNumOperands() == 0)
+ return false;
+ MachineOperand &Dst = Instr.getOperand(0);
+ if (!Dst.isReg())
+ return false;
+ Register DstReg = Dst.getReg();
+ if (!DstReg.isVirtual())
+ return false;
+ MachineRegisterInfo &RegInfo = Instr.getMF()->getRegInfo();
+ const TargetRegisterClass *RegClass = RegInfo.getRegClassOrNull(DstReg);
+ return RegClass && (RegClass->getID() == ARM::VCCRRegClassID);
+}
+
+// Transforms
+// <Instr that uses %A ('User' Operand)>
+// Into
+// %K = VPNOT %Target
+// <Instr that uses %K ('User' Operand)>
+// And returns the newly inserted VPNOT.
+// This optimization is done in the hopes of preventing spills/reloads of VPR by
+// reducing the number of VCCR values with overlapping lifetimes.
+MachineInstr &MVEVPTOptimisations::ReplaceRegisterUseWithVPNOT(
+ MachineBasicBlock &MBB, MachineInstr &Instr, MachineOperand &User,
+ Register Target) {
+ Register NewResult = MRI->createVirtualRegister(MRI->getRegClass(Target));
+
+ MachineInstrBuilder MIBuilder =
+ BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
+ .addDef(NewResult)
+ .addReg(Target);
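+  // Give the new VPNOT the standard unpredicated predicate operands.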
+ addUnpredicatedMveVpredNOp(MIBuilder);
+
+ // Make the user use NewResult instead, and clear its kill flag.
+ User.setReg(NewResult);
+ User.setIsKill(false);
+
+ LLVM_DEBUG(dbgs() << " Inserting VPNOT (for spill prevention): ";
+ MIBuilder.getInstr()->dump());
+
+ return *MIBuilder.getInstr();
+}
+
+// Moves a VPNOT before its first user if an instruction that uses Reg is found
+// in-between the VPNOT and its user.
+// Returns true if there is at least one user of the VPNOT in the block.
+static bool MoveVPNOTBeforeFirstUser(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator Iter,
+ Register Reg) {
+ assert(Iter->getOpcode() == ARM::MVE_VPNOT && "Not a VPNOT!");
+ assert(getVPTInstrPredicate(*Iter) == ARMVCC::None &&
+ "The VPNOT cannot be predicated");
+
+ MachineInstr &VPNOT = *Iter;
+ Register VPNOTResult = VPNOT.getOperand(0).getReg();
+ Register VPNOTOperand = VPNOT.getOperand(1).getReg();
+
+ // Whether the VPNOT will need to be moved, and whether we found a user of the
+ // VPNOT.
+ bool MustMove = false, HasUser = false;
+ MachineOperand *VPNOTOperandKiller = nullptr;
+ for (; Iter != MBB.end(); ++Iter) {
+ if (MachineOperand *MO =
+ Iter->findRegisterUseOperand(VPNOTOperand, /*isKill*/ true)) {
+ // If we find the operand that kills VPNOTOperand, save it.
+ VPNOTOperandKiller = MO;
+ }
+
+ if (Iter->findRegisterUseOperandIdx(Reg) != -1) {
+ MustMove = true;
+ continue;
+ }
+
+ if (Iter->findRegisterUseOperandIdx(VPNOTResult) == -1)
+ continue;
+
+ HasUser = true;
+ if (!MustMove)
+ break;
+
+ // Move the VPNOT right before Iter
+ LLVM_DEBUG(dbgs() << "Moving: "; VPNOT.dump(); dbgs() << " Before: ";
+ Iter->dump());
+ MBB.splice(Iter, &MBB, VPNOT.getIterator());
+ // If we move the instr, and its operand was killed earlier, remove the kill
+ // flag.
+ if (VPNOTOperandKiller)
+ VPNOTOperandKiller->setIsKill(false);
+
+ break;
+ }
+ return HasUser;
+}
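+
+// Illustrative example (not from the patch; register names are made up):
+// %b:vccr = MVE_VPNOT %a
+// <instr that uses %a>
+// <instr that uses %b> ; first user of the VPNOT
+// becomes
+// <instr that uses %a>
+// %b:vccr = MVE_VPNOT %a
+// <instr that uses %b>
+// which shortens the overlap between the live ranges of %a and %b.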
+
+// This optimisation attempts to reduce the number of overlapping lifetimes of
+// VCCR values by replacing uses of old VCCR values with VPNOTs. For example,
+// this replaces
+// %A:vccr = (something)
+// %B:vccr = VPNOT %A
+// %Foo = (some op that uses %B)
+// %Bar = (some op that uses %A)
+// With
+// %A:vccr = (something)
+// %B:vccr = VPNOT %A
+// %Foo = (some op that uses %B)
+// %TMP2:vccr = VPNOT %B
+// %Bar = (some op that uses %TMP2)
+bool MVEVPTOptimisations::ReduceOldVCCRValueUses(MachineBasicBlock &MBB) {
+ MachineBasicBlock::iterator Iter = MBB.begin(), End = MBB.end();
+ SmallVector<MachineInstr *, 4> DeadInstructions;
+ bool Modified = false;
+
+ while (Iter != End) {
+ Register VCCRValue, OppositeVCCRValue;
+ // The first loop looks for 2 unpredicated instructions:
+ // %A:vccr = (instr) ; A is stored in VCCRValue
+ // %B:vccr = VPNOT %A ; B is stored in OppositeVCCRValue
+ for (; Iter != End; ++Iter) {
+ // We're only interested in unpredicated instructions that write to VCCR.
+ if (!IsWritingToVCCR(*Iter) ||
+ getVPTInstrPredicate(*Iter) != ARMVCC::None)
+ continue;
+ Register Dst = Iter->getOperand(0).getReg();
+
+ // If we already have a VCCRValue, and this is a VPNOT on VCCRValue, we've
+ // found what we were looking for.
+ if (VCCRValue && Iter->getOpcode() == ARM::MVE_VPNOT &&
+ Iter->findRegisterUseOperandIdx(VCCRValue) != -1) {
+ // Move the VPNOT closer to its first user if needed, and ignore if it
+ // has no users.
+ if (!MoveVPNOTBeforeFirstUser(MBB, Iter, VCCRValue))
+ continue;
+
+ OppositeVCCRValue = Dst;
+ ++Iter;
+ break;
+ }
+
+ // Else, just set VCCRValue.
+ VCCRValue = Dst;
+ }
+
+ // If the first inner loop didn't find anything, stop here.
+ if (Iter == End)
+ break;
+
+ assert(VCCRValue && OppositeVCCRValue &&
+ "VCCRValue and OppositeVCCRValue shouldn't be empty if the loop "
+ "stopped before the end of the block!");
+ assert(VCCRValue != OppositeVCCRValue &&
+ "VCCRValue should not be equal to OppositeVCCRValue!");
+
+ // LastVPNOTResult always contains the same value as OppositeVCCRValue.
+ Register LastVPNOTResult = OppositeVCCRValue;
+
+ // This second loop tries to optimize the remaining instructions.
+ for (; Iter != End; ++Iter) {
+ bool IsInteresting = false;
+
+ if (MachineOperand *MO = Iter->findRegisterUseOperand(VCCRValue)) {
+ IsInteresting = true;
+
+ // - If the instruction is a VPNOT, it can be removed, and we can just
+ // replace its uses with LastVPNOTResult.
+ // - Else, insert a new VPNOT on LastVPNOTResult to recompute VCCRValue.
+ if (Iter->getOpcode() == ARM::MVE_VPNOT) {
+ Register Result = Iter->getOperand(0).getReg();
+
+ MRI->replaceRegWith(Result, LastVPNOTResult);
+ DeadInstructions.push_back(&*Iter);
+ Modified = true;
+
+ LLVM_DEBUG(dbgs()
+ << "Replacing all uses of '" << printReg(Result)
+ << "' with '" << printReg(LastVPNOTResult) << "'\n");
+ } else {
+ MachineInstr &VPNOT =
+ ReplaceRegisterUseWithVPNOT(MBB, *Iter, *MO, LastVPNOTResult);
+ Modified = true;
+
+ LastVPNOTResult = VPNOT.getOperand(0).getReg();
+ std::swap(VCCRValue, OppositeVCCRValue);
+
+ LLVM_DEBUG(dbgs() << "Replacing use of '" << printReg(VCCRValue)
+ << "' with '" << printReg(LastVPNOTResult)
+ << "' in instr: " << *Iter);
+ }
+ } else {
+ // If the instr uses OppositeVCCRValue, make it use LastVPNOTResult
+ // instead as they contain the same value.
+ if (MachineOperand *MO =
+ Iter->findRegisterUseOperand(OppositeVCCRValue)) {
+ IsInteresting = true;
+
+ // This is pointless if LastVPNOTResult == OppositeVCCRValue.
+ if (LastVPNOTResult != OppositeVCCRValue) {
+ LLVM_DEBUG(dbgs() << "Replacing usage of '"
+ << printReg(OppositeVCCRValue) << "' with '"
+ << printReg(LastVPNOTResult) << "' for instr: ";
+ Iter->dump());
+ MO->setReg(LastVPNOTResult);
+ Modified = true;
+ }
+
+ MO->setIsKill(false);
+ }
+
+ // If this is an unpredicated VPNOT on
+ // LastVPNOTResult/OppositeVCCRValue, we can act like we inserted it.
+ if (Iter->getOpcode() == ARM::MVE_VPNOT &&
+ getVPTInstrPredicate(*Iter) == ARMVCC::None) {
+ Register VPNOTOperand = Iter->getOperand(1).getReg();
+ if (VPNOTOperand == LastVPNOTResult ||
+ VPNOTOperand == OppositeVCCRValue) {
+ IsInteresting = true;
+
+ std::swap(VCCRValue, OppositeVCCRValue);
+ LastVPNOTResult = Iter->getOperand(0).getReg();
+ }
+ }
+ }
+
+ // If this instruction was not interesting, and it writes to VCCR, stop.
+ if (!IsInteresting && IsWritingToVCCR(*Iter))
+ break;
+ }
+ }
+
+ for (MachineInstr *DeadInstruction : DeadInstructions)
+ DeadInstruction->removeFromParent();
+
+ return Modified;
+}
+
+// This optimisation replaces VCMPs with VPNOTs when they are equivalent.
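+// For example (sketch only; opcode and register names are placeholders), when
+// the second of two identical-opcode VCMPs computes the opposite condition of
+// the first on the same operands:
+// %p:vccr = VCMP<ty> %q0, %q1, GE
+// %c:vccr = VCMP<ty> %q0, %q1, LT
+// the second VCMP is rewritten to
+// %c:vccr = MVE_VPNOT %p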
+bool MVEVPTOptimisations::ReplaceVCMPsByVPNOTs(MachineBasicBlock &MBB) {
+ SmallVector<MachineInstr *, 4> DeadInstructions;
+
+ // The last VCMP that we have seen and that couldn't be replaced.
+ // This is reset when an instruction that writes to VCCR/VPR is found, or when
+ // a VCMP is replaced with a VPNOT.
+ // We'll only replace VCMPs with VPNOTs when this is not null, and when the
+ // current VCMP is the opposite of PrevVCMP.
+ MachineInstr *PrevVCMP = nullptr;
+ // If we find an instruction that kills the result of PrevVCMP, we save the
+ // operand here to remove the kill flag in case we need to use PrevVCMP's
+ // result.
+ MachineOperand *PrevVCMPResultKiller = nullptr;
+
+ for (MachineInstr &Instr : MBB.instrs()) {
+ if (PrevVCMP) {
+ if (MachineOperand *MO = Instr.findRegisterUseOperand(
+ PrevVCMP->getOperand(0).getReg(), /*isKill*/ true)) {
+ // If we come across the instr that kills PrevVCMP's result, record it
+ // so we can remove the kill flag later if we need to.
+ PrevVCMPResultKiller = MO;
+ }
+ }
+
+ // Ignore predicated instructions.
+ if (getVPTInstrPredicate(Instr) != ARMVCC::None)
+ continue;
+
+ // Only look at VCMPs
+ if (!IsVCMP(Instr.getOpcode())) {
+ // If the instruction writes to VCCR, forget the previous VCMP.
+ if (IsWritingToVCCR(Instr))
+ PrevVCMP = nullptr;
+ continue;
+ }
+
+ if (!PrevVCMP || !IsVPNOTEquivalent(Instr, *PrevVCMP)) {
+ PrevVCMP = &Instr;
+ continue;
+ }
+
+ // The register containing the result of the VCMP that we're going to
+ // replace.
+ Register PrevVCMPResultReg = PrevVCMP->getOperand(0).getReg();
+
+ // Build a VPNOT to replace the VCMP, reusing its operands.
+ MachineInstrBuilder MIBuilder =
+ BuildMI(MBB, &Instr, Instr.getDebugLoc(), TII->get(ARM::MVE_VPNOT))
+ .add(Instr.getOperand(0))
+ .addReg(PrevVCMPResultReg);
+ addUnpredicatedMveVpredNOp(MIBuilder);
+ LLVM_DEBUG(dbgs() << "Inserting VPNOT (to replace VCMP): ";
+ MIBuilder.getInstr()->dump(); dbgs() << " Removed VCMP: ";
+ Instr.dump());
+
+ // If we found an instruction that uses and kills PrevVCMP's result,
+ // remove the kill flag.
+ if (PrevVCMPResultKiller)
+ PrevVCMPResultKiller->setIsKill(false);
+
+ // Finally, mark the old VCMP for removal and reset
+ // PrevVCMP/PrevVCMPResultKiller.
+ DeadInstructions.push_back(&Instr);
+ PrevVCMP = nullptr;
+ PrevVCMPResultKiller = nullptr;
+ }
+
+ for (MachineInstr *DeadInstruction : DeadInstructions)
+ DeadInstruction->removeFromParent();
+
+ return !DeadInstructions.empty();
+}
+
+bool MVEVPTOptimisations::runOnMachineFunction(MachineFunction &Fn) {
+ const ARMSubtarget &STI =
+ static_cast<const ARMSubtarget &>(Fn.getSubtarget());
+
+ if (!STI.isThumb2() || !STI.hasMVEIntegerOps())
+ return false;
+
+ TII = static_cast<const Thumb2InstrInfo *>(STI.getInstrInfo());
+ MRI = &Fn.getRegInfo();
+
+ LLVM_DEBUG(dbgs() << "********** ARM MVE VPT Optimisations **********\n"
+ << "********** Function: " << Fn.getName() << '\n');
+
+ bool Modified = false;
+ for (MachineBasicBlock &MBB : Fn) {
+ Modified |= ReplaceVCMPsByVPNOTs(MBB);
+ Modified |= ReduceOldVCCRValueUses(MBB);
+ }
+
+ LLVM_DEBUG(dbgs() << "**************************************\n");
+ return Modified;
+}
+
+/// createMVEVPTOptimisationsPass
+FunctionPass *llvm::createMVEVPTOptimisationsPass() {
+ return new MVEVPTOptimisations();
+}
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
index 956d474f1d79..d568e9afe432 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.cpp
@@ -88,8 +88,10 @@ emitPrologueEpilogueSPUpdate(MachineBasicBlock &MBB,
0, MIFlags);
}
BuildMI(MBB, MBBI, dl, TII.get(ARM::tADDhirr), ARM::SP)
- .addReg(ARM::SP).addReg(ScratchReg, RegState::Kill)
- .add(predOps(ARMCC::AL));
+ .addReg(ARM::SP)
+ .addReg(ScratchReg, RegState::Kill)
+ .add(predOps(ARMCC::AL))
+ .setMIFlags(MIFlags);
return;
}
// FIXME: This is assuming the heuristics in emitThumbRegPlusImmediate
@@ -127,7 +129,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- Amount = alignTo(Amount, getStackAlignment());
+ Amount = alignTo(Amount, getStackAlign());
// Replace the pseudo instruction with a new instruction...
unsigned Opc = Old.getOpcode();
@@ -180,9 +182,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
if (ArgRegsSaveSize) {
emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -ArgRegsSaveSize,
ARM::NoRegister, MachineInstr::FrameSetup);
- CFAOffset -= ArgRegsSaveSize;
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ CFAOffset += ArgRegsSaveSize;
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -193,9 +195,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo,
-(NumBytes - ArgRegsSaveSize),
ARM::NoRegister, MachineInstr::FrameSetup);
- CFAOffset -= NumBytes - ArgRegsSaveSize;
+ CFAOffset += NumBytes - ArgRegsSaveSize;
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -257,9 +259,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
}
if (adjustedGPRCS1Size) {
- CFAOffset -= adjustedGPRCS1Size;
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ CFAOffset += adjustedGPRCS1Size;
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -305,8 +307,8 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
.setMIFlags(MachineInstr::FrameSetup)
.add(predOps(ARMCC::AL));
if(FramePtrOffsetInBlock) {
- CFAOffset += FramePtrOffsetInBlock;
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
+ CFAOffset -= FramePtrOffsetInBlock;
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
nullptr, MRI->getDwarfRegNum(FramePtr, true), CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
@@ -384,9 +386,9 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
emitPrologueEpilogueSPUpdate(MBB, MBBI, TII, dl, *RegInfo, -NumBytes,
ScratchRegister, MachineInstr::FrameSetup);
if (!HasFP) {
- CFAOffset -= NumBytes;
+ CFAOffset += NumBytes;
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, CFAOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CFAOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex)
.setMIFlags(MachineInstr::FrameSetup);
@@ -402,7 +404,7 @@ void Thumb1FrameLowering::emitPrologue(MachineFunction &MF,
AFI->setDPRCalleeSavedAreaSize(DPRCSSize);
if (RegInfo->needsStackRealignment(MF)) {
- const unsigned NrBitsToZero = countTrailingZeros(MFI.getMaxAlignment());
+ const unsigned NrBitsToZero = Log2(MFI.getMaxAlign());
// Emit the following sequence, using R4 as a temporary, since we cannot use
// SP as a source or destination register for the shifts:
// mov r4, sp
@@ -804,11 +806,9 @@ static const unsigned *findNextOrderedReg(const unsigned *CurrentReg,
return CurrentReg;
}
-bool Thumb1FrameLowering::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool Thumb1FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -927,11 +927,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool Thumb1FrameLowering::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool Thumb1FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -1049,6 +1047,10 @@ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
if (!STI.hasV5TOps())
continue;
+ // CMSE entry functions must return via BXNS, see emitEpilogue.
+ if (AFI->isCmseNSEntryFunction())
+ continue;
+
// Pop LR into PC.
Reg = ARM::PC;
(*MIB).setDesc(TII.get(ARM::tPOP_RET));
diff --git a/llvm/lib/Target/ARM/Thumb1FrameLowering.h b/llvm/lib/Target/ARM/Thumb1FrameLowering.h
index 61af48712b6c..a4b2a085ea38 100644
--- a/llvm/lib/Target/ARM/Thumb1FrameLowering.h
+++ b/llvm/lib/Target/ARM/Thumb1FrameLowering.h
@@ -27,12 +27,13 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
index b08b71a4952d..79afa378cb62 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.cpp
@@ -76,7 +76,7 @@ void Thumb1InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void Thumb1InstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
assert((RC == &ARM::tGPRRegClass ||
@@ -92,7 +92,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
BuildMI(MBB, I, DL, get(ARM::tSTRspi))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FI)
@@ -104,7 +104,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void Thumb1InstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
assert(
@@ -121,7 +121,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
BuildMI(MBB, I, DL, get(ARM::tLDRspi), DestReg)
.addFrameIndex(FI)
.addImm(0)
diff --git a/llvm/lib/Target/ARM/Thumb1InstrInfo.h b/llvm/lib/Target/ARM/Thumb1InstrInfo.h
index 530289fe8c5d..017b7222337c 100644
--- a/llvm/lib/Target/ARM/Thumb1InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb1InstrInfo.h
@@ -42,13 +42,13 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
diff --git a/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp b/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
index 786fc78d0233..5cdaa7f02201 100644
--- a/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
+++ b/llvm/lib/Target/ARM/Thumb2ITBlockPass.cpp
@@ -183,7 +183,7 @@ Thumb2ITBlock::MoveCopyOutOfITBlock(MachineInstr *MI,
++I;
if (I != E) {
- unsigned NPredReg = 0;
+ Register NPredReg;
ARMCC::CondCodes NCC = getITInstrPredicate(*I, NPredReg);
if (NCC == CC || NCC == OCC)
return true;
@@ -199,7 +199,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) {
while (MBBI != E) {
MachineInstr *MI = &*MBBI;
DebugLoc dl = MI->getDebugLoc();
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getITInstrPredicate(*MI, PredReg);
if (CC == ARMCC::AL) {
++MBBI;
@@ -239,7 +239,7 @@ bool Thumb2ITBlock::InsertITInstructions(MachineBasicBlock &MBB) {
MachineInstr *NMI = &*MBBI;
MI = NMI;
- unsigned NPredReg = 0;
+ Register NPredReg;
ARMCC::CondCodes NCC = getITInstrPredicate(*NMI, NPredReg);
if (NCC == CC || NCC == OCC) {
Mask |= ((NCC ^ CC) & 1) << Pos;
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
index e06bb9546c03..48c6b47f2154 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.cpp
@@ -66,7 +66,7 @@ Thumb2InstrInfo::ReplaceTailWithBranchTo(MachineBasicBlock::iterator Tail,
// If the first instruction of Tail is predicated, we may have to update
// the IT instruction.
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes CC = getInstrPredicate(*Tail, PredReg);
MachineBasicBlock::iterator MBBI = Tail;
if (CC != ARMCC::AL)
@@ -114,7 +114,7 @@ Thumb2InstrInfo::isLegalToSplitMBBAt(MachineBasicBlock &MBB,
return false;
}
- unsigned PredReg = 0;
+ Register PredReg;
return getITInstrPredicate(*MBBI, PredReg) == ARMCC::AL;
}
@@ -133,7 +133,7 @@ void Thumb2InstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void Thumb2InstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -143,7 +143,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
if (ARM::GPRRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(ARM::t2STRi12))
@@ -176,14 +176,14 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void Thumb2InstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -229,9 +229,9 @@ void Thumb2InstrInfo::expandLoadStackGuard(
void llvm::emitT2RegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
- ARMCC::CondCodes Pred, unsigned PredReg,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
+ ARMCC::CondCodes Pred, Register PredReg,
const ARMBaseInstrInfo &TII,
unsigned MIFlags) {
if (NumBytes == 0 && DestReg != BaseReg) {
@@ -471,7 +471,7 @@ immediateOffsetOpcode(unsigned opcode)
}
bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII,
const TargetRegisterInfo *TRI) {
unsigned Opcode = MI.getOpcode();
@@ -491,7 +491,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
if (IsSP || Opcode == ARM::t2ADDri || Opcode == ARM::t2ADDri12) {
Offset += MI.getOperand(FrameRegIdx+1).getImm();
- unsigned PredReg;
+ Register PredReg;
if (Offset == 0 && getInstrPredicate(MI, PredReg) == ARMCC::AL &&
!MI.definesRegister(ARM::CPSR)) {
// Turn it into a move.
@@ -634,7 +634,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
assert((Offset & OffsetMask) == 0 && "Can't encode this offset!");
(void)OffsetMask; // squash unused-variable warning at -NDEBUG
} else if (AddrMode == ARMII::AddrModeT2_i8s4) {
- Offset += MI.getOperand(FrameRegIdx + 1).getImm() * 4;
+ Offset += MI.getOperand(FrameRegIdx + 1).getImm();
NumBits = 8 + 2;
// MCInst operand expects already scaled value.
Scale = 1;
@@ -706,7 +706,7 @@ bool llvm::rewriteT2FrameIndex(MachineInstr &MI, unsigned FrameRegIdx,
}
ARMCC::CondCodes llvm::getITInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg) {
+ Register &PredReg) {
unsigned Opc = MI.getOpcode();
if (Opc == ARM::tBcc || Opc == ARM::t2Bcc)
return ARMCC::AL;
@@ -727,7 +727,7 @@ int llvm::findFirstVPTPredOperandIdx(const MachineInstr &MI) {
}
ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg) {
+ Register &PredReg) {
int PIdx = findFirstVPTPredOperandIdx(MI);
if (PIdx == -1) {
PredReg = 0;
@@ -737,3 +737,33 @@ ARMVCC::VPTCodes llvm::getVPTInstrPredicate(const MachineInstr &MI,
PredReg = MI.getOperand(PIdx+1).getReg();
return (ARMVCC::VPTCodes)MI.getOperand(PIdx).getImm();
}
+
+void llvm::recomputeVPTBlockMask(MachineInstr &Instr) {
+ assert(isVPTOpcode(Instr.getOpcode()) && "Not a VPST or VPT Instruction!");
+
+ MachineOperand &MaskOp = Instr.getOperand(0);
+ assert(MaskOp.isImm() && "Operand 0 is not the block mask of the VPT/VPST?!");
+
+ MachineBasicBlock::iterator Iter = ++Instr.getIterator(),
+ End = Instr.getParent()->end();
+
+ // Verify that the instruction after the VPT/VPST is predicated (it should
+ // be), and skip it.
+ assert(
+ getVPTInstrPredicate(*Iter) == ARMVCC::Then &&
+ "VPT/VPST should be followed by an instruction with a 'then' predicate!");
+ ++Iter;
+
+ // Iterate over the predicated instructions, updating the BlockMask as we go.
+ ARM::PredBlockMask BlockMask = ARM::PredBlockMask::T;
+ while (Iter != End) {
+ ARMVCC::VPTCodes Pred = getVPTInstrPredicate(*Iter);
+ if (Pred == ARMVCC::None)
+ break;
+ BlockMask = expandPredBlockMask(BlockMask, Pred);
+ ++Iter;
+ }
+
+ // Rewrite the BlockMask.
+ MaskOp.setImm((int64_t)(BlockMask));
+}
diff --git a/llvm/lib/Target/ARM/Thumb2InstrInfo.h b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
index 7d8dff14e1e7..ec3763632239 100644
--- a/llvm/lib/Target/ARM/Thumb2InstrInfo.h
+++ b/llvm/lib/Target/ARM/Thumb2InstrInfo.h
@@ -44,13 +44,13 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -67,13 +67,24 @@ private:
/// getITInstrPredicate - Valid only in Thumb2 mode. This function is identical
/// to llvm::getInstrPredicate except it returns AL for conditional branch
/// instructions which are "predicated", but are not in IT blocks.
-ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, unsigned &PredReg);
+ARMCC::CondCodes getITInstrPredicate(const MachineInstr &MI, Register &PredReg);
// getVPTInstrPredicate: VPT analogue of that, plus a helper function
// corresponding to MachineInstr::findFirstPredOperandIdx.
int findFirstVPTPredOperandIdx(const MachineInstr &MI);
ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI,
- unsigned &PredReg);
+ Register &PredReg);
+inline ARMVCC::VPTCodes getVPTInstrPredicate(const MachineInstr &MI) {
+ Register PredReg;
+ return getVPTInstrPredicate(MI, PredReg);
}
+// Recomputes the Block Mask of Instr, a VPT or VPST instruction.
+// This rebuilds the block mask of the instruction depending on the predicates
+// of the instructions following it. This should only be used after the
+// MVEVPTBlockInsertion pass has run, and should be used whenever a predicated
+// instruction is added to/removed from the block.
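+// For example (illustrative): if the block's body is predicated Then, Then,
+// Else, the recomputed mask is ARM::PredBlockMask::TTE (0b0110).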
+void recomputeVPTBlockMask(MachineInstr &Instr);
+} // namespace llvm
+
#endif
diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
index c5a62aa33990..ae661594bdc9 100644
--- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
+++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp
@@ -457,7 +457,7 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
return false;
if (!MI->hasOneMemOperand() ||
- (*MI->memoperands_begin())->getAlignment() < 4)
+ (*MI->memoperands_begin())->getAlign() < Align(4))
return false;
// We're creating a completely different type of load/store - LDM from LDR.
@@ -516,13 +516,23 @@ Thumb2SizeReduce::ReduceLoadStore(MachineBasicBlock &MBB, MachineInstr *MI,
isLdStMul = true;
break;
}
- case ARM::t2STMIA:
- // If the base register is killed, we don't care what its value is after the
- // instruction, so we can use an updating STMIA.
+ case ARM::t2STMIA: {
+ // t2STMIA is reduced to tSTMIA_UPD which has writeback. We can only do this
+ // if the base register is killed, as then it doesn't matter what its value
+ // is after the instruction.
if (!MI->getOperand(0).isKill())
return false;
+ // If the base register is in the register list and isn't the lowest
+ // numbered register (i.e. it's in operand 4 onwards) then with writeback
+ // the stored value is unknown, so we can't convert to tSTMIA_UPD.
+ Register BaseReg = MI->getOperand(0).getReg();
+ for (unsigned i = 4; i < MI->getNumOperands(); ++i)
+ if (MI->getOperand(i).getReg() == BaseReg)
+ return false;
+
break;
+ }
case ARM::t2LDMIA_RET: {
Register BaseReg = MI->getOperand(1).getReg();
if (BaseReg != ARM::SP)
@@ -676,7 +686,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
default: break;
case ARM::t2ADDSri:
case ARM::t2ADDSrr: {
- unsigned PredReg = 0;
+ Register PredReg;
if (getInstrPredicate(*MI, PredReg) == ARMCC::AL) {
switch (Opc) {
default: break;
@@ -718,7 +728,7 @@ Thumb2SizeReduce::ReduceSpecial(MachineBasicBlock &MBB, MachineInstr *MI,
return ReduceToNarrow(MBB, MI, Entry, LiveCPSR, IsSelfLoop);
}
case ARM::t2TEQrr: {
- unsigned PredReg = 0;
+ Register PredReg;
// Can only convert to eors if we're not in an IT block.
if (getInstrPredicate(*MI, PredReg) != ARMCC::AL)
break;
@@ -789,7 +799,7 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc2);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
@@ -882,7 +892,7 @@ Thumb2SizeReduce::ReduceToNarrow(MachineBasicBlock &MBB, MachineInstr *MI,
// Check if it's possible / necessary to transfer the predicate.
const MCInstrDesc &NewMCID = TII->get(Entry.NarrowOpc1);
- unsigned PredReg = 0;
+ Register PredReg;
ARMCC::CondCodes Pred = getInstrPredicate(*MI, PredReg);
bool SkipPred = false;
if (Pred != ARMCC::AL) {
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
index b0ba58d8dc4a..4da6f6ab6994 100644
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.cpp
@@ -70,7 +70,7 @@ static void emitThumb1LoadConstPool(MachineBasicBlock &MBB,
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
BuildMI(MBB, MBBI, dl, TII.get(ARM::tLDRpci))
.addReg(DestReg, getDefRegState(true), SubIdx)
@@ -89,7 +89,7 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
MachineConstantPool *ConstantPool = MF.getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Val);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
BuildMI(MBB, MBBI, dl, TII.get(ARM::t2LDRpci))
.addReg(DestReg, getDefRegState(true), SubIdx)
@@ -102,14 +102,13 @@ static void emitThumb2LoadConstPool(MachineBasicBlock &MBB,
/// specified immediate.
void ThumbRegisterInfo::emitLoadConstPool(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx, int Val,
- ARMCC::CondCodes Pred, unsigned PredReg, unsigned MIFlags) const {
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx, int Val,
+ ARMCC::CondCodes Pred, Register PredReg, unsigned MIFlags) const {
MachineFunction &MF = *MBB.getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (STI.isThumb1Only()) {
- assert(
- (isARMLowRegister(DestReg) || Register::isVirtualRegister(DestReg)) &&
- "Thumb1 does not have ldr to high register");
+ assert((isARMLowRegister(DestReg) || DestReg.isVirtual()) &&
+ "Thumb1 does not have ldr to high register");
return emitThumb1LoadConstPool(MBB, MBBI, dl, DestReg, SubIdx, Val, Pred,
PredReg, MIFlags);
}
@@ -123,7 +122,7 @@ void ThumbRegisterInfo::emitLoadConstPool(
/// constpool entry.
static void emitThumbRegPlusImmInReg(
MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg, Register BaseReg, int NumBytes,
bool CanChangeCC, const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI, unsigned MIFlags = MachineInstr::NoFlags) {
MachineFunction &MF = *MBB.getParent();
@@ -139,7 +138,7 @@ static void emitThumbRegPlusImmInReg(
isSub = true;
NumBytes = -NumBytes;
}
- unsigned LdReg = DestReg;
+ Register LdReg = DestReg;
if (DestReg == ARM::SP)
assert(BaseReg == ARM::SP && "Unexpected!");
if (!isARMLowRegister(DestReg) && !Register::isVirtualRegister(DestReg))
@@ -185,8 +184,8 @@ static void emitThumbRegPlusImmInReg(
/// be too long. This is allowed to modify the condition flags.
void llvm::emitThumbRegPlusImmediate(MachineBasicBlock &MBB,
MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg,
- unsigned BaseReg, int NumBytes,
+ const DebugLoc &dl, Register DestReg,
+ Register BaseReg, int NumBytes,
const TargetInstrInfo &TII,
const ARMBaseRegisterInfo &MRI,
unsigned MIFlags) {
@@ -358,7 +357,7 @@ static unsigned convertToNonSPOpcode(unsigned Opcode) {
bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) const {
MachineInstr &MI = *II;
MachineBasicBlock &MBB = *MI.getParent();
@@ -427,8 +426,8 @@ bool ThumbRegisterInfo::rewriteFrameIndex(MachineBasicBlock::iterator II,
return Offset == 0;
}
-void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
- int64_t Offset) const {
+void ThumbRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
+ int64_t Offset) const {
const MachineFunction &MF = *MI.getParent()->getParent();
const ARMSubtarget &STI = MF.getSubtarget<ARMSubtarget>();
if (!STI.isThumb1Only())
@@ -458,12 +457,12 @@ void ThumbRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
return ARMBaseRegisterInfo::eliminateFrameIndex(II, SPAdj, FIOperandNum,
RS);
- unsigned VReg = 0;
+ Register VReg;
const ARMBaseInstrInfo &TII = *STI.getInstrInfo();
DebugLoc dl = MI.getDebugLoc();
MachineInstrBuilder MIB(*MBB.getParent(), &MI);
- unsigned FrameReg;
+ Register FrameReg;
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
const ARMFrameLowering *TFI = getFrameLowering(MF);
int Offset = TFI->ResolveFrameIndexReference(MF, FrameIndex, FrameReg, SPAdj);
diff --git a/llvm/lib/Target/ARM/ThumbRegisterInfo.h b/llvm/lib/Target/ARM/ThumbRegisterInfo.h
index 08cf67284d4c..e05a24dbaca5 100644
--- a/llvm/lib/Target/ARM/ThumbRegisterInfo.h
+++ b/llvm/lib/Target/ARM/ThumbRegisterInfo.h
@@ -38,18 +38,18 @@ public:
/// specified immediate.
void
emitLoadConstPool(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI,
- const DebugLoc &dl, unsigned DestReg, unsigned SubIdx,
+ const DebugLoc &dl, Register DestReg, unsigned SubIdx,
int Val, ARMCC::CondCodes Pred = ARMCC::AL,
- unsigned PredReg = 0,
+ Register PredReg = Register(),
unsigned MIFlags = MachineInstr::NoFlags) const override;
// rewrite MI to access 'Offset' bytes from the FP. Update Offset to be
// however much remains to be handled. Return 'true' if no further
// work is required.
bool rewriteFrameIndex(MachineBasicBlock::iterator II, unsigned FrameRegIdx,
- unsigned FrameReg, int &Offset,
+ Register FrameReg, int &Offset,
const ARMBaseInstrInfo &TII) const;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
index 4ace61cccd0f..3356d56481e5 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.cpp
@@ -15,6 +15,37 @@
using namespace llvm;
namespace llvm {
+ARM::PredBlockMask expandPredBlockMask(ARM::PredBlockMask BlockMask,
+ ARMVCC::VPTCodes Kind) {
+ using PredBlockMask = ARM::PredBlockMask;
+ assert(Kind != ARMVCC::None && "Cannot expand a mask with None!");
+ assert(countTrailingZeros((unsigned)BlockMask) != 0 &&
+ "Mask is already full");
+
+ auto ChooseMask = [&](PredBlockMask AddedThen, PredBlockMask AddedElse) {
+ return Kind == ARMVCC::Then ? AddedThen : AddedElse;
+ };
+
+ switch (BlockMask) {
+ case PredBlockMask::T:
+ return ChooseMask(PredBlockMask::TT, PredBlockMask::TE);
+ case PredBlockMask::TT:
+ return ChooseMask(PredBlockMask::TTT, PredBlockMask::TTE);
+ case PredBlockMask::TE:
+ return ChooseMask(PredBlockMask::TET, PredBlockMask::TEE);
+ case PredBlockMask::TTT:
+ return ChooseMask(PredBlockMask::TTTT, PredBlockMask::TTTE);
+ case PredBlockMask::TTE:
+ return ChooseMask(PredBlockMask::TTET, PredBlockMask::TTEE);
+ case PredBlockMask::TET:
+ return ChooseMask(PredBlockMask::TETT, PredBlockMask::TETE);
+ case PredBlockMask::TEE:
+ return ChooseMask(PredBlockMask::TEET, PredBlockMask::TEEE);
+ default:
+ llvm_unreachable("Unknown Mask");
+ }
+}
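+
+// Example of the expansion above (illustrative): passing PredBlockMask::TT and
+// ARMVCC::Else yields PredBlockMask::TTE (0b0100 -> 0b0110).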
+
namespace ARMSysReg {
// lookup system register using 12-bit SYSm value.
diff --git a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
index 27605422983d..80b7276adb4e 100644
--- a/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
+++ b/llvm/lib/Target/ARM/Utils/ARMBaseInfo.h
@@ -91,41 +91,41 @@ namespace ARMVCC {
Then,
Else
};
-
- enum VPTMaskValue {
- T = 8, // 0b1000
- TT = 4, // 0b0100
- TE = 12, // 0b1100
- TTT = 2, // 0b0010
- TTE = 6, // 0b0110
- TEE = 10, // 0b1010
- TET = 14, // 0b1110
- TTTT = 1, // 0b0001
- TTTE = 3, // 0b0011
- TTEE = 5, // 0b0101
- TTET = 7, // 0b0111
- TEEE = 9, // 0b1001
- TEET = 11, // 0b1011
- TETT = 13, // 0b1101
- TETE = 15 // 0b1111
+} // namespace ARMVCC
+
+namespace ARM {
+ /// Mask values for IT and VPT Blocks, to be used by MCOperands.
+ /// Note that this is different from the "real" encoding used by the
+ /// instructions. In this encoding, the lowest set bit indicates the end of
+ /// the encoding, and above that, "1" indicates an else, while "0" indicates
+ /// a then.
+ /// Tx = x100
+ /// Txy = xy10
+ /// Txyz = xyz1
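+ /// For example, TETE is encoded as 0b1011: the trailing 1 terminates the
+ /// mask, and the bits above it (from the MSB down) read 1 (else), 0 (then),
+ /// 1 (else) for the instructions after the initial Then.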
+ enum class PredBlockMask {
+ T = 0b1000,
+ TT = 0b0100,
+ TE = 0b1100,
+ TTT = 0b0010,
+ TTE = 0b0110,
+ TEE = 0b1110,
+ TET = 0b1010,
+ TTTT = 0b0001,
+ TTTE = 0b0011,
+ TTEE = 0b0111,
+ TTET = 0b0101,
+ TEEE = 0b1111,
+ TEET = 0b1101,
+ TETT = 0b1001,
+ TETE = 0b1011
};
-}
+} // namespace ARM
-inline static unsigned getARMVPTBlockMask(unsigned NumInsts) {
- switch (NumInsts) {
- case 1:
- return ARMVCC::T;
- case 2:
- return ARMVCC::TT;
- case 3:
- return ARMVCC::TTT;
- case 4:
- return ARMVCC::TTTT;
- default:
- break;
- };
- llvm_unreachable("Unexpected number of instruction in a VPT block");
-}
+// Expands a PredBlockMask by adding an E or a T at the end, depending on Kind.
+// e.g. expandPredBlockMask(T, Then) = TT, expandPredBlockMask(TT, Else) = TTE,
+// and so on.
+ARM::PredBlockMask expandPredBlockMask(ARM::PredBlockMask BlockMask,
+ ARMVCC::VPTCodes Kind);
inline static const char *ARMVPTPredToString(ARMVCC::VPTCodes CC) {
switch (CC) {
diff --git a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
index 9b09c7456543..722eecdc16a1 100644
--- a/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
+++ b/llvm/lib/Target/AVR/AVRAsmPrinter.cpp
@@ -51,7 +51,7 @@ public:
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
const char *ExtraCode, raw_ostream &O) override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
private:
const MCRegisterInfo &MRI;
@@ -168,7 +168,7 @@ bool AVRAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-void AVRAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void AVRAsmPrinter::emitInstruction(const MachineInstr *MI) {
AVRMCInstLower MCInstLowering(OutContext, *this);
MCInst I;
diff --git a/llvm/lib/Target/AVR/AVRCallingConv.td b/llvm/lib/Target/AVR/AVRCallingConv.td
index 213e35fca66d..65545e531a88 100644
--- a/llvm/lib/Target/AVR/AVRCallingConv.td
+++ b/llvm/lib/Target/AVR/AVRCallingConv.td
@@ -6,21 +6,13 @@
//
//===----------------------------------------------------------------------===//
// This describes the calling conventions for AVR architecture.
+// Normal functions use a special calling convention implemented in C++ code.
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
// AVR Return Value Calling Convention
//===----------------------------------------------------------------------===//
-def RetCC_AVR : CallingConv
-<[
- // i8 is returned in R24.
- CCIfType<[i8], CCAssignToReg<[R24]>>,
-
- // i16 are returned in R25:R24, R23:R22, R21:R20 and R19:R18.
- CCIfType<[i16], CCAssignToReg<[R25R24, R23R22, R21R20, R19R18]>>
-]>;
-
// Special return value calling convention for runtime functions.
def RetCC_AVR_BUILTIN : CallingConv
<[
@@ -41,14 +33,6 @@ def ArgCC_AVR_Vararg : CallingConv
CCAssignToStack<2, 1>
]>;
-// Special argument calling convention for
-// division runtime functions.
-def ArgCC_AVR_BUILTIN_DIV : CallingConv
-<[
- CCIfType<[i8], CCAssignToReg<[R24,R22]>>,
- CCIfType<[i16], CCAssignToReg<[R25R24, R23R22]>>
-]>;
-
//===----------------------------------------------------------------------===//
// Callee-saved register lists.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/AVR/AVRDevices.td b/llvm/lib/Target/AVR/AVRDevices.td
index 62def4574437..6730f2e1673e 100644
--- a/llvm/lib/Target/AVR/AVRDevices.td
+++ b/llvm/lib/Target/AVR/AVRDevices.td
@@ -121,6 +121,11 @@ def FeatureTinyEncoding : SubtargetFeature<"tinyencoding",
"The device has Tiny core specific "
"instruction encodings">;
+// The device has CPU registers mapped in data address space
+def FeatureMMR : SubtargetFeature<"memmappedregs", "m_hasMemMappedGPR",
+ "true", "The device has CPU registers "
+ "mapped in data address space">;
+
class ELFArch<string name> : SubtargetFeature<"", "ELFArch",
!strconcat("ELF::",name), "">;
@@ -152,7 +157,7 @@ def ELFArchXMEGA7 : ELFArch<"EF_AVR_ARCH_XMEGA7">;
// device should have.
def FamilyAVR0 : Family<"avr0", []>;
-def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM]>;
+def FamilyAVR1 : Family<"avr1", [FamilyAVR0, FeatureLPM, FeatureMMR]>;
def FamilyAVR2 : Family<"avr2",
[FamilyAVR1, FeatureIJMPCALL, FeatureADDSUBIW,
@@ -190,11 +195,14 @@ def FamilyAVR6 : Family<"avr6",
def FamilyTiny : Family<"avrtiny",
[FamilyAVR0, FeatureBREAK, FeatureSRAM,
- FeatureTinyEncoding]>;
+ FeatureTinyEncoding, FeatureMMR]>;
def FamilyXMEGA : Family<"xmega",
- [FamilyAVR51, FeatureEIJMPCALL, FeatureSPMX,
- FeatureDES]>;
+ [FamilyAVR0, FeatureLPM, FeatureIJMPCALL, FeatureADDSUBIW,
+ FeatureSRAM, FeatureJMPCALL, FeatureMultiplication,
+ FeatureMOVW, FeatureLPMX, FeatureSPM,
+ FeatureBREAK, FeatureEIJMPCALL, FeatureSPMX,
+ FeatureDES, FeatureELPM, FeatureELPMX]>;
def FamilyXMEGAU : Family<"xmegau",
[FamilyXMEGA, FeatureRMW]>;
@@ -208,7 +216,7 @@ def FeatureSetSpecial : FeatureSet<"special",
FeatureLPM, FeatureLPMX, FeatureELPM,
FeatureELPMX, FeatureSPM, FeatureSPMX,
FeatureDES, FeatureRMW,
- FeatureMultiplication, FeatureBREAK]>;
+ FeatureMultiplication, FeatureBREAK, FeatureMMR]>;
//===---------------------------------------------------------------------===//
// AVR microcontrollers supported.
diff --git a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
index f466c5c053ad..8ee69201e932 100644
--- a/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/AVR/AVRExpandPseudoInsts.cpp
@@ -51,9 +51,9 @@ private:
const TargetInstrInfo *TII;
/// The register to be used for temporary storage.
- const unsigned SCRATCH_REGISTER = AVR::R0;
+ const Register SCRATCH_REGISTER = AVR::R0;
/// The register that will always contain zero.
- const unsigned ZERO_REGISTER = AVR::R1;
+ const Register ZERO_REGISTER = AVR::R1;
/// The IO address of the status register.
const unsigned SREG_ADDR = 0x3f;
@@ -66,7 +66,7 @@ private:
}
MachineInstrBuilder buildMI(Block &MBB, BlockIt MBBI, unsigned Opcode,
- unsigned DstReg) {
+ Register DstReg) {
return BuildMI(MBB, MBBI, MBBI->getDebugLoc(), TII->get(Opcode), DstReg);
}
@@ -91,7 +91,7 @@ private:
BlockIt MBBI);
/// Scavenges a free GPR8 register for use.
- unsigned scavengeGPR8(MachineInstr &MI);
+ Register scavengeGPR8(MachineInstr &MI);
};
char AVRExpandPseudo::ID = 0;
@@ -141,7 +141,7 @@ bool AVRExpandPseudo::runOnMachineFunction(MachineFunction &MF) {
bool AVRExpandPseudo::
expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+ Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(2).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
@@ -174,7 +174,7 @@ expandArith(unsigned OpLo, unsigned OpHi, Block &MBB, BlockIt MBBI) {
bool AVRExpandPseudo::
expandLogic(unsigned Op, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+ Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
Register SrcReg = MI.getOperand(2).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
@@ -221,7 +221,7 @@ bool AVRExpandPseudo::
bool AVRExpandPseudo::
expandLogicImm(unsigned Op, Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned DstLoReg, DstHiReg;
+ Register DstLoReg, DstHiReg;
Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
@@ -273,8 +273,8 @@ bool AVRExpandPseudo::expand<AVR::SUBWRdRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::SUBIWRdK>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(3).isDead();
@@ -325,16 +325,16 @@ bool AVRExpandPseudo::expand<AVR::SBCWRdRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::SBCIWRdK>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(3).isDead();
unsigned Imm = MI.getOperand(2).getImm();
unsigned Lo8 = Imm & 0xff;
unsigned Hi8 = (Imm >> 8) & 0xff;
- OpLo = AVR::SBCIRdK;
- OpHi = AVR::SBCIRdK;
+ unsigned OpLo = AVR::SBCIRdK;
+ unsigned OpHi = AVR::SBCIRdK;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
auto MIBLO = buildMI(MBB, MBBI, OpLo)
@@ -388,13 +388,13 @@ bool AVRExpandPseudo::expand<AVR::EORWRdRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::COMWRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
- OpLo = AVR::COMRd;
- OpHi = AVR::COMRd;
+ unsigned OpLo = AVR::COMRd;
+ unsigned OpHi = AVR::COMRd;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
auto MIBLO = buildMI(MBB, MBBI, OpLo)
@@ -418,14 +418,14 @@ bool AVRExpandPseudo::expand<AVR::COMWRd>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::CPWRdRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool DstIsKill = MI.getOperand(0).isKill();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
- OpLo = AVR::CPRdRr;
- OpHi = AVR::CPCRdRr;
+ unsigned OpLo = AVR::CPRdRr;
+ unsigned OpHi = AVR::CPCRdRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
@@ -451,14 +451,14 @@ bool AVRExpandPseudo::expand<AVR::CPWRdRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::CPCWRdRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcLoReg, SrcHiReg, DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool DstIsKill = MI.getOperand(0).isKill();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
- OpLo = AVR::CPCRdRr;
- OpHi = AVR::CPCRdRr;
+ unsigned OpLo = AVR::CPCRdRr;
+ unsigned OpHi = AVR::CPCRdRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
@@ -486,11 +486,11 @@ bool AVRExpandPseudo::expand<AVR::CPCWRdRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
- OpLo = AVR::LDIRdK;
- OpHi = AVR::LDIRdK;
+ unsigned OpLo = AVR::LDIRdK;
+ unsigned OpHi = AVR::LDIRdK;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
auto MIBLO = buildMI(MBB, MBBI, OpLo)
@@ -535,11 +535,11 @@ bool AVRExpandPseudo::expand<AVR::LDIWRdK>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LDSWRdK>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
- OpLo = AVR::LDSRdK;
- OpHi = AVR::LDSRdK;
+ unsigned OpLo = AVR::LDSRdK;
+ unsigned OpHi = AVR::LDSRdK;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
auto MIBLO = buildMI(MBB, MBBI, OpLo)
@@ -579,26 +579,26 @@ bool AVRExpandPseudo::expand<AVR::LDSWRdK>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned TmpReg = 0; // 0 for no temporary register
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpReg = 0; // 0 for no temporary register
+ Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::LDRdPtr;
- OpHi = AVR::LDDRdPtrQ;
+ unsigned OpLo = AVR::LDRdPtr;
+ unsigned OpHi = AVR::LDDRdPtrQ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Use a temporary register if src and dst registers are the same.
if (DstReg == SrcReg)
TmpReg = scavengeGPR8(MI);
- unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
- unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
+ Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
+ Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
// Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
- .addReg(CurDstLoReg, RegState::Define)
- .addReg(SrcReg, RegState::Define);
+ .addReg(CurDstLoReg, RegState::Define)
+ .addReg(SrcReg);
// Push low byte onto stack if necessary.
if (TmpReg)
@@ -628,13 +628,13 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LDWRdPtrPi>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsDead = MI.getOperand(1).isKill();
- OpLo = AVR::LDRdPtrPi;
- OpHi = AVR::LDRdPtrPi;
+ unsigned OpLo = AVR::LDRdPtrPi;
+ unsigned OpHi = AVR::LDRdPtrPi;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
@@ -659,13 +659,13 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtrPi>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LDWRdPtrPd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsDead = MI.getOperand(1).isKill();
- OpLo = AVR::LDRdPtrPd;
- OpHi = AVR::LDRdPtrPd;
+ unsigned OpLo = AVR::LDRdPtrPd;
+ unsigned OpHi = AVR::LDRdPtrPd;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
@@ -690,14 +690,14 @@ bool AVRExpandPseudo::expand<AVR::LDWRdPtrPd>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned TmpReg = 0; // 0 for no temporary register
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpReg = 0; // 0 for no temporary register
+ Register SrcReg = MI.getOperand(1).getReg();
unsigned Imm = MI.getOperand(2).getImm();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::LDDRdPtrQ;
- OpHi = AVR::LDDRdPtrQ;
+ unsigned OpLo = AVR::LDDRdPtrQ;
+ unsigned OpHi = AVR::LDDRdPtrQ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
@@ -708,8 +708,8 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
if (DstReg == SrcReg)
TmpReg = scavengeGPR8(MI);
- unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
- unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
+ Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
+ Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
// Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
@@ -745,21 +745,21 @@ bool AVRExpandPseudo::expand<AVR::LDDWRdPtrQ>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LPMWRdZ>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned TmpReg = 0; // 0 for no temporary register
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register TmpReg = 0; // 0 for no temporary register
+ Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::LPMRdZPi;
- OpHi = AVR::LPMRdZ;
+ unsigned OpLo = AVR::LPMRdZPi;
+ unsigned OpHi = AVR::LPMRdZ;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Use a temporary register if src and dst registers are the same.
if (DstReg == SrcReg)
TmpReg = scavengeGPR8(MI);
- unsigned CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
- unsigned CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
+ Register CurDstLoReg = (DstReg == SrcReg) ? TmpReg : DstLoReg;
+ Register CurDstHiReg = (DstReg == SrcReg) ? TmpReg : DstHiReg;
// Load low byte.
auto MIBLO = buildMI(MBB, MBBI, OpLo)
@@ -862,7 +862,7 @@ bool AVRExpandPseudo::expandAtomicArithmeticOp(unsigned Width,
});
}
-unsigned AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) {
+Register AVRExpandPseudo::scavengeGPR8(MachineInstr &MI) {
MachineBasicBlock &MBB = *MI.getParent();
RegScavenger RS;
@@ -968,11 +968,11 @@ bool AVRExpandPseudo::expand<AVR::AtomicFence>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::STSWKRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::STSKRr;
- OpHi = AVR::STSKRr;
+ unsigned OpLo = AVR::STSKRr;
+ unsigned OpHi = AVR::STSKRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
// Write the high byte first in case this address belongs to a special
@@ -1014,12 +1014,12 @@ bool AVRExpandPseudo::expand<AVR::STSWKRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::STPtrRr;
- OpHi = AVR::STDPtrQRr;
+ unsigned OpLo = AVR::STPtrRr;
+ unsigned OpHi = AVR::STDPtrQRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
//:TODO: need to reverse this order like inw and stsw?
@@ -1042,14 +1042,14 @@ bool AVRExpandPseudo::expand<AVR::STWPtrRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::STWPtrPiRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
unsigned Imm = MI.getOperand(3).getImm();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(2).isKill();
- OpLo = AVR::STPtrPiRr;
- OpHi = AVR::STPtrPiRr;
+ unsigned OpLo = AVR::STPtrPiRr;
+ unsigned OpHi = AVR::STPtrPiRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
@@ -1076,14 +1076,14 @@ bool AVRExpandPseudo::expand<AVR::STWPtrPiRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::STWPtrPdRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
unsigned Imm = MI.getOperand(3).getImm();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(2).isKill();
- OpLo = AVR::STPtrPdRr;
- OpHi = AVR::STPtrPdRr;
+ unsigned OpLo = AVR::STPtrPdRr;
+ unsigned OpHi = AVR::STPtrPdRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
assert(DstReg != SrcReg && "SrcReg and DstReg cannot be the same");
@@ -1110,14 +1110,14 @@ bool AVRExpandPseudo::expand<AVR::STWPtrPdRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(2).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(2).getReg();
unsigned Imm = MI.getOperand(1).getImm();
bool DstIsKill = MI.getOperand(0).isKill();
bool SrcIsKill = MI.getOperand(2).isKill();
- OpLo = AVR::STDPtrQRr;
- OpHi = AVR::STDPtrQRr;
+ unsigned OpLo = AVR::STDPtrQRr;
+ unsigned OpHi = AVR::STDPtrQRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
// Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
@@ -1144,12 +1144,12 @@ bool AVRExpandPseudo::expand<AVR::STDWPtrQRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::INWRdA>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
+ Register DstLoReg, DstHiReg;
unsigned Imm = MI.getOperand(1).getImm();
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
- OpLo = AVR::INRdA;
- OpHi = AVR::INRdA;
+ unsigned OpLo = AVR::INRdA;
+ unsigned OpHi = AVR::INRdA;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
@@ -1174,12 +1174,12 @@ bool AVRExpandPseudo::expand<AVR::INWRdA>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::OUTWARr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
+ Register SrcLoReg, SrcHiReg;
unsigned Imm = MI.getOperand(0).getImm();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
- OpLo = AVR::OUTARr;
- OpHi = AVR::OUTARr;
+ unsigned OpLo = AVR::OUTARr;
+ unsigned OpHi = AVR::OUTARr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
// Since we add 1 to the Imm value for the high byte below, and 63 is the highest Imm value
@@ -1205,12 +1205,12 @@ bool AVRExpandPseudo::expand<AVR::OUTWARr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::PUSHWRr>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, SrcLoReg, SrcHiReg;
- unsigned SrcReg = MI.getOperand(0).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register SrcReg = MI.getOperand(0).getReg();
bool SrcIsKill = MI.getOperand(0).isKill();
unsigned Flags = MI.getFlags();
- OpLo = AVR::PUSHRr;
- OpHi = AVR::PUSHRr;
+ unsigned OpLo = AVR::PUSHRr;
+ unsigned OpHi = AVR::PUSHRr;
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
// Low part
@@ -1230,11 +1230,11 @@ bool AVRExpandPseudo::expand<AVR::PUSHWRr>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::POPWRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
unsigned Flags = MI.getFlags();
- OpLo = AVR::POPRd;
- OpHi = AVR::POPRd;
+ unsigned OpLo = AVR::POPRd;
+ unsigned OpHi = AVR::POPRd;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
buildMI(MBB, MBBI, OpHi, DstHiReg).setMIFlags(Flags); // High
@@ -1254,7 +1254,7 @@ bool AVRExpandPseudo::expand<AVR::ROLBRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned OpShift, OpCarry;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
OpShift = AVR::ADDRdRr;
OpCarry = AVR::ADCRdRr;
@@ -1291,7 +1291,7 @@ bool AVRExpandPseudo::expand<AVR::RORBRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
unsigned OpShiftOut, OpLoad, OpShiftIn, OpAdd;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
OpShiftOut = AVR::LSRRd;
OpLoad = AVR::LDIRdK;
@@ -1334,13 +1334,13 @@ bool AVRExpandPseudo::expand<AVR::RORBRd>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
- OpLo = AVR::ADDRdRr; // ADD Rd, Rd <==> LSL Rd
- OpHi = AVR::ADCRdRr; // ADC Rd, Rd <==> ROL Rd
+ unsigned OpLo = AVR::ADDRdRr; // ADD Rd, Rd <==> LSL Rd
+ unsigned OpHi = AVR::ADCRdRr; // ADC Rd, Rd <==> ROL Rd
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Low part
@@ -1367,13 +1367,13 @@ bool AVRExpandPseudo::expand<AVR::LSLWRd>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::LSRWRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
- OpLo = AVR::RORRd;
- OpHi = AVR::LSRRd;
+ unsigned OpLo = AVR::RORRd;
+ unsigned OpHi = AVR::LSRRd;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// High part
@@ -1410,13 +1410,13 @@ bool AVRExpandPseudo::expand<AVR::ROLWRd>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool DstIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
- OpLo = AVR::RORRd;
- OpHi = AVR::ASRRd;
+ unsigned OpLo = AVR::RORRd;
+ unsigned OpHi = AVR::ASRRd;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// High part
@@ -1440,7 +1440,7 @@ bool AVRExpandPseudo::expand<AVR::ASRWRd>(Block &MBB, BlockIt MBBI) {
template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned DstLoReg, DstHiReg;
+ Register DstLoReg, DstHiReg;
// sext R17:R16, R17
// mov r16, r17
// lsl r17
@@ -1454,8 +1454,8 @@ template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
// mov r17, r16
// lsl r17
// sbc r17, r17
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
@@ -1499,7 +1499,7 @@ template <> bool AVRExpandPseudo::expand<AVR::SEXT>(Block &MBB, BlockIt MBBI) {
template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned DstLoReg, DstHiReg;
+ Register DstLoReg, DstHiReg;
// zext R25:R24, R20
// mov R24, R20
// eor R25, R25
@@ -1508,8 +1508,8 @@ template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
// zext R25:R24, R25
// mov R24, R25
// eor R25, R25
- unsigned DstReg = MI.getOperand(0).getReg();
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SrcReg = MI.getOperand(1).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
bool SrcIsKill = MI.getOperand(1).isKill();
bool ImpIsDead = MI.getOperand(2).isDead();
@@ -1536,12 +1536,12 @@ template <> bool AVRExpandPseudo::expand<AVR::ZEXT>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::SPREAD>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned OpLo, OpHi, DstLoReg, DstHiReg;
- unsigned DstReg = MI.getOperand(0).getReg();
+ Register DstLoReg, DstHiReg;
+ Register DstReg = MI.getOperand(0).getReg();
bool DstIsDead = MI.getOperand(0).isDead();
unsigned Flags = MI.getFlags();
- OpLo = AVR::INRdA;
- OpHi = AVR::INRdA;
+ unsigned OpLo = AVR::INRdA;
+ unsigned OpHi = AVR::INRdA;
TRI->splitReg(DstReg, DstLoReg, DstHiReg);
// Low part
@@ -1563,8 +1563,8 @@ bool AVRExpandPseudo::expand<AVR::SPREAD>(Block &MBB, BlockIt MBBI) {
template <>
bool AVRExpandPseudo::expand<AVR::SPWRITE>(Block &MBB, BlockIt MBBI) {
MachineInstr &MI = *MBBI;
- unsigned SrcLoReg, SrcHiReg;
- unsigned SrcReg = MI.getOperand(1).getReg();
+ Register SrcLoReg, SrcHiReg;
+ Register SrcReg = MI.getOperand(1).getReg();
bool SrcIsKill = MI.getOperand(1).isKill();
unsigned Flags = MI.getFlags();
TRI->splitReg(SrcReg, SrcLoReg, SrcHiReg);
diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
index e6c48de5a782..c95a553b86ac 100644
--- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp
@@ -30,8 +30,7 @@
namespace llvm {
AVRFrameLowering::AVRFrameLowering()
- : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align::None(),
- -2) {}
+ : TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(1), -2) {}
bool AVRFrameLowering::canSimplifyCallFramePseudos(
const MachineFunction &MF) const {
@@ -53,30 +52,22 @@ bool AVRFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
void AVRFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.begin();
- CallingConv::ID CallConv = MF.getFunction().getCallingConv();
DebugLoc DL = (MBBI != MBB.end()) ? MBBI->getDebugLoc() : DebugLoc();
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
const AVRInstrInfo &TII = *STI.getInstrInfo();
+ const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
bool HasFP = hasFP(MF);
// Interrupt handlers re-enable interrupts in function entry.
- if (CallConv == CallingConv::AVR_INTR) {
+ if (AFI->isInterruptHandler()) {
BuildMI(MBB, MBBI, DL, TII.get(AVR::BSETs))
.addImm(0x07)
.setMIFlag(MachineInstr::FrameSetup);
}
- // Save the frame pointer if we have one.
- if (HasFP) {
- BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr))
- .addReg(AVR::R29R28, RegState::Kill)
- .setMIFlag(MachineInstr::FrameSetup);
- }
-
// Emit special prologue code to save R1, R0 and SREG in interrupt/signal
// handlers before saving any other registers.
- if (CallConv == CallingConv::AVR_INTR ||
- CallConv == CallingConv::AVR_SIGNAL) {
+ if (AFI->isInterruptOrSignalHandler()) {
BuildMI(MBB, MBBI, DL, TII.get(AVR::PUSHWRr))
.addReg(AVR::R1R0, RegState::Kill)
.setMIFlag(MachineInstr::FrameSetup);
@@ -100,7 +91,6 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
}
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
unsigned FrameSize = MFI.getStackSize() - AFI->getCalleeSavedFrameSize();
// Skip the callee-saved push instructions.
@@ -143,13 +133,11 @@ void AVRFrameLowering::emitPrologue(MachineFunction &MF,
void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
- CallingConv::ID CallConv = MF.getFunction().getCallingConv();
- bool isHandler = (CallConv == CallingConv::AVR_INTR ||
- CallConv == CallingConv::AVR_SIGNAL);
+ const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
// Early exit if the frame pointer is not needed in this function except for
// signal/interrupt handlers where special code generation is required.
- if (!hasFP(MF) && !isHandler) {
+ if (!hasFP(MF) && !AFI->isInterruptOrSignalHandler()) {
return;
}
@@ -159,14 +147,13 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
DebugLoc DL = MBBI->getDebugLoc();
const MachineFrameInfo &MFI = MF.getFrameInfo();
- const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
unsigned FrameSize = MFI.getStackSize() - AFI->getCalleeSavedFrameSize();
const AVRSubtarget &STI = MF.getSubtarget<AVRSubtarget>();
const AVRInstrInfo &TII = *STI.getInstrInfo();
// Emit special epilogue code to restore R1, R0 and SREG in interrupt/signal
// handlers at the very end of the function, just before reti.
- if (isHandler) {
+ if (AFI->isInterruptOrSignalHandler()) {
BuildMI(MBB, MBBI, DL, TII.get(AVR::POPRd), AVR::R0);
BuildMI(MBB, MBBI, DL, TII.get(AVR::OUTARr))
.addImm(0x3f)
@@ -174,9 +161,6 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF,
BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R1R0);
}
- if (hasFP(MF))
- BuildMI(MBB, MBBI, DL, TII.get(AVR::POPWRd), AVR::R29R28);
-
// Early exit if there is no need to restore the frame pointer.
if (!FrameSize) {
return;
@@ -234,8 +218,7 @@ bool AVRFrameLowering::hasFP(const MachineFunction &MF) const {
bool AVRFrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty()) {
return false;
}
@@ -275,8 +258,7 @@ bool AVRFrameLowering::spillCalleeSavedRegisters(
bool AVRFrameLowering::restoreCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty()) {
return false;
}
@@ -299,15 +281,10 @@ bool AVRFrameLowering::restoreCalleeSavedRegisters(
}
/// Replace pseudo store instructions that pass arguments through the stack with
-/// real instructions. If insertPushes is true then all instructions are
-/// replaced with push instructions, otherwise regular std instructions are
-/// inserted.
+/// real instructions.
static void fixStackStores(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const TargetInstrInfo &TII, bool insertPushes) {
- const AVRSubtarget &STI = MBB.getParent()->getSubtarget<AVRSubtarget>();
- const TargetRegisterInfo &TRI = *STI.getRegisterInfo();
-
+ const TargetInstrInfo &TII, Register FP) {
// Iterate through the BB until we hit a call instruction or we reach the end.
for (auto I = MI, E = MBB.end(); I != E && !I->isCall();) {
MachineBasicBlock::iterator NextMI = std::next(I);
@@ -322,29 +299,6 @@ static void fixStackStores(MachineBasicBlock &MBB,
assert(MI.getOperand(0).getReg() == AVR::SP &&
"Invalid register, should be SP!");
- if (insertPushes) {
- // Replace this instruction with a push.
- Register SrcReg = MI.getOperand(2).getReg();
- bool SrcIsKill = MI.getOperand(2).isKill();
-
- // We can't use PUSHWRr here because when expanded the order of the new
- // instructions are reversed from what we need. Perform the expansion now.
- if (Opcode == AVR::STDWSPQRr) {
- BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
- .addReg(TRI.getSubReg(SrcReg, AVR::sub_hi),
- getKillRegState(SrcIsKill));
- BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
- .addReg(TRI.getSubReg(SrcReg, AVR::sub_lo),
- getKillRegState(SrcIsKill));
- } else {
- BuildMI(MBB, I, MI.getDebugLoc(), TII.get(AVR::PUSHRr))
- .addReg(SrcReg, getKillRegState(SrcIsKill));
- }
-
- MI.eraseFromParent();
- I = NextMI;
- continue;
- }
// Replace this instruction with a regular store. Use Y as the base
// pointer since it is guaranteed to contain a copy of SP.
@@ -352,7 +306,7 @@ static void fixStackStores(MachineBasicBlock &MBB,
(Opcode == AVR::STDWSPQRr) ? AVR::STDWPtrQRr : AVR::STDPtrQRr;
MI.setDesc(TII.get(STOpc));
- MI.getOperand(0).setReg(AVR::R29R28);
+ MI.getOperand(0).setReg(FP);
I = NextMI;
}
@@ -368,7 +322,7 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
// function entry. Delete the call frame pseudo and replace all pseudo stores
// with real store instructions.
if (hasReservedCallFrame(MF)) {
- fixStackStores(MBB, MI, TII, false);
+ fixStackStores(MBB, MI, TII, AVR::R29R28);
return MBB.erase(MI);
}
@@ -376,18 +330,37 @@ MachineBasicBlock::iterator AVRFrameLowering::eliminateCallFramePseudoInstr(
unsigned int Opcode = MI->getOpcode();
int Amount = TII.getFrameSize(*MI);
- // Adjcallstackup does not need to allocate stack space for the call, instead
- // we insert push instructions that will allocate the necessary stack.
- // For adjcallstackdown we convert it into an 'adiw reg, <amt>' handling
- // the read and write of SP in I/O space.
+ // ADJCALLSTACKUP and ADJCALLSTACKDOWN are converted to adiw/subi
+ // instructions to read and write the stack pointer in I/O space.
if (Amount != 0) {
- assert(getStackAlignment() == 1 && "Unsupported stack alignment");
+ assert(getStackAlign() == Align(1) && "Unsupported stack alignment");
if (Opcode == TII.getCallFrameSetupOpcode()) {
- fixStackStores(MBB, MI, TII, true);
+ // Update the stack pointer.
+ // In many cases this can be done far more efficiently by pushing the
+ // relevant values directly to the stack. However, doing that correctly
+ // (in the right order, possibly skipping some empty space for undef
+ // values, etc) is tricky and thus left to be optimized in the future.
+ BuildMI(MBB, MI, DL, TII.get(AVR::SPREAD), AVR::R31R30).addReg(AVR::SP);
+
+ MachineInstr *New = BuildMI(MBB, MI, DL, TII.get(AVR::SUBIWRdK), AVR::R31R30)
+ .addReg(AVR::R31R30, RegState::Kill)
+ .addImm(Amount);
+ New->getOperand(3).setIsDead();
+
+ BuildMI(MBB, MI, DL, TII.get(AVR::SPWRITE), AVR::SP)
+ .addReg(AVR::R31R30, RegState::Kill);
+
+ // Make sure the remaining stack stores are converted to real store
+ // instructions.
+ fixStackStores(MBB, MI, TII, AVR::R31R30);
} else {
assert(Opcode == TII.getCallFrameDestroyOpcode());
+ // Note that small stack changes could be implemented more efficiently
+ // with a few pop instructions instead of the 8-9 instructions now
+ // required.
+
// Select the best opcode to adjust SP based on the offset size.
unsigned addOpcode;
if (isUInt<6>(Amount)) {
@@ -419,8 +392,10 @@ void AVRFrameLowering::determineCalleeSaves(MachineFunction &MF,
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
// If we have a frame pointer, the Y register needs to be saved as well.
- // We don't do that here however - the prologue and epilogue generation
- // code will handle it specially.
+ if (hasFP(MF)) {
+ SavedRegs.set(AVR::R29);
+ SavedRegs.set(AVR::R28);
+ }
}
/// The frame analyzer pass.
///
@@ -430,7 +405,7 @@ struct AVRFrameAnalyzer : public MachineFunctionPass {
static char ID;
AVRFrameAnalyzer() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
const MachineFrameInfo &MFI = MF.getFrameInfo();
AVRMachineFunctionInfo *FuncInfo = MF.getInfo<AVRMachineFunctionInfo>();
@@ -482,7 +457,7 @@ struct AVRFrameAnalyzer : public MachineFunctionPass {
return false;
}
- StringRef getPassName() const { return "AVR Frame Analyzer"; }
+ StringRef getPassName() const override { return "AVR Frame Analyzer"; }
};
char AVRFrameAnalyzer::ID = 0;
@@ -498,7 +473,7 @@ struct AVRDynAllocaSR : public MachineFunctionPass {
static char ID;
AVRDynAllocaSR() : MachineFunctionPass(ID) {}
- bool runOnMachineFunction(MachineFunction &MF) {
+ bool runOnMachineFunction(MachineFunction &MF) override {
// Early exit when there are no variable sized objects in the function.
if (!MF.getFrameInfo().hasVarSizedObjects()) {
return false;
@@ -510,7 +485,7 @@ struct AVRDynAllocaSR : public MachineFunctionPass {
MachineBasicBlock::iterator MBBI = EntryMBB.begin();
DebugLoc DL = EntryMBB.findDebugLoc(MBBI);
- unsigned SPCopy =
+ Register SPCopy =
MF.getRegInfo().createVirtualRegister(&AVR::DREGSRegClass);
// Create a copy of SP in function entry before any dynallocas are
@@ -531,7 +506,7 @@ struct AVRDynAllocaSR : public MachineFunctionPass {
return true;
}
- StringRef getPassName() const {
+ StringRef getPassName() const override {
return "AVR dynalloca stack pointer save/restore";
}
};
diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.h b/llvm/lib/Target/AVR/AVRFrameLowering.h
index a7658438232a..a550c0efbb8e 100644
--- a/llvm/lib/Target/AVR/AVRFrameLowering.h
+++ b/llvm/lib/Target/AVR/AVRFrameLowering.h
@@ -24,12 +24,12 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
bool
restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
index 4c4f4faa0508..fe31fa42c403 100644
--- a/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AVR/AVRISelDAGToDAG.cpp
@@ -265,7 +265,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
if (RI.getRegClass(Reg) != &AVR::PTRDISPREGSRegClass) {
SDLoc dl(CopyFromRegOp);
- unsigned VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
+ Register VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
SDValue CopyToReg =
CurDAG->getCopyToReg(CopyFromRegOp, dl, VReg, CopyFromRegOp);
@@ -294,7 +294,7 @@ bool AVRDAGToDAGISel::SelectInlineAsmMemoryOperand(const SDValue &Op,
// More generic case.
// Create chain that puts Op into pointer register
// and return that register.
- unsigned VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
+ Register VReg = RI.createVirtualRegister(&AVR::PTRDISPREGSRegClass);
SDValue CopyToReg = CurDAG->getCopyToReg(Op, dl, VReg, Op);
SDValue CopyFromReg =
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.cpp b/llvm/lib/Target/AVR/AVRISelLowering.cpp
index 880688807702..bf9b32e1278e 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.cpp
+++ b/llvm/lib/Target/AVR/AVRISelLowering.cpp
@@ -14,6 +14,7 @@
#include "AVRISelLowering.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -151,10 +152,12 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
setOperationAction(ISD::SREM, MVT::i16, Expand);
// Make division and modulus custom
- for (MVT VT : MVT::integer_valuetypes()) {
- setOperationAction(ISD::UDIVREM, VT, Custom);
- setOperationAction(ISD::SDIVREM, VT, Custom);
- }
+ setOperationAction(ISD::UDIVREM, MVT::i8, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i16, Custom);
+ setOperationAction(ISD::UDIVREM, MVT::i32, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i8, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i16, Custom);
+ setOperationAction(ISD::SDIVREM, MVT::i32, Custom);
// Do not use MUL. The AVR instructions are closer to SMUL_LOHI &co.
setOperationAction(ISD::MUL, MVT::i8, Expand);
@@ -190,41 +193,29 @@ AVRTargetLowering::AVRTargetLowering(const AVRTargetMachine &TM,
// improvements in how we treat 16-bit "registers" to be feasible.
}
- // Division rtlib functions (not supported)
+ // Division rtlib functions (not supported), use divmod functions instead
setLibcallName(RTLIB::SDIV_I8, nullptr);
setLibcallName(RTLIB::SDIV_I16, nullptr);
setLibcallName(RTLIB::SDIV_I32, nullptr);
- setLibcallName(RTLIB::SDIV_I64, nullptr);
- setLibcallName(RTLIB::SDIV_I128, nullptr);
setLibcallName(RTLIB::UDIV_I8, nullptr);
setLibcallName(RTLIB::UDIV_I16, nullptr);
setLibcallName(RTLIB::UDIV_I32, nullptr);
- setLibcallName(RTLIB::UDIV_I64, nullptr);
- setLibcallName(RTLIB::UDIV_I128, nullptr);
- // Modulus rtlib functions (not supported)
+ // Modulus rtlib functions (not supported), use divmod functions instead
setLibcallName(RTLIB::SREM_I8, nullptr);
setLibcallName(RTLIB::SREM_I16, nullptr);
setLibcallName(RTLIB::SREM_I32, nullptr);
- setLibcallName(RTLIB::SREM_I64, nullptr);
- setLibcallName(RTLIB::SREM_I128, nullptr);
setLibcallName(RTLIB::UREM_I8, nullptr);
setLibcallName(RTLIB::UREM_I16, nullptr);
setLibcallName(RTLIB::UREM_I32, nullptr);
- setLibcallName(RTLIB::UREM_I64, nullptr);
- setLibcallName(RTLIB::UREM_I128, nullptr);
// Division and modulus rtlib functions
setLibcallName(RTLIB::SDIVREM_I8, "__divmodqi4");
setLibcallName(RTLIB::SDIVREM_I16, "__divmodhi4");
setLibcallName(RTLIB::SDIVREM_I32, "__divmodsi4");
- setLibcallName(RTLIB::SDIVREM_I64, "__divmoddi4");
- setLibcallName(RTLIB::SDIVREM_I128, "__divmodti4");
setLibcallName(RTLIB::UDIVREM_I8, "__udivmodqi4");
setLibcallName(RTLIB::UDIVREM_I16, "__udivmodhi4");
setLibcallName(RTLIB::UDIVREM_I32, "__udivmodsi4");
- setLibcallName(RTLIB::UDIVREM_I64, "__udivmoddi4");
- setLibcallName(RTLIB::UDIVREM_I128, "__udivmodti4");
// Several of the runtime library functions use a special calling conv
setLibcallCallingConv(RTLIB::SDIVREM_I8, CallingConv::AVR_BUILTIN);
@@ -259,6 +250,8 @@ const char *AVRTargetLowering::getTargetNodeName(unsigned Opcode) const {
NODE(ASR);
NODE(LSLLOOP);
NODE(LSRLOOP);
+ NODE(ROLLOOP);
+ NODE(RORLOOP);
NODE(ASRLOOP);
NODE(BRCOND);
NODE(CMP);
@@ -282,6 +275,8 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
const SDNode *N = Op.getNode();
EVT VT = Op.getValueType();
SDLoc dl(N);
+ assert(isPowerOf2_32(VT.getSizeInBits()) &&
+ "Expected power-of-2 shift amount");
// Expand non-constant shifts to loops.
if (!isa<ConstantSDNode>(N->getOperand(1))) {
@@ -294,12 +289,20 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
case ISD::SRL:
return DAG.getNode(AVRISD::LSRLOOP, dl, VT, N->getOperand(0),
N->getOperand(1));
- case ISD::ROTL:
- return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0),
- N->getOperand(1));
- case ISD::ROTR:
- return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0),
- N->getOperand(1));
+ case ISD::ROTL: {
+ SDValue Amt = N->getOperand(1);
+ EVT AmtVT = Amt.getValueType();
+ Amt = DAG.getNode(ISD::AND, dl, AmtVT, Amt,
+ DAG.getConstant(VT.getSizeInBits() - 1, dl, AmtVT));
+ return DAG.getNode(AVRISD::ROLLOOP, dl, VT, N->getOperand(0), Amt);
+ }
+ case ISD::ROTR: {
+ SDValue Amt = N->getOperand(1);
+ EVT AmtVT = Amt.getValueType();
+ Amt = DAG.getNode(ISD::AND, dl, AmtVT, Amt,
+ DAG.getConstant(VT.getSizeInBits() - 1, dl, AmtVT));
+ return DAG.getNode(AVRISD::RORLOOP, dl, VT, N->getOperand(0), Amt);
+ }
case ISD::SRA:
return DAG.getNode(AVRISD::ASRLOOP, dl, VT, N->getOperand(0),
N->getOperand(1));
@@ -315,9 +318,11 @@ SDValue AVRTargetLowering::LowerShifts(SDValue Op, SelectionDAG &DAG) const {
break;
case ISD::ROTL:
Opc8 = AVRISD::ROL;
+ ShiftAmount = ShiftAmount % VT.getSizeInBits();
break;
case ISD::ROTR:
Opc8 = AVRISD::ROR;
+ ShiftAmount = ShiftAmount % VT.getSizeInBits();
break;
case ISD::SRL:
Opc8 = AVRISD::LSR;
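
A small standalone illustration (not taken from the patch; rotl8 is a hypothetical reference function) of why both the loop-based and the constant rotate paths above reduce the amount modulo the type width: rotating an N-bit value by N or more is equivalent to rotating by the remainder, and because the assert above guarantees a power-of-two width, the AND with SizeInBits - 1 and the modulo agree.

#include <cstdint>

// Reference semantics for an 8-bit rotate-left with a masked amount; the AND
// with (SizeInBits - 1) in the lowering plays the same role as "Amt &= 7".
static uint8_t rotl8(uint8_t V, unsigned Amt) {
  Amt &= 7;                 // equivalent to Amt % 8 for the 8-bit case
  if (Amt == 0)
    return V;               // avoid shifting by the full type width
  return uint8_t((V << Amt) | (V >> (8 - Amt)));
}
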
@@ -357,12 +362,6 @@ SDValue AVRTargetLowering::LowerDivRem(SDValue Op, SelectionDAG &DAG) const {
case MVT::i32:
LC = IsSigned ? RTLIB::SDIVREM_I32 : RTLIB::UDIVREM_I32;
break;
- case MVT::i64:
- LC = IsSigned ? RTLIB::SDIVREM_I64 : RTLIB::UDIVREM_I64;
- break;
- case MVT::i128:
- LC = IsSigned ? RTLIB::SDIVREM_I128 : RTLIB::UDIVREM_I128;
- break;
}
SDValue InChain = DAG.getEntryNode();
@@ -883,173 +882,145 @@ bool AVRTargetLowering::isOffsetFoldingLegal(
#include "AVRGenCallingConv.inc"
-/// For each argument in a function store the number of pieces it is composed
-/// of.
-static void parseFunctionArgs(const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<unsigned> &Out) {
- for (const ISD::InputArg &Arg : Ins) {
- if(Arg.PartOffset > 0) continue;
- unsigned Bytes = ((Arg.ArgVT.getSizeInBits()) + 7) / 8;
-
- Out.push_back((Bytes + 1) / 2);
- }
-}
-
-/// For external symbols there is no function prototype information so we
-/// have to rely directly on argument sizes.
-static void parseExternFuncCallArgs(const SmallVectorImpl<ISD::OutputArg> &In,
- SmallVectorImpl<unsigned> &Out) {
- for (unsigned i = 0, e = In.size(); i != e;) {
- unsigned Size = 0;
- unsigned Offset = 0;
- while ((i != e) && (In[i].PartOffset == Offset)) {
- Offset += In[i].VT.getStoreSize();
- ++i;
- ++Size;
- }
- Out.push_back(Size);
- }
-}
-
-static StringRef getFunctionName(TargetLowering::CallLoweringInfo &CLI) {
- SDValue Callee = CLI.Callee;
-
- if (const ExternalSymbolSDNode *G = dyn_cast<ExternalSymbolSDNode>(Callee)) {
- return G->getSymbol();
- }
-
- if (const GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
- return G->getGlobal()->getName();
- }
-
- llvm_unreachable("don't know how to get the name for this callee");
-}
+/// Registers for calling conventions, ordered in reverse as required by ABI.
+/// Both arrays must be of the same length.
+static const MCPhysReg RegList8[] = {
+ AVR::R25, AVR::R24, AVR::R23, AVR::R22, AVR::R21, AVR::R20,
+ AVR::R19, AVR::R18, AVR::R17, AVR::R16, AVR::R15, AVR::R14,
+ AVR::R13, AVR::R12, AVR::R11, AVR::R10, AVR::R9, AVR::R8};
+static const MCPhysReg RegList16[] = {
+ AVR::R26R25, AVR::R25R24, AVR::R24R23, AVR::R23R22,
+ AVR::R22R21, AVR::R21R20, AVR::R20R19, AVR::R19R18,
+ AVR::R18R17, AVR::R17R16, AVR::R16R15, AVR::R15R14,
+ AVR::R14R13, AVR::R13R12, AVR::R12R11, AVR::R11R10,
+ AVR::R10R9, AVR::R9R8};
+
+static_assert(array_lengthof(RegList8) == array_lengthof(RegList16),
+ "8-bit and 16-bit register arrays must be of equal length");
/// Analyze incoming and outgoing function arguments. We need custom C++ code
-/// to handle special constraints in the ABI like reversing the order of the
-/// pieces of splitted arguments. In addition, all pieces of a certain argument
-/// have to be passed either using registers or the stack but never mixing both.
-static void analyzeStandardArguments(TargetLowering::CallLoweringInfo *CLI,
- const Function *F, const DataLayout *TD,
- const SmallVectorImpl<ISD::OutputArg> *Outs,
- const SmallVectorImpl<ISD::InputArg> *Ins,
- CallingConv::ID CallConv,
- SmallVectorImpl<CCValAssign> &ArgLocs,
- CCState &CCInfo, bool IsCall, bool IsVarArg) {
- static const MCPhysReg RegList8[] = {AVR::R24, AVR::R22, AVR::R20,
- AVR::R18, AVR::R16, AVR::R14,
- AVR::R12, AVR::R10, AVR::R8};
- static const MCPhysReg RegList16[] = {AVR::R25R24, AVR::R23R22, AVR::R21R20,
- AVR::R19R18, AVR::R17R16, AVR::R15R14,
- AVR::R13R12, AVR::R11R10, AVR::R9R8};
- if (IsVarArg) {
- // Variadic functions do not need all the analysis below.
- if (IsCall) {
- CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_Vararg);
- } else {
- CCInfo.AnalyzeFormalArguments(*Ins, ArgCC_AVR_Vararg);
+/// to handle special constraints in the ABI.
+/// In addition, all pieces of a certain argument have to be passed either
+/// using registers or the stack but never mixing both.
+template <typename ArgT>
+static void
+analyzeArguments(TargetLowering::CallLoweringInfo *CLI, const Function *F,
+ const DataLayout *TD, const SmallVectorImpl<ArgT> &Args,
+ SmallVectorImpl<CCValAssign> &ArgLocs, CCState &CCInfo) {
+ unsigned NumArgs = Args.size();
+ // This is the index of the last used register, in RegList*.
+ // -1 means R26 (R26 is never actually used in CC).
+ int RegLastIdx = -1;
+  // Once a value has been passed on the stack, all remaining values are also
+  // passed on the stack.

+ bool UseStack = false;
+ for (unsigned i = 0; i != NumArgs;) {
+ MVT VT = Args[i].VT;
+ // We have to count the number of bytes for each function argument, that is
+ // those Args with the same OrigArgIndex. This is important in case the
+ // function takes an aggregate type.
+ // Current argument will be between [i..j).
+ unsigned ArgIndex = Args[i].OrigArgIndex;
+ unsigned TotalBytes = VT.getStoreSize();
+ unsigned j = i + 1;
+ for (; j != NumArgs; ++j) {
+ if (Args[j].OrigArgIndex != ArgIndex)
+ break;
+ TotalBytes += Args[j].VT.getStoreSize();
}
- return;
- }
-
- // Fill in the Args array which will contain original argument sizes.
- SmallVector<unsigned, 8> Args;
- if (IsCall) {
- parseExternFuncCallArgs(*Outs, Args);
- } else {
- assert(F != nullptr && "function should not be null");
- parseFunctionArgs(*Ins, Args);
- }
-
- unsigned RegsLeft = array_lengthof(RegList8), ValNo = 0;
- // Variadic functions always use the stack.
- bool UsesStack = false;
- for (unsigned i = 0, pos = 0, e = Args.size(); i != e; ++i) {
- unsigned Size = Args[i];
-
- // If we have a zero-sized argument, don't attempt to lower it.
- // AVR-GCC does not support zero-sized arguments and so we need not
- // worry about ABI compatibility.
- if (Size == 0) continue;
-
- MVT LocVT = (IsCall) ? (*Outs)[pos].VT : (*Ins)[pos].VT;
-
- // If we have plenty of regs to pass the whole argument do it.
- if (!UsesStack && (Size <= RegsLeft)) {
- const MCPhysReg *RegList = (LocVT == MVT::i16) ? RegList16 : RegList8;
+ // Round up to even number of bytes.
+ TotalBytes = alignTo(TotalBytes, 2);
+ // Skip zero sized arguments
+ if (TotalBytes == 0)
+ continue;
+ // The index of the first register to be used
+ unsigned RegIdx = RegLastIdx + TotalBytes;
+ RegLastIdx = RegIdx;
+ // If there are not enough registers, use the stack
+ if (RegIdx >= array_lengthof(RegList8)) {
+ UseStack = true;
+ }
+ for (; i != j; ++i) {
+ MVT VT = Args[i].VT;
- for (unsigned j = 0; j != Size; ++j) {
- unsigned Reg = CCInfo.AllocateReg(
- ArrayRef<MCPhysReg>(RegList, array_lengthof(RegList8)));
+ if (UseStack) {
+ auto evt = EVT(VT).getTypeForEVT(CCInfo.getContext());
+ unsigned Offset = CCInfo.AllocateStack(TD->getTypeAllocSize(evt),
+ TD->getABITypeAlign(evt));
CCInfo.addLoc(
- CCValAssign::getReg(ValNo++, LocVT, Reg, LocVT, CCValAssign::Full));
- --RegsLeft;
- }
-
- // Reverse the order of the pieces to agree with the "big endian" format
- // required in the calling convention ABI.
- std::reverse(ArgLocs.begin() + pos, ArgLocs.begin() + pos + Size);
- } else {
- // Pass the rest of arguments using the stack.
- UsesStack = true;
- for (unsigned j = 0; j != Size; ++j) {
- unsigned Offset = CCInfo.AllocateStack(
- TD->getTypeAllocSize(EVT(LocVT).getTypeForEVT(CCInfo.getContext())),
- TD->getABITypeAlignment(
- EVT(LocVT).getTypeForEVT(CCInfo.getContext())));
- CCInfo.addLoc(CCValAssign::getMem(ValNo++, LocVT, Offset, LocVT,
- CCValAssign::Full));
+ CCValAssign::getMem(i, VT, Offset, VT, CCValAssign::Full));
+ } else {
+ unsigned Reg;
+ if (VT == MVT::i8) {
+ Reg = CCInfo.AllocateReg(RegList8[RegIdx]);
+ } else if (VT == MVT::i16) {
+ Reg = CCInfo.AllocateReg(RegList16[RegIdx]);
+ } else {
+ llvm_unreachable(
+ "calling convention can only manage i8 and i16 types");
+ }
+ assert(Reg && "register not available in calling convention");
+ CCInfo.addLoc(CCValAssign::getReg(i, VT, Reg, VT, CCValAssign::Full));
+ // Registers inside a particular argument are sorted in increasing order
+ // (remember the array is reversed).
+ RegIdx -= VT.getStoreSize();
}
}
- pos += Size;
}
}
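
As a standalone illustration of the indexing scheme above (a sketch, not part of the patch; firstRegIndexFor is a hypothetical helper): because RegList8/RegList16 are listed in reverse, the first register of an argument is found by advancing the last used index by the argument's even-rounded size, and the pieces are then assigned at decreasing indices.

#include <cassert>

// Mirrors the index arithmetic in analyzeArguments, for illustration only.
static int firstRegIndexFor(int RegLastIdx, unsigned TotalBytes) {
  TotalBytes = (TotalBytes + 1) & ~1u;  // round up to an even number of bytes
  return RegLastIdx + int(TotalBytes);  // index of the argument's first register
}

static void example() {
  // First argument, an i16: index -1 + 2 == 1, i.e. RegList16[1] == R25R24.
  assert(firstRegIndexFor(-1, 2) == 1);
  // Next argument, an i8 (rounded to 2 bytes): index 1 + 2 == 3,
  // i.e. RegList8[3] == R22, with R23 left unused.
  assert(firstRegIndexFor(1, 2) == 3);
}
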
-static void analyzeBuiltinArguments(TargetLowering::CallLoweringInfo &CLI,
- const Function *F, const DataLayout *TD,
- const SmallVectorImpl<ISD::OutputArg> *Outs,
- const SmallVectorImpl<ISD::InputArg> *Ins,
- CallingConv::ID CallConv,
- SmallVectorImpl<CCValAssign> &ArgLocs,
- CCState &CCInfo, bool IsCall, bool IsVarArg) {
- StringRef FuncName = getFunctionName(CLI);
-
- if (FuncName.startswith("__udivmod") || FuncName.startswith("__divmod")) {
- CCInfo.AnalyzeCallOperands(*Outs, ArgCC_AVR_BUILTIN_DIV);
- } else {
- analyzeStandardArguments(&CLI, F, TD, Outs, Ins,
- CallConv, ArgLocs, CCInfo,
- IsCall, IsVarArg);
+/// Count the total number of bytes needed to pass or return these arguments.
+template <typename ArgT>
+static unsigned getTotalArgumentsSizeInBytes(const SmallVectorImpl<ArgT> &Args) {
+ unsigned TotalBytes = 0;
+
+ for (const ArgT& Arg : Args) {
+ TotalBytes += Arg.VT.getStoreSize();
}
+ return TotalBytes;
}
-static void analyzeArguments(TargetLowering::CallLoweringInfo *CLI,
- const Function *F, const DataLayout *TD,
- const SmallVectorImpl<ISD::OutputArg> *Outs,
- const SmallVectorImpl<ISD::InputArg> *Ins,
- CallingConv::ID CallConv,
- SmallVectorImpl<CCValAssign> &ArgLocs,
- CCState &CCInfo, bool IsCall, bool IsVarArg) {
- switch (CallConv) {
- case CallingConv::AVR_BUILTIN: {
- analyzeBuiltinArguments(*CLI, F, TD, Outs, Ins,
- CallConv, ArgLocs, CCInfo,
- IsCall, IsVarArg);
- return;
- }
- default: {
- analyzeStandardArguments(CLI, F, TD, Outs, Ins,
- CallConv, ArgLocs, CCInfo,
- IsCall, IsVarArg);
- return;
+/// Analyze the incoming and outgoing values when returning from a function.
+/// The algorithm is similar to analyzeArguments, but there can only be
+/// one value, possibly an aggregate, and it is limited to 8 bytes.
+template <typename ArgT>
+static void analyzeReturnValues(const SmallVectorImpl<ArgT> &Args,
+ CCState &CCInfo) {
+ unsigned NumArgs = Args.size();
+ unsigned TotalBytes = getTotalArgumentsSizeInBytes(Args);
+ // CanLowerReturn() guarantees this assertion.
+ assert(TotalBytes <= 8 && "return values greater than 8 bytes cannot be lowered");
+
+ // GCC-ABI says that the size is rounded up to the next even number,
+ // but actually once it is more than 4 it will always round up to 8.
+ if (TotalBytes > 4) {
+ TotalBytes = 8;
+ } else {
+ TotalBytes = alignTo(TotalBytes, 2);
+ }
+
+ // The index of the first register to use.
+ int RegIdx = TotalBytes - 1;
+ for (unsigned i = 0; i != NumArgs; ++i) {
+ MVT VT = Args[i].VT;
+ unsigned Reg;
+ if (VT == MVT::i8) {
+ Reg = CCInfo.AllocateReg(RegList8[RegIdx]);
+ } else if (VT == MVT::i16) {
+ Reg = CCInfo.AllocateReg(RegList16[RegIdx]);
+ } else {
+ llvm_unreachable("calling convention can only manage i8 and i16 types");
}
+ assert(Reg && "register not available in calling convention");
+ CCInfo.addLoc(CCValAssign::getReg(i, VT, Reg, VT, CCValAssign::Full));
+    // Registers are assigned in increasing order (remember the array is
+    // reversed).
+ RegIdx -= VT.getStoreSize();
}
}
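
A standalone sketch (illustration only, not from the patch; roundedReturnSize is a hypothetical helper) of the rounding rule described in the comment above, together with the 8-byte ceiling that CanLowerReturn enforces further down.

// Mirrors the size rounding used by analyzeReturnValues.
static unsigned roundedReturnSize(unsigned TotalBytes) {
  if (TotalBytes > 4)
    return 8;                      // 5..8 byte returns always occupy 8 bytes
  return (TotalBytes + 1) & ~1u;   // otherwise round up to an even size
}
// roundedReturnSize(1) == 2, roundedReturnSize(3) == 4,
// roundedReturnSize(5) == 8, roundedReturnSize(8) == 8. An 8-byte return
// starts at RegList8[7], i.e. R18; anything larger than 8 bytes fails
// CanLowerReturn and is returned through memory instead.
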
SDValue AVRTargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
- const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl, SelectionDAG &DAG,
- SmallVectorImpl<SDValue> &InVals) const {
+ const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
+ SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
auto DL = DAG.getDataLayout();
@@ -1059,8 +1030,12 @@ SDValue AVRTargetLowering::LowerFormalArguments(
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
- analyzeArguments(nullptr, &MF.getFunction(), &DL, 0, &Ins, CallConv, ArgLocs, CCInfo,
- false, isVarArg);
+ // Variadic functions do not need all the analysis below.
+ if (isVarArg) {
+ CCInfo.AnalyzeFormalArguments(Ins, ArgCC_AVR_Vararg);
+ } else {
+ analyzeArguments(nullptr, &MF.getFunction(), &DL, Ins, ArgLocs, CCInfo);
+ }
SDValue ArgValue;
for (CCValAssign &VA : ArgLocs) {
@@ -1181,8 +1156,12 @@ SDValue AVRTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
getPointerTy(DAG.getDataLayout()));
}
- analyzeArguments(&CLI, F, &DAG.getDataLayout(), &Outs, 0, CallConv, ArgLocs, CCInfo,
- true, isVarArg);
+ // Variadic functions do not need all the analysis below.
+ if (isVarArg) {
+ CCInfo.AnalyzeCallOperands(Outs, ArgCC_AVR_Vararg);
+ } else {
+ analyzeArguments(&CLI, F, &DAG.getDataLayout(), Outs, ArgLocs, CCInfo);
+ }
// Get a count of how many bytes are to be pushed on the stack.
unsigned NumBytes = CCInfo.getNextStackOffset();
@@ -1319,13 +1298,10 @@ SDValue AVRTargetLowering::LowerCallResult(
*DAG.getContext());
// Handle runtime calling convs.
- auto CCFunction = CCAssignFnForReturn(CallConv);
- CCInfo.AnalyzeCallResult(Ins, CCFunction);
-
- if (CallConv != CallingConv::AVR_BUILTIN && RVLocs.size() > 1) {
- // Reverse splitted return values to get the "big endian" format required
- // to agree with the calling convention ABI.
- std::reverse(RVLocs.begin(), RVLocs.end());
+ if (CallConv == CallingConv::AVR_BUILTIN) {
+ CCInfo.AnalyzeCallResult(Ins, RetCC_AVR_BUILTIN);
+ } else {
+ analyzeReturnValues(Ins, CCInfo);
}
// Copy all of the result registers out of their specified physreg.
@@ -1344,26 +1320,17 @@ SDValue AVRTargetLowering::LowerCallResult(
// Return Value Calling Convention Implementation
//===----------------------------------------------------------------------===//
-CCAssignFn *AVRTargetLowering::CCAssignFnForReturn(CallingConv::ID CC) const {
- switch (CC) {
- case CallingConv::AVR_BUILTIN:
- return RetCC_AVR_BUILTIN;
- default:
- return RetCC_AVR;
+bool AVRTargetLowering::CanLowerReturn(
+ CallingConv::ID CallConv, MachineFunction &MF, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
+ if (CallConv == CallingConv::AVR_BUILTIN) {
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC_AVR_BUILTIN);
}
-}
-bool
-AVRTargetLowering::CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- LLVMContext &Context) const
-{
- SmallVector<CCValAssign, 16> RVLocs;
- CCState CCInfo(CallConv, isVarArg, MF, RVLocs, Context);
-
- auto CCFunction = CCAssignFnForReturn(CallConv);
- return CCInfo.CheckReturn(Outs, CCFunction);
+ unsigned TotalBytes = getTotalArgumentsSizeInBytes(Outs);
+ return TotalBytes <= 8;
}
SDValue
@@ -1379,25 +1346,19 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- // Analyze return values.
- auto CCFunction = CCAssignFnForReturn(CallConv);
- CCInfo.AnalyzeReturn(Outs, CCFunction);
-
- // If this is the first return lowered for this function, add the regs to
- // the liveout set for the function.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned e = RVLocs.size();
- // Reverse splitted return values to get the "big endian" format required
- // to agree with the calling convention ABI.
- if (e > 1) {
- std::reverse(RVLocs.begin(), RVLocs.end());
+ // Analyze return values.
+ if (CallConv == CallingConv::AVR_BUILTIN) {
+ CCInfo.AnalyzeReturn(Outs, RetCC_AVR_BUILTIN);
+ } else {
+ analyzeReturnValues(Outs, CCInfo);
}
SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
// Copy the result values into the output registers.
- for (unsigned i = 0; i != e; ++i) {
+ for (unsigned i = 0, e = RVLocs.size(); i != e; ++i) {
CCValAssign &VA = RVLocs[i];
assert(VA.isRegLoc() && "Can only return in registers!");
@@ -1415,10 +1376,12 @@ AVRTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
return Chain;
}
+ const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
unsigned RetOpc =
- (CallConv == CallingConv::AVR_INTR || CallConv == CallingConv::AVR_SIGNAL)
- ? AVRISD::RETI_FLAG
- : AVRISD::RET_FLAG;
+ AFI->isInterruptOrSignalHandler()
+ ? AVRISD::RETI_FLAG
+ : AVRISD::RET_FLAG;
RetOps[0] = Chain; // Update chain.
@@ -1514,8 +1477,8 @@ MachineBasicBlock *AVRTargetLowering::insertShift(MachineInstr &MI,
LoopBB->addSuccessor(RemBB);
LoopBB->addSuccessor(LoopBB);
- unsigned ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
- unsigned ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
+ Register ShiftAmtReg = RI.createVirtualRegister(&AVR::LD8RegClass);
+ Register ShiftAmtReg2 = RI.createVirtualRegister(&AVR::LD8RegClass);
Register ShiftReg = RI.createVirtualRegister(RC);
Register ShiftReg2 = RI.createVirtualRegister(RC);
Register ShiftAmtSrcReg = MI.getOperand(2).getReg();
diff --git a/llvm/lib/Target/AVR/AVRISelLowering.h b/llvm/lib/Target/AVR/AVRISelLowering.h
index aca1ea1d50e5..d1eaf53b15e9 100644
--- a/llvm/lib/Target/AVR/AVRISelLowering.h
+++ b/llvm/lib/Target/AVR/AVRISelLowering.h
@@ -146,10 +146,8 @@ private:
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
- CCAssignFn *CCAssignFnForReturn(CallingConv::ID CC) const;
-
- bool CanLowerReturn(CallingConv::ID CallConv,
- MachineFunction &MF, bool isVarArg,
+ bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
+ bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
LLVMContext &Context) const override;
diff --git a/llvm/lib/Target/AVR/AVRInstrFormats.td b/llvm/lib/Target/AVR/AVRInstrFormats.td
index ef596f5cebd5..6eb49076efb0 100644
--- a/llvm/lib/Target/AVR/AVRInstrFormats.td
+++ b/llvm/lib/Target/AVR/AVRInstrFormats.td
@@ -148,6 +148,8 @@ class FRd<bits<4> opcode, bits<7> f, dag outs, dag ins, string asmstr,
let Inst{11-9} = f{6-4};
let Inst{8-4} = d;
let Inst{3-0} = f{3-0};
+
+ let DecoderMethod = "decodeFRd";
}
//===----------------------------------------------------------------------===//
@@ -235,6 +237,8 @@ class FLPMX<bit e, bit p, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{3-2} = 0b01;
let Inst{1} = e;
let Inst{0} = p;
+
+ let DecoderMethod = "decodeFLPMX";
}
//===----------------------------------------------------------------------===//
@@ -252,6 +256,8 @@ class FMOVWRdRr<dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{15-8} = 0b00000001;
let Inst{7-4} = d{4-1};
let Inst{3-0} = r{4-1};
+
+ let DecoderMethod = "decodeFMOVWRdRr";
}
//===----------------------------------------------------------------------===//
@@ -270,6 +276,8 @@ class FMUL2RdRr<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{8} = f;
let Inst{7-4} = rd{3-0};
let Inst{3-0} = rr{3-0};
+
+ let DecoderMethod = "decodeFMUL2RdRr";
}
// Special encoding for the FMUL family of instructions.
@@ -293,6 +301,8 @@ class FFMULRdRr<bits<2> f, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{6-4} = rd;
let Inst{3} = f{0};
let Inst{2-0} = rr;
+
+ let DecoderMethod = "decodeFFMULRdRr";
}
@@ -314,6 +324,8 @@ class FWRdK<bit f, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{7-6} = k{5-4};
let Inst{5-4} = dst{2-1};
let Inst{3-0} = k{3-0};
+
+ let DecoderMethod = "decodeFWRdK";
}
//===----------------------------------------------------------------------===//
@@ -332,6 +344,8 @@ class FIORdA<dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{10-9} = A{5-4};
let Inst{8-4} = d;
let Inst{3-0} = A{3-0};
+
+ let DecoderMethod = "decodeFIORdA";
}
//===----------------------------------------------------------------------===//
@@ -350,6 +364,8 @@ class FIOARr<dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{10-9} = A{5-4};
let Inst{8-4} = r;
let Inst{3-0} = A{3-0};
+
+ let DecoderMethod = "decodeFIOARr";
}
//===----------------------------------------------------------------------===//
@@ -374,6 +390,8 @@ class FIOBIT<bits<2> t, dag outs, dag ins, string asmstr, list<dag> pattern>
let Inst{3} = A{0};
let Inst{2-0} = b{2-0};
+
+ let DecoderMethod = "decodeFIOBIT";
}
//===----------------------------------------------------------------------===//
@@ -485,7 +503,7 @@ class F32BRk<bits<3> f, dag outs, dag ins, string asmstr, list<dag> pattern>
}
//===----------------------------------------------------------------------===//
-// 32 bits direct mem instructions: <|1001|00fd|dddd|0000|kkkk|kkkk|kkkk|kkkk|>
+// 32 bits direct mem instructions: <|1001|00fd|dddd|0000|kkkk|kkkk|kkkk|kkkk|>
// f = secondary opcode = 1 bit
// d = destination = 5 bits
// k = constant address = 16 bits
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.cpp b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
index a6832f282b31..06f07696bde3 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.cpp
@@ -48,11 +48,11 @@ void AVRInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
// Not all AVR devices support the 16-bit `MOVW` instruction.
if (AVR::DREGSRegClass.contains(DestReg, SrcReg)) {
- if (STI.hasMOVW()) {
+ if (STI.hasMOVW() && AVR::DREGSMOVWRegClass.contains(DestReg, SrcReg)) {
BuildMI(MBB, MI, DL, get(AVR::MOVWRdRr), DestReg)
.addReg(SrcReg, getKillRegState(KillSrc));
} else {
- unsigned DestLo, DestHi, SrcLo, SrcHi;
+ Register DestLo, DestHi, SrcLo, SrcHi;
TRI.splitReg(DestReg, DestLo, DestHi);
TRI.splitReg(SrcReg, SrcLo, SrcHi);
@@ -119,7 +119,7 @@ unsigned AVRInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
@@ -138,7 +138,7 @@ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIndex),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MFI.getObjectAlign(FrameIndex));
unsigned Opcode = 0;
if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
@@ -158,7 +158,7 @@ void AVRInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -172,7 +172,7 @@ void AVRInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIndex),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MFI.getObjectAlign(FrameIndex));
unsigned Opcode = 0;
if (TRI->isTypeLegalForClass(*RC, MVT::i8)) {
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.h b/llvm/lib/Target/AVR/AVRInstrInfo.h
index bb00ca8c724a..11f45865de54 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.h
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.h
@@ -75,12 +75,12 @@ public:
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned SrcReg,
+ MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned DestReg,
+ MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
diff --git a/llvm/lib/Target/AVR/AVRInstrInfo.td b/llvm/lib/Target/AVR/AVRInstrInfo.td
index acf991dcfbb1..f03c254382b4 100644
--- a/llvm/lib/Target/AVR/AVRInstrInfo.td
+++ b/llvm/lib/Target/AVR/AVRInstrInfo.td
@@ -107,7 +107,9 @@ def imm_com8 : Operand<i8> {
def ioaddr_XFORM : SDNodeXForm<imm,
[{
- return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - 0x20, SDLoc(N), MVT::i8);
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ return CurDAG->getTargetConstant(uint8_t(N->getZExtValue()) - offset,
+ SDLoc(N), MVT::i8);
}]>;
def iobitpos8_XFORM : SDNodeXForm<imm,
@@ -124,20 +126,23 @@ def iobitposn8_XFORM : SDNodeXForm<imm,
def ioaddr8 : PatLeaf<(imm),
[{
- uint64_t val = N->getZExtValue();
- return val >= 0x20 && val < 0x60;
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ uint64_t val = N->getZExtValue() - offset;
+ return val < 0x40;
}], ioaddr_XFORM>;
def lowioaddr8 : PatLeaf<(imm),
[{
- uint64_t val = N->getZExtValue();
- return val >= 0x20 && val < 0x40;
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ uint64_t val = N->getZExtValue() - offset;
+ return val < 0x20;
}], ioaddr_XFORM>;
def ioaddr16 : PatLeaf<(imm),
[{
- uint64_t val = N->getZExtValue();
- return val >= 0x20 && val < 0x5f;
+ uint8_t offset = Subtarget->getIORegisterOffset();
+ uint64_t val = N->getZExtValue() - offset;
+ return val < 0x3f;
}], ioaddr_XFORM>;
def iobitpos8 : PatLeaf<(imm),
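
A standalone C++ sketch (illustration only, not from the patch; isIOAddress8 is a hypothetical helper) of the check the rewritten PatLeaf predicates above perform: the data-space address is translated by the subtarget's I/O offset and accepted only if it lands inside the 64-register I/O window (63 for the 16-bit form, which also needs address + 1).

#include <cstdint>

// Same arithmetic as the ioaddr8 predicate above, written out for clarity.
static bool isIOAddress8(uint64_t DataSpaceAddr, uint8_t IOOffset) {
  uint64_t Val = DataSpaceAddr - IOOffset; // wraps for addresses below the window
  return Val < 0x40;                       // IN/OUT can reach 64 I/O registers
}

// Example with the classic offset of 0x20 (the value previously hard-coded):
// data address 0x25 maps to I/O address 0x05 and is accepted, while 0x1f
// wraps around to a huge value and is rejected.
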
@@ -188,6 +193,7 @@ def brtarget_13 : Operand<OtherVT>
def call_target : Operand<iPTR>
{
let EncoderMethod = "encodeCallTarget";
+ let DecoderMethod = "decodeCallTarget";
}
// A 16-bit address (which can lead to an R_AVR_16 relocation).
@@ -260,58 +266,58 @@ def LDDSTDPtrReg : Operand<i16>
//===----------------------------------------------------------------------===//
def HasSRAM : Predicate<"Subtarget->hasSRAM()">,
- AssemblerPredicate<"FeatureSRAM">;
+ AssemblerPredicate<(all_of FeatureSRAM)>;
def HasJMPCALL : Predicate<"Subtarget->hasJMPCALL()">,
- AssemblerPredicate<"FeatureJMPCALL">;
+ AssemblerPredicate<(all_of FeatureJMPCALL)>;
def HasIJMPCALL : Predicate<"Subtarget->hasIJMPCALL()">,
- AssemblerPredicate<"FeatureIJMPCALL">;
+ AssemblerPredicate<(all_of FeatureIJMPCALL)>;
def HasEIJMPCALL : Predicate<"Subtarget->hasEIJMPCALL()">,
- AssemblerPredicate<"FeatureEIJMPCALL">;
+ AssemblerPredicate<(all_of FeatureEIJMPCALL)>;
def HasADDSUBIW : Predicate<"Subtarget->hasADDSUBIW()">,
- AssemblerPredicate<"FeatureADDSUBIW">;
+ AssemblerPredicate<(all_of FeatureADDSUBIW)>;
def HasSmallStack : Predicate<"Subtarget->HasSmallStack()">,
- AssemblerPredicate<"FeatureSmallStack">;
+ AssemblerPredicate<(all_of FeatureSmallStack)>;
def HasMOVW : Predicate<"Subtarget->hasMOVW()">,
- AssemblerPredicate<"FeatureMOVW">;
+ AssemblerPredicate<(all_of FeatureMOVW)>;
def HasLPM : Predicate<"Subtarget->hasLPM()">,
- AssemblerPredicate<"FeatureLPM">;
+ AssemblerPredicate<(all_of FeatureLPM)>;
def HasLPMX : Predicate<"Subtarget->hasLPMX()">,
- AssemblerPredicate<"FeatureLPMX">;
+ AssemblerPredicate<(all_of FeatureLPMX)>;
def HasELPM : Predicate<"Subtarget->hasELPM()">,
- AssemblerPredicate<"FeatureELPM">;
+ AssemblerPredicate<(all_of FeatureELPM)>;
def HasELPMX : Predicate<"Subtarget->hasELPMX()">,
- AssemblerPredicate<"FeatureELPMX">;
+ AssemblerPredicate<(all_of FeatureELPMX)>;
def HasSPM : Predicate<"Subtarget->hasSPM()">,
- AssemblerPredicate<"FeatureSPM">;
+ AssemblerPredicate<(all_of FeatureSPM)>;
def HasSPMX : Predicate<"Subtarget->hasSPMX()">,
- AssemblerPredicate<"FeatureSPMX">;
+ AssemblerPredicate<(all_of FeatureSPMX)>;
def HasDES : Predicate<"Subtarget->hasDES()">,
- AssemblerPredicate<"FeatureDES">;
+ AssemblerPredicate<(all_of FeatureDES)>;
def SupportsRMW : Predicate<"Subtarget->supportsRMW()">,
- AssemblerPredicate<"FeatureRMW">;
+ AssemblerPredicate<(all_of FeatureRMW)>;
def SupportsMultiplication : Predicate<"Subtarget->supportsMultiplication()">,
- AssemblerPredicate<"FeatureMultiplication">;
+ AssemblerPredicate<(all_of FeatureMultiplication)>;
def HasBREAK : Predicate<"Subtarget->hasBREAK()">,
- AssemblerPredicate<"FeatureBREAK">;
+ AssemblerPredicate<(all_of FeatureBREAK)>;
def HasTinyEncoding : Predicate<"Subtarget->hasTinyEncoding()">,
- AssemblerPredicate<"FeatureTinyEncoding">;
+ AssemblerPredicate<(all_of FeatureTinyEncoding)>;
// AVR specific condition code. These correspond to AVR_*_COND in
@@ -555,7 +561,7 @@ Defs = [R1, R0, SREG] in
def MULSRdRr : FMUL2RdRr<0,
(outs),
- (ins GPR8:$lhs, GPR8:$rhs),
+ (ins LD8:$lhs, LD8:$rhs),
"muls\t$lhs, $rhs",
[]>,
Requires<[SupportsMultiplication]>;
@@ -563,28 +569,28 @@ Defs = [R1, R0, SREG] in
def MULSURdRr : FMUL2RdRr<1,
(outs),
- (ins GPR8:$lhs, GPR8:$rhs),
+ (ins LD8lo:$lhs, LD8lo:$rhs),
"mulsu\t$lhs, $rhs",
[]>,
Requires<[SupportsMultiplication]>;
def FMUL : FFMULRdRr<0b01,
(outs),
- (ins GPR8:$lhs, GPR8:$rhs),
+ (ins LD8lo:$lhs, LD8lo:$rhs),
"fmul\t$lhs, $rhs",
[]>,
Requires<[SupportsMultiplication]>;
def FMULS : FFMULRdRr<0b10,
(outs),
- (ins GPR8:$lhs, GPR8:$rhs),
+ (ins LD8lo:$lhs, LD8lo:$rhs),
"fmuls\t$lhs, $rhs",
[]>,
Requires<[SupportsMultiplication]>;
def FMULSU : FFMULRdRr<0b11,
(outs),
- (ins GPR8:$lhs, GPR8:$rhs),
+ (ins LD8lo:$lhs, LD8lo:$rhs),
"fmulsu\t$lhs, $rhs",
[]>,
Requires<[SupportsMultiplication]>;
@@ -840,7 +846,7 @@ let isCall = 1 in
//===----------------------------------------------------------------------===//
let isTerminator = 1,
isReturn = 1,
-isBarrier = 1 in
+isBarrier = 1 in
{
def RET : F16<0b1001010100001000,
(outs),
@@ -2042,8 +2048,6 @@ def : Pat<(add i16:$src1, imm:$src2),
(SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
def : Pat<(addc i16:$src1, imm:$src2),
(SUBIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
-def : Pat<(adde i16:$src1, imm:$src2),
- (SBCIWRdK i16:$src1, (imm16_neg_XFORM imm:$src2))>;
def : Pat<(add i8:$src1, imm:$src2),
(SUBIRdK i8:$src1, (imm8_neg_XFORM imm:$src2))>;
diff --git a/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h b/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
index 5226e30491c3..5432fac122ef 100644
--- a/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
+++ b/llvm/lib/Target/AVR/AVRMachineFunctionInfo.h
@@ -31,6 +31,12 @@ class AVRMachineFunctionInfo : public MachineFunctionInfo {
/// used inside the function.
bool HasStackArgs;
+ /// Whether or not the function is an interrupt handler.
+ bool IsInterruptHandler;
+
+  /// Whether or not the function is a non-blocking interrupt handler.
+ bool IsSignalHandler;
+
/// Size of the callee-saved register portion of the
/// stack frame in bytes.
unsigned CalleeSavedFrameSize;
@@ -41,11 +47,17 @@ class AVRMachineFunctionInfo : public MachineFunctionInfo {
public:
AVRMachineFunctionInfo()
: HasSpills(false), HasAllocas(false), HasStackArgs(false),
+ IsInterruptHandler(false), IsSignalHandler(false),
CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {}
explicit AVRMachineFunctionInfo(MachineFunction &MF)
: HasSpills(false), HasAllocas(false), HasStackArgs(false),
- CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {}
+ CalleeSavedFrameSize(0), VarArgsFrameIndex(0) {
+ unsigned CallConv = MF.getFunction().getCallingConv();
+
+ this->IsInterruptHandler = CallConv == CallingConv::AVR_INTR || MF.getFunction().hasFnAttribute("interrupt");
+ this->IsSignalHandler = CallConv == CallingConv::AVR_SIGNAL || MF.getFunction().hasFnAttribute("signal");
+ }
bool getHasSpills() const { return HasSpills; }
void setHasSpills(bool B) { HasSpills = B; }
@@ -56,6 +68,12 @@ public:
bool getHasStackArgs() const { return HasStackArgs; }
void setHasStackArgs(bool B) { HasStackArgs = B; }
+ /// Checks if the function is some form of interrupt service routine.
+ bool isInterruptOrSignalHandler() const { return isInterruptHandler() || isSignalHandler(); }
+
+ bool isInterruptHandler() const { return IsInterruptHandler; }
+ bool isSignalHandler() const { return IsSignalHandler; }
+
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned Bytes) { CalleeSavedFrameSize = Bytes; }
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
index 8fce05c933bc..2a4905ce2461 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.cpp
@@ -22,6 +22,7 @@
#include "AVR.h"
#include "AVRInstrInfo.h"
+#include "AVRMachineFunctionInfo.h"
#include "AVRTargetMachine.h"
#include "MCTargetDesc/AVRMCTargetDesc.h"
@@ -34,19 +35,21 @@ AVRRegisterInfo::AVRRegisterInfo() : AVRGenRegisterInfo(0) {}
const uint16_t *
AVRRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- CallingConv::ID CC = MF->getFunction().getCallingConv();
+ const AVRMachineFunctionInfo *AFI = MF->getInfo<AVRMachineFunctionInfo>();
- return ((CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL)
+ return AFI->isInterruptOrSignalHandler()
? CSR_Interrupts_SaveList
- : CSR_Normal_SaveList);
+ : CSR_Normal_SaveList;
}
const uint32_t *
AVRRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
- return ((CC == CallingConv::AVR_INTR || CC == CallingConv::AVR_SIGNAL)
+ const AVRMachineFunctionInfo *AFI = MF.getInfo<AVRMachineFunctionInfo>();
+
+ return AFI->isInterruptOrSignalHandler()
? CSR_Interrupts_RegMask
- : CSR_Normal_RegMask);
+ : CSR_Normal_RegMask;
}
BitVector AVRRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
@@ -95,7 +98,8 @@ AVRRegisterInfo::getLargestLegalSuperClass(const TargetRegisterClass *RC,
}
/// Fold a frame offset shared between two add instructions into a single one.
-static void foldFrameOffset(MachineBasicBlock::iterator &II, int &Offset, unsigned DstReg) {
+static void foldFrameOffset(MachineBasicBlock::iterator &II, int &Offset,
+ Register DstReg) {
MachineInstr &MI = *II;
int Opcode = MI.getOpcode();
@@ -264,13 +268,12 @@ AVRRegisterInfo::getPointerRegClass(const MachineFunction &MF,
return &AVR::PTRDISPREGSRegClass;
}
-void AVRRegisterInfo::splitReg(unsigned Reg,
- unsigned &LoReg,
- unsigned &HiReg) const {
- assert(AVR::DREGSRegClass.contains(Reg) && "can only split 16-bit registers");
+void AVRRegisterInfo::splitReg(Register Reg, Register &LoReg,
+ Register &HiReg) const {
+ assert(AVR::DREGSRegClass.contains(Reg) && "can only split 16-bit registers");
- LoReg = getSubReg(Reg, AVR::sub_lo);
- HiReg = getSubReg(Reg, AVR::sub_hi);
+ LoReg = getSubReg(Reg, AVR::sub_lo);
+ HiReg = getSubReg(Reg, AVR::sub_hi);
}
bool AVRRegisterInfo::shouldCoalesce(MachineInstr *MI,
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.h b/llvm/lib/Target/AVR/AVRRegisterInfo.h
index 8e6e63af3d57..23439f2fe195 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.h
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.h
@@ -49,11 +49,7 @@ public:
/// Splits a 16-bit `DREGS` register into the lo/hi register pair.
/// \param Reg A 16-bit register to split.
- void splitReg(unsigned Reg, unsigned &LoReg, unsigned &HiReg) const;
-
- bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
- return true;
- }
+ void splitReg(Register Reg, Register &LoReg, Register &HiReg) const;
bool shouldCoalesce(MachineInstr *MI,
const TargetRegisterClass *SrcRC,
diff --git a/llvm/lib/Target/AVR/AVRRegisterInfo.td b/llvm/lib/Target/AVR/AVRRegisterInfo.td
index ea38fedd22ce..ab5d02356c9d 100644
--- a/llvm/lib/Target/AVR/AVRRegisterInfo.td
+++ b/llvm/lib/Target/AVR/AVRRegisterInfo.td
@@ -103,6 +103,17 @@ CoveredBySubRegs = 1 in
def R5R4 : AVRReg<4, "r5:r4", [R4, R5]>, DwarfRegNum<[4]>;
def R3R2 : AVRReg<2, "r3:r2", [R2, R3]>, DwarfRegNum<[2]>;
def R1R0 : AVRReg<0, "r1:r0", [R0, R1]>, DwarfRegNum<[0]>;
+
+ // Pseudo registers for unaligned i16
+ def R26R25 : AVRReg<25, "r26:r25", [R25, R26]>, DwarfRegNum<[25]>;
+ def R24R23 : AVRReg<23, "r24:r23", [R23, R24]>, DwarfRegNum<[23]>;
+ def R22R21 : AVRReg<21, "r22:r21", [R21, R22]>, DwarfRegNum<[21]>;
+ def R20R19 : AVRReg<19, "r20:r19", [R19, R20]>, DwarfRegNum<[19]>;
+ def R18R17 : AVRReg<17, "r18:r17", [R17, R18]>, DwarfRegNum<[17]>;
+ def R16R15 : AVRReg<15, "r16:r15", [R15, R16]>, DwarfRegNum<[15]>;
+ def R14R13 : AVRReg<13, "r14:r13", [R13, R14]>, DwarfRegNum<[13]>;
+ def R12R11 : AVRReg<11, "r12:r11", [R11, R12]>, DwarfRegNum<[11]>;
+ def R10R9 : AVRReg<9, "r10:r9", [R9, R10]>, DwarfRegNum<[9]>;
}
//===----------------------------------------------------------------------===//
@@ -153,6 +164,22 @@ def DREGS : RegisterClass<"AVR", [i16], 8,
R31R30, R27R26,
// Callee saved registers.
R29R28, R17R16, R15R14, R13R12, R11R10,
+ R9R8, R7R6, R5R4, R3R2, R1R0,
+ // Pseudo regs for unaligned 16-bits
+ R26R25, R24R23, R22R21,
+ R20R19, R18R17, R16R15,
+ R14R13, R12R11, R10R9
+ )>;
+
+// 16-bit pair register class for movw
+def DREGSMOVW : RegisterClass<"AVR", [i16], 8,
+ (
+ // Return value and arguments.
+ add R25R24, R19R18, R21R20, R23R22,
+ // Scratch registers.
+ R31R30, R27R26,
+ // Callee saved registers.
+ R29R28, R17R16, R15R14, R13R12, R11R10,
R9R8, R7R6, R5R4, R3R2, R1R0
)>;
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.cpp b/llvm/lib/Target/AVR/AVRSubtarget.cpp
index 6a41036fdd6c..195ca95bc3bd 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.cpp
+++ b/llvm/lib/Target/AVR/AVRSubtarget.cpp
@@ -29,16 +29,19 @@ namespace llvm {
AVRSubtarget::AVRSubtarget(const Triple &TT, const std::string &CPU,
const std::string &FS, const AVRTargetMachine &TM)
- : AVRGenSubtargetInfo(TT, CPU, FS), InstrInfo(), FrameLowering(),
- TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo(),
+ : AVRGenSubtargetInfo(TT, CPU, FS), ELFArch(0),
// Subtarget features
m_hasSRAM(false), m_hasJMPCALL(false), m_hasIJMPCALL(false),
m_hasEIJMPCALL(false), m_hasADDSUBIW(false), m_hasSmallStack(false),
- m_hasMOVW(false), m_hasLPM(false), m_hasLPMX(false), m_hasELPM(false),
+ m_hasMOVW(false), m_hasLPM(false), m_hasLPMX(false), m_hasELPM(false),
m_hasELPMX(false), m_hasSPM(false), m_hasSPMX(false), m_hasDES(false),
m_supportsRMW(false), m_supportsMultiplication(false), m_hasBREAK(false),
- m_hasTinyEncoding(false), ELFArch(false), m_FeatureSetDummy(false) {
+ m_hasTinyEncoding(false), m_hasMemMappedGPR(false),
+ m_FeatureSetDummy(false),
+
+ InstrInfo(), FrameLowering(),
+ TLInfo(TM, initializeSubtargetDependencies(CPU, FS, TM)), TSInfo() {
// Parse features string.
ParseSubtargetFeatures(CPU, FS);
}
diff --git a/llvm/lib/Target/AVR/AVRSubtarget.h b/llvm/lib/Target/AVR/AVRSubtarget.h
index da9289af7c8d..81d883eb30d9 100644
--- a/llvm/lib/Target/AVR/AVRSubtarget.h
+++ b/llvm/lib/Target/AVR/AVRSubtarget.h
@@ -71,6 +71,9 @@ public:
bool supportsMultiplication() const { return m_supportsMultiplication; }
bool hasBREAK() const { return m_hasBREAK; }
bool hasTinyEncoding() const { return m_hasTinyEncoding; }
+ bool hasMemMappedGPR() const { return m_hasMemMappedGPR; }
+
+ uint8_t getIORegisterOffset() const { return hasMemMappedGPR() ? 0x20 : 0x0; }
/// Gets the ELF architecture for the e_flags field
/// of an ELF object file.
@@ -81,10 +84,9 @@ public:
}
private:
- AVRInstrInfo InstrInfo;
- AVRFrameLowering FrameLowering;
- AVRTargetLowering TLInfo;
- AVRSelectionDAGInfo TSInfo;
+
+ /// The ELF e_flags architecture.
+ unsigned ELFArch;
// Subtarget feature settings
// See AVR.td for details.
@@ -106,13 +108,16 @@ private:
bool m_supportsMultiplication;
bool m_hasBREAK;
bool m_hasTinyEncoding;
-
- /// The ELF e_flags architecture.
- unsigned ELFArch;
+ bool m_hasMemMappedGPR;
// Dummy member, used by FeatureSet's. We cannot have a SubtargetFeature with
// no variable, so we instead bind pseudo features to this variable.
bool m_FeatureSetDummy;
+
+ AVRInstrInfo InstrInfo;
+ AVRFrameLowering FrameLowering;
+ AVRTargetLowering TLInfo;
+ AVRSelectionDAGInfo TSInfo;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AVR/AVRTargetMachine.cpp b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
index b33284b73d63..0c7136e6f77e 100644
--- a/llvm/lib/Target/AVR/AVRTargetMachine.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetMachine.cpp
@@ -49,7 +49,7 @@ AVRTargetMachine::AVRTargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, AVRDataLayout, TT, getCPU(CPU), FS, Options,
getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- SubTarget(TT, getCPU(CPU), FS, *this) {
+ SubTarget(TT, std::string(getCPU(CPU)), std::string(FS), *this) {
this->TLOF = std::make_unique<AVRTargetObjectFile>();
initAsmInfo();
}
diff --git a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
index 980096a09835..14206cdb8276 100644
--- a/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
+++ b/llvm/lib/Target/AVR/AVRTargetObjectFile.cpp
@@ -30,7 +30,7 @@ AVRTargetObjectFile::SelectSectionForGlobal(const GlobalObject *GO,
const TargetMachine &TM) const {
// Global values in flash memory are placed in the progmem.data section
// unless they already have a user assigned section.
- if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection())
+ if (AVR::isProgramMemoryAddress(GO) && !GO->hasSection() && Kind.isReadOnly())
return ProgmemDataSection;
// Otherwise, we work the same way as ELF.
diff --git a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
index fc34583ae573..230bc7adc07a 100644
--- a/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
+++ b/llvm/lib/Target/AVR/AsmParser/AVRAsmParser.cpp
@@ -53,6 +53,8 @@ class AVRAsmParser : public MCTargetAsmParser {
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -64,7 +66,7 @@ class AVRAsmParser : public MCTargetAsmParser {
bool parseOperand(OperandVector &Operands);
int parseRegisterName(unsigned (*matchFn)(StringRef));
int parseRegisterName();
- int parseRegister();
+ int parseRegister(bool RestoreOnFailure = false);
bool tryParseRegisterOperand(OperandVector &Operands);
bool tryParseExpression(OperandVector &Operands);
bool tryParseRelocExpression(OperandVector &Operands);
@@ -176,10 +178,10 @@ public:
return isUInt<8>(Value);
}
- bool isReg() const { return Kind == k_Register; }
- bool isImm() const { return Kind == k_Immediate; }
- bool isToken() const { return Kind == k_Token; }
- bool isMem() const { return Kind == k_Memri; }
+ bool isReg() const override { return Kind == k_Register; }
+ bool isImm() const override { return Kind == k_Immediate; }
+ bool isToken() const override { return Kind == k_Token; }
+ bool isMem() const override { return Kind == k_Memri; }
bool isMemri() const { return Kind == k_Memri; }
StringRef getToken() const {
@@ -187,7 +189,7 @@ public:
return Tok;
}
- unsigned getReg() const {
+ unsigned getReg() const override {
assert((Kind == k_Register || Kind == k_Memri) && "Invalid access!");
return RegImm.Reg;
@@ -237,10 +239,10 @@ public:
RegImm = {RegNo, Imm};
}
- SMLoc getStartLoc() const { return Start; }
- SMLoc getEndLoc() const { return End; }
+ SMLoc getStartLoc() const override { return Start; }
+ SMLoc getEndLoc() const override { return End; }
- virtual void print(raw_ostream &O) const {
+ void print(raw_ostream &O) const override {
switch (Kind) {
case k_Token:
O << "Token: \"" << getToken() << "\"";
@@ -307,7 +309,7 @@ bool AVRAsmParser::missingFeature(llvm::SMLoc const &Loc,
bool AVRAsmParser::emit(MCInst &Inst, SMLoc const &Loc, MCStreamer &Out) const {
Inst.setLoc(Loc);
- Out.EmitInstruction(Inst, STI);
+ Out.emitInstruction(Inst, STI);
return false;
}
@@ -359,19 +361,25 @@ int AVRAsmParser::parseRegisterName() {
return RegNum;
}
-int AVRAsmParser::parseRegister() {
+int AVRAsmParser::parseRegister(bool RestoreOnFailure) {
int RegNum = AVR::NoRegister;
if (Parser.getTok().is(AsmToken::Identifier)) {
// Check for register pair syntax
if (Parser.getLexer().peekTok().is(AsmToken::Colon)) {
+ AsmToken HighTok = Parser.getTok();
Parser.Lex();
+ AsmToken ColonTok = Parser.getTok();
Parser.Lex(); // Eat high (odd) register and colon
if (Parser.getTok().is(AsmToken::Identifier)) {
// Convert lower (even) register to DREG
RegNum = toDREG(parseRegisterName());
}
+ if (RegNum == AVR::NoRegister && RestoreOnFailure) {
+ getLexer().UnLex(std::move(ColonTok));
+ getLexer().UnLex(std::move(HighTok));
+ }
} else {
RegNum = parseRegisterName();
}
@@ -580,12 +588,24 @@ AVRAsmParser::parseMemriOperand(OperandVector &Operands) {
bool AVRAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
StartLoc = Parser.getTok().getLoc();
- RegNo = parseRegister();
+ RegNo = parseRegister(/*RestoreOnFailure=*/false);
EndLoc = Parser.getTok().getLoc();
return (RegNo == AVR::NoRegister);
}
+OperandMatchResultTy AVRAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ StartLoc = Parser.getTok().getLoc();
+ RegNo = parseRegister(/*RestoreOnFailure=*/true);
+ EndLoc = Parser.getTok().getLoc();
+
+ if (RegNo == AVR::NoRegister)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
void AVRAsmParser::eatComma() {
if (getLexer().is(AsmToken::Comma)) {
Parser.Lex();
@@ -650,7 +670,7 @@ bool AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
Tokens[0].getKind() == AsmToken::Minus &&
Tokens[1].getKind() == AsmToken::Identifier) {
MCSymbol *Symbol = getContext().getOrCreateSymbol(".text");
- AVRStreamer.EmitValueForModiferKind(Symbol, SizeInBytes, L,
+ AVRStreamer.emitValueForModiferKind(Symbol, SizeInBytes, L,
AVRMCExpr::VK_AVR_None);
return false;
}
@@ -668,7 +688,7 @@ bool AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
}
MCSymbol *Symbol =
getContext().getOrCreateSymbol(Parser.getTok().getString());
- AVRStreamer.EmitValueForModiferKind(Symbol, SizeInBytes, L, ModifierKind);
+ AVRStreamer.emitValueForModiferKind(Symbol, SizeInBytes, L, ModifierKind);
return false;
}
@@ -676,7 +696,7 @@ bool AVRAsmParser::parseLiteralValues(unsigned SizeInBytes, SMLoc L) {
const MCExpr *Value;
if (Parser.parseExpression(Value))
return true;
- Parser.getStreamer().EmitValue(Value, SizeInBytes, L);
+ Parser.getStreamer().emitValue(Value, SizeInBytes, L);
return false;
};
return (parseMany(parseOne));
diff --git a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
index 694aee818f7c..8e7251a74dfd 100644
--- a/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
+++ b/llvm/lib/Target/AVR/Disassembler/AVRDisassembler.cpp
@@ -57,23 +57,180 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeAVRDisassembler() {
createAVRDisassembler);
}
+static const uint16_t GPRDecoderTable[] = {
+ AVR::R0, AVR::R1, AVR::R2, AVR::R3,
+ AVR::R4, AVR::R5, AVR::R6, AVR::R7,
+ AVR::R8, AVR::R9, AVR::R10, AVR::R11,
+ AVR::R12, AVR::R13, AVR::R14, AVR::R15,
+ AVR::R16, AVR::R17, AVR::R18, AVR::R19,
+ AVR::R20, AVR::R21, AVR::R22, AVR::R23,
+ AVR::R24, AVR::R25, AVR::R26, AVR::R27,
+ AVR::R28, AVR::R29, AVR::R30, AVR::R31,
+};
+
static DecodeStatus DecodeGPR8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
+ if (RegNo > 31)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GPRDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Register));
return MCDisassembler::Success;
}
static DecodeStatus DecodeLD8RegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
+ if (RegNo > 15)
+ return MCDisassembler::Fail;
+
+ unsigned Register = GPRDecoderTable[RegNo+16];
+ Inst.addOperand(MCOperand::createReg(Register));
return MCDisassembler::Success;
}
static DecodeStatus DecodePTRREGSRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t Address, const void *Decoder) {
+ // Note: this function must be defined but does not seem to be called.
+ assert(false && "unimplemented: PTRREGS register class");
return MCDisassembler::Success;
}
+static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
+static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder);
+
#include "AVRGenDisassemblerTables.inc"
+static DecodeStatus decodeFIOARr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
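+  // The 6-bit I/O port address is split across bits 0-3 and 9-10 of the
+  // encoding; reassemble it before the register field.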
+ unsigned addr = 0;
+ addr |= fieldFromInstruction(Insn, 0, 4);
+ addr |= fieldFromInstruction(Insn, 9, 2) << 4;
+ unsigned reg = fieldFromInstruction(Insn, 4, 5);
+ Inst.addOperand(MCOperand::createImm(addr));
+ if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFIORdA(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned addr = 0;
+ addr |= fieldFromInstruction(Insn, 0, 4);
+ addr |= fieldFromInstruction(Insn, 9, 2) << 4;
+ unsigned reg = fieldFromInstruction(Insn, 4, 5);
+ if (DecodeGPR8RegisterClass(Inst, reg, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(addr));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFIOBIT(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned addr = fieldFromInstruction(Insn, 3, 5);
+ unsigned b = fieldFromInstruction(Insn, 0, 3);
+ Inst.addOperand(MCOperand::createImm(addr));
+ Inst.addOperand(MCOperand::createImm(b));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeCallTarget(MCInst &Inst, unsigned Field,
+ uint64_t Address, const void *Decoder) {
+ // Call targets need to be shifted left by one so this needs a custom
+ // decoder.
+ Inst.addOperand(MCOperand::createImm(Field << 1));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFRd(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned d = fieldFromInstruction(Insn, 4, 5);
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFLPMX(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ if (decodeFRd(Inst, Insn, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(AVR::R31R30));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFFMULRdRr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
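+  // The FMUL family only addresses r16-r23, hence the +16 register bias.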
+ unsigned d = fieldFromInstruction(Insn, 4, 3) + 16;
+ unsigned r = fieldFromInstruction(Insn, 0, 3) + 16;
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFMOVWRdRr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
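+  // MOVW operates on register pairs; the 4-bit fields hold the pair index,
+  // so scale by two to recover the even (low) register number.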
+ unsigned r = fieldFromInstruction(Insn, 4, 4) * 2;
+ unsigned d = fieldFromInstruction(Insn, 0, 4) * 2;
+ if (DecodeGPR8RegisterClass(Inst, r, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFWRdK(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned d = fieldFromInstruction(Insn, 4, 2) * 2 + 24; // starts at r24:r25
+ unsigned k = 0;
+ k |= fieldFromInstruction(Insn, 0, 4);
+ k |= fieldFromInstruction(Insn, 6, 2) << 4;
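+  // ADIW/SBIW use the same register pair as both destination and source,
+  // so the register operand is added twice.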
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ if (DecodeGPR8RegisterClass(Inst, d, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(k));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeFMUL2RdRr(MCInst &Inst, unsigned Insn,
+ uint64_t Address, const void *Decoder) {
+ unsigned rd = fieldFromInstruction(Insn, 4, 4) + 16;
+ unsigned rr = fieldFromInstruction(Insn, 0, 4) + 16;
+ if (DecodeGPR8RegisterClass(Inst, rd, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ if (DecodeGPR8RegisterClass(Inst, rr, Address, Decoder) == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+ return MCDisassembler::Success;
+}
+
static DecodeStatus readInstruction16(ArrayRef<uint8_t> Bytes, uint64_t Address,
uint64_t &Size, uint32_t &Insn) {
if (Bytes.size() < 2) {
@@ -96,7 +253,7 @@ static DecodeStatus readInstruction32(ArrayRef<uint8_t> Bytes, uint64_t Address,
}
Size = 4;
- Insn = (Bytes[0] << 0) | (Bytes[1] << 8) | (Bytes[2] << 16) | (Bytes[3] << 24);
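+  // A wide AVR instruction is two little-endian 16-bit words; place the
+  // first word in the upper half of Insn.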
+ Insn = (Bytes[0] << 16) | (Bytes[1] << 24) | (Bytes[2] << 0) | (Bytes[3] << 8);
return MCDisassembler::Success;
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
index e92b16c8ee9d..ac72abe0d9f6 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.cpp
@@ -34,8 +34,9 @@ namespace adjust {
using namespace llvm;
-void signed_width(unsigned Width, uint64_t Value, std::string Description,
- const MCFixup &Fixup, MCContext *Ctx = nullptr) {
+static void signed_width(unsigned Width, uint64_t Value,
+ std::string Description, const MCFixup &Fixup,
+ MCContext *Ctx = nullptr) {
if (!isIntN(Width, Value)) {
std::string Diagnostic = "out of range " + Description;
@@ -53,8 +54,9 @@ void signed_width(unsigned Width, uint64_t Value, std::string Description,
}
}
-void unsigned_width(unsigned Width, uint64_t Value, std::string Description,
- const MCFixup &Fixup, MCContext *Ctx = nullptr) {
+static void unsigned_width(unsigned Width, uint64_t Value,
+ std::string Description, const MCFixup &Fixup,
+ MCContext *Ctx = nullptr) {
if (!isUIntN(Width, Value)) {
std::string Diagnostic = "out of range " + Description;
@@ -72,8 +74,8 @@ void unsigned_width(unsigned Width, uint64_t Value, std::string Description,
}
/// Adjusts the value of a branch target before fixup application.
-void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
// We have one extra bit of precision because the value is rightshifted by
// one.
unsigned_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx);
@@ -83,14 +85,12 @@ void adjustBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
}
/// Adjusts the value of a relative branch target before fixup application.
-void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup,
+ uint64_t &Value, MCContext *Ctx = nullptr) {
// We have one extra bit of precision because the value is rightshifted by
// one.
signed_width(Size + 1, Value, std::string("branch target"), Fixup, Ctx);
- Value -= 2;
-
// Rightshifts the value by one.
AVR::fixups::adjustBranchTarget(Value);
}
@@ -101,8 +101,8 @@ void adjustRelativeBranch(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
/// 1001 kkkk 010k kkkk kkkk kkkk 111k kkkk
///
/// Offset of 0 (so the result is left shifted by 3 bits before application).
-void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
adjustBranch(Size, Fixup, Value, Ctx);
auto top = Value & (0xf00000 << 6); // the top four bits
@@ -117,8 +117,8 @@ void fixup_call(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
/// Resolves to:
/// 0000 00kk kkkk k000
/// Offset of 0 (so the result is left shifted by 3 bits before application).
-void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
adjustRelativeBranch(Size, Fixup, Value, Ctx);
// Because the value may be negative, we must mask out the sign bits
@@ -131,21 +131,33 @@ void fixup_7_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
/// Resolves to:
/// 0000 kkkk kkkk kkkk
/// Offset of 0 (so the result isn't left-shifted before application).
-void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup_13_pcrel(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
adjustRelativeBranch(Size, Fixup, Value, Ctx);
// Because the value may be negative, we must mask out the sign bits
Value &= 0xfff;
}
+/// 6-bit fixup for the immediate operand of the STD/LDD family of
+/// instructions.
+///
+/// Resolves to:
+/// 10q0 qq10 0000 1qqq
+static void fixup_6(const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
+ unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx);
+
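+  // Scatter the 6-bit displacement into bits 13, 11:10 and 2:0 of the
+  // instruction word (10q0 qq10 0000 1qqq).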
+ Value = ((Value & 0x20) << 8) | ((Value & 0x18) << 7) | (Value & 0x07);
+}
+
/// 6-bit fixup for the immediate operand of the ADIW family of
/// instructions.
///
/// Resolves to:
/// 0000 0000 kk00 kkkk
-void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
unsigned_width(6, Value, std::string("immediate"), Fixup, Ctx);
Value = ((Value & 0x30) << 2) | (Value & 0x0f);
@@ -155,8 +167,8 @@ void fixup_6_adiw(const MCFixup &Fixup, uint64_t &Value,
///
/// Resolves to:
/// 0000 0000 AAAA A000
-void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
unsigned_width(5, Value, std::string("port number"), Fixup, Ctx);
Value &= 0x1f;
@@ -168,8 +180,8 @@ void fixup_port5(const MCFixup &Fixup, uint64_t &Value,
///
/// Resolves to:
/// 1011 0AAd dddd AAAA
-void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
unsigned_width(6, Value, std::string("port number"), Fixup, Ctx);
Value = ((Value & 0x30) << 5) | (Value & 0x0f);
@@ -177,9 +189,7 @@ void fixup_port6(const MCFixup &Fixup, uint64_t &Value,
/// Adjusts a program memory address.
/// This is a simple right-shift.
-void pm(uint64_t &Value) {
- Value >>= 1;
-}
+static void pm(uint64_t &Value) { Value >>= 1; }
/// Fixups relating to the LDI instruction.
namespace ldi {
@@ -189,36 +199,36 @@ namespace ldi {
/// Resolves to:
/// 0000 KKKK 0000 KKKK
/// Offset of 0 (so the result isn't left-shifted before application).
-void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void fixup(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
uint64_t upper = Value & 0xf0;
uint64_t lower = Value & 0x0f;
Value = (upper << 4) | lower;
}
-void neg(uint64_t &Value) { Value *= -1; }
+static void neg(uint64_t &Value) { Value *= -1; }
-void lo8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void lo8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
Value &= 0xff;
ldi::fixup(Size, Fixup, Value, Ctx);
}
-void hi8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void hi8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
Value = (Value & 0xff00) >> 8;
ldi::fixup(Size, Fixup, Value, Ctx);
}
-void hh8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void hh8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
Value = (Value & 0xff0000) >> 16;
ldi::fixup(Size, Fixup, Value, Ctx);
}
-void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
- MCContext *Ctx = nullptr) {
+static void ms8(unsigned Size, const MCFixup &Fixup, uint64_t &Value,
+ MCContext *Ctx = nullptr) {
Value = (Value & 0xff000000) >> 24;
ldi::fixup(Size, Fixup, Value, Ctx);
}
@@ -237,17 +247,6 @@ void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
uint64_t Size = AVRAsmBackend::getFixupKindInfo(Fixup.getKind()).TargetSize;
unsigned Kind = Fixup.getKind();
-
- // Parsed LLVM-generated temporary labels are already
- // adjusted for instruction size, but normal labels aren't.
- //
- // To handle both cases, we simply un-adjust the temporary label
- // case so it acts like all other labels.
- if (const MCSymbolRefExpr *A = Target.getSymA()) {
- if (A->getSymbol().isTemporary())
- Value += 2;
- }
-
switch (Kind) {
default:
llvm_unreachable("unhandled fixup");
@@ -326,6 +325,9 @@ void AVRAsmBackend::adjustFixupValue(const MCFixup &Fixup,
Value &= 0xffff;
break;
+ case AVR::fixup_6:
+ adjust::fixup_6(Fixup, Value, Ctx);
+ break;
case AVR::fixup_6_adiw:
adjust::fixup_6_adiw(Fixup, Value, Ctx);
break;
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
index 1e713db38145..9e150f120dd4 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRAsmBackend.h
@@ -22,9 +22,6 @@
namespace llvm {
class MCAssembler;
-class MCObjectWriter;
-class Target;
-
struct MCFixupKindInfo;
/// Utilities for manipulating generated AVR machine code.
@@ -62,9 +59,6 @@ public:
return false;
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
index b3504b89e4d3..a0dd1dc8ac3e 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRFixupKinds.h
@@ -137,7 +137,7 @@ namespace fixups {
/// of the fact that all instructions are aligned to addresses of size
/// 2, so bit 0 of an address is always 0. This gives us another bit
/// of precision.
-/// \param[in,out] The target to adjust.
+/// \param [in,out] val The target to adjust.
template <typename T> inline void adjustBranchTarget(T &val) { val >>= 1; }
} // end of namespace fixups
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
index 832112406155..42fac5e2e000 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.cpp
@@ -78,7 +78,7 @@ void AVRInstPrinter::printInst(const MCInst *MI, uint64_t Address,
printOperand(MI, 2, O);
break;
default:
- if (!printAliasInstr(MI, O))
+ if (!printAliasInstr(MI, Address, O))
printInstruction(MI, Address, O);
printAnnotation(O, Annot);
@@ -100,8 +100,25 @@ const char *AVRInstPrinter::getPrettyRegisterName(unsigned RegNum,
void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
- const MCOperand &Op = MI->getOperand(OpNo);
const MCOperandInfo &MOI = this->MII.get(MI->getOpcode()).OpInfo[OpNo];
+ if (MOI.RegClass == AVR::ZREGRegClassID) {
+ // Special case for the Z register, which sometimes doesn't have an operand
+ // in the MCInst.
+ O << "Z";
+ return;
+ }
+
+ if (OpNo >= MI->size()) {
+ // Not all operands are correctly disassembled at the moment. This means
+ // that some machine instructions won't have all the necessary operands
+ // set.
+ // To avoid asserting, print <unknown> instead until the necessary support
+ // has been implemented.
+ O << "<unknown>";
+ return;
+ }
+
+ const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
bool isPtrReg = (MOI.RegClass == AVR::PTRREGSRegClassID) ||
@@ -114,7 +131,7 @@ void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
O << getPrettyRegisterName(Op.getReg(), MRI);
}
} else if (Op.isImm()) {
- O << Op.getImm();
+ O << formatImm(Op.getImm());
} else {
assert(Op.isExpr() && "Unknown operand kind in printOperand");
O << *Op.getExpr();
@@ -125,6 +142,16 @@ void AVRInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
/// being encoded as a pc-relative value.
void AVRInstPrinter::printPCRelImm(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
+ if (OpNo >= MI->size()) {
+ // Not all operands are correctly disassembled at the moment. This means
+ // that some machine instructions won't have all the necessary operands
+ // set.
+ // To avoid asserting, print <unknown> instead until the necessary support
+ // has been implemented.
+ O << "<unknown>";
+ return;
+ }
+
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isImm()) {
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
index 247e9fc83989..910fd3455dee 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRInstPrinter.h
@@ -38,13 +38,18 @@ private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, uint64_t /*Address*/, unsigned OpNo,
+ raw_ostream &O) {
+ printPCRelImm(MI, OpNo, O);
+ }
void printMemri(const MCInst *MI, unsigned OpNo, raw_ostream &O);
// Autogenerated by TableGen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, raw_ostream &O);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
index c25a2b232013..b11ee42bfcd6 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCAsmInfo.cpp
@@ -21,8 +21,8 @@ AVRMCAsmInfo::AVRMCAsmInfo(const Triple &TT, const MCTargetOptions &Options) {
CalleeSaveStackSlotSize = 2;
CommentString = ";";
PrivateGlobalPrefix = ".L";
+ PrivateLabelPrefix = ".L";
UsesELFSectionDirectiveForBSS = true;
- UseIntegratedAssembler = true;
SupportsDebugInformation = true;
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
index d9169f90a765..77b49931843b 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.cpp
@@ -20,7 +20,7 @@
using namespace llvm;
-void AVRMCELFStreamer::EmitValueForModiferKind(
+void AVRMCELFStreamer::emitValueForModiferKind(
const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc,
AVRMCExpr::VariantKind ModifierKind) {
MCSymbolRefExpr::VariantKind Kind = MCSymbolRefExpr::VK_AVR_NONE;
@@ -36,7 +36,7 @@ void AVRMCELFStreamer::EmitValueForModiferKind(
Kind = MCSymbolRefExpr::VK_AVR_HI8;
else if (ModifierKind == AVRMCExpr::VK_AVR_HH8)
Kind = MCSymbolRefExpr::VK_AVR_HLO8;
- MCELFStreamer::EmitValue(MCSymbolRefExpr::create(Sym, Kind, getContext()),
+ MCELFStreamer::emitValue(MCSymbolRefExpr::create(Sym, Kind, getContext()),
SizeInBytes, Loc);
}
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
index 37a610bc4248..1d05b8d56d93 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCELFStreamer.h
@@ -41,7 +41,7 @@ public:
std::move(Emitter)),
MCII(createAVRMCInstrInfo()) {}
- void EmitValueForModiferKind(
+ void emitValueForModiferKind(
const MCSymbol *Sym, unsigned SizeInBytes, SMLoc Loc = SMLoc(),
AVRMCExpr::VariantKind ModifierKind = AVRMCExpr::VK_AVR_None);
};
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
index 470db01ff468..ef116793d326 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRMCTargetDesc.h
@@ -27,10 +27,7 @@ class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
-class StringRef;
class Target;
-class Triple;
-class raw_pwrite_stream;
MCInstrInfo *createAVRMCInstrInfo();
diff --git a/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp b/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
index 3487a2bbb864..eccd343d79ab 100644
--- a/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
+++ b/llvm/lib/Target/AVR/MCTargetDesc/AVRTargetStreamer.cpp
@@ -32,11 +32,11 @@ void AVRTargetStreamer::finish() {
OS.emitRawComment(" Declaring this symbol tells the CRT that it should");
OS.emitRawComment("copy all variables from program memory to RAM on startup");
- OS.EmitSymbolAttribute(DoCopyData, MCSA_Global);
+ OS.emitSymbolAttribute(DoCopyData, MCSA_Global);
OS.emitRawComment(" Declaring this symbol tells the CRT that it should");
OS.emitRawComment("clear the zeroed data section on startup");
- OS.EmitSymbolAttribute(DoClearBss, MCSA_Global);
+ OS.emitSymbolAttribute(DoClearBss, MCSA_Global);
}
} // end namespace llvm
diff --git a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
index 1f5d5025bc7b..57488bc28f98 100644
--- a/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
+++ b/llvm/lib/Target/BPF/AsmParser/BPFAsmParser.cpp
@@ -39,6 +39,8 @@ class BPFAsmParser : public MCTargetAsmParser {
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -295,7 +297,7 @@ bool BPFAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
break;
case Match_Success:
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
@@ -322,6 +324,14 @@ bool BPFAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
bool BPFAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ if (tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success)
+ return Error(StartLoc, "invalid register name");
+ return false;
+}
+
+OperandMatchResultTy BPFAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
const AsmToken &Tok = getParser().getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
@@ -330,10 +340,10 @@ bool BPFAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
if (!MatchRegisterName(Name)) {
getParser().Lex(); // Eat identifier token.
- return false;
+ return MatchOperand_Success;
}
- return Error(StartLoc, "invalid register name");
+ return MatchOperand_NoMatch;
}
OperandMatchResultTy
diff --git a/llvm/lib/Target/BPF/BPF.h b/llvm/lib/Target/BPF/BPF.h
index 6e4f35f4c5d7..4a46b11e5e08 100644
--- a/llvm/lib/Target/BPF/BPF.h
+++ b/llvm/lib/Target/BPF/BPF.h
@@ -16,6 +16,7 @@ namespace llvm {
class BPFTargetMachine;
ModulePass *createBPFAbstractMemberAccess(BPFTargetMachine *TM);
+ModulePass *createBPFPreserveDIType();
FunctionPass *createBPFISelDag(BPFTargetMachine &TM);
FunctionPass *createBPFMISimplifyPatchablePass();
@@ -25,6 +26,7 @@ FunctionPass *createBPFMIPreEmitPeepholePass();
FunctionPass *createBPFMIPreEmitCheckingPass();
void initializeBPFAbstractMemberAccessPass(PassRegistry&);
+void initializeBPFPreserveDITypePass(PassRegistry&);
void initializeBPFMISimplifyPatchablePass(PassRegistry&);
void initializeBPFMIPeepholePass(PassRegistry&);
void initializeBPFMIPeepholeTruncElimPass(PassRegistry&);
diff --git a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
index a28816cc87b7..16708c4d1ce6 100644
--- a/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
+++ b/llvm/lib/Target/BPF/BPFAbstractMemberAccess.cpp
@@ -92,7 +92,7 @@
#define DEBUG_TYPE "bpf-abstract-member-access"
namespace llvm {
-const std::string BPFCoreSharedInfo::AmaAttr = "btf_ama";
+constexpr StringRef BPFCoreSharedInfo::AmaAttr;
} // namespace llvm
using namespace llvm;
@@ -117,7 +117,7 @@ public:
struct CallInfo {
uint32_t Kind;
uint32_t AccessIndex;
- uint32_t RecordAlignment;
+ Align RecordAlignment;
MDNode *Metadata;
Value *Base;
};
@@ -157,11 +157,11 @@ private:
void replaceWithGEP(std::vector<CallInst *> &CallList,
uint32_t NumOfZerosIndex, uint32_t DIIndex);
bool HasPreserveFieldInfoCall(CallInfoStack &CallStack);
- void GetStorageBitRange(DIDerivedType *MemberTy, uint32_t RecordAlignment,
+ void GetStorageBitRange(DIDerivedType *MemberTy, Align RecordAlignment,
uint32_t &StartBitOffset, uint32_t &EndBitOffset);
uint32_t GetFieldInfo(uint32_t InfoKind, DICompositeType *CTy,
uint32_t AccessIndex, uint32_t PatchImm,
- uint32_t RecordAlignment);
+ Align RecordAlignment);
Value *computeBaseAndAccessKey(CallInst *Call, CallInfo &CInfo,
std::string &AccessKey, MDNode *&BaseMeta);
@@ -189,18 +189,20 @@ bool BPFAbstractMemberAccess::runOnModule(Module &M) {
return doTransformation(M);
}
-static bool SkipDIDerivedTag(unsigned Tag) {
+static bool SkipDIDerivedTag(unsigned Tag, bool skipTypedef) {
if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
Tag != dwarf::DW_TAG_volatile_type &&
Tag != dwarf::DW_TAG_restrict_type &&
Tag != dwarf::DW_TAG_member)
- return false;
+ return false;
+ if (Tag == dwarf::DW_TAG_typedef && !skipTypedef)
+ return false;
return true;
}
-static DIType * stripQualifiers(DIType *Ty) {
+static DIType * stripQualifiers(DIType *Ty, bool skipTypedef = true) {
while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
- if (!SkipDIDerivedTag(DTy->getTag()))
+ if (!SkipDIDerivedTag(DTy->getTag(), skipTypedef))
break;
Ty = DTy->getBaseType();
}
@@ -209,7 +211,7 @@ static DIType * stripQualifiers(DIType *Ty) {
static const DIType * stripQualifiers(const DIType *Ty) {
while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
- if (!SkipDIDerivedTag(DTy->getTag()))
+ if (!SkipDIDerivedTag(DTy->getTag(), true))
break;
Ty = DTy->getBaseType();
}
@@ -237,7 +239,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
if (!Call)
return false;
- const auto *GV = dyn_cast<GlobalValue>(Call->getCalledValue());
+ const auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
if (!GV)
return false;
if (GV->getName().startswith("llvm.preserve.array.access.index")) {
@@ -248,7 +250,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
CInfo.Base = Call->getArgOperand(0);
CInfo.RecordAlignment =
- DL->getABITypeAlignment(CInfo.Base->getType()->getPointerElementType());
+ DL->getABITypeAlign(CInfo.Base->getType()->getPointerElementType());
return true;
}
if (GV->getName().startswith("llvm.preserve.union.access.index")) {
@@ -259,7 +261,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CInfo.AccessIndex = getConstant(Call->getArgOperand(1));
CInfo.Base = Call->getArgOperand(0);
CInfo.RecordAlignment =
- DL->getABITypeAlignment(CInfo.Base->getType()->getPointerElementType());
+ DL->getABITypeAlign(CInfo.Base->getType()->getPointerElementType());
return true;
}
if (GV->getName().startswith("llvm.preserve.struct.access.index")) {
@@ -270,7 +272,7 @@ bool BPFAbstractMemberAccess::IsPreserveDIAccessIndexCall(const CallInst *Call,
CInfo.AccessIndex = getConstant(Call->getArgOperand(2));
CInfo.Base = Call->getArgOperand(0);
CInfo.RecordAlignment =
- DL->getABITypeAlignment(CInfo.Base->getType()->getPointerElementType());
+ DL->getABITypeAlign(CInfo.Base->getType()->getPointerElementType());
return true;
}
if (GV->getName().startswith("llvm.bpf.preserve.field.info")) {
@@ -520,12 +522,12 @@ uint64_t BPFAbstractMemberAccess::getConstant(const Value *IndexValue) {
/// Get the start and the end of storage offset for \p MemberTy.
void BPFAbstractMemberAccess::GetStorageBitRange(DIDerivedType *MemberTy,
- uint32_t RecordAlignment,
+ Align RecordAlignment,
uint32_t &StartBitOffset,
uint32_t &EndBitOffset) {
uint32_t MemberBitSize = MemberTy->getSizeInBits();
uint32_t MemberBitOffset = MemberTy->getOffsetInBits();
- uint32_t AlignBits = RecordAlignment * 8;
+ uint32_t AlignBits = RecordAlignment.value() * 8;
if (RecordAlignment > 8 || MemberBitSize > AlignBits)
report_fatal_error("Unsupported field expression for llvm.bpf.preserve.field.info, "
"requiring too big alignment");
@@ -541,7 +543,7 @@ uint32_t BPFAbstractMemberAccess::GetFieldInfo(uint32_t InfoKind,
DICompositeType *CTy,
uint32_t AccessIndex,
uint32_t PatchImm,
- uint32_t RecordAlignment) {
+ Align RecordAlignment) {
if (InfoKind == BPFCoreSharedInfo::FIELD_EXISTENCE)
return 1;
@@ -710,7 +712,7 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
// calculated here as all debuginfo types are available.
// Get type name and calculate the first index.
- // We only want to get type name from structure or union.
+ // We only want to get type name from typedef, structure or union.
// If user wants a relocation like
// int *p; ... __builtin_preserve_access_index(&p[4]) ...
// or
@@ -727,12 +729,15 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
if (!Base)
Base = CInfo.Base;
- DIType *Ty = stripQualifiers(cast<DIType>(CInfo.Metadata));
+ DIType *PossibleTypeDef = stripQualifiers(cast<DIType>(CInfo.Metadata),
+ false);
+ DIType *Ty = stripQualifiers(PossibleTypeDef);
if (CInfo.Kind == BPFPreserveUnionAI ||
CInfo.Kind == BPFPreserveStructAI) {
- // struct or union type
- TypeName = Ty->getName();
- TypeMeta = Ty;
+ // struct or union type. If the typedef is in the metadata, always
+ // use the typedef.
+ TypeName = std::string(PossibleTypeDef->getName());
+ TypeMeta = PossibleTypeDef;
PatchImm += FirstIndex * (Ty->getSizeInBits() >> 3);
break;
}
@@ -782,7 +787,7 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
unsigned CTag = CTy->getTag();
if (CTag == dwarf::DW_TAG_structure_type || CTag == dwarf::DW_TAG_union_type) {
- TypeName = CTy->getName();
+ TypeName = std::string(CTy->getName());
} else {
if (HasPreserveFieldInfoCall(CallStack))
report_fatal_error("Invalid field access for llvm.preserve.field.info intrinsic");
@@ -803,8 +808,10 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
CInfo = StackElem.second;
CallStack.pop();
- if (CInfo.Kind == BPFPreserveFieldInfoAI)
+ if (CInfo.Kind == BPFPreserveFieldInfoAI) {
+ InfoKind = CInfo.AccessIndex;
break;
+ }
// If the next Call (the top of the stack) is a BPFPreserveFieldInfoAI,
// the action will be extracting field info.
@@ -822,11 +829,10 @@ Value *BPFAbstractMemberAccess::computeBaseAndAccessKey(CallInst *Call,
AccessKey += ":" + std::to_string(AccessIndex);
MDNode *MDN = CInfo.Metadata;
- uint32_t RecordAlignment = CInfo.RecordAlignment;
// At this stage, it cannot be pointer type.
auto *CTy = cast<DICompositeType>(stripQualifiers(cast<DIType>(MDN)));
PatchImm = GetFieldInfo(InfoKind, CTy, AccessIndex, PatchImm,
- RecordAlignment);
+ CInfo.RecordAlignment);
}
// Access key is the
@@ -873,8 +879,8 @@ bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
if (CInfo.Kind == BPFPreserveFieldInfoAI) {
// Load the global variable which represents the returned field info.
- auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV);
- BB->getInstList().insert(Call->getIterator(), LDInst);
+ auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "",
+ Call);
Call->replaceAllUsesWith(LDInst);
Call->eraseFromParent();
return true;
@@ -891,8 +897,7 @@ bool BPFAbstractMemberAccess::transformGEPChain(Module &M, CallInst *Call,
// The original Call inst is removed.
// Load the global variable.
- auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV);
- BB->getInstList().insert(Call->getIterator(), LDInst);
+ auto *LDInst = new LoadInst(Type::getInt64Ty(BB->getContext()), GV, "", Call);
// Generate a BitCast
auto *BCInst = new BitCastInst(Base, Type::getInt8PtrTy(BB->getContext()));
diff --git a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
index b81386f479d3..37950e105bdc 100644
--- a/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
+++ b/llvm/lib/Target/BPF/BPFAsmPrinter.cpp
@@ -48,7 +48,7 @@ public:
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNum,
const char *ExtraCode, raw_ostream &O) override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
private:
BTFDebug *BTF;
@@ -137,7 +137,7 @@ bool BPFAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-void BPFAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void BPFAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
if (!BTF || !BTF->InstLower(MI, TmpInst)) {
diff --git a/llvm/lib/Target/BPF/BPFCORE.h b/llvm/lib/Target/BPF/BPFCORE.h
index ed4778353e52..af6425b16fa0 100644
--- a/llvm/lib/Target/BPF/BPFCORE.h
+++ b/llvm/lib/Target/BPF/BPFCORE.h
@@ -9,22 +9,36 @@
#ifndef LLVM_LIB_TARGET_BPF_BPFCORE_H
#define LLVM_LIB_TARGET_BPF_BPFCORE_H
+#include "llvm/ADT/StringRef.h"
+
namespace llvm {
class BPFCoreSharedInfo {
public:
- enum OffsetRelocKind : uint32_t {
+ enum PatchableRelocKind : uint32_t {
FIELD_BYTE_OFFSET = 0,
FIELD_BYTE_SIZE,
FIELD_EXISTENCE,
FIELD_SIGNEDNESS,
FIELD_LSHIFT_U64,
FIELD_RSHIFT_U64,
+ BTF_TYPE_ID_LOCAL,
+ BTF_TYPE_ID_REMOTE,
MAX_FIELD_RELOC_KIND,
};
+
+ enum BTFTypeIdFlag : uint32_t {
+ BTF_TYPE_ID_LOCAL_RELOC = 0,
+ BTF_TYPE_ID_REMOTE_RELOC,
+
+ MAX_BTF_TYPE_ID_FLAG,
+ };
+
/// The attribute attached to globals representing a field access
- static const std::string AmaAttr;
+ static constexpr StringRef AmaAttr = "btf_ama";
+ /// The attribute attached to globals representing a type id
+ static constexpr StringRef TypeIdAttr = "btf_type_id";
};
} // namespace llvm
diff --git a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
index 6f5f58554d09..d407edfbd966 100644
--- a/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
+++ b/llvm/lib/Target/BPF/BPFISelDAGToDAG.cpp
@@ -304,7 +304,7 @@ void BPFDAGToDAGISel::PreprocessLoad(SDNode *Node,
LLVM_DEBUG(dbgs() << "Replacing load of size " << size << " with constant "
<< val << '\n');
- SDValue NVal = CurDAG->getConstant(val, DL, MVT::i64);
+ SDValue NVal = CurDAG->getConstant(val, DL, LD->getValueType(0));
// After replacement, the current node is dead, we need to
// go backward one step to make iterator still work
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.cpp b/llvm/lib/Target/BPF/BPFISelLowering.cpp
index 56e0288f26c9..a02556a39909 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.cpp
+++ b/llvm/lib/Target/BPF/BPFISelLowering.cpp
@@ -171,6 +171,38 @@ bool BPFTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) cons
return false;
}
+bool BPFTargetLowering::isTruncateFree(Type *Ty1, Type *Ty2) const {
+ if (!Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool BPFTargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
+ if (!VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ return NumBits1 > NumBits2;
+}
+
+bool BPFTargetLowering::isZExtFree(Type *Ty1, Type *Ty2) const {
+ if (!getHasAlu32() || !Ty1->isIntegerTy() || !Ty2->isIntegerTy())
+ return false;
+ unsigned NumBits1 = Ty1->getPrimitiveSizeInBits();
+ unsigned NumBits2 = Ty2->getPrimitiveSizeInBits();
+ return NumBits1 == 32 && NumBits2 == 64;
+}
+
+bool BPFTargetLowering::isZExtFree(EVT VT1, EVT VT2) const {
+ if (!getHasAlu32() || !VT1.isInteger() || !VT2.isInteger())
+ return false;
+ unsigned NumBits1 = VT1.getSizeInBits();
+ unsigned NumBits2 = VT2.getSizeInBits();
+ return NumBits1 == 32 && NumBits2 == 64;
+}
+
std::pair<unsigned, const TargetRegisterClass *>
BPFTargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
StringRef Constraint,
@@ -195,6 +227,8 @@ SDValue BPFTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
return LowerGlobalAddress(Op, DAG);
case ISD::SELECT_CC:
return LowerSELECT_CC(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ report_fatal_error("Unsupported dynamic stack allocation");
default:
llvm_unreachable("unimplemented operand");
}
@@ -570,6 +604,12 @@ BPFTargetLowering::EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB,
DebugLoc DL = MI.getDebugLoc();
MachineRegisterInfo &RegInfo = F->getRegInfo();
+
+ if (!isSigned) {
+ Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
+ BuildMI(BB, DL, TII.get(BPF::MOV_32_64), PromotedReg0).addReg(Reg);
+ return PromotedReg0;
+ }
Register PromotedReg0 = RegInfo.createVirtualRegister(RC);
Register PromotedReg1 = RegInfo.createVirtualRegister(RC);
Register PromotedReg2 = RegInfo.createVirtualRegister(RC);
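The new isTruncateFree/isZExtFree hooks above report i64-to-i32 truncation as free and, under alu32, i32-to-i64 zero extension as free. A minimal sketch of the kind of source this affects, assuming a BPF target with 32-bit subregisters (alu32) enabled; the function names are illustrative, not from the patch:

unsigned int trunc_is_free(unsigned long long x) {
  // i64 -> i32: the backend can refer to the 32-bit sub-register wN of the
  // 64-bit register rN, so the truncate itself needs no instruction.
  return (unsigned int)x;
}

unsigned long long zext_is_free(unsigned int a, unsigned int b) {
  // BPF 32-bit ALU instructions zero the upper 32 bits of the destination
  // register, so widening the 32-bit sum back to i64 costs nothing either.
  return a + b;
}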
diff --git a/llvm/lib/Target/BPF/BPFISelLowering.h b/llvm/lib/Target/BPF/BPFISelLowering.h
index b81bf4e1320d..cc752dda87b0 100644
--- a/llvm/lib/Target/BPF/BPFISelLowering.h
+++ b/llvm/lib/Target/BPF/BPFISelLowering.h
@@ -99,10 +99,9 @@ private:
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &DL,
SelectionDAG &DAG) const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override {
- return Size >= 8 ? MVT::i64 : MVT::i32;
+ return Op.size() >= 8 ? MVT::i64 : MVT::i32;
}
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
@@ -110,6 +109,29 @@ private:
return true;
}
+  // Prevent reducing load width during the SelectionDAG phase.
+ // Otherwise, we may transform the following
+ // ctx = ctx + reloc_offset
+ // ... (*(u32 *)ctx) & 0x8000...
+ // to
+ // ctx = ctx + reloc_offset
+ // ... (*(u8 *)(ctx + 1)) & 0x80 ...
+ // which will be rejected by the verifier.
+ bool shouldReduceLoadWidth(SDNode *Load, ISD::LoadExtType ExtTy,
+ EVT NewVT) const override {
+ return false;
+ }
+
+  // isTruncateFree - Return true if it's free to truncate a value of
+  // type Ty1 to type Ty2. For example, on BPF in alu32 mode, it's free to
+  // truncate an i64 value in register R1 to i32 by referencing its sub-register W1.
+ bool isTruncateFree(Type *Ty1, Type *Ty2) const override;
+ bool isTruncateFree(EVT VT1, EVT VT2) const override;
+
+  // Zero-extending a 32-bit ALU result to 64 bits is free.
+ bool isZExtFree(Type *Ty1, Type *Ty2) const override;
+ bool isZExtFree(EVT VT1, EVT VT2) const override;
+
unsigned EmitSubregExt(MachineInstr &MI, MachineBasicBlock *BB, unsigned Reg,
bool isSigned) const;
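A hedged sketch of the access pattern the shouldReduceLoadWidth override protects; the struct and function names are invented, and only the general rule that the verifier checks (offset, size) pairs on context loads is assumed:

struct __sk_buff;  /* kernel-provided context; fields live at fixed offsets */

int has_high_flag(struct __sk_buff *ctx) {
  // The front end emits a 32-bit load of this context word. If the DAG
  // combiner narrowed it to a smaller load at a shifted offset, the
  // (offset, size) pair would no longer match what the verifier allows for
  // context accesses and the program would be rejected. Returning false
  // from shouldReduceLoadWidth keeps the original 32-bit load.
  unsigned int v = *(unsigned int *)ctx;
  return (v & 0x80000000u) != 0;
}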
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.cpp b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
index 626c204e99e6..54360a89782b 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.cpp
@@ -123,7 +123,7 @@ bool BPFInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned SrcReg, bool IsKill, int FI,
+ Register SrcReg, bool IsKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -146,7 +146,7 @@ void BPFInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void BPFInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.h b/llvm/lib/Target/BPF/BPFInstrInfo.h
index 22413d530e17..e797363ead8f 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.h
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.h
@@ -36,13 +36,13 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ MachineBasicBlock::iterator MBBI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
diff --git a/llvm/lib/Target/BPF/BPFInstrInfo.td b/llvm/lib/Target/BPF/BPFInstrInfo.td
index 0f39294daa2b..4298e2eaec04 100644
--- a/llvm/lib/Target/BPF/BPFInstrInfo.td
+++ b/llvm/lib/Target/BPF/BPFInstrInfo.td
@@ -526,7 +526,7 @@ class NOP_I<string OpcodeStr>
let BPFClass = BPF_ALU64;
}
-let hasSideEffects = 0 in
+let hasSideEffects = 0, isCodeGenOnly = 1 in
def NOP : NOP_I<"nop">;
class RET<string OpcodeStr>
@@ -732,8 +732,7 @@ let isCodeGenOnly = 1 in {
def : Pat<(i64 (sext GPR32:$src)),
(SRA_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
-def : Pat<(i64 (zext GPR32:$src)),
- (SRL_ri (SLL_ri (MOV_32_64 GPR32:$src), 32), 32)>;
+def : Pat<(i64 (zext GPR32:$src)), (MOV_32_64 GPR32:$src)>;
// For i64 -> i32 truncation, use the 32-bit subregister directly.
def : Pat<(i32 (trunc GPR:$src)),
diff --git a/llvm/lib/Target/BPF/BPFMCInstLower.h b/llvm/lib/Target/BPF/BPFMCInstLower.h
index 0622d20814d3..4bd0f1f0bf1c 100644
--- a/llvm/lib/Target/BPF/BPFMCInstLower.h
+++ b/llvm/lib/Target/BPF/BPFMCInstLower.h
@@ -18,9 +18,7 @@ class MCInst;
class MCOperand;
class MCSymbol;
class MachineInstr;
-class MachineModuleInfoMachO;
class MachineOperand;
-class Mangler;
// BPFMCInstLower - This class is used to lower an MachineInstr into an MCInst.
class LLVM_LIBRARY_VISIBILITY BPFMCInstLower {
diff --git a/llvm/lib/Target/BPF/BPFMIPeephole.cpp b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
index 022267fbe3c2..df870314fffe 100644
--- a/llvm/lib/Target/BPF/BPFMIPeephole.cpp
+++ b/llvm/lib/Target/BPF/BPFMIPeephole.cpp
@@ -27,6 +27,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/Support/Debug.h"
+#include <set>
using namespace llvm;
@@ -56,6 +57,7 @@ private:
bool isPhiFrom32Def(MachineInstr *MovMI);
bool isMovFrom32Def(MachineInstr *MovMI);
bool eliminateZExtSeq(void);
+ bool eliminateZExt(void);
std::set<MachineInstr *> PhiInsns;
@@ -68,7 +70,12 @@ public:
initialize(MF);
- return eliminateZExtSeq();
+ // First try to eliminate (zext, lshift, rshift) and then
+ // try to eliminate zext.
+ bool ZExtSeqExist, ZExtExist;
+ ZExtSeqExist = eliminateZExtSeq();
+ ZExtExist = eliminateZExt();
+ return ZExtSeqExist || ZExtExist;
}
};
@@ -233,6 +240,51 @@ bool BPFMIPeephole::eliminateZExtSeq(void) {
return Eliminated;
}
+bool BPFMIPeephole::eliminateZExt(void) {
+ MachineInstr* ToErase = nullptr;
+ bool Eliminated = false;
+
+ for (MachineBasicBlock &MBB : *MF) {
+ for (MachineInstr &MI : MBB) {
+ // If the previous instruction was marked for elimination, remove it now.
+ if (ToErase) {
+ ToErase->eraseFromParent();
+ ToErase = nullptr;
+ }
+
+ if (MI.getOpcode() != BPF::MOV_32_64)
+ continue;
+
+ // Eliminate MOV_32_64 if possible.
+ // MOV_32_64 rA, wB
+ //
+ // If wB has been zero extended, replace it with a SUBREG_TO_REG.
+      // This is to work around BPF programs where pkt->{data, data_end}
+      // are encoded as u32, but the verifier actually populates them
+      // as 64-bit pointers. The MOV_32_64 will zero out the top 32 bits.
+ LLVM_DEBUG(dbgs() << "Candidate MOV_32_64 instruction:");
+ LLVM_DEBUG(MI.dump());
+
+ if (!isMovFrom32Def(&MI))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Removing the MOV_32_64 instruction\n");
+
+ Register dst = MI.getOperand(0).getReg();
+ Register src = MI.getOperand(1).getReg();
+
+ // Build a SUBREG_TO_REG instruction.
+ BuildMI(MBB, MI, MI.getDebugLoc(), TII->get(BPF::SUBREG_TO_REG), dst)
+ .addImm(0).addReg(src).addImm(BPF::sub_32);
+
+ ToErase = &MI;
+ Eliminated = true;
+ }
+ }
+
+ return Eliminated;
+}
+
} // end anonymous namespace
INITIALIZE_PASS(BPFMIPeephole, DEBUG_TYPE,
@@ -300,19 +352,16 @@ bool BPFMIPreEmitPeephole::eliminateRedundantMov(void) {
//
// MOV rA, rA
//
- // This is particularly possible to happen when sub-register support
- // enabled. The special type cast insn MOV_32_64 involves different
- // register class on src (i32) and dst (i64), RA could generate useless
- // instruction due to this.
+ // Note that we cannot remove
+ // MOV_32_64 rA, wA
+ // MOV_rr_32 wA, wA
+    //   as these two instructions have side effects, zeroing out the
+    //   top 32 bits of rA.
unsigned Opcode = MI.getOpcode();
- if (Opcode == BPF::MOV_32_64 ||
- Opcode == BPF::MOV_rr || Opcode == BPF::MOV_rr_32) {
+ if (Opcode == BPF::MOV_rr) {
Register dst = MI.getOperand(0).getReg();
Register src = MI.getOperand(1).getReg();
- if (Opcode == BPF::MOV_32_64)
- dst = TRI->getSubReg(dst, BPF::sub_32);
-
if (dst != src)
continue;
diff --git a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
index 5310f0f07b65..ae1f5ea21c12 100644
--- a/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
+++ b/llvm/lib/Target/BPF/BPFMISimplifyPatchable.cpp
@@ -22,6 +22,9 @@
// r1 = <calculated field_info>
// add r3, struct_base_reg, r1
//
+// This pass also removes the intermediate load generated by the IR pass for
+// the __builtin_btf_type_id() intrinsic.
+//
//===----------------------------------------------------------------------===//
#include "BPF.h"
@@ -55,10 +58,10 @@ private:
bool removeLD(void);
void processCandidate(MachineRegisterInfo *MRI, MachineBasicBlock &MBB,
MachineInstr &MI, Register &SrcReg, Register &DstReg,
- const GlobalValue *GVal);
+ const GlobalValue *GVal, bool IsAma);
void processDstReg(MachineRegisterInfo *MRI, Register &DstReg,
Register &SrcReg, const GlobalValue *GVal,
- bool doSrcRegProp);
+ bool doSrcRegProp, bool IsAma);
void processInst(MachineRegisterInfo *MRI, MachineInstr *Inst,
MachineOperand *RelocOp, const GlobalValue *GVal);
void checkADDrr(MachineRegisterInfo *MRI, MachineOperand *RelocOp,
@@ -70,9 +73,10 @@ private:
public:
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
- if (!skipFunction(MF.getFunction())) {
- initialize(MF);
- }
+ if (skipFunction(MF.getFunction()))
+ return false;
+
+ initialize(MF);
return removeLD();
}
};
@@ -115,11 +119,22 @@ void BPFMISimplifyPatchable::checkADDrr(MachineRegisterInfo *MRI,
else
continue;
- // It must be a form of %1 = *(type *)(%2 + 0) or *(type *)(%2 + 0) = %1.
+ // It must be a form of %2 = *(type *)(%1 + 0) or *(type *)(%1 + 0) = %2.
const MachineOperand &ImmOp = DefInst->getOperand(2);
if (!ImmOp.isImm() || ImmOp.getImm() != 0)
continue;
+ // Reject the form:
+ // %1 = ADD_rr %2, %3
+ // *(type *)(%2 + 0) = %1
+ if (Opcode == BPF::STB || Opcode == BPF::STH || Opcode == BPF::STW ||
+ Opcode == BPF::STD || Opcode == BPF::STB32 || Opcode == BPF::STH32 ||
+ Opcode == BPF::STW32) {
+ const MachineOperand &Opnd = DefInst->getOperand(0);
+ if (Opnd.isReg() && Opnd.getReg() == I->getReg())
+ continue;
+ }
+
BuildMI(*DefInst->getParent(), *DefInst, DefInst->getDebugLoc(), TII->get(COREOp))
.add(DefInst->getOperand(0)).addImm(Opcode).add(*BaseOp)
.addGlobalAddress(GVal);
@@ -143,25 +158,27 @@ void BPFMISimplifyPatchable::checkShift(MachineRegisterInfo *MRI,
void BPFMISimplifyPatchable::processCandidate(MachineRegisterInfo *MRI,
MachineBasicBlock &MBB, MachineInstr &MI, Register &SrcReg,
- Register &DstReg, const GlobalValue *GVal) {
+ Register &DstReg, const GlobalValue *GVal, bool IsAma) {
if (MRI->getRegClass(DstReg) == &BPF::GPR32RegClass) {
- // We can optimize such a pattern:
- // %1:gpr = LD_imm64 @"llvm.s:0:4$0:2"
- // %2:gpr32 = LDW32 %1:gpr, 0
- // %3:gpr = SUBREG_TO_REG 0, %2:gpr32, %subreg.sub_32
- // %4:gpr = ADD_rr %0:gpr, %3:gpr
- // or similar patterns below for non-alu32 case.
- auto Begin = MRI->use_begin(DstReg), End = MRI->use_end();
- decltype(End) NextI;
- for (auto I = Begin; I != End; I = NextI) {
- NextI = std::next(I);
- if (!MRI->getUniqueVRegDef(I->getReg()))
- continue;
-
- unsigned Opcode = I->getParent()->getOpcode();
- if (Opcode == BPF::SUBREG_TO_REG) {
- Register TmpReg = I->getParent()->getOperand(0).getReg();
- processDstReg(MRI, TmpReg, DstReg, GVal, false);
+ if (IsAma) {
+ // We can optimize such a pattern:
+ // %1:gpr = LD_imm64 @"llvm.s:0:4$0:2"
+ // %2:gpr32 = LDW32 %1:gpr, 0
+ // %3:gpr = SUBREG_TO_REG 0, %2:gpr32, %subreg.sub_32
+ // %4:gpr = ADD_rr %0:gpr, %3:gpr
+ // or similar patterns below for non-alu32 case.
+ auto Begin = MRI->use_begin(DstReg), End = MRI->use_end();
+ decltype(End) NextI;
+ for (auto I = Begin; I != End; I = NextI) {
+ NextI = std::next(I);
+ if (!MRI->getUniqueVRegDef(I->getReg()))
+ continue;
+
+ unsigned Opcode = I->getParent()->getOpcode();
+ if (Opcode == BPF::SUBREG_TO_REG) {
+ Register TmpReg = I->getParent()->getOperand(0).getReg();
+ processDstReg(MRI, TmpReg, DstReg, GVal, false, IsAma);
+ }
}
}
@@ -171,12 +188,12 @@ void BPFMISimplifyPatchable::processCandidate(MachineRegisterInfo *MRI,
}
// All uses of DstReg replaced by SrcReg
- processDstReg(MRI, DstReg, SrcReg, GVal, true);
+ processDstReg(MRI, DstReg, SrcReg, GVal, true, IsAma);
}
void BPFMISimplifyPatchable::processDstReg(MachineRegisterInfo *MRI,
Register &DstReg, Register &SrcReg, const GlobalValue *GVal,
- bool doSrcRegProp) {
+ bool doSrcRegProp, bool IsAma) {
auto Begin = MRI->use_begin(DstReg), End = MRI->use_end();
decltype(End) NextI;
for (auto I = Begin; I != End; I = NextI) {
@@ -185,7 +202,7 @@ void BPFMISimplifyPatchable::processDstReg(MachineRegisterInfo *MRI,
I->setReg(SrcReg);
// The candidate needs to have a unique definition.
- if (MRI->getUniqueVRegDef(I->getReg()))
+ if (IsAma && MRI->getUniqueVRegDef(I->getReg()))
processInst(MRI, I->getParent(), &*I, GVal);
}
}
@@ -257,28 +274,26 @@ bool BPFMISimplifyPatchable::removeLD() {
if (!DefInst)
continue;
- bool IsCandidate = false;
- const GlobalValue *GVal = nullptr;
- if (DefInst->getOpcode() == BPF::LD_imm64) {
- const MachineOperand &MO = DefInst->getOperand(1);
- if (MO.isGlobal()) {
- GVal = MO.getGlobal();
- auto *GVar = dyn_cast<GlobalVariable>(GVal);
- if (GVar) {
- // Global variables representing structure offset or
- // patchable extern globals.
- if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- assert(MI.getOperand(2).getImm() == 0);
- IsCandidate = true;
- }
- }
- }
- }
+ if (DefInst->getOpcode() != BPF::LD_imm64)
+ continue;
+
+ const MachineOperand &MO = DefInst->getOperand(1);
+ if (!MO.isGlobal())
+ continue;
+
+ const GlobalValue *GVal = MO.getGlobal();
+ auto *GVar = dyn_cast<GlobalVariable>(GVal);
+ if (!GVar)
+ continue;
- if (!IsCandidate)
+ // Global variables representing structure offset or type id.
+ bool IsAma = false;
+ if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr))
+ IsAma = true;
+ else if (!GVar->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
continue;
- processCandidate(MRI, MBB, MI, SrcReg, DstReg, GVal);
+ processCandidate(MRI, MBB, MI, SrcReg, DstReg, GVal, IsAma);
ToErase = &MI;
Changed = true;
diff --git a/llvm/lib/Target/BPF/BPFPreserveDIType.cpp b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
new file mode 100644
index 000000000000..c3cb7647aa79
--- /dev/null
+++ b/llvm/lib/Target/BPF/BPFPreserveDIType.cpp
@@ -0,0 +1,131 @@
+//===----------- BPFPreserveDIType.cpp - Preserve DebugInfo Types ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Preserve DebugInfo types encoded in __builtin_btf_type_id() metadata.
+//
+//===----------------------------------------------------------------------===//
+
+#include "BPF.h"
+#include "BPFCORE.h"
+#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "bpf-preserve-di-type"
+
+namespace llvm {
+constexpr StringRef BPFCoreSharedInfo::TypeIdAttr;
+} // namespace llvm
+
+using namespace llvm;
+
+namespace {
+
+class BPFPreserveDIType final : public ModulePass {
+ StringRef getPassName() const override {
+ return "BPF Preserve DebugInfo Type";
+ }
+
+ bool runOnModule(Module &M) override;
+
+public:
+ static char ID;
+ BPFPreserveDIType() : ModulePass(ID) {}
+
+private:
+ bool doTransformation(Module &M);
+};
+} // End anonymous namespace
+
+char BPFPreserveDIType::ID = 0;
+INITIALIZE_PASS(BPFPreserveDIType, DEBUG_TYPE, "preserve debuginfo type", false,
+ false)
+
+ModulePass *llvm::createBPFPreserveDIType() { return new BPFPreserveDIType(); }
+
+bool BPFPreserveDIType::runOnModule(Module &M) {
+ LLVM_DEBUG(dbgs() << "********** preserve debuginfo type **********\n");
+
+ // Bail out if no debug info.
+ if (M.debug_compile_units().empty())
+ return false;
+
+ return doTransformation(M);
+}
+
+bool BPFPreserveDIType::doTransformation(Module &M) {
+ std::vector<CallInst *> PreserveDITypeCalls;
+
+ for (auto &F : M) {
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *Call = dyn_cast<CallInst>(&I);
+ if (!Call)
+ continue;
+
+ const auto *GV = dyn_cast<GlobalValue>(Call->getCalledOperand());
+ if (!GV)
+ continue;
+
+ if (GV->getName().startswith("llvm.bpf.btf.type.id")) {
+ if (!Call->getMetadata(LLVMContext::MD_preserve_access_index))
+ report_fatal_error(
+ "Missing metadata for llvm.bpf.btf.type.id intrinsic");
+ PreserveDITypeCalls.push_back(Call);
+ }
+ }
+ }
+ }
+
+ if (PreserveDITypeCalls.empty())
+ return false;
+
+ std::string BaseName = "llvm.btf_type_id.";
+ int Count = 0;
+ for (auto Call : PreserveDITypeCalls) {
+ const ConstantInt *Flag = dyn_cast<ConstantInt>(Call->getArgOperand(2));
+ assert(Flag);
+ uint64_t FlagValue = Flag->getValue().getZExtValue();
+
+ if (FlagValue >= BPFCoreSharedInfo::MAX_BTF_TYPE_ID_FLAG)
+ report_fatal_error("Incorrect flag for llvm.bpf.btf.type.id intrinsic");
+
+ uint32_t Reloc;
+ if (FlagValue == BPFCoreSharedInfo::BTF_TYPE_ID_LOCAL_RELOC)
+ Reloc = BPFCoreSharedInfo::BTF_TYPE_ID_LOCAL;
+ else
+ Reloc = BPFCoreSharedInfo::BTF_TYPE_ID_REMOTE;
+
+ BasicBlock *BB = Call->getParent();
+ IntegerType *VarType = Type::getInt32Ty(BB->getContext());
+ std::string GVName = BaseName + std::to_string(Count) + "$" +
+ std::to_string(Reloc);
+ GlobalVariable *GV =
+ new GlobalVariable(M, VarType, false, GlobalVariable::ExternalLinkage,
+ NULL, GVName);
+ GV->addAttribute(BPFCoreSharedInfo::TypeIdAttr);
+ MDNode *MD = Call->getMetadata(LLVMContext::MD_preserve_access_index);
+ GV->setMetadata(LLVMContext::MD_preserve_access_index, MD);
+
+ // Load the global variable which represents the type info.
+ auto *LDInst = new LoadInst(Type::getInt32Ty(BB->getContext()), GV, "",
+ Call);
+ Call->replaceAllUsesWith(LDInst);
+ Call->eraseFromParent();
+ Count++;
+ }
+
+ return true;
+}
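A source-level sketch of what feeds this new pass; the type and function names are illustrative, the flag values follow the enum added to BPFCORE.h above, and the code assumes compilation with -g so the module carries debug info (otherwise the pass bails out early):

struct my_key { int a; long b; };   /* any type carrying debug info */

unsigned int my_key_type_id(void) {
  struct my_key k;
  // Clang lowers the builtin to a call of the llvm.bpf.btf.type.id intrinsic
  // with llvm.preserve.access.index metadata attached. The pass above
  // replaces that call with a load from an external global named
  // "llvm.btf_type_id.<N>$<reloc>" carrying the "btf_type_id" attribute;
  // BPFMISimplifyPatchable and BTFDebug later fold the load away and emit a
  // BTF_TYPE_ID_LOCAL/REMOTE relocation. Flag 0 asks for the id in the
  // program's own BTF, flag 1 for the id resolved against the target BTF.
  return __builtin_btf_type_id(k, 0);
}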
diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
index a711294048ba..4c36c0edcef6 100644
--- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.cpp
@@ -19,7 +19,7 @@ using namespace llvm;
SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
// Requires the copy size to be a constant.
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
@@ -27,7 +27,7 @@ SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
return SDValue();
unsigned CopyLen = ConstantSize->getZExtValue();
- unsigned StoresNumEstimate = alignTo(CopyLen, Align) >> Log2_32(Align);
+ unsigned StoresNumEstimate = alignTo(CopyLen, Alignment) >> Log2(Alignment);
// Impose the same copy length limit as MaxStoresPerMemcpy.
if (StoresNumEstimate > getCommonMaxStoresPerMemFunc())
return SDValue();
@@ -36,7 +36,7 @@ SDValue BPFSelectionDAGInfo::EmitTargetCodeForMemcpy(
Dst = DAG.getNode(BPFISD::MEMCPY, dl, VTs, Chain, Dst, Src,
DAG.getConstant(CopyLen, dl, MVT::i64),
- DAG.getConstant(Align, dl, MVT::i64));
+ DAG.getConstant(Alignment.value(), dl, MVT::i64));
return Dst.getValue(0);
}
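As a sanity check on the store-count estimate above, a small self-contained sketch; the local helpers only mimic what alignTo/Log2 are assumed to do and are not the LLVM APIs themselves:

#include <cassert>
#include <cstdint>

static uint64_t align_to(uint64_t v, uint64_t a) { return (v + a - 1) / a * a; }
static unsigned log2_of(uint64_t a) { unsigned n = 0; while (a >>= 1) ++n; return n; }

int main() {
  // Copying 27 bytes with 8-byte alignment: round up to 32, then divide by 8.
  uint64_t CopyLen = 27, Alignment = 8;
  unsigned StoresNumEstimate = align_to(CopyLen, Alignment) >> log2_of(Alignment);
  assert(StoresNumEstimate == 4);  // compared against getCommonMaxStoresPerMemFunc()
  return 0;
}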
diff --git a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
index fb88c32ceb0c..79f05e57bb5c 100644
--- a/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
+++ b/llvm/lib/Target/BPF/BPFSelectionDAGInfo.h
@@ -21,8 +21,8 @@ class BPFSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
diff --git a/llvm/lib/Target/BPF/BPFTargetMachine.cpp b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
index 40375bc88bff..54204ee197ec 100644
--- a/llvm/lib/Target/BPF/BPFTargetMachine.cpp
+++ b/llvm/lib/Target/BPF/BPFTargetMachine.cpp
@@ -35,6 +35,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
PassRegistry &PR = *PassRegistry::getPassRegistry();
initializeBPFAbstractMemberAccessPass(PR);
+ initializeBPFPreserveDITypePass(PR);
initializeBPFMIPeepholePass(PR);
initializeBPFMIPeepholeTruncElimPass(PR);
}
@@ -42,9 +43,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeBPFTarget() {
// DataLayout: little or big endian
static std::string computeDataLayout(const Triple &TT) {
if (TT.getArch() == Triple::bpfeb)
- return "E-m:e-p:64:64-i64:64-n32:64-S128";
+ return "E-m:e-p:64:64-i64:64-i128:128-n32:64-S128";
else
- return "e-m:e-p:64:64-i64:64-n32:64-S128";
+ return "e-m:e-p:64:64-i64:64-i128:128-n32:64-S128";
}
static Reloc::Model getEffectiveRelocModel(Optional<Reloc::Model> RM) {
@@ -63,7 +64,7 @@ BPFTargetMachine::BPFTargetMachine(const Target &T, const Triple &TT,
getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
- Subtarget(TT, CPU, FS, *this) {
+ Subtarget(TT, std::string(CPU), std::string(FS), *this) {
initAsmInfo();
BPFMCAsmInfo *MAI =
@@ -96,6 +97,7 @@ TargetPassConfig *BPFTargetMachine::createPassConfig(PassManagerBase &PM) {
void BPFPassConfig::addIRPasses() {
addPass(createBPFAbstractMemberAccess(&getBPFTargetMachine()));
+ addPass(createBPFPreserveDIType());
TargetPassConfig::addIRPasses();
}
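A hedged illustration of what the added i128:128 component changes, written against the headers in this tree (API spellings such as getABITypeAlignment may differ in other versions):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Type.h"
#include <cassert>

int main() {
  llvm::LLVMContext Ctx;
  llvm::DataLayout DL("e-m:e-p:64:64-i64:64-i128:128-n32:64-S128");
  // i128 values are now explicitly 16-byte aligned; without the i128 entry
  // they would fall back to the i64 alignment of 8 bytes.
  assert(DL.getABITypeAlignment(llvm::Type::getInt128Ty(Ctx)) == 16);
  return 0;
}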
diff --git a/llvm/lib/Target/BPF/BTFDebug.cpp b/llvm/lib/Target/BPF/BTFDebug.cpp
index a9fb04f20d1c..4510e9357489 100644
--- a/llvm/lib/Target/BPF/BTFDebug.cpp
+++ b/llvm/lib/Target/BPF/BTFDebug.cpp
@@ -34,10 +34,10 @@ static const char *BTFKindStr[] = {
void BTFTypeBase::emitType(MCStreamer &OS) {
OS.AddComment(std::string(BTFKindStr[Kind]) + "(id = " + std::to_string(Id) +
")");
- OS.EmitIntValue(BTFType.NameOff, 4);
+ OS.emitInt32(BTFType.NameOff);
OS.AddComment("0x" + Twine::utohexstr(BTFType.Info));
- OS.EmitIntValue(BTFType.Info, 4);
- OS.EmitIntValue(BTFType.Size, 4);
+ OS.emitInt32(BTFType.Info);
+ OS.emitInt32(BTFType.Size);
}
BTFTypeDerived::BTFTypeDerived(const DIDerivedType *DTy, unsigned Tag,
@@ -148,7 +148,7 @@ void BTFTypeInt::completeType(BTFDebug &BDebug) {
void BTFTypeInt::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
OS.AddComment("0x" + Twine::utohexstr(IntVal));
- OS.EmitIntValue(IntVal, 4);
+ OS.emitInt32(IntVal);
}
BTFTypeEnum::BTFTypeEnum(const DICompositeType *ETy, uint32_t VLen) : ETy(ETy) {
@@ -171,7 +171,12 @@ void BTFTypeEnum::completeType(BTFDebug &BDebug) {
struct BTF::BTFEnum BTFEnum;
BTFEnum.NameOff = BDebug.addString(Enum->getName());
// BTF enum value is 32bit, enforce it.
- BTFEnum.Val = static_cast<uint32_t>(Enum->getValue());
+ uint32_t Value;
+ if (Enum->isUnsigned())
+ Value = static_cast<uint32_t>(Enum->getValue().getZExtValue());
+ else
+ Value = static_cast<uint32_t>(Enum->getValue().getSExtValue());
+ BTFEnum.Val = Value;
EnumValues.push_back(BTFEnum);
}
}
@@ -179,8 +184,8 @@ void BTFTypeEnum::completeType(BTFDebug &BDebug) {
void BTFTypeEnum::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &Enum : EnumValues) {
- OS.EmitIntValue(Enum.NameOff, 4);
- OS.EmitIntValue(Enum.Val, 4);
+ OS.emitInt32(Enum.NameOff);
+ OS.emitInt32(Enum.Val);
}
}
@@ -209,9 +214,9 @@ void BTFTypeArray::completeType(BTFDebug &BDebug) {
void BTFTypeArray::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
- OS.EmitIntValue(ArrayInfo.ElemType, 4);
- OS.EmitIntValue(ArrayInfo.IndexType, 4);
- OS.EmitIntValue(ArrayInfo.Nelems, 4);
+ OS.emitInt32(ArrayInfo.ElemType);
+ OS.emitInt32(ArrayInfo.IndexType);
+ OS.emitInt32(ArrayInfo.Nelems);
}
/// Represent either a struct or a union.
@@ -252,14 +257,14 @@ void BTFTypeStruct::completeType(BTFDebug &BDebug) {
void BTFTypeStruct::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &Member : Members) {
- OS.EmitIntValue(Member.NameOff, 4);
- OS.EmitIntValue(Member.Type, 4);
+ OS.emitInt32(Member.NameOff);
+ OS.emitInt32(Member.Type);
OS.AddComment("0x" + Twine::utohexstr(Member.Offset));
- OS.EmitIntValue(Member.Offset, 4);
+ OS.emitInt32(Member.Offset);
}
}
-std::string BTFTypeStruct::getName() { return STy->getName(); }
+std::string BTFTypeStruct::getName() { return std::string(STy->getName()); }
/// The Func kind represents both subprogram and pointee of function
/// pointers. If the FuncName is empty, it represents a pointee of function
@@ -303,8 +308,8 @@ void BTFTypeFuncProto::completeType(BTFDebug &BDebug) {
void BTFTypeFuncProto::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &Param : Parameters) {
- OS.EmitIntValue(Param.NameOff, 4);
- OS.EmitIntValue(Param.Type, 4);
+ OS.emitInt32(Param.NameOff);
+ OS.emitInt32(Param.Type);
}
}
@@ -340,7 +345,7 @@ void BTFKindVar::completeType(BTFDebug &BDebug) {
void BTFKindVar::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
- OS.EmitIntValue(Info, 4);
+ OS.emitInt32(Info);
}
BTFKindDataSec::BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName)
@@ -359,9 +364,9 @@ void BTFKindDataSec::emitType(MCStreamer &OS) {
BTFTypeBase::emitType(OS);
for (const auto &V : Vars) {
- OS.EmitIntValue(std::get<0>(V), 4);
- Asm->EmitLabelReference(std::get<1>(V), 4);
- OS.EmitIntValue(std::get<2>(V), 4);
+ OS.emitInt32(std::get<0>(V));
+ Asm->emitLabelReference(std::get<1>(V), 4);
+ OS.emitInt32(std::get<2>(V));
}
}
@@ -374,7 +379,7 @@ uint32_t BTFStringTable::addString(StringRef S) {
// Not find, add to the string table.
uint32_t Offset = Size;
OffsetToIdMap[Offset] = Table.size();
- Table.push_back(S);
+ Table.push_back(std::string(S));
Size += S.size() + 1;
return Offset;
}
@@ -563,7 +568,7 @@ void BTFDebug::visitDerivedType(const DIDerivedType *DTy, uint32_t &TypeId,
auto CTag = CTy->getTag();
if ((CTag == dwarf::DW_TAG_structure_type ||
CTag == dwarf::DW_TAG_union_type) &&
- !CTy->isForwardDecl()) {
+ !CTy->getName().empty() && !CTy->isForwardDecl()) {
/// Find a candidate, generate a fixup. Later on the struct/union
/// pointee type will be replaced with either a real type or
/// a forward declaration.
@@ -600,6 +605,38 @@ void BTFDebug::visitTypeEntry(const DIType *Ty, uint32_t &TypeId,
bool CheckPointer, bool SeenPointer) {
if (!Ty || DIToIdMap.find(Ty) != DIToIdMap.end()) {
TypeId = DIToIdMap[Ty];
+
+    // To handle a case like the following:
+ // struct t;
+ // typedef struct t _t;
+ // struct s1 { _t *c; };
+ // int test1(struct s1 *arg) { ... }
+ //
+ // struct t { int a; int b; };
+ // struct s2 { _t c; }
+ // int test2(struct s2 *arg) { ... }
+ //
+    //   While traversing the test1() argument, "_t" is recorded
+    //   in DIToIdMap and a forward declaration fixup is created
+    //   for "struct t" to avoid pointee type traversal.
+    //
+    //   While traversing the test2() argument, even though "_t" is
+    //   already defined, we should keep going to eventually
+    //   bring in the types for "struct t". Otherwise, the "struct s2"
+    //   definition won't be correct.
+ if (Ty && (!CheckPointer || !SeenPointer)) {
+ if (const auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ unsigned Tag = DTy->getTag();
+ if (Tag == dwarf::DW_TAG_typedef || Tag == dwarf::DW_TAG_const_type ||
+ Tag == dwarf::DW_TAG_volatile_type ||
+ Tag == dwarf::DW_TAG_restrict_type) {
+ uint32_t TmpTypeId;
+ visitTypeEntry(DTy->getBaseType(), TmpTypeId, CheckPointer,
+ SeenPointer);
+ }
+ }
+ }
+
return;
}
@@ -627,7 +664,17 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
return;
}
- // MapDef type is a struct type
+ // MapDef type may be a struct type or a non-pointer derived type
+ const DIType *OrigTy = Ty;
+ while (auto *DTy = dyn_cast<DIDerivedType>(Ty)) {
+ auto Tag = DTy->getTag();
+ if (Tag != dwarf::DW_TAG_typedef && Tag != dwarf::DW_TAG_const_type &&
+ Tag != dwarf::DW_TAG_volatile_type &&
+ Tag != dwarf::DW_TAG_restrict_type)
+ break;
+ Ty = DTy->getBaseType();
+ }
+
const auto *CTy = dyn_cast<DICompositeType>(Ty);
if (!CTy)
return;
@@ -636,27 +683,15 @@ void BTFDebug::visitMapDefType(const DIType *Ty, uint32_t &TypeId) {
if (Tag != dwarf::DW_TAG_structure_type || CTy->isForwardDecl())
return;
- // Record this type
+  // Visit all struct members to ensure pointee types are visited
const DINodeArray Elements = CTy->getElements();
- bool HasBitField = false;
- for (const auto *Element : Elements) {
- auto E = cast<DIDerivedType>(Element);
- if (E->isBitField()) {
- HasBitField = true;
- break;
- }
- }
-
- auto TypeEntry =
- std::make_unique<BTFTypeStruct>(CTy, true, HasBitField, Elements.size());
- StructTypes.push_back(TypeEntry.get());
- TypeId = addType(std::move(TypeEntry), CTy);
-
- // Visit all struct members
for (const auto *Element : Elements) {
const auto *MemberType = cast<DIDerivedType>(Element);
visitTypeEntry(MemberType->getBaseType());
}
+
+  // Visit this type, either a struct or a const/typedef/volatile/restrict type
+ visitTypeEntry(OrigTy, TypeId, false, false);
}
/// Read file contents from the actual file or from the source
@@ -667,7 +702,7 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
if (!File->getFilename().startswith("/") && File->getDirectory().size())
FileName = File->getDirectory().str() + "/" + File->getFilename().str();
else
- FileName = File->getFilename();
+ FileName = std::string(File->getFilename());
// No need to populate the contents if they have already been populated!
if (FileContent.find(FileName) != FileContent.end())
@@ -686,7 +721,7 @@ std::string BTFDebug::populateFileContent(const DISubprogram *SP) {
Buf = std::move(*BufOrErr);
if (Buf)
for (line_iterator I(*Buf, false), E; I != E; ++I)
- Content.push_back(*I);
+ Content.push_back(std::string(*I));
FileContent[FileName] = Content;
return FileName;
@@ -711,9 +746,9 @@ void BTFDebug::constructLineInfo(const DISubprogram *SP, MCSymbol *Label,
void BTFDebug::emitCommonHeader() {
OS.AddComment("0x" + Twine::utohexstr(BTF::MAGIC));
- OS.EmitIntValue(BTF::MAGIC, 2);
- OS.EmitIntValue(BTF::VERSION, 1);
- OS.EmitIntValue(0, 1);
+ OS.emitIntValue(BTF::MAGIC, 2);
+ OS.emitInt8(BTF::VERSION);
+ OS.emitInt8(0);
}
void BTFDebug::emitBTFSection() {
@@ -726,17 +761,17 @@ void BTFDebug::emitBTFSection() {
// Emit header.
emitCommonHeader();
- OS.EmitIntValue(BTF::HeaderSize, 4);
+ OS.emitInt32(BTF::HeaderSize);
uint32_t TypeLen = 0, StrLen;
for (const auto &TypeEntry : TypeEntries)
TypeLen += TypeEntry->getSize();
StrLen = StringTable.getSize();
- OS.EmitIntValue(0, 4);
- OS.EmitIntValue(TypeLen, 4);
- OS.EmitIntValue(TypeLen, 4);
- OS.EmitIntValue(StrLen, 4);
+ OS.emitInt32(0);
+ OS.emitInt32(TypeLen);
+ OS.emitInt32(TypeLen);
+ OS.emitInt32(StrLen);
// Emit type table.
for (const auto &TypeEntry : TypeEntries)
@@ -746,8 +781,8 @@ void BTFDebug::emitBTFSection() {
uint32_t StringOffset = 0;
for (const auto &S : StringTable.getTable()) {
OS.AddComment("string offset=" + std::to_string(StringOffset));
- OS.EmitBytes(S);
- OS.EmitBytes(StringRef("\0", 1));
+ OS.emitBytes(S);
+ OS.emitBytes(StringRef("\0", 1));
StringOffset += S.size() + 1;
}
}
@@ -764,7 +799,7 @@ void BTFDebug::emitBTFExtSection() {
// Emit header.
emitCommonHeader();
- OS.EmitIntValue(BTF::ExtHeaderSize, 4);
+ OS.emitInt32(BTF::ExtHeaderSize);
// Account for FuncInfo/LineInfo record size as well.
uint32_t FuncLen = 4, LineLen = 4;
@@ -786,59 +821,59 @@ void BTFDebug::emitBTFExtSection() {
if (FieldRelocLen)
FieldRelocLen += 4;
- OS.EmitIntValue(0, 4);
- OS.EmitIntValue(FuncLen, 4);
- OS.EmitIntValue(FuncLen, 4);
- OS.EmitIntValue(LineLen, 4);
- OS.EmitIntValue(FuncLen + LineLen, 4);
- OS.EmitIntValue(FieldRelocLen, 4);
+ OS.emitInt32(0);
+ OS.emitInt32(FuncLen);
+ OS.emitInt32(FuncLen);
+ OS.emitInt32(LineLen);
+ OS.emitInt32(FuncLen + LineLen);
+ OS.emitInt32(FieldRelocLen);
// Emit func_info table.
OS.AddComment("FuncInfo");
- OS.EmitIntValue(BTF::BPFFuncInfoSize, 4);
+ OS.emitInt32(BTF::BPFFuncInfoSize);
for (const auto &FuncSec : FuncInfoTable) {
OS.AddComment("FuncInfo section string offset=" +
std::to_string(FuncSec.first));
- OS.EmitIntValue(FuncSec.first, 4);
- OS.EmitIntValue(FuncSec.second.size(), 4);
+ OS.emitInt32(FuncSec.first);
+ OS.emitInt32(FuncSec.second.size());
for (const auto &FuncInfo : FuncSec.second) {
- Asm->EmitLabelReference(FuncInfo.Label, 4);
- OS.EmitIntValue(FuncInfo.TypeId, 4);
+ Asm->emitLabelReference(FuncInfo.Label, 4);
+ OS.emitInt32(FuncInfo.TypeId);
}
}
// Emit line_info table.
OS.AddComment("LineInfo");
- OS.EmitIntValue(BTF::BPFLineInfoSize, 4);
+ OS.emitInt32(BTF::BPFLineInfoSize);
for (const auto &LineSec : LineInfoTable) {
OS.AddComment("LineInfo section string offset=" +
std::to_string(LineSec.first));
- OS.EmitIntValue(LineSec.first, 4);
- OS.EmitIntValue(LineSec.second.size(), 4);
+ OS.emitInt32(LineSec.first);
+ OS.emitInt32(LineSec.second.size());
for (const auto &LineInfo : LineSec.second) {
- Asm->EmitLabelReference(LineInfo.Label, 4);
- OS.EmitIntValue(LineInfo.FileNameOff, 4);
- OS.EmitIntValue(LineInfo.LineOff, 4);
+ Asm->emitLabelReference(LineInfo.Label, 4);
+ OS.emitInt32(LineInfo.FileNameOff);
+ OS.emitInt32(LineInfo.LineOff);
OS.AddComment("Line " + std::to_string(LineInfo.LineNum) + " Col " +
std::to_string(LineInfo.ColumnNum));
- OS.EmitIntValue(LineInfo.LineNum << 10 | LineInfo.ColumnNum, 4);
+ OS.emitInt32(LineInfo.LineNum << 10 | LineInfo.ColumnNum);
}
}
// Emit field reloc table.
if (FieldRelocLen) {
OS.AddComment("FieldReloc");
- OS.EmitIntValue(BTF::BPFFieldRelocSize, 4);
+ OS.emitInt32(BTF::BPFFieldRelocSize);
for (const auto &FieldRelocSec : FieldRelocTable) {
OS.AddComment("Field reloc section string offset=" +
std::to_string(FieldRelocSec.first));
- OS.EmitIntValue(FieldRelocSec.first, 4);
- OS.EmitIntValue(FieldRelocSec.second.size(), 4);
+ OS.emitInt32(FieldRelocSec.first);
+ OS.emitInt32(FieldRelocSec.second.size());
for (const auto &FieldRelocInfo : FieldRelocSec.second) {
- Asm->EmitLabelReference(FieldRelocInfo.Label, 4);
- OS.EmitIntValue(FieldRelocInfo.TypeID, 4);
- OS.EmitIntValue(FieldRelocInfo.OffsetNameOff, 4);
- OS.EmitIntValue(FieldRelocInfo.RelocKind, 4);
+ Asm->emitLabelReference(FieldRelocInfo.Label, 4);
+ OS.emitInt32(FieldRelocInfo.TypeID);
+ OS.emitInt32(FieldRelocInfo.OffsetNameOff);
+ OS.emitInt32(FieldRelocInfo.RelocKind);
}
}
}
@@ -915,7 +950,7 @@ void BTFDebug::beginFunctionImpl(const MachineFunction *MF) {
MCSection &Section = FuncLabel->getSection();
const MCSectionELF *SectionELF = dyn_cast<MCSectionELF>(&Section);
assert(SectionELF && "Null section for Function Label");
- SecNameOff = addString(SectionELF->getSectionName());
+ SecNameOff = addString(SectionELF->getName());
} else {
SecNameOff = addString(".text");
}
@@ -928,9 +963,9 @@ void BTFDebug::endFunctionImpl(const MachineFunction *MF) {
SecNameOff = 0;
}
-/// On-demand populate struct types as requested from abstract member
-/// accessing.
-unsigned BTFDebug::populateStructType(const DIType *Ty) {
+/// On-demand populate types as requested by abstract member
+/// accesses or the preserve-debuginfo-type pass.
+unsigned BTFDebug::populateType(const DIType *Ty) {
unsigned Id;
visitTypeEntry(Ty, Id, false, false);
for (const auto &TypeEntry : TypeEntries)
@@ -939,24 +974,32 @@ unsigned BTFDebug::populateStructType(const DIType *Ty) {
}
/// Generate a struct member field relocation.
-void BTFDebug::generateFieldReloc(const MCSymbol *ORSym, DIType *RootTy,
- StringRef AccessPattern) {
- unsigned RootId = populateStructType(RootTy);
- size_t FirstDollar = AccessPattern.find_first_of('$');
- size_t FirstColon = AccessPattern.find_first_of(':');
- size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1);
- StringRef IndexPattern = AccessPattern.substr(FirstDollar + 1);
- StringRef RelocKindStr = AccessPattern.substr(FirstColon + 1,
- SecondColon - FirstColon);
- StringRef PatchImmStr = AccessPattern.substr(SecondColon + 1,
- FirstDollar - SecondColon);
-
+void BTFDebug::generatePatchImmReloc(const MCSymbol *ORSym, uint32_t RootId,
+ const GlobalVariable *GVar, bool IsAma) {
BTFFieldReloc FieldReloc;
FieldReloc.Label = ORSym;
- FieldReloc.OffsetNameOff = addString(IndexPattern);
FieldReloc.TypeID = RootId;
- FieldReloc.RelocKind = std::stoull(RelocKindStr);
- PatchImms[AccessPattern.str()] = std::stoul(PatchImmStr);
+
+ StringRef AccessPattern = GVar->getName();
+ size_t FirstDollar = AccessPattern.find_first_of('$');
+ if (IsAma) {
+ size_t FirstColon = AccessPattern.find_first_of(':');
+ size_t SecondColon = AccessPattern.find_first_of(':', FirstColon + 1);
+ StringRef IndexPattern = AccessPattern.substr(FirstDollar + 1);
+ StringRef RelocKindStr = AccessPattern.substr(FirstColon + 1,
+ SecondColon - FirstColon);
+ StringRef PatchImmStr = AccessPattern.substr(SecondColon + 1,
+ FirstDollar - SecondColon);
+
+ FieldReloc.OffsetNameOff = addString(IndexPattern);
+ FieldReloc.RelocKind = std::stoull(std::string(RelocKindStr));
+ PatchImms[GVar] = std::stoul(std::string(PatchImmStr));
+ } else {
+ StringRef RelocStr = AccessPattern.substr(FirstDollar + 1);
+ FieldReloc.OffsetNameOff = addString("0");
+ FieldReloc.RelocKind = std::stoull(std::string(RelocStr));
+ PatchImms[GVar] = RootId;
+ }
FieldRelocTable[SecNameOff].push_back(FieldReloc);
}
@@ -965,14 +1008,20 @@ void BTFDebug::processReloc(const MachineOperand &MO) {
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
- if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- MCSymbol *ORSym = OS.getContext().createTempSymbol();
- OS.EmitLabel(ORSym);
+ if (!GVar)
+ return;
- MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
- DIType *Ty = dyn_cast<DIType>(MDN);
- generateFieldReloc(ORSym, Ty, GVar->getName());
- }
+ if (!GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr) &&
+ !GVar->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
+ return;
+
+ MCSymbol *ORSym = OS.getContext().createTempSymbol();
+ OS.emitLabel(ORSym);
+
+ MDNode *MDN = GVar->getMetadata(LLVMContext::MD_preserve_access_index);
+ uint32_t RootId = populateType(dyn_cast<DIType>(MDN));
+ generatePatchImmReloc(ORSym, RootId, GVar,
+ GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr));
}
}
@@ -1008,6 +1057,9 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
// Later, the insn is replaced with "r2 = <offset>"
// where "<offset>" equals to the offset based on current
// type definitions.
+ //
+    // If the insn is "r2 = LD_imm64 @<a TypeIdAttr global>",
+    // the LD_imm64 result will be replaced with a BTF type id.
processReloc(MI->getOperand(1));
} else if (MI->getOpcode() == BPF::CORE_MEM ||
MI->getOpcode() == BPF::CORE_ALU32_MEM ||
@@ -1040,7 +1092,7 @@ void BTFDebug::beginInstruction(const MachineInstr *MI) {
// Create a temporary label to remember the insn for lineinfo.
MCSymbol *LineSym = OS.getContext().createTempSymbol();
- OS.EmitLabel(LineSym);
+ OS.emitLabel(LineSym);
// Construct the lineinfo.
auto SP = DL.get()->getScope()->getSubprogram();
@@ -1119,15 +1171,17 @@ void BTFDebug::processGlobals(bool ProcessingMapDef) {
assert(!SecName.empty());
// Find or create a DataSec
- if (DataSecEntries.find(SecName) == DataSecEntries.end()) {
- DataSecEntries[SecName] = std::make_unique<BTFKindDataSec>(Asm, SecName);
+ if (DataSecEntries.find(std::string(SecName)) == DataSecEntries.end()) {
+ DataSecEntries[std::string(SecName)] =
+ std::make_unique<BTFKindDataSec>(Asm, std::string(SecName));
}
// Calculate symbol size
const DataLayout &DL = Global.getParent()->getDataLayout();
uint32_t Size = DL.getTypeAllocSize(Global.getType()->getElementType());
- DataSecEntries[SecName]->addVar(VarId, Asm->getSymbol(&Global), Size);
+ DataSecEntries[std::string(SecName)]->addVar(VarId, Asm->getSymbol(&Global),
+ Size);
}
}
@@ -1138,9 +1192,15 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
if (MO.isGlobal()) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
- if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- // Emit "mov ri, <imm>" for patched immediate.
- uint32_t Imm = PatchImms[GVar->getName().str()];
+ if (GVar) {
+ // Emit "mov ri, <imm>"
+ uint32_t Imm;
+ if (GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr) ||
+ GVar->hasAttribute(BPFCoreSharedInfo::TypeIdAttr))
+ Imm = PatchImms[GVar];
+ else
+ return false;
+
OutMI.setOpcode(BPF::MOV_ri);
OutMI.addOperand(MCOperand::createReg(MI->getOperand(0).getReg()));
OutMI.addOperand(MCOperand::createImm(Imm));
@@ -1155,7 +1215,7 @@ bool BTFDebug::InstLower(const MachineInstr *MI, MCInst &OutMI) {
const GlobalValue *GVal = MO.getGlobal();
auto *GVar = dyn_cast<GlobalVariable>(GVal);
if (GVar && GVar->hasAttribute(BPFCoreSharedInfo::AmaAttr)) {
- uint32_t Imm = PatchImms[GVar->getName().str()];
+ uint32_t Imm = PatchImms[GVar];
OutMI.setOpcode(MI->getOperand(1).getImm());
if (MI->getOperand(0).isImm())
OutMI.addOperand(MCOperand::createImm(MI->getOperand(0).getImm()));
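Since generatePatchImmReloc and InstLower now key everything off the patchable global's name, a decoding sketch of the two encodings involved; the sample strings are illustrative, modeled on the @"llvm.s:0:4$0:2" form quoted in the comments above:

#include <cstddef>
#include <iostream>
#include <string>

int main() {
  // Field-access (AMA) globals: "<root type>:<reloc kind>:<patch imm>$<access string>".
  std::string Ama = "llvm.s:0:4$0:2";
  std::size_t Dollar = Ama.find('$');
  std::size_t C1 = Ama.find(':');
  std::size_t C2 = Ama.find(':', C1 + 1);
  std::cout << "reloc kind = " << Ama.substr(C1 + 1, C2 - C1 - 1)       // "0" (FIELD_BYTE_OFFSET)
            << ", patch imm = " << Ama.substr(C2 + 1, Dollar - C2 - 1)  // "4"
            << ", access = " << Ama.substr(Dollar + 1) << "\n";         // "0:2"

  // Type-id globals created by BPFPreserveDIType: "<base>$<reloc kind>";
  // the patched immediate is the BTF type id of the root type itself.
  std::string Tid = "llvm.btf_type_id.0$0";
  std::cout << "type-id reloc kind = " << Tid.substr(Tid.find('$') + 1) << "\n";  // "0" (local)
  return 0;
}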
diff --git a/llvm/lib/Target/BPF/BTFDebug.h b/llvm/lib/Target/BPF/BTFDebug.h
index 0812c4f7915d..2f39f665299a 100644
--- a/llvm/lib/Target/BPF/BTFDebug.h
+++ b/llvm/lib/Target/BPF/BTFDebug.h
@@ -16,6 +16,7 @@
#include "llvm/ADT/StringMap.h"
#include "llvm/CodeGen/DebugHandlerBase.h"
+#include "llvm/CodeGen/MachineInstr.h"
#include <set>
#include <unordered_map>
#include "BTF.h"
@@ -25,6 +26,7 @@ namespace llvm {
class AsmPrinter;
class BTFDebug;
class DIType;
+class GlobalVariable;
class MCStreamer;
class MCSymbol;
class MachineFunction;
@@ -61,8 +63,8 @@ class BTFTypeDerived : public BTFTypeBase {
public:
BTFTypeDerived(const DIDerivedType *Ty, unsigned Tag, bool NeedsFixup);
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
void setPointeeType(uint32_t PointeeType);
};
@@ -72,8 +74,8 @@ class BTFTypeFwd : public BTFTypeBase {
public:
BTFTypeFwd(StringRef Name, bool IsUnion);
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle int type.
@@ -84,9 +86,9 @@ class BTFTypeInt : public BTFTypeBase {
public:
BTFTypeInt(uint32_t Encoding, uint32_t SizeInBits, uint32_t OffsetInBits,
StringRef TypeName);
- uint32_t getSize() { return BTFTypeBase::getSize() + sizeof(uint32_t); }
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ uint32_t getSize() override { return BTFTypeBase::getSize() + sizeof(uint32_t); }
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle enumerate type.
@@ -96,11 +98,11 @@ class BTFTypeEnum : public BTFTypeBase {
public:
BTFTypeEnum(const DICompositeType *ETy, uint32_t NumValues);
- uint32_t getSize() {
+ uint32_t getSize() override {
return BTFTypeBase::getSize() + EnumValues.size() * BTF::BTFEnumSize;
}
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle array type.
@@ -109,9 +111,9 @@ class BTFTypeArray : public BTFTypeBase {
public:
BTFTypeArray(uint32_t ElemTypeId, uint32_t NumElems);
- uint32_t getSize() { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ uint32_t getSize() override { return BTFTypeBase::getSize() + BTF::BTFArraySize; }
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle struct/union type.
@@ -123,11 +125,11 @@ class BTFTypeStruct : public BTFTypeBase {
public:
BTFTypeStruct(const DICompositeType *STy, bool IsStruct, bool HasBitField,
uint32_t NumMembers);
- uint32_t getSize() {
+ uint32_t getSize() override {
return BTFTypeBase::getSize() + Members.size() * BTF::BTFMemberSize;
}
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
std::string getName();
};
@@ -140,11 +142,11 @@ class BTFTypeFuncProto : public BTFTypeBase {
public:
BTFTypeFuncProto(const DISubroutineType *STy, uint32_t NumParams,
const std::unordered_map<uint32_t, StringRef> &FuncArgNames);
- uint32_t getSize() {
+ uint32_t getSize() override {
return BTFTypeBase::getSize() + Parameters.size() * BTF::BTFParamSize;
}
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle subprogram
@@ -153,9 +155,9 @@ class BTFTypeFunc : public BTFTypeBase {
public:
BTFTypeFunc(StringRef FuncName, uint32_t ProtoTypeId, uint32_t Scope);
- uint32_t getSize() { return BTFTypeBase::getSize(); }
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ uint32_t getSize() override { return BTFTypeBase::getSize(); }
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle variable instances
@@ -165,9 +167,9 @@ class BTFKindVar : public BTFTypeBase {
public:
BTFKindVar(StringRef VarName, uint32_t TypeId, uint32_t VarInfo);
- uint32_t getSize() { return BTFTypeBase::getSize() + 4; }
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ uint32_t getSize() override { return BTFTypeBase::getSize() + 4; }
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// Handle data sections
@@ -178,15 +180,15 @@ class BTFKindDataSec : public BTFTypeBase {
public:
BTFKindDataSec(AsmPrinter *AsmPrt, std::string SecName);
- uint32_t getSize() {
+ uint32_t getSize() override {
return BTFTypeBase::getSize() + BTF::BTFDataSecVarSize * Vars.size();
}
void addVar(uint32_t Id, const MCSymbol *Sym, uint32_t Size) {
Vars.push_back(std::make_tuple(Id, Sym, Size));
}
std::string getName() { return Name; }
- void completeType(BTFDebug &BDebug);
- void emitType(MCStreamer &OS);
+ void completeType(BTFDebug &BDebug) override;
+ void emitType(MCStreamer &OS) override;
};
/// String table.
@@ -249,7 +251,7 @@ class BTFDebug : public DebugHandlerBase {
StringMap<std::vector<std::string>> FileContent;
std::map<std::string, std::unique_ptr<BTFKindDataSec>> DataSecEntries;
std::vector<BTFTypeStruct *> StructTypes;
- std::map<std::string, uint32_t> PatchImms;
+ std::map<const GlobalVariable *, uint32_t> PatchImms;
std::map<StringRef, std::pair<bool, std::vector<BTFTypeDerived *>>>
FixupDerivedTypes;
std::set<const Function *>ProtoFunctions;
@@ -299,11 +301,11 @@ class BTFDebug : public DebugHandlerBase {
void processFuncPrototypes(const Function *);
/// Generate one field relocation record.
- void generateFieldReloc(const MCSymbol *ORSym, DIType *RootTy,
- StringRef AccessPattern);
+ void generatePatchImmReloc(const MCSymbol *ORSym, uint32_t RootId,
+ const GlobalVariable *, bool IsAma);
- /// Populating unprocessed struct type.
- unsigned populateStructType(const DIType *Ty);
+  /// Populate an unprocessed type on demand.
+ unsigned populateType(const DIType *Ty);
/// Process relocation instructions.
void processReloc(const MachineOperand &MO);
diff --git a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
index 75f963b5448a..4d98dc7341d0 100644
--- a/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
+++ b/llvm/lib/Target/BPF/Disassembler/BPFDisassembler.cpp
@@ -126,6 +126,9 @@ static DecodeStatus DecodeGPR32RegisterClass(MCInst &Inst, unsigned RegNo,
static DecodeStatus decodeMemoryOpValue(MCInst &Inst, unsigned Insn,
uint64_t Address, const void *Decoder) {
unsigned Register = (Insn >> 16) & 0xf;
+ if (Register > 11)
+ return MCDisassembler::Fail;
+
Inst.addOperand(MCOperand::createReg(GPRDecoderTable[Register]));
unsigned Offset = (Insn & 0xffff);
Inst.addOperand(MCOperand::createImm(SignExtend32<16>(Offset)));
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
index ba35a175b9a7..9d829ac45a10 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFAsmBackend.cpp
@@ -48,9 +48,6 @@ public:
return false;
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
index 97f0cbd58608..3292c3e5ebb5 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCAsmInfo.h
@@ -17,7 +17,6 @@
#include "llvm/MC/MCAsmInfo.h"
namespace llvm {
-class Target;
class BPFMCAsmInfo : public MCAsmInfo {
public:
@@ -42,6 +41,8 @@ public:
// section will be parsable, but with odd offsets and
// line numbers, etc.
CodePointerSize = 8;
+
+ UseIntegratedAssembler = false;
}
void setDwarfUsesRelocationsAcrossSections(bool enable) {
diff --git a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
index 1a391321f60d..a426a132cf47 100644
--- a/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
+++ b/llvm/lib/Target/BPF/MCTargetDesc/BPFMCTargetDesc.h
@@ -27,11 +27,7 @@ class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
-class StringRef;
class Target;
-class Triple;
-class raw_ostream;
-class raw_pwrite_stream;
MCCodeEmitter *createBPFMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
index cee1954e369b..1e7862c36ea0 100644
--- a/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
+++ b/llvm/lib/Target/Hexagon/AsmParser/HexagonAsmParser.cpp
@@ -116,6 +116,8 @@ class HexagonAsmParser : public MCTargetAsmParser {
bool ParseDirectiveFalign(unsigned Size, SMLoc L);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseDirectiveSubsection(SMLoc L);
bool ParseDirectiveComm(bool IsLocal, SMLoc L);
bool RegisterMatchesArch(unsigned MatchNum) const;
@@ -312,6 +314,8 @@ public:
bool iss30_2Imm() const { return true; }
bool iss29_3Imm() const { return true; }
bool iss27_2Imm() const { return CheckImmRange(27, 2, true, true, false); }
+ bool iss10_0Imm() const { return CheckImmRange(10, 0, true, false, false); }
+ bool iss10_6Imm() const { return CheckImmRange(10, 6, true, false, false); }
bool iss9_0Imm() const { return CheckImmRange(9, 0, true, false, false); }
bool iss8_0Imm() const { return CheckImmRange(8, 0, true, false, false); }
bool iss8_0Imm64() const { return CheckImmRange(8, 0, true, true, false); }
@@ -467,13 +471,16 @@ bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
LLVM_DEBUG(dbgs() << "--\n");
MCB.setLoc(IDLoc);
+
// Check the bundle for errors.
const MCRegisterInfo *RI = getContext().getRegisterInfo();
- HexagonMCChecker Check(getContext(), MII, getSTI(), MCB, *RI);
+ MCSubtargetInfo const &STI = getSTI();
+
+ MCInst OrigBundle = MCB;
+ HexagonMCChecker Check(getContext(), MII, STI, MCB, *RI, true);
- bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(MII, getSTI(),
- getContext(), MCB,
- &Check);
+ bool CheckOk = HexagonMCInstrInfo::canonicalizePacket(
+ MII, STI, getContext(), MCB, &Check, true);
if (CheckOk) {
if (HexagonMCInstrInfo::bundleSize(MCB) == 0) {
@@ -482,15 +489,12 @@ bool HexagonAsmParser::finishBundle(SMLoc IDLoc, MCStreamer &Out) {
// Empty packets are valid yet aren't emitted
return false;
}
- Out.EmitInstruction(MCB, getSTI());
- } else {
- // If compounding and duplexing didn't reduce the size below
- // 4 or less we have a packet that is too big.
- if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) {
- Error(IDLoc, "invalid instruction packet: out of slots");
- }
+
+ assert(HexagonMCInstrInfo::isBundle(MCB));
+
+ Out.emitInstruction(MCB, STI);
+ } else
return true; // Error
- }
return false; // No error
}
@@ -518,6 +522,8 @@ bool HexagonAsmParser::matchBundleOptions() {
HexagonMCInstrInfo::setMemReorderDisabled(MCB);
else
return getParser().Error(IDLoc, MemNoShuffMsg);
+ } else if (Option.compare_lower("mem_no_order") == 0) {
+ // Nothing.
} else
return getParser().Error(IDLoc, llvm::Twine("'") + Option +
"' is not a valid bundle option");
@@ -578,6 +584,7 @@ bool HexagonAsmParser::matchOneInstruction(MCInst &MCI, SMLoc IDLoc,
case Match_MnemonicFail:
return Error(IDLoc, "unrecognized instruction");
case Match_InvalidOperand:
+ LLVM_FALLTHROUGH;
case Match_InvalidTiedOperand:
SMLoc ErrorLoc = IDLoc;
if (ErrorInfo != ~0U) {
@@ -937,8 +944,8 @@ bool HexagonAsmParser::isLabel(AsmToken &Token) {
assert(Second.is(AsmToken::Colon));
StringRef Raw(String.data(), Third.getString().data() - String.data() +
Third.getString().size());
- std::string Collapsed = Raw;
- Collapsed.erase(llvm::remove_if(Collapsed, isspace), Collapsed.end());
+ std::string Collapsed = std::string(Raw);
+ Collapsed.erase(llvm::remove_if(Collapsed, isSpace), Collapsed.end());
StringRef Whole = Collapsed;
std::pair<StringRef, StringRef> DotSplit = Whole.split('.');
if (!matchRegister(DotSplit.first.lower()))
@@ -959,6 +966,12 @@ bool HexagonAsmParser::handleNoncontigiousRegister(bool Contigious,
bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success;
+}
+
+OperandMatchResultTy HexagonAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
MCAsmLexer &Lexer = getLexer();
StartLoc = getLexer().getLoc();
SmallVector<AsmToken, 5> Lookahead;
@@ -983,8 +996,8 @@ bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
Again = (Contigious && Type) || (Workaround && Type);
NeededWorkaround = NeededWorkaround || (Again && !(Contigious && Type));
}
- std::string Collapsed = RawString;
- Collapsed.erase(llvm::remove_if(Collapsed, isspace), Collapsed.end());
+ std::string Collapsed = std::string(RawString);
+ Collapsed.erase(llvm::remove_if(Collapsed, isSpace), Collapsed.end());
StringRef FullString = Collapsed;
std::pair<StringRef, StringRef> DotSplit = FullString.split('.');
unsigned DotReg = matchRegister(DotSplit.first.lower());
@@ -993,8 +1006,8 @@ bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
RegNo = DotReg;
EndLoc = Lexer.getLoc();
if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
- return true;
- return false;
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
} else {
RegNo = DotReg;
size_t First = RawString.find('.');
@@ -1002,28 +1015,26 @@ bool HexagonAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
Lexer.UnLex(AsmToken(AsmToken::Identifier, DotString));
EndLoc = Lexer.getLoc();
if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
- return true;
- return false;
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
}
}
std::pair<StringRef, StringRef> ColonSplit = StringRef(FullString).split(':');
unsigned ColonReg = matchRegister(ColonSplit.first.lower());
if (ColonReg != Hexagon::NoRegister && RegisterMatchesArch(DotReg)) {
do {
- Lexer.UnLex(Lookahead.back());
- Lookahead.pop_back();
- } while (!Lookahead.empty () && !Lexer.is(AsmToken::Colon));
+ Lexer.UnLex(Lookahead.pop_back_val());
+ } while (!Lookahead.empty() && !Lexer.is(AsmToken::Colon));
RegNo = ColonReg;
EndLoc = Lexer.getLoc();
if (handleNoncontigiousRegister(!NeededWorkaround, StartLoc))
- return true;
- return false;
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
}
while (!Lookahead.empty()) {
- Lexer.UnLex(Lookahead.back());
- Lookahead.pop_back();
+ Lexer.UnLex(Lookahead.pop_back_val());
}
- return true;
+ return MatchOperand_NoMatch;
}
bool HexagonAsmParser::implicitExpressionLocation(OperandVector &Operands) {
@@ -1283,9 +1294,28 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
SMLoc IDLoc) {
MCContext &Context = getParser().getContext();
const MCRegisterInfo *RI = getContext().getRegisterInfo();
- std::string r = "r";
- std::string v = "v";
- std::string Colon = ":";
+ const std::string r = "r";
+ const std::string v = "v";
+ const std::string Colon = ":";
+ using RegPairVals = std::pair<unsigned, unsigned>;
+ auto GetRegPair = [this, r](RegPairVals RegPair) {
+ const std::string R1 = r + utostr(RegPair.first);
+ const std::string R2 = r + utostr(RegPair.second);
+
+ return std::make_pair(matchRegister(R1), matchRegister(R2));
+ };
+ auto GetScalarRegs = [RI, GetRegPair](unsigned RegPair) {
+ const unsigned Lower = RI->getEncodingValue(RegPair);
+ const RegPairVals RegPair_ = std::make_pair(Lower + 1, Lower);
+
+ return GetRegPair(RegPair_);
+ };
+ auto GetVecRegs = [GetRegPair](unsigned VecRegPair) {
+ const RegPairVals RegPair =
+ HexagonMCInstrInfo::GetVecRegPairIndices(VecRegPair);
+
+ return GetRegPair(RegPair);
+ };
bool is32bit = false; // used to distinguish between CONST32 and CONST64
switch (Inst.getOpcode()) {
@@ -1377,14 +1407,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
// Translate a "$Rdd = $Rss" to "$Rdd = combine($Rs, $Rt)"
case Hexagon::A2_tfrp: {
MCOperand &MO = Inst.getOperand(1);
- unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = r + utostr(RegPairNum + 1);
- StringRef Reg1(R1);
- MO.setReg(matchRegister(Reg1));
- // Add a new operand for the second register in the pair.
- std::string R2 = r + utostr(RegPairNum);
- StringRef Reg2(R2);
- Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ const std::pair<unsigned, unsigned> RegPair = GetScalarRegs(MO.getReg());
+ MO.setReg(RegPair.first);
+ Inst.addOperand(MCOperand::createReg(RegPair.second));
Inst.setOpcode(Hexagon::A2_combinew);
break;
}
@@ -1392,14 +1417,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
case Hexagon::A2_tfrpt:
case Hexagon::A2_tfrpf: {
MCOperand &MO = Inst.getOperand(2);
- unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = r + utostr(RegPairNum + 1);
- StringRef Reg1(R1);
- MO.setReg(matchRegister(Reg1));
- // Add a new operand for the second register in the pair.
- std::string R2 = r + utostr(RegPairNum);
- StringRef Reg2(R2);
- Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ const std::pair<unsigned, unsigned> RegPair = GetScalarRegs(MO.getReg());
+ MO.setReg(RegPair.first);
+ Inst.addOperand(MCOperand::createReg(RegPair.second));
Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrpt)
? Hexagon::C2_ccombinewt
: Hexagon::C2_ccombinewf);
@@ -1408,14 +1428,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
case Hexagon::A2_tfrptnew:
case Hexagon::A2_tfrpfnew: {
MCOperand &MO = Inst.getOperand(2);
- unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = r + utostr(RegPairNum + 1);
- StringRef Reg1(R1);
- MO.setReg(matchRegister(Reg1));
- // Add a new operand for the second register in the pair.
- std::string R2 = r + utostr(RegPairNum);
- StringRef Reg2(R2);
- Inst.addOperand(MCOperand::createReg(matchRegister(Reg2)));
+ const std::pair<unsigned, unsigned> RegPair = GetScalarRegs(MO.getReg());
+ MO.setReg(RegPair.first);
+ Inst.addOperand(MCOperand::createReg(RegPair.second));
Inst.setOpcode((Inst.getOpcode() == Hexagon::A2_tfrptnew)
? Hexagon::C2_ccombinewnewt
: Hexagon::C2_ccombinewnewf);
@@ -1425,12 +1440,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
// Translate a "$Vdd = $Vss" to "$Vdd = vcombine($Vs, $Vt)"
case Hexagon::V6_vassignp: {
MCOperand &MO = Inst.getOperand(1);
- unsigned int RegPairNum = RI->getEncodingValue(MO.getReg());
- std::string R1 = v + utostr(RegPairNum + 1);
- MO.setReg(MatchRegisterName(R1));
- // Add a new operand for the second register in the pair.
- std::string R2 = v + utostr(RegPairNum);
- Inst.addOperand(MCOperand::createReg(MatchRegisterName(R2)));
+ const std::pair<unsigned, unsigned> RegPair = GetVecRegs(MO.getReg());
+ MO.setReg(RegPair.first);
+ Inst.addOperand(MCOperand::createReg(RegPair.second));
Inst.setOpcode(Hexagon::V6_vcombine);
break;
}
@@ -1485,7 +1497,7 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
MES->SwitchSection(mySection);
unsigned byteSize = is32bit ? 4 : 8;
- getStreamer().EmitCodeAlignment(byteSize, byteSize);
+ getStreamer().emitCodeAlignment(byteSize, byteSize);
MCSymbol *Sym;
@@ -1495,9 +1507,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (Absolute) {
Sym = getContext().getOrCreateSymbol(StringRef(myCharStr.c_str() + 16));
if (Sym->isUndefined()) {
- getStreamer().EmitLabel(Sym);
- getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
- getStreamer().EmitIntValue(Value, byteSize);
+ getStreamer().emitLabel(Sym);
+ getStreamer().emitSymbolAttribute(Sym, MCSA_Global);
+ getStreamer().emitIntValue(Value, byteSize);
}
} else if (MO_1.isExpr()) {
const char *StringStart = nullptr;
@@ -1517,9 +1529,9 @@ int HexagonAsmParser::processInstruction(MCInst &Inst,
if (Sym->isUndefined()) {
// case where symbol is not yet defined: emit symbol
- getStreamer().EmitLabel(Sym);
- getStreamer().EmitSymbolAttribute(Sym, MCSA_Local);
- getStreamer().EmitValue(MO_1.getExpr(), 4);
+ getStreamer().emitLabel(Sym);
+ getStreamer().emitSymbolAttribute(Sym, MCSA_Local);
+ getStreamer().emitValue(MO_1.getExpr(), 4);
}
} else
llvm_unreachable("unexpected type of machine operand!");
diff --git a/llvm/lib/Target/Hexagon/BitTracker.cpp b/llvm/lib/Target/Hexagon/BitTracker.cpp
index 8a07b991ff5a..7ef23ef35a74 100644
--- a/llvm/lib/Target/Hexagon/BitTracker.cpp
+++ b/llvm/lib/Target/Hexagon/BitTracker.cpp
@@ -954,6 +954,9 @@ void BT::visitBranchesFrom(const MachineInstr &BI) {
++It;
} while (FallsThrough && It != End);
+ if (B.mayHaveInlineAsmBr())
+ DefaultToAll = true;
+
if (!DefaultToAll) {
// Need to add all CFG successors that lead to EH landing pads.
// There won't be explicit branches to these blocks, but they must
diff --git a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
index 7a90d585eb9a..f3a87ef20a60 100644
--- a/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
+++ b/llvm/lib/Target/Hexagon/Disassembler/HexagonDisassembler.cpp
@@ -185,7 +185,10 @@ DecodeStatus HexagonDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
return Result;
if (Size > HEXAGON_MAX_PACKET_SIZE)
return MCDisassembler::Fail;
- HexagonMCChecker Checker(getContext(), *MCII, STI, MI,
+
+ const auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI);
+ const auto STI_ = (ArchSTI != nullptr) ? *ArchSTI : STI;
+ HexagonMCChecker Checker(getContext(), *MCII, STI_, MI,
*getContext().getRegisterInfo(), false);
if (!Checker.check())
return MCDisassembler::Fail;
@@ -495,9 +498,13 @@ DecodeStatus HexagonDisassembler::getSingleInstruction(MCInst &MI, MCInst &MCB,
} else if (HexagonMCInstrInfo::hasNewValue(*MCII, Inst)) {
unsigned Producer =
HexagonMCInstrInfo::getNewValueOperand(*MCII, Inst).getReg();
- if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
- Producer = ((Producer - Hexagon::W0) << 1) + SubregBit + Hexagon::V0;
- else if (SubregBit)
+
+ if (HexagonMCInstrInfo::IsVecRegPair(Producer)) {
+ const bool Rev = HexagonMCInstrInfo::IsReverseVecRegPair(Producer);
+ const unsigned ProdPairIndex =
+ Rev ? Producer - Hexagon::WR0 : Producer - Hexagon::W0;
+ Producer = (ProdPairIndex << 1) + SubregBit + Hexagon::V0;
+ } else if (SubregBit)
// Hexagon PRM 10.11 New-value operands
// Nt[0] is reserved and should always be encoded as zero.
return MCDisassembler::Fail;
@@ -603,12 +610,16 @@ static DecodeStatus DecodeHvxWRRegisterClass(MCInst &Inst, unsigned RegNo,
uint64_t /*Address*/,
const void *Decoder) {
static const MCPhysReg HvxWRDecoderTable[] = {
- Hexagon::W0, Hexagon::W1, Hexagon::W2, Hexagon::W3,
- Hexagon::W4, Hexagon::W5, Hexagon::W6, Hexagon::W7,
- Hexagon::W8, Hexagon::W9, Hexagon::W10, Hexagon::W11,
- Hexagon::W12, Hexagon::W13, Hexagon::W14, Hexagon::W15};
+ Hexagon::W0, Hexagon::WR0, Hexagon::W1, Hexagon::WR1, Hexagon::W2,
+ Hexagon::WR2, Hexagon::W3, Hexagon::WR3, Hexagon::W4, Hexagon::WR4,
+ Hexagon::W5, Hexagon::WR5, Hexagon::W6, Hexagon::WR6, Hexagon::W7,
+ Hexagon::WR7, Hexagon::W8, Hexagon::WR8, Hexagon::W9, Hexagon::WR9,
+ Hexagon::W10, Hexagon::WR10, Hexagon::W11, Hexagon::WR11, Hexagon::W12,
+ Hexagon::WR12, Hexagon::W13, Hexagon::WR13, Hexagon::W14, Hexagon::WR14,
+ Hexagon::W15, Hexagon::WR15,
+ };
- return (DecodeRegisterClass(Inst, RegNo >> 1, HvxWRDecoderTable));
+ return DecodeRegisterClass(Inst, RegNo, HvxWRDecoderTable);
}
LLVM_ATTRIBUTE_UNUSED // Suppress warning temporarily.
diff --git a/llvm/lib/Target/Hexagon/Hexagon.td b/llvm/lib/Target/Hexagon/Hexagon.td
index 26869391c7a3..2fadb0b5ddc4 100644
--- a/llvm/lib/Target/Hexagon/Hexagon.td
+++ b/llvm/lib/Target/Hexagon/Hexagon.td
@@ -23,6 +23,9 @@ include "llvm/Target/Target.td"
// Hexagon Architectures
include "HexagonDepArch.td"
+def ProcTinyCore: SubtargetFeature<"tinycore", "HexagonProcFamily",
+ "TinyCore", "Hexagon Tiny Core">;
+
// Hexagon ISA Extensions
def ExtensionZReg: SubtargetFeature<"zreg", "UseZRegOps", "true",
"Hexagon ZReg extension instructions">;
@@ -42,14 +45,25 @@ def ExtensionHVXV66: SubtargetFeature<"hvxv66", "HexagonHVXVersion",
"Hexagon::ArchEnum::V66", "Hexagon HVX instructions",
[ExtensionHVX, ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65,
ExtensionZReg]>;
+def ExtensionHVXV67: SubtargetFeature<"hvxv67", "HexagonHVXVersion",
+ "Hexagon::ArchEnum::V67", "Hexagon HVX instructions",
+ [ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65, ExtensionHVXV66]>;
+
def ExtensionHVX64B: SubtargetFeature<"hvx-length64b", "UseHVX64BOps",
"true", "Hexagon HVX 64B instructions", [ExtensionHVX]>;
def ExtensionHVX128B: SubtargetFeature<"hvx-length128b", "UseHVX128BOps",
"true", "Hexagon HVX 128B instructions", [ExtensionHVX]>;
+def ExtensionAudio: SubtargetFeature<"audio", "UseAudioOps", "true",
+ "Hexagon Audio extension instructions">;
+
+def FeatureCompound: SubtargetFeature<"compound", "UseCompound", "true",
+ "Use compound instructions">;
def FeaturePackets: SubtargetFeature<"packets", "UsePackets", "true",
"Support for instruction packets">;
+def FeaturePreV65: SubtargetFeature<"prev65", "HasPreV65", "true",
+ "Support features deprecated in v65">;
def FeatureLongCalls: SubtargetFeature<"long-calls", "UseLongCalls", "true",
"Use constant-extended calls">;
def FeatureMemNoShuf: SubtargetFeature<"mem_noshuf", "HasMemNoShuf", "false",
@@ -64,6 +78,8 @@ def FeatureSmallData: SubtargetFeature<"small-data", "UseSmallData", "true",
"Allow GP-relative addressing of global variables">;
def FeatureDuplex: SubtargetFeature<"duplex", "EnableDuplex", "true",
"Enable generation of duplex instruction">;
+def FeatureUnsafeFP: SubtargetFeature<"unsafe-fp", "UseUnsafeMath", "true",
+ "Use unsafe FP math">;
def FeatureReservedR19: SubtargetFeature<"reserved-r19", "ReservedR19",
"true", "Reserve register R19">;
def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim",
@@ -76,21 +92,36 @@ def FeatureNoreturnStackElim: SubtargetFeature<"noreturn-stack-elim",
def UseMEMOPS : Predicate<"HST->useMemops()">;
def UseHVX64B : Predicate<"HST->useHVX64BOps()">,
- AssemblerPredicate<"ExtensionHVX64B">;
+ AssemblerPredicate<(all_of ExtensionHVX64B)>;
def UseHVX128B : Predicate<"HST->useHVX128BOps()">,
- AssemblerPredicate<"ExtensionHVX128B">;
+ AssemblerPredicate<(all_of ExtensionHVX128B)>;
def UseHVX : Predicate<"HST->useHVXOps()">,
- AssemblerPredicate<"ExtensionHVXV60">;
-def UseHVXV60 : Predicate<"HST->useHVXOps()">,
- AssemblerPredicate<"ExtensionHVXV60">;
-def UseHVXV62 : Predicate<"HST->useHVXOps()">,
- AssemblerPredicate<"ExtensionHVXV62">;
-def UseHVXV65 : Predicate<"HST->useHVXOps()">,
- AssemblerPredicate<"ExtensionHVXV65">;
-def UseHVXV66 : Predicate<"HST->useHVXOps()">,
- AssemblerPredicate<"ExtensionHVXV66">;
+ AssemblerPredicate<(all_of ExtensionHVXV60)>;
+def UseHVXV60 : Predicate<"HST->useHVXV60Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV60)>;
+def UseHVXV62 : Predicate<"HST->useHVXV62Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV62)>;
+def UseHVXV65 : Predicate<"HST->useHVXV65Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV65)>;
+def UseHVXV66 : Predicate<"HST->useHVXV66Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV66)>;
+def UseHVXV67 : Predicate<"HST->useHVXV67Ops()">,
+ AssemblerPredicate<(all_of ExtensionHVXV67)>;
+def UseAudio : Predicate<"HST->useAudioOps()">,
+ AssemblerPredicate<(all_of ExtensionAudio)>;
def UseZReg : Predicate<"HST->useZRegOps()">,
- AssemblerPredicate<"ExtensionZReg">;
+ AssemblerPredicate<(all_of ExtensionZReg)>;
+def UseCompound : Predicate<"HST->useCompound()">;
+def HasPreV65 : Predicate<"HST->hasPreV65()">,
+ AssemblerPredicate<(all_of FeaturePreV65)>;
+def HasMemNoShuf : Predicate<"HST->hasMemNoShuf()">,
+ AssemblerPredicate<(all_of FeatureMemNoShuf)>;
+def UseUnsafeMath : Predicate<"HST->useUnsafeMath()">;
+def NotOptTinyCore : Predicate<"!HST->isTinyCore() ||"
+ "MF->getFunction().hasOptSize()"> {
+ let RecomputePerFunction = 1;
+}
+def UseSmallData : Predicate<"HST->useSmallData()">;
def Hvx64: HwMode<"+hvx-length64b">;
def Hvx128: HwMode<"+hvx-length128b">;
@@ -99,6 +130,7 @@ def Hvx128: HwMode<"+hvx-length128b">;
// Classes used for relation maps.
//===----------------------------------------------------------------------===//
+// The classes below should remain in hierarchical order...
class ImmRegShl;
// ImmRegRel - Filter class used to relate instructions having reg-reg form
// with their reg-imm counterparts.
@@ -106,17 +138,14 @@ class ImmRegRel;
// PredRel - Filter class used to relate non-predicated instructions with their
// predicated forms.
class PredRel;
-// PredNewRel - Filter class used to relate predicated instructions with their
-// predicate-new forms.
class PredNewRel: PredRel;
// NewValueRel - Filter class used to relate regular store instructions with
// their new-value store form.
class NewValueRel: PredNewRel;
-// NewValueRel - Filter class used to relate load/store instructions having
-// different addressing modes with each other.
class AddrModeRel: NewValueRel;
class PostInc_BaseImm;
class IntrinsicsRel;
+// ... through here.
//===----------------------------------------------------------------------===//
// Generate mapping table to relate non-predicate instructions with their
@@ -335,31 +364,43 @@ class Proc<string Name, SchedMachineModel Model,
def : Proc<"generic", HexagonModelV60,
[ArchV5, ArchV55, ArchV60,
- FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
- FeaturePackets, FeatureSmallData]>;
+ FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv5", HexagonModelV5,
[ArchV5,
- FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
- FeaturePackets, FeatureSmallData]>;
+ FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv55", HexagonModelV55,
[ArchV5, ArchV55,
- FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
- FeaturePackets, FeatureSmallData]>;
+ FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv60", HexagonModelV60,
[ArchV5, ArchV55, ArchV60,
- FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
- FeaturePackets, FeatureSmallData]>;
+ FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv62", HexagonModelV62,
[ArchV5, ArchV55, ArchV60, ArchV62,
- FeatureDuplex, FeatureMemops, FeatureNVJ, FeatureNVS,
- FeaturePackets, FeatureSmallData]>;
+ FeatureCompound, FeatureDuplex, FeaturePreV65, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv65", HexagonModelV65,
[ArchV5, ArchV55, ArchV60, ArchV62, ArchV65,
- FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
- FeatureNVS, FeaturePackets, FeatureSmallData]>;
+ FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
def : Proc<"hexagonv66", HexagonModelV66,
[ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66,
- FeatureDuplex, FeatureMemNoShuf, FeatureMemops, FeatureNVJ,
+ FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
+def : Proc<"hexagonv67", HexagonModelV67,
+ [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67,
+ FeatureCompound, FeatureDuplex, FeatureMemNoShuf, FeatureMemops,
+ FeatureNVJ, FeatureNVS, FeaturePackets, FeatureSmallData]>;
+// Need to update the correct features for tiny core.
+// Disable NewValueJumps since the packetizer is unable to handle a packet with
+// a new value jump and another SLOT0 instruction.
+def : Proc<"hexagonv67t", HexagonModelV67T,
+ [ArchV5, ArchV55, ArchV60, ArchV62, ArchV65, ArchV66, ArchV67,
+ ProcTinyCore, ExtensionAudio,
+ FeatureCompound, FeatureMemNoShuf, FeatureMemops,
FeatureNVS, FeaturePackets, FeatureSmallData]>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Hexagon/HexagonArch.h b/llvm/lib/Target/Hexagon/HexagonArch.h
new file mode 100644
index 000000000000..e5d528390c51
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonArch.h
@@ -0,0 +1,37 @@
+//===- HexagonArch.h ------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/StringRef.h"
+#include "HexagonDepArch.h"
+#include <algorithm>
+
+namespace llvm {
+namespace Hexagon {
+
+template <class ArchCont, typename Val>
+bool ValidArch(ArchCont const &ArchList, Val HexArch) {
+ return std::any_of(std::begin(ArchList), std::end(ArchList),
+ [HexArch](Val V) { return V == HexArch; });
+}
+
+template <class ArchCont, typename Val>
+llvm::Optional<ArchEnum> GetCpu(ArchCont const &ArchList, Val CPUString) {
+ llvm::Optional<ArchEnum> Res;
+ auto Entry = ArchList.find(CPUString);
+ if (Entry != ArchList.end())
+ Res = Entry->second;
+ return Res;
+}
+} // namespace Hexagon
+} // namespace llvm
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONARCH_H
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
index 30fdde70d01a..f3017d02995e 100644
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.cpp
@@ -206,10 +206,10 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
Sym = AP.OutContext.getOrCreateSymbol(Twine(symbolName));
if (Sym->isUndefined()) {
- OutStreamer.EmitLabel(Sym);
- OutStreamer.EmitSymbolAttribute(Sym, MCSA_Global);
- OutStreamer.EmitIntValue(Value, AlignSize);
- OutStreamer.EmitCodeAlignment(AlignSize);
+ OutStreamer.emitLabel(Sym);
+ OutStreamer.emitSymbolAttribute(Sym, MCSA_Global);
+ OutStreamer.emitIntValue(Value, AlignSize);
+ OutStreamer.emitCodeAlignment(AlignSize);
}
} else {
assert(Imm.isExpr() && "Expected expression and found none");
@@ -234,10 +234,10 @@ static MCSymbol *smallData(AsmPrinter &AP, const MachineInstr &MI,
OutStreamer.SwitchSection(Section);
Sym = AP.OutContext.getOrCreateSymbol(Twine(LitaName));
if (Sym->isUndefined()) {
- OutStreamer.EmitLabel(Sym);
- OutStreamer.EmitSymbolAttribute(Sym, MCSA_Local);
- OutStreamer.EmitValue(Imm.getExpr(), AlignSize);
- OutStreamer.EmitCodeAlignment(AlignSize);
+ OutStreamer.emitLabel(Sym);
+ OutStreamer.emitSymbolAttribute(Sym, MCSA_Local);
+ OutStreamer.emitValue(Imm.getExpr(), AlignSize);
+ OutStreamer.emitCodeAlignment(AlignSize);
}
}
return Sym;
@@ -740,7 +740,7 @@ void HexagonAsmPrinter::HexagonProcessInstruction(MCInst &Inst,
}
/// Print out a single Hexagon MI to the current output stream.
-void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void HexagonAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst MCB;
MCB.setOpcode(Hexagon::BUNDLE);
MCB.addOperand(MCOperand::createImm(0));
@@ -768,7 +768,7 @@ void HexagonAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert(Ok); (void)Ok;
if (HexagonMCInstrInfo::bundleSize(MCB) == 0)
return;
- OutStreamer->EmitInstruction(MCB, getSubtargetInfo());
+ OutStreamer->emitInstruction(MCB, getSubtargetInfo());
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeHexagonAsmPrinter() {
diff --git a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
index 6c4b664e83f5..3932def87854 100755
--- a/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
+++ b/llvm/lib/Target/Hexagon/HexagonAsmPrinter.h
@@ -46,7 +46,7 @@ class TargetMachine;
bool isBlockOnlyReachableByFallthrough(const MachineBasicBlock *MBB)
const override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
void HexagonProcessInstruction(MCInst &Inst, const MachineInstr &MBB);
void printOperand(const MachineInstr *MI, unsigned OpNo, raw_ostream &O);
diff --git a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
index 799b85ed48b4..49edb0d99492 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitSimplify.cpp
@@ -1433,10 +1433,16 @@ unsigned ConstGeneration::genTfrConst(const TargetRegisterClass *RC, int64_t C,
.addImm(int32_t(Lo));
return Reg;
}
+ MachineFunction *MF = B.getParent();
+ auto &HST = MF->getSubtarget<HexagonSubtarget>();
- BuildMI(B, At, DL, HII.get(Hexagon::CONST64), Reg)
- .addImm(C);
- return Reg;
+ // Disable CONST64 for tiny core since it takes a LD resource.
+ if (!HST.isTinyCore() ||
+ MF->getFunction().hasOptSize()) {
+ BuildMI(B, At, DL, HII.get(Hexagon::CONST64), Reg)
+ .addImm(C);
+ return Reg;
+ }
}
if (RC == &Hexagon::PredRegsRegClass) {
diff --git a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
index ebd060ce503e..1e4030b84bc1 100644
--- a/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBitTracker.cpp
@@ -330,7 +330,7 @@ bool HexagonEvaluator::evaluate(const MachineInstr &MI,
case PS_fi: {
int FI = op(1).getIndex();
int Off = op(2).getImm();
- unsigned A = MFI.getObjectAlignment(FI) + std::abs(Off);
+ unsigned A = MFI.getObjectAlign(FI).value() + std::abs(Off);
unsigned L = countTrailingZeros(A);
RegisterCell RC = RegisterCell::self(Reg[0].Reg, W0);
RC.fill(0, L, BT::BitValue::Zero);
diff --git a/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp b/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
index 08f740806879..6891455631a8 100644
--- a/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonBranchRelaxation.cpp
@@ -105,7 +105,7 @@ void HexagonBranchRelaxation::computeOffset(MachineFunction &MF,
// offset of the current instruction from the start.
unsigned InstOffset = 0;
for (auto &B : MF) {
- if (B.getAlignment() != Align::None()) {
+ if (B.getAlignment() != Align(1)) {
// Although we don't know the exact layout of the final code, we need
// to account for alignment padding somehow. This heuristic pads each
// aligned basic block according to the alignment value.
diff --git a/llvm/lib/Target/Hexagon/HexagonCallingConv.td b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
index 5c31a81a1e87..93e17e608dd1 100644
--- a/llvm/lib/Target/Hexagon/HexagonCallingConv.td
+++ b/llvm/lib/Target/Hexagon/HexagonCallingConv.td
@@ -18,7 +18,7 @@ def CC_HexagonStack: CallingConv<[
CCAssignToStack<8,8>>
]>;
-def CC_Hexagon: CallingConv<[
+def CC_Hexagon_Legacy: CallingConv<[
CCIfType<[i1,i8,i16],
CCPromoteToType<i32>>,
CCIfType<[f32],
@@ -48,6 +48,36 @@ def CC_Hexagon: CallingConv<[
CCDelegateTo<CC_HexagonStack>
]>;
+def CC_Hexagon: CallingConv<[
+ CCIfType<[i1,i8,i16],
+ CCPromoteToType<i32>>,
+ CCIfType<[f32],
+ CCBitConvertToType<i32>>,
+ CCIfType<[f64],
+ CCBitConvertToType<i64>>,
+
+ CCIfByVal<
+ CCPassByVal<8,1>>,
+ CCIfArgIsVarArg<
+ CCDelegateTo<CC_HexagonStack>>,
+
+ // Pass split values in pairs, allocate odd register if necessary.
+ CCIfType<[i32],
+ CCIfSplit<
+ CCCustom<"CC_SkipOdd">>>,
+
+ CCIfType<[i32,v2i16,v4i8],
+ CCAssignToReg<[R0,R1,R2,R3,R4,R5]>>,
+ // Make sure to allocate any skipped 32-bit register, so it does not get
+ // allocated to a subsequent 32-bit value.
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCCustom<"CC_SkipOdd">>,
+ CCIfType<[i64,v2i32,v4i16,v8i8],
+ CCAssignToReg<[D0,D1,D2]>>,
+
+ CCDelegateTo<CC_HexagonStack>
+]>;
+
def RetCC_Hexagon: CallingConv<[
CCIfType<[i1,i8,i16],
CCPromoteToType<i32>>,
diff --git a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
index 6d2aadb066cf..6a5192c866cc 100644
--- a/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCommonGEP.cpp
@@ -204,17 +204,7 @@ namespace {
Type *next_type(Type *Ty, Value *Idx) {
if (auto *PTy = dyn_cast<PointerType>(Ty))
return PTy->getElementType();
- // Advance the type.
- if (!Ty->isStructTy()) {
- Type *NexTy = cast<SequentialType>(Ty)->getElementType();
- return NexTy;
- }
- // Otherwise it is a struct type.
- ConstantInt *CI = dyn_cast<ConstantInt>(Idx);
- assert(CI && "Struct type with non-constant index");
- int64_t i = CI->getValue().getSExtValue();
- Type *NextTy = cast<StructType>(Ty)->getElementType(i);
- return NextTy;
+ return GetElementPtrInst::getTypeAtIndex(Ty, Idx);
}
raw_ostream &operator<< (raw_ostream &OS, const GepNode &GN) {
@@ -1302,7 +1292,8 @@ bool HexagonCommonGEP::runOnFunction(Function &F) {
#ifdef EXPENSIVE_CHECKS
// Run this only when expensive checks are enabled.
- verifyFunction(F);
+ if (verifyFunction(F, &dbgs()))
+ report_fatal_error("Broken function");
#endif
return true;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
index aa9a715718bf..05b95d8b7314 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstExtenders.cpp
@@ -379,6 +379,7 @@ namespace {
using AssignmentMap = std::map<ExtenderInit, IndexList>;
using LocDefList = std::vector<std::pair<Loc, IndexList>>;
+ const HexagonSubtarget *HST = nullptr;
const HexagonInstrInfo *HII = nullptr;
const HexagonRegisterInfo *HRI = nullptr;
MachineDominatorTree *MDT = nullptr;
@@ -1562,13 +1563,31 @@ HCE::Register HCE::insertInitializer(Loc DefL, const ExtenderInit &ExtI) {
.add(ExtOp);
}
} else {
- unsigned NewOpc = Ex.Neg ? Hexagon::S4_subi_asl_ri
- : Hexagon::S4_addi_asl_ri;
- // DefR = add(##EV,asl(Rb,S))
- InitI = BuildMI(MBB, At, dl, HII->get(NewOpc), DefR)
- .add(ExtOp)
- .add(MachineOperand(Ex.Rs))
- .addImm(Ex.S);
+ if (HST->useCompound()) {
+ unsigned NewOpc = Ex.Neg ? Hexagon::S4_subi_asl_ri
+ : Hexagon::S4_addi_asl_ri;
+ // DefR = add(##EV,asl(Rb,S))
+ InitI = BuildMI(MBB, At, dl, HII->get(NewOpc), DefR)
+ .add(ExtOp)
+ .add(MachineOperand(Ex.Rs))
+ .addImm(Ex.S);
+ } else {
+ // No compounds are available. It is not clear whether we should
+ // even process such extenders where the initializer cannot be
+ // a single instruction, but do it for now.
+ unsigned TmpR = MRI->createVirtualRegister(&Hexagon::IntRegsRegClass);
+ BuildMI(MBB, At, dl, HII->get(Hexagon::S2_asl_i_r), TmpR)
+ .add(MachineOperand(Ex.Rs))
+ .addImm(Ex.S);
+ if (Ex.Neg)
+ InitI = BuildMI(MBB, At, dl, HII->get(Hexagon::A2_subri), DefR)
+ .add(ExtOp)
+ .add(MachineOperand(Register(TmpR, 0)));
+ else
+ InitI = BuildMI(MBB, At, dl, HII->get(Hexagon::A2_addi), DefR)
+ .add(MachineOperand(Register(TmpR, 0)))
+ .add(ExtOp);
+ }
}
}
@@ -1952,8 +1971,9 @@ bool HCE::runOnMachineFunction(MachineFunction &MF) {
}
LLVM_DEBUG(MF.print(dbgs() << "Before " << getPassName() << '\n', nullptr));
- HII = MF.getSubtarget<HexagonSubtarget>().getInstrInfo();
- HRI = MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
+ HST = &MF.getSubtarget<HexagonSubtarget>();
+ HII = HST->getInstrInfo();
+ HRI = HST->getRegisterInfo();
MDT = &getAnalysis<MachineDominatorTree>();
MRI = &MF.getRegInfo();
AssignmentMap IMap;
diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
index 5b61d1084e08..77578378b058 100644
--- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp
@@ -754,6 +754,9 @@ void MachineConstPropagator::visitBranchesFrom(const MachineInstr &BrI) {
++It;
}
+ if (B.mayHaveInlineAsmBr())
+ EvalOk = false;
+
if (EvalOk) {
// Need to add all CFG successors that lead to EH landing pads.
// There won't be explicit branches to these blocks, but they must
@@ -810,8 +813,12 @@ void MachineConstPropagator::visitUsesOf(unsigned Reg) {
bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
SetVector<const MachineBasicBlock*> &Targets) {
+ Targets.clear();
+
MachineBasicBlock::const_iterator FirstBr = MB->end();
for (const MachineInstr &MI : *MB) {
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return false;
if (MI.isDebugInstr())
continue;
if (MI.isBranch()) {
@@ -820,7 +827,6 @@ bool MachineConstPropagator::computeBlockSuccessors(const MachineBasicBlock *MB,
}
}
- Targets.clear();
MachineBasicBlock::const_iterator End = MB->end();
bool DoNext = true;
@@ -2836,6 +2842,9 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
if (MI.isCopy())
return false;
+ MachineFunction *MF = MI.getParent()->getParent();
+ auto &HST = MF->getSubtarget<HexagonSubtarget>();
+
// Collect all virtual register-def operands.
SmallVector<unsigned,2> DefRegs;
for (const MachineOperand &MO : MI.operands()) {
@@ -2923,11 +2932,13 @@ bool HexagonConstEvaluator::rewriteHexConstDefs(MachineInstr &MI,
NewMI = BuildMI(B, At, DL, *NewD, NewR)
.addImm(Hi)
.addImm(Lo);
- } else {
+ } else if (MF->getFunction().hasOptSize() || !HST.isTinyCore()) {
+ // Disable CONST64 for tiny core since it takes a LD resource.
NewD = &HII.get(Hexagon::CONST64);
NewMI = BuildMI(B, At, DL, *NewD, NewR)
.addImm(V);
- }
+ } else
+ return false;
}
}
(void)NewMI;
diff --git a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
index 394a329ac447..587527d8c32c 100644
--- a/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonCopyToCombine.cpp
@@ -21,7 +21,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/PassSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CodeGen.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
@@ -212,7 +212,7 @@ static bool areCombinableOperations(const TargetRegisterInfo *TRI,
// There is a combine of two constant extended values into CONST64,
// provided both constants are true immediates.
if (isGreaterThanNBitTFRI<16>(HighRegInst) &&
- isGreaterThanNBitTFRI<16>(LowRegInst))
+ isGreaterThanNBitTFRI<16>(LowRegInst) && !IsConst64Disabled)
return (HighRegInst.getOperand(1).isImm() &&
LowRegInst.getOperand(1).isImm());
@@ -279,11 +279,11 @@ bool HexagonCopyToCombine::isSafeToMoveTogether(MachineInstr &I1,
// A reverse_iterator instantiated like below starts before I2, and I1
// respectively.
// Look at instructions I in between I2 and (excluding) I1.
- MachineBasicBlock::reverse_iterator I(I2),
- End = --(MachineBasicBlock::reverse_iterator(I1));
+ MachineBasicBlock::reverse_iterator I = ++I2.getIterator().getReverse();
+ MachineBasicBlock::reverse_iterator End = I1.getIterator().getReverse();
  // At O3 we got better results (dhrystone!) by being more conservative.
if (!ShouldCombineAggressively)
- End = MachineBasicBlock::reverse_iterator(I1);
+ End = ++I1.getIterator().getReverse();
// If I2 kills its operand and we move I2 over an instruction that also
// uses I2's use reg we need to modify that (first) instruction to now kill
// this reg.
@@ -477,6 +477,10 @@ bool HexagonCopyToCombine::runOnMachineFunction(MachineFunction &MF) {
ShouldCombineAggressively =
MF.getTarget().getOptLevel() <= CodeGenOpt::Default;
+ // Disable CONST64 for tiny core since it takes a LD resource.
+ if (!OptForSize && ST->isTinyCore())
+ IsConst64Disabled = true;
+
// Traverse basic blocks.
for (MachineFunction::iterator BI = MF.begin(), BE = MF.end(); BI != BE;
++BI) {
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.h b/llvm/lib/Target/Hexagon/HexagonDepArch.h
index 529be7ef0ac7..45b4cf042443 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.h
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.h
@@ -5,15 +5,44 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-#ifndef HEXAGON_DEP_ARCH_H
-#define HEXAGON_DEP_ARCH_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
+
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/StringRef.h"
+#include <map>
+
namespace llvm {
namespace Hexagon {
-enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66 };
+enum class ArchEnum { NoArch, Generic, V5, V55, V60, V62, V65, V66, V67 };
+
+static constexpr unsigned ArchValsNumArray[] = {5, 55, 60, 62, 65, 66, 67};
+static constexpr ArrayRef<unsigned> ArchValsNum(ArchValsNumArray);
+
+static constexpr StringLiteral ArchValsTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67" };
+static constexpr ArrayRef<StringLiteral> ArchValsText(ArchValsTextArray);
+
+static constexpr StringLiteral CpuValsTextArray[] = { "hexagonv5", "hexagonv55", "hexagonv60", "hexagonv62", "hexagonv65", "hexagonv66", "hexagonv67", "hexagonv67t" };
+static constexpr ArrayRef<StringLiteral> CpuValsText(CpuValsTextArray);
+
+static constexpr StringLiteral CpuNickTextArray[] = { "v5", "v55", "v60", "v62", "v65", "v66", "v67", "v67t" };
+static constexpr ArrayRef<StringLiteral> CpuNickText(CpuNickTextArray);
+
+static const std::map<std::string, ArchEnum> CpuTable{
+ {"generic", Hexagon::ArchEnum::V60},
+ {"hexagonv5", Hexagon::ArchEnum::V5},
+ {"hexagonv55", Hexagon::ArchEnum::V55},
+ {"hexagonv60", Hexagon::ArchEnum::V60},
+ {"hexagonv62", Hexagon::ArchEnum::V62},
+ {"hexagonv65", Hexagon::ArchEnum::V65},
+ {"hexagonv66", Hexagon::ArchEnum::V66},
+ {"hexagonv67", Hexagon::ArchEnum::V67},
+ {"hexagonv67t", Hexagon::ArchEnum::V67},
+};
} // namespace Hexagon
} // namespace llvm;
-#endif // HEXAGON_DEP_ARCH_H
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPARCH_H
diff --git a/llvm/lib/Target/Hexagon/HexagonDepArch.td b/llvm/lib/Target/Hexagon/HexagonDepArch.td
index 115cf2383a7a..9374055eae7d 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepArch.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepArch.td
@@ -5,18 +5,20 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-def ArchV66: SubtargetFeature<"v66", "HexagonArchVersion", "Hexagon::ArchEnum::V66", "Enable Hexagon V66 architecture">;
-def HasV66 : Predicate<"HST->hasV66Ops()">, AssemblerPredicate<"ArchV66">;
-def ArchV65: SubtargetFeature<"v65", "HexagonArchVersion", "Hexagon::ArchEnum::V65", "Enable Hexagon V65 architecture">;
-def HasV65 : Predicate<"HST->hasV65Ops()">, AssemblerPredicate<"ArchV65">;
-def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "Hexagon::ArchEnum::V62", "Enable Hexagon V62 architecture">;
-def HasV62 : Predicate<"HST->hasV62Ops()">, AssemblerPredicate<"ArchV62">;
-def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "Hexagon::ArchEnum::V60", "Enable Hexagon V60 architecture">;
-def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<"ArchV60">;
-def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
-def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<"ArchV55">;
def ArchV5: SubtargetFeature<"v5", "HexagonArchVersion", "Hexagon::ArchEnum::V5", "Enable Hexagon V5 architecture">;
-def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<"ArchV5">;
+def HasV5 : Predicate<"HST->hasV5Ops()">, AssemblerPredicate<(all_of ArchV5)>;
+def ArchV55: SubtargetFeature<"v55", "HexagonArchVersion", "Hexagon::ArchEnum::V55", "Enable Hexagon V55 architecture">;
+def HasV55 : Predicate<"HST->hasV55Ops()">, AssemblerPredicate<(all_of ArchV55)>;
+def ArchV60: SubtargetFeature<"v60", "HexagonArchVersion", "Hexagon::ArchEnum::V60", "Enable Hexagon V60 architecture">;
+def HasV60 : Predicate<"HST->hasV60Ops()">, AssemblerPredicate<(all_of ArchV60)>;
+def ArchV62: SubtargetFeature<"v62", "HexagonArchVersion", "Hexagon::ArchEnum::V62", "Enable Hexagon V62 architecture">;
+def HasV62 : Predicate<"HST->hasV62Ops()">, AssemblerPredicate<(all_of ArchV62)>;
+def ArchV65: SubtargetFeature<"v65", "HexagonArchVersion", "Hexagon::ArchEnum::V65", "Enable Hexagon V65 architecture">;
+def HasV65 : Predicate<"HST->hasV65Ops()">, AssemblerPredicate<(all_of ArchV65)>;
+def ArchV66: SubtargetFeature<"v66", "HexagonArchVersion", "Hexagon::ArchEnum::V66", "Enable Hexagon V66 architecture">;
+def HasV66 : Predicate<"HST->hasV66Ops()">, AssemblerPredicate<(all_of ArchV66)>;
+def ArchV67: SubtargetFeature<"v67", "HexagonArchVersion", "Hexagon::ArchEnum::V67", "Enable Hexagon V67 architecture">;
+def HasV67 : Predicate<"HST->hasV67Ops()">, AssemblerPredicate<(all_of ArchV67)>;
diff --git a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc
index 10068abce7ec..ce7aa02e3e06 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc
+++ b/llvm/lib/Target/Hexagon/HexagonDepDecoders.inc
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
// clang-format off
@@ -15,39 +15,44 @@
#pragma clang diagnostic ignored "-Wunused-function"
#endif
+static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+ uint64_t, const void *Decoder) {
+ signedDecoder<8>(MI, tmp, Decoder);
+ return MCDisassembler::Success;
+}
static DecodeStatus s4_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
signedDecoder<4>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<14>(MI, tmp, Decoder);
+ signedDecoder<12>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s8_0ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<8>(MI, tmp, Decoder);
+ signedDecoder<5>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<7>(MI, tmp, Decoder);
+ signedDecoder<13>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s31_1ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<12>(MI, tmp, Decoder);
+ signedDecoder<6>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s29_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<3>(MI, tmp, Decoder);
+ signedDecoder<14>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s30_2ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s4_3ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<13>(MI, tmp, Decoder);
+ signedDecoder<7>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
static DecodeStatus s6_0ImmDecoder(MCInst &MI, unsigned tmp,
@@ -60,14 +65,9 @@ static DecodeStatus s6_3ImmDecoder(MCInst &MI, unsigned tmp,
signedDecoder<9>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
-static DecodeStatus s4_1ImmDecoder(MCInst &MI, unsigned tmp,
- uint64_t, const void *Decoder) {
- signedDecoder<5>(MI, tmp, Decoder);
- return MCDisassembler::Success;
-}
-static DecodeStatus s4_2ImmDecoder(MCInst &MI, unsigned tmp,
+static DecodeStatus s3_0ImmDecoder(MCInst &MI, unsigned tmp,
uint64_t, const void *Decoder) {
- signedDecoder<6>(MI, tmp, Decoder);
+ signedDecoder<3>(MI, tmp, Decoder);
return MCDisassembler::Success;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
index fefbbfd3f1ac..1547e8f769b1 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICHVX.td
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
def tc_04da405a : InstrItinClass;
@@ -2554,3 +2554,494 @@ class DepHVXItinV66 {
[HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
];
}
+
+class DepHVXItinV67 {
+ list<InstrItinData> DepHVXItinV67_list = [
+ InstrItinData <tc_04da405a, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_05058f6f, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_05ac6f98, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_05ca8cfd, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_08a4f1b6, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_0b04c6c7, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ec46cf9, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_131f1c81, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1381a97c, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [],
+ []>,
+
+ InstrItinData <tc_15fdf750, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_16ff9ef8, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_191381c1, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 7, 1, 2, 7],
+ [Hex_FWD, HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_1ad8a370, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1ba8a0cd, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20a4bbec, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_257f6f7c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_26a377fe, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c745bb8, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_2d4051cd, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 3, 7, 5, 2],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2e8f5f6e, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 7, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_309dbb4f, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3904b926, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3aacf4a8, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_3ad719fb, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3c56e5ce, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3ce09744, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3e2aaafc, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_447d9895, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_453fe68d, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_46d6c3e0, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_51d0ecc3, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_52447ecc, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_540c3da3, /*SLOT0,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [4, 7, 1],
+ [Hex_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_54a0dc47, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [3, 2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_561aaa58, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56c4f9fe, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_56e64202, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_58d21193, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_5bf8afbb, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_61bf7c03, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_649072c2, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_660769f1, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_663c80a7, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6942b6e0, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_6e7fa133, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_71646d06, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7177e272, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_718b5c53, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_7273323b, /*SLOT0,STORE,VA_DV*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_7417e785, /*SLOT0123,VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_SHIFT]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_767c4e9d, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7e6a3e89, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_8772086c, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_87adc037, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e420e4d, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [7, 1, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_90bcc1db, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_933f2b39, /*SLOT23,4SLOT_MPY*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL_NOMEM]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_946013d8, /*SLOT0123,VP*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 5],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_9d1dc972, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f363d21, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a02a10a8, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [2, 1, 2, 7],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_a0dbea28, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7e6707d, /*SLOT0,NOSLOT1,LOAD,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ab23f776, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_abe8c3b2, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac4046bc, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_af25efd9, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 2, 7, 7],
+ [HVX_FWD, Hex_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b091f1c6, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 7, 5, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b28e51aa, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b4416217, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7],
+ [HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_b9db8205, /*SLOT01,LOAD*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD]>], [9, 3, 2, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c0749f3c, /*SLOT01,LOAD,VA*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 1, 2],
+ [HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c127de3a, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c4edf264, /*SLOT23,VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1]>], [9, 2],
+ [HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c5dba46e, /*SLOT0,STORE,VA*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_c7039829, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_cd94bfe0, /*SLOT23,VS_VX*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1], 0>,
+ InstrStage<1, [CVI_SHIFT, CVI_XLANE]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d8287c14, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_db5555f3, /*SLOT0123,VA_DV*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_dd5b0695, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df80eeb0, /*SLOT0123,VP_VS*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_XLSHF]>], [9, 7, 5, 5],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e2d2e9e5, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [3, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e35c1e93, /*SLOT0123,VA*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [9, 9, 7, 7],
+ [HVX_FWD, HVX_FWD, HVX_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e3f68a46, /*SLOT0123,4SLOT*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ALL]>], [3],
+ [HVX_FWD]>,
+
+ InstrItinData <tc_e675c45a, /*SLOT23,VX_DV*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 7, 5, 2, 2],
+ [HVX_FWD, HVX_FWD, HVX_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e699ae41, /*SLOT01,ZW*/
+ [InstrStage<1, [SLOT0, SLOT1], 0>,
+ InstrStage<1, [CVI_ZW]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e8797b98, /*SLOT1,LOAD,VA*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE]>], [1, 2, 7],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_e99d4c2e, /*SLOT0,STORE*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 1, 2, 5],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_f1de44ef, /*SLOT2,VX_DV*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_MPY01]>], [9, 5, 2],
+ [HVX_FWD, HVX_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f21e8abb, /*SLOT0,NOSLOT1,STORE,VP*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_ST], 0>,
+ InstrStage<1, [CVI_XLANE]>], [1, 2, 5],
+ [Hex_FWD, Hex_FWD, HVX_FWD]>,
+
+ InstrItinData <tc_fd7610da, /*SLOT1,LOAD,VA_DV*/
+ [InstrStage<1, [SLOT1], 0>,
+ InstrStage<1, [CVI_LD], 0>,
+ InstrStage<1, [CVI_MPY01, CVI_XLSHF]>], [7, 1, 2, 7],
+ [HVX_FWD, Hex_FWD, Hex_FWD, HVX_FWD]>
+ ];
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
index 34da0be02d19..fecccb250198 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepIICScalar.td
@@ -5,3792 +5,7079 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-def tc_002cb246 : InstrItinClass;
-def tc_0371abea : InstrItinClass;
-def tc_05c070ec : InstrItinClass;
-def tc_05d3a09b : InstrItinClass;
-def tc_0663f615 : InstrItinClass;
-def tc_096199d3 : InstrItinClass;
-def tc_0a705168 : InstrItinClass;
-def tc_0ae0825c : InstrItinClass;
-def tc_0b2be201 : InstrItinClass;
-def tc_0d8f5752 : InstrItinClass;
-def tc_13bfbcf9 : InstrItinClass;
-def tc_14b272fa : InstrItinClass;
-def tc_14b5c689 : InstrItinClass;
-def tc_15aa71c5 : InstrItinClass;
-def tc_174516e8 : InstrItinClass;
-def tc_17e0d2cd : InstrItinClass;
-def tc_1a2fd869 : InstrItinClass;
-def tc_1ad90acd : InstrItinClass;
-def tc_1ae57e39 : InstrItinClass;
-def tc_1b6f7cec : InstrItinClass;
-def tc_1c4528a2 : InstrItinClass;
-def tc_1c80410a : InstrItinClass;
-def tc_1d81e60e : InstrItinClass;
-def tc_1fc97744 : InstrItinClass;
-def tc_20cdee80 : InstrItinClass;
-def tc_2332b92e : InstrItinClass;
-def tc_24b66c99 : InstrItinClass;
-def tc_25a78932 : InstrItinClass;
-def tc_2b8da4c2 : InstrItinClass;
-def tc_2eabeebe : InstrItinClass;
-def tc_2f7c551d : InstrItinClass;
-def tc_2ff964b4 : InstrItinClass;
-def tc_30b9bb4a : InstrItinClass;
-def tc_32779c6f : InstrItinClass;
-def tc_36153880 : InstrItinClass;
-def tc_362c6592 : InstrItinClass;
-def tc_3962fa26 : InstrItinClass;
-def tc_39dfefe8 : InstrItinClass;
-def tc_3a867367 : InstrItinClass;
-def tc_3b470976 : InstrItinClass;
-def tc_3b5b7ef9 : InstrItinClass;
-def tc_3bd75825 : InstrItinClass;
-def tc_3c76b0ff : InstrItinClass;
-def tc_3d495a39 : InstrItinClass;
-def tc_40116ca8 : InstrItinClass;
-def tc_434c8e1e : InstrItinClass;
-def tc_4414d8b1 : InstrItinClass;
-def tc_44d3da28 : InstrItinClass;
-def tc_4560740b : InstrItinClass;
-def tc_4837eefb : InstrItinClass;
-def tc_49a8207d : InstrItinClass;
-def tc_4ae7b58b : InstrItinClass;
-def tc_4b68bce4 : InstrItinClass;
-def tc_4c5ba658 : InstrItinClass;
-def tc_4d5fa3a1 : InstrItinClass;
-def tc_53559e35 : InstrItinClass;
-def tc_56336eb0 : InstrItinClass;
-def tc_56f114f4 : InstrItinClass;
-def tc_57890846 : InstrItinClass;
-def tc_5a2711e5 : InstrItinClass;
-def tc_5abb5e3f : InstrItinClass;
-def tc_5aee39f7 : InstrItinClass;
-def tc_5b54b33f : InstrItinClass;
-def tc_5b7c0967 : InstrItinClass;
-def tc_5bf126a6 : InstrItinClass;
-def tc_5d7f5414 : InstrItinClass;
-def tc_5ef37dc4 : InstrItinClass;
-def tc_6132ba3d : InstrItinClass;
-def tc_61830035 : InstrItinClass;
-def tc_640086b5 : InstrItinClass;
-def tc_643b4717 : InstrItinClass;
-def tc_67435e81 : InstrItinClass;
-def tc_675e4897 : InstrItinClass;
-def tc_679309b8 : InstrItinClass;
-def tc_6b25e783 : InstrItinClass;
-def tc_703e822c : InstrItinClass;
-def tc_7186d325 : InstrItinClass;
-def tc_7646c131 : InstrItinClass;
-def tc_76851da1 : InstrItinClass;
-def tc_779080bf : InstrItinClass;
-def tc_784490da : InstrItinClass;
-def tc_785f65a7 : InstrItinClass;
-def tc_7a91e76a : InstrItinClass;
-def tc_838b34ea : InstrItinClass;
-def tc_85c9c08f : InstrItinClass;
-def tc_85d5d03f : InstrItinClass;
-def tc_862b3e70 : InstrItinClass;
-def tc_88b4f13d : InstrItinClass;
-def tc_89e94ad3 : InstrItinClass;
-def tc_8b121f4a : InstrItinClass;
-def tc_8b3e402a : InstrItinClass;
-def tc_8c945be0 : InstrItinClass;
-def tc_8c99de45 : InstrItinClass;
-def tc_8d9d0154 : InstrItinClass;
-def tc_8fb7ab1b : InstrItinClass;
-def tc_9461ff31 : InstrItinClass;
-def tc_946df596 : InstrItinClass;
-def tc_9ad9998f : InstrItinClass;
-def tc_9bfd761f : InstrItinClass;
-def tc_9c3ecd83 : InstrItinClass;
-def tc_9ca930f7 : InstrItinClass;
-def tc_9da59d12 : InstrItinClass;
-def tc_9debc299 : InstrItinClass;
-def tc_9e313203 : InstrItinClass;
-def tc_9fc3dae0 : InstrItinClass;
-def tc_a1123dda : InstrItinClass;
-def tc_a1c00888 : InstrItinClass;
-def tc_a58fd5cc : InstrItinClass;
-def tc_a5d4aeec : InstrItinClass;
-def tc_a6b1eca9 : InstrItinClass;
-def tc_a813cf9a : InstrItinClass;
-def tc_a9d88b22 : InstrItinClass;
-def tc_ae53734a : InstrItinClass;
-def tc_b31c2e97 : InstrItinClass;
-def tc_b343892a : InstrItinClass;
-def tc_b43e7930 : InstrItinClass;
-def tc_b4407292 : InstrItinClass;
-def tc_b44ecf75 : InstrItinClass;
-def tc_b4b5c03a : InstrItinClass;
-def tc_b51dc29a : InstrItinClass;
-def tc_b83e6d73 : InstrItinClass;
-def tc_b857bf4e : InstrItinClass;
-def tc_b8bffe55 : InstrItinClass;
-def tc_b90a29b1 : InstrItinClass;
-def tc_b9272d6c : InstrItinClass;
-def tc_b9e09e03 : InstrItinClass;
-def tc_bab0eed9 : InstrItinClass;
-def tc_bafaade3 : InstrItinClass;
-def tc_bcf98408 : InstrItinClass;
-def tc_bd8382d1 : InstrItinClass;
-def tc_bdceeac1 : InstrItinClass;
-def tc_be9602ff : InstrItinClass;
-def tc_bf061958 : InstrItinClass;
-def tc_bfec0f01 : InstrItinClass;
-def tc_c4db48cb : InstrItinClass;
-def tc_c4f596e3 : InstrItinClass;
-def tc_c79a189f : InstrItinClass;
-def tc_c8ce0b5c : InstrItinClass;
-def tc_cd374165 : InstrItinClass;
-def tc_cf8126ae : InstrItinClass;
-def tc_cfd8378a : InstrItinClass;
-def tc_d08ee0f4 : InstrItinClass;
-def tc_d1aa9eaa : InstrItinClass;
-def tc_d2e63d61 : InstrItinClass;
-def tc_d5b7b0c1 : InstrItinClass;
-def tc_d5c0729a : InstrItinClass;
-def tc_d63f638c : InstrItinClass;
-def tc_d65dbf51 : InstrItinClass;
-def tc_d773585a : InstrItinClass;
-def tc_d9d43ecb : InstrItinClass;
-def tc_da4a37ed : InstrItinClass;
-def tc_da97ee82 : InstrItinClass;
-def tc_db2bce9c : InstrItinClass;
-def tc_de4df740 : InstrItinClass;
-def tc_de554571 : InstrItinClass;
-def tc_df3319ed : InstrItinClass;
-def tc_e06f432a : InstrItinClass;
-def tc_e4a7f9f0 : InstrItinClass;
-def tc_e4b3cb20 : InstrItinClass;
-def tc_e78647bd : InstrItinClass;
-def tc_e86aa961 : InstrItinClass;
-def tc_e93a3d71 : InstrItinClass;
-def tc_e95795ec : InstrItinClass;
-def tc_e9f3243f : InstrItinClass;
-def tc_f429765c : InstrItinClass;
-def tc_f675fee8 : InstrItinClass;
-def tc_f8e23f0b : InstrItinClass;
-def tc_f9058dd7 : InstrItinClass;
-def tc_fc3999b4 : InstrItinClass;
-def tc_fcc3ddf9 : InstrItinClass;
-def tc_fe211424 : InstrItinClass;
+def tc_011e0e9d : InstrItinClass;
+def tc_01d44cb2 : InstrItinClass;
+def tc_01e1be3b : InstrItinClass;
+def tc_02fe1c65 : InstrItinClass;
+def tc_0655b949 : InstrItinClass;
+def tc_075c8dd8 : InstrItinClass;
+def tc_0a195f2c : InstrItinClass;
+def tc_0a6c20ae : InstrItinClass;
+def tc_0ba0d5da : InstrItinClass;
+def tc_0dfac0a7 : InstrItinClass;
+def tc_0fac1eb8 : InstrItinClass;
+def tc_1044324a : InstrItinClass;
+def tc_10b884b7 : InstrItinClass;
+def tc_112d30d6 : InstrItinClass;
+def tc_1242dc2a : InstrItinClass;
+def tc_1248597c : InstrItinClass;
+def tc_14ab4f41 : InstrItinClass;
+def tc_151bf368 : InstrItinClass;
+def tc_158aa3f7 : InstrItinClass;
+def tc_197dce51 : InstrItinClass;
+def tc_1981450d : InstrItinClass;
+def tc_1b8138fc : InstrItinClass;
+def tc_1c2c7a4a : InstrItinClass;
+def tc_1c7522a8 : InstrItinClass;
+def tc_1d41f8b7 : InstrItinClass;
+def tc_1e7875f0 : InstrItinClass;
+def tc_1fcb8495 : InstrItinClass;
+def tc_1fe4ab69 : InstrItinClass;
+def tc_20131976 : InstrItinClass;
+def tc_2237d952 : InstrItinClass;
+def tc_234f8560 : InstrItinClass;
+def tc_23708a21 : InstrItinClass;
+def tc_24e109c7 : InstrItinClass;
+def tc_24f426ab : InstrItinClass;
+def tc_27106296 : InstrItinClass;
+def tc_280f7fe1 : InstrItinClass;
+def tc_28e55c6f : InstrItinClass;
+def tc_2c13e7f5 : InstrItinClass;
+def tc_2c3e17fc : InstrItinClass;
+def tc_2f573607 : InstrItinClass;
+def tc_2f669c77 : InstrItinClass;
+def tc_362b0be2 : InstrItinClass;
+def tc_38382228 : InstrItinClass;
+def tc_388f9897 : InstrItinClass;
+def tc_38e0bae9 : InstrItinClass;
+def tc_3d14a17b : InstrItinClass;
+def tc_3edca78f : InstrItinClass;
+def tc_3fbf1042 : InstrItinClass;
+def tc_407e96f9 : InstrItinClass;
+def tc_40d64c94 : InstrItinClass;
+def tc_4222e6bf : InstrItinClass;
+def tc_42ff66ba : InstrItinClass;
+def tc_442395f3 : InstrItinClass;
+def tc_449acf79 : InstrItinClass;
+def tc_44d5a428 : InstrItinClass;
+def tc_44fffc58 : InstrItinClass;
+def tc_45791fb8 : InstrItinClass;
+def tc_45f9d1be : InstrItinClass;
+def tc_49fdfd4b : InstrItinClass;
+def tc_4a55d03c : InstrItinClass;
+def tc_4abdbdc6 : InstrItinClass;
+def tc_4ac61d92 : InstrItinClass;
+def tc_4c1520ae : InstrItinClass;
+def tc_503ce0f3 : InstrItinClass;
+def tc_53c851ab : InstrItinClass;
+def tc_5502c366 : InstrItinClass;
+def tc_55255f2b : InstrItinClass;
+def tc_556f6577 : InstrItinClass;
+def tc_55a9a350 : InstrItinClass;
+def tc_55b33fda : InstrItinClass;
+def tc_56a124a7 : InstrItinClass;
+def tc_57a55b54 : InstrItinClass;
+def tc_5944960d : InstrItinClass;
+def tc_59a7822c : InstrItinClass;
+def tc_5a4b5e58 : InstrItinClass;
+def tc_5b347363 : InstrItinClass;
+def tc_5ceb2f9e : InstrItinClass;
+def tc_5d636bc7 : InstrItinClass;
+def tc_5da50c4b : InstrItinClass;
+def tc_5deb5e47 : InstrItinClass;
+def tc_5e4cf0e8 : InstrItinClass;
+def tc_5f2afaf7 : InstrItinClass;
+def tc_60e324ff : InstrItinClass;
+def tc_63567288 : InstrItinClass;
+def tc_64b00d8a : InstrItinClass;
+def tc_651cbe02 : InstrItinClass;
+def tc_65279839 : InstrItinClass;
+def tc_65cbd974 : InstrItinClass;
+def tc_69bfb303 : InstrItinClass;
+def tc_6ae3426b : InstrItinClass;
+def tc_6d861a95 : InstrItinClass;
+def tc_6e20402a : InstrItinClass;
+def tc_6f42bc60 : InstrItinClass;
+def tc_6fb32599 : InstrItinClass;
+def tc_6fc5dbea : InstrItinClass;
+def tc_711c805f : InstrItinClass;
+def tc_713b66bf : InstrItinClass;
+def tc_7401744f : InstrItinClass;
+def tc_7476d766 : InstrItinClass;
+def tc_74a42bda : InstrItinClass;
+def tc_76bb5435 : InstrItinClass;
+def tc_77f94a5e : InstrItinClass;
+def tc_788b1d09 : InstrItinClass;
+def tc_7b9187d3 : InstrItinClass;
+def tc_7c31e19a : InstrItinClass;
+def tc_7c6d32e4 : InstrItinClass;
+def tc_7dc63b5c : InstrItinClass;
+def tc_7dcd9d89 : InstrItinClass;
+def tc_7f7f45f5 : InstrItinClass;
+def tc_7f8ae742 : InstrItinClass;
+def tc_8035e91f : InstrItinClass;
+def tc_822c3c68 : InstrItinClass;
+def tc_829d8a86 : InstrItinClass;
+def tc_838c4d7a : InstrItinClass;
+def tc_84a7500d : InstrItinClass;
+def tc_86173609 : InstrItinClass;
+def tc_887d1bb7 : InstrItinClass;
+def tc_8a6d0d94 : InstrItinClass;
+def tc_8a825db2 : InstrItinClass;
+def tc_8b5bd4f5 : InstrItinClass;
+def tc_8e82e8ca : InstrItinClass;
+def tc_9124c04f : InstrItinClass;
+def tc_9165014d : InstrItinClass;
+def tc_92240447 : InstrItinClass;
+def tc_934753bb : InstrItinClass;
+def tc_937dd41c : InstrItinClass;
+def tc_9406230a : InstrItinClass;
+def tc_95a33176 : InstrItinClass;
+def tc_96ef76ef : InstrItinClass;
+def tc_975a4e54 : InstrItinClass;
+def tc_9783714b : InstrItinClass;
+def tc_988416e3 : InstrItinClass;
+def tc_9b34f5e0 : InstrItinClass;
+def tc_9b3c0462 : InstrItinClass;
+def tc_9bcfb2ee : InstrItinClass;
+def tc_9c52f549 : InstrItinClass;
+def tc_9e27f2f9 : InstrItinClass;
+def tc_9e72dc89 : InstrItinClass;
+def tc_9edb7c77 : InstrItinClass;
+def tc_9edefe01 : InstrItinClass;
+def tc_9f6cd987 : InstrItinClass;
+def tc_a08b630b : InstrItinClass;
+def tc_a1297125 : InstrItinClass;
+def tc_a154b476 : InstrItinClass;
+def tc_a2b365d2 : InstrItinClass;
+def tc_a3070909 : InstrItinClass;
+def tc_a32e03e7 : InstrItinClass;
+def tc_a38c45dc : InstrItinClass;
+def tc_a4e22bbd : InstrItinClass;
+def tc_a4ee89db : InstrItinClass;
+def tc_a7a13fac : InstrItinClass;
+def tc_a7bdb22c : InstrItinClass;
+def tc_a9edeffa : InstrItinClass;
+def tc_abfd9a6d : InstrItinClass;
+def tc_ac65613f : InstrItinClass;
+def tc_addc37a8 : InstrItinClass;
+def tc_ae5babd7 : InstrItinClass;
+def tc_aee6250c : InstrItinClass;
+def tc_b1ae5f67 : InstrItinClass;
+def tc_b34eb232 : InstrItinClass;
+def tc_b4dc7630 : InstrItinClass;
+def tc_b570493d : InstrItinClass;
+def tc_b7c4062a : InstrItinClass;
+def tc_b837298f : InstrItinClass;
+def tc_ba9255a6 : InstrItinClass;
+def tc_bb07f2c5 : InstrItinClass;
+def tc_bb831a7c : InstrItinClass;
+def tc_bf2ffc0f : InstrItinClass;
+def tc_c20701f0 : InstrItinClass;
+def tc_c21d7447 : InstrItinClass;
+def tc_c57d9f39 : InstrItinClass;
+def tc_c818ff7f : InstrItinClass;
+def tc_ce59038e : InstrItinClass;
+def tc_cfa0e29b : InstrItinClass;
+def tc_d03278fd : InstrItinClass;
+def tc_d33e5eee : InstrItinClass;
+def tc_d3632d88 : InstrItinClass;
+def tc_d45ba9cd : InstrItinClass;
+def tc_d47648a2 : InstrItinClass;
+def tc_d57d649c : InstrItinClass;
+def tc_d61dfdc3 : InstrItinClass;
+def tc_d68dca5c : InstrItinClass;
+def tc_d7718fbe : InstrItinClass;
+def tc_db596beb : InstrItinClass;
+def tc_db96aa6b : InstrItinClass;
+def tc_dc51281d : InstrItinClass;
+def tc_decdde8a : InstrItinClass;
+def tc_df4536ae : InstrItinClass;
+def tc_df5d53f9 : InstrItinClass;
+def tc_e3d699e3 : InstrItinClass;
+def tc_e9170fb7 : InstrItinClass;
+def tc_ed03645c : InstrItinClass;
+def tc_eed07714 : InstrItinClass;
+def tc_eeda4109 : InstrItinClass;
+def tc_ef921005 : InstrItinClass;
+def tc_f098b237 : InstrItinClass;
+def tc_f0cdeccf : InstrItinClass;
+def tc_f0e8e832 : InstrItinClass;
+def tc_f34c1c21 : InstrItinClass;
+def tc_f38f92e1 : InstrItinClass;
+def tc_f529831b : InstrItinClass;
+def tc_f6e2aff9 : InstrItinClass;
+def tc_f7569068 : InstrItinClass;
+def tc_f999c66e : InstrItinClass;
+def tc_fae9dfa5 : InstrItinClass;
+def tc_fedb7e19 : InstrItinClass;
class DepScalarItinV5 {
list<InstrItinData> DepScalarItinV5_list = [
- InstrItinData <tc_002cb246, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_0371abea, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_05c070ec, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_05d3a09b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_0663f615, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_096199d3, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_0a705168, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_0ae0825c, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_0b2be201, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_0d8f5752, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_13bfbcf9, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_14b272fa, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_14b5c689, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_15aa71c5, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_174516e8, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_17e0d2cd, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_1a2fd869, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1ad90acd, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_1ae57e39, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_1b6f7cec, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_1c4528a2, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_1c80410a, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_1d81e60e, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_1fc97744, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_20cdee80, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_2332b92e, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_24b66c99, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_25a78932, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_2b8da4c2, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_2eabeebe, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_2f7c551d, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_2ff964b4, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_30b9bb4a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_32779c6f, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_36153880, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_362c6592, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_3962fa26, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_39dfefe8, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_3a867367, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_3b470976, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_3b5b7ef9, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_3bd75825, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_3c76b0ff, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_3d495a39, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_40116ca8, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_434c8e1e, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_4414d8b1, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_44d3da28, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_4560740b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_4837eefb, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_49a8207d, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_4ae7b58b, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_4b68bce4, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_4c5ba658, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_4d5fa3a1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_53559e35, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_56336eb0, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_56f114f4, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_57890846, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_5a2711e5, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_5abb5e3f, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_5aee39f7, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_5b54b33f, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_5b7c0967, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_5bf126a6, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_5d7f5414, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_5ef37dc4, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_6132ba3d, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_61830035, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_640086b5, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_643b4717, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_67435e81, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_675e4897, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_679309b8, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_6b25e783, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_703e822c, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_7186d325, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_7646c131, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_76851da1, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_779080bf, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_784490da, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_785f65a7, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_7a91e76a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_838b34ea, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_85c9c08f, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_85d5d03f, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_862b3e70, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_88b4f13d, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_89e94ad3, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_8b121f4a, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_8b3e402a, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_8c945be0, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_8c99de45, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_8d9d0154, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_8fb7ab1b, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_9461ff31, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_946df596, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_9ad9998f, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_9bfd761f, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_9c3ecd83, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_9ca930f7, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_9da59d12, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_9debc299, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_9e313203, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_9fc3dae0, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_a1123dda, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_a1c00888, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_a58fd5cc, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_a5d4aeec, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_a6b1eca9, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_a813cf9a, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_a9d88b22, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_ae53734a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b31c2e97, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b343892a, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b43e7930, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b4407292, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b44ecf75, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b4b5c03a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b51dc29a, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_b83e6d73, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_b857bf4e, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b8bffe55, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_b90a29b1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_b9272d6c, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_b9e09e03, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_bab0eed9, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_bafaade3, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_bcf98408, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_bd8382d1, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_bdceeac1, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_be9602ff, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_bf061958, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_bfec0f01, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_c4db48cb, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_c4f596e3, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_c79a189f, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_c8ce0b5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_cd374165, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_cf8126ae, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_cfd8378a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_d08ee0f4, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_d1aa9eaa, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_d2e63d61, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_d5b7b0c1, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_d5c0729a, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_d63f638c, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_d65dbf51, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_d773585a, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_d9d43ecb, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_da4a37ed, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_da97ee82, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_db2bce9c, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_de4df740, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
- InstrItinData <tc_de554571, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_df3319ed, [InstrStage<1, [SLOT3]>]>,
- InstrItinData <tc_e06f432a, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_e4a7f9f0, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_e4b3cb20, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_e78647bd, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_e86aa961, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_e93a3d71, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_e95795ec, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_e9f3243f, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_f429765c, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_f675fee8, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_f8e23f0b, [InstrStage<1, [SLOT0, SLOT1]>]>,
- InstrItinData <tc_f9058dd7, [InstrStage<1, [SLOT2, SLOT3]>]>,
- InstrItinData <tc_fc3999b4, [InstrStage<1, [SLOT2]>]>,
- InstrItinData <tc_fcc3ddf9, [InstrStage<1, [SLOT0]>]>,
- InstrItinData <tc_fe211424, [InstrStage<1, [SLOT0]>]> ];
+ InstrItinData <tc_011e0e9d, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_01d44cb2, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_01e1be3b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_02fe1c65, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_0655b949, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_075c8dd8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_0a195f2c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_0a6c20ae, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_0ba0d5da, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_0dfac0a7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_0fac1eb8, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_1044324a, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_10b884b7, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_112d30d6, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1242dc2a, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_1248597c, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_14ab4f41, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_151bf368, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_158aa3f7, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_197dce51, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_1981450d, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_1b8138fc, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_1c2c7a4a, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1c7522a8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_1d41f8b7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1e7875f0, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_1fcb8495, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_1fe4ab69, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_20131976, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2237d952, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_234f8560, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_23708a21, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_24e109c7, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_24f426ab, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_27106296, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_280f7fe1, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_28e55c6f, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_2c13e7f5, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_2c3e17fc, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_2f573607, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_2f669c77, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_362b0be2, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_38382228, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_388f9897, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_38e0bae9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_3d14a17b, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_3edca78f, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_3fbf1042, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_407e96f9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_40d64c94, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_4222e6bf, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_42ff66ba, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_442395f3, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_449acf79, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_44d5a428, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_44fffc58, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_45791fb8, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_45f9d1be, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_49fdfd4b, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_4a55d03c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_4abdbdc6, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_4ac61d92, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_4c1520ae, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_503ce0f3, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_53c851ab, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_5502c366, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_55255f2b, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_556f6577, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_55a9a350, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_55b33fda, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_56a124a7, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_57a55b54, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_5944960d, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_59a7822c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_5a4b5e58, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_5b347363, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_5ceb2f9e, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_5d636bc7, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_5da50c4b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5deb5e47, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_5e4cf0e8, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_5f2afaf7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_60e324ff, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_63567288, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_64b00d8a, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_651cbe02, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_65279839, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_65cbd974, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_69bfb303, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_6ae3426b, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_6d861a95, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_6e20402a, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_6f42bc60, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_6fb32599, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_6fc5dbea, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_711c805f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_713b66bf, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7401744f, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7476d766, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_74a42bda, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_76bb5435, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_77f94a5e, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_788b1d09, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7b9187d3, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_7c31e19a, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7c6d32e4, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_7dc63b5c, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_7dcd9d89, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_7f7f45f5, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_7f8ae742, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8035e91f, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_822c3c68, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_829d8a86, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_838c4d7a, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_84a7500d, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_86173609, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_887d1bb7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_8a6d0d94, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_8a825db2, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8b5bd4f5, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_8e82e8ca, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9124c04f, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9165014d, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_92240447, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_934753bb, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_937dd41c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9406230a, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_95a33176, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_96ef76ef, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_975a4e54, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9783714b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_988416e3, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9b34f5e0, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_9b3c0462, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9bcfb2ee, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_9c52f549, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9e27f2f9, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9e72dc89, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9edb7c77, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_9edefe01, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_9f6cd987, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a08b630b, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a1297125, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a154b476, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a2b365d2, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_a3070909, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_a32e03e7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_a38c45dc, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a4e22bbd, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a4ee89db, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_a7a13fac, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a7bdb22c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_a9edeffa, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_abfd9a6d, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_ac65613f, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_addc37a8, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_ae5babd7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_aee6250c, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b1ae5f67, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_b34eb232, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_b4dc7630, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b570493d, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_b7c4062a, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_b837298f, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ba9255a6, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_bb07f2c5, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_bb831a7c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_bf2ffc0f, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_c20701f0, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c21d7447, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c57d9f39, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_c818ff7f, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_ce59038e, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_cfa0e29b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_d03278fd, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_d33e5eee, [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d3632d88, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d45ba9cd, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_d47648a2, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_d57d649c, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_d61dfdc3, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d68dca5c, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_d7718fbe, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_db596beb, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_db96aa6b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_dc51281d, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_decdde8a, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_df4536ae, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_df5d53f9, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_e3d699e3, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_e9170fb7, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_ed03645c, [InstrStage<1, [SLOT2]>]>,
+ InstrItinData <tc_eed07714, [InstrStage<1, [SLOT0, SLOT1]>]>,
+ InstrItinData <tc_eeda4109, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_ef921005, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f098b237, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f0cdeccf, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f0e8e832, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f34c1c21, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f38f92e1, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f529831b, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f6e2aff9, [InstrStage<1, [SLOT0]>]>,
+ InstrItinData <tc_f7569068, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_f999c66e, [InstrStage<1, [SLOT2, SLOT3]>]>,
+ InstrItinData <tc_fae9dfa5, [InstrStage<1, [SLOT3]>]>,
+ InstrItinData <tc_fedb7e19, [InstrStage<1, [SLOT0, SLOT1]>]> ];
}
class DepScalarItinV55 {
list<InstrItinData> DepScalarItinV55_list = [
- InstrItinData <tc_002cb246, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0371abea, /*tc_st*/
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05c070ec, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05d3a09b, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0663f615, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_096199d3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a705168, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_1044324a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_10b884b7, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0ae0825c, /*tc_1*/
+ InstrItinData <tc_151bf368, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0b2be201, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0d8f5752, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_1981450d, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_13bfbcf9, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b272fa, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b5c689, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1d41f8b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_15aa71c5, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_174516e8, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_17e0d2cd, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1a2fd869, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ad90acd, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ae57e39, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1b6f7cec, /*tc_2early*/
+ InstrItinData <tc_23708a21, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_1c4528a2, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_24e109c7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24f426ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c80410a, /*tc_1*/
+ InstrItinData <tc_2f669c77, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_362b0be2, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_388f9897, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1d81e60e, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_38e0bae9, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1fc97744, /*tc_1*/
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20cdee80, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_40d64c94, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2332b92e, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_24b66c99, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ InstrItinData <tc_442395f3, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_25a78932, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2b8da4c2, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_2eabeebe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
- []>,
+ InstrItinData <tc_49fdfd4b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f7c551d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_4a55d03c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2ff964b4, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_30b9bb4a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_32779c6f, /*tc_3x*/
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
[InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_36153880, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_362c6592, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ InstrItinData <tc_53c851ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3962fa26, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_39dfefe8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
+ InstrItinData <tc_55255f2b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [],
[]>,
- InstrItinData <tc_3a867367, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3b470976, /*tc_3x*/
+ InstrItinData <tc_556f6577, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b5b7ef9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56a124a7, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_2early*/
+ [InstrStage<1, [SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3bd75825, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_59a7822c, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3c76b0ff, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3d495a39, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_40116ca8, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_434c8e1e, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4414d8b1, /*tc_1*/
+ InstrItinData <tc_5da50c4b, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_44d3da28, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4560740b, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1, 2],
+ InstrItinData <tc_5f2afaf7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4837eefb, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_49a8207d, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
+ InstrItinData <tc_60e324ff, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_4ae7b58b, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_63567288, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_4b68bce4, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c5ba658, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_65279839, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4d5fa3a1, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_69bfb303, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_53559e35, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56336eb0, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56f114f4, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_57890846, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_6fb32599, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5a2711e5, /*tc_1*/
+ InstrItinData <tc_713b66bf, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5abb5e3f, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5aee39f7, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
- InstrItinData <tc_5b54b33f, /*tc_3*/
+ InstrItinData <tc_788b1d09, /*tc_3*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b7c0967, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_7b9187d3, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5bf126a6, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 3],
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d7f5414, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5ef37dc4, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ InstrItinData <tc_7f7f45f5, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6132ba3d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_61830035, /*tc_2*/
+ InstrItinData <tc_84a7500d, /*tc_2*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_640086b5, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_86173609, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_643b4717, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_67435e81, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ InstrItinData <tc_8b5bd4f5, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_675e4897, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 2],
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_679309b8, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6b25e783, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_703e822c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_975a4e54, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7186d325, /*tc_st*/
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7646c131, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ InstrItinData <tc_9c52f549, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_76851da1, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_9e27f2f9, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_779080bf, /*tc_2*/
+ InstrItinData <tc_9e72dc89, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a08b630b, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_784490da, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a1297125, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_785f65a7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7a91e76a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_838b34ea, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85c9c08f, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a7a13fac, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85d5d03f, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_862b3e70, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88b4f13d, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_89e94ad3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b121f4a, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b34eb232, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_8b3e402a, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
- [Hex_FWD]>,
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c945be0, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [3, 2],
+ InstrItinData <tc_b570493d, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c99de45, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_8d9d0154, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8fb7ab1b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9461ff31, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_c20701f0, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_946df596, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ad9998f, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [],
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c818ff7f, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_9bfd761f, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9c3ecd83, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ca930f7, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
+ InstrItinData <tc_d33e5eee, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9da59d12, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 3, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_9debc299, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d47648a2, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_9e313203, /*tc_3x*/
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9fc3dae0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ InstrItinData <tc_d68dca5c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a1123dda, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ InstrItinData <tc_decdde8a, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_a1c00888, /*tc_1*/
+ InstrItinData <tc_df4536ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df5d53f9, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a58fd5cc, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 1],
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a5d4aeec, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a6b1eca9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+ InstrItinData <tc_f6e2aff9, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a813cf9a, /*tc_2*/
+ InstrItinData <tc_f7569068, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f999c66e, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a9d88b22, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
+
+class DepScalarItinV60 {
+ list<InstrItinData> DepScalarItinV60_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ae53734a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b31c2e97, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b343892a, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 3, 2],
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b43e7930, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4407292, /*tc_2early*/
- [InstrStage<1, [SLOT0]>], [],
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
[]>,
- InstrItinData <tc_b44ecf75, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_b4b5c03a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_b51dc29a, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 1],
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b83e6d73, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b857bf4e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_151bf368, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b8bffe55, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1],
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b90a29b1, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9272d6c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_1d41f8b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9e09e03, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bab0eed9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bafaade3, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_20131976, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bcf98408, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd8382d1, /*tc_3stall*/
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_23708a21, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bdceeac1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_24f426ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_be9602ff, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bf061958, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bfec0f01, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c4db48cb, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4f596e3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c79a189f, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_362b0be2, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+ InstrItinData <tc_38382228, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cd374165, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cf8126ae, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cfd8378a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d08ee0f4, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d1aa9eaa, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ InstrItinData <tc_42ff66ba, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d2e63d61, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d5b7b0c1, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
+ InstrItinData <tc_44fffc58, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_d5c0729a, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d63f638c, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_d65dbf51, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_49fdfd4b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d773585a, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4a55d03c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d9d43ecb, /*tc_2early*/
- [InstrStage<1, [SLOT3]>], [1, 2],
+ InstrItinData <tc_4abdbdc6, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da4a37ed, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da97ee82, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_db2bce9c, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_53c851ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de4df740, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de554571, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_df3319ed, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_56a124a7, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e06f432a, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [3],
- [Hex_FWD]>,
+ InstrItinData <tc_57a55b54, /*tc_2early*/
+ [InstrStage<1, [SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4a7f9f0, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4b3cb20, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e78647bd, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_59a7822c, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e86aa961, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e93a3d71, /*tc_ld*/
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e95795ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e9f3243f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f429765c, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f675fee8, /*tc_2*/
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f8e23f0b, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5f2afaf7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f9058dd7, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_60e324ff, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_fc3999b4, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2],
+ InstrItinData <tc_63567288, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_fcc3ddf9, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fe211424, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>
- ];
-}
+ InstrItinData <tc_69bfb303, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
-class DepScalarItinV60 {
- list<InstrItinData> DepScalarItinV60_list = [
- InstrItinData <tc_002cb246, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0371abea, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ InstrItinData <tc_6d861a95, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05c070ec, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05d3a09b, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ InstrItinData <tc_711c805f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0663f615, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_7476d766, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_096199d3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a705168, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0ae0825c, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0b2be201, /*tc_st*/
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8035e91f, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0d8f5752, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_13bfbcf9, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b272fa, /*tc_st*/
+ InstrItinData <tc_86173609, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b5c689, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_15aa71c5, /*tc_ld*/
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_174516e8, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_17e0d2cd, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ InstrItinData <tc_8b5bd4f5, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1a2fd869, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ad90acd, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ae57e39, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1b6f7cec, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
[]>,
- InstrItinData <tc_1c4528a2, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c80410a, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1d81e60e, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1fc97744, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20cdee80, /*tc_2early*/
+ InstrItinData <tc_9e27f2f9, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2332b92e, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_24b66c99, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_25a78932, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2eabeebe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_2f7c551d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_a7a13fac, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2ff964b4, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_30b9bb4a, /*tc_st*/
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_32779c6f, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_36153880, /*tc_newvjump*/
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_362c6592, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3962fa26, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ InstrItinData <tc_b570493d, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_39dfefe8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_3a867367, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_3b470976, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b5b7ef9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3bd75825, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_3c76b0ff, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3d495a39, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [1, 2, 2],
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_40116ca8, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_434c8e1e, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4414d8b1, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_44d3da28, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4560740b, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d33e5eee, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d47648a2, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_4837eefb, /*tc_3stall*/
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_49a8207d, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_4ae7b58b, /*tc_2early*/
+ InstrItinData <tc_dc51281d, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_decdde8a, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_4b68bce4, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c5ba658, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_53559e35, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
+ InstrItinData <tc_f529831b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56336eb0, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [1, 1, 2],
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56f114f4, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_57890846, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_f999c66e, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5a2711e5, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5abb5e3f, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
- InstrItinData <tc_5aee39f7, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+class DepScalarItinV60se {
+ list<InstrItinData> DepScalarItinV60se_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b54b33f, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ InstrItinData <tc_01d44cb2, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b7c0967, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5bf126a6, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 3],
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d7f5414, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5ef37dc4, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6132ba3d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_61830035, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_640086b5, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [],
+ []>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_643b4717, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ InstrItinData <tc_151bf368, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_67435e81, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_675e4897, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2],
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_679309b8, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1d41f8b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6b25e783, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_703e822c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_20131976, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7186d325, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7646c131, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ InstrItinData <tc_23708a21, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [],
+ []>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_76851da1, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_24f426ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_779080bf, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_784490da, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_785f65a7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7a91e76a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_838b34ea, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_85c9c08f, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_2f573607, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85d5d03f, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_862b3e70, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_362b0be2, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_88b4f13d, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_89e94ad3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b121f4a, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b3e402a, /*tc_1*/
+ InstrItinData <tc_3fbf1042, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_8c945be0, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2],
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c99de45, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8d9d0154, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [2],
+ InstrItinData <tc_42ff66ba, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_8fb7ab1b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_49fdfd4b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4a55d03c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4abdbdc6, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9461ff31, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_53c851ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_946df596, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ad9998f, /*tc_3stall*/
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
[InstrStage<1, [SLOT3]>], [],
[]>,
- InstrItinData <tc_9bfd761f, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9c3ecd83, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ca930f7, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
+ InstrItinData <tc_55b33fda, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9da59d12, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ InstrItinData <tc_56a124a7, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_2early*/
+ [InstrStage<1, [SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9debc299, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ InstrItinData <tc_59a7822c, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9e313203, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9fc3dae0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a1123dda, /*tc_1*/
+ InstrItinData <tc_5f2afaf7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63567288, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_a1c00888, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a58fd5cc, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ InstrItinData <tc_651cbe02, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a5d4aeec, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_69bfb303, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a6b1eca9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a813cf9a, /*tc_2*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_6d861a95, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a9d88b22, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ae53734a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b31c2e97, /*tc_2early*/
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b343892a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ InstrItinData <tc_711c805f, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b43e7930, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4407292, /*tc_2early*/
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
[InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_b44ecf75, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_b4b5c03a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b51dc29a, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 1],
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b83e6d73, /*tc_st*/
+ InstrItinData <tc_7c31e19a, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b857bf4e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b8bffe55, /*tc_4x*/
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b90a29b1, /*tc_st*/
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_829d8a86, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9272d6c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86173609, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9e09e03, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bab0eed9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ InstrItinData <tc_8b5bd4f5, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bafaade3, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bcf98408, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd8382d1, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bdceeac1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_be9602ff, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bf061958, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bfec0f01, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4db48cb, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4f596e3, /*tc_st*/
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a2b365d2, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c79a189f, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c8ce0b5c, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [],
+ []>,
- InstrItinData <tc_cd374165, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a7a13fac, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cf8126ae, /*tc_2*/
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cfd8378a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
- [Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_d08ee0f4, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d1aa9eaa, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 1, 1, 1],
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d2e63d61, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d5b7b0c1, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d5c0729a, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d63f638c, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_b570493d, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d65dbf51, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d773585a, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
- InstrItinData <tc_d9d43ecb, /*tc_2early*/
- [InstrStage<1, [SLOT3]>], [1, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da4a37ed, /*tc_st*/
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da97ee82, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_db2bce9c, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de4df740, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [],
+ []>,
+
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de554571, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d33e5eee, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_df3319ed, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d47648a2, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e06f432a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3],
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_e4a7f9f0, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4b3cb20, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_e78647bd, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_db596beb, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e86aa961, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_decdde8a, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_e93a3d71, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e95795ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e9f3243f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f429765c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f675fee8, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_ed03645c, /*tc_2early*/
+ [InstrStage<1, [SLOT2], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f8e23f0b, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f9058dd7, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ InstrItinData <tc_eeda4109, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fc3999b4, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2],
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_fcc3ddf9, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ InstrItinData <tc_f529831b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0], 0>,
+ InstrStage<1, [CVI_ST]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fe211424, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>
+ InstrItinData <tc_f999c66e, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [CVI_ST]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
];
}
class DepScalarItinV62 {
list<InstrItinData> DepScalarItinV62_list = [
- InstrItinData <tc_002cb246, /*tc_2*/
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0371abea, /*tc_st*/
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05c070ec, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05d3a09b, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0663f615, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_096199d3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a705168, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0ae0825c, /*tc_1*/
+ InstrItinData <tc_151bf368, /*tc_2early*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0b2be201, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0d8f5752, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_13bfbcf9, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b272fa, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b5c689, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_15aa71c5, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_174516e8, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
-
- InstrItinData <tc_17e0d2cd, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1a2fd869, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ad90acd, /*tc_3*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_20131976, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ae57e39, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1b6f7cec, /*tc_2early*/
+ InstrItinData <tc_23708a21, /*tc_2early*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_1c4528a2, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24f426ab, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c80410a, /*tc_1*/
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_388f9897, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1d81e60e, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1fc97744, /*tc_1*/
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20cdee80, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2332b92e, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2],
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_24b66c99, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ InstrItinData <tc_442395f3, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_25a78932, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_2eabeebe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
- []>,
+ InstrItinData <tc_49fdfd4b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2f7c551d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_4a55d03c, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2ff964b4, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4abdbdc6, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_30b9bb4a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_32779c6f, /*tc_3stall*/
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
[InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_36153880, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_362c6592, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ InstrItinData <tc_53c851ab, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3962fa26, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_39dfefe8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
[]>,
- InstrItinData <tc_3a867367, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b470976, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_56a124a7, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b5b7ef9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ InstrItinData <tc_57a55b54, /*tc_2early*/
+ [InstrStage<1, [SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3bd75825, /*tc_3*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_59a7822c, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3c76b0ff, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3d495a39, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2, 2],
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_40116ca8, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_434c8e1e, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4414d8b1, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_44d3da28, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4560740b, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ InstrItinData <tc_5f2afaf7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4837eefb, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_49a8207d, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
+ InstrItinData <tc_60e324ff, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_4ae7b58b, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_63567288, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_4b68bce4, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c5ba658, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_53559e35, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56336eb0, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6d861a95, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56f114f4, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_57890846, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5a2711e5, /*tc_1*/
+ InstrItinData <tc_713b66bf, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5abb5e3f, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5aee39f7, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b54b33f, /*tc_3x*/
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_788b1d09, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b7c0967, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5bf126a6, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 3],
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d7f5414, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5ef37dc4, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6132ba3d, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_61830035, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_640086b5, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_84a7500d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_643b4717, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ InstrItinData <tc_86173609, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_67435e81, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_675e4897, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2],
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_679309b8, /*tc_2*/
+ InstrItinData <tc_8a825db2, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6b25e783, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_8b5bd4f5, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_703e822c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7186d325, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7646c131, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_76851da1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_779080bf, /*tc_2*/
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_784490da, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_785f65a7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7a91e76a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_838b34ea, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85c9c08f, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a7a13fac, /*tc_2early*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85d5d03f, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1, 1],
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_862b3e70, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_88b4f13d, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_89e94ad3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b121f4a, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [],
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_8b3e402a, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
- [Hex_FWD]>,
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c945be0, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2],
+ InstrItinData <tc_b570493d, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c99de45, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
[]>,
- InstrItinData <tc_8d9d0154, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8fb7ab1b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9461ff31, /*tc_2*/
+ InstrItinData <tc_c20701f0, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_946df596, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ad9998f, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_9bfd761f, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9c3ecd83, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ca930f7, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
+ InstrItinData <tc_d33e5eee, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9da59d12, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 3, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_9debc299, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d47648a2, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9e313203, /*tc_2*/
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9fc3dae0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a1123dda, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ InstrItinData <tc_decdde8a, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_a1c00888, /*tc_1*/
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a58fd5cc, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ InstrItinData <tc_f098b237, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a5d4aeec, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a6b1eca9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a813cf9a, /*tc_2*/
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f999c66e, /*tc_2early*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
[InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a9d88b22, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
+
+class DepScalarItinV65 {
+ list<InstrItinData> DepScalarItinV65_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ae53734a, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b31c2e97, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b343892a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b43e7930, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4407292, /*tc_2early*/
- [InstrStage<1, [SLOT0]>], [],
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
[]>,
- InstrItinData <tc_b44ecf75, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_b4b5c03a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_b51dc29a, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 1],
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b83e6d73, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b857bf4e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b8bffe55, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b90a29b1, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9272d6c, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9e09e03, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [3, 2, 2],
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bab0eed9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bafaade3, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_20131976, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bcf98408, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd8382d1, /*tc_newvjump*/
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bdceeac1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_be9602ff, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bf061958, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bfec0f01, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_c4db48cb, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4f596e3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c79a189f, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c8ce0b5c, /*tc_3x*/
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cd374165, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cf8126ae, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cfd8378a, /*tc_2early*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 1],
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d08ee0f4, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d1aa9eaa, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ InstrItinData <tc_42ff66ba, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d2e63d61, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d5b7b0c1, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [1],
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_d5c0729a, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d63f638c, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_d65dbf51, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d773585a, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4a55d03c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d9d43ecb, /*tc_2early*/
- [InstrStage<1, [SLOT3]>], [1, 2],
+ InstrItinData <tc_4abdbdc6, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da4a37ed, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da97ee82, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_db2bce9c, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de4df740, /*tc_2early*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 1, 1],
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de554571, /*tc_2early*/
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55b33fda, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_df3319ed, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e06f432a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3],
- [Hex_FWD]>,
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4a7f9f0, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4b3cb20, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_e78647bd, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e86aa961, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e93a3d71, /*tc_ld*/
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e95795ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e9f3243f, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f429765c, /*tc_2*/
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f675fee8, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_651cbe02, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f8e23f0b, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6d861a95, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f9058dd7, /*tc_2*/
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7401744f, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fc3999b4, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fcc3ddf9, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fe211424, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>
- ];
-}
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
-class DepScalarItinV65 {
- list<InstrItinData> DepScalarItinV65_list = [
- InstrItinData <tc_002cb246, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0371abea, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05c070ec, /*tc_2latepred*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05d3a09b, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0663f615, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_096199d3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a705168, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_84a7500d, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0ae0825c, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0b2be201, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0d8f5752, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_13bfbcf9, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b272fa, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_8b5bd4f5, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b5c689, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_15aa71c5, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_174516e8, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_17e0d2cd, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1a2fd869, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
- InstrItinData <tc_1ad90acd, /*tc_3*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ae57e39, /*tc_2latepred*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1b6f7cec, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
- []>,
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c4528a2, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c80410a, /*tc_1*/
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1d81e60e, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1fc97744, /*tc_1*/
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20cdee80, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2332b92e, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_24b66c99, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 2],
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_25a78932, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2eabeebe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_2f7c551d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2ff964b4, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_30b9bb4a, /*tc_st*/
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_32779c6f, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_36153880, /*tc_newvjump*/
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
[InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_362c6592, /*tc_st*/
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b570493d, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
[InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3962fa26, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 2],
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_39dfefe8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_3a867367, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d47648a2, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b470976, /*tc_4x*/
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
[InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b5b7ef9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3bd75825, /*tc_3*/
- [InstrStage<1, [SLOT2]>], [1],
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_3c76b0ff, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3d495a39, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_40116ca8, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_434c8e1e, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4414d8b1, /*tc_2*/
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
+
+class DepScalarItinV66 {
+ list<InstrItinData> DepScalarItinV66_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_44d3da28, /*tc_ld*/
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4560740b, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4837eefb, /*tc_3stall*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_49a8207d, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_4ae7b58b, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_4b68bce4, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c5ba658, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [2],
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_53559e35, /*tc_latepredstaia*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56336eb0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56f114f4, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_57890846, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5a2711e5, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5abb5e3f, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5aee39f7, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 2],
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b54b33f, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b7c0967, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5bf126a6, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 3],
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d7f5414, /*tc_3stall*/
+ InstrItinData <tc_362b0be2, /*tc_3*/
[InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_5ef37dc4, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6132ba3d, /*tc_1*/
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_407e96f9, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_61830035, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_640086b5, /*tc_1*/
+ InstrItinData <tc_42ff66ba, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4a55d03c, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_643b4717, /*tc_1*/
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5502c366, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_67435e81, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_675e4897, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 1],
+ InstrItinData <tc_55b33fda, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_679309b8, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6b25e783, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_703e822c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7186d325, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7646c131, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_76851da1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_779080bf, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_784490da, /*tc_2*/
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_785f65a7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7a91e76a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_838b34ea, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ [Hex_FWD]>,
- InstrItinData <tc_85c9c08f, /*tc_1*/
- [InstrStage<1, [SLOT2]>], [2, 2],
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85d5d03f, /*tc_1*/
+ InstrItinData <tc_651cbe02, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_862b3e70, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_88b4f13d, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_89e94ad3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b121f4a, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_8b3e402a, /*tc_2latepred*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4],
- [Hex_FWD]>,
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c945be0, /*tc_newvjump*/
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c99de45, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8d9d0154, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8fb7ab1b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9461ff31, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_946df596, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ad9998f, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [],
- []>,
+ InstrItinData <tc_84a7500d, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9bfd761f, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9c3ecd83, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ca930f7, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9da59d12, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1],
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9debc299, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ InstrItinData <tc_8b5bd4f5, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9e313203, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9fc3dae0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a1123dda, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_a1c00888, /*tc_1*/
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a58fd5cc, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a5d4aeec, /*tc_ld*/
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
[InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a6b1eca9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a813cf9a, /*tc_2*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a9d88b22, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ae53734a, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b31c2e97, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b570493d, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b343892a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [2, 3, 2],
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b43e7930, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4407292, /*tc_2early*/
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_b44ecf75, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4b5c03a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b51dc29a, /*tc_1*/
- [InstrStage<1, [SLOT2]>], [3, 2],
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b83e6d73, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d47648a2, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b857bf4e, /*tc_st*/
+ InstrItinData <tc_db96aa6b, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b8bffe55, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b90a29b1, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9272d6c, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9e09e03, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bab0eed9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bafaade3, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bcf98408, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd8382d1, /*tc_newvjump*/
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
[InstrStage<1, [SLOT0]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bdceeac1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_be9602ff, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
+
+class DepScalarItinV67 {
+ list<InstrItinData> DepScalarItinV67_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bf061958, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bfec0f01, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4db48cb, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_0655b949, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4f596e3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c79a189f, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c8ce0b5c, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cd374165, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cf8126ae, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cfd8378a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d08ee0f4, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d1aa9eaa, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d2e63d61, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d5b7b0c1, /*tc_1*/
- [InstrStage<1, [SLOT2]>], [2],
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_d5c0729a, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d63f638c, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d65dbf51, /*tc_latepredstaia*/
- [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d773585a, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d9d43ecb, /*tc_1*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da4a37ed, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da97ee82, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_db2bce9c, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de4df740, /*tc_1*/
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_2f573607, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_388f9897, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de554571, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_df3319ed, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e06f432a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3],
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_e4a7f9f0, /*tc_1*/
+ InstrItinData <tc_407e96f9, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4b3cb20, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e78647bd, /*tc_1*/
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_1*/
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e86aa961, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e93a3d71, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e95795ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e9f3243f, /*tc_latepredldaia*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f429765c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_f675fee8, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f8e23f0b, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4a55d03c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f9058dd7, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fc3999b4, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fcc3ddf9, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fe211424, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>
- ];
-}
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
-class DepScalarItinV66 {
- list<InstrItinData> DepScalarItinV66_list = [
- InstrItinData <tc_002cb246, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_5502c366, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0371abea, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 3],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [],
+ []>,
- InstrItinData <tc_05c070ec, /*tc_2latepred*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_05d3a09b, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0663f615, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_55b33fda, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_096199d3, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2],
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0a705168, /*tc_1*/
+ InstrItinData <tc_5b347363, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0ae0825c, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0b2be201, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_0d8f5752, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_13bfbcf9, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_651cbe02, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b272fa, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_14b5c689, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_15aa71c5, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_174516e8, /*tc_3x*/
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
[InstrStage<1, [SLOT3]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_17e0d2cd, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1a2fd869, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 1],
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ad90acd, /*tc_3*/
- [InstrStage<1, [SLOT2]>], [2, 1],
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1ae57e39, /*tc_2latepred*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1b6f7cec, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_1c4528a2, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1c80410a, /*tc_2*/
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_829d8a86, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_2*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1d81e60e, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_1fc97744, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8b5bd4f5, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_20cdee80, /*tc_1*/
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
[InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2332b92e, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_24b66c99, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_25a78932, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2b8da4c2, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2eabeebe, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [],
[]>,
- InstrItinData <tc_2f7c551d, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_2ff964b4, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_30b9bb4a, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_32779c6f, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_36153880, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [],
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
[]>,
- InstrItinData <tc_362c6592, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3962fa26, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 1, 2, 3],
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9c52f549, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_39dfefe8, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [],
- []>,
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3a867367, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1],
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b470976, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3b5b7ef9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2],
+ InstrItinData <tc_a1297125, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3bd75825, /*tc_3*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3c76b0ff, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2],
+ InstrItinData <tc_a2b365d2, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_3d495a39, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_40116ca8, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 1, 2, 3],
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_434c8e1e, /*tc_3stall*/
+ InstrItinData <tc_b570493d, /*tc_3stall*/
[InstrStage<1, [SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4414d8b1, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_44d3da28, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4560740b, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1, 2],
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4837eefb, /*tc_3stall*/
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_49a8207d, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4ae7b58b, /*tc_3*/
- [InstrStage<1, [SLOT2, SLOT3]>], [2],
- [Hex_FWD]>,
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
- InstrItinData <tc_4b68bce4, /*tc_st*/
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
[InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4c5ba658, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_4d5fa3a1, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [2],
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_53559e35, /*tc_latepredstaia*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 1],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d47648a2, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56336eb0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2, 2],
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_56f114f4, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_57890846, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5a2711e5, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5abb5e3f, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5aee39f7, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b54b33f, /*tc_3x*/
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5b7c0967, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 2],
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5bf126a6, /*tc_st*/
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
+ ];
+}
+
+class DepScalarItinV67T {
+ list<InstrItinData> DepScalarItinV67T_list = [
+ InstrItinData <tc_011e0e9d, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01d44cb2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_01e1be3b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_02fe1c65, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0655b949, /*tc_st*/
[InstrStage<1, [SLOT0]>], [2, 3],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5d7f5414, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_075c8dd8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_5ef37dc4, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2],
+ InstrItinData <tc_0a195f2c, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6132ba3d, /*tc_2*/
+ InstrItinData <tc_0a6c20ae, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_0ba0d5da, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_0dfac0a7, /*tc_2*/
[InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_61830035, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_0fac1eb8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_640086b5, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_1044324a, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_10b884b7, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_112d30d6, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1242dc2a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1248597c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_14ab4f41, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_643b4717, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ InstrItinData <tc_151bf368, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_158aa3f7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_67435e81, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
+ InstrItinData <tc_197dce51, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_675e4897, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 1],
+ InstrItinData <tc_1981450d, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_1b8138fc, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_679309b8, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_1c2c7a4a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1c7522a8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1d41f8b7, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1e7875f0, /*tc_2early*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_1fcb8495, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_6b25e783, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_1fe4ab69, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_703e822c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_20131976, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_2237d952, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7186d325, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ InstrItinData <tc_234f8560, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7646c131, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 2],
+ InstrItinData <tc_23708a21, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_24e109c7, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_76851da1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_24f426ab, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_779080bf, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_27106296, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_784490da, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
+ InstrItinData <tc_280f7fe1, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_785f65a7, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_28e55c6f, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_7a91e76a, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 2, 2],
+ InstrItinData <tc_2c13e7f5, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_838b34ea, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_2c3e17fc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_85c9c08f, /*tc_1*/
+ InstrItinData <tc_2f573607, /*tc_1*/
[InstrStage<1, [SLOT2]>], [2, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_85d5d03f, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_2f669c77, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_862b3e70, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_362b0be2, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_88b4f13d, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_38382228, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_89e94ad3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2],
+ InstrItinData <tc_388f9897, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_38e0bae9, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_3d14a17b, /*tc_1*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b121f4a, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [],
- []>,
+ InstrItinData <tc_3edca78f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8b3e402a, /*tc_2latepred*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4],
+ InstrItinData <tc_3fbf1042, /*tc_1*/
+ [InstrStage<1, [SLOT0]>], [3],
[Hex_FWD]>,
- InstrItinData <tc_8c945be0, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2],
+ InstrItinData <tc_407e96f9, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_40d64c94, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8c99de45, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [],
- []>,
+ InstrItinData <tc_4222e6bf, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_42ff66ba, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_442395f3, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_449acf79, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44d5a428, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_44fffc58, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_45791fb8, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_8d9d0154, /*tc_3stall*/
+ InstrItinData <tc_45f9d1be, /*tc_2early*/
[InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_8fb7ab1b, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ InstrItinData <tc_49fdfd4b, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4a55d03c, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4abdbdc6, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4ac61d92, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_4c1520ae, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_503ce0f3, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9461ff31, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_53c851ab, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_946df596, /*tc_1*/
+ InstrItinData <tc_5502c366, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ad9998f, /*tc_3stall*/
+ InstrItinData <tc_55255f2b, /*tc_3stall*/
[InstrStage<1, [SLOT3]>], [],
[]>,
- InstrItinData <tc_9bfd761f, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ InstrItinData <tc_556f6577, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9c3ecd83, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_55a9a350, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9ca930f7, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1, 2],
+ InstrItinData <tc_55b33fda, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9da59d12, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 3, 1],
+ InstrItinData <tc_56a124a7, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_57a55b54, /*tc_1*/
+ [InstrStage<1, [SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5944960d, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9debc299, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1, 2],
+ InstrItinData <tc_59a7822c, /*tc_1*/
+ [InstrStage<1, [SLOT0]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5a4b5e58, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5b347363, /*tc_1*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5ceb2f9e, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9e313203, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ InstrItinData <tc_5d636bc7, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_9fc3dae0, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_5da50c4b, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a1123dda, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3],
+ InstrItinData <tc_5deb5e47, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5e4cf0e8, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_5f2afaf7, /*tc_latepredldaia*/
+ [InstrStage<1, [SLOT0]>], [4, 4, 3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_60e324ff, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_a1c00888, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_63567288, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0]>], [4],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_64b00d8a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a58fd5cc, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 2, 1, 1],
+ InstrItinData <tc_651cbe02, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65279839, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_65cbd974, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a5d4aeec, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_69bfb303, /*tc_3*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a6b1eca9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_6ae3426b, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a813cf9a, /*tc_2*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_6d861a95, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_a9d88b22, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_6e20402a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 3],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_ae53734a, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2],
+ InstrItinData <tc_6f42bc60, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_6fb32599, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_b31c2e97, /*tc_1*/
+ InstrItinData <tc_6fc5dbea, /*tc_1*/
[InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b343892a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ InstrItinData <tc_711c805f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_713b66bf, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b43e7930, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [4, 1],
+ InstrItinData <tc_7401744f, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7476d766, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4407292, /*tc_2early*/
+ InstrItinData <tc_74a42bda, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_76bb5435, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_77f94a5e, /*tc_st*/
[InstrStage<1, [SLOT0]>], [],
[]>,
- InstrItinData <tc_b44ecf75, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [3, 1, 2],
+ InstrItinData <tc_788b1d09, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7b9187d3, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7c31e19a, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b4b5c03a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_7c6d32e4, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b51dc29a, /*tc_1*/
- [InstrStage<1, [SLOT2]>], [3, 2],
+ InstrItinData <tc_7dc63b5c, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b83e6d73, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ InstrItinData <tc_7dcd9d89, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 3],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_7f7f45f5, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 5, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b857bf4e, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_7f8ae742, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b8bffe55, /*tc_4x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [5, 5, 1],
+ InstrItinData <tc_8035e91f, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_822c3c68, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b90a29b1, /*tc_st*/
+ InstrItinData <tc_829d8a86, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9272d6c, /*tc_3stall*/
- [InstrStage<1, [SLOT3]>], [4, 2],
+ InstrItinData <tc_838c4d7a, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_84a7500d, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_86173609, /*tc_2latepred*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [4, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_887d1bb7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8a6d0d94, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_b9e09e03, /*tc_3stall*/
- [InstrStage<1, [SLOT2]>], [4, 1, 2],
+ InstrItinData <tc_8a825db2, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bab0eed9, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2, 2],
+ InstrItinData <tc_8b5bd4f5, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_8e82e8ca, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9124c04f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9165014d, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_92240447, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bafaade3, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 1],
+ InstrItinData <tc_934753bb, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bcf98408, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [4, 1],
+ InstrItinData <tc_937dd41c, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_9406230a, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bd8382d1, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ InstrItinData <tc_95a33176, /*tc_2*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [4, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bdceeac1, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ InstrItinData <tc_96ef76ef, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_be9602ff, /*tc_st*/
+ InstrItinData <tc_975a4e54, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 3, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9783714b, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_988416e3, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_9b34f5e0, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [],
+ []>,
+
+ InstrItinData <tc_9b3c0462, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9bcfb2ee, /*tc_st*/
[InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bf061958, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 1, 2],
+ InstrItinData <tc_9c52f549, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e27f2f9, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9e72dc89, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 2, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_bfec0f01, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2, 2],
+ InstrItinData <tc_9edb7c77, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 2, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4db48cb, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2],
+ InstrItinData <tc_9edefe01, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_9f6cd987, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c4f596e3, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2],
+ InstrItinData <tc_a08b630b, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a1297125, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a154b476, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c79a189f, /*tc_st*/
+ InstrItinData <tc_a2b365d2, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_c8ce0b5c, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
-
- InstrItinData <tc_cd374165, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_a3070909, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cf8126ae, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
- [Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_a32e03e7, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_cfd8378a, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_a38c45dc, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 1, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4e22bbd, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a4ee89db, /*tc_2early*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_a7a13fac, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_a7bdb22c, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d08ee0f4, /*tc_2*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_a9edeffa, /*tc_st*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d1aa9eaa, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 1, 1],
+ InstrItinData <tc_abfd9a6d, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ac65613f, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_addc37a8, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d2e63d61, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ InstrItinData <tc_ae5babd7, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d5b7b0c1, /*tc_1*/
- [InstrStage<1, [SLOT2]>], [2],
+ InstrItinData <tc_aee6250c, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_b1ae5f67, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_d5c0729a, /*tc_st*/
+ InstrItinData <tc_b34eb232, /*tc_3stall*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_b4dc7630, /*tc_st*/
[InstrStage<1, [SLOT0]>], [3, 1, 2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d63f638c, /*tc_ld*/
- [InstrStage<1, [SLOT0]>], [1],
- [Hex_FWD]>,
+ InstrItinData <tc_b570493d, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d65dbf51, /*tc_latepredstaia*/
- [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ InstrItinData <tc_b7c4062a, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d773585a, /*tc_3x*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 1, 1],
+ InstrItinData <tc_b837298f, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [],
+ []>,
+
+ InstrItinData <tc_ba9255a6, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb07f2c5, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bb831a7c, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2, 2, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_bf2ffc0f, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_d9d43ecb, /*tc_1*/
- [InstrStage<1, [SLOT3]>], [2, 2],
+ InstrItinData <tc_c20701f0, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c21d7447, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_c57d9f39, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da4a37ed, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 2, 3],
+ InstrItinData <tc_c818ff7f, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [],
+ []>,
+
+ InstrItinData <tc_ce59038e, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_cfa0e29b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 2, 3],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_da97ee82, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 3],
+ InstrItinData <tc_d03278fd, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [2, 1, 2, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_db2bce9c, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [2, 2],
+ InstrItinData <tc_d33e5eee, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d3632d88, /*tc_2*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_de4df740, /*tc_1*/
- [InstrStage<1, [SLOT0, SLOT1, SLOT2, SLOT3]>], [3, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_d45ba9cd, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_de554571, /*tc_1*/
- [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ InstrItinData <tc_d47648a2, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [1, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_df3319ed, /*tc_3x*/
- [InstrStage<1, [SLOT3]>], [2, 1],
+ InstrItinData <tc_d57d649c, /*tc_3stall*/
+ [InstrStage<1, [SLOT2]>], [2],
+ [Hex_FWD]>,
+
+ InstrItinData <tc_d61dfdc3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e06f432a, /*tc_newvjump*/
- [InstrStage<1, [SLOT0]>], [3],
+ InstrItinData <tc_d68dca5c, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_d7718fbe, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [1],
[Hex_FWD]>,
- InstrItinData <tc_e4a7f9f0, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ InstrItinData <tc_db596beb, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e4b3cb20, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 2, 1, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_db96aa6b, /*tc_st*/
+ [InstrStage<1, [SLOT0]>], [1],
+ [Hex_FWD]>,
- InstrItinData <tc_e78647bd, /*tc_1*/
- [InstrStage<1, [SLOT2]>], [2, 2],
+ InstrItinData <tc_dc51281d, /*tc_3*/
+ [InstrStage<1, [SLOT2]>], [2, 1],
[Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e86aa961, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [3, 1, 2, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_decdde8a, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2],
+ [Hex_FWD]>,
- InstrItinData <tc_e93a3d71, /*tc_ld*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 3, 1, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_df4536ae, /*tc_3stall*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e95795ec, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [2, 1, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_df5d53f9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_e9f3243f, /*tc_latepredldaia*/
- [InstrStage<1, [SLOT0, SLOT1]>], [4, 4, 3, 1, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_e3d699e3, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f429765c, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_e9170fb7, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 1],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f675fee8, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2, 2],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_ed03645c, /*tc_1*/
+ [InstrStage<1, [SLOT2]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f8e23f0b, /*tc_st*/
- [InstrStage<1, [SLOT0, SLOT1]>], [2, 1, 2, 3],
- [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData <tc_eed07714, /*tc_ld*/
+ [InstrStage<1, [SLOT0, SLOT1]>], [4, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_f9058dd7, /*tc_2*/
- [InstrStage<1, [SLOT2, SLOT3]>], [4, 4, 2, 2],
+ InstrItinData <tc_eeda4109, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_ef921005, /*tc_1*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [3, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f098b237, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f0cdeccf, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 1, 1, 2],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fc3999b4, /*tc_2early*/
- [InstrStage<1, [SLOT2]>], [2],
+ InstrItinData <tc_f0e8e832, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f34c1c21, /*tc_2*/
+ [InstrStage<1, [SLOT2, SLOT3]>], [4, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f38f92e1, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [2],
[Hex_FWD]>,
- InstrItinData <tc_fcc3ddf9, /*tc_st*/
- [InstrStage<1, [SLOT0]>], [1, 2, 2, 3],
+ InstrItinData <tc_f529831b, /*tc_latepredstaia*/
+ [InstrStage<1, [SLOT0]>], [4, 3, 1, 2, 3],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f6e2aff9, /*tc_newvjump*/
+ [InstrStage<1, [SLOT0]>], [3, 2, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_f7569068, /*tc_4x*/
+ [InstrStage<1, [SLOT3]>], [5, 5, 1, 1],
[Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>,
- InstrItinData <tc_fe211424, /*tc_3stall*/
- [InstrStage<1, [SLOT0]>], [1, 1],
- [Hex_FWD, Hex_FWD]>
+ InstrItinData <tc_f999c66e, /*tc_1*/
+ [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fae9dfa5, /*tc_3x*/
+ [InstrStage<1, [SLOT3]>], [4, 2],
+ [Hex_FWD, Hex_FWD]>,
+
+ InstrItinData <tc_fedb7e19, /*tc_ld*/
+ [InstrStage<1, [SLOT0]>], [4, 2, 1, 2],
+ [Hex_FWD, Hex_FWD, Hex_FWD, Hex_FWD]>
];
}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepITypes.h b/llvm/lib/Target/Hexagon/HexagonDepITypes.h
index 358345e027d8..b261b4653127 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepITypes.h
+++ b/llvm/lib/Target/Hexagon/HexagonDepITypes.h
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
namespace llvm {
@@ -16,49 +16,48 @@ enum Type {
TypeALU32_ADDI = 2,
TypeALU64 = 3,
TypeCJ = 4,
- TypeCOPROC_VX = 5,
- TypeCR = 6,
- TypeCVI_4SLOT_MPY = 7,
- TypeCVI_GATHER = 8,
- TypeCVI_GATHER_RST = 9,
- TypeCVI_HIST = 10,
- TypeCVI_SCATTER = 11,
- TypeCVI_SCATTER_DV = 12,
- TypeCVI_SCATTER_NEW_RST = 13,
- TypeCVI_SCATTER_NEW_ST = 14,
- TypeCVI_SCATTER_RST = 15,
- TypeCVI_VA = 16,
- TypeCVI_VA_DV = 17,
- TypeCVI_VINLANESAT = 18,
- TypeCVI_VM_LD = 19,
- TypeCVI_VM_NEW_ST = 20,
- TypeCVI_VM_ST = 21,
- TypeCVI_VM_STU = 22,
- TypeCVI_VM_TMP_LD = 23,
- TypeCVI_VM_VP_LDU = 24,
- TypeCVI_VP = 25,
- TypeCVI_VP_VS = 26,
- TypeCVI_VS = 27,
- TypeCVI_VS_VX = 28,
- TypeCVI_VX = 29,
- TypeCVI_VX_DV = 30,
- TypeCVI_VX_LATE = 31,
- TypeCVI_ZW = 32,
- TypeDUPLEX = 33,
- TypeENDLOOP = 34,
- TypeEXTENDER = 35,
- TypeJ = 36,
- TypeLD = 37,
- TypeM = 38,
- TypeMAPPING = 39,
- TypeNCJ = 40,
- TypePSEUDO = 41,
- TypeST = 42,
- TypeSUBINSN = 43,
- TypeS_2op = 44,
- TypeS_3op = 45,
- TypeV2LDST = 48,
- TypeV4LDST = 49,
+ TypeCR = 7,
+ TypeCVI_4SLOT_MPY = 8,
+ TypeCVI_GATHER = 9,
+ TypeCVI_GATHER_DV = 10,
+ TypeCVI_GATHER_RST = 11,
+ TypeCVI_HIST = 12,
+ TypeCVI_SCATTER = 13,
+ TypeCVI_SCATTER_DV = 14,
+ TypeCVI_SCATTER_NEW_RST = 15,
+ TypeCVI_SCATTER_NEW_ST = 16,
+ TypeCVI_SCATTER_RST = 17,
+ TypeCVI_VA = 18,
+ TypeCVI_VA_DV = 19,
+ TypeCVI_VM_LD = 20,
+ TypeCVI_VM_NEW_ST = 21,
+ TypeCVI_VM_ST = 22,
+ TypeCVI_VM_STU = 23,
+ TypeCVI_VM_TMP_LD = 24,
+ TypeCVI_VM_VP_LDU = 25,
+ TypeCVI_VP = 26,
+ TypeCVI_VP_VS = 27,
+ TypeCVI_VS = 28,
+ TypeCVI_VS_VX = 29,
+ TypeCVI_VX = 30,
+ TypeCVI_VX_DV = 31,
+ TypeCVI_VX_LATE = 32,
+ TypeCVI_ZW = 33,
+ TypeDUPLEX = 34,
+ TypeENDLOOP = 35,
+ TypeEXTENDER = 36,
+ TypeJ = 37,
+ TypeLD = 38,
+ TypeM = 39,
+ TypeMAPPING = 40,
+ TypeNCJ = 41,
+ TypePSEUDO = 42,
+ TypeST = 43,
+ TypeSUBINSN = 44,
+ TypeS_2op = 45,
+ TypeS_3op = 46,
+ TypeV2LDST = 49,
+ TypeV4LDST = 50,
};
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepITypes.td b/llvm/lib/Target/Hexagon/HexagonDepITypes.td
index 91c02b84b87c..f251a291c23c 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepITypes.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepITypes.td
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
class IType<bits<7> t> { bits<7> Value = t; }
@@ -14,46 +14,45 @@ def TypeALU32_3op : IType<1>;
def TypeALU32_ADDI : IType<2>;
def TypeALU64 : IType<3>;
def TypeCJ : IType<4>;
-def TypeCOPROC_VX : IType<5>;
-def TypeCR : IType<6>;
-def TypeCVI_4SLOT_MPY : IType<7>;
-def TypeCVI_GATHER : IType<8>;
-def TypeCVI_GATHER_RST : IType<9>;
-def TypeCVI_HIST : IType<10>;
-def TypeCVI_SCATTER : IType<11>;
-def TypeCVI_SCATTER_DV : IType<12>;
-def TypeCVI_SCATTER_NEW_RST : IType<13>;
-def TypeCVI_SCATTER_NEW_ST : IType<14>;
-def TypeCVI_SCATTER_RST : IType<15>;
-def TypeCVI_VA : IType<16>;
-def TypeCVI_VA_DV : IType<17>;
-def TypeCVI_VINLANESAT : IType<18>;
-def TypeCVI_VM_LD : IType<19>;
-def TypeCVI_VM_NEW_ST : IType<20>;
-def TypeCVI_VM_ST : IType<21>;
-def TypeCVI_VM_STU : IType<22>;
-def TypeCVI_VM_TMP_LD : IType<23>;
-def TypeCVI_VM_VP_LDU : IType<24>;
-def TypeCVI_VP : IType<25>;
-def TypeCVI_VP_VS : IType<26>;
-def TypeCVI_VS : IType<27>;
-def TypeCVI_VS_VX : IType<28>;
-def TypeCVI_VX : IType<29>;
-def TypeCVI_VX_DV : IType<30>;
-def TypeCVI_VX_LATE : IType<31>;
-def TypeCVI_ZW : IType<32>;
-def TypeDUPLEX : IType<33>;
-def TypeENDLOOP : IType<34>;
-def TypeEXTENDER : IType<35>;
-def TypeJ : IType<36>;
-def TypeLD : IType<37>;
-def TypeM : IType<38>;
-def TypeMAPPING : IType<39>;
-def TypeNCJ : IType<40>;
-def TypePSEUDO : IType<41>;
-def TypeST : IType<42>;
-def TypeSUBINSN : IType<43>;
-def TypeS_2op : IType<44>;
-def TypeS_3op : IType<45>;
-def TypeV2LDST : IType<48>;
-def TypeV4LDST : IType<49>;
+def TypeCR : IType<7>;
+def TypeCVI_4SLOT_MPY : IType<8>;
+def TypeCVI_GATHER : IType<9>;
+def TypeCVI_GATHER_DV : IType<10>;
+def TypeCVI_GATHER_RST : IType<11>;
+def TypeCVI_HIST : IType<12>;
+def TypeCVI_SCATTER : IType<13>;
+def TypeCVI_SCATTER_DV : IType<14>;
+def TypeCVI_SCATTER_NEW_RST : IType<15>;
+def TypeCVI_SCATTER_NEW_ST : IType<16>;
+def TypeCVI_SCATTER_RST : IType<17>;
+def TypeCVI_VA : IType<18>;
+def TypeCVI_VA_DV : IType<19>;
+def TypeCVI_VM_LD : IType<20>;
+def TypeCVI_VM_NEW_ST : IType<21>;
+def TypeCVI_VM_ST : IType<22>;
+def TypeCVI_VM_STU : IType<23>;
+def TypeCVI_VM_TMP_LD : IType<24>;
+def TypeCVI_VM_VP_LDU : IType<25>;
+def TypeCVI_VP : IType<26>;
+def TypeCVI_VP_VS : IType<27>;
+def TypeCVI_VS : IType<28>;
+def TypeCVI_VS_VX : IType<29>;
+def TypeCVI_VX : IType<30>;
+def TypeCVI_VX_DV : IType<31>;
+def TypeCVI_VX_LATE : IType<32>;
+def TypeCVI_ZW : IType<33>;
+def TypeDUPLEX : IType<34>;
+def TypeENDLOOP : IType<35>;
+def TypeEXTENDER : IType<36>;
+def TypeJ : IType<37>;
+def TypeLD : IType<38>;
+def TypeM : IType<39>;
+def TypeMAPPING : IType<40>;
+def TypeNCJ : IType<41>;
+def TypePSEUDO : IType<42>;
+def TypeST : IType<43>;
+def TypeSUBINSN : IType<44>;
+def TypeS_2op : IType<45>;
+def TypeS_3op : IType<46>;
+def TypeV2LDST : IType<49>;
+def TypeV4LDST : IType<50>;
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
index c08d9a388d3e..305115da5763 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrFormats.td
@@ -5,113 +5,138 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-class Enc_890909 : OpcodeHexagon {
+class Enc_5e2823 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
- bits <2> Pe4;
- let Inst{6-5} = Pe4{1-0};
}
-class Enc_9be1de : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
+class Enc_b9c5fb : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_527412 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
+class Enc_5ab2be : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_efaed8 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{8-8} = Ii{0-0};
-}
-class Enc_a568d4 : OpcodeHexagon {
+class Enc_bd6011 : OpcodeHexagon {
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_27b757 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+class Enc_cb9321 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{27-21} = Ii{15-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_1de724 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-22} = n1{2-0};
+class Enc_a56825 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_0e41fa : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
+class Enc_140c83 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_18c338 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <8> II;
+ let Inst{22-16} = II{7-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_be32a5 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_3d6d37 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
+class Enc_ea23e4 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_e3b0c4 : OpcodeHexagon {
+
+}
+class Enc_ea4c54 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_a641d0 : OpcodeHexagon {
+class Enc_e38e1f : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_9b0bc1 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{12-8} = Vvv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_802dc0 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{8-8} = Ii{0-0};
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
+class Enc_90cd8b : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_6b197f : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_3a3d62 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_0cb018 : OpcodeHexagon {
+ bits <5> Cs32;
+ let Inst{20-16} = Cs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
class Enc_51436c : OpcodeHexagon {
bits <16> Ii;
@@ -120,142 +145,281 @@ class Enc_51436c : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_c7a204 : OpcodeHexagon {
- bits <6> II;
- let Inst{5-0} = II{5-0};
+class Enc_bd811a : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Cd32;
+ let Inst{4-0} = Cd32{4-0};
+}
+class Enc_5e87ce : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{23-22} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_fcf7a7 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_db40cd : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
+class Enc_88c16c : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_2b3f60 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <2> Px4;
+ let Inst{6-5} = Px4{1-0};
+}
+class Enc_311abd : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_c2b48e : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_a1e29d : OpcodeHexagon {
+class Enc_08d755 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_02553a : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_f0cca7 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <6> II;
+ let Inst{20-16} = II{5-1};
+ let Inst{13-13} = II{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_9cdba7 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_a05677 : OpcodeHexagon {
bits <5> Ii;
let Inst{12-8} = Ii{4-0};
- bits <5> II;
- let Inst{22-21} = II{4-3};
- let Inst{7-5} = II{2-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_d15d19 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_e90a15 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{22-22} = n1{0-0};
+class Enc_2b518f : OpcodeHexagon {
+ bits <32> Ii;
+ let Inst{27-16} = Ii{31-20};
+ let Inst{13-0} = Ii{19-6};
}
-class Enc_e0a47a : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+class Enc_fb6577 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
}
-class Enc_140c83 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
+class Enc_b8c967 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_7eee72 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+class Enc_667b39 : OpcodeHexagon {
+ bits <5> Css32;
+ let Inst{20-16} = Css32{4-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
}
-class Enc_310ba1 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+class Enc_0ed752 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Cdd32;
+ let Inst{4-0} = Cdd32{4-0};
+}
+class Enc_03833b : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_0d8adb : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-5} = Ii{7-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_3680c2 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{11-5} = Ii{6-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_412ff0 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rxx32;
+ let Inst{12-8} = Rxx32{4-0};
+}
+class Enc_831a7d : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
}
-class Enc_d7dc10 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_d2216a : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_6baed4 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_d2c7f1 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
}
-class Enc_736575 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{25-23} = n1{2-0};
+class Enc_5eac98 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_8dec2e : OpcodeHexagon {
+class Enc_927852 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_7e5a82 : OpcodeHexagon {
bits <5> Ii;
let Inst{12-8} = Ii{4-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_65d691 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_454a26 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_5d6c34 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
+}
+class Enc_cb4b4e : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_cda00a : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{19-16} = Ii{11-8};
+ let Inst{12-5} = Ii{7-0};
+ bits <2> Pu4;
+ let Inst{22-21} = Pu4{1-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_28dcbb : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{4-0} = Vvv32{4-0};
+class Enc_bd0b33 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_eaa9f8 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <2> Qx4;
- let Inst{1-0} = Qx4{1-0};
+class Enc_c0cdde : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_509701 : OpcodeHexagon {
- bits <19> Ii;
- let Inst{26-25} = Ii{18-17};
- let Inst{20-16} = Ii{16-12};
- let Inst{13-5} = Ii{11-3};
+class Enc_78e566 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
@@ -270,80 +434,29 @@ class Enc_830e5d : OpcodeHexagon {
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_79b8c8 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_58a8bf : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_041d7b : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-23} = n1{3-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+class Enc_f5e933 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_f44229 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{13-13} = Ii{6-6};
- let Inst{7-3} = Ii{5-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_48b75f : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_aad80c : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_87c142 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-4} = Ii{6-2};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
-}
-class Enc_86a14b : OpcodeHexagon {
- bits <8> Ii;
- let Inst{7-3} = Ii{7-3};
- bits <3> Rdd8;
- let Inst{2-0} = Rdd8{2-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_9a33d5 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{6-3} = Ii{6-3};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_527412 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_a56825 : OpcodeHexagon {
+class Enc_329361 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
bits <5> Rtt32;
@@ -351,81 +464,85 @@ class Enc_a56825 : OpcodeHexagon {
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_9ea4cf : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{6-6} = Ii{0-0};
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Ru32;
- let Inst{20-16} = Ru32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+class Enc_284ebb : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_ee5ed0 : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
- bits <2> n1;
- let Inst{9-8} = n1{1-0};
+class Enc_607661 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_bddee3 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vyyyy32;
- let Inst{4-0} = Vyyyy32{4-0};
- bits <3> Rx8;
- let Inst{18-16} = Rx8{2-0};
+class Enc_9ac432 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{17-16} = Ps4{1-0};
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <2> Pu4;
+ let Inst{7-6} = Pu4{1-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_935d9b : OpcodeHexagon {
+class Enc_1f19b5 : OpcodeHexagon {
bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{9-5} = Ii{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_61f0b0 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
+class Enc_e6c957 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_bd6011 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+class Enc_83ee64 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_65d691 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
bits <2> Pd4;
let Inst{1-0} = Pd4{1-0};
}
-class Enc_e8c45e : OpcodeHexagon {
- bits <7> Ii;
- let Inst{13-13} = Ii{6-6};
- let Inst{7-3} = Ii{5-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_2ae154 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_ca3887 : OpcodeHexagon {
+class Enc_437f33 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_6c9440 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_890909 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <2> Pe4;
+ let Inst{6-5} = Pe4{1-0};
}
class Enc_a94f3b : OpcodeHexagon {
bits <5> Rs32;
@@ -437,51 +554,98 @@ class Enc_a94f3b : OpcodeHexagon {
bits <2> Pe4;
let Inst{6-5} = Pe4{1-0};
}
-class Enc_625deb : OpcodeHexagon {
- bits <4> Ii;
- let Inst{10-8} = Ii{3-1};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
+class Enc_0aa344 : OpcodeHexagon {
+ bits <5> Gss32;
+ let Inst{20-16} = Gss32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_1f5ba6 : OpcodeHexagon {
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
+class Enc_44271f : OpcodeHexagon {
+ bits <5> Gs32;
+ let Inst{20-16} = Gs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_cd82bc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{21-21} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <6> II;
- let Inst{13-8} = II{5-0};
+class Enc_ed5027 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Gdd32;
+ let Inst{4-0} = Gdd32{4-0};
+}
+class Enc_621fba : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
+ bits <5> Gd32;
+ let Inst{4-0} = Gd32{4-0};
}
-class Enc_399e12 : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <3> Rdd8;
- let Inst{2-0} = Rdd8{2-0};
+class Enc_81ac1d : OpcodeHexagon {
+ bits <24> Ii;
+ let Inst{24-16} = Ii{23-15};
+ let Inst{13-1} = Ii{14-2};
}
-class Enc_d7a65e : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <6> II;
- let Inst{13-13} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
+class Enc_daea09 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{23-22} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+}
+class Enc_ecbcc8 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
}
-class Enc_607661 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_88d4d9 : OpcodeHexagon {
+ bits <2> Pu4;
+ let Inst{9-8} = Pu4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_0fa531 : OpcodeHexagon {
+ bits <15> Ii;
+ let Inst{21-21} = Ii{14-14};
+ let Inst{13-13} = Ii{13-13};
+ let Inst{11-1} = Ii{12-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_4dc228 : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <10> II;
+ let Inst{20-16} = II{9-5};
+ let Inst{7-5} = II{4-2};
+ let Inst{1-0} = II{1-0};
+}
+class Enc_864a5a : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{12-8} = Ii{8-4};
+ let Inst{4-3} = Ii{3-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_a51a9a : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+}
+class Enc_33f8ba : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-8} = Ii{7-3};
+ let Inst{4-2} = Ii{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_c9a18e : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
}
class Enc_6a5972 : OpcodeHexagon {
bits <11> Ii;
@@ -492,74 +656,56 @@ class Enc_6a5972 : OpcodeHexagon {
bits <4> Rt16;
let Inst{11-8} = Rt16{3-0};
}
-class Enc_ff3442 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
+class Enc_eafd18 : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
}
-class Enc_53dca9 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{11-8} = Ii{5-2};
+class Enc_14d27a : OpcodeHexagon {
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_27fd0e : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_d7bc34 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vyyyy32;
- let Inst{4-0} = Vyyyy32{4-0};
-}
-class Enc_93af4c : OpcodeHexagon {
- bits <7> Ii;
- let Inst{10-4} = Ii{6-0};
- bits <4> Rx16;
- let Inst{3-0} = Rx16{3-0};
-}
-class Enc_621fba : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Gd32;
- let Inst{4-0} = Gd32{4-0};
+ let Inst{19-16} = Rs16{3-0};
}
-class Enc_5bdd42 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-5} = Ii{6-3};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_e90a15 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{22-22} = n1{0-0};
}
-class Enc_ad9bef : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
+class Enc_5a18b3 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{22-22} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
}
-class Enc_71f1b4 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_1de724 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-22} = n1{2-0};
}
class Enc_14640c : OpcodeHexagon {
bits <11> Ii;
@@ -572,165 +718,215 @@ class Enc_14640c : OpcodeHexagon {
let Inst{24-22} = n1{3-1};
let Inst{13-13} = n1{0-0};
}
-class Enc_31db33 : OpcodeHexagon {
- bits <2> Qt4;
- let Inst{6-5} = Qt4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+class Enc_668704 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-22} = n1{3-0};
}
-class Enc_65f095 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_800e04 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{13-13} = n1{0-0};
}
-class Enc_784502 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_4aca3a : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <3> n1;
+ let Inst{29-29} = n1{2-2};
+ let Inst{26-25} = n1{1-0};
}
-class Enc_6413b6 : OpcodeHexagon {
+class Enc_f7ea77 : OpcodeHexagon {
bits <11> Ii;
let Inst{21-20} = Ii{10-9};
let Inst{7-1} = Ii{8-2};
bits <3> Ns8;
let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{23-23} = n1{1-1};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
let Inst{13-13} = n1{0-0};
}
-class Enc_7a0ea6 : OpcodeHexagon {
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
- bits <1> n1;
- let Inst{9-9} = n1{0-0};
+class Enc_405228 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <3> n1;
+ let Inst{28-28} = n1{2-2};
+ let Inst{24-23} = n1{1-0};
}
-class Enc_84bff1 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_3a2484 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
}
-class Enc_f4413a : OpcodeHexagon {
- bits <4> Ii;
- let Inst{8-5} = Ii{3-0};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_736575 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{25-23} = n1{2-0};
}
-class Enc_78e566 : OpcodeHexagon {
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_8e583a : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{13-13} = n1{0-0};
}
-class Enc_437f33 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
+class Enc_3694bd : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-22} = n1{1-0};
}
-class Enc_0527db : OpcodeHexagon {
+class Enc_a6853f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <6> n1;
+ let Inst{29-29} = n1{5-5};
+ let Inst{26-25} = n1{4-3};
+ let Inst{23-22} = n1{2-1};
+ let Inst{13-13} = n1{0-0};
+}
+class Enc_a42857 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rx16;
- let Inst{3-0} = Rx16{3-0};
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-22} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
}
-class Enc_420cf3 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
+class Enc_f6fe0b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{24-22} = n1{4-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
}
-class Enc_e39bb2 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{9-4} = Ii{5-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
+class Enc_3e3989 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <6> n1;
+ let Inst{28-28} = n1{5-5};
+ let Inst{25-22} = n1{4-1};
+ let Inst{8-8} = n1{0-0};
}
-class Enc_1b64fb : OpcodeHexagon {
- bits <16> Ii;
- let Inst{26-25} = Ii{15-14};
- let Inst{20-16} = Ii{13-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+class Enc_b909d2 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <7> n1;
+ let Inst{28-28} = n1{6-6};
+ let Inst{25-22} = n1{5-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
}
-class Enc_c1d806 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <2> Qe4;
- let Inst{6-5} = Qe4{1-0};
+class Enc_f82302 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <4> n1;
+ let Inst{29-29} = n1{3-3};
+ let Inst{26-25} = n1{2-1};
+ let Inst{23-23} = n1{0-0};
}
-class Enc_c6220b : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <3> Nt8;
- let Inst{2-0} = Nt8{2-0};
+class Enc_6413b6 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+ bits <5> n1;
+ let Inst{29-29} = n1{4-4};
+ let Inst{26-25} = n1{3-2};
+ let Inst{23-23} = n1{1-1};
+ let Inst{13-13} = n1{0-0};
}
-class Enc_322e1b : OpcodeHexagon {
- bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
- bits <6> II;
- let Inst{23-23} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
+class Enc_b78edd : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> n1;
+ let Inst{28-28} = n1{3-3};
+ let Inst{24-23} = n1{2-1};
+ let Inst{8-8} = n1{0-0};
}
-class Enc_989021 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vy32;
- let Inst{12-8} = Vy32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+class Enc_041d7b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{24-23} = n1{3-2};
+ let Inst{13-13} = n1{1-1};
+ let Inst{8-8} = n1{0-0};
+}
+class Enc_b1e1fb : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <5> n1;
+ let Inst{28-28} = n1{4-4};
+ let Inst{25-23} = n1{3-1};
+ let Inst{8-8} = n1{0-0};
}
class Enc_178717 : OpcodeHexagon {
bits <11> Ii;
@@ -744,62 +940,63 @@ class Enc_178717 : OpcodeHexagon {
let Inst{13-13} = n1{1-1};
let Inst{8-8} = n1{0-0};
}
-class Enc_78cbf0 : OpcodeHexagon {
- bits <18> Ii;
- let Inst{26-25} = Ii{17-16};
- let Inst{20-16} = Ii{15-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_052c7d : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
+class Enc_5de85f : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
}
-class Enc_fcf7a7 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+class Enc_9e4c3f : OpcodeHexagon {
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rd16;
+ let Inst{19-16} = Rd16{3-0};
}
-class Enc_55355c : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <5> Rtt32;
- let Inst{4-0} = Rtt32{4-0};
+class Enc_66bce1 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{11-8} = Rd16{3-0};
}
-class Enc_211aaa : OpcodeHexagon {
+class Enc_69d63b : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <3> Ns8;
+ let Inst{18-16} = Ns8{2-0};
+}
+class Enc_ad1c74 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{21-20} = Ii{10-9};
+ let Inst{7-1} = Ii{8-2};
+ bits <4> Rs16;
+ let Inst{19-16} = Rs16{3-0};
+}
+class Enc_a27588 : OpcodeHexagon {
bits <11> Ii;
let Inst{26-25} = Ii{10-9};
let Inst{13-5} = Ii{8-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
}
-class Enc_6185fe : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_1f5d8f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
class Enc_74aef2 : OpcodeHexagon {
bits <4> Ii;
@@ -811,127 +1008,123 @@ class Enc_74aef2 : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_cd4705 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+class Enc_6b197f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_2ebe3b : OpcodeHexagon {
+class Enc_5cd7e9 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+}
+class Enc_9e2e1c : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_3d5b28 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_bd1cbc : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_5ab2be : OpcodeHexagon {
+class Enc_de0214 : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-5} = Ii{9-1};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_fef969 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{20-16} = Ii{5-1};
- let Inst{5-5} = Ii{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+class Enc_74d4e5 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_63eaeb : OpcodeHexagon {
- bits <2> Ii;
- let Inst{1-0} = Ii{1-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
+class Enc_e83554 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_95441f : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
+class Enc_152467 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_372c9d : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
+class Enc_2d7491 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_7eee72 : OpcodeHexagon {
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_4dff07 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{12-11} = Qv4{1-0};
+class Enc_70b24b : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_04c959 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
-}
-class Enc_b62ef7 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+class Enc_71f1b4 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_2b518f : OpcodeHexagon {
- bits <32> Ii;
- let Inst{27-16} = Ii{31-20};
- let Inst{13-0} = Ii{19-6};
-}
-class Enc_b388cf : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> II;
- let Inst{22-21} = II{4-3};
- let Inst{7-5} = II{2-0};
+class Enc_211aaa : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-5} = Ii{8-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_ad1c74 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
-}
-class Enc_74d4e5 : OpcodeHexagon {
+class Enc_e0a47a : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
bits <5> Rd32;
@@ -939,14 +1132,6 @@ class Enc_74d4e5 : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_c90aca : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
class Enc_222336 : OpcodeHexagon {
bits <4> Ii;
let Inst{8-5} = Ii{3-0};
@@ -955,338 +1140,162 @@ class Enc_222336 : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_5e87ce : OpcodeHexagon {
+class Enc_25bef0 : OpcodeHexagon {
bits <16> Ii;
- let Inst{23-22} = Ii{15-14};
+ let Inst{26-25} = Ii{15-14};
let Inst{20-16} = Ii{13-9};
let Inst{13-5} = Ii{8-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_158beb : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{4-0} = Vv32{4-0};
-}
-class Enc_f7ea77 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{13-13} = n1{0-0};
-}
-class Enc_245865 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_88d4d9 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{9-8} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_226535 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
+class Enc_fa3ba4 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-5} = Ii{11-3};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_31aa6a : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_b05839 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_397f23 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{13-13} = Ii{7-7};
- let Inst{7-3} = Ii{6-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_865390 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+class Enc_5bdd42 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-5} = Ii{6-3};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_98c0b8 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+class Enc_509701 : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-5} = Ii{11-3};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_bfbf03 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{9-8} = Qs4{1-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
+class Enc_8df4be : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-5} = Ii{9-1};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_ecbcc8 : OpcodeHexagon {
+class Enc_2a3787 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-5} = Ii{10-2};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
-}
-class Enc_f5e933 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_3fc427 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_01d3d0 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_b0e9d8 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_27fd0e : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{8-5} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_1bd127 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vdddd32;
- let Inst{4-0} = Vdddd32{4-0};
-}
-class Enc_3694bd : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{23-22} = n1{1-0};
-}
-class Enc_a42857 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{24-22} = n1{3-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_b7fad3 : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{9-8} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_223005 : OpcodeHexagon {
+class Enc_3d920a : OpcodeHexagon {
bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+ let Inst{8-5} = Ii{5-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_9e4c3f : OpcodeHexagon {
- bits <6> II;
- let Inst{13-8} = II{5-0};
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rd16;
- let Inst{19-16} = Rd16{3-0};
+class Enc_4f4ed7 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-5} = Ii{10-2};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_8b8d61 : OpcodeHexagon {
+class Enc_a21d47 : OpcodeHexagon {
bits <6> Ii;
- let Inst{22-21} = Ii{5-4};
- let Inst{13-13} = Ii{3-3};
- let Inst{7-5} = Ii{2-0};
+ let Inst{10-5} = Ii{5-0};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
-}
-class Enc_88c16c : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_770858 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{6-5} = Ps4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+class Enc_f4413a : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{8-5} = Ii{3-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_bd811a : OpcodeHexagon {
+class Enc_acd6ed : OpcodeHexagon {
+ bits <9> Ii;
+ let Inst{10-5} = Ii{8-3};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Cd32;
- let Inst{4-0} = Cd32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_b05839 : OpcodeHexagon {
+class Enc_9d1247 : OpcodeHexagon {
bits <7> Ii;
let Inst{8-5} = Ii{6-3};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_bc03e5 : OpcodeHexagon {
- bits <17> Ii;
- let Inst{26-25} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_412ff0 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
- bits <5> Rxx32;
- let Inst{12-8} = Rxx32{4-0};
-}
-class Enc_ef601b : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
-}
-class Enc_c9a18e : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_be32a5 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_e6abcf : OpcodeHexagon {
+class Enc_a198f6 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-5} = Ii{6-1};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
-}
-class Enc_d6990d : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_6c9440 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
+class Enc_733b27 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{8-5} = Ii{4-1};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_0d8adb : OpcodeHexagon {
+class Enc_f82eaf : OpcodeHexagon {
bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_50e578 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+ let Inst{10-5} = Ii{7-2};
+ bits <2> Pt4;
+ let Inst{12-11} = Pt4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_1cf4ca : OpcodeHexagon {
- bits <6> Ii;
- let Inst{17-16} = Ii{5-4};
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_48b75f : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
class Enc_b97f71 : OpcodeHexagon {
bits <6> Ii;
let Inst{8-5} = Ii{5-2};
@@ -1297,379 +1306,267 @@ class Enc_b97f71 : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_9d1247 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-5} = Ii{6-3};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_7b7ba8 : OpcodeHexagon {
- bits <2> Qu4;
- let Inst{9-8} = Qu4{1-0};
+class Enc_d44e31 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{4-0} = Rt32{4-0};
}
-class Enc_f7430e : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
+class Enc_163a3c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
-}
-class Enc_e7581c : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_2301d6 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{20-16} = Ii{5-1};
- let Inst{8-8} = Ii{0-0};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ let Inst{4-0} = Rt32{4-0};
}
-class Enc_c31910 : OpcodeHexagon {
+class Enc_226535 : OpcodeHexagon {
bits <8> Ii;
- let Inst{23-21} = Ii{7-5};
- let Inst{13-13} = Ii{4-4};
- let Inst{7-5} = Ii{3-1};
- let Inst{3-3} = Ii{0-0};
- bits <5> II;
- let Inst{12-8} = II{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_2f2f04 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{5-5} = Ii{0-0};
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_8d8a30 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
+ let Inst{12-7} = Ii{7-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{4-0} = Rt32{4-0};
}
-class Enc_2d7491 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-5} = Ii{10-2};
+class Enc_46c951 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
}
-class Enc_a803e0 : OpcodeHexagon {
+class Enc_e66a97 : OpcodeHexagon {
bits <7> Ii;
let Inst{12-7} = Ii{6-1};
- bits <8> II;
- let Inst{13-13} = II{7-7};
- let Inst{6-0} = II{6-0};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
}
-class Enc_45364e : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+class Enc_84b2cd : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <5> II;
+ let Inst{4-0} = II{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
}
-class Enc_b909d2 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <7> n1;
- let Inst{28-28} = n1{6-6};
- let Inst{25-22} = n1{5-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+class Enc_f394d3 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
}
-class Enc_e6c957 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_04c959 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Ryy32;
+ let Inst{4-0} = Ryy32{4-0};
}
-class Enc_0d8870 : OpcodeHexagon {
- bits <12> Ii;
- let Inst{26-25} = Ii{11-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_323f2d : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
}
-class Enc_9fae8a : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_4f677b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_18c338 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <8> II;
- let Inst{22-16} = II{7-1};
- let Inst{13-13} = II{0-0};
+class Enc_7fa7f6 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
}
-class Enc_5ccba9 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
+class Enc_6185fe : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
bits <6> II;
- let Inst{13-13} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_0ed752 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Cdd32;
- let Inst{4-0} = Cdd32{4-0};
+ let Inst{11-8} = II{5-2};
+ let Inst{6-5} = II{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_143445 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
+class Enc_da664b : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_3a3d62 : OpcodeHexagon {
+class Enc_84bff1 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_3e3989 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-22} = n1{4-1};
- let Inst{8-8} = n1{0-0};
+class Enc_2301d6 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Pt4;
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_152467 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
+class Enc_2e1979 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
}
-class Enc_9ac432 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
+class Enc_2a7b91 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{8-8} = Ii{0-0};
bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <2> Pu4;
- let Inst{7-6} = Pu4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_a90628 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+ let Inst{10-9} = Pt4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_f37377 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
- bits <8> II;
- let Inst{13-13} = II{7-7};
- let Inst{6-0} = II{6-0};
+class Enc_98c0b8 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_a198f6 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{10-5} = Ii{6-1};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
+class Enc_b7fad3 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{9-8} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_4e4a80 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
+class Enc_a75aa6 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
+ let Inst{12-8} = Rt32{4-0};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Vvv32;
- let Inst{4-0} = Vvv32{4-0};
-}
-class Enc_3dac0b : OpcodeHexagon {
- bits <2> Qt4;
- let Inst{6-5} = Qt4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
}
-class Enc_e38e1f : OpcodeHexagon {
+class Enc_c90aca : OpcodeHexagon {
bits <8> Ii;
let Inst{12-5} = Ii{7-0};
- bits <2> Pu4;
- let Inst{22-21} = Pu4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_f8ecf9 : OpcodeHexagon {
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Vvv32;
- let Inst{20-16} = Vvv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_7f1a05 : OpcodeHexagon {
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
+class Enc_61f0b0 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Ry32;
- let Inst{12-8} = Ry32{4-0};
-}
-class Enc_2df31d : OpcodeHexagon {
- bits <8> Ii;
- let Inst{9-4} = Ii{7-2};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_25bef0 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{26-25} = Ii{15-14};
- let Inst{20-16} = Ii{13-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_f82302 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <4> n1;
- let Inst{29-29} = n1{3-3};
- let Inst{26-25} = n1{2-1};
- let Inst{23-23} = n1{0-0};
-}
-class Enc_44271f : OpcodeHexagon {
- bits <5> Gs32;
- let Inst{20-16} = Gs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
}
-class Enc_83ee64 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
+class Enc_a568d4 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_adf111 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+class Enc_3d5b28 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <2> Qx4;
- let Inst{1-0} = Qx4{1-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_46c951 : OpcodeHexagon {
+class Enc_322e1b : OpcodeHexagon {
bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <5> II;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{23-23} = II{5-5};
let Inst{4-0} = II{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
}
-class Enc_5d6c34 : OpcodeHexagon {
+class Enc_420cf3 : OpcodeHexagon {
bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
}
-class Enc_4df4e9 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
+class Enc_277737 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{22-21} = Ii{7-6};
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-5} = Ii{4-2};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_263841 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
-}
-class Enc_91b9fe : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
}
class Enc_a7b8e8 : OpcodeHexagon {
bits <6> Ii;
@@ -1683,175 +1580,151 @@ class Enc_a7b8e8 : OpcodeHexagon {
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_2b3f60 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_7f1a05 : OpcodeHexagon {
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ry32;
+ let Inst{12-8} = Ry32{4-0};
+}
+class Enc_1b64fb : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_ad1831 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{26-25} = Ii{15-14};
+ let Inst{20-16} = Ii{13-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_5c124a : OpcodeHexagon {
+ bits <19> Ii;
+ let Inst{26-25} = Ii{18-17};
+ let Inst{20-16} = Ii{16-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <2> Px4;
- let Inst{6-5} = Px4{1-0};
}
-class Enc_bd1cbc : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
-}
-class Enc_c85e2a : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> II;
- let Inst{22-21} = II{4-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_fda92c : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
}
-class Enc_a30110 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
- bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+class Enc_bc03e5 : OpcodeHexagon {
+ bits <17> Ii;
+ let Inst{26-25} = Ii{16-15};
+ let Inst{20-16} = Ii{14-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_33f8ba : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-8} = Ii{7-3};
- let Inst{4-2} = Ii{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_541f26 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
}
-class Enc_690862 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
+class Enc_78cbf0 : OpcodeHexagon {
+ bits <18> Ii;
+ let Inst{26-25} = Ii{17-16};
+ let Inst{20-16} = Ii{15-11};
let Inst{13-13} = Ii{10-10};
let Inst{7-0} = Ii{9-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
bits <3> Nt8;
let Inst{10-8} = Nt8{2-0};
}
-class Enc_2a3787 : OpcodeHexagon {
- bits <13> Ii;
- let Inst{26-25} = Ii{12-11};
- let Inst{13-5} = Ii{10-2};
+class Enc_47ef61 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_d5c73f : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+class Enc_22c845 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_3f97c8 : OpcodeHexagon {
+class Enc_70fb07 : OpcodeHexagon {
bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_28a2dc : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_d50cd3 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
+class Enc_12b6e9 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_729ff7 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_1aa186 : OpcodeHexagon {
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_217147 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
}
-class Enc_b9c5fb : OpcodeHexagon {
+class Enc_8dec2e : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_f394d3 : OpcodeHexagon {
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_0cb018 : OpcodeHexagon {
- bits <5> Cs32;
- let Inst{20-16} = Cs32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_541f26 : OpcodeHexagon {
- bits <18> Ii;
- let Inst{26-25} = Ii{17-16};
- let Inst{20-16} = Ii{15-11};
- let Inst{13-13} = Ii{10-10};
- let Inst{7-0} = Ii{9-2};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
-}
-class Enc_724154 : OpcodeHexagon {
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
-}
-class Enc_179b35 : OpcodeHexagon {
+class Enc_b388cf : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_585242 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-13} = Ii{5-5};
- let Inst{7-3} = Ii{4-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_e07374 : OpcodeHexagon {
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_cf1927 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
class Enc_b84c4c : OpcodeHexagon {
bits <6> Ii;
@@ -1864,121 +1737,87 @@ class Enc_b84c4c : OpcodeHexagon {
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_8203bb : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <8> II;
- let Inst{13-13} = II{7-7};
- let Inst{6-0} = II{6-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
-}
-class Enc_e66a97 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{12-7} = Ii{6-1};
+class Enc_a1e29d : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
bits <5> II;
- let Inst{4-0} = II{4-0};
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
-}
-class Enc_8c2412 : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{6-5} = Ps4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
-}
-class Enc_284ebb : OpcodeHexagon {
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_733b27 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_22c845 : OpcodeHexagon {
- bits <14> Ii;
- let Inst{10-0} = Ii{13-3};
+class Enc_179b35 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_ed5027 : OpcodeHexagon {
+class Enc_143a3c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <6> II;
+ let Inst{23-21} = II{5-3};
+ let Inst{7-5} = II{2-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
- bits <5> Gdd32;
- let Inst{4-0} = Gdd32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
}
-class Enc_9b0bc1 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_c85e2a : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{12-8} = Ii{4-0};
+ bits <5> II;
+ let Inst{22-21} = II{4-3};
+ let Inst{7-5} = II{2-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_ea4c54 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
+class Enc_da8d43 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
}
-class Enc_b72622 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{5-5} = Ii{0-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_cc449f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_569cfe : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+class Enc_585242 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-13} = Ii{5-5};
+ let Inst{7-3} = Ii{4-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_96ce4f : OpcodeHexagon {
+class Enc_52a5dd : OpcodeHexagon {
bits <4> Ii;
let Inst{6-3} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <3> Nt8;
let Inst{10-8} = Nt8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_143a3c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <6> II;
- let Inst{23-21} = II{5-3};
- let Inst{7-5} = II{2-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
-}
class Enc_57a33e : OpcodeHexagon {
bits <9> Ii;
let Inst{13-13} = Ii{8-8};
@@ -1990,206 +1829,145 @@ class Enc_57a33e : OpcodeHexagon {
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
}
-class Enc_311abd : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
+class Enc_9a33d5 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_e8c45e : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
}
-class Enc_a1640c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_b886fd : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_de0214 : OpcodeHexagon {
- bits <12> Ii;
- let Inst{26-25} = Ii{11-10};
- let Inst{13-5} = Ii{9-1};
+class Enc_f44229 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{13-13} = Ii{6-6};
+ let Inst{7-3} = Ii{5-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_daea09 : OpcodeHexagon {
- bits <17> Ii;
- let Inst{23-22} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-1} = Ii{8-2};
- bits <2> Pu4;
- let Inst{9-8} = Pu4{1-0};
+class Enc_31aa6a : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_fda92c : OpcodeHexagon {
- bits <17> Ii;
- let Inst{26-25} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
+class Enc_397f23 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
}
-class Enc_831a7d : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
- bits <2> Pe4;
- let Inst{6-5} = Pe4{1-0};
-}
-class Enc_11a146 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_b15941 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+class Enc_7eaeb6 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_b78edd : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-23} = n1{2-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_a27588 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-5} = Ii{8-0};
+class Enc_8dbdfe : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{13-13} = Ii{7-7};
+ let Inst{7-3} = Ii{6-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_2a7b91 : OpcodeHexagon {
+class Enc_65f095 : OpcodeHexagon {
bits <6> Ii;
- let Inst{20-16} = Ii{5-1};
- let Inst{8-8} = Ii{0-0};
- bits <2> Pt4;
- let Inst{10-9} = Pt4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_b43b67 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
- bits <2> Qx4;
- let Inst{6-5} = Qx4{1-0};
+ let Inst{6-3} = Ii{5-2};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_4aca3a : OpcodeHexagon {
+class Enc_448f7f : OpcodeHexagon {
bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <3> n1;
- let Inst{29-29} = n1{2-2};
- let Inst{26-25} = n1{1-0};
-}
-class Enc_b38ffc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
-}
-class Enc_cda00a : OpcodeHexagon {
- bits <12> Ii;
- let Inst{19-16} = Ii{11-8};
- let Inst{12-5} = Ii{7-0};
- bits <2> Pu4;
- let Inst{22-21} = Pu4{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_2fbf3c : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
}
-class Enc_70b24b : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
+class Enc_d5c73f : OpcodeHexagon {
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_2ae154 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_b15941 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
-}
-class Enc_50b5ac : OpcodeHexagon {
- bits <6> Ii;
- let Inst{17-16} = Ii{5-4};
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_2ea740 : OpcodeHexagon {
+class Enc_10bc21 : OpcodeHexagon {
bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
- bits <2> Qv4;
- let Inst{12-11} = Qv4{1-0};
+ let Inst{6-3} = Ii{3-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_08d755 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
+class Enc_4df4e9 : OpcodeHexagon {
+ bits <11> Ii;
+ let Inst{26-25} = Ii{10-9};
+ let Inst{13-13} = Ii{8-8};
+ let Inst{7-0} = Ii{7-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_1178da : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
class Enc_8dbe85 : OpcodeHexagon {
bits <1> Mu2;
@@ -2199,275 +1977,299 @@ class Enc_8dbe85 : OpcodeHexagon {
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_5a18b3 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <5> n1;
- let Inst{29-29} = n1{4-4};
- let Inst{26-25} = n1{3-2};
- let Inst{22-22} = n1{1-1};
- let Inst{13-13} = n1{0-0};
+class Enc_96ce4f : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_14d27a : OpcodeHexagon {
- bits <5> II;
- let Inst{12-8} = II{4-0};
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
+class Enc_c7cd90 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{6-3} = Ii{3-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_a05677 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
+class Enc_ce6828 : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{26-25} = Ii{13-12};
+ let Inst{13-13} = Ii{11-11};
+ let Inst{7-0} = Ii{10-3};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_f0cca7 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <6> II;
- let Inst{20-16} = II{5-1};
- let Inst{13-13} = II{0-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_500cb0 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
-}
-class Enc_7e5a82 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_12b6e9 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_6f70ca : OpcodeHexagon {
- bits <8> Ii;
- let Inst{8-4} = Ii{7-3};
-}
-class Enc_7222b7 : OpcodeHexagon {
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
}
-class Enc_e3b0c4 : OpcodeHexagon {
-
+class Enc_928ca1 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_a255dc : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+class Enc_395cc4 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_cb785b : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+class Enc_85bf58 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{6-3} = Ii{6-3};
bits <5> Rtt32;
- let Inst{20-16} = Rtt32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_cb4b4e : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
+class Enc_e957fb : OpcodeHexagon {
+ bits <12> Ii;
+ let Inst{26-25} = Ii{11-10};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
}
-class Enc_1f5d8f : OpcodeHexagon {
+class Enc_935d9b : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_9cdba7 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_052c7d : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_5cd7e9 : OpcodeHexagon {
+class Enc_0d8870 : OpcodeHexagon {
bits <12> Ii;
let Inst{26-25} = Ii{11-10};
- let Inst{13-5} = Ii{9-1};
+ let Inst{13-13} = Ii{9-9};
+ let Inst{7-0} = Ii{8-1};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
-}
-class Enc_454a26 : OpcodeHexagon {
- bits <2> Pt4;
- let Inst{9-8} = Pt4{1-0};
- bits <2> Ps4;
- let Inst{17-16} = Ps4{1-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_a6853f : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
- bits <6> n1;
- let Inst{29-29} = n1{5-5};
- let Inst{26-25} = n1{4-3};
- let Inst{23-22} = n1{2-1};
- let Inst{13-13} = n1{0-0};
+class Enc_91b9fe : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_c175d0 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{11-8} = Ii{3-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
+class Enc_e26546 : OpcodeHexagon {
+ bits <5> Ii;
+ let Inst{6-3} = Ii{4-1};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_16c48b : OpcodeHexagon {
+class Enc_143445 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_79b8c8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Vv32;
- let Inst{12-8} = Vv32{4-0};
- bits <5> Vw32;
- let Inst{4-0} = Vw32{4-0};
-}
-class Enc_895bd9 : OpcodeHexagon {
- bits <2> Qu4;
- let Inst{9-8} = Qu4{1-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
-}
-class Enc_ea23e4 : OpcodeHexagon {
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
-}
-class Enc_4dc228 : OpcodeHexagon {
- bits <9> Ii;
- let Inst{12-8} = Ii{8-4};
- let Inst{4-3} = Ii{3-2};
- bits <10> II;
- let Inst{20-16} = II{9-5};
- let Inst{7-5} = II{4-2};
- let Inst{1-0} = II{1-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_10bc21 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
+class Enc_db40cd : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_1aaec1 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
+class Enc_690862 : OpcodeHexagon {
+ bits <13> Ii;
+ let Inst{26-25} = Ii{12-11};
+ let Inst{13-13} = Ii{10-10};
+ let Inst{7-0} = Ii{9-2};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_3f97c8 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_329361 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_223005 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{6-3} = Ii{5-2};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_cd82bc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{21-21} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <6> II;
+ let Inst{13-8} = II{5-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
+}
+class Enc_729ff7 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
}
-class Enc_d2c7f1 : OpcodeHexagon {
+class Enc_8c6530 : OpcodeHexagon {
bits <5> Rtt32;
let Inst{12-8} = Rtt32{4-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
bits <5> Rdd32;
let Inst{4-0} = Rdd32{4-0};
- bits <2> Pe4;
- let Inst{6-5} = Pe4{1-0};
}
-class Enc_3680c2 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{11-5} = Ii{6-0};
+class Enc_d50cd3 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
bits <5> Rss32;
let Inst{20-16} = Rss32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_1ef990 : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+class Enc_dbd70c : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pu4;
+ let Inst{6-5} = Pu4{1-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_8b8d61 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{22-21} = Ii{5-4};
+ let Inst{13-13} = Ii{3-3};
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rd32;
+ let Inst{12-8} = Rd32{4-0};
+}
+class Enc_c31910 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{23-21} = Ii{7-5};
+ let Inst{13-13} = Ii{4-4};
+ let Inst{7-5} = Ii{3-1};
+ let Inst{3-3} = Ii{0-0};
+ bits <5> II;
+ let Inst{12-8} = II{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_e957fb : OpcodeHexagon {
- bits <12> Ii;
- let Inst{26-25} = Ii{11-10};
- let Inst{13-13} = Ii{9-9};
- let Inst{7-0} = Ii{8-1};
+class Enc_9fae8a : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_a1640c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{13-8} = Ii{5-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_fef969 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{20-16} = Ii{5-1};
+ let Inst{5-5} = Ii{0-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
}
-class Enc_c0cdde : OpcodeHexagon {
- bits <9> Ii;
+class Enc_b0e9d8 : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
let Inst{13-5} = Ii{8-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ bits <5> Rx32;
+ let Inst{4-0} = Rx32{4-0};
}
-class Enc_c9e3bc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
+class Enc_b4e6cf : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-5} = Ii{8-0};
+ bits <5> Ru32;
+ let Inst{4-0} = Ru32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_1cf4ca : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
+ let Inst{12-8} = Rt32{4-0};
}
-class Enc_2e1979 : OpcodeHexagon {
+class Enc_6339d5 : OpcodeHexagon {
bits <2> Ii;
let Inst{13-13} = Ii{1-1};
let Inst{7-7} = Ii{0-0};
@@ -2475,28 +2277,21 @@ class Enc_2e1979 : OpcodeHexagon {
let Inst{6-5} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_0b2e5b : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{4-0} = Rt32{4-0};
}
-class Enc_6f83e7 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+class Enc_44215c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
}
-class Enc_6339d5 : OpcodeHexagon {
+class Enc_47ee5e : OpcodeHexagon {
bits <2> Ii;
let Inst{13-13} = Ii{1-1};
let Inst{7-7} = Ii{0-0};
@@ -2506,108 +2301,141 @@ class Enc_6339d5 : OpcodeHexagon {
let Inst{20-16} = Rs32{4-0};
bits <5> Ru32;
let Inst{12-8} = Ru32{4-0};
- bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ bits <3> Nt8;
+ let Inst{2-0} = Nt8{2-0};
}
-class Enc_d483b9 : OpcodeHexagon {
- bits <1> Ii;
- let Inst{5-5} = Ii{0-0};
- bits <5> Vuu32;
- let Inst{12-8} = Vuu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
+class Enc_50b5ac : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{17-16} = Ii{5-4};
+ let Inst{6-3} = Ii{3-0};
+ bits <2> Pv4;
+ let Inst{1-0} = Pv4{1-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
}
-class Enc_51635c : OpcodeHexagon {
- bits <7> Ii;
- let Inst{8-4} = Ii{6-2};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
+class Enc_1a9974 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
}
-class Enc_e26546 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_d7dc10 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+ bits <2> Pd4;
+ let Inst{1-0} = Pd4{1-0};
}
-class Enc_70fb07 : OpcodeHexagon {
+class Enc_8203bb : OpcodeHexagon {
bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
+ let Inst{12-7} = Ii{5-0};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
}
-class Enc_6c9ee0 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_d7a65e : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{12-7} = Ii{5-0};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
}
-class Enc_fa3ba4 : OpcodeHexagon {
- bits <14> Ii;
- let Inst{26-25} = Ii{13-12};
- let Inst{13-5} = Ii{11-3};
+class Enc_a803e0 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
}
-class Enc_44661f : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_f20719 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{12-7} = Ii{6-1};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
}
-class Enc_277737 : OpcodeHexagon {
+class Enc_f37377 : OpcodeHexagon {
bits <8> Ii;
- let Inst{22-21} = Ii{7-6};
- let Inst{13-13} = Ii{5-5};
- let Inst{7-5} = Ii{4-2};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
+ let Inst{12-7} = Ii{7-2};
+ bits <8> II;
+ let Inst{13-13} = II{7-7};
+ let Inst{6-0} = II{6-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{12-8} = Rd32{4-0};
}
-class Enc_5c124a : OpcodeHexagon {
- bits <19> Ii;
- let Inst{26-25} = Ii{18-17};
- let Inst{20-16} = Ii{16-12};
- let Inst{13-13} = Ii{11-11};
- let Inst{7-0} = Ii{10-3};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_5ccba9 : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{12-7} = Ii{7-2};
+ bits <6> II;
+ let Inst{13-13} = II{5-5};
+ let Inst{4-0} = II{4-0};
+ bits <2> Pv4;
+ let Inst{6-5} = Pv4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
}
-class Enc_928ca1 : OpcodeHexagon {
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_8bcba4 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
}
-class Enc_da664b : OpcodeHexagon {
+class Enc_eca7c8 : OpcodeHexagon {
bits <2> Ii;
let Inst{13-13} = Ii{1-1};
let Inst{7-7} = Ii{0-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rt32;
+ let Inst{4-0} = Rt32{4-0};
+}
+class Enc_9ea4cf : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
bits <5> Rt32;
let Inst{12-8} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
}
-class Enc_47ee5e : OpcodeHexagon {
+class Enc_724154 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+ bits <5> Re32;
+ let Inst{20-16} = Re32{4-0};
+}
+class Enc_c6220b : OpcodeHexagon {
bits <2> Ii;
let Inst{13-13} = Ii{1-1};
let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
bits <5> Ru32;
@@ -2615,776 +2443,1334 @@ class Enc_47ee5e : OpcodeHexagon {
bits <3> Nt8;
let Inst{2-0} = Nt8{2-0};
}
-class Enc_8bcba4 : OpcodeHexagon {
+class Enc_7eb485 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
bits <6> II;
let Inst{5-0} = II{5-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <3> Nt8;
+ let Inst{10-8} = Nt8{2-0};
+}
+class Enc_c7a204 : OpcodeHexagon {
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
bits <5> Re32;
let Inst{20-16} = Re32{4-0};
}
-class Enc_3a2484 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> n1;
- let Inst{28-28} = n1{3-3};
- let Inst{24-23} = n1{2-1};
- let Inst{13-13} = n1{0-0};
+class Enc_55355c : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{7-7} = Ii{0-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Ru32;
+ let Inst{12-8} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{4-0} = Rtt32{4-0};
}
-class Enc_a5ed8a : OpcodeHexagon {
+class Enc_f79415 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{6-6} = Ii{0-0};
+ bits <6> II;
+ let Inst{5-0} = II{5-0};
+ bits <5> Ru32;
+ let Inst{20-16} = Ru32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_645d54 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_cb9321 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{27-21} = Ii{15-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_b72622 : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{13-13} = Ii{1-1};
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Rxx32;
+ let Inst{4-0} = Rxx32{4-0};
+}
+class Enc_11a146 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_668704 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
+class Enc_93af4c : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{10-4} = Ii{6-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
+}
+class Enc_0527db : OpcodeHexagon {
bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-22} = n1{3-0};
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rx16;
+ let Inst{3-0} = Rx16{3-0};
}
-class Enc_a7341a : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+class Enc_2df31d : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{9-4} = Ii{7-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
}
-class Enc_5eac98 : OpcodeHexagon {
+class Enc_97d666 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_1f5ba6 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_63eaeb : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{1-0} = Ii{1-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_ed48be : OpcodeHexagon {
+ bits <2> Ii;
+ let Inst{6-5} = Ii{1-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_399e12 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_ee5ed0 : OpcodeHexagon {
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <2> n1;
+ let Inst{9-8} = n1{1-0};
+}
+class Enc_e39bb2 : OpcodeHexagon {
bits <6> Ii;
- let Inst{13-8} = Ii{5-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ let Inst{9-4} = Ii{5-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
}
-class Enc_02553a : OpcodeHexagon {
+class Enc_7a0ea6 : OpcodeHexagon {
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+ bits <1> n1;
+ let Inst{9-9} = n1{0-0};
+}
+class Enc_53dca9 : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_c175d0 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_2fbf3c : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_86a14b : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{7-3} = Ii{7-3};
+ bits <3> Rdd8;
+ let Inst{2-0} = Rdd8{2-0};
+}
+class Enc_2bae10 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
+}
+class Enc_51635c : OpcodeHexagon {
bits <7> Ii;
- let Inst{11-5} = Ii{6-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rd16;
+ let Inst{3-0} = Rd16{3-0};
}
-class Enc_acd6ed : OpcodeHexagon {
+class Enc_b38ffc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{11-8} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_f55a0c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{11-8} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
+}
+class Enc_6f70ca : OpcodeHexagon {
+ bits <8> Ii;
+ let Inst{8-4} = Ii{7-3};
+}
+class Enc_84d359 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{3-0} = Ii{3-0};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_b8309d : OpcodeHexagon {
bits <9> Ii;
- let Inst{10-5} = Ii{8-3};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ let Inst{8-3} = Ii{8-3};
+ bits <3> Rtt8;
+ let Inst{2-0} = Rtt8{2-0};
}
-class Enc_8e583a : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
+class Enc_625deb : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{10-8} = Ii{3-1};
bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-23} = n1{3-1};
- let Inst{13-13} = n1{0-0};
+ let Inst{7-4} = Rs16{3-0};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
}
-class Enc_b886fd : OpcodeHexagon {
- bits <5> Ii;
- let Inst{6-3} = Ii{4-1};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_87c142 : OpcodeHexagon {
+ bits <7> Ii;
+ let Inst{8-4} = Ii{6-2};
+ bits <4> Rt16;
+ let Inst{3-0} = Rt16{3-0};
}
-class Enc_24a7dc : OpcodeHexagon {
+class Enc_a6ce9c : OpcodeHexagon {
+ bits <6> Ii;
+ let Inst{3-0} = Ii{5-2};
+ bits <4> Rs16;
+ let Inst{7-4} = Rs16{3-0};
+}
+class Enc_2146c1 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <3> Qss8;
+ let Inst{2-0} = Qss8{2-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_843e80 : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <3> Qxx8;
+ let Inst{2-0} = Qxx8{2-0};
+}
+class Enc_1f3376 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
bits <5> Vv32;
- let Inst{23-19} = Vv32{4-0};
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_8e9fbd : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
bits <3> Rt8;
- let Inst{18-16} = Rt8{2-0};
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+}
+class Enc_57e245 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
+ let Inst{7-3} = Vdd32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
}
-class Enc_2d829e : OpcodeHexagon {
- bits <14> Ii;
- let Inst{10-0} = Ii{13-3};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_274a4c : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
}
-class Enc_4f4ed7 : OpcodeHexagon {
- bits <18> Ii;
- let Inst{26-25} = Ii{17-16};
- let Inst{20-16} = Ii{15-11};
- let Inst{13-5} = Ii{10-2};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_fbacc2 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
}
-class Enc_84b2cd : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-7} = Ii{7-2};
- bits <5> II;
- let Inst{4-0} = II{4-0};
+class Enc_2a736a : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_b8513b : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_b5e54d : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
}
-class Enc_8dbdfe : OpcodeHexagon {
- bits <8> Ii;
- let Inst{13-13} = Ii{7-7};
- let Inst{7-3} = Ii{6-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_50e578 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Rs32;
let Inst{20-16} = Rs32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
-}
-class Enc_90cd8b : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
bits <5> Rd32;
let Inst{4-0} = Rd32{4-0};
}
-class Enc_bd0b33 : OpcodeHexagon {
- bits <10> Ii;
- let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+class Enc_b5b643 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
}
-class Enc_8b8927 : OpcodeHexagon {
+class Enc_2516bf : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_8d04c3 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_2ad23d : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_85daf5 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_e570b0 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_41dcc3 : OpcodeHexagon {
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_3126d7 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
bits <5> Vv32;
- let Inst{4-0} = Vv32{4-0};
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
}
-class Enc_c7cd90 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_1cd70f : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_12dd8f : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_8d5d98 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
+}
+class Enc_fc563d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_c84567 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_334c2b : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_3c46e8 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_129701 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_790d6e : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_880793 : OpcodeHexagon {
+ bits <3> Qt8;
+ let Inst{2-0} = Qt8{2-0};
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_a265b7 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_6b1bc4 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <3> Qt8;
+ let Inst{10-8} = Qt8{2-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_b2ffce : OpcodeHexagon {
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_fde0e3 : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_b3bac4 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_e7c9de : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+}
+class Enc_5c3a80 : OpcodeHexagon {
+ bits <3> Qt8;
+ let Inst{10-8} = Qt8{2-0};
+ bits <3> Qd8;
+ let Inst{5-3} = Qd8{2-0};
+}
+class Enc_8f7cc3 : OpcodeHexagon {
+ bits <3> Qtt8;
+ let Inst{10-8} = Qtt8{2-0};
+ bits <3> Qdd8;
+ let Inst{5-3} = Qdd8{2-0};
+}
+class Enc_f106e0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{8-4} = Vv32{4-0};
+ bits <5> Vt32;
+ let Inst{13-9} = Vt32{4-0};
+ bits <4> Vdd16;
+ let Inst{3-0} = Vdd16{3-0};
+}
+class Enc_7db2f8 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{13-9} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{8-4} = Vv32{4-0};
+ bits <4> Vdd16;
+ let Inst{3-0} = Vdd16{3-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_405228 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <3> n1;
- let Inst{28-28} = n1{2-2};
- let Inst{24-23} = n1{1-0};
+class Enc_37c406 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <4> Vdd16;
+ let Inst{7-4} = Vdd16{3-0};
}
-class Enc_81ac1d : OpcodeHexagon {
- bits <24> Ii;
- let Inst{24-16} = Ii{23-15};
- let Inst{13-1} = Ii{14-2};
+class Enc_72a92d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
}
-class Enc_395cc4 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{6-3} = Ii{6-3};
+class Enc_d7e8ba : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_ce4c54 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+}
+class Enc_3a81ac : OpcodeHexagon {
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_a51a9a : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-8} = Ii{7-3};
- let Inst{4-2} = Ii{2-0};
+class Enc_6c4697 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_d44e31 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{12-7} = Ii{5-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_b0e553 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_5883d0 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
}
-class Enc_f77fbc : OpcodeHexagon {
- bits <4> Ii;
- let Inst{13-13} = Ii{3-3};
- let Inst{10-8} = Ii{2-0};
+class Enc_9a895f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_f3adb6 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_b5d5a7 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
- bits <3> Os8;
- let Inst{2-0} = Os8{2-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
}
-class Enc_d2216a : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_5b76ab : OpcodeHexagon {
+ bits <10> Ii;
+ let Inst{21-21} = Ii{9-9};
+ let Inst{13-8} = Ii{8-3};
+ let Inst{2-0} = Ii{2-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_85bf58 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{6-3} = Ii{6-3};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_17a474 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_71bb9b : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
- bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
+class Enc_9a9d62 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_52a5dd : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_3a527f : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vs32;
+ let Inst{7-3} = Vs32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_5e2823 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_c39a8b : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
}
-class Enc_28a2dc : OpcodeHexagon {
- bits <5> Ii;
- let Inst{12-8} = Ii{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_908985 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
bits <5> Rx32;
- let Inst{4-0} = Rx32{4-0};
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_5138b3 : OpcodeHexagon {
+class Enc_e8ddd5 : OpcodeHexagon {
+ bits <16> Ii;
+ let Inst{21-21} = Ii{15-15};
+ let Inst{13-8} = Ii{14-9};
+ let Inst{2-0} = Ii{8-6};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_6a4549 : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
- bits <5> Vx32;
- let Inst{4-0} = Vx32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
}
-class Enc_84d359 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{3-0} = Ii{3-0};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
+class Enc_932b58 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
}
-class Enc_e07374 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_124cac : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
}
-class Enc_e0820b : OpcodeHexagon {
+class Enc_aceeef : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_2c3281 : OpcodeHexagon {
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
+}
+class Enc_a4ae28 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
bits <5> Vv32;
- let Inst{20-16} = Vv32{4-0};
- bits <2> Qs4;
- let Inst{6-5} = Qs4{1-0};
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Qd8;
+ let Inst{5-3} = Qd8{2-0};
+}
+class Enc_c1652e : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Qd8;
+ let Inst{5-3} = Qd8{2-0};
+}
+class Enc_9aae4a : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+ bits <3> Qd8;
+ let Inst{2-0} = Qd8{2-0};
+}
+class Enc_dcfcbb : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ let Inst{7-3} = Vd32{4-0};
}
-class Enc_323f2d : OpcodeHexagon {
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
+class Enc_a7ca29 : OpcodeHexagon {
+ bits <3> Qt8;
+ let Inst{2-0} = Qt8{2-0};
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{7-3} = Vd32{4-0};
}
-class Enc_1a9974 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
- bits <5> Rtt32;
- let Inst{4-0} = Rtt32{4-0};
+class Enc_dd5f9f : OpcodeHexagon {
+ bits <3> Qtt8;
+ let Inst{2-0} = Qtt8{2-0};
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
}
-class Enc_5de85f : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
+class Enc_7dc746 : OpcodeHexagon {
+ bits <3> Quu8;
+ let Inst{10-8} = Quu8{2-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Qdd8;
+ let Inst{5-3} = Qdd8{2-0};
}
-class Enc_dd766a : OpcodeHexagon {
+class Enc_fa5efc : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_aac08c : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+}
+class Enc_9a8c1f : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
bits <5> Vdd32;
- let Inst{4-0} = Vdd32{4-0};
+ let Inst{7-3} = Vdd32{4-0};
}
-class Enc_0b51ce : OpcodeHexagon {
- bits <3> Ii;
- let Inst{10-8} = Ii{2-0};
- bits <2> Qv4;
- let Inst{12-11} = Qv4{1-0};
- bits <5> Vs32;
- let Inst{4-0} = Vs32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+class Enc_a9eee0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{20-16} = Vu32{4-0};
+ bits <5> Vxx32;
+ let Inst{7-3} = Vxx32{4-0};
}
-class Enc_b4e6cf : OpcodeHexagon {
+class Enc_9ce456 : OpcodeHexagon {
bits <10> Ii;
let Inst{21-21} = Ii{9-9};
- let Inst{13-5} = Ii{8-0};
- bits <5> Ru32;
- let Inst{4-0} = Ru32{4-0};
+ let Inst{13-8} = Ii{8-3};
+ let Inst{2-0} = Ii{2-0};
+ bits <5> Vss32;
+ let Inst{7-3} = Vss32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_44215c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{17-16} = Ii{5-4};
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_96f0fd : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{7-3} = Vx32{4-0};
+ bits <3> Qdd8;
+ let Inst{2-0} = Qdd8{2-0};
}
-class Enc_0aa344 : OpcodeHexagon {
- bits <5> Gss32;
- let Inst{20-16} = Gss32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_a662ae : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <3> Rt8;
+ let Inst{2-0} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{7-3} = Vdd32{4-0};
}
-class Enc_a21d47 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{10-5} = Ii{5-0};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_ec09c9 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{20-16} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <3> Qdd8;
+ let Inst{5-3} = Qdd8{2-0};
}
-class Enc_cc449f : OpcodeHexagon {
- bits <4> Ii;
- let Inst{6-3} = Ii{3-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_400b42 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <3> Qdd8;
+ let Inst{5-3} = Qdd8{2-0};
}
-class Enc_645d54 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{5-5} = Ii{0-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_a5ed8a : OpcodeHexagon {
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
-class Enc_667b39 : OpcodeHexagon {
- bits <5> Css32;
- let Inst{20-16} = Css32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_134437 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qt4;
+ let Inst{23-22} = Qt4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
}
-class Enc_927852 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_bfbf03 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{9-8} = Qs4{1-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
}
-class Enc_163a3c : OpcodeHexagon {
- bits <7> Ii;
- let Inst{12-7} = Ii{6-1};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_7222b7 : OpcodeHexagon {
bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
}
-class Enc_a75aa6 : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_f3f408 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_a255dc : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2ebe3b : OpcodeHexagon {
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_b087ac : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+class Enc_8d8a30 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
bits <5> Vd32;
let Inst{4-0} = Vd32{4-0};
}
-class Enc_691712 : OpcodeHexagon {
+class Enc_58a8bf : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
bits <2> Pv4;
let Inst{12-11} = Pv4{1-0};
- bits <1> Mu2;
- let Inst{13-13} = Mu2{0-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_b1e1fb : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <5> n1;
- let Inst{28-28} = n1{4-4};
- let Inst{25-23} = n1{3-1};
- let Inst{8-8} = n1{0-0};
-}
-class Enc_1f19b5 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{9-5} = Ii{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
-}
-class Enc_b8c967 : OpcodeHexagon {
- bits <8> Ii;
- let Inst{12-5} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_fb6577 : OpcodeHexagon {
- bits <2> Pu4;
- let Inst{9-8} = Pu4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_2bae10 : OpcodeHexagon {
- bits <4> Ii;
- let Inst{10-8} = Ii{3-1};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
-}
-class Enc_c4dc92 : OpcodeHexagon {
- bits <2> Qv4;
- let Inst{23-22} = Qv4{1-0};
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+class Enc_f8c1c4 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
bits <5> Vd32;
let Inst{4-0} = Vd32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_03833b : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_c9e3bc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
}
-class Enc_dbd70c : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+class Enc_27b757 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
}
-class Enc_f6fe0b : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{24-22} = n1{4-2};
- let Inst{13-13} = n1{1-1};
- let Inst{8-8} = n1{0-0};
+class Enc_865390 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_9e2e1c : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
+class Enc_1ef990 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Ryy32;
- let Inst{4-0} = Ryy32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_8df4be : OpcodeHexagon {
- bits <17> Ii;
- let Inst{26-25} = Ii{16-15};
- let Inst{20-16} = Ii{14-10};
- let Inst{13-5} = Ii{9-1};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
-}
-class Enc_66bce1 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{11-8} = Rd16{3-0};
+class Enc_b62ef7 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_b8309d : OpcodeHexagon {
- bits <9> Ii;
- let Inst{8-3} = Ii{8-3};
- bits <3> Rtt8;
- let Inst{2-0} = Rtt8{2-0};
+class Enc_d15d19 : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_5e8512 : OpcodeHexagon {
- bits <5> Vu32;
- let Inst{12-8} = Vu32{4-0};
+class Enc_f77fbc : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
- bits <5> Vxx32;
- let Inst{4-0} = Vxx32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
}
-class Enc_4f677b : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
+class Enc_f7430e : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
}
-class Enc_3d920a : OpcodeHexagon {
- bits <6> Ii;
- let Inst{8-5} = Ii{5-2};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_784502 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_e83554 : OpcodeHexagon {
- bits <5> Ii;
- let Inst{8-5} = Ii{4-1};
+class Enc_372c9d : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_ed48be : OpcodeHexagon {
- bits <2> Ii;
- let Inst{6-5} = Ii{1-0};
- bits <3> Rdd8;
- let Inst{2-0} = Rdd8{2-0};
+class Enc_1aaec1 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_f8c1c4 : OpcodeHexagon {
- bits <2> Pv4;
- let Inst{12-11} = Pv4{1-0};
+class Enc_cf1927 : OpcodeHexagon {
bits <1> Mu2;
let Inst{13-13} = Mu2{0-0};
- bits <5> Vd32;
- let Inst{4-0} = Vd32{4-0};
+ bits <3> Os8;
+ let Inst{2-0} = Os8{2-0};
bits <5> Rx32;
let Inst{20-16} = Rx32{4-0};
}
-class Enc_1aa186 : OpcodeHexagon {
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
+class Enc_2ea740 : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rxx32;
- let Inst{4-0} = Rxx32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
}
-class Enc_134437 : OpcodeHexagon {
- bits <2> Qs4;
- let Inst{9-8} = Qs4{1-0};
- bits <2> Qt4;
- let Inst{23-22} = Qt4{1-0};
- bits <2> Qd4;
- let Inst{1-0} = Qd4{1-0};
+class Enc_0b51ce : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
}
-class Enc_f3f408 : OpcodeHexagon {
+class Enc_4dff07 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{12-11} = Qv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vs32;
+ let Inst{4-0} = Vs32{4-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_ff3442 : OpcodeHexagon {
bits <4> Ii;
let Inst{13-13} = Ii{3-3};
let Inst{10-8} = Ii{2-0};
bits <5> Rt32;
let Inst{20-16} = Rt32{4-0};
+}
+class Enc_6c9ee0 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_44661f : OpcodeHexagon {
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_e7581c : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Vd32;
let Inst{4-0} = Vd32{4-0};
}
-class Enc_97d666 : OpcodeHexagon {
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rd16;
- let Inst{3-0} = Rd16{3-0};
+class Enc_45364e : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
-class Enc_f82eaf : OpcodeHexagon {
- bits <8> Ii;
- let Inst{10-5} = Ii{7-2};
- bits <2> Pt4;
- let Inst{12-11} = Pt4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+class Enc_f8ecf9 : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Vvv32;
+ let Inst{20-16} = Vvv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
}
-class Enc_69d63b : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
+class Enc_a90628 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
}
-class Enc_f79415 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{6-6} = Ii{0-0};
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Ru32;
- let Inst{20-16} = Ru32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_b43b67 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <2> Qx4;
+ let Inst{6-5} = Qx4{1-0};
}
-class Enc_ce6828 : OpcodeHexagon {
- bits <14> Ii;
- let Inst{26-25} = Ii{13-12};
- let Inst{13-13} = Ii{11-11};
- let Inst{7-0} = Ii{10-3};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
+class Enc_c1d806 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+ bits <2> Qe4;
+ let Inst{6-5} = Qe4{1-0};
}
-class Enc_800e04 : OpcodeHexagon {
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <4> Rs16;
- let Inst{19-16} = Rs16{3-0};
- bits <6> n1;
- let Inst{28-28} = n1{5-5};
- let Inst{25-22} = n1{4-1};
- let Inst{13-13} = n1{0-0};
+class Enc_e0820b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
-class Enc_ad1831 : OpcodeHexagon {
- bits <16> Ii;
- let Inst{26-25} = Ii{15-14};
- let Inst{20-16} = Ii{13-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_71bb9b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
}
-class Enc_0fa531 : OpcodeHexagon {
- bits <15> Ii;
- let Inst{21-21} = Ii{14-14};
- let Inst{13-13} = Ii{13-13};
- let Inst{11-1} = Ii{12-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_3fc427 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
}
-class Enc_7eaeb6 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{6-3} = Ii{5-2};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
+class Enc_a30110 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_0b2e5b : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_7b7ba8 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rx32;
- let Inst{20-16} = Rx32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
-class Enc_f55a0c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{11-8} = Ii{5-2};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
- bits <4> Rt16;
- let Inst{3-0} = Rt16{3-0};
+class Enc_895bd9 : OpcodeHexagon {
+ bits <2> Qu4;
+ let Inst{9-8} = Qu4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
}
-class Enc_f20719 : OpcodeHexagon {
- bits <7> Ii;
- let Inst{12-7} = Ii{6-1};
- bits <6> II;
- let Inst{13-13} = II{5-5};
- let Inst{4-0} = II{4-0};
- bits <2> Pv4;
- let Inst{6-5} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_c4dc92 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
-class Enc_eafd18 : OpcodeHexagon {
- bits <5> II;
- let Inst{12-8} = II{4-0};
- bits <11> Ii;
- let Inst{21-20} = Ii{10-9};
- let Inst{7-1} = Ii{8-2};
- bits <3> Ns8;
- let Inst{18-16} = Ns8{2-0};
+class Enc_0f8bab : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qd4;
+ let Inst{1-0} = Qd4{1-0};
}
-class Enc_7b523d : OpcodeHexagon {
+class Enc_adf111 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
+}
+class Enc_b087ac : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_5138b3 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_8c2412 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_770858 : OpcodeHexagon {
+ bits <2> Ps4;
+ let Inst{6-5} = Ps4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_989021 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vy32;
+ let Inst{12-8} = Vy32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_24a7dc : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
bits <5> Vv32;
let Inst{23-19} = Vv32{4-0};
bits <3> Rt8;
let Inst{18-16} = Rt8{2-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_aad80c : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_d6990d : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
bits <5> Vxx32;
let Inst{4-0} = Vxx32{4-0};
}
-class Enc_47ef61 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
+class Enc_0e41fa : OpcodeHexagon {
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Rd32;
- let Inst{4-0} = Rd32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
class Enc_cc857d : OpcodeHexagon {
bits <5> Vuu32;
@@ -3394,85 +3780,225 @@ class Enc_cc857d : OpcodeHexagon {
bits <5> Vx32;
let Inst{4-0} = Vx32{4-0};
}
-class Enc_7fa7f6 : OpcodeHexagon {
- bits <6> II;
- let Inst{11-8} = II{5-2};
- let Inst{6-5} = II{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
- bits <5> Re32;
- let Inst{20-16} = Re32{4-0};
+class Enc_a7341a : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
}
-class Enc_0f8bab : OpcodeHexagon {
+class Enc_95441f : OpcodeHexagon {
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
- bits <5> Rt32;
- let Inst{20-16} = Rt32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
bits <2> Qd4;
let Inst{1-0} = Qd4{1-0};
}
-class Enc_7eb485 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{6-6} = Ii{0-0};
- bits <6> II;
- let Inst{5-0} = II{5-0};
- bits <5> Ru32;
- let Inst{20-16} = Ru32{4-0};
- bits <3> Nt8;
- let Inst{10-8} = Nt8{2-0};
+class Enc_eaa9f8 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <2> Qx4;
+ let Inst{1-0} = Qx4{1-0};
}
-class Enc_864a5a : OpcodeHexagon {
- bits <9> Ii;
- let Inst{12-8} = Ii{8-4};
- let Inst{4-3} = Ii{3-2};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_8b8927 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{4-0} = Vv32{4-0};
}
-class Enc_c2b48e : OpcodeHexagon {
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_158beb : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
- bits <2> Pd4;
- let Inst{1-0} = Pd4{1-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{4-0} = Vv32{4-0};
}
-class Enc_8c6530 : OpcodeHexagon {
+class Enc_28dcbb : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_4e4a80 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{4-0} = Vvv32{4-0};
+}
+class Enc_217147 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_569cfe : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_263841 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Rtt32;
- let Inst{12-8} = Rtt32{4-0};
- bits <5> Rss32;
- let Inst{20-16} = Rss32{4-0};
- bits <2> Pu4;
- let Inst{6-5} = Pu4{1-0};
- bits <5> Rdd32;
- let Inst{4-0} = Rdd32{4-0};
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
}
-class Enc_448f7f : OpcodeHexagon {
- bits <11> Ii;
- let Inst{26-25} = Ii{10-9};
- let Inst{13-13} = Ii{8-8};
- let Inst{7-0} = Ii{7-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_245865 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_cd4705 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_7b523d : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{23-19} = Vv32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_1178da : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_4b39e4 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{7-5} = Ii{2-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_310ba1 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vx32;
+ let Inst{4-0} = Vx32{4-0};
+}
+class Enc_01d3d0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
}
-class Enc_da8d43 : OpcodeHexagon {
- bits <6> Ii;
- let Inst{13-13} = Ii{5-5};
- let Inst{7-3} = Ii{4-0};
- bits <2> Pv4;
- let Inst{1-0} = Pv4{1-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
+class Enc_5e8512 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
bits <5> Rt32;
- let Inst{12-8} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
}
-class Enc_a6ce9c : OpcodeHexagon {
- bits <6> Ii;
- let Inst{3-0} = Ii{5-2};
- bits <4> Rs16;
- let Inst{7-4} = Rs16{3-0};
+class Enc_31db33 : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vv32;
+ let Inst{20-16} = Vv32{4-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_6f83e7 : OpcodeHexagon {
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+ bits <5> Vd32;
+ let Inst{4-0} = Vd32{4-0};
+}
+class Enc_cb785b : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_ad9bef : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Rtt32;
+ let Inst{20-16} = Rtt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_2f2f04 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_d483b9 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{5-5} = Ii{0-0};
+ bits <5> Vuu32;
+ let Inst{12-8} = Vuu32{4-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_1bd127 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vdddd32;
+ let Inst{4-0} = Vdddd32{4-0};
+}
+class Enc_d7bc34 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <3> Rt8;
+ let Inst{18-16} = Rt8{2-0};
+ bits <5> Vyyyy32;
+ let Inst{4-0} = Vyyyy32{4-0};
}
class Enc_3b7631 : OpcodeHexagon {
bits <5> Vu32;
@@ -3482,20 +4008,67 @@ class Enc_3b7631 : OpcodeHexagon {
bits <3> Rx8;
let Inst{18-16} = Rx8{2-0};
}
-class Enc_eca7c8 : OpcodeHexagon {
- bits <2> Ii;
- let Inst{13-13} = Ii{1-1};
- let Inst{7-7} = Ii{0-0};
- bits <5> Rs32;
- let Inst{20-16} = Rs32{4-0};
- bits <5> Ru32;
- let Inst{12-8} = Ru32{4-0};
+class Enc_bddee3 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vyyyy32;
+ let Inst{4-0} = Vyyyy32{4-0};
+ bits <3> Rx8;
+ let Inst{18-16} = Rx8{2-0};
+}
+class Enc_dd766a : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vdd32;
+ let Inst{4-0} = Vdd32{4-0};
+}
+class Enc_16c48b : OpcodeHexagon {
bits <5> Rt32;
- let Inst{4-0} = Rt32{4-0};
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
}
-class Enc_4b39e4 : OpcodeHexagon {
- bits <3> Ii;
- let Inst{7-5} = Ii{2-0};
+class Enc_9be1de : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vv32;
+ let Inst{12-8} = Vv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_a641d0 : OpcodeHexagon {
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_3d6d37 : OpcodeHexagon {
+ bits <2> Qs4;
+ let Inst{6-5} = Qs4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Vvv32;
+ let Inst{12-8} = Vvv32{4-0};
+ bits <5> Vw32;
+ let Inst{4-0} = Vw32{4-0};
+}
+class Enc_3dac0b : OpcodeHexagon {
+ bits <2> Qt4;
+ let Inst{6-5} = Qt4{1-0};
bits <5> Vu32;
let Inst{12-8} = Vu32{4-0};
bits <5> Vv32;
@@ -3503,3 +4076,117 @@ class Enc_4b39e4 : OpcodeHexagon {
bits <5> Vdd32;
let Inst{4-0} = Vdd32{4-0};
}
+class Enc_500cb0 : OpcodeHexagon {
+ bits <5> Vu32;
+ let Inst{12-8} = Vu32{4-0};
+ bits <5> Vxx32;
+ let Inst{4-0} = Vxx32{4-0};
+}
+class Enc_efaed8 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+}
+class Enc_802dc0 : OpcodeHexagon {
+ bits <1> Ii;
+ let Inst{8-8} = Ii{0-0};
+ bits <2> Qv4;
+ let Inst{23-22} = Qv4{1-0};
+}
+class Enc_ef601b : OpcodeHexagon {
+ bits <4> Ii;
+ let Inst{13-13} = Ii{3-3};
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rt32;
+ let Inst{20-16} = Rt32{4-0};
+}
+class Enc_6baed4 : OpcodeHexagon {
+ bits <3> Ii;
+ let Inst{10-8} = Ii{2-0};
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_691712 : OpcodeHexagon {
+ bits <2> Pv4;
+ let Inst{12-11} = Pv4{1-0};
+ bits <1> Mu2;
+ let Inst{13-13} = Mu2{0-0};
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_403871 : OpcodeHexagon {
+ bits <5> Rx32;
+ let Inst{20-16} = Rx32{4-0};
+}
+class Enc_2d829e : OpcodeHexagon {
+ bits <14> Ii;
+ let Inst{10-0} = Ii{13-3};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_ca3887 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_9e9047 : OpcodeHexagon {
+ bits <2> Pt4;
+ let Inst{9-8} = Pt4{1-0};
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+}
+class Enc_7d1542 : OpcodeHexagon {
+ bits <7> Ss128;
+ let Inst{22-16} = Ss128{6-0};
+ bits <5> Rd32;
+ let Inst{4-0} = Rd32{4-0};
+}
+class Enc_8f7633 : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <7> Sd128;
+ let Inst{6-0} = Sd128{6-0};
+}
+class Enc_46f33d : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rt32;
+ let Inst{12-8} = Rt32{4-0};
+}
+class Enc_d0fe02 : OpcodeHexagon {
+ bits <5> Rxx32;
+ let Inst{20-16} = Rxx32{4-0};
+ bits <0> sgp10;
+}
+class Enc_e32517 : OpcodeHexagon {
+ bits <7> Sss128;
+ let Inst{22-16} = Sss128{6-0};
+ bits <5> Rdd32;
+ let Inst{4-0} = Rdd32{4-0};
+}
+class Enc_a705fc : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <7> Sdd128;
+ let Inst{6-0} = Sdd128{6-0};
+}
+class Enc_e6abcf : OpcodeHexagon {
+ bits <5> Rs32;
+ let Inst{20-16} = Rs32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_b00112 : OpcodeHexagon {
+ bits <5> Rss32;
+ let Inst{20-16} = Rss32{4-0};
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
+class Enc_598f6c : OpcodeHexagon {
+ bits <5> Rtt32;
+ let Inst{12-8} = Rtt32{4-0};
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
index a49051888c77..ccc3f98d8378 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepInstrInfo.td
@@ -5,14 +5,14 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
def A2_abs : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = abs($Rs32)",
-tc_cf8126ae, TypeS_2op>, Enc_5e2823 {
+tc_d61dfdc3, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001100100;
let hasNewValue = 1;
@@ -23,7 +23,7 @@ def A2_absp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = abs($Rss32)",
-tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
+tc_d61dfdc3, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000100;
let prefersSlot3 = 1;
@@ -32,7 +32,7 @@ def A2_abssat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = abs($Rs32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_5e2823 {
+tc_d61dfdc3, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10001100100;
let hasNewValue = 1;
@@ -44,15 +44,15 @@ def A2_add : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = add($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011000;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_add";
let CextOpcode = "A2_add";
let InputType = "reg";
-let BaseOpcode = "A2_add";
let isCommutable = 1;
let isPredicable = 1;
}
@@ -60,7 +60,7 @@ def A2_addh_h16_hh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.h,$Rs32.h):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -72,7 +72,7 @@ def A2_addh_h16_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.h,$Rs32.l):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -84,7 +84,7 @@ def A2_addh_h16_lh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.h):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -96,7 +96,7 @@ def A2_addh_h16_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.l):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -108,7 +108,7 @@ def A2_addh_h16_sat_hh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.h,$Rs32.h):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -121,7 +121,7 @@ def A2_addh_h16_sat_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.h,$Rs32.l):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -134,7 +134,7 @@ def A2_addh_h16_sat_lh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.h):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -147,7 +147,7 @@ def A2_addh_h16_sat_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.l):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101010;
@@ -160,7 +160,7 @@ def A2_addh_l16_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.h)",
-tc_4414d8b1, TypeALU64>, Enc_bd6011 {
+tc_f34c1c21, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101000;
@@ -172,7 +172,7 @@ def A2_addh_l16_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.l)",
-tc_4414d8b1, TypeALU64>, Enc_bd6011 {
+tc_f34c1c21, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101000;
@@ -184,7 +184,7 @@ def A2_addh_l16_sat_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.h):sat",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101000;
@@ -197,7 +197,7 @@ def A2_addh_l16_sat_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = add($Rt32.l,$Rs32.l):sat",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101000;
@@ -210,13 +210,13 @@ def A2_addi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = add($Rs32,#$Ii)",
-tc_5a2711e5, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_ADDI>, Enc_cb9321, PredNewRel, ImmRegRel {
let Inst{31-28} = 0b1011;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_addi";
let CextOpcode = "A2_add";
let InputType = "imm";
-let BaseOpcode = "A2_addi";
let isPredicable = 1;
let isAdd = 1;
let isExtendable = 1;
@@ -229,7 +229,7 @@ def A2_addp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = add($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -240,7 +240,7 @@ def A2_addpsat : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = add($Rss32,$Rtt32):sat",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -252,7 +252,7 @@ def A2_addsat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = add($Rs32,$Rt32):sat",
-tc_61830035, TypeALU32_3op>, Enc_5ab2be {
+tc_95a33176, TypeALU32_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110010;
@@ -267,14 +267,14 @@ def A2_addsp : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"$Rdd32 = add($Rs32,$Rtt32)",
-tc_679309b8, TypeALU64> {
+tc_01d44cb2, TypeALU64> {
let isPseudo = 1;
}
def A2_addsph : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = add($Rss32,$Rtt32):raw:hi",
-tc_679309b8, TypeALU64>, Enc_a56825 {
+tc_01d44cb2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -284,7 +284,7 @@ def A2_addspl : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = add($Rss32,$Rtt32):raw:lo",
-tc_679309b8, TypeALU64>, Enc_a56825 {
+tc_01d44cb2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -294,15 +294,15 @@ def A2_and : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = and($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110001000;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_and";
let CextOpcode = "A2_and";
let InputType = "reg";
-let BaseOpcode = "A2_and";
let isCommutable = 1;
let isPredicable = 1;
}
@@ -310,7 +310,7 @@ def A2_andir : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = and($Rs32,#$Ii)",
-tc_5a2711e5, TypeALU32_2op>, Enc_140c83, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_140c83, ImmRegRel {
let Inst{31-22} = 0b0111011000;
let hasNewValue = 1;
let opNewValue = 0;
@@ -326,7 +326,7 @@ def A2_andp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = and($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011111;
@@ -336,7 +336,7 @@ def A2_aslh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = aslh($Rs32)",
-tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e2823, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01110000000;
let hasNewValue = 1;
@@ -348,7 +348,7 @@ def A2_asrh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = asrh($Rs32)",
-tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e2823, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01110000001;
let hasNewValue = 1;
@@ -360,7 +360,7 @@ def A2_combine_hh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = combine($Rt32.h,$Rs32.h)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011100;
@@ -372,7 +372,7 @@ def A2_combine_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = combine($Rt32.h,$Rs32.l)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011101;
@@ -384,7 +384,7 @@ def A2_combine_lh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = combine($Rt32.l,$Rs32.h)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011110;
@@ -396,7 +396,7 @@ def A2_combine_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = combine($Rt32.l,$Rs32.l)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011111;
@@ -408,7 +408,7 @@ def A2_combineii : HInst<
(outs DoubleRegs:$Rdd32),
(ins s32_0Imm:$Ii, s8_0Imm:$II),
"$Rdd32 = combine(#$Ii,#$II)",
-tc_5a2711e5, TypeALU32_2op>, Enc_18c338 {
+tc_713b66bf, TypeALU32_2op>, Enc_18c338 {
let Inst{31-23} = 0b011111000;
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
@@ -423,19 +423,19 @@ def A2_combinew : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = combine($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_be32a5, PredNewRel {
+tc_713b66bf, TypeALU32_3op>, Enc_be32a5, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110101000;
-let InputType = "reg";
let BaseOpcode = "A2_combinew";
+let InputType = "reg";
let isPredicable = 1;
}
def A2_max : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = max($Rs32,$Rt32)",
-tc_779080bf, TypeALU64>, Enc_5ab2be {
+tc_8a825db2, TypeALU64>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101110;
@@ -447,7 +447,7 @@ def A2_maxp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = max($Rss32,$Rtt32)",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -457,7 +457,7 @@ def A2_maxu : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = maxu($Rs32,$Rt32)",
-tc_779080bf, TypeALU64>, Enc_5ab2be {
+tc_8a825db2, TypeALU64>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101110;
@@ -469,7 +469,7 @@ def A2_maxup : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = maxu($Rss32,$Rtt32)",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -479,7 +479,7 @@ def A2_min : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = min($Rt32,$Rs32)",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101101;
@@ -491,7 +491,7 @@ def A2_minp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = min($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -501,7 +501,7 @@ def A2_minu : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = minu($Rt32,$Rs32)",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101101;
@@ -513,7 +513,7 @@ def A2_minup : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = minu($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -523,7 +523,7 @@ def A2_neg : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = neg($Rs32)",
-tc_57890846, TypeALU32_2op> {
+tc_c57d9f39, TypeALU32_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -533,7 +533,7 @@ def A2_negp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = neg($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
+tc_9f6cd987, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000000100;
}
@@ -541,7 +541,7 @@ def A2_negsat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = neg($Rs32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_5e2823 {
+tc_d61dfdc3, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10001100100;
let hasNewValue = 1;
@@ -553,7 +553,7 @@ def A2_nop : HInst<
(outs),
(ins),
"nop",
-tc_2eabeebe, TypeALU32_2op>, Enc_e3b0c4 {
+tc_b837298f, TypeALU32_2op>, Enc_e3b0c4 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-16} = 0b0111111100000000;
}
@@ -561,7 +561,7 @@ def A2_not : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = not($Rs32)",
-tc_57890846, TypeALU32_2op> {
+tc_c57d9f39, TypeALU32_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -571,7 +571,7 @@ def A2_notp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = not($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
+tc_9f6cd987, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000000100;
}
@@ -579,15 +579,15 @@ def A2_or : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = or($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be, PredNewRel, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110001001;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_or";
let CextOpcode = "A2_or";
let InputType = "reg";
-let BaseOpcode = "A2_or";
let isCommutable = 1;
let isPredicable = 1;
}
@@ -595,7 +595,7 @@ def A2_orir : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = or($Rs32,#$Ii)",
-tc_5a2711e5, TypeALU32_2op>, Enc_140c83, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_140c83, ImmRegRel {
let Inst{31-22} = 0b0111011010;
let hasNewValue = 1;
let opNewValue = 0;
@@ -611,7 +611,7 @@ def A2_orp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = or($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011111;
@@ -621,7 +621,7 @@ def A2_paddf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4) $Rd32 = add($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111011000;
@@ -629,15 +629,15 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_add";
let CextOpcode = "A2_add";
let InputType = "reg";
-let BaseOpcode = "A2_add";
}
def A2_paddfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4.new) $Rd32 = add($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111011000;
@@ -646,24 +646,24 @@ let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_add";
let CextOpcode = "A2_add";
let InputType = "reg";
-let BaseOpcode = "A2_add";
}
def A2_paddif : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
"if (!$Pu4) $Rd32 = add($Rs32,#$Ii)",
-tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_1c2c7a4a, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b011101001;
let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_addi";
let CextOpcode = "A2_add";
let InputType = "imm";
-let BaseOpcode = "A2_addi";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -674,7 +674,7 @@ def A2_paddifnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
"if (!$Pu4.new) $Rd32 = add($Rs32,#$Ii)",
-tc_05c070ec, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_442395f3, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b1;
let Inst{31-23} = 0b011101001;
let isPredicated = 1;
@@ -682,9 +682,9 @@ let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_addi";
let CextOpcode = "A2_add";
let InputType = "imm";
-let BaseOpcode = "A2_addi";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -695,15 +695,15 @@ def A2_paddit : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
"if ($Pu4) $Rd32 = add($Rs32,#$Ii)",
-tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_1c2c7a4a, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b011101000;
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_addi";
let CextOpcode = "A2_add";
let InputType = "imm";
-let BaseOpcode = "A2_addi";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -714,16 +714,16 @@ def A2_padditnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
"if ($Pu4.new) $Rd32 = add($Rs32,#$Ii)",
-tc_05c070ec, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
+tc_442395f3, TypeALU32_2op>, Enc_e38e1f, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b1;
let Inst{31-23} = 0b011101000;
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_addi";
let CextOpcode = "A2_add";
let InputType = "imm";
-let BaseOpcode = "A2_addi";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -734,22 +734,22 @@ def A2_paddt : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4) $Rd32 = add($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111011000;
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_add";
let CextOpcode = "A2_add";
let InputType = "reg";
-let BaseOpcode = "A2_add";
}
def A2_paddtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4.new) $Rd32 = add($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel, ImmRegRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111011000;
@@ -757,15 +757,15 @@ let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_add";
let CextOpcode = "A2_add";
let InputType = "reg";
-let BaseOpcode = "A2_add";
}
def A2_pandf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4) $Rd32 = and($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111001000;
@@ -779,7 +779,7 @@ def A2_pandfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4.new) $Rd32 = and($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111001000;
@@ -794,7 +794,7 @@ def A2_pandt : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4) $Rd32 = and($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111001000;
@@ -807,7 +807,7 @@ def A2_pandtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4.new) $Rd32 = and($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111001000;
@@ -821,7 +821,7 @@ def A2_porf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4) $Rd32 = or($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111001001;
@@ -835,7 +835,7 @@ def A2_porfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4.new) $Rd32 = or($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111001001;
@@ -850,7 +850,7 @@ def A2_port : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4) $Rd32 = or($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111001001;
@@ -863,7 +863,7 @@ def A2_portnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4.new) $Rd32 = or($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111001001;
@@ -877,7 +877,7 @@ def A2_psubf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = sub($Rt32,$Rs32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111011001;
@@ -891,7 +891,7 @@ def A2_psubfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
-tc_05c070ec, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111011001;
@@ -906,7 +906,7 @@ def A2_psubt : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = sub($Rt32,$Rs32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111011001;
@@ -919,7 +919,7 @@ def A2_psubtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rt32, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = sub($Rt32,$Rs32)",
-tc_05c070ec, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_9b0bc1, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111011001;
@@ -933,7 +933,7 @@ def A2_pxorf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4) $Rd32 = xor($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111001011;
@@ -947,7 +947,7 @@ def A2_pxorfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111001011;
@@ -962,7 +962,7 @@ def A2_pxort : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4) $Rd32 = xor($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111001011;
@@ -975,7 +975,7 @@ def A2_pxortnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4.new) $Rd32 = xor($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_ea4c54, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111001011;
@@ -989,7 +989,7 @@ def A2_roundsat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = round($Rss32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_90cd8b {
+tc_d61dfdc3, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000110;
let hasNewValue = 1;
@@ -1001,7 +1001,7 @@ def A2_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = sat($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001000110;
let hasNewValue = 1;
@@ -1012,7 +1012,7 @@ def A2_satb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = satb($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10001100110;
let hasNewValue = 1;
@@ -1023,7 +1023,7 @@ def A2_sath : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sath($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001100110;
let hasNewValue = 1;
@@ -1034,7 +1034,7 @@ def A2_satub : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = satub($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10001100110;
let hasNewValue = 1;
@@ -1045,7 +1045,7 @@ def A2_satuh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = satuh($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10001100110;
let hasNewValue = 1;
@@ -1056,22 +1056,22 @@ def A2_sub : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32,$Rs32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011, PredNewRel, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011001;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_sub";
let CextOpcode = "A2_sub";
let InputType = "reg";
-let BaseOpcode = "A2_sub";
let isPredicable = 1;
}
def A2_subh_h16_hh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.h,$Rs32.h):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1083,7 +1083,7 @@ def A2_subh_h16_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.h,$Rs32.l):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1095,7 +1095,7 @@ def A2_subh_h16_lh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.h):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1107,7 +1107,7 @@ def A2_subh_h16_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.l):<<16",
-tc_679309b8, TypeALU64>, Enc_bd6011 {
+tc_01d44cb2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1119,7 +1119,7 @@ def A2_subh_h16_sat_hh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.h,$Rs32.h):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1132,7 +1132,7 @@ def A2_subh_h16_sat_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.h,$Rs32.l):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1145,7 +1145,7 @@ def A2_subh_h16_sat_lh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.h):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1158,7 +1158,7 @@ def A2_subh_h16_sat_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.l):sat:<<16",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101011;
@@ -1171,7 +1171,7 @@ def A2_subh_l16_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.h)",
-tc_4414d8b1, TypeALU64>, Enc_bd6011 {
+tc_f34c1c21, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101001;
@@ -1183,7 +1183,7 @@ def A2_subh_l16_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.l)",
-tc_4414d8b1, TypeALU64>, Enc_bd6011 {
+tc_f34c1c21, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101001;
@@ -1195,7 +1195,7 @@ def A2_subh_l16_sat_hl : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.h):sat",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101001;
@@ -1208,7 +1208,7 @@ def A2_subh_l16_sat_ll : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32.l,$Rs32.l):sat",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101001;
@@ -1221,7 +1221,7 @@ def A2_subp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = sub($Rtt32,$Rss32)",
-tc_946df596, TypeALU64>, Enc_ea23e4 {
+tc_5da50c4b, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -1230,7 +1230,7 @@ def A2_subri : HInst<
(outs IntRegs:$Rd32),
(ins s32_0Imm:$Ii, IntRegs:$Rs32),
"$Rd32 = sub(#$Ii,$Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_140c83, PredNewRel, ImmRegRel {
let Inst{31-22} = 0b0111011001;
let hasNewValue = 1;
let opNewValue = 0;
@@ -1246,7 +1246,7 @@ def A2_subsat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32,$Rs32):sat",
-tc_61830035, TypeALU32_3op>, Enc_bd6011 {
+tc_95a33176, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110110;
@@ -1260,7 +1260,7 @@ def A2_svaddh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vaddh($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110000;
@@ -1273,7 +1273,7 @@ def A2_svaddhs : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vaddh($Rs32,$Rt32):sat",
-tc_61830035, TypeALU32_3op>, Enc_5ab2be {
+tc_95a33176, TypeALU32_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110001;
@@ -1288,7 +1288,7 @@ def A2_svadduhs : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vadduh($Rs32,$Rt32):sat",
-tc_61830035, TypeALU32_3op>, Enc_5ab2be {
+tc_95a33176, TypeALU32_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110011;
@@ -1303,7 +1303,7 @@ def A2_svavgh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vavgh($Rs32,$Rt32)",
-tc_1c80410a, TypeALU32_3op>, Enc_5ab2be {
+tc_8b5bd4f5, TypeALU32_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110111000;
@@ -1317,7 +1317,7 @@ def A2_svavghs : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vavgh($Rs32,$Rt32):rnd",
-tc_d08ee0f4, TypeALU32_3op>, Enc_5ab2be {
+tc_84a7500d, TypeALU32_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110111001;
@@ -1331,7 +1331,7 @@ def A2_svnavgh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = vnavgh($Rt32,$Rs32)",
-tc_1c80410a, TypeALU32_3op>, Enc_bd6011 {
+tc_8b5bd4f5, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110111011;
@@ -1344,7 +1344,7 @@ def A2_svsubh : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = vsubh($Rt32,$Rs32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110100;
@@ -1356,7 +1356,7 @@ def A2_svsubhs : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = vsubh($Rt32,$Rs32):sat",
-tc_61830035, TypeALU32_3op>, Enc_bd6011 {
+tc_95a33176, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110101;
@@ -1370,7 +1370,7 @@ def A2_svsubuhs : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = vsubuh($Rt32,$Rs32):sat",
-tc_61830035, TypeALU32_3op>, Enc_bd6011 {
+tc_95a33176, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110110111;
@@ -1384,7 +1384,7 @@ def A2_swiz : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = swiz($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10001100100;
let hasNewValue = 1;
@@ -1394,7 +1394,7 @@ def A2_sxtb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sxtb($Rs32)",
-tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e2823, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01110000101;
let hasNewValue = 1;
@@ -1406,7 +1406,7 @@ def A2_sxth : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sxth($Rs32)",
-tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e2823, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01110000111;
let hasNewValue = 1;
@@ -1418,7 +1418,7 @@ def A2_sxtw : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = sxtw($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
+tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000100010;
}
@@ -1426,20 +1426,20 @@ def A2_tfr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = $Rs32",
-tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e2823, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01110000011;
let hasNewValue = 1;
let opNewValue = 0;
-let InputType = "reg";
let BaseOpcode = "A2_tfr";
+let InputType = "reg";
let isPredicable = 1;
}
def A2_tfrcrr : HInst<
(outs IntRegs:$Rd32),
(ins CtrRegs:$Cs32),
"$Rd32 = $Cs32",
-tc_b9272d6c, TypeCR>, Enc_0cb018 {
+tc_7476d766, TypeCR>, Enc_0cb018 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01101010000;
let hasNewValue = 1;
@@ -1449,14 +1449,14 @@ def A2_tfrf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = $Rs32",
-tc_4c5ba658, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_1c2c7a4a, TypeALU32_2op>, PredNewRel, ImmRegRel {
let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_tfr";
let CextOpcode = "A2_tfr";
let InputType = "reg";
-let BaseOpcode = "A2_tfr";
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1464,15 +1464,15 @@ def A2_tfrfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = $Rs32",
-tc_05c070ec, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_442395f3, TypeALU32_2op>, PredNewRel, ImmRegRel {
let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfr";
let CextOpcode = "A2_tfr";
let InputType = "reg";
-let BaseOpcode = "A2_tfr";
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1480,7 +1480,7 @@ def A2_tfrih : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
"$Rx32.h = #$Ii",
-tc_5a2711e5, TypeALU32_2op>, Enc_51436c {
+tc_713b66bf, TypeALU32_2op>, Enc_51436c {
let Inst{21-21} = 0b1;
let Inst{31-24} = 0b01110010;
let hasNewValue = 1;
@@ -1491,7 +1491,7 @@ def A2_tfril : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, u16_0Imm:$Ii),
"$Rx32.l = #$Ii",
-tc_5a2711e5, TypeALU32_2op>, Enc_51436c {
+tc_713b66bf, TypeALU32_2op>, Enc_51436c {
let Inst{21-21} = 0b1;
let Inst{31-24} = 0b01110001;
let hasNewValue = 1;
@@ -1502,7 +1502,7 @@ def A2_tfrp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = $Rss32",
-tc_5a2711e5, TypeALU32_2op>, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, PredNewRel {
let BaseOpcode = "A2_tfrp";
let isPredicable = 1;
let isPseudo = 1;
@@ -1511,7 +1511,7 @@ def A2_tfrpf : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
"if (!$Pu4) $Rdd32 = $Rss32",
-tc_5a2711e5, TypeALU32_2op>, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, PredNewRel {
let isPredicated = 1;
let isPredicatedFalse = 1;
let BaseOpcode = "A2_tfrp";
@@ -1521,7 +1521,7 @@ def A2_tfrpfnew : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
"if (!$Pu4.new) $Rdd32 = $Rss32",
-tc_1ae57e39, TypeALU32_2op>, PredNewRel {
+tc_86173609, TypeALU32_2op>, PredNewRel {
let isPredicated = 1;
let isPredicatedFalse = 1;
let isPredicatedNew = 1;
@@ -1532,7 +1532,7 @@ def A2_tfrpi : HInst<
(outs DoubleRegs:$Rdd32),
(ins s8_0Imm:$Ii),
"$Rdd32 = #$Ii",
-tc_5a2711e5, TypeALU64> {
+tc_713b66bf, TypeALU64> {
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
let isMoveImm = 1;
@@ -1542,7 +1542,7 @@ def A2_tfrpt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
"if ($Pu4) $Rdd32 = $Rss32",
-tc_5a2711e5, TypeALU32_2op>, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, PredNewRel {
let isPredicated = 1;
let BaseOpcode = "A2_tfrp";
let isPseudo = 1;
@@ -1551,7 +1551,7 @@ def A2_tfrptnew : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, DoubleRegs:$Rss32),
"if ($Pu4.new) $Rdd32 = $Rss32",
-tc_1ae57e39, TypeALU32_2op>, PredNewRel {
+tc_86173609, TypeALU32_2op>, PredNewRel {
let isPredicated = 1;
let isPredicatedNew = 1;
let BaseOpcode = "A2_tfrp";
@@ -1561,7 +1561,7 @@ def A2_tfrrcr : HInst<
(outs CtrRegs:$Cd32),
(ins IntRegs:$Rs32),
"$Cd32 = $Rs32",
-tc_434c8e1e, TypeCR>, Enc_bd811a {
+tc_49fdfd4b, TypeCR>, Enc_bd811a {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01100010001;
let hasNewValue = 1;
@@ -1571,14 +1571,14 @@ def A2_tfrsi : HInst<
(outs IntRegs:$Rd32),
(ins s32_0Imm:$Ii),
"$Rd32 = #$Ii",
-tc_57890846, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e87ce, PredNewRel, ImmRegRel {
let Inst{21-21} = 0b0;
let Inst{31-24} = 0b01111000;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_tfrsi";
let CextOpcode = "A2_tfr";
let InputType = "imm";
-let BaseOpcode = "A2_tfrsi";
let isPredicable = 1;
let isReMaterializable = 1;
let isAsCheapAsAMove = 1;
@@ -1593,13 +1593,13 @@ def A2_tfrt : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = $Rs32",
-tc_4c5ba658, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_1c2c7a4a, TypeALU32_2op>, PredNewRel, ImmRegRel {
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_tfr";
let CextOpcode = "A2_tfr";
let InputType = "reg";
-let BaseOpcode = "A2_tfr";
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1607,14 +1607,14 @@ def A2_tfrtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = $Rs32",
-tc_05c070ec, TypeALU32_2op>, PredNewRel, ImmRegRel {
+tc_442395f3, TypeALU32_2op>, PredNewRel, ImmRegRel {
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfr";
let CextOpcode = "A2_tfr";
let InputType = "reg";
-let BaseOpcode = "A2_tfr";
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1622,7 +1622,7 @@ def A2_vabsh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vabsh($Rss32)",
-tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
+tc_d61dfdc3, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000000010;
let prefersSlot3 = 1;
@@ -1631,7 +1631,7 @@ def A2_vabshsat : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vabsh($Rss32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
+tc_d61dfdc3, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000000010;
let prefersSlot3 = 1;
@@ -1641,7 +1641,7 @@ def A2_vabsw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vabsw($Rss32)",
-tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
+tc_d61dfdc3, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000010;
let prefersSlot3 = 1;
@@ -1650,7 +1650,7 @@ def A2_vabswsat : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vabsw($Rss32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
+tc_d61dfdc3, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10000000010;
let prefersSlot3 = 1;
@@ -1660,7 +1660,7 @@ def A2_vaddb_map : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddb($Rss32,$Rtt32)",
-tc_946df596, TypeMAPPING> {
+tc_5da50c4b, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -1668,7 +1668,7 @@ def A2_vaddh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddh($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1677,7 +1677,7 @@ def A2_vaddhs : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddh($Rss32,$Rtt32):sat",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1688,7 +1688,7 @@ def A2_vaddub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddub($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1697,7 +1697,7 @@ def A2_vaddubs : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddub($Rss32,$Rtt32):sat",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1708,7 +1708,7 @@ def A2_vadduhs : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vadduh($Rss32,$Rtt32):sat",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1719,7 +1719,7 @@ def A2_vaddw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddw($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1728,7 +1728,7 @@ def A2_vaddws : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vaddw($Rss32,$Rtt32):sat",
-tc_779080bf, TypeALU64>, Enc_a56825 {
+tc_8a825db2, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011000;
@@ -1739,7 +1739,7 @@ def A2_vavgh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgh($Rss32,$Rtt32)",
-tc_6132ba3d, TypeALU64>, Enc_a56825 {
+tc_f098b237, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1749,7 +1749,7 @@ def A2_vavghcr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgh($Rss32,$Rtt32):crnd",
-tc_002cb246, TypeALU64>, Enc_a56825 {
+tc_0dfac0a7, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1759,7 +1759,7 @@ def A2_vavghr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgh($Rss32,$Rtt32):rnd",
-tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
+tc_20131976, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1769,7 +1769,7 @@ def A2_vavgub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgub($Rss32,$Rtt32)",
-tc_6132ba3d, TypeALU64>, Enc_a56825 {
+tc_f098b237, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1779,7 +1779,7 @@ def A2_vavgubr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgub($Rss32,$Rtt32):rnd",
-tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
+tc_20131976, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1789,7 +1789,7 @@ def A2_vavguh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavguh($Rss32,$Rtt32)",
-tc_6132ba3d, TypeALU64>, Enc_a56825 {
+tc_f098b237, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1799,7 +1799,7 @@ def A2_vavguhr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavguh($Rss32,$Rtt32):rnd",
-tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
+tc_20131976, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011010;
@@ -1809,7 +1809,7 @@ def A2_vavguw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavguw($Rss32,$Rtt32)",
-tc_6132ba3d, TypeALU64>, Enc_a56825 {
+tc_f098b237, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -1819,7 +1819,7 @@ def A2_vavguwr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavguw($Rss32,$Rtt32):rnd",
-tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
+tc_20131976, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -1829,7 +1829,7 @@ def A2_vavgw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgw($Rss32,$Rtt32)",
-tc_6132ba3d, TypeALU64>, Enc_a56825 {
+tc_f098b237, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -1839,7 +1839,7 @@ def A2_vavgwcr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgw($Rss32,$Rtt32):crnd",
-tc_002cb246, TypeALU64>, Enc_a56825 {
+tc_0dfac0a7, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -1849,7 +1849,7 @@ def A2_vavgwr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vavgw($Rss32,$Rtt32):rnd",
-tc_e4a7f9f0, TypeALU64>, Enc_a56825 {
+tc_20131976, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011011;
@@ -1859,7 +1859,7 @@ def A2_vcmpbeq : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmpb.eq($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b110000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1868,7 +1868,7 @@ def A2_vcmpbgtu : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmpb.gtu($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b111000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1877,7 +1877,7 @@ def A2_vcmpheq : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmph.eq($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1886,7 +1886,7 @@ def A2_vcmphgt : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmph.gt($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1895,7 +1895,7 @@ def A2_vcmphgtu : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmph.gtu($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b101000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1904,7 +1904,7 @@ def A2_vcmpweq : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmpw.eq($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1913,7 +1913,7 @@ def A2_vcmpwgt : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmpw.gt($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1922,7 +1922,7 @@ def A2_vcmpwgtu : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmpw.gtu($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010000;
@@ -1931,7 +1931,7 @@ def A2_vconj : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vconj($Rss32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_b9c5fb {
+tc_d61dfdc3, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10000000100;
let prefersSlot3 = 1;
@@ -1941,7 +1941,7 @@ def A2_vmaxb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vmaxb($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -1951,7 +1951,7 @@ def A2_vmaxh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vmaxh($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -1961,7 +1961,7 @@ def A2_vmaxub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vmaxub($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -1971,7 +1971,7 @@ def A2_vmaxuh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vmaxuh($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -1981,7 +1981,7 @@ def A2_vmaxuw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vmaxuw($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -1991,7 +1991,7 @@ def A2_vmaxw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vmaxw($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -2001,7 +2001,7 @@ def A2_vminb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vminb($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011110;
@@ -2011,7 +2011,7 @@ def A2_vminh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vminh($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -2021,7 +2021,7 @@ def A2_vminub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vminub($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -2031,7 +2031,7 @@ def A2_vminuh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vminuh($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -2041,7 +2041,7 @@ def A2_vminuw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vminuw($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -2051,7 +2051,7 @@ def A2_vminw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vminw($Rtt32,$Rss32)",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011101;
@@ -2061,7 +2061,7 @@ def A2_vnavgh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vnavgh($Rtt32,$Rss32)",
-tc_6132ba3d, TypeALU64>, Enc_ea23e4 {
+tc_f098b237, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011100;
@@ -2071,7 +2071,7 @@ def A2_vnavghcr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vnavgh($Rtt32,$Rss32):crnd:sat",
-tc_002cb246, TypeALU64>, Enc_ea23e4 {
+tc_0dfac0a7, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011100;
@@ -2082,7 +2082,7 @@ def A2_vnavghr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vnavgh($Rtt32,$Rss32):rnd:sat",
-tc_002cb246, TypeALU64>, Enc_ea23e4 {
+tc_0dfac0a7, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011100;
@@ -2093,7 +2093,7 @@ def A2_vnavgw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vnavgw($Rtt32,$Rss32)",
-tc_6132ba3d, TypeALU64>, Enc_ea23e4 {
+tc_f098b237, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011100;
@@ -2103,7 +2103,7 @@ def A2_vnavgwcr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vnavgw($Rtt32,$Rss32):crnd:sat",
-tc_002cb246, TypeALU64>, Enc_ea23e4 {
+tc_0dfac0a7, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011100;
@@ -2114,7 +2114,7 @@ def A2_vnavgwr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vnavgw($Rtt32,$Rss32):rnd:sat",
-tc_002cb246, TypeALU64>, Enc_ea23e4 {
+tc_0dfac0a7, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011100;
@@ -2125,7 +2125,7 @@ def A2_vraddub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vraddub($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -2135,7 +2135,7 @@ def A2_vraddub_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vraddub($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010010;
@@ -2146,7 +2146,7 @@ def A2_vrsadub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrsadub($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -2156,7 +2156,7 @@ def A2_vrsadub_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrsadub($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010010;
@@ -2167,7 +2167,7 @@ def A2_vsubb_map : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vsubb($Rss32,$Rtt32)",
-tc_946df596, TypeMAPPING> {
+tc_5da50c4b, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -2175,7 +2175,7 @@ def A2_vsubh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubh($Rtt32,$Rss32)",
-tc_946df596, TypeALU64>, Enc_ea23e4 {
+tc_5da50c4b, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2184,7 +2184,7 @@ def A2_vsubhs : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubh($Rtt32,$Rss32):sat",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2195,7 +2195,7 @@ def A2_vsubub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubub($Rtt32,$Rss32)",
-tc_946df596, TypeALU64>, Enc_ea23e4 {
+tc_5da50c4b, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2204,7 +2204,7 @@ def A2_vsububs : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubub($Rtt32,$Rss32):sat",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2215,7 +2215,7 @@ def A2_vsubuhs : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubuh($Rtt32,$Rss32):sat",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2226,7 +2226,7 @@ def A2_vsubw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubw($Rtt32,$Rss32)",
-tc_946df596, TypeALU64>, Enc_ea23e4 {
+tc_5da50c4b, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2235,7 +2235,7 @@ def A2_vsubws : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vsubw($Rtt32,$Rss32):sat",
-tc_779080bf, TypeALU64>, Enc_ea23e4 {
+tc_8a825db2, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011001;
@@ -2246,14 +2246,14 @@ def A2_xor : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = xor($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, PredNewRel {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110001011;
let hasNewValue = 1;
let opNewValue = 0;
-let InputType = "reg";
let BaseOpcode = "A2_xor";
+let InputType = "reg";
let isCommutable = 1;
let isPredicable = 1;
}
@@ -2261,7 +2261,7 @@ def A2_xorp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = xor($Rss32,$Rtt32)",
-tc_946df596, TypeALU64>, Enc_a56825 {
+tc_5da50c4b, TypeALU64>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011111;
@@ -2271,7 +2271,7 @@ def A2_zxtb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = zxtb($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, PredNewRel {
let hasNewValue = 1;
let opNewValue = 0;
let BaseOpcode = "A2_zxtb";
@@ -2283,7 +2283,7 @@ def A2_zxth : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = zxth($Rs32)",
-tc_57890846, TypeALU32_2op>, Enc_5e2823, PredNewRel {
+tc_c57d9f39, TypeALU32_2op>, Enc_5e2823, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01110000110;
let hasNewValue = 1;
@@ -2295,7 +2295,7 @@ def A4_addp_c : HInst<
(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
"$Rdd32 = add($Rss32,$Rtt32,$Px4):carry",
-tc_9c3ecd83, TypeS_3op>, Enc_2b3f60 {
+tc_1d41f8b7, TypeS_3op>, Enc_2b3f60 {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000010110;
@@ -2306,7 +2306,7 @@ def A4_andn : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = and($Rt32,~$Rs32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110001100;
@@ -2318,7 +2318,7 @@ def A4_andnp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = and($Rtt32,~$Rss32)",
-tc_946df596, TypeALU64>, Enc_ea23e4 {
+tc_5da50c4b, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011111;
@@ -2327,7 +2327,7 @@ def A4_bitsplit : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = bitsplit($Rs32,$Rt32)",
-tc_4414d8b1, TypeALU64>, Enc_be32a5 {
+tc_f34c1c21, TypeALU64>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010100001;
@@ -2337,7 +2337,7 @@ def A4_bitspliti : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rdd32 = bitsplit($Rs32,#$Ii)",
-tc_4414d8b1, TypeS_2op>, Enc_311abd {
+tc_f34c1c21, TypeS_2op>, Enc_311abd {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001000110;
@@ -2347,14 +2347,14 @@ def A4_boundscheck : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"$Pd4 = boundscheck($Rs32,$Rtt32)",
-tc_85d5d03f, TypeALU64> {
+tc_4a55d03c, TypeALU64> {
let isPseudo = 1;
}
def A4_boundscheck_hi : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = boundscheck($Rss32,$Rtt32):raw:hi",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b101000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -2363,7 +2363,7 @@ def A4_boundscheck_lo : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = boundscheck($Rss32,$Rtt32):raw:lo",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -2372,7 +2372,7 @@ def A4_cmpbeq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmpb.eq($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b110000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111110;
@@ -2385,7 +2385,7 @@ def A4_cmpbeqi : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u8_0Imm:$Ii),
"$Pd4 = cmpb.eq($Rs32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_a1297125, TypeALU64>, Enc_08d755, ImmRegRel {
let Inst{4-2} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011101000;
@@ -2398,7 +2398,7 @@ def A4_cmpbgt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmpb.gt($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111110;
@@ -2410,7 +2410,7 @@ def A4_cmpbgti : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s8_0Imm:$Ii),
"$Pd4 = cmpb.gt($Rs32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_a1297125, TypeALU64>, Enc_08d755, ImmRegRel {
let Inst{4-2} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011101001;
@@ -2422,7 +2422,7 @@ def A4_cmpbgtu : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmpb.gtu($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b111000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111110;
@@ -2434,7 +2434,7 @@ def A4_cmpbgtui : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u32_0Imm:$Ii),
"$Pd4 = cmpb.gtu($Rs32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_02553a, ImmRegRel {
+tc_a1297125, TypeALU64>, Enc_02553a, ImmRegRel {
let Inst{4-2} = 0b000;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b11011101010;
@@ -2451,7 +2451,7 @@ def A4_cmpheq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmph.eq($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111110;
@@ -2464,7 +2464,7 @@ def A4_cmpheqi : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = cmph.eq($Rs32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_a1297125, TypeALU64>, Enc_08d755, ImmRegRel {
let Inst{4-2} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011101000;
@@ -2482,7 +2482,7 @@ def A4_cmphgt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmph.gt($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111110;
@@ -2494,7 +2494,7 @@ def A4_cmphgti : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = cmph.gt($Rs32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_08d755, ImmRegRel {
+tc_a1297125, TypeALU64>, Enc_08d755, ImmRegRel {
let Inst{4-2} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011101001;
@@ -2511,7 +2511,7 @@ def A4_cmphgtu : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmph.gtu($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e, ImmRegRel {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b101000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111110;
@@ -2523,7 +2523,7 @@ def A4_cmphgtui : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u32_0Imm:$Ii),
"$Pd4 = cmph.gtu($Rs32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_02553a, ImmRegRel {
+tc_a1297125, TypeALU64>, Enc_02553a, ImmRegRel {
let Inst{4-2} = 0b010;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b11011101010;
@@ -2540,7 +2540,7 @@ def A4_combineii : HInst<
(outs DoubleRegs:$Rdd32),
(ins s8_0Imm:$Ii, u32_0Imm:$II),
"$Rdd32 = combine(#$Ii,#$II)",
-tc_5a2711e5, TypeALU32_2op>, Enc_f0cca7 {
+tc_713b66bf, TypeALU32_2op>, Enc_f0cca7 {
let Inst{31-21} = 0b01111100100;
let isExtendable = 1;
let opExtendable = 2;
@@ -2552,7 +2552,7 @@ def A4_combineir : HInst<
(outs DoubleRegs:$Rdd32),
(ins s32_0Imm:$Ii, IntRegs:$Rs32),
"$Rdd32 = combine(#$Ii,$Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_9cdba7 {
+tc_713b66bf, TypeALU32_2op>, Enc_9cdba7 {
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b01110011001;
let isExtendable = 1;
@@ -2565,7 +2565,7 @@ def A4_combineri : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rdd32 = combine($Rs32,#$Ii)",
-tc_5a2711e5, TypeALU32_2op>, Enc_9cdba7 {
+tc_713b66bf, TypeALU32_2op>, Enc_9cdba7 {
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b01110011000;
let isExtendable = 1;
@@ -2578,7 +2578,7 @@ def A4_cround_ri : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = cround($Rs32,#$Ii)",
-tc_002cb246, TypeS_2op>, Enc_a05677 {
+tc_0dfac0a7, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100111;
@@ -2590,7 +2590,7 @@ def A4_cround_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = cround($Rs32,$Rt32)",
-tc_002cb246, TypeS_3op>, Enc_5ab2be {
+tc_0dfac0a7, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110110;
@@ -2602,14 +2602,14 @@ def A4_ext : HInst<
(outs),
(ins u26_6Imm:$Ii),
"immext(#$Ii)",
-tc_862b3e70, TypeEXTENDER>, Enc_2b518f {
+tc_112d30d6, TypeEXTENDER>, Enc_2b518f {
let Inst{31-28} = 0b0000;
}
def A4_modwrapu : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = modwrap($Rs32,$Rt32)",
-tc_779080bf, TypeALU64>, Enc_5ab2be {
+tc_8a825db2, TypeALU64>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011111;
@@ -2621,7 +2621,7 @@ def A4_orn : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = or($Rt32,~$Rs32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_bd6011 {
+tc_713b66bf, TypeALU32_3op>, Enc_bd6011 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110001101;
@@ -2633,7 +2633,7 @@ def A4_ornp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = or($Rtt32,~$Rss32)",
-tc_946df596, TypeALU64>, Enc_ea23e4 {
+tc_5da50c4b, TypeALU64>, Enc_ea23e4 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010011111;
@@ -2642,7 +2642,7 @@ def A4_paslhf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = aslh($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b01110000000;
@@ -2656,7 +2656,7 @@ def A4_paslhfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = aslh($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1011;
let Inst{31-21} = 0b01110000000;
@@ -2671,7 +2671,7 @@ def A4_paslht : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = aslh($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1000;
let Inst{31-21} = 0b01110000000;
@@ -2684,7 +2684,7 @@ def A4_paslhtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = aslh($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b01110000000;
@@ -2698,7 +2698,7 @@ def A4_pasrhf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = asrh($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b01110000001;
@@ -2712,7 +2712,7 @@ def A4_pasrhfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = asrh($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1011;
let Inst{31-21} = 0b01110000001;
@@ -2727,7 +2727,7 @@ def A4_pasrht : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = asrh($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1000;
let Inst{31-21} = 0b01110000001;
@@ -2740,7 +2740,7 @@ def A4_pasrhtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = asrh($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b01110000001;
@@ -2754,7 +2754,7 @@ def A4_psxtbf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = sxtb($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b01110000101;
@@ -2768,7 +2768,7 @@ def A4_psxtbfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = sxtb($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1011;
let Inst{31-21} = 0b01110000101;
@@ -2783,7 +2783,7 @@ def A4_psxtbt : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = sxtb($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1000;
let Inst{31-21} = 0b01110000101;
@@ -2796,7 +2796,7 @@ def A4_psxtbtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = sxtb($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b01110000101;
@@ -2810,7 +2810,7 @@ def A4_psxthf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = sxth($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b01110000111;
@@ -2824,7 +2824,7 @@ def A4_psxthfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = sxth($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1011;
let Inst{31-21} = 0b01110000111;
@@ -2839,7 +2839,7 @@ def A4_psxtht : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = sxth($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1000;
let Inst{31-21} = 0b01110000111;
@@ -2852,7 +2852,7 @@ def A4_psxthtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = sxth($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b01110000111;
@@ -2866,7 +2866,7 @@ def A4_pzxtbf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = zxtb($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b01110000100;
@@ -2880,7 +2880,7 @@ def A4_pzxtbfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = zxtb($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1011;
let Inst{31-21} = 0b01110000100;
@@ -2895,7 +2895,7 @@ def A4_pzxtbt : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = zxtb($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1000;
let Inst{31-21} = 0b01110000100;
@@ -2908,7 +2908,7 @@ def A4_pzxtbtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = zxtb($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b01110000100;
@@ -2922,7 +2922,7 @@ def A4_pzxthf : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) $Rd32 = zxth($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b01110000110;
@@ -2936,7 +2936,7 @@ def A4_pzxthfnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) $Rd32 = zxth($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1011;
let Inst{31-21} = 0b01110000110;
@@ -2951,7 +2951,7 @@ def A4_pzxtht : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) $Rd32 = zxth($Rs32)",
-tc_5a2711e5, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_713b66bf, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1000;
let Inst{31-21} = 0b01110000110;
@@ -2964,7 +2964,7 @@ def A4_pzxthtnew : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) $Rd32 = zxth($Rs32)",
-tc_1ae57e39, TypeALU32_2op>, Enc_fb6577, PredNewRel {
+tc_86173609, TypeALU32_2op>, Enc_fb6577, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b01110000110;
@@ -2978,7 +2978,7 @@ def A4_rcmpeq : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = cmp.eq($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011010;
@@ -2992,7 +2992,7 @@ def A4_rcmpeqi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = cmp.eq($Rs32,#$Ii)",
-tc_5a2711e5, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b01110011010;
let hasNewValue = 1;
@@ -3009,7 +3009,7 @@ def A4_rcmpneq : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = !cmp.eq($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
+tc_713b66bf, TypeALU32_3op>, Enc_5ab2be, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110011011;
@@ -3023,7 +3023,7 @@ def A4_rcmpneqi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = !cmp.eq($Rs32,#$Ii)",
-tc_5a2711e5, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_b8c967, ImmRegRel {
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b01110011011;
let hasNewValue = 1;
@@ -3040,7 +3040,7 @@ def A4_round_ri : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = round($Rs32,#$Ii)",
-tc_002cb246, TypeS_2op>, Enc_a05677 {
+tc_0dfac0a7, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100111;
@@ -3052,7 +3052,7 @@ def A4_round_ri_sat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = round($Rs32,#$Ii):sat",
-tc_002cb246, TypeS_2op>, Enc_a05677 {
+tc_0dfac0a7, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100111;
@@ -3065,7 +3065,7 @@ def A4_round_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = round($Rs32,$Rt32)",
-tc_002cb246, TypeS_3op>, Enc_5ab2be {
+tc_0dfac0a7, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110110;
@@ -3077,7 +3077,7 @@ def A4_round_rr_sat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = round($Rs32,$Rt32):sat",
-tc_002cb246, TypeS_3op>, Enc_5ab2be {
+tc_0dfac0a7, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110110;
@@ -3090,7 +3090,7 @@ def A4_subp_c : HInst<
(outs DoubleRegs:$Rdd32, PredRegs:$Px4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Px4in),
"$Rdd32 = sub($Rss32,$Rtt32,$Px4):carry",
-tc_9c3ecd83, TypeS_3op>, Enc_2b3f60 {
+tc_1d41f8b7, TypeS_3op>, Enc_2b3f60 {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000010111;
@@ -3101,7 +3101,7 @@ def A4_tfrcpp : HInst<
(outs DoubleRegs:$Rdd32),
(ins CtrRegs64:$Css32),
"$Rdd32 = $Css32",
-tc_b9272d6c, TypeCR>, Enc_667b39 {
+tc_7476d766, TypeCR>, Enc_667b39 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01101000000;
}
@@ -3109,7 +3109,7 @@ def A4_tfrpcp : HInst<
(outs CtrRegs64:$Cdd32),
(ins DoubleRegs:$Rss32),
"$Cdd32 = $Rss32",
-tc_434c8e1e, TypeCR>, Enc_0ed752 {
+tc_49fdfd4b, TypeCR>, Enc_0ed752 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01100011001;
}
@@ -3117,7 +3117,7 @@ def A4_tlbmatch : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Pd4 = tlbmatch($Rss32,$Rt32)",
-tc_4837eefb, TypeALU64>, Enc_03833b {
+tc_d68dca5c, TypeALU64>, Enc_03833b {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -3127,7 +3127,7 @@ def A4_vcmpbeq_any : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = any8(vcmpb.eq($Rss32,$Rtt32))",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -3136,7 +3136,7 @@ def A4_vcmpbeqi : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u8_0Imm:$Ii),
"$Pd4 = vcmpb.eq($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_0d8adb {
+tc_a1297125, TypeALU64>, Enc_0d8adb {
let Inst{4-2} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011100000;
@@ -3145,7 +3145,7 @@ def A4_vcmpbgt : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = vcmpb.gt($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -3154,7 +3154,7 @@ def A4_vcmpbgti : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
"$Pd4 = vcmpb.gt($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_0d8adb {
+tc_a1297125, TypeALU64>, Enc_0d8adb {
let Inst{4-2} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011100001;
@@ -3163,7 +3163,7 @@ def A4_vcmpbgtui : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
"$Pd4 = vcmpb.gtu($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_3680c2 {
+tc_a1297125, TypeALU64>, Enc_3680c2 {
let Inst{4-2} = 0b000;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b11011100010;
@@ -3172,7 +3172,7 @@ def A4_vcmpheqi : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
"$Pd4 = vcmph.eq($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_0d8adb {
+tc_a1297125, TypeALU64>, Enc_0d8adb {
let Inst{4-2} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011100000;
@@ -3181,7 +3181,7 @@ def A4_vcmphgti : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
"$Pd4 = vcmph.gt($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_0d8adb {
+tc_a1297125, TypeALU64>, Enc_0d8adb {
let Inst{4-2} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011100001;
@@ -3190,7 +3190,7 @@ def A4_vcmphgtui : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
"$Pd4 = vcmph.gtu($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_3680c2 {
+tc_a1297125, TypeALU64>, Enc_3680c2 {
let Inst{4-2} = 0b010;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b11011100010;
@@ -3199,7 +3199,7 @@ def A4_vcmpweqi : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
"$Pd4 = vcmpw.eq($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_0d8adb {
+tc_a1297125, TypeALU64>, Enc_0d8adb {
let Inst{4-2} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011100000;
@@ -3208,7 +3208,7 @@ def A4_vcmpwgti : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, s8_0Imm:$Ii),
"$Pd4 = vcmpw.gt($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_0d8adb {
+tc_a1297125, TypeALU64>, Enc_0d8adb {
let Inst{4-2} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11011100001;
@@ -3217,7 +3217,7 @@ def A4_vcmpwgtui : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u7_0Imm:$Ii),
"$Pd4 = vcmpw.gtu($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_3680c2 {
+tc_a1297125, TypeALU64>, Enc_3680c2 {
let Inst{4-2} = 0b100;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b11011100010;
@@ -3226,7 +3226,7 @@ def A4_vrmaxh : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrmaxh($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011001;
@@ -3237,7 +3237,7 @@ def A4_vrmaxuh : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrmaxuh($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11001011001;
@@ -3248,7 +3248,7 @@ def A4_vrmaxuw : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrmaxuw($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11001011001;
@@ -3259,7 +3259,7 @@ def A4_vrmaxw : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrmaxw($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011001;
@@ -3270,7 +3270,7 @@ def A4_vrminh : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrminh($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011001;
@@ -3281,7 +3281,7 @@ def A4_vrminuh : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrminuh($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11001011001;
@@ -3292,7 +3292,7 @@ def A4_vrminuw : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrminuw($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11001011001;
@@ -3303,7 +3303,7 @@ def A4_vrminw : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Ru32),
"$Rxx32 = vrminw($Rss32,$Ru32)",
-tc_5b54b33f, TypeS_3op>, Enc_412ff0 {
+tc_788b1d09, TypeS_3op>, Enc_412ff0 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011001;
@@ -3314,7 +3314,7 @@ def A5_ACS : HInst<
(outs DoubleRegs:$Rxx32, PredRegs:$Pe4),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32,$Pe4 = vacsh($Rss32,$Rtt32)",
-tc_d1aa9eaa, TypeM>, Enc_831a7d, Requires<[HasV55]> {
+tc_38e0bae9, TypeM>, Enc_831a7d, Requires<[HasV55]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -3327,7 +3327,7 @@ def A5_vaddhubs : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vaddhub($Rss32,$Rtt32):sat",
-tc_002cb246, TypeS_3op>, Enc_d2216a {
+tc_0dfac0a7, TypeS_3op>, Enc_d2216a {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -3340,7 +3340,7 @@ def A6_vcmpbeq_notany : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = !any8(vcmpb.eq($Rss32,$Rtt32))",
-tc_1fc97744, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> {
+tc_407e96f9, TypeALU64>, Enc_fcf7a7, Requires<[HasV65]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11010010000;
@@ -3349,18 +3349,57 @@ def A6_vminub_RdP : HInst<
(outs DoubleRegs:$Rdd32, PredRegs:$Pe4),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32,$Pe4 = vminub($Rtt32,$Rss32)",
-tc_f9058dd7, TypeM>, Enc_d2c7f1, Requires<[HasV62]> {
+tc_7401744f, TypeM>, Enc_d2c7f1, Requires<[HasV62]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
let isPredicateLate = 1;
let prefersSlot3 = 1;
}
+def A7_clip : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, u5_0Imm:$Ii),
+"$Rd32 = clip($Rs32,#$Ii)",
+tc_407e96f9, TypeS_2op>, Enc_a05677, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b101;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+let hasNewValue = 1;
+let opNewValue = 0;
+}
+def A7_croundd_ri : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
+"$Rdd32 = cround($Rss32,#$Ii)",
+tc_9b3c0462, TypeS_2op>, Enc_5eac98, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{31-21} = 0b10001100111;
+let prefersSlot3 = 1;
+}
+def A7_croundd_rr : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
+"$Rdd32 = cround($Rss32,$Rt32)",
+tc_9b3c0462, TypeS_3op>, Enc_927852, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11000110110;
+let prefersSlot3 = 1;
+}
+def A7_vclip : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
+"$Rdd32 = vclip($Rss32,#$Ii)",
+tc_407e96f9, TypeS_2op>, Enc_7e5a82, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b10001000110;
+}
def C2_all8 : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4),
"$Pd4 = all8($Ps4)",
-tc_de554571, TypeCR>, Enc_65d691 {
+tc_151bf368, TypeCR>, Enc_65d691 {
let Inst{13-2} = 0b000000000000;
let Inst{31-18} = 0b01101011101000;
}
@@ -3368,7 +3407,7 @@ def C2_and : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Pt4, PredRegs:$Ps4),
"$Pd4 = and($Pt4,$Ps4)",
-tc_640086b5, TypeCR>, Enc_454a26 {
+tc_651cbe02, TypeCR>, Enc_454a26 {
let Inst{7-2} = 0b000000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011000000;
@@ -3377,7 +3416,7 @@ def C2_andn : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Pt4, PredRegs:$Ps4),
"$Pd4 = and($Pt4,!$Ps4)",
-tc_640086b5, TypeCR>, Enc_454a26 {
+tc_651cbe02, TypeCR>, Enc_454a26 {
let Inst{7-2} = 0b000000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011011000;
@@ -3386,7 +3425,7 @@ def C2_any8 : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4),
"$Pd4 = any8($Ps4)",
-tc_de554571, TypeCR>, Enc_65d691 {
+tc_151bf368, TypeCR>, Enc_65d691 {
let Inst{13-2} = 0b000000000000;
let Inst{31-18} = 0b01101011100000;
}
@@ -3394,7 +3433,7 @@ def C2_bitsclr : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = bitsclr($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111100;
@@ -3403,7 +3442,7 @@ def C2_bitsclri : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u6_0Imm:$Ii),
"$Pd4 = bitsclr($Rs32,#$Ii)",
-tc_643b4717, TypeS_2op>, Enc_5d6c34 {
+tc_a1297125, TypeS_2op>, Enc_5d6c34 {
let Inst{7-2} = 0b000000;
let Inst{31-21} = 0b10000101100;
}
@@ -3411,7 +3450,7 @@ def C2_bitsset : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = bitsset($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111010;
@@ -3420,7 +3459,7 @@ def C2_ccombinewf : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4) $Rdd32 = combine($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111101000;
@@ -3432,7 +3471,7 @@ def C2_ccombinewnewf : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111101000;
@@ -3445,7 +3484,7 @@ def C2_ccombinewnewt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4.new) $Rdd32 = combine($Rs32,$Rt32)",
-tc_05c070ec, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_442395f3, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11111101000;
@@ -3457,7 +3496,7 @@ def C2_ccombinewt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pu4) $Rdd32 = combine($Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_cb4b4e, PredNewRel {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11111101000;
@@ -3468,7 +3507,7 @@ def C2_cmoveif : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if (!$Pu4) $Rd32 = #$Ii",
-tc_5a2711e5, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b0;
let Inst{20-20} = 0b0;
let Inst{31-23} = 0b011111101;
@@ -3476,9 +3515,9 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_tfrsi";
let CextOpcode = "A2_tfr";
let InputType = "imm";
-let BaseOpcode = "A2_tfrsi";
let isMoveImm = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -3490,16 +3529,16 @@ def C2_cmoveit : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if ($Pu4) $Rd32 = #$Ii",
-tc_5a2711e5, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_713b66bf, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b0;
let Inst{20-20} = 0b0;
let Inst{31-23} = 0b011111100;
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let BaseOpcode = "A2_tfrsi";
let CextOpcode = "A2_tfr";
let InputType = "imm";
-let BaseOpcode = "A2_tfrsi";
let isMoveImm = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -3511,7 +3550,7 @@ def C2_cmovenewif : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if (!$Pu4.new) $Rd32 = #$Ii",
-tc_1ae57e39, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_86173609, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b1;
let Inst{20-20} = 0b0;
let Inst{31-23} = 0b011111101;
@@ -3520,9 +3559,9 @@ let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrsi";
let CextOpcode = "A2_tfr";
let InputType = "imm";
-let BaseOpcode = "A2_tfrsi";
let isMoveImm = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -3534,7 +3573,7 @@ def C2_cmovenewit : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii),
"if ($Pu4.new) $Rd32 = #$Ii",
-tc_1ae57e39, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
+tc_86173609, TypeALU32_2op>, Enc_cda00a, PredNewRel, ImmRegRel {
let Inst{13-13} = 0b1;
let Inst{20-20} = 0b0;
let Inst{31-23} = 0b011111100;
@@ -3542,9 +3581,9 @@ let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
let isPredicatedNew = 1;
+let BaseOpcode = "A2_tfrsi";
let CextOpcode = "A2_tfr";
let InputType = "imm";
-let BaseOpcode = "A2_tfrsi";
let isMoveImm = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -3556,7 +3595,7 @@ def C2_cmpeq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmp.eq($Rs32,$Rt32)",
-tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_9c52f549, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110010000;
@@ -3569,7 +3608,7 @@ def C2_cmpeqi : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = cmp.eq($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_d33e5eee, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
let Inst{4-2} = 0b000;
let Inst{31-22} = 0b0111010100;
let CextOpcode = "C2_cmpeq";
@@ -3585,7 +3624,7 @@ def C2_cmpeqp : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = cmp.eq($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010100;
@@ -3596,7 +3635,7 @@ def C2_cmpgei : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s8_0Imm:$Ii),
"$Pd4 = cmp.ge($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op> {
+tc_d33e5eee, TypeALU32_2op> {
let isCompare = 1;
let isPseudo = 1;
}
@@ -3604,7 +3643,7 @@ def C2_cmpgeui : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u8_0Imm:$Ii),
"$Pd4 = cmp.geu($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op> {
+tc_d33e5eee, TypeALU32_2op> {
let isCompare = 1;
let isPseudo = 1;
}
@@ -3612,7 +3651,7 @@ def C2_cmpgt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmp.gt($Rs32,$Rt32)",
-tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_9c52f549, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110010010;
@@ -3624,7 +3663,7 @@ def C2_cmpgti : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = cmp.gt($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_d33e5eee, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
let Inst{4-2} = 0b000;
let Inst{31-22} = 0b0111010101;
let CextOpcode = "C2_cmpgt";
@@ -3640,7 +3679,7 @@ def C2_cmpgtp : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = cmp.gt($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010100;
@@ -3650,7 +3689,7 @@ def C2_cmpgtu : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmp.gtu($Rs32,$Rt32)",
-tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_9c52f549, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110010011;
@@ -3662,7 +3701,7 @@ def C2_cmpgtui : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u32_0Imm:$Ii),
"$Pd4 = cmp.gtu($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
+tc_d33e5eee, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
let Inst{4-2} = 0b000;
let Inst{31-21} = 0b01110101100;
let CextOpcode = "C2_cmpgtu";
@@ -3678,7 +3717,7 @@ def C2_cmpgtup : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = cmp.gtu($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010100;
@@ -3688,7 +3727,7 @@ def C2_cmplt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmp.lt($Rs32,$Rt32)",
-tc_56f114f4, TypeALU32_3op> {
+tc_d33e5eee, TypeALU32_3op> {
let isCompare = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -3697,7 +3736,7 @@ def C2_cmpltu : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = cmp.ltu($Rs32,$Rt32)",
-tc_56f114f4, TypeALU32_3op> {
+tc_d33e5eee, TypeALU32_3op> {
let isCompare = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -3706,7 +3745,7 @@ def C2_mask : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4),
"$Rdd32 = mask($Pt4)",
-tc_0ae0825c, TypeS_2op>, Enc_78e566 {
+tc_9f6cd987, TypeS_2op>, Enc_78e566 {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b0000;
let Inst{31-16} = 0b1000011000000000;
@@ -3715,7 +3754,7 @@ def C2_mux : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mux($Pu4,$Rs32,$Rt32)",
-tc_4c5ba658, TypeALU32_3op>, Enc_ea4c54 {
+tc_1c2c7a4a, TypeALU32_3op>, Enc_ea4c54 {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110100000;
@@ -3727,7 +3766,7 @@ def C2_muxii : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii, s8_0Imm:$II),
"$Rd32 = mux($Pu4,#$Ii,#$II)",
-tc_4c5ba658, TypeALU32_2op>, Enc_830e5d {
+tc_1c2c7a4a, TypeALU32_2op>, Enc_830e5d {
let Inst{31-25} = 0b0111101;
let hasNewValue = 1;
let opNewValue = 0;
@@ -3741,7 +3780,7 @@ def C2_muxir : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = mux($Pu4,$Rs32,#$Ii)",
-tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f {
+tc_1c2c7a4a, TypeALU32_2op>, Enc_e38e1f {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b011100110;
let hasNewValue = 1;
@@ -3757,7 +3796,7 @@ def C2_muxri : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pu4, s32_0Imm:$Ii, IntRegs:$Rs32),
"$Rd32 = mux($Pu4,#$Ii,$Rs32)",
-tc_4c5ba658, TypeALU32_2op>, Enc_e38e1f {
+tc_1c2c7a4a, TypeALU32_2op>, Enc_e38e1f {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b011100111;
let hasNewValue = 1;
@@ -3773,7 +3812,7 @@ def C2_not : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4),
"$Pd4 = not($Ps4)",
-tc_de554571, TypeCR>, Enc_65d691 {
+tc_151bf368, TypeCR>, Enc_65d691 {
let Inst{13-2} = 0b000000000000;
let Inst{31-18} = 0b01101011110000;
}
@@ -3781,7 +3820,7 @@ def C2_or : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Pt4, PredRegs:$Ps4),
"$Pd4 = or($Pt4,$Ps4)",
-tc_640086b5, TypeCR>, Enc_454a26 {
+tc_651cbe02, TypeCR>, Enc_454a26 {
let Inst{7-2} = 0b000000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011001000;
@@ -3790,7 +3829,7 @@ def C2_orn : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Pt4, PredRegs:$Ps4),
"$Pd4 = or($Pt4,!$Ps4)",
-tc_640086b5, TypeCR>, Enc_454a26 {
+tc_651cbe02, TypeCR>, Enc_454a26 {
let Inst{7-2} = 0b000000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011111000;
@@ -3799,7 +3838,7 @@ def C2_pxfer_map : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4),
"$Pd4 = $Ps4",
-tc_640086b5, TypeMAPPING> {
+tc_651cbe02, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -3807,7 +3846,7 @@ def C2_tfrpr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Ps4),
"$Rd32 = $Ps4",
-tc_0ae0825c, TypeS_2op>, Enc_f5e933 {
+tc_9f6cd987, TypeS_2op>, Enc_f5e933 {
let Inst{13-5} = 0b000000000;
let Inst{31-18} = 0b10001001010000;
let hasNewValue = 1;
@@ -3817,7 +3856,7 @@ def C2_tfrrp : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32),
"$Pd4 = $Rs32",
-tc_cfd8378a, TypeS_2op>, Enc_48b75f {
+tc_55b33fda, TypeS_2op>, Enc_48b75f {
let Inst{13-2} = 0b000000000000;
let Inst{31-21} = 0b10000101010;
}
@@ -3825,7 +3864,7 @@ def C2_vitpack : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Ps4, PredRegs:$Pt4),
"$Rd32 = vitpack($Ps4,$Pt4)",
-tc_4414d8b1, TypeS_2op>, Enc_527412 {
+tc_f34c1c21, TypeS_2op>, Enc_527412 {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b10001001000000;
@@ -3837,7 +3876,7 @@ def C2_vmux : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pu4, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmux($Pu4,$Rss32,$Rtt32)",
-tc_b4b5c03a, TypeALU64>, Enc_329361 {
+tc_6fc5dbea, TypeALU64>, Enc_329361 {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010001000;
@@ -3846,7 +3885,7 @@ def C2_xor : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4),
"$Pd4 = xor($Ps4,$Pt4)",
-tc_640086b5, TypeCR>, Enc_284ebb {
+tc_651cbe02, TypeCR>, Enc_284ebb {
let Inst{7-2} = 0b000000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011010000;
@@ -3855,7 +3894,7 @@ def C4_addipc : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii),
"$Rd32 = add(pc,#$Ii)",
-tc_a813cf9a, TypeCR>, Enc_607661 {
+tc_3edca78f, TypeCR>, Enc_607661 {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0110101001001001;
@@ -3871,7 +3910,7 @@ def C4_and_and : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = and($Ps4,and($Pt4,$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011000100;
@@ -3880,7 +3919,7 @@ def C4_and_andn : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = and($Ps4,and($Pt4,!$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011100100;
@@ -3889,7 +3928,7 @@ def C4_and_or : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = and($Ps4,or($Pt4,$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011001100;
@@ -3898,7 +3937,7 @@ def C4_and_orn : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = and($Ps4,or($Pt4,!$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011101100;
@@ -3907,7 +3946,7 @@ def C4_cmplte : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = !cmp.gt($Rs32,$Rt32)",
-tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_9c52f549, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b000100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110010010;
@@ -3919,7 +3958,7 @@ def C4_cmpltei : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = !cmp.gt($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_d33e5eee, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
let Inst{4-2} = 0b100;
let Inst{31-22} = 0b0111010101;
let CextOpcode = "C4_cmplte";
@@ -3935,7 +3974,7 @@ def C4_cmplteu : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = !cmp.gtu($Rs32,$Rt32)",
-tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_9c52f549, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b000100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110010011;
@@ -3947,7 +3986,7 @@ def C4_cmplteui : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u32_0Imm:$Ii),
"$Pd4 = !cmp.gtu($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
+tc_d33e5eee, TypeALU32_2op>, Enc_c0cdde, ImmRegRel {
let Inst{4-2} = 0b100;
let Inst{31-21} = 0b01110101100;
let CextOpcode = "C4_cmplteu";
@@ -3963,7 +4002,7 @@ def C4_cmpneq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = !cmp.eq($Rs32,$Rt32)",
-tc_de4df740, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
+tc_9c52f549, TypeALU32_3op>, Enc_c2b48e, ImmRegRel {
let Inst{7-2} = 0b000100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110010000;
@@ -3976,7 +4015,7 @@ def C4_cmpneqi : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Pd4 = !cmp.eq($Rs32,#$Ii)",
-tc_56f114f4, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
+tc_d33e5eee, TypeALU32_2op>, Enc_bd0b33, ImmRegRel {
let Inst{4-2} = 0b100;
let Inst{31-22} = 0b0111010100;
let CextOpcode = "C4_cmpneq";
@@ -3992,7 +4031,7 @@ def C4_fastcorner9 : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4),
"$Pd4 = fastcorner9($Ps4,$Pt4)",
-tc_640086b5, TypeCR>, Enc_284ebb {
+tc_651cbe02, TypeCR>, Enc_284ebb {
let Inst{7-2} = 0b100100;
let Inst{13-10} = 0b1000;
let Inst{31-18} = 0b01101011000000;
@@ -4001,7 +4040,7 @@ def C4_fastcorner9_not : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4),
"$Pd4 = !fastcorner9($Ps4,$Pt4)",
-tc_640086b5, TypeCR>, Enc_284ebb {
+tc_651cbe02, TypeCR>, Enc_284ebb {
let Inst{7-2} = 0b100100;
let Inst{13-10} = 0b1000;
let Inst{31-18} = 0b01101011000100;
@@ -4010,7 +4049,7 @@ def C4_nbitsclr : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = !bitsclr($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111101;
@@ -4019,7 +4058,7 @@ def C4_nbitsclri : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u6_0Imm:$Ii),
"$Pd4 = !bitsclr($Rs32,#$Ii)",
-tc_643b4717, TypeS_2op>, Enc_5d6c34 {
+tc_a1297125, TypeS_2op>, Enc_5d6c34 {
let Inst{7-2} = 0b000000;
let Inst{31-21} = 0b10000101101;
}
@@ -4027,7 +4066,7 @@ def C4_nbitsset : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = !bitsset($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111011;
@@ -4036,7 +4075,7 @@ def C4_or_and : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = or($Ps4,and($Pt4,$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011010100;
@@ -4045,7 +4084,7 @@ def C4_or_andn : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = or($Ps4,and($Pt4,!$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011110100;
@@ -4054,7 +4093,7 @@ def C4_or_or : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = or($Ps4,or($Pt4,$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011011100;
@@ -4063,7 +4102,7 @@ def C4_or_orn : HInst<
(outs PredRegs:$Pd4),
(ins PredRegs:$Ps4, PredRegs:$Pt4, PredRegs:$Pu4),
"$Pd4 = or($Ps4,or($Pt4,!$Pu4))",
-tc_b31c2e97, TypeCR>, Enc_9ac432 {
+tc_a7a13fac, TypeCR>, Enc_9ac432 {
let Inst{5-2} = 0b0000;
let Inst{13-10} = 0b0000;
let Inst{31-18} = 0b01101011111100;
@@ -4072,7 +4111,7 @@ def F2_conv_d2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_d2df($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_b9c5fb {
+tc_9783714b, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4082,7 +4121,7 @@ def F2_conv_d2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_d2sf($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000010;
let hasNewValue = 1;
@@ -4094,7 +4133,7 @@ def F2_conv_df2d : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2d($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_b9c5fb {
+tc_9783714b, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4104,7 +4143,7 @@ def F2_conv_df2d_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2d($Rss32):chop",
-tc_3a867367, TypeS_2op>, Enc_b9c5fb {
+tc_9783714b, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4114,7 +4153,7 @@ def F2_conv_df2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2sf($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -4126,7 +4165,7 @@ def F2_conv_df2ud : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2ud($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_b9c5fb {
+tc_9783714b, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4136,7 +4175,7 @@ def F2_conv_df2ud_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_df2ud($Rss32):chop",
-tc_3a867367, TypeS_2op>, Enc_b9c5fb {
+tc_9783714b, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4146,7 +4185,7 @@ def F2_conv_df2uw : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2uw($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -4158,7 +4197,7 @@ def F2_conv_df2uw_chop : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2uw($Rss32):chop",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000101;
let hasNewValue = 1;
@@ -4170,7 +4209,7 @@ def F2_conv_df2w : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2w($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -4182,7 +4221,7 @@ def F2_conv_df2w_chop : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_df2w($Rss32):chop",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000111;
let hasNewValue = 1;
@@ -4194,7 +4233,7 @@ def F2_conv_sf2d : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2d($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4204,7 +4243,7 @@ def F2_conv_sf2d_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2d($Rs32):chop",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4214,7 +4253,7 @@ def F2_conv_sf2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2df($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4224,7 +4263,7 @@ def F2_conv_sf2ud : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2ud($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4234,7 +4273,7 @@ def F2_conv_sf2ud_chop : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_sf2ud($Rs32):chop",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4244,7 +4283,7 @@ def F2_conv_sf2uw : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2uw($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011011;
let hasNewValue = 1;
@@ -4256,7 +4295,7 @@ def F2_conv_sf2uw_chop : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2uw($Rs32):chop",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001011011;
let hasNewValue = 1;
@@ -4268,7 +4307,7 @@ def F2_conv_sf2w : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2w($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011100;
let hasNewValue = 1;
@@ -4280,7 +4319,7 @@ def F2_conv_sf2w_chop : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_sf2w($Rs32):chop",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001011100;
let hasNewValue = 1;
@@ -4292,7 +4331,7 @@ def F2_conv_ud2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = convert_ud2df($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_b9c5fb {
+tc_9783714b, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000000111;
let isFP = 1;
@@ -4302,7 +4341,7 @@ def F2_conv_ud2sf : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = convert_ud2sf($Rss32)",
-tc_3a867367, TypeS_2op>, Enc_90cd8b {
+tc_9783714b, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10001000001;
let hasNewValue = 1;
@@ -4314,7 +4353,7 @@ def F2_conv_uw2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_uw2df($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4324,7 +4363,7 @@ def F2_conv_uw2sf : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_uw2sf($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011001;
let hasNewValue = 1;
@@ -4336,7 +4375,7 @@ def F2_conv_w2df : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = convert_w2df($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_3a3d62 {
+tc_9783714b, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100100;
let isFP = 1;
@@ -4346,7 +4385,7 @@ def F2_conv_w2sf : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = convert_w2sf($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011010;
let hasNewValue = 1;
@@ -4358,7 +4397,7 @@ def F2_dfadd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = dfadd($Rss32,$Rtt32)",
-tc_2f7c551d, TypeM>, Enc_a56825, Requires<[HasV66]> {
+tc_f0e8e832, TypeM>, Enc_a56825, Requires<[HasV66]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -4369,7 +4408,7 @@ def F2_dfclass : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Pd4 = dfclass($Rss32,#$Ii)",
-tc_643b4717, TypeALU64>, Enc_1f19b5 {
+tc_a1297125, TypeALU64>, Enc_1f19b5 {
let Inst{4-2} = 0b100;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b11011100100;
@@ -4380,7 +4419,7 @@ def F2_dfcmpeq : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.eq($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4392,7 +4431,7 @@ def F2_dfcmpge : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.ge($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4404,7 +4443,7 @@ def F2_dfcmpgt : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.gt($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4416,7 +4455,7 @@ def F2_dfcmpuo : HInst<
(outs PredRegs:$Pd4),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Pd4 = dfcmp.uo($Rss32,$Rtt32)",
-tc_85d5d03f, TypeALU64>, Enc_fcf7a7 {
+tc_4a55d03c, TypeALU64>, Enc_fcf7a7 {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010010111;
@@ -4428,7 +4467,7 @@ def F2_dfimm_n : HInst<
(outs DoubleRegs:$Rdd32),
(ins u10_0Imm:$Ii),
"$Rdd32 = dfmake(#$Ii):neg",
-tc_9e313203, TypeALU64>, Enc_e6c957 {
+tc_65279839, TypeALU64>, Enc_e6c957 {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101100101;
let prefersSlot3 = 1;
@@ -4437,16 +4476,84 @@ def F2_dfimm_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins u10_0Imm:$Ii),
"$Rdd32 = dfmake(#$Ii):pos",
-tc_9e313203, TypeALU64>, Enc_e6c957 {
+tc_65279839, TypeALU64>, Enc_e6c957 {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101100100;
let prefersSlot3 = 1;
}
+def F2_dfmax : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = dfmax($Rss32,$Rtt32)",
+tc_9b3c0462, TypeM>, Enc_a56825, Requires<[HasV67]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000001;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_dfmin : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = dfmin($Rss32,$Rtt32)",
+tc_9b3c0462, TypeM>, Enc_a56825, Requires<[HasV67]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let isFP = 1;
+let prefersSlot3 = 1;
+let Uses = [USR];
+}
+def F2_dfmpyfix : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = dfmpyfix($Rss32,$Rtt32)",
+tc_f0e8e832, TypeM>, Enc_a56825, Requires<[HasV67]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000010;
+let isFP = 1;
+let Uses = [USR];
+}
+def F2_dfmpyhh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += dfmpyhh($Rss32,$Rtt32)",
+tc_0a195f2c, TypeM>, Enc_88c16c, Requires<[HasV67]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let isFP = 1;
+let Uses = [USR];
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def F2_dfmpylh : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += dfmpylh($Rss32,$Rtt32)",
+tc_01e1be3b, TypeM>, Enc_88c16c, Requires<[HasV67]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010000;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def F2_dfmpyll : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = dfmpyll($Rss32,$Rtt32)",
+tc_556f6577, TypeM>, Enc_a56825, Requires<[HasV67]> {
+let Inst{7-5} = 0b011;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000101;
+let prefersSlot3 = 1;
+}
def F2_dfsub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = dfsub($Rss32,$Rtt32)",
-tc_2f7c551d, TypeM>, Enc_a56825, Requires<[HasV66]> {
+tc_f0e8e832, TypeM>, Enc_a56825, Requires<[HasV66]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000100;
@@ -4457,7 +4564,7 @@ def F2_sfadd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfadd($Rs32,$Rt32)",
-tc_3b470976, TypeM>, Enc_5ab2be {
+tc_02fe1c65, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011000;
@@ -4471,7 +4578,7 @@ def F2_sfclass : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Pd4 = sfclass($Rs32,#$Ii)",
-tc_643b4717, TypeS_2op>, Enc_83ee64 {
+tc_a1297125, TypeS_2op>, Enc_83ee64 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000101111;
@@ -4482,7 +4589,7 @@ def F2_sfcmpeq : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.eq($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4494,7 +4601,7 @@ def F2_sfcmpge : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.ge($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4506,7 +4613,7 @@ def F2_sfcmpgt : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.gt($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4518,7 +4625,7 @@ def F2_sfcmpuo : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = sfcmp.uo($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111111;
@@ -4530,7 +4637,7 @@ def F2_sffixupd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sffixupd($Rs32,$Rt32)",
-tc_3b470976, TypeM>, Enc_5ab2be {
+tc_02fe1c65, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011110;
@@ -4542,7 +4649,7 @@ def F2_sffixupn : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sffixupn($Rs32,$Rt32)",
-tc_3b470976, TypeM>, Enc_5ab2be {
+tc_02fe1c65, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011110;
@@ -4554,7 +4661,7 @@ def F2_sffixupr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = sffixupr($Rs32)",
-tc_3a867367, TypeS_2op>, Enc_5e2823 {
+tc_9783714b, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001011101;
let hasNewValue = 1;
@@ -4565,7 +4672,7 @@ def F2_sffma : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += sfmpy($Rs32,$Rt32)",
-tc_a58fd5cc, TypeM>, Enc_2ae154 {
+tc_9e72dc89, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4579,7 +4686,7 @@ def F2_sffma_lib : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += sfmpy($Rs32,$Rt32):lib",
-tc_a58fd5cc, TypeM>, Enc_2ae154 {
+tc_9e72dc89, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4593,7 +4700,7 @@ def F2_sffma_sc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32, PredRegs:$Pu4),
"$Rx32 += sfmpy($Rs32,$Rt32,$Pu4):scale",
-tc_4560740b, TypeM>, Enc_437f33 {
+tc_9edb7c77, TypeM>, Enc_437f33 {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111011;
@@ -4607,7 +4714,7 @@ def F2_sffms : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= sfmpy($Rs32,$Rt32)",
-tc_a58fd5cc, TypeM>, Enc_2ae154 {
+tc_9e72dc89, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4621,7 +4728,7 @@ def F2_sffms_lib : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= sfmpy($Rs32,$Rt32):lib",
-tc_a58fd5cc, TypeM>, Enc_2ae154 {
+tc_9e72dc89, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -4635,7 +4742,7 @@ def F2_sfimm_n : HInst<
(outs IntRegs:$Rd32),
(ins u10_0Imm:$Ii),
"$Rd32 = sfmake(#$Ii):neg",
-tc_9e313203, TypeALU64>, Enc_6c9440 {
+tc_65279839, TypeALU64>, Enc_6c9440 {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101011001;
let hasNewValue = 1;
@@ -4646,7 +4753,7 @@ def F2_sfimm_p : HInst<
(outs IntRegs:$Rd32),
(ins u10_0Imm:$Ii),
"$Rd32 = sfmake(#$Ii):pos",
-tc_9e313203, TypeALU64>, Enc_6c9440 {
+tc_65279839, TypeALU64>, Enc_6c9440 {
let Inst{20-16} = 0b00000;
let Inst{31-22} = 0b1101011000;
let hasNewValue = 1;
@@ -4657,7 +4764,7 @@ def F2_sfinvsqrta : HInst<
(outs IntRegs:$Rd32, PredRegs:$Pe4),
(ins IntRegs:$Rs32),
"$Rd32,$Pe4 = sfinvsqrta($Rs32)",
-tc_b8bffe55, TypeS_2op>, Enc_890909 {
+tc_7f7f45f5, TypeS_2op>, Enc_890909 {
let Inst{13-7} = 0b0000000;
let Inst{31-21} = 0b10001011111;
let hasNewValue = 1;
@@ -4669,7 +4776,7 @@ def F2_sfmax : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmax($Rs32,$Rt32)",
-tc_88b4f13d, TypeM>, Enc_5ab2be {
+tc_c20701f0, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011100;
@@ -4683,7 +4790,7 @@ def F2_sfmin : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmin($Rs32,$Rt32)",
-tc_88b4f13d, TypeM>, Enc_5ab2be {
+tc_c20701f0, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011100;
@@ -4697,7 +4804,7 @@ def F2_sfmpy : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfmpy($Rs32,$Rt32)",
-tc_3b470976, TypeM>, Enc_5ab2be {
+tc_02fe1c65, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011010;
@@ -4711,7 +4818,7 @@ def F2_sfrecipa : HInst<
(outs IntRegs:$Rd32, PredRegs:$Pe4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32,$Pe4 = sfrecipa($Rs32,$Rt32)",
-tc_2ff964b4, TypeM>, Enc_a94f3b {
+tc_f7569068, TypeM>, Enc_a94f3b {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011111;
@@ -4724,7 +4831,7 @@ def F2_sfsub : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = sfsub($Rs32,$Rt32)",
-tc_3b470976, TypeM>, Enc_5ab2be {
+tc_02fe1c65, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101011000;
@@ -4737,7 +4844,7 @@ def G4_tfrgcpp : HInst<
(outs DoubleRegs:$Rdd32),
(ins GuestRegs64:$Gss32),
"$Rdd32 = $Gss32",
-tc_0d8f5752, TypeCR>, Enc_0aa344 {
+tc_fae9dfa5, TypeCR>, Enc_0aa344 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01101000001;
}
@@ -4745,7 +4852,7 @@ def G4_tfrgcrr : HInst<
(outs IntRegs:$Rd32),
(ins GuestRegs:$Gs32),
"$Rd32 = $Gs32",
-tc_0d8f5752, TypeCR>, Enc_44271f {
+tc_fae9dfa5, TypeCR>, Enc_44271f {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01101010001;
let hasNewValue = 1;
@@ -4755,7 +4862,7 @@ def G4_tfrgpcp : HInst<
(outs GuestRegs64:$Gdd32),
(ins DoubleRegs:$Rss32),
"$Gdd32 = $Rss32",
-tc_bcf98408, TypeCR>, Enc_ed5027 {
+tc_6ae3426b, TypeCR>, Enc_ed5027 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01100011000;
let hasNewValue = 1;
@@ -4765,7 +4872,7 @@ def G4_tfrgrcr : HInst<
(outs GuestRegs:$Gd32),
(ins IntRegs:$Rs32),
"$Gd32 = $Rs32",
-tc_bcf98408, TypeCR>, Enc_621fba {
+tc_6ae3426b, TypeCR>, Enc_621fba {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b01100010000;
let hasNewValue = 1;
@@ -4775,7 +4882,7 @@ def J2_call : HInst<
(outs),
(ins a30_2Imm:$Ii),
"call $Ii",
-tc_4ae7b58b, TypeJ>, Enc_81ac1d, PredRel {
+tc_44fffc58, TypeJ>, Enc_81ac1d, PredRel {
let Inst{0-0} = 0b0;
let Inst{31-25} = 0b0101101;
let isCall = 1;
@@ -4797,7 +4904,7 @@ def J2_callf : HInst<
(outs),
(ins PredRegs:$Pu4, a30_2Imm:$Ii),
"if (!$Pu4) call $Ii",
-tc_1d81e60e, TypeJ>, Enc_daea09, PredRel {
+tc_69bfb303, TypeJ>, Enc_daea09, PredRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b000;
let Inst{21-21} = 0b1;
@@ -4824,7 +4931,7 @@ def J2_callr : HInst<
(outs),
(ins IntRegs:$Rs32),
"callr $Rs32",
-tc_3bd75825, TypeJ>, Enc_ecbcc8 {
+tc_362b0be2, TypeJ>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01010000101;
let isCall = 1;
@@ -4838,7 +4945,7 @@ def J2_callrf : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) callr $Rs32",
-tc_1ad90acd, TypeJ>, Enc_88d4d9 {
+tc_dc51281d, TypeJ>, Enc_88d4d9 {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b01010001001;
@@ -4856,7 +4963,7 @@ def J2_callrt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) callr $Rs32",
-tc_1ad90acd, TypeJ>, Enc_88d4d9 {
+tc_dc51281d, TypeJ>, Enc_88d4d9 {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b01010001000;
@@ -4873,7 +4980,7 @@ def J2_callt : HInst<
(outs),
(ins PredRegs:$Pu4, a30_2Imm:$Ii),
"if ($Pu4) call $Ii",
-tc_1d81e60e, TypeJ>, Enc_daea09, PredRel {
+tc_69bfb303, TypeJ>, Enc_daea09, PredRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b000;
let Inst{21-21} = 0b0;
@@ -4899,7 +5006,7 @@ def J2_endloop0 : HInst<
(outs),
(ins),
"endloop0",
-tc_1b6f7cec, TypeJ> {
+tc_23708a21, TypeJ> {
let Uses = [LC0, SA0];
let Defs = [LC0, P3, PC, USR];
let isBranch = 1;
@@ -4910,7 +5017,7 @@ def J2_endloop01 : HInst<
(outs),
(ins),
"endloop01",
-tc_1b6f7cec, TypeJ> {
+tc_23708a21, TypeJ> {
let Uses = [LC0, LC1, SA0, SA1];
let Defs = [LC0, LC1, P3, PC, USR];
let isPseudo = 1;
@@ -4919,7 +5026,7 @@ def J2_endloop1 : HInst<
(outs),
(ins),
"endloop1",
-tc_1b6f7cec, TypeJ> {
+tc_23708a21, TypeJ> {
let Uses = [LC1, SA1];
let Defs = [LC1, PC];
let isBranch = 1;
@@ -4930,7 +5037,7 @@ def J2_jump : HInst<
(outs),
(ins b30_2Imm:$Ii),
"jump $Ii",
-tc_ae53734a, TypeJ>, Enc_81ac1d, PredNewRel {
+tc_decdde8a, TypeJ>, Enc_81ac1d, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{31-25} = 0b0101100;
let isTerminator = 1;
@@ -4938,8 +5045,8 @@ let isBranch = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isBarrier = 1;
let isPredicable = 1;
let isExtendable = 1;
@@ -4952,7 +5059,7 @@ def J2_jumpf : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if (!$Pu4) jump:nt $Ii",
-tc_db2bce9c, TypeJ>, Enc_daea09, PredNewRel {
+tc_56a124a7, TypeJ>, Enc_daea09, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b000;
let Inst{21-21} = 0b1;
@@ -4965,8 +5072,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -4978,7 +5085,7 @@ def J2_jumpf_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, b15_2Imm:$Ii),
"if (!$Pu4) jump $Ii",
-tc_db2bce9c, TypeMAPPING>, Requires<[HasV60]> {
+tc_56a124a7, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -4986,7 +5093,7 @@ def J2_jumpfnew : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if (!$Pu4.new) jump:nt $Ii",
-tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
+tc_eeda4109, TypeJ>, Enc_daea09, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b010;
let Inst{21-21} = 0b1;
@@ -5000,8 +5107,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5013,7 +5120,7 @@ def J2_jumpfnewpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if (!$Pu4.new) jump:t $Ii",
-tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
+tc_eeda4109, TypeJ>, Enc_daea09, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b110;
let Inst{21-21} = 0b1;
@@ -5027,8 +5134,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5040,7 +5147,7 @@ def J2_jumpfpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if (!$Pu4) jump:t $Ii",
-tc_cd374165, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
+tc_711c805f, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b100;
let Inst{21-21} = 0b1;
@@ -5053,8 +5160,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5066,7 +5173,7 @@ def J2_jumpr : HInst<
(outs),
(ins IntRegs:$Rs32),
"jumpr $Rs32",
-tc_d5b7b0c1, TypeJ>, Enc_ecbcc8, PredNewRel {
+tc_60e324ff, TypeJ>, Enc_ecbcc8, PredNewRel {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01010010100;
let isTerminator = 1;
@@ -5074,8 +5181,8 @@ let isIndirectBranch = 1;
let isBranch = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isBarrier = 1;
let isPredicable = 1;
}
@@ -5083,7 +5190,7 @@ def J2_jumprf : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr:nt $Rs32",
-tc_85c9c08f, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_2f573607, TypeJ>, Enc_88d4d9, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b01010011011;
@@ -5094,15 +5201,15 @@ let isIndirectBranch = 1;
let isBranch = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprf_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr $Rs32",
-tc_85c9c08f, TypeMAPPING>, Requires<[HasV60]> {
+tc_2f573607, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5110,7 +5217,7 @@ def J2_jumprfnew : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) jumpr:nt $Rs32",
-tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_ed03645c, TypeJ>, Enc_88d4d9, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0010;
let Inst{31-21} = 0b01010011011;
@@ -5122,15 +5229,15 @@ let isBranch = 1;
let isPredicatedNew = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprfnewpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4.new) jumpr:t $Rs32",
-tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_ed03645c, TypeJ>, Enc_88d4d9, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0110;
let Inst{31-21} = 0b01010011011;
@@ -5142,15 +5249,15 @@ let isBranch = 1;
let isPredicatedNew = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprfpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if (!$Pu4) jumpr:t $Rs32",
-tc_e78647bd, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
+tc_42ff66ba, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b01010011011;
@@ -5161,15 +5268,15 @@ let isIndirectBranch = 1;
let isBranch = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprgtez : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32>=#0) jump:nt $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b0;
let Inst{31-22} = 0b0110000101;
@@ -5187,7 +5294,7 @@ def J2_jumprgtezpt : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32>=#0) jump:t $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b1;
let Inst{31-22} = 0b0110000101;
@@ -5205,7 +5312,7 @@ def J2_jumprltez : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32<=#0) jump:nt $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b0;
let Inst{31-22} = 0b0110000111;
@@ -5223,7 +5330,7 @@ def J2_jumprltezpt : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32<=#0) jump:t $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b1;
let Inst{31-22} = 0b0110000111;
@@ -5241,7 +5348,7 @@ def J2_jumprnz : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32==#0) jump:nt $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b0;
let Inst{31-22} = 0b0110000110;
@@ -5259,7 +5366,7 @@ def J2_jumprnzpt : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32==#0) jump:t $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b1;
let Inst{31-22} = 0b0110000110;
@@ -5277,7 +5384,7 @@ def J2_jumprt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr:nt $Rs32",
-tc_85c9c08f, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_2f573607, TypeJ>, Enc_88d4d9, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b01010011010;
@@ -5287,15 +5394,15 @@ let isIndirectBranch = 1;
let isBranch = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprt_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr $Rs32",
-tc_85c9c08f, TypeMAPPING>, Requires<[HasV60]> {
+tc_2f573607, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5303,7 +5410,7 @@ def J2_jumprtnew : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) jumpr:nt $Rs32",
-tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_ed03645c, TypeJ>, Enc_88d4d9, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0010;
let Inst{31-21} = 0b01010011010;
@@ -5314,15 +5421,15 @@ let isBranch = 1;
let isPredicatedNew = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprtnewpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4.new) jumpr:t $Rs32",
-tc_b51dc29a, TypeJ>, Enc_88d4d9, PredNewRel {
+tc_ed03645c, TypeJ>, Enc_88d4d9, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0110;
let Inst{31-21} = 0b01010011010;
@@ -5333,15 +5440,15 @@ let isBranch = 1;
let isPredicatedNew = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprtpt : HInst<
(outs),
(ins PredRegs:$Pu4, IntRegs:$Rs32),
"if ($Pu4) jumpr:t $Rs32",
-tc_e78647bd, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
+tc_42ff66ba, TypeJ>, Enc_88d4d9, Requires<[HasV60]>, PredNewRel {
let Inst{7-0} = 0b00000000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b01010011010;
@@ -5351,15 +5458,15 @@ let isIndirectBranch = 1;
let isBranch = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "reg";
let BaseOpcode = "J2_jumpr";
+let InputType = "reg";
let isTaken = Inst{12};
}
def J2_jumprz : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32!=#0) jump:nt $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b0;
let Inst{31-22} = 0b0110000100;
@@ -5377,7 +5484,7 @@ def J2_jumprzpt : HInst<
(outs),
(ins IntRegs:$Rs32, b13_2Imm:$Ii),
"if ($Rs32!=#0) jump:t $Ii",
-tc_d9d43ecb, TypeCR>, Enc_0fa531 {
+tc_57a55b54, TypeCR>, Enc_0fa531 {
let Inst{0-0} = 0b0;
let Inst{12-12} = 0b1;
let Inst{31-22} = 0b0110000100;
@@ -5395,7 +5502,7 @@ def J2_jumpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if ($Pu4) jump:nt $Ii",
-tc_db2bce9c, TypeJ>, Enc_daea09, PredNewRel {
+tc_56a124a7, TypeJ>, Enc_daea09, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b000;
let Inst{21-21} = 0b0;
@@ -5407,8 +5514,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5420,7 +5527,7 @@ def J2_jumpt_nopred_map : HInst<
(outs),
(ins PredRegs:$Pu4, b15_2Imm:$Ii),
"if ($Pu4) jump $Ii",
-tc_db2bce9c, TypeMAPPING>, Requires<[HasV60]> {
+tc_56a124a7, TypeMAPPING>, Requires<[HasV60]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -5428,7 +5535,7 @@ def J2_jumptnew : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if ($Pu4.new) jump:nt $Ii",
-tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
+tc_eeda4109, TypeJ>, Enc_daea09, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b010;
let Inst{21-21} = 0b0;
@@ -5441,8 +5548,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5454,7 +5561,7 @@ def J2_jumptnewpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if ($Pu4.new) jump:t $Ii",
-tc_20cdee80, TypeJ>, Enc_daea09, PredNewRel {
+tc_eeda4109, TypeJ>, Enc_daea09, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b110;
let Inst{21-21} = 0b0;
@@ -5467,8 +5574,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5480,7 +5587,7 @@ def J2_jumptpt : HInst<
(outs),
(ins PredRegs:$Pu4, b30_2Imm:$Ii),
"if ($Pu4) jump:t $Ii",
-tc_cd374165, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
+tc_711c805f, TypeJ>, Enc_daea09, Requires<[HasV60]>, PredNewRel {
let Inst{0-0} = 0b0;
let Inst{12-10} = 0b100;
let Inst{21-21} = 0b0;
@@ -5492,8 +5599,8 @@ let cofRelax1 = 1;
let cofRelax2 = 1;
let cofMax1 = 1;
let Defs = [PC];
-let InputType = "imm";
let BaseOpcode = "J2_jump";
+let InputType = "imm";
let isTaken = Inst{12};
let isExtendable = 1;
let opExtendable = 1;
@@ -5505,7 +5612,7 @@ def J2_loop0i : HInst<
(outs),
(ins b30_2Imm:$Ii, u10_0Imm:$II),
"loop0($Ii,#$II)",
-tc_a9d88b22, TypeCR>, Enc_4dc228 {
+tc_1248597c, TypeCR>, Enc_4dc228 {
let Inst{2-2} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01101001000;
@@ -5522,7 +5629,7 @@ def J2_loop0r : HInst<
(outs),
(ins b30_2Imm:$Ii, IntRegs:$Rs32),
"loop0($Ii,$Rs32)",
-tc_df3319ed, TypeCR>, Enc_864a5a {
+tc_9406230a, TypeCR>, Enc_864a5a {
let Inst{2-0} = 0b000;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5540,7 +5647,7 @@ def J2_loop1i : HInst<
(outs),
(ins b30_2Imm:$Ii, u10_0Imm:$II),
"loop1($Ii,#$II)",
-tc_a9d88b22, TypeCR>, Enc_4dc228 {
+tc_1248597c, TypeCR>, Enc_4dc228 {
let Inst{2-2} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01101001001;
@@ -5557,7 +5664,7 @@ def J2_loop1r : HInst<
(outs),
(ins b30_2Imm:$Ii, IntRegs:$Rs32),
"loop1($Ii,$Rs32)",
-tc_df3319ed, TypeCR>, Enc_864a5a {
+tc_9406230a, TypeCR>, Enc_864a5a {
let Inst{2-0} = 0b000;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5575,7 +5682,7 @@ def J2_pause : HInst<
(outs),
(ins u8_0Imm:$Ii),
"pause(#$Ii)",
-tc_8d9d0154, TypeJ>, Enc_a51a9a {
+tc_d57d649c, TypeJ>, Enc_a51a9a {
let Inst{1-0} = 0b00;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5586,7 +5693,7 @@ def J2_ploop1si : HInst<
(outs),
(ins b30_2Imm:$Ii, u10_0Imm:$II),
"p3 = sp1loop0($Ii,#$II)",
-tc_1c4528a2, TypeCR>, Enc_4dc228 {
+tc_4abdbdc6, TypeCR>, Enc_4dc228 {
let Inst{2-2} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01101001101;
@@ -5604,7 +5711,7 @@ def J2_ploop1sr : HInst<
(outs),
(ins b30_2Imm:$Ii, IntRegs:$Rs32),
"p3 = sp1loop0($Ii,$Rs32)",
-tc_32779c6f, TypeCR>, Enc_864a5a {
+tc_6d861a95, TypeCR>, Enc_864a5a {
let Inst{2-0} = 0b000;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5623,7 +5730,7 @@ def J2_ploop2si : HInst<
(outs),
(ins b30_2Imm:$Ii, u10_0Imm:$II),
"p3 = sp2loop0($Ii,#$II)",
-tc_1c4528a2, TypeCR>, Enc_4dc228 {
+tc_4abdbdc6, TypeCR>, Enc_4dc228 {
let Inst{2-2} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01101001110;
@@ -5641,7 +5748,7 @@ def J2_ploop2sr : HInst<
(outs),
(ins b30_2Imm:$Ii, IntRegs:$Rs32),
"p3 = sp2loop0($Ii,$Rs32)",
-tc_32779c6f, TypeCR>, Enc_864a5a {
+tc_6d861a95, TypeCR>, Enc_864a5a {
let Inst{2-0} = 0b000;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5660,7 +5767,7 @@ def J2_ploop3si : HInst<
(outs),
(ins b30_2Imm:$Ii, u10_0Imm:$II),
"p3 = sp3loop0($Ii,#$II)",
-tc_1c4528a2, TypeCR>, Enc_4dc228 {
+tc_4abdbdc6, TypeCR>, Enc_4dc228 {
let Inst{2-2} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01101001111;
@@ -5678,7 +5785,7 @@ def J2_ploop3sr : HInst<
(outs),
(ins b30_2Imm:$Ii, IntRegs:$Rs32),
"p3 = sp3loop0($Ii,$Rs32)",
-tc_32779c6f, TypeCR>, Enc_864a5a {
+tc_6d861a95, TypeCR>, Enc_864a5a {
let Inst{2-0} = 0b000;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5697,7 +5804,7 @@ def J2_trap0 : HInst<
(outs),
(ins u8_0Imm:$Ii),
"trap0(#$Ii)",
-tc_fc3999b4, TypeJ>, Enc_a51a9a {
+tc_45f9d1be, TypeJ>, Enc_a51a9a {
let Inst{1-0} = 0b00;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5709,7 +5816,7 @@ def J2_trap1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, u8_0Imm:$Ii),
"trap1($Rx32,#$Ii)",
-tc_b9e09e03, TypeJ>, Enc_33f8ba {
+tc_53c851ab, TypeJ>, Enc_33f8ba, Requires<[HasV65]> {
let Inst{1-0} = 0b00;
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
@@ -5726,7 +5833,7 @@ def J2_trap1_noregmap : HInst<
(outs),
(ins u8_0Imm:$Ii),
"trap1(#$Ii)",
-tc_b9e09e03, TypeMAPPING> {
+tc_53c851ab, TypeMAPPING>, Requires<[HasV65]> {
let hasSideEffects = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -5735,7 +5842,7 @@ def J4_cmpeq_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (!cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -5761,7 +5868,7 @@ def J4_cmpeq_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (!cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -5787,7 +5894,7 @@ def J4_cmpeq_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001010001;
@@ -5813,7 +5920,7 @@ def J4_cmpeq_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b10;
let Inst{31-22} = 0b0001010001;
@@ -5839,7 +5946,7 @@ def J4_cmpeq_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-22} = 0b0001010001;
@@ -5865,7 +5972,7 @@ def J4_cmpeq_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b11;
let Inst{31-22} = 0b0001010001;
@@ -5891,7 +5998,7 @@ def J4_cmpeq_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (cmp.eq($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -5916,7 +6023,7 @@ def J4_cmpeq_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (cmp.eq($Ns8.new,$Rt32)) jump:t $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -5941,7 +6048,7 @@ def J4_cmpeq_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001010000;
@@ -5966,7 +6073,7 @@ def J4_cmpeq_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,$Rt16); if (p0.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b10;
let Inst{31-22} = 0b0001010000;
@@ -5991,7 +6098,7 @@ def J4_cmpeq_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-22} = 0b0001010000;
@@ -6016,7 +6123,7 @@ def J4_cmpeq_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,$Rt16); if (p1.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b11;
let Inst{31-22} = 0b0001010000;
@@ -6041,7 +6148,7 @@ def J4_cmpeqi_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (!cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -6067,7 +6174,7 @@ def J4_cmpeqi_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (!cmp.eq($Ns8.new,#$II)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -6093,7 +6200,7 @@ def J4_cmpeqi_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001000001;
@@ -6119,7 +6226,7 @@ def J4_cmpeqi_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$II); if (!p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001000001;
@@ -6145,7 +6252,7 @@ def J4_cmpeqi_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001001001;
@@ -6171,7 +6278,7 @@ def J4_cmpeqi_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$II); if (!p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001001001;
@@ -6197,7 +6304,7 @@ def J4_cmpeqi_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (cmp.eq($Ns8.new,#$II)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -6222,7 +6329,7 @@ def J4_cmpeqi_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (cmp.eq($Ns8.new,#$II)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -6247,7 +6354,7 @@ def J4_cmpeqi_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001000000;
@@ -6272,7 +6379,7 @@ def J4_cmpeqi_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$II); if (p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001000000;
@@ -6297,7 +6404,7 @@ def J4_cmpeqi_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001001000;
@@ -6322,7 +6429,7 @@ def J4_cmpeqi_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$II); if (p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001001000;
@@ -6347,7 +6454,7 @@ def J4_cmpeqn1_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (!cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_e90a15, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_e90a15, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{19-19} = 0b0;
@@ -6373,7 +6480,7 @@ def J4_cmpeqn1_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (!cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_5a18b3, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_5a18b3, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{19-19} = 0b0;
@@ -6399,7 +6506,7 @@ def J4_cmpeqn1_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_1de724, PredRel {
+tc_24f426ab, TypeCJ>, Enc_1de724, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{31-22} = 0b0001000111;
@@ -6425,7 +6532,7 @@ def J4_cmpeqn1_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$n1); if (!p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14640c, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14640c, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{31-22} = 0b0001000111;
@@ -6451,7 +6558,7 @@ def J4_cmpeqn1_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_668704, PredRel {
+tc_24f426ab, TypeCJ>, Enc_668704, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{31-22} = 0b0001001111;
@@ -6477,7 +6584,7 @@ def J4_cmpeqn1_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$n1); if (!p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_800e04, PredRel {
+tc_24f426ab, TypeCJ>, Enc_800e04, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{31-22} = 0b0001001111;
@@ -6503,7 +6610,7 @@ def J4_cmpeqn1_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (cmp.eq($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_4aca3a, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_4aca3a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{19-19} = 0b0;
@@ -6528,7 +6635,7 @@ def J4_cmpeqn1_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (cmp.eq($Ns8.new,#$n1)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_f7ea77, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_f7ea77, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{19-19} = 0b0;
@@ -6553,7 +6660,7 @@ def J4_cmpeqn1_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_405228, PredRel {
+tc_24f426ab, TypeCJ>, Enc_405228, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{31-22} = 0b0001000110;
@@ -6578,7 +6685,7 @@ def J4_cmpeqn1_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.eq($Rs16,#$n1); if (p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_3a2484, PredRel {
+tc_24f426ab, TypeCJ>, Enc_3a2484, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{31-22} = 0b0001000110;
@@ -6603,7 +6710,7 @@ def J4_cmpeqn1_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_736575, PredRel {
+tc_24f426ab, TypeCJ>, Enc_736575, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{31-22} = 0b0001001110;
@@ -6628,7 +6735,7 @@ def J4_cmpeqn1_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.eq($Rs16,#$n1); if (p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_8e583a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_8e583a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{31-22} = 0b0001001110;
@@ -6653,7 +6760,7 @@ def J4_cmpgt_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (!cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -6679,7 +6786,7 @@ def J4_cmpgt_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (!cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -6705,7 +6812,7 @@ def J4_cmpgt_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001010011;
@@ -6731,7 +6838,7 @@ def J4_cmpgt_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b10;
let Inst{31-22} = 0b0001010011;
@@ -6757,7 +6864,7 @@ def J4_cmpgt_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-22} = 0b0001010011;
@@ -6783,7 +6890,7 @@ def J4_cmpgt_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b11;
let Inst{31-22} = 0b0001010011;
@@ -6809,7 +6916,7 @@ def J4_cmpgt_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (cmp.gt($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -6834,7 +6941,7 @@ def J4_cmpgt_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (cmp.gt($Ns8.new,$Rt32)) jump:t $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -6859,7 +6966,7 @@ def J4_cmpgt_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001010010;
@@ -6884,7 +6991,7 @@ def J4_cmpgt_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,$Rt16); if (p0.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b10;
let Inst{31-22} = 0b0001010010;
@@ -6909,7 +7016,7 @@ def J4_cmpgt_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-22} = 0b0001010010;
@@ -6934,7 +7041,7 @@ def J4_cmpgt_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,$Rt16); if (p1.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b11;
let Inst{31-22} = 0b0001010010;
@@ -6959,7 +7066,7 @@ def J4_cmpgti_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (!cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -6985,7 +7092,7 @@ def J4_cmpgti_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (!cmp.gt($Ns8.new,#$II)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -7011,7 +7118,7 @@ def J4_cmpgti_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001000011;
@@ -7037,7 +7144,7 @@ def J4_cmpgti_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$II); if (!p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001000011;
@@ -7063,7 +7170,7 @@ def J4_cmpgti_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001001011;
@@ -7089,7 +7196,7 @@ def J4_cmpgti_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$II); if (!p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001001011;
@@ -7115,7 +7222,7 @@ def J4_cmpgti_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (cmp.gt($Ns8.new,#$II)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -7140,7 +7247,7 @@ def J4_cmpgti_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (cmp.gt($Ns8.new,#$II)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -7165,7 +7272,7 @@ def J4_cmpgti_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001000010;
@@ -7190,7 +7297,7 @@ def J4_cmpgti_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$II); if (p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001000010;
@@ -7215,7 +7322,7 @@ def J4_cmpgti_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001001010;
@@ -7240,7 +7347,7 @@ def J4_cmpgti_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$II); if (p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001001010;
@@ -7265,7 +7372,7 @@ def J4_cmpgtn1_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (!cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_3694bd, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_3694bd, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{19-19} = 0b0;
@@ -7291,7 +7398,7 @@ def J4_cmpgtn1_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (!cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_a6853f, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_a6853f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{19-19} = 0b0;
@@ -7317,7 +7424,7 @@ def J4_cmpgtn1_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_a42857, PredRel {
+tc_24f426ab, TypeCJ>, Enc_a42857, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000001;
let Inst{31-22} = 0b0001000111;
@@ -7343,7 +7450,7 @@ def J4_cmpgtn1_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$n1); if (!p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_f6fe0b, PredRel {
+tc_24f426ab, TypeCJ>, Enc_f6fe0b, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100001;
let Inst{31-22} = 0b0001000111;
@@ -7369,7 +7476,7 @@ def J4_cmpgtn1_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_3e3989, PredRel {
+tc_24f426ab, TypeCJ>, Enc_3e3989, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000001;
let Inst{31-22} = 0b0001001111;
@@ -7395,7 +7502,7 @@ def J4_cmpgtn1_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$n1); if (!p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_b909d2, PredRel {
+tc_24f426ab, TypeCJ>, Enc_b909d2, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100001;
let Inst{31-22} = 0b0001001111;
@@ -7421,7 +7528,7 @@ def J4_cmpgtn1_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (cmp.gt($Ns8.new,#$n1)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_f82302, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_f82302, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{19-19} = 0b0;
@@ -7446,7 +7553,7 @@ def J4_cmpgtn1_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, n1Const:$n1, b30_2Imm:$Ii),
"if (cmp.gt($Ns8.new,#$n1)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_6413b6, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_6413b6, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{19-19} = 0b0;
@@ -7471,7 +7578,7 @@ def J4_cmpgtn1_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_b78edd, PredRel {
+tc_24f426ab, TypeCJ>, Enc_b78edd, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000001;
let Inst{31-22} = 0b0001000110;
@@ -7496,7 +7603,7 @@ def J4_cmpgtn1_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p0 = cmp.gt($Rs16,#$n1); if (p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_041d7b, PredRel {
+tc_24f426ab, TypeCJ>, Enc_041d7b, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100001;
let Inst{31-22} = 0b0001000110;
@@ -7521,7 +7628,7 @@ def J4_cmpgtn1_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_b1e1fb, PredRel {
+tc_24f426ab, TypeCJ>, Enc_b1e1fb, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000001;
let Inst{31-22} = 0b0001001110;
@@ -7546,7 +7653,7 @@ def J4_cmpgtn1_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, n1Const:$n1, b30_2Imm:$Ii),
"p1 = cmp.gt($Rs16,#$n1); if (p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_178717, PredRel {
+tc_24f426ab, TypeCJ>, Enc_178717, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100001;
let Inst{31-22} = 0b0001001110;
@@ -7571,7 +7678,7 @@ def J4_cmpgtu_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (!cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -7597,7 +7704,7 @@ def J4_cmpgtu_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (!cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -7623,7 +7730,7 @@ def J4_cmpgtu_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001010101;
@@ -7649,7 +7756,7 @@ def J4_cmpgtu_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,$Rt16); if (!p0.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b10;
let Inst{31-22} = 0b0001010101;
@@ -7675,7 +7782,7 @@ def J4_cmpgtu_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-22} = 0b0001010101;
@@ -7701,7 +7808,7 @@ def J4_cmpgtu_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,$Rt16); if (!p1.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b11;
let Inst{31-22} = 0b0001010101;
@@ -7727,7 +7834,7 @@ def J4_cmpgtu_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (cmp.gtu($Ns8.new,$Rt32)) jump:nt $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -7752,7 +7859,7 @@ def J4_cmpgtu_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, IntRegs:$Rt32, b30_2Imm:$Ii),
"if (cmp.gtu($Ns8.new,$Rt32)) jump:t $Ii",
-tc_9bfd761f, TypeNCJ>, Enc_c9a18e, PredRel {
+tc_24e109c7, TypeNCJ>, Enc_c9a18e, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -7777,7 +7884,7 @@ def J4_cmpgtu_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001010100;
@@ -7802,7 +7909,7 @@ def J4_cmpgtu_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,$Rt16); if (p0.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b10;
let Inst{31-22} = 0b0001010100;
@@ -7827,7 +7934,7 @@ def J4_cmpgtu_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:nt $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-22} = 0b0001010100;
@@ -7852,7 +7959,7 @@ def J4_cmpgtu_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, GeneralSubRegs:$Rt16, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,$Rt16); if (p1.new) jump:t $Ii",
-tc_56336eb0, TypeCJ>, Enc_6a5972, PredRel {
+tc_9e27f2f9, TypeCJ>, Enc_6a5972, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b11;
let Inst{31-22} = 0b0001010100;
@@ -7877,7 +7984,7 @@ def J4_cmpgtui_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (!cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -7903,7 +8010,7 @@ def J4_cmpgtui_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (!cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -7929,7 +8036,7 @@ def J4_cmpgtui_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001000101;
@@ -7955,7 +8062,7 @@ def J4_cmpgtui_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,#$II); if (!p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001000101;
@@ -7981,7 +8088,7 @@ def J4_cmpgtui_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001001101;
@@ -8007,7 +8114,7 @@ def J4_cmpgtui_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,#$II); if (!p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001001101;
@@ -8033,7 +8140,7 @@ def J4_cmpgtui_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (cmp.gtu($Ns8.new,#$II)) jump:nt $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -8058,7 +8165,7 @@ def J4_cmpgtui_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, u5_0Imm:$II, b30_2Imm:$Ii),
"if (cmp.gtu($Ns8.new,#$II)) jump:t $Ii",
-tc_bd8382d1, TypeNCJ>, Enc_eafd18, PredRel {
+tc_f6e2aff9, TypeNCJ>, Enc_eafd18, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -8083,7 +8190,7 @@ def J4_cmpgtui_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001000100;
@@ -8108,7 +8215,7 @@ def J4_cmpgtui_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p0 = cmp.gtu($Rs16,#$II); if (p0.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001000100;
@@ -8133,7 +8240,7 @@ def J4_cmpgtui_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:nt $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-22} = 0b0001001100;
@@ -8158,7 +8265,7 @@ def J4_cmpgtui_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u5_0Imm:$II, b30_2Imm:$Ii),
"p1 = cmp.gtu($Rs16,#$II); if (p1.new) jump:t $Ii",
-tc_3d495a39, TypeCJ>, Enc_14d27a, PredRel {
+tc_24f426ab, TypeCJ>, Enc_14d27a, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-22} = 0b0001001100;
@@ -8183,7 +8290,7 @@ def J4_cmplt_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (!cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -8209,7 +8316,7 @@ def J4_cmplt_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (!cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -8235,7 +8342,7 @@ def J4_cmplt_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (cmp.gt($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -8260,7 +8367,7 @@ def J4_cmplt_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (cmp.gt($Rt32,$Ns8.new)) jump:t $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -8285,7 +8392,7 @@ def J4_cmpltu_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (!cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -8311,7 +8418,7 @@ def J4_cmpltu_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (!cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -8337,7 +8444,7 @@ def J4_cmpltu_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (cmp.gtu($Rt32,$Ns8.new)) jump:nt $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b0;
let Inst{19-19} = 0b0;
@@ -8362,7 +8469,7 @@ def J4_cmpltu_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Rt32, IntRegs:$Ns8, b30_2Imm:$Ii),
"if (cmp.gtu($Rt32,$Ns8.new)) jump:t $Ii",
-tc_b343892a, TypeNCJ>, Enc_5de85f, PredRel {
+tc_975a4e54, TypeNCJ>, Enc_5de85f, PredRel {
let Inst{0-0} = 0b0;
let Inst{13-13} = 0b1;
let Inst{19-19} = 0b0;
@@ -8387,7 +8494,7 @@ def J4_hintjumpr : HInst<
(outs),
(ins IntRegs:$Rs32),
"hintjr($Rs32)",
-tc_d5b7b0c1, TypeJ>, Enc_ecbcc8 {
+tc_60e324ff, TypeJ>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01010010101;
let isTerminator = 1;
@@ -8399,7 +8506,7 @@ def J4_jumpseti : HInst<
(outs GeneralSubRegs:$Rd16),
(ins u6_0Imm:$II, b30_2Imm:$Ii),
"$Rd16 = #$II ; jump $Ii",
-tc_0663f615, TypeCJ>, Enc_9e4c3f {
+tc_5502c366, TypeCJ>, Enc_9e4c3f {
let Inst{0-0} = 0b0;
let Inst{31-22} = 0b0001011000;
let hasNewValue = 1;
@@ -8419,7 +8526,7 @@ def J4_jumpsetr : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"$Rd16 = $Rs16 ; jump $Ii",
-tc_0663f615, TypeCJ>, Enc_66bce1 {
+tc_5502c366, TypeCJ>, Enc_66bce1 {
let Inst{0-0} = 0b0;
let Inst{13-12} = 0b00;
let Inst{31-22} = 0b0001011100;
@@ -8440,7 +8547,7 @@ def J4_tstbit0_f_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, b30_2Imm:$Ii),
"if (!tstbit($Ns8.new,#0)) jump:nt $Ii",
-tc_8c945be0, TypeNCJ>, Enc_69d63b {
+tc_7b9187d3, TypeNCJ>, Enc_69d63b {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{19-19} = 0b0;
@@ -8465,7 +8572,7 @@ def J4_tstbit0_f_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, b30_2Imm:$Ii),
"if (!tstbit($Ns8.new,#0)) jump:t $Ii",
-tc_8c945be0, TypeNCJ>, Enc_69d63b {
+tc_7b9187d3, TypeNCJ>, Enc_69d63b {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{19-19} = 0b0;
@@ -8490,7 +8597,7 @@ def J4_tstbit0_fp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p0 = tstbit($Rs16,#0); if (!p0.new) jump:nt $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000011;
let Inst{31-22} = 0b0001000111;
@@ -8515,7 +8622,7 @@ def J4_tstbit0_fp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p0 = tstbit($Rs16,#0); if (!p0.new) jump:t $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100011;
let Inst{31-22} = 0b0001000111;
@@ -8540,7 +8647,7 @@ def J4_tstbit0_fp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p1 = tstbit($Rs16,#0); if (!p1.new) jump:nt $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000011;
let Inst{31-22} = 0b0001001111;
@@ -8565,7 +8672,7 @@ def J4_tstbit0_fp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p1 = tstbit($Rs16,#0); if (!p1.new) jump:t $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100011;
let Inst{31-22} = 0b0001001111;
@@ -8590,7 +8697,7 @@ def J4_tstbit0_t_jumpnv_nt : HInst<
(outs),
(ins IntRegs:$Ns8, b30_2Imm:$Ii),
"if (tstbit($Ns8.new,#0)) jump:nt $Ii",
-tc_8c945be0, TypeNCJ>, Enc_69d63b {
+tc_7b9187d3, TypeNCJ>, Enc_69d63b {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000000;
let Inst{19-19} = 0b0;
@@ -8614,7 +8721,7 @@ def J4_tstbit0_t_jumpnv_t : HInst<
(outs),
(ins IntRegs:$Ns8, b30_2Imm:$Ii),
"if (tstbit($Ns8.new,#0)) jump:t $Ii",
-tc_8c945be0, TypeNCJ>, Enc_69d63b {
+tc_7b9187d3, TypeNCJ>, Enc_69d63b {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100000;
let Inst{19-19} = 0b0;
@@ -8638,7 +8745,7 @@ def J4_tstbit0_tp0_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p0 = tstbit($Rs16,#0); if (p0.new) jump:nt $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000011;
let Inst{31-22} = 0b0001000110;
@@ -8662,7 +8769,7 @@ def J4_tstbit0_tp0_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p0 = tstbit($Rs16,#0); if (p0.new) jump:t $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100011;
let Inst{31-22} = 0b0001000110;
@@ -8686,7 +8793,7 @@ def J4_tstbit0_tp1_jump_nt : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p1 = tstbit($Rs16,#0); if (p1.new) jump:nt $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b000011;
let Inst{31-22} = 0b0001001110;
@@ -8710,7 +8817,7 @@ def J4_tstbit0_tp1_jump_t : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, b30_2Imm:$Ii),
"p1 = tstbit($Rs16,#0); if (p1.new) jump:t $Ii",
-tc_2332b92e, TypeCJ>, Enc_ad1c74 {
+tc_f999c66e, TypeCJ>, Enc_ad1c74 {
let Inst{0-0} = 0b0;
let Inst{13-8} = 0b100011;
let Inst{31-22} = 0b0001001110;
@@ -8734,7 +8841,7 @@ def L2_deallocframe : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = deallocframe($Rs32):raw",
-tc_15aa71c5, TypeLD>, Enc_3a3d62 {
+tc_e9170fb7, TypeLD>, Enc_3a3d62 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10010000000;
let accessSize = DoubleWordAccess;
@@ -8746,7 +8853,7 @@ def L2_loadalignb_io : HInst<
(outs DoubleRegs:$Ryy32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s32_0Imm:$Ii),
"$Ryy32 = memb_fifo($Rs32+#$Ii)",
-tc_5ef37dc4, TypeLD>, Enc_a27588 {
+tc_fedb7e19, TypeLD>, Enc_a27588 {
let Inst{24-21} = 0b0100;
let Inst{31-27} = 0b10010;
let addrMode = BaseImmOffset;
@@ -8763,7 +8870,7 @@ def L2_loadalignb_pbr : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
"$Ryy32 = memb_fifo($Rx32++$Mu2:brev)",
-tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
+tc_1c7522a8, TypeLD>, Enc_1f5d8f {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011110100;
let addrMode = PostInc;
@@ -8775,7 +8882,7 @@ def L2_loadalignb_pci : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
"$Ryy32 = memb_fifo($Rx32++#$Ii:circ($Mu2))",
-tc_785f65a7, TypeLD>, Enc_74aef2 {
+tc_76bb5435, TypeLD>, Enc_74aef2 {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011000100;
let addrMode = PostInc;
@@ -8788,7 +8895,7 @@ def L2_loadalignb_pcr : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
"$Ryy32 = memb_fifo($Rx32++I:circ($Mu2))",
-tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
+tc_1c7522a8, TypeLD>, Enc_1f5d8f {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011000100;
let addrMode = PostInc;
@@ -8801,7 +8908,7 @@ def L2_loadalignb_pi : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_0Imm:$Ii),
"$Ryy32 = memb_fifo($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_6b197f {
+tc_1c7522a8, TypeLD>, Enc_6b197f {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011010100;
let addrMode = PostInc;
@@ -8813,7 +8920,7 @@ def L2_loadalignb_pr : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
"$Ryy32 = memb_fifo($Rx32++$Mu2)",
-tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
+tc_1c7522a8, TypeLD>, Enc_1f5d8f {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011100100;
let addrMode = PostInc;
@@ -8825,7 +8932,7 @@ def L2_loadalignb_zomap : HInst<
(outs DoubleRegs:$Ryy32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
"$Ryy32 = memb_fifo($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let Constraints = "$Ryy32 = $Ryy32in";
@@ -8834,7 +8941,7 @@ def L2_loadalignh_io : HInst<
(outs DoubleRegs:$Ryy32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32, s31_1Imm:$Ii),
"$Ryy32 = memh_fifo($Rs32+#$Ii)",
-tc_5ef37dc4, TypeLD>, Enc_5cd7e9 {
+tc_fedb7e19, TypeLD>, Enc_5cd7e9 {
let Inst{24-21} = 0b0010;
let Inst{31-27} = 0b10010;
let addrMode = BaseImmOffset;
@@ -8851,7 +8958,7 @@ def L2_loadalignh_pbr : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
"$Ryy32 = memh_fifo($Rx32++$Mu2:brev)",
-tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
+tc_1c7522a8, TypeLD>, Enc_1f5d8f {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011110010;
let addrMode = PostInc;
@@ -8863,7 +8970,7 @@ def L2_loadalignh_pci : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
"$Ryy32 = memh_fifo($Rx32++#$Ii:circ($Mu2))",
-tc_785f65a7, TypeLD>, Enc_9e2e1c {
+tc_76bb5435, TypeLD>, Enc_9e2e1c {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011000010;
let addrMode = PostInc;
@@ -8876,7 +8983,7 @@ def L2_loadalignh_pcr : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
"$Ryy32 = memh_fifo($Rx32++I:circ($Mu2))",
-tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
+tc_1c7522a8, TypeLD>, Enc_1f5d8f {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011000010;
let addrMode = PostInc;
@@ -8889,7 +8996,7 @@ def L2_loadalignh_pi : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, s4_1Imm:$Ii),
"$Ryy32 = memh_fifo($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_bd1cbc {
+tc_1c7522a8, TypeLD>, Enc_bd1cbc {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011010010;
let addrMode = PostInc;
@@ -8901,7 +9008,7 @@ def L2_loadalignh_pr : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Rx32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rx32in, ModRegs:$Mu2),
"$Ryy32 = memh_fifo($Rx32++$Mu2)",
-tc_3c76b0ff, TypeLD>, Enc_1f5d8f {
+tc_1c7522a8, TypeLD>, Enc_1f5d8f {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011100010;
let addrMode = PostInc;
@@ -8913,7 +9020,7 @@ def L2_loadalignh_zomap : HInst<
(outs DoubleRegs:$Ryy32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rs32),
"$Ryy32 = memh_fifo($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let Constraints = "$Ryy32 = $Ryy32in";
@@ -8922,7 +9029,7 @@ def L2_loadbsw2_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s31_1Imm:$Ii),
"$Rd32 = membh($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_de0214 {
+tc_4222e6bf, TypeLD>, Enc_de0214 {
let Inst{24-21} = 0b0001;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -8940,7 +9047,7 @@ def L2_loadbsw2_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = membh($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011110001;
let hasNewValue = 1;
@@ -8954,7 +9061,7 @@ def L2_loadbsw2_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = membh($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_e83554 {
+tc_5ceb2f9e, TypeLD>, Enc_e83554 {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011000001;
let hasNewValue = 1;
@@ -8969,7 +9076,7 @@ def L2_loadbsw2_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = membh($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011000001;
let hasNewValue = 1;
@@ -8984,7 +9091,7 @@ def L2_loadbsw2_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
"$Rd32 = membh($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_152467 {
+tc_075c8dd8, TypeLD>, Enc_152467 {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011010001;
let hasNewValue = 1;
@@ -8998,7 +9105,7 @@ def L2_loadbsw2_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = membh($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011100001;
let hasNewValue = 1;
@@ -9012,7 +9119,7 @@ def L2_loadbsw2_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = membh($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -9022,7 +9129,7 @@ def L2_loadbsw4_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, s30_2Imm:$Ii),
"$Rdd32 = membh($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_2d7491 {
+tc_4222e6bf, TypeLD>, Enc_2d7491 {
let Inst{24-21} = 0b0111;
let Inst{31-27} = 0b10010;
let addrMode = BaseImmOffset;
@@ -9038,7 +9145,7 @@ def L2_loadbsw4_pbr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = membh($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011110111;
let addrMode = PostInc;
@@ -9050,7 +9157,7 @@ def L2_loadbsw4_pci : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
"$Rdd32 = membh($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_70b24b {
+tc_5ceb2f9e, TypeLD>, Enc_70b24b {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011000111;
let addrMode = PostInc;
@@ -9063,7 +9170,7 @@ def L2_loadbsw4_pcr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = membh($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011000111;
let addrMode = PostInc;
@@ -9076,7 +9183,7 @@ def L2_loadbsw4_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
"$Rdd32 = membh($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_71f1b4 {
+tc_075c8dd8, TypeLD>, Enc_71f1b4 {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011010111;
let addrMode = PostInc;
@@ -9088,7 +9195,7 @@ def L2_loadbsw4_pr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = membh($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011100111;
let addrMode = PostInc;
@@ -9100,7 +9207,7 @@ def L2_loadbsw4_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = membh($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -9108,7 +9215,7 @@ def L2_loadbzw2_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s31_1Imm:$Ii),
"$Rd32 = memubh($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_de0214 {
+tc_4222e6bf, TypeLD>, Enc_de0214 {
let Inst{24-21} = 0b0011;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -9126,7 +9233,7 @@ def L2_loadbzw2_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memubh($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011110011;
let hasNewValue = 1;
@@ -9140,7 +9247,7 @@ def L2_loadbzw2_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = memubh($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_e83554 {
+tc_5ceb2f9e, TypeLD>, Enc_e83554 {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011000011;
let hasNewValue = 1;
@@ -9155,7 +9262,7 @@ def L2_loadbzw2_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memubh($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011000011;
let hasNewValue = 1;
@@ -9170,7 +9277,7 @@ def L2_loadbzw2_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
"$Rd32 = memubh($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_152467 {
+tc_075c8dd8, TypeLD>, Enc_152467 {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011010011;
let hasNewValue = 1;
@@ -9184,7 +9291,7 @@ def L2_loadbzw2_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memubh($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011100011;
let hasNewValue = 1;
@@ -9198,7 +9305,7 @@ def L2_loadbzw2_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memubh($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -9208,7 +9315,7 @@ def L2_loadbzw4_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, s30_2Imm:$Ii),
"$Rdd32 = memubh($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_2d7491 {
+tc_4222e6bf, TypeLD>, Enc_2d7491 {
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b10010;
let addrMode = BaseImmOffset;
@@ -9224,7 +9331,7 @@ def L2_loadbzw4_pbr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = memubh($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011110101;
let addrMode = PostInc;
@@ -9236,7 +9343,7 @@ def L2_loadbzw4_pci : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
"$Rdd32 = memubh($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_70b24b {
+tc_5ceb2f9e, TypeLD>, Enc_70b24b {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011000101;
let addrMode = PostInc;
@@ -9249,7 +9356,7 @@ def L2_loadbzw4_pcr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = memubh($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011000101;
let addrMode = PostInc;
@@ -9262,7 +9369,7 @@ def L2_loadbzw4_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
"$Rdd32 = memubh($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_71f1b4 {
+tc_075c8dd8, TypeLD>, Enc_71f1b4 {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011010101;
let addrMode = PostInc;
@@ -9274,7 +9381,7 @@ def L2_loadbzw4_pr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = memubh($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011100101;
let addrMode = PostInc;
@@ -9286,7 +9393,7 @@ def L2_loadbzw4_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = memubh($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -9294,7 +9401,7 @@ def L2_loadrb_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = memb($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
+tc_4222e6bf, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1000;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -9302,8 +9409,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L2_loadrb_io";
+let CextOpcode = "L2_loadrb";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -9315,7 +9422,7 @@ def L2_loadrb_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memb($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011111000;
let hasNewValue = 1;
@@ -9329,7 +9436,7 @@ def L2_loadrb_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = memb($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_e0a47a {
+tc_5ceb2f9e, TypeLD>, Enc_e0a47a {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011001000;
let hasNewValue = 1;
@@ -9344,7 +9451,7 @@ def L2_loadrb_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memb($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011001000;
let hasNewValue = 1;
@@ -9359,7 +9466,7 @@ def L2_loadrb_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
"$Rd32 = memb($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
+tc_075c8dd8, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011011000;
let hasNewValue = 1;
@@ -9367,8 +9474,8 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L2_loadrb_pi";
+let CextOpcode = "L2_loadrb";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -9376,7 +9483,7 @@ def L2_loadrb_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memb($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011101000;
let hasNewValue = 1;
@@ -9390,7 +9497,7 @@ def L2_loadrb_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memb($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -9400,7 +9507,7 @@ def L2_loadrbgp : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii),
"$Rd32 = memb(gp+#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_25bef0, AddrModeRel {
let Inst{24-21} = 0b1000;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -9419,14 +9526,14 @@ def L2_loadrd_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, s29_3Imm:$Ii),
"$Rdd32 = memd($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_fa3ba4, AddrModeRel, PostInc_BaseImm {
+tc_4222e6bf, TypeLD>, Enc_fa3ba4, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1110;
let Inst{31-27} = 0b10010;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L2_loadrd_io";
+let CextOpcode = "L2_loadrd";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -9438,7 +9545,7 @@ def L2_loadrd_pbr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = memd($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011111110;
let addrMode = PostInc;
@@ -9450,7 +9557,7 @@ def L2_loadrd_pci : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2),
"$Rdd32 = memd($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_b05839 {
+tc_5ceb2f9e, TypeLD>, Enc_b05839 {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011001110;
let addrMode = PostInc;
@@ -9463,7 +9570,7 @@ def L2_loadrd_pcr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = memd($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011001110;
let addrMode = PostInc;
@@ -9476,14 +9583,14 @@ def L2_loadrd_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_3Imm:$Ii),
"$Rdd32 = memd($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_5bdd42, PredNewRel, PostInc_BaseImm {
+tc_075c8dd8, TypeLD>, Enc_5bdd42, PredNewRel, PostInc_BaseImm {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011011110;
let addrMode = PostInc;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L2_loadrd_pi";
+let CextOpcode = "L2_loadrd";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -9491,7 +9598,7 @@ def L2_loadrd_pr : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rdd32 = memd($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_7eee72 {
+tc_075c8dd8, TypeLD>, Enc_7eee72 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011101110;
let addrMode = PostInc;
@@ -9503,7 +9610,7 @@ def L2_loadrd_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = memd($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -9511,7 +9618,7 @@ def L2_loadrdgp : HInst<
(outs DoubleRegs:$Rdd32),
(ins u29_3Imm:$Ii),
"$Rdd32 = memd(gp+#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_509701, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_509701, AddrModeRel {
let Inst{24-21} = 0b1110;
let Inst{31-27} = 0b01001;
let accessSize = DoubleWordAccess;
@@ -9528,7 +9635,7 @@ def L2_loadrh_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s31_1Imm:$Ii),
"$Rd32 = memh($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
+tc_4222e6bf, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1010;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -9536,8 +9643,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L2_loadrh_io";
+let CextOpcode = "L2_loadrh";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -9549,7 +9656,7 @@ def L2_loadrh_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memh($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011111010;
let hasNewValue = 1;
@@ -9563,7 +9670,7 @@ def L2_loadrh_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = memh($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_e83554 {
+tc_5ceb2f9e, TypeLD>, Enc_e83554 {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011001010;
let hasNewValue = 1;
@@ -9578,7 +9685,7 @@ def L2_loadrh_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memh($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011001010;
let hasNewValue = 1;
@@ -9593,7 +9700,7 @@ def L2_loadrh_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
"$Rd32 = memh($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
+tc_075c8dd8, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011011010;
let hasNewValue = 1;
@@ -9601,8 +9708,8 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L2_loadrh_pi";
+let CextOpcode = "L2_loadrh";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -9610,7 +9717,7 @@ def L2_loadrh_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memh($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011101010;
let hasNewValue = 1;
@@ -9624,7 +9731,7 @@ def L2_loadrh_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memh($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -9634,7 +9741,7 @@ def L2_loadrhgp : HInst<
(outs IntRegs:$Rd32),
(ins u31_1Imm:$Ii),
"$Rd32 = memh(gp+#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_8df4be, AddrModeRel {
let Inst{24-21} = 0b1010;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -9653,7 +9760,7 @@ def L2_loadri_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s30_2Imm:$Ii),
"$Rd32 = memw($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_2a3787, AddrModeRel, PostInc_BaseImm {
+tc_4222e6bf, TypeLD>, Enc_2a3787, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1100;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -9661,8 +9768,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L2_loadri_io";
+let CextOpcode = "L2_loadri";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -9674,7 +9781,7 @@ def L2_loadri_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memw($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011111100;
let hasNewValue = 1;
@@ -9688,7 +9795,7 @@ def L2_loadri_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = memw($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_27fd0e {
+tc_5ceb2f9e, TypeLD>, Enc_27fd0e {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011001100;
let hasNewValue = 1;
@@ -9703,7 +9810,7 @@ def L2_loadri_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memw($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011001100;
let hasNewValue = 1;
@@ -9718,7 +9825,7 @@ def L2_loadri_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii),
"$Rd32 = memw($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_3d920a, PredNewRel, PostInc_BaseImm {
+tc_075c8dd8, TypeLD>, Enc_3d920a, PredNewRel, PostInc_BaseImm {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011011100;
let hasNewValue = 1;
@@ -9726,8 +9833,8 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = WordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L2_loadri_pi";
+let CextOpcode = "L2_loadri";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -9735,7 +9842,7 @@ def L2_loadri_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memw($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011101100;
let hasNewValue = 1;
@@ -9749,7 +9856,7 @@ def L2_loadri_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memw($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -9759,7 +9866,7 @@ def L2_loadrigp : HInst<
(outs IntRegs:$Rd32),
(ins u30_2Imm:$Ii),
"$Rd32 = memw(gp+#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
let Inst{24-21} = 0b1100;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -9778,7 +9885,7 @@ def L2_loadrub_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rd32 = memub($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
+tc_4222e6bf, TypeLD>, Enc_211aaa, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1001;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -9786,8 +9893,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L2_loadrub_io";
+let CextOpcode = "L2_loadrub";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -9799,7 +9906,7 @@ def L2_loadrub_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memub($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011111001;
let hasNewValue = 1;
@@ -9813,7 +9920,7 @@ def L2_loadrub_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = memub($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_e0a47a {
+tc_5ceb2f9e, TypeLD>, Enc_e0a47a {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011001001;
let hasNewValue = 1;
@@ -9828,7 +9935,7 @@ def L2_loadrub_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memub($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011001001;
let hasNewValue = 1;
@@ -9843,7 +9950,7 @@ def L2_loadrub_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii),
"$Rd32 = memub($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
+tc_075c8dd8, TypeLD>, Enc_222336, PredNewRel, PostInc_BaseImm {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011011001;
let hasNewValue = 1;
@@ -9851,8 +9958,8 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L2_loadrub_pi";
+let CextOpcode = "L2_loadrub";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -9860,7 +9967,7 @@ def L2_loadrub_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memub($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011101001;
let hasNewValue = 1;
@@ -9874,7 +9981,7 @@ def L2_loadrub_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memub($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -9884,7 +9991,7 @@ def L2_loadrubgp : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii),
"$Rd32 = memub(gp+#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_25bef0, AddrModeRel {
let Inst{24-21} = 0b1001;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -9903,7 +10010,7 @@ def L2_loadruh_io : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s31_1Imm:$Ii),
"$Rd32 = memuh($Rs32+#$Ii)",
-tc_17e0d2cd, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
+tc_4222e6bf, TypeLD>, Enc_de0214, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1011;
let Inst{31-27} = 0b10010;
let hasNewValue = 1;
@@ -9911,8 +10018,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L2_loadruh_io";
+let CextOpcode = "L2_loadruh";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -9924,7 +10031,7 @@ def L2_loadruh_pbr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memuh($Rx32++$Mu2:brev)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011111011;
let hasNewValue = 1;
@@ -9938,7 +10045,7 @@ def L2_loadruh_pci : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2),
"$Rd32 = memuh($Rx32++#$Ii:circ($Mu2))",
-tc_e93a3d71, TypeLD>, Enc_e83554 {
+tc_5ceb2f9e, TypeLD>, Enc_e83554 {
let Inst{12-9} = 0b0000;
let Inst{31-21} = 0b10011001011;
let hasNewValue = 1;
@@ -9953,7 +10060,7 @@ def L2_loadruh_pcr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memuh($Rx32++I:circ($Mu2))",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b10011001011;
let hasNewValue = 1;
@@ -9968,7 +10075,7 @@ def L2_loadruh_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii),
"$Rd32 = memuh($Rx32++#$Ii)",
-tc_44d3da28, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
+tc_075c8dd8, TypeLD>, Enc_152467, PredNewRel, PostInc_BaseImm {
let Inst{13-9} = 0b00000;
let Inst{31-21} = 0b10011011011;
let hasNewValue = 1;
@@ -9976,8 +10083,8 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L2_loadruh_pi";
+let CextOpcode = "L2_loadruh";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -9985,7 +10092,7 @@ def L2_loadruh_pr : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2),
"$Rd32 = memuh($Rx32++$Mu2)",
-tc_44d3da28, TypeLD>, Enc_74d4e5 {
+tc_075c8dd8, TypeLD>, Enc_74d4e5 {
let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b10011101011;
let hasNewValue = 1;
@@ -9999,7 +10106,7 @@ def L2_loadruh_zomap : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memuh($Rs32)",
-tc_17e0d2cd, TypeMAPPING> {
+tc_4222e6bf, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10009,7 +10116,7 @@ def L2_loadruhgp : HInst<
(outs IntRegs:$Rd32),
(ins u31_1Imm:$Ii),
"$Rd32 = memuh(gp+#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_8df4be, AddrModeRel {
let Inst{24-21} = 0b1011;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -10028,7 +10135,7 @@ def L2_loadw_locked : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = memw_locked($Rs32)",
-tc_b43e7930, TypeLD>, Enc_5e2823 {
+tc_64b00d8a, TypeLD>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10010010000;
let hasNewValue = 1;
@@ -10041,7 +10148,7 @@ def L2_ploadrbf_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memb($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000101000;
let isPredicated = 1;
@@ -10051,8 +10158,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L2_loadrb_io";
+let CextOpcode = "L2_loadrb";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10063,7 +10170,7 @@ def L2_ploadrbf_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memb($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011011000;
let isPredicated = 1;
@@ -10080,7 +10187,7 @@ def L2_ploadrbf_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4) $Rd32 = memb($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10090,7 +10197,7 @@ def L2_ploadrbfnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000111000;
let isPredicated = 1;
@@ -10101,8 +10208,8 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L2_loadrb_io";
+let CextOpcode = "L2_loadrb";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10113,7 +10220,7 @@ def L2_ploadrbfnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011011000;
let isPredicated = 1;
@@ -10131,7 +10238,7 @@ def L2_ploadrbfnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4.new) $Rd32 = memb($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10141,7 +10248,7 @@ def L2_ploadrbt_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memb($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000001000;
let isPredicated = 1;
@@ -10150,8 +10257,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L2_loadrb_io";
+let CextOpcode = "L2_loadrb";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10162,7 +10269,7 @@ def L2_ploadrbt_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if ($Pt4) $Rd32 = memb($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011011000;
let isPredicated = 1;
@@ -10178,7 +10285,7 @@ def L2_ploadrbt_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4) $Rd32 = memb($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10188,7 +10295,7 @@ def L2_ploadrbtnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memb($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000011000;
let isPredicated = 1;
@@ -10198,8 +10305,8 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L2_loadrb_io";
+let CextOpcode = "L2_loadrb";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10210,7 +10317,7 @@ def L2_ploadrbtnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memb($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011011000;
let isPredicated = 1;
@@ -10227,7 +10334,7 @@ def L2_ploadrbtnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4.new) $Rd32 = memb($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10237,7 +10344,7 @@ def L2_ploadrdf_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
"if (!$Pt4) $Rdd32 = memd($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000101110;
let isPredicated = 1;
@@ -10245,8 +10352,8 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L2_loadrd_io";
+let CextOpcode = "L2_loadrd";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10257,7 +10364,7 @@ def L2_ploadrdf_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
"if (!$Pt4) $Rdd32 = memd($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_9d1247, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_9d1247, PredNewRel {
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011011110;
let isPredicated = 1;
@@ -10272,7 +10379,7 @@ def L2_ploadrdf_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4) $Rdd32 = memd($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -10280,7 +10387,7 @@ def L2_ploadrdfnew_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
"if (!$Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000111110;
let isPredicated = 1;
@@ -10289,8 +10396,8 @@ let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L2_loadrd_io";
+let CextOpcode = "L2_loadrd";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10301,7 +10408,7 @@ def L2_ploadrdfnew_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
"if (!$Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_9d1247, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_9d1247, PredNewRel {
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011011110;
let isPredicated = 1;
@@ -10317,7 +10424,7 @@ def L2_ploadrdfnew_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4.new) $Rdd32 = memd($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -10325,15 +10432,15 @@ def L2_ploadrdt_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
"if ($Pt4) $Rdd32 = memd($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000001110;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L2_loadrd_io";
+let CextOpcode = "L2_loadrd";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10344,7 +10451,7 @@ def L2_ploadrdt_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
"if ($Pt4) $Rdd32 = memd($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_9d1247, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_9d1247, PredNewRel {
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011011110;
let isPredicated = 1;
@@ -10358,7 +10465,7 @@ def L2_ploadrdt_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4) $Rdd32 = memd($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -10366,7 +10473,7 @@ def L2_ploadrdtnew_io : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u29_3Imm:$Ii),
"if ($Pt4.new) $Rdd32 = memd($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_acd6ed, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000011110;
let isPredicated = 1;
@@ -10374,8 +10481,8 @@ let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L2_loadrd_io";
+let CextOpcode = "L2_loadrd";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10386,7 +10493,7 @@ def L2_ploadrdtnew_pi : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_3Imm:$Ii),
"if ($Pt4.new) $Rdd32 = memd($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_9d1247, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_9d1247, PredNewRel {
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011011110;
let isPredicated = 1;
@@ -10401,7 +10508,7 @@ def L2_ploadrdtnew_zomap : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4.new) $Rdd32 = memd($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -10409,7 +10516,7 @@ def L2_ploadrhf_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if (!$Pt4) $Rd32 = memh($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000101010;
let isPredicated = 1;
@@ -10419,8 +10526,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L2_loadrh_io";
+let CextOpcode = "L2_loadrh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10431,7 +10538,7 @@ def L2_ploadrhf_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if (!$Pt4) $Rd32 = memh($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011011010;
let isPredicated = 1;
@@ -10448,7 +10555,7 @@ def L2_ploadrhf_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4) $Rd32 = memh($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10458,7 +10565,7 @@ def L2_ploadrhfnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000111010;
let isPredicated = 1;
@@ -10469,8 +10576,8 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L2_loadrh_io";
+let CextOpcode = "L2_loadrh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10481,7 +10588,7 @@ def L2_ploadrhfnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011011010;
let isPredicated = 1;
@@ -10499,7 +10606,7 @@ def L2_ploadrhfnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4.new) $Rd32 = memh($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10509,7 +10616,7 @@ def L2_ploadrht_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if ($Pt4) $Rd32 = memh($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000001010;
let isPredicated = 1;
@@ -10518,8 +10625,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L2_loadrh_io";
+let CextOpcode = "L2_loadrh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10530,7 +10637,7 @@ def L2_ploadrht_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if ($Pt4) $Rd32 = memh($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011011010;
let isPredicated = 1;
@@ -10546,7 +10653,7 @@ def L2_ploadrht_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4) $Rd32 = memh($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10556,7 +10663,7 @@ def L2_ploadrhtnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if ($Pt4.new) $Rd32 = memh($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000011010;
let isPredicated = 1;
@@ -10566,8 +10673,8 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L2_loadrh_io";
+let CextOpcode = "L2_loadrh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10578,7 +10685,7 @@ def L2_ploadrhtnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if ($Pt4.new) $Rd32 = memh($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011011010;
let isPredicated = 1;
@@ -10595,7 +10702,7 @@ def L2_ploadrhtnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4.new) $Rd32 = memh($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10605,7 +10712,7 @@ def L2_ploadrif_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
"if (!$Pt4) $Rd32 = memw($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000101100;
let isPredicated = 1;
@@ -10615,8 +10722,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L2_loadri_io";
+let CextOpcode = "L2_loadri";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10627,7 +10734,7 @@ def L2_ploadrif_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
"if (!$Pt4) $Rd32 = memw($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_b97f71, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_b97f71, PredNewRel {
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011011100;
let isPredicated = 1;
@@ -10644,7 +10751,7 @@ def L2_ploadrif_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4) $Rd32 = memw($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10654,7 +10761,7 @@ def L2_ploadrifnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000111100;
let isPredicated = 1;
@@ -10665,8 +10772,8 @@ let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L2_loadri_io";
+let CextOpcode = "L2_loadri";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10677,7 +10784,7 @@ def L2_ploadrifnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_b97f71, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_b97f71, PredNewRel {
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011011100;
let isPredicated = 1;
@@ -10695,7 +10802,7 @@ def L2_ploadrifnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4.new) $Rd32 = memw($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10705,7 +10812,7 @@ def L2_ploadrit_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
"if ($Pt4) $Rd32 = memw($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000001100;
let isPredicated = 1;
@@ -10714,8 +10821,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L2_loadri_io";
+let CextOpcode = "L2_loadri";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10726,7 +10833,7 @@ def L2_ploadrit_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
"if ($Pt4) $Rd32 = memw($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_b97f71, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_b97f71, PredNewRel {
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011011100;
let isPredicated = 1;
@@ -10742,7 +10849,7 @@ def L2_ploadrit_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4) $Rd32 = memw($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10752,7 +10859,7 @@ def L2_ploadritnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u30_2Imm:$Ii),
"if ($Pt4.new) $Rd32 = memw($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_f82eaf, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000011100;
let isPredicated = 1;
@@ -10762,8 +10869,8 @@ let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L2_loadri_io";
+let CextOpcode = "L2_loadri";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10774,7 +10881,7 @@ def L2_ploadritnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_2Imm:$Ii),
"if ($Pt4.new) $Rd32 = memw($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_b97f71, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_b97f71, PredNewRel {
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011011100;
let isPredicated = 1;
@@ -10791,7 +10898,7 @@ def L2_ploadritnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4.new) $Rd32 = memw($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10801,7 +10908,7 @@ def L2_ploadrubf_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memub($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000101001;
let isPredicated = 1;
@@ -10811,8 +10918,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L2_loadrub_io";
+let CextOpcode = "L2_loadrub";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10823,7 +10930,7 @@ def L2_ploadrubf_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memub($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011011001;
let isPredicated = 1;
@@ -10840,7 +10947,7 @@ def L2_ploadrubf_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4) $Rd32 = memub($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10850,7 +10957,7 @@ def L2_ploadrubfnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000111001;
let isPredicated = 1;
@@ -10861,8 +10968,8 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L2_loadrub_io";
+let CextOpcode = "L2_loadrub";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10873,7 +10980,7 @@ def L2_ploadrubfnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011011001;
let isPredicated = 1;
@@ -10891,7 +10998,7 @@ def L2_ploadrubfnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4.new) $Rd32 = memub($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10901,7 +11008,7 @@ def L2_ploadrubt_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memub($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000001001;
let isPredicated = 1;
@@ -10910,8 +11017,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L2_loadrub_io";
+let CextOpcode = "L2_loadrub";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10922,7 +11029,7 @@ def L2_ploadrubt_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if ($Pt4) $Rd32 = memub($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_f4413a, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011011001;
let isPredicated = 1;
@@ -10938,7 +11045,7 @@ def L2_ploadrubt_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4) $Rd32 = memub($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10948,7 +11055,7 @@ def L2_ploadrubtnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memub($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a21d47, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a21d47, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000011001;
let isPredicated = 1;
@@ -10958,8 +11065,8 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L2_loadrub_io";
+let CextOpcode = "L2_loadrub";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -10970,7 +11077,7 @@ def L2_ploadrubtnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memub($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_f4413a, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_f4413a, PredNewRel {
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011011001;
let isPredicated = 1;
@@ -10987,7 +11094,7 @@ def L2_ploadrubtnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4.new) $Rd32 = memub($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -10997,7 +11104,7 @@ def L2_ploadruhf_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if (!$Pt4) $Rd32 = memuh($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000101011;
let isPredicated = 1;
@@ -11007,8 +11114,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L2_loadruh_io";
+let CextOpcode = "L2_loadruh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -11019,7 +11126,7 @@ def L2_ploadruhf_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if (!$Pt4) $Rd32 = memuh($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011011011;
let isPredicated = 1;
@@ -11036,7 +11143,7 @@ def L2_ploadruhf_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4) $Rd32 = memuh($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -11046,7 +11153,7 @@ def L2_ploadruhfnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000111011;
let isPredicated = 1;
@@ -11057,8 +11164,8 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L2_loadruh_io";
+let CextOpcode = "L2_loadruh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -11069,7 +11176,7 @@ def L2_ploadruhfnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011011011;
let isPredicated = 1;
@@ -11087,7 +11194,7 @@ def L2_ploadruhfnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if (!$Pt4.new) $Rd32 = memuh($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -11097,7 +11204,7 @@ def L2_ploadruht_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if ($Pt4) $Rd32 = memuh($Rs32+#$Ii)",
-tc_5ef37dc4, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_fedb7e19, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000001011;
let isPredicated = 1;
@@ -11106,8 +11213,8 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L2_loadruh_io";
+let CextOpcode = "L2_loadruh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -11118,7 +11225,7 @@ def L2_ploadruht_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if ($Pt4) $Rd32 = memuh($Rx32++#$Ii)",
-tc_3c76b0ff, TypeLD>, Enc_733b27, PredNewRel {
+tc_1c7522a8, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011011011;
let isPredicated = 1;
@@ -11134,7 +11241,7 @@ def L2_ploadruht_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4) $Rd32 = memuh($Rs32)",
-tc_5ef37dc4, TypeMAPPING> {
+tc_fedb7e19, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -11144,7 +11251,7 @@ def L2_ploadruhtnew_io : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32, u31_1Imm:$Ii),
"if ($Pt4.new) $Rd32 = memuh($Rs32+#$Ii)",
-tc_44d3da28, TypeV2LDST>, Enc_a198f6, AddrModeRel {
+tc_075c8dd8, TypeV2LDST>, Enc_a198f6, AddrModeRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b01000011011;
let isPredicated = 1;
@@ -11154,8 +11261,8 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L2_loadruh_io";
+let CextOpcode = "L2_loadruh";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 0;
@@ -11166,7 +11273,7 @@ def L2_ploadruhtnew_pi : HInst<
(outs IntRegs:$Rd32, IntRegs:$Rx32),
(ins PredRegs:$Pt4, IntRegs:$Rx32in, s4_1Imm:$Ii),
"if ($Pt4.new) $Rd32 = memuh($Rx32++#$Ii)",
-tc_e9f3243f, TypeLD>, Enc_733b27, PredNewRel {
+tc_5f2afaf7, TypeLD>, Enc_733b27, PredNewRel {
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011011011;
let isPredicated = 1;
@@ -11183,7 +11290,7 @@ def L2_ploadruhtnew_zomap : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, IntRegs:$Rs32),
"if ($Pt4.new) $Rd32 = memuh($Rs32)",
-tc_44d3da28, TypeMAPPING> {
+tc_075c8dd8, TypeMAPPING> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -11193,7 +11300,7 @@ def L4_add_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+#$Ii) += $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_d44e31 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_d44e31 {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110000;
@@ -11212,7 +11319,7 @@ def L4_add_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memb($Rs32) += $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11220,7 +11327,7 @@ def L4_add_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) += $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_163a3c {
+tc_9bcfb2ee, TypeV4LDST>, Enc_163a3c {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110001;
@@ -11239,7 +11346,7 @@ def L4_add_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memh($Rs32) += $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11247,7 +11354,7 @@ def L4_add_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+#$Ii) += $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_226535 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_226535 {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110010;
@@ -11266,7 +11373,7 @@ def L4_add_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memw($Rs32) += $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11274,7 +11381,7 @@ def L4_and_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+#$Ii) &= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_d44e31 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_d44e31 {
let Inst{6-5} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110000;
@@ -11293,7 +11400,7 @@ def L4_and_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memb($Rs32) &= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11301,7 +11408,7 @@ def L4_and_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) &= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_163a3c {
+tc_9bcfb2ee, TypeV4LDST>, Enc_163a3c {
let Inst{6-5} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110001;
@@ -11320,7 +11427,7 @@ def L4_and_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memh($Rs32) &= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11328,7 +11435,7 @@ def L4_and_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+#$Ii) &= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_226535 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_226535 {
let Inst{6-5} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110010;
@@ -11347,7 +11454,7 @@ def L4_and_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memw($Rs32) &= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11355,7 +11462,7 @@ def L4_iadd_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
"memb($Rs32+#$Ii) += #$II",
-tc_096199d3, TypeV4LDST>, Enc_46c951 {
+tc_158aa3f7, TypeV4LDST>, Enc_46c951 {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111000;
@@ -11374,7 +11481,7 @@ def L4_iadd_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memb($Rs32) += #$II",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11382,7 +11489,7 @@ def L4_iadd_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
"memh($Rs32+#$Ii) += #$II",
-tc_096199d3, TypeV4LDST>, Enc_e66a97 {
+tc_158aa3f7, TypeV4LDST>, Enc_e66a97 {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111001;
@@ -11401,7 +11508,7 @@ def L4_iadd_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memh($Rs32) += #$II",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11409,7 +11516,7 @@ def L4_iadd_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
"memw($Rs32+#$Ii) += #$II",
-tc_096199d3, TypeV4LDST>, Enc_84b2cd {
+tc_158aa3f7, TypeV4LDST>, Enc_84b2cd {
let Inst{6-5} = 0b00;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111010;
@@ -11428,7 +11535,7 @@ def L4_iadd_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memw($Rs32) += #$II",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11436,7 +11543,7 @@ def L4_iand_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
"memb($Rs32+#$Ii) = clrbit(#$II)",
-tc_096199d3, TypeV4LDST>, Enc_46c951 {
+tc_158aa3f7, TypeV4LDST>, Enc_46c951 {
let Inst{6-5} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111000;
@@ -11455,7 +11562,7 @@ def L4_iand_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memb($Rs32) = clrbit(#$II)",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11463,7 +11570,7 @@ def L4_iand_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
"memh($Rs32+#$Ii) = clrbit(#$II)",
-tc_096199d3, TypeV4LDST>, Enc_e66a97 {
+tc_158aa3f7, TypeV4LDST>, Enc_e66a97 {
let Inst{6-5} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111001;
@@ -11482,7 +11589,7 @@ def L4_iand_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memh($Rs32) = clrbit(#$II)",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11490,7 +11597,7 @@ def L4_iand_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
"memw($Rs32+#$Ii) = clrbit(#$II)",
-tc_096199d3, TypeV4LDST>, Enc_84b2cd {
+tc_158aa3f7, TypeV4LDST>, Enc_84b2cd {
let Inst{6-5} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111010;
@@ -11509,7 +11616,7 @@ def L4_iand_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memw($Rs32) = clrbit(#$II)",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11517,7 +11624,7 @@ def L4_ior_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
"memb($Rs32+#$Ii) = setbit(#$II)",
-tc_096199d3, TypeV4LDST>, Enc_46c951 {
+tc_158aa3f7, TypeV4LDST>, Enc_46c951 {
let Inst{6-5} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111000;
@@ -11536,7 +11643,7 @@ def L4_ior_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memb($Rs32) = setbit(#$II)",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11544,7 +11651,7 @@ def L4_ior_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
"memh($Rs32+#$Ii) = setbit(#$II)",
-tc_096199d3, TypeV4LDST>, Enc_e66a97 {
+tc_158aa3f7, TypeV4LDST>, Enc_e66a97 {
let Inst{6-5} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111001;
@@ -11563,7 +11670,7 @@ def L4_ior_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memh($Rs32) = setbit(#$II)",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11571,7 +11678,7 @@ def L4_ior_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
"memw($Rs32+#$Ii) = setbit(#$II)",
-tc_096199d3, TypeV4LDST>, Enc_84b2cd {
+tc_158aa3f7, TypeV4LDST>, Enc_84b2cd {
let Inst{6-5} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111010;
@@ -11590,7 +11697,7 @@ def L4_ior_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memw($Rs32) = setbit(#$II)",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11598,7 +11705,7 @@ def L4_isub_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, u5_0Imm:$II),
"memb($Rs32+#$Ii) -= #$II",
-tc_096199d3, TypeV4LDST>, Enc_46c951 {
+tc_158aa3f7, TypeV4LDST>, Enc_46c951 {
let Inst{6-5} = 0b01;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111000;
@@ -11617,7 +11724,7 @@ def L4_isub_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memb($Rs32) -= #$II",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11625,7 +11732,7 @@ def L4_isub_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, u5_0Imm:$II),
"memh($Rs32+#$Ii) -= #$II",
-tc_096199d3, TypeV4LDST>, Enc_e66a97 {
+tc_158aa3f7, TypeV4LDST>, Enc_e66a97 {
let Inst{6-5} = 0b01;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111001;
@@ -11644,7 +11751,7 @@ def L4_isub_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memh($Rs32) -= #$II",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11652,7 +11759,7 @@ def L4_isub_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, u5_0Imm:$II),
"memw($Rs32+#$Ii) -= #$II",
-tc_096199d3, TypeV4LDST>, Enc_84b2cd {
+tc_158aa3f7, TypeV4LDST>, Enc_84b2cd {
let Inst{6-5} = 0b01;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111111010;
@@ -11671,7 +11778,7 @@ def L4_isub_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, u5_0Imm:$II),
"memw($Rs32) -= #$II",
-tc_096199d3, TypeMAPPING> {
+tc_158aa3f7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -11679,7 +11786,7 @@ def L4_loadalignb_ap : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
"$Ryy32 = memb_fifo($Re32=#$II)",
-tc_7a91e76a, TypeLD>, Enc_f394d3 {
+tc_ac65613f, TypeLD>, Enc_f394d3 {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011010100;
@@ -11699,7 +11806,7 @@ def L4_loadalignb_ur : HInst<
(outs DoubleRegs:$Ryy32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Ryy32 = memb_fifo($Rt32<<#$Ii+#$II)",
-tc_a5d4aeec, TypeLD>, Enc_04c959 {
+tc_a32e03e7, TypeLD>, Enc_04c959 {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011100100;
let addrMode = BaseLongOffset;
@@ -11719,7 +11826,7 @@ def L4_loadalignh_ap : HInst<
(outs DoubleRegs:$Ryy32, IntRegs:$Re32),
(ins DoubleRegs:$Ryy32in, u32_0Imm:$II),
"$Ryy32 = memh_fifo($Re32=#$II)",
-tc_7a91e76a, TypeLD>, Enc_f394d3 {
+tc_ac65613f, TypeLD>, Enc_f394d3 {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011010010;
@@ -11739,7 +11846,7 @@ def L4_loadalignh_ur : HInst<
(outs DoubleRegs:$Ryy32),
(ins DoubleRegs:$Ryy32in, IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Ryy32 = memh_fifo($Rt32<<#$Ii+#$II)",
-tc_a5d4aeec, TypeLD>, Enc_04c959 {
+tc_a32e03e7, TypeLD>, Enc_04c959 {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011100010;
let addrMode = BaseLongOffset;
@@ -11759,7 +11866,7 @@ def L4_loadbsw2_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = membh($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011010001;
@@ -11780,7 +11887,7 @@ def L4_loadbsw2_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = membh($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b {
+tc_abfd9a6d, TypeLD>, Enc_4f677b {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011100001;
let hasNewValue = 1;
@@ -11801,7 +11908,7 @@ def L4_loadbsw4_ap : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rdd32 = membh($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 {
+tc_822c3c68, TypeLD>, Enc_7fa7f6 {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011010111;
@@ -11820,7 +11927,7 @@ def L4_loadbsw4_ur : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rdd32 = membh($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_6185fe {
+tc_abfd9a6d, TypeLD>, Enc_6185fe {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011100111;
let addrMode = BaseLongOffset;
@@ -11839,7 +11946,7 @@ def L4_loadbzw2_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = memubh($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011010011;
@@ -11860,7 +11967,7 @@ def L4_loadbzw2_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = memubh($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b {
+tc_abfd9a6d, TypeLD>, Enc_4f677b {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011100011;
let hasNewValue = 1;
@@ -11881,7 +11988,7 @@ def L4_loadbzw4_ap : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rdd32 = memubh($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 {
+tc_822c3c68, TypeLD>, Enc_7fa7f6 {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011010101;
@@ -11900,7 +12007,7 @@ def L4_loadbzw4_ur : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rdd32 = memubh($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_6185fe {
+tc_abfd9a6d, TypeLD>, Enc_6185fe {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011100101;
let addrMode = BaseLongOffset;
@@ -11919,7 +12026,7 @@ def L4_loadd_locked : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = memd_locked($Rs32)",
-tc_b43e7930, TypeLD>, Enc_3a3d62 {
+tc_64b00d8a, TypeLD>, Enc_3a3d62 {
let Inst{13-5} = 0b010000000;
let Inst{31-21} = 0b10010010000;
let accessSize = DoubleWordAccess;
@@ -11930,7 +12037,7 @@ def L4_loadrb_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = memb($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011011000;
@@ -11951,7 +12058,7 @@ def L4_loadrb_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf2ffc0f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111010000;
let hasNewValue = 1;
@@ -11959,16 +12066,16 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrb_rr";
let CextOpcode = "L2_loadrb";
let InputType = "reg";
-let BaseOpcode = "L4_loadrb_rr";
let isPredicable = 1;
}
def L4_loadrb_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = memb($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_abfd9a6d, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011101000;
let hasNewValue = 1;
@@ -11990,7 +12097,7 @@ def L4_loadrd_ap : HInst<
(outs DoubleRegs:$Rdd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rdd32 = memd($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_7fa7f6 {
+tc_822c3c68, TypeLD>, Enc_7fa7f6 {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011011110;
@@ -12009,22 +12116,22 @@ def L4_loadrd_rr : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_bf061958, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl {
+tc_bf2ffc0f, TypeLD>, Enc_84bff1, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111010110;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrd_rr";
let CextOpcode = "L2_loadrd";
let InputType = "reg";
-let BaseOpcode = "L4_loadrd_rr";
let isPredicable = 1;
}
def L4_loadrd_ur : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rdd32 = memd($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl {
+tc_abfd9a6d, TypeLD>, Enc_6185fe, AddrModeRel, ImmRegShl {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011101110;
let addrMode = BaseLongOffset;
@@ -12044,7 +12151,7 @@ def L4_loadrh_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = memh($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011011010;
@@ -12065,7 +12172,7 @@ def L4_loadrh_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf2ffc0f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111010010;
let hasNewValue = 1;
@@ -12073,16 +12180,16 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrh_rr";
let CextOpcode = "L2_loadrh";
let InputType = "reg";
-let BaseOpcode = "L4_loadrh_rr";
let isPredicable = 1;
}
def L4_loadrh_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = memh($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_abfd9a6d, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011101010;
let hasNewValue = 1;
@@ -12104,7 +12211,7 @@ def L4_loadri_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = memw($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011011100;
@@ -12125,7 +12232,7 @@ def L4_loadri_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf2ffc0f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111010100;
let hasNewValue = 1;
@@ -12133,16 +12240,16 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadri_rr";
let CextOpcode = "L2_loadri";
let InputType = "reg";
-let BaseOpcode = "L4_loadri_rr";
let isPredicable = 1;
}
def L4_loadri_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = memw($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_abfd9a6d, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011101100;
let hasNewValue = 1;
@@ -12164,7 +12271,7 @@ def L4_loadrub_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = memub($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011011001;
@@ -12185,7 +12292,7 @@ def L4_loadrub_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf2ffc0f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111010001;
let hasNewValue = 1;
@@ -12193,16 +12300,16 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrub_rr";
let CextOpcode = "L2_loadrub";
let InputType = "reg";
-let BaseOpcode = "L4_loadrub_rr";
let isPredicable = 1;
}
def L4_loadrub_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = memub($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_abfd9a6d, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011101001;
let hasNewValue = 1;
@@ -12224,7 +12331,7 @@ def L4_loadruh_ap : HInst<
(outs IntRegs:$Rd32, IntRegs:$Re32),
(ins u32_0Imm:$II),
"$Rd32 = memuh($Re32=#$II)",
-tc_3b5b7ef9, TypeLD>, Enc_323f2d {
+tc_822c3c68, TypeLD>, Enc_323f2d {
let Inst{7-7} = 0b0;
let Inst{13-12} = 0b01;
let Inst{31-21} = 0b10011011011;
@@ -12245,7 +12352,7 @@ def L4_loadruh_rr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_bf061958, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
+tc_bf2ffc0f, TypeLD>, Enc_da664b, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111010011;
let hasNewValue = 1;
@@ -12253,16 +12360,16 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadruh_rr";
let CextOpcode = "L2_loadruh";
let InputType = "reg";
-let BaseOpcode = "L4_loadruh_rr";
let isPredicable = 1;
}
def L4_loadruh_ur : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, u2_0Imm:$Ii, u32_0Imm:$II),
"$Rd32 = memuh($Rt32<<#$Ii+#$II)",
-tc_bab0eed9, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
+tc_abfd9a6d, TypeLD>, Enc_4f677b, AddrModeRel, ImmRegShl {
let Inst{12-12} = 0b1;
let Inst{31-21} = 0b10011101011;
let hasNewValue = 1;
@@ -12284,7 +12391,7 @@ def L4_or_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+#$Ii) |= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_d44e31 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_d44e31 {
let Inst{6-5} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110000;
@@ -12303,7 +12410,7 @@ def L4_or_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memb($Rs32) |= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -12311,7 +12418,7 @@ def L4_or_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) |= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_163a3c {
+tc_9bcfb2ee, TypeV4LDST>, Enc_163a3c {
let Inst{6-5} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110001;
@@ -12330,7 +12437,7 @@ def L4_or_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memh($Rs32) |= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -12338,7 +12445,7 @@ def L4_or_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+#$Ii) |= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_226535 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_226535 {
let Inst{6-5} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110010;
@@ -12357,7 +12464,7 @@ def L4_or_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memw($Rs32) |= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -12365,7 +12472,7 @@ def L4_ploadrbf_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memb(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011111000;
@@ -12377,8 +12484,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L4_loadrb_abs";
+let CextOpcode = "L2_loadrb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12390,7 +12497,7 @@ def L4_ploadrbf_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110001000;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12399,15 +12506,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrb_rr";
let CextOpcode = "L2_loadrb";
let InputType = "reg";
-let BaseOpcode = "L4_loadrb_rr";
}
def L4_ploadrbfnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memb(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011111000;
@@ -12420,8 +12527,8 @@ let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L4_loadrb_abs";
+let CextOpcode = "L2_loadrb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12433,7 +12540,7 @@ def L4_ploadrbfnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110011000;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12443,15 +12550,15 @@ let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrb_rr";
let CextOpcode = "L2_loadrb";
let InputType = "reg";
-let BaseOpcode = "L4_loadrb_rr";
}
def L4_ploadrbt_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memb(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011111000;
@@ -12462,8 +12569,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L4_loadrb_abs";
+let CextOpcode = "L2_loadrb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12475,7 +12582,7 @@ def L4_ploadrbt_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110000000;
let isPredicated = 1;
let hasNewValue = 1;
@@ -12483,15 +12590,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrb_rr";
let CextOpcode = "L2_loadrb";
let InputType = "reg";
-let BaseOpcode = "L4_loadrb_rr";
}
def L4_ploadrbtnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memb(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011111000;
@@ -12503,8 +12610,8 @@ let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L4_loadrb_abs";
+let CextOpcode = "L2_loadrb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12516,7 +12623,7 @@ def L4_ploadrbtnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4.new) $Rd32 = memb($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110010000;
let isPredicated = 1;
let hasNewValue = 1;
@@ -12525,15 +12632,15 @@ let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrb_rr";
let CextOpcode = "L2_loadrb";
let InputType = "reg";
-let BaseOpcode = "L4_loadrb_rr";
}
def L4_ploadrdf_abs : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4) $Rdd32 = memd(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2a7b91, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011111110;
@@ -12543,8 +12650,8 @@ let addrMode = Absolute;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L4_loadrd_abs";
+let CextOpcode = "L2_loadrd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12556,22 +12663,22 @@ def L4_ploadrdf_rr : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_98c0b8, AddrModeRel {
let Inst{31-21} = 0b00110001110;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrd_rr";
let CextOpcode = "L2_loadrd";
let InputType = "reg";
-let BaseOpcode = "L4_loadrd_rr";
}
def L4_ploadrdfnew_abs : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rdd32 = memd(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2a7b91, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011111110;
@@ -12582,8 +12689,8 @@ let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L4_loadrd_abs";
+let CextOpcode = "L2_loadrd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12595,7 +12702,7 @@ def L4_ploadrdfnew_rr : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_98c0b8, AddrModeRel {
let Inst{31-21} = 0b00110011110;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12603,15 +12710,15 @@ let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrd_rr";
let CextOpcode = "L2_loadrd";
let InputType = "reg";
-let BaseOpcode = "L4_loadrd_rr";
}
def L4_ploadrdt_abs : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4) $Rdd32 = memd(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2a7b91, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011111110;
@@ -12620,8 +12727,8 @@ let addrMode = Absolute;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L4_loadrd_abs";
+let CextOpcode = "L2_loadrd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12633,21 +12740,21 @@ def L4_ploadrdt_rr : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_98c0b8, AddrModeRel {
let Inst{31-21} = 0b00110000110;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrd_rr";
let CextOpcode = "L2_loadrd";
let InputType = "reg";
-let BaseOpcode = "L4_loadrd_rr";
}
def L4_ploadrdtnew_abs : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4.new) $Rdd32 = memd(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2a7b91, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2a7b91, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011111110;
@@ -12657,8 +12764,8 @@ let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L4_loadrd_abs";
+let CextOpcode = "L2_loadrd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12670,22 +12777,22 @@ def L4_ploadrdtnew_rr : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4.new) $Rdd32 = memd($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_98c0b8, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_98c0b8, AddrModeRel {
let Inst{31-21} = 0b00110010110;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrd_rr";
let CextOpcode = "L2_loadrd";
let InputType = "reg";
-let BaseOpcode = "L4_loadrd_rr";
}
def L4_ploadrhf_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memh(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011111010;
@@ -12697,8 +12804,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L4_loadrh_abs";
+let CextOpcode = "L2_loadrh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12710,7 +12817,7 @@ def L4_ploadrhf_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110001010;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12719,15 +12826,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrh_rr";
let CextOpcode = "L2_loadrh";
let InputType = "reg";
-let BaseOpcode = "L4_loadrh_rr";
}
def L4_ploadrhfnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memh(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011111010;
@@ -12740,8 +12847,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L4_loadrh_abs";
+let CextOpcode = "L2_loadrh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12753,7 +12860,7 @@ def L4_ploadrhfnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110011010;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12763,15 +12870,15 @@ let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrh_rr";
let CextOpcode = "L2_loadrh";
let InputType = "reg";
-let BaseOpcode = "L4_loadrh_rr";
}
def L4_ploadrht_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memh(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011111010;
@@ -12782,8 +12889,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L4_loadrh_abs";
+let CextOpcode = "L2_loadrh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12795,7 +12902,7 @@ def L4_ploadrht_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110000010;
let isPredicated = 1;
let hasNewValue = 1;
@@ -12803,15 +12910,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrh_rr";
let CextOpcode = "L2_loadrh";
let InputType = "reg";
-let BaseOpcode = "L4_loadrh_rr";
}
def L4_ploadrhtnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memh(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011111010;
@@ -12823,8 +12930,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L4_loadrh_abs";
+let CextOpcode = "L2_loadrh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12836,7 +12943,7 @@ def L4_ploadrhtnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4.new) $Rd32 = memh($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110010010;
let isPredicated = 1;
let hasNewValue = 1;
@@ -12845,15 +12952,15 @@ let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrh_rr";
let CextOpcode = "L2_loadrh";
let InputType = "reg";
-let BaseOpcode = "L4_loadrh_rr";
}
def L4_ploadrif_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memw(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011111100;
@@ -12865,8 +12972,8 @@ let addrMode = Absolute;
let accessSize = WordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L4_loadri_abs";
+let CextOpcode = "L2_loadri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12878,7 +12985,7 @@ def L4_ploadrif_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110001100;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12887,15 +12994,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadri_rr";
let CextOpcode = "L2_loadri";
let InputType = "reg";
-let BaseOpcode = "L4_loadri_rr";
}
def L4_ploadrifnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memw(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011111100;
@@ -12908,8 +13015,8 @@ let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L4_loadri_abs";
+let CextOpcode = "L2_loadri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12921,7 +13028,7 @@ def L4_ploadrifnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110011100;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -12931,15 +13038,15 @@ let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadri_rr";
let CextOpcode = "L2_loadri";
let InputType = "reg";
-let BaseOpcode = "L4_loadri_rr";
}
def L4_ploadrit_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memw(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011111100;
@@ -12950,8 +13057,8 @@ let addrMode = Absolute;
let accessSize = WordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L4_loadri_abs";
+let CextOpcode = "L2_loadri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -12963,7 +13070,7 @@ def L4_ploadrit_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110000100;
let isPredicated = 1;
let hasNewValue = 1;
@@ -12971,15 +13078,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadri_rr";
let CextOpcode = "L2_loadri";
let InputType = "reg";
-let BaseOpcode = "L4_loadri_rr";
}
def L4_ploadritnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memw(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011111100;
@@ -12991,8 +13098,8 @@ let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L4_loadri_abs";
+let CextOpcode = "L2_loadri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13004,7 +13111,7 @@ def L4_ploadritnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4.new) $Rd32 = memw($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110010100;
let isPredicated = 1;
let hasNewValue = 1;
@@ -13013,15 +13120,15 @@ let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadri_rr";
let CextOpcode = "L2_loadri";
let InputType = "reg";
-let BaseOpcode = "L4_loadri_rr";
}
def L4_ploadrubf_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memub(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011111001;
@@ -13033,8 +13140,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L4_loadrub_abs";
+let CextOpcode = "L2_loadrub";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13046,7 +13153,7 @@ def L4_ploadrubf_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110001001;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -13055,15 +13162,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrub_rr";
let CextOpcode = "L2_loadrub";
let InputType = "reg";
-let BaseOpcode = "L4_loadrub_rr";
}
def L4_ploadrubfnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memub(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011111001;
@@ -13076,8 +13183,8 @@ let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L4_loadrub_abs";
+let CextOpcode = "L2_loadrub";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13089,7 +13196,7 @@ def L4_ploadrubfnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110011001;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -13099,15 +13206,15 @@ let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrub_rr";
let CextOpcode = "L2_loadrub";
let InputType = "reg";
-let BaseOpcode = "L4_loadrub_rr";
}
def L4_ploadrubt_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memub(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011111001;
@@ -13118,8 +13225,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L4_loadrub_abs";
+let CextOpcode = "L2_loadrub";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13131,7 +13238,7 @@ def L4_ploadrubt_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110000001;
let isPredicated = 1;
let hasNewValue = 1;
@@ -13139,15 +13246,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrub_rr";
let CextOpcode = "L2_loadrub";
let InputType = "reg";
-let BaseOpcode = "L4_loadrub_rr";
}
def L4_ploadrubtnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memub(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011111001;
@@ -13159,8 +13266,8 @@ let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L4_loadrub_abs";
+let CextOpcode = "L2_loadrub";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13172,7 +13279,7 @@ def L4_ploadrubtnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4.new) $Rd32 = memub($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110010001;
let isPredicated = 1;
let hasNewValue = 1;
@@ -13181,15 +13288,15 @@ let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadrub_rr";
let CextOpcode = "L2_loadrub";
let InputType = "reg";
-let BaseOpcode = "L4_loadrub_rr";
}
def L4_ploadruhf_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4) $Rd32 = memuh(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b101;
let Inst{31-21} = 0b10011111011;
@@ -13201,8 +13308,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L4_loadruh_abs";
+let CextOpcode = "L2_loadruh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13214,7 +13321,7 @@ def L4_ploadruhf_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110001011;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -13223,15 +13330,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadruh_rr";
let CextOpcode = "L2_loadruh";
let InputType = "reg";
-let BaseOpcode = "L4_loadruh_rr";
}
def L4_ploadruhfnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if (!$Pt4.new) $Rd32 = memuh(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b111;
let Inst{31-21} = 0b10011111011;
@@ -13244,8 +13351,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L4_loadruh_abs";
+let CextOpcode = "L2_loadruh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13257,7 +13364,7 @@ def L4_ploadruhfnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if (!$Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110011011;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -13267,15 +13374,15 @@ let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadruh_rr";
let CextOpcode = "L2_loadruh";
let InputType = "reg";
-let BaseOpcode = "L4_loadruh_rr";
}
def L4_ploadruht_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4) $Rd32 = memuh(#$Ii)",
-tc_7646c131, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_7c6d32e4, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b100;
let Inst{31-21} = 0b10011111011;
@@ -13286,8 +13393,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L4_loadruh_abs";
+let CextOpcode = "L2_loadruh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13299,7 +13406,7 @@ def L4_ploadruht_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_e4b3cb20, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_45791fb8, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110000011;
let isPredicated = 1;
let hasNewValue = 1;
@@ -13307,15 +13414,15 @@ let opNewValue = 0;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayLoad = 1;
+let BaseOpcode = "L4_loadruh_rr";
let CextOpcode = "L2_loadruh";
let InputType = "reg";
-let BaseOpcode = "L4_loadruh_rr";
}
def L4_ploadruhtnew_abs : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pt4, u32_0Imm:$Ii),
"if ($Pt4.new) $Rd32 = memuh(#$Ii)",
-tc_3b5b7ef9, TypeLD>, Enc_2301d6, AddrModeRel {
+tc_822c3c68, TypeLD>, Enc_2301d6, AddrModeRel {
let Inst{7-5} = 0b100;
let Inst{13-11} = 0b110;
let Inst{31-21} = 0b10011111011;
@@ -13327,8 +13434,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L4_loadruh_abs";
+let CextOpcode = "L2_loadruh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -13340,7 +13447,7 @@ def L4_ploadruhtnew_rr : HInst<
(outs IntRegs:$Rd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32, u2_0Imm:$Ii),
"if ($Pv4.new) $Rd32 = memuh($Rs32+$Rt32<<#$Ii)",
-tc_25a78932, TypeLD>, Enc_2e1979, AddrModeRel {
+tc_b7c4062a, TypeLD>, Enc_2e1979, AddrModeRel {
let Inst{31-21} = 0b00110010011;
let isPredicated = 1;
let hasNewValue = 1;
@@ -13349,15 +13456,15 @@ let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayLoad = 1;
+let BaseOpcode = "L4_loadruh_rr";
let CextOpcode = "L2_loadruh";
let InputType = "reg";
-let BaseOpcode = "L4_loadruh_rr";
}
def L4_return : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = dealloc_return($Rs32):raw",
-tc_675e4897, TypeLD>, Enc_3a3d62, PredNewRel {
+tc_40d64c94, TypeLD>, Enc_3a3d62, PredNewRel {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10010110000;
let isTerminator = 1;
@@ -13378,7 +13485,7 @@ def L4_return_f : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32),
"if (!$Pv4) $Rdd32 = dealloc_return($Rs32):raw",
-tc_2b8da4c2, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_df5d53f9, TypeLD>, Enc_b7fad3, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1100;
let Inst{31-21} = 0b10010110000;
@@ -13400,7 +13507,7 @@ def L4_return_fnew_pnt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32),
"if (!$Pv4.new) $Rdd32 = dealloc_return($Rs32):nt:raw",
-tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_14ab4f41, TypeLD>, Enc_b7fad3, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1010;
let Inst{31-21} = 0b10010110000;
@@ -13423,7 +13530,7 @@ def L4_return_fnew_pt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32),
"if (!$Pv4.new) $Rdd32 = dealloc_return($Rs32):t:raw",
-tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_14ab4f41, TypeLD>, Enc_b7fad3, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b1110;
let Inst{31-21} = 0b10010110000;
@@ -13446,7 +13553,7 @@ def L4_return_map_to_raw_f : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4) dealloc_return",
-tc_2b8da4c2, TypeMAPPING>, Requires<[HasV65]> {
+tc_df5d53f9, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13454,7 +13561,7 @@ def L4_return_map_to_raw_fnew_pnt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4.new) dealloc_return:nt",
-tc_9da59d12, TypeMAPPING>, Requires<[HasV65]> {
+tc_14ab4f41, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13462,7 +13569,7 @@ def L4_return_map_to_raw_fnew_pt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if (!$Pv4.new) dealloc_return:t",
-tc_9da59d12, TypeMAPPING>, Requires<[HasV65]> {
+tc_14ab4f41, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13470,7 +13577,7 @@ def L4_return_map_to_raw_t : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4) dealloc_return",
-tc_4d5fa3a1, TypeMAPPING>, Requires<[HasV65]> {
+tc_f38f92e1, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13478,7 +13585,7 @@ def L4_return_map_to_raw_tnew_pnt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4.new) dealloc_return:nt",
-tc_e06f432a, TypeMAPPING>, Requires<[HasV65]> {
+tc_1981450d, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13486,7 +13593,7 @@ def L4_return_map_to_raw_tnew_pt : HInst<
(outs),
(ins PredRegs:$Pv4),
"if ($Pv4.new) dealloc_return:t",
-tc_e06f432a, TypeMAPPING>, Requires<[HasV65]> {
+tc_1981450d, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13494,7 +13601,7 @@ def L4_return_t : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32),
"if ($Pv4) $Rdd32 = dealloc_return($Rs32):raw",
-tc_2b8da4c2, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_df5d53f9, TypeLD>, Enc_b7fad3, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b0100;
let Inst{31-21} = 0b10010110000;
@@ -13515,7 +13622,7 @@ def L4_return_tnew_pnt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32),
"if ($Pv4.new) $Rdd32 = dealloc_return($Rs32):nt:raw",
-tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_14ab4f41, TypeLD>, Enc_b7fad3, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b0010;
let Inst{31-21} = 0b10010110000;
@@ -13537,7 +13644,7 @@ def L4_return_tnew_pt : HInst<
(outs DoubleRegs:$Rdd32),
(ins PredRegs:$Pv4, IntRegs:$Rs32),
"if ($Pv4.new) $Rdd32 = dealloc_return($Rs32):t:raw",
-tc_9da59d12, TypeLD>, Enc_b7fad3, PredNewRel {
+tc_14ab4f41, TypeLD>, Enc_b7fad3, PredNewRel {
let Inst{7-5} = 0b000;
let Inst{13-10} = 0b0110;
let Inst{31-21} = 0b10010110000;
@@ -13559,7 +13666,7 @@ def L4_sub_memopb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+#$Ii) -= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_d44e31 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_d44e31 {
let Inst{6-5} = 0b01;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110000;
@@ -13578,7 +13685,7 @@ def L4_sub_memopb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memb($Rs32) -= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13586,7 +13693,7 @@ def L4_sub_memoph_io : HInst<
(outs),
(ins IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) -= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_163a3c {
+tc_9bcfb2ee, TypeV4LDST>, Enc_163a3c {
let Inst{6-5} = 0b01;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110001;
@@ -13605,7 +13712,7 @@ def L4_sub_memoph_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memh($Rs32) -= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13613,7 +13720,7 @@ def L4_sub_memopw_io : HInst<
(outs),
(ins IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+#$Ii) -= $Rt32",
-tc_7186d325, TypeV4LDST>, Enc_226535 {
+tc_9bcfb2ee, TypeV4LDST>, Enc_226535 {
let Inst{6-5} = 0b01;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00111110010;
@@ -13632,7 +13739,7 @@ def L4_sub_memopw_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memw($Rs32) -= $Rt32",
-tc_7186d325, TypeMAPPING> {
+tc_9bcfb2ee, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13640,7 +13747,7 @@ def L6_deallocframe_map_to_raw : HInst<
(outs),
(ins),
"deallocframe",
-tc_15aa71c5, TypeMAPPING>, Requires<[HasV65]> {
+tc_e9170fb7, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13648,7 +13755,7 @@ def L6_memcpy : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32, ModRegs:$Mu2),
"memcpy($Rs32,$Rt32,$Mu2)",
-tc_a6b1eca9, TypeLD>, Enc_a75aa6, Requires<[HasV66]> {
+tc_5944960d, TypeLD>, Enc_a75aa6, Requires<[HasV66]> {
let Inst{7-0} = 0b01000000;
let Inst{31-21} = 0b10010010000;
let mayLoad = 1;
@@ -13659,7 +13766,7 @@ def L6_return_map_to_raw : HInst<
(outs),
(ins),
"dealloc_return",
-tc_675e4897, TypeMAPPING>, Requires<[HasV65]> {
+tc_40d64c94, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -13667,7 +13774,7 @@ def M2_acci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += add($Rs32,$Rt32)",
-tc_f675fee8, TypeM>, Enc_2ae154, ImmRegRel {
+tc_2c13e7f5, TypeM>, Enc_2ae154, ImmRegRel {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -13682,7 +13789,7 @@ def M2_accii : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rx32 += add($Rs32,#$Ii)",
-tc_f675fee8, TypeM>, Enc_c90aca, ImmRegRel {
+tc_2c13e7f5, TypeM>, Enc_c90aca, ImmRegRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100010000;
let hasNewValue = 1;
@@ -13701,7 +13808,7 @@ def M2_cmaci_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += cmpyi($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111000;
@@ -13712,7 +13819,7 @@ def M2_cmacr_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += cmpyr($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111000;
@@ -13723,7 +13830,7 @@ def M2_cmacs_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += cmpy($Rs32,$Rt32):sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111000;
@@ -13735,7 +13842,7 @@ def M2_cmacs_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += cmpy($Rs32,$Rt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111100;
@@ -13747,7 +13854,7 @@ def M2_cmacsc_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += cmpy($Rs32,$Rt32*):sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111010;
@@ -13759,7 +13866,7 @@ def M2_cmacsc_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += cmpy($Rs32,$Rt32*):<<1:sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111110;
@@ -13771,7 +13878,7 @@ def M2_cmpyi_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = cmpyi($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101000;
@@ -13781,7 +13888,7 @@ def M2_cmpyr_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = cmpyr($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101000;
@@ -13791,7 +13898,7 @@ def M2_cmpyrs_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = cmpy($Rs32,$Rt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101001;
@@ -13804,7 +13911,7 @@ def M2_cmpyrs_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = cmpy($Rs32,$Rt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101101;
@@ -13817,7 +13924,7 @@ def M2_cmpyrsc_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = cmpy($Rs32,$Rt32*):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101011;
@@ -13830,7 +13937,7 @@ def M2_cmpyrsc_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = cmpy($Rs32,$Rt32*):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101111;
@@ -13843,7 +13950,7 @@ def M2_cmpys_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = cmpy($Rs32,$Rt32):sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101000;
@@ -13854,7 +13961,7 @@ def M2_cmpys_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = cmpy($Rs32,$Rt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101100;
@@ -13865,7 +13972,7 @@ def M2_cmpysc_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = cmpy($Rs32,$Rt32*):sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101010;
@@ -13876,7 +13983,7 @@ def M2_cmpysc_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = cmpy($Rs32,$Rt32*):<<1:sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101110;
@@ -13887,7 +13994,7 @@ def M2_cnacs_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= cmpy($Rs32,$Rt32):sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111000;
@@ -13899,7 +14006,7 @@ def M2_cnacs_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= cmpy($Rs32,$Rt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111100;
@@ -13911,7 +14018,7 @@ def M2_cnacsc_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= cmpy($Rs32,$Rt32*):sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111010;
@@ -13923,7 +14030,7 @@ def M2_cnacsc_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= cmpy($Rs32,$Rt32*):<<1:sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111110;
@@ -13935,7 +14042,7 @@ def M2_dpmpyss_acc_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111000;
@@ -13946,7 +14053,7 @@ def M2_dpmpyss_nac_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111001;
@@ -13957,7 +14064,7 @@ def M2_dpmpyss_rnd_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32):rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101001;
@@ -13969,7 +14076,7 @@ def M2_dpmpyss_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101000;
@@ -13979,7 +14086,7 @@ def M2_dpmpyuu_acc_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111010;
@@ -13990,7 +14097,7 @@ def M2_dpmpyuu_nac_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111011;
@@ -14001,7 +14108,7 @@ def M2_dpmpyuu_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101010;
@@ -14011,7 +14118,7 @@ def M2_hmmpyh_rs1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32.h):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101101;
@@ -14024,7 +14131,7 @@ def M2_hmmpyh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32.h):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101101;
@@ -14037,7 +14144,7 @@ def M2_hmmpyl_rs1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32.l):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101111;
@@ -14050,7 +14157,7 @@ def M2_hmmpyl_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32.l):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101101;
@@ -14063,7 +14170,7 @@ def M2_maci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyi($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_2ae154, ImmRegRel {
+tc_7f8ae742, TypeM>, Enc_2ae154, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -14078,7 +14185,7 @@ def M2_macsin : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
"$Rx32 -= mpyi($Rs32,#$Ii)",
-tc_05d3a09b, TypeM>, Enc_c90aca {
+tc_a154b476, TypeM>, Enc_c90aca {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100001100;
let hasNewValue = 1;
@@ -14096,7 +14203,7 @@ def M2_macsip : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u32_0Imm:$Ii),
"$Rx32 += mpyi($Rs32,#$Ii)",
-tc_05d3a09b, TypeM>, Enc_c90aca, ImmRegRel {
+tc_a154b476, TypeM>, Enc_c90aca, ImmRegRel {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100001000;
let hasNewValue = 1;
@@ -14115,7 +14222,7 @@ def M2_mmachs_rs0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywoh($Rss32,$Rtt32):rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -14127,7 +14234,7 @@ def M2_mmachs_rs1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -14139,7 +14246,7 @@ def M2_mmachs_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywoh($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -14151,7 +14258,7 @@ def M2_mmachs_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywoh($Rss32,$Rtt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010100;
@@ -14163,7 +14270,7 @@ def M2_mmacls_rs0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweh($Rss32,$Rtt32):rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -14175,7 +14282,7 @@ def M2_mmacls_rs1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -14187,7 +14294,7 @@ def M2_mmacls_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweh($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -14199,7 +14306,7 @@ def M2_mmacls_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweh($Rss32,$Rtt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010100;
@@ -14211,7 +14318,7 @@ def M2_mmacuhs_rs0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywouh($Rss32,$Rtt32):rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010011;
@@ -14223,7 +14330,7 @@ def M2_mmacuhs_rs1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
@@ -14235,7 +14342,7 @@ def M2_mmacuhs_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywouh($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010010;
@@ -14247,7 +14354,7 @@ def M2_mmacuhs_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpywouh($Rss32,$Rtt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010110;
@@ -14259,7 +14366,7 @@ def M2_mmaculs_rs0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweuh($Rss32,$Rtt32):rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010011;
@@ -14271,7 +14378,7 @@ def M2_mmaculs_rs1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
@@ -14283,7 +14390,7 @@ def M2_mmaculs_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweuh($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010010;
@@ -14295,7 +14402,7 @@ def M2_mmaculs_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyweuh($Rss32,$Rtt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010110;
@@ -14307,7 +14414,7 @@ def M2_mmpyh_rs0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywoh($Rss32,$Rtt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000001;
@@ -14318,7 +14425,7 @@ def M2_mmpyh_rs1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -14329,7 +14436,7 @@ def M2_mmpyh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywoh($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -14340,7 +14447,7 @@ def M2_mmpyh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywoh($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000100;
@@ -14351,7 +14458,7 @@ def M2_mmpyl_rs0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweh($Rss32,$Rtt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000001;
@@ -14362,7 +14469,7 @@ def M2_mmpyl_rs1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -14373,7 +14480,7 @@ def M2_mmpyl_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweh($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -14384,7 +14491,7 @@ def M2_mmpyl_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweh($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000100;
@@ -14395,7 +14502,7 @@ def M2_mmpyuh_rs0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywouh($Rss32,$Rtt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000011;
@@ -14406,7 +14513,7 @@ def M2_mmpyuh_rs1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000111;
@@ -14417,7 +14524,7 @@ def M2_mmpyuh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywouh($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -14428,7 +14535,7 @@ def M2_mmpyuh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpywouh($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000110;
@@ -14439,7 +14546,7 @@ def M2_mmpyul_rs0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweuh($Rss32,$Rtt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000011;
@@ -14450,7 +14557,7 @@ def M2_mmpyul_rs1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000111;
@@ -14461,7 +14568,7 @@ def M2_mmpyul_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweuh($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -14472,7 +14579,7 @@ def M2_mmpyul_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyweuh($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000110;
@@ -14483,7 +14590,7 @@ def M2_mnaci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyi($Rs32,$Rt32)",
-tc_bdceeac1, TypeM>, Enc_2ae154, Requires<[HasV66]> {
+tc_01e1be3b, TypeM>, Enc_2ae154, Requires<[HasV66]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111100;
@@ -14496,7 +14603,7 @@ def M2_mpy_acc_hh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14509,7 +14616,7 @@ def M2_mpy_acc_hh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14522,7 +14629,7 @@ def M2_mpy_acc_hl_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14535,7 +14642,7 @@ def M2_mpy_acc_hl_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14548,7 +14655,7 @@ def M2_mpy_acc_lh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14561,7 +14668,7 @@ def M2_mpy_acc_lh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14574,7 +14681,7 @@ def M2_mpy_acc_ll_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14587,7 +14694,7 @@ def M2_mpy_acc_ll_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14600,7 +14707,7 @@ def M2_mpy_acc_sat_hh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.h):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14614,7 +14721,7 @@ def M2_mpy_acc_sat_hh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.h):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14628,7 +14735,7 @@ def M2_mpy_acc_sat_hl_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.l):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14642,7 +14749,7 @@ def M2_mpy_acc_sat_hl_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.h,$Rt32.l):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14656,7 +14763,7 @@ def M2_mpy_acc_sat_lh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.h):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14670,7 +14777,7 @@ def M2_mpy_acc_sat_lh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.h):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14684,7 +14791,7 @@ def M2_mpy_acc_sat_ll_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.l):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110000;
@@ -14698,7 +14805,7 @@ def M2_mpy_acc_sat_ll_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32.l,$Rt32.l):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110100;
@@ -14712,7 +14819,7 @@ def M2_mpy_hh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -14724,7 +14831,7 @@ def M2_mpy_hh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -14736,7 +14843,7 @@ def M2_mpy_hl_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -14748,7 +14855,7 @@ def M2_mpy_hl_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -14760,7 +14867,7 @@ def M2_mpy_lh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -14772,7 +14879,7 @@ def M2_mpy_lh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -14784,7 +14891,7 @@ def M2_mpy_ll_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -14796,7 +14903,7 @@ def M2_mpy_ll_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -14808,7 +14915,7 @@ def M2_mpy_nac_hh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14821,7 +14928,7 @@ def M2_mpy_nac_hh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14834,7 +14941,7 @@ def M2_mpy_nac_hl_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14847,7 +14954,7 @@ def M2_mpy_nac_hl_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14860,7 +14967,7 @@ def M2_mpy_nac_lh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14873,7 +14980,7 @@ def M2_mpy_nac_lh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14886,7 +14993,7 @@ def M2_mpy_nac_ll_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14899,7 +15006,7 @@ def M2_mpy_nac_ll_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14912,7 +15019,7 @@ def M2_mpy_nac_sat_hh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.h):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14926,7 +15033,7 @@ def M2_mpy_nac_sat_hh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.h):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14940,7 +15047,7 @@ def M2_mpy_nac_sat_hl_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.l):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14954,7 +15061,7 @@ def M2_mpy_nac_sat_hl_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.h,$Rt32.l):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14968,7 +15075,7 @@ def M2_mpy_nac_sat_lh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.h):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -14982,7 +15089,7 @@ def M2_mpy_nac_sat_lh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.h):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -14996,7 +15103,7 @@ def M2_mpy_nac_sat_ll_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.l):sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110001;
@@ -15010,7 +15117,7 @@ def M2_mpy_nac_sat_ll_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32.l,$Rt32.l):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110101;
@@ -15024,7 +15131,7 @@ def M2_mpy_rnd_hh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15036,7 +15143,7 @@ def M2_mpy_rnd_hh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15048,7 +15155,7 @@ def M2_mpy_rnd_hl_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15060,7 +15167,7 @@ def M2_mpy_rnd_hl_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15072,7 +15179,7 @@ def M2_mpy_rnd_lh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15084,7 +15191,7 @@ def M2_mpy_rnd_lh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15096,7 +15203,7 @@ def M2_mpy_rnd_ll_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15108,7 +15215,7 @@ def M2_mpy_rnd_ll_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15120,7 +15227,7 @@ def M2_mpy_sat_hh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -15133,7 +15240,7 @@ def M2_mpy_sat_hh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -15146,7 +15253,7 @@ def M2_mpy_sat_hl_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -15159,7 +15266,7 @@ def M2_mpy_sat_hl_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -15172,7 +15279,7 @@ def M2_mpy_sat_lh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -15185,7 +15292,7 @@ def M2_mpy_sat_lh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -15198,7 +15305,7 @@ def M2_mpy_sat_ll_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100000;
@@ -15211,7 +15318,7 @@ def M2_mpy_sat_ll_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100100;
@@ -15224,7 +15331,7 @@ def M2_mpy_sat_rnd_hh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15237,7 +15344,7 @@ def M2_mpy_sat_rnd_hh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15250,7 +15357,7 @@ def M2_mpy_sat_rnd_hl_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15263,7 +15370,7 @@ def M2_mpy_sat_rnd_hl_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15276,7 +15383,7 @@ def M2_mpy_sat_rnd_lh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15289,7 +15396,7 @@ def M2_mpy_sat_rnd_lh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15302,7 +15409,7 @@ def M2_mpy_sat_rnd_ll_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100001;
@@ -15315,7 +15422,7 @@ def M2_mpy_sat_rnd_ll_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100101;
@@ -15328,7 +15435,7 @@ def M2_mpy_up : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101000;
@@ -15340,7 +15447,7 @@ def M2_mpy_up_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101101;
@@ -15352,7 +15459,7 @@ def M2_mpy_up_s1_sat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpy($Rs32,$Rt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101111;
@@ -15365,7 +15472,7 @@ def M2_mpyd_acc_hh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110000;
@@ -15376,7 +15483,7 @@ def M2_mpyd_acc_hh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110100;
@@ -15387,7 +15494,7 @@ def M2_mpyd_acc_hl_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110000;
@@ -15398,7 +15505,7 @@ def M2_mpyd_acc_hl_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110100;
@@ -15409,7 +15516,7 @@ def M2_mpyd_acc_lh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110000;
@@ -15420,7 +15527,7 @@ def M2_mpyd_acc_lh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110100;
@@ -15431,7 +15538,7 @@ def M2_mpyd_acc_ll_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110000;
@@ -15442,7 +15549,7 @@ def M2_mpyd_acc_ll_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpy($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110100;
@@ -15453,7 +15560,7 @@ def M2_mpyd_hh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100000;
@@ -15463,7 +15570,7 @@ def M2_mpyd_hh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100100;
@@ -15473,7 +15580,7 @@ def M2_mpyd_hl_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100000;
@@ -15483,7 +15590,7 @@ def M2_mpyd_hl_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100100;
@@ -15493,7 +15600,7 @@ def M2_mpyd_lh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100000;
@@ -15503,7 +15610,7 @@ def M2_mpyd_lh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100100;
@@ -15513,7 +15620,7 @@ def M2_mpyd_ll_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100000;
@@ -15523,7 +15630,7 @@ def M2_mpyd_ll_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100100;
@@ -15533,7 +15640,7 @@ def M2_mpyd_nac_hh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110001;
@@ -15544,7 +15651,7 @@ def M2_mpyd_nac_hh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110101;
@@ -15555,7 +15662,7 @@ def M2_mpyd_nac_hl_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110001;
@@ -15566,7 +15673,7 @@ def M2_mpyd_nac_hl_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110101;
@@ -15577,7 +15684,7 @@ def M2_mpyd_nac_lh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110001;
@@ -15588,7 +15695,7 @@ def M2_mpyd_nac_lh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110101;
@@ -15599,7 +15706,7 @@ def M2_mpyd_nac_ll_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110001;
@@ -15610,7 +15717,7 @@ def M2_mpyd_nac_ll_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpy($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110101;
@@ -15621,7 +15728,7 @@ def M2_mpyd_rnd_hh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.h):rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100001;
@@ -15631,7 +15738,7 @@ def M2_mpyd_rnd_hh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.h):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100101;
@@ -15641,7 +15748,7 @@ def M2_mpyd_rnd_hl_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.l):rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100001;
@@ -15651,7 +15758,7 @@ def M2_mpyd_rnd_hl_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.h,$Rt32.l):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100101;
@@ -15661,7 +15768,7 @@ def M2_mpyd_rnd_lh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.h):rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100001;
@@ -15671,7 +15778,7 @@ def M2_mpyd_rnd_lh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.h):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100101;
@@ -15681,7 +15788,7 @@ def M2_mpyd_rnd_ll_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.l):rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100001;
@@ -15691,7 +15798,7 @@ def M2_mpyd_rnd_ll_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpy($Rs32.l,$Rt32.l):<<1:rnd",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100101;
@@ -15701,7 +15808,7 @@ def M2_mpyi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyi($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_5ab2be, ImmRegRel {
+tc_c21d7447, TypeM>, Enc_5ab2be, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101000;
@@ -15715,7 +15822,7 @@ def M2_mpysin : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u8_0Imm:$Ii),
"$Rd32 = -mpyi($Rs32,#$Ii)",
-tc_c8ce0b5c, TypeM>, Enc_b8c967 {
+tc_38382228, TypeM>, Enc_b8c967 {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100000100;
let hasNewValue = 1;
@@ -15726,7 +15833,7 @@ def M2_mpysip : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u32_0Imm:$Ii),
"$Rd32 = +mpyi($Rs32,#$Ii)",
-tc_c8ce0b5c, TypeM>, Enc_b8c967 {
+tc_38382228, TypeM>, Enc_b8c967 {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100000000;
let hasNewValue = 1;
@@ -15742,7 +15849,7 @@ def M2_mpysmi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, m32_0Imm:$Ii),
"$Rd32 = mpyi($Rs32,#$Ii)",
-tc_c8ce0b5c, TypeM>, ImmRegRel {
+tc_38382228, TypeM>, ImmRegRel {
let hasNewValue = 1;
let opNewValue = 0;
let CextOpcode = "M2_mpyi";
@@ -15758,7 +15865,7 @@ def M2_mpysu_up : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpysu($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101011;
@@ -15770,7 +15877,7 @@ def M2_mpyu_acc_hh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110010;
@@ -15783,7 +15890,7 @@ def M2_mpyu_acc_hh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110110;
@@ -15796,7 +15903,7 @@ def M2_mpyu_acc_hl_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110010;
@@ -15809,7 +15916,7 @@ def M2_mpyu_acc_hl_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110110;
@@ -15822,7 +15929,7 @@ def M2_mpyu_acc_lh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110010;
@@ -15835,7 +15942,7 @@ def M2_mpyu_acc_lh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110110;
@@ -15848,7 +15955,7 @@ def M2_mpyu_acc_ll_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110010;
@@ -15861,7 +15968,7 @@ def M2_mpyu_acc_ll_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpyu($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110110;
@@ -15874,7 +15981,7 @@ def M2_mpyu_hh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.h,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100010;
@@ -15886,7 +15993,7 @@ def M2_mpyu_hh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.h,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100110;
@@ -15898,7 +16005,7 @@ def M2_mpyu_hl_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.h,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100010;
@@ -15910,7 +16017,7 @@ def M2_mpyu_hl_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.h,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100110;
@@ -15922,7 +16029,7 @@ def M2_mpyu_lh_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.l,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100010;
@@ -15934,7 +16041,7 @@ def M2_mpyu_lh_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.l,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100110;
@@ -15946,7 +16053,7 @@ def M2_mpyu_ll_s0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.l,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100010;
@@ -15958,7 +16065,7 @@ def M2_mpyu_ll_s1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32.l,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101100110;
@@ -15970,7 +16077,7 @@ def M2_mpyu_nac_hh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110011;
@@ -15983,7 +16090,7 @@ def M2_mpyu_nac_hh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110111;
@@ -15996,7 +16103,7 @@ def M2_mpyu_nac_hl_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110011;
@@ -16009,7 +16116,7 @@ def M2_mpyu_nac_hl_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110111;
@@ -16022,7 +16129,7 @@ def M2_mpyu_nac_lh_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110011;
@@ -16035,7 +16142,7 @@ def M2_mpyu_nac_lh_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110111;
@@ -16048,7 +16155,7 @@ def M2_mpyu_nac_ll_s0 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110011;
@@ -16061,7 +16168,7 @@ def M2_mpyu_nac_ll_s1 : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101110111;
@@ -16074,7 +16181,7 @@ def M2_mpyu_up : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyu($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101010;
@@ -16086,7 +16193,7 @@ def M2_mpyud_acc_hh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110010;
@@ -16097,7 +16204,7 @@ def M2_mpyud_acc_hh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110110;
@@ -16108,7 +16215,7 @@ def M2_mpyud_acc_hl_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110010;
@@ -16119,7 +16226,7 @@ def M2_mpyud_acc_hl_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110110;
@@ -16130,7 +16237,7 @@ def M2_mpyud_acc_lh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110010;
@@ -16141,7 +16248,7 @@ def M2_mpyud_acc_lh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110110;
@@ -16152,7 +16259,7 @@ def M2_mpyud_acc_ll_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110010;
@@ -16163,7 +16270,7 @@ def M2_mpyud_acc_ll_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += mpyu($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110110;
@@ -16174,7 +16281,7 @@ def M2_mpyud_hh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.h,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100010;
@@ -16184,7 +16291,7 @@ def M2_mpyud_hh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.h,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100110;
@@ -16194,7 +16301,7 @@ def M2_mpyud_hl_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.h,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100010;
@@ -16204,7 +16311,7 @@ def M2_mpyud_hl_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.h,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100110;
@@ -16214,7 +16321,7 @@ def M2_mpyud_lh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.l,$Rt32.h)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100010;
@@ -16224,7 +16331,7 @@ def M2_mpyud_lh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.l,$Rt32.h):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100110;
@@ -16234,7 +16341,7 @@ def M2_mpyud_ll_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.l,$Rt32.l)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100010;
@@ -16244,7 +16351,7 @@ def M2_mpyud_ll_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = mpyu($Rs32.l,$Rt32.l):<<1",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100100110;
@@ -16254,7 +16361,7 @@ def M2_mpyud_nac_hh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.h,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110011;
@@ -16265,7 +16372,7 @@ def M2_mpyud_nac_hh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.h,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110111;
@@ -16276,7 +16383,7 @@ def M2_mpyud_nac_hl_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.h,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110011;
@@ -16287,7 +16394,7 @@ def M2_mpyud_nac_hl_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.h,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110111;
@@ -16298,7 +16405,7 @@ def M2_mpyud_nac_lh_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.l,$Rt32.h)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110011;
@@ -16309,7 +16416,7 @@ def M2_mpyud_nac_lh_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.l,$Rt32.h):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110111;
@@ -16320,7 +16427,7 @@ def M2_mpyud_nac_ll_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.l,$Rt32.l)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110011;
@@ -16331,7 +16438,7 @@ def M2_mpyud_nac_ll_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 -= mpyu($Rs32.l,$Rt32.l):<<1",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100110111;
@@ -16342,7 +16449,7 @@ def M2_mpyui : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = mpyui($Rs32,$Rt32)",
-tc_bafaade3, TypeM> {
+tc_c21d7447, TypeM> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -16352,7 +16459,7 @@ def M2_nacci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= add($Rs32,$Rt32)",
-tc_f675fee8, TypeM>, Enc_2ae154 {
+tc_2c13e7f5, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111100;
@@ -16366,7 +16473,7 @@ def M2_naccii : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rx32 -= add($Rs32,#$Ii)",
-tc_f675fee8, TypeM>, Enc_c90aca {
+tc_2c13e7f5, TypeM>, Enc_c90aca {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100010100;
let hasNewValue = 1;
@@ -16384,7 +16491,7 @@ def M2_subacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rt32, IntRegs:$Rs32),
"$Rx32 += sub($Rt32,$Rs32)",
-tc_f675fee8, TypeM>, Enc_a568d4 {
+tc_2c13e7f5, TypeM>, Enc_a568d4 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111000;
@@ -16398,7 +16505,7 @@ def M2_vabsdiffh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffh($Rtt32,$Rss32)",
-tc_002cb246, TypeM>, Enc_ea23e4 {
+tc_0dfac0a7, TypeM>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000011;
@@ -16408,7 +16515,7 @@ def M2_vabsdiffw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffw($Rtt32,$Rss32)",
-tc_002cb246, TypeM>, Enc_ea23e4 {
+tc_0dfac0a7, TypeM>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000001;
@@ -16418,7 +16525,7 @@ def M2_vcmac_s0_sat_i : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vcmpyi($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010010;
@@ -16430,7 +16537,7 @@ def M2_vcmac_s0_sat_r : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vcmpyr($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -16442,7 +16549,7 @@ def M2_vcmpy_s0_sat_i : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vcmpyi($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -16453,7 +16560,7 @@ def M2_vcmpy_s0_sat_r : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vcmpyr($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000001;
@@ -16464,7 +16571,7 @@ def M2_vcmpy_s1_sat_i : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vcmpyi($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000110;
@@ -16475,7 +16582,7 @@ def M2_vcmpy_s1_sat_r : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vcmpyr($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -16486,7 +16593,7 @@ def M2_vdmacs_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vdmpy($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -16498,7 +16605,7 @@ def M2_vdmacs_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vdmpy($Rss32,$Rtt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010100;
@@ -16510,7 +16617,7 @@ def M2_vdmpyrs_s0 : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vdmpy($Rss32,$Rtt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_d2216a {
+tc_c21d7447, TypeM>, Enc_d2216a {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101001000;
@@ -16523,7 +16630,7 @@ def M2_vdmpyrs_s1 : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vdmpy($Rss32,$Rtt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_d2216a {
+tc_c21d7447, TypeM>, Enc_d2216a {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101001100;
@@ -16536,7 +16643,7 @@ def M2_vdmpys_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vdmpy($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -16547,7 +16654,7 @@ def M2_vdmpys_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vdmpy($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000100;
@@ -16558,7 +16665,7 @@ def M2_vmac2 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpyh($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111001;
@@ -16569,7 +16676,7 @@ def M2_vmac2es : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyeh($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -16580,7 +16687,7 @@ def M2_vmac2es_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyeh($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -16592,7 +16699,7 @@ def M2_vmac2es_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vmpyeh($Rss32,$Rtt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010100;
@@ -16604,7 +16711,7 @@ def M2_vmac2s_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpyh($Rs32,$Rt32):sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111000;
@@ -16616,7 +16723,7 @@ def M2_vmac2s_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpyh($Rs32,$Rt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111100;
@@ -16628,7 +16735,7 @@ def M2_vmac2su_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpyhsu($Rs32,$Rt32):sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111011;
@@ -16640,7 +16747,7 @@ def M2_vmac2su_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpyhsu($Rs32,$Rt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111111;
@@ -16652,7 +16759,7 @@ def M2_vmpy2es_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyeh($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -16663,7 +16770,7 @@ def M2_vmpy2es_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vmpyeh($Rss32,$Rtt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000100;
@@ -16674,7 +16781,7 @@ def M2_vmpy2s_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vmpyh($Rs32,$Rt32):sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101000;
@@ -16685,7 +16792,7 @@ def M2_vmpy2s_s0pack : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vmpyh($Rs32,$Rt32):rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101001;
@@ -16698,7 +16805,7 @@ def M2_vmpy2s_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vmpyh($Rs32,$Rt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101100;
@@ -16709,7 +16816,7 @@ def M2_vmpy2s_s1pack : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = vmpyh($Rs32,$Rt32):<<1:rnd:sat",
-tc_bafaade3, TypeM>, Enc_5ab2be {
+tc_c21d7447, TypeM>, Enc_5ab2be {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101101101;
@@ -16722,7 +16829,7 @@ def M2_vmpy2su_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vmpyhsu($Rs32,$Rt32):sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101000;
@@ -16733,7 +16840,7 @@ def M2_vmpy2su_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vmpyhsu($Rs32,$Rt32):<<1:sat",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101100;
@@ -16744,7 +16851,7 @@ def M2_vraddh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vraddh($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_d2216a {
+tc_c21d7447, TypeM>, Enc_d2216a {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101001001;
@@ -16756,7 +16863,7 @@ def M2_vradduh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vradduh($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_d2216a {
+tc_c21d7447, TypeM>, Enc_d2216a {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101001000;
@@ -16768,7 +16875,7 @@ def M2_vrcmaci_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrcmpyi($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -16779,7 +16886,7 @@ def M2_vrcmaci_s0c : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrcmpyi($Rss32,$Rtt32*)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010010;
@@ -16790,7 +16897,7 @@ def M2_vrcmacr_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrcmpyr($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -16801,7 +16908,7 @@ def M2_vrcmacr_s0c : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrcmpyr($Rss32,$Rtt32*)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010011;
@@ -16812,7 +16919,7 @@ def M2_vrcmpyi_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrcmpyi($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -16822,7 +16929,7 @@ def M2_vrcmpyi_s0c : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrcmpyi($Rss32,$Rtt32*)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -16832,7 +16939,7 @@ def M2_vrcmpyr_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrcmpyr($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -16842,7 +16949,7 @@ def M2_vrcmpyr_s0c : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrcmpyr($Rss32,$Rtt32*)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000011;
@@ -16852,7 +16959,7 @@ def M2_vrcmpys_acc_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 += vrcmpys($Rss32,$Rt32):<<1:sat",
-tc_d773585a, TypeM> {
+tc_7f8ae742, TypeM> {
let isPseudo = 1;
let Constraints = "$Rxx32 = $Rxx32in";
}
@@ -16860,7 +16967,7 @@ def M2_vrcmpys_acc_s1_h : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -16872,7 +16979,7 @@ def M2_vrcmpys_acc_s1_l : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
@@ -16884,14 +16991,14 @@ def M2_vrcmpys_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vrcmpys($Rss32,$Rt32):<<1:sat",
-tc_bafaade3, TypeM> {
+tc_c21d7447, TypeM> {
let isPseudo = 1;
}
def M2_vrcmpys_s1_h : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:hi",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -16902,7 +17009,7 @@ def M2_vrcmpys_s1_l : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrcmpys($Rss32,$Rtt32):<<1:sat:raw:lo",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000111;
@@ -16913,7 +17020,7 @@ def M2_vrcmpys_s1rp : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = vrcmpys($Rss32,$Rt32):<<1:rnd:sat",
-tc_bafaade3, TypeM> {
+tc_c21d7447, TypeM> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -16922,7 +17029,7 @@ def M2_vrcmpys_s1rp_h : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:hi",
-tc_bafaade3, TypeM>, Enc_d2216a {
+tc_c21d7447, TypeM>, Enc_d2216a {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101001101;
@@ -16935,7 +17042,7 @@ def M2_vrcmpys_s1rp_l : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = vrcmpys($Rss32,$Rtt32):<<1:rnd:sat:raw:lo",
-tc_bafaade3, TypeM>, Enc_d2216a {
+tc_c21d7447, TypeM>, Enc_d2216a {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101001101;
@@ -16948,7 +17055,7 @@ def M2_vrmac_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpyh($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010000;
@@ -16959,7 +17066,7 @@ def M2_vrmpy_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpyh($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000000;
@@ -16969,7 +17076,7 @@ def M2_xor_xacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 ^= xor($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111100;
@@ -16983,7 +17090,7 @@ def M4_and_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= and($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111010;
@@ -16997,7 +17104,7 @@ def M4_and_andn : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= and($Rs32,~$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111001;
@@ -17011,7 +17118,7 @@ def M4_and_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= or($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111010;
@@ -17025,7 +17132,7 @@ def M4_and_xor : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= xor($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111010;
@@ -17039,7 +17146,7 @@ def M4_cmpyi_wh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyiwh($Rss32,$Rt32):<<1:rnd:sat",
-tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
+tc_c21d7447, TypeS_3op>, Enc_3d5b28 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -17052,7 +17159,7 @@ def M4_cmpyi_whc : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyiwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
+tc_c21d7447, TypeS_3op>, Enc_3d5b28 {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -17065,7 +17172,7 @@ def M4_cmpyr_wh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyrwh($Rss32,$Rt32):<<1:rnd:sat",
-tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
+tc_c21d7447, TypeS_3op>, Enc_3d5b28 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -17078,7 +17185,7 @@ def M4_cmpyr_whc : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = cmpyrwh($Rss32,$Rt32*):<<1:rnd:sat",
-tc_bafaade3, TypeS_3op>, Enc_3d5b28 {
+tc_c21d7447, TypeS_3op>, Enc_3d5b28 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -17091,7 +17198,7 @@ def M4_mac_up_s1_sat : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += mpy($Rs32,$Rt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111011;
@@ -17106,7 +17213,7 @@ def M4_mpyri_addi : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii, IntRegs:$Rs32, u6_0Imm:$II),
"$Rd32 = add(#$Ii,mpyi($Rs32,#$II))",
-tc_05d3a09b, TypeALU64>, Enc_322e1b, ImmRegRel {
+tc_a154b476, TypeALU64>, Enc_322e1b, Requires<[UseCompound]>, ImmRegRel {
let Inst{31-24} = 0b11011000;
let hasNewValue = 1;
let opNewValue = 0;
@@ -17122,7 +17229,7 @@ def M4_mpyri_addr : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Ru32, IntRegs:$Rs32, u32_0Imm:$Ii),
"$Rd32 = add($Ru32,mpyi($Rs32,#$Ii))",
-tc_05d3a09b, TypeALU64>, Enc_420cf3, ImmRegRel {
+tc_a154b476, TypeALU64>, Enc_420cf3, Requires<[UseCompound]>, ImmRegRel {
let Inst{31-23} = 0b110111111;
let hasNewValue = 1;
let opNewValue = 0;
@@ -17139,7 +17246,7 @@ def M4_mpyri_addr_u2 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Ru32, u6_2Imm:$Ii, IntRegs:$Rs32),
"$Rd32 = add($Ru32,mpyi(#$Ii,$Rs32))",
-tc_1a2fd869, TypeALU64>, Enc_277737 {
+tc_503ce0f3, TypeALU64>, Enc_277737, Requires<[UseCompound]> {
let Inst{31-23} = 0b110111110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -17149,7 +17256,7 @@ def M4_mpyrr_addi : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = add(#$Ii,mpyi($Rs32,$Rt32))",
-tc_d773585a, TypeALU64>, Enc_a7b8e8, ImmRegRel {
+tc_7f8ae742, TypeALU64>, Enc_a7b8e8, Requires<[UseCompound]>, ImmRegRel {
let Inst{31-23} = 0b110101110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -17166,7 +17273,7 @@ def M4_mpyrr_addr : HInst<
(outs IntRegs:$Ry32),
(ins IntRegs:$Ru32, IntRegs:$Ry32in, IntRegs:$Rs32),
"$Ry32 = add($Ru32,mpyi($Ry32in,$Rs32))",
-tc_d773585a, TypeM>, Enc_7f1a05, ImmRegRel {
+tc_7f8ae742, TypeM>, Enc_7f1a05, Requires<[UseCompound]>, ImmRegRel {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100011000;
@@ -17181,7 +17288,7 @@ def M4_nac_up_s1_sat : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= mpy($Rs32,$Rt32):<<1:sat",
-tc_d773585a, TypeM>, Enc_2ae154 {
+tc_7f8ae742, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111011;
@@ -17196,7 +17303,7 @@ def M4_or_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= and($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111010;
@@ -17210,7 +17317,7 @@ def M4_or_andn : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= and($Rs32,~$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111001;
@@ -17224,7 +17331,7 @@ def M4_or_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= or($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111110;
@@ -17238,7 +17345,7 @@ def M4_or_xor : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= xor($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111110;
@@ -17252,7 +17359,7 @@ def M4_pmpyw : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = pmpyw($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101010;
@@ -17262,7 +17369,7 @@ def M4_pmpyw_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 ^= pmpyw($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111001;
@@ -17273,7 +17380,7 @@ def M4_vpmpyh : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vpmpyh($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101110;
@@ -17283,7 +17390,7 @@ def M4_vpmpyh_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 ^= vpmpyh($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111101;
@@ -17294,7 +17401,7 @@ def M4_vrmpyeh_acc_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpyweh($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -17305,7 +17412,7 @@ def M4_vrmpyeh_acc_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpyweh($Rss32,$Rtt32):<<1",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010101;
@@ -17316,7 +17423,7 @@ def M4_vrmpyeh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpyweh($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000010;
@@ -17326,7 +17433,7 @@ def M4_vrmpyeh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpyweh($Rss32,$Rtt32):<<1",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000110;
@@ -17336,7 +17443,7 @@ def M4_vrmpyoh_acc_s0 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpywoh($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010011;
@@ -17347,7 +17454,7 @@ def M4_vrmpyoh_acc_s1 : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpywoh($Rss32,$Rtt32):<<1",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010111;
@@ -17358,7 +17465,7 @@ def M4_vrmpyoh_s0 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpywoh($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000001;
@@ -17368,7 +17475,7 @@ def M4_vrmpyoh_s1 : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpywoh($Rss32,$Rtt32):<<1",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -17378,7 +17485,7 @@ def M4_xor_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 ^= and($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111110;
@@ -17392,7 +17499,7 @@ def M4_xor_andn : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 ^= and($Rs32,~$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111001;
@@ -17406,7 +17513,7 @@ def M4_xor_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 ^= or($Rs32,$Rt32)",
-tc_f429765c, TypeM>, Enc_2ae154 {
+tc_a4e22bbd, TypeM>, Enc_2ae154 {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101111110;
@@ -17420,7 +17527,7 @@ def M4_xor_xacc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 ^= xor($Rss32,$Rtt32)",
-tc_f429765c, TypeS_3op>, Enc_88c16c {
+tc_a4e22bbd, TypeS_3op>, Enc_88c16c {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001010100;
@@ -17431,7 +17538,7 @@ def M5_vdmacbsu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vdmpybsu($Rss32,$Rtt32):sat",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010001;
@@ -17443,7 +17550,7 @@ def M5_vdmpybsu : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vdmpybsu($Rss32,$Rtt32):sat",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
@@ -17454,7 +17561,7 @@ def M5_vmacbsu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpybsu($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111110;
@@ -17465,7 +17572,7 @@ def M5_vmacbuu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rxx32 += vmpybu($Rs32,$Rt32)",
-tc_d773585a, TypeM>, Enc_61f0b0 {
+tc_7f8ae742, TypeM>, Enc_61f0b0 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100111100;
@@ -17476,7 +17583,7 @@ def M5_vmpybsu : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vmpybsu($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101010;
@@ -17486,7 +17593,7 @@ def M5_vmpybuu : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = vmpybu($Rs32,$Rt32)",
-tc_bafaade3, TypeM>, Enc_be32a5 {
+tc_c21d7447, TypeM>, Enc_be32a5 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11100101100;
@@ -17496,7 +17603,7 @@ def M5_vrmacbsu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpybsu($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010110;
@@ -17507,7 +17614,7 @@ def M5_vrmacbuu : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 += vrmpybu($Rss32,$Rtt32)",
-tc_d773585a, TypeM>, Enc_88c16c {
+tc_7f8ae742, TypeM>, Enc_88c16c {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101010100;
@@ -17518,7 +17625,7 @@ def M5_vrmpybsu : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpybsu($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000110;
@@ -17528,7 +17635,7 @@ def M5_vrmpybuu : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vrmpybu($Rss32,$Rtt32)",
-tc_bafaade3, TypeM>, Enc_a56825 {
+tc_c21d7447, TypeM>, Enc_a56825 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000100;
@@ -17538,7 +17645,7 @@ def M6_vabsdiffb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffb($Rtt32,$Rss32)",
-tc_9461ff31, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
+tc_9b3c0462, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000111;
@@ -17548,17 +17655,222 @@ def M6_vabsdiffub : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = vabsdiffub($Rtt32,$Rss32)",
-tc_9461ff31, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
+tc_9b3c0462, TypeM>, Enc_ea23e4, Requires<[HasV62]> {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11101000101;
let prefersSlot3 = 1;
}
+def M7_dcmpyiw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = cmpyiw($Rss32,$Rtt32)",
+tc_5a4b5e58, TypeM>, Enc_a56825, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000011;
+let prefersSlot3 = 1;
+}
+def M7_dcmpyiw_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += cmpyiw($Rss32,$Rtt32)",
+tc_197dce51, TypeM>, Enc_88c16c, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010011;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M7_dcmpyiwc : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = cmpyiw($Rss32,$Rtt32*)",
+tc_5a4b5e58, TypeM>, Enc_a56825, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000111;
+let prefersSlot3 = 1;
+}
+def M7_dcmpyiwc_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += cmpyiw($Rss32,$Rtt32*)",
+tc_197dce51, TypeM>, Enc_88c16c, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b110;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010010;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M7_dcmpyrw : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = cmpyrw($Rss32,$Rtt32)",
+tc_5a4b5e58, TypeM>, Enc_a56825, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000100;
+let prefersSlot3 = 1;
+}
+def M7_dcmpyrw_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += cmpyrw($Rss32,$Rtt32)",
+tc_197dce51, TypeM>, Enc_88c16c, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010100;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M7_dcmpyrwc : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = cmpyrw($Rss32,$Rtt32*)",
+tc_5a4b5e58, TypeM>, Enc_a56825, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101000110;
+let prefersSlot3 = 1;
+}
+def M7_dcmpyrwc_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += cmpyrw($Rss32,$Rtt32*)",
+tc_197dce51, TypeM>, Enc_88c16c, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b010;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101010110;
+let prefersSlot3 = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M7_vdmpy : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rdd32 = vdmpyw($Rss32,$Rtt32)",
+tc_5a4b5e58, TypeM>, Requires<[HasV67]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+}
+def M7_vdmpy_acc : HInst<
+(outs DoubleRegs:$Rxx32),
+(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rxx32 += vdmpyw($Rss32,$Rtt32)",
+tc_197dce51, TypeM>, Requires<[HasV67]> {
+let isPseudo = 1;
+let isCodeGenOnly = 1;
+let Constraints = "$Rxx32 = $Rxx32in";
+}
+def M7_wcmpyiw : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyiw($Rss32,$Rtt32):<<1:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001001;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyiw_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyiw($Rss32,$Rtt32):<<1:rnd:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001101;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyiwc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyiw($Rss32,$Rtt32*):<<1:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001000;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyiwc_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyiw($Rss32,$Rtt32*):<<1:rnd:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b100;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001100;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyrw : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyrw($Rss32,$Rtt32):<<1:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001010;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyrw_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyrw($Rss32,$Rtt32):<<1:rnd:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001110;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyrwc : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyrw($Rss32,$Rtt32*):<<1:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001011;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
+def M7_wcmpyrwc_rnd : HInst<
+(outs IntRegs:$Rd32),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"$Rd32 = cmpyrw($Rss32,$Rtt32*):<<1:rnd:sat",
+tc_5a4b5e58, TypeM>, Enc_d2216a, Requires<[HasV67,UseAudio]> {
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b11101001111;
+let hasNewValue = 1;
+let opNewValue = 0;
+let prefersSlot3 = 1;
+let Defs = [USR_OVF];
+}
def PS_loadrbabs : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii),
"$Rd32 = memb(#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_25bef0, AddrModeRel {
let Inst{24-21} = 0b1000;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -17567,8 +17879,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrb";
let BaseOpcode = "L4_loadrb_abs";
+let CextOpcode = "L2_loadrb";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17581,15 +17893,15 @@ def PS_loadrdabs : HInst<
(outs DoubleRegs:$Rdd32),
(ins u29_3Imm:$Ii),
"$Rdd32 = memd(#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_509701, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_509701, AddrModeRel {
let Inst{24-21} = 0b1110;
let Inst{31-27} = 0b01001;
let addrMode = Absolute;
let accessSize = DoubleWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrd";
let BaseOpcode = "L4_loadrd_abs";
+let CextOpcode = "L2_loadrd";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17602,7 +17914,7 @@ def PS_loadrhabs : HInst<
(outs IntRegs:$Rd32),
(ins u31_1Imm:$Ii),
"$Rd32 = memh(#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_8df4be, AddrModeRel {
let Inst{24-21} = 0b1010;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -17611,8 +17923,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrh";
let BaseOpcode = "L4_loadrh_abs";
+let CextOpcode = "L2_loadrh";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17625,7 +17937,7 @@ def PS_loadriabs : HInst<
(outs IntRegs:$Rd32),
(ins u30_2Imm:$Ii),
"$Rd32 = memw(#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_4f4ed7, AddrModeRel {
let Inst{24-21} = 0b1100;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -17634,8 +17946,8 @@ let addrMode = Absolute;
let accessSize = WordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadri";
let BaseOpcode = "L4_loadri_abs";
+let CextOpcode = "L2_loadri";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17648,7 +17960,7 @@ def PS_loadrubabs : HInst<
(outs IntRegs:$Rd32),
(ins u32_0Imm:$Ii),
"$Rd32 = memub(#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_25bef0, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_25bef0, AddrModeRel {
let Inst{24-21} = 0b1001;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -17657,8 +17969,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadrub";
let BaseOpcode = "L4_loadrub_abs";
+let CextOpcode = "L2_loadrub";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17671,7 +17983,7 @@ def PS_loadruhabs : HInst<
(outs IntRegs:$Rd32),
(ins u31_1Imm:$Ii),
"$Rd32 = memuh(#$Ii)",
-tc_c4db48cb, TypeV2LDST>, Enc_8df4be, AddrModeRel {
+tc_8a6d0d94, TypeV2LDST>, Enc_8df4be, AddrModeRel {
let Inst{24-21} = 0b1011;
let Inst{31-27} = 0b01001;
let hasNewValue = 1;
@@ -17680,8 +17992,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let mayLoad = 1;
let isExtended = 1;
-let CextOpcode = "L2_loadruh";
let BaseOpcode = "L4_loadruh_abs";
+let CextOpcode = "L2_loadruh";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17694,15 +18006,15 @@ def PS_storerbabs : HInst<
(outs),
(ins u32_0Imm:$Ii, IntRegs:$Rt32),
"memb(#$Ii) = $Rt32",
-tc_0371abea, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
let Inst{24-21} = 0b0000;
let Inst{31-27} = 0b01001;
let addrMode = Absolute;
let accessSize = ByteAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let isPredicable = 1;
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
@@ -17716,7 +18028,7 @@ def PS_storerbnewabs : HInst<
(outs),
(ins u32_0Imm:$Ii, IntRegs:$Nt8),
"memb(#$Ii) = $Nt8.new",
-tc_5bf126a6, TypeV2LDST>, Enc_ad1831, AddrModeRel {
+tc_6e20402a, TypeV2LDST>, Enc_ad1831, AddrModeRel {
let Inst{12-11} = 0b00;
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b01001;
@@ -17727,8 +18039,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17742,15 +18054,15 @@ def PS_storerdabs : HInst<
(outs),
(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
"memd(#$Ii) = $Rtt32",
-tc_0371abea, TypeV2LDST>, Enc_5c124a, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_5c124a, AddrModeRel {
let Inst{24-21} = 0b0110;
let Inst{31-27} = 0b01001;
let addrMode = Absolute;
let accessSize = DoubleWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerdabs";
+let CextOpcode = "S2_storerd";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17763,15 +18075,15 @@ def PS_storerfabs : HInst<
(outs),
(ins u31_1Imm:$Ii, IntRegs:$Rt32),
"memh(#$Ii) = $Rt32.h",
-tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_fda92c, AddrModeRel {
let Inst{24-21} = 0b0011;
let Inst{31-27} = 0b01001;
let addrMode = Absolute;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerfabs";
+let CextOpcode = "S2_storerf";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17784,15 +18096,15 @@ def PS_storerhabs : HInst<
(outs),
(ins u31_1Imm:$Ii, IntRegs:$Rt32),
"memh(#$Ii) = $Rt32",
-tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_fda92c, AddrModeRel {
let Inst{24-21} = 0b0010;
let Inst{31-27} = 0b01001;
let addrMode = Absolute;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let isPredicable = 1;
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
@@ -17806,7 +18118,7 @@ def PS_storerhnewabs : HInst<
(outs),
(ins u31_1Imm:$Ii, IntRegs:$Nt8),
"memh(#$Ii) = $Nt8.new",
-tc_5bf126a6, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
+tc_6e20402a, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
let Inst{12-11} = 0b01;
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b01001;
@@ -17817,8 +18129,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17832,15 +18144,15 @@ def PS_storeriabs : HInst<
(outs),
(ins u30_2Imm:$Ii, IntRegs:$Rt32),
"memw(#$Ii) = $Rt32",
-tc_0371abea, TypeV2LDST>, Enc_541f26, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_541f26, AddrModeRel {
let Inst{24-21} = 0b0100;
let Inst{31-27} = 0b01001;
let addrMode = Absolute;
let accessSize = WordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let isPredicable = 1;
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
@@ -17854,7 +18166,7 @@ def PS_storerinewabs : HInst<
(outs),
(ins u30_2Imm:$Ii, IntRegs:$Nt8),
"memw(#$Ii) = $Nt8.new",
-tc_5bf126a6, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
+tc_6e20402a, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
let Inst{12-11} = 0b10;
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b01001;
@@ -17865,8 +18177,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let isPredicable = 1;
let DecoderNamespace = "MustExtend";
let isExtended = 1;
@@ -17876,11 +18188,22 @@ let opExtentBits = 18;
let opExtentAlign = 2;
let opNewValue = 1;
}
+def PS_trap1 : HInst<
+(outs),
+(ins u8_0Imm:$Ii),
+"trap1(#$Ii)",
+tc_53c851ab, TypeJ>, Enc_a51a9a, Requires<[HasPreV65]> {
+let Inst{1-0} = 0b00;
+let Inst{7-5} = 0b000;
+let Inst{13-13} = 0b0;
+let Inst{31-16} = 0b0101010010000000;
+let isSolo = 1;
+}
def S2_addasl_rrri : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32, u3_0Imm:$Ii),
"$Rd32 = addasl($Rt32,$Rs32,#$Ii)",
-tc_f675fee8, TypeS_3op>, Enc_47ef61 {
+tc_2c13e7f5, TypeS_3op>, Enc_47ef61 {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000100000;
let hasNewValue = 1;
@@ -17891,7 +18214,7 @@ def S2_allocframe : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, u11_3Imm:$Ii),
"allocframe($Rx32,#$Ii):raw",
-tc_b44ecf75, TypeST>, Enc_22c845 {
+tc_934753bb, TypeST>, Enc_22c845 {
let Inst{13-11} = 0b000;
let Inst{31-21} = 0b10100000100;
let hasNewValue = 1;
@@ -17907,7 +18230,7 @@ def S2_asl_i_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asl($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_5eac98 {
+tc_5da50c4b, TypeS_2op>, Enc_5eac98 {
let Inst{7-5} = 0b010;
let Inst{31-21} = 0b10000000000;
}
@@ -17915,7 +18238,7 @@ def S2_asl_i_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 += asl($Rss32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_70fb07 {
+tc_2c13e7f5, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b110;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -17925,7 +18248,7 @@ def S2_asl_i_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 &= asl($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b010;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -17935,7 +18258,7 @@ def S2_asl_i_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 -= asl($Rss32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_70fb07 {
+tc_2c13e7f5, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b010;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -17945,7 +18268,7 @@ def S2_asl_i_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 |= asl($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b110;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -17955,7 +18278,7 @@ def S2_asl_i_p_xacc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 ^= asl($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b010;
let Inst{31-21} = 0b10000010100;
let prefersSlot3 = 1;
@@ -17965,7 +18288,7 @@ def S2_asl_i_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = asl($Rs32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_a05677 {
+tc_5da50c4b, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100000;
@@ -17976,7 +18299,7 @@ def S2_asl_i_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 += asl($Rs32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_28a2dc {
+tc_2c13e7f5, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -17989,7 +18312,7 @@ def S2_asl_i_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 &= asl($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -18002,7 +18325,7 @@ def S2_asl_i_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 -= asl($Rs32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_28a2dc {
+tc_2c13e7f5, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -18015,7 +18338,7 @@ def S2_asl_i_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 |= asl($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -18028,7 +18351,7 @@ def S2_asl_i_r_sat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = asl($Rs32,#$Ii):sat",
-tc_779080bf, TypeS_2op>, Enc_a05677 {
+tc_8a825db2, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100010;
@@ -18041,7 +18364,7 @@ def S2_asl_i_r_xacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 ^= asl($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110100;
@@ -18054,7 +18377,7 @@ def S2_asl_i_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vaslh($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_12b6e9 {
+tc_5da50c4b, TypeS_2op>, Enc_12b6e9 {
let Inst{7-5} = 0b010;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10000000100;
@@ -18063,7 +18386,7 @@ def S2_asl_i_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Rdd32 = vaslw($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_7e5a82 {
+tc_5da50c4b, TypeS_2op>, Enc_7e5a82 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000000010;
@@ -18072,7 +18395,7 @@ def S2_asl_r_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = asl($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011100;
@@ -18081,7 +18404,7 @@ def S2_asl_r_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 += asl($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011110;
@@ -18092,7 +18415,7 @@ def S2_asl_r_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 &= asl($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011010;
@@ -18103,7 +18426,7 @@ def S2_asl_r_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 -= asl($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011100;
@@ -18114,7 +18437,7 @@ def S2_asl_r_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 |= asl($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011000;
@@ -18125,7 +18448,7 @@ def S2_asl_r_p_xor : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 ^= asl($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011011;
@@ -18136,7 +18459,7 @@ def S2_asl_r_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = asl($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110010;
@@ -18147,7 +18470,7 @@ def S2_asl_r_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += asl($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100110;
@@ -18160,7 +18483,7 @@ def S2_asl_r_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= asl($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100010;
@@ -18173,7 +18496,7 @@ def S2_asl_r_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= asl($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100100;
@@ -18186,7 +18509,7 @@ def S2_asl_r_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= asl($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100000;
@@ -18199,7 +18522,7 @@ def S2_asl_r_r_sat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = asl($Rs32,$Rt32):sat",
-tc_779080bf, TypeS_3op>, Enc_5ab2be {
+tc_8a825db2, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110000;
@@ -18212,7 +18535,7 @@ def S2_asl_r_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vaslh($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011010;
@@ -18221,7 +18544,7 @@ def S2_asl_r_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vaslw($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011000;
@@ -18230,7 +18553,7 @@ def S2_asr_i_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asr($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_5eac98 {
+tc_5da50c4b, TypeS_2op>, Enc_5eac98 {
let Inst{7-5} = 0b000;
let Inst{31-21} = 0b10000000000;
}
@@ -18238,7 +18561,7 @@ def S2_asr_i_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 += asr($Rss32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_70fb07 {
+tc_2c13e7f5, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b100;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -18248,7 +18571,7 @@ def S2_asr_i_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 &= asr($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b000;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -18258,7 +18581,7 @@ def S2_asr_i_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 -= asr($Rss32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_70fb07 {
+tc_2c13e7f5, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b000;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -18268,7 +18591,7 @@ def S2_asr_i_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 |= asr($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b100;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -18278,7 +18601,7 @@ def S2_asr_i_p_rnd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asr($Rss32,#$Ii):rnd",
-tc_002cb246, TypeS_2op>, Enc_5eac98 {
+tc_0dfac0a7, TypeS_2op>, Enc_5eac98 {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000000110;
let prefersSlot3 = 1;
@@ -18287,14 +18610,14 @@ def S2_asr_i_p_rnd_goodsyntax : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = asrrnd($Rss32,#$Ii)",
-tc_002cb246, TypeS_2op> {
+tc_0dfac0a7, TypeS_2op> {
let isPseudo = 1;
}
def S2_asr_i_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = asr($Rs32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_a05677 {
+tc_5da50c4b, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100000;
@@ -18305,7 +18628,7 @@ def S2_asr_i_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 += asr($Rs32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_28a2dc {
+tc_2c13e7f5, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -18318,7 +18641,7 @@ def S2_asr_i_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 &= asr($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -18331,7 +18654,7 @@ def S2_asr_i_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 -= asr($Rs32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_28a2dc {
+tc_2c13e7f5, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -18344,7 +18667,7 @@ def S2_asr_i_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 |= asr($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -18357,7 +18680,7 @@ def S2_asr_i_r_rnd : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = asr($Rs32,#$Ii):rnd",
-tc_002cb246, TypeS_2op>, Enc_a05677 {
+tc_0dfac0a7, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100010;
@@ -18369,7 +18692,7 @@ def S2_asr_i_r_rnd_goodsyntax : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = asrrnd($Rs32,#$Ii)",
-tc_002cb246, TypeS_2op> {
+tc_0dfac0a7, TypeS_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -18378,7 +18701,7 @@ def S2_asr_i_svw_trun : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Rd32 = vasrw($Rss32,#$Ii)",
-tc_4414d8b1, TypeS_2op>, Enc_8dec2e {
+tc_f34c1c21, TypeS_2op>, Enc_8dec2e {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001000110;
@@ -18390,7 +18713,7 @@ def S2_asr_i_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_12b6e9 {
+tc_5da50c4b, TypeS_2op>, Enc_12b6e9 {
let Inst{7-5} = 0b000;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10000000100;
@@ -18399,7 +18722,7 @@ def S2_asr_i_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Rdd32 = vasrw($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_7e5a82 {
+tc_5da50c4b, TypeS_2op>, Enc_7e5a82 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000000010;
@@ -18408,7 +18731,7 @@ def S2_asr_r_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = asr($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011100;
@@ -18417,7 +18740,7 @@ def S2_asr_r_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 += asr($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011110;
@@ -18428,7 +18751,7 @@ def S2_asr_r_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 &= asr($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011010;
@@ -18439,7 +18762,7 @@ def S2_asr_r_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 -= asr($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011100;
@@ -18450,7 +18773,7 @@ def S2_asr_r_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 |= asr($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011000;
@@ -18461,7 +18784,7 @@ def S2_asr_r_p_xor : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 ^= asr($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011011;
@@ -18472,7 +18795,7 @@ def S2_asr_r_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = asr($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110010;
@@ -18483,7 +18806,7 @@ def S2_asr_r_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += asr($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100110;
@@ -18496,7 +18819,7 @@ def S2_asr_r_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= asr($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100010;
@@ -18509,7 +18832,7 @@ def S2_asr_r_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= asr($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100100;
@@ -18522,7 +18845,7 @@ def S2_asr_r_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= asr($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100000;
@@ -18535,7 +18858,7 @@ def S2_asr_r_r_sat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = asr($Rs32,$Rt32):sat",
-tc_779080bf, TypeS_3op>, Enc_5ab2be {
+tc_8a825db2, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110000;
@@ -18548,7 +18871,7 @@ def S2_asr_r_svw_trun : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rd32 = vasrw($Rss32,$Rt32)",
-tc_4414d8b1, TypeS_3op>, Enc_3d5b28 {
+tc_f34c1c21, TypeS_3op>, Enc_3d5b28 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000101000;
@@ -18560,7 +18883,7 @@ def S2_asr_r_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vasrh($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011010;
@@ -18569,7 +18892,7 @@ def S2_asr_r_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vasrw($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011000;
@@ -18578,7 +18901,7 @@ def S2_brev : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = brev($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10001100010;
let hasNewValue = 1;
@@ -18589,7 +18912,7 @@ def S2_brevp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = brev($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_b9c5fb {
+tc_a7bdb22c, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000110;
let prefersSlot3 = 1;
@@ -18598,7 +18921,7 @@ def S2_cabacdecbin : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = decbin($Rss32,$Rtt32)",
-tc_76851da1, TypeS_3op>, Enc_a56825 {
+tc_db596beb, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001110;
@@ -18610,7 +18933,7 @@ def S2_cl0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = cl0($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10001100000;
let hasNewValue = 1;
@@ -18621,7 +18944,7 @@ def S2_cl0p : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = cl0($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_a7bdb22c, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10001000010;
let hasNewValue = 1;
@@ -18632,7 +18955,7 @@ def S2_cl1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = cl1($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10001100000;
let hasNewValue = 1;
@@ -18643,7 +18966,7 @@ def S2_cl1p : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = cl1($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_a7bdb22c, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001000010;
let hasNewValue = 1;
@@ -18654,7 +18977,7 @@ def S2_clb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = clb($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001100000;
let hasNewValue = 1;
@@ -18665,7 +18988,7 @@ def S2_clbnorm : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = normamt($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10001100000;
let hasNewValue = 1;
@@ -18676,7 +18999,7 @@ def S2_clbp : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = clb($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_a7bdb22c, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001000010;
let hasNewValue = 1;
@@ -18687,7 +19010,7 @@ def S2_clrbit_i : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = clrbit($Rs32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_a05677 {
+tc_5da50c4b, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100110;
@@ -18698,7 +19021,7 @@ def S2_clrbit_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = clrbit($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110100;
@@ -18709,7 +19032,7 @@ def S2_ct0 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = ct0($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001100010;
let hasNewValue = 1;
@@ -18720,7 +19043,7 @@ def S2_ct0p : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = ct0($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_a7bdb22c, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10001000111;
let hasNewValue = 1;
@@ -18731,7 +19054,7 @@ def S2_ct1 : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = ct1($Rs32)",
-tc_14b5c689, TypeS_2op>, Enc_5e2823 {
+tc_a7bdb22c, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10001100010;
let hasNewValue = 1;
@@ -18742,7 +19065,7 @@ def S2_ct1p : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = ct1($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_a7bdb22c, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001000111;
let hasNewValue = 1;
@@ -18753,7 +19076,7 @@ def S2_deinterleave : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = deinterleave($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_b9c5fb {
+tc_a7bdb22c, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000000110;
let prefersSlot3 = 1;
@@ -18762,7 +19085,7 @@ def S2_extractu : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
"$Rd32 = extractu($Rs32,#$Ii,#$II)",
-tc_f675fee8, TypeS_2op>, Enc_b388cf {
+tc_2c13e7f5, TypeS_2op>, Enc_b388cf {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b100011010;
let hasNewValue = 1;
@@ -18773,7 +19096,7 @@ def S2_extractu_rp : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"$Rd32 = extractu($Rs32,$Rtt32)",
-tc_002cb246, TypeS_3op>, Enc_e07374 {
+tc_a08b630b, TypeS_3op>, Enc_e07374 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001001000;
@@ -18785,7 +19108,7 @@ def S2_extractup : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
"$Rdd32 = extractu($Rss32,#$Ii,#$II)",
-tc_f675fee8, TypeS_2op>, Enc_b84c4c {
+tc_2c13e7f5, TypeS_2op>, Enc_b84c4c {
let Inst{31-24} = 0b10000001;
let prefersSlot3 = 1;
}
@@ -18793,7 +19116,7 @@ def S2_extractup_rp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = extractu($Rss32,$Rtt32)",
-tc_002cb246, TypeS_3op>, Enc_a56825 {
+tc_a08b630b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001000;
@@ -18803,7 +19126,7 @@ def S2_insert : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
"$Rx32 = insert($Rs32,#$Ii,#$II)",
-tc_bfec0f01, TypeS_2op>, Enc_a1e29d {
+tc_bb831a7c, TypeS_2op>, Enc_a1e29d {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b100011110;
let hasNewValue = 1;
@@ -18815,7 +19138,7 @@ def S2_insert_rp : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, DoubleRegs:$Rtt32),
"$Rx32 = insert($Rs32,$Rtt32)",
-tc_f429765c, TypeS_3op>, Enc_179b35 {
+tc_a4e22bbd, TypeS_3op>, Enc_179b35 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001000000;
@@ -18828,7 +19151,7 @@ def S2_insertp : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
"$Rxx32 = insert($Rss32,#$Ii,#$II)",
-tc_bfec0f01, TypeS_2op>, Enc_143a3c {
+tc_bb831a7c, TypeS_2op>, Enc_143a3c {
let Inst{31-24} = 0b10000011;
let prefersSlot3 = 1;
let Constraints = "$Rxx32 = $Rxx32in";
@@ -18837,7 +19160,7 @@ def S2_insertp_rp : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rxx32 = insert($Rss32,$Rtt32)",
-tc_f429765c, TypeS_3op>, Enc_88c16c {
+tc_a4e22bbd, TypeS_3op>, Enc_88c16c {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001010000;
@@ -18848,7 +19171,7 @@ def S2_interleave : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = interleave($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_b9c5fb {
+tc_a7bdb22c, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000000110;
let prefersSlot3 = 1;
@@ -18857,7 +19180,7 @@ def S2_lfsp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = lfs($Rss32,$Rtt32)",
-tc_002cb246, TypeS_3op>, Enc_a56825 {
+tc_a08b630b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -18867,7 +19190,7 @@ def S2_lsl_r_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = lsl($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011100;
@@ -18876,7 +19199,7 @@ def S2_lsl_r_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 += lsl($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011110;
@@ -18887,7 +19210,7 @@ def S2_lsl_r_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 &= lsl($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011010;
@@ -18898,7 +19221,7 @@ def S2_lsl_r_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 -= lsl($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011100;
@@ -18909,7 +19232,7 @@ def S2_lsl_r_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 |= lsl($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011000;
@@ -18920,7 +19243,7 @@ def S2_lsl_r_p_xor : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 ^= lsl($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011011;
@@ -18931,7 +19254,7 @@ def S2_lsl_r_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = lsl($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110010;
@@ -18942,7 +19265,7 @@ def S2_lsl_r_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += lsl($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100110;
@@ -18955,7 +19278,7 @@ def S2_lsl_r_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= lsl($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100010;
@@ -18968,7 +19291,7 @@ def S2_lsl_r_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= lsl($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100100;
@@ -18981,7 +19304,7 @@ def S2_lsl_r_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= lsl($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100000;
@@ -18994,7 +19317,7 @@ def S2_lsl_r_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vlslh($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011010;
@@ -19003,7 +19326,7 @@ def S2_lsl_r_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vlslw($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011000;
@@ -19012,7 +19335,7 @@ def S2_lsr_i_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = lsr($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_5eac98 {
+tc_5da50c4b, TypeS_2op>, Enc_5eac98 {
let Inst{7-5} = 0b001;
let Inst{31-21} = 0b10000000000;
}
@@ -19020,7 +19343,7 @@ def S2_lsr_i_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 += lsr($Rss32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_70fb07 {
+tc_2c13e7f5, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b101;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -19030,7 +19353,7 @@ def S2_lsr_i_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 &= lsr($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b001;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -19040,7 +19363,7 @@ def S2_lsr_i_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 -= lsr($Rss32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_70fb07 {
+tc_2c13e7f5, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b001;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -19050,7 +19373,7 @@ def S2_lsr_i_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 |= lsr($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b101;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -19060,7 +19383,7 @@ def S2_lsr_i_p_xacc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 ^= lsr($Rss32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_70fb07 {
+tc_a4e22bbd, TypeS_2op>, Enc_70fb07 {
let Inst{7-5} = 0b001;
let Inst{31-21} = 0b10000010100;
let prefersSlot3 = 1;
@@ -19070,7 +19393,7 @@ def S2_lsr_i_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = lsr($Rs32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_a05677 {
+tc_5da50c4b, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100000;
@@ -19081,7 +19404,7 @@ def S2_lsr_i_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 += lsr($Rs32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_28a2dc {
+tc_2c13e7f5, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -19094,7 +19417,7 @@ def S2_lsr_i_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 &= lsr($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -19107,7 +19430,7 @@ def S2_lsr_i_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 -= lsr($Rs32,#$Ii)",
-tc_f675fee8, TypeS_2op>, Enc_28a2dc {
+tc_2c13e7f5, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -19120,7 +19443,7 @@ def S2_lsr_i_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 |= lsr($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -19133,7 +19456,7 @@ def S2_lsr_i_r_xacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 ^= lsr($Rs32,#$Ii)",
-tc_f429765c, TypeS_2op>, Enc_28a2dc {
+tc_a4e22bbd, TypeS_2op>, Enc_28a2dc {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110100;
@@ -19146,7 +19469,7 @@ def S2_lsr_i_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vlsrh($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_12b6e9 {
+tc_5da50c4b, TypeS_2op>, Enc_12b6e9 {
let Inst{7-5} = 0b001;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10000000100;
@@ -19155,7 +19478,7 @@ def S2_lsr_i_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u5_0Imm:$Ii),
"$Rdd32 = vlsrw($Rss32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_7e5a82 {
+tc_5da50c4b, TypeS_2op>, Enc_7e5a82 {
let Inst{7-5} = 0b001;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000000010;
@@ -19164,7 +19487,7 @@ def S2_lsr_r_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = lsr($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011100;
@@ -19173,7 +19496,7 @@ def S2_lsr_r_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 += lsr($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011110;
@@ -19184,7 +19507,7 @@ def S2_lsr_r_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 &= lsr($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011010;
@@ -19195,7 +19518,7 @@ def S2_lsr_r_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 -= lsr($Rss32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_1aa186 {
+tc_2c13e7f5, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011100;
@@ -19206,7 +19529,7 @@ def S2_lsr_r_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 |= lsr($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011000;
@@ -19217,7 +19540,7 @@ def S2_lsr_r_p_xor : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 ^= lsr($Rss32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_1aa186 {
+tc_a4e22bbd, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001011011;
@@ -19228,7 +19551,7 @@ def S2_lsr_r_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = lsr($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110010;
@@ -19239,7 +19562,7 @@ def S2_lsr_r_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 += lsr($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100110;
@@ -19252,7 +19575,7 @@ def S2_lsr_r_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 &= lsr($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100010;
@@ -19265,7 +19588,7 @@ def S2_lsr_r_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 -= lsr($Rs32,$Rt32)",
-tc_f675fee8, TypeS_3op>, Enc_2ae154 {
+tc_2c13e7f5, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100100;
@@ -19278,7 +19601,7 @@ def S2_lsr_r_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, IntRegs:$Rt32),
"$Rx32 |= lsr($Rs32,$Rt32)",
-tc_f429765c, TypeS_3op>, Enc_2ae154 {
+tc_a4e22bbd, TypeS_3op>, Enc_2ae154 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001100000;
@@ -19291,7 +19614,7 @@ def S2_lsr_r_vh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vlsrh($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011010;
@@ -19300,7 +19623,7 @@ def S2_lsr_r_vw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vlsrw($Rss32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_927852 {
+tc_5da50c4b, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011000;
@@ -19309,7 +19632,7 @@ def S2_mask : HInst<
(outs IntRegs:$Rd32),
(ins u5_0Imm:$Ii, u5_0Imm:$II),
"$Rd32 = mask(#$Ii,#$II)",
-tc_9461ff31, TypeS_2op>, Enc_c85e2a, Requires<[HasV66]> {
+tc_1fcb8495, TypeS_2op>, Enc_c85e2a, Requires<[HasV66]> {
let Inst{13-13} = 0b1;
let Inst{20-16} = 0b00000;
let Inst{31-23} = 0b100011010;
@@ -19321,7 +19644,7 @@ def S2_packhl : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = packhl($Rs32,$Rt32)",
-tc_5a2711e5, TypeALU32_3op>, Enc_be32a5 {
+tc_713b66bf, TypeALU32_3op>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11110101100;
@@ -19331,7 +19654,7 @@ def S2_parityp : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rd32 = parity($Rss32,$Rtt32)",
-tc_002cb246, TypeALU64>, Enc_d2216a {
+tc_a08b630b, TypeALU64>, Enc_d2216a {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010000000;
@@ -19343,7 +19666,7 @@ def S2_pstorerbf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memb($Rs32+#$Ii) = $Rt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_da8d43, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000100000;
let isPredicated = 1;
@@ -19351,9 +19674,9 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -19365,7 +19688,7 @@ def S2_pstorerbf_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memb($Rx32++#$Ii) = $Rt32",
-tc_24b66c99, TypeST>, Enc_cc449f, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_cc449f, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19383,7 +19706,7 @@ def S2_pstorerbf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4) memb($Rs32) = $Rt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19391,7 +19714,7 @@ def S2_pstorerbfnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memb($Rx32++#$Ii) = $Rt32",
-tc_53559e35, TypeST>, Enc_cc449f, AddrModeRel {
+tc_449acf79, TypeST>, Enc_cc449f, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19410,7 +19733,7 @@ def S2_pstorerbnewf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memb($Rs32+#$Ii) = $Nt8.new",
-tc_8fb7ab1b, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_011e0e9d, TypeV2LDST>, Enc_585242, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b01000100101;
@@ -19422,9 +19745,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19436,7 +19759,7 @@ def S2_pstorerbnewf_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memb($Rx32++#$Ii) = $Nt8.new",
-tc_838b34ea, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_ce59038e, TypeST>, Enc_52a5dd, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b100;
@@ -19449,8 +19772,8 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerb_pi";
+let CextOpcode = "S2_storerb";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -19458,7 +19781,7 @@ def S2_pstorerbnewf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if (!$Pv4) memb($Rs32) = $Nt8.new",
-tc_8fb7ab1b, TypeMAPPING> {
+tc_011e0e9d, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -19467,7 +19790,7 @@ def S2_pstorerbnewfnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
-tc_d65dbf51, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_f529831b, TypeST>, Enc_52a5dd, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b100;
@@ -19481,8 +19804,8 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerb_pi";
+let CextOpcode = "S2_storerb";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -19490,7 +19813,7 @@ def S2_pstorerbnewt_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memb($Rs32+#$Ii) = $Nt8.new",
-tc_8fb7ab1b, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_011e0e9d, TypeV2LDST>, Enc_585242, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b01000000101;
@@ -19501,9 +19824,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19515,7 +19838,7 @@ def S2_pstorerbnewt_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memb($Rx32++#$Ii) = $Nt8.new",
-tc_838b34ea, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_ce59038e, TypeST>, Enc_52a5dd, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b100;
@@ -19527,8 +19850,8 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerb_pi";
+let CextOpcode = "S2_storerb";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -19536,7 +19859,7 @@ def S2_pstorerbnewt_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if ($Pv4) memb($Rs32) = $Nt8.new",
-tc_8fb7ab1b, TypeMAPPING> {
+tc_011e0e9d, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -19545,7 +19868,7 @@ def S2_pstorerbnewtnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memb($Rx32++#$Ii) = $Nt8.new",
-tc_d65dbf51, TypeST>, Enc_52a5dd, AddrModeRel {
+tc_f529831b, TypeST>, Enc_52a5dd, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b100;
@@ -19558,8 +19881,8 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerb_pi";
+let CextOpcode = "S2_storerb";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -19567,16 +19890,16 @@ def S2_pstorerbt_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memb($Rs32+#$Ii) = $Rt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_da8d43, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000000000;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -19588,7 +19911,7 @@ def S2_pstorerbt_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memb($Rx32++#$Ii) = $Rt32",
-tc_24b66c99, TypeST>, Enc_cc449f, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_cc449f, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19605,7 +19928,7 @@ def S2_pstorerbt_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4) memb($Rs32) = $Rt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19613,7 +19936,7 @@ def S2_pstorerbtnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memb($Rx32++#$Ii) = $Rt32",
-tc_53559e35, TypeST>, Enc_cc449f, AddrModeRel {
+tc_449acf79, TypeST>, Enc_cc449f, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19631,7 +19954,7 @@ def S2_pstorerdf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4) memd($Rs32+#$Ii) = $Rtt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_57a33e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000100110;
let isPredicated = 1;
@@ -19639,9 +19962,9 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_io";
let CextOpcode = "S2_storerd";
let InputType = "imm";
-let BaseOpcode = "S2_storerd_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19652,7 +19975,7 @@ def S2_pstorerdf_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4) memd($Rx32++#$Ii) = $Rtt32",
-tc_24b66c99, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_9a33d5, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19662,15 +19985,15 @@ let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = DoubleWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerd_pi";
+let CextOpcode = "S2_storerd";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerdf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
"if (!$Pv4) memd($Rs32) = $Rtt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19678,7 +20001,7 @@ def S2_pstorerdfnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
-tc_53559e35, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_449acf79, TypeST>, Enc_9a33d5, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19689,24 +20012,24 @@ let addrMode = PostInc;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerd_pi";
+let CextOpcode = "S2_storerd";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerdt_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4) memd($Rs32+#$Ii) = $Rtt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_57a33e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000000110;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_io";
let CextOpcode = "S2_storerd";
let InputType = "imm";
-let BaseOpcode = "S2_storerd_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19717,7 +20040,7 @@ def S2_pstorerdt_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4) memd($Rx32++#$Ii) = $Rtt32",
-tc_24b66c99, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_9a33d5, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19726,15 +20049,15 @@ let isPredicated = 1;
let addrMode = PostInc;
let accessSize = DoubleWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerd_pi";
+let CextOpcode = "S2_storerd";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerdt_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
"if ($Pv4) memd($Rs32) = $Rtt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19742,7 +20065,7 @@ def S2_pstorerdtnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4.new) memd($Rx32++#$Ii) = $Rtt32",
-tc_53559e35, TypeST>, Enc_9a33d5, AddrModeRel {
+tc_449acf79, TypeST>, Enc_9a33d5, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19752,15 +20075,15 @@ let addrMode = PostInc;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerd_pi";
+let CextOpcode = "S2_storerd";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerff_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32.h",
-tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000100011;
let isPredicated = 1;
@@ -19768,9 +20091,9 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerf_io";
let CextOpcode = "S2_storerf";
let InputType = "imm";
-let BaseOpcode = "S2_storerf_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19781,7 +20104,7 @@ def S2_pstorerff_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32.h",
-tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19791,15 +20114,15 @@ let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerf_pi";
+let CextOpcode = "S2_storerf";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerff_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4) memh($Rs32) = $Rt32.h",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19807,7 +20130,7 @@ def S2_pstorerffnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
-tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
+tc_449acf79, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19818,24 +20141,24 @@ let addrMode = PostInc;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerf_pi";
+let CextOpcode = "S2_storerf";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerft_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh($Rs32+#$Ii) = $Rt32.h",
-tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000000011;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerf_io";
let CextOpcode = "S2_storerf";
let InputType = "imm";
-let BaseOpcode = "S2_storerf_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19846,7 +20169,7 @@ def S2_pstorerft_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh($Rx32++#$Ii) = $Rt32.h",
-tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19855,15 +20178,15 @@ let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerf_pi";
+let CextOpcode = "S2_storerf";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerft_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4) memh($Rs32) = $Rt32.h",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19871,7 +20194,7 @@ def S2_pstorerftnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32.h",
-tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
+tc_449acf79, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19881,15 +20204,15 @@ let addrMode = PostInc;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerf_pi";
+let CextOpcode = "S2_storerf";
let Constraints = "$Rx32 = $Rx32in";
}
def S2_pstorerhf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh($Rs32+#$Ii) = $Rt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000100010;
let isPredicated = 1;
@@ -19897,9 +20220,9 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -19911,7 +20234,7 @@ def S2_pstorerhf_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh($Rx32++#$Ii) = $Rt32",
-tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -19929,7 +20252,7 @@ def S2_pstorerhf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4) memh($Rs32) = $Rt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -19937,7 +20260,7 @@ def S2_pstorerhfnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rx32++#$Ii) = $Rt32",
-tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
+tc_449acf79, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -19956,7 +20279,7 @@ def S2_pstorerhnewf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memh($Rs32+#$Ii) = $Nt8.new",
-tc_8fb7ab1b, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_011e0e9d, TypeV2LDST>, Enc_f44229, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b01000100101;
@@ -19968,9 +20291,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -19982,7 +20305,7 @@ def S2_pstorerhnewf_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memh($Rx32++#$Ii) = $Nt8.new",
-tc_838b34ea, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_ce59038e, TypeST>, Enc_31aa6a, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b101;
@@ -19995,8 +20318,8 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerh_pi";
+let CextOpcode = "S2_storerh";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20004,7 +20327,7 @@ def S2_pstorerhnewf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if (!$Pv4) memh($Rs32) = $Nt8.new",
-tc_8fb7ab1b, TypeMAPPING> {
+tc_011e0e9d, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -20013,7 +20336,7 @@ def S2_pstorerhnewfnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
-tc_d65dbf51, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_f529831b, TypeST>, Enc_31aa6a, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b101;
@@ -20027,8 +20350,8 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerh_pi";
+let CextOpcode = "S2_storerh";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20036,7 +20359,7 @@ def S2_pstorerhnewt_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memh($Rs32+#$Ii) = $Nt8.new",
-tc_8fb7ab1b, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_011e0e9d, TypeV2LDST>, Enc_f44229, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b01000000101;
@@ -20047,9 +20370,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -20061,7 +20384,7 @@ def S2_pstorerhnewt_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memh($Rx32++#$Ii) = $Nt8.new",
-tc_838b34ea, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_ce59038e, TypeST>, Enc_31aa6a, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b101;
@@ -20073,8 +20396,8 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerh_pi";
+let CextOpcode = "S2_storerh";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20082,7 +20405,7 @@ def S2_pstorerhnewt_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if ($Pv4) memh($Rs32) = $Nt8.new",
-tc_8fb7ab1b, TypeMAPPING> {
+tc_011e0e9d, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -20091,7 +20414,7 @@ def S2_pstorerhnewtnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memh($Rx32++#$Ii) = $Nt8.new",
-tc_d65dbf51, TypeST>, Enc_31aa6a, AddrModeRel {
+tc_f529831b, TypeST>, Enc_31aa6a, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b101;
@@ -20104,8 +20427,8 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerh_pi";
+let CextOpcode = "S2_storerh";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20113,16 +20436,16 @@ def S2_pstorerht_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh($Rs32+#$Ii) = $Rt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000000010;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -20134,7 +20457,7 @@ def S2_pstorerht_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh($Rx32++#$Ii) = $Rt32",
-tc_24b66c99, TypeST>, Enc_b886fd, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -20151,7 +20474,7 @@ def S2_pstorerht_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4) memh($Rs32) = $Rt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -20159,7 +20482,7 @@ def S2_pstorerhtnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rx32++#$Ii) = $Rt32",
-tc_53559e35, TypeST>, Enc_b886fd, AddrModeRel {
+tc_449acf79, TypeST>, Enc_b886fd, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -20177,7 +20500,7 @@ def S2_pstorerif_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memw($Rs32+#$Ii) = $Rt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_397f23, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000100100;
let isPredicated = 1;
@@ -20185,9 +20508,9 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -20199,7 +20522,7 @@ def S2_pstorerif_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memw($Rx32++#$Ii) = $Rt32",
-tc_24b66c99, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_7eaeb6, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -20217,7 +20540,7 @@ def S2_pstorerif_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4) memw($Rs32) = $Rt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -20225,7 +20548,7 @@ def S2_pstorerifnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memw($Rx32++#$Ii) = $Rt32",
-tc_53559e35, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_449acf79, TypeST>, Enc_7eaeb6, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -20236,8 +20559,8 @@ let addrMode = PostInc;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_pi";
+let CextOpcode = "S2_storeri";
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20245,7 +20568,7 @@ def S2_pstorerinewf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memw($Rs32+#$Ii) = $Nt8.new",
-tc_8fb7ab1b, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_011e0e9d, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b01000100101;
@@ -20257,9 +20580,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -20271,7 +20594,7 @@ def S2_pstorerinewf_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memw($Rx32++#$Ii) = $Nt8.new",
-tc_838b34ea, TypeST>, Enc_65f095, AddrModeRel {
+tc_ce59038e, TypeST>, Enc_65f095, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b110;
@@ -20284,8 +20607,8 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_pi";
+let CextOpcode = "S2_storeri";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20293,7 +20616,7 @@ def S2_pstorerinewf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if (!$Pv4) memw($Rs32) = $Nt8.new",
-tc_8fb7ab1b, TypeMAPPING> {
+tc_011e0e9d, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -20302,7 +20625,7 @@ def S2_pstorerinewfnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
-tc_d65dbf51, TypeST>, Enc_65f095, AddrModeRel {
+tc_f529831b, TypeST>, Enc_65f095, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b110;
@@ -20316,8 +20639,8 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_pi";
+let CextOpcode = "S2_storeri";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20325,7 +20648,7 @@ def S2_pstorerinewt_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memw($Rs32+#$Ii) = $Nt8.new",
-tc_8fb7ab1b, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_011e0e9d, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b01000000101;
@@ -20336,9 +20659,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -20350,7 +20673,7 @@ def S2_pstorerinewt_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memw($Rx32++#$Ii) = $Nt8.new",
-tc_838b34ea, TypeST>, Enc_65f095, AddrModeRel {
+tc_ce59038e, TypeST>, Enc_65f095, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b110;
@@ -20362,8 +20685,8 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_pi";
+let CextOpcode = "S2_storeri";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20371,7 +20694,7 @@ def S2_pstorerinewt_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if ($Pv4) memw($Rs32) = $Nt8.new",
-tc_8fb7ab1b, TypeMAPPING> {
+tc_011e0e9d, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -20380,7 +20703,7 @@ def S2_pstorerinewtnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memw($Rx32++#$Ii) = $Nt8.new",
-tc_d65dbf51, TypeST>, Enc_65f095, AddrModeRel {
+tc_f529831b, TypeST>, Enc_65f095, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b110;
@@ -20393,8 +20716,8 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_pi";
+let CextOpcode = "S2_storeri";
let opNewValue = 4;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20402,16 +20725,16 @@ def S2_pstorerit_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memw($Rs32+#$Ii) = $Rt32",
-tc_f8e23f0b, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_8035e91f, TypeV2LDST>, Enc_397f23, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000000100;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -20423,7 +20746,7 @@ def S2_pstorerit_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memw($Rx32++#$Ii) = $Rt32",
-tc_24b66c99, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_9edefe01, TypeST>, Enc_7eaeb6, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
@@ -20440,7 +20763,7 @@ def S2_pstorerit_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4) memw($Rs32) = $Rt32",
-tc_f8e23f0b, TypeMAPPING> {
+tc_8035e91f, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -20448,7 +20771,7 @@ def S2_pstoreritnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins PredRegs:$Pv4, IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memw($Rx32++#$Ii) = $Rt32",
-tc_53559e35, TypeST>, Enc_7eaeb6, AddrModeRel {
+tc_449acf79, TypeST>, Enc_7eaeb6, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -20466,7 +20789,7 @@ def S2_setbit_i : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = setbit($Rs32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_a05677 {
+tc_5da50c4b, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100110;
@@ -20477,7 +20800,7 @@ def S2_setbit_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = setbit($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110100;
@@ -20488,7 +20811,7 @@ def S2_shuffeb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = shuffeb($Rss32,$Rtt32)",
-tc_946df596, TypeS_3op>, Enc_a56825 {
+tc_5da50c4b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001000;
@@ -20497,7 +20820,7 @@ def S2_shuffeh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = shuffeh($Rss32,$Rtt32)",
-tc_946df596, TypeS_3op>, Enc_a56825 {
+tc_5da50c4b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001000;
@@ -20506,7 +20829,7 @@ def S2_shuffob : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = shuffob($Rtt32,$Rss32)",
-tc_946df596, TypeS_3op>, Enc_ea23e4 {
+tc_5da50c4b, TypeS_3op>, Enc_ea23e4 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001000;
@@ -20515,7 +20838,7 @@ def S2_shuffoh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32),
"$Rdd32 = shuffoh($Rtt32,$Rss32)",
-tc_946df596, TypeS_3op>, Enc_ea23e4 {
+tc_5da50c4b, TypeS_3op>, Enc_ea23e4 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -20524,15 +20847,15 @@ def S2_storerb_io : HInst<
(outs),
(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+#$Ii) = $Rt32",
-tc_30b9bb4a, TypeST>, Enc_448f7f, AddrModeRel, PostInc_BaseImm {
+tc_ae5babd7, TypeST>, Enc_448f7f, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1000;
let Inst{31-27} = 0b10100;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isPredicable = 1;
let isNVStorable = 1;
let isExtendable = 1;
@@ -20545,7 +20868,7 @@ def S2_storerb_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memb($Rx32++$Mu2:brev) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101111000;
let addrMode = PostInc;
@@ -20559,7 +20882,7 @@ def S2_storerb_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
"memb($Rx32++#$Ii:circ($Mu2)) = $Rt32",
-tc_e86aa961, TypeST>, Enc_b15941, AddrModeRel {
+tc_b4dc7630, TypeST>, Enc_b15941, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b10101001000;
@@ -20575,7 +20898,7 @@ def S2_storerb_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memb($Rx32++I:circ($Mu2)) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000010;
let Inst{31-21} = 0b10101001000;
let addrMode = PostInc;
@@ -20590,7 +20913,7 @@ def S2_storerb_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rx32++#$Ii) = $Rt32",
-tc_da97ee82, TypeST>, Enc_10bc21, AddrModeRel, PostInc_BaseImm {
+tc_a2b365d2, TypeST>, Enc_10bc21, AddrModeRel, PostInc_BaseImm {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
@@ -20598,8 +20921,8 @@ let Inst{31-21} = 0b10101011000;
let addrMode = PostInc;
let accessSize = ByteAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerb_pi";
+let CextOpcode = "S2_storerb";
let isPredicable = 1;
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
@@ -20608,12 +20931,13 @@ def S2_storerb_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memb($Rx32++$Mu2) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101101000;
let addrMode = PostInc;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_pr";
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20621,7 +20945,7 @@ def S2_storerb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memb($Rs32) = $Rt32",
-tc_30b9bb4a, TypeMAPPING> {
+tc_ae5babd7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -20629,7 +20953,7 @@ def S2_storerbgp : HInst<
(outs),
(ins u32_0Imm:$Ii, IntRegs:$Rt32),
"memb(gp+#$Ii) = $Rt32",
-tc_0371abea, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_1b64fb, AddrModeRel {
let Inst{24-21} = 0b0000;
let Inst{31-27} = 0b01001;
let accessSize = ByteAccess;
@@ -20647,7 +20971,7 @@ def S2_storerbnew_io : HInst<
(outs),
(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Nt8),
"memb($Rs32+#$Ii) = $Nt8.new",
-tc_be9602ff, TypeST>, Enc_4df4e9, AddrModeRel {
+tc_5deb5e47, TypeST>, Enc_4df4e9, AddrModeRel {
let Inst{12-11} = 0b00;
let Inst{24-21} = 0b1101;
let Inst{31-27} = 0b10100;
@@ -20657,9 +20981,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
@@ -20672,7 +20996,7 @@ def S2_storerbnew_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memb($Rx32++$Mu2:brev) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b10101111101;
@@ -20690,7 +21014,7 @@ def S2_storerbnew_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
"memb($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
-tc_d5c0729a, TypeST>, Enc_96ce4f, AddrModeRel {
+tc_addc37a8, TypeST>, Enc_96ce4f, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{12-11} = 0b00;
@@ -20710,7 +21034,7 @@ def S2_storerbnew_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memb($Rx32++I:circ($Mu2)) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000010;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b10101001101;
@@ -20729,7 +21053,7 @@ def S2_storerbnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, IntRegs:$Nt8),
"memb($Rx32++#$Ii) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_c7cd90, AddrModeRel {
+tc_92240447, TypeST>, Enc_c7cd90, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b000;
@@ -20750,7 +21074,7 @@ def S2_storerbnew_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memb($Rx32++$Mu2) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85 {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b10101101101;
@@ -20760,6 +21084,7 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_pr";
let opNewValue = 3;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20767,7 +21092,7 @@ def S2_storerbnew_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Nt8),
"memb($Rs32) = $Nt8.new",
-tc_be9602ff, TypeMAPPING> {
+tc_5deb5e47, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 1;
@@ -20776,7 +21101,7 @@ def S2_storerbnewgp : HInst<
(outs),
(ins u32_0Imm:$Ii, IntRegs:$Nt8),
"memb(gp+#$Ii) = $Nt8.new",
-tc_5bf126a6, TypeV2LDST>, Enc_ad1831, AddrModeRel {
+tc_6e20402a, TypeV2LDST>, Enc_ad1831, AddrModeRel {
let Inst{12-11} = 0b00;
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b01001;
@@ -20798,15 +21123,15 @@ def S2_storerd_io : HInst<
(outs),
(ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
"memd($Rs32+#$Ii) = $Rtt32",
-tc_30b9bb4a, TypeST>, Enc_ce6828, AddrModeRel, PostInc_BaseImm {
+tc_ae5babd7, TypeST>, Enc_ce6828, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1110;
let Inst{31-27} = 0b10100;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_io";
let CextOpcode = "S2_storerd";
let InputType = "imm";
-let BaseOpcode = "S2_storerd_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
@@ -20818,7 +21143,7 @@ def S2_storerd_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
"memd($Rx32++$Mu2:brev) = $Rtt32",
-tc_da97ee82, TypeST>, Enc_928ca1 {
+tc_a2b365d2, TypeST>, Enc_928ca1 {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101111110;
let addrMode = PostInc;
@@ -20830,7 +21155,7 @@ def S2_storerd_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_3Imm:$Ii, ModRegs:$Mu2, DoubleRegs:$Rtt32),
"memd($Rx32++#$Ii:circ($Mu2)) = $Rtt32",
-tc_e86aa961, TypeST>, Enc_395cc4 {
+tc_b4dc7630, TypeST>, Enc_395cc4 {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b10101001110;
@@ -20844,7 +21169,7 @@ def S2_storerd_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
"memd($Rx32++I:circ($Mu2)) = $Rtt32",
-tc_da97ee82, TypeST>, Enc_928ca1 {
+tc_a2b365d2, TypeST>, Enc_928ca1 {
let Inst{7-0} = 0b00000010;
let Inst{31-21} = 0b10101001110;
let addrMode = PostInc;
@@ -20857,7 +21182,7 @@ def S2_storerd_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_3Imm:$Ii, DoubleRegs:$Rtt32),
"memd($Rx32++#$Ii) = $Rtt32",
-tc_da97ee82, TypeST>, Enc_85bf58, AddrModeRel, PostInc_BaseImm {
+tc_a2b365d2, TypeST>, Enc_85bf58, AddrModeRel, PostInc_BaseImm {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
@@ -20865,8 +21190,8 @@ let Inst{31-21} = 0b10101011110;
let addrMode = PostInc;
let accessSize = DoubleWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerd_pi";
+let CextOpcode = "S2_storerd";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20874,7 +21199,7 @@ def S2_storerd_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, DoubleRegs:$Rtt32),
"memd($Rx32++$Mu2) = $Rtt32",
-tc_da97ee82, TypeST>, Enc_928ca1 {
+tc_a2b365d2, TypeST>, Enc_928ca1 {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101101110;
let addrMode = PostInc;
@@ -20886,7 +21211,7 @@ def S2_storerd_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"memd($Rs32) = $Rtt32",
-tc_30b9bb4a, TypeMAPPING> {
+tc_ae5babd7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -20894,7 +21219,7 @@ def S2_storerdgp : HInst<
(outs),
(ins u29_3Imm:$Ii, DoubleRegs:$Rtt32),
"memd(gp+#$Ii) = $Rtt32",
-tc_0371abea, TypeV2LDST>, Enc_5c124a, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_5c124a, AddrModeRel {
let Inst{24-21} = 0b0110;
let Inst{31-27} = 0b01001;
let accessSize = DoubleWordAccess;
@@ -20911,15 +21236,15 @@ def S2_storerf_io : HInst<
(outs),
(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) = $Rt32.h",
-tc_30b9bb4a, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
+tc_ae5babd7, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1011;
let Inst{31-27} = 0b10100;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerf_io";
let CextOpcode = "S2_storerf";
let InputType = "imm";
-let BaseOpcode = "S2_storerf_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
@@ -20931,7 +21256,7 @@ def S2_storerf_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++$Mu2:brev) = $Rt32.h",
-tc_da97ee82, TypeST>, Enc_d5c73f {
+tc_a2b365d2, TypeST>, Enc_d5c73f {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101111011;
let addrMode = PostInc;
@@ -20943,7 +21268,7 @@ def S2_storerf_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32.h",
-tc_e86aa961, TypeST>, Enc_935d9b {
+tc_b4dc7630, TypeST>, Enc_935d9b {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b10101001011;
@@ -20957,7 +21282,7 @@ def S2_storerf_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++I:circ($Mu2)) = $Rt32.h",
-tc_da97ee82, TypeST>, Enc_d5c73f {
+tc_a2b365d2, TypeST>, Enc_d5c73f {
let Inst{7-0} = 0b00000010;
let Inst{31-21} = 0b10101001011;
let addrMode = PostInc;
@@ -20970,7 +21295,7 @@ def S2_storerf_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rx32++#$Ii) = $Rt32.h",
-tc_da97ee82, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
+tc_a2b365d2, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
@@ -20978,8 +21303,8 @@ let Inst{31-21} = 0b10101011011;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerf_pi";
+let CextOpcode = "S2_storerf";
let isPredicable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -20987,7 +21312,7 @@ def S2_storerf_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++$Mu2) = $Rt32.h",
-tc_da97ee82, TypeST>, Enc_d5c73f {
+tc_a2b365d2, TypeST>, Enc_d5c73f {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101101011;
let addrMode = PostInc;
@@ -20999,7 +21324,7 @@ def S2_storerf_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memh($Rs32) = $Rt32.h",
-tc_30b9bb4a, TypeMAPPING> {
+tc_ae5babd7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -21007,7 +21332,7 @@ def S2_storerfgp : HInst<
(outs),
(ins u31_1Imm:$Ii, IntRegs:$Rt32),
"memh(gp+#$Ii) = $Rt32.h",
-tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_fda92c, AddrModeRel {
let Inst{24-21} = 0b0011;
let Inst{31-27} = 0b01001;
let accessSize = HalfWordAccess;
@@ -21024,15 +21349,15 @@ def S2_storerh_io : HInst<
(outs),
(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+#$Ii) = $Rt32",
-tc_30b9bb4a, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
+tc_ae5babd7, TypeST>, Enc_e957fb, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1010;
let Inst{31-27} = 0b10100;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isPredicable = 1;
let isNVStorable = 1;
let isExtendable = 1;
@@ -21045,7 +21370,7 @@ def S2_storerh_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++$Mu2:brev) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101111010;
let addrMode = PostInc;
@@ -21059,7 +21384,7 @@ def S2_storerh_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++#$Ii:circ($Mu2)) = $Rt32",
-tc_e86aa961, TypeST>, Enc_935d9b, AddrModeRel {
+tc_b4dc7630, TypeST>, Enc_935d9b, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b10101001010;
@@ -21075,7 +21400,7 @@ def S2_storerh_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++I:circ($Mu2)) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000010;
let Inst{31-21} = 0b10101001010;
let addrMode = PostInc;
@@ -21090,7 +21415,7 @@ def S2_storerh_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Rt32),
"memh($Rx32++#$Ii) = $Rt32",
-tc_da97ee82, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
+tc_a2b365d2, TypeST>, Enc_052c7d, AddrModeRel, PostInc_BaseImm {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
@@ -21098,8 +21423,8 @@ let Inst{31-21} = 0b10101011010;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerh_pi";
+let CextOpcode = "S2_storerh";
let isPredicable = 1;
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
@@ -21108,12 +21433,13 @@ def S2_storerh_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memh($Rx32++$Mu2) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101101010;
let addrMode = PostInc;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_pr";
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -21121,7 +21447,7 @@ def S2_storerh_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memh($Rs32) = $Rt32",
-tc_30b9bb4a, TypeMAPPING> {
+tc_ae5babd7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -21129,7 +21455,7 @@ def S2_storerhgp : HInst<
(outs),
(ins u31_1Imm:$Ii, IntRegs:$Rt32),
"memh(gp+#$Ii) = $Rt32",
-tc_0371abea, TypeV2LDST>, Enc_fda92c, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_fda92c, AddrModeRel {
let Inst{24-21} = 0b0010;
let Inst{31-27} = 0b01001;
let accessSize = HalfWordAccess;
@@ -21147,7 +21473,7 @@ def S2_storerhnew_io : HInst<
(outs),
(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Nt8),
"memh($Rs32+#$Ii) = $Nt8.new",
-tc_be9602ff, TypeST>, Enc_0d8870, AddrModeRel {
+tc_5deb5e47, TypeST>, Enc_0d8870, AddrModeRel {
let Inst{12-11} = 0b01;
let Inst{24-21} = 0b1101;
let Inst{31-27} = 0b10100;
@@ -21157,9 +21483,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
@@ -21172,7 +21498,7 @@ def S2_storerhnew_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memh($Rx32++$Mu2:brev) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b10101111101;
@@ -21190,7 +21516,7 @@ def S2_storerhnew_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
"memh($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
-tc_d5c0729a, TypeST>, Enc_91b9fe, AddrModeRel {
+tc_addc37a8, TypeST>, Enc_91b9fe, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{12-11} = 0b01;
@@ -21210,7 +21536,7 @@ def S2_storerhnew_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memh($Rx32++I:circ($Mu2)) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000010;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b10101001101;
@@ -21229,7 +21555,7 @@ def S2_storerhnew_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_1Imm:$Ii, IntRegs:$Nt8),
"memh($Rx32++#$Ii) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_e26546, AddrModeRel {
+tc_92240447, TypeST>, Enc_e26546, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b001;
@@ -21250,7 +21576,7 @@ def S2_storerhnew_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memh($Rx32++$Mu2) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85 {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b10101101101;
@@ -21260,6 +21586,7 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_pr";
let opNewValue = 3;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -21267,7 +21594,7 @@ def S2_storerhnew_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Nt8),
"memh($Rs32) = $Nt8.new",
-tc_be9602ff, TypeMAPPING> {
+tc_5deb5e47, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 1;
@@ -21276,7 +21603,7 @@ def S2_storerhnewgp : HInst<
(outs),
(ins u31_1Imm:$Ii, IntRegs:$Nt8),
"memh(gp+#$Ii) = $Nt8.new",
-tc_5bf126a6, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
+tc_6e20402a, TypeV2LDST>, Enc_bc03e5, AddrModeRel {
let Inst{12-11} = 0b01;
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b01001;
@@ -21298,15 +21625,15 @@ def S2_storeri_io : HInst<
(outs),
(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+#$Ii) = $Rt32",
-tc_30b9bb4a, TypeST>, Enc_143445, AddrModeRel, PostInc_BaseImm {
+tc_ae5babd7, TypeST>, Enc_143445, AddrModeRel, PostInc_BaseImm {
let Inst{24-21} = 0b1100;
let Inst{31-27} = 0b10100;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isPredicable = 1;
let isNVStorable = 1;
let isExtendable = 1;
@@ -21319,7 +21646,7 @@ def S2_storeri_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memw($Rx32++$Mu2:brev) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101111100;
let addrMode = PostInc;
@@ -21333,7 +21660,7 @@ def S2_storeri_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Rt32),
"memw($Rx32++#$Ii:circ($Mu2)) = $Rt32",
-tc_e86aa961, TypeST>, Enc_79b8c8, AddrModeRel {
+tc_b4dc7630, TypeST>, Enc_79b8c8, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b10101001100;
@@ -21349,7 +21676,7 @@ def S2_storeri_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memw($Rx32++I:circ($Mu2)) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f, AddrModeRel {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000010;
let Inst{31-21} = 0b10101001100;
let addrMode = PostInc;
@@ -21364,7 +21691,7 @@ def S2_storeri_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Rt32),
"memw($Rx32++#$Ii) = $Rt32",
-tc_da97ee82, TypeST>, Enc_db40cd, AddrModeRel, PostInc_BaseImm {
+tc_a2b365d2, TypeST>, Enc_db40cd, AddrModeRel, PostInc_BaseImm {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
@@ -21372,8 +21699,8 @@ let Inst{31-21} = 0b10101011100;
let addrMode = PostInc;
let accessSize = WordAccess;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_pi";
+let CextOpcode = "S2_storeri";
let isPredicable = 1;
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
@@ -21382,12 +21709,13 @@ def S2_storeri_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Rt32),
"memw($Rx32++$Mu2) = $Rt32",
-tc_da97ee82, TypeST>, Enc_d5c73f {
+tc_a2b365d2, TypeST>, Enc_d5c73f, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b10101101100;
let addrMode = PostInc;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_pr";
let isNVStorable = 1;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -21395,7 +21723,7 @@ def S2_storeri_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memw($Rs32) = $Rt32",
-tc_30b9bb4a, TypeMAPPING> {
+tc_ae5babd7, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -21403,7 +21731,7 @@ def S2_storerigp : HInst<
(outs),
(ins u30_2Imm:$Ii, IntRegs:$Rt32),
"memw(gp+#$Ii) = $Rt32",
-tc_0371abea, TypeV2LDST>, Enc_541f26, AddrModeRel {
+tc_0655b949, TypeV2LDST>, Enc_541f26, AddrModeRel {
let Inst{24-21} = 0b0100;
let Inst{31-27} = 0b01001;
let accessSize = WordAccess;
@@ -21421,7 +21749,7 @@ def S2_storerinew_io : HInst<
(outs),
(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Nt8),
"memw($Rs32+#$Ii) = $Nt8.new",
-tc_be9602ff, TypeST>, Enc_690862, AddrModeRel {
+tc_5deb5e47, TypeST>, Enc_690862, AddrModeRel {
let Inst{12-11} = 0b10;
let Inst{24-21} = 0b1101;
let Inst{31-27} = 0b10100;
@@ -21431,9 +21759,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 1;
@@ -21446,7 +21774,7 @@ def S2_storerinew_pbr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memw($Rx32++$Mu2:brev) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b10101111101;
@@ -21464,7 +21792,7 @@ def S2_storerinew_pci : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, ModRegs:$Mu2, IntRegs:$Nt8),
"memw($Rx32++#$Ii:circ($Mu2)) = $Nt8.new",
-tc_d5c0729a, TypeST>, Enc_3f97c8, AddrModeRel {
+tc_addc37a8, TypeST>, Enc_3f97c8, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{12-11} = 0b10;
@@ -21484,7 +21812,7 @@ def S2_storerinew_pcr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memw($Rx32++I:circ($Mu2)) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85, AddrModeRel {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000010;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b10101001101;
@@ -21503,7 +21831,7 @@ def S2_storerinew_pi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_2Imm:$Ii, IntRegs:$Nt8),
"memw($Rx32++#$Ii) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_223005, AddrModeRel {
+tc_92240447, TypeST>, Enc_223005, AddrModeRel {
let Inst{2-0} = 0b000;
let Inst{7-7} = 0b0;
let Inst{13-11} = 0b010;
@@ -21523,7 +21851,7 @@ def S2_storerinew_pr : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Nt8),
"memw($Rx32++$Mu2) = $Nt8.new",
-tc_c79a189f, TypeST>, Enc_8dbe85 {
+tc_92240447, TypeST>, Enc_8dbe85, AddrModeRel {
let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b10101101101;
@@ -21533,6 +21861,7 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_pr";
let opNewValue = 3;
let Constraints = "$Rx32 = $Rx32in";
}
@@ -21540,7 +21869,7 @@ def S2_storerinew_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Nt8),
"memw($Rs32) = $Nt8.new",
-tc_be9602ff, TypeMAPPING> {
+tc_5deb5e47, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 1;
@@ -21549,7 +21878,7 @@ def S2_storerinewgp : HInst<
(outs),
(ins u30_2Imm:$Ii, IntRegs:$Nt8),
"memw(gp+#$Ii) = $Nt8.new",
-tc_5bf126a6, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
+tc_6e20402a, TypeV2LDST>, Enc_78cbf0, AddrModeRel {
let Inst{12-11} = 0b10;
let Inst{24-21} = 0b0101;
let Inst{31-27} = 0b01001;
@@ -21571,7 +21900,7 @@ def S2_storew_locked : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"memw_locked($Rs32,$Pd4) = $Rt32",
-tc_5abb5e3f, TypeST>, Enc_c2b48e {
+tc_6f42bc60, TypeST>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100000101;
@@ -21584,7 +21913,7 @@ def S2_svsathb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = vsathb($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001100100;
let hasNewValue = 1;
@@ -21595,7 +21924,7 @@ def S2_svsathub : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = vsathub($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10001100100;
let hasNewValue = 1;
@@ -21606,7 +21935,7 @@ def S2_tableidxb : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
"$Rx32 = tableidxb($Rs32,#$Ii,#$II):raw",
-tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
+tc_bb831a7c, TypeS_2op>, Enc_cd82bc {
let Inst{31-22} = 0b1000011100;
let hasNewValue = 1;
let opNewValue = 0;
@@ -21617,7 +21946,7 @@ def S2_tableidxb_goodsyntax : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
"$Rx32 = tableidxb($Rs32,#$Ii,#$II)",
-tc_bfec0f01, TypeS_2op> {
+tc_bb831a7c, TypeS_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -21628,7 +21957,7 @@ def S2_tableidxd : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
"$Rx32 = tableidxd($Rs32,#$Ii,#$II):raw",
-tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
+tc_bb831a7c, TypeS_2op>, Enc_cd82bc {
let Inst{31-22} = 0b1000011111;
let hasNewValue = 1;
let opNewValue = 0;
@@ -21639,7 +21968,7 @@ def S2_tableidxd_goodsyntax : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
"$Rx32 = tableidxd($Rs32,#$Ii,#$II)",
-tc_bfec0f01, TypeS_2op> {
+tc_bb831a7c, TypeS_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -21649,7 +21978,7 @@ def S2_tableidxh : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
"$Rx32 = tableidxh($Rs32,#$Ii,#$II):raw",
-tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
+tc_bb831a7c, TypeS_2op>, Enc_cd82bc {
let Inst{31-22} = 0b1000011101;
let hasNewValue = 1;
let opNewValue = 0;
@@ -21660,7 +21989,7 @@ def S2_tableidxh_goodsyntax : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
"$Rx32 = tableidxh($Rs32,#$Ii,#$II)",
-tc_bfec0f01, TypeS_2op> {
+tc_bb831a7c, TypeS_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -21670,7 +21999,7 @@ def S2_tableidxw : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, s6_0Imm:$II),
"$Rx32 = tableidxw($Rs32,#$Ii,#$II):raw",
-tc_bfec0f01, TypeS_2op>, Enc_cd82bc {
+tc_bb831a7c, TypeS_2op>, Enc_cd82bc {
let Inst{31-22} = 0b1000011110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -21681,7 +22010,7 @@ def S2_tableidxw_goodsyntax : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u4_0Imm:$Ii, u5_0Imm:$II),
"$Rx32 = tableidxw($Rs32,#$Ii,#$II)",
-tc_bfec0f01, TypeS_2op> {
+tc_bb831a7c, TypeS_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -21691,7 +22020,7 @@ def S2_togglebit_i : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = togglebit($Rs32,#$Ii)",
-tc_946df596, TypeS_2op>, Enc_a05677 {
+tc_5da50c4b, TypeS_2op>, Enc_a05677 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100110;
@@ -21702,7 +22031,7 @@ def S2_togglebit_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = togglebit($Rs32,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_5ab2be {
+tc_5da50c4b, TypeS_3op>, Enc_5ab2be {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110100;
@@ -21713,7 +22042,7 @@ def S2_tstbit_i : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Pd4 = tstbit($Rs32,#$Ii)",
-tc_643b4717, TypeS_2op>, Enc_83ee64 {
+tc_a1297125, TypeS_2op>, Enc_83ee64 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000101000;
@@ -21722,7 +22051,7 @@ def S2_tstbit_r : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = tstbit($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111000;
@@ -21731,7 +22060,7 @@ def S2_valignib : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, u3_0Imm:$Ii),
"$Rdd32 = valignb($Rtt32,$Rss32,#$Ii)",
-tc_b4b5c03a, TypeS_3op>, Enc_729ff7 {
+tc_6fc5dbea, TypeS_3op>, Enc_729ff7 {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000000000;
}
@@ -21739,7 +22068,7 @@ def S2_valignrb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rtt32, DoubleRegs:$Rss32, PredRegs:$Pu4),
"$Rdd32 = valignb($Rtt32,$Rss32,$Pu4)",
-tc_b4b5c03a, TypeS_3op>, Enc_8c6530 {
+tc_6fc5dbea, TypeS_3op>, Enc_8c6530 {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000010000;
@@ -21748,7 +22077,7 @@ def S2_vcnegh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vcnegh($Rss32,$Rt32)",
-tc_779080bf, TypeS_3op>, Enc_927852 {
+tc_8a825db2, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011110;
@@ -21759,7 +22088,7 @@ def S2_vcrotate : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rdd32 = vcrotate($Rss32,$Rt32)",
-tc_002cb246, TypeS_3op>, Enc_927852 {
+tc_0dfac0a7, TypeS_3op>, Enc_927852 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000011110;
@@ -21770,7 +22099,7 @@ def S2_vrcnegh : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32),
"$Rxx32 += vrcnegh($Rss32,$Rt32)",
-tc_d773585a, TypeS_3op>, Enc_1aa186 {
+tc_7f8ae742, TypeS_3op>, Enc_1aa186 {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b11001011001;
@@ -21781,7 +22110,7 @@ def S2_vrndpackwh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vrndwh($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_e3d699e3, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -21792,7 +22121,7 @@ def S2_vrndpackwhs : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vrndwh($Rss32):sat",
-tc_cf8126ae, TypeS_2op>, Enc_90cd8b {
+tc_d61dfdc3, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -21804,7 +22133,7 @@ def S2_vsathb : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vsathb($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -21815,7 +22144,7 @@ def S2_vsathb_nopack : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vsathb($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
+tc_9f6cd987, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10000000000;
let Defs = [USR_OVF];
@@ -21824,7 +22153,7 @@ def S2_vsathub : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vsathub($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -21835,7 +22164,7 @@ def S2_vsathub_nopack : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vsathub($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
+tc_9f6cd987, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000000000;
let Defs = [USR_OVF];
@@ -21844,7 +22173,7 @@ def S2_vsatwh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vsatwh($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -21855,7 +22184,7 @@ def S2_vsatwh_nopack : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vsatwh($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
+tc_9f6cd987, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000000000;
let Defs = [USR_OVF];
@@ -21864,7 +22193,7 @@ def S2_vsatwuh : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vsatwuh($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10001000000;
let hasNewValue = 1;
@@ -21875,7 +22204,7 @@ def S2_vsatwuh_nopack : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32),
"$Rdd32 = vsatwuh($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_b9c5fb {
+tc_9f6cd987, TypeS_2op>, Enc_b9c5fb {
let Inst{13-5} = 0b000000101;
let Inst{31-21} = 0b10000000000;
let Defs = [USR_OVF];
@@ -21884,7 +22213,7 @@ def S2_vsplatrb : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32),
"$Rd32 = vsplatb($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_5e2823 {
+tc_9f6cd987, TypeS_2op>, Enc_5e2823 {
let Inst{13-5} = 0b000000111;
let Inst{31-21} = 0b10001100010;
let hasNewValue = 1;
@@ -21896,7 +22225,7 @@ def S2_vsplatrh : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vsplath($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
+tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100010;
let isReMaterializable = 1;
@@ -21906,7 +22235,7 @@ def S2_vspliceib : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, u3_0Imm:$Ii),
"$Rdd32 = vspliceb($Rss32,$Rtt32,#$Ii)",
-tc_b4b5c03a, TypeS_3op>, Enc_d50cd3 {
+tc_6fc5dbea, TypeS_3op>, Enc_d50cd3 {
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000000100;
}
@@ -21914,7 +22243,7 @@ def S2_vsplicerb : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32, PredRegs:$Pu4),
"$Rdd32 = vspliceb($Rss32,$Rtt32,$Pu4)",
-tc_b4b5c03a, TypeS_3op>, Enc_dbd70c {
+tc_6fc5dbea, TypeS_3op>, Enc_dbd70c {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000010100;
@@ -21923,7 +22252,7 @@ def S2_vsxtbh : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vsxtbh($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
+tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10000100000;
let isReMaterializable = 1;
@@ -21933,7 +22262,7 @@ def S2_vsxthw : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vsxthw($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
+tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100000;
let isReMaterializable = 1;
@@ -21943,7 +22272,7 @@ def S2_vtrunehb : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vtrunehb($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -21953,7 +22282,7 @@ def S2_vtrunewh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunewh($Rss32,$Rtt32)",
-tc_946df596, TypeS_3op>, Enc_a56825 {
+tc_5da50c4b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -21962,7 +22291,7 @@ def S2_vtrunohb : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = vtrunohb($Rss32)",
-tc_0ae0825c, TypeS_2op>, Enc_90cd8b {
+tc_9f6cd987, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001000100;
let hasNewValue = 1;
@@ -21972,7 +22301,7 @@ def S2_vtrunowh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunowh($Rss32,$Rtt32)",
-tc_946df596, TypeS_3op>, Enc_a56825 {
+tc_5da50c4b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -21981,7 +22310,7 @@ def S2_vzxtbh : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vzxtbh($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
+tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b10000100000;
let isReMaterializable = 1;
@@ -21991,7 +22320,7 @@ def S2_vzxthw : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vzxthw($Rs32)",
-tc_0ae0825c, TypeS_2op>, Enc_3a3d62 {
+tc_9f6cd987, TypeS_2op>, Enc_3a3d62 {
let Inst{13-5} = 0b000000110;
let Inst{31-21} = 0b10000100000;
let isReMaterializable = 1;
@@ -22001,7 +22330,7 @@ def S4_addaddi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Ru32, s32_0Imm:$Ii),
"$Rd32 = add($Rs32,add($Ru32,#$Ii))",
-tc_f675fee8, TypeALU64>, Enc_8b8d61 {
+tc_2c13e7f5, TypeALU64>, Enc_8b8d61, Requires<[UseCompound]> {
let Inst{31-23} = 0b110110110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -22016,7 +22345,7 @@ def S4_addi_asl_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = add(#$Ii,asl($Rx32in,#$II))",
-tc_f675fee8, TypeALU64>, Enc_c31910 {
+tc_2c13e7f5, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b100;
let Inst{4-4} = 0b0;
let Inst{31-24} = 0b11011110;
@@ -22034,7 +22363,7 @@ def S4_addi_lsr_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = add(#$Ii,lsr($Rx32in,#$II))",
-tc_f675fee8, TypeALU64>, Enc_c31910 {
+tc_2c13e7f5, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b100;
let Inst{4-4} = 0b1;
let Inst{31-24} = 0b11011110;
@@ -22052,7 +22381,7 @@ def S4_andi_asl_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = and(#$Ii,asl($Rx32in,#$II))",
-tc_f429765c, TypeALU64>, Enc_c31910 {
+tc_a4e22bbd, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b000;
let Inst{4-4} = 0b0;
let Inst{31-24} = 0b11011110;
@@ -22070,7 +22399,7 @@ def S4_andi_lsr_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = and(#$Ii,lsr($Rx32in,#$II))",
-tc_f429765c, TypeALU64>, Enc_c31910 {
+tc_a4e22bbd, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b000;
let Inst{4-4} = 0b1;
let Inst{31-24} = 0b11011110;
@@ -22088,7 +22417,7 @@ def S4_clbaddi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s6_0Imm:$Ii),
"$Rd32 = add(clb($Rs32),#$Ii)",
-tc_002cb246, TypeS_2op>, Enc_9fae8a {
+tc_a08b630b, TypeS_2op>, Enc_9fae8a {
let Inst{7-5} = 0b000;
let Inst{31-21} = 0b10001100001;
let hasNewValue = 1;
@@ -22099,7 +22428,7 @@ def S4_clbpaddi : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, s6_0Imm:$Ii),
"$Rd32 = add(clb($Rss32),#$Ii)",
-tc_002cb246, TypeS_2op>, Enc_a1640c {
+tc_a08b630b, TypeS_2op>, Enc_a1640c {
let Inst{7-5} = 0b010;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -22110,7 +22439,7 @@ def S4_clbpnorm : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = normamt($Rss32)",
-tc_14b5c689, TypeS_2op>, Enc_90cd8b {
+tc_a7bdb22c, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000000;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -22121,7 +22450,7 @@ def S4_extract : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii, u5_0Imm:$II),
"$Rd32 = extract($Rs32,#$Ii,#$II)",
-tc_f675fee8, TypeS_2op>, Enc_b388cf {
+tc_2c13e7f5, TypeS_2op>, Enc_b388cf {
let Inst{13-13} = 0b0;
let Inst{31-23} = 0b100011011;
let hasNewValue = 1;
@@ -22132,7 +22461,7 @@ def S4_extract_rp : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"$Rd32 = extract($Rs32,$Rtt32)",
-tc_002cb246, TypeS_3op>, Enc_e07374 {
+tc_a08b630b, TypeS_3op>, Enc_e07374 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11001001000;
@@ -22144,7 +22473,7 @@ def S4_extractp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii, u6_0Imm:$II),
"$Rdd32 = extract($Rss32,#$Ii,#$II)",
-tc_f675fee8, TypeS_2op>, Enc_b84c4c {
+tc_2c13e7f5, TypeS_2op>, Enc_b84c4c {
let Inst{31-24} = 0b10001010;
let prefersSlot3 = 1;
}
@@ -22152,7 +22481,7 @@ def S4_extractp_rp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = extract($Rss32,$Rtt32)",
-tc_002cb246, TypeS_3op>, Enc_a56825 {
+tc_a08b630b, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001110;
@@ -22162,7 +22491,7 @@ def S4_lsli : HInst<
(outs IntRegs:$Rd32),
(ins s6_0Imm:$Ii, IntRegs:$Rt32),
"$Rd32 = lsl(#$Ii,$Rt32)",
-tc_946df596, TypeS_3op>, Enc_fef969 {
+tc_5da50c4b, TypeS_3op>, Enc_fef969 {
let Inst{7-6} = 0b11;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000110100;
@@ -22173,7 +22502,7 @@ def S4_ntstbit_i : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Pd4 = !tstbit($Rs32,#$Ii)",
-tc_643b4717, TypeS_2op>, Enc_83ee64 {
+tc_a1297125, TypeS_2op>, Enc_83ee64 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10000101001;
@@ -22182,7 +22511,7 @@ def S4_ntstbit_r : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Pd4 = !tstbit($Rs32,$Rt32)",
-tc_85d5d03f, TypeS_3op>, Enc_c2b48e {
+tc_4a55d03c, TypeS_3op>, Enc_c2b48e {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000111001;
@@ -22191,7 +22520,7 @@ def S4_or_andi : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rx32 |= and($Rs32,#$Ii)",
-tc_f429765c, TypeALU64>, Enc_b0e9d8 {
+tc_a4e22bbd, TypeALU64>, Enc_b0e9d8 {
let Inst{31-22} = 0b1101101000;
let hasNewValue = 1;
let opNewValue = 0;
@@ -22208,7 +22537,7 @@ def S4_or_andix : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Ru32, IntRegs:$Rx32in, s32_0Imm:$Ii),
"$Rx32 = or($Ru32,and($Rx32in,#$Ii))",
-tc_f429765c, TypeALU64>, Enc_b4e6cf {
+tc_a4e22bbd, TypeALU64>, Enc_b4e6cf, Requires<[UseCompound]> {
let Inst{31-22} = 0b1101101001;
let hasNewValue = 1;
let opNewValue = 0;
@@ -22224,7 +22553,7 @@ def S4_or_ori : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, s32_0Imm:$Ii),
"$Rx32 |= or($Rs32,#$Ii)",
-tc_f429765c, TypeALU64>, Enc_b0e9d8 {
+tc_a4e22bbd, TypeALU64>, Enc_b0e9d8 {
let Inst{31-22} = 0b1101101010;
let hasNewValue = 1;
let opNewValue = 0;
@@ -22241,7 +22570,7 @@ def S4_ori_asl_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = or(#$Ii,asl($Rx32in,#$II))",
-tc_f429765c, TypeALU64>, Enc_c31910 {
+tc_a4e22bbd, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b010;
let Inst{4-4} = 0b0;
let Inst{31-24} = 0b11011110;
@@ -22259,7 +22588,7 @@ def S4_ori_lsr_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = or(#$Ii,lsr($Rx32in,#$II))",
-tc_f429765c, TypeALU64>, Enc_c31910 {
+tc_a4e22bbd, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b010;
let Inst{4-4} = 0b1;
let Inst{31-24} = 0b11011110;
@@ -22277,7 +22606,7 @@ def S4_parity : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = parity($Rs32,$Rt32)",
-tc_002cb246, TypeALU64>, Enc_5ab2be {
+tc_a08b630b, TypeALU64>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101111;
@@ -22289,7 +22618,7 @@ def S4_pstorerbf_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memb(#$Ii) = $Rt32",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -22300,8 +22629,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -22314,23 +22643,23 @@ def S4_pstorerbf_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110101000;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let isNVStorable = 1;
}
def S4_pstorerbfnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memb(#$Ii) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -22342,8 +22671,8 @@ let accessSize = ByteAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -22356,7 +22685,7 @@ def S4_pstorerbfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memb($Rs32+#$Ii) = $Rt32",
-tc_da97ee82, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_da8d43, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000110000;
let isPredicated = 1;
@@ -22365,9 +22694,9 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -22379,7 +22708,7 @@ def S4_pstorerbfnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110111000;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -22387,16 +22716,16 @@ let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let isNVStorable = 1;
}
def S4_pstorerbfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4.new) memb($Rs32) = $Rt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -22404,7 +22733,7 @@ def S4_pstorerbnewf_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memb(#$Ii) = $Nt8.new",
-tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
+tc_cfa0e29b, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b000;
@@ -22418,8 +22747,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22432,7 +22761,7 @@ def S4_pstorerbnewf_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_0a6c20ae, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b00;
let Inst{31-21} = 0b00110101101;
let isPredicated = 1;
@@ -22443,16 +22772,16 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let opNewValue = 4;
}
def S4_pstorerbnewfnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memb(#$Ii) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b100;
@@ -22467,8 +22796,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22481,7 +22810,7 @@ def S4_pstorerbnewfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
-tc_c79a189f, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_92240447, TypeV2LDST>, Enc_585242, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b01000110101;
@@ -22494,9 +22823,9 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -22508,7 +22837,7 @@ def S4_pstorerbnewfnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_829d8a86, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b00;
let Inst{31-21} = 0b00110111101;
let isPredicated = 1;
@@ -22520,16 +22849,16 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let opNewValue = 4;
}
def S4_pstorerbnewfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if (!$Pv4.new) memb($Rs32) = $Nt8.new",
-tc_c79a189f, TypeMAPPING> {
+tc_92240447, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -22538,7 +22867,7 @@ def S4_pstorerbnewt_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memb(#$Ii) = $Nt8.new",
-tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
+tc_cfa0e29b, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b000;
@@ -22551,8 +22880,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22565,7 +22894,7 @@ def S4_pstorerbnewt_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_0a6c20ae, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b00;
let Inst{31-21} = 0b00110100101;
let isPredicated = 1;
@@ -22575,16 +22904,16 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let opNewValue = 4;
}
def S4_pstorerbnewtnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memb(#$Ii) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b100;
@@ -22598,8 +22927,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22612,7 +22941,7 @@ def S4_pstorerbnewtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memb($Rs32+#$Ii) = $Nt8.new",
-tc_c79a189f, TypeV2LDST>, Enc_585242, AddrModeRel {
+tc_92240447, TypeV2LDST>, Enc_585242, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b01000010101;
@@ -22624,9 +22953,9 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -22638,7 +22967,7 @@ def S4_pstorerbnewtnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_829d8a86, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b00;
let Inst{31-21} = 0b00110110101;
let isPredicated = 1;
@@ -22649,16 +22978,16 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let opNewValue = 4;
}
def S4_pstorerbnewtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if ($Pv4.new) memb($Rs32) = $Nt8.new",
-tc_c79a189f, TypeMAPPING> {
+tc_92240447, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -22667,7 +22996,7 @@ def S4_pstorerbt_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memb(#$Ii) = $Rt32",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -22677,8 +23006,8 @@ let addrMode = Absolute;
let accessSize = ByteAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -22691,22 +23020,22 @@ def S4_pstorerbt_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110100000;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let isNVStorable = 1;
}
def S4_pstorerbtnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memb(#$Ii) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -22717,8 +23046,8 @@ let accessSize = ByteAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S2_storerbabs";
+let CextOpcode = "S2_storerb";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -22731,7 +23060,7 @@ def S4_pstorerbtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memb($Rs32+#$Ii) = $Rt32",
-tc_da97ee82, TypeV2LDST>, Enc_da8d43, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_da8d43, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000010000;
let isPredicated = 1;
@@ -22739,9 +23068,9 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S2_storerb_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -22753,23 +23082,23 @@ def S4_pstorerbtnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110110000;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let isNVStorable = 1;
}
def S4_pstorerbtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4.new) memb($Rs32) = $Rt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -22777,7 +23106,7 @@ def S4_pstorerdf_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4) memd(#$Ii) = $Rtt32",
-tc_362c6592, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_50b5ac, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -22788,8 +23117,8 @@ let addrMode = Absolute;
let accessSize = DoubleWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerdabs";
+let CextOpcode = "S2_storerd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22801,22 +23130,22 @@ def S4_pstorerdf_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_3962fa26, TypeST>, Enc_1a9974, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_1a9974, AddrModeRel {
let Inst{31-21} = 0b00110101110;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_rr";
let CextOpcode = "S2_storerd";
let InputType = "reg";
-let BaseOpcode = "S2_storerd_rr";
}
def S4_pstorerdfnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4.new) memd(#$Ii) = $Rtt32",
-tc_da4a37ed, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_50b5ac, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -22828,8 +23157,8 @@ let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerdabs";
+let CextOpcode = "S2_storerd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22841,7 +23170,7 @@ def S4_pstorerdfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
-tc_da97ee82, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_57a33e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000110110;
let isPredicated = 1;
@@ -22850,9 +23179,9 @@ let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_io";
let CextOpcode = "S2_storerd";
let InputType = "imm";
-let BaseOpcode = "S2_storerd_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -22863,7 +23192,7 @@ def S4_pstorerdfnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
"if (!$Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_40116ca8, TypeST>, Enc_1a9974, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_1a9974, AddrModeRel {
let Inst{31-21} = 0b00110111110;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -22871,15 +23200,15 @@ let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_rr";
let CextOpcode = "S2_storerd";
let InputType = "reg";
-let BaseOpcode = "S2_storerd_rr";
}
def S4_pstorerdfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
"if (!$Pv4.new) memd($Rs32) = $Rtt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -22887,7 +23216,7 @@ def S4_pstorerdt_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4) memd(#$Ii) = $Rtt32",
-tc_362c6592, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_50b5ac, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -22897,8 +23226,8 @@ let addrMode = Absolute;
let accessSize = DoubleWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerdabs";
+let CextOpcode = "S2_storerd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22910,21 +23239,21 @@ def S4_pstorerdt_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_3962fa26, TypeST>, Enc_1a9974, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_1a9974, AddrModeRel {
let Inst{31-21} = 0b00110100110;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_rr";
let CextOpcode = "S2_storerd";
let InputType = "reg";
-let BaseOpcode = "S2_storerd_rr";
}
def S4_pstorerdtnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4.new) memd(#$Ii) = $Rtt32",
-tc_da4a37ed, TypeST>, Enc_50b5ac, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_50b5ac, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -22935,8 +23264,8 @@ let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerd";
let BaseOpcode = "S2_storerdabs";
+let CextOpcode = "S2_storerd";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -22948,7 +23277,7 @@ def S4_pstorerdtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u29_3Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4.new) memd($Rs32+#$Ii) = $Rtt32",
-tc_da97ee82, TypeV2LDST>, Enc_57a33e, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_57a33e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000010110;
let isPredicated = 1;
@@ -22956,9 +23285,9 @@ let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_io";
let CextOpcode = "S2_storerd";
let InputType = "imm";
-let BaseOpcode = "S2_storerd_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -22969,22 +23298,22 @@ def S4_pstorerdtnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
"if ($Pv4.new) memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_40116ca8, TypeST>, Enc_1a9974, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_1a9974, AddrModeRel {
let Inst{31-21} = 0b00110110110;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_rr";
let CextOpcode = "S2_storerd";
let InputType = "reg";
-let BaseOpcode = "S2_storerd_rr";
}
def S4_pstorerdtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, DoubleRegs:$Rtt32),
"if ($Pv4.new) memd($Rs32) = $Rtt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -22992,7 +23321,7 @@ def S4_pstorerff_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh(#$Ii) = $Rt32.h",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -23003,8 +23332,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerfabs";
+let CextOpcode = "S2_storerf";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23016,22 +23345,22 @@ def S4_pstorerff_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110101011;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storerf_rr";
let CextOpcode = "S2_storerf";
let InputType = "reg";
-let BaseOpcode = "S4_storerf_rr";
}
def S4_pstorerffnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh(#$Ii) = $Rt32.h",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -23043,8 +23372,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerfabs";
+let CextOpcode = "S2_storerf";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23056,7 +23385,7 @@ def S4_pstorerffnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
-tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000110011;
let isPredicated = 1;
@@ -23065,9 +23394,9 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerf_io";
let CextOpcode = "S2_storerf";
let InputType = "imm";
-let BaseOpcode = "S2_storerf_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -23078,7 +23407,7 @@ def S4_pstorerffnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110111011;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -23086,15 +23415,15 @@ let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerf_rr";
let CextOpcode = "S2_storerf";
let InputType = "reg";
-let BaseOpcode = "S4_storerf_rr";
}
def S4_pstorerffnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rs32) = $Rt32.h",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -23102,7 +23431,7 @@ def S4_pstorerft_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh(#$Ii) = $Rt32.h",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -23112,8 +23441,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerfabs";
+let CextOpcode = "S2_storerf";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23125,21 +23454,21 @@ def S4_pstorerft_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110100011;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storerf_rr";
let CextOpcode = "S2_storerf";
let InputType = "reg";
-let BaseOpcode = "S4_storerf_rr";
}
def S4_pstorerftnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh(#$Ii) = $Rt32.h",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -23150,8 +23479,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerf";
let BaseOpcode = "S2_storerfabs";
+let CextOpcode = "S2_storerf";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23163,7 +23492,7 @@ def S4_pstorerftnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32.h",
-tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000010011;
let isPredicated = 1;
@@ -23171,9 +23500,9 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerf_io";
let CextOpcode = "S2_storerf";
let InputType = "imm";
-let BaseOpcode = "S2_storerf_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -23184,22 +23513,22 @@ def S4_pstorerftnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110110011;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerf_rr";
let CextOpcode = "S2_storerf";
let InputType = "reg";
-let BaseOpcode = "S4_storerf_rr";
}
def S4_pstorerftnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rs32) = $Rt32.h",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -23207,7 +23536,7 @@ def S4_pstorerhf_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh(#$Ii) = $Rt32",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -23218,8 +23547,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -23232,23 +23561,23 @@ def S4_pstorerhf_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110101010;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let isNVStorable = 1;
}
def S4_pstorerhfnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh(#$Ii) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -23260,8 +23589,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -23274,7 +23603,7 @@ def S4_pstorerhfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rs32+#$Ii) = $Rt32",
-tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000110010;
let isPredicated = 1;
@@ -23283,9 +23612,9 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -23297,7 +23626,7 @@ def S4_pstorerhfnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110111010;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -23305,16 +23634,16 @@ let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let isNVStorable = 1;
}
def S4_pstorerhfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4.new) memh($Rs32) = $Rt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -23322,7 +23651,7 @@ def S4_pstorerhnewf_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memh(#$Ii) = $Nt8.new",
-tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
+tc_cfa0e29b, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b001;
@@ -23336,8 +23665,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23350,7 +23679,7 @@ def S4_pstorerhnewf_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_0a6c20ae, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b01;
let Inst{31-21} = 0b00110101101;
let isPredicated = 1;
@@ -23361,16 +23690,16 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let opNewValue = 4;
}
def S4_pstorerhnewfnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memh(#$Ii) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b101;
@@ -23385,8 +23714,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23399,7 +23728,7 @@ def S4_pstorerhnewfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
-tc_c79a189f, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_92240447, TypeV2LDST>, Enc_f44229, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b01000110101;
@@ -23412,9 +23741,9 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -23426,7 +23755,7 @@ def S4_pstorerhnewfnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_829d8a86, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b01;
let Inst{31-21} = 0b00110111101;
let isPredicated = 1;
@@ -23438,16 +23767,16 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let opNewValue = 4;
}
def S4_pstorerhnewfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if (!$Pv4.new) memh($Rs32) = $Nt8.new",
-tc_c79a189f, TypeMAPPING> {
+tc_92240447, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -23456,7 +23785,7 @@ def S4_pstorerhnewt_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memh(#$Ii) = $Nt8.new",
-tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
+tc_cfa0e29b, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b001;
@@ -23469,8 +23798,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23483,7 +23812,7 @@ def S4_pstorerhnewt_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_0a6c20ae, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b01;
let Inst{31-21} = 0b00110100101;
let isPredicated = 1;
@@ -23493,16 +23822,16 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let opNewValue = 4;
}
def S4_pstorerhnewtnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memh(#$Ii) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b101;
@@ -23516,8 +23845,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23530,7 +23859,7 @@ def S4_pstorerhnewtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memh($Rs32+#$Ii) = $Nt8.new",
-tc_c79a189f, TypeV2LDST>, Enc_f44229, AddrModeRel {
+tc_92240447, TypeV2LDST>, Enc_f44229, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b01000010101;
@@ -23542,9 +23871,9 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -23556,7 +23885,7 @@ def S4_pstorerhnewtnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_829d8a86, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b01;
let Inst{31-21} = 0b00110110101;
let isPredicated = 1;
@@ -23567,16 +23896,16 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let opNewValue = 4;
}
def S4_pstorerhnewtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if ($Pv4.new) memh($Rs32) = $Nt8.new",
-tc_c79a189f, TypeMAPPING> {
+tc_92240447, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -23585,7 +23914,7 @@ def S4_pstorerht_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh(#$Ii) = $Rt32",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -23595,8 +23924,8 @@ let addrMode = Absolute;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -23609,22 +23938,22 @@ def S4_pstorerht_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110100010;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let isNVStorable = 1;
}
def S4_pstorerhtnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh(#$Ii) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -23635,8 +23964,8 @@ let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerhabs";
+let CextOpcode = "S2_storerh";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -23649,7 +23978,7 @@ def S4_pstorerhtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u31_1Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rs32+#$Ii) = $Rt32",
-tc_da97ee82, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_e8c45e, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000010010;
let isPredicated = 1;
@@ -23657,9 +23986,9 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -23671,23 +24000,23 @@ def S4_pstorerhtnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110110010;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let isNVStorable = 1;
}
def S4_pstorerhtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4.new) memh($Rs32) = $Rt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -23695,7 +24024,7 @@ def S4_pstorerif_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memw(#$Ii) = $Rt32",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -23706,8 +24035,8 @@ let addrMode = Absolute;
let accessSize = WordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -23720,23 +24049,23 @@ def S4_pstorerif_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110101100;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let isNVStorable = 1;
}
def S4_pstorerifnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memw(#$Ii) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -23748,8 +24077,8 @@ let accessSize = WordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -23762,7 +24091,7 @@ def S4_pstorerifnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memw($Rs32+#$Ii) = $Rt32",
-tc_da97ee82, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_397f23, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000110100;
let isPredicated = 1;
@@ -23771,9 +24100,9 @@ let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -23785,7 +24114,7 @@ def S4_pstorerifnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110111100;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -23793,16 +24122,16 @@ let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let isNVStorable = 1;
}
def S4_pstorerifnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if (!$Pv4.new) memw($Rs32) = $Rt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -23810,7 +24139,7 @@ def S4_pstorerinewf_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memw(#$Ii) = $Nt8.new",
-tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
+tc_cfa0e29b, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b010;
@@ -23824,8 +24153,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23838,7 +24167,7 @@ def S4_pstorerinewf_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_0a6c20ae, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b10;
let Inst{31-21} = 0b00110101101;
let isPredicated = 1;
@@ -23849,16 +24178,16 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let opNewValue = 4;
}
def S4_pstorerinewfnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memw(#$Ii) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b1;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b110;
@@ -23873,8 +24202,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23887,7 +24216,7 @@ def S4_pstorerinewfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
-tc_c79a189f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_92240447, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b01000110101;
@@ -23900,9 +24229,9 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -23914,7 +24243,7 @@ def S4_pstorerinewfnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if (!$Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_829d8a86, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b10;
let Inst{31-21} = 0b00110111101;
let isPredicated = 1;
@@ -23926,16 +24255,16 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let opNewValue = 4;
}
def S4_pstorerinewfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if (!$Pv4.new) memw($Rs32) = $Nt8.new",
-tc_c79a189f, TypeMAPPING> {
+tc_92240447, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -23944,7 +24273,7 @@ def S4_pstorerinewt_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memw(#$Ii) = $Nt8.new",
-tc_4b68bce4, TypeST>, Enc_44215c, AddrModeRel {
+tc_cfa0e29b, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b010;
@@ -23957,8 +24286,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -23971,7 +24300,7 @@ def S4_pstorerinewt_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_e95795ec, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_0a6c20ae, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b10;
let Inst{31-21} = 0b00110100101;
let isPredicated = 1;
@@ -23981,16 +24310,16 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let opNewValue = 4;
}
def S4_pstorerinewtnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memw(#$Ii) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_44215c, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_44215c, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-11} = 0b110;
@@ -24004,8 +24333,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 1;
@@ -24018,7 +24347,7 @@ def S4_pstorerinewtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memw($Rs32+#$Ii) = $Nt8.new",
-tc_c79a189f, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
+tc_92240447, TypeV2LDST>, Enc_8dbdfe, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b01000010101;
@@ -24030,9 +24359,9 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isExtendable = 1;
let opExtendable = 2;
let isExtentSigned = 0;
@@ -24044,7 +24373,7 @@ def S4_pstorerinewtnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_b90a29b1, TypeST>, Enc_47ee5e, AddrModeRel {
+tc_829d8a86, TypeST>, Enc_47ee5e, AddrModeRel {
let Inst{4-3} = 0b10;
let Inst{31-21} = 0b00110110101;
let isPredicated = 1;
@@ -24055,16 +24384,16 @@ let isPredicatedNew = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let opNewValue = 4;
}
def S4_pstorerinewtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Nt8),
"if ($Pv4.new) memw($Rs32) = $Nt8.new",
-tc_c79a189f, TypeMAPPING> {
+tc_92240447, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
let opNewValue = 2;
@@ -24073,7 +24402,7 @@ def S4_pstorerit_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memw(#$Ii) = $Rt32",
-tc_362c6592, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_ba9255a6, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b0;
@@ -24083,8 +24412,8 @@ let addrMode = Absolute;
let accessSize = WordAccess;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -24097,22 +24426,22 @@ def S4_pstorerit_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_3962fa26, TypeST>, Enc_6339d5, AddrModeRel {
+tc_1fe4ab69, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110100100;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let isNVStorable = 1;
}
def S4_pstoreritnew_abs : HInst<
(outs),
(ins PredRegs:$Pv4, u32_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memw(#$Ii) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_1cf4ca, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_1cf4ca, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
@@ -24123,8 +24452,8 @@ let accessSize = WordAccess;
let isPredicatedNew = 1;
let isExtended = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeriabs";
+let CextOpcode = "S2_storeri";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -24137,7 +24466,7 @@ def S4_pstoreritnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u30_2Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memw($Rs32+#$Ii) = $Rt32",
-tc_da97ee82, TypeV2LDST>, Enc_397f23, AddrModeRel {
+tc_a2b365d2, TypeV2LDST>, Enc_397f23, AddrModeRel {
let Inst{2-2} = 0b0;
let Inst{31-21} = 0b01000010100;
let isPredicated = 1;
@@ -24145,9 +24474,9 @@ let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_io";
let isNVStorable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -24159,23 +24488,23 @@ def S4_pstoreritnew_rr : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"if ($Pv4.new) memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_40116ca8, TypeST>, Enc_6339d5, AddrModeRel {
+tc_8e82e8ca, TypeST>, Enc_6339d5, AddrModeRel {
let Inst{31-21} = 0b00110110100;
let isPredicated = 1;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let isNVStorable = 1;
}
def S4_pstoreritnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, IntRegs:$Rt32),
"if ($Pv4.new) memw($Rs32) = $Rt32",
-tc_da97ee82, TypeMAPPING> {
+tc_a2b365d2, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24183,7 +24512,7 @@ def S4_stored_locked : HInst<
(outs PredRegs:$Pd4),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"memd_locked($Rs32,$Pd4) = $Rtt32",
-tc_5abb5e3f, TypeST>, Enc_d7dc10 {
+tc_6f42bc60, TypeST>, Enc_d7dc10 {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100000111;
@@ -24196,14 +24525,14 @@ def S4_storeirb_io : HInst<
(outs),
(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
"memb($Rs32+#$Ii) = #$II",
-tc_b83e6d73, TypeST>, Enc_8203bb, PredNewRel {
+tc_7c31e19a, TypeST>, Enc_8203bb, PredNewRel {
let Inst{31-21} = 0b00111100000;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeirb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S4_storeirb_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -24215,7 +24544,7 @@ def S4_storeirb_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, s8_0Imm:$II),
"memb($Rs32) = #$II",
-tc_b83e6d73, TypeMAPPING> {
+tc_7c31e19a, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24223,16 +24552,16 @@ def S4_storeirbf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
"if (!$Pv4) memb($Rs32+#$Ii) = #$II",
-tc_0b2be201, TypeST>, Enc_d7a65e, PredNewRel {
+tc_d03278fd, TypeST>, Enc_d7a65e, PredNewRel {
let Inst{31-21} = 0b00111000100;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeirb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S4_storeirb_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24243,7 +24572,7 @@ def S4_storeirbf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if (!$Pv4) memb($Rs32) = #$II",
-tc_0b2be201, TypeMAPPING> {
+tc_d03278fd, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24251,7 +24580,7 @@ def S4_storeirbfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
"if (!$Pv4.new) memb($Rs32+#$Ii) = #$II",
-tc_c4f596e3, TypeST>, Enc_d7a65e, PredNewRel {
+tc_65cbd974, TypeST>, Enc_d7a65e, PredNewRel {
let Inst{31-21} = 0b00111001100;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -24259,9 +24588,9 @@ let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storeirb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S4_storeirb_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24272,7 +24601,7 @@ def S4_storeirbfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if (!$Pv4.new) memb($Rs32) = #$II",
-tc_c4f596e3, TypeMAPPING> {
+tc_65cbd974, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24280,15 +24609,15 @@ def S4_storeirbt_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
"if ($Pv4) memb($Rs32+#$Ii) = #$II",
-tc_0b2be201, TypeST>, Enc_d7a65e, PredNewRel {
+tc_d03278fd, TypeST>, Enc_d7a65e, PredNewRel {
let Inst{31-21} = 0b00111000000;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeirb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S4_storeirb_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24299,7 +24628,7 @@ def S4_storeirbt_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if ($Pv4) memb($Rs32) = #$II",
-tc_0b2be201, TypeMAPPING> {
+tc_d03278fd, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24307,16 +24636,16 @@ def S4_storeirbtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
"if ($Pv4.new) memb($Rs32+#$Ii) = #$II",
-tc_c4f596e3, TypeST>, Enc_d7a65e, PredNewRel {
+tc_65cbd974, TypeST>, Enc_d7a65e, PredNewRel {
let Inst{31-21} = 0b00111001000;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storeirb_io";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S4_storeirb_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24327,7 +24656,7 @@ def S4_storeirbtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if ($Pv4.new) memb($Rs32) = #$II",
-tc_c4f596e3, TypeMAPPING> {
+tc_65cbd974, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24335,14 +24664,14 @@ def S4_storeirh_io : HInst<
(outs),
(ins IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
"memh($Rs32+#$Ii) = #$II",
-tc_b83e6d73, TypeST>, Enc_a803e0, PredNewRel {
+tc_7c31e19a, TypeST>, Enc_a803e0, PredNewRel {
let Inst{31-21} = 0b00111100001;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeirh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S4_storeirh_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -24354,7 +24683,7 @@ def S4_storeirh_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, s8_0Imm:$II),
"memh($Rs32) = #$II",
-tc_b83e6d73, TypeMAPPING> {
+tc_7c31e19a, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24362,16 +24691,16 @@ def S4_storeirhf_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
"if (!$Pv4) memh($Rs32+#$Ii) = #$II",
-tc_0b2be201, TypeST>, Enc_f20719, PredNewRel {
+tc_d03278fd, TypeST>, Enc_f20719, PredNewRel {
let Inst{31-21} = 0b00111000101;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeirh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S4_storeirh_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24382,7 +24711,7 @@ def S4_storeirhf_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if (!$Pv4) memh($Rs32) = #$II",
-tc_0b2be201, TypeMAPPING> {
+tc_d03278fd, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24390,7 +24719,7 @@ def S4_storeirhfnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
"if (!$Pv4.new) memh($Rs32+#$Ii) = #$II",
-tc_c4f596e3, TypeST>, Enc_f20719, PredNewRel {
+tc_65cbd974, TypeST>, Enc_f20719, PredNewRel {
let Inst{31-21} = 0b00111001101;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -24398,9 +24727,9 @@ let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storeirh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S4_storeirh_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24411,7 +24740,7 @@ def S4_storeirhfnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if (!$Pv4.new) memh($Rs32) = #$II",
-tc_c4f596e3, TypeMAPPING> {
+tc_65cbd974, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24419,15 +24748,15 @@ def S4_storeirht_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
"if ($Pv4) memh($Rs32+#$Ii) = #$II",
-tc_0b2be201, TypeST>, Enc_f20719, PredNewRel {
+tc_d03278fd, TypeST>, Enc_f20719, PredNewRel {
let Inst{31-21} = 0b00111000001;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeirh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S4_storeirh_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24438,7 +24767,7 @@ def S4_storeirht_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if ($Pv4) memh($Rs32) = #$II",
-tc_0b2be201, TypeMAPPING> {
+tc_d03278fd, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24446,16 +24775,16 @@ def S4_storeirhtnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_1Imm:$Ii, s32_0Imm:$II),
"if ($Pv4.new) memh($Rs32+#$Ii) = #$II",
-tc_c4f596e3, TypeST>, Enc_f20719, PredNewRel {
+tc_65cbd974, TypeST>, Enc_f20719, PredNewRel {
let Inst{31-21} = 0b00111001001;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storeirh_io";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S4_storeirh_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24466,7 +24795,7 @@ def S4_storeirhtnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if ($Pv4.new) memh($Rs32) = #$II",
-tc_c4f596e3, TypeMAPPING> {
+tc_65cbd974, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24474,14 +24803,14 @@ def S4_storeiri_io : HInst<
(outs),
(ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
"memw($Rs32+#$Ii) = #$II",
-tc_b83e6d73, TypeST>, Enc_f37377, PredNewRel {
+tc_7c31e19a, TypeST>, Enc_f37377, PredNewRel {
let Inst{31-21} = 0b00111100010;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeiri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S4_storeiri_io";
let isPredicable = 1;
let isExtendable = 1;
let opExtendable = 2;
@@ -24493,7 +24822,7 @@ def S4_storeiri_zomap : HInst<
(outs),
(ins IntRegs:$Rs32, s8_0Imm:$II),
"memw($Rs32) = #$II",
-tc_b83e6d73, TypeMAPPING> {
+tc_7c31e19a, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24501,16 +24830,16 @@ def S4_storeirif_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
"if (!$Pv4) memw($Rs32+#$Ii) = #$II",
-tc_0b2be201, TypeST>, Enc_5ccba9, PredNewRel {
+tc_d03278fd, TypeST>, Enc_5ccba9, PredNewRel {
let Inst{31-21} = 0b00111000110;
let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeiri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S4_storeiri_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24521,7 +24850,7 @@ def S4_storeirif_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if (!$Pv4) memw($Rs32) = #$II",
-tc_0b2be201, TypeMAPPING> {
+tc_d03278fd, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24529,7 +24858,7 @@ def S4_storeirifnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
"if (!$Pv4.new) memw($Rs32+#$Ii) = #$II",
-tc_c4f596e3, TypeST>, Enc_5ccba9, PredNewRel {
+tc_65cbd974, TypeST>, Enc_5ccba9, PredNewRel {
let Inst{31-21} = 0b00111001110;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -24537,9 +24866,9 @@ let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storeiri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S4_storeiri_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24550,7 +24879,7 @@ def S4_storeirifnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if (!$Pv4.new) memw($Rs32) = #$II",
-tc_c4f596e3, TypeMAPPING> {
+tc_65cbd974, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24558,15 +24887,15 @@ def S4_storeirit_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
"if ($Pv4) memw($Rs32+#$Ii) = #$II",
-tc_0b2be201, TypeST>, Enc_5ccba9, PredNewRel {
+tc_d03278fd, TypeST>, Enc_5ccba9, PredNewRel {
let Inst{31-21} = 0b00111000010;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storeiri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S4_storeiri_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24577,7 +24906,7 @@ def S4_storeirit_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if ($Pv4) memw($Rs32) = #$II",
-tc_0b2be201, TypeMAPPING> {
+tc_d03278fd, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24585,16 +24914,16 @@ def S4_storeiritnew_io : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
"if ($Pv4.new) memw($Rs32+#$Ii) = #$II",
-tc_c4f596e3, TypeST>, Enc_5ccba9, PredNewRel {
+tc_65cbd974, TypeST>, Enc_5ccba9, PredNewRel {
let Inst{31-21} = 0b00111001010;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
let isPredicatedNew = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storeiri_io";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S4_storeiri_io";
let isExtendable = 1;
let opExtendable = 3;
let isExtentSigned = 1;
@@ -24605,7 +24934,7 @@ def S4_storeiritnew_zomap : HInst<
(outs),
(ins PredRegs:$Pv4, IntRegs:$Rs32, s6_0Imm:$II),
"if ($Pv4.new) memw($Rs32) = #$II",
-tc_c4f596e3, TypeMAPPING> {
+tc_65cbd974, TypeMAPPING> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -24613,7 +24942,7 @@ def S4_storerb_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Rt32),
"memb($Re32=#$II) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_8bcba4, AddrModeRel {
let Inst{7-6} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10101011000;
@@ -24634,15 +24963,15 @@ def S4_storerb_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"memb($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_280f7fe1, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111011000;
let addrMode = BaseRegOffset;
let accessSize = ByteAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let isNVStorable = 1;
let isPredicable = 1;
}
@@ -24650,16 +24979,16 @@ def S4_storerb_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
"memb($Ru32<<#$Ii+#$II) = $Rt32",
-tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_887d1bb7, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
let Inst{7-7} = 0b1;
let Inst{31-21} = 0b10101101000;
let addrMode = BaseLongOffset;
let accessSize = ByteAccess;
let isExtended = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_ur";
let CextOpcode = "S2_storerb";
let InputType = "imm";
-let BaseOpcode = "S4_storerb_ur";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -24672,7 +25001,7 @@ def S4_storerbnew_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Nt8),
"memb($Re32=#$II) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_724154, AddrModeRel {
let Inst{7-6} = 0b10;
let Inst{13-11} = 0b000;
let Inst{31-21} = 0b10101011101;
@@ -24696,7 +25025,7 @@ def S4_storerbnew_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"memb($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel {
+tc_96ef76ef, TypeST>, Enc_c6220b, AddrModeRel {
let Inst{6-3} = 0b0000;
let Inst{31-21} = 0b00111011101;
let addrMode = BaseRegOffset;
@@ -24705,9 +25034,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerb_rr";
let CextOpcode = "S2_storerb";
let InputType = "reg";
-let BaseOpcode = "S4_storerb_rr";
let isPredicable = 1;
let opNewValue = 3;
}
@@ -24715,7 +25044,7 @@ def S4_storerbnew_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
"memb($Ru32<<#$Ii+#$II) = $Nt8.new",
-tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel {
+tc_55a9a350, TypeST>, Enc_7eb485, AddrModeRel {
let Inst{7-7} = 0b1;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b10101101101;
@@ -24726,8 +25055,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerb";
let BaseOpcode = "S4_storerb_ur";
+let CextOpcode = "S2_storerb";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -24740,7 +25069,7 @@ def S4_storerd_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, DoubleRegs:$Rtt32),
"memd($Re32=#$II) = $Rtt32",
-tc_da4a37ed, TypeST>, Enc_c7a204 {
+tc_bb07f2c5, TypeST>, Enc_c7a204 {
let Inst{7-6} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10101011110;
@@ -24760,31 +25089,31 @@ def S4_storerd_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, DoubleRegs:$Rtt32),
"memd($Rs32+$Ru32<<#$Ii) = $Rtt32",
-tc_5aee39f7, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl {
+tc_280f7fe1, TypeST>, Enc_55355c, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111011110;
let addrMode = BaseRegOffset;
let accessSize = DoubleWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_rr";
let CextOpcode = "S2_storerd";
let InputType = "reg";
-let BaseOpcode = "S2_storerd_rr";
let isPredicable = 1;
}
def S4_storerd_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, DoubleRegs:$Rtt32),
"memd($Ru32<<#$Ii+#$II) = $Rtt32",
-tc_14b272fa, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl {
+tc_887d1bb7, TypeST>, Enc_f79415, AddrModeRel, ImmRegShl {
let Inst{7-7} = 0b1;
let Inst{31-21} = 0b10101101110;
let addrMode = BaseLongOffset;
let accessSize = DoubleWordAccess;
let isExtended = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerd_ur";
let CextOpcode = "S2_storerd";
let InputType = "imm";
-let BaseOpcode = "S2_storerd_ur";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -24796,7 +25125,7 @@ def S4_storerf_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Rt32),
"memh($Re32=#$II) = $Rt32.h",
-tc_da4a37ed, TypeST>, Enc_8bcba4 {
+tc_bb07f2c5, TypeST>, Enc_8bcba4 {
let Inst{7-6} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10101011011;
@@ -24816,31 +25145,31 @@ def S4_storerf_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+$Ru32<<#$Ii) = $Rt32.h",
-tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_280f7fe1, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111011011;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S4_storerf_rr";
let CextOpcode = "S2_storerf";
let InputType = "reg";
-let BaseOpcode = "S4_storerf_rr";
let isPredicable = 1;
}
def S4_storerf_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
"memh($Ru32<<#$Ii+#$II) = $Rt32.h",
-tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_887d1bb7, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
let Inst{7-7} = 0b1;
let Inst{31-21} = 0b10101101011;
let addrMode = BaseLongOffset;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
+let BaseOpcode = "S4_storerf_rr";
let CextOpcode = "S2_storerf";
let InputType = "imm";
-let BaseOpcode = "S4_storerf_rr";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -24852,7 +25181,7 @@ def S4_storerh_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Rt32),
"memh($Re32=#$II) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_8bcba4, AddrModeRel {
let Inst{7-6} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10101011010;
@@ -24873,15 +25202,15 @@ def S4_storerh_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"memh($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_280f7fe1, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111011010;
let addrMode = BaseRegOffset;
let accessSize = HalfWordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let isNVStorable = 1;
let isPredicable = 1;
}
@@ -24889,16 +25218,16 @@ def S4_storerh_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
"memh($Ru32<<#$Ii+#$II) = $Rt32",
-tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_887d1bb7, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
let Inst{7-7} = 0b1;
let Inst{31-21} = 0b10101101010;
let addrMode = BaseLongOffset;
let accessSize = HalfWordAccess;
let isExtended = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_ur";
let CextOpcode = "S2_storerh";
let InputType = "imm";
-let BaseOpcode = "S2_storerh_ur";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -24911,7 +25240,7 @@ def S4_storerhnew_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Nt8),
"memh($Re32=#$II) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_724154, AddrModeRel {
let Inst{7-6} = 0b10;
let Inst{13-11} = 0b001;
let Inst{31-21} = 0b10101011101;
@@ -24935,7 +25264,7 @@ def S4_storerhnew_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"memh($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel {
+tc_96ef76ef, TypeST>, Enc_c6220b, AddrModeRel {
let Inst{6-3} = 0b0001;
let Inst{31-21} = 0b00111011101;
let addrMode = BaseRegOffset;
@@ -24944,9 +25273,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storerh_rr";
let CextOpcode = "S2_storerh";
let InputType = "reg";
-let BaseOpcode = "S2_storerh_rr";
let isPredicable = 1;
let opNewValue = 3;
}
@@ -24954,7 +25283,7 @@ def S4_storerhnew_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
"memh($Ru32<<#$Ii+#$II) = $Nt8.new",
-tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel {
+tc_55a9a350, TypeST>, Enc_7eb485, AddrModeRel {
let Inst{7-7} = 0b1;
let Inst{12-11} = 0b01;
let Inst{31-21} = 0b10101101101;
@@ -24965,8 +25294,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storerh";
let BaseOpcode = "S2_storerh_ur";
+let CextOpcode = "S2_storerh";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -24979,7 +25308,7 @@ def S4_storeri_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Rt32),
"memw($Re32=#$II) = $Rt32",
-tc_da4a37ed, TypeST>, Enc_8bcba4, AddrModeRel {
+tc_bb07f2c5, TypeST>, Enc_8bcba4, AddrModeRel {
let Inst{7-6} = 0b10;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10101011100;
@@ -25000,15 +25329,15 @@ def S4_storeri_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Rt32),
"memw($Rs32+$Ru32<<#$Ii) = $Rt32",
-tc_5aee39f7, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
+tc_280f7fe1, TypeST>, Enc_eca7c8, AddrModeRel, ImmRegShl {
let Inst{6-5} = 0b00;
let Inst{31-21} = 0b00111011100;
let addrMode = BaseRegOffset;
let accessSize = WordAccess;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let isNVStorable = 1;
let isPredicable = 1;
}
@@ -25016,16 +25345,16 @@ def S4_storeri_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Rt32),
"memw($Ru32<<#$Ii+#$II) = $Rt32",
-tc_14b272fa, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
+tc_887d1bb7, TypeST>, Enc_9ea4cf, AddrModeRel, ImmRegShl {
let Inst{7-7} = 0b1;
let Inst{31-21} = 0b10101101100;
let addrMode = BaseLongOffset;
let accessSize = WordAccess;
let isExtended = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_ur";
let CextOpcode = "S2_storeri";
let InputType = "imm";
-let BaseOpcode = "S2_storeri_ur";
let isNVStorable = 1;
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
@@ -25038,7 +25367,7 @@ def S4_storerinew_ap : HInst<
(outs IntRegs:$Re32),
(ins u32_0Imm:$II, IntRegs:$Nt8),
"memw($Re32=#$II) = $Nt8.new",
-tc_d2e63d61, TypeST>, Enc_724154, AddrModeRel {
+tc_0fac1eb8, TypeST>, Enc_724154, AddrModeRel {
let Inst{7-6} = 0b10;
let Inst{13-11} = 0b010;
let Inst{31-21} = 0b10101011101;
@@ -25062,7 +25391,7 @@ def S4_storerinew_rr : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Ru32, u2_0Imm:$Ii, IntRegs:$Nt8),
"memw($Rs32+$Ru32<<#$Ii) = $Nt8.new",
-tc_67435e81, TypeST>, Enc_c6220b, AddrModeRel {
+tc_96ef76ef, TypeST>, Enc_c6220b, AddrModeRel {
let Inst{6-3} = 0b0010;
let Inst{31-21} = 0b00111011101;
let addrMode = BaseRegOffset;
@@ -25071,9 +25400,9 @@ let isNVStore = 1;
let isNewValue = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
+let BaseOpcode = "S2_storeri_rr";
let CextOpcode = "S2_storeri";
let InputType = "reg";
-let BaseOpcode = "S2_storeri_rr";
let isPredicable = 1;
let opNewValue = 3;
}
@@ -25081,7 +25410,7 @@ def S4_storerinew_ur : HInst<
(outs),
(ins IntRegs:$Ru32, u2_0Imm:$Ii, u32_0Imm:$II, IntRegs:$Nt8),
"memw($Ru32<<#$Ii+#$II) = $Nt8.new",
-tc_fcc3ddf9, TypeST>, Enc_7eb485, AddrModeRel {
+tc_55a9a350, TypeST>, Enc_7eb485, AddrModeRel {
let Inst{7-7} = 0b1;
let Inst{12-11} = 0b10;
let Inst{31-21} = 0b10101101101;
@@ -25092,8 +25421,8 @@ let isNewValue = 1;
let isExtended = 1;
let isRestrictNoSlot1Store = 1;
let mayStore = 1;
-let CextOpcode = "S2_storeri";
let BaseOpcode = "S2_storeri_ur";
+let CextOpcode = "S2_storeri";
let DecoderNamespace = "MustExtend";
let isExtendable = 1;
let opExtendable = 2;
@@ -25106,7 +25435,7 @@ def S4_subaddi : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Ru32),
"$Rd32 = add($Rs32,sub(#$Ii,$Ru32))",
-tc_f675fee8, TypeALU64>, Enc_8b8d61 {
+tc_2c13e7f5, TypeALU64>, Enc_8b8d61, Requires<[UseCompound]> {
let Inst{31-23} = 0b110110111;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25121,7 +25450,7 @@ def S4_subi_asl_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = sub(#$Ii,asl($Rx32in,#$II))",
-tc_f675fee8, TypeALU64>, Enc_c31910 {
+tc_2c13e7f5, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b110;
let Inst{4-4} = 0b0;
let Inst{31-24} = 0b11011110;
@@ -25139,7 +25468,7 @@ def S4_subi_lsr_ri : HInst<
(outs IntRegs:$Rx32),
(ins u32_0Imm:$Ii, IntRegs:$Rx32in, u5_0Imm:$II),
"$Rx32 = sub(#$Ii,lsr($Rx32in,#$II))",
-tc_f675fee8, TypeALU64>, Enc_c31910 {
+tc_2c13e7f5, TypeALU64>, Enc_c31910, Requires<[UseCompound]> {
let Inst{2-0} = 0b110;
let Inst{4-4} = 0b1;
let Inst{31-24} = 0b11011110;
@@ -25157,7 +25486,7 @@ def S4_vrcrotate : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rdd32 = vrcrotate($Rss32,$Rt32,#$Ii)",
-tc_13bfbcf9, TypeS_3op>, Enc_645d54 {
+tc_f0cdeccf, TypeS_3op>, Enc_645d54 {
let Inst{7-6} = 0b11;
let Inst{31-21} = 0b11000011110;
let prefersSlot3 = 1;
@@ -25166,7 +25495,7 @@ def S4_vrcrotate_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, IntRegs:$Rt32, u2_0Imm:$Ii),
"$Rxx32 += vrcrotate($Rss32,$Rt32,#$Ii)",
-tc_9debc299, TypeS_3op>, Enc_b72622 {
+tc_a38c45dc, TypeS_3op>, Enc_b72622 {
let Inst{7-6} = 0b00;
let Inst{31-21} = 0b11001011101;
let prefersSlot3 = 1;
@@ -25176,7 +25505,7 @@ def S4_vxaddsubh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vxaddsubh($Rss32,$Rtt32):sat",
-tc_779080bf, TypeS_3op>, Enc_a56825 {
+tc_8a825db2, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -25187,7 +25516,7 @@ def S4_vxaddsubhr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vxaddsubh($Rss32,$Rtt32):rnd:>>1:sat",
-tc_002cb246, TypeS_3op>, Enc_a56825 {
+tc_0dfac0a7, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001110;
@@ -25198,7 +25527,7 @@ def S4_vxaddsubw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vxaddsubw($Rss32,$Rtt32):sat",
-tc_779080bf, TypeS_3op>, Enc_a56825 {
+tc_8a825db2, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -25209,7 +25538,7 @@ def S4_vxsubaddh : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vxsubaddh($Rss32,$Rtt32):sat",
-tc_779080bf, TypeS_3op>, Enc_a56825 {
+tc_8a825db2, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b110;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -25220,7 +25549,7 @@ def S4_vxsubaddhr : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vxsubaddh($Rss32,$Rtt32):rnd:>>1:sat",
-tc_002cb246, TypeS_3op>, Enc_a56825 {
+tc_0dfac0a7, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001110;
@@ -25231,7 +25560,7 @@ def S4_vxsubaddw : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vxsubaddw($Rss32,$Rtt32):sat",
-tc_779080bf, TypeS_3op>, Enc_a56825 {
+tc_8a825db2, TypeS_3op>, Enc_a56825 {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001010;
@@ -25242,7 +25571,7 @@ def S5_asrhub_rnd_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):raw",
-tc_002cb246, TypeS_2op>, Enc_11a146 {
+tc_0dfac0a7, TypeS_2op>, Enc_11a146 {
let Inst{7-5} = 0b100;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10001000011;
@@ -25255,7 +25584,7 @@ def S5_asrhub_rnd_sat_goodsyntax : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):rnd:sat",
-tc_002cb246, TypeS_2op> {
+tc_0dfac0a7, TypeS_2op> {
let hasNewValue = 1;
let opNewValue = 0;
let isPseudo = 1;
@@ -25264,7 +25593,7 @@ def S5_asrhub_sat : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rd32 = vasrhub($Rss32,#$Ii):sat",
-tc_002cb246, TypeS_2op>, Enc_11a146 {
+tc_0dfac0a7, TypeS_2op>, Enc_11a146 {
let Inst{7-5} = 0b101;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10001000011;
@@ -25277,7 +25606,7 @@ def S5_popcountp : HInst<
(outs IntRegs:$Rd32),
(ins DoubleRegs:$Rss32),
"$Rd32 = popcount($Rss32)",
-tc_703e822c, TypeS_2op>, Enc_90cd8b {
+tc_d3632d88, TypeS_2op>, Enc_90cd8b {
let Inst{13-5} = 0b000000011;
let Inst{31-21} = 0b10001000011;
let hasNewValue = 1;
@@ -25288,7 +25617,7 @@ def S5_vasrhrnd : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii):raw",
-tc_002cb246, TypeS_2op>, Enc_12b6e9 {
+tc_0dfac0a7, TypeS_2op>, Enc_12b6e9 {
let Inst{7-5} = 0b000;
let Inst{13-12} = 0b00;
let Inst{31-21} = 0b10000000001;
@@ -25298,14 +25627,14 @@ def S5_vasrhrnd_goodsyntax : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u4_0Imm:$Ii),
"$Rdd32 = vasrh($Rss32,#$Ii):rnd",
-tc_002cb246, TypeS_2op> {
+tc_0dfac0a7, TypeS_2op> {
let isPseudo = 1;
}
def S6_allocframe_to_raw : HInst<
(outs),
(ins u11_3Imm:$Ii),
"allocframe(#$Ii)",
-tc_b44ecf75, TypeMAPPING>, Requires<[HasV65]> {
+tc_934753bb, TypeMAPPING>, Requires<[HasV65]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
@@ -25313,7 +25642,7 @@ def S6_rol_i_p : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rdd32 = rol($Rss32,#$Ii)",
-tc_1fc97744, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> {
+tc_407e96f9, TypeS_2op>, Enc_5eac98, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000000000;
}
@@ -25321,7 +25650,7 @@ def S6_rol_i_p_acc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 += rol($Rss32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -25331,7 +25660,7 @@ def S6_rol_i_p_and : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 &= rol($Rss32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -25341,7 +25670,7 @@ def S6_rol_i_p_nac : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 -= rol($Rss32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010000;
let prefersSlot3 = 1;
@@ -25351,7 +25680,7 @@ def S6_rol_i_p_or : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 |= rol($Rss32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{31-21} = 0b10000010010;
let prefersSlot3 = 1;
@@ -25361,7 +25690,7 @@ def S6_rol_i_p_xacc : HInst<
(outs DoubleRegs:$Rxx32),
(ins DoubleRegs:$Rxx32in, DoubleRegs:$Rss32, u6_0Imm:$Ii),
"$Rxx32 ^= rol($Rss32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_70fb07, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{31-21} = 0b10000010100;
let prefersSlot3 = 1;
@@ -25371,7 +25700,7 @@ def S6_rol_i_r : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rd32 = rol($Rs32,#$Ii)",
-tc_1fc97744, TypeS_2op>, Enc_a05677, Requires<[HasV60]> {
+tc_407e96f9, TypeS_2op>, Enc_a05677, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001100000;
@@ -25382,7 +25711,7 @@ def S6_rol_i_r_acc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 += rol($Rs32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -25395,7 +25724,7 @@ def S6_rol_i_r_and : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 &= rol($Rs32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -25408,7 +25737,7 @@ def S6_rol_i_r_nac : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 -= rol($Rs32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110000;
@@ -25421,7 +25750,7 @@ def S6_rol_i_r_or : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 |= rol($Rs32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b111;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110010;
@@ -25434,7 +25763,7 @@ def S6_rol_i_r_xacc : HInst<
(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, IntRegs:$Rs32, u5_0Imm:$Ii),
"$Rx32 ^= rol($Rs32,#$Ii)",
-tc_784490da, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
+tc_5e4cf0e8, TypeS_2op>, Enc_28a2dc, Requires<[HasV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10001110100;
@@ -25447,7 +25776,7 @@ def S6_vsplatrbp : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32),
"$Rdd32 = vsplatb($Rs32)",
-tc_a1c00888, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> {
+tc_ef921005, TypeS_2op>, Enc_3a3d62, Requires<[HasV62]> {
let Inst{13-5} = 0b000000100;
let Inst{31-21} = 0b10000100010;
}
@@ -25455,7 +25784,7 @@ def S6_vtrunehb_ppp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunehb($Rss32,$Rtt32)",
-tc_1fc97744, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
+tc_407e96f9, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -25464,7 +25793,7 @@ def S6_vtrunohb_ppp : HInst<
(outs DoubleRegs:$Rdd32),
(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
"$Rdd32 = vtrunohb($Rss32,$Rtt32)",
-tc_1fc97744, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
+tc_407e96f9, TypeS_3op>, Enc_a56825, Requires<[HasV62]> {
let Inst{7-5} = 0b101;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11000001100;
@@ -25473,7 +25802,7 @@ def SA1_addi : HInst<
(outs GeneralSubRegs:$Rx16),
(ins IntRegs:$Rx16in, s32_0Imm:$Ii),
"$Rx16 = add($Rx16in,#$Ii)",
-tc_0a705168, TypeSUBINSN>, Enc_93af4c {
+tc_5b347363, TypeSUBINSN>, Enc_93af4c {
let Inst{12-11} = 0b00;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25490,7 +25819,7 @@ def SA1_addrx : HInst<
(outs GeneralSubRegs:$Rx16),
(ins IntRegs:$Rx16in, GeneralSubRegs:$Rs16),
"$Rx16 = add($Rx16in,$Rs16)",
-tc_0a705168, TypeSUBINSN>, Enc_0527db {
+tc_5b347363, TypeSUBINSN>, Enc_0527db {
let Inst{12-8} = 0b11000;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25502,7 +25831,7 @@ def SA1_addsp : HInst<
(outs GeneralSubRegs:$Rd16),
(ins u6_2Imm:$Ii),
"$Rd16 = add(r29,#$Ii)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_2df31d {
+tc_3d14a17b, TypeSUBINSN>, Enc_2df31d {
let Inst{12-10} = 0b011;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25514,7 +25843,7 @@ def SA1_and1 : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = and($Rs16,#1)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10010;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25525,7 +25854,7 @@ def SA1_clrf : HInst<
(outs GeneralSubRegs:$Rd16),
(ins),
"if (!p0) $Rd16 = #0",
-tc_a1123dda, TypeSUBINSN>, Enc_1f5ba6 {
+tc_3fbf1042, TypeSUBINSN>, Enc_1f5ba6 {
let Inst{12-4} = 0b110100111;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -25539,7 +25868,7 @@ def SA1_clrfnew : HInst<
(outs GeneralSubRegs:$Rd16),
(ins),
"if (!p0.new) $Rd16 = #0",
-tc_8b3e402a, TypeSUBINSN>, Enc_1f5ba6 {
+tc_63567288, TypeSUBINSN>, Enc_1f5ba6 {
let Inst{12-4} = 0b110100101;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -25554,7 +25883,7 @@ def SA1_clrt : HInst<
(outs GeneralSubRegs:$Rd16),
(ins),
"if (p0) $Rd16 = #0",
-tc_a1123dda, TypeSUBINSN>, Enc_1f5ba6 {
+tc_3fbf1042, TypeSUBINSN>, Enc_1f5ba6 {
let Inst{12-4} = 0b110100110;
let isPredicated = 1;
let hasNewValue = 1;
@@ -25567,7 +25896,7 @@ def SA1_clrtnew : HInst<
(outs GeneralSubRegs:$Rd16),
(ins),
"if (p0.new) $Rd16 = #0",
-tc_8b3e402a, TypeSUBINSN>, Enc_1f5ba6 {
+tc_63567288, TypeSUBINSN>, Enc_1f5ba6 {
let Inst{12-4} = 0b110100100;
let isPredicated = 1;
let hasNewValue = 1;
@@ -25581,7 +25910,7 @@ def SA1_cmpeqi : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u2_0Imm:$Ii),
"p0 = cmp.eq($Rs16,#$Ii)",
-tc_5b7c0967, TypeSUBINSN>, Enc_63eaeb {
+tc_59a7822c, TypeSUBINSN>, Enc_63eaeb {
let Inst{3-2} = 0b00;
let Inst{12-8} = 0b11001;
let AsmVariantName = "NonParsable";
@@ -25592,7 +25921,7 @@ def SA1_combine0i : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins u2_0Imm:$Ii),
"$Rdd8 = combine(#0,#$Ii)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
+tc_3d14a17b, TypeSUBINSN>, Enc_ed48be {
let Inst{4-3} = 0b00;
let Inst{12-7} = 0b111000;
let hasNewValue = 1;
@@ -25604,7 +25933,7 @@ def SA1_combine1i : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins u2_0Imm:$Ii),
"$Rdd8 = combine(#1,#$Ii)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
+tc_3d14a17b, TypeSUBINSN>, Enc_ed48be {
let Inst{4-3} = 0b01;
let Inst{12-7} = 0b111000;
let hasNewValue = 1;
@@ -25616,7 +25945,7 @@ def SA1_combine2i : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins u2_0Imm:$Ii),
"$Rdd8 = combine(#2,#$Ii)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
+tc_3d14a17b, TypeSUBINSN>, Enc_ed48be {
let Inst{4-3} = 0b10;
let Inst{12-7} = 0b111000;
let hasNewValue = 1;
@@ -25628,7 +25957,7 @@ def SA1_combine3i : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins u2_0Imm:$Ii),
"$Rdd8 = combine(#3,#$Ii)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_ed48be {
+tc_3d14a17b, TypeSUBINSN>, Enc_ed48be {
let Inst{4-3} = 0b11;
let Inst{12-7} = 0b111000;
let hasNewValue = 1;
@@ -25640,7 +25969,7 @@ def SA1_combinerz : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins GeneralSubRegs:$Rs16),
"$Rdd8 = combine($Rs16,#0)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_399e12 {
+tc_3d14a17b, TypeSUBINSN>, Enc_399e12 {
let Inst{3-3} = 0b1;
let Inst{12-8} = 0b11101;
let hasNewValue = 1;
@@ -25652,7 +25981,7 @@ def SA1_combinezr : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins GeneralSubRegs:$Rs16),
"$Rdd8 = combine(#0,$Rs16)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_399e12 {
+tc_3d14a17b, TypeSUBINSN>, Enc_399e12 {
let Inst{3-3} = 0b0;
let Inst{12-8} = 0b11101;
let hasNewValue = 1;
@@ -25664,7 +25993,7 @@ def SA1_dec : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, n1Const:$n1),
"$Rd16 = add($Rs16,#$n1)",
-tc_0a705168, TypeSUBINSN>, Enc_ee5ed0 {
+tc_5b347363, TypeSUBINSN>, Enc_ee5ed0 {
let Inst{12-8} = 0b10011;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25675,7 +26004,7 @@ def SA1_inc : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = add($Rs16,#1)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10001;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25686,7 +26015,7 @@ def SA1_seti : HInst<
(outs GeneralSubRegs:$Rd16),
(ins u32_0Imm:$Ii),
"$Rd16 = #$Ii",
-tc_9fc3dae0, TypeSUBINSN>, Enc_e39bb2 {
+tc_3d14a17b, TypeSUBINSN>, Enc_e39bb2 {
let Inst{12-10} = 0b010;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25702,7 +26031,7 @@ def SA1_setin1 : HInst<
(outs GeneralSubRegs:$Rd16),
(ins n1Const:$n1),
"$Rd16 = #$n1",
-tc_9fc3dae0, TypeSUBINSN>, Enc_7a0ea6 {
+tc_3d14a17b, TypeSUBINSN>, Enc_7a0ea6 {
let Inst{12-4} = 0b110100000;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25713,7 +26042,7 @@ def SA1_sxtb : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = sxtb($Rs16)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10101;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25724,7 +26053,7 @@ def SA1_sxth : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = sxth($Rs16)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10100;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25735,7 +26064,7 @@ def SA1_tfr : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = $Rs16",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10000;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25746,7 +26075,7 @@ def SA1_zxtb : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = and($Rs16,#255)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10111;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25757,7 +26086,7 @@ def SA1_zxth : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16),
"$Rd16 = zxth($Rs16)",
-tc_9fc3dae0, TypeSUBINSN>, Enc_97d666 {
+tc_3d14a17b, TypeSUBINSN>, Enc_97d666 {
let Inst{12-8} = 0b10110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25768,7 +26097,7 @@ def SL1_loadri_io : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
"$Rd16 = memw($Rs16+#$Ii)",
-tc_17e0d2cd, TypeSUBINSN>, Enc_53dca9 {
+tc_4222e6bf, TypeSUBINSN>, Enc_53dca9 {
let Inst{12-12} = 0b0;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25782,7 +26111,7 @@ def SL1_loadrub_io : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
"$Rd16 = memub($Rs16+#$Ii)",
-tc_17e0d2cd, TypeSUBINSN>, Enc_c175d0 {
+tc_4222e6bf, TypeSUBINSN>, Enc_c175d0 {
let Inst{12-12} = 0b1;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25796,20 +26125,20 @@ def SL2_deallocframe : HInst<
(outs),
(ins),
"deallocframe",
-tc_39dfefe8, TypeSUBINSN>, Enc_e3b0c4 {
+tc_937dd41c, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111100000000;
let accessSize = DoubleWordAccess;
let AsmVariantName = "NonParsable";
let mayLoad = 1;
let Uses = [FRAMEKEY, R30];
-let Defs = [R30, R29, R31];
+let Defs = [R29, R30, R31];
let DecoderNamespace = "SUBINSN_L2";
}
def SL2_jumpr31 : HInst<
(outs),
(ins),
"jumpr r31",
-tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
+tc_a4ee89db, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111111000000;
let isTerminator = 1;
let isIndirectBranch = 1;
@@ -25824,7 +26153,7 @@ def SL2_jumpr31_f : HInst<
(outs),
(ins),
"if (!p0) jumpr r31",
-tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
+tc_a4ee89db, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111111000101;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -25842,7 +26171,7 @@ def SL2_jumpr31_fnew : HInst<
(outs),
(ins),
"if (!p0.new) jumpr:nt r31",
-tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
+tc_a4ee89db, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111111000111;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -25861,7 +26190,7 @@ def SL2_jumpr31_t : HInst<
(outs),
(ins),
"if (p0) jumpr r31",
-tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
+tc_a4ee89db, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111111000100;
let isPredicated = 1;
let isTerminator = 1;
@@ -25878,7 +26207,7 @@ def SL2_jumpr31_tnew : HInst<
(outs),
(ins),
"if (p0.new) jumpr:nt r31",
-tc_b4407292, TypeSUBINSN>, Enc_e3b0c4 {
+tc_a4ee89db, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111111000110;
let isPredicated = 1;
let isTerminator = 1;
@@ -25896,7 +26225,7 @@ def SL2_loadrb_io : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, u3_0Imm:$Ii),
"$Rd16 = memb($Rs16+#$Ii)",
-tc_17e0d2cd, TypeSUBINSN>, Enc_2fbf3c {
+tc_4222e6bf, TypeSUBINSN>, Enc_2fbf3c {
let Inst{12-11} = 0b10;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25910,7 +26239,7 @@ def SL2_loadrd_sp : HInst<
(outs GeneralDoubleLow8Regs:$Rdd8),
(ins u5_3Imm:$Ii),
"$Rdd8 = memd(r29+#$Ii)",
-tc_c4db48cb, TypeSUBINSN>, Enc_86a14b {
+tc_8a6d0d94, TypeSUBINSN>, Enc_86a14b {
let Inst{12-8} = 0b11110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25925,7 +26254,7 @@ def SL2_loadrh_io : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
"$Rd16 = memh($Rs16+#$Ii)",
-tc_17e0d2cd, TypeSUBINSN>, Enc_2bae10 {
+tc_4222e6bf, TypeSUBINSN>, Enc_2bae10 {
let Inst{12-11} = 0b00;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25939,7 +26268,7 @@ def SL2_loadri_sp : HInst<
(outs GeneralSubRegs:$Rd16),
(ins u5_2Imm:$Ii),
"$Rd16 = memw(r29+#$Ii)",
-tc_c4db48cb, TypeSUBINSN>, Enc_51635c {
+tc_8a6d0d94, TypeSUBINSN>, Enc_51635c {
let Inst{12-9} = 0b1110;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25954,7 +26283,7 @@ def SL2_loadruh_io : HInst<
(outs GeneralSubRegs:$Rd16),
(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii),
"$Rd16 = memuh($Rs16+#$Ii)",
-tc_17e0d2cd, TypeSUBINSN>, Enc_2bae10 {
+tc_4222e6bf, TypeSUBINSN>, Enc_2bae10 {
let Inst{12-11} = 0b01;
let hasNewValue = 1;
let opNewValue = 0;
@@ -25968,7 +26297,7 @@ def SL2_return : HInst<
(outs),
(ins),
"dealloc_return",
-tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
+tc_c818ff7f, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111101000000;
let isTerminator = 1;
let isIndirectBranch = 1;
@@ -25979,14 +26308,14 @@ let cofMax1 = 1;
let isRestrictNoSlot1Store = 1;
let isReturn = 1;
let Uses = [FRAMEKEY, R30];
-let Defs = [PC, R30, R29, R31];
+let Defs = [PC, R29, R30, R31];
let DecoderNamespace = "SUBINSN_L2";
}
def SL2_return_f : HInst<
(outs),
(ins),
"if (!p0) dealloc_return",
-tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
+tc_c818ff7f, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111101000101;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -25999,7 +26328,7 @@ let cofMax1 = 1;
let isRestrictNoSlot1Store = 1;
let isReturn = 1;
let Uses = [FRAMEKEY, P0, R30];
-let Defs = [PC, R30, R29, R31];
+let Defs = [PC, R29, R30, R31];
let isTaken = Inst{4};
let DecoderNamespace = "SUBINSN_L2";
}
@@ -26007,7 +26336,7 @@ def SL2_return_fnew : HInst<
(outs),
(ins),
"if (!p0.new) dealloc_return:nt",
-tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
+tc_c818ff7f, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111101000111;
let isPredicated = 1;
let isPredicatedFalse = 1;
@@ -26021,7 +26350,7 @@ let cofMax1 = 1;
let isRestrictNoSlot1Store = 1;
let isReturn = 1;
let Uses = [FRAMEKEY, P0, R30];
-let Defs = [PC, R30, R29, R31];
+let Defs = [PC, R29, R30, R31];
let isTaken = Inst{4};
let DecoderNamespace = "SUBINSN_L2";
}
@@ -26029,7 +26358,7 @@ def SL2_return_t : HInst<
(outs),
(ins),
"if (p0) dealloc_return",
-tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
+tc_c818ff7f, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111101000100;
let isPredicated = 1;
let isTerminator = 1;
@@ -26041,7 +26370,7 @@ let cofMax1 = 1;
let isRestrictNoSlot1Store = 1;
let isReturn = 1;
let Uses = [FRAMEKEY, P0, R30];
-let Defs = [PC, R30, R29, R31];
+let Defs = [PC, R29, R30, R31];
let isTaken = Inst{4};
let DecoderNamespace = "SUBINSN_L2";
}
@@ -26049,7 +26378,7 @@ def SL2_return_tnew : HInst<
(outs),
(ins),
"if (p0.new) dealloc_return:nt",
-tc_36153880, TypeSUBINSN>, Enc_e3b0c4 {
+tc_c818ff7f, TypeSUBINSN>, Enc_e3b0c4 {
let Inst{12-0} = 0b1111101000110;
let isPredicated = 1;
let isTerminator = 1;
@@ -26062,7 +26391,7 @@ let cofMax1 = 1;
let isRestrictNoSlot1Store = 1;
let isReturn = 1;
let Uses = [FRAMEKEY, P0, R30];
-let Defs = [PC, R30, R29, R31];
+let Defs = [PC, R29, R30, R31];
let isTaken = Inst{4};
let DecoderNamespace = "SUBINSN_L2";
}
@@ -26070,7 +26399,7 @@ def SS1_storeb_io : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii, GeneralSubRegs:$Rt16),
"memb($Rs16+#$Ii) = $Rt16",
-tc_30b9bb4a, TypeSUBINSN>, Enc_b38ffc {
+tc_ae5babd7, TypeSUBINSN>, Enc_b38ffc {
let Inst{12-12} = 0b1;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
@@ -26082,7 +26411,7 @@ def SS1_storew_io : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii, GeneralSubRegs:$Rt16),
"memw($Rs16+#$Ii) = $Rt16",
-tc_30b9bb4a, TypeSUBINSN>, Enc_f55a0c {
+tc_ae5babd7, TypeSUBINSN>, Enc_f55a0c {
let Inst{12-12} = 0b0;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
@@ -26094,22 +26423,22 @@ def SS2_allocframe : HInst<
(outs),
(ins u5_3Imm:$Ii),
"allocframe(#$Ii)",
-tc_49a8207d, TypeSUBINSN>, Enc_6f70ca {
+tc_1242dc2a, TypeSUBINSN>, Enc_6f70ca {
let Inst{3-0} = 0b0000;
let Inst{12-9} = 0b1110;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
let AsmVariantName = "NonParsable";
let mayStore = 1;
-let Uses = [FRAMEKEY, FRAMELIMIT, R30, R29, R31];
-let Defs = [R30, R29];
+let Uses = [FRAMEKEY, FRAMELIMIT, R29, R30, R31];
+let Defs = [R29, R30];
let DecoderNamespace = "SUBINSN_S2";
}
def SS2_storebi0 : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
"memb($Rs16+#$Ii) = #0",
-tc_89e94ad3, TypeSUBINSN>, Enc_84d359 {
+tc_44d5a428, TypeSUBINSN>, Enc_84d359 {
let Inst{12-8} = 0b10010;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
@@ -26121,7 +26450,7 @@ def SS2_storebi1 : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u4_0Imm:$Ii),
"memb($Rs16+#$Ii) = #1",
-tc_89e94ad3, TypeSUBINSN>, Enc_84d359 {
+tc_44d5a428, TypeSUBINSN>, Enc_84d359 {
let Inst{12-8} = 0b10011;
let addrMode = BaseImmOffset;
let accessSize = ByteAccess;
@@ -26133,7 +26462,7 @@ def SS2_stored_sp : HInst<
(outs),
(ins s6_3Imm:$Ii, GeneralDoubleLow8Regs:$Rtt8),
"memd(r29+#$Ii) = $Rtt8",
-tc_0371abea, TypeSUBINSN>, Enc_b8309d {
+tc_0655b949, TypeSUBINSN>, Enc_b8309d {
let Inst{12-9} = 0b0101;
let addrMode = BaseImmOffset;
let accessSize = DoubleWordAccess;
@@ -26146,7 +26475,7 @@ def SS2_storeh_io : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u3_1Imm:$Ii, GeneralSubRegs:$Rt16),
"memh($Rs16+#$Ii) = $Rt16",
-tc_30b9bb4a, TypeSUBINSN>, Enc_625deb {
+tc_ae5babd7, TypeSUBINSN>, Enc_625deb {
let Inst{12-11} = 0b00;
let addrMode = BaseImmOffset;
let accessSize = HalfWordAccess;
@@ -26158,7 +26487,7 @@ def SS2_storew_sp : HInst<
(outs),
(ins u5_2Imm:$Ii, GeneralSubRegs:$Rt16),
"memw(r29+#$Ii) = $Rt16",
-tc_0371abea, TypeSUBINSN>, Enc_87c142 {
+tc_0655b949, TypeSUBINSN>, Enc_87c142 {
let Inst{12-9} = 0b0100;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
@@ -26171,7 +26500,7 @@ def SS2_storewi0 : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
"memw($Rs16+#$Ii) = #0",
-tc_89e94ad3, TypeSUBINSN>, Enc_a6ce9c {
+tc_44d5a428, TypeSUBINSN>, Enc_a6ce9c {
let Inst{12-8} = 0b10000;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
@@ -26183,7 +26512,7 @@ def SS2_storewi1 : HInst<
(outs),
(ins GeneralSubRegs:$Rs16, u4_2Imm:$Ii),
"memw($Rs16+#$Ii) = #1",
-tc_89e94ad3, TypeSUBINSN>, Enc_a6ce9c {
+tc_44d5a428, TypeSUBINSN>, Enc_a6ce9c {
let Inst{12-8} = 0b10001;
let addrMode = BaseImmOffset;
let accessSize = WordAccess;
@@ -26198,6 +26527,7 @@ def V6_MAP_equb : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26207,6 +26537,7 @@ def V6_MAP_equb_and : HInst<
(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
"$Qx4 &= vcmp.eq($Vu32.ub,$Vv32.ub)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26218,6 +26549,7 @@ def V6_MAP_equb_ior : HInst<
"$Qx4 |= vcmp.eq($Vu32.ub,$Vv32.ub)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26228,6 +26560,7 @@ def V6_MAP_equb_xor : HInst<
(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
"$Qx4 ^= vcmp.eq($Vu32.ub,$Vv32.ub)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26240,6 +26573,7 @@ def V6_MAP_equh : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26249,6 +26583,7 @@ def V6_MAP_equh_and : HInst<
(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
"$Qx4 &= vcmp.eq($Vu32.uh,$Vv32.uh)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26260,6 +26595,7 @@ def V6_MAP_equh_ior : HInst<
"$Qx4 |= vcmp.eq($Vu32.uh,$Vv32.uh)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26270,6 +26606,7 @@ def V6_MAP_equh_xor : HInst<
(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
"$Qx4 ^= vcmp.eq($Vu32.uh,$Vv32.uh)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26282,6 +26619,7 @@ def V6_MAP_equw : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26291,6 +26629,7 @@ def V6_MAP_equw_and : HInst<
(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
"$Qx4 &= vcmp.eq($Vu32.uw,$Vv32.uw)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26302,6 +26641,7 @@ def V6_MAP_equw_ior : HInst<
"$Qx4 |= vcmp.eq($Vu32.uw,$Vv32.uw)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26312,6 +26652,7 @@ def V6_MAP_equw_xor : HInst<
(ins HvxQR:$Qx4in, HvxVR:$Vu32, HvxVR:$Vv32),
"$Qx4 ^= vcmp.eq($Vu32.uw,$Vv32.uw)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26327,6 +26668,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10010010000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isSolo = 1;
let mayLoad = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26338,6 +26680,7 @@ def V6_extractw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26349,6 +26692,7 @@ def V6_hi : HInst<
CVI_VA, TypeCVI_VA>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -26359,6 +26703,7 @@ def V6_ld0 : HInst<
PSEUDO, TypeCVI_VM_LD>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26370,6 +26715,7 @@ def V6_ldcnp0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26381,6 +26727,7 @@ def V6_ldcnpnt0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26392,6 +26739,7 @@ def V6_ldcp0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26403,6 +26751,7 @@ def V6_ldcpnt0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26414,6 +26763,7 @@ def V6_ldnp0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26425,6 +26775,7 @@ def V6_ldnpnt0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26436,6 +26787,7 @@ def V6_ldnt0 : HInst<
PSEUDO, TypeCVI_VM_LD>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26458,6 +26810,7 @@ def V6_ldp0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26469,6 +26822,7 @@ def V6_ldpnt0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26480,6 +26834,7 @@ def V6_ldtnp0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26491,6 +26846,7 @@ def V6_ldtnpnt0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26502,6 +26858,7 @@ def V6_ldtp0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26513,6 +26870,7 @@ def V6_ldtpnt0 : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26524,6 +26882,7 @@ def V6_ldu0 : HInst<
PSEUDO, TypeCVI_VM_LD>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26535,6 +26894,7 @@ def V6_lo : HInst<
CVI_VA, TypeCVI_VA>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -26542,22 +26902,24 @@ def V6_lvsplatb : HInst<
(outs HvxVR:$Vd32),
(ins IntRegs:$Rt32),
"$Vd32.b = vsplat($Rt32)",
-tc_c4edf264, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> {
+tc_c4edf264, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[UseHVXV62]> {
let Inst{13-5} = 0b000000010;
let Inst{31-21} = 0b00011001110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_lvsplath : HInst<
(outs HvxVR:$Vd32),
(ins IntRegs:$Rt32),
"$Vd32.h = vsplat($Rt32)",
-tc_c4edf264, TypeCVI_VX>, Enc_a5ed8a, Requires<[UseHVXV62]> {
+tc_c4edf264, TypeCVI_VX_LATE>, Enc_a5ed8a, Requires<[UseHVXV62]> {
let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b00011001110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_lvsplatw : HInst<
@@ -26569,6 +26931,7 @@ let Inst{13-5} = 0b000000001;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_and : HInst<
@@ -26582,6 +26945,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_and_n : HInst<
@@ -26595,6 +26959,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_not : HInst<
@@ -26607,6 +26972,7 @@ let Inst{13-10} = 0b0000;
let Inst{31-16} = 0b0001111000000011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_or : HInst<
@@ -26620,6 +26986,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_or_n : HInst<
@@ -26633,6 +27000,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_scalar2 : HInst<
@@ -26644,6 +27012,7 @@ let Inst{13-2} = 0b000000010001;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_scalar2v2 : HInst<
@@ -26655,6 +27024,7 @@ let Inst{13-2} = 0b000000010011;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_pred_xor : HInst<
@@ -26668,6 +27038,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_shuffeqh : HInst<
@@ -26681,6 +27052,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_shuffeqw : HInst<
@@ -26694,6 +27066,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_st0 : HInst<
@@ -26701,6 +27074,7 @@ def V6_st0 : HInst<
(ins IntRegs:$Rt32, HvxVR:$Vs32),
"vmem($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26710,6 +27084,7 @@ def V6_stn0 : HInst<
(ins IntRegs:$Rt32, HvxVR:$Os8),
"vmem($Rt32) = $Os8.new",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26720,6 +27095,7 @@ def V6_stnnt0 : HInst<
(ins IntRegs:$Rt32, HvxVR:$Os8),
"vmem($Rt32):nt = $Os8.new",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26730,6 +27106,7 @@ def V6_stnp0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if (!$Pv4) vmem($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26739,6 +27116,7 @@ def V6_stnpnt0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if (!$Pv4) vmem($Rt32):nt = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26748,6 +27126,7 @@ def V6_stnq0 : HInst<
(ins HvxQR:$Qv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if (!$Qv4) vmem($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26757,6 +27136,7 @@ def V6_stnqnt0 : HInst<
(ins HvxQR:$Qv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if (!$Qv4) vmem($Rt32):nt = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26766,6 +27146,7 @@ def V6_stnt0 : HInst<
(ins IntRegs:$Rt32, HvxVR:$Vs32),
"vmem($Rt32):nt = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26775,6 +27156,7 @@ def V6_stp0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if ($Pv4) vmem($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26784,6 +27166,7 @@ def V6_stpnt0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if ($Pv4) vmem($Rt32):nt = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26793,6 +27176,7 @@ def V6_stq0 : HInst<
(ins HvxQR:$Qv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if ($Qv4) vmem($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26802,6 +27186,7 @@ def V6_stqnt0 : HInst<
(ins HvxQR:$Qv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if ($Qv4) vmem($Rt32):nt = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26811,6 +27196,7 @@ def V6_stu0 : HInst<
(ins IntRegs:$Rt32, HvxVR:$Vs32),
"vmemu($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26820,6 +27206,7 @@ def V6_stunp0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if (!$Pv4) vmemu($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26829,6 +27216,7 @@ def V6_stup0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32, HvxVR:$Vs32),
"if ($Pv4) vmemu($Rt32) = $Vs32",
PSEUDO, TypeCVI_VM_ST>, Requires<[UseHVXV60]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26846,6 +27234,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26863,6 +27252,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_pi";
@@ -26881,6 +27271,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -26899,6 +27290,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_ai";
@@ -26919,6 +27311,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -26940,6 +27333,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -26961,6 +27355,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -26982,6 +27377,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -27002,6 +27398,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -27022,6 +27419,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -27043,6 +27441,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -27063,6 +27462,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -27083,6 +27483,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
@@ -27104,6 +27505,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_ai";
@@ -27124,6 +27526,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_pi";
@@ -27144,6 +27547,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_ppu";
@@ -27163,6 +27567,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27184,6 +27589,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27206,6 +27612,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27228,6 +27635,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27250,6 +27658,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27271,6 +27680,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27292,6 +27702,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27314,6 +27725,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27335,6 +27747,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27356,6 +27769,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let CVINew = 1;
let mayLoad = 1;
let isNonTemporal = 1;
@@ -27378,6 +27792,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27399,6 +27814,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27420,6 +27836,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27440,6 +27857,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27461,6 +27879,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27483,6 +27902,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27503,6 +27923,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27523,6 +27944,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27543,6 +27965,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27564,6 +27987,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27585,6 +28009,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27606,6 +28031,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27626,6 +28052,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27646,6 +28073,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27667,6 +28095,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27687,6 +28116,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27707,6 +28137,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isNonTemporal = 1;
let isRestrictNoSlot1Store = 1;
@@ -27727,6 +28158,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_pi";
@@ -27747,6 +28179,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_ppu";
@@ -27768,6 +28201,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_ai";
@@ -27787,6 +28221,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_pi";
@@ -27806,6 +28241,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_ppu";
@@ -27825,6 +28261,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ai";
@@ -27845,6 +28282,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ai";
@@ -27865,6 +28303,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_pi";
@@ -27885,6 +28324,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ppu";
@@ -27904,6 +28344,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_pi";
@@ -27923,6 +28364,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ppu";
@@ -27943,6 +28385,7 @@ let opNewValue = 0;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ai";
@@ -27962,6 +28405,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_pi";
@@ -27981,6 +28425,7 @@ let opNewValue = 0;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isCVLoad = 1;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let BaseOpcode = "V6_vL32b_tmp_ppu";
@@ -27997,6 +28442,7 @@ let Inst{12-11} = 0b00;
let Inst{31-21} = 0b00101000001;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_ai";
let isPredicable = 1;
@@ -28013,6 +28459,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_ai";
let DecoderNamespace = "EXT_mmvec";
@@ -28029,6 +28476,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_pi";
let DecoderNamespace = "EXT_mmvec";
@@ -28045,6 +28493,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_ppu";
let DecoderNamespace = "EXT_mmvec";
@@ -28060,6 +28509,7 @@ let Inst{13-11} = 0b000;
let Inst{31-21} = 0b00101001001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_pi";
let isPredicable = 1;
@@ -28075,6 +28525,7 @@ let Inst{12-5} = 0b00000111;
let Inst{31-21} = 0b00101011001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_ppu";
let isPredicable = 1;
@@ -28091,6 +28542,7 @@ let Inst{31-21} = 0b00101000101;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_ai";
let DecoderNamespace = "EXT_mmvec";
@@ -28106,6 +28558,7 @@ let Inst{31-21} = 0b00101001101;
let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_pi";
let DecoderNamespace = "EXT_mmvec";
@@ -28121,6 +28574,7 @@ let Inst{31-21} = 0b00101011101;
let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32Ub_ppu";
let DecoderNamespace = "EXT_mmvec";
@@ -28136,6 +28590,7 @@ let Inst{12-11} = 0b00;
let Inst{31-21} = 0b00101000001;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ai";
let isNVStorable = 1;
@@ -28153,6 +28608,7 @@ let Inst{31-21} = 0b00101000001;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28173,6 +28629,7 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28193,6 +28650,7 @@ let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28213,6 +28671,7 @@ let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28232,6 +28691,7 @@ let Inst{31-21} = 0b00101001001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28251,6 +28711,7 @@ let Inst{31-21} = 0b00101011001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28271,6 +28732,7 @@ let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28290,6 +28752,7 @@ let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28309,6 +28772,7 @@ let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let mayStore = 1;
@@ -28328,6 +28792,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ai";
let isNVStorable = 1;
@@ -28345,6 +28810,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_pi";
let isNVStorable = 1;
@@ -28362,6 +28828,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ppu";
let isNVStorable = 1;
@@ -28377,6 +28844,7 @@ let Inst{7-5} = 0b001;
let Inst{31-21} = 0b00101000100;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -28390,6 +28858,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00101001100;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx32 = $Rx32in";
@@ -28403,6 +28872,7 @@ let Inst{10-5} = 0b000001;
let Inst{31-21} = 0b00101011100;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx32 = $Rx32in";
@@ -28417,6 +28887,7 @@ let Inst{12-11} = 0b00;
let Inst{31-21} = 0b00101000011;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ai";
@@ -28435,6 +28906,7 @@ let Inst{31-21} = 0b00101000011;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28456,6 +28928,7 @@ let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28477,6 +28950,7 @@ let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28498,6 +28972,7 @@ let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28518,6 +28993,7 @@ let Inst{31-21} = 0b00101001011;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28538,6 +29014,7 @@ let Inst{31-21} = 0b00101011011;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28559,6 +29036,7 @@ let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28579,6 +29057,7 @@ let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28599,6 +29078,7 @@ let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
let isNVStore = 1;
+let isCVI = 1;
let CVINew = 1;
let isNewValue = 1;
let isNonTemporal = 1;
@@ -28619,6 +29099,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ai";
@@ -28637,6 +29118,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_pi";
@@ -28655,6 +29137,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ppu";
@@ -28671,6 +29154,7 @@ let Inst{7-5} = 0b001;
let Inst{31-21} = 0b00101000110;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28685,6 +29169,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00101001110;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28699,6 +29184,7 @@ let Inst{10-5} = 0b000001;
let Inst{31-21} = 0b00101011110;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28714,6 +29200,7 @@ let Inst{13-11} = 0b000;
let Inst{31-21} = 0b00101001011;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_pi";
@@ -28731,6 +29218,7 @@ let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b00101011011;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ppu";
@@ -28749,6 +29237,7 @@ let Inst{31-21} = 0b00101000111;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ai";
@@ -28766,6 +29255,7 @@ let Inst{31-21} = 0b00101001111;
let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_pi";
@@ -28783,6 +29273,7 @@ let Inst{31-21} = 0b00101011111;
let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ppu";
@@ -28799,6 +29290,7 @@ let Inst{7-5} = 0b000;
let Inst{31-21} = 0b00101000110;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28813,6 +29305,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00101001110;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28827,6 +29320,7 @@ let Inst{10-5} = 0b000000;
let Inst{31-21} = 0b00101011110;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let isNonTemporal = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28842,6 +29336,7 @@ let Inst{13-11} = 0b000;
let Inst{31-21} = 0b00101001001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_pi";
let isNVStorable = 1;
@@ -28858,7 +29353,9 @@ let Inst{12-5} = 0b00000000;
let Inst{31-21} = 0b00101011001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
+let BaseOpcode = "V6_vS32b_ppu";
let isNVStorable = 1;
let isPredicable = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28874,6 +29371,7 @@ let Inst{31-21} = 0b00101000101;
let isPredicated = 1;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ai";
let isNVStorable = 1;
@@ -28890,6 +29388,7 @@ let Inst{31-21} = 0b00101001101;
let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_pi";
let isNVStorable = 1;
@@ -28906,6 +29405,7 @@ let Inst{31-21} = 0b00101011101;
let isPredicated = 1;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let BaseOpcode = "V6_vS32b_ppu";
let isNVStorable = 1;
@@ -28921,6 +29421,7 @@ let Inst{7-5} = 0b000;
let Inst{31-21} = 0b00101000100;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -28934,6 +29435,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00101001100;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx32 = $Rx32in";
@@ -28947,6 +29449,7 @@ let Inst{10-5} = 0b000000;
let Inst{31-21} = 0b00101011100;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx32 = $Rx32in";
@@ -28961,6 +29464,7 @@ let Inst{12-11} = 0b00;
let Inst{31-21} = 0b00101000001;
let addrMode = BaseImmOffset;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let CVINew = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28975,6 +29479,7 @@ let Inst{13-11} = 0b000;
let Inst{31-21} = 0b00101001001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let CVINew = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -28989,6 +29494,7 @@ let Inst{12-0} = 0b0000000101000;
let Inst{31-21} = 0b00101011001;
let addrMode = PostInc;
let accessSize = HVXVectorAccess;
+let isCVI = 1;
let CVINew = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29004,6 +29510,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsb_alt : HInst<
@@ -29013,6 +29520,7 @@ def V6_vabsb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29027,6 +29535,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsb_sat_alt : HInst<
@@ -29036,6 +29545,7 @@ def V6_vabsb_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29050,6 +29560,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsdiffh_alt : HInst<
@@ -29059,6 +29570,7 @@ def V6_vabsdiffh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29073,6 +29585,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsdiffub_alt : HInst<
@@ -29082,6 +29595,7 @@ def V6_vabsdiffub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29096,6 +29610,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsdiffuh_alt : HInst<
@@ -29105,6 +29620,7 @@ def V6_vabsdiffuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29119,6 +29635,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsdiffw_alt : HInst<
@@ -29128,6 +29645,7 @@ def V6_vabsdiffw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29142,6 +29660,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsh_alt : HInst<
@@ -29151,6 +29670,7 @@ def V6_vabsh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29165,6 +29685,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsh_sat_alt : HInst<
@@ -29174,6 +29695,7 @@ def V6_vabsh_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29185,6 +29707,7 @@ def V6_vabsub_alt : HInst<
tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29196,6 +29719,7 @@ def V6_vabsuh_alt : HInst<
tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29207,6 +29731,7 @@ def V6_vabsuw_alt : HInst<
tc_0ec46cf9, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29221,6 +29746,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsw_alt : HInst<
@@ -29230,6 +29756,7 @@ def V6_vabsw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29244,6 +29771,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vabsw_sat_alt : HInst<
@@ -29253,6 +29781,7 @@ def V6_vabsw_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29267,6 +29796,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddb_alt : HInst<
@@ -29276,6 +29806,7 @@ def V6_vaddb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29290,6 +29821,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddb_dv_alt : HInst<
@@ -29299,6 +29831,7 @@ def V6_vaddb_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29315,6 +29848,7 @@ let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -29326,6 +29860,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29343,6 +29878,7 @@ let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -29354,6 +29890,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29369,6 +29906,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddbsat_alt : HInst<
@@ -29378,6 +29916,7 @@ def V6_vaddbsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29392,6 +29931,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddbsat_dv_alt : HInst<
@@ -29401,6 +29941,7 @@ def V6_vaddbsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29415,6 +29956,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -29422,7 +29964,7 @@ def V6_vaddcarryo : HInst<
(outs HvxVR:$Vd32, HvxQR:$Qe4),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
"$Vd32.w,$Qe4 = vadd($Vu32.w,$Vv32.w):carry",
-tc_e35c1e93, TypeCOPROC_VX>, Enc_c1d806, Requires<[UseHVXV66]> {
+tc_e35c1e93, TypeCVI_VA>, Enc_c1d806, Requires<[UseHVXV66]> {
let Inst{7-7} = 0b0;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011101101;
@@ -29430,6 +29972,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let hasNewValue2 = 1;
let opNewValue2 = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddcarrysat : HInst<
@@ -29442,6 +29985,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011101100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddclbh : HInst<
@@ -29454,6 +29998,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddclbw : HInst<
@@ -29466,6 +30011,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddh : HInst<
@@ -29478,6 +30024,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddh_alt : HInst<
@@ -29487,6 +30034,7 @@ def V6_vaddh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29501,6 +30049,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddh_dv_alt : HInst<
@@ -29510,6 +30059,7 @@ def V6_vaddh_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29526,6 +30076,7 @@ let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -29537,6 +30088,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29554,6 +30106,7 @@ let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -29565,6 +30118,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29580,6 +30134,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddhsat_alt : HInst<
@@ -29589,6 +30144,7 @@ def V6_vaddhsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29603,6 +30159,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddhsat_dv_alt : HInst<
@@ -29612,6 +30169,7 @@ def V6_vaddhsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29626,6 +30184,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddhw_acc : HInst<
@@ -29639,6 +30198,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -29650,6 +30210,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29662,6 +30223,7 @@ def V6_vaddhw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29676,6 +30238,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddubh_acc : HInst<
@@ -29689,6 +30252,7 @@ let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -29700,6 +30264,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29712,6 +30277,7 @@ def V6_vaddubh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29726,6 +30292,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddubsat_alt : HInst<
@@ -29735,6 +30302,7 @@ def V6_vaddubsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29749,6 +30317,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddubsat_dv_alt : HInst<
@@ -29758,6 +30327,7 @@ def V6_vaddubsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29772,6 +30342,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vadduhsat : HInst<
@@ -29784,6 +30355,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vadduhsat_alt : HInst<
@@ -29793,6 +30365,7 @@ def V6_vadduhsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29807,6 +30380,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vadduhsat_dv_alt : HInst<
@@ -29816,6 +30390,7 @@ def V6_vadduhsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29830,6 +30405,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vadduhw_acc : HInst<
@@ -29843,6 +30419,7 @@ let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -29854,6 +30431,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29866,6 +30444,7 @@ def V6_vadduhw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29880,6 +30459,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vadduwsat_alt : HInst<
@@ -29889,6 +30469,7 @@ def V6_vadduwsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29903,6 +30484,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vadduwsat_dv_alt : HInst<
@@ -29912,6 +30494,7 @@ def V6_vadduwsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29926,6 +30509,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddw_alt : HInst<
@@ -29935,6 +30519,7 @@ def V6_vaddw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29949,6 +30534,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddw_dv_alt : HInst<
@@ -29958,6 +30544,7 @@ def V6_vaddw_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -29974,6 +30561,7 @@ let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -29985,6 +30573,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30002,6 +30591,7 @@ let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30013,6 +30603,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30028,6 +30619,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddwsat_alt : HInst<
@@ -30037,6 +30629,7 @@ def V6_vaddwsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30051,6 +30644,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaddwsat_dv_alt : HInst<
@@ -30060,6 +30654,7 @@ def V6_vaddwsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30074,6 +30669,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_valignbi : HInst<
@@ -30085,6 +30681,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011110001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vand : HInst<
@@ -30097,31 +30694,34 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vandnqrt : HInst<
(outs HvxVR:$Vd32),
(ins HvxQR:$Qu4, IntRegs:$Rt32),
"$Vd32 = vand(!$Qu4,$Rt32)",
-tc_ac4046bc, TypeCVI_VX>, Enc_7b7ba8, Requires<[UseHVXV62]> {
+tc_ac4046bc, TypeCVI_VX_LATE>, Enc_7b7ba8, Requires<[UseHVXV62]> {
let Inst{7-5} = 0b101;
let Inst{13-10} = 0b0001;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vandnqrt_acc : HInst<
(outs HvxVR:$Vx32),
(ins HvxVR:$Vx32in, HvxQR:$Qu4, IntRegs:$Rt32),
"$Vx32 |= vand(!$Qu4,$Rt32)",
-tc_2e8f5f6e, TypeCVI_VX>, Enc_895bd9, Requires<[UseHVXV62]> {
+tc_2e8f5f6e, TypeCVI_VX_LATE>, Enc_895bd9, Requires<[UseHVXV62]> {
let Inst{7-5} = 0b011;
let Inst{13-10} = 0b1001;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30133,6 +30733,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30145,6 +30746,7 @@ def V6_vandnqrt_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30159,6 +30761,7 @@ let Inst{13-10} = 0b0000;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vandqrt_acc : HInst<
@@ -30172,6 +30775,7 @@ let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30183,6 +30787,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30195,6 +30800,7 @@ def V6_vandqrt_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30210,6 +30816,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vandvqv : HInst<
@@ -30223,6 +30830,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vandvrt : HInst<
@@ -30235,6 +30843,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vandvrt_acc : HInst<
@@ -30246,6 +30855,7 @@ let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011001011;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -30255,6 +30865,7 @@ def V6_vandvrt_acc_alt : HInst<
"$Qx4.ub |= vand($Vu32.ub,$Rt32.ub)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30267,6 +30878,7 @@ def V6_vandvrt_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30281,6 +30893,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaslh_acc : HInst<
@@ -30294,6 +30907,7 @@ let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30305,6 +30919,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30317,6 +30932,7 @@ def V6_vaslh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30331,6 +30947,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaslhv_alt : HInst<
@@ -30340,6 +30957,7 @@ def V6_vaslhv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30354,6 +30972,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaslw_acc : HInst<
@@ -30367,6 +30986,7 @@ let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30378,6 +30998,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30390,6 +31011,7 @@ def V6_vaslw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30404,6 +31026,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vaslwv_alt : HInst<
@@ -30413,6 +31036,7 @@ def V6_vaslwv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30427,6 +31051,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011010101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -30437,6 +31062,7 @@ def V6_vasr_into_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30452,6 +31078,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrh_acc : HInst<
@@ -30465,6 +31092,7 @@ let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30476,6 +31104,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30488,6 +31117,7 @@ def V6_vasrh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30502,18 +31132,9 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrhbrndsat_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrhbsat : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -30524,6 +31145,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrhubrndsat : HInst<
@@ -30536,18 +31158,9 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrhubrndsat_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrhubsat : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -30558,18 +31171,9 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrhubsat_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrhv : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -30580,6 +31184,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrhv_alt : HInst<
@@ -30589,6 +31194,7 @@ def V6_vasrhv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30603,6 +31209,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasruhubsat : HInst<
@@ -30615,6 +31222,7 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasruwuhrndsat : HInst<
@@ -30627,6 +31235,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasruwuhsat : HInst<
@@ -30639,6 +31248,7 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrw : HInst<
@@ -30651,6 +31261,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrw_acc : HInst<
@@ -30664,6 +31275,7 @@ let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -30675,6 +31287,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30687,6 +31300,7 @@ def V6_vasrw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30701,18 +31315,9 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrwh_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrwhrndsat : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -30723,18 +31328,9 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrwhrndsat_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrwhsat : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -30745,18 +31341,9 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrwhsat_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrwuhrndsat : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
@@ -30767,6 +31354,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrwuhsat : HInst<
@@ -30779,18 +31367,9 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
-def V6_vasrwuhsat_alt : HInst<
-(outs HvxVR:$Vd32),
-(ins HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8),
-"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat",
-tc_16ff9ef8, TypeMAPPING>, Requires<[HasV60]> {
-let hasNewValue = 1;
-let opNewValue = 0;
-let isPseudo = 1;
-let isCodeGenOnly = 1;
-}
def V6_vasrwv : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
@@ -30801,6 +31380,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vasrwv_alt : HInst<
@@ -30810,6 +31390,7 @@ def V6_vasrwv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30824,6 +31405,7 @@ let Inst{13-13} = 0b1;
let Inst{31-16} = 0b0001111000000011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vassignp : HInst<
@@ -30833,6 +31415,7 @@ def V6_vassignp : HInst<
CVI_VA, TypeCVI_VA_DV>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -30846,6 +31429,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgb_alt : HInst<
@@ -30855,6 +31439,7 @@ def V6_vavgb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30869,6 +31454,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgbrnd_alt : HInst<
@@ -30878,6 +31464,7 @@ def V6_vavgbrnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30892,6 +31479,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgh_alt : HInst<
@@ -30901,6 +31489,7 @@ def V6_vavgh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30915,6 +31504,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavghrnd_alt : HInst<
@@ -30924,6 +31514,7 @@ def V6_vavghrnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30938,6 +31529,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgub_alt : HInst<
@@ -30947,6 +31539,7 @@ def V6_vavgub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30961,6 +31554,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgubrnd_alt : HInst<
@@ -30970,6 +31564,7 @@ def V6_vavgubrnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -30984,6 +31579,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavguh_alt : HInst<
@@ -30993,6 +31589,7 @@ def V6_vavguh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31007,6 +31604,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavguhrnd_alt : HInst<
@@ -31016,6 +31614,7 @@ def V6_vavguhrnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31030,6 +31629,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavguw_alt : HInst<
@@ -31039,6 +31639,7 @@ def V6_vavguw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31053,6 +31654,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavguwrnd_alt : HInst<
@@ -31062,6 +31664,7 @@ def V6_vavguwrnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31076,6 +31679,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgw_alt : HInst<
@@ -31085,6 +31689,7 @@ def V6_vavgw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31099,6 +31704,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vavgwrnd_alt : HInst<
@@ -31108,6 +31714,7 @@ def V6_vavgwrnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31123,6 +31730,7 @@ let Inst{31-21} = 0b00011010011;
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vcl0h : HInst<
@@ -31135,6 +31743,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vcl0h_alt : HInst<
@@ -31144,6 +31753,7 @@ def V6_vcl0h_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31158,6 +31768,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vcl0w_alt : HInst<
@@ -31167,6 +31778,7 @@ def V6_vcl0w_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31182,6 +31794,7 @@ let Inst{31-16} = 0b0001101000000000;
let isPredicated = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vcombine : HInst<
@@ -31194,6 +31807,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isRegSequence = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -31204,6 +31818,7 @@ def V6_vd0 : HInst<
CVI_VA, TypeCVI_VA>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31215,6 +31830,7 @@ def V6_vdd0 : HInst<
tc_718b5c53, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31231,6 +31847,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let hasNewValue2 = 1;
let opNewValue2 = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
}
@@ -31244,6 +31861,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdealb4w : HInst<
@@ -31256,6 +31874,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdealb4w_alt : HInst<
@@ -31265,6 +31884,7 @@ def V6_vdealb4w_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31276,6 +31896,7 @@ def V6_vdealb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31290,6 +31911,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdealh_alt : HInst<
@@ -31299,6 +31921,7 @@ def V6_vdealh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31313,6 +31936,7 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdelta : HInst<
@@ -31325,6 +31949,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpybus : HInst<
@@ -31337,6 +31962,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpybus_acc : HInst<
@@ -31350,6 +31976,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31361,6 +31988,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31373,6 +32001,7 @@ def V6_vdmpybus_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31387,6 +32016,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpybus_dv_acc : HInst<
@@ -31400,6 +32030,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -31411,6 +32042,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31423,6 +32055,7 @@ def V6_vdmpybus_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31437,6 +32070,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhb_acc : HInst<
@@ -31450,6 +32084,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31461,6 +32096,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31473,6 +32109,7 @@ def V6_vdmpyhb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31487,6 +32124,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhb_dv_acc : HInst<
@@ -31500,6 +32138,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -31511,6 +32150,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31523,6 +32163,7 @@ def V6_vdmpyhb_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31537,6 +32178,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhisat_acc : HInst<
@@ -31550,6 +32192,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31561,6 +32204,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31573,6 +32217,7 @@ def V6_vdmpyhisat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31587,6 +32232,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhsat_acc : HInst<
@@ -31600,6 +32246,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31611,6 +32258,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31623,6 +32271,7 @@ def V6_vdmpyhsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31637,6 +32286,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhsuisat_acc : HInst<
@@ -31650,6 +32300,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31661,6 +32312,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31673,6 +32325,7 @@ def V6_vdmpyhsuisat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31687,6 +32340,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhsusat_acc : HInst<
@@ -31700,6 +32354,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31711,6 +32366,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31723,6 +32379,7 @@ def V6_vdmpyhsusat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31737,6 +32394,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdmpyhvsat_acc : HInst<
@@ -31750,6 +32408,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -31761,6 +32420,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31773,6 +32433,7 @@ def V6_vdmpyhvsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31787,6 +32448,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vdsaduh_acc : HInst<
@@ -31800,6 +32462,7 @@ let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -31811,6 +32474,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31823,6 +32487,7 @@ def V6_vdsaduh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -31837,6 +32502,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_veqb_and : HInst<
@@ -31847,6 +32513,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b000000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31859,6 +32526,7 @@ let Inst{7-2} = 0b010000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31870,6 +32538,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b100000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31883,6 +32552,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_veqh_and : HInst<
@@ -31893,6 +32563,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b000001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31905,6 +32576,7 @@ let Inst{7-2} = 0b010001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31916,6 +32588,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b100001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31929,6 +32602,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_veqw_and : HInst<
@@ -31939,6 +32613,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b000010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31951,6 +32626,7 @@ let Inst{7-2} = 0b010010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31962,6 +32638,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b100010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -31976,6 +32653,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let accessSize = HalfWordAccess;
let isCVLoad = 1;
+let isCVI = 1;
let hasTmpDst = 1;
let mayLoad = 1;
let Defs = [VTMP];
@@ -31992,6 +32670,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let accessSize = HalfWordAccess;
let isCVLoad = 1;
+let isCVI = 1;
let hasTmpDst = 1;
let mayLoad = 1;
let Defs = [VTMP];
@@ -32001,13 +32680,14 @@ def V6_vgathermhw : HInst<
(outs),
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32),
"vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h",
-tc_05058f6f, TypeCVI_GATHER>, Enc_28dcbb, Requires<[UseHVXV65]> {
+tc_05058f6f, TypeCVI_GATHER_DV>, Enc_28dcbb, Requires<[UseHVXV65]> {
let Inst{12-5} = 0b00010000;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
let opNewValue = 0;
let accessSize = HalfWordAccess;
let isCVLoad = 1;
+let isCVI = 1;
let hasTmpDst = 1;
let mayLoad = 1;
let Defs = [VTMP];
@@ -32017,13 +32697,14 @@ def V6_vgathermhwq : HInst<
(outs),
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32),
"if ($Qs4) vtmp.h = vgather($Rt32,$Mu2,$Vvv32.w).h",
-tc_fd7610da, TypeCVI_GATHER>, Enc_4e4a80, Requires<[UseHVXV65]> {
+tc_fd7610da, TypeCVI_GATHER_DV>, Enc_4e4a80, Requires<[UseHVXV65]> {
let Inst{12-7} = 0b001100;
let Inst{31-21} = 0b00101111000;
let hasNewValue = 1;
let opNewValue = 0;
let accessSize = HalfWordAccess;
let isCVLoad = 1;
+let isCVI = 1;
let hasTmpDst = 1;
let mayLoad = 1;
let Defs = [VTMP];
@@ -32040,6 +32721,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let accessSize = WordAccess;
let isCVLoad = 1;
+let isCVI = 1;
let hasTmpDst = 1;
let mayLoad = 1;
let Defs = [VTMP];
@@ -32056,6 +32738,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let accessSize = WordAccess;
let isCVLoad = 1;
+let isCVI = 1;
let hasTmpDst = 1;
let mayLoad = 1;
let Defs = [VTMP];
@@ -32071,6 +32754,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vgtb_and : HInst<
@@ -32081,6 +32765,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b000100;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32093,6 +32778,7 @@ let Inst{7-2} = 0b010100;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32104,6 +32790,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b100100;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32117,6 +32804,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vgth_and : HInst<
@@ -32127,6 +32815,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b000101;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32139,6 +32828,7 @@ let Inst{7-2} = 0b010101;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32150,6 +32840,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b100101;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32163,6 +32854,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vgtub_and : HInst<
@@ -32173,6 +32865,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b001000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32185,6 +32878,7 @@ let Inst{7-2} = 0b011000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32196,6 +32890,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b101000;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32209,6 +32904,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vgtuh_and : HInst<
@@ -32219,6 +32915,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b001001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32231,6 +32928,7 @@ let Inst{7-2} = 0b011001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32242,6 +32940,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b101001;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32255,6 +32954,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vgtuw_and : HInst<
@@ -32265,6 +32965,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b001010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32277,6 +32978,7 @@ let Inst{7-2} = 0b011010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32288,6 +32990,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b101010;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32301,6 +33004,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vgtw_and : HInst<
@@ -32311,6 +33015,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b000110;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32323,6 +33028,7 @@ let Inst{7-2} = 0b010110;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32334,6 +33040,7 @@ tc_257f6f7c, TypeCVI_VA>, Enc_eaa9f8, Requires<[UseHVXV60]> {
let Inst{7-2} = 0b100110;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100100;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -32344,6 +33051,7 @@ def V6_vhist : HInst<
tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV60]> {
let Inst{13-0} = 0b10000010000000;
let Inst{31-16} = 0b0001111000000000;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vhistq : HInst<
@@ -32354,6 +33062,7 @@ tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV60]> {
let Inst{13-0} = 0b10000010000000;
let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vinsertwr : HInst<
@@ -32365,6 +33074,7 @@ let Inst{13-5} = 0b100000001;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -32378,6 +33088,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlalignbi : HInst<
@@ -32389,6 +33100,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011110011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlsrb : HInst<
@@ -32401,6 +33113,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlsrh : HInst<
@@ -32413,6 +33126,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlsrh_alt : HInst<
@@ -32422,6 +33136,7 @@ def V6_vlsrh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32436,6 +33151,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlsrhv_alt : HInst<
@@ -32445,6 +33161,7 @@ def V6_vlsrhv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32459,6 +33176,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlsrw_alt : HInst<
@@ -32468,6 +33186,7 @@ def V6_vlsrw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32482,6 +33201,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlsrwv_alt : HInst<
@@ -32491,6 +33211,7 @@ def V6_vlsrwv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32505,6 +33226,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlutvvb : HInst<
@@ -32517,6 +33239,7 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlutvvb_nm : HInst<
@@ -32529,6 +33252,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlutvvb_oracc : HInst<
@@ -32542,6 +33266,7 @@ let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -32555,6 +33280,7 @@ let Inst{31-21} = 0b00011100110;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -32567,6 +33293,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlutvwh : HInst<
@@ -32579,6 +33306,7 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlutvwh_nm : HInst<
@@ -32591,6 +33319,7 @@ let Inst{13-13} = 0b0;
let Inst{31-24} = 0b00011000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vlutvwh_oracc : HInst<
@@ -32604,6 +33333,7 @@ let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -32617,6 +33347,7 @@ let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -32629,6 +33360,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmaxb : HInst<
@@ -32641,6 +33373,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmaxb_alt : HInst<
@@ -32650,6 +33383,7 @@ def V6_vmaxb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32664,6 +33398,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmaxh_alt : HInst<
@@ -32673,6 +33408,7 @@ def V6_vmaxh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32687,6 +33423,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmaxub_alt : HInst<
@@ -32696,6 +33433,7 @@ def V6_vmaxub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32710,6 +33448,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmaxuh_alt : HInst<
@@ -32719,6 +33458,7 @@ def V6_vmaxuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32733,6 +33473,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmaxw_alt : HInst<
@@ -32742,6 +33483,7 @@ def V6_vmaxw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32756,6 +33498,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vminb_alt : HInst<
@@ -32765,6 +33508,7 @@ def V6_vminb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32779,6 +33523,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vminh_alt : HInst<
@@ -32788,6 +33533,7 @@ def V6_vminh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32802,6 +33548,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vminub_alt : HInst<
@@ -32811,6 +33558,7 @@ def V6_vminub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32825,6 +33573,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vminuh_alt : HInst<
@@ -32834,6 +33583,7 @@ def V6_vminuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32848,6 +33598,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vminw_alt : HInst<
@@ -32857,6 +33608,7 @@ def V6_vminw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32871,6 +33623,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpabus_acc : HInst<
@@ -32884,6 +33637,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -32895,6 +33649,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32907,6 +33662,7 @@ def V6_vmpabus_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32921,6 +33677,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpabusv_alt : HInst<
@@ -32930,6 +33687,7 @@ def V6_vmpabusv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32944,6 +33702,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpabuu_acc : HInst<
@@ -32957,6 +33716,7 @@ let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -32968,6 +33728,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32980,6 +33741,7 @@ def V6_vmpabuu_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -32994,6 +33756,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpabuuv_alt : HInst<
@@ -33003,6 +33766,7 @@ def V6_vmpabuuv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33017,6 +33781,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpahb_acc : HInst<
@@ -33030,6 +33795,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33041,6 +33807,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33053,6 +33820,7 @@ def V6_vmpahb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33067,6 +33835,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33080,6 +33849,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpauhb_acc : HInst<
@@ -33093,6 +33863,7 @@ let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33104,6 +33875,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33116,6 +33888,7 @@ def V6_vmpauhb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33130,6 +33903,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33143,6 +33917,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33156,6 +33931,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpybus_acc : HInst<
@@ -33169,6 +33945,7 @@ let Inst{31-21} = 0b00011001001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33180,6 +33957,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33192,6 +33970,7 @@ def V6_vmpybus_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33206,6 +33985,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpybusv_acc : HInst<
@@ -33219,6 +33999,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33230,6 +34011,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33242,6 +34024,7 @@ def V6_vmpybusv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33256,6 +34039,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpybv_acc : HInst<
@@ -33269,6 +34053,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33280,6 +34065,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33292,6 +34078,7 @@ def V6_vmpybv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33306,6 +34093,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyewuh_64 : HInst<
@@ -33318,6 +34106,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyewuh_alt : HInst<
@@ -33327,6 +34116,7 @@ def V6_vmpyewuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33341,6 +34131,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyh_acc : HInst<
@@ -33354,6 +34145,7 @@ let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33365,6 +34157,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33377,6 +34170,7 @@ def V6_vmpyh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33392,6 +34186,7 @@ let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33403,6 +34198,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33418,6 +34214,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyhsrs_alt : HInst<
@@ -33427,6 +34224,7 @@ def V6_vmpyhsrs_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33441,6 +34239,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyhss_alt : HInst<
@@ -33450,6 +34249,7 @@ def V6_vmpyhss_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33464,6 +34264,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyhus_acc : HInst<
@@ -33477,6 +34278,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33488,6 +34290,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33500,6 +34303,7 @@ def V6_vmpyhus_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33514,6 +34318,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyhv_acc : HInst<
@@ -33527,6 +34332,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33538,6 +34344,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33550,6 +34357,7 @@ def V6_vmpyhv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33564,6 +34372,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyhvsrs_alt : HInst<
@@ -33573,6 +34382,7 @@ def V6_vmpyhvsrs_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33587,6 +34397,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyiewh_acc : HInst<
@@ -33600,6 +34411,7 @@ let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33611,6 +34423,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33626,6 +34439,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyiewuh_acc : HInst<
@@ -33639,6 +34453,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33650,6 +34465,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33662,6 +34478,7 @@ def V6_vmpyiewuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33676,6 +34493,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyih_acc : HInst<
@@ -33689,6 +34507,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33700,6 +34519,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33712,6 +34532,7 @@ def V6_vmpyih_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33726,6 +34547,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyihb_acc : HInst<
@@ -33739,6 +34561,7 @@ let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33750,6 +34573,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33762,6 +34586,7 @@ def V6_vmpyihb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33776,6 +34601,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyiowh_alt : HInst<
@@ -33785,6 +34611,7 @@ def V6_vmpyiowh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33799,6 +34626,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyiwb_acc : HInst<
@@ -33812,6 +34640,7 @@ let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33823,6 +34652,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33835,6 +34665,7 @@ def V6_vmpyiwb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33849,6 +34680,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyiwh_acc : HInst<
@@ -33862,6 +34694,7 @@ let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33873,6 +34706,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33885,6 +34719,7 @@ def V6_vmpyiwh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33899,6 +34734,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyiwub_acc : HInst<
@@ -33912,6 +34748,7 @@ let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -33923,6 +34760,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33935,6 +34773,7 @@ def V6_vmpyiwub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33949,6 +34788,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyowh_64_acc : HInst<
@@ -33962,6 +34802,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -33972,6 +34813,7 @@ def V6_vmpyowh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -33986,6 +34828,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyowh_rnd_alt : HInst<
@@ -33995,6 +34838,7 @@ def V6_vmpyowh_rnd_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34010,6 +34854,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -34021,6 +34866,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
@@ -34036,6 +34882,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -34047,6 +34894,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
@@ -34061,6 +34909,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyub_acc : HInst<
@@ -34074,6 +34923,7 @@ let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -34085,6 +34935,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34097,6 +34948,7 @@ def V6_vmpyub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34111,6 +34963,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyubv_acc : HInst<
@@ -34124,6 +34977,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -34135,6 +34989,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34147,6 +35002,7 @@ def V6_vmpyubv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34161,6 +35017,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyuh_acc : HInst<
@@ -34174,6 +35031,7 @@ let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -34185,6 +35043,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34197,6 +35056,7 @@ def V6_vmpyuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34211,6 +35071,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyuhe_acc : HInst<
@@ -34224,6 +35085,7 @@ let Inst{31-21} = 0b00011001100;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -34237,6 +35099,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vmpyuhv_acc : HInst<
@@ -34250,6 +35113,7 @@ let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -34261,6 +35125,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34273,6 +35138,7 @@ def V6_vmpyuhv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34287,6 +35153,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011110111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnavgb : HInst<
@@ -34299,6 +35166,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011111000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnavgb_alt : HInst<
@@ -34308,6 +35176,7 @@ def V6_vnavgb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34322,6 +35191,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnavgh_alt : HInst<
@@ -34331,6 +35201,7 @@ def V6_vnavgh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34345,6 +35216,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnavgub_alt : HInst<
@@ -34354,6 +35226,7 @@ def V6_vnavgub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34368,6 +35241,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnavgw_alt : HInst<
@@ -34377,6 +35251,7 @@ def V6_vnavgw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34393,6 +35268,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vncmov : HInst<
@@ -34407,6 +35283,7 @@ let isPredicated = 1;
let isPredicatedFalse = 1;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnormamth : HInst<
@@ -34419,6 +35296,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnormamth_alt : HInst<
@@ -34428,6 +35306,7 @@ def V6_vnormamth_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34442,6 +35321,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vnormamtw_alt : HInst<
@@ -34451,6 +35331,7 @@ def V6_vnormamtw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34465,6 +35346,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vor : HInst<
@@ -34477,6 +35359,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackeb : HInst<
@@ -34489,6 +35372,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackeb_alt : HInst<
@@ -34498,6 +35382,7 @@ def V6_vpackeb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34512,6 +35397,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackeh_alt : HInst<
@@ -34521,6 +35407,7 @@ def V6_vpackeh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34535,6 +35422,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackhb_sat_alt : HInst<
@@ -34544,6 +35432,7 @@ def V6_vpackhb_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34558,6 +35447,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackhub_sat_alt : HInst<
@@ -34567,6 +35457,7 @@ def V6_vpackhub_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34581,6 +35472,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackob_alt : HInst<
@@ -34590,6 +35482,7 @@ def V6_vpackob_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34604,6 +35497,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackoh_alt : HInst<
@@ -34613,6 +35507,7 @@ def V6_vpackoh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34627,6 +35522,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackwh_sat_alt : HInst<
@@ -34636,6 +35532,7 @@ def V6_vpackwh_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34650,6 +35547,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpackwuh_sat_alt : HInst<
@@ -34659,6 +35557,7 @@ def V6_vpackwuh_sat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34673,6 +35572,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vpopcounth_alt : HInst<
@@ -34682,6 +35582,7 @@ def V6_vpopcounth_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34696,6 +35597,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vprefixqh : HInst<
@@ -34708,6 +35610,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vprefixqw : HInst<
@@ -34720,6 +35623,7 @@ let Inst{21-16} = 0b000011;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrdelta : HInst<
@@ -34732,6 +35636,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpybub_rtt : HInst<
@@ -34744,6 +35649,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpybub_rtt_acc : HInst<
@@ -34757,6 +35663,7 @@ let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -34768,6 +35675,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34780,6 +35688,7 @@ def V6_vrmpybub_rtt_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34794,6 +35703,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpybus_acc : HInst<
@@ -34807,6 +35717,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -34818,6 +35729,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34830,6 +35742,7 @@ def V6_vrmpybus_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34844,6 +35757,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpybusi_acc : HInst<
@@ -34857,6 +35771,7 @@ let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -34868,6 +35783,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34880,6 +35796,7 @@ def V6_vrmpybusi_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34894,6 +35811,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpybusv_acc : HInst<
@@ -34907,6 +35825,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -34918,6 +35837,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34930,6 +35850,7 @@ def V6_vrmpybusv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34944,6 +35865,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpybv_acc : HInst<
@@ -34957,6 +35879,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -34968,6 +35891,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34980,6 +35904,7 @@ def V6_vrmpybv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -34994,6 +35919,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyub_acc : HInst<
@@ -35007,6 +35933,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -35018,6 +35945,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35030,6 +35958,7 @@ def V6_vrmpyub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35044,6 +35973,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyub_rtt_acc : HInst<
@@ -35057,6 +35987,7 @@ let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -35068,6 +35999,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35080,6 +36012,7 @@ def V6_vrmpyub_rtt_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35094,6 +36027,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyubi_acc : HInst<
@@ -35107,6 +36041,7 @@ let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -35118,6 +36053,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35130,6 +36066,7 @@ def V6_vrmpyubi_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35144,6 +36081,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyubv_acc : HInst<
@@ -35157,6 +36095,7 @@ let Inst{31-21} = 0b00011100000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -35168,6 +36107,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35180,6 +36120,7 @@ def V6_vrmpyubv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35194,6 +36135,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyzbb_rt_acc : HInst<
@@ -35207,6 +36149,7 @@ let Inst{31-19} = 0b0001100111000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in";
}
@@ -35220,6 +36163,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx8 = $Rx8in";
}
@@ -35234,6 +36178,7 @@ let Inst{31-19} = 0b0001100111001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
}
@@ -35247,6 +36192,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyzbub_rt_acc : HInst<
@@ -35260,6 +36206,7 @@ let Inst{31-19} = 0b0001100111010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in";
}
@@ -35273,6 +36220,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx8 = $Rx8in";
}
@@ -35287,6 +36235,7 @@ let Inst{31-19} = 0b0001100111011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
}
@@ -35300,6 +36249,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyzcb_rt_acc : HInst<
@@ -35313,6 +36263,7 @@ let Inst{31-19} = 0b0001100111000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in";
}
@@ -35326,6 +36277,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx8 = $Rx8in";
}
@@ -35340,6 +36292,7 @@ let Inst{31-19} = 0b0001100111001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
}
@@ -35353,6 +36306,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyzcbs_rt_acc : HInst<
@@ -35366,6 +36320,7 @@ let Inst{31-19} = 0b0001100111000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in";
}
@@ -35379,6 +36334,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx8 = $Rx8in";
}
@@ -35393,6 +36349,7 @@ let Inst{31-19} = 0b0001100111001;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
}
@@ -35406,6 +36363,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrmpyznb_rt_acc : HInst<
@@ -35419,6 +36377,7 @@ let Inst{31-19} = 0b0001100111010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in";
}
@@ -35432,6 +36391,7 @@ let Inst{13-13} = 0b0;
let Inst{31-19} = 0b0001100111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Rx8 = $Rx8in";
}
@@ -35446,6 +36406,7 @@ let Inst{31-19} = 0b0001100111011;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vyyyy32 = $Vyyyy32in, $Rx8 = $Rx8in";
}
@@ -35459,6 +36420,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrotr : HInst<
@@ -35471,6 +36433,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011010100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrotr_alt : HInst<
@@ -35480,6 +36443,7 @@ def V6_vrotr_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35494,6 +36458,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vroundhb_alt : HInst<
@@ -35503,6 +36468,7 @@ def V6_vroundhb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35517,6 +36483,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vroundhub_alt : HInst<
@@ -35526,6 +36493,7 @@ def V6_vroundhub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35540,6 +36508,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrounduhub_alt : HInst<
@@ -35549,6 +36518,7 @@ def V6_vrounduhub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35563,6 +36533,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111111;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrounduwuh_alt : HInst<
@@ -35572,6 +36543,7 @@ def V6_vrounduwuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35586,6 +36558,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vroundwh_alt : HInst<
@@ -35595,6 +36568,7 @@ def V6_vroundwh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35609,6 +36583,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vroundwuh_alt : HInst<
@@ -35618,6 +36593,7 @@ def V6_vroundwuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35632,6 +36608,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vrsadubi_acc : HInst<
@@ -35645,6 +36622,7 @@ let Inst{31-21} = 0b00011001010;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -35656,6 +36634,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35668,6 +36647,7 @@ def V6_vrsadubi_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35682,18 +36662,20 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011101100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsathub : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
"$Vd32.ub = vsat($Vu32.h,$Vv32.h)",
-tc_8772086c, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_8772086c, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
let Inst{7-5} = 0b010;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsathub_alt : HInst<
@@ -35703,6 +36685,7 @@ def V6_vsathub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35717,6 +36700,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsatuwuh_alt : HInst<
@@ -35726,6 +36710,7 @@ def V6_vsatuwuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35734,12 +36719,13 @@ def V6_vsatwh : HInst<
(outs HvxVR:$Vd32),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
"$Vd32.h = vsat($Vu32.w,$Vv32.w)",
-tc_8772086c, TypeCVI_VINLANESAT>, Enc_45364e, Requires<[UseHVXV60]> {
+tc_8772086c, TypeCVI_VA>, Enc_45364e, Requires<[UseHVXV60]> {
let Inst{7-5} = 0b011;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsatwh_alt : HInst<
@@ -35749,6 +36735,7 @@ def V6_vsatwh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35763,6 +36750,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsb_alt : HInst<
@@ -35772,6 +36760,7 @@ def V6_vsb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35784,6 +36773,7 @@ tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
let Inst{7-5} = 0b001;
let Inst{31-21} = 0b00101111001;
let accessSize = HalfWordAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35796,6 +36786,7 @@ let Inst{7-5} = 0b101;
let Inst{31-21} = 0b00101111001;
let accessSize = HalfWordAccess;
let isAccumulator = 1;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35805,6 +36796,7 @@ def V6_vscattermh_add_alt : HInst<
"vscatter($Rt32,$Mu2,$Vv32.h) += $Vw32.h",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35814,6 +36806,7 @@ def V6_vscattermh_alt : HInst<
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
"vscatter($Rt32,$Mu2,$Vv32.h) = $Vw32.h",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35826,6 +36819,7 @@ tc_8e420e4d, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> {
let Inst{7-7} = 0b1;
let Inst{31-21} = 0b00101111100;
let accessSize = HalfWordAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35834,6 +36828,7 @@ def V6_vscattermhq_alt : HInst<
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
"if ($Qs4) vscatter($Rt32,$Mu2,$Vv32.h) = $Vw32.h",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35846,6 +36841,7 @@ tc_7273323b, TypeCVI_SCATTER_DV>, Enc_a641d0, Requires<[UseHVXV65]> {
let Inst{7-5} = 0b010;
let Inst{31-21} = 0b00101111001;
let accessSize = HalfWordAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35858,6 +36854,7 @@ let Inst{7-5} = 0b110;
let Inst{31-21} = 0b00101111001;
let accessSize = HalfWordAccess;
let isAccumulator = 1;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35869,6 +36866,7 @@ tc_58d21193, TypeCVI_SCATTER_DV>, Enc_3d6d37, Requires<[UseHVXV65]> {
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b00101111101;
let accessSize = HalfWordAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35880,6 +36878,7 @@ tc_9f363d21, TypeCVI_SCATTER>, Enc_16c48b, Requires<[UseHVXV65]> {
let Inst{7-5} = 0b000;
let Inst{31-21} = 0b00101111001;
let accessSize = WordAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35892,6 +36891,7 @@ let Inst{7-5} = 0b100;
let Inst{31-21} = 0b00101111001;
let accessSize = WordAccess;
let isAccumulator = 1;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35901,6 +36901,7 @@ def V6_vscattermw_add_alt : HInst<
"vscatter($Rt32,$Mu2,$Vv32.w) += $Vw32.w",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35910,6 +36911,7 @@ def V6_vscattermw_alt : HInst<
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
"vscatter($Rt32,$Mu2,$Vv32.w) = $Vw32.w",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35920,6 +36922,7 @@ def V6_vscattermwh_add_alt : HInst<
"vscatter($Rt32,$Mu2,$Vvv32.w) += $Vw32.h",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35929,6 +36932,7 @@ def V6_vscattermwh_alt : HInst<
(ins IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32),
"vscatter($Rt32,$Mu2,$Vvv32.w) = $Vw32.h",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35938,6 +36942,7 @@ def V6_vscattermwhq_alt : HInst<
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxWR:$Vvv32, HvxVR:$Vw32),
"if ($Qs4) vscatter($Rt32,$Mu2,$Vvv32.w) = $Vw32.h",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35950,6 +36955,7 @@ tc_8e420e4d, TypeCVI_SCATTER>, Enc_9be1de, Requires<[UseHVXV65]> {
let Inst{7-7} = 0b0;
let Inst{31-21} = 0b00101111100;
let accessSize = WordAccess;
+let isCVI = 1;
let mayStore = 1;
let DecoderNamespace = "EXT_mmvec";
}
@@ -35958,6 +36964,7 @@ def V6_vscattermwq_alt : HInst<
(ins HvxQR:$Qs4, IntRegs:$Rt32, ModRegs:$Mu2, HvxVR:$Vv32, HvxVR:$Vw32),
"if ($Qs4) vscatter($Rt32,$Mu2,$Vv32.w) = $Vw32.w",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV65]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35972,6 +36979,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsh_alt : HInst<
@@ -35981,6 +36989,7 @@ def V6_vsh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -35995,6 +37004,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshufeh_alt : HInst<
@@ -36004,6 +37014,7 @@ def V6_vshufeh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36020,6 +37031,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let hasNewValue2 = 1;
let opNewValue2 = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vy32 = $Vy32in, $Vx32 = $Vx32in";
}
@@ -36033,6 +37045,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshuffb_alt : HInst<
@@ -36042,6 +37055,7 @@ def V6_vshuffb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36056,6 +37070,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshuffeb_alt : HInst<
@@ -36065,6 +37080,7 @@ def V6_vshuffeb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36079,6 +37095,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshuffh_alt : HInst<
@@ -36088,6 +37105,7 @@ def V6_vshuffh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36102,6 +37120,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshuffob_alt : HInst<
@@ -36111,6 +37130,7 @@ def V6_vshuffob_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36125,6 +37145,7 @@ let Inst{13-13} = 0b1;
let Inst{31-24} = 0b00011011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshufoeb : HInst<
@@ -36137,6 +37158,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshufoeb_alt : HInst<
@@ -36146,6 +37168,7 @@ def V6_vshufoeb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36160,6 +37183,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshufoeh_alt : HInst<
@@ -36169,6 +37193,7 @@ def V6_vshufoeh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36183,6 +37208,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vshufoh_alt : HInst<
@@ -36192,6 +37218,7 @@ def V6_vshufoh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36206,6 +37233,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubb_alt : HInst<
@@ -36215,6 +37243,7 @@ def V6_vsubb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36229,6 +37258,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubb_dv_alt : HInst<
@@ -36238,6 +37268,7 @@ def V6_vsubb_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36253,6 +37284,7 @@ let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -36263,6 +37295,7 @@ def V6_vsubbnq_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36279,6 +37312,7 @@ let Inst{21-16} = 0b000001;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -36289,6 +37323,7 @@ def V6_vsubbq_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36304,6 +37339,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubbsat_alt : HInst<
@@ -36313,6 +37349,7 @@ def V6_vsubbsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36327,6 +37364,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubbsat_dv_alt : HInst<
@@ -36336,6 +37374,7 @@ def V6_vsubbsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36350,6 +37389,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Qx4 = $Qx4in";
}
@@ -36357,7 +37397,7 @@ def V6_vsubcarryo : HInst<
(outs HvxVR:$Vd32, HvxQR:$Qe4),
(ins HvxVR:$Vu32, HvxVR:$Vv32),
"$Vd32.w,$Qe4 = vsub($Vu32.w,$Vv32.w):carry",
-tc_e35c1e93, TypeCOPROC_VX>, Enc_c1d806, Requires<[UseHVXV66]> {
+tc_e35c1e93, TypeCVI_VA>, Enc_c1d806, Requires<[UseHVXV66]> {
let Inst{7-7} = 0b1;
let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011101101;
@@ -36365,6 +37405,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let hasNewValue2 = 1;
let opNewValue2 = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubh : HInst<
@@ -36377,6 +37418,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubh_alt : HInst<
@@ -36386,6 +37428,7 @@ def V6_vsubh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36400,6 +37443,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubh_dv_alt : HInst<
@@ -36409,6 +37453,7 @@ def V6_vsubh_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36424,6 +37469,7 @@ let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -36434,6 +37480,7 @@ def V6_vsubhnq_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36450,6 +37497,7 @@ let Inst{21-16} = 0b000001;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -36460,6 +37508,7 @@ def V6_vsubhq_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36475,6 +37524,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubhsat_alt : HInst<
@@ -36484,6 +37534,7 @@ def V6_vsubhsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36498,6 +37549,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubhsat_dv_alt : HInst<
@@ -36507,6 +37559,7 @@ def V6_vsubhsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36521,6 +37574,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubhw_alt : HInst<
@@ -36530,6 +37584,7 @@ def V6_vsubhw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36544,6 +37599,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsububh_alt : HInst<
@@ -36553,6 +37609,7 @@ def V6_vsububh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36567,6 +37624,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsububsat_alt : HInst<
@@ -36576,6 +37634,7 @@ def V6_vsububsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36590,6 +37649,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsububsat_dv_alt : HInst<
@@ -36599,6 +37659,7 @@ def V6_vsububsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36613,6 +37674,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubuhsat : HInst<
@@ -36625,6 +37687,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubuhsat_alt : HInst<
@@ -36634,6 +37697,7 @@ def V6_vsubuhsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36648,6 +37712,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubuhsat_dv_alt : HInst<
@@ -36657,6 +37722,7 @@ def V6_vsubuhsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36671,6 +37737,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubuhw_alt : HInst<
@@ -36680,6 +37747,7 @@ def V6_vsubuhw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36694,6 +37762,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011111110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubuwsat_alt : HInst<
@@ -36703,6 +37772,7 @@ def V6_vsubuwsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36717,6 +37787,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubuwsat_dv_alt : HInst<
@@ -36726,6 +37797,7 @@ def V6_vsubuwsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV62]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36740,6 +37812,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubw_alt : HInst<
@@ -36749,6 +37822,7 @@ def V6_vsubw_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36763,6 +37837,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100100;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubw_dv_alt : HInst<
@@ -36772,6 +37847,7 @@ def V6_vsubw_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36787,6 +37863,7 @@ let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -36797,6 +37874,7 @@ def V6_vsubwnq_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36813,6 +37891,7 @@ let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vx32 = $Vx32in";
}
@@ -36823,6 +37902,7 @@ def V6_vsubwq_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36838,6 +37918,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100011;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubwsat_alt : HInst<
@@ -36847,6 +37928,7 @@ def V6_vsubwsat_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36861,6 +37943,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vsubwsat_dv_alt : HInst<
@@ -36870,6 +37953,7 @@ def V6_vsubwsat_dv_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36884,6 +37968,7 @@ let Inst{13-13} = 0b1;
let Inst{31-21} = 0b00011110101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vtmpyb : HInst<
@@ -36896,6 +37981,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vtmpyb_acc : HInst<
@@ -36909,6 +37995,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -36920,6 +38007,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36932,6 +38020,7 @@ def V6_vtmpyb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36946,6 +38035,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vtmpybus_acc : HInst<
@@ -36959,6 +38049,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -36970,6 +38061,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36982,6 +38074,7 @@ def V6_vtmpybus_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -36996,6 +38089,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vtmpyhb_acc : HInst<
@@ -37009,6 +38103,7 @@ let Inst{31-21} = 0b00011001000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -37020,6 +38115,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37032,6 +38128,7 @@ def V6_vtmpyhb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37045,6 +38142,7 @@ let hasNewValue = 1;
let opNewValue = 0;
let hasNewValue2 = 1;
let opNewValue2 = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37060,6 +38158,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vunpackb_alt : HInst<
@@ -37069,6 +38168,7 @@ def V6_vunpackb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37083,6 +38183,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vunpackh_alt : HInst<
@@ -37092,6 +38193,7 @@ def V6_vunpackh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37107,6 +38209,7 @@ let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -37118,6 +38221,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
@@ -37133,6 +38237,7 @@ let Inst{31-16} = 0b0001111000000000;
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
let Constraints = "$Vxx32 = $Vxx32in";
}
@@ -37144,6 +38249,7 @@ PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
let isAccumulator = 1;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37159,6 +38265,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vunpackub_alt : HInst<
@@ -37168,6 +38275,7 @@ def V6_vunpackub_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37182,6 +38290,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vunpackuh_alt : HInst<
@@ -37191,6 +38300,7 @@ def V6_vunpackuh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37202,6 +38312,7 @@ def V6_vwhist128 : HInst<
tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
let Inst{13-0} = 0b10010010000000;
let Inst{31-16} = 0b0001111000000000;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist128m : HInst<
@@ -37212,6 +38323,7 @@ tc_b28e51aa, TypeCVI_HIST>, Enc_efaed8, Requires<[UseHVXV62]> {
let Inst{7-0} = 0b10000000;
let Inst{13-9} = 0b10011;
let Inst{31-16} = 0b0001111000000000;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist128q : HInst<
@@ -37222,6 +38334,7 @@ tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
let Inst{13-0} = 0b10010010000000;
let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist128qm : HInst<
@@ -37233,6 +38346,7 @@ let Inst{7-0} = 0b10000000;
let Inst{13-9} = 0b10011;
let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist256 : HInst<
@@ -37242,6 +38356,7 @@ def V6_vwhist256 : HInst<
tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
let Inst{13-0} = 0b10001010000000;
let Inst{31-16} = 0b0001111000000000;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist256_sat : HInst<
@@ -37251,6 +38366,7 @@ def V6_vwhist256_sat : HInst<
tc_1381a97c, TypeCVI_HIST>, Enc_e3b0c4, Requires<[UseHVXV62]> {
let Inst{13-0} = 0b10001110000000;
let Inst{31-16} = 0b0001111000000000;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist256q : HInst<
@@ -37261,6 +38377,7 @@ tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
let Inst{13-0} = 0b10001010000000;
let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vwhist256q_sat : HInst<
@@ -37271,6 +38388,7 @@ tc_e3f68a46, TypeCVI_HIST>, Enc_217147, Requires<[UseHVXV62]> {
let Inst{13-0} = 0b10001110000000;
let Inst{21-16} = 0b000010;
let Inst{31-24} = 0b00011110;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vxor : HInst<
@@ -37283,6 +38401,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00011100001;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vzb : HInst<
@@ -37295,6 +38414,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vzb_alt : HInst<
@@ -37304,6 +38424,7 @@ def V6_vzb_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37318,6 +38439,7 @@ let Inst{13-13} = 0b0;
let Inst{31-16} = 0b0001111000000010;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_vzh_alt : HInst<
@@ -37327,6 +38449,7 @@ def V6_vzh_alt : HInst<
PSEUDO, TypeMAPPING>, Requires<[UseHVXV60]> {
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37340,6 +38463,7 @@ let Inst{7-0} = 0b00000000;
let Inst{12-11} = 0b00;
let Inst{31-21} = 0b00101100000;
let addrMode = BaseImmOffset;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37353,6 +38477,7 @@ let Inst{7-0} = 0b00000000;
let Inst{13-11} = 0b000;
let Inst{31-21} = 0b00101101000;
let addrMode = PostInc;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37366,6 +38491,7 @@ tc_a0dbea28, TypeCVI_ZW>, Enc_44661f, Requires<[UseHVXV66,UseZReg]> {
let Inst{12-0} = 0b0000000000001;
let Inst{31-21} = 0b00101101000;
let addrMode = PostInc;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37380,6 +38506,7 @@ let Inst{7-0} = 0b00000000;
let Inst{31-21} = 0b00101100100;
let isPredicated = 1;
let addrMode = BaseImmOffset;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37394,6 +38521,7 @@ let Inst{13-13} = 0b0;
let Inst{31-21} = 0b00101101100;
let isPredicated = 1;
let addrMode = PostInc;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37408,6 +38536,7 @@ let Inst{10-0} = 0b00000000001;
let Inst{31-21} = 0b00101101100;
let isPredicated = 1;
let addrMode = PostInc;
+let isCVI = 1;
let mayLoad = 1;
let isRestrictNoSlot1Store = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37422,6 +38551,7 @@ let Inst{13-5} = 0b000001001;
let Inst{31-21} = 0b00011001101;
let hasNewValue = 1;
let opNewValue = 0;
+let isCVI = 1;
let DecoderNamespace = "EXT_mmvec";
}
def V6_zld0 : HInst<
@@ -37429,6 +38559,7 @@ def V6_zld0 : HInst<
(ins IntRegs:$Rt32),
"z = vmem($Rt32)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37438,6 +38569,7 @@ def V6_zldp0 : HInst<
(ins PredRegs:$Pv4, IntRegs:$Rt32),
"if ($Pv4) z = vmem($Rt32)",
PSEUDO, TypeMAPPING>, Requires<[UseHVXV66]> {
+let isCVI = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
let DecoderNamespace = "EXT_mmvec";
@@ -37446,7 +38578,7 @@ def Y2_barrier : HInst<
(outs),
(ins),
"barrier",
-tc_8c99de45, TypeST>, Enc_e3b0c4 {
+tc_77f94a5e, TypeST>, Enc_e3b0c4 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-16} = 0b1010100000000000;
let isSoloAX = 1;
@@ -37456,7 +38588,7 @@ def Y2_break : HInst<
(outs),
(ins),
"brkpt",
-tc_9ad9998f, TypeCR>, Enc_e3b0c4 {
+tc_55255f2b, TypeCR>, Enc_e3b0c4 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-16} = 0b0110110000100000;
let isSolo = 1;
@@ -37465,7 +38597,7 @@ def Y2_dccleana : HInst<
(outs),
(ins IntRegs:$Rs32),
"dccleana($Rs32)",
-tc_b857bf4e, TypeST>, Enc_ecbcc8 {
+tc_b1ae5f67, TypeST>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b10100000000;
let isRestrictSlot1AOK = 1;
@@ -37475,7 +38607,7 @@ def Y2_dccleaninva : HInst<
(outs),
(ins IntRegs:$Rs32),
"dccleaninva($Rs32)",
-tc_b857bf4e, TypeST>, Enc_ecbcc8 {
+tc_b1ae5f67, TypeST>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b10100000010;
let isRestrictSlot1AOK = 1;
@@ -37485,7 +38617,7 @@ def Y2_dcfetch : HInst<
(outs),
(ins IntRegs:$Rs32),
"dcfetch($Rs32)",
-tc_d63f638c, TypeMAPPING> {
+tc_d45ba9cd, TypeMAPPING> {
let hasSideEffects = 1;
let isPseudo = 1;
let isCodeGenOnly = 1;
@@ -37494,7 +38626,7 @@ def Y2_dcfetchbo : HInst<
(outs),
(ins IntRegs:$Rs32, u11_3Imm:$Ii),
"dcfetch($Rs32+#$Ii)",
-tc_9ca930f7, TypeLD>, Enc_2d829e {
+tc_2237d952, TypeLD>, Enc_2d829e {
let Inst{13-11} = 0b000;
let Inst{31-21} = 0b10010100000;
let addrMode = BaseImmOffset;
@@ -37505,7 +38637,7 @@ def Y2_dcinva : HInst<
(outs),
(ins IntRegs:$Rs32),
"dcinva($Rs32)",
-tc_b857bf4e, TypeST>, Enc_ecbcc8 {
+tc_b1ae5f67, TypeST>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b10100000001;
let isRestrictSlot1AOK = 1;
@@ -37515,7 +38647,7 @@ def Y2_dczeroa : HInst<
(outs),
(ins IntRegs:$Rs32),
"dczeroa($Rs32)",
-tc_b857bf4e, TypeST>, Enc_ecbcc8 {
+tc_b1ae5f67, TypeST>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b10100000110;
let isRestrictSlot1AOK = 1;
@@ -37526,7 +38658,7 @@ def Y2_icinva : HInst<
(outs),
(ins IntRegs:$Rs32),
"icinva($Rs32)",
-tc_5d7f5414, TypeJ>, Enc_ecbcc8 {
+tc_0ba0d5da, TypeJ>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01010110110;
let isSolo = 1;
@@ -37535,7 +38667,7 @@ def Y2_isync : HInst<
(outs),
(ins),
"isync",
-tc_8b121f4a, TypeJ>, Enc_e3b0c4 {
+tc_9b34f5e0, TypeJ>, Enc_e3b0c4 {
let Inst{13-0} = 0b00000000000010;
let Inst{31-16} = 0b0101011111000000;
let isSolo = 1;
@@ -37544,7 +38676,7 @@ def Y2_syncht : HInst<
(outs),
(ins),
"syncht",
-tc_8c99de45, TypeST>, Enc_e3b0c4 {
+tc_77f94a5e, TypeST>, Enc_e3b0c4 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-16} = 0b1010100001000000;
let isSolo = 1;
@@ -37553,7 +38685,7 @@ def Y2_wait : HInst<
(outs),
(ins IntRegs:$Rs32),
"wait($Rs32)",
-tc_174516e8, TypeCR>, Enc_ecbcc8, Requires<[HasV65]> {
+tc_d7718fbe, TypeCR>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01100100010;
let isSolo = 1;
@@ -37562,7 +38694,7 @@ def Y4_l2fetch : HInst<
(outs),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"l2fetch($Rs32,$Rt32)",
-tc_fe211424, TypeST>, Enc_ca3887 {
+tc_a3070909, TypeST>, Enc_ca3887 {
let Inst{7-0} = 0b00000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100110000;
@@ -37574,7 +38706,7 @@ def Y4_trace : HInst<
(outs),
(ins IntRegs:$Rs32),
"trace($Rs32)",
-tc_6b25e783, TypeCR>, Enc_ecbcc8 {
+tc_d7718fbe, TypeCR>, Enc_ecbcc8 {
let Inst{13-0} = 0b00000000000000;
let Inst{31-21} = 0b01100010010;
let isSoloAX = 1;
@@ -37583,7 +38715,7 @@ def Y5_l2fetch : HInst<
(outs),
(ins IntRegs:$Rs32, DoubleRegs:$Rtt32),
"l2fetch($Rs32,$Rtt32)",
-tc_fe211424, TypeST>, Enc_e6abcf {
+tc_a3070909, TypeST>, Enc_e6abcf {
let Inst{7-0} = 0b00000000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b10100110100;
@@ -37591,11 +38723,37 @@ let isSoloAX = 1;
let mayStore = 1;
let hasSideEffects = 1;
}
+def Y6_diag : HInst<
+(outs),
+(ins IntRegs:$Rs32),
+"diag($Rs32)",
+tc_2c3e17fc, TypeCR>, Enc_ecbcc8, Requires<[HasV67]> {
+let Inst{13-0} = 0b00000000100000;
+let Inst{31-21} = 0b01100010010;
+}
+def Y6_diag0 : HInst<
+(outs),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"diag0($Rss32,$Rtt32)",
+tc_28e55c6f, TypeCR>, Enc_b00112, Requires<[HasV67]> {
+let Inst{7-0} = 0b01000000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100010010;
+}
+def Y6_diag1 : HInst<
+(outs),
+(ins DoubleRegs:$Rss32, DoubleRegs:$Rtt32),
+"diag1($Rss32,$Rtt32)",
+tc_28e55c6f, TypeCR>, Enc_b00112, Requires<[HasV67]> {
+let Inst{7-0} = 0b01100000;
+let Inst{13-13} = 0b0;
+let Inst{31-21} = 0b01100010010;
+}
def dep_A2_addsat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rd32 = add($Rs32,$Rt32):sat:deprecated",
-tc_779080bf, TypeALU64>, Enc_5ab2be {
+tc_8a825db2, TypeALU64>, Enc_5ab2be {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101100;
@@ -37608,7 +38766,7 @@ def dep_A2_subsat : HInst<
(outs IntRegs:$Rd32),
(ins IntRegs:$Rt32, IntRegs:$Rs32),
"$Rd32 = sub($Rt32,$Rs32):sat:deprecated",
-tc_779080bf, TypeALU64>, Enc_bd6011 {
+tc_8a825db2, TypeALU64>, Enc_bd6011 {
let Inst{7-5} = 0b100;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010101100;
@@ -37621,8 +38779,476 @@ def dep_S2_packhl : HInst<
(outs DoubleRegs:$Rdd32),
(ins IntRegs:$Rs32, IntRegs:$Rt32),
"$Rdd32 = packhl($Rs32,$Rt32):deprecated",
-tc_946df596, TypeALU64>, Enc_be32a5 {
+tc_5da50c4b, TypeALU64>, Enc_be32a5 {
let Inst{7-5} = 0b000;
let Inst{13-13} = 0b0;
let Inst{31-21} = 0b11010100000;
}
+def dup_A2_add : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, IntRegs:$Rt32),
+"$Rd32 = add($Rs32,$Rt32)",
+tc_388f9897, TypeALU32_3op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+}
+def dup_A2_addi : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = add($Rs32,#$Ii)",
+tc_388f9897, TypeALU32_ADDI>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def dup_A2_andir : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = and($Rs32,#$Ii)",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def dup_A2_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, s8_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def dup_A2_sxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxtb($Rs32)",
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+}
+def dup_A2_sxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = sxth($Rs32)",
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+}
+def dup_A2_tfr : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = $Rs32",
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+}
+def dup_A2_tfrsi : HInst<
+(outs IntRegs:$Rd32),
+(ins s32_0Imm:$Ii),
+"$Rd32 = #$Ii",
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 16;
+let opExtentAlign = 0;
+}
+def dup_A2_zxtb : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxtb($Rs32)",
+PSEUDO, TypeMAPPING>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+}
+def dup_A2_zxth : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32),
+"$Rd32 = zxth($Rs32)",
+tc_9124c04f, TypeALU32_2op>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+}
+def dup_A4_combineii : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s8_0Imm:$Ii, u32_0Imm:$II),
+"$Rdd32 = combine(#$Ii,#$II)",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 0;
+let opExtentBits = 6;
+let opExtentAlign = 0;
+}
+def dup_A4_combineir : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins s32_0Imm:$Ii, IntRegs:$Rs32),
+"$Rdd32 = combine(#$Ii,$Rs32)",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def dup_A4_combineri : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rdd32 = combine($Rs32,#$Ii)",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def dup_C2_cmoveif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4) $Rd32 = #$Ii",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def dup_C2_cmoveit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4) $Rd32 = #$Ii",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def dup_C2_cmovenewif : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if (!$Pu4.new) $Rd32 = #$Ii",
+tc_4ac61d92, TypeALU32_2op>, Requires<[HasV67]> {
+let isPredicated = 1;
+let isPredicatedFalse = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def dup_C2_cmovenewit : HInst<
+(outs IntRegs:$Rd32),
+(ins PredRegs:$Pu4, s32_0Imm:$Ii),
+"if ($Pu4.new) $Rd32 = #$Ii",
+tc_4ac61d92, TypeALU32_2op>, Requires<[HasV67]> {
+let isPredicated = 1;
+let hasNewValue = 1;
+let opNewValue = 0;
+let AsmVariantName = "NonParsable";
+let isPredicatedNew = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 0;
+}
+def dup_C2_cmpeqi : HInst<
+(outs PredRegs:$Pd4),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Pd4 = cmp.eq($Rs32,#$Ii)",
+tc_388f9897, TypeALU32_2op>, Requires<[HasV67]> {
+let AsmVariantName = "NonParsable";
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 10;
+let opExtentAlign = 0;
+}
+def dup_L2_deallocframe : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32),
+"$Rdd32 = deallocframe($Rs32):raw",
+tc_aee6250c, TypeLD>, Requires<[HasV67]> {
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let Uses = [FRAMEKEY];
+let Defs = [R29];
+let isPseudo = 1;
+}
+def dup_L2_loadrb_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memb($Rs32+#$Ii)",
+tc_eed07714, TypeLD>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def dup_L2_loadrd_io : HInst<
+(outs DoubleRegs:$Rdd32),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii),
+"$Rdd32 = memd($Rs32+#$Ii)",
+tc_eed07714, TypeLD>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def dup_L2_loadrh_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memh($Rs32+#$Ii)",
+tc_eed07714, TypeLD>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def dup_L2_loadri_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii),
+"$Rd32 = memw($Rs32+#$Ii)",
+tc_eed07714, TypeLD>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def dup_L2_loadrub_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii),
+"$Rd32 = memub($Rs32+#$Ii)",
+tc_eed07714, TypeLD>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def dup_L2_loadruh_io : HInst<
+(outs IntRegs:$Rd32),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii),
+"$Rd32 = memuh($Rs32+#$Ii)",
+tc_eed07714, TypeLD>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayLoad = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def dup_S2_allocframe : HInst<
+(outs IntRegs:$Rx32),
+(ins IntRegs:$Rx32in, u11_3Imm:$Ii),
+"allocframe($Rx32,#$Ii):raw",
+tc_74a42bda, TypeST>, Requires<[HasV67]> {
+let hasNewValue = 1;
+let opNewValue = 0;
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let Uses = [FRAMEKEY, FRAMELIMIT, R30, R31];
+let Defs = [R30];
+let isPseudo = 1;
+let Constraints = "$Rx32 = $Rx32in";
+}
+def dup_S2_storerb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s32_0Imm:$Ii, IntRegs:$Rt32),
+"memb($Rs32+#$Ii) = $Rt32",
+tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 11;
+let opExtentAlign = 0;
+}
+def dup_S2_storerd_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s29_3Imm:$Ii, DoubleRegs:$Rtt32),
+"memd($Rs32+#$Ii) = $Rtt32",
+tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = DoubleWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 14;
+let opExtentAlign = 3;
+}
+def dup_S2_storerh_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s31_1Imm:$Ii, IntRegs:$Rt32),
+"memh($Rs32+#$Ii) = $Rt32",
+tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = HalfWordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 12;
+let opExtentAlign = 1;
+}
+def dup_S2_storeri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, s30_2Imm:$Ii, IntRegs:$Rt32),
+"memw($Rs32+#$Ii) = $Rt32",
+tc_a9edeffa, TypeST>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 1;
+let isExtentSigned = 1;
+let opExtentBits = 13;
+let opExtentAlign = 2;
+}
+def dup_S4_storeirb_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_0Imm:$Ii, s32_0Imm:$II),
+"memb($Rs32+#$Ii) = #$II",
+tc_838c4d7a, TypeV4LDST>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = ByteAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
+def dup_S4_storeiri_io : HInst<
+(outs),
+(ins IntRegs:$Rs32, u6_2Imm:$Ii, s32_0Imm:$II),
+"memw($Rs32+#$Ii) = #$II",
+tc_838c4d7a, TypeV4LDST>, Requires<[HasV67]> {
+let addrMode = BaseImmOffset;
+let accessSize = WordAccess;
+let AsmVariantName = "NonParsable";
+let mayStore = 1;
+let isPseudo = 1;
+let isExtendable = 1;
+let opExtendable = 2;
+let isExtentSigned = 1;
+let opExtentBits = 8;
+let opExtentAlign = 0;
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
index 61a1df5eb94b..0143d6f44d88 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepMapAsm2Intrin.td
@@ -1,2750 +1,2680 @@
-//===-------------------------------------------------------*- tablegen -*-===//
+//===----------------------------------------------------------------------===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
// V5 Scalar Instructions.
-def: Pat<(int_hexagon_S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwh DoubleRegs:$src1),
- (S2_vsatwh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysu_up IntRegs:$src1, IntRegs:$src2),
- (M2_mpysu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2),
- (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1),
- (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_notp DoubleRegs:$src1),
- (A2_notp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_brevp DoubleRegs:$src1),
- (S2_brevp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl1 IntRegs:$src1),
- (S2_cl1 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplte IntRegs:$src1, IntRegs:$src2),
- (C4_cmplte IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1),
- (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_abssat IntRegs:$src1),
- (A2_abssat IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_cmpeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_cmpgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_cmpgtu IntRegs:$src1, IntRegs:$src2),
- (C2_cmpgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (C2_cmpgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2),
- (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1),
- (F2_conv_df2uw_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpheq IntRegs:$src1, IntRegs:$src2),
- (A4_cmpheq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
- (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2),
- (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1),
- (F2_conv_w2sf IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_not PredRegs:$src1),
- (C2_not PredRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_tfrpr PredRegs:$src1),
- (C2_tfrpr PredRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2),
- (A4_cmpbgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+ (C2_tfrpr (C2_cmpgtp DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
(A4_rcmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_orp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2),
- (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2),
+ (A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_rcmpneq IntRegs:$src1, IntRegs:$src2),
+ (A4_rcmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_bitsset IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C2_bitsclr IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_nbitsset IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclr IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_nbitsclr IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_cmpgti IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+ (C2_tfrpr (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmpneq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_cmpneq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplte IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_cmplte IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (C4_cmplteu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_or PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_xor PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_xor (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_not PredRegs:$src1),
+ (C2_tfrpr (C2_not (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C2_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_and (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_or (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_and_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_andn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
+ (C2_tfrpr (C4_or_orn (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2), (C2_tfrrp PredRegs:$src3)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_pxfer_map PredRegs:$src1),
+ (C2_tfrpr (C2_pxfer_map (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_any8 PredRegs:$src1),
+ (C2_tfrpr (C2_any8 (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_all8 PredRegs:$src1),
+ (C2_tfrpr (C2_all8 (C2_tfrrp PredRegs:$src1)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_vitpack PredRegs:$src1, PredRegs:$src2),
+ (C2_vitpack (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (C2_mux (C2_tfrrp PredRegs:$src1), IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3),
+ (C2_muxii (C2_tfrrp PredRegs:$src1), s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (C2_muxir (C2_tfrrp PredRegs:$src1), IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+ (C2_muxri (C2_tfrrp PredRegs:$src1), s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (C2_vmux (C2_tfrrp PredRegs:$src1), DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_mask PredRegs:$src1),
+ (C2_mask (C2_tfrrp PredRegs:$src1))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2),
- (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (A4_vcmpbeq_any DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpbeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpbgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpbgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpbgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2),
- (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2),
- (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (A4_cmpbgti IntRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2),
- (M4_pmpyw IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathb DoubleRegs:$src1),
- (S2_vsathb DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (A2_pxorf PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_absp DoubleRegs:$src1),
- (A2_absp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_all8 PredRegs:$src1),
- (C2_all8 PredRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsset IntRegs:$src1, IntRegs:$src2),
- (C2_bitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (M2_mpysip IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2),
- (M2_mpysin IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (A4_vcmpheqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmphgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmpheq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmphgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtu IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_cmphgtu IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_cmphgtui IntRegs:$src1, u32_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (A2_vcmpwgtu DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
+ (C2_tfrpr (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A4_boundscheck IntRegs:$src1, DoubleRegs:$src2),
- (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2),
- (M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (A4_boundscheck IntRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_tfrpr PredRegs:$src1),
+ (C2_tfrpr (C2_tfrrp PredRegs:$src1))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C2_tfrrp IntRegs:$src1),
+ (C2_tfrpr (C2_tfrrp IntRegs:$src1))>, Requires<[HasV5]>;
def: Pat<(int_hexagon_C4_fastcorner9 PredRegs:$src1, PredRegs:$src2),
- (C4_fastcorner9 PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2),
- (M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subsat IntRegs:$src1, IntRegs:$src2),
- (A2_subsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (C4_fastcorner9 (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2),
+ (C2_tfrpr (C4_fastcorner9_not (C2_tfrrp PredRegs:$src1), (C2_tfrrp PredRegs:$src2)))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpy_nac_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1),
- (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1),
- (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2),
- (F2_sfadd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sub IntRegs:$src1, IntRegs:$src2),
- (A2_sub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4),
- (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2),
- (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (A4_vcmpwgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svsubhs IntRegs:$src1, IntRegs:$src2),
- (A2_svsubhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1),
- (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (C2_cmpgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1),
- (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clbp DoubleRegs:$src1),
- (S2_clbp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_deinterleave DoubleRegs:$src1),
- (S2_deinterleave DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_any8 PredRegs:$src1),
- (C2_any8 PredRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2),
- (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1),
- (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1),
- (S2_vsathb_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clrbit_r IntRegs:$src1, IntRegs:$src2),
- (S2_clrbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_or_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
- (A4_vcmpwgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmpbeq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmphgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1),
- (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
- (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
- (A4_vcmphgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
- (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
- (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
- (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_hh IntRegs:$src1, IntRegs:$src2),
- (A2_combine_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_negsat IntRegs:$src1),
- (A2_negsat IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_bitsplit IntRegs:$src1, IntRegs:$src2),
- (A4_bitsplit IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabshsat DoubleRegs:$src1),
- (A2_vabshsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyui IntRegs:$src1, IntRegs:$src2),
- (M2_mpyui IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_sat_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffixupn IntRegs:$src1, IntRegs:$src2),
- (F2_sffixupn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_xorp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_xorp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_tfrpcp DoubleRegs:$src1),
- (A4_tfrpcp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_zxtb IntRegs:$src1),
- (A2_zxtb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_zxth IntRegs:$src1),
- (A2_zxth IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C4_cmpneqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
- (A2_sat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyd_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addsat IntRegs:$src1, IntRegs:$src2),
- (A2_addsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
- (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
- (C2_bitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_addp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- (A4_addp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_xor PredRegs:$src1, PredRegs:$src2),
- (C2_xor PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2ud_chop DoubleRegs:$src1),
- (F2_conv_df2ud_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_or_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathub DoubleRegs:$src1),
- (S2_vsathub DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2sf DoubleRegs:$src1),
- (F2_conv_df2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sxth IntRegs:$src1),
- (A2_sxth IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sxtb IntRegs:$src1),
- (A2_sxtb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_or_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_sxtw IntRegs:$src1),
- (A2_sxtw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl1p DoubleRegs:$src1),
- (S2_cl1p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_andnp DoubleRegs:$src1, DoubleRegs:$src2),
- (A4_andnp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (C2_vmux PredRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyu_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyu_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpeq IntRegs:$src1, IntRegs:$src2),
- (F2_sfcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddb_map DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmpheq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clbnorm IntRegs:$src1),
- (S2_clbnorm IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
- (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_acc_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mpyud_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
- (S2_tstbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3),
- (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2),
+ (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_dpmpyss_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyuu_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2),
(M2_mpy_up_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
- (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
- (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C2_cmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
- (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (A4_vcmpweqi DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2),
+ (M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyu_up IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpysu_up IntRegs:$src1, IntRegs:$src2),
+ (M2_mpysu_up IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyi IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyi IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mpyui IntRegs:$src1, IntRegs:$src2),
+ (M2_mpyui IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_subacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mpyrr_addr IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3),
+ (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
+ (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3),
+ (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_mpyrr_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2s_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2s_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2su_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2su_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
+ (M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_vmac2 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vdmpyrs_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vdmpyrs_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2),
+ (M5_vrmpybuu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
+ (M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
+ (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmpybsu IntRegs:$src1, IntRegs:$src2),
+ (M5_vmpybsu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M5_vmacbsu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
+ (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_vdmpys_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
- (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
- (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
- (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vdmpys_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cmacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cmacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cmacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_cmacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_ct1 IntRegs:$src1),
- (S2_ct1 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_ct0 IntRegs:$src1),
- (S2_ct0 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyuu_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpysc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpysc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cnacsc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cnacsc_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
+ (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2),
+ (M2_vrcmpys_s1rp DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmachs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmachs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmachs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmachs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyeh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyeh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyoh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M4_vrmpyoh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyoh_acc_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_vrmpyoh_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyh_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2),
+ (M2_hmmpyh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacuhs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacuhs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmaculs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyul_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyul_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
- (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmphgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_andn PredRegs:$src1, PredRegs:$src2),
- (C2_andn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s0pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_rcmpeqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_mmpyuh_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
(M2_mmpyuh_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
- (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
- (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2),
- (A4_combineii s8_0ImmPred_timm:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2),
- (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_and_orn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M5_vmacbuu DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpeq IntRegs:$src1, IntRegs:$src2),
- (A4_rcmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
- (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
- (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacuhs_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2),
- (S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combinew IntRegs:$src1, IntRegs:$src2),
- (A2_combinew IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmaci_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vrcmacr_s0c DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_cmacr_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyi_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vrcmpyr_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2),
(M2_cmpyi_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsset IntRegs:$src1, IntRegs:$src2),
- (C4_nbitsset IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_modwrapu IntRegs:$src1, IntRegs:$src2),
- (A4_modwrapu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_rcmpneq IntRegs:$src1, IntRegs:$src2),
- (A4_rcmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1),
- (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1),
- (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
+ (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyi_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2),
(M4_cmpyr_wh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
- (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2),
- (A4_cmpbeqi IntRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
- (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
- (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyi_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2),
+ (M4_cmpyr_whc DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vcmpy_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vcrotate DoubleRegs:$src1, IntRegs:$src2),
+ (S2_vcrotate DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4),
+ (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3),
+ (S4_vrcrotate DoubleRegs:$src1, IntRegs:$src2, u2_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vcnegh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_vcnegh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_vrcnegh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_pmpyw IntRegs:$src1, IntRegs:$src2),
+ (M4_pmpyw IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vpmpyh IntRegs:$src1, IntRegs:$src2),
+ (M4_vpmpyh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_pmpyw_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_vpmpyh_acc DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_add IntRegs:$src1, IntRegs:$src2),
+ (A2_add IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sub IntRegs:$src1, IntRegs:$src2),
+ (A2_sub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addsat IntRegs:$src1, IntRegs:$src2),
+ (A2_addsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subsat IntRegs:$src1, IntRegs:$src2),
+ (A2_subsat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_addh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_subh_h16_sat_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_aslh IntRegs:$src1),
+ (A2_aslh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_asrh IntRegs:$src1),
+ (A2_asrh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_addsp IntRegs:$src1, DoubleRegs:$src2),
+ (A2_addsp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_subp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_negsat IntRegs:$src1),
+ (A2_negsat IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_abs IntRegs:$src1),
+ (A2_abs IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_abssat IntRegs:$src1),
+ (A2_abssat IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vconj DoubleRegs:$src1),
+ (A2_vconj DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
+ (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_absp DoubleRegs:$src1),
+ (A2_absp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_max IntRegs:$src1, IntRegs:$src2),
+ (A2_max IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
+ (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
+ (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_minu IntRegs:$src1, IntRegs:$src2),
(A2_minu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_maxup DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_maxup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_minp DoubleRegs:$src1, DoubleRegs:$src2),
(A2_minp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_minup DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_minup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
+ (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrsi s32_0ImmPred_timm:$src1),
+ (A2_tfrsi s32_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrp DoubleRegs:$src1),
+ (A2_tfrp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_zxtb IntRegs:$src1),
+ (A2_zxtb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxtb IntRegs:$src1),
+ (A2_sxtb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_zxth IntRegs:$src1),
+ (A2_zxth IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxth IntRegs:$src1),
+ (A2_sxth IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combinew IntRegs:$src1, IntRegs:$src2),
+ (A2_combinew IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A4_combineri IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2),
+ (A4_combineir s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2),
+ (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_hh IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_hl IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_lh IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_combine_ll IntRegs:$src1, IntRegs:$src2),
+ (A2_combine_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2),
+ (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2),
+ (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_and IntRegs:$src1, IntRegs:$src2),
+ (A2_and IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_or IntRegs:$src1, IntRegs:$src2),
+ (A2_or IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_xor IntRegs:$src1, IntRegs:$src2),
+ (A2_xor IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M4_xor_xacc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
+ (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
+ (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_andnp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A4_andnp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_addaddi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
+ (S4_subaddi IntRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_and_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_and_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_or_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_or_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_or_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_or_xor IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyuh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyuh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpge IntRegs:$src1, IntRegs:$src2),
- (F2_sfcmpge IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfmin IntRegs:$src1, IntRegs:$src2),
- (F2_sfmin IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfcmpgt IntRegs:$src1, IntRegs:$src2),
- (F2_sfcmpgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vpmpyh IntRegs:$src1, IntRegs:$src2),
- (M4_vpmpyh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacuhs_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+ (S4_or_andix IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_andi IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
+ (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_xor_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2),
+ (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2),
+ (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_orp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_orp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_xorp DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_xorp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_notp DoubleRegs:$src1),
+ (A2_notp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sxtw IntRegs:$src1),
+ (A2_sxtw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_sat DoubleRegs:$src1),
+ (A2_sat DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_roundsat DoubleRegs:$src1),
(A2_roundsat DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_ct1p DoubleRegs:$src1),
- (S2_ct1p DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
- (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (C4_cmplteui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_tfrcpp CtrRegs64:$src1),
- (A4_tfrcpp CtrRegs64:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_cmphgti IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgtu IntRegs:$src1, IntRegs:$src2),
- (A4_cmphgtu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_sath IntRegs:$src1),
(A2_sath IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satuh IntRegs:$src1),
+ (A2_satuh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
+ (A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_satb IntRegs:$src1),
(A2_satb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4),
- (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddb_map DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vadduhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vaddws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2),
(S4_vxaddsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxsubaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2),
(S4_vxaddsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_asrh IntRegs:$src1),
- (A2_asrh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2),
- (S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_or PredRegs:$src1, PredRegs:$src2),
- (C2_or PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyul_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmacr_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_xor IntRegs:$src1, IntRegs:$src2),
- (A2_xor IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_add IntRegs:$src1, IntRegs:$src2),
- (A2_add IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfinvsqrta IntRegs:$src1),
- (F2_sfinvsqrta IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_ct0p DoubleRegs:$src1),
- (S2_ct0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxsubaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxaddsubhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_vxsubaddhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svavgh IntRegs:$src1, IntRegs:$src2),
+ (A2_svavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svavghs IntRegs:$src1, IntRegs:$src2),
+ (A2_svavghs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svnavgh IntRegs:$src1, IntRegs:$src2),
+ (A2_svnavgh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_svaddh IntRegs:$src1, IntRegs:$src2),
(A2_svaddh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vcrotate DoubleRegs:$src1, IntRegs:$src2),
- (S2_vcrotate DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_aslh IntRegs:$src1),
- (A2_aslh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2),
- (M2_hmmpyl_rs1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsplatrh IntRegs:$src1),
- (S2_vsplatrh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsplatrb IntRegs:$src1),
- (S2_vsplatrb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_hh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyr_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_dpmpyss_rnd_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3),
- (C2_muxri PredRegs:$src1, s32_0ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vmac2es_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vmac2es_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_pxfer_map PredRegs:$src1),
- (C2_pxfer_map PredRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyu_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_acc_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_cmpeqp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3),
- (M4_mpyri_addi u32_0ImmPred_timm:$src1, IntRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
- (M2_macsip IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrcrr CtrRegs:$src1),
- (A2_tfrcrr CtrRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
- (M2_macsin IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_orn PredRegs:$src1, PredRegs:$src2),
- (C2_orn PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_and_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfmpy IntRegs:$src1, IntRegs:$src2),
- (F2_sfmpy IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_and_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_mask PredRegs:$src1),
- (C2_mask PredRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_up_s1_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2),
- (A4_vcmpbgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M5_vrmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svadduhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svadduhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
+ (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svsubhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_svsubuhs IntRegs:$src1, IntRegs:$src2),
+ (A2_svsubuhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (A2_vraddub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vradduh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsububs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubuhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vsubws DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
+ (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabshsat DoubleRegs:$src1),
+ (A2_vabshsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
+ (A2_vabsw DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vabswsat DoubleRegs:$src1),
+ (A2_vabswsat DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vabsdiffw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2),
+ (M2_vabsdiffh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vrsadub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrrcr IntRegs:$src1),
- (A2_tfrrcr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (M2_vrcmpys_acc_s1 DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (M2_accii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2),
- (A5_vaddhubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (A2_vrsadub_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavgwcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguwr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavgubr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vnavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_round_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_rr IntRegs:$src1, IntRegs:$src2),
+ (A4_round_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_round_rr_sat IntRegs:$src1, IntRegs:$src2),
+ (A4_round_rr_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
+ (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrminuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminb DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vmaxb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2),
(A2_vmaxh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
- (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgt IntRegs:$src1, IntRegs:$src2),
- (C2_cmpgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2d_chop DoubleRegs:$src1),
- (F2_conv_df2d_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
- (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2),
+ (A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_modwrapu IntRegs:$src1, IntRegs:$src2),
+ (A4_modwrapu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfadd IntRegs:$src1, IntRegs:$src2),
+ (F2_sfadd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
+ (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmpy IntRegs:$src1, IntRegs:$src2),
+ (F2_sfmpy IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffma IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
+ (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, (C2_tfrrp PredRegs:$src4))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffma_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (F2_sffms_lib IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpeq IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpeq IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpgt IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpgt IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpge IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpge IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfcmpuo IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (F2_sfcmpuo IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmax IntRegs:$src1, IntRegs:$src2),
+ (F2_sfmax IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfmin IntRegs:$src1, IntRegs:$src2),
+ (F2_sfmin IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_xor_andn IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3),
- (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
- (M5_vdmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_nac_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A2_addi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_addp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2s_s1pack IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbpnorm DoubleRegs:$src1),
- (S4_clbpnorm DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_rr_sat IntRegs:$src1, IntRegs:$src2),
- (A4_round_rr_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_nacci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+ (C2_tfrpr (F2_sfclass IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_p u10_0ImmPred_timm:$src1),
+ (F2_sfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sfimm_n u10_0ImmPred_timm:$src1),
+ (F2_sfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupn IntRegs:$src1, IntRegs:$src2),
+ (F2_sffixupn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupd IntRegs:$src1, IntRegs:$src2),
+ (F2_sffixupd IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_sffixupr IntRegs:$src1),
+ (F2_sffixupr IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpge DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2),
+ (C2_tfrpr (F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1),
+ (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1),
+ (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2df IntRegs:$src1),
+ (F2_conv_sf2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2sf DoubleRegs:$src1),
+ (F2_conv_df2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_uw2sf IntRegs:$src1),
+ (F2_conv_uw2sf IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
+ (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_w2sf IntRegs:$src1),
+ (F2_conv_w2sf IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_w2df IntRegs:$src1),
+ (F2_conv_w2df IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
+ (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_ud2df DoubleRegs:$src1),
+ (F2_conv_ud2df DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_d2sf DoubleRegs:$src1),
+ (F2_conv_d2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_d2df DoubleRegs:$src1),
+ (F2_conv_d2df DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_sf2uw IntRegs:$src1),
(F2_conv_sf2uw IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2w IntRegs:$src1),
+ (F2_conv_sf2w IntRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_sf2ud IntRegs:$src1),
(F2_conv_sf2ud IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmpwgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_xor_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
+ (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
+ (F2_conv_df2uw DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
+ (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
+ (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
+ (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_F2_conv_sf2uw_chop IntRegs:$src1),
(F2_conv_sf2uw_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2),
- (S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsatwuh_nopack DoubleRegs:$src1),
- (S2_vsatwuh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2),
- (S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svsubuhs IntRegs:$src1, IntRegs:$src2),
- (A2_svsubuhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmpybsu IntRegs:$src1, IntRegs:$src2),
- (M5_vmpybsu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_and_and PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2w_chop IntRegs:$src1),
+ (F2_conv_sf2w_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2ud_chop IntRegs:$src1),
+ (F2_conv_sf2ud_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_sf2d_chop IntRegs:$src1),
+ (F2_conv_sf2d_chop IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2uw_chop DoubleRegs:$src1),
+ (F2_conv_df2uw_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2w_chop DoubleRegs:$src1),
+ (F2_conv_df2w_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2ud_chop DoubleRegs:$src1),
+ (F2_conv_df2ud_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_F2_conv_df2d_chop DoubleRegs:$src1),
+ (F2_conv_df2d_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_r IntRegs:$src1, IntRegs:$src2),
(S2_lsr_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_subp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- (A4_subp_c DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_vitpack PredRegs:$src1, PredRegs:$src2),
- (C2_vitpack PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavguhr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- (S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsclr IntRegs:$src1, IntRegs:$src2),
- (C4_nbitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmpbgtu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpys_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpys_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfcmpuo DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_and PredRegs:$src1, PredRegs:$src2),
- (C2_and PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
- (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
- (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
- (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2),
- (A4_vcmpbgti DoubleRegs:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpneq IntRegs:$src1, IntRegs:$src2),
- (C4_cmpneq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vmac2es DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vdmacs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vdmacs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clb IntRegs:$src1),
- (S2_clb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hl_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hl_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_maci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrmac_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2),
- (S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2d IntRegs:$src1),
- (F2_conv_sf2d IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_asr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_n u10_0ImmPred_timm:$src1),
- (F2_dfimm_n u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmphgt IntRegs:$src1, IntRegs:$src2),
- (A4_cmphgt IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfimm_p u10_0ImmPred_timm:$src1),
- (F2_dfimm_p u10_0ImmPred_timm:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_acc_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vcmpy_s1_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3),
- (M4_mpyri_addr_u2 IntRegs:$src1, u6_2ImmPred_timm:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vcmpy_s1_sat_i DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_lsl_r_p_nac DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M5_vrmacbuu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
- (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyss_acc_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cnacs_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cnacs_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_maxu IntRegs:$src1, IntRegs:$src2),
- (A2_maxu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_maxp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_maxp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A2_andir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfrecipa IntRegs:$src1, IntRegs:$src2),
- (F2_sfrecipa IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2),
- (A2_combineii s32_0ImmPred_timm:$src1, s8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_orn IntRegs:$src1, IntRegs:$src2),
- (A4_orn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2),
- (A4_cmpbgtui IntRegs:$src1, u32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_and IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(S2_lsr_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2),
- (A4_vcmpbeqi DoubleRegs:$src1, u8_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r IntRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2),
- (S2_lsl_r_p DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_or IntRegs:$src1, IntRegs:$src2),
- (A2_or IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfcmpeq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpeq IntRegs:$src1, IntRegs:$src2),
- (C2_cmpeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrp DoubleRegs:$src1),
- (A2_tfrp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_and_andn PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsathub_nopack DoubleRegs:$src1),
- (S2_vsathub_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_satuh IntRegs:$src1),
- (A2_satuh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_satub IntRegs:$src1),
- (A2_satub IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2),
- (M2_vrcmpys_s1 DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (S4_or_ori IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2),
- (C4_fastcorner9_not PredRegs:$src1, PredRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2),
- (A2_tfrih IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2),
- (A2_tfril IntRegs:$src1, u16_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3),
- (M4_mpyri_addr IntRegs:$src1, IntRegs:$src2, u32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
- (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabsw DoubleRegs:$src1),
- (A2_vabsw DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vabsh DoubleRegs:$src1),
- (A2_vabsh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sfsub IntRegs:$src1, IntRegs:$src2),
- (F2_sfsub IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3),
- (C2_muxii PredRegs:$src1, s32_0ImmPred_timm:$src2, s8_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (C2_muxir PredRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
- (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrsc_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrsc_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vraddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2),
- (A4_tlbmatch DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2w_chop DoubleRegs:$src1),
- (F2_conv_df2w_chop DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_and IntRegs:$src1, IntRegs:$src2),
- (A2_and IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
(S2_lsr_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
- (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vcmpweq DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_acci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_or DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_asl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
+ (S2_lsl_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_r_sat IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_lsr_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_lsr_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (S2_asl_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
(S2_lsr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_ud2sf DoubleRegs:$src1),
- (F2_conv_ud2sf DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_tfr IntRegs:$src1),
- (A2_tfr IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2),
- (A2_subri s32_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxuw DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vmpybuu IntRegs:$src1, IntRegs:$src2),
- (M5_vmpybuu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (A4_vrmaxuh DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_brev IntRegs:$src1),
- (S2_brev IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavgh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_lsl_r_r_nac IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_rs1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3),
- (M2_naccii IntRegs:$src1, IntRegs:$src2, s32_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
- (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_dpmpyss_nac_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_mac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4),
- (S4_vrcrotate_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3, u2_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_uw2df IntRegs:$src1),
- (F2_conv_uw2df IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddubs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asr_r_r_acc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A2_orir IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_andp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_andp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
- (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_min IntRegs:$src1, IntRegs:$src2),
- (A2_min IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2),
- (M2_mpysmi IntRegs:$src1, m32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vcmpy_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyu_acc_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2),
- (S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2df IntRegs:$src1),
- (F2_conv_sf2df IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vtrunohb DoubleRegs:$src1),
- (S2_vtrunohb DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2d_chop IntRegs:$src1),
- (F2_conv_sf2d_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2w DoubleRegs:$src1),
- (F2_conv_df2w DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
(S2_asl_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2d DoubleRegs:$src1),
- (F2_conv_df2d DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_s1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmaculs_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svadduhs IntRegs:$src1, IntRegs:$src2),
- (A2_svadduhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2w_chop IntRegs:$src1),
- (F2_conv_sf2w_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_svsathub IntRegs:$src1),
- (S2_svsathub IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hl_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_setbit_r IntRegs:$src1, IntRegs:$src2),
- (S2_setbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavghr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4),
- (F2_sffma_sc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3, PredRegs:$src4)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
- (F2_dfclass DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2ud DoubleRegs:$src1),
- (F2_conv_df2ud DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_df2uw DoubleRegs:$src1),
- (F2_conv_df2uw DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrs_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_cmpyrs_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (C4_cmpltei IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_cmplteu IntRegs:$src1, IntRegs:$src2),
- (C4_cmplteu IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vsubb_map DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vsubub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_l16_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_lsr_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S2_asl_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_lsr_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2),
(S2_asr_i_r_rnd IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrmpy_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyd_rnd_hh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_minup DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_minup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
- (S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asr_r_p_acc DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_mmpyl_rs0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vrcmaci_s0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_lh IntRegs:$src1, IntRegs:$src2),
- (A2_combine_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M5_vdmacbsu DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_combine_ll IntRegs:$src1, IntRegs:$src2),
- (A2_combine_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_hl_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyi_s0c DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
(S2_asr_i_p_rnd DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_addpsat DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svaddhs IntRegs:$src1, IntRegs:$src2),
- (A2_svaddhs IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_ll_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_sat_rnd_ll_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vrcmpyr_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminb DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vcmac_s0_sat_i DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_lh_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpyud_lh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (S2_asl_r_r_or IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2),
(S4_lsli s6_0ImmPred_timm:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_addasl_rrri IntRegs:$src1, IntRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_andi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_ori_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_subi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_andi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_ori_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_addi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S4_subi_lsr_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[UseCompound, HasV5]>;
+def: Pat<(int_hexagon_S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_valignib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+ (S2_valignrb DoubleRegs:$src1, DoubleRegs:$src2, (C2_tfrrp PredRegs:$src3))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3),
+ (S2_vspliceib DoubleRegs:$src1, DoubleRegs:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, PredRegs:$src3),
+ (S2_vsplicerb DoubleRegs:$src1, DoubleRegs:$src2, (C2_tfrrp PredRegs:$src3))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplatrh IntRegs:$src1),
+ (S2_vsplatrh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsplatrb IntRegs:$src1),
+ (S2_vsplatrb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4),
+ (S2_insert IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3, u5_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A4_bitspliti IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A4_bitsplit IntRegs:$src1, IntRegs:$src2),
+ (A4_bitsplit IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+ (S4_extract IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
+ (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4),
+ (S2_insertp DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3, u6_0ImmPred_timm:$src4)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+ (S4_extractp DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3),
+ (S2_extractup DoubleRegs:$src1, u6_0ImmPred_timm:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3),
+ (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extract_rp IntRegs:$src1, DoubleRegs:$src2),
+ (S4_extract_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2),
+ (S2_extractu_rp IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (S2_insertp_rp DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S4_extractp_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_extractup_rp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (S2_tstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (C2_tfrpr (S4_ntstbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_togglebit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_clrbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_tstbit_r IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (S2_tstbit_r IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_ntstbit_r IntRegs:$src1, IntRegs:$src2),
+ (C2_tfrpr (S4_ntstbit_r IntRegs:$src1, IntRegs:$src2))>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_setbit_r IntRegs:$src1, IntRegs:$src2),
+ (S2_setbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_togglebit_r IntRegs:$src1, IntRegs:$src2),
+ (S2_togglebit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clrbit_r IntRegs:$src1, IntRegs:$src2),
+ (S2_clrbit_r IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_asr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S2_asl_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
+ (S5_asrhub_sat DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsl_r_vh DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asr_i_svw_trun DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_svw_trun DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_lsr_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S2_asl_i_vw DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2),
+ (S2_asl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2),
+ (S2_lsr_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2),
(S2_lsl_r_vw DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_mpy_hh_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyeh_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M4_vrmpyeh_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_lh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_lh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vraddh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_tfrrp IntRegs:$src1),
- (C2_tfrrp IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_ll_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_ll_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrndpackwh DoubleRegs:$src1),
+ (S2_vrndpackwh DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vrndpackwhs DoubleRegs:$src1),
+ (S2_vrndpackwhs DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
+ (S2_vsxtbh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vzxtbh IntRegs:$src1),
+ (S2_vzxtbh IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathub DoubleRegs:$src1),
+ (S2_vsathub DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_svsathub IntRegs:$src1),
+ (S2_svsathub IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
+ (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathb DoubleRegs:$src1),
+ (S2_vsathb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunohb DoubleRegs:$src1),
+ (S2_vtrunohb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_vtrunewh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2),
(S2_vtrunowh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_abs IntRegs:$src1),
- (A2_abs IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpbeq IntRegs:$src1, IntRegs:$src2),
- (A4_cmpbeq IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_negp DoubleRegs:$src1),
- (A2_negp DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_asl_i_r_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2),
- (A2_addh_l16_sat_hl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vtrunehb DoubleRegs:$src1),
+ (S2_vtrunehb DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsxthw IntRegs:$src1),
+ (S2_vsxthw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vzxthw IntRegs:$src1),
+ (S2_vzxthw IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwh DoubleRegs:$src1),
+ (S2_vsatwh DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_vsatwuh DoubleRegs:$src1),
(S2_vsatwuh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfcmpgt DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_svsathb IntRegs:$src1),
- (S2_svsathb IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2),
- (C2_cmpgtup DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_cround_ri IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2),
- (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cround_rr IntRegs:$src1, IntRegs:$src2),
- (A4_cround_rr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (C2_mux PredRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_dpmpyuu_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_packhl IntRegs:$src1, IntRegs:$src2),
+ (S2_packhl IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_A2_swiz IntRegs:$src1),
+ (A2_swiz IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathub_nopack DoubleRegs:$src1),
+ (S2_vsathub_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsathb_nopack DoubleRegs:$src1),
+ (S2_vsathb_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwh_nopack DoubleRegs:$src1),
+ (S2_vsatwh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_vsatwuh_nopack DoubleRegs:$src1),
+ (S2_vsatwuh_nopack DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffob DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2),
(S2_shuffeb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vaddhs DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3),
- (S2_insert_rp IntRegs:$src1, IntRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminuh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vminub DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vminub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3),
- (S2_extractu IntRegs:$src1, u5_0ImmPred_timm:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_svsubh IntRegs:$src1, IntRegs:$src2),
- (A2_svsubh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2),
- (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (F2_sffms IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vsxtbh IntRegs:$src1),
- (S2_vsxtbh IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_ll_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_ll_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subp DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_subp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vmpy2es_s1 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2),
- (M2_vmpy2es_s0 DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffoh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_shuffeh DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S5_popcountp DoubleRegs:$src1),
+ (S5_popcountp DoubleRegs:$src1)>, Requires<[HasV5]>;
def: Pat<(int_hexagon_S4_parity IntRegs:$src1, IntRegs:$src2),
(S4_parity IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S4_addi_asl_ri u32_0ImmPred_timm:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyd_nac_hh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S2_asr_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2),
- (A4_cmpheqi IntRegs:$src1, s32_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_lsr_r_p_xor DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_F2_conv_sf2ud_chop IntRegs:$src1),
- (F2_conv_sf2ud_chop IntRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_acc_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3),
- (S2_asl_r_p_and DoubleRegs:$src1, DoubleRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_addh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M4_nac_up_s1_sat IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_lh_s1 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpyud_nac_lh_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (A4_round_ri_sat IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hl_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_hl_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vavghcr DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_rs0 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_mmacls_rs1 DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_cmaci_s0 DoubleRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S2_setbit_i IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asl_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_andn IntRegs:$src1, IntRegs:$src2),
- (A4_andn IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2),
- (M5_vrmpybsu DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_vrndpackwh DoubleRegs:$src1),
- (S2_vrndpackwh DoubleRegs:$src1)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (M2_vcmac_s0_sat_r DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2),
- (A2_vmaxuw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C2_bitsclr IntRegs:$src1, IntRegs:$src2),
- (C2_bitsclr IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_xor_xacc IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2),
- (A4_vcmpbgtui DoubleRegs:$src1, u7_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A4_ornp DoubleRegs:$src1, DoubleRegs:$src2),
- (A4_ornp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3),
- (C4_and_or PredRegs:$src1, PredRegs:$src2, PredRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hh_s1 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
- (M2_mpy_nac_sat_hh_s0 IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_ll IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2),
- (A2_subh_h16_sat_lh IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2su_s1 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2),
- (M2_vmpy2su_s0 IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_asr_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2),
- (C4_nbitsclri IntRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2),
- (S2_lsr_i_vh DoubleRegs:$src1, u4_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
-def: Pat<(int_hexagon_S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S2_lsr_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV5]>;
-
-// V55 Scalar Instructions.
-
-def: Pat<(int_hexagon_A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
- (A5_ACS DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV55]>;
+def: Pat<(int_hexagon_S2_parityp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_parityp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S2_lfsp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clbnorm IntRegs:$src1),
+ (S2_clbnorm IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2),
+ (S4_clbaddi IntRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpnorm DoubleRegs:$src1),
+ (S4_clbpnorm DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2),
+ (S4_clbpaddi DoubleRegs:$src1, s6_0ImmPred_timm:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clb IntRegs:$src1),
+ (S2_clb IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl0 IntRegs:$src1),
+ (S2_cl0 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl1 IntRegs:$src1),
+ (S2_cl1 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_clbp DoubleRegs:$src1),
+ (S2_clbp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl0p DoubleRegs:$src1),
+ (S2_cl0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_cl1p DoubleRegs:$src1),
+ (S2_cl1p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_brev IntRegs:$src1),
+ (S2_brev IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_brevp DoubleRegs:$src1),
+ (S2_brevp DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct0 IntRegs:$src1),
+ (S2_ct0 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct1 IntRegs:$src1),
+ (S2_ct1 IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct0p DoubleRegs:$src1),
+ (S2_ct0p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_ct1p DoubleRegs:$src1),
+ (S2_ct1p DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_interleave DoubleRegs:$src1),
+ (S2_interleave DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_S2_deinterleave DoubleRegs:$src1),
+ (S2_deinterleave DoubleRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y2_dczeroa IntRegs:$src1),
+ (Y2_dczeroa IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y2_dccleana IntRegs:$src1),
+ (Y2_dccleana IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y2_dccleaninva IntRegs:$src1),
+ (Y2_dccleaninva IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y2_dcinva IntRegs:$src1),
+ (Y2_dcinva IntRegs:$src1)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y4_l2fetch IntRegs:$src1, IntRegs:$src2),
+ (Y4_l2fetch IntRegs:$src1, IntRegs:$src2)>, Requires<[HasV5]>;
+def: Pat<(int_hexagon_Y5_l2fetch IntRegs:$src1, DoubleRegs:$src2),
+ (Y5_l2fetch IntRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV5]>;
// V60 Scalar Instructions.
-def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
(S6_rol_i_p DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
- (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_acc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
(S6_rol_i_p_acc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
- (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
-def: Pat<(int_hexagon_S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2),
- (S6_rol_i_r IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
(S6_rol_i_r_nac IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_nac DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_xacc IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_xacc DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_and IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3),
+ (S6_rol_i_r_or IntRegs:$src1, IntRegs:$src2, u5_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
+def: Pat<(int_hexagon_S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
+ (S6_rol_i_p_and DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
def: Pat<(int_hexagon_S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3),
(S6_rol_i_p_or DoubleRegs:$src1, DoubleRegs:$src2, u6_0ImmPred_timm:$src3)>, Requires<[HasV60]>;
// V62 Scalar Instructions.
-def: Pat<(int_hexagon_S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2),
- (S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
-def: Pat<(int_hexagon_V6_ldntnt0 IntRegs:$src1),
- (V6_ldntnt0 IntRegs:$src1)>, Requires<[HasV62]>;
-def: Pat<(int_hexagon_M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2),
- (M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
-def: Pat<(int_hexagon_S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2),
- (S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
def: Pat<(int_hexagon_M6_vabsdiffb DoubleRegs:$src1, DoubleRegs:$src2),
(M6_vabsdiffb DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
-def: Pat<(int_hexagon_A6_vminub_RdP DoubleRegs:$src1, DoubleRegs:$src2),
- (A6_vminub_RdP DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2),
+ (M6_vabsdiffub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
def: Pat<(int_hexagon_S6_vsplatrbp IntRegs:$src1),
(S6_vsplatrbp IntRegs:$src1)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S6_vtrunehb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
+def: Pat<(int_hexagon_S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2),
+ (S6_vtrunohb_ppp DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV62]>;
// V65 Scalar Instructions.
def: Pat<(int_hexagon_A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2),
- (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV65]>;
+ (C2_tfrpr (A6_vcmpbeq_notany DoubleRegs:$src1, DoubleRegs:$src2))>, Requires<[HasV65]>;
// V66 Scalar Instructions.
-def: Pat<(int_hexagon_F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
-def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2),
- (F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
def: Pat<(int_hexagon_M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3),
(M2_mnaci IntRegs:$src1, IntRegs:$src2, IntRegs:$src3)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfadd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
+def: Pat<(int_hexagon_F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfsub DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV66]>;
def: Pat<(int_hexagon_S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2),
(S2_mask u5_0ImmPred_timm:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV66]>;
+// V67 Scalar Instructions.
+
+def: Pat<(int_hexagon_M7_dcmpyrw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyrw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyrw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyiw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyiw_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_dcmpyiwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyiwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_vdmpy DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_dcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_vdmpy_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (M7_dcmpyrwc_acc DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiw DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiw DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiwc DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrw_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrw_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyrwc_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyrwc_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiw_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiw_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_M7_wcmpyiwc_rnd DoubleRegs:$src1, DoubleRegs:$src2),
+ (M7_wcmpyiwc_rnd DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_A7_croundd_ri DoubleRegs:$src1, u6_0ImmPred_timm:$src2),
+ (A7_croundd_ri DoubleRegs:$src1, u6_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_A7_croundd_rr DoubleRegs:$src1, IntRegs:$src2),
+ (A7_croundd_rr DoubleRegs:$src1, IntRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_A7_clip IntRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A7_clip IntRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_A7_vclip DoubleRegs:$src1, u5_0ImmPred_timm:$src2),
+ (A7_vclip DoubleRegs:$src1, u5_0ImmPred_timm:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmax DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfmax DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmin DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfmin DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmpyfix DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfmpyfix DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmpyll DoubleRegs:$src1, DoubleRegs:$src2),
+ (F2_dfmpyll DoubleRegs:$src1, DoubleRegs:$src2)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmpylh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (F2_dfmpylh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+def: Pat<(int_hexagon_F2_dfmpyhh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3),
+ (F2_dfmpyhh DoubleRegs:$src1, DoubleRegs:$src2, DoubleRegs:$src3)>, Requires<[HasV67]>;
+
// V60 HVX Instructions.
-def: Pat<(int_hexagon_V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminub HvxVR:$src1, HvxVR:$src2),
- (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhvsrs_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsathub HvxVR:$src1, HvxVR:$src2),
- (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsathub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwv HvxVR:$src1, HvxVR:$src2),
- (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2),
- (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgh HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgub HvxVR:$src1, HvxVR:$src2),
- (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubb HvxVR:$src1, HvxVR:$src2),
- (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgubrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgubrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybusv HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybusv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vroundhb HvxVR:$src1, HvxVR:$src2),
- (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vroundhb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsububsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsububsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmux_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhus HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhus_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vS32b_qpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vS32b_nqpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vS32b_nqpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vS32b_nt_qpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nt_qpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vS32b_nt_nqpred_ai HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nt_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vS32b_nt_nqpred_ai_128B HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
+ (V6_vS32b_nt_nqpred_ai HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vror HvxVR:$src1, IntRegs:$src2),
+ (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vror_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackub HvxVR:$src1),
+ (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackub_128B HvxVR:$src1),
+ (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackb HvxVR:$src1),
+ (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackb_128B HvxVR:$src1),
+ (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackuh HvxVR:$src1),
+ (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackuh_128B HvxVR:$src1),
+ (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackh HvxVR:$src1),
+ (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackh_128B HvxVR:$src1),
+ (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackob HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackob_128B HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vunpackoh HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vunpackoh_128B HvxWR:$src1, HvxVR:$src2),
+ (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vpackeb HvxVR:$src1, HvxVR:$src2),
(V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vpackeb_128B HvxVR:$src1, HvxVR:$src2),
(V6_vpackeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavghrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavghrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtran2x2_map_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vtran2x2_map HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdelta HvxVR:$src1, HvxVR:$src2),
- (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdelta_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackeh HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackeh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vpackob HvxVR:$src1, HvxVR:$src2),
(V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vpackob_128B HvxVR:$src1, HvxVR:$src2),
(V6_vpackob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxh HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpybus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_or HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_or_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lo HvxWR:$src1),
- (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lo_128B HvxWR:$src1),
- (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubb_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubb_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwb HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwb_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldu0 IntRegs:$src1),
- (V6_ldu0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldu0_128B IntRegs:$src1),
- (V6_ldu0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgh HvxVR:$src1, HvxVR:$src2),
- (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlalignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlalignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsh HvxVR:$src1),
- (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsh_128B HvxVR:$src1),
- (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_and_n HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_and_n_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackoh HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackoh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackhub_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackhb_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackwuh_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpackwh_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vzb HvxVR:$src1),
+ (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vzb_128B HvxVR:$src1),
+ (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsb HvxVR:$src1),
(V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsb_128B HvxVR:$src1),
(V6_vsb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vroundwuh HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vroundwuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhv HvxVR:$src1, HvxVR:$src2),
- (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffh HvxVR:$src1),
- (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffh_128B HvxVR:$src1),
- (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnavgub HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnavgub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybv HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnormamth HvxVR:$src1),
- (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnormamth_128B HvxVR:$src1),
- (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguh HvxVR:$src1, HvxVR:$src2),
- (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrwv HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrwv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrhv HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2),
- (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhisat_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2),
- (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddw HvxVR:$src1, HvxVR:$src2),
- (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vzh HvxVR:$src1),
(V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vzh_128B HvxVR:$src1),
(V6_vzh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddh HvxVR:$src1, HvxVR:$src2),
- (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxub HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufoeh HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufoeh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh HvxVR:$src1, HvxVR:$src2),
- (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabuuv HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabuuv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminuh HvxVR:$src1, HvxVR:$src2),
- (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vror HvxVR:$src1, IntRegs:$src2),
- (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vror_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vror HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxuh HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsh_sat HvxVR:$src1),
- (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsh_sat_128B HvxVR:$src1),
- (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_or_n HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_or_n_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealb HvxVR:$src1),
- (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealb_128B HvxVR:$src1),
- (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybusv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybusv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vzb HvxVR:$src1),
- (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vzb_128B HvxVR:$src1),
- (V6_vzb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsh HvxVR:$src1),
+ (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsh_128B HvxVR:$src1),
+ (V6_vsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2),
(V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vdmpybus_dv_128B HvxWR:$src1, IntRegs:$src2),
(V6_vdmpybus_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddb HvxVR:$src1, HvxVR:$src2),
- (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufoeb HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufoeb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackhub_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhub_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpybus_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2),
+ (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vdmpyhvsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhvsat_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2),
+ (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vdmpyhisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsusat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vdmpyhsuisat HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vtmpyb HvxWR:$src1, IntRegs:$src2),
(V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vtmpyb_128B HvxWR:$src1, IntRegs:$src2),
(V6_vtmpyb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabusv HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabusv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_and HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_and_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackwuh_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwuh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vswap_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpybus HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpybus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpybus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vtmpyhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vtmpyhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub HvxVR:$src1, IntRegs:$src2),
+ (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vrmpyub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vrmpyubv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vrmpyubv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybv HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybus HvxVR:$src1, IntRegs:$src2),
+ (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybus_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrmpybusi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrmpybusv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdsaduh HvxWR:$src1, IntRegs:$src2),
+ (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdsaduh_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
+ (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrsadubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
+ (V6_vrsadubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaslw HvxVR:$src1, IntRegs:$src2),
(V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaslw_128B HvxVR:$src1, IntRegs:$src2),
(V6_vaslw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackhb_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackhb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyih_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddb_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddb_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackub HvxVR:$src1),
- (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackub_128B HvxVR:$src1),
- (V6_vunpackub HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub HvxVR:$src1, HvxVR:$src2),
- (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyieoh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyieoh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_extractw HvxVR:$src1, IntRegs:$src2),
- (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_extractw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgwrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgwrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyub HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyub_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuh HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackob HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackob_128B HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackob HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpahb HvxWR:$src1, IntRegs:$src2),
- (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpahb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandqrt HvxQR:$src1, IntRegs:$src2),
- (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandqrt_128B HvxQR:$src1, IntRegs:$src2),
- (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vxor HvxVR:$src1, HvxVR:$src2),
- (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vxor_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrw HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwv HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslwv HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslwv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrwv HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrwv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrh HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslh HvxVR:$src1, IntRegs:$src2),
+ (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrh HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vasrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlsrhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlsrhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vlsrhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vasrwhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasrwhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhsat_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vrmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubhw HvxVR:$src1, HvxVR:$src2),
- (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealb4w HvxVR:$src1, HvxVR:$src2),
- (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealb4w_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffh HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffob HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffob_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyub_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnormamtw HvxVR:$src1),
- (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnormamtw_128B HvxVR:$src1),
- (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackuh HvxVR:$src1),
- (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackuh_128B HvxVR:$src1),
- (V6_vunpackuh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackoh HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackoh_128B HvxWR:$src1, HvxVR:$src2),
- (V6_vunpackoh HvxWR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsat_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyubv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyubv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhss HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhss_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_hi HvxWR:$src1),
- (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_hi_128B HvxWR:$src1),
- (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vasrwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasrwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqw HvxVR:$src1, HvxVR:$src2),
- (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdsaduh HvxWR:$src1, IntRegs:$src2),
- (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdsaduh_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vdsaduh HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundwh HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundwh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundwuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundwuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhbrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundhb HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundhb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundhb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vroundhub HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vroundhub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrw_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrw_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddb HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubb HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddb_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddb_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubb_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubb_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubb_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddh HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddh_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddh_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubh_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubh_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddw HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubw HvxVR:$src1, HvxVR:$src2),
(V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubw_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsubw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddw_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddw_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubw_dv HvxWR:$src1, HvxWR:$src2),
(V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubw_dv_128B HvxWR:$src1, HvxWR:$src2),
(V6_vsubw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyih HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyih_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybus HvxVR:$src1, IntRegs:$src2),
- (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybus_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vrmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybus_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubhsat HvxVR:$src1, HvxVR:$src2),
(V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubhsat_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsubhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrmpyubi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1),
- (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1),
- (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2),
(V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
(V6_vaddwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrw HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsh HvxVR:$src1),
- (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsh_128B HvxVR:$src1),
- (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlsrh HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlsrh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vlsrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_valignb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_valignb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgub HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgubrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgubrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgubrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguh HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguhrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguhrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgh HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavghrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavghrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavghrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgh HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgw HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgwrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgwrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgw HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffub HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffh HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsdiffw HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsdiffw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnavgub HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnavgub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vnavgub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubh HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsububh HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsububh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vd0 ),
+ (V6_vd0 )>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vd0_128B ),
+ (V6_vd0 )>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsubhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vsubhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackoh HvxVR:$src1, HvxVR:$src2),
- (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackoh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpybus_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpybus_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhvsat_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vdmpyhvsat_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpybv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhsat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vcombine HvxVR:$src1, HvxVR:$src2),
- (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vcombine_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslhv HvxVR:$src1, HvxVR:$src2),
- (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaslhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vinsertwr HvxVR:$src1, IntRegs:$src2),
- (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vinsertwr_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubh_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubh_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubh_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffb HvxVR:$src1),
- (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffb_128B HvxVR:$src1),
- (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vand HvxVR:$src1, HvxVR:$src2),
- (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vand_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vsubwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsh HvxVR:$src1),
+ (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsh_128B HvxVR:$src1),
+ (V6_vabsh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsh_sat HvxVR:$src1),
+ (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsh_sat_128B HvxVR:$src1),
+ (V6_vabsh_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsw HvxVR:$src1),
+ (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsw_128B HvxVR:$src1),
+ (V6_vabsw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsw_sat HvxVR:$src1),
+ (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsw_sat_128B HvxVR:$src1),
+ (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyubv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyubv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybusv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpybusv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybusv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabusv HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabusv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabusv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuuv HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuuv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vmpabuuv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyhv HvxVR:$src1, HvxVR:$src2),
(V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyhv_128B HvxVR:$src1, HvxVR:$src2),
(V6_vmpyhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsuisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpyhsuisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsububsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsububsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdsaduh_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdsaduh_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyub HvxVR:$src1, IntRegs:$src2),
- (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vrmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vcl0h HvxVR:$src1),
- (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vcl0h_128B HvxVR:$src1),
- (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyuhv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhvsrs_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhvsrs HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhus HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhus_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyhus HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyhus_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vmpyhus_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrsadubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrsadubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpyhb_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshufeh HvxVR:$src1, HvxVR:$src2),
- (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshufeh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyih HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyih_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyih HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyih_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyih_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyewuh HvxVR:$src1, HvxVR:$src2),
(V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyewuh_128B HvxVR:$src1, HvxVR:$src2),
(V6_vmpyewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyhsrs_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpybus_dv_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpybus_dv_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubh HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ld0 IntRegs:$src1),
- (V6_ld0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ld0_128B IntRegs:$src1),
- (V6_ld0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpopcounth HvxVR:$src1),
- (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpopcounth_128B HvxVR:$src1),
- (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldnt0 IntRegs:$src1),
- (V6_ldnt0 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldnt0_128B IntRegs:$src1),
- (V6_ldnt0 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgth_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddubsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackeh HvxVR:$src1, HvxVR:$src2),
- (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackeh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyh HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminh HvxVR:$src1, HvxVR:$src2),
- (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_scalar2 IntRegs:$src1),
- (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_scalar2_128B IntRegs:$src1),
- (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealh HvxVR:$src1),
- (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealh_128B HvxVR:$src1),
- (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vpackwh_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vpackwh_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslh HvxVR:$src1, IntRegs:$src2),
- (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vaslh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vor HvxVR:$src1, HvxVR:$src2),
- (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vor_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_rnd_sacc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_rnd_sacc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyieoh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyieoh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyieoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyiowh HvxVR:$src1, HvxVR:$src2),
(V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyiowh_128B HvxVR:$src1, HvxVR:$src2),
(V6_vmpyiowh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
- (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
- (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandvrt HvxVR:$src1, IntRegs:$src2),
- (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandvrt_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhw HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vadduhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vcl0w HvxVR:$src1),
- (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vcl0w_128B HvxVR:$src1),
- (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyihb HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyihb_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpybus HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpybus_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vtmpybus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vd0 ),
- (V6_vd0 )>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vd0_128B ),
- (V6_vd0 )>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpybus HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpybus_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiewuh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyiewuh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyub HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyub_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyub_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyub_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpybus HvxVR:$src1, IntRegs:$src2),
(V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpybus_128B HvxVR:$src1, IntRegs:$src2),
(V6_vmpybus HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandvrt_acc_128B HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vassign HvxVR:$src1),
- (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vassign_128B HvxVR:$src1),
- (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddwnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtub_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2),
- (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhb_dv_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vdmpyhb_dv HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackb HvxVR:$src1),
- (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackb_128B HvxVR:$src1),
- (V6_vunpackb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vunpackh HvxVR:$src1),
- (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vunpackh_128B HvxVR:$src1),
- (V6_vunpackh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpybus_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpybus_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabus HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabus_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabus_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpabus_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahb HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpahb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpahb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
(V6_vmpahb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddbnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddbnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlalignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlalignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2),
- (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuh HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyh HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhsat_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyhsat_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhss HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhss_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhss HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyhsrs_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyhsrs HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuh HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyuh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyuh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyihb HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyihb_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyihb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyihb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyihb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybusv_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vrmpybusv_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrdelta HvxVR:$src1, HvxVR:$src2),
- (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrdelta_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vroundwh HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vroundwh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vroundwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddw_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddw_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddw_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwb_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyiwb_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyiwb_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubbq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_valignbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_valignbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyiwh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vand HvxVR:$src1, HvxVR:$src2),
+ (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vand_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vand HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vor HvxVR:$src1, HvxVR:$src2),
+ (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vor_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vxor HvxVR:$src1, HvxVR:$src2),
+ (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vxor_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vxor HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnot HvxVR:$src1),
+ (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnot_128B HvxVR:$src1),
+ (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandqrt HvxQR:$src1, IntRegs:$src2),
+ (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandqrt_128B HvxQR:$src1, IntRegs:$src2),
+ (V6_vandqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+ (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+ (V6_vandqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvrt HvxVR:$src1, IntRegs:$src2),
+ (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvrt_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vandvrt HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvrt_acc_128B HvxQR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vandvrt_acc HvxQR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw HvxVR:$src1, HvxVR:$src2),
+ (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_veqw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_veqw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffub HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vshuffeb HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vshuffeb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_veqw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_veqw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
@@ -2753,584 +2683,664 @@ def: Pat<(int_hexagon_V6_vgth HvxVR:$src1, HvxVR:$src2),
(V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vgth_128B HvxVR:$src1, HvxVR:$src2),
(V6_vgth HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtuw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgth_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgth_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh HvxVR:$src1, HvxVR:$src2),
+ (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vgtb HvxVR:$src1, HvxVR:$src2),
(V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vgtb_128B HvxVR:$src1, HvxVR:$src2),
(V6_vgtb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw HvxVR:$src1, HvxVR:$src2),
- (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vgtw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vsubwq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vnot HvxVR:$src1),
- (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vnot_128B HvxVR:$src1),
- (V6_vnot HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vgtb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vgtb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb HvxVR:$src1, HvxVR:$src2),
+ (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_veqb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vgtuw_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vgtuw_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubsat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddubsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtuh_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtuh_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vgtub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_and_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_and HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_or_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_or HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vgtub_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vgtub_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_or HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_or_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_and HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_and_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_not HvxQR:$src1),
+ (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_not_128B HvxQR:$src1),
+ (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_xor HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_xor_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_and_n HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_and_n_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_and_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_or_n HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_or_n_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_pred_or_n HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2 IntRegs:$src1),
+ (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_pred_scalar2_128B IntRegs:$src1),
+ (V6_pred_scalar2 IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmux_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmux HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vswap_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vswap HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxub HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminub HvxVR:$src1, HvxVR:$src2),
+ (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxh HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminh HvxVR:$src1, HvxVR:$src2),
+ (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmaxw HvxVR:$src1, HvxVR:$src2),
(V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmaxw_128B HvxVR:$src1, HvxVR:$src2),
(V6_vmaxw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslwv HvxVR:$src1, HvxVR:$src2),
- (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslwv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaslwv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsw_sat HvxVR:$src1),
- (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsw_sat_128B HvxVR:$src1),
- (V6_vabsw_sat HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vroundhub HvxVR:$src1, HvxVR:$src2),
- (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vroundhub_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vroundhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhisat_acc_128B HvxVR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vdmpyhisat_acc HvxVR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabus HvxWR:$src1, IntRegs:$src2),
- (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabus_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpabus HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vassignp HvxWR:$src1),
- (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vassignp_128B HvxWR:$src1),
- (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb HvxVR:$src1, HvxVR:$src2),
- (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_veqb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsububh HvxVR:$src1, HvxVR:$src2),
- (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsububh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsububh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2),
+ (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsathub HvxVR:$src1, HvxVR:$src2),
+ (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsathub_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsathub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsatwh HvxVR:$src1, HvxVR:$src2),
+ (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsatwh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsatwh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffeb HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffeb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffob HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffob_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshuffob HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufeh HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufeh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoh HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vshuffvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoeh HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoeh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshufoeb HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshufoeb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vshufoeb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealh HvxVR:$src1),
+ (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealh_128B HvxVR:$src1),
+ (V6_vdealh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealb HvxVR:$src1),
+ (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealb_128B HvxVR:$src1),
+ (V6_vdealb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdealb4w HvxVR:$src1, HvxVR:$src2),
+ (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdealb4w_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vdealb4w HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffh HvxVR:$src1),
+ (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffh_128B HvxVR:$src1),
+ (V6_vshuffh HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vshuffb HvxVR:$src1),
+ (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vshuffb_128B HvxVR:$src1),
+ (V6_vshuffb HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_extractw HvxVR:$src1, IntRegs:$src2),
+ (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_extractw_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_extractw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vinsertwr HvxVR:$src1, IntRegs:$src2),
+ (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vinsertwr_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vinsertwr HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_lvsplatw IntRegs:$src1),
(V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_lvsplatw_128B IntRegs:$src1),
(V6_lvsplatw IntRegs:$src1)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhnq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhnq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vdmpyhsusat HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_not HvxQR:$src1),
- (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_not_128B HvxQR:$src1),
- (V6_pred_not HvxQR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vassignp HvxWR:$src1),
+ (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vassignp_128B HvxWR:$src1),
+ (V6_vassignp HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vassign HvxVR:$src1),
+ (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vassign_128B HvxVR:$src1),
+ (V6_vassign HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcombine HvxVR:$src1, HvxVR:$src2),
+ (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcombine_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vcombine HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdelta HvxVR:$src1, HvxVR:$src2),
+ (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdelta_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrdelta HvxVR:$src1, HvxVR:$src2),
+ (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrdelta_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrdelta HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcl0w HvxVR:$src1),
+ (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcl0w_128B HvxVR:$src1),
+ (V6_vcl0w HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vcl0h HvxVR:$src1),
+ (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vcl0h_128B HvxVR:$src1),
+ (V6_vcl0h HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnormamtw HvxVR:$src1),
+ (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnormamtw_128B HvxVR:$src1),
+ (V6_vnormamtw HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vnormamth HvxVR:$src1),
+ (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vnormamth_128B HvxVR:$src1),
+ (V6_vnormamth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vpopcounth HvxVR:$src1),
+ (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vpopcounth_128B HvxVR:$src1),
+ (V6_vpopcounth HvxVR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+ (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
+ (V6_vlutvvb_oracc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
(V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vlutvwh_oracc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4),
(V6_vlutvwh_oracc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, IntRegsLow8:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiewh_acc_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyiewh_acc HvxVR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdealvdd_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vdealvdd HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgw HvxVR:$src1, HvxVR:$src2),
- (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdmpyhsusat_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vdmpyhsusat_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vgtw_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vgtw_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vtmpyhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vtmpyhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhw HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhq_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhq HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubv HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrmpyubv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubh HvxVR:$src1, HvxVR:$src2),
- (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyubi_128B HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3),
- (V6_vrmpyubi HvxWR:$src1, IntRegs:$src2, u1_0ImmPred_timm:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminw HvxVR:$src1, HvxVR:$src2),
- (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyubv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyubv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_pred_xor HvxQR:$src1, HvxQR:$src2),
- (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_pred_xor_128B HvxQR:$src1, HvxQR:$src2),
- (V6_pred_xor HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_veqb_xor_128B HvxQR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_veqb_xor HvxQR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiewuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyiewuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpybusv_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpybusv_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguhrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguhrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguhrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_rnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyowh_rnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuhw HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuhw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuhw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybusi_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4),
- (V6_vrmpybusi_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3, u1_0ImmPred_timm:$src4)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrw HvxVR:$src1, IntRegs:$src2),
- (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrw_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vasrw HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrh HvxVR:$src1, IntRegs:$src2),
- (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrh_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vasrh HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuhv_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyuhv HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhbrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuhsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuhsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV60, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsdiffw HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsdiffw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vabsdiffw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_hi HvxWR:$src1),
+ (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_hi_128B HvxWR:$src1),
+ (V6_hi HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lo HvxWR:$src1),
+ (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lo_128B HvxWR:$src1),
+ (V6_lo HvxWR:$src1)>, Requires<[HasV60, UseHVX128B]>;
// V62 HVX Instructions.
-def: Pat<(int_hexagon_V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandnqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
- (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddclbh HvxVR:$src1, HvxVR:$src2),
- (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddclbh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyowh_64_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyewuh_64_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsatuwuh HvxVR:$src1, HvxVR:$src2),
- (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsatuwuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_shuffeqh HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_shuffeqh_128B HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_shuffeqw HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_shuffeqw_128B HvxQR:$src1, HvxQR:$src2),
- (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldcnpnt0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldcnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrhbsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vminb HvxVR:$src1, HvxVR:$src2),
- (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vminb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpauhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vlsrb HvxVR:$src1, IntRegs:$src2),
(V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vlsrb_128B HvxVR:$src1, IntRegs:$src2),
(V6_vlsrb HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldtp0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldtp0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldtp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
- (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
- (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldpnt0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldpnt0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandvnqv HvxQR:$src1, HvxVR:$src2),
- (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandvnqv_128B HvxQR:$src1, HvxVR:$src2),
- (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lvsplatb IntRegs:$src1),
- (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lvsplatb_128B IntRegs:$src1),
- (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_lvsplath IntRegs:$src1),
- (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_lvsplath_128B IntRegs:$src1),
- (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldtpnt0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldtpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvwh_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldnpnt0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpauhb HvxWR:$src1, IntRegs:$src2),
- (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpauhb_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldtnp0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldtnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldtnp0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldtnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrhbsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasrhbsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrounduwuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrounduwuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vrounduhub HvxVR:$src1, HvxVR:$src2),
(V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vrounduhub_128B HvxVR:$src1, HvxVR:$src2),
(V6_vrounduhub HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldcp0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldcp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldcp0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldcp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vadduwsat HvxVR:$src1, HvxVR:$src2),
(V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vadduwsat_128B HvxVR:$src1, HvxVR:$src2),
(V6_vadduwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldtnpnt0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldtnpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubuwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubuwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddbsat HvxVR:$src1, HvxVR:$src2),
(V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddbsat_128B HvxVR:$src1, HvxVR:$src2),
(V6_vaddbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vandnqrt HvxQR:$src1, IntRegs:$src2),
- (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vandnqrt_128B HvxQR:$src1, IntRegs:$src2),
- (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
+ (V6_vsubbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vsubcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddububb_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsubububb_sat_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vadduhw_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vadduhw_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddubh_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyewuh_64_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmpyewuh_64 HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyowh_64_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
+ (V6_vmpyowh_64_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhb HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpauhb HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhb_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpauhb_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyiwub_128B HvxVR:$src1, IntRegs:$src2),
+ (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyiwub_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyiwub_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmaxb HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmaxb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandnqrt HvxQR:$src1, IntRegs:$src2),
+ (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandnqrt_128B HvxQR:$src1, IntRegs:$src2),
+ (V6_vandnqrt HvxQR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+ (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandnqrt_acc_128B HvxVR:$src1, HvxQR:$src2, IntRegs:$src3),
+ (V6_vandnqrt_acc HvxVR:$src1, HvxQR:$src2, IntRegs:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vandvqv HvxQR:$src1, HvxVR:$src2),
(V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vandvqv_128B HvxQR:$src1, HvxVR:$src2),
(V6_vandvqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddcarry_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vaddcarry HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasrwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
- (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubuwsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubuwsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddbsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vaddbsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldnp0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldnp0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruwuhrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrounduwuh HvxVR:$src1, HvxVR:$src2),
- (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrounduwuh_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrounduwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlutvvb_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vandvnqv HvxQR:$src1, HvxVR:$src2),
+ (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vandvnqv_128B HvxQR:$src1, HvxVR:$src2),
+ (V6_vandvnqv HvxQR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_pred_scalar2v2 IntRegs:$src1),
(V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_pred_scalar2v2_128B IntRegs:$src1),
(V6_pred_scalar2v2 IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldp0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldp0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddubh_acc_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
- (V6_vaddubh_acc HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_shuffeqw HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_shuffeqw_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqw HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_shuffeqh HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_shuffeqh_128B HvxQR:$src1, HvxQR:$src2),
+ (V6_shuffeqh HvxQR:$src1, HvxQR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmaxb HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmaxb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vmaxb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vminb HvxVR:$src1, HvxVR:$src2),
+ (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vminb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vminb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vsatuwuh HvxVR:$src1, HvxVR:$src2),
+ (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vsatuwuh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vsatuwuh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplath IntRegs:$src1),
+ (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplath_128B IntRegs:$src1),
+ (V6_lvsplath IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_lvsplatb IntRegs:$src1),
+ (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_lvsplatb_128B IntRegs:$src1),
+ (V6_lvsplatb IntRegs:$src1)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vaddclbw HvxVR:$src1, HvxVR:$src2),
(V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vaddclbw_128B HvxVR:$src1, HvxVR:$src2),
(V6_vaddclbw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldcpnt0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldcpnt0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2),
- (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vadduwsat_dv_128B HvxWR:$src1, HvxWR:$src2),
- (V6_vadduwsat_dv HvxWR:$src1, HvxWR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyiwub HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyiwub_128B HvxVR:$src1, IntRegs:$src2),
- (V6_vmpyiwub HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubububb_sat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubububb_sat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_ldcnp0 PredRegs:$src1, IntRegs:$src2),
- (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_ldcnp0_128B PredRegs:$src1, IntRegs:$src2),
- (V6_ldcnp0 PredRegs:$src1, IntRegs:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddclbh HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddclbh_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vaddclbh HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvbi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvvbi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_oracci_128B HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
+ (V6_vlutvvb_oracci HvxVR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwhi_128B HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3),
+ (V6_vlutvwhi HvxVR:$src1, HvxVR:$src2, u3_0ImmPred_timm:$src3)>, Requires<[HasV62, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
(V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vlutvwh_oracci_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4),
(V6_vlutvwh_oracci HvxWR:$src1, HvxVR:$src2, HvxVR:$src3, u3_0ImmPred_timm:$src4)>, Requires<[HasV62, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vsubbsat HvxVR:$src1, HvxVR:$src2),
- (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vsubbsat_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vsubbsat HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvvb_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvvb_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlutvwh_nm_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vlutvwh_nm HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV62, UseHVX128B]>;
// V65 HVX Instructions.
+def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
+ (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vasruhubrndsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
(V6_vasruhubrndsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2),
- (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_128B HvxVR:$src1, DoubleRegs:$src2),
- (V6_vrmpybub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vavguwrnd HvxVR:$src1, HvxVR:$src2),
(V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vavguwrnd_128B HvxVR:$src1, HvxVR:$src2),
(V6_vavguwrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vnavgb HvxVR:$src1, HvxVR:$src2),
(V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vnavgb_128B HvxVR:$src1, HvxVR:$src2),
(V6_vnavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasrh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vasrh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vdd0 ),
+ (V6_vdd0 )>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vdd0_128B ),
+ (V6_vdd0 )>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1),
+ (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1),
+ (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1),
+ (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1),
+ (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2),
+ (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
+ (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyh_acc_128B HvxWR:$src1, HvxVR:$src2, IntRegs:$src3),
(V6_vmpyh_acc HvxWR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpybub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vrmpybub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgb HvxVR:$src1, HvxVR:$src2),
- (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgb_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgb HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaslh_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vaslh_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavguw HvxVR:$src1, HvxVR:$src2),
- (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavguw_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavguw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2),
- (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2),
- (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
- (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2),
- (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_128B HvxVR:$src1, DoubleRegs:$src2),
- (V6_vrmpyub_rtt HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+ (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpahhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+ (V6_vmpahhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+ (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpauhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
+ (V6_vmpauhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
(V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpsuhuhsat_128B HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3),
(V6_vmpsuhuhsat HvxVR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruhubsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruhubsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vlut4 HvxVR:$src1, DoubleRegs:$src2),
+ (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vlut4_128B HvxVR:$src1, DoubleRegs:$src2),
+ (V6_vlut4 HvxVR:$src1, DoubleRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vmpyuhe HvxVR:$src1, IntRegs:$src2),
(V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vmpyuhe_128B HvxVR:$src1, IntRegs:$src2),
(V6_vmpyuhe HvxVR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrmpyub_rtt_acc_128B HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3),
- (V6_vrmpyub_rtt_acc HvxWR:$src1, HvxVR:$src2, DoubleRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vasruwuhsat_128B HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3),
- (V6_vasruwuhsat HvxVR:$src1, HvxVR:$src2, IntRegsLow8:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabuu_acc_128B HvxWR:$src1, HvxWR:$src2, IntRegs:$src3),
- (V6_vmpabuu_acc HvxWR:$src1, HvxWR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1),
- (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1),
- (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1),
- (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1),
- (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vmpyuhe_acc_128B HvxVR:$src1, HvxVR:$src2, IntRegs:$src3),
+ (V6_vmpyuhe_acc HvxVR:$src1, HvxVR:$src2, IntRegs:$src3)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermw_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermh_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermw_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermw_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermh_add_128B IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4),
+ (V6_vscattermh_add IntRegs:$src1, ModRegs:$src2, HvxVR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5),
+ (V6_vscattermhq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxVR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhw_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
+ (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhwq_128B HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5),
+ (V6_vscattermhwq HvxQR:$src1, IntRegs:$src2, ModRegs:$src3, HvxWR:$src4, HvxVR:$src5)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vscattermhw_add_128B IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4),
+ (V6_vscattermhw_add IntRegs:$src1, ModRegs:$src2, HvxWR:$src3, HvxVR:$src4)>, Requires<[HasV65, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vprefixqb HvxQR:$src1),
(V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vprefixqb_128B HvxQR:$src1),
(V6_vprefixqb HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsb HvxVR:$src1),
- (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsb_128B HvxVR:$src1),
- (V6_vabsb HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vavgbrnd HvxVR:$src1, HvxVR:$src2),
- (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vavgbrnd_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vavgbrnd HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vdd0 ),
- (V6_vdd0 )>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vdd0_128B ),
- (V6_vdd0 )>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vmpabuu HvxWR:$src1, IntRegs:$src2),
- (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vmpabuu_128B HvxWR:$src1, IntRegs:$src2),
- (V6_vmpabuu HvxWR:$src1, IntRegs:$src2)>, Requires<[HasV65, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vabsb_sat HvxVR:$src1),
- (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vabsb_sat_128B HvxVR:$src1),
- (V6_vabsb_sat HvxVR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vprefixqh HvxQR:$src1),
+ (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vprefixqh_128B HvxQR:$src1),
+ (V6_vprefixqh HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vprefixqw HvxQR:$src1),
+ (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vprefixqw_128B HvxQR:$src1),
+ (V6_vprefixqw HvxQR:$src1)>, Requires<[HasV65, UseHVX128B]>;
// V66 HVX Instructions.
-def: Pat<(int_hexagon_V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vaddcarrysat_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
- (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vrotr HvxVR:$src1, HvxVR:$src2),
+ (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vrotr_128B HvxVR:$src1, HvxVR:$src2),
+ (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vasr_into_128B HvxWR:$src1, HvxVR:$src2, HvxVR:$src3),
(V6_vasr_into HvxWR:$src1, HvxVR:$src2, HvxVR:$src3)>, Requires<[HasV66, UseHVX128B]>;
+def: Pat<(int_hexagon_V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX64B]>;
+def: Pat<(int_hexagon_V6_vaddcarrysat_128B HvxVR:$src1, HvxVR:$src2, HvxQR:$src3),
+ (V6_vaddcarrysat HvxVR:$src1, HvxVR:$src2, HvxQR:$src3)>, Requires<[HasV66, UseHVX128B]>;
def: Pat<(int_hexagon_V6_vsatdw HvxVR:$src1, HvxVR:$src2),
(V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
def: Pat<(int_hexagon_V6_vsatdw_128B HvxVR:$src1, HvxVR:$src2),
(V6_vsatdw HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
-def: Pat<(int_hexagon_V6_vrotr HvxVR:$src1, HvxVR:$src2),
- (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX64B]>;
-def: Pat<(int_hexagon_V6_vrotr_128B HvxVR:$src1, HvxVR:$src2),
- (V6_vrotr HvxVR:$src1, HvxVR:$src2)>, Requires<[HasV66, UseHVX128B]>;
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMappings.td b/llvm/lib/Target/Hexagon/HexagonDepMappings.td
index 22ee495b25e6..3fca1aee9a60 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepMappings.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepMappings.td
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
def A2_negAlias : InstAlias<"$Rd32 = neg($Rs32)", (A2_subri IntRegs:$Rd32, 0, IntRegs:$Rs32)>;
@@ -94,6 +94,8 @@ def L4_sub_memopw_zomapAlias : InstAlias<"memw($Rs32) -= $Rt32", (L4_sub_memopw_
def L6_deallocframe_map_to_rawAlias : InstAlias<"deallocframe", (L2_deallocframe D15, R30)>;
def L6_return_map_to_rawAlias : InstAlias<"dealloc_return", (L4_return D15, R30)>;
def M2_mpyuiAlias : InstAlias<"$Rd32 = mpyui($Rs32,$Rt32)", (M2_mpyi IntRegs:$Rd32, IntRegs:$Rs32, IntRegs:$Rt32)>;
+def M7_vdmpyAlias : InstAlias<"$Rdd32 = vdmpyw($Rss32,$Rtt32)", (M7_dcmpyrwc DoubleRegs:$Rdd32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>;
+def M7_vdmpy_accAlias : InstAlias<"$Rxx32 += vdmpyw($Rss32,$Rtt32)", (M7_dcmpyrwc_acc DoubleRegs:$Rxx32, DoubleRegs:$Rss32, DoubleRegs:$Rtt32)>;
def S2_pstorerbf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32) = $Rt32", (S2_pstorerbf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Rt32)>;
def S2_pstorerbnewf_zomapAlias : InstAlias<"if (!$Pv4) memb($Rs32) = $Nt8.new", (S2_pstorerbnewf_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
def S2_pstorerbnewt_zomapAlias : InstAlias<"if ($Pv4) memb($Rs32) = $Nt8.new", (S2_pstorerbnewt_io PredRegs:$Pv4, IntRegs:$Rs32, 0, IntRegs:$Nt8)>;
@@ -253,16 +255,9 @@ def V6_vaslwv_altAlias : InstAlias<"$Vd32 = vaslw($Vu32,$Vv32)", (V6_vaslwv HvxV
def V6_vasr_into_altAlias : InstAlias<"$Vxx32 = vasrinto($Vu32,$Vv32)", (V6_vasr_into HvxWR:$Vxx32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
def V6_vasrh_acc_altAlias : InstAlias<"$Vx32 += vasrh($Vu32,$Rt32)", (V6_vasrh_acc HvxVR:$Vx32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
def V6_vasrh_altAlias : InstAlias<"$Vd32 = vasrh($Vu32,$Rt32)", (V6_vasrh HvxVR:$Vd32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
-def V6_vasrhbrndsat_altAlias : InstAlias<"$Vd32 = vasrhb($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhbrndsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
-def V6_vasrhubrndsat_altAlias : InstAlias<"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrhubrndsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
-def V6_vasrhubsat_altAlias : InstAlias<"$Vd32 = vasrhub($Vu32,$Vv32,$Rt8):sat", (V6_vasrhubsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
def V6_vasrhv_altAlias : InstAlias<"$Vd32 = vasrh($Vu32,$Vv32)", (V6_vasrhv HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
def V6_vasrw_acc_altAlias : InstAlias<"$Vx32 += vasrw($Vu32,$Rt32)", (V6_vasrw_acc HvxVR:$Vx32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
def V6_vasrw_altAlias : InstAlias<"$Vd32 = vasrw($Vu32,$Rt32)", (V6_vasrw HvxVR:$Vd32, HvxVR:$Vu32, IntRegs:$Rt32)>, Requires<[UseHVX]>;
-def V6_vasrwh_altAlias : InstAlias<"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8)", (V6_vasrwhsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
-def V6_vasrwhrndsat_altAlias : InstAlias<"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):rnd:sat", (V6_vasrwhrndsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
-def V6_vasrwhsat_altAlias : InstAlias<"$Vd32 = vasrwh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwhsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
-def V6_vasrwuhsat_altAlias : InstAlias<"$Vd32 = vasrwuh($Vu32,$Vv32,$Rt8):sat", (V6_vasrwuhsat HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32, IntRegsLow8:$Rt8)>;
def V6_vasrwv_altAlias : InstAlias<"$Vd32 = vasrw($Vu32,$Vv32)", (V6_vasrwv HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
def V6_vavgb_altAlias : InstAlias<"$Vd32 = vavgb($Vu32,$Vv32)", (V6_vavgb HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
def V6_vavgbrnd_altAlias : InstAlias<"$Vd32 = vavgb($Vu32,$Vv32):rnd", (V6_vavgbrnd HvxVR:$Vd32, HvxVR:$Vu32, HvxVR:$Vv32)>, Requires<[UseHVX]>;
diff --git a/llvm/lib/Target/Hexagon/HexagonDepMask.h b/llvm/lib/Target/Hexagon/HexagonDepMask.h
new file mode 100644
index 000000000000..742fe2d14d5b
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonDepMask.h
@@ -0,0 +1,2821 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+// Automatically generated file, do not edit!
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPMASK_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPMASK_H
+
+// clang-format off
+HexagonInstruction InstructionEncodings[] = {
+{ /*Tag:A2_addi*/
+ /*Rd32=add(Rs32,#s16)*/
+ 0xf0000000,
+ 0xb0000000,
+ 0x0fe03fe0,
+ 0 },
+{ /*Tag:A2_andir*/
+ /*Rd32=and(Rs32,#s10)*/
+ 0xffc00000,
+ 0x76000000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:A2_combineii*/
+ /*Rdd32=combine(#s8,#S8)*/
+ 0xff800000,
+ 0x7c000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A2_orir*/
+ /*Rd32=or(Rs32,#s10)*/
+ 0xffc00000,
+ 0x76800000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:A2_paddif*/
+ /*if (!Pu4) Rd32=add(Rs32,#s8)*/
+ 0xff802000,
+ 0x74800000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A2_paddifnew*/
+ /*if (!Pu4.new) Rd32=add(Rs32,#s8)*/
+ 0xff802000,
+ 0x74802000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A2_paddit*/
+ /*if (Pu4) Rd32=add(Rs32,#s8)*/
+ 0xff802000,
+ 0x74000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A2_padditnew*/
+ /*if (Pu4.new) Rd32=add(Rs32,#s8)*/
+ 0xff802000,
+ 0x74002000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A2_subri*/
+ /*Rd32=sub(#s10,Rs32)*/
+ 0xffc00000,
+ 0x76400000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:A2_tfrsi*/
+ /*Rd32=#s16*/
+ 0xff000000,
+ 0x78000000,
+ 0x00df3fe0,
+ 0 },
+{ /*Tag:A4_cmpbgtui*/
+ /*Pd4=cmpb.gtu(Rs32,#u7)*/
+ 0xff601018,
+ 0xdd400000,
+ 0x00000fe0,
+ 0 },
+{ /*Tag:A4_cmpheqi*/
+ /*Pd4=cmph.eq(Rs32,#s8)*/
+ 0xff600018,
+ 0xdd000008,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A4_cmphgti*/
+ /*Pd4=cmph.gt(Rs32,#s8)*/
+ 0xff600018,
+ 0xdd200008,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A4_cmphgtui*/
+ /*Pd4=cmph.gtu(Rs32,#u7)*/
+ 0xff601018,
+ 0xdd400008,
+ 0x00000fe0,
+ 0 },
+{ /*Tag:A4_combineii*/
+ /*Rdd32=combine(#s8,#U6)*/
+ 0xff800000,
+ 0x7c800000,
+ 0x001f2000,
+ 0 },
+{ /*Tag:A4_combineir*/
+ /*Rdd32=combine(#s8,Rs32)*/
+ 0xff602000,
+ 0x73202000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A4_combineri*/
+ /*Rdd32=combine(Rs32,#s8)*/
+ 0xff602000,
+ 0x73002000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A4_rcmpeqi*/
+ /*Rd32=cmp.eq(Rs32,#s8)*/
+ 0xff602000,
+ 0x73402000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:A4_rcmpneqi*/
+ /*Rd32=!cmp.eq(Rs32,#s8)*/
+ 0xff602000,
+ 0x73602000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:C2_cmoveif*/
+ /*if (!Pu4) Rd32=#s12*/
+ 0xff902000,
+ 0x7e800000,
+ 0x000f1fe0,
+ 0 },
+{ /*Tag:C2_cmoveit*/
+ /*if (Pu4) Rd32=#s12*/
+ 0xff902000,
+ 0x7e000000,
+ 0x000f1fe0,
+ 0 },
+{ /*Tag:C2_cmovenewif*/
+ /*if (!Pu4.new) Rd32=#s12*/
+ 0xff902000,
+ 0x7e802000,
+ 0x000f1fe0,
+ 0 },
+{ /*Tag:C2_cmovenewit*/
+ /*if (Pu4.new) Rd32=#s12*/
+ 0xff902000,
+ 0x7e002000,
+ 0x000f1fe0,
+ 0 },
+{ /*Tag:C2_cmpeqi*/
+ /*Pd4=cmp.eq(Rs32,#s10)*/
+ 0xffc0001c,
+ 0x75000000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:C2_cmpgti*/
+ /*Pd4=cmp.gt(Rs32,#s10)*/
+ 0xffc0001c,
+ 0x75400000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:C2_cmpgtui*/
+ /*Pd4=cmp.gtu(Rs32,#u9)*/
+ 0xffe0001c,
+ 0x75800000,
+ 0x00003fe0,
+ 0 },
+{ /*Tag:C2_muxii*/
+ /*Rd32=mux(Pu4,#s8,#S8)*/
+ 0xfe000000,
+ 0x7a000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:C2_muxir*/
+ /*Rd32=mux(Pu4,Rs32,#s8)*/
+ 0xff802000,
+ 0x73000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:C2_muxri*/
+ /*Rd32=mux(Pu4,#s8,Rs32)*/
+ 0xff802000,
+ 0x73800000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:C4_addipc*/
+ /*Rd32=add(pc,#u6)*/
+ 0xffff0000,
+ 0x6a490000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:C4_cmpltei*/
+ /*Pd4=!cmp.gt(Rs32,#s10)*/
+ 0xffc0001c,
+ 0x75400010,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:C4_cmplteui*/
+ /*Pd4=!cmp.gtu(Rs32,#u9)*/
+ 0xffe0001c,
+ 0x75800010,
+ 0x00003fe0,
+ 0 },
+{ /*Tag:C4_cmpneqi*/
+ /*Pd4=!cmp.eq(Rs32,#s10)*/
+ 0xffc0001c,
+ 0x75000010,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:J2_call*/
+ /*call #r22:2*/
+ 0xfe000001,
+ 0x5a000000,
+ 0x01ff3ffe,
+ 0 },
+{ /*Tag:J2_callf*/
+ /*if (!Pu4) call #r15:2*/
+ 0xff200800,
+ 0x5d200000,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_callt*/
+ /*if (Pu4) call #r15:2*/
+ 0xff200800,
+ 0x5d000000,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jump*/
+ /*jump #r22:2*/
+ 0xfe000000,
+ 0x58000000,
+ 0x01ff3ffe,
+ 0 },
+{ /*Tag:J2_jumpf*/
+ /*if (!Pu4) jump:nt #r15:2*/
+ 0xff201800,
+ 0x5c200000,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumpfnew*/
+ /*if (!Pu4.new) jump:nt #r15:2*/
+ 0xff201800,
+ 0x5c200800,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumpfnewpt*/
+ /*if (!Pu4.new) jump:t #r15:2*/
+ 0xff201800,
+ 0x5c201800,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumpfpt*/
+ /*if (!Pu4) jump:t #r15:2*/
+ 0xff201800,
+ 0x5c201000,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumpt*/
+ /*if (Pu4) jump:nt #r15:2*/
+ 0xff201800,
+ 0x5c000000,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumptnew*/
+ /*if (Pu4.new) jump:nt #r15:2*/
+ 0xff201800,
+ 0x5c000800,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumptnewpt*/
+ /*if (Pu4.new) jump:t #r15:2*/
+ 0xff201800,
+ 0x5c001800,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_jumptpt*/
+ /*if (Pu4) jump:t #r15:2*/
+ 0xff201800,
+ 0x5c001000,
+ 0x00df20fe,
+ 0 },
+{ /*Tag:J2_loop0i*/
+ /*loop0(#r7:2,#U10)*/
+ 0xffe00000,
+ 0x69000000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_loop0r*/
+ /*loop0(#r7:2,Rs32)*/
+ 0xffe00000,
+ 0x60000000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_loop1i*/
+ /*loop1(#r7:2,#U10)*/
+ 0xffe00000,
+ 0x69200000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_loop1r*/
+ /*loop1(#r7:2,Rs32)*/
+ 0xffe00000,
+ 0x60200000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_ploop1si*/
+ /*p3=sp1loop0(#r7:2,#U10)*/
+ 0xffe00000,
+ 0x69a00000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_ploop1sr*/
+ /*p3=sp1loop0(#r7:2,Rs32)*/
+ 0xffe00000,
+ 0x60a00000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_ploop2si*/
+ /*p3=sp2loop0(#r7:2,#U10)*/
+ 0xffe00000,
+ 0x69c00000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_ploop2sr*/
+ /*p3=sp2loop0(#r7:2,Rs32)*/
+ 0xffe00000,
+ 0x60c00000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_ploop3si*/
+ /*p3=sp3loop0(#r7:2,#U10)*/
+ 0xffe00000,
+ 0x69e00000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J2_ploop3sr*/
+ /*p3=sp3loop0(#r7:2,Rs32)*/
+ 0xffe00000,
+ 0x60e00000,
+ 0x00001f18,
+ 0 },
+{ /*Tag:J4_cmpeq_f_jumpnv_nt*/
+ /*if (!cmp.eq(Ns8.new,Rt32)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x20400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_f_jumpnv_t*/
+ /*if (!cmp.eq(Ns8.new,Rt32)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x20402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_fp0_jump_nt*/
+ /*p0=cmp.eq(Rs16,Rt16); if (!p0.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_fp0_jump_t*/
+ /*p0=cmp.eq(Rs16,Rt16); if (!p0.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_fp1_jump_nt*/
+ /*p1=cmp.eq(Rs16,Rt16); if (!p1.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14401000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_fp1_jump_t*/
+ /*p1=cmp.eq(Rs16,Rt16); if (!p1.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14403000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_t_jumpnv_nt*/
+ /*if (cmp.eq(Ns8.new,Rt32)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x20000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_t_jumpnv_t*/
+ /*if (cmp.eq(Ns8.new,Rt32)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x20002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_tp0_jump_nt*/
+ /*p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_tp0_jump_t*/
+ /*p0=cmp.eq(Rs16,Rt16); if (p0.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_tp1_jump_nt*/
+ /*p1=cmp.eq(Rs16,Rt16); if (p1.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14001000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeq_tp1_jump_t*/
+ /*p1=cmp.eq(Rs16,Rt16); if (p1.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14003000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_f_jumpnv_nt*/
+ /*if (!cmp.eq(Ns8.new,#U5)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x24400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_f_jumpnv_t*/
+ /*if (!cmp.eq(Ns8.new,#U5)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x24402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_fp0_jump_nt*/
+ /*p0=cmp.eq(Rs16,#U5); if (!p0.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x10400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_fp0_jump_t*/
+ /*p0=cmp.eq(Rs16,#U5); if (!p0.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x10402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_fp1_jump_nt*/
+ /*p1=cmp.eq(Rs16,#U5); if (!p1.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x12400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_fp1_jump_t*/
+ /*p1=cmp.eq(Rs16,#U5); if (!p1.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x12402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_t_jumpnv_nt*/
+ /*if (cmp.eq(Ns8.new,#U5)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x24000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_t_jumpnv_t*/
+ /*if (cmp.eq(Ns8.new,#U5)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x24002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_tp0_jump_nt*/
+ /*p0=cmp.eq(Rs16,#U5); if (p0.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x10000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_tp0_jump_t*/
+ /*p0=cmp.eq(Rs16,#U5); if (p0.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x10002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_tp1_jump_nt*/
+ /*p1=cmp.eq(Rs16,#U5); if (p1.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x12000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqi_tp1_jump_t*/
+ /*p1=cmp.eq(Rs16,#U5); if (p1.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x12002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_f_jumpnv_nt*/
+ /*if (!cmp.eq(Ns8.new,#-1)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x26400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_f_jumpnv_t*/
+ /*if (!cmp.eq(Ns8.new,#-1)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x26402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_fp0_jump_nt*/
+ /*p0=cmp.eq(Rs16,#-1); if (!p0.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x11c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_fp0_jump_t*/
+ /*p0=cmp.eq(Rs16,#-1); if (!p0.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x11c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_fp1_jump_nt*/
+ /*p1=cmp.eq(Rs16,#-1); if (!p1.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x13c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_fp1_jump_t*/
+ /*p1=cmp.eq(Rs16,#-1); if (!p1.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x13c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_t_jumpnv_nt*/
+ /*if (cmp.eq(Ns8.new,#-1)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x26000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_t_jumpnv_t*/
+ /*if (cmp.eq(Ns8.new,#-1)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x26002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_tp0_jump_nt*/
+ /*p0=cmp.eq(Rs16,#-1); if (p0.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x11800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_tp0_jump_t*/
+ /*p0=cmp.eq(Rs16,#-1); if (p0.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x11802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_tp1_jump_nt*/
+ /*p1=cmp.eq(Rs16,#-1); if (p1.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x13800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpeqn1_tp1_jump_t*/
+ /*p1=cmp.eq(Rs16,#-1); if (p1.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x13802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_f_jumpnv_nt*/
+ /*if (!cmp.gt(Ns8.new,Rt32)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x20c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_f_jumpnv_t*/
+ /*if (!cmp.gt(Ns8.new,Rt32)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x20c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_fp0_jump_nt*/
+ /*p0=cmp.gt(Rs16,Rt16); if (!p0.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_fp0_jump_t*/
+ /*p0=cmp.gt(Rs16,Rt16); if (!p0.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_fp1_jump_nt*/
+ /*p1=cmp.gt(Rs16,Rt16); if (!p1.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14c01000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_fp1_jump_t*/
+ /*p1=cmp.gt(Rs16,Rt16); if (!p1.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14c03000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_t_jumpnv_nt*/
+ /*if (cmp.gt(Ns8.new,Rt32)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x20800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_t_jumpnv_t*/
+ /*if (cmp.gt(Ns8.new,Rt32)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x20802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_tp0_jump_nt*/
+ /*p0=cmp.gt(Rs16,Rt16); if (p0.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_tp0_jump_t*/
+ /*p0=cmp.gt(Rs16,Rt16); if (p0.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_tp1_jump_nt*/
+ /*p1=cmp.gt(Rs16,Rt16); if (p1.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x14801000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgt_tp1_jump_t*/
+ /*p1=cmp.gt(Rs16,Rt16); if (p1.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x14803000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_f_jumpnv_nt*/
+ /*if (!cmp.gt(Ns8.new,#U5)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x24c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_f_jumpnv_t*/
+ /*if (!cmp.gt(Ns8.new,#U5)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x24c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_fp0_jump_nt*/
+ /*p0=cmp.gt(Rs16,#U5); if (!p0.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x10c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_fp0_jump_t*/
+ /*p0=cmp.gt(Rs16,#U5); if (!p0.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x10c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_fp1_jump_nt*/
+ /*p1=cmp.gt(Rs16,#U5); if (!p1.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x12c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_fp1_jump_t*/
+ /*p1=cmp.gt(Rs16,#U5); if (!p1.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x12c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_t_jumpnv_nt*/
+ /*if (cmp.gt(Ns8.new,#U5)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x24800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_t_jumpnv_t*/
+ /*if (cmp.gt(Ns8.new,#U5)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x24802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_tp0_jump_nt*/
+ /*p0=cmp.gt(Rs16,#U5); if (p0.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x10800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_tp0_jump_t*/
+ /*p0=cmp.gt(Rs16,#U5); if (p0.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x10802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_tp1_jump_nt*/
+ /*p1=cmp.gt(Rs16,#U5); if (p1.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x12800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgti_tp1_jump_t*/
+ /*p1=cmp.gt(Rs16,#U5); if (p1.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x12802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_f_jumpnv_nt*/
+ /*if (!cmp.gt(Ns8.new,#-1)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x26c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_f_jumpnv_t*/
+ /*if (!cmp.gt(Ns8.new,#-1)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x26c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_fp0_jump_nt*/
+ /*p0=cmp.gt(Rs16,#-1); if (!p0.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x11c00100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_fp0_jump_t*/
+ /*p0=cmp.gt(Rs16,#-1); if (!p0.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x11c02100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_fp1_jump_nt*/
+ /*p1=cmp.gt(Rs16,#-1); if (!p1.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x13c00100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_fp1_jump_t*/
+ /*p1=cmp.gt(Rs16,#-1); if (!p1.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x13c02100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_t_jumpnv_nt*/
+ /*if (cmp.gt(Ns8.new,#-1)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x26800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_t_jumpnv_t*/
+ /*if (cmp.gt(Ns8.new,#-1)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x26802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_tp0_jump_nt*/
+ /*p0=cmp.gt(Rs16,#-1); if (p0.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x11800100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_tp0_jump_t*/
+ /*p0=cmp.gt(Rs16,#-1); if (p0.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x11802100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_tp1_jump_nt*/
+ /*p1=cmp.gt(Rs16,#-1); if (p1.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x13800100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtn1_tp1_jump_t*/
+ /*p1=cmp.gt(Rs16,#-1); if (p1.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x13802100,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_f_jumpnv_nt*/
+ /*if (!cmp.gtu(Ns8.new,Rt32)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x21400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_f_jumpnv_t*/
+ /*if (!cmp.gtu(Ns8.new,Rt32)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x21402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_fp0_jump_nt*/
+ /*p0=cmp.gtu(Rs16,Rt16); if (!p0.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x15400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_fp0_jump_t*/
+ /*p0=cmp.gtu(Rs16,Rt16); if (!p0.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x15402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_fp1_jump_nt*/
+ /*p1=cmp.gtu(Rs16,Rt16); if (!p1.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x15401000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_fp1_jump_t*/
+ /*p1=cmp.gtu(Rs16,Rt16); if (!p1.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x15403000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_t_jumpnv_nt*/
+ /*if (cmp.gtu(Ns8.new,Rt32)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x21000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_t_jumpnv_t*/
+ /*if (cmp.gtu(Ns8.new,Rt32)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x21002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_tp0_jump_nt*/
+ /*p0=cmp.gtu(Rs16,Rt16); if (p0.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x15000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_tp0_jump_t*/
+ /*p0=cmp.gtu(Rs16,Rt16); if (p0.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x15002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_tp1_jump_nt*/
+ /*p1=cmp.gtu(Rs16,Rt16); if (p1.new) jump:nt #r9:2*/
+ 0xffc03000,
+ 0x15001000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtu_tp1_jump_t*/
+ /*p1=cmp.gtu(Rs16,Rt16); if (p1.new) jump:t #r9:2*/
+ 0xffc03000,
+ 0x15003000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_f_jumpnv_nt*/
+ /*if (!cmp.gtu(Ns8.new,#U5)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x25400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_f_jumpnv_t*/
+ /*if (!cmp.gtu(Ns8.new,#U5)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x25402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_fp0_jump_nt*/
+ /*p0=cmp.gtu(Rs16,#U5); if (!p0.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x11400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_fp0_jump_t*/
+ /*p0=cmp.gtu(Rs16,#U5); if (!p0.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x11402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_fp1_jump_nt*/
+ /*p1=cmp.gtu(Rs16,#U5); if (!p1.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x13400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_fp1_jump_t*/
+ /*p1=cmp.gtu(Rs16,#U5); if (!p1.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x13402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_t_jumpnv_nt*/
+ /*if (cmp.gtu(Ns8.new,#U5)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x25000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_t_jumpnv_t*/
+ /*if (cmp.gtu(Ns8.new,#U5)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x25002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_tp0_jump_nt*/
+ /*p0=cmp.gtu(Rs16,#U5); if (p0.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x11000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_tp0_jump_t*/
+ /*p0=cmp.gtu(Rs16,#U5); if (p0.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x11002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_tp1_jump_nt*/
+ /*p1=cmp.gtu(Rs16,#U5); if (p1.new) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x13000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpgtui_tp1_jump_t*/
+ /*p1=cmp.gtu(Rs16,#U5); if (p1.new) jump:t #r9:2*/
+ 0xffc02000,
+ 0x13002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmplt_f_jumpnv_nt*/
+ /*if (!cmp.gt(Rt32,Ns8.new)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x21c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmplt_f_jumpnv_t*/
+ /*if (!cmp.gt(Rt32,Ns8.new)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x21c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmplt_t_jumpnv_nt*/
+ /*if (cmp.gt(Rt32,Ns8.new)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x21800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmplt_t_jumpnv_t*/
+ /*if (cmp.gt(Rt32,Ns8.new)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x21802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpltu_f_jumpnv_nt*/
+ /*if (!cmp.gtu(Rt32,Ns8.new)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x22400000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpltu_f_jumpnv_t*/
+ /*if (!cmp.gtu(Rt32,Ns8.new)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x22402000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpltu_t_jumpnv_nt*/
+ /*if (cmp.gtu(Rt32,Ns8.new)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x22000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_cmpltu_t_jumpnv_t*/
+ /*if (cmp.gtu(Rt32,Ns8.new)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x22002000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_jumpseti*/
+ /*Rd16=#U6 ; jump #r9:2*/
+ 0xff000000,
+ 0x16000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_jumpsetr*/
+ /*Rd16=Rs16 ; jump #r9:2*/
+ 0xff000000,
+ 0x17000000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_f_jumpnv_nt*/
+ /*if (!tstbit(Ns8.new,#0)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x25c00000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_f_jumpnv_t*/
+ /*if (!tstbit(Ns8.new,#0)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x25c02000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_fp0_jump_nt*/
+ /*p0=tstbit(Rs16,#0); if (!p0.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x11c00300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_fp0_jump_t*/
+ /*p0=tstbit(Rs16,#0); if (!p0.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x11c02300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_fp1_jump_nt*/
+ /*p1=tstbit(Rs16,#0); if (!p1.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x13c00300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_fp1_jump_t*/
+ /*p1=tstbit(Rs16,#0); if (!p1.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x13c02300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_t_jumpnv_nt*/
+ /*if (tstbit(Ns8.new,#0)) jump:nt #r9:2*/
+ 0xffc02000,
+ 0x25800000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_t_jumpnv_t*/
+ /*if (tstbit(Ns8.new,#0)) jump:t #r9:2*/
+ 0xffc02000,
+ 0x25802000,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_tp0_jump_nt*/
+ /*p0=tstbit(Rs16,#0); if (p0.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x11800300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_tp0_jump_t*/
+ /*p0=tstbit(Rs16,#0); if (p0.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x11802300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_tp1_jump_nt*/
+ /*p1=tstbit(Rs16,#0); if (p1.new) jump:nt #r9:2*/
+ 0xffc02300,
+ 0x13800300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:J4_tstbit0_tp1_jump_t*/
+ /*p1=tstbit(Rs16,#0); if (p1.new) jump:t #r9:2*/
+ 0xffc02300,
+ 0x13802300,
+ 0x003000fe,
+ 0 },
+{ /*Tag:L2_loadalignb_io*/
+ /*Ryy32=memb_fifo(Rs32+#s11:0)*/
+ 0xf9e00000,
+ 0x90800000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadalignh_io*/
+ /*Ryy32=memh_fifo(Rs32+#s11:1)*/
+ 0xf9e00000,
+ 0x90400000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadbsw2_io*/
+ /*Rd32=membh(Rs32+#s11:1)*/
+ 0xf9e00000,
+ 0x90200000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadbsw4_io*/
+ /*Rdd32=membh(Rs32+#s11:2)*/
+ 0xf9e00000,
+ 0x90e00000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadbzw2_io*/
+ /*Rd32=memubh(Rs32+#s11:1)*/
+ 0xf9e00000,
+ 0x90600000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadbzw4_io*/
+ /*Rdd32=memubh(Rs32+#s11:2)*/
+ 0xf9e00000,
+ 0x90a00000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadrb_io*/
+ /*Rd32=memb(Rs32+#s11:0)*/
+ 0xf9e00000,
+ 0x91000000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadrbgp*/
+ /*Rd32=memb(gp+#u16:0)*/
+ 0xf9e00000,
+ 0x49000000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:L2_loadrd_io*/
+ /*Rdd32=memd(Rs32+#s11:3)*/
+ 0xf9e00000,
+ 0x91c00000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadrdgp*/
+ /*Rdd32=memd(gp+#u16:3)*/
+ 0xf9e00000,
+ 0x49c00000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:L2_loadrh_io*/
+ /*Rd32=memh(Rs32+#s11:1)*/
+ 0xf9e00000,
+ 0x91400000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadrhgp*/
+ /*Rd32=memh(gp+#u16:1)*/
+ 0xf9e00000,
+ 0x49400000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:L2_loadri_io*/
+ /*Rd32=memw(Rs32+#s11:2)*/
+ 0xf9e00000,
+ 0x91800000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadrigp*/
+ /*Rd32=memw(gp+#u16:2)*/
+ 0xf9e00000,
+ 0x49800000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:L2_loadrub_io*/
+ /*Rd32=memub(Rs32+#s11:0)*/
+ 0xf9e00000,
+ 0x91200000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadrubgp*/
+ /*Rd32=memub(gp+#u16:0)*/
+ 0xf9e00000,
+ 0x49200000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:L2_loadruh_io*/
+ /*Rd32=memuh(Rs32+#s11:1)*/
+ 0xf9e00000,
+ 0x91600000,
+ 0x06003fe0,
+ 0 },
+{ /*Tag:L2_loadruhgp*/
+ /*Rd32=memuh(gp+#u16:1)*/
+ 0xf9e00000,
+ 0x49600000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:L2_ploadrbf_io*/
+ /*if (!Pt4) Rd32=memb(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x45000000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrbfnew_io*/
+ /*if (!Pt4.new) Rd32=memb(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x47000000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrbt_io*/
+ /*if (Pt4) Rd32=memb(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x41000000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrbtnew_io*/
+ /*if (Pt4.new) Rd32=memb(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x43000000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrdf_io*/
+ /*if (!Pt4) Rdd32=memd(Rs32+#u6:3)*/
+ 0xffe02000,
+ 0x45c00000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrdfnew_io*/
+ /*if (!Pt4.new) Rdd32=memd(Rs32+#u6:3)*/
+ 0xffe02000,
+ 0x47c00000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrdt_io*/
+ /*if (Pt4) Rdd32=memd(Rs32+#u6:3)*/
+ 0xffe02000,
+ 0x41c00000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrdtnew_io*/
+ /*if (Pt4.new) Rdd32=memd(Rs32+#u6:3)*/
+ 0xffe02000,
+ 0x43c00000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrhf_io*/
+ /*if (!Pt4) Rd32=memh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x45400000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrhfnew_io*/
+ /*if (!Pt4.new) Rd32=memh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x47400000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrht_io*/
+ /*if (Pt4) Rd32=memh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x41400000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrhtnew_io*/
+ /*if (Pt4.new) Rd32=memh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x43400000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrif_io*/
+ /*if (!Pt4) Rd32=memw(Rs32+#u6:2)*/
+ 0xffe02000,
+ 0x45800000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrifnew_io*/
+ /*if (!Pt4.new) Rd32=memw(Rs32+#u6:2)*/
+ 0xffe02000,
+ 0x47800000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrit_io*/
+ /*if (Pt4) Rd32=memw(Rs32+#u6:2)*/
+ 0xffe02000,
+ 0x41800000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadritnew_io*/
+ /*if (Pt4.new) Rd32=memw(Rs32+#u6:2)*/
+ 0xffe02000,
+ 0x43800000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrubf_io*/
+ /*if (!Pt4) Rd32=memub(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x45200000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrubfnew_io*/
+ /*if (!Pt4.new) Rd32=memub(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x47200000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrubt_io*/
+ /*if (Pt4) Rd32=memub(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x41200000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadrubtnew_io*/
+ /*if (Pt4.new) Rd32=memub(Rs32+#u6:0)*/
+ 0xffe02000,
+ 0x43200000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadruhf_io*/
+ /*if (!Pt4) Rd32=memuh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x45600000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadruhfnew_io*/
+ /*if (!Pt4.new) Rd32=memuh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x47600000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadruht_io*/
+ /*if (Pt4) Rd32=memuh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x41600000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L2_ploadruhtnew_io*/
+ /*if (Pt4.new) Rd32=memuh(Rs32+#u6:1)*/
+ 0xffe02000,
+ 0x43600000,
+ 0x000007e0,
+ 0 },
+{ /*Tag:L4_add_memopb_io*/
+ /*memb(Rs32+#u6:0)+=Rt32*/
+ 0xff602060,
+ 0x3e000000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_add_memoph_io*/
+ /*memh(Rs32+#u6:1)+=Rt32*/
+ 0xff602060,
+ 0x3e200000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_add_memopw_io*/
+ /*memw(Rs32+#u6:2)+=Rt32*/
+ 0xff602060,
+ 0x3e400000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_and_memopb_io*/
+ /*memb(Rs32+#u6:0)&=Rt32*/
+ 0xff602060,
+ 0x3e000040,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_and_memoph_io*/
+ /*memh(Rs32+#u6:1)&=Rt32*/
+ 0xff602060,
+ 0x3e200040,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_and_memopw_io*/
+ /*memw(Rs32+#u6:2)&=Rt32*/
+ 0xff602060,
+ 0x3e400040,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_iadd_memopb_io*/
+ /*memb(Rs32+#u6:0)+=#U5*/
+ 0xff602060,
+ 0x3f000000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_iadd_memoph_io*/
+ /*memh(Rs32+#u6:1)+=#U5*/
+ 0xff602060,
+ 0x3f200000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_iadd_memopw_io*/
+ /*memw(Rs32+#u6:2)+=#U5*/
+ 0xff602060,
+ 0x3f400000,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_iand_memopb_io*/
+ /*memb(Rs32+#u6:0)=clrbit(#U5)*/
+ 0xff602060,
+ 0x3f000040,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_iand_memoph_io*/
+ /*memh(Rs32+#u6:1)=clrbit(#U5)*/
+ 0xff602060,
+ 0x3f200040,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_iand_memopw_io*/
+ /*memw(Rs32+#u6:2)=clrbit(#U5)*/
+ 0xff602060,
+ 0x3f400040,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_ior_memopb_io*/
+ /*memb(Rs32+#u6:0)=setbit(#U5)*/
+ 0xff602060,
+ 0x3f000060,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_ior_memoph_io*/
+ /*memh(Rs32+#u6:1)=setbit(#U5)*/
+ 0xff602060,
+ 0x3f200060,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_ior_memopw_io*/
+ /*memw(Rs32+#u6:2)=setbit(#U5)*/
+ 0xff602060,
+ 0x3f400060,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_isub_memopb_io*/
+ /*memb(Rs32+#u6:0)-=#U5*/
+ 0xff602060,
+ 0x3f000020,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_isub_memoph_io*/
+ /*memh(Rs32+#u6:1)-=#U5*/
+ 0xff602060,
+ 0x3f200020,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_isub_memopw_io*/
+ /*memw(Rs32+#u6:2)-=#U5*/
+ 0xff602060,
+ 0x3f400020,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_loadalignb_ap*/
+ /*Ryy32=memb_fifo(Re32=#U6)*/
+ 0xffe03000,
+ 0x9a801000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadalignb_ur*/
+ /*Ryy32=memb_fifo(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9c801000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadalignh_ap*/
+ /*Ryy32=memh_fifo(Re32=#U6)*/
+ 0xffe03000,
+ 0x9a401000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadalignh_ur*/
+ /*Ryy32=memh_fifo(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9c401000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbsw2_ap*/
+ /*Rd32=membh(Re32=#U6)*/
+ 0xffe03000,
+ 0x9a201000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbsw2_ur*/
+ /*Rd32=membh(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9c201000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbsw4_ap*/
+ /*Rdd32=membh(Re32=#U6)*/
+ 0xffe03000,
+ 0x9ae01000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbsw4_ur*/
+ /*Rdd32=membh(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9ce01000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbzw2_ap*/
+ /*Rd32=memubh(Re32=#U6)*/
+ 0xffe03000,
+ 0x9a601000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbzw2_ur*/
+ /*Rd32=memubh(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9c601000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbzw4_ap*/
+ /*Rdd32=memubh(Re32=#U6)*/
+ 0xffe03000,
+ 0x9aa01000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadbzw4_ur*/
+ /*Rdd32=memubh(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9ca01000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrb_ap*/
+ /*Rd32=memb(Re32=#U6)*/
+ 0xffe03000,
+ 0x9b001000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrb_ur*/
+ /*Rd32=memb(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9d001000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrd_ap*/
+ /*Rdd32=memd(Re32=#U6)*/
+ 0xffe03000,
+ 0x9bc01000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrd_ur*/
+ /*Rdd32=memd(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9dc01000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrh_ap*/
+ /*Rd32=memh(Re32=#U6)*/
+ 0xffe03000,
+ 0x9b401000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrh_ur*/
+ /*Rd32=memh(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9d401000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadri_ap*/
+ /*Rd32=memw(Re32=#U6)*/
+ 0xffe03000,
+ 0x9b801000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadri_ur*/
+ /*Rd32=memw(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9d801000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrub_ap*/
+ /*Rd32=memub(Re32=#U6)*/
+ 0xffe03000,
+ 0x9b201000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadrub_ur*/
+ /*Rd32=memub(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9d201000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadruh_ap*/
+ /*Rd32=memuh(Re32=#U6)*/
+ 0xffe03000,
+ 0x9b601000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_loadruh_ur*/
+ /*Rd32=memuh(Rt32<<#u2+#U6)*/
+ 0xffe01000,
+ 0x9d601000,
+ 0x00000f60,
+ 0 },
+{ /*Tag:L4_or_memopb_io*/
+ /*memb(Rs32+#u6:0)|=Rt32*/
+ 0xff602060,
+ 0x3e000060,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_or_memoph_io*/
+ /*memh(Rs32+#u6:1)|=Rt32*/
+ 0xff602060,
+ 0x3e200060,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_or_memopw_io*/
+ /*memw(Rs32+#u6:2)|=Rt32*/
+ 0xff602060,
+ 0x3e400060,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_ploadrbf_abs*/
+ /*if (!Pt4) Rd32=memb(#u6)*/
+ 0xffe03880,
+ 0x9f002880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrbfnew_abs*/
+ /*if (!Pt4.new) Rd32=memb(#u6)*/
+ 0xffe03880,
+ 0x9f003880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrbt_abs*/
+ /*if (Pt4) Rd32=memb(#u6)*/
+ 0xffe03880,
+ 0x9f002080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrbtnew_abs*/
+ /*if (Pt4.new) Rd32=memb(#u6)*/
+ 0xffe03880,
+ 0x9f003080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrdf_abs*/
+ /*if (!Pt4) Rdd32=memd(#u6)*/
+ 0xffe03880,
+ 0x9fc02880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrdfnew_abs*/
+ /*if (!Pt4.new) Rdd32=memd(#u6)*/
+ 0xffe03880,
+ 0x9fc03880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrdt_abs*/
+ /*if (Pt4) Rdd32=memd(#u6)*/
+ 0xffe03880,
+ 0x9fc02080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrdtnew_abs*/
+ /*if (Pt4.new) Rdd32=memd(#u6)*/
+ 0xffe03880,
+ 0x9fc03080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrhf_abs*/
+ /*if (!Pt4) Rd32=memh(#u6)*/
+ 0xffe03880,
+ 0x9f402880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrhfnew_abs*/
+ /*if (!Pt4.new) Rd32=memh(#u6)*/
+ 0xffe03880,
+ 0x9f403880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrht_abs*/
+ /*if (Pt4) Rd32=memh(#u6)*/
+ 0xffe03880,
+ 0x9f402080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrhtnew_abs*/
+ /*if (Pt4.new) Rd32=memh(#u6)*/
+ 0xffe03880,
+ 0x9f403080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrif_abs*/
+ /*if (!Pt4) Rd32=memw(#u6)*/
+ 0xffe03880,
+ 0x9f802880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrifnew_abs*/
+ /*if (!Pt4.new) Rd32=memw(#u6)*/
+ 0xffe03880,
+ 0x9f803880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrit_abs*/
+ /*if (Pt4) Rd32=memw(#u6)*/
+ 0xffe03880,
+ 0x9f802080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadritnew_abs*/
+ /*if (Pt4.new) Rd32=memw(#u6)*/
+ 0xffe03880,
+ 0x9f803080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrubf_abs*/
+ /*if (!Pt4) Rd32=memub(#u6)*/
+ 0xffe03880,
+ 0x9f202880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrubfnew_abs*/
+ /*if (!Pt4.new) Rd32=memub(#u6)*/
+ 0xffe03880,
+ 0x9f203880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrubt_abs*/
+ /*if (Pt4) Rd32=memub(#u6)*/
+ 0xffe03880,
+ 0x9f202080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadrubtnew_abs*/
+ /*if (Pt4.new) Rd32=memub(#u6)*/
+ 0xffe03880,
+ 0x9f203080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadruhf_abs*/
+ /*if (!Pt4) Rd32=memuh(#u6)*/
+ 0xffe03880,
+ 0x9f602880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadruhfnew_abs*/
+ /*if (!Pt4.new) Rd32=memuh(#u6)*/
+ 0xffe03880,
+ 0x9f603880,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadruht_abs*/
+ /*if (Pt4) Rd32=memuh(#u6)*/
+ 0xffe03880,
+ 0x9f602080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_ploadruhtnew_abs*/
+ /*if (Pt4.new) Rd32=memuh(#u6)*/
+ 0xffe03880,
+ 0x9f603080,
+ 0x001f0100,
+ 0 },
+{ /*Tag:L4_sub_memopb_io*/
+ /*memb(Rs32+#u6:0)-=Rt32*/
+ 0xff602060,
+ 0x3e000020,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_sub_memoph_io*/
+ /*memh(Rs32+#u6:1)-=Rt32*/
+ 0xff602060,
+ 0x3e200020,
+ 0x00001f80,
+ 0 },
+{ /*Tag:L4_sub_memopw_io*/
+ /*memw(Rs32+#u6:2)-=Rt32*/
+ 0xff602060,
+ 0x3e400020,
+ 0x00001f80,
+ 0 },
+{ /*Tag:M2_accii*/
+ /*Rx32+=add(Rs32,#s8)*/
+ 0xff802000,
+ 0xe2000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:M2_macsin*/
+ /*Rx32-=mpyi(Rs32,#u8)*/
+ 0xff802000,
+ 0xe1800000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:M2_macsip*/
+ /*Rx32+=mpyi(Rs32,#u8)*/
+ 0xff802000,
+ 0xe1000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:M2_mpysip*/
+ /*Rd32=+mpyi(Rs32,#u8)*/
+ 0xff802000,
+ 0xe0000000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:M2_naccii*/
+ /*Rx32-=add(Rs32,#s8)*/
+ 0xff802000,
+ 0xe2800000,
+ 0x00001fe0,
+ 0 },
+{ /*Tag:M4_mpyri_addi*/
+ /*Rd32=add(#u6,mpyi(Rs32,#U6))*/
+ 0xff000000,
+ 0xd8000000,
+ 0x006020e0,
+ 0 },
+{ /*Tag:M4_mpyri_addr*/
+ /*Rd32=add(Ru32,mpyi(Rs32,#u6))*/
+ 0xff800000,
+ 0xdf800000,
+ 0x006020e0,
+ 0 },
+{ /*Tag:M4_mpyrr_addi*/
+ /*Rd32=add(#u6,mpyi(Rs32,Rt32))*/
+ 0xff800000,
+ 0xd7000000,
+ 0x006020e0,
+ 0 },
+{ /*Tag:PS_loadrbabs*/
+ /*Rd32=memb(#u16:0)*/
+ 0xf9e00000,
+ 0x49000000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:PS_loadrdabs*/
+ /*Rdd32=memd(#u16:3)*/
+ 0xf9e00000,
+ 0x49c00000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:PS_loadrhabs*/
+ /*Rd32=memh(#u16:1)*/
+ 0xf9e00000,
+ 0x49400000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:PS_loadriabs*/
+ /*Rd32=memw(#u16:2)*/
+ 0xf9e00000,
+ 0x49800000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:PS_loadrubabs*/
+ /*Rd32=memub(#u16:0)*/
+ 0xf9e00000,
+ 0x49200000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:PS_loadruhabs*/
+ /*Rd32=memuh(#u16:1)*/
+ 0xf9e00000,
+ 0x49600000,
+ 0x061f3fe0,
+ 0 },
+{ /*Tag:PS_storerbabs*/
+ /*memb(#u16:0)=Rt32*/
+ 0xf9e00000,
+ 0x48000000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storerbnewabs*/
+ /*memb(#u16:0)=Nt8.new*/
+ 0xf9e01800,
+ 0x48a00000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storerdabs*/
+ /*memd(#u16:3)=Rtt32*/
+ 0xf9e00000,
+ 0x48c00000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storerfabs*/
+ /*memh(#u16:1)=Rt32.h*/
+ 0xf9e00000,
+ 0x48600000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storerhabs*/
+ /*memh(#u16:1)=Rt32*/
+ 0xf9e00000,
+ 0x48400000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storerhnewabs*/
+ /*memh(#u16:1)=Nt8.new*/
+ 0xf9e01800,
+ 0x48a00800,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storeriabs*/
+ /*memw(#u16:2)=Rt32*/
+ 0xf9e00000,
+ 0x48800000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:PS_storerinewabs*/
+ /*memw(#u16:2)=Nt8.new*/
+ 0xf9e01800,
+ 0x48a01000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_pstorerbf_io*/
+ /*if (!Pv4) memb(Rs32+#u6:0)=Rt32*/
+ 0xffe00004,
+ 0x44000000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerbnewf_io*/
+ /*if (!Pv4) memb(Rs32+#u6:0)=Nt8.new*/
+ 0xffe01804,
+ 0x44a00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerbnewt_io*/
+ /*if (Pv4) memb(Rs32+#u6:0)=Nt8.new*/
+ 0xffe01804,
+ 0x40a00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerbt_io*/
+ /*if (Pv4) memb(Rs32+#u6:0)=Rt32*/
+ 0xffe00004,
+ 0x40000000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerdf_io*/
+ /*if (!Pv4) memd(Rs32+#u6:3)=Rtt32*/
+ 0xffe00004,
+ 0x44c00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerdt_io*/
+ /*if (Pv4) memd(Rs32+#u6:3)=Rtt32*/
+ 0xffe00004,
+ 0x40c00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerff_io*/
+ /*if (!Pv4) memh(Rs32+#u6:1)=Rt32.h*/
+ 0xffe00004,
+ 0x44600000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerft_io*/
+ /*if (Pv4) memh(Rs32+#u6:1)=Rt32.h*/
+ 0xffe00004,
+ 0x40600000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerhf_io*/
+ /*if (!Pv4) memh(Rs32+#u6:1)=Rt32*/
+ 0xffe00004,
+ 0x44400000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerhnewf_io*/
+ /*if (!Pv4) memh(Rs32+#u6:1)=Nt8.new*/
+ 0xffe01804,
+ 0x44a00800,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerhnewt_io*/
+ /*if (Pv4) memh(Rs32+#u6:1)=Nt8.new*/
+ 0xffe01804,
+ 0x40a00800,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerht_io*/
+ /*if (Pv4) memh(Rs32+#u6:1)=Rt32*/
+ 0xffe00004,
+ 0x40400000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerif_io*/
+ /*if (!Pv4) memw(Rs32+#u6:2)=Rt32*/
+ 0xffe00004,
+ 0x44800000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerinewf_io*/
+ /*if (!Pv4) memw(Rs32+#u6:2)=Nt8.new*/
+ 0xffe01804,
+ 0x44a01000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerinewt_io*/
+ /*if (Pv4) memw(Rs32+#u6:2)=Nt8.new*/
+ 0xffe01804,
+ 0x40a01000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_pstorerit_io*/
+ /*if (Pv4) memw(Rs32+#u6:2)=Rt32*/
+ 0xffe00004,
+ 0x40800000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S2_storerb_io*/
+ /*memb(Rs32+#s11:0)=Rt32*/
+ 0xf9e00000,
+ 0xa1000000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerbgp*/
+ /*memb(gp+#u16:0)=Rt32*/
+ 0xf9e00000,
+ 0x48000000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storerbnew_io*/
+ /*memb(Rs32+#s11:0)=Nt8.new*/
+ 0xf9e01800,
+ 0xa1a00000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerbnewgp*/
+ /*memb(gp+#u16:0)=Nt8.new*/
+ 0xf9e01800,
+ 0x48a00000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storerd_io*/
+ /*memd(Rs32+#s11:3)=Rtt32*/
+ 0xf9e00000,
+ 0xa1c00000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerdgp*/
+ /*memd(gp+#u16:3)=Rtt32*/
+ 0xf9e00000,
+ 0x48c00000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storerf_io*/
+ /*memh(Rs32+#s11:1)=Rt32.h*/
+ 0xf9e00000,
+ 0xa1600000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerfgp*/
+ /*memh(gp+#u16:1)=Rt32.h*/
+ 0xf9e00000,
+ 0x48600000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storerh_io*/
+ /*memh(Rs32+#s11:1)=Rt32*/
+ 0xf9e00000,
+ 0xa1400000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerhgp*/
+ /*memh(gp+#u16:1)=Rt32*/
+ 0xf9e00000,
+ 0x48400000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storerhnew_io*/
+ /*memh(Rs32+#s11:1)=Nt8.new*/
+ 0xf9e01800,
+ 0xa1a00800,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerhnewgp*/
+ /*memh(gp+#u16:1)=Nt8.new*/
+ 0xf9e01800,
+ 0x48a00800,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storeri_io*/
+ /*memw(Rs32+#s11:2)=Rt32*/
+ 0xf9e00000,
+ 0xa1800000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerigp*/
+ /*memw(gp+#u16:2)=Rt32*/
+ 0xf9e00000,
+ 0x48800000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S2_storerinew_io*/
+ /*memw(Rs32+#s11:2)=Nt8.new*/
+ 0xf9e01800,
+ 0xa1a01000,
+ 0x060020ff,
+ 0 },
+{ /*Tag:S2_storerinewgp*/
+ /*memw(gp+#u16:2)=Nt8.new*/
+ 0xf9e01800,
+ 0x48a01000,
+ 0x061f20ff,
+ 0 },
+{ /*Tag:S4_addaddi*/
+ /*Rd32=add(Rs32,add(Ru32,#s6))*/
+ 0xff800000,
+ 0xdb000000,
+ 0x006020e0,
+ 0 },
+{ /*Tag:S4_addi_asl_ri*/
+ /*Rx32=add(#u8,asl(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000004,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_addi_lsr_ri*/
+ /*Rx32=add(#u8,lsr(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000014,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_andi_asl_ri*/
+ /*Rx32=and(#u8,asl(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000000,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_andi_lsr_ri*/
+ /*Rx32=and(#u8,lsr(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000010,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_or_andi*/
+ /*Rx32|=and(Rs32,#s10)*/
+ 0xffc00000,
+ 0xda000000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:S4_or_andix*/
+ /*Rx32=or(Ru32,and(Rx32,#s10))*/
+ 0xffc00000,
+ 0xda400000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:S4_or_ori*/
+ /*Rx32|=or(Rs32,#s10)*/
+ 0xffc00000,
+ 0xda800000,
+ 0x00203fe0,
+ 0 },
+{ /*Tag:S4_ori_asl_ri*/
+ /*Rx32=or(#u8,asl(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000002,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_ori_lsr_ri*/
+ /*Rx32=or(#u8,lsr(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000012,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_pstorerbf_abs*/
+ /*if (!Pv4) memb(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf000084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbfnew_abs*/
+ /*if (!Pv4.new) memb(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf002084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbfnew_io*/
+ /*if (!Pv4.new) memb(Rs32+#u6:0)=Rt32*/
+ 0xffe00004,
+ 0x46000000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerbnewf_abs*/
+ /*if (!Pv4) memb(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa00084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbnewfnew_abs*/
+ /*if (!Pv4.new) memb(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa02084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbnewfnew_io*/
+ /*if (!Pv4.new) memb(Rs32+#u6:0)=Nt8.new*/
+ 0xffe01804,
+ 0x46a00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerbnewt_abs*/
+ /*if (Pv4) memb(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa00080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbnewtnew_abs*/
+ /*if (Pv4.new) memb(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa02080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbnewtnew_io*/
+ /*if (Pv4.new) memb(Rs32+#u6:0)=Nt8.new*/
+ 0xffe01804,
+ 0x42a00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerbt_abs*/
+ /*if (Pv4) memb(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf000080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbtnew_abs*/
+ /*if (Pv4.new) memb(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf002080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerbtnew_io*/
+ /*if (Pv4.new) memb(Rs32+#u6:0)=Rt32*/
+ 0xffe00004,
+ 0x42000000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerdf_abs*/
+ /*if (!Pv4) memd(#u6)=Rtt32*/
+ 0xffe02084,
+ 0xafc00084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerdfnew_abs*/
+ /*if (!Pv4.new) memd(#u6)=Rtt32*/
+ 0xffe02084,
+ 0xafc02084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerdfnew_io*/
+ /*if (!Pv4.new) memd(Rs32+#u6:3)=Rtt32*/
+ 0xffe00004,
+ 0x46c00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerdt_abs*/
+ /*if (Pv4) memd(#u6)=Rtt32*/
+ 0xffe02084,
+ 0xafc00080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerdtnew_abs*/
+ /*if (Pv4.new) memd(#u6)=Rtt32*/
+ 0xffe02084,
+ 0xafc02080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerdtnew_io*/
+ /*if (Pv4.new) memd(Rs32+#u6:3)=Rtt32*/
+ 0xffe00004,
+ 0x42c00000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerff_abs*/
+ /*if (!Pv4) memh(#u6)=Rt32.h*/
+ 0xffe02084,
+ 0xaf600084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerffnew_abs*/
+ /*if (!Pv4.new) memh(#u6)=Rt32.h*/
+ 0xffe02084,
+ 0xaf602084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerffnew_io*/
+ /*if (!Pv4.new) memh(Rs32+#u6:1)=Rt32.h*/
+ 0xffe00004,
+ 0x46600000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerft_abs*/
+ /*if (Pv4) memh(#u6)=Rt32.h*/
+ 0xffe02084,
+ 0xaf600080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerftnew_abs*/
+ /*if (Pv4.new) memh(#u6)=Rt32.h*/
+ 0xffe02084,
+ 0xaf602080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerftnew_io*/
+ /*if (Pv4.new) memh(Rs32+#u6:1)=Rt32.h*/
+ 0xffe00004,
+ 0x42600000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerhf_abs*/
+ /*if (!Pv4) memh(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf400084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhfnew_abs*/
+ /*if (!Pv4.new) memh(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf402084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhfnew_io*/
+ /*if (!Pv4.new) memh(Rs32+#u6:1)=Rt32*/
+ 0xffe00004,
+ 0x46400000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerhnewf_abs*/
+ /*if (!Pv4) memh(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa00884,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhnewfnew_abs*/
+ /*if (!Pv4.new) memh(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa02884,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhnewfnew_io*/
+ /*if (!Pv4.new) memh(Rs32+#u6:1)=Nt8.new*/
+ 0xffe01804,
+ 0x46a00800,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerhnewt_abs*/
+ /*if (Pv4) memh(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa00880,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhnewtnew_abs*/
+ /*if (Pv4.new) memh(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa02880,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhnewtnew_io*/
+ /*if (Pv4.new) memh(Rs32+#u6:1)=Nt8.new*/
+ 0xffe01804,
+ 0x42a00800,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerht_abs*/
+ /*if (Pv4) memh(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf400080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhtnew_abs*/
+ /*if (Pv4.new) memh(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf402080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerhtnew_io*/
+ /*if (Pv4.new) memh(Rs32+#u6:1)=Rt32*/
+ 0xffe00004,
+ 0x42400000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerif_abs*/
+ /*if (!Pv4) memw(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf800084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerifnew_abs*/
+ /*if (!Pv4.new) memw(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf802084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerifnew_io*/
+ /*if (!Pv4.new) memw(Rs32+#u6:2)=Rt32*/
+ 0xffe00004,
+ 0x46800000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerinewf_abs*/
+ /*if (!Pv4) memw(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa01084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerinewfnew_abs*/
+ /*if (!Pv4.new) memw(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa03084,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerinewfnew_io*/
+ /*if (!Pv4.new) memw(Rs32+#u6:2)=Nt8.new*/
+ 0xffe01804,
+ 0x46a01000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerinewt_abs*/
+ /*if (Pv4) memw(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa01080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerinewtnew_abs*/
+ /*if (Pv4.new) memw(#u6)=Nt8.new*/
+ 0xffe03884,
+ 0xafa03080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstorerinewtnew_io*/
+ /*if (Pv4.new) memw(Rs32+#u6:2)=Nt8.new*/
+ 0xffe01804,
+ 0x42a01000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_pstorerit_abs*/
+ /*if (Pv4) memw(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf800080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstoreritnew_abs*/
+ /*if (Pv4.new) memw(#u6)=Rt32*/
+ 0xffe02084,
+ 0xaf802080,
+ 0x00030078,
+ 0 },
+{ /*Tag:S4_pstoreritnew_io*/
+ /*if (Pv4.new) memw(Rs32+#u6:2)=Rt32*/
+ 0xffe00004,
+ 0x42800000,
+ 0x000020f8,
+ 0 },
+{ /*Tag:S4_storeirb_io*/
+ /*memb(Rs32+#u6:0)=#S8*/
+ 0xfe600000,
+ 0x3c000000,
+ 0x0000207f,
+ 0 },
+{ /*Tag:S4_storeirbf_io*/
+ /*if (!Pv4) memb(Rs32+#u6:0)=#S6*/
+ 0xffe00000,
+ 0x38800000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirbfnew_io*/
+ /*if (!Pv4.new) memb(Rs32+#u6:0)=#S6*/
+ 0xffe00000,
+ 0x39800000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirbt_io*/
+ /*if (Pv4) memb(Rs32+#u6:0)=#S6*/
+ 0xffe00000,
+ 0x38000000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirbtnew_io*/
+ /*if (Pv4.new) memb(Rs32+#u6:0)=#S6*/
+ 0xffe00000,
+ 0x39000000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirh_io*/
+ /*memh(Rs32+#u6:1)=#S8*/
+ 0xfe600000,
+ 0x3c200000,
+ 0x0000207f,
+ 0 },
+{ /*Tag:S4_storeirhf_io*/
+ /*if (!Pv4) memh(Rs32+#u6:1)=#S6*/
+ 0xffe00000,
+ 0x38a00000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirhfnew_io*/
+ /*if (!Pv4.new) memh(Rs32+#u6:1)=#S6*/
+ 0xffe00000,
+ 0x39a00000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirht_io*/
+ /*if (Pv4) memh(Rs32+#u6:1)=#S6*/
+ 0xffe00000,
+ 0x38200000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirhtnew_io*/
+ /*if (Pv4.new) memh(Rs32+#u6:1)=#S6*/
+ 0xffe00000,
+ 0x39200000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeiri_io*/
+ /*memw(Rs32+#u6:2)=#S8*/
+ 0xfe600000,
+ 0x3c400000,
+ 0x0000207f,
+ 0 },
+{ /*Tag:S4_storeirif_io*/
+ /*if (!Pv4) memw(Rs32+#u6:2)=#S6*/
+ 0xffe00000,
+ 0x38c00000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirifnew_io*/
+ /*if (!Pv4.new) memw(Rs32+#u6:2)=#S6*/
+ 0xffe00000,
+ 0x39c00000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeirit_io*/
+ /*if (Pv4) memw(Rs32+#u6:2)=#S6*/
+ 0xffe00000,
+ 0x38400000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storeiritnew_io*/
+ /*if (Pv4.new) memw(Rs32+#u6:2)=#S6*/
+ 0xffe00000,
+ 0x39400000,
+ 0x0000201f,
+ 0 },
+{ /*Tag:S4_storerb_ap*/
+ /*memb(Re32=#U6)=Rt32*/
+ 0xffe02080,
+ 0xab000080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerb_ur*/
+ /*memb(Ru32<<#u2+#U6)=Rt32*/
+ 0xffe00080,
+ 0xad000080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerbnew_ap*/
+ /*memb(Re32=#U6)=Nt8.new*/
+ 0xffe03880,
+ 0xaba00080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerbnew_ur*/
+ /*memb(Ru32<<#u2+#U6)=Nt8.new*/
+ 0xffe01880,
+ 0xada00080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerd_ap*/
+ /*memd(Re32=#U6)=Rtt32*/
+ 0xffe02080,
+ 0xabc00080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerd_ur*/
+ /*memd(Ru32<<#u2+#U6)=Rtt32*/
+ 0xffe00080,
+ 0xadc00080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerf_ap*/
+ /*memh(Re32=#U6)=Rt32.h*/
+ 0xffe02080,
+ 0xab600080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerf_ur*/
+ /*memh(Ru32<<#u2+#U6)=Rt32.h*/
+ 0xffe00080,
+ 0xad600080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerh_ap*/
+ /*memh(Re32=#U6)=Rt32*/
+ 0xffe02080,
+ 0xab400080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerh_ur*/
+ /*memh(Ru32<<#u2+#U6)=Rt32*/
+ 0xffe00080,
+ 0xad400080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerhnew_ap*/
+ /*memh(Re32=#U6)=Nt8.new*/
+ 0xffe03880,
+ 0xaba00880,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerhnew_ur*/
+ /*memh(Ru32<<#u2+#U6)=Nt8.new*/
+ 0xffe01880,
+ 0xada00880,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storeri_ap*/
+ /*memw(Re32=#U6)=Rt32*/
+ 0xffe02080,
+ 0xab800080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storeri_ur*/
+ /*memw(Ru32<<#u2+#U6)=Rt32*/
+ 0xffe00080,
+ 0xad800080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerinew_ap*/
+ /*memw(Re32=#U6)=Nt8.new*/
+ 0xffe03880,
+ 0xaba01080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_storerinew_ur*/
+ /*memw(Ru32<<#u2+#U6)=Nt8.new*/
+ 0xffe01880,
+ 0xada01080,
+ 0x0000003f,
+ 0 },
+{ /*Tag:S4_subaddi*/
+ /*Rd32=add(Rs32,sub(#s6,Ru32))*/
+ 0xff800000,
+ 0xdb800000,
+ 0x006020e0,
+ 0 },
+{ /*Tag:S4_subi_asl_ri*/
+ /*Rx32=sub(#u8,asl(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000006,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:S4_subi_lsr_ri*/
+ /*Rx32=sub(#u8,lsr(Rx32,#U5))*/
+ 0xff000016,
+ 0xde000016,
+ 0x00e020e8,
+ 0 },
+{ /*Tag:SA1_addi*/
+ /*Rx16=add(Rx16,#s7)*/
+ 0xf8002000,
+ 0x20002000,
+ 0x07f00000,
+ 1 },
+{ /*Tag:SA1_addi*/
+ /*Rx16=add(Rx16,#s7)*/
+ 0xf8002000,
+ 0x40000000,
+ 0x07f00000,
+ 1 },
+{ /*Tag:SA1_addi*/
+ /*Rx16=add(Rx16,#s7)*/
+ 0xf8002000,
+ 0x40002000,
+ 0x07f00000,
+ 1 },
+{ /*Tag:SA1_addi*/
+ /*Rx16=add(Rx16,#s7)*/
+ 0xf8002000,
+ 0x60000000,
+ 0x07f00000,
+ 1 },
+{ /*Tag:SA1_addi*/
+ /*Rx16=add(Rx16,#s7)*/
+ 0xf8002000,
+ 0x60002000,
+ 0x07f00000,
+ 1 },
+{ /*Tag:SA1_seti*/
+ /*Rd16=#u6*/
+ 0xfc002000,
+ 0x28002000,
+ 0x03f00000,
+ 1 },
+{ /*Tag:SA1_seti*/
+ /*Rd16=#u6*/
+ 0xfc002000,
+ 0x48000000,
+ 0x03f00000,
+ 1 },
+{ /*Tag:SA1_seti*/
+ /*Rd16=#u6*/
+ 0xfc002000,
+ 0x48002000,
+ 0x03f00000,
+ 1 },
+{ /*Tag:SA1_seti*/
+ /*Rd16=#u6*/
+ 0xfc002000,
+ 0x68000000,
+ 0x03f00000,
+ 1 },
+{ /*Tag:SA1_seti*/
+ /*Rd16=#u6*/
+ 0xfc002000,
+ 0x68002000,
+ 0x03f00000,
+ 1 },
+{ /*Tag:dup_A2_addi*/
+ /*Rd32=add(Rs32,#s16)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_A2_andir*/
+ /*Rd32=and(Rs32,#s10)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_A2_combineii*/
+ /*Rdd32=combine(#s8,#S8)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_A2_tfrsi*/
+ /*Rd32=#s16*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_A4_combineii*/
+ /*Rdd32=combine(#s8,#U6)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00002404,
+ 0 },
+{ /*Tag:dup_A4_combineir*/
+ /*Rdd32=combine(#s8,Rs32)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_A4_combineri*/
+ /*Rdd32=combine(Rs32,#s8)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_C2_cmoveif*/
+ /*if (!Pu4) Rd32=#s12*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_C2_cmoveit*/
+ /*if (Pu4) Rd32=#s12*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_C2_cmovenewif*/
+ /*if (!Pu4.new) Rd32=#s12*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_C2_cmovenewit*/
+ /*if (Pu4.new) Rd32=#s12*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_C2_cmpeqi*/
+ /*Pd4=cmp.eq(Rs32,#s10)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_L2_loadrb_io*/
+ /*Rd32=memb(Rs32+#s11:0)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_L2_loadrd_io*/
+ /*Rdd32=memd(Rs32+#s11:3)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_L2_loadrh_io*/
+ /*Rd32=memh(Rs32+#s11:1)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_L2_loadri_io*/
+ /*Rd32=memw(Rs32+#s11:2)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_L2_loadrub_io*/
+ /*Rd32=memub(Rs32+#s11:0)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_L2_loadruh_io*/
+ /*Rd32=memuh(Rs32+#s11:1)*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_S2_storerb_io*/
+ /*memb(Rs32+#s11:0)=Rt32*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_S2_storerd_io*/
+ /*memd(Rs32+#s11:3)=Rtt32*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_S2_storerh_io*/
+ /*memh(Rs32+#s11:1)=Rt32*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_S2_storeri_io*/
+ /*memw(Rs32+#s11:2)=Rt32*/
+ 0x00000000,
+ 0x00000000,
+ 0x00000000,
+ 0 },
+{ /*Tag:dup_S4_storeirb_io*/
+ /*memb(Rs32+#u6:0)=#S8*/
+ 0x00000000,
+ 0x00000000,
+ 0x00002404,
+ 0 },
+{ /*Tag:dup_S4_storeiri_io*/
+ /*memw(Rs32+#u6:2)=#S8*/
+ 0x00000000,
+ 0x00000000,
+ 0x00002404,
+ 0 }
+};
+// clang-format on
+
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPMASK_H
diff --git a/llvm/lib/Target/Hexagon/HexagonDepOperands.td b/llvm/lib/Target/Hexagon/HexagonDepOperands.td
index 8a94d96522cc..6ef668d30764 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepOperands.td
+++ b/llvm/lib/Target/Hexagon/HexagonDepOperands.td
@@ -5,7 +5,7 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
multiclass ImmOpPred<code pred, ValueType vt = i32> {
@@ -13,120 +13,120 @@ multiclass ImmOpPred<code pred, ValueType vt = i32> {
def _timm : PatLeaf<(vt timm), pred>;
}
-def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
-defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
-def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
-defm s29_3ImmPred : ImmOpPred<[{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
-def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
-def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
-def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-defm a30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
-def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
-def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
+defm s32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
def s8_0ImmOperand : AsmOperandClass { let Name = "s8_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s8_0Imm : Operand<i32> { let ParserMatchClass = s8_0ImmOperand; let DecoderMethod = "s8_0ImmDecoder"; }
defm s8_0ImmPred : ImmOpPred<[{ return isShiftedInt<8, 0>(N->getSExtValue());}]>;
+def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
+def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u16_0ImmPred : ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
+def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
+def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
+def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
+def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
def u32_0ImmOperand : AsmOperandClass { let Name = "u32_0Imm"; let RenderMethod = "addImmOperands"; }
def u32_0Imm : Operand<i32> { let ParserMatchClass = u32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
defm u32_0ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 0>(N->getSExtValue());}]>;
-def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
-def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
-def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
-def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
+def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
+def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
+def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
+def u6_0ImmOperand : AsmOperandClass { let Name = "u6_0Imm"; let RenderMethod = "addImmOperands"; }
+def u6_0Imm : Operand<i32> { let ParserMatchClass = u6_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u6_0ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 0>(N->getSExtValue());}]>;
+def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
+def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u10_0ImmPred : ImmOpPred<[{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
+def a30_2ImmOperand : AsmOperandClass { let Name = "a30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def a30_2Imm : Operand<i32> { let ParserMatchClass = a30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+defm a30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+defm b30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
def b15_2ImmOperand : AsmOperandClass { let Name = "b15_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def b15_2Imm : Operand<OtherVT> { let ParserMatchClass = b15_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
defm b15_2ImmPred : ImmOpPred<[{ return isShiftedInt<15, 2>(N->getSExtValue());}]>;
-def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
-def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
-def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
-defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
-def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
-def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
-def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
-def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
-def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
-def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u1_0ImmPred : ImmOpPred<[{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
+def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
+defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
+def s4_0ImmOperand : AsmOperandClass { let Name = "s4_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_0Imm : Operand<i32> { let ParserMatchClass = s4_0ImmOperand; let DecoderMethod = "s4_0ImmDecoder"; }
+defm s4_0ImmPred : ImmOpPred<[{ return isShiftedInt<4, 0>(N->getSExtValue());}]>;
def s31_1ImmOperand : AsmOperandClass { let Name = "s31_1Imm"; let RenderMethod = "addSignedImmOperands"; }
def s31_1Imm : Operand<i32> { let ParserMatchClass = s31_1ImmOperand; let DecoderMethod = "s31_1ImmDecoder"; }
defm s31_1ImmPred : ImmOpPred<[{ return isShiftedInt<32, 1>(N->getSExtValue());}]>;
-def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
-defm s3_0ImmPred : ImmOpPred<[{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
+defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
def s30_2ImmOperand : AsmOperandClass { let Name = "s30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
def s30_2Imm : Operand<i32> { let ParserMatchClass = s30_2ImmOperand; let DecoderMethod = "s30_2ImmDecoder"; }
defm s30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
+def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
+defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
+def s29_3ImmOperand : AsmOperandClass { let Name = "s29_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s29_3Imm : Operand<i32> { let ParserMatchClass = s29_3ImmOperand; let DecoderMethod = "s29_3ImmDecoder"; }
+defm s29_3ImmPred : ImmOpPred<[{ return isShiftedInt<32, 3>(N->getSExtValue());}]>;
+def s4_3ImmOperand : AsmOperandClass { let Name = "s4_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s4_3Imm : Operand<i32> { let ParserMatchClass = s4_3ImmOperand; let DecoderMethod = "s4_3ImmDecoder"; }
+defm s4_3ImmPred : ImmOpPred<[{ return isShiftedInt<4, 3>(N->getSExtValue());}]>;
+def u29_3ImmOperand : AsmOperandClass { let Name = "u29_3Imm"; let RenderMethod = "addImmOperands"; }
+def u29_3Imm : Operand<i32> { let ParserMatchClass = u29_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u29_3ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 3>(N->getSExtValue());}]>;
+def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
+def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
+def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
+def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
+def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
+def m32_0ImmOperand : AsmOperandClass { let Name = "m32_0Imm"; let RenderMethod = "addImmOperands"; }
+def m32_0Imm : Operand<i32> { let ParserMatchClass = m32_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm m32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
+def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
+def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
+def u3_0ImmOperand : AsmOperandClass { let Name = "u3_0Imm"; let RenderMethod = "addImmOperands"; }
+def u3_0Imm : Operand<i32> { let ParserMatchClass = u3_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u3_0ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 0>(N->getSExtValue());}]>;
+def u11_3ImmOperand : AsmOperandClass { let Name = "u11_3Imm"; let RenderMethod = "addImmOperands"; }
+def u11_3Imm : Operand<i32> { let ParserMatchClass = u11_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u11_3ImmPred : ImmOpPred<[{ return isShiftedUInt<11, 3>(N->getSExtValue());}]>;
def u4_0ImmOperand : AsmOperandClass { let Name = "u4_0Imm"; let RenderMethod = "addImmOperands"; }
def u4_0Imm : Operand<i32> { let ParserMatchClass = u4_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
defm u4_0ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 0>(N->getSExtValue());}]>;
def s6_0ImmOperand : AsmOperandClass { let Name = "s6_0Imm"; let RenderMethod = "addSignedImmOperands"; }
def s6_0Imm : Operand<i32> { let ParserMatchClass = s6_0ImmOperand; let DecoderMethod = "s6_0ImmDecoder"; }
defm s6_0ImmPred : ImmOpPred<[{ return isShiftedInt<6, 0>(N->getSExtValue());}]>;
-def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
-def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
-def s32_0ImmOperand : AsmOperandClass { let Name = "s32_0Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s32_0Imm : Operand<i32> { let ParserMatchClass = s32_0ImmOperand; let DecoderMethod = "s32_0ImmDecoder"; }
-defm s32_0ImmPred : ImmOpPred<[{ return isShiftedInt<32, 0>(N->getSExtValue());}]>;
-def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
-defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
-def u10_0ImmOperand : AsmOperandClass { let Name = "u10_0Imm"; let RenderMethod = "addImmOperands"; }
-def u10_0Imm : Operand<i32> { let ParserMatchClass = u10_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u10_0ImmPred : ImmOpPred<[{ return isShiftedUInt<10, 0>(N->getSExtValue());}]>;
-def u31_1ImmOperand : AsmOperandClass { let Name = "u31_1Imm"; let RenderMethod = "addImmOperands"; }
-def u31_1Imm : Operand<i32> { let ParserMatchClass = u31_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u31_1ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 1>(N->getSExtValue());}]>;
-def s4_1ImmOperand : AsmOperandClass { let Name = "s4_1Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_1Imm : Operand<i32> { let ParserMatchClass = s4_1ImmOperand; let DecoderMethod = "s4_1ImmDecoder"; }
-defm s4_1ImmPred : ImmOpPred<[{ return isShiftedInt<4, 1>(N->getSExtValue());}]>;
-def u16_0ImmOperand : AsmOperandClass { let Name = "u16_0Imm"; let RenderMethod = "addImmOperands"; }
-def u16_0Imm : Operand<i32> { let ParserMatchClass = u16_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u16_0ImmPred : ImmOpPred<[{ return isShiftedUInt<16, 0>(N->getSExtValue());}]>;
def u6_1ImmOperand : AsmOperandClass { let Name = "u6_1Imm"; let RenderMethod = "addImmOperands"; }
def u6_1Imm : Operand<i32> { let ParserMatchClass = u6_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
defm u6_1ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 1>(N->getSExtValue());}]>;
+def u4_2ImmOperand : AsmOperandClass { let Name = "u4_2Imm"; let RenderMethod = "addImmOperands"; }
+def u4_2Imm : Operand<i32> { let ParserMatchClass = u4_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u4_2ImmPred : ImmOpPred<[{ return isShiftedUInt<4, 2>(N->getSExtValue());}]>;
+def u5_3ImmOperand : AsmOperandClass { let Name = "u5_3Imm"; let RenderMethod = "addImmOperands"; }
+def u5_3Imm : Operand<i32> { let ParserMatchClass = u5_3ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u5_3ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 3>(N->getSExtValue());}]>;
+def u3_1ImmOperand : AsmOperandClass { let Name = "u3_1Imm"; let RenderMethod = "addImmOperands"; }
+def u3_1Imm : Operand<i32> { let ParserMatchClass = u3_1ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u3_1ImmPred : ImmOpPred<[{ return isShiftedUInt<3, 1>(N->getSExtValue());}]>;
def u5_2ImmOperand : AsmOperandClass { let Name = "u5_2Imm"; let RenderMethod = "addImmOperands"; }
def u5_2Imm : Operand<i32> { let ParserMatchClass = u5_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
defm u5_2ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 2>(N->getSExtValue());}]>;
-def u26_6ImmOperand : AsmOperandClass { let Name = "u26_6Imm"; let RenderMethod = "addImmOperands"; }
-def u26_6Imm : Operand<i32> { let ParserMatchClass = u26_6ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u26_6ImmPred : ImmOpPred<[{ return isShiftedUInt<26, 6>(N->getSExtValue());}]>;
-def u6_2ImmOperand : AsmOperandClass { let Name = "u6_2Imm"; let RenderMethod = "addImmOperands"; }
-def u6_2Imm : Operand<i32> { let ParserMatchClass = u6_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u6_2ImmPred : ImmOpPred<[{ return isShiftedUInt<6, 2>(N->getSExtValue());}]>;
-def u7_0ImmOperand : AsmOperandClass { let Name = "u7_0Imm"; let RenderMethod = "addImmOperands"; }
-def u7_0Imm : Operand<i32> { let ParserMatchClass = u7_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u7_0ImmPred : ImmOpPred<[{ return isShiftedUInt<7, 0>(N->getSExtValue());}]>;
-def b13_2ImmOperand : AsmOperandClass { let Name = "b13_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def b13_2Imm : Operand<OtherVT> { let ParserMatchClass = b13_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-defm b13_2ImmPred : ImmOpPred<[{ return isShiftedInt<13, 2>(N->getSExtValue());}]>;
-def u5_0ImmOperand : AsmOperandClass { let Name = "u5_0Imm"; let RenderMethod = "addImmOperands"; }
-def u5_0Imm : Operand<i32> { let ParserMatchClass = u5_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u5_0ImmPred : ImmOpPred<[{ return isShiftedUInt<5, 0>(N->getSExtValue());}]>;
-def u2_0ImmOperand : AsmOperandClass { let Name = "u2_0Imm"; let RenderMethod = "addImmOperands"; }
-def u2_0Imm : Operand<i32> { let ParserMatchClass = u2_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u2_0ImmPred : ImmOpPred<[{ return isShiftedUInt<2, 0>(N->getSExtValue());}]>;
-def s4_2ImmOperand : AsmOperandClass { let Name = "s4_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def s4_2Imm : Operand<i32> { let ParserMatchClass = s4_2ImmOperand; let DecoderMethod = "s4_2ImmDecoder"; }
-defm s4_2ImmPred : ImmOpPred<[{ return isShiftedInt<4, 2>(N->getSExtValue());}]>;
-def b30_2ImmOperand : AsmOperandClass { let Name = "b30_2Imm"; let RenderMethod = "addSignedImmOperands"; }
-def b30_2Imm : Operand<OtherVT> { let ParserMatchClass = b30_2ImmOperand; let DecoderMethod = "brtargetDecoder"; let PrintMethod = "printBrtarget"; }
-defm b30_2ImmPred : ImmOpPred<[{ return isShiftedInt<32, 2>(N->getSExtValue());}]>;
-def u8_0ImmOperand : AsmOperandClass { let Name = "u8_0Imm"; let RenderMethod = "addImmOperands"; }
-def u8_0Imm : Operand<i32> { let ParserMatchClass = u8_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u8_0ImmPred : ImmOpPred<[{ return isShiftedUInt<8, 0>(N->getSExtValue());}]>;
-def u30_2ImmOperand : AsmOperandClass { let Name = "u30_2Imm"; let RenderMethod = "addImmOperands"; }
-def u30_2Imm : Operand<i32> { let ParserMatchClass = u30_2ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
-defm u30_2ImmPred : ImmOpPred<[{ return isShiftedUInt<32, 2>(N->getSExtValue());}]>;
+def s6_3ImmOperand : AsmOperandClass { let Name = "s6_3Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s6_3Imm : Operand<i32> { let ParserMatchClass = s6_3ImmOperand; let DecoderMethod = "s6_3ImmDecoder"; }
+defm s6_3ImmPred : ImmOpPred<[{ return isShiftedInt<6, 3>(N->getSExtValue());}]>;
+def s3_0ImmOperand : AsmOperandClass { let Name = "s3_0Imm"; let RenderMethod = "addSignedImmOperands"; }
+def s3_0Imm : Operand<i32> { let ParserMatchClass = s3_0ImmOperand; let DecoderMethod = "s3_0ImmDecoder"; }
+defm s3_0ImmPred : ImmOpPred<[{ return isShiftedInt<3, 0>(N->getSExtValue());}]>;
+def u1_0ImmOperand : AsmOperandClass { let Name = "u1_0Imm"; let RenderMethod = "addImmOperands"; }
+def u1_0Imm : Operand<i32> { let ParserMatchClass = u1_0ImmOperand; let DecoderMethod = "unsignedImmDecoder"; }
+defm u1_0ImmPred : ImmOpPred<[{ return isShiftedUInt<1, 0>(N->getSExtValue());}]>;
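An illustrative sketch, not part of the patch above: these operand definitions pair each immediate operand with an ImmOpPred predicate built on isShiftedInt<N, S> / isShiftedUInt<N, S>, which accept values that fit in an (N+S)-bit signed/unsigned integer and have their low S bits clear. The standalone helpers below use made-up names and only mirror that semantics, so the accepted ranges are easy to see; for example, s4_3Imm takes multiples of 8 in [-64, 56].

#include <cstdint>

// Mirrors the semantics of llvm::isShiftedInt<N, S>: a signed (N+S)-bit
// value whose low S bits are zero, i.e. an N-bit value shifted left by S.
template <unsigned N, unsigned S>
constexpr bool isShiftedIntSketch(int64_t X) {
  return X >= -(INT64_C(1) << (N + S - 1)) &&
         X < (INT64_C(1) << (N + S - 1)) &&
         (X & ((INT64_C(1) << S) - 1)) == 0;
}

// Same idea for unsigned immediates (llvm::isShiftedUInt<N, S>).
template <unsigned N, unsigned S>
constexpr bool isShiftedUIntSketch(int64_t X) {
  return X >= 0 && X < (INT64_C(1) << (N + S)) &&
         (X & ((INT64_C(1) << S) - 1)) == 0;
}

int main() {
  // s4_3Imm: multiples of 8 in [-64, 56].
  static_assert(isShiftedIntSketch<4, 3>(-64), "lowest accepted value");
  static_assert(isShiftedIntSketch<4, 3>(56), "highest accepted value");
  static_assert(!isShiftedIntSketch<4, 3>(60), "not 8-byte aligned");
  // u6_2Imm: multiples of 4 in [0, 252].
  static_assert(isShiftedUIntSketch<6, 2>(252), "largest 4-byte-aligned value");
  static_assert(!isShiftedUIntSketch<6, 2>(256), "out of range");
  return 0;
}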
diff --git a/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h b/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
index b6be74f848bb..dba39232433d 100644
--- a/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
+++ b/llvm/lib/Target/Hexagon/HexagonDepTimingClasses.h
@@ -5,137 +5,147 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// Automatically generated file, please consult code owner before editing.
+// Automatically generated file, do not edit!
//===----------------------------------------------------------------------===//
-#ifndef TARGET_HEXAGON_HEXAGON_DEP_TIMING_CLASSES_H
-#define TARGET_HEXAGON_HEXAGON_DEP_TIMING_CLASSES_H
+#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPTIMINGCLASSES_H
+#define LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPTIMINGCLASSES_H
#include "HexagonInstrInfo.h"
namespace llvm {
-inline bool is_TC3x(unsigned SchedClass) {
+inline bool is_TC1(unsigned SchedClass) {
switch (SchedClass) {
- case Hexagon::Sched::tc_05d3a09b:
- case Hexagon::Sched::tc_0d8f5752:
- case Hexagon::Sched::tc_13bfbcf9:
- case Hexagon::Sched::tc_174516e8:
- case Hexagon::Sched::tc_1a2fd869:
- case Hexagon::Sched::tc_1c4528a2:
- case Hexagon::Sched::tc_32779c6f:
- case Hexagon::Sched::tc_5b54b33f:
- case Hexagon::Sched::tc_6b25e783:
- case Hexagon::Sched::tc_76851da1:
- case Hexagon::Sched::tc_9debc299:
- case Hexagon::Sched::tc_a9d88b22:
- case Hexagon::Sched::tc_bafaade3:
- case Hexagon::Sched::tc_bcf98408:
- case Hexagon::Sched::tc_bdceeac1:
- case Hexagon::Sched::tc_c8ce0b5c:
- case Hexagon::Sched::tc_d1aa9eaa:
- case Hexagon::Sched::tc_d773585a:
- case Hexagon::Sched::tc_df3319ed:
+ case Hexagon::Sched::tc_112d30d6:
+ case Hexagon::Sched::tc_151bf368:
+ case Hexagon::Sched::tc_1c2c7a4a:
+ case Hexagon::Sched::tc_1d41f8b7:
+ case Hexagon::Sched::tc_23708a21:
+ case Hexagon::Sched::tc_24f426ab:
+ case Hexagon::Sched::tc_2f573607:
+ case Hexagon::Sched::tc_388f9897:
+ case Hexagon::Sched::tc_3d14a17b:
+ case Hexagon::Sched::tc_3fbf1042:
+ case Hexagon::Sched::tc_407e96f9:
+ case Hexagon::Sched::tc_42ff66ba:
+ case Hexagon::Sched::tc_4a55d03c:
+ case Hexagon::Sched::tc_5502c366:
+ case Hexagon::Sched::tc_55b33fda:
+ case Hexagon::Sched::tc_56a124a7:
+ case Hexagon::Sched::tc_57a55b54:
+ case Hexagon::Sched::tc_59a7822c:
+ case Hexagon::Sched::tc_5b347363:
+ case Hexagon::Sched::tc_5da50c4b:
+ case Hexagon::Sched::tc_60e324ff:
+ case Hexagon::Sched::tc_651cbe02:
+ case Hexagon::Sched::tc_6fc5dbea:
+ case Hexagon::Sched::tc_711c805f:
+ case Hexagon::Sched::tc_713b66bf:
+ case Hexagon::Sched::tc_9124c04f:
+ case Hexagon::Sched::tc_9c52f549:
+ case Hexagon::Sched::tc_9e27f2f9:
+ case Hexagon::Sched::tc_9f6cd987:
+ case Hexagon::Sched::tc_a1297125:
+ case Hexagon::Sched::tc_a7a13fac:
+ case Hexagon::Sched::tc_b837298f:
+ case Hexagon::Sched::tc_c57d9f39:
+ case Hexagon::Sched::tc_d33e5eee:
+ case Hexagon::Sched::tc_decdde8a:
+ case Hexagon::Sched::tc_ed03645c:
+ case Hexagon::Sched::tc_eeda4109:
+ case Hexagon::Sched::tc_ef921005:
+ case Hexagon::Sched::tc_f999c66e:
return true;
default:
return false;
}
}
-inline bool is_TC2early(unsigned SchedClass) {
+inline bool is_TC2(unsigned SchedClass) {
switch (SchedClass) {
- case Hexagon::Sched::tc_b4407292:
- case Hexagon::Sched::tc_fc3999b4:
+ case Hexagon::Sched::tc_01d44cb2:
+ case Hexagon::Sched::tc_0dfac0a7:
+ case Hexagon::Sched::tc_1fcb8495:
+ case Hexagon::Sched::tc_20131976:
+ case Hexagon::Sched::tc_2c13e7f5:
+ case Hexagon::Sched::tc_3edca78f:
+ case Hexagon::Sched::tc_5e4cf0e8:
+ case Hexagon::Sched::tc_65279839:
+ case Hexagon::Sched::tc_7401744f:
+ case Hexagon::Sched::tc_84a7500d:
+ case Hexagon::Sched::tc_8a825db2:
+ case Hexagon::Sched::tc_8b5bd4f5:
+ case Hexagon::Sched::tc_95a33176:
+ case Hexagon::Sched::tc_9b3c0462:
+ case Hexagon::Sched::tc_a08b630b:
+ case Hexagon::Sched::tc_a4e22bbd:
+ case Hexagon::Sched::tc_a7bdb22c:
+ case Hexagon::Sched::tc_bb831a7c:
+ case Hexagon::Sched::tc_c20701f0:
+ case Hexagon::Sched::tc_d3632d88:
+ case Hexagon::Sched::tc_d61dfdc3:
+ case Hexagon::Sched::tc_e3d699e3:
+ case Hexagon::Sched::tc_f098b237:
+ case Hexagon::Sched::tc_f34c1c21:
return true;
default:
return false;
}
}
-inline bool is_TC4x(unsigned SchedClass) {
+inline bool is_TC3x(unsigned SchedClass) {
switch (SchedClass) {
- case Hexagon::Sched::tc_2f7c551d:
- case Hexagon::Sched::tc_2ff964b4:
- case Hexagon::Sched::tc_3a867367:
- case Hexagon::Sched::tc_3b470976:
- case Hexagon::Sched::tc_4560740b:
- case Hexagon::Sched::tc_a58fd5cc:
- case Hexagon::Sched::tc_b8bffe55:
+ case Hexagon::Sched::tc_01e1be3b:
+ case Hexagon::Sched::tc_1248597c:
+ case Hexagon::Sched::tc_197dce51:
+ case Hexagon::Sched::tc_28e55c6f:
+ case Hexagon::Sched::tc_2c3e17fc:
+ case Hexagon::Sched::tc_38382228:
+ case Hexagon::Sched::tc_38e0bae9:
+ case Hexagon::Sched::tc_4abdbdc6:
+ case Hexagon::Sched::tc_503ce0f3:
+ case Hexagon::Sched::tc_556f6577:
+ case Hexagon::Sched::tc_5a4b5e58:
+ case Hexagon::Sched::tc_6ae3426b:
+ case Hexagon::Sched::tc_6d861a95:
+ case Hexagon::Sched::tc_788b1d09:
+ case Hexagon::Sched::tc_7f8ae742:
+ case Hexagon::Sched::tc_9406230a:
+ case Hexagon::Sched::tc_a154b476:
+ case Hexagon::Sched::tc_a38c45dc:
+ case Hexagon::Sched::tc_c21d7447:
+ case Hexagon::Sched::tc_d7718fbe:
+ case Hexagon::Sched::tc_db596beb:
+ case Hexagon::Sched::tc_f0cdeccf:
+ case Hexagon::Sched::tc_fae9dfa5:
return true;
default:
return false;
}
}
-inline bool is_TC2(unsigned SchedClass) {
+inline bool is_TC2early(unsigned SchedClass) {
switch (SchedClass) {
- case Hexagon::Sched::tc_002cb246:
- case Hexagon::Sched::tc_14b5c689:
- case Hexagon::Sched::tc_1c80410a:
- case Hexagon::Sched::tc_4414d8b1:
- case Hexagon::Sched::tc_6132ba3d:
- case Hexagon::Sched::tc_61830035:
- case Hexagon::Sched::tc_679309b8:
- case Hexagon::Sched::tc_703e822c:
- case Hexagon::Sched::tc_779080bf:
- case Hexagon::Sched::tc_784490da:
- case Hexagon::Sched::tc_88b4f13d:
- case Hexagon::Sched::tc_9461ff31:
- case Hexagon::Sched::tc_9e313203:
- case Hexagon::Sched::tc_a813cf9a:
- case Hexagon::Sched::tc_bfec0f01:
- case Hexagon::Sched::tc_cf8126ae:
- case Hexagon::Sched::tc_d08ee0f4:
- case Hexagon::Sched::tc_e4a7f9f0:
- case Hexagon::Sched::tc_f429765c:
- case Hexagon::Sched::tc_f675fee8:
- case Hexagon::Sched::tc_f9058dd7:
+ case Hexagon::Sched::tc_45f9d1be:
+ case Hexagon::Sched::tc_a4ee89db:
return true;
default:
return false;
}
}
-inline bool is_TC1(unsigned SchedClass) {
+inline bool is_TC4x(unsigned SchedClass) {
switch (SchedClass) {
- case Hexagon::Sched::tc_0663f615:
- case Hexagon::Sched::tc_0a705168:
- case Hexagon::Sched::tc_0ae0825c:
- case Hexagon::Sched::tc_1b6f7cec:
- case Hexagon::Sched::tc_1fc97744:
- case Hexagon::Sched::tc_20cdee80:
- case Hexagon::Sched::tc_2332b92e:
- case Hexagon::Sched::tc_2eabeebe:
- case Hexagon::Sched::tc_3d495a39:
- case Hexagon::Sched::tc_4c5ba658:
- case Hexagon::Sched::tc_56336eb0:
- case Hexagon::Sched::tc_56f114f4:
- case Hexagon::Sched::tc_57890846:
- case Hexagon::Sched::tc_5a2711e5:
- case Hexagon::Sched::tc_5b7c0967:
- case Hexagon::Sched::tc_640086b5:
- case Hexagon::Sched::tc_643b4717:
- case Hexagon::Sched::tc_85c9c08f:
- case Hexagon::Sched::tc_85d5d03f:
- case Hexagon::Sched::tc_862b3e70:
- case Hexagon::Sched::tc_946df596:
- case Hexagon::Sched::tc_9c3ecd83:
- case Hexagon::Sched::tc_9fc3dae0:
- case Hexagon::Sched::tc_a1123dda:
- case Hexagon::Sched::tc_a1c00888:
- case Hexagon::Sched::tc_ae53734a:
- case Hexagon::Sched::tc_b31c2e97:
- case Hexagon::Sched::tc_b4b5c03a:
- case Hexagon::Sched::tc_b51dc29a:
- case Hexagon::Sched::tc_cd374165:
- case Hexagon::Sched::tc_cfd8378a:
- case Hexagon::Sched::tc_d5b7b0c1:
- case Hexagon::Sched::tc_d9d43ecb:
- case Hexagon::Sched::tc_db2bce9c:
- case Hexagon::Sched::tc_de4df740:
- case Hexagon::Sched::tc_de554571:
- case Hexagon::Sched::tc_e78647bd:
+ case Hexagon::Sched::tc_02fe1c65:
+ case Hexagon::Sched::tc_0a195f2c:
+ case Hexagon::Sched::tc_7f7f45f5:
+ case Hexagon::Sched::tc_9783714b:
+ case Hexagon::Sched::tc_9e72dc89:
+ case Hexagon::Sched::tc_9edb7c77:
+ case Hexagon::Sched::tc_f0e8e832:
+ case Hexagon::Sched::tc_f7569068:
return true;
default:
return false;
@@ -143,4 +153,4 @@ inline bool is_TC1(unsigned SchedClass) {
}
} // namespace llvm
-#endif
\ No newline at end of file
+#endif // LLVM_LIB_TARGET_HEXAGON_HEXAGONDEPTIMINGCLASSES_H
diff --git a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
index d0285a7aa377..a431af17e6d0 100644
--- a/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonEarlyIfConv.cpp
@@ -1017,18 +1017,20 @@ void HexagonEarlyIfConversion::mergeBlocks(MachineBasicBlock *PredB,
PredB->removeSuccessor(SuccB);
PredB->splice(PredB->end(), SuccB, SuccB->begin(), SuccB->end());
PredB->transferSuccessorsAndUpdatePHIs(SuccB);
+ MachineBasicBlock *OldLayoutSuccessor = SuccB->getNextNode();
removeBlock(SuccB);
if (!TermOk)
- PredB->updateTerminator();
+ PredB->updateTerminator(OldLayoutSuccessor);
}
void HexagonEarlyIfConversion::simplifyFlowGraph(const FlowPattern &FP) {
+ MachineBasicBlock *OldLayoutSuccessor = FP.SplitB->getNextNode();
if (FP.TrueB)
removeBlock(FP.TrueB);
if (FP.FalseB)
removeBlock(FP.FalseB);
- FP.SplitB->updateTerminator();
+ FP.SplitB->updateTerminator(OldLayoutSuccessor);
if (FP.SplitB->succ_size() != 1)
return;
diff --git a/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp b/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
index d21de8ccb5ab..97a4b351af66 100644
--- a/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFixupHwLoops.cpp
@@ -20,7 +20,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/PassSupport.h"
+#include "llvm/Pass.h"
using namespace llvm;
@@ -114,7 +114,7 @@ bool HexagonFixupHwLoops::fixupLoopInstrs(MachineFunction &MF) {
// First pass - compute the offset of each basic block.
for (const MachineBasicBlock &MBB : MF) {
- if (MBB.getAlignment() != Align::None()) {
+ if (MBB.getAlignment() != Align(1)) {
// Although we don't know the exact layout of the final code, we need
// to account for alignment padding somehow. This heuristic pads each
// aligned basic block according to the alignment value.
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
index aff8e57b0a94..010b7171ce17 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp
@@ -261,20 +261,20 @@ static unsigned getMax32BitSubRegister(unsigned Reg,
}
/// Returns the callee saved register with the largest id in the vector.
-static unsigned getMaxCalleeSavedReg(const std::vector<CalleeSavedInfo> &CSI,
+static unsigned getMaxCalleeSavedReg(ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo &TRI) {
- static_assert(Hexagon::R1 > 0,
- "Assume physical registers are encoded as positive integers");
- if (CSI.empty())
- return 0;
-
- unsigned Max = getMax32BitSubRegister(CSI[0].getReg(), TRI);
- for (unsigned I = 1, E = CSI.size(); I < E; ++I) {
- unsigned Reg = getMax32BitSubRegister(CSI[I].getReg(), TRI);
- if (Reg > Max)
- Max = Reg;
- }
- return Max;
+ static_assert(Hexagon::R1 > 0,
+ "Assume physical registers are encoded as positive integers");
+ if (CSI.empty())
+ return 0;
+
+ unsigned Max = getMax32BitSubRegister(CSI[0].getReg(), TRI);
+ for (unsigned I = 1, E = CSI.size(); I < E; ++I) {
+ unsigned Reg = getMax32BitSubRegister(CSI[I].getReg(), TRI);
+ if (Reg > Max)
+ Max = Reg;
+ }
+ return Max;
}
/// Checks if the basic block contains any instruction that needs a stack
@@ -395,6 +395,9 @@ void HexagonFrameLowering::findShrunkPrologEpilog(MachineFunction &MF,
MachineBasicBlock *&PrologB, MachineBasicBlock *&EpilogB) const {
static unsigned ShrinkCounter = 0;
+ if (MF.getSubtarget<HexagonSubtarget>().isEnvironmentMusl() &&
+ MF.getFunction().isVarArg())
+ return;
if (ShrinkLimit.getPosition()) {
if (ShrinkCounter >= ShrinkLimit)
return;
@@ -588,7 +591,7 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
auto &HII = *HST.getInstrInfo();
auto &HRI = *HST.getRegisterInfo();
- unsigned MaxAlign = std::max(MFI.getMaxAlignment(), getStackAlignment());
+ Align MaxAlign = std::max(MFI.getMaxAlign(), getStackAlign());
// Calculate the total stack frame size.
// Get the number of bytes to allocate from the FrameInfo.
@@ -600,7 +603,7 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
FrameSize = MaxCFA + alignTo(FrameSize, MaxAlign);
MFI.setStackSize(FrameSize);
- bool AlignStack = (MaxAlign > getStackAlignment());
+ bool AlignStack = (MaxAlign > getStackAlign());
// Get the number of bytes to allocate from the FrameInfo.
unsigned NumBytes = MFI.getStackSize();
@@ -622,12 +625,124 @@ void HexagonFrameLowering::insertPrologueInBlock(MachineBasicBlock &MBB,
DebugLoc dl = MBB.findDebugLoc(InsertPt);
+ if (MF.getFunction().isVarArg() &&
+ MF.getSubtarget<HexagonSubtarget>().isEnvironmentMusl()) {
+ // Calculate the size of register saved area.
+ int NumVarArgRegs = 6 - FirstVarArgSavedReg;
+ int RegisterSavedAreaSizePlusPadding = (NumVarArgRegs % 2 == 0)
+ ? NumVarArgRegs * 4
+ : NumVarArgRegs * 4 + 4;
+ if (RegisterSavedAreaSizePlusPadding > 0) {
+ // Decrement the stack pointer by size of register saved area plus
+ // padding if any.
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+ .addReg(SP)
+ .addImm(-RegisterSavedAreaSizePlusPadding)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ int NumBytes = 0;
+ // Copy all the named arguments below register saved area.
+ auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ for (int i = HMFI.getFirstNamedArgFrameIndex(),
+ e = HMFI.getLastNamedArgFrameIndex(); i >= e; --i) {
+ uint64_t ObjSize = MFI.getObjectSize(i);
+ Align ObjAlign = MFI.getObjectAlign(i);
+
+ // Determine the kind of load/store that should be used.
+ unsigned LDOpc, STOpc;
+ uint64_t OpcodeChecker = ObjAlign.value();
+
+ // Handle cases where alignment of an object is > its size.
+ if (ObjAlign > ObjSize) {
+ if (ObjSize <= 1)
+ OpcodeChecker = 1;
+ else if (ObjSize <= 2)
+ OpcodeChecker = 2;
+ else if (ObjSize <= 4)
+ OpcodeChecker = 4;
+ else if (ObjSize > 4)
+ OpcodeChecker = 8;
+ }
+
+ switch (OpcodeChecker) {
+ case 1:
+ LDOpc = Hexagon::L2_loadrb_io;
+ STOpc = Hexagon::S2_storerb_io;
+ break;
+ case 2:
+ LDOpc = Hexagon::L2_loadrh_io;
+ STOpc = Hexagon::S2_storerh_io;
+ break;
+ case 4:
+ LDOpc = Hexagon::L2_loadri_io;
+ STOpc = Hexagon::S2_storeri_io;
+ break;
+ case 8:
+ default:
+ LDOpc = Hexagon::L2_loadrd_io;
+ STOpc = Hexagon::S2_storerd_io;
+ break;
+ }
+
+ unsigned RegUsed = LDOpc == Hexagon::L2_loadrd_io ? Hexagon::D3
+ : Hexagon::R6;
+ int LoadStoreCount = ObjSize / OpcodeChecker;
+
+ if (ObjSize % OpcodeChecker)
+ ++LoadStoreCount;
+
+        // Get the start location of the load. NumBytes is basically the
+        // offset from the stack pointer of the previous function (the caller
+        // in this case), since this function has a variable argument list.
+ if (NumBytes != 0)
+ NumBytes = alignTo(NumBytes, ObjAlign);
+
+ int Count = 0;
+ while (Count < LoadStoreCount) {
+ // Load the value of the named argument on stack.
+ BuildMI(MBB, InsertPt, dl, HII.get(LDOpc), RegUsed)
+ .addReg(SP)
+ .addImm(RegisterSavedAreaSizePlusPadding +
+ ObjAlign.value() * Count + NumBytes)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Store it below the register saved area plus padding.
+ BuildMI(MBB, InsertPt, dl, HII.get(STOpc))
+ .addReg(SP)
+ .addImm(ObjAlign.value() * Count + NumBytes)
+ .addReg(RegUsed)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ Count++;
+ }
+ NumBytes += MFI.getObjectSize(i);
+ }
+
+      // Make NumBytes 8-byte aligned.
+ NumBytes = alignTo(NumBytes, 8);
+
+      // If the number of registers holding variable arguments is odd,
+      // leave 4 bytes of padding to reach the location where the first
+      // variable argument passed in a register was copied.
+ NumBytes = (NumVarArgRegs % 2 == 0) ? NumBytes : NumBytes + 4;
+
+ for (int j = FirstVarArgSavedReg, i = 0; j < 6; ++j, ++i) {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::S2_storeri_io))
+ .addReg(SP)
+ .addImm(NumBytes + 4 * i)
+ .addReg(Hexagon::R0 + j)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+ }
+ }
+
if (hasFP(MF)) {
insertAllocframe(MBB, InsertPt, NumBytes);
if (AlignStack) {
BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_andir), SP)
.addReg(SP)
- .addImm(-int64_t(MaxAlign));
+ .addImm(-int64_t(MaxAlign.value()));
}
// If the stack-checking is enabled, and we spilled the callee-saved
// registers inline (i.e. did not use a spill function), then call
@@ -655,7 +770,16 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
if (!hasFP(MF)) {
MachineFrameInfo &MFI = MF.getFrameInfo();
- if (unsigned NumBytes = MFI.getStackSize()) {
+ unsigned NumBytes = MFI.getStackSize();
+ if (MF.getFunction().isVarArg() &&
+ MF.getSubtarget<HexagonSubtarget>().isEnvironmentMusl()) {
+ // On Hexagon Linux, deallocate the stack for the register saved area.
+ int NumVarArgRegs = 6 - FirstVarArgSavedReg;
+ int RegisterSavedAreaSizePlusPadding = (NumVarArgRegs % 2 == 0) ?
+ (NumVarArgRegs * 4) : (NumVarArgRegs * 4 + 4);
+ NumBytes += RegisterSavedAreaSizePlusPadding;
+ }
+ if (NumBytes) {
BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
.addReg(SP)
.addImm(NumBytes);
@@ -710,24 +834,49 @@ void HexagonFrameLowering::insertEpilogueInBlock(MachineBasicBlock &MBB) const {
NeedsDeallocframe = false;
}
- if (!NeedsDeallocframe)
- return;
- // If the returning instruction is PS_jmpret, replace it with dealloc_return,
- // otherwise just add deallocframe. The function could be returning via a
- // tail call.
- if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) {
- BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe))
+ if (!MF.getSubtarget<HexagonSubtarget>().isEnvironmentMusl() ||
+ !MF.getFunction().isVarArg()) {
+ if (!NeedsDeallocframe)
+ return;
+ // If the returning instruction is PS_jmpret, replace it with
+ // dealloc_return, otherwise just add deallocframe. The function
+ // could be returning via a tail call.
+ if (RetOpc != Hexagon::PS_jmpret || DisableDeallocRet) {
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe))
.addDef(Hexagon::D15)
.addReg(Hexagon::R30);
- return;
- }
- unsigned NewOpc = Hexagon::L4_return;
- MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc))
+ return;
+ }
+ unsigned NewOpc = Hexagon::L4_return;
+ MachineInstr *NewI = BuildMI(MBB, RetI, dl, HII.get(NewOpc))
.addDef(Hexagon::D15)
.addReg(Hexagon::R30);
- // Transfer the function live-out registers.
- NewI->copyImplicitOps(MF, *RetI);
- MBB.erase(RetI);
+ // Transfer the function live-out registers.
+ NewI->copyImplicitOps(MF, *RetI);
+ MBB.erase(RetI);
+ } else {
+      // Varargs function under the musl ABI: emit L2_deallocframe (unless a
+      // restore pseudo already does) and free the register saved area.
+ // Calculate the size of register saved area.
+ int NumVarArgRegs = 6 - FirstVarArgSavedReg;
+ int RegisterSavedAreaSizePlusPadding = (NumVarArgRegs % 2 == 0) ?
+ (NumVarArgRegs * 4) : (NumVarArgRegs * 4 + 4);
+
+ MachineBasicBlock::iterator Term = MBB.getFirstTerminator();
+ MachineBasicBlock::iterator I = (Term == MBB.begin()) ? MBB.end()
+ : std::prev(Term);
+ if (I == MBB.end() ||
+ (I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT &&
+ I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_EXT_PIC &&
+ I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4 &&
+ I->getOpcode() != Hexagon::RESTORE_DEALLOC_BEFORE_TAILCALL_V4_PIC))
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::L2_deallocframe))
+ .addDef(Hexagon::D15)
+ .addReg(Hexagon::R30);
+ if (RegisterSavedAreaSizePlusPadding != 0)
+ BuildMI(MBB, InsertPt, dl, HII.get(Hexagon::A2_addi), SP)
+ .addReg(SP)
+ .addImm(RegisterSavedAreaSizePlusPadding);
+ }
}
void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB,
@@ -744,7 +893,7 @@ void HexagonFrameLowering::insertAllocframe(MachineBasicBlock &MBB,
// Create a dummy memory operand to avoid allocframe from being treated as
// a volatile memory reference.
auto *MMO = MF.getMachineMemOperand(MachinePointerInfo::getStack(MF, 0),
- MachineMemOperand::MOStore, 4, 4);
+ MachineMemOperand::MOStore, 4, Align(4));
DebugLoc dl = MBB.findDebugLoc(InsertPt);
unsigned SP = HRI.getStackRegister();
@@ -907,9 +1056,9 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
// | +-- Old SP (before allocframe)
// +-- New FP (after allocframe)
//
- // MCCFIInstruction::createDefCfa subtracts the offset from the register.
+ // MCCFIInstruction::cfiDefCfa adds the offset from the register.
// MCCFIInstruction::createOffset takes the offset without sign change.
- auto DefCfa = MCCFIInstruction::createDefCfa(FrameLabel, DwFPReg, -8);
+ auto DefCfa = MCCFIInstruction::cfiDefCfa(FrameLabel, DwFPReg, 8);
BuildMI(MBB, At, DL, CFID)
.addCFIIndex(MF.addFrameInst(DefCfa));
// R31 (return addr) = CFA - 4
@@ -954,7 +1103,7 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB,
// Instead, get the offset (relative to the FP) directly.
Offset = MFI.getObjectOffset(F->getFrameIdx());
} else {
- unsigned FrameReg;
+ Register FrameReg;
Offset = getFrameIndexReference(MF, F->getFrameIdx(), FrameReg);
}
// Subtract 8 to make room for R30 and R31, which are added above.
@@ -1108,7 +1257,8 @@ static const char *getSpillFunctionFor(unsigned MaxReg, SpillKind SpillType,
}
int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
- int FI, unsigned &FrameReg) const {
+ int FI,
+ Register &FrameReg) const {
auto &MFI = MF.getFrameInfo();
auto &HRI = *MF.getSubtarget<HexagonSubtarget>().getRegisterInfo();
@@ -1119,9 +1269,9 @@ int HexagonFrameLowering::getFrameIndexReference(const MachineFunction &MF,
auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
unsigned FrameSize = MFI.getStackSize();
- unsigned SP = HRI.getStackRegister();
- unsigned FP = HRI.getFrameRegister();
- unsigned AP = HMFI.getStackAlignBasePhysReg();
+ Register SP = HRI.getStackRegister();
+ Register FP = HRI.getFrameRegister();
+ Register AP = HMFI.getStackAlignBasePhysReg();
  // It may happen that AP will be absent even when HasAlloca && HasExtraAlign
// is true. HasExtraAlign may be set because of vector spills, without
// aligned locals or aligned outgoing function arguments. Since vector
@@ -1358,7 +1508,7 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
// via AP, which may not be available at the particular place in the program.
MachineFrameInfo &MFI = MF.getFrameInfo();
bool HasAlloca = MFI.hasVarSizedObjects();
- bool NeedsAlign = (MFI.getMaxAlignment() > getStackAlignment());
+ bool NeedsAlign = (MFI.getMaxAlign() > getStackAlign());
if (!HasAlloca || !NeedsAlign)
return;
@@ -1371,8 +1521,8 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
unsigned S = MFI.getObjectSize(i);
// Reduce the alignment to at most 8. This will require unaligned vector
// stores if they happen here.
- unsigned A = std::max(MFI.getObjectAlignment(i), 8U);
- MFI.setObjectAlignment(i, 8);
+ Align A = std::max(MFI.getObjectAlign(i), Align(8));
+ MFI.setObjectAlignment(i, Align(8));
LFS = alignTo(LFS+S, A);
MFI.mapLocalFrameObject(i, -static_cast<int64_t>(LFS));
DealignSlots.insert(i);
@@ -1398,12 +1548,11 @@ void HexagonFrameLowering::processFunctionBeforeFrameFinalized(
if (auto *FS = dyn_cast_or_null<FixedStackPseudoSourceValue>(PV)) {
int FI = FS->getFrameIndex();
if (DealignSlots.count(FI)) {
- unsigned A = MFI.getObjectAlignment(FI);
- auto *NewMMO = MF.getMachineMemOperand(MMO->getPointerInfo(),
- MMO->getFlags(), MMO->getSize(), A,
- MMO->getAAInfo(), MMO->getRanges(),
- MMO->getSyncScopeID(), MMO->getOrdering(),
- MMO->getFailureOrdering());
+ auto *NewMMO = MF.getMachineMemOperand(
+ MMO->getPointerInfo(), MMO->getFlags(), MMO->getSize(),
+ MFI.getObjectAlign(FI), MMO->getAAInfo(), MMO->getRanges(),
+ MMO->getSyncScopeID(), MMO->getOrdering(),
+ MMO->getFailureOrdering());
new_memops.push_back(NewMMO);
KeepOld = false;
continue;
@@ -1562,9 +1711,8 @@ bool HexagonFrameLowering::assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(R);
unsigned Size = TRI->getSpillSize(*RC);
int Off = MinOffset - Size;
- unsigned Align = std::min(TRI->getSpillAlignment(*RC), getStackAlignment());
- assert(isPowerOf2_32(Align));
- Off &= -Align;
+ Align Alignment = std::min(TRI->getSpillAlign(*RC), getStackAlign());
+ Off &= -Alignment.value();
int FI = MFI.CreateFixedSpillStackObject(Size, Off);
MinOffset = std::min(MinOffset, Off);
CSI.push_back(CalleeSavedInfo(R, FI));
@@ -1787,11 +1935,11 @@ bool HexagonFrameLowering::expandStoreVec2(MachineBasicBlock &B,
bool NeedsAligna = needsAligna(MF);
unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass);
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
- unsigned HasAlign = MFI.getObjectAlignment(FI);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
+ Align HasAlign = MFI.getObjectAlign(FI);
unsigned StoreOpc;
- auto UseAligned = [&] (unsigned NeedAlign, unsigned HasAlign) {
+ auto UseAligned = [&](Align NeedAlign, Align HasAlign) {
return !NeedsAligna && (NeedAlign <= HasAlign);
};
@@ -1839,11 +1987,11 @@ bool HexagonFrameLowering::expandLoadVec2(MachineBasicBlock &B,
bool NeedsAligna = needsAligna(MF);
unsigned Size = HRI.getSpillSize(Hexagon::HvxVRRegClass);
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
- unsigned HasAlign = MFI.getObjectAlignment(FI);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
+ Align HasAlign = MFI.getObjectAlign(FI);
unsigned LoadOpc;
- auto UseAligned = [&] (unsigned NeedAlign, unsigned HasAlign) {
+ auto UseAligned = [&](Align NeedAlign, Align HasAlign) {
return !NeedsAligna && (NeedAlign <= HasAlign);
};
@@ -1883,8 +2031,8 @@ bool HexagonFrameLowering::expandStoreVec(MachineBasicBlock &B,
bool IsKill = MI->getOperand(2).isKill();
int FI = MI->getOperand(0).getIndex();
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
- unsigned HasAlign = MFI.getObjectAlignment(FI);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
+ Align HasAlign = MFI.getObjectAlign(FI);
bool UseAligned = !NeedsAligna && (NeedAlign <= HasAlign);
unsigned StoreOpc = UseAligned ? Hexagon::V6_vS32b_ai
: Hexagon::V6_vS32Ub_ai;
@@ -1913,8 +2061,8 @@ bool HexagonFrameLowering::expandLoadVec(MachineBasicBlock &B,
Register DstR = MI->getOperand(0).getReg();
int FI = MI->getOperand(1).getIndex();
- unsigned NeedAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
- unsigned HasAlign = MFI.getObjectAlignment(FI);
+ Align NeedAlign = HRI.getSpillAlign(Hexagon::HvxVRRegClass);
+ Align HasAlign = MFI.getObjectAlign(FI);
bool UseAligned = !NeedsAligna && (NeedAlign <= HasAlign);
unsigned LoadOpc = UseAligned ? Hexagon::V6_vL32b_ai
: Hexagon::V6_vL32Ub_ai;
@@ -2016,7 +2164,8 @@ void HexagonFrameLowering::determineCalleeSaves(MachineFunction &MF,
Num = 2; // Vector predicate spills also need a vector register.
break;
}
- unsigned S = HRI.getSpillSize(*RC), A = HRI.getSpillAlignment(*RC);
+ unsigned S = HRI.getSpillSize(*RC);
+ Align A = HRI.getSpillAlign(*RC);
for (unsigned i = 0; i < Num; i++) {
int NewFI = MFI.CreateSpillStackObject(S, A);
RS->addScavengingFrameIndex(NewFI);
@@ -2473,6 +2622,8 @@ void HexagonFrameLowering::addCalleeSaveRegistersAsImpOperand(MachineInstr *MI,
/// checks are performed, which may still lead to the inline code.
bool HexagonFrameLowering::shouldInlineCSR(const MachineFunction &MF,
const CSIVect &CSI) const {
+ if (MF.getSubtarget<HexagonSubtarget>().isEnvironmentMusl())
+ return true;
if (MF.getInfo<HexagonMachineFunctionInfo>()->hasEHReturn())
return true;
if (!hasFP(MF))
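A sketch, not part of the patch: the prologue and epilogue changes above repeat one small computation, the size of the varargs register saved area for the musl ABI. R0-R5 can carry arguments, so the registers to spill are FirstVarArgSavedReg..R5, and an odd count is padded with one 4-byte slot to keep the area a multiple of 8 bytes. A minimal model of that arithmetic, with a made-up helper name:

#include <cassert>

// Size in bytes of the varargs register saved area, assuming R0-R5 are the
// argument registers and FirstVarArgSavedReg is the first one not consumed
// by a named argument (as in the code above).
static int registerSavedAreaSize(int FirstVarArgSavedReg) {
  int NumVarArgRegs = 6 - FirstVarArgSavedReg;
  // Pad an odd register count by one 4-byte slot so the area stays a
  // multiple of 8 bytes.
  return (NumVarArgRegs % 2 == 0) ? NumVarArgRegs * 4 : NumVarArgRegs * 4 + 4;
}

int main() {
  assert(registerSavedAreaSize(0) == 24); // all of R0-R5 spilled
  assert(registerSavedAreaSize(3) == 16); // R3-R5: 12 bytes, padded to 16
  assert(registerSavedAreaSize(6) == 0);  // no varargs registers to spill
  return 0;
}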
diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
index 27265dd53794..87d385e1ce3c 100644
--- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h
@@ -29,8 +29,10 @@ class TargetRegisterClass;
class HexagonFrameLowering : public TargetFrameLowering {
public:
+ // First register which could possibly hold a variable argument.
+ int FirstVarArgSavedReg;
explicit HexagonFrameLowering()
- : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align::None(), true) {}
+ : TargetFrameLowering(StackGrowsDown, Align(8), 0, Align(1), true) {}
// All of the prolog/epilog functionality, including saving and restoring
// callee-saved registers is handled in emitPrologue. This is to have the
@@ -43,14 +45,17 @@ public:
bool enableCalleeSaveSkip(const MachineFunction &MF) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override {
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override {
return true;
}
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override {
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override {
return true;
}
@@ -78,7 +83,7 @@ public:
}
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
bool hasFP(const MachineFunction &MF) const override;
const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries)
diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
index 342ca21525c5..d9307190ae16 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -221,15 +221,16 @@ bool HexagonGenExtract::convert(Instruction *In) {
}
bool HexagonGenExtract::visitBlock(BasicBlock *B) {
+ bool Changed = false;
+
// Depth-first, bottom-up traversal.
for (auto *DTN : children<DomTreeNode*>(DT->getNode(B)))
- visitBlock(DTN->getBlock());
+ Changed |= visitBlock(DTN->getBlock());
// Allow limiting the number of generated extracts for debugging purposes.
bool HasCutoff = ExtractCutoff.getPosition();
unsigned Cutoff = ExtractCutoff;
- bool Changed = false;
BasicBlock::iterator I = std::prev(B->end()), NextI, Begin = B->begin();
while (true) {
if (HasCutoff && (ExtractCount >= Cutoff))
diff --git a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
index 1cf1500bc832..4833935f8d24 100644
--- a/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonHardwareLoops.cpp
@@ -467,7 +467,7 @@ bool HexagonHardwareLoops::findInductionRegister(MachineLoop *L,
if (!PredI->isCompare())
return false;
- unsigned CmpReg1 = 0, CmpReg2 = 0;
+ Register CmpReg1, CmpReg2;
int CmpImm = 0, CmpMask = 0;
bool CmpAnalyzed =
TII->analyzeCompare(*PredI, CmpReg1, CmpReg2, CmpMask, CmpImm);
@@ -640,7 +640,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
if (!TB || (FB && TB != Header && FB != Header))
return nullptr;
- // Branches of form "if (!P) ..." cause HexagonInstrInfo::AnalyzeBranch
+ // Branches of form "if (!P) ..." cause HexagonInstrInfo::analyzeBranch
// to put imm(0), followed by P in the vector Cond.
// If TB is not the header, it means that the "not-taken" path must lead
// to the header.
@@ -651,7 +651,7 @@ CountValue *HexagonHardwareLoops::getLoopTripCount(MachineLoop *L,
MachineInstr *CondI = MRI->getVRegDef(PredReg);
unsigned CondOpc = CondI->getOpcode();
- unsigned CmpReg1 = 0, CmpReg2 = 0;
+ Register CmpReg1, CmpReg2;
int Mask = 0, ImmValue = 0;
bool AnalyzedCmp =
TII->analyzeCompare(*CondI, CmpReg1, CmpReg2, Mask, ImmValue);
@@ -1455,7 +1455,7 @@ bool HexagonHardwareLoops::loopCountMayWrapOrUnderFlow(
for (MachineRegisterInfo::use_instr_nodbg_iterator I = MRI->use_instr_nodbg_begin(Reg),
E = MRI->use_instr_nodbg_end(); I != E; ++I) {
MachineInstr *MI = &*I;
- unsigned CmpReg1 = 0, CmpReg2 = 0;
+ Register CmpReg1, CmpReg2;
int CmpMask = 0, CmpValue = 0;
if (!TII->analyzeCompare(*MI, CmpReg1, CmpReg2, CmpMask, CmpValue))
@@ -1657,7 +1657,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
MachineBasicBlock *TB = nullptr, *FB = nullptr;
SmallVector<MachineOperand,2> Cond;
- // AnalyzeBranch returns true if it fails to analyze branch.
+ // analyzeBranch returns true if it fails to analyze branch.
bool NotAnalyzed = TII->analyzeBranch(*ExitingBlock, TB, FB, Cond, false);
if (NotAnalyzed || Cond.empty())
return false;
@@ -1693,7 +1693,7 @@ bool HexagonHardwareLoops::fixupInductionVariable(MachineLoop *L) {
// Expecting a predicate register as a condition. It won't be a hardware
// predicate register at this point yet, just a vreg.
- // HexagonInstrInfo::AnalyzeBranch for negated branches inserts imm(0)
+ // HexagonInstrInfo::analyzeBranch for negated branches inserts imm(0)
// into Cond, followed by the predicate register. For non-negated branches
// it's just the register.
unsigned CSz = Cond.size();
diff --git a/llvm/lib/Target/Hexagon/HexagonIICScalar.td b/llvm/lib/Target/Hexagon/HexagonIICScalar.td
index d37cc3a2cc3e..e9239ab5ad22 100644
--- a/llvm/lib/Target/Hexagon/HexagonIICScalar.td
+++ b/llvm/lib/Target/Hexagon/HexagonIICScalar.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
// These itinerary class descriptions are based on the instruction timing
-// classes as per V62. Curretnly, they are just extracted from
+// classes as per V62. Currently, they are just extracted from
// HexagonScheduleV62.td but will soon be auto-generated by HexagonGen.py.
class PseudoItin {
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
index 9cf5b257a00a..b4b389a7b956 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.cpp
@@ -734,8 +734,8 @@ void HexagonDAGToDAGISel::SelectFrameIndex(SDNode *N) {
MachineFrameInfo &MFI = MF->getFrameInfo();
const HexagonFrameLowering *HFI = HST->getFrameLowering();
int FX = cast<FrameIndexSDNode>(N)->getIndex();
- unsigned StkA = HFI->getStackAlignment();
- unsigned MaxA = MFI.getMaxAlignment();
+ Align StkA = HFI->getStackAlign();
+ Align MaxA = MFI.getMaxAlign();
SDValue FI = CurDAG->getTargetFrameIndex(FX, MVT::i32);
SDLoc DL(N);
SDValue Zero = CurDAG->getTargetConstant(0, DL, MVT::i32);
@@ -787,10 +787,18 @@ void HexagonDAGToDAGISel::SelectVAlign(SDNode *N) {
MVT::i64, Ops);
// Shift right by "(Addr & 0x3) * 8" bytes.
+ SDNode *C;
SDValue M0 = CurDAG->getTargetConstant(0x18, dl, MVT::i32);
SDValue M1 = CurDAG->getTargetConstant(0x03, dl, MVT::i32);
- SDNode *C = CurDAG->getMachineNode(Hexagon::S4_andi_asl_ri, dl, MVT::i32,
- M0, N->getOperand(2), M1);
+ if (HST->useCompound()) {
+ C = CurDAG->getMachineNode(Hexagon::S4_andi_asl_ri, dl, MVT::i32,
+ M0, N->getOperand(2), M1);
+ } else {
+ SDNode *T = CurDAG->getMachineNode(Hexagon::S2_asl_i_r, dl, MVT::i32,
+ N->getOperand(2), M1);
+ C = CurDAG->getMachineNode(Hexagon::A2_andir, dl, MVT::i32,
+ SDValue(T, 0), M0);
+ }
SDNode *S = CurDAG->getMachineNode(Hexagon::S2_lsr_r_p, dl, MVT::i64,
SDValue(R, 0), SDValue(C, 0));
SDValue E = CurDAG->getTargetExtractSubreg(Hexagon::isub_lo, dl, ResTy,
@@ -1179,7 +1187,7 @@ void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) {
Ops[i] = U->getOperand(i);
EVT BVT = Ops[I1N].getValueType();
- SDLoc dl(U);
+ const SDLoc &dl(U);
SDValue C0 = DAG.getConstant(0, dl, BVT);
SDValue C1 = DAG.getConstant(1, dl, BVT);
SDValue If0, If1;
@@ -1197,8 +1205,15 @@ void HexagonDAGToDAGISel::ppHoistZextI1(std::vector<SDNode*> &&Nodes) {
Ops[I1N] = C1;
If1 = DAG.getNode(UseOpc, dl, UVT, Ops);
}
- SDValue Sel = DAG.getNode(ISD::SELECT, dl, UVT, OpI1, If1, If0);
- DAG.ReplaceAllUsesWith(U, Sel.getNode());
+ // We're generating a SELECT way after legalization, so keep the types
+ // simple.
+ unsigned UW = UVT.getSizeInBits();
+ EVT SVT = (UW == 32 || UW == 64) ? MVT::getIntegerVT(UW) : UVT;
+ SDValue Sel = DAG.getNode(ISD::SELECT, dl, SVT, OpI1,
+ DAG.getBitcast(SVT, If1),
+ DAG.getBitcast(SVT, If0));
+ SDValue Ret = DAG.getBitcast(UVT, Sel);
+ DAG.ReplaceAllUsesWith(U, Ret.getNode());
}
}
}
@@ -1260,7 +1275,7 @@ void HexagonDAGToDAGISel::PreprocessISelDAG() {
}
}
-void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
+void HexagonDAGToDAGISel::emitFunctionEntryCode() {
auto &HST = MF->getSubtarget<HexagonSubtarget>();
auto &HFI = *HST.getFrameLowering();
if (!HFI.needsAligna(*MF))
@@ -1269,9 +1284,9 @@ void HexagonDAGToDAGISel::EmitFunctionEntryCode() {
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineBasicBlock *EntryBB = &MF->front();
unsigned AR = FuncInfo->CreateReg(MVT::i32);
- unsigned EntryMaxA = MFI.getMaxAlignment();
+ Align EntryMaxA = MFI.getMaxAlign();
BuildMI(EntryBB, DebugLoc(), HII->get(Hexagon::PS_aligna), AR)
- .addImm(EntryMaxA);
+ .addImm(EntryMaxA.value());
MF->getInfo<HexagonMachineFunctionInfo>()->setStackAlignBaseVReg(AR);
}
@@ -1281,7 +1296,7 @@ void HexagonDAGToDAGISel::updateAligna() {
return;
auto *AlignaI = const_cast<MachineInstr*>(HFI.getAlignaInstr(*MF));
assert(AlignaI != nullptr);
- unsigned MaxA = MF->getFrameInfo().getMaxAlignment();
+ unsigned MaxA = MF->getFrameInfo().getMaxAlign().value();
if (AlignaI->getOperand(1).getImm() < MaxA)
AlignaI->getOperand(1).setImm(MaxA);
}
@@ -1300,28 +1315,28 @@ bool HexagonDAGToDAGISel::SelectAddrFI(SDValue &N, SDValue &R) {
}
inline bool HexagonDAGToDAGISel::SelectAddrGA(SDValue &N, SDValue &R) {
- return SelectGlobalAddress(N, R, false, 0);
+ return SelectGlobalAddress(N, R, false, Align(1));
}
inline bool HexagonDAGToDAGISel::SelectAddrGP(SDValue &N, SDValue &R) {
- return SelectGlobalAddress(N, R, true, 0);
+ return SelectGlobalAddress(N, R, true, Align(1));
}
inline bool HexagonDAGToDAGISel::SelectAnyImm(SDValue &N, SDValue &R) {
- return SelectAnyImmediate(N, R, 0);
+ return SelectAnyImmediate(N, R, Align(1));
}
inline bool HexagonDAGToDAGISel::SelectAnyImm0(SDValue &N, SDValue &R) {
- return SelectAnyImmediate(N, R, 0);
+ return SelectAnyImmediate(N, R, Align(1));
}
inline bool HexagonDAGToDAGISel::SelectAnyImm1(SDValue &N, SDValue &R) {
- return SelectAnyImmediate(N, R, 1);
+ return SelectAnyImmediate(N, R, Align(2));
}
inline bool HexagonDAGToDAGISel::SelectAnyImm2(SDValue &N, SDValue &R) {
- return SelectAnyImmediate(N, R, 2);
+ return SelectAnyImmediate(N, R, Align(4));
}
inline bool HexagonDAGToDAGISel::SelectAnyImm3(SDValue &N, SDValue &R) {
- return SelectAnyImmediate(N, R, 3);
+ return SelectAnyImmediate(N, R, Align(8));
}
inline bool HexagonDAGToDAGISel::SelectAnyInt(SDValue &N, SDValue &R) {
@@ -1333,17 +1348,13 @@ inline bool HexagonDAGToDAGISel::SelectAnyInt(SDValue &N, SDValue &R) {
}
bool HexagonDAGToDAGISel::SelectAnyImmediate(SDValue &N, SDValue &R,
- uint32_t LogAlign) {
- auto IsAligned = [LogAlign] (uint64_t V) -> bool {
- return alignTo(V, (uint64_t)1 << LogAlign) == V;
- };
-
+ Align Alignment) {
switch (N.getOpcode()) {
case ISD::Constant: {
if (N.getValueType() != MVT::i32)
return false;
int32_t V = cast<const ConstantSDNode>(N)->getZExtValue();
- if (!IsAligned(V))
+ if (!isAligned(Alignment, V))
return false;
R = CurDAG->getTargetConstant(V, SDLoc(N), N.getValueType());
return true;
@@ -1351,37 +1362,34 @@ bool HexagonDAGToDAGISel::SelectAnyImmediate(SDValue &N, SDValue &R,
case HexagonISD::JT:
case HexagonISD::CP:
// These are assumed to always be aligned at least 8-byte boundary.
- if (LogAlign > 3)
+ if (Alignment > Align(8))
return false;
R = N.getOperand(0);
return true;
case ISD::ExternalSymbol:
// Symbols may be aligned at any boundary.
- if (LogAlign > 0)
+ if (Alignment > Align(1))
return false;
R = N;
return true;
case ISD::BlockAddress:
// Block address is always aligned at least 4-byte boundary.
- if (LogAlign > 2 || !IsAligned(cast<BlockAddressSDNode>(N)->getOffset()))
+ if (Alignment > Align(4) ||
+ !isAligned(Alignment, cast<BlockAddressSDNode>(N)->getOffset()))
return false;
R = N;
return true;
}
- if (SelectGlobalAddress(N, R, false, LogAlign) ||
- SelectGlobalAddress(N, R, true, LogAlign))
+ if (SelectGlobalAddress(N, R, false, Alignment) ||
+ SelectGlobalAddress(N, R, true, Alignment))
return true;
return false;
}
bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
- bool UseGP, uint32_t LogAlign) {
- auto IsAligned = [LogAlign] (uint64_t V) -> bool {
- return alignTo(V, (uint64_t)1 << LogAlign) == V;
- };
-
+ bool UseGP, Align Alignment) {
switch (N.getOpcode()) {
case ISD::ADD: {
SDValue N0 = N.getOperand(0);
@@ -1392,10 +1400,9 @@ bool HexagonDAGToDAGISel::SelectGlobalAddress(SDValue &N, SDValue &R,
if (!UseGP && GAOpc != HexagonISD::CONST32)
return false;
if (ConstantSDNode *Const = dyn_cast<ConstantSDNode>(N1)) {
- SDValue Addr = N0.getOperand(0);
- // For the purpose of alignment, sextvalue and zextvalue are the same.
- if (!IsAligned(Const->getZExtValue()))
+ if (!isAligned(Alignment, Const->getZExtValue()))
return false;
+ SDValue Addr = N0.getOperand(0);
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Addr)) {
if (GA->getOpcode() == ISD::TargetGlobalAddress) {
uint64_t NewOff = GA->getOffset() + (uint64_t)Const->getSExtValue();
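A sketch, not part of the patch: the SelectAnyImm*/SelectGlobalAddress changes above trade a log2-based LogAlign parameter for llvm::Align. SelectAnyImmN with LogAlign = N becomes SelectAnyImmediate(..., Align(1 << N)), and the old "alignTo(V, 1 << LogAlign) == V" test becomes isAligned(Alignment, V). The standalone check below (helper name made up, power-of-two alignments assumed, which Align guarantees) shows the two tests agree:

#include <cassert>
#include <cstdint>

// isAligned-style test: V is a multiple of a power-of-two alignment.
static bool isAlignedSketch(uint64_t AlignValue, uint64_t V) {
  return (V & (AlignValue - 1)) == 0;
}

int main() {
  for (uint32_t LogAlign = 0; LogAlign <= 3; ++LogAlign) {
    uint64_t AlignValue = uint64_t(1) << LogAlign; // Align(1), (2), (4), (8)
    for (uint64_t V = 0; V < 64; ++V) {
      // Old form used in SelectAnyImmediate: round up and compare.
      uint64_t RoundedUp = (V + AlignValue - 1) / AlignValue * AlignValue;
      bool Old = (RoundedUp == V);
      assert(Old == isAlignedSketch(AlignValue, V));
    }
  }
  return 0;
}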
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
index 6c77d8803359..1e50385a7b4b 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAG.h
@@ -25,7 +25,6 @@ namespace llvm {
class MachineFunction;
class HexagonInstrInfo;
class HexagonRegisterInfo;
-class HexagonTargetLowering;
class HexagonDAGToDAGISel : public SelectionDAGISel {
const HexagonSubtarget *HST;
@@ -51,7 +50,7 @@ public:
return true;
}
void PreprocessISelDAG() override;
- void EmitFunctionEntryCode() override;
+ void emitFunctionEntryCode() override;
void Select(SDNode *N) override;
@@ -60,9 +59,8 @@ public:
inline bool SelectAddrGP(SDValue &N, SDValue &R);
inline bool SelectAnyImm(SDValue &N, SDValue &R);
inline bool SelectAnyInt(SDValue &N, SDValue &R);
- bool SelectAnyImmediate(SDValue &N, SDValue &R, uint32_t LogAlign);
- bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP,
- uint32_t LogAlign);
+ bool SelectAnyImmediate(SDValue &N, SDValue &R, Align Alignment);
+ bool SelectGlobalAddress(SDValue &N, SDValue &R, bool UseGP, Align Alignment);
bool SelectAddrFI(SDValue &N, SDValue &R);
bool DetectUseSxtw(SDValue &N, SDValue &R);
diff --git a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
index 7e143a349400..c0f92042e5da 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelDAGToDAGHVX.cpp
@@ -1199,7 +1199,7 @@ OpRef HvxSelector::vmuxs(ArrayRef<uint8_t> Bytes, OpRef Va, OpRef Vb,
ResultStack &Results) {
DEBUG_WITH_TYPE("isel", {dbgs() << __func__ << '\n';});
MVT ByteTy = getSingleVT(MVT::i8);
- MVT BoolTy = MVT::getVectorVT(MVT::i1, 8*HwLen); // XXX
+ MVT BoolTy = MVT::getVectorVT(MVT::i1, HwLen);
const SDLoc &dl(Results.InpNode);
SDValue B = getVectorConstant(Bytes, dl);
Results.push(Hexagon::V6_vd0, ByteTy, {});
@@ -2201,30 +2201,30 @@ void HexagonDAGToDAGISel::SelectHVXDualOutput(SDNode *N) {
SDNode *Result;
switch (IID) {
case Intrinsic::hexagon_V6_vaddcarry: {
- SmallVector<SDValue, 3> Ops = { N->getOperand(1), N->getOperand(2),
- N->getOperand(3) };
- SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v512i1);
+ std::array<SDValue, 3> Ops = {
+ {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
+ SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v64i1);
Result = CurDAG->getMachineNode(Hexagon::V6_vaddcarry, SDLoc(N), VTs, Ops);
break;
}
case Intrinsic::hexagon_V6_vaddcarry_128B: {
- SmallVector<SDValue, 3> Ops = { N->getOperand(1), N->getOperand(2),
- N->getOperand(3) };
- SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v1024i1);
+ std::array<SDValue, 3> Ops = {
+ {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
+ SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v128i1);
Result = CurDAG->getMachineNode(Hexagon::V6_vaddcarry, SDLoc(N), VTs, Ops);
break;
}
case Intrinsic::hexagon_V6_vsubcarry: {
- SmallVector<SDValue, 3> Ops = { N->getOperand(1), N->getOperand(2),
- N->getOperand(3) };
- SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v512i1);
+ std::array<SDValue, 3> Ops = {
+ {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
+ SDVTList VTs = CurDAG->getVTList(MVT::v16i32, MVT::v64i1);
Result = CurDAG->getMachineNode(Hexagon::V6_vsubcarry, SDLoc(N), VTs, Ops);
break;
}
case Intrinsic::hexagon_V6_vsubcarry_128B: {
- SmallVector<SDValue, 3> Ops = { N->getOperand(1), N->getOperand(2),
- N->getOperand(3) };
- SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v1024i1);
+ std::array<SDValue, 3> Ops = {
+ {N->getOperand(1), N->getOperand(2), N->getOperand(3)}};
+ SDVTList VTs = CurDAG->getVTList(MVT::v32i32, MVT::v128i1);
Result = CurDAG->getMachineNode(Hexagon::V6_vsubcarry, SDLoc(N), VTs, Ops);
break;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index e11ecdc7d035..768fea639cf9 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -109,6 +109,11 @@ static cl::opt<bool> AlignLoads("hexagon-align-loads",
cl::Hidden, cl::init(false),
cl::desc("Rewrite unaligned loads as a pair of aligned loads"));
+static cl::opt<bool>
+ DisableArgsMinAlignment("hexagon-disable-args-min-alignment", cl::Hidden,
+ cl::init(false),
+ cl::desc("Disable minimum alignment of 1 for "
+ "arguments passed by value on stack"));
namespace {
@@ -167,10 +172,10 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
- return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
- /*isVolatile=*/false, /*AlwaysInline=*/false,
- /*isTailCall=*/false,
- MachinePointerInfo(), MachinePointerInfo());
+ return DAG.getMemcpy(
+ Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile=*/false, /*AlwaysInline=*/false,
+ /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo());
}
bool
@@ -387,19 +392,22 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
MachineFrameInfo &MFI = MF.getFrameInfo();
auto PtrVT = getPointerTy(MF.getDataLayout());
- unsigned NumParams = CLI.CS.getInstruction()
- ? CLI.CS.getFunctionType()->getNumParams()
- : 0;
+ unsigned NumParams = CLI.CB ? CLI.CB->getFunctionType()->getNumParams() : 0;
if (GlobalAddressSDNode *GAN = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(GAN->getGlobal(), dl, MVT::i32);
+ // Linux ABI treats var-arg calls the same way as regular ones.
+ bool TreatAsVarArg = !Subtarget.isEnvironmentMusl() && IsVarArg;
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs, *DAG.getContext(),
NumParams);
if (Subtarget.useHVXOps())
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_HVX);
+ else if (DisableArgsMinAlignment)
+ CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon_Legacy);
else
CCInfo.AnalyzeCallOperands(Outs, CC_Hexagon);
@@ -429,7 +437,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getCopyFromReg(Chain, dl, HRI.getStackRegister(), PtrVT);
bool NeedsArgAlign = false;
- unsigned LargestAlignSeen = 0;
+ Align LargestAlignSeen;
// Walk the register/memloc assignments, inserting copies/loads.
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -466,8 +474,8 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StackPtr.getValueType());
MemAddr = DAG.getNode(ISD::ADD, dl, MVT::i32, StackPtr, MemAddr);
if (ArgAlign)
- LargestAlignSeen = std::max(LargestAlignSeen,
- (unsigned)VA.getLocVT().getStoreSizeInBits() >> 3);
+ LargestAlignSeen = std::max(
+ LargestAlignSeen, Align(VA.getLocVT().getStoreSizeInBits() / 8));
if (Flags.isByVal()) {
// The argument is a struct passed by value. According to LLVM, "Arg"
// is a pointer.
@@ -490,7 +498,7 @@ HexagonTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (NeedsArgAlign && Subtarget.hasV60Ops()) {
LLVM_DEBUG(dbgs() << "Function needs byte stack align due to call args\n");
- unsigned VecAlign = HRI.getSpillAlignment(Hexagon::HvxVRRegClass);
+ Align VecAlign(HRI.getSpillAlignment(Hexagon::HvxVRRegClass));
LargestAlignSeen = std::max(LargestAlignSeen, VecAlign);
MFI.ensureMaxAlignment(LargestAlignSeen);
}
@@ -726,7 +734,7 @@ HexagonTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
auto &HFI = *Subtarget.getFrameLowering();
// "Zero" means natural stack alignment.
if (A == 0)
- A = HFI.getStackAlignment();
+ A = HFI.getStackAlign().value();
LLVM_DEBUG({
dbgs () << __func__ << " Align: " << A << " Size: ";
@@ -750,13 +758,19 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineRegisterInfo &MRI = MF.getRegInfo();
+ // Linux ABI treats var-arg calls the same way as regular ones.
+ bool TreatAsVarArg = !Subtarget.isEnvironmentMusl() && IsVarArg;
+
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- HexagonCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext(),
+ HexagonCCState CCInfo(CallConv, TreatAsVarArg, MF, ArgLocs,
+ *DAG.getContext(),
MF.getFunction().getFunctionType()->getNumParams());
if (Subtarget.useHVXOps())
CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_HVX);
+ else if (DisableArgsMinAlignment)
+ CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon_Legacy);
else
CCInfo.AnalyzeFormalArguments(Ins, CC_Hexagon);
@@ -766,8 +780,24 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
// caller's stack is passed only when the struct size is smaller than (or
// equal to) 8 bytes. If not, no address will be passed into the callee and
// the callee returns the result directly through R0/R1.
+ auto NextSingleReg = [] (const TargetRegisterClass &RC, unsigned Reg) {
+ switch (RC.getID()) {
+ case Hexagon::IntRegsRegClassID:
+ return Reg - Hexagon::R0 + 1;
+ case Hexagon::DoubleRegsRegClassID:
+ return (Reg - Hexagon::D0 + 1) * 2;
+ case Hexagon::HvxVRRegClassID:
+ return Reg - Hexagon::V0 + 1;
+ case Hexagon::HvxWRRegClassID:
+ return (Reg - Hexagon::W0 + 1) * 2;
+ }
+ llvm_unreachable("Unexpected register class");
+ };
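A quick, hedged illustration of the mapping (example values only; register names refer to the Hexagon register enums):
// NextSingleReg(IntRegs,    Hexagon::R2) == 3   -> next free single register is R3
// NextSingleReg(DoubleRegs, Hexagon::D1) == 4   -> D1 is the pair R3:R2, so R4 is next
// The value computed for the last register-assigned formal argument becomes
// HFL.FirstVarArgSavedReg below, i.e. the first of R0..R5 that still has to be
// saved for a musl vararg callee.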
+ auto &HFL = const_cast<HexagonFrameLowering&>(*Subtarget.getFrameLowering());
auto &HMFI = *MF.getInfo<HexagonMachineFunctionInfo>();
+ HFL.FirstVarArgSavedReg = 0;
+ HMFI.setFirstNamedArgFrameIndex(-int(MFI.getNumFixedObjects()));
for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
CCValAssign &VA = ArgLocs[i];
@@ -811,6 +841,7 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
}
InVals.push_back(Copy);
MRI.addLiveIn(VA.getLocReg(), VReg);
+ HFL.FirstVarArgSavedReg = NextSingleReg(*RC, VA.getLocReg());
} else {
assert(VA.isMemLoc() && "Argument should be passed in memory");
@@ -838,8 +869,48 @@ SDValue HexagonTargetLowering::LowerFormalArguments(
}
}
+ if (IsVarArg && Subtarget.isEnvironmentMusl()) {
+ for (int i = HFL.FirstVarArgSavedReg; i < 6; i++)
+ MRI.addLiveIn(Hexagon::R0+i);
+ }
+
+ if (IsVarArg && Subtarget.isEnvironmentMusl()) {
+ HMFI.setFirstNamedArgFrameIndex(HMFI.getFirstNamedArgFrameIndex() - 1);
+ HMFI.setLastNamedArgFrameIndex(-int(MFI.getNumFixedObjects()));
+
+ // Create Frame index for the start of register saved area.
+ int NumVarArgRegs = 6 - HFL.FirstVarArgSavedReg;
+ bool RequiresPadding = (NumVarArgRegs & 1);
+ int RegSaveAreaSizePlusPadding = RequiresPadding
+ ? (NumVarArgRegs + 1) * 4
+ : NumVarArgRegs * 4;
+
+ if (RegSaveAreaSizePlusPadding > 0) {
+ // The offset to saved register area should be 8 byte aligned.
+ int RegAreaStart = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset();
+ if (RegAreaStart % 8)
+ RegAreaStart = (RegAreaStart + 7) & -8;
- if (IsVarArg) {
+ int RegSaveAreaFrameIndex =
+ MFI.CreateFixedObject(RegSaveAreaSizePlusPadding, RegAreaStart, true);
+ HMFI.setRegSavedAreaStartFrameIndex(RegSaveAreaFrameIndex);
+
+ // This will point to the next argument passed via stack.
+ int Offset = RegAreaStart + RegSaveAreaSizePlusPadding;
+ int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true);
+ HMFI.setVarArgsFrameIndex(FI);
+ } else {
+ // This will point to the next argument passed via stack, when
+ // there is no saved register area.
+ int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset();
+ int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true);
+ HMFI.setRegSavedAreaStartFrameIndex(FI);
+ HMFI.setVarArgsFrameIndex(FI);
+ }
+ }
+
+ if (IsVarArg && !Subtarget.isEnvironmentMusl()) {
// This will point to the next argument passed via stack.
int Offset = HEXAGON_LRFP_SIZE + CCInfo.getNextStackOffset();
int FI = MFI.CreateFixedObject(Hexagon_PointerSize, Offset, true);
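To make the bookkeeping above concrete, a hedged worked example (it assumes HEXAGON_LRFP_SIZE is the 8-byte LR/FP pair and that no named arguments are passed on the stack; the numbers are illustrative only):
// For `int f(int a, ...)` under the musl ABI:
//   a is passed in R0                  -> FirstVarArgSavedReg == 1
//   NumVarArgRegs == 5 (R1..R5), odd   -> RequiresPadding, so
//                                         RegSaveAreaSizePlusPadding == 24
//   RegAreaStart == 8 (just above LR/FP), giving the fixed objects
//     [ 8, 32)  register save area: 4 bytes of padding, then R1..R5
//     [32, ..)  overflow area -- this is where VarArgsFrameIndex points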
@@ -857,8 +928,81 @@ HexagonTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
HexagonMachineFunctionInfo *QFI = MF.getInfo<HexagonMachineFunctionInfo>();
SDValue Addr = DAG.getFrameIndex(QFI->getVarArgsFrameIndex(), MVT::i32);
const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
- return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr, Op.getOperand(1),
- MachinePointerInfo(SV));
+
+ if (!Subtarget.isEnvironmentMusl()) {
+ return DAG.getStore(Op.getOperand(0), SDLoc(Op), Addr, Op.getOperand(1),
+ MachinePointerInfo(SV));
+ }
+ auto &FuncInfo = *MF.getInfo<HexagonMachineFunctionInfo>();
+ auto &HFL = *Subtarget.getFrameLowering();
+ SDLoc DL(Op);
+ SmallVector<SDValue, 8> MemOps;
+
+ // Get frame index of va_list.
+ SDValue FIN = Op.getOperand(1);
+
+ // If the first vararg register is odd, add 4 bytes to the start of the
+ // saved register area so that it points to the first register location.
+ // This is because the saved register area has to be 8-byte aligned.
+ // In case of an odd start register, there will be 4 bytes of padding at
+ // the beginning of the saved register area. If all registers are used up,
+ // the condition below handles it correctly.
+ SDValue SavedRegAreaStartFrameIndex =
+ DAG.getFrameIndex(FuncInfo.getRegSavedAreaStartFrameIndex(), MVT::i32);
+
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ if (HFL.FirstVarArgSavedReg & 1)
+ SavedRegAreaStartFrameIndex =
+ DAG.getNode(ISD::ADD, DL, PtrVT,
+ DAG.getFrameIndex(FuncInfo.getRegSavedAreaStartFrameIndex(),
+ MVT::i32),
+ DAG.getIntPtrConstant(4, DL));
+
+ // Store the saved register area start pointer.
+ SDValue Store =
+ DAG.getStore(Op.getOperand(0), DL,
+ SavedRegAreaStartFrameIndex,
+ FIN, MachinePointerInfo(SV));
+ MemOps.push_back(Store);
+
+ // Store saved register area end pointer.
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT,
+ FIN, DAG.getIntPtrConstant(4, DL));
+ Store = DAG.getStore(Op.getOperand(0), DL,
+ DAG.getFrameIndex(FuncInfo.getVarArgsFrameIndex(),
+ PtrVT),
+ FIN, MachinePointerInfo(SV, 4));
+ MemOps.push_back(Store);
+
+ // Store overflow area pointer.
+ FIN = DAG.getNode(ISD::ADD, DL, PtrVT,
+ FIN, DAG.getIntPtrConstant(4, DL));
+ Store = DAG.getStore(Op.getOperand(0), DL,
+ DAG.getFrameIndex(FuncInfo.getVarArgsFrameIndex(),
+ PtrVT),
+ FIN, MachinePointerInfo(SV, 8));
+ MemOps.push_back(Store);
+
+ return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+}
+
+SDValue
+HexagonTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
+ // Assert that the linux ABI is enabled for the current compilation.
+ assert(Subtarget.isEnvironmentMusl() && "Linux ABI should be enabled");
+ SDValue Chain = Op.getOperand(0);
+ SDValue DestPtr = Op.getOperand(1);
+ SDValue SrcPtr = Op.getOperand(2);
+ const Value *DestSV = cast<SrcValueSDNode>(Op.getOperand(3))->getValue();
+ const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+ SDLoc DL(Op);
+ // The va_list is 12 bytes, as it holds 3 pointers. Therefore, we need to
+ // memcpy 12 bytes from one va_list to the other.
+ return DAG.getMemcpy(Chain, DL, DestPtr, SrcPtr,
+ DAG.getIntPtrConstant(12, DL), Align(4),
+ /*isVolatile*/ false, false, false,
+ MachinePointerInfo(DestSV), MachinePointerInfo(SrcSV));
}
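For orientation, a minimal sketch of the va_list object that the three stores in LowerVASTART populate and that LowerVACOPY copies wholesale (the struct and field names are assumptions for illustration, not the musl declarations):
struct HexagonVaListSketch {     // 12 bytes total, hence the memcpy size above
  void *CurrentSavedReg;         // +0: next unnamed argument in the register save area
  void *SavedRegEnd;             // +4: end of the register save area
  void *OverflowArea;            // +8: next unnamed argument passed on the stack
};
// A va_arg expansion would typically advance CurrentSavedReg until it reaches
// SavedRegEnd and then switch to OverflowArea; va_copy only needs to duplicate
// the three pointers.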
SDValue HexagonTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -943,57 +1087,40 @@ HexagonTargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
return SDValue();
}
-static Constant *convert_i1_to_i8(const Constant *ConstVal) {
- SmallVector<Constant *, 128> NewConst;
- const ConstantVector *CV = dyn_cast<ConstantVector>(ConstVal);
- if (!CV)
- return nullptr;
-
- LLVMContext &Ctx = ConstVal->getContext();
- IRBuilder<> IRB(Ctx);
- unsigned NumVectorElements = CV->getNumOperands();
- assert(isPowerOf2_32(NumVectorElements) &&
- "conversion only supported for pow2 VectorSize!");
-
- for (unsigned i = 0; i < NumVectorElements / 8; ++i) {
- uint8_t x = 0;
- for (unsigned j = 0; j < 8; ++j) {
- uint8_t y = CV->getOperand(i * 8 + j)->getUniqueInteger().getZExtValue();
- x |= y << (7 - j);
- }
- assert((x == 0 || x == 255) && "Either all 0's or all 1's expected!");
- NewConst.push_back(IRB.getInt8(x));
- }
- return ConstantVector::get(NewConst);
-}
-
SDValue
HexagonTargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
EVT ValTy = Op.getValueType();
ConstantPoolSDNode *CPN = cast<ConstantPoolSDNode>(Op);
Constant *CVal = nullptr;
bool isVTi1Type = false;
- if (const Constant *ConstVal = dyn_cast<Constant>(CPN->getConstVal())) {
- Type *CValTy = ConstVal->getType();
- if (CValTy->isVectorTy() &&
- CValTy->getVectorElementType()->isIntegerTy(1)) {
- CVal = convert_i1_to_i8(ConstVal);
- isVTi1Type = (CVal != nullptr);
+ if (auto *CV = dyn_cast<ConstantVector>(CPN->getConstVal())) {
+ if (cast<VectorType>(CV->getType())->getElementType()->isIntegerTy(1)) {
+ IRBuilder<> IRB(CV->getContext());
+ SmallVector<Constant*, 128> NewConst;
+ unsigned VecLen = CV->getNumOperands();
+ assert(isPowerOf2_32(VecLen) &&
+ "conversion only supported for pow2 VectorSize");
+ for (unsigned i = 0; i < VecLen; ++i)
+ NewConst.push_back(IRB.getInt8(CV->getOperand(i)->isZeroValue()));
+
+ CVal = ConstantVector::get(NewConst);
+ isVTi1Type = true;
}
}
- unsigned Align = CPN->getAlignment();
+ Align Alignment = CPN->getAlign();
bool IsPositionIndependent = isPositionIndependent();
unsigned char TF = IsPositionIndependent ? HexagonII::MO_PCREL : 0;
unsigned Offset = 0;
SDValue T;
if (CPN->isMachineConstantPoolEntry())
- T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Align, Offset,
- TF);
+ T = DAG.getTargetConstantPool(CPN->getMachineCPVal(), ValTy, Alignment,
+ Offset, TF);
else if (isVTi1Type)
- T = DAG.getTargetConstantPool(CVal, ValTy, Align, Offset, TF);
+ T = DAG.getTargetConstantPool(CVal, ValTy, Alignment, Offset, TF);
else
- T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Align, Offset, TF);
+ T = DAG.getTargetConstantPool(CPN->getConstVal(), ValTy, Alignment, Offset,
+ TF);
assert(cast<ConstantPoolSDNode>(T)->getTargetFlags() == TF &&
"Inconsistent target flag encountered");
@@ -1375,7 +1502,10 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::VASTART, MVT::Other, Custom);
setOperationAction(ISD::VAEND, MVT::Other, Expand);
setOperationAction(ISD::VAARG, MVT::Other, Expand);
- setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ if (Subtarget.isEnvironmentMusl())
+ setOperationAction(ISD::VACOPY, MVT::Other, Custom);
+ else
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
setOperationAction(ISD::STACKSAVE, MVT::Other, Expand);
setOperationAction(ISD::STACKRESTORE, MVT::Other, Expand);
@@ -1621,6 +1751,11 @@ HexagonTargetLowering::HexagonTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FADD, MVT::f64, Legal);
setOperationAction(ISD::FSUB, MVT::f64, Legal);
}
+ if (Subtarget.hasV67Ops()) {
+ setOperationAction(ISD::FMINNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMAXNUM, MVT::f64, Legal);
+ setOperationAction(ISD::FMUL, MVT::f64, Legal);
+ }
setTargetDAGCombine(ISD::VSELECT);
@@ -1855,8 +1990,7 @@ bool HexagonTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
// The offset value comes through Modifier register. For now, assume the
// offset is 0.
Info.offset = 0;
- Info.align =
- MaybeAlign(DL.getABITypeAlignment(Info.memVT.getTypeForEVT(Cont)));
+ Info.align = DL.getABITypeAlign(Info.memVT.getTypeForEVT(Cont));
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -2139,13 +2273,16 @@ HexagonTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
const SDLoc &dl(Op);
// Handle conversion from i8 to v8i1.
- if (ResTy == MVT::v8i1) {
- SDValue Sc = DAG.getBitcast(tyScalar(InpTy), InpV);
- SDValue Ext = DAG.getZExtOrTrunc(Sc, dl, MVT::i32);
- return getInstr(Hexagon::C2_tfrrp, dl, ResTy, Ext, DAG);
+ if (InpTy == MVT::i8) {
+ if (ResTy == MVT::v8i1) {
+ SDValue Sc = DAG.getBitcast(tyScalar(InpTy), InpV);
+ SDValue Ext = DAG.getZExtOrTrunc(Sc, dl, MVT::i32);
+ return getInstr(Hexagon::C2_tfrrp, dl, ResTy, Ext, DAG);
+ }
+ return SDValue();
}
- return SDValue();
+ return Op;
}
bool
@@ -2779,10 +2916,10 @@ HexagonTargetLowering::LowerUnalignedLoad(SDValue Op, SelectionDAG &DAG)
MachineMemOperand *WideMMO = nullptr;
if (MachineMemOperand *MMO = LN->getMemOperand()) {
MachineFunction &MF = DAG.getMachineFunction();
- WideMMO = MF.getMachineMemOperand(MMO->getPointerInfo(), MMO->getFlags(),
- 2*LoadLen, LoadLen, MMO->getAAInfo(), MMO->getRanges(),
- MMO->getSyncScopeID(), MMO->getOrdering(),
- MMO->getFailureOrdering());
+ WideMMO = MF.getMachineMemOperand(
+ MMO->getPointerInfo(), MMO->getFlags(), 2 * LoadLen, Align(LoadLen),
+ MMO->getAAInfo(), MMO->getRanges(), MMO->getSyncScopeID(),
+ MMO->getOrdering(), MMO->getFailureOrdering());
}
SDValue Load0 = DAG.getLoad(LoadTy, dl, Chain, Base0, WideMMO);
@@ -2928,6 +3065,7 @@ HexagonTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::GlobalAddress: return LowerGLOBALADDRESS(Op, DAG);
case ISD::BlockAddress: return LowerBlockAddress(Op, DAG);
case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG);
+ case ISD::VACOPY: return LowerVACOPY(Op, DAG);
case ISD::VASTART: return LowerVASTART(Op, DAG);
case ISD::DYNAMIC_STACKALLOC: return LowerDYNAMIC_STACKALLOC(Op, DAG);
case ISD::SETCC: return LowerSETCC(Op, DAG);
@@ -2946,6 +3084,12 @@ void
HexagonTargetLowering::LowerOperationWrapper(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
+ if (isHvxOperation(N)) {
+ LowerHvxOperationWrapper(N, Results, DAG);
+ if (!Results.empty())
+ return;
+ }
+
// We are only custom-lowering stores to verify the alignment of the
// address if it is a compile-time constant. Since a store can be modified
// during type-legalization (the value being stored may need legalization),
@@ -2959,6 +3103,12 @@ void
HexagonTargetLowering::ReplaceNodeResults(SDNode *N,
SmallVectorImpl<SDValue> &Results,
SelectionDAG &DAG) const {
+ if (isHvxOperation(N)) {
+ ReplaceHvxNodeResults(N, Results, DAG);
+ if (!Results.empty())
+ return;
+ }
+
const SDLoc &dl(N);
switch (N->getOpcode()) {
case ISD::SRL:
@@ -3079,8 +3229,8 @@ HexagonTargetLowering::getRegForInlineAsmConstraint(
switch (VT.getSizeInBits()) {
default:
return {0u, nullptr};
- case 512:
- case 1024:
+ case 64:
+ case 128:
return {0u, &Hexagon::HvxQRRegClass};
}
break;
@@ -3127,12 +3277,12 @@ bool HexagonTargetLowering::isLegalAddressingMode(const DataLayout &DL,
// The type Ty passed here would then be "void". Skip the alignment
// checks, but do not return false right away, since that confuses
// LSR into crashing.
- unsigned A = DL.getABITypeAlignment(Ty);
+ Align A = DL.getABITypeAlign(Ty);
// The base offset must be a multiple of the alignment.
- if ((AM.BaseOffs % A) != 0)
+ if (!isAligned(A, AM.BaseOffs))
return false;
// The shifted offset must fit in 11 bits.
- if (!isInt<11>(AM.BaseOffs >> Log2_32(A)))
+ if (!isInt<11>(AM.BaseOffs >> Log2(A)))
return false;
}
@@ -3232,30 +3382,36 @@ bool HexagonTargetLowering::IsEligibleForTailCallOptimization(
/// zero. 'MemcpyStrSrc' indicates whether the memcpy source is constant so it
/// does not need to be loaded. It returns EVT::Other if the type should be
/// determined using generic target-independent logic.
-EVT HexagonTargetLowering::getOptimalMemOpType(uint64_t Size,
- unsigned DstAlign, unsigned SrcAlign, bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc, const AttributeList &FuncAttributes) const {
-
- auto Aligned = [](unsigned GivenA, unsigned MinA) -> bool {
- return (GivenA % MinA) == 0;
- };
-
- if (Size >= 8 && Aligned(DstAlign, 8) && (IsMemset || Aligned(SrcAlign, 8)))
+EVT HexagonTargetLowering::getOptimalMemOpType(
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
+ if (Op.size() >= 8 && Op.isAligned(Align(8)))
return MVT::i64;
- if (Size >= 4 && Aligned(DstAlign, 4) && (IsMemset || Aligned(SrcAlign, 4)))
+ if (Op.size() >= 4 && Op.isAligned(Align(4)))
return MVT::i32;
- if (Size >= 2 && Aligned(DstAlign, 2) && (IsMemset || Aligned(SrcAlign, 2)))
+ if (Op.size() >= 2 && Op.isAligned(Align(2)))
return MVT::i16;
-
return MVT::Other;
}
+bool HexagonTargetLowering::allowsMemoryAccess(
+ LLVMContext &Context, const DataLayout &DL, EVT VT, unsigned AddrSpace,
+ Align Alignment, MachineMemOperand::Flags Flags, bool *Fast) const {
+ MVT SVT = VT.getSimpleVT();
+ if (Subtarget.isHVXVectorType(SVT, true))
+ return allowsHvxMemoryAccess(SVT, Flags, Fast);
+ return TargetLoweringBase::allowsMemoryAccess(
+ Context, DL, VT, AddrSpace, Alignment, Flags, Fast);
+}
+
bool HexagonTargetLowering::allowsMisalignedMemoryAccesses(
- EVT VT, unsigned AS, unsigned Align, MachineMemOperand::Flags Flags,
- bool *Fast) const {
+ EVT VT, unsigned AddrSpace, unsigned Alignment,
+ MachineMemOperand::Flags Flags, bool *Fast) const {
+ MVT SVT = VT.getSimpleVT();
+ if (Subtarget.isHVXVectorType(SVT, true))
+ return allowsHvxMisalignedMemoryAccesses(SVT, Flags, Fast);
if (Fast)
*Fast = false;
- return Subtarget.isHVXVectorType(VT.getSimpleVT());
+ return false;
}
std::pair<const TargetRegisterClass*, uint8_t>
@@ -3357,9 +3513,5 @@ bool HexagonTargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
TargetLowering::AtomicExpansionKind
HexagonTargetLowering::shouldExpandAtomicCmpXchgInIR(
AtomicCmpXchgInst *AI) const {
- const DataLayout &DL = AI->getModule()->getDataLayout();
- unsigned Size = DL.getTypeStoreSize(AI->getCompareOperand()->getType());
- if (Size >= 4 && Size <= 8)
- return AtomicExpansionKind::LLSC;
- return AtomicExpansionKind::None;
+ return AtomicExpansionKind::LLSC;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.h b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
index e79646de6287..7d6e6b6185c8 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.h
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.h
@@ -235,19 +235,20 @@ namespace HexagonISD {
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
return Hexagon::R0;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
return Hexagon::R1;
}
SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVACOPY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerJumpTable(SDValue Op, SelectionDAG &DAG) const;
@@ -301,12 +302,16 @@ namespace HexagonISD {
/// the immediate into a register.
bool isLegalICmpImmediate(int64_t Imm) const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign, bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const override;
+ EVT getOptimalMemOpType(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
+
+ bool allowsMemoryAccess(LLVMContext &Context, const DataLayout &DL, EVT VT,
+ unsigned AddrSpace, Align Alignment,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AddrSpace,
- unsigned Align, MachineMemOperand::Flags Flags, bool *Fast)
+ unsigned Alignment, MachineMemOperand::Flags Flags, bool *Fast)
const override;
/// Returns relocation base for the given PIC jumptable.
@@ -404,8 +409,15 @@ namespace HexagonISD {
VectorPair opSplit(SDValue Vec, const SDLoc &dl, SelectionDAG &DAG) const;
SDValue opCastElem(SDValue Vec, MVT ElemTy, SelectionDAG &DAG) const;
+ bool allowsHvxMemoryAccess(MVT VecTy, MachineMemOperand::Flags Flags,
+ bool *Fast) const;
+ bool allowsHvxMisalignedMemoryAccesses(MVT VecTy,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const;
+
bool isHvxSingleTy(MVT Ty) const;
bool isHvxPairTy(MVT Ty) const;
+ bool isHvxBoolTy(MVT Ty) const;
SDValue convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const;
SDValue getIndexInWord32(SDValue Idx, MVT ElemTy, SelectionDAG &DAG) const;
@@ -437,6 +449,8 @@ namespace HexagonISD {
const SDLoc &dl, SelectionDAG &DAG) const;
SDValue extendHvxVectorPred(SDValue VecV, const SDLoc &dl, MVT ResTy,
bool ZeroExt, SelectionDAG &DAG) const;
+ SDValue compressHvxPred(SDValue VecQ, const SDLoc &dl, MVT ResTy,
+ SelectionDAG &DAG) const;
SDValue LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxConcatVectors(SDValue Op, SelectionDAG &DAG) const;
@@ -444,7 +458,7 @@ namespace HexagonISD {
SDValue LowerHvxInsertElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtractSubvector(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxInsertSubvector(SDValue Op, SelectionDAG &DAG) const;
-
+ SDValue LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxAnyExt(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxSignExt(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxZeroExt(SDValue Op, SelectionDAG &DAG) const;
@@ -454,6 +468,9 @@ namespace HexagonISD {
SDValue LowerHvxSetCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerHvxShift(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerHvxStore(SDValue Op, SelectionDAG &DAG) const;
+ SDValue HvxVecPredBitcastComputation(SDValue Op, SelectionDAG &DAG) const;
SDValue SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const;
SDValue SplitHvxMemOp(SDValue Op, SelectionDAG &DAG) const;
@@ -463,8 +480,12 @@ namespace HexagonISD {
const override;
bool isHvxOperation(SDValue Op) const;
+ bool isHvxOperation(SDNode *N) const;
SDValue LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const;
-
+ void LowerHvxOperationWrapper(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
+ void ReplaceHvxNodeResults(SDNode *N, SmallVectorImpl<SDValue> &Results,
+ SelectionDAG &DAG) const;
SDValue PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const;
};
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
index 204950f9010e..7cda915fffe9 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLoweringHVX.cpp
@@ -9,6 +9,7 @@
#include "HexagonISelLowering.h"
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
+#include "llvm/IR/IntrinsicsHexagon.h"
#include "llvm/Support/CommandLine.h"
using namespace llvm;
@@ -39,7 +40,6 @@ HexagonTargetLowering::initializeHVXLowering() {
addRegisterClass(MVT::v16i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v512i1, &Hexagon::HvxQRRegClass);
} else if (Subtarget.useHVX128BOps()) {
addRegisterClass(MVT::v128i8, &Hexagon::HvxVRRegClass);
addRegisterClass(MVT::v64i16, &Hexagon::HvxVRRegClass);
@@ -50,7 +50,6 @@ HexagonTargetLowering::initializeHVXLowering() {
addRegisterClass(MVT::v32i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v64i1, &Hexagon::HvxQRRegClass);
addRegisterClass(MVT::v128i1, &Hexagon::HvxQRRegClass);
- addRegisterClass(MVT::v1024i1, &Hexagon::HvxQRRegClass);
}
// Set up operation actions.
@@ -66,8 +65,18 @@ HexagonTargetLowering::initializeHVXLowering() {
AddPromotedToType(Opc, FromTy, ToTy);
};
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
- setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+ // Handle bitcasts of vector predicates to scalars (e.g. v32i1 to i32).
+ // Note: v16i1 -> i16 is handled in type legalization instead of op
+ // legalization.
+ setOperationAction(ISD::BITCAST, MVT::i16, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i32, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i64, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v16i1, Custom);
+ setOperationAction(ISD::BITCAST, MVT::v128i1, Custom);
+ setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteV, Legal);
+ setOperationAction(ISD::VECTOR_SHUFFLE, ByteW, Legal);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
for (MVT T : LegalV) {
setIndexedLoadAction(ISD::POST_INC, T, Legal);
@@ -194,12 +203,13 @@ HexagonTargetLowering::initializeHVXLowering() {
setOperationAction(ISD::XOR, BoolV, Legal);
}
- if (Use64b)
+ if (Use64b) {
for (MVT T: {MVT::v32i8, MVT::v32i16, MVT::v16i8, MVT::v16i16, MVT::v16i32})
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Legal);
- else
+ } else {
for (MVT T: {MVT::v64i8, MVT::v64i16, MVT::v32i8, MVT::v32i16, MVT::v32i32})
setOperationAction(ISD::SIGN_EXTEND_INREG, T, Legal);
+ }
setTargetDAGCombine(ISD::VSELECT);
}
@@ -283,6 +293,37 @@ HexagonTargetLowering::isHvxPairTy(MVT Ty) const {
Ty.getSizeInBits() == 16 * Subtarget.getVectorLength();
}
+bool
+HexagonTargetLowering::isHvxBoolTy(MVT Ty) const {
+ return Subtarget.isHVXVectorType(Ty, true) &&
+ Ty.getVectorElementType() == MVT::i1;
+}
+
+bool HexagonTargetLowering::allowsHvxMemoryAccess(
+ MVT VecTy, MachineMemOperand::Flags Flags, bool *Fast) const {
+ // Bool vectors are excluded by default, but make it explicit to
+ // emphasize that bool vectors cannot be loaded or stored.
+ // Also, disallow double vector stores (to prevent unnecessary
+ // store widening in DAG combiner).
+ if (VecTy.getSizeInBits() > 8*Subtarget.getVectorLength())
+ return false;
+ if (!Subtarget.isHVXVectorType(VecTy, /*IncludeBool=*/false))
+ return false;
+ if (Fast)
+ *Fast = true;
+ return true;
+}
+
+bool HexagonTargetLowering::allowsHvxMisalignedMemoryAccesses(
+ MVT VecTy, MachineMemOperand::Flags Flags, bool *Fast) const {
+ if (!Subtarget.isHVXVectorType(VecTy))
+ return false;
+ // XXX Should this be false? vmemu are a bit slower than vmem.
+ if (Fast)
+ *Fast = true;
+ return true;
+}
+
SDValue
HexagonTargetLowering::convertToByteIndex(SDValue ElemIdx, MVT ElemTy,
SelectionDAG &DAG) const {
@@ -402,10 +443,11 @@ HexagonTargetLowering::buildHvxVectorReg(ArrayRef<SDValue> Values,
ArrayRef<Constant*> Tmp((Constant**)Consts.begin(),
(Constant**)Consts.end());
Constant *CV = ConstantVector::get(Tmp);
- unsigned Align = HwLen;
- SDValue CP = LowerConstantPool(DAG.getConstantPool(CV, VecTy, Align), DAG);
+ Align Alignment(HwLen);
+ SDValue CP =
+ LowerConstantPool(DAG.getConstantPool(CV, VecTy, Alignment), DAG);
return DAG.getLoad(VecTy, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(MF), Align);
+ MachinePointerInfo::getConstantPool(MF), Alignment);
}
// A special case is a situation where the vector is built entirely from
@@ -1023,6 +1065,63 @@ HexagonTargetLowering::extendHvxVectorPred(SDValue VecV, const SDLoc &dl,
}
SDValue
+HexagonTargetLowering::compressHvxPred(SDValue VecQ, const SDLoc &dl,
+ MVT ResTy, SelectionDAG &DAG) const {
+ // Given a predicate register VecQ, transfer bits VecQ[0..HwLen-1]
+ // (i.e. the entire predicate register) to bits [0..HwLen-1] of a
+ // vector register. The remaining bits of the vector register are
+ // unspecified.
+
+ MachineFunction &MF = DAG.getMachineFunction();
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT ByteTy = MVT::getVectorVT(MVT::i8, HwLen);
+ MVT PredTy = ty(VecQ);
+ unsigned PredLen = PredTy.getVectorNumElements();
+ assert(HwLen % PredLen == 0);
+ MVT VecTy = MVT::getVectorVT(MVT::getIntegerVT(8*HwLen/PredLen), PredLen);
+
+ Type *Int8Ty = Type::getInt8Ty(*DAG.getContext());
+ SmallVector<Constant*, 128> Tmp;
+ // Create an array of bytes (hex): 01,02,04,08,10,20,40,80, 01,02,04,08,...
+ // Each byte has exactly one bit set, namely bit (index % 8).
+ for (unsigned i = 0; i != HwLen/8; ++i) {
+ for (unsigned j = 0; j != 8; ++j)
+ Tmp.push_back(ConstantInt::get(Int8Ty, 1ull << j));
+ }
+ Constant *CV = ConstantVector::get(Tmp);
+ Align Alignment(HwLen);
+ SDValue CP =
+ LowerConstantPool(DAG.getConstantPool(CV, ByteTy, Alignment), DAG);
+ SDValue Bytes =
+ DAG.getLoad(ByteTy, dl, DAG.getEntryNode(), CP,
+ MachinePointerInfo::getConstantPool(MF), Alignment);
+
+ // Select the bytes that correspond to true bits in the vector predicate.
+ SDValue Sel = DAG.getSelect(dl, VecTy, VecQ, DAG.getBitcast(VecTy, Bytes),
+ getZero(dl, VecTy, DAG));
+ // Calculate the OR of all bytes in each group of 8. That will compress
+ // all the individual bits into a single byte.
+ // First, OR groups of 4, via vrmpy with 0x01010101.
+ SDValue All1 =
+ DAG.getSplatBuildVector(MVT::v4i8, dl, DAG.getConstant(1, dl, MVT::i32));
+ SDValue Vrmpy = getInstr(Hexagon::V6_vrmpyub, dl, ByteTy, {Sel, All1}, DAG);
+ // Then rotate the accumulated vector by 4 bytes, and do the final OR.
+ SDValue Rot = getInstr(Hexagon::V6_valignbi, dl, ByteTy,
+ {Vrmpy, Vrmpy, DAG.getTargetConstant(4, dl, MVT::i32)}, DAG);
+ SDValue Vor = DAG.getNode(ISD::OR, dl, ByteTy, {Vrmpy, Rot});
+
+ // Pick every 8th byte and coalesce them at the beginning of the output.
+ // For symmetry, coalesce every 1+8th byte after that, then every 2+8th
+ // byte and so on.
+ SmallVector<int,128> Mask;
+ for (unsigned i = 0; i != HwLen; ++i)
+ Mask.push_back((8*i) % HwLen + i/(HwLen/8));
+ SDValue Collect =
+ DAG.getVectorShuffle(ByteTy, dl, Vor, DAG.getUNDEF(ByteTy), Mask);
+ return DAG.getBitcast(ResTy, Collect);
+}
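As a cross-check on the vrmpy/valign/shuffle sequence, a hedged scalar model of the result it is meant to produce (per the contract stated at the top of the function; Q and Out are illustrative names, not variables in this file):
// Bit i of the packed result equals predicate bit Q[i]; only the first
// HwLen/8 output bytes are meaningful.
for (unsigned K = 0; K != HwLen / 8; ++K) {
  uint8_t B = 0;
  for (unsigned J = 0; J != 8; ++J)
    B |= uint8_t(Q[8 * K + J] & 1) << J;   // pack 8 predicate bits, LSB first
  Out[K] = B;
}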
+
+SDValue
HexagonTargetLowering::LowerHvxBuildVector(SDValue Op, SelectionDAG &DAG)
const {
const SDLoc &dl(Op);
@@ -1431,6 +1530,53 @@ HexagonTargetLowering::LowerHvxMulh(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
+HexagonTargetLowering::LowerHvxBitcast(SDValue Op, SelectionDAG &DAG) const {
+ SDValue ValQ = Op.getOperand(0);
+ MVT ResTy = ty(Op);
+ MVT VecTy = ty(ValQ);
+ const SDLoc &dl(Op);
+
+ if (isHvxBoolTy(VecTy) && ResTy.isScalarInteger()) {
+ unsigned HwLen = Subtarget.getVectorLength();
+ MVT WordTy = MVT::getVectorVT(MVT::i32, HwLen/4);
+ SDValue VQ = compressHvxPred(ValQ, dl, WordTy, DAG);
+ unsigned BitWidth = ResTy.getSizeInBits();
+
+ if (BitWidth < 64) {
+ SDValue W0 = extractHvxElementReg(VQ, DAG.getConstant(0, dl, MVT::i32),
+ dl, MVT::i32, DAG);
+ if (BitWidth == 32)
+ return W0;
+ assert(BitWidth < 32u);
+ return DAG.getZExtOrTrunc(W0, dl, ResTy);
+ }
+
+ // The result is >= 64 bits. The only options are 64 or 128.
+ assert(BitWidth == 64 || BitWidth == 128);
+ SmallVector<SDValue,4> Words;
+ for (unsigned i = 0; i != BitWidth/32; ++i) {
+ SDValue W = extractHvxElementReg(
+ VQ, DAG.getConstant(i, dl, MVT::i32), dl, MVT::i32, DAG);
+ Words.push_back(W);
+ }
+ SmallVector<SDValue,2> Combines;
+ assert(Words.size() % 2 == 0);
+ for (unsigned i = 0, e = Words.size(); i < e; i += 2) {
+ SDValue C = DAG.getNode(
+ HexagonISD::COMBINE, dl, MVT::i64, {Words[i+1], Words[i]});
+ Combines.push_back(C);
+ }
+
+ if (BitWidth == 64)
+ return Combines[0];
+
+ return DAG.getNode(ISD::BUILD_PAIR, dl, ResTy, Combines);
+ }
+
+ return Op;
+}
+
+SDValue
HexagonTargetLowering::LowerHvxExtend(SDValue Op, SelectionDAG &DAG) const {
// Sign- and zero-extends are legal.
assert(Op.getOpcode() == ISD::ANY_EXTEND_VECTOR_INREG);
@@ -1446,6 +1592,28 @@ HexagonTargetLowering::LowerHvxShift(SDValue Op, SelectionDAG &DAG) const {
}
SDValue
+HexagonTargetLowering::LowerHvxIntrinsic(SDValue Op, SelectionDAG &DAG) const {
+ const SDLoc &dl(Op);
+ MVT ResTy = ty(Op);
+
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ bool Use64b = Subtarget.useHVX64BOps();
+ unsigned IntPredCast = Use64b ? Intrinsic::hexagon_V6_pred_typecast
+ : Intrinsic::hexagon_V6_pred_typecast_128B;
+ if (IntNo == IntPredCast) {
+ SDValue Vs = Op.getOperand(1);
+ MVT OpTy = ty(Vs);
+ if (isHvxBoolTy(ResTy) && isHvxBoolTy(OpTy)) {
+ if (ResTy == OpTy)
+ return Vs;
+ return DAG.getNode(HexagonISD::TYPECAST, dl, ResTy, Vs);
+ }
+ }
+
+ return Op;
+}
+
+SDValue
HexagonTargetLowering::SplitHvxPairOp(SDValue Op, SelectionDAG &DAG) const {
assert(!Op.isMachineOpcode());
SmallVector<SDValue,2> OpsL, OpsH;
@@ -1566,7 +1734,7 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INSERT_VECTOR_ELT: return LowerHvxInsertElement(Op, DAG);
case ISD::EXTRACT_SUBVECTOR: return LowerHvxExtractSubvector(Op, DAG);
case ISD::EXTRACT_VECTOR_ELT: return LowerHvxExtractElement(Op, DAG);
-
+ case ISD::BITCAST: return LowerHvxBitcast(Op, DAG);
case ISD::ANY_EXTEND: return LowerHvxAnyExt(Op, DAG);
case ISD::SIGN_EXTEND: return LowerHvxSignExt(Op, DAG);
case ISD::ZERO_EXTEND: return LowerHvxZeroExt(Op, DAG);
@@ -1580,6 +1748,7 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::ANY_EXTEND_VECTOR_INREG: return LowerHvxExtend(Op, DAG);
case ISD::SETCC:
case ISD::INTRINSIC_VOID: return Op;
+ case ISD::INTRINSIC_WO_CHAIN: return LowerHvxIntrinsic(Op, DAG);
// Unaligned loads will be handled by the default lowering.
case ISD::LOAD: return SDValue();
}
@@ -1589,6 +1758,28 @@ HexagonTargetLowering::LowerHvxOperation(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Unhandled HVX operation");
}
+void
+HexagonTargetLowering::LowerHvxOperationWrapper(SDNode *N,
+ SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+}
+
+void
+HexagonTargetLowering::ReplaceHvxNodeResults(SDNode *N,
+ SmallVectorImpl<SDValue> &Results, SelectionDAG &DAG) const {
+ unsigned Opc = N->getOpcode();
+ switch (Opc) {
+ case ISD::BITCAST:
+ if (isHvxBoolTy(ty(N->getOperand(0)))) {
+ SDValue Op(N, 0);
+ SDValue C = LowerHvxBitcast(Op, DAG);
+ Results.push_back(C);
+ }
+ break;
+ default:
+ break;
+ }
+}
+
SDValue
HexagonTargetLowering::PerformHvxDAGCombine(SDNode *N, DAGCombinerInfo &DCI)
const {
@@ -1621,3 +1812,16 @@ HexagonTargetLowering::isHvxOperation(SDValue Op) const {
return Subtarget.isHVXVectorType(ty(V), true);
});
}
+
+bool
+HexagonTargetLowering::isHvxOperation(SDNode *N) const {
+ // If any result type or any operand type is an HVX vector type, this is
+ // an HVX operation.
+ auto IsHvxTy = [this] (EVT Ty) {
+ return Ty.isSimple() && Subtarget.isHVXVectorType(Ty.getSimpleVT(), true);
+ };
+ auto IsHvxOp = [this] (SDValue Op) {
+ return Subtarget.isHVXVectorType(ty(Op), true);
+ };
+ return llvm::any_of(N->values(), IsHvxTy) || llvm::any_of(N->ops(), IsHvxOp);
+}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
index f156de671059..ef2b3040931d 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormats.td
@@ -37,6 +37,8 @@ def HVXVectorAccess : MemAccessSize<5>;
// Instruction Class Declaration +
//===----------------------------------------------------------------------===//
+// "Parse" bits are explicitly NOT defined in the opcode space to prevent
+// TableGen from using them for generation of the decoder tables.
class OpcodeHexagon {
field bits<32> Inst = ?; // Default to an invalid insn.
bits<4> IClass = 0; // ICLASS
@@ -164,6 +166,9 @@ class InstHexagon<dag outs, dag ins, string asmstr, list<dag> pattern,
bit CVINew = 0;
let TSFlags{62} = CVINew;
+ bit isCVI = 0;
+ let TSFlags{63} = isCVI;
+
// Fields used for relation models.
bit isNonTemporal = 0;
string isNT = ""; // set to "true" for non-temporal vector stores.
@@ -226,9 +231,105 @@ class PseudoM<dag outs, dag ins, string asmstr, list<dag> pattern = [],
OpcodeHexagon;
//===----------------------------------------------------------------------===//
+// Special Instructions -
+//===----------------------------------------------------------------------===//
+
+// The 'invalid_decode' instruction is used by the disassembler to
+// show an instruction that didn't decode correctly. This feature
+// is only leveraged in a special disassembler mode that's activated
+// by a command line flag.
+def tc_invalid : InstrItinClass;
+class Enc_invalid : OpcodeHexagon {
+}
+def invalid_decode : HInst<
+(outs ),
+(ins ),
+"<invalid>",
+tc_invalid, TypeALU32_2op>, Enc_invalid {
+let Inst{13-0} = 0b00000000000000;
+let Inst{31-16} = 0b0000000000000000;
+let isCodeGenOnly = 1;
+}
+
+//===----------------------------------------------------------------------===//
+// Duplex Instruction Class Declaration
+//===----------------------------------------------------------------------===//
+
+class OpcodeDuplex {
+ field bits<32> Inst = ?; // Default to an invalid insn.
+ bits<4> IClass = 0; // ICLASS
+ bits<13> ISubHi = 0; // Low sub-insn
+ bits<13> ISubLo = 0; // High sub-insn
+
+ let Inst{31-29} = IClass{3-1};
+ let Inst{13} = IClass{0};
+ let Inst{15-14} = 0;
+ let Inst{28-16} = ISubHi;
+ let Inst{12-0} = ISubLo;
+}
+
+class InstDuplex<bits<4> iClass, list<dag> pattern = [],
+ string cstr = "">
+ : Instruction, OpcodeDuplex {
+ let Namespace = "Hexagon";
+ IType Type = TypeDUPLEX; // uses slot 0,1
+ let isCodeGenOnly = 1;
+ let hasSideEffects = 0;
+ dag OutOperandList = (outs);
+ dag InOperandList = (ins);
+ let IClass = iClass;
+ let Constraints = cstr;
+ let Itinerary = DUPLEX;
+ let Size = 4;
+
+ // SoftFail is a field the disassembler can use to provide a way for
+ // instructions to not match without killing the whole decode process. It is
+ // mainly used for ARM, but Tablegen expects this field to exist or it fails
+ // to build the decode table.
+ field bits<32> SoftFail = 0;
+
+ // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
+
+ let TSFlags{6-0} = Type.Value;
+
+ // Predicated instructions.
+ bits<1> isPredicated = 0;
+ let TSFlags{7} = isPredicated;
+ bits<1> isPredicatedFalse = 0;
+ let TSFlags{8} = isPredicatedFalse;
+ bits<1> isPredicatedNew = 0;
+ let TSFlags{9} = isPredicatedNew;
+
+ // New-value insn helper fields.
+ bits<1> isNewValue = 0;
+ let TSFlags{10} = isNewValue; // New-value consumer insn.
+ bits<1> hasNewValue = 0;
+ let TSFlags{11} = hasNewValue; // New-value producer insn.
+ bits<3> opNewValue = 0;
+ let TSFlags{14-12} = opNewValue; // New-value produced operand.
+ bits<1> isNVStorable = 0;
+ let TSFlags{15} = isNVStorable; // Store that can become new-value store.
+ bits<1> isNVStore = 0;
+ let TSFlags{16} = isNVStore; // New-value store insn.
+
+ // Immediate extender helper fields.
+ bits<1> isExtendable = 0;
+ let TSFlags{17} = isExtendable; // Insn may be extended.
+ bits<1> isExtended = 0;
+ let TSFlags{18} = isExtended; // Insn must be extended.
+ bits<3> opExtendable = 0;
+ let TSFlags{21-19} = opExtendable; // Which operand may be extended.
+ bits<1> isExtentSigned = 0;
+ let TSFlags{22} = isExtentSigned; // Signed or unsigned range.
+ bits<5> opExtentBits = 0;
+ let TSFlags{27-23} = opExtentBits; //Number of bits of range before extending.
+ bits<2> opExtentAlign = 0;
+ let TSFlags{29-28} = opExtentAlign; // Alignment exponent before extending.
+}
+
+//===----------------------------------------------------------------------===//
// Instruction Classes Definitions -
//===----------------------------------------------------------------------===//
-include "HexagonInstrFormatsV5.td"
include "HexagonInstrFormatsV60.td"
include "HexagonInstrFormatsV65.td"
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV5.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV5.td
deleted file mode 100644
index 68ef2d2d3a8a..000000000000
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV5.td
+++ /dev/null
@@ -1,86 +0,0 @@
-//==- HexagonInstrFormatsV5.td - Hexagon Instruction Formats --*- tablegen -==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This file describes the Hexagon V5 instruction classes in TableGen format.
-//
-//===----------------------------------------------------------------------===//
-
-// Duplex Instruction Class Declaration
-//===----------------------------------------------------------------------===//
-
-class OpcodeDuplex {
- field bits<32> Inst = ?; // Default to an invalid insn.
- bits<4> IClass = 0; // ICLASS
- bits<13> ISubHi = 0; // Low sub-insn
- bits<13> ISubLo = 0; // High sub-insn
-
- let Inst{31-29} = IClass{3-1};
- let Inst{13} = IClass{0};
- let Inst{15-14} = 0;
- let Inst{28-16} = ISubHi;
- let Inst{12-0} = ISubLo;
-}
-
-class InstDuplex<bits<4> iClass, list<dag> pattern = [],
- string cstr = "">
- : Instruction, OpcodeDuplex {
- let Namespace = "Hexagon";
- IType Type = TypeDUPLEX; // uses slot 0,1
- let isCodeGenOnly = 1;
- let hasSideEffects = 0;
- dag OutOperandList = (outs);
- dag InOperandList = (ins);
- let IClass = iClass;
- let Constraints = cstr;
- let Itinerary = DUPLEX;
- let Size = 4;
-
- // SoftFail is a field the disassembler can use to provide a way for
- // instructions to not match without killing the whole decode process. It is
- // mainly used for ARM, but Tablegen expects this field to exist or it fails
- // to build the decode table.
- field bits<32> SoftFail = 0;
-
- // *** Must match MCTargetDesc/HexagonBaseInfo.h ***
-
- let TSFlags{6-0} = Type.Value;
-
- // Predicated instructions.
- bits<1> isPredicated = 0;
- let TSFlags{7} = isPredicated;
- bits<1> isPredicatedFalse = 0;
- let TSFlags{8} = isPredicatedFalse;
- bits<1> isPredicatedNew = 0;
- let TSFlags{9} = isPredicatedNew;
-
- // New-value insn helper fields.
- bits<1> isNewValue = 0;
- let TSFlags{10} = isNewValue; // New-value consumer insn.
- bits<1> hasNewValue = 0;
- let TSFlags{11} = hasNewValue; // New-value producer insn.
- bits<3> opNewValue = 0;
- let TSFlags{14-12} = opNewValue; // New-value produced operand.
- bits<1> isNVStorable = 0;
- let TSFlags{15} = isNVStorable; // Store that can become new-value store.
- bits<1> isNVStore = 0;
- let TSFlags{16} = isNVStore; // New-value store insn.
-
- // Immediate extender helper fields.
- bits<1> isExtendable = 0;
- let TSFlags{17} = isExtendable; // Insn may be extended.
- bits<1> isExtended = 0;
- let TSFlags{18} = isExtended; // Insn must be extended.
- bits<3> opExtendable = 0;
- let TSFlags{21-19} = opExtendable; // Which operand may be extended.
- bits<1> isExtentSigned = 0;
- let TSFlags{22} = isExtentSigned; // Signed or unsigned range.
- bits<5> opExtentBits = 0;
- let TSFlags{27-23} = opExtentBits; //Number of bits of range before extending.
- bits<2> opExtentAlign = 0;
- let TSFlags{29-28} = opExtentAlign; // Alignment exponent before extending.
-}
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
index eaecffe9c89e..246a1d364d41 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
+++ b/llvm/lib/Target/Hexagon/HexagonInstrFormatsV65.td
@@ -11,13 +11,13 @@
//===----------------------------------------------------------------------===//
//----------------------------------------------------------------------------//
-// Hexagon Intruction Flags +
+// Hexagon Instruction Flags +
//
// *** Must match BaseInfo.h ***
//----------------------------------------------------------------------------//
//----------------------------------------------------------------------------//
-// Intruction Classes Definitions +
+// Instruction Classes Definitions +
//----------------------------------------------------------------------------//
class CVI_VA_Resource_NoOpcode<dag outs, dag ins, string asmstr,
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
index 39ec8936214e..d1cd23c3be3e 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.cpp
@@ -118,6 +118,12 @@ HexagonInstrInfo::HexagonInstrInfo(HexagonSubtarget &ST)
: HexagonGenInstrInfo(Hexagon::ADJCALLSTACKDOWN, Hexagon::ADJCALLSTACKUP),
Subtarget(ST) {}
+namespace llvm {
+namespace HexagonFUnits {
+ bool isSlot0Only(unsigned units);
+}
+}
+
static bool isIntRegForSubInst(unsigned Reg) {
return (Reg >= Hexagon::R0 && Reg <= Hexagon::R7) ||
(Reg >= Hexagon::R16 && Reg <= Hexagon::R23);
@@ -370,7 +376,7 @@ bool HexagonInstrInfo::hasStoreToStackSlot(
/// This function can analyze one/two way branching only and should (mostly) be
/// called by target independent side.
/// First entry is always the opcode of the branching instruction, except when
-/// the Cond vector is supposed to be empty, e.g., when AnalyzeBranch fails, a
+/// the Cond vector is supposed to be empty, e.g., when analyzeBranch fails, a
/// BB with only unconditional jump. Subsequent entries depend upon the opcode,
/// e.g. Jump_c p will have
/// Cond[0] = Jump_c
@@ -784,6 +790,25 @@ bool HexagonInstrInfo::isProfitableToDupForIfCvt(MachineBasicBlock &MBB,
return NumInstrs <= 4;
}
+static void getLiveInRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
+ SmallVector<std::pair<MCPhysReg, const MachineOperand*>,2> Clobbers;
+ const MachineBasicBlock &B = *MI.getParent();
+ Regs.addLiveIns(B);
+ auto E = MachineBasicBlock::const_iterator(MI.getIterator());
+ for (auto I = B.begin(); I != E; ++I) {
+ Clobbers.clear();
+ Regs.stepForward(*I, Clobbers);
+ }
+}
+
+static void getLiveOutRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
+ const MachineBasicBlock &B = *MI.getParent();
+ Regs.addLiveOuts(B);
+ auto E = ++MachineBasicBlock::const_iterator(MI.getIterator()).getReverse();
+ for (auto I = B.rbegin(); I != E; ++I)
+ Regs.stepBackward(*I);
+}
+
void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg,
@@ -849,11 +874,15 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
if (Hexagon::HvxWRRegClass.contains(SrcReg, DestReg)) {
- Register LoSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
- Register HiSrc = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+ LivePhysRegs LiveAtMI(HRI);
+ getLiveInRegsAt(LiveAtMI, *I);
+ Register SrcLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+ Register SrcHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+ unsigned UndefLo = getUndefRegState(!LiveAtMI.contains(SrcLo));
+ unsigned UndefHi = getUndefRegState(!LiveAtMI.contains(SrcHi));
BuildMI(MBB, I, DL, get(Hexagon::V6_vcombine), DestReg)
- .addReg(HiSrc, KillFlag)
- .addReg(LoSrc, KillFlag);
+ .addReg(SrcHi, KillFlag | UndefHi)
+ .addReg(SrcLo, KillFlag | UndefLo);
return;
}
if (Hexagon::HvxQRRegClass.contains(SrcReg, DestReg)) {
@@ -882,17 +911,16 @@ void HexagonInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I, unsigned SrcReg, bool isKill, int FI,
+ MachineBasicBlock::iterator I, Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC, const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned SlotAlign = MFI.getObjectAlignment(FI);
unsigned KillFlag = getKillRegState(isKill);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), SlotAlign);
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::S2_storeri_io))
@@ -928,17 +956,16 @@ void HexagonInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
}
void HexagonInstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned DestReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I, Register DestReg,
int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(I);
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned SlotAlign = MFI.getObjectAlignment(FI);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), SlotAlign);
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
if (Hexagon::IntRegsRegClass.hasSubClassEq(RC)) {
BuildMI(MBB, I, DL, get(Hexagon::L2_loadri_io), DestReg)
@@ -966,14 +993,6 @@ void HexagonInstrInfo::loadRegFromStackSlot(
}
}
-static void getLiveRegsAt(LivePhysRegs &Regs, const MachineInstr &MI) {
- const MachineBasicBlock &B = *MI.getParent();
- Regs.addLiveOuts(B);
- auto E = ++MachineBasicBlock::const_iterator(MI.getIterator()).getReverse();
- for (auto I = B.rbegin(); I != E; ++I)
- Regs.stepBackward(*I);
-}
-
/// expandPostRAPseudo - This function is called for all pseudo instructions
/// that remain after register allocation. Many pseudo instructions are
/// created to help register allocation. This is the place to convert them
@@ -985,6 +1004,7 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineFunction &MF = *MBB.getParent();
MachineRegisterInfo &MRI = MF.getRegInfo();
const HexagonRegisterInfo &HRI = *Subtarget.getRegisterInfo();
+ LivePhysRegs LiveIn(HRI), LiveOut(HRI);
DebugLoc DL = MI.getDebugLoc();
unsigned Opc = MI.getOpcode();
@@ -1005,10 +1025,9 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
auto UseAligned = [&] (const MachineInstr &MI, unsigned NeedAlign) {
if (MI.memoperands().empty())
return false;
- return all_of(MI.memoperands(),
- [NeedAlign] (const MachineMemOperand *MMO) {
- return NeedAlign <= MMO->getAlignment();
- });
+ return all_of(MI.memoperands(), [NeedAlign](const MachineMemOperand *MMO) {
+ return MMO->getAlign() >= NeedAlign;
+ });
};
switch (Opc) {
@@ -1032,10 +1051,15 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case Hexagon::V6_vassignp: {
Register SrcReg = MI.getOperand(1).getReg();
Register DstReg = MI.getOperand(0).getReg();
+ Register SrcLo = HRI.getSubReg(SrcReg, Hexagon::vsub_lo);
+ Register SrcHi = HRI.getSubReg(SrcReg, Hexagon::vsub_hi);
+ getLiveInRegsAt(LiveIn, MI);
+ unsigned UndefLo = getUndefRegState(!LiveIn.contains(SrcLo));
+ unsigned UndefHi = getUndefRegState(!LiveIn.contains(SrcHi));
unsigned Kill = getKillRegState(MI.getOperand(1).isKill());
BuildMI(MBB, MI, DL, get(Hexagon::V6_vcombine), DstReg)
- .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_hi), Kill)
- .addReg(HRI.getSubReg(SrcReg, Hexagon::vsub_lo), Kill);
+ .addReg(SrcHi, UndefHi)
+ .addReg(SrcLo, Kill | UndefLo);
MBB.erase(MI);
return true;
}
@@ -1255,9 +1279,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
const MachineOperand &Op1 = MI.getOperand(1);
const MachineOperand &Op2 = MI.getOperand(2);
const MachineOperand &Op3 = MI.getOperand(3);
- LivePhysRegs LiveAtMI(HRI);
- getLiveRegsAt(LiveAtMI, MI);
- bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+ getLiveOutRegsAt(LiveOut, MI);
+ bool IsDestLive = !LiveOut.available(MRI, Op0.getReg());
Register PReg = Op1.getReg();
assert(Op1.getSubReg() == 0);
unsigned PState = getRegState(Op1);
@@ -1289,9 +1312,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MachineOperand &Op1 = MI.getOperand(1);
MachineOperand &Op2 = MI.getOperand(2);
MachineOperand &Op3 = MI.getOperand(3);
- LivePhysRegs LiveAtMI(HRI);
- getLiveRegsAt(LiveAtMI, MI);
- bool IsDestLive = !LiveAtMI.available(MRI, Op0.getReg());
+ getLiveOutRegsAt(LiveOut, MI);
+ bool IsDestLive = !LiveOut.available(MRI, Op0.getReg());
Register PReg = Op1.getReg();
assert(Op1.getSubReg() == 0);
unsigned PState = getRegState(Op1);
@@ -1349,7 +1371,8 @@ bool HexagonInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
static const CrashPseudoSourceValue CrashPSV(*this);
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo(&CrashPSV),
- MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 8, 1);
+ MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 8,
+ Align(1));
BuildMI(MBB, MI, DL, get(Hexagon::PS_loadrdabs), Hexagon::D13)
.addImm(0xBADC0FEE) // Misaligned load.
.addMemOperand(MMO);
@@ -1707,6 +1730,10 @@ bool HexagonInstrInfo::isSchedulingBoundary(const MachineInstr &MI,
if (MI.getDesc().isTerminator() || MI.isPosition())
return true;
+ // INLINEASM_BR can jump to another block
+ if (MI.getOpcode() == TargetOpcode::INLINEASM_BR)
+ return true;
+
if (MI.isInlineAsm() && !ScheduleInlineAsm)
return true;
@@ -1735,7 +1762,7 @@ unsigned HexagonInstrInfo::getInlineAsmLength(const char *Str,
if (*Str == '\n' || strncmp(Str, MAI.getSeparatorString(),
strlen(MAI.getSeparatorString())) == 0)
atInsnStart = true;
- if (atInsnStart && !std::isspace(static_cast<unsigned char>(*Str))) {
+ if (atInsnStart && !isSpace(static_cast<unsigned char>(*Str))) {
Length += MaxInstLength;
atInsnStart = false;
}
@@ -1762,8 +1789,8 @@ HexagonInstrInfo::CreateTargetPostRAHazardRecognizer(
/// \p SrcReg and \p SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
-bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask,
+bool HexagonInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &Mask,
int &Value) const {
unsigned Opc = MI.getOpcode();
@@ -2940,12 +2967,16 @@ bool HexagonInstrInfo::addLatencyToSchedule(const MachineInstr &MI1,
}
/// Get the base register and byte offset of a load/store instr.
-bool HexagonInstrInfo::getMemOperandWithOffset(
- const MachineInstr &LdSt, const MachineOperand *&BaseOp, int64_t &Offset,
+bool HexagonInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
- unsigned AccessSize = 0;
- BaseOp = getBaseAndOffset(LdSt, Offset, AccessSize);
- return BaseOp != nullptr && BaseOp->isReg();
+ OffsetIsScalable = false;
+ const MachineOperand *BaseOp = getBaseAndOffset(LdSt, Offset, Width);
+ if (!BaseOp || !BaseOp->isReg())
+ return false;
+ BaseOps.push_back(BaseOp);
+ return true;
}
/// Can these instructions execute at the same time in a bundle.
@@ -3403,6 +3434,64 @@ unsigned HexagonInstrInfo::getCompoundOpcode(const MachineInstr &GA,
: Hexagon::J4_cmpeqi_tp1_jump_nt;
}
+// Returns -1 if there is no opcode found.
+int HexagonInstrInfo::getDuplexOpcode(const MachineInstr &MI,
+ bool ForBigCore) const {
+ // Static table to switch the opcodes across Tiny Core and Big Core.
+ // dup_* opcodes are Big Core opcodes.
+ // NOTE: There are special instructions that need to be handled later:
+ // L4_return* instructions will only occupy SLOT0 (on Big Core too).
+ // PS_jmpret - this pseudo translates to J2_jumpr, which occupies only SLOT2.
+ // The compiler needs to base the root instruction on L6_return_map_to_raw,
+ // which can go in any slot.
+ static const std::map<unsigned, unsigned> DupMap = {
+ {Hexagon::A2_add, Hexagon::dup_A2_add},
+ {Hexagon::A2_addi, Hexagon::dup_A2_addi},
+ {Hexagon::A2_andir, Hexagon::dup_A2_andir},
+ {Hexagon::A2_combineii, Hexagon::dup_A2_combineii},
+ {Hexagon::A2_sxtb, Hexagon::dup_A2_sxtb},
+ {Hexagon::A2_sxth, Hexagon::dup_A2_sxth},
+ {Hexagon::A2_tfr, Hexagon::dup_A2_tfr},
+ {Hexagon::A2_tfrsi, Hexagon::dup_A2_tfrsi},
+ {Hexagon::A2_zxtb, Hexagon::dup_A2_zxtb},
+ {Hexagon::A2_zxth, Hexagon::dup_A2_zxth},
+ {Hexagon::A4_combineii, Hexagon::dup_A4_combineii},
+ {Hexagon::A4_combineir, Hexagon::dup_A4_combineir},
+ {Hexagon::A4_combineri, Hexagon::dup_A4_combineri},
+ {Hexagon::C2_cmoveif, Hexagon::dup_C2_cmoveif},
+ {Hexagon::C2_cmoveit, Hexagon::dup_C2_cmoveit},
+ {Hexagon::C2_cmovenewif, Hexagon::dup_C2_cmovenewif},
+ {Hexagon::C2_cmovenewit, Hexagon::dup_C2_cmovenewit},
+ {Hexagon::C2_cmpeqi, Hexagon::dup_C2_cmpeqi},
+ {Hexagon::L2_deallocframe, Hexagon::dup_L2_deallocframe},
+ {Hexagon::L2_loadrb_io, Hexagon::dup_L2_loadrb_io},
+ {Hexagon::L2_loadrd_io, Hexagon::dup_L2_loadrd_io},
+ {Hexagon::L2_loadrh_io, Hexagon::dup_L2_loadrh_io},
+ {Hexagon::L2_loadri_io, Hexagon::dup_L2_loadri_io},
+ {Hexagon::L2_loadrub_io, Hexagon::dup_L2_loadrub_io},
+ {Hexagon::L2_loadruh_io, Hexagon::dup_L2_loadruh_io},
+ {Hexagon::S2_allocframe, Hexagon::dup_S2_allocframe},
+ {Hexagon::S2_storerb_io, Hexagon::dup_S2_storerb_io},
+ {Hexagon::S2_storerd_io, Hexagon::dup_S2_storerd_io},
+ {Hexagon::S2_storerh_io, Hexagon::dup_S2_storerh_io},
+ {Hexagon::S2_storeri_io, Hexagon::dup_S2_storeri_io},
+ {Hexagon::S4_storeirb_io, Hexagon::dup_S4_storeirb_io},
+ {Hexagon::S4_storeiri_io, Hexagon::dup_S4_storeiri_io},
+ };
+ unsigned OpNum = MI.getOpcode();
+ // Conversion to Big core.
+ if (ForBigCore) {
+ auto Iter = DupMap.find(OpNum);
+ if (Iter != DupMap.end())
+ return Iter->second;
+ } else { // Conversion to Tiny core.
+ for (auto Iter = DupMap.begin(), End = DupMap.end(); Iter != End; ++Iter)
+ if (Iter->second == OpNum)
+ return Iter->first;
+ }
+ return -1;
+}
+
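A minimal usage sketch of the two-way mapping above (editorial illustration; HII denotes a HexagonInstrInfo pointer and MI a MachineInstr whose opcode has a duplex twin):

    // For an MI carrying e.g. A2_addi, the forward query returns dup_A2_addi.
    int Big = HII->getDuplexOpcode(MI, /*ForBigCore=*/true);
    // For an MI carrying dup_A2_addi, the reverse query scans the table values
    // and recovers A2_addi.
    int Tiny = HII->getDuplexOpcode(MI, /*ForBigCore=*/false);
    // Either direction returns -1 when no mapping exists.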
int HexagonInstrInfo::getCondOpcode(int Opc, bool invertPredicate) const {
enum Hexagon::PredSense inPredSense;
inPredSense = invertPredicate ? Hexagon::PredSense_false :
@@ -3735,6 +3824,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
// Rd = memw(Rs+#u4:2)
// Rd = memub(Rs+#u4:0)
case Hexagon::L2_loadri_io:
+ case Hexagon::dup_L2_loadri_io:
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
// Special case this one from Group L2.
@@ -3753,6 +3843,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
}
break;
case Hexagon::L2_loadrub_io:
+ case Hexagon::dup_L2_loadrub_io:
// Rd = memub(Rs+#u4:0)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -3772,6 +3863,8 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
// [if ([!]p0[.new])] jumpr r31
case Hexagon::L2_loadrh_io:
case Hexagon::L2_loadruh_io:
+ case Hexagon::dup_L2_loadrh_io:
+ case Hexagon::dup_L2_loadruh_io:
// Rd = memh/memuh(Rs+#u3:1)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -3781,6 +3874,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_L2;
break;
case Hexagon::L2_loadrb_io:
+ case Hexagon::dup_L2_loadrb_io:
// Rd = memb(Rs+#u3:0)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -3790,6 +3884,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_L2;
break;
case Hexagon::L2_loadrd_io:
+ case Hexagon::dup_L2_loadrd_io:
// Rdd = memd(r29+#u5:3)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -3806,6 +3901,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
case Hexagon::RESTORE_DEALLOC_RET_JMP_V4_PIC:
case Hexagon::L4_return:
case Hexagon::L2_deallocframe:
+ case Hexagon::dup_L2_deallocframe:
return HexagonII::HSIG_L2;
case Hexagon::EH_RETURN_JMPR:
case Hexagon::PS_jmpret:
@@ -3825,6 +3921,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
case Hexagon::SL2_jumpr31_t:
case Hexagon::SL2_jumpr31_f:
case Hexagon::SL2_jumpr31_tnew:
+ case Hexagon::SL2_jumpr31_fnew:
DstReg = MI.getOperand(1).getReg();
SrcReg = MI.getOperand(0).getReg();
// [if ([!]p0[.new])] jumpr r31
@@ -3850,6 +3947,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
// memw(Rs+#u4:2) = Rt
// memb(Rs+#u4:0) = Rt
case Hexagon::S2_storeri_io:
+ case Hexagon::dup_S2_storeri_io:
// Special case this one from Group S2.
// memw(r29+#u5:2) = Rt
Src1Reg = MI.getOperand(0).getReg();
@@ -3866,6 +3964,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_S1;
break;
case Hexagon::S2_storerb_io:
+ case Hexagon::dup_S2_storerb_io:
// memb(Rs+#u4:0) = Rt
Src1Reg = MI.getOperand(0).getReg();
Src2Reg = MI.getOperand(2).getReg();
@@ -3883,6 +3982,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
// memb(Rs+#u4) = #U1
// allocframe(#u5:3)
case Hexagon::S2_storerh_io:
+ case Hexagon::dup_S2_storerh_io:
// memh(Rs+#u3:1) = Rt
Src1Reg = MI.getOperand(0).getReg();
Src2Reg = MI.getOperand(2).getReg();
@@ -3892,6 +3992,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_S1;
break;
case Hexagon::S2_storerd_io:
+ case Hexagon::dup_S2_storerd_io:
// memd(r29+#s6:3) = Rtt
Src1Reg = MI.getOperand(0).getReg();
Src2Reg = MI.getOperand(2).getReg();
@@ -3902,6 +4003,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_S2;
break;
case Hexagon::S4_storeiri_io:
+ case Hexagon::dup_S4_storeiri_io:
// memw(Rs+#u4:2) = #U1
Src1Reg = MI.getOperand(0).getReg();
if (isIntRegForSubInst(Src1Reg) && MI.getOperand(1).isImm() &&
@@ -3910,6 +4012,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_S2;
break;
case Hexagon::S4_storeirb_io:
+ case Hexagon::dup_S4_storeirb_io:
// memb(Rs+#u4) = #U1
Src1Reg = MI.getOperand(0).getReg();
if (isIntRegForSubInst(Src1Reg) &&
@@ -3918,6 +4021,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_S2;
break;
case Hexagon::S2_allocframe:
+ case Hexagon::dup_S2_allocframe:
if (MI.getOperand(2).isImm() &&
isShiftedUInt<5,3>(MI.getOperand(2).getImm()))
return HexagonII::HSIG_S1;
@@ -3941,6 +4045,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
// Rd = sxth/sxtb/zxtb/zxth(Rs)
// Rd = and(Rs,#1)
case Hexagon::A2_addi:
+ case Hexagon::dup_A2_addi:
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
if (isIntRegForSubInst(DstReg)) {
@@ -3962,6 +4067,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
}
break;
case Hexagon::A2_add:
+ case Hexagon::dup_A2_add:
// Rx = add(Rx,Rs)
DstReg = MI.getOperand(0).getReg();
Src1Reg = MI.getOperand(1).getReg();
@@ -3971,6 +4077,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_A;
break;
case Hexagon::A2_andir:
+ case Hexagon::dup_A2_andir:
// Same as zxtb.
// Rd16=and(Rs16,#255)
// Rd16=and(Rs16,#1)
@@ -3983,6 +4090,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_A;
break;
case Hexagon::A2_tfr:
+ case Hexagon::dup_A2_tfr:
// Rd = Rs
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -3990,6 +4098,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_A;
break;
case Hexagon::A2_tfrsi:
+ case Hexagon::dup_A2_tfrsi:
// Rd = #u6
// Do not test for #u6 size since the const is getting extended
// regardless and a compound could be formed.
@@ -4002,6 +4111,10 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
case Hexagon::C2_cmovenewit:
case Hexagon::C2_cmoveif:
case Hexagon::C2_cmovenewif:
+ case Hexagon::dup_C2_cmoveit:
+ case Hexagon::dup_C2_cmovenewit:
+ case Hexagon::dup_C2_cmoveif:
+ case Hexagon::dup_C2_cmovenewif:
// if ([!]P0[.new]) Rd = #0
// Actual form:
// %r16 = C2_cmovenewit internal %p0, 0, implicit undef %r16;
@@ -4013,6 +4126,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_A;
break;
case Hexagon::C2_cmpeqi:
+ case Hexagon::dup_C2_cmpeqi:
// P0 = cmp.eq(Rs,#u2)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -4023,6 +4137,8 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
break;
case Hexagon::A2_combineii:
case Hexagon::A4_combineii:
+ case Hexagon::dup_A2_combineii:
+ case Hexagon::dup_A4_combineii:
// Rdd = combine(#u2,#U2)
DstReg = MI.getOperand(0).getReg();
if (isDblRegForSubInst(DstReg, HRI) &&
@@ -4035,6 +4151,8 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_A;
break;
case Hexagon::A4_combineri:
+ case Hexagon::dup_A4_combineri:
// Rdd = combine(Rs,#0)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -4044,6 +4162,7 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
return HexagonII::HSIG_A;
break;
case Hexagon::A4_combineir:
+ case Hexagon::dup_A4_combineir:
// Rdd = combine(#0,Rs)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(2).getReg();
@@ -4056,6 +4175,10 @@ HexagonII::SubInstructionGroup HexagonInstrInfo::getDuplexCandidateGroup(
case Hexagon::A2_sxth:
case Hexagon::A2_zxtb:
case Hexagon::A2_zxth:
+ case Hexagon::dup_A2_sxtb:
+ case Hexagon::dup_A2_sxth:
+ case Hexagon::dup_A2_zxtb:
+ case Hexagon::dup_A2_zxth:
// Rd = sxth/sxtb/zxtb/zxth(Rs)
DstReg = MI.getOperand(0).getReg();
SrcReg = MI.getOperand(1).getReg();
@@ -4199,6 +4322,61 @@ bool HexagonInstrInfo::isAddrModeWithOffset(const MachineInstr &MI) const {
addrMode == HexagonII::BaseLongOffset);
}
+bool HexagonInstrInfo::isPureSlot0(const MachineInstr &MI) const {
+ // Workaround for the Global Scheduler. Sometimes, it creates
+ // A4_ext as a Pseudo instruction and calls this function to see if
+ // it can be added to an existing bundle. Since the instruction doesn't
+ // belong to any BB yet, we can't use the getUnits API.
+ if (MI.getOpcode() == Hexagon::A4_ext)
+ return false;
+
+ unsigned FuncUnits = getUnits(MI);
+ return HexagonFUnits::isSlot0Only(FuncUnits);
+}
+
+bool HexagonInstrInfo::isRestrictNoSlot1Store(const MachineInstr &MI) const {
+ const uint64_t F = MI.getDesc().TSFlags;
+ return ((F >> HexagonII::RestrictNoSlot1StorePos) &
+ HexagonII::RestrictNoSlot1StoreMask);
+}
+
+void HexagonInstrInfo::changeDuplexOpcode(MachineBasicBlock::instr_iterator MII,
+ bool ToBigInstrs) const {
+ int Opcode = -1;
+ if (ToBigInstrs) { // To BigCore Instr.
+ // Check if the instruction can form a Duplex.
+ if (getDuplexCandidateGroup(*MII))
+ // Get the opcode carrying the "dup_*" tag.
+ Opcode = getDuplexOpcode(*MII, ToBigInstrs);
+ } else // To TinyCore Instr.
+ Opcode = getDuplexOpcode(*MII, ToBigInstrs);
+
+ // Change the opcode of the instruction.
+ if (Opcode >= 0)
+ MII->setDesc(get(Opcode));
+}
+
+// This function is used to translate instructions to facilitate generating
+// Duplexes on TinyCore.
+void HexagonInstrInfo::translateInstrsForDup(MachineFunction &MF,
+ bool ToBigInstrs) const {
+ for (auto &MB : MF)
+ for (MachineBasicBlock::instr_iterator Instr = MB.instr_begin(),
+ End = MB.instr_end();
+ Instr != End; ++Instr)
+ changeDuplexOpcode(Instr, ToBigInstrs);
+}
+
+// This is a specialized form of the function above.
+void HexagonInstrInfo::translateInstrsForDup(
+ MachineBasicBlock::instr_iterator MII, bool ToBigInstrs) const {
+ MachineBasicBlock *MBB = MII->getParent();
+ while ((MII != MBB->instr_end()) && MII->isInsideBundle()) {
+ changeDuplexOpcode(MII, ToBigInstrs);
+ ++MII;
+ }
+}
+
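A hypothetical round trip using the helpers above (sketch only; HII denotes a HexagonInstrInfo pointer):

    // Rewrite every duplex candidate to its big-core ("dup_*") opcode so that
    // slot/resource queries see the big-core encoding...
    HII->translateInstrsForDup(MF, /*ToBigInstrs=*/true);
    // ...then restore the tiny-core opcodes once that information is gathered.
    HII->translateInstrsForDup(MF, /*ToBigInstrs=*/false);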
unsigned HexagonInstrInfo::getMemAccessSize(const MachineInstr &MI) const {
using namespace HexagonII;
@@ -4328,7 +4506,7 @@ uint64_t HexagonInstrInfo::getType(const MachineInstr &MI) const {
return (F >> HexagonII::TypePos) & HexagonII::TypeMask;
}
-unsigned HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
+InstrStage::FuncUnits HexagonInstrInfo::getUnits(const MachineInstr &MI) const {
const InstrItineraryData &II = *Subtarget.getInstrItineraryData();
const InstrStage &IS = *II.beginStage(MI.getDesc().getSchedClass());
diff --git a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
index 676f6f0a2a8c..847b9a672891 100644
--- a/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonInstrInfo.h
@@ -109,19 +109,19 @@ public:
bool AllowModify) const override;
/// Remove the branching code at the end of the specific MBB.
- /// This is only invoked in cases where AnalyzeBranch returns success. It
+ /// This is only invoked in cases where analyzeBranch returns success. It
/// returns the number of instructions that were removed.
unsigned removeBranch(MachineBasicBlock &MBB,
int *BytesRemoved = nullptr) const override;
/// Insert branch code into the end of the specified MachineBasicBlock.
/// The operands to this method are the same as those
- /// returned by AnalyzeBranch. This is only invoked in cases where
- /// AnalyzeBranch returns success. It returns the number of instructions
+ /// returned by analyzeBranch. This is only invoked in cases where
+ /// analyzeBranch returns success. It returns the number of instructions
/// inserted.
///
/// It is also invoked by tail merging to add unconditional branches in
- /// cases where AnalyzeBranch doesn't apply because there was no original
+ /// cases where analyzeBranch doesn't apply because there was no original
/// branch to analyze. At least this much must be implemented, else tail
/// merging needs to be disabled.
unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
@@ -182,7 +182,7 @@ public:
/// is true, the register operand is the last use and must be marked kill.
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -191,7 +191,7 @@ public:
/// machine basic block before the specified machine instruction.
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -204,10 +204,11 @@ public:
bool expandPostRAPseudo(MachineInstr &MI) const override;
/// Get the base register and byte offset of a load/store instr.
- bool getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const override;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
/// Reverses the branch condition of the specified condition list,
/// returning false on success and true if it cannot be reversed.
@@ -268,8 +269,8 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask, int &Value) const override;
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &Mask, int &Value) const override;
/// Compute the instruction latency of a given instruction.
/// If the instruction has higher cost when predicated, it's returned via
@@ -341,10 +342,10 @@ public:
MachineBasicBlock *TargetBB,
SmallPtrSet<MachineBasicBlock *, 8> &Visited) const;
- bool isBaseImmOffset(const MachineInstr &MI) const;
bool isAbsoluteSet(const MachineInstr &MI) const;
bool isAccumulator(const MachineInstr &MI) const;
bool isAddrModeWithOffset(const MachineInstr &MI) const;
+ bool isBaseImmOffset(const MachineInstr &MI) const;
bool isComplex(const MachineInstr &MI) const;
bool isCompoundBranchInstr(const MachineInstr &MI) const;
bool isConstExtended(const MachineInstr &MI) const;
@@ -387,6 +388,8 @@ public:
bool isPredicated(unsigned Opcode) const;
bool isPredicateLate(unsigned Opcode) const;
bool isPredictedTaken(unsigned Opcode) const;
+ bool isPureSlot0(const MachineInstr &MI) const;
+ bool isRestrictNoSlot1Store(const MachineInstr &MI) const;
bool isSaveCalleeSavedRegsCall(const MachineInstr &MI) const;
bool isSignExtendingLoad(const MachineInstr &MI) const;
bool isSolo(const MachineInstr &MI) const;
@@ -435,6 +438,7 @@ public:
getCompoundCandidateGroup(const MachineInstr &MI) const;
unsigned getCompoundOpcode(const MachineInstr &GA,
const MachineInstr &GB) const;
+ int getDuplexOpcode(const MachineInstr &MI, bool ForBigCore = true) const;
int getCondOpcode(int Opc, bool sense) const;
int getDotCurOp(const MachineInstr &MI) const;
int getNonDotCurOp(const MachineInstr &MI) const;
@@ -461,7 +465,7 @@ public:
short getRegForm(const MachineInstr &MI) const;
unsigned getSize(const MachineInstr &MI) const;
uint64_t getType(const MachineInstr &MI) const;
- unsigned getUnits(const MachineInstr &MI) const;
+ InstrStage::FuncUnits getUnits(const MachineInstr &MI) const;
MachineBasicBlock::instr_iterator expandVGatherPseudo(MachineInstr &MI) const;
@@ -480,6 +484,17 @@ public:
void setBundleNoShuf(MachineBasicBlock::instr_iterator MIB) const;
bool getBundleNoShuf(const MachineInstr &MIB) const;
+
+ // When TinyCore with duplexes is enabled, these functions translate
+ // tiny instructions to big instructions and vice versa to compute the slot
+ // consumption.
+ void changeDuplexOpcode(MachineBasicBlock::instr_iterator MII,
+ bool ToBigInstrs) const;
+ void translateInstrsForDup(MachineFunction &MF,
+ bool ToBigInstrs = true) const;
+ void translateInstrsForDup(MachineBasicBlock::instr_iterator MII,
+ bool ToBigInstrs) const;
+
// Addressing mode relations.
short changeAddrMode_abs_io(short Opc) const;
short changeAddrMode_io_abs(short Opc) const;
diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsics.td b/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
index 8ae55b207188..10d0261a95dd 100644
--- a/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
+++ b/llvm/lib/Target/Hexagon/HexagonIntrinsics.td
@@ -205,12 +205,12 @@ def: T_stc_pat<S2_storerf_pci, int_hexagon_circ_sthhi, s4_1ImmPred_timm, I32>;
multiclass MaskedStore <InstHexagon MI, Intrinsic IntID> {
def : Pat<(IntID HvxQR:$src1, IntRegs:$src2, HvxVR:$src3),
- (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>,
+ (MI HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>,
Requires<[UseHVX]>;
def : Pat<(!cast<Intrinsic>(IntID#"_128B") HvxQR:$src1, IntRegs:$src2,
HvxVR:$src3),
- (MI HvxQR:$src1, IntRegs:$src2, #0, HvxVR:$src3)>,
+ (MI HvxQR:$src1, IntRegs:$src2, 0, HvxVR:$src3)>,
Requires<[UseHVX]>;
}
@@ -236,6 +236,8 @@ def: T_R_pat<Y2_dczeroa, int_hexagon_Y2_dczeroa>;
def: T_RR_pat<Y4_l2fetch, int_hexagon_Y4_l2fetch>;
def: T_RP_pat<Y5_l2fetch, int_hexagon_Y5_l2fetch>;
+def: Pat<(int_hexagon_Y2_dcfetch I32:$Rt), (Y2_dcfetchbo I32:$Rt, 0)>;
+
//
// Patterns for optimizing code generations for HVX.
@@ -277,76 +279,6 @@ def : Pat <(v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))),
Requires<[UseHVX]>;
}
-def : Pat <(v512i1 (bitconvert (v16i32 HvxVR:$src1))),
- (v512i1 (V6_vandvrt (v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v512i1 (bitconvert (v32i16 HvxVR:$src1))),
- (v512i1 (V6_vandvrt (v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v512i1 (bitconvert (v64i8 HvxVR:$src1))),
- (v512i1 (V6_vandvrt (v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v16i32 (bitconvert (v512i1 HvxQR:$src1))),
- (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v32i16 (bitconvert (v512i1 HvxQR:$src1))),
- (v32i16 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v64i8 (bitconvert (v512i1 HvxQR:$src1))),
- (v64i8 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v1024i1 (bitconvert (v32i32 HvxVR:$src1))),
- (v1024i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v1024i1 (bitconvert (v64i16 HvxVR:$src1))),
- (v1024i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v1024i1 (bitconvert (v128i8 HvxVR:$src1))),
- (v1024i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v32i32 (bitconvert (v1024i1 HvxQR:$src1))),
- (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v64i16 (bitconvert (v1024i1 HvxQR:$src1))),
- (v64i16 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v128i8 (bitconvert (v1024i1 HvxQR:$src1))),
- (v128i8 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-let AddedComplexity = 140 in {
-def : Pat <(store (v512i1 HvxQR:$src1), (i32 IntRegs:$addr)),
- (V6_vS32b_ai IntRegs:$addr, 0,
- (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101))))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v512i1 (load (i32 IntRegs:$addr))),
- (v512i1 (V6_vandvrt
- (v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-
-def : Pat <(store (v1024i1 HvxQR:$src1), (i32 IntRegs:$addr)),
- (V6_vS32b_ai IntRegs:$addr, 0,
- (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101))))>,
- Requires<[UseHVX]>;
-
-def : Pat <(v1024i1 (load (i32 IntRegs:$addr))),
- (v1024i1 (V6_vandvrt
- (v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>,
- Requires<[UseHVX]>;
-}
-
def: Pat<(v64i16 (trunc v64i32:$Vdd)),
(v64i16 (V6_vpackwh_sat
(v32i32 (V6_hi HvxWR:$Vdd)),
diff --git a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
index a60c80beb5d6..1245ee7974b5 100644
--- a/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
+++ b/llvm/lib/Target/Hexagon/HexagonIntrinsicsV60.td
@@ -25,59 +25,59 @@ def : Pat < (v32i32 (int_hexagon_V6_hi_128B (v64i32 HvxWR:$src1))),
(v32i32 (EXTRACT_SUBREG (v64i32 HvxWR:$src1), vsub_hi)) >;
}
-def : Pat <(v512i1 (bitconvert (v16i32 HvxVR:$src1))),
- (v512i1 (V6_vandvrt(v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v64i1 (bitconvert (v16i32 HvxVR:$src1))),
+ (v64i1 (V6_vandvrt(v16i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v512i1 (bitconvert (v32i16 HvxVR:$src1))),
- (v512i1 (V6_vandvrt(v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v64i1 (bitconvert (v32i16 HvxVR:$src1))),
+ (v64i1 (V6_vandvrt(v32i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v512i1 (bitconvert (v64i8 HvxVR:$src1))),
- (v512i1 (V6_vandvrt(v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v64i1 (bitconvert (v64i8 HvxVR:$src1))),
+ (v64i1 (V6_vandvrt(v64i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v16i32 (bitconvert (v512i1 HvxQR:$src1))),
- (v16i32 (V6_vandqrt(v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v16i32 (bitconvert (v64i1 HvxQR:$src1))),
+ (v16i32 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v32i16 (bitconvert (v512i1 HvxQR:$src1))),
- (v32i16 (V6_vandqrt(v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v32i16 (bitconvert (v64i1 HvxQR:$src1))),
+ (v32i16 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v64i8 (bitconvert (v512i1 HvxQR:$src1))),
- (v64i8 (V6_vandqrt(v512i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v64i8 (bitconvert (v64i1 HvxQR:$src1))),
+ (v64i8 (V6_vandqrt(v64i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v1024i1 (bitconvert (v32i32 HvxVR:$src1))),
- (v1024i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v128i1 (bitconvert (v32i32 HvxVR:$src1))),
+ (v128i1 (V6_vandvrt (v32i32 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v1024i1 (bitconvert (v64i16 HvxVR:$src1))),
- (v1024i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v128i1 (bitconvert (v64i16 HvxVR:$src1))),
+ (v128i1 (V6_vandvrt (v64i16 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v1024i1 (bitconvert (v128i8 HvxVR:$src1))),
- (v1024i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v128i1 (bitconvert (v128i8 HvxVR:$src1))),
+ (v128i1 (V6_vandvrt (v128i8 HvxVR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v32i32 (bitconvert (v1024i1 HvxQR:$src1))),
- (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v32i32 (bitconvert (v128i1 HvxQR:$src1))),
+ (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v64i16 (bitconvert (v1024i1 HvxQR:$src1))),
- (v64i16 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v64i16 (bitconvert (v128i1 HvxQR:$src1))),
+ (v64i16 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
-def : Pat <(v128i8 (bitconvert (v1024i1 HvxQR:$src1))),
- (v128i8 (V6_vandqrt (v1024i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
+def : Pat <(v128i8 (bitconvert (v128i1 HvxQR:$src1))),
+ (v128i8 (V6_vandqrt (v128i1 HvxQR:$src1), (A2_tfrsi 0x01010101)))>;
let AddedComplexity = 140 in {
-def : Pat <(store (v512i1 HvxQR:$src1), (i32 IntRegs:$addr)),
+def : Pat <(store (v64i1 HvxQR:$src1), (i32 IntRegs:$addr)),
(V6_vS32b_ai IntRegs:$addr, 0,
- (v16i32 (V6_vandqrt (v512i1 HvxQR:$src1),
+ (v16i32 (V6_vandqrt (v64i1 HvxQR:$src1),
(A2_tfrsi 0x01010101))))>;
-def : Pat <(v512i1 (load (i32 IntRegs:$addr))),
- (v512i1 (V6_vandvrt
+def : Pat <(v64i1 (load (i32 IntRegs:$addr))),
+ (v64i1 (V6_vandvrt
(v16i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>;
-def : Pat <(store (v1024i1 HvxQR:$src1), (i32 IntRegs:$addr)),
+def : Pat <(store (v128i1 HvxQR:$src1), (i32 IntRegs:$addr)),
(V6_vS32b_ai IntRegs:$addr, 0,
- (v32i32 (V6_vandqrt (v1024i1 HvxQR:$src1),
+ (v32i32 (V6_vandqrt (v128i1 HvxQR:$src1),
(A2_tfrsi 0x01010101))))>;
-def : Pat <(v1024i1 (load (i32 IntRegs:$addr))),
- (v1024i1 (V6_vandvrt
+def : Pat <(v128i1 (load (i32 IntRegs:$addr))),
+ (v128i1 (V6_vandvrt
(v32i32 (V6_vL32b_ai IntRegs:$addr, 0)), (A2_tfrsi 0x01010101)))>;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
index ffaf71e23690..2c1e0cadd9ee 100644
--- a/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonLoopIdiomRecognition.cpp
@@ -20,7 +20,6 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -57,6 +56,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <array>
#include <cassert>
diff --git a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
index d1a153920e5e..188d91355a35 100644
--- a/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonMCInstLower.cpp
@@ -51,7 +51,7 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
RelocationType = MCSymbolRefExpr::VK_None;
break;
case HexagonII::MO_PCREL:
- RelocationType = MCSymbolRefExpr::VK_Hexagon_PCREL;
+ RelocationType = MCSymbolRefExpr::VK_PCREL;
break;
case HexagonII::MO_GOT:
RelocationType = MCSymbolRefExpr::VK_GOT;
diff --git a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
index 2961e16cc9dc..89ef5c2a891d 100644
--- a/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonMachineFunctionInfo.h
@@ -30,6 +30,9 @@ class HexagonMachineFunctionInfo : public MachineFunctionInfo {
unsigned StackAlignBaseVReg = 0; // Aligned-stack base register (virtual)
unsigned StackAlignBasePhysReg = 0; // (physical)
int VarArgsFrameIndex;
+ int RegSavedAreaStartFrameIndex;
+ int FirstNamedArgFrameIndex;
+ int LastNamedArgFrameIndex;
bool HasClobberLR = false;
bool HasEHReturn = false;
std::map<const MachineInstr*, unsigned> PacketInfo;
@@ -46,6 +49,15 @@ public:
void setVarArgsFrameIndex(int v) { VarArgsFrameIndex = v; }
int getVarArgsFrameIndex() { return VarArgsFrameIndex; }
+ void setRegSavedAreaStartFrameIndex(int v) { RegSavedAreaStartFrameIndex = v;}
+ int getRegSavedAreaStartFrameIndex() { return RegSavedAreaStartFrameIndex; }
+
+ void setFirstNamedArgFrameIndex(int v) { FirstNamedArgFrameIndex = v; }
+ int getFirstNamedArgFrameIndex() { return FirstNamedArgFrameIndex; }
+
+ void setLastNamedArgFrameIndex(int v) { LastNamedArgFrameIndex = v; }
+ int getLastNamedArgFrameIndex() { return LastNamedArgFrameIndex; }
+
void setStartPacket(MachineInstr* MI) {
PacketInfo[MI] |= Hexagon::StartPacket;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
index e3579dfa9ba9..8dc1113194a8 100644
--- a/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonNewValueJump.cpp
@@ -291,7 +291,7 @@ static bool canCompareBeNewValueJump(const HexagonInstrInfo *QII,
// at machine code level, we don't need this, but if we decide
// to move new value jump prior to RA, we would be needing this.
MachineRegisterInfo &MRI = MF.getRegInfo();
- if (secondReg && !Register::isPhysicalRegister(cmpOp2)) {
+ if (!Register::isPhysicalRegister(cmpOp2)) {
MachineInstr *def = MRI.getVRegDef(cmpOp2);
if (def->getOpcode() == TargetOpcode::COPY)
return false;
diff --git a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
index 886034d9601a..c718e5f2d9fb 100644
--- a/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonOptAddrMode.cpp
@@ -12,9 +12,6 @@
#include "HexagonInstrInfo.h"
#include "HexagonSubtarget.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/StringRef.h"
@@ -27,6 +24,9 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/InitializePasses.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -561,6 +561,7 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.add(ImmOp);
MIB.add(OldMI->getOperand(3));
OpStart = 4;
+ Changed = true;
} else if (HII->getAddrMode(*OldMI) == HexagonII::BaseImmOffset) {
short NewOpCode = HII->changeAddrMode_io_abs(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
@@ -570,10 +571,8 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.addGlobalAddress(GV, Offset, ImmOp.getTargetFlags());
MIB.add(OldMI->getOperand(2));
OpStart = 3;
+ Changed = true;
}
- Changed = true;
- LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
- LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
} else if (ImmOpNum == 1 && OldMI->getOperand(2).getImm() == 0) {
short NewOpCode = HII->changeAddrMode_rr_io(*OldMI);
assert(NewOpCode >= 0 && "Invalid New opcode\n");
@@ -582,12 +581,14 @@ bool HexagonOptAddrMode::changeStore(MachineInstr *OldMI, MachineOperand ImmOp,
MIB.add(ImmOp);
OpStart = 3;
Changed = true;
+ }
+ if (Changed) {
LLVM_DEBUG(dbgs() << "[Changing]: " << *OldMI << "\n");
LLVM_DEBUG(dbgs() << "[TO]: " << *MIB << "\n");
- }
- if (Changed)
+
for (unsigned i = OpStart; i < OpEnd; ++i)
MIB.add(OldMI->getOperand(i));
+ }
return Changed;
}
diff --git a/llvm/lib/Target/Hexagon/HexagonPatterns.td b/llvm/lib/Target/Hexagon/HexagonPatterns.td
index cf711058823c..cc10627955fb 100644
--- a/llvm/lib/Target/Hexagon/HexagonPatterns.td
+++ b/llvm/lib/Target/Hexagon/HexagonPatterns.td
@@ -362,6 +362,16 @@ def Rol: pf2<rotl>;
// --(1) Immediate -------------------------------------------------------
//
+def Imm64Lo: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int32_t (N->getSExtValue()),
+ SDLoc(N), MVT::i32);
+}]>;
+def Imm64Hi: SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(int32_t (N->getSExtValue()>>32),
+ SDLoc(N), MVT::i32);
+}]>;
+
+
def SDTHexagonCONST32
: SDTypeProfile<1, 1, [SDTCisVT<0, i32>, SDTCisVT<1, i32>, SDTCisPtrTy<0>]>;
@@ -389,7 +399,10 @@ def: Pat<(HexagonCP tconstpool:$A), (A2_tfrsi imm:$A)>;
def: Pat<(i1 0), (PS_false)>;
def: Pat<(i1 1), (PS_true)>;
-def: Pat<(i64 imm:$v), (CONST64 imm:$v)>;
+def: Pat<(i64 imm:$v), (CONST64 imm:$v)>,
+ Requires<[UseSmallData,NotOptTinyCore]>;
+def: Pat<(i64 imm:$v),
+ (Combinew (A2_tfrsi (Imm64Hi $v)), (A2_tfrsi (Imm64Lo $v)))>;
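The Imm64Lo/Imm64Hi transforms above simply split the 64-bit constant into two 32-bit transfers; a standalone sketch of the arithmetic (illustration only):

    int64_t V = 0x1122334455667788LL;
    int32_t Lo = int32_t(V);        // 0x55667788, materialized by one A2_tfrsi
    int32_t Hi = int32_t(V >> 32);  // 0x11223344, materialized by the other
    // Combinew then forms the 64-bit register pair from Hi:Lo.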
def ftoi : SDNodeXForm<fpimm, [{
APInt I = N->getValueAPF().bitcastToAPInt();
@@ -923,6 +936,13 @@ let AddedComplexity = 100 in {
defm: MinMax_pats<F2_sfmax, F2_sfmin, select, setole, i1, F32>;
}
+let AddedComplexity = 100, Predicates = [HasV67] in {
+ defm: MinMax_pats<F2_dfmin, F2_dfmax, select, setogt, i1, F64>;
+ defm: MinMax_pats<F2_dfmin, F2_dfmax, select, setoge, i1, F64>;
+ defm: MinMax_pats<F2_dfmax, F2_dfmin, select, setolt, i1, F64>;
+ defm: MinMax_pats<F2_dfmax, F2_dfmin, select, setole, i1, F64>;
+}
+
defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setgt, v8i1, V8I8>;
defm: MinMax_pats<A2_vminb, A2_vmaxb, vselect, setge, v8i1, V8I8>;
defm: MinMax_pats<A2_vminh, A2_vmaxh, vselect, setgt, v4i1, V4I16>;
@@ -1075,7 +1095,7 @@ def Divu64_8: SDNodeXForm<imm, [{
// Special cases:
let AddedComplexity = 100 in {
def: Pat<(fshl I32:$Rs, I32:$Rt, (i32 16)),
- (A2_combine_hl I32:$Rs, I32:$Rt)>;
+ (A2_combine_lh I32:$Rs, I32:$Rt)>;
def: Pat<(fshl I64:$Rs, I64:$Rt, IsMul8_U3:$S),
(S2_valignib I64:$Rs, I64:$Rt, (Divu64_8 $S))>;
}
@@ -1109,7 +1129,7 @@ def FShr64r: OutPatFrag<(ops node:$Rs, node:$Rt, node:$Ru),
// Special cases:
let AddedComplexity = 100 in {
def: Pat<(fshr I32:$Rs, I32:$Rt, (i32 16)),
- (A2_combine_hl I32:$Rs, I32:$Rt)>;
+ (A2_combine_lh I32:$Rs, I32:$Rt)>;
def: Pat<(fshr I64:$Rs, I64:$Rt, IsMul8_U3:$S),
(S2_valignib I64:$Rs, I64:$Rt, (Divu8 $S))>;
}
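For reference, both funnel shifts special-cased above compute the same value when the shift amount is 16 on i32, which is why a single halfword-combine instruction covers them (standalone sketch, not from the patch):

    #include <cstdint>
    // fshl(a, b, 16) == fshr(a, b, 16) on i32: the low halfword of 'a' becomes
    // the high half of the result, the high halfword of 'b' the low half.
    uint32_t funnel16(uint32_t A, uint32_t B) {
      return (A << 16) | (B >> 16);
    }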
@@ -1231,7 +1251,7 @@ class OpshIRI_pat<InstHexagon MI, PatFrag Op, PatFrag ShOp,
: Pat<(Op anyimm:$u8, (ShOp RegPred:$Rs, ImmPred:$U5)),
(MI anyimm:$u8, RegPred:$Rs, imm:$U5)>;
-let AddedComplexity = 200 in {
+let AddedComplexity = 200, Predicates = [UseCompound] in {
def: OpshIRI_pat<S4_addi_asl_ri, Add, Su<Shl>, I32, u5_0ImmPred>;
def: OpshIRI_pat<S4_addi_lsr_ri, Add, Su<Srl>, I32, u5_0ImmPred>;
def: OpshIRI_pat<S4_subi_asl_ri, Sub, Su<Shl>, I32, u5_0ImmPred>;
@@ -1408,6 +1428,26 @@ let Predicates = [HasV66] in {
def: OpR_RR_pat<F2_dfsub, pf2<fsub>, f64, F64>;
}
+def DfMpy: OutPatFrag<(ops node:$Rs, node:$Rt),
+ (F2_dfmpyhh
+ (F2_dfmpylh
+ (F2_dfmpylh
+ (F2_dfmpyll $Rs, $Rt),
+ $Rs, $Rt),
+ $Rt, $Rs),
+ $Rs, $Rt)>;
+
+let Predicates = [HasV67,UseUnsafeMath], AddedComplexity = 50 in {
+ def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy $Rs, $Rt)>;
+}
+let Predicates = [HasV67] in {
+ def: OpR_RR_pat<F2_dfmin, pf2<fminnum>, f64, F64>;
+ def: OpR_RR_pat<F2_dfmax, pf2<fmaxnum>, f64, F64>;
+
+ def: Pat<(fmul F64:$Rs, F64:$Rt), (DfMpy (F2_dfmpyfix $Rs, $Rt),
+ (F2_dfmpyfix $Rt, $Rs))>;
+}
+
// In expressions like a0*b0 + a1*b1 + ..., prefer to generate multiply-add,
// over add-add with individual multiplies as inputs.
let AddedComplexity = 10 in {
@@ -1510,7 +1550,7 @@ let AddedComplexity = 110 in { // greater than S2_asl_r_r_and/or/xor.
// S4_addaddi and S4_subaddi don't have tied operands, so give them
// a bit of preference.
-let AddedComplexity = 30 in {
+let AddedComplexity = 30, Predicates = [UseCompound] in {
def: Pat<(add I32:$Rs, (Su<Add> I32:$Ru, anyimm:$s6)),
(S4_addaddi IntRegs:$Rs, IntRegs:$Ru, imm:$s6)>;
def: Pat<(add anyimm:$s6, (Su<Add> I32:$Rs, I32:$Ru)),
@@ -1523,8 +1563,10 @@ let AddedComplexity = 30 in {
(S4_subaddi IntRegs:$Rs, imm:$s6, IntRegs:$Ru)>;
}
+let Predicates = [UseCompound] in
def: Pat<(or I32:$Ru, (Su<And> I32:$Rx, anyimm:$s10)),
(S4_or_andix IntRegs:$Ru, IntRegs:$Rx, imm:$s10)>;
+
def: Pat<(or I32:$Rx, (Su<And> I32:$Rs, anyimm:$s10)),
(S4_or_andi IntRegs:$Rx, IntRegs:$Rs, imm:$s10)>;
def: Pat<(or I32:$Rx, (Su<Or> I32:$Rs, anyimm:$s10)),
@@ -1625,7 +1667,7 @@ def : Pat <(mulhs I64:$Rss, I64:$Rtt),
// will put the immediate addend into a register, while these instructions will
// use it directly. Such a construct does not appear in the middle of a gep,
// where M2_macsip would be preferable.
-let AddedComplexity = 20 in {
+let AddedComplexity = 20, Predicates = [UseCompound] in {
def: Pat<(add (Su<Mul> I32:$Rs, u6_0ImmPred:$U6), anyimm:$u6),
(M4_mpyri_addi imm:$u6, IntRegs:$Rs, imm:$U6)>;
def: Pat<(add (Su<Mul> I32:$Rs, I32:$Rt), anyimm:$u6),
@@ -1633,13 +1675,14 @@ let AddedComplexity = 20 in {
}
// Keep these instructions less preferable to M2_macsip/M2_macsin.
-def: Pat<(add I32:$Ru, (Su<Mul> I32:$Rs, u6_2ImmPred:$u6_2)),
- (M4_mpyri_addr_u2 IntRegs:$Ru, imm:$u6_2, IntRegs:$Rs)>;
-def: Pat<(add I32:$Ru, (Su<Mul> I32:$Rs, anyimm:$u6)),
- (M4_mpyri_addr IntRegs:$Ru, IntRegs:$Rs, imm:$u6)>;
-def: Pat<(add I32:$Ru, (Su<Mul> I32:$Ry, I32:$Rs)),
- (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
-
+let Predicates = [UseCompound] in {
+ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Rs, u6_2ImmPred:$u6_2)),
+ (M4_mpyri_addr_u2 IntRegs:$Ru, imm:$u6_2, IntRegs:$Rs)>;
+ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Rs, anyimm:$u6)),
+ (M4_mpyri_addr IntRegs:$Ru, IntRegs:$Rs, imm:$u6)>;
+ def: Pat<(add I32:$Ru, (Su<Mul> I32:$Ry, I32:$Rs)),
+ (M4_mpyrr_addr IntRegs:$Ru, IntRegs:$Ry, IntRegs:$Rs)>;
+}
def: Pat<(fma F32:$Rs, F32:$Rt, F32:$Rx),
(F2_sffma F32:$Rx, F32:$Rs, F32:$Rt)>;
@@ -1648,7 +1691,6 @@ def: Pat<(fma (fneg F32:$Rs), F32:$Rt, F32:$Rx),
def: Pat<(fma F32:$Rs, (fneg F32:$Rt), F32:$Rx),
(F2_sffms F32:$Rx, F32:$Rs, F32:$Rt)>;
-
def: Pat<(mul V2I32:$Rs, V2I32:$Rt),
(PS_vmulw V2I32:$Rs, V2I32:$Rt)>;
def: Pat<(add V2I32:$Rx, (mul V2I32:$Rs, V2I32:$Rt)),
diff --git a/llvm/lib/Target/Hexagon/HexagonPeephole.cpp b/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
index 0ccfe64ad1e5..d0b02f035d1e 100644
--- a/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonPeephole.cpp
@@ -45,7 +45,7 @@
#include "llvm/CodeGen/TargetInstrInfo.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/IR/Constants.h"
-#include "llvm/PassSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonPseudo.td b/llvm/lib/Target/Hexagon/HexagonPseudo.td
index d2b6d64e3c92..20c939577586 100644
--- a/llvm/lib/Target/Hexagon/HexagonPseudo.td
+++ b/llvm/lib/Target/Hexagon/HexagonPseudo.td
@@ -112,7 +112,7 @@ let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
opExtendable = 0, hasSideEffects = 0 in
class LOOP_iBase<string mnemonic, InstHexagon rootInst>
: InstHexagon <(outs), (ins b30_2Imm:$offset, u10_0Imm:$src2),
- #mnemonic#"($offset,#$src2)",
+ mnemonic#"($offset,#$src2)",
[], "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
bits<9> offset;
bits<10> src2;
@@ -132,7 +132,7 @@ let isExtendable = 1, isExtentSigned = 1, opExtentBits = 9, opExtentAlign = 2,
opExtendable = 0, hasSideEffects = 0 in
class LOOP_rBase<string mnemonic, InstHexagon rootInst>
: InstHexagon<(outs), (ins b30_2Imm:$offset, IntRegs:$src2),
- #mnemonic#"($offset,$src2)",
+ mnemonic#"($offset,$src2)",
[], "", rootInst.Itinerary, rootInst.Type>, OpcodeHexagon {
bits<9> offset;
bits<5> src2;
@@ -490,7 +490,7 @@ def TFRI64_V4 : InstHexagon<(outs DoubleRegs:$dst),
A2_combineii.Itinerary, TypeALU32_2op>, OpcodeHexagon;
// Hexagon doesn't have a vector multiply with C semantics.
-// Instead, generate a pseudo instruction that gets expaneded into two
+// Instead, generate a pseudo instruction that gets expanded into two
// scalar MPYI instructions.
// This is expanded by ExpandPostRAPseudos.
let isPseudo = 1 in
@@ -527,13 +527,15 @@ multiclass NewCircularLoad<RegisterClass RC, MemAccessSize MS> {
let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
+ // Use timing class of L2_loadrb_pci.
def NAME#_pci : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, IntRegs:$Cs),
- ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_e93a3d71>;
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_5ceb2f9e>;
+ // Use timing class of L2_loadrb_pcr.
def NAME#_pcr : LDInst<(outs RC:$Rd32, IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, IntRegs:$Cs),
- ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_44d3da28>;
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_075c8dd8>;
}
}
@@ -548,13 +550,15 @@ multiclass NewCircularStore<RegisterClass RC, MemAccessSize MS> {
let isCodeGenOnly = 1, isPseudo = 1, Defs = [CS], Uses = [CS],
addrMode = PostInc, accessSize = MS, hasSideEffects = 0 in {
+ // Use timing class of S2_storerb_pci.
def NAME#_pci : STInst<(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, s4_0Imm:$Ii, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
- ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_e86aa961>;
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_b4dc7630>;
+ // Use timing class of S2_storerb_pcr.
def NAME#_pcr : STInst<(outs IntRegs:$Rx32),
(ins IntRegs:$Rx32in, ModRegs:$Mu2, RC:$Rt32, IntRegs:$Cs),
- ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_da97ee82>;
+ ".error \"should not emit\" ", [], "$Rx32 = $Rx32in", tc_a2b365d2>;
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
index 517ad1c6ee7b..f26e23befde2 100644
--- a/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonRDFOpt.cpp
@@ -11,9 +11,6 @@
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "RDFCopy.h"
#include "RDFDeadCode.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SetVector.h"
@@ -24,6 +21,9 @@
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
index d55aeaf10852..52f247977094 100644
--- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.cpp
@@ -172,6 +172,13 @@ BitVector HexagonRegisterInfo::getReservedRegs(const MachineFunction &MF)
Reserved.set(Hexagon::C8);
Reserved.set(Hexagon::USR_OVF);
+ // Leveraging these registers will require more work to recognize
+ // the new semantics they introduce, Hi/LoVec patterns, etc.
+ // Note well: if enabled, they should be restricted to cases
+ // where `HST.useHVXOps() && HST.hasV67Ops()` is true.
+ for (auto Reg : Hexagon_MC::GetVectRegRev())
+ Reserved.set(Reg);
+
if (MF.getSubtarget<HexagonSubtarget>().hasReservedR19())
Reserved.set(Hexagon::R19);
@@ -196,7 +203,7 @@ void HexagonRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
auto &HII = *HST.getInstrInfo();
auto &HFI = *HST.getFrameLowering();
- unsigned BP = 0;
+ Register BP;
int FI = MI.getOperand(FIOp).getIndex();
// Select the base pointer (BP) and calculate the actual offset from BP
// to the beginning of the object at index FI.
diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
index fc166b5a3410..52d15da3bcb5 100644
--- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.h
@@ -56,10 +56,6 @@ public:
/// Returns true if the frame pointer is valid.
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
- return true;
- }
-
bool shouldCoalesce(MachineInstr *MI, const TargetRegisterClass *SrcRC,
unsigned SubReg, const TargetRegisterClass *DstRC, unsigned DstSubReg,
const TargetRegisterClass *NewRC, LiveIntervals &LIS) const override;
diff --git a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
index c23b837bb62f..49428db223a1 100644
--- a/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
+++ b/llvm/lib/Target/Hexagon/HexagonRegisterInfo.td
@@ -18,6 +18,12 @@ let Namespace = "Hexagon" in {
let HWEncoding{4-0} = num;
}
+ // These registers are used to preserve a distinction between
+ // vector register pairs of differing order.
+ class HexagonFakeReg<string n> : Register<n> {
+ let isArtificial = 1;
+ }
+
class HexagonDoubleReg<bits<5> num, string n, list<Register> subregs,
list<string> alt = []> :
RegisterWithSubRegs<n, subregs> {
@@ -30,6 +36,13 @@ let Namespace = "Hexagon" in {
class Ri<bits<5> num, string n, list<string> alt = []> :
HexagonReg<num, n, alt>;
+ // Rp - false/pseudo registers. These registers are used
+ // to provide a distinct set of aliases for both styles of vector
+ // register pairs without encountering subregister indexing constraints.
+ class R_fake<string n> :
+ HexagonFakeReg<n>;
+
+
// Rf - 32-bit floating-point registers.
class Rf<bits<5> num, string n> : HexagonReg<num, n>;
@@ -81,6 +94,7 @@ let Namespace = "Hexagon" in {
def isub_hi : SubRegIndex<32, 32>;
def vsub_lo : SubRegIndex<512>;
def vsub_hi : SubRegIndex<512, 512>;
+ def vsub_fake: SubRegIndex<512>;
def wsub_lo : SubRegIndex<1024>;
def wsub_hi : SubRegIndex<1024, 1024>;
def subreg_overflow : SubRegIndex<1, 0>;
@@ -183,27 +197,49 @@ let Namespace = "Hexagon" in {
foreach i = 0-31 in {
def V#i : Ri<i, "v"#i>, DwarfRegNum<[!add(i, 99)]>;
+ def VF#i : R_fake<"__"#!add(i,999999)>, DwarfRegNum<[!add(i, 999999)]>;
+ def VFR#i : R_fake<"__"#!add(i,9999999)>, DwarfRegNum<[!add(i, 9999999)]>;
}
def VTMP : Ri<0, "vtmp">, DwarfRegNum<[131]>;
// Aliases of the V* registers used to hold double vec values.
- let SubRegIndices = [vsub_lo, vsub_hi], CoveredBySubRegs = 1 in {
- def W0 : Rd< 0, "v1:0", [V0, V1]>, DwarfRegNum<[99]>;
- def W1 : Rd< 2, "v3:2", [V2, V3]>, DwarfRegNum<[101]>;
- def W2 : Rd< 4, "v5:4", [V4, V5]>, DwarfRegNum<[103]>;
- def W3 : Rd< 6, "v7:6", [V6, V7]>, DwarfRegNum<[105]>;
- def W4 : Rd< 8, "v9:8", [V8, V9]>, DwarfRegNum<[107]>;
- def W5 : Rd<10, "v11:10", [V10, V11]>, DwarfRegNum<[109]>;
- def W6 : Rd<12, "v13:12", [V12, V13]>, DwarfRegNum<[111]>;
- def W7 : Rd<14, "v15:14", [V14, V15]>, DwarfRegNum<[113]>;
- def W8 : Rd<16, "v17:16", [V16, V17]>, DwarfRegNum<[115]>;
- def W9 : Rd<18, "v19:18", [V18, V19]>, DwarfRegNum<[117]>;
- def W10 : Rd<20, "v21:20", [V20, V21]>, DwarfRegNum<[119]>;
- def W11 : Rd<22, "v23:22", [V22, V23]>, DwarfRegNum<[121]>;
- def W12 : Rd<24, "v25:24", [V24, V25]>, DwarfRegNum<[123]>;
- def W13 : Rd<26, "v27:26", [V26, V27]>, DwarfRegNum<[125]>;
- def W14 : Rd<28, "v29:28", [V28, V29]>, DwarfRegNum<[127]>;
- def W15 : Rd<30, "v31:30", [V30, V31]>, DwarfRegNum<[129]>;
+ let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in {
+ def W0 : Rd< 0, "v1:0", [V0, V1, VF0]>, DwarfRegNum<[99]>;
+ def W1 : Rd< 2, "v3:2", [V2, V3, VF1]>, DwarfRegNum<[101]>;
+ def W2 : Rd< 4, "v5:4", [V4, V5, VF2]>, DwarfRegNum<[103]>;
+ def W3 : Rd< 6, "v7:6", [V6, V7, VF3]>, DwarfRegNum<[105]>;
+ def W4 : Rd< 8, "v9:8", [V8, V9, VF4]>, DwarfRegNum<[107]>;
+ def W5 : Rd<10, "v11:10", [V10, V11, VF5]>, DwarfRegNum<[109]>;
+ def W6 : Rd<12, "v13:12", [V12, V13, VF6]>, DwarfRegNum<[111]>;
+ def W7 : Rd<14, "v15:14", [V14, V15, VF7]>, DwarfRegNum<[113]>;
+ def W8 : Rd<16, "v17:16", [V16, V17, VF8]>, DwarfRegNum<[115]>;
+ def W9 : Rd<18, "v19:18", [V18, V19, VF9]>, DwarfRegNum<[117]>;
+ def W10 : Rd<20, "v21:20", [V20, V21, VF10]>, DwarfRegNum<[119]>;
+ def W11 : Rd<22, "v23:22", [V22, V23, VF11]>, DwarfRegNum<[121]>;
+ def W12 : Rd<24, "v25:24", [V24, V25, VF12]>, DwarfRegNum<[123]>;
+ def W13 : Rd<26, "v27:26", [V26, V27, VF13]>, DwarfRegNum<[125]>;
+ def W14 : Rd<28, "v29:28", [V28, V29, VF14]>, DwarfRegNum<[127]>;
+ def W15 : Rd<30, "v31:30", [V30, V31, VF15]>, DwarfRegNum<[129]>;
+ }
+
+ // Reverse Aliases of the V* registers used to hold double vec values.
+ let SubRegIndices = [vsub_lo, vsub_hi, vsub_fake], CoveredBySubRegs = 1 in {
+ def WR0 : Rd< 1, "v0:1", [V0, V1, VFR0]>, DwarfRegNum<[161]>;
+ def WR1 : Rd< 3, "v2:3", [V2, V3, VFR1]>, DwarfRegNum<[162]>;
+ def WR2 : Rd< 5, "v4:5", [V4, V5, VFR2]>, DwarfRegNum<[163]>;
+ def WR3 : Rd< 7, "v6:7", [V6, V7, VFR3]>, DwarfRegNum<[164]>;
+ def WR4 : Rd< 9, "v8:9", [V8, V9, VFR4]>, DwarfRegNum<[165]>;
+ def WR5 : Rd<11, "v10:11", [V10, V11, VFR5]>, DwarfRegNum<[166]>;
+ def WR6 : Rd<13, "v12:13", [V12, V13, VFR6]>, DwarfRegNum<[167]>;
+ def WR7 : Rd<15, "v14:15", [V14, V15, VFR7]>, DwarfRegNum<[168]>;
+ def WR8 : Rd<17, "v16:17", [V16, V17, VFR8]>, DwarfRegNum<[169]>;
+ def WR9 : Rd<19, "v18:19", [V18, V19, VFR9]>, DwarfRegNum<[170]>;
+ def WR10: Rd<21, "v20:21", [V20, V21, VFR10]>, DwarfRegNum<[171]>;
+ def WR11: Rd<23, "v22:23", [V22, V23, VFR11]>, DwarfRegNum<[172]>;
+ def WR12: Rd<25, "v24:25", [V24, V25, VFR12]>, DwarfRegNum<[173]>;
+ def WR13: Rd<27, "v26:27", [V26, V27, VFR13]>, DwarfRegNum<[174]>;
+ def WR14: Rd<29, "v28:29", [V28, V29, VFR14]>, DwarfRegNum<[175]>;
+ def WR15: Rd<31, "v30:31", [V30, V31, VFR15]>, DwarfRegNum<[176]>;
}
// Aliases of the V* registers used to hold quad vec values.
@@ -283,7 +319,7 @@ let Namespace = "Hexagon" in {
// HVX types
def VecI1: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
- [v512i1, v1024i1, v512i1]>;
+ [v64i1, v128i1, v64i1]>;
def VecI8: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
[v64i8, v128i8, v64i8]>;
def VecI16: ValueTypeByHwMode<[Hvx64, Hvx128, DefaultMode],
@@ -314,15 +350,15 @@ def HvxVR : RegisterClass<"Hexagon", [VecI8, VecI16, VecI32], 512,
}
def HvxWR : RegisterClass<"Hexagon", [VecPI8, VecPI16, VecPI32], 1024,
- (add (sequence "W%u", 0, 15))> {
+ (add (sequence "W%u", 0, 15), (sequence "WR%u", 0, 15))> {
let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
[RegInfo<1024,1024,1024>, RegInfo<2048,2048,2048>, RegInfo<1024,1024,1024>]>;
}
-def HvxQR : RegisterClass<"Hexagon", [VecI1, VecQ8, VecQ16, VecQ32], 512,
+def HvxQR : RegisterClass<"Hexagon", [VecI1, VecQ8, VecQ16, VecQ32], 128,
(add Q0, Q1, Q2, Q3)> {
let RegInfos = RegInfoByHwMode<[Hvx64, Hvx128, DefaultMode],
- [RegInfo<512,512,512>, RegInfo<1024,1024,1024>, RegInfo<512,512,512>]>;
+ [RegInfo<64,512,512>, RegInfo<128,1024,1024>, RegInfo<64,512,512>]>;
}
def HvxVQR : RegisterClass<"Hexagon", [untyped], 2048,
@@ -365,6 +401,10 @@ def CtrRegs : RegisterClass<"Hexagon", [i32], 32,
FRAMELIMIT, FRAMEKEY, PKTCOUNTLO, PKTCOUNTHI, UTIMERLO, UTIMERHI,
M0, M1, USR)>;
+let Size = 64 in
+def VectRegRev : RegisterClass<"Hexagon", [i64], 64,
+ (add (sequence "WR%u", 0, 15))>;
+
let isAllocatable = 0 in
def UsrBits : RegisterClass<"Hexagon", [i1], 0, (add USR_OVF)>;
diff --git a/llvm/lib/Target/Hexagon/HexagonSchedule.td b/llvm/lib/Target/Hexagon/HexagonSchedule.td
index 0834e9000460..5efd02ada54c 100644
--- a/llvm/lib/Target/Hexagon/HexagonSchedule.td
+++ b/llvm/lib/Target/Hexagon/HexagonSchedule.td
@@ -56,37 +56,15 @@ def tc_ENDLOOP : InstrItinClass;
include "HexagonDepIICScalar.td"
include "HexagonDepIICHVX.td"
-//===----------------------------------------------------------------------===//
-// V5 Machine Info +
-//===----------------------------------------------------------------------===//
-
include "HexagonScheduleV5.td"
-
-// V55 Machine Info +
include "HexagonScheduleV55.td"
-//===----------------------------------------------------------------------===//
-// V60 Machine Info -
-//===----------------------------------------------------------------------===//
-
include "HexagonIICScalar.td"
include "HexagonIICHVX.td"
include "HexagonScheduleV60.td"
-//===----------------------------------------------------------------------===//
-// V62 Machine Info +
-//===----------------------------------------------------------------------===//
-
include "HexagonScheduleV62.td"
-
-//===----------------------------------------------------------------------===//
-// V65 Machine Info +
-//===----------------------------------------------------------------------===//
-
include "HexagonScheduleV65.td"
-
-//===----------------------------------------------------------------------===//
-// V66 Machine Info +
-//===----------------------------------------------------------------------===//
-
include "HexagonScheduleV66.td"
+include "HexagonScheduleV67.td"
+include "HexagonScheduleV67T.td"
diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV67.td b/llvm/lib/Target/Hexagon/HexagonScheduleV67.td
new file mode 100644
index 000000000000..4f9d861a5504
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonScheduleV67.td
@@ -0,0 +1,39 @@
+//=-HexagonScheduleV67.td - HexagonV67 Scheduling Definitions *- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+//
+// ScalarItin and HVXItin contain some old itineraries
+// still used by a handful of instructions. Hopefully, we will be able
+// to get rid of them soon.
+
+def HexagonV67ItinList : DepScalarItinV66, ScalarItin,
+ DepHVXItinV66, HVXItin, PseudoItin {
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV66_list, ScalarItin_list,
+ DepHVXItinV66_list, HVXItin_list, PseudoItin_list);
+}
+
+def HexagonItinerariesV67 :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM, CVI_ZW],
+ [Hex_FWD, HVX_FWD],
+ HexagonV67ItinList.ItinList>;
+
+def HexagonModelV67 : SchedMachineModel {
+ // Max issue per cycle == bundle width.
+ let IssueWidth = 4;
+ let Itineraries = HexagonItinerariesV67;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V67 Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Hexagon/HexagonScheduleV67T.td b/llvm/lib/Target/Hexagon/HexagonScheduleV67T.td
new file mode 100644
index 000000000000..f2bcb1e7256c
--- /dev/null
+++ b/llvm/lib/Target/Hexagon/HexagonScheduleV67T.td
@@ -0,0 +1,61 @@
+//=- HexagonScheduleV67T.td - Hexagon V67 Tiny Core Scheduling Definitions --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+class HexagonV67TPseudoItin {
+ list<InstrItinData> V67TPseudoItin_list = [
+ InstrItinData<PSEUDO, [InstrStage<1, [SLOT0, SLOT2, SLOT3]>], [2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData<PSEUDOM, [InstrStage<1, [SLOT2, SLOT3], 0>,
+ InstrStage<1, [SLOT2, SLOT3]>],
+ [2, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData<DUPLEX, [InstrStage<1, [SLOT0]>],
+ [2, 1, 1]>,
+ InstrItinData<tc_ENDLOOP, [InstrStage<1, [SLOT_ENDLOOP]>], [2]>
+ ];
+}
+
+// V67TItin_list and HVXItin contain some old itineraries
+// still used by a handful of instructions. Hopefully, we will be able to
+// get rid of them soon.
+def HexagonV67TItinList : DepScalarItinV67T,
+ DepHVXItinV67, HVXItin, HexagonV67TPseudoItin {
+ list<InstrItinData> V67TItin_list = [
+ InstrItinData<LD_tc_ld_SLOT01, [InstrStage<1, [SLOT0]>],
+ [3, 1, 1],
+ [Hex_FWD, Hex_FWD, Hex_FWD]>,
+ InstrItinData<ST_tc_st_SLOT01, [InstrStage<1, [SLOT0]>],
+ [1, 1, 3, 3],
+ [Hex_FWD, Hex_FWD]>
+ ];
+
+ list<InstrItinData> ItinList =
+ !listconcat(DepScalarItinV67T_list,
+ DepHVXItinV67_list, V67TItin_list,
+ HVXItin_list, V67TPseudoItin_list);
+}
+
+def HexagonItinerariesV67T :
+ ProcessorItineraries<[SLOT0, SLOT1, SLOT2, SLOT3, SLOT_ENDLOOP,
+ CVI_ST, CVI_XLANE, CVI_SHIFT, CVI_MPY0, CVI_MPY1,
+ CVI_LD, CVI_XLSHF, CVI_MPY01, CVI_ALL,
+ CVI_ALL_NOMEM, CVI_ZW],
+ [Hex_FWD, HVX_FWD],
+ HexagonV67TItinList.ItinList>;
+
+
+def HexagonModelV67T : SchedMachineModel {
+ let IssueWidth = 3;
+ let Itineraries = HexagonItinerariesV67T;
+ let LoadLatency = 1;
+ let CompleteModel = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Hexagon V67 Tiny Core Resource Definitions -
+//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
index c5ba7ced4c30..1b724e8fcae9 100644
--- a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.cpp
@@ -18,10 +18,10 @@ using namespace llvm;
SDValue HexagonSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
- if (AlwaysInline || (Align & 0x3) != 0 || !ConstantSize)
+ if (AlwaysInline || Alignment < Align(4) || !ConstantSize)
return SDValue();
uint64_t SizeVal = ConstantSize->getZExtValue();
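The new alignment test is the Align-typed equivalent of the old bitmask check; assuming Alignment == Align(AlignValue) for a power-of-two AlignValue, the two predicates agree (sketch for illustration, not part of the patch):

    bool RejectOld = (AlignValue & 0x3) != 0;  // old unsigned-based check
    bool RejectNew = Alignment < Align(4);     // new Align-based check
    // Both reject exactly the 1- and 2-byte cases; 4 bytes and above pass.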
diff --git a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
index af8b8318b059..0d3b1725d1bc 100644
--- a/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonSelectionDAGInfo.h
@@ -23,8 +23,8 @@ public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
};
diff --git a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
index d80e0ed50c93..b45d871e04d6 100644
--- a/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSplitDouble.cpp
@@ -504,7 +504,7 @@ void HexagonSplitDoubleRegs::collectIndRegsForLoop(const MachineLoop *L,
// Get the registers on which the loop controlling compare instruction
// depends.
- unsigned CmpR1 = 0, CmpR2 = 0;
+ Register CmpR1, CmpR2;
const MachineInstr *CmpI = MRI->getVRegDef(PR);
while (CmpI->getOpcode() == Hexagon::C2_not)
CmpI = MRI->getVRegDef(CmpI->getOperand(1).getReg());
@@ -688,11 +688,12 @@ void HexagonSplitDoubleRegs::splitMemRef(MachineInstr *MI,
for (auto &MO : MI->memoperands()) {
const MachinePointerInfo &Ptr = MO->getPointerInfo();
MachineMemOperand::Flags F = MO->getFlags();
- int A = MO->getAlignment();
+ Align A = MO->getAlign();
- auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, A);
+ auto *Tmp1 = MF.getMachineMemOperand(Ptr, F, 4 /*size*/, A);
LowI->addMemOperand(MF, Tmp1);
- auto *Tmp2 = MF.getMachineMemOperand(Ptr, F, 4/*size*/, std::min(A, 4));
+ auto *Tmp2 =
+ MF.getMachineMemOperand(Ptr, F, 4 /*size*/, std::min(A, Align(4)));
HighI->addMemOperand(MF, Tmp2);
}
}
diff --git a/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp b/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
index aab37393ed36..2c4007145bd0 100644
--- a/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonStoreWidening.cpp
@@ -314,7 +314,7 @@ bool HexagonStoreWidening::selectStores(InstrGroup::iterator Begin,
MachineInstr *FirstMI = *Begin;
assert(!FirstMI->memoperands_empty() && "Expecting some memory operands");
const MachineMemOperand &FirstMMO = getStoreTarget(FirstMI);
- unsigned Alignment = FirstMMO.getAlignment();
+ unsigned Alignment = FirstMMO.getAlign().value();
unsigned SizeAccum = FirstMMO.getSize();
unsigned FirstOffset = getStoreOffset(FirstMI);
@@ -417,9 +417,8 @@ bool HexagonStoreWidening::createWideStores(InstrGroup &OG, InstrGroup &NG,
DebugLoc DL = OG.back()->getDebugLoc();
const MachineMemOperand &OldM = getStoreTarget(FirstSt);
MachineMemOperand *NewM =
- MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
- TotalSize, OldM.getAlignment(),
- OldM.getAAInfo());
+ MF->getMachineMemOperand(OldM.getPointerInfo(), OldM.getFlags(),
+ TotalSize, OldM.getAlign(), OldM.getAAInfo());
if (Acc < 0x10000) {
// Create mem[hw] = #Acc
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
index 6c706fea096b..2b7e1bcba9a3 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.cpp
@@ -75,14 +75,14 @@ static cl::opt<bool> EnableCheckBankConflict("hexagon-check-bank-conflict",
cl::Hidden, cl::ZeroOrMore, cl::init(true),
cl::desc("Enable checking for cache bank conflicts"));
-
HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
StringRef FS, const TargetMachine &TM)
: HexagonGenSubtargetInfo(TT, CPU, FS), OptLevel(TM.getOptLevel()),
- CPUString(Hexagon_MC::selectHexagonCPU(CPU)),
- InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ CPUString(std::string(Hexagon_MC::selectHexagonCPU(CPU))),
+ TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
RegInfo(getHwMode()), TLInfo(TM, *this),
InstrItins(getInstrItineraryForCPU(CPUString)) {
+ Hexagon_MC::addArchSubtarget(this, FS);
// Beware of the default constructor of InstrItineraryData: it will
// reset all members to 0.
assert(InstrItins.Itineraries != nullptr && "InstrItins not initialized");
@@ -90,24 +90,16 @@ HexagonSubtarget::HexagonSubtarget(const Triple &TT, StringRef CPU,
HexagonSubtarget &
HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
- static std::map<StringRef, Hexagon::ArchEnum> CpuTable{
- {"generic", Hexagon::ArchEnum::V60},
- {"hexagonv5", Hexagon::ArchEnum::V5},
- {"hexagonv55", Hexagon::ArchEnum::V55},
- {"hexagonv60", Hexagon::ArchEnum::V60},
- {"hexagonv62", Hexagon::ArchEnum::V62},
- {"hexagonv65", Hexagon::ArchEnum::V65},
- {"hexagonv66", Hexagon::ArchEnum::V66},
- };
-
- auto FoundIt = CpuTable.find(CPUString);
- if (FoundIt != CpuTable.end())
- HexagonArchVersion = FoundIt->second;
+ Optional<Hexagon::ArchEnum> ArchVer =
+ Hexagon::GetCpu(Hexagon::CpuTable, CPUString);
+ if (ArchVer)
+ HexagonArchVersion = *ArchVer;
else
llvm_unreachable("Unrecognized Hexagon processor version");
UseHVX128BOps = false;
UseHVX64BOps = false;
+ UseAudioOps = false;
UseLongCalls = false;
UseBSBScheduling = hasV60Ops() && EnableBSBSched;
@@ -117,6 +109,13 @@ HexagonSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
if (OverrideLongCalls.getPosition())
UseLongCalls = OverrideLongCalls;
+ if (isTinyCore()) {
+ // Tiny core has a single thread, so back-to-back scheduling is enabled by
+ // default.
+ if (!EnableBSBSched.getPosition())
+ UseBSBScheduling = false;
+ }
+
FeatureBitset Features = getFeatureBits();
if (HexagonDisableDuplex)
setFeatureBits(Features.reset(Hexagon::FeatureDuplex));
@@ -316,13 +315,14 @@ bool HexagonSubtarget::useAA() const {
/// Perform target specific adjustments to the latency of a schedule
/// dependency.
-void HexagonSubtarget::adjustSchedDependency(SUnit *Src, SUnit *Dst,
+void HexagonSubtarget::adjustSchedDependency(SUnit *Src, int SrcOpIdx,
+ SUnit *Dst, int DstOpIdx,
SDep &Dep) const {
- MachineInstr *SrcInst = Src->getInstr();
- MachineInstr *DstInst = Dst->getInstr();
if (!Src->isInstr() || !Dst->isInstr())
return;
+ MachineInstr *SrcInst = Src->getInstr();
+ MachineInstr *DstInst = Dst->getInstr();
const HexagonInstrInfo *QII = getInstrInfo();
// Instructions with .new operands have zero latency.
@@ -424,8 +424,17 @@ void HexagonSubtarget::restoreLatency(SUnit *Src, SUnit *Dst) const {
int DefIdx = -1;
for (unsigned OpNum = 0; OpNum < SrcI->getNumOperands(); OpNum++) {
const MachineOperand &MO = SrcI->getOperand(OpNum);
- if (MO.isReg() && MO.isDef() && MO.getReg() == DepR)
- DefIdx = OpNum;
+ bool IsSameOrSubReg = false;
+ if (MO.isReg()) {
+ unsigned MOReg = MO.getReg();
+ if (Register::isVirtualRegister(DepR)) {
+ IsSameOrSubReg = (MOReg == DepR);
+ } else {
+ IsSameOrSubReg = getRegisterInfo()->isSubRegisterEq(DepR, MOReg);
+ }
+ if (MO.isDef() && IsSameOrSubReg)
+ DefIdx = OpNum;
+ }
}
assert(DefIdx >= 0 && "Def Reg not found in Src MI");
MachineInstr *DstI = Dst->getInstr();
diff --git a/llvm/lib/Target/Hexagon/HexagonSubtarget.h b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
index 31157a0065d9..de4f245519e4 100644
--- a/llvm/lib/Target/Hexagon/HexagonSubtarget.h
+++ b/llvm/lib/Target/Hexagon/HexagonSubtarget.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
#define LLVM_LIB_TARGET_HEXAGON_HEXAGONSUBTARGET_H
-#include "HexagonDepArch.h"
+#include "HexagonArch.h"
#include "HexagonFrameLowering.h"
#include "HexagonISelLowering.h"
#include "HexagonInstrInfo.h"
@@ -45,14 +45,18 @@ class HexagonSubtarget : public HexagonGenSubtargetInfo {
bool UseHVX64BOps = false;
bool UseHVX128BOps = false;
+ bool UseAudioOps = false;
+ bool UseCompound = false;
bool UseLongCalls = false;
bool UseMemops = false;
bool UsePackets = false;
bool UseNewValueJumps = false;
bool UseNewValueStores = false;
bool UseSmallData = false;
+ bool UseUnsafeMath = false;
bool UseZRegOps = false;
+ bool HasPreV65 = false;
bool HasMemNoShuf = false;
bool EnableDuplex = false;
bool ReservedR19 = false;
@@ -83,7 +87,14 @@ public:
};
private:
+ enum HexagonProcFamilyEnum { Others, TinyCore };
+
std::string CPUString;
+ Triple TargetTriple;
+
+ // The following objects can use the TargetTriple, so they must be
+ // declared after it.
+ HexagonProcFamilyEnum HexagonProcFamily = Others;
HexagonInstrInfo InstrInfo;
HexagonRegisterInfo RegInfo;
HexagonTargetLowering TLInfo;
@@ -95,6 +106,11 @@ public:
HexagonSubtarget(const Triple &TT, StringRef CPU, StringRef FS,
const TargetMachine &TM);
+ const Triple &getTargetTriple() const { return TargetTriple; }
+ bool isEnvironmentMusl() const {
+ return TargetTriple.getEnvironment() == Triple::Musl;
+ }
+
/// getInstrItins - Return the instruction itineraries based on subtarget
/// selection.
const InstrItineraryData *getInstrItineraryData() const override {
@@ -157,18 +173,45 @@ public:
bool hasV66OpsOnly() const {
return getHexagonArchVersion() == Hexagon::ArchEnum::V66;
}
+ bool hasV67Ops() const {
+ return getHexagonArchVersion() >= Hexagon::ArchEnum::V67;
+ }
+ bool hasV67OpsOnly() const {
+ return getHexagonArchVersion() == Hexagon::ArchEnum::V67;
+ }
+ bool useAudioOps() const { return UseAudioOps; }
+ bool useCompound() const { return UseCompound; }
bool useLongCalls() const { return UseLongCalls; }
bool useMemops() const { return UseMemops; }
bool usePackets() const { return UsePackets; }
bool useNewValueJumps() const { return UseNewValueJumps; }
bool useNewValueStores() const { return UseNewValueStores; }
bool useSmallData() const { return UseSmallData; }
+ bool useUnsafeMath() const { return UseUnsafeMath; }
bool useZRegOps() const { return UseZRegOps; }
+ bool isTinyCore() const { return HexagonProcFamily == TinyCore; }
+ bool isTinyCoreWithDuplex() const { return isTinyCore() && EnableDuplex; }
+
bool useHVXOps() const {
return HexagonHVXVersion > Hexagon::ArchEnum::NoArch;
}
+ bool useHVXV60Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V60;
+ }
+ bool useHVXV62Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V62;
+ }
+ bool useHVXV65Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V65;
+ }
+ bool useHVXV66Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V66;
+ }
+ bool useHVXV67Ops() const {
+ return HexagonHVXVersion >= Hexagon::ArchEnum::V67;
+ }
bool useHVX128BOps() const { return useHVXOps() && UseHVX128BOps; }
bool useHVX64BOps() const { return useHVXOps() && UseHVX64BOps; }
@@ -186,7 +229,11 @@ public:
// compiler time and will be removed eventually anyway.
bool enableMachineSchedDefaultSched() const override { return false; }
+ // For use with PostRAScheduling: get the anti-dependence breaking that should
+ // be performed before post-RA scheduling.
AntiDepBreakMode getAntiDepBreakMode() const override { return ANTIDEP_ALL; }
+ /// True if the subtarget should run a scheduler after register
+ /// allocation.
bool enablePostRAScheduler() const override { return true; }
bool enableSubRegLiveness() const override;
@@ -211,7 +258,8 @@ public:
/// Perform target specific adjustments to the latency of a schedule
/// dependency.
- void adjustSchedDependency(SUnit *def, SUnit *use, SDep& dep) const override;
+ void adjustSchedDependency(SUnit *Def, int DefOpIdx, SUnit *Use, int UseOpIdx,
+ SDep &Dep) const override;
unsigned getVectorLength() const {
assert(useHVXOps());
@@ -239,9 +287,6 @@ public:
ArrayRef<MVT> ElemTypes = getHVXElementTypes();
if (IncludeBool && ElemTy == MVT::i1) {
- // Special case for the v512i1, etc.
- if (8*HwLen == NumElems)
- return true;
// Boolean HVX vector types are formed from regular HVX vector types
// by replacing the element type with i1.
for (MVT T : ElemTypes)
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
index 9e9ce209a825..3fe42ea13f51 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetMachine.cpp
@@ -237,6 +237,14 @@ HexagonTargetMachine::getSubtargetImpl(const Function &F) const {
std::string FS = !FSAttr.hasAttribute(Attribute::None)
? FSAttr.getValueAsString().str()
: TargetFS;
+ // Append the preexisting target features last, so that +mattr overrides
+ // the "unsafe-fp-math" function attribute.
+ // Creating a separate target feature is not strictly necessary; it only
+ // exists to make "unsafe-fp-math" force the creation of a new subtarget.
+
+ if (FnAttrs.hasFnAttribute("unsafe-fp-math") &&
+ F.getFnAttribute("unsafe-fp-math").getValueAsString() == "true")
+ FS = FS.empty() ? "+unsafe-fp" : "+unsafe-fp," + FS;
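+ // The modified FS string becomes part of the SubtargetMap key below, so
+ // functions marked "unsafe-fp-math" end up with their own subtarget.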
auto &I = SubtargetMap[CPU + FS];
if (!I) {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
index fdcc41a4ca41..cfc8ed813c92 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.cpp
@@ -22,6 +22,7 @@
#include "llvm/IR/GlobalObject.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/SectionKind.h"
@@ -112,7 +113,6 @@ static const char *getSectionSuffixForSize(unsigned Size) {
void HexagonTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
SmallDataSection =
getContext().getELFSection(".sdata", ELF::SHT_PROGBITS,
@@ -308,7 +308,8 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
const ArrayType *ATy = cast<const ArrayType>(Ty);
return getSmallestAddressableSize(ATy->getElementType(), GV, TM);
}
- case Type::VectorTyID: {
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
const VectorType *PTy = cast<const VectorType>(Ty);
return getSmallestAddressableSize(PTy->getElementType(), GV, TM);
}
@@ -323,6 +324,7 @@ unsigned HexagonTargetObjectFile::getSmallestAddressableSize(const Type *Ty,
}
case Type::FunctionTyID:
case Type::VoidTyID:
+ case Type::BFloatTyID:
case Type::X86_FP80TyID:
case Type::FP128TyID:
case Type::PPC_FP128TyID:
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
index b36282578950..550aac72346f 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetObjectFile.h
@@ -13,6 +13,7 @@
#include "llvm/MC/MCSectionELF.h"
namespace llvm {
+ class Type;
class HexagonTargetObjectFile : public TargetLoweringObjectFileELF {
public:
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
index c5200b76933e..a5b14a7e0764 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetStreamer.h
@@ -15,13 +15,13 @@ namespace llvm {
class HexagonTargetStreamer : public MCTargetStreamer {
public:
HexagonTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
- virtual void EmitCodeAlignment(unsigned ByteAlignment,
+ virtual void emitCodeAlignment(unsigned ByteAlignment,
unsigned MaxBytesToEmit = 0){};
virtual void emitFAlign(unsigned Size, unsigned MaxBytesToEmit){};
- virtual void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+ virtual void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
unsigned AccessGranularity){};
- virtual void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+ virtual void emitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlign,
unsigned AccessGranularity){};
};
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
index 4d4627cd2071..80c8736cb74a 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.cpp
@@ -45,7 +45,7 @@ bool HexagonTTIImpl::useHVX() const {
bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
assert(VecTy->isVectorTy());
- if (cast<VectorType>(VecTy)->isScalable())
+ if (isa<ScalableVectorType>(VecTy))
return false;
// Avoid types like <2 x i32*>.
if (!cast<VectorType>(VecTy)->getElementType()->isIntegerTy())
@@ -60,8 +60,8 @@ bool HexagonTTIImpl::isTypeForHVX(Type *VecTy) const {
}
unsigned HexagonTTIImpl::getTypeNumElements(Type *Ty) const {
- if (Ty->isVectorTy())
- return Ty->getVectorNumElements();
+ if (auto *VTy = dyn_cast<FixedVectorType>(Ty))
+ return VTy->getNumElements();
assert((Ty->isIntegerTy() || Ty->isFloatingPointTy()) &&
"Expecting scalar type");
return 1;
@@ -78,12 +78,17 @@ HexagonTTIImpl::getPopcntSupport(unsigned IntTyWidthInBit) const {
void HexagonTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP) {
UP.Runtime = UP.Partial = true;
+}
+
+void HexagonTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
// Only try to peel innermost loops with small runtime trip counts.
if (L && L->empty() && canPeel(L) &&
SE.getSmallConstantTripCount(L) == 0 &&
SE.getSmallConstantMaxTripCount(L) > 0 &&
SE.getSmallConstantMaxTripCount(L) <= 5) {
- UP.PeelCount = 2;
+ PP.PeelCount = 2;
}
}
@@ -115,9 +120,10 @@ unsigned HexagonTTIImpl::getMinimumVF(unsigned ElemWidth) const {
return (8 * ST.getVectorLength()) / ElemWidth;
}
-unsigned HexagonTTIImpl::getScalarizationOverhead(Type *Ty, bool Insert,
- bool Extract) {
- return BaseT::getScalarizationOverhead(Ty, Insert, Extract);
+unsigned HexagonTTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract) {
+ return BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, Extract);
}
unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
@@ -126,24 +132,18 @@ unsigned HexagonTTIImpl::getOperandsScalarizationOverhead(
}
unsigned HexagonTTIImpl::getCallInstrCost(Function *F, Type *RetTy,
- ArrayRef<Type*> Tys) {
- return BaseT::getCallInstrCost(F, RetTy, Tys);
-}
-
-unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
+ ArrayRef<Type*> Tys, TTI::TargetCostKind CostKind) {
+ return BaseT::getCallInstrCost(F, RetTy, Tys, CostKind);
}
-unsigned HexagonTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type*> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- if (ID == Intrinsic::bswap) {
- std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, RetTy);
+unsigned
+HexagonTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (ICA.getID() == Intrinsic::bswap) {
+ std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ICA.getReturnType());
return LT.first + 2;
}
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
- ScalarizationCostPassed);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
@@ -154,14 +154,20 @@ unsigned HexagonTTIImpl::getAddressComputationCost(Type *Tp,
unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment,
unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
assert(Opcode == Instruction::Load || Opcode == Instruction::Store);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
if (Opcode == Instruction::Store)
- return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind, I);
if (Src->isVectorTy()) {
VectorType *VecTy = cast<VectorType>(Src);
- unsigned VecWidth = VecTy->getBitWidth();
+ unsigned VecWidth = VecTy->getPrimitiveSizeInBits().getFixedSize();
if (useHVX() && isTypeForHVX(VecTy)) {
unsigned RegWidth = getRegisterBitWidth(true);
assert(RegWidth && "Non-zero vector register width expected");
@@ -183,7 +189,7 @@ unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
unsigned Cost =
VecTy->getElementType()->isFloatingPointTy() ? FloatFactor : 1;
- // At this point unspecified alignment is considered as Align::None().
+ // At this point unspecified alignment is considered as Align(1).
const Align BoundAlignment = std::min(Alignment.valueOrOne(), Align(8));
unsigned AlignWidth = 8 * BoundAlignment.value();
unsigned NumLoads = alignTo(VecWidth, AlignWidth) / AlignWidth;
@@ -195,12 +201,16 @@ unsigned HexagonTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
return (3 - LogA) * Cost * NumLoads;
}
- return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace, I);
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind, I);
}
-unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode,
- Type *Src, unsigned Alignment, unsigned AddressSpace) {
- return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+unsigned HexagonTTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *Src,
+ Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
+ return BaseT::getMaskedMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
}
unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
@@ -208,57 +218,70 @@ unsigned HexagonTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp,
return 1;
}
-unsigned HexagonTTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
- Value *Ptr, bool VariableMask, unsigned Alignment) {
+unsigned HexagonTTIImpl::getGatherScatterOpCost(
+ unsigned Opcode, Type *DataTy, const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind, const Instruction *I) {
return BaseT::getGatherScatterOpCost(Opcode, DataTy, Ptr, VariableMask,
- Alignment);
+ Alignment, CostKind, I);
}
-unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode,
- Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace, bool UseMaskForCond,
- bool UseMaskForGaps) {
+unsigned HexagonTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
if (Indices.size() != Factor || UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
Alignment, AddressSpace,
+ CostKind,
UseMaskForCond, UseMaskForGaps);
return getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
- nullptr);
+ CostKind);
}
unsigned HexagonTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, const Instruction *I) {
- if (ValTy->isVectorTy()) {
+ Type *CondTy, TTI::TargetCostKind CostKind, const Instruction *I) {
+ if (ValTy->isVectorTy() && CostKind == TTI::TCK_RecipThroughput) {
std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, ValTy);
if (Opcode == Instruction::FCmp)
return LT.first + FloatFactor * getTypeNumElements(ValTy);
}
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}
unsigned HexagonTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
if (Ty->isVectorTy()) {
std::pair<int, MVT> LT = TLI.getTypeLegalizationCost(DL, Ty);
if (LT.second.isFloatingPoint())
return LT.first + FloatFactor * getTypeNumElements(Ty);
}
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info, Opd2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
unsigned HexagonTTIImpl::getCastInstrCost(unsigned Opcode, Type *DstTy,
- Type *SrcTy, const Instruction *I) {
+ Type *SrcTy, TTI::TargetCostKind CostKind, const Instruction *I) {
if (SrcTy->isFPOrFPVectorTy() || DstTy->isFPOrFPVectorTy()) {
unsigned SrcN = SrcTy->isFPOrFPVectorTy() ? getTypeNumElements(SrcTy) : 0;
unsigned DstN = DstTy->isFPOrFPVectorTy() ? getTypeNumElements(DstTy) : 0;
std::pair<int, MVT> SrcLT = TLI.getTypeLegalizationCost(DL, SrcTy);
std::pair<int, MVT> DstLT = TLI.getTypeLegalizationCost(DL, DstTy);
- return std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+ unsigned Cost = std::max(SrcLT.first, DstLT.first) + FloatFactor * (SrcN + DstN);
+ // TODO: Allow non-throughput costs that aren't binary.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
}
return 1;
}
@@ -292,8 +315,10 @@ unsigned HexagonTTIImpl::getCacheLineSize() const {
return ST.getL1CacheLineSize();
}
-int HexagonTTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
+int
+HexagonTTIImpl::getUserCost(const User *U,
+ ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
auto isCastFoldedIntoLoad = [this](const CastInst *CI) -> bool {
if (!CI->isIntegerCast())
return false;
@@ -315,7 +340,7 @@ int HexagonTTIImpl::getUserCost(const User *U,
if (const CastInst *CI = dyn_cast<const CastInst>(U))
if (isCastFoldedIntoLoad(CI))
return TargetTransformInfo::TCC_Free;
- return BaseT::getUserCost(U, Operands);
+ return BaseT::getUserCost(U, Operands, CostKind);
}
bool HexagonTTIImpl::shouldBuildLookupTables() const {
diff --git a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
index ace0d797bbdb..5fe397486402 100644
--- a/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
+++ b/llvm/lib/Target/Hexagon/HexagonTargetTransformInfo.h
@@ -64,6 +64,9 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
/// Bias LSR towards creating post-increment opportunities.
bool shouldFavorPostInc() const;
@@ -101,34 +104,41 @@ public:
return true;
}
- unsigned getScalarizationOverhead(Type *Ty, bool Insert, bool Extract);
- unsigned getOperandsScalarizationOverhead(ArrayRef<const Value*> Args,
- unsigned VF);
- unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys);
- unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
- unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type*> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
+ unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+ bool Insert, bool Extract);
+ unsigned getOperandsScalarizationOverhead(ArrayRef<const Value *> Args,
+ unsigned VF);
+ unsigned getCallInstrCost(Function *F, Type *RetTy, ArrayRef<Type*> Tys,
+ TTI::TargetCostKind CostKind);
+ unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
unsigned getAddressComputationCost(Type *Tp, ScalarEvolution *SE,
const SCEV *S);
unsigned getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
- unsigned getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
+ unsigned
+ getMaskedMemoryOpCost(unsigned Opcode, Type *Src, Align Alignment,
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
unsigned getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
Type *SubTp);
- unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
- bool VariableMask, unsigned Alignment);
- unsigned getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices, unsigned Alignment,
- unsigned AddressSpace, bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ unsigned getGatherScatterOpCost(unsigned Opcode, Type *DataTy,
+ const Value *Ptr, bool VariableMask,
+ Align Alignment, TTI::TargetCostKind CostKind,
+ const Instruction *I);
+ unsigned getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
unsigned getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
- const Instruction *I);
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -136,16 +146,18 @@ public:
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
unsigned getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
unsigned getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
- unsigned getCFInstrCost(unsigned Opcode) {
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
return 1;
}
/// @}
- int getUserCost(const User *U, ArrayRef<const Value *> Operands);
+ int getUserCost(const User *U, ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind);
// Hexagon specific decision to generate a lookup table.
bool shouldBuildLookupTables() const;
diff --git a/llvm/lib/Target/Hexagon/HexagonVExtract.cpp b/llvm/lib/Target/Hexagon/HexagonVExtract.cpp
index b7d6dbe21c74..b5f06ebd3189 100644
--- a/llvm/lib/Target/Hexagon/HexagonVExtract.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVExtract.cpp
@@ -15,7 +15,7 @@
#include "HexagonRegisterInfo.h"
#include "HexagonSubtarget.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/PassSupport.h"
+#include "llvm/Pass.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -107,7 +107,7 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
Register AR =
MF.getInfo<HexagonMachineFunctionInfo>()->getStackAlignBaseVReg();
std::map<unsigned, SmallVector<MachineInstr*,4>> VExtractMap;
- unsigned MaxAlign = 0;
+ MaybeAlign MaxAlign;
bool Changed = false;
for (MachineBasicBlock &MBB : MF) {
@@ -137,14 +137,14 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
continue;
const auto &VecRC = *MRI.getRegClass(VecR);
- unsigned Align = HRI.getSpillAlignment(VecRC);
- MaxAlign = std::max(MaxAlign, Align);
+ Align Alignment = HRI.getSpillAlign(VecRC);
+ MaxAlign = max(MaxAlign, Alignment);
// Make sure this is not a spill slot: spill slots cannot be aligned
// if there are variable-sized objects on the stack. They must be
// accessible via FP (which is not aligned), because SP is unknown,
// and AP may not be available at the location of the load/store.
- int FI = MFI.CreateStackObject(HRI.getSpillSize(VecRC), Align,
- /*isSpillSlot*/false);
+ int FI = MFI.CreateStackObject(HRI.getSpillSize(VecRC), Alignment,
+ /*isSpillSlot*/ false);
MachineInstr *DefI = MRI.getVRegDef(VecR);
MachineBasicBlock::iterator At = std::next(DefI->getIterator());
@@ -178,13 +178,13 @@ bool HexagonVExtract::runOnMachineFunction(MachineFunction &MF) {
}
}
- if (AR) {
+ if (AR && MaxAlign) {
// Update the required stack alignment.
MachineInstr *AlignaI = MRI.getVRegDef(AR);
assert(AlignaI->getOpcode() == Hexagon::PS_aligna);
MachineOperand &Op = AlignaI->getOperand(1);
- if (MaxAlign > Op.getImm())
- Op.setImm(MaxAlign);
+ if (*MaxAlign > Op.getImm())
+ Op.setImm(MaxAlign->value());
}
return Changed;
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
index 36d71c41da54..fa1ba4f2e469 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.cpp
@@ -242,6 +242,10 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // TinyCore with Duplexes: Translate to big-instructions.
+ if (HST.isTinyCoreWithDuplex())
+ HII->translateInstrsForDup(MF, true);
+
// Loop over all of the basic blocks.
for (auto &MB : MF) {
auto Begin = MB.begin(), End = MB.end();
@@ -267,6 +271,10 @@ bool HexagonPacketizer::runOnMachineFunction(MachineFunction &MF) {
}
}
+ // TinyCore with Duplexes: Translate to tiny-instructions.
+ if (HST.isTinyCoreWithDuplex())
+ HII->translateInstrsForDup(MF, false);
+
Packetizer.unpacketizeSoloInstrs(MF);
return true;
}
@@ -1052,12 +1060,11 @@ bool HexagonPacketizerList::ignorePseudoInstruction(const MachineInstr &MI,
// we ignore the instruction.
const MCInstrDesc& TID = MI.getDesc();
auto *IS = ResourceTracker->getInstrItins()->beginStage(TID.getSchedClass());
- unsigned FuncUnits = IS->getUnits();
- return !FuncUnits;
+ return !IS->getUnits();
}
bool HexagonPacketizerList::isSoloInstruction(const MachineInstr &MI) {
- // Ensure any bundles created by gather packetize remain seperate.
+ // Ensure any bundles created by gather packetize remain separate.
if (MI.isBundle())
return true;
@@ -1802,6 +1809,8 @@ void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
setmemShufDisabled(false);
}
+ PacketHasDuplex = false;
+ PacketHasSLOT0OnlyInsn = false;
ResourceTracker->clearResources();
LLVM_DEBUG(dbgs() << "End packet\n");
}
@@ -1809,7 +1818,64 @@ void HexagonPacketizerList::endPacket(MachineBasicBlock *MBB,
bool HexagonPacketizerList::shouldAddToPacket(const MachineInstr &MI) {
if (Minimal)
return false;
- return !producesStall(MI);
+
+ // Constraints for not packetizing this MI with existing instructions in a
+ // packet:
+ // MI is a store instruction.
+ // CurrentPacketMIs has a SLOT0-only instruction with constraint
+ // A_RESTRICT_NOSLOT1_STORE/isRestrictNoSlot1Store.
+ if (MI.mayStore() && isPureSlot0InsnWithNoSlot1Store(MI))
+ return false;
+
+ if (producesStall(MI))
+ return false;
+
+ // If TinyCore with Duplexes is enabled, check if this MI can form a Duplex
+ // with any other instruction in the existing packet.
+ auto &HST = MI.getParent()->getParent()->getSubtarget<HexagonSubtarget>();
+ // Constraint 1: Only one duplex is allowed per packet.
+ // Constraint 2: Consider duplex checks only if there is at least one
+ // instruction already in the packet.
+ // Constraint 3: If the packet already contains a SLOT0-only instruction
+ // that cannot be duplexed, do not attempt to form duplexes. (TODO: this
+ // prevents the L4_return* instructions from forming a duplex.)
+ if (HST.isTinyCoreWithDuplex() && CurrentPacketMIs.size() > 0 &&
+ !PacketHasDuplex) {
+ // Check for a SLOT0-only, non-duplexable instruction already in the packet.
+ for (auto &MJ : CurrentPacketMIs)
+ PacketHasSLOT0OnlyInsn |= HII->isPureSlot0(*MJ);
+ // Get the Big Core Opcode (dup_*).
+ int Opcode = HII->getDuplexOpcode(MI, false);
+ if (Opcode >= 0) {
+ // We now have an instruction that can be duplexed.
+ for (auto &MJ : CurrentPacketMIs) {
+ if (HII->isDuplexPair(MI, *MJ) && !PacketHasSLOT0OnlyInsn) {
+ PacketHasDuplex = true;
+ return true;
+ }
+ }
+ // If it cannot be duplexed, check if there is a valid transition in the
+ // DFA with the original opcode.
+ MachineInstr &MIRef = const_cast<MachineInstr &>(MI);
+ MIRef.setDesc(HII->get(Opcode));
+ return ResourceTracker->canReserveResources(MIRef);
+ }
+ }
+
+ return true;
+}
+
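+// Return true if the current packet already contains a pure-SLOT0 instruction
+// and an instruction carrying the A_RESTRICT_NOSLOT1_STORE constraint (the two
+// may be the same instruction).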
+bool HexagonPacketizerList::isPureSlot0InsnWithNoSlot1Store(
+ const MachineInstr &MI) {
+ bool noSlot1Store = false;
+ bool isSlot0Only = false;
+ for (auto J : CurrentPacketMIs) {
+ noSlot1Store |= HII->isRestrictNoSlot1Store(*J);
+ isSlot0Only |= HII->isPureSlot0(*J);
+ }
+
+ return (noSlot1Store && isSlot0Only);
}
// V60 forward scheduling.
diff --git a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
index 943b9ac7ecc4..27a47220570a 100644
--- a/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
+++ b/llvm/lib/Target/Hexagon/HexagonVLIWPacketizer.h
@@ -57,6 +57,13 @@ class HexagonPacketizerList : public VLIWPacketizerList {
// instruction from the previous packet.
bool PacketStalls = false;
+ // Set to true if the packet has a duplex pair of sub-instructions.
+ bool PacketHasDuplex = false;
+
+ // Set to true if the packet has an instruction that can only be executed
+ // in SLOT0.
+ bool PacketHasSLOT0OnlyInsn = false;
+
protected:
/// A handle to the branch probability pass.
const MachineBranchProbabilityInfo *MBPI;
@@ -149,6 +156,7 @@ protected:
bool hasRegMaskDependence(const MachineInstr &I, const MachineInstr &J);
bool hasDualStoreDependence(const MachineInstr &I, const MachineInstr &J);
bool producesStall(const MachineInstr &MI);
+ bool isPureSlot0InsnWithNoSlot1Store(const MachineInstr &MI);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
index 65a8dcd75bdc..fbc5e5c344ed 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorPrint.cpp
@@ -71,9 +71,10 @@ public:
char HexagonVectorPrint::ID = 0;
static bool isVecReg(unsigned Reg) {
- return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31)
- || (Reg >= Hexagon::W0 && Reg <= Hexagon::W15)
- || (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3);
+ return (Reg >= Hexagon::V0 && Reg <= Hexagon::V31) ||
+ (Reg >= Hexagon::W0 && Reg <= Hexagon::W15) ||
+ (Reg >= Hexagon::WR0 && Reg <= Hexagon::WR15) ||
+ (Reg >= Hexagon::Q0 && Reg <= Hexagon::Q3);
}
static std::string getStringReg(unsigned R) {
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
index 8f1e5c1c3a97..e7069819fa57 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonAsmBackend.cpp
@@ -22,6 +22,7 @@
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/TargetRegistry.h"
#include <sstream>
@@ -43,6 +44,7 @@ class HexagonAsmBackend : public MCAsmBackend {
std::unique_ptr <MCInstrInfo> MCII;
std::unique_ptr <MCInst *> RelaxTarget;
MCInst * Extender;
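+ // Maximum number of instructions in a packet for this CPU; replaces the
+ // fixed HEXAGON_PACKET_SIZE / 4 assumptions in the padding code below.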
+ unsigned MaxPacketSize;
void ReplaceInstruction(MCCodeEmitter &E, MCRelaxableFragment &RF,
MCInst &HMB) const {
@@ -62,7 +64,8 @@ public:
StringRef CPU)
: MCAsmBackend(support::little), OSABI(OSABI), CPU(CPU), relaxedCnt(0),
MCII(T.createMCInstrInfo()), RelaxTarget(new MCInst *),
- Extender(nullptr) {}
+ Extender(nullptr), MaxPacketSize(HexagonMCInstrInfo::packetSize(CPU))
+ {}
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
@@ -648,11 +651,12 @@ public:
llvm_unreachable("Handled by fixupNeedsRelaxationAdvanced");
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
assert(HexagonMCInstrInfo::isBundle(Inst) &&
"Hexagon relaxInstruction only works on bundles");
+ MCInst Res;
Res.setOpcode(Hexagon::BUNDLE);
Res.addOperand(MCOperand::createImm(Inst.getOperand(0).getImm()));
// Copy the results into the bundle.
@@ -676,6 +680,8 @@ public:
// now copy over the original instruction(the one we may have extended)
Res.addOperand(MCOperand::createInst(I.getInst()));
}
+
+ Inst = std::move(Res);
(void)Update;
assert(Update && "Didn't find relaxation target");
}
@@ -685,7 +691,7 @@ public:
ParseIn = 0x00004000, // In packet parse-bits.
ParseEnd = 0x0000c000; // End of packet parse-bits.
- while(Count % HEXAGON_INSTR_SIZE) {
+ while (Count % HEXAGON_INSTR_SIZE) {
LLVM_DEBUG(dbgs() << "Alignment not a multiple of the instruction size:"
<< Count % HEXAGON_INSTR_SIZE << "/"
<< HEXAGON_INSTR_SIZE << "\n");
@@ -693,11 +699,11 @@ public:
OS << '\0';
}
- while(Count) {
+ while (Count) {
Count -= HEXAGON_INSTR_SIZE;
// Close the packet whenever a multiple of the maximum packet size remains
- uint32_t ParseBits = (Count % (HEXAGON_PACKET_SIZE * HEXAGON_INSTR_SIZE))?
- ParseIn: ParseEnd;
+ uint32_t ParseBits = (Count % (MaxPacketSize * HEXAGON_INSTR_SIZE)) ?
+ ParseIn : ParseEnd;
support::endian::write<uint32_t>(OS, Nopcode | ParseBits, Endian);
}
return true;
@@ -728,7 +734,8 @@ public:
MCContext &Context = Asm.getContext();
auto &RF = cast<MCRelaxableFragment>(*K);
auto &Inst = const_cast<MCInst &>(RF.getInst());
- while (Size > 0 && HexagonMCInstrInfo::bundleSize(Inst) < 4) {
+ while (Size > 0 &&
+ HexagonMCInstrInfo::bundleSize(Inst) < MaxPacketSize) {
MCInst *Nop = new (Context) MCInst;
Nop->setOpcode(Hexagon::A2_nop);
Inst.addOperand(MCOperand::createInst(Nop));
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
index 3c64893bae45..4125566bc58a 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonBaseInfo.h
@@ -27,11 +27,6 @@ namespace HexagonII {
unsigned const TypeCVI_FIRST = TypeCVI_4SLOT_MPY;
unsigned const TypeCVI_LAST = TypeCVI_ZW;
- enum SubTarget {
- HasV55SubT = 0x3c,
- HasV60SubT = 0x38,
- };
-
enum AddrMode {
NoAddrMode = 0, // No addressing mode
Absolute = 1, // Absolute addressing mode
@@ -165,6 +160,9 @@ namespace HexagonII {
CVINewPos = 62,
CVINewMask = 0x1,
+
+ isCVIPos = 63,
+ isCVIMask = 0x1,
};
// *** The code above must match HexagonInstrFormat*.td *** //
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
index cdbeae38b3a1..3dba6b07c460 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonELFObjectWriter.cpp
@@ -64,7 +64,7 @@ unsigned HexagonELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_HEX_IE_GOT_32;
case MCSymbolRefExpr::VariantKind::VK_Hexagon_LD_GOT:
return ELF::R_HEX_LD_GOT_32;
- case MCSymbolRefExpr::VariantKind::VK_Hexagon_PCREL:
+ case MCSymbolRefExpr::VariantKind::VK_PCREL:
return ELF::R_HEX_32_PCREL;
case MCSymbolRefExpr::VariantKind::VK_TPREL:
return ELF::R_HEX_TPREL_32;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
index f3da67562320..e5e5d08937ef 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCAsmInfo.cpp
@@ -34,4 +34,5 @@ HexagonMCAsmInfo::HexagonMCAsmInfo(const Triple &TT) {
UsesELFSectionDirectiveForBSS = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
UseLogicalShr = false;
+ UseIntegratedAssembler = false;
}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
index 8b262bd0248e..fee1acdbbe8a 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.cpp
@@ -81,6 +81,9 @@ void HexagonMCChecker::initReg(MCInst const &MCI, unsigned R, unsigned &PredReg,
if (!MCSubRegIterator(*SRI, &RI).isValid())
// Skip super-registers used indirectly.
Uses.insert(*SRI);
+
+ if (HexagonMCInstrInfo::IsReverseVecRegPair(R))
+ ReversePairs.insert(R);
}
void HexagonMCChecker::init(MCInst const &MCI) {
@@ -133,6 +136,9 @@ void HexagonMCChecker::init(MCInst const &MCI) {
if (R == Hexagon::C8)
R = Hexagon::USR;
+ if (HexagonMCInstrInfo::IsReverseVecRegPair(R))
+ ReversePairs.insert(R);
+
// Note register definitions, direct ones as well as indirect side-effects.
// Super-registers are not tracked directly, but their components.
for (MCRegAliasIterator SRI(R, &RI, !MCSubRegIterator(R, &RI).isValid());
@@ -192,7 +198,7 @@ HexagonMCChecker::HexagonMCChecker(MCContext &Context, MCInstrInfo const &MCII,
MCSubtargetInfo const &STI, MCInst &mcb,
MCRegisterInfo const &ri, bool ReportErrors)
: Context(Context), MCB(mcb), RI(ri), MCII(MCII), STI(STI),
- ReportErrors(ReportErrors) {
+ ReportErrors(ReportErrors), ReversePairs() {
init();
}
@@ -200,7 +206,10 @@ HexagonMCChecker::HexagonMCChecker(HexagonMCChecker const &Other,
MCSubtargetInfo const &STI,
bool CopyReportErrors)
: Context(Other.Context), MCB(Other.MCB), RI(Other.RI), MCII(Other.MCII),
- STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false) {}
+ STI(STI), ReportErrors(CopyReportErrors ? Other.ReportErrors : false),
+ ReversePairs() {
+ init();
+}
bool HexagonMCChecker::check(bool FullCheck) {
bool chkP = checkPredicates();
@@ -218,8 +227,9 @@ bool HexagonMCChecker::check(bool FullCheck) {
bool chkAXOK = checkAXOK();
bool chkCofMax1 = checkCOFMax1();
bool chkHWLoop = checkHWLoop();
+ bool chkLegalVecRegPair = checkLegalVecRegPair();
bool chk = chkP && chkNV && chkR && chkRRO && chkS && chkSh && chkSl &&
- chkAXOK && chkCofMax1 && chkHWLoop;
+ chkAXOK && chkCofMax1 && chkHWLoop && chkLegalVecRegPair;
return chk;
}
@@ -381,7 +391,7 @@ bool HexagonMCChecker::checkPredicates() {
for (const auto &I : NewPreds) {
unsigned P = I;
- if (!Defs.count(P) || LatePreds.count(P)) {
+ if (!Defs.count(P) || LatePreds.count(P) || Defs.count(Hexagon::P3_0)) {
// Error out if the new predicate register is not defined,
// or defined "late"
// (e.g., "{ if (p3.new)... ; p3 = sp1loop0(#r7:2, Rs) }").
@@ -729,3 +739,16 @@ void HexagonMCChecker::reportWarning(Twine const &Msg) {
if (ReportErrors)
Context.reportWarning(MCB.getLoc(), Msg);
}
+
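+// Reversed vector register pairs (WR, with the even register named first) are
+// only permitted on V67 and later; report an error for each such pair
+// otherwise.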
+bool HexagonMCChecker::checkLegalVecRegPair() {
+ const bool IsPermitted = STI.getFeatureBits()[Hexagon::ArchV67];
+ const bool HasReversePairs = ReversePairs.size() != 0;
+
+ if (!IsPermitted && HasReversePairs) {
+ for (auto R : ReversePairs)
+ reportError("register pair `" + Twine(RI.getName(R)) +
+ "' is not permitted for this architecture");
+ return false;
+ }
+ return true;
+}
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
index bc55ade9ccd7..00afdb664ba5 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCChecker.h
@@ -72,6 +72,10 @@ class HexagonMCChecker {
using ReadOnlyIterator = std::set<unsigned>::iterator;
std::set<unsigned> ReadOnly;
+ // Contains the vector-pair registers with the even register named
+ // first (e.g. "v0:1") that are used/def'd in this packet.
+ std::set<unsigned> ReversePairs;
+
void init();
void init(MCInst const &);
void initReg(MCInst const &, unsigned, unsigned &PredReg, bool &isTrue);
@@ -94,6 +98,7 @@ class HexagonMCChecker {
bool checkAXOK();
bool checkHWLoop();
bool checkCOFMax1();
+ bool checkLegalVecRegPair();
static void compoundRegisterMap(unsigned &);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
index 95e23c99868a..24169c83bdb9 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCCodeEmitter.cpp
@@ -147,7 +147,7 @@ static const std::map<unsigned, std::vector<unsigned>> ExtFixups = {
_, _, _, _,
_, _, _, _,
_ }},
- { MCSymbolRefExpr::VK_Hexagon_PCREL,
+ { MCSymbolRefExpr::VK_PCREL,
{ _, _, _, _,
_, _, P(_6_PCREL_X), _,
_, P(_9_X), _, _,
@@ -311,7 +311,7 @@ static const std::map<unsigned, std::vector<unsigned>> StdFixups = {
_, _, _, _,
_, _, _, _,
_ }},
- { MCSymbolRefExpr::VK_Hexagon_PCREL,
+ { MCSymbolRefExpr::VK_PCREL,
{ _, _, _, _,
_, _, _, _,
_, _, _, _,
@@ -391,15 +391,9 @@ void HexagonMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
static bool RegisterMatches(unsigned Consumer, unsigned Producer,
unsigned Producer2) {
- if (Consumer == Producer)
- return true;
- if (Consumer == Producer2)
- return true;
- // Calculate if we're a single vector consumer referencing a double producer
- if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
- if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31)
- return ((Consumer - Hexagon::V0) >> 1) == (Producer - Hexagon::W0);
- return false;
+ return (Consumer == Producer) || (Consumer == Producer2) ||
+ HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(Producer,
+ Consumer);
}
/// EncodeSingleInstruction - Emit a single
@@ -497,7 +491,7 @@ Hexagon::Fixups HexagonMCCodeEmitter::getFixupNoBits(
{ MCSymbolRefExpr::VK_Hexagon_LD_GOT, fixup_Hexagon_LD_GOT_32_6_X },
{ MCSymbolRefExpr::VK_Hexagon_IE, fixup_Hexagon_IE_32_6_X },
{ MCSymbolRefExpr::VK_Hexagon_IE_GOT, fixup_Hexagon_IE_GOT_32_6_X },
- { MCSymbolRefExpr::VK_Hexagon_PCREL, fixup_Hexagon_B32_PCREL_X },
+ { MCSymbolRefExpr::VK_PCREL, fixup_Hexagon_B32_PCREL_X },
{ MCSymbolRefExpr::VK_Hexagon_GD_PLT, fixup_Hexagon_GD_PLT_B32_PCREL_X },
{ MCSymbolRefExpr::VK_Hexagon_LD_PLT, fixup_Hexagon_LD_PLT_B32_PCREL_X },
};
@@ -735,7 +729,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
unsigned SOffset = 0;
unsigned VOffset = 0;
unsigned UseReg = MO.getReg();
- unsigned DefReg1, DefReg2;
+ unsigned DefReg1 = Hexagon::NoRegister;
+ unsigned DefReg2 = Hexagon::NoRegister;
auto Instrs = HexagonMCInstrInfo::bundleInstructions(*State.Bundle);
const MCOperand *I = Instrs.begin() + State.Index - 1;
@@ -746,7 +741,8 @@ HexagonMCCodeEmitter::getMachineOpValue(MCInst const &MI, MCOperand const &MO,
if (HexagonMCInstrInfo::isImmext(Inst))
continue;
- DefReg1 = DefReg2 = 0;
+ DefReg1 = Hexagon::NoRegister;
+ DefReg2 = Hexagon::NoRegister;
++SOffset;
if (HexagonMCInstrInfo::isVector(MCII, Inst)) {
// Vector instructions don't count scalars.
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
index 3cbb8600ce7a..5154a0a1e46c 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCDuplexInfo.cpp
@@ -10,12 +10,11 @@
//
//===----------------------------------------------------------------------===//
+#include "HexagonMCExpr.h"
#include "MCTargetDesc/HexagonBaseInfo.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
@@ -296,8 +295,7 @@ unsigned HexagonMCInstrInfo::getDuplexCandidateGroup(MCInst const &MCI) {
DstReg = MCI.getOperand(1).getReg();
SrcReg = MCI.getOperand(0).getReg();
// [if ([!]p0[.new])] jumpr r31
- if ((HexagonMCInstrInfo::isPredReg(SrcReg) && (Hexagon::P0 == SrcReg)) &&
- (Hexagon::R31 == DstReg)) {
+ if ((Hexagon::P0 == SrcReg) && (Hexagon::R31 == DstReg)) {
return HexagonII::HSIG_L2;
}
break;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
index a799f7f7c0b9..53e76a8b9ed7 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.cpp
@@ -58,7 +58,7 @@ HexagonMCELFStreamer::HexagonMCELFStreamer(
: MCELFStreamer(Context, std::move(TAB), std::move(OW), std::move(Emitter)),
MCII(createHexagonMCInstrInfo()) {}
-void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
+void HexagonMCELFStreamer::emitInstruction(const MCInst &MCB,
const MCSubtargetInfo &STI) {
assert(MCB.getOpcode() == Hexagon::BUNDLE);
assert(HexagonMCInstrInfo::bundleSize(MCB) <= HEXAGON_PACKET_SIZE);
@@ -71,7 +71,7 @@ void HexagonMCELFStreamer::EmitInstruction(const MCInst &MCB,
EmitSymbol(*MCI);
}
- MCObjectStreamer::EmitInstruction(MCB, STI);
+ MCObjectStreamer::emitInstruction(MCB, STI);
}
void HexagonMCELFStreamer::EmitSymbol(const MCInst &Inst) {
@@ -110,9 +110,9 @@ void HexagonMCELFStreamer::HexagonMCEmitCommonSymbol(MCSymbol *Symbol,
SwitchSection(&Section);
if (ELFSymbol->isUndefined()) {
- EmitValueToAlignment(ByteAlignment, 0, 1, 0);
- EmitLabel(Symbol);
- EmitZeros(Size);
+ emitValueToAlignment(ByteAlignment, 0, 1, 0);
+ emitLabel(Symbol);
+ emitZeros(Size);
}
// Update the maximum alignment of the section if necessary.
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
index 6248bd25d433..edf4ce29f908 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCELFStreamer.h
@@ -30,7 +30,7 @@ public:
std::unique_ptr<MCCodeEmitter> Emitter,
MCAssembler *Assembler);
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
void EmitSymbol(const MCInst &Inst);
void HexagonMCEmitLocalCommonSymbol(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
index 59b1326adf0c..e88f46a04dae 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCExpr.h
@@ -12,7 +12,6 @@
#include "llvm/MC/MCExpr.h"
namespace llvm {
-class MCInst;
class HexagonMCExpr : public MCTargetExpr {
public:
static HexagonMCExpr *create(MCExpr const *Expr, MCContext &Ctx);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
index 0750bfe74f76..f9f342a07f6d 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.cpp
@@ -17,6 +17,7 @@
#include "MCTargetDesc/HexagonMCShuffler.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
@@ -119,10 +120,10 @@ size_t HexagonMCInstrInfo::bundleSize(MCInst const &MCI) {
return (1);
}
-bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
- MCSubtargetInfo const &STI,
- MCContext &Context, MCInst &MCB,
- HexagonMCChecker *Check) {
+namespace {
+bool canonicalizePacketImpl(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCB,
+ HexagonMCChecker *Check) {
// Check the bundle for errors.
bool CheckOk = Check ? Check->check(false) : true;
if (!CheckOk)
@@ -132,9 +133,9 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
if (!HexagonDisableCompound)
HexagonMCInstrInfo::tryCompound(MCII, STI, Context, MCB);
HexagonMCShuffle(Context, false, MCII, STI, MCB);
+
// Examine the packet and convert pairs of instructions to duplex
// instructions when possible.
- MCInst InstBundlePreDuplex = MCInst(MCB);
if (STI.getFeatureBits() [Hexagon::FeatureDuplex]) {
SmallVector<DuplexCandidate, 8> possibleDuplexes;
possibleDuplexes =
@@ -146,8 +147,11 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
HexagonMCInstrInfo::padEndloop(MCB, Context);
// If compounding and duplexing didn't reduce the size below
// 4 or less we have a packet that is too big.
- if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE)
+ if (HexagonMCInstrInfo::bundleSize(MCB) > HEXAGON_PACKET_SIZE) {
+ if (Check)
+ Check->reportError("invalid instruction packet: out of slots");
return false;
+ }
// Check the bundle for errors.
CheckOk = Check ? Check->check(true) : true;
if (!CheckOk)
@@ -155,6 +159,27 @@ bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
HexagonMCShuffle(Context, true, MCII, STI, MCB);
return true;
}
+} // namespace
+
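+// Canonicalize the packet for the given subtarget. With AttemptCompatibility
+// set, first try quietly against the CPU-specific subtarget; if that fails,
+// retry against the architectural subtarget using the caller's checker
+// settings.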
+bool HexagonMCInstrInfo::canonicalizePacket(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCContext &Context, MCInst &MCB,
+ HexagonMCChecker *Check,
+ bool AttemptCompatibility) {
+ auto ArchSTI = Hexagon_MC::getArchSubtarget(&STI);
+ if (!AttemptCompatibility || ArchSTI == nullptr)
+ return canonicalizePacketImpl(MCII, STI, Context, MCB, Check);
+
+ const MCRegisterInfo *RI = Context.getRegisterInfo();
+ HexagonMCChecker DefaultCheck(Context, MCII, STI, MCB, *RI, false);
+ HexagonMCChecker *BaseCheck = (Check == nullptr) ? &DefaultCheck : Check;
+ HexagonMCChecker PerfCheck(*BaseCheck, STI, false);
+ if (canonicalizePacketImpl(MCII, STI, Context, MCB, &PerfCheck))
+ return true;
+
+ HexagonMCChecker ArchCheck(*BaseCheck, *ArchSTI, true);
+ return canonicalizePacketImpl(MCII, *ArchSTI, Context, MCB, &ArchCheck);
+}
MCInst HexagonMCInstrInfo::deriveExtender(MCInstrInfo const &MCII,
MCInst const &Inst,
@@ -394,6 +419,26 @@ unsigned HexagonMCInstrInfo::getType(MCInstrInfo const &MCII,
return ((F >> HexagonII::TypePos) & HexagonII::TypeMask);
}
+/// Return the resources used by this instruction
+unsigned HexagonMCInstrInfo::getCVIResources(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCInst const &MCI) {
+
+ const InstrItinerary *II = STI.getSchedModel().InstrItineraries;
+ int SchedClass = HexagonMCInstrInfo::getDesc(MCII, MCI).getSchedClass();
+ int Size = II[SchedClass].LastStage - II[SchedClass].FirstStage;
+
+ // HVX resources used are currently located at the second to last stage.
+ // This could also be done with a linear search of the stages looking for:
+ // CVI_ALL, CVI_MPY01, CVI_XLSHF, CVI_MPY0, CVI_MPY1, CVI_SHIFT, CVI_XLANE,
+ // CVI_ZW
+ unsigned Stage = II[SchedClass].LastStage - 1;
+
+ if (Size < 2)
+ return 0;
+ return ((Stage + HexagonStages)->getUnits());
+}
+
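getCVIResources pulls the HVX unit mask out of the instruction's itinerary by indexing one stage back from LastStage, returning 0 when the itinerary is too short to carry one. A small sketch of that indexing, assuming a hand-rolled stage table instead of the generated InstrItinerary/InstrStage data:

#include <cstdio>
#include <vector>

struct Stage { unsigned Units; };      // stand-in for InstrStage
struct Itinerary { int First, Last; }; // stage indices for one sched class

unsigned cviUnits(const Itinerary &It, const std::vector<Stage> &Stages) {
  if (It.Last - It.First < 2)
    return 0;                          // no HVX resource stage recorded
  return Stages[It.Last - 1].Units;    // same index math as the diff above
}

int main() {
  std::vector<Stage> Stages = {{0x0}, {0x1}, {0x6}};
  Itinerary It{0, 3};
  std::printf("0x%x\n", cviUnits(It, Stages)); // 0x6
}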
/// Return the slots this instruction can execute out of
unsigned HexagonMCInstrInfo::getUnits(MCInstrInfo const &MCII,
MCSubtargetInfo const &STI,
@@ -473,7 +518,7 @@ bool HexagonMCInstrInfo::hasNewValue2(MCInstrInfo const &MCII,
MCInst const &HexagonMCInstrInfo::instruction(MCInst const &MCB, size_t Index) {
assert(isBundle(MCB));
- assert(Index < HEXAGON_PACKET_SIZE);
+ assert(Index < HEXAGON_PRESHUFFLE_PACKET_SIZE);
return *MCB.getOperand(bundleInstructionsOffset + Index).getInst();
}
@@ -613,6 +658,12 @@ bool HexagonMCInstrInfo::isNewValue(MCInstrInfo const &MCII,
return ((F >> HexagonII::NewValuePos) & HexagonII::NewValueMask);
}
+bool HexagonMCInstrInfo::isNewValueStore(MCInstrInfo const &MCII,
+ MCInst const &MCI) {
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::NVStorePos) & HexagonII::NVStoreMask;
+}
+
/// Return whether the operand is extendable.
bool HexagonMCInstrInfo::isOpExtendable(MCInstrInfo const &MCII,
MCInst const &MCI, unsigned short O) {
@@ -625,6 +676,45 @@ bool HexagonMCInstrInfo::isOuterLoop(MCInst const &MCI) {
return (Flags & outerLoopMask) != 0;
}
+bool HexagonMCInstrInfo::IsVecRegPair(unsigned VecReg) {
+ return (VecReg >= Hexagon::W0 && VecReg <= Hexagon::W15) ||
+ (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15);
+}
+
+bool HexagonMCInstrInfo::IsReverseVecRegPair(unsigned VecReg) {
+ return (VecReg >= Hexagon::WR0 && VecReg <= Hexagon::WR15);
+}
+
+bool HexagonMCInstrInfo::IsVecRegSingle(unsigned VecReg) {
+ return (VecReg >= Hexagon::V0 && VecReg <= Hexagon::V31);
+}
+
+std::pair<unsigned, unsigned>
+HexagonMCInstrInfo::GetVecRegPairIndices(unsigned VecRegPair) {
+ assert(IsVecRegPair(VecRegPair) &&
+ "VecRegPair must be a vector register pair");
+
+ const bool IsRev = IsReverseVecRegPair(VecRegPair);
+ const unsigned PairIndex =
+ 2 * (IsRev ? VecRegPair - Hexagon::WR0 : VecRegPair - Hexagon::W0);
+
+ return IsRev ? std::make_pair(PairIndex, PairIndex + 1)
+ : std::make_pair(PairIndex + 1, PairIndex);
+}
+
+bool HexagonMCInstrInfo::IsSingleConsumerRefPairProducer(unsigned Producer,
+ unsigned Consumer) {
+ if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer)) {
+ const unsigned ProdPairIndex = IsReverseVecRegPair(Producer)
+ ? Producer - Hexagon::WR0
+ : Producer - Hexagon::W0;
+ const unsigned ConsumerSingleIndex = (Consumer - Hexagon::V0) >> 1;
+
+ return ConsumerSingleIndex == ProdPairIndex;
+ }
+ return false;
+}
+
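IsSingleConsumerRefPairProducer reduces to integer arithmetic: a single vector register consumes from a pair when its register number, halved, equals the pair's ordinal. A sketch with made-up register numbers (the real Hexagon::V*/W* enum values come from TableGen and are not assumed here):

#include <iostream>

// Illustrative numbering only: singles V0..V31 are 0..31, pairs W0..W15 are
// 100..115. The real enum values differ.
constexpr unsigned V0 = 0, W0 = 100, W15 = 115;

bool isPair(unsigned R) { return R >= W0 && R <= W15; }
bool isSingle(unsigned R) { return R <= 31; }

bool singleConsumesPair(unsigned Producer, unsigned Consumer) {
  if (!isPair(Producer) || !isSingle(Consumer))
    return false;
  return ((Consumer - V0) >> 1) == (Producer - W0); // same pair ordinal
}

int main() {
  // W3 covers the singles with ordinal 3, i.e. V6 and V7.
  std::cout << singleConsumesPair(W0 + 3, V0 + 6) << " "
            << singleConsumesPair(W0 + 3, V0 + 8) << "\n"; // 1 0
}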
bool HexagonMCInstrInfo::isPredicated(MCInstrInfo const &MCII,
MCInst const &MCI) {
const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
@@ -655,8 +745,17 @@ bool HexagonMCInstrInfo::isPredicatedTrue(MCInstrInfo const &MCII,
!((F >> HexagonII::PredicatedFalsePos) & HexagonII::PredicatedFalseMask));
}
-bool HexagonMCInstrInfo::isPredReg(unsigned Reg) {
- return (Reg >= Hexagon::P0 && Reg <= Hexagon::P3_0);
+bool HexagonMCInstrInfo::isPredReg(MCRegisterInfo const &MRI, unsigned Reg) {
+ auto &PredRegClass = MRI.getRegClass(Hexagon::PredRegsRegClassID);
+ return PredRegClass.contains(Reg);
+}
+
+bool HexagonMCInstrInfo::isPredRegister(MCInstrInfo const &MCII,
+ MCInst const &Inst, unsigned I) {
+ MCInstrDesc const &Desc = HexagonMCInstrInfo::getDesc(MCII, Inst);
+
+ return Inst.getOperand(I).isReg() &&
+ Desc.OpInfo[I].RegClass == Hexagon::PredRegsRegClassID;
}
/// Return whether the insn can be packaged only with A and X-type insns.
@@ -753,10 +852,8 @@ bool HexagonMCInstrInfo::isSubInstruction(MCInst const &MCI) {
}
bool HexagonMCInstrInfo::isVector(MCInstrInfo const &MCII, MCInst const &MCI) {
- if ((getType(MCII, MCI) <= HexagonII::TypeCVI_LAST) &&
- (getType(MCII, MCI) >= HexagonII::TypeCVI_FIRST))
- return true;
- return false;
+ const uint64_t F = HexagonMCInstrInfo::getDesc(MCII, MCI).TSFlags;
+ return (F >> HexagonII::isCVIPos) & HexagonII::isCVIMask;
}
int64_t HexagonMCInstrInfo::minConstant(MCInst const &MCI, size_t Index) {
@@ -802,6 +899,18 @@ bool HexagonMCInstrInfo::s27_2_reloc(MCExpr const &Expr) {
return HExpr->s27_2_reloc();
}
+unsigned HexagonMCInstrInfo::packetSizeSlots(MCSubtargetInfo const &STI) {
+ const bool IsTiny = STI.getFeatureBits()[Hexagon::ProcTinyCore];
+
+ return IsTiny ? (HEXAGON_PACKET_SIZE - 1) : HEXAGON_PACKET_SIZE;
+}
+
+unsigned HexagonMCInstrInfo::packetSize(StringRef CPU) {
+ return llvm::StringSwitch<unsigned>(CPU)
+ .Case("hexagonv67t", 3)
+ .Default(4);
+}
+
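Both helpers above encode the same fact in two ways: tiny-core packets (hexagonv67t, or any subtarget with ProcTinyCore) carry one slot fewer than the usual four. A trivial sketch of the CPU-name variant:

#include <iostream>
#include <string>

// Mirrors packetSize(): the v67t tiny core issues 3 slots per packet,
// everything else 4.
unsigned packetSizeFor(const std::string &CPU) {
  return CPU == "hexagonv67t" ? 3u : 4u;
}

int main() {
  for (const char *CPU : {"hexagonv67", "hexagonv67t"})
    std::cout << CPU << ": " << packetSizeFor(CPU) << " slots\n";
}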
void HexagonMCInstrInfo::padEndloop(MCInst &MCB, MCContext &Context) {
MCInst Nop;
Nop.setOpcode(Hexagon::A2_nop);
@@ -836,6 +945,33 @@ bool HexagonMCInstrInfo::hasTmpDst(MCInstrInfo const &MCII, MCInst const &MCI) {
return (F >> HexagonII::HasTmpDstPos) & HexagonII::HasTmpDstMask;
}
+bool HexagonMCInstrInfo::requiresSlot(MCSubtargetInfo const &STI,
+ MCInst const &MCI) {
+ const unsigned OpCode = MCI.getOpcode();
+ const bool IsTiny = STI.getFeatureBits()[Hexagon::ProcTinyCore];
+ const bool NoSlotReqd = Hexagon::A4_ext == OpCode ||
+ (IsTiny && Hexagon::A2_nop == OpCode) ||
+ (IsTiny && Hexagon::J4_hintjumpr == OpCode);
+
+ return !NoSlotReqd;
+}
+
+unsigned HexagonMCInstrInfo::slotsConsumed(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCInst const &MCI) {
+ unsigned slotsUsed = 0;
+ for (auto HMI : bundleInstructions(MCI)) {
+ MCInst const &MCI = *HMI.getInst();
+ if (!requiresSlot(STI, MCI))
+ continue;
+ if (isDuplex(MCII, MCI))
+ slotsUsed += 2;
+ else
+ ++slotsUsed;
+ }
+ return slotsUsed;
+}
+
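slotsConsumed charges two slots per duplex and skips instructions that need no slot at all (immediate extenders, and nops or hint-jumps on tiny cores). The accounting, reduced to plain flags standing in for the requiresSlot/isDuplex queries:

#include <iostream>
#include <vector>

struct Insn { bool NeedsSlot; bool IsDuplex; }; // stand-ins for MC queries

unsigned slotsConsumed(const std::vector<Insn> &Bundle) {
  unsigned Used = 0;
  for (const Insn &I : Bundle) {
    if (!I.NeedsSlot)
      continue;                 // e.g. A4_ext, or a nop on a tiny core
    Used += I.IsDuplex ? 2 : 1; // a duplex occupies two slots
  }
  return Used;
}

int main() {
  // extender (no slot) + duplex (2) + ordinary insn (1) = 3 slots
  std::cout << slotsConsumed({{false, false}, {true, true}, {true, false}})
            << "\n";
}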
void HexagonMCInstrInfo::replaceDuplex(MCContext &Context, MCInst &MCB,
DuplexCandidate Candidate) {
assert(Candidate.packetIndexI < MCB.size());
@@ -874,9 +1010,8 @@ unsigned HexagonMCInstrInfo::SubregisterBit(unsigned Consumer,
unsigned Producer2) {
// If we're a single vector consumer of a double producer, set subreg bit
// based on if we're accessing the lower or upper register component
- if (Producer >= Hexagon::W0 && Producer <= Hexagon::W15)
- if (Consumer >= Hexagon::V0 && Consumer <= Hexagon::V31)
- return (Consumer - Hexagon::V0) & 0x1;
+ if (IsVecRegPair(Producer) && IsVecRegSingle(Consumer))
+ return (Consumer - Hexagon::V0) & 0x1;
if (Producer2 != Hexagon::NoRegister)
return Consumer == Producer;
return 0;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
index 829f872c453e..7b3c079880f8 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCInstrInfo.h
@@ -28,6 +28,7 @@ class MCContext;
class MCExpr;
class MCInstrDesc;
class MCInstrInfo;
+class MCRegisterInfo;
class MCSubtargetInfo;
class DuplexCandidate {
@@ -91,7 +92,8 @@ size_t bundleSize(MCInst const &MCI);
// Put the packet in to canonical form, compound, duplex, pad, and shuffle
bool canonicalizePacket(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCB,
- HexagonMCChecker *Checker);
+ HexagonMCChecker *Checker,
+ bool AttemptCompatibility = false);
// Create a duplex instruction given the two subinsts
MCInst *deriveDuplex(MCContext &Context, unsigned iClass, MCInst const &inst0,
@@ -165,6 +167,11 @@ MCOperand const &getNewValueOperand2(MCInstrInfo const &MCII,
// Return the Hexagon ISA class for the insn.
unsigned getType(MCInstrInfo const &MCII, MCInst const &MCI);
+/// Return the resources used by this instruction
+unsigned getCVIResources(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ MCInst const &MCI);
+
/// Return the slots used by the insn.
unsigned getUnits(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCInst const &MCI);
@@ -252,6 +259,8 @@ bool isMemReorderDisabled(MCInst const &MCI);
// Return whether the insn is a new-value consumer.
bool isNewValue(MCInstrInfo const &MCII, MCInst const &MCI);
+/// Return true if the operand is a new-value store insn.
+bool isNewValueStore(MCInstrInfo const &MCII, MCInst const &MCI);
bool isOpExtendable(MCInstrInfo const &MCII, MCInst const &MCI, unsigned short);
// Can these two instructions be duplexed
@@ -270,8 +279,11 @@ bool isPredicatedNew(MCInstrInfo const &MCII, MCInst const &MCI);
// Return whether the predicate sense is true
bool isPredicatedTrue(MCInstrInfo const &MCII, MCInst const &MCI);
-// Is this a predicate register
-bool isPredReg(unsigned Reg);
+// Return true if this is a scalar predicate register.
+bool isPredReg(MCRegisterInfo const &MRI, unsigned Reg);
+
+// Returns true if the Ith operand is a predicate register.
+bool isPredRegister(MCInstrInfo const &MCII, MCInst const &Inst, unsigned I);
// Return whether the insn is a prefix.
bool isPrefix(MCInstrInfo const &MCII, MCInst const &MCI);
@@ -290,6 +302,21 @@ bool isVector(MCInstrInfo const &MCII, MCInst const &MCI);
bool mustExtend(MCExpr const &Expr);
bool mustNotExtend(MCExpr const &Expr);
+// Returns true if this instruction requires a slot to execute.
+bool requiresSlot(MCSubtargetInfo const &STI, MCInst const &MCI);
+
+unsigned packetSize(StringRef CPU);
+
+// Returns the maximum number of slots available in the given
+// subtarget's packets.
+unsigned packetSizeSlots(MCSubtargetInfo const &STI);
+
+// Returns the number of slots consumed by this packet, considering duplexed
+// and compound instructions.
+unsigned slotsConsumed(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
+ MCInst const &MCI);
+
+
// Pad the bundle with nops to satisfy endloop requirements
void padEndloop(MCInst &MCI, MCContext &Context);
class PredicateInfo {
@@ -324,6 +351,16 @@ bool subInstWouldBeExtended(MCInst const &potentialDuplex);
unsigned SubregisterBit(unsigned Consumer, unsigned Producer,
unsigned Producer2);
+bool IsVecRegSingle(unsigned VecReg);
+bool IsVecRegPair(unsigned VecReg);
+bool IsReverseVecRegPair(unsigned VecReg);
+bool IsSingleConsumerRefPairProducer(unsigned Producer, unsigned Consumer);
+
+/// Returns an ordered pair of the constituent register ordinals for
+/// each of the elements of \a VecRegPair. For example, Hexagon::W0 ("v0:1")
+/// returns { 0, 1 } and Hexagon::W1 ("v3:2") returns { 3, 2 }.
+std::pair<unsigned, unsigned> GetVecRegPairIndices(unsigned VecRegPair);
+
// Attempt to find and replace compound pairs
void tryCompound(MCInstrInfo const &MCII, MCSubtargetInfo const &STI,
MCContext &Context, MCInst &MCI);
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
index f8dc0547baad..7514d0e67744 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.cpp
@@ -10,13 +10,13 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/HexagonMCTargetDesc.h"
-#include "HexagonDepArch.h"
+#include "HexagonArch.h"
#include "HexagonTargetStreamer.h"
#include "MCTargetDesc/HexagonInstPrinter.h"
#include "MCTargetDesc/HexagonMCAsmInfo.h"
#include "MCTargetDesc/HexagonMCELFStreamer.h"
#include "MCTargetDesc/HexagonMCInstrInfo.h"
+#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "TargetInfo/HexagonTargetInfo.h"
#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/StringRef.h"
@@ -37,8 +37,10 @@
#include "llvm/Support/raw_ostream.h"
#include <cassert>
#include <cstdint>
+#include <mutex>
#include <new>
#include <string>
+#include <unordered_map>
using namespace llvm;
@@ -72,6 +74,10 @@ cl::opt<bool> MV65("mv65", cl::Hidden, cl::desc("Build for Hexagon V65"),
cl::init(false));
cl::opt<bool> MV66("mv66", cl::Hidden, cl::desc("Build for Hexagon V66"),
cl::init(false));
+cl::opt<bool> MV67("mv67", cl::Hidden, cl::desc("Build for Hexagon V67"),
+ cl::init(false));
+cl::opt<bool> MV67T("mv67t", cl::Hidden, cl::desc("Build for Hexagon V67T"),
+ cl::init(false));
cl::opt<Hexagon::ArchEnum>
EnableHVX("mhvx",
@@ -81,6 +87,7 @@ cl::opt<Hexagon::ArchEnum>
clEnumValN(Hexagon::ArchEnum::V62, "v62", "Build for HVX v62"),
clEnumValN(Hexagon::ArchEnum::V65, "v65", "Build for HVX v65"),
clEnumValN(Hexagon::ArchEnum::V66, "v66", "Build for HVX v66"),
+ clEnumValN(Hexagon::ArchEnum::V67, "v67", "Build for HVX v67"),
// Sentinel for no value specified.
clEnumValN(Hexagon::ArchEnum::Generic, "", "")),
// Sentinel for flag not present.
@@ -107,14 +114,22 @@ static StringRef HexagonGetArchVariant() {
return "hexagonv65";
if (MV66)
return "hexagonv66";
+ if (MV67)
+ return "hexagonv67";
+ if (MV67T)
+ return "hexagonv67t";
return "";
}
StringRef Hexagon_MC::selectHexagonCPU(StringRef CPU) {
StringRef ArchV = HexagonGetArchVariant();
if (!ArchV.empty() && !CPU.empty()) {
- if (ArchV != CPU)
- report_fatal_error("conflicting architectures specified.");
+ // Tiny cores have a "t" suffix that is discarded when creating a secondary
+ // non-tiny subtarget. See: addArchSubtarget
+ std::pair<StringRef, StringRef> ArchP = ArchV.split('t');
+ std::pair<StringRef, StringRef> CPUP = CPU.split('t');
+ if (!ArchP.first.equals(CPUP.first))
+ report_fatal_error("conflicting architectures specified.");
return CPU;
}
if (ArchV.empty()) {
@@ -127,6 +142,56 @@ StringRef Hexagon_MC::selectHexagonCPU(StringRef CPU) {
unsigned llvm::HexagonGetLastSlot() { return HexagonItinerariesV5FU::SLOT3; }
+unsigned llvm::HexagonConvertUnits(unsigned ItinUnits, unsigned *Lanes) {
+ enum {
+ CVI_NONE = 0,
+ CVI_XLANE = 1 << 0,
+ CVI_SHIFT = 1 << 1,
+ CVI_MPY0 = 1 << 2,
+ CVI_MPY1 = 1 << 3,
+ CVI_ZW = 1 << 4
+ };
+
+ if (ItinUnits == HexagonItinerariesV62FU::CVI_ALL ||
+ ItinUnits == HexagonItinerariesV62FU::CVI_ALL_NOMEM)
+ return (*Lanes = 4, CVI_XLANE);
+ else if (ItinUnits & HexagonItinerariesV62FU::CVI_MPY01 &&
+ ItinUnits & HexagonItinerariesV62FU::CVI_XLSHF)
+ return (*Lanes = 2, CVI_XLANE | CVI_MPY0);
+ else if (ItinUnits & HexagonItinerariesV62FU::CVI_MPY01)
+ return (*Lanes = 2, CVI_MPY0);
+ else if (ItinUnits & HexagonItinerariesV62FU::CVI_XLSHF)
+ return (*Lanes = 2, CVI_XLANE);
+ else if (ItinUnits & HexagonItinerariesV62FU::CVI_XLANE &&
+ ItinUnits & HexagonItinerariesV62FU::CVI_SHIFT &&
+ ItinUnits & HexagonItinerariesV62FU::CVI_MPY0 &&
+ ItinUnits & HexagonItinerariesV62FU::CVI_MPY1)
+ return (*Lanes = 1, CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1);
+ else if (ItinUnits & HexagonItinerariesV62FU::CVI_XLANE &&
+ ItinUnits & HexagonItinerariesV62FU::CVI_SHIFT)
+ return (*Lanes = 1, CVI_XLANE | CVI_SHIFT);
+ else if (ItinUnits & HexagonItinerariesV62FU::CVI_MPY0 &&
+ ItinUnits & HexagonItinerariesV62FU::CVI_MPY1)
+ return (*Lanes = 1, CVI_MPY0 | CVI_MPY1);
+ else if (ItinUnits == HexagonItinerariesV62FU::CVI_ZW)
+ return (*Lanes = 1, CVI_ZW);
+ else if (ItinUnits == HexagonItinerariesV62FU::CVI_XLANE)
+ return (*Lanes = 1, CVI_XLANE);
+ else if (ItinUnits == HexagonItinerariesV62FU::CVI_SHIFT)
+ return (*Lanes = 1, CVI_SHIFT);
+
+ return (*Lanes = 0, CVI_NONE);
+}
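HexagonConvertUnits folds the generated itinerary encoding into a compact CVI unit mask plus a lane count, handing both back through the comma operator. A reduced sketch covering just two of the cases, with local enums standing in for the HexagonItinerariesV62FU constants:

#include <cstdio>

enum CviUnit { CVI_NONE = 0, CVI_XLANE = 1 << 0, CVI_MPY0 = 1 << 2 };

// Hypothetical itinerary encodings, for illustration only.
enum ItinUnit { ITIN_ALL = 0xF0, ITIN_XLSHF = 0x03 };

unsigned convertUnits(unsigned ItinUnits, unsigned *Lanes) {
  if (ItinUnits == ITIN_ALL)
    return (*Lanes = 4, CVI_XLANE);        // wide op: one unit, four lanes
  if (ItinUnits & ITIN_XLSHF)
    return (*Lanes = 2, CVI_XLANE | CVI_MPY0);
  return (*Lanes = 0, CVI_NONE);           // core insn: no HVX pipes used
}

int main() {
  unsigned Lanes = 0;
  unsigned Units = convertUnits(ITIN_ALL, &Lanes);
  std::printf("units=0x%x lanes=%u\n", Units, Lanes); // units=0x1 lanes=4
}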
+
+
+namespace llvm {
+namespace HexagonFUnits {
+bool isSlot0Only(unsigned units) {
+ return HexagonItinerariesV62FU::SLOT0 == units;
+}
+} // namespace HexagonFUnits
+} // namespace llvm
+
namespace {
class HexagonTargetAsmStreamer : public HexagonTargetStreamer {
@@ -186,7 +251,7 @@ public:
}
- void EmitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+ void emitCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
unsigned AccessSize) override {
HexagonMCELFStreamer &HexagonELFStreamer =
@@ -195,7 +260,7 @@ public:
AccessSize);
}
- void EmitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
+ void emitLocalCommonSymbolSorted(MCSymbol *Symbol, uint64_t Size,
unsigned ByteAlignment,
unsigned AccessSize) override {
HexagonMCELFStreamer &HexagonELFStreamer =
@@ -225,9 +290,8 @@ static MCAsmInfo *createHexagonMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI = new HexagonMCAsmInfo(TT);
// VirtualFP = (R30 + #0).
- MCCFIInstruction Inst =
- MCCFIInstruction::createDefCfa(nullptr,
- MRI.getDwarfRegNum(Hexagon::R30, true), 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
+ nullptr, MRI.getDwarfRegNum(Hexagon::R30, true), 0);
MAI->addInitialFrameState(Inst);
return MAI;
@@ -296,40 +360,51 @@ std::string selectHexagonFS(StringRef CPU, StringRef FS) {
case Hexagon::ArchEnum::V66:
Result.push_back("+hvxv66");
break;
+ case Hexagon::ArchEnum::V67:
+ Result.push_back("+hvxv67");
+ break;
case Hexagon::ArchEnum::Generic:{
Result.push_back(StringSwitch<StringRef>(CPU)
.Case("hexagonv60", "+hvxv60")
.Case("hexagonv62", "+hvxv62")
.Case("hexagonv65", "+hvxv65")
- .Case("hexagonv66", "+hvxv66"));
+ .Case("hexagonv66", "+hvxv66")
+ .Case("hexagonv67", "+hvxv67")
+ .Case("hexagonv67t", "+hvxv67"));
break;
}
case Hexagon::ArchEnum::NoArch:
- // Sentinal if -mhvx isn't specified
+ // Sentinel if -mhvx isn't specified
break;
}
return join(Result.begin(), Result.end(), ",");
}
}
-static bool isCPUValid(std::string CPU)
-{
- std::vector<std::string> table {
- "generic", "hexagonv5", "hexagonv55", "hexagonv60",
- "hexagonv62", "hexagonv65", "hexagonv66",
- };
-
- return std::find(table.begin(), table.end(), CPU) != table.end();
+static bool isCPUValid(const std::string &CPU) {
+ return Hexagon::CpuTable.find(CPU) != Hexagon::CpuTable.cend();
}
namespace {
std::pair<std::string, std::string> selectCPUAndFS(StringRef CPU,
StringRef FS) {
std::pair<std::string, std::string> Result;
- Result.first = Hexagon_MC::selectHexagonCPU(CPU);
+ Result.first = std::string(Hexagon_MC::selectHexagonCPU(CPU));
Result.second = selectHexagonFS(Result.first, FS);
return Result;
}
+std::mutex ArchSubtargetMutex;
+std::unordered_map<std::string, std::unique_ptr<MCSubtargetInfo const>>
+ ArchSubtarget;
+} // namespace
+
+MCSubtargetInfo const *
+Hexagon_MC::getArchSubtarget(MCSubtargetInfo const *STI) {
+ std::lock_guard<std::mutex> Lock(ArchSubtargetMutex);
+ auto Existing = ArchSubtarget.find(std::string(STI->getCPU()));
+ if (Existing == ArchSubtarget.end())
+ return nullptr;
+ return Existing->second.get();
}
FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
@@ -338,7 +413,8 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
// turns on hvxvNN, corresponding to the existing ArchVNN.
FeatureBitset FB = S;
unsigned CpuArch = ArchV5;
- for (unsigned F : {ArchV66, ArchV65, ArchV62, ArchV60, ArchV55, ArchV5}) {
+ for (unsigned F : {ArchV67, ArchV66, ArchV65, ArchV62, ArchV60, ArchV55,
+ ArchV5}) {
if (!FB.test(F))
continue;
CpuArch = F;
@@ -353,7 +429,7 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
}
bool HasHvxVer = false;
for (unsigned F : {ExtensionHVXV60, ExtensionHVXV62, ExtensionHVXV65,
- ExtensionHVXV66}) {
+ ExtensionHVXV66, ExtensionHVXV67}) {
if (!FB.test(F))
continue;
HasHvxVer = true;
@@ -366,6 +442,9 @@ FeatureBitset Hexagon_MC::completeHVXFeatures(const FeatureBitset &S) {
// HasHvxVer is false, and UseHvx is true.
switch (CpuArch) {
+ case ArchV67:
+ FB.set(ExtensionHVXV67);
+ LLVM_FALLTHROUGH;
case ArchV66:
FB.set(ExtensionHVXV66);
LLVM_FALLTHROUGH;
@@ -389,22 +468,52 @@ MCSubtargetInfo *Hexagon_MC::createHexagonMCSubtargetInfo(const Triple &TT,
StringRef CPUName = Features.first;
StringRef ArchFS = Features.second;
+ MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
+ if (X != nullptr && (CPUName == "hexagonv67t"))
+ addArchSubtarget(X, ArchFS);
+
+ if (CPU.equals("help"))
+ exit(0);
+
if (!isCPUValid(CPUName.str())) {
errs() << "error: invalid CPU \"" << CPUName.str().c_str()
<< "\" specified\n";
return nullptr;
}
- MCSubtargetInfo *X = createHexagonMCSubtargetInfoImpl(TT, CPUName, ArchFS);
if (HexagonDisableDuplex) {
llvm::FeatureBitset Features = X->getFeatureBits();
X->setFeatureBits(Features.reset(Hexagon::FeatureDuplex));
}
X->setFeatureBits(completeHVXFeatures(X->getFeatureBits()));
+
+ // The Z-buffer instructions are grandfathered in for current
+ // architectures but omitted for new ones. Future instruction
+ // sets may introduce new/conflicting z-buffer instructions.
+ const bool ZRegOnDefault =
+ (CPUName == "hexagonv67") || (CPUName == "hexagonv66");
+ if (ZRegOnDefault) {
+ llvm::FeatureBitset Features = X->getFeatureBits();
+ X->setFeatureBits(Features.set(Hexagon::ExtensionZReg));
+ }
+
return X;
}
+void Hexagon_MC::addArchSubtarget(MCSubtargetInfo const *STI,
+ StringRef FS) {
+ assert(STI != nullptr);
+ if (STI->getCPU().contains("t")) {
+ auto ArchSTI = createHexagonMCSubtargetInfo(
+ STI->getTargetTriple(),
+ STI->getCPU().substr(0, STI->getCPU().size() - 1), FS);
+ std::lock_guard<std::mutex> Lock(ArchSubtargetMutex);
+ ArchSubtarget[std::string(STI->getCPU())] =
+ std::unique_ptr<MCSubtargetInfo const>(ArchSTI);
+ }
+}
+
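getArchSubtarget and addArchSubtarget share a mutex-guarded map from CPU name to an owned subtarget, so lookups are thread-safe and return nullptr when no compatible subtarget was registered. A generic sketch of that cache shape (the value type is a stand-in, not MCSubtargetInfo):

#include <iostream>
#include <memory>
#include <mutex>
#include <string>
#include <unordered_map>

struct Subtarget { std::string CPU; }; // stand-in for MCSubtargetInfo

namespace {
std::mutex CacheMutex;
std::unordered_map<std::string, std::unique_ptr<const Subtarget>> Cache;
} // namespace

void addArch(const std::string &CPU, std::unique_ptr<const Subtarget> STI) {
  std::lock_guard<std::mutex> Lock(CacheMutex);
  Cache[CPU] = std::move(STI);
}

const Subtarget *getArch(const std::string &CPU) {
  std::lock_guard<std::mutex> Lock(CacheMutex);
  auto It = Cache.find(CPU);
  return It == Cache.end() ? nullptr : It->second.get();
}

int main() {
  addArch("hexagonv67t",
          std::unique_ptr<const Subtarget>(new Subtarget{"hexagonv67"}));
  const Subtarget *S = getArch("hexagonv67t");
  std::cout << (S ? S->CPU : "none") << "\n"; // hexagonv67
}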
unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
static std::map<StringRef,unsigned> ElfFlags = {
{"hexagonv5", ELF::EF_HEXAGON_MACH_V5},
@@ -413,6 +522,8 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
{"hexagonv62", ELF::EF_HEXAGON_MACH_V62},
{"hexagonv65", ELF::EF_HEXAGON_MACH_V65},
{"hexagonv66", ELF::EF_HEXAGON_MACH_V66},
+ {"hexagonv67", ELF::EF_HEXAGON_MACH_V67},
+ {"hexagonv67t", ELF::EF_HEXAGON_MACH_V67T},
};
auto F = ElfFlags.find(STI.getCPU());
@@ -420,6 +531,10 @@ unsigned Hexagon_MC::GetELFFlags(const MCSubtargetInfo &STI) {
return F->second;
}
+llvm::ArrayRef<MCPhysReg> Hexagon_MC::GetVectRegRev() {
+ return makeArrayRef(VectRegRev);
+}
+
namespace {
class HexagonMCInstrAnalysis : public MCInstrAnalysis {
public:
@@ -437,6 +552,10 @@ public:
bool evaluateBranch(MCInst const &Inst, uint64_t Addr,
uint64_t Size, uint64_t &Target) const override {
+ if (!(isCall(Inst) || isUnconditionalBranch(Inst) ||
+ isConditionalBranch(Inst)))
+ return false;
+
//assert(!HexagonMCInstrInfo::isBundle(Inst));
if(!HexagonMCInstrInfo::isExtendable(*Info, Inst))
return false;
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
index 7b42460a2a1c..5bf7c9a1a908 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonMCTargetDesc.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
#define LLVM_LIB_TARGET_HEXAGON_MCTARGETDESC_HEXAGONMCTARGETDESC_H
+#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/CommandLine.h"
#include <cstdint>
#include <string>
@@ -46,7 +47,6 @@
namespace llvm {
-struct InstrItinerary;
struct InstrStage;
class FeatureBitset;
class MCAsmBackend;
@@ -60,8 +60,6 @@ class MCTargetOptions;
class Target;
class Triple;
class StringRef;
-class raw_ostream;
-class raw_pwrite_stream;
extern cl::opt<bool> HexagonDisableCompound;
extern cl::opt<bool> HexagonDisableDuplex;
@@ -78,7 +76,12 @@ namespace Hexagon_MC {
/// etc. do not need to go through TargetRegistry.
MCSubtargetInfo *createHexagonMCSubtargetInfo(const Triple &TT, StringRef CPU,
StringRef FS);
+ MCSubtargetInfo const *getArchSubtarget(MCSubtargetInfo const *STI);
+ void addArchSubtarget(MCSubtargetInfo const *STI,
+ StringRef FS);
unsigned GetELFFlags(const MCSubtargetInfo &STI);
+
+ llvm::ArrayRef<MCPhysReg> GetVectRegRev();
}
MCCodeEmitter *createHexagonMCCodeEmitter(const MCInstrInfo &MCII,
@@ -94,6 +97,7 @@ std::unique_ptr<MCObjectTargetWriter>
createHexagonELFObjectWriter(uint8_t OSABI, StringRef CPU);
unsigned HexagonGetLastSlot();
+unsigned HexagonConvertUnits(unsigned ItinUnits, unsigned *Lanes);
} // End llvm namespace
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
index 18c7790a17cc..2788b86181e2 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.cpp
@@ -105,62 +105,30 @@ unsigned HexagonResource::setWeight(unsigned s) {
return Weight;
}
-void HexagonCVIResource::SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU) {
- (*TUL)[HexagonII::TypeCVI_VA] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_VA_DV] = UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
- (*TUL)[HexagonII::TypeCVI_VX] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_VX_LATE] = UnitsAndLanes(CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_VX_DV] = UnitsAndLanes(CVI_MPY0, 2);
- (*TUL)[HexagonII::TypeCVI_VP] = UnitsAndLanes(CVI_XLANE, 1);
- (*TUL)[HexagonII::TypeCVI_VP_VS] = UnitsAndLanes(CVI_XLANE, 2);
- (*TUL)[HexagonII::TypeCVI_VS] = UnitsAndLanes(CVI_SHIFT, 1);
- (*TUL)[HexagonII::TypeCVI_VS_VX] = UnitsAndLanes(CVI_XLANE | CVI_SHIFT, 1);
- (*TUL)[HexagonII::TypeCVI_VINLANESAT] =
- (CPU == "hexagonv60")
- ? UnitsAndLanes(CVI_SHIFT, 1)
- : UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_VM_LD] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_VM_TMP_LD] = UnitsAndLanes(CVI_NONE, 0);
- (*TUL)[HexagonII::TypeCVI_VM_VP_LDU] = UnitsAndLanes(CVI_XLANE, 1);
- (*TUL)[HexagonII::TypeCVI_VM_ST] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_VM_NEW_ST] = UnitsAndLanes(CVI_NONE, 0);
- (*TUL)[HexagonII::TypeCVI_VM_STU] = UnitsAndLanes(CVI_XLANE, 1);
- (*TUL)[HexagonII::TypeCVI_HIST] = UnitsAndLanes(CVI_XLANE, 4);
- (*TUL)[HexagonII::TypeCVI_GATHER] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_SCATTER] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_SCATTER_DV] =
- UnitsAndLanes(CVI_XLANE | CVI_MPY0, 2);
- (*TUL)[HexagonII::TypeCVI_SCATTER_NEW_ST] =
- UnitsAndLanes(CVI_XLANE | CVI_SHIFT | CVI_MPY0 | CVI_MPY1, 1);
- (*TUL)[HexagonII::TypeCVI_4SLOT_MPY] = UnitsAndLanes(CVI_XLANE, 4);
- (*TUL)[HexagonII::TypeCVI_ZW] = UnitsAndLanes(CVI_ZW, 1);
-}
-
-HexagonCVIResource::HexagonCVIResource(TypeUnitsAndLanes *TUL,
- MCInstrInfo const &MCII, unsigned s,
+HexagonCVIResource::HexagonCVIResource(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
+ unsigned s,
MCInst const *id)
: HexagonResource(s) {
- unsigned T = HexagonMCInstrInfo::getType(MCII, *id);
- if (TUL->count(T)) {
- // For an HVX insn.
- Valid = true;
- setUnits((*TUL)[T].first);
- setLanes((*TUL)[T].second);
- setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad());
- setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore());
- } else {
+ const unsigned ItinUnits = HexagonMCInstrInfo::getCVIResources(MCII, STI, *id);
+ unsigned Lanes;
+ const unsigned Units = HexagonConvertUnits(ItinUnits, &Lanes);
+
+ if (Units == 0 && Lanes == 0) {
// For core insns.
Valid = false;
setUnits(0);
setLanes(0);
setLoad(false);
setStore(false);
+ } else {
+ // For an HVX insn.
+ Valid = true;
+ setUnits(Units);
+ setLanes(Lanes);
+ setLoad(HexagonMCInstrInfo::getDesc(MCII, *id).mayLoad());
+ setStore(HexagonMCInstrInfo::getDesc(MCII, *id).mayStore());
}
}
@@ -201,124 +169,293 @@ HexagonShuffler::HexagonShuffler(MCContext &Context, bool ReportErrors,
MCSubtargetInfo const &STI)
: Context(Context), MCII(MCII), STI(STI), ReportErrors(ReportErrors) {
reset();
- HexagonCVIResource::SetupTUL(&TUL, STI.getCPU());
}
void HexagonShuffler::reset() {
Packet.clear();
BundleFlags = 0;
+ CheckFailure = false;
}
void HexagonShuffler::append(MCInst const &ID, MCInst const *Extender,
unsigned S) {
- HexagonInstr PI(&TUL, MCII, &ID, Extender, S);
+ HexagonInstr PI(MCII, STI, &ID, Extender, S);
Packet.push_back(PI);
}
-static struct {
- unsigned first;
- unsigned second;
-} jumpSlots[] = {{8, 4}, {8, 2}, {8, 1}, {4, 2}, {4, 1}, {2, 1}};
-#define MAX_JUMP_SLOTS (sizeof(jumpSlots) / sizeof(jumpSlots[0]))
-void HexagonShuffler::restrictSlot1AOK() {
- bool HasRestrictSlot1AOK = false;
- SMLoc RestrictLoc;
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const &Inst = ISJ->getDesc();
- if (HexagonMCInstrInfo::isRestrictSlot1AOK(MCII, Inst)) {
- HasRestrictSlot1AOK = true;
- RestrictLoc = Inst.getLoc();
- }
- }
- if (HasRestrictSlot1AOK)
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const &Inst = ISJ->getDesc();
- unsigned Type = HexagonMCInstrInfo::getType(MCII, Inst);
+static const unsigned Slot0Mask = 1 << 0;
+static const unsigned Slot1Mask = 1 << 1;
+static const unsigned Slot3Mask = 1 << 3;
+static const unsigned slotSingleLoad = Slot0Mask;
+static const unsigned slotSingleStore = Slot0Mask;
+
+void HexagonShuffler::restrictSlot1AOK(HexagonPacketSummary const &Summary) {
+ if (Summary.Slot1AOKLoc)
+ for (HexagonInstr &ISJ : insts()) {
+ MCInst const &Inst = ISJ.getDesc();
+ const unsigned Type = HexagonMCInstrInfo::getType(MCII, Inst);
if (Type != HexagonII::TypeALU32_2op &&
Type != HexagonII::TypeALU32_3op &&
Type != HexagonII::TypeALU32_ADDI) {
- unsigned Units = ISJ->Core.getUnits();
- if (Units & 2U) {
+ const unsigned Units = ISJ.Core.getUnits();
+
+ if (Units & Slot1Mask) {
AppliedRestrictions.push_back(std::make_pair(
Inst.getLoc(),
"Instruction was restricted from being in slot 1"));
- AppliedRestrictions.push_back(
- std::make_pair(RestrictLoc, "Instruction can only be combine "
- "with an ALU instruction in slot 1"));
- ISJ->Core.setUnits(Units & ~2U);
+ AppliedRestrictions.push_back(std::make_pair(
+ *Summary.Slot1AOKLoc, "Instruction can only be combined "
+ "with an ALU instruction in slot 1"));
+ ISJ.Core.setUnits(Units & ~Slot1Mask);
}
}
}
}
-void HexagonShuffler::restrictNoSlot1Store() {
- bool HasRestrictNoSlot1Store = false;
- SMLoc RestrictLoc;
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const &Inst = ISJ->getDesc();
- if (HexagonMCInstrInfo::isRestrictNoSlot1Store(MCII, Inst)) {
- HasRestrictNoSlot1Store = true;
- RestrictLoc = Inst.getLoc();
+void HexagonShuffler::restrictNoSlot1Store(
+ HexagonPacketSummary const &Summary) {
+ // If this packet contains an instruction that bars slot-1 stores,
+ // we should mask off slot 1 from all of the store instructions in
+ // this packet.
+
+ if (!Summary.NoSlot1StoreLoc)
+ return;
+
+ bool AppliedRestriction = false;
+
+ for (HexagonInstr &ISJ : insts()) {
+ MCInst const &Inst = ISJ.getDesc();
+ if (HexagonMCInstrInfo::getDesc(MCII, Inst).mayStore()) {
+ unsigned Units = ISJ.Core.getUnits();
+ if (Units & Slot1Mask) {
+ AppliedRestriction = true;
+ AppliedRestrictions.push_back(std::make_pair(
+ Inst.getLoc(), "Instruction was restricted from being in slot 1"));
+ ISJ.Core.setUnits(Units & ~Slot1Mask);
+ }
}
}
- if (HasRestrictNoSlot1Store) {
- bool AppliedRestriction = false;
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const &Inst = ISJ->getDesc();
- if (HexagonMCInstrInfo::getDesc(MCII, Inst).mayStore()) {
- unsigned Units = ISJ->Core.getUnits();
- if (Units & 2U) {
- AppliedRestriction = true;
- AppliedRestrictions.push_back(std::make_pair(
- Inst.getLoc(),
- "Instruction was restricted from being in slot 1"));
- ISJ->Core.setUnits(Units & ~2U);
+
+ if (AppliedRestriction)
+ AppliedRestrictions.push_back(
+ std::make_pair(*Summary.NoSlot1StoreLoc,
+ "Instruction does not allow a store in slot 1"));
+}
+
+bool HexagonShuffler::applySlotRestrictions(
+ HexagonPacketSummary const &Summary) {
+ // These restrictions can modify the slot masks in the instructions
+ // in the Packet member. They should run unconditionally and their
+ // order does not matter.
+ restrictSlot1AOK(Summary);
+ restrictNoSlot1Store(Summary);
+
+ permitNonSlot();
+
+ // These restrictions can modify the slot masks in the instructions
+ // in the Packet member, but they can also detect constraint failures
+ // which are fatal.
+ if (!CheckFailure)
+ restrictStoreLoadOrder(Summary);
+ if (!CheckFailure)
+ restrictBranchOrder(Summary);
+ if (!CheckFailure)
+ restrictPreferSlot3(Summary);
+ return !CheckFailure;
+}
+
+void HexagonShuffler::restrictBranchOrder(HexagonPacketSummary const &Summary) {
+ // preserve branch order
+ const bool HasMultipleBranches = Summary.branchInsts.size() > 1;
+ if (!HasMultipleBranches)
+ return;
+
+ if (Summary.branchInsts.size() > 2) {
+ reportError(Twine("too many branches in packet"));
+ return;
+ }
+
+ const static std::pair<unsigned, unsigned> jumpSlots[] = {
+ {8, 4}, {8, 2}, {8, 1}, {4, 2}, {4, 1}, {2, 1}};
+ // try all possible choices
+ for (std::pair<unsigned, unsigned> jumpSlot : jumpSlots) {
+ // validate first jump with this slot rule
+ if (!(jumpSlot.first & Summary.branchInsts[0]->Core.getUnits()))
+ continue;
+
+ // validate second jump with this slot rule
+ if (!(jumpSlot.second & Summary.branchInsts[1]->Core.getUnits()))
+ continue;
+
+ // both valid for this configuration, set new slot rules
+ const HexagonPacket PacketSave = Packet;
+ Summary.branchInsts[0]->Core.setUnits(jumpSlot.first);
+ Summary.branchInsts[1]->Core.setUnits(jumpSlot.second);
+
+ const bool HasShuffledPacket = tryAuction(Summary).hasValue();
+ if (HasShuffledPacket)
+ return;
+
+ // This configuration did not shuffle; restore the original slot masks
+ // and try the next candidate pair.
+ Packet = PacketSave;
+ }
+
+ reportError("invalid instruction packet: out of slots");
+}
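restrictBranchOrder tries each legal (slot-for-first-jump, slot-for-second-jump) pair, keeps the first assignment the auction accepts, and restores the saved packet otherwise. The loop shape, reduced to plain unit masks (tryPlacement is a placeholder for tryAuction):

#include <functional>
#include <iostream>
#include <utility>

// Candidate slot pairs for two branches, highest slots first (mirrors jumpSlots).
static const std::pair<unsigned, unsigned> JumpSlots[] = {
    {8, 4}, {8, 2}, {8, 1}, {4, 2}, {4, 1}, {2, 1}};

// Returns true when some pair is compatible with both branches' unit masks
// and the placement callback accepts it; otherwise leaves the masks untouched.
bool orderBranches(unsigned &Br0Units, unsigned &Br1Units,
                   const std::function<bool(unsigned, unsigned)> &tryPlacement) {
  const unsigned Save0 = Br0Units, Save1 = Br1Units;
  for (auto JS : JumpSlots) {
    if (!(JS.first & Br0Units) || !(JS.second & Br1Units))
      continue;             // this pair is not legal for these insns
    Br0Units = JS.first;
    Br1Units = JS.second;
    if (tryPlacement(Br0Units, Br1Units))
      return true;          // placement succeeded with this assignment
    Br0Units = Save0;       // restore and try the next pair
    Br1Units = Save1;
  }
  return false;             // out of slots
}

int main() {
  unsigned B0 = 0xF, B1 = 0xF;
  bool Ok = orderBranches(B0, B1, [](unsigned A, unsigned B) { return A > B; });
  std::cout << Ok << " " << B0 << " " << B1 << "\n"; // 1 8 4
}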
+
+
+void HexagonShuffler::permitNonSlot() {
+ for (HexagonInstr &ISJ : insts()) {
+ const bool RequiresSlot = HexagonMCInstrInfo::requiresSlot(STI, *ISJ.ID);
+ if (!RequiresSlot)
+ ISJ.Core.setAllUnits();
+ }
+}
+
+bool HexagonShuffler::ValidResourceUsage(HexagonPacketSummary const &Summary) {
+ Optional<HexagonPacket> ShuffledPacket = tryAuction(Summary);
+
+ if (!ShuffledPacket) {
+ reportError("invalid instruction packet: slot error");
+ return false;
+ } else {
+ Packet = *ShuffledPacket;
+ }
+
+ // Verify the CVI slot subscriptions.
+ std::stable_sort(begin(), end(), HexagonInstr::lessCVI);
+ // create vector of hvx instructions to check
+ HVXInstsT hvxInsts;
+ hvxInsts.clear();
+ for (const_iterator I = cbegin(); I != cend(); ++I) {
+ struct CVIUnits inst;
+ inst.Units = I->CVI.getUnits();
+ inst.Lanes = I->CVI.getLanes();
+ if (inst.Units == 0)
+ continue; // not an hvx inst or an hvx inst that doesn't use any pipes
+ hvxInsts.push_back(inst);
+ }
+
+ // if there are any hvx instructions in this packet, check pipe usage
+ if (hvxInsts.size() > 0) {
+ unsigned startIdx, usedUnits;
+ startIdx = usedUnits = 0x0;
+ if (!checkHVXPipes(hvxInsts, startIdx, usedUnits)) {
+ // Too many pipes are in use for the packet to be valid.
+ reportError(Twine("invalid instruction packet: slot error"));
+ return false;
+ }
+ }
+ return true;
+}
+
+bool HexagonShuffler::restrictStoreLoadOrder(
+ HexagonPacketSummary const &Summary) {
+ // Modify packet accordingly.
+ // TODO: need to reserve slots #0 and #1 for duplex insns.
+ static const unsigned slotFirstLoadStore = Slot1Mask;
+ static const unsigned slotLastLoadStore = Slot0Mask;
+ unsigned slotLoadStore = slotFirstLoadStore;
+
+ for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
+ MCInst const &ID = ISJ->getDesc();
+
+ if (!ISJ->Core.getUnits())
+ // Error if insn may not be executed in any slot.
+ return false;
+
+ // A single load must use slot #0.
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
+ if (Summary.loads == 1 && Summary.loads == Summary.memory &&
+ Summary.memops == 0)
+ // Pin the load to slot #0.
+ switch (ID.getOpcode()) {
+ case Hexagon::V6_vgathermw:
+ case Hexagon::V6_vgathermh:
+ case Hexagon::V6_vgathermhw:
+ case Hexagon::V6_vgathermwq:
+ case Hexagon::V6_vgathermhq:
+ case Hexagon::V6_vgathermhwq:
+ // Slot1 only loads
+ break;
+ default:
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
+ break;
+ }
+ else if (Summary.loads >= 1 && isMemReorderDisabled()) { // }:mem_noshuf
+ // Loads must keep the original order ONLY if
+ // isMemReorderDisabled() == true
+ if (slotLoadStore < slotLastLoadStore) {
+ // Error if no more slots available for loads.
+ reportError("invalid instruction packet: too many loads");
+ return false;
+ }
+ // Pin the load to the highest slot available to it.
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
+ // Update the next highest slot available to loads.
+ slotLoadStore >>= 1;
+ }
+ }
+
+ // A single store must use slot #0.
+ if (HexagonMCInstrInfo::getDesc(MCII, ID).mayStore()) {
+ if (!Summary.store0) {
+ const bool PacketHasNoOnlySlot0 =
+ llvm::none_of(insts(), [&](HexagonInstr const &I) {
+ return I.Core.getUnits() == Slot0Mask &&
+ I.ID->getOpcode() != ID.getOpcode();
+ });
+ const bool SafeToMoveToSlot0 =
+ (Summary.loads == 0) ||
+ (!isMemReorderDisabled() && PacketHasNoOnlySlot0);
+
+ if (Summary.stores == 1 && SafeToMoveToSlot0)
+ // Pin the store to slot #0 only if isMemReorderDisabled() == false
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
+ else if (Summary.stores >= 1) {
+ if (slotLoadStore < slotLastLoadStore) {
+ // Error if no more slots available for stores.
+ reportError("invalid instruction packet: too many stores");
+ return false;
+ }
+ // Pin the store to the highest slot available to it.
+ ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
+ // Update the next highest slot available to stores.
+ slotLoadStore >>= 1;
}
}
+ if (Summary.store1 && Summary.stores > 1) {
+ // Error if a single store with another store.
+ reportError("invalid instruction packet: too many stores");
+ return false;
+ }
}
- if (AppliedRestriction)
- AppliedRestrictions.push_back(std::make_pair(
- RestrictLoc, "Instruction does not allow a store in slot 1"));
}
-}
-void HexagonShuffler::applySlotRestrictions() {
- restrictSlot1AOK();
- restrictNoSlot1Store();
+ return true;
}
-/// Check that the packet is legal and enforce relative insn order.
-bool HexagonShuffler::check() {
- // Descriptive slot masks.
- const unsigned slotSingleLoad = 0x1, slotSingleStore = 0x1,
- slotThree = 0x8, // slotFirstJump = 0x8,
- slotFirstLoadStore = 0x2, slotLastLoadStore = 0x1;
- // Highest slots for branches and stores used to keep their original order.
- // unsigned slotJump = slotFirstJump;
- unsigned slotLoadStore = slotFirstLoadStore;
- // Number of memory operations, loads, solo loads, stores, solo stores, single
- // stores.
- unsigned memory = 0, loads = 0, load0 = 0, stores = 0, store0 = 0, store1 = 0;
- unsigned NonZCVIloads = 0, AllCVIloads = 0, CVIstores = 0;
- // Number of duplex insns
- unsigned duplex = 0;
- unsigned pSlot3Cnt = 0;
- unsigned memops = 0;
- iterator slot3ISJ = end();
- std::vector<iterator> foundBranches;
- unsigned reservedSlots = 0;
+HexagonShuffler::HexagonPacketSummary HexagonShuffler::GetPacketSummary() {
+ HexagonPacketSummary Summary = HexagonPacketSummary();
// Collect information from the insns in the packet.
for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
MCInst const &ID = ISJ->getDesc();
+ if (HexagonMCInstrInfo::isRestrictSlot1AOK(MCII, ID))
+ Summary.Slot1AOKLoc = ID.getLoc();
+ if (HexagonMCInstrInfo::isRestrictNoSlot1Store(MCII, ID))
+ Summary.NoSlot1StoreLoc = ID.getLoc();
+
if (HexagonMCInstrInfo::prefersSlot3(MCII, ID)) {
- ++pSlot3Cnt;
- slot3ISJ = ISJ;
+ ++Summary.pSlot3Cnt;
+ Summary.PrefSlot3Inst = ISJ;
}
- reservedSlots |= HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID);
+ Summary.ReservedSlotMask |=
+ HexagonMCInstrInfo::getOtherReservedSlots(MCII, STI, ID);
switch (HexagonMCInstrInfo::getType(MCII, ID)) {
case HexagonII::TypeS_2op:
@@ -326,26 +463,27 @@ bool HexagonShuffler::check() {
case HexagonII::TypeALU64:
break;
case HexagonII::TypeJ:
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
break;
case HexagonII::TypeCVI_VM_VP_LDU:
case HexagonII::TypeCVI_VM_LD:
case HexagonII::TypeCVI_VM_TMP_LD:
case HexagonII::TypeCVI_GATHER:
+ case HexagonII::TypeCVI_GATHER_DV:
case HexagonII::TypeCVI_GATHER_RST:
- ++NonZCVIloads;
+ ++Summary.NonZCVIloads;
LLVM_FALLTHROUGH;
case HexagonII::TypeCVI_ZW:
- ++AllCVIloads;
+ ++Summary.AllCVIloads;
LLVM_FALLTHROUGH;
case HexagonII::TypeLD:
- ++loads;
- ++memory;
+ ++Summary.loads;
+ ++Summary.memory;
if (ISJ->Core.getUnits() == slotSingleLoad ||
HexagonMCInstrInfo::getType(MCII, ID) == HexagonII::TypeCVI_VM_VP_LDU)
- ++load0;
+ ++Summary.load0;
if (HexagonMCInstrInfo::getDesc(MCII, ID).isReturn())
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
break;
case HexagonII::TypeCVI_VM_STU:
case HexagonII::TypeCVI_VM_ST:
@@ -355,266 +493,143 @@ bool HexagonShuffler::check() {
case HexagonII::TypeCVI_SCATTER_RST:
case HexagonII::TypeCVI_SCATTER_NEW_RST:
case HexagonII::TypeCVI_SCATTER_NEW_ST:
- ++CVIstores;
+ ++Summary.CVIstores;
LLVM_FALLTHROUGH;
case HexagonII::TypeST:
- ++stores;
- ++memory;
+ ++Summary.stores;
+ ++Summary.memory;
if (ISJ->Core.getUnits() == slotSingleStore ||
HexagonMCInstrInfo::getType(MCII, ID) == HexagonII::TypeCVI_VM_STU)
- ++store0;
+ ++Summary.store0;
break;
case HexagonII::TypeV4LDST:
- ++loads;
- ++stores;
- ++store1;
- ++memops;
- ++memory;
+ ++Summary.loads;
+ ++Summary.stores;
+ ++Summary.store1;
+ ++Summary.memops;
+ ++Summary.memory;
break;
case HexagonII::TypeNCJ:
- ++memory; // NV insns are memory-like.
- foundBranches.push_back(ISJ);
+ ++Summary.memory; // NV insns are memory-like.
+ Summary.branchInsts.push_back(ISJ);
break;
case HexagonII::TypeV2LDST:
if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
- ++loads;
- ++memory;
+ ++Summary.loads;
+ ++Summary.memory;
if (ISJ->Core.getUnits() == slotSingleLoad ||
HexagonMCInstrInfo::getType(MCII, ID) ==
HexagonII::TypeCVI_VM_VP_LDU)
- ++load0;
+ ++Summary.load0;
} else {
assert(HexagonMCInstrInfo::getDesc(MCII, ID).mayStore());
- ++memory;
- ++stores;
+ ++Summary.memory;
+ ++Summary.stores;
}
break;
case HexagonII::TypeCR:
// Legacy conditional branch predicated on a register.
case HexagonII::TypeCJ:
if (HexagonMCInstrInfo::getDesc(MCII, ID).isBranch())
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
break;
case HexagonII::TypeDUPLEX: {
- ++duplex;
+ ++Summary.duplex;
MCInst const &Inst0 = *ID.getOperand(0).getInst();
MCInst const &Inst1 = *ID.getOperand(1).getInst();
if (HexagonMCInstrInfo::getDesc(MCII, Inst0).isBranch())
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
if (HexagonMCInstrInfo::getDesc(MCII, Inst1).isBranch())
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
if (HexagonMCInstrInfo::getDesc(MCII, Inst0).isReturn())
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
if (HexagonMCInstrInfo::getDesc(MCII, Inst1).isReturn())
- foundBranches.push_back(ISJ);
+ Summary.branchInsts.push_back(ISJ);
break;
}
}
}
- applySlotRestrictions();
+ return Summary;
+}
+bool HexagonShuffler::ValidPacketMemoryOps(
+ HexagonPacketSummary const &Summary) const {
// Check if the packet is legal.
- const unsigned ZCVIloads = AllCVIloads - NonZCVIloads;
+ const unsigned ZCVIloads = Summary.AllCVIloads - Summary.NonZCVIloads;
const bool ValidHVXMem =
- NonZCVIloads <= 1 && ZCVIloads <= 1 && CVIstores <= 1;
- if ((load0 > 1 || store0 > 1 || !ValidHVXMem) ||
- (duplex > 1 || (duplex && memory))) {
- reportError(llvm::Twine("invalid instruction packet"));
- return false;
- }
-
- // Modify packet accordingly.
- // TODO: need to reserve slots #0 and #1 for duplex insns.
- bool bOnlySlot3 = false;
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const &ID = ISJ->getDesc();
-
- if (!ISJ->Core.getUnits()) {
- // Error if insn may not be executed in any slot.
- return false;
- }
-
- // A single load must use slot #0.
- if (HexagonMCInstrInfo::getDesc(MCII, ID).mayLoad()) {
- if (loads == 1 && loads == memory && memops == 0)
- // Pin the load to slot #0.
- switch (ID.getOpcode()) {
- case Hexagon::V6_vgathermw:
- case Hexagon::V6_vgathermh:
- case Hexagon::V6_vgathermhw:
- case Hexagon::V6_vgathermwq:
- case Hexagon::V6_vgathermhq:
- case Hexagon::V6_vgathermhwq:
- // Slot1 only loads
- break;
- default:
- ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleLoad);
- break;
- }
- else if (loads >= 1 && isMemReorderDisabled()) { // }:mem_noshuf
- // Loads must keep the original order ONLY if
- // isMemReorderDisabled() == true
- if (slotLoadStore < slotLastLoadStore) {
- // Error if no more slots available for loads.
- reportError(
- llvm::Twine("invalid instruction packet: too many loads"));
- return false;
- }
- // Pin the load to the highest slot available to it.
- ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
- // Update the next highest slot available to loads.
- slotLoadStore >>= 1;
- }
- }
+ Summary.NonZCVIloads <= 1 && ZCVIloads <= 1 && Summary.CVIstores <= 1;
+ const bool InvalidPacket =
+ ((Summary.load0 > 1 || Summary.store0 > 1 || !ValidHVXMem) ||
+ (Summary.duplex > 1 || (Summary.duplex && Summary.memory)));
- // A single store must use slot #0.
- if (HexagonMCInstrInfo::getDesc(MCII, ID).mayStore()) {
- if (!store0) {
- if (stores == 1 && (loads == 0 || !isMemReorderDisabled()))
- // Pin the store to slot #0 only if isMemReorderDisabled() == false
- ISJ->Core.setUnits(ISJ->Core.getUnits() & slotSingleStore);
- else if (stores >= 1) {
- if (slotLoadStore < slotLastLoadStore) {
- // Error if no more slots available for stores.
- reportError(Twine("invalid instruction packet: too many stores"));
- return false;
- }
- // Pin the store to the highest slot available to it.
- ISJ->Core.setUnits(ISJ->Core.getUnits() & slotLoadStore);
- // Update the next highest slot available to stores.
- slotLoadStore >>= 1;
- }
- }
- if (store1 && stores > 1) {
- // Error if a single store with another store.
- reportError(Twine("invalid instruction packet: too many stores"));
- return false;
- }
- }
-
- // flag if an instruction requires to be in slot 3
- if (ISJ->Core.getUnits() == slotThree)
- bOnlySlot3 = true;
-
- if (!ISJ->Core.getUnits()) {
- // Error if insn may not be executed in any slot.
- reportError(Twine("invalid instruction packet: out of slots"));
- return false;
- }
- }
-
- // preserve branch order
- bool validateSlots = true;
- if (foundBranches.size() > 1) {
- if (foundBranches.size() > 2) {
- reportError(Twine("too many branches in packet"));
- return false;
- }
-
- // try all possible choices
- for (unsigned int i = 0; i < MAX_JUMP_SLOTS; ++i) {
- // validate first jump with this slot rule
- if (!(jumpSlots[i].first & foundBranches[0]->Core.getUnits()))
- continue;
-
- // validate second jump with this slot rule
- if (!(jumpSlots[i].second & foundBranches[1]->Core.getUnits()))
- continue;
-
- // both valid for this configuration, set new slot rules
- PacketSave = Packet;
- foundBranches[0]->Core.setUnits(jumpSlots[i].first);
- foundBranches[1]->Core.setUnits(jumpSlots[i].second);
-
- HexagonUnitAuction AuctionCore(reservedSlots);
- std::stable_sort(begin(), end(), HexagonInstr::lessCore);
-
- // see if things ok with that instruction being pinned to slot "slotJump"
- bool bFail = false;
- for (iterator I = begin(); I != end() && !bFail; ++I)
- if (!AuctionCore.bid(I->Core.getUnits()))
- bFail = true;
-
- // if yes, great, if not then restore original slot mask
- if (!bFail) {
- validateSlots = false; // all good, no need to re-do auction
- break;
- } else
- // restore original values
- Packet = PacketSave;
- }
- if (validateSlots) {
- reportError(Twine("invalid instruction packet: out of slots"));
- return false;
- }
- }
-
- if (foundBranches.size() <= 1 && bOnlySlot3 == false && pSlot3Cnt == 1 &&
- slot3ISJ != end()) {
- validateSlots = true;
- // save off slot mask of instruction marked with A_PREFER_SLOT3
- // and then pin it to slot #3
- unsigned saveUnits = slot3ISJ->Core.getUnits();
- slot3ISJ->Core.setUnits(saveUnits & slotThree);
+ return !InvalidPacket;
+}
- HexagonUnitAuction AuctionCore(reservedSlots);
- std::stable_sort(begin(), end(), HexagonInstr::lessCore);
+void HexagonShuffler::restrictPreferSlot3(HexagonPacketSummary const &Summary) {
+ // Flag if an instruction is required to be in slot 3.
+ const bool HasOnlySlot3 = llvm::any_of(insts(), [&](HexagonInstr const &I) {
+ return (I.Core.getUnits() == Slot3Mask);
+ });
+ const bool NeedsPrefSlot3Shuffle =
+ (Summary.branchInsts.size() <= 1 && !HasOnlySlot3 &&
+ Summary.pSlot3Cnt == 1 && Summary.PrefSlot3Inst);
+
+ if (!NeedsPrefSlot3Shuffle)
+ return;
+
+ HexagonInstr *PrefSlot3Inst = *Summary.PrefSlot3Inst;
+ // save off slot mask of instruction marked with A_PREFER_SLOT3
+ // and then pin it to slot #3
+ const unsigned saveUnits = PrefSlot3Inst->Core.getUnits();
+ PrefSlot3Inst->Core.setUnits(saveUnits & Slot3Mask);
+ const bool HasShuffledPacket = tryAuction(Summary).hasValue();
+ if (HasShuffledPacket)
+ return;
+
+ PrefSlot3Inst->Core.setUnits(saveUnits);
+}
- // see if things ok with that instruction being pinned to slot #3
- bool bFail = false;
- for (iterator I = begin(); I != end() && !bFail; ++I)
- if (!AuctionCore.bid(I->Core.getUnits()))
- bFail = true;
+/// Check that the packet is legal and enforce relative insn order.
+bool HexagonShuffler::check() {
+ const HexagonPacketSummary Summary = GetPacketSummary();
+ if (!applySlotRestrictions(Summary))
+ return false;
- // if yes, great, if not then restore original slot mask
- if (!bFail)
- validateSlots = false; // all good, no need to re-do auction
- else
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ) {
- MCInst const &ID = ISJ->getDesc();
- if (HexagonMCInstrInfo::prefersSlot3(MCII, ID))
- ISJ->Core.setUnits(saveUnits);
- }
+ if (!ValidPacketMemoryOps(Summary)) {
+ reportError("invalid instruction packet");
+ return false;
}
- // Check if any slot, core or CVI, is over-subscribed.
- // Verify the core slot subscriptions.
- if (validateSlots) {
- HexagonUnitAuction AuctionCore(reservedSlots);
+ ValidResourceUsage(Summary);
- std::stable_sort(begin(), end(), HexagonInstr::lessCore);
-
- for (iterator I = begin(); I != end(); ++I)
- if (!AuctionCore.bid(I->Core.getUnits())) {
- reportError(Twine("invalid instruction packet: slot error"));
- return false;
- }
- }
- // Verify the CVI slot subscriptions.
- std::stable_sort(begin(), end(), HexagonInstr::lessCVI);
- // create vector of hvx instructions to check
- HVXInstsT hvxInsts;
- hvxInsts.clear();
- for (iterator I = begin(); I != end(); ++I) {
- struct CVIUnits inst;
- inst.Units = I->CVI.getUnits();
- inst.Lanes = I->CVI.getLanes();
- if (inst.Units == 0)
- continue; // not an hvx inst or an hvx inst that doesn't uses any pipes
- hvxInsts.push_back(inst);
- }
- // if there are any hvx instructions in this packet, check pipe usage
- if (hvxInsts.size() > 0) {
- unsigned startIdx, usedUnits;
- startIdx = usedUnits = 0x0;
- if (!checkHVXPipes(hvxInsts, startIdx, usedUnits)) {
- // too many pipes used to be valid
- reportError(Twine("invalid instruction packet: slot error"));
- return false;
- }
- }
+ return !CheckFailure;
+}
- return true;
+llvm::Optional<HexagonShuffler::HexagonPacket>
+HexagonShuffler::tryAuction(HexagonPacketSummary const &Summary) const {
+ HexagonPacket PacketResult = Packet;
+ HexagonUnitAuction AuctionCore(Summary.ReservedSlotMask);
+ std::stable_sort(PacketResult.begin(), PacketResult.end(),
+ HexagonInstr::lessCore);
+
+ const bool ValidSlots =
+ llvm::all_of(insts(PacketResult), [&AuctionCore](HexagonInstr const &I) {
+ return AuctionCore.bid(I.Core.getUnits());
+ });
+
+ LLVM_DEBUG(
+ dbgs() << "Shuffle attempt: " << (ValidSlots ? "passed" : "failed")
+ << "\n";
+ for (HexagonInstr const &ISJ : insts(PacketResult))
+ dbgs() << "\t" << HexagonMCInstrInfo::getName(MCII, *ISJ.ID) << ": "
+ << llvm::format_hex(ISJ.Core.getUnits(), 4, true) << "\n";
+ );
+
+ Optional<HexagonPacket> Res;
+ if (ValidSlots)
+ Res = PacketResult;
+
+ return Res;
}
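tryAuction above never mutates the live packet: it sorts a copy, bids each instruction's unit mask into the auction, and only returns the shuffled copy when every bid succeeds, leaving the caller to fall back otherwise. A compact sketch of that copy-sort-validate pattern with a simplified bidder (the real HexagonUnitAuction and lessCore ordering differ):

#include <algorithm>
#include <bitset>
#include <iostream>
#include <optional>
#include <vector>

struct Insn { unsigned Units; }; // allowed-slot bitmask

// Simplified stand-in for HexagonUnitAuction: each bid claims the lowest
// slot still free in the instruction's mask.
struct Auction {
  unsigned Taken = 0;
  bool bid(unsigned Units) {
    unsigned Free = Units & ~Taken;
    if (!Free)
      return false;        // no slot left for this instruction
    Taken |= Free & -Free; // take the lowest available slot
    return true;
  }
};

std::optional<std::vector<Insn>> tryAuction(std::vector<Insn> Packet) {
  // Place the most constrained instructions (fewest candidate slots) first.
  std::stable_sort(Packet.begin(), Packet.end(),
                   [](const Insn &A, const Insn &B) {
                     return std::bitset<32>(A.Units).count() <
                            std::bitset<32>(B.Units).count();
                   });
  Auction A;
  for (const Insn &I : Packet)
    if (!A.bid(I.Units))
      return std::nullopt; // shuffle failed; caller keeps the old packet
  return Packet;           // hand back the shuffled copy on success
}

int main() {
  auto Res = tryAuction({{0x3}, {0x1}, {0xF}});
  std::cout << (Res ? "passed" : "failed") << "\n"; // passed
}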
bool HexagonShuffler::shuffle() {
@@ -653,20 +668,25 @@ bool HexagonShuffler::shuffle() {
++emptySlots;
}
- for (iterator ISJ = begin(); ISJ != end(); ++ISJ)
- LLVM_DEBUG(dbgs().write_hex(ISJ->Core.getUnits()); if (ISJ->CVI.isValid()) {
- dbgs() << '/';
- dbgs().write_hex(ISJ->CVI.getUnits()) << '|';
- dbgs() << ISJ->CVI.getLanes();
- } dbgs() << ':'
- << HexagonMCInstrInfo::getDesc(MCII, ISJ->getDesc()).getOpcode();
- dbgs() << '\n');
- LLVM_DEBUG(dbgs() << '\n');
+ LLVM_DEBUG(
+ for (HexagonInstr const &ISJ : insts()) {
+ dbgs().write_hex(ISJ.Core.getUnits());
+ if (ISJ.CVI.isValid()) {
+ dbgs() << '/';
+ dbgs().write_hex(ISJ.CVI.getUnits()) << '|';
+ dbgs() << ISJ.CVI.getLanes();
+ }
+ dbgs() << ':'
+ << HexagonMCInstrInfo::getDesc(MCII, ISJ.getDesc()).getOpcode()
+ << '\n';
+ } dbgs() << '\n';
+ );
return Ok;
}
void HexagonShuffler::reportError(Twine const &Msg) {
+ CheckFailure = true;
if (ReportErrors) {
for (auto const &I : AppliedRestrictions) {
auto SM = Context.getSourceManager();
diff --git a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
index bf3bad36dfe5..1b4ebc5111db 100644
--- a/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
+++ b/llvm/lib/Target/Hexagon/MCTargetDesc/HexagonShuffler.h
@@ -17,11 +17,13 @@
#include "MCTargetDesc/HexagonMCInstrInfo.h"
#include "MCTargetDesc/HexagonMCTargetDesc.h"
#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/SMLoc.h"
#include <cstdint>
+#include <functional>
#include <utility>
namespace llvm {
@@ -45,6 +47,9 @@ public:
setWeight(s);
}
+ void setAllUnits() {
+ setUnits(((1u << HEXAGON_PACKET_SIZE) - 1));
+ }
unsigned setWeight(unsigned s);
unsigned getUnits() const { return (Slots); }
@@ -65,7 +70,6 @@ public:
class HexagonCVIResource : public HexagonResource {
public:
using UnitsAndLanes = std::pair<unsigned, unsigned>;
- using TypeUnitsAndLanes = DenseMap<unsigned, UnitsAndLanes>;
private:
// Available HVX slots.
@@ -90,11 +94,10 @@ private:
void setStore(bool f = true) { Store = f; }
public:
- HexagonCVIResource(TypeUnitsAndLanes *TUL, MCInstrInfo const &MCII,
+ HexagonCVIResource(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI,
unsigned s, MCInst const *id);
- static void SetupTUL(TypeUnitsAndLanes *TUL, StringRef CPU);
-
bool isValid() const { return Valid; }
unsigned getLanes() const { return Lanes; }
bool mayLoad() const { return Load; }
@@ -111,10 +114,10 @@ class HexagonInstr {
HexagonCVIResource CVI;
public:
- HexagonInstr(HexagonCVIResource::TypeUnitsAndLanes *T,
- MCInstrInfo const &MCII, MCInst const *id,
+ HexagonInstr(MCInstrInfo const &MCII,
+ MCSubtargetInfo const &STI, MCInst const *id,
MCInst const *Extender, unsigned s)
- : ID(id), Extender(Extender), Core(s), CVI(T, MCII, s, id) {}
+ : ID(id), Extender(Extender), Core(s), CVI(MCII, STI, s, id) {}
MCInst const &getDesc() const { return *ID; }
MCInst const *getExtender() const { return Extender; }
@@ -140,11 +143,30 @@ class HexagonShuffler {
using HexagonPacket =
SmallVector<HexagonInstr, HEXAGON_PRESHUFFLE_PACKET_SIZE>;
+ struct HexagonPacketSummary {
+ // Number of memory operations, loads, solo loads, stores, solo stores,
+ // single stores.
+ unsigned memory;
+ unsigned loads;
+ unsigned load0;
+ unsigned stores;
+ unsigned store0;
+ unsigned store1;
+ unsigned NonZCVIloads;
+ unsigned AllCVIloads;
+ unsigned CVIstores;
+ // Number of duplex insns
+ unsigned duplex;
+ unsigned pSlot3Cnt;
+ Optional<HexagonInstr *> PrefSlot3Inst;
+ unsigned memops;
+ unsigned ReservedSlotMask;
+ SmallVector<HexagonInstr *, HEXAGON_PRESHUFFLE_PACKET_SIZE> branchInsts;
+ Optional<SMLoc> Slot1AOKLoc;
+ Optional<SMLoc> NoSlot1StoreLoc;
+ };
// Insn handles in a bundle.
HexagonPacket Packet;
- HexagonPacket PacketSave;
-
- HexagonCVIResource::TypeUnitsAndLanes TUL;
protected:
MCContext &Context;
@@ -153,13 +175,29 @@ protected:
MCSubtargetInfo const &STI;
SMLoc Loc;
bool ReportErrors;
+ bool CheckFailure;
std::vector<std::pair<SMLoc, std::string>> AppliedRestrictions;
- void applySlotRestrictions();
- void restrictSlot1AOK();
- void restrictNoSlot1Store();
+ bool applySlotRestrictions(HexagonPacketSummary const &Summary);
+ void restrictSlot1AOK(HexagonPacketSummary const &Summary);
+ void restrictNoSlot1Store(HexagonPacketSummary const &Summary);
+ void restrictNoSlot1();
+ bool restrictStoreLoadOrder(HexagonPacketSummary const &Summary);
+ void restrictBranchOrder(HexagonPacketSummary const &Summary);
+ void restrictPreferSlot3(HexagonPacketSummary const &Summary);
+ void permitNonSlot();
+
+ Optional<HexagonPacket> tryAuction(HexagonPacketSummary const &Summary) const;
+
+ HexagonPacketSummary GetPacketSummary();
+ bool ValidPacketMemoryOps(HexagonPacketSummary const &Summary) const;
+ bool ValidResourceUsage(HexagonPacketSummary const &Summary);
+ bool validPacketInsts() const;
public:
using iterator = HexagonPacket::iterator;
+ using const_iterator = HexagonPacket::const_iterator;
+ using packet_range = iterator_range<HexagonPacket::iterator>;
+ using const_packet_range = iterator_range<HexagonPacket::const_iterator>;
HexagonShuffler(MCContext &Context, bool ReportErrors,
MCInstrInfo const &MCII, MCSubtargetInfo const &STI);
@@ -179,6 +217,25 @@ public:
iterator begin() { return (Packet.begin()); }
iterator end() { return (Packet.end()); }
+ const_iterator cbegin() const { return (Packet.begin()); }
+ const_iterator cend() const { return (Packet.end()); }
+ packet_range insts(HexagonPacket &P) {
+ return make_range(P.begin(), P.end());
+ }
+ const_packet_range insts(HexagonPacket const &P) const {
+ return make_range(P.begin(), P.end());
+ }
+ packet_range insts() { return make_range(begin(), end()); }
+ const_packet_range insts() const { return make_range(cbegin(), cend()); }
+
+ using InstPredicate = bool (*)(MCInstrInfo const &, MCInst const &);
+
+ bool HasInstWith(InstPredicate Pred) const {
+ return llvm::any_of(insts(), [&](HexagonInstr const &I) {
+ MCInst const &Inst = I.getDesc();
+ return (*Pred)(MCII, Inst);
+ });
+ }
// Add insn handle to the bundle.
void append(MCInst const &ID, MCInst const *Extender, unsigned S);
diff --git a/llvm/lib/Target/Hexagon/RDFCopy.cpp b/llvm/lib/Target/Hexagon/RDFCopy.cpp
index a9d39fd4b2dc..34d58f0a7a23 100644
--- a/llvm/lib/Target/Hexagon/RDFCopy.cpp
+++ b/llvm/lib/Target/Hexagon/RDFCopy.cpp
@@ -11,13 +11,13 @@
//===----------------------------------------------------------------------===//
#include "RDFCopy.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
#include "llvm/CodeGen/MachineDominators.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
diff --git a/llvm/lib/Target/Hexagon/RDFCopy.h b/llvm/lib/Target/Hexagon/RDFCopy.h
index 1450ab884849..99b18a75d8c2 100644
--- a/llvm/lib/Target/Hexagon/RDFCopy.h
+++ b/llvm/lib/Target/Hexagon/RDFCopy.h
@@ -9,9 +9,9 @@
#ifndef LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
#define LLVM_LIB_TARGET_HEXAGON_RDFCOPY_H
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
-#include "RDFRegisters.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/CodeGen/RDFRegisters.h"
#include "llvm/CodeGen/MachineFunction.h"
#include <map>
#include <vector>
diff --git a/llvm/lib/Target/Hexagon/RDFDeadCode.cpp b/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
index af86c7b1956b..5a98debd3c00 100644
--- a/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
+++ b/llvm/lib/Target/Hexagon/RDFDeadCode.cpp
@@ -9,13 +9,13 @@
// RDF-based generic dead code elimination.
#include "RDFDeadCode.h"
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/CodeGen/MachineBasicBlock.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
#include "llvm/Support/Debug.h"
#include <queue>
diff --git a/llvm/lib/Target/Hexagon/RDFDeadCode.h b/llvm/lib/Target/Hexagon/RDFDeadCode.h
index 7f91977e1d6c..859c8161d355 100644
--- a/llvm/lib/Target/Hexagon/RDFDeadCode.h
+++ b/llvm/lib/Target/Hexagon/RDFDeadCode.h
@@ -23,8 +23,8 @@
#ifndef RDF_DEADCODE_H
#define RDF_DEADCODE_H
-#include "RDFGraph.h"
-#include "RDFLiveness.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
#include "llvm/ADT/SetVector.h"
namespace llvm {
diff --git a/llvm/lib/Target/Hexagon/RDFGraph.h b/llvm/lib/Target/Hexagon/RDFGraph.h
deleted file mode 100644
index 585f43e116f9..000000000000
--- a/llvm/lib/Target/Hexagon/RDFGraph.h
+++ /dev/null
@@ -1,968 +0,0 @@
-//===- RDFGraph.h -----------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Target-independent, SSA-based data flow graph for register data flow (RDF)
-// for a non-SSA program representation (e.g. post-RA machine code).
-//
-//
-// *** Introduction
-//
-// The RDF graph is a collection of nodes, each of which denotes some element
-// of the program. There are two main types of such elements: code and
-// references. Conceptually, "code" is something that represents the structure
-// of the program, e.g. basic block or a statement, while "reference" is an
-// instance of accessing a register, e.g. a definition or a use. Nodes are
-// connected with each other based on the structure of the program (such as
-// blocks, instructions, etc.), and based on the data flow (e.g. reaching
-// definitions, reached uses, etc.). The single-reaching-definition principle
-// of SSA is generally observed, although, due to the non-SSA representation
-// of the program, there are some differences between the graph and a "pure"
-// SSA representation.
-//
-//
-// *** Implementation remarks
-//
-// Since the graph can contain a large number of nodes, memory consumption
-// was one of the major design considerations. As a result, there is a single
-// base class NodeBase which defines all members used by all possible derived
-// classes. The members are arranged in a union, and a derived class cannot
-// add any data members of its own. Each derived class only defines the
-// functional interface, i.e. member functions. NodeBase must be a POD,
-// which implies that all of its members must also be PODs.
-// Since nodes need to be connected with other nodes, pointers have been
-// replaced with 32-bit identifiers: each node has an id of type NodeId.
-// There are mapping functions in the graph that translate between actual
-// memory addresses and the corresponding identifiers.
-// A node id of 0 is equivalent to nullptr.
-//
-//
-// *** Structure of the graph
-//
-// A code node is always a collection of other nodes. For example, a code
-// node corresponding to a basic block will contain code nodes corresponding
-// to instructions. In turn, a code node corresponding to an instruction will
-// contain a list of reference nodes that correspond to the definitions and
-// uses of registers in that instruction. The members are arranged into a
-// circular list, which is yet another consequence of the effort to save
-// memory: for each member node it should be possible to obtain its owner,
-// and it should be possible to access all other members. There are other
-// ways to accomplish that, but the circular list seemed the most natural.
-//
-// +- CodeNode -+
-// | | <---------------------------------------------------+
-// +-+--------+-+ |
-// |FirstM |LastM |
-// | +-------------------------------------+ |
-// | | |
-// V V |
-// +----------+ Next +----------+ Next Next +----------+ Next |
-// | |----->| |-----> ... ----->| |----->-+
-// +- Member -+ +- Member -+ +- Member -+
-//
-// The order of members is such that related reference nodes (see below)
-// should be contiguous on the member list.
-//
-// A reference node is a node that encapsulates an access to a register,
-// in other words, data flowing into or out of a register. There are two
-// major kinds of reference nodes: defs and uses. A def node will contain
-// the id of the first reached use, and the id of the first reached def.
-// Each def and use will contain the id of the reaching def, and also the
-// id of the next reached def (for def nodes) or use (for use nodes).
-// The "next node sharing the same reaching def" is denoted as "sibling".
-// In summary:
-// - Def node contains: reaching def, sibling, first reached def, and first
-// reached use.
-// - Use node contains: reaching def and sibling.
-//
-// +-- DefNode --+
-// | R2 = ... | <---+--------------------+
-// ++---------+--+ | |
-// |Reached |Reached | |
-// |Def |Use | |
-// | | |Reaching |Reaching
-// | V |Def |Def
-// | +-- UseNode --+ Sib +-- UseNode --+ Sib Sib
-// | | ... = R2 |----->| ... = R2 |----> ... ----> 0
-// | +-------------+ +-------------+
-// V
-// +-- DefNode --+ Sib
-// | R2 = ... |----> ...
-// ++---------+--+
-// | |
-// | |
-// ... ...
-//
-// To get a full picture, the circular lists connecting blocks within a
-// function, instructions within a block, etc. should be superimposed with
-// the def-def, def-use links shown above.
-// To illustrate this, consider a small example in a pseudo-assembly:
-// foo:
-// add r2, r0, r1 ; r2 = r0+r1
-// addi r0, r2, 1 ; r0 = r2+1
-// ret r0 ; return value in r0
-//
-// The graph (in a format used by the debugging functions) would look like:
-//
-// DFG dump:[
-// f1: Function foo
-// b2: === %bb.0 === preds(0), succs(0):
-// p3: phi [d4<r0>(,d12,u9):]
-// p5: phi [d6<r1>(,,u10):]
-// s7: add [d8<r2>(,,u13):, u9<r0>(d4):, u10<r1>(d6):]
-// s11: addi [d12<r0>(d4,,u15):, u13<r2>(d8):]
-// s14: ret [u15<r0>(d12):]
-// ]
-//
-// The f1, b2, p3, etc. are node ids. The letter is prepended to indicate the
-// kind of the node (i.e. f - function, b - basic block, p - phi,
-// s - statement, d - def, u - use).
-// The format of a def node is:
-// dN<R>(rd,d,u):sib,
-// where
-// N - numeric node id,
-// R - register being defined
-// rd - reaching def,
-// d - reached def,
-// u - reached use,
-// sib - sibling.
-// The format of a use node is:
-// uN<R>[!](rd):sib,
-// where
-// N - numeric node id,
-// R - register being used,
-// rd - reaching def,
-// sib - sibling.
-// Possible annotations (usually preceding the node id):
-// + - preserving def,
-// ~ - clobbering def,
-// " - shadow ref (follows the node id),
-// ! - fixed register (appears after register name).
-//
-// The circular lists are not explicit in the dump.
-//
-//
-// *** Node attributes
-//
-// NodeBase has a member "Attrs", which is the primary way of determining
-// the node's characteristics. The fields in this member decide whether
-// the node is a code node or a reference node (i.e. node's "type"), then
-// within each type, the "kind" determines what specifically this node
-// represents. The remaining bits, "flags", contain additional information
-// that is even more detailed than the "kind".
-// CodeNode's kinds are:
-// - Phi: Phi node, members are reference nodes.
-// - Stmt: Statement, members are reference nodes.
-// - Block: Basic block, members are instruction nodes (i.e. Phi or Stmt).
-// - Func: The whole function. The members are basic block nodes.
-// RefNode's kinds are:
-// - Use.
-// - Def.
-//
-// Meaning of flags:
-// - Preserving: applies only to defs. A preserving def is one that can
-// preserve some of the original bits among those that are included in
-// the register associated with that def. For example, if R0 is a 32-bit
-// register, but a def can only change the lower 16 bits, then it will
-// be marked as preserving.
-// - Shadow: a reference that has duplicates holding additional reaching
-// defs (see more below).
-// - Clobbering: applied only to defs, indicates that the value generated
-// by this def is unspecified. A typical example would be volatile registers
-// after function calls.
-// - Fixed: the register in this def/use cannot be replaced with any other
-// register. A typical case would be a parameter register to a call, or
-// the register with the return value from a function.
-// - Undef: the register in this reference is assumed to have
-// no pre-existing value, even if it appears to be reached by some def.
-// This is typically used to prevent keeping registers artificially live
-// in cases when they are defined via predicated instructions. For example:
-// r0 = add-if-true cond, r10, r11 (1)
-// r0 = add-if-false cond, r12, r13, implicit r0 (2)
-// ... = r0 (3)
-// Before (1), r0 is not intended to be live, and the use of r0 in (3) is
-// not meant to be reached by any def preceding (1). However, since the
-// defs in (1) and (2) are both preserving, these properties alone would
-// imply that the use in (3) may indeed be reached by some prior def.
-// Adding Undef flag to the def in (1) prevents that. The Undef flag
-// may be applied to both defs and uses.
-// - Dead: applies only to defs. The value coming out of a "dead" def is
-// assumed to be unused, even if the def appears to be reaching other defs
-// or uses. The motivation for this flag comes from dead defs on function
-// calls: there is no way to determine if such a def is dead without
-// analyzing the target's ABI. Hence the graph should contain this info,
-// as it is unavailable otherwise. On the other hand, a def without any
-// uses on a typical instruction is not the intended target for this flag.
-//
-// *** Shadow references
-//
-// It may happen that a super-register can have two (or more) non-overlapping
-// sub-registers. When both of these sub-registers are defined and followed
-// by a use of the super-register, the use of the super-register will not
-// have a unique reaching def: both defs of the sub-registers need to be
-// accounted for. In such cases, a duplicate use of the super-register is
-// added and it points to the extra reaching def. Both uses are marked with
-// a flag "shadow". Example:
-// Assume t0 is a super-register of r0 and r1, r0 and r1 do not overlap:
-// set r0, 1 ; r0 = 1
-// set r1, 1 ; r1 = 1
-// addi t1, t0, 1 ; t1 = t0+1
-//
-// The DFG:
-// s1: set [d2<r0>(,,u9):]
-// s3: set [d4<r1>(,,u10):]
-// s5: addi [d6<t1>(,,):, u7"<t0>(d2):, u8"<t0>(d4):]
-//
-// The statement s5 has two use nodes for t0: u7" and u8". The quotation
-// mark " indicates that the node is a shadow.
-//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
-#define LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
-
-#include "RDFRegisters.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/MC/LaneBitmask.h"
-#include "llvm/Support/Allocator.h"
-#include "llvm/Support/MathExtras.h"
-#include <cassert>
-#include <cstdint>
-#include <cstring>
-#include <map>
-#include <set>
-#include <unordered_map>
-#include <utility>
-#include <vector>
-
-// RDF uses uint32_t to refer to registers. This is to ensure that the type
-// size remains specific. In other places, registers are often stored using
-// unsigned.
-static_assert(sizeof(uint32_t) == sizeof(unsigned), "Those should be equal");
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineDominanceFrontier;
-class MachineDominatorTree;
-class MachineFunction;
-class MachineInstr;
-class MachineOperand;
-class raw_ostream;
-class TargetInstrInfo;
-class TargetRegisterInfo;
-
-namespace rdf {
-
- using NodeId = uint32_t;
-
- struct DataFlowGraph;
-
- struct NodeAttrs {
- enum : uint16_t {
- None = 0x0000, // Nothing
-
- // Types: 2 bits
- TypeMask = 0x0003,
- Code = 0x0001, // 01, Container
- Ref = 0x0002, // 10, Reference
-
- // Kind: 3 bits
- KindMask = 0x0007 << 2,
- Def = 0x0001 << 2, // 001
- Use = 0x0002 << 2, // 010
- Phi = 0x0003 << 2, // 011
- Stmt = 0x0004 << 2, // 100
- Block = 0x0005 << 2, // 101
- Func = 0x0006 << 2, // 110
-
- // Flags: 7 bits for now
- FlagMask = 0x007F << 5,
- Shadow = 0x0001 << 5, // 0000001, Has extra reaching defs.
- Clobbering = 0x0002 << 5, // 0000010, Produces unspecified values.
- PhiRef = 0x0004 << 5, // 0000100, Member of PhiNode.
- Preserving = 0x0008 << 5, // 0001000, Def can keep original bits.
- Fixed = 0x0010 << 5, // 0010000, Fixed register.
- Undef = 0x0020 << 5, // 0100000, Has no pre-existing value.
- Dead = 0x0040 << 5, // 1000000, Does not define a value.
- };
-
- static uint16_t type(uint16_t T) { return T & TypeMask; }
- static uint16_t kind(uint16_t T) { return T & KindMask; }
- static uint16_t flags(uint16_t T) { return T & FlagMask; }
-
- static uint16_t set_type(uint16_t A, uint16_t T) {
- return (A & ~TypeMask) | T;
- }
-
- static uint16_t set_kind(uint16_t A, uint16_t K) {
- return (A & ~KindMask) | K;
- }
-
- static uint16_t set_flags(uint16_t A, uint16_t F) {
- return (A & ~FlagMask) | F;
- }
-
- // Test if A contains B.
- static bool contains(uint16_t A, uint16_t B) {
- if (type(A) != Code)
- return false;
- uint16_t KB = kind(B);
- switch (kind(A)) {
- case Func:
- return KB == Block;
- case Block:
- return KB == Phi || KB == Stmt;
- case Phi:
- case Stmt:
- return type(B) == Ref;
- }
- return false;
- }
- };
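A minimal sketch of how the attribute word described above is composed and queried, using only the helpers defined in this struct (purely illustrative, not taken from the patch):

inline bool exampleClobberingDefAttrs() {
  uint16_t A = NodeAttrs::None;
  A = NodeAttrs::set_type(A, NodeAttrs::Ref);          // a reference node
  A = NodeAttrs::set_kind(A, NodeAttrs::Def);          // of kind Def
  A = NodeAttrs::set_flags(A, NodeAttrs::Clobbering);  // producing unknown bits

  // A Stmt code node may contain such a reference.
  uint16_t S = NodeAttrs::set_kind(
      NodeAttrs::set_type(NodeAttrs::None, NodeAttrs::Code), NodeAttrs::Stmt);

  return NodeAttrs::type(A) == NodeAttrs::Ref &&
         NodeAttrs::kind(A) == NodeAttrs::Def &&
         NodeAttrs::contains(S, A);                     // evaluates to true
}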
-
- struct BuildOptions {
- enum : unsigned {
- None = 0x00,
- KeepDeadPhis = 0x01, // Do not remove dead phis during build.
- };
- };
-
- template <typename T> struct NodeAddr {
- NodeAddr() = default;
- NodeAddr(T A, NodeId I) : Addr(A), Id(I) {}
-
- // Type cast (casting constructor). This converting constructor is the
- // reason for having this class instead of std::pair.
- template <typename S> NodeAddr(const NodeAddr<S> &NA)
- : Addr(static_cast<T>(NA.Addr)), Id(NA.Id) {}
-
- bool operator== (const NodeAddr<T> &NA) const {
- assert((Addr == NA.Addr) == (Id == NA.Id));
- return Addr == NA.Addr;
- }
- bool operator!= (const NodeAddr<T> &NA) const {
- return !operator==(NA);
- }
-
- T Addr = nullptr;
- NodeId Id = 0;
- };
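A short sketch of what that converting constructor enables; it mirrors the pattern used in getNextRef further down and assumes the RefNode declaration that appears later in this header (illustrative only):

inline void narrowToRefExample(NodeAddr<NodeBase*> NA) {
  if (NA.Addr->getType() == NodeAttrs::Ref) {
    NodeAddr<RefNode*> RA = NA;  // converting ctor: Addr re-typed, Id kept
    (void)RA;                    // use RA as a reference-node handle
  }
}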
-
- struct NodeBase;
-
- // Fast memory allocation and translation between node id and node address.
- // This is really the same idea as the one underlying the "bump pointer
- // allocator", the difference being in the translation. A node id is
- // composed of two components: the index of the block in which it was
- // allocated, and the index within the block. With the default settings,
- // where the number of nodes per block is 4096, the node id (minus 1) is:
- //
- // bit position: 11 0
- // +----------------------------+--------------+
- // | Index of the block |Index in block|
- // +----------------------------+--------------+
- //
- // The actual node id is the above plus 1, to avoid creating a node id of 0.
- //
- // This method significantly improved the build time, compared to using maps
- // (std::unordered_map or DenseMap) to translate between pointers and ids.
- struct NodeAllocator {
- // Amount of storage for a single node.
- enum { NodeMemSize = 32 };
-
- NodeAllocator(uint32_t NPB = 4096)
- : NodesPerBlock(NPB), BitsPerIndex(Log2_32(NPB)),
- IndexMask((1 << BitsPerIndex)-1) {
- assert(isPowerOf2_32(NPB));
- }
-
- NodeBase *ptr(NodeId N) const {
- uint32_t N1 = N-1;
- uint32_t BlockN = N1 >> BitsPerIndex;
- uint32_t Offset = (N1 & IndexMask) * NodeMemSize;
- return reinterpret_cast<NodeBase*>(Blocks[BlockN]+Offset);
- }
-
- NodeId id(const NodeBase *P) const;
- NodeAddr<NodeBase*> New();
- void clear();
-
- private:
- void startNewBlock();
- bool needNewBlock();
-
- uint32_t makeId(uint32_t Block, uint32_t Index) const {
- // Add 1 to the id, to avoid the id of 0, which is treated as "null".
- return ((Block << BitsPerIndex) | Index) + 1;
- }
-
- const uint32_t NodesPerBlock;
- const uint32_t BitsPerIndex;
- const uint32_t IndexMask;
- char *ActiveEnd = nullptr;
- std::vector<char*> Blocks;
- using AllocatorTy = BumpPtrAllocatorImpl<MallocAllocator, 65536>;
- AllocatorTy MemPool;
- };
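A worked instance of the id arithmetic described above, under the default settings (purely illustrative):

// With the default NodesPerBlock = 4096: BitsPerIndex = 12, IndexMask = 0xFFF.
// A node in block 3 at in-block index 5 gets
//   id = ((3 << 12) | 5) + 1 = 12294,
// and ptr(12294) recovers its address:
//   N1 = 12293, BlockN = 12293 >> 12 = 3,
//   Offset = (12293 & 0xFFF) * NodeMemSize = 5 * 32 = 160,
// i.e. the node lives at byte offset 160 of block 3.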
-
- using RegisterSet = std::set<RegisterRef>;
-
- struct TargetOperandInfo {
- TargetOperandInfo(const TargetInstrInfo &tii) : TII(tii) {}
- virtual ~TargetOperandInfo() = default;
-
- virtual bool isPreserving(const MachineInstr &In, unsigned OpNum) const;
- virtual bool isClobbering(const MachineInstr &In, unsigned OpNum) const;
- virtual bool isFixedReg(const MachineInstr &In, unsigned OpNum) const;
-
- const TargetInstrInfo &TII;
- };
-
- // Packed register reference. Only used for storage.
- struct PackedRegisterRef {
- RegisterId Reg;
- uint32_t MaskId;
- };
-
- struct LaneMaskIndex : private IndexedSet<LaneBitmask> {
- LaneMaskIndex() = default;
-
- LaneBitmask getLaneMaskForIndex(uint32_t K) const {
- return K == 0 ? LaneBitmask::getAll() : get(K);
- }
-
- uint32_t getIndexForLaneMask(LaneBitmask LM) {
- assert(LM.any());
- return LM.all() ? 0 : insert(LM);
- }
-
- uint32_t getIndexForLaneMask(LaneBitmask LM) const {
- assert(LM.any());
- return LM.all() ? 0 : find(LM);
- }
- };
-
- struct NodeBase {
- public:
- // Make sure this is a POD.
- NodeBase() = default;
-
- uint16_t getType() const { return NodeAttrs::type(Attrs); }
- uint16_t getKind() const { return NodeAttrs::kind(Attrs); }
- uint16_t getFlags() const { return NodeAttrs::flags(Attrs); }
- NodeId getNext() const { return Next; }
-
- uint16_t getAttrs() const { return Attrs; }
- void setAttrs(uint16_t A) { Attrs = A; }
- void setFlags(uint16_t F) { setAttrs(NodeAttrs::set_flags(getAttrs(), F)); }
-
- // Insert node NA after "this" in the circular chain.
- void append(NodeAddr<NodeBase*> NA);
-
- // Initialize all members to 0.
- void init() { memset(this, 0, sizeof *this); }
-
- void setNext(NodeId N) { Next = N; }
-
- protected:
- uint16_t Attrs;
- uint16_t Reserved;
- NodeId Next; // Id of the next node in the circular chain.
- // Definitions of nested types. Using anonymous nested structs would make
- // this class definition clearer, but unnamed structs are not a part of
- // the standard.
- struct Def_struct {
- NodeId DD, DU; // Ids of the first reached def and use.
- };
- struct PhiU_struct {
- NodeId PredB; // Id of the predecessor block for a phi use.
- };
- struct Code_struct {
- void *CP; // Pointer to the actual code.
- NodeId FirstM, LastM; // Id of the first member and last.
- };
- struct Ref_struct {
- NodeId RD, Sib; // Ids of the reaching def and the sibling.
- union {
- Def_struct Def;
- PhiU_struct PhiU;
- };
- union {
- MachineOperand *Op; // Non-phi refs point to a machine operand.
- PackedRegisterRef PR; // Phi refs store register info directly.
- };
- };
-
- // The actual payload.
- union {
- Ref_struct Ref;
- Code_struct Code;
- };
- };
- // The allocator allocates chunks of 32 bytes for each node. The fact that
- // each node takes 32 bytes in memory is used for fast translation between
- // the node id and the node address.
- static_assert(sizeof(NodeBase) <= NodeAllocator::NodeMemSize,
- "NodeBase must be at most NodeAllocator::NodeMemSize bytes");
-
- using NodeList = SmallVector<NodeAddr<NodeBase *>, 4>;
- using NodeSet = std::set<NodeId>;
-
- struct RefNode : public NodeBase {
- RefNode() = default;
-
- RegisterRef getRegRef(const DataFlowGraph &G) const;
-
- MachineOperand &getOp() {
- assert(!(getFlags() & NodeAttrs::PhiRef));
- return *Ref.Op;
- }
-
- void setRegRef(RegisterRef RR, DataFlowGraph &G);
- void setRegRef(MachineOperand *Op, DataFlowGraph &G);
-
- NodeId getReachingDef() const {
- return Ref.RD;
- }
- void setReachingDef(NodeId RD) {
- Ref.RD = RD;
- }
-
- NodeId getSibling() const {
- return Ref.Sib;
- }
- void setSibling(NodeId Sib) {
- Ref.Sib = Sib;
- }
-
- bool isUse() const {
- assert(getType() == NodeAttrs::Ref);
- return getKind() == NodeAttrs::Use;
- }
-
- bool isDef() const {
- assert(getType() == NodeAttrs::Ref);
- return getKind() == NodeAttrs::Def;
- }
-
- template <typename Predicate>
- NodeAddr<RefNode*> getNextRef(RegisterRef RR, Predicate P, bool NextOnly,
- const DataFlowGraph &G);
- NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
- };
-
- struct DefNode : public RefNode {
- NodeId getReachedDef() const {
- return Ref.Def.DD;
- }
- void setReachedDef(NodeId D) {
- Ref.Def.DD = D;
- }
- NodeId getReachedUse() const {
- return Ref.Def.DU;
- }
- void setReachedUse(NodeId U) {
- Ref.Def.DU = U;
- }
-
- void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
- };
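A minimal sketch of walking the def-to-use chain that the introduction describes (first reached use, then siblings sharing the same reaching def). Illustrative only; it leans on the UseNode and DataFlowGraph declarations that follow in this header:

template <typename Visitor>
void forEachReachedUse(NodeAddr<DefNode*> DA, const DataFlowGraph &G,
                       Visitor V) {
  for (NodeId U = DA.Addr->getReachedUse(); U != 0; ) {
    NodeAddr<UseNode*> UA = G.addr<UseNode*>(U);
    V(UA);                       // e.g. inspect UA.Addr->getRegRef(G)
    U = UA.Addr->getSibling();   // next use with the same reaching def
  }
}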
-
- struct UseNode : public RefNode {
- void linkToDef(NodeId Self, NodeAddr<DefNode*> DA);
- };
-
- struct PhiUseNode : public UseNode {
- NodeId getPredecessor() const {
- assert(getFlags() & NodeAttrs::PhiRef);
- return Ref.PhiU.PredB;
- }
- void setPredecessor(NodeId B) {
- assert(getFlags() & NodeAttrs::PhiRef);
- Ref.PhiU.PredB = B;
- }
- };
-
- struct CodeNode : public NodeBase {
- template <typename T> T getCode() const {
- return static_cast<T>(Code.CP);
- }
- void setCode(void *C) {
- Code.CP = C;
- }
-
- NodeAddr<NodeBase*> getFirstMember(const DataFlowGraph &G) const;
- NodeAddr<NodeBase*> getLastMember(const DataFlowGraph &G) const;
- void addMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
- void addMemberAfter(NodeAddr<NodeBase*> MA, NodeAddr<NodeBase*> NA,
- const DataFlowGraph &G);
- void removeMember(NodeAddr<NodeBase*> NA, const DataFlowGraph &G);
-
- NodeList members(const DataFlowGraph &G) const;
- template <typename Predicate>
- NodeList members_if(Predicate P, const DataFlowGraph &G) const;
- };
-
- struct InstrNode : public CodeNode {
- NodeAddr<NodeBase*> getOwner(const DataFlowGraph &G);
- };
-
- struct PhiNode : public InstrNode {
- MachineInstr *getCode() const {
- return nullptr;
- }
- };
-
- struct StmtNode : public InstrNode {
- MachineInstr *getCode() const {
- return CodeNode::getCode<MachineInstr*>();
- }
- };
-
- struct BlockNode : public CodeNode {
- MachineBasicBlock *getCode() const {
- return CodeNode::getCode<MachineBasicBlock*>();
- }
-
- void addPhi(NodeAddr<PhiNode*> PA, const DataFlowGraph &G);
- };
-
- struct FuncNode : public CodeNode {
- MachineFunction *getCode() const {
- return CodeNode::getCode<MachineFunction*>();
- }
-
- NodeAddr<BlockNode*> findBlock(const MachineBasicBlock *BB,
- const DataFlowGraph &G) const;
- NodeAddr<BlockNode*> getEntryBlock(const DataFlowGraph &G);
- };
-
- struct DataFlowGraph {
- DataFlowGraph(MachineFunction &mf, const TargetInstrInfo &tii,
- const TargetRegisterInfo &tri, const MachineDominatorTree &mdt,
- const MachineDominanceFrontier &mdf, const TargetOperandInfo &toi);
-
- NodeBase *ptr(NodeId N) const;
- template <typename T> T ptr(NodeId N) const {
- return static_cast<T>(ptr(N));
- }
-
- NodeId id(const NodeBase *P) const;
-
- template <typename T> NodeAddr<T> addr(NodeId N) const {
- return { ptr<T>(N), N };
- }
-
- NodeAddr<FuncNode*> getFunc() const { return Func; }
- MachineFunction &getMF() const { return MF; }
- const TargetInstrInfo &getTII() const { return TII; }
- const TargetRegisterInfo &getTRI() const { return TRI; }
- const PhysicalRegisterInfo &getPRI() const { return PRI; }
- const MachineDominatorTree &getDT() const { return MDT; }
- const MachineDominanceFrontier &getDF() const { return MDF; }
- const RegisterAggr &getLiveIns() const { return LiveIns; }
-
- struct DefStack {
- DefStack() = default;
-
- bool empty() const { return Stack.empty() || top() == bottom(); }
-
- private:
- using value_type = NodeAddr<DefNode *>;
- struct Iterator {
- using value_type = DefStack::value_type;
-
- Iterator &up() { Pos = DS.nextUp(Pos); return *this; }
- Iterator &down() { Pos = DS.nextDown(Pos); return *this; }
-
- value_type operator*() const {
- assert(Pos >= 1);
- return DS.Stack[Pos-1];
- }
- const value_type *operator->() const {
- assert(Pos >= 1);
- return &DS.Stack[Pos-1];
- }
- bool operator==(const Iterator &It) const { return Pos == It.Pos; }
- bool operator!=(const Iterator &It) const { return Pos != It.Pos; }
-
- private:
- friend struct DefStack;
-
- Iterator(const DefStack &S, bool Top);
-
- // Pos-1 is the index in the StorageType object that corresponds to
- // the top of the DefStack.
- const DefStack &DS;
- unsigned Pos;
- };
-
- public:
- using iterator = Iterator;
-
- iterator top() const { return Iterator(*this, true); }
- iterator bottom() const { return Iterator(*this, false); }
- unsigned size() const;
-
- void push(NodeAddr<DefNode*> DA) { Stack.push_back(DA); }
- void pop();
- void start_block(NodeId N);
- void clear_block(NodeId N);
-
- private:
- friend struct Iterator;
-
- using StorageType = std::vector<value_type>;
-
- bool isDelimiter(const StorageType::value_type &P, NodeId N = 0) const {
- return (P.Addr == nullptr) && (N == 0 || P.Id == N);
- }
-
- unsigned nextUp(unsigned P) const;
- unsigned nextDown(unsigned P) const;
-
- StorageType Stack;
- };
-
- // Use std::unordered_map for speed of element access.
- // Map: Register (physical or virtual) -> DefStack
- using DefStackMap = std::unordered_map<RegisterId, DefStack>;
-
- void build(unsigned Options = BuildOptions::None);
- void pushAllDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
- void markBlock(NodeId B, DefStackMap &DefM);
- void releaseBlock(NodeId B, DefStackMap &DefM);
-
- PackedRegisterRef pack(RegisterRef RR) {
- return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
- }
- PackedRegisterRef pack(RegisterRef RR) const {
- return { RR.Reg, LMI.getIndexForLaneMask(RR.Mask) };
- }
- RegisterRef unpack(PackedRegisterRef PR) const {
- return RegisterRef(PR.Reg, LMI.getLaneMaskForIndex(PR.MaskId));
- }
-
- RegisterRef makeRegRef(unsigned Reg, unsigned Sub) const;
- RegisterRef makeRegRef(const MachineOperand &Op) const;
- RegisterRef restrictRef(RegisterRef AR, RegisterRef BR) const;
-
- NodeAddr<RefNode*> getNextRelated(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
- NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA, bool Create);
- NodeAddr<RefNode*> getNextImp(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
- NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA, bool Create);
- NodeAddr<RefNode*> getNextShadow(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
-
- NodeList getRelatedRefs(NodeAddr<InstrNode*> IA,
- NodeAddr<RefNode*> RA) const;
-
- NodeAddr<BlockNode*> findBlock(MachineBasicBlock *BB) const {
- return BlockNodes.at(BB);
- }
-
- void unlinkUse(NodeAddr<UseNode*> UA, bool RemoveFromOwner) {
- unlinkUseDF(UA);
- if (RemoveFromOwner)
- removeFromOwner(UA);
- }
-
- void unlinkDef(NodeAddr<DefNode*> DA, bool RemoveFromOwner) {
- unlinkDefDF(DA);
- if (RemoveFromOwner)
- removeFromOwner(DA);
- }
-
- // Some useful filters.
- template <uint16_t Kind>
- static bool IsRef(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Ref &&
- BA.Addr->getKind() == Kind;
- }
-
- template <uint16_t Kind>
- static bool IsCode(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Code &&
- BA.Addr->getKind() == Kind;
- }
-
- static bool IsDef(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Ref &&
- BA.Addr->getKind() == NodeAttrs::Def;
- }
-
- static bool IsUse(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Ref &&
- BA.Addr->getKind() == NodeAttrs::Use;
- }
-
- static bool IsPhi(const NodeAddr<NodeBase*> BA) {
- return BA.Addr->getType() == NodeAttrs::Code &&
- BA.Addr->getKind() == NodeAttrs::Phi;
- }
-
- static bool IsPreservingDef(const NodeAddr<DefNode*> DA) {
- uint16_t Flags = DA.Addr->getFlags();
- return (Flags & NodeAttrs::Preserving) && !(Flags & NodeAttrs::Undef);
- }
-
- private:
- void reset();
-
- RegisterSet getLandingPadLiveIns() const;
-
- NodeAddr<NodeBase*> newNode(uint16_t Attrs);
- NodeAddr<NodeBase*> cloneNode(const NodeAddr<NodeBase*> B);
- NodeAddr<UseNode*> newUse(NodeAddr<InstrNode*> Owner,
- MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
- NodeAddr<PhiUseNode*> newPhiUse(NodeAddr<PhiNode*> Owner,
- RegisterRef RR, NodeAddr<BlockNode*> PredB,
- uint16_t Flags = NodeAttrs::PhiRef);
- NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
- MachineOperand &Op, uint16_t Flags = NodeAttrs::None);
- NodeAddr<DefNode*> newDef(NodeAddr<InstrNode*> Owner,
- RegisterRef RR, uint16_t Flags = NodeAttrs::PhiRef);
- NodeAddr<PhiNode*> newPhi(NodeAddr<BlockNode*> Owner);
- NodeAddr<StmtNode*> newStmt(NodeAddr<BlockNode*> Owner,
- MachineInstr *MI);
- NodeAddr<BlockNode*> newBlock(NodeAddr<FuncNode*> Owner,
- MachineBasicBlock *BB);
- NodeAddr<FuncNode*> newFunc(MachineFunction *MF);
-
- template <typename Predicate>
- std::pair<NodeAddr<RefNode*>,NodeAddr<RefNode*>>
- locateNextRef(NodeAddr<InstrNode*> IA, NodeAddr<RefNode*> RA,
- Predicate P) const;
-
- using BlockRefsMap = std::map<NodeId, RegisterSet>;
-
- void buildStmt(NodeAddr<BlockNode*> BA, MachineInstr &In);
- void recordDefsForDF(BlockRefsMap &PhiM, NodeAddr<BlockNode*> BA);
- void buildPhis(BlockRefsMap &PhiM, RegisterSet &AllRefs,
- NodeAddr<BlockNode*> BA);
- void removeUnusedPhis();
-
- void pushClobbers(NodeAddr<InstrNode*> IA, DefStackMap &DM);
- void pushDefs(NodeAddr<InstrNode*> IA, DefStackMap &DM);
- template <typename T> void linkRefUp(NodeAddr<InstrNode*> IA,
- NodeAddr<T> TA, DefStack &DS);
- template <typename Predicate> void linkStmtRefs(DefStackMap &DefM,
- NodeAddr<StmtNode*> SA, Predicate P);
- void linkBlockRefs(DefStackMap &DefM, NodeAddr<BlockNode*> BA);
-
- void unlinkUseDF(NodeAddr<UseNode*> UA);
- void unlinkDefDF(NodeAddr<DefNode*> DA);
-
- void removeFromOwner(NodeAddr<RefNode*> RA) {
- NodeAddr<InstrNode*> IA = RA.Addr->getOwner(*this);
- IA.Addr->removeMember(RA, *this);
- }
-
- MachineFunction &MF;
- const TargetInstrInfo &TII;
- const TargetRegisterInfo &TRI;
- const PhysicalRegisterInfo PRI;
- const MachineDominatorTree &MDT;
- const MachineDominanceFrontier &MDF;
- const TargetOperandInfo &TOI;
-
- RegisterAggr LiveIns;
- NodeAddr<FuncNode*> Func;
- NodeAllocator Memory;
- // Local map: MachineBasicBlock -> NodeAddr<BlockNode*>
- std::map<MachineBasicBlock*,NodeAddr<BlockNode*>> BlockNodes;
- // Lane mask map.
- LaneMaskIndex LMI;
- }; // struct DataFlowGraph
-
- template <typename Predicate>
- NodeAddr<RefNode*> RefNode::getNextRef(RegisterRef RR, Predicate P,
- bool NextOnly, const DataFlowGraph &G) {
- // Get the "Next" reference in the circular list that references RR and
- // satisfies the predicate "P".
- auto NA = G.addr<NodeBase*>(getNext());
-
- while (NA.Addr != this) {
- if (NA.Addr->getType() == NodeAttrs::Ref) {
- NodeAddr<RefNode*> RA = NA;
- if (RA.Addr->getRegRef(G) == RR && P(NA))
- return NA;
- if (NextOnly)
- break;
- NA = G.addr<NodeBase*>(NA.Addr->getNext());
- } else {
- // We've hit the beginning of the chain.
- assert(NA.Addr->getType() == NodeAttrs::Code);
- NodeAddr<CodeNode*> CA = NA;
- NA = CA.Addr->getFirstMember(G);
- }
- }
- // Return the equivalent of "nullptr" if such a node was not found.
- return NodeAddr<RefNode*>();
- }
-
- template <typename Predicate>
- NodeList CodeNode::members_if(Predicate P, const DataFlowGraph &G) const {
- NodeList MM;
- auto M = getFirstMember(G);
- if (M.Id == 0)
- return MM;
-
- while (M.Addr != this) {
- if (P(M))
- MM.push_back(M);
- M = G.addr<NodeBase*>(M.Addr->getNext());
- }
- return MM;
- }
-
- template <typename T>
- struct Print {
- Print(const T &x, const DataFlowGraph &g) : Obj(x), G(g) {}
-
- const T &Obj;
- const DataFlowGraph &G;
- };
-
- template <typename T>
- struct PrintNode : Print<NodeAddr<T>> {
- PrintNode(const NodeAddr<T> &x, const DataFlowGraph &g)
- : Print<NodeAddr<T>>(x, g) {}
- };
-
- raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterRef> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeId> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<DefNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<UseNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<PhiUseNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<RefNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeList> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeSet> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<NodeAddr<PhiNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<StmtNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<InstrNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<BlockNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<NodeAddr<FuncNode *>> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterSet> &P);
- raw_ostream &operator<<(raw_ostream &OS, const Print<RegisterAggr> &P);
- raw_ostream &operator<<(raw_ostream &OS,
- const Print<DataFlowGraph::DefStack> &P);
-
-} // end namespace rdf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_HEXAGON_RDFGRAPH_H
diff --git a/llvm/lib/Target/Hexagon/RDFLiveness.h b/llvm/lib/Target/Hexagon/RDFLiveness.h
deleted file mode 100644
index ea4890271726..000000000000
--- a/llvm/lib/Target/Hexagon/RDFLiveness.h
+++ /dev/null
@@ -1,151 +0,0 @@
-//===- RDFLiveness.h --------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// Recalculate the liveness information given a data flow graph.
-// This includes block live-ins and kill flags.
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
-#define LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
-
-#include "RDFGraph.h"
-#include "RDFRegisters.h"
-#include "llvm/ADT/DenseMap.h"
-#include "llvm/MC/LaneBitmask.h"
-#include <map>
-#include <set>
-#include <utility>
-
-namespace llvm {
-
-class MachineBasicBlock;
-class MachineDominanceFrontier;
-class MachineDominatorTree;
-class MachineRegisterInfo;
-class TargetRegisterInfo;
-
-namespace rdf {
-
- struct Liveness {
- public:
- // This is really a std::map, except that an element accessed via [] is
- // created from a non-trivially-constructed default value (Empty) if missing.
- struct LiveMapType {
- LiveMapType(const PhysicalRegisterInfo &pri) : Empty(pri) {}
-
- RegisterAggr &operator[] (MachineBasicBlock *B) {
- return Map.emplace(B, Empty).first->second;
- }
-
- private:
- RegisterAggr Empty;
- std::map<MachineBasicBlock*,RegisterAggr> Map;
- };
-
- using NodeRef = std::pair<NodeId, LaneBitmask>;
- using NodeRefSet = std::set<NodeRef>;
- // RegisterId in RefMap must be normalized.
- using RefMap = std::map<RegisterId, NodeRefSet>;
-
- Liveness(MachineRegisterInfo &mri, const DataFlowGraph &g)
- : DFG(g), TRI(g.getTRI()), PRI(g.getPRI()), MDT(g.getDT()),
- MDF(g.getDF()), LiveMap(g.getPRI()), Empty(), NoRegs(g.getPRI()) {}
-
- NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA,
- bool TopShadows, bool FullChain, const RegisterAggr &DefRRs);
-
- NodeList getAllReachingDefs(NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefA.Addr->getRegRef(DFG), RefA, false,
- false, NoRegs);
- }
-
- NodeList getAllReachingDefs(RegisterRef RefRR, NodeAddr<RefNode*> RefA) {
- return getAllReachingDefs(RefRR, RefA, false, false, NoRegs);
- }
-
- NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA,
- const RegisterAggr &DefRRs);
-
- NodeSet getAllReachedUses(RegisterRef RefRR, NodeAddr<DefNode*> DefA) {
- return getAllReachedUses(RefRR, DefA, NoRegs);
- }
-
- std::pair<NodeSet,bool> getAllReachingDefsRec(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs);
-
- NodeAddr<RefNode*> getNearestAliasedRef(RegisterRef RefRR,
- NodeAddr<InstrNode*> IA);
-
- LiveMapType &getLiveMap() { return LiveMap; }
- const LiveMapType &getLiveMap() const { return LiveMap; }
-
- const RefMap &getRealUses(NodeId P) const {
- auto F = RealUseMap.find(P);
- return F == RealUseMap.end() ? Empty : F->second;
- }
-
- void computePhiInfo();
- void computeLiveIns();
- void resetLiveIns();
- void resetKills();
- void resetKills(MachineBasicBlock *B);
-
- void trace(bool T) { Trace = T; }
-
- private:
- const DataFlowGraph &DFG;
- const TargetRegisterInfo &TRI;
- const PhysicalRegisterInfo &PRI;
- const MachineDominatorTree &MDT;
- const MachineDominanceFrontier &MDF;
- LiveMapType LiveMap;
- const RefMap Empty;
- const RegisterAggr NoRegs;
- bool Trace = false;
-
- // Cache of mapping from node ids (for RefNodes) to the containing
- // basic blocks. Not computing it each time for each node reduces
- // the liveness calculation time by a large fraction.
- using NodeBlockMap = DenseMap<NodeId, MachineBasicBlock *>;
- NodeBlockMap NBMap;
-
- // Phi information:
- //
- // RealUseMap
- // map: NodeId -> (map: RegisterId -> NodeRefSet)
- // phi id -> (map: register -> set of reached non-phi uses)
- std::map<NodeId, RefMap> RealUseMap;
-
- // Inverse iterated dominance frontier.
- std::map<MachineBasicBlock*,std::set<MachineBasicBlock*>> IIDF;
-
- // Live on entry.
- std::map<MachineBasicBlock*,RefMap> PhiLON;
-
- // Phi uses are considered to be located at the end of the block that
- // they are associated with. The reaching def of a phi use dominates the
- // block that the use corresponds to, but not the block that contains
- // the phi itself. To include these uses in the liveness propagation (up
- // the dominator tree), create a map: block -> set of uses live on exit.
- std::map<MachineBasicBlock*,RefMap> PhiLOX;
-
- MachineBasicBlock *getBlockWithRef(NodeId RN) const;
- void traverse(MachineBasicBlock *B, RefMap &LiveIn);
- void emptify(RefMap &M);
-
- std::pair<NodeSet,bool> getAllReachingDefsRecImpl(RegisterRef RefRR,
- NodeAddr<RefNode*> RefA, NodeSet &Visited, const NodeSet &Defs,
- unsigned Nest, unsigned MaxNest);
- };
-
- raw_ostream &operator<<(raw_ostream &OS, const Print<Liveness::RefMap> &P);
-
-} // end namespace rdf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_HEXAGON_RDFLIVENESS_H
diff --git a/llvm/lib/Target/Hexagon/RDFRegisters.h b/llvm/lib/Target/Hexagon/RDFRegisters.h
deleted file mode 100644
index 4afaf80e4659..000000000000
--- a/llvm/lib/Target/Hexagon/RDFRegisters.h
+++ /dev/null
@@ -1,240 +0,0 @@
-//===- RDFRegisters.h -------------------------------------------*- C++ -*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#ifndef LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
-#define LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
-
-#include "llvm/ADT/BitVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/CodeGen/TargetRegisterInfo.h"
-#include "llvm/MC/LaneBitmask.h"
-#include <cassert>
-#include <cstdint>
-#include <map>
-#include <set>
-#include <vector>
-
-namespace llvm {
-
-class MachineFunction;
-class raw_ostream;
-
-namespace rdf {
-
- using RegisterId = uint32_t;
-
- // Template class for a map translating uint32_t into arbitrary types.
- // The map will act like an indexed set: upon insertion of a new object,
- // it will automatically assign a new index to it. Index of 0 is treated
- // as invalid and is never allocated.
- template <typename T, unsigned N = 32>
- struct IndexedSet {
- IndexedSet() { Map.reserve(N); }
-
- T get(uint32_t Idx) const {
- // Index Idx corresponds to Map[Idx-1].
- assert(Idx != 0 && !Map.empty() && Idx-1 < Map.size());
- return Map[Idx-1];
- }
-
- uint32_t insert(T Val) {
- // Linear search.
- auto F = llvm::find(Map, Val);
- if (F != Map.end())
- return F - Map.begin() + 1;
- Map.push_back(Val);
- return Map.size(); // Return actual_index + 1.
- }
-
- uint32_t find(T Val) const {
- auto F = llvm::find(Map, Val);
- assert(F != Map.end());
- return F - Map.begin() + 1;
- }
-
- uint32_t size() const { return Map.size(); }
-
- using const_iterator = typename std::vector<T>::const_iterator;
-
- const_iterator begin() const { return Map.begin(); }
- const_iterator end() const { return Map.end(); }
-
- private:
- std::vector<T> Map;
- };
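How the indexed set behaves in practice, as a small sketch using only the members defined above (illustrative, not part of the patch):

inline void indexedSetExample() {
  IndexedSet<unsigned> IS;
  uint32_t A = IS.insert(100);   // first value gets index 1 (0 means invalid)
  uint32_t B = IS.insert(200);   // index 2
  uint32_t C = IS.insert(100);   // duplicate value: the existing index 1
  assert(A == 1 && B == 2 && C == A);
  assert(IS.get(B) == 200 && IS.find(100) == A);
  (void)A; (void)B; (void)C;
}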
-
- struct RegisterRef {
- RegisterId Reg = 0;
- LaneBitmask Mask = LaneBitmask::getNone();
-
- RegisterRef() = default;
- explicit RegisterRef(RegisterId R, LaneBitmask M = LaneBitmask::getAll())
- : Reg(R), Mask(R != 0 ? M : LaneBitmask::getNone()) {}
-
- operator bool() const {
- return Reg != 0 && Mask.any();
- }
-
- bool operator== (const RegisterRef &RR) const {
- return Reg == RR.Reg && Mask == RR.Mask;
- }
-
- bool operator!= (const RegisterRef &RR) const {
- return !operator==(RR);
- }
-
- bool operator< (const RegisterRef &RR) const {
- return Reg < RR.Reg || (Reg == RR.Reg && Mask < RR.Mask);
- }
- };
-
-
- struct PhysicalRegisterInfo {
- PhysicalRegisterInfo(const TargetRegisterInfo &tri,
- const MachineFunction &mf);
-
- static bool isRegMaskId(RegisterId R) {
- return Register::isStackSlot(R);
- }
-
- RegisterId getRegMaskId(const uint32_t *RM) const {
- return Register::index2StackSlot(RegMasks.find(RM));
- }
-
- const uint32_t *getRegMaskBits(RegisterId R) const {
- return RegMasks.get(Register::stackSlot2Index(R));
- }
-
- RegisterRef normalize(RegisterRef RR) const;
-
- bool alias(RegisterRef RA, RegisterRef RB) const {
- if (!isRegMaskId(RA.Reg))
- return !isRegMaskId(RB.Reg) ? aliasRR(RA, RB) : aliasRM(RA, RB);
- return !isRegMaskId(RB.Reg) ? aliasRM(RB, RA) : aliasMM(RA, RB);
- }
-
- std::set<RegisterId> getAliasSet(RegisterId Reg) const;
-
- RegisterRef getRefForUnit(uint32_t U) const {
- return RegisterRef(UnitInfos[U].Reg, UnitInfos[U].Mask);
- }
-
- const BitVector &getMaskUnits(RegisterId MaskId) const {
- return MaskInfos[Register::stackSlot2Index(MaskId)].Units;
- }
-
- RegisterRef mapTo(RegisterRef RR, unsigned R) const;
- const TargetRegisterInfo &getTRI() const { return TRI; }
-
- private:
- struct RegInfo {
- const TargetRegisterClass *RegClass = nullptr;
- };
- struct UnitInfo {
- RegisterId Reg = 0;
- LaneBitmask Mask;
- };
- struct MaskInfo {
- BitVector Units;
- };
-
- const TargetRegisterInfo &TRI;
- IndexedSet<const uint32_t*> RegMasks;
- std::vector<RegInfo> RegInfos;
- std::vector<UnitInfo> UnitInfos;
- std::vector<MaskInfo> MaskInfos;
-
- bool aliasRR(RegisterRef RA, RegisterRef RB) const;
- bool aliasRM(RegisterRef RR, RegisterRef RM) const;
- bool aliasMM(RegisterRef RM, RegisterRef RN) const;
- };
-
- struct RegisterAggr {
- RegisterAggr(const PhysicalRegisterInfo &pri)
- : Units(pri.getTRI().getNumRegUnits()), PRI(pri) {}
- RegisterAggr(const RegisterAggr &RG) = default;
-
- bool empty() const { return Units.none(); }
- bool hasAliasOf(RegisterRef RR) const;
- bool hasCoverOf(RegisterRef RR) const;
-
- static bool isCoverOf(RegisterRef RA, RegisterRef RB,
- const PhysicalRegisterInfo &PRI) {
- return RegisterAggr(PRI).insert(RA).hasCoverOf(RB);
- }
-
- RegisterAggr &insert(RegisterRef RR);
- RegisterAggr &insert(const RegisterAggr &RG);
- RegisterAggr &intersect(RegisterRef RR);
- RegisterAggr &intersect(const RegisterAggr &RG);
- RegisterAggr &clear(RegisterRef RR);
- RegisterAggr &clear(const RegisterAggr &RG);
-
- RegisterRef intersectWith(RegisterRef RR) const;
- RegisterRef clearIn(RegisterRef RR) const;
- RegisterRef makeRegRef() const;
-
- void print(raw_ostream &OS) const;
-
- struct rr_iterator {
- using MapType = std::map<RegisterId, LaneBitmask>;
-
- private:
- MapType Masks;
- MapType::iterator Pos;
- unsigned Index;
- const RegisterAggr *Owner;
-
- public:
- rr_iterator(const RegisterAggr &RG, bool End);
-
- RegisterRef operator*() const {
- return RegisterRef(Pos->first, Pos->second);
- }
-
- rr_iterator &operator++() {
- ++Pos;
- ++Index;
- return *this;
- }
-
- bool operator==(const rr_iterator &I) const {
- assert(Owner == I.Owner);
- (void)Owner;
- return Index == I.Index;
- }
-
- bool operator!=(const rr_iterator &I) const {
- return !(*this == I);
- }
- };
-
- rr_iterator rr_begin() const {
- return rr_iterator(*this, false);
- }
- rr_iterator rr_end() const {
- return rr_iterator(*this, true);
- }
-
- private:
- BitVector Units;
- const PhysicalRegisterInfo &PRI;
- };
-
- // Optionally print the lane mask, if it is not ~0.
- struct PrintLaneMaskOpt {
- PrintLaneMaskOpt(LaneBitmask M) : Mask(M) {}
- LaneBitmask Mask;
- };
- raw_ostream &operator<< (raw_ostream &OS, const PrintLaneMaskOpt &P);
-
-} // end namespace rdf
-
-} // end namespace llvm
-
-#endif // LLVM_LIB_TARGET_HEXAGON_RDFREGISTERS_H
diff --git a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
index 8b8504978c75..639ab24b0817 100644
--- a/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
+++ b/llvm/lib/Target/Lanai/AsmParser/LanaiAsmParser.cpp
@@ -47,7 +47,7 @@ struct LanaiOperand;
class LanaiAsmParser : public MCTargetAsmParser {
// Parse operands
- std::unique_ptr<LanaiOperand> parseRegister();
+ std::unique_ptr<LanaiOperand> parseRegister(bool RestoreOnFailure = false);
std::unique_ptr<LanaiOperand> parseImmediate();
@@ -67,6 +67,8 @@ class LanaiAsmParser : public MCTargetAsmParser {
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseRegister(unsigned &RegNum, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -657,7 +659,7 @@ bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
switch (MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm)) {
case Match_Success:
- Out.EmitInstruction(Inst, SubtargetInfo);
+ Out.emitInstruction(Inst, SubtargetInfo);
Opcode = Inst.getOpcode();
return false;
case Match_MissingFeature:
@@ -687,21 +689,30 @@ bool LanaiAsmParser::MatchAndEmitInstruction(SMLoc IdLoc, unsigned &Opcode,
// backwards compatible with GCC and the different ways inline assembly is
// handled.
// TODO: see if there isn't a better way to do this.
-std::unique_ptr<LanaiOperand> LanaiAsmParser::parseRegister() {
+std::unique_ptr<LanaiOperand>
+LanaiAsmParser::parseRegister(bool RestoreOnFailure) {
SMLoc Start = Parser.getTok().getLoc();
SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ Optional<AsmToken> PercentTok;
unsigned RegNum;
// Eat the '%'.
- if (Lexer.getKind() == AsmToken::Percent)
+ if (Lexer.getKind() == AsmToken::Percent) {
+ PercentTok = Parser.getTok();
Parser.Lex();
+ }
if (Lexer.getKind() == AsmToken::Identifier) {
RegNum = MatchRegisterName(Lexer.getTok().getIdentifier());
- if (RegNum == 0)
+ if (RegNum == 0) {
+ if (PercentTok.hasValue() && RestoreOnFailure)
+ Lexer.UnLex(PercentTok.getValue());
return nullptr;
+ }
Parser.Lex(); // Eat identifier token
return LanaiOperand::createReg(RegNum, Start, End);
}
+ if (PercentTok.hasValue() && RestoreOnFailure)
+ Lexer.UnLex(PercentTok.getValue());
return nullptr;
}
@@ -710,12 +721,25 @@ bool LanaiAsmParser::ParseRegister(unsigned &RegNum, SMLoc &StartLoc,
const AsmToken &Tok = getParser().getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
- std::unique_ptr<LanaiOperand> Op = parseRegister();
+ std::unique_ptr<LanaiOperand> Op = parseRegister(/*RestoreOnFailure=*/false);
if (Op != nullptr)
RegNum = Op->getReg();
return (Op == nullptr);
}
+OperandMatchResultTy LanaiAsmParser::tryParseRegister(unsigned &RegNum,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ const AsmToken &Tok = getParser().getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ std::unique_ptr<LanaiOperand> Op = parseRegister(/*RestoreOnFailure=*/true);
+ if (Op == nullptr)
+ return MatchOperand_NoMatch;
+ RegNum = Op->getReg();
+ return MatchOperand_Success;
+}
+
std::unique_ptr<LanaiOperand> LanaiAsmParser::parseIdentifier() {
SMLoc Start = Parser.getTok().getLoc();
SMLoc End = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
diff --git a/llvm/lib/Target/Lanai/Lanai.h b/llvm/lib/Target/Lanai/Lanai.h
index 2f06ea91ab03..2bd266b1b96e 100644
--- a/llvm/lib/Target/Lanai/Lanai.h
+++ b/llvm/lib/Target/Lanai/Lanai.h
@@ -19,9 +19,6 @@
namespace llvm {
class FunctionPass;
class LanaiTargetMachine;
-class MachineFunctionPass;
-class TargetMachine;
-class formatted_raw_ostream;
// createLanaiISelDag - This pass converts a legalized DAG into a
// Lanai-specific DAG, ready for instruction scheduling.
diff --git a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
index c13ee08e1213..6bac7c75853d 100644
--- a/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
+++ b/llvm/lib/Target/Lanai/LanaiAsmPrinter.cpp
@@ -51,7 +51,7 @@ public:
void printOperand(const MachineInstr *MI, int OpNum, raw_ostream &O);
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const override;
@@ -155,7 +155,7 @@ void LanaiAsmPrinter::emitCallInstruction(const MachineInstr *MI) {
// Insert save rca instruction immediately before the call.
// TODO: We should generate a pc-relative mov instruction here instead
// of pc + 16 (should be mov .+16 %rca).
- OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_I_LO)
+ OutStreamer->emitInstruction(MCInstBuilder(Lanai::ADD_I_LO)
.addReg(Lanai::RCA)
.addReg(Lanai::PC)
.addImm(16),
@@ -163,7 +163,7 @@ void LanaiAsmPrinter::emitCallInstruction(const MachineInstr *MI) {
// Push rca onto the stack.
// st %rca, [--%sp]
- OutStreamer->EmitInstruction(MCInstBuilder(Lanai::SW_RI)
+ OutStreamer->emitInstruction(MCInstBuilder(Lanai::SW_RI)
.addReg(Lanai::RCA)
.addReg(Lanai::SP)
.addImm(-4)
@@ -175,9 +175,9 @@ void LanaiAsmPrinter::emitCallInstruction(const MachineInstr *MI) {
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
TmpInst.setOpcode(Lanai::BT);
- OutStreamer->EmitInstruction(TmpInst, STI);
+ OutStreamer->emitInstruction(TmpInst, STI);
} else {
- OutStreamer->EmitInstruction(MCInstBuilder(Lanai::ADD_R)
+ OutStreamer->emitInstruction(MCInstBuilder(Lanai::ADD_R)
.addReg(Lanai::PC)
.addReg(MI->getOperand(0).getReg())
.addReg(Lanai::R0)
@@ -191,10 +191,10 @@ void LanaiAsmPrinter::customEmitInstruction(const MachineInstr *MI) {
MCSubtargetInfo STI = getSubtargetInfo();
MCInst TmpInst;
MCInstLowering.Lower(MI, TmpInst);
- OutStreamer->EmitInstruction(TmpInst, STI);
+ OutStreamer->emitInstruction(TmpInst, STI);
}
-void LanaiAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void LanaiAsmPrinter::emitInstruction(const MachineInstr *MI) {
MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
@@ -211,7 +211,7 @@ void LanaiAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// isBlockOnlyReachableByFallthrough - Return true if the basic block has
// exactly one predecessor and the control transfer mechanism between
// the predecessor and this block is a fall-through.
-// FIXME: could the overridden cases be handled in AnalyzeBranch?
+// FIXME: could the overridden cases be handled in analyzeBranch?
bool LanaiAsmPrinter::isBlockOnlyReachableByFallthrough(
const MachineBasicBlock *MBB) const {
// The predecessor has to be immediately before this block.
diff --git a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
index eddc2b8e61f7..3c84ed057fd1 100644
--- a/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiFrameLowering.cpp
@@ -32,8 +32,8 @@ void LanaiFrameLowering::determineFrameLayout(MachineFunction &MF) const {
unsigned FrameSize = MFI.getStackSize();
// Get the alignment.
- unsigned StackAlign = LRI->needsStackRealignment(MF) ? MFI.getMaxAlignment()
- : getStackAlignment();
+ Align StackAlign =
+ LRI->needsStackRealignment(MF) ? MFI.getMaxAlign() : getStackAlign();
// Get the maximum call frame size of all the calls.
unsigned MaxCallFrameSize = MFI.getMaxCallFrameSize();
diff --git a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
index 6fa0c93d4a05..32ccf7172594 100644
--- a/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
+++ b/llvm/lib/Target/Lanai/LanaiISelLowering.cpp
@@ -388,7 +388,7 @@ static bool CC_Lanai32_VarArg(unsigned ValNo, MVT ValVT, MVT LocVT,
}
// VarArgs get passed on stack
- unsigned Offset = State.AllocateStack(4, 4);
+ unsigned Offset = State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
return false;
}
@@ -633,13 +633,13 @@ SDValue LanaiTargetLowering::LowerCCCCallTo(
SDValue Arg = OutVals[I];
unsigned Size = Flags.getByValSize();
- unsigned Align = Flags.getByValAlign();
+ Align Alignment = Flags.getNonZeroByValAlign();
- int FI = MFI.CreateStackObject(Size, Align, false);
+ int FI = MFI.CreateStackObject(Size, Alignment, false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, DL, MVT::i32);
- Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
+ Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
/*IsVolatile=*/false,
/*AlwaysInline=*/false,
/*isTailCall=*/false, MachinePointerInfo(),
@@ -1136,7 +1136,7 @@ SDValue LanaiTargetLowering::LowerConstantPool(SDValue Op,
if (getTargetMachine().getCodeModel() == CodeModel::Small ||
TLOF->isConstantInSmallSection(DAG.getDataLayout(), C)) {
SDValue Small = DAG.getTargetConstantPool(
- C, MVT::i32, N->getAlignment(), N->getOffset(), LanaiII::MO_NO_FLAG);
+ C, MVT::i32, N->getAlign(), N->getOffset(), LanaiII::MO_NO_FLAG);
return DAG.getNode(ISD::OR, DL, MVT::i32,
DAG.getRegister(Lanai::R0, MVT::i32),
DAG.getNode(LanaiISD::SMALL, DL, MVT::i32, Small));
@@ -1144,9 +1144,9 @@ SDValue LanaiTargetLowering::LowerConstantPool(SDValue Op,
uint8_t OpFlagHi = LanaiII::MO_ABS_HI;
uint8_t OpFlagLo = LanaiII::MO_ABS_LO;
- SDValue Hi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+ SDValue Hi = DAG.getTargetConstantPool(C, MVT::i32, N->getAlign(),
N->getOffset(), OpFlagHi);
- SDValue Lo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(),
+ SDValue Lo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlign(),
N->getOffset(), OpFlagLo);
Hi = DAG.getNode(LanaiISD::HI, DL, MVT::i32, Hi);
Lo = DAG.getNode(LanaiISD::LO, DL, MVT::i32, Lo);
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
index 4ce72f9621ad..c82142970357 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.cpp
@@ -48,7 +48,7 @@ void LanaiInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void LanaiInstrInfo::storeRegToStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
- unsigned SourceRegister, bool IsKill, int FrameIndex,
+ Register SourceRegister, bool IsKill, int FrameIndex,
const TargetRegisterClass *RegisterClass,
const TargetRegisterInfo * /*RegisterInfo*/) const {
DebugLoc DL;
@@ -68,7 +68,7 @@ void LanaiInstrInfo::storeRegToStackSlot(
void LanaiInstrInfo::loadRegFromStackSlot(
MachineBasicBlock &MBB, MachineBasicBlock::iterator Position,
- unsigned DestinationRegister, int FrameIndex,
+ Register DestinationRegister, int FrameIndex,
const TargetRegisterClass *RegisterClass,
const TargetRegisterInfo * /*RegisterInfo*/) const {
DebugLoc DL;
@@ -174,8 +174,8 @@ LanaiInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
return makeArrayRef(TargetFlags);
}
-bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
switch (MI.getOpcode()) {
default:
@@ -183,7 +183,7 @@ bool LanaiInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
case Lanai::SFSUB_F_RI_LO:
case Lanai::SFSUB_F_RI_HI:
SrcReg = MI.getOperand(0).getReg();
- SrcReg2 = 0;
+ SrcReg2 = Register();
CmpMask = ~0;
CmpValue = MI.getOperand(1).getImm();
return true;
@@ -281,7 +281,7 @@ inline static unsigned flagSettingOpcodeVariant(unsigned OldOpcode) {
}
bool LanaiInstrInfo::optimizeCompareInstr(
- MachineInstr &CmpInstr, unsigned SrcReg, unsigned SrcReg2, int /*CmpMask*/,
+ MachineInstr &CmpInstr, Register SrcReg, Register SrcReg2, int /*CmpMask*/,
int CmpValue, const MachineRegisterInfo *MRI) const {
// Get the unique definition of SrcReg.
MachineInstr *MI = MRI->getUniqueVRegDef(SrcReg);
@@ -454,9 +454,9 @@ bool LanaiInstrInfo::analyzeSelect(const MachineInstr &MI,
// Identify instructions that can be folded into a SELECT instruction, and
// return the defining instruction.
-static MachineInstr *canFoldIntoSelect(unsigned Reg,
+static MachineInstr *canFoldIntoSelect(Register Reg,
const MachineRegisterInfo &MRI) {
- if (!Register::isVirtualRegister(Reg))
+ if (!Reg.isVirtual())
return nullptr;
if (!MRI.hasOneNonDBGUse(Reg))
return nullptr;
@@ -795,10 +795,10 @@ bool LanaiInstrInfo::getMemOperandWithOffsetWidth(
return true;
}
-bool LanaiInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const {
+bool LanaiInstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const {
switch (LdSt.getOpcode()) {
default:
return false;
@@ -811,7 +811,11 @@ bool LanaiInstrInfo::getMemOperandWithOffset(const MachineInstr &LdSt,
case Lanai::STH_RI:
case Lanai::LDBs_RI:
case Lanai::LDBz_RI:
- unsigned Width;
- return getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI);
+ const MachineOperand *BaseOp;
+ OffsetIsScalable = false;
+ if (!getMemOperandWithOffsetWidth(LdSt, BaseOp, Offset, Width, TRI))
+ return false;
+ BaseOps.push_back(BaseOp);
+ return true;
}
}
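(Illustration, not part of the patch: the LanaiInstrInfo hunk above replaces the single-base getMemOperandWithOffset hook with getMemOperandsWithOffsetWidth, which fills a vector of base operands and reports whether the offset is scalable. The sketch below is a hedged example of how generic code might query the new hook; the helper name and the "single fixed-offset base" assumption are illustrative only, not taken from the commit.)

#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/MachineInstr.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
using namespace llvm;

// Returns true if MI is a load/store the target can describe as one register
// base plus a fixed (non-scalable) offset -- the shape the Lanai override
// above always produces when it succeeds.
static bool hasSimpleFixedBase(const TargetInstrInfo &TII,
                               const TargetRegisterInfo *TRI,
                               const MachineInstr &MI) {
  SmallVector<const MachineOperand *, 1> BaseOps;
  int64_t Offset;
  bool OffsetIsScalable;
  unsigned Width;
  if (!TII.getMemOperandsWithOffsetWidth(MI, BaseOps, Offset, OffsetIsScalable,
                                         Width, TRI))
    return false;
  return BaseOps.size() == 1 && !OffsetIsScalable;
}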
diff --git a/llvm/lib/Target/Lanai/LanaiInstrInfo.h b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
index c7741dd7437f..44c1e629a8e6 100644
--- a/llvm/lib/Target/Lanai/LanaiInstrInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiInstrInfo.h
@@ -54,23 +54,24 @@ public:
void
storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Position,
- unsigned SourceRegister, bool IsKill, int FrameIndex,
+ Register SourceRegister, bool IsKill, int FrameIndex,
const TargetRegisterClass *RegisterClass,
const TargetRegisterInfo *RegisterInfo) const override;
void
loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator Position,
- unsigned DestinationRegister, int FrameIndex,
+ Register DestinationRegister, int FrameIndex,
const TargetRegisterClass *RegisterClass,
const TargetRegisterInfo *RegisterInfo) const override;
bool expandPostRAPseudo(MachineInstr &MI) const override;
- bool getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const override;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
bool getMemOperandWithOffsetWidth(const MachineInstr &LdSt,
const MachineOperand *&BaseOp,
@@ -94,15 +95,15 @@ public:
// For a comparison instruction, return the source registers in SrcReg and
// SrcReg2 if having two register operands, and the value it compares against
// in CmpValue. Return true if the comparison instruction can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
// See if the comparison instruction can be converted into something more
// efficient. E.g., on Lanai register-register instructions can set the flag
// register, obviating the need for a separate compare.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
// Analyze the given select instruction, returning true if it cannot be
diff --git a/llvm/lib/Target/Lanai/LanaiMCInstLower.h b/llvm/lib/Target/Lanai/LanaiMCInstLower.h
index 00d3ebb05045..6323319fae43 100644
--- a/llvm/lib/Target/Lanai/LanaiMCInstLower.h
+++ b/llvm/lib/Target/Lanai/LanaiMCInstLower.h
@@ -18,9 +18,7 @@ class MCInst;
class MCOperand;
class MCSymbol;
class MachineInstr;
-class MachineModuleInfoMachO;
class MachineOperand;
-class Mangler;
// LanaiMCInstLower - This class is used to lower an MachineInstr
// into an MCInst.
diff --git a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
index 7b4e0750ba08..eeef1d919925 100644
--- a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.cpp
@@ -11,12 +11,3 @@
using namespace llvm;
void LanaiMachineFunctionInfo::anchor() {}
-
-unsigned LanaiMachineFunctionInfo::getGlobalBaseReg() {
- // Return if it has already been initialized.
- if (GlobalBaseReg)
- return GlobalBaseReg;
-
- return GlobalBaseReg =
- MF.getRegInfo().createVirtualRegister(&Lanai::GPRRegClass);
-}
diff --git a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
index 2c97c619c246..de712637b5a4 100644
--- a/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiMachineFunctionInfo.h
@@ -24,29 +24,25 @@ namespace llvm {
class LanaiMachineFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
- MachineFunction &MF;
-
// SRetReturnReg - Lanai ABI require that sret lowering includes
// returning the value of the returned struct in a register. This field
// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg;
+ Register SRetReturnReg;
// GlobalBaseReg - keeps track of the virtual register initialized for
// use as the global base register. This is used for PIC in some PIC
// relocation models.
- unsigned GlobalBaseReg;
+ Register GlobalBaseReg;
// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex;
public:
explicit LanaiMachineFunctionInfo(MachineFunction &MF)
- : MF(MF), SRetReturnReg(0), GlobalBaseReg(0), VarArgsFrameIndex(0) {}
-
- unsigned getSRetReturnReg() const { return SRetReturnReg; }
- void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+ : VarArgsFrameIndex(0) {}
- unsigned getGlobalBaseReg();
+ Register getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; }
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
diff --git a/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp b/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
index 7c28debb94dd..64f87ae5f963 100644
--- a/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiRegisterInfo.cpp
@@ -66,11 +66,6 @@ bool LanaiRegisterInfo::requiresRegisterScavenging(
return true;
}
-bool LanaiRegisterInfo::trackLivenessAfterRegAlloc(
- const MachineFunction & /*MF*/) const {
- return true;
-}
-
static bool isALUArithLoOpcode(unsigned Opcode) {
switch (Opcode) {
case Lanai::ADD_I_LO:
diff --git a/llvm/lib/Target/Lanai/LanaiRegisterInfo.h b/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
index 4e4da619d366..cbc95b273e1d 100644
--- a/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiRegisterInfo.h
@@ -34,8 +34,6 @@ struct LanaiRegisterInfo : public LanaiGenRegisterInfo {
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
diff --git a/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp b/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
index dff87a3e264d..307619c04481 100644
--- a/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.cpp
@@ -20,7 +20,7 @@ namespace llvm {
SDValue LanaiSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG & /*DAG*/, const SDLoc & /*dl*/, SDValue /*Chain*/,
- SDValue /*Dst*/, SDValue /*Src*/, SDValue Size, unsigned /*Align*/,
+ SDValue /*Dst*/, SDValue /*Src*/, SDValue Size, Align /*Alignment*/,
bool /*isVolatile*/, bool /*AlwaysInline*/,
MachinePointerInfo /*DstPtrInfo*/,
MachinePointerInfo /*SrcPtrInfo*/) const {
diff --git a/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h b/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
index c5650a7c1f53..8355168a7396 100644
--- a/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiSelectionDAGInfo.h
@@ -24,8 +24,8 @@ public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
};
diff --git a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
index 9a872c789bcc..ebf91e08fbc8 100644
--- a/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
+++ b/llvm/lib/Target/Lanai/LanaiSubtarget.cpp
@@ -23,7 +23,7 @@
using namespace llvm;
void LanaiSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "generic";
diff --git a/llvm/lib/Target/Lanai/LanaiTargetMachine.h b/llvm/lib/Target/Lanai/LanaiTargetMachine.h
index d2ac40007e24..fb2bc0644fe8 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetMachine.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetMachine.h
@@ -22,7 +22,6 @@
#include "llvm/Target/TargetMachine.h"
namespace llvm {
-class formatted_raw_ostream;
class LanaiTargetMachine : public LLVMTargetMachine {
LanaiSubtarget Subtarget;
diff --git a/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp b/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
index b0f7c090bb8e..a421f3156153 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
+++ b/llvm/lib/Target/Lanai/LanaiTargetObjectFile.cpp
@@ -28,7 +28,6 @@ static cl::opt<unsigned> SSThreshold(
void LanaiTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
SmallDataSection = getContext().getELFSection(
".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
@@ -117,13 +116,13 @@ bool LanaiTargetObjectFile::isConstantInSmallSection(const DataLayout &DL,
return isInSmallSection(DL.getTypeAllocSize(CN->getType()));
}
-MCSection *LanaiTargetObjectFile::getSectionForConstant(const DataLayout &DL,
- SectionKind Kind,
- const Constant *C,
- unsigned &Align) const {
+MCSection *LanaiTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ Align &Alignment) const {
if (isConstantInSmallSection(DL, C))
return SmallDataSection;
// Otherwise, we work the same as ELF.
- return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
+ Alignment);
}
diff --git a/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h b/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
index 938a1e675b6a..404349465dbc 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetObjectFile.h
@@ -12,7 +12,6 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
namespace llvm {
-class LanaiTargetMachine;
class LanaiTargetObjectFile : public TargetLoweringObjectFileELF {
MCSection *SmallDataSection;
MCSection *SmallBSSSection;
@@ -38,7 +37,7 @@ public:
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C,
- unsigned &Align) const override;
+ Align &Alignment) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
index a22d3a34f98c..7366d5059c9f 100644
--- a/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
+++ b/llvm/lib/Target/Lanai/LanaiTargetTransformInfo.h
@@ -49,7 +49,7 @@ public:
return TTI::PSK_Software;
}
- int getIntImmCost(const APInt &Imm, Type *Ty) {
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
if (Imm == 0)
return TTI::TCC_Free;
@@ -66,17 +66,19 @@ public:
return 4 * TTI::TCC_Basic;
}
- int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty) {
- return getIntImmCost(Imm, Ty);
+ int getIntImmCostInst(unsigned Opc, unsigned Idx, const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
+ return getIntImmCost(Imm, Ty, CostKind);
}
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty) {
- return getIntImmCost(Imm, Ty);
+ Type *Ty, TTI::TargetCostKind CostKind) {
+ return getIntImmCost(Imm, Ty, CostKind);
}
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -87,7 +89,8 @@ public:
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
case ISD::MUL:
case ISD::SDIV:
@@ -98,7 +101,8 @@ public:
// instruction cost was arbitrarily chosen to reduce the desirability
// of emitting arithmetic instructions that are emulated in software.
// TODO: Investigate the performance impact given specialized lowerings.
- return 64 * BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return 64 * BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
}
}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
index a6ce3d5eb4ff..0fb27a926003 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiAsmBackend.cpp
@@ -74,10 +74,6 @@ public:
return false;
}
- void relaxInstruction(const MCInst & /*Inst*/,
- const MCSubtargetInfo & /*STI*/,
- MCInst & /*Res*/) const override {}
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
index ccc413995917..7027d18126bb 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.cpp
@@ -141,7 +141,7 @@ void LanaiInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annotation,
const MCSubtargetInfo & /*STI*/,
raw_ostream &OS) {
- if (!printAlias(MI, OS) && !printAliasInstr(MI, OS))
+ if (!printAlias(MI, OS) && !printAliasInstr(MI, Address, OS))
printInstruction(MI, Address, OS);
printAnnotation(OS, Annotation);
}
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
index a71a9497c691..ce6df2969d73 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiInstPrinter.h
@@ -44,9 +44,10 @@ public:
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
void printRegName(raw_ostream &OS, unsigned RegNo) const override;
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
index f1c174897047..d8c7bd15aacb 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCAsmInfo.cpp
@@ -28,9 +28,6 @@ LanaiMCAsmInfo::LanaiMCAsmInfo(const Triple & /*TheTriple*/,
// Lanai assembly requires ".section" before ".bss"
UsesELFSectionDirectiveForBSS = true;
- // Use the integrated assembler instead of system one.
- UseIntegratedAssembler = true;
-
// Use '!' as comment string to correspond with old toolchain.
CommentString = "!";
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
index 9de15bf61c8c..2ff893273c92 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.cpp
@@ -52,7 +52,7 @@ static MCRegisterInfo *createLanaiMCRegisterInfo(const Triple & /*TT*/) {
static MCSubtargetInfo *
createLanaiMCSubtargetInfo(const Triple &TT, StringRef CPU, StringRef FS) {
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "generic";
diff --git a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
index cf66d3226659..651ed36cdc24 100644
--- a/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
+++ b/llvm/lib/Target/Lanai/MCTargetDesc/LanaiMCTargetDesc.h
@@ -22,14 +22,9 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCInstrAnalysis;
class MCObjectTargetWriter;
-class MCRelocationInfo;
class MCSubtargetInfo;
class Target;
-class Triple;
-class StringRef;
-class raw_pwrite_stream;
MCCodeEmitter *createLanaiMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
index 0995e80a0a09..9529b5e802d5 100644
--- a/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
+++ b/llvm/lib/Target/MSP430/AsmParser/MSP430AsmParser.cpp
@@ -46,6 +46,8 @@ class MSP430AsmParser : public MCTargetAsmParser {
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -154,12 +156,12 @@ public:
addExprOperand(Inst, Mem.Offset);
}
- bool isReg() const { return Kind == k_Reg; }
- bool isImm() const { return Kind == k_Imm; }
- bool isToken() const { return Kind == k_Tok; }
- bool isMem() const { return Kind == k_Mem; }
- bool isIndReg() const { return Kind == k_IndReg; }
- bool isPostIndReg() const { return Kind == k_PostIndReg; }
+ bool isReg() const override { return Kind == k_Reg; }
+ bool isImm() const override { return Kind == k_Imm; }
+ bool isToken() const override { return Kind == k_Tok; }
+ bool isMem() const override { return Kind == k_Mem; }
+ bool isIndReg() const { return Kind == k_IndReg; }
+ bool isPostIndReg() const { return Kind == k_PostIndReg; }
bool isCGImm() const {
if (Kind != k_Imm)
@@ -180,7 +182,7 @@ public:
return Tok;
}
- unsigned getReg() const {
+ unsigned getReg() const override {
assert(Kind == k_Reg && "Invalid access!");
return Reg;
}
@@ -220,10 +222,10 @@ public:
return std::make_unique<MSP430Operand>(k_PostIndReg, RegNum, S, E);
}
- SMLoc getStartLoc() const { return Start; }
- SMLoc getEndLoc() const { return End; }
+ SMLoc getStartLoc() const override { return Start; }
+ SMLoc getEndLoc() const override { return End; }
- virtual void print(raw_ostream &O) const {
+ void print(raw_ostream &O) const override {
switch (Kind) {
case k_Tok:
O << "Token " << Tok;
@@ -261,7 +263,7 @@ bool MSP430AsmParser::MatchAndEmitInstruction(SMLoc Loc, unsigned &Opcode,
switch (MatchResult) {
case Match_Success:
Inst.setLoc(Loc);
- Out.EmitInstruction(Inst, STI);
+ Out.emitInstruction(Inst, STI);
return false;
case Match_MnemonicFail:
return Error(Loc, "invalid instruction mnemonic");
@@ -288,13 +290,28 @@ static unsigned MatchRegisterAltName(StringRef Name);
bool MSP430AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ switch (tryParseRegister(RegNo, StartLoc, EndLoc)) {
+ case MatchOperand_ParseFail:
+ return Error(StartLoc, "invalid register name");
+ case MatchOperand_Success:
+ return false;
+ case MatchOperand_NoMatch:
+ return true;
+ }
+
+ llvm_unreachable("unknown match result type");
+}
+
+OperandMatchResultTy MSP430AsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
if (getLexer().getKind() == AsmToken::Identifier) {
auto Name = getLexer().getTok().getIdentifier().lower();
RegNo = MatchRegisterName(Name);
if (RegNo == MSP430::NoRegister) {
RegNo = MatchRegisterAltName(Name);
if (RegNo == MSP430::NoRegister)
- return true;
+ return MatchOperand_NoMatch;
}
AsmToken const &T = getParser().getTok();
@@ -302,10 +319,10 @@ bool MSP430AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
EndLoc = T.getEndLoc();
getLexer().Lex(); // eat register token
- return false;
+ return MatchOperand_Success;
}
- return Error(StartLoc, "invalid register name");
+ return MatchOperand_ParseFail;
}
bool MSP430AsmParser::parseJccInstruction(ParseInstructionInfo &Info,
@@ -414,7 +431,7 @@ bool MSP430AsmParser::ParseDirectiveRefSym(AsmToken DirectiveID) {
return TokError("expected identifier in directive");
MCSymbol *Sym = getContext().getOrCreateSymbol(Name);
- getStreamer().EmitSymbolAttribute(Sym, MCSA_Global);
+ getStreamer().emitSymbolAttribute(Sym, MCSA_Global);
return false;
}
@@ -523,7 +540,7 @@ bool MSP430AsmParser::ParseLiteralValues(unsigned Size, SMLoc L) {
const MCExpr *Value;
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitValue(Value, Size, L);
+ getParser().getStreamer().emitValue(Value, Size, L);
return false;
};
return (parseMany(parseOne));
@@ -545,7 +562,7 @@ static unsigned convertGR16ToGR8(unsigned Reg) {
case MSP430::SP: return MSP430::SPB;
case MSP430::SR: return MSP430::SRB;
case MSP430::CG: return MSP430::CGB;
- case MSP430::FP: return MSP430::FPB;
+ case MSP430::R4: return MSP430::R4B;
case MSP430::R5: return MSP430::R5B;
case MSP430::R6: return MSP430::R6B;
case MSP430::R7: return MSP430::R7B;
diff --git a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
index 6aa76156bf14..d2902189ec40 100644
--- a/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
+++ b/llvm/lib/Target/MSP430/Disassembler/MSP430Disassembler.cpp
@@ -65,7 +65,7 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeMSP430Disassembler() {
static const unsigned GR8DecoderTable[] = {
MSP430::PCB, MSP430::SPB, MSP430::SRB, MSP430::CGB,
- MSP430::FPB, MSP430::R5B, MSP430::R6B, MSP430::R7B,
+ MSP430::R4B, MSP430::R5B, MSP430::R6B, MSP430::R7B,
MSP430::R8B, MSP430::R9B, MSP430::R10B, MSP430::R11B,
MSP430::R12B, MSP430::R13B, MSP430::R14B, MSP430::R15B
};
@@ -83,7 +83,7 @@ static DecodeStatus DecodeGR8RegisterClass(MCInst &MI, uint64_t RegNo,
static const unsigned GR16DecoderTable[] = {
MSP430::PC, MSP430::SP, MSP430::SR, MSP430::CG,
- MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
+ MSP430::R4, MSP430::R5, MSP430::R6, MSP430::R7,
MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15
};
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
index 365e5da74de0..958212dc77c9 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430AsmBackend.cpp
@@ -95,9 +95,6 @@ public:
return false;
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
index 4e054f85ccc3..87ee312424c8 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430ELFStreamer.cpp
@@ -43,26 +43,26 @@ MSP430TargetELFStreamer::MSP430TargetELFStreamer(MCStreamer &S,
Streamer.SwitchSection(AttributeSection);
// Format version.
- Streamer.EmitIntValue(0x41, 1);
+ Streamer.emitInt8(0x41);
// Subsection length.
- Streamer.EmitIntValue(22, 4);
+ Streamer.emitInt32(22);
// Vendor name string, zero-terminated.
- Streamer.EmitBytes("mspabi");
- Streamer.EmitIntValue(0, 1);
+ Streamer.emitBytes("mspabi");
+ Streamer.emitInt8(0);
// Attribute vector scope tag. 1 stands for the entire file.
- Streamer.EmitIntValue(1, 1);
+ Streamer.emitInt8(1);
// Attribute vector length.
- Streamer.EmitIntValue(11, 4);
+ Streamer.emitInt32(11);
// OFBA_MSPABI_Tag_ISA(4) = 1, MSP430
- Streamer.EmitIntValue(4, 1);
- Streamer.EmitIntValue(1, 1);
+ Streamer.emitInt8(4);
+ Streamer.emitInt8(1);
// OFBA_MSPABI_Tag_Code_Model(6) = 1, Small
- Streamer.EmitIntValue(6, 1);
- Streamer.EmitIntValue(1, 1);
+ Streamer.emitInt8(6);
+ Streamer.emitInt8(1);
// OFBA_MSPABI_Tag_Data_Model(8) = 1, Small
- Streamer.EmitIntValue(8, 1);
- Streamer.EmitIntValue(1, 1);
+ Streamer.emitInt8(8);
+ Streamer.emitInt8(1);
}
MCELFStreamer &MSP430TargetELFStreamer::getStreamer() {
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
index 0c6da5a35c68..420893f65d5b 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
void MSP430InstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!printAliasInstr(MI, O))
+ if (!printAliasInstr(MI, Address, O))
printInstruction(MI, Address, O);
printAnnotation(O, Annot);
}
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
index 200dc0e6db60..6a6b07f2eba0 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430InstPrinter.h
@@ -27,9 +27,10 @@ namespace llvm {
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, raw_ostream &O);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
private:
diff --git a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
index cfdc44ada771..de07b47096d3 100644
--- a/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
+++ b/llvm/lib/Target/MSP430/MCTargetDesc/MSP430MCAsmInfo.cpp
@@ -24,5 +24,6 @@ MSP430MCAsmInfo::MSP430MCAsmInfo(const Triple &TT,
AlignmentIsInBytes = false;
UsesELFSectionDirectiveForBSS = true;
- UseIntegratedAssembler = true;
+
+ SupportsDebugInformation = true;
}
diff --git a/llvm/lib/Target/MSP430/MSP430.h b/llvm/lib/Target/MSP430/MSP430.h
index 67f35b8034d9..34f0a37bced9 100644
--- a/llvm/lib/Target/MSP430/MSP430.h
+++ b/llvm/lib/Target/MSP430/MSP430.h
@@ -36,7 +36,6 @@ namespace MSP430CC {
namespace llvm {
class MSP430TargetMachine;
class FunctionPass;
- class formatted_raw_ostream;
FunctionPass *createMSP430ISelDag(MSP430TargetMachine &TM,
CodeGenOpt::Level OptLevel);
diff --git a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
index 2f871b959a71..459188434f2c 100644
--- a/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
+++ b/llvm/lib/Target/MSP430/MSP430AsmPrinter.cpp
@@ -57,7 +57,7 @@ namespace {
const char *ExtraCode, raw_ostream &O) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
void EmitInterruptVectorSection(MachineFunction &ISR);
};
@@ -148,7 +148,7 @@ bool MSP430AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
}
//===----------------------------------------------------------------------===//
-void MSP430AsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void MSP430AsmPrinter::emitInstruction(const MachineInstr *MI) {
MSP430MCInstLower MCInstLowering(OutContext, *this);
MCInst TmpInst;
@@ -169,7 +169,7 @@ void MSP430AsmPrinter::EmitInterruptVectorSection(MachineFunction &ISR) {
OutStreamer->SwitchSection(IV);
const MCSymbol *FunctionSymbol = getSymbol(F);
- OutStreamer->EmitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
+ OutStreamer->emitSymbolValue(FunctionSymbol, TM.getProgramPointerSize());
OutStreamer->SwitchSection(Cur);
}
@@ -180,7 +180,7 @@ bool MSP430AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
SetupMachineFunction(MF);
- EmitFunctionBody();
+ emitFunctionBody();
return false;
}
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
index de60ad9bd7e6..4be8d0760e68 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp
@@ -64,16 +64,16 @@ void MSP430FrameLowering::emitPrologue(MachineFunction &MF,
// Save FP into the appropriate stack slot...
BuildMI(MBB, MBBI, DL, TII.get(MSP430::PUSH16r))
- .addReg(MSP430::FP, RegState::Kill);
+ .addReg(MSP430::R4, RegState::Kill);
// Update FP with the new base value...
- BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::FP)
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::MOV16rr), MSP430::R4)
.addReg(MSP430::SP);
// Mark the FramePtr as live-in in every block except the entry.
for (MachineFunction::iterator I = std::next(MF.begin()), E = MF.end();
I != E; ++I)
- I->addLiveIn(MSP430::FP);
+ I->addLiveIn(MSP430::R4);
} else
NumBytes = StackSize - MSP430FI->getCalleeSavedFrameSize();
@@ -132,7 +132,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
NumBytes = FrameSize - CSSize;
// pop FP.
- BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::FP);
+ BuildMI(MBB, MBBI, DL, TII.get(MSP430::POP16r), MSP430::R4);
} else
NumBytes = StackSize - CSSize;
@@ -154,7 +154,7 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
if (MFI.hasVarSizedObjects()) {
BuildMI(MBB, MBBI, DL,
- TII.get(MSP430::MOV16rr), MSP430::SP).addReg(MSP430::FP);
+ TII.get(MSP430::MOV16rr), MSP430::SP).addReg(MSP430::R4);
if (CSSize) {
MachineInstr *MI =
BuildMI(MBB, MBBI, DL,
@@ -176,11 +176,9 @@ void MSP430FrameLowering::emitEpilogue(MachineFunction &MF,
}
// FIXME: Can we eleminate these in favour of generic code?
-bool
-MSP430FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool MSP430FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -202,11 +200,9 @@ MSP430FrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool
-MSP430FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool MSP430FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -227,8 +223,6 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
const MSP430InstrInfo &TII =
*static_cast<const MSP430InstrInfo *>(MF.getSubtarget().getInstrInfo());
- unsigned StackAlign = getStackAlignment();
-
if (!hasReservedCallFrame(MF)) {
// If the stack pointer can be changed after prologue, turn the
// adjcallstackup instruction into a 'sub SP, <amt>' and the
@@ -240,7 +234,7 @@ MachineBasicBlock::iterator MSP430FrameLowering::eliminateCallFramePseudoInstr(
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- Amount = (Amount+StackAlign-1)/StackAlign*StackAlign;
+ Amount = alignTo(Amount, getStackAlign());
MachineInstr *New = nullptr;
if (Old.getOpcode() == TII.getCallFrameSetupOpcode()) {
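(Illustration, not part of the patch: the eliminateCallFramePseudoInstr hunk above swaps the manual round-up (Amount + StackAlign - 1) / StackAlign * StackAlign for alignTo(Amount, getStackAlign()). A minimal standalone check of that identity, assuming a power-of-two alignment as llvm::Align guarantees; the helper names are ad hoc.)

#include <cassert>
#include <cstdint>

// Old-style round-up: valid for any positive alignment A.
static uint64_t roundUpDiv(uint64_t Amount, uint64_t A) {
  return (Amount + A - 1) / A * A;
}

// What alignTo(Amount, Align(A)) computes when A is a power of two.
static uint64_t roundUpMask(uint64_t Amount, uint64_t A) {
  return (Amount + A - 1) & ~(A - 1);
}

int main() {
  for (uint64_t A : {1u, 2u, 4u, 8u, 16u})
    for (uint64_t Amount = 0; Amount < 64; ++Amount)
      assert(roundUpDiv(Amount, A) == roundUpMask(Amount, A));
  return 0;
}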
diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/llvm/lib/Target/MSP430/MSP430FrameLowering.h
index 70e284053021..f6995edf4b0a 100644
--- a/llvm/lib/Target/MSP430/MSP430FrameLowering.h
+++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.h
@@ -36,12 +36,13 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
index 8550230155c8..7dabb9b4abae 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelDAGToDAG.cpp
@@ -50,7 +50,7 @@ namespace {
const BlockAddress *BlockAddr = nullptr;
const char *ES = nullptr;
int JT = -1;
- unsigned Align = 0; // CP alignment.
+ Align Alignment; // CP alignment.
MSP430ISelAddressMode() = default;
@@ -74,12 +74,12 @@ namespace {
} else if (CP) {
errs() << " CP ";
CP->dump();
- errs() << " Align" << Align << '\n';
+ errs() << " Align" << Alignment.value() << '\n';
} else if (ES) {
errs() << "ES ";
errs() << ES << '\n';
} else if (JT != -1)
- errs() << " JT" << JT << " Align" << Align << '\n';
+ errs() << " JT" << JT << " Align" << Alignment.value() << '\n';
}
#endif
};
@@ -146,7 +146,7 @@ bool MSP430DAGToDAGISel::MatchWrapper(SDValue N, MSP430ISelAddressMode &AM) {
//AM.SymbolFlags = G->getTargetFlags();
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
+ AM.Alignment = CP->getAlign();
AM.Disp += CP->getOffset();
//AM.SymbolFlags = CP->getTargetFlags();
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
@@ -263,8 +263,8 @@ bool MSP430DAGToDAGISel::SelectAddr(SDValue N,
MVT::i16, AM.Disp,
0/*AM.SymbolFlags*/);
else if (AM.CP)
- Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16,
- AM.Align, AM.Disp, 0/*AM.SymbolFlags*/);
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i16, AM.Alignment, AM.Disp,
+ 0 /*AM.SymbolFlags*/);
else if (AM.ES)
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i16, 0/*AM.SymbolFlags*/);
else if (AM.JT != -1)
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
index 37e6ea24d088..821339f50355 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.cpp
@@ -514,7 +514,7 @@ static void AnalyzeArguments(CCState &State,
// Handle byval arguments
if (ArgFlags.isByVal()) {
- State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, 2, ArgFlags);
+ State.HandleByVal(ValNo++, ArgVT, LocVT, LocInfo, 2, Align(2), ArgFlags);
continue;
}
@@ -863,13 +863,11 @@ SDValue MSP430TargetLowering::LowerCCCCallTo(
if (Flags.isByVal()) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i16);
- MemOp = DAG.getMemcpy(Chain, dl, PtrOff, Arg, SizeNode,
- Flags.getByValAlign(),
- /*isVolatile*/false,
- /*AlwaysInline=*/true,
- /*isTailCall=*/false,
- MachinePointerInfo(),
- MachinePointerInfo());
+ MemOp = DAG.getMemcpy(
+ Chain, dl, PtrOff, Arg, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile*/ false,
+ /*AlwaysInline=*/true,
+ /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo());
} else {
MemOp = DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
}
@@ -1302,7 +1300,7 @@ SDValue MSP430TargetLowering::LowerFRAMEADDR(SDValue Op,
SDLoc dl(Op); // FIXME probably not meaningful
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
SDValue FrameAddr = DAG.getCopyFromReg(DAG.getEntryNode(), dl,
- MSP430::FP, VT);
+ MSP430::R4, VT);
while (Depth--)
FrameAddr = DAG.getLoad(VT, dl, DAG.getEntryNode(), FrameAddr,
MachinePointerInfo());
diff --git a/llvm/lib/Target/MSP430/MSP430ISelLowering.h b/llvm/lib/Target/MSP430/MSP430ISelLowering.h
index 650f9a704062..f23042a369fd 100644
--- a/llvm/lib/Target/MSP430/MSP430ISelLowering.h
+++ b/llvm/lib/Target/MSP430/MSP430ISelLowering.h
@@ -79,6 +79,10 @@ namespace llvm {
return MVT::i8;
}
+ MVT::SimpleValueType getCmpLibcallReturnType() const override {
+ return MVT::i16;
+ }
+
/// LowerOperation - Provide custom lowering hooks for some operations.
SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
index 9e03334d6b62..130211878be1 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.cpp
@@ -35,7 +35,7 @@ MSP430InstrInfo::MSP430InstrInfo(MSP430Subtarget &STI)
void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIdx,
+ Register SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -46,7 +46,7 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdx),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MFI.getObjectAlign(FrameIdx));
if (RC == &MSP430::GR16RegClass)
BuildMI(MBB, MI, DL, get(MSP430::MOV16mr))
@@ -62,7 +62,7 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIdx,
+ Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const{
DebugLoc DL;
@@ -73,7 +73,7 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdx),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MFI.getObjectAlign(FrameIdx));
if (RC == &MSP430::GR16RegClass)
BuildMI(MBB, MI, DL, get(MSP430::MOV16rm))
@@ -160,18 +160,6 @@ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-bool MSP430InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
- if (!MI.isTerminator())
- return false;
-
- // Conditional branch is a special case.
- if (MI.isBranch() && !MI.isBarrier())
- return true;
- if (!MI.isPredicable())
- return true;
- return !isPredicated(MI);
-}
-
bool MSP430InstrInfo::analyzeBranch(MachineBasicBlock &MBB,
MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
diff --git a/llvm/lib/Target/MSP430/MSP430InstrInfo.h b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
index e3838772c061..710913b2d36f 100644
--- a/llvm/lib/Target/MSP430/MSP430InstrInfo.h
+++ b/llvm/lib/Target/MSP430/MSP430InstrInfo.h
@@ -41,13 +41,13 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIdx,
+ Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -56,7 +56,6 @@ public:
// Branch folding goodness
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
MachineBasicBlock *&FBB,
SmallVectorImpl<MachineOperand> &Cond,
diff --git a/llvm/lib/Target/MSP430/MSP430MCInstLower.h b/llvm/lib/Target/MSP430/MSP430MCInstLower.h
index 910ad4bb12d5..4d0197d9e2b1 100644
--- a/llvm/lib/Target/MSP430/MSP430MCInstLower.h
+++ b/llvm/lib/Target/MSP430/MSP430MCInstLower.h
@@ -18,7 +18,6 @@ namespace llvm {
class MCOperand;
class MCSymbol;
class MachineInstr;
- class MachineModuleInfoMachO;
class MachineOperand;
/// MSP430MCInstLower - This class is used to lower an MachineInstr
diff --git a/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h b/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
index 712519cfe38a..261db9e288f5 100644
--- a/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
+++ b/llvm/lib/Target/MSP430/MSP430MachineFunctionInfo.h
@@ -35,7 +35,7 @@ class MSP430MachineFunctionInfo : public MachineFunctionInfo {
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg = 0;
+ Register SRetReturnReg;
public:
MSP430MachineFunctionInfo() = default;
@@ -46,8 +46,8 @@ public:
unsigned getCalleeSavedFrameSize() const { return CalleeSavedFrameSize; }
void setCalleeSavedFrameSize(unsigned bytes) { CalleeSavedFrameSize = bytes; }
- unsigned getSRetReturnReg() const { return SRetReturnReg; }
- void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+ Register getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; }
int getRAIndex() const { return ReturnAddrIndex; }
void setRAIndex(int Index) { ReturnAddrIndex = Index; }
diff --git a/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp b/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
index bec357a1548d..5583ebee2f31 100644
--- a/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
+++ b/llvm/lib/Target/MSP430/MSP430RegisterInfo.cpp
@@ -39,7 +39,7 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const MSP430FrameLowering *TFI = getFrameLowering(*MF);
const Function* F = &MF->getFunction();
static const MCPhysReg CalleeSavedRegs[] = {
- MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
+ MSP430::R4, MSP430::R5, MSP430::R6, MSP430::R7,
MSP430::R8, MSP430::R9, MSP430::R10,
0
};
@@ -49,7 +49,7 @@ MSP430RegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
0
};
static const MCPhysReg CalleeSavedRegsIntr[] = {
- MSP430::FP, MSP430::R5, MSP430::R6, MSP430::R7,
+ MSP430::R4, MSP430::R5, MSP430::R6, MSP430::R7,
MSP430::R8, MSP430::R9, MSP430::R10, MSP430::R11,
MSP430::R12, MSP430::R13, MSP430::R14, MSP430::R15,
0
@@ -86,8 +86,8 @@ BitVector MSP430RegisterInfo::getReservedRegs(const MachineFunction &MF) const {
// Mark frame pointer as reserved if needed.
if (TFI->hasFP(MF)) {
- Reserved.set(MSP430::FPB);
- Reserved.set(MSP430::FP);
+ Reserved.set(MSP430::R4B);
+ Reserved.set(MSP430::R4);
}
return Reserved;
@@ -112,7 +112,7 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
DebugLoc dl = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::FP : MSP430::SP);
+ unsigned BasePtr = (TFI->hasFP(MF) ? MSP430::R4 : MSP430::SP);
int Offset = MF.getFrameInfo().getObjectOffset(FrameIndex);
// Skip the saved PC
@@ -156,5 +156,5 @@ MSP430RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
Register MSP430RegisterInfo::getFrameRegister(const MachineFunction &MF) const {
const MSP430FrameLowering *TFI = getFrameLowering(MF);
- return TFI->hasFP(MF) ? MSP430::FP : MSP430::SP;
+ return TFI->hasFP(MF) ? MSP430::R4 : MSP430::SP;
}
diff --git a/llvm/lib/Target/MSP430/MSP430RegisterInfo.td b/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
index 11003dba383f..61cc72d494b5 100644
--- a/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
+++ b/llvm/lib/Target/MSP430/MSP430RegisterInfo.td
@@ -15,6 +15,7 @@ class MSP430Reg<bits<4> num, string n, list<string> alt = []> : Register<n> {
let Namespace = "MSP430";
let HWEncoding{3-0} = num;
let AltNames = alt;
+ let DwarfNumbers = [num];
}
class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs,
@@ -24,6 +25,7 @@ class MSP430RegWithSubregs<bits<4> num, string n, list<Register> subregs,
let Namespace = "MSP430";
let HWEncoding{3-0} = num;
let AltNames = alt;
+ let DwarfNumbers = [num];
}
//===----------------------------------------------------------------------===//
@@ -34,7 +36,7 @@ def PCB : MSP430Reg<0, "r0", ["pc"]>;
def SPB : MSP430Reg<1, "r1", ["sp"]>;
def SRB : MSP430Reg<2, "r2", ["sr"]>;
def CGB : MSP430Reg<3, "r3", ["cg"]>;
-def FPB : MSP430Reg<4, "r4", ["fp"]>;
+def R4B : MSP430Reg<4, "r4", ["fp"]>;
def R5B : MSP430Reg<5, "r5">;
def R6B : MSP430Reg<6, "r6">;
def R7B : MSP430Reg<7, "r7">;
@@ -54,7 +56,7 @@ def PC : MSP430RegWithSubregs<0, "r0", [PCB], ["pc"]>;
def SP : MSP430RegWithSubregs<1, "r1", [SPB], ["sp"]>;
def SR : MSP430RegWithSubregs<2, "r2", [SRB], ["sr"]>;
def CG : MSP430RegWithSubregs<3, "r3", [CGB], ["cg"]>;
-def FP : MSP430RegWithSubregs<4, "r4", [FPB], ["fp"]>;
+def R4 : MSP430RegWithSubregs<4, "r4", [R4B], ["fp"]>;
def R5 : MSP430RegWithSubregs<5, "r5", [R5B]>;
def R6 : MSP430RegWithSubregs<6, "r6", [R6B]>;
def R7 : MSP430RegWithSubregs<7, "r7", [R7B]>;
@@ -72,7 +74,7 @@ def GR8 : RegisterClass<"MSP430", [i8], 8,
// Volatile registers
(add R12B, R13B, R14B, R15B, R11B, R10B, R9B, R8B, R7B, R6B, R5B,
// Frame pointer, sometimes allocable
- FPB,
+ R4B,
// Volatile, but not allocable
PCB, SPB, SRB, CGB)>;
@@ -80,6 +82,6 @@ def GR16 : RegisterClass<"MSP430", [i16], 16,
// Volatile registers
(add R12, R13, R14, R15, R11, R10, R9, R8, R7, R6, R5,
// Frame pointer, sometimes allocable
- FP,
+ R4,
// Volatile, but not allocable
PC, SP, SR, CG)>;
diff --git a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
index 20168773cd53..1f3c1d34f76f 100644
--- a/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
+++ b/llvm/lib/Target/MSP430/MSP430Subtarget.cpp
@@ -43,7 +43,7 @@ MSP430Subtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
ExtendedInsts = false;
HWMultMode = NoHWMult;
- std::string CPUName = CPU;
+ StringRef CPUName = CPU;
if (CPUName.empty())
CPUName = "msp430";
diff --git a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
index 81851427c0ed..827f24daad16 100644
--- a/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
+++ b/llvm/lib/Target/MSP430/MSP430TargetMachine.cpp
@@ -47,7 +47,7 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, const Triple &TT,
Options, getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
- Subtarget(TT, CPU, FS, *this) {
+ Subtarget(TT, std::string(CPU), std::string(FS), *this) {
initAsmInfo();
}
diff --git a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
index e467ed36938b..9dbbdeb34dba 100644
--- a/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
+++ b/llvm/lib/Target/Mips/AsmParser/MipsAsmParser.cpp
@@ -179,6 +179,8 @@ class MipsAsmParser : public MCTargetAsmParser {
/// Parse a register as used in CFI directives
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool parseParenSuffix(StringRef Name, OperandVector &Operands);
@@ -296,6 +298,12 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
+ bool expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandRotation(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out, const MCSubtargetInfo *STI);
bool expandRotationImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
@@ -332,6 +340,12 @@ class MipsAsmParser : public MCTargetAsmParser {
bool expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
+ bool expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
+ bool expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI);
+
bool expandMXTRAlias(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
const MCSubtargetInfo *STI);
@@ -347,6 +361,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetArchDirective();
bool parseSetFeature(uint64_t Feature);
bool isPicAndNotNxxAbi(); // Used by .cpload, .cprestore, and .cpsetup.
+ bool parseDirectiveCpAdd(SMLoc Loc);
bool parseDirectiveCpLoad(SMLoc Loc);
bool parseDirectiveCpLocal(SMLoc Loc);
bool parseDirectiveCpRestore(SMLoc Loc);
@@ -366,6 +381,7 @@ class MipsAsmParser : public MCTargetAsmParser {
bool parseSetMsaDirective();
bool parseSetNoMsaDirective();
bool parseSetNoDspDirective();
+ bool parseSetNoMips3DDirective();
bool parseSetReorderDirective();
bool parseSetNoReorderDirective();
bool parseSetMips16Directive();
@@ -2126,10 +2142,10 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
MCSymbolRefExpr::create(JalSym, MCSymbolRefExpr::VK_None,
getContext(), IDLoc);
- TOut.getStreamer().EmitRelocDirective(*TmpExpr,
- inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
+ TOut.getStreamer().emitRelocDirective(
+ *TmpExpr, inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
RelocJalrExpr, IDLoc, *STI);
- TOut.getStreamer().EmitLabel(TmpLabel);
+ TOut.getStreamer().emitLabel(TmpLabel);
}
Inst = JalrInst;
@@ -2311,7 +2327,7 @@ bool MipsAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
tryExpandInstruction(Inst, IDLoc, Out, STI);
switch (ExpandResult) {
case MER_NotAMacro:
- Out.EmitInstruction(Inst, *STI);
+ Out.emitInstruction(Inst, *STI);
break;
case MER_Success:
break;
@@ -2512,6 +2528,14 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
case Mips::SGTImm64:
case Mips::SGTUImm64:
return expandSgtImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SLE:
+ case Mips::SLEU:
+ return expandSle(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SLEImm:
+ case Mips::SLEUImm:
+ case Mips::SLEImm64:
+ case Mips::SLEUImm64:
+ return expandSleImm(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SLTImm64:
if (isInt<16>(Inst.getOperand(2).getImm())) {
Inst.setOpcode(Mips::SLTi64);
@@ -2588,6 +2612,10 @@ MipsAsmParser::tryExpandInstruction(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return expandSeq(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::SEQIMacro:
return expandSeqI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SNEMacro:
+ return expandSne(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
+ case Mips::SNEIMacro:
+ return expandSneI(Inst, IDLoc, Out, STI) ? MER_Fail : MER_Success;
case Mips::MFTC0: case Mips::MTTC0:
case Mips::MFTGPR: case Mips::MTTGPR:
case Mips::MFTLO: case Mips::MTTLO:
@@ -2638,7 +2666,7 @@ bool MipsAsmParser::expandJalWithRegs(MCInst &Inst, SMLoc IDLoc,
const MCOperand SecondRegOp = Inst.getOperand(1);
JalrInst.addOperand(SecondRegOp);
}
- Out.EmitInstruction(JalrInst, *STI);
+ Out.emitInstruction(JalrInst, *STI);
// If .set reorder is active and branch instruction has a delay slot,
// emit a NOP after it.
@@ -3386,8 +3414,8 @@ bool MipsAsmParser::expandLoadSingleImmToFPR(MCInst &Inst, SMLoc IDLoc,
MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
getStreamer().SwitchSection(ReadOnlySection);
- getStreamer().EmitLabel(Sym, IDLoc);
- getStreamer().EmitIntValue(ImmOp32, 4);
+ getStreamer().emitLabel(Sym, IDLoc);
+ getStreamer().emitInt32(ImmOp32);
getStreamer().SwitchSection(CS);
if (emitPartialAddress(TOut, IDLoc, Sym))
@@ -3438,9 +3466,9 @@ bool MipsAsmParser::expandLoadDoubleImmToGPR(MCInst &Inst, SMLoc IDLoc,
MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
getStreamer().SwitchSection(ReadOnlySection);
- getStreamer().EmitLabel(Sym, IDLoc);
- getStreamer().EmitValueToAlignment(8);
- getStreamer().EmitIntValue(ImmOp64, 8);
+ getStreamer().emitLabel(Sym, IDLoc);
+ getStreamer().emitValueToAlignment(8);
+ getStreamer().emitIntValue(ImmOp64, 8);
getStreamer().SwitchSection(CS);
unsigned TmpReg = getATReg(IDLoc);
@@ -3521,9 +3549,9 @@ bool MipsAsmParser::expandLoadDoubleImmToFPR(MCInst &Inst, bool Is64FPU,
MipsMCExpr::create(MipsMCExpr::MEK_LO, LoSym, getContext());
getStreamer().SwitchSection(ReadOnlySection);
- getStreamer().EmitLabel(Sym, IDLoc);
- getStreamer().EmitValueToAlignment(8);
- getStreamer().EmitIntValue(ImmOp64, 8);
+ getStreamer().emitLabel(Sym, IDLoc);
+ getStreamer().emitValueToAlignment(8);
+ getStreamer().emitIntValue(ImmOp64, 8);
getStreamer().SwitchSection(CS);
if (emitPartialAddress(TOut, IDLoc, Sym))
@@ -3569,7 +3597,7 @@ bool MipsAsmParser::expandUncondBranchMMPseudo(MCInst &Inst, SMLoc IDLoc,
Inst.addOperand(MCOperand::createImm(Offset.getImm()));
}
}
- Out.EmitInstruction(Inst, *STI);
+ Out.emitInstruction(Inst, *STI);
// If .set reorder is active and branch instruction has a delay slot,
// emit a NOP after it.
@@ -3856,7 +3884,7 @@ bool MipsAsmParser::expandLoadStoreMultiple(MCInst &Inst, SMLoc IDLoc,
}
Inst.setOpcode(NewOpcode);
- Out.EmitInstruction(Inst, *STI);
+ Out.emitInstruction(Inst, *STI);
return false;
}
@@ -4258,7 +4286,7 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
if (!Signed) {
if (!UseTraps)
- TOut.getStreamer().EmitLabel(BrTarget);
+ TOut.getStreamer().emitLabel(BrTarget);
TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
@@ -4269,7 +4297,7 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return true;
if (!UseTraps)
- TOut.getStreamer().EmitLabel(BrTarget);
+ TOut.getStreamer().emitLabel(BrTarget);
TOut.emitRRI(Mips::ADDiu, ATReg, ZeroReg, -1, IDLoc, STI);
@@ -4297,7 +4325,7 @@ bool MipsAsmParser::expandDivRem(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitII(Mips::BREAK, 0x6, 0, IDLoc, STI);
}
- TOut.getStreamer().EmitLabel(BrTargetEnd);
+ TOut.getStreamer().emitLabel(BrTargetEnd);
TOut.emitR(isDiv ? Mips::MFLO : Mips::MFHI, RdReg, IDLoc, STI);
return false;
}
@@ -4636,6 +4664,88 @@ bool MipsAsmParser::expandSgtImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
+bool MipsAsmParser::expandSle(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned OpReg = Inst.getOperand(2).getReg();
+ unsigned OpCode;
+
+ warnIfNoMacro(IDLoc);
+
+ switch (Inst.getOpcode()) {
+ case Mips::SLE:
+ OpCode = Mips::SLT;
+ break;
+ case Mips::SLEU:
+ OpCode = Mips::SLTu;
+ break;
+ default:
+ llvm_unreachable("unexpected 'sle' opcode");
+ }
+
+ // $SrcReg <= $OpReg is equivalent to (not ($OpReg < $SrcReg))
+ TOut.emitRRR(OpCode, DstReg, OpReg, SrcReg, IDLoc, STI);
+ TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+
+ return false;
+}
+
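A minimal sketch of the resulting expansion, with arbitrary example registers:

    # sle  $2, $3, $4   expands to:
    slt   $2, $4, $3    # $2 = ($4 < $3)
    xori  $2, $2, 1     # $2 = !($4 < $3), i.e. ($3 <= $4)

    # sleu expands the same way but uses sltu for the unsigned comparison.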
+bool MipsAsmParser::expandSleImm(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+ unsigned OpRegCode;
+
+ warnIfNoMacro(IDLoc);
+
+ switch (Inst.getOpcode()) {
+ case Mips::SLEImm:
+ case Mips::SLEImm64:
+ OpRegCode = Mips::SLT;
+ break;
+ case Mips::SLEUImm:
+ case Mips::SLEUImm64:
+ OpRegCode = Mips::SLTu;
+ break;
+ default:
+ llvm_unreachable("unexpected 'sle' opcode with immediate");
+ }
+
+ // $SrcReg <= Imm is equivalent to (not (Imm < $SrcReg))
+ unsigned ImmReg = DstReg;
+ if (DstReg == SrcReg) {
+ unsigned ATReg = getATReg(Inst.getLoc());
+ if (!ATReg)
+ return true;
+ ImmReg = ATReg;
+ }
+
+ if (loadImmediate(ImmValue, ImmReg, Mips::NoRegister, isInt<32>(ImmValue),
+ false, IDLoc, Out, STI))
+ return true;
+
+ TOut.emitRRR(OpRegCode, DstReg, ImmReg, SrcReg, IDLoc, STI);
+ TOut.emitRRI(Mips::XORi, DstReg, DstReg, 1, IDLoc, STI);
+
+ return false;
+}
+
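A sketch of the immediate form, assuming a small constant that loadImmediate can materialize with a single addiu; when destination and source coincide, $at holds the constant instead:

    # sle  $2, $3, 100  expands to:
    addiu $2, $zero, 100
    slt   $2, $2, $3     # $2 = (100 < $3)
    xori  $2, $2, 1      # $2 = ($3 <= 100)

    # sle  $2, $2, 100  (dst == src) goes through $at:
    addiu $at, $zero, 100
    slt   $2, $at, $2
    xori  $2, $2, 1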
bool MipsAsmParser::expandAliasImmediate(MCInst &Inst, SMLoc IDLoc,
MCStreamer &Out,
const MCSubtargetInfo *STI) {
@@ -5099,7 +5209,7 @@ bool MipsAsmParser::expandMulO(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
- TOut.getStreamer().EmitLabel(BrTarget);
+ TOut.getStreamer().emitLabel(BrTarget);
}
TOut.emitR(Mips::MFLO, DstReg, IDLoc, STI);
@@ -5136,7 +5246,7 @@ bool MipsAsmParser::expandMulOU(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
TOut.emitNop(IDLoc, STI);
TOut.emitII(Mips::BREAK, 6, 0, IDLoc, STI);
- TOut.getStreamer().EmitLabel(BrTarget);
+ TOut.getStreamer().emitLabel(BrTarget);
}
return false;
@@ -5325,6 +5435,88 @@ bool MipsAsmParser::expandSeqI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
return false;
}
+bool MipsAsmParser::expandSne(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isReg() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ unsigned OpReg = Inst.getOperand(2).getReg();
+
+ warnIfNoMacro(IDLoc);
+
+ if (SrcReg != Mips::ZERO && OpReg != Mips::ZERO) {
+ TOut.emitRRR(Mips::XOR, DstReg, SrcReg, OpReg, IDLoc, STI);
+ TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI);
+ return false;
+ }
+
+ unsigned Reg = SrcReg == Mips::ZERO ? OpReg : SrcReg;
+ TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, Reg, IDLoc, STI);
+ return false;
+}
+
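An illustrative expansion with arbitrary registers; when one operand is $zero the xor is skipped:

    # sne  $2, $3, $4    expands to:
    xor   $2, $3, $4     # non-zero iff $3 != $4
    sltu  $2, $zero, $2  # $2 = ($2 != 0)

    # sne  $2, $3, $zero expands to:
    sltu  $2, $zero, $3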
+bool MipsAsmParser::expandSneI(MCInst &Inst, SMLoc IDLoc, MCStreamer &Out,
+ const MCSubtargetInfo *STI) {
+ MipsTargetStreamer &TOut = getTargetStreamer();
+
+ assert(Inst.getNumOperands() == 3 && "Invalid operand count");
+ assert(Inst.getOperand(0).isReg() &&
+ Inst.getOperand(1).isReg() &&
+ Inst.getOperand(2).isImm() && "Invalid instruction operand.");
+
+ unsigned DstReg = Inst.getOperand(0).getReg();
+ unsigned SrcReg = Inst.getOperand(1).getReg();
+ int64_t ImmValue = Inst.getOperand(2).getImm();
+
+ warnIfNoMacro(IDLoc);
+
+ if (ImmValue == 0) {
+ TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, SrcReg, IDLoc, STI);
+ return false;
+ }
+
+ if (SrcReg == Mips::ZERO) {
+ Warning(IDLoc, "comparison is always true");
+ if (loadImmediate(1, DstReg, Mips::NoRegister, true, false, IDLoc, Out,
+ STI))
+ return true;
+ return false;
+ }
+
+ unsigned Opc;
+ if (ImmValue > -0x8000 && ImmValue < 0) {
+ ImmValue = -ImmValue;
+ Opc = isGP64bit() ? Mips::DADDiu : Mips::ADDiu;
+ } else {
+ Opc = Mips::XORi;
+ }
+
+ if (isUInt<16>(ImmValue)) {
+ TOut.emitRRI(Opc, DstReg, SrcReg, ImmValue, IDLoc, STI);
+ TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI);
+ return false;
+ }
+
+ unsigned ATReg = getATReg(IDLoc);
+ if (!ATReg)
+ return true;
+
+ if (loadImmediate(ImmValue, ATReg, Mips::NoRegister, isInt<32>(ImmValue),
+ false, IDLoc, Out, STI))
+ return true;
+
+ TOut.emitRRR(Mips::XOR, DstReg, SrcReg, ATReg, IDLoc, STI);
+ TOut.emitRRR(Mips::SLTu, DstReg, Mips::ZERO, DstReg, IDLoc, STI);
+ return false;
+}
+
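A sketch of the main immediate cases; the exact loadImmediate sequence for the large constant (assumed here to be lui/ori) depends on the value:

    # sne $2, $3, 0        ->  sltu  $2, $zero, $3
    # sne $2, $3, 5        ->  xori  $2, $3, 5
    #                          sltu  $2, $zero, $2
    # sne $2, $3, -5       ->  addiu $2, $3, 5          # $3 - (-5)
    #                          sltu  $2, $zero, $2
    # sne $2, $3, 0x12345  ->  lui   $at, 0x1
    #                          ori   $at, $at, 0x2345
    #                          xor   $2, $3, $at
    #                          sltu  $2, $zero, $2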
// Map the DSP accumulator and control register to the corresponding gpr
// operand. Unlike the other alias, the m(f|t)t(lo|hi|acx) instructions
// do not map the DSP registers contiguously to gpr registers.
@@ -6202,6 +6394,12 @@ bool MipsAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) {
bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ return tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success;
+}
+
+OperandMatchResultTy MipsAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Operands;
OperandMatchResultTy ResTy = parseAnyRegister(Operands);
if (ResTy == MatchOperand_Success) {
@@ -6219,11 +6417,12 @@ bool MipsAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
RegNo = isGP64bit() ? Operand.getGPR64Reg() : Operand.getGPR32Reg();
}
- return (RegNo == (unsigned)-1);
+ return (RegNo == (unsigned)-1) ? MatchOperand_NoMatch
+ : MatchOperand_Success;
}
assert(Operands.size() == 0);
- return (RegNo == (unsigned)-1);
+ return (RegNo == (unsigned)-1) ? MatchOperand_NoMatch : MatchOperand_Success;
}
bool MipsAsmParser::parseMemOffset(const MCExpr *&Res, bool isParenExpr) {
@@ -6976,6 +7175,21 @@ bool MipsAsmParser::parseSetNoDspDirective() {
return false;
}
+bool MipsAsmParser::parseSetNoMips3DDirective() {
+ MCAsmParser &Parser = getParser();
+ Parser.Lex(); // Eat "nomips3d".
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+
+ clearFeatureBits(Mips::FeatureMips3D, "mips3d");
+ getTargetStreamer().emitDirectiveSetNoMips3D();
+ return false;
+}
+
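Together with the .set mips3d handling added further down, this lets assembly toggle the MIPS-3D ASE per region; a hedged sketch, assuming a target with paired-single FP support:

    .set    mips3d
    addr.ps $f2, $f4, $f6   # MIPS-3D instruction accepted here
    .set    nomips3d        # MIPS-3D instructions rejected again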
bool MipsAsmParser::parseSetMips16Directive() {
MCAsmParser &Parser = getParser();
Parser.Lex(); // Eat "mips16".
@@ -7308,6 +7522,10 @@ bool MipsAsmParser::parseSetFeature(uint64_t Feature) {
switch (Feature) {
default:
llvm_unreachable("Unimplemented feature");
+ case Mips::FeatureMips3D:
+ setFeatureBits(Mips::FeatureMips3D, "mips3d");
+ getTargetStreamer().emitDirectiveSetMips3D();
+ break;
case Mips::FeatureDSP:
setFeatureBits(Mips::FeatureDSP, "dsp");
getTargetStreamer().emitDirectiveSetDsp();
@@ -7415,6 +7633,31 @@ bool MipsAsmParser::isPicAndNotNxxAbi() {
return inPicMode() && !(isABI_N32() || isABI_N64());
}
+bool MipsAsmParser::parseDirectiveCpAdd(SMLoc Loc) {
+ SmallVector<std::unique_ptr<MCParsedAsmOperand>, 1> Reg;
+ OperandMatchResultTy ResTy = parseAnyRegister(Reg);
+ if (ResTy == MatchOperand_NoMatch || ResTy == MatchOperand_ParseFail) {
+ reportParseError("expected register");
+ return false;
+ }
+
+ MipsOperand &RegOpnd = static_cast<MipsOperand &>(*Reg[0]);
+ if (!RegOpnd.isGPRAsmReg()) {
+ reportParseError(RegOpnd.getStartLoc(), "invalid register");
+ return false;
+ }
+
+ // If this is not the end of the statement, report an error.
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ reportParseError("unexpected token, expected end of statement");
+ return false;
+ }
+ getParser().Lex(); // Consume the EndOfStatement.
+
+ getTargetStreamer().emitDirectiveCpAdd(RegOpnd.getGPR32Reg());
+ return false;
+}
+
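The new directive takes a single GPR operand; an illustrative use (register chosen arbitrarily):

    .cpadd  $4        # when assembling PIC code, adds $gp into $4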
bool MipsAsmParser::parseDirectiveCpLoad(SMLoc Loc) {
if (AssemblerOptions.back()->isReorder())
Warning(Loc, ".cpload should be inside a noreorder section");
@@ -7723,6 +7966,10 @@ bool MipsAsmParser::parseDirectiveSet() {
return parseSetFeature(Mips::FeatureDSPR2);
if (IdVal == "nodsp")
return parseSetNoDspDirective();
+ if (IdVal == "mips3d")
+ return parseSetFeature(Mips::FeatureMips3D);
+ if (IdVal == "nomips3d")
+ return parseSetNoMips3DDirective();
if (IdVal == "msa")
return parseSetMsaDirective();
if (IdVal == "nomsa")
@@ -7761,7 +8008,7 @@ bool MipsAsmParser::parseDirectiveGpWord() {
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitGPRel32Value(Value);
+ getParser().getStreamer().emitGPRel32Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
@@ -7779,7 +8026,7 @@ bool MipsAsmParser::parseDirectiveGpDWord() {
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitGPRel64Value(Value);
+ getParser().getStreamer().emitGPRel64Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
@@ -7797,7 +8044,7 @@ bool MipsAsmParser::parseDirectiveDtpRelWord() {
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitDTPRel32Value(Value);
+ getParser().getStreamer().emitDTPRel32Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
@@ -7815,7 +8062,7 @@ bool MipsAsmParser::parseDirectiveDtpRelDWord() {
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitDTPRel64Value(Value);
+ getParser().getStreamer().emitDTPRel64Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
@@ -7833,7 +8080,7 @@ bool MipsAsmParser::parseDirectiveTpRelWord() {
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitTPRel32Value(Value);
+ getParser().getStreamer().emitTPRel32Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
@@ -7851,7 +8098,7 @@ bool MipsAsmParser::parseDirectiveTpRelDWord() {
// method to evaluate the expression.
if (getParser().parseExpression(Value))
return true;
- getParser().getStreamer().EmitTPRel64Value(Value);
+ getParser().getStreamer().emitTPRel64Value(Value);
if (getLexer().isNot(AsmToken::EndOfStatement))
return Error(getLexer().getLoc(),
@@ -8323,6 +8570,10 @@ bool MipsAsmParser::ParseDirective(AsmToken DirectiveID) {
MCAsmParser &Parser = getParser();
StringRef IDVal = DirectiveID.getString();
+ if (IDVal == ".cpadd") {
+ parseDirectiveCpAdd(DirectiveID.getLoc());
+ return false;
+ }
if (IDVal == ".cpload") {
parseDirectiveCpLoad(DirectiveID.getLoc());
return false;
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
index fca1149453c9..c5a1a3e6286e 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIFlagsSection.cpp
@@ -57,17 +57,17 @@ namespace llvm {
MCStreamer &operator<<(MCStreamer &OS, MipsABIFlagsSection &ABIFlagsSection) {
// Write out a Elf_Internal_ABIFlags_v0 struct
- OS.EmitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
- OS.EmitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level
- OS.EmitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev
- OS.EmitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size
- OS.EmitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size
- OS.EmitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size
- OS.EmitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi
- OS.EmitIntValue(ABIFlagsSection.getISAExtensionValue(), 4); // isa_ext
- OS.EmitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases
- OS.EmitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1
- OS.EmitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
+ OS.emitIntValue(ABIFlagsSection.getVersionValue(), 2); // version
+ OS.emitIntValue(ABIFlagsSection.getISALevelValue(), 1); // isa_level
+ OS.emitIntValue(ABIFlagsSection.getISARevisionValue(), 1); // isa_rev
+ OS.emitIntValue(ABIFlagsSection.getGPRSizeValue(), 1); // gpr_size
+ OS.emitIntValue(ABIFlagsSection.getCPR1SizeValue(), 1); // cpr1_size
+ OS.emitIntValue(ABIFlagsSection.getCPR2SizeValue(), 1); // cpr2_size
+ OS.emitIntValue(ABIFlagsSection.getFpABIValue(), 1); // fp_abi
+ OS.emitIntValue(ABIFlagsSection.getISAExtensionValue(), 4); // isa_ext
+ OS.emitIntValue(ABIFlagsSection.getASESetValue(), 4); // ases
+ OS.emitIntValue(ABIFlagsSection.getFlags1Value(), 4); // flags1
+ OS.emitIntValue(ABIFlagsSection.getFlags2Value(), 4); // flags2
return OS;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
index 534e6573b63c..046cc686b311 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsABIInfo.h
@@ -18,7 +18,6 @@ namespace llvm {
template <typename T> class ArrayRef;
class MCTargetOptions;
class StringRef;
-class TargetRegisterClass;
class MipsABIInfo {
public:
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
index cca75dfc45c2..1126b871cb11 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.h
@@ -22,9 +22,7 @@ namespace llvm {
class MCAssembler;
struct MCFixupKindInfo;
-class MCObjectWriter;
class MCRegisterInfo;
-class MCSymbolELF;
class Target;
class MipsAsmBackend : public MCAsmBackend {
@@ -74,17 +72,6 @@ public:
return false;
}
- /// RelaxInstruction - Relax the instruction in the given fragment
- /// to the next wider instruction.
- ///
- /// \param Inst - The instruction to relax, which may be the same
- /// as the output.
- /// \param [out] Res On return, the relaxed instruction.
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
- /// @}
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
index cc3168790b98..9c317e3f8840 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFObjectWriter.cpp
@@ -234,14 +234,15 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
case Mips::fixup_Mips_32:
case FK_Data_4:
return IsPCRel ? ELF::R_MIPS_PC32 : ELF::R_MIPS_32;
+ case Mips::fixup_Mips_64:
+ case FK_Data_8:
+ return IsPCRel
+ ? setRTypes(ELF::R_MIPS_PC32, ELF::R_MIPS_64, ELF::R_MIPS_NONE)
+ : (unsigned)ELF::R_MIPS_64;
}
if (IsPCRel) {
switch (Kind) {
- case FK_Data_8:
- Ctx.reportError(Fixup.getLoc(),
- "MIPS does not support 64-bit PC-relative relocations");
- return ELF::R_MIPS_NONE;
case Mips::fixup_Mips_Branch_PCRel:
case Mips::fixup_Mips_PC16:
return ELF::R_MIPS_PC16;
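With the hunk above, a 64-bit PC-relative data fixup is no longer a hard error but is lowered to the composed R_MIPS_PC32 / R_MIPS_64 relocation built by setRTypes; a hedged example of assembly that now assembles, assuming foo is a symbol that needs a relocation:

    .quad  foo - .    # 64-bit PC-relative reference, previously rejected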
@@ -277,9 +278,6 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
}
switch (Kind) {
- case Mips::fixup_Mips_64:
- case FK_Data_8:
- return ELF::R_MIPS_64;
case FK_DTPRel_4:
return ELF::R_MIPS_TLS_DTPREL32;
case FK_DTPRel_8:
@@ -289,14 +287,9 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
case FK_TPRel_8:
return ELF::R_MIPS_TLS_TPREL64;
case FK_GPRel_4:
- if (is64Bit()) {
- unsigned Type = (unsigned)ELF::R_MIPS_NONE;
- Type = setRType((unsigned)ELF::R_MIPS_GPREL32, Type);
- Type = setRType2((unsigned)ELF::R_MIPS_64, Type);
- Type = setRType3((unsigned)ELF::R_MIPS_NONE, Type);
- return Type;
- }
- return ELF::R_MIPS_GPREL32;
+ return setRTypes(ELF::R_MIPS_GPREL32,
+ is64Bit() ? ELF::R_MIPS_64 : ELF::R_MIPS_NONE,
+ ELF::R_MIPS_NONE);
case Mips::fixup_Mips_GPREL16:
return ELF::R_MIPS_GPREL16;
case Mips::fixup_Mips_26:
@@ -329,34 +322,16 @@ unsigned MipsELFObjectWriter::getRelocType(MCContext &Ctx,
return ELF::R_MIPS_GOT_OFST;
case Mips::fixup_Mips_GOT_DISP:
return ELF::R_MIPS_GOT_DISP;
- case Mips::fixup_Mips_GPOFF_HI: {
- unsigned Type = (unsigned)ELF::R_MIPS_NONE;
- Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
- Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
- Type = setRType3((unsigned)ELF::R_MIPS_HI16, Type);
- return Type;
- }
- case Mips::fixup_MICROMIPS_GPOFF_HI: {
- unsigned Type = (unsigned)ELF::R_MIPS_NONE;
- Type = setRType((unsigned)ELF::R_MICROMIPS_GPREL16, Type);
- Type = setRType2((unsigned)ELF::R_MICROMIPS_SUB, Type);
- Type = setRType3((unsigned)ELF::R_MICROMIPS_HI16, Type);
- return Type;
- }
- case Mips::fixup_Mips_GPOFF_LO: {
- unsigned Type = (unsigned)ELF::R_MIPS_NONE;
- Type = setRType((unsigned)ELF::R_MIPS_GPREL16, Type);
- Type = setRType2((unsigned)ELF::R_MIPS_SUB, Type);
- Type = setRType3((unsigned)ELF::R_MIPS_LO16, Type);
- return Type;
- }
- case Mips::fixup_MICROMIPS_GPOFF_LO: {
- unsigned Type = (unsigned)ELF::R_MIPS_NONE;
- Type = setRType((unsigned)ELF::R_MICROMIPS_GPREL16, Type);
- Type = setRType2((unsigned)ELF::R_MICROMIPS_SUB, Type);
- Type = setRType3((unsigned)ELF::R_MICROMIPS_LO16, Type);
- return Type;
- }
+ case Mips::fixup_Mips_GPOFF_HI:
+ return setRTypes(ELF::R_MIPS_GPREL16, ELF::R_MIPS_SUB, ELF::R_MIPS_HI16);
+ case Mips::fixup_MICROMIPS_GPOFF_HI:
+ return setRTypes(ELF::R_MICROMIPS_GPREL16, ELF::R_MICROMIPS_SUB,
+ ELF::R_MICROMIPS_HI16);
+ case Mips::fixup_Mips_GPOFF_LO:
+ return setRTypes(ELF::R_MIPS_GPREL16, ELF::R_MIPS_SUB, ELF::R_MIPS_LO16);
+ case Mips::fixup_MICROMIPS_GPOFF_LO:
+ return setRTypes(ELF::R_MICROMIPS_GPREL16, ELF::R_MICROMIPS_SUB,
+ ELF::R_MICROMIPS_LO16);
case Mips::fixup_Mips_HIGHER:
return ELF::R_MIPS_HIGHER;
case Mips::fixup_Mips_HIGHEST:
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
index 1b83e9445fb5..e6e32ec7f27c 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.cpp
@@ -33,9 +33,9 @@ MipsELFStreamer::MipsELFStreamer(MCContext &Context,
std::unique_ptr<MipsRegInfoRecord>(RegInfoRecord));
}
-void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
+void MipsELFStreamer::emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) {
- MCELFStreamer::EmitInstruction(Inst, STI);
+ MCELFStreamer::emitInstruction(Inst, STI);
MCContext &Context = getContext();
const MCRegisterInfo *MCRegInfo = Context.getRegisterInfo();
@@ -53,20 +53,20 @@ void MipsELFStreamer::EmitInstruction(const MCInst &Inst,
createPendingLabelRelocs();
}
-void MipsELFStreamer::EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
+void MipsELFStreamer::emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) {
Frame.Begin = getContext().createTempSymbol();
- MCELFStreamer::EmitLabel(Frame.Begin);
+ MCELFStreamer::emitLabel(Frame.Begin);
}
-MCSymbol *MipsELFStreamer::EmitCFILabel() {
+MCSymbol *MipsELFStreamer::emitCFILabel() {
MCSymbol *Label = getContext().createTempSymbol("cfi", true);
- MCELFStreamer::EmitLabel(Label);
+ MCELFStreamer::emitLabel(Label);
return Label;
}
-void MipsELFStreamer::EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
+void MipsELFStreamer::emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) {
Frame.End = getContext().createTempSymbol();
- MCELFStreamer::EmitLabel(Frame.End);
+ MCELFStreamer::emitLabel(Frame.End);
}
void MipsELFStreamer::createPendingLabelRelocs() {
@@ -85,8 +85,8 @@ void MipsELFStreamer::createPendingLabelRelocs() {
Labels.clear();
}
-void MipsELFStreamer::EmitLabel(MCSymbol *Symbol, SMLoc Loc) {
- MCELFStreamer::EmitLabel(Symbol);
+void MipsELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ MCELFStreamer::emitLabel(Symbol);
Labels.push_back(Symbol);
}
@@ -96,14 +96,14 @@ void MipsELFStreamer::SwitchSection(MCSection *Section,
Labels.clear();
}
-void MipsELFStreamer::EmitValueImpl(const MCExpr *Value, unsigned Size,
+void MipsELFStreamer::emitValueImpl(const MCExpr *Value, unsigned Size,
SMLoc Loc) {
- MCELFStreamer::EmitValueImpl(Value, Size, Loc);
+ MCELFStreamer::emitValueImpl(Value, Size, Loc);
Labels.clear();
}
-void MipsELFStreamer::EmitIntValue(uint64_t Value, unsigned Size) {
- MCELFStreamer::EmitIntValue(Value, Size);
+void MipsELFStreamer::emitIntValue(uint64_t Value, unsigned Size) {
+ MCELFStreamer::emitIntValue(Value, Size);
Labels.clear();
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
index 2febfbc69b6f..f6a2c039c0c3 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsELFStreamer.h
@@ -41,12 +41,12 @@ public:
/// \p Inst is actually emitted. For example, we can inspect the operands and
/// gather sufficient information that allows us to reason about the register
/// usage for the translation unit.
- void EmitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+ void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
/// Overriding this function allows us to record all labels that should be
/// marked as microMIPS. Based on this data marking is done in
/// EmitInstruction.
- void EmitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
/// Overriding this function allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .section directive is processed.
@@ -56,14 +56,14 @@ public:
/// Overriding these functions allows us to dismiss all labels that are
/// candidates for marking as microMIPS when .word/.long/.4byte etc
/// directives are emitted.
- void EmitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
- void EmitIntValue(uint64_t Value, unsigned Size) override;
+ void emitValueImpl(const MCExpr *Value, unsigned Size, SMLoc Loc) override;
+ void emitIntValue(uint64_t Value, unsigned Size) override;
// Overriding these functions allows us to avoid recording of these labels
// in EmitLabel and later marking them as microMIPS.
- void EmitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
- void EmitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
- MCSymbol *EmitCFILabel() override;
+ void emitCFIStartProcImpl(MCDwarfFrameInfo &Frame) override;
+ void emitCFIEndProcImpl(MCDwarfFrameInfo &Frame) override;
+ MCSymbol *emitCFILabel() override;
/// Emits all the option records stored up until the point it's called.
void EmitMipsOptionRecords();
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
index 649ba20324bf..3700d6309e1a 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.cpp
@@ -109,7 +109,7 @@ void MipsInstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
// Try to print any aliases first.
- if (!printAliasInstr(MI, O) && !printAlias(*MI, O))
+ if (!printAliasInstr(MI, Address, O) && !printAlias(*MI, O))
printInstruction(MI, Address, O);
printAnnotation(O, Annot);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
index 0b1ee800e440..3f534a2f1843 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsInstPrinter.h
@@ -86,12 +86,17 @@ public:
void printInst(const MCInst *MI, uint64_t Address, StringRef Annot,
const MCSubtargetInfo &STI, raw_ostream &O) override;
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
private:
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ raw_ostream &O) {
+ printOperand(MI, OpNum, O);
+ }
template <unsigned Bits, unsigned Offset = 0>
void printUImm(const MCInst *MI, int opNum, raw_ostream &O);
void printMemOperand(const MCInst *MI, int opNum, raw_ostream &O);
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
index 5182205edaea..9c85a39bc348 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCAsmInfo.cpp
@@ -50,5 +50,4 @@ MipsMCAsmInfo::MipsMCAsmInfo(const Triple &TheTriple,
ExceptionsType = ExceptionHandling::DwarfCFI;
DwarfRegNumForCFI = true;
HasMipsExpressions = true;
- UseIntegratedAssembler = true;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
index 846f508005f5..9de34cc0e787 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp
@@ -129,7 +129,7 @@ void MipsMCCodeEmitter::EmitByte(unsigned char C, raw_ostream &OS) const {
OS << (char)C;
}
-void MipsMCCodeEmitter::EmitInstruction(uint64_t Val, unsigned Size,
+void MipsMCCodeEmitter::emitInstruction(uint64_t Val, unsigned Size,
const MCSubtargetInfo &STI,
raw_ostream &OS) const {
// Output the instruction encoding in little endian byte order.
@@ -137,8 +137,8 @@ void MipsMCCodeEmitter::EmitInstruction(uint64_t Val, unsigned Size,
// mips32r2: 4 | 3 | 2 | 1
// microMIPS: 2 | 1 | 4 | 3
if (IsLittleEndian && Size == 4 && isMicroMips(STI)) {
- EmitInstruction(Val >> 16, 2, STI, OS);
- EmitInstruction(Val, 2, STI, OS);
+ emitInstruction(Val >> 16, 2, STI, OS);
+ emitInstruction(Val, 2, STI, OS);
} else {
for (unsigned i = 0; i < Size; ++i) {
unsigned Shift = IsLittleEndian ? i * 8 : (Size - 1 - i) * 8;
@@ -226,7 +226,7 @@ encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (!Size)
llvm_unreachable("Desc.getSize() returns 0");
- EmitInstruction(Binary, Size, STI, OS);
+ emitInstruction(Binary, Size, STI, OS);
}
/// getBranchTargetOpValue - Return binary encoding of the branch
@@ -723,21 +723,8 @@ getExprOpValue(const MCExpr *Expr, SmallVectorImpl<MCFixup> &Fixups,
return 0;
}
- if (Kind == MCExpr::SymbolRef) {
- Mips::Fixups FixupKind = Mips::Fixups(0);
-
- switch(cast<MCSymbolRefExpr>(Expr)->getKind()) {
- default: llvm_unreachable("Unknown fixup kind!");
- break;
- case MCSymbolRefExpr::VK_None:
- // FIXME: This is ok for O32/N32 but not N64.
- FixupKind = Mips::fixup_Mips_32;
- break;
- } // switch
-
- Fixups.push_back(MCFixup::create(0, Expr, MCFixupKind(FixupKind)));
- return 0;
- }
+ if (Kind == MCExpr::SymbolRef)
+ Ctx.reportError(Expr->getLoc(), "expected an immediate");
return 0;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
index ff6e1d62b05f..16e94c723b34 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.h
@@ -44,7 +44,7 @@ public:
void EmitByte(unsigned char C, raw_ostream &OS) const;
- void EmitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
+ void emitInstruction(uint64_t Val, unsigned Size, const MCSubtargetInfo &STI,
raw_ostream &OS) const;
void encodeInstruction(const MCInst &MI, raw_ostream &OS,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
index 809be99ff3f4..b7ecb0fdca5e 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.h
@@ -29,8 +29,6 @@ class MCTargetOptions;
class StringRef;
class Target;
class Triple;
-class raw_ostream;
-class raw_pwrite_stream;
MCCodeEmitter *createMipsMCCodeEmitterEB(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
index 0544758f8a25..eade2d9bd745 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsNaClELFStreamer.cpp
@@ -105,7 +105,7 @@ private:
MaskInst.addOperand(MCOperand::createReg(AddrReg));
MaskInst.addOperand(MCOperand::createReg(AddrReg));
MaskInst.addOperand(MCOperand::createReg(MaskReg));
- MipsELFStreamer::EmitInstruction(MaskInst, STI);
+ MipsELFStreamer::emitInstruction(MaskInst, STI);
}
// Sandbox indirect branch or return instruction by inserting mask operation
@@ -113,10 +113,10 @@ private:
void sandboxIndirectJump(const MCInst &MI, const MCSubtargetInfo &STI) {
unsigned AddrReg = MI.getOperand(0).getReg();
- EmitBundleLock(false);
+ emitBundleLock(false);
emitMask(AddrReg, IndirectBranchMaskReg, STI);
- MipsELFStreamer::EmitInstruction(MI, STI);
- EmitBundleUnlock();
+ MipsELFStreamer::emitInstruction(MI, STI);
+ emitBundleUnlock();
}
// Sandbox memory access or SP change. Insert mask operation before and/or
@@ -124,26 +124,26 @@ private:
void sandboxLoadStoreStackChange(const MCInst &MI, unsigned AddrIdx,
const MCSubtargetInfo &STI, bool MaskBefore,
bool MaskAfter) {
- EmitBundleLock(false);
+ emitBundleLock(false);
if (MaskBefore) {
// Sandbox memory access.
unsigned BaseReg = MI.getOperand(AddrIdx).getReg();
emitMask(BaseReg, LoadStoreStackMaskReg, STI);
}
- MipsELFStreamer::EmitInstruction(MI, STI);
+ MipsELFStreamer::emitInstruction(MI, STI);
if (MaskAfter) {
// Sandbox SP change.
unsigned SPReg = MI.getOperand(0).getReg();
assert((Mips::SP == SPReg) && "Unexpected stack-pointer register.");
emitMask(SPReg, LoadStoreStackMaskReg, STI);
}
- EmitBundleUnlock();
+ emitBundleUnlock();
}
public:
/// This function is the one used to emit instruction data into the ELF
/// streamer. We override it to mask dangerous instructions.
- void EmitInstruction(const MCInst &Inst,
+ void emitInstruction(const MCInst &Inst,
const MCSubtargetInfo &STI) override {
// Sandbox indirect jumps.
if (isIndirectJump(Inst)) {
@@ -181,25 +181,25 @@ public:
report_fatal_error("Dangerous instruction in branch delay slot!");
// Start the sandboxing sequence by emitting call.
- EmitBundleLock(true);
+ emitBundleLock(true);
if (IsIndirectCall) {
unsigned TargetReg = Inst.getOperand(1).getReg();
emitMask(TargetReg, IndirectBranchMaskReg, STI);
}
- MipsELFStreamer::EmitInstruction(Inst, STI);
+ MipsELFStreamer::emitInstruction(Inst, STI);
PendingCall = true;
return;
}
if (PendingCall) {
// Finish the sandboxing sequence by emitting branch delay.
- MipsELFStreamer::EmitInstruction(Inst, STI);
- EmitBundleUnlock();
+ MipsELFStreamer::emitInstruction(Inst, STI);
+ emitBundleUnlock();
PendingCall = false;
return;
}
// None of the sandboxing applies, just emit the instruction.
- MipsELFStreamer::EmitInstruction(Inst, STI);
+ MipsELFStreamer::emitInstruction(Inst, STI);
}
};
@@ -270,7 +270,7 @@ MCELFStreamer *createMipsNaClELFStreamer(MCContext &Context,
S->getAssembler().setRelaxAll(true);
// Set bundle-alignment as required by the NaCl ABI for the target.
- S->EmitBundleAlignMode(Log2(MIPS_NACL_BUNDLE_ALIGN));
+ S->emitBundleAlignMode(Log2(MIPS_NACL_BUNDLE_ALIGN));
return S;
}
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
index bdfb70aa9813..a4a953bcd7c3 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsOptionRecord.cpp
@@ -40,17 +40,17 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
Sec->setAlignment(Align(8));
Streamer->SwitchSection(Sec);
- Streamer->EmitIntValue(ELF::ODK_REGINFO, 1); // kind
- Streamer->EmitIntValue(40, 1); // size
- Streamer->EmitIntValue(0, 2); // section
- Streamer->EmitIntValue(0, 4); // info
- Streamer->EmitIntValue(ri_gprmask, 4);
- Streamer->EmitIntValue(0, 4); // pad
- Streamer->EmitIntValue(ri_cprmask[0], 4);
- Streamer->EmitIntValue(ri_cprmask[1], 4);
- Streamer->EmitIntValue(ri_cprmask[2], 4);
- Streamer->EmitIntValue(ri_cprmask[3], 4);
- Streamer->EmitIntValue(ri_gp_value, 8);
+ Streamer->emitInt8(ELF::ODK_REGINFO); // kind
+ Streamer->emitInt8(40); // size
+ Streamer->emitInt16(0); // section
+ Streamer->emitInt32(0); // info
+ Streamer->emitInt32(ri_gprmask);
+ Streamer->emitInt32(0); // pad
+ Streamer->emitInt32(ri_cprmask[0]);
+ Streamer->emitInt32(ri_cprmask[1]);
+ Streamer->emitInt32(ri_cprmask[2]);
+ Streamer->emitInt32(ri_cprmask[3]);
+ Streamer->emitIntValue(ri_gp_value, 8);
} else {
MCSectionELF *Sec = Context.getELFSection(".reginfo", ELF::SHT_MIPS_REGINFO,
ELF::SHF_ALLOC, 24, "");
@@ -58,13 +58,13 @@ void MipsRegInfoRecord::EmitMipsOptionRecord() {
Sec->setAlignment(MTS->getABI().IsN32() ? Align(8) : Align(4));
Streamer->SwitchSection(Sec);
- Streamer->EmitIntValue(ri_gprmask, 4);
- Streamer->EmitIntValue(ri_cprmask[0], 4);
- Streamer->EmitIntValue(ri_cprmask[1], 4);
- Streamer->EmitIntValue(ri_cprmask[2], 4);
- Streamer->EmitIntValue(ri_cprmask[3], 4);
+ Streamer->emitInt32(ri_gprmask);
+ Streamer->emitInt32(ri_cprmask[0]);
+ Streamer->emitInt32(ri_cprmask[1]);
+ Streamer->emitInt32(ri_cprmask[2]);
+ Streamer->emitInt32(ri_cprmask[3]);
assert((ri_gp_value & 0xffffffff) == ri_gp_value);
- Streamer->EmitIntValue(ri_gp_value, 4);
+ Streamer->emitInt32(ri_gp_value);
}
Streamer->PopSection();
diff --git a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
index 054dc79f4aa9..6ec8fe805968 100644
--- a/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
+++ b/llvm/lib/Target/Mips/MCTargetDesc/MipsTargetStreamer.cpp
@@ -109,6 +109,9 @@ void MipsTargetStreamer::emitDirectiveSetHardFloat() {
void MipsTargetStreamer::emitDirectiveSetDsp() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetDspr2() { forbidModuleDirective(); }
void MipsTargetStreamer::emitDirectiveSetNoDsp() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetMips3D() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveSetNoMips3D() { forbidModuleDirective(); }
+void MipsTargetStreamer::emitDirectiveCpAdd(unsigned RegNo) {}
void MipsTargetStreamer::emitDirectiveCpLoad(unsigned RegNo) {}
void MipsTargetStreamer::emitDirectiveCpLocal(unsigned RegNo) {
// .cplocal $reg
@@ -169,7 +172,7 @@ void MipsTargetStreamer::emitR(unsigned Opcode, unsigned Reg0, SMLoc IDLoc,
TmpInst.setOpcode(Opcode);
TmpInst.addOperand(MCOperand::createReg(Reg0));
TmpInst.setLoc(IDLoc);
- getStreamer().EmitInstruction(TmpInst, *STI);
+ getStreamer().emitInstruction(TmpInst, *STI);
}
void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
@@ -179,7 +182,7 @@ void MipsTargetStreamer::emitRX(unsigned Opcode, unsigned Reg0, MCOperand Op1,
TmpInst.addOperand(MCOperand::createReg(Reg0));
TmpInst.addOperand(Op1);
TmpInst.setLoc(IDLoc);
- getStreamer().EmitInstruction(TmpInst, *STI);
+ getStreamer().emitInstruction(TmpInst, *STI);
}
void MipsTargetStreamer::emitRI(unsigned Opcode, unsigned Reg0, int32_t Imm,
@@ -199,7 +202,7 @@ void MipsTargetStreamer::emitII(unsigned Opcode, int16_t Imm1, int16_t Imm2,
TmpInst.addOperand(MCOperand::createImm(Imm1));
TmpInst.addOperand(MCOperand::createImm(Imm2));
TmpInst.setLoc(IDLoc);
- getStreamer().EmitInstruction(TmpInst, *STI);
+ getStreamer().emitInstruction(TmpInst, *STI);
}
void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
@@ -211,7 +214,7 @@ void MipsTargetStreamer::emitRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
TmpInst.addOperand(MCOperand::createReg(Reg1));
TmpInst.addOperand(Op2);
TmpInst.setLoc(IDLoc);
- getStreamer().EmitInstruction(TmpInst, *STI);
+ getStreamer().emitInstruction(TmpInst, *STI);
}
void MipsTargetStreamer::emitRRR(unsigned Opcode, unsigned Reg0, unsigned Reg1,
@@ -230,7 +233,7 @@ void MipsTargetStreamer::emitRRRX(unsigned Opcode, unsigned Reg0, unsigned Reg1,
TmpInst.addOperand(MCOperand::createReg(Reg2));
TmpInst.addOperand(Op3);
TmpInst.setLoc(IDLoc);
- getStreamer().EmitInstruction(TmpInst, *STI);
+ getStreamer().emitInstruction(TmpInst, *STI);
}
void MipsTargetStreamer::emitRRI(unsigned Opcode, unsigned Reg0, unsigned Reg1,
@@ -251,7 +254,7 @@ void MipsTargetStreamer::emitRRIII(unsigned Opcode, unsigned Reg0,
TmpInst.addOperand(MCOperand::createImm(Imm1));
TmpInst.addOperand(MCOperand::createImm(Imm2));
TmpInst.setLoc(IDLoc);
- getStreamer().EmitInstruction(TmpInst, *STI);
+ getStreamer().emitInstruction(TmpInst, *STI);
}
void MipsTargetStreamer::emitAddu(unsigned DstReg, unsigned SrcReg,
@@ -609,6 +612,16 @@ void MipsTargetAsmStreamer::emitDirectiveSetNoDsp() {
MipsTargetStreamer::emitDirectiveSetNoDsp();
}
+void MipsTargetAsmStreamer::emitDirectiveSetMips3D() {
+ OS << "\t.set\tmips3d\n";
+ MipsTargetStreamer::emitDirectiveSetMips3D();
+}
+
+void MipsTargetAsmStreamer::emitDirectiveSetNoMips3D() {
+ OS << "\t.set\tnomips3d\n";
+ MipsTargetStreamer::emitDirectiveSetNoMips3D();
+}
+
void MipsTargetAsmStreamer::emitDirectiveSetPop() {
OS << "\t.set\tpop\n";
MipsTargetStreamer::emitDirectiveSetPop();
@@ -650,6 +663,12 @@ void MipsTargetAsmStreamer::emitFMask(unsigned FPUBitmask,
OS << "," << FPUTopSavedRegOff << '\n';
}
+void MipsTargetAsmStreamer::emitDirectiveCpAdd(unsigned RegNo) {
+ OS << "\t.cpadd\t$"
+ << StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
+ forbidModuleDirective();
+}
+
void MipsTargetAsmStreamer::emitDirectiveCpLoad(unsigned RegNo) {
OS << "\t.cpload\t$"
<< StringRef(MipsInstPrinter::getRegisterName(RegNo)).lower() << "\n";
@@ -883,9 +902,9 @@ void MipsTargetELFStreamer::finish() {
if (Alignment) {
OS.SwitchSection(&Section);
if (Section.UseCodeAlign())
- OS.EmitCodeAlignment(Alignment, Alignment);
+ OS.emitCodeAlignment(Alignment, Alignment);
else
- OS.EmitValueToAlignment(Alignment, 0, 1, Alignment);
+ OS.emitValueToAlignment(Alignment, 0, 1, Alignment);
}
}
}
@@ -997,17 +1016,17 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
OS.SwitchSection(Sec);
- OS.EmitValueImpl(ExprRef, 4);
+ OS.emitValueImpl(ExprRef, 4);
- OS.EmitIntValue(GPRInfoSet ? GPRBitMask : 0, 4); // reg_mask
- OS.EmitIntValue(GPRInfoSet ? GPROffset : 0, 4); // reg_offset
+ OS.emitIntValue(GPRInfoSet ? GPRBitMask : 0, 4); // reg_mask
+ OS.emitIntValue(GPRInfoSet ? GPROffset : 0, 4); // reg_offset
- OS.EmitIntValue(FPRInfoSet ? FPRBitMask : 0, 4); // fpreg_mask
- OS.EmitIntValue(FPRInfoSet ? FPROffset : 0, 4); // fpreg_offset
+ OS.emitIntValue(FPRInfoSet ? FPRBitMask : 0, 4); // fpreg_mask
+ OS.emitIntValue(FPRInfoSet ? FPROffset : 0, 4); // fpreg_offset
- OS.EmitIntValue(FrameInfoSet ? FrameOffset : 0, 4); // frame_offset
- OS.EmitIntValue(FrameInfoSet ? FrameReg : 0, 4); // frame_reg
- OS.EmitIntValue(FrameInfoSet ? ReturnReg : 0, 4); // return_reg
+ OS.emitIntValue(FrameInfoSet ? FrameOffset : 0, 4); // frame_offset
+ OS.emitIntValue(FrameInfoSet ? FrameReg : 0, 4); // frame_reg
+ OS.emitIntValue(FrameInfoSet ? ReturnReg : 0, 4); // return_reg
// The .end directive marks the end of a procedure. Invalidate
// the information gathered up until this point.
@@ -1017,7 +1036,7 @@ void MipsTargetELFStreamer::emitDirectiveEnd(StringRef Name) {
// .end also implicitly sets the size.
MCSymbol *CurPCSym = Context.createTempSymbol();
- OS.EmitLabel(CurPCSym);
+ OS.emitLabel(CurPCSym);
const MCExpr *Size = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(CurPCSym, MCSymbolRefExpr::VK_None, Context),
ExprRef, Context);
@@ -1108,6 +1127,17 @@ void MipsTargetELFStreamer::emitFMask(unsigned FPUBitmask,
FPROffset = FPUTopSavedRegOff;
}
+void MipsTargetELFStreamer::emitDirectiveCpAdd(unsigned RegNo) {
+ // .cpadd $reg
+ // This directive inserts code to add $gp to the argument's register
+ // when support for position independent code is enabled.
+ if (!Pic)
+ return;
+
+ emitAddu(RegNo, RegNo, GPReg, getABI().IsN64(), &STI);
+ forbidModuleDirective();
+}
+
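In object emission the directive lowers straight to an add of $gp; a sketch of the effective expansion (register chosen arbitrarily):

    # .cpadd $4 under PIC becomes:
    addu  $4, $4, $gp    # daddu when targeting the N64 ABI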
void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
// .cpload $reg
// This directive expands to:
@@ -1139,7 +1169,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
MCA.getContext()),
MCA.getContext());
TmpInst.addOperand(MCOperand::createExpr(HiSym));
- getStreamer().EmitInstruction(TmpInst, STI);
+ getStreamer().emitInstruction(TmpInst, STI);
TmpInst.clear();
@@ -1152,7 +1182,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
MCA.getContext()),
MCA.getContext());
TmpInst.addOperand(MCOperand::createExpr(LoSym));
- getStreamer().EmitInstruction(TmpInst, STI);
+ getStreamer().emitInstruction(TmpInst, STI);
TmpInst.clear();
@@ -1160,7 +1190,7 @@ void MipsTargetELFStreamer::emitDirectiveCpLoad(unsigned RegNo) {
TmpInst.addOperand(MCOperand::createReg(GPReg));
TmpInst.addOperand(MCOperand::createReg(GPReg));
TmpInst.addOperand(MCOperand::createReg(RegNo));
- getStreamer().EmitInstruction(TmpInst, STI);
+ getStreamer().emitInstruction(TmpInst, STI);
forbidModuleDirective();
}
@@ -1269,7 +1299,7 @@ void MipsTargetELFStreamer::emitDirectiveCpreturn(unsigned SaveLocation,
Inst.addOperand(MCOperand::createReg(Mips::SP));
Inst.addOperand(MCOperand::createImm(SaveLocation));
}
- getStreamer().EmitInstruction(Inst, STI);
+ getStreamer().emitInstruction(Inst, STI);
forbidModuleDirective();
}
diff --git a/llvm/lib/Target/Mips/MicroMipsInstrFormats.td b/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
index e9fb9b310e3b..101d080f9567 100644
--- a/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
+++ b/llvm/lib/Target/Mips/MicroMipsInstrFormats.td
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This files descributes the formats of the microMIPS instruction set.
+// This file describes the formats of the microMIPS instruction set.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MicroMipsInstrInfo.td b/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
index b707f1b96184..269ad8b548a4 100644
--- a/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MicroMipsInstrInfo.td
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This files describes the defintions of the microMIPSr3 instructions.
+// This file describes the definitions of the microMIPSr3 instructions.
//
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
index db93b3d80ede..55d3c59cbf03 100644
--- a/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
+++ b/llvm/lib/Target/Mips/MicroMipsSizeReduction.cpp
@@ -376,12 +376,12 @@ static bool CheckXWPInstr(MachineInstr *MI, bool ReduceToLwp,
// Returns true if the registers Reg1 and Reg2 are consecutive
static bool ConsecutiveRegisters(unsigned Reg1, unsigned Reg2) {
- static SmallVector<unsigned, 31> Registers = {
- Mips::AT, Mips::V0, Mips::V1, Mips::A0, Mips::A1, Mips::A2, Mips::A3,
- Mips::T0, Mips::T1, Mips::T2, Mips::T3, Mips::T4, Mips::T5, Mips::T6,
- Mips::T7, Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
- Mips::S6, Mips::S7, Mips::T8, Mips::T9, Mips::K0, Mips::K1, Mips::GP,
- Mips::SP, Mips::FP, Mips::RA};
+ constexpr std::array<unsigned, 31> Registers = {
+ {Mips::AT, Mips::V0, Mips::V1, Mips::A0, Mips::A1, Mips::A2, Mips::A3,
+ Mips::T0, Mips::T1, Mips::T2, Mips::T3, Mips::T4, Mips::T5, Mips::T6,
+ Mips::T7, Mips::S0, Mips::S1, Mips::S2, Mips::S3, Mips::S4, Mips::S5,
+ Mips::S6, Mips::S7, Mips::T8, Mips::T9, Mips::K0, Mips::K1, Mips::GP,
+ Mips::SP, Mips::FP, Mips::RA}};
for (uint8_t i = 0; i < Registers.size() - 1; i++) {
if (Registers[i] == Reg1) {
diff --git a/llvm/lib/Target/Mips/Mips.td b/llvm/lib/Target/Mips/Mips.td
index b8a69815cc12..7fe750249c58 100644
--- a/llvm/lib/Target/Mips/Mips.td
+++ b/llvm/lib/Target/Mips/Mips.td
@@ -54,22 +54,6 @@ class AdditionalRequires<list<Predicate> preds> {
}
//===----------------------------------------------------------------------===//
-// Register File, Calling Conv, Instruction Descriptions
-//===----------------------------------------------------------------------===//
-
-include "MipsRegisterInfo.td"
-include "MipsSchedule.td"
-include "MipsInstrInfo.td"
-include "MipsCallingConv.td"
-include "MipsRegisterBanks.td"
-
-// Avoid forward declaration issues.
-include "MipsScheduleP5600.td"
-include "MipsScheduleGeneric.td"
-
-def MipsInstrInfo : InstrInfo;
-
-//===----------------------------------------------------------------------===//
// Mips Subtarget features //
//===----------------------------------------------------------------------===//
@@ -177,6 +161,8 @@ def FeatureDSPR3
: SubtargetFeature<"dspr3", "HasDSPR3", "true", "Mips DSP-R3 ASE",
[ FeatureDSP, FeatureDSPR2 ]>;
+def FeatureMips3D : SubtargetFeature<"mips3d", "Has3D", "true", "Mips 3D ASE">;
+
def FeatureMSA : SubtargetFeature<"msa", "HasMSA", "true", "Mips MSA ASE">;
def FeatureEVA : SubtargetFeature<"eva", "HasEVA", "true", "Mips EVA ASE">;
@@ -221,6 +207,23 @@ def FeatureUseIndirectJumpsHazard : SubtargetFeature<"use-indirect-jump-hazard",
"UseIndirectJumpsHazard",
"true", "Use indirect jump"
" guards to prevent certain speculation based attacks">;
+
+//===----------------------------------------------------------------------===//
+// Register File, Calling Conv, Instruction Descriptions
+//===----------------------------------------------------------------------===//
+
+include "MipsRegisterInfo.td"
+include "MipsSchedule.td"
+include "MipsInstrInfo.td"
+include "MipsCallingConv.td"
+include "MipsRegisterBanks.td"
+
+// Avoid forward declaration issues.
+include "MipsScheduleP5600.td"
+include "MipsScheduleGeneric.td"
+
+def MipsInstrInfo : InstrInfo;
+
//===----------------------------------------------------------------------===//
// Mips processors supported.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
index 5a2a916a6b7a..fefa1134b021 100644
--- a/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16FrameLowering.cpp
@@ -62,8 +62,8 @@ void Mips16FrameLowering::emitPrologue(MachineFunction &MF,
TII.makeFrame(Mips::SP, StackSize, MBB, MBBI);
// emit ".cfi_def_cfa_offset StackSize"
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
@@ -109,11 +109,9 @@ void Mips16FrameLowering::emitEpilogue(MachineFunction &MF,
TII.restoreFrame(Mips::SP, StackSize, MBB, MBBI);
}
-bool Mips16FrameLowering::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool Mips16FrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
//
@@ -137,10 +135,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool Mips16FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool Mips16FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
//
// Registers RA,S0,S1 are the callee saved registers and they will be restored
// with the restore instruction during emitEpilogue.
diff --git a/llvm/lib/Target/Mips/Mips16FrameLowering.h b/llvm/lib/Target/Mips/Mips16FrameLowering.h
index 6b62453f8dfe..7f0f1cd4ea37 100644
--- a/llvm/lib/Target/Mips/Mips16FrameLowering.h
+++ b/llvm/lib/Target/Mips/Mips16FrameLowering.h
@@ -27,13 +27,14 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/Mips/Mips16HardFloat.cpp b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
index e9a3c7ec4b19..cc1f72c03632 100644
--- a/llvm/lib/Target/Mips/Mips16HardFloat.cpp
+++ b/llvm/lib/Target/Mips/Mips16HardFloat.cpp
@@ -43,7 +43,7 @@ namespace {
} // end anonymous namespace
-static void EmitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
+static void emitInlineAsm(LLVMContext &C, BasicBlock *BB, StringRef AsmText) {
std::vector<Type *> AsmArgTypes;
std::vector<Value *> AsmArgs;
@@ -260,7 +260,7 @@ static void assureFPCallStub(Function &F, Module *M,
return;
LLVMContext &Context = M->getContext();
bool LE = TM.isLittleEndian();
- std::string Name = F.getName();
+ std::string Name(F.getName());
std::string SectionName = ".mips16.call.fp." + Name;
std::string StubName = "__call_stub_fp_" + Name;
//
@@ -339,7 +339,7 @@ static void assureFPCallStub(Function &F, Module *M,
AsmText += "jr $$18\n";
else
AsmText += "jr $$25\n";
- EmitInlineAsm(Context, BB, AsmText);
+ emitInlineAsm(Context, BB, AsmText);
new UnreachableInst(Context, BB);
}
@@ -448,7 +448,7 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
bool PicMode = TM.isPositionIndependent();
bool LE = TM.isLittleEndian();
LLVMContext &Context = M->getContext();
- std::string Name = F->getName();
+ std::string Name(F->getName());
std::string SectionName = ".mips16.fn." + Name;
std::string StubName = "__fn_stub_" + Name;
std::string LocalName = "$$__fn_local_" + Name;
@@ -475,7 +475,7 @@ static void createFPFnStub(Function *F, Module *M, FPParamVariant PV,
AsmText += swapFPIntParams(PV, M, LE, false);
AsmText += "jr $$25\n";
AsmText += LocalName + " = " + Name + "\n";
- EmitInlineAsm(Context, BB, AsmText);
+ emitInlineAsm(Context, BB, AsmText);
new UnreachableInst(FStub->getContext(), BB);
}
diff --git a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
index 768d54fc9c24..ddd28d095e51 100644
--- a/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/Mips16ISelDAGToDAG.cpp
@@ -72,7 +72,7 @@ void Mips16DAGToDAGISel::initGlobalBaseReg(MachineFunction &MF) {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
DebugLoc DL;
- Register V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg();
+ Register V0, V1, V2, GlobalBaseReg = MipsFI->getGlobalBaseReg(MF);
const TargetRegisterClass *RC = &Mips::CPU16RegsRegClass;
V0 = RegInfo.createVirtualRegister(RC);
diff --git a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
index 5425df77d9b8..a3b86bdc2ca0 100644
--- a/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
+++ b/llvm/lib/Target/Mips/Mips16ISelLowering.cpp
@@ -492,7 +492,7 @@ getOpndList(SmallVectorImpl<SDValue> &Ops,
ExternalSymbolSDNode *S = cast<ExternalSymbolSDNode>(JumpTarget);
JumpTarget = getAddrGlobal(S, CLI.DL, JumpTarget.getValueType(), DAG,
MipsII::MO_GOT, Chain,
- FuncInfo->callPtrInfo(S->getSymbol()));
+ FuncInfo->callPtrInfo(MF, S->getSymbol()));
} else
RegsToPass.push_front(std::make_pair((unsigned)Mips::T9, Callee));
}
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
index d2a1ba39cb0e..3403ec01aef2 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.cpp
@@ -105,7 +105,7 @@ Mips16InstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const {
@@ -123,7 +123,7 @@ void Mips16InstrInfo::storeRegToStack(MachineBasicBlock &MBB,
void Mips16InstrInfo::loadRegFromStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const {
@@ -182,7 +182,7 @@ unsigned Mips16InstrInfo::getOppositeBranchOpc(unsigned Opc) const {
}
static void addSaveRestoreRegs(MachineInstrBuilder &MIB,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
unsigned Flags = 0) {
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
// Add the callee-saved register as live-in. Do not add if the register is
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.h b/llvm/lib/Target/Mips/Mips16InstrInfo.h
index 2ff849cb2ca2..294afd6460f6 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.h
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.h
@@ -54,14 +54,14 @@ public:
void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const override;
void loadRegFromStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const override;
diff --git a/llvm/lib/Target/Mips/Mips16InstrInfo.td b/llvm/lib/Target/Mips/Mips16InstrInfo.td
index 19ea50c89b96..990202b23bc0 100644
--- a/llvm/lib/Target/Mips/Mips16InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips16InstrInfo.td
@@ -1642,7 +1642,7 @@ def : Mips16Pat<(select (i32 (setle CPU16Regs:$a, CPU16Regs:$b)),
CPU16Regs:$b, CPU16Regs:$a)>;
//
-// unnsigned
+// unsigned
// x = (a <= b)? x : y
//
// if (b < a) x = y
diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
index 5703f585a6a2..f6f43da9abf8 100644
--- a/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
+++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.cpp
@@ -53,12 +53,10 @@ bool Mips16RegisterInfo::useFPForScavengingIndex
return false;
}
-bool Mips16RegisterInfo::saveScavengerRegister
- (MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator &UseMI,
- const TargetRegisterClass *RC,
- unsigned Reg) const {
+bool Mips16RegisterInfo::saveScavengerRegister(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI, const TargetRegisterClass *RC,
+ Register Reg) const {
DebugLoc DL;
const TargetInstrInfo &TII = *MBB.getParent()->getSubtarget().getInstrInfo();
TII.copyPhysReg(MBB, I, DL, Mips::T0, Reg, true);
@@ -96,7 +94,7 @@ void Mips16RegisterInfo::eliminateFI(MachineBasicBlock::iterator II,
// 3. Locations for callee-saved registers.
// Everything else is referenced relative to whatever register
// getFrameRegister() returns.
- unsigned FrameReg;
+ Register FrameReg;
if (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)
FrameReg = Mips::SP;
diff --git a/llvm/lib/Target/Mips/Mips16RegisterInfo.h b/llvm/lib/Target/Mips/Mips16RegisterInfo.h
index fca78b43f96b..ff115b30162b 100644
--- a/llvm/lib/Target/Mips/Mips16RegisterInfo.h
+++ b/llvm/lib/Target/Mips/Mips16RegisterInfo.h
@@ -16,7 +16,6 @@
#include "MipsRegisterInfo.h"
namespace llvm {
-class Mips16InstrInfo;
class Mips16RegisterInfo : public MipsRegisterInfo {
public:
@@ -29,10 +28,10 @@ public:
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
bool saveScavengerRegister(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator I,
- MachineBasicBlock::iterator &UseMI,
- const TargetRegisterClass *RC,
- unsigned Reg) const override;
+ MachineBasicBlock::iterator I,
+ MachineBasicBlock::iterator &UseMI,
+ const TargetRegisterClass *RC,
+ Register Reg) const override;
const TargetRegisterClass *intRegClass(unsigned Size) const override;
diff --git a/llvm/lib/Target/Mips/Mips64InstrInfo.td b/llvm/lib/Target/Mips/Mips64InstrInfo.td
index 306289d56e4b..bd62a56d3008 100644
--- a/llvm/lib/Target/Mips/Mips64InstrInfo.td
+++ b/llvm/lib/Target/Mips/Mips64InstrInfo.td
@@ -1248,5 +1248,19 @@ def : MipsInstAlias<"sgtu $rs, $imm", (SGTUImm64 GPR64Opnd:$rs,
GPR64Opnd:$rs,
imm64:$imm), 0>, GPR_64;
+def SLEImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "sle\t$rd, $rs, $imm">, GPR_64;
+def : MipsInstAlias<"sle $rs, $imm", (SLEImm64 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ imm64:$imm), 0>, GPR_64;
+
+def SLEUImm64 : MipsAsmPseudoInst<(outs GPR64Opnd:$rd),
+ (ins GPR64Opnd:$rs, imm64:$imm),
+ "sleu\t$rd, $rs, $imm">, GPR_64;
+def : MipsInstAlias<"sleu $rs, $imm", (SLEUImm64 GPR64Opnd:$rs,
+ GPR64Opnd:$rs,
+ imm64:$imm), 0>, GPR_64;
+
def : MipsInstAlias<"rdhwr $rt, $rs",
(RDHWR64 GPR64Opnd:$rt, HWRegsOpnd:$rs, 0), 1>, GPR_64;
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
index 8f75336dce5a..cc073fbf5231 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.cpp
@@ -174,18 +174,18 @@ static void emitDirectiveRelocJalr(const MachineInstr &MI,
MCSymbolRefExpr::create(OffsetLabel, OutContext);
const MCExpr *CaleeExpr =
MCSymbolRefExpr::create(Callee, OutContext);
- OutStreamer.EmitRelocDirective
- (*OffsetExpr,
- Subtarget.inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
- CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo());
- OutStreamer.EmitLabel(OffsetLabel);
+ OutStreamer.emitRelocDirective(
+ *OffsetExpr,
+ Subtarget.inMicroMipsMode() ? "R_MICROMIPS_JALR" : "R_MIPS_JALR",
+ CaleeExpr, SMLoc(), *TM.getMCSubtargetInfo());
+ OutStreamer.emitLabel(OffsetLabel);
return;
}
}
}
}
-void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void MipsAsmPrinter::emitInstruction(const MachineInstr *MI) {
MipsTargetStreamer &TS = getTargetStreamer();
unsigned Opc = MI->getOpcode();
TS.forbidModuleDirective();
@@ -202,7 +202,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// If we just ended a constant pool, mark it as such.
if (InConstantPool && Opc != Mips::CONSTPOOL_ENTRY) {
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
InConstantPool = false;
}
if (Opc == Mips::CONSTPOOL_ENTRY) {
@@ -218,17 +218,17 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// If this is the first entry of the pool, mark it.
if (!InConstantPool) {
- OutStreamer->EmitDataRegion(MCDR_DataRegion);
+ OutStreamer->emitDataRegion(MCDR_DataRegion);
InConstantPool = true;
}
- OutStreamer->EmitLabel(GetCPISymbol(LabelId));
+ OutStreamer->emitLabel(GetCPISymbol(LabelId));
const MachineConstantPoolEntry &MCPE = MCP->getConstants()[CPIdx];
if (MCPE.isMachineConstantPoolEntry())
- EmitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
+ emitMachineConstantPoolValue(MCPE.Val.MachineCPVal);
else
- EmitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal);
+ emitGlobalConstant(MF->getDataLayout(), MCPE.Val.ConstVal);
return;
}
@@ -280,7 +280,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) {
//
if (I->isPseudo() && !Subtarget->inMips16Mode()
&& !isLongBranchPseudo(I->getOpcode()))
- llvm_unreachable("Pseudo opcode found in EmitInstruction()");
+ llvm_unreachable("Pseudo opcode found in emitInstruction()");
MCInst TmpInst0;
MCInstLowering.Lower(&*I, TmpInst0);
@@ -398,13 +398,13 @@ const char *MipsAsmPrinter::getCurrentABIString() const {
}
}
-void MipsAsmPrinter::EmitFunctionEntryLabel() {
+void MipsAsmPrinter::emitFunctionEntryLabel() {
MipsTargetStreamer &TS = getTargetStreamer();
// NaCl sandboxing requires that indirect call instructions are masked.
// This means that function entry points should be bundle-aligned.
if (Subtarget->isTargetNaCl())
- EmitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
+ emitAlignment(std::max(MF->getAlignment(), MIPS_NACL_BUNDLE_ALIGN));
if (Subtarget->inMicroMipsMode()) {
TS.emitDirectiveSetMicroMips();
@@ -419,12 +419,12 @@ void MipsAsmPrinter::EmitFunctionEntryLabel() {
TS.emitDirectiveSetNoMips16();
TS.emitDirectiveEnt(*CurrentFnSym);
- OutStreamer->EmitLabel(CurrentFnSym);
+ OutStreamer->emitLabel(CurrentFnSym);
}
/// EmitFunctionBodyStart - Targets can override this to emit stuff before
/// the first basic block in the function.
-void MipsAsmPrinter::EmitFunctionBodyStart() {
+void MipsAsmPrinter::emitFunctionBodyStart() {
MipsTargetStreamer &TS = getTargetStreamer();
MCInstLowering.Initialize(&MF->getContext());
@@ -445,7 +445,7 @@ void MipsAsmPrinter::EmitFunctionBodyStart() {
/// EmitFunctionBodyEnd - Targets can override this to emit stuff after
/// the last basic block in the function.
-void MipsAsmPrinter::EmitFunctionBodyEnd() {
+void MipsAsmPrinter::emitFunctionBodyEnd() {
MipsTargetStreamer &TS = getTargetStreamer();
// There are instructions for these macros, but they must
@@ -462,11 +462,11 @@ void MipsAsmPrinter::EmitFunctionBodyEnd() {
if (!InConstantPool)
return;
InConstantPool = false;
- OutStreamer->EmitDataRegion(MCDR_DataRegionEnd);
+ OutStreamer->emitDataRegion(MCDR_DataRegionEnd);
}
-void MipsAsmPrinter::EmitBasicBlockEnd(const MachineBasicBlock &MBB) {
- AsmPrinter::EmitBasicBlockEnd(MBB);
+void MipsAsmPrinter::emitBasicBlockEnd(const MachineBasicBlock &MBB) {
+ AsmPrinter::emitBasicBlockEnd(MBB);
MipsTargetStreamer &TS = getTargetStreamer();
if (MBB.empty())
TS.emitDirectiveInsn();
@@ -770,7 +770,7 @@ printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O) {
}
}
-void MipsAsmPrinter::EmitStartOfAsmFile(Module &M) {
+void MipsAsmPrinter::emitStartOfAsmFile(Module &M) {
MipsTargetStreamer &TS = getTargetStreamer();
// MipsTargetStreamer has an initialization order problem when emitting an
@@ -860,7 +860,7 @@ void MipsAsmPrinter::EmitJal(const MCSubtargetInfo &STI, MCSymbol *Symbol) {
I.setOpcode(Mips::JAL);
I.addOperand(
MCOperand::createExpr(MCSymbolRefExpr::create(Symbol, OutContext)));
- OutStreamer->EmitInstruction(I, STI);
+ OutStreamer->emitInstruction(I, STI);
}
void MipsAsmPrinter::EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode,
@@ -868,7 +868,7 @@ void MipsAsmPrinter::EmitInstrReg(const MCSubtargetInfo &STI, unsigned Opcode,
MCInst I;
I.setOpcode(Opcode);
I.addOperand(MCOperand::createReg(Reg));
- OutStreamer->EmitInstruction(I, STI);
+ OutStreamer->emitInstruction(I, STI);
}
void MipsAsmPrinter::EmitInstrRegReg(const MCSubtargetInfo &STI,
@@ -888,7 +888,7 @@ void MipsAsmPrinter::EmitInstrRegReg(const MCSubtargetInfo &STI,
I.setOpcode(Opcode);
I.addOperand(MCOperand::createReg(Reg1));
I.addOperand(MCOperand::createReg(Reg2));
- OutStreamer->EmitInstruction(I, STI);
+ OutStreamer->emitInstruction(I, STI);
}
void MipsAsmPrinter::EmitInstrRegRegReg(const MCSubtargetInfo &STI,
@@ -899,7 +899,7 @@ void MipsAsmPrinter::EmitInstrRegRegReg(const MCSubtargetInfo &STI,
I.addOperand(MCOperand::createReg(Reg1));
I.addOperand(MCOperand::createReg(Reg2));
I.addOperand(MCOperand::createReg(Reg3));
- OutStreamer->EmitInstruction(I, STI);
+ OutStreamer->emitInstruction(I, STI);
}
void MipsAsmPrinter::EmitMovFPIntPair(const MCSubtargetInfo &STI,
@@ -990,7 +990,7 @@ void MipsAsmPrinter::EmitFPCallStub(
//
// .global xxxx
//
- OutStreamer->EmitSymbolAttribute(MSymbol, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(MSymbol, MCSA_Global);
const char *RetType;
//
// make the comment field identifying the return and parameter
@@ -1054,7 +1054,7 @@ void MipsAsmPrinter::EmitFPCallStub(
//
// .align 2
//
- OutStreamer->EmitValueToAlignment(4);
+ OutStreamer->emitValueToAlignment(4);
MipsTargetStreamer &TS = getTargetStreamer();
//
// .set nomips16
@@ -1073,8 +1073,8 @@ void MipsAsmPrinter::EmitFPCallStub(
TS.emitDirectiveEnt(*Stub);
MCSymbol *MType =
OutContext.getOrCreateSymbol("__call_stub_fp_" + Twine(Symbol));
- OutStreamer->EmitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
- OutStreamer->EmitLabel(Stub);
+ OutStreamer->emitSymbolAttribute(MType, MCSA_ELF_TypeFunction);
+ OutStreamer->emitLabel(Stub);
// Only handle non-pic for now.
assert(!isPositionIndependent() &&
@@ -1113,7 +1113,7 @@ void MipsAsmPrinter::EmitFPCallStub(
EmitInstrReg(*STI, Mips::JR, Mips::S2);
MCSymbol *Tmp = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(Tmp);
+ OutStreamer->emitLabel(Tmp);
const MCSymbolRefExpr *E = MCSymbolRefExpr::create(Stub, OutContext);
const MCSymbolRefExpr *T = MCSymbolRefExpr::create(Tmp, OutContext);
const MCExpr *T_min_E = MCBinaryExpr::createSub(T, E, OutContext);
@@ -1122,7 +1122,7 @@ void MipsAsmPrinter::EmitFPCallStub(
OutStreamer->PopSection();
}
-void MipsAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void MipsAsmPrinter::emitEndOfAsmFile(Module &M) {
// Emit needed stubs
//
for (std::map<
@@ -1203,9 +1203,9 @@ void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
// LD RA, 8(SP)
// DADDIU SP, SP, 16
//
- OutStreamer->EmitCodeAlignment(4);
+ OutStreamer->emitCodeAlignment(4);
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Emit "B .tmpN" instruction, which jumps over the nop sled to the actual
@@ -1223,7 +1223,7 @@ void MipsAsmPrinter::EmitSled(const MachineInstr &MI, SledKind Kind) {
.addReg(Mips::ZERO)
.addImm(0));
- OutStreamer->EmitLabel(Target);
+ OutStreamer->emitLabel(Target);
if (!Subtarget->isGP64bit()) {
EmitToStreamer(*OutStreamer,
@@ -1255,15 +1255,15 @@ void MipsAsmPrinter::PrintDebugValueComment(const MachineInstr *MI,
// Emit .dtprelword or .dtpreldword directive
// and value for debug thread local expression.
-void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const {
+void MipsAsmPrinter::emitDebugValue(const MCExpr *Value, unsigned Size) const {
if (auto *MipsExpr = dyn_cast<MipsMCExpr>(Value)) {
if (MipsExpr && MipsExpr->getKind() == MipsMCExpr::MEK_DTPREL) {
switch (Size) {
case 4:
- OutStreamer->EmitDTPRel32Value(MipsExpr->getSubExpr());
+ OutStreamer->emitDTPRel32Value(MipsExpr->getSubExpr());
break;
case 8:
- OutStreamer->EmitDTPRel64Value(MipsExpr->getSubExpr());
+ OutStreamer->emitDTPRel64Value(MipsExpr->getSubExpr());
break;
default:
llvm_unreachable("Unexpected size of expression value.");
@@ -1271,7 +1271,7 @@ void MipsAsmPrinter::EmitDebugValue(const MCExpr *Value, unsigned Size) const {
return;
}
}
- AsmPrinter::EmitDebugValue(Value, Size);
+ AsmPrinter::emitDebugValue(Value, Size);
}
// Align all targets of indirect branches on bundle size. Used only if target
diff --git a/llvm/lib/Target/Mips/MipsAsmPrinter.h b/llvm/lib/Target/Mips/MipsAsmPrinter.h
index 173a1312812e..64424b181504 100644
--- a/llvm/lib/Target/Mips/MipsAsmPrinter.h
+++ b/llvm/lib/Target/Mips/MipsAsmPrinter.h
@@ -126,22 +126,22 @@ public:
bool runOnMachineFunction(MachineFunction &MF) override;
- void EmitConstantPool() override {
+ void emitConstantPool() override {
bool UsingConstantPools =
(Subtarget->inMips16Mode() && Subtarget->useConstantIslands());
if (!UsingConstantPools)
- AsmPrinter::EmitConstantPool();
+ AsmPrinter::emitConstantPool();
// we emit constant pools ourselves!
}
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
void printSavedRegsBitmask();
void emitFrameDirective();
const char *getCurrentABIString() const;
- void EmitFunctionEntryLabel() override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
- void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override;
+ void emitFunctionEntryLabel() override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
+ void emitBasicBlockEnd(const MachineBasicBlock &MBB) override;
bool isBlockOnlyReachableByFallthrough(
const MachineBasicBlock* MBB) const override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -154,10 +154,10 @@ public:
void printFCCOperand(const MachineInstr *MI, int opNum, raw_ostream &O,
const char *Modifier = nullptr);
void printRegisterList(const MachineInstr *MI, int opNum, raw_ostream &O);
- void EmitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
void PrintDebugValueComment(const MachineInstr *MI, raw_ostream &OS);
- void EmitDebugValue(const MCExpr *Value, unsigned Size) const override;
+ void emitDebugValue(const MCExpr *Value, unsigned Size) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
index 1523a6c020aa..aa8e298fa759 100644
--- a/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
+++ b/llvm/lib/Target/Mips/MipsBranchExpansion.cpp
@@ -342,16 +342,25 @@ void MipsBranchExpansion::replaceBranch(MachineBasicBlock &MBB, Iter Br,
for (unsigned I = 0, E = Br->getDesc().getNumOperands(); I < E; ++I) {
MachineOperand &MO = Br->getOperand(I);
- if (!MO.isReg()) {
- assert(MO.isMBB() && "MBB operand expected.");
+ switch (MO.getType()) {
+ case MachineOperand::MO_Register:
+ MIB.addReg(MO.getReg());
break;
+ case MachineOperand::MO_Immediate:
+ // The Octeon BBIT family of branches has an immediate operand
+ // (e.g. BBIT0 $v0, 3, %bb.1).
+ if (!TII->isBranchWithImm(Br->getOpcode()))
+ llvm_unreachable("Unexpected immediate in branch instruction");
+ MIB.addImm(MO.getImm());
+ break;
+ case MachineOperand::MO_MachineBasicBlock:
+ MIB.addMBB(MBBOpnd);
+ break;
+ default:
+ llvm_unreachable("Unexpected operand type in branch instruction");
}
-
- MIB.addReg(MO.getReg());
}
- MIB.addMBB(MBBOpnd);
-
if (Br->hasDelaySlot()) {
// Bundle the instruction in the delay slot to the newly created branch
// and erase the original branch.
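
The operand-type switch above is needed because the Octeon BBIT0/BBIT1 branches test a single bit of a register, so the rebuilt branch carries a bit-index immediate in addition to the register and the target block. As a purely illustrative, standalone sketch (not LLVM code), the condition such a branch encodes is just a shift-and-mask:

#include <cassert>
#include <cstdint>

// Illustrative only: the condition an Octeon BBIT0/BBIT1 branch evaluates.
// BBIT0 rs, p, target branches when bit p of rs is 0; BBIT1 when it is 1.
static bool bbitTaken(uint64_t Rs, unsigned BitPos, bool TestForOne) {
  bool Bit = (Rs >> BitPos) & 1;
  return TestForOne ? Bit : !Bit;
}

int main() {
  assert(bbitTaken(0x8, 3, /*TestForOne=*/true));  // like BBIT1 $v0, 3, %bb.1
  assert(bbitTaken(0x0, 3, /*TestForOne=*/false)); // like BBIT0 $v0, 3, %bb.1
  return 0;
}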
diff --git a/llvm/lib/Target/Mips/MipsCCState.cpp b/llvm/lib/Target/Mips/MipsCCState.cpp
index ef48c850a1b8..fe3fe82797c3 100644
--- a/llvm/lib/Target/Mips/MipsCCState.cpp
+++ b/llvm/lib/Target/Mips/MipsCCState.cpp
@@ -30,9 +30,9 @@ static bool isF128SoftLibCall(const char *CallSym) {
// Check that LibCalls is sorted alphabetically.
auto Comp = [](const char *S1, const char *S2) { return strcmp(S1, S2) < 0; };
- assert(std::is_sorted(std::begin(LibCalls), std::end(LibCalls), Comp));
- return std::binary_search(std::begin(LibCalls), std::end(LibCalls),
- CallSym, Comp);
+ assert(llvm::is_sorted(LibCalls, Comp));
+ return std::binary_search(std::begin(LibCalls), std::end(LibCalls), CallSym,
+ Comp);
}
/// This function returns true if Ty is fp128, {f128} or i128 which was
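
The hunk above replaces std::is_sorted with the range-based llvm::is_sorted but keeps the scheme: the f128 soft-float libcall names sit in an alphabetically sorted table, so membership is a std::binary_search with a strcmp comparator. A minimal self-contained sketch of that lookup pattern, with placeholder table entries rather than the real libcall list:

#include <algorithm>
#include <cassert>
#include <cstring>
#include <iterator>

static bool isInSortedTable(const char *Sym) {
  // Placeholder entries; the real table lists the f128 soft-float libcalls.
  static const char *const Table[] = {"__addtf3", "__divtf3", "__multf3"};
  auto Comp = [](const char *A, const char *B) { return std::strcmp(A, B) < 0; };
  assert(std::is_sorted(std::begin(Table), std::end(Table), Comp));
  return std::binary_search(std::begin(Table), std::end(Table), Sym, Comp);
}

int main() {
  assert(isInSortedTable("__divtf3"));
  assert(!isInSortedTable("__unknown"));
  return 0;
}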
diff --git a/llvm/lib/Target/Mips/MipsCallLowering.cpp b/llvm/lib/Target/Mips/MipsCallLowering.cpp
index 6ba15c232867..cffd99affac1 100644
--- a/llvm/lib/Target/Mips/MipsCallLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsCallLowering.cpp
@@ -110,10 +110,10 @@ private:
MIRBuilder.getMBB().addLiveIn(PhysReg);
}
- void buildLoad(Register Val, const CCValAssign &VA) {
+ MachineInstrBuilder buildLoad(const DstOp &Res, const CCValAssign &VA) {
MachineMemOperand *MMO;
Register Addr = getStackAddress(VA, MMO);
- MIRBuilder.buildLoad(Val, Addr, *MMO);
+ return MIRBuilder.buildLoad(Res, Addr, *MMO);
}
};
@@ -136,29 +136,19 @@ private:
void IncomingValueHandler::assignValueToReg(Register ValVReg,
const CCValAssign &VA,
const EVT &VT) {
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
Register PhysReg = VA.getLocReg();
if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
const MipsSubtarget &STI =
static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
-
- MIRBuilder
- .buildInstr(STI.isFP64bit() ? Mips::BuildPairF64_64
- : Mips::BuildPairF64)
- .addDef(ValVReg)
- .addUse(PhysReg + (STI.isLittle() ? 0 : 1))
- .addUse(PhysReg + (STI.isLittle() ? 1 : 0))
- .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
- *STI.getRegBankInfo());
+ bool IsEL = STI.isLittle();
+ LLT s32 = LLT::scalar(32);
+ auto Lo = MIRBuilder.buildCopy(s32, Register(PhysReg + (IsEL ? 0 : 1)));
+ auto Hi = MIRBuilder.buildCopy(s32, Register(PhysReg + (IsEL ? 1 : 0)));
+ MIRBuilder.buildMerge(ValVReg, {Lo, Hi});
markPhysRegUsed(PhysReg);
markPhysRegUsed(PhysReg + 1);
} else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- MIRBuilder.buildInstr(Mips::MTC1)
- .addDef(ValVReg)
- .addUse(PhysReg)
- .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
- *STI.getRegBankInfo());
+ MIRBuilder.buildCopy(ValVReg, PhysReg);
markPhysRegUsed(PhysReg);
} else {
switch (VA.getLocInfo()) {
@@ -189,13 +179,11 @@ Register IncomingValueHandler::getStackAddress(const CCValAssign &VA,
MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
const TargetFrameLowering *TFL = MF.getSubtarget().getFrameLowering();
- unsigned Align = MinAlign(TFL->getStackAlignment(), Offset);
- MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Align);
-
- Register AddrReg = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
+ Align Alignment = commonAlignment(TFL->getStackAlign(), Offset);
+ MMO =
+ MF.getMachineMemOperand(MPO, MachineMemOperand::MOLoad, Size, Alignment);
- return AddrReg;
+ return MIRBuilder.buildFrameIndex(LLT::pointer(0, 32), FI).getReg(0);
}
void IncomingValueHandler::assignValueToAddress(Register ValVReg,
@@ -203,9 +191,8 @@ void IncomingValueHandler::assignValueToAddress(Register ValVReg,
if (VA.getLocInfo() == CCValAssign::SExt ||
VA.getLocInfo() == CCValAssign::ZExt ||
VA.getLocInfo() == CCValAssign::AExt) {
- Register LoadReg = MRI.createGenericVirtualRegister(LLT::scalar(32));
- buildLoad(LoadReg, VA);
- MIRBuilder.buildTrunc(ValVReg, LoadReg);
+ auto Load = buildLoad(LLT::scalar(32), VA);
+ MIRBuilder.buildTrunc(ValVReg, Load);
} else
buildLoad(ValVReg, VA);
}
@@ -251,32 +238,15 @@ void OutgoingValueHandler::assignValueToReg(Register ValVReg,
const CCValAssign &VA,
const EVT &VT) {
Register PhysReg = VA.getLocReg();
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
-
if (VT == MVT::f64 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- MIRBuilder
- .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64
- : Mips::ExtractElementF64)
- .addDef(PhysReg + (STI.isLittle() ? 1 : 0))
- .addUse(ValVReg)
- .addImm(1)
- .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
- *STI.getRegBankInfo());
- MIRBuilder
- .buildInstr(STI.isFP64bit() ? Mips::ExtractElementF64_64
- : Mips::ExtractElementF64)
- .addDef(PhysReg + (STI.isLittle() ? 0 : 1))
- .addUse(ValVReg)
- .addImm(0)
- .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
- *STI.getRegBankInfo());
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+ bool IsEL = STI.isLittle();
+ auto Unmerge = MIRBuilder.buildUnmerge(LLT::scalar(32), ValVReg);
+ MIRBuilder.buildCopy(Register(PhysReg + (IsEL ? 0 : 1)), Unmerge.getReg(0));
+ MIRBuilder.buildCopy(Register(PhysReg + (IsEL ? 1 : 0)), Unmerge.getReg(1));
} else if (VT == MVT::f32 && PhysReg >= Mips::A0 && PhysReg <= Mips::A3) {
- MIRBuilder.buildInstr(Mips::MFC1)
- .addDef(PhysReg)
- .addUse(ValVReg)
- .constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
- *STI.getRegBankInfo());
+ MIRBuilder.buildCopy(PhysReg, ValVReg);
} else {
Register ExtReg = extendRegister(ValVReg, VA);
MIRBuilder.buildCopy(PhysReg, ExtReg);
@@ -291,23 +261,21 @@ Register OutgoingValueHandler::getStackAddress(const CCValAssign &VA,
LLT p0 = LLT::pointer(0, 32);
LLT s32 = LLT::scalar(32);
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, Register(Mips::SP));
+ auto SPReg = MIRBuilder.buildCopy(p0, Register(Mips::SP));
- Register OffsetReg = MRI.createGenericVirtualRegister(s32);
unsigned Offset = VA.getLocMemOffset();
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(s32, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MachinePointerInfo MPO =
MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
unsigned Size = alignTo(VA.getValVT().getSizeInBits(), 8) / 8;
- unsigned Align = MinAlign(TFL->getStackAlignment(), Offset);
- MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, Align);
+ Align Alignment = commonAlignment(TFL->getStackAlign(), Offset);
+ MMO =
+ MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, Size, Alignment);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void OutgoingValueHandler::assignValueToAddress(Register ValVReg,
@@ -323,19 +291,13 @@ Register OutgoingValueHandler::extendRegister(Register ValReg,
LLT LocTy{VA.getLocVT()};
switch (VA.getLocInfo()) {
case CCValAssign::SExt: {
- Register ExtReg = MRI.createGenericVirtualRegister(LocTy);
- MIRBuilder.buildSExt(ExtReg, ValReg);
- return ExtReg;
+ return MIRBuilder.buildSExt(LocTy, ValReg).getReg(0);
}
case CCValAssign::ZExt: {
- Register ExtReg = MRI.createGenericVirtualRegister(LocTy);
- MIRBuilder.buildZExt(ExtReg, ValReg);
- return ExtReg;
+ return MIRBuilder.buildZExt(LocTy, ValReg).getReg(0);
}
case CCValAssign::AExt: {
- Register ExtReg = MRI.createGenericVirtualRegister(LocTy);
- MIRBuilder.buildAnyExt(ExtReg, ValReg);
- return ExtReg;
+ return MIRBuilder.buildAnyExt(LocTy, ValReg).getReg(0);
}
// TODO : handle upper extends
case CCValAssign::Full:
@@ -489,7 +451,7 @@ bool MipsCallLowering::lowerFormalArguments(
static_cast<const MipsTargetMachine &>(MF.getTarget());
const MipsABIInfo &ABI = TM.getABI();
CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(F.getCallingConv()),
- 1);
+ Align(1));
CCInfo.AnalyzeFormalArguments(Ins, TLI.CCAssignFnForCall());
setLocInfo(ArgLocs, Ins);
@@ -524,9 +486,8 @@ bool MipsCallLowering::lowerFormalArguments(
MachinePointerInfo MPO = MachinePointerInfo::getFixedStack(MF, FI);
MachineInstrBuilder FrameIndex =
MIRBuilder.buildFrameIndex(LLT::pointer(MPO.getAddrSpace(), 32), FI);
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore, RegSize,
- /* Alignment */ RegSize);
+ MachineMemOperand *MMO = MF.getMachineMemOperand(
+ MPO, MachineMemOperand::MOStore, RegSize, Align(RegSize));
MIRBuilder.buildStore(Copy, FrameIndex, *MMO);
}
}
@@ -611,7 +572,8 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
MipsCCState CCInfo(F.getCallingConv(), IsCalleeVarArg, MF, ArgLocs,
F.getContext());
- CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv), 1);
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(Info.CallConv),
+ Align(1));
const char *Call =
Info.Callee.isSymbol() ? Info.Callee.getSymbolName() : nullptr;
CCInfo.AnalyzeCallOperands(Outs, TLI.CCAssignFnForCall(), FuncOrigArgs, Call);
@@ -631,7 +593,7 @@ bool MipsCallLowering::lowerCall(MachineIRBuilder &MIRBuilder,
if (IsCalleeGlobalPIC) {
MIRBuilder.buildCopy(
Register(Mips::GP),
- MF.getInfo<MipsFunctionInfo>()->getGlobalBaseRegForGlobalISel());
+ MF.getInfo<MipsFunctionInfo>()->getGlobalBaseRegForGlobalISel(MF));
MIB.addDef(Mips::GP, RegState::Implicit);
}
MIRBuilder.insertInstr(MIB);
@@ -691,7 +653,7 @@ void MipsCallLowering::subTargetRegTypeForCallingConv(
if (i == 0)
Flags.setOrigAlign(TLI.getABIAlignmentForCallingConv(Arg.Ty, DL));
else
- Flags.setOrigAlign(Align::None());
+ Flags.setOrigAlign(Align(1));
ISDArgs.emplace_back(Flags, RegisterVT, VT, true, OrigArgIndices[ArgNo],
0);
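
The assignValueToReg changes above keep the O32 rule that an f64 argument landing in A0-A3 spans a pair of 32-bit registers; endianness decides which register carries the low word, and the GlobalISel code now builds two COPYs feeding a G_MERGE_VALUES (and an unmerge plus two copies on the outgoing path). A standalone, illustrative sketch of the same lo/hi selection:

#include <cassert>
#include <cstdint>
#include <cstring>

// Reassemble a double from the two 32-bit halves it was passed in.
// IsLittleEndian decides which register of the pair carried the low word,
// mirroring the PhysReg + (IsEL ? 0 : 1) selection in the diff.
static double mergeF64(uint32_t RegEven, uint32_t RegOdd, bool IsLittleEndian) {
  uint32_t Lo = IsLittleEndian ? RegEven : RegOdd;
  uint32_t Hi = IsLittleEndian ? RegOdd : RegEven;
  uint64_t Bits = (uint64_t(Hi) << 32) | Lo;
  double D;
  std::memcpy(&D, &Bits, sizeof(D)); // assumes IEEE-754 doubles
  return D;
}

int main() {
  // 1.0 has bit pattern 0x3FF0000000000000.
  assert(mergeF64(0x00000000u, 0x3FF00000u, /*IsLittleEndian=*/true) == 1.0);
  assert(mergeF64(0x3FF00000u, 0x00000000u, /*IsLittleEndian=*/false) == 1.0);
  return 0;
}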
diff --git a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
index 1f1a1574443c..faf7160e63e2 100644
--- a/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
+++ b/llvm/lib/Target/Mips/MipsConstantIslandPass.cpp
@@ -529,7 +529,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
MF->push_back(BB);
// MachineConstantPool measures alignment in bytes. We measure in log2(bytes).
- const Align MaxAlign(MCP->getConstantPoolAlignment());
+ const Align MaxAlign = MCP->getConstantPoolAlign();
// Mark the basic block as required by the const-pool.
// If AlignConstantIslands isn't set, use 4-byte alignment for everything.
@@ -554,14 +554,13 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
for (unsigned i = 0, e = CPs.size(); i != e; ++i) {
unsigned Size = TD.getTypeAllocSize(CPs[i].getType());
assert(Size >= 4 && "Too small constant pool entry");
- unsigned Align = CPs[i].getAlignment();
- assert(isPowerOf2_32(Align) && "Invalid alignment");
+ Align Alignment = CPs[i].getAlign();
// Verify that all constant pool entries are a multiple of their alignment.
// If not, we would have to pad them out so that instructions stay aligned.
- assert((Size % Align) == 0 && "CP Entry not multiple of 4 bytes!");
+ assert(isAligned(Alignment, Size) && "CP Entry not multiple of 4 bytes!");
// Insert CONSTPOOL_ENTRY before entries with a smaller alignment.
- unsigned LogAlign = Log2_32(Align);
+ unsigned LogAlign = Log2(Alignment);
MachineBasicBlock::iterator InsAt = InsPoint[LogAlign];
MachineInstr *CPEMI =
@@ -579,7 +578,7 @@ MipsConstantIslands::doInitialPlacement(std::vector<MachineInstr*> &CPEMIs) {
CPEntries.emplace_back(1, CPEntry(CPEMI, i));
++NumCPEs;
LLVM_DEBUG(dbgs() << "Moved CPI#" << i << " to end of function, size = "
- << Size << ", align = " << Align << '\n');
+ << Size << ", align = " << Alignment.value() << '\n');
}
LLVM_DEBUG(BB->dump());
}
@@ -628,7 +627,7 @@ Align MipsConstantIslands::getCPEAlign(const MachineInstr &CPEMI) {
unsigned CPI = CPEMI.getOperand(1).getIndex();
assert(CPI < MCP->getConstants().size() && "Invalid constant pool index.");
- return Align(MCP->getConstants()[CPI].getAlignment());
+ return MCP->getConstants()[CPI].getAlign();
}
/// initializeFunctionInfo - Do the initial scan of the function, building up
@@ -940,7 +939,7 @@ bool MipsConstantIslands::isWaterInRange(unsigned UserOffset,
MachineFunction::const_iterator NextBlock = ++Water->getIterator();
if (NextBlock == MF->end()) {
NextBlockOffset = BBInfo[Water->getNumber()].postOffset();
- NextBlockAlignment = Align::None();
+ NextBlockAlignment = Align(1);
} else {
NextBlockOffset = BBInfo[NextBlock->getNumber()].Offset;
NextBlockAlignment = NextBlock->getAlignment();
@@ -1656,7 +1655,7 @@ void MipsConstantIslands::prescanForConstants() {
Type *Int32Ty =
Type::getInt32Ty(MF->getFunction().getContext());
const Constant *C = ConstantInt::get(Int32Ty, V);
- unsigned index = MCP->getConstantPoolIndex(C, 4);
+ unsigned index = MCP->getConstantPoolIndex(C, Align(4));
I->getOperand(2).ChangeToImmediate(index);
LLVM_DEBUG(dbgs() << "constant island constant " << *I << "\n");
I->setDesc(TII->get(Mips::LwRxPcTcp16));
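
The constant-island hunks above are part of the broader move from raw unsigned alignments to llvm::Align: getAlign() yields the power-of-two alignment directly, isAligned(Alignment, Size) replaces the old (Size % Align) == 0 assert, and Log2(Alignment) replaces Log2_32. Roughly, those helpers reduce to the integer arithmetic below (a standalone sketch, not the llvm/Support/Alignment.h implementation):

#include <cassert>
#include <cstdint>

// Standalone stand-ins for the llvm::Align helpers used in the diff.
struct SimpleAlign {
  uint64_t Value; // always a power of two
};

static bool isAligned(SimpleAlign A, uint64_t Size) {
  return Size % A.Value == 0; // what the old (Size % Align) == 0 assert checked
}

static unsigned log2Of(SimpleAlign A) {
  unsigned L = 0;
  while ((uint64_t(1) << L) < A.Value)
    ++L;
  return L; // log2Of({8}) == 3, matching Log2(Align(8))
}

int main() {
  SimpleAlign A{8};
  assert(isAligned(A, 16) && !isAligned(A, 12));
  assert(log2Of(A) == 3);
  return 0;
}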
diff --git a/llvm/lib/Target/Mips/MipsDSPInstrFormats.td b/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
index 6f062d0f3c25..abb6aea50710 100644
--- a/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
+++ b/llvm/lib/Target/Mips/MipsDSPInstrFormats.td
@@ -21,11 +21,11 @@ def Dsp2MicroMips : InstrMapping {
}
def HasDSP : Predicate<"Subtarget->hasDSP()">,
- AssemblerPredicate<"FeatureDSP">;
+ AssemblerPredicate<(all_of FeatureDSP)>;
def HasDSPR2 : Predicate<"Subtarget->hasDSPR2()">,
- AssemblerPredicate<"FeatureDSPR2">;
+ AssemblerPredicate<(all_of FeatureDSPR2)>;
def HasDSPR3 : Predicate<"Subtarget->hasDSPR3()">,
- AssemblerPredicate<"FeatureDSPR3">;
+ AssemblerPredicate<(all_of FeatureDSPR3)>;
class ISA_DSPR2 {
list<Predicate> ASEPredicate = [HasDSPR2];
diff --git a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
index 84ff674569cd..155d19ba6959 100644
--- a/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
+++ b/llvm/lib/Target/Mips/MipsDelaySlotFiller.cpp
@@ -96,7 +96,7 @@ static cl::opt<CompactBranchPolicy> MipsCompactBranchPolicy(
cl::values(clEnumValN(CB_Never, "never",
"Do not use compact branches if possible."),
clEnumValN(CB_Optimal, "optimal",
- "Use compact branches where appropiate (default)."),
+ "Use compact branches where appropriate (default)."),
clEnumValN(CB_Always, "always",
"Always use compact branches if possible.")));
diff --git a/llvm/lib/Target/Mips/MipsFastISel.cpp b/llvm/lib/Target/Mips/MipsFastISel.cpp
index 80f288ac500c..8a847eaf6618 100644
--- a/llvm/lib/Target/Mips/MipsFastISel.cpp
+++ b/llvm/lib/Target/Mips/MipsFastISel.cpp
@@ -420,7 +420,7 @@ unsigned MipsFastISel::materializeGV(const GlobalValue *GV, MVT VT) {
if (IsThreadLocal)
return 0;
emitInst(Mips::LW, DestReg)
- .addReg(MFI->getGlobalBaseReg())
+ .addReg(MFI->getGlobalBaseReg(*MF))
.addGlobalAddress(GV, 0, MipsII::MO_GOT);
if ((GV->hasInternalLinkage() ||
(GV->hasLocalLinkage() && !isa<Function>(GV)))) {
@@ -437,7 +437,7 @@ unsigned MipsFastISel::materializeExternalCallSym(MCSymbol *Sym) {
const TargetRegisterClass *RC = &Mips::GPR32RegClass;
unsigned DestReg = createResultReg(RC);
emitInst(Mips::LW, DestReg)
- .addReg(MFI->getGlobalBaseReg())
+ .addReg(MFI->getGlobalBaseReg(*MF))
.addSym(Sym, MipsII::MO_GOT);
return DestReg;
}
@@ -795,12 +795,11 @@ bool MipsFastISel::emitLoad(MVT VT, unsigned &ResultReg, Address &Addr,
}
if (Addr.isFIBase()) {
unsigned FI = Addr.getFI();
- unsigned Align = 4;
int64_t Offset = Addr.getOffset();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), Align);
+ MFI.getObjectSize(FI), Align(4));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addFrameIndex(FI)
.addImm(Offset)
@@ -846,12 +845,11 @@ bool MipsFastISel::emitStore(MVT VT, unsigned SrcReg, Address &Addr,
}
if (Addr.isFIBase()) {
unsigned FI = Addr.getFI();
- unsigned Align = 4;
int64_t Offset = Addr.getOffset();
MachineFrameInfo &MFI = MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), Align);
+ MFI.getObjectSize(FI), Align(4));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg)
.addFrameIndex(FI)
@@ -1263,7 +1261,7 @@ bool MipsFastISel::processCallArgs(CallLoweringInfo &CLI,
Addr.setReg(Mips::SP);
Addr.setOffset(VA.getLocMemOffset() + BEAlign);
- unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ Align Alignment = DL.getABITypeAlign(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, Addr.getOffset()),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
diff --git a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
index 8c36bcd5c8f2..d88696525e9e 100644
--- a/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsISelDAGToDAG.cpp
@@ -65,7 +65,7 @@ bool MipsDAGToDAGISel::runOnMachineFunction(MachineFunction &MF) {
/// getGlobalBaseReg - Output the instructions required to put the
/// GOT address into a register.
SDNode *MipsDAGToDAGISel::getGlobalBaseReg() {
- Register GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg();
+ Register GlobalBaseReg = MF->getInfo<MipsFunctionInfo>()->getGlobalBaseReg(*MF);
return CurDAG->getRegister(GlobalBaseReg, getTargetLowering()->getPointerTy(
CurDAG->getDataLayout()))
.getNode();
@@ -253,7 +253,7 @@ bool MipsDAGToDAGISel::selectVecAddAsVecSubIfProfitable(SDNode *Node) {
SDLoc DL(Node);
SDValue NegC = CurDAG->FoldConstantArithmetic(
- ISD::SUB, DL, VT, CurDAG->getConstant(0, DL, VT).getNode(), C.getNode());
+ ISD::SUB, DL, VT, {CurDAG->getConstant(0, DL, VT), C});
assert(NegC && "Constant-folding failed!");
SDValue NewNode = CurDAG->getNode(ISD::SUB, DL, VT, X, NegC);
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.cpp b/llvm/lib/Target/Mips/MipsISelLowering.cpp
index 46b1f35a6fc7..2da35020006e 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsISelLowering.cpp
@@ -142,8 +142,9 @@ unsigned MipsTargetLowering::getVectorTypeBreakdownForCallingConv(
}
SDValue MipsTargetLowering::getGlobalReg(SelectionDAG &DAG, EVT Ty) const {
- MipsFunctionInfo *FI = DAG.getMachineFunction().getInfo<MipsFunctionInfo>();
- return DAG.getRegister(FI->getGlobalBaseReg(), Ty);
+ MachineFunction &MF = DAG.getMachineFunction();
+ MipsFunctionInfo *FI = MF.getInfo<MipsFunctionInfo>();
+ return DAG.getRegister(FI->getGlobalBaseReg(MF), Ty);
}
SDValue MipsTargetLowering::getTargetNode(GlobalAddressSDNode *N, EVT Ty,
@@ -173,7 +174,7 @@ SDValue MipsTargetLowering::getTargetNode(JumpTableSDNode *N, EVT Ty,
SDValue MipsTargetLowering::getTargetNode(ConstantPoolSDNode *N, EVT Ty,
SelectionDAG &DAG,
unsigned Flag) const {
- return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flag);
}
@@ -1451,6 +1452,14 @@ MipsTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case Mips::PseudoD_SELECT_I:
case Mips::PseudoD_SELECT_I64:
return emitPseudoD_SELECT(MI, BB);
+ case Mips::LDR_W:
+ return emitLDR_W(MI, BB);
+ case Mips::LDR_D:
+ return emitLDR_D(MI, BB);
+ case Mips::STR_W:
+ return emitSTR_W(MI, BB);
+ case Mips::STR_D:
+ return emitSTR_D(MI, BB);
}
}
@@ -2900,8 +2909,8 @@ static bool CC_MipsO32(unsigned ValNo, MVT ValVT, MVT LocVT,
// argument which is not f32 or f64.
bool AllocateFloatsInIntReg = State.isVarArg() || ValNo > 1 ||
State.getFirstUnallocated(F32Regs) != ValNo;
- unsigned OrigAlign = ArgFlags.getOrigAlign();
- bool isI64 = (ValVT == MVT::i32 && OrigAlign == 8);
+ Align OrigAlign = ArgFlags.getNonZeroOrigAlign();
+ bool isI64 = (ValVT == MVT::i32 && OrigAlign == Align(8));
bool isVectorFloat = MipsState->WasOriginalArgVectorFloat(ValNo);
// The MIPS vector ABI for floats passes them in a pair of registers
@@ -3201,7 +3210,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// caller side but removing it breaks the frame size calculation.
unsigned ReservedArgArea =
MemcpyInByVal ? 0 : ABI.GetCalleeAllocdArgSizeInBytes(CallConv);
- CCInfo.AllocateStack(ReservedArgArea, 1);
+ CCInfo.AllocateStack(ReservedArgArea, Align(1));
CCInfo.AnalyzeCallOperands(Outs, CC_Mips, CLI.getArgs(),
ES ? ES->getSymbol() : nullptr);
@@ -3209,6 +3218,9 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Get a count of how many bytes are to be pushed on the stack.
unsigned NextStackOffset = CCInfo.getNextStackOffset();
+ // Call site info for tracking function parameters.
+ MachineFunction::CallSiteInfo CSInfo;
+
// Check if it's really possible to do a tail call. Restrict it to functions
// that are part of this compilation unit.
bool InternalLinkage = false;
@@ -3223,7 +3235,7 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
G->getGlobal()->hasProtectedVisibility());
}
}
- if (!IsTailCall && CLI.CS && CLI.CS.isMustTailCall())
+ if (!IsTailCall && CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -3335,6 +3347,17 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// RegsToPass vector
if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+
+ // If the parameter is passed through an AFGR64 register ($D pair), which
+ // splits into two physical registers, avoid creating call site info.
+ if (Mips::AFGR64RegClass.contains(VA.getLocReg()))
+ continue;
+
+ // Collect CSInfo about which register passes which parameter.
+ const TargetOptions &Options = DAG.getTarget().Options;
+ if (Options.SupportsDebugEntryValues)
+ CSInfo.emplace_back(VA.getLocReg(), i);
+
continue;
}
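
The CSInfo additions above record, for each argument passed entirely in one physical register, a (register, argument index) pair when the target supports debug entry values; arguments passed in an AFGR64 $D pair are skipped because a single entry cannot describe two registers. A rough standalone sketch of that bookkeeping, with hypothetical register numbers:

#include <cassert>
#include <utility>
#include <vector>

// Illustrative stand-in for the call site info the diff collects: for each
// argument that fits in one physical register, remember (register, index).
using CallSiteParamInfo = std::vector<std::pair<unsigned, unsigned>>;

static void recordParam(CallSiteParamInfo &CSInfo, unsigned PhysReg,
                        unsigned ArgIdx, bool SplitsAcrossTwoRegs) {
  if (SplitsAcrossTwoRegs)
    return; // mirrors the AFGR64 ($D pair) early-continue in the diff
  CSInfo.emplace_back(PhysReg, ArgIdx);
}

int main() {
  CallSiteParamInfo CSInfo;
  recordParam(CSInfo, /*PhysReg=*/10, /*ArgIdx=*/0, /*Splits=*/false);
  recordParam(CSInfo, /*PhysReg=*/20, /*ArgIdx=*/1, /*Splits=*/true);
  assert(CSInfo.size() == 1); // only the unsplit argument is recorded
  return 0;
}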
@@ -3398,11 +3421,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else if (Subtarget.useXGOT()) {
Callee = getAddrGlobalLargeGOT(G, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
- FuncInfo->callPtrInfo(Val));
+ FuncInfo->callPtrInfo(MF, Val));
IsCallReloc = true;
} else {
Callee = getAddrGlobal(G, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
- FuncInfo->callPtrInfo(Val));
+ FuncInfo->callPtrInfo(MF, Val));
IsCallReloc = true;
}
} else
@@ -3420,11 +3443,11 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else if (Subtarget.useXGOT()) {
Callee = getAddrGlobalLargeGOT(S, DL, Ty, DAG, MipsII::MO_CALL_HI16,
MipsII::MO_CALL_LO16, Chain,
- FuncInfo->callPtrInfo(Sym));
+ FuncInfo->callPtrInfo(MF, Sym));
IsCallReloc = true;
} else { // PIC
Callee = getAddrGlobal(S, DL, Ty, DAG, MipsII::MO_GOT_CALL, Chain,
- FuncInfo->callPtrInfo(Sym));
+ FuncInfo->callPtrInfo(MF, Sym));
IsCallReloc = true;
}
@@ -3439,12 +3462,16 @@ MipsTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (IsTailCall) {
MF.getFrameInfo().setHasTailCall();
- return DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
+ SDValue Ret = DAG.getNode(MipsISD::TailCall, DL, MVT::Other, Ops);
+ DAG.addCallSiteInfo(Ret.getNode(), std::move(CSInfo));
+ return Ret;
}
Chain = DAG.getNode(MipsISD::JmpLink, DL, NodeTys, Ops);
SDValue InFlag = Chain.getValue(1);
+ DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
+
// Create the CALLSEQ_END node in the case of where it is not a call to
// memcpy.
if (!(MemcpyInByVal)) {
@@ -3605,7 +3632,7 @@ SDValue MipsTargetLowering::LowerFormalArguments(
SmallVector<CCValAssign, 16> ArgLocs;
MipsCCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
*DAG.getContext());
- CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), 1);
+ CCInfo.AllocateStack(ABI.GetCalleeAllocdArgSizeInBytes(CallConv), Align(1));
const Function &Func = DAG.getMachineFunction().getFunction();
Function::const_arg_iterator FuncArg = Func.arg_begin();
@@ -3940,7 +3967,7 @@ MipsTargetLowering::getSingleConstraintMatchWeight(
break;
case 'f': // FPU or MSA register
if (Subtarget.hasMSA() && type->isVectorTy() &&
- cast<VectorType>(type)->getBitWidth() == 128)
+ type->getPrimitiveSizeInBits().getFixedSize() == 128)
weight = CW_Register;
else if (type->isFloatTy())
weight = CW_Register;
@@ -4269,9 +4296,7 @@ MipsTargetLowering::isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const {
}
EVT MipsTargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
if (Subtarget.hasMips64())
return MVT::i64;
@@ -4363,7 +4388,8 @@ void MipsTargetLowering::passByValArg(
unsigned ByValSizeInBytes = Flags.getByValSize();
unsigned OffsetInBytes = 0; // From beginning of struct
unsigned RegSizeInBytes = Subtarget.getGPRSizeInBytes();
- unsigned Alignment = std::min(Flags.getByValAlign(), RegSizeInBytes);
+ Align Alignment =
+ std::min(Flags.getNonZeroByValAlign(), Align(RegSizeInBytes));
EVT PtrTy = getPointerTy(DAG.getDataLayout()),
RegTy = MVT::getIntegerVT(RegSizeInBytes * 8);
unsigned NumRegs = LastReg - FirstReg;
@@ -4378,7 +4404,7 @@ void MipsTargetLowering::passByValArg(
SDValue LoadPtr = DAG.getNode(ISD::ADD, DL, PtrTy, Arg,
DAG.getConstant(OffsetInBytes, DL, PtrTy));
SDValue LoadVal = DAG.getLoad(RegTy, DL, Chain, LoadPtr,
- MachinePointerInfo(), Alignment);
+ MachinePointerInfo(), Alignment.value());
MemOpChains.push_back(LoadVal.getValue(1));
unsigned ArgReg = ArgRegs[FirstReg + I];
RegsToPass.push_back(std::make_pair(ArgReg, LoadVal));
@@ -4405,7 +4431,7 @@ void MipsTargetLowering::passByValArg(
PtrTy));
SDValue LoadVal = DAG.getExtLoad(
ISD::ZEXTLOAD, DL, RegTy, Chain, LoadPtr, MachinePointerInfo(),
- MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment);
+ MVT::getIntegerVT(LoadSizeInBytes * 8), Alignment.value());
MemOpChains.push_back(LoadVal.getValue(1));
// Shift the loaded value.
@@ -4426,7 +4452,7 @@ void MipsTargetLowering::passByValArg(
OffsetInBytes += LoadSizeInBytes;
TotalBytesLoaded += LoadSizeInBytes;
- Alignment = std::min(Alignment, LoadSizeInBytes);
+ Alignment = std::min(Alignment, Align(LoadSizeInBytes));
}
unsigned ArgReg = ArgRegs[FirstReg + I];
@@ -4441,11 +4467,10 @@ void MipsTargetLowering::passByValArg(
DAG.getConstant(OffsetInBytes, DL, PtrTy));
SDValue Dst = DAG.getNode(ISD::ADD, DL, PtrTy, StackPtr,
DAG.getIntPtrConstant(VA.getLocMemOffset(), DL));
- Chain = DAG.getMemcpy(Chain, DL, Dst, Src,
- DAG.getConstant(MemCpySize, DL, PtrTy),
- Alignment, /*isVolatile=*/false, /*AlwaysInline=*/false,
- /*isTailCall=*/false,
- MachinePointerInfo(), MachinePointerInfo());
+ Chain = DAG.getMemcpy(
+ Chain, DL, Dst, Src, DAG.getConstant(MemCpySize, DL, PtrTy),
+ Align(Alignment), /*isVolatile=*/false, /*AlwaysInline=*/false,
+ /*isTailCall=*/false, MachinePointerInfo(), MachinePointerInfo());
MemOpChains.push_back(Chain);
}
@@ -4497,12 +4522,12 @@ void MipsTargetLowering::writeVarArgRegs(std::vector<SDValue> &OutChains,
}
void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
- unsigned Align) const {
+ Align Alignment) const {
const TargetFrameLowering *TFL = Subtarget.getFrameLowering();
assert(Size && "Byval argument's size shouldn't be 0.");
- Align = std::min(Align, TFL->getStackAlignment());
+ Alignment = std::min(Alignment, TFL->getStackAlign());
unsigned FirstReg = 0;
unsigned NumRegs = 0;
@@ -4516,17 +4541,17 @@ void MipsTargetLowering::HandleByVal(CCState *State, unsigned &Size,
// We used to check the size as well but we can't do that anymore since
// CCState::HandleByVal() rounds up the size after calling this function.
- assert(!(Align % RegSizeInBytes) &&
- "Byval argument's alignment should be a multiple of"
- "RegSizeInBytes.");
+ assert(
+ Alignment >= Align(RegSizeInBytes) &&
+ "Byval argument's alignment should be a multiple of RegSizeInBytes.");
FirstReg = State->getFirstUnallocated(IntArgRegs);
- // If Align > RegSizeInBytes, the first arg register must be even.
+ // If Alignment > RegSizeInBytes, the first arg register must be even.
// FIXME: This condition happens to do the right thing but it's not the
// right way to test it. We want to check that the stack frame offset
// of the register is aligned.
- if ((Align > RegSizeInBytes) && (FirstReg % 2)) {
+ if ((Alignment > RegSizeInBytes) && (FirstReg % 2)) {
State->AllocateReg(IntArgRegs[FirstReg], ShadowRegs[FirstReg]);
++FirstReg;
}
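
HandleByVal now clamps the byval alignment against the stack alignment as an Align and, when the required alignment exceeds the 4-byte GPR size, advances to an even-numbered argument register so the register pair mirrors an 8-byte-aligned stack slot. A small standalone sketch of that register-selection rule (argument-register indices are illustrative):

#include <cassert>

// Pick the first integer argument register index for a byval argument,
// skipping to an even register when the required alignment is larger than
// the register size, as in the (Alignment > RegSizeInBytes) && (FirstReg % 2)
// check above.
static unsigned firstByValReg(unsigned FirstFree, unsigned AlignInBytes,
                              unsigned RegSizeInBytes = 4) {
  unsigned FirstReg = FirstFree;
  if (AlignInBytes > RegSizeInBytes && (FirstReg % 2))
    ++FirstReg; // e.g. skip the odd register so the pair is 8-byte aligned
  return FirstReg;
}

int main() {
  assert(firstByValReg(/*FirstFree=*/1, /*AlignInBytes=*/8) == 2);
  assert(firstByValReg(/*FirstFree=*/1, /*AlignInBytes=*/4) == 1);
  return 0;
}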
@@ -4717,3 +4742,274 @@ MipsTargetLowering::getRegisterByName(const char *RegName, LLT VT,
}
report_fatal_error("Invalid register name global variable");
}
+
+MachineBasicBlock *MipsTargetLowering::emitLDR_W(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool IsLittle = Subtarget.isLittle();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register Dest = MI.getOperand(0).getReg();
+ Register Address = MI.getOperand(1).getReg();
+ unsigned Imm = MI.getOperand(2).getImm();
+
+ MachineBasicBlock::iterator I(MI);
+
+ if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
+ // Mips release 6 can load from an address that is not naturally aligned.
+ Register Temp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::LW))
+ .addDef(Temp)
+ .addUse(Address)
+ .addImm(Imm);
+ BuildMI(*BB, I, DL, TII->get(Mips::FILL_W)).addDef(Dest).addUse(Temp);
+ } else {
+ // Mips release 5 needs to use instructions that can load from an unaligned
+ // memory address.
+ Register LoadHalf = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register LoadFull = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Undef = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::IMPLICIT_DEF)).addDef(Undef);
+ BuildMI(*BB, I, DL, TII->get(Mips::LWR))
+ .addDef(LoadHalf)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 0 : 3))
+ .addUse(Undef);
+ BuildMI(*BB, I, DL, TII->get(Mips::LWL))
+ .addDef(LoadFull)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 3 : 0))
+ .addUse(LoadHalf);
+ BuildMI(*BB, I, DL, TII->get(Mips::FILL_W)).addDef(Dest).addUse(LoadFull);
+ }
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitLDR_D(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool IsLittle = Subtarget.isLittle();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register Dest = MI.getOperand(0).getReg();
+ Register Address = MI.getOperand(1).getReg();
+ unsigned Imm = MI.getOperand(2).getImm();
+
+ MachineBasicBlock::iterator I(MI);
+
+ if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
+ // Mips release 6 can load from an address that is not naturally aligned.
+ if (Subtarget.isGP64bit()) {
+ Register Temp = MRI.createVirtualRegister(&Mips::GPR64RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::LD))
+ .addDef(Temp)
+ .addUse(Address)
+ .addImm(Imm);
+ BuildMI(*BB, I, DL, TII->get(Mips::FILL_D)).addDef(Dest).addUse(Temp);
+ } else {
+ Register Wtemp = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Lo = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Hi = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::LW))
+ .addDef(Lo)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 0 : 4));
+ BuildMI(*BB, I, DL, TII->get(Mips::LW))
+ .addDef(Hi)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 4 : 0));
+ BuildMI(*BB, I, DL, TII->get(Mips::FILL_W)).addDef(Wtemp).addUse(Lo);
+ BuildMI(*BB, I, DL, TII->get(Mips::INSERT_W), Dest)
+ .addUse(Wtemp)
+ .addUse(Hi)
+ .addImm(1);
+ }
+ } else {
+ // Mips release 5 needs to use instructions that can load from an unaligned
+ // memory address.
+ Register LoHalf = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register LoFull = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register LoUndef = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register HiHalf = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register HiFull = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register HiUndef = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Wtemp = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::IMPLICIT_DEF)).addDef(LoUndef);
+ BuildMI(*BB, I, DL, TII->get(Mips::LWR))
+ .addDef(LoHalf)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 0 : 7))
+ .addUse(LoUndef);
+ BuildMI(*BB, I, DL, TII->get(Mips::LWL))
+ .addDef(LoFull)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 3 : 4))
+ .addUse(LoHalf);
+ BuildMI(*BB, I, DL, TII->get(Mips::IMPLICIT_DEF)).addDef(HiUndef);
+ BuildMI(*BB, I, DL, TII->get(Mips::LWR))
+ .addDef(HiHalf)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 4 : 3))
+ .addUse(HiUndef);
+ BuildMI(*BB, I, DL, TII->get(Mips::LWL))
+ .addDef(HiFull)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 7 : 0))
+ .addUse(HiHalf);
+ BuildMI(*BB, I, DL, TII->get(Mips::FILL_W)).addDef(Wtemp).addUse(LoFull);
+ BuildMI(*BB, I, DL, TII->get(Mips::INSERT_W), Dest)
+ .addUse(Wtemp)
+ .addUse(HiFull)
+ .addImm(1);
+ }
+
+ MI.eraseFromParent();
+ return BB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitSTR_W(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool IsLittle = Subtarget.isLittle();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register StoreVal = MI.getOperand(0).getReg();
+ Register Address = MI.getOperand(1).getReg();
+ unsigned Imm = MI.getOperand(2).getImm();
+
+ MachineBasicBlock::iterator I(MI);
+
+ if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
+ // Mips release 6 can store to an address that is not naturally aligned.
+ Register BitcastW = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Tmp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY)).addDef(BitcastW).addUse(StoreVal);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_W))
+ .addDef(Tmp)
+ .addUse(BitcastW)
+ .addImm(0);
+ BuildMI(*BB, I, DL, TII->get(Mips::SW))
+ .addUse(Tmp)
+ .addUse(Address)
+ .addImm(Imm);
+ } else {
+ // Mips release 5 needs to use instructions that can store to an unaligned
+ // memory address.
+ Register Tmp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_W))
+ .addDef(Tmp)
+ .addUse(StoreVal)
+ .addImm(0);
+ BuildMI(*BB, I, DL, TII->get(Mips::SWR))
+ .addUse(Tmp)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 0 : 3));
+ BuildMI(*BB, I, DL, TII->get(Mips::SWL))
+ .addUse(Tmp)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 3 : 0));
+ }
+
+ MI.eraseFromParent();
+
+ return BB;
+}
+
+MachineBasicBlock *MipsTargetLowering::emitSTR_D(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
+ MachineFunction *MF = BB->getParent();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const bool IsLittle = Subtarget.isLittle();
+ DebugLoc DL = MI.getDebugLoc();
+
+ Register StoreVal = MI.getOperand(0).getReg();
+ Register Address = MI.getOperand(1).getReg();
+ unsigned Imm = MI.getOperand(2).getImm();
+
+ MachineBasicBlock::iterator I(MI);
+
+ if (Subtarget.hasMips32r6() || Subtarget.hasMips64r6()) {
+ // Mips release 6 can store to an address that is not naturally aligned.
+ if (Subtarget.isGP64bit()) {
+ Register BitcastD = MRI.createVirtualRegister(&Mips::MSA128DRegClass);
+ Register Lo = MRI.createVirtualRegister(&Mips::GPR64RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY))
+ .addDef(BitcastD)
+ .addUse(StoreVal);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_D))
+ .addDef(Lo)
+ .addUse(BitcastD)
+ .addImm(0);
+ BuildMI(*BB, I, DL, TII->get(Mips::SD))
+ .addUse(Lo)
+ .addUse(Address)
+ .addImm(Imm);
+ } else {
+ Register BitcastW = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Lo = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Hi = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY))
+ .addDef(BitcastW)
+ .addUse(StoreVal);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_W))
+ .addDef(Lo)
+ .addUse(BitcastW)
+ .addImm(0);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_W))
+ .addDef(Hi)
+ .addUse(BitcastW)
+ .addImm(1);
+ BuildMI(*BB, I, DL, TII->get(Mips::SW))
+ .addUse(Lo)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 0 : 4));
+ BuildMI(*BB, I, DL, TII->get(Mips::SW))
+ .addUse(Hi)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 4 : 0));
+ }
+ } else {
+ // Mips release 5 needs to use instructions that can store to an unaligned
+ // memory address.
+ Register Bitcast = MRI.createVirtualRegister(&Mips::MSA128WRegClass);
+ Register Lo = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ Register Hi = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY)).addDef(Bitcast).addUse(StoreVal);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_W))
+ .addDef(Lo)
+ .addUse(Bitcast)
+ .addImm(0);
+ BuildMI(*BB, I, DL, TII->get(Mips::COPY_S_W))
+ .addDef(Hi)
+ .addUse(Bitcast)
+ .addImm(1);
+ BuildMI(*BB, I, DL, TII->get(Mips::SWR))
+ .addUse(Lo)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 0 : 3));
+ BuildMI(*BB, I, DL, TII->get(Mips::SWL))
+ .addUse(Lo)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 3 : 0));
+ BuildMI(*BB, I, DL, TII->get(Mips::SWR))
+ .addUse(Hi)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 4 : 7));
+ BuildMI(*BB, I, DL, TII->get(Mips::SWL))
+ .addUse(Hi)
+ .addUse(Address)
+ .addImm(Imm + (IsLittle ? 7 : 4));
+ }
+
+ MI.eraseFromParent();
+ return BB;
+}
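
The custom inserters added above share one pattern: on MIPSR6 an ordinary LW/SW handles the unaligned word, while pre-R6 code pairs LWR/LWL (or SWR/SWL), with the two immediates covering opposite ends of the word depending on endianness. The offset selection boils down to the following (illustrative sketch only, mirroring the Imm + (IsLittle ? 0 : 3) operands built in emitLDR_W/emitSTR_W):

#include <cassert>
#include <utility>

// Offsets fed to the LWR/LWL (or SWR/SWL) pair for a 4-byte unaligned
// access at base offset Imm.
static std::pair<unsigned, unsigned> partialWordOffsets(unsigned Imm,
                                                        bool IsLittle) {
  unsigned RightOff = Imm + (IsLittle ? 0 : 3); // LWR / SWR operand
  unsigned LeftOff = Imm + (IsLittle ? 3 : 0);  // LWL / SWL operand
  return {RightOff, LeftOff};
}

int main() {
  assert(partialWordOffsets(0, /*IsLittle=*/true) == std::make_pair(0u, 3u));
  assert(partialWordOffsets(0, /*IsLittle=*/false) == std::make_pair(3u, 0u));
  return 0;
}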
diff --git a/llvm/lib/Target/Mips/MipsISelLowering.h b/llvm/lib/Target/Mips/MipsISelLowering.h
index 92cbe1d54c5b..16b4d51d3ca6 100644
--- a/llvm/lib/Target/Mips/MipsISelLowering.h
+++ b/llvm/lib/Target/Mips/MipsISelLowering.h
@@ -306,7 +306,7 @@ class TargetRegisterClass;
/// Return the correct alignment for the current calling convention.
Align getABIAlignmentForCallingConv(Type *ArgTy,
DataLayout DL) const override {
- const Align ABIAlign(DL.getABITypeAlignment(ArgTy));
+ const Align ABIAlign = DL.getABITypeAlign(ArgTy);
if (ArgTy->isVectorTy())
return std::min(ABIAlign, Align(8));
return ABIAlign;
@@ -346,21 +346,21 @@ class TargetRegisterClass;
void AdjustInstrPostInstrSelection(MachineInstr &MI,
SDNode *Node) const override;
- void HandleByVal(CCState *, unsigned &, unsigned) const override;
+ void HandleByVal(CCState *, unsigned &, Align) const override;
Register getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const override;
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
return ABI.IsN64() ? Mips::A0_64 : Mips::A0;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
return ABI.IsN64() ? Mips::A1_64 : Mips::A1;
}
@@ -669,10 +669,7 @@ class TargetRegisterClass;
bool isOffsetFoldingLegal(const GlobalAddressSDNode *GA) const override;
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign,
- unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset,
- bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
/// isFPImmLegal - Returns true if the target can instruction select the
@@ -709,6 +706,10 @@ class TargetRegisterClass;
bool isFPCmp, unsigned Opc) const;
MachineBasicBlock *emitPseudoD_SELECT(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitLDR_W(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitLDR_D(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitSTR_W(MachineInstr &MI, MachineBasicBlock *BB) const;
+ MachineBasicBlock *emitSTR_D(MachineInstr &MI, MachineBasicBlock *BB) const;
};
/// Create MipsTargetLowering objects.
diff --git a/llvm/lib/Target/Mips/MipsInstrFPU.td b/llvm/lib/Target/Mips/MipsInstrFPU.td
index 79776998463f..5696df96e798 100644
--- a/llvm/lib/Target/Mips/MipsInstrFPU.td
+++ b/llvm/lib/Target/Mips/MipsInstrFPU.td
@@ -48,6 +48,7 @@ def MipsFPBrcond : SDNode<"MipsISD::FPBrcond", SDT_MipsFPBrcond,
[SDNPHasChain, SDNPOptInGlue]>;
def MipsTruncIntFP : SDNode<"MipsISD::TruncIntFP", SDT_MipsTruncIntFP>;
def MipsBuildPairF64 : SDNode<"MipsISD::BuildPairF64", SDT_MipsBuildPairF64>;
+def : GINodeEquiv<G_MERGE_VALUES, MipsBuildPairF64>;
def MipsExtractElementF64 : SDNode<"MipsISD::ExtractElementF64",
SDT_MipsExtractElementF64>;
@@ -62,15 +63,17 @@ let PrintMethod = "printFCCOperand", DecoderMethod = "DecodeCondCode" in
//===----------------------------------------------------------------------===//
def IsFP64bit : Predicate<"Subtarget->isFP64bit()">,
- AssemblerPredicate<"FeatureFP64Bit">;
+ AssemblerPredicate<(all_of FeatureFP64Bit)>;
def NotFP64bit : Predicate<"!Subtarget->isFP64bit()">,
- AssemblerPredicate<"!FeatureFP64Bit">;
+ AssemblerPredicate<(all_of (not FeatureFP64Bit))>;
def IsSingleFloat : Predicate<"Subtarget->isSingleFloat()">,
- AssemblerPredicate<"FeatureSingleFloat">;
+ AssemblerPredicate<(all_of FeatureSingleFloat)>;
def IsNotSingleFloat : Predicate<"!Subtarget->isSingleFloat()">,
- AssemblerPredicate<"!FeatureSingleFloat">;
+ AssemblerPredicate<(all_of (not FeatureSingleFloat))>;
def IsNotSoftFloat : Predicate<"!Subtarget->useSoftFloat()">,
- AssemblerPredicate<"!FeatureSoftFloat">;
+ AssemblerPredicate<(all_of (not FeatureSoftFloat))>;
+def HasMips3D : Predicate<"Subtarget->has3D()">,
+ AssemblerPredicate<(all_of FeatureMips3D)>;
//===----------------------------------------------------------------------===//
// Mips FGR size adjectives.
@@ -455,6 +458,12 @@ let DecoderNamespace = "MipsFP64" in {
def PLU_PS64 : ADDS_FT<"plu.ps", FGR64Opnd, II_CVT, 0>,
ADDS_FM<0x2D, 22>,
ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def PUL_PS64 : ADDS_FT<"pul.ps", FGR64Opnd, II_CVT, 0>,
+ ADDS_FM<0x2E, 22>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def PUU_PS64 : ADDS_FT<"puu.ps", FGR64Opnd, II_CVT, 0>,
+ ADDS_FM<0x2F, 22>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
def CVT_S_PU64 : ABSS_FT<"cvt.s.pu", FGR32Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x20, 22>,
@@ -470,6 +479,21 @@ let DecoderNamespace = "MipsFP64" in {
}
let DecoderNamespace = "MipsFP64" in {
+ let AdditionalPredicates = [HasMips3D] in {
+ def ADDR_PS64 : ADDS_FT<"addr.ps", FGR64Opnd, II_ADDR_PS, 0>,
+ ADDS_FM<0x18, 22>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def MULR_PS64 : ADDS_FT<"mulr.ps", FGR64Opnd, II_MULR_PS, 0>,
+ ADDS_FM<0x1a, 22>, ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def CVT_PS_PW64 : ABSS_FT<"cvt.ps.pw", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x26, 20>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ def CVT_PW_PS64 : ABSS_FT<"cvt.pw.ps", FGR64Opnd, FGR64Opnd, II_CVT>,
+ ABSS_FM<0x24, 22>,
+ ISA_MIPS32R2_NOT_32R6_64R6, FGR_64;
+ }
+}
+
+let DecoderNamespace = "MipsFP64" in {
let AdditionalPredicates = [NotInMicroMips] in {
def CVT_S_L : ABSS_FT<"cvt.s.l", FGR32Opnd, FGR64Opnd, II_CVT>,
ABSS_FM<0x20, 21>, INSN_MIPS3_32R2, FGR_64;
diff --git a/llvm/lib/Target/Mips/MipsInstrFormats.td b/llvm/lib/Target/Mips/MipsInstrFormats.td
index 4624c1f2d04a..10529c7d9e19 100644
--- a/llvm/lib/Target/Mips/MipsInstrFormats.td
+++ b/llvm/lib/Target/Mips/MipsInstrFormats.td
@@ -169,39 +169,6 @@ class FR<bits<6> op, bits<6> _funct, dag outs, dag ins, string asmstr,
}
//===----------------------------------------------------------------------===//
-// Format I instruction class in Mips : <|opcode|rs|rt|immediate|>
-//===----------------------------------------------------------------------===//
-
-class FI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern,
- InstrItinClass itin>: InstSE<outs, ins, asmstr, pattern, itin, FrmI>
-{
- bits<5> rt;
- bits<5> rs;
- bits<16> imm16;
-
- let Opcode = op;
-
- let Inst{25-21} = rs;
- let Inst{20-16} = rt;
- let Inst{15-0} = imm16;
-}
-
-class BranchBase<bits<6> op, dag outs, dag ins, string asmstr,
- list<dag> pattern, InstrItinClass itin>:
- InstSE<outs, ins, asmstr, pattern, itin, FrmI>
-{
- bits<5> rs;
- bits<5> rt;
- bits<16> imm16;
-
- let Opcode = op;
-
- let Inst{25-21} = rs;
- let Inst{20-16} = rt;
- let Inst{15-0} = imm16;
-}
-
-//===----------------------------------------------------------------------===//
// Format J instruction class in Mips : <|opcode|address|>
//===----------------------------------------------------------------------===//
@@ -711,20 +678,6 @@ class EI_FM<bits<1> sc> : StdArch
// Format FI instruction class in Mips : <|opcode|base|ft|immediate|>
//===----------------------------------------------------------------------===//
-class FFI<bits<6> op, dag outs, dag ins, string asmstr, list<dag> pattern>:
- InstSE<outs, ins, asmstr, pattern, NoItinerary, FrmFI>
-{
- bits<5> ft;
- bits<5> base;
- bits<16> imm16;
-
- let Opcode = op;
-
- let Inst{25-21} = base;
- let Inst{20-16} = ft;
- let Inst{15-0} = imm16;
-}
-
class ADDS_FM<bits<6> funct, bits<5> fmt> : StdArch {
bits<5> fd;
bits<5> fs;
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.cpp b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
index 25bbe5990827..0c6080258a3a 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.cpp
@@ -23,6 +23,7 @@
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/CodeGen/TargetOpcodes.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/MC/MCInstrDesc.h"
#include "llvm/Target/TargetMachine.h"
@@ -66,10 +67,10 @@ MipsInstrInfo::GetMemOperand(MachineBasicBlock &MBB, int FI,
MachineMemOperand::Flags Flags) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FI);
return MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
- Flags, MFI.getObjectSize(FI), Align);
+ Flags, MFI.getObjectSize(FI),
+ MFI.getObjectAlign(FI));
}
//===----------------------------------------------------------------------===//
@@ -841,3 +842,56 @@ MipsInstrInfo::getSerializableDirectMachineOperandTargetFlags() const {
};
return makeArrayRef(Flags);
}
+
+Optional<ParamLoadedValue>
+MipsInstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
+ DIExpression *Expr =
+ DIExpression::get(MI.getMF()->getFunction().getContext(), {});
+
+ // TODO: Special MIPS instructions that need to be described separately.
+ if (auto RegImm = isAddImmediate(MI, Reg)) {
+ Register SrcReg = RegImm->Reg;
+ int64_t Offset = RegImm->Imm;
+ // When SrcReg is $zero, treat loaded value as immediate only.
+ // Ex. $a2 = ADDiu $zero, 10
+ if (SrcReg == Mips::ZERO || SrcReg == Mips::ZERO_64) {
+ return ParamLoadedValue(MI.getOperand(2), Expr);
+ }
+ Expr = DIExpression::prepend(Expr, DIExpression::ApplyOffset, Offset);
+ return ParamLoadedValue(MachineOperand::CreateReg(SrcReg, false), Expr);
+ } else if (auto DestSrc = isCopyInstr(MI)) {
+ const MachineFunction *MF = MI.getMF();
+ const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
+ Register DestReg = DestSrc->Destination->getReg();
+ // TODO: Handle cases where the Reg is sub- or super-register of the
+ // DestReg.
+ if (TRI->isSuperRegister(Reg, DestReg) || TRI->isSubRegister(Reg, DestReg))
+ return None;
+ }
+
+ return TargetInstrInfo::describeLoadedValue(MI, Reg);
+}
+
+Optional<RegImmPair> MipsInstrInfo::isAddImmediate(const MachineInstr &MI,
+ Register Reg) const {
+ // TODO: Handle cases where Reg is a super- or sub-register of the
+ // destination register.
+ const MachineOperand &Op0 = MI.getOperand(0);
+ if (!Op0.isReg() || Reg != Op0.getReg())
+ return None;
+
+ switch (MI.getOpcode()) {
+ case Mips::ADDiu:
+ case Mips::DADDiu: {
+ const MachineOperand &Dop = MI.getOperand(0);
+ const MachineOperand &Sop1 = MI.getOperand(1);
+ const MachineOperand &Sop2 = MI.getOperand(2);
+ // Value is sum of register and immediate. Immediate value could be
+ // global string address which is not supported.
+ if (Dop.isReg() && Sop1.isReg() && Sop2.isImm())
+ return RegImmPair{Sop1.getReg(), Sop2.getImm()};
+ // TODO: Handle case where Sop1 is a frame-index.
+ }
+ }
+ return None;
+}
\ No newline at end of file
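The describeLoadedValue/isAddImmediate pair added above reports a call-site parameter either as a plain constant (when the add's source is $zero) or as a base register plus an offset that the real code encodes in a DIExpression. A minimal standalone sketch of that decision, using invented types (MiniAddImm, DescribedValue) rather than the LLVM API:

#include <cstdint>
#include <cstdio>
#include <optional>

// Models "$dst = ADDiu $src, imm"; register numbers are arbitrary stand-ins.
struct MiniAddImm {
  unsigned Dst, Src;
  int64_t Imm;
};
constexpr unsigned ZeroReg = 0; // stand-in for Mips::ZERO

struct DescribedValue {
  bool IsConstant;  // $src was $zero: the value is just the immediate
  unsigned BaseReg; // otherwise: value = BaseReg + Offset
  int64_t Offset;
};

static std::optional<DescribedValue> describe(const MiniAddImm &MI, unsigned Reg) {
  if (MI.Dst != Reg)
    return std::nullopt;                          // MI does not define Reg
  if (MI.Src == ZeroReg)
    return DescribedValue{true, ZeroReg, MI.Imm}; // e.g. $a2 = ADDiu $zero, 10
  return DescribedValue{false, MI.Src, MI.Imm};   // offset goes into a DIExpression
}

int main() {
  MiniAddImm MI{/*Dst=*/6, /*Src=*/ZeroReg, /*Imm=*/10}; // $a2 = ADDiu $zero, 10
  if (auto D = describe(MI, 6))
    std::printf("constant=%d base=%u offset=%lld\n", D->IsConstant, D->BaseReg,
                (long long)D->Offset);
  return 0;
}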
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.h b/llvm/lib/Target/Mips/MipsInstrInfo.h
index 092a960b4ba7..c96ed202df30 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.h
@@ -106,12 +106,16 @@ public:
virtual unsigned getOppositeBranchOpc(unsigned Opc) const = 0;
+ virtual bool isBranchWithImm(unsigned Opc) const {
+ return false;
+ }
+
/// Return the number of bytes of code the specified instruction may be.
unsigned getInstSizeInBytes(const MachineInstr &MI) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override {
storeRegToStack(MBB, MBBI, SrcReg, isKill, FrameIndex, RC, TRI, 0);
@@ -119,7 +123,7 @@ public:
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override {
loadRegFromStack(MBB, MBBI, DestReg, FrameIndex, RC, TRI, 0);
@@ -127,14 +131,14 @@ public:
virtual void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const = 0;
virtual void loadRegFromStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const = 0;
@@ -161,6 +165,12 @@ public:
ArrayRef<std::pair<unsigned, const char *>>
getSerializableDirectMachineOperandTargetFlags() const override;
+ Optional<RegImmPair> isAddImmediate(const MachineInstr &MI,
+ Register Reg) const override;
+
+ Optional<ParamLoadedValue> describeLoadedValue(const MachineInstr &MI,
+ Register Reg) const override;
+
protected:
bool isZeroImm(const MachineOperand &op) const;
diff --git a/llvm/lib/Target/Mips/MipsInstrInfo.td b/llvm/lib/Target/Mips/MipsInstrInfo.td
index d9a3ff802708..a3b928870f3f 100644
--- a/llvm/lib/Target/Mips/MipsInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsInstrInfo.td
@@ -156,69 +156,69 @@ def MipsSDR : SDNode<"MipsISD::SDR", SDTStore,
// Mips Instruction Predicate Definitions.
//===----------------------------------------------------------------------===//
def HasMips2 : Predicate<"Subtarget->hasMips2()">,
- AssemblerPredicate<"FeatureMips2">;
+ AssemblerPredicate<(all_of FeatureMips2)>;
def HasMips3_32 : Predicate<"Subtarget->hasMips3_32()">,
- AssemblerPredicate<"FeatureMips3_32">;
+ AssemblerPredicate<(all_of FeatureMips3_32)>;
def HasMips3_32r2 : Predicate<"Subtarget->hasMips3_32r2()">,
- AssemblerPredicate<"FeatureMips3_32r2">;
+ AssemblerPredicate<(all_of FeatureMips3_32r2)>;
def HasMips3 : Predicate<"Subtarget->hasMips3()">,
- AssemblerPredicate<"FeatureMips3">;
+ AssemblerPredicate<(all_of FeatureMips3)>;
def NotMips3 : Predicate<"!Subtarget->hasMips3()">,
- AssemblerPredicate<"!FeatureMips3">;
+ AssemblerPredicate<(all_of (not FeatureMips3))>;
def HasMips4_32 : Predicate<"Subtarget->hasMips4_32()">,
- AssemblerPredicate<"FeatureMips4_32">;
+ AssemblerPredicate<(all_of FeatureMips4_32)>;
def NotMips4_32 : Predicate<"!Subtarget->hasMips4_32()">,
- AssemblerPredicate<"!FeatureMips4_32">;
+ AssemblerPredicate<(all_of (not FeatureMips4_32))>;
def HasMips4_32r2 : Predicate<"Subtarget->hasMips4_32r2()">,
- AssemblerPredicate<"FeatureMips4_32r2">;
+ AssemblerPredicate<(all_of FeatureMips4_32r2)>;
def HasMips5_32r2 : Predicate<"Subtarget->hasMips5_32r2()">,
- AssemblerPredicate<"FeatureMips5_32r2">;
+ AssemblerPredicate<(all_of FeatureMips5_32r2)>;
def HasMips32 : Predicate<"Subtarget->hasMips32()">,
- AssemblerPredicate<"FeatureMips32">;
+ AssemblerPredicate<(all_of FeatureMips32)>;
def HasMips32r2 : Predicate<"Subtarget->hasMips32r2()">,
- AssemblerPredicate<"FeatureMips32r2">;
+ AssemblerPredicate<(all_of FeatureMips32r2)>;
def HasMips32r5 : Predicate<"Subtarget->hasMips32r5()">,
- AssemblerPredicate<"FeatureMips32r5">;
+ AssemblerPredicate<(all_of FeatureMips32r5)>;
def HasMips32r6 : Predicate<"Subtarget->hasMips32r6()">,
- AssemblerPredicate<"FeatureMips32r6">;
+ AssemblerPredicate<(all_of FeatureMips32r6)>;
def NotMips32r6 : Predicate<"!Subtarget->hasMips32r6()">,
- AssemblerPredicate<"!FeatureMips32r6">;
+ AssemblerPredicate<(all_of (not FeatureMips32r6))>;
def IsGP64bit : Predicate<"Subtarget->isGP64bit()">,
- AssemblerPredicate<"FeatureGP64Bit">;
+ AssemblerPredicate<(all_of FeatureGP64Bit)>;
def IsGP32bit : Predicate<"!Subtarget->isGP64bit()">,
- AssemblerPredicate<"!FeatureGP64Bit">;
+ AssemblerPredicate<(all_of (not FeatureGP64Bit))>;
def IsPTR64bit : Predicate<"Subtarget->isABI_N64()">,
- AssemblerPredicate<"FeaturePTR64Bit">;
+ AssemblerPredicate<(all_of FeaturePTR64Bit)>;
def IsPTR32bit : Predicate<"!Subtarget->isABI_N64()">,
- AssemblerPredicate<"!FeaturePTR64Bit">;
+ AssemblerPredicate<(all_of (not FeaturePTR64Bit))>;
def HasMips64 : Predicate<"Subtarget->hasMips64()">,
- AssemblerPredicate<"FeatureMips64">;
+ AssemblerPredicate<(all_of FeatureMips64)>;
def NotMips64 : Predicate<"!Subtarget->hasMips64()">,
- AssemblerPredicate<"!FeatureMips64">;
+ AssemblerPredicate<(all_of (not FeatureMips64))>;
def HasMips64r2 : Predicate<"Subtarget->hasMips64r2()">,
- AssemblerPredicate<"FeatureMips64r2">;
+ AssemblerPredicate<(all_of FeatureMips64r2)>;
def HasMips64r5 : Predicate<"Subtarget->hasMips64r5()">,
- AssemblerPredicate<"FeatureMips64r5">;
+ AssemblerPredicate<(all_of FeatureMips64r5)>;
def HasMips64r6 : Predicate<"Subtarget->hasMips64r6()">,
- AssemblerPredicate<"FeatureMips64r6">;
+ AssemblerPredicate<(all_of FeatureMips64r6)>;
def NotMips64r6 : Predicate<"!Subtarget->hasMips64r6()">,
- AssemblerPredicate<"!FeatureMips64r6">;
+ AssemblerPredicate<(all_of (not FeatureMips64r6))>;
def InMips16Mode : Predicate<"Subtarget->inMips16Mode()">,
- AssemblerPredicate<"FeatureMips16">;
+ AssemblerPredicate<(all_of FeatureMips16)>;
def NotInMips16Mode : Predicate<"!Subtarget->inMips16Mode()">,
- AssemblerPredicate<"!FeatureMips16">;
+ AssemblerPredicate<(all_of (not FeatureMips16))>;
def HasCnMips : Predicate<"Subtarget->hasCnMips()">,
- AssemblerPredicate<"FeatureCnMips">;
+ AssemblerPredicate<(all_of FeatureCnMips)>;
def NotCnMips : Predicate<"!Subtarget->hasCnMips()">,
- AssemblerPredicate<"!FeatureCnMips">;
+ AssemblerPredicate<(all_of (not FeatureCnMips))>;
def HasCnMipsP : Predicate<"Subtarget->hasCnMipsP()">,
- AssemblerPredicate<"FeatureCnMipsP">;
+ AssemblerPredicate<(all_of FeatureCnMipsP)>;
def NotCnMipsP : Predicate<"!Subtarget->hasCnMipsP()">,
- AssemblerPredicate<"!FeatureCnMipsP">;
+ AssemblerPredicate<(all_of (not FeatureCnMipsP))>;
def IsSym32 : Predicate<"Subtarget->hasSym32()">,
- AssemblerPredicate<"FeatureSym32">;
+ AssemblerPredicate<(all_of FeatureSym32)>;
def IsSym64 : Predicate<"!Subtarget->hasSym32()">,
- AssemblerPredicate<"!FeatureSym32">;
+ AssemblerPredicate<(all_of (not FeatureSym32))>;
def IsN64 : Predicate<"Subtarget->isABI_N64()">;
def IsNotN64 : Predicate<"!Subtarget->isABI_N64()">;
def RelocNotPIC : Predicate<"!TM.isPositionIndependent()">;
@@ -227,34 +227,34 @@ def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
def UseAbs : Predicate<"Subtarget->inAbs2008Mode() ||"
"TM.Options.NoNaNsFPMath">;
def HasStdEnc : Predicate<"Subtarget->hasStandardEncoding()">,
- AssemblerPredicate<"!FeatureMips16">;
+ AssemblerPredicate<(all_of (not FeatureMips16))>;
def NotDSP : Predicate<"!Subtarget->hasDSP()">;
def InMicroMips : Predicate<"Subtarget->inMicroMipsMode()">,
- AssemblerPredicate<"FeatureMicroMips">;
+ AssemblerPredicate<(all_of FeatureMicroMips)>;
def NotInMicroMips : Predicate<"!Subtarget->inMicroMipsMode()">,
- AssemblerPredicate<"!FeatureMicroMips">;
+ AssemblerPredicate<(all_of (not FeatureMicroMips))>;
def IsLE : Predicate<"Subtarget->isLittle()">;
def IsBE : Predicate<"!Subtarget->isLittle()">;
def IsNotNaCl : Predicate<"!Subtarget->isTargetNaCl()">;
-def UseTCCInDIV : AssemblerPredicate<"FeatureUseTCCInDIV">;
+def UseTCCInDIV : AssemblerPredicate<(all_of FeatureUseTCCInDIV)>;
def HasEVA : Predicate<"Subtarget->hasEVA()">,
- AssemblerPredicate<"FeatureEVA">;
+ AssemblerPredicate<(all_of FeatureEVA)>;
def HasMSA : Predicate<"Subtarget->hasMSA()">,
- AssemblerPredicate<"FeatureMSA">;
+ AssemblerPredicate<(all_of FeatureMSA)>;
def HasMadd4 : Predicate<"!Subtarget->disableMadd4()">,
- AssemblerPredicate<"!FeatureMadd4">;
+ AssemblerPredicate<(all_of (not FeatureMadd4))>;
def HasMT : Predicate<"Subtarget->hasMT()">,
- AssemblerPredicate<"FeatureMT">;
+ AssemblerPredicate<(all_of FeatureMT)>;
def UseIndirectJumpsHazard : Predicate<"Subtarget->useIndirectJumpsHazard()">,
- AssemblerPredicate<"FeatureUseIndirectJumpsHazard">;
+ AssemblerPredicate<(all_of FeatureUseIndirectJumpsHazard)>;
def NoIndirectJumpGuards : Predicate<"!Subtarget->useIndirectJumpsHazard()">,
- AssemblerPredicate<"!FeatureUseIndirectJumpsHazard">;
+ AssemblerPredicate<(all_of (not FeatureUseIndirectJumpsHazard))>;
def HasCRC : Predicate<"Subtarget->hasCRC()">,
- AssemblerPredicate<"FeatureCRC">;
+ AssemblerPredicate<(all_of FeatureCRC)>;
def HasVirt : Predicate<"Subtarget->hasVirt()">,
- AssemblerPredicate<"FeatureVirt">;
+ AssemblerPredicate<(all_of FeatureVirt)>;
def HasGINV : Predicate<"Subtarget->hasGINV()">,
- AssemblerPredicate<"FeatureGINV">;
+ AssemblerPredicate<(all_of FeatureGINV)>;
// TODO: Add support for FPOpFusion::Standard
def AllowFPOpFusion : Predicate<"TM.Options.AllowFPOpFusion =="
" FPOpFusion::Fast">;
@@ -498,7 +498,7 @@ class MADD4 {
list<Predicate> AdditionalPredicates = [HasMadd4];
}
-// Classses used for separating expansions that differ based on the ABI in
+// Classes used for separating expansions that differ based on the ABI in
// use.
class ABI_N64 {
list<Predicate> AdditionalPredicates = [IsN64];
@@ -1286,7 +1286,7 @@ def LUiORiPred : PatLeaf<(imm), [{
return isInt<32>(SVal) && (SVal & 0xffff);
}]>;
-// Mips Address Mode! SDNode frameindex could possibily be a match
+// Mips Address Mode! SDNode frameindex could possibly be a match
// since load and store instructions from stack used it.
def addr :
ComplexPattern<iPTR, 2, "selectIntAddr", [frameindex]>;
@@ -1871,10 +1871,11 @@ class MTC3OP<string asmstr, RegisterOperand RO, RegisterOperand RD,
class TrapBase<Instruction RealInst>
: PseudoSE<(outs), (ins), [(trap)], II_TRAP>,
PseudoInstExpansion<(RealInst 0, 0)> {
- let isBarrier = 1;
- let isTerminator = 1;
+ let mayStore = 0;
+ let mayLoad = 0;
+ let hasSideEffects = 1;
+ let isTrap = 1;
let isCodeGenOnly = 1;
- let isCTI = 1;
}
//===----------------------------------------------------------------------===//
@@ -2588,6 +2589,22 @@ def : MipsInstAlias<"seq $rd, $imm",
(SEQIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
NOT_ASE_CNMIPS;
+def SNEMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "sne $rd, $rs, $rt">, NOT_ASE_CNMIPS;
+
+def : MipsInstAlias<"sne $rd, $rs",
+ (SNEMacro GPR32Opnd:$rd, GPR32Opnd:$rd, GPR32Opnd:$rs), 0>,
+ NOT_ASE_CNMIPS;
+
+def SNEIMacro : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32_relaxed:$imm),
+ "sne $rd, $rs, $imm">, NOT_ASE_CNMIPS;
+
+def : MipsInstAlias<"sne $rd, $imm",
+ (SNEIMacro GPR32Opnd:$rd, GPR32Opnd:$rd, simm32:$imm), 0>,
+ NOT_ASE_CNMIPS;
+
def MULImmMacro : MipsAsmPseudoInst<(outs), (ins GPR32Opnd:$rd, GPR32Opnd:$rs,
simm32_relaxed:$imm),
"mul\t$rd, $rs, $imm">,
@@ -2735,6 +2752,34 @@ let AdditionalPredicates = [NotInMicroMips] in {
uimm32_coerced:$imm), 0>,
GPR_32;
+ def SLE : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "sle\t$rd, $rs, $rt">, ISA_MIPS1;
+ def : MipsInstAlias<"sle $rs, $rt",
+ (SLE GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1;
+ def SLEImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, simm32:$imm),
+ "sle\t$rd, $rs, $imm">, GPR_32;
+ def : MipsInstAlias<"sle $rs, $imm", (SLEImm GPR32Opnd:$rs,
+ GPR32Opnd:$rs,
+ simm32:$imm), 0>,
+ GPR_32;
+
+ def SLEU : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, GPR32Opnd:$rt),
+ "sleu\t$rd, $rs, $rt">, ISA_MIPS1;
+ def : MipsInstAlias<"sleu $rs, $rt",
+ (SLEU GPR32Opnd:$rs, GPR32Opnd:$rs, GPR32Opnd:$rt), 0>,
+ ISA_MIPS1;
+ def SLEUImm : MipsAsmPseudoInst<(outs GPR32Opnd:$rd),
+ (ins GPR32Opnd:$rs, uimm32_coerced:$imm),
+ "sleu\t$rd, $rs, $imm">, GPR_32;
+ def : MipsInstAlias<"sleu $rs, $imm", (SLEUImm GPR32Opnd:$rs,
+ GPR32Opnd:$rs,
+ uimm32_coerced:$imm), 0>,
+ GPR_32;
+
def : MipsInstAlias<
"not $rt, $rs",
(NOR GPR32Opnd:$rt, GPR32Opnd:$rs, ZERO), 0>, ISA_MIPS1;
diff --git a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
index 2f4c9d74262e..256fb74c1d6c 100644
--- a/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
+++ b/llvm/lib/Target/Mips/MipsInstructionSelector.cpp
@@ -49,6 +49,12 @@ private:
getRegClassForTypeOnBank(Register Reg, MachineRegisterInfo &MRI) const;
unsigned selectLoadStoreOpCode(MachineInstr &I,
MachineRegisterInfo &MRI) const;
+ bool buildUnalignedStore(MachineInstr &I, unsigned Opc,
+ MachineOperand &BaseAddr, unsigned Offset,
+ MachineMemOperand *MMO) const;
+ bool buildUnalignedLoad(MachineInstr &I, unsigned Opc, Register Dest,
+ MachineOperand &BaseAddr, unsigned Offset,
+ Register TiedDest, MachineMemOperand *MMO) const;
const MipsTargetMachine &TM;
const MipsSubtarget &STI;
@@ -248,6 +254,35 @@ MipsInstructionSelector::selectLoadStoreOpCode(MachineInstr &I,
return Opc;
}
+bool MipsInstructionSelector::buildUnalignedStore(
+ MachineInstr &I, unsigned Opc, MachineOperand &BaseAddr, unsigned Offset,
+ MachineMemOperand *MMO) const {
+ MachineInstr *NewInst =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
+ .add(I.getOperand(0))
+ .add(BaseAddr)
+ .addImm(Offset)
+ .addMemOperand(MMO);
+ if (!constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI))
+ return false;
+ return true;
+}
+
+bool MipsInstructionSelector::buildUnalignedLoad(
+ MachineInstr &I, unsigned Opc, Register Dest, MachineOperand &BaseAddr,
+ unsigned Offset, Register TiedDest, MachineMemOperand *MMO) const {
+ MachineInstr *NewInst =
+ BuildMI(*I.getParent(), I, I.getDebugLoc(), TII.get(Opc))
+ .addDef(Dest)
+ .add(BaseAddr)
+ .addImm(Offset)
+ .addUse(TiedDest)
+ .addMemOperand(*I.memoperands_begin());
+ if (!constrainSelectedInstRegOperands(*NewInst, TII, TRI, RBI))
+ return false;
+ return true;
+}
+
bool MipsInstructionSelector::select(MachineInstr &I) {
MachineBasicBlock &MBB = *I.getParent();
@@ -358,7 +393,7 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
.addUse(DestAddress)
.addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_ABS_LO)
.addMemOperand(MF.getMachineMemOperand(
- MachinePointerInfo(), MachineMemOperand::MOLoad, 4, 4));
+ MachinePointerInfo(), MachineMemOperand::MOLoad, 4, Align(4)));
if (!constrainSelectedInstRegOperands(*LW, TII, TRI, RBI))
return false;
@@ -369,7 +404,7 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
.addDef(Dest)
.addUse(DestTmp)
.addUse(MF.getInfo<MipsFunctionInfo>()
- ->getGlobalBaseRegForGlobalISel());
+ ->getGlobalBaseRegForGlobalISel(MF));
if (!constrainSelectedInstRegOperands(*ADDu, TII, TRI, RBI))
return false;
}
@@ -404,10 +439,7 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
case G_LOAD:
case G_ZEXTLOAD:
case G_SEXTLOAD: {
- const unsigned NewOpc = selectLoadStoreOpCode(I, MRI);
- if (NewOpc == I.getOpcode())
- return false;
-
+ auto MMO = *I.memoperands_begin();
MachineOperand BaseAddr = I.getOperand(1);
int64_t SignedOffset = 0;
// Try to fold load/store + G_PTR_ADD + G_CONSTANT
@@ -429,11 +461,48 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
}
}
+ // Unaligned memory access
+ if (MMO->getAlign() < MMO->getSize() &&
+ !STI.systemSupportsUnalignedAccess()) {
+ if (MMO->getSize() != 4 || !isRegInGprb(I.getOperand(0).getReg(), MRI))
+ return false;
+
+ if (I.getOpcode() == G_STORE) {
+ if (!buildUnalignedStore(I, Mips::SWL, BaseAddr, SignedOffset + 3, MMO))
+ return false;
+ if (!buildUnalignedStore(I, Mips::SWR, BaseAddr, SignedOffset, MMO))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
+
+ if (I.getOpcode() == G_LOAD) {
+ Register ImplDef = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::IMPLICIT_DEF))
+ .addDef(ImplDef);
+ Register Tmp = MRI.createVirtualRegister(&Mips::GPR32RegClass);
+ if (!buildUnalignedLoad(I, Mips::LWL, Tmp, BaseAddr, SignedOffset + 3,
+ ImplDef, MMO))
+ return false;
+ if (!buildUnalignedLoad(I, Mips::LWR, I.getOperand(0).getReg(),
+ BaseAddr, SignedOffset, Tmp, MMO))
+ return false;
+ I.eraseFromParent();
+ return true;
+ }
+
+ return false;
+ }
+
+ const unsigned NewOpc = selectLoadStoreOpCode(I, MRI);
+ if (NewOpc == I.getOpcode())
+ return false;
+
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(NewOpc))
.add(I.getOperand(0))
.add(BaseAddr)
.addImm(SignedOffset)
- .addMemOperand(*I.memoperands_begin());
+ .addMemOperand(MMO);
break;
}
case G_UDIV:
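For the unaligned case just added: a minimal standalone sketch, assuming only what the hunk above shows, of which strategy the selector picks for a G_LOAD/G_STORE whose memory operand is less aligned than its size. Only the 4-byte gprb case is handled on cores without hardware unaligned access, as an LWL/LWR (load) or SWL/SWR (store) pair at offsets Offset+3 and Offset; the names below are invented for illustration and are not the LLVM selector API.

#include <cstdio>

enum class Strategy { SingleOp, LwlLwrPair, SwlSwrPair, NotSelected };

static Strategy pickStrategy(unsigned SizeInBytes, unsigned AlignInBytes,
                             bool HwUnaligned, bool IsStore, bool OnGprBank) {
  if (AlignInBytes >= SizeInBytes || HwUnaligned)
    return Strategy::SingleOp;        // ordinary single load/store opcode
  if (SizeInBytes != 4 || !OnGprBank)
    return Strategy::NotSelected;     // the selector returns false here
  return IsStore ? Strategy::SwlSwrPair : Strategy::LwlLwrPair;
}

int main() {
  // 4-byte load, 1-byte aligned, MIPS32r5-style core, value on the GPR bank.
  Strategy S = pickStrategy(4, 1, /*HwUnaligned=*/false, /*IsStore=*/false,
                            /*OnGprBank=*/true);
  std::printf("%d\n", static_cast<int>(S)); // prints 1 (LwlLwrPair)
  return 0;
}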
@@ -472,6 +541,36 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
.add(I.getOperand(3));
break;
}
+ case G_UNMERGE_VALUES: {
+ if (I.getNumOperands() != 3)
+ return false;
+ Register Src = I.getOperand(2).getReg();
+ Register Lo = I.getOperand(0).getReg();
+ Register Hi = I.getOperand(1).getReg();
+ if (!isRegInFprb(Src, MRI) ||
+ !(isRegInGprb(Lo, MRI) && isRegInGprb(Hi, MRI)))
+ return false;
+
+ unsigned Opcode =
+ STI.isFP64bit() ? Mips::ExtractElementF64_64 : Mips::ExtractElementF64;
+
+ MachineInstr *ExtractLo = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode))
+ .addDef(Lo)
+ .addUse(Src)
+ .addImm(0);
+ if (!constrainSelectedInstRegOperands(*ExtractLo, TII, TRI, RBI))
+ return false;
+
+ MachineInstr *ExtractHi = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Opcode))
+ .addDef(Hi)
+ .addUse(Src)
+ .addImm(1);
+ if (!constrainSelectedInstRegOperands(*ExtractHi, TII, TRI, RBI))
+ return false;
+
+ I.eraseFromParent();
+ return true;
+ }
case G_IMPLICIT_DEF: {
Register Dst = I.getOperand(0).getReg();
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::IMPLICIT_DEF))
@@ -570,7 +669,7 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
MachineInstr *LWGOT = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW))
.addDef(I.getOperand(0).getReg())
.addReg(MF.getInfo<MipsFunctionInfo>()
- ->getGlobalBaseRegForGlobalISel())
+ ->getGlobalBaseRegForGlobalISel(MF))
.addGlobalAddress(GVal);
// Global Values that don't have local linkage are handled differently
// when they are part of call sequence. MipsCallLowering::lowerCall
@@ -582,7 +681,7 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
LWGOT->getOperand(2).setTargetFlags(MipsII::MO_GOT);
LWGOT->addMemOperand(
MF, MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF),
- MachineMemOperand::MOLoad, 4, 4));
+ MachineMemOperand::MOLoad, 4, Align(4)));
if (!constrainSelectedInstRegOperands(*LWGOT, TII, TRI, RBI))
return false;
@@ -626,11 +725,11 @@ bool MipsInstructionSelector::select(MachineInstr &I) {
MI = BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LW))
.addDef(I.getOperand(0).getReg())
.addReg(MF.getInfo<MipsFunctionInfo>()
- ->getGlobalBaseRegForGlobalISel())
+ ->getGlobalBaseRegForGlobalISel(MF))
.addJumpTableIndex(I.getOperand(1).getIndex(), MipsII::MO_GOT)
- .addMemOperand(
- MF.getMachineMemOperand(MachinePointerInfo::getGOT(MF),
- MachineMemOperand::MOLoad, 4, 4));
+ .addMemOperand(MF.getMachineMemOperand(
+ MachinePointerInfo::getGOT(MF), MachineMemOperand::MOLoad, 4,
+ Align(4)));
} else {
MI =
BuildMI(MBB, I, I.getDebugLoc(), TII.get(Mips::LUi))
diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
index 9645aa24dc05..b489c8137769 100644
--- a/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.cpp
@@ -21,22 +21,38 @@ struct TypesAndMemOps {
LLT ValTy;
LLT PtrTy;
unsigned MemSize;
- bool MustBeNaturallyAligned;
+ bool SystemSupportsUnalignedAccess;
};
+// Assumes power of 2 memory size. Subtargets that have only naturally-aligned
+// memory access need to perform additional legalization here.
+static bool isUnalignedMemoryAccess(uint64_t MemSize, uint64_t AlignInBits) {
+ assert(isPowerOf2_64(MemSize) && "Expected power of 2 memory size");
+ assert(isPowerOf2_64(AlignInBits) && "Expected power of 2 align");
+ if (MemSize > AlignInBits)
+ return true;
+ return false;
+}
+
static bool
CheckTy0Ty1MemSizeAlign(const LegalityQuery &Query,
std::initializer_list<TypesAndMemOps> SupportedValues) {
+ unsigned QueryMemSize = Query.MMODescrs[0].SizeInBits;
+
+ // Non power of two memory access is never legal.
+ if (!isPowerOf2_64(QueryMemSize))
+ return false;
+
for (auto &Val : SupportedValues) {
if (Val.ValTy != Query.Types[0])
continue;
if (Val.PtrTy != Query.Types[1])
continue;
- if (Val.MemSize != Query.MMODescrs[0].SizeInBits)
- continue;
- if (Val.MustBeNaturallyAligned &&
- Query.MMODescrs[0].SizeInBits % Query.MMODescrs[0].AlignInBits != 0)
+ if (Val.MemSize != QueryMemSize)
continue;
+ if (!Val.SystemSupportsUnalignedAccess &&
+        isUnalignedMemoryAccess(QueryMemSize, Query.MMODescrs[0].AlignInBits))
+ return false;
return true;
}
return false;
@@ -79,20 +95,55 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.legalFor({s32})
.maxScalar(0, s32);
+ // MIPS32r6 does not have alignment restrictions for memory access.
+ // For MIPS32r5 and older, memory accesses must be naturally aligned, i.e.
+ // aligned to at least their own size. There is, however, a two-instruction
+ // combination (lwr/lwl and swl/swr) that performs a 4 byte unaligned access,
+ // so 4 byte loads and stores are legal and will use NoAlignRequirements.
+ bool NoAlignRequirements = true;
+
getActionDefinitionsBuilder({G_LOAD, G_STORE})
.legalIf([=, &ST](const LegalityQuery &Query) {
- if (CheckTy0Ty1MemSizeAlign(Query, {{s32, p0, 8, ST.hasMips32r6()},
- {s32, p0, 16, ST.hasMips32r6()},
- {s32, p0, 32, ST.hasMips32r6()},
- {p0, p0, 32, ST.hasMips32r6()},
- {s64, p0, 64, ST.hasMips32r6()}}))
+ if (CheckTy0Ty1MemSizeAlign(
+ Query, {{s32, p0, 8, NoAlignRequirements},
+ {s32, p0, 16, ST.systemSupportsUnalignedAccess()},
+ {s32, p0, 32, NoAlignRequirements},
+ {p0, p0, 32, NoAlignRequirements},
+ {s64, p0, 64, ST.systemSupportsUnalignedAccess()}}))
+ return true;
+ if (ST.hasMSA() && CheckTy0Ty1MemSizeAlign(
+ Query, {{v16s8, p0, 128, NoAlignRequirements},
+ {v8s16, p0, 128, NoAlignRequirements},
+ {v4s32, p0, 128, NoAlignRequirements},
+ {v2s64, p0, 128, NoAlignRequirements}}))
+ return true;
+ return false;
+ })
+ // Custom lower scalar memory access, up to 8 bytes, for:
+ // - non-power-of-2 MemSizes
+ // - unaligned 2 or 8 byte MemSizes for MIPS32r5 and older
+ .customIf([=, &ST](const LegalityQuery &Query) {
+ if (!Query.Types[0].isScalar() || Query.Types[1] != p0 ||
+ Query.Types[0] == s1)
+ return false;
+
+ unsigned Size = Query.Types[0].getSizeInBits();
+ unsigned QueryMemSize = Query.MMODescrs[0].SizeInBits;
+ assert(QueryMemSize <= Size && "Scalar can't hold MemSize");
+
+ if (Size > 64 || QueryMemSize > 64)
+ return false;
+
+ if (!isPowerOf2_64(Query.MMODescrs[0].SizeInBits))
return true;
- if (ST.hasMSA() &&
- CheckTy0Ty1MemSizeAlign(Query, {{v16s8, p0, 128, false},
- {v8s16, p0, 128, false},
- {v4s32, p0, 128, false},
- {v2s64, p0, 128, false}}))
+
+ if (!ST.systemSupportsUnalignedAccess() &&
+        isUnalignedMemoryAccess(QueryMemSize,
+ Query.MMODescrs[0].AlignInBits)) {
+ assert(QueryMemSize != 32 && "4 byte load and store are legal");
return true;
+ }
+
return false;
})
.minScalar(0, s32);
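The legalIf rule above encodes the alignment policy described in the comment. A minimal standalone sketch of that policy for the scalar entries only, ignoring the exact value-type pairings; the helper name is invented and this is not the LegalizerInfo API:

#include <cstdio>

// On MIPS32r6 every listed access is legal regardless of alignment. On older
// cores 1 and 4 byte accesses are always legal (4 byte unaligned access is
// covered by lwl/lwr and swl/swr), while 2 and 8 byte accesses must be
// naturally aligned; anything else falls through to the custom rule.
static bool isDirectlyLegal(unsigned MemBits, unsigned AlignBits,
                            bool SystemSupportsUnaligned) {
  bool Unaligned = MemBits > AlignBits; // both assumed to be powers of two
  switch (MemBits) {
  case 8:
  case 32:
    return true;                        // NoAlignRequirements entries
  case 16:
  case 64:
    return SystemSupportsUnaligned || !Unaligned;
  default:
    return false;                       // left to the custom rule below
  }
}

int main() {
  // An 8 byte load aligned to 4 bytes: legal on r6, custom-lowered otherwise.
  std::printf("r6: %d, pre-r6: %d\n", isDirectlyLegal(64, 32, true),
              isDirectlyLegal(64, 32, false));
  return 0;
}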
@@ -111,7 +162,7 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
{s32, p0, 16, 8}})
.clampScalar(0, s32, s32);
- getActionDefinitionsBuilder({G_ZEXT, G_SEXT})
+ getActionDefinitionsBuilder({G_ZEXT, G_SEXT, G_ANYEXT})
.legalIf([](const LegalityQuery &Query) { return false; })
.maxScalar(0, s32);
@@ -202,6 +253,25 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
.lowerFor({s32})
.maxScalar(0, s32);
+ getActionDefinitionsBuilder(G_CTLZ)
+ .legalFor({{s32, s32}})
+ .maxScalar(0, s32)
+ .maxScalar(1, s32);
+ getActionDefinitionsBuilder(G_CTLZ_ZERO_UNDEF)
+ .lowerFor({{s32, s32}});
+
+ getActionDefinitionsBuilder(G_CTTZ)
+ .lowerFor({{s32, s32}})
+ .maxScalar(0, s32)
+ .maxScalar(1, s32);
+ getActionDefinitionsBuilder(G_CTTZ_ZERO_UNDEF)
+ .lowerFor({{s32, s32}, {s64, s64}});
+
+ getActionDefinitionsBuilder(G_CTPOP)
+ .lowerFor({{s32, s32}})
+ .clampScalar(0, s32, s32)
+ .clampScalar(1, s32, s32);
+
// FP instructions
getActionDefinitionsBuilder(G_FCONSTANT)
.legalFor({s32, s64});
@@ -256,20 +326,98 @@ MipsLegalizerInfo::MipsLegalizerInfo(const MipsSubtarget &ST) {
verify(*ST.getInstrInfo());
}
-bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const {
-
+bool MipsLegalizerInfo::legalizeCustom(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
using namespace TargetOpcode;
- MIRBuilder.setInstr(MI);
- const MipsSubtarget &STI =
- static_cast<const MipsSubtarget &>(MIRBuilder.getMF().getSubtarget());
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
+
const LLT s32 = LLT::scalar(32);
const LLT s64 = LLT::scalar(64);
switch (MI.getOpcode()) {
+ case G_LOAD:
+ case G_STORE: {
+ unsigned MemSize = (**MI.memoperands_begin()).getSize();
+ Register Val = MI.getOperand(0).getReg();
+ unsigned Size = MRI.getType(Val).getSizeInBits();
+
+ MachineMemOperand *MMOBase = *MI.memoperands_begin();
+
+ assert(MemSize <= 8 && "MemSize is too large");
+ assert(Size <= 64 && "Scalar size is too large");
+
+ // Split MemSize into two, where P2HalfMemSize is the largest power of two
+ // smaller than MemSize, e.g. 8 = 4 + 4, 6 = 4 + 2, 3 = 2 + 1.
+ unsigned P2HalfMemSize, RemMemSize;
+ if (isPowerOf2_64(MemSize)) {
+ P2HalfMemSize = RemMemSize = MemSize / 2;
+ } else {
+ P2HalfMemSize = 1 << Log2_32(MemSize);
+ RemMemSize = MemSize - P2HalfMemSize;
+ }
+
+ Register BaseAddr = MI.getOperand(1).getReg();
+ LLT PtrTy = MRI.getType(BaseAddr);
+ MachineFunction &MF = MIRBuilder.getMF();
+
+ auto P2HalfMemOp = MF.getMachineMemOperand(MMOBase, 0, P2HalfMemSize);
+ auto RemMemOp = MF.getMachineMemOperand(MMOBase, P2HalfMemSize, RemMemSize);
+
+ if (MI.getOpcode() == G_STORE) {
+ // Widen Val to s32 or s64 in order to create legal G_LSHR or G_UNMERGE.
+ if (Size < 32)
+ Val = MIRBuilder.buildAnyExt(s32, Val).getReg(0);
+ if (Size > 32 && Size < 64)
+ Val = MIRBuilder.buildAnyExt(s64, Val).getReg(0);
+
+ auto C_P2HalfMemSize = MIRBuilder.buildConstant(s32, P2HalfMemSize);
+ auto Addr = MIRBuilder.buildPtrAdd(PtrTy, BaseAddr, C_P2HalfMemSize);
+
+ if (MI.getOpcode() == G_STORE && MemSize <= 4) {
+ MIRBuilder.buildStore(Val, BaseAddr, *P2HalfMemOp);
+ auto C_P2Half_InBits = MIRBuilder.buildConstant(s32, P2HalfMemSize * 8);
+ auto Shift = MIRBuilder.buildLShr(s32, Val, C_P2Half_InBits);
+ MIRBuilder.buildStore(Shift, Addr, *RemMemOp);
+ } else {
+ auto Unmerge = MIRBuilder.buildUnmerge(s32, Val);
+ MIRBuilder.buildStore(Unmerge.getReg(0), BaseAddr, *P2HalfMemOp);
+ MIRBuilder.buildStore(Unmerge.getReg(1), Addr, *RemMemOp);
+ }
+ }
+
+ if (MI.getOpcode() == G_LOAD) {
+
+ if (MemSize <= 4) {
+ // This is an any-extending load; use 4 byte lwr/lwl.
+ auto *Load4MMO = MF.getMachineMemOperand(MMOBase, 0, 4);
+
+ if (Size == 32)
+ MIRBuilder.buildLoad(Val, BaseAddr, *Load4MMO);
+ else {
+ auto Load = MIRBuilder.buildLoad(s32, BaseAddr, *Load4MMO);
+ MIRBuilder.buildTrunc(Val, Load.getReg(0));
+ }
+
+ } else {
+ auto C_P2HalfMemSize = MIRBuilder.buildConstant(s32, P2HalfMemSize);
+ auto Addr = MIRBuilder.buildPtrAdd(PtrTy, BaseAddr, C_P2HalfMemSize);
+
+ auto Load_P2Half = MIRBuilder.buildLoad(s32, BaseAddr, *P2HalfMemOp);
+ auto Load_Rem = MIRBuilder.buildLoad(s32, Addr, *RemMemOp);
+
+ if (Size == 64)
+ MIRBuilder.buildMerge(Val, {Load_P2Half, Load_Rem});
+ else {
+ auto Merge = MIRBuilder.buildMerge(s64, {Load_P2Half, Load_Rem});
+ MIRBuilder.buildTrunc(Val, Merge);
+ }
+ }
+ }
+ MI.eraseFromParent();
+ break;
+ }
case G_UITOFP: {
Register Dst = MI.getOperand(0).getReg();
Register Src = MI.getOperand(1).getReg();
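The custom G_LOAD/G_STORE lowering above splits a MemSize of at most 8 bytes into P2HalfMemSize plus RemMemSize and emits two smaller accesses. A standalone sketch of just that arithmetic, assuming nothing beyond the comment in the hunk; the helper name is invented and this does not use LLVM's MathExtras:

#include <cassert>
#include <cstdio>

// For powers of two the split is an even halving; otherwise P2Half is the
// largest power of two below MemSize, e.g. 8 = 4 + 4, 6 = 4 + 2, 3 = 2 + 1.
static void splitMemSize(unsigned MemSize, unsigned &P2Half, unsigned &Rem) {
  assert(MemSize >= 2 && MemSize <= 8 && "sketch covers only the sizes above");
  if ((MemSize & (MemSize - 1)) == 0) { // power of two
    P2Half = Rem = MemSize / 2;
  } else {
    P2Half = 1;
    while (P2Half * 2 < MemSize)        // largest power of two below MemSize
      P2Half *= 2;
    Rem = MemSize - P2Half;
  }
}

int main() {
  for (unsigned Size : {2u, 3u, 4u, 5u, 6u, 7u, 8u}) {
    unsigned P2Half, Rem;
    splitMemSize(Size, P2Half, Rem);
    std::printf("%u = %u + %u\n", Size, P2Half, Rem);
  }
  return 0;
}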
@@ -288,11 +436,8 @@ bool MipsLegalizerInfo::legalizeCustom(MachineInstr &MI,
// Next, subtract 2^52 * 0x1.0000000000000 i.e. 0x10000000000000.0 from it.
// Done. Trunc double to float if needed.
- MachineInstrBuilder Bitcast = MIRBuilder.buildInstr(
- STI.isFP64bit() ? Mips::BuildPairF64_64 : Mips::BuildPairF64, {s64},
- {Src, MIRBuilder.buildConstant(s32, UINT32_C(0x43300000))});
- Bitcast.constrainAllUses(MIRBuilder.getTII(), *STI.getRegisterInfo(),
- *STI.getRegBankInfo());
+ auto C_HiMask = MIRBuilder.buildConstant(s32, UINT32_C(0x43300000));
+ auto Bitcast = MIRBuilder.buildMerge(s64, {Src, C_HiMask.getReg(0)});
MachineInstrBuilder TwoP52FP = MIRBuilder.buildFConstant(
s64, BitsToDouble(UINT64_C(0x4330000000000000)));
@@ -352,15 +497,15 @@ static bool MSA2OpIntrinsicToGeneric(MachineInstr &MI, unsigned Opcode,
return true;
}
-bool MipsLegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
+bool MipsLegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
+ MachineRegisterInfo &MRI = *MIRBuilder.getMRI();
const MipsSubtarget &ST =
static_cast<const MipsSubtarget &>(MI.getMF()->getSubtarget());
const MipsInstrInfo &TII = *ST.getInstrInfo();
const MipsRegisterInfo &TRI = *ST.getRegisterInfo();
const RegisterBankInfo &RBI = *ST.getRegBankInfo();
- MIRBuilder.setInstr(MI);
switch (MI.getIntrinsicID()) {
case Intrinsic::memcpy:
@@ -377,14 +522,14 @@ bool MipsLegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
return constrainSelectedInstRegOperands(*Trap, TII, TRI, RBI);
}
case Intrinsic::vacopy: {
- Register Tmp = MRI.createGenericVirtualRegister(LLT::pointer(0, 32));
MachinePointerInfo MPO;
- MIRBuilder.buildLoad(Tmp, MI.getOperand(2),
- *MI.getMF()->getMachineMemOperand(
- MPO, MachineMemOperand::MOLoad, 4, 4));
+ auto Tmp =
+ MIRBuilder.buildLoad(LLT::pointer(0, 32), MI.getOperand(2),
+ *MI.getMF()->getMachineMemOperand(
+ MPO, MachineMemOperand::MOLoad, 4, Align(4)));
MIRBuilder.buildStore(Tmp, MI.getOperand(1),
*MI.getMF()->getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, 4, 4));
+ MPO, MachineMemOperand::MOStore, 4, Align(4)));
MI.eraseFromParent();
return true;
}
diff --git a/llvm/lib/Target/Mips/MipsLegalizerInfo.h b/llvm/lib/Target/Mips/MipsLegalizerInfo.h
index 9696c262b2db..05027b718a85 100644
--- a/llvm/lib/Target/Mips/MipsLegalizerInfo.h
+++ b/llvm/lib/Target/Mips/MipsLegalizerInfo.h
@@ -25,12 +25,10 @@ class MipsLegalizerInfo : public LegalizerInfo {
public:
MipsLegalizerInfo(const MipsSubtarget &ST);
- bool legalizeCustom(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder,
- GISelChangeObserver &Observer) const override;
+ bool legalizeCustom(LegalizerHelper &Helper, MachineInstr &MI) const override;
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
};
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/Mips/MipsMSAInstrInfo.td b/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
index 0fef518c240e..3e32574596ca 100644
--- a/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
+++ b/llvm/lib/Target/Mips/MipsMSAInstrInfo.td
@@ -2339,6 +2339,16 @@ class LDI_H_DESC : MSA_I10_LDI_DESC_BASE<"ldi.h", MSA128HOpnd>;
class LDI_W_DESC : MSA_I10_LDI_DESC_BASE<"ldi.w", MSA128WOpnd>;
class LDI_D_DESC : MSA_I10_LDI_DESC_BASE<"ldi.d", MSA128DOpnd>;
+class MSA_LOAD_PSEUDO_BASE<SDPatternOperator intrinsic, RegisterOperand RO> :
+ PseudoSE<(outs RO:$dst), (ins PtrRC:$ptr, GPR32:$imm),
+ [(set RO:$dst, (intrinsic iPTR:$ptr, GPR32:$imm))]> {
+ let hasNoSchedulingInfo = 1;
+ let usesCustomInserter = 1;
+}
+
+def LDR_D : MSA_LOAD_PSEUDO_BASE<int_mips_ldr_d, MSA128DOpnd>;
+def LDR_W : MSA_LOAD_PSEUDO_BASE<int_mips_ldr_w, MSA128WOpnd>;
+
class LSA_DESC_BASE<string instr_asm, RegisterOperand RORD,
InstrItinClass itin = NoItinerary> {
dag OutOperandList = (outs RORD:$rd);
@@ -2671,6 +2681,16 @@ class ST_W_DESC : ST_DESC_BASE<"st.w", store, v4i32, MSA128WOpnd,
class ST_D_DESC : ST_DESC_BASE<"st.d", store, v2i64, MSA128DOpnd,
mem_simm10_lsl3, addrimm10lsl3>;
+class MSA_STORE_PSEUDO_BASE<SDPatternOperator intrinsic, RegisterOperand RO> :
+ PseudoSE<(outs), (ins RO:$dst, PtrRC:$ptr, GPR32:$imm),
+ [(intrinsic RO:$dst, iPTR:$ptr, GPR32:$imm)]> {
+ let hasNoSchedulingInfo = 1;
+ let usesCustomInserter = 1;
+}
+
+def STR_D : MSA_STORE_PSEUDO_BASE<int_mips_str_d, MSA128DOpnd>;
+def STR_W : MSA_STORE_PSEUDO_BASE<int_mips_str_w, MSA128WOpnd>;
+
class SUBS_S_B_DESC : MSA_3R_DESC_BASE<"subs_s.b", int_mips_subs_s_b,
MSA128BOpnd>;
class SUBS_S_H_DESC : MSA_3R_DESC_BASE<"subs_s.h", int_mips_subs_s_h,
diff --git a/llvm/lib/Target/Mips/MipsMachineFunction.cpp b/llvm/lib/Target/Mips/MipsMachineFunction.cpp
index 85b20fc58231..a7a2be30f58a 100644
--- a/llvm/lib/Target/Mips/MipsMachineFunction.cpp
+++ b/llvm/lib/Target/Mips/MipsMachineFunction.cpp
@@ -44,22 +44,22 @@ static const TargetRegisterClass &getGlobalBaseRegClass(MachineFunction &MF) {
return Mips::GPR32RegClass;
}
-Register MipsFunctionInfo::getGlobalBaseReg() {
+Register MipsFunctionInfo::getGlobalBaseReg(MachineFunction &MF) {
if (!GlobalBaseReg)
GlobalBaseReg =
MF.getRegInfo().createVirtualRegister(&getGlobalBaseRegClass(MF));
return GlobalBaseReg;
}
-Register MipsFunctionInfo::getGlobalBaseRegForGlobalISel() {
+Register MipsFunctionInfo::getGlobalBaseRegForGlobalISel(MachineFunction &MF) {
if (!GlobalBaseReg) {
- getGlobalBaseReg();
- initGlobalBaseReg();
+ getGlobalBaseReg(MF);
+ initGlobalBaseReg(MF);
}
return GlobalBaseReg;
}
-void MipsFunctionInfo::initGlobalBaseReg() {
+void MipsFunctionInfo::initGlobalBaseReg(MachineFunction &MF) {
if (!GlobalBaseReg)
return;
@@ -68,14 +68,13 @@ void MipsFunctionInfo::initGlobalBaseReg() {
MachineRegisterInfo &RegInfo = MF.getRegInfo();
const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo();
DebugLoc DL;
- unsigned V0, V1;
const TargetRegisterClass *RC;
const MipsABIInfo &ABI =
static_cast<const MipsTargetMachine &>(MF.getTarget()).getABI();
RC = (ABI.IsN64()) ? &Mips::GPR64RegClass : &Mips::GPR32RegClass;
- V0 = RegInfo.createVirtualRegister(RC);
- V1 = RegInfo.createVirtualRegister(RC);
+ Register V0 = RegInfo.createVirtualRegister(RC);
+ Register V1 = RegInfo.createVirtualRegister(RC);
if (ABI.IsN64()) {
MF.getRegInfo().addLiveIn(Mips::T9_64);
@@ -147,7 +146,7 @@ void MipsFunctionInfo::initGlobalBaseReg() {
.addReg(Mips::V0).addReg(Mips::T9);
}
-void MipsFunctionInfo::createEhDataRegsFI() {
+void MipsFunctionInfo::createEhDataRegsFI(MachineFunction &MF) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
for (int I = 0; I < 4; ++I) {
const TargetRegisterClass &RC =
@@ -155,12 +154,12 @@ void MipsFunctionInfo::createEhDataRegsFI() {
? Mips::GPR64RegClass
: Mips::GPR32RegClass;
- EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(TRI.getSpillSize(RC),
- TRI.getSpillAlignment(RC), false);
+ EhDataRegFI[I] = MF.getFrameInfo().CreateStackObject(
+ TRI.getSpillSize(RC), TRI.getSpillAlign(RC), false);
}
}
-void MipsFunctionInfo::createISRRegFI() {
+void MipsFunctionInfo::createISRRegFI(MachineFunction &MF) {
// ISRs require spill slots for Status & ErrorPC Coprocessor 0 registers.
// The current implementation only supports Mips32r2+ not Mips64rX. Status
// is always 32 bits, ErrorPC is 32 or 64 bits dependent on architecture,
@@ -170,7 +169,7 @@ void MipsFunctionInfo::createISRRegFI() {
for (int I = 0; I < 2; ++I)
ISRDataRegFI[I] = MF.getFrameInfo().CreateStackObject(
- TRI.getSpillSize(RC), TRI.getSpillAlignment(RC), false);
+ TRI.getSpillSize(RC), TRI.getSpillAlign(RC), false);
}
bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
@@ -181,19 +180,22 @@ bool MipsFunctionInfo::isEhDataRegFI(int FI) const {
bool MipsFunctionInfo::isISRRegFI(int FI) const {
return IsISR && (FI == ISRDataRegFI[0] || FI == ISRDataRegFI[1]);
}
-MachinePointerInfo MipsFunctionInfo::callPtrInfo(const char *ES) {
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(MachineFunction &MF,
+ const char *ES) {
return MachinePointerInfo(MF.getPSVManager().getExternalSymbolCallEntry(ES));
}
-MachinePointerInfo MipsFunctionInfo::callPtrInfo(const GlobalValue *GV) {
+MachinePointerInfo MipsFunctionInfo::callPtrInfo(MachineFunction &MF,
+ const GlobalValue *GV) {
return MachinePointerInfo(MF.getPSVManager().getGlobalValueCallEntry(GV));
}
-int MipsFunctionInfo::getMoveF64ViaSpillFI(const TargetRegisterClass *RC) {
+int MipsFunctionInfo::getMoveF64ViaSpillFI(MachineFunction &MF,
+ const TargetRegisterClass *RC) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
if (MoveF64ViaSpillFI == -1) {
MoveF64ViaSpillFI = MF.getFrameInfo().CreateStackObject(
- TRI.getSpillSize(*RC), TRI.getSpillAlignment(*RC), false);
+ TRI.getSpillSize(*RC), TRI.getSpillAlign(*RC), false);
}
return MoveF64ViaSpillFI;
}
diff --git a/llvm/lib/Target/Mips/MipsMachineFunction.h b/llvm/lib/Target/Mips/MipsMachineFunction.h
index aaa1e0e18441..786d210e2aaa 100644
--- a/llvm/lib/Target/Mips/MipsMachineFunction.h
+++ b/llvm/lib/Target/Mips/MipsMachineFunction.h
@@ -24,7 +24,7 @@ namespace llvm {
/// Mips target-specific information for each MachineFunction.
class MipsFunctionInfo : public MachineFunctionInfo {
public:
- MipsFunctionInfo(MachineFunction &MF) : MF(MF) {}
+ MipsFunctionInfo(MachineFunction &MF) {}
~MipsFunctionInfo() override;
@@ -32,12 +32,12 @@ public:
void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
bool globalBaseRegSet() const;
- Register getGlobalBaseReg();
- Register getGlobalBaseRegForGlobalISel();
+ Register getGlobalBaseReg(MachineFunction &MF);
+ Register getGlobalBaseRegForGlobalISel(MachineFunction &MF);
// Insert instructions to initialize the global base register in the
// first MBB of the function.
- void initGlobalBaseReg();
+ void initGlobalBaseReg(MachineFunction &MF);
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
@@ -53,30 +53,30 @@ public:
bool callsEhReturn() const { return CallsEhReturn; }
void setCallsEhReturn() { CallsEhReturn = true; }
- void createEhDataRegsFI();
+ void createEhDataRegsFI(MachineFunction &MF);
int getEhDataRegFI(unsigned Reg) const { return EhDataRegFI[Reg]; }
bool isEhDataRegFI(int FI) const;
/// Create a MachinePointerInfo that has an ExternalSymbolPseudoSourceValue
/// object representing a GOT entry for an external function.
- MachinePointerInfo callPtrInfo(const char *ES);
+ MachinePointerInfo callPtrInfo(MachineFunction &MF, const char *ES);
// Functions with the "interrupt" attribute require special prologues,
// epilogues and additional spill slots.
bool isISR() const { return IsISR; }
void setISR() { IsISR = true; }
- void createISRRegFI();
- int getISRRegFI(unsigned Reg) const { return ISRDataRegFI[Reg]; }
+ void createISRRegFI(MachineFunction &MF);
+ int getISRRegFI(Register Reg) const { return ISRDataRegFI[Reg]; }
bool isISRRegFI(int FI) const;
/// Create a MachinePointerInfo that has a GlobalValuePseudoSourceValue object
/// representing a GOT entry for a global function.
- MachinePointerInfo callPtrInfo(const GlobalValue *GV);
+ MachinePointerInfo callPtrInfo(MachineFunction &MF, const GlobalValue *GV);
void setSaveS2() { SaveS2 = true; }
bool hasSaveS2() const { return SaveS2; }
- int getMoveF64ViaSpillFI(const TargetRegisterClass *RC);
+ int getMoveF64ViaSpillFI(MachineFunction &MF, const TargetRegisterClass *RC);
std::map<const char *, const Mips16HardFloatInfo::FuncSignature *>
StubsNeeded;
@@ -84,17 +84,15 @@ public:
private:
virtual void anchor();
- MachineFunction& MF;
-
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg = 0;
+ Register SRetReturnReg;
/// GlobalBaseReg - keeps track of the virtual register initialized for
/// use as the global base register. This is used for PIC in some PIC
/// relocation models.
- unsigned GlobalBaseReg = 0;
+ Register GlobalBaseReg;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex = 0;
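The header change above removes the cached MachineFunction& member from MipsFunctionInfo and threads the owning function through each call instead. A minimal sketch of the shape of that refactoring, with invented stand-in types (Owner is not MachineFunction):

#include <cstdio>

// Stand-in for the owning object: hands out stack-slot indices.
struct Owner {
  int NextSlot = 0;
  int createStackObject() { return NextSlot++; }
};

// Before: the per-function info caches a back-reference to its owner.
struct InfoBefore {
  Owner &MF;
  explicit InfoBefore(Owner &MF) : MF(MF) {}
  int createSlot() { return MF.createStackObject(); }
};

// After: the owner is passed to each call that needs it, so the info object
// holds no reference that must outlive or track the owner.
struct InfoAfter {
  int createSlot(Owner &MF) { return MF.createStackObject(); }
};

int main() {
  Owner MF;
  InfoAfter Info;
  std::printf("slot %d\n", Info.createSlot(MF));
  return 0;
}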
diff --git a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
index 8bd64ff6cb27..2823d300dc6e 100644
--- a/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
+++ b/llvm/lib/Target/Mips/MipsOptimizePICCall.cpp
@@ -218,8 +218,7 @@ bool OptimizePICCall::runOnMachineFunction(MachineFunction &F) {
MBBI.preVisit(ScopedHT);
Changed |= visitNode(MBBI);
const MachineDomTreeNode *Node = MBBI.getNode();
- const std::vector<MachineDomTreeNode *> &Children = Node->getChildren();
- WorkList.append(Children.begin(), Children.end());
+ WorkList.append(Node->begin(), Node->end());
}
return Changed;
diff --git a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
index f9d93ca29658..310e54b0ea8d 100644
--- a/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
+++ b/llvm/lib/Target/Mips/MipsPreLegalizerCombiner.cpp
@@ -44,9 +44,22 @@ bool MipsPreLegalizerCombinerInfo::combine(GISelChangeObserver &Observer,
return false;
case TargetOpcode::G_LOAD:
case TargetOpcode::G_SEXTLOAD:
- case TargetOpcode::G_ZEXTLOAD:
+ case TargetOpcode::G_ZEXTLOAD: {
+ // Don't attempt to combine non-power-of-2 loads, or unaligned loads when the
+ // subtarget doesn't support them.
+ auto MMO = *MI.memoperands_begin();
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MI.getMF()->getSubtarget());
+ if (!isPowerOf2_64(MMO->getSize()))
+ return false;
+ bool isUnaligned = MMO->getAlign() < MMO->getSize();
+ if (!STI.systemSupportsUnalignedAccess() && isUnaligned)
+ return false;
+
return Helper.tryCombineExtendingLoads(MI);
}
+ }
+
return false;
}
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
index 2a3f5a05dfe0..6325e513f9f8 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.cpp
@@ -132,9 +132,6 @@ static bool isFloatingPointOpcodeUse(unsigned Opc) {
case TargetOpcode::G_FPTOSI:
case TargetOpcode::G_FPTOUI:
case TargetOpcode::G_FCMP:
- case Mips::MFC1:
- case Mips::ExtractElementF64:
- case Mips::ExtractElementF64_64:
return true;
default:
return isFloatingPointOpcode(Opc);
@@ -147,15 +144,25 @@ static bool isFloatingPointOpcodeDef(unsigned Opc) {
switch (Opc) {
case TargetOpcode::G_SITOFP:
case TargetOpcode::G_UITOFP:
- case Mips::MTC1:
- case Mips::BuildPairF64:
- case Mips::BuildPairF64_64:
return true;
default:
return isFloatingPointOpcode(Opc);
}
}
+static bool isGprbTwoInstrUnalignedLoadOrStore(const MachineInstr *MI) {
+ if (MI->getOpcode() == TargetOpcode::G_LOAD ||
+ MI->getOpcode() == TargetOpcode::G_STORE) {
+ auto MMO = *MI->memoperands_begin();
+ const MipsSubtarget &STI =
+ static_cast<const MipsSubtarget &>(MI->getMF()->getSubtarget());
+ if (MMO->getSize() == 4 && (!STI.systemSupportsUnalignedAccess() &&
+ MMO->getAlign() < MMO->getSize()))
+ return true;
+ }
+ return false;
+}
+
static bool isAmbiguous(unsigned Opc) {
switch (Opc) {
case TargetOpcode::G_LOAD:
@@ -163,6 +170,8 @@ static bool isAmbiguous(unsigned Opc) {
case TargetOpcode::G_PHI:
case TargetOpcode::G_SELECT:
case TargetOpcode::G_IMPLICIT_DEF:
+ case TargetOpcode::G_UNMERGE_VALUES:
+ case TargetOpcode::G_MERGE_VALUES:
return true;
default:
return false;
@@ -247,10 +256,17 @@ MipsRegisterBankInfo::AmbiguousRegDefUseContainer::AmbiguousRegDefUseContainer(
if (MI->getOpcode() == TargetOpcode::G_IMPLICIT_DEF)
addDefUses(MI->getOperand(0).getReg(), MRI);
+
+ if (MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
+ addUseDef(MI->getOperand(MI->getNumOperands() - 1).getReg(), MRI);
+
+ if (MI->getOpcode() == TargetOpcode::G_MERGE_VALUES)
+ addDefUses(MI->getOperand(0).getReg(), MRI);
}
bool MipsRegisterBankInfo::TypeInfoForMF::visit(
- const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI) {
+ const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI,
+ InstType &AmbiguousTy) {
assert(isAmbiguous(MI->getOpcode()) && "Visiting non-Ambiguous opcode.\n");
if (wasVisited(MI))
return true; // InstType has already been determined for MI.
@@ -258,18 +274,28 @@ bool MipsRegisterBankInfo::TypeInfoForMF::visit(
startVisit(MI);
AmbiguousRegDefUseContainer DefUseContainer(MI);
+ if (isGprbTwoInstrUnalignedLoadOrStore(MI)) {
+ setTypes(MI, Integer);
+ return true;
+ }
+
+ if (AmbiguousTy == InstType::Ambiguous &&
+ (MI->getOpcode() == TargetOpcode::G_MERGE_VALUES ||
+ MI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES))
+ AmbiguousTy = InstType::AmbiguousWithMergeOrUnmerge;
+
// Visit instructions where MI's DEF operands are USED.
- if (visitAdjacentInstrs(MI, DefUseContainer.getDefUses(), true))
+ if (visitAdjacentInstrs(MI, DefUseContainer.getDefUses(), true, AmbiguousTy))
return true;
// Visit instructions that DEFINE MI's USE operands.
- if (visitAdjacentInstrs(MI, DefUseContainer.getUseDefs(), false))
+ if (visitAdjacentInstrs(MI, DefUseContainer.getUseDefs(), false, AmbiguousTy))
return true;
// All MI's adjacent instructions, are ambiguous.
if (!WaitingForTypeOfMI) {
// This is chain of ambiguous instructions.
- setTypes(MI, InstType::Ambiguous);
+ setTypes(MI, AmbiguousTy);
return true;
}
// Excluding WaitingForTypeOfMI, MI is either connected to chains of ambiguous
@@ -286,7 +312,7 @@ bool MipsRegisterBankInfo::TypeInfoForMF::visit(
bool MipsRegisterBankInfo::TypeInfoForMF::visitAdjacentInstrs(
const MachineInstr *MI, SmallVectorImpl<MachineInstr *> &AdjacentInstrs,
- bool isDefUse) {
+ bool isDefUse, InstType &AmbiguousTy) {
while (!AdjacentInstrs.empty()) {
MachineInstr *AdjMI = AdjacentInstrs.pop_back_val();
@@ -303,9 +329,11 @@ bool MipsRegisterBankInfo::TypeInfoForMF::visitAdjacentInstrs(
return true;
}
- // Defaults to integer instruction. Includes G_MERGE_VALUES and
- // G_UNMERGE_VALUES.
- if (!isAmbiguous(AdjMI->getOpcode())) {
+ // Defaults to integer instruction. Small registers in G_MERGE (uses) and
+ // G_UNMERGE (defs) will always be gprb.
+ if ((!isDefUse && AdjMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES) ||
+ (isDefUse && AdjMI->getOpcode() == TargetOpcode::G_MERGE_VALUES) ||
+ !isAmbiguous(AdjMI->getOpcode())) {
setTypes(MI, InstType::Integer);
return true;
}
@@ -314,7 +342,7 @@ bool MipsRegisterBankInfo::TypeInfoForMF::visitAdjacentInstrs(
// adjacent instructions and determine InstType without visiting AdjMI.
if (!wasVisited(AdjMI) ||
getRecordedTypeForInstr(AdjMI) != InstType::NotDetermined) {
- if (visit(AdjMI, MI)) {
+ if (visit(AdjMI, MI, AmbiguousTy)) {
// InstType is successfully determined and is same as for AdjMI.
setTypes(MI, getRecordedTypeForInstr(AdjMI));
return true;
@@ -355,14 +383,15 @@ void MipsRegisterBankInfo::TypeInfoForMF::setTypesAccordingToPhysicalRegister(
MipsRegisterBankInfo::InstType
MipsRegisterBankInfo::TypeInfoForMF::determineInstType(const MachineInstr *MI) {
- visit(MI, nullptr);
+ InstType DefaultAmbiguousType = InstType::Ambiguous;
+ visit(MI, nullptr, DefaultAmbiguousType);
return getRecordedTypeForInstr(MI);
}
void MipsRegisterBankInfo::TypeInfoForMF::cleanupIfNewFunction(
llvm::StringRef FunctionName) {
if (MFName != FunctionName) {
- MFName = FunctionName;
+ MFName = std::string(FunctionName);
WaitingQueues.clear();
Types.clear();
}
@@ -453,6 +482,7 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
case G_BRINDIRECT:
case G_VASTART:
case G_BSWAP:
+ case G_CTLZ:
OperandsMapping = &Mips::ValueMappings[Mips::GPRIdx];
break;
case G_ADD:
@@ -467,7 +497,7 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
OperandsMapping = getMSAMapping(MF);
break;
case G_STORE:
- case G_LOAD:
+ case G_LOAD: {
if (Op0Size == 128) {
OperandsMapping = getOperandsMapping(
{getMSAMapping(MF), &Mips::ValueMappings[Mips::GPRIdx]});
@@ -477,41 +507,56 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
- if (InstTy == InstType::FloatingPoint ||
- (Op0Size == 64 && InstTy == InstType::Ambiguous))
+ if (isFloatingPoint_32or64(InstTy, Op0Size) ||
+ isAmbiguous_64(InstTy, Op0Size)) {
OperandsMapping = getOperandsMapping(
{getFprbMapping(Op0Size), &Mips::ValueMappings[Mips::GPRIdx]});
- else
+ } else {
+ assert((isInteger_32(InstTy, Op0Size) ||
+ isAmbiguous_32(InstTy, Op0Size) ||
+ isAmbiguousWithMergeOrUnmerge_64(InstTy, Op0Size)) &&
+ "Unexpected Inst type");
OperandsMapping =
getOperandsMapping({getGprbOrCustomMapping(Op0Size, MappingID),
&Mips::ValueMappings[Mips::GPRIdx]});
+ }
break;
- case G_PHI:
+ }
+ case G_PHI: {
if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
// PHI is copylike and should have one regbank in mapping for def register.
- if (InstTy == InstType::Integer && Op0Size == 64) {
+ if (isAmbiguousWithMergeOrUnmerge_64(InstTy, Op0Size)) {
OperandsMapping =
getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx]});
+ TI.clearTypeInfoData(&MI);
return getInstructionMapping(CustomMappingID, /*Cost=*/1, OperandsMapping,
/*NumOperands=*/1);
}
+ assert((isInteger_32(InstTy, Op0Size) ||
+ isFloatingPoint_32or64(InstTy, Op0Size) ||
+ isAmbiguous_32or64(InstTy, Op0Size)) &&
+ "Unexpected Inst type");
// Use default handling for PHI, i.e. set reg bank of def operand to match
// register banks of use operands.
return getInstrMappingImpl(MI);
+ }
case G_SELECT: {
if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
-
- if (InstTy == InstType::FloatingPoint ||
- (Op0Size == 64 && InstTy == InstType::Ambiguous)) {
+ if (isFloatingPoint_32or64(InstTy, Op0Size) ||
+ isAmbiguous_64(InstTy, Op0Size)) {
const RegisterBankInfo::ValueMapping *Bank = getFprbMapping(Op0Size);
OperandsMapping = getOperandsMapping(
{Bank, &Mips::ValueMappings[Mips::GPRIdx], Bank, Bank});
break;
} else {
+ assert((isInteger_32(InstTy, Op0Size) ||
+ isAmbiguous_32(InstTy, Op0Size) ||
+ isAmbiguousWithMergeOrUnmerge_64(InstTy, Op0Size)) &&
+ "Unexpected Inst type");
const RegisterBankInfo::ValueMapping *Bank =
getGprbOrCustomMapping(Op0Size, MappingID);
OperandsMapping = getOperandsMapping(
@@ -519,28 +564,45 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
}
break;
}
- case G_IMPLICIT_DEF:
+ case G_IMPLICIT_DEF: {
if (!Op0Ty.isPointer())
InstTy = TI.determineInstType(&MI);
- if (InstTy == InstType::FloatingPoint)
+ if (isFloatingPoint_32or64(InstTy, Op0Size))
OperandsMapping = getFprbMapping(Op0Size);
- else
+ else {
+ assert((isInteger_32(InstTy, Op0Size) ||
+ isAmbiguousWithMergeOrUnmerge_64(InstTy, Op0Size)) &&
+ "Unexpected Inst type");
OperandsMapping = getGprbOrCustomMapping(Op0Size, MappingID);
-
- break;
- case G_UNMERGE_VALUES:
+ }
+ } break;
+ case G_UNMERGE_VALUES: {
+ assert(MI.getNumOperands() == 3 && "Unsupported G_UNMERGE_VALUES");
+ unsigned Op3Size = MRI.getType(MI.getOperand(2).getReg()).getSizeInBits();
+ InstTy = TI.determineInstType(&MI);
+ assert((isAmbiguousWithMergeOrUnmerge_64(InstTy, Op3Size) ||
+ isFloatingPoint_64(InstTy, Op3Size)) &&
+ "Unexpected Inst type");
OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::DPRIdx]});
- MappingID = CustomMappingID;
+ if (isAmbiguousWithMergeOrUnmerge_64(InstTy, Op3Size))
+ MappingID = CustomMappingID;
break;
- case G_MERGE_VALUES:
+ }
+ case G_MERGE_VALUES: {
+ InstTy = TI.determineInstType(&MI);
+ assert((isAmbiguousWithMergeOrUnmerge_64(InstTy, Op0Size) ||
+ isFloatingPoint_64(InstTy, Op0Size)) &&
+ "Unexpected Inst type");
OperandsMapping = getOperandsMapping({&Mips::ValueMappings[Mips::DPRIdx],
&Mips::ValueMappings[Mips::GPRIdx],
&Mips::ValueMappings[Mips::GPRIdx]});
- MappingID = CustomMappingID;
+ if (isAmbiguousWithMergeOrUnmerge_64(InstTy, Op0Size))
+ MappingID = CustomMappingID;
break;
+ }
case G_FADD:
case G_FSUB:
case G_FMUL:
@@ -605,6 +667,8 @@ MipsRegisterBankInfo::getInstrMapping(const MachineInstr &MI) const {
return getInvalidInstructionMapping();
}
+ if (MappingID == CustomMappingID)
+ TI.clearTypeInfoData(&MI);
return getInstructionMapping(MappingID, /*Cost=*/1, OperandsMapping,
NumOperands);
}
@@ -652,10 +716,10 @@ void MipsRegisterBankInfo::setRegBank(MachineInstr &MI,
static void
combineAwayG_UNMERGE_VALUES(LegalizationArtifactCombiner &ArtCombiner,
- MachineInstr &MI) {
+ MachineInstr &MI, GISelObserverWrapper &Observer) {
SmallVector<Register, 4> UpdatedDefs;
SmallVector<MachineInstr *, 2> DeadInstrs;
- ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs);
+ ArtCombiner.tryCombineMerges(MI, DeadInstrs, UpdatedDefs, Observer);
for (MachineInstr *DeadMI : DeadInstrs)
DeadMI->eraseFromParent();
}
@@ -688,7 +752,7 @@ void MipsRegisterBankInfo::applyMappingImpl(
// not be considered for regbank selection. RegBankSelect for mips
// visits/makes corresponding G_MERGE first. Combine them here.
if (NewMI->getOpcode() == TargetOpcode::G_UNMERGE_VALUES)
- combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI);
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, *NewMI, WrapperObserver);
// This G_MERGE will be combined away when its corresponding G_UNMERGE
// gets regBankSelected.
else if (NewMI->getOpcode() == TargetOpcode::G_MERGE_VALUES)
@@ -700,7 +764,7 @@ void MipsRegisterBankInfo::applyMappingImpl(
return;
}
case TargetOpcode::G_UNMERGE_VALUES:
- combineAwayG_UNMERGE_VALUES(ArtCombiner, MI);
+ combineAwayG_UNMERGE_VALUES(ArtCombiner, MI, WrapperObserver);
return;
default:
break;
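A condensed reading of the mapping rules the hunks above implement, written as a standalone C++ sketch; the enum mirrors the patch, but bankFor and its string results are purely illustrative and not part of the LLVM API.

#include <cstdio>

enum class InstType { Integer, FloatingPoint, Ambiguous, AmbiguousWithMergeOrUnmerge };

// Illustrative only: which register bank an operand of the given type and
// size ends up on after this patch.
static const char *bankFor(InstType Ty, unsigned OpSize) {
  if (Ty == InstType::FloatingPoint)
    return "fprb";
  if (Ty == InstType::Ambiguous)
    return OpSize == 64 ? "fprb" : "gprb"; // existing policy: fprb for s64, gprb for s32
  if (Ty == InstType::AmbiguousWithMergeOrUnmerge)
    return "gprb"; // new: such s64 values are narrow-scalared to s32 on gprb
  return "gprb";   // Integer
}

int main() {
  std::printf("Ambiguous s64                   -> %s\n", bankFor(InstType::Ambiguous, 64));
  std::printf("AmbiguousWithMergeOrUnmerge s64 -> %s\n",
              bankFor(InstType::AmbiguousWithMergeOrUnmerge, 64));
  return 0;
}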
diff --git a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
index 66267f8d794d..55eeaf096b14 100644
--- a/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
+++ b/llvm/lib/Target/Mips/MipsRegisterBankInfo.h
@@ -66,9 +66,55 @@ private:
/// Represents moving 'bags of bits' around. Select same bank for entire
/// chain to avoid cross bank copies. Currently we select fprb for s64 and
/// gprb for s32 Ambiguous operands.
- Ambiguous
+ Ambiguous,
+ /// Only used for s64. Unlike Ambiguous s64, AmbiguousWithMergeOrUnmerge s64
+ /// is mapped to gprb (legalized using narrow scalar to s32).
+ AmbiguousWithMergeOrUnmerge
};
+ bool isAmbiguous_64(InstType InstTy, unsigned OpSize) const {
+ if (InstTy == InstType::Ambiguous && OpSize == 64)
+ return true;
+ return false;
+ }
+
+ bool isAmbiguous_32(InstType InstTy, unsigned OpSize) const {
+ if (InstTy == InstType::Ambiguous && OpSize == 32)
+ return true;
+ return false;
+ }
+
+ bool isAmbiguous_32or64(InstType InstTy, unsigned OpSize) const {
+ if (InstTy == InstType::Ambiguous && (OpSize == 32 || OpSize == 64))
+ return true;
+ return false;
+ }
+
+ bool isAmbiguousWithMergeOrUnmerge_64(InstType InstTy,
+ unsigned OpSize) const {
+ if (InstTy == InstType::AmbiguousWithMergeOrUnmerge && OpSize == 64)
+ return true;
+ return false;
+ }
+
+ bool isFloatingPoint_32or64(InstType InstTy, unsigned OpSize) const {
+ if (InstTy == InstType::FloatingPoint && (OpSize == 32 || OpSize == 64))
+ return true;
+ return false;
+ }
+
+ bool isFloatingPoint_64(InstType InstTy, unsigned OpSize) const {
+ if (InstTy == InstType::FloatingPoint && OpSize == 64)
+ return true;
+ return false;
+ }
+
+ bool isInteger_32(InstType InstTy, unsigned OpSize) const {
+ if (InstTy == InstType::Integer && OpSize == 32)
+ return true;
+ return false;
+ }
+
/// Some generic instructions have operands that can be mapped to either fprb
/// or gprb e.g. for G_LOAD we consider only operand 0 as ambiguous, operand 1
/// is always gprb since it is a pointer.
@@ -113,12 +159,13 @@ private:
DenseMap<const MachineInstr *, InstType> Types;
/// Recursively visit MI's adjacent instructions and find MI's InstType.
- bool visit(const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI);
+ bool visit(const MachineInstr *MI, const MachineInstr *WaitingForTypeOfMI,
+ InstType &AmbiguousTy);
/// Visit MI's adjacent UseDefs or DefUses.
bool visitAdjacentInstrs(const MachineInstr *MI,
SmallVectorImpl<MachineInstr *> &AdjacentInstrs,
- bool isDefUse);
+ bool isDefUse, InstType &AmbiguousTy);
/// Set type for MI, and recursively for all instructions that are
/// waiting for MI's type.
@@ -170,6 +217,13 @@ private:
InstType determineInstType(const MachineInstr *MI);
void cleanupIfNewFunction(llvm::StringRef FunctionName);
+
+ /// MI is about to get destroyed (using narrow scalar). Internal data is
+ /// keyed by MI's address; clear it since it will no longer be valid.
+ void clearTypeInfoData(const MachineInstr *MI) {
+ Types.erase(MI);
+ WaitingQueues.erase(MI);
+ };
};
};
} // end namespace llvm
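A minimal illustration of why clearTypeInfoData is needed, using stand-in structs rather than the LLVM classes: Types and WaitingQueues are keyed by MachineInstr address, so entries must be erased before the instruction is destroyed or a later allocation could reuse the address and inherit stale data.

#include <cassert>
#include <map>

struct MachineInstrStub { int Opcode; }; // stand-in for llvm::MachineInstr

struct TypeInfoCache {
  std::map<const MachineInstrStub *, int> Types; // address-keyed analysis results
  void clearTypeInfoData(const MachineInstrStub *MI) { Types.erase(MI); }
};

int main() {
  TypeInfoCache TI;
  auto *MI = new MachineInstrStub{42};
  TI.Types[MI] = 1;
  TI.clearTypeInfoData(MI); // called right before the instruction is erased
  delete MI;
  assert(TI.Types.empty());
  return 0;
}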
diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
index 7b02d126eb28..3452bf495a34 100644
--- a/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsRegisterInfo.cpp
@@ -245,11 +245,6 @@ MipsRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
return true;
}
-bool
-MipsRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- return true;
-}
-
// FrameIndex represents objects inside an abstract stack.
// We must replace FrameIndex with a direct stack/frame
// pointer reference.
@@ -271,7 +266,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
<< "spOffset : " << spOffset << "\n"
<< "stackSize : " << stackSize << "\n"
<< "alignment : "
- << MF.getFrameInfo().getObjectAlignment(FrameIndex)
+ << DebugStr(MF.getFrameInfo().getObjectAlign(FrameIndex))
<< "\n");
eliminateFI(MI, FIOperandNum, FrameIndex, stackSize, spOffset);
diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.h b/llvm/lib/Target/Mips/MipsRegisterInfo.h
index 4ed32b09718b..06f214c2d6b1 100644
--- a/llvm/lib/Target/Mips/MipsRegisterInfo.h
+++ b/llvm/lib/Target/Mips/MipsRegisterInfo.h
@@ -58,8 +58,6 @@ public:
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
/// Stack Frame Processing Methods
void eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
diff --git a/llvm/lib/Target/Mips/MipsRegisterInfo.td b/llvm/lib/Target/Mips/MipsRegisterInfo.td
index 8a6279da46b7..7d4dcca89e31 100644
--- a/llvm/lib/Target/Mips/MipsRegisterInfo.td
+++ b/llvm/lib/Target/Mips/MipsRegisterInfo.td
@@ -194,23 +194,23 @@ let Namespace = "Mips" in {
// FP control registers.
foreach I = 0-31 in
- def FCR#I : MipsReg<#I, ""#I>;
+ def FCR#I : MipsReg<I, ""#I>;
// FP condition code registers.
foreach I = 0-7 in
- def FCC#I : MipsReg<#I, "fcc"#I>;
+ def FCC#I : MipsReg<I, "fcc"#I>;
// COP0 registers.
foreach I = 0-31 in
- def COP0#I : MipsReg<#I, ""#I>;
+ def COP0#I : MipsReg<I, ""#I>;
// COP2 registers.
foreach I = 0-31 in
- def COP2#I : MipsReg<#I, ""#I>;
+ def COP2#I : MipsReg<I, ""#I>;
// COP3 registers.
foreach I = 0-31 in
- def COP3#I : MipsReg<#I, ""#I>;
+ def COP3#I : MipsReg<I, ""#I>;
// PC register
def PC : Register<"pc">;
@@ -222,11 +222,11 @@ let Namespace = "Mips" in {
def HWR3 : MipsReg<3, "hwr_ccres">;
foreach I = 4-31 in
- def HWR#I : MipsReg<#I, ""#I>;
+ def HWR#I : MipsReg<I, ""#I>;
// Accum registers
foreach I = 0-3 in
- def AC#I : ACCReg<#I, "ac"#I,
+ def AC#I : ACCReg<I, "ac"#I,
[!cast<Register>("LO"#I), !cast<Register>("HI"#I)]>;
def AC0_64 : ACCReg<0, "ac0", [LO0_64, HI0_64]>;
@@ -262,7 +262,7 @@ let Namespace = "Mips" in {
// These registers do not exist, but instructions like `cfcmsa`
// and `ctcmsa` allow them to be specified.
foreach I = 8-31 in
- def MSA#I : MipsReg<#I, ""#I>;
+ def MSA#I : MipsReg<I, ""#I>;
// Octeon multiplier and product registers
def MPL0 : MipsReg<0, "mpl0">;
diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
index 166ddea0431f..a657bb44ac78 100644
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.cpp
@@ -320,7 +320,7 @@ bool ExpandPseudo::expandBuildPairF64(MachineBasicBlock &MBB,
// We re-use the same spill slot each time so that the stack frame doesn't
// grow too much in functions with a large number of moves.
- int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC2);
+ int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC2);
if (!Subtarget.isLittle())
std::swap(LoReg, HiReg);
TII.storeRegToStack(MBB, I, LoReg, I->getOperand(1).isKill(), FI, RC,
@@ -386,7 +386,7 @@ bool ExpandPseudo::expandExtractElementF64(MachineBasicBlock &MBB,
// We re-use the same spill slot each time so that the stack frame doesn't
// grow too much in functions with a large number of moves.
- int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(RC);
+ int FI = MF.getInfo<MipsFunctionInfo>()->getMoveF64ViaSpillFI(MF, RC);
TII.storeRegToStack(MBB, I, SrcReg, Op1.isKill(), FI, RC, &RegInfo, 0);
TII.loadRegFromStack(MBB, I, DstReg, FI, RC2, &RegInfo, Offset);
return true;
@@ -434,8 +434,8 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
TII.adjustStackPtr(SP, -StackSize, MBB, MBBI);
// emit ".cfi_def_cfa_offset StackSize"
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ unsigned CFIIndex =
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
@@ -539,11 +539,11 @@ void MipsSEFrameLowering::emitPrologue(MachineFunction &MF,
// addiu $Reg, $zero, -MaxAlignment
// andi $sp, $sp, $Reg
Register VR = MF.getRegInfo().createVirtualRegister(RC);
- assert(isInt<16>(MFI.getMaxAlignment()) &&
+ assert((Log2(MFI.getMaxAlign()) < 16) &&
"Function's alignment size requirement is not supported.");
- int MaxAlign = -(int)MFI.getMaxAlignment();
+ int64_t MaxAlign = -(int64_t)MFI.getMaxAlign().value();
- BuildMI(MBB, MBBI, dl, TII.get(ADDiu), VR).addReg(ZERO) .addImm(MaxAlign);
+ BuildMI(MBB, MBBI, dl, TII.get(ADDiu), VR).addReg(ZERO).addImm(MaxAlign);
BuildMI(MBB, MBBI, dl, TII.get(AND), SP).addReg(SP).addReg(VR);
if (hasBP(MF)) {
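The assert rewrite in this hunk hinges on the negated maximum alignment fitting ADDiu's signed 16-bit immediate; a quick standalone check of that relationship in plain C++, not the llvm::Align API. The old isInt<16> check on the positive value was marginally stricter, rejecting an alignment of 32768 even though its negation still fits.

#include <cassert>
#include <cstdint>

// For a power-of-two alignment A, -A fits a signed 16-bit immediate exactly
// when log2(A) < 16, i.e. A <= 32768 and -A >= INT16_MIN.
static unsigned log2PowerOf2(uint64_t A) {
  unsigned L = 0;
  while ((A >>= 1) != 0)
    ++L;
  return L;
}

int main() {
  for (uint64_t A = 1; A <= (uint64_t(1) << 20); A <<= 1) {
    bool NewCheck = log2PowerOf2(A) < 16;
    bool FitsImm = -static_cast<int64_t>(A) >= INT16_MIN;
    assert(NewCheck == FitsImm);
  }
  return 0;
}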
@@ -776,7 +776,7 @@ void MipsSEFrameLowering::emitInterruptEpilogueStub(
int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
MipsABIInfo ABI = STI.getABI();
@@ -789,11 +789,9 @@ int MipsSEFrameLowering::getFrameIndexReference(const MachineFunction &MF,
getOffsetOfLocalArea() + MFI.getOffsetAdjustment();
}
-bool MipsSEFrameLowering::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool MipsSEFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -880,11 +878,11 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Create spill slots for eh data registers if function calls eh_return.
if (MipsFI->callsEhReturn())
- MipsFI->createEhDataRegsFI();
+ MipsFI->createEhDataRegsFI(MF);
// Create spill slots for Coprocessor 0 registers if function is an ISR.
if (MipsFI->isISR())
- MipsFI->createISRRegFI();
+ MipsFI->createISRRegFI(MF);
// Expand pseudo instructions which load, store or copy accumulators.
// Add an emergency spill slot if a pseudo was expanded.
@@ -895,8 +893,7 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
const TargetRegisterClass &RC = STI.isGP64bit() ?
Mips::GPR64RegClass : Mips::GPR32RegClass;
int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC),
- TRI->getSpillAlignment(RC),
- false);
+ TRI->getSpillAlign(RC), false);
RS->addScavengingFrameIndex(FI);
}
@@ -912,8 +909,7 @@ void MipsSEFrameLowering::determineCalleeSaves(MachineFunction &MF,
const TargetRegisterClass &RC =
ABI.ArePtrs64bit() ? Mips::GPR64RegClass : Mips::GPR32RegClass;
int FI = MF.getFrameInfo().CreateStackObject(TRI->getSpillSize(RC),
- TRI->getSpillAlignment(RC),
- false);
+ TRI->getSpillAlign(RC), false);
RS->addScavengingFrameIndex(FI);
}
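The createDefCfaOffset to cfiDefCfaOffset switch earlier in this file also flips the sign of the argument; judging from the call site alone, the legacy helper negated its operand internally while the new one takes the CFA offset directly, so the emitted .cfi_def_cfa_offset is unchanged. A toy model of that equivalence:

#include <cassert>

// Assumed sign conventions, inferred from the call-site change
// (-StackSize with the old API, +StackSize with the new one).
static int legacyCreateDefCfaOffset(int Adjustment) { return -Adjustment; }
static int cfiDefCfaOffset(int Offset) { return Offset; }

int main() {
  for (int StackSize = 0; StackSize <= 1024; StackSize += 8)
    assert(legacyCreateDefCfaOffset(-StackSize) == cfiDefCfaOffset(StackSize));
  return 0;
}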
diff --git a/llvm/lib/Target/Mips/MipsSEFrameLowering.h b/llvm/lib/Target/Mips/MipsSEFrameLowering.h
index 78ffe161d9c6..c818a65f5b14 100644
--- a/llvm/lib/Target/Mips/MipsSEFrameLowering.h
+++ b/llvm/lib/Target/Mips/MipsSEFrameLowering.h
@@ -28,11 +28,11 @@ public:
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
diff --git a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
index bef1a3657ea5..7be5fc33a0af 100644
--- a/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelDAGToDAG.cpp
@@ -153,7 +153,7 @@ void MipsSEDAGToDAGISel::emitMCountABI(MachineInstr &MI, MachineBasicBlock &MBB,
}
void MipsSEDAGToDAGISel::processFunctionAfterISel(MachineFunction &MF) {
- MF.getInfo<MipsFunctionInfo>()->initGlobalBaseReg();
+ MF.getInfo<MipsFunctionInfo>()->initGlobalBaseReg(MF);
MachineRegisterInfo *MRI = &MF.getRegInfo();
@@ -833,7 +833,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::INTRINSIC_W_CHAIN: {
- switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ const unsigned IntrinsicOpcode =
+ cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntrinsicOpcode) {
default:
break;
@@ -845,6 +847,41 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
ReplaceNode(Node, Reg.getNode());
return true;
}
+ case Intrinsic::mips_ldr_d:
+ case Intrinsic::mips_ldr_w: {
+ unsigned Op = (IntrinsicOpcode == Intrinsic::mips_ldr_d) ? Mips::LDR_D
+ : Mips::LDR_W;
+
+ SDLoc DL(Node);
+ assert(Node->getNumOperands() == 4 && "Unexpected number of operands.");
+ const SDValue &Chain = Node->getOperand(0);
+ const SDValue &Intrinsic = Node->getOperand(1);
+ const SDValue &Pointer = Node->getOperand(2);
+ const SDValue &Constant = Node->getOperand(3);
+
+ assert(Chain.getValueType() == MVT::Other);
+ (void)Intrinsic;
+ assert(Intrinsic.getOpcode() == ISD::TargetConstant &&
+ Constant.getOpcode() == ISD::Constant &&
+ "Invalid instruction operand.");
+
+ // Convert Constant to TargetConstant.
+ const ConstantInt *Val =
+ cast<ConstantSDNode>(Constant)->getConstantIntValue();
+ SDValue Imm =
+ CurDAG->getTargetConstant(*Val, DL, Constant.getValueType());
+
+ SmallVector<SDValue, 3> Ops{Pointer, Imm, Chain};
+
+ assert(Node->getNumValues() == 2);
+ assert(Node->getValueType(0).is128BitVector());
+ assert(Node->getValueType(1) == MVT::Other);
+ SmallVector<EVT, 2> ResTys{Node->getValueType(0), Node->getValueType(1)};
+
+ ReplaceNode(Node, CurDAG->getMachineNode(Op, DL, ResTys, Ops));
+
+ return true;
+ }
}
break;
}
@@ -866,7 +903,9 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
}
case ISD::INTRINSIC_VOID: {
- switch (cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue()) {
+ const unsigned IntrinsicOpcode =
+ cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
+ switch (IntrinsicOpcode) {
default:
break;
@@ -879,6 +918,40 @@ bool MipsSEDAGToDAGISel::trySelect(SDNode *Node) {
ReplaceNode(Node, ChainOut.getNode());
return true;
}
+ case Intrinsic::mips_str_d:
+ case Intrinsic::mips_str_w: {
+ unsigned Op = (IntrinsicOpcode == Intrinsic::mips_str_d) ? Mips::STR_D
+ : Mips::STR_W;
+
+ SDLoc DL(Node);
+ assert(Node->getNumOperands() == 5 && "Unexpected number of operands.");
+ const SDValue &Chain = Node->getOperand(0);
+ const SDValue &Intrinsic = Node->getOperand(1);
+ const SDValue &Vec = Node->getOperand(2);
+ const SDValue &Pointer = Node->getOperand(3);
+ const SDValue &Constant = Node->getOperand(4);
+
+ assert(Chain.getValueType() == MVT::Other);
+ (void)Intrinsic;
+ assert(Intrinsic.getOpcode() == ISD::TargetConstant &&
+ Constant.getOpcode() == ISD::Constant &&
+ "Invalid instruction operand.");
+
+ // Convert Constant to TargetConstant.
+ const ConstantInt *Val =
+ cast<ConstantSDNode>(Constant)->getConstantIntValue();
+ SDValue Imm =
+ CurDAG->getTargetConstant(*Val, DL, Constant.getValueType());
+
+ SmallVector<SDValue, 4> Ops{Vec, Pointer, Imm, Chain};
+
+ assert(Node->getNumValues() == 1);
+ assert(Node->getValueType(0) == MVT::Other);
+ SmallVector<EVT, 1> ResTys{Node->getValueType(0)};
+
+ ReplaceNode(Node, CurDAG->getMachineNode(Op, DL, ResTys, Ops));
+ return true;
+ }
}
break;
}
diff --git a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
index 798e8784405f..bdf29c53cbd5 100644
--- a/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
+++ b/llvm/lib/Target/Mips/MipsSEISelLowering.cpp
@@ -1342,9 +1342,8 @@ static SDValue lowerDSPIntr(SDValue Op, SelectionDAG &DAG, unsigned Opc) {
// Scan output.
SmallVector<EVT, 2> ResTys;
- for (SDNode::value_iterator I = Op->value_begin(), E = Op->value_end();
- I != E; ++I)
- ResTys.push_back((*I == MVT::i64) ? MVT::Untyped : *I);
+ for (EVT Ty : Op->values())
+ ResTys.push_back((Ty == MVT::i64) ? MVT::Untyped : Ty);
// Create node.
SDValue Val = DAG.getNode(Opc, DL, ResTys, Ops);
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
index d4f09a2f3586..901a4fe4e2ac 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.cpp
@@ -243,7 +243,7 @@ MipsSEInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const {
void MipsSEInstrInfo::
storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC, const TargetRegisterInfo *TRI,
int64_t Offset) const {
DebugLoc DL;
@@ -317,7 +317,7 @@ storeRegToStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void MipsSEInstrInfo::
loadRegFromStack(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI, const TargetRegisterClass *RC,
+ Register DestReg, int FI, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI, int64_t Offset) const {
DebugLoc DL;
if (I != MBB.end()) DL = I->getDebugLoc();
@@ -483,6 +483,20 @@ bool MipsSEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return true;
}
+/// isBranchWithImm - Return true if the branch contains an immediate
+/// operand (\see lib/Target/Mips/MipsBranchExpansion.cpp).
+bool MipsSEInstrInfo::isBranchWithImm(unsigned Opc) const {
+ switch (Opc) {
+ default:
+ return false;
+ case Mips::BBIT0:
+ case Mips::BBIT1:
+ case Mips::BBIT032:
+ case Mips::BBIT132:
+ return true;
+ }
+}
+
/// getOppositeBranchOpc - Return the inverse of the specified
/// opcode, e.g. turning BEQ to BNE.
unsigned MipsSEInstrInfo::getOppositeBranchOpc(unsigned Opc) const {
diff --git a/llvm/lib/Target/Mips/MipsSEInstrInfo.h b/llvm/lib/Target/Mips/MipsSEInstrInfo.h
index 08c00ec8ccef..44a6dac2ccbd 100644
--- a/llvm/lib/Target/Mips/MipsSEInstrInfo.h
+++ b/llvm/lib/Target/Mips/MipsSEInstrInfo.h
@@ -48,20 +48,22 @@ public:
void storeRegToStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const override;
void loadRegFromStack(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI,
int64_t Offset) const override;
bool expandPostRAPseudo(MachineInstr &MI) const override;
+ bool isBranchWithImm(unsigned Opc) const override;
+
unsigned getOppositeBranchOpc(unsigned Opc) const override;
/// Adjust SP by Amount bytes.
diff --git a/llvm/lib/Target/Mips/MipsSERegisterInfo.h b/llvm/lib/Target/Mips/MipsSERegisterInfo.h
index 82ddf40f56a7..cc8496e0268b 100644
--- a/llvm/lib/Target/Mips/MipsSERegisterInfo.h
+++ b/llvm/lib/Target/Mips/MipsSERegisterInfo.h
@@ -17,7 +17,6 @@
#include "MipsRegisterInfo.h"
namespace llvm {
-class MipsSEInstrInfo;
class MipsSERegisterInfo : public MipsRegisterInfo {
public:
diff --git a/llvm/lib/Target/Mips/MipsSchedule.td b/llvm/lib/Target/Mips/MipsSchedule.td
index 0c0ddeab22c4..568c85af655d 100644
--- a/llvm/lib/Target/Mips/MipsSchedule.td
+++ b/llvm/lib/Target/Mips/MipsSchedule.td
@@ -27,6 +27,7 @@ def II_ADD : InstrItinClass;
def II_ADDU : InstrItinClass;
def II_ADD_D : InstrItinClass;
def II_ADD_S : InstrItinClass;
+def II_ADDR_PS : InstrItinClass;
def II_ALIGN : InstrItinClass;
def II_AND : InstrItinClass;
def II_ANDI : InstrItinClass;
@@ -278,6 +279,7 @@ def II_MUL : InstrItinClass;
def II_MUH : InstrItinClass;
def II_MUHU : InstrItinClass;
def II_MULU : InstrItinClass;
+def II_MULR_PS : InstrItinClass;
def II_MULT : InstrItinClass;
def II_MULTU : InstrItinClass;
def II_MUL_D : InstrItinClass;
diff --git a/llvm/lib/Target/Mips/MipsScheduleGeneric.td b/llvm/lib/Target/Mips/MipsScheduleGeneric.td
index faccb37c2361..3888ca4e82f5 100644
--- a/llvm/lib/Target/Mips/MipsScheduleGeneric.td
+++ b/llvm/lib/Target/Mips/MipsScheduleGeneric.td
@@ -822,17 +822,19 @@ def : InstRW<[GenericWriteFPUS], (instrs FABS_S, FABS_D32, FABS_D64, FADD_D32,
// madd.d, msub.d, mul.d, mul.ps, nmadd.d, nmsub.d, ceil.[wl].[sd], cvt.d.[sw],
// cvt.s.[dw], cvt.w.[sd], cvt.[sw].ps, round.[lw].[ds], floor.[lw].ds,
// trunc.w.[ds], trunc.w.ps,
-def : InstRW<[GenericWriteFPUL], (instrs CEIL_L_D64, CEIL_L_S, CEIL_W_D32,
+def : InstRW<[GenericWriteFPUL], (instrs ADDR_PS64,
+ CEIL_L_D64, CEIL_L_S, CEIL_W_D32,
CEIL_W_D64, CEIL_W_S, CVT_D32_S, CVT_D32_W,
CVT_D64_L, CVT_D64_S, CVT_D64_W, CVT_L_D64,
CVT_L_S, CVT_S_D32, CVT_S_D64, CVT_S_L,
CVT_S_W, CVT_W_D32, CVT_W_D64, CVT_W_S,
CVT_PS_S64, CVT_S_PL64, CVT_S_PU64,
+ CVT_PS_PW64, CVT_PW_PS64,
FLOOR_L_D64, FLOOR_L_S, FLOOR_W_D32,
FLOOR_W_D64, FLOOR_W_S, FMUL_D32, FMUL_D64,
- MADD_D32, MADD_D64, MSUB_D32, MSUB_D64,
+ MADD_D32, MADD_D64, MSUB_D32, MSUB_D64, MULR_PS64,
NMADD_D32, NMADD_D64, NMSUB_D32, NMSUB_D64,
- PLL_PS64, PLU_PS64,
+ PLL_PS64, PLU_PS64, PUL_PS64, PUU_PS64,
ROUND_L_D64, ROUND_L_S, ROUND_W_D32,
ROUND_W_D64, ROUND_W_S, TRUNC_L_D64,
TRUNC_L_S, TRUNC_W_D32, TRUNC_W_D64,
diff --git a/llvm/lib/Target/Mips/MipsScheduleP5600.td b/llvm/lib/Target/Mips/MipsScheduleP5600.td
index 7331917baa25..3d159d412489 100644
--- a/llvm/lib/Target/Mips/MipsScheduleP5600.td
+++ b/llvm/lib/Target/Mips/MipsScheduleP5600.td
@@ -20,7 +20,8 @@ def MipsP5600Model : SchedMachineModel {
IsGP64bit, IsPTR64bit,
InMicroMips, InMips16Mode,
HasCnMips, HasCnMipsP,
- HasDSP, HasDSPR2, HasMT, HasCRC];
+ HasDSP, HasDSPR2, HasMips3D, HasMT,
+ HasCRC];
}
let SchedModel = MipsP5600Model in {
@@ -457,7 +458,7 @@ def : InstRW<[P5600WriteFPUL], (instrs CVT_PS_S64, CVT_S_PL64, CVT_S_PU64)>;
def : InstRW<[P5600WriteFPUL], (instregex "^C_[A-Z]+_(S|D32|D64)$")>;
def : InstRW<[P5600WriteFPUL], (instregex "^FCMP_(S32|D32|D64)$")>;
def : InstRW<[P5600WriteFPUL], (instregex "^PseudoCVT_(S|D32|D64)_(L|W)$")>;
-def : InstRW<[P5600WriteFPUL], (instrs PLL_PS64, PLU_PS64)>;
+def : InstRW<[P5600WriteFPUL], (instrs PLL_PS64, PLU_PS64, PUL_PS64, PUU_PS64)>;
// div.[ds], div.ps
def : InstRW<[P5600WriteFPUDivS], (instrs FDIV_S)>;
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.cpp b/llvm/lib/Target/Mips/MipsSubtarget.cpp
index 133b818114c8..ef4191cec3df 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.cpp
+++ b/llvm/lib/Target/Mips/MipsSubtarget.cpp
@@ -237,7 +237,7 @@ CodeGenOpt::Level MipsSubtarget::getOptLevelToEnablePostRAScheduler() const {
MipsSubtarget &
MipsSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS,
const TargetMachine &TM) {
- std::string CPUName = MIPS_MC::selectMipsCPU(TM.getTargetTriple(), CPU);
+ StringRef CPUName = MIPS_MC::selectMipsCPU(TM.getTargetTriple(), CPU);
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
diff --git a/llvm/lib/Target/Mips/MipsSubtarget.h b/llvm/lib/Target/Mips/MipsSubtarget.h
index 5a1dfec41a47..26ee961fc95d 100644
--- a/llvm/lib/Target/Mips/MipsSubtarget.h
+++ b/llvm/lib/Target/Mips/MipsSubtarget.h
@@ -149,6 +149,9 @@ class MipsSubtarget : public MipsGenSubtargetInfo {
// HasDSP, HasDSPR2, HasDSPR3 -- supports DSP ASE.
bool HasDSP, HasDSPR2, HasDSPR3;
+ // Has3D -- Supports Mips3D ASE.
+ bool Has3D;
+
// Allow mixed Mips16 and Mips32 in one source file
bool AllowMixed16_32;
@@ -312,6 +315,7 @@ public:
bool hasDSP() const { return HasDSP; }
bool hasDSPR2() const { return HasDSPR2; }
bool hasDSPR3() const { return HasDSPR3; }
+ bool has3D() const { return Has3D; }
bool hasMSA() const { return HasMSA; }
bool disableMadd4() const { return DisableMadd4; }
bool hasEVA() const { return HasEVA; }
diff --git a/llvm/lib/Target/Mips/MipsTargetMachine.cpp b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
index 8fec6db00cb9..80cb6ce7ac0c 100644
--- a/llvm/lib/Target/Mips/MipsTargetMachine.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetMachine.cpp
@@ -131,6 +131,9 @@ MipsTargetMachine::MipsTargetMachine(const Target &T, const Triple &TT,
MaybeAlign(Options.StackAlignmentOverride)) {
Subtarget = &DefaultSubtarget;
initAsmInfo();
+
+ // Mips supports the debug entry values.
+ setSupportsDebugEntryValues(true);
}
MipsTargetMachine::~MipsTargetMachine() = default;
diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
index 0852b5a18c68..481157a8aa89 100644
--- a/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
+++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.cpp
@@ -44,7 +44,6 @@ EmbeddedData("membedded-data", cl::Hidden,
void MipsTargetObjectFile::Initialize(MCContext &Ctx, const TargetMachine &TM){
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
SmallDataSection = getContext().getELFSection(
".sdata", ELF::SHT_PROGBITS,
@@ -177,12 +176,13 @@ bool MipsTargetObjectFile::IsConstantInSmallSection(
MCSection *MipsTargetObjectFile::getSectionForConstant(const DataLayout &DL,
SectionKind Kind,
const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
if (IsConstantInSmallSection(DL, C, *TM))
return SmallDataSection;
// Otherwise, we work the same as ELF.
- return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
+ Alignment);
}
const MCExpr *
diff --git a/llvm/lib/Target/Mips/MipsTargetObjectFile.h b/llvm/lib/Target/Mips/MipsTargetObjectFile.h
index bdf485f83260..07e9caf0dd09 100644
--- a/llvm/lib/Target/Mips/MipsTargetObjectFile.h
+++ b/llvm/lib/Target/Mips/MipsTargetObjectFile.h
@@ -40,7 +40,7 @@ class MipsTargetMachine;
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C,
- unsigned &Align) const override;
+ Align &Alignment) const override;
/// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
diff --git a/llvm/lib/Target/Mips/MipsTargetStreamer.h b/llvm/lib/Target/Mips/MipsTargetStreamer.h
index b389ba8938c4..f4282f5d6974 100644
--- a/llvm/lib/Target/Mips/MipsTargetStreamer.h
+++ b/llvm/lib/Target/Mips/MipsTargetStreamer.h
@@ -19,8 +19,6 @@
namespace llvm {
-struct MipsABIFlagsSection;
-
class MipsTargetStreamer : public MCTargetStreamer {
public:
MipsTargetStreamer(MCStreamer &S);
@@ -84,12 +82,15 @@ public:
virtual void emitDirectiveSetDsp();
virtual void emitDirectiveSetDspr2();
virtual void emitDirectiveSetNoDsp();
+ virtual void emitDirectiveSetMips3D();
+ virtual void emitDirectiveSetNoMips3D();
virtual void emitDirectiveSetPop();
virtual void emitDirectiveSetPush();
virtual void emitDirectiveSetSoftFloat();
virtual void emitDirectiveSetHardFloat();
// PIC support
+ virtual void emitDirectiveCpAdd(unsigned RegNo);
virtual void emitDirectiveCpLoad(unsigned RegNo);
virtual void emitDirectiveCpLocal(unsigned RegNo);
virtual bool emitDirectiveCpRestore(int Offset,
@@ -263,12 +264,15 @@ public:
void emitDirectiveSetDsp() override;
void emitDirectiveSetDspr2() override;
void emitDirectiveSetNoDsp() override;
+ void emitDirectiveSetMips3D() override;
+ void emitDirectiveSetNoMips3D() override;
void emitDirectiveSetPop() override;
void emitDirectiveSetPush() override;
void emitDirectiveSetSoftFloat() override;
void emitDirectiveSetHardFloat() override;
// PIC support
+ void emitDirectiveCpAdd(unsigned RegNo) override;
void emitDirectiveCpLoad(unsigned RegNo) override;
void emitDirectiveCpLocal(unsigned RegNo) override;
@@ -341,6 +345,7 @@ public:
void emitFMask(unsigned FPUBitmask, int FPUTopSavedRegOff) override;
// PIC support
+ void emitDirectiveCpAdd(unsigned RegNo) override;
void emitDirectiveCpLoad(unsigned RegNo) override;
void emitDirectiveCpLocal(unsigned RegNo) override;
bool emitDirectiveCpRestore(int Offset, function_ref<unsigned()> GetATReg,
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
index 7e1da9b7a94b..aef0eed6ab9a 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.cpp
@@ -51,4 +51,6 @@ NVPTXMCAsmInfo::NVPTXMCAsmInfo(const Triple &TheTriple,
// @TODO: Can we just disable this?
WeakDirective = "\t// .weak\t";
GlobalDirective = "\t// .globl\t";
+
+ UseIntegratedAssembler = false;
}
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
index ce5ca99c5397..77c4daea2b6a 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCAsmInfo.h
@@ -16,7 +16,6 @@
#include "llvm/MC/MCAsmInfo.h"
namespace llvm {
-class Target;
class Triple;
class NVPTXMCAsmInfo : public MCAsmInfo {
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
index e1691d2384e6..b394566edd0d 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXMCTargetDesc.h
@@ -15,11 +15,6 @@
#include <stdint.h>
-namespace llvm {
-class Target;
-
-} // End llvm namespace
-
// Defines symbolic names for PTX registers.
#define GET_REGINFO_ENUM
#include "NVPTXGenRegisterInfo.inc"
diff --git a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
index 17f5ba7d900b..cdb70ff1f973 100644
--- a/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
+++ b/llvm/lib/Target/NVPTX/MCTargetDesc/NVPTXTargetStreamer.cpp
@@ -26,13 +26,13 @@ NVPTXTargetStreamer::~NVPTXTargetStreamer() = default;
void NVPTXTargetStreamer::outputDwarfFileDirectives() {
for (const std::string &S : DwarfFiles)
- getStreamer().EmitRawText(S.data());
+ getStreamer().emitRawText(S.data());
DwarfFiles.clear();
}
void NVPTXTargetStreamer::closeLastSection() {
if (HasSections)
- getStreamer().EmitRawText("\t}");
+ getStreamer().emitRawText("\t}");
}
void NVPTXTargetStreamer::emitDwarfFileDirective(StringRef Directive) {
@@ -128,7 +128,7 @@ void NVPTXTargetStreamer::emitRawBytes(StringRef Data) {
if (Label == Directive)
Label = ",";
}
- Streamer.EmitRawText(OS.str());
+ Streamer.emitRawText(OS.str());
}
#endif
}
diff --git a/llvm/lib/Target/NVPTX/NVPTX.h b/llvm/lib/Target/NVPTX/NVPTX.h
index 0acbace5f848..dfe0b9cb5ee6 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.h
+++ b/llvm/lib/Target/NVPTX/NVPTX.h
@@ -21,7 +21,6 @@ namespace llvm {
class NVPTXTargetMachine;
class FunctionPass;
class MachineFunctionPass;
-class formatted_raw_ostream;
namespace NVPTXCC {
enum CondCodes {
diff --git a/llvm/lib/Target/NVPTX/NVPTX.td b/llvm/lib/Target/NVPTX/NVPTX.td
index 1d947ef1ce62..2b39e9f412f7 100644
--- a/llvm/lib/Target/NVPTX/NVPTX.td
+++ b/llvm/lib/Target/NVPTX/NVPTX.td
@@ -55,6 +55,8 @@ def SM72 : SubtargetFeature<"sm_72", "SmVersion", "72",
"Target SM 7.2">;
def SM75 : SubtargetFeature<"sm_75", "SmVersion", "75",
"Target SM 7.5">;
+def SM80 : SubtargetFeature<"sm_80", "SmVersion", "80",
+ "Target SM 8.0">;
// PTX Versions
def PTX32 : SubtargetFeature<"ptx32", "PTXVersion", "32",
@@ -77,6 +79,10 @@ def PTX63 : SubtargetFeature<"ptx63", "PTXVersion", "63",
"Use PTX version 6.3">;
def PTX64 : SubtargetFeature<"ptx64", "PTXVersion", "64",
"Use PTX version 6.4">;
+def PTX65 : SubtargetFeature<"ptx65", "PTXVersion", "65",
+ "Use PTX version 6.5">;
+def PTX70 : SubtargetFeature<"ptx70", "PTXVersion", "70",
+ "Use PTX version 7.0">;
//===----------------------------------------------------------------------===//
// NVPTX supported processors.
@@ -100,6 +106,7 @@ def : Proc<"sm_62", [SM62, PTX50]>;
def : Proc<"sm_70", [SM70, PTX60]>;
def : Proc<"sm_72", [SM72, PTX61]>;
def : Proc<"sm_75", [SM75, PTX63]>;
+def : Proc<"sm_80", [SM80, PTX70]>;
def NVPTXInstrInfo : InstrInfo {
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
index 7117438dc503..da1a398a68f0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.cpp
@@ -141,7 +141,7 @@ VisitGlobalVariableForEmission(const GlobalVariable *GV,
Visiting.erase(GV);
}
-void NVPTXAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void NVPTXAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst Inst;
lowerToMCInst(MI, Inst);
EmitToStreamer(*OutStreamer, Inst);
@@ -434,13 +434,13 @@ bool NVPTXAsmPrinter::isLoopHeaderOfNoUnroll(
return false;
}
-void NVPTXAsmPrinter::EmitBasicBlockStart(const MachineBasicBlock &MBB) {
- AsmPrinter::EmitBasicBlockStart(MBB);
+void NVPTXAsmPrinter::emitBasicBlockStart(const MachineBasicBlock &MBB) {
+ AsmPrinter::emitBasicBlockStart(MBB);
if (isLoopHeaderOfNoUnroll(MBB))
- OutStreamer->EmitRawText(StringRef("\t.pragma \"nounroll\";\n"));
+ OutStreamer->emitRawText(StringRef("\t.pragma \"nounroll\";\n"));
}
-void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
+void NVPTXAsmPrinter::emitFunctionEntryLabel() {
SmallString<128> Str;
raw_svector_ostream O(Str);
@@ -467,11 +467,11 @@ void NVPTXAsmPrinter::EmitFunctionEntryLabel() {
if (isKernelFunction(*F))
emitKernelFunctionDirectives(*F, O);
- OutStreamer->EmitRawText(O.str());
+ OutStreamer->emitRawText(O.str());
VRegMapping.clear();
// Emit open brace for function body.
- OutStreamer->EmitRawText(StringRef("{\n"));
+ OutStreamer->emitRawText(StringRef("{\n"));
setAndEmitFunctionVirtualRegisters(*MF);
// Emit initial .loc debug directive for correct relocation symbol data.
if (MMI && MMI->hasDebugInfo())
@@ -485,18 +485,18 @@ bool NVPTXAsmPrinter::runOnMachineFunction(MachineFunction &F) {
// debug labels/data after the last basic block.
// We need to emit the closing brace here because we don't have a function
// that runs after emission of the function body.
- OutStreamer->EmitRawText(StringRef("}\n"));
+ OutStreamer->emitRawText(StringRef("}\n"));
return Result;
}
-void NVPTXAsmPrinter::EmitFunctionBodyStart() {
+void NVPTXAsmPrinter::emitFunctionBodyStart() {
SmallString<128> Str;
raw_svector_ostream O(Str);
emitDemotedVars(&MF->getFunction(), O);
- OutStreamer->EmitRawText(O.str());
+ OutStreamer->emitRawText(O.str());
}
-void NVPTXAsmPrinter::EmitFunctionBodyEnd() {
+void NVPTXAsmPrinter::emitFunctionBodyEnd() {
VRegMapping.clear();
}
@@ -762,13 +762,21 @@ static bool isEmptyXXStructor(GlobalVariable *GV) {
return InitList->getNumOperands() == 0;
}
-bool NVPTXAsmPrinter::doInitialization(Module &M) {
+void NVPTXAsmPrinter::emitStartOfAsmFile(Module &M) {
// Construct a default subtarget off of the TargetMachine defaults. The
// rest of NVPTX isn't friendly to changing subtargets per function and
// so the default TargetMachine will have all of the options.
const NVPTXTargetMachine &NTM = static_cast<const NVPTXTargetMachine &>(TM);
const auto* STI = static_cast<const NVPTXSubtarget*>(NTM.getSubtargetImpl());
+ SmallString<128> Str1;
+ raw_svector_ostream OS1(Str1);
+
+ // Emit header before any dwarf directives are emitted below.
+ emitHeader(M, OS1, *STI);
+ OutStreamer->emitRawText(OS1.str());
+}
+bool NVPTXAsmPrinter::doInitialization(Module &M) {
if (M.alias_size()) {
report_fatal_error("Module has aliases, which NVPTX does not support.");
return true; // error
@@ -784,26 +792,9 @@ bool NVPTXAsmPrinter::doInitialization(Module &M) {
return true; // error
}
- SmallString<128> Str1;
- raw_svector_ostream OS1(Str1);
-
// We need to call the parent's one explicitly.
bool Result = AsmPrinter::doInitialization(M);
- // Emit header before any dwarf directives are emitted below.
- emitHeader(M, OS1, *STI);
- OutStreamer->EmitRawText(OS1.str());
-
- // Emit module-level inline asm if it exists.
- if (!M.getModuleInlineAsm().empty()) {
- OutStreamer->AddComment("Start of file scope inline assembly");
- OutStreamer->AddBlankLine();
- OutStreamer->EmitRawText(StringRef(M.getModuleInlineAsm()));
- OutStreamer->AddBlankLine();
- OutStreamer->AddComment("End of file scope inline assembly");
- OutStreamer->AddBlankLine();
- }
-
GlobalsEmitted = false;
return Result;
@@ -838,7 +829,7 @@ void NVPTXAsmPrinter::emitGlobals(const Module &M) {
OS2 << '\n';
- OutStreamer->EmitRawText(OS2.str());
+ OutStreamer->emitRawText(OS2.str());
}
void NVPTXAsmPrinter::emitHeader(Module &M, raw_ostream &O,
@@ -929,7 +920,7 @@ bool NVPTXAsmPrinter::doFinalization(Module &M) {
static_cast<NVPTXTargetStreamer *>(OutStreamer->getTargetStreamer())
->closeLastSection();
// Emit empty .debug_loc section for better support of the empty files.
- OutStreamer->EmitRawText("\t.section\t.debug_loc\t{\t}");
+ OutStreamer->emitRawText("\t.section\t.debug_loc\t{\t}");
}
// Output last DWARF .file directives, if any.
@@ -982,7 +973,7 @@ void NVPTXAsmPrinter::emitLinkageDirective(const GlobalValue *V,
msg.append("Error: ");
msg.append("Symbol ");
if (V->hasName())
- msg.append(V->getName());
+ msg.append(std::string(V->getName()));
msg.append("has unsupported appending linkage type");
llvm_unreachable(msg.c_str());
} else if (!V->hasInternalLinkage() &&
@@ -1184,7 +1175,7 @@ void NVPTXAsmPrinter::printModuleLevelGV(const GlobalVariable *GVar,
case Type::IntegerTyID: // Integers larger than 64 bits
case Type::StructTyID:
case Type::ArrayTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
ElementSize = DL.getTypeStoreSize(ETy);
// PTX allows variable initialization only for constant and
// global state spaces.
@@ -1358,7 +1349,7 @@ void NVPTXAsmPrinter::emitPTXGlobalVariable(const GlobalVariable *GVar,
switch (ETy->getTypeID()) {
case Type::StructTyID:
case Type::ArrayTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
ElementSize = DL.getTypeStoreSize(ETy);
O << " .b8 ";
getSymbol(GVar)->print(O, MAI);
@@ -1439,7 +1430,7 @@ void NVPTXAsmPrinter::emitFunctionParamList(const Function *F, raw_ostream &O) {
if (isKernelFunction(*F)) {
if (isSampler(*I) || isImage(*I)) {
if (isImage(*I)) {
- std::string sname = I->getName();
+ std::string sname = std::string(I->getName());
if (isImageWriteOnly(*I) || isImageReadWrite(*I)) {
if (hasImageHandles)
O << "\t.param .u64 .ptr .surfref ";
@@ -1634,8 +1625,8 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
const MachineFrameInfo &MFI = MF.getFrameInfo();
int NumBytes = (int) MFI.getStackSize();
if (NumBytes) {
- O << "\t.local .align " << MFI.getMaxAlignment() << " .b8 \t" << DEPOTNAME
- << getFunctionNumber() << "[" << NumBytes << "];\n";
+ O << "\t.local .align " << MFI.getMaxAlign().value() << " .b8 \t"
+ << DEPOTNAME << getFunctionNumber() << "[" << NumBytes << "];\n";
if (static_cast<const NVPTXTargetMachine &>(MF.getTarget()).is64Bit()) {
O << "\t.reg .b64 \t%SP;\n";
O << "\t.reg .b64 \t%SPL;\n";
@@ -1684,7 +1675,7 @@ void NVPTXAsmPrinter::setAndEmitFunctionVirtualRegisters(
}
}
- OutStreamer->EmitRawText(O.str());
+ OutStreamer->emitRawText(O.str());
}
void NVPTXAsmPrinter::printFPConstant(const ConstantFP *Fp, raw_ostream &O) {
@@ -1815,7 +1806,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
aggBuffer->addBytes(ptr, 4, Bytes);
break;
} else if (const auto *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (const auto *constInt = dyn_cast_or_null<ConstantInt>(
+ if (const auto *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstant(Cexpr, DL))) {
int int32 = (int)(constInt->getZExtValue());
ConvertIntToBytes<>(ptr, int32);
@@ -1837,7 +1828,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
aggBuffer->addBytes(ptr, 8, Bytes);
break;
} else if (const ConstantExpr *Cexpr = dyn_cast<ConstantExpr>(CPV)) {
- if (const auto *constInt = dyn_cast_or_null<ConstantInt>(
+ if (const auto *constInt = dyn_cast<ConstantInt>(
ConstantFoldConstant(Cexpr, DL))) {
long long int64 = (long long)(constInt->getZExtValue());
ConvertIntToBytes<>(ptr, int64);
@@ -1892,7 +1883,7 @@ void NVPTXAsmPrinter::bufferLEByte(const Constant *CPV, int Bytes,
}
case Type::ArrayTyID:
- case Type::VectorTyID:
+ case Type::FixedVectorTyID:
case Type::StructTyID: {
if (isa<ConstantAggregate>(CPV) || isa<ConstantDataSequential>(CPV)) {
int ElementSize = DL.getTypeAllocSize(CPV->getType());
@@ -1993,23 +1984,22 @@ NVPTXAsmPrinter::lowerConstantForGV(const Constant *CV, bool ProcessingGeneric)
}
switch (CE->getOpcode()) {
- default:
+ default: {
// If the code isn't optimized, there may be outstanding folding
// opportunities. Attempt to fold the expression using DataLayout as a
// last resort before giving up.
- if (Constant *C = ConstantFoldConstant(CE, getDataLayout()))
- if (C && C != CE)
- return lowerConstantForGV(C, ProcessingGeneric);
+ Constant *C = ConstantFoldConstant(CE, getDataLayout());
+ if (C != CE)
+ return lowerConstantForGV(C, ProcessingGeneric);
// Otherwise report the problem to the user.
- {
- std::string S;
- raw_string_ostream OS(S);
- OS << "Unsupported expression in static initializer: ";
- CE->printAsOperand(OS, /*PrintType=*/false,
- !MF ? nullptr : MF->getFunction().getParent());
- report_fatal_error(OS.str());
- }
+ std::string S;
+ raw_string_ostream OS(S);
+ OS << "Unsupported expression in static initializer: ";
+ CE->printAsOperand(OS, /*PrintType=*/false,
+ !MF ? nullptr : MF->getFunction().getParent());
+ report_fatal_error(OS.str());
+ }
case Instruction::AddrSpaceCast: {
// Strip the addrspacecast and pass along the operand
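The lowerConstantForGV cleanup above drops the null check because ConstantFoldConstant now returns its input when nothing folds; recursing only when the result actually changed is what keeps the lowering from looping. A tiny sketch of that pattern with a stand-in folder:

#include <cstdio>

// Stand-in for constant folding: returns its input when it cannot simplify.
static long foldOnce(long X) { return (X % 2 == 0) ? X / 2 : X; }

static long lowerConstant(long X) {
  long Folded = foldOnce(X);
  if (Folded != X)          // made progress, lower the simpler form
    return lowerConstant(Folded);
  return X;                 // nothing left to fold; handle or report as-is
}

int main() {
  std::printf("%ld\n", lowerConstant(48)); // prints 3
  return 0;
}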
diff --git a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
index 7a66854d32f4..5c3a4eb470c1 100644
--- a/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
+++ b/llvm/lib/Target/NVPTX/NVPTXAsmPrinter.h
@@ -32,7 +32,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
-#include "llvm/PassAnalysisSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/ErrorHandling.h"
@@ -200,13 +200,14 @@ private:
const Function *F;
std::string CurrentFnName;
- void EmitBasicBlockStart(const MachineBasicBlock &MBB) override;
- void EmitFunctionEntryLabel() override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitBasicBlockStart(const MachineBasicBlock &MBB) override;
+ void emitFunctionEntryLabel() override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
void emitImplicitDef(const MachineInstr *MI) const override;
- void EmitInstruction(const MachineInstr *) override;
+ void emitInstruction(const MachineInstr *) override;
void lowerToMCInst(const MachineInstr *MI, MCInst &OutMI);
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp);
MCOperand GetSymbolRef(const MCSymbol *Symbol);
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
index d26912f47e50..c533921842e4 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp
@@ -65,7 +65,7 @@ void NVPTXFrameLowering::emitPrologue(MachineFunction &MF,
int NVPTXFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = NVPTX::VRDepot;
return MFI.getObjectOffset(FI) - getOffsetOfLocalArea();
@@ -83,3 +83,8 @@ MachineBasicBlock::iterator NVPTXFrameLowering::eliminateCallFramePseudoInstr(
// ADJCALLSTACKUP instructions.
return MBB.erase(I);
}
+
+TargetFrameLowering::DwarfFrameBase
+NVPTXFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
+ return {DwarfFrameBase::CFA, {0}};
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
index 40269f58f06e..e4c2b9e77f70 100644
--- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h
@@ -16,7 +16,7 @@
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
-class NVPTXSubtarget;
+
class NVPTXFrameLowering : public TargetFrameLowering {
public:
explicit NVPTXFrameLowering();
@@ -25,11 +25,12 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
+ DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override;
};
} // End llvm namespace
diff --git a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
index b36d9b2e240a..9078ff8cfb97 100644
--- a/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXGenericToNVVM.cpp
@@ -144,7 +144,7 @@ bool GenericToNVVM::runOnModule(Module &M) {
// variable initializers, as other uses have already been removed
// while walking through the instructions in function definitions.
GV->replaceAllUsesWith(BitCastNewGV);
- std::string Name = GV->getName();
+ std::string Name = std::string(GV->getName());
GV->eraseFromParent();
NewGV->setName(Name);
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 92f71c687c46..f45cc06e0a0a 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -31,7 +31,6 @@
#include "llvm/CodeGen/ValueTypes.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -88,11 +87,6 @@ static cl::opt<bool> UsePrecSqrtF32(
cl::desc("NVPTX Specific: 0 use sqrt.approx, 1 use sqrt.rn."),
cl::init(true));
-static cl::opt<bool> FtzEnabled(
- "nvptx-f32ftz", cl::ZeroOrMore, cl::Hidden,
- cl::desc("NVPTX Specific: Flush f32 subnormals to sign-preserving zero."),
- cl::init(false));
-
int NVPTXTargetLowering::getDivF32Level() const {
if (UsePrecDivF32.getNumOccurrences() > 0) {
// If nvptx-prec-div32=N is used on the command-line, always honor it
@@ -117,18 +111,8 @@ bool NVPTXTargetLowering::usePrecSqrtF32() const {
}
bool NVPTXTargetLowering::useF32FTZ(const MachineFunction &MF) const {
- // TODO: Get rid of this flag; there can be only one way to do this.
- if (FtzEnabled.getNumOccurrences() > 0) {
- // If nvptx-f32ftz is used on the command-line, always honor it
- return FtzEnabled;
- } else {
- const Function &F = MF.getFunction();
- // Otherwise, check for an nvptx-f32ftz attribute on the function
- if (F.hasFnAttribute("nvptx-f32ftz"))
- return F.getFnAttribute("nvptx-f32ftz").getValueAsString() == "true";
- else
- return false;
- }
+ return MF.getDenormalMode(APFloat::IEEEsingle()).Output ==
+ DenormalMode::PreserveSign;
}
static bool IsPTXVectorType(MVT VT) {
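The rewritten useF32FTZ above derives flush-to-zero behaviour from the function's f32 denormal mode instead of the removed nvptx-f32ftz flag and attribute. If I read the denormal-mode encoding correctly, only the output component of the "denormal-fp-math-f32" attribute matters here; a small string-level sketch of that mapping (an assumption, not the LLVM DenormalMode API):

#include <cstdio>
#include <string>

// Assumed attribute spelling: "denormal-fp-math-f32"="<output>,<input>".
// The output component decides whether f32 results are flushed (.ftz).
static bool useF32FTZFromAttr(const std::string &DenormalModeAttr) {
  std::string Output = DenormalModeAttr.substr(0, DenormalModeAttr.find(','));
  return Output == "preserve-sign";
}

int main() {
  std::printf("%d\n", useF32FTZFromAttr("preserve-sign,preserve-sign")); // 1: emit .ftz
  std::printf("%d\n", useF32FTZFromAttr("ieee,ieee"));                   // 0: no .ftz
  return 0;
}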
@@ -233,11 +217,10 @@ static void ComputePTXValueVTs(const TargetLowering &TLI, const DataLayout &DL,
// covered by the vector op. Otherwise, it returns 1.
static unsigned CanMergeParamLoadStoresStartingAt(
unsigned Idx, uint32_t AccessSize, const SmallVectorImpl<EVT> &ValueVTs,
- const SmallVectorImpl<uint64_t> &Offsets, unsigned ParamAlignment) {
- assert(isPowerOf2_32(AccessSize) && "must be a power of 2!");
+ const SmallVectorImpl<uint64_t> &Offsets, Align ParamAlignment) {
// Can't vectorize if param alignment is not sufficient.
- if (AccessSize > ParamAlignment)
+ if (ParamAlignment < AccessSize)
return 1;
// Can't vectorize if offset is not aligned.
if (Offsets[Idx] & (AccessSize - 1))
@@ -297,7 +280,7 @@ enum ParamVectorizationFlags {
static SmallVector<ParamVectorizationFlags, 16>
VectorizePTXValueVTs(const SmallVectorImpl<EVT> &ValueVTs,
const SmallVectorImpl<uint64_t> &Offsets,
- unsigned ParamAlignment) {
+ Align ParamAlignment) {
// Set vector size to match ValueVTs and mark all elements as
// scalars by default.
SmallVector<ParamVectorizationFlags, 16> VectorInfo;
@@ -1258,8 +1241,8 @@ NVPTXTargetLowering::LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const {
std::string NVPTXTargetLowering::getPrototype(
const DataLayout &DL, Type *retTy, const ArgListTy &Args,
- const SmallVectorImpl<ISD::OutputArg> &Outs, unsigned retAlignment,
- ImmutableCallSite CS) const {
+ const SmallVectorImpl<ISD::OutputArg> &Outs, MaybeAlign retAlignment,
+ const CallBase &CB) const {
auto PtrVT = getPointerTy(DL);
bool isABI = (STI.getSmVersion() >= 20);
@@ -1294,8 +1277,8 @@ std::string NVPTXTargetLowering::getPrototype(
O << ".param .b" << PtrVT.getSizeInBits() << " _";
} else if (retTy->isAggregateType() || retTy->isVectorTy() ||
retTy->isIntegerTy(128)) {
- O << ".param .align " << retAlignment << " .b8 _["
- << DL.getTypeAllocSize(retTy) << "]";
+ O << ".param .align " << (retAlignment ? retAlignment->value() : 0)
+ << " .b8 _[" << DL.getTypeAllocSize(retTy) << "]";
} else {
llvm_unreachable("Unknown return type");
}
@@ -1316,7 +1299,7 @@ std::string NVPTXTargetLowering::getPrototype(
if (!Outs[OIdx].Flags.isByVal()) {
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
unsigned align = 0;
- const CallInst *CallI = cast<CallInst>(CS.getInstruction());
+ const CallInst *CallI = cast<CallInst>(&CB);
// +1 because index 0 is reserved for return type alignment
if (!getAlign(*CallI, i + 1, align))
align = DL.getABITypeAlignment(Ty);
@@ -1358,9 +1341,9 @@ std::string NVPTXTargetLowering::getPrototype(
assert(PTy && "Param with byval attribute should be a pointer type");
Type *ETy = PTy->getElementType();
- unsigned align = Outs[OIdx].Flags.getByValAlign();
+ Align align = Outs[OIdx].Flags.getNonZeroByValAlign();
unsigned sz = DL.getTypeAllocSize(ETy);
- O << ".param .align " << align << " .b8 ";
+ O << ".param .align " << align.value() << " .b8 ";
O << "_";
O << "[" << sz << "]";
}
@@ -1368,31 +1351,29 @@ std::string NVPTXTargetLowering::getPrototype(
return O.str();
}
-unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
- ImmutableCallSite CS,
- Type *Ty, unsigned Idx,
- const DataLayout &DL) const {
- if (!CS) {
+Align NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
+ const CallBase *CB, Type *Ty,
+ unsigned Idx,
+ const DataLayout &DL) const {
+ if (!CB) {
// CallSite is zero, fall back to ABI type alignment
- return DL.getABITypeAlignment(Ty);
+ return DL.getABITypeAlign(Ty);
}
- unsigned Align = 0;
- const Value *DirectCallee = CS.getCalledFunction();
+ unsigned Alignment = 0;
+ const Function *DirectCallee = CB->getCalledFunction();
if (!DirectCallee) {
// We don't have a direct function symbol, but that may be because of
// constant cast instructions in the call.
- const Instruction *CalleeI = CS.getInstruction();
- assert(CalleeI && "Call target is not a function or derived value?");
// With bitcast'd call targets, the instruction will be the call
- if (isa<CallInst>(CalleeI)) {
+ if (const auto *CI = dyn_cast<CallInst>(CB)) {
// Check if we have call alignment metadata
- if (getAlign(*cast<CallInst>(CalleeI), Idx, Align))
- return Align;
+ if (getAlign(*CI, Idx, Alignment))
+ return Align(Alignment);
- const Value *CalleeV = cast<CallInst>(CalleeI)->getCalledValue();
+ const Value *CalleeV = CI->getCalledOperand();
// Ignore any bitcast instructions
while (isa<ConstantExpr>(CalleeV)) {
const ConstantExpr *CE = cast<ConstantExpr>(CalleeV);
@@ -1404,20 +1385,20 @@ unsigned NVPTXTargetLowering::getArgumentAlignment(SDValue Callee,
// We have now looked past all of the bitcasts. Do we finally have a
// Function?
- if (isa<Function>(CalleeV))
- DirectCallee = CalleeV;
+ if (const auto *CalleeF = dyn_cast<Function>(CalleeV))
+ DirectCallee = CalleeF;
}
}
// Check for function alignment information if we found that the
// ultimate target is a Function
if (DirectCallee)
- if (getAlign(*cast<Function>(DirectCallee), Idx, Align))
- return Align;
+ if (getAlign(*DirectCallee, Idx, Alignment))
+ return Align(Alignment);
// Call is indirect or alignment information is not available, fall back to
// the ABI type alignment
- return DL.getABITypeAlignment(Ty);
+ return DL.getABITypeAlign(Ty);
}
SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
@@ -1432,7 +1413,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool &isTailCall = CLI.IsTailCall;
ArgListTy &Args = CLI.getArgs();
Type *RetTy = CLI.RetTy;
- ImmutableCallSite CS = CLI.CS;
+ const CallBase *CB = CLI.CB;
const DataLayout &DL = DAG.getDataLayout();
bool isABI = (STI.getSmVersion() >= 20);
@@ -1465,15 +1446,14 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVector<EVT, 16> VTs;
SmallVector<uint64_t, 16> Offsets;
ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets);
- unsigned ArgAlign =
- getArgumentAlignment(Callee, CS, Ty, paramCount + 1, DL);
+ Align ArgAlign = getArgumentAlignment(Callee, CB, Ty, paramCount + 1, DL);
unsigned AllocSize = DL.getTypeAllocSize(Ty);
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
bool NeedAlign; // Does argument declaration specify alignment?
if (Ty->isAggregateType() || Ty->isVectorTy() || Ty->isIntegerTy(128)) {
// declare .param .align <align> .b8 .param<n>[<size>];
SDValue DeclareParamOps[] = {
- Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
+ Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
DAG.getConstant(paramCount, dl, MVT::i32),
DAG.getConstant(AllocSize, dl, MVT::i32), InFlag};
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
@@ -1554,8 +1534,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Adjust type of the store op if we've extended the scalar
// return value.
EVT TheStoreType = ExtendIntegerParam ? MVT::i32 : VTs[j];
- unsigned EltAlign =
- NeedAlign ? GreatestCommonDivisor64(ArgAlign, Offsets[j]) : 0;
+ MaybeAlign EltAlign;
+ if (NeedAlign)
+ EltAlign = commonAlignment(ArgAlign, Offsets[j]);
Chain = DAG.getMemIntrinsicNode(
Op, dl, DAG.getVTList(MVT::Other, MVT::Glue), StoreOperands,
@@ -1585,7 +1566,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// declare .param .align <align> .b8 .param<n>[<size>];
unsigned sz = Outs[OIdx].Flags.getByValSize();
SDVTList DeclareParamVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- unsigned ArgAlign = Outs[OIdx].Flags.getByValAlign();
+ Align ArgAlign = Outs[OIdx].Flags.getNonZeroByValAlign();
// The ByValAlign in the Outs[OIdx].Flags is always set at this point,
// so we don't need to worry about natural alignment or not.
// See TargetLowering::LowerCallTo().
@@ -1593,18 +1574,19 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Enforce minimum alignment of 4 to work around ptxas miscompile
// for sm_50+. See corresponding alignment adjustment in
// emitFunctionParamList() for details.
- if (ArgAlign < 4)
- ArgAlign = 4;
- SDValue DeclareParamOps[] = {Chain, DAG.getConstant(ArgAlign, dl, MVT::i32),
- DAG.getConstant(paramCount, dl, MVT::i32),
- DAG.getConstant(sz, dl, MVT::i32), InFlag};
+ if (ArgAlign < Align(4))
+ ArgAlign = Align(4);
+ SDValue DeclareParamOps[] = {
+ Chain, DAG.getConstant(ArgAlign.value(), dl, MVT::i32),
+ DAG.getConstant(paramCount, dl, MVT::i32),
+ DAG.getConstant(sz, dl, MVT::i32), InFlag};
Chain = DAG.getNode(NVPTXISD::DeclareParam, dl, DeclareParamVTs,
DeclareParamOps);
InFlag = Chain.getValue(1);
for (unsigned j = 0, je = VTs.size(); j != je; ++j) {
EVT elemtype = VTs[j];
int curOffset = Offsets[j];
- unsigned PartAlign = GreatestCommonDivisor64(ArgAlign, curOffset);
+ unsigned PartAlign = GreatestCommonDivisor64(ArgAlign.value(), curOffset);
auto PtrVT = getPointerTy(DL);
SDValue srcAddr = DAG.getNode(ISD::ADD, dl, PtrVT, OutVals[OIdx],
DAG.getConstant(curOffset, dl, PtrVT));
@@ -1618,10 +1600,9 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DAG.getConstant(paramCount, dl, MVT::i32),
DAG.getConstant(curOffset, dl, MVT::i32),
theVal, InFlag };
- Chain = DAG.getMemIntrinsicNode(NVPTXISD::StoreParam, dl, CopyParamVTs,
- CopyParamOps, elemtype,
- MachinePointerInfo(), /* Align */ 0,
- MachineMemOperand::MOStore);
+ Chain = DAG.getMemIntrinsicNode(
+ NVPTXISD::StoreParam, dl, CopyParamVTs, CopyParamOps, elemtype,
+ MachinePointerInfo(), /* Align */ None, MachineMemOperand::MOStore);
InFlag = Chain.getValue(1);
}
@@ -1629,7 +1610,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
GlobalAddressSDNode *Func = dyn_cast<GlobalAddressSDNode>(Callee.getNode());
- unsigned retAlignment = 0;
+ MaybeAlign retAlignment = None;
// Handle Result
if (Ins.size() > 0) {
@@ -1657,12 +1638,13 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
DeclareRetOps);
InFlag = Chain.getValue(1);
} else {
- retAlignment = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+ retAlignment = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
+ assert(retAlignment && "retAlignment is guaranteed to be set");
SDVTList DeclareRetVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- SDValue DeclareRetOps[] = { Chain,
- DAG.getConstant(retAlignment, dl, MVT::i32),
- DAG.getConstant(resultsz / 8, dl, MVT::i32),
- DAG.getConstant(0, dl, MVT::i32), InFlag };
+ SDValue DeclareRetOps[] = {
+ Chain, DAG.getConstant(retAlignment->value(), dl, MVT::i32),
+ DAG.getConstant(resultsz / 8, dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32), InFlag};
Chain = DAG.getNode(NVPTXISD::DeclareRetParam, dl, DeclareRetVTs,
DeclareRetOps);
InFlag = Chain.getValue(1);
@@ -1672,7 +1654,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Both indirect calls and libcalls have nullptr Func. In order to distinguish
// between them we must rely on the call site value which is valid for
// indirect calls but is always null for libcalls.
- bool isIndirectCall = !Func && CS;
+ bool isIndirectCall = !Func && CB;
if (isa<ExternalSymbolSDNode>(Callee)) {
Function* CalleeFunc = nullptr;
@@ -1695,7 +1677,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// The prototype is embedded in a string and put as the operand for a
// CallPrototype SDNode which will print out to the value of the string.
SDVTList ProtoVTs = DAG.getVTList(MVT::Other, MVT::Glue);
- std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, CS);
+ std::string Proto = getPrototype(DL, RetTy, Args, Outs, retAlignment, *CB);
const char *ProtoStr =
nvTM->getManagedStrPool()->getManagedString(Proto.c_str())->c_str();
SDValue ProtoOps[] = {
@@ -1768,7 +1750,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
ComputePTXValueVTs(*this, DL, RetTy, VTs, &Offsets, 0);
assert(VTs.size() == Ins.size() && "Bad value decomposition");
- unsigned RetAlign = getArgumentAlignment(Callee, CS, RetTy, 0, DL);
+ Align RetAlign = getArgumentAlignment(Callee, CB, RetTy, 0, DL);
auto VectorInfo = VectorizePTXValueVTs(VTs, Offsets, RetAlign);
SmallVector<EVT, 6> LoadVTs;
@@ -1784,7 +1766,7 @@ SDValue NVPTXTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool needTruncate = false;
EVT TheLoadType = VTs[i];
EVT EltType = Ins[i].VT;
- unsigned EltAlign = GreatestCommonDivisor64(RetAlign, Offsets[i]);
+ Align EltAlign = commonAlignment(RetAlign, Offsets[i]);
if (ExtendIntegerRetVal) {
TheLoadType = MVT::i32;
EltType = MVT::i32;
@@ -2320,10 +2302,10 @@ NVPTXTargetLowering::LowerSTOREVector(SDValue Op, SelectionDAG &DAG) const {
MemSDNode *MemSD = cast<MemSDNode>(N);
const DataLayout &TD = DAG.getDataLayout();
- unsigned Align = MemSD->getAlignment();
- unsigned PrefAlign =
- TD.getPrefTypeAlignment(ValVT.getTypeForEVT(*DAG.getContext()));
- if (Align < PrefAlign) {
+ Align Alignment = MemSD->getAlign();
+ Align PrefAlign =
+ TD.getPrefTypeAlign(ValVT.getTypeForEVT(*DAG.getContext()));
+ if (Alignment < PrefAlign) {
// This store is not sufficiently aligned, so bail out and let this vector
// store be scalarized. Note that we may still be able to emit smaller
// vector stores. For example, if we are storing a <4 x float> with an
@@ -2559,7 +2541,7 @@ SDValue NVPTXTargetLowering::LowerFormalArguments(
ComputePTXValueVTs(*this, DL, Ty, VTs, &Offsets, 0);
assert(VTs.size() > 0 && "Unexpected empty type.");
auto VectorInfo =
- VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlignment(Ty));
+ VectorizePTXValueVTs(VTs, Offsets, DL.getABITypeAlign(Ty));
SDValue Arg = getParamSymbol(DAG, idx, PtrVT);
int VecIdx = -1; // Index of the first element of the current vector.
@@ -2678,7 +2660,7 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
assert(VTs.size() == OutVals.size() && "Bad return value decomposition");
auto VectorInfo = VectorizePTXValueVTs(
- VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlignment(RetTy) : 1);
+ VTs, Offsets, RetTy->isSized() ? DL.getABITypeAlign(RetTy) : Align(1));
// PTX Interoperability Guide 3.3(A): [Integer] Values shorter than
// 32-bits are sign extended or zero extended, depending on whether
@@ -2730,10 +2712,9 @@ NVPTXTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Adjust type of load/store op if we've extended the scalar
// return value.
EVT TheStoreType = ExtendIntegerRetVal ? MVT::i32 : VTs[i];
- Chain = DAG.getMemIntrinsicNode(Op, dl, DAG.getVTList(MVT::Other),
- StoreOperands, TheStoreType,
- MachinePointerInfo(), /* Align */ 1,
- MachineMemOperand::MOStore);
+ Chain = DAG.getMemIntrinsicNode(
+ Op, dl, DAG.getVTList(MVT::Other), StoreOperands, TheStoreType,
+ MachinePointerInfo(), Align(1), MachineMemOperand::MOStore);
// Cleanup vector state.
StoreOperands.clear();
}
@@ -3799,8 +3780,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align =
- MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
+ Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
return true;
}
@@ -3819,8 +3799,7 @@ bool NVPTXTargetLowering::getTgtMemIntrinsic(
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.flags = MachineMemOperand::MOLoad;
- Info.align =
- MaybeAlign(cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
+ Info.align = cast<ConstantInt>(I.getArgOperand(1))->getMaybeAlignValue();
return true;
}
@@ -4810,11 +4789,10 @@ static void ReplaceLoadVector(SDNode *N, SelectionDAG &DAG,
LoadSDNode *LD = cast<LoadSDNode>(N);
- unsigned Align = LD->getAlignment();
+ Align Alignment = LD->getAlign();
auto &TD = DAG.getDataLayout();
- unsigned PrefAlign =
- TD.getPrefTypeAlignment(ResVT.getTypeForEVT(*DAG.getContext()));
- if (Align < PrefAlign) {
+ Align PrefAlign = TD.getPrefTypeAlign(ResVT.getTypeForEVT(*DAG.getContext()));
+ if (Alignment < PrefAlign) {
// This load is not sufficiently aligned, so bail out and let this vector
// load be scalarized. Note that we may still be able to emit smaller
// vector loads. For example, if we are loading a <4 x float> with an
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 546fe49808e2..df9cd4159962 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -491,8 +491,7 @@ public:
std::string getPrototype(const DataLayout &DL, Type *, const ArgListTy &,
const SmallVectorImpl<ISD::OutputArg> &,
- unsigned retAlignment,
- ImmutableCallSite CS) const;
+ MaybeAlign retAlignment, const CallBase &CB) const;
SDValue LowerReturn(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs,
@@ -579,8 +578,8 @@ private:
SelectionDAG &DAG) const override;
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- unsigned getArgumentAlignment(SDValue Callee, ImmutableCallSite CS, Type *Ty,
- unsigned Idx, const DataLayout &DL) const;
+ Align getArgumentAlignment(SDValue Callee, const CallBase *CB, Type *Ty,
+ unsigned Idx, const DataLayout &DL) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
index cff230289e60..ec0c92ccf5c5 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.cpp
@@ -69,7 +69,7 @@ void NVPTXInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
.addReg(SrcReg, getKillRegState(KillSrc));
}
-/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// analyzeBranch - Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
/// with the following information in various cases:
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
index 83039241a7c7..6cf59d285e8d 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerAggrCopies.cpp
@@ -113,8 +113,8 @@ bool NVPTXLowerAggrCopies::runOnFunction(Function &F) {
createMemCpyLoopKnownSize(/* ConvertedInst */ SI,
/* SrcAddr */ SrcAddr, /* DstAddr */ DstAddr,
/* CopyLen */ CopyLen,
- /* SrcAlign */ LI->getAlignment(),
- /* DestAlign */ SI->getAlignment(),
+ /* SrcAlign */ LI->getAlign(),
+ /* DestAlign */ SI->getAlign(),
/* SrcIsVolatile */ LI->isVolatile(),
/* DstIsVolatile */ SI->isVolatile(), TTI);
diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index c3c5f6fbcba7..e60b5eeacdae 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -159,12 +159,14 @@ void NVPTXLowerArgs::handleByValParam(Argument *Arg) {
assert(PType && "Expecting pointer type in handleByValParam");
Type *StructType = PType->getElementType();
- unsigned AS = Func->getParent()->getDataLayout().getAllocaAddrSpace();
+ const DataLayout &DL = Func->getParent()->getDataLayout();
+ unsigned AS = DL.getAllocaAddrSpace();
AllocaInst *AllocA = new AllocaInst(StructType, AS, Arg->getName(), FirstInst);
// Set the alignment to the alignment of the byval parameter. This is because
// later load/stores assume that alignment, and we are going to replace
// the use of the byval parameter with this alloca instruction.
- AllocA->setAlignment(MaybeAlign(Func->getParamAlignment(Arg->getArgNo())));
+ AllocA->setAlignment(Func->getParamAlign(Arg->getArgNo())
+ .getValueOr(DL.getPrefTypeAlign(StructType)));
Arg->replaceAllUsesWith(AllocA);
Value *ArgInParam = new AddrSpaceCastInst(
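
Aside (not part of the patch): the byval lowering above now consults the parameter's align attribute and only falls back to the preferred type alignment when the attribute is absent. A minimal sketch of that fallback, with an invented helper name:

// Sketch only: Function::getParamAlign() returns a MaybeAlign, so
// getValueOr() supplies DL.getPrefTypeAlign(StructType) only when the byval
// argument carries no explicit alignment.
#include "llvm/Support/Alignment.h"
#include <cassert>

static void byValAllocaAlignExample() {
  llvm::Align Pref(8);
  // An explicit "align 4" attribute on the byval parameter wins.
  assert(llvm::MaybeAlign(4).getValueOr(Pref).value() == 4);
  // With no attribute, the alloca gets the preferred type alignment instead
  // of being left without a stated alignment.
  assert(llvm::MaybeAlign().getValueOr(Pref).value() == 8);
}
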
diff --git a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
index a7127b0e9a99..ea2274f394e6 100644
--- a/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXPrologEpilogPass.cpp
@@ -67,14 +67,14 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
if (MI.isDebugValue()) {
assert(i == 0 && "Frame indices can only appear as the first "
"operand of a DBG_VALUE machine instruction");
- unsigned Reg;
+ Register Reg;
int64_t Offset =
TFI.getFrameIndexReference(MF, MI.getOperand(0).getIndex(), Reg);
MI.getOperand(0).ChangeToRegister(Reg, /*isDef=*/false);
MI.getOperand(0).setIsDebug();
auto *DIExpr = DIExpression::prepend(
MI.getDebugExpression(), DIExpression::ApplyOffset, Offset);
- MI.getOperand(3).setMetadata(DIExpr);
+ MI.getDebugExpressionOp().setMetadata(DIExpr);
continue;
}
@@ -97,22 +97,21 @@ bool NVPTXPrologEpilogPass::runOnMachineFunction(MachineFunction &MF) {
}
/// AdjustStackOffset - Helper function used to adjust the stack frame offset.
-static inline void
-AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
- bool StackGrowsDown, int64_t &Offset,
- unsigned &MaxAlign) {
+static inline void AdjustStackOffset(MachineFrameInfo &MFI, int FrameIdx,
+ bool StackGrowsDown, int64_t &Offset,
+ Align &MaxAlign) {
// If the stack grows down, add the object size to find the lowest address.
if (StackGrowsDown)
Offset += MFI.getObjectSize(FrameIdx);
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
+ Align Alignment = MFI.getObjectAlign(FrameIdx);
// If the alignment of this object is greater than that of the stack, then
// increase the stack alignment to match.
- MaxAlign = std::max(MaxAlign, Align);
+ MaxAlign = std::max(MaxAlign, Alignment);
// Adjust to alignment boundary.
- Offset = (Offset + Align - 1) / Align * Align;
+ Offset = alignTo(Offset, Alignment);
if (StackGrowsDown) {
LLVM_DEBUG(dbgs() << "alloc FI(" << FrameIdx << ") at SP[" << -Offset
@@ -169,7 +168,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// NOTE: We do not have a call stack
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
// No scavenger
@@ -178,10 +177,10 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// frame index registers. Functions which don't want/need this optimization
// will continue to use the existing code path.
if (MFI.getUseLocalStackAllocationBlock()) {
- unsigned Align = MFI.getLocalFrameMaxAlign().value();
+ Align Alignment = MFI.getLocalFrameMaxAlign();
// Adjust to alignment boundary.
- Offset = (Offset + Align - 1) / Align * Align;
+ Offset = alignTo(Offset, Alignment);
LLVM_DEBUG(dbgs() << "Local frame base offset: " << Offset << "\n");
@@ -196,7 +195,7 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// Allocate the local block
Offset += MFI.getLocalFrameSize();
- MaxAlign = std::max(Align, MaxAlign);
+ MaxAlign = std::max(Alignment, MaxAlign);
}
// No stack protector
@@ -227,18 +226,16 @@ NVPTXPrologEpilogPass::calculateFrameObjectOffsets(MachineFunction &Fn) {
// ensure that the callee's frame or the alloca data is suitably aligned;
// otherwise, for leaf functions, align to the TransientStackAlignment
// value.
- unsigned StackAlign;
+ Align StackAlign;
if (MFI.adjustsStack() || MFI.hasVarSizedObjects() ||
(RegInfo->needsStackRealignment(Fn) && MFI.getObjectIndexEnd() != 0))
- StackAlign = TFI.getStackAlignment();
+ StackAlign = TFI.getStackAlign();
else
- StackAlign = TFI.getTransientStackAlignment();
+ StackAlign = TFI.getTransientStackAlign();
// If the frame pointer is eliminated, all frame offsets will be relative to
// SP not FP. Align to MaxAlign so this works.
- StackAlign = std::max(StackAlign, MaxAlign);
- unsigned AlignMask = StackAlign - 1;
- Offset = (Offset + AlignMask) & ~uint64_t(AlignMask);
+ Offset = alignTo(Offset, std::max(StackAlign, MaxAlign));
}
// Update frame info to pretend that this is part of the stack...
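
Aside (not part of the patch): the stack-offset hunks above fold the hand-rolled round-up arithmetic into alignTo(). A standalone example of the equivalence:

// Illustration only: alignTo(Offset, A) rounds Offset up to the next multiple
// of A, matching both removed idioms,
//   (Offset + Align - 1) / Align * Align
//   (Offset + AlignMask) & ~AlignMask
#include "llvm/Support/Alignment.h"
#include <cassert>

static void alignToExample() {
  llvm::Align A(8);
  assert(llvm::alignTo(23, A) == 24); // rounds up
  assert(llvm::alignTo(32, A) == 32); // already aligned, unchanged
}
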
diff --git a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
index e213089e4085..8ae542130a14 100644
--- a/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXReplaceImageHandles.cpp
@@ -152,7 +152,7 @@ findIndexForHandle(MachineOperand &Op, MachineFunction &MF, unsigned &Idx) {
assert(TexHandleDef.getOperand(6).isSymbol() && "Load is not a symbol!");
StringRef Sym = TexHandleDef.getOperand(6).getSymbolName();
- std::string ParamBaseName = MF.getName();
+ std::string ParamBaseName = std::string(MF.getName());
ParamBaseName += "_param_";
assert(Sym.startswith(ParamBaseName) && "Invalid symbol reference");
unsigned Param = atoi(Sym.data()+ParamBaseName.size());
diff --git a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
index 357826c2d19c..f1fa6416f15f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXSubtarget.cpp
@@ -33,13 +33,13 @@ void NVPTXSubtarget::anchor() {}
NVPTXSubtarget &NVPTXSubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
// Provide the default CPU if we don't have one.
- TargetName = CPU.empty() ? "sm_20" : CPU;
+ TargetName = std::string(CPU.empty() ? "sm_20" : CPU);
- ParseSubtargetFeatures(TargetName, FS);
+ ParseSubtargetFeatures(TargetName, FS);
- // Set default to PTX 3.2 (CUDA 5.5)
- if (PTXVersion == 0) {
- PTXVersion = 32;
+ // Set default to PTX 3.2 (CUDA 5.5)
+ if (PTXVersion == 0) {
+ PTXVersion = 32;
}
return *this;
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
index 0778706d936a..85709eb731e2 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetMachine.cpp
@@ -117,7 +117,7 @@ NVPTXTargetMachine::NVPTXTargetMachine(const Target &T, const Triple &TT,
getEffectiveCodeModel(CM, CodeModel::Small), OL),
is64bit(is64bit), UseShortPointers(UseShortPointersOpt),
TLOF(std::make_unique<NVPTXTargetObjectFile>()),
- Subtarget(TT, CPU, FS, *this) {
+ Subtarget(TT, std::string(CPU), std::string(FS), *this) {
if (TT.getOS() == Triple::NVCL)
drvInterface = NVPTX::NVCL;
else
@@ -276,8 +276,6 @@ void NVPTXPassConfig::addIRPasses() {
addPass(createNVPTXLowerArgsPass(&getNVPTXTargetMachine()));
if (getOptLevel() != CodeGenOpt::None) {
addAddressSpaceInferencePasses();
- if (!DisableLoadStoreVectorizer)
- addPass(createLoadStoreVectorizerPass());
addStraightLineScalarOptimizationPasses();
}
@@ -295,8 +293,11 @@ void NVPTXPassConfig::addIRPasses() {
// %1 = shl %a, 2
//
// but EarlyCSE can do neither of them.
- if (getOptLevel() != CodeGenOpt::None)
+ if (getOptLevel() != CodeGenOpt::None) {
addEarlyCSEOrGVNPass();
+ if (!DisableLoadStoreVectorizer)
+ addPass(createLoadStoreVectorizerPass());
+ }
}
bool NVPTXPassConfig::addInstSelector() {
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
index ab2a93b75922..366d92a5a805 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetObjectFile.h
@@ -27,7 +27,7 @@ public:
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C,
- unsigned &Align) const override {
+ Align &Alignment) const override {
return ReadOnlySection;
}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
index afc40a7abed0..3873c73fb2e0 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.cpp
@@ -112,7 +112,8 @@ bool NVPTXTTIImpl::isSourceOfDivergence(const Value *V) {
}
int NVPTXTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
@@ -123,7 +124,8 @@ int NVPTXTTIImpl::getArithmeticInstrCost(
switch (ISD) {
default:
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
case ISD::ADD:
case ISD::MUL:
@@ -136,7 +138,8 @@ int NVPTXTTIImpl::getArithmeticInstrCost(
if (LT.second.SimpleTy == MVT::i64)
return 2 * LT.first;
// Delegate other cases to the basic TTI.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Opd1Info, Opd2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Opd1Info,
+ Opd2Info,
Opd1PropInfo, Opd2PropInfo);
}
}
@@ -152,3 +155,8 @@ void NVPTXTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Partial = UP.Runtime = true;
UP.PartialThreshold = UP.Threshold / 4;
}
+
+void NVPTXTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
diff --git a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
index 864d8b91a89a..cb832031f1ad 100644
--- a/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
+++ b/llvm/lib/Target/NVPTX/NVPTXTargetTransformInfo.h
@@ -50,13 +50,11 @@ public:
// Loads and stores can be vectorized if the alignment is at least as big as
// the load/store we want to vectorize.
- bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeLoadChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const {
return Alignment >= ChainSizeInBytes;
}
- bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes,
- unsigned Alignment,
+ bool isLegalToVectorizeStoreChain(unsigned ChainSizeInBytes, Align Alignment,
unsigned AddrSpace) const {
return isLegalToVectorizeLoadChain(ChainSizeInBytes, Alignment, AddrSpace);
}
@@ -87,6 +85,7 @@ public:
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -96,6 +95,10 @@ public:
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
bool hasVolatileVariant(Instruction *I, unsigned AddrSpace) {
// Volatile loads/stores are only supported for shared and global address
// spaces, or for generic AS that maps to them.
diff --git a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
index 43c2e9920403..74d129d330f3 100644
--- a/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXUtilities.cpp
@@ -226,17 +226,17 @@ bool isManaged(const Value &val) {
std::string getTextureName(const Value &val) {
assert(val.hasName() && "Found texture variable with no name");
- return val.getName();
+ return std::string(val.getName());
}
std::string getSurfaceName(const Value &val) {
assert(val.hasName() && "Found surface variable with no name");
- return val.getName();
+ return std::string(val.getName());
}
std::string getSamplerName(const Value &val) {
assert(val.hasName() && "Found sampler variable with no name");
- return val.getName();
+ return std::string(val.getName());
}
bool getMaxNTIDx(const Function &F, unsigned &x) {
diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
index 7e7902c27a81..13fd7d05ab9f 100644
--- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
+++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp
@@ -109,6 +109,8 @@ class PPCAsmParser : public MCTargetAsmParser {
bool MatchRegisterName(unsigned &RegNo, int64_t &IntVal);
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
const MCExpr *ExtractModifierFromExpr(const MCExpr *E,
PPCMCExpr::VariantKind &Variant);
@@ -356,6 +358,16 @@ public:
bool isS16ImmX16() const { return Kind == Expression ||
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 15) == 0); }
+ bool isS34ImmX16() const {
+ return Kind == Expression ||
+ (Kind == Immediate && isInt<34>(getImm()) && (getImm() & 15) == 0);
+ }
+ bool isS34Imm() const {
+ // Once the PC-Rel ABI is finalized, evaluate whether a 34-bit
+ // ContextImmediate is needed.
+ return Kind == Expression || (Kind == Immediate && isInt<34>(getImm()));
+ }
+
bool isS17Imm() const {
switch (Kind) {
case Expression:
@@ -388,6 +400,7 @@ public:
bool isCondBr() const { return Kind == Expression ||
(Kind == Immediate && isInt<16>(getImm()) &&
(getImm() & 3) == 0); }
+ bool isImmZero() const { return Kind == Immediate && getImm() == 0; }
bool isRegNumber() const { return Kind == Immediate && isUInt<5>(getImm()); }
bool isVSRegNumber() const {
return Kind == Immediate && isUInt<6>(getImm());
@@ -1142,7 +1155,7 @@ bool PPCAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
// Post-process instructions (typically extended mnemonics)
ProcessInstruction(Inst, Operands);
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature:
return Error(IDLoc, "instruction use requires an option to be enabled");
@@ -1210,14 +1223,22 @@ bool PPCAsmParser::MatchRegisterName(unsigned &RegNo, int64_t &IntVal) {
bool PPCAsmParser::
ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ if (tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success)
+ return TokError("invalid register name");
+ return false;
+}
+
+OperandMatchResultTy PPCAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
const AsmToken &Tok = getParser().getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
RegNo = 0;
int64_t IntVal;
if (MatchRegisterName(RegNo, IntVal))
- return TokError("invalid register name");
- return false;
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
}
/// Extract \code @l/@ha \endcode modifier from expression. Recursively scan
@@ -1380,7 +1401,7 @@ ParseExpression(const MCExpr *&EVal) {
PPCMCExpr::VariantKind Variant;
const MCExpr *E = ExtractModifierFromExpr(EVal, Variant);
if (E)
- EVal = PPCMCExpr::create(Variant, E, false, getParser().getContext());
+ EVal = PPCMCExpr::create(Variant, E, getParser().getContext());
return false;
}
@@ -1427,7 +1448,7 @@ ParseDarwinExpression(const MCExpr *&EVal) {
if (getLexer().isNot(AsmToken::RParen))
return Error(Parser.getTok().getLoc(), "expected ')'");
Parser.Lex(); // Eat the ')'
- EVal = PPCMCExpr::create(Variant, EVal, false, getParser().getContext());
+ EVal = PPCMCExpr::create(Variant, EVal, getParser().getContext());
}
return false;
}
@@ -1560,12 +1581,12 @@ bool PPCAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// instruction name, to match what TableGen is doing.
std::string NewOpcode;
if (parseOptionalToken(AsmToken::Plus)) {
- NewOpcode = Name;
+ NewOpcode = std::string(Name);
NewOpcode += '+';
Name = NewOpcode;
}
if (parseOptionalToken(AsmToken::Minus)) {
- NewOpcode = Name;
+ NewOpcode = std::string(Name);
NewOpcode += '-';
Name = NewOpcode;
}
@@ -1658,9 +1679,9 @@ bool PPCAsmParser::ParseDirectiveWord(unsigned Size, AsmToken ID) {
if (!isUIntN(8 * Size, IntValue) && !isIntN(8 * Size, IntValue))
return Error(ExprLoc, "literal value out of range for '" +
ID.getIdentifier() + "' directive");
- getStreamer().EmitIntValue(IntValue, Size);
+ getStreamer().emitIntValue(IntValue, Size);
} else
- getStreamer().EmitValue(Value, Size, ExprLoc);
+ getStreamer().emitValue(Value, Size, ExprLoc);
return false;
};
@@ -1681,7 +1702,7 @@ bool PPCAsmParser::ParseDirectiveTC(unsigned Size, AsmToken ID) {
return addErrorSuffix(" in '.tc' directive");
// Align to word size.
- getParser().getStreamer().EmitValueToAlignment(Size);
+ getParser().getStreamer().emitValueToAlignment(Size);
// Emit expressions.
return ParseDirectiveWord(Size, ID);
@@ -1710,10 +1731,10 @@ bool PPCAsmParser::ParseDirectiveMachine(SMLoc L) {
if (parseToken(AsmToken::EndOfStatement))
return addErrorSuffix(" in '.machine' directive");
- PPCTargetStreamer &TStreamer =
- *static_cast<PPCTargetStreamer *>(
- getParser().getStreamer().getTargetStreamer());
- TStreamer.emitMachine(CPU);
+ PPCTargetStreamer *TStreamer = static_cast<PPCTargetStreamer *>(
+ getParser().getStreamer().getTargetStreamer());
+ if (TStreamer != nullptr)
+ TStreamer->emitMachine(CPU);
return false;
}
@@ -1752,10 +1773,10 @@ bool PPCAsmParser::ParseDirectiveAbiVersion(SMLoc L) {
parseToken(AsmToken::EndOfStatement))
return addErrorSuffix(" in '.abiversion' directive");
- PPCTargetStreamer &TStreamer =
- *static_cast<PPCTargetStreamer *>(
- getParser().getStreamer().getTargetStreamer());
- TStreamer.emitAbiVersion(AbiVersion);
+ PPCTargetStreamer *TStreamer = static_cast<PPCTargetStreamer *>(
+ getParser().getStreamer().getTargetStreamer());
+ if (TStreamer != nullptr)
+ TStreamer->emitAbiVersion(AbiVersion);
return false;
}
@@ -1775,10 +1796,10 @@ bool PPCAsmParser::ParseDirectiveLocalEntry(SMLoc L) {
parseToken(AsmToken::EndOfStatement))
return addErrorSuffix(" in '.localentry' directive");
- PPCTargetStreamer &TStreamer =
- *static_cast<PPCTargetStreamer *>(
- getParser().getStreamer().getTargetStreamer());
- TStreamer.emitLocalEntry(Sym, Expr);
+ PPCTargetStreamer *TStreamer = static_cast<PPCTargetStreamer *>(
+ getParser().getStreamer().getTargetStreamer());
+ if (TStreamer != nullptr)
+ TStreamer->emitLocalEntry(Sym, Expr);
return false;
}
@@ -1830,23 +1851,23 @@ PPCAsmParser::applyModifierToExpr(const MCExpr *E,
MCContext &Ctx) {
switch (Variant) {
case MCSymbolRefExpr::VK_PPC_LO:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_LO, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_LO, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HI:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HI, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HA:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HA, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGH:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGH, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGH, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHA:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHA, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHA, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHER:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHER, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHERA:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHERA, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHERA, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHEST:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHEST, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHEST, E, Ctx);
case MCSymbolRefExpr::VK_PPC_HIGHESTA:
- return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHESTA, E, false, Ctx);
+ return PPCMCExpr::create(PPCMCExpr::VK_PPC_HIGHESTA, E, Ctx);
default:
return nullptr;
}
diff --git a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
index e3c0f958c7ed..74c6fd3733f0 100644
--- a/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
+++ b/llvm/lib/Target/PowerPC/Disassembler/PPCDisassembler.cpp
@@ -60,9 +60,16 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCDisassembler() {
createPPCLEDisassembler);
}
-static DecodeStatus DecodePCRel24BranchTarget(MCInst &Inst, unsigned Imm,
- uint64_t Addr,
- const void *Decoder) {
+static DecodeStatus decodeCondBrTarget(MCInst &Inst, unsigned Imm,
+ uint64_t /*Address*/,
+ const void * /*Decoder*/) {
+ Inst.addOperand(MCOperand::createImm(SignExtend32<14>(Imm)));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeDirectBrTarget(MCInst &Inst, unsigned Imm,
+ uint64_t /*Address*/,
+ const void * /*Decoder*/) {
int32_t Offset = SignExtend32<24>(Imm);
Inst.addOperand(MCOperand::createImm(Offset));
return MCDisassembler::Success;
@@ -191,6 +198,14 @@ static DecodeStatus decodeSImmOperand(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeImmZeroOperand(MCInst &Inst, uint64_t Imm,
+ int64_t Address, const void *Decoder) {
+ if (Imm != 0)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createImm(Imm));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeMemRIOperands(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// Decode the memri field (imm, reg), which has the low 16-bits as the
@@ -262,6 +277,35 @@ static DecodeStatus decodeMemRIX16Operands(MCInst &Inst, uint64_t Imm,
return MCDisassembler::Success;
}
+static DecodeStatus decodeMemRI34PCRelOperands(MCInst &Inst, uint64_t Imm,
+ int64_t Address,
+ const void *Decoder) {
+ // Decode the memri34_pcrel field (imm, reg), which has the low 34-bits as the
+ // displacement, and the next 5 bits as an immediate 0.
+ uint64_t Base = Imm >> 34;
+ uint64_t Disp = Imm & 0x3FFFFFFFFUL;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(SignExtend64<34>(Disp)));
+ return decodeImmZeroOperand(Inst, Base, Address, Decoder);
+}
+
+static DecodeStatus decodeMemRI34Operands(MCInst &Inst, uint64_t Imm,
+ int64_t Address,
+ const void *Decoder) {
+ // Decode the memri34 field (imm, reg), which has the low 34-bits as the
+ // displacement, and the next 5 bits as the register #.
+ uint64_t Base = Imm >> 34;
+ uint64_t Disp = Imm & 0x3FFFFFFFFUL;
+
+ assert(Base < 32 && "Invalid base register");
+
+ Inst.addOperand(MCOperand::createImm(SignExtend64<34>(Disp)));
+ Inst.addOperand(MCOperand::createReg(RRegsNoR0[Base]));
+ return MCDisassembler::Success;
+}
+
static DecodeStatus decodeSPE8Operands(MCInst &Inst, uint64_t Imm,
int64_t Address, const void *Decoder) {
// Decode the spe8disp field (imm, reg), which has the low 5-bits as the
@@ -324,6 +368,29 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
ArrayRef<uint8_t> Bytes,
uint64_t Address,
raw_ostream &CS) const {
+ auto *ReadFunc = IsLittleEndian ? support::endian::read32le
+ : support::endian::read32be;
+
+ // If this is an 8-byte prefixed instruction, handle it here.
+ // Note: prefixed instructions aren't technically 8-byte entities - the prefix
+ // appears in memory at an address 4 bytes prior to that of the base
+ // instruction regardless of endianness. So we read the two pieces and
+ // rebuild the 8-byte instruction.
+ // TODO: In this function we call decodeInstruction several times with
+ // different decoder tables. It may be possible to only call once by
+ // looking at the top 6 bits of the instruction.
+ if (STI.getFeatureBits()[PPC::FeaturePrefixInstrs] && Bytes.size() >= 8) {
+ uint32_t Prefix = ReadFunc(Bytes.data());
+ uint32_t BaseInst = ReadFunc(Bytes.data() + 4);
+ uint64_t Inst = BaseInst | (uint64_t)Prefix << 32;
+ DecodeStatus result = decodeInstruction(DecoderTable64, MI, Inst, Address,
+ this, STI);
+ if (result != MCDisassembler::Fail) {
+ Size = 8;
+ return result;
+ }
+ }
+
// Get the four bytes of the instruction.
Size = 4;
if (Bytes.size() < 4) {
@@ -332,8 +399,7 @@ DecodeStatus PPCDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
// Read the instruction in the proper endianness.
- uint32_t Inst = IsLittleEndian ? support::endian::read32le(Bytes.data())
- : support::endian::read32be(Bytes.data());
+ uint64_t Inst = ReadFunc(Bytes.data());
if (STI.getFeatureBits()[PPC::FeatureQPX]) {
DecodeStatus result =
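
Aside (not part of the patch): the disassembler changes above recombine the two 4-byte words of a prefixed instruction and introduce the memri34 operand decoders. The sketch below, with invented helper names, spells out the bit manipulation involved.

// Illustrative helpers only; not from this commit.
#include "llvm/Support/Endian.h"
#include "llvm/Support/MathExtras.h"
#include <cassert>
#include <cstdint>

// The prefix word always sits at the lower address, so the read order is the
// same for both endiannesses; only the per-word byte order differs.
static uint64_t rebuildPrefixedInst(const uint8_t *Bytes, bool IsLittleEndian) {
  auto *Read = IsLittleEndian ? llvm::support::endian::read32le
                              : llvm::support::endian::read32be;
  uint32_t Prefix = Read(Bytes);
  uint32_t BaseInst = Read(Bytes + 4);
  return BaseInst | (uint64_t)Prefix << 32;
}

// memri34: bits [38:34] hold the base register, bits [33:0] the signed
// displacement.
static void memRI34Example() {
  uint64_t Imm = ((uint64_t)3 << 34) | ((uint64_t)(-8) & 0x3FFFFFFFFULL);
  assert((Imm >> 34) == 3);
  assert(llvm::SignExtend64<34>(Imm & 0x3FFFFFFFFULL) == -8);
}
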
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
index 8778e916f7e4..dbaf221db9fc 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCAsmBackend.cpp
@@ -28,7 +28,6 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
case FK_Data_1:
case FK_Data_2:
case FK_Data_4:
@@ -40,11 +39,14 @@ static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
return Value & 0xfffc;
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
+ case PPC::fixup_ppc_br24_notoc:
return Value & 0x3fffffc;
case PPC::fixup_ppc_half16:
return Value & 0xffff;
case PPC::fixup_ppc_half16ds:
return Value & 0xfffc;
+ case PPC::fixup_ppc_pcrel34:
+ return Value & 0x3ffffffff;
}
}
@@ -52,8 +54,6 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
switch (Kind) {
default:
llvm_unreachable("Unknown fixup kind!");
- case FK_NONE:
- return 0;
case FK_Data_1:
return 1;
case FK_Data_2:
@@ -65,7 +65,9 @@ static unsigned getFixupKindNumBytes(unsigned Kind) {
case PPC::fixup_ppc_brcond14abs:
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
+ case PPC::fixup_ppc_br24_notoc:
return 4;
+ case PPC::fixup_ppc_pcrel34:
case FK_Data_8:
return 8;
case PPC::fixup_ppc_nofixup:
@@ -91,24 +93,33 @@ public:
const static MCFixupKindInfo InfosBE[PPC::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_ppc_br24", 6, 24, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_ppc_br24_notoc", 6, 24, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_brcond14", 16, 14, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_br24abs", 6, 24, 0 },
{ "fixup_ppc_brcond14abs", 16, 14, 0 },
{ "fixup_ppc_half16", 0, 16, 0 },
{ "fixup_ppc_half16ds", 0, 14, 0 },
+ { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_nofixup", 0, 0, 0 }
};
const static MCFixupKindInfo InfosLE[PPC::NumTargetFixupKinds] = {
// name offset bits flags
{ "fixup_ppc_br24", 2, 24, MCFixupKindInfo::FKF_IsPCRel },
+ { "fixup_ppc_br24_notoc", 2, 24, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_brcond14", 2, 14, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_br24abs", 2, 24, 0 },
{ "fixup_ppc_brcond14abs", 2, 14, 0 },
{ "fixup_ppc_half16", 0, 16, 0 },
{ "fixup_ppc_half16ds", 2, 14, 0 },
+ { "fixup_ppc_pcrel34", 0, 34, MCFixupKindInfo::FKF_IsPCRel },
{ "fixup_ppc_nofixup", 0, 0, 0 }
};
+ // Fixup kinds from .reloc directive are like R_PPC_NONE/R_PPC64_NONE. They
+ // do not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -123,11 +134,14 @@ public:
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const override {
- Value = adjustFixupValue(Fixup.getKind(), Value);
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ Value = adjustFixupValue(Kind, Value);
if (!Value) return; // Doesn't change encoding.
unsigned Offset = Fixup.getOffset();
- unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ unsigned NumBytes = getFixupKindNumBytes(Kind);
// For each byte of the fragment that the fixup touches, mask in the bits
// from the fixup value. The Value has been "split up" into the appropriate
@@ -140,13 +154,13 @@ public:
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override {
- switch ((unsigned)Fixup.getKind()) {
+ MCFixupKind Kind = Fixup.getKind();
+ switch ((unsigned)Kind) {
default:
- return false;
- case FK_NONE:
- return true;
+ return Kind >= FirstLiteralRelocationKind;
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
+ case PPC::fixup_ppc_br24_notoc:
// If the target symbol has a local entry point we must not attempt
// to resolve the fixup directly. Emit a relocation and leave
// resolution of the final target address to the linker.
@@ -178,8 +192,8 @@ public:
llvm_unreachable("relaxInstruction() unimplemented");
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
// FIXME.
llvm_unreachable("relaxInstruction() unimplemented");
}
@@ -200,21 +214,6 @@ public:
// FIXME: This should be in a separate file.
namespace {
-class DarwinPPCAsmBackend : public PPCAsmBackend {
-public:
- DarwinPPCAsmBackend(const Target &T, const Triple &TT)
- : PPCAsmBackend(T, TT) {}
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- bool Is64 = TT.isPPC64();
- return createPPCMachObjectWriter(
- /*Is64Bit=*/Is64,
- (Is64 ? MachO::CPU_TYPE_POWERPC64 : MachO::CPU_TYPE_POWERPC),
- MachO::CPU_SUBTYPE_POWERPC_ALL);
- }
-};
-
class ELFPPCAsmBackend : public PPCAsmBackend {
public:
ELFPPCAsmBackend(const Target &T, const Triple &TT) : PPCAsmBackend(T, TT) {}
@@ -243,14 +242,25 @@ public:
} // end anonymous namespace
Optional<MCFixupKind> ELFPPCAsmBackend::getFixupKind(StringRef Name) const {
- if (TT.isPPC64()) {
- if (Name == "R_PPC64_NONE")
- return FK_NONE;
- } else {
- if (Name == "R_PPC_NONE")
- return FK_NONE;
+ if (TT.isOSBinFormatELF()) {
+ unsigned Type;
+ if (TT.isPPC64()) {
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/PowerPC64.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ } else {
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/PowerPC.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ }
+ if (Type != -1u)
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
- return MCAsmBackend::getFixupKind(Name);
+ return None;
}
MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
@@ -258,9 +268,6 @@ MCAsmBackend *llvm::createPPCAsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TT = STI.getTargetTriple();
- if (TT.isOSDarwin())
- return new DarwinPPCAsmBackend(T, TT);
-
if (TT.isOSBinFormatXCOFF())
return new XCOFFPPCAsmBackend(T, TT);
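
Aside (not part of the patch): the .reloc handling above maps textual ELF relocation names to fixup kinds at or above FirstLiteralRelocationKind, and the object writer later subtracts that base to emit the raw relocation type. A small sketch of that round trip, with an invented function name:

// Sketch only: a literal relocation kind carries the raw ELF relocation type
// offset by FirstLiteralRelocationKind, so no target-specific fixup handling
// is applied to it.
#include "llvm/MC/MCFixup.h"
#include <cassert>

static void literalRelocRoundTrip(unsigned ELFType) {
  auto Kind = static_cast<llvm::MCFixupKind>(llvm::FirstLiteralRelocationKind +
                                             ELFType);
  assert(Kind >= llvm::FirstLiteralRelocationKind);
  assert(unsigned(Kind - llvm::FirstLiteralRelocationKind) == ELFType);
}
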
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
index 20f752c3041a..d8b3301e97f1 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFObjectWriter.cpp
@@ -73,6 +73,9 @@ static MCSymbolRefExpr::VariantKind getAccessVariant(const MCValue &Target,
unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
MCSymbolRefExpr::VariantKind Modifier = getAccessVariant(Target, Fixup);
// determine the type of the relocation
@@ -83,6 +86,7 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
llvm_unreachable("Unimplemented");
case PPC::fixup_ppc_br24:
case PPC::fixup_ppc_br24abs:
+ case PPC::fixup_ppc_br24_notoc:
switch (Modifier) {
default: llvm_unreachable("Unsupported Modifier");
case MCSymbolRefExpr::VK_None:
@@ -94,6 +98,9 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
case MCSymbolRefExpr::VK_PPC_LOCAL:
Type = ELF::R_PPC_LOCAL24PC;
break;
+ case MCSymbolRefExpr::VK_PPC_NOTOC:
+ Type = ELF::R_PPC64_REL24_NOTOC;
+ break;
}
break;
case PPC::fixup_ppc_brcond14:
@@ -121,6 +128,18 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
Target.print(errs());
errs() << '\n';
report_fatal_error("Invalid PC-relative half16ds relocation");
+ case PPC::fixup_ppc_pcrel34:
+ switch (Modifier) {
+ default:
+ llvm_unreachable("Unsupported Modifier for fixup_ppc_pcrel34");
+ case MCSymbolRefExpr::VK_PCREL:
+ Type = ELF::R_PPC64_PCREL34;
+ break;
+ case MCSymbolRefExpr::VK_PPC_GOT_PCREL:
+ Type = ELF::R_PPC64_GOT_PCREL34;
+ break;
+ }
+ break;
case FK_Data_4:
case FK_PCRel_4:
Type = ELF::R_PPC_REL32;
@@ -133,9 +152,6 @@ unsigned PPCELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
} else {
switch (Fixup.getTargetKind()) {
default: llvm_unreachable("invalid fixup kind!");
- case FK_NONE:
- Type = ELF::R_PPC_NONE;
- break;
case PPC::fixup_ppc_br24abs:
Type = ELF::R_PPC_ADDR24;
break;
@@ -431,6 +447,7 @@ bool PPCELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
return false;
case ELF::R_PPC_REL24:
+ case ELF::R_PPC64_REL24_NOTOC:
// If the target symbol has a local entry point, we must keep the
// target symbol to preserve that information for the linker.
// The "other" values are stored in the last 6 bits of the second byte.
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
new file mode 100644
index 000000000000..4373778cc96c
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.cpp
@@ -0,0 +1,112 @@
+//===-------- PPCELFStreamer.cpp - ELF Object Output ---------------------===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer for PowerPC.
+//
+// The purpose of the custom ELF streamer is to allow us to intercept
+// instructions as they are being emitted and align all 8 byte instructions
+// to a 64 byte boundary if required (by adding a 4 byte nop). This is important
+// because 8 byte instructions are not allowed to cross 64 byte boundaries
+// and by aligning anything that is within 4 bytes of the boundary we can
+// guarantee that the 8 byte instructions do not cross that boundary.
+//
+//===----------------------------------------------------------------------===//
+
+
+#include "PPCELFStreamer.h"
+#include "PPCInstrInfo.h"
+#include "PPCMCCodeEmitter.h"
+#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrDesc.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/SourceMgr.h"
+
+using namespace llvm;
+
+PPCELFStreamer::PPCELFStreamer(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter)
+ : MCELFStreamer(Context, std::move(MAB), std::move(OW),
+                  std::move(Emitter)), LastLabel(nullptr) {
+}
+
+void PPCELFStreamer::emitPrefixedInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ // Prefixed instructions must not cross a 64-byte boundary (i.e. prefix is
+ // before the boundary and the remaining 4-bytes are after the boundary). In
+ // order to achieve this, a nop is added prior to any such boundary-crossing
+ // prefixed instruction. Align to 64 bytes if possible but add a maximum of 4
+ // bytes when trying to do that. If alignment requires adding more than 4
+ // bytes, then the instruction won't be aligned. When emitting a code
+ // alignment, a new fragment is created for this alignment. This fragment
+ // will contain all of the nops required as part of the alignment operation.
+ // In the cases where no nops are added, the fragment is still created but
+ // remains empty.
+ emitCodeAlignment(64, 4);
+
+ // Emit the instruction.
+ // Since the previous emit created a new fragment then adding this instruction
+ // also forces the addition of a new fragment. Inst is now the first
+ // instruction in that new fragment.
+ MCELFStreamer::emitInstruction(Inst, STI);
+
+ // The above instruction is forced to start a new fragment because it
+ // comes after a code alignment fragment. Get that new fragment.
+ MCFragment *InstructionFragment = getCurrentFragment();
+ SMLoc InstLoc = Inst.getLoc();
+ // Check if there was a last label emitted.
+ if (LastLabel && !LastLabel->isUnset() && LastLabelLoc.isValid() &&
+ InstLoc.isValid()) {
+ const SourceMgr *SourceManager = getContext().getSourceManager();
+ unsigned InstLine = SourceManager->FindLineNumber(InstLoc);
+ unsigned LabelLine = SourceManager->FindLineNumber(LastLabelLoc);
+ // If the Label and the Instruction are on the same line then move the
+ // label to the top of the fragment containing the aligned instruction that
+ // was just added.
+ if (InstLine == LabelLine) {
+ AssignFragment(LastLabel, InstructionFragment);
+ LastLabel->setOffset(0);
+ }
+ }
+}
+
+void PPCELFStreamer::emitInstruction(const MCInst &Inst,
+ const MCSubtargetInfo &STI) {
+ PPCMCCodeEmitter *Emitter =
+ static_cast<PPCMCCodeEmitter*>(getAssembler().getEmitterPtr());
+
+ // Special handling is only for prefixed instructions.
+ if (!Emitter->isPrefixedInstruction(Inst)) {
+ MCELFStreamer::emitInstruction(Inst, STI);
+ return;
+ }
+ emitPrefixedInstruction(Inst, STI);
+}
+
+void PPCELFStreamer::emitLabel(MCSymbol *Symbol, SMLoc Loc) {
+ LastLabel = Symbol;
+ LastLabelLoc = Loc;
+ MCELFStreamer::emitLabel(Symbol);
+}
+
+MCELFStreamer *llvm::createPPCELFStreamer(
+ MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter) {
+ return new PPCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
+}
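
Aside (not part of the patch): to make the alignment reasoning in emitPrefixedInstruction concrete, the standalone check below shows that at most one 4-byte nop is ever needed. With 4-byte instruction alignment, an 8-byte prefixed instruction can only straddle a 64-byte boundary when it would start at offset 60 modulo 64; the helper name is invented and merely mirrors "align to 64 bytes, but emit at most 4 bytes of padding".

// Standalone illustration of the emitCodeAlignment(64, 4) reasoning.
#include <cassert>
#include <cstdint>

static uint64_t placePrefixedInst(uint64_t Offset) {
  uint64_t Pad = (64 - Offset % 64) % 64;
  return Pad <= 4 ? Offset + Pad : Offset;
}

static void checkNoBoundaryCrossing() {
  for (uint64_t Offset = 0; Offset < 256; Offset += 4) {
    uint64_t Start = placePrefixedInst(Offset);
    // The prefix word and the base word must land in the same 64-byte block.
    assert(Start / 64 == (Start + 7) / 64);
  }
}
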
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
new file mode 100644
index 000000000000..51863232d071
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCELFStreamer.h
@@ -0,0 +1,54 @@
+//===- PPCELFStreamer.h - ELF Object Output --------------------*- C++ -*-===//
+//
+// The LLVM Compiler Infrastructure
+//
+// This file is distributed under the University of Illinois Open Source
+// License. See LICENSE.TXT for details.
+//
+//===----------------------------------------------------------------------===//
+//
+// This is a custom MCELFStreamer for PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_PPC_MCELFSTREAMER_PPCELFSTREAMER_H
+#define LLVM_LIB_TARGET_PPC_MCELFSTREAMER_PPCELFSTREAMER_H
+
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/MC/MCELFStreamer.h"
+#include <memory>
+
+namespace llvm {
+
+class MCAsmBackend;
+class MCCodeEmitter;
+class MCContext;
+class MCSubtargetInfo;
+
+class PPCELFStreamer : public MCELFStreamer {
+ // We need to keep track of the last label we emitted (only one) because
+ // depending on whether the label is on the same line as an aligned
+ // instruction or not, the label may refer to the instruction or the nop.
+ MCSymbol *LastLabel;
+ SMLoc LastLabelLoc;
+
+public:
+ PPCELFStreamer(MCContext &Context, std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter);
+
+ void emitInstruction(const MCInst &Inst, const MCSubtargetInfo &STI) override;
+
+ // emitLabel updates LastLabel and LastLabelLoc when a new label is emitted.
+ void emitLabel(MCSymbol *Symbol, SMLoc Loc = SMLoc()) override;
+private:
+ void emitPrefixedInstruction(const MCInst &Inst, const MCSubtargetInfo &STI);
+};
+
+MCELFStreamer *createPPCELFStreamer(MCContext &Context,
+ std::unique_ptr<MCAsmBackend> MAB,
+ std::unique_ptr<MCObjectWriter> OW,
+ std::unique_ptr<MCCodeEmitter> Emitter);
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_PPC_MCELFSTREAMER_PPCELFSTREAMER_H
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
index 845489788c86..2fb8947fd4e0 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCFixupKinds.h
@@ -19,6 +19,10 @@ enum Fixups {
// 24-bit PC relative relocation for direct branches like 'b' and 'bl'.
fixup_ppc_br24 = FirstTargetFixupKind,
+ // 24-bit PC relative relocation for direct branches like 'b' and 'bl' where
+ // the caller does not use the TOC.
+ fixup_ppc_br24_notoc,
+
/// 14-bit PC relative relocation for conditional branches.
fixup_ppc_brcond14,
@@ -36,6 +40,9 @@ enum Fixups {
/// instrs like 'std'.
fixup_ppc_half16ds,
+ // A 34-bit fixup corresponding to PC-relative paddi.
+ fixup_ppc_pcrel34,
+
/// Not a true fixup, but ties a symbol to a call to __tls_get_addr for the
/// TLS general and local dynamic models, or inserts the thread-pointer
/// register number.
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
index 9cc1c539e24a..16da62a74b8c 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.cpp
@@ -116,16 +116,6 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
}
- if ((MI->getOpcode() == PPC::OR || MI->getOpcode() == PPC::OR8) &&
- MI->getOperand(1).getReg() == MI->getOperand(2).getReg()) {
- O << "\tmr ";
- printOperand(MI, 0, O);
- O << ", ";
- printOperand(MI, 1, O);
- printAnnotation(O, Annot);
- return;
- }
-
if (MI->getOpcode() == PPC::RLDICR ||
MI->getOpcode() == PPC::RLDICR_32) {
unsigned char SH = MI->getOperand(2).getImm();
@@ -193,7 +183,7 @@ void PPCInstPrinter::printInst(const MCInst *MI, uint64_t Address,
}
}
- if (!printAliasInstr(MI, O))
+ if (!printAliasInstr(MI, Address, O))
printInstruction(MI, Address, O);
printAnnotation(O, Annot);
}
@@ -339,6 +329,13 @@ void PPCInstPrinter::printS5ImmOperand(const MCInst *MI, unsigned OpNo,
O << (int)Value;
}
+void PPCInstPrinter::printImmZeroOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ unsigned int Value = MI->getOperand(OpNo).getImm();
+ assert(Value == 0 && "Operand must be zero");
+ O << (unsigned int)Value;
+}
+
void PPCInstPrinter::printU5ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
unsigned int Value = MI->getOperand(OpNo).getImm();
@@ -391,6 +388,17 @@ void PPCInstPrinter::printS16ImmOperand(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, O);
}
+void PPCInstPrinter::printS34ImmOperand(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ if (MI->getOperand(OpNo).isImm()) {
+ long long Value = MI->getOperand(OpNo).getImm();
+ assert(isInt<34>(Value) && "Invalid s34imm argument!");
+ O << (long long)Value;
+  } else
+ printOperand(MI, OpNo, O);
+}
+
void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
if (MI->getOperand(OpNo).isImm())
@@ -399,18 +407,29 @@ void PPCInstPrinter::printU16ImmOperand(const MCInst *MI, unsigned OpNo,
printOperand(MI, OpNo, O);
}
-void PPCInstPrinter::printBranchOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+void PPCInstPrinter::printBranchOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpNo, raw_ostream &O) {
if (!MI->getOperand(OpNo).isImm())
return printOperand(MI, OpNo, O);
-
- // Branches can take an immediate operand. This is used by the branch
- // selection pass to print .+8, an eight byte displacement from the PC.
- O << ".";
int32_t Imm = SignExtend32<32>((unsigned)MI->getOperand(OpNo).getImm() << 2);
- if (Imm >= 0)
- O << "+";
- O << Imm;
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + Imm;
+ if (!TT.isPPC64())
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ } else {
+ // Branches can take an immediate operand. This is used by the branch
+ // selection pass to print, for example `.+8` (for ELF) or `$+8` (for AIX)
+ // to express an eight byte displacement from the program counter.
+ if (!TT.isOSAIX())
+ O << ".";
+ else
+ O << "$";
+
+ if (Imm >= 0)
+ O << "+";
+ O << Imm;
+ }
}
void PPCInstPrinter::printAbsBranchOperand(const MCInst *MI, unsigned OpNo,
@@ -451,6 +470,22 @@ void PPCInstPrinter::printMemRegImm(const MCInst *MI, unsigned OpNo,
O << ')';
}
+void PPCInstPrinter::printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printS34ImmOperand(MI, OpNo, O);
+ O << '(';
+ printImmZeroOperand(MI, OpNo + 1, O);
+ O << ')';
+}
+
+void PPCInstPrinter::printMemRegImm34(const MCInst *MI, unsigned OpNo,
+ raw_ostream &O) {
+ printS34ImmOperand(MI, OpNo, O);
+ O << '(';
+ printOperand(MI, OpNo + 1, O);
+ O << ')';
+}
+
void PPCInstPrinter::printMemRegReg(const MCInst *MI, unsigned OpNo,
raw_ostream &O) {
// When used as the base register, r0 reads constant zero rather than
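
With PrintBranchImmAsAddress, the printer above turns a branch immediate into an absolute target. A minimal sketch of that arithmetic (not from the patch; branchTarget and its inputs are illustrative):

// Sketch: a PPC branch encodes a signed word offset; the printed target is the
// instruction address plus four times that offset, truncated to 32 bits on
// 32-bit targets.
#include <cstdint>
#include <cstdio>

static uint64_t branchTarget(uint64_t Address, int32_t WordOffset, bool IsPPC64) {
  uint64_t Target = Address + static_cast<int64_t>(WordOffset) * 4;
  if (!IsPPC64)
    Target &= 0xffffffff; // 32-bit address space wraps
  return Target;
}

int main() {
  std::printf("0x%llx\n",
              static_cast<unsigned long long>(branchTarget(0x10000200, -4, true)));
  return 0;
}
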
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
index a3ec41aa348d..9763aeceef94 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCInstPrinter.h
@@ -39,9 +39,9 @@ public:
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
raw_ostream &OS);
void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
@@ -61,14 +61,19 @@ public:
void printU10ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU12ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printS16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printS34ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printU16ImmOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
- void printBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printImmZeroOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printBranchOperand(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ raw_ostream &O);
void printAbsBranchOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printTLSCall(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printcrbitm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemRegImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemRegImm34PCRel(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printMemRegImm34(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printMemRegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
};
} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
index dc2c216a3efd..593dc2843c3d 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.cpp
@@ -12,6 +12,7 @@
#include "PPCMCAsmInfo.h"
#include "llvm/ADT/Triple.h"
+#include <cassert>
using namespace llvm;
@@ -50,15 +51,15 @@ PPCELFMCAsmInfo::PPCELFMCAsmInfo(bool is64Bit, const Triple& T) {
Data64bitsDirective = is64Bit ? "\t.quad\t" : nullptr;
AssemblerDialect = 1; // New-Style mnemonics.
LCOMMDirectiveAlignmentType = LCOMM::ByteAlignment;
-
- UseIntegratedAssembler = true;
}
void PPCXCOFFMCAsmInfo::anchor() {}
PPCXCOFFMCAsmInfo::PPCXCOFFMCAsmInfo(bool Is64Bit, const Triple &T) {
- assert(!IsLittleEndian && "Little-endian XCOFF not supported.");
+ if (T.getArch() == Triple::ppc64le)
+ report_fatal_error("XCOFF is not supported for little-endian targets");
CodePointerSize = CalleeSaveStackSlotSize = Is64Bit ? 8 : 4;
- ZeroDirective = "\t.space\t";
- SymbolsHaveSMC = true;
+
+  // A size of 8 is only supported by the assembler in 64-bit mode.
+ Data64bitsDirective = Is64Bit ? "\t.vbyte\t8, " : nullptr;
}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
index 8c52bbbd8a56..27c687686641 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCAsmInfo.h
@@ -28,7 +28,7 @@ public:
};
class PPCXCOFFMCAsmInfo : public MCAsmInfoXCOFF {
- virtual void anchor();
+ void anchor() override;
public:
explicit PPCXCOFFMCAsmInfo(bool is64Bit, const Triple &);
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
index 676efc500455..fb65e7320f2b 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.cpp
@@ -48,7 +48,9 @@ getDirectBrEncoding(const MCInst &MI, unsigned OpNo,
// Add a fixup for the branch target.
Fixups.push_back(MCFixup::create(0, MO.getExpr(),
- (MCFixupKind)PPC::fixup_ppc_br24));
+ ((MI.getOpcode() == PPC::BL8_NOTOC)
+ ? (MCFixupKind)PPC::fixup_ppc_br24_notoc
+ : (MCFixupKind)PPC::fixup_ppc_br24)));
return 0;
}
@@ -102,6 +104,20 @@ unsigned PPCMCCodeEmitter::getImm16Encoding(const MCInst &MI, unsigned OpNo,
return 0;
}
+uint64_t
+PPCMCCodeEmitter::getImm34Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ // Add a fixup for the immediate field.
+ Fixups.push_back(MCFixup::create(0, MO.getExpr(),
+ (MCFixupKind)PPC::fixup_ppc_pcrel34));
+ return 0;
+}
+
unsigned PPCMCCodeEmitter::getMemRIEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -159,6 +175,104 @@ unsigned PPCMCCodeEmitter::getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
return RegBits;
}
+uint64_t
+PPCMCCodeEmitter::getMemRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ // Encode the PCRelative version of memri34: imm34(r0).
+ // In the PC relative version the register for the address must be zero.
+ // The 34 bit immediate can fall into one of three cases:
+ // 1) It is a relocation to be filled in by the linker represented as:
+ // (MCExpr::SymbolRef)
+ // 2) It is a relocation + SignedOffset represented as:
+ // (MCExpr::Binary(MCExpr::SymbolRef + MCExpr::Constant))
+ // 3) It is a known value at compile time.
+
+  // Make sure that the register is zero, as expected.
+ assert(MI.getOperand(OpNo + 1).isImm() && "Expecting an immediate.");
+ uint64_t RegBits =
+ getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI) << 34;
+ assert(RegBits == 0 && "Operand must be 0.");
+
+  // If this is not an MCExpr then we are in case 3) and we are dealing with
+ // a value known at compile time, not a relocation.
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (!MO.isExpr())
+ return ((getMachineOpValue(MI, MO, Fixups, STI)) & 0x3FFFFFFFFUL) | RegBits;
+
+ // At this point in the function it is known that MO is of type MCExpr.
+ // Therefore we are dealing with either case 1) a symbol ref or
+ // case 2) a symbol ref plus a constant.
+ const MCExpr *Expr = MO.getExpr();
+ switch (Expr->getKind()) {
+ default:
+ llvm_unreachable("Unsupported MCExpr for getMemRI34PCRelEncoding.");
+ case MCExpr::SymbolRef: {
+ // Relocation alone.
+ const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(Expr);
+ (void)SRE;
+ // Currently these are the only valid PCRelative Relocations.
+ assert((SRE->getKind() == MCSymbolRefExpr::VK_PCREL ||
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_PCREL) &&
+ "VariantKind must be VK_PCREL or VK_PPC_GOT_PCREL");
+ // Generate the fixup for the relocation.
+ Fixups.push_back(
+ MCFixup::create(0, Expr,
+ static_cast<MCFixupKind>(PPC::fixup_ppc_pcrel34)));
+ // Put zero in the location of the immediate. The linker will fill in the
+ // correct value based on the relocation.
+ return 0;
+ }
+ case MCExpr::Binary: {
+ // Relocation plus some offset.
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ assert(BE->getOpcode() == MCBinaryExpr::Add &&
+ "Binary expression opcode must be an add.");
+
+ const MCExpr *LHS = BE->getLHS();
+ const MCExpr *RHS = BE->getRHS();
+
+ // Need to check in both directions. Reloc+Offset and Offset+Reloc.
+ if (LHS->getKind() != MCExpr::SymbolRef)
+ std::swap(LHS, RHS);
+
+ if (LHS->getKind() != MCExpr::SymbolRef ||
+ RHS->getKind() != MCExpr::Constant)
+ llvm_unreachable("Expecting to have one constant and one relocation.");
+
+ const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(LHS);
+ (void)SRE;
+ assert(isInt<34>(cast<MCConstantExpr>(RHS)->getValue()) &&
+ "Value must fit in 34 bits.");
+
+ // Currently these are the only valid PCRelative Relocations.
+ assert((SRE->getKind() == MCSymbolRefExpr::VK_PCREL ||
+ SRE->getKind() == MCSymbolRefExpr::VK_PPC_GOT_PCREL) &&
+ "VariantKind must be VK_PCREL or VK_PPC_GOT_PCREL");
+ // Generate the fixup for the relocation.
+ Fixups.push_back(
+ MCFixup::create(0, Expr,
+ static_cast<MCFixupKind>(PPC::fixup_ppc_pcrel34)));
+ // Put zero in the location of the immediate. The linker will fill in the
+ // correct value based on the relocation.
+ return 0;
+ }
+ }
+}
+
+uint64_t
+PPCMCCodeEmitter::getMemRI34Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+  // Encode (imm, reg) as a memri34, which has the low 34 bits as the
+  // displacement and the next 5 bits as the register number.
+ assert(MI.getOperand(OpNo + 1).isReg() && "Expecting a register.");
+ uint64_t RegBits = getMachineOpValue(MI, MI.getOperand(OpNo + 1), Fixups, STI)
+ << 34;
+ const MCOperand &MO = MI.getOperand(OpNo);
+ return ((getMachineOpValue(MI, MO, Fixups, STI)) & 0x3FFFFFFFFUL) | RegBits;
+}
+
unsigned PPCMCCodeEmitter::getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI)
@@ -257,7 +371,7 @@ static unsigned getOpIdxForMO(const MCInst &MI, const MCOperand &MO) {
return ~0U; // Silence any warnings about no return.
}
-unsigned PPCMCCodeEmitter::
+uint64_t PPCMCCodeEmitter::
getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -316,5 +430,11 @@ unsigned PPCMCCodeEmitter::getInstSizeInBytes(const MCInst &MI) const {
return Desc.getSize();
}
+bool PPCMCCodeEmitter::isPrefixedInstruction(const MCInst &MI) const {
+ unsigned Opcode = MI.getOpcode();
+ const PPCInstrInfo *InstrInfo = static_cast<const PPCInstrInfo*>(&MCII);
+ return InstrInfo->isPrefixed(Opcode);
+}
+
#define ENABLE_INSTR_PREDICATE_VERIFIER
#include "PPCGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
index 1324faa12553..588aa76bd806 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCCodeEmitter.h
@@ -50,6 +50,9 @@ public:
unsigned getImm16Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint64_t getImm34Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getMemRIEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -59,6 +62,12 @@ public:
unsigned getMemRIX16Encoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+ uint64_t getMemRI34PCRelEncoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getMemRI34Encoding(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
unsigned getSPE8DisEncoding(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -80,7 +89,7 @@ public:
/// getMachineOpValue - Return binary encoding of operand. If the machine
/// operand requires relocation, record the relocation and return zero.
- unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ uint64_t getMachineOpValue(const MCInst &MI, const MCOperand &MO,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
@@ -97,6 +106,9 @@ public:
// Get the number of bytes used to encode the given MCInst.
unsigned getInstSizeInBytes(const MCInst &MI) const;
+  // Returns true if the given MCInst is a prefixed instruction.
+ bool isPrefixedInstruction(const MCInst &MI) const;
+
private:
FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
void
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
index fb9dd5d7aa75..abff44449131 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.cpp
@@ -17,39 +17,44 @@ using namespace llvm;
#define DEBUG_TYPE "ppcmcexpr"
-const PPCMCExpr*
-PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr,
- bool IsDarwin, MCContext &Ctx) {
- return new (Ctx) PPCMCExpr(Kind, Expr, IsDarwin);
+const PPCMCExpr *PPCMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx) {
+ return new (Ctx) PPCMCExpr(Kind, Expr);
}
void PPCMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
- if (isDarwinSyntax()) {
- switch (Kind) {
- default: llvm_unreachable("Invalid kind!");
- case VK_PPC_LO: OS << "lo16"; break;
- case VK_PPC_HI: OS << "hi16"; break;
- case VK_PPC_HA: OS << "ha16"; break;
- }
-
- OS << '(';
- getSubExpr()->print(OS, MAI);
- OS << ')';
- } else {
- getSubExpr()->print(OS, MAI);
+ getSubExpr()->print(OS, MAI);
- switch (Kind) {
- default: llvm_unreachable("Invalid kind!");
- case VK_PPC_LO: OS << "@l"; break;
- case VK_PPC_HI: OS << "@h"; break;
- case VK_PPC_HA: OS << "@ha"; break;
- case VK_PPC_HIGH: OS << "@high"; break;
- case VK_PPC_HIGHA: OS << "@higha"; break;
- case VK_PPC_HIGHER: OS << "@higher"; break;
- case VK_PPC_HIGHERA: OS << "@highera"; break;
- case VK_PPC_HIGHEST: OS << "@highest"; break;
- case VK_PPC_HIGHESTA: OS << "@highesta"; break;
- }
+ switch (Kind) {
+ default:
+ llvm_unreachable("Invalid kind!");
+ case VK_PPC_LO:
+ OS << "@l";
+ break;
+ case VK_PPC_HI:
+ OS << "@h";
+ break;
+ case VK_PPC_HA:
+ OS << "@ha";
+ break;
+ case VK_PPC_HIGH:
+ OS << "@high";
+ break;
+ case VK_PPC_HIGHA:
+ OS << "@higha";
+ break;
+ case VK_PPC_HIGHER:
+ OS << "@higher";
+ break;
+ case VK_PPC_HIGHERA:
+ OS << "@highera";
+ break;
+ case VK_PPC_HIGHEST:
+ OS << "@highest";
+ break;
+ case VK_PPC_HIGHESTA:
+ OS << "@highesta";
+ break;
}
}
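
The @l, @h and @ha modifiers printed above select 16-bit halves of a value, with @ha rounding up so that adding the sign-extended @l half lands back on the original value. A standalone sketch of that arithmetic (not from the patch; lo/hi/ha are illustrative helpers):

// Sketch: the 16-bit halves selected by @l, @h and @ha for a 32-bit constant.
#include <cstdint>
#include <cstdio>

static uint16_t lo(uint32_t V) { return V & 0xffff; }
static uint16_t hi(uint32_t V) { return (V >> 16) & 0xffff; }
static uint16_t ha(uint32_t V) { return ((V >> 16) + ((V & 0x8000) ? 1 : 0)) & 0xffff; }

int main() {
  uint32_t Addr = 0x12348765;
  std::printf("@l=0x%04x @h=0x%04x @ha=0x%04x\n", lo(Addr), hi(Addr), ha(Addr));
  // e.g. "lis r3, sym@ha ; addi r3, r3, sym@l" materializes the 32-bit value,
  // because addi sign-extends the @l half and @ha compensates for that.
  return 0;
}
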
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
index ad1454566162..1dbc7eae63c8 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCExpr.h
@@ -33,33 +33,29 @@ public:
private:
const VariantKind Kind;
const MCExpr *Expr;
- bool IsDarwin;
int64_t evaluateAsInt64(int64_t Value) const;
- explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr, bool IsDarwin)
- : Kind(Kind), Expr(Expr), IsDarwin(IsDarwin) {}
+ explicit PPCMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
public:
/// @name Construction
/// @{
static const PPCMCExpr *create(VariantKind Kind, const MCExpr *Expr,
- bool IsDarwin, MCContext &Ctx);
+ MCContext &Ctx);
- static const PPCMCExpr *createLo(const MCExpr *Expr,
- bool IsDarwin, MCContext &Ctx) {
- return create(VK_PPC_LO, Expr, IsDarwin, Ctx);
+ static const PPCMCExpr *createLo(const MCExpr *Expr, MCContext &Ctx) {
+ return create(VK_PPC_LO, Expr, Ctx);
}
- static const PPCMCExpr *createHi(const MCExpr *Expr,
- bool IsDarwin, MCContext &Ctx) {
- return create(VK_PPC_HI, Expr, IsDarwin, Ctx);
+ static const PPCMCExpr *createHi(const MCExpr *Expr, MCContext &Ctx) {
+ return create(VK_PPC_HI, Expr, Ctx);
}
- static const PPCMCExpr *createHa(const MCExpr *Expr,
- bool IsDarwin, MCContext &Ctx) {
- return create(VK_PPC_HA, Expr, IsDarwin, Ctx);
+ static const PPCMCExpr *createHa(const MCExpr *Expr, MCContext &Ctx) {
+ return create(VK_PPC_HA, Expr, Ctx);
}
/// @}
@@ -72,10 +68,6 @@ public:
/// getSubExpr - Get the child of this expression.
const MCExpr *getSubExpr() const { return Expr; }
- /// isDarwinSyntax - True if expression is to be printed using Darwin syntax.
- bool isDarwinSyntax() const { return IsDarwin; }
-
-
/// @}
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
index cbfb8e2ff282..3092d56da1c5 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp
@@ -13,6 +13,7 @@
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "MCTargetDesc/PPCInstPrinter.h"
#include "MCTargetDesc/PPCMCAsmInfo.h"
+#include "PPCELFStreamer.h"
#include "PPCTargetStreamer.h"
#include "TargetInfo/PowerPCTargetInfo.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -20,11 +21,14 @@
#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFStreamer.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
@@ -91,12 +95,21 @@ static MCAsmInfo *createPPCMCAsmInfo(const MCRegisterInfo &MRI,
// Initial state of the frame pointer is R1.
unsigned Reg = isPPC64 ? PPC::X1 : PPC::R1;
MCCFIInstruction Inst =
- MCCFIInstruction::createDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0);
+ MCCFIInstruction::cfiDefCfa(nullptr, MRI.getDwarfRegNum(Reg, true), 0);
MAI->addInitialFrameState(Inst);
return MAI;
}
+static MCStreamer *createPPCMCStreamer(const Triple &T, MCContext &Context,
+ std::unique_ptr<MCAsmBackend> &&MAB,
+ std::unique_ptr<MCObjectWriter> &&OW,
+ std::unique_ptr<MCCodeEmitter> &&Emitter,
+ bool RelaxAll) {
+ return createPPCELFStreamer(Context, std::move(MAB), std::move(OW),
+ std::move(Emitter));
+}
+
namespace {
class PPCTargetAsmStreamer : public PPCTargetStreamer {
@@ -107,14 +120,17 @@ public:
: PPCTargetStreamer(S), OS(OS) {}
void emitTCEntry(const MCSymbol &S) override {
- const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
- OS << "\t.tc ";
- OS << (MAI->getSymbolsHaveSMC()
- ? cast<MCSymbolXCOFF>(S).getUnqualifiedName()
- : S.getName());
- OS << "[TC],";
- OS << S.getName();
- OS << '\n';
+ if (const MCSymbolXCOFF *XSym = dyn_cast<MCSymbolXCOFF>(&S)) {
+ MCSymbolXCOFF *TCSym =
+ cast<MCSymbolXCOFF>(Streamer.getContext().getOrCreateSymbol(
+ XSym->getSymbolTableName() + "[TC]"));
+ if (TCSym->hasRename())
+ Streamer.emitXCOFFRenameDirective(TCSym, TCSym->getSymbolTableName());
+ OS << "\t.tc " << TCSym->getName() << "," << XSym->getName() << '\n';
+ return;
+ }
+
+ OS << "\t.tc " << S.getName() << "[TC]," << S.getName() << '\n';
}
void emitMachine(StringRef CPU) override {
@@ -146,8 +162,8 @@ public:
void emitTCEntry(const MCSymbol &S) override {
// Creates a R_PPC64_TOC relocation
- Streamer.EmitValueToAlignment(8);
- Streamer.EmitSymbolValue(&S, 8);
+ Streamer.emitValueToAlignment(8);
+ Streamer.emitSymbolValue(&S, 8);
}
void emitMachine(StringRef CPU) override {
@@ -166,13 +182,9 @@ public:
void emitLocalEntry(MCSymbolELF *S, const MCExpr *LocalOffset) override {
MCAssembler &MCA = getStreamer().getAssembler();
- int64_t Res;
- if (!LocalOffset->evaluateAsAbsolute(Res, MCA))
- report_fatal_error(".localentry expression must be absolute.");
-
- unsigned Encoded = ELF::encodePPC64LocalEntryOffset(Res);
- if (Res != ELF::decodePPC64LocalEntryOffset(Encoded))
- report_fatal_error(".localentry expression cannot be encoded.");
+ // encodePPC64LocalEntryOffset will report an error if it cannot
+ // encode LocalOffset.
+ unsigned Encoded = encodePPC64LocalEntryOffset(LocalOffset);
unsigned Other = S->getOther();
Other &= ~ELF::STO_PPC64_LOCAL_MASK;
@@ -201,6 +213,10 @@ public:
for (auto *Sym : UpdateOther)
if (Sym->isVariable())
copyLocalEntry(Sym, Sym->getVariableValue());
+
+    // Clear the set of symbols that need to be updated so the streamer can
+ // be reused without issues.
+ UpdateOther.clear();
}
private:
@@ -217,6 +233,31 @@ private:
D->setOther(Other);
return true;
}
+
+ unsigned encodePPC64LocalEntryOffset(const MCExpr *LocalOffset) {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ int64_t Offset;
+ if (!LocalOffset->evaluateAsAbsolute(Offset, MCA))
+ MCA.getContext().reportFatalError(
+ LocalOffset->getLoc(), ".localentry expression must be absolute.");
+
+ switch (Offset) {
+ default:
+ MCA.getContext().reportFatalError(
+ LocalOffset->getLoc(),
+ ".localentry expression is not a valid power of 2.");
+ case 0:
+ return 0;
+ case 1:
+ return 1 << ELF::STO_PPC64_LOCAL_BIT;
+ case 4:
+ case 8:
+ case 16:
+ case 32:
+ case 64:
+ return (int)Log2(Offset) << (int)ELF::STO_PPC64_LOCAL_BIT;
+ }
+ }
};
class PPCTargetMachOStreamer : public PPCTargetStreamer {
@@ -248,8 +289,8 @@ public:
void emitTCEntry(const MCSymbol &S) override {
const MCAsmInfo *MAI = Streamer.getContext().getAsmInfo();
const unsigned PointerSize = MAI->getCodePointerSize();
- Streamer.EmitValueToAlignment(PointerSize);
- Streamer.EmitSymbolValue(&S, PointerSize);
+ Streamer.emitValueToAlignment(PointerSize);
+ Streamer.emitSymbolValue(&S, PointerSize);
}
void emitMachine(StringRef CPU) override {
@@ -313,6 +354,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCTargetMC() {
// Register the asm backend.
TargetRegistry::RegisterMCAsmBackend(*T, createPPCAsmBackend);
+ // Register the elf streamer.
+ TargetRegistry::RegisterELFStreamer(*T, createPPCMCStreamer);
+
// Register the object target streamer.
TargetRegistry::RegisterObjectTargetStreamer(*T,
createObjectTargetStreamer);
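
The .localentry handling above folds the distance between the global and local entry points into the high bits of the ELF st_other byte. A standalone sketch mirroring the switch in encodePPC64LocalEntryOffset (not from the patch; it assumes STO_PPC64_LOCAL_BIT is 5, the ELFv2 value):

// Sketch: pack the .localentry byte offset into st_other bits 5..7.
#include <cstdint>
#include <cstdio>

static const unsigned LocalBit = 5; // assumed value of ELF::STO_PPC64_LOCAL_BIT

// Returns the encoded field, or 0xff for offsets the encoding cannot express.
static uint8_t encodeLocalEntry(int64_t Offset) {
  switch (Offset) {
  case 0:  return 0;
  case 1:  return 1u << LocalBit;
  case 4:  return 2u << LocalBit;
  case 8:  return 3u << LocalBit;
  case 16: return 4u << LocalBit;
  case 32: return 5u << LocalBit;
  case 64: return 6u << LocalBit;
  default: return 0xff;
  }
}

int main() {
  std::printf(".localentry 8 -> st_other bits 0x%02x\n", encodeLocalEntry(8));
  return 0;
}
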
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
index 49443679bb31..719e005d9813 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.h
@@ -32,9 +32,6 @@ class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
class Target;
-class Triple;
-class StringRef;
-class raw_pwrite_stream;
MCCodeEmitter *createPPCMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
deleted file mode 100644
index 672f910ab086..000000000000
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCMachObjectWriter.cpp
+++ /dev/null
@@ -1,380 +0,0 @@
-//===-- PPCMachObjectWriter.cpp - PPC Mach-O Writer -----------------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/PPCFixupKinds.h"
-#include "MCTargetDesc/PPCMCTargetDesc.h"
-#include "llvm/ADT/Twine.h"
-#include "llvm/BinaryFormat/MachO.h"
-#include "llvm/MC/MCAsmLayout.h"
-#include "llvm/MC/MCAssembler.h"
-#include "llvm/MC/MCContext.h"
-#include "llvm/MC/MCMachObjectWriter.h"
-#include "llvm/MC/MCSectionMachO.h"
-#include "llvm/MC/MCValue.h"
-#include "llvm/Support/ErrorHandling.h"
-#include "llvm/Support/Format.h"
-
-using namespace llvm;
-
-namespace {
-class PPCMachObjectWriter : public MCMachObjectTargetWriter {
- bool recordScatteredRelocation(MachObjectWriter *Writer,
- const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- unsigned Log2Size, uint64_t &FixedValue);
-
- void RecordPPCRelocation(MachObjectWriter *Writer, const MCAssembler &Asm,
- const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup,
- MCValue Target, uint64_t &FixedValue);
-
-public:
- PPCMachObjectWriter(bool Is64Bit, uint32_t CPUType, uint32_t CPUSubtype)
- : MCMachObjectTargetWriter(Is64Bit, CPUType, CPUSubtype) {}
-
- void recordRelocation(MachObjectWriter *Writer, MCAssembler &Asm,
- const MCAsmLayout &Layout, const MCFragment *Fragment,
- const MCFixup &Fixup, MCValue Target,
- uint64_t &FixedValue) override {
- if (Writer->is64Bit()) {
- report_fatal_error("Relocation emission for MachO/PPC64 unimplemented.");
- } else
- RecordPPCRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
- FixedValue);
- }
-};
-}
-
-/// computes the log2 of the size of the relocation,
-/// used for relocation_info::r_length.
-static unsigned getFixupKindLog2Size(unsigned Kind) {
- switch (Kind) {
- default:
- report_fatal_error("log2size(FixupKind): Unhandled fixup kind!");
- case FK_PCRel_1:
- case FK_Data_1:
- return 0;
- case FK_PCRel_2:
- case FK_Data_2:
- return 1;
- case FK_PCRel_4:
- case PPC::fixup_ppc_brcond14:
- case PPC::fixup_ppc_half16:
- case PPC::fixup_ppc_br24:
- case FK_Data_4:
- return 2;
- case FK_PCRel_8:
- case FK_Data_8:
- return 3;
- }
- return 0;
-}
-
-/// Translates generic PPC fixup kind to Mach-O/PPC relocation type enum.
-/// Outline based on PPCELFObjectWriter::getRelocType().
-static unsigned getRelocType(const MCValue &Target,
- const MCFixupKind FixupKind, // from
- // Fixup.getKind()
- const bool IsPCRel) {
- const MCSymbolRefExpr::VariantKind Modifier =
- Target.isAbsolute() ? MCSymbolRefExpr::VK_None
- : Target.getSymA()->getKind();
- // determine the type of the relocation
- unsigned Type = MachO::GENERIC_RELOC_VANILLA;
- if (IsPCRel) { // relative to PC
- switch ((unsigned)FixupKind) {
- default:
- report_fatal_error("Unimplemented fixup kind (relative)");
- case PPC::fixup_ppc_br24:
- Type = MachO::PPC_RELOC_BR24; // R_PPC_REL24
- break;
- case PPC::fixup_ppc_brcond14:
- Type = MachO::PPC_RELOC_BR14;
- break;
- case PPC::fixup_ppc_half16:
- switch (Modifier) {
- default:
- llvm_unreachable("Unsupported modifier for half16 fixup");
- case MCSymbolRefExpr::VK_PPC_HA:
- Type = MachO::PPC_RELOC_HA16;
- break;
- case MCSymbolRefExpr::VK_PPC_LO:
- Type = MachO::PPC_RELOC_LO16;
- break;
- case MCSymbolRefExpr::VK_PPC_HI:
- Type = MachO::PPC_RELOC_HI16;
- break;
- }
- break;
- }
- } else {
- switch ((unsigned)FixupKind) {
- default:
- report_fatal_error("Unimplemented fixup kind (absolute)!");
- case PPC::fixup_ppc_half16:
- switch (Modifier) {
- default:
- llvm_unreachable("Unsupported modifier for half16 fixup");
- case MCSymbolRefExpr::VK_PPC_HA:
- Type = MachO::PPC_RELOC_HA16_SECTDIFF;
- break;
- case MCSymbolRefExpr::VK_PPC_LO:
- Type = MachO::PPC_RELOC_LO16_SECTDIFF;
- break;
- case MCSymbolRefExpr::VK_PPC_HI:
- Type = MachO::PPC_RELOC_HI16_SECTDIFF;
- break;
- }
- break;
- case FK_Data_4:
- break;
- case FK_Data_2:
- break;
- }
- }
- return Type;
-}
-
-static void makeRelocationInfo(MachO::any_relocation_info &MRE,
- const uint32_t FixupOffset, const uint32_t Index,
- const unsigned IsPCRel, const unsigned Log2Size,
- const unsigned IsExtern, const unsigned Type) {
- MRE.r_word0 = FixupOffset;
- // The bitfield offsets that work (as determined by trial-and-error)
- // are different than what is documented in the mach-o manuals.
- // This appears to be an endianness issue; reversing the order of the
- // documented bitfields in <llvm/BinaryFormat/MachO.h> fixes this (but
- // breaks x86/ARM assembly).
- MRE.r_word1 = ((Index << 8) | // was << 0
- (IsPCRel << 7) | // was << 24
- (Log2Size << 5) | // was << 25
- (IsExtern << 4) | // was << 27
- (Type << 0)); // was << 28
-}
-
-static void
-makeScatteredRelocationInfo(MachO::any_relocation_info &MRE,
- const uint32_t Addr, const unsigned Type,
- const unsigned Log2Size, const unsigned IsPCRel,
- const uint32_t Value2) {
- // For notes on bitfield positions and endianness, see:
- // https://developer.apple.com/library/mac/documentation/developertools/conceptual/MachORuntime/Reference/reference.html#//apple_ref/doc/uid/20001298-scattered_relocation_entry
- MRE.r_word0 = ((Addr << 0) | (Type << 24) | (Log2Size << 28) |
- (IsPCRel << 30) | MachO::R_SCATTERED);
- MRE.r_word1 = Value2;
-}
-
-/// Compute fixup offset (address).
-static uint32_t getFixupOffset(const MCAsmLayout &Layout,
- const MCFragment *Fragment,
- const MCFixup &Fixup) {
- uint32_t FixupOffset = Layout.getFragmentOffset(Fragment) + Fixup.getOffset();
- // On Mach-O, ppc_fixup_half16 relocations must refer to the
- // start of the instruction, not the second halfword, as ELF does
- if (Fixup.getTargetKind() == PPC::fixup_ppc_half16)
- FixupOffset &= ~uint32_t(3);
- return FixupOffset;
-}
-
-/// \return false if falling back to using non-scattered relocation,
-/// otherwise true for normal scattered relocation.
-/// based on X86MachObjectWriter::recordScatteredRelocation
-/// and ARMMachObjectWriter::recordScatteredRelocation
-bool PPCMachObjectWriter::recordScatteredRelocation(
- MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
- unsigned Log2Size, uint64_t &FixedValue) {
- // caller already computes these, can we just pass and reuse?
- const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
- const MCFixupKind FK = Fixup.getKind();
- const unsigned IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
- const unsigned Type = getRelocType(Target, FK, IsPCRel);
-
- // Is this a local or SECTDIFF relocation entry?
- // SECTDIFF relocation entries have symbol subtractions,
- // and require two entries, the first for the add-symbol value,
- // the second for the subtract-symbol value.
-
- // See <reloc.h>.
- const MCSymbol *A = &Target.getSymA()->getSymbol();
-
- if (!A->getFragment())
- report_fatal_error("symbol '" + A->getName() +
- "' can not be undefined in a subtraction expression");
-
- uint32_t Value = Writer->getSymbolAddress(*A, Layout);
- uint64_t SecAddr = Writer->getSectionAddress(A->getFragment()->getParent());
- FixedValue += SecAddr;
- uint32_t Value2 = 0;
-
- if (const MCSymbolRefExpr *B = Target.getSymB()) {
- const MCSymbol *SB = &B->getSymbol();
-
- if (!SB->getFragment())
- report_fatal_error("symbol '" + SB->getName() +
- "' can not be undefined in a subtraction expression");
-
- // FIXME: is Type correct? see include/llvm/BinaryFormat/MachO.h
- Value2 = Writer->getSymbolAddress(*SB, Layout);
- FixedValue -= Writer->getSectionAddress(SB->getFragment()->getParent());
- }
- // FIXME: does FixedValue get used??
-
- // Relocations are written out in reverse order, so the PAIR comes first.
- if (Type == MachO::PPC_RELOC_SECTDIFF ||
- Type == MachO::PPC_RELOC_HI16_SECTDIFF ||
- Type == MachO::PPC_RELOC_LO16_SECTDIFF ||
- Type == MachO::PPC_RELOC_HA16_SECTDIFF ||
- Type == MachO::PPC_RELOC_LO14_SECTDIFF ||
- Type == MachO::PPC_RELOC_LOCAL_SECTDIFF) {
- // X86 had this piece, but ARM does not
- // If the offset is too large to fit in a scattered relocation,
- // we're hosed. It's an unfortunate limitation of the MachO format.
- if (FixupOffset > 0xffffff) {
- char Buffer[32];
- format("0x%x", FixupOffset).print(Buffer, sizeof(Buffer));
- Asm.getContext().reportError(Fixup.getLoc(),
- Twine("Section too large, can't encode "
- "r_address (") +
- Buffer + ") into 24 bits of scattered "
- "relocation entry.");
- return false;
- }
-
- // Is this supposed to follow MCTarget/PPCAsmBackend.cpp:adjustFixupValue()?
- // see PPCMCExpr::evaluateAsRelocatableImpl()
- uint32_t other_half = 0;
- switch (Type) {
- case MachO::PPC_RELOC_LO16_SECTDIFF:
- other_half = (FixedValue >> 16) & 0xffff;
- // applyFixupOffset longer extracts the high part because it now assumes
- // this was already done.
- // It looks like this is not true for the FixedValue needed with Mach-O
- // relocs.
- // So we need to adjust FixedValue again here.
- FixedValue &= 0xffff;
- break;
- case MachO::PPC_RELOC_HA16_SECTDIFF:
- other_half = FixedValue & 0xffff;
- FixedValue =
- ((FixedValue >> 16) + ((FixedValue & 0x8000) ? 1 : 0)) & 0xffff;
- break;
- case MachO::PPC_RELOC_HI16_SECTDIFF:
- other_half = FixedValue & 0xffff;
- FixedValue = (FixedValue >> 16) & 0xffff;
- break;
- default:
- llvm_unreachable("Invalid PPC scattered relocation type.");
- break;
- }
-
- MachO::any_relocation_info MRE;
- makeScatteredRelocationInfo(MRE, other_half, MachO::GENERIC_RELOC_PAIR,
- Log2Size, IsPCRel, Value2);
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
- } else {
- // If the offset is more than 24-bits, it won't fit in a scattered
- // relocation offset field, so we fall back to using a non-scattered
- // relocation. This is a bit risky, as if the offset reaches out of
- // the block and the linker is doing scattered loading on this
- // symbol, things can go badly.
- //
- // Required for 'as' compatibility.
- if (FixupOffset > 0xffffff)
- return false;
- }
- MachO::any_relocation_info MRE;
- makeScatteredRelocationInfo(MRE, FixupOffset, Type, Log2Size, IsPCRel, Value);
- Writer->addRelocation(nullptr, Fragment->getParent(), MRE);
- return true;
-}
-
-// see PPCELFObjectWriter for a general outline of cases
-void PPCMachObjectWriter::RecordPPCRelocation(
- MachObjectWriter *Writer, const MCAssembler &Asm, const MCAsmLayout &Layout,
- const MCFragment *Fragment, const MCFixup &Fixup, MCValue Target,
- uint64_t &FixedValue) {
- const MCFixupKind FK = Fixup.getKind(); // unsigned
- const unsigned Log2Size = getFixupKindLog2Size(FK);
- const bool IsPCRel = Writer->isFixupKindPCRel(Asm, FK);
- const unsigned RelocType = getRelocType(Target, FK, IsPCRel);
-
- // If this is a difference or a defined symbol plus an offset, then we need a
- // scattered relocation entry. Differences always require scattered
- // relocations.
- if (Target.getSymB() &&
- // Q: are branch targets ever scattered?
- RelocType != MachO::PPC_RELOC_BR24 &&
- RelocType != MachO::PPC_RELOC_BR14) {
- recordScatteredRelocation(Writer, Asm, Layout, Fragment, Fixup, Target,
- Log2Size, FixedValue);
- return;
- }
-
- // this doesn't seem right for RIT_PPC_BR24
- // Get the symbol data, if any.
- const MCSymbol *A = nullptr;
- if (Target.getSymA())
- A = &Target.getSymA()->getSymbol();
-
- // See <reloc.h>.
- const uint32_t FixupOffset = getFixupOffset(Layout, Fragment, Fixup);
- unsigned Index = 0;
- unsigned Type = RelocType;
-
- const MCSymbol *RelSymbol = nullptr;
- if (Target.isAbsolute()) { // constant
- // SymbolNum of 0 indicates the absolute section.
- //
- // FIXME: Currently, these are never generated (see code below). I cannot
- // find a case where they are actually emitted.
- report_fatal_error("FIXME: relocations to absolute targets "
- "not yet implemented");
- // the above line stolen from ARM, not sure
- } else {
- // Resolve constant variables.
- if (A->isVariable()) {
- int64_t Res;
- if (A->getVariableValue()->evaluateAsAbsolute(
- Res, Layout, Writer->getSectionAddressMap())) {
- FixedValue = Res;
- return;
- }
- }
-
- // Check whether we need an external or internal relocation.
- if (Writer->doesSymbolRequireExternRelocation(*A)) {
- RelSymbol = A;
- // For external relocations, make sure to offset the fixup value to
- // compensate for the addend of the symbol address, if it was
- // undefined. This occurs with weak definitions, for example.
- if (!A->isUndefined())
- FixedValue -= Layout.getSymbolOffset(*A);
- } else {
- // The index is the section ordinal (1-based).
- const MCSection &Sec = A->getSection();
- Index = Sec.getOrdinal() + 1;
- FixedValue += Writer->getSectionAddress(&Sec);
- }
- if (IsPCRel)
- FixedValue -= Writer->getSectionAddress(Fragment->getParent());
- }
-
- // struct relocation_info (8 bytes)
- MachO::any_relocation_info MRE;
- makeRelocationInfo(MRE, FixupOffset, Index, IsPCRel, Log2Size, false, Type);
- Writer->addRelocation(RelSymbol, Fragment->getParent(), MRE);
-}
-
-std::unique_ptr<MCObjectTargetWriter>
-llvm::createPPCMachObjectWriter(bool Is64Bit, uint32_t CPUType,
- uint32_t CPUSubtype) {
- return std::make_unique<PPCMachObjectWriter>(Is64Bit, CPUType, CPUSubtype);
-}
diff --git a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
index 7fdbb8990b55..d672d54772e0 100644
--- a/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
+++ b/llvm/lib/Target/PowerPC/MCTargetDesc/PPCXCOFFObjectWriter.cpp
@@ -7,16 +7,26 @@
//
//===----------------------------------------------------------------------===//
-#include "PPCMCTargetDesc.h"
+#include "MCTargetDesc/PPCFixupKinds.h"
+#include "MCTargetDesc/PPCMCTargetDesc.h"
+#include "llvm/BinaryFormat/XCOFF.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCValue.h"
#include "llvm/MC/MCXCOFFObjectWriter.h"
using namespace llvm;
namespace {
class PPCXCOFFObjectWriter : public MCXCOFFObjectTargetWriter {
+ static constexpr uint8_t SignBitMask = 0x80;
public:
PPCXCOFFObjectWriter(bool Is64Bit);
+
+ std::pair<uint8_t, uint8_t>
+ getRelocTypeAndSignSize(const MCValue &Target, const MCFixup &Fixup,
+ bool IsPCRel) const override;
};
} // end anonymous namespace
@@ -27,3 +37,40 @@ std::unique_ptr<MCObjectTargetWriter>
llvm::createPPCXCOFFObjectWriter(bool Is64Bit) {
return std::make_unique<PPCXCOFFObjectWriter>(Is64Bit);
}
+
+std::pair<uint8_t, uint8_t> PPCXCOFFObjectWriter::getRelocTypeAndSignSize(
+ const MCValue &Target, const MCFixup &Fixup, bool IsPCRel) const {
+ const MCSymbolRefExpr::VariantKind Modifier =
+ Target.isAbsolute() ? MCSymbolRefExpr::VK_None
+ : Target.getSymA()->getKind();
+  // According to the AIX OS team, the AIX link editor does not care about
+  // the sign bit in the relocation entry "most" of the time.
+  // The system assembler seems to set the sign bit on the relocation entry
+  // based on a property similar to IsPCRel, so we do the same here.
+ // TODO: More investigation on how assembler decides to set the sign
+ // bit, and we might want to match that.
+ const uint8_t EncodedSignednessIndicator = IsPCRel ? SignBitMask : 0u;
+
+ // The magic number we use in SignAndSize has a strong relationship with
+ // the corresponding MCFixupKind. In most cases, it's the MCFixupKind
+ // number - 1, because SignAndSize encodes the bit length being
+ // relocated minus 1.
+ switch ((unsigned)Fixup.getKind()) {
+ default:
+ report_fatal_error("Unimplemented fixup kind.");
+ case PPC::fixup_ppc_half16:
+ switch (Modifier) {
+ default:
+ report_fatal_error("Unsupported modifier for half16 fixup.");
+ case MCSymbolRefExpr::VK_None:
+ return {XCOFF::RelocationType::R_TOC, EncodedSignednessIndicator | 15};
+ }
+ break;
+ case PPC::fixup_ppc_br24:
+    // Branches are 4-byte aligned, so the 24 bits we encode in
+    // the instruction actually represent a 26-bit offset.
+ return {XCOFF::RelocationType::R_RBR, EncodedSignednessIndicator | 25};
+ case FK_Data_4:
+ return {XCOFF::RelocationType::R_POS, EncodedSignednessIndicator | 31};
+ }
+}
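
getRelocTypeAndSignSize above pairs an XCOFF relocation type with a sign-and-size byte. A standalone sketch of how that byte is composed (not from the patch; signAndSize is an illustrative helper following the 0x80 sign bit and length-minus-one convention used above):

// Sketch: the second byte of an XCOFF relocation entry is an optional sign bit
// ORed with the relocated bit length minus one.
#include <cstdint>
#include <cstdio>

static uint8_t signAndSize(bool Signed, unsigned BitLength) {
  return (Signed ? 0x80u : 0u) | ((BitLength - 1) & 0x3fu);
}

int main() {
  std::printf("br24 (26-bit span, signed): 0x%02x\n", signAndSize(true, 26));  // 0x99
  std::printf("word (32-bit, unsigned):    0x%02x\n", signAndSize(false, 32)); // 0x1f
  return 0;
}
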
diff --git a/llvm/lib/Target/PowerPC/P9InstrResources.td b/llvm/lib/Target/PowerPC/P9InstrResources.td
index 9b3d13989ee2..d7e3519d5539 100644
--- a/llvm/lib/Target/PowerPC/P9InstrResources.td
+++ b/llvm/lib/Target/PowerPC/P9InstrResources.td
@@ -373,6 +373,7 @@ def : InstRW<[P9_DPE_7C, P9_DPO_7C, IP_EXECE_1C, IP_EXECO_1C, DISP_1C],
VMSUMSHS,
VMSUMUBM,
VMSUMUHM,
+ VMSUMUDM,
VMSUMUHS,
VMULESB,
VMULESH,
diff --git a/llvm/lib/Target/PowerPC/PPC.h b/llvm/lib/Target/PowerPC/PPC.h
index a83509f0e687..7e0aa2c6061d 100644
--- a/llvm/lib/Target/PowerPC/PPC.h
+++ b/llvm/lib/Target/PowerPC/PPC.h
@@ -51,10 +51,9 @@ namespace llvm {
FunctionPass *createPPCExpandISELPass();
FunctionPass *createPPCPreEmitPeepholePass();
void LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- AsmPrinter &AP, bool IsDarwin);
+ AsmPrinter &AP);
bool LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
- MCOperand &OutMO, AsmPrinter &AP,
- bool IsDarwin);
+ MCOperand &OutMO, AsmPrinter &AP);
void initializePPCCTRLoopsPass(PassRegistry&);
#ifndef NDEBUG
@@ -99,33 +98,33 @@ namespace llvm {
/// the function's picbase, e.g. lo16(symbol-picbase).
MO_PIC_FLAG = 2,
- /// MO_NLP_FLAG - If this bit is set, the symbol reference is actually to
- /// the non_lazy_ptr for the global, e.g. lo16(symbol$non_lazy_ptr-picbase).
- MO_NLP_FLAG = 4,
+ /// MO_PCREL_FLAG - If this bit is set, the symbol reference is relative to
+    /// the current instruction address (pc), e.g., var@pcrel. Fixup is VK_PCREL.
+ MO_PCREL_FLAG = 4,
- /// MO_NLP_HIDDEN_FLAG - If this bit is set, the symbol reference is to a
- /// symbol with hidden visibility. This causes a different kind of
- /// non-lazy-pointer to be generated.
- MO_NLP_HIDDEN_FLAG = 8,
+    /// MO_GOT_FLAG - If this bit is set, the symbol reference is to be computed
+    /// via the GOT. For example, when combined with MO_PCREL_FLAG it should
+    /// produce the relocation @got@pcrel. Fixup is VK_PPC_GOT_PCREL.
+ MO_GOT_FLAG = 8,
/// The next are not flags but distinct values.
- MO_ACCESS_MASK = 0xf0,
+ MO_ACCESS_MASK = 0xf00,
/// MO_LO, MO_HA - lo16(symbol) and ha16(symbol)
- MO_LO = 1 << 4,
- MO_HA = 2 << 4,
+ MO_LO = 1 << 8,
+ MO_HA = 2 << 8,
- MO_TPREL_LO = 4 << 4,
- MO_TPREL_HA = 3 << 4,
+ MO_TPREL_LO = 4 << 8,
+ MO_TPREL_HA = 3 << 8,
/// These values identify relocations on immediates folded
/// into memory operations.
- MO_DTPREL_LO = 5 << 4,
- MO_TLSLD_LO = 6 << 4,
- MO_TOC_LO = 7 << 4,
+ MO_DTPREL_LO = 5 << 8,
+ MO_TLSLD_LO = 6 << 8,
+ MO_TOC_LO = 7 << 8,
// Symbol for VK_PPC_TLS fixup attached to an ADD instruction
- MO_TLS = 8 << 4
+ MO_TLS = 8 << 8
};
} // end namespace PPCII
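
With the flag space widened above, MO_PCREL_FLAG and MO_GOT_FLAG can be ORed together while the access kind keeps its own field under MO_ACCESS_MASK. A standalone sketch of how the flags compose (not from the patch; the constants are copied from the enum above):

// Sketch: low bits are independent flags; the access kind occupies the nibble
// selected by MO_ACCESS_MASK.
#include <cstdio>

enum : unsigned {
  MO_PCREL_FLAG = 4,
  MO_GOT_FLAG = 8,
  MO_ACCESS_MASK = 0xf00,
  MO_TOC_LO = 7 << 8,
};

int main() {
  // A GOT-indirect, PC-relative reference such as sym@got@pcrel.
  unsigned Flags = MO_PCREL_FLAG | MO_GOT_FLAG;
  std::printf("pcrel? %d got? %d access=0x%x\n",
              (Flags & MO_PCREL_FLAG) != 0, (Flags & MO_GOT_FLAG) != 0,
              Flags & MO_ACCESS_MASK);
  return 0;
}
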
diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td
index bef0a81ee3ad..9ad78bf67fe6 100644
--- a/llvm/lib/Target/PowerPC/PPC.td
+++ b/llvm/lib/Target/PowerPC/PPC.td
@@ -51,6 +51,7 @@ def DirectivePwr6x
def DirectivePwr7: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR7", "">;
def DirectivePwr8: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR8", "">;
def DirectivePwr9: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR9", "">;
+def DirectivePwr10: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR10", "">;
def DirectivePwrFuture
: SubtargetFeature<"", "CPUDirective", "PPC::DIR_PWR_FUTURE", "">;
@@ -166,6 +167,16 @@ def FeatureHTM : SubtargetFeature<"htm", "HasHTM", "true",
"Enable Hardware Transactional Memory instructions">;
def FeatureMFTB : SubtargetFeature<"", "FeatureMFTB", "true",
"Implement mftb using the mfspr instruction">;
+def FeatureFusion : SubtargetFeature<"fusion", "HasFusion", "true",
+ "Target supports instruction fusion">;
+def FeatureAddiLoadFusion : SubtargetFeature<"fuse-addi-load",
+ "HasAddiLoadFusion", "true",
+ "Power8 Addi-Load fusion",
+ [FeatureFusion]>;
+def FeatureAddisLoadFusion : SubtargetFeature<"fuse-addis-load",
+ "HasAddisLoadFusion", "true",
+ "Power8 Addis-Load fusion",
+ [FeatureFusion]>;
def FeatureUnalignedFloats :
SubtargetFeature<"allow-unaligned-fp-access", "AllowsUnalignedFPAccess",
"true", "CPU does not trap on unaligned FP access">;
@@ -194,7 +205,11 @@ def DeprecatedDST : SubtargetFeature<"", "DeprecatedDST", "true",
def FeatureISA3_0 : SubtargetFeature<"isa-v30-instructions", "IsISA3_0",
"true",
- "Enable instructions added in ISA 3.0.">;
+ "Enable instructions in ISA 3.0.">;
+def FeatureISA3_1 : SubtargetFeature<"isa-v31-instructions", "IsISA3_1",
+ "true",
+ "Enable instructions in ISA 3.1.",
+ [FeatureISA3_0]>;
def FeatureP9Altivec : SubtargetFeature<"power9-altivec", "HasP9Altivec", "true",
"Enable POWER9 Altivec instructions",
[FeatureISA3_0, FeatureP8Altivec]>;
@@ -202,6 +217,10 @@ def FeatureP9Vector : SubtargetFeature<"power9-vector", "HasP9Vector", "true",
"Enable POWER9 vector instructions",
[FeatureISA3_0, FeatureP8Vector,
FeatureP9Altivec]>;
+def FeatureP10Vector : SubtargetFeature<"power10-vector", "HasP10Vector",
+ "true",
+ "Enable POWER10 vector instructions",
+ [FeatureISA3_1, FeatureP9Vector]>;
// A separate feature for this even though it is equivalent to P9Vector
// because this is a feature of the implementation rather than the architecture
// and may go away with future CPU's.
@@ -209,6 +228,21 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
"VectorsUseTwoUnits",
"true",
"Vectors use two units">;
+def FeaturePrefixInstrs : SubtargetFeature<"prefix-instrs", "HasPrefixInstrs",
+ "true",
+ "Enable prefixed instructions",
+ [FeatureISA3_0, FeatureP8Vector,
+ FeatureP9Altivec]>;
+def FeaturePCRelativeMemops :
+ SubtargetFeature<"pcrelative-memops", "HasPCRelativeMemops", "true",
+ "Enable PC relative Memory Ops",
+ [FeatureISA3_0]>;
+
+def FeaturePredictableSelectIsExpensive :
+ SubtargetFeature<"predictable-select-expensive",
+ "PredictableSelectIsExpensive",
+ "true",
+ "Prefer likely predicted branches over selects">;
// Since new processors generally contain a superset of features of those that
// came before them, the idea is to make implementations of new processors
@@ -225,7 +259,7 @@ def FeatureVectorsUseTwoUnits : SubtargetFeature<"vectors-use-two-units",
// !listconcat(FutureProcessorInheritableFeatures,
// FutureProcessorSpecificFeatures)
-// Makes it explicit and obvious what is new in FutureProcesor vs. Power8 as
+// Makes it explicit and obvious what is new in FutureProcessor vs. Power8 as
// well as providing a single point of definition if the feature set will be
// used elsewhere.
def ProcessorFeatures {
@@ -262,25 +296,34 @@ def ProcessorFeatures {
!listconcat(P7InheritableFeatures, P7SpecificFeatures);
// Power8
- list<SubtargetFeature> P8AdditionalFeatures = [DirectivePwr8,
- FeatureP8Altivec,
- FeatureP8Vector,
- FeatureP8Crypto,
- FeatureHTM,
- FeatureDirectMove,
- FeatureICBT,
- FeaturePartwordAtomic];
- list<SubtargetFeature> P8SpecificFeatures = [];
+ list<SubtargetFeature> P8AdditionalFeatures =
+ [DirectivePwr8,
+ FeatureP8Altivec,
+ FeatureP8Vector,
+ FeatureP8Crypto,
+ FeatureHTM,
+ FeatureDirectMove,
+ FeatureICBT,
+ FeaturePartwordAtomic,
+ FeaturePredictableSelectIsExpensive
+ ];
+
+ list<SubtargetFeature> P8SpecificFeatures = [FeatureAddiLoadFusion,
+ FeatureAddisLoadFusion];
list<SubtargetFeature> P8InheritableFeatures =
!listconcat(P7InheritableFeatures, P8AdditionalFeatures);
list<SubtargetFeature> P8Features =
!listconcat(P8InheritableFeatures, P8SpecificFeatures);
// Power9
- list<SubtargetFeature> P9AdditionalFeatures = [DirectivePwr9,
- FeatureP9Altivec,
- FeatureP9Vector,
- FeatureISA3_0];
+ list<SubtargetFeature> P9AdditionalFeatures =
+ [DirectivePwr9,
+ FeatureP9Altivec,
+ FeatureP9Vector,
+ FeatureISA3_0,
+ FeaturePredictableSelectIsExpensive
+ ];
+
// Some features are unique to Power9 and there is no reason to assume
// they will be part of any future CPUs. One example is the narrower
// dispatch for vector operations than scalar ones. For the time being,
@@ -294,13 +337,25 @@ def ProcessorFeatures {
list<SubtargetFeature> P9Features =
!listconcat(P9InheritableFeatures, P9SpecificFeatures);
+ // Power10
+ // For P10 CPU we assume that all of the existing features from Power9
+ // still exist with the exception of those we know are Power9 specific.
+ list<SubtargetFeature> P10AdditionalFeatures =
+ [DirectivePwr10, FeatureISA3_1, FeaturePrefixInstrs,
+ FeaturePCRelativeMemops, FeatureP10Vector];
+ list<SubtargetFeature> P10SpecificFeatures = [];
+ list<SubtargetFeature> P10InheritableFeatures =
+ !listconcat(P9InheritableFeatures, P10AdditionalFeatures);
+ list<SubtargetFeature> P10Features =
+ !listconcat(P10InheritableFeatures, P10SpecificFeatures);
+
// Future
- // For future CPU we assume that all of the existing features from Power 9
- // still exist with the exception of those we know are Power 9 specific.
+ // For future CPU we assume that all of the existing features from Power10
+ // still exist with the exception of those we know are Power10 specific.
list<SubtargetFeature> FutureAdditionalFeatures = [];
list<SubtargetFeature> FutureSpecificFeatures = [];
list<SubtargetFeature> FutureInheritableFeatures =
- !listconcat(P9InheritableFeatures, FutureAdditionalFeatures);
+ !listconcat(P10InheritableFeatures, FutureAdditionalFeatures);
list<SubtargetFeature> FutureFeatures =
!listconcat(FutureInheritableFeatures, FutureSpecificFeatures);
}
@@ -442,7 +497,7 @@ def : ProcessorModel<"g5", G5Model,
def : ProcessorModel<"e500", PPCE500Model,
[DirectiveE500,
FeatureICBT, FeatureBookE,
- FeatureISEL, FeatureMFTB, FeatureSPE]>;
+ FeatureISEL, FeatureMFTB, FeatureMSYNC, FeatureSPE]>;
def : ProcessorModel<"e500mc", PPCE500mcModel,
[DirectiveE500mc,
FeatureSTFIWX, FeatureICBT, FeatureBookE,
@@ -505,6 +560,8 @@ def : ProcessorModel<"pwr6x", G5Model,
def : ProcessorModel<"pwr7", P7Model, ProcessorFeatures.P7Features>;
def : ProcessorModel<"pwr8", P8Model, ProcessorFeatures.P8Features>;
def : ProcessorModel<"pwr9", P9Model, ProcessorFeatures.P9Features>;
+// No scheduler model yet.
+def : ProcessorModel<"pwr10", NoSchedModel, ProcessorFeatures.P10Features>;
// No scheduler model for future CPU.
def : ProcessorModel<"future", NoSchedModel,
ProcessorFeatures.FutureFeatures>;
diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
index 4311df5dbeb8..bf5fe741bac8 100644
--- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
+++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp
@@ -83,8 +83,6 @@ protected:
const PPCSubtarget *Subtarget = nullptr;
StackMaps SM;
- virtual MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO);
-
public:
explicit PPCAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
@@ -100,7 +98,7 @@ public:
return AsmPrinter::doInitialization(M);
}
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
/// This function is for PrintAsmOperand and PrintAsmMemoryOperand,
/// invoked by EmitMSInlineAsmStr and EmitGCCInlineAsmStr only.
@@ -113,7 +111,7 @@ public:
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &O) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
void LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI);
void LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI);
@@ -137,37 +135,41 @@ public:
return "Linux PPC Assembly Printer";
}
- bool doFinalization(Module &M) override;
- void EmitStartOfAsmFile(Module &M) override;
+ void emitStartOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &) override;
- void EmitFunctionEntryLabel() override;
+ void emitFunctionEntryLabel() override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
+ void emitInstruction(const MachineInstr *MI) override;
};
class PPCAIXAsmPrinter : public PPCAsmPrinter {
private:
static void ValidateGV(const GlobalVariable *GV);
-protected:
- MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO) override;
public:
PPCAIXAsmPrinter(TargetMachine &TM, std::unique_ptr<MCStreamer> Streamer)
- : PPCAsmPrinter(TM, std::move(Streamer)) {}
+ : PPCAsmPrinter(TM, std::move(Streamer)) {
+ if (MAI->isLittleEndian())
+ report_fatal_error(
+ "cannot create AIX PPC Assembly Printer for a little-endian target");
+ }
StringRef getPassName() const override { return "AIX PPC Assembly Printer"; }
+ bool doInitialization(Module &M) override;
+
void SetupMachineFunction(MachineFunction &MF) override;
- const MCExpr *lowerConstant(const Constant *CV) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
- void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void emitFunctionDescriptor() override;
- void EmitFunctionDescriptor() override;
+ void emitEndOfAsmFile(Module &) override;
- void EmitEndOfAsmFile(Module &) override;
+ void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const override;
};
} // end anonymous namespace
@@ -176,23 +178,7 @@ void PPCAsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
raw_ostream &O) {
// Computing the address of a global symbol, not calling it.
const GlobalValue *GV = MO.getGlobal();
- MCSymbol *SymToPrint;
-
- // External or weakly linked global variables need non-lazily-resolved stubs
- if (Subtarget->hasLazyResolverStub(GV)) {
- SymToPrint = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
- MachineModuleInfoImpl::StubValueTy &StubSym =
- MMI->getObjFileInfo<MachineModuleInfoMachO>().getGVStubEntry(
- SymToPrint);
- if (!StubSym.getPointer())
- StubSym = MachineModuleInfoImpl::StubValueTy(getSymbol(GV),
- !GV->hasInternalLinkage());
- } else {
- SymToPrint = getSymbol(GV);
- }
-
- SymToPrint->print(O, MAI);
-
+ getSymbol(GV)->print(O, MAI);
printOffset(MO.getOffset(), O);
}
@@ -208,9 +194,7 @@ void PPCAsmPrinter::printOperand(const MachineInstr *MI, unsigned OpNo,
// Linux assembler (Others?) does not take register mnemonics.
// FIXME - What about special registers used in mfspr/mtspr?
- if (!Subtarget->isDarwin())
- RegName = PPCRegisterInfo::stripRegisterPrefix(RegName);
- O << RegName;
+ O << PPCRegisterInfo::stripRegisterPrefix(RegName);
return;
}
case MachineOperand::MO_Immediate:
@@ -298,18 +282,17 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
switch (ExtraCode[0]) {
default: return true; // Unknown modifier.
+ case 'L': // A memory reference to the upper word of a double word op.
+ O << getDataLayout().getPointerSize() << "(";
+ printOperand(MI, OpNo, O);
+ O << ")";
+ return false;
case 'y': // A memory reference for an X-form instruction
- {
- const char *RegName = "r0";
- if (!Subtarget->isDarwin())
- RegName = PPCRegisterInfo::stripRegisterPrefix(RegName);
- O << RegName << ", ";
- printOperand(MI, OpNo, O);
- return false;
- }
+ O << "0, ";
+ printOperand(MI, OpNo, O);
+ return false;
case 'U': // Print 'u' for update form.
case 'X': // Print 'x' for indexed form.
- {
// FIXME: Currently for PowerPC memory operands are always loaded
// into a register, so we never get an update or indexed form.
// This is bad even for offset forms, since even if we know we
@@ -319,7 +302,6 @@ bool PPCAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
assert(MI->getOperand(OpNo).isReg());
return false;
}
- }
}
assert(MI->getOperand(OpNo).isReg());
@@ -339,7 +321,7 @@ MCSymbol *PPCAsmPrinter::lookUpOrCreateTOCEntry(const MCSymbol *Sym) {
return TOCEntry;
}
-void PPCAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void PPCAsmPrinter::emitEndOfAsmFile(Module &M) {
emitStackMaps(SM);
}
@@ -348,7 +330,7 @@ void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
assert(NumNOPBytes % 4 == 0 && "Invalid number of NOP bytes requested!");
@@ -377,7 +359,7 @@ void PPCAsmPrinter::LowerSTACKMAP(StackMaps &SM, const MachineInstr &MI) {
void PPCAsmPrinter::LowerPATCHPOINT(StackMaps &SM, const MachineInstr &MI) {
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers Opers(&MI);
@@ -516,16 +498,17 @@ void PPCAsmPrinter::EmitTlsCall(const MachineInstr *MI,
/// Map a machine operand for a TOC pseudo-machine instruction to its
/// corresponding MCSymbol.
-MCSymbol *PPCAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) {
+static MCSymbol *getMCSymbolForTOCPseudoMO(const MachineOperand &MO,
+ AsmPrinter &AP) {
switch (MO.getType()) {
case MachineOperand::MO_GlobalAddress:
- return getSymbol(MO.getGlobal());
+ return AP.getSymbol(MO.getGlobal());
case MachineOperand::MO_ConstantPoolIndex:
- return GetCPISymbol(MO.getIndex());
+ return AP.GetCPISymbol(MO.getIndex());
case MachineOperand::MO_JumpTableIndex:
- return GetJTISymbol(MO.getIndex());
+ return AP.GetJTISymbol(MO.getIndex());
case MachineOperand::MO_BlockAddress:
- return GetBlockAddressSymbol(MO.getBlockAddress());
+ return AP.GetBlockAddressSymbol(MO.getBlockAddress());
default:
llvm_unreachable("Unexpected operand type to get symbol.");
}
@@ -534,9 +517,8 @@ MCSymbol *PPCAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) {
/// EmitInstruction -- Print out a single PowerPC MI in Darwin syntax to
/// the current output stream.
///
-void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void PPCAsmPrinter::emitInstruction(const MachineInstr *MI) {
MCInst TmpInst;
- const bool IsDarwin = TM.getTargetTriple().isOSDarwin();
const bool IsPPC64 = Subtarget->isPPC64();
const bool IsAIX = Subtarget->isAIXABI();
const Module *M = MF->getFunction().getParent();
@@ -614,7 +596,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
.addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
// Emit the label.
- OutStreamer->EmitLabel(PICBase);
+ OutStreamer->emitLabel(PICBase);
return;
}
case PPC::UpdateGBR: {
@@ -625,7 +607,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// addis r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@ha
// addi r30, r30, {.LTOC,_GLOBAL_OFFSET_TABLE} - .L0$pb@l
// Get the offset from the GOT Base Register to the GOT
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
if (Subtarget->isSecurePlt() && isPositionIndependent() ) {
unsigned PICR = TmpInst.getOperand(0).getReg();
MCSymbol *BaseSymbol = OutContext.getOrCreateSymbol(
@@ -637,19 +619,19 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
const MCExpr *DeltaExpr = MCBinaryExpr::createSub(
MCSymbolRefExpr::create(BaseSymbol, OutContext), PB, OutContext);
- const MCExpr *DeltaHi = PPCMCExpr::createHa(DeltaExpr, false, OutContext);
+ const MCExpr *DeltaHi = PPCMCExpr::createHa(DeltaExpr, OutContext);
EmitToStreamer(
*OutStreamer,
MCInstBuilder(PPC::ADDIS).addReg(PICR).addReg(PICR).addExpr(DeltaHi));
- const MCExpr *DeltaLo = PPCMCExpr::createLo(DeltaExpr, false, OutContext);
+ const MCExpr *DeltaLo = PPCMCExpr::createLo(DeltaExpr, OutContext);
EmitToStreamer(
*OutStreamer,
MCInstBuilder(PPC::ADDI).addReg(PICR).addReg(PICR).addExpr(DeltaLo));
return;
} else {
MCSymbol *PICOffset =
- MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol();
+ MF->getInfo<PPCFunctionInfo>()->getPICOffsetSymbol(*MF);
TmpInst.setOpcode(PPC::LWZ);
const MCExpr *Exp =
MCSymbolRefExpr::create(PICOffset, MCSymbolRefExpr::VK_None, OutContext);
@@ -676,10 +658,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
case PPC::LWZtoc: {
- assert(!IsDarwin && "TOC is an ELF/XCOFF construct.");
-
// Transform %rN = LWZtoc @op1, %r2
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to LWZ.
TmpInst.setOpcode(PPC::LWZ);
@@ -689,7 +669,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
"Invalid operand for LWZtoc.");
// Map the operand to its corresponding MCSymbol.
- const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO);
+ const MCSymbol *const MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
// Create a reference to the GOT entry for the symbol. The GOT entry will be
// synthesized later.
@@ -734,10 +714,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::LDtocCPT:
case PPC::LDtocBA:
case PPC::LDtoc: {
- assert(!IsDarwin && "TOC is an ELF/XCOFF construct");
-
// Transform %x3 = LDtoc @min1, %x2
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to LD.
TmpInst.setOpcode(PPC::LD);
@@ -750,7 +728,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// global address operand to be a reference to the TOC entry we will
// synthesize later.
MCSymbol *TOCEntry =
- lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO));
+ lookUpOrCreateTOCEntry(getMCSymbolForTOCPseudoMO(MO, *this));
const MCSymbolRefExpr::VariantKind VK =
IsAIX ? MCSymbolRefExpr::VK_None : MCSymbolRefExpr::VK_PPC_TOC;
@@ -766,7 +744,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
" AIX.");
// Transform %rd = ADDIStocHA %rA, @sym(%r2)
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to ADDIS.
TmpInst.setOpcode(PPC::ADDIS);
@@ -776,7 +754,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
"Invalid operand for ADDIStocHA.");
// Map the machine operand to its corresponding MCSymbol.
- MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO);
+ MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
// Always use TOC on AIX. Map the global address operand to be a reference
// to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
@@ -796,7 +774,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
" AIX.");
// Transform %rd = LWZtocL @sym, %rs.
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to lwz.
TmpInst.setOpcode(PPC::LWZ);
@@ -806,7 +784,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
"Invalid operand for LWZtocL.");
// Map the machine operand to its corresponding MCSymbol.
- MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO);
+ MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
// Always use TOC on AIX. Map the global address operand to be a reference
// to the TOC entry we will synthesize later. 'TOCEntry' is a label used to
@@ -821,10 +799,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::ADDIStocHA8: {
- assert(!IsDarwin && "TOC is an ELF/XCOFF construct");
-
// Transform %xd = ADDIStocHA8 %x2, @sym
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to ADDIS8. If the global address is the address of
// an external symbol, is a jump table address, is a block address, or is a
@@ -836,7 +812,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
assert((MO.isGlobal() || MO.isCPI() || MO.isJTI() || MO.isBlockAddress()) &&
"Invalid operand for ADDIStocHA8!");
- const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO);
+ const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
const bool GlobalToc =
MO.isGlobal() && Subtarget->isGVIndirectSymbol(MO.getGlobal());
@@ -861,10 +837,8 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
return;
}
case PPC::LDtocL: {
- assert(!IsDarwin && "TOC is an ELF/XCOFF construct");
-
// Transform %xd = LDtocL @sym, %xs
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to LD. If the global address is the address of
// an external symbol, is a jump table address, is a block address, or is
@@ -882,7 +856,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
"LDtocL used on symbol that could be accessed directly is "
"invalid. Must match ADDIStocHA8."));
- const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO);
+ const MCSymbol *MOSymbol = getMCSymbolForTOCPseudoMO(MO, *this);
if (!MO.isCPI() || TM.getCodeModel() == CodeModel::Large)
MOSymbol = lookUpOrCreateTOCEntry(MOSymbol);
@@ -897,7 +871,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
case PPC::ADDItocL: {
// Transform %xd = ADDItocL %xs, @sym
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to ADDI8. If the global address is external, then
// generate a TOC entry and reference that. Otherwise, reference the
@@ -912,7 +886,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
"Interposable definitions must use indirect access."));
const MCExpr *Exp =
- MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO),
+ MCSymbolRefExpr::create(getMCSymbolForTOCPseudoMO(MO, *this),
MCSymbolRefExpr::VK_PPC_TOC_LO, OutContext);
TmpInst.getOperand(2) = MCOperand::createExpr(Exp);
EmitToStreamer(*OutStreamer, TmpInst);
@@ -937,7 +911,7 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case PPC::LDgotTprelL:
case PPC::LDgotTprelL32: {
// Transform %xd = LDgotTprelL @sym, %xs
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
// Change the opcode to LD.
TmpInst.setOpcode(IsPPC64 ? PPC::LD : PPC::LWZ);
@@ -966,9 +940,9 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
MCBinaryExpr::createSub(MCSymbolRefExpr::create(GOTSymbol, OutContext),
MCSymbolRefExpr::create(GOTRef, OutContext),
OutContext);
- OutStreamer->EmitLabel(GOTRef);
- OutStreamer->EmitValue(OffsExpr, 4);
- OutStreamer->EmitLabel(NextInstr);
+ OutStreamer->emitLabel(GOTRef);
+ OutStreamer->emitValue(OffsExpr, 4);
+ OutStreamer->emitLabel(NextInstr);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MFLR)
.addReg(MI->getOperand(0).getReg()));
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::LWZ)
@@ -1167,10 +1141,11 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// suite shows a handful of test cases that fail this check for
// Darwin. Those need to be investigated before this sanity test
// can be enabled for those subtargets.
- if (!IsDarwin) {
- unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
- const MachineOperand &MO = MI->getOperand(OpNum);
- if (MO.isGlobal() && MO.getGlobal()->getAlignment() < 4)
+ unsigned OpNum = (MI->getOpcode() == PPC::STD) ? 2 : 1;
+ const MachineOperand &MO = MI->getOperand(OpNum);
+ if (MO.isGlobal()) {
+ const DataLayout &DL = MO.getGlobal()->getParent()->getDataLayout();
+ if (MO.getGlobal()->getPointerAlignment(DL) < 4)
llvm_unreachable("Global must be word-aligned for LD, STD, LWA!");
}
// Now process the instruction normally.
@@ -1178,17 +1153,17 @@ void PPCAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
- LowerPPCMachineInstrToMCInst(MI, TmpInst, *this, IsDarwin);
+ LowerPPCMachineInstrToMCInst(MI, TmpInst, *this);
EmitToStreamer(*OutStreamer, TmpInst);
}
-void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void PPCLinuxAsmPrinter::emitInstruction(const MachineInstr *MI) {
if (!Subtarget->isPPC64())
- return PPCAsmPrinter::EmitInstruction(MI);
+ return PPCAsmPrinter::emitInstruction(MI);
switch (MI->getOpcode()) {
default:
- return PPCAsmPrinter::EmitInstruction(MI);
+ return PPCAsmPrinter::emitInstruction(MI);
case TargetOpcode::PATCHABLE_FUNCTION_ENTER: {
// .begin:
// b .end # lis 0, FuncId[16..32]
@@ -1203,7 +1178,7 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// of instructions change.
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
MCSymbol *EndOfSled = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(BeginOfSled);
+ OutStreamer->emitLabel(BeginOfSled);
EmitToStreamer(*OutStreamer,
MCInstBuilder(PPC::B).addExpr(
MCSymbolRefExpr::create(EndOfSled, OutContext)));
@@ -1218,8 +1193,8 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
OutContext.getOrCreateSymbol("__xray_FunctionEntry"),
OutContext)));
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
- OutStreamer->EmitLabel(EndOfSled);
- recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER);
+ OutStreamer->emitLabel(EndOfSled);
+ recordSled(BeginOfSled, *MI, SledKind::FUNCTION_ENTER, 2);
break;
}
case TargetOpcode::PATCHABLE_RET: {
@@ -1229,7 +1204,7 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
for (const auto &MO :
make_range(std::next(MI->operands_begin()), MI->operands_end())) {
MCOperand MCOp;
- if (LowerPPCMachineOperandToMCOperand(MO, MCOp, *this, false))
+ if (LowerPPCMachineOperandToMCOperand(MO, MCOp, *this))
RetInst.addOperand(MCOp);
}
@@ -1289,9 +1264,9 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
//
// Update compiler-rt/lib/xray/xray_powerpc64.cc accordingly when number
// of instructions change.
- OutStreamer->EmitCodeAlignment(8);
+ OutStreamer->emitCodeAlignment(8);
MCSymbol *BeginOfSled = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(BeginOfSled);
+ OutStreamer->emitLabel(BeginOfSled);
EmitToStreamer(*OutStreamer, RetInst);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::NOP));
EmitToStreamer(
@@ -1306,8 +1281,8 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::MTLR8).addReg(PPC::X0));
EmitToStreamer(*OutStreamer, RetInst);
if (IsConditional)
- OutStreamer->EmitLabel(FallthroughLabel);
- recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT);
+ OutStreamer->emitLabel(FallthroughLabel);
+ recordSled(BeginOfSled, *MI, SledKind::FUNCTION_EXIT, 2);
break;
}
case TargetOpcode::PATCHABLE_FUNCTION_EXIT:
@@ -1320,7 +1295,7 @@ void PPCLinuxAsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
-void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
+void PPCLinuxAsmPrinter::emitStartOfAsmFile(Module &M) {
if (static_cast<const PPCTargetMachine &>(TM).isELFv2ABI()) {
PPCTargetStreamer *TS =
static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
@@ -1331,10 +1306,10 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
if (static_cast<const PPCTargetMachine &>(TM).isPPC64() ||
!isPositionIndependent())
- return AsmPrinter::EmitStartOfAsmFile(M);
+ return AsmPrinter::emitStartOfAsmFile(M);
if (M.getPICLevel() == PICLevel::SmallPIC)
- return AsmPrinter::EmitStartOfAsmFile(M);
+ return AsmPrinter::emitStartOfAsmFile(M);
OutStreamer->SwitchSection(OutContext.getELFSection(
".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC));
@@ -1342,7 +1317,7 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
MCSymbol *TOCSym = OutContext.getOrCreateSymbol(Twine(".LTOC"));
MCSymbol *CurrentPos = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(CurrentPos);
+ OutStreamer->emitLabel(CurrentPos);
// The GOT pointer points to the middle of the GOT, in order to reference the
// entire 64kB range. 0x8000 is the midpoint.
@@ -1351,24 +1326,24 @@ void PPCLinuxAsmPrinter::EmitStartOfAsmFile(Module &M) {
MCConstantExpr::create(0x8000, OutContext),
OutContext);
- OutStreamer->EmitAssignment(TOCSym, tocExpr);
+ OutStreamer->emitAssignment(TOCSym, tocExpr);
OutStreamer->SwitchSection(getObjFileLowering().getTextSection());
}
-void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
+void PPCLinuxAsmPrinter::emitFunctionEntryLabel() {
// linux/ppc32 - Normal entry label.
if (!Subtarget->isPPC64() &&
(!isPositionIndependent() ||
MF->getFunction().getParent()->getPICLevel() == PICLevel::SmallPIC))
- return AsmPrinter::EmitFunctionEntryLabel();
+ return AsmPrinter::emitFunctionEntryLabel();
if (!Subtarget->isPPC64()) {
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
if (PPCFI->usesPICBase() && !Subtarget->isSecurePlt()) {
- MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol();
+ MCSymbol *RelocSymbol = PPCFI->getPICOffsetSymbol(*MF);
MCSymbol *PICBase = MF->getPICBaseSymbol();
- OutStreamer->EmitLabel(RelocSymbol);
+ OutStreamer->emitLabel(RelocSymbol);
const MCExpr *OffsExpr =
MCBinaryExpr::createSub(
@@ -1376,11 +1351,11 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
OutContext),
MCSymbolRefExpr::create(PICBase, OutContext),
OutContext);
- OutStreamer->EmitValue(OffsExpr, 4);
- OutStreamer->EmitLabel(CurrentFnSym);
+ OutStreamer->emitValue(OffsExpr, 4);
+ OutStreamer->emitLabel(CurrentFnSym);
return;
} else
- return AsmPrinter::EmitFunctionEntryLabel();
+ return AsmPrinter::emitFunctionEntryLabel();
}
// ELFv2 ABI - Normal entry label.
@@ -1394,17 +1369,17 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
MCSymbol *TOCSymbol = OutContext.getOrCreateSymbol(StringRef(".TOC."));
- MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol();
+ MCSymbol *GlobalEPSymbol = PPCFI->getGlobalEPSymbol(*MF);
const MCExpr *TOCDeltaExpr =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
MCSymbolRefExpr::create(GlobalEPSymbol,
OutContext),
OutContext);
- OutStreamer->EmitLabel(PPCFI->getTOCOffsetSymbol());
- OutStreamer->EmitValue(TOCDeltaExpr, 8);
+ OutStreamer->emitLabel(PPCFI->getTOCOffsetSymbol(*MF));
+ OutStreamer->emitValue(TOCDeltaExpr, 8);
}
- return AsmPrinter::EmitFunctionEntryLabel();
+ return AsmPrinter::emitFunctionEntryLabel();
}
// Emit an official procedure descriptor.
@@ -1412,61 +1387,56 @@ void PPCLinuxAsmPrinter::EmitFunctionEntryLabel() {
MCSectionELF *Section = OutStreamer->getContext().getELFSection(
".opd", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
OutStreamer->SwitchSection(Section);
- OutStreamer->EmitLabel(CurrentFnSym);
- OutStreamer->EmitValueToAlignment(8);
+ OutStreamer->emitLabel(CurrentFnSym);
+ OutStreamer->emitValueToAlignment(8);
MCSymbol *Symbol1 = CurrentFnSymForSize;
// Generates a R_PPC64_ADDR64 (from FK_DATA_8) relocation for the function
// entry point.
- OutStreamer->EmitValue(MCSymbolRefExpr::create(Symbol1, OutContext),
+ OutStreamer->emitValue(MCSymbolRefExpr::create(Symbol1, OutContext),
8 /*size*/);
MCSymbol *Symbol2 = OutContext.getOrCreateSymbol(StringRef(".TOC."));
// Generates a R_PPC64_TOC relocation for TOC base insertion.
- OutStreamer->EmitValue(
+ OutStreamer->emitValue(
MCSymbolRefExpr::create(Symbol2, MCSymbolRefExpr::VK_PPC_TOCBASE, OutContext),
8/*size*/);
// Emit a null environment pointer.
- OutStreamer->EmitIntValue(0, 8 /* size */);
+ OutStreamer->emitIntValue(0, 8 /* size */);
OutStreamer->SwitchSection(Current.first, Current.second);
}
-bool PPCLinuxAsmPrinter::doFinalization(Module &M) {
+void PPCLinuxAsmPrinter::emitEndOfAsmFile(Module &M) {
const DataLayout &DL = getDataLayout();
bool isPPC64 = DL.getPointerSizeInBits() == 64;
- PPCTargetStreamer &TS =
- static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer());
+ PPCTargetStreamer *TS =
+ static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
if (!TOC.empty()) {
- MCSectionELF *Section;
-
- if (isPPC64)
- Section = OutStreamer->getContext().getELFSection(
- ".toc", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
- else
- Section = OutStreamer->getContext().getELFSection(
- ".got2", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
+ const char *Name = isPPC64 ? ".toc" : ".got2";
+ MCSectionELF *Section = OutContext.getELFSection(
+ Name, ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
OutStreamer->SwitchSection(Section);
+ if (!isPPC64)
+ OutStreamer->emitValueToAlignment(4);
for (const auto &TOCMapPair : TOC) {
const MCSymbol *const TOCEntryTarget = TOCMapPair.first;
MCSymbol *const TOCEntryLabel = TOCMapPair.second;
- OutStreamer->EmitLabel(TOCEntryLabel);
- if (isPPC64) {
- TS.emitTCEntry(*TOCEntryTarget);
- } else {
- OutStreamer->EmitValueToAlignment(4);
- OutStreamer->EmitSymbolValue(TOCEntryTarget, 4);
- }
+ OutStreamer->emitLabel(TOCEntryLabel);
+ if (isPPC64 && TS != nullptr)
+ TS->emitTCEntry(*TOCEntryTarget);
+ else
+ OutStreamer->emitSymbolValue(TOCEntryTarget, 4);
}
}
- return AsmPrinter::doFinalization(M);
+ PPCAsmPrinter::emitEndOfAsmFile(M);
}
/// EmitFunctionBodyStart - Emit a global entry point prefix for ELFv2.
-void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
+void PPCLinuxAsmPrinter::emitFunctionBodyStart() {
// In the ELFv2 ABI, in functions that use the TOC register, we need to
// provide two entry points. The ABI guarantees that when calling the
// local entry point, r2 is set up by the caller to contain the TOC base
@@ -1498,16 +1468,23 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
//
// This ensures we have r2 set up correctly while executing the function
// body, no matter which entry point is called.
- if (Subtarget->isELFv2ABI()
- // Only do all that if the function uses r2 in the first place.
- && !MF->getRegInfo().use_empty(PPC::X2)) {
+ const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
+ const bool UsesX2OrR2 = !MF->getRegInfo().use_empty(PPC::X2) ||
+ !MF->getRegInfo().use_empty(PPC::R2);
+ const bool PCrelGEPRequired = Subtarget->isUsingPCRelativeCalls() &&
+ UsesX2OrR2 && PPCFI->usesTOCBasePtr();
+ const bool NonPCrelGEPRequired = !Subtarget->isUsingPCRelativeCalls() &&
+ Subtarget->isELFv2ABI() && UsesX2OrR2;
+
+ // Only do all that if the function uses R2 as the TOC pointer
+ // in the first place. We don't need the global entry point if the
+ // function uses R2 as an allocatable register.
+ if (NonPCrelGEPRequired || PCrelGEPRequired) {
// Note: The logic here must be synchronized with the code in the
// branch-selection pass which sets the offset of the first block in the
// function. This matters because it affects the alignment.
- const PPCFunctionInfo *PPCFI = MF->getInfo<PPCFunctionInfo>();
-
- MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol();
- OutStreamer->EmitLabel(GlobalEntryLabel);
+ MCSymbol *GlobalEntryLabel = PPCFI->getGlobalEPSymbol(*MF);
+ OutStreamer->emitLabel(GlobalEntryLabel);
const MCSymbolRefExpr *GlobalEntryLabelExp =
MCSymbolRefExpr::create(GlobalEntryLabel, OutContext);
@@ -1517,21 +1494,19 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCSymbol, OutContext),
GlobalEntryLabelExp, OutContext);
- const MCExpr *TOCDeltaHi =
- PPCMCExpr::createHa(TOCDeltaExpr, false, OutContext);
+ const MCExpr *TOCDeltaHi = PPCMCExpr::createHa(TOCDeltaExpr, OutContext);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDIS)
.addReg(PPC::X2)
.addReg(PPC::X12)
.addExpr(TOCDeltaHi));
- const MCExpr *TOCDeltaLo =
- PPCMCExpr::createLo(TOCDeltaExpr, false, OutContext);
+ const MCExpr *TOCDeltaLo = PPCMCExpr::createLo(TOCDeltaExpr, OutContext);
EmitToStreamer(*OutStreamer, MCInstBuilder(PPC::ADDI)
.addReg(PPC::X2)
.addReg(PPC::X2)
.addExpr(TOCDeltaLo));
} else {
- MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol();
+ MCSymbol *TOCOffset = PPCFI->getTOCOffsetSymbol(*MF);
const MCExpr *TOCOffsetDeltaExpr =
MCBinaryExpr::createSub(MCSymbolRefExpr::create(TOCOffset, OutContext),
GlobalEntryLabelExp, OutContext);
@@ -1546,8 +1521,8 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
.addReg(PPC::X12));
}
- MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol();
- OutStreamer->EmitLabel(LocalEntryLabel);
+ MCSymbol *LocalEntryLabel = PPCFI->getLocalEPSymbol(*MF);
+ OutStreamer->emitLabel(LocalEntryLabel);
const MCSymbolRefExpr *LocalEntryLabelExp =
MCSymbolRefExpr::create(LocalEntryLabel, OutContext);
const MCExpr *LocalOffsetExp =
@@ -1559,13 +1534,43 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyStart() {
if (TS)
TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym), LocalOffsetExp);
+ } else if (Subtarget->isUsingPCRelativeCalls()) {
+ // When generating the entry point for a function we have a few scenarios
+ // based on whether or not that function uses R2 and whether or not that
+ // function makes calls (or is a leaf function).
+ // 1) A leaf function that does not use R2 (or treats it as callee-saved
+ // and preserves it). In this case st_other=0 and both
+ // the local and global entry points for the function are the same.
+ // No special entry point code is required.
+ // 2) A function uses the TOC pointer R2. This function may or may not have
+ // calls. In this case st_other=[2,6] and the global and local entry
+ // points are different. Code to correctly set up the TOC pointer in R2
+ // is put between the global and local entry points. This case is
+ // covered by the if statement above.
+ // 3) A function does not use the TOC pointer R2 but does have calls.
+ // In this case st_other=1 since we do not know whether or not any
+ // of the callees clobber R2. This case is dealt with in this else if
+ // block. Tail calls are considered calls and the st_other should also
+ // be set to 1 in that case as well.
+ // 4) The function does not use the TOC pointer but R2 is used inside
+ // the function. In this case st_other=1 once again.
+ // 5) This function uses inline asm. We mark R2 as reserved if the function
+ // has inline asm as we have to assume that it may be used.
+ if (MF->getFrameInfo().hasCalls() || MF->getFrameInfo().hasTailCall() ||
+ MF->hasInlineAsm() || (!PPCFI->usesTOCBasePtr() && UsesX2OrR2)) {
+ PPCTargetStreamer *TS =
+ static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+ if (TS)
+ TS->emitLocalEntry(cast<MCSymbolELF>(CurrentFnSym),
+ MCConstantExpr::create(1, OutContext));
+ }
}
}
/// EmitFunctionBodyEnd - Print the traceback table before the .size
/// directive.
///
-void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
+void PPCLinuxAsmPrinter::emitFunctionBodyEnd() {
// Only the 64-bit target requires a traceback table. For now,
// we only emit the word of zeroes that GDB requires to find
// the end of the function, and zeroes for the eight-byte
@@ -1574,19 +1579,73 @@ void PPCLinuxAsmPrinter::EmitFunctionBodyEnd() {
// the PPC64 ELF ABI (this is a low-priority item because GDB does not
// currently make use of these fields).
if (Subtarget->isPPC64()) {
- OutStreamer->EmitIntValue(0, 4/*size*/);
- OutStreamer->EmitIntValue(0, 8/*size*/);
+ OutStreamer->emitIntValue(0, 4/*size*/);
+ OutStreamer->emitIntValue(0, 8/*size*/);
+ }
+}
+
+void PPCAIXAsmPrinter::emitLinkage(const GlobalValue *GV,
+ MCSymbol *GVSym) const {
+
+ assert(MAI->hasVisibilityOnlyWithLinkage() &&
+ "AIX's linkage directives take a visibility setting.");
+
+ MCSymbolAttr LinkageAttr = MCSA_Invalid;
+ switch (GV->getLinkage()) {
+ case GlobalValue::ExternalLinkage:
+ LinkageAttr = GV->isDeclaration() ? MCSA_Extern : MCSA_Global;
+ break;
+ case GlobalValue::LinkOnceAnyLinkage:
+ case GlobalValue::LinkOnceODRLinkage:
+ case GlobalValue::WeakAnyLinkage:
+ case GlobalValue::WeakODRLinkage:
+ case GlobalValue::ExternalWeakLinkage:
+ LinkageAttr = MCSA_Weak;
+ break;
+ case GlobalValue::AvailableExternallyLinkage:
+ LinkageAttr = MCSA_Extern;
+ break;
+ case GlobalValue::PrivateLinkage:
+ return;
+ case GlobalValue::InternalLinkage:
+ assert(GV->getVisibility() == GlobalValue::DefaultVisibility &&
+ "InternalLinkage should not have other visibility setting.");
+ LinkageAttr = MCSA_LGlobal;
+ break;
+ case GlobalValue::AppendingLinkage:
+ llvm_unreachable("Should never emit this");
+ case GlobalValue::CommonLinkage:
+ llvm_unreachable("CommonLinkage of XCOFF should not come to this path");
}
+
+ assert(LinkageAttr != MCSA_Invalid && "LinkageAttr should not be MCSA_Invalid.");
+
+ MCSymbolAttr VisibilityAttr = MCSA_Invalid;
+ switch (GV->getVisibility()) {
+
+ // TODO: "exported" and "internal" Visibility needs to go here.
+ case GlobalValue::DefaultVisibility:
+ break;
+ case GlobalValue::HiddenVisibility:
+ VisibilityAttr = MAI->getHiddenVisibilityAttr();
+ break;
+ case GlobalValue::ProtectedVisibility:
+ VisibilityAttr = MAI->getProtectedVisibilityAttr();
+ break;
+ }
+
+ OutStreamer->emitXCOFFSymbolLinkageWithVisibility(GVSym, LinkageAttr,
+ VisibilityAttr);
}
void PPCAIXAsmPrinter::SetupMachineFunction(MachineFunction &MF) {
- // Get the function descriptor symbol.
- CurrentFnDescSym = getSymbol(&MF.getFunction());
- // Set the containing csect.
- MCSectionXCOFF *FnDescSec = OutStreamer->getContext().getXCOFFSection(
- CurrentFnDescSym->getName(), XCOFF::XMC_DS, XCOFF::XTY_SD,
- XCOFF::C_HIDEXT, SectionKind::getData());
- cast<MCSymbolXCOFF>(CurrentFnDescSym)->setContainingCsect(FnDescSec);
+ // Setup CurrentFnDescSym and its containing csect.
+ MCSectionXCOFF *FnDescSec =
+ cast<MCSectionXCOFF>(getObjFileLowering().getSectionForFunctionDescriptor(
+ &MF.getFunction(), TM));
+ FnDescSec->setAlignment(Align(Subtarget->isPPC64() ? 8 : 4));
+
+ CurrentFnDescSym = FnDescSec->getQualNameSymbol();
return AsmPrinter::SetupMachineFunction(MF);
}
@@ -1603,31 +1662,20 @@ void PPCAIXAsmPrinter::ValidateGV(const GlobalVariable *GV) {
report_fatal_error("COMDAT not yet supported by AIX.");
}
-const MCExpr *PPCAIXAsmPrinter::lowerConstant(const Constant *CV) {
- if (const Function *F = dyn_cast<Function>(CV)) {
- MCSymbolXCOFF *FSym = cast<MCSymbolXCOFF>(getSymbol(F));
- if (!FSym->hasContainingCsect()) {
- const XCOFF::StorageClass SC =
- F->isDeclaration()
- ? TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(F)
- : XCOFF::C_HIDEXT;
- MCSectionXCOFF *Csect = OutStreamer->getContext().getXCOFFSection(
- FSym->getName(), XCOFF::XMC_DS,
- F->isDeclaration() ? XCOFF::XTY_ER : XCOFF::XTY_SD, SC,
- SectionKind::getData());
- FSym->setContainingCsect(Csect);
- }
- return MCSymbolRefExpr::create(
- FSym->getContainingCsect()->getQualNameSymbol(), OutContext);
- }
- return PPCAsmPrinter::lowerConstant(CV);
+static bool isSpecialLLVMGlobalArrayForStaticInit(const GlobalVariable *GV) {
+ return StringSwitch<bool>(GV->getName())
+ .Cases("llvm.global_ctors", "llvm.global_dtors", true)
+ .Default(false);
}
-void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void PPCAIXAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
ValidateGV(GV);
- // External global variables are already handled.
- if (!GV->hasInitializer())
+ // TODO: Update the handling of global arrays for static init when we support
+ // the ".ref" directive.
+ // For now, we can skip these arrays, because the AIX linker collects
+ // static init functions simply based on their name.
+ if (isSpecialLLVMGlobalArrayForStaticInit(GV))
return;
// Create the symbol, set its storage class.
@@ -1635,156 +1683,133 @@ void PPCAIXAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
GVSym->setStorageClass(
TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GV));
+ if (GV->isDeclarationForLinker()) {
+ emitLinkage(GV, GVSym);
+ return;
+ }
+
SectionKind GVKind = getObjFileLowering().getKindForGlobal(GV, TM);
- if ((!GVKind.isGlobalWriteableData() && !GVKind.isReadOnly()) ||
- GVKind.isMergeable2ByteCString() || GVKind.isMergeable4ByteCString())
+ if (!GVKind.isGlobalWriteableData() && !GVKind.isReadOnly())
report_fatal_error("Encountered a global variable kind that is "
"not supported yet.");
- // Create the containing csect and switch to it.
MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
getObjFileLowering().SectionForGlobal(GV, GVKind, TM));
+
+ // Switch to the containing csect.
OutStreamer->SwitchSection(Csect);
- GVSym->setContainingCsect(Csect);
const DataLayout &DL = GV->getParent()->getDataLayout();
// Handle common symbols.
if (GVKind.isCommon() || GVKind.isBSSLocal()) {
- unsigned Align =
- GV->getAlignment() ? GV->getAlignment() : DL.getPreferredAlignment(GV);
+ Align Alignment = GV->getAlign().getValueOr(DL.getPreferredAlign(GV));
uint64_t Size = DL.getTypeAllocSize(GV->getType()->getElementType());
if (GVKind.isBSSLocal())
- OutStreamer->EmitXCOFFLocalCommonSymbol(
- GVSym, Size, Csect->getQualNameSymbol(), Align);
+ OutStreamer->emitXCOFFLocalCommonSymbol(
+ OutContext.getOrCreateSymbol(GVSym->getUnqualifiedName()), Size,
+ GVSym, Alignment.value());
else
- OutStreamer->EmitCommonSymbol(Csect->getQualNameSymbol(), Size, Align);
+ OutStreamer->emitCommonSymbol(GVSym, Size, Alignment.value());
return;
}
MCSymbol *EmittedInitSym = GVSym;
- EmitLinkage(GV, EmittedInitSym);
- EmitAlignment(getGVAlignment(GV, DL), GV);
- OutStreamer->EmitLabel(EmittedInitSym);
- EmitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
+ emitLinkage(GV, EmittedInitSym);
+ emitAlignment(getGVAlignment(GV, DL), GV);
+ OutStreamer->emitLabel(EmittedInitSym);
+ emitGlobalConstant(GV->getParent()->getDataLayout(), GV->getInitializer());
}
-void PPCAIXAsmPrinter::EmitFunctionDescriptor() {
+void PPCAIXAsmPrinter::emitFunctionDescriptor() {
const DataLayout &DL = getDataLayout();
const unsigned PointerSize = DL.getPointerSizeInBits() == 64 ? 8 : 4;
MCSectionSubPair Current = OutStreamer->getCurrentSection();
// Emit function descriptor.
OutStreamer->SwitchSection(
- cast<MCSymbolXCOFF>(CurrentFnDescSym)->getContainingCsect());
- OutStreamer->EmitLabel(CurrentFnDescSym);
+ cast<MCSymbolXCOFF>(CurrentFnDescSym)->getRepresentedCsect());
// Emit function entry point address.
- OutStreamer->EmitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
+ OutStreamer->emitValue(MCSymbolRefExpr::create(CurrentFnSym, OutContext),
PointerSize);
// Emit TOC base address.
- const MCSectionXCOFF *TOCBaseSec = OutStreamer->getContext().getXCOFFSection(
- StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT,
- SectionKind::getData());
- const MCSymbol *TOCBaseSym = TOCBaseSec->getQualNameSymbol();
- OutStreamer->EmitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
+ const MCSymbol *TOCBaseSym =
+ cast<MCSectionXCOFF>(getObjFileLowering().getTOCBaseSection())
+ ->getQualNameSymbol();
+ OutStreamer->emitValue(MCSymbolRefExpr::create(TOCBaseSym, OutContext),
PointerSize);
// Emit a null environment pointer.
- OutStreamer->EmitIntValue(0, PointerSize);
+ OutStreamer->emitIntValue(0, PointerSize);
OutStreamer->SwitchSection(Current.first, Current.second);
}
-void PPCAIXAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) {
// If there are no functions in this module, we will never need to reference
// the TOC base.
if (M.empty())
return;
- // Emit TOC base.
- MCSectionXCOFF *TOCBaseSection = OutStreamer->getContext().getXCOFFSection(
- StringRef("TOC"), XCOFF::XMC_TC0, XCOFF::XTY_SD, XCOFF::C_HIDEXT,
- SectionKind::getData());
- // The TOC-base always has 0 size, but 4 byte alignment.
- TOCBaseSection->setAlignment(Align(4));
// Switch to section to emit TOC base.
- OutStreamer->SwitchSection(TOCBaseSection);
+ OutStreamer->SwitchSection(getObjFileLowering().getTOCBaseSection());
- PPCTargetStreamer &TS =
- static_cast<PPCTargetStreamer &>(*OutStreamer->getTargetStreamer());
+ PPCTargetStreamer *TS =
+ static_cast<PPCTargetStreamer *>(OutStreamer->getTargetStreamer());
+
+ const unsigned EntryByteSize = Subtarget->isPPC64() ? 8 : 4;
+ const unsigned TOCEntriesByteSize = TOC.size() * EntryByteSize;
+ // TODO: If the total size of the TOC entries exceeds 32767 bytes, we run
+ // out of positive displacement to reach the entries at the end of the TOC.
+ // We need to decide how to handle TOCs larger than that later.
+ if (TOCEntriesByteSize > 32767) {
+ report_fatal_error("Handling of TOC entry displacement larger than 32767 "
+ "is not yet implemented.");
+ }
for (auto &I : TOC) {
// Setup the csect for the current TC entry.
- MCSectionXCOFF *TCEntry = OutStreamer->getContext().getXCOFFSection(
- cast<MCSymbolXCOFF>(I.first)->getUnqualifiedName(), XCOFF::XMC_TC,
- XCOFF::XTY_SD, XCOFF::C_HIDEXT, SectionKind::getData());
- cast<MCSymbolXCOFF>(I.second)->setContainingCsect(TCEntry);
+ MCSectionXCOFF *TCEntry = cast<MCSectionXCOFF>(
+ getObjFileLowering().getSectionForTOCEntry(I.first));
OutStreamer->SwitchSection(TCEntry);
- OutStreamer->EmitLabel(I.second);
- TS.emitTCEntry(*I.first);
+ OutStreamer->emitLabel(I.second);
+ if (TS != nullptr)
+ TS->emitTCEntry(*I.first);
}
}
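As a rough sense of scale for the displacement check above (a back-of-the-envelope sketch that only restates the constants already present in the code):

    // With 8-byte entries (64-bit AIX) the 32767-byte positive displacement
    // covers 32767 / 8 = 4095 TOC entries; with 4-byte entries (32-bit AIX)
    // it covers 32767 / 4 = 8191 entries.
    constexpr unsigned MaxPositiveDisp = 32767;
    constexpr unsigned MaxEntries64 = MaxPositiveDisp / 8; // 4095
    constexpr unsigned MaxEntries32 = MaxPositiveDisp / 4; // 8191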
-MCSymbol *
-PPCAIXAsmPrinter::getMCSymbolForTOCPseudoMO(const MachineOperand &MO) {
- const GlobalObject *GO = nullptr;
-
- // If the MO is a function or certain kind of globals, we want to make sure to
- // refer to the csect symbol, otherwise we can just do the default handling.
- if (MO.getType() != MachineOperand::MO_GlobalAddress ||
- !(GO = dyn_cast<const GlobalObject>(MO.getGlobal())))
- return PPCAsmPrinter::getMCSymbolForTOCPseudoMO(MO);
-
- // Do an early error check for globals we don't support. This will go away
- // eventually.
- const auto *GV = dyn_cast<const GlobalVariable>(GO);
- if (GV) {
- ValidateGV(GV);
- }
+bool PPCAIXAsmPrinter::doInitialization(Module &M) {
+ if (M.alias_size() > 0u)
+ report_fatal_error(
+ "module has aliases, which LLVM does not yet support for AIX");
- MCSymbolXCOFF *XSym = cast<MCSymbolXCOFF>(getSymbol(GO));
-
- // If the global object is a global variable without initializer or is a
- // declaration of a function, then XSym is an external referenced symbol.
- // Hence we may need to explictly create a MCSectionXCOFF for it so that we
- // can return its symbol later.
- if (GO->isDeclaration()) {
- if (!XSym->hasContainingCsect()) {
- // Make sure the storage class is set.
- const XCOFF::StorageClass SC =
- TargetLoweringObjectFileXCOFF::getStorageClassForGlobal(GO);
- XSym->setStorageClass(SC);
-
- MCSectionXCOFF *Csect = OutStreamer->getContext().getXCOFFSection(
- XSym->getName(), isa<Function>(GO) ? XCOFF::XMC_DS : XCOFF::XMC_UA,
- XCOFF::XTY_ER, SC, SectionKind::getMetadata());
- XSym->setContainingCsect(Csect);
- }
+ const bool Result = PPCAsmPrinter::doInitialization(M);
- return XSym->getContainingCsect()->getQualNameSymbol();
- }
+ auto setCsectAlignment = [this](const GlobalObject *GO) {
+ // Declarations have 0 alignment which is set by default.
+ if (GO->isDeclarationForLinker())
+ return;
- // Handle initialized global variables and defined functions.
- SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
-
- if (GOKind.isText()) {
- // If the MO is a function, we want to make sure to refer to the function
- // descriptor csect.
- return OutStreamer->getContext()
- .getXCOFFSection(XSym->getName(), XCOFF::XMC_DS, XCOFF::XTY_SD,
- XCOFF::C_HIDEXT, SectionKind::getData())
- ->getQualNameSymbol();
- } else if (GOKind.isCommon() || GOKind.isBSSLocal()) {
- // If the operand is a common then we should refer to the csect symbol.
- return cast<MCSectionXCOFF>(
- getObjFileLowering().SectionForGlobal(GO, GOKind, TM))
- ->getQualNameSymbol();
- }
+ SectionKind GOKind = getObjFileLowering().getKindForGlobal(GO, TM);
+ MCSectionXCOFF *Csect = cast<MCSectionXCOFF>(
+ getObjFileLowering().SectionForGlobal(GO, GOKind, TM));
+
+ Align GOAlign = getGVAlignment(GO, GO->getParent()->getDataLayout());
+ if (GOAlign > Csect->getAlignment())
+ Csect->setAlignment(GOAlign);
+ };
+
+ // We need to know, up front, the alignment of csects for the assembly path,
+ // because once a .csect directive gets emitted, we cannot change the
+ // alignment value on it.
+ for (const auto &G : M.globals())
+ setCsectAlignment(&G);
+
+ for (const auto &F : M)
+ setCsectAlignment(&F);
- // Other global variables are refered to by labels inside of a single csect,
- // so refer to the label directly.
- return getSymbol(GV);
+ return Result;
}
/// createPPCAsmPrinterPass - Returns a pass that prints the PPC assembly code
diff --git a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
index 104cf2ba3c00..2259a29f838a 100644
--- a/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBoolRetToInt.cpp
@@ -220,7 +220,7 @@ class PPCBoolRetToInt : public FunctionPass {
auto Defs = findAllDefs(U);
// If the values are all Constants or Arguments, don't bother
- if (llvm::none_of(Defs, isa<Instruction, Value *>))
+ if (llvm::none_of(Defs, [](Value *V) { return isa<Instruction>(V); }))
return false;
// Presently, we only know how to handle PHINode, Constant, Arguments and
diff --git a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
index 109b665e0d57..50ae4450a837 100644
--- a/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBranchCoalescing.cpp
@@ -272,6 +272,11 @@ bool PPCBranchCoalescing::canCoalesceBranch(CoalescingCandidateInfo &Cand) {
return false;
}
+ if (Cand.BranchBlock->mayHaveInlineAsmBr()) {
+ LLVM_DEBUG(dbgs() << "Inline Asm Br - skip\n");
+ return false;
+ }
+
// For now only consider triangles (i.e, BranchTargetBlock is set,
// FalseMBB is null, and BranchTargetBlock is a successor to BranchBlock)
if (!Cand.BranchTargetBlock || FalseMBB ||
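For context on the new mayHaveInlineAsmBr() bail-out above: a block ends in an INLINEASM_BR terminator when the source uses GCC-style asm goto, which adds successors the coalescer cannot reason about. A minimal sketch of such a construct (illustrative only; the function name, label, and asm text are made up):

    int branches_via_asm(int x) {
      // The label operand turns this inline asm into an INLINEASM_BR.
      asm goto("cmpwi %0, 0\n\tbeq %l[zero]" : : "r"(x) : "cc" : zero);
      return 1;
    zero:
      return 0;
    }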
diff --git a/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp b/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
index cdff4d383d23..47b9e97f0d67 100644
--- a/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
+++ b/llvm/lib/Target/PowerPC/PPCBranchSelector.cpp
@@ -31,6 +31,9 @@ using namespace llvm;
#define DEBUG_TYPE "ppc-branch-select"
STATISTIC(NumExpanded, "Number of branches expanded to long format");
+STATISTIC(NumPrefixed, "Number of prefixed instructions");
+STATISTIC(NumPrefixedAligned,
+ "Number of prefixed instructions that have been aligned");
namespace {
struct PPCBSel : public MachineFunctionPass {
@@ -82,7 +85,7 @@ FunctionPass *llvm::createPPCBranchSelectionPass() {
unsigned PPCBSel::GetAlignmentAdjustment(MachineBasicBlock &MBB,
unsigned Offset) {
const Align Alignment = MBB.getAlignment();
- if (Alignment == Align::None())
+ if (Alignment == Align(1))
return 0;
const Align ParentAlign = MBB.getParent()->getAlignment();
@@ -134,10 +137,38 @@ unsigned PPCBSel::ComputeBlockSizes(MachineFunction &Fn) {
}
unsigned BlockSize = 0;
+ unsigned UnalignedBytesRemaining = 0;
for (MachineInstr &MI : *MBB) {
- BlockSize += TII->getInstSizeInBytes(MI);
+ unsigned MINumBytes = TII->getInstSizeInBytes(MI);
if (MI.isInlineAsm() && (FirstImpreciseBlock < 0))
FirstImpreciseBlock = MBB->getNumber();
+ if (TII->isPrefixed(MI.getOpcode())) {
+ NumPrefixed++;
+
+ // Every 8-byte (prefixed) instruction may require alignment: it must not
+ // cross a 64-byte boundary, so it may need to be preceded by a 4-byte
+ // alignment nop. This means an 8-byte instruction may occupy 12 bytes
+ // (8 for the instruction itself and 4 for the alignment nop).
+ // We don't know the final layout at this point in the code, so we have to
+ // adopt a pessimistic approach: if an instruction may need alignment, we
+ // assume that it does and add 4 bytes for it. As a result we may end up
+ // with more long branches than before, but we are in the safe position
+ // where if we need a long branch we have one.
+ // The if statement below charges the pessimistic 4 bytes at most once per
+ // 64 bytes of code, since two instructions that both need an alignment
+ // nop cannot be within 64 bytes of each other.
+ if (!UnalignedBytesRemaining) {
+ BlockSize += 4;
+ UnalignedBytesRemaining = 60;
+ NumPrefixedAligned++;
+ }
+ }
+ UnalignedBytesRemaining -= std::min(UnalignedBytesRemaining, MINumBytes);
+ BlockSize += MINumBytes;
}
BlockSizes[MBB->getNumber()].first = BlockSize;
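A compact, standalone sketch of the size accounting in the loop above; the Inst struct and estimateBlockSize are invented for illustration and stand in for the MachineInstr and TII queries used by the real pass.

    #include <algorithm>
    #include <vector>

    struct Inst {
      bool Prefixed; // true for 8-byte prefixed instructions
    };

    // Worst case for one prefixed instruction: 8 bytes for the instruction
    // plus a 4-byte alignment nop (12 bytes total). The extra 4 bytes are
    // charged at most once per 64 bytes of code, mirroring the pass.
    unsigned estimateBlockSize(const std::vector<Inst> &Insts) {
      unsigned Size = 0, UnalignedBytesRemaining = 0;
      for (const Inst &I : Insts) {
        unsigned Bytes = I.Prefixed ? 8u : 4u;
        if (I.Prefixed && !UnalignedBytesRemaining) {
          Size += 4;                    // assume an alignment nop is needed
          UnalignedBytesRemaining = 60; // no second nop in the next 60 bytes
        }
        UnalignedBytesRemaining -= std::min(UnalignedBytesRemaining, Bytes);
        Size += Bytes;
      }
      return Size;
    }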
diff --git a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
index 4ce705300e1b..bb12e05173a6 100644
--- a/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
+++ b/llvm/lib/Target/PowerPC/PPCCTRLoops.cpp
@@ -46,7 +46,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
-#include "llvm/PassSupport.h"
+#include "llvm/Pass.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Target/PowerPC/PPCCallingConv.td b/llvm/lib/Target/PowerPC/PPCCallingConv.td
index 369b9ce1a711..1eaa7f7a44b3 100644
--- a/llvm/lib/Target/PowerPC/PPCCallingConv.td
+++ b/llvm/lib/Target/PowerPC/PPCCallingConv.td
@@ -283,15 +283,6 @@ def CC_PPC32_SVR4_ByVal : CallingConv<[
def CSR_Altivec : CalleeSavedRegs<(add V20, V21, V22, V23, V24, V25, V26, V27,
V28, V29, V30, V31)>;
-def CSR_Darwin32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
- R21, R22, R23, R24, R25, R26, R27, R28,
- R29, R30, R31, F14, F15, F16, F17, F18,
- F19, F20, F21, F22, F23, F24, F25, F26,
- F27, F28, F29, F30, F31, CR2, CR3, CR4
- )>;
-
-def CSR_Darwin32_Altivec : CalleeSavedRegs<(add CSR_Darwin32, CSR_Altivec)>;
-
// SPE does not use FPRs, so break out the common register set as base.
def CSR_SVR432_COMM : CalleeSavedRegs<(add R14, R15, R16, R17, R18, R19, R20,
R21, R22, R23, R24, R25, R26, R27,
@@ -316,45 +307,20 @@ def CSR_AIX32 : CalleeSavedRegs<(add R13, R14, R15, R16, R17, R18, R19, R20,
F27, F28, F29, F30, F31, CR2, CR3, CR4
)>;
-def CSR_Darwin64 : CalleeSavedRegs<(add X13, X14, X15, X16, X17, X18, X19, X20,
- X21, X22, X23, X24, X25, X26, X27, X28,
- X29, X30, X31, F14, F15, F16, F17, F18,
- F19, F20, F21, F22, F23, F24, F25, F26,
- F27, F28, F29, F30, F31, CR2, CR3, CR4
- )>;
-
-def CSR_Darwin64_Altivec : CalleeSavedRegs<(add CSR_Darwin64, CSR_Altivec)>;
-
-def CSR_SVR464 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
+// Common CalleeSavedRegs for SVR4 and AIX.
+def CSR_PPC64 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
X21, X22, X23, X24, X25, X26, X27, X28,
X29, X30, X31, F14, F15, F16, F17, F18,
F19, F20, F21, F22, F23, F24, F25, F26,
F27, F28, F29, F30, F31, CR2, CR3, CR4
)>;
-def CSR_AIX64 : CalleeSavedRegs<(add X14, X15, X16, X17, X18, X19, X20,
- X21, X22, X23, X24, X25, X26, X27, X28,
- X29, X30, X31, F14, F15, F16, F17, F18,
- F19, F20, F21, F22, F23, F24, F25, F26,
- F27, F28, F29, F30, F31, CR2, CR3, CR4
- )>;
-
-// CSRs that are handled by prologue, epilogue.
-def CSR_SRV464_TLS_PE : CalleeSavedRegs<(add)>;
-
-def CSR_SVR464_ViaCopy : CalleeSavedRegs<(add CSR_SVR464)>;
-
-def CSR_SVR464_Altivec : CalleeSavedRegs<(add CSR_SVR464, CSR_Altivec)>;
-
-def CSR_SVR464_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_Altivec)>;
-
-def CSR_SVR464_R2 : CalleeSavedRegs<(add CSR_SVR464, X2)>;
-def CSR_SVR464_R2_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2)>;
+def CSR_PPC64_Altivec : CalleeSavedRegs<(add CSR_PPC64, CSR_Altivec)>;
-def CSR_SVR464_R2_Altivec : CalleeSavedRegs<(add CSR_SVR464_Altivec, X2)>;
+def CSR_PPC64_R2 : CalleeSavedRegs<(add CSR_PPC64, X2)>;
-def CSR_SVR464_R2_Altivec_ViaCopy : CalleeSavedRegs<(add CSR_SVR464_R2_Altivec)>;
+def CSR_PPC64_R2_Altivec : CalleeSavedRegs<(add CSR_PPC64_Altivec, X2)>;
def CSR_NoRegs : CalleeSavedRegs<(add)>;
diff --git a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
index aa5d830b549e..c9f74bbf861c 100644
--- a/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
+++ b/llvm/lib/Target/PowerPC/PPCEarlyReturn.cpp
@@ -90,8 +90,8 @@ protected:
// This is a conditional branch to the return. Replace the branch
// with a bclr.
BuildMI(**PI, J, J->getDebugLoc(), TII->get(PPC::BCCLR))
- .addImm(J->getOperand(0).getImm())
- .addReg(J->getOperand(1).getReg())
+ .add(J->getOperand(0))
+ .add(J->getOperand(1))
.copyImplicitOps(*I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
@@ -106,7 +106,7 @@ protected:
BuildMI(
**PI, J, J->getDebugLoc(),
TII->get(J->getOpcode() == PPC::BC ? PPC::BCLR : PPC::BCLRn))
- .addReg(J->getOperand(0).getReg())
+ .add(J->getOperand(0))
.copyImplicitOps(*I);
MachineBasicBlock::iterator K = J--;
K->eraseFromParent();
diff --git a/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp b/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
index e8ef451c7ec9..4c74e82cf041 100644
--- a/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
+++ b/llvm/lib/Target/PowerPC/PPCExpandISEL.cpp
@@ -381,21 +381,10 @@ void PPCExpandISEL::reorganizeBlockLayout(BlockISELList &BIL,
MBB->end());
NewSuccessor->transferSuccessorsAndUpdatePHIs(MBB);
- // Copy the original liveIns of MBB to NewSuccessor.
- for (auto &LI : MBB->liveins())
- NewSuccessor->addLiveIn(LI);
-
- // After splitting the NewSuccessor block, Regs defined but not killed
- // in MBB should be treated as liveins of NewSuccessor.
- // Note: Cannot use stepBackward instead since we are using the Reg
- // liveness state at the end of MBB (liveOut of MBB) as the liveIn for
- // NewSuccessor. Otherwise, will cause cyclic dependence.
- LivePhysRegs LPR(*MF->getSubtarget<PPCSubtarget>().getRegisterInfo());
- SmallVector<std::pair<MCPhysReg, const MachineOperand *>, 2> Clobbers;
- for (MachineInstr &MI : *MBB)
- LPR.stepForward(MI, Clobbers);
- for (auto &LI : LPR)
- NewSuccessor->addLiveIn(LI);
+ // Update the liveins for NewSuccessor.
+ LivePhysRegs LPR;
+ computeAndAddLiveIns(LPR, *NewSuccessor);
+
} else {
// Remove successor from MBB.
MBB->removeSuccessor(Successor);
@@ -441,44 +430,26 @@ void PPCExpandISEL::populateBlocks(BlockISELList &BIL) {
// condition is true
MachineOperand &FalseValue = MI->getOperand(2); // Value to store if
// condition is false
- MachineOperand &ConditionRegister = MI->getOperand(3); // Condition
LLVM_DEBUG(dbgs() << "Dest: " << Dest << "\n");
LLVM_DEBUG(dbgs() << "TrueValue: " << TrueValue << "\n");
LLVM_DEBUG(dbgs() << "FalseValue: " << FalseValue << "\n");
- LLVM_DEBUG(dbgs() << "ConditionRegister: " << ConditionRegister << "\n");
+ LLVM_DEBUG(dbgs() << "ConditionRegister: " << MI->getOperand(3) << "\n");
// If the Dest Register and True Value Register are not the same one, we
// need the True Block.
bool IsADDIInstRequired = !useSameRegister(Dest, TrueValue);
bool IsORIInstRequired = !useSameRegister(Dest, FalseValue);
- if (IsADDIInstRequired) {
- // Copy the result into the destination if the condition is true.
+ // Copy the result into the destination if the condition is true.
+ if (IsADDIInstRequired)
BuildMI(*TrueBlock, TrueBlockI, dl,
TII->get(isISEL8(*MI) ? PPC::ADDI8 : PPC::ADDI))
.add(Dest)
.add(TrueValue)
.add(MachineOperand::CreateImm(0));
- // Add the LiveIn registers required by true block.
- TrueBlock->addLiveIn(TrueValue.getReg());
- }
-
- if (IsORIInstRequired) {
- // Add the LiveIn registers required by false block.
- FalseBlock->addLiveIn(FalseValue.getReg());
- }
-
- if (NewSuccessor) {
- // Add the LiveIn registers required by NewSuccessor block.
- NewSuccessor->addLiveIn(Dest.getReg());
- NewSuccessor->addLiveIn(TrueValue.getReg());
- NewSuccessor->addLiveIn(FalseValue.getReg());
- NewSuccessor->addLiveIn(ConditionRegister.getReg());
- }
-
- // Copy the value into the destination if the condition is false.
+ // Copy the result into the destination if the condition is false.
if (IsORIInstRequired)
BuildMI(*FalseBlock, FalseBlockI, dl,
TII->get(isISEL8(*MI) ? PPC::ORI8 : PPC::ORI))
@@ -490,6 +461,18 @@ void PPCExpandISEL::populateBlocks(BlockISELList &BIL) {
NumExpanded++;
}
+
+ if (IsTrueBlockRequired) {
+ // Update the liveins for TrueBlock.
+ LivePhysRegs LPR;
+ computeAndAddLiveIns(LPR, *TrueBlock);
+ }
+
+ if (IsFalseBlockRequired) {
+ // Update the liveins for FalseBlock.
+ LivePhysRegs LPR;
+ computeAndAddLiveIns(LPR, *FalseBlock);
+ }
}
void PPCExpandISEL::expandMergeableISELs(BlockISELList &BIL) {
diff --git a/llvm/lib/Target/PowerPC/PPCFastISel.cpp b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
index d8425d89da92..39790ac9a8aa 100644
--- a/llvm/lib/Target/PowerPC/PPCFastISel.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFastISel.cpp
@@ -87,6 +87,7 @@ class PPCFastISel final : public FastISel {
const TargetMachine &TM;
const PPCSubtarget *PPCSubTarget;
+ const PPCSubtarget *Subtarget;
PPCFunctionInfo *PPCFuncInfo;
const TargetInstrInfo &TII;
const TargetLowering &TLI;
@@ -97,12 +98,12 @@ class PPCFastISel final : public FastISel {
const TargetLibraryInfo *LibInfo)
: FastISel(FuncInfo, LibInfo), TM(FuncInfo.MF->getTarget()),
PPCSubTarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
+ Subtarget(&FuncInfo.MF->getSubtarget<PPCSubtarget>()),
PPCFuncInfo(FuncInfo.MF->getInfo<PPCFunctionInfo>()),
- TII(*PPCSubTarget->getInstrInfo()),
- TLI(*PPCSubTarget->getTargetLowering()),
+ TII(*Subtarget->getInstrInfo()), TLI(*Subtarget->getTargetLowering()),
Context(&FuncInfo.Fn->getContext()) {}
- // Backend specific FastISel code.
+ // Backend specific FastISel code.
private:
bool fastSelectInstruction(const Instruction *I) override;
unsigned fastMaterializeConstant(const Constant *C) override;
@@ -456,7 +457,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
bool IsZExt, unsigned FP64LoadOpc) {
unsigned Opc;
bool UseOffset = true;
- bool HasSPE = PPCSubTarget->hasSPE();
+ bool HasSPE = Subtarget->hasSPE();
// If ResultReg is given, it determines the register class of the load.
// Otherwise, RC is the register class to use. If the result of the
@@ -498,7 +499,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
- Opc = PPCSubTarget->hasSPE() ? PPC::SPELWZ : PPC::LFS;
+ Opc = Subtarget->hasSPE() ? PPC::SPELWZ : PPC::LFS;
break;
case MVT::f64:
Opc = FP64LoadOpc;
@@ -536,7 +537,7 @@ bool PPCFastISel::PPCEmitLoad(MVT VT, Register &ResultReg, Address &Addr,
MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
Addr.Offset),
MachineMemOperand::MOLoad, MFI.getObjectSize(Addr.Base.FI),
- MFI.getObjectAlignment(Addr.Base.FI));
+ MFI.getObjectAlign(Addr.Base.FI));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addImm(Addr.Offset).addFrameIndex(Addr.Base.FI).addMemOperand(MMO);
@@ -614,7 +615,7 @@ bool PPCFastISel::SelectLoad(const Instruction *I) {
Register ResultReg = 0;
if (!PPCEmitLoad(VT, ResultReg, Addr, RC, true,
- PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
+ Subtarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
updateValueMap(I, ResultReg);
return true;
@@ -647,10 +648,10 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
UseOffset = ((Addr.Offset & 3) == 0);
break;
case MVT::f32:
- Opc = PPCSubTarget->hasSPE() ? PPC::SPESTW : PPC::STFS;
+ Opc = Subtarget->hasSPE() ? PPC::SPESTW : PPC::STFS;
break;
case MVT::f64:
- Opc = PPCSubTarget->hasSPE() ? PPC::EVSTDD : PPC::STFD;
+ Opc = Subtarget->hasSPE() ? PPC::EVSTDD : PPC::STFD;
break;
}
@@ -682,7 +683,7 @@ bool PPCFastISel::PPCEmitStore(MVT VT, unsigned SrcReg, Address &Addr) {
MachinePointerInfo::getFixedStack(*FuncInfo.MF, Addr.Base.FI,
Addr.Offset),
MachineMemOperand::MOStore, MFI.getObjectSize(Addr.Base.FI),
- MFI.getObjectAlignment(Addr.Base.FI));
+ MFI.getObjectAlign(Addr.Base.FI));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc))
.addReg(SrcReg)
@@ -794,8 +795,9 @@ bool PPCFastISel::SelectBranch(const Instruction *I) {
return false;
BuildMI(*BrBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::BCC))
- .addImm(PPCSubTarget->hasSPE() ? PPC::PRED_SPE : PPCPred)
- .addReg(CondReg).addMBB(TBB);
+ .addImm(Subtarget->hasSPE() ? PPC::PRED_SPE : PPCPred)
+ .addReg(CondReg)
+ .addMBB(TBB);
finishCondBranch(BI->getParent(), TBB, FBB);
return true;
}
@@ -827,7 +829,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
return false;
MVT SrcVT = SrcEVT.getSimpleVT();
- if (SrcVT == MVT::i1 && PPCSubTarget->useCRBits())
+ if (SrcVT == MVT::i1 && Subtarget->useCRBits())
return false;
// See if operand 2 is an immediate encodeable in the compare.
@@ -836,7 +838,7 @@ bool PPCFastISel::PPCEmitCmp(const Value *SrcValue1, const Value *SrcValue2,
// similar to ARM in this regard.
long Imm = 0;
bool UseImm = false;
- const bool HasSPE = PPCSubTarget->hasSPE();
+ const bool HasSPE = Subtarget->hasSPE();
// Only 16-bit integer constants can be represented in compares for
// PowerPC. Others will be materialized into a register.
@@ -988,7 +990,7 @@ bool PPCFastISel::SelectFPTrunc(const Instruction *I) {
// Round the result to single precision.
unsigned DestReg;
auto RC = MRI.getRegClass(SrcReg);
- if (PPCSubTarget->hasSPE()) {
+ if (Subtarget->hasSPE()) {
DestReg = createResultReg(&PPC::GPRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(PPC::EFSCFD), DestReg)
@@ -1030,7 +1032,7 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
// Get a stack slot 8 bytes wide, aligned on an 8-byte boundary.
Address Addr;
Addr.BaseType = Address::FrameIndexBase;
- Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+ Addr.Base.FI = MFI.CreateStackObject(8, Align(8), false);
// Store the value from the GPR.
if (!PPCEmitStore(MVT::i64, SrcReg, Addr))
@@ -1043,10 +1045,10 @@ unsigned PPCFastISel::PPCMoveToFPReg(MVT SrcVT, unsigned SrcReg,
if (SrcVT == MVT::i32) {
if (!IsSigned) {
LoadOpc = PPC::LFIWZX;
- Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
- } else if (PPCSubTarget->hasLFIWAX()) {
+ Addr.Offset = (Subtarget->isLittleEndian()) ? 0 : 4;
+ } else if (Subtarget->hasLFIWAX()) {
LoadOpc = PPC::LFIWAX;
- Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+ Addr.Offset = (Subtarget->isLittleEndian()) ? 0 : 4;
}
}
@@ -1086,7 +1088,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
return false;
// Shortcut for SPE. Doesn't need to store/load, since it's all in the GPRs
- if (PPCSubTarget->hasSPE()) {
+ if (Subtarget->hasSPE()) {
unsigned Opc;
if (DstVT == MVT::f32)
Opc = IsSigned ? PPC::EFSCFSI : PPC::EFSCFUI;
@@ -1103,7 +1105,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
// We can only lower an unsigned convert if we have the newer
// floating-point conversion operations.
- if (!IsSigned && !PPCSubTarget->hasFPCVT())
+ if (!IsSigned && !Subtarget->hasFPCVT())
return false;
// FIXME: For now we require the newer floating-point conversion operations
@@ -1111,7 +1113,7 @@ bool PPCFastISel::SelectIToFP(const Instruction *I, bool IsSigned) {
// to single-precision float. Otherwise we have to generate a lot of
// fiddly code to avoid double rounding. If necessary, the fiddly code
// can be found in PPCTargetLowering::LowerINT_TO_FP().
- if (DstVT == MVT::f32 && !PPCSubTarget->hasFPCVT())
+ if (DstVT == MVT::f32 && !Subtarget->hasFPCVT())
return false;
// Extend the input if necessary.
@@ -1159,7 +1161,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
// easiest code gen possible.
Address Addr;
Addr.BaseType = Address::FrameIndexBase;
- Addr.Base.FI = MFI.CreateStackObject(8, 8, false);
+ Addr.Base.FI = MFI.CreateStackObject(8, Align(8), false);
// Store the value from the FPR.
if (!PPCEmitStore(MVT::f64, SrcReg, Addr))
@@ -1168,7 +1170,7 @@ unsigned PPCFastISel::PPCMoveToIntReg(const Instruction *I, MVT VT,
// Reload it into a GPR. If we want an i32 on big endian, modify the
// address to have a 4-byte offset so we load from the right place.
if (VT == MVT::i32)
- Addr.Offset = (PPCSubTarget->isLittleEndian()) ? 0 : 4;
+ Addr.Offset = (Subtarget->isLittleEndian()) ? 0 : 4;
// Look at the currently assigned register for this instruction
// to determine the required register class.
@@ -1196,8 +1198,8 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
return false;
// If we don't have FCTIDUZ, or SPE, and we need it, punt to SelectionDAG.
- if (DstVT == MVT::i64 && !IsSigned &&
- !PPCSubTarget->hasFPCVT() && !PPCSubTarget->hasSPE())
+ if (DstVT == MVT::i64 && !IsSigned && !Subtarget->hasFPCVT() &&
+ !Subtarget->hasSPE())
return false;
Value *Src = I->getOperand(0);
@@ -1226,7 +1228,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
unsigned Opc;
auto RC = MRI.getRegClass(SrcReg);
- if (PPCSubTarget->hasSPE()) {
+ if (Subtarget->hasSPE()) {
DestReg = createResultReg(&PPC::GPRCRegClass);
if (IsSigned)
Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTSIZ : PPC::EFDCTSIZ;
@@ -1234,7 +1236,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
Opc = InRC == &PPC::GPRCRegClass ? PPC::EFSCTUIZ : PPC::EFDCTUIZ;
} else if (isVSFRCRegClass(RC)) {
DestReg = createResultReg(&PPC::VSFRCRegClass);
- if (DstVT == MVT::i32)
+ if (DstVT == MVT::i32)
Opc = IsSigned ? PPC::XSCVDPSXWS : PPC::XSCVDPUXWS;
else
Opc = IsSigned ? PPC::XSCVDPSXDS : PPC::XSCVDPUXDS;
@@ -1244,7 +1246,7 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
if (IsSigned)
Opc = PPC::FCTIWZ;
else
- Opc = PPCSubTarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
+ Opc = Subtarget->hasFPCVT() ? PPC::FCTIWUZ : PPC::FCTIDZ;
else
Opc = IsSigned ? PPC::FCTIDZ : PPC::FCTIDUZ;
}
@@ -1254,8 +1256,9 @@ bool PPCFastISel::SelectFPToI(const Instruction *I, bool IsSigned) {
.addReg(SrcReg);
// Now move the integer value from a float register to an integer register.
- unsigned IntReg = PPCSubTarget->hasSPE() ? DestReg :
- PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
+ unsigned IntReg = Subtarget->hasSPE()
+ ? DestReg
+ : PPCMoveToIntReg(I, DstVT, DestReg, IsSigned);
if (IntReg == 0)
return false;
@@ -1383,8 +1386,8 @@ bool PPCFastISel::processCallArgs(SmallVectorImpl<Value*> &Args,
CCState CCInfo(CC, IsVarArg, *FuncInfo.MF, ArgLocs, *Context);
// Reserve space for the linkage area on the stack.
- unsigned LinkageSize = PPCSubTarget->getFrameLowering()->getLinkageSize();
- CCInfo.AllocateStack(LinkageSize, 8);
+ unsigned LinkageSize = Subtarget->getFrameLowering()->getLinkageSize();
+ CCInfo.AllocateStack(LinkageSize, Align(8));
CCInfo.AnalyzeCallOperands(ArgVTs, ArgFlags, CC_PPC64_ELF_FIS);
@@ -1573,7 +1576,7 @@ bool PPCFastISel::fastLowerCall(CallLoweringInfo &CLI) {
else if (!isTypeLegal(RetTy, RetVT) && RetVT != MVT::i16 &&
RetVT != MVT::i8)
return false;
- else if (RetVT == MVT::i1 && PPCSubTarget->useCRBits())
+ else if (RetVT == MVT::i1 && Subtarget->useCRBits())
// We can't handle boolean returns when CR bits are in use.
return false;
@@ -1688,9 +1691,6 @@ bool PPCFastISel::SelectRet(const Instruction *I) {
if (!FuncInfo.CanLowerReturn)
return false;
- if (TLI.supportSplitCSR(FuncInfo.MF))
- return false;
-
const ReturnInst *Ret = cast<ReturnInst>(I);
const Function &F = *I->getParent()->getParent();
@@ -1996,10 +1996,9 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
return 0;
// All FP constants are loaded from the constant pool.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- assert(Align > 0 && "Unexpectedly missing alignment information!");
- unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Align);
- const bool HasSPE = PPCSubTarget->hasSPE();
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
+ unsigned Idx = MCP.getConstantPoolIndex(cast<Constant>(CFP), Alignment);
+ const bool HasSPE = Subtarget->hasSPE();
const TargetRegisterClass *RC;
if (HasSPE)
RC = ((VT == MVT::f32) ? &PPC::GPRCRegClass : &PPC::SPERCRegClass);
@@ -2011,7 +2010,7 @@ unsigned PPCFastISel::PPCMaterializeFP(const ConstantFP *CFP, MVT VT) {
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
- MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Align);
+ MachineMemOperand::MOLoad, (VT == MVT::f32) ? 4 : 8, Alignment);
unsigned Opc;
@@ -2093,7 +2092,7 @@ unsigned PPCFastISel::PPCMaterializeGV(const GlobalValue *GV, MVT VT) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::ADDIStocHA8),
HighPartReg).addReg(PPC::X2).addGlobalAddress(GV);
- if (PPCSubTarget->isGVIndirectSymbol(GV)) {
+ if (Subtarget->isGVIndirectSymbol(GV)) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(PPC::LDtocL),
DestReg).addGlobalAddress(GV).addReg(HighPartReg);
} else {
@@ -2200,7 +2199,7 @@ unsigned PPCFastISel::PPCMaterializeInt(const ConstantInt *CI, MVT VT,
bool UseSExt) {
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
- if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+ if (VT == MVT::i1 && Subtarget->useCRBits()) {
unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(CI->isZero() ? PPC::CRUNSET : PPC::CRSET), ImmReg);
@@ -2355,7 +2354,7 @@ bool PPCFastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
Register ResultReg = MI->getOperand(0).getReg();
if (!PPCEmitLoad(VT, ResultReg, Addr, nullptr, IsZExt,
- PPCSubTarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
+ Subtarget->hasSPE() ? PPC::EVLDD : PPC::LFD))
return false;
MachineBasicBlock::iterator I(MI);
@@ -2382,7 +2381,7 @@ unsigned PPCFastISel::fastEmit_i(MVT Ty, MVT VT, unsigned Opc, uint64_t Imm) {
// If we're using CR bit registers for i1 values, handle that as a special
// case first.
- if (VT == MVT::i1 && PPCSubTarget->useCRBits()) {
+ if (VT == MVT::i1 && Subtarget->useCRBits()) {
unsigned ImmReg = createResultReg(&PPC::CRBITRCRegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Imm == 0 ? PPC::CRUNSET : PPC::CRSET), ImmReg);
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
index 4c608520e265..bd9174c1973d 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp
@@ -10,6 +10,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/PPCPredicates.h"
#include "PPCFrameLowering.h"
#include "PPCInstrBuilder.h"
#include "PPCInstrInfo.h"
@@ -31,6 +32,7 @@ using namespace llvm;
#define DEBUG_TYPE "framelowering"
STATISTIC(NumPESpillVSR, "Number of spills to vector in prologue");
STATISTIC(NumPEReloadVSR, "Number of reloads from vector in epilogue");
+STATISTIC(NumPrologProbed, "Number of prologues probed");
static cl::opt<bool>
EnablePEVectorSpills("ppc-enable-pe-vector-spills",
@@ -47,7 +49,7 @@ static const MCPhysReg VRRegNo[] = {
};
static unsigned computeReturnSaveOffset(const PPCSubtarget &STI) {
- if (STI.isDarwinABI() || STI.isAIXABI())
+ if (STI.isAIXABI())
return STI.isPPC64() ? 16 : 8;
// SVR4 ABI:
return STI.isPPC64() ? 16 : 4;
@@ -60,20 +62,12 @@ static unsigned computeTOCSaveOffset(const PPCSubtarget &STI) {
}
static unsigned computeFramePointerSaveOffset(const PPCSubtarget &STI) {
- // For the Darwin ABI:
- // We cannot use the TOC save slot (offset +20) in the PowerPC linkage area
- // for saving the frame pointer (if needed.) While the published ABI has
- // not used this slot since at least MacOSX 10.2, there is older code
- // around that does use it, and that needs to continue to work.
- if (STI.isDarwinABI())
- return STI.isPPC64() ? -8U : -4U;
-
- // SVR4 ABI: First slot in the general register save area.
+ // First slot in the general register save area.
return STI.isPPC64() ? -8U : -4U;
}
static unsigned computeLinkageSize(const PPCSubtarget &STI) {
- if ((STI.isDarwinABI() || STI.isAIXABI()) || STI.isPPC64())
+ if (STI.isAIXABI() || STI.isPPC64())
return (STI.isELFv2ABI() ? 4 : 6) * (STI.isPPC64() ? 8 : 4);
// 32-bit SVR4 ABI:
@@ -81,18 +75,16 @@ static unsigned computeLinkageSize(const PPCSubtarget &STI) {
}
static unsigned computeBasePointerSaveOffset(const PPCSubtarget &STI) {
- if (STI.isDarwinABI())
- return STI.isPPC64() ? -16U : -8U;
+ // Third slot in the general purpose register save area.
+ if (STI.is32BitELFABI() && STI.getTargetMachine().isPositionIndependent())
+ return -12U;
- // SVR4 ABI: First slot in the general register save area.
- return STI.isPPC64()
- ? -16U
- : STI.getTargetMachine().isPositionIndependent() ? -12U : -8U;
+ // Second slot in the general purpose register save area.
+ return STI.isPPC64() ? -16U : -8U;
}
-static unsigned computeCRSaveOffset() {
- // The condition register save offset needs to be updated for AIX PPC32.
- return 8;
+static unsigned computeCRSaveOffset(const PPCSubtarget &STI) {
+ return (STI.isAIXABI() && !STI.isPPC64()) ? 4 : 8;
}
PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
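Taken together, the helpers above settle where the frame pointer, base pointer, and CR save word live. A standalone restatement of those placements (the struct and function names are illustrative, not part of this patch):

// Save-slot offsets implied by the compute* helpers above.
struct SaveOffsets {
  int FramePointer; // first slot of the general register save area
  int BasePointer;  // second slot (third when 32-bit ELF is position independent)
  unsigned CRSave;  // offset of the CR save word in the linkage area
};

static SaveOffsets computeSaveOffsets(bool IsPPC64, bool Is32BitELFPIC,
                                      bool IsAIX32) {
  SaveOffsets S;
  S.FramePointer = IsPPC64 ? -8 : -4;
  S.BasePointer = Is32BitELFPIC ? -12 : (IsPPC64 ? -16 : -8);
  S.CRSave = IsAIX32 ? 4 : 8;
  return S;
}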
@@ -103,71 +95,97 @@ PPCFrameLowering::PPCFrameLowering(const PPCSubtarget &STI)
FramePointerSaveOffset(computeFramePointerSaveOffset(Subtarget)),
LinkageSize(computeLinkageSize(Subtarget)),
BasePointerSaveOffset(computeBasePointerSaveOffset(Subtarget)),
- CRSaveOffset(computeCRSaveOffset()) {}
+ CRSaveOffset(computeCRSaveOffset(Subtarget)) {}
// With the SVR4 ABI, callee-saved registers have fixed offsets on the stack.
const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
unsigned &NumEntries) const {
- if (Subtarget.isDarwinABI()) {
- NumEntries = 1;
- if (Subtarget.isPPC64()) {
- static const SpillSlot darwin64Offsets = {PPC::X31, -8};
- return &darwin64Offsets;
- } else {
- static const SpillSlot darwinOffsets = {PPC::R31, -4};
- return &darwinOffsets;
- }
- }
- // Early exit if not using the SVR4 ABI.
- if (!Subtarget.isSVR4ABI()) {
- NumEntries = 0;
- return nullptr;
- }
+// Floating-point register save area offsets.
+#define CALLEE_SAVED_FPRS \
+ {PPC::F31, -8}, \
+ {PPC::F30, -16}, \
+ {PPC::F29, -24}, \
+ {PPC::F28, -32}, \
+ {PPC::F27, -40}, \
+ {PPC::F26, -48}, \
+ {PPC::F25, -56}, \
+ {PPC::F24, -64}, \
+ {PPC::F23, -72}, \
+ {PPC::F22, -80}, \
+ {PPC::F21, -88}, \
+ {PPC::F20, -96}, \
+ {PPC::F19, -104}, \
+ {PPC::F18, -112}, \
+ {PPC::F17, -120}, \
+ {PPC::F16, -128}, \
+ {PPC::F15, -136}, \
+ {PPC::F14, -144}
+
+// 32-bit general purpose register save area offsets shared by ELF and
+// AIX. AIX has an extra CSR with r13.
+#define CALLEE_SAVED_GPRS32 \
+ {PPC::R31, -4}, \
+ {PPC::R30, -8}, \
+ {PPC::R29, -12}, \
+ {PPC::R28, -16}, \
+ {PPC::R27, -20}, \
+ {PPC::R26, -24}, \
+ {PPC::R25, -28}, \
+ {PPC::R24, -32}, \
+ {PPC::R23, -36}, \
+ {PPC::R22, -40}, \
+ {PPC::R21, -44}, \
+ {PPC::R20, -48}, \
+ {PPC::R19, -52}, \
+ {PPC::R18, -56}, \
+ {PPC::R17, -60}, \
+ {PPC::R16, -64}, \
+ {PPC::R15, -68}, \
+ {PPC::R14, -72}
+
+// 64-bit general purpose register save area offsets.
+#define CALLEE_SAVED_GPRS64 \
+ {PPC::X31, -8}, \
+ {PPC::X30, -16}, \
+ {PPC::X29, -24}, \
+ {PPC::X28, -32}, \
+ {PPC::X27, -40}, \
+ {PPC::X26, -48}, \
+ {PPC::X25, -56}, \
+ {PPC::X24, -64}, \
+ {PPC::X23, -72}, \
+ {PPC::X22, -80}, \
+ {PPC::X21, -88}, \
+ {PPC::X20, -96}, \
+ {PPC::X19, -104}, \
+ {PPC::X18, -112}, \
+ {PPC::X17, -120}, \
+ {PPC::X16, -128}, \
+ {PPC::X15, -136}, \
+ {PPC::X14, -144}
+
+// Vector register save area offsets.
+#define CALLEE_SAVED_VRS \
+ {PPC::V31, -16}, \
+ {PPC::V30, -32}, \
+ {PPC::V29, -48}, \
+ {PPC::V28, -64}, \
+ {PPC::V27, -80}, \
+ {PPC::V26, -96}, \
+ {PPC::V25, -112}, \
+ {PPC::V24, -128}, \
+ {PPC::V23, -144}, \
+ {PPC::V22, -160}, \
+ {PPC::V21, -176}, \
+ {PPC::V20, -192}
// Note that the offsets here overlap, but this is fixed up in
// processFunctionBeforeFrameFinalized.
- static const SpillSlot Offsets[] = {
- // Floating-point register save area offsets.
- {PPC::F31, -8},
- {PPC::F30, -16},
- {PPC::F29, -24},
- {PPC::F28, -32},
- {PPC::F27, -40},
- {PPC::F26, -48},
- {PPC::F25, -56},
- {PPC::F24, -64},
- {PPC::F23, -72},
- {PPC::F22, -80},
- {PPC::F21, -88},
- {PPC::F20, -96},
- {PPC::F19, -104},
- {PPC::F18, -112},
- {PPC::F17, -120},
- {PPC::F16, -128},
- {PPC::F15, -136},
- {PPC::F14, -144},
-
- // General register save area offsets.
- {PPC::R31, -4},
- {PPC::R30, -8},
- {PPC::R29, -12},
- {PPC::R28, -16},
- {PPC::R27, -20},
- {PPC::R26, -24},
- {PPC::R25, -28},
- {PPC::R24, -32},
- {PPC::R23, -36},
- {PPC::R22, -40},
- {PPC::R21, -44},
- {PPC::R20, -48},
- {PPC::R19, -52},
- {PPC::R18, -56},
- {PPC::R17, -60},
- {PPC::R16, -64},
- {PPC::R15, -68},
- {PPC::R14, -72},
+ static const SpillSlot ELFOffsets32[] = {
+ CALLEE_SAVED_FPRS,
+ CALLEE_SAVED_GPRS32,
// CR save area offset. We map each of the nonvolatile CR fields
// to the slot for CR2, which is the first of the nonvolatile CR
@@ -178,19 +196,7 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
// VRSAVE save area offset.
{PPC::VRSAVE, -4},
- // Vector register save area
- {PPC::V31, -16},
- {PPC::V30, -32},
- {PPC::V29, -48},
- {PPC::V28, -64},
- {PPC::V27, -80},
- {PPC::V26, -96},
- {PPC::V25, -112},
- {PPC::V24, -128},
- {PPC::V23, -144},
- {PPC::V22, -160},
- {PPC::V21, -176},
- {PPC::V20, -192},
+ CALLEE_SAVED_VRS,
// SPE register save area (overlaps Vector save area).
{PPC::S31, -8},
@@ -212,73 +218,48 @@ const PPCFrameLowering::SpillSlot *PPCFrameLowering::getCalleeSavedSpillSlots(
{PPC::S15, -136},
{PPC::S14, -144}};
- static const SpillSlot Offsets64[] = {
- // Floating-point register save area offsets.
- {PPC::F31, -8},
- {PPC::F30, -16},
- {PPC::F29, -24},
- {PPC::F28, -32},
- {PPC::F27, -40},
- {PPC::F26, -48},
- {PPC::F25, -56},
- {PPC::F24, -64},
- {PPC::F23, -72},
- {PPC::F22, -80},
- {PPC::F21, -88},
- {PPC::F20, -96},
- {PPC::F19, -104},
- {PPC::F18, -112},
- {PPC::F17, -120},
- {PPC::F16, -128},
- {PPC::F15, -136},
- {PPC::F14, -144},
-
- // General register save area offsets.
- {PPC::X31, -8},
- {PPC::X30, -16},
- {PPC::X29, -24},
- {PPC::X28, -32},
- {PPC::X27, -40},
- {PPC::X26, -48},
- {PPC::X25, -56},
- {PPC::X24, -64},
- {PPC::X23, -72},
- {PPC::X22, -80},
- {PPC::X21, -88},
- {PPC::X20, -96},
- {PPC::X19, -104},
- {PPC::X18, -112},
- {PPC::X17, -120},
- {PPC::X16, -128},
- {PPC::X15, -136},
- {PPC::X14, -144},
+ static const SpillSlot ELFOffsets64[] = {
+ CALLEE_SAVED_FPRS,
+ CALLEE_SAVED_GPRS64,
// VRSAVE save area offset.
{PPC::VRSAVE, -4},
+ CALLEE_SAVED_VRS
+ };
- // Vector register save area
- {PPC::V31, -16},
- {PPC::V30, -32},
- {PPC::V29, -48},
- {PPC::V28, -64},
- {PPC::V27, -80},
- {PPC::V26, -96},
- {PPC::V25, -112},
- {PPC::V24, -128},
- {PPC::V23, -144},
- {PPC::V22, -160},
- {PPC::V21, -176},
- {PPC::V20, -192}};
+ static const SpillSlot AIXOffsets32[] = {
+ CALLEE_SAVED_FPRS,
+ CALLEE_SAVED_GPRS32,
+ // Add AIX's extra CSR.
+ {PPC::R13, -76},
+ // TODO: Update when we add vector support for AIX.
+ };
- if (Subtarget.isPPC64()) {
- NumEntries = array_lengthof(Offsets64);
+ static const SpillSlot AIXOffsets64[] = {
+ CALLEE_SAVED_FPRS,
+ CALLEE_SAVED_GPRS64,
+ // TODO: Update when we add vector support for AIX.
+ };
- return Offsets64;
- } else {
- NumEntries = array_lengthof(Offsets);
+ if (Subtarget.is64BitELFABI()) {
+ NumEntries = array_lengthof(ELFOffsets64);
+ return ELFOffsets64;
+ }
- return Offsets;
+ if (Subtarget.is32BitELFABI()) {
+ NumEntries = array_lengthof(ELFOffsets32);
+ return ELFOffsets32;
}
+
+ assert(Subtarget.isAIXABI() && "Unexpected ABI.");
+
+ if (Subtarget.isPPC64()) {
+ NumEntries = array_lengthof(AIXOffsets64);
+ return AIXOffsets64;
+ }
+
+ NumEntries = array_lengthof(AIXOffsets32);
+ return AIXOffsets32;
}
/// RemoveVRSaveCode - We have found that this function does not need any code
@@ -479,9 +460,9 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
UseEstimate ? MFI.estimateStackSize(MF) : MFI.getStackSize();
// Get stack alignments. The frame must be aligned to the greatest of these:
- unsigned TargetAlign = getStackAlignment(); // alignment required per the ABI
- unsigned MaxAlign = MFI.getMaxAlignment(); // algmt required by data in frame
- unsigned AlignMask = std::max(MaxAlign, TargetAlign) - 1;
+ Align TargetAlign = getStackAlign(); // alignment required per the ABI
+ Align MaxAlign = MFI.getMaxAlign(); // algmt required by data in frame
+ Align Alignment = std::max(TargetAlign, MaxAlign);
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -513,7 +494,7 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
// If we have dynamic alloca then maxCallFrameSize needs to be aligned so
// that allocations will be aligned.
if (MFI.hasVarSizedObjects())
- maxCallFrameSize = (maxCallFrameSize + AlignMask) & ~AlignMask;
+ maxCallFrameSize = alignTo(maxCallFrameSize, Alignment);
// Update the new max call frame size if the caller passes in a valid pointer.
if (NewMaxCallFrameSize)
@@ -523,7 +504,7 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF,
FrameSize += maxCallFrameSize;
// Make sure the frame is aligned.
- FrameSize = (FrameSize + AlignMask) & ~AlignMask;
+ FrameSize = alignTo(FrameSize, Alignment);
return FrameSize;
}
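The alignTo calls above replace the old mask arithmetic with an equivalent power-of-two rounding. A minimal sketch of that computation (illustrative only, assuming the usual LLVM support headers):

#include "llvm/Support/Alignment.h"
#include <algorithm>
#include <cstdint>

// Round FrameSize up to the larger of the ABI stack alignment and the maximum
// alignment required by objects in the frame, as determineFrameLayout now does.
static uint64_t roundFrameSize(uint64_t FrameSize, llvm::Align TargetAlign,
                               llvm::Align MaxAlign) {
  llvm::Align Alignment = std::max(TargetAlign, MaxAlign);
  // e.g. alignTo(200, Align(16)) == 208, matching the old (200 + 15) & ~15.
  return llvm::alignTo(FrameSize, Alignment);
}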
@@ -613,11 +594,11 @@ bool
PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
bool UseAtEnd,
bool TwoUniqueRegsRequired,
- unsigned *SR1,
- unsigned *SR2) const {
+ Register *SR1,
+ Register *SR2) const {
RegScavenger RS;
- unsigned R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
- unsigned R12 = Subtarget.isPPC64() ? PPC::X12 : PPC::R12;
+ Register R0 = Subtarget.isPPC64() ? PPC::X0 : PPC::R0;
+ Register R12 = Subtarget.isPPC64() ? PPC::X12 : PPC::R12;
// Set the defaults for the two scratch registers.
if (SR1)
@@ -684,7 +665,7 @@ PPCFrameLowering::findScratchRegister(MachineBasicBlock *MBB,
if (SecondScratchReg != -1)
*SR2 = SecondScratchReg;
else
- *SR2 = TwoUniqueRegsRequired ? (unsigned)PPC::NoRegister : *SR1;
+ *SR2 = TwoUniqueRegsRequired ? Register() : *SR1;
}
// Now that we've done our best to provide both registers, double check
@@ -709,7 +690,7 @@ PPCFrameLowering::twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const {
int NegFrameSize = -FrameSize;
bool IsLargeFrame = !isInt<16>(NegFrameSize);
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
return (IsLargeFrame || !HasRedZone) && HasBP && MaxAlign > 1;
@@ -778,11 +759,13 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
MachineFrameInfo &MFI = MF.getFrameInfo();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
MachineModuleInfo &MMI = MF.getMMI();
const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
DebugLoc dl;
- bool needsCFI = MF.needsFrameMoves();
+ // AIX assembler does not support cfi directives.
+ const bool needsCFI = MF.needsFrameMoves() && !Subtarget.isAIXABI();
// Get processor type.
bool isPPC64 = Subtarget.isPPC64();
@@ -790,8 +773,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
bool isSVR4ABI = Subtarget.isSVR4ABI();
bool isAIXABI = Subtarget.isAIXABI();
bool isELFv2ABI = Subtarget.isELFv2ABI();
- assert((Subtarget.isDarwinABI() || isSVR4ABI || isAIXABI) &&
- "Unsupported PPC ABI.");
+ assert((isSVR4ABI || isAIXABI) && "Unsupported PPC ABI.");
// Scan the prolog, looking for an UPDATE_VRSAVE instruction. If we find it,
// process it.
@@ -821,20 +803,20 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
bool MustSaveTOC = FI->mustSaveTOC();
- const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+ const SmallVectorImpl<Register> &MustSaveCRs = FI->getMustSaveCRs();
bool MustSaveCR = !MustSaveCRs.empty();
// Do we have a frame pointer and/or base pointer for this function?
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
bool HasRedZone = isPPC64 || !isSVR4ABI;
- unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
Register BPReg = RegInfo->getBaseRegister(MF);
- unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
- unsigned LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
- unsigned TOCReg = isPPC64 ? PPC::X2 : PPC::R2;
- unsigned ScratchReg = 0;
- unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+ Register FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+ Register LRReg = isPPC64 ? PPC::LR8 : PPC::LR;
+ Register TOCReg = isPPC64 ? PPC::X2 : PPC::R2;
+ Register ScratchReg;
+ Register TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
// ...(R12/X12 is volatile in both Darwin & SVR4, & can't be a function arg.)
const MCInstrDesc& MFLRInst = TII.get(isPPC64 ? PPC::MFLR8
: PPC::MFLR );
@@ -854,6 +836,9 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
: PPC::SUBFC);
const MCInstrDesc& SubtractImmCarryingInst = TII.get(isPPC64 ? PPC::SUBFIC8
: PPC::SUBFIC);
+ const MCInstrDesc &MoveFromCondRegInst = TII.get(isPPC64 ? PPC::MFCR8
+ : PPC::MFCR);
+ const MCInstrDesc &StoreWordInst = TII.get(isPPC64 ? PPC::STW8 : PPC::STW);
// Regarding this assert: Even though LR is saved in the caller's frame (i.e.,
// LROffset is positive), that slot is callee-owned. Because PPC32 SVR4 has no
@@ -863,9 +848,12 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
"FrameSize must be >0 to save/restore the FP or LR for 32-bit SVR4.");
// Using the same bool variable as below to suppress compiler warnings.
- bool SingleScratchReg =
- findScratchRegister(&MBB, false, twoUniqueScratchRegsRequired(&MBB),
- &ScratchReg, &TempReg);
+ // The stack probe requires two scratch registers: one to hold the old SP and
+ // one to handle large frames and large probe sizes.
+ bool SingleScratchReg = findScratchRegister(
+ &MBB, false,
+ twoUniqueScratchRegsRequired(&MBB) || TLI.hasInlineStackProbe(MF),
+ &ScratchReg, &TempReg);
assert(SingleScratchReg &&
"Required number of registers not available in this block");
@@ -906,21 +894,14 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
}
// Get stack alignments.
- unsigned MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
if (HasBP && MaxAlign > 1)
- assert(isPowerOf2_32(MaxAlign) && isInt<16>(MaxAlign) &&
- "Invalid alignment!");
+ assert(Log2(MaxAlign) < 16 && "Invalid alignment!");
// Frames of 32KB & larger require special handling because they cannot be
// indexed into with a simple STDU/STWU/STD/STW immediate offset operand.
bool isLargeFrame = !isInt<16>(NegFrameSize);
- assert((isPPC64 || !MustSaveCR) &&
- "Prologue CR saving supported only in 64-bit mode");
-
- if (MustSaveCR && isAIXABI)
- report_fatal_error("Prologue CR saving is unimplemented on AIX.");
-
// Check if we can move the stack update instruction (stdu) down the prologue
// past the callee saves. Hopefully this will avoid the situation where the
// saves are waiting for the update on the store with update to complete.
@@ -960,49 +941,42 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
}
}
- // If we need to spill the CR and the LR but we don't have two separate
- // registers available, we must spill them one at a time
- if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+ // Where in the prologue we move the CR fields depends on how many scratch
+ // registers we have and on whether we need to save the link register. This
+ // lambda avoids duplicating the logic in two places.
+ auto BuildMoveFromCR = [&]() {
+ if (isELFv2ABI && MustSaveCRs.size() == 1) {
// In the ELFv2 ABI, we are not required to save all CR fields.
- // If only one or two CR fields are clobbered, it is more efficient to use
- // mfocrf to selectively save just those fields, because mfocrf has short
+ // If only one CR field is clobbered, it is more efficient to use
+ // mfocrf to selectively save just that field, because mfocrf has short
// latency compares to mfcr.
- unsigned MfcrOpcode = PPC::MFCR8;
- unsigned CrState = RegState::ImplicitKill;
- if (isELFv2ABI && MustSaveCRs.size() == 1) {
- MfcrOpcode = PPC::MFOCRF8;
- CrState = RegState::Kill;
+ assert(isPPC64 && "V2 ABI is 64-bit only.");
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::MFOCRF8), TempReg);
+ MIB.addReg(MustSaveCRs[0], RegState::Kill);
+ } else {
+ MachineInstrBuilder MIB =
+ BuildMI(MBB, MBBI, dl, MoveFromCondRegInst, TempReg);
+ for (unsigned CRfield : MustSaveCRs)
+ MIB.addReg(CRfield, RegState::ImplicitKill);
}
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
- for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- MIB.addReg(MustSaveCRs[i], CrState);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
- .addReg(TempReg, getKillRegState(true))
- .addImm(getCRSaveOffset())
- .addReg(SPReg);
+ };
+
+ // If we need to spill the CR and the LR but we don't have two separate
+ // registers available, we must spill them one at a time
+ if (MustSaveCR && SingleScratchReg && MustSaveLR) {
+ BuildMoveFromCR();
+ BuildMI(MBB, MBBI, dl, StoreWordInst)
+ .addReg(TempReg, getKillRegState(true))
+ .addImm(CRSaveOffset)
+ .addReg(SPReg);
}
if (MustSaveLR)
BuildMI(MBB, MBBI, dl, MFLRInst, ScratchReg);
- if (MustSaveCR &&
- !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
- // In the ELFv2 ABI, we are not required to save all CR fields.
- // If only one or two CR fields are clobbered, it is more efficient to use
- // mfocrf to selectively save just those fields, because mfocrf has short
- // latency compares to mfcr.
- unsigned MfcrOpcode = PPC::MFCR8;
- unsigned CrState = RegState::ImplicitKill;
- if (isELFv2ABI && MustSaveCRs.size() == 1) {
- MfcrOpcode = PPC::MFOCRF8;
- CrState = RegState::Kill;
- }
- MachineInstrBuilder MIB =
- BuildMI(MBB, MBBI, dl, TII.get(MfcrOpcode), TempReg);
- for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- MIB.addReg(MustSaveCRs[i], CrState);
- }
+ if (MustSaveCR && !(SingleScratchReg && MustSaveLR))
+ BuildMoveFromCR();
if (HasRedZone) {
if (HasFP)
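The BuildMoveFromCR lambda above chooses between the selective and the full condition-register move. A compact restatement of that choice (illustrative only; assumes the usual PPC target headers for the opcode enums):

#include "MCTargetDesc/PPCMCTargetDesc.h"

// ELFv2 with a single clobbered CR field uses the cheaper MFOCRF8; every other
// configuration moves the whole condition register with MFCR or MFCR8.
static unsigned selectCRMoveOpcode(bool IsELFv2, unsigned NumCRFields,
                                   bool IsPPC64) {
  if (IsELFv2 && NumCRFields == 1)
    return PPC::MFOCRF8; // the ELFv2 ABI is 64-bit only
  return IsPPC64 ? PPC::MFCR8 : PPC::MFCR;
}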
@@ -1029,11 +1003,11 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(SPReg);
if (MustSaveCR &&
- !(SingleScratchReg && MustSaveLR)) { // will only occur for PPC64
+ !(SingleScratchReg && MustSaveLR)) {
assert(HasRedZone && "A red zone is always available on PPC64");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::STW8))
+ BuildMI(MBB, MBBI, dl, StoreWordInst)
.addReg(TempReg, getKillRegState(true))
- .addImm(getCRSaveOffset())
+ .addImm(CRSaveOffset)
.addReg(SPReg);
}
@@ -1055,58 +1029,81 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// the negated frame size will be placed in ScratchReg.
bool HasSTUX = false;
- // This condition must be kept in sync with canUseAsPrologue.
- if (HasBP && MaxAlign > 1) {
- if (isPPC64)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(64 - Log2_32(MaxAlign));
- else // PPC32...
- BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
- .addReg(SPReg)
- .addImm(0)
- .addImm(32 - Log2_32(MaxAlign))
- .addImm(31);
- if (!isLargeFrame) {
- BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
- .addReg(ScratchReg, RegState::Kill)
+ // If FrameSize <= TLI.getStackProbeSize(MF), we get a free probe: the PowerPC
+ // ABI requires the backchain pointer to be stored at SP, so the mandatory
+ // STU(X) instruction already probes the newly allocated stack.
+ if (TLI.hasInlineStackProbe(MF) && FrameSize > TLI.getStackProbeSize(MF)) {
+ // To be consistent with other targets, a pseudo instruction is emitted and
+ // will be later expanded in `inlineStackProbe`.
+ BuildMI(MBB, MBBI, dl,
+ TII.get(isPPC64 ? PPC::PROBED_STACKALLOC_64
+ : PPC::PROBED_STACKALLOC_32))
+ .addDef(ScratchReg)
+ .addDef(TempReg) // TempReg stores the old sp.
.addImm(NegFrameSize);
- } else {
- assert(!SingleScratchReg && "Only a single scratch reg available");
- BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
- .addImm(NegFrameSize >> 16);
- BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
- .addReg(TempReg, RegState::Kill)
- .addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
- .addReg(ScratchReg, RegState::Kill)
- .addReg(TempReg, RegState::Kill);
+ // FIXME: HasSTUX is only read if HasRedZone is not set; in that case we
+ // update ScratchReg to satisfy the assumption that ScratchReg contains
+ // the NegFrameSize. This solution is rather tricky.
+ if (!HasRedZone) {
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::SUBF), ScratchReg)
+ .addReg(TempReg)
+ .addReg(SPReg);
+ HasSTUX = true;
}
+ } else {
+ // This condition must be kept in sync with canUseAsPrologue.
+ if (HasBP && MaxAlign > 1) {
+ if (isPPC64)
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLDICL), ScratchReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(64 - Log2(MaxAlign));
+ else // PPC32...
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::RLWINM), ScratchReg)
+ .addReg(SPReg)
+ .addImm(0)
+ .addImm(32 - Log2(MaxAlign))
+ .addImm(31);
+ if (!isLargeFrame) {
+ BuildMI(MBB, MBBI, dl, SubtractImmCarryingInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(NegFrameSize);
+ } else {
+ assert(!SingleScratchReg && "Only a single scratch reg available");
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, TempReg)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, TempReg)
+ .addReg(TempReg, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, SubtractCarryingInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addReg(TempReg, RegState::Kill);
+ }
- BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
- .addReg(SPReg, RegState::Kill)
- .addReg(SPReg)
- .addReg(ScratchReg);
- HasSTUX = true;
+ BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+ .addReg(SPReg, RegState::Kill)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ HasSTUX = true;
- } else if (!isLargeFrame) {
- BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg)
- .addReg(SPReg)
- .addImm(NegFrameSize)
- .addReg(SPReg);
+ } else if (!isLargeFrame) {
+ BuildMI(MBB, StackUpdateLoc, dl, StoreUpdtInst, SPReg)
+ .addReg(SPReg)
+ .addImm(NegFrameSize)
+ .addReg(SPReg);
- } else {
- BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
- .addImm(NegFrameSize >> 16);
- BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
- .addReg(ScratchReg, RegState::Kill)
- .addImm(NegFrameSize & 0xFFFF);
- BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
- .addReg(SPReg, RegState::Kill)
- .addReg(SPReg)
- .addReg(ScratchReg);
- HasSTUX = true;
+ } else {
+ BuildMI(MBB, MBBI, dl, LoadImmShiftedInst, ScratchReg)
+ .addImm(NegFrameSize >> 16);
+ BuildMI(MBB, MBBI, dl, OrImmInst, ScratchReg)
+ .addReg(ScratchReg, RegState::Kill)
+ .addImm(NegFrameSize & 0xFFFF);
+ BuildMI(MBB, MBBI, dl, StoreUpdtIdxInst, SPReg)
+ .addReg(SPReg, RegState::Kill)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ HasSTUX = true;
+ }
}
// Save the TOC register after the stack pointer update if a prologue TOC
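As the comment above the PROBED_STACKALLOC pseudo notes, the probing path is only needed when the frame exceeds one probe interval. A minimal sketch of that decision (illustrative only; the real probe size comes from TLI.getStackProbeSize):

#include <cstdint>

// Frames no larger than one probe interval are already probed by the mandatory
// backchain store (STU/STWU), so no probe pseudo is emitted for them.
static bool needsProbePseudo(uint64_t FrameSize, uint64_t ProbeSize) {
  // e.g. FrameSize = 70000, ProbeSize = 4096 -> true;
  //      FrameSize =  2048, ProbeSize = 4096 -> false.
  return FrameSize > ProbeSize;
}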
@@ -1247,7 +1244,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// Adjust the definition of CFA to account for the change in SP.
assert(NegFrameSize);
CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, NegFrameSize));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -NegFrameSize));
}
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
@@ -1337,7 +1334,7 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
// actually saved gets its own CFI record.
unsigned CRReg = isELFv2ABI? Reg : (unsigned) PPC::CR2;
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
- nullptr, MRI->getDwarfRegNum(CRReg, true), getCRSaveOffset()));
+ nullptr, MRI->getDwarfRegNum(CRReg, true), CRSaveOffset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
continue;
@@ -1367,6 +1364,175 @@ void PPCFrameLowering::emitPrologue(MachineFunction &MF,
}
}
+void PPCFrameLowering::inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const {
+ // TODO: Generate CFI instructions.
+ bool isPPC64 = Subtarget.isPPC64();
+ const PPCTargetLowering &TLI = *Subtarget.getTargetLowering();
+ const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ // AIX assembler does not support cfi directives.
+ const bool needsCFI = MF.needsFrameMoves() && !Subtarget.isAIXABI();
+ auto StackAllocMIPos = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
+ int Opc = MI.getOpcode();
+ return Opc == PPC::PROBED_STACKALLOC_64 || Opc == PPC::PROBED_STACKALLOC_32;
+ });
+ if (StackAllocMIPos == PrologMBB.end())
+ return;
+ const BasicBlock *ProbedBB = PrologMBB.getBasicBlock();
+ DebugLoc DL = PrologMBB.findDebugLoc(StackAllocMIPos);
+ MachineInstr &MI = *StackAllocMIPos;
+ int64_t NegFrameSize = MI.getOperand(2).getImm();
+ int64_t NegProbeSize = -(int64_t)TLI.getStackProbeSize(MF);
+ assert(isInt<32>(NegProbeSize) && "Unhandled probe size");
+ int64_t NumBlocks = NegFrameSize / NegProbeSize;
+ int64_t NegResidualSize = NegFrameSize % NegProbeSize;
+ Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ Register ScratchReg = MI.getOperand(0).getReg();
+ Register FPReg = MI.getOperand(1).getReg();
+ const PPCRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
+ bool HasBP = RegInfo->hasBasePointer(MF);
+ Align MaxAlign = MFI.getMaxAlign();
+ // Initialize current frame pointer.
+ const MCInstrDesc &CopyInst = TII.get(isPPC64 ? PPC::OR8 : PPC::OR);
+ BuildMI(PrologMBB, {MI}, DL, CopyInst, FPReg).addReg(SPReg).addReg(SPReg);
+ // Subroutines to generate .cfi_* directives.
+ auto buildDefCFAReg = [&](MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register Reg) {
+ unsigned RegNum = MRI->getDwarfRegNum(Reg, true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, RegNum));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ };
+ auto buildDefCFA = [&](MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register Reg,
+ int Offset) {
+ unsigned RegNum = MRI->getDwarfRegNum(Reg, true);
+ unsigned CFIIndex = MBB.getParent()->addFrameInst(
+ MCCFIInstruction::cfiDefCfa(nullptr, RegNum, Offset));
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+ };
+ // Subroutine to determine if we can use Imm as part of a d-form instruction.
+ auto CanUseDForm = [](int64_t Imm) { return isInt<16>(Imm) && Imm % 4 == 0; };
+ // Subroutine to materialize the Imm into TempReg.
+ auto MaterializeImm = [&](MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, int64_t Imm,
+ Register &TempReg) {
+ assert(isInt<32>(Imm) && "Unhandled imm");
+ if (isInt<16>(Imm))
+ BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::LI8 : PPC::LI), TempReg)
+ .addImm(Imm);
+ else {
+ BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
+ .addImm(Imm >> 16);
+ BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::ORI8 : PPC::ORI), TempReg)
+ .addReg(TempReg)
+ .addImm(Imm & 0xFFFF);
+ }
+ };
+ // Subroutine to store frame pointer and decrease stack pointer by probe size.
+ auto allocateAndProbe = [&](MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, int64_t NegSize,
+ Register NegSizeReg, bool UseDForm) {
+ if (UseDForm)
+ BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDU : PPC::STWU), SPReg)
+ .addReg(FPReg)
+ .addImm(NegSize)
+ .addReg(SPReg);
+ else
+ BuildMI(MBB, MBBI, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
+ .addReg(FPReg)
+ .addReg(SPReg)
+ .addReg(NegSizeReg);
+ };
+ // Use FPReg to calculate CFA.
+ if (needsCFI)
+ buildDefCFA(PrologMBB, {MI}, FPReg, 0);
+ // In the case where HasBP && MaxAlign > 1, we have to align the SP by
+ // performing SP = SP - SP % MaxAlign.
+ if (HasBP && MaxAlign > 1) {
+ if (isPPC64)
+ BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLDICL), ScratchReg)
+ .addReg(FPReg)
+ .addImm(0)
+ .addImm(64 - Log2(MaxAlign));
+ else
+ BuildMI(PrologMBB, {MI}, DL, TII.get(PPC::RLWINM), ScratchReg)
+ .addReg(FPReg)
+ .addImm(0)
+ .addImm(32 - Log2(MaxAlign))
+ .addImm(31);
+ BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::STDUX : PPC::STWUX),
+ SPReg)
+ .addReg(FPReg)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ }
+ // Probe residual part.
+ if (NegResidualSize) {
+ bool ResidualUseDForm = CanUseDForm(NegResidualSize);
+ if (!ResidualUseDForm)
+ MaterializeImm(PrologMBB, {MI}, NegResidualSize, ScratchReg);
+ allocateAndProbe(PrologMBB, {MI}, NegResidualSize, ScratchReg,
+ ResidualUseDForm);
+ }
+ bool UseDForm = CanUseDForm(NegProbeSize);
+ // If the number of blocks is small, just probe them directly.
+ if (NumBlocks < 3) {
+ if (!UseDForm)
+ MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg);
+ for (int i = 0; i < NumBlocks; ++i)
+ allocateAndProbe(PrologMBB, {MI}, NegProbeSize, ScratchReg, UseDForm);
+ if (needsCFI) {
+ // Restore using SPReg to calculate CFA.
+ buildDefCFAReg(PrologMBB, {MI}, SPReg);
+ }
+ } else {
+ // Since CTR is a volatile register and the current shrink-wrap implementation
+ // won't choose an MBB inside a loop as the PrologMBB, it is safe to synthesize
+ // a CTR loop to probe.
+ // Calculate the trip count and store it in the CTR register.
+ MaterializeImm(PrologMBB, {MI}, NumBlocks, ScratchReg);
+ BuildMI(PrologMBB, {MI}, DL, TII.get(isPPC64 ? PPC::MTCTR8 : PPC::MTCTR))
+ .addReg(ScratchReg, RegState::Kill);
+ if (!UseDForm)
+ MaterializeImm(PrologMBB, {MI}, NegProbeSize, ScratchReg);
+ // Create MBBs of the loop.
+ MachineFunction::iterator MBBInsertPoint =
+ std::next(PrologMBB.getIterator());
+ MachineBasicBlock *LoopMBB = MF.CreateMachineBasicBlock(ProbedBB);
+ MF.insert(MBBInsertPoint, LoopMBB);
+ MachineBasicBlock *ExitMBB = MF.CreateMachineBasicBlock(ProbedBB);
+ MF.insert(MBBInsertPoint, ExitMBB);
+ // Synthesize the loop body.
+ allocateAndProbe(*LoopMBB, LoopMBB->end(), NegProbeSize, ScratchReg,
+ UseDForm);
+ BuildMI(LoopMBB, DL, TII.get(isPPC64 ? PPC::BDNZ8 : PPC::BDNZ))
+ .addMBB(LoopMBB);
+ LoopMBB->addSuccessor(ExitMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+ // Synthesize the exit MBB.
+ ExitMBB->splice(ExitMBB->end(), &PrologMBB,
+ std::next(MachineBasicBlock::iterator(MI)),
+ PrologMBB.end());
+ ExitMBB->transferSuccessorsAndUpdatePHIs(&PrologMBB);
+ PrologMBB.addSuccessor(LoopMBB);
+ if (needsCFI) {
+ // Restore using SPReg to calculate CFA.
+ buildDefCFAReg(*ExitMBB, ExitMBB->begin(), SPReg);
+ }
+ // Update liveins.
+ recomputeLiveIns(*LoopMBB);
+ recomputeLiveIns(*ExitMBB);
+ }
+ ++NumPrologProbed;
+ MI.eraseFromParent();
+}
+
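To make the arithmetic in inlineStackProbe concrete, here is a worked split of a frame into probe blocks and a residual, assuming an illustrative 4 KiB probe size (a sketch only; the actual probe size is target- and function-dependent):

#include <cstdint>
#include <utility>

// Mirror of the NumBlocks / NegResidualSize computation above.
// For NegFrameSize = -70000 and NegProbeSize = -4096:
//   NumBlocks       = -70000 / -4096 = 17   (>= 3, so a CTR loop is synthesized)
//   NegResidualSize = -70000 % -4096 = -368 (probed first with a single STU/STWU)
static std::pair<int64_t, int64_t> splitProbe(int64_t NegFrameSize,
                                              int64_t NegProbeSize) {
  return {NegFrameSize / NegProbeSize, NegFrameSize % NegProbeSize};
}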
void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineBasicBlock::iterator MBBI = MBB.getFirstTerminator();
@@ -1392,18 +1558,18 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// Check if the link register (LR) has been saved.
PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
bool MustSaveLR = FI->mustSaveLR();
- const SmallVectorImpl<unsigned> &MustSaveCRs = FI->getMustSaveCRs();
+ const SmallVectorImpl<Register> &MustSaveCRs = FI->getMustSaveCRs();
bool MustSaveCR = !MustSaveCRs.empty();
// Do we have a frame pointer and/or base pointer for this function?
bool HasFP = hasFP(MF);
bool HasBP = RegInfo->hasBasePointer(MF);
bool HasRedZone = Subtarget.isPPC64() || !Subtarget.isSVR4ABI();
- unsigned SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
Register BPReg = RegInfo->getBaseRegister(MF);
- unsigned FPReg = isPPC64 ? PPC::X31 : PPC::R31;
- unsigned ScratchReg = 0;
- unsigned TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
+ Register FPReg = isPPC64 ? PPC::X31 : PPC::R31;
+ Register ScratchReg;
+ Register TempReg = isPPC64 ? PPC::X12 : PPC::R12; // another scratch reg
const MCInstrDesc& MTLRInst = TII.get( isPPC64 ? PPC::MTLR8
: PPC::MTLR );
const MCInstrDesc& LoadInst = TII.get( isPPC64 ? PPC::LD
@@ -1418,7 +1584,10 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
: PPC::ADDI );
const MCInstrDesc& AddInst = TII.get( isPPC64 ? PPC::ADD8
: PPC::ADD4 );
-
+ const MCInstrDesc& LoadWordInst = TII.get( isPPC64 ? PPC::LWZ8
+ : PPC::LWZ);
+ const MCInstrDesc& MoveToCRInst = TII.get( isPPC64 ? PPC::MTOCRF8
+ : PPC::MTOCRF);
int LROffset = getReturnSaveOffset();
int FPOffset = 0;
@@ -1593,20 +1762,17 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
// value (although not the base register). Make sure it is not overwritten
// too early.
- assert((isPPC64 || !MustSaveCR) &&
- "Epilogue CR restoring supported only in 64-bit mode");
-
// If we need to restore both the LR and the CR and we only have one
// available scratch register, we must do them one at a time.
if (MustSaveCR && SingleScratchReg && MustSaveLR) {
// Here TempReg == ScratchReg, and in the absence of red zone ScratchReg
// is live here.
assert(HasRedZone && "Expecting red zone");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
- .addImm(getCRSaveOffset())
+ BuildMI(MBB, MBBI, dl, LoadWordInst, TempReg)
+ .addImm(CRSaveOffset)
.addReg(SPReg);
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+ BuildMI(MBB, MBBI, dl, MoveToCRInst, MustSaveCRs[i])
.addReg(TempReg, getKillRegState(i == e-1));
}
@@ -1623,11 +1789,9 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
if (MustSaveCR && !(SingleScratchReg && MustSaveLR)) {
- // This will only occur for PPC64.
- assert(isPPC64 && "Expecting 64-bit mode");
assert(RBReg == SPReg && "Should be using SP as a base register");
- BuildMI(MBB, MBBI, dl, TII.get(PPC::LWZ8), TempReg)
- .addImm(getCRSaveOffset())
+ BuildMI(MBB, MBBI, dl, LoadWordInst, TempReg)
+ .addImm(CRSaveOffset)
.addReg(RBReg);
}
@@ -1682,9 +1846,9 @@ void PPCFrameLowering::emitEpilogue(MachineFunction &MF,
}
if (MustSaveCR &&
- !(SingleScratchReg && MustSaveLR)) // will only occur for PPC64
+ !(SingleScratchReg && MustSaveLR))
for (unsigned i = 0, e = MustSaveCRs.size(); i != e; ++i)
- BuildMI(MBB, MBBI, dl, TII.get(PPC::MTOCRF8), MustSaveCRs[i])
+ BuildMI(MBB, MBBI, dl, MoveToCRInst, MustSaveCRs[i])
.addReg(TempReg, getKillRegState(i == e-1));
if (MustSaveLR)
@@ -1729,13 +1893,25 @@ void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
DebugLoc dl = MBBI->getDebugLoc();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
- // Create branch instruction for pseudo tail call return instruction
+ // Create the branch instruction for the pseudo tail call return instruction.
+ // The TCRETURNdi variants are direct calls. Valid targets for those are
+ // MO_GlobalAddress operands as well as MO_ExternalSymbol operands with PC-Rel,
+ // since we can tail call external functions with PC-Rel (i.e. we don't need
+ // to worry about different TOC pointers). Some of the external functions will
+ // be MO_GlobalAddress while others, like memcpy for example, are going to
+ // be MO_ExternalSymbol.
unsigned RetOpcode = MBBI->getOpcode();
if (RetOpcode == PPC::TCRETURNdi) {
MBBI = MBB.getLastNonDebugInstr();
MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
- addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ if (JumpTarget.isGlobal())
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ else if (JumpTarget.isSymbol())
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB)).
+ addExternalSymbol(JumpTarget.getSymbolName());
+ else
+ llvm_unreachable("Expecting Global or External Symbol");
} else if (RetOpcode == PPC::TCRETURNri) {
MBBI = MBB.getLastNonDebugInstr();
assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
@@ -1747,8 +1923,14 @@ void PPCFrameLowering::createTailCallBranchInstr(MachineBasicBlock &MBB) const {
} else if (RetOpcode == PPC::TCRETURNdi8) {
MBBI = MBB.getLastNonDebugInstr();
MachineOperand &JumpTarget = MBBI->getOperand(0);
- BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
- addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ if (JumpTarget.isGlobal())
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+ addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
+ else if (JumpTarget.isSymbol())
+ BuildMI(MBB, MBBI, dl, TII.get(PPC::TAILB8)).
+ addExternalSymbol(JumpTarget.getSymbolName());
+ else
+ llvm_unreachable("Expecting Global or External Symbol");
} else if (RetOpcode == PPC::TCRETURNri8) {
MBBI = MBB.getLastNonDebugInstr();
assert(MBBI->getOperand(0).isReg() && "Expecting register operand.");
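The two TCRETURNdi cases above dispatch on the operand kind in the same way. A compact restatement (the helper name is illustrative, not part of this patch):

#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/Support/ErrorHandling.h"

// Direct tail calls carry either a global address or an external symbol.
static void addTailCallTarget(llvm::MachineInstrBuilder &MIB,
                              const llvm::MachineOperand &JumpTarget) {
  if (JumpTarget.isGlobal())
    MIB.addGlobalAddress(JumpTarget.getGlobal(), JumpTarget.getOffset());
  else if (JumpTarget.isSymbol())
    MIB.addExternalSymbol(JumpTarget.getSymbolName());
  else
    llvm_unreachable("Expecting Global or External Symbol");
}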
@@ -1776,7 +1958,6 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
// Save R31 if necessary
int FPSI = FI->getFramePointerSaveIndex();
const bool isPPC64 = Subtarget.isPPC64();
- const bool IsDarwinABI = Subtarget.isDarwinABI();
MachineFrameInfo &MFI = MF.getFrameInfo();
// If the frame pointer save index hasn't been defined yet.
@@ -1823,25 +2004,26 @@ void PPCFrameLowering::determineCalleeSaves(MachineFunction &MF,
MFI.CreateFixedObject(-1 * TCSPDelta, TCSPDelta, true);
}
- // For 32-bit SVR4, allocate the nonvolatile CR spill slot iff the
- // function uses CR 2, 3, or 4.
- if (!isPPC64 && !IsDarwinABI &&
- (SavedRegs.test(PPC::CR2) ||
- SavedRegs.test(PPC::CR3) ||
+ // Allocate the nonvolatile CR spill slot iff the function uses CR 2, 3, or 4.
+ // For 64-bit SVR4 and all flavors of AIX, we create a FixedStack
+ // object at the offset of the CR-save slot in the linkage area. The actual
+ // save and restore of the condition register will be created as part of the
+ // prologue and epilogue insertion, but the FixedStack object is needed to
+ // keep the CalleeSavedInfo valid.
+ if ((SavedRegs.test(PPC::CR2) || SavedRegs.test(PPC::CR3) ||
SavedRegs.test(PPC::CR4))) {
- int FrameIdx = MFI.CreateFixedObject((uint64_t)4, (int64_t)-4, true);
+ const uint64_t SpillSize = 4; // Condition register is always 4 bytes.
+ const int64_t SpillOffset =
+ Subtarget.isPPC64() ? 8 : Subtarget.isAIXABI() ? 4 : -4;
+ int FrameIdx =
+ MFI.CreateFixedObject(SpillSize, SpillOffset,
+ /* IsImmutable */ true, /* IsAliased */ false);
FI->setCRSpillFrameIndex(FrameIdx);
}
}
void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const {
- // Early exit if not using the SVR4 ABI.
- if (!Subtarget.isSVR4ABI()) {
- addScavengingSpillSlot(MF, RS);
- return;
- }
-
// Get callee saved register information.
MachineFrameInfo &MFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
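The nested conditional that picks SpillOffset above reads as a small decision table. Spelled out for reference (illustrative only):

#include <cstdint>

// CR spill-slot offset used for the FixedStack object in determineCalleeSaves.
static int64_t crSpillSlotOffset(bool IsPPC64, bool IsAIX) {
  if (IsPPC64)
    return 8;  // CR save word in the caller's linkage area
  if (IsAIX)
    return 4;  // 32-bit AIX keeps the CR save word at offset 4
  return -4;   // 32-bit ELF allocates the slot below the stack pointer
}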
@@ -2014,11 +2196,8 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
std::min<unsigned>(TRI->getEncodingValue(MinGPR),
TRI->getEncodingValue(MinG8R));
- if (Subtarget.isPPC64()) {
- LowerBound -= (31 - MinReg + 1) * 8;
- } else {
- LowerBound -= (31 - MinReg + 1) * 4;
- }
+ const unsigned GPRegSize = Subtarget.isPPC64() ? 8 : 4;
+ LowerBound -= (31 - MinReg + 1) * GPRegSize;
}
// For 32-bit only, the CR save area is below the general register
@@ -2026,19 +2205,13 @@ void PPCFrameLowering::processFunctionBeforeFrameFinalized(MachineFunction &MF,
// to the stack pointer and hence does not need an adjustment here.
// Only CR2 (the first nonvolatile spilled) has an associated frame
// index so that we have a single uniform save area.
- if (spillsCR(MF) && !(Subtarget.isPPC64() && Subtarget.isSVR4ABI())) {
+ if (spillsCR(MF) && Subtarget.is32BitELFABI()) {
// Adjust the frame index of the CR spill slot.
- for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
- unsigned Reg = CSI[i].getReg();
-
- if ((Subtarget.isSVR4ABI() && Reg == PPC::CR2)
- // Leave Darwin logic as-is.
- || (!Subtarget.isSVR4ABI() &&
- (PPC::CRBITRCRegClass.contains(Reg) ||
- PPC::CRRCRegClass.contains(Reg)))) {
- int FI = CSI[i].getFrameIdx();
-
+ for (const auto &CSInfo : CSI) {
+ if (CSInfo.getReg() == PPC::CR2) {
+ int FI = CSInfo.getFrameIdx();
MFI.setObjectOffset(FI, LowerBound + MFI.getObjectOffset(FI));
+ break;
}
}
@@ -2108,17 +2281,17 @@ PPCFrameLowering::addScavengingSpillSlot(MachineFunction &MF,
const TargetRegisterClass &RC = Subtarget.isPPC64() ? G8RC : GPRC;
const TargetRegisterInfo &TRI = *Subtarget.getRegisterInfo();
unsigned Size = TRI.getSpillSize(RC);
- unsigned Align = TRI.getSpillAlignment(RC);
- RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
+ Align Alignment = TRI.getSpillAlign(RC);
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Alignment, false));
// Might we have over-aligned allocas?
- bool HasAlVars = MFI.hasVarSizedObjects() &&
- MFI.getMaxAlignment() > getStackAlignment();
+ bool HasAlVars =
+ MFI.hasVarSizedObjects() && MFI.getMaxAlign() > getStackAlign();
// These kinds of spills might need two registers.
if (spillsCR(MF) || spillsVRSAVE(MF) || HasAlVars)
- RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
-
+ RS->addScavengingFrameIndex(
+ MFI.CreateStackObject(Size, Alignment, false));
}
}
@@ -2179,17 +2352,9 @@ bool PPCFrameLowering::assignCalleeSavedSpillSlots(
return AllSpilledToReg;
}
-
-bool
-PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
-
- // Currently, this function only handles SVR4 32- and 64-bit ABIs.
- // Return false otherwise to maintain pre-existing behavior.
- if (!Subtarget.isSVR4ABI())
- return false;
+bool PPCFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
@@ -2201,10 +2366,8 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- // Only Darwin actually uses the VRSAVE register, but it can still appear
- // here if, for example, @llvm.eh.unwind.init() is used. If we're not on
- // Darwin, ignore it.
- if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+ // VRSAVE can appear here if, for example, @llvm.eh.unwind.init() is used.
+ if (Reg == PPC::VRSAVE)
continue;
// CR2 through CR4 are the nonvolatile CR fields.
@@ -2232,7 +2395,7 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Insert the spill to the stack frame.
if (IsCRField) {
PPCFunctionInfo *FuncInfo = MF->getInfo<PPCFunctionInfo>();
- if (Subtarget.isPPC64()) {
+ if (!Subtarget.is32BitELFABI()) {
// The actual spill will happen at the start of the prologue.
FuncInfo->addMustSaveCR(Reg);
} else {
@@ -2260,37 +2423,37 @@ PPCFrameLowering::spillCalleeSavedRegisters(MachineBasicBlock &MBB,
// Use !IsLiveIn for the kill flag.
// We do not want to kill registers that are live in this function
// before their use because they will become undefined registers.
- TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn,
- CSI[i].getFrameIdx(), RC, TRI);
+ // Functions without NoUnwind need to preserve the order of elements in
+ // saved vector registers.
+ if (Subtarget.needsSwapsForVSXMemOps() &&
+ !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
+ TII.storeRegToStackSlotNoUpd(MBB, MI, Reg, !IsLiveIn,
+ CSI[i].getFrameIdx(), RC, TRI);
+ else
+ TII.storeRegToStackSlot(MBB, MI, Reg, !IsLiveIn, CSI[i].getFrameIdx(),
+ RC, TRI);
}
}
}
return true;
}
-static void
-restoreCRs(bool isPPC64, bool is31,
- bool CR2Spilled, bool CR3Spilled, bool CR4Spilled,
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI, unsigned CSIIndex) {
+static void restoreCRs(bool is31, bool CR2Spilled, bool CR3Spilled,
+ bool CR4Spilled, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, unsigned CSIIndex) {
MachineFunction *MF = MBB.getParent();
const PPCInstrInfo &TII = *MF->getSubtarget<PPCSubtarget>().getInstrInfo();
DebugLoc DL;
- unsigned RestoreOp, MoveReg;
+ unsigned MoveReg = PPC::R12;
- if (isPPC64)
- // This is handled during epilogue generation.
- return;
- else {
- // 32-bit: FP-relative
- MBB.insert(MI, addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ),
- PPC::R12),
- CSI[CSIIndex].getFrameIdx()));
- RestoreOp = PPC::MTOCRF;
- MoveReg = PPC::R12;
- }
+ // 32-bit: FP-relative
+ MBB.insert(MI,
+ addFrameReference(BuildMI(*MF, DL, TII.get(PPC::LWZ), MoveReg),
+ CSI[CSIIndex].getFrameIdx()));
+ unsigned RestoreOp = PPC::MTOCRF;
if (CR2Spilled)
MBB.insert(MI, BuildMI(*MF, DL, TII.get(RestoreOp), PPC::CR2)
.addReg(MoveReg, getKillRegState(!CR3Spilled && !CR4Spilled)));
@@ -2343,17 +2506,13 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
return MBB.erase(I);
}
-bool
-PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
-
- // Currently, this function only handles SVR4 32- and 64-bit ABIs.
- // Return false otherwise to maintain pre-existing behavior.
- if (!Subtarget.isSVR4ABI())
- return false;
+static bool isCalleeSavedCR(unsigned Reg) {
+ return Reg == PPC::CR2 || Reg == PPC::CR3 || Reg == PPC::CR4;
+}
+bool PPCFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
const PPCInstrInfo &TII = *Subtarget.getInstrInfo();
PPCFunctionInfo *FI = MF->getInfo<PPCFunctionInfo>();
@@ -2374,15 +2533,18 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
for (unsigned i = 0, e = CSI.size(); i != e; ++i) {
unsigned Reg = CSI[i].getReg();
- // Only Darwin actually uses the VRSAVE register, but it can still appear
- // here if, for example, @llvm.eh.unwind.init() is used. If we're not on
- // Darwin, ignore it.
- if (Reg == PPC::VRSAVE && !Subtarget.isDarwinABI())
+ // VRSAVE can appear here if, for example, @llvm.eh.unwind.init() is used.
+ if (Reg == PPC::VRSAVE)
continue;
if ((Reg == PPC::X2 || Reg == PPC::R2) && MustSaveTOC)
continue;
+ // Restoring a callee-saved condition register field is handled during
+ // epilogue insertion.
+ if (isCalleeSavedCR(Reg) && !Subtarget.is32BitELFABI())
+ continue;
+
if (Reg == PPC::CR2) {
CR2Spilled = true;
// The spill slot is associated only with CR2, which is the
@@ -2396,14 +2558,12 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
CR4Spilled = true;
continue;
} else {
- // When we first encounter a non-CR register after seeing at
+ // On 32-bit ELF, when we first encounter a non-CR register after seeing at
// least one CR register, restore all spilled CRs together.
- if ((CR2Spilled || CR3Spilled || CR4Spilled)
- && !(PPC::CR2 <= Reg && Reg <= PPC::CR4)) {
+ if (CR2Spilled || CR3Spilled || CR4Spilled) {
bool is31 = needsFP(*MF);
- restoreCRs(Subtarget.isPPC64(), is31,
- CR2Spilled, CR3Spilled, CR4Spilled,
- MBB, I, CSI, CSIIndex);
+ restoreCRs(is31, CR2Spilled, CR3Spilled, CR4Spilled, MBB, I, CSI,
+ CSIIndex);
CR2Spilled = CR3Spilled = CR4Spilled = false;
}
@@ -2415,7 +2575,16 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
} else {
// Default behavior for non-CR saves.
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+
+ // Functions without NoUnwind need to preserve the order of elements in
+ // saved vector registers.
+ if (Subtarget.needsSwapsForVSXMemOps() &&
+ !MF->getFunction().hasFnAttribute(Attribute::NoUnwind))
+ TII.loadRegFromStackSlotNoUpd(MBB, I, Reg, CSI[i].getFrameIdx(), RC,
+ TRI);
+ else
+ TII.loadRegFromStackSlot(MBB, I, Reg, CSI[i].getFrameIdx(), RC, TRI);
+
assert(I != MBB.begin() &&
"loadRegFromStackSlot didn't insert any code!");
}
@@ -2432,9 +2601,10 @@ PPCFrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
// If we haven't yet spilled the CRs, do so now.
if (CR2Spilled || CR3Spilled || CR4Spilled) {
+ assert(Subtarget.is32BitELFABI() &&
+ "Only set CR[2|3|4]Spilled on 32-bit SVR4.");
bool is31 = needsFP(*MF);
- restoreCRs(Subtarget.isPPC64(), is31, CR2Spilled, CR3Spilled, CR4Spilled,
- MBB, I, CSI, CSIIndex);
+ restoreCRs(is31, CR2Spilled, CR3Spilled, CR4Spilled, MBB, I, CSI, CSIIndex);
}
return true;
@@ -2445,14 +2615,10 @@ unsigned PPCFrameLowering::getTOCSaveOffset() const {
}
unsigned PPCFrameLowering::getFramePointerSaveOffset() const {
- if (Subtarget.isAIXABI())
- report_fatal_error("FramePointer is not implemented on AIX yet.");
return FramePointerSaveOffset;
}
unsigned PPCFrameLowering::getBasePointerSaveOffset() const {
- if (Subtarget.isAIXABI())
- report_fatal_error("BasePointer is not implemented on AIX yet.");
return BasePointerSaveOffset;
}
diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h
index a5fbc9acbb28..8bf52c0ed01a 100644
--- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h
@@ -61,8 +61,8 @@ class PPCFrameLowering: public TargetFrameLowering {
bool findScratchRegister(MachineBasicBlock *MBB,
bool UseAtEnd,
bool TwoUniqueRegsRequired = false,
- unsigned *SR1 = nullptr,
- unsigned *SR2 = nullptr) const;
+ Register *SR1 = nullptr,
+ Register *SR2 = nullptr) const;
bool twoUniqueScratchRegsRequired(MachineBasicBlock *MBB) const;
/**
@@ -100,6 +100,8 @@ public:
/// the function.
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
bool hasFP(const MachineFunction &MF) const override;
bool needsFP(const MachineFunction &MF) const;
@@ -113,7 +115,7 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
/// This function will assign callee saved gprs to volatile vector registers
/// for prologue spills when applicable. It returns false if there are any
@@ -127,10 +129,11 @@ public:
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
@@ -153,10 +156,6 @@ public:
/// base pointer.
unsigned getBasePointerSaveOffset() const;
- /// getCRSaveOffset - Return the previous frame offset to save the
- /// CR register.
- unsigned getCRSaveOffset() const { return CRSaveOffset; }
-
/// getLinkageSize - Return the size of the PowerPC ABI linkage area.
///
unsigned getLinkageSize() const { return LinkageSize; }
diff --git a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
index 776ec52e2604..8ffd89ef5ccd 100644
--- a/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelDAGToDAG.cpp
@@ -139,6 +139,7 @@ namespace {
class PPCDAGToDAGISel : public SelectionDAGISel {
const PPCTargetMachine &TM;
const PPCSubtarget *PPCSubTarget = nullptr;
+ const PPCSubtarget *Subtarget = nullptr;
const PPCTargetLowering *PPCLowering = nullptr;
unsigned GlobalBaseReg = 0;
@@ -150,10 +151,11 @@ namespace {
// Make sure we re-emit a set of the global base reg if necessary
GlobalBaseReg = 0;
PPCSubTarget = &MF.getSubtarget<PPCSubtarget>();
- PPCLowering = PPCSubTarget->getTargetLowering();
+ Subtarget = &MF.getSubtarget<PPCSubtarget>();
+ PPCLowering = Subtarget->getTargetLowering();
SelectionDAGISel::runOnMachineFunction(MF);
- if (!PPCSubTarget->isSVR4ABI())
+ if (!Subtarget->isSVR4ABI())
InsertVRSaveCode(MF);
return true;
@@ -204,7 +206,6 @@ namespace {
bool tryBitfieldInsert(SDNode *N);
bool tryBitPermutation(SDNode *N);
bool tryIntCompareInGPR(SDNode *N);
- bool tryAndWithMask(SDNode *N);
// tryTLSXFormLoad - Convert an ISD::LOAD fed by a PPCISD::ADD_TLS into
// an X-Form load instruction with the offset being a relocation coming from
@@ -239,7 +240,7 @@ namespace {
/// bit signed displacement.
/// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddrIdx(SDValue N, SDValue &Base, SDValue &Index) {
- return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 0);
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, None);
}
/// SelectAddrIdx4 - Given the specified address, check to see if it can be
@@ -249,7 +250,8 @@ namespace {
/// displacement must be a multiple of 4.
/// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddrIdxX4(SDValue N, SDValue &Base, SDValue &Index) {
- return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 4);
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG,
+ Align(4));
}
/// SelectAddrIdx16 - Given the specified address, check to see if it can be
@@ -259,7 +261,8 @@ namespace {
/// displacement must be a multiple of 16.
/// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddrIdxX16(SDValue N, SDValue &Base, SDValue &Index) {
- return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG, 16);
+ return PPCLowering->SelectAddressRegReg(N, Base, Index, *CurDAG,
+ Align(16));
}
/// SelectAddrIdxOnly - Given the specified address, force it to be
@@ -267,28 +270,29 @@ namespace {
bool SelectAddrIdxOnly(SDValue N, SDValue &Base, SDValue &Index) {
return PPCLowering->SelectAddressRegRegOnly(N, Base, Index, *CurDAG);
}
-
+
/// SelectAddrImm - Returns true if the address N can be represented by
/// a base register plus a signed 16-bit displacement [r+imm].
    /// The last parameter \p 0 means D form has no requirement for the 16-bit signed
/// displacement.
bool SelectAddrImm(SDValue N, SDValue &Disp,
SDValue &Base) {
- return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 0);
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, None);
}
/// SelectAddrImmX4 - Returns true if the address N can be represented by
/// a base register plus a signed 16-bit displacement that is a multiple of
/// 4 (last parameter). Suitable for use by STD and friends.
bool SelectAddrImmX4(SDValue N, SDValue &Disp, SDValue &Base) {
- return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 4);
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, Align(4));
}
/// SelectAddrImmX16 - Returns true if the address N can be represented by
/// a base register plus a signed 16-bit displacement that is a multiple of
/// 16(last parameter). Suitable for use by STXV and friends.
bool SelectAddrImmX16(SDValue N, SDValue &Disp, SDValue &Base) {
- return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG, 16);
+ return PPCLowering->SelectAddressRegImm(N, Disp, Base, *CurDAG,
+ Align(16));
}
// Select an address into a single register.
@@ -297,6 +301,10 @@ namespace {
return true;
}
+ bool SelectAddrPCRel(SDValue N, SDValue &Base) {
+ return PPCLowering->SelectAddressPCRel(N, Base);
+ }
+
/// SelectInlineAsmMemoryOperand - Implement addressing mode selection for
/// inline asm expressions. It is always correct to compute the value into
/// a register. The case of adding a (possibly relocatable) constant to a
@@ -317,7 +325,7 @@ namespace {
case InlineAsm::Constraint_Zy:
// We need to make sure that this one operand does not end up in r0
// (because we might end up lowering this as 0(%op)).
- const TargetRegisterInfo *TRI = PPCSubTarget->getRegisterInfo();
+ const TargetRegisterInfo *TRI = Subtarget->getRegisterInfo();
const TargetRegisterClass *TRC = TRI->getPointerRegClass(*MF, /*Kind=*/1);
SDLoc dl(Op);
SDValue RC = CurDAG->getTargetConstant(TRC->getID(), dl, MVT::i32);
@@ -343,6 +351,13 @@ namespace {
private:
bool trySETCC(SDNode *N);
+ bool tryAsSingleRLDICL(SDNode *N);
+ bool tryAsSingleRLDICR(SDNode *N);
+ bool tryAsSingleRLWINM(SDNode *N);
+ bool tryAsSingleRLWINM8(SDNode *N);
+ bool tryAsSingleRLWIMI(SDNode *N);
+ bool tryAsPairOfRLDICL(SDNode *N);
+ bool tryAsSingleRLDIMI(SDNode *N);
void PeepholePPC64();
void PeepholePPC64ZExt();
@@ -394,7 +409,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
Register InVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
Register UpdatedVRSAVE = RegInfo->createVirtualRegister(&PPC::GPRCRegClass);
- const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
MachineBasicBlock &EntryBB = *Fn.begin();
DebugLoc dl;
// Emit the following code into the entry block:
@@ -429,7 +444,7 @@ void PPCDAGToDAGISel::InsertVRSaveCode(MachineFunction &Fn) {
///
SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
if (!GlobalBaseReg) {
- const TargetInstrInfo &TII = *PPCSubTarget->getInstrInfo();
+ const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
// Insert the set of GlobalBaseReg into the first MBB of the function
MachineBasicBlock &FirstMBB = MF->front();
MachineBasicBlock::iterator MBBI = FirstMBB.begin();
@@ -437,9 +452,9 @@ SDNode *PPCDAGToDAGISel::getGlobalBaseReg() {
DebugLoc dl;
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) == MVT::i32) {
- if (PPCSubTarget->isTargetELF()) {
+ if (Subtarget->isTargetELF()) {
GlobalBaseReg = PPC::R30;
- if (!PPCSubTarget->isSecurePlt() &&
+ if (!Subtarget->isSecurePlt() &&
M->getPICLevel() == PICLevel::SmallPIC) {
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MoveGOTtoLR));
BuildMI(FirstMBB, MBBI, dl, TII.get(PPC::MFLR), GlobalBaseReg);
@@ -3788,7 +3803,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
Opc = PPC::CMPD;
}
} else if (LHS.getValueType() == MVT::f32) {
- if (PPCSubTarget->hasSPE()) {
+ if (Subtarget->hasSPE()) {
switch (CC) {
default:
case ISD::SETEQ:
@@ -3815,7 +3830,7 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
} else
Opc = PPC::FCMPUS;
} else if (LHS.getValueType() == MVT::f64) {
- if (PPCSubTarget->hasSPE()) {
+ if (Subtarget->hasSPE()) {
switch (CC) {
default:
case ISD::SETEQ:
@@ -3840,10 +3855,10 @@ SDValue PPCDAGToDAGISel::SelectCC(SDValue LHS, SDValue RHS, ISD::CondCode CC,
break;
}
} else
- Opc = PPCSubTarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
+ Opc = Subtarget->hasVSX() ? PPC::XSCMPUDP : PPC::FCMPUD;
} else {
assert(LHS.getValueType() == MVT::f128 && "Unknown vt!");
- assert(PPCSubTarget->hasVSX() && "__float128 requires VSX");
+ assert(Subtarget->hasVSX() && "__float128 requires VSX");
Opc = PPC::XSCMPUQP;
}
return SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i32, LHS, RHS), 0);
@@ -3872,10 +3887,10 @@ static PPC::Predicate getPredicateForSetCC(ISD::CondCode CC, const EVT &VT,
return UseSPE ? PPC::PRED_GT : PPC::PRED_LT;
case ISD::SETULE:
case ISD::SETLE:
- return UseSPE ? PPC::PRED_LE : PPC::PRED_LE;
+ return PPC::PRED_LE;
case ISD::SETOGT:
case ISD::SETGT:
- return UseSPE ? PPC::PRED_GT : PPC::PRED_GT;
+ return PPC::PRED_GT;
case ISD::SETUGE:
case ISD::SETGE:
return UseSPE ? PPC::PRED_LE : PPC::PRED_GE;
@@ -4038,8 +4053,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
CurDAG->getTargetLoweringInfo().getPointerTy(CurDAG->getDataLayout());
bool isPPC64 = (PtrVT == MVT::i64);
- if (!PPCSubTarget->useCRBits() &&
- isInt32Immediate(N->getOperand(1), Imm)) {
+ if (!Subtarget->useCRBits() && isInt32Immediate(N->getOperand(1), Imm)) {
// We can codegen setcc op, imm very efficiently compared to a brcond.
// Check for those cases here.
// setcc op, 0
@@ -4128,20 +4142,20 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
// Altivec Vector compare instructions do not set any CR register by default and
// vector compare operations return the same type as the operands.
if (LHS.getValueType().isVector()) {
- if (PPCSubTarget->hasQPX() || PPCSubTarget->hasSPE())
+ if (Subtarget->hasQPX() || Subtarget->hasSPE())
return false;
EVT VecVT = LHS.getValueType();
bool Swap, Negate;
- unsigned int VCmpInst = getVCmpInst(VecVT.getSimpleVT(), CC,
- PPCSubTarget->hasVSX(), Swap, Negate);
+ unsigned int VCmpInst =
+ getVCmpInst(VecVT.getSimpleVT(), CC, Subtarget->hasVSX(), Swap, Negate);
if (Swap)
std::swap(LHS, RHS);
EVT ResVT = VecVT.changeVectorElementTypeToInteger();
if (Negate) {
SDValue VCmp(CurDAG->getMachineNode(VCmpInst, dl, ResVT, LHS, RHS), 0);
- CurDAG->SelectNodeTo(N, PPCSubTarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR,
+ CurDAG->SelectNodeTo(N, Subtarget->hasVSX() ? PPC::XXLNOR : PPC::VNOR,
ResVT, VCmp, VCmp);
return true;
}
@@ -4150,7 +4164,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
return true;
}
- if (PPCSubTarget->useCRBits())
+ if (Subtarget->useCRBits())
return false;
bool Inv;
@@ -4160,7 +4174,7 @@ bool PPCDAGToDAGISel::trySETCC(SDNode *N) {
// SPE e*cmp* instructions only set the 'gt' bit, so hard-code that
// The correct compare instruction is already set by SelectCC()
- if (PPCSubTarget->hasSPE() && LHS.getValueType().isFloatingPoint()) {
+ if (Subtarget->hasSPE() && LHS.getValueType().isFloatingPoint()) {
Idx = 1;
}
@@ -4209,7 +4223,7 @@ bool PPCDAGToDAGISel::isOffsetMultipleOf(SDNode *N, unsigned Val) const {
// because it is translated to r31 or r1 + slot + offset. We won't know the
// slot number until the stack frame is finalized.
const MachineFrameInfo &MFI = CurDAG->getMachineFunction().getFrameInfo();
- unsigned SlotAlign = MFI.getObjectAlignment(FI->getIndex());
+ unsigned SlotAlign = MFI.getObjectAlign(FI->getIndex()).value();
if ((SlotAlign % Val) != 0)
return false;
@@ -4241,13 +4255,10 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG,
SDValue TrueRes = N->getOperand(2);
SDValue FalseRes = N->getOperand(3);
ConstantSDNode *TrueConst = dyn_cast<ConstantSDNode>(TrueRes);
- if (!TrueConst)
+ if (!TrueConst || (N->getSimpleValueType(0) != MVT::i64 &&
+ N->getSimpleValueType(0) != MVT::i32))
return false;
- assert((N->getSimpleValueType(0) == MVT::i64 ||
- N->getSimpleValueType(0) == MVT::i32) &&
- "Expecting either i64 or i32 here.");
-
// We are looking for any of:
// (select_cc lhs, rhs, 1, (sext (setcc [lr]hs, [lr]hs, cc2)), cc1)
// (select_cc lhs, rhs, -1, (zext (setcc [lr]hs, [lr]hs, cc2)), cc1)
@@ -4371,142 +4382,251 @@ static bool mayUseP9Setb(SDNode *N, const ISD::CondCode &CC, SelectionDAG *DAG,
return true;
}
-bool PPCDAGToDAGISel::tryAndWithMask(SDNode *N) {
- if (N->getOpcode() != ISD::AND)
+bool PPCDAGToDAGISel::tryAsSingleRLWINM(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+ unsigned Imm;
+ if (!isInt32Immediate(N->getOperand(1), Imm))
return false;
SDLoc dl(N);
SDValue Val = N->getOperand(0);
- unsigned Imm, Imm2, SH, MB, ME;
- uint64_t Imm64;
-
+ unsigned SH, MB, ME;
// If this is an and of a value rotated between 0 and 31 bits and then and'd
// with a mask, emit rlwinm
- if (isInt32Immediate(N->getOperand(1), Imm) &&
- isRotateAndMask(N->getOperand(0).getNode(), Imm, false, SH, MB, ME)) {
- SDValue Val = N->getOperand(0).getOperand(0);
- SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
- getI32Imm(ME, dl) };
+ if (isRotateAndMask(Val.getNode(), Imm, false, SH, MB, ME)) {
+ Val = Val.getOperand(0);
+ SDValue Ops[] = {Val, getI32Imm(SH, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl)};
CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
return true;
}
// If this is just a masked value where the input is not handled, and
// is not a rotate-left (handled by a pattern in the .td file), emit rlwinm
- if (isInt32Immediate(N->getOperand(1), Imm)) {
- if (isRunOfOnes(Imm, MB, ME) &&
- N->getOperand(0).getOpcode() != ISD::ROTL) {
- SDValue Ops[] = { Val, getI32Imm(0, dl), getI32Imm(MB, dl),
- getI32Imm(ME, dl) };
- CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
- return true;
- }
- // AND X, 0 -> 0, not "rlwinm 32".
- if (Imm == 0) {
- ReplaceUses(SDValue(N, 0), N->getOperand(1));
- return true;
- }
+ if (isRunOfOnes(Imm, MB, ME) && Val.getOpcode() != ISD::ROTL) {
+ SDValue Ops[] = {Val, getI32Imm(0, dl), getI32Imm(MB, dl),
+ getI32Imm(ME, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLWINM, MVT::i32, Ops);
+ return true;
+ }
- // ISD::OR doesn't get all the bitfield insertion fun.
- // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
- // bitfield insert.
- if (N->getOperand(0).getOpcode() == ISD::OR &&
- isInt32Immediate(N->getOperand(0).getOperand(1), Imm2)) {
- // The idea here is to check whether this is equivalent to:
- // (c1 & m) | (x & ~m)
- // where m is a run-of-ones mask. The logic here is that, for each bit in
- // c1 and c2:
- // - if both are 1, then the output will be 1.
- // - if both are 0, then the output will be 0.
- // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
- // come from x.
- // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
- // be 0.
- // If that last condition is never the case, then we can form m from the
- // bits that are the same between c1 and c2.
- unsigned MB, ME;
- if (isRunOfOnes(~(Imm^Imm2), MB, ME) && !(~Imm & Imm2)) {
- SDValue Ops[] = { N->getOperand(0).getOperand(0),
- N->getOperand(0).getOperand(1),
- getI32Imm(0, dl), getI32Imm(MB, dl),
- getI32Imm(ME, dl) };
- ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
- return true;
- }
- }
- } else if (isInt64Immediate(N->getOperand(1).getNode(), Imm64)) {
- // If this is a 64-bit zero-extension mask, emit rldicl.
- if (isMask_64(Imm64)) {
- MB = 64 - countTrailingOnes(Imm64);
- SH = 0;
-
- if (Val.getOpcode() == ISD::ANY_EXTEND) {
- auto Op0 = Val.getOperand(0);
- if ( Op0.getOpcode() == ISD::SRL &&
- isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) {
-
- auto ResultType = Val.getNode()->getValueType(0);
- auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl,
- ResultType);
- SDValue IDVal (ImDef, 0);
-
- Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl,
- ResultType, IDVal, Op0.getOperand(0),
- getI32Imm(1, dl)), 0);
- SH = 64 - Imm;
- }
- }
+ // AND X, 0 -> 0, not "rlwinm 32".
+ if (Imm == 0) {
+ ReplaceUses(SDValue(N, 0), N->getOperand(1));
+ return true;
+ }
- // If the operand is a logical right shift, we can fold it into this
- // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
- // for n <= mb. The right shift is really a left rotate followed by a
- // mask, and this mask is a more-restrictive sub-mask of the mask implied
- // by the shift.
- if (Val.getOpcode() == ISD::SRL &&
- isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
- assert(Imm < 64 && "Illegal shift amount");
- Val = Val.getOperand(0);
- SH = 64 - Imm;
- }
+ return false;
+}
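For readers unfamiliar with the rotate-and-mask encoding selected above, here is a minimal host-side sketch of what a single rlwinm computes (the helper names are illustrative only, not LLVM or patch code), using IBM bit numbering where bit 0 is the most significant bit of the 32-bit word.

#include <cstdint>

// Rotate a 32-bit value left by SH bits.
static uint32_t rotl32(uint32_t V, unsigned SH) {
  SH &= 31;
  return SH ? (V << SH) | (V >> (32 - SH)) : V;
}

// MASK(MB, ME): ones from IBM bit MB through bit ME inclusive (bit 0 = MSB);
// if MB > ME the mask wraps around the word.
static uint32_t maskFromMBME(unsigned MB, unsigned ME) {
  uint32_t FromMB = 0xFFFFFFFFu >> MB;        // ones at IBM bits MB..31
  uint32_t UpToME = 0xFFFFFFFFu << (31 - ME); // ones at IBM bits 0..ME
  return MB <= ME ? (FromMB & UpToME) : (FromMB | UpToME);
}

// rlwinm rA, rS, SH, MB, ME computes rotl(rS, SH) & MASK(MB, ME).
static uint32_t rlwinm(uint32_t RS, unsigned SH, unsigned MB, unsigned ME) {
  return rotl32(RS, SH) & maskFromMBME(MB, ME);
}

With SH = 0 this reduces to an AND with a run-of-ones mask, which is why an immediate accepted by isRunOfOnes can be selected as a single RLWINM, and a 64-bit mask whose run satisfies MB >= 32 and MB <= ME can use RLWINM8 in tryAsSingleRLWINM8 below.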
- SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
- CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
- return true;
- } else if (isMask_64(~Imm64)) {
- // If this is a negated 64-bit zero-extension mask,
- // i.e. the immediate is a sequence of ones from most significant side
- // and all zero for reminder, we should use rldicr.
- MB = 63 - countTrailingOnes(~Imm64);
- SH = 0;
- SDValue Ops[] = { Val, getI32Imm(SH, dl), getI32Imm(MB, dl) };
- CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
- return true;
- }
+bool PPCDAGToDAGISel::tryAsSingleRLWINM8(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+ uint64_t Imm64;
+ if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64))
+ return false;
- // It is not 16-bit imm that means we need two instructions at least if
- // using "and" instruction. Try to exploit it with rotate mask instructions.
- if (isRunOfOnes64(Imm64, MB, ME)) {
- if (MB >= 32 && MB <= ME) {
- // MB ME
- // +----------------------+
- // |xxxxxxxxxxx00011111000|
- // +----------------------+
- // 0 32 64
- // We can only do it if the MB is larger than 32 and MB <= ME
- // as RLWINM will replace the content of [0 - 32) with [32 - 64) even
- // we didn't rotate it.
- SDValue Ops[] = { Val, getI64Imm(0, dl), getI64Imm(MB - 32, dl),
- getI64Imm(ME - 32, dl) };
- CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops);
- return true;
- }
- // TODO - handle it with rldicl + rldicl
- }
+ unsigned MB, ME;
+ if (isRunOfOnes64(Imm64, MB, ME) && MB >= 32 && MB <= ME) {
+ // MB ME
+ // +----------------------+
+ // |xxxxxxxxxxx00011111000|
+ // +----------------------+
+ // 0 32 64
+    // We can only do this if MB is at least 32 and MB <= ME, as RLWINM will
+    // replace the contents of [0, 32) with [32, 64) even though we did not
+    // rotate it.
+ SDLoc dl(N);
+ SDValue Ops[] = {N->getOperand(0), getI64Imm(0, dl), getI64Imm(MB - 32, dl),
+ getI64Imm(ME - 32, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLWINM8, MVT::i64, Ops);
+ return true;
+ }
+
+ return false;
+}
+
+bool PPCDAGToDAGISel::tryAsPairOfRLDICL(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+ uint64_t Imm64;
+ if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64))
+ return false;
+
+  // Do nothing if it is a 16-bit imm, as the pattern in the .td file handles
+  // it well with "andi.".
+ if (isUInt<16>(Imm64))
+ return false;
+
+ SDLoc Loc(N);
+ SDValue Val = N->getOperand(0);
+
+ // Optimized with two rldicl's as follows:
+ // Add missing bits on left to the mask and check that the mask is a
+ // wrapped run of ones, i.e.
+ // Change pattern |0001111100000011111111|
+ // to |1111111100000011111111|.
+ unsigned NumOfLeadingZeros = countLeadingZeros(Imm64);
+ if (NumOfLeadingZeros != 0)
+ Imm64 |= maskLeadingOnes<uint64_t>(NumOfLeadingZeros);
+
+ unsigned MB, ME;
+ if (!isRunOfOnes64(Imm64, MB, ME))
+ return false;
+
+ // ME MB MB-ME+63
+ // +----------------------+ +----------------------+
+ // |1111111100000011111111| -> |0000001111111111111111|
+ // +----------------------+ +----------------------+
+ // 0 63 0 63
+ // There are ME + 1 ones on the left and (MB - ME + 63) & 63 zeros in between.
+ unsigned OnesOnLeft = ME + 1;
+ unsigned ZerosInBetween = (MB - ME + 63) & 63;
+ // Rotate left by OnesOnLeft (so leading ones are now trailing ones) and clear
+ // on the left the bits that are already zeros in the mask.
+ Val = SDValue(CurDAG->getMachineNode(PPC::RLDICL, Loc, MVT::i64, Val,
+ getI64Imm(OnesOnLeft, Loc),
+ getI64Imm(ZerosInBetween, Loc)),
+ 0);
+ // MB-ME+63 ME MB
+ // +----------------------+ +----------------------+
+ // |0000001111111111111111| -> |0001111100000011111111|
+ // +----------------------+ +----------------------+
+ // 0 63 0 63
+ // Rotate back by 64 - OnesOnLeft to undo previous rotate. Then clear on the
+ // left the number of ones we previously added.
+ SDValue Ops[] = {Val, getI64Imm(64 - OnesOnLeft, Loc),
+ getI64Imm(NumOfLeadingZeros, Loc)};
+ CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+ return true;
+}
+
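The OnesOnLeft/ZerosInBetween arithmetic above can be checked with a small standalone sketch (the helpers and the sample mask below are assumptions chosen for illustration, not LLVM APIs): for a mask that becomes a wrapped run of ones after its leading zeros are filled in, two rldicl operations reproduce the original AND.

#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t V, unsigned SH) {
  SH &= 63;
  return SH ? (V << SH) | (V >> (64 - SH)) : V;
}

// rldicl rA, rS, SH, MB: rotate left by SH, then clear the MB (< 64) most
// significant bits, i.e. keep IBM bits MB..63.
static uint64_t rldicl(uint64_t RS, unsigned SH, unsigned MB) {
  return rotl64(RS, SH) & (~0ULL >> MB);
}

int main() {
  // Sample AND mask: not a plain run of ones, but it becomes the wrapped run
  // 0xFFFF0000000000FF once its 8 leading zero bits are filled with ones.
  const uint64_t Imm64 = 0x00FF0000000000FFULL;
  const unsigned NumOfLeadingZeros = 8; // countLeadingZeros(Imm64)
  const unsigned MB = 56, ME = 15;      // run of ones in the adjusted mask
  const unsigned OnesOnLeft = ME + 1;
  const unsigned ZerosInBetween = (MB - ME + 63) & 63;

  for (uint64_t X : {0xFFFFFFFFFFFFFFFFULL, 0x123456789ABCDEF0ULL, 0x0ULL}) {
    uint64_t Rotated = rldicl(X, OnesOnLeft, ZerosInBetween);
    uint64_t Result = rldicl(Rotated, 64 - OnesOnLeft, NumOfLeadingZeros);
    assert(Result == (X & Imm64));
  }
  return 0;
}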
+bool PPCDAGToDAGISel::tryAsSingleRLWIMI(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+ unsigned Imm;
+ if (!isInt32Immediate(N->getOperand(1), Imm))
+ return false;
+
+ SDValue Val = N->getOperand(0);
+ unsigned Imm2;
+ // ISD::OR doesn't get all the bitfield insertion fun.
+ // (and (or x, c1), c2) where isRunOfOnes(~(c1^c2)) might be a
+ // bitfield insert.
+ if (Val.getOpcode() != ISD::OR || !isInt32Immediate(Val.getOperand(1), Imm2))
+ return false;
+
+ // The idea here is to check whether this is equivalent to:
+ // (c1 & m) | (x & ~m)
+ // where m is a run-of-ones mask. The logic here is that, for each bit in
+ // c1 and c2:
+ // - if both are 1, then the output will be 1.
+ // - if both are 0, then the output will be 0.
+ // - if the bit in c1 is 0, and the bit in c2 is 1, then the output will
+ // come from x.
+ // - if the bit in c1 is 1, and the bit in c2 is 0, then the output will
+ // be 0.
+ // If that last condition is never the case, then we can form m from the
+ // bits that are the same between c1 and c2.
+ unsigned MB, ME;
+ if (isRunOfOnes(~(Imm ^ Imm2), MB, ME) && !(~Imm & Imm2)) {
+ SDLoc dl(N);
+ SDValue Ops[] = {Val.getOperand(0), Val.getOperand(1), getI32Imm(0, dl),
+ getI32Imm(MB, dl), getI32Imm(ME, dl)};
+ ReplaceNode(N, CurDAG->getMachineNode(PPC::RLWIMI, dl, MVT::i32, Ops));
+ return true;
}
return false;
}
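The bit-by-bit reasoning in the comment above amounts to a simple identity; the standalone sketch below (illustrative constants, not part of the patch) checks that whenever no bit has c1 = 1 and c2 = 0, the and-of-or equals a masked insert with m = ~(c1 ^ c2), which is what rlwimi computes when m is a run of ones.

#include <cassert>
#include <cstdint>

// If no bit has c1 = 1 and c2 = 0, and m = ~(c1 ^ c2), then for every x
//   (x | c1) & c2  ==  (c1 & m) | (x & ~m).
static void checkBitfieldInsertIdentity(uint32_t C1, uint32_t C2, uint32_t X) {
  assert((C1 & ~C2) == 0 && "no bit may have c1 = 1 and c2 = 0");
  uint32_t M = ~(C1 ^ C2);
  assert(((X | C1) & C2) == ((C1 & M) | (X & ~M)));
}

int main() {
  // C1 is the OR immediate, C2 the AND immediate; C1 is a subset of C2, and
  // ~(C1 ^ C2) = 0x0000FF00 is a run of ones, so this pair could be selected
  // as rlwimi with SH = 0, MB = 16, ME = 23 (IBM bit numbering).
  const uint32_t C1 = 0x00005500, C2 = 0xFFFF55FF;
  for (uint32_t X : {0x00000000u, 0xFFFFFFFFu, 0x12345678u, 0xDEADBEEFu})
    checkBitfieldInsertIdentity(C1, C2, X);
  return 0;
}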
+bool PPCDAGToDAGISel::tryAsSingleRLDICL(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+ uint64_t Imm64;
+ if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) || !isMask_64(Imm64))
+ return false;
+
+ // If this is a 64-bit zero-extension mask, emit rldicl.
+ unsigned MB = 64 - countTrailingOnes(Imm64);
+ unsigned SH = 0;
+ unsigned Imm;
+ SDValue Val = N->getOperand(0);
+ SDLoc dl(N);
+
+ if (Val.getOpcode() == ISD::ANY_EXTEND) {
+ auto Op0 = Val.getOperand(0);
+ if (Op0.getOpcode() == ISD::SRL &&
+ isInt32Immediate(Op0.getOperand(1).getNode(), Imm) && Imm <= MB) {
+
+ auto ResultType = Val.getNode()->getValueType(0);
+ auto ImDef = CurDAG->getMachineNode(PPC::IMPLICIT_DEF, dl, ResultType);
+ SDValue IDVal(ImDef, 0);
+
+ Val = SDValue(CurDAG->getMachineNode(PPC::INSERT_SUBREG, dl, ResultType,
+ IDVal, Op0.getOperand(0),
+ getI32Imm(1, dl)),
+ 0);
+ SH = 64 - Imm;
+ }
+ }
+
+ // If the operand is a logical right shift, we can fold it into this
+ // instruction: rldicl(rldicl(x, 64-n, n), 0, mb) -> rldicl(x, 64-n, mb)
+ // for n <= mb. The right shift is really a left rotate followed by a
+ // mask, and this mask is a more-restrictive sub-mask of the mask implied
+ // by the shift.
+ if (Val.getOpcode() == ISD::SRL &&
+ isInt32Immediate(Val.getOperand(1).getNode(), Imm) && Imm <= MB) {
+ assert(Imm < 64 && "Illegal shift amount");
+ Val = Val.getOperand(0);
+ SH = 64 - Imm;
+ }
+
+ SDValue Ops[] = {Val, getI32Imm(SH, dl), getI32Imm(MB, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLDICL, MVT::i64, Ops);
+ return true;
+}
+
+bool PPCDAGToDAGISel::tryAsSingleRLDICR(SDNode *N) {
+ assert(N->getOpcode() == ISD::AND && "ISD::AND SDNode expected");
+ uint64_t Imm64;
+ if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) ||
+ !isMask_64(~Imm64))
+ return false;
+
+ // If this is a negated 64-bit zero-extension mask,
+  // i.e. the immediate is a sequence of ones from the most significant side
+  // and all zeros for the remainder; we should use rldicr.
+ unsigned MB = 63 - countTrailingOnes(~Imm64);
+ unsigned SH = 0;
+ SDLoc dl(N);
+ SDValue Ops[] = {N->getOperand(0), getI32Imm(SH, dl), getI32Imm(MB, dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLDICR, MVT::i64, Ops);
+ return true;
+}
+
+bool PPCDAGToDAGISel::tryAsSingleRLDIMI(SDNode *N) {
+ assert(N->getOpcode() == ISD::OR && "ISD::OR SDNode expected");
+ uint64_t Imm64;
+ unsigned MB, ME;
+ SDValue N0 = N->getOperand(0);
+
+  // We won't get fewer instructions if the imm is a 32-bit integer.
+  // rldimi requires the imm to be a run of consecutive ones with zeros on
+  // both sides.
+ // Also, make sure the first Op has only one use, otherwise this may increase
+ // register pressure since rldimi is destructive.
+ if (!isInt64Immediate(N->getOperand(1).getNode(), Imm64) ||
+ isUInt<32>(Imm64) || !isRunOfOnes64(Imm64, MB, ME) || !N0.hasOneUse())
+ return false;
+
+ unsigned SH = 63 - ME;
+ SDLoc Dl(N);
+  // Use selectI64Imm to make an LI instruction instead of directly putting Imm64.
+ SDValue Ops[] = {
+ N->getOperand(0),
+ SDValue(selectI64Imm(CurDAG, getI64Imm(-1, Dl).getNode()), 0),
+ getI32Imm(SH, Dl), getI32Imm(MB, Dl)};
+ CurDAG->SelectNodeTo(N, PPC::RLDIMI, MVT::i64, Ops);
+ return true;
+}
+
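A brief standalone sketch (illustrative helpers, not LLVM code) of why inserting from an all-ones register turns the OR into a single rldimi: with SH = 63 - ME the insert mask is exactly MASK(MB, ME), i.e. the run-of-ones immediate itself, and every bit outside the mask keeps its old value.

#include <cassert>
#include <cstdint>

static uint64_t rotl64(uint64_t V, unsigned SH) {
  SH &= 63;
  return SH ? (V << SH) | (V >> (64 - SH)) : V;
}

// MASK(MB, ME) in IBM bit numbering (bit 0 = MSB); assumes MB <= ME.
static uint64_t maskFromMBME(unsigned MB, unsigned ME) {
  return (~0ULL >> MB) & (~0ULL << (63 - ME));
}

// rldimi rA, rS, SH, MB: rotate rS left by SH and insert it into rA under
// MASK(MB, 63 - SH); bits outside the mask keep rA's old value.
static uint64_t rldimi(uint64_t RA, uint64_t RS, unsigned SH, unsigned MB) {
  uint64_t Mask = maskFromMBME(MB, 63 - SH);
  return (rotl64(RS, SH) & Mask) | (RA & ~Mask);
}

int main() {
  // Imm64 = 0x0000FFFF00000000 is a run of ones with MB = 16, ME = 31.
  const uint64_t Imm64 = 0x0000FFFF00000000ULL;
  const unsigned MB = 16, ME = 31;
  const unsigned SH = 63 - ME; // as computed in tryAsSingleRLDIMI above
  for (uint64_t X : {0x0ULL, 0x123456789ABCDEF0ULL, ~0ULL})
    assert(rldimi(X, ~0ULL, SH, MB) == (X | Imm64));
  return 0;
}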
// Select - Convert the specified operand from a target-independent to a
// target-specific node if it hasn't already been changed.
void PPCDAGToDAGISel::Select(SDNode *N) {
@@ -4551,7 +4671,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
case PPCISD::ADDI_TLSGD_L_ADDR: {
const Module *Mod = MF->getFunction().getParent();
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
- !PPCSubTarget->isSecurePlt() || !PPCSubTarget->isTargetELF() ||
+ !Subtarget->isSecurePlt() || !Subtarget->isTargetELF() ||
Mod->getPICLevel() == PICLevel::SmallPIC)
break;
// Attach global base pointer on GETtlsADDR32 node in order to
@@ -4560,8 +4680,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
} break;
case PPCISD::CALL: {
if (PPCLowering->getPointerTy(CurDAG->getDataLayout()) != MVT::i32 ||
- !TM.isPositionIndependent() || !PPCSubTarget->isSecurePlt() ||
- !PPCSubTarget->isTargetELF())
+ !TM.isPositionIndependent() || !Subtarget->isSecurePlt() ||
+ !Subtarget->isTargetELF())
break;
SDValue Op = N->getOperand(1);
@@ -4625,7 +4745,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
case ISD::STORE: {
// Change TLS initial-exec D-form stores to X-form stores.
StoreSDNode *ST = cast<StoreSDNode>(N);
- if (EnableTLSOpt && PPCSubTarget->isELFv2ABI() &&
+ if (EnableTLSOpt && Subtarget->isELFv2ABI() &&
ST->getAddressingMode() != ISD::PRE_INC)
if (tryTLSXFormStore(ST))
return;
@@ -4639,7 +4759,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// Normal loads are handled by code generated from the .td file.
if (LD->getAddressingMode() != ISD::PRE_INC) {
// Change TLS initial-exec D-form loads to X-form loads.
- if (EnableTLSOpt && PPCSubTarget->isELFv2ABI())
+ if (EnableTLSOpt && Subtarget->isELFv2ABI())
if (tryTLSXFormLoad(LD))
return;
break;
@@ -4730,7 +4850,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
case ISD::AND:
// If this is an 'and' with a mask, try to emit rlwinm/rldicl/rldicr
- if (tryAndWithMask(N))
+ if (tryAsSingleRLWINM(N) || tryAsSingleRLWIMI(N) || tryAsSingleRLDICL(N) ||
+ tryAsSingleRLDICR(N) || tryAsSingleRLWINM8(N) || tryAsPairOfRLDICL(N))
return;
// Other cases are autogenerated.
@@ -4753,10 +4874,15 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
}
+ // If this is 'or' against an imm with consecutive ones and both sides zero,
+ // try to emit rldimi
+ if (tryAsSingleRLDIMI(N))
+ return;
+
// OR with a 32-bit immediate can be handled by ori + oris
// without creating an immediate in a GPR.
uint64_t Imm64 = 0;
- bool IsPPC64 = PPCSubTarget->isPPC64();
+ bool IsPPC64 = Subtarget->isPPC64();
if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) &&
(Imm64 & ~0xFFFFFFFFuLL) == 0) {
      // If ImmHi (ImmLo) is zero, only one ori (oris) is generated later.
@@ -4779,7 +4905,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// XOR with a 32-bit immediate can be handled by xori + xoris
// without creating an immediate in a GPR.
uint64_t Imm64 = 0;
- bool IsPPC64 = PPCSubTarget->isPPC64();
+ bool IsPPC64 = Subtarget->isPPC64();
if (IsPPC64 && isInt64Immediate(N->getOperand(1), Imm64) &&
(Imm64 & ~0xFFFFFFFFuLL) == 0) {
      // If ImmHi (ImmLo) is zero, only one xori (xoris) is generated later.
@@ -4866,11 +4992,10 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
bool isPPC64 = (PtrVT == MVT::i64);
// If this is a select of i1 operands, we'll pattern match it.
- if (PPCSubTarget->useCRBits() &&
- N->getOperand(0).getValueType() == MVT::i1)
+ if (Subtarget->useCRBits() && N->getOperand(0).getValueType() == MVT::i1)
break;
- if (PPCSubTarget->isISA3_0() && PPCSubTarget->isPPC64()) {
+ if (Subtarget->isISA3_0() && Subtarget->isPPC64()) {
bool NeedSwapOps = false;
bool IsUnCmp = false;
if (mayUseP9Setb(N, CC, CurDAG, NeedSwapOps, IsUnCmp)) {
@@ -4945,7 +5070,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
unsigned BROpc =
- getPredicateForSetCC(CC, N->getOperand(0).getValueType(), PPCSubTarget);
+ getPredicateForSetCC(CC, N->getOperand(0).getValueType(), Subtarget);
unsigned SelectCCOp;
if (N->getValueType(0) == MVT::i32)
@@ -4953,28 +5078,28 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
else if (N->getValueType(0) == MVT::i64)
SelectCCOp = PPC::SELECT_CC_I8;
else if (N->getValueType(0) == MVT::f32) {
- if (PPCSubTarget->hasP8Vector())
+ if (Subtarget->hasP8Vector())
SelectCCOp = PPC::SELECT_CC_VSSRC;
- else if (PPCSubTarget->hasSPE())
+ else if (Subtarget->hasSPE())
SelectCCOp = PPC::SELECT_CC_SPE4;
else
SelectCCOp = PPC::SELECT_CC_F4;
} else if (N->getValueType(0) == MVT::f64) {
- if (PPCSubTarget->hasVSX())
+ if (Subtarget->hasVSX())
SelectCCOp = PPC::SELECT_CC_VSFRC;
- else if (PPCSubTarget->hasSPE())
+ else if (Subtarget->hasSPE())
SelectCCOp = PPC::SELECT_CC_SPE;
else
SelectCCOp = PPC::SELECT_CC_F8;
} else if (N->getValueType(0) == MVT::f128)
SelectCCOp = PPC::SELECT_CC_F16;
- else if (PPCSubTarget->hasSPE())
+ else if (Subtarget->hasSPE())
SelectCCOp = PPC::SELECT_CC_SPE;
- else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
+ else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f64)
SelectCCOp = PPC::SELECT_CC_QFRC;
- else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
+ else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4f32)
SelectCCOp = PPC::SELECT_CC_QSRC;
- else if (PPCSubTarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
+ else if (Subtarget->hasQPX() && N->getValueType(0) == MVT::v4i1)
SelectCCOp = PPC::SELECT_CC_QBRC;
else if (N->getValueType(0) == MVT::v2f64 ||
N->getValueType(0) == MVT::v2i64)
@@ -4988,8 +5113,8 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
case ISD::VECTOR_SHUFFLE:
- if (PPCSubTarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
- N->getValueType(0) == MVT::v2i64)) {
+ if (Subtarget->hasVSX() && (N->getValueType(0) == MVT::v2f64 ||
+ N->getValueType(0) == MVT::v2i64)) {
ShuffleVectorSDNode *SVN = cast<ShuffleVectorSDNode>(N);
SDValue Op1 = N->getOperand(SVN->getMaskElt(0) < 2 ? 0 : 1),
@@ -5024,7 +5149,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// For little endian, we must swap the input operands and adjust
// the mask elements (reverse and invert them).
- if (PPCSubTarget->isLittleEndian()) {
+ if (Subtarget->isLittleEndian()) {
std::swap(Op1, Op2);
unsigned tmp = DM[0];
DM[0] = 1 - DM[1];
@@ -5041,7 +5166,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
break;
case PPCISD::BDNZ:
case PPCISD::BDZ: {
- bool IsPPC64 = PPCSubTarget->isPPC64();
+ bool IsPPC64 = Subtarget->isPPC64();
SDValue Ops[] = { N->getOperand(1), N->getOperand(0) };
CurDAG->SelectNodeTo(N, N->getOpcode() == PPCISD::BDNZ
? (IsPPC64 ? PPC::BDNZ8 : PPC::BDNZ)
@@ -5069,7 +5194,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
case ISD::BR_CC: {
ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(1))->get();
unsigned PCC =
- getPredicateForSetCC(CC, N->getOperand(2).getValueType(), PPCSubTarget);
+ getPredicateForSetCC(CC, N->getOperand(2).getValueType(), Subtarget);
if (N->getOperand(2).getValueType() == MVT::i1) {
unsigned Opc;
@@ -5122,11 +5247,9 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
return;
}
case PPCISD::TOC_ENTRY: {
- const bool isPPC64 = PPCSubTarget->isPPC64();
- const bool isELFABI = PPCSubTarget->isSVR4ABI();
- const bool isAIXABI = PPCSubTarget->isAIXABI();
-
- assert(!PPCSubTarget->isDarwin() && "TOC is an ELF/XCOFF construct");
+ const bool isPPC64 = Subtarget->isPPC64();
+ const bool isELFABI = Subtarget->isSVR4ABI();
+ const bool isAIXABI = Subtarget->isAIXABI();
    // PowerPC only supports the small, medium, and large code models.
const CodeModel::Model CModel = TM.getCodeModel();
@@ -5177,7 +5300,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
// or 64-bit medium (ELF-only) or large (ELF and AIX) code model code. We
// generate two instructions as described below. The first source operand
// is a symbol reference. If it must be toc-referenced according to
- // PPCSubTarget, we generate:
+ // Subtarget, we generate:
// [32-bit AIX]
// LWZtocL(@sym, ADDIStocHA(%r2, @sym))
// [64-bit ELF/AIX]
@@ -5209,7 +5332,7 @@ void PPCDAGToDAGISel::Select(SDNode *N) {
}
case PPCISD::PPC32_PICGOT:
// Generate a PIC-safe GOT reference.
- assert(PPCSubTarget->is32BitELFABI() &&
+ assert(Subtarget->is32BitELFABI() &&
"PPCISD::PPC32_PICGOT is only supported for 32-bit SVR4");
CurDAG->SelectNodeTo(N, PPC::PPC32PICGOT,
PPCLowering->getPointerTy(CurDAG->getDataLayout()),
@@ -5306,7 +5429,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
"Only OR nodes are supported for CMPB");
SDValue Res;
- if (!PPCSubTarget->hasCMPB())
+ if (!Subtarget->hasCMPB())
return Res;
if (N->getValueType(0) != MVT::i32 &&
@@ -5517,7 +5640,7 @@ SDValue PPCDAGToDAGISel::combineToCMPB(SDNode *N) {
// only one instruction (like a zero or one), then we should fold in those
// operations with the select.
void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
- if (!PPCSubTarget->useCRBits())
+ if (!Subtarget->useCRBits())
return;
if (N->getOpcode() != ISD::ZERO_EXTEND &&
@@ -5549,8 +5672,7 @@ void PPCDAGToDAGISel::foldBoolExts(SDValue &Res, SDNode *&N) {
SDValue O1 = UserO1.getNode() == N ? Val : UserO1;
return CurDAG->FoldConstantArithmetic(User->getOpcode(), dl,
- User->getValueType(0),
- O0.getNode(), O1.getNode());
+ User->getValueType(0), {O0, O1});
};
// FIXME: When the semantics of the interaction between select and undef
@@ -6259,7 +6381,7 @@ static bool PeepholePPC64ZExtGather(SDValue Op32,
}
void PPCDAGToDAGISel::PeepholePPC64ZExt() {
- if (!PPCSubTarget->isPPC64())
+ if (!Subtarget->isPPC64())
return;
// When we zero-extend from i32 to i64, we use a pattern like this:
@@ -6428,10 +6550,6 @@ void PPCDAGToDAGISel::PeepholePPC64ZExt() {
}
void PPCDAGToDAGISel::PeepholePPC64() {
- // These optimizations are currently supported only for 64-bit SVR4.
- if (PPCSubTarget->isDarwin() || !PPCSubTarget->isPPC64())
- return;
-
SelectionDAG::allnodes_iterator Position = CurDAG->allnodes_end();
while (Position != CurDAG->allnodes_begin()) {
@@ -6544,7 +6662,8 @@ void PPCDAGToDAGISel::PeepholePPC64() {
int MaxDisplacement = 7;
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
const GlobalValue *GV = GA->getGlobal();
- MaxDisplacement = std::min((int) GV->getAlignment() - 1, MaxDisplacement);
+ Align Alignment = GV->getPointerAlignment(CurDAG->getDataLayout());
+ MaxDisplacement = std::min((int)Alignment.value() - 1, MaxDisplacement);
}
bool UpdateHBase = false;
@@ -6610,10 +6729,10 @@ void PPCDAGToDAGISel::PeepholePPC64() {
if (GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(ImmOpnd)) {
SDLoc dl(GA);
const GlobalValue *GV = GA->getGlobal();
+ Align Alignment = GV->getPointerAlignment(CurDAG->getDataLayout());
// We can't perform this optimization for data whose alignment
// is insufficient for the instruction encoding.
- if (GV->getAlignment() < 4 &&
- (RequiresMod4Offset || (Offset % 4) != 0)) {
+ if (Alignment < 4 && (RequiresMod4Offset || (Offset % 4) != 0)) {
LLVM_DEBUG(dbgs() << "Rejected this candidate for alignment.\n\n");
continue;
}
@@ -6621,8 +6740,7 @@ void PPCDAGToDAGISel::PeepholePPC64() {
} else if (ConstantPoolSDNode *CP =
dyn_cast<ConstantPoolSDNode>(ImmOpnd)) {
const Constant *C = CP->getConstVal();
- ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64,
- CP->getAlignment(),
+ ImmOpnd = CurDAG->getTargetConstantPool(C, MVT::i64, CP->getAlign(),
Offset, Flags);
}
}
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index 60ed72e1018b..ddfbd04e1ebc 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -55,7 +55,6 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
#include "llvm/CodeGen/TargetRegisterInfo.h"
#include "llvm/CodeGen/ValueTypes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -118,14 +117,13 @@ cl::desc("disable sibling call optimization on ppc"), cl::Hidden);
static cl::opt<bool> DisableInnermostLoopAlign32("disable-ppc-innermost-loop-align32",
cl::desc("don't always align innermost loop to 32 bytes on ppc"), cl::Hidden);
-static cl::opt<bool> EnableQuadPrecision("enable-ppc-quad-precision",
-cl::desc("enable quad precision float support on ppc"), cl::Hidden);
-
static cl::opt<bool> UseAbsoluteJumpTables("ppc-use-absolute-jumptables",
cl::desc("use absolute jump tables on ppc"), cl::Hidden);
STATISTIC(NumTailCalls, "Number of tail calls");
STATISTIC(NumSiblingCalls, "Number of sibling calls");
+STATISTIC(ShufflesHandledWithVPERM, "Number of shuffles lowered to a VPERM");
+STATISTIC(NumDynamicAllocaProbed, "Number of dynamic stack allocations probed");
static bool isNByteElemShuffleMask(ShuffleVectorSDNode *, unsigned, int);
@@ -167,6 +165,23 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i8, Expand);
}
+ if (Subtarget.isISA3_0()) {
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Legal);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Legal);
+ setTruncStoreAction(MVT::f64, MVT::f16, Legal);
+ setTruncStoreAction(MVT::f32, MVT::f16, Legal);
+ } else {
+ // No extending loads from f16 or HW conversions back and forth.
+ setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+ setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ }
+
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// PowerPC has pre-inc load and store's.
@@ -243,15 +258,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
// PowerPC has no SREM/UREM instructions unless we are on P9
// On P9 we may use a hardware instruction to compute the remainder.
- // The instructions are not legalized directly because in the cases where the
- // result of both the remainder and the division is required it is more
- // efficient to compute the remainder from the result of the division rather
- // than use the remainder instruction.
+  // When the result of both the remainder and the division is required it is
+  // more efficient to compute the remainder from the result of the division
+  // rather than use the remainder instruction. The instructions are marked
+  // Legal here because the DivRemPairsPass already performs that
+  // transformation at the IR level.
if (Subtarget.isISA3_0()) {
- setOperationAction(ISD::SREM, MVT::i32, Custom);
- setOperationAction(ISD::UREM, MVT::i32, Custom);
- setOperationAction(ISD::SREM, MVT::i64, Custom);
- setOperationAction(ISD::UREM, MVT::i64, Custom);
+ setOperationAction(ISD::SREM, MVT::i32, Legal);
+ setOperationAction(ISD::UREM, MVT::i32, Legal);
+ setOperationAction(ISD::SREM, MVT::i64, Legal);
+ setOperationAction(ISD::UREM, MVT::i64, Legal);
} else {
setOperationAction(ISD::SREM, MVT::i32, Expand);
setOperationAction(ISD::UREM, MVT::i32, Expand);
@@ -269,6 +285,40 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::UDIVREM, MVT::i64, Expand);
setOperationAction(ISD::SDIVREM, MVT::i64, Expand);
+  // Handle constrained floating-point operations on scalars.
+  // TODO: Handle SPE-specific operations.
+ setOperationAction(ISD::STRICT_FADD, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f64, Legal);
+ if (Subtarget.hasVSX())
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f64, Legal);
+
+ if (Subtarget.hasFSQRT()) {
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f64, Legal);
+ }
+
+ if (Subtarget.hasFPRND()) {
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f32, Legal);
+
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f64, Legal);
+ }
+
// We don't support sin/cos/sqrt/fmod/pow
setOperationAction(ISD::FSIN , MVT::f64, Expand);
setOperationAction(ISD::FCOS , MVT::f64, Expand);
@@ -373,6 +423,9 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (Subtarget.hasSPE()) {
// SPE has built-in conversions
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Legal);
+ setOperationAction(ISD::STRICT_SINT_TO_FP, MVT::i32, Legal);
+ setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_SINT, MVT::i32, Legal);
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Legal);
setOperationAction(ISD::UINT_TO_FP, MVT::i32, Legal);
@@ -522,9 +575,10 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SINT_TO_FP, MVT::i32, Custom);
} else {
// PowerPC does not have FP_TO_UINT on 32-bit implementations.
- if (Subtarget.hasSPE())
+ if (Subtarget.hasSPE()) {
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Legal);
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Legal);
- else
+ } else
setOperationAction(ISD::FP_TO_UINT, MVT::i32, Expand);
}
@@ -567,6 +621,12 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
if (Subtarget.hasAltivec()) {
+ for (MVT VT : { MVT::v16i8, MVT::v8i16, MVT::v4i32 }) {
+ setOperationAction(ISD::SADDSAT, VT, Legal);
+ setOperationAction(ISD::SSUBSAT, VT, Legal);
+ setOperationAction(ISD::UADDSAT, VT, Legal);
+ setOperationAction(ISD::USUBSAT, VT, Legal);
+ }
// First set operation action for all vector types to expand. Then we
// will selectively turn on ones that can be effectively codegen'd.
for (MVT VT : MVT::fixedlen_vector_valuetypes()) {
@@ -677,6 +737,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setLoadExtAction(ISD::EXTLOAD, VT, InnerVT, Expand);
}
}
+ setOperationAction(ISD::SELECT_CC, MVT::v4i32, Expand);
if (!Subtarget.hasP8Vector()) {
setOperationAction(ISD::SMAX, MVT::v2i64, Expand);
setOperationAction(ISD::SMIN, MVT::v2i64, Expand);
@@ -720,6 +781,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
if (!Subtarget.hasP8Altivec())
setOperationAction(ISD::ABS, MVT::v2i64, Expand);
+ // Custom lowering ROTL v1i128 to VECTOR_SHUFFLE v16i8.
+ setOperationAction(ISD::ROTL, MVT::v1i128, Custom);
// With hasAltivec set, we can lower ISD::ROTL to vrl(b|h|w).
if (Subtarget.hasAltivec())
for (auto VT : {MVT::v4i32, MVT::v8i16, MVT::v16i8})
@@ -746,7 +809,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
else
setOperationAction(ISD::MUL, MVT::v4i32, Custom);
- setOperationAction(ISD::MUL, MVT::v8i16, Custom);
+ setOperationAction(ISD::MUL, MVT::v8i16, Legal);
setOperationAction(ISD::MUL, MVT::v16i8, Custom);
setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v4f32, Custom);
@@ -793,12 +856,16 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FCEIL, MVT::v2f64, Legal);
setOperationAction(ISD::FTRUNC, MVT::v2f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::v2f64, Legal);
setOperationAction(ISD::FROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FRINT, MVT::f64, Legal);
setOperationAction(ISD::FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::v4f32, Legal);
setOperationAction(ISD::FROUND, MVT::f32, Legal);
+ setOperationAction(ISD::FRINT, MVT::f32, Legal);
setOperationAction(ISD::MUL, MVT::v2f64, Legal);
setOperationAction(ISD::FMA, MVT::v2f64, Legal);
@@ -888,6 +955,37 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::BUILD_VECTOR, MVT::v2i64, Custom);
setOperationAction(ISD::BUILD_VECTOR, MVT::v2f64, Custom);
+      // Handle constrained floating-point operations on vectors.
+      // The guard is `hasVSX` because Altivec instructions do not raise FP
+      // exceptions, while VSX vector instructions do.
+ setOperationAction(ISD::STRICT_FADD, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMAXNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FMINNUM, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::v4f32, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::v4f32, Legal);
+
+ setOperationAction(ISD::STRICT_FADD, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMAXNUM, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FMINNUM, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::v2f64, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::v2f64, Legal);
+
addRegisterClass(MVT::v2i64, &PPC::VSRCRegClass);
}
@@ -907,44 +1005,59 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::SRL, MVT::v1i128, Legal);
setOperationAction(ISD::SRA, MVT::v1i128, Expand);
- if (EnableQuadPrecision) {
- addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
- setOperationAction(ISD::FADD, MVT::f128, Legal);
- setOperationAction(ISD::FSUB, MVT::f128, Legal);
- setOperationAction(ISD::FDIV, MVT::f128, Legal);
- setOperationAction(ISD::FMUL, MVT::f128, Legal);
- setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
- // No extending loads to f128 on PPC.
- for (MVT FPT : MVT::fp_valuetypes())
- setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
- setOperationAction(ISD::FMA, MVT::f128, Legal);
- setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
- setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
- setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
- setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
- setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
- setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
-
- setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
- setOperationAction(ISD::FRINT, MVT::f128, Legal);
- setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
- setOperationAction(ISD::FCEIL, MVT::f128, Legal);
- setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
- setOperationAction(ISD::FROUND, MVT::f128, Legal);
-
- setOperationAction(ISD::SELECT, MVT::f128, Expand);
- setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
- setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
- setTruncStoreAction(MVT::f128, MVT::f64, Expand);
- setTruncStoreAction(MVT::f128, MVT::f32, Expand);
- setOperationAction(ISD::BITCAST, MVT::i128, Custom);
- // No implementation for these ops for PowerPC.
- setOperationAction(ISD::FSIN , MVT::f128, Expand);
- setOperationAction(ISD::FCOS , MVT::f128, Expand);
- setOperationAction(ISD::FPOW, MVT::f128, Expand);
- setOperationAction(ISD::FPOWI, MVT::f128, Expand);
- setOperationAction(ISD::FREM, MVT::f128, Expand);
- }
+ addRegisterClass(MVT::f128, &PPC::VRRCRegClass);
+ setOperationAction(ISD::FADD, MVT::f128, Legal);
+ setOperationAction(ISD::FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::FP_EXTEND, MVT::f128, Legal);
+ // No extending loads to f128 on PPC.
+ for (MVT FPT : MVT::fp_valuetypes())
+ setLoadExtAction(ISD::EXTLOAD, MVT::f128, FPT, Expand);
+ setOperationAction(ISD::FMA, MVT::f128, Legal);
+ setCondCodeAction(ISD::SETULT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUGT, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETUEQ, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOGE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETOLE, MVT::f128, Expand);
+ setCondCodeAction(ISD::SETONE, MVT::f128, Expand);
+
+ setOperationAction(ISD::FTRUNC, MVT::f128, Legal);
+ setOperationAction(ISD::FRINT, MVT::f128, Legal);
+ setOperationAction(ISD::FFLOOR, MVT::f128, Legal);
+ setOperationAction(ISD::FCEIL, MVT::f128, Legal);
+ setOperationAction(ISD::FNEARBYINT, MVT::f128, Legal);
+ setOperationAction(ISD::FROUND, MVT::f128, Legal);
+
+ setOperationAction(ISD::SELECT, MVT::f128, Expand);
+ setOperationAction(ISD::FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::FP_ROUND, MVT::f32, Legal);
+ setTruncStoreAction(MVT::f128, MVT::f64, Expand);
+ setTruncStoreAction(MVT::f128, MVT::f32, Expand);
+ setOperationAction(ISD::BITCAST, MVT::i128, Custom);
+ // No implementation for these ops for PowerPC.
+ setOperationAction(ISD::FSIN, MVT::f128, Expand);
+ setOperationAction(ISD::FCOS, MVT::f128, Expand);
+ setOperationAction(ISD::FPOW, MVT::f128, Expand);
+ setOperationAction(ISD::FPOWI, MVT::f128, Expand);
+ setOperationAction(ISD::FREM, MVT::f128, Expand);
+
+ // Handle constrained floating-point operations of fp128
+ setOperationAction(ISD::STRICT_FADD, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FSUB, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FMUL, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FDIV, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FMA, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FSQRT, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FP_EXTEND, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f64, Legal);
+ setOperationAction(ISD::STRICT_FP_ROUND, MVT::f32, Legal);
+ setOperationAction(ISD::STRICT_FRINT, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FNEARBYINT, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FFLOOR, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FCEIL, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FTRUNC, MVT::f128, Legal);
+ setOperationAction(ISD::STRICT_FROUND, MVT::f128, Legal);
setOperationAction(ISD::FP_EXTEND, MVT::v2f32, Custom);
setOperationAction(ISD::BSWAP, MVT::v8i16, Legal);
setOperationAction(ISD::BSWAP, MVT::v4i32, Legal);
@@ -1117,6 +1230,8 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
}
+
+ // TODO: Handle constrained floating-point operations of v4f64
}
if (Subtarget.has64BitSupport())
@@ -1151,6 +1266,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::SRA);
setTargetDAGCombine(ISD::SRL);
setTargetDAGCombine(ISD::MUL);
+ setTargetDAGCombine(ISD::FMA);
setTargetDAGCombine(ISD::SINT_TO_FP);
setTargetDAGCombine(ISD::BUILD_VECTOR);
if (Subtarget.hasFPCVT())
@@ -1190,34 +1306,18 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
setTargetDAGCombine(ISD::VSELECT);
}
- // Darwin long double math library functions have $LDBL128 appended.
- if (Subtarget.isDarwin()) {
- setLibcallName(RTLIB::COS_PPCF128, "cosl$LDBL128");
- setLibcallName(RTLIB::POW_PPCF128, "powl$LDBL128");
- setLibcallName(RTLIB::REM_PPCF128, "fmodl$LDBL128");
- setLibcallName(RTLIB::SIN_PPCF128, "sinl$LDBL128");
- setLibcallName(RTLIB::SQRT_PPCF128, "sqrtl$LDBL128");
- setLibcallName(RTLIB::LOG_PPCF128, "logl$LDBL128");
- setLibcallName(RTLIB::LOG2_PPCF128, "log2l$LDBL128");
- setLibcallName(RTLIB::LOG10_PPCF128, "log10l$LDBL128");
- setLibcallName(RTLIB::EXP_PPCF128, "expl$LDBL128");
- setLibcallName(RTLIB::EXP2_PPCF128, "exp2l$LDBL128");
- }
-
- if (EnableQuadPrecision) {
- setLibcallName(RTLIB::LOG_F128, "logf128");
- setLibcallName(RTLIB::LOG2_F128, "log2f128");
- setLibcallName(RTLIB::LOG10_F128, "log10f128");
- setLibcallName(RTLIB::EXP_F128, "expf128");
- setLibcallName(RTLIB::EXP2_F128, "exp2f128");
- setLibcallName(RTLIB::SIN_F128, "sinf128");
- setLibcallName(RTLIB::COS_F128, "cosf128");
- setLibcallName(RTLIB::POW_F128, "powf128");
- setLibcallName(RTLIB::FMIN_F128, "fminf128");
- setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
- setLibcallName(RTLIB::POWI_F128, "__powikf2");
- setLibcallName(RTLIB::REM_F128, "fmodf128");
- }
+ setLibcallName(RTLIB::LOG_F128, "logf128");
+ setLibcallName(RTLIB::LOG2_F128, "log2f128");
+ setLibcallName(RTLIB::LOG10_F128, "log10f128");
+ setLibcallName(RTLIB::EXP_F128, "expf128");
+ setLibcallName(RTLIB::EXP2_F128, "exp2f128");
+ setLibcallName(RTLIB::SIN_F128, "sinf128");
+ setLibcallName(RTLIB::COS_F128, "cosf128");
+ setLibcallName(RTLIB::POW_F128, "powf128");
+ setLibcallName(RTLIB::FMIN_F128, "fminf128");
+ setLibcallName(RTLIB::FMAX_F128, "fmaxf128");
+ setLibcallName(RTLIB::POWI_F128, "__powikf2");
+ setLibcallName(RTLIB::REM_F128, "fmodf128");
// With 32 condition bits, we don't need to sink (and duplicate) compares
// aggressively in CodeGenPrep.
@@ -1227,8 +1327,6 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
}
setMinFunctionAlignment(Align(4));
- if (Subtarget.isDarwin())
- setPrefFunctionAlignment(Align(16));
switch (Subtarget.getCPUDirective()) {
default: break;
@@ -1245,6 +1343,7 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
+ case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
setPrefLoopAlignment(Align(16));
setPrefFunctionAlignment(Align(16));
@@ -1280,27 +1379,33 @@ PPCTargetLowering::PPCTargetLowering(const PPCTargetMachine &TM,
MaxLoadsPerMemcmp = 8;
MaxLoadsPerMemcmpOptSize = 4;
}
+
+ // Let the subtarget (CPU) decide if a predictable select is more expensive
+ // than the corresponding branch. This information is used in CGP to decide
+ // when to convert selects into branches.
+ PredictableSelectIsExpensive = Subtarget.isPredictableSelectIsExpensive();
}
/// getMaxByValAlign - Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
- unsigned MaxMaxAlign) {
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign, Align MaxMaxAlign) {
if (MaxAlign == MaxMaxAlign)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
- if (MaxMaxAlign >= 32 && VTy->getBitWidth() >= 256)
- MaxAlign = 32;
- else if (VTy->getBitWidth() >= 128 && MaxAlign < 16)
- MaxAlign = 16;
+ if (MaxMaxAlign >= 32 &&
+ VTy->getPrimitiveSizeInBits().getFixedSize() >= 256)
+ MaxAlign = Align(32);
+ else if (VTy->getPrimitiveSizeInBits().getFixedSize() >= 128 &&
+ MaxAlign < 16)
+ MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(EltTy, EltAlign, MaxMaxAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
@@ -1314,16 +1419,12 @@ static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign,
/// function arguments in the caller parameter area.
unsigned PPCTargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
- // Darwin passes everything on 4 byte boundary.
- if (Subtarget.isDarwin())
- return 4;
-
// 16byte and wider vectors are passed on 16byte boundary.
// The rest is 8 on PPC64 and 4 on PPC32 boundary.
- unsigned Align = Subtarget.isPPC64() ? 8 : 4;
+ Align Alignment = Subtarget.isPPC64() ? Align(8) : Align(4);
if (Subtarget.hasAltivec() || Subtarget.hasQPX())
- getMaxByValAlign(Ty, Align, Subtarget.hasQPX() ? 32 : 16);
- return Align;
+ getMaxByValAlign(Ty, Alignment, Subtarget.hasQPX() ? Align(32) : Align(16));
+ return Alignment.value();
}
bool PPCTargetLowering::useSoftFloat() const {
@@ -1338,6 +1439,16 @@ bool PPCTargetLowering::preferIncOfAddToSubOfNot(EVT VT) const {
return VT.isScalarInteger();
}
+/// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a specific
+/// type is cheaper than a multiply followed by a shift.
+/// This is true for words and doublewords on 64-bit PowerPC.
+bool PPCTargetLowering::isMulhCheaperThanMulShift(EVT Type) const {
+ if (Subtarget.isPPC64() && (isOperationLegal(ISD::MULHS, Type) ||
+ isOperationLegal(ISD::MULHU, Type)))
+ return true;
+ return TargetLowering::isMulhCheaperThanMulShift(Type);
+}
+
const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((PPCISD::NodeType)Opcode) {
case PPCISD::FIRST_NUMBER: break;
@@ -1359,10 +1470,12 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::FRE: return "PPCISD::FRE";
case PPCISD::FRSQRTE: return "PPCISD::FRSQRTE";
case PPCISD::STFIWX: return "PPCISD::STFIWX";
- case PPCISD::VMADDFP: return "PPCISD::VMADDFP";
- case PPCISD::VNMSUBFP: return "PPCISD::VNMSUBFP";
case PPCISD::VPERM: return "PPCISD::VPERM";
case PPCISD::XXSPLT: return "PPCISD::XXSPLT";
+ case PPCISD::XXSPLTI_SP_TO_DP:
+ return "PPCISD::XXSPLTI_SP_TO_DP";
+ case PPCISD::XXSPLTI32DX:
+ return "PPCISD::XXSPLTI32DX";
case PPCISD::VECINSERT: return "PPCISD::VECINSERT";
case PPCISD::XXPERMDI: return "PPCISD::XXPERMDI";
case PPCISD::VECSHL: return "PPCISD::VECSHL";
@@ -1374,6 +1487,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::ATOMIC_CMP_SWAP_16: return "PPCISD::ATOMIC_CMP_SWAP_16";
case PPCISD::DYNALLOC: return "PPCISD::DYNALLOC";
case PPCISD::DYNAREAOFFSET: return "PPCISD::DYNAREAOFFSET";
+ case PPCISD::PROBED_ALLOCA: return "PPCISD::PROBED_ALLOCA";
case PPCISD::GlobalBaseReg: return "PPCISD::GlobalBaseReg";
case PPCISD::SRL: return "PPCISD::SRL";
case PPCISD::SRA: return "PPCISD::SRA";
@@ -1381,6 +1495,7 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::SRA_ADDZE: return "PPCISD::SRA_ADDZE";
case PPCISD::CALL: return "PPCISD::CALL";
case PPCISD::CALL_NOP: return "PPCISD::CALL_NOP";
+ case PPCISD::CALL_NOTOC: return "PPCISD::CALL_NOTOC";
case PPCISD::MTCTR: return "PPCISD::MTCTR";
case PPCISD::BCTRL: return "PPCISD::BCTRL";
case PPCISD::BCTRL_LOAD_TOC: return "PPCISD::BCTRL_LOAD_TOC";
@@ -1394,6 +1509,8 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::MTVSRZ: return "PPCISD::MTVSRZ";
case PPCISD::SINT_VEC_TO_FP: return "PPCISD::SINT_VEC_TO_FP";
case PPCISD::UINT_VEC_TO_FP: return "PPCISD::UINT_VEC_TO_FP";
+ case PPCISD::SCALAR_TO_VECTOR_PERMUTED:
+ return "PPCISD::SCALAR_TO_VECTOR_PERMUTED";
case PPCISD::ANDI_rec_1_EQ_BIT:
return "PPCISD::ANDI_rec_1_EQ_BIT";
case PPCISD::ANDI_rec_1_GT_BIT:
@@ -1407,7 +1524,6 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::LXSIZX: return "PPCISD::LXSIZX";
case PPCISD::STXSIX: return "PPCISD::STXSIX";
case PPCISD::VEXTS: return "PPCISD::VEXTS";
- case PPCISD::SExtVElems: return "PPCISD::SExtVElems";
case PPCISD::LXVD2X: return "PPCISD::LXVD2X";
case PPCISD::STXVD2X: return "PPCISD::STXVD2X";
case PPCISD::LOAD_VEC_BE: return "PPCISD::LOAD_VEC_BE";
@@ -1457,7 +1573,9 @@ const char *PPCTargetLowering::getTargetNodeName(unsigned Opcode) const {
case PPCISD::EXTSWSLI: return "PPCISD::EXTSWSLI";
case PPCISD::LD_VSX_LH: return "PPCISD::LD_VSX_LH";
case PPCISD::FP_EXTEND_HALF: return "PPCISD::FP_EXTEND_HALF";
+ case PPCISD::MAT_PCREL_ADDR: return "PPCISD::MAT_PCREL_ADDR";
case PPCISD::LD_SPLAT: return "PPCISD::LD_SPLAT";
+ case PPCISD::FNMSUB: return "PPCISD::FNMSUB";
}
return nullptr;
}
@@ -2320,17 +2438,22 @@ bool PPCTargetLowering::SelectAddressEVXRegReg(SDValue N, SDValue &Base,
/// non-zero and N can be represented by a base register plus a signed 16-bit
/// displacement, make a more precise judgement by checking (displacement % \p
/// EncodingAlignment).
-bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
- SDValue &Index, SelectionDAG &DAG,
- unsigned EncodingAlignment) const {
- int16_t imm = 0;
+bool PPCTargetLowering::SelectAddressRegReg(
+ SDValue N, SDValue &Base, SDValue &Index, SelectionDAG &DAG,
+ MaybeAlign EncodingAlignment) const {
+ // If we have a PC Relative target flag don't select as [reg+reg]. It will be
+ // a [pc+imm].
+ if (SelectAddressPCRel(N, Base))
+ return false;
+
+ int16_t Imm = 0;
if (N.getOpcode() == ISD::ADD) {
// Is there any SPE load/store (f64), which can't handle 16bit offset?
// SPE load/store can only handle 8-bit offsets.
if (hasSPE() && SelectAddressEVXRegReg(N, Base, Index, DAG))
return true;
- if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || !(imm % EncodingAlignment)))
+ if (isIntS16Immediate(N.getOperand(1), Imm) &&
+ (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // r+i
if (N.getOperand(1).getOpcode() == PPCISD::Lo)
return false; // r+i
@@ -2339,8 +2462,8 @@ bool PPCTargetLowering::SelectAddressRegReg(SDValue N, SDValue &Base,
Index = N.getOperand(1);
return true;
} else if (N.getOpcode() == ISD::OR) {
- if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || !(imm % EncodingAlignment)))
+ if (isIntS16Immediate(N.getOperand(1), Imm) &&
+ (!EncodingAlignment || isAligned(*EncodingAlignment, Imm)))
return false; // r+i can fold it if we can.
// If this is an or of disjoint bitfields, we can codegen this as an add
@@ -2395,8 +2518,7 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
- unsigned Align = MFI.getObjectAlignment(FrameIdx);
- if (Align >= 4)
+ if (MFI.getObjectAlign(FrameIdx) >= Align(4))
return;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -2407,12 +2529,17 @@ static void fixupFuncForFI(SelectionDAG &DAG, int FrameIdx, EVT VT) {
/// a signed 16-bit displacement [r+imm], and if it is not better
/// represented as reg+reg. If \p EncodingAlignment is non-zero, only accept
/// displacements that are multiples of that value.
-bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
- SDValue &Base,
- SelectionDAG &DAG,
- unsigned EncodingAlignment) const {
+bool PPCTargetLowering::SelectAddressRegImm(
+ SDValue N, SDValue &Disp, SDValue &Base, SelectionDAG &DAG,
+ MaybeAlign EncodingAlignment) const {
// FIXME dl should come from parent load or store, not from address
SDLoc dl(N);
+
+ // If we have a PC Relative target flag don't select as [reg+imm]. It will be
+ // a [pc+imm].
+ if (SelectAddressPCRel(N, Base))
+ return false;
+
// If this can be more profitably realized as r+r, fail.
if (SelectAddressRegReg(N, Disp, Base, DAG, EncodingAlignment))
return false;
@@ -2420,7 +2547,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
if (N.getOpcode() == ISD::ADD) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
+ (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
Disp = DAG.getTargetConstant(imm, dl, N.getValueType());
if (FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(N.getOperand(0))) {
Base = DAG.getTargetFrameIndex(FI->getIndex(), N.getValueType());
@@ -2444,7 +2571,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
} else if (N.getOpcode() == ISD::OR) {
int16_t imm = 0;
if (isIntS16Immediate(N.getOperand(1), imm) &&
- (!EncodingAlignment || (imm % EncodingAlignment) == 0)) {
+ (!EncodingAlignment || isAligned(*EncodingAlignment, imm))) {
// If this is an or of disjoint bitfields, we can codegen this as an add
// (for better address arithmetic) if the LHS and RHS of the OR are
// provably disjoint.
@@ -2471,7 +2598,7 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// this as "d, 0"
int16_t Imm;
if (isIntS16Immediate(CN, Imm) &&
- (!EncodingAlignment || (Imm % EncodingAlignment) == 0)) {
+ (!EncodingAlignment || isAligned(*EncodingAlignment, Imm))) {
Disp = DAG.getTargetConstant(Imm, dl, CN->getValueType(0));
Base = DAG.getRegister(Subtarget.isPPC64() ? PPC::ZERO8 : PPC::ZERO,
CN->getValueType(0));
@@ -2481,7 +2608,8 @@ bool PPCTargetLowering::SelectAddressRegImm(SDValue N, SDValue &Disp,
// Handle 32-bit sext immediates with LIS + addr mode.
if ((CN->getValueType(0) == MVT::i32 ||
(int64_t)CN->getZExtValue() == (int)CN->getZExtValue()) &&
- (!EncodingAlignment || (CN->getZExtValue() % EncodingAlignment) == 0)) {
+ (!EncodingAlignment ||
+ isAligned(*EncodingAlignment, CN->getZExtValue()))) {
int Addr = (int)CN->getZExtValue();
// Otherwise, break this down into an LIS + disp.
@@ -2536,6 +2664,27 @@ bool PPCTargetLowering::SelectAddressRegRegOnly(SDValue N, SDValue &Base,
return true;
}
+template <typename Ty> static bool isValidPCRelNode(SDValue N) {
+ Ty *PCRelCand = dyn_cast<Ty>(N);
+ return PCRelCand && (PCRelCand->getTargetFlags() & PPCII::MO_PCREL_FLAG);
+}
+
+/// Returns true if this address is a PC Relative address.
+/// A PC Relative address is either marked with the flag PPCII::MO_PCREL_FLAG
+/// or has the node opcode PPCISD::MAT_PCREL_ADDR.
+bool PPCTargetLowering::SelectAddressPCRel(SDValue N, SDValue &Base) const {
+ // This is a materialize PC Relative node. Always select this as PC Relative.
+ Base = N;
+ if (N.getOpcode() == PPCISD::MAT_PCREL_ADDR)
+ return true;
+ if (isValidPCRelNode<ConstantPoolSDNode>(N) ||
+ isValidPCRelNode<GlobalAddressSDNode>(N) ||
+ isValidPCRelNode<JumpTableSDNode>(N) ||
+ isValidPCRelNode<BlockAddressSDNode>(N))
+ return true;
+ return false;
+}
+
/// Returns true if we should use a direct load into vector instruction
/// (such as lxsd or lfd), instead of a load into gpr + direct move sequence.
static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
@@ -2573,7 +2722,8 @@ static bool usePartialVectorLoads(SDNode *N, const PPCSubtarget& ST) {
for (SDNode::use_iterator UI = LD->use_begin(), UE = LD->use_end();
UI != UE; ++UI)
if (UI.getUse().get().getResNo() == 0 &&
- UI->getOpcode() != ISD::SCALAR_TO_VECTOR)
+ UI->getOpcode() != ISD::SCALAR_TO_VECTOR &&
+ UI->getOpcode() != PPCISD::SCALAR_TO_VECTOR_PERMUTED)
return false;
return true;
@@ -2646,14 +2796,14 @@ bool PPCTargetLowering::getPreIndexedAddressParts(SDNode *N, SDValue &Base,
// LDU/STU can only handle immediates that are a multiple of 4.
if (VT != MVT::i64) {
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 0))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, None))
return false;
} else {
// LDU/STU need an address with at least 4-byte alignment.
if (Alignment < 4)
return false;
- if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, 4))
+ if (!SelectAddressRegImm(Ptr, Offset, Base, DAG, Align(4)))
return false;
}
@@ -2687,18 +2837,6 @@ static void getLabelAccessInfo(bool IsPIC, const PPCSubtarget &Subtarget,
HiOpFlags |= PPCII::MO_PIC_FLAG;
LoOpFlags |= PPCII::MO_PIC_FLAG;
}
-
- // If this is a reference to a global value that requires a non-lazy-ptr, make
- // sure that instruction lowering adds it.
- if (GV && Subtarget.hasLazyResolverStub(GV)) {
- HiOpFlags |= PPCII::MO_NLP_FLAG;
- LoOpFlags |= PPCII::MO_NLP_FLAG;
-
- if (GV->hasHiddenVisibility()) {
- HiOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
- LoOpFlags |= PPCII::MO_NLP_HIDDEN_FLAG;
- }
- }
}
static SDValue LowerLabelRef(SDValue HiPart, SDValue LoPart, bool isPIC,
@@ -2740,7 +2878,7 @@ SDValue PPCTargetLowering::getTOCEntry(SelectionDAG &DAG, const SDLoc &dl,
SDValue Ops[] = { GA, Reg };
return DAG.getMemIntrinsicNode(
PPCISD::TOC_ENTRY, dl, DAG.getVTList(VT, MVT::Other), Ops, VT,
- MachinePointerInfo::getGOT(DAG.getMachineFunction()), 0,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()), None,
MachineMemOperand::MOLoad);
}
@@ -2753,8 +2891,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDLoc DL(CP);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue ConstPool = DAG.getTargetConstantPool(
+ C, Ty, CP->getAlign(), CP->getOffset(), PPCII::MO_PCREL_FLAG);
+ return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, ConstPool);
+ }
setUsesTOCBasePtr(DAG);
- SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0);
+ SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
@@ -2763,15 +2908,15 @@ SDValue PPCTargetLowering::LowerConstantPool(SDValue Op,
getLabelAccessInfo(IsPIC, Subtarget, MOHiFlag, MOLoFlag);
if (IsPIC && Subtarget.isSVR4ABI()) {
- SDValue GA = DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(),
- PPCII::MO_PIC_FLAG);
+ SDValue GA =
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), PPCII::MO_PIC_FLAG);
return getTOCEntry(DAG, SDLoc(CP), GA);
}
SDValue CPIHi =
- DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOHiFlag);
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOHiFlag);
SDValue CPILo =
- DAG.getTargetConstantPool(C, PtrVT, CP->getAlignment(), 0, MOLoFlag);
+ DAG.getTargetConstantPool(C, PtrVT, CP->getAlign(), 0, MOLoFlag);
return LowerLabelRef(CPIHi, CPILo, IsPIC, DAG);
}
@@ -2828,6 +2973,16 @@ SDValue PPCTargetLowering::LowerJumpTable(SDValue Op, SelectionDAG &DAG) const {
EVT PtrVT = Op.getValueType();
JumpTableSDNode *JT = cast<JumpTableSDNode>(Op);
+ // isUsingPCRelativeCalls() returns true when PCRelative is enabled
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDLoc DL(JT);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue GA =
+ DAG.getTargetJumpTable(JT->getIndex(), Ty, PPCII::MO_PCREL_FLAG);
+ SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ return MatAddr;
+ }
+
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
@@ -2857,6 +3012,16 @@ SDValue PPCTargetLowering::LowerBlockAddress(SDValue Op,
BlockAddressSDNode *BASDN = cast<BlockAddressSDNode>(Op);
const BlockAddress *BA = BASDN->getBlockAddress();
+ // isUsingPCRelativeCalls() returns true when PCRelative is enabled
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ SDLoc DL(BASDN);
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ SDValue GA = DAG.getTargetBlockAddress(BA, Ty, BASDN->getOffset(),
+ PPCII::MO_PCREL_FLAG);
+ SDValue MatAddr = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ return MatAddr;
+ }
+
// 64-bit SVR4 ABI and AIX ABI code are always position-independent.
// The actual BlockAddress is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
@@ -2986,6 +3151,22 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
// 64-bit SVR4 ABI & AIX ABI code is always position-independent.
// The actual address of the GlobalValue is stored in the TOC.
if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ EVT Ty = getPointerTy(DAG.getDataLayout());
+ if (isAccessedAsGotIndirect(Op)) {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
+ PPCII::MO_PCREL_FLAG |
+ PPCII::MO_GOT_FLAG);
+ SDValue MatPCRel = DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ SDValue Load = DAG.getLoad(MVT::i64, DL, DAG.getEntryNode(), MatPCRel,
+ MachinePointerInfo());
+ return Load;
+ } else {
+ SDValue GA = DAG.getTargetGlobalAddress(GV, DL, Ty, GSDN->getOffset(),
+ PPCII::MO_PCREL_FLAG);
+ return DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, Ty, GA);
+ }
+ }
setUsesTOCBasePtr(DAG);
SDValue GA = DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset());
return getTOCEntry(DAG, DL, GA);
@@ -3007,13 +3188,7 @@ SDValue PPCTargetLowering::LowerGlobalAddress(SDValue Op,
SDValue GALo =
DAG.getTargetGlobalAddress(GV, DL, PtrVT, GSDN->getOffset(), MOLoFlag);
- SDValue Ptr = LowerLabelRef(GAHi, GALo, IsPIC, DAG);
-
- // If the global reference is actually to a non-lazy-pointer, we have to do an
- // extra load to get the address of the global.
- if (MOHiFlag & PPCII::MO_NLP_FLAG)
- Ptr = DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), Ptr, MachinePointerInfo());
- return Ptr;
+ return LowerLabelRef(GAHi, GALo, IsPIC, DAG);
}
SDValue PPCTargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
@@ -3174,10 +3349,10 @@ SDValue PPCTargetLowering::LowerVACOPY(SDValue Op, SelectionDAG &DAG) const {
// We have to copy the entire va_list struct:
// 2*sizeof(char) + 2 Byte alignment + 2*sizeof(char*) = 12 Byte
- return DAG.getMemcpy(Op.getOperand(0), Op,
- Op.getOperand(1), Op.getOperand(2),
- DAG.getConstant(12, SDLoc(Op), MVT::i32), 8, false, true,
- false, MachinePointerInfo(), MachinePointerInfo());
+ return DAG.getMemcpy(Op.getOperand(0), Op, Op.getOperand(1), Op.getOperand(2),
+ DAG.getConstant(12, SDLoc(Op), MVT::i32), Align(8),
+ false, true, false, MachinePointerInfo(),
+ MachinePointerInfo());
}
SDValue PPCTargetLowering::LowerADJUST_TRAMPOLINE(SDValue Op,
@@ -3234,7 +3409,7 @@ SDValue PPCTargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
- if (Subtarget.isDarwinABI() || Subtarget.isPPC64()) {
+ if (Subtarget.isPPC64() || Subtarget.isAIXABI()) {
// vastart just stores the address of the VarArgsFrameIndex slot into the
// memory location argument.
SDValue FR = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
@@ -3340,31 +3515,31 @@ static unsigned CalculateStackSlotSize(EVT ArgVT, ISD::ArgFlagsTy Flags,
/// CalculateStackSlotAlignment - Calculates the alignment of this argument
/// on the stack.
-static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
- ISD::ArgFlagsTy Flags,
- unsigned PtrByteSize) {
- unsigned Align = PtrByteSize;
+static Align CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
+ ISD::ArgFlagsTy Flags,
+ unsigned PtrByteSize) {
+ Align Alignment(PtrByteSize);
// Altivec parameters are padded to a 16 byte boundary.
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64 ||
ArgVT == MVT::v1i128 || ArgVT == MVT::f128)
- Align = 16;
+ Alignment = Align(16);
// QPX vector types stored in double-precision are padded to a 32 byte
// boundary.
else if (ArgVT == MVT::v4f64 || ArgVT == MVT::v4i1)
- Align = 32;
+ Alignment = Align(32);
// ByVal parameters are aligned as requested.
if (Flags.isByVal()) {
- unsigned BVAlign = Flags.getByValAlign();
+ auto BVAlign = Flags.getNonZeroByValAlign();
if (BVAlign > PtrByteSize) {
- if (BVAlign % PtrByteSize != 0)
- llvm_unreachable(
+ if (BVAlign.value() % PtrByteSize != 0)
+ llvm_unreachable(
"ByVal alignment is not a multiple of the pointer size");
- Align = BVAlign;
+ Alignment = BVAlign;
}
}
@@ -3374,12 +3549,12 @@ static unsigned CalculateStackSlotAlignment(EVT ArgVT, EVT OrigVT,
// needs to be aligned to the size of the full type. (Except for
// ppcf128, which is only aligned as its f64 components.)
if (Flags.isSplit() && OrigVT != MVT::ppcf128)
- Align = OrigVT.getStoreSize();
+ Alignment = Align(OrigVT.getStoreSize());
else
- Align = ArgVT.getStoreSize();
+ Alignment = Align(ArgVT.getStoreSize());
}
- return Align;
+ return Alignment;
}
/// CalculateStackSlotUsed - Return whether this argument will use its
@@ -3397,9 +3572,9 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
bool UseMemory = false;
// Respect alignment of argument on the stack.
- unsigned Align =
- CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
- ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ Align Alignment =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = alignTo(ArgOffset, Alignment);
// If there's no space left in the argument save area, we must
// use memory (this check also catches zero-sized arguments).
if (ArgOffset >= LinkageSize + ParamAreaSize)
@@ -3443,10 +3618,7 @@ static bool CalculateStackSlotUsed(EVT ArgVT, EVT OrigVT,
/// ensure minimum alignment required for target.
static unsigned EnsureStackAlignment(const PPCFrameLowering *Lowering,
unsigned NumBytes) {
- unsigned TargetAlign = Lowering->getStackAlignment();
- unsigned AlignMask = TargetAlign - 1;
- NumBytes = (NumBytes + AlignMask) & ~AlignMask;
- return NumBytes;
+ return alignTo(NumBytes, Lowering->getStackAlign());
}
SDValue PPCTargetLowering::LowerFormalArguments(
@@ -3509,7 +3681,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Potential tail calls could cause overwriting of argument stack slots.
bool isImmutable = !(getTargetMachine().Options.GuaranteedTailCallOpt &&
(CallConv == CallingConv::Fast));
- unsigned PtrByteSize = 4;
+ const Align PtrAlign(4);
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
@@ -3518,7 +3690,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
// Reserve space for the linkage area on the stack.
unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
- CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+ CCInfo.AllocateStack(LinkageSize, PtrAlign);
if (useSoftFloat())
CCInfo.PreAnalyzeFormalArguments(Ins);
@@ -3627,7 +3799,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
- CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+ CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeFormalArguments(Ins, CC_PPC32_SVR4_ByVal);
@@ -3674,7 +3846,8 @@ SDValue PPCTargetLowering::LowerFormalArguments_32SVR4(
MFI.CreateFixedObject(PtrVT.getSizeInBits()/8,
CCInfo.getNextStackOffset(), true));
- FuncInfo->setVarArgsFrameIndex(MFI.CreateStackObject(Depth, 8, false));
+ FuncInfo->setVarArgsFrameIndex(
+ MFI.CreateStackObject(Depth, Align(8), false));
SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
// The fixed integer arguments of a variadic function are stored to the
@@ -3821,11 +3994,13 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// We re-align the argument offset for each argument, except when using the
// fast calling convention, when we need to make sure we do that only when
// we'll actually use a stack slot.
- unsigned CurArgOffset, Align;
+ unsigned CurArgOffset;
+ Align Alignment;
auto ComputeArgOffset = [&]() {
/* Respect alignment of argument on the stack. */
- Align = CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
- ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ Alignment =
+ CalculateStackSlotAlignment(ObjectVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = alignTo(ArgOffset, Alignment);
CurArgOffset = ArgOffset;
};
@@ -3873,7 +4048,7 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
ArgSize + ArgOffset > LinkageSize + Num_GPR_Regs * PtrByteSize)
FI = MFI.CreateFixedObject(ArgSize, ArgOffset, false, true);
else
- FI = MFI.CreateStackObject(ArgSize, Align, false);
+ FI = MFI.CreateStackObject(ArgSize, Alignment, false);
SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
// Handle aggregates smaller than 8 bytes.
@@ -4121,7 +4296,11 @@ SDValue PPCTargetLowering::LowerFormalArguments_64SVR4(
// If the function takes variable number of arguments, make a frame index for
// the start of the first vararg value... for expansion of llvm.va_start.
- if (isVarArg) {
+ // The ELFv2 ABI spec states:
+ // C programs that are intended to be *portable* across different compilers
+ // and architectures must use the header file <stdarg.h> to deal with variable
+ // argument lists.
+ if (isVarArg && MFI.hasVAStart()) {
int Depth = ArgOffset;
FuncInfo->setVarArgsFrameIndex(
@@ -4529,30 +4708,67 @@ static int CalculateTailCallSPDiff(SelectionDAG& DAG, bool isTailCall,
static bool isFunctionGlobalAddress(SDValue Callee);
-static bool
-callsShareTOCBase(const Function *Caller, SDValue Callee,
- const TargetMachine &TM) {
- // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
- // don't have enough information to determine if the caller and calle share
- // the same TOC base, so we have to pessimistically assume they don't for
- // correctness.
- GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
- if (!G)
- return false;
-
- const GlobalValue *GV = G->getGlobal();
+static bool callsShareTOCBase(const Function *Caller, SDValue Callee,
+ const TargetMachine &TM) {
+ // It does not make sense to call callsShareTOCBase() with a caller that
+ // is PC Relative since PC Relative callers do not have a TOC.
+#ifndef NDEBUG
+ const PPCSubtarget *STICaller = &TM.getSubtarget<PPCSubtarget>(*Caller);
+ assert(!STICaller->isUsingPCRelativeCalls() &&
+ "PC Relative callers do not have a TOC and cannot share a TOC Base");
+#endif
+
+ // Callee is either a GlobalAddress or an ExternalSymbol. ExternalSymbols
+ // don't have enough information to determine if the caller and callee share
+ // the same TOC base, so we have to pessimistically assume they don't for
+ // correctness.
+ GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (!G)
+ return false;
+
+ const GlobalValue *GV = G->getGlobal();
+
+ // If the callee is preemptable, then the static linker will use a plt-stub
+ // which saves the toc to the stack, and needs a nop after the call
+ // instruction to convert to a toc-restore.
+ if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
+ return false;
+
+ // Functions with PC Relative enabled may clobber the TOC in the same DSO.
+ // We may need a TOC restore in the situation where the caller requires a
+ // valid TOC but the callee is PC Relative and does not.
+ const Function *F = dyn_cast<Function>(GV);
+ const GlobalAlias *Alias = dyn_cast<GlobalAlias>(GV);
+
+ // If we have an Alias we can try to get the function from there.
+ if (Alias) {
+ const GlobalObject *GlobalObj = Alias->getBaseObject();
+ F = dyn_cast<Function>(GlobalObj);
+ }
+
+ // If we still have no valid function pointer we do not have enough
+ // information to determine if the callee uses PC Relative calls so we must
+ // assume that it does.
+ if (!F)
+ return false;
+
+ // If the callee uses PC Relative we cannot guarantee that the callee won't
+ // clobber the TOC of the caller and so we must assume that the two
+ // functions do not share a TOC base.
+ const PPCSubtarget *STICallee = &TM.getSubtarget<PPCSubtarget>(*F);
+ if (STICallee->isUsingPCRelativeCalls())
+ return false;
+
// The medium and large code models are expected to provide a sufficiently
// large TOC to provide all data addressing needs of a module with a
- // single TOC. Since each module will be addressed with a single TOC then we
- // only need to check that caller and callee don't cross dso boundaries.
+ // single TOC.
if (CodeModel::Medium == TM.getCodeModel() ||
CodeModel::Large == TM.getCodeModel())
- return TM.shouldAssumeDSOLocal(*Caller->getParent(), GV);
+ return true;
// Otherwise we need to ensure callee and caller are in the same section,
// since the linker may allocate multiple TOCs, and we don't know which
// sections will belong to the same TOC base.
-
if (!GV->isStrongDefinitionForLinker())
return false;
@@ -4567,26 +4783,6 @@ callsShareTOCBase(const Function *Caller, SDValue Callee,
return false;
}
- // If the callee might be interposed, then we can't assume the ultimate call
- // target will be in the same section. Even in cases where we can assume that
- // interposition won't happen, in any case where the linker might insert a
- // stub to allow for interposition, we must generate code as though
- // interposition might occur. To understand why this matters, consider a
- // situation where: a -> b -> c where the arrows indicate calls. b and c are
- // in the same section, but a is in a different module (i.e. has a different
- // TOC base pointer). If the linker allows for interposition between b and c,
- // then it will generate a stub for the call edge between b and c which will
- // save the TOC pointer into the designated stack slot allocated by b. If we
- // return true here, and therefore allow a tail call between b and c, that
- // stack slot won't exist and the b -> c stub will end up saving b'c TOC base
- // pointer into the stack slot allocated by a (where the a -> b stub saved
- // a's TOC base pointer). If we're not considering a tail call, but rather,
- // whether a nop is needed after the call instruction in b, because the linker
- // will insert a stub, it might complain about a missing nop if we omit it
- // (although many don't complain in this case).
- if (!TM.shouldAssumeDSOLocal(*Caller->getParent(), GV))
- return false;
-
return true;
}
@@ -4628,13 +4824,12 @@ needStackSlotPassParameters(const PPCSubtarget &Subtarget,
return false;
}
-static bool
-hasSameArgumentList(const Function *CallerFn, ImmutableCallSite CS) {
- if (CS.arg_size() != CallerFn->arg_size())
+static bool hasSameArgumentList(const Function *CallerFn, const CallBase &CB) {
+ if (CB.arg_size() != CallerFn->arg_size())
return false;
- ImmutableCallSite::arg_iterator CalleeArgIter = CS.arg_begin();
- ImmutableCallSite::arg_iterator CalleeArgEnd = CS.arg_end();
+ auto CalleeArgIter = CB.arg_begin();
+ auto CalleeArgEnd = CB.arg_end();
Function::const_arg_iterator CallerArgIter = CallerFn->arg_begin();
for (; CalleeArgIter != CalleeArgEnd; ++CalleeArgIter, ++CallerArgIter) {
@@ -4676,15 +4871,10 @@ areCallingConvEligibleForTCO_64SVR4(CallingConv::ID CallerCC,
return CallerCC == CallingConv::C || CallerCC == CalleeCC;
}
-bool
-PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
- SDValue Callee,
- CallingConv::ID CalleeCC,
- ImmutableCallSite CS,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const {
+bool PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
+ SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB, bool isVarArg,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const {
bool TailCallOpt = getTargetMachine().Options.GuaranteedTailCallOpt;
if (DisableSCO && !TailCallOpt) return false;
@@ -4726,15 +4916,22 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
needStackSlotPassParameters(Subtarget, Outs))
return false;
- // No TCO/SCO on indirect call because Caller have to restore its TOC
- if (!isFunctionGlobalAddress(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee))
+ // All variants of 64-bit ELF ABIs without PC-Relative addressing require that
+ // the caller and callee share the same TOC for TCO/SCO. If the caller and
+ // callee potentially have different TOC bases then we cannot tail call since
+ // we need to restore the TOC pointer after the call.
+ // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
+ // We cannot guarantee this for indirect calls or calls to external functions.
+ // When PC-Relative addressing is used, the concept of the TOC is no longer
+ // applicable so this check is not required.
+ // Check first for indirect calls.
+ if (!Subtarget.isUsingPCRelativeCalls() &&
+ !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee))
return false;
- // If the caller and callee potentially have different TOC bases then we
- // cannot tail call since we need to restore the TOC pointer after the call.
- // ref: https://bugzilla.mozilla.org/show_bug.cgi?id=973977
- if (!callsShareTOCBase(&Caller, Callee, getTargetMachine()))
+ // Check if we share the TOC base.
+ if (!Subtarget.isUsingPCRelativeCalls() &&
+ !callsShareTOCBase(&Caller, Callee, getTargetMachine()))
return false;
// TCO allows altering callee ABI, so we don't have to check further.
@@ -4746,10 +4943,14 @@ PPCTargetLowering::IsEligibleForTailCallOptimization_64SVR4(
// If callee use the same argument list that caller is using, then we can
// apply SCO on this case. If it is not, then we need to check if callee needs
// stack for passing arguments.
- if (!hasSameArgumentList(&Caller, CS) &&
- needStackSlotPassParameters(Subtarget, Outs)) {
+ // PC Relative tail calls may not have a CallBase.
+ // If there is no CallBase we cannot verify whether the argument lists match,
+ // so conservatively assume that they do not.
+ if (CB && !hasSameArgumentList(&Caller, *CB) &&
+ needStackSlotPassParameters(Subtarget, Outs))
+ return false;
+ else if (!CB && needStackSlotPassParameters(Subtarget, Outs))
return false;
- }
return true;
}
@@ -4858,18 +5059,6 @@ static SDValue EmitTailCallStoreFPAndRetAddr(SelectionDAG &DAG, SDValue Chain,
SDValue NewRetAddrFrIdx = DAG.getFrameIndex(NewRetAddr, VT);
Chain = DAG.getStore(Chain, dl, OldRetAddr, NewRetAddrFrIdx,
MachinePointerInfo::getFixedStack(MF, NewRetAddr));
-
- // When using the 32/64-bit SVR4 ABI there is no need to move the FP stack
- // slot as the FP is never overwritten.
- if (Subtarget.isDarwinABI()) {
- int NewFPLoc = SPDiff + FL->getFramePointerSaveOffset();
- int NewFPIdx = MF.getFrameInfo().CreateFixedObject(SlotSize, NewFPLoc,
- true);
- SDValue NewFramePtrIdx = DAG.getFrameIndex(NewFPIdx, VT);
- Chain = DAG.getStore(Chain, dl, OldFP, NewFramePtrIdx,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(), NewFPIdx));
- }
}
return Chain;
}
@@ -4904,14 +5093,6 @@ SDValue PPCTargetLowering::EmitTailCallLoadFPAndRetAddr(
LROpOut = getReturnAddrFrameIndex(DAG);
LROpOut = DAG.getLoad(VT, dl, Chain, LROpOut, MachinePointerInfo());
Chain = SDValue(LROpOut.getNode(), 1);
-
- // When using the 32/64-bit SVR4 ABI there is no need to load the FP stack
- // slot as the FP is never overwritten.
- if (Subtarget.isDarwinABI()) {
- FPOpOut = getFramePointerFrameIndex(DAG);
- FPOpOut = DAG.getLoad(VT, dl, Chain, FPOpOut, MachinePointerInfo());
- Chain = SDValue(FPOpOut.getNode(), 1);
- }
}
return Chain;
}
@@ -4926,9 +5107,9 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SDValue Chain, ISD::ArgFlagsTy Flags,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
- return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
- false, false, false, MachinePointerInfo(),
- MachinePointerInfo());
+ return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode,
+ Flags.getNonZeroByValAlign(), false, false, false,
+ MachinePointerInfo(), MachinePointerInfo());
}
/// LowerMemOpCallTo - Store the argument to the stack or remember it in case of
@@ -5079,28 +5260,37 @@ static bool isIndirectCall(const SDValue &Callee, SelectionDAG &DAG,
return true;
}
-static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
- bool isTailCall, const Function &Caller,
+// AIX and 64-bit ELF ABIs w/o PCRel require a TOC save/restore around calls.
+static inline bool isTOCSaveRestoreRequired(const PPCSubtarget &Subtarget) {
+ return Subtarget.isAIXABI() ||
+ (Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls());
+}
+
+static unsigned getCallOpcode(PPCTargetLowering::CallFlags CFlags,
+ const Function &Caller,
const SDValue &Callee,
const PPCSubtarget &Subtarget,
const TargetMachine &TM) {
- if (isTailCall)
+ if (CFlags.IsTailCall)
return PPCISD::TC_RETURN;
// This is a call through a function pointer.
- if (isIndirectCall) {
+ if (CFlags.IsIndirect) {
// AIX and the 64-bit ELF ABIs need to maintain the TOC pointer across
// indirect calls. The save of the caller's TOC pointer to the stack will be
// inserted into the DAG as part of call lowering. The restore of the TOC
// pointer is modeled by using a pseudo instruction for the call opcode that
// represents the 2 instruction sequence of an indirect branch and link,
// immediately followed by a load of the TOC pointer from the stack save
- // slot into gpr2.
- if (Subtarget.isAIXABI() || Subtarget.is64BitELFABI())
- return PPCISD::BCTRL_LOAD_TOC;
+ // slot into gpr2. For 64-bit ELFv2 ABI with PCRel, do not restore the TOC
+ // as it is not saved or used.
+ return isTOCSaveRestoreRequired(Subtarget) ? PPCISD::BCTRL_LOAD_TOC
+ : PPCISD::BCTRL;
+ }
- // An indirect call that does not need a TOC restore.
- return PPCISD::BCTRL;
+ if (Subtarget.isUsingPCRelativeCalls()) {
+ assert(Subtarget.is64BitELFABI() && "PC Relative is only on ELF ABI.");
+ return PPCISD::CALL_NOTOC;
}
// The ABIs that maintain a TOC pointer across calls need to have a nop
@@ -5118,14 +5308,6 @@ static unsigned getCallOpcode(bool isIndirectCall, bool isPatchPoint,
return PPCISD::CALL;
}
-static bool isValidAIXExternalSymSDNode(StringRef SymName) {
- return StringSwitch<bool>(SymName)
- .Cases("__divdi3", "__fixunsdfdi", "__floatundidf", "__floatundisf",
- "__moddi3", "__udivdi3", "__umoddi3", true)
- .Cases("ceil", "floor", "memcpy", "memmove", "memset", "round", true)
- .Default(false);
-}
-
static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
const SDLoc &dl, const PPCSubtarget &Subtarget) {
if (!Subtarget.usesFunctionDescriptors() && !Subtarget.isELFv2ABI())
@@ -5161,14 +5343,14 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
MCSymbolXCOFF *S = cast<MCSymbolXCOFF>(
Context.getOrCreateSymbol(Twine(".") + Twine(FuncName)));
- if (IsDeclaration && !S->hasContainingCsect()) {
+ if (IsDeclaration && !S->hasRepresentedCsectSet()) {
// On AIX, an undefined symbol needs to be associated with a
// MCSectionXCOFF to get the correct storage mapping class.
// In this case, XCOFF::XMC_PR.
MCSectionXCOFF *Sec = Context.getXCOFFSection(
- S->getName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
+ S->getSymbolTableName(), XCOFF::XMC_PR, XCOFF::XTY_ER, SC,
SectionKind::getMetadata());
- S->setContainingCsect(Sec);
+ S->setRepresentedCsect(Sec);
}
MVT PtrVT =
@@ -5209,12 +5391,7 @@ static SDValue transformCallee(const SDValue &Callee, SelectionDAG &DAG,
SC);
}
- // TODO: Remove this when the support for ExternalSymbolSDNode is complete.
- if (isValidAIXExternalSymSDNode(SymName)) {
- return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
- }
-
- report_fatal_error("Unexpected ExternalSymbolSDNode: " + Twine(SymName));
+ return getAIXFuncEntryPointSymbolSDNode(SymName, true, XCOFF::C_EXT);
}
// No transformation needed.
@@ -5252,7 +5429,7 @@ static void prepareIndirectCall(SelectionDAG &DAG, SDValue &Callee,
static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
SDValue &Glue, SDValue &Chain,
SDValue CallSeqStart,
- ImmutableCallSite CS, const SDLoc &dl,
+ const CallBase *CB, const SDLoc &dl,
bool hasNest,
const PPCSubtarget &Subtarget) {
// Function pointers in the 64-bit SVR4 ABI do not point to the function
@@ -5288,7 +5465,7 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
MachineMemOperand::MOInvariant)
: MachineMemOperand::MONone;
- MachinePointerInfo MPI(CS ? CS.getCalledValue() : nullptr);
+ MachinePointerInfo MPI(CB ? CB->getCalledOperand() : nullptr);
// Registers used in building the DAG.
const MCRegister EnvPtrReg = Subtarget.getEnvironmentPointerRegister();
@@ -5342,12 +5519,12 @@ static void prepareDescriptorIndirectCall(SelectionDAG &DAG, SDValue &Callee,
}
static void
-buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
- const SDLoc &dl, bool isTailCall, bool isVarArg,
- bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+buildCallOperands(SmallVectorImpl<SDValue> &Ops,
+ PPCTargetLowering::CallFlags CFlags, const SDLoc &dl,
+ SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue Glue, SDValue Chain, SDValue &Callee, int SPDiff,
- const PPCSubtarget &Subtarget, bool isIndirect) {
+ const PPCSubtarget &Subtarget) {
const bool IsPPC64 = Subtarget.isPPC64();
// MVT for a general purpose register.
const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
@@ -5356,10 +5533,10 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
Ops.push_back(Chain);
// If it's a direct call pass the callee as the second operand.
- if (!isIndirect)
+ if (!CFlags.IsIndirect)
Ops.push_back(Callee);
else {
- assert(!isPatchPoint && "Patch point call are not indirect.");
+ assert(!CFlags.IsPatchPoint && "Patch point calls are not indirect.");
// For the TOC based ABIs, we have saved the TOC pointer to the linkage area
// on the stack (this would have been done in `LowerCall_64SVR4` or
@@ -5368,7 +5545,9 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
// pointer from the linkage area. The operand for the TOC restore is an add
// of the TOC save offset to the stack pointer. This must be the second
// operand: after the chain input but before any other variadic arguments.
- if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) {
+ // For 64-bit ELFv2 ABI with PCRel, do not restore the TOC as it is not
+ // saved or used.
+ if (isTOCSaveRestoreRequired(Subtarget)) {
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
SDValue StackPtr = DAG.getRegister(StackPtrReg, RegVT);
@@ -5379,18 +5558,18 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
}
// Add the register used for the environment pointer.
- if (Subtarget.usesFunctionDescriptors() && !hasNest)
+ if (Subtarget.usesFunctionDescriptors() && !CFlags.HasNest)
Ops.push_back(DAG.getRegister(Subtarget.getEnvironmentPointerRegister(),
RegVT));
// Add CTR register as callee so a bctr can be emitted later.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Ops.push_back(DAG.getRegister(IsPPC64 ? PPC::CTR8 : PPC::CTR, RegVT));
}
// If this is a tail call add stack pointer delta.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Ops.push_back(DAG.getConstant(SPDiff, dl, MVT::i32));
// Add argument registers to the end of the list so that they are known live
@@ -5402,17 +5581,18 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
// We cannot add R2/X2 as an operand here for PATCHPOINT, because there is
// no way to mark dependencies as implicit here.
// We will add the R2/X2 dependency in EmitInstrWithCustomInserter.
- if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) && !isPatchPoint)
+ if ((Subtarget.is64BitELFABI() || Subtarget.isAIXABI()) &&
+ !CFlags.IsPatchPoint && !Subtarget.isUsingPCRelativeCalls())
Ops.push_back(DAG.getRegister(Subtarget.getTOCPointerRegister(), RegVT));
// Add implicit use of CR bit 6 for 32-bit SVR4 vararg calls
- if (isVarArg && Subtarget.is32BitELFABI())
+ if (CFlags.IsVarArg && Subtarget.is32BitELFABI())
Ops.push_back(DAG.getRegister(PPC::CR1EQ, MVT::i32));
// Add a register mask operand representing the call-preserved registers.
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
const uint32_t *Mask =
- TRI->getCallPreservedMask(DAG.getMachineFunction(), CallConv);
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CFlags.CallConv);
assert(Mask && "Missing call preserved mask for calling convention");
Ops.push_back(DAG.getRegisterMask(Mask));
@@ -5422,44 +5602,47 @@ buildCallOperands(SmallVectorImpl<SDValue> &Ops, CallingConv::ID CallConv,
}
SDValue PPCTargetLowering::FinishCall(
- CallingConv::ID CallConv, const SDLoc &dl, bool isTailCall, bool isVarArg,
- bool isPatchPoint, bool hasNest, SelectionDAG &DAG,
+ CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass, SDValue Glue,
SDValue Chain, SDValue CallSeqStart, SDValue &Callee, int SPDiff,
unsigned NumBytes, const SmallVectorImpl<ISD::InputArg> &Ins,
- SmallVectorImpl<SDValue> &InVals, ImmutableCallSite CS) const {
+ SmallVectorImpl<SDValue> &InVals, const CallBase *CB) const {
- if (Subtarget.is64BitELFABI() || Subtarget.isAIXABI())
+ if ((Subtarget.is64BitELFABI() && !Subtarget.isUsingPCRelativeCalls()) ||
+ Subtarget.isAIXABI())
setUsesTOCBasePtr(DAG);
- const bool isIndirect = isIndirectCall(Callee, DAG, Subtarget, isPatchPoint);
- unsigned CallOpc = getCallOpcode(isIndirect, isPatchPoint, isTailCall,
- DAG.getMachineFunction().getFunction(),
- Callee, Subtarget, DAG.getTarget());
+ unsigned CallOpc =
+ getCallOpcode(CFlags, DAG.getMachineFunction().getFunction(), Callee,
+ Subtarget, DAG.getTarget());
- if (!isIndirect)
+ if (!CFlags.IsIndirect)
Callee = transformCallee(Callee, DAG, dl, Subtarget);
else if (Subtarget.usesFunctionDescriptors())
- prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CS,
- dl, hasNest, Subtarget);
+ prepareDescriptorIndirectCall(DAG, Callee, Glue, Chain, CallSeqStart, CB,
+ dl, CFlags.HasNest, Subtarget);
else
prepareIndirectCall(DAG, Callee, Glue, Chain, dl);
// Build the operand list for the call instruction.
SmallVector<SDValue, 8> Ops;
- buildCallOperands(Ops, CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- hasNest, DAG, RegsToPass, Glue, Chain, Callee, SPDiff,
- Subtarget, isIndirect);
+ buildCallOperands(Ops, CFlags, dl, DAG, RegsToPass, Glue, Chain, Callee,
+ SPDiff, Subtarget);
// Emit tail call.
- if (isTailCall) {
+ if (CFlags.IsTailCall) {
+ // Indirect tail calls when using PC Relative calls do not have the same
+ // constraints.
assert(((Callee.getOpcode() == ISD::Register &&
cast<RegisterSDNode>(Callee)->getReg() == PPC::CTR) ||
Callee.getOpcode() == ISD::TargetExternalSymbol ||
Callee.getOpcode() == ISD::TargetGlobalAddress ||
- isa<ConstantSDNode>(Callee)) &&
- "Expecting a global address, external symbol, absolute value or "
- "register");
+ isa<ConstantSDNode>(Callee) ||
+ (CFlags.IsIndirect && Subtarget.isUsingPCRelativeCalls())) &&
+ "Expecting a global address, external symbol, absolute value, "
+ "register or an indirect tail call when PC Relative calls are "
+ "used.");
+ // PC Relative calls also use TC_RETURN as the way to mark tail calls.
assert(CallOpc == PPCISD::TC_RETURN &&
"Unexpected call opcode for a tail call.");
DAG.getMachineFunction().getFrameInfo().setHasTailCall();
@@ -5468,12 +5651,13 @@ SDValue PPCTargetLowering::FinishCall(
std::array<EVT, 2> ReturnTypes = {{MVT::Other, MVT::Glue}};
Chain = DAG.getNode(CallOpc, dl, ReturnTypes, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CFlags.NoMerge);
Glue = Chain.getValue(1);
// When performing tail call optimization the callee pops its arguments off
// the stack. Account for this here so these bytes can be pushed back on in
// PPCFrameLowering::eliminateCallFramePseudoInstr.
- int BytesCalleePops = (CallConv == CallingConv::Fast &&
+ int BytesCalleePops = (CFlags.CallConv == CallingConv::Fast &&
getTargetMachine().Options.GuaranteedTailCallOpt)
? NumBytes
: 0;
@@ -5483,7 +5667,8 @@ SDValue PPCTargetLowering::FinishCall(
Glue, dl);
Glue = Chain.getValue(1);
- return LowerCallResult(Chain, Glue, CallConv, isVarArg, Ins, dl, DAG, InVals);
+ return LowerCallResult(Chain, Glue, CFlags.CallConv, CFlags.IsVarArg, Ins, dl,
+ DAG, InVals);
}
SDValue
@@ -5500,15 +5685,14 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
CallingConv::ID CallConv = CLI.CallConv;
bool isVarArg = CLI.IsVarArg;
bool isPatchPoint = CLI.IsPatchPoint;
- ImmutableCallSite CS = CLI.CS;
+ const CallBase *CB = CLI.CB;
if (isTailCall) {
- if (Subtarget.useLongCalls() && !(CS && CS.isMustTailCall()))
+ if (Subtarget.useLongCalls() && !(CB && CB->isMustTailCall()))
isTailCall = false;
else if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
- isTailCall =
- IsEligibleForTailCallOptimization_64SVR4(Callee, CallConv, CS,
- isVarArg, Outs, Ins, DAG);
+ isTailCall = IsEligibleForTailCallOptimization_64SVR4(
+ Callee, CallConv, CB, isVarArg, Outs, Ins, DAG);
else
isTailCall = IsEligibleForTailCallOptimization(Callee, CallConv, isVarArg,
Ins, DAG);
@@ -5517,21 +5701,23 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (!getTargetMachine().Options.GuaranteedTailCallOpt)
++NumSiblingCalls;
- assert(isa<GlobalAddressSDNode>(Callee) &&
+ // PC Relative calls no longer guarantee that the callee is a Global
+ // Address Node. The callee could be an indirect tail call in which
+ // case the SDValue for the callee could be a load (to load the address
+ // of a function pointer) or it may be a register copy (to move the
+ // address of the callee from a function parameter into a virtual
+ // register). It may also be an ExternalSymbolSDNode (e.g. memcpy).
+ assert((Subtarget.isUsingPCRelativeCalls() ||
+ isa<GlobalAddressSDNode>(Callee)) &&
"Callee should be an llvm::Function object.");
- LLVM_DEBUG(
- const GlobalValue *GV =
- cast<GlobalAddressSDNode>(Callee)->getGlobal();
- const unsigned Width =
- 80 - strlen("TCO caller: ") - strlen(", callee linkage: 0, 0");
- dbgs() << "TCO caller: "
- << left_justify(DAG.getMachineFunction().getName(), Width)
- << ", callee linkage: " << GV->getVisibility() << ", "
- << GV->getLinkage() << "\n");
+
+ LLVM_DEBUG(dbgs() << "TCO caller: " << DAG.getMachineFunction().getName()
+ << "\nTCO callee: ");
+ LLVM_DEBUG(Callee.dump());
}
}
- if (!isTailCall && CS && CS.isMustTailCall())
+ if (!isTailCall && CB && CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -5542,42 +5728,49 @@ PPCTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
!isTailCall)
Callee = LowerGlobalAddress(Callee, DAG);
+ CallFlags CFlags(
+ CallConv, isTailCall, isVarArg, isPatchPoint,
+ isIndirectCall(Callee, DAG, Subtarget, isPatchPoint),
+ // hasNest
+ Subtarget.is64BitELFABI() &&
+ any_of(Outs, [](ISD::OutputArg Arg) { return Arg.Flags.isNest(); }),
+ CLI.NoMerge);
+
if (Subtarget.isSVR4ABI() && Subtarget.isPPC64())
- return LowerCall_64SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_64SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
if (Subtarget.isSVR4ABI())
- return LowerCall_32SVR4(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_32SVR4(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
if (Subtarget.isAIXABI())
- return LowerCall_AIX(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_AIX(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
- return LowerCall_Darwin(Chain, Callee, CallConv, isVarArg,
- isTailCall, isPatchPoint, Outs, OutVals, Ins,
- dl, DAG, InVals, CS);
+ return LowerCall_Darwin(Chain, Callee, CFlags, Outs, OutVals, Ins, dl, DAG,
+ InVals, CB);
}
SDValue PPCTargetLowering::LowerCall_32SVR4(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
// See PPCTargetLowering::LowerFormalArguments_32SVR4() for a description
// of the 32-bit SVR4 ABI stack frame layout.
+ const CallingConv::ID CallConv = CFlags.CallConv;
+ const bool IsVarArg = CFlags.IsVarArg;
+ const bool IsTailCall = CFlags.IsTailCall;
+
assert((CallConv == CallingConv::C ||
CallConv == CallingConv::Cold ||
CallConv == CallingConv::Fast) && "Unknown calling convention!");
- unsigned PtrByteSize = 4;
+ const Align PtrAlign(4);
MachineFunction &MF = DAG.getMachineFunction();
@@ -5596,15 +5789,15 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Assign locations to all of the outgoing arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- PPCCCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ PPCCCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Reserve space for the linkage area on the stack.
CCInfo.AllocateStack(Subtarget.getFrameLowering()->getLinkageSize(),
- PtrByteSize);
+ PtrAlign);
if (useSoftFloat())
CCInfo.PreAnalyzeCallOperands(Outs);
- if (isVarArg) {
+ if (IsVarArg) {
// Handle fixed and variable vector arguments differently.
// Fixed vector arguments go into registers as long as registers are
// available. Variable vector arguments always go into memory.
@@ -5639,10 +5832,10 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Assign locations to all of the outgoing aggregate by value arguments.
SmallVector<CCValAssign, 16> ByValArgLocs;
- CCState CCByValInfo(CallConv, isVarArg, MF, ByValArgLocs, *DAG.getContext());
+ CCState CCByValInfo(CallConv, IsVarArg, MF, ByValArgLocs, *DAG.getContext());
// Reserve stack space for the allocations in CCInfo.
- CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrByteSize);
+ CCByValInfo.AllocateStack(CCInfo.getNextStackOffset(), PtrAlign);
CCByValInfo.AnalyzeCallOperands(Outs, CC_PPC32_SVR4_ByVal);
@@ -5653,7 +5846,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
- int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ int SPDiff = CalculateTailCallSPDiff(DAG, IsTailCall, NumBytes);
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass
@@ -5749,7 +5942,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
assert(VA.isMemLoc());
unsigned LocMemOffset = VA.getLocMemOffset();
- if (!isTailCall) {
+ if (!IsTailCall) {
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(MF.getDataLayout()),
StackPtr, PtrOff);
@@ -5778,7 +5971,7 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
// Set CR bit 6 to true if this is a vararg call with floating args passed in
// registers.
- if (isVarArg) {
+ if (IsVarArg) {
SDVTList VTs = DAG.getVTList(MVT::Other, MVT::Glue);
SDValue Ops[] = { Chain, InFlag };
@@ -5788,14 +5981,12 @@ SDValue PPCTargetLowering::LowerCall_32SVR4(
InFlag = Chain.getValue(1);
}
- if (isTailCall)
+ if (IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- /* unused except on PPC64 ELFv1 */ false, DAG,
- RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
- NumBytes, Ins, InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
// Copy an argument into memory, being careful to do this outside the
@@ -5816,25 +6007,24 @@ SDValue PPCTargetLowering::createMemcpyOutsideCallSeq(
}
SDValue PPCTargetLowering::LowerCall_64SVR4(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
bool isELFv2ABI = Subtarget.isELFv2ABI();
bool isLittleEndian = Subtarget.isLittleEndian();
unsigned NumOps = Outs.size();
- bool hasNest = false;
bool IsSibCall = false;
+ bool IsFastCall = CFlags.CallConv == CallingConv::Fast;
EVT PtrVT = getPointerTy(DAG.getDataLayout());
unsigned PtrByteSize = 8;
MachineFunction &MF = DAG.getMachineFunction();
- if (isTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
+ if (CFlags.IsTailCall && !getTargetMachine().Options.GuaranteedTailCallOpt)
IsSibCall = true;
// Mark this function as potentially containing a function that contains a
@@ -5842,11 +6032,10 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// and restoring the callers stack pointer in this functions epilog. This is
// done because by tail calling the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
- if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
- assert(!(CallConv == CallingConv::Fast && isVarArg) &&
+ assert(!(IsFastCall && CFlags.IsVarArg) &&
"fastcc not supported on varargs functions");
// Count how many bytes are to be pushed on the stack, including the linkage
@@ -5876,7 +6065,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// can be passed to the callee in registers.
// For the fast calling convention, there is another check below.
// Note: We should keep consistent with LowerFormalArguments_64SVR4()
- bool HasParameterArea = !isELFv2ABI || isVarArg || CallConv == CallingConv::Fast;
+ bool HasParameterArea = !isELFv2ABI || CFlags.IsVarArg || IsFastCall;
if (!HasParameterArea) {
unsigned ParamAreaSize = NumGPRs * PtrByteSize;
unsigned AvailableFPRs = NumFPRs;
@@ -5898,7 +6087,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Avoid allocating parameter area for fastcc functions if all the arguments
// can be passed in the registers.
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
HasParameterArea = false;
// Add up all the space actually used.
@@ -5910,7 +6099,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (Flags.isNest())
continue;
- if (CallConv == CallingConv::Fast) {
+ if (IsFastCall) {
if (Flags.isByVal()) {
NumGPRsUsed += (Flags.getByValSize()+7)/8;
if (NumGPRsUsed > NumGPRs)
@@ -5958,9 +6147,9 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
}
/* Respect alignment of argument on the stack. */
- unsigned Align =
- CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
- NumBytes = ((NumBytes + Align - 1) / Align) * Align;
+ auto Alignment =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ NumBytes = alignTo(NumBytes, Alignment);
NumBytes += CalculateStackSlotSize(ArgVT, Flags, PtrByteSize);
if (Flags.isInConsecutiveRegsLast())
@@ -5983,8 +6172,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
NumBytes = LinkageSize;
// Tail call needs the stack to be aligned.
- if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ if (getTargetMachine().Options.GuaranteedTailCallOpt && IsFastCall)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
int SPDiff = 0;
@@ -5992,11 +6180,11 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
if (!IsSibCall)
- SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
@@ -6040,16 +6228,16 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// we'll actually use a stack slot.
auto ComputePtrOff = [&]() {
/* Respect alignment of argument on the stack. */
- unsigned Align =
- CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
- ArgOffset = ((ArgOffset + Align - 1) / Align) * Align;
+ auto Alignment =
+ CalculateStackSlotAlignment(ArgVT, OrigVT, Flags, PtrByteSize);
+ ArgOffset = alignTo(ArgOffset, Alignment);
PtrOff = DAG.getConstant(ArgOffset, dl, StackPtr.getValueType());
PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
};
- if (CallConv != CallingConv::Fast) {
+ if (!IsFastCall) {
ComputePtrOff();
/* Compute GPR index associated with argument offset. */
@@ -6080,7 +6268,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (Size == 0)
continue;
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
// All aggregates smaller than 8 bytes must be passed right-justified.
@@ -6185,7 +6373,6 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (Flags.isNest()) {
// The 'nest' parameter, if any, is passed in R11.
RegsToPass.push_back(std::make_pair(PPC::X11, Arg));
- hasNest = true;
break;
}
@@ -6195,18 +6382,18 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (GPR_idx != NumGPRs) {
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, false, MemOpChains,
+ true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ArgOffset += PtrByteSize;
}
- if (CallConv != CallingConv::Fast)
+ if (!IsFastCall)
ArgOffset += PtrByteSize;
break;
case MVT::f32:
@@ -6220,7 +6407,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Unnamed arguments for vararg functions always go to GPRs and
// then the parameter save area. For now, put all arguments to vararg
// routines always in both locations (FPR *and* GPR or stack slot).
- bool NeedGPROrStack = isVarArg || FPR_idx == NumFPRs;
+ bool NeedGPROrStack = CFlags.IsVarArg || FPR_idx == NumFPRs;
bool NeededLoad = false;
// First load the argument into the next available FPR.
@@ -6230,7 +6417,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Next, load the argument into GPR or stack slot if needed.
if (!NeedGPROrStack)
;
- else if (GPR_idx != NumGPRs && CallConv != CallingConv::Fast) {
+ else if (GPR_idx != NumGPRs && !IsFastCall) {
// FIXME: We may want to re-enable this for CallingConv::Fast on the P8
// once we support fp <-> gpr moves.
@@ -6274,7 +6461,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (ArgVal.getNode())
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], ArgVal));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
// Single-precision floating-point values are mapped to the
@@ -6288,7 +6475,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, false, MemOpChains,
+ true, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
NeededLoad = true;
@@ -6296,7 +6483,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// When passing an array of floats, the array occupies consecutive
// space in the argument area; only round up to the next doubleword
// at the end of the array. Otherwise, each float takes 8 bytes.
- if (CallConv != CallingConv::Fast || NeededLoad) {
+ if (!IsFastCall || NeededLoad) {
ArgOffset += (Arg.getValueType() == MVT::f32 &&
Flags.isInConsecutiveRegs()) ? 4 : 8;
if (Flags.isInConsecutiveRegsLast())
@@ -6321,7 +6508,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// usual; unnamed arguments always go to the stack or the corresponding
// GPRs when within range. For now, we always put the value in both
// locations (or even all three).
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
@@ -6353,19 +6540,19 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (VR_idx != NumVRs) {
RegsToPass.push_back(std::make_pair(VR[VR_idx++], Arg));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, true, MemOpChains,
+ true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ArgOffset += 16;
}
- if (CallConv != CallingConv::Fast)
+ if (!IsFastCall)
ArgOffset += 16;
break;
} // not QPX
@@ -6377,7 +6564,7 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
case MVT::v4f64:
case MVT::v4i1: {
bool IsF32 = Arg.getValueType().getSimpleVT().SimpleTy == MVT::v4f32;
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
assert(HasParameterArea &&
"Parameter area must exist if we have a varargs call.");
// We could elide this store in the case where the object fits
@@ -6409,19 +6596,19 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
if (QFPR_idx != NumQFPRs) {
RegsToPass.push_back(std::make_pair(QFPR[QFPR_idx++], Arg));
} else {
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ComputePtrOff();
assert(HasParameterArea &&
"Parameter area must exist to pass an argument in memory.");
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- true, isTailCall, true, MemOpChains,
+ true, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
- if (CallConv == CallingConv::Fast)
+ if (IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
}
- if (CallConv != CallingConv::Fast)
+ if (!IsFastCall)
ArgOffset += (IsF32 ? 16 : 32);
break;
}
@@ -6438,23 +6625,26 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
// Check if this is an indirect call (MTCTR/BCTRL).
// See prepareDescriptorIndirectCall and buildCallOperands for more
// information about calls through function pointers in the 64-bit SVR4 ABI.
- if (!isTailCall && !isPatchPoint &&
- !isFunctionGlobalAddress(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee)) {
- // Load r2 into a virtual register and store it to the TOC save area.
- setUsesTOCBasePtr(DAG);
- SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
- // TOC save area offset.
- unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
- SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
- SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
- Chain = DAG.getStore(
- Val.getValue(1), dl, Val, AddPtr,
- MachinePointerInfo::getStack(DAG.getMachineFunction(), TOCSaveOffset));
+ if (CFlags.IsIndirect) {
+ // For 64-bit ELFv2 ABI with PCRel, do not save the TOC of the
+ // caller in the TOC save area.
+ if (isTOCSaveRestoreRequired(Subtarget)) {
+ assert(!CFlags.IsTailCall && "Indirect tail calls not supported");
+ // Load r2 into a virtual register and store it to the TOC save area.
+ setUsesTOCBasePtr(DAG);
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, PPC::X2, MVT::i64);
+ // TOC save area offset.
+ unsigned TOCSaveOffset = Subtarget.getFrameLowering()->getTOCSaveOffset();
+ SDValue PtrOff = DAG.getIntPtrConstant(TOCSaveOffset, dl);
+ SDValue AddPtr = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ Chain = DAG.getStore(Val.getValue(1), dl, Val, AddPtr,
+ MachinePointerInfo::getStack(
+ DAG.getMachineFunction(), TOCSaveOffset));
+ }
// In the ELFv2 ABI, R12 must contain the address of an indirect callee.
// This does not mean the MTCTR instruction must use R12; it's easier
// to model this as an extra parameter, so do that.
- if (isELFv2ABI && !isPatchPoint)
+ if (isELFv2ABI && !CFlags.IsPatchPoint)
RegsToPass.push_back(std::make_pair((unsigned)PPC::X12, Callee));
}
@@ -6467,23 +6657,21 @@ SDValue PPCTargetLowering::LowerCall_64SVR4(
InFlag = Chain.getValue(1);
}
- if (isTailCall && !IsSibCall)
+ if (CFlags.IsTailCall && !IsSibCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint, hasNest,
- DAG, RegsToPass, InFlag, Chain, CallSeqStart, Callee,
- SPDiff, NumBytes, Ins, InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
SDValue PPCTargetLowering::LowerCall_Darwin(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
unsigned NumOps = Outs.size();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -6498,7 +6686,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// done because by tail calling the called function might overwrite the value
// in this function's (MF) stack pointer stack slot 0(SP).
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ CFlags.CallConv == CallingConv::Fast)
MF.getInfo<PPCFunctionInfo>()->setHasFastCall();
// Count how many bytes are to be pushed on the stack, including the linkage
@@ -6521,7 +6709,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
if (ArgVT == MVT::v4f32 || ArgVT == MVT::v4i32 ||
ArgVT == MVT::v8i16 || ArgVT == MVT::v16i8 ||
ArgVT == MVT::v2f64 || ArgVT == MVT::v2i64) {
- if (!isVarArg && !isPPC64) {
+ if (!CFlags.IsVarArg && !isPPC64) {
// Non-varargs Altivec parameters go after all the non-Altivec
// parameters; handle those later so we know how much padding we need.
nAltivecParamsAtEnd++;
@@ -6548,16 +6736,16 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// Tail call needs the stack to be aligned.
if (getTargetMachine().Options.GuaranteedTailCallOpt &&
- CallConv == CallingConv::Fast)
+ CFlags.CallConv == CallingConv::Fast)
NumBytes = EnsureStackAlignment(Subtarget.getFrameLowering(), NumBytes);
// Calculate by how many bytes the stack has to be adjusted in case of tail
// call optimization.
- int SPDiff = CalculateTailCallSPDiff(DAG, isTailCall, NumBytes);
+ int SPDiff = CalculateTailCallSPDiff(DAG, CFlags.IsTailCall, NumBytes);
// To protect arguments on the stack from being clobbered in a tail call,
// force all the loads to happen before doing any other lowering.
- if (isTailCall)
+ if (CFlags.IsTailCall)
Chain = DAG.getStackArgumentTokenFactor(Chain);
// Adjust the stack pointer for the new arguments...
@@ -6693,7 +6881,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
RegsToPass.push_back(std::make_pair(GPR[GPR_idx++], Arg));
} else {
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, false, MemOpChains,
+ isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
}
ArgOffset += PtrByteSize;
@@ -6703,7 +6891,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
if (FPR_idx != NumFPRs) {
RegsToPass.push_back(std::make_pair(FPR[FPR_idx++], Arg));
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
SDValue Store =
DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo());
MemOpChains.push_back(Store);
@@ -6735,7 +6923,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
}
} else
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, false, MemOpChains,
+ isPPC64, CFlags.IsTailCall, false, MemOpChains,
TailCallArguments, dl);
if (isPPC64)
ArgOffset += 8;
@@ -6746,7 +6934,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
case MVT::v4i32:
case MVT::v8i16:
case MVT::v16i8:
- if (isVarArg) {
+ if (CFlags.IsVarArg) {
// These go aligned on the stack, or in the corresponding R registers
// when within range. The Darwin PPC ABI doc claims they also go in
// V registers; in fact gcc does this only for arguments that are
@@ -6792,7 +6980,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
} else if (nAltivecParamsAtEnd==0) {
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, true, MemOpChains,
+ isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
@@ -6804,7 +6992,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// don't track this here because nobody below needs it.
// If there are more Altivec parameters than fit in registers emit
// the stores here.
- if (!isVarArg && nAltivecParamsAtEnd > NumVRs) {
+ if (!CFlags.IsVarArg && nAltivecParamsAtEnd > NumVRs) {
unsigned j = 0;
// Offset is aligned; skip 1st 12 params which go in V registers.
ArgOffset = ((ArgOffset+15)/16)*16;
@@ -6818,7 +7006,7 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
SDValue PtrOff;
// We are emitting Altivec params in order.
LowerMemOpCallTo(DAG, MF, Chain, Arg, PtrOff, SPDiff, ArgOffset,
- isPPC64, isTailCall, true, MemOpChains,
+ isPPC64, CFlags.IsTailCall, true, MemOpChains,
TailCallArguments, dl);
ArgOffset += 16;
}
@@ -6832,12 +7020,11 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
// On Darwin, R12 must contain the address of an indirect callee. This does
// not mean the MTCTR instruction must use R12; it's easier to model this as
// an extra parameter, so do that.
- if (!isTailCall &&
- !isFunctionGlobalAddress(Callee) &&
- !isa<ExternalSymbolSDNode>(Callee) &&
- !isBLACompatibleAddress(Callee, DAG))
+ if (CFlags.IsIndirect) {
+ assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
RegsToPass.push_back(std::make_pair((unsigned)(isPPC64 ? PPC::X12 :
PPC::R12), Callee));
+ }
// Build a sequence of copy-to-reg nodes chained together with token chain
// and flag operands which copy the outgoing args into the appropriate regs.
@@ -6848,37 +7035,37 @@ SDValue PPCTargetLowering::LowerCall_Darwin(
InFlag = Chain.getValue(1);
}
- if (isTailCall)
+ if (CFlags.IsTailCall)
PrepareTailCall(DAG, InFlag, Chain, dl, SPDiff, NumBytes, LROp, FPOp,
TailCallArguments);
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- /* unused except on PPC64 ELFv1 */ false, DAG,
- RegsToPass, InFlag, Chain, CallSeqStart, Callee, SPDiff,
- NumBytes, Ins, InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
CCValAssign::LocInfo LocInfo, ISD::ArgFlagsTy ArgFlags,
CCState &State) {
+ const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
+ State.getMachineFunction().getSubtarget());
+ const bool IsPPC64 = Subtarget.isPPC64();
+ const Align PtrAlign = IsPPC64 ? Align(8) : Align(4);
+ const MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
+
+ assert((!ValVT.isInteger() ||
+ (ValVT.getSizeInBits() <= RegVT.getSizeInBits())) &&
+ "Integer argument exceeds register size: should have been legalized");
+
if (ValVT == MVT::f128)
report_fatal_error("f128 is unimplemented on AIX.");
- if (ArgFlags.isByVal())
- report_fatal_error("Passing structure by value is unimplemented.");
-
if (ArgFlags.isNest())
report_fatal_error("Nest arguments are unimplemented.");
if (ValVT.isVector() || LocVT.isVector())
report_fatal_error("Vector arguments are unimplemented on AIX.");
- const PPCSubtarget &Subtarget = static_cast<const PPCSubtarget &>(
- State.getMachineFunction().getSubtarget());
- const bool IsPPC64 = Subtarget.isPPC64();
- const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
-
static const MCPhysReg GPR_32[] = {// 32-bit registers.
PPC::R3, PPC::R4, PPC::R5, PPC::R6,
PPC::R7, PPC::R8, PPC::R9, PPC::R10};
@@ -6886,6 +7073,38 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
PPC::X3, PPC::X4, PPC::X5, PPC::X6,
PPC::X7, PPC::X8, PPC::X9, PPC::X10};
+ if (ArgFlags.isByVal()) {
+ if (ArgFlags.getNonZeroByValAlign() > PtrAlign)
+ report_fatal_error("Pass-by-value arguments with alignment greater than "
+ "register width are not supported.");
+
+ const unsigned ByValSize = ArgFlags.getByValSize();
+
+ // An empty aggregate parameter takes up no storage and no registers,
+ // but needs a MemLoc so the formal-arguments side can create a stack slot.
+ if (ByValSize == 0) {
+ State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
+ State.getNextStackOffset(), RegVT,
+ LocInfo));
+ return false;
+ }
+
+ const unsigned StackSize = alignTo(ByValSize, PtrAlign);
+ unsigned Offset = State.AllocateStack(StackSize, PtrAlign);
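+ // Shadow the aggregate in as many GPRs as are still available; the first
+ // chunk that does not fit in a register gets a single MemLoc, and the
+ // remaining bytes are covered by the stack space reserved above.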
+ for (const unsigned E = Offset + StackSize; Offset < E;
+ Offset += PtrAlign.value()) {
+ if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
+ else {
+ State.addLoc(CCValAssign::getMem(ValNo, MVT::INVALID_SIMPLE_VALUE_TYPE,
+ Offset, MVT::INVALID_SIMPLE_VALUE_TYPE,
+ LocInfo));
+ break;
+ }
+ }
+ return false;
+ }
+
// Arguments always reserve parameter save area.
switch (ValVT.SimpleTy) {
default:
@@ -6895,49 +7114,55 @@ static bool CC_AIX(unsigned ValNo, MVT ValVT, MVT LocVT,
assert(IsPPC64 && "PPC32 should have split i64 values.");
LLVM_FALLTHROUGH;
case MVT::i1:
- case MVT::i32:
- State.AllocateStack(PtrByteSize, PtrByteSize);
- if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
- MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
- // Promote integers if needed.
- if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
- LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
- : CCValAssign::LocInfo::ZExt;
+ case MVT::i32: {
+ const unsigned Offset = State.AllocateStack(PtrAlign.value(), PtrAlign);
+ // AIX integer arguments are always passed in register width.
+ if (ValVT.getSizeInBits() < RegVT.getSizeInBits())
+ LocInfo = ArgFlags.isSExt() ? CCValAssign::LocInfo::SExt
+ : CCValAssign::LocInfo::ZExt;
+ if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32))
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, RegVT, LocInfo));
- }
else
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
- return false;
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, RegVT, LocInfo));
+ return false;
+ }
case MVT::f32:
case MVT::f64: {
// Parameter save area (PSA) is reserved even if the float passes in fpr.
const unsigned StoreSize = LocVT.getStoreSize();
// Floats are always 4-byte aligned in the PSA on AIX.
// This includes f64 in 64-bit mode for ABI compatibility.
- State.AllocateStack(IsPPC64 ? 8 : StoreSize, 4);
- if (unsigned Reg = State.AllocateReg(FPR))
- State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
- else
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
-
- // AIX requires that GPRs are reserved for float arguments.
- // Successfully reserved GPRs are only initialized for vararg calls.
- MVT RegVT = IsPPC64 ? MVT::i64 : MVT::i32;
- for (unsigned I = 0; I < StoreSize; I += PtrByteSize) {
+ const unsigned Offset =
+ State.AllocateStack(IsPPC64 ? 8 : StoreSize, Align(4));
+ unsigned FReg = State.AllocateReg(FPR);
+ if (FReg)
+ State.addLoc(CCValAssign::getReg(ValNo, ValVT, FReg, LocVT, LocInfo));
+
+ // Reserve and initialize GPRs or initialize the PSA as required.
+ for (unsigned I = 0; I < StoreSize; I += PtrAlign.value()) {
if (unsigned Reg = State.AllocateReg(IsPPC64 ? GPR_64 : GPR_32)) {
+ assert(FReg && "An FPR should be available when a GPR is reserved.");
if (State.isVarArg()) {
+ // Successfully reserved GPRs are only initialized for vararg calls.
// Custom handling is required for:
// f64 in PPC32 needs to be split into 2 GPRs.
// f32 in PPC64 needs to occupy only lower 32 bits of 64-bit GPR.
State.addLoc(
CCValAssign::getCustomReg(ValNo, ValVT, Reg, RegVT, LocInfo));
}
- } else if (State.isVarArg()) {
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
+ } else {
+ // If there are insufficient GPRs, the PSA needs to be initialized.
+ // For compatibility with the AIX XL compiler, the memory is initialized
+ // even if the argument was assigned an FPR, and the full memory for the
+ // argument is initialized even if a prior word was saved in a GPR.
+ // A custom MemLoc is used when the argument also passes in an FPR so
+ // that the callee handling can skip over it easily.
+ State.addLoc(
+ FReg ? CCValAssign::getCustomMem(ValNo, ValVT, Offset, LocVT,
+ LocInfo)
+ : CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ break;
}
}
@@ -6982,6 +7207,64 @@ static SDValue truncateScalarIntegerArg(ISD::ArgFlagsTy Flags, EVT ValVT,
return DAG.getNode(ISD::TRUNCATE, dl, ValVT, ArgValue);
}
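+// Returns the offset of the parameter save area word that shadows the given
+// GPR argument register: the linkage area size plus the register's index
+// (relative to R3/X3) times the register width.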
+static unsigned mapArgRegToOffsetAIX(unsigned Reg, const PPCFrameLowering *FL) {
+ const unsigned LASize = FL->getLinkageSize();
+
+ if (PPC::GPRCRegClass.contains(Reg)) {
+ assert(Reg >= PPC::R3 && Reg <= PPC::R10 &&
+ "Reg must be a valid argument register!");
+ return LASize + 4 * (Reg - PPC::R3);
+ }
+
+ if (PPC::G8RCRegClass.contains(Reg)) {
+ assert(Reg >= PPC::X3 && Reg <= PPC::X10 &&
+ "Reg must be a valid argument register!");
+ return LASize + 8 * (Reg - PPC::X3);
+ }
+
+ llvm_unreachable("Only general purpose registers expected.");
+}
+
+// AIX ABI Stack Frame Layout:
+//
+// Low Memory +--------------------------------------------+
+// SP +---> | Back chain | ---+
+// | +--------------------------------------------+ |
+// | | Saved Condition Register | |
+// | +--------------------------------------------+ |
+// | | Saved Linkage Register | |
+// | +--------------------------------------------+ | Linkage Area
+// | | Reserved for compilers | |
+// | +--------------------------------------------+ |
+// | | Reserved for binders | |
+// | +--------------------------------------------+ |
+// | | Saved TOC pointer | ---+
+// | +--------------------------------------------+
+// | | Parameter save area |
+// | +--------------------------------------------+
+// | | Alloca space |
+// | +--------------------------------------------+
+// | | Local variable space |
+// | +--------------------------------------------+
+// | | Float/int conversion temporary |
+// | +--------------------------------------------+
+// | | Save area for AltiVec registers |
+// | +--------------------------------------------+
+// | | AltiVec alignment padding |
+// | +--------------------------------------------+
+// | | Save area for VRSAVE register |
+// | +--------------------------------------------+
+// | | Save area for General Purpose registers |
+// | +--------------------------------------------+
+// | | Save area for Floating Point registers |
+// | +--------------------------------------------+
+// +---- | Back chain |
+// High Memory +--------------------------------------------+
+//
+// Specifications:
+// AIX 7.2 Assembler Language Reference
+// Subroutine linkage convention
+
SDValue PPCTargetLowering::LowerFormalArguments_AIX(
SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
@@ -6991,9 +7274,6 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
CallConv == CallingConv::Fast) &&
"Unexpected calling convention!");
- if (isVarArg)
- report_fatal_error("This call type is unimplemented on AIX.");
-
if (getTargetMachine().Options.GuaranteedTailCallOpt)
report_fatal_error("Tail call support is unimplemented on AIX.");
@@ -7011,67 +7291,214 @@ SDValue PPCTargetLowering::LowerFormalArguments_AIX(
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
MachineFunction &MF = DAG.getMachineFunction();
+ MachineFrameInfo &MFI = MF.getFrameInfo();
CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ const EVT PtrVT = getPointerTy(MF.getDataLayout());
// Reserve space for the linkage area on the stack.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
- // On AIX a minimum of 8 words is saved to the parameter save area.
- const unsigned MinParameterSaveArea = 8 * PtrByteSize;
- CCInfo.AllocateStack(LinkageSize + MinParameterSaveArea, PtrByteSize);
+ CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeFormalArguments(Ins, CC_AIX);
- for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
- CCValAssign &VA = ArgLocs[i];
- SDValue ArgValue;
- ISD::ArgFlagsTy Flags = Ins[i].Flags;
- if (VA.isRegLoc()) {
- EVT ValVT = VA.getValVT();
- MVT LocVT = VA.getLocVT();
+ SmallVector<SDValue, 8> MemOps;
+
+ for (size_t I = 0, End = ArgLocs.size(); I != End; /* No increment here */) {
+ CCValAssign &VA = ArgLocs[I++];
+ MVT LocVT = VA.getLocVT();
+ ISD::ArgFlagsTy Flags = Ins[VA.getValNo()].Flags;
+
+ // For compatibility with the AIX XL compiler, the float args in the
+ // parameter save area are initialized even if the argument is available
+ // in a register. The caller is required to initialize both the register
+ // and memory; however, the callee can choose to expect it in either.
+ // The memloc is dismissed here because the argument is retrieved from
+ // the register.
+ if (VA.isMemLoc() && VA.needsCustom())
+ continue;
+
+ if (Flags.isByVal() && VA.isMemLoc()) {
+ const unsigned Size =
+ alignTo(Flags.getByValSize() ? Flags.getByValSize() : PtrByteSize,
+ PtrByteSize);
+ const int FI = MF.getFrameInfo().CreateFixedObject(
+ Size, VA.getLocMemOffset(), /* IsImmutable */ false,
+ /* IsAliased */ true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+
+ continue;
+ }
+
+ if (Flags.isByVal()) {
+ assert(VA.isRegLoc() && "MemLocs should already be handled.");
+
+ const MCPhysReg ArgReg = VA.getLocReg();
+ const PPCFrameLowering *FL = Subtarget.getFrameLowering();
+
+ if (Flags.getNonZeroByValAlign() > PtrByteSize)
+ report_fatal_error("Over aligned byvals not supported yet.");
+
+ const unsigned StackSize = alignTo(Flags.getByValSize(), PtrByteSize);
+ const int FI = MF.getFrameInfo().CreateFixedObject(
+ StackSize, mapArgRegToOffsetAIX(ArgReg, FL), /* IsImmutable */ false,
+ /* IsAliased */ true);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ InVals.push_back(FIN);
+
+ // Add live ins for all the RegLocs for the same ByVal.
+ const TargetRegisterClass *RegClass =
+ IsPPC64 ? &PPC::G8RCRegClass : &PPC::GPRCRegClass;
+
+ auto HandleRegLoc = [&, RegClass, LocVT](const MCPhysReg PhysReg,
+ unsigned Offset) {
+ const unsigned VReg = MF.addLiveIn(PhysReg, RegClass);
+ // Since the caller's side has left-justified the aggregate in the
+ // register, we can simply store the entire register into the stack
+ // slot.
+ SDValue CopyFrom = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+ // The store to the fixed-stack object is needed because accessing a
+ // field of the ByVal will use a GEP and load. Ideally we would extract
+ // the value from the register directly and elide the stores when the
+ // argument's address is not taken, but that is left as future work.
+ SDValue Store =
+ DAG.getStore(CopyFrom.getValue(1), dl, CopyFrom,
+ DAG.getObjectPtrOffset(dl, FIN, Offset),
+ MachinePointerInfo::getFixedStack(MF, FI, Offset));
+
+ MemOps.push_back(Store);
+ };
+
+ unsigned Offset = 0;
+ HandleRegLoc(VA.getLocReg(), Offset);
+ Offset += PtrByteSize;
+ for (; Offset != StackSize && ArgLocs[I].isRegLoc();
+ Offset += PtrByteSize) {
+ assert(ArgLocs[I].getValNo() == VA.getValNo() &&
+ "RegLocs should be for ByVal argument.");
+
+ const CCValAssign RL = ArgLocs[I++];
+ HandleRegLoc(RL.getLocReg(), Offset);
+ }
+
+ if (Offset != StackSize) {
+ assert(ArgLocs[I].getValNo() == VA.getValNo() &&
+ "Expected MemLoc for remaining bytes.");
+ assert(ArgLocs[I].isMemLoc() && "Expected MemLoc for remaining bytes.");
+ // Consume the MemLoc. The InVal has already been emitted, so nothing
+ // more needs to be done.
+ ++I;
+ }
+
+ continue;
+ }
+
+ EVT ValVT = VA.getValVT();
+ if (VA.isRegLoc() && !VA.needsCustom()) {
MVT::SimpleValueType SVT = ValVT.getSimpleVT().SimpleTy;
unsigned VReg =
MF.addLiveIn(VA.getLocReg(), getRegClassForSVT(SVT, IsPPC64));
- ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
+ SDValue ArgValue = DAG.getCopyFromReg(Chain, dl, VReg, LocVT);
if (ValVT.isScalarInteger() &&
(ValVT.getSizeInBits() < LocVT.getSizeInBits())) {
ArgValue =
truncateScalarIntegerArg(Flags, ValVT, DAG, ArgValue, LocVT, dl);
}
InVals.push_back(ArgValue);
- } else {
- report_fatal_error("Handling of formal arguments on the stack is "
- "unimplemented!");
+ continue;
+ }
+ if (VA.isMemLoc()) {
+ const unsigned LocSize = LocVT.getStoreSize();
+ const unsigned ValSize = ValVT.getStoreSize();
+ assert((ValSize <= LocSize) &&
+ "Object size is larger than size of MemLoc");
+ int CurArgOffset = VA.getLocMemOffset();
+ // Objects are right-justified because AIX is big-endian.
+ if (LocSize > ValSize)
+ CurArgOffset += LocSize - ValSize;
+ // Potential tail calls could cause overwriting of argument stack slots.
+ const bool IsImmutable =
+ !(getTargetMachine().Options.GuaranteedTailCallOpt &&
+ (CallConv == CallingConv::Fast));
+ int FI = MFI.CreateFixedObject(ValSize, CurArgOffset, IsImmutable);
+ SDValue FIN = DAG.getFrameIndex(FI, PtrVT);
+ SDValue ArgValue =
+ DAG.getLoad(ValVT, dl, Chain, FIN, MachinePointerInfo());
+ InVals.push_back(ArgValue);
+ continue;
}
}
+ // On AIX a minimum of 8 words is saved to the parameter save area.
+ const unsigned MinParameterSaveArea = 8 * PtrByteSize;
// Area that is at least reserved in the caller of this function.
- unsigned MinReservedArea = CCInfo.getNextStackOffset();
+ unsigned CallerReservedArea =
+ std::max(CCInfo.getNextStackOffset(), LinkageSize + MinParameterSaveArea);
// Set the size that is at least reserved in caller of this function. Tail
// call optimized function's reserved stack space needs to be aligned so
// that taking the difference between two stack areas will result in an
// aligned stack.
- MinReservedArea =
- EnsureStackAlignment(Subtarget.getFrameLowering(), MinReservedArea);
+ CallerReservedArea =
+ EnsureStackAlignment(Subtarget.getFrameLowering(), CallerReservedArea);
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
- FuncInfo->setMinReservedArea(MinReservedArea);
+ FuncInfo->setMinReservedArea(CallerReservedArea);
+
+ if (isVarArg) {
+ FuncInfo->setVarArgsFrameIndex(
+ MFI.CreateFixedObject(PtrByteSize, CCInfo.getNextStackOffset(), true));
+ SDValue FIN = DAG.getFrameIndex(FuncInfo->getVarArgsFrameIndex(), PtrVT);
+
+ static const MCPhysReg GPR_32[] = {PPC::R3, PPC::R4, PPC::R5, PPC::R6,
+ PPC::R7, PPC::R8, PPC::R9, PPC::R10};
+
+ static const MCPhysReg GPR_64[] = {PPC::X3, PPC::X4, PPC::X5, PPC::X6,
+ PPC::X7, PPC::X8, PPC::X9, PPC::X10};
+ const unsigned NumGPArgRegs = array_lengthof(IsPPC64 ? GPR_64 : GPR_32);
+
+ // The fixed integer arguments of a variadic function are stored to the
+ // VarArgsFrameIndex on the stack so that they may be loaded by
+ // dereferencing the result of va_next.
+ for (unsigned GPRIndex =
+ (CCInfo.getNextStackOffset() - LinkageSize) / PtrByteSize;
+ GPRIndex < NumGPArgRegs; ++GPRIndex) {
+
+ const unsigned VReg =
+ IsPPC64 ? MF.addLiveIn(GPR_64[GPRIndex], &PPC::G8RCRegClass)
+ : MF.addLiveIn(GPR_32[GPRIndex], &PPC::GPRCRegClass);
+
+ SDValue Val = DAG.getCopyFromReg(Chain, dl, VReg, PtrVT);
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), dl, Val, FIN, MachinePointerInfo());
+ MemOps.push_back(Store);
+ // Increment the address for the next argument to store.
+ SDValue PtrOff = DAG.getConstant(PtrByteSize, dl, PtrVT);
+ FIN = DAG.getNode(ISD::ADD, dl, PtrOff.getValueType(), FIN, PtrOff);
+ }
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
return Chain;
}
SDValue PPCTargetLowering::LowerCall_AIX(
- SDValue Chain, SDValue Callee, CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const {
+ const CallBase *CB) const {
+ // See PPCTargetLowering::LowerFormalArguments_AIX() for a description of the
+ // AIX ABI stack frame layout.
- assert((CallConv == CallingConv::C ||
- CallConv == CallingConv::Cold ||
- CallConv == CallingConv::Fast) && "Unexpected calling convention!");
+ assert((CFlags.CallConv == CallingConv::C ||
+ CFlags.CallConv == CallingConv::Cold ||
+ CFlags.CallConv == CallingConv::Fast) &&
+ "Unexpected calling convention!");
- if (isPatchPoint)
+ if (CFlags.IsPatchPoint)
report_fatal_error("This call type is unimplemented on AIX.");
const PPCSubtarget& Subtarget =
@@ -7083,7 +7510,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
MachineFunction &MF = DAG.getMachineFunction();
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CFlags.CallConv, CFlags.IsVarArg, MF, ArgLocs,
+ *DAG.getContext());
// Reserve space for the linkage save area (LSA) on the stack.
// In both PPC32 and PPC64 there are 6 reserved slots in the LSA:
@@ -7091,8 +7519,9 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// The LSA is 24 bytes (6x4) in PPC32 and 48 bytes (6x8) in PPC64.
const unsigned LinkageSize = Subtarget.getFrameLowering()->getLinkageSize();
const bool IsPPC64 = Subtarget.isPPC64();
+ const EVT PtrVT = getPointerTy(DAG.getDataLayout());
const unsigned PtrByteSize = IsPPC64 ? 8 : 4;
- CCInfo.AllocateStack(LinkageSize, PtrByteSize);
+ CCInfo.AllocateStack(LinkageSize, Align(PtrByteSize));
CCInfo.AnalyzeCallOperands(Outs, CC_AIX);
// The prolog code of the callee may store up to 8 GPR argument registers to
@@ -7102,7 +7531,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
// conservatively assume that it is needed. As such, make sure we have at
// least enough stack space for the caller to store the 8 GPRs.
const unsigned MinParameterSaveAreaSize = 8 * PtrByteSize;
- const unsigned NumBytes = LinkageSize + MinParameterSaveAreaSize;
+ const unsigned NumBytes = std::max(LinkageSize + MinParameterSaveAreaSize,
+ CCInfo.getNextStackOffset());
// Adjust the stack pointer for the new arguments...
// These operations are automatically eliminated by the prolog/epilog pass.
@@ -7110,77 +7540,192 @@ SDValue PPCTargetLowering::LowerCall_AIX(
SDValue CallSeqStart = Chain;
SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // Set up a copy of the stack pointer for loading and storing any
+ // arguments that may not fit in the registers available for argument
+ // passing.
+ const SDValue StackPtr = IsPPC64 ? DAG.getRegister(PPC::X1, MVT::i64)
+ : DAG.getRegister(PPC::R1, MVT::i32);
for (unsigned I = 0, E = ArgLocs.size(); I != E;) {
- CCValAssign &VA = ArgLocs[I++];
+ const unsigned ValNo = ArgLocs[I].getValNo();
+ SDValue Arg = OutVals[ValNo];
+ ISD::ArgFlagsTy Flags = Outs[ValNo].Flags;
+
+ if (Flags.isByVal()) {
+ const unsigned ByValSize = Flags.getByValSize();
- if (VA.isMemLoc())
- report_fatal_error("Handling of placing parameters on the stack is "
- "unimplemented!");
- if (!VA.isRegLoc())
- report_fatal_error(
- "Unexpected non-register location for function call argument.");
+ // Nothing to do for zero-sized ByVals on the caller side.
+ if (!ByValSize) {
+ ++I;
+ continue;
+ }
- SDValue Arg = OutVals[VA.getValNo()];
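+ // Loads a register-width (or narrower, zero-extended) chunk of the by-val
+ // aggregate starting at LoadOffset.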
+ auto GetLoad = [&](EVT VT, unsigned LoadOffset) {
+ return DAG.getExtLoad(ISD::ZEXTLOAD, dl, PtrVT, Chain,
+ (LoadOffset != 0)
+ ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
+ : Arg,
+ MachinePointerInfo(), VT);
+ };
- if (!VA.needsCustom()) {
- switch (VA.getLocInfo()) {
- default:
- report_fatal_error("Unexpected argument extension type.");
- case CCValAssign::Full:
- break;
- case CCValAssign::ZExt:
- Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
- break;
- case CCValAssign::SExt:
- Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
- break;
+ unsigned LoadOffset = 0;
+
+ // Initialize registers that are fully occupied by the by-val argument.
+ while (LoadOffset + PtrByteSize <= ByValSize && ArgLocs[I].isRegLoc()) {
+ SDValue Load = GetLoad(PtrVT, LoadOffset);
+ MemOpChains.push_back(Load.getValue(1));
+ LoadOffset += PtrByteSize;
+ const CCValAssign &ByValVA = ArgLocs[I++];
+ assert(ByValVA.getValNo() == ValNo &&
+ "Unexpected location for pass-by-value argument.");
+ RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), Load));
+ }
+
+ if (LoadOffset == ByValSize)
+ continue;
+
+ // There must be one more loc to handle the remainder.
+ assert(ArgLocs[I].getValNo() == ValNo &&
+ "Expected additional location for by-value argument.");
+
+ if (ArgLocs[I].isMemLoc()) {
+ assert(LoadOffset < ByValSize && "Unexpected memloc for by-val arg.");
+ const CCValAssign &ByValVA = ArgLocs[I++];
+ ISD::ArgFlagsTy MemcpyFlags = Flags;
+ // Only memcpy the bytes that don't pass in register.
+ MemcpyFlags.setByValSize(ByValSize - LoadOffset);
+ Chain = CallSeqStart = createMemcpyOutsideCallSeq(
+ (LoadOffset != 0) ? DAG.getObjectPtrOffset(dl, Arg, LoadOffset)
+ : Arg,
+ DAG.getObjectPtrOffset(dl, StackPtr, ByValVA.getLocMemOffset()),
+ CallSeqStart, MemcpyFlags, DAG, dl);
+ continue;
+ }
+
+ // Initialize the final register residue.
+ // Any residue that occupies the final by-val arg register must be
+ // left-justified on AIX. Loads must be a power-of-2 size and cannot be
+ // larger than the ByValSize. For example: a 7 byte by-val arg requires 4,
+ // 2 and 1 byte loads.
+ const unsigned ResidueBytes = ByValSize % PtrByteSize;
+ assert(ResidueBytes != 0 && LoadOffset + PtrByteSize > ByValSize &&
+ "Unexpected register residue for by-value argument.");
+ SDValue ResidueVal;
+ for (unsigned Bytes = 0; Bytes != ResidueBytes;) {
+ const unsigned N = PowerOf2Floor(ResidueBytes - Bytes);
+ const MVT VT =
+ N == 1 ? MVT::i8
+ : ((N == 2) ? MVT::i16 : (N == 4 ? MVT::i32 : MVT::i64));
+ SDValue Load = GetLoad(VT, LoadOffset);
+ MemOpChains.push_back(Load.getValue(1));
+ LoadOffset += N;
+ Bytes += N;
+
+ // By-val arguments are passed left-justified in a register.
+ // Every load here needs to be shifted; otherwise a full register load
+ // would have been used.
+ assert(PtrVT.getSimpleVT().getSizeInBits() > (Bytes * 8) &&
+ "Unexpected load emitted during handling of pass-by-value "
+ "argument.");
+ unsigned NumSHLBits = PtrVT.getSimpleVT().getSizeInBits() - (Bytes * 8);
+ EVT ShiftAmountTy =
+ getShiftAmountTy(Load->getValueType(0), DAG.getDataLayout());
+ SDValue SHLAmt = DAG.getConstant(NumSHLBits, dl, ShiftAmountTy);
+ SDValue ShiftedLoad =
+ DAG.getNode(ISD::SHL, dl, Load.getValueType(), Load, SHLAmt);
+ ResidueVal = ResidueVal ? DAG.getNode(ISD::OR, dl, PtrVT, ResidueVal,
+ ShiftedLoad)
+ : ShiftedLoad;
}
+
+ const CCValAssign &ByValVA = ArgLocs[I++];
+ RegsToPass.push_back(std::make_pair(ByValVA.getLocReg(), ResidueVal));
+ continue;
+ }
+
+ CCValAssign &VA = ArgLocs[I++];
+ const MVT LocVT = VA.getLocVT();
+ const MVT ValVT = VA.getValVT();
+
+ switch (VA.getLocInfo()) {
+ default:
+ report_fatal_error("Unexpected argument extension type.");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, dl, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc() && !VA.needsCustom()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ continue;
+ }
+
+ if (VA.isMemLoc()) {
+ SDValue PtrOff =
+ DAG.getConstant(VA.getLocMemOffset(), dl, StackPtr.getValueType());
+ PtrOff = DAG.getNode(ISD::ADD, dl, PtrVT, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, dl, Arg, PtrOff, MachinePointerInfo()));
continue;
}
// Custom handling is used for GPR initializations for vararg float
// arguments.
- assert(isVarArg && VA.getValVT().isFloatingPoint() &&
- VA.getLocVT().isInteger() &&
- "Unexpected custom register handling for calling convention.");
+ assert(VA.isRegLoc() && VA.needsCustom() && CFlags.IsVarArg &&
+ ValVT.isFloatingPoint() && LocVT.isInteger() &&
+ "Unexpected register handling for calling convention.");
SDValue ArgAsInt =
- DAG.getBitcast(MVT::getIntegerVT(VA.getValVT().getSizeInBits()), Arg);
+ DAG.getBitcast(MVT::getIntegerVT(ValVT.getSizeInBits()), Arg);
- if (Arg.getValueType().getStoreSize() == VA.getLocVT().getStoreSize())
+ if (Arg.getValueType().getStoreSize() == LocVT.getStoreSize())
// f32 in 32-bit GPR
// f64 in 64-bit GPR
RegsToPass.push_back(std::make_pair(VA.getLocReg(), ArgAsInt));
- else if (Arg.getValueType().getSizeInBits() < VA.getLocVT().getSizeInBits())
+ else if (Arg.getValueType().getSizeInBits() < LocVT.getSizeInBits())
// f32 in 64-bit GPR.
RegsToPass.push_back(std::make_pair(
- VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, VA.getLocVT())));
+ VA.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, LocVT)));
else {
// f64 in two 32-bit GPRs
// The 2 GPRs are marked custom and expected to be adjacent in ArgLocs.
- assert(Arg.getValueType() == MVT::f64 && isVarArg && !IsPPC64 &&
+ assert(Arg.getValueType() == MVT::f64 && CFlags.IsVarArg && !IsPPC64 &&
"Unexpected custom register for argument!");
CCValAssign &GPR1 = VA;
SDValue MSWAsI64 = DAG.getNode(ISD::SRL, dl, MVT::i64, ArgAsInt,
DAG.getConstant(32, dl, MVT::i8));
RegsToPass.push_back(std::make_pair(
GPR1.getLocReg(), DAG.getZExtOrTrunc(MSWAsI64, dl, MVT::i32)));
- assert(I != E && "A second custom GPR is expected!");
- CCValAssign &GPR2 = ArgLocs[I++];
- assert(GPR2.isRegLoc() && GPR2.getValNo() == GPR1.getValNo() &&
- GPR2.needsCustom() && "A second custom GPR is expected!");
- RegsToPass.push_back(std::make_pair(
- GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
+
+ if (I != E) {
+ // If only 1 GPR was available, there will only be one custom GPR and
+ // the argument will also pass in memory.
+ CCValAssign &PeekArg = ArgLocs[I];
+ if (PeekArg.isRegLoc() && PeekArg.getValNo() == GPR1.getValNo()) {
+ assert(PeekArg.needsCustom() && "A second custom GPR is expected.");
+ CCValAssign &GPR2 = ArgLocs[I++];
+ RegsToPass.push_back(std::make_pair(
+ GPR2.getLocReg(), DAG.getZExtOrTrunc(ArgAsInt, dl, MVT::i32)));
+ }
+ }
}
}
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOpChains);
+
// For indirect calls, we need to save the TOC base to the stack for
// restoration after the call.
- if (!isTailCall && !isPatchPoint &&
- !isFunctionGlobalAddress(Callee) && !isa<ExternalSymbolSDNode>(Callee)) {
+ if (CFlags.IsIndirect) {
+ assert(!CFlags.IsTailCall && "Indirect tail-calls not supported.");
const MCRegister TOCBaseReg = Subtarget.getTOCPointerRegister();
const MCRegister StackPtrReg = Subtarget.getStackPointerRegister();
const MVT PtrVT = Subtarget.isPPC64() ? MVT::i64 : MVT::i32;
@@ -7206,10 +7751,8 @@ SDValue PPCTargetLowering::LowerCall_AIX(
}
const int SPDiff = 0;
- return FinishCall(CallConv, dl, isTailCall, isVarArg, isPatchPoint,
- /* unused except on PPC64 ELFv1 */ false, DAG, RegsToPass,
- InFlag, Chain, CallSeqStart, Callee, SPDiff, NumBytes, Ins,
- InVals, CS);
+ return FinishCall(CFlags, dl, DAG, RegsToPass, InFlag, Chain, CallSeqStart,
+ Callee, SPDiff, NumBytes, Ins, InVals, CB);
}
bool
@@ -7281,25 +7824,6 @@ PPCTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
}
- const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
- const MCPhysReg *I =
- TRI->getCalleeSavedRegsViaCopy(&DAG.getMachineFunction());
- if (I) {
- for (; *I; ++I) {
-
- if (PPC::G8RCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::i64));
- else if (PPC::F8RCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::getFloatingPointVT(64)));
- else if (PPC::CRRCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::i1));
- else if (PPC::VRRCRegClass.contains(*I))
- RetOps.push_back(DAG.getRegister(*I, MVT::Other));
- else
- llvm_unreachable("Unexpected register class in CSRsViaCopy!");
- }
- }
-
RetOps[0] = Chain; // Update chain.
// Add the flag if we have it.
@@ -7401,6 +7925,7 @@ PPCTargetLowering::getFramePointerFrameIndex(SelectionDAG & DAG) const {
SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
// Get the inputs.
SDValue Chain = Op.getOperand(0);
SDValue Size = Op.getOperand(1);
@@ -7413,9 +7938,10 @@ SDValue PPCTargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
DAG.getConstant(0, dl, PtrVT), Size);
// Construct a node for the frame pointer save index.
SDValue FPSIdx = getFramePointerFrameIndex(DAG);
- // Build a DYNALLOC node.
SDValue Ops[3] = { Chain, NegSize, FPSIdx };
SDVTList VTs = DAG.getVTList(PtrVT, MVT::Other);
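+ // Use a probed alloca when this function requires inline stack probing.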
+ if (hasInlineStackProbe(MF))
+ return DAG.getNode(PPCISD::PROBED_ALLOCA, dl, VTs, Ops);
return DAG.getNode(PPCISD::DYNALLOC, dl, VTs, Ops);
}
@@ -7564,15 +8090,6 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
!Op.getOperand(2).getValueType().isFloatingPoint())
return Op;
- bool HasNoInfs = DAG.getTarget().Options.NoInfsFPMath;
- bool HasNoNaNs = DAG.getTarget().Options.NoNaNsFPMath;
- // We might be able to do better than this under some circumstances, but in
- // general, fsel-based lowering of select is a finite-math-only optimization.
- // For more information, see section F.3 of the 2.06 ISA specification.
- // With ISA 3.0, we have xsmaxcdp/xsmincdp which are OK to emit even in the
- // presence of infinities.
- if (!Subtarget.hasP9Vector() && (!HasNoInfs || !HasNoNaNs))
- return Op;
ISD::CondCode CC = cast<CondCodeSDNode>(Op.getOperand(4))->get();
EVT ResVT = Op.getValueType();
@@ -7580,14 +8097,14 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
SDValue TV = Op.getOperand(2), FV = Op.getOperand(3);
SDLoc dl(Op);
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+ // We have xsmaxcdp/xsmincdp which are OK to emit even in the
+ // presence of infinities.
if (Subtarget.hasP9Vector() && LHS == TV && RHS == FV) {
switch (CC) {
default:
- // Not a min/max but with finite math, we may still be able to use fsel.
- if (HasNoInfs && HasNoNaNs)
- break;
- return Op;
+ break;
case ISD::SETOGT:
case ISD::SETGT:
return DAG.getNode(PPCISD::XSMAXCDP, dl, Op.getValueType(), LHS, RHS);
@@ -7597,10 +8114,13 @@ SDValue PPCTargetLowering::LowerSELECT_CC(SDValue Op, SelectionDAG &DAG) const {
}
}
- // TODO: Propagate flags from the select rather than global settings.
- SDNodeFlags Flags;
- Flags.setNoInfs(true);
- Flags.setNoNaNs(true);
+ // We might be able to do better than this under some circumstances, but in
+ // general, fsel-based lowering of select is a finite-math-only optimization.
+ // For more information, see section F.3 of the 2.06 ISA specification.
+ // With ISA 3.0, the xsmaxcdp/xsmincdp nodes emitted above are OK even in
+ // the presence of infinities, so they are not subject to these checks.
+ if ((!DAG.getTarget().Options.NoInfsFPMath && !Flags.hasNoInfs()) ||
+ (!DAG.getTarget().Options.NoNaNsFPMath && !Flags.hasNoNaNs()))
+ return Op;
// If the RHS of the comparison is a 0.0, we don't need to do the
// subtraction at all.
@@ -7720,15 +8240,17 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
// Emit a store to the stack slot.
SDValue Chain;
+ Align Alignment(DAG.getEVTAlign(Tmp.getValueType()));
if (i32Stack) {
MachineFunction &MF = DAG.getMachineFunction();
+ Alignment = Align(4);
MachineMemOperand *MMO =
- MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, 4);
+ MF.getMachineMemOperand(MPI, MachineMemOperand::MOStore, 4, Alignment);
SDValue Ops[] = { DAG.getEntryNode(), Tmp, FIPtr };
Chain = DAG.getMemIntrinsicNode(PPCISD::STFIWX, dl,
DAG.getVTList(MVT::Other), Ops, MVT::i32, MMO);
} else
- Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI);
+ Chain = DAG.getStore(DAG.getEntryNode(), dl, Tmp, FIPtr, MPI, Alignment);
// Result is a load from the stack slot. If loading 4 bytes, make sure to
// add in a bias on big endian.
@@ -7741,6 +8263,7 @@ void PPCTargetLowering::LowerFP_TO_INTForReuse(SDValue Op, ReuseLoadInfo &RLI,
RLI.Chain = Chain;
RLI.Ptr = FIPtr;
RLI.MPI = MPI;
+ RLI.Alignment = Alignment;
}
/// Custom lowers floating point to integer conversions to use
@@ -7782,7 +8305,7 @@ SDValue PPCTargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG,
const SDLoc &dl) const {
// FP to INT conversions are legal for f128.
- if (EnableQuadPrecision && (Op->getOperand(0).getValueType() == MVT::f128))
+ if (Op->getOperand(0).getValueType() == MVT::f128)
return Op;
// Expand ppcf128 to i32 by hand for the benefit of llvm-gcc bootstrap on
@@ -7848,9 +8371,10 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
SelectionDAG &DAG,
ISD::LoadExtType ET) const {
SDLoc dl(Op);
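+ // FP_TO_UINT results can only be reused here when the subtarget has FPCVT
+ // or the result type is i32.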
+ bool ValidFPToUint = Op.getOpcode() == ISD::FP_TO_UINT &&
+ (Subtarget.hasFPCVT() || Op.getValueType() == MVT::i32);
if (ET == ISD::NON_EXTLOAD &&
- (Op.getOpcode() == ISD::FP_TO_UINT ||
- Op.getOpcode() == ISD::FP_TO_SINT) &&
+ (ValidFPToUint || Op.getOpcode() == ISD::FP_TO_SINT) &&
isOperationLegalOrCustom(Op.getOpcode(),
Op.getOperand(0).getValueType())) {
@@ -7877,7 +8401,7 @@ bool PPCTargetLowering::canReuseLoadAddress(SDValue Op, EVT MemVT,
RLI.MPI = LD->getPointerInfo();
RLI.IsDereferenceable = LD->isDereferenceable();
RLI.IsInvariant = LD->isInvariant();
- RLI.Alignment = LD->getAlignment();
+ RLI.Alignment = LD->getAlign();
RLI.AAInfo = LD->getAAInfo();
RLI.Ranges = LD->getRanges();
@@ -8021,16 +8545,19 @@ SDValue PPCTargetLowering::LowerINT_TO_FPVector(SDValue Op, SelectionDAG &DAG,
SDValue ShuffleSrc2 =
SignedConv ? DAG.getUNDEF(WideVT) : DAG.getConstant(0, dl, WideVT);
SDValue Arrange = DAG.getVectorShuffle(WideVT, dl, Wide, ShuffleSrc2, ShuffV);
- unsigned ExtendOp =
- SignedConv ? (unsigned)PPCISD::SExtVElems : (unsigned)ISD::BITCAST;
SDValue Extend;
- if (!Subtarget.hasP9Altivec() && SignedConv) {
+ if (SignedConv) {
Arrange = DAG.getBitcast(IntermediateVT, Arrange);
+ EVT ExtVT = Op.getOperand(0).getValueType();
+ if (Subtarget.hasP9Altivec())
+ ExtVT = EVT::getVectorVT(*DAG.getContext(), WideVT.getVectorElementType(),
+ IntermediateVT.getVectorNumElements());
+
Extend = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, IntermediateVT, Arrange,
- DAG.getValueType(Op.getOperand(0).getValueType()));
+ DAG.getValueType(ExtVT));
} else
- Extend = DAG.getNode(ExtendOp, dl, IntermediateVT, Arrange);
+ Extend = DAG.getNode(ISD::BITCAST, dl, IntermediateVT, Arrange);
return DAG.getNode(Opc, dl, Op.getValueType(), Extend);
}
@@ -8046,7 +8573,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
return LowerINT_TO_FPVector(Op, DAG, dl);
// Conversions to f128 are legal.
- if (EnableQuadPrecision && (Op.getValueType() == MVT::f128))
+ if (Op.getValueType() == MVT::f128)
return Op;
if (Subtarget.hasQPX() && Op.getOperand(0).getValueType() == MVT::v4i1) {
@@ -8141,8 +8668,10 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
SINT, DAG.getConstant(53, dl, MVT::i32));
Cond = DAG.getNode(ISD::ADD, dl, MVT::i64,
Cond, DAG.getConstant(1, dl, MVT::i64));
- Cond = DAG.getSetCC(dl, MVT::i32,
- Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
+ Cond = DAG.getSetCC(
+ dl,
+ getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), MVT::i64),
+ Cond, DAG.getConstant(1, dl, MVT::i64), ISD::SETUGT);
SINT = DAG.getNode(ISD::SELECT, dl, MVT::i64, Cond, Round, SINT);
}
@@ -8183,7 +8712,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
MachineFrameInfo &MFI = MF.getFrameInfo();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
- int FrameIdx = MFI.CreateStackObject(4, 4, false);
+ int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
@@ -8198,7 +8727,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- RLI.Alignment = 4;
+ RLI.Alignment = Align(4);
MachineMemOperand *MMO =
MF.getMachineMemOperand(RLI.MPI, MachineMemOperand::MOLoad, 4,
@@ -8235,7 +8764,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
bool ReusingLoad;
if (!(ReusingLoad = canReuseLoadAddress(Op.getOperand(0), MVT::i32, RLI,
DAG))) {
- int FrameIdx = MFI.CreateStackObject(4, 4, false);
+ int FrameIdx = MFI.CreateStackObject(4, Align(4), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Store =
@@ -8250,7 +8779,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
RLI.Chain = Store;
RLI.MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
- RLI.Alignment = 4;
+ RLI.Alignment = Align(4);
}
MachineMemOperand *MMO =
@@ -8267,7 +8796,7 @@ SDValue PPCTargetLowering::LowerINT_TO_FP(SDValue Op,
assert(Subtarget.isPPC64() &&
"i32->FP without LFIWAX supported only on PPC64");
- int FrameIdx = MFI.CreateStackObject(8, 8, false);
+ int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
SDValue Ext64 = DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::i64,
@@ -8319,22 +8848,20 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
EVT PtrVT = getPointerTy(MF.getDataLayout());
// Save FP Control Word to register
- EVT NodeTys[] = {
- MVT::f64, // return register
- MVT::Glue // unused in this context
- };
- SDValue Chain = DAG.getNode(PPCISD::MFFS, dl, NodeTys, None);
+ SDValue Chain = Op.getOperand(0);
+ SDValue MFFS = DAG.getNode(PPCISD::MFFS, dl, {MVT::f64, MVT::Other}, Chain);
+ Chain = MFFS.getValue(1);
// Save FP register to stack slot
- int SSFI = MF.getFrameInfo().CreateStackObject(8, 8, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(8, Align(8), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- SDValue Store = DAG.getStore(DAG.getEntryNode(), dl, Chain, StackSlot,
- MachinePointerInfo());
+ Chain = DAG.getStore(Chain, dl, MFFS, StackSlot, MachinePointerInfo());
// Load FP Control Word from low 32 bits of stack slot.
SDValue Four = DAG.getConstant(4, dl, PtrVT);
SDValue Addr = DAG.getNode(ISD::ADD, dl, PtrVT, StackSlot, Four);
- SDValue CWD = DAG.getLoad(MVT::i32, dl, Store, Addr, MachinePointerInfo());
+ SDValue CWD = DAG.getLoad(MVT::i32, dl, Chain, Addr, MachinePointerInfo());
+ Chain = CWD.getValue(1);
// Transform as necessary
SDValue CWD1 =
@@ -8351,8 +8878,11 @@ SDValue PPCTargetLowering::LowerFLT_ROUNDS_(SDValue Op,
SDValue RetVal =
DAG.getNode(ISD::XOR, dl, MVT::i32, CWD1, CWD2);
- return DAG.getNode((VT.getSizeInBits() < 16 ?
- ISD::TRUNCATE : ISD::ZERO_EXTEND), dl, VT, RetVal);
+ RetVal =
+ DAG.getNode((VT.getSizeInBits() < 16 ? ISD::TRUNCATE : ISD::ZERO_EXTEND),
+ dl, VT, RetVal);
+
+ return DAG.getMergeValues({RetVal, Chain}, dl);
}
SDValue PPCTargetLowering::LowerSHL_PARTS(SDValue Op, SelectionDAG &DAG) const {
@@ -8446,19 +8976,21 @@ SDValue PPCTargetLowering::LowerSRA_PARTS(SDValue Op, SelectionDAG &DAG) const {
// Vector related lowering.
//
-/// BuildSplatI - Build a canonical splati of Val with an element size of
-/// SplatSize. Cast the result to VT.
-static SDValue BuildSplatI(int Val, unsigned SplatSize, EVT VT,
- SelectionDAG &DAG, const SDLoc &dl) {
+/// getCanonicalConstSplat - Build a canonical splat immediate of Val with an
+/// element size of SplatSize. Cast the result to VT.
+static SDValue getCanonicalConstSplat(uint64_t Val, unsigned SplatSize, EVT VT,
+ SelectionDAG &DAG, const SDLoc &dl) {
static const MVT VTys[] = { // canonical VT to use for each size.
MVT::v16i8, MVT::v8i16, MVT::Other, MVT::v4i32
};
EVT ReqVT = VT != MVT::Other ? VT : VTys[SplatSize-1];
- // Force vspltis[hw] -1 to vspltisb -1 to canonicalize.
- if (Val == -1)
+ // For a splat with all ones, turn it into vspltisb 0xFF to canonicalize.
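+ // For instance, a v8i16 all-ones splat (Val == 0xFFFF, SplatSize == 2) is
+ // canonicalized to a byte splat of 0xFF.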
+ if (Val == ((1LU << (SplatSize * 8)) - 1)) {
SplatSize = 1;
+ Val = 0xFF;
+ }
EVT CanonicalVT = VTys[SplatSize-1];
@@ -8569,10 +9101,9 @@ SDValue PPCTargetLowering::LowerBITCAST(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
SDValue Op0 = Op->getOperand(0);
- if (!EnableQuadPrecision ||
- (Op.getValueType() != MVT::f128 ) ||
+ if ((Op.getValueType() != MVT::f128) ||
(Op0.getOpcode() != ISD::BUILD_PAIR) ||
- (Op0.getOperand(0).getValueType() != MVT::i64) ||
+ (Op0.getOperand(0).getValueType() != MVT::i64) ||
(Op0.getOperand(1).getValueType() != MVT::i64))
return SDValue();
@@ -8584,7 +9115,8 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) {
const SDValue *InputLoad = &Op;
if (InputLoad->getOpcode() == ISD::BITCAST)
InputLoad = &InputLoad->getOperand(0);
- if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR)
+ if (InputLoad->getOpcode() == ISD::SCALAR_TO_VECTOR ||
+ InputLoad->getOpcode() == PPCISD::SCALAR_TO_VECTOR_PERMUTED)
InputLoad = &InputLoad->getOperand(0);
if (InputLoad->getOpcode() != ISD::LOAD)
return nullptr;
@@ -8592,6 +9124,34 @@ static const SDValue *getNormalLoadInput(const SDValue &Op) {
return ISD::isNormalLoad(LD) ? InputLoad : nullptr;
}
+// Convert the argument APFloat to single precision if the conversion loses no
+// information and the resulting single-precision value is not a denormal.
+// Return true if successful.
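+// For example, 1.0 and -0.5 convert exactly and remain normal, so they are
+// accepted, while 0.1 is rejected because it is not exactly representable in
+// single precision.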
+bool llvm::convertToNonDenormSingle(APFloat &ArgAPFloat) {
+ APFloat APFloatToConvert = ArgAPFloat;
+ bool LosesInfo = true;
+ APFloatToConvert.convert(APFloat::IEEEsingle(), APFloat::rmNearestTiesToEven,
+ &LosesInfo);
+ bool Success = (!LosesInfo && !APFloatToConvert.isDenormal());
+ if (Success)
+ ArgAPFloat = APFloatToConvert;
+ return Success;
+}
+
+// Bitcast the argument APInt to a double, convert that double to single
+// precision, and write the single-precision bit pattern back into the argument
+// if the conversion loses no information and the result is not a denormal.
+// Return true if successful.
+bool llvm::convertToNonDenormSingle(APInt &ArgAPInt) {
+ double DpValue = ArgAPInt.bitsToDouble();
+ APFloat APFloatDp(DpValue);
+ bool Success = convertToNonDenormSingle(APFloatDp);
+ if (Success)
+ ArgAPInt = APFloatDp.bitcastToAPInt();
+ return Success;
+}
+
// If this is a case we can't handle, return null and let the default
// expansion code take care of it. If we CAN select this case, and if it
// selects to a single instruction, return Op. Otherwise, if we can codegen
@@ -8608,7 +9168,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// then convert it to a floating-point vector and compare it
// to a zero vector to get the boolean result.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -8643,8 +9203,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
}
Constant *CP = ConstantVector::get(CV);
- SDValue CPIdx = DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()),
- 16 /* alignment */);
+ SDValue CPIdx =
+ DAG.getConstantPool(CP, getPointerTy(DAG.getDataLayout()), Align(16));
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDVTList VTs = DAG.getVTList({MVT::v4i1, /*chain*/ MVT::Other});
@@ -8711,9 +9271,23 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
APInt APSplatBits, APSplatUndef;
unsigned SplatBitSize;
bool HasAnyUndefs;
- if (! BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
- HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
- SplatBitSize > 32) {
+ bool BVNIsConstantSplat =
+ BVN->isConstantSplat(APSplatBits, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, 0, !Subtarget.isLittleEndian());
+
+ // If it is a splat of a double, check if we can shrink it to a 32-bit
+ // non-denormal float which when converted back to double gives us the same
+ // double. This is to exploit the XXSPLTIDP instruction.
+ if (BVNIsConstantSplat && Subtarget.hasPrefixInstrs() &&
+ (SplatBitSize == 64) && (Op->getValueType(0) == MVT::v2f64) &&
+ convertToNonDenormSingle(APSplatBits)) {
+ SDValue SplatNode = DAG.getNode(
+ PPCISD::XXSPLTI_SP_TO_DP, dl, MVT::v2f64,
+ DAG.getTargetConstant(APSplatBits.getZExtValue(), dl, MVT::i32));
+ return DAG.getBitcast(Op.getValueType(), SplatNode);
+ }
+
+ if (!BVNIsConstantSplat || SplatBitSize > 32) {
const SDValue *InputLoad = getNormalLoadInput(Op.getOperand(0));
// Handle load-and-splat patterns as we have instructions that will do this
@@ -8752,8 +9326,8 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return SDValue();
}
- unsigned SplatBits = APSplatBits.getZExtValue();
- unsigned SplatUndef = APSplatUndef.getZExtValue();
+ uint64_t SplatBits = APSplatBits.getZExtValue();
+ uint64_t SplatUndef = APSplatUndef.getZExtValue();
unsigned SplatSize = SplatBitSize / 8;
// First, handle single instruction cases.
@@ -8768,17 +9342,30 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
return Op;
}
- // We have XXSPLTIB for constant splats one byte wide
- // FIXME: SplatBits is an unsigned int being cast to an int while passing it
- // as an argument to BuildSplatiI. Given SplatSize == 1 it is okay here.
+ // We have XXSPLTIW for constant splats four bytes wide.
+ // Since the vector length is a multiple of 4, 2-byte splats can be replaced
+ // with 4-byte splats: we replicate the SplatBits of a 2-byte splat to
+ // make a 4-byte splat element. For example: a 2-byte splat of 0xABAB can be
+ // turned into a 4-byte splat of 0xABABABAB.
+ if (Subtarget.hasPrefixInstrs() && SplatSize == 2)
+ return getCanonicalConstSplat((SplatBits |= SplatBits << 16), SplatSize * 2,
+ Op.getValueType(), DAG, dl);
+
+ if (Subtarget.hasPrefixInstrs() && SplatSize == 4)
+ return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
+ dl);
+
+ // We have XXSPLTIB for constant splats one byte wide.
if (Subtarget.hasP9Vector() && SplatSize == 1)
- return BuildSplatI(SplatBits, SplatSize, Op.getValueType(), DAG, dl);
+ return getCanonicalConstSplat(SplatBits, SplatSize, Op.getValueType(), DAG,
+ dl);
// If the sign extended value is in the range [-16,15], use VSPLTI[bhw].
int32_t SextVal= (int32_t(SplatBits << (32-SplatBitSize)) >>
(32-SplatBitSize));
if (SextVal >= -16 && SextVal <= 15)
- return BuildSplatI(SextVal, SplatSize, Op.getValueType(), DAG, dl);
+ return getCanonicalConstSplat(SextVal, SplatSize, Op.getValueType(), DAG,
+ dl);
// Two instruction sequences.
@@ -8809,7 +9396,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// for fneg/fabs.
if (SplatSize == 4 && SplatBits == (0x7FFFFFFF&~SplatUndef)) {
// Make -1 and vspltisw -1:
- SDValue OnesV = BuildSplatI(-1, 4, MVT::v4i32, DAG, dl);
+ SDValue OnesV = getCanonicalConstSplat(-1, 4, MVT::v4i32, DAG, dl);
// Make the VSLW intrinsic, computing 0x8000_0000.
SDValue Res = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, OnesV,
@@ -8837,7 +9424,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + shl self.
if (SextVal == (int)((unsigned)i << TypeShiftAmt)) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vslb, Intrinsic::ppc_altivec_vslh, 0,
Intrinsic::ppc_altivec_vslw
@@ -8848,7 +9435,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + srl self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrb, Intrinsic::ppc_altivec_vsrh, 0,
Intrinsic::ppc_altivec_vsrw
@@ -8859,7 +9446,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + sra self.
if (SextVal == (int)((unsigned)i >> TypeShiftAmt)) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vsrab, Intrinsic::ppc_altivec_vsrah, 0,
Intrinsic::ppc_altivec_vsraw
@@ -8871,7 +9458,7 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// vsplti + rol self.
if (SextVal == (int)(((unsigned)i << TypeShiftAmt) |
((unsigned)i >> (SplatBitSize-TypeShiftAmt)))) {
- SDValue Res = BuildSplatI(i, SplatSize, MVT::Other, DAG, dl);
+ SDValue Res = getCanonicalConstSplat(i, SplatSize, MVT::Other, DAG, dl);
static const unsigned IIDs[] = { // Intrinsic to use for each size.
Intrinsic::ppc_altivec_vrlb, Intrinsic::ppc_altivec_vrlh, 0,
Intrinsic::ppc_altivec_vrlw
@@ -8882,19 +9469,19 @@ SDValue PPCTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// t = vsplti c, result = vsldoi t, t, 1
if (SextVal == (int)(((unsigned)i << 8) | (i < 0 ? 0xFF : 0))) {
- SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 15 : 1;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 2
if (SextVal == (int)(((unsigned)i << 16) | (i < 0 ? 0xFFFF : 0))) {
- SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 14 : 2;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
// t = vsplti c, result = vsldoi t, t, 3
if (SextVal == (int)(((unsigned)i << 24) | (i < 0 ? 0xFFFFFF : 0))) {
- SDValue T = BuildSplatI(i, SplatSize, MVT::v16i8, DAG, dl);
+ SDValue T = getCanonicalConstSplat(i, SplatSize, MVT::v16i8, DAG, dl);
unsigned Amt = Subtarget.isLittleEndian() ? 13 : 3;
return BuildVSLDOI(T, T, Amt, Op.getValueType(), DAG, dl);
}
@@ -9193,6 +9780,107 @@ SDValue PPCTargetLowering::lowerToVINSERTH(ShuffleVectorSDNode *N,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+/// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+/// handled by the XXSPLTI32DX instruction introduced in ISA 3.1, otherwise
+/// return the default SDValue.
+SDValue PPCTargetLowering::lowerToXXSPLTI32DX(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) const {
+ // The LHS and RHS may be bitcasts to v16i8 as we canonicalize shuffles
+ // to v16i8. Peek through the bitcasts to get the actual operands.
+ SDValue LHS = peekThroughBitcasts(SVN->getOperand(0));
+ SDValue RHS = peekThroughBitcasts(SVN->getOperand(1));
+
+ auto ShuffleMask = SVN->getMask();
+ SDValue VecShuffle(SVN, 0);
+ SDLoc DL(SVN);
+
+ // Check that we have a four byte shuffle.
+ if (!isNByteElemShuffleMask(SVN, 4, 1))
+ return SDValue();
+
+ // Canonicalize the RHS being a BUILD_VECTOR when lowering to xxsplti32dx.
+ if (RHS->getOpcode() != ISD::BUILD_VECTOR) {
+ std::swap(LHS, RHS);
+ VecShuffle = DAG.getCommutedVectorShuffle(*SVN);
+ ShuffleMask = cast<ShuffleVectorSDNode>(VecShuffle)->getMask();
+ }
+
+ // Ensure that the RHS is a vector of constants.
+ BuildVectorSDNode *BVN = dyn_cast<BuildVectorSDNode>(RHS.getNode());
+ if (!BVN)
+ return SDValue();
+
+ // Check if RHS is a splat of 4-bytes (or smaller).
+ APInt APSplatValue, APSplatUndef;
+ unsigned SplatBitSize;
+ bool HasAnyUndefs;
+ if (!BVN->isConstantSplat(APSplatValue, APSplatUndef, SplatBitSize,
+ HasAnyUndefs, 0, !Subtarget.isLittleEndian()) ||
+ SplatBitSize > 32)
+ return SDValue();
+
+ // Check that the shuffle mask matches the semantics of XXSPLTI32DX.
+ // The instruction splats a constant C into two words of the source vector
+ // producing { C, Unchanged, C, Unchanged } or { Unchanged, C, Unchanged, C }.
+ // Thus we check that the shuffle mask is the equivalent of
+ // <0, [4-7], 2, [4-7]> or <[4-7], 1, [4-7], 3> respectively.
+ // Note: the check above of isNByteElemShuffleMask() ensures that the bytes
+ // within each word are consecutive, so we only need to check the first byte.
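+ // For example, on a v16i8 shuffle the mask
+ // <0,1,2,3, 16,17,18,19, 8,9,10,11, 16,17,18,19> keeps words 0 and 2 from
+ // the LHS and takes words 1 and 3 from the splatted constant, matching the
+ // first pattern checked below.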
+ SDValue Index;
+ bool IsLE = Subtarget.isLittleEndian();
+ if ((ShuffleMask[0] == 0 && ShuffleMask[8] == 8) &&
+ (ShuffleMask[4] % 4 == 0 && ShuffleMask[12] % 4 == 0 &&
+ ShuffleMask[4] > 15 && ShuffleMask[12] > 15))
+ Index = DAG.getTargetConstant(IsLE ? 0 : 1, DL, MVT::i32);
+ else if ((ShuffleMask[4] == 4 && ShuffleMask[12] == 12) &&
+ (ShuffleMask[0] % 4 == 0 && ShuffleMask[8] % 4 == 0 &&
+ ShuffleMask[0] > 15 && ShuffleMask[8] > 15))
+ Index = DAG.getTargetConstant(IsLE ? 1 : 0, DL, MVT::i32);
+ else
+ return SDValue();
+
+ // If the splat is narrower than 32 bits, we need to get the 32-bit value
+ // for XXSPLTI32DX.
+ unsigned SplatVal = APSplatValue.getZExtValue();
+ for (; SplatBitSize < 32; SplatBitSize <<= 1)
+ SplatVal |= (SplatVal << SplatBitSize);
+
+ SDValue SplatNode = DAG.getNode(
+ PPCISD::XXSPLTI32DX, DL, MVT::v2i64, DAG.getBitcast(MVT::v2i64, LHS),
+ Index, DAG.getTargetConstant(SplatVal, DL, MVT::i32));
+ return DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, SplatNode);
+}
+
+/// LowerROTL - Custom lowering for ROTL(v1i128) to vector_shuffle(v16i8).
+/// We lower ROTL(v1i128) to vector_shuffle(v16i8) only if the shift amount is
+/// a multiple of 8. Otherwise we convert it to a scalar rotation on i128,
+/// i.e. (or (shl x, C1), (srl x, 128-C1)).
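+/// For example, a rotate-left of 16 bits produces the v16i8 shuffle mask
+/// <2,3,4,5,6,7,8,9,10,11,12,13,14,15,0,1>, i.e. the identity mask rotated
+/// left by two byte positions.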
+SDValue PPCTargetLowering::LowerROTL(SDValue Op, SelectionDAG &DAG) const {
+ assert(Op.getOpcode() == ISD::ROTL && "Should only be called for ISD::ROTL");
+ assert(Op.getValueType() == MVT::v1i128 &&
+ "Only set v1i128 as custom, other type shouldn't reach here!");
+ SDLoc dl(Op);
+ SDValue N0 = peekThroughBitcasts(Op.getOperand(0));
+ SDValue N1 = peekThroughBitcasts(Op.getOperand(1));
+ unsigned SHLAmt = N1.getConstantOperandVal(0);
+ if (SHLAmt % 8 == 0) {
+ SmallVector<int, 16> Mask(16, 0);
+ std::iota(Mask.begin(), Mask.end(), 0);
+ std::rotate(Mask.begin(), Mask.begin() + SHLAmt / 8, Mask.end());
+ if (SDValue Shuffle =
+ DAG.getVectorShuffle(MVT::v16i8, dl, DAG.getBitcast(MVT::v16i8, N0),
+ DAG.getUNDEF(MVT::v16i8), Mask))
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, Shuffle);
+ }
+ SDValue ArgVal = DAG.getBitcast(MVT::i128, N0);
+ SDValue SHLOp = DAG.getNode(ISD::SHL, dl, MVT::i128, ArgVal,
+ DAG.getConstant(SHLAmt, dl, MVT::i32));
+ SDValue SRLOp = DAG.getNode(ISD::SRL, dl, MVT::i128, ArgVal,
+ DAG.getConstant(128 - SHLAmt, dl, MVT::i32));
+ SDValue OROp = DAG.getNode(ISD::OR, dl, MVT::i128, SHLOp, SRLOp);
+ return DAG.getNode(ISD::BITCAST, dl, MVT::v1i128, OROp);
+}
+
/// LowerVECTOR_SHUFFLE - Return the code we lower for VECTOR_SHUFFLE. If this
/// is a shuffle we can handle in a single instruction, return it. Otherwise,
/// return the code it can be lowered into. Worst case, it can always be
@@ -9203,6 +9891,18 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
SDValue V1 = Op.getOperand(0);
SDValue V2 = Op.getOperand(1);
ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op);
+
+ // Any nodes that were combined in the target-independent combiner prior
+ // to vector legalization will not be sent to the target combine. Try to
+ // combine it here.
+ if (SDValue NewShuffle = combineVectorShuffle(SVOp, DAG)) {
+ if (!isa<ShuffleVectorSDNode>(NewShuffle))
+ return NewShuffle;
+ Op = NewShuffle;
+ SVOp = cast<ShuffleVectorSDNode>(Op);
+ V1 = Op.getOperand(0);
+ V2 = Op.getOperand(1);
+ }
EVT VT = Op.getValueType();
bool isLittleEndian = Subtarget.isLittleEndian();
@@ -9228,6 +9928,11 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
Offset = isLittleEndian ? (3 - SplatIdx) * 4 : SplatIdx * 4;
else
Offset = isLittleEndian ? (1 - SplatIdx) * 8 : SplatIdx * 8;
+
+ // If we are loading a partial vector, it does not make sense to adjust
+ // the base pointer. This happens with (splat (s_to_v_permuted (ld))).
+ if (LD->getMemoryVT().getSizeInBits() == (IsFourByte ? 32 : 64))
+ Offset = 0;
SDValue BasePtr = LD->getBasePtr();
if (Offset != 0)
BasePtr = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
@@ -9266,6 +9971,12 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
return DAG.getNode(ISD::BITCAST, dl, MVT::v16i8, Ins);
}
+ if (Subtarget.hasPrefixInstrs()) {
+ SDValue SplatInsertNode;
+ if ((SplatInsertNode = lowerToXXSPLTI32DX(SVOp, DAG)))
+ return SplatInsertNode;
+ }
+
if (Subtarget.hasP9Altivec()) {
SDValue NewISDNode;
if ((NewISDNode = lowerToVINSERTH(SVOp, DAG)))
@@ -9501,7 +10212,13 @@ SDValue PPCTargetLowering::LowerVECTOR_SHUFFLE(SDValue Op,
MVT::i32));
}
+ ShufflesHandledWithVPERM++;
SDValue VPermMask = DAG.getBuildVector(MVT::v16i8, dl, ResultMask);
+ LLVM_DEBUG(dbgs() << "Emitting a VPERM for the following shuffle:\n");
+ LLVM_DEBUG(SVOp->dump());
+ LLVM_DEBUG(dbgs() << "With the following permute control vector:\n");
+ LLVM_DEBUG(VPermMask.dump());
+
if (isLittleEndian)
return DAG.getNode(PPCISD::VPERM, dl, V1.getValueType(),
V2, V1, VPermMask);
@@ -9858,18 +10575,6 @@ SDValue PPCTargetLowering::LowerINTRINSIC_VOID(SDValue Op,
return SDValue();
}
-SDValue PPCTargetLowering::LowerREM(SDValue Op, SelectionDAG &DAG) const {
- // Check for a DIV with the same operands as this REM.
- for (auto UI : Op.getOperand(1)->uses()) {
- if ((Op.getOpcode() == ISD::SREM && UI->getOpcode() == ISD::SDIV) ||
- (Op.getOpcode() == ISD::UREM && UI->getOpcode() == ISD::UDIV))
- if (UI->getOperand(0) == Op.getOperand(0) &&
- UI->getOperand(1) == Op.getOperand(1))
- return SDValue();
- }
- return Op;
-}
-
// Lower scalar BSWAP64 to xxbrd.
SDValue PPCTargetLowering::LowerBSWAP(SDValue Op, SelectionDAG &DAG) const {
SDLoc dl(Op);
@@ -9928,7 +10633,7 @@ SDValue PPCTargetLowering::LowerSCALAR_TO_VECTOR(SDValue Op,
SDLoc dl(Op);
// Create a stack slot that is 16-byte aligned.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
SDValue FIdx = DAG.getFrameIndex(FrameIdx, PtrVT);
@@ -9998,7 +10703,7 @@ SDValue PPCTargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -10139,9 +10844,8 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
SDValue Stores[4];
for (unsigned Idx = 0; Idx < 4; ++Idx) {
- SDValue Ex = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
- DAG.getConstant(Idx, dl, getVectorIdxTy(DAG.getDataLayout())));
+ SDValue Ex = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, ScalarVT, Value,
+ DAG.getVectorIdxConstant(Idx, dl));
SDValue Store;
if (ScalarVT != ScalarMemVT)
Store =
@@ -10198,7 +10902,7 @@ SDValue PPCTargetLowering::LowerVectorStore(SDValue Op,
Value);
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(16, 16, false);
+ int FrameIdx = MFI.CreateStackObject(16, Align(16), false);
MachinePointerInfo PtrInfo =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), FrameIdx);
EVT PtrVT = getPointerTy(DAG.getDataLayout());
@@ -10247,9 +10951,9 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
if (Op.getValueType() == MVT::v4i32) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
- SDValue Zero = BuildSplatI( 0, 1, MVT::v4i32, DAG, dl);
- SDValue Neg16 = BuildSplatI(-16, 4, MVT::v4i32, DAG, dl);//+16 as shift amt.
-
+ SDValue Zero = getCanonicalConstSplat(0, 1, MVT::v4i32, DAG, dl);
+ // +16 as shift amt.
+ SDValue Neg16 = getCanonicalConstSplat(-16, 4, MVT::v4i32, DAG, dl);
SDValue RHSSwap = // = vrlw RHS, 16
BuildIntrinsicOp(Intrinsic::ppc_altivec_vrlw, RHS, Neg16, DAG, dl);
@@ -10269,13 +10973,6 @@ SDValue PPCTargetLowering::LowerMUL(SDValue Op, SelectionDAG &DAG) const {
HiProd = BuildIntrinsicOp(Intrinsic::ppc_altivec_vslw, HiProd,
Neg16, DAG, dl);
return DAG.getNode(ISD::ADD, dl, MVT::v4i32, LoProd, HiProd);
- } else if (Op.getValueType() == MVT::v8i16) {
- SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
-
- SDValue Zero = BuildSplatI(0, 1, MVT::v8i16, DAG, dl);
-
- return BuildIntrinsicOp(Intrinsic::ppc_altivec_vmladduhm,
- LHS, RHS, Zero, DAG, dl);
} else if (Op.getValueType() == MVT::v16i8) {
SDValue LHS = Op.getOperand(0), RHS = Op.getOperand(1);
bool isLittleEndian = Subtarget.isLittleEndian();
@@ -10357,6 +11054,7 @@ SDValue PPCTargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
assert(Op.getOpcode() == ISD::FP_EXTEND &&
"Should only be called for ISD::FP_EXTEND");
+ // FIXME: handle extends from half precision float vectors on P9.
// We only want to custom lower an extend from v2f32 to v2f64.
if (Op.getValueType() != MVT::v2f64 ||
Op.getOperand(0).getValueType() != MVT::v2f32)
@@ -10481,6 +11179,7 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MUL: return LowerMUL(Op, DAG);
case ISD::ABS: return LowerABS(Op, DAG);
case ISD::FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
+ case ISD::ROTL: return LowerROTL(Op, DAG);
// For counter-based loop handling.
case ISD::INTRINSIC_W_CHAIN: return SDValue();
@@ -10493,9 +11192,6 @@ SDValue PPCTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::INTRINSIC_VOID:
return LowerINTRINSIC_VOID(Op, DAG);
- case ISD::SREM:
- case ISD::UREM:
- return LowerREM(Op, DAG);
case ISD::BSWAP:
return LowerBSWAP(Op, DAG);
case ISD::ATOMIC_CMP_SWAP:
@@ -10514,8 +11210,8 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
SDVTList VTs = DAG.getVTList(MVT::i32, MVT::i32, MVT::Other);
SDValue RTB = DAG.getNode(PPCISD::READ_TIME_BASE, dl, VTs, N->getOperand(0));
- Results.push_back(RTB);
- Results.push_back(RTB.getValue(1));
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, RTB, RTB.getValue(1)));
Results.push_back(RTB.getValue(2));
break;
}
@@ -10570,6 +11266,11 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
case ISD::BITCAST:
// Don't handle bitcast here.
return;
+ case ISD::FP_EXTEND:
+ SDValue Lowered = LowerFP_EXTEND(SDValue(N, 0), DAG);
+ if (Lowered)
+ Results.push_back(Lowered);
+ return;
}
}
@@ -11170,13 +11871,192 @@ PPCTargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
return MBB;
}
+bool PPCTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+ return false;
+}
+
+unsigned PPCTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ unsigned StackAlign = TFI->getStackAlignment();
+ assert(StackAlign >= 1 && isPowerOf2_32(StackAlign) &&
+ "Unexpected stack alignment");
+ // The default stack probe size is 4096 if the function has no
+ // stack-probe-size attribute.
+ unsigned StackProbeSize = 4096;
+ const Function &Fn = MF.getFunction();
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ // Round down to the stack alignment.
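+ // For example, a "stack-probe-size" of 1000 with a 16-byte stack alignment
+ // is rounded down to 992.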
+ StackProbeSize &= ~(StackAlign - 1);
+ return StackProbeSize ? StackProbeSize : StackAlign;
+}
+
+// Lower dynamic stack allocation with probing. `emitProbedAlloca` is split
+// into three phases. In the first phase, it uses the pseudo instruction
+// PREPARE_PROBED_ALLOCA to get the future results of the actual FramePointer
+// and FinalStackPtr. In the second phase, it generates a loop that probes the
+// allocation block by block. Finally, it uses the pseudo instruction
+// DYNAREAOFFSET to get the future result of MaxCallFrameSize so that it can
+// calculate the correct data area pointer.
+MachineBasicBlock *
+PPCTargetLowering::emitProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ const bool isPPC64 = Subtarget.isPPC64();
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ DebugLoc DL = MI.getDebugLoc();
+ const unsigned ProbeSize = getStackProbeSize(*MF);
+ const BasicBlock *ProbedBB = MBB->getBasicBlock();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ // The CFG of probing stack looks as
+ // +-----+
+ // | MBB |
+ // +--+--+
+ // |
+ // +----v----+
+ // +--->+ TestMBB +---+
+ // | +----+----+ |
+ // | | |
+ // | +-----v----+ |
+ // +---+ BlockMBB | |
+ // +----------+ |
+ // |
+ // +---------+ |
+ // | TailMBB +<--+
+ // +---------+
+ // In MBB, calculate previous frame pointer and final stack pointer.
+ // In TestMBB, test if sp is equal to final stack pointer, if so, jump to
+ // TailMBB. In BlockMBB, update the sp atomically and jump back to TestMBB.
+ // TailMBB is spliced via \p MI.
+ MachineBasicBlock *TestMBB = MF->CreateMachineBasicBlock(ProbedBB);
+ MachineBasicBlock *TailMBB = MF->CreateMachineBasicBlock(ProbedBB);
+ MachineBasicBlock *BlockMBB = MF->CreateMachineBasicBlock(ProbedBB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MF->insert(MBBIter, TestMBB);
+ MF->insert(MBBIter, BlockMBB);
+ MF->insert(MBBIter, TailMBB);
+
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+
+ Register DstReg = MI.getOperand(0).getReg();
+ Register NegSizeReg = MI.getOperand(1).getReg();
+ Register SPReg = isPPC64 ? PPC::X1 : PPC::R1;
+ Register FinalStackPtr = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ Register FramePointer = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+
+ // Get the canonical FinalStackPtr like what
+ // PPCRegisterInfo::lowerDynamicAlloc does.
+ BuildMI(*MBB, {MI}, DL,
+ TII->get(isPPC64 ? PPC::PREPARE_PROBED_ALLOCA_64
+ : PPC::PREPARE_PROBED_ALLOCA_32),
+ FramePointer)
+ .addDef(FinalStackPtr)
+ .addReg(NegSizeReg)
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+
+ // Materialize a scratch register for update.
+ int64_t NegProbeSize = -(int64_t)ProbeSize;
+ assert(isInt<32>(NegProbeSize) && "Unhandled probe size!");
+ Register ScratchReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ if (!isInt<16>(NegProbeSize)) {
+ Register TempReg = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LIS8 : PPC::LIS), TempReg)
+ .addImm(NegProbeSize >> 16);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::ORI8 : PPC::ORI),
+ ScratchReg)
+ .addReg(TempReg)
+ .addImm(NegProbeSize & 0xFFFF);
+ } else
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::LI8 : PPC::LI), ScratchReg)
+ .addImm(NegProbeSize);
+
+ {
+ // Probing leading residual part.
+ Register Div = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::DIVD : PPC::DIVW), Div)
+ .addReg(NegSizeReg)
+ .addReg(ScratchReg);
+ Register Mul = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::MULLD : PPC::MULLW), Mul)
+ .addReg(Div)
+ .addReg(ScratchReg);
+ Register NegMod = MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::SUBF8 : PPC::SUBF), NegMod)
+ .addReg(Mul)
+ .addReg(NegSizeReg);
+ BuildMI(*MBB, {MI}, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
+ .addReg(FramePointer)
+ .addReg(SPReg)
+ .addReg(NegMod);
+ }
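+ // For example, with the default 4096-byte probe size and an allocation of
+ // 10000 bytes (NegSizeReg == -10000): Div == 2, Mul == -8192 and
+ // NegMod == -1808, so the store above probes the 1808-byte residual and the
+ // loop below probes the remaining two 4096-byte blocks.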
+
+ {
+ // The remaining part should be a multiple of ProbeSize.
+ Register CmpResult = MRI.createVirtualRegister(&PPC::CRRCRegClass);
+ BuildMI(TestMBB, DL, TII->get(isPPC64 ? PPC::CMPD : PPC::CMPW), CmpResult)
+ .addReg(SPReg)
+ .addReg(FinalStackPtr);
+ BuildMI(TestMBB, DL, TII->get(PPC::BCC))
+ .addImm(PPC::PRED_EQ)
+ .addReg(CmpResult)
+ .addMBB(TailMBB);
+ TestMBB->addSuccessor(BlockMBB);
+ TestMBB->addSuccessor(TailMBB);
+ }
+
+ {
+ // Touch the block.
+ // |P...|P...|P...
+ BuildMI(BlockMBB, DL, TII->get(isPPC64 ? PPC::STDUX : PPC::STWUX), SPReg)
+ .addReg(FramePointer)
+ .addReg(SPReg)
+ .addReg(ScratchReg);
+ BuildMI(BlockMBB, DL, TII->get(PPC::B)).addMBB(TestMBB);
+ BlockMBB->addSuccessor(TestMBB);
+ }
+
+ // The calculation of MaxCallFrameSize is deferred to the PrologEpilogInserter
+ // pass; use the DYNAREAOFFSET pseudo instruction to get the future result.
+ Register MaxCallFrameSizeReg =
+ MRI.createVirtualRegister(isPPC64 ? G8RC : GPRC);
+ BuildMI(TailMBB, DL,
+ TII->get(isPPC64 ? PPC::DYNAREAOFFSET8 : PPC::DYNAREAOFFSET),
+ MaxCallFrameSizeReg)
+ .add(MI.getOperand(2))
+ .add(MI.getOperand(3));
+ BuildMI(TailMBB, DL, TII->get(isPPC64 ? PPC::ADD8 : PPC::ADD4), DstReg)
+ .addReg(SPReg)
+ .addReg(MaxCallFrameSizeReg);
+
+ // Splice instructions after MI to TailMBB.
+ TailMBB->splice(TailMBB->end(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ TailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(TestMBB);
+
+ // Delete the pseudo instruction.
+ MI.eraseFromParent();
+
+ ++NumDynamicAllocaProbed;
+ return TailMBB;
+}
+
MachineBasicBlock *
PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
MachineBasicBlock *BB) const {
if (MI.getOpcode() == TargetOpcode::STACKMAP ||
MI.getOpcode() == TargetOpcode::PATCHPOINT) {
if (Subtarget.is64BitELFABI() &&
- MI.getOpcode() == TargetOpcode::PATCHPOINT) {
+ MI.getOpcode() == TargetOpcode::PATCHPOINT &&
+ !Subtarget.isUsingPCRelativeCalls()) {
// Call lowering should have added an r2 operand to indicate a dependence
// on the TOC base pointer value. It can't however, because there is no
// way to mark the dependence as implicit there, and so the stackmap code
@@ -11858,12 +12738,12 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
}
MachineFrameInfo &MFI = F->getFrameInfo();
- int FrameIdx = MFI.CreateStackObject(8, 8, false);
+ int FrameIdx = MFI.CreateStackObject(8, Align(8), false);
MachineMemOperand *MMOStore = F->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
- MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlign(FrameIdx));
// Store the SrcReg into the stack.
BuildMI(*BB, MI, dl, TII->get(StoreOp))
@@ -11873,9 +12753,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addMemOperand(MMOStore);
MachineMemOperand *MMOLoad = F->getMachineMemOperand(
- MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
- MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MachinePointerInfo::getFixedStack(*F, FrameIdx, 0),
+ MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
+ MFI.getObjectAlign(FrameIdx));
// Load from the stack where SrcReg is stored, and save to DestReg,
// so we have done the RegClass conversion from RegClass::SrcReg to
@@ -11935,6 +12815,9 @@ PPCTargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(NewFPSCRReg)
.addImm(0)
.addImm(0);
+ } else if (MI.getOpcode() == PPC::PROBED_ALLOCA_32 ||
+ MI.getOpcode() == PPC::PROBED_ALLOCA_64) {
+ return emitProbedAlloca(MI, BB);
} else {
llvm_unreachable("Unexpected instr type to insert");
}
@@ -13139,15 +14022,20 @@ static SDValue addShuffleForVecExtend(SDNode *N, SelectionDAG &DAG,
DAG.getVectorShuffle(Input.getValueType(), dl, Input,
DAG.getUNDEF(Input.getValueType()), ShuffleMask);
- EVT Ty = N->getValueType(0);
- SDValue BV = DAG.getNode(PPCISD::SExtVElems, dl, Ty, Shuffle);
- return BV;
+ EVT VT = N->getValueType(0);
+ SDValue Conv = DAG.getBitcast(VT, Shuffle);
+
+ EVT ExtVT = EVT::getVectorVT(*DAG.getContext(),
+ Input.getValueType().getVectorElementType(),
+ VT.getVectorNumElements());
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Conv,
+ DAG.getValueType(ExtVT));
}
// Look for build vector patterns where input operands come from sign
// extended vector_extract elements of specific indices. If the correct indices
-// aren't used, add a vector shuffle to fix up the indices and create a new
-// PPCISD:SExtVElems node which selects the vector sign extend instructions
+// aren't used, add a vector shuffle to fix up the indices and create a
+// SIGN_EXTEND_INREG node which selects the vector sign extend instructions
// during instruction selection.
static SDValue combineBVOfVecSExt(SDNode *N, SelectionDAG &DAG) {
// This array encodes the indices that the vector sign extend instructions
@@ -13470,8 +14358,8 @@ SDValue PPCTargetLowering::expandVSXLoadForLE(SDNode *N,
// Do not expand to PPCISD::LXVD2X + PPCISD::XXSWAPD when the load is
// aligned and the type is a vector with elements up to 4 bytes
- if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
- && VecTy.getScalarSizeInBits() <= 32 ) {
+ if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
+ VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
@@ -13541,8 +14429,8 @@ SDValue PPCTargetLowering::expandVSXStoreForLE(SDNode *N,
// Do not expand to PPCISD::XXSWAPD and PPCISD::STXVD2X when the load is
// aligned and the type is a vector with elements up to 4 bytes
- if (Subtarget.needsSwapsForVSXMemOps() && !(MMO->getAlignment()%16)
- && VecTy.getScalarSizeInBits() <= 32 ) {
+ if (Subtarget.needsSwapsForVSXMemOps() && MMO->getAlign() >= Align(16) &&
+ VecTy.getScalarSizeInBits() <= 32) {
return SDValue();
}
@@ -13588,7 +14476,7 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
(Op1VT == MVT::i32 || Op1VT == MVT::i64 ||
(Subtarget.hasP9Vector() && (Op1VT == MVT::i16 || Op1VT == MVT::i8)));
- if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Altivec() ||
+ if (ResVT == MVT::ppcf128 || !Subtarget.hasP8Vector() ||
cast<StoreSDNode>(N)->isTruncatingStore() || !ValidTypeForStoreFltAsInt)
return SDValue();
@@ -13622,6 +14510,210 @@ SDValue PPCTargetLowering::combineStoreFPToInt(SDNode *N,
return Val;
}
+static bool isAlternatingShuffMask(const ArrayRef<int> &Mask, int NumElts) {
+ // Check that the source of the element keeps flipping
+ // (i.e. Mask[i] < NumElts -> Mask[i+1] >= NumElts).
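+ // For example, with NumElts == 16 the mask <0,16,1,17,2,18,...> alternates
+ // and is accepted, while <0,1,16,17,...> is not.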
+ bool PrevElemFromFirstVec = Mask[0] < NumElts;
+ for (int i = 1, e = Mask.size(); i < e; i++) {
+ if (PrevElemFromFirstVec && Mask[i] < NumElts)
+ return false;
+ if (!PrevElemFromFirstVec && Mask[i] >= NumElts)
+ return false;
+ PrevElemFromFirstVec = !PrevElemFromFirstVec;
+ }
+ return true;
+}
+
+static bool isSplatBV(SDValue Op) {
+ if (Op.getOpcode() != ISD::BUILD_VECTOR)
+ return false;
+ SDValue FirstOp;
+
+ // Find first non-undef input.
+ for (int i = 0, e = Op.getNumOperands(); i < e; i++) {
+ FirstOp = Op.getOperand(i);
+ if (!FirstOp.isUndef())
+ break;
+ }
+
+ // All inputs are undef or the same as the first non-undef input.
+ for (int i = 1, e = Op.getNumOperands(); i < e; i++)
+ if (Op.getOperand(i) != FirstOp && !Op.getOperand(i).isUndef())
+ return false;
+ return true;
+}
+
+static SDValue isScalarToVec(SDValue Op) {
+ if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return Op;
+ if (Op.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ Op = Op.getOperand(0);
+ if (Op.getOpcode() == ISD::SCALAR_TO_VECTOR)
+ return Op;
+ return SDValue();
+}
+
+static void fixupShuffleMaskForPermutedSToV(SmallVectorImpl<int> &ShuffV,
+ int LHSMaxIdx, int RHSMinIdx,
+ int RHSMaxIdx, int HalfVec) {
+ for (int i = 0, e = ShuffV.size(); i < e; i++) {
+ int Idx = ShuffV[i];
+ if ((Idx >= 0 && Idx < LHSMaxIdx) || (Idx >= RHSMinIdx && Idx < RHSMaxIdx))
+ ShuffV[i] += HalfVec;
+ }
+ return;
+}
+
+// Replace a SCALAR_TO_VECTOR with a SCALAR_TO_VECTOR_PERMUTED except if
+// the original is:
+// (<n x Ty> (scalar_to_vector (Ty (extract_elt <n x Ty> %a, C))))
+// In such a case, just change the shuffle mask to extract the element
+// from the permuted index.
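+// For example, (v4i32 (scalar_to_vector (i32 (extract_elt v4i32 %a, 3)))) is
+// rewritten here as a shuffle of %a with itself using the mask <-1,-1,3,-1>.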
+static SDValue getSToVPermuted(SDValue OrigSToV, SelectionDAG &DAG) {
+ SDLoc dl(OrigSToV);
+ EVT VT = OrigSToV.getValueType();
+ assert(OrigSToV.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ "Expecting a SCALAR_TO_VECTOR here");
+ SDValue Input = OrigSToV.getOperand(0);
+
+ if (Input.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ConstantSDNode *Idx = dyn_cast<ConstantSDNode>(Input.getOperand(1));
+ SDValue OrigVector = Input.getOperand(0);
+
+ // Can't handle non-const element indices or different vector types
+ // for the input to the extract and the output of the scalar_to_vector.
+ if (Idx && VT == OrigVector.getValueType()) {
+ SmallVector<int, 16> NewMask(VT.getVectorNumElements(), -1);
+ NewMask[VT.getVectorNumElements() / 2] = Idx->getZExtValue();
+ return DAG.getVectorShuffle(VT, dl, OrigVector, OrigVector, NewMask);
+ }
+ }
+ return DAG.getNode(PPCISD::SCALAR_TO_VECTOR_PERMUTED, dl, VT,
+ OrigSToV.getOperand(0));
+}
+
+// On little endian subtargets, combine shuffles such as:
+// vector_shuffle<16,1,17,3,18,5,19,7,20,9,21,11,22,13,23,15>, <zero>, %b
+// into:
+// vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7>, <zero>, %b
+// because the latter can be matched to a single instruction merge.
+// Furthermore, SCALAR_TO_VECTOR on little endian always involves a permute
+// to put the value into element zero. Adjust the shuffle mask so that the
+// vector can remain in permuted form (to prevent a swap prior to a shuffle).
+SDValue PPCTargetLowering::combineVectorShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) const {
+ SDValue LHS = SVN->getOperand(0);
+ SDValue RHS = SVN->getOperand(1);
+ auto Mask = SVN->getMask();
+ int NumElts = LHS.getValueType().getVectorNumElements();
+ SDValue Res(SVN, 0);
+ SDLoc dl(SVN);
+
+ // None of these combines are useful on big endian systems since the ISA
+ // already has a big endian bias.
+ if (!Subtarget.isLittleEndian() || !Subtarget.hasVSX())
+ return Res;
+
+ // If this is not a shuffle of a shuffle and the first element comes from
+ // the second vector, canonicalize to the commuted form. This will make it
+ // more likely to match one of the single instruction patterns.
+ if (Mask[0] >= NumElts && LHS.getOpcode() != ISD::VECTOR_SHUFFLE &&
+ RHS.getOpcode() != ISD::VECTOR_SHUFFLE) {
+ std::swap(LHS, RHS);
+ Res = DAG.getCommutedVectorShuffle(*SVN);
+ Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
+ }
+
+ // Adjust the shuffle mask if either input vector comes from a
+ // SCALAR_TO_VECTOR and keep the respective input vector in permuted
+ // form (to prevent the need for a swap).
+ SmallVector<int, 16> ShuffV(Mask.begin(), Mask.end());
+ SDValue SToVLHS = isScalarToVec(LHS);
+ SDValue SToVRHS = isScalarToVec(RHS);
+ if (SToVLHS || SToVRHS) {
+ int NumEltsIn = SToVLHS ? SToVLHS.getValueType().getVectorNumElements()
+ : SToVRHS.getValueType().getVectorNumElements();
+ int NumEltsOut = ShuffV.size();
+
+ // Initially assume that neither input is permuted. These will be adjusted
+ // accordingly if either input is.
+ int LHSMaxIdx = -1;
+ int RHSMinIdx = -1;
+ int RHSMaxIdx = -1;
+ int HalfVec = LHS.getValueType().getVectorNumElements() / 2;
+
+ // Get the permuted scalar to vector nodes for the source(s) that come from
+ // ISD::SCALAR_TO_VECTOR.
+ if (SToVLHS) {
+ // Set up the values for the shuffle vector fixup.
+ LHSMaxIdx = NumEltsOut / NumEltsIn;
+ SToVLHS = getSToVPermuted(SToVLHS, DAG);
+ if (SToVLHS.getValueType() != LHS.getValueType())
+ SToVLHS = DAG.getBitcast(LHS.getValueType(), SToVLHS);
+ LHS = SToVLHS;
+ }
+ if (SToVRHS) {
+ RHSMinIdx = NumEltsOut;
+ RHSMaxIdx = NumEltsOut / NumEltsIn + RHSMinIdx;
+ SToVRHS = getSToVPermuted(SToVRHS, DAG);
+ if (SToVRHS.getValueType() != RHS.getValueType())
+ SToVRHS = DAG.getBitcast(RHS.getValueType(), SToVRHS);
+ RHS = SToVRHS;
+ }
+
+ // Fix up the shuffle mask to reflect where the desired element actually is.
+ // The minimum and maximum indices that correspond to element zero for both
+ // the LHS and RHS are computed and will control which shuffle mask entries
+ // are to be changed. For example, if the RHS is permuted, any shuffle mask
+ // entries in the range [RHSMinIdx,RHSMaxIdx) will be incremented by
+ // HalfVec to refer to the corresponding element in the permuted vector.
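+ // Concretely, if the LHS is permuted with LHSMaxIdx == 4 and HalfVec == 8
+ // (a v4i32 scalar_to_vector feeding a v16i8 shuffle), mask entries 0..3
+ // become 8..11 and all other entries are left unchanged.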
+ fixupShuffleMaskForPermutedSToV(ShuffV, LHSMaxIdx, RHSMinIdx, RHSMaxIdx,
+ HalfVec);
+ Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
+
+ // We may have simplified away the shuffle. We won't be able to do anything
+ // further with it here.
+ if (!isa<ShuffleVectorSDNode>(Res))
+ return Res;
+ Mask = cast<ShuffleVectorSDNode>(Res)->getMask();
+ }
+
+ // The common case after we commuted the shuffle is that the RHS is a splat
+ // and we have elements coming in from the splat at indices that are not
+ // conducive to using a merge.
+ // Example:
+ // vector_shuffle<0,17,1,19,2,21,3,23,4,25,5,27,6,29,7,31> t1, <zero>
+ if (!isSplatBV(RHS))
+ return Res;
+
+ // We are looking for a mask such that all even elements are from
+ // one vector and all odd elements from the other.
+ if (!isAlternatingShuffMask(Mask, NumElts))
+ return Res;
+
+ // Adjust the mask so we are pulling in the same index from the splat
+ // as the index from the interesting vector in consecutive elements.
+ // Example (even elements from first vector):
+ // vector_shuffle<0,16,1,17,2,18,3,19,4,20,5,21,6,22,7,23> t1, <zero>
+ if (Mask[0] < NumElts)
+ for (int i = 1, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i - 1] + NumElts);
+ // Example (odd elements from first vector):
+ // vector_shuffle<16,0,17,1,18,2,19,3,20,4,21,5,22,6,23,7> t1, <zero>
+ else
+ for (int i = 0, e = Mask.size(); i < e; i += 2)
+ ShuffV[i] = (ShuffV[i + 1] + NumElts);
+
+ // If the RHS has undefs, we need to remove them since we may have created
+ // a shuffle that adds those instead of the splat value.
+ SDValue SplatVal = cast<BuildVectorSDNode>(RHS.getNode())->getSplatValue();
+ RHS = DAG.getSplatBuildVector(RHS.getValueType(), dl, SplatVal);
+
+ Res = DAG.getVectorShuffle(SVN->getValueType(0), dl, LHS, RHS, ShuffV);
+ return Res;
+}
+
SDValue PPCTargetLowering::combineVReverseMemOP(ShuffleVectorSDNode *SVN,
LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const {
@@ -13693,6 +14785,9 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
return combineSRL(N, DCI);
case ISD::MUL:
return combineMUL(N, DCI);
+ case ISD::FMA:
+ case PPCISD::FNMSUB:
+ return combineFMALike(N, DCI);
case PPCISD::SHL:
if (isNullConstant(N->getOperand(0))) // 0 << V -> 0.
return N->getOperand(0);
@@ -13728,7 +14823,7 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
LSBaseSDNode* LSBase = cast<LSBaseSDNode>(N->getOperand(0));
return combineVReverseMemOP(cast<ShuffleVectorSDNode>(N), LSBase, DCI);
}
- break;
+ return combineVectorShuffle(cast<ShuffleVectorSDNode>(N), DCI.DAG);
case ISD::STORE: {
EVT Op1VT = N->getOperand(1).getValueType();
@@ -13935,17 +15030,18 @@ SDValue PPCTargetLowering::PerformDAGCombine(SDNode *N,
EVT MemVT = LD->getMemoryVT();
Type *Ty = MemVT.getTypeForEVT(*DAG.getContext());
- unsigned ABIAlignment = DAG.getDataLayout().getABITypeAlignment(Ty);
+ Align ABIAlignment = DAG.getDataLayout().getABITypeAlign(Ty);
Type *STy = MemVT.getScalarType().getTypeForEVT(*DAG.getContext());
- unsigned ScalarABIAlignment = DAG.getDataLayout().getABITypeAlignment(STy);
+ Align ScalarABIAlignment = DAG.getDataLayout().getABITypeAlign(STy);
if (LD->isUnindexed() && VT.isVector() &&
((Subtarget.hasAltivec() && ISD::isNON_EXTLoad(N) &&
// P8 and later hardware should just use LOAD.
- !Subtarget.hasP8Vector() && (VT == MVT::v16i8 || VT == MVT::v8i16 ||
- VT == MVT::v4i32 || VT == MVT::v4f32)) ||
+ !Subtarget.hasP8Vector() &&
+ (VT == MVT::v16i8 || VT == MVT::v8i16 || VT == MVT::v4i32 ||
+ VT == MVT::v4f32)) ||
(Subtarget.hasQPX() && (VT == MVT::v4f64 || VT == MVT::v4f32) &&
- LD->getAlignment() >= ScalarABIAlignment)) &&
- LD->getAlignment() < ABIAlignment) {
+ LD->getAlign() >= ScalarABIAlignment)) &&
+ LD->getAlign() < ABIAlignment) {
// This is a type-legal unaligned Altivec or QPX load.
SDValue Chain = LD->getChain();
SDValue Ptr = LD->getBasePtr();
@@ -14492,6 +15588,7 @@ Align PPCTargetLowering::getPrefLoopAlignment(MachineLoop *ML) const {
case PPC::DIR_PWR7:
case PPC::DIR_PWR8:
case PPC::DIR_PWR9:
+ case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE: {
if (!ML)
break;
@@ -14898,18 +15995,16 @@ SDValue PPCTargetLowering::LowerFRAMEADDR(SDValue Op,
Register PPCTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
bool isPPC64 = Subtarget.isPPC64();
- bool IsDarwinABI = Subtarget.isDarwinABI();
bool is64Bit = isPPC64 && VT == LLT::scalar(64);
if (!is64Bit && VT != LLT::scalar(32))
report_fatal_error("Invalid register global variable type");
Register Reg = StringSwitch<Register>(RegName)
- .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
- .Case("r2", (IsDarwinABI || isPPC64) ? Register() : PPC::R2)
- .Case("r13", (!isPPC64 && IsDarwinABI) ? Register() :
- (is64Bit ? PPC::X13 : PPC::R13))
- .Default(Register());
+ .Case("r1", is64Bit ? PPC::X1 : PPC::R1)
+ .Case("r2", isPPC64 ? Register() : PPC::R2)
+ .Case("r13", (is64Bit ? PPC::X13 : PPC::R13))
+ .Default(Register());
if (Reg)
return Reg;
@@ -15002,7 +16097,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -15036,7 +16131,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(0);
Info.offset = 0;
Info.size = VT.getStoreSize();
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOLoad;
return true;
}
@@ -15088,7 +16183,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = -VT.getStoreSize()+1;
Info.size = 2*VT.getStoreSize()-1;
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
}
@@ -15121,7 +16216,7 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
Info.ptrVal = I.getArgOperand(1);
Info.offset = 0;
Info.size = VT.getStoreSize();
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags = MachineMemOperand::MOStore;
return true;
}
@@ -15132,35 +16227,24 @@ bool PPCTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
return false;
}
-/// getOptimalMemOpType - Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. If DstAlign is zero that means it's safe to destination
-/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If 'IsMemset' is
-/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-/// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
EVT PPCTargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
if (getTargetMachine().getOptLevel() != CodeGenOpt::None) {
// When expanding a memset, require at least two QPX instructions to cover
// the cost of loading the value to be stored from the constant pool.
- if (Subtarget.hasQPX() && Size >= 32 && (!IsMemset || Size >= 64) &&
- (!SrcAlign || SrcAlign >= 32) && (!DstAlign || DstAlign >= 32) &&
+ if (Subtarget.hasQPX() && Op.size() >= 32 &&
+ (Op.isMemcpy() || Op.size() >= 64) && Op.isAligned(Align(32)) &&
!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
return MVT::v4f64;
}
// We should use Altivec/VSX loads and stores when available. For unaligned
// addresses, unaligned VSX loads are only fast starting with the P8.
- if (Subtarget.hasAltivec() && Size >= 16 &&
- (((!SrcAlign || SrcAlign >= 16) && (!DstAlign || DstAlign >= 16)) ||
- ((IsMemset && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
+ if (Subtarget.hasAltivec() && Op.size() >= 16 &&
+ (Op.isAligned(Align(16)) ||
+ ((Op.isMemset() && Subtarget.hasVSX()) || Subtarget.hasP8Vector())))
return MVT::v4i32;
}
@@ -15251,7 +16335,8 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
if (!VT.isSimple())
return false;
- if (VT.isFloatingPoint() && !Subtarget.allowsUnalignedFPAccess())
+ if (VT.isFloatingPoint() && !VT.isVector() &&
+ !Subtarget.allowsUnalignedFPAccess())
return false;
if (VT.getSimpleVT().isVector()) {
@@ -15275,22 +16360,48 @@ bool PPCTargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const {
- VT = VT.getScalarType();
-
- if (!VT.isSimple())
- return false;
+ return isFMAFasterThanFMulAndFAdd(
+ MF.getFunction(), VT.getTypeForEVT(MF.getFunction().getContext()));
+}
- switch (VT.getSimpleVT().SimpleTy) {
- case MVT::f32:
- case MVT::f64:
+bool PPCTargetLowering::isFMAFasterThanFMulAndFAdd(const Function &F,
+ Type *Ty) const {
+ switch (Ty->getScalarType()->getTypeID()) {
+ case Type::FloatTyID:
+ case Type::DoubleTyID:
return true;
- case MVT::f128:
- return (EnableQuadPrecision && Subtarget.hasP9Vector());
+ case Type::FP128TyID:
+ return Subtarget.hasP9Vector();
default:
- break;
+ return false;
}
+}
- return false;
+// Currently this is a copy from AArch64TargetLowering::isProfitableToHoist.
+// FIXME: add more patterns which are profitable to hoist.
+bool PPCTargetLowering::isProfitableToHoist(Instruction *I) const {
+ if (I->getOpcode() != Instruction::FMul)
+ return true;
+
+ if (!I->hasOneUse())
+ return true;
+
+ Instruction *User = I->user_back();
+ assert(User && "A single use instruction with no uses.");
+
+ if (User->getOpcode() != Instruction::FSub &&
+ User->getOpcode() != Instruction::FAdd)
+ return true;
+
+ const TargetOptions &Options = getTargetMachine().Options;
+ const Function *F = I->getFunction();
+ const DataLayout &DL = F->getParent()->getDataLayout();
+ Type *Ty = User->getOperand(0)->getType();
+
+ return !(
+ isFMAFasterThanFMulAndFAdd(*F, Ty) &&
+ isOperationLegalOrCustom(ISD::FMA, getValueType(DL, Ty)) &&
+ (Options.AllowFPOpFusion == FPOpFusion::Fast || Options.UnsafeFPMath));
}
const MCPhysReg *
@@ -15306,12 +16417,12 @@ PPCTargetLowering::getScratchRegisters(CallingConv::ID) const {
return ScratchRegs;
}
-unsigned PPCTargetLowering::getExceptionPointerRegister(
+Register PPCTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X3 : PPC::R3;
}
-unsigned PPCTargetLowering::getExceptionSelectorRegister(
+Register PPCTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return Subtarget.isPPC64() ? PPC::X4 : PPC::R4;
}
@@ -15342,58 +16453,83 @@ PPCTargetLowering::createFastISel(FunctionLoweringInfo &FuncInfo,
return PPC::createFastISel(FuncInfo, LibInfo);
}
-void PPCTargetLowering::initializeSplitCSR(MachineBasicBlock *Entry) const {
- if (Subtarget.isDarwinABI()) return;
- if (!Subtarget.isPPC64()) return;
-
- // Update IsSplitCSR in PPCFunctionInfo
- PPCFunctionInfo *PFI = Entry->getParent()->getInfo<PPCFunctionInfo>();
- PFI->setIsSplitCSR(true);
+// 'Inverted' means the FMA opcode after negating one multiplicand.
+// For example, (fma -a b c) = (fnmsub a b c)
+static unsigned invertFMAOpcode(unsigned Opc) {
+ switch (Opc) {
+ default:
+ llvm_unreachable("Invalid FMA opcode for PowerPC!");
+ case ISD::FMA:
+ return PPCISD::FNMSUB;
+ case PPCISD::FNMSUB:
+ return ISD::FMA;
+ }
}
-void PPCTargetLowering::insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const {
- const PPCRegisterInfo *TRI = Subtarget.getRegisterInfo();
- const MCPhysReg *IStart = TRI->getCalleeSavedRegsViaCopy(Entry->getParent());
- if (!IStart)
- return;
+SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
+ bool LegalOps, bool OptForSize,
+ NegatibleCost &Cost,
+ unsigned Depth) const {
+ if (Depth > SelectionDAG::MaxRecursionDepth)
+ return SDValue();
- const TargetInstrInfo *TII = Subtarget.getInstrInfo();
- MachineRegisterInfo *MRI = &Entry->getParent()->getRegInfo();
- MachineBasicBlock::iterator MBBI = Entry->begin();
- for (const MCPhysReg *I = IStart; *I; ++I) {
- const TargetRegisterClass *RC = nullptr;
- if (PPC::G8RCRegClass.contains(*I))
- RC = &PPC::G8RCRegClass;
- else if (PPC::F8RCRegClass.contains(*I))
- RC = &PPC::F8RCRegClass;
- else if (PPC::CRRCRegClass.contains(*I))
- RC = &PPC::CRRCRegClass;
- else if (PPC::VRRCRegClass.contains(*I))
- RC = &PPC::VRRCRegClass;
- else
- llvm_unreachable("Unexpected register class in CSRsViaCopy!");
+ unsigned Opc = Op.getOpcode();
+ EVT VT = Op.getValueType();
+ SDNodeFlags Flags = Op.getNode()->getFlags();
+
+ switch (Opc) {
+ case PPCISD::FNMSUB:
+ // TODO: QPX subtarget is deprecated. No transformation here.
+ if (!Op.hasOneUse() || !isTypeLegal(VT) || Subtarget.hasQPX())
+ break;
+
+ const TargetOptions &Options = getTargetMachine().Options;
+ SDValue N0 = Op.getOperand(0);
+ SDValue N1 = Op.getOperand(1);
+ SDValue N2 = Op.getOperand(2);
+ SDLoc Loc(Op);
+
+ NegatibleCost N2Cost = NegatibleCost::Expensive;
+ SDValue NegN2 =
+ getNegatedExpression(N2, DAG, LegalOps, OptForSize, N2Cost, Depth + 1);
+
+ if (!NegN2)
+ return SDValue();
- Register NewVR = MRI->createVirtualRegister(RC);
- // Create copy from CSR to a virtual register.
- // FIXME: this currently does not emit CFI pseudo-instructions, it works
- // fine for CXX_FAST_TLS since the C++-style TLS access functions should be
- // nounwind. If we want to generalize this later, we may need to emit
- // CFI pseudo-instructions.
- assert(Entry->getParent()->getFunction().hasFnAttribute(
- Attribute::NoUnwind) &&
- "Function should be nounwind in insertCopiesSplitCSR!");
- Entry->addLiveIn(*I);
- BuildMI(*Entry, MBBI, DebugLoc(), TII->get(TargetOpcode::COPY), NewVR)
- .addReg(*I);
+ // (fneg (fnmsub a b c)) => (fnmsub (fneg a) b (fneg c))
+ // (fneg (fnmsub a b c)) => (fnmsub a (fneg b) (fneg c))
+  // These transformations may change the sign of zeroes. For example,
+ // -(-ab-(-c))=-0 while -(-(ab-c))=+0 when a=b=c=1.
+ if (Flags.hasNoSignedZeros() || Options.NoSignedZerosFPMath) {
+ // Try and choose the cheaper one to negate.
+ NegatibleCost N0Cost = NegatibleCost::Expensive;
+ SDValue NegN0 = getNegatedExpression(N0, DAG, LegalOps, OptForSize,
+ N0Cost, Depth + 1);
+
+ NegatibleCost N1Cost = NegatibleCost::Expensive;
+ SDValue NegN1 = getNegatedExpression(N1, DAG, LegalOps, OptForSize,
+ N1Cost, Depth + 1);
+
+ if (NegN0 && N0Cost <= N1Cost) {
+ Cost = std::min(N0Cost, N2Cost);
+ return DAG.getNode(Opc, Loc, VT, NegN0, N1, NegN2, Flags);
+ } else if (NegN1) {
+ Cost = std::min(N1Cost, N2Cost);
+ return DAG.getNode(Opc, Loc, VT, N0, NegN1, NegN2, Flags);
+ }
+ }
+
+ // (fneg (fnmsub a b c)) => (fma a b (fneg c))
+ if (isOperationLegal(ISD::FMA, VT)) {
+ Cost = N2Cost;
+ return DAG.getNode(ISD::FMA, Loc, VT, N0, N1, NegN2, Flags);
+ }
- // Insert the copy-back instructions right before the terminator.
- for (auto *Exit : Exits)
- BuildMI(*Exit, Exit->getFirstTerminator(), DebugLoc(),
- TII->get(TargetOpcode::COPY), *I)
- .addReg(NewVR);
+ break;
}
+
+ return TargetLowering::getNegatedExpression(Op, DAG, LegalOps, OptForSize,
+ Cost, Depth);
}
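
The signed-zero caveat noted in the transformation above is easy to reproduce: with a = b = c = 1, the original fneg-of-fnmsub yields +0.0 while the folded form yields -0.0. A small sketch, again modelling fnmsub as -(a*b - c):

#include <cmath>
#include <cstdio>

// Illustrative model of PowerPC fnmsub: -(a*b - c) with a single rounding.
static double fnmsub(double a, double b, double c) {
  return -std::fma(a, b, -c);
}

int main() {
  double a = 1.0, b = 1.0, c = 1.0;
  double original = -fnmsub(a, b, c); // (fneg (fnmsub a b c))        -> +0.0
  double folded = fnmsub(-a, b, -c);  // (fnmsub (fneg a) b (fneg c)) -> -0.0
  std::printf("%d %d\n", std::signbit(original), std::signbit(folded)); // 0 1
}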
// Override to enable LOAD_STACK_GUARD lowering on Linux.
@@ -15421,6 +16557,13 @@ bool PPCTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return false;
case MVT::f32:
case MVT::f64:
+ if (Subtarget.hasPrefixInstrs()) {
+ // With prefixed instructions, we can materialize anything that can be
+ // represented with a 32-bit immediate, not just positive zero.
+ APFloat APFloatOfImm = Imm;
+ return convertToNonDenormSingle(APFloatOfImm);
+ }
+ LLVM_FALLTHROUGH;
case MVT::ppcf128:
return Imm.isPosZero();
}
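
A rough standalone sketch of the materialization test gated on hasPrefixInstrs(); the assumption here is that convertToNonDenormSingle essentially asks whether the value is exactly representable as a non-denormal (or zero) single-precision float, which the real APFloat-based helper implements more carefully:

#include <cmath>
#include <cstdio>

// Sketch: is this double exactly a non-denormal (or zero) single-precision value?
static bool representableAsNormalSingle(double d) {
  float f = static_cast<float>(d);
  bool roundTrips = static_cast<double>(f) == d;
  bool notDenormal = (f == 0.0f) || std::fpclassify(f) == FP_NORMAL;
  return roundTrips && notDenormal;
}

int main() {
  std::printf("%d\n", representableAsNormalSingle(1.5));    // 1: exact in single
  std::printf("%d\n", representableAsNormalSingle(0.1));    // 0: inexact in single
  std::printf("%d\n", representableAsNormalSingle(1e-310)); // 0: underflows single
}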
@@ -15591,10 +16734,59 @@ static SDValue combineADDToADDZE(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Transform
+// (add C1, (MAT_PCREL_ADDR GlobalAddr+C2)) to
+// (MAT_PCREL_ADDR GlobalAddr+(C1+C2))
+// In this case both C1 and C2 must be known constants.
+// C1+C2 must fit into a 34-bit signed integer.
+static SDValue combineADDToMAT_PCREL_ADDR(SDNode *N, SelectionDAG &DAG,
+ const PPCSubtarget &Subtarget) {
+ if (!Subtarget.isUsingPCRelativeCalls())
+ return SDValue();
+
+ // Check both Operand 0 and Operand 1 of the ADD node for the PCRel node.
+ // If we find that node try to cast the Global Address and the Constant.
+ SDValue LHS = N->getOperand(0);
+ SDValue RHS = N->getOperand(1);
+
+ if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
+ std::swap(LHS, RHS);
+
+ if (LHS.getOpcode() != PPCISD::MAT_PCREL_ADDR)
+ return SDValue();
+
+ // Operand zero of PPCISD::MAT_PCREL_ADDR is the GA node.
+ GlobalAddressSDNode *GSDN = dyn_cast<GlobalAddressSDNode>(LHS.getOperand(0));
+ ConstantSDNode* ConstNode = dyn_cast<ConstantSDNode>(RHS);
+
+ // Check that both casts succeeded.
+ if (!GSDN || !ConstNode)
+ return SDValue();
+
+ int64_t NewOffset = GSDN->getOffset() + ConstNode->getSExtValue();
+ SDLoc DL(GSDN);
+
+ // The signed int offset needs to fit in 34 bits.
+ if (!isInt<34>(NewOffset))
+ return SDValue();
+
+ // The new global address is a copy of the old global address except
+ // that it has the updated Offset.
+ SDValue GA =
+ DAG.getTargetGlobalAddress(GSDN->getGlobal(), DL, GSDN->getValueType(0),
+ NewOffset, GSDN->getTargetFlags());
+ SDValue MatPCRel =
+ DAG.getNode(PPCISD::MAT_PCREL_ADDR, DL, GSDN->getValueType(0), GA);
+ return MatPCRel;
+}
+
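
The fold above reduces to a signed 34-bit range check on the combined offset. A small self-contained illustration; fitsInSigned34 is a local stand-in for the isInt<34> test, not code from the patch:

#include <cstdint>
#include <cstdio>

// Local stand-in for a "fits in 34 signed bits" check (the PADDI displacement).
constexpr bool fitsInSigned34(int64_t v) {
  return v >= -(int64_t(1) << 33) && v < (int64_t(1) << 33);
}

int main() {
  int64_t gaOffset = (int64_t(1) << 33) - 8; // C2: offset already on the global
  int64_t addend = 16;                       // C1: constant operand of the ADD
  std::printf("%d\n", fitsInSigned34(gaOffset));          // 1: in range
  std::printf("%d\n", fitsInSigned34(gaOffset + addend)); // 0: fold is skipped
}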
SDValue PPCTargetLowering::combineADD(SDNode *N, DAGCombinerInfo &DCI) const {
if (auto Value = combineADDToADDZE(N, DCI.DAG, Subtarget))
return Value;
+ if (auto Value = combineADDToMAT_PCREL_ADDR(N, DCI.DAG, Subtarget))
+ return Value;
+
return SDValue();
}
@@ -15619,6 +16811,24 @@ SDValue PPCTargetLowering::combineTRUNCATE(SDNode *N,
SDLoc dl(N);
SDValue Op0 = N->getOperand(0);
+ // fold (truncate (abs (sub (zext a), (zext b)))) -> (vabsd a, b)
+ if (Subtarget.hasP9Altivec() && Op0.getOpcode() == ISD::ABS) {
+ EVT VT = N->getValueType(0);
+ if (VT != MVT::v4i32 && VT != MVT::v8i16 && VT != MVT::v16i8)
+ return SDValue();
+ SDValue Sub = Op0.getOperand(0);
+ if (Sub.getOpcode() == ISD::SUB) {
+ SDValue SubOp0 = Sub.getOperand(0);
+ SDValue SubOp1 = Sub.getOperand(1);
+ if ((SubOp0.getOpcode() == ISD::ZERO_EXTEND) &&
+ (SubOp1.getOpcode() == ISD::ZERO_EXTEND)) {
+ return DCI.DAG.getNode(PPCISD::VABSD, dl, VT, SubOp0.getOperand(0),
+ SubOp1.getOperand(0),
+ DCI.DAG.getTargetConstant(0, dl, MVT::i32));
+ }
+ }
+ }
+
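
Per element, the zext/sub/abs/truncate chain computes an unsigned absolute difference. A scalar C++ illustration of why the subtraction must be done in a wider type before taking the absolute value:

#include <cstdint>
#include <cstdio>

// Per-element semantics of the folded pattern: zero-extend, subtract in a
// wider type, take the absolute value, then truncate back to the element type.
static uint8_t absdU8(uint8_t a, uint8_t b) {
  int wide = int(a) - int(b);
  return static_cast<uint8_t>(wide < 0 ? -wide : wide);
}

int main() {
  std::printf("%u\n", unsigned(absdU8(3, 250)));                // 247
  std::printf("%u\n", unsigned(static_cast<uint8_t>(3 - 250))); // 9: i8 subtract wraps
}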
// Looking for a truncate of i128 to i64.
if (Op0.getValueType() != MVT::i128 || N->getValueType(0) != MVT::i64)
return SDValue();
@@ -15673,6 +16883,7 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
// vector 7 2 2
return true;
case PPC::DIR_PWR9:
+ case PPC::DIR_PWR10:
case PPC::DIR_PWR_FUTURE:
// type mul add shl
// scalar 5 2 2
@@ -15734,6 +16945,44 @@ SDValue PPCTargetLowering::combineMUL(SDNode *N, DAGCombinerInfo &DCI) const {
}
}
+// Combine an FMA-like op (such as fnmsub) with fnegs into the appropriate op.
+// Do this in the combiner since we need to check SD flags and other subtarget
+// features.
+SDValue PPCTargetLowering::combineFMALike(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDValue N2 = N->getOperand(2);
+ SDNodeFlags Flags = N->getFlags();
+ EVT VT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+ const TargetOptions &Options = getTargetMachine().Options;
+ unsigned Opc = N->getOpcode();
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOps = !DCI.isBeforeLegalizeOps();
+ SDLoc Loc(N);
+
+ // TODO: QPX subtarget is deprecated. No transformation here.
+ if (Subtarget.hasQPX() || !isOperationLegal(ISD::FMA, VT))
+ return SDValue();
+
+  // Allowing the transformation to FNMSUB may change the sign of zeroes when
+  // ab-c=0, since (fnmsub a b c)=-0 while c-ab=+0.
+ if (!Flags.hasNoSignedZeros() && !Options.NoSignedZerosFPMath)
+ return SDValue();
+
+ // (fma (fneg a) b c) => (fnmsub a b c)
+ // (fnmsub (fneg a) b c) => (fma a b c)
+ if (SDValue NegN0 = getCheaperNegatedExpression(N0, DAG, LegalOps, CodeSize))
+ return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, NegN0, N1, N2, Flags);
+
+ // (fma a (fneg b) c) => (fnmsub a b c)
+ // (fnmsub a (fneg b) c) => (fma a b c)
+ if (SDValue NegN1 = getCheaperNegatedExpression(N1, DAG, LegalOps, CodeSize))
+ return DAG.getNode(invertFMAOpcode(Opc), Loc, VT, N0, NegN1, N2, Flags);
+
+ return SDValue();
+}
+
bool PPCTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
// Only duplicate to increase tail-calls for the 64bit SysV ABIs.
if (!Subtarget.is64BitELFABI())
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h
index e0c381827b87..768eaa43e013 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.h
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h
@@ -85,19 +85,10 @@ namespace llvm {
/// VSFRC that is sign-extended from ByteWidth to a 64-byte integer.
VEXTS,
- /// SExtVElems, takes an input vector of a smaller type and sign
- /// extends to an output vector of a larger type.
- SExtVElems,
-
/// Reciprocal estimate instructions (unary FP ops).
FRE,
FRSQRTE,
- // VMADDFP, VNMSUBFP - The VMADDFP and VNMSUBFP instructions, taking
- // three v4f32 operands and producing a v4f32 result.
- VMADDFP,
- VNMSUBFP,
-
/// VPERM - The PPC VPERM Instruction.
///
VPERM,
@@ -106,6 +97,15 @@ namespace llvm {
///
XXSPLT,
+ /// XXSPLTI_SP_TO_DP - The PPC VSX splat instructions for immediates for
+ /// converting immediate single precision numbers to double precision
+ /// vector or scalar.
+ XXSPLTI_SP_TO_DP,
+
+ /// XXSPLTI32DX - The PPC XXSPLTI32DX instruction.
+ ///
+ XXSPLTI32DX,
+
/// VECINSERT - The PPC vector insert instruction
///
VECINSERT,
@@ -142,6 +142,10 @@ namespace llvm {
/// dynamic alloca.
DYNAREAOFFSET,
+    /// To avoid stack clash, allocation is performed in blocks and each block
+    /// is probed.
+ PROBED_ALLOCA,
+
/// GlobalBaseReg - On Darwin, this node represents the result of the mflr
/// at function entry, used for PIC code.
GlobalBaseReg,
@@ -157,6 +161,9 @@ namespace llvm {
SRA,
SHL,
+ /// FNMSUB - Negated multiply-subtract instruction.
+ FNMSUB,
+
/// EXTSWSLI = The PPC extswsli instruction, which does an extend-sign
/// word and shift left immediate.
EXTSWSLI,
@@ -169,9 +176,11 @@ namespace llvm {
/// CALL - A direct function call.
/// CALL_NOP is a call with the special NOP which follows 64-bit
+  /// CALL_NOTOC is a call in which the caller does not use the TOC, so no
+  /// TOC restore is needed.
/// SVR4 calls and 32-bit/64-bit AIX calls.
CALL,
CALL_NOP,
+ CALL_NOTOC,
/// CHAIN,FLAG = MTCTR(VAL, CHAIN[, INFLAG]) - Directly corresponds to a
/// MTCTR instruction.
@@ -225,6 +234,14 @@ namespace llvm {
/// As with SINT_VEC_TO_FP, used for converting illegal types.
UINT_VEC_TO_FP,
+ /// PowerPC instructions that have SCALAR_TO_VECTOR semantics tend to
+ /// place the value into the least significant element of the most
+ /// significant doubleword in the vector. This is not element zero for
+ /// anything smaller than a doubleword on either endianness. This node has
+ /// the same semantics as SCALAR_TO_VECTOR except that the value remains in
+ /// the aforementioned location in the vector register.
+ SCALAR_TO_VECTOR_PERMUTED,
+
// FIXME: Remove these once the ANDI glue bug is fixed:
/// i1 = ANDI_rec_1_[EQ|GT]_BIT(i32 or i64 x) - Represents the result of the
/// eq or gt bit of CR0 after executing andi. x, 1. This is used to
@@ -430,6 +447,11 @@ namespace llvm {
/// lower (IDX=1) half of v4f32 to v2f64.
FP_EXTEND_HALF,
+ /// MAT_PCREL_ADDR = Materialize a PC Relative address. This can be done
+ /// either through an add like PADDI or through a PC Relative load like
+ /// PLD.
+ MAT_PCREL_ADDR,
+
/// CHAIN = STBRX CHAIN, GPRC, Ptr, Type - This is a
/// byte-swapping store instruction. It byte-swaps the low "Type" bits of
/// the GPRC input, then stores it through Ptr. Type can be either i16 or
@@ -637,7 +659,7 @@ namespace llvm {
/// then the VPERM for the shuffle. All in all a very slow sequence.
TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT)
const override {
- if (VT.getScalarSizeInBits() % 8 == 0)
+ if (VT.getVectorNumElements() != 1 && VT.getScalarSizeInBits() % 8 == 0)
return TypeWidenVector;
return TargetLoweringBase::getPreferredVectorAction(VT);
}
@@ -676,17 +698,9 @@ namespace llvm {
return VT.isScalarInteger();
}
- bool supportSplitCSR(MachineFunction *MF) const override {
- return
- MF->getFunction().getCallingConv() == CallingConv::CXX_FAST_TLS &&
- MF->getFunction().hasFnAttribute(Attribute::NoUnwind);
- }
-
- void initializeSplitCSR(MachineBasicBlock *Entry) const override;
-
- void insertCopiesSplitCSR(
- MachineBasicBlock *Entry,
- const SmallVectorImpl<MachineBasicBlock *> &Exits) const override;
+ SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG, bool LegalOps,
+ bool OptForSize, NegatibleCost &Cost,
+ unsigned Depth = 0) const override;
/// getSetCCResultType - Return the ISD::SETCC ValueType
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
@@ -716,7 +730,7 @@ namespace llvm {
/// Returns false if it can be represented by [r+imm], which are preferred.
bool SelectAddressRegReg(SDValue N, SDValue &Base, SDValue &Index,
SelectionDAG &DAG,
- unsigned EncodingAlignment = 0) const;
+ MaybeAlign EncodingAlignment = None) const;
/// SelectAddressRegImm - Returns true if the address N can be represented
/// by a base register plus a signed 16-bit displacement [r+imm], and if it
@@ -725,13 +739,17 @@ namespace llvm {
/// requirement, i.e. multiples of 4 for DS form.
bool SelectAddressRegImm(SDValue N, SDValue &Disp, SDValue &Base,
SelectionDAG &DAG,
- unsigned EncodingAlignment) const;
+ MaybeAlign EncodingAlignment) const;
/// SelectAddressRegRegOnly - Given the specified addressed, force it to be
/// represented as an indexed [r+r] operation.
bool SelectAddressRegRegOnly(SDValue N, SDValue &Base, SDValue &Index,
SelectionDAG &DAG) const;
+  /// SelectAddressPCRel - Returns true if the address N can be represented
+  /// as a PC-relative reference, i.e. [pc+imm].
+ bool SelectAddressPCRel(SDValue N, SDValue &Base) const;
+
Sched::Preference getSchedulingPreference(SDNode *N) const override;
/// LowerOperation - Provide custom lowering hooks for some operations.
@@ -794,6 +812,13 @@ namespace llvm {
MachineBasicBlock *emitEHSjLjLongJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
+ MachineBasicBlock *emitProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
+
+ bool hasInlineStackProbe(MachineFunction &MF) const override;
+
+ unsigned getStackProbeSize(MachineFunction &MF) const;
+
ConstraintType getConstraintType(StringRef Constraint) const override;
/// Examine constraint string and operand type and determine a weight value.
@@ -879,7 +904,7 @@ namespace llvm {
if (VT != MVT::f32 && VT != MVT::f64)
return false;
- return true;
+ return true;
}
// Returns true if the address of the global is stored in TOC entry.
@@ -892,21 +917,10 @@ namespace llvm {
MachineFunction &MF,
unsigned Intrinsic) const override;
- /// getOptimalMemOpType - Returns the target specific optimal type for load
- /// and store operations as a result of memset, memcpy, and memmove
- /// lowering. If DstAlign is zero that means it's safe to destination
- /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
- /// means there isn't a need to check it against alignment requirement,
- /// probably because the source does not need to be loaded. If 'IsMemset' is
- /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
- /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
- /// source is constant so it does not need to be loaded.
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
- EVT
- getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const override;
+ EVT getOptimalMemOpType(const MemOp &Op,
+ const AttributeList &FuncAttributes) const override;
/// Is unaligned memory access allowed for the given type, and is it fast
/// relative to software emulation.
@@ -922,6 +936,14 @@ namespace llvm {
bool isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
EVT VT) const override;
+ bool isFMAFasterThanFMulAndFAdd(const Function &F, Type *Ty) const override;
+
+ /// isProfitableToHoist - Check if it is profitable to hoist instruction
+ /// \p I to its dominator block.
+  /// For example, it is not profitable if \p I and its only user can form an
+  /// FMA instruction, because PowerPC prefers FMADD.
+ bool isProfitableToHoist(Instruction *I) const override;
+
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
// Should we expand the build vector with shuffles?
@@ -950,14 +972,19 @@ namespace llvm {
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
+ /// isMulhCheaperThanMulShift - Return true if a mulh[s|u] node for a
+ /// specific type is cheaper than a multiply followed by a shift.
+ /// This is true for words and doublewords on 64-bit PowerPC.
+ bool isMulhCheaperThanMulShift(EVT Type) const override;
+
/// Override to support customized stack guard loading.
bool useLoadStackGuardNode() const override;
void insertSSPDeclarations(Module &M) const override;
@@ -973,6 +1000,24 @@ namespace llvm {
unsigned JTI,
MCContext &Ctx) const override;
+ /// Structure that collects some common arguments that get passed around
+ /// between the functions for call lowering.
+ struct CallFlags {
+ const CallingConv::ID CallConv;
+ const bool IsTailCall : 1;
+ const bool IsVarArg : 1;
+ const bool IsPatchPoint : 1;
+ const bool IsIndirect : 1;
+ const bool HasNest : 1;
+ const bool NoMerge : 1;
+
+ CallFlags(CallingConv::ID CC, bool IsTailCall, bool IsVarArg,
+ bool IsPatchPoint, bool IsIndirect, bool HasNest, bool NoMerge)
+ : CallConv(CC), IsTailCall(IsTailCall), IsVarArg(IsVarArg),
+ IsPatchPoint(IsPatchPoint), IsIndirect(IsIndirect),
+ HasNest(HasNest), NoMerge(NoMerge) {}
+ };
+
private:
struct ReuseLoadInfo {
SDValue Ptr;
@@ -981,7 +1026,7 @@ namespace llvm {
MachinePointerInfo MPI;
bool IsDereferenceable = false;
bool IsInvariant = false;
- unsigned Alignment = 0;
+ Align Alignment;
AAMDNodes AAInfo;
const MDNode *Ranges = nullptr;
@@ -1032,15 +1077,10 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
SelectionDAG& DAG) const;
- bool
- IsEligibleForTailCallOptimization_64SVR4(
- SDValue Callee,
- CallingConv::ID CalleeCC,
- ImmutableCallSite CS,
- bool isVarArg,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- SelectionDAG& DAG) const;
+ bool IsEligibleForTailCallOptimization_64SVR4(
+ SDValue Callee, CallingConv::ID CalleeCC, const CallBase *CB,
+ bool isVarArg, const SmallVectorImpl<ISD::OutputArg> &Outs,
+ const SmallVectorImpl<ISD::InputArg> &Ins, SelectionDAG &DAG) const;
SDValue EmitTailCallLoadFPAndRetAddr(SelectionDAG &DAG, int SPDiff,
SDValue Chain, SDValue &LROpOut,
@@ -1083,7 +1123,6 @@ namespace llvm {
SDValue LowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerINTRINSIC_VOID(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerREM(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBSWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_CMP_SWAP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
@@ -1091,6 +1130,7 @@ namespace llvm {
SDValue LowerMUL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerABS(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerROTL(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorLoad(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVectorStore(SDValue Op, SelectionDAG &DAG) const;
@@ -1100,15 +1140,14 @@ namespace llvm {
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const;
- SDValue FinishCall(CallingConv::ID CallConv, const SDLoc &dl,
- bool isTailCall, bool isVarArg, bool isPatchPoint,
- bool hasNest, SelectionDAG &DAG,
+
+ SDValue FinishCall(CallFlags CFlags, const SDLoc &dl, SelectionDAG &DAG,
SmallVector<std::pair<unsigned, SDValue>, 8> &RegsToPass,
SDValue InFlag, SDValue Chain, SDValue CallSeqStart,
SDValue &Callee, int SPDiff, unsigned NumBytes,
const SmallVectorImpl<ISD::InputArg> &Ins,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const;
+ const CallBase *CB) const;
SDValue
LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
@@ -1155,42 +1194,34 @@ namespace llvm {
ISD::ArgFlagsTy Flags, SelectionDAG &DAG,
const SDLoc &dl) const;
- SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ SDValue LowerCall_Darwin(SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const;
- SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ const CallBase *CB) const;
+ SDValue LowerCall_64SVR4(SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const;
- SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ const CallBase *CB) const;
+ SDValue LowerCall_32SVR4(SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const;
- SDValue LowerCall_AIX(SDValue Chain, SDValue Callee,
- CallingConv::ID CallConv, bool isVarArg,
- bool isTailCall, bool isPatchPoint,
+ const CallBase *CB) const;
+ SDValue LowerCall_AIX(SDValue Chain, SDValue Callee, CallFlags CFlags,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SmallVectorImpl<ISD::InputArg> &Ins,
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals,
- ImmutableCallSite CS) const;
+ const CallBase *CB) const;
SDValue lowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const;
@@ -1206,10 +1237,13 @@ namespace llvm {
SDValue combineSRL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineMUL(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineADD(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineFMALike(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineTRUNCATE(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSetCC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineABS(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineVSelect(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineVectorShuffle(ShuffleVectorSDNode *SVN,
+ SelectionDAG &DAG) const;
SDValue combineVReverseMemOP(ShuffleVectorSDNode *SVN, LSBaseSDNode *LSBase,
DAGCombinerInfo &DCI) const;
@@ -1240,6 +1274,10 @@ namespace llvm {
/// essentially v16i8 vector version of VINSERTH.
SDValue lowerToVINSERTB(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+ /// lowerToXXSPLTI32DX - Return the SDValue if this VECTOR_SHUFFLE can be
+ /// handled by the XXSPLTI32DX instruction introduced in ISA 3.1.
+ SDValue lowerToXXSPLTI32DX(ShuffleVectorSDNode *N, SelectionDAG &DAG) const;
+
// Return whether the call instruction can potentially be optimized to a
// tail call. This will cause the optimizers to attempt to move, or
// duplicate return instructions to help enable tail call optimizations.
@@ -1258,6 +1296,9 @@ namespace llvm {
bool isIntS16Immediate(SDNode *N, int16_t &Imm);
bool isIntS16Immediate(SDValue Op, int16_t &Imm);
+ bool convertToNonDenormSingle(APInt &ArgAPInt);
+ bool convertToNonDenormSingle(APFloat &ArgAPFloat);
+
} // end namespace llvm
#endif // LLVM_TARGET_POWERPC_PPC32ISELLOWERING_H
diff --git a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
index 43431a1e0069..1c457d4170d5 100644
--- a/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
+++ b/llvm/lib/Target/PowerPC/PPCInstr64Bit.td
@@ -140,6 +140,15 @@ let isCall = 1, PPC970_Unit = 7, Defs = [LR8] in {
(outs), (ins abscalltarget:$func),
"bla $func\n\tnop", IIC_BrB,
[(PPCcall_nop (i64 imm:$func))]>;
+ let Predicates = [PCRelativeMemops] in {
+    // BL8_NOTOC means that the caller does not use the TOC pointer; if it
+    // does use R2, it is just a caller-saved register. Therefore it is
+ // safe to emit only the bl and not the nop for this instruction. The
+ // linker will not try to restore R2 after the call.
+ def BL8_NOTOC : IForm<18, 0, 1, (outs),
+ (ins calltarget:$func),
+ "bl $func", IIC_BrB, []>;
+ }
}
let Uses = [CTR8, RM] in {
let isPredicable = 1 in
@@ -194,6 +203,11 @@ def : Pat<(PPCcall (i64 texternalsym:$dst)),
def : Pat<(PPCcall_nop (i64 texternalsym:$dst)),
(BL8_NOP texternalsym:$dst)>;
+def : Pat<(PPCcall_notoc (i64 tglobaladdr:$dst)),
+ (BL8_NOTOC tglobaladdr:$dst)>;
+def : Pat<(PPCcall_notoc (i64 texternalsym:$dst)),
+ (BL8_NOTOC texternalsym:$dst)>;
+
// Calls for AIX
def : Pat<(PPCcall (i64 mcsym:$dst)),
(BL8 mcsym:$dst)>;
@@ -411,6 +425,19 @@ def DYNALLOC8 : PPCEmitTimePseudo<(outs g8rc:$result), (ins g8rc:$negsize, memri
(PPCdynalloc i64:$negsize, iaddr:$fpsi))]>;
def DYNAREAOFFSET8 : PPCEmitTimePseudo<(outs i64imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET8",
[(set i64:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
+// Probed alloca to support stack clash protection.
+let Defs = [X1], Uses = [X1], hasNoSchedulingInfo = 1 in {
+def PROBED_ALLOCA_64 : PPCCustomInserterPseudo<(outs g8rc:$result),
+ (ins g8rc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_64",
+ [(set i64:$result,
+ (PPCprobedalloca i64:$negsize, iaddr:$fpsi))]>;
+def PREPARE_PROBED_ALLOCA_64 : PPCEmitTimePseudo<(outs g8rc:$fp,
+ g8rc:$sp),
+ (ins g8rc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_64", []>;
+def PROBED_STACKALLOC_64 : PPCEmitTimePseudo<(outs g8rc:$scratch, g8rc:$temp),
+ (ins i64imm:$stacksize),
+ "#PROBED_STACKALLOC_64", []>;
+}
let hasSideEffects = 0 in {
let Defs = [LR8] in {
@@ -772,8 +799,9 @@ def POPCNTW : XForm_11<31, 378, (outs gprc:$rA), (ins gprc:$rS),
"popcntw $rA, $rS", IIC_IntGeneral,
[(set i32:$rA, (ctpop i32:$rS))]>;
-def POPCNTB : XForm_11<31, 122, (outs gprc:$rA), (ins gprc:$rS),
- "popcntb $rA, $rS", IIC_IntGeneral, []>;
+def POPCNTB : XForm_11<31, 122, (outs g8rc:$rA), (ins g8rc:$rS),
+ "popcntb $rA, $rS", IIC_IntGeneral,
+ [(set i64:$rA, (int_ppc_popcntb i64:$rS))]>;
defm DIVD : XOForm_1rcr<31, 489, 0, (outs g8rc:$rT), (ins g8rc:$rA, g8rc:$rB),
"divd", "$rT, $rA, $rB", IIC_IntDivD,
@@ -909,6 +937,104 @@ def ISEL8 : AForm_4<31, 15,
} // hasSideEffects = 0
} // End FXU Operations.
+def : InstAlias<"li $rD, $imm", (ADDI8 g8rc:$rD, ZERO8, s16imm64:$imm)>;
+def : InstAlias<"lis $rD, $imm", (ADDIS8 g8rc:$rD, ZERO8, s17imm64:$imm)>;
+
+def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mr. $rA, $rB", (OR8_rec g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"not. $rA, $rB", (NOR8_rec g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+
+def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
+
+def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8_rec g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8_rec g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
+
+def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM8 g8rc:$rA, g8rc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINM8_rec g8rc:$rA, g8rc:$rS, u5imm:$n, 0, 31)>;
+def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM8 g8rc:$rA, g8rc:$rS, g8rc:$rB, 0, 31)>;
+def : InstAlias<"rotlw. $rA, $rS, $rB", (RLWNM8_rec g8rc:$rA, g8rc:$rS, g8rc:$rB, 0, 31)>;
+def : InstAlias<"clrlwi $rA, $rS, $n", (RLWINM8 g8rc:$rA, g8rc:$rS, 0, u5imm:$n, 31)>;
+def : InstAlias<"clrlwi. $rA, $rS, $n", (RLWINM8_rec g8rc:$rA, g8rc:$rS, 0, u5imm:$n, 31)>;
+
+def : InstAlias<"isellt $rT, $rA, $rB",
+ (ISEL8 g8rc:$rT, g8rc_nox0:$rA, g8rc:$rB, CR0LT)>;
+def : InstAlias<"iselgt $rT, $rA, $rB",
+ (ISEL8 g8rc:$rT, g8rc_nox0:$rA, g8rc:$rB, CR0GT)>;
+def : InstAlias<"iseleq $rT, $rA, $rB",
+ (ISEL8 g8rc:$rT, g8rc_nox0:$rA, g8rc:$rB, CR0EQ)>;
+
+def : InstAlias<"nop", (ORI8 X0, X0, 0)>;
+def : InstAlias<"xnop", (XORI8 X0, X0, 0)>;
+
+def : InstAlias<"cntlzw $rA, $rS", (CNTLZW8 g8rc:$rA, g8rc:$rS)>;
+def : InstAlias<"cntlzw. $rA, $rS", (CNTLZW8_rec g8rc:$rA, g8rc:$rS)>;
+
+def : InstAlias<"mtxer $Rx", (MTSPR8 1, g8rc:$Rx)>;
+def : InstAlias<"mfxer $Rx", (MFSPR8 g8rc:$Rx, 1)>;
+
+def : InstAlias<"mtudscr $Rx", (MTSPR8 3, g8rc:$Rx)>;
+def : InstAlias<"mfudscr $Rx", (MFSPR8 g8rc:$Rx, 3)>;
+
+def : InstAlias<"mfrtcu $Rx", (MFSPR8 g8rc:$Rx, 4)>;
+def : InstAlias<"mfrtcl $Rx", (MFSPR8 g8rc:$Rx, 5)>;
+
+def : InstAlias<"mtlr $Rx", (MTSPR8 8, g8rc:$Rx)>;
+def : InstAlias<"mflr $Rx", (MFSPR8 g8rc:$Rx, 8)>;
+
+def : InstAlias<"mtctr $Rx", (MTSPR8 9, g8rc:$Rx)>;
+def : InstAlias<"mfctr $Rx", (MFSPR8 g8rc:$Rx, 9)>;
+
+def : InstAlias<"mtuamr $Rx", (MTSPR8 13, g8rc:$Rx)>;
+def : InstAlias<"mfuamr $Rx", (MFSPR8 g8rc:$Rx, 13)>;
+
+def : InstAlias<"mtdscr $Rx", (MTSPR8 17, g8rc:$Rx)>;
+def : InstAlias<"mfdscr $Rx", (MFSPR8 g8rc:$Rx, 17)>;
+
+def : InstAlias<"mtdsisr $Rx", (MTSPR8 18, g8rc:$Rx)>;
+def : InstAlias<"mfdsisr $Rx", (MFSPR8 g8rc:$Rx, 18)>;
+
+def : InstAlias<"mtdar $Rx", (MTSPR8 19, g8rc:$Rx)>;
+def : InstAlias<"mfdar $Rx", (MFSPR8 g8rc:$Rx, 19)>;
+
+def : InstAlias<"mtdec $Rx", (MTSPR8 22, g8rc:$Rx)>;
+def : InstAlias<"mfdec $Rx", (MFSPR8 g8rc:$Rx, 22)>;
+
+def : InstAlias<"mtsdr1 $Rx", (MTSPR8 25, g8rc:$Rx)>;
+def : InstAlias<"mfsdr1 $Rx", (MFSPR8 g8rc:$Rx, 25)>;
+
+def : InstAlias<"mtsrr0 $Rx", (MTSPR8 26, g8rc:$Rx)>;
+def : InstAlias<"mfsrr0 $Rx", (MFSPR8 g8rc:$Rx, 26)>;
+
+def : InstAlias<"mtsrr1 $Rx", (MTSPR8 27, g8rc:$Rx)>;
+def : InstAlias<"mfsrr1 $Rx", (MFSPR8 g8rc:$Rx, 27)>;
+
+def : InstAlias<"mtcfar $Rx", (MTSPR8 28, g8rc:$Rx)>;
+def : InstAlias<"mfcfar $Rx", (MFSPR8 g8rc:$Rx, 28)>;
+
+def : InstAlias<"mtamr $Rx", (MTSPR8 29, g8rc:$Rx)>;
+def : InstAlias<"mfamr $Rx", (MFSPR8 g8rc:$Rx, 29)>;
+
+foreach SPRG = 0-3 in {
+ def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR8 g8rc:$RT, !add(SPRG, 272))>;
+ def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR8 g8rc:$RT, !add(SPRG, 272))>;
+  def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>;
+  def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR8 !add(SPRG, 272), g8rc:$RT)>;
+}
+
+def : InstAlias<"mfasr $RT", (MFSPR8 g8rc:$RT, 280)>;
+def : InstAlias<"mtasr $RT", (MTSPR8 280, g8rc:$RT)>;
+
+def : InstAlias<"mttbl $Rx", (MTSPR8 284, g8rc:$Rx)>;
+def : InstAlias<"mttbu $Rx", (MTSPR8 285, g8rc:$Rx)>;
+
+def : InstAlias<"mfpvr $RT", (MFSPR8 g8rc:$RT, 287)>;
+
+def : InstAlias<"mfspefscr $Rx", (MFSPR8 g8rc:$Rx, 512)>;
+def : InstAlias<"mtspefscr $Rx", (MTSPR8 512, g8rc:$Rx)>;
//===----------------------------------------------------------------------===//
// Load/Store instructions.
@@ -1110,7 +1236,7 @@ def ADDISgotTprelHA: PPCEmitTimePseudo<(outs g8rc:$rD), (ins g8rc_nox0:$reg, s16
(PPCaddisGotTprelHA i64:$reg,
tglobaltlsaddr:$disp))]>,
isPPC64;
-def LDgotTprelL: PPCEmitTimePseudo<(outs g8rc:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
+def LDgotTprelL: PPCEmitTimePseudo<(outs g8rc_nox0:$rD), (ins s16imm64:$disp, g8rc_nox0:$reg),
"#LDgotTprelL",
[(set i64:$rD,
(PPCldGotTprelL tglobaltlsaddr:$disp, i64:$reg))]>,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
index f94816a35f79..920eeed9d41f 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrAltivec.td
@@ -341,7 +341,7 @@ class VXCR_Int_Ty<bits<11> xo, string opc, Intrinsic IntID, ValueType Ty>
//===----------------------------------------------------------------------===//
// Instruction Definitions.
-def HasAltivec : Predicate<"PPCSubTarget->hasAltivec()">;
+def HasAltivec : Predicate<"Subtarget->hasAltivec()">;
let Predicates = [HasAltivec] in {
def DSS : DSS_Form<0, 822, (outs), (ins u5imm:$STRM),
@@ -491,7 +491,7 @@ let isCommutable = 1 in {
def VADDFP : VXForm_1<10, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vaddfp $vD, $vA, $vB", IIC_VecFP,
[(set v4f32:$vD, (fadd v4f32:$vA, v4f32:$vB))]>;
-
+
def VADDUBM : VXForm_1<0, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vaddubm $vD, $vA, $vB", IIC_VecGeneral,
[(set v16i8:$vD, (add v16i8:$vA, v16i8:$vB))]>;
@@ -501,7 +501,7 @@ def VADDUHM : VXForm_1<64, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
def VADDUWM : VXForm_1<128, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vadduwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (add v4i32:$vA, v4i32:$vB))]>;
-
+
def VADDCUW : VX1_Int_Ty<384, "vaddcuw", int_ppc_altivec_vaddcuw, v4i32>;
def VADDSBS : VX1_Int_Ty<768, "vaddsbs", int_ppc_altivec_vaddsbs, v16i8>;
def VADDSHS : VX1_Int_Ty<832, "vaddshs", int_ppc_altivec_vaddshs, v8i16>;
@@ -635,7 +635,7 @@ def VMULOUB : VX1_Int_Ty2< 8, "vmuloub", int_ppc_altivec_vmuloub,
def VMULOUH : VX1_Int_Ty2< 72, "vmulouh", int_ppc_altivec_vmulouh,
v4i32, v8i16>;
} // isCommutable
-
+
def VREFP : VX2_Int_SP<266, "vrefp", int_ppc_altivec_vrefp>;
def VRFIM : VX2_Int_SP<714, "vrfim", int_ppc_altivec_vrfim>;
def VRFIN : VX2_Int_SP<522, "vrfin", int_ppc_altivec_vrfin>;
@@ -657,7 +657,7 @@ def VSUBUHM : VXForm_1<1088, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
def VSUBUWM : VXForm_1<1152, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vsubuwm $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (sub v4i32:$vA, v4i32:$vB))]>;
-
+
def VSUBSBS : VX1_Int_Ty<1792, "vsubsbs" , int_ppc_altivec_vsubsbs, v16i8>;
def VSUBSHS : VX1_Int_Ty<1856, "vsubshs" , int_ppc_altivec_vsubshs, v8i16>;
def VSUBSWS : VX1_Int_Ty<1920, "vsubsws" , int_ppc_altivec_vsubsws, v4i32>;
@@ -869,6 +869,26 @@ def : Pat<(v8i16 (rotl v8i16:$vA, v8i16:$vB)),
def : Pat<(v4i32 (rotl v4i32:$vA, v4i32:$vB)),
(v4i32 (VRLW v4i32:$vA, v4i32:$vB))>;
+// Multiply
+def : Pat<(mul v8i16:$vA, v8i16:$vB), (VMLADDUHM $vA, $vB, (v8i16(V_SET0H)))>;
+
+// Add
+def : Pat<(add (mul v8i16:$vA, v8i16:$vB), v8i16:$vC), (VMLADDUHM $vA, $vB, $vC)>;
+
+// Saturating adds/subtracts.
+def : Pat<(v16i8 (saddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDSBS $vA, $vB))>;
+def : Pat<(v16i8 (uaddsat v16i8:$vA, v16i8:$vB)), (v16i8 (VADDUBS $vA, $vB))>;
+def : Pat<(v8i16 (saddsat v8i16:$vA, v8i16:$vB)), (v8i16 (VADDSHS $vA, $vB))>;
+def : Pat<(v8i16 (uaddsat v8i16:$vA, v8i16:$vB)), (v8i16 (VADDUHS $vA, $vB))>;
+def : Pat<(v4i32 (saddsat v4i32:$vA, v4i32:$vB)), (v4i32 (VADDSWS $vA, $vB))>;
+def : Pat<(v4i32 (uaddsat v4i32:$vA, v4i32:$vB)), (v4i32 (VADDUWS $vA, $vB))>;
+def : Pat<(v16i8 (ssubsat v16i8:$vA, v16i8:$vB)), (v16i8 (VSUBSBS $vA, $vB))>;
+def : Pat<(v16i8 (usubsat v16i8:$vA, v16i8:$vB)), (v16i8 (VSUBUBS $vA, $vB))>;
+def : Pat<(v8i16 (ssubsat v8i16:$vA, v8i16:$vB)), (v8i16 (VSUBSHS $vA, $vB))>;
+def : Pat<(v8i16 (usubsat v8i16:$vA, v8i16:$vB)), (v8i16 (VSUBUHS $vA, $vB))>;
+def : Pat<(v4i32 (ssubsat v4i32:$vA, v4i32:$vB)), (v4i32 (VSUBSWS $vA, $vB))>;
+def : Pat<(v4i32 (usubsat v4i32:$vA, v4i32:$vB)), (v4i32 (VSUBUWS $vA, $vB))>;
+
// Loads.
def : Pat<(v4i32 (load xoaddr:$src)), (LVX xoaddr:$src)>;
@@ -1002,14 +1022,9 @@ def : Pat<(and v4i32:$A, (vnot_ppc v4i32:$B)),
def : Pat<(fmul v4f32:$vA, v4f32:$vB),
(VMADDFP $vA, $vB,
- (v4i32 (VSLW (v4i32 (V_SETALLONES)), (v4i32 (V_SETALLONES)))))>;
+ (v4i32 (VSLW (v4i32 (V_SETALLONES)), (v4i32 (V_SETALLONES)))))>;
-// Fused multiply add and multiply sub for packed float. These are represented
-// separately from the real instructions above, for operations that must have
-// the additional precision, such as Newton-Rhapson (used by divide, sqrt)
-def : Pat<(PPCvmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
- (VMADDFP $A, $B, $C)>;
-def : Pat<(PPCvnmsubfp v4f32:$A, v4f32:$B, v4f32:$C),
+def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C),
(VNMSUBFP $A, $B, $C)>;
def : Pat<(int_ppc_altivec_vmaddfp v4f32:$A, v4f32:$B, v4f32:$C),
@@ -1121,8 +1136,8 @@ def : Pat<(v16i8 (srl (sub v16i8:$vA, (v16i8 (bitconvert(vnot_ppc v4i32:$vB)))),
} // end HasAltivec
-def HasP8Altivec : Predicate<"PPCSubTarget->hasP8Altivec()">;
-def HasP8Crypto : Predicate<"PPCSubTarget->hasP8Crypto()">;
+def HasP8Altivec : Predicate<"Subtarget->hasP8Altivec()">;
+def HasP8Crypto : Predicate<"Subtarget->hasP8Crypto()">;
let Predicates = [HasP8Altivec] in {
let isCommutable = 1 in {
@@ -1143,7 +1158,7 @@ def VMINSD : VX1_Int_Ty<962, "vminsd", int_ppc_altivec_vminsd, v2i64>;
def VMINUD : VX1_Int_Ty<706, "vminud", int_ppc_altivec_vminud, v2i64>;
} // isCommutable
-// Vector merge
+// Vector merge
def VMRGEW : VXForm_1<1932, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"vmrgew $vD, $vA, $vB", IIC_VecFP,
[(set v16i8:$vD,
@@ -1251,16 +1266,16 @@ def VPOPCNTD : VXForm_2<1987, (outs vrrc:$vD), (ins vrrc:$vB),
[(set v2i64:$vD, (ctpop v2i64:$vB))]>;
let isCommutable = 1 in {
-// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the
+// FIXME: Use AddedComplexity > 400 to ensure these patterns match before the
// VSX equivalents. We need to fix this up at some point. Two possible
// solutions for this problem:
// 1. Disable Altivec patterns that compete with VSX patterns using the
-// !HasVSX predicate. This essentially favours VSX over Altivec, in
-// hopes of reducing register pressure (larger register set using VSX
+// !HasVSX predicate. This essentially favours VSX over Altivec, in
+// hopes of reducing register pressure (larger register set using VSX
// instructions than VMX instructions)
// 2. Employ a more disciplined use of AddedComplexity, which would provide
// more fine-grained control than option 1. This would be beneficial
-// if we find situations where Altivec is really preferred over VSX.
+// if we find situations where Altivec is really preferred over VSX.
def VEQV : VXForm_1<1668, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
"veqv $vD, $vA, $vB", IIC_VecGeneral,
[(set v4i32:$vD, (vnot_ppc (xor v4i32:$vA, v4i32:$vB)))]>;
@@ -1339,9 +1354,13 @@ def VSBOX : VXBX_Int_Ty<1480, "vsbox", int_ppc_altivec_crypto_vsbox, v2i64>;
} // HasP8Crypto
// The following altivec instructions were introduced in Power ISA 3.0
-def HasP9Altivec : Predicate<"PPCSubTarget->hasP9Altivec()">;
+def HasP9Altivec : Predicate<"Subtarget->hasP9Altivec()">;
let Predicates = [HasP9Altivec] in {
+// Vector Multiply-Sum
+def VMSUMUDM : VA1a_Int_Ty3<35, "vmsumudm", int_ppc_altivec_vmsumudm,
+ v1i128, v2i64, v1i128>;
+
// i8 element comparisons.
def VCMPNEB : VCMP < 7, "vcmpneb $vD, $vA, $vB" , v16i8>;
def VCMPNEB_rec : VCMPo < 7, "vcmpneb. $vD, $vA, $vB" , v16i8>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrFormats.td b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
index 115bd44ea202..632d4d9deb8a 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrFormats.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrFormats.td
@@ -39,7 +39,11 @@ class I<bits<6> opcode, dag OOL, dag IOL, string asmstr, InstrItinClass itin>
// Indicate that this instruction is of type X-Form Load or Store
bits<1> XFormMemOp = 0;
- let TSFlags{7} = XFormMemOp;
+ let TSFlags{6} = XFormMemOp;
+
+ // Indicate that this instruction is prefixed.
+ bits<1> Prefixed = 0;
+ let TSFlags{7} = Prefixed;
// Fields used for relation models.
string BaseName = "";
diff --git a/llvm/lib/Target/PowerPC/PPCInstrHTM.td b/llvm/lib/Target/PowerPC/PPCInstrHTM.td
index 6cbf999ca73d..992ad8216f3b 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrHTM.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrHTM.td
@@ -13,7 +13,7 @@
-def HasHTM : Predicate<"PPCSubTarget->hasHTM()">;
+def HasHTM : Predicate<"Subtarget->hasHTM()">;
def HTM_get_imm : SDNodeXForm<imm, [{
return getI32Imm (N->getZExtValue(), SDLoc(N));
@@ -169,3 +169,8 @@ def : Pat<(i64 (int_ppc_ttest)),
36, 28)>;
} // [HasHTM]
+
+def : InstAlias<"tend.", (TEND 0)>, Requires<[HasHTM]>;
+def : InstAlias<"tendall.", (TEND 1)>, Requires<[HasHTM]>;
+def : InstAlias<"tsuspend.", (TSR 0)>, Requires<[HasHTM]>;
+def : InstAlias<"tresume.", (TSR 1)>, Requires<[HasHTM]>;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
index 30906a32b00c..11c97210ead9 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp
@@ -19,6 +19,7 @@
#include "PPCTargetMachine.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
@@ -72,27 +73,6 @@ static cl::opt<bool>
UseOldLatencyCalc("ppc-old-latency-calc", cl::Hidden,
cl::desc("Use the old (incorrect) instruction latency calculation"));
-// Index into the OpcodesForSpill array.
-enum SpillOpcodeKey {
- SOK_Int4Spill,
- SOK_Int8Spill,
- SOK_Float8Spill,
- SOK_Float4Spill,
- SOK_CRSpill,
- SOK_CRBitSpill,
- SOK_VRVectorSpill,
- SOK_VSXVectorSpill,
- SOK_VectorFloat8Spill,
- SOK_VectorFloat4Spill,
- SOK_VRSaveSpill,
- SOK_QuadFloat8Spill,
- SOK_QuadFloat4Spill,
- SOK_QuadBitSpill,
- SOK_SpillToVSR,
- SOK_SPESpill,
- SOK_LastOpcodeSpill // This must be last on the enum.
-};
-
// Pin the vtable to this file.
void PPCInstrInfo::anchor() {}
@@ -225,13 +205,42 @@ int PPCInstrInfo::getOperandLatency(const InstrItineraryData *ItinData,
return Latency;
}
+/// This is an architecture-specific helper function of reassociateOps.
+/// Set special operand attributes for new instructions after reassociation.
+void PPCInstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
+ MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const {
+ // Propagate FP flags from the original instructions.
+ // But clear poison-generating flags because those may not be valid now.
+ uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+ NewMI1.setFlags(IntersectedFlags);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+ NewMI2.setFlags(IntersectedFlags);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+}
+
+void PPCInstrInfo::setSpecialOperandAttr(MachineInstr &MI,
+ uint16_t Flags) const {
+ MI.setFlags(Flags);
+ MI.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ MI.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ MI.clearFlag(MachineInstr::MIFlag::IsExact);
+}
+
// This function does not list all associative and commutative operations, but
// only those worth feeding through the machine combiner in an attempt to
// reduce the critical path. Mostly, this means floating-point operations,
-// because they have high latencies (compared to other operations, such and
+// because they have high latencies (>= 5) (compared to other operations, such as
// and/or, which are also associative and commutative, but have low latencies).
bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
switch (Inst.getOpcode()) {
+ // Floating point:
// FP Add:
case PPC::FADD:
case PPC::FADDS:
@@ -258,12 +267,157 @@ bool PPCInstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case PPC::QVFMUL:
case PPC::QVFMULS:
case PPC::QVFMULSs:
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
+ // Fixed point:
+ // Multiply:
+ case PPC::MULHD:
+ case PPC::MULLD:
+ case PPC::MULHW:
+ case PPC::MULLW:
return true;
default:
return false;
}
}
+#define InfoArrayIdxFMAInst 0
+#define InfoArrayIdxFAddInst 1
+#define InfoArrayIdxFMULInst 2
+#define InfoArrayIdxAddOpIdx 3
+#define InfoArrayIdxMULOpIdx 4
+// Array keeps info for FMA instructions:
+// Index 0(InfoArrayIdxFMAInst): FMA instruction;
+// Index 1(InfoArrayIdxFAddInst): ADD instruction associated with FMA;
+// Index 2(InfoArrayIdxFMULInst): MUL instruction associated with FMA;
+// Index 3(InfoArrayIdxAddOpIdx): ADD operand index in FMA operands;
+// Index 4(InfoArrayIdxMULOpIdx): first MUL operand index in FMA operands;
+// second MUL operand index is plus 1.
+static const uint16_t FMAOpIdxInfo[][5] = {
+ // FIXME: Add more FMA instructions like XSNMADDADP and so on.
+ {PPC::XSMADDADP, PPC::XSADDDP, PPC::XSMULDP, 1, 2},
+ {PPC::XSMADDASP, PPC::XSADDSP, PPC::XSMULSP, 1, 2},
+ {PPC::XVMADDADP, PPC::XVADDDP, PPC::XVMULDP, 1, 2},
+ {PPC::XVMADDASP, PPC::XVADDSP, PPC::XVMULSP, 1, 2},
+ {PPC::FMADD, PPC::FADD, PPC::FMUL, 3, 1},
+ {PPC::FMADDS, PPC::FADDS, PPC::FMULS, 3, 1},
+ {PPC::QVFMADDSs, PPC::QVFADDSs, PPC::QVFMULSs, 3, 1},
+ {PPC::QVFMADD, PPC::QVFADD, PPC::QVFMUL, 3, 1}};
+
+// Check if an opcode is an FMA instruction. If it is, return the index in array
+// FMAOpIdxInfo. Otherwise, return -1.
+int16_t PPCInstrInfo::getFMAOpIdxInfo(unsigned Opcode) const {
+ for (unsigned I = 0; I < array_lengthof(FMAOpIdxInfo); I++)
+ if (FMAOpIdxInfo[I][InfoArrayIdxFMAInst] == Opcode)
+ return I;
+ return -1;
+}
+
+// Try to reassociate FMA chains like below:
+//
+// Pattern 1:
+// A = FADD X, Y (Leaf)
+// B = FMA A, M21, M22 (Prev)
+// C = FMA B, M31, M32 (Root)
+// -->
+// A = FMA X, M21, M22
+// B = FMA Y, M31, M32
+// C = FADD A, B
+//
+// Pattern 2:
+// A = FMA X, M11, M12 (Leaf)
+// B = FMA A, M21, M22 (Prev)
+// C = FMA B, M31, M32 (Root)
+// -->
+// A = FMUL M11, M12
+// B = FMA X, M21, M22
+// D = FMA A, M31, M32
+// C = FADD B, D
+//
+// This breaks the dependency between A and B, allowing the FMAs to be executed
+// in parallel (or back-to-back in a pipeline) instead of serially.
+bool PPCInstrInfo::getFMAPatterns(
+ MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
+ MachineBasicBlock *MBB = Root.getParent();
+ const MachineRegisterInfo &MRI = MBB->getParent()->getRegInfo();
+
+ auto IsAllOpsVirtualReg = [](const MachineInstr &Instr) {
+ for (const auto &MO : Instr.explicit_operands())
+ if (!(MO.isReg() && Register::isVirtualRegister(MO.getReg())))
+ return false;
+ return true;
+ };
+
+ auto IsReassociable = [&](const MachineInstr &Instr, int16_t &AddOpIdx,
+ bool IsLeaf, bool IsAdd) {
+ int16_t Idx = -1;
+ if (!IsAdd) {
+ Idx = getFMAOpIdxInfo(Instr.getOpcode());
+ if (Idx < 0)
+ return false;
+ } else if (Instr.getOpcode() !=
+ FMAOpIdxInfo[getFMAOpIdxInfo(Root.getOpcode())]
+ [InfoArrayIdxFAddInst])
+ return false;
+
+    // Even if the opcode is reassociable, the instruction's fast-math flags
+    // may prohibit reassociation.
+ if (!(Instr.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Instr.getFlag(MachineInstr::MIFlag::FmNsz)))
+ return false;
+
+ // Instruction operands are virtual registers for reassociation.
+ if (!IsAllOpsVirtualReg(Instr))
+ return false;
+
+ if (IsAdd && IsLeaf)
+ return true;
+
+ AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx];
+
+ const MachineOperand &OpAdd = Instr.getOperand(AddOpIdx);
+ MachineInstr *MIAdd = MRI.getUniqueVRegDef(OpAdd.getReg());
+    // If the 'add' operand's def is not in the current block, don't do this
+    // ILP-related optimization.
+ if (!MIAdd || MIAdd->getParent() != MBB)
+ return false;
+
+    // If this is not the leaf FMA instruction, its 'add' operand must have only
+    // one use, since this FMA will be rewritten later.
+ return IsLeaf ? true : MRI.hasOneNonDBGUse(OpAdd.getReg());
+ };
+
+ int16_t AddOpIdx = -1;
+ // Root must be a valid FMA like instruction.
+ if (!IsReassociable(Root, AddOpIdx, false, false))
+ return false;
+
+ assert((AddOpIdx >= 0) && "add operand index not right!");
+
+ Register RegB = Root.getOperand(AddOpIdx).getReg();
+ MachineInstr *Prev = MRI.getUniqueVRegDef(RegB);
+
+ // Prev must be a valid FMA like instruction.
+ AddOpIdx = -1;
+ if (!IsReassociable(*Prev, AddOpIdx, false, false))
+ return false;
+
+ assert((AddOpIdx >= 0) && "add operand index not right!");
+
+ Register RegA = Prev->getOperand(AddOpIdx).getReg();
+ MachineInstr *Leaf = MRI.getUniqueVRegDef(RegA);
+ AddOpIdx = -1;
+ if (IsReassociable(*Leaf, AddOpIdx, true, false)) {
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XMM_AMM_BMM);
+ return true;
+ }
+ if (IsReassociable(*Leaf, AddOpIdx, true, true)) {
+ Patterns.push_back(MachineCombinerPattern::REASSOC_XY_AMM_BMM);
+ return true;
+ }
+ return false;
+}
+
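
A scalar sketch of what the two rewrites buy: in the original chain every FMA waits for the previous result, while the reassociated form produces two independent FMAs whose results are added at the end (plain C++ with std::fma standing in for the PPC FMA opcodes; the values are chosen so both orders agree exactly):

#include <cmath>
#include <cstdio>

int main() {
  double x = 1.0, y = 2.0, m21 = 3.0, m22 = 4.0, m31 = 5.0, m32 = 6.0;

  // Pattern 1, original chain: each FMA depends on the one before it.
  double A = x + y;
  double B = std::fma(m21, m22, A); // A + m21*m22
  double C = std::fma(m31, m32, B); // B + m31*m32

  // Reassociated: the two FMAs have no dependency on each other.
  double A2 = std::fma(m21, m22, x); // x + m21*m22
  double B2 = std::fma(m31, m32, y); // y + m31*m32
  double C2 = A2 + B2;

  std::printf("%g %g\n", C, C2); // 45 45 (equal here; may differ by rounding)
}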
bool PPCInstrInfo::getMachineCombinerPatterns(
MachineInstr &Root,
SmallVectorImpl<MachineCombinerPattern> &Patterns) const {
@@ -272,16 +426,201 @@ bool PPCInstrInfo::getMachineCombinerPatterns(
if (Subtarget.getTargetMachine().getOptLevel() != CodeGenOpt::Aggressive)
return false;
- // FP reassociation is only legal when we don't need strict IEEE semantics.
- if (!Root.getParent()->getParent()->getTarget().Options.UnsafeFPMath)
- return false;
+ if (getFMAPatterns(Root, Patterns))
+ return true;
return TargetInstrInfo::getMachineCombinerPatterns(Root, Patterns);
}
+void PPCInstrInfo::genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ switch (Pattern) {
+ case MachineCombinerPattern::REASSOC_XY_AMM_BMM:
+ case MachineCombinerPattern::REASSOC_XMM_AMM_BMM:
+ reassociateFMA(Root, Pattern, InsInstrs, DelInstrs, InstrIdxForVirtReg);
+ break;
+ default:
+ // Reassociate default patterns.
+ TargetInstrInfo::genAlternativeCodeSequence(Root, Pattern, InsInstrs,
+ DelInstrs, InstrIdxForVirtReg);
+ break;
+ }
+}
+
+// Currently, only handle two patterns REASSOC_XY_AMM_BMM and
+// REASSOC_XMM_AMM_BMM. See comments for getFMAPatterns.
+void PPCInstrInfo::reassociateFMA(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const {
+ MachineFunction *MF = Root.getMF();
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineOperand &OpC = Root.getOperand(0);
+ Register RegC = OpC.getReg();
+ const TargetRegisterClass *RC = MRI.getRegClass(RegC);
+ MRI.constrainRegClass(RegC, RC);
+
+ unsigned FmaOp = Root.getOpcode();
+ int16_t Idx = getFMAOpIdxInfo(FmaOp);
+ assert(Idx >= 0 && "Root must be a FMA instruction");
+
+ uint16_t AddOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxAddOpIdx];
+ uint16_t FirstMulOpIdx = FMAOpIdxInfo[Idx][InfoArrayIdxMULOpIdx];
+ MachineInstr *Prev = MRI.getUniqueVRegDef(Root.getOperand(AddOpIdx).getReg());
+ MachineInstr *Leaf =
+ MRI.getUniqueVRegDef(Prev->getOperand(AddOpIdx).getReg());
+ uint16_t IntersectedFlags =
+ Root.getFlags() & Prev->getFlags() & Leaf->getFlags();
+
+ auto GetOperandInfo = [&](const MachineOperand &Operand, Register &Reg,
+ bool &KillFlag) {
+ Reg = Operand.getReg();
+ MRI.constrainRegClass(Reg, RC);
+ KillFlag = Operand.isKill();
+ };
+
+ auto GetFMAInstrInfo = [&](const MachineInstr &Instr, Register &MulOp1,
+ Register &MulOp2, bool &MulOp1KillFlag,
+ bool &MulOp2KillFlag) {
+ GetOperandInfo(Instr.getOperand(FirstMulOpIdx), MulOp1, MulOp1KillFlag);
+ GetOperandInfo(Instr.getOperand(FirstMulOpIdx + 1), MulOp2, MulOp2KillFlag);
+ };
+
+ Register RegM11, RegM12, RegX, RegY, RegM21, RegM22, RegM31, RegM32;
+ bool KillX = false, KillY = false, KillM11 = false, KillM12 = false,
+ KillM21 = false, KillM22 = false, KillM31 = false, KillM32 = false;
+
+ GetFMAInstrInfo(Root, RegM31, RegM32, KillM31, KillM32);
+ GetFMAInstrInfo(*Prev, RegM21, RegM22, KillM21, KillM22);
+
+ if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) {
+ GetFMAInstrInfo(*Leaf, RegM11, RegM12, KillM11, KillM12);
+ GetOperandInfo(Leaf->getOperand(AddOpIdx), RegX, KillX);
+ } else if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) {
+ GetOperandInfo(Leaf->getOperand(1), RegX, KillX);
+ GetOperandInfo(Leaf->getOperand(2), RegY, KillY);
+ }
+
+ // Create new virtual registers for the new results instead of
+ // recycling legacy ones because the MachineCombiner's computation of the
+ // critical path requires a new register definition rather than an existing
+ // one.
+ Register NewVRA = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVRA, 0));
+
+ Register NewVRB = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVRB, 1));
+
+ Register NewVRD = 0;
+ if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) {
+ NewVRD = MRI.createVirtualRegister(RC);
+ InstrIdxForVirtReg.insert(std::make_pair(NewVRD, 2));
+ }
+
+ auto AdjustOperandOrder = [&](MachineInstr *MI, Register RegAdd, bool KillAdd,
+ Register RegMul1, bool KillRegMul1,
+ Register RegMul2, bool KillRegMul2) {
+ MI->getOperand(AddOpIdx).setReg(RegAdd);
+ MI->getOperand(AddOpIdx).setIsKill(KillAdd);
+ MI->getOperand(FirstMulOpIdx).setReg(RegMul1);
+ MI->getOperand(FirstMulOpIdx).setIsKill(KillRegMul1);
+ MI->getOperand(FirstMulOpIdx + 1).setReg(RegMul2);
+ MI->getOperand(FirstMulOpIdx + 1).setIsKill(KillRegMul2);
+ };
+
+ if (Pattern == MachineCombinerPattern::REASSOC_XY_AMM_BMM) {
+ // Create new instructions for insertion.
+ MachineInstrBuilder MINewB =
+ BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB)
+ .addReg(RegX, getKillRegState(KillX))
+ .addReg(RegM21, getKillRegState(KillM21))
+ .addReg(RegM22, getKillRegState(KillM22));
+ MachineInstrBuilder MINewA =
+ BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRA)
+ .addReg(RegY, getKillRegState(KillY))
+ .addReg(RegM31, getKillRegState(KillM31))
+ .addReg(RegM32, getKillRegState(KillM32));
+ // If AddOpIdx is not 1, adjust the order.
+ if (AddOpIdx != 1) {
+ AdjustOperandOrder(MINewB, RegX, KillX, RegM21, KillM21, RegM22, KillM22);
+ AdjustOperandOrder(MINewA, RegY, KillY, RegM31, KillM31, RegM32, KillM32);
+ }
+
+ MachineInstrBuilder MINewC =
+ BuildMI(*MF, Root.getDebugLoc(),
+ get(FMAOpIdxInfo[Idx][InfoArrayIdxFAddInst]), RegC)
+ .addReg(NewVRB, getKillRegState(true))
+ .addReg(NewVRA, getKillRegState(true));
+
+ // Update flags for newly created instructions.
+ setSpecialOperandAttr(*MINewA, IntersectedFlags);
+ setSpecialOperandAttr(*MINewB, IntersectedFlags);
+ setSpecialOperandAttr(*MINewC, IntersectedFlags);
+
+ // Record new instructions for insertion.
+ InsInstrs.push_back(MINewA);
+ InsInstrs.push_back(MINewB);
+ InsInstrs.push_back(MINewC);
+ } else if (Pattern == MachineCombinerPattern::REASSOC_XMM_AMM_BMM) {
+ assert(NewVRD && "new FMA register not created!");
+ // Create new instructions for insertion.
+ MachineInstrBuilder MINewA =
+ BuildMI(*MF, Leaf->getDebugLoc(),
+ get(FMAOpIdxInfo[Idx][InfoArrayIdxFMULInst]), NewVRA)
+ .addReg(RegM11, getKillRegState(KillM11))
+ .addReg(RegM12, getKillRegState(KillM12));
+ MachineInstrBuilder MINewB =
+ BuildMI(*MF, Prev->getDebugLoc(), get(FmaOp), NewVRB)
+ .addReg(RegX, getKillRegState(KillX))
+ .addReg(RegM21, getKillRegState(KillM21))
+ .addReg(RegM22, getKillRegState(KillM22));
+ MachineInstrBuilder MINewD =
+ BuildMI(*MF, Root.getDebugLoc(), get(FmaOp), NewVRD)
+ .addReg(NewVRA, getKillRegState(true))
+ .addReg(RegM31, getKillRegState(KillM31))
+ .addReg(RegM32, getKillRegState(KillM32));
+ // If AddOpIdx is not 1, adjust the order.
+ if (AddOpIdx != 1) {
+ AdjustOperandOrder(MINewB, RegX, KillX, RegM21, KillM21, RegM22, KillM22);
+ AdjustOperandOrder(MINewD, NewVRA, true, RegM31, KillM31, RegM32,
+ KillM32);
+ }
+
+ MachineInstrBuilder MINewC =
+ BuildMI(*MF, Root.getDebugLoc(),
+ get(FMAOpIdxInfo[Idx][InfoArrayIdxFAddInst]), RegC)
+ .addReg(NewVRB, getKillRegState(true))
+ .addReg(NewVRD, getKillRegState(true));
+
+ // Update flags for newly created instructions.
+ setSpecialOperandAttr(*MINewA, IntersectedFlags);
+ setSpecialOperandAttr(*MINewB, IntersectedFlags);
+ setSpecialOperandAttr(*MINewD, IntersectedFlags);
+ setSpecialOperandAttr(*MINewC, IntersectedFlags);
+
+ // Record new instructions for insertion.
+ InsInstrs.push_back(MINewA);
+ InsInstrs.push_back(MINewB);
+ InsInstrs.push_back(MINewD);
+ InsInstrs.push_back(MINewC);
+ }
+
+ assert(!InsInstrs.empty() &&
+ "Insertion instructions set should not be empty!");
+
+ // Record old instructions for deletion.
+ DelInstrs.push_back(Leaf);
+ DelInstrs.push_back(Prev);
+ DelInstrs.push_back(&Root);
+}
+
// Detect 32 -> 64-bit extensions where we may reuse the low sub-register.
bool PPCInstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
+ Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default: return false;
@@ -753,9 +1092,10 @@ unsigned PPCInstrInfo::insertBranch(MachineBasicBlock &MBB,
// Select analysis.
bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
- ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+ ArrayRef<MachineOperand> Cond,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
if (Cond.size() != 2)
return false;
@@ -791,9 +1131,9 @@ bool PPCInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const DebugLoc &dl, unsigned DestReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const {
+ const DebugLoc &dl, Register DestReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const {
assert(Cond.size() == 2 &&
"PPC branch conditions have two components!");
@@ -852,7 +1192,7 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
case PPC::PRED_BIT_UNSET: SubIdx = 0; SwapOps = true; break;
}
- unsigned FirstReg = SwapOps ? FalseReg : TrueReg,
+ Register FirstReg = SwapOps ? FalseReg : TrueReg,
SecondReg = SwapOps ? TrueReg : FalseReg;
// The first input register of isel cannot be r0. If it is a member
@@ -863,7 +1203,7 @@ void PPCInstrInfo::insertSelect(MachineBasicBlock &MBB,
const TargetRegisterClass *FirstRC =
MRI.getRegClass(FirstReg)->contains(PPC::X0) ?
&PPC::G8RC_NOX0RegClass : &PPC::GPRC_NOR0RegClass;
- unsigned OldFirstReg = FirstReg;
+ Register OldFirstReg = FirstReg;
FirstReg = MRI.createVirtualRegister(FirstRC);
BuildMI(MBB, MI, dl, get(TargetOpcode::COPY), FirstReg)
.addReg(OldFirstReg);
@@ -1024,183 +1364,66 @@ void PPCInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
BuildMI(MBB, I, DL, MCID, DestReg).addReg(SrcReg, getKillRegState(KillSrc));
}
-unsigned PPCInstrInfo::getStoreOpcodeForSpill(unsigned Reg,
- const TargetRegisterClass *RC)
- const {
- const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
+static unsigned getSpillIndex(const TargetRegisterClass *RC) {
int OpcodeIndex = 0;
- if (RC != nullptr) {
- if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
- PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Int4Spill;
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
- PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Int8Spill;
- } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Float8Spill;
- } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Float4Spill;
- } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_SPESpill;
- } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_CRSpill;
- } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_CRBitSpill;
- } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VRVectorSpill;
- } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VSXVectorSpill;
- } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VectorFloat8Spill;
- } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VectorFloat4Spill;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VRSaveSpill;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadFloat8Spill;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadFloat4Spill;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadBitSpill;
- } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_SpillToVSR;
- } else {
- llvm_unreachable("Unknown regclass!");
- }
+ if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
+ PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int4Spill;
+ } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
+ PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Int8Spill;
+ } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float8Spill;
+ } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_Float4Spill;
+ } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SPESpill;
+ } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRSpill;
+ } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_CRBitSpill;
+ } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRVectorSpill;
+ } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VSXVectorSpill;
+ } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat8Spill;
+ } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VectorFloat4Spill;
+ } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_VRSaveSpill;
+ } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat8Spill;
+ } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadFloat4Spill;
+ } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_QuadBitSpill;
+ } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
+ OpcodeIndex = SOK_SpillToVSR;
} else {
- if (PPC::GPRCRegClass.contains(Reg) ||
- PPC::GPRC_NOR0RegClass.contains(Reg)) {
- OpcodeIndex = SOK_Int4Spill;
- } else if (PPC::G8RCRegClass.contains(Reg) ||
- PPC::G8RC_NOX0RegClass.contains(Reg)) {
- OpcodeIndex = SOK_Int8Spill;
- } else if (PPC::F8RCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_Float8Spill;
- } else if (PPC::F4RCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_Float4Spill;
- } else if (PPC::SPERCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_SPESpill;
- } else if (PPC::CRRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_CRSpill;
- } else if (PPC::CRBITRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_CRBitSpill;
- } else if (PPC::VRRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VRVectorSpill;
- } else if (PPC::VSRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VSXVectorSpill;
- } else if (PPC::VSFRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VectorFloat8Spill;
- } else if (PPC::VSSRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VectorFloat4Spill;
- } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VRSaveSpill;
- } else if (PPC::QFRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_QuadFloat8Spill;
- } else if (PPC::QSRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_QuadFloat4Spill;
- } else if (PPC::QBRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_QuadBitSpill;
- } else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_SpillToVSR;
- } else {
- llvm_unreachable("Unknown regclass!");
- }
+ llvm_unreachable("Unknown regclass!");
}
- return OpcodesForSpill[OpcodeIndex];
+ return OpcodeIndex;
}
unsigned
-PPCInstrInfo::getLoadOpcodeForSpill(unsigned Reg,
- const TargetRegisterClass *RC) const {
- const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
- int OpcodeIndex = 0;
+PPCInstrInfo::getStoreOpcodeForSpill(const TargetRegisterClass *RC) const {
+ const unsigned *OpcodesForSpill = getStoreOpcodesForSpillArray();
+ return OpcodesForSpill[getSpillIndex(RC)];
+}
- if (RC != nullptr) {
- if (PPC::GPRCRegClass.hasSubClassEq(RC) ||
- PPC::GPRC_NOR0RegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Int4Spill;
- } else if (PPC::G8RCRegClass.hasSubClassEq(RC) ||
- PPC::G8RC_NOX0RegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Int8Spill;
- } else if (PPC::F8RCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Float8Spill;
- } else if (PPC::F4RCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_Float4Spill;
- } else if (PPC::SPERCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_SPESpill;
- } else if (PPC::CRRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_CRSpill;
- } else if (PPC::CRBITRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_CRBitSpill;
- } else if (PPC::VRRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VRVectorSpill;
- } else if (PPC::VSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VSXVectorSpill;
- } else if (PPC::VSFRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VectorFloat8Spill;
- } else if (PPC::VSSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VectorFloat4Spill;
- } else if (PPC::VRSAVERCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_VRSaveSpill;
- } else if (PPC::QFRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadFloat8Spill;
- } else if (PPC::QSRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadFloat4Spill;
- } else if (PPC::QBRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_QuadBitSpill;
- } else if (PPC::SPILLTOVSRRCRegClass.hasSubClassEq(RC)) {
- OpcodeIndex = SOK_SpillToVSR;
- } else {
- llvm_unreachable("Unknown regclass!");
- }
- } else {
- if (PPC::GPRCRegClass.contains(Reg) ||
- PPC::GPRC_NOR0RegClass.contains(Reg)) {
- OpcodeIndex = SOK_Int4Spill;
- } else if (PPC::G8RCRegClass.contains(Reg) ||
- PPC::G8RC_NOX0RegClass.contains(Reg)) {
- OpcodeIndex = SOK_Int8Spill;
- } else if (PPC::F8RCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_Float8Spill;
- } else if (PPC::F4RCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_Float4Spill;
- } else if (PPC::SPERCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_SPESpill;
- } else if (PPC::CRRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_CRSpill;
- } else if (PPC::CRBITRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_CRBitSpill;
- } else if (PPC::VRRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VRVectorSpill;
- } else if (PPC::VSRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VSXVectorSpill;
- } else if (PPC::VSFRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VectorFloat8Spill;
- } else if (PPC::VSSRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VectorFloat4Spill;
- } else if (PPC::VRSAVERCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_VRSaveSpill;
- } else if (PPC::QFRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_QuadFloat8Spill;
- } else if (PPC::QSRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_QuadFloat4Spill;
- } else if (PPC::QBRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_QuadBitSpill;
- } else if (PPC::SPILLTOVSRRCRegClass.contains(Reg)) {
- OpcodeIndex = SOK_SpillToVSR;
- } else {
- llvm_unreachable("Unknown regclass!");
- }
- }
- return OpcodesForSpill[OpcodeIndex];
+unsigned
+PPCInstrInfo::getLoadOpcodeForSpill(const TargetRegisterClass *RC) const {
+ const unsigned *OpcodesForSpill = getLoadOpcodesForSpillArray();
+ return OpcodesForSpill[getSpillIndex(RC)];
}
void PPCInstrInfo::StoreRegToStackSlot(
MachineFunction &MF, unsigned SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr *> &NewMIs) const {
- unsigned Opcode = getStoreOpcodeForSpill(PPC::NoRegister, RC);
+ unsigned Opcode = getStoreOpcodeForSpill(RC);
DebugLoc DL;
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -1221,24 +1444,13 @@ void PPCInstrInfo::StoreRegToStackSlot(
FuncInfo->setHasNonRISpills();
}
-void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill,
- int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void PPCInstrInfo::storeRegToStackSlotNoUpd(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned SrcReg,
+ bool isKill, int FrameIdx, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
SmallVector<MachineInstr *, 4> NewMIs;
- // We need to avoid a situation in which the value from a VRRC register is
- // spilled using an Altivec instruction and reloaded into a VSRC register
- // using a VSX instruction. The issue with this is that the VSX
- // load/store instructions swap the doublewords in the vector and the Altivec
- // ones don't. The register classes on the spill/reload may be different if
- // the register is defined using an Altivec instruction and is then used by a
- // VSX instruction.
- RC = updatedRC(RC);
-
StoreRegToStackSlot(MF, SrcReg, isKill, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1248,16 +1460,33 @@ void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdx),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MFI.getObjectAlign(FrameIdx));
NewMIs.back()->addMemOperand(MF, MMO);
}
+void PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register SrcReg, bool isKill,
+ int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ // We need to avoid a situation in which the value from a VRRC register is
+ // spilled using an Altivec instruction and reloaded into a VSRC register
+ // using a VSX instruction. The issue with this is that the VSX
+ // load/store instructions swap the doublewords in the vector and the Altivec
+ // ones don't. The register classes on the spill/reload may be different if
+ // the register is defined using an Altivec instruction and is then used by a
+ // VSX instruction.
+ RC = updatedRC(RC);
+ storeRegToStackSlotNoUpd(MBB, MI, SrcReg, isKill, FrameIdx, RC, TRI);
+}
+
void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
unsigned DestReg, int FrameIdx,
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr *> &NewMIs)
const {
- unsigned Opcode = getLoadOpcodeForSpill(PPC::NoRegister, RC);
+ unsigned Opcode = getLoadOpcodeForSpill(RC);
NewMIs.push_back(addFrameReference(BuildMI(MF, DL, get(Opcode), DestReg),
FrameIdx));
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
@@ -1273,12 +1502,10 @@ void PPCInstrInfo::LoadRegFromStackSlot(MachineFunction &MF, const DebugLoc &DL,
FuncInfo->setHasNonRISpills();
}
-void
-PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIdx,
- const TargetRegisterClass *RC,
- const TargetRegisterInfo *TRI) const {
+void PPCInstrInfo::loadRegFromStackSlotNoUpd(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, unsigned DestReg,
+ int FrameIdx, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
MachineFunction &MF = *MBB.getParent();
SmallVector<MachineInstr*, 4> NewMIs;
DebugLoc DL;
@@ -1287,16 +1514,6 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
PPCFunctionInfo *FuncInfo = MF.getInfo<PPCFunctionInfo>();
FuncInfo->setHasSpills();
- // We need to avoid a situation in which the value from a VRRC register is
- // spilled using an Altivec instruction and reloaded into a VSRC register
- // using a VSX instruction. The issue with this is that the VSX
- // load/store instructions swap the doublewords in the vector and the Altivec
- // ones don't. The register classes on the spill/reload may be different if
- // the register is defined using an Altivec instruction and is then used by a
- // VSX instruction.
- if (Subtarget.hasVSX() && RC == &PPC::VRRCRegClass)
- RC = &PPC::VSRCRegClass;
-
LoadRegFromStackSlot(MF, DL, DestReg, FrameIdx, RC, NewMIs);
for (unsigned i = 0, e = NewMIs.size(); i != e; ++i)
@@ -1306,10 +1523,27 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FrameIdx),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx),
- MFI.getObjectAlignment(FrameIdx));
+ MFI.getObjectAlign(FrameIdx));
NewMIs.back()->addMemOperand(MF, MMO);
}
+void PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ Register DestReg, int FrameIdx,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ // We need to avoid a situation in which the value from a VRRC register is
+ // spilled using an Altivec instruction and reloaded into a VSRC register
+ // using a VSX instruction. The issue with this is that the VSX
+ // load/store instructions swap the doublewords in the vector and the Altivec
+ // ones don't. The register classes on the spill/reload may be different if
+ // the register is defined using an Altivec instruction and is then used by a
+ // VSX instruction.
+ RC = updatedRC(RC);
+
+ loadRegFromStackSlotNoUpd(MBB, MI, DestReg, FrameIdx, RC, TRI);
+}
+
bool PPCInstrInfo::
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
assert(Cond.size() == 2 && "Invalid PPC branch opcode!");
@@ -1321,9 +1555,11 @@ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const {
return false;
}
-bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg, MachineRegisterInfo *MRI) const {
- // For some instructions, it is legal to fold ZERO into the RA register field.
+// For some instructions, it is legal to fold ZERO into the RA register field.
+// This function performs that fold by replacing the operand with PPC::ZERO;
+// it does not check whether the load-immediate of zero is still in use.
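+// Illustrative example (pseudo-assembly):
+//   li   rZ, 0
+//   lwzx rD, rZ, rB     ; rZ occupies the RA field
+// After the fold, the RA field holds the special zero register, so the later
+// li may become dead.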
+bool PPCInstrInfo::onlyFoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ Register Reg) const {
// A zero immediate should always be loaded with a single li.
unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != PPC::LI && DefOpc != PPC::LI8)
@@ -1343,6 +1579,8 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (UseMCID.isPseudo())
return false;
+  // We need to find which of the user's operands is to be folded; that is the
+  // operand that matches the given register.
unsigned UseIdx;
for (UseIdx = 0; UseIdx < UseMI.getNumOperands(); ++UseIdx)
if (UseMI.getOperand(UseIdx).isReg() &&
@@ -1371,7 +1609,7 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
if (UseInfo->Constraints != 0)
return false;
- unsigned ZeroReg;
+ MCRegister ZeroReg;
if (UseInfo->isLookupPtrRegClass()) {
bool isPPC64 = Subtarget.isPPC64();
ZeroReg = isPPC64 ? PPC::ZERO8 : PPC::ZERO;
@@ -1380,13 +1618,19 @@ bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
PPC::ZERO8 : PPC::ZERO;
}
- bool DeleteDef = MRI->hasOneNonDBGUse(Reg);
UseMI.getOperand(UseIdx).setReg(ZeroReg);
+ return true;
+}
- if (DeleteDef)
+// Folds a zero loaded by LI into instructions that use it as an operand and
+// that treat the zero register as an immediate zero. If the defining
+// load-immediate has no remaining users, it is deleted.
+bool PPCInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ Register Reg, MachineRegisterInfo *MRI) const {
+ bool Changed = onlyFoldImmediate(UseMI, DefMI, Reg);
+ if (MRI->use_nodbg_empty(Reg))
DefMI.eraseFromParent();
-
- return true;
+ return Changed;
}
static bool MBBDefinesCTR(MachineBasicBlock &MBB) {
@@ -1423,17 +1667,6 @@ bool PPCInstrInfo::isPredicated(const MachineInstr &MI) const {
return false;
}
-bool PPCInstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
- if (!MI.isTerminator())
- return false;
-
- // Conditional branch is a special case.
- if (MI.isBranch() && !MI.isBarrier())
- return true;
-
- return !isPredicated(MI);
-}
-
bool PPCInstrInfo::PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const {
unsigned OpC = MI.getOpcode();
@@ -1587,8 +1820,8 @@ bool PPCInstrInfo::DefinesPredicate(MachineInstr &MI,
return Found;
}
-bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask,
+bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &Mask,
int &Value) const {
unsigned Opc = MI.getOpcode();
@@ -1617,8 +1850,8 @@ bool PPCInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
}
}
-bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int Mask, int Value,
+bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const {
if (DisableCmpOpt)
return false;
@@ -1646,8 +1879,8 @@ bool PPCInstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
bool is64BitUnsignedCompare = OpC == PPC::CMPLDI || OpC == PPC::CMPLD;
// Look through copies unless that gets us to a physical register.
- unsigned ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI);
- if (Register::isVirtualRegister(ActualSrc))
+ Register ActualSrc = TRI->lookThruCopyLike(SrcReg, MRI);
+ if (ActualSrc.isVirtual())
SrcReg = ActualSrc;
// Get the unique definition of SrcReg.
@@ -2036,8 +2269,8 @@ PPCInstrInfo::getSerializableBitmaskMachineOperandTargetFlags() const {
static const std::pair<unsigned, const char *> TargetFlags[] = {
{MO_PLT, "ppc-plt"},
{MO_PIC_FLAG, "ppc-pic"},
- {MO_NLP_FLAG, "ppc-nlp"},
- {MO_NLP_HIDDEN_FLAG, "ppc-nlp-hidden"}};
+ {MO_PCREL_FLAG, "ppc-pcrel"},
+ {MO_GOT_FLAG, "ppc-got"}};
return makeArrayRef(TargetFlags);
}
@@ -2330,7 +2563,8 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
const TargetRegisterInfo *TRI = &getRegisterInfo();
// If we're in SSA, get the defs through the MRI. Otherwise, only look
- // within the basic block to see if the register is defined using an LI/LI8.
+ // within the basic block to see if the register is defined using an
+ // LI/LI8/ADDI/ADDI8.
if (MRI->isSSA()) {
for (int i = 1, e = MI.getNumOperands(); i < e; i++) {
if (!MI.getOperand(i).isReg())
@@ -2341,9 +2575,16 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
unsigned TrueReg = TRI->lookThruCopyLike(Reg, MRI);
if (Register::isVirtualRegister(TrueReg)) {
DefMI = MRI->getVRegDef(TrueReg);
- if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8) {
+ if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8 ||
+ DefMI->getOpcode() == PPC::ADDI ||
+ DefMI->getOpcode() == PPC::ADDI8) {
OpNoForForwarding = i;
- break;
+          // Operands fed by ADDI and LI may both exist in one instruction.
+          // We prefer to fold the LI operand, since LI has only one immediate
+          // operand and is more likely to be convertible. So if the current
+          // DefMI is ADDI/ADDI8, keep looking for a possible LI/LI8.
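+          // Illustrative example: in
+          //   %a = ADDI %base, 8
+          //   %b = LI 4
+          //   %y = ADD %a, %b
+          // both operands of the ADD have candidate definitions; the LI
+          // feeding %b is the one we prefer to forward.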
+ if (DefMI->getOpcode() == PPC::LI || DefMI->getOpcode() == PPC::LI8)
+ break;
}
}
}
@@ -2400,44 +2641,20 @@ MachineInstr *PPCInstrInfo::getForwardingDefMI(
return OpNoForForwarding == ~0U ? nullptr : DefMI;
}
+unsigned PPCInstrInfo::getSpillTarget() const {
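+  // Index 1 selects the Power9 spill opcode tables; index 0 selects the
+  // Power8 ones (see the Pwr8/Pwr9 load/store opcode lists in PPCInstrInfo.h).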
+ return Subtarget.hasP9Vector() ? 1 : 0;
+}
+
const unsigned *PPCInstrInfo::getStoreOpcodesForSpillArray() const {
- static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
- // Power 8
- {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
- PPC::SPILL_CRBIT, PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX,
- PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
- PPC::SPILLTOVSR_ST, PPC::EVSTDD},
- // Power 9
- {PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR,
- PPC::SPILL_CRBIT, PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32,
- PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb,
- PPC::SPILLTOVSR_ST}};
-
- return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
+ return StoreSpillOpcodesArray[getSpillTarget()];
}
const unsigned *PPCInstrInfo::getLoadOpcodesForSpillArray() const {
- static const unsigned OpcodesForSpill[2][SOK_LastOpcodeSpill] = {
- // Power 8
- {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
- PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX,
- PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
- PPC::SPILLTOVSR_LD, PPC::EVLDD},
- // Power 9
- {PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR,
- PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, PPC::DFLOADf32,
- PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb,
- PPC::SPILLTOVSR_LD}};
-
- return OpcodesForSpill[(Subtarget.hasP9Vector()) ? 1 : 0];
+ return LoadSpillOpcodesArray[getSpillTarget()];
}
void PPCInstrInfo::fixupIsDeadOrKill(MachineInstr &StartMI, MachineInstr &EndMI,
unsigned RegNo) const {
- const MachineRegisterInfo &MRI =
- StartMI.getParent()->getParent()->getRegInfo();
- if (MRI.isSSA())
- return;
// Instructions between [StartMI, EndMI] should be in same basic block.
assert((StartMI.getParent() == EndMI.getParent()) &&
@@ -2588,6 +2805,13 @@ bool PPCInstrInfo::foldFrameOffset(MachineInstr &MI) const {
return true;
return false;
};
+
+  // We are trying to replace the operand at ImmOpNo with ScaleReg. Give up if
+  // ScaleReg is R0/X0, since that operand position treats it as a special
+  // zero.
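+  // (R0/X0 in such a position reads as the constant 0 rather than the
+  // register's contents, so the fold would change the computed address.)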
+ if (III.ZeroIsSpecialOrig == III.ImmOpNo &&
+ (ScaleReg == PPC::R0 || ScaleReg == PPC::X0))
+ return false;
+
// Make sure no other def for ToBeChangedReg and ScaleReg between ADD Instr
// and Imm Instr.
if (NewDefFor(ToBeChangedReg, *ADDMI, MI) || NewDefFor(ScaleReg, *ADDMI, MI))
@@ -2631,6 +2855,10 @@ bool PPCInstrInfo::isADDIInstrEligibleForFolding(MachineInstr &ADDIMI,
if (Opc != PPC::ADDI && Opc != PPC::ADDI8)
return false;
+ // The operand may not necessarily be an immediate - it could be a relocation.
+ if (!ADDIMI.getOperand(2).isImm())
+ return false;
+
Imm = ADDIMI.getOperand(2).getImm();
return true;
@@ -2746,10 +2974,16 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
"The forwarding operand needs to be valid at this point");
bool IsForwardingOperandKilled = MI.getOperand(ForwardingOperand).isKill();
bool KillFwdDefMI = !SeenIntermediateUse && IsForwardingOperandKilled;
- Register ForwardingOperandReg = MI.getOperand(ForwardingOperand).getReg();
if (KilledDef && KillFwdDefMI)
*KilledDef = DefMI;
+  // If this is an imm-form instruction and its register operand is produced
+  // by an ADDI, fold the ADDI's immediate into this instruction directly.
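+  // Illustrative example (pseudo-assembly):
+  //   addi rA, rB, 8
+  //   lwz  rD, 16(rA)
+  // can be rewritten as:
+  //   lwz  rD, 24(rB)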
+ if (RI.getMappedIdxOpcForImmOpc(MI.getOpcode()) !=
+ PPC::INSTRUCTION_LIST_END &&
+ transformToNewImmFormFedByAdd(MI, *DefMI, ForwardingOperand))
+ return true;
+
ImmInstrInfo III;
bool IsVFReg = MI.getOperand(0).isReg()
? isVFRegister(MI.getOperand(0).getReg())
@@ -2763,228 +2997,17 @@ bool PPCInstrInfo::convertToImmediateForm(MachineInstr &MI,
KillFwdDefMI))
return true;
- if ((DefMI->getOpcode() != PPC::LI && DefMI->getOpcode() != PPC::LI8) ||
- !DefMI->getOperand(1).isImm())
- return false;
-
- int64_t Immediate = DefMI->getOperand(1).getImm();
- // Sign-extend to 64-bits.
- int64_t SExtImm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
- (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
-
// If this is a reg+reg instruction that has a reg+imm form,
// and one of the operands is produced by LI, convert it now.
- if (HasImmForm)
- return transformToImmFormFedByLI(MI, III, ForwardingOperand, *DefMI, SExtImm);
-
- bool ReplaceWithLI = false;
- bool Is64BitLI = false;
- int64_t NewImm = 0;
- bool SetCR = false;
- unsigned Opc = MI.getOpcode();
- switch (Opc) {
- default: return false;
-
- // FIXME: Any branches conditional on such a comparison can be made
- // unconditional. At this time, this happens too infrequently to be worth
- // the implementation effort, but if that ever changes, we could convert
- // such a pattern here.
- case PPC::CMPWI:
- case PPC::CMPLWI:
- case PPC::CMPDI:
- case PPC::CMPLDI: {
- // Doing this post-RA would require dataflow analysis to reliably find uses
- // of the CR register set by the compare.
- // No need to fixup killed/dead flag since this transformation is only valid
- // before RA.
- if (PostRA)
- return false;
- // If a compare-immediate is fed by an immediate and is itself an input of
- // an ISEL (the most common case) into a COPY of the correct register.
- bool Changed = false;
- Register DefReg = MI.getOperand(0).getReg();
- int64_t Comparand = MI.getOperand(2).getImm();
- int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0 ?
- (Comparand | 0xFFFFFFFFFFFF0000) : Comparand;
-
- for (auto &CompareUseMI : MRI->use_instructions(DefReg)) {
- unsigned UseOpc = CompareUseMI.getOpcode();
- if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
- continue;
- unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
- Register TrueReg = CompareUseMI.getOperand(1).getReg();
- Register FalseReg = CompareUseMI.getOperand(2).getReg();
- unsigned RegToCopy = selectReg(SExtImm, SExtComparand, Opc, TrueReg,
- FalseReg, CRSubReg);
- if (RegToCopy == PPC::NoRegister)
- continue;
- // Can't use PPC::COPY to copy PPC::ZERO[8]. Convert it to LI[8] 0.
- if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) {
- CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI));
- replaceInstrOperandWithImm(CompareUseMI, 1, 0);
- CompareUseMI.RemoveOperand(3);
- CompareUseMI.RemoveOperand(2);
- continue;
- }
- LLVM_DEBUG(
- dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
- LLVM_DEBUG(DefMI->dump(); MI.dump(); CompareUseMI.dump());
- LLVM_DEBUG(dbgs() << "Is converted to:\n");
- // Convert to copy and remove unneeded operands.
- CompareUseMI.setDesc(get(PPC::COPY));
- CompareUseMI.RemoveOperand(3);
- CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
- CmpIselsConverted++;
- Changed = true;
- LLVM_DEBUG(CompareUseMI.dump());
- }
- if (Changed)
- return true;
- // This may end up incremented multiple times since this function is called
- // during a fixed-point transformation, but it is only meant to indicate the
- // presence of this opportunity.
- MissedConvertibleImmediateInstrs++;
- return false;
- }
-
- // Immediate forms - may simply be convertable to an LI.
- case PPC::ADDI:
- case PPC::ADDI8: {
- // Does the sum fit in a 16-bit signed field?
- int64_t Addend = MI.getOperand(2).getImm();
- if (isInt<16>(Addend + SExtImm)) {
- ReplaceWithLI = true;
- Is64BitLI = Opc == PPC::ADDI8;
- NewImm = Addend + SExtImm;
- break;
- }
- return false;
- }
- case PPC::RLDICL:
- case PPC::RLDICL_rec:
- case PPC::RLDICL_32:
- case PPC::RLDICL_32_64: {
- // Use APInt's rotate function.
- int64_t SH = MI.getOperand(2).getImm();
- int64_t MB = MI.getOperand(3).getImm();
- APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec) ? 64 : 32,
- SExtImm, true);
- InVal = InVal.rotl(SH);
- uint64_t Mask = (1LLU << (63 - MB + 1)) - 1;
- InVal &= Mask;
- // Can't replace negative values with an LI as that will sign-extend
- // and not clear the left bits. If we're setting the CR bit, we will use
- // ANDI_rec which won't sign extend, so that's safe.
- if (isUInt<15>(InVal.getSExtValue()) ||
- (Opc == PPC::RLDICL_rec && isUInt<16>(InVal.getSExtValue()))) {
- ReplaceWithLI = true;
- Is64BitLI = Opc != PPC::RLDICL_32;
- NewImm = InVal.getSExtValue();
- SetCR = Opc == PPC::RLDICL_rec;
- break;
- }
- return false;
- }
- case PPC::RLWINM:
- case PPC::RLWINM8:
- case PPC::RLWINM_rec:
- case PPC::RLWINM8_rec: {
- int64_t SH = MI.getOperand(2).getImm();
- int64_t MB = MI.getOperand(3).getImm();
- int64_t ME = MI.getOperand(4).getImm();
- APInt InVal(32, SExtImm, true);
- InVal = InVal.rotl(SH);
- // Set the bits ( MB + 32 ) to ( ME + 32 ).
- uint64_t Mask = ((1LLU << (32 - MB)) - 1) & ~((1LLU << (31 - ME)) - 1);
- InVal &= Mask;
- // Can't replace negative values with an LI as that will sign-extend
- // and not clear the left bits. If we're setting the CR bit, we will use
- // ANDI_rec which won't sign extend, so that's safe.
- bool ValueFits = isUInt<15>(InVal.getSExtValue());
- ValueFits |= ((Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec) &&
- isUInt<16>(InVal.getSExtValue()));
- if (ValueFits) {
- ReplaceWithLI = true;
- Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8_rec;
- NewImm = InVal.getSExtValue();
- SetCR = Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec;
- break;
- }
- return false;
- }
- case PPC::ORI:
- case PPC::ORI8:
- case PPC::XORI:
- case PPC::XORI8: {
- int64_t LogicalImm = MI.getOperand(2).getImm();
- int64_t Result = 0;
- if (Opc == PPC::ORI || Opc == PPC::ORI8)
- Result = LogicalImm | SExtImm;
- else
- Result = LogicalImm ^ SExtImm;
- if (isInt<16>(Result)) {
- ReplaceWithLI = true;
- Is64BitLI = Opc == PPC::ORI8 || Opc == PPC::XORI8;
- NewImm = Result;
- break;
- }
- return false;
- }
- }
-
- if (ReplaceWithLI) {
- // We need to be careful with CR-setting instructions we're replacing.
- if (SetCR) {
- // We don't know anything about uses when we're out of SSA, so only
- // replace if the new immediate will be reproduced.
- bool ImmChanged = (SExtImm & NewImm) != NewImm;
- if (PostRA && ImmChanged)
- return false;
-
- if (!PostRA) {
- // If the defining load-immediate has no other uses, we can just replace
- // the immediate with the new immediate.
- if (MRI->hasOneUse(DefMI->getOperand(0).getReg()))
- DefMI->getOperand(1).setImm(NewImm);
-
- // If we're not using the GPR result of the CR-setting instruction, we
- // just need to and with zero/non-zero depending on the new immediate.
- else if (MRI->use_empty(MI.getOperand(0).getReg())) {
- if (NewImm) {
- assert(Immediate && "Transformation converted zero to non-zero?");
- NewImm = Immediate;
- }
- }
- else if (ImmChanged)
- return false;
- }
- }
-
- LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
- LLVM_DEBUG(MI.dump());
- LLVM_DEBUG(dbgs() << "Fed by:\n");
- LLVM_DEBUG(DefMI->dump());
- LoadImmediateInfo LII;
- LII.Imm = NewImm;
- LII.Is64Bit = Is64BitLI;
- LII.SetCR = SetCR;
- // If we're setting the CR, the original load-immediate must be kept (as an
- // operand to ANDI_rec/ANDI8_rec).
- if (KilledDef && SetCR)
- *KilledDef = nullptr;
- replaceInstrWithLI(MI, LII);
-
- // Fixup killed/dead flag after transformation.
- // Pattern:
- // ForwardingOperandReg = LI imm1
- // y = op2 imm2, ForwardingOperandReg(killed)
- if (IsForwardingOperandKilled)
- fixupIsDeadOrKill(*DefMI, MI, ForwardingOperandReg);
+ if (HasImmForm &&
+ transformToImmFormFedByLI(MI, III, ForwardingOperand, *DefMI))
+ return true;
- LLVM_DEBUG(dbgs() << "With:\n");
- LLVM_DEBUG(MI.dump());
+  // If this is not a reg+reg instruction but the DefMI is an LI/LI8, check
+  // whether its user MI can be simplified to an LI.
+ if (!HasImmForm && simplifyToLI(MI, *DefMI, ForwardingOperand, KilledDef))
return true;
- }
+
return false;
}
@@ -3501,6 +3524,10 @@ bool PPCInstrInfo::isDefMIElgibleForForwarding(MachineInstr &DefMI,
RegMO = &DefMI.getOperand(1);
ImmMO = &DefMI.getOperand(2);
+  // Before RA, the first source operand of the ADDI could be a frame index.
+ if (!RegMO->isReg())
+ return false;
+
// This DefMI is elgible for forwarding if it is:
// 1. add inst
// 2. one of the operands is Imm/CPI/Global.
@@ -3549,7 +3576,8 @@ bool PPCInstrInfo::isRegElgibleForForwarding(
bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
const MachineInstr &DefMI,
const ImmInstrInfo &III,
- int64_t &Imm) const {
+ int64_t &Imm,
+ int64_t BaseImm) const {
assert(isAnImmediateOperand(ImmMO) && "ImmMO is NOT an immediate");
if (DefMI.getOpcode() == PPC::ADDItocL) {
// The operand for ADDItocL is CPI, which isn't imm at compiling time,
@@ -3563,19 +3591,21 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
// not just an immediate but also a multiple of 4, or 16 depending on the
// load. A DForm load cannot be represented if it is a multiple of say 2.
// XForm loads do not have this restriction.
- if (ImmMO.isGlobal() &&
- ImmMO.getGlobal()->getAlignment() < III.ImmMustBeMultipleOf)
- return false;
+ if (ImmMO.isGlobal()) {
+ const DataLayout &DL = ImmMO.getGlobal()->getParent()->getDataLayout();
+ if (ImmMO.getGlobal()->getPointerAlignment(DL) < III.ImmMustBeMultipleOf)
+ return false;
+ }
return true;
}
if (ImmMO.isImm()) {
// It is Imm, we need to check if the Imm fit the range.
- int64_t Immediate = ImmMO.getImm();
// Sign-extend to 64-bits.
- Imm = ((uint64_t)Immediate & ~0x7FFFuLL) != 0 ?
- (Immediate | 0xFFFFFFFFFFFF0000) : Immediate;
+    // DefMI may be folded with another imm-form instruction; the resulting
+    // Imm is the sum of DefMI's immediate and BaseImm, which comes from that
+    // imm-form instruction.
+ Imm = SignExtend64<16>(ImmMO.getImm() + BaseImm);
if (Imm % III.ImmMustBeMultipleOf)
return false;
@@ -3599,6 +3629,328 @@ bool PPCInstrInfo::isImmElgibleForForwarding(const MachineOperand &ImmMO,
return true;
}
+bool PPCInstrInfo::simplifyToLI(MachineInstr &MI, MachineInstr &DefMI,
+ unsigned OpNoForForwarding,
+ MachineInstr **KilledDef) const {
+ if ((DefMI.getOpcode() != PPC::LI && DefMI.getOpcode() != PPC::LI8) ||
+ !DefMI.getOperand(1).isImm())
+ return false;
+
+ MachineFunction *MF = MI.getParent()->getParent();
+ MachineRegisterInfo *MRI = &MF->getRegInfo();
+ bool PostRA = !MRI->isSSA();
+
+ int64_t Immediate = DefMI.getOperand(1).getImm();
+ // Sign-extend to 64-bits.
+ int64_t SExtImm = SignExtend64<16>(Immediate);
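+  // E.g. a 16-bit immediate of 0x8000 becomes 0xFFFFFFFFFFFF8000.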
+
+ bool IsForwardingOperandKilled = MI.getOperand(OpNoForForwarding).isKill();
+ Register ForwardingOperandReg = MI.getOperand(OpNoForForwarding).getReg();
+
+ bool ReplaceWithLI = false;
+ bool Is64BitLI = false;
+ int64_t NewImm = 0;
+ bool SetCR = false;
+ unsigned Opc = MI.getOpcode();
+ switch (Opc) {
+ default:
+ return false;
+
+ // FIXME: Any branches conditional on such a comparison can be made
+ // unconditional. At this time, this happens too infrequently to be worth
+ // the implementation effort, but if that ever changes, we could convert
+ // such a pattern here.
+ case PPC::CMPWI:
+ case PPC::CMPLWI:
+ case PPC::CMPDI:
+ case PPC::CMPLDI: {
+ // Doing this post-RA would require dataflow analysis to reliably find uses
+ // of the CR register set by the compare.
+ // No need to fixup killed/dead flag since this transformation is only valid
+ // before RA.
+ if (PostRA)
+ return false;
+    // If a compare-immediate is fed by an immediate and is itself an input of
+    // an ISEL (the most common case), convert the ISEL into a COPY of the
+    // correct register.
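+    // Illustrative example:
+    //   %c  = LI 4
+    //   %cr = CMPWI %c, 4
+    //   %r  = ISEL %t, %f, %cr.sub_eq
+    // The compare outcome is known, so the ISEL can become "%r = COPY %t".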
+ bool Changed = false;
+ Register DefReg = MI.getOperand(0).getReg();
+ int64_t Comparand = MI.getOperand(2).getImm();
+ int64_t SExtComparand = ((uint64_t)Comparand & ~0x7FFFuLL) != 0
+ ? (Comparand | 0xFFFFFFFFFFFF0000)
+ : Comparand;
+
+ for (auto &CompareUseMI : MRI->use_instructions(DefReg)) {
+ unsigned UseOpc = CompareUseMI.getOpcode();
+ if (UseOpc != PPC::ISEL && UseOpc != PPC::ISEL8)
+ continue;
+ unsigned CRSubReg = CompareUseMI.getOperand(3).getSubReg();
+ Register TrueReg = CompareUseMI.getOperand(1).getReg();
+ Register FalseReg = CompareUseMI.getOperand(2).getReg();
+ unsigned RegToCopy =
+ selectReg(SExtImm, SExtComparand, Opc, TrueReg, FalseReg, CRSubReg);
+ if (RegToCopy == PPC::NoRegister)
+ continue;
+ // Can't use PPC::COPY to copy PPC::ZERO[8]. Convert it to LI[8] 0.
+ if (RegToCopy == PPC::ZERO || RegToCopy == PPC::ZERO8) {
+ CompareUseMI.setDesc(get(UseOpc == PPC::ISEL8 ? PPC::LI8 : PPC::LI));
+ replaceInstrOperandWithImm(CompareUseMI, 1, 0);
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(2);
+ continue;
+ }
+ LLVM_DEBUG(
+ dbgs() << "Found LI -> CMPI -> ISEL, replacing with a copy.\n");
+ LLVM_DEBUG(DefMI.dump(); MI.dump(); CompareUseMI.dump());
+ LLVM_DEBUG(dbgs() << "Is converted to:\n");
+ // Convert to copy and remove unneeded operands.
+ CompareUseMI.setDesc(get(PPC::COPY));
+ CompareUseMI.RemoveOperand(3);
+ CompareUseMI.RemoveOperand(RegToCopy == TrueReg ? 2 : 1);
+ CmpIselsConverted++;
+ Changed = true;
+ LLVM_DEBUG(CompareUseMI.dump());
+ }
+ if (Changed)
+ return true;
+ // This may end up incremented multiple times since this function is called
+ // during a fixed-point transformation, but it is only meant to indicate the
+ // presence of this opportunity.
+ MissedConvertibleImmediateInstrs++;
+ return false;
+ }
+
+  // Immediate forms - may simply be convertible to an LI.
+ case PPC::ADDI:
+ case PPC::ADDI8: {
+ // Does the sum fit in a 16-bit signed field?
+ int64_t Addend = MI.getOperand(2).getImm();
+ if (isInt<16>(Addend + SExtImm)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::ADDI8;
+ NewImm = Addend + SExtImm;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLDICL:
+ case PPC::RLDICL_rec:
+ case PPC::RLDICL_32:
+ case PPC::RLDICL_32_64: {
+ // Use APInt's rotate function.
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ APInt InVal((Opc == PPC::RLDICL || Opc == PPC::RLDICL_rec) ? 64 : 32,
+ SExtImm, true);
+ InVal = InVal.rotl(SH);
+ uint64_t Mask = MB == 0 ? -1LLU : (1LLU << (63 - MB + 1)) - 1;
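+    // (MB == 0 would require a shift by 64, which is undefined behavior, so
+    // it is handled separately as an all-ones mask.)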
+ InVal &= Mask;
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDI_rec which won't sign extend, so that's safe.
+ if (isUInt<15>(InVal.getSExtValue()) ||
+ (Opc == PPC::RLDICL_rec && isUInt<16>(InVal.getSExtValue()))) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc != PPC::RLDICL_32;
+ NewImm = InVal.getSExtValue();
+ SetCR = Opc == PPC::RLDICL_rec;
+ break;
+ }
+ return false;
+ }
+ case PPC::RLWINM:
+ case PPC::RLWINM8:
+ case PPC::RLWINM_rec:
+ case PPC::RLWINM8_rec: {
+ int64_t SH = MI.getOperand(2).getImm();
+ int64_t MB = MI.getOperand(3).getImm();
+ int64_t ME = MI.getOperand(4).getImm();
+ APInt InVal(32, SExtImm, true);
+ InVal = InVal.rotl(SH);
+ APInt Mask = APInt::getBitsSetWithWrap(32, 32 - ME - 1, 32 - MB);
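+    // The mask selects bits MB through ME (IBM bit numbering) of the rotated
+    // value, wrapping around when MB > ME.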
+ InVal &= Mask;
+ // Can't replace negative values with an LI as that will sign-extend
+ // and not clear the left bits. If we're setting the CR bit, we will use
+ // ANDI_rec which won't sign extend, so that's safe.
+ bool ValueFits = isUInt<15>(InVal.getSExtValue());
+ ValueFits |= ((Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec) &&
+ isUInt<16>(InVal.getSExtValue()));
+ if (ValueFits) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::RLWINM8 || Opc == PPC::RLWINM8_rec;
+ NewImm = InVal.getSExtValue();
+ SetCR = Opc == PPC::RLWINM_rec || Opc == PPC::RLWINM8_rec;
+ break;
+ }
+ return false;
+ }
+ case PPC::ORI:
+ case PPC::ORI8:
+ case PPC::XORI:
+ case PPC::XORI8: {
+ int64_t LogicalImm = MI.getOperand(2).getImm();
+ int64_t Result = 0;
+ if (Opc == PPC::ORI || Opc == PPC::ORI8)
+ Result = LogicalImm | SExtImm;
+ else
+ Result = LogicalImm ^ SExtImm;
+ if (isInt<16>(Result)) {
+ ReplaceWithLI = true;
+ Is64BitLI = Opc == PPC::ORI8 || Opc == PPC::XORI8;
+ NewImm = Result;
+ break;
+ }
+ return false;
+ }
+ }
+
+ if (ReplaceWithLI) {
+ // We need to be careful with CR-setting instructions we're replacing.
+ if (SetCR) {
+ // We don't know anything about uses when we're out of SSA, so only
+ // replace if the new immediate will be reproduced.
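+      // With SetCR, the replacement is an ANDI_rec of the original
+      // load-immediate result with NewImm, which computes SExtImm & NewImm;
+      // that must equal NewImm for the value to be unchanged.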
+ bool ImmChanged = (SExtImm & NewImm) != NewImm;
+ if (PostRA && ImmChanged)
+ return false;
+
+ if (!PostRA) {
+ // If the defining load-immediate has no other uses, we can just replace
+ // the immediate with the new immediate.
+ if (MRI->hasOneUse(DefMI.getOperand(0).getReg()))
+ DefMI.getOperand(1).setImm(NewImm);
+
+ // If we're not using the GPR result of the CR-setting instruction, we
+ // just need to and with zero/non-zero depending on the new immediate.
+ else if (MRI->use_empty(MI.getOperand(0).getReg())) {
+ if (NewImm) {
+ assert(Immediate && "Transformation converted zero to non-zero?");
+ NewImm = Immediate;
+ }
+ } else if (ImmChanged)
+ return false;
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Fed by:\n");
+ LLVM_DEBUG(DefMI.dump());
+ LoadImmediateInfo LII;
+ LII.Imm = NewImm;
+ LII.Is64Bit = Is64BitLI;
+ LII.SetCR = SetCR;
+ // If we're setting the CR, the original load-immediate must be kept (as an
+ // operand to ANDI_rec/ANDI8_rec).
+ if (KilledDef && SetCR)
+ *KilledDef = nullptr;
+ replaceInstrWithLI(MI, LII);
+
+ // Fixup killed/dead flag after transformation.
+ // Pattern:
+ // ForwardingOperandReg = LI imm1
+ // y = op2 imm2, ForwardingOperandReg(killed)
+ if (IsForwardingOperandKilled)
+ fixupIsDeadOrKill(DefMI, MI, ForwardingOperandReg);
+
+ LLVM_DEBUG(dbgs() << "With:\n");
+ LLVM_DEBUG(MI.dump());
+ return true;
+ }
+ return false;
+}
+
+bool PPCInstrInfo::transformToNewImmFormFedByAdd(
+ MachineInstr &MI, MachineInstr &DefMI, unsigned OpNoForForwarding) const {
+ MachineRegisterInfo *MRI = &MI.getParent()->getParent()->getRegInfo();
+ bool PostRA = !MRI->isSSA();
+  // FIXME: Extend this to post-RA; getForwardingDefMI needs changes to
+  // support that.
+ if (PostRA)
+ return false;
+
+ // Only handle load/store.
+ if (!MI.mayLoadOrStore())
+ return false;
+
+ unsigned XFormOpcode = RI.getMappedIdxOpcForImmOpc(MI.getOpcode());
+
+ assert((XFormOpcode != PPC::INSTRUCTION_LIST_END) &&
+ "MI must have x-form opcode");
+
+ // get Imm Form info.
+ ImmInstrInfo III;
+ bool IsVFReg = MI.getOperand(0).isReg()
+ ? isVFRegister(MI.getOperand(0).getReg())
+ : false;
+
+ if (!instrHasImmForm(XFormOpcode, IsVFReg, III, PostRA))
+ return false;
+
+ if (!III.IsSummingOperands)
+ return false;
+
+ if (OpNoForForwarding != III.OpNoForForwarding)
+ return false;
+
+ MachineOperand ImmOperandMI = MI.getOperand(III.ImmOpNo);
+ if (!ImmOperandMI.isImm())
+ return false;
+
+ // Check DefMI.
+ MachineOperand *ImmMO = nullptr;
+ MachineOperand *RegMO = nullptr;
+ if (!isDefMIElgibleForForwarding(DefMI, III, ImmMO, RegMO))
+ return false;
+ assert(ImmMO && RegMO && "Imm and Reg operand must have been set");
+
+ // Check Imm.
+  // Use the immediate of the imm-form instruction as the base; the combined
+  // Imm is computed inside isImmElgibleForForwarding.
+ int64_t ImmBase = ImmOperandMI.getImm();
+ int64_t Imm = 0;
+ if (!isImmElgibleForForwarding(*ImmMO, DefMI, III, Imm, ImmBase))
+ return false;
+
+ // Get killed info in case fixup needed after transformation.
+ unsigned ForwardKilledOperandReg = ~0U;
+ if (MI.getOperand(III.OpNoForForwarding).isKill())
+ ForwardKilledOperandReg = MI.getOperand(III.OpNoForForwarding).getReg();
+
+ // Do the transform
+ LLVM_DEBUG(dbgs() << "Replacing instruction:\n");
+ LLVM_DEBUG(MI.dump());
+ LLVM_DEBUG(dbgs() << "Fed by:\n");
+ LLVM_DEBUG(DefMI.dump());
+
+ MI.getOperand(III.OpNoForForwarding).setReg(RegMO->getReg());
+ MI.getOperand(III.OpNoForForwarding).setIsKill(RegMO->isKill());
+ MI.getOperand(III.ImmOpNo).setImm(Imm);
+
+  // FIXME: Fix the kill/dead flags if MI and DefMI are not in the same basic
+  // block.
+ if (DefMI.getParent() == MI.getParent()) {
+ // Check if reg is killed between MI and DefMI.
+ auto IsKilledFor = [&](unsigned Reg) {
+ MachineBasicBlock::const_reverse_iterator It = MI;
+ MachineBasicBlock::const_reverse_iterator E = DefMI;
+ It++;
+ for (; It != E; ++It) {
+ if (It->killsRegister(Reg))
+ return true;
+ }
+ return false;
+ };
+
+ // Update kill flag
+ if (RegMO->isKill() || IsKilledFor(RegMO->getReg()))
+ fixupIsDeadOrKill(DefMI, MI, RegMO->getReg());
+ if (ForwardKilledOperandReg != ~0U)
+ fixupIsDeadOrKill(DefMI, MI, ForwardKilledOperandReg);
+ }
+
+ LLVM_DEBUG(dbgs() << "With:\n");
+ LLVM_DEBUG(MI.dump());
+ return true;
+}
+
// If an X-Form instruction is fed by an add-immediate and one of its operands
// is the literal zero, attempt to forward the source of the add-immediate to
// the corresponding D-Form instruction with the displacement coming from
@@ -3718,8 +4070,15 @@ bool PPCInstrInfo::transformToImmFormFedByAdd(
bool PPCInstrInfo::transformToImmFormFedByLI(MachineInstr &MI,
const ImmInstrInfo &III,
unsigned ConstantOpNo,
- MachineInstr &DefMI,
- int64_t Imm) const {
+ MachineInstr &DefMI) const {
+ // DefMI must be LI or LI8.
+ if ((DefMI.getOpcode() != PPC::LI && DefMI.getOpcode() != PPC::LI8) ||
+ !DefMI.getOperand(1).isImm())
+ return false;
+
+  // Get the Imm operand and sign-extend it to 64 bits.
+ int64_t Imm = SignExtend64<16>(DefMI.getOperand(1).getImm());
+
MachineRegisterInfo &MRI = MI.getParent()->getParent()->getRegInfo();
bool PostRA = !MRI.isSSA();
// Exit early if we can't convert this.
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.h b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
index 2fe8df0e1d68..d98597f48340 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.h
@@ -65,7 +65,9 @@ enum {
NewDef_Shift = 6,
/// This instruction is an X-Form memory operation.
- XFormMemOp = 0x1 << (NewDef_Shift+1)
+ XFormMemOp = 0x1 << NewDef_Shift,
+ /// This instruction is prefixed.
+ Prefixed = 0x1 << (NewDef_Shift+1)
};
} // end namespace PPCII
@@ -108,10 +110,74 @@ struct LoadImmediateInfo {
unsigned SetCR : 1;
};
+// Index into the OpcodesForSpill array.
+enum SpillOpcodeKey {
+ SOK_Int4Spill,
+ SOK_Int8Spill,
+ SOK_Float8Spill,
+ SOK_Float4Spill,
+ SOK_CRSpill,
+ SOK_CRBitSpill,
+ SOK_VRVectorSpill,
+ SOK_VSXVectorSpill,
+ SOK_VectorFloat8Spill,
+ SOK_VectorFloat4Spill,
+ SOK_VRSaveSpill,
+ SOK_QuadFloat8Spill,
+ SOK_QuadFloat4Spill,
+ SOK_QuadBitSpill,
+ SOK_SpillToVSR,
+ SOK_SPESpill,
+ SOK_LastOpcodeSpill // This must be last on the enum.
+};
+
+// Define list of load and store spill opcodes.
+#define Pwr8LoadOpcodes \
+ { \
+ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXVD2X, PPC::LXSDX, PPC::LXSSPX, \
+ PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, PPC::QVLFDXb, \
+ PPC::SPILLTOVSR_LD, PPC::EVLDD \
+ }
+
+#define Pwr9LoadOpcodes \
+ { \
+ PPC::LWZ, PPC::LD, PPC::LFD, PPC::LFS, PPC::RESTORE_CR, \
+ PPC::RESTORE_CRBIT, PPC::LVX, PPC::LXV, PPC::DFLOADf64, \
+ PPC::DFLOADf32, PPC::RESTORE_VRSAVE, PPC::QVLFDX, PPC::QVLFSXs, \
+ PPC::QVLFDXb, PPC::SPILLTOVSR_LD \
+ }
+
+#define Pwr8StoreOpcodes \
+ { \
+ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
+ PPC::STVX, PPC::STXVD2X, PPC::STXSDX, PPC::STXSSPX, PPC::SPILL_VRSAVE, \
+ PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, PPC::SPILLTOVSR_ST, \
+ PPC::EVSTDD \
+ }
+
+#define Pwr9StoreOpcodes \
+ { \
+ PPC::STW, PPC::STD, PPC::STFD, PPC::STFS, PPC::SPILL_CR, PPC::SPILL_CRBIT, \
+ PPC::STVX, PPC::STXV, PPC::DFSTOREf64, PPC::DFSTOREf32, \
+ PPC::SPILL_VRSAVE, PPC::QVSTFDX, PPC::QVSTFSXs, PPC::QVSTFDXb, \
+ PPC::SPILLTOVSR_ST \
+ }
+
+// Initialize arrays for load and store spill opcodes on supported subtargets.
+#define StoreOpcodesForSpill \
+ { Pwr8StoreOpcodes, Pwr9StoreOpcodes }
+#define LoadOpcodesForSpill \
+ { Pwr8LoadOpcodes, Pwr9LoadOpcodes }
+
class PPCSubtarget;
class PPCInstrInfo : public PPCGenInstrInfo {
PPCSubtarget &Subtarget;
const PPCRegisterInfo RI;
+ const unsigned StoreSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
+ StoreOpcodesForSpill;
+ const unsigned LoadSpillOpcodesArray[2][SOK_LastOpcodeSpill] =
+ LoadOpcodesForSpill;
void StoreRegToStackSlot(MachineFunction &MF, unsigned SrcReg, bool isKill,
int FrameIdx, const TargetRegisterClass *RC,
@@ -121,13 +187,21 @@ class PPCInstrInfo : public PPCGenInstrInfo {
const TargetRegisterClass *RC,
SmallVectorImpl<MachineInstr *> &NewMIs) const;
- // If the inst has imm-form and one of its operand is produced by a LI,
- // put the imm into the inst directly and remove the LI if possible.
+  // Replace the instruction with a single LI if possible. \p DefMI must be LI
+  // or LI8.
+ bool simplifyToLI(MachineInstr &MI, MachineInstr &DefMI,
+ unsigned OpNoForForwarding, MachineInstr **KilledDef) const;
+  // If the inst is imm-form and its register operand is produced by an ADDI,
+  // put the imm into the inst directly and remove the ADDI if possible.
+ bool transformToNewImmFormFedByAdd(MachineInstr &MI, MachineInstr &DefMI,
+ unsigned OpNoForForwarding) const;
+  // If the inst is x-form and has an imm-form and one of its operands is
+  // produced by an LI, put the imm into the inst directly and remove the LI
+  // if possible.
bool transformToImmFormFedByLI(MachineInstr &MI, const ImmInstrInfo &III,
- unsigned ConstantOpNo, MachineInstr &DefMI,
- int64_t Imm) const;
- // If the inst has imm-form and one of its operand is produced by an
- // add-immediate, try to transform it when possible.
+ unsigned ConstantOpNo,
+ MachineInstr &DefMI) const;
+  // If the inst is x-form and has an imm-form and one of its operands is
+  // produced by an add-immediate, try to transform it when possible.
bool transformToImmFormFedByAdd(MachineInstr &MI, const ImmInstrInfo &III,
unsigned ConstantOpNo, MachineInstr &DefMI,
bool KillDefMI) const;
@@ -151,13 +225,20 @@ class PPCInstrInfo : public PPCGenInstrInfo {
bool isImmElgibleForForwarding(const MachineOperand &ImmMO,
const MachineInstr &DefMI,
const ImmInstrInfo &III,
- int64_t &Imm) const;
+ int64_t &Imm,
+ int64_t BaseImm = 0) const;
bool isRegElgibleForForwarding(const MachineOperand &RegMO,
const MachineInstr &DefMI,
const MachineInstr &MI, bool KillDefMI,
bool &IsFwdFeederRegKilled) const;
+ unsigned getSpillTarget() const;
const unsigned *getStoreOpcodesForSpillArray() const;
const unsigned *getLoadOpcodesForSpillArray() const;
+ int16_t getFMAOpIdxInfo(unsigned Opcode) const;
+ void reassociateFMA(MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const;
virtual void anchor();
protected:
@@ -187,6 +268,10 @@ public:
bool isXFormMemOp(unsigned Opcode) const {
return get(Opcode).TSFlags & PPCII::XFormMemOp;
}
+ bool isPrefixed(unsigned Opcode) const {
+ return get(Opcode).TSFlags & PPCII::Prefixed;
+ }
+
static bool isSameClassPhysRegCopy(unsigned Opcode) {
unsigned CopyOpcodes[] =
{ PPC::OR, PPC::OR8, PPC::FMR, PPC::VOR, PPC::XXLOR, PPC::XXLORf,
@@ -233,6 +318,20 @@ public:
return true;
}
+ /// When getMachineCombinerPatterns() finds patterns, this function generates
+  /// the instructions that could replace the original code sequence.
+ void genAlternativeCodeSequence(
+ MachineInstr &Root, MachineCombinerPattern Pattern,
+ SmallVectorImpl<MachineInstr *> &InsInstrs,
+ SmallVectorImpl<MachineInstr *> &DelInstrs,
+ DenseMap<unsigned, unsigned> &InstrIdxForVirtReg) const override;
+
+  /// Return true when there is potentially a faster code sequence for an FMA
+ /// chain ending in \p Root. All potential patterns are output in the \p
+ /// P array.
+ bool getFMAPatterns(MachineInstr &Root,
+ SmallVectorImpl<MachineCombinerPattern> &P) const;
+
/// Return true when there is potentially a faster code sequence
/// for an instruction chain ending in <Root>. All potential patterns are
/// output in the <Pattern> array.
@@ -242,8 +341,24 @@ public:
bool isAssociativeAndCommutative(const MachineInstr &Inst) const override;
+  /// On PowerPC, reassociating an FMA chain increases the number of
+  /// instructions, so set the extension resource length limit to 1 for this
+  /// edge case. Resource length is calculated from scaled resource usage in
+  /// getCycles(); because of the division there, getCycles() can return
+  /// different cycle counts than the legacy scaled resource usage, so the new
+  /// resource length may be the same as the legacy one or one larger. We need
+  /// to tolerate the one-larger case, even though the resource length is not
+  /// preserved, to allow more FMA chain reassociations on PowerPC.
+ int getExtendResourceLenLimit() const override { return 1; }
+
+ void setSpecialOperandAttr(MachineInstr &OldMI1, MachineInstr &OldMI2,
+ MachineInstr &NewMI1,
+ MachineInstr &NewMI2) const override;
+
+ void setSpecialOperandAttr(MachineInstr &MI, uint16_t Flags) const override;
+
bool isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
+ Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const override;
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
@@ -273,11 +388,12 @@ public:
// Select analysis.
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int &, int &, int &) const override;
+ Register, Register, Register, int &, int &,
+ int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const override;
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
@@ -285,28 +401,47 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
+ // Emits a register spill without updating the register class for vector
+ // registers. This ensures that when we spill a vector register the
+ // element order in the register is the same as it was in memory.
+ void storeRegToStackSlotNoUpd(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned SrcReg, bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- unsigned getStoreOpcodeForSpill(unsigned Reg,
- const TargetRegisterClass *RC = nullptr) const;
+ // Emits a register reload without updating the register class for vector
+ // registers. This ensures that when we reload a vector register the
+ // element order in the register is the same as it was in memory.
+ void loadRegFromStackSlotNoUpd(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ unsigned DestReg, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const;
+
+ unsigned getStoreOpcodeForSpill(const TargetRegisterClass *RC) const;
- unsigned getLoadOpcodeForSpill(unsigned Reg,
- const TargetRegisterClass *RC = nullptr) const;
+ unsigned getLoadOpcodeForSpill(const TargetRegisterClass *RC) const;
bool
reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
- bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;
+ bool onlyFoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
+ Register Reg) const;
+
// If conversion by predication (only supported by some branch instructions).
// All of the profitability checks always return true; it is always
// profitable to use the predicated branches.
@@ -335,8 +470,6 @@ public:
// Predication support.
bool isPredicated(const MachineInstr &MI) const override;
- bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
-
bool PredicateInstruction(MachineInstr &MI,
ArrayRef<MachineOperand> Pred) const override;
@@ -348,11 +481,11 @@ public:
// Comparison optimization.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask, int &Value) const override;
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &Mask, int &Value) const override;
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int Mask, int Value,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int Mask, int Value,
const MachineRegisterInfo *MRI) const override;
diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.td b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
index b38ca3af63f5..673ab63039cf 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.td
@@ -37,9 +37,6 @@ def SDT_PPCstore_scal_int_from_vsr : SDTypeProfile<0, 3, [
def SDT_PPCVexts : SDTypeProfile<1, 2, [
SDTCisVT<0, f64>, SDTCisVT<1, f64>, SDTCisPtrTy<2>
]>;
-def SDT_PPCSExtVElems : SDTypeProfile<1, 1, [
- SDTCisVec<0>, SDTCisVec<1>
-]>;
def SDT_PPCCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i32>,
SDTCisVT<1, i32> ]>;
@@ -53,6 +50,10 @@ def SDT_PPCVecSplat : SDTypeProfile<1, 2, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisInt<2>
]>;
+def SDT_PPCSpToDp : SDTypeProfile<1, 1, [ SDTCisVT<0, v2f64>,
+ SDTCisInt<1>
+]>;
+
def SDT_PPCVecShift : SDTypeProfile<1, 3, [ SDTCisVec<0>,
SDTCisVec<1>, SDTCisVec<2>, SDTCisPtrTy<3>
]>;
@@ -151,19 +152,19 @@ def PPClxsizx : SDNode<"PPCISD::LXSIZX", SDT_PPCLxsizx,
def PPCstxsix : SDNode<"PPCISD::STXSIX", SDT_PPCstxsix,
[SDNPHasChain, SDNPMayStore]>;
def PPCVexts : SDNode<"PPCISD::VEXTS", SDT_PPCVexts, []>;
-def PPCSExtVElems : SDNode<"PPCISD::SExtVElems", SDT_PPCSExtVElems, []>;
// Extract FPSCR (not modeled at the DAG level).
def PPCmffs : SDNode<"PPCISD::MFFS",
- SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>, []>;
+ SDTypeProfile<1, 0, [SDTCisVT<0, f64>]>,
+ [SDNPHasChain]>;
// Perform FADD in round-to-zero mode.
def PPCfaddrtz: SDNode<"PPCISD::FADDRTZ", SDTFPBinOp, []>;
-def PPCfsel : SDNode<"PPCISD::FSEL",
+def PPCfsel : SDNode<"PPCISD::FSEL",
// Type constraint for fsel.
- SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
+ SDTypeProfile<1, 3, [SDTCisSameAs<0, 2>, SDTCisSameAs<0, 3>,
SDTCisFP<0>, SDTCisVT<1, f64>]>, []>;
def PPCxsmaxc : SDNode<"PPCISD::XSMAXCDP", SDT_PPCFPMinMax, []>;
def PPCxsminc : SDNode<"PPCISD::XSMINCDP", SDT_PPCFPMinMax, []>;
@@ -171,8 +172,6 @@ def PPChi : SDNode<"PPCISD::Hi", SDTIntBinOp, []>;
def PPClo : SDNode<"PPCISD::Lo", SDTIntBinOp, []>;
def PPCtoc_entry: SDNode<"PPCISD::TOC_ENTRY", SDTIntBinOp,
[SDNPMayLoad, SDNPMemOperand]>;
-def PPCvmaddfp : SDNode<"PPCISD::VMADDFP", SDTFPTernaryOp, []>;
-def PPCvnmsubfp : SDNode<"PPCISD::VNMSUBFP", SDTFPTernaryOp, []>;
def PPCppc32GOT : SDNode<"PPCISD::PPC32_GOT", SDTIntLeaf, []>;
@@ -199,6 +198,7 @@ def PPCaddiDtprelL : SDNode<"PPCISD::ADDI_DTPREL_L", SDTIntBinOp>;
def PPCvperm : SDNode<"PPCISD::VPERM", SDT_PPCvperm, []>;
def PPCxxsplt : SDNode<"PPCISD::XXSPLT", SDT_PPCVecSplat, []>;
+def PPCxxspltidp : SDNode<"PPCISD::XXSPLTI_SP_TO_DP", SDT_PPCSpToDp, []>;
def PPCvecinsert : SDNode<"PPCISD::VECINSERT", SDT_PPCVecInsert, []>;
def PPCxxpermdi : SDNode<"PPCISD::XXPERMDI", SDT_PPCxxpermdi, []>;
def PPCvecshl : SDNode<"PPCISD::VECSHL", SDT_PPCVecShift, []>;
@@ -221,6 +221,8 @@ def PPCsrl : SDNode<"PPCISD::SRL" , SDTIntShiftOp>;
def PPCsra : SDNode<"PPCISD::SRA" , SDTIntShiftOp>;
def PPCshl : SDNode<"PPCISD::SHL" , SDTIntShiftOp>;
+def PPCfnmsub : SDNode<"PPCISD::FNMSUB" , SDTFPTernaryOp>;
+
def PPCextswsli : SDNode<"PPCISD::EXTSWSLI" , SDT_PPCextswsli>;
// Move 2 i64 values into a VSX register
@@ -255,6 +257,9 @@ def PPCcall : SDNode<"PPCISD::CALL", SDT_PPCCall,
def PPCcall_nop : SDNode<"PPCISD::CALL_NOP", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
SDNPVariadic]>;
+def PPCcall_notoc : SDNode<"PPCISD::CALL_NOTOC", SDT_PPCCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
def PPCmtctr : SDNode<"PPCISD::MTCTR", SDT_PPCCall,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
def PPCbctrl : SDNode<"PPCISD::BCTRL", SDTNone,
@@ -318,11 +323,32 @@ def SDTDynOp : SDTypeProfile<1, 2, []>;
def SDTDynAreaOp : SDTypeProfile<1, 1, []>;
def PPCdynalloc : SDNode<"PPCISD::DYNALLOC", SDTDynOp, [SDNPHasChain]>;
def PPCdynareaoffset : SDNode<"PPCISD::DYNAREAOFFSET", SDTDynAreaOp, [SDNPHasChain]>;
+def PPCprobedalloca : SDNode<"PPCISD::PROBED_ALLOCA", SDTDynOp, [SDNPHasChain]>;
+
+// PC Relative Specific Nodes
+def PPCmatpcreladdr : SDNode<"PPCISD::MAT_PCREL_ADDR", SDTIntUnaryOp, []>;
//===----------------------------------------------------------------------===//
// PowerPC specific transformation functions and pattern fragments.
//
+// A floating point immediate that is not a positive zero and can be converted
+// to a single precision floating point non-denormal immediate without loss of
+// information.
+def nzFPImmAsi32 : PatLeaf<(fpimm), [{
+ APFloat APFloatOfN = N->getValueAPF();
+ return convertToNonDenormSingle(APFloatOfN) && !N->isExactlyValue(+0.0);
+}]>;
+
+// Convert the floating point immediate into a 32-bit floating point immediate
+// and get an i32 with the resulting bits.
+def getFPAs32BitInt : SDNodeXForm<fpimm, [{
+ APFloat APFloatOfN = N->getValueAPF();
+ convertToNonDenormSingle(APFloatOfN);
+ return CurDAG->getTargetConstant(APFloatOfN.bitcastToAPInt().getZExtValue(),
+ SDLoc(N), MVT::i32);
+}]>;
+
def SHL32 : SDNodeXForm<imm, [{
// Transformation function: 31 - imm
return getI32Imm(31 - N->getZExtValue(), SDLoc(N));
@@ -386,9 +412,10 @@ def immZExt16 : PatLeaf<(imm), [{
// field. Used by instructions like 'ori'.
return (uint64_t)N->getZExtValue() == (unsigned short)N->getZExtValue();
}], LO16>;
-def immNonAllOneAnyExt8 : ImmLeaf<i32, [{
+def immNonAllOneAnyExt8 : ImmLeaf<i32, [{
return (isInt<8>(Imm) && (Imm != -1)) || (isUInt<8>(Imm) && (Imm != 0xFF));
}]>;
+def i32immNonAllOneNonZero : ImmLeaf<i32, [{ return Imm && (Imm != -1); }]>;
def immSExt5NonZero : ImmLeaf<i32, [{ return Imm && isInt<5>(Imm); }]>;
// imm16Shifted* - These match immediates where the low 16-bits are zero. There
@@ -404,7 +431,7 @@ def imm16ShiftedZExt : PatLeaf<(imm), [{
def imm16ShiftedSExt : PatLeaf<(imm), [{
// imm16ShiftedSExt predicate - True if only bits in the top 16-bits of the
- // immediate are set. Used by instructions like 'addis'. Identical to
+ // immediate are set. Used by instructions like 'addis'. Identical to
// imm16ShiftedZExt in 32-bit mode.
if (N->getZExtValue() & 0xFFFF) return false;
if (N->getValueType(0) == MVT::i32)
@@ -723,6 +750,27 @@ def s17imm : Operand<i32> {
let ParserMatchClass = PPCS17ImmAsmOperand;
let DecoderMethod = "decodeSImmOperand<16>";
}
+def PPCS34ImmAsmOperand : AsmOperandClass {
+ let Name = "S34Imm";
+ let PredicateMethod = "isS34Imm";
+ let RenderMethod = "addImmOperands";
+}
+def s34imm : Operand<i64> {
+ let PrintMethod = "printS34ImmOperand";
+ let EncoderMethod = "getImm34Encoding";
+ let ParserMatchClass = PPCS34ImmAsmOperand;
+ let DecoderMethod = "decodeSImmOperand<34>";
+}
+def PPCImmZeroAsmOperand : AsmOperandClass {
+ let Name = "ImmZero";
+ let PredicateMethod = "isImmZero";
+ let RenderMethod = "addImmOperands";
+}
+def immZero : Operand<i32> {
+ let PrintMethod = "printImmZeroOperand";
+ let ParserMatchClass = PPCImmZeroAsmOperand;
+ let DecoderMethod = "decodeImmZeroOperand";
+}
def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
@@ -733,7 +781,9 @@ def PPCDirectBrAsmOperand : AsmOperandClass {
def directbrtarget : Operand<OtherVT> {
let PrintMethod = "printBranchOperand";
let EncoderMethod = "getDirectBrEncoding";
+ let DecoderMethod = "decodeDirectBrTarget";
let ParserMatchClass = PPCDirectBrAsmOperand;
+ let OperandType = "OPERAND_PCREL";
}
def absdirectbrtarget : Operand<OtherVT> {
let PrintMethod = "printAbsBranchOperand";
@@ -747,7 +797,9 @@ def PPCCondBrAsmOperand : AsmOperandClass {
def condbrtarget : Operand<OtherVT> {
let PrintMethod = "printBranchOperand";
let EncoderMethod = "getCondBrEncoding";
+ let DecoderMethod = "decodeCondBrTarget";
let ParserMatchClass = PPCCondBrAsmOperand;
+ let OperandType = "OPERAND_PCREL";
}
def abscondbrtarget : Operand<OtherVT> {
let PrintMethod = "printAbsBranchOperand";
@@ -757,7 +809,7 @@ def abscondbrtarget : Operand<OtherVT> {
def calltarget : Operand<iPTR> {
let PrintMethod = "printBranchOperand";
let EncoderMethod = "getDirectBrEncoding";
- let DecoderMethod = "DecodePCRel24BranchTarget";
+ let DecoderMethod = "decodeDirectBrTarget";
let ParserMatchClass = PPCDirectBrAsmOperand;
let OperandType = "OPERAND_PCREL";
}
@@ -783,6 +835,30 @@ def PPCRegGxRCNoR0Operand : AsmOperandClass {
def ptr_rc_nor0 : Operand<iPTR>, PointerLikeRegClass<1> {
let ParserMatchClass = PPCRegGxRCNoR0Operand;
}
+
+// New addressing modes with 34 bit immediates.
+def PPCDispRI34Operand : AsmOperandClass {
+ let Name = "DispRI34"; let PredicateMethod = "isS34Imm";
+ let RenderMethod = "addImmOperands";
+}
+def dispRI34 : Operand<iPTR> {
+ let ParserMatchClass = PPCDispRI34Operand;
+}
+def memri34 : Operand<iPTR> { // memri, imm is a 34-bit value.
+ let PrintMethod = "printMemRegImm34";
+ let MIOperandInfo = (ops dispRI34:$imm, ptr_rc_nor0:$reg);
+ let EncoderMethod = "getMemRI34Encoding";
+ let DecoderMethod = "decodeMemRI34Operands";
+}
+// memri where the imm is a 34-bit value; used for pc-relative instructions
+// where the base register is set to zero.
+def memri34_pcrel : Operand<iPTR> { // memri, imm is a 34-bit value.
+ let PrintMethod = "printMemRegImm34PCRel";
+ let MIOperandInfo = (ops dispRI34:$imm, immZero:$reg);
+ let EncoderMethod = "getMemRI34PCRelEncoding";
+ let DecoderMethod = "decodeMemRI34PCRelOperands";
+}
+
// A version of ptr_rc usable with the asm parser.
def PPCRegGxRCOperand : AsmOperandClass {
let Name = "RegGxRC"; let PredicateMethod = "isRegNumber";
@@ -876,7 +952,7 @@ def spe2dis : Operand<iPTR> { // SPE displacement where the imm is 2-aligned.
}
// A single-register address. This is used with the SjLj
-// pseudo-instructions which tranlates to LD/LWZ. These instructions requires
+// pseudo-instructions which translate to LD/LWZ. These instructions require
// G8RC_NOX0 registers.
def memr : Operand<iPTR> {
let MIOperandInfo = (ops ptr_rc_nor0:$ptrreg);
@@ -913,11 +989,11 @@ def iaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrImmX16", [], []>; // "stxv"
// The forms below are all x-form addressing modes; use three different ones so
// we can make an accurate check for x-form instructions in ISEL.
-// x-form addressing mode whose associated diplacement form is D.
+// x-form addressing mode whose associated displacement form is D.
def xaddr : ComplexPattern<iPTR, 2, "SelectAddrIdx", [], []>; // "stbx"
-// x-form addressing mode whose associated diplacement form is DS.
+// x-form addressing mode whose associated displacement form is DS.
def xaddrX4 : ComplexPattern<iPTR, 2, "SelectAddrIdxX4", [], []>; // "stdx"
-// x-form addressing mode whose associated diplacement form is DQ.
+// x-form addressing mode whose associated displacement form is DQ.
def xaddrX16 : ComplexPattern<iPTR, 2, "SelectAddrIdxX16", [], []>; // "stxvx"
def xoaddr : ComplexPattern<iPTR, 2, "SelectAddrIdxOnly",[], []>;
@@ -929,26 +1005,32 @@ def addr : ComplexPattern<iPTR, 1, "SelectAddr",[], []>;
/// This is just the offset part of iaddr, used for preinc.
def iaddroff : ComplexPattern<iPTR, 1, "SelectAddrImmOffs", [], []>;
+// PC Relative Address
+def pcreladdr : ComplexPattern<iPTR, 1, "SelectAddrPCRel", [], []>;
+
//===----------------------------------------------------------------------===//
// PowerPC Instruction Predicate Definitions.
-def In32BitMode : Predicate<"!PPCSubTarget->isPPC64()">;
-def In64BitMode : Predicate<"PPCSubTarget->isPPC64()">;
-def IsBookE : Predicate<"PPCSubTarget->isBookE()">;
-def IsNotBookE : Predicate<"!PPCSubTarget->isBookE()">;
-def HasOnlyMSYNC : Predicate<"PPCSubTarget->hasOnlyMSYNC()">;
-def HasSYNC : Predicate<"!PPCSubTarget->hasOnlyMSYNC()">;
-def IsPPC4xx : Predicate<"PPCSubTarget->isPPC4xx()">;
-def IsPPC6xx : Predicate<"PPCSubTarget->isPPC6xx()">;
-def IsE500 : Predicate<"PPCSubTarget->isE500()">;
-def HasSPE : Predicate<"PPCSubTarget->hasSPE()">;
-def HasICBT : Predicate<"PPCSubTarget->hasICBT()">;
-def HasPartwordAtomics : Predicate<"PPCSubTarget->hasPartwordAtomics()">;
-def NoNaNsFPMath : Predicate<"TM.Options.NoNaNsFPMath">;
-def NaNsFPMath : Predicate<"!TM.Options.NoNaNsFPMath">;
-def HasBPERMD : Predicate<"PPCSubTarget->hasBPERMD()">;
-def HasExtDiv : Predicate<"PPCSubTarget->hasExtDiv()">;
-def IsISA3_0 : Predicate<"PPCSubTarget->isISA3_0()">;
-def HasFPU : Predicate<"PPCSubTarget->hasFPU()">;
+def In32BitMode : Predicate<"!Subtarget->isPPC64()">;
+def In64BitMode : Predicate<"Subtarget->isPPC64()">;
+def IsBookE : Predicate<"Subtarget->isBookE()">;
+def IsNotBookE : Predicate<"!Subtarget->isBookE()">;
+def HasOnlyMSYNC : Predicate<"Subtarget->hasOnlyMSYNC()">;
+def HasSYNC : Predicate<"!Subtarget->hasOnlyMSYNC()">;
+def IsPPC4xx : Predicate<"Subtarget->isPPC4xx()">;
+def IsPPC6xx : Predicate<"Subtarget->isPPC6xx()">;
+def IsE500 : Predicate<"Subtarget->isE500()">;
+def HasSPE : Predicate<"Subtarget->hasSPE()">;
+def HasICBT : Predicate<"Subtarget->hasICBT()">;
+def HasPartwordAtomics : Predicate<"Subtarget->hasPartwordAtomics()">;
+def NoNaNsFPMath
+ : Predicate<"Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
+def NaNsFPMath
+ : Predicate<"!Subtarget->getTargetMachine().Options.NoNaNsFPMath">;
+def HasBPERMD : Predicate<"Subtarget->hasBPERMD()">;
+def HasExtDiv : Predicate<"Subtarget->hasExtDiv()">;
+def IsISA3_0 : Predicate<"Subtarget->isISA3_0()">;
+def HasFPU : Predicate<"Subtarget->hasFPU()">;
+def PCRelativeMemops : Predicate<"Subtarget->hasPCRelativeMemops()">;
//===----------------------------------------------------------------------===//
// PowerPC Multiclass Definitions.
@@ -1318,7 +1400,20 @@ def DYNALLOC : PPCEmitTimePseudo<(outs gprc:$result), (ins gprc:$negsize, memri:
(PPCdynalloc i32:$negsize, iaddr:$fpsi))]>;
def DYNAREAOFFSET : PPCEmitTimePseudo<(outs i32imm:$result), (ins memri:$fpsi), "#DYNAREAOFFSET",
[(set i32:$result, (PPCdynareaoffset iaddr:$fpsi))]>;
-
+// Probed alloca to support stack clash protection.
+let Defs = [R1], Uses = [R1], hasNoSchedulingInfo = 1 in {
+def PROBED_ALLOCA_32 : PPCCustomInserterPseudo<(outs gprc:$result),
+ (ins gprc:$negsize, memri:$fpsi), "#PROBED_ALLOCA_32",
+ [(set i32:$result,
+ (PPCprobedalloca i32:$negsize, iaddr:$fpsi))]>;
+def PREPARE_PROBED_ALLOCA_32 : PPCEmitTimePseudo<(outs gprc:$fp,
+ gprc:$sp),
+ (ins gprc:$negsize, memri:$fpsi), "#PREPARE_PROBED_ALLOCA_32", []>;
+def PROBED_STACKALLOC_32 : PPCEmitTimePseudo<(outs gprc:$scratch, gprc:$temp),
+ (ins i64imm:$stacksize),
+ "#PROBED_STACKALLOC_32", []>;
+}
+
// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
// instruction selection into a branch sequence.
let PPC970_Single = 1 in {
@@ -1412,7 +1507,7 @@ let isTerminator = 1, isBarrier = 1, PPC970_Unit = 7 in {
}
// Set the float rounding mode.
-let Uses = [RM], Defs = [RM] in {
+let Uses = [RM], Defs = [RM] in {
def SETRNDi : PPCCustomInserterPseudo<(outs f8rc:$FRT), (ins u2imm:$RND),
"#SETRNDi", [(set f64:$FRT, (int_ppc_setrnd (i32 imm:$RND)))]>;
@@ -1724,6 +1819,8 @@ def RFEBB : XLForm_S<19, 146, (outs), (ins u1imm:$imm), "rfebb $imm",
IIC_BrB, [(PPCrfebb (i32 imm:$imm))]>,
PPC970_DGroup_Single;
+def : InstAlias<"rfebb", (RFEBB 1)>;
+
// DCB* instructions.
def DCBA : DCB_Form<758, 0, (outs), (ins memrr:$dst), "dcba $dst",
IIC_LdStDCBF, [(int_ppc_dcba xoaddr:$dst)]>,
@@ -1777,6 +1874,11 @@ def : Pat<(prefetch xoaddr:$dst, (i32 1), imm, (i32 1)),
def : Pat<(prefetch xoaddr:$dst, (i32 0), imm, (i32 0)),
(ICBT 0, xoaddr:$dst)>, Requires<[HasICBT]>; // inst prefetch (for read)
+def : Pat<(int_ppc_dcbt_with_hint xoaddr:$dst, i32:$TH),
+ (DCBT i32:$TH, xoaddr:$dst)>;
+def : Pat<(int_ppc_dcbtst_with_hint xoaddr:$dst, i32:$TH),
+ (DCBTST i32:$TH, xoaddr:$dst)>;
+
// Atomic operations
// FIXME: some of these might be used with constant operands. This will result
// in constant materialization instructions that may be redundant. We currently
@@ -1969,7 +2071,7 @@ def TD : XForm_1<31, 68, (outs), (ins u5imm:$to, g8rc:$rA, g8rc:$rB),
// PPC32 Load Instructions.
//
-// Unindexed (r+i) Loads.
+// Unindexed (r+i) Loads.
let PPC970_Unit = 2 in {
def LBZ : DForm_1<34, (outs gprc:$rD), (ins memri:$src),
"lbz $rD, $src", IIC_LdStLoad,
@@ -2112,7 +2214,7 @@ def LFIWZX : XForm_25_memOp<31, 887, (outs f8rc:$frD), (ins memrr:$src),
}
// Load Multiple
-let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
+let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in
def LMW : DForm_1<46, (outs gprc:$rD), (ins memri:$src),
"lmw $rD, $src", IIC_LdStLMW, []>;
@@ -2281,10 +2383,15 @@ let isCodeGenOnly = 1 in {
}
}
+// We used to have EIEIO as value but E[0-9A-Z] is a reserved name
+def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
+ "eieio", IIC_LdStLoad, []>;
+
def : Pat<(int_ppc_sync), (SYNC 0)>, Requires<[HasSYNC]>;
def : Pat<(int_ppc_lwsync), (SYNC 1)>, Requires<[HasSYNC]>;
def : Pat<(int_ppc_sync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
def : Pat<(int_ppc_lwsync), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
+def : Pat<(int_ppc_eieio), (EnforceIEIO)>;
//===----------------------------------------------------------------------===//
// PPC32 Arithmetic Instructions.
@@ -2331,6 +2438,9 @@ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
}
}
+def : InstAlias<"li $rD, $imm", (ADDI gprc:$rD, ZERO, s16imm:$imm)>;
+def : InstAlias<"lis $rD, $imm", (ADDIS gprc:$rD, ZERO, s17imm:$imm)>;
+
let PPC970_Unit = 1 in { // FXU Operations.
let Defs = [CR0] in {
def ANDI_rec : DForm_4<28, (outs gprc:$dst), (ins gprc:$src1, u16imm:$src2),
@@ -2419,6 +2529,14 @@ defm SRAW : XForm_6rc<31, 792, (outs gprc:$rA), (ins gprc:$rS, gprc:$rB),
[(set i32:$rA, (PPCsra i32:$rS, i32:$rB))]>;
}
+def : InstAlias<"mr $rA, $rB", (OR gprc:$rA, gprc:$rB, gprc:$rB)>;
+def : InstAlias<"mr. $rA, $rB", (OR_rec gprc:$rA, gprc:$rB, gprc:$rB)>;
+
+def : InstAlias<"not $rA, $rS", (NOR gprc:$rA, gprc:$rS, gprc:$rS)>;
+def : InstAlias<"not. $rA, $rS", (NOR_rec gprc:$rA, gprc:$rS, gprc:$rS)>;
+
+def : InstAlias<"nop", (ORI R0, R0, 0)>;
+
let PPC970_Unit = 1 in { // FXU Operations.
let hasSideEffects = 0 in {
defm SRAWI : XForm_10rc<31, 824, (outs gprc:$rA), (ins gprc:$rS, u5imm:$SH),
@@ -2465,7 +2583,7 @@ def FTDIV: XForm_17<63, 128, (outs crrc:$crD), (ins f8rc:$fA, f8rc:$fB),
def FTSQRT: XForm_17a<63, 160, (outs crrc:$crD), (ins f8rc:$fB),
"ftsqrt $crD, $fB", IIC_FPCompare>;
-let Uses = [RM] in {
+let Uses = [RM], mayRaiseFPException = 1 in {
let hasSideEffects = 0 in {
defm FCTIW : XForm_26r<63, 14, (outs f8rc:$frD), (ins f8rc:$frB),
"fctiw", "$frD, $frB", IIC_FPGeneral,
@@ -2479,46 +2597,46 @@ let Uses = [RM] in {
defm FRSP : XForm_26r<63, 12, (outs f4rc:$frD), (ins f8rc:$frB),
"frsp", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (fpround f64:$frB))]>;
+ [(set f32:$frD, (any_fpround f64:$frB))]>;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIND : XForm_26r<63, 392, (outs f8rc:$frD), (ins f8rc:$frB),
"frin", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (fround f64:$frB))]>;
+ [(set f64:$frD, (any_fround f64:$frB))]>;
defm FRINS : XForm_26r<63, 392, (outs f4rc:$frD), (ins f4rc:$frB),
"frin", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (fround f32:$frB))]>;
+ [(set f32:$frD, (any_fround f32:$frB))]>;
}
let hasSideEffects = 0 in {
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIPD : XForm_26r<63, 456, (outs f8rc:$frD), (ins f8rc:$frB),
"frip", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (fceil f64:$frB))]>;
+ [(set f64:$frD, (any_fceil f64:$frB))]>;
defm FRIPS : XForm_26r<63, 456, (outs f4rc:$frD), (ins f4rc:$frB),
"frip", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (fceil f32:$frB))]>;
+ [(set f32:$frD, (any_fceil f32:$frB))]>;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIZD : XForm_26r<63, 424, (outs f8rc:$frD), (ins f8rc:$frB),
"friz", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (ftrunc f64:$frB))]>;
+ [(set f64:$frD, (any_ftrunc f64:$frB))]>;
defm FRIZS : XForm_26r<63, 424, (outs f4rc:$frD), (ins f4rc:$frB),
"friz", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (ftrunc f32:$frB))]>;
+ [(set f32:$frD, (any_ftrunc f32:$frB))]>;
let Interpretation64Bit = 1, isCodeGenOnly = 1 in
defm FRIMD : XForm_26r<63, 488, (outs f8rc:$frD), (ins f8rc:$frB),
"frim", "$frD, $frB", IIC_FPGeneral,
- [(set f64:$frD, (ffloor f64:$frB))]>;
+ [(set f64:$frD, (any_ffloor f64:$frB))]>;
defm FRIMS : XForm_26r<63, 488, (outs f4rc:$frD), (ins f4rc:$frB),
"frim", "$frD, $frB", IIC_FPGeneral,
- [(set f32:$frD, (ffloor f32:$frB))]>;
+ [(set f32:$frD, (any_ffloor f32:$frB))]>;
defm FSQRT : XForm_26r<63, 22, (outs f8rc:$frD), (ins f8rc:$frB),
"fsqrt", "$frD, $frB", IIC_FPSqrtD,
- [(set f64:$frD, (fsqrt f64:$frB))]>;
+ [(set f64:$frD, (any_fsqrt f64:$frB))]>;
defm FSQRTS : XForm_26r<59, 22, (outs f4rc:$frD), (ins f4rc:$frB),
"fsqrts", "$frD, $frB", IIC_FPSqrtS,
- [(set f32:$frD, (fsqrt f32:$frB))]>;
+ [(set f32:$frD, (any_fsqrt f32:$frB))]>;
}
}
}
@@ -2786,6 +2904,8 @@ def MCRXRX : X_BF3<31, 576, (outs crrc:$BF), (ins),
"mcrxrx $BF", IIC_BrMCRX>, Requires<[IsISA3_0]>;
} // hasSideEffects = 0
+def : InstAlias<"mtcr $rA", (MTCRF 255, gprc:$rA)>;
+
let Predicates = [HasFPU] in {
// Custom inserter instruction to perform FADD in round-to-zero mode.
let Uses = [RM] in {
@@ -2795,7 +2915,7 @@ let Uses = [RM] in {
// The above pseudo gets expanded to make use of the following instructions
// to manipulate FPSCR. Note that FPSCR is not modeled at the DAG level.
-let Uses = [RM], Defs = [RM] in {
+let Uses = [RM], Defs = [RM] in {
def MTFSB0 : XForm_43<63, 70, (outs), (ins u5imm:$FM),
"mtfsb0 $FM", IIC_IntMTFSB0, []>,
PPC970_DGroup_Single, PPC970_Unit_FPU;
@@ -2931,49 +3051,54 @@ defm SUBFZE : XOForm_3rc<31, 200, 0, (outs gprc:$rT), (ins gprc:$rA),
}
}
+def : InstAlias<"sub $rA, $rB, $rC", (SUBF gprc:$rA, gprc:$rC, gprc:$rB)>;
+def : InstAlias<"sub. $rA, $rB, $rC", (SUBF_rec gprc:$rA, gprc:$rC, gprc:$rB)>;
+def : InstAlias<"subc $rA, $rB, $rC", (SUBFC gprc:$rA, gprc:$rC, gprc:$rB)>;
+def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC_rec gprc:$rA, gprc:$rC, gprc:$rB)>;
+
// A-Form instructions. Most of the instructions executed in the FPU are of
// this type.
//
let PPC970_Unit = 3, hasSideEffects = 0, Predicates = [HasFPU] in { // FPU Operations.
let Uses = [RM] in {
let isCommutable = 1 in {
- defm FMADD : AForm_1r<63, 29,
+ defm FMADD : AForm_1r<63, 29,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
"fmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set f64:$FRT, (fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
+ [(set f64:$FRT, (any_fma f64:$FRA, f64:$FRC, f64:$FRB))]>;
defm FMADDS : AForm_1r<59, 29,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
"fmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
- [(set f32:$FRT, (fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
+ [(set f32:$FRT, (any_fma f32:$FRA, f32:$FRC, f32:$FRB))]>;
defm FMSUB : AForm_1r<63, 28,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
"fmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set f64:$FRT,
- (fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
+ (any_fma f64:$FRA, f64:$FRC, (fneg f64:$FRB)))]>;
defm FMSUBS : AForm_1r<59, 28,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
"fmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT,
- (fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
+ (any_fma f32:$FRA, f32:$FRC, (fneg f32:$FRB)))]>;
defm FNMADD : AForm_1r<63, 31,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
"fnmadd", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set f64:$FRT,
- (fneg (fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
+ (fneg (any_fma f64:$FRA, f64:$FRC, f64:$FRB)))]>;
defm FNMADDS : AForm_1r<59, 31,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
"fnmadds", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT,
- (fneg (fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
+ (fneg (any_fma f32:$FRA, f32:$FRC, f32:$FRB)))]>;
defm FNMSUB : AForm_1r<63, 30,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC, f8rc:$FRB),
"fnmsub", "$FRT, $FRA, $FRC, $FRB", IIC_FPFused,
- [(set f64:$FRT, (fneg (fma f64:$FRA, f64:$FRC,
+ [(set f64:$FRT, (fneg (any_fma f64:$FRA, f64:$FRC,
(fneg f64:$FRB))))]>;
defm FNMSUBS : AForm_1r<59, 30,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC, f4rc:$FRB),
"fnmsubs", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
- [(set f32:$FRT, (fneg (fma f32:$FRA, f32:$FRC,
+ [(set f32:$FRT, (fneg (any_fma f32:$FRA, f32:$FRC,
(fneg f32:$FRB))))]>;
} // isCommutable
}
@@ -2990,43 +3115,43 @@ defm FSELS : AForm_1r<63, 23,
(outs f4rc:$FRT), (ins f8rc:$FRA, f4rc:$FRC, f4rc:$FRB),
"fsel", "$FRT, $FRA, $FRC, $FRB", IIC_FPGeneral,
[(set f32:$FRT, (PPCfsel f64:$FRA, f32:$FRC, f32:$FRB))]>;
-let Uses = [RM] in {
+let Uses = [RM], mayRaiseFPException = 1 in {
let isCommutable = 1 in {
defm FADD : AForm_2r<63, 21,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
"fadd", "$FRT, $FRA, $FRB", IIC_FPAddSub,
- [(set f64:$FRT, (fadd f64:$FRA, f64:$FRB))]>;
+ [(set f64:$FRT, (any_fadd f64:$FRA, f64:$FRB))]>;
defm FADDS : AForm_2r<59, 21,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
"fadds", "$FRT, $FRA, $FRB", IIC_FPGeneral,
- [(set f32:$FRT, (fadd f32:$FRA, f32:$FRB))]>;
+ [(set f32:$FRT, (any_fadd f32:$FRA, f32:$FRB))]>;
} // isCommutable
defm FDIV : AForm_2r<63, 18,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
"fdiv", "$FRT, $FRA, $FRB", IIC_FPDivD,
- [(set f64:$FRT, (fdiv f64:$FRA, f64:$FRB))]>;
+ [(set f64:$FRT, (any_fdiv f64:$FRA, f64:$FRB))]>;
defm FDIVS : AForm_2r<59, 18,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
"fdivs", "$FRT, $FRA, $FRB", IIC_FPDivS,
- [(set f32:$FRT, (fdiv f32:$FRA, f32:$FRB))]>;
+ [(set f32:$FRT, (any_fdiv f32:$FRA, f32:$FRB))]>;
let isCommutable = 1 in {
defm FMUL : AForm_3r<63, 25,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRC),
"fmul", "$FRT, $FRA, $FRC", IIC_FPFused,
- [(set f64:$FRT, (fmul f64:$FRA, f64:$FRC))]>;
+ [(set f64:$FRT, (any_fmul f64:$FRA, f64:$FRC))]>;
defm FMULS : AForm_3r<59, 25,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRC),
"fmuls", "$FRT, $FRA, $FRC", IIC_FPGeneral,
- [(set f32:$FRT, (fmul f32:$FRA, f32:$FRC))]>;
+ [(set f32:$FRT, (any_fmul f32:$FRA, f32:$FRC))]>;
} // isCommutable
defm FSUB : AForm_2r<63, 20,
(outs f8rc:$FRT), (ins f8rc:$FRA, f8rc:$FRB),
"fsub", "$FRT, $FRA, $FRB", IIC_FPAddSub,
- [(set f64:$FRT, (fsub f64:$FRA, f64:$FRB))]>;
+ [(set f64:$FRT, (any_fsub f64:$FRA, f64:$FRB))]>;
defm FSUBS : AForm_2r<59, 20,
(outs f4rc:$FRT), (ins f4rc:$FRA, f4rc:$FRB),
"fsubs", "$FRT, $FRA, $FRB", IIC_FPGeneral,
- [(set f32:$FRT, (fsub f32:$FRA, f32:$FRB))]>;
+ [(set f32:$FRT, (any_fsub f32:$FRA, f32:$FRB))]>;
}
}
@@ -3158,16 +3283,16 @@ def : Pat<(add i32:$in, (PPChi tblockaddress:$g, 0)),
(ADDIS $in, tblockaddress:$g)>;
// Support for thread-local storage.
-def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
+def PPC32GOT: PPCEmitTimePseudo<(outs gprc:$rD), (ins), "#PPC32GOT",
[(set i32:$rD, (PPCppc32GOT))]>;
// Get the _GLOBAL_OFFSET_TABLE_ in PIC mode.
// This uses two output registers, the first as the real output, the second as a
// temporary register, used internally in code generation.
-def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
+def PPC32PICGOT: PPCEmitTimePseudo<(outs gprc:$rD, gprc:$rT), (ins), "#PPC32PICGOT",
[]>, NoEncode<"$rT">;
-def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
+def LDgotTprelL32: PPCEmitTimePseudo<(outs gprc_nor0:$rD), (ins s16imm:$disp, gprc_nor0:$reg),
"#LDgotTprelL32",
[(set i32:$rD,
(PPCldGotTprelL tglobaltlsaddr:$disp, i32:$reg))]>;
@@ -3302,15 +3427,19 @@ def : Pat<(atomic_fence (timm), (timm)), (SYNC 1)>, Requires<[HasSYNC]>;
def : Pat<(atomic_fence (timm), (timm)), (MSYNC)>, Requires<[HasOnlyMSYNC]>;
let Predicates = [HasFPU] in {
-// Additional FNMSUB patterns: -a*c + b == -(a*c - b)
-def : Pat<(fma (fneg f64:$A), f64:$C, f64:$B),
- (FNMSUB $A, $C, $B)>;
-def : Pat<(fma f64:$A, (fneg f64:$C), f64:$B),
- (FNMSUB $A, $C, $B)>;
-def : Pat<(fma (fneg f32:$A), f32:$C, f32:$B),
- (FNMSUBS $A, $C, $B)>;
-def : Pat<(fma f32:$A, (fneg f32:$C), f32:$B),
- (FNMSUBS $A, $C, $B)>;
+// Additional fnmsub patterns for custom node
+def : Pat<(PPCfnmsub f64:$A, f64:$B, f64:$C),
+ (FNMSUB $A, $B, $C)>;
+def : Pat<(PPCfnmsub f32:$A, f32:$B, f32:$C),
+ (FNMSUBS $A, $B, $C)>;
+def : Pat<(fneg (PPCfnmsub f64:$A, f64:$B, f64:$C)),
+ (FMSUB $A, $B, $C)>;
+def : Pat<(fneg (PPCfnmsub f32:$A, f32:$B, f32:$C)),
+ (FMSUBS $A, $B, $C)>;
+def : Pat<(PPCfnmsub f64:$A, f64:$B, (fneg f64:$C)),
+ (FNMADD $A, $B, $C)>;
+def : Pat<(PPCfnmsub f32:$A, f32:$B, (fneg f32:$C)),
+ (FNMADDS $A, $B, $C)>;
// FCOPYSIGN's operand types need not agree.
def : Pat<(fcopysign f64:$frB, f32:$frA),
@@ -3331,6 +3460,10 @@ def crnot : OutPatFrag<(ops node:$in),
def : Pat<(not i1:$in),
(crnot $in)>;
+// Prefixed instructions may require access to the above defs at a later
+// time, so we include this after the def.
+include "PPCInstrPrefix.td"
+
// Patterns for arithmetic i1 operations.
def : Pat<(add i1:$a, i1:$b),
(CRXOR $a, $b)>;
@@ -3510,7 +3643,7 @@ defm : ExtSetCCPat<SETEQ,
(RLWINM (CNTLZW $in), 27, 31, 31)>,
OutPatFrag<(ops node:$in),
(RLDICL (CNTLZD $in), 58, 63)> >;
-
+
defm : ExtSetCCPat<SETNE,
PatFrag<(ops node:$in, node:$cc),
(setcc $in, 0, $cc)>,
@@ -3518,7 +3651,7 @@ defm : ExtSetCCPat<SETNE,
(RLWINM (i32not (CNTLZW $in)), 27, 31, 31)>,
OutPatFrag<(ops node:$in),
(RLDICL (i64not (CNTLZD $in)), 58, 63)> >;
-
+
defm : ExtSetCCPat<SETLT,
PatFrag<(ops node:$in, node:$cc),
(setcc $in, 0, $cc)>,
@@ -4128,10 +4261,6 @@ def ISYNC : XLForm_2_ext<19, 150, 0, 0, 0, (outs), (ins),
def ICBI : XForm_1a<31, 982, (outs), (ins memrr:$src),
"icbi $src", IIC_LdStICBI, []>;
-// We used to have EIEIO as value but E[0-9A-Z] is a reserved name
-def EnforceIEIO : XForm_24_eieio<31, 854, (outs), (ins),
- "eieio", IIC_LdStLoad, []>;
-
def WAIT : XForm_24_sync<31, 30, (outs), (ins i32imm:$L),
"wait $L", IIC_LdStLoad, []>;
@@ -4421,17 +4550,53 @@ def DCBFx : PPCAsmPseudo<"dcbf $dst", (ins memrr:$dst)>;
def DCBFL : PPCAsmPseudo<"dcbfl $dst", (ins memrr:$dst)>;
def DCBFLP : PPCAsmPseudo<"dcbflp $dst", (ins memrr:$dst)>;
+def : Pat<(int_ppc_isync), (ISYNC)>;
+def : Pat<(int_ppc_dcbfl xoaddr:$dst),
+ (DCBF 1, xoaddr:$dst)>;
+def : Pat<(int_ppc_dcbflp xoaddr:$dst),
+ (DCBF 3, xoaddr:$dst)>;
+
def : InstAlias<"crset $bx", (CREQV crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
def : InstAlias<"crclr $bx", (CRXOR crbitrc:$bx, crbitrc:$bx, crbitrc:$bx)>;
def : InstAlias<"crmove $bx, $by", (CROR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
def : InstAlias<"crnot $bx, $by", (CRNOR crbitrc:$bx, crbitrc:$by, crbitrc:$by)>;
+def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
+def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
+
+def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+
+foreach BR = 0-7 in {
+ def : InstAlias<"mfbr"#BR#" $Rx",
+ (MFDCR gprc:$Rx, !add(BR, 0x80))>,
+ Requires<[IsPPC4xx]>;
+ def : InstAlias<"mtbr"#BR#" $Rx",
+ (MTDCR gprc:$Rx, !add(BR, 0x80))>,
+ Requires<[IsPPC4xx]>;
+}
+
+def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
+def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
+
def : InstAlias<"mtxer $Rx", (MTSPR 1, gprc:$Rx)>;
def : InstAlias<"mfxer $Rx", (MFSPR gprc:$Rx, 1)>;
+def : InstAlias<"mtudscr $Rx", (MTSPR 3, gprc:$Rx)>;
+def : InstAlias<"mfudscr $Rx", (MFSPR gprc:$Rx, 3)>;
+
def : InstAlias<"mfrtcu $Rx", (MFSPR gprc:$Rx, 4)>;
def : InstAlias<"mfrtcl $Rx", (MFSPR gprc:$Rx, 5)>;
+def : InstAlias<"mtlr $Rx", (MTSPR 8, gprc:$Rx)>;
+def : InstAlias<"mflr $Rx", (MFSPR gprc:$Rx, 8)>;
+
+def : InstAlias<"mtctr $Rx", (MTSPR 9, gprc:$Rx)>;
+def : InstAlias<"mfctr $Rx", (MFSPR gprc:$Rx, 9)>;
+
+def : InstAlias<"mtuamr $Rx", (MTSPR 13, gprc:$Rx)>;
+def : InstAlias<"mfuamr $Rx", (MFSPR gprc:$Rx, 13)>;
+
def : InstAlias<"mtdscr $Rx", (MTSPR 17, gprc:$Rx)>;
def : InstAlias<"mfdscr $Rx", (MFSPR gprc:$Rx, 17)>;
@@ -4453,12 +4618,6 @@ def : InstAlias<"mfsrr0 $Rx", (MFSPR gprc:$Rx, 26)>;
def : InstAlias<"mtsrr1 $Rx", (MTSPR 27, gprc:$Rx)>;
def : InstAlias<"mfsrr1 $Rx", (MFSPR gprc:$Rx, 27)>;
-def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>;
-
-def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>;
-
def : InstAlias<"mtcfar $Rx", (MTSPR 28, gprc:$Rx)>;
def : InstAlias<"mfcfar $Rx", (MFSPR gprc:$Rx, 28)>;
@@ -4468,27 +4627,34 @@ def : InstAlias<"mfamr $Rx", (MFSPR gprc:$Rx, 29)>;
def : InstAlias<"mtpid $Rx", (MTSPR 48, gprc:$Rx)>, Requires<[IsBookE]>;
def : InstAlias<"mfpid $Rx", (MFSPR gprc:$Rx, 48)>, Requires<[IsBookE]>;
-def : InstAlias<"mftb $Rx", (MFTB gprc:$Rx, 268)>;
-def : InstAlias<"mftbl $Rx", (MFTB gprc:$Rx, 268)>;
-def : InstAlias<"mftbu $Rx", (MFTB gprc:$Rx, 269)>;
-
-def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
-def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;
+foreach SPRG = 4-7 in {
+ def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
+ Requires<[IsBookE]>;
+ def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>,
+ Requires<[IsBookE]>;
+ def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+ Requires<[IsBookE]>;
+ def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
+ Requires<[IsBookE]>;
+}
-def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+foreach SPRG = 0-3 in {
+ def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>;
+ def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>;
+ def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+ def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
+}
-def : InstAlias<"xnop", (XORI R0, R0, 0)>;
+def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
+def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;
-def : InstAlias<"mr $rA, $rB", (OR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
-def : InstAlias<"mr. $rA, $rB", (OR8_rec g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mttbl $Rx", (MTSPR 284, gprc:$Rx)>;
+def : InstAlias<"mttbu $Rx", (MTSPR 285, gprc:$Rx)>;
-def : InstAlias<"not $rA, $rB", (NOR8 g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
-def : InstAlias<"not. $rA, $rB", (NOR8_rec g8rc:$rA, g8rc:$rB, g8rc:$rB)>;
+def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
-def : InstAlias<"mtcr $rA", (MTCRF8 255, g8rc:$rA)>;
+def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
+def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;
foreach BATR = 0-3 in {
def : InstAlias<"mtdbatu "#BATR#", $Rx",
@@ -4517,86 +4683,36 @@ foreach BATR = 0-3 in {
Requires<[IsPPC6xx]>;
}
-foreach BR = 0-7 in {
- def : InstAlias<"mfbr"#BR#" $Rx",
- (MFDCR gprc:$Rx, !add(BR, 0x80))>,
- Requires<[IsPPC4xx]>;
- def : InstAlias<"mtbr"#BR#" $Rx",
- (MTDCR gprc:$Rx, !add(BR, 0x80))>,
- Requires<[IsPPC4xx]>;
-}
-
-def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>;
-
-def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
-
-def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mtppr $RT", (MTSPR 896, gprc:$RT)>;
+def : InstAlias<"mfppr $RT", (MFSPR gprc:$RT, 896)>;
def : InstAlias<"mtesr $Rx", (MTSPR 980, gprc:$Rx)>, Requires<[IsPPC4xx]>;
def : InstAlias<"mfesr $Rx", (MFSPR gprc:$Rx, 980)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfspefscr $Rx", (MFSPR gprc:$Rx, 512)>;
-def : InstAlias<"mtspefscr $Rx", (MTSPR 512, gprc:$Rx)>;
+def : InstAlias<"mtdear $Rx", (MTSPR 981, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdear $Rx", (MFSPR gprc:$Rx, 981)>, Requires<[IsPPC4xx]>;
def : InstAlias<"mttcr $Rx", (MTSPR 986, gprc:$Rx)>, Requires<[IsPPC4xx]>;
def : InstAlias<"mftcr $Rx", (MFSPR gprc:$Rx, 986)>, Requires<[IsPPC4xx]>;
-def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
-
-def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
- (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
-def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
- (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
-def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
- (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
-def SUBIC_rec : PPCAsmPseudo<"subic. $rA, $rB, $imm",
- (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
-
-def : InstAlias<"sub $rA, $rB, $rC", (SUBF8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
-def : InstAlias<"sub. $rA, $rB, $rC", (SUBF8_rec g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
-def : InstAlias<"subc $rA, $rB, $rC", (SUBFC8 g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
-def : InstAlias<"subc. $rA, $rB, $rC", (SUBFC8_rec g8rc:$rA, g8rc:$rC, g8rc:$rB)>;
-
-def : InstAlias<"mtmsrd $RS", (MTMSRD gprc:$RS, 0)>;
-def : InstAlias<"mtmsr $RS", (MTMSR gprc:$RS, 0)>;
-
-def : InstAlias<"mfasr $RT", (MFSPR gprc:$RT, 280)>;
-def : InstAlias<"mtasr $RT", (MTSPR 280, gprc:$RT)>;
+def : InstAlias<"mftbhi $Rx", (MFSPR gprc:$Rx, 988)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttbhi $Rx", (MTSPR 988, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-foreach SPRG = 0-3 in {
- def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 272))>;
- def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 272))>;
- def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
- def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 272), gprc:$RT)>;
-}
-foreach SPRG = 4-7 in {
- def : InstAlias<"mfsprg $RT, "#SPRG, (MFSPR gprc:$RT, !add(SPRG, 256))>,
- Requires<[IsBookE]>;
- def : InstAlias<"mfsprg"#SPRG#" $RT", (MFSPR gprc:$RT, !add(SPRG, 256))>,
- Requires<[IsBookE]>;
- def : InstAlias<"mtsprg "#SPRG#", $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
- Requires<[IsBookE]>;
- def : InstAlias<"mtsprg"#SPRG#" $RT", (MTSPR !add(SPRG, 256), gprc:$RT)>,
- Requires<[IsBookE]>;
-}
+def : InstAlias<"mftblo $Rx", (MFSPR gprc:$Rx, 989)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mttblo $Rx", (MTSPR 989, gprc:$Rx)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mtasr $RS", (MTSPR 280, gprc:$RS)>;
+def : InstAlias<"mtsrr2 $Rx", (MTSPR 990, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr2 $Rx", (MFSPR gprc:$Rx, 990)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfdec $RT", (MFSPR gprc:$RT, 22)>;
-def : InstAlias<"mtdec $RT", (MTSPR 22, gprc:$RT)>;
+def : InstAlias<"mtsrr3 $Rx", (MTSPR 991, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfsrr3 $Rx", (MFSPR gprc:$Rx, 991)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfpvr $RT", (MFSPR gprc:$RT, 287)>;
+def : InstAlias<"mtdccr $Rx", (MTSPR 1018, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mfdccr $Rx", (MFSPR gprc:$Rx, 1018)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfsdr1 $RT", (MFSPR gprc:$RT, 25)>;
-def : InstAlias<"mtsdr1 $RT", (MTSPR 25, gprc:$RT)>;
+def : InstAlias<"mticcr $Rx", (MTSPR 1019, gprc:$Rx)>, Requires<[IsPPC4xx]>;
+def : InstAlias<"mficcr $Rx", (MFSPR gprc:$Rx, 1019)>, Requires<[IsPPC4xx]>;
-def : InstAlias<"mfsrr0 $RT", (MFSPR gprc:$RT, 26)>;
-def : InstAlias<"mfsrr1 $RT", (MFSPR gprc:$RT, 27)>;
-def : InstAlias<"mtsrr0 $RT", (MTSPR 26, gprc:$RT)>;
-def : InstAlias<"mtsrr1 $RT", (MTSPR 27, gprc:$RT)>;
def : InstAlias<"tlbie $RB", (TLBIE R0, gprc:$RB)>;
@@ -4609,6 +4725,17 @@ def : InstAlias<"tlbwehi $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 0)>,
def : InstAlias<"tlbwelo $RS, $A", (TLBWE2 gprc:$RS, gprc:$A, 1)>,
Requires<[IsPPC4xx]>;
+def LAx : PPCAsmPseudo<"la $rA, $addr", (ins gprc:$rA, memri:$addr)>;
+
+def SUBI : PPCAsmPseudo<"subi $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIS : PPCAsmPseudo<"subis $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIC : PPCAsmPseudo<"subic $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+def SUBIC_rec : PPCAsmPseudo<"subic. $rA, $rB, $imm",
+ (ins gprc:$rA, gprc:$rB, s16imm:$imm)>;
+
def EXTLWI : PPCAsmPseudo<"extlwi $rA, $rS, $n, $b",
(ins gprc:$rA, gprc:$rS, u5imm:$n, u5imm:$b)>;
def EXTLWI_rec : PPCAsmPseudo<"extlwi. $rA, $rS, $n, $b",
@@ -4646,6 +4773,13 @@ def CLRLSLWI : PPCAsmPseudo<"clrlslwi $rA, $rS, $b, $n",
def CLRLSLWI_rec : PPCAsmPseudo<"clrlslwi. $rA, $rS, $b, $n",
(ins gprc:$rA, gprc:$rS, u5imm:$b, u5imm:$n)>;
+def : InstAlias<"isellt $rT, $rA, $rB",
+ (ISEL gprc:$rT, gprc_nor0:$rA, gprc:$rB, CR0LT)>;
+def : InstAlias<"iselgt $rT, $rA, $rB",
+ (ISEL gprc:$rT, gprc_nor0:$rA, gprc:$rB, CR0GT)>;
+def : InstAlias<"iseleq $rT, $rA, $rB",
+ (ISEL gprc:$rT, gprc_nor0:$rA, gprc:$rB, CR0EQ)>;
+
def : InstAlias<"rotlwi $rA, $rS, $n", (RLWINM gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
def : InstAlias<"rotlwi. $rA, $rS, $n", (RLWINM_rec gprc:$rA, gprc:$rS, u5imm:$n, 0, 31)>;
def : InstAlias<"rotlw $rA, $rS, $rB", (RLWNM gprc:$rA, gprc:$rS, gprc:$rB, 0, 31)>;
@@ -4694,6 +4828,8 @@ def CLRLSLDI_rec : PPCAsmPseudo<"clrlsldi. $rA, $rS, $b, $n",
def SUBPCIS : PPCAsmPseudo<"subpcis $RT, $D", (ins g8rc:$RT, s16imm:$D)>;
def : InstAlias<"rotldi $rA, $rS, $n", (RLDICL g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
+def : InstAlias<"rotldi $rA, $rS, $n",
+ (RLDICL_32_64 g8rc:$rA, gprc:$rS, u6imm:$n, 0)>;
def : InstAlias<"rotldi. $rA, $rS, $n", (RLDICL_rec g8rc:$rA, g8rc:$rS, u6imm:$n, 0)>;
def : InstAlias<"rotld $rA, $rS, $rB", (RLDCL g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
def : InstAlias<"rotld. $rA, $rS, $rB", (RLDCL_rec g8rc:$rA, g8rc:$rS, gprc:$rB, 0)>;
@@ -4892,6 +5028,8 @@ def : InstAlias<"cmp $bf, 1, $rA, $rB", (CMPD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
def : InstAlias<"cmpli $bf, 1, $rA, $imm", (CMPLDI crrc:$bf, g8rc:$rA, u16imm64:$imm)>;
def : InstAlias<"cmpl $bf, 1, $rA, $rB", (CMPLD crrc:$bf, g8rc:$rA, g8rc:$rB)>;
+def : InstAlias<"trap", (TW 31, R0, R0)>;
+
multiclass TrapExtendedMnemonic<string name, int to> {
def : InstAlias<"td"#name#"i $rA, $imm", (TDI to, g8rc:$rA, s16imm:$imm)>;
def : InstAlias<"td"#name#" $rA, $rB", (TD to, g8rc:$rA, g8rc:$rB)>;
@@ -5025,8 +5163,11 @@ def RotateInsertByte1 {
dag Left = (RLWIMI RotateInsertByte3.Left, Swap4.Bits, 8, 24, 31);
}
-def : Pat<(i32 (bitreverse i32:$A)),
- (RLDICL_32 RotateInsertByte1.Left, 0, 32)>;
+// Clear the upper half of the register when in 64-bit mode
+let Predicates = [In64BitMode] in
+def : Pat<(i32 (bitreverse i32:$A)), (RLDICL_32 RotateInsertByte1.Left, 0, 32)>;
+let Predicates = [In32BitMode] in
+def : Pat<(i32 (bitreverse i32:$A)), RotateInsertByte1.Left>;
// Fast 64-bit reverse bits algorithm:
// Step 1: 1-bit swap (swap odd 1-bit and even 1-bit):
diff --git a/llvm/lib/Target/PowerPC/PPCInstrPrefix.td b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
new file mode 100644
index 000000000000..2bab73418e10
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCInstrPrefix.td
@@ -0,0 +1,1035 @@
+//===----------------------------------------------------------------------===//
+// PowerPC ISA 3.1 specific type constraints.
+//
+
+def SDT_PPCSplat32 : SDTypeProfile<1, 3, [ SDTCisVT<0, v2i64>,
+ SDTCisVec<1>, SDTCisInt<2>, SDTCisInt<3>
+]>;
+
+//===----------------------------------------------------------------------===//
+// ISA 3.1 specific PPCISD nodes.
+//
+
+def PPCxxsplti32dx : SDNode<"PPCISD::XXSPLTI32DX", SDT_PPCSplat32, []>;
+
+//===----------------------------------------------------------------------===//
+
+// PC Relative flag (for instructions that use the address of the prefix for
+// address computations).
+class isPCRel { bit PCRel = 1; }
+
+// Top-level class for prefixed instructions.
+class PI<bits<6> pref, bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin> : Instruction {
+ field bits<64> Inst;
+ field bits<64> SoftFail = 0;
+ bit PCRel = 0; // Default value, set by isPCRel.
+ let Size = 8;
+
+ let Namespace = "PPC";
+ let OutOperandList = OOL;
+ let InOperandList = IOL;
+ let AsmString = asmstr;
+ let Itinerary = itin;
+ let Inst{0-5} = pref;
+ let Inst{32-37} = opcode;
+
+ bits<1> PPC970_First = 0;
+ bits<1> PPC970_Single = 0;
+ bits<1> PPC970_Cracked = 0;
+ bits<3> PPC970_Unit = 0;
+
+ /// These fields correspond to the fields in PPCInstrInfo.h. Any changes to
+ /// these must be reflected there! See comments there for what these are.
+ let TSFlags{0} = PPC970_First;
+ let TSFlags{1} = PPC970_Single;
+ let TSFlags{2} = PPC970_Cracked;
+ let TSFlags{5-3} = PPC970_Unit;
+
+ bits<1> Prefixed = 1; // This is a prefixed instruction.
+ let TSFlags{7} = Prefixed;
+
+ // For cases where multiple instruction definitions really represent the
+ // same underlying instruction but with one definition for 64-bit arguments
+ // and one for 32-bit arguments, this bit breaks the degeneracy between
+ // the two forms and allows TableGen to generate mapping tables.
+ bit Interpretation64Bit = 0;
+
+ // Fields used for relation models.
+ string BaseName = "";
+}
+
+class MLS_DForm_R_SI34_RTA5_MEM<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<5> FRS;
+ bits<39> D_RA;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 2;
+ let Inst{8-10} = 0;
+ let Inst{11} = PCRel;
+ let Inst{12-13} = 0;
+ let Inst{14-31} = D_RA{33-16}; // d0
+
+ // The instruction.
+ let Inst{38-42} = FRS{4-0};
+ let Inst{43-47} = D_RA{38-34}; // RA
+ let Inst{48-63} = D_RA{15-0}; // d1
+}
+
+class MLS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<5> RA;
+ bits<34> SI;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 2;
+ let Inst{8-10} = 0;
+ let Inst{11} = PCRel;
+ let Inst{12-13} = 0;
+ let Inst{14-31} = SI{33-16};
+
+ // The instruction.
+ let Inst{38-42} = RT;
+ let Inst{43-47} = RA;
+ let Inst{48-63} = SI{15-0};
+}
+
+class MLS_DForm_SI34_RT5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<34> SI;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 2;
+ let Inst{8-10} = 0;
+ let Inst{11} = 0;
+ let Inst{12-13} = 0;
+ let Inst{14-31} = SI{33-16};
+
+ // The instruction.
+ let Inst{38-42} = RT;
+ let Inst{43-47} = 0;
+ let Inst{48-63} = SI{15-0};
+}
+
+multiclass MLS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
+ dag PCRel_IOL, string asmstr,
+ InstrItinClass itin> {
+ def NAME : MLS_DForm_R_SI34_RTA5<opcode, OOL, IOL,
+ !strconcat(asmstr, ", 0"), itin, []>;
+ def pc : MLS_DForm_R_SI34_RTA5<opcode, OOL, PCRel_IOL,
+ !strconcat(asmstr, ", 1"), itin, []>, isPCRel;
+}
+
+class 8LS_DForm_R_SI34_RTA5<bits<6> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<5> RT;
+ bits<39> D_RA;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-10} = 0;
+ let Inst{11} = PCRel;
+ let Inst{12-13} = 0;
+ let Inst{14-31} = D_RA{33-16}; // d0
+
+ // The instruction.
+ let Inst{38-42} = RT{4-0};
+ let Inst{43-47} = D_RA{38-34}; // RA
+ let Inst{48-63} = D_RA{15-0}; // d1
+}
+
+// 8LS:D-Form: [ 1 0 0 // R // d0
+// PO TX T RA d1 ]
+class 8LS_DForm_R_SI34_XT6_RA5<bits<5> opcode, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, { opcode, ? }, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<39> D_RA;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 0;
+ let Inst{8} = 0;
+ let Inst{9-10} = 0; // reserved
+ let Inst{11} = PCRel;
+ let Inst{12-13} = 0; // reserved
+ let Inst{14-31} = D_RA{33-16}; // d0
+
+ // The instruction.
+ let Inst{37} = XT{5};
+ let Inst{38-42} = XT{4-0};
+ let Inst{43-47} = D_RA{38-34}; // RA
+ let Inst{48-63} = D_RA{15-0}; // d1
+}
+
+// X-Form: [PO T IMM VRB XO TX]
+class XForm_XT6_IMM5_VB5<bits<6> opcode, bits<10> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<5> VRB;
+ bits<5> IMM;
+
+ let Pattern = pattern;
+ let Inst{6-10} = XT{4-0};
+ let Inst{11-15} = IMM;
+ let Inst{16-20} = VRB;
+ let Inst{21-30} = xo;
+ let Inst{31} = XT{5};
+}
+
+class 8RR_XX4Form_IMM8_XTAB6<bits<6> opcode, bits<2> xo,
+ dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<6> XC;
+ bits<8> IMM;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 1;
+ let Inst{8} = 0;
+ let Inst{9-11} = 0;
+ let Inst{12-13} = 0;
+ let Inst{14-23} = 0;
+ let Inst{24-31} = IMM;
+
+ // The instruction.
+ let Inst{38-42} = XT{4-0};
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-57} = XC{4-0};
+ let Inst{58-59} = xo;
+ let Inst{60} = XC{5};
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = XT{5};
+}
+
+class VXForm_RD5_N3_VB5<bits<11> xo, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> RD;
+ bits<5> VB;
+ bits<3> N;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = RD;
+ let Inst{11-12} = 0;
+ let Inst{13-15} = N;
+ let Inst{16-20} = VB;
+ let Inst{21-31} = xo;
+}
+
+
+// VX-Form: [PO VRT / UIM RB XO].
+// We use VXForm_1 to implement it; that is, we use "VRA" (5 bits) to represent
+// "/ UIM" (an unused bit followed by a 4-bit immediate).
+// Destructive (insert) forms are suffixed with _ins.
+class VXForm_VRT5_UIM5_RB5_ins<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, u4imm:$UIM, g8rc:$rB),
+ !strconcat(opc, " $vD, $rB, $UIM"), IIC_VecGeneral, pattern>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+
+// VX-Form: [PO VRT RA VRB XO].
+// Destructive (insert) forms are suffixed with _ins.
+class VXForm_VTB5_RA5_ins<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, vrrc:$vB),
+ !strconcat(opc, " $vD, $rA, $vB"), IIC_VecGeneral, pattern>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+
+// VX-Form: [PO VRT RA RB XO].
+// Destructive (insert) forms are suffixed with _ins.
+class VXForm_VRT5_RAB5_ins<bits<11> xo, string opc, list<dag> pattern>
+ : VXForm_1<xo, (outs vrrc:$vD), (ins vrrc:$vDi, g8rc:$rA, g8rc:$rB),
+ !strconcat(opc, " $vD, $rA, $rB"), IIC_VecGeneral, pattern>,
+ RegConstraint<"$vDi = $vD">, NoEncode<"$vDi">;
+
+// VN-Form: [PO VRT VRA VRB PS SD XO]
+// SD is "Shift Direction"
+class VNForm_VTAB5_SD3<bits<6> xo, bits<2> ps, dag OOL, dag IOL, string asmstr,
+ InstrItinClass itin, list<dag> pattern>
+ : I<4, OOL, IOL, asmstr, itin> {
+ bits<5> VRT;
+ bits<5> VRA;
+ bits<5> VRB;
+ bits<3> SD;
+
+ let Pattern = pattern;
+
+ let Inst{6-10} = VRT;
+ let Inst{11-15} = VRA;
+ let Inst{16-20} = VRB;
+ let Inst{21-22} = ps;
+ let Inst{23-25} = SD;
+ let Inst{26-31} = xo;
+}
+
+// 8RR:D-Form: [ 1 1 0 // // imm0
+// PO T XO TX imm1 ].
+class 8RR_DForm_IMM32_XT6<bits<6> opcode, bits<4> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<32> IMM32;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 1;
+ let Inst{8-11} = 0;
+ let Inst{12-13} = 0; // reserved
+ let Inst{14-15} = 0; // reserved
+ let Inst{16-31} = IMM32{31-16};
+
+ // The instruction.
+ let Inst{38-42} = XT{4-0};
+ let Inst{43-46} = xo;
+ let Inst{47} = XT{5};
+ let Inst{48-63} = IMM32{15-0};
+}
+
+// 8RR:D-Form: [ 1 1 0 // // imm0
+// PO T XO IX TX imm1 ].
+class 8RR_DForm_IMM32_XT6_IX<bits<6> opcode, bits<3> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bit IX;
+ bits<32> IMM32;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 1;
+ let Inst{8-11} = 0;
+ let Inst{12-13} = 0; // reserved
+ let Inst{14-15} = 0; // reserved
+ let Inst{16-31} = IMM32{31-16};
+
+ // The instruction.
+ let Inst{38-42} = XT{4-0};
+ let Inst{43-45} = xo;
+ let Inst{46} = IX;
+ let Inst{47} = XT{5};
+ let Inst{48-63} = IMM32{15-0};
+}
+
+class 8RR_XX4Form_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin, list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<6> XC;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 1;
+ let Inst{8-11} = 0;
+ let Inst{12-13} = 0;
+ let Inst{14-31} = 0;
+
+ // The instruction.
+ let Inst{38-42} = XT{4-0};
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-57} = XC{4-0};
+ let Inst{58-59} = xo;
+ let Inst{60} = XC{5};
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = XT{5};
+}
+
+class 8RR_XX4Form_IMM3_XTABC6<bits<6> opcode, bits<2> xo, dag OOL, dag IOL,
+ string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : PI<1, opcode, OOL, IOL, asmstr, itin> {
+ bits<6> XT;
+ bits<6> XA;
+ bits<6> XB;
+ bits<6> XC;
+ bits<3> IMM;
+
+ let Pattern = pattern;
+
+ // The prefix.
+ let Inst{6-7} = 1;
+ let Inst{8-11} = 0;
+ let Inst{12-13} = 0;
+ let Inst{14-28} = 0;
+ let Inst{29-31} = IMM;
+
+ // The instruction.
+ let Inst{38-42} = XT{4-0};
+ let Inst{43-47} = XA{4-0};
+ let Inst{48-52} = XB{4-0};
+ let Inst{53-57} = XC{4-0};
+ let Inst{58-59} = xo;
+ let Inst{60} = XC{5};
+ let Inst{61} = XA{5};
+ let Inst{62} = XB{5};
+ let Inst{63} = XT{5};
+}
+
+// [PO BF / XO2 B XO BX /]
+class XX2_BF3_XO5_XB6_XO9<bits<6> opcode, bits<5> xo2, bits<9> xo, dag OOL,
+ dag IOL, string asmstr, InstrItinClass itin,
+ list<dag> pattern>
+ : I<opcode, OOL, IOL, asmstr, itin> {
+ bits<3> BF;
+ bits<6> XB;
+
+ let Pattern = pattern;
+
+ let Inst{6-8} = BF;
+ let Inst{9-10} = 0;
+ let Inst{11-15} = xo2;
+ let Inst{16-20} = XB{4-0};
+ let Inst{21-29} = xo;
+ let Inst{30} = XB{5};
+ let Inst{31} = 0;
+}
+
+multiclass MLS_DForm_R_SI34_RTA5_MEM_p<bits<6> opcode, dag OOL, dag IOL,
+ dag PCRel_IOL, string asmstr,
+ InstrItinClass itin> {
+ def NAME : MLS_DForm_R_SI34_RTA5_MEM<opcode, OOL, IOL,
+ !strconcat(asmstr, ", 0"), itin, []>;
+ def pc : MLS_DForm_R_SI34_RTA5_MEM<opcode, OOL, PCRel_IOL,
+ !strconcat(asmstr, ", 1"), itin, []>,
+ isPCRel;
+}
+
+multiclass 8LS_DForm_R_SI34_RTA5_p<bits<6> opcode, dag OOL, dag IOL,
+ dag PCRel_IOL, string asmstr,
+ InstrItinClass itin> {
+ def NAME : 8LS_DForm_R_SI34_RTA5<opcode, OOL, IOL,
+ !strconcat(asmstr, ", 0"), itin, []>;
+ def pc : 8LS_DForm_R_SI34_RTA5<opcode, OOL, PCRel_IOL,
+ !strconcat(asmstr, ", 1"), itin, []>, isPCRel;
+}
+
+multiclass 8LS_DForm_R_SI34_XT6_RA5_p<bits<5> opcode, dag OOL, dag IOL,
+ dag PCRel_IOL, string asmstr,
+ InstrItinClass itin> {
+ def NAME : 8LS_DForm_R_SI34_XT6_RA5<opcode, OOL, IOL,
+ !strconcat(asmstr, ", 0"), itin, []>;
+ def pc : 8LS_DForm_R_SI34_XT6_RA5<opcode, OOL, PCRel_IOL,
+ !strconcat(asmstr, ", 1"), itin, []>,
+ isPCRel;
+}
+
+def PrefixInstrs : Predicate<"Subtarget->hasPrefixInstrs()">;
+def IsISA3_1 : Predicate<"Subtarget->isISA3_1()">;
+
+let Predicates = [PrefixInstrs] in {
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+ defm PADDI8 :
+ MLS_DForm_R_SI34_RTA5_p<14, (outs g8rc:$RT), (ins g8rc:$RA, s34imm:$SI),
+ (ins immZero:$RA, s34imm:$SI),
+ "paddi $RT, $RA, $SI", IIC_LdStLFD>;
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ def PLI8 : MLS_DForm_SI34_RT5<14, (outs g8rc:$RT),
+ (ins s34imm:$SI),
+ "pli $RT, $SI", IIC_IntSimple, []>;
+ }
+ }
+ defm PADDI :
+ MLS_DForm_R_SI34_RTA5_p<14, (outs gprc:$RT), (ins gprc:$RA, s34imm:$SI),
+ (ins immZero:$RA, s34imm:$SI),
+ "paddi $RT, $RA, $SI", IIC_LdStLFD>;
+ let isReMaterializable = 1, isAsCheapAsAMove = 1, isMoveImm = 1 in {
+ def PLI : MLS_DForm_SI34_RT5<14, (outs gprc:$RT),
+ (ins s34imm:$SI),
+ "pli $RT, $SI", IIC_IntSimple, []>;
+ }
+
+ let mayLoad = 1, mayStore = 0 in {
+ defm PLXV :
+ 8LS_DForm_R_SI34_XT6_RA5_p<25, (outs vsrc:$XT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plxv $XT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLFS :
+ MLS_DForm_R_SI34_RTA5_MEM_p<48, (outs f4rc:$FRT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plfs $FRT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLFD :
+ MLS_DForm_R_SI34_RTA5_MEM_p<50, (outs f8rc:$FRT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plfd $FRT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLXSSP :
+ 8LS_DForm_R_SI34_RTA5_p<43, (outs vfrc:$VRT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plxssp $VRT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLXSD :
+ 8LS_DForm_R_SI34_RTA5_p<42, (outs vfrc:$VRT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plxsd $VRT, $D_RA",
+ IIC_LdStLFD>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+ defm PLBZ8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs g8rc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plbz $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLHZ8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<40, (outs g8rc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plhz $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLHA8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<42, (outs g8rc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plha $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLWA8 :
+ 8LS_DForm_R_SI34_RTA5_p<41, (outs g8rc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plwa $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLWZ8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<32, (outs g8rc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plwz $RT, $D_RA",
+ IIC_LdStLFD>;
+ }
+ defm PLBZ :
+ MLS_DForm_R_SI34_RTA5_MEM_p<34, (outs gprc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plbz $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLHZ :
+ MLS_DForm_R_SI34_RTA5_MEM_p<40, (outs gprc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plhz $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLHA :
+ MLS_DForm_R_SI34_RTA5_MEM_p<42, (outs gprc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plha $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLWZ :
+ MLS_DForm_R_SI34_RTA5_MEM_p<32, (outs gprc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plwz $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLWA :
+ 8LS_DForm_R_SI34_RTA5_p<41, (outs gprc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "plwa $RT, $D_RA",
+ IIC_LdStLFD>;
+ defm PLD :
+ 8LS_DForm_R_SI34_RTA5_p<57, (outs g8rc:$RT), (ins memri34:$D_RA),
+ (ins memri34_pcrel:$D_RA), "pld $RT, $D_RA",
+ IIC_LdStLFD>;
+ }
+
+ let mayStore = 1, mayLoad = 0 in {
+ defm PSTXV :
+ 8LS_DForm_R_SI34_XT6_RA5_p<27, (outs), (ins vsrc:$XS, memri34:$D_RA),
+ (ins vsrc:$XS, memri34_pcrel:$D_RA),
+ "pstxv $XS, $D_RA", IIC_LdStLFD>;
+ defm PSTFS :
+ MLS_DForm_R_SI34_RTA5_MEM_p<52, (outs), (ins f4rc:$FRS, memri34:$D_RA),
+ (ins f4rc:$FRS, memri34_pcrel:$D_RA),
+ "pstfs $FRS, $D_RA", IIC_LdStLFD>;
+ defm PSTFD :
+ MLS_DForm_R_SI34_RTA5_MEM_p<54, (outs), (ins f8rc:$FRS, memri34:$D_RA),
+ (ins f8rc:$FRS, memri34_pcrel:$D_RA),
+ "pstfd $FRS, $D_RA", IIC_LdStLFD>;
+ defm PSTXSSP :
+ 8LS_DForm_R_SI34_RTA5_p<47, (outs), (ins vfrc:$VRS, memri34:$D_RA),
+ (ins vfrc:$VRS, memri34_pcrel:$D_RA),
+ "pstxssp $VRS, $D_RA", IIC_LdStLFD>;
+ defm PSTXSD :
+ 8LS_DForm_R_SI34_RTA5_p<46, (outs), (ins vfrc:$VRS, memri34:$D_RA),
+ (ins vfrc:$VRS, memri34_pcrel:$D_RA),
+ "pstxsd $VRS, $D_RA", IIC_LdStLFD>;
+ let Interpretation64Bit = 1, isCodeGenOnly = 1 in {
+ defm PSTB8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins g8rc:$RS, memri34:$D_RA),
+ (ins g8rc:$RS, memri34_pcrel:$D_RA),
+ "pstb $RS, $D_RA", IIC_LdStLFD>;
+ defm PSTH8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<44, (outs), (ins g8rc:$RS, memri34:$D_RA),
+ (ins g8rc:$RS, memri34_pcrel:$D_RA),
+ "psth $RS, $D_RA", IIC_LdStLFD>;
+ defm PSTW8 :
+ MLS_DForm_R_SI34_RTA5_MEM_p<36, (outs), (ins g8rc:$RS, memri34:$D_RA),
+ (ins g8rc:$RS, memri34_pcrel:$D_RA),
+ "pstw $RS, $D_RA", IIC_LdStLFD>;
+ }
+ defm PSTB :
+ MLS_DForm_R_SI34_RTA5_MEM_p<38, (outs), (ins gprc:$RS, memri34:$D_RA),
+ (ins gprc:$RS, memri34_pcrel:$D_RA),
+ "pstb $RS, $D_RA", IIC_LdStLFD>;
+ defm PSTH :
+ MLS_DForm_R_SI34_RTA5_MEM_p<44, (outs), (ins gprc:$RS, memri34:$D_RA),
+ (ins gprc:$RS, memri34_pcrel:$D_RA),
+ "psth $RS, $D_RA", IIC_LdStLFD>;
+ defm PSTW :
+ MLS_DForm_R_SI34_RTA5_MEM_p<36, (outs), (ins gprc:$RS, memri34:$D_RA),
+ (ins gprc:$RS, memri34_pcrel:$D_RA),
+ "pstw $RS, $D_RA", IIC_LdStLFD>;
+ defm PSTD :
+ 8LS_DForm_R_SI34_RTA5_p<61, (outs), (ins g8rc:$RS, memri34:$D_RA),
+ (ins g8rc:$RS, memri34_pcrel:$D_RA),
+ "pstd $RS, $D_RA", IIC_LdStLFD>;
+ }
+}
+
+// TODO: We have an added complexity of 500 here. This is only a temporary
+// solution to have tablegen consider these patterns first. Addressing for
+// PowerPC is complex: depending on what is available, we use D-form, X-form,
+// or aligned D-form (DS and DQ form) loads/stores. The prefixed instructions
+// in this file additionally introduce PC-relative loads/stores and D-form
+// loads/stores with 34-bit immediates. It is very difficult to force
+// instruction selection to consistently pick these first without the added
+// complexity. Once the PC-relative implementation is complete, a set of
+// follow-up patches will address this refactoring and the AddedComplexity
+// will be removed.
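+// As an illustration (the actual patterns follow in the block below), the
+// bump means that a pattern such as
+//   let AddedComplexity = 500 in
+//   def : Pat<(i32 (load (PPCmatpcreladdr pcreladdr:$ga))), (PLWZpc $ga, 0)>;
+// outranks the ordinary D-form/X-form load patterns for the same load, so the
+// selector tries the PC-relative form first.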
+let Predicates = [PCRelativeMemops], AddedComplexity = 500 in {
+ // Load i32
+ def : Pat<(i32 (zextloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZpc $ga, 0)>;
+ def : Pat<(i32 (extloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZpc $ga, 0)>;
+ def : Pat<(i32 (sextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLHApc $ga, 0)>;
+ def : Pat<(i32 (zextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLHZpc $ga, 0)>;
+ def : Pat<(i32 (extloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLHZpc $ga, 0)>;
+ def : Pat<(i32 (load (PPCmatpcreladdr pcreladdr:$ga))), (PLWZpc $ga, 0)>;
+
+ // Store i32
+ def : Pat<(truncstorei8 i32:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTBpc $RS, $ga, 0)>;
+ def : Pat<(truncstorei16 i32:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTHpc $RS, $ga, 0)>;
+ def : Pat<(store i32:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTWpc $RS, $ga, 0)>;
+
+ // Load i64
+ def : Pat<(i64 (zextloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZ8pc $ga, 0)>;
+ def : Pat<(i64 (extloadi8 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLBZ8pc $ga, 0)>;
+ def : Pat<(i64 (sextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLHA8pc $ga, 0)>;
+ def : Pat<(i64 (zextloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLHZ8pc $ga, 0)>;
+ def : Pat<(i64 (extloadi16 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLHZ8pc $ga, 0)>;
+ def : Pat<(i64 (zextloadi32 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLWZ8pc $ga, 0)>;
+ def : Pat<(i64 (sextloadi32 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLWA8pc $ga, 0)>;
+ def : Pat<(i64 (extloadi32 (PPCmatpcreladdr pcreladdr:$ga))),
+ (PLWZ8pc $ga, 0)>;
+ def : Pat<(i64 (load (PPCmatpcreladdr pcreladdr:$ga))), (PLDpc $ga, 0)>;
+
+ // Store i64
+ def : Pat<(truncstorei8 i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTB8pc $RS, $ga, 0)>;
+ def : Pat<(truncstorei16 i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTH8pc $RS, $ga, 0)>;
+ def : Pat<(truncstorei32 i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTW8pc $RS, $ga, 0)>;
+ def : Pat<(store i64:$RS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTDpc $RS, $ga, 0)>;
+
+ // Load f32
+ def : Pat<(f32 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLFSpc $addr, 0)>;
+
+ // Store f32
+ def : Pat<(store f32:$FRS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTFSpc $FRS, $ga, 0)>;
+
+ // Load f64
+ def : Pat<(f64 (extloadf32 (PPCmatpcreladdr pcreladdr:$addr))),
+ (COPY_TO_REGCLASS (PLFSpc $addr, 0), VSFRC)>;
+ def : Pat<(f64 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLFDpc $addr, 0)>;
+
+ // Store f64
+ def : Pat<(store f64:$FRS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTFDpc $FRS, $ga, 0)>;
+
+ // Load f128
+ def : Pat<(f128 (load (PPCmatpcreladdr pcreladdr:$addr))),
+ (COPY_TO_REGCLASS (PLXVpc $addr, 0), VRRC)>;
+
+ // Store f128
+ def : Pat<(store f128:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTXVpc (COPY_TO_REGCLASS $XS, VSRC), $ga, 0)>;
+
+ // Load v4i32
+ def : Pat<(v4i32 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+
+ // Store v4i32
+ def : Pat<(store v4i32:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTXVpc $XS, $ga, 0)>;
+
+ // Load v2i64
+ def : Pat<(v2i64 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+
+ // Store v2i64
+ def : Pat<(store v2i64:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTXVpc $XS, $ga, 0)>;
+
+ // Load v4f32
+ def : Pat<(v4f32 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+
+ // Store v4f32
+ def : Pat<(store v4f32:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTXVpc $XS, $ga, 0)>;
+
+ // Load v2f64
+ def : Pat<(v2f64 (load (PPCmatpcreladdr pcreladdr:$addr))), (PLXVpc $addr, 0)>;
+
+ // Store v2f64
+ def : Pat<(store v2f64:$XS, (PPCmatpcreladdr pcreladdr:$ga)),
+ (PSTXVpc $XS, $ga, 0)>;
+
+ // Atomic Load
+ def : Pat<(atomic_load_8 (PPCmatpcreladdr pcreladdr:$ga)),
+ (PLBZpc $ga, 0)>;
+ def : Pat<(atomic_load_16 (PPCmatpcreladdr pcreladdr:$ga)),
+ (PLHZpc $ga, 0)>;
+ def : Pat<(atomic_load_32 (PPCmatpcreladdr pcreladdr:$ga)),
+ (PLWZpc $ga, 0)>;
+ def : Pat<(atomic_load_64 (PPCmatpcreladdr pcreladdr:$ga)),
+ (PLDpc $ga, 0)>;
+
+ // Atomic Store
+ def : Pat<(atomic_store_8 (PPCmatpcreladdr pcreladdr:$ga), i32:$RS),
+ (PSTBpc $RS, $ga, 0)>;
+ def : Pat<(atomic_store_16 (PPCmatpcreladdr pcreladdr:$ga), i32:$RS),
+ (PSTHpc $RS, $ga, 0)>;
+ def : Pat<(atomic_store_32 (PPCmatpcreladdr pcreladdr:$ga), i32:$RS),
+ (PSTWpc $RS, $ga, 0)>;
+ def : Pat<(atomic_store_8 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ (PSTB8pc $RS, $ga, 0)>;
+ def : Pat<(atomic_store_16 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ (PSTH8pc $RS, $ga, 0)>;
+ def : Pat<(atomic_store_32 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ (PSTW8pc $RS, $ga, 0)>;
+ def : Pat<(atomic_store_64 (PPCmatpcreladdr pcreladdr:$ga), i64:$RS),
+ (PSTDpc $RS, $ga, 0)>;
+
+ // Special Cases For PPCstore_scal_int_from_vsr
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)),
+ (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PSTXSDpc (XSCVDPSXDS f64:$src), $dst, 0)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)),
+ (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC), $dst, 0)>;
+
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)),
+ (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PSTXSDpc (XSCVDPUXDS f64:$src), $dst, 0)>;
+ def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)),
+ (PPCmatpcreladdr pcreladdr:$dst), 8),
+ (PSTXSDpc (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC), $dst, 0)>;
+
+ // If the PPCmatpcreladdr node is not caught by any other pattern it should be
+ // caught here and turned into a paddi instruction to materialize the address.
+ def : Pat<(PPCmatpcreladdr pcreladdr:$addr), (PADDI8pc 0, $addr)>;
+}
+
+let Predicates = [PrefixInstrs] in {
+ def XXSPLTIW : 8RR_DForm_IMM32_XT6<32, 3, (outs vsrc:$XT),
+ (ins i32imm:$IMM32),
+ "xxspltiw $XT, $IMM32", IIC_VecGeneral,
+ []>;
+ def XXSPLTIDP : 8RR_DForm_IMM32_XT6<32, 2, (outs vsrc:$XT),
+ (ins i32imm:$IMM32),
+ "xxspltidp $XT, $IMM32", IIC_VecGeneral,
+ [(set v2f64:$XT,
+ (PPCxxspltidp i32:$IMM32))]>;
+ def XXSPLTI32DX :
+ 8RR_DForm_IMM32_XT6_IX<32, 0, (outs vsrc:$XT),
+ (ins vsrc:$XTi, u1imm:$IX, i32imm:$IMM32),
+ "xxsplti32dx $XT, $IX, $IMM32", IIC_VecGeneral,
+ [(set v2i64:$XT,
+ (PPCxxsplti32dx v2i64:$XTi, i32:$IX,
+ i32:$IMM32))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+ def XXPERMX :
+ 8RR_XX4Form_IMM3_XTABC6<34, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
+ vsrc:$XC, u3imm:$UIM),
+ "xxpermx $XT, $XA, $XB, $XC, $UIM",
+ IIC_VecPerm, []>;
+ def XXBLENDVB :
+ 8RR_XX4Form_XTABC6<33, 0, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
+ vsrc:$XC), "xxblendvb $XT, $XA, $XB, $XC",
+ IIC_VecGeneral, []>;
+ def XXBLENDVH :
+ 8RR_XX4Form_XTABC6<33, 1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
+ vsrc:$XC), "xxblendvh $XT, $XA, $XB, $XC",
+ IIC_VecGeneral, []>;
+ def XXBLENDVW :
+ 8RR_XX4Form_XTABC6<33, 2, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
+ vsrc:$XC), "xxblendvw $XT, $XA, $XB, $XC",
+ IIC_VecGeneral, []>;
+ def XXBLENDVD :
+ 8RR_XX4Form_XTABC6<33, 3, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
+ vsrc:$XC), "xxblendvd $XT, $XA, $XB, $XC",
+ IIC_VecGeneral, []>;
+}
+
+let Predicates = [IsISA3_1] in {
+ def VSLDBI : VNForm_VTAB5_SD3<22, 0, (outs vrrc:$VRT),
+ (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH),
+ "vsldbi $VRT, $VRA, $VRB, $SH",
+ IIC_VecGeneral,
+ [(set v16i8:$VRT,
+ (int_ppc_altivec_vsldbi v16i8:$VRA,
+ v16i8:$VRB,
+ i32:$SH))]>;
+ def VSRDBI : VNForm_VTAB5_SD3<22, 1, (outs vrrc:$VRT),
+ (ins vrrc:$VRA, vrrc:$VRB, u3imm:$SH),
+ "vsrdbi $VRT, $VRA, $VRB, $SH",
+ IIC_VecGeneral,
+ [(set v16i8:$VRT,
+ (int_ppc_altivec_vsrdbi v16i8:$VRA,
+ v16i8:$VRB,
+ i32:$SH))]>;
+ def VINSW :
+ VXForm_VRT5_UIM5_RB5_ins<207, "vinsw",
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vinsw v4i32:$vDi, i64:$rB,
+ timm:$UIM))]>;
+ def VINSD :
+ VXForm_VRT5_UIM5_RB5_ins<463, "vinsd",
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vinsd v2i64:$vDi, i64:$rB,
+ timm:$UIM))]>;
+ def VINSBVLX :
+ VXForm_VTB5_RA5_ins<15, "vinsbvlx",
+ [(set v16i8:$vD,
+ (int_ppc_altivec_vinsbvlx v16i8:$vDi, i64:$rA,
+ v16i8:$vB))]>;
+ def VINSBVRX :
+ VXForm_VTB5_RA5_ins<271, "vinsbvrx",
+ [(set v16i8:$vD,
+ (int_ppc_altivec_vinsbvrx v16i8:$vDi, i64:$rA,
+ v16i8:$vB))]>;
+ def VINSHVLX :
+ VXForm_VTB5_RA5_ins<79, "vinshvlx",
+ [(set v8i16:$vD,
+ (int_ppc_altivec_vinshvlx v8i16:$vDi, i64:$rA,
+ v8i16:$vB))]>;
+ def VINSHVRX :
+ VXForm_VTB5_RA5_ins<335, "vinshvrx",
+ [(set v8i16:$vD,
+ (int_ppc_altivec_vinshvrx v8i16:$vDi, i64:$rA,
+ v8i16:$vB))]>;
+ def VINSWVLX :
+ VXForm_VTB5_RA5_ins<143, "vinswvlx",
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vinswvlx v4i32:$vDi, i64:$rA,
+ v4i32:$vB))]>;
+ def VINSWVRX :
+ VXForm_VTB5_RA5_ins<399, "vinswvrx",
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vinswvrx v4i32:$vDi, i64:$rA,
+ v4i32:$vB))]>;
+ def VINSBLX :
+ VXForm_VRT5_RAB5_ins<527, "vinsblx",
+ [(set v16i8:$vD,
+ (int_ppc_altivec_vinsblx v16i8:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSBRX :
+ VXForm_VRT5_RAB5_ins<783, "vinsbrx",
+ [(set v16i8:$vD,
+ (int_ppc_altivec_vinsbrx v16i8:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSHLX :
+ VXForm_VRT5_RAB5_ins<591, "vinshlx",
+ [(set v8i16:$vD,
+ (int_ppc_altivec_vinshlx v8i16:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSHRX :
+ VXForm_VRT5_RAB5_ins<847, "vinshrx",
+ [(set v8i16:$vD,
+ (int_ppc_altivec_vinshrx v8i16:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSWLX :
+ VXForm_VRT5_RAB5_ins<655, "vinswlx",
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vinswlx v4i32:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSWRX :
+ VXForm_VRT5_RAB5_ins<911, "vinswrx",
+ [(set v4i32:$vD,
+ (int_ppc_altivec_vinswrx v4i32:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSDLX :
+ VXForm_VRT5_RAB5_ins<719, "vinsdlx",
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vinsdlx v2i64:$vDi, i64:$rA,
+ i64:$rB))]>;
+ def VINSDRX :
+ VXForm_VRT5_RAB5_ins<975, "vinsdrx",
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vinsdrx v2i64:$vDi, i64:$rA,
+ i64:$rB))]>;
+
+ def VPDEPD : VXForm_1<1485, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vpdepd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vpdepd v2i64:$vA, v2i64:$vB))]>;
+ def VPEXTD : VXForm_1<1421, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vpextd $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vpextd v2i64:$vA, v2i64:$vB))]>;
+ def PDEPD : XForm_6<31, 156, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "pdepd $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA, (int_ppc_pdepd i64:$rS, i64:$rB))]>;
+ def PEXTD : XForm_6<31, 188, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "pextd $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA, (int_ppc_pextd i64:$rS, i64:$rB))]>;
+ def VCFUGED : VXForm_1<1357, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vcfuged $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vcfuged v2i64:$vA, v2i64:$vB))]>;
+ def VGNB : VXForm_RD5_N3_VB5<1228, (outs g8rc:$rD), (ins vrrc:$vB, u3imm:$N),
+ "vgnb $rD, $vB, $N", IIC_VecGeneral,
+ [(set i64:$rD,
+ (int_ppc_altivec_vgnb v1i128:$vB, timm:$N))]>;
+ def CFUGED : XForm_6<31, 220, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "cfuged $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA, (int_ppc_cfuged i64:$rS, i64:$rB))]>;
+ def XXEVAL :
+ 8RR_XX4Form_IMM8_XTAB6<34, 1, (outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB,
+ vsrc:$XC, u8imm:$IMM),
+ "xxeval $XT, $XA, $XB, $XC, $IMM", IIC_VecGeneral,
+ [(set v2i64:$XT, (int_ppc_vsx_xxeval v2i64:$XA,
+ v2i64:$XB, v2i64:$XC, timm:$IMM))]>;
+ def VCLZDM : VXForm_1<1924, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vclzdm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vclzdm v2i64:$vA, v2i64:$vB))]>;
+ def VCTZDM : VXForm_1<1988, (outs vrrc:$vD), (ins vrrc:$vA, vrrc:$vB),
+ "vctzdm $vD, $vA, $vB", IIC_VecGeneral,
+ [(set v2i64:$vD,
+ (int_ppc_altivec_vctzdm v2i64:$vA, v2i64:$vB))]>;
+ def CNTLZDM : XForm_6<31, 59, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "cntlzdm $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA,
+ (int_ppc_cntlzdm i64:$rS, i64:$rB))]>;
+ def CNTTZDM : XForm_6<31, 571, (outs g8rc:$rA), (ins g8rc:$rS, g8rc:$rB),
+ "cnttzdm $rA, $rS, $rB", IIC_IntGeneral,
+ [(set i64:$rA,
+ (int_ppc_cnttzdm i64:$rS, i64:$rB))]>;
+ def XXGENPCVBM :
+ XForm_XT6_IMM5_VB5<60, 916, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM),
+ "xxgenpcvbm $XT, $VRB, $IMM", IIC_VecGeneral, []>;
+ def XXGENPCVHM :
+ XForm_XT6_IMM5_VB5<60, 917, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM),
+ "xxgenpcvhm $XT, $VRB, $IMM", IIC_VecGeneral, []>;
+ def XXGENPCVWM :
+ XForm_XT6_IMM5_VB5<60, 948, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM),
+ "xxgenpcvwm $XT, $VRB, $IMM", IIC_VecGeneral, []>;
+ def XXGENPCVDM :
+ XForm_XT6_IMM5_VB5<60, 949, (outs vsrc:$XT), (ins vrrc:$VRB, s5imm:$IMM),
+ "xxgenpcvdm $XT, $VRB, $IMM", IIC_VecGeneral, []>;
+ def VCLRLB : VXForm_1<397, (outs vrrc:$vD), (ins vrrc:$vA, gprc:$rB),
+ "vclrlb $vD, $vA, $rB", IIC_VecGeneral,
+ [(set v16i8:$vD,
+ (int_ppc_altivec_vclrlb v16i8:$vA, i32:$rB))]>;
+ def VCLRRB : VXForm_1<461, (outs vrrc:$vD), (ins vrrc:$vA, gprc:$rB),
+ "vclrrb $vD, $vA, $rB", IIC_VecGeneral,
+ [(set v16i8:$vD,
+ (int_ppc_altivec_vclrrb v16i8:$vA, i32:$rB))]>;
+
+ def XVTLSBB : XX2_BF3_XO5_XB6_XO9<60, 2, 475, (outs crrc:$BF), (ins vsrc:$XB),
+ "xvtlsbb $BF, $XB", IIC_VecGeneral, []>;
+
+ // The XFormMemOp flag for the following 8 instructions is set on
+ // the instruction format.
+ let mayLoad = 1, mayStore = 0 in {
+ def LXVRBX : X_XT6_RA5_RB5<31, 13, "lxvrbx", vsrc, []>;
+ def LXVRHX : X_XT6_RA5_RB5<31, 45, "lxvrhx", vsrc, []>;
+ def LXVRWX : X_XT6_RA5_RB5<31, 77, "lxvrwx", vsrc, []>;
+ def LXVRDX : X_XT6_RA5_RB5<31, 109, "lxvrdx", vsrc, []>;
+ }
+
+ let mayLoad = 0, mayStore = 1 in {
+ def STXVRBX : X_XS6_RA5_RB5<31, 141, "stxvrbx", vsrc, []>;
+ def STXVRHX : X_XS6_RA5_RB5<31, 173, "stxvrhx", vsrc, []>;
+ def STXVRWX : X_XS6_RA5_RB5<31, 205, "stxvrwx", vsrc, []>;
+ def STXVRDX : X_XS6_RA5_RB5<31, 237, "stxvrdx", vsrc, []>;
+ }
+}
+
+//---------------------------- Anonymous Patterns ----------------------------//
+let Predicates = [IsISA3_1] in {
+ def : Pat<(v16i8 (int_ppc_vsx_xxgenpcvbm v16i8:$VRB, imm:$IMM)),
+ (v16i8 (COPY_TO_REGCLASS (XXGENPCVBM $VRB, imm:$IMM), VRRC))>;
+ def : Pat<(v8i16 (int_ppc_vsx_xxgenpcvhm v8i16:$VRB, imm:$IMM)),
+ (v8i16 (COPY_TO_REGCLASS (XXGENPCVHM $VRB, imm:$IMM), VRRC))>;
+ def : Pat<(v4i32 (int_ppc_vsx_xxgenpcvwm v4i32:$VRB, imm:$IMM)),
+ (v4i32 (COPY_TO_REGCLASS (XXGENPCVWM $VRB, imm:$IMM), VRRC))>;
+ def : Pat<(v2i64 (int_ppc_vsx_xxgenpcvdm v2i64:$VRB, imm:$IMM)),
+ (v2i64 (COPY_TO_REGCLASS (XXGENPCVDM $VRB, imm:$IMM), VRRC))>;
+ def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, -1)),
+ (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_lt)>;
+ def : Pat<(i32 (int_ppc_vsx_xvtlsbb v16i8:$XB, 0)),
+ (EXTRACT_SUBREG (XVTLSBB (COPY_TO_REGCLASS $XB, VSRC)), sub_eq)>;
+}
+
+let AddedComplexity = 400, Predicates = [PrefixInstrs] in {
+ def : Pat<(v4i32 (build_vector i32immNonAllOneNonZero:$A,
+ i32immNonAllOneNonZero:$A,
+ i32immNonAllOneNonZero:$A,
+ i32immNonAllOneNonZero:$A)),
+ (v4i32 (XXSPLTIW imm:$A))>;
+ def : Pat<(f32 nzFPImmAsi32:$A),
+ (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
+ VSFRC)>;
+ def : Pat<(f64 nzFPImmAsi32:$A),
+ (COPY_TO_REGCLASS (XXSPLTIDP (getFPAs32BitInt fpimm:$A)),
+ VSFRC)>;
+}
+
+let Predicates = [PrefixInstrs] in {
+ def : Pat<(v16i8 (int_ppc_vsx_xxpermx v16i8:$A, v16i8:$B, v16i8:$C, timm:$D)),
+ (COPY_TO_REGCLASS (XXPERMX (COPY_TO_REGCLASS $A, VSRC),
+ (COPY_TO_REGCLASS $B, VSRC),
+ (COPY_TO_REGCLASS $C, VSRC), $D), VSRC)>;
+ def : Pat<(v16i8 (int_ppc_vsx_xxblendvb v16i8:$A, v16i8:$B, v16i8:$C)),
+ (COPY_TO_REGCLASS
+ (XXBLENDVB (COPY_TO_REGCLASS $A, VSRC),
+ (COPY_TO_REGCLASS $B, VSRC),
+ (COPY_TO_REGCLASS $C, VSRC)), VSRC)>;
+ def : Pat<(v8i16 (int_ppc_vsx_xxblendvh v8i16:$A, v8i16:$B, v8i16:$C)),
+ (COPY_TO_REGCLASS
+ (XXBLENDVH (COPY_TO_REGCLASS $A, VSRC),
+ (COPY_TO_REGCLASS $B, VSRC),
+ (COPY_TO_REGCLASS $C, VSRC)), VSRC)>;
+ def : Pat<(int_ppc_vsx_xxblendvw v4i32:$A, v4i32:$B, v4i32:$C),
+ (XXBLENDVW $A, $B, $C)>;
+ def : Pat<(int_ppc_vsx_xxblendvd v2i64:$A, v2i64:$B, v2i64:$C),
+ (XXBLENDVD $A, $B, $C)>;
+}
+
diff --git a/llvm/lib/Target/PowerPC/PPCInstrQPX.td b/llvm/lib/Target/PowerPC/PPCInstrQPX.td
index d67041d46d9f..2265af2815cb 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrQPX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrQPX.td
@@ -1,9 +1,9 @@
//===- PPCInstrQPX.td - The PowerPC QPX Extension --*- tablegen -*-===//
-//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the QPX extension to the PowerPC instruction set.
@@ -101,7 +101,7 @@ let FastIselShouldIgnore = 1 in // FastIsel should ignore all u12 instrs.
//===----------------------------------------------------------------------===//
// Instruction Definitions.
-def HasQPX : Predicate<"PPCSubTarget->hasQPX()">;
+def HasQPX : Predicate<"Subtarget->hasQPX()">;
let Predicates = [HasQPX] in {
let DecoderNamespace = "QPX" in {
let hasSideEffects = 0 in { // QPX instructions don't have side effects.
@@ -167,48 +167,48 @@ let Uses = [RM] in {
// Multiply-add instructions
def QVFMADD : AForm_1<4, 29,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
"qvfmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC, v4f64:$FRB))]>;
let isCodeGenOnly = 1 in
def QVFMADDS : QPXA1_Int<0, 29, "qvfmadds", int_ppc_qpx_qvfmadds>;
def QVFMADDSs : AForm_1<0, 29,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
"qvfmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC, v4f32:$FRB))]>;
def QVFNMADD : AForm_1<4, 31,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
"qvfnmadd $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
v4f64:$FRB)))]>;
let isCodeGenOnly = 1 in
def QVFNMADDS : QPXA1_Int<0, 31, "qvfnmadds", int_ppc_qpx_qvfnmadds>;
def QVFNMADDSs : AForm_1<0, 31,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
"qvfnmadds $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
v4f32:$FRB)))]>;
def QVFMSUB : AForm_1<4, 28,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
"qvfmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f64:$FRT, (fma v4f64:$FRA, v4f64:$FRC,
(fneg v4f64:$FRB)))]>;
let isCodeGenOnly = 1 in
def QVFMSUBS : QPXA1_Int<0, 28, "qvfmsubs", int_ppc_qpx_qvfmsubs>;
def QVFMSUBSs : AForm_1<0, 28,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
"qvfmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f32:$FRT, (fma v4f32:$FRA, v4f32:$FRC,
(fneg v4f32:$FRB)))]>;
def QVFNMSUB : AForm_1<4, 30,
- (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRB, qfrc:$FRC),
+ (outs qfrc:$FRT), (ins qfrc:$FRA, qfrc:$FRC, qfrc:$FRB),
"qvfnmsub $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f64:$FRT, (fneg (fma v4f64:$FRA, v4f64:$FRC,
(fneg v4f64:$FRB))))]>;
let isCodeGenOnly = 1 in
def QVFNMSUBS : QPXA1_Int<0, 30, "qvfnmsubs", int_ppc_qpx_qvfnmsubs>;
def QVFNMSUBSs : AForm_1<0, 30,
- (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRB, qsrc:$FRC),
+ (outs qsrc:$FRT), (ins qsrc:$FRA, qsrc:$FRC, qsrc:$FRB),
"qvfnmsubs $FRT, $FRA, $FRC, $FRB", IIC_FPFused,
[(set v4f32:$FRT, (fneg (fma v4f32:$FRA, v4f32:$FRC,
(fneg v4f32:$FRB))))]>;
@@ -899,13 +899,13 @@ def : Pat<(int_ppc_qpx_qvfmul v4f64:$A, v4f64:$B),
// Additional QVFNMSUB patterns: -a*c + b == -(a*c - b)
def : Pat<(fma (fneg v4f64:$A), v4f64:$C, v4f64:$B),
- (QVFNMSUB $A, $B, $C)>;
+ (QVFNMSUB $A, $C, $B)>;
def : Pat<(fma v4f64:$A, (fneg v4f64:$C), v4f64:$B),
- (QVFNMSUB $A, $B, $C)>;
+ (QVFNMSUB $A, $C, $B)>;
def : Pat<(fma (fneg v4f32:$A), v4f32:$C, v4f32:$B),
- (QVFNMSUBSs $A, $B, $C)>;
+ (QVFNMSUBSs $A, $C, $B)>;
def : Pat<(fma v4f32:$A, (fneg v4f32:$C), v4f32:$B),
- (QVFNMSUBSs $A, $B, $C)>;
+ (QVFNMSUBSs $A, $C, $B)>;
def : Pat<(int_ppc_qpx_qvfmadd v4f64:$A, v4f64:$B, v4f64:$C),
(QVFMADD $A, $B, $C)>;
@@ -1210,4 +1210,3 @@ def : Pat<(fmaxnum v4f32:$FRA, v4f32:$FRB),
(QVFTSTNANbs $FRB, $FRB), (i32 7)),
$FRB, $FRA)>;
}
-
diff --git a/llvm/lib/Target/PowerPC/PPCInstrSPE.td b/llvm/lib/Target/PowerPC/PPCInstrSPE.td
index 935c3044ae47..858eb0c9fe50 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrSPE.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrSPE.td
@@ -158,7 +158,7 @@ def EFDCFSF : EFXForm_2a<755, (outs sperc:$RT), (ins spe4rc:$RB),
def EFDCFSI : EFXForm_2a<753, (outs sperc:$RT), (ins gprc:$RB),
"efdcfsi $RT, $RB", IIC_FPDGeneral,
- [(set f64:$RT, (sint_to_fp i32:$RB))]>;
+ [(set f64:$RT, (any_sint_to_fp i32:$RB))]>;
def EFDCFSID : EFXForm_2a<739, (outs sperc:$RT), (ins gprc:$RB),
"efdcfsid $RT, $RB", IIC_FPDGeneral,
@@ -169,7 +169,7 @@ def EFDCFUF : EFXForm_2a<754, (outs sperc:$RT), (ins spe4rc:$RB),
def EFDCFUI : EFXForm_2a<752, (outs sperc:$RT), (ins gprc:$RB),
"efdcfui $RT, $RB", IIC_FPDGeneral,
- [(set f64:$RT, (uint_to_fp i32:$RB))]>;
+ [(set f64:$RT, (any_uint_to_fp i32:$RB))]>;
def EFDCFUID : EFXForm_2a<738, (outs sperc:$RT), (ins gprc:$RB),
"efdcfuid $RT, $RB", IIC_FPDGeneral,
@@ -197,7 +197,7 @@ def EFDCTSIDZ : EFXForm_2a<747, (outs gprc:$RT), (ins sperc:$RB),
def EFDCTSIZ : EFXForm_2a<762, (outs gprc:$RT), (ins sperc:$RB),
"efdctsiz $RT, $RB", IIC_FPDGeneral,
- [(set i32:$RT, (fp_to_sint f64:$RB))]>;
+ [(set i32:$RT, (any_fp_to_sint f64:$RB))]>;
def EFDCTUF : EFXForm_2a<758, (outs sperc:$RT), (ins spe4rc:$RB),
"efdctuf $RT, $RB", IIC_FPDGeneral, []>;
@@ -212,7 +212,7 @@ def EFDCTUIDZ : EFXForm_2a<746, (outs gprc:$RT), (ins sperc:$RB),
def EFDCTUIZ : EFXForm_2a<760, (outs gprc:$RT), (ins sperc:$RB),
"efdctuiz $RT, $RB", IIC_FPDGeneral,
- [(set i32:$RT, (fp_to_uint f64:$RB))]>;
+ [(set i32:$RT, (any_fp_to_uint f64:$RB))]>;
def EFDDIV : EFXForm_1<745, (outs sperc:$RT), (ins sperc:$RA, sperc:$RB),
"efddiv $RT, $RA, $RB", IIC_FPDivD,
@@ -261,14 +261,14 @@ def EFSCFSF : EFXForm_2a<723, (outs spe4rc:$RT), (ins spe4rc:$RB),
def EFSCFSI : EFXForm_2a<721, (outs spe4rc:$RT), (ins gprc:$RB),
"efscfsi $RT, $RB", IIC_FPSGeneral,
- [(set f32:$RT, (sint_to_fp i32:$RB))]>;
+ [(set f32:$RT, (any_sint_to_fp i32:$RB))]>;
def EFSCFUF : EFXForm_2a<722, (outs spe4rc:$RT), (ins spe4rc:$RB),
"efscfuf $RT, $RB", IIC_FPSGeneral, []>;
def EFSCFUI : EFXForm_2a<720, (outs spe4rc:$RT), (ins gprc:$RB),
"efscfui $RT, $RB", IIC_FPSGeneral,
- [(set f32:$RT, (uint_to_fp i32:$RB))]>;
+ [(set f32:$RT, (any_uint_to_fp i32:$RB))]>;
let isCompare = 1 in {
def EFSCMPEQ : EFXForm_3<718, (outs crrc:$crD), (ins spe4rc:$RA, spe4rc:$RB),
@@ -288,7 +288,7 @@ def EFSCTSI : EFXForm_2a<725, (outs gprc:$RT), (ins spe4rc:$RB),
def EFSCTSIZ : EFXForm_2a<730, (outs gprc:$RT), (ins spe4rc:$RB),
"efsctsiz $RT, $RB", IIC_FPSGeneral,
- [(set i32:$RT, (fp_to_sint f32:$RB))]>;
+ [(set i32:$RT, (any_fp_to_sint f32:$RB))]>;
def EFSCTUF : EFXForm_2a<726, (outs sperc:$RT), (ins spe4rc:$RB),
"efsctuf $RT, $RB", IIC_FPSGeneral, []>;
@@ -299,7 +299,7 @@ def EFSCTUI : EFXForm_2a<724, (outs gprc:$RT), (ins spe4rc:$RB),
def EFSCTUIZ : EFXForm_2a<728, (outs gprc:$RT), (ins spe4rc:$RB),
"efsctuiz $RT, $RB", IIC_FPSGeneral,
- [(set i32:$RT, (fp_to_uint f32:$RB))]>;
+ [(set i32:$RT, (any_fp_to_uint f32:$RB))]>;
def EFSDIV : EFXForm_1<713, (outs spe4rc:$RT), (ins spe4rc:$RA, spe4rc:$RB),
"efsdiv $RT, $RA, $RB", IIC_FPDivD,
diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
index be6b30ffa08b..9ba5058a6f81 100644
--- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td
+++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td
@@ -1,9 +1,9 @@
//===- PPCInstrVSX.td - The PowerPC VSX Extension --*- tablegen -*-===//
-//
+//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
+//
//===----------------------------------------------------------------------===//
//
// This file describes the VSX extension to the PowerPC instruction set.
@@ -25,6 +25,32 @@
// ** in PPCVSXSwapRemoval::gatherVectorInstructions(). **
// ****************************************************************************
+// *********************************** NOTE ***********************************
+// ** When adding new anonymous patterns to this file, please add them to **
+// ** the section titled Anonymous Patterns. Chances are that the existing **
+// ** predicate blocks already contain a combination of features that you **
+// ** are after. There is a list of blocks at the top of the section. If **
+// ** you definitely need a new combination of predicates, please add that **
+// ** combination to the list. **
+// ** File Structure: **
+// ** - Custom PPCISD node definitions **
+// ** - Predicate definitions: predicates to specify the subtargets for **
+// ** which an instruction or pattern can be emitted. **
+// ** - Instruction formats: classes instantiated by the instructions. **
+// ** These generally correspond to instruction formats in section 1.6 of **
+// ** the ISA document. **
+// ** - Instruction definitions: the actual definitions of the instructions **
+// ** often including input patterns that they match. **
+// ** - Helper DAG definitions: We define a number of dag objects to use as **
+// **   input or output patterns for conciseness of the code.                **
+// ** - Anonymous patterns: input patterns that an instruction matches can **
+// ** often not be specified as part of the instruction definition, so an **
+// ** anonymous pattern must be specified mapping an input pattern to an **
+// ** output pattern. These are generally guarded by subtarget predicates. **
+// ** - Instruction aliases: used to define extended mnemonics for assembly **
+// ** printing (for example: xxswapd for xxpermdi with 0x2 as the imm). **
+// ****************************************************************************
+
def PPCRegVSRCAsmOperand : AsmOperandClass {
let Name = "RegVSRC"; let PredicateMethod = "isVSRegNumber";
}
@@ -89,6 +115,7 @@ def SDT_PPCst_vec_be : SDTypeProfile<0, 2, [
SDTCisVec<0>, SDTCisPtrTy<1>
]>;
+//--------------------------- Custom PPC nodes -------------------------------//
def PPClxvd2x : SDNode<"PPCISD::LXVD2X", SDT_PPClxvd2x,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCstxvd2x : SDNode<"PPCISD::STXVD2X", SDT_PPCstxvd2x,
@@ -111,7 +138,24 @@ def PPCldvsxlh : SDNode<"PPCISD::LD_VSX_LH", SDT_PPCldvsxlh,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def PPCldsplat : SDNode<"PPCISD::LD_SPLAT", SDT_PPCldsplat,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-
+def PPCSToV : SDNode<"PPCISD::SCALAR_TO_VECTOR_PERMUTED",
+ SDTypeProfile<1, 1, []>, []>;
+
+//-------------------------- Predicate definitions ---------------------------//
+def HasVSX : Predicate<"Subtarget->hasVSX()">;
+def IsLittleEndian : Predicate<"Subtarget->isLittleEndian()">;
+def IsBigEndian : Predicate<"!Subtarget->isLittleEndian()">;
+def HasOnlySwappingMemOps : Predicate<"!Subtarget->hasP9Vector()">;
+def HasP8Vector : Predicate<"Subtarget->hasP8Vector()">;
+def HasDirectMove : Predicate<"Subtarget->hasDirectMove()">;
+def NoP9Vector : Predicate<"!Subtarget->hasP9Vector()">;
+def HasP9Vector : Predicate<"Subtarget->hasP9Vector()">;
+def NoP9Altivec : Predicate<"!Subtarget->hasP9Altivec()">;
+
+//--------------------- VSX-specific instruction formats ---------------------//
+// By default, all VSX instructions are to be selected over their Altivec
+// counterparts and they do not have unmodeled side effects.
+let AddedComplexity = 400, hasSideEffects = 0 in {
multiclass XX3Form_Rcr<bits<6> opcode, bits<7> xo, string asmbase,
string asmstr, InstrItinClass itin, Intrinsic Int,
ValueType OutTy, ValueType InTy> {
@@ -144,14 +188,119 @@ class XX3Form_2s<bits<6> opcode, bits<5> xo, dag OOL, dag IOL, string asmstr,
let XB = XA;
}
-def HasVSX : Predicate<"PPCSubTarget->hasVSX()">;
-def IsLittleEndian : Predicate<"PPCSubTarget->isLittleEndian()">;
-def IsBigEndian : Predicate<"!PPCSubTarget->isLittleEndian()">;
-def HasOnlySwappingMemOps : Predicate<"!PPCSubTarget->hasP9Vector()">;
+let Predicates = [HasVSX, HasP9Vector] in {
+class X_VT5_XO5_VB5<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vrrc:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+// [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+class X_VT5_XO5_VB5_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_XO5_VB5<opcode, xo2, xo, opc, pattern>, isRecordForm;
+
+// [PO VRT XO VRB XO /], but only the left 64 bits (or less) of VRB are used,
+// so we use a different operand class for VRB
+class X_VT5_XO5_VB5_TyVB<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ RegisterOperand vbtype, list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+// [PO VRT XO VRB XO /]
+class X_VT5_XO5_VB5_VSFR<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vfrc:$vT), (ins vrrc:$vB),
+ !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
+
+// [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
+class X_VT5_XO5_VB5_VSFR_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_XO5_VB5_VSFR<opcode, xo2, xo, opc, pattern>, isRecordForm;
+
+// [PO T XO B XO BX /]
+class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+ list<dag> pattern>
+ : XX2_RD5_XO5_RS6<opcode, xo2, xo, (outs g8rc:$rT), (ins vsfrc:$XB),
+ !strconcat(opc, " $rT, $XB"), IIC_VecFP, pattern>;
-let Predicates = [HasVSX] in {
-let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
-let hasSideEffects = 0 in { // VSX instructions don't have side effects.
+// [PO T XO B XO BX TX]
+class XX2_XT6_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX2_RD6_XO5_RS6<opcode, xo2, xo, (outs vtype:$XT), (ins vtype:$XB),
+ !strconcat(opc, " $XT, $XB"), IIC_VecFP, pattern>;
+
+// [PO T A B XO AX BX TX]; src and dest registers use different operand classes
+class XX3_XT5_XA5_XB5<bits<6> opcode, bits<8> xo, string opc,
+ RegisterOperand xty, RegisterOperand aty, RegisterOperand bty,
+ InstrItinClass itin, list<dag> pattern>
+ : XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
+ !strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
+
+// [PO VRT VRA VRB XO /]
+class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>;
+
+// [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+class X_VT5_VA5_VB5_Ro<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isRecordForm;
+
+// [PO VRT VRA VRB XO /]
+class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vTi, vrrc:$vA, vrrc:$vB),
+ !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>,
+ RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">;
+
+// [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
+class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : X_VT5_VA5_VB5_FMA<opcode, xo, opc, pattern>, isRecordForm;
+
+class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
+ list<dag> pattern>
+ : Z23Form_8<opcode, xo,
+ (outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
+ !strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
+ let RC = ex;
+}
+
+// [PO BF // VRA VRB XO /]
+class X_BF3_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
+ list<dag> pattern>
+ : XForm_17<opcode, xo, (outs crrc:$crD), (ins vrrc:$VA, vrrc:$VB),
+ !strconcat(opc, " $crD, $VA, $VB"), IIC_FPCompare> {
+ let Pattern = pattern;
+}
+
+// [PO T RA RB XO TX], almost equal to [PO S RA RB XO SX] but with different
+// "out" and "in" dags
+class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX1Form_memOp<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
+ !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>;
+
+// [PO S RA RB XO SX]
+class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
+ RegisterOperand vtype, list<dag> pattern>
+ : XX1Form_memOp<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
+ !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>;
+} // Predicates = HasP9Vector
+} // AddedComplexity = 400, hasSideEffects = 0
+
+multiclass ScalToVecWPermute<ValueType Ty, dag In, dag NonPermOut, dag PermOut> {
+ def : Pat<(Ty (scalar_to_vector In)), (Ty NonPermOut)>;
+ def : Pat<(Ty (PPCSToV In)), (Ty PermOut)>;
+}
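+
+// Note: ScalToVecWPermute emits two anonymous patterns per defm: one mapping
+// the plain scalar_to_vector of In to NonPermOut, and one mapping the PPCSToV
+// (SCALAR_TO_VECTOR_PERMUTED) node to PermOut, so callers can supply a
+// different output dag for the permuted form.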
+
+//-------------------------- Instruction definitions -------------------------//
+// VSX instructions require the VSX feature; they are to be selected over
+// equivalent Altivec patterns (as they address a larger register set) and
+// they do not have unmodeled side effects.
+let Predicates = [HasVSX], AddedComplexity = 400 in {
+let hasSideEffects = 0 in {
// Load indexed instructions
let mayLoad = 1, mayStore = 0 in {
@@ -213,53 +362,53 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
}
} // mayStore
- let Uses = [RM] in {
+ let Uses = [RM], mayRaiseFPException = 1 in {
// Add/Mul Instructions
let isCommutable = 1 in {
def XSADDDP : XX3Form<60, 32,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsadddp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fadd f64:$XA, f64:$XB))]>;
+ [(set f64:$XT, (any_fadd f64:$XA, f64:$XB))]>;
def XSMULDP : XX3Form<60, 48,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsmuldp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fmul f64:$XA, f64:$XB))]>;
+ [(set f64:$XT, (any_fmul f64:$XA, f64:$XB))]>;
def XVADDDP : XX3Form<60, 96,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvadddp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fadd v2f64:$XA, v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fadd v2f64:$XA, v2f64:$XB))]>;
def XVADDSP : XX3Form<60, 64,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvaddsp $XT, $XA, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fadd v4f32:$XA, v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fadd v4f32:$XA, v4f32:$XB))]>;
def XVMULDP : XX3Form<60, 112,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmuldp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fmul v2f64:$XA, v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fmul v2f64:$XA, v2f64:$XB))]>;
def XVMULSP : XX3Form<60, 80,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvmulsp $XT, $XA, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fmul v4f32:$XA, v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fmul v4f32:$XA, v4f32:$XB))]>;
}
// Subtract Instructions
def XSSUBDP : XX3Form<60, 40,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xssubdp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fsub f64:$XA, f64:$XB))]>;
+ [(set f64:$XT, (any_fsub f64:$XA, f64:$XB))]>;
def XVSUBDP : XX3Form<60, 104,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvsubdp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fsub v2f64:$XA, v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fsub v2f64:$XA, v2f64:$XB))]>;
def XVSUBSP : XX3Form<60, 72,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvsubsp $XT, $XA, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fsub v4f32:$XA, v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fsub v4f32:$XA, v4f32:$XB))]>;
// FMA Instructions
let BaseName = "XSMADDADP" in {
@@ -267,7 +416,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XSMADDADP : XX3Form<60, 33,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmaddadp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fma f64:$XA, f64:$XB, f64:$XTi))]>,
+ [(set f64:$XT, (any_fma f64:$XA, f64:$XB, f64:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -283,7 +432,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XSMSUBADP : XX3Form<60, 49,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsmsubadp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
+ [(set f64:$XT, (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -299,7 +448,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XSNMADDADP : XX3Form<60, 161,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmaddadp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, f64:$XTi)))]>,
+ [(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -315,7 +464,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XSNMSUBADP : XX3Form<60, 177,
(outs vsfrc:$XT), (ins vsfrc:$XTi, vsfrc:$XA, vsfrc:$XB),
"xsnmsubadp $XT, $XA, $XB", IIC_VecFP,
- [(set f64:$XT, (fneg (fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
+ [(set f64:$XT, (fneg (any_fma f64:$XA, f64:$XB, (fneg f64:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -331,7 +480,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVMADDADP : XX3Form<60, 97,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddadp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
+ [(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -347,7 +496,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVMADDASP : XX3Form<60, 65,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmaddasp $XT, $XA, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
+ [(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, v4f32:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -363,7 +512,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVMSUBADP : XX3Form<60, 113,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubadp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
+ [(set v2f64:$XT, (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -379,7 +528,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVMSUBASP : XX3Form<60, 81,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvmsubasp $XT, $XA, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
+ [(set v4f32:$XT, (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -395,7 +544,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVNMADDADP : XX3Form<60, 225,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmaddadp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
+ [(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, v2f64:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -427,7 +576,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVNMSUBADP : XX3Form<60, 241,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubadp $XT, $XA, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fneg (fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
+ [(set v2f64:$XT, (fneg (any_fma v2f64:$XA, v2f64:$XB, (fneg v2f64:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -443,7 +592,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVNMSUBASP : XX3Form<60, 209,
(outs vsrc:$XT), (ins vsrc:$XTi, vsrc:$XA, vsrc:$XB),
"xvnmsubasp $XT, $XA, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fneg (fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
+ [(set v4f32:$XT, (fneg (any_fma v4f32:$XA, v4f32:$XB, (fneg v4f32:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
let IsVSXFMAAlt = 1 in
@@ -458,11 +607,11 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XSDIVDP : XX3Form<60, 56,
(outs vsfrc:$XT), (ins vsfrc:$XA, vsfrc:$XB),
"xsdivdp $XT, $XA, $XB", IIC_FPDivD,
- [(set f64:$XT, (fdiv f64:$XA, f64:$XB))]>;
+ [(set f64:$XT, (any_fdiv f64:$XA, f64:$XB))]>;
def XSSQRTDP : XX2Form<60, 75,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xssqrtdp $XT, $XB", IIC_FPSqrtD,
- [(set f64:$XT, (fsqrt f64:$XB))]>;
+ [(set f64:$XT, (any_fsqrt f64:$XB))]>;
def XSREDP : XX2Form<60, 90,
(outs vsfrc:$XT), (ins vsfrc:$XB),
@@ -483,20 +632,20 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XVDIVDP : XX3Form<60, 120,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvdivdp $XT, $XA, $XB", IIC_FPDivD,
- [(set v2f64:$XT, (fdiv v2f64:$XA, v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fdiv v2f64:$XA, v2f64:$XB))]>;
def XVDIVSP : XX3Form<60, 88,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
"xvdivsp $XT, $XA, $XB", IIC_FPDivS,
- [(set v4f32:$XT, (fdiv v4f32:$XA, v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fdiv v4f32:$XA, v4f32:$XB))]>;
def XVSQRTDP : XX2Form<60, 203,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvsqrtdp $XT, $XB", IIC_FPSqrtD,
- [(set v2f64:$XT, (fsqrt v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fsqrt v2f64:$XB))]>;
def XVSQRTSP : XX2Form<60, 139,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvsqrtsp $XT, $XB", IIC_FPSqrtS,
- [(set v4f32:$XT, (fsqrt v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fsqrt v4f32:$XB))]>;
def XVTDIVDP : XX3Form_1<60, 125,
(outs crrc:$crD), (ins vsrc:$XA, vsrc:$XB),
@@ -740,65 +889,65 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
def XSRDPI : XX2Form<60, 73,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpi $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (fround f64:$XB))]>;
+ [(set f64:$XT, (any_fround f64:$XB))]>;
def XSRDPIC : XX2Form<60, 107,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpic $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (fnearbyint f64:$XB))]>;
+ [(set f64:$XT, (any_fnearbyint f64:$XB))]>;
def XSRDPIM : XX2Form<60, 121,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpim $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (ffloor f64:$XB))]>;
+ [(set f64:$XT, (any_ffloor f64:$XB))]>;
def XSRDPIP : XX2Form<60, 105,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpip $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (fceil f64:$XB))]>;
+ [(set f64:$XT, (any_fceil f64:$XB))]>;
def XSRDPIZ : XX2Form<60, 89,
(outs vsfrc:$XT), (ins vsfrc:$XB),
"xsrdpiz $XT, $XB", IIC_VecFP,
- [(set f64:$XT, (ftrunc f64:$XB))]>;
+ [(set f64:$XT, (any_ftrunc f64:$XB))]>;
def XVRDPI : XX2Form<60, 201,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpi $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fround v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fround v2f64:$XB))]>;
def XVRDPIC : XX2Form<60, 235,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpic $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fnearbyint v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fnearbyint v2f64:$XB))]>;
def XVRDPIM : XX2Form<60, 249,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpim $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (ffloor v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_ffloor v2f64:$XB))]>;
def XVRDPIP : XX2Form<60, 233,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpip $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (fceil v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_fceil v2f64:$XB))]>;
def XVRDPIZ : XX2Form<60, 217,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrdpiz $XT, $XB", IIC_VecFP,
- [(set v2f64:$XT, (ftrunc v2f64:$XB))]>;
+ [(set v2f64:$XT, (any_ftrunc v2f64:$XB))]>;
def XVRSPI : XX2Form<60, 137,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspi $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fround v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fround v4f32:$XB))]>;
def XVRSPIC : XX2Form<60, 171,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspic $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fnearbyint v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fnearbyint v4f32:$XB))]>;
def XVRSPIM : XX2Form<60, 185,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspim $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (ffloor v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_ffloor v4f32:$XB))]>;
def XVRSPIP : XX2Form<60, 169,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspip $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (fceil v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_fceil v4f32:$XB))]>;
def XVRSPIZ : XX2Form<60, 153,
(outs vsrc:$XT), (ins vsrc:$XB),
"xvrspiz $XT, $XB", IIC_VecFP,
- [(set v4f32:$XT, (ftrunc v4f32:$XB))]>;
+ [(set v4f32:$XT, (any_ftrunc v4f32:$XB))]>;
// Max/Min Instructions
let isCommutable = 1 in {
@@ -835,7 +984,7 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
[(set vsrc:$XT,
(int_ppc_vsx_xvminsp vsrc:$XA, vsrc:$XB))]>;
} // isCommutable
-} // Uses = [RM]
+ } // Uses = [RM], mayRaiseFPException
// Logical Instructions
let isCommutable = 1 in
@@ -924,433 +1073,8 @@ let hasSideEffects = 0 in { // VSX instructions don't have side effects.
(outs vsrc:$XT), (ins vsfrc:$XB, u2imm:$UIM),
"xxspltw $XT, $XB, $UIM", IIC_VecPerm, []>;
-} // hasSideEffects
-
-// SELECT_CC_* - Used to implement the SELECT_CC DAG operation. Expanded after
-// instruction selection into a branch sequence.
-let PPC970_Single = 1 in {
-
- def SELECT_CC_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
- (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
- "#SELECT_CC_VSRC",
- []>;
- def SELECT_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
- (ins crbitrc:$cond, vsrc:$T, vsrc:$F),
- "#SELECT_VSRC",
- [(set v2f64:$dst,
- (select i1:$cond, v2f64:$T, v2f64:$F))]>;
- def SELECT_CC_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
- (ins crrc:$cond, f8rc:$T, f8rc:$F,
- i32imm:$BROPC), "#SELECT_CC_VSFRC",
- []>;
- def SELECT_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
- (ins crbitrc:$cond, f8rc:$T, f8rc:$F),
- "#SELECT_VSFRC",
- [(set f64:$dst,
- (select i1:$cond, f64:$T, f64:$F))]>;
- def SELECT_CC_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
- (ins crrc:$cond, f4rc:$T, f4rc:$F,
- i32imm:$BROPC), "#SELECT_CC_VSSRC",
- []>;
- def SELECT_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
- (ins crbitrc:$cond, f4rc:$T, f4rc:$F),
- "#SELECT_VSSRC",
- [(set f32:$dst,
- (select i1:$cond, f32:$T, f32:$F))]>;
-}
-} // AddedComplexity
-
-def : InstAlias<"xvmovdp $XT, $XB",
- (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
-def : InstAlias<"xvmovsp $XT, $XB",
- (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
-
-def : InstAlias<"xxspltd $XT, $XB, 0",
- (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
-def : InstAlias<"xxspltd $XT, $XB, 1",
- (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
-def : InstAlias<"xxmrghd $XT, $XA, $XB",
- (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
-def : InstAlias<"xxmrgld $XT, $XA, $XB",
- (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
-def : InstAlias<"xxswapd $XT, $XB",
- (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
-def : InstAlias<"xxspltd $XT, $XB, 0",
- (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
-def : InstAlias<"xxspltd $XT, $XB, 1",
- (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
-def : InstAlias<"xxswapd $XT, $XB",
- (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
-
-let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
-
-def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
- (v4i32 (XXLNOR $A, $A))>;
-def : Pat<(v4i32 (or (and (vnot_ppc v4i32:$C), v4i32:$A),
- (and v4i32:$B, v4i32:$C))),
- (v4i32 (XXSEL $A, $B, $C))>;
-
-let Predicates = [IsBigEndian] in {
-def : Pat<(v2f64 (scalar_to_vector f64:$A)),
- (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
-
-def : Pat<(f64 (extractelt v2f64:$S, 0)),
- (f64 (EXTRACT_SUBREG $S, sub_64))>;
-def : Pat<(f64 (extractelt v2f64:$S, 1)),
- (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
-}
-
-let Predicates = [IsLittleEndian] in {
-def : Pat<(v2f64 (scalar_to_vector f64:$A)),
- (v2f64 (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
- (SUBREG_TO_REG (i64 1), $A, sub_64), 0))>;
-
-def : Pat<(f64 (extractelt v2f64:$S, 0)),
- (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
-def : Pat<(f64 (extractelt v2f64:$S, 1)),
- (f64 (EXTRACT_SUBREG $S, sub_64))>;
-}
-
-// Additional fnmsub patterns: -a*b + c == -(a*b - c)
-def : Pat<(fma (fneg f64:$A), f64:$B, f64:$C),
- (XSNMSUBADP $C, $A, $B)>;
-def : Pat<(fma f64:$A, (fneg f64:$B), f64:$C),
- (XSNMSUBADP $C, $A, $B)>;
-
-def : Pat<(fma (fneg v2f64:$A), v2f64:$B, v2f64:$C),
- (XVNMSUBADP $C, $A, $B)>;
-def : Pat<(fma v2f64:$A, (fneg v2f64:$B), v2f64:$C),
- (XVNMSUBADP $C, $A, $B)>;
-
-def : Pat<(fma (fneg v4f32:$A), v4f32:$B, v4f32:$C),
- (XVNMSUBASP $C, $A, $B)>;
-def : Pat<(fma v4f32:$A, (fneg v4f32:$B), v4f32:$C),
- (XVNMSUBASP $C, $A, $B)>;
-
-def : Pat<(v2f64 (bitconvert v4f32:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-def : Pat<(v2f64 (bitconvert v4i32:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-def : Pat<(v2f64 (bitconvert v8i16:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-def : Pat<(v2f64 (bitconvert v16i8:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-
-def : Pat<(v4f32 (bitconvert v2f64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v4i32 (bitconvert v2f64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v8i16 (bitconvert v2f64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v16i8 (bitconvert v2f64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-
-def : Pat<(v2i64 (bitconvert v4f32:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-def : Pat<(v2i64 (bitconvert v4i32:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-def : Pat<(v2i64 (bitconvert v8i16:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-def : Pat<(v2i64 (bitconvert v16i8:$A)),
- (COPY_TO_REGCLASS $A, VSRC)>;
-
-def : Pat<(v4f32 (bitconvert v2i64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v4i32 (bitconvert v2i64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v8i16 (bitconvert v2i64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v16i8 (bitconvert v2i64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-
-def : Pat<(v2f64 (bitconvert v2i64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v2i64 (bitconvert v2f64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-
-def : Pat<(v2f64 (bitconvert v1i128:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v1i128 (bitconvert v2f64:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-
-def : Pat<(v2i64 (bitconvert f128:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v4i32 (bitconvert f128:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v8i16 (bitconvert f128:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-def : Pat<(v16i8 (bitconvert f128:$A)),
- (COPY_TO_REGCLASS $A, VRRC)>;
-
-def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
- (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
-def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
- (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
-
-def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
- (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
-def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
- (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
-
-def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>;
-def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>;
-
-// Loads.
-let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
- def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-
- // Stores.
- def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
- (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
-}
-
-// Load vector in big-endian order
-let Predicates = [IsLittleEndian, HasVSX] in {
- def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
- def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
- def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
- def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
-}
-
-let Predicates = [IsBigEndian, HasVSX, HasOnlySwappingMemOps] in {
- def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
- def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
- def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>;
- def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
- def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
-}
-
-// Permutes.
-def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
-def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
-def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
-def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
-def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
-
-// PPCvecshl XT, XA, XA, 2 can be selected to both XXSLDWI XT,XA,XA,2 and
-// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2); the latter is more profitable.
-def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)), (XXPERMDI $src, $src, 2)>;
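// For reference: with the same register in both source slots, XXPERMDI $A, $A, 2
// puts doubleword 1 of the input into result doubleword 0 and doubleword 0 into
// result doubleword 1, i.e. it swaps the two doublewords. That is why the
// xxswapd alias and the PPCxxswapd patterns in this file are simply XXPERMDI
// with immediate 2, and why the swap form is the one chosen for this
// PPCvecshl case.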
-
-// Selects.
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
- (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)),
- (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)),
- (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)),
- (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)),
- (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)),
- (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)),
- (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)),
- (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)),
- (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)),
- (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
- (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
- (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
- (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
- (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
- (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
- (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
- (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
- (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
- (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
-def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
- (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
-// Divides.
-def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
- (XVDIVSP $A, $B)>;
-def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
- (XVDIVDP $A, $B)>;
-
-// Reciprocal estimate
-def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
- (XVRESP $A)>;
-def : Pat<(int_ppc_vsx_xvredp v2f64:$A),
- (XVREDP $A)>;
-
-// Recip. square root estimate
-def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
- (XVRSQRTESP $A)>;
-def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
- (XVRSQRTEDP $A)>;
-
-// Vector selection
-def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)),
- (COPY_TO_REGCLASS
- (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
- (COPY_TO_REGCLASS $vB, VSRC),
- (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
-def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)),
- (COPY_TO_REGCLASS
- (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
- (COPY_TO_REGCLASS $vB, VSRC),
- (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
-def : Pat<(vselect v4i32:$vA, v4i32:$vB, v4i32:$vC),
- (XXSEL $vC, $vB, $vA)>;
-def : Pat<(vselect v2i64:$vA, v2i64:$vB, v2i64:$vC),
- (XXSEL $vC, $vB, $vA)>;
-def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC),
- (XXSEL $vC, $vB, $vA)>;
-def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC),
- (XXSEL $vC, $vB, $vA)>;
-
-def : Pat<(v4f32 (fmaxnum v4f32:$src1, v4f32:$src2)),
- (v4f32 (XVMAXSP $src1, $src2))>;
-def : Pat<(v4f32 (fminnum v4f32:$src1, v4f32:$src2)),
- (v4f32 (XVMINSP $src1, $src2))>;
-def : Pat<(v2f64 (fmaxnum v2f64:$src1, v2f64:$src2)),
- (v2f64 (XVMAXDP $src1, $src2))>;
-def : Pat<(v2f64 (fminnum v2f64:$src1, v2f64:$src2)),
- (v2f64 (XVMINDP $src1, $src2))>;
-
-let Predicates = [IsLittleEndian] in {
-def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
- (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
-def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
-def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
- (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
-def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
-} // IsLittleEndian
-
-let Predicates = [IsBigEndian] in {
-def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
- (f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
-def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
-def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
- (f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
-def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
- (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
-} // IsBigEndian
-
-} // AddedComplexity
-} // HasVSX
-
-def FpMinMax {
- dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC),
- (COPY_TO_REGCLASS $B, VSFRC)),
- VSSRC);
- dag F32Max = (COPY_TO_REGCLASS (XSMAXDP (COPY_TO_REGCLASS $A, VSFRC),
- (COPY_TO_REGCLASS $B, VSFRC)),
- VSSRC);
-}
-
-let AddedComplexity = 400, Predicates = [HasVSX] in {
- // f32 Min.
- def : Pat<(f32 (fminnum_ieee f32:$A, f32:$B)),
- (f32 FpMinMax.F32Min)>;
- def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), f32:$B)),
- (f32 FpMinMax.F32Min)>;
- def : Pat<(f32 (fminnum_ieee f32:$A, (fcanonicalize f32:$B))),
- (f32 FpMinMax.F32Min)>;
- def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))),
- (f32 FpMinMax.F32Min)>;
- // F32 Max.
- def : Pat<(f32 (fmaxnum_ieee f32:$A, f32:$B)),
- (f32 FpMinMax.F32Max)>;
- def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), f32:$B)),
- (f32 FpMinMax.F32Max)>;
- def : Pat<(f32 (fmaxnum_ieee f32:$A, (fcanonicalize f32:$B))),
- (f32 FpMinMax.F32Max)>;
- def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))),
- (f32 FpMinMax.F32Max)>;
-
- // f64 Min.
- def : Pat<(f64 (fminnum_ieee f64:$A, f64:$B)),
- (f64 (XSMINDP $A, $B))>;
- def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), f64:$B)),
- (f64 (XSMINDP $A, $B))>;
- def : Pat<(f64 (fminnum_ieee f64:$A, (fcanonicalize f64:$B))),
- (f64 (XSMINDP $A, $B))>;
- def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
- (f64 (XSMINDP $A, $B))>;
- // f64 Max.
- def : Pat<(f64 (fmaxnum_ieee f64:$A, f64:$B)),
- (f64 (XSMAXDP $A, $B))>;
- def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), f64:$B)),
- (f64 (XSMAXDP $A, $B))>;
- def : Pat<(f64 (fmaxnum_ieee f64:$A, (fcanonicalize f64:$B))),
- (f64 (XSMAXDP $A, $B))>;
- def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
- (f64 (XSMAXDP $A, $B))>;
-}
-
-def ScalarLoads {
- dag Li8 = (i32 (extloadi8 xoaddr:$src));
- dag ZELi8 = (i32 (zextloadi8 xoaddr:$src));
- dag ZELi8i64 = (i64 (zextloadi8 xoaddr:$src));
- dag SELi8 = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
- dag SELi8i64 = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
-
- dag Li16 = (i32 (extloadi16 xoaddr:$src));
- dag ZELi16 = (i32 (zextloadi16 xoaddr:$src));
- dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
- dag SELi16 = (i32 (sextloadi16 xoaddr:$src));
- dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
-
- dag Li32 = (i32 (load xoaddr:$src));
-}
-
-def DWToSPExtractConv {
- dag El0US1 = (f32 (PPCfcfidus
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 0))))));
- dag El1US1 = (f32 (PPCfcfidus
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 1))))));
- dag El0US2 = (f32 (PPCfcfidus
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 0))))));
- dag El1US2 = (f32 (PPCfcfidus
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 1))))));
- dag El0SS1 = (f32 (PPCfcfids
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 0))))));
- dag El1SS1 = (f32 (PPCfcfids
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 1))))));
- dag El0SS2 = (f32 (PPCfcfids
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 0))))));
- dag El1SS2 = (f32 (PPCfcfids
- (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 1))))));
- dag BVU = (v4f32 (build_vector El0US1, El1US1, El0US2, El1US2));
- dag BVS = (v4f32 (build_vector El0SS1, El1SS1, El0SS2, El1SS2));
-}
-
// The following VSX instructions were introduced in Power ISA 2.07
-/* FIXME: If the operands are v2i64, these patterns will not match.
- We should define new patterns or otherwise match the same patterns
- when the elements are larger than i32.
-*/
-def HasP8Vector : Predicate<"PPCSubTarget->hasP8Vector()">;
-def HasDirectMove : Predicate<"PPCSubTarget->hasDirectMove()">;
-def NoP9Vector : Predicate<"!PPCSubTarget->hasP9Vector()">;
-let Predicates = [HasP8Vector] in {
-let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
+let Predicates = [HasVSX, HasP8Vector] in {
let isCommutable = 1 in {
def XXLEQV : XX3Form<60, 186,
(outs vsrc:$XT), (ins vsrc:$XA, vsrc:$XB),
@@ -1363,9 +1087,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
v4i32:$XB)))]>;
} // isCommutable
- def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
- (XXLEQV $A, $B)>;
-
let isCodeGenOnly = 1, isMoveImm = 1, isAsCheapAsAMove = 1,
isReMaterializable = 1 in {
def XXLEQVOnes : XX3Form_SameOp<60, 186, (outs vsrc:$XT), (ins),
@@ -1379,7 +1100,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
[(set v4i32:$XT, (or v4i32:$XA, (vnot_ppc v4i32:$XB)))]>;
// VSX scalar loads introduced in ISA 2.07
- let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
+ let mayLoad = 1, mayStore = 0 in {
let CodeSize = 3 in
def LXSSPX : XX1Form_memOp<31, 524, (outs vssrc:$XT), (ins memrr:$src),
"lxsspx $XT, $src", IIC_LdStLFD, []>;
@@ -1404,7 +1125,7 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
} // mayLoad
// VSX scalar stores introduced in ISA 2.07
- let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
+ let mayStore = 1, mayLoad = 0 in {
let CodeSize = 3 in
def STXSSPX : XX1Form_memOp<31, 652, (outs), (ins vssrc:$XT, memrr:$dst),
"stxsspx $XT, $dst", IIC_LdStSTFD, []>;
@@ -1422,64 +1143,42 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
[(PPCstfiwx f64:$XT, xoaddr:$dst)]>;
} // mayStore
- def : Pat<(f64 (extloadf32 xoaddr:$src)),
- (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
- def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))),
- (f32 (XFLOADf32 xoaddr:$src))>;
- def : Pat<(f64 (fpextend f32:$src)),
- (COPY_TO_REGCLASS $src, VSFRC)>;
-
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
- (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
- (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
- (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
- (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
- (SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
- (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
- (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
- (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
- (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
- def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
- (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
-
// VSX Elementary Scalar FP arithmetic (SP)
+ let mayRaiseFPException = 1 in {
let isCommutable = 1 in {
def XSADDSP : XX3Form<60, 0,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsaddsp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fadd f32:$XA, f32:$XB))]>;
+ [(set f32:$XT, (any_fadd f32:$XA, f32:$XB))]>;
def XSMULSP : XX3Form<60, 16,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsmulsp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fmul f32:$XA, f32:$XB))]>;
+ [(set f32:$XT, (any_fmul f32:$XA, f32:$XB))]>;
} // isCommutable
+
def XSSUBSP : XX3Form<60, 8,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xssubsp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fsub f32:$XA, f32:$XB))]>;
+ [(set f32:$XT, (any_fsub f32:$XA, f32:$XB))]>;
def XSDIVSP : XX3Form<60, 24,
(outs vssrc:$XT), (ins vssrc:$XA, vssrc:$XB),
"xsdivsp $XT, $XA, $XB", IIC_FPDivS,
- [(set f32:$XT, (fdiv f32:$XA, f32:$XB))]>;
+ [(set f32:$XT, (any_fdiv f32:$XA, f32:$XB))]>;
+
def XSRESP : XX2Form<60, 26,
(outs vssrc:$XT), (ins vssrc:$XB),
"xsresp $XT, $XB", IIC_VecFP,
[(set f32:$XT, (PPCfre f32:$XB))]>;
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1, mayRaiseFPException = 1 in
def XSRSP : XX2Form<60, 281,
(outs vssrc:$XT), (ins vsfrc:$XB),
- "xsrsp $XT, $XB", IIC_VecFP, []>;
+ "xsrsp $XT, $XB", IIC_VecFP,
+ [(set f32:$XT, (any_fpround f64:$XB))]>;
def XSSQRTSP : XX2Form<60, 11,
(outs vssrc:$XT), (ins vssrc:$XB),
"xssqrtsp $XT, $XB", IIC_FPSqrtS,
- [(set f32:$XT, (fsqrt f32:$XB))]>;
+ [(set f32:$XT, (any_fsqrt f32:$XB))]>;
def XSRSQRTESP : XX2Form<60, 10,
(outs vssrc:$XT), (ins vssrc:$XB),
"xsrsqrtesp $XT, $XB", IIC_VecFP,
@@ -1492,10 +1191,11 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmaddasp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fma f32:$XA, f32:$XB, f32:$XTi))]>,
+ [(set f32:$XT, (any_fma f32:$XA, f32:$XB, f32:$XTi))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
- let IsVSXFMAAlt = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let IsVSXFMAAlt = 1, hasSideEffects = 1 in
def XSMADDMSP : XX3Form<60, 9,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
@@ -1510,11 +1210,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsmsubasp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fma f32:$XA, f32:$XB,
+ [(set f32:$XT, (any_fma f32:$XA, f32:$XB,
(fneg f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
- let IsVSXFMAAlt = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let IsVSXFMAAlt = 1, hasSideEffects = 1 in
def XSMSUBMSP : XX3Form<60, 25,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
@@ -1529,11 +1230,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmaddasp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+ [(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB,
f32:$XTi)))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
- let IsVSXFMAAlt = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let IsVSXFMAAlt = 1, hasSideEffects = 1 in
def XSNMADDMSP : XX3Form<60, 137,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
@@ -1548,11 +1250,12 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
"xsnmsubasp $XT, $XA, $XB", IIC_VecFP,
- [(set f32:$XT, (fneg (fma f32:$XA, f32:$XB,
+ [(set f32:$XT, (fneg (any_fma f32:$XA, f32:$XB,
(fneg f32:$XTi))))]>,
RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">,
AltVSXFMARel;
- let IsVSXFMAAlt = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let IsVSXFMAAlt = 1, hasSideEffects = 1 in
def XSNMSUBMSP : XX3Form<60, 153,
(outs vssrc:$XT),
(ins vssrc:$XTi, vssrc:$XA, vssrc:$XB),
@@ -1561,12 +1264,6 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
AltVSXFMARel;
}
- // Additional xsnmsubasp patterns: -a*b + c == -(a*b - c)
- def : Pat<(fma (fneg f32:$A), f32:$B, f32:$C),
- (XSNMSUBASP $C, $A, $B)>;
- def : Pat<(fma f32:$A, (fneg f32:$B), f32:$C),
- (XSNMSUBASP $C, $A, $B)>;
-
// Single Precision Conversions (FP <-> INT)
def XSCVSXDSP : XX2Form<60, 312,
(outs vssrc:$XT), (ins vsfrc:$XB),
@@ -1582,72 +1279,16 @@ let AddedComplexity = 400 in { // Prefer VSX patterns over non-VSX patterns.
"xscvdpspn $XT, $XB", IIC_VecFP, []>;
def XSCVSPDPN : XX2Form<60, 331, (outs vssrc:$XT), (ins vsrc:$XB),
"xscvspdpn $XT, $XB", IIC_VecFP, []>;
+ } // mayRaiseFPException
- let Predicates = [IsLittleEndian] in {
- def : Pat<DWToSPExtractConv.El0SS1,
- (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
- def : Pat<DWToSPExtractConv.El1SS1,
- (f32 (XSCVSXDSP (COPY_TO_REGCLASS
- (f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
- def : Pat<DWToSPExtractConv.El0US1,
- (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
- def : Pat<DWToSPExtractConv.El1US1,
- (f32 (XSCVUXDSP (COPY_TO_REGCLASS
- (f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
- }
-
- let Predicates = [IsBigEndian] in {
- def : Pat<DWToSPExtractConv.El0SS1,
- (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
- def : Pat<DWToSPExtractConv.El1SS1,
- (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
- def : Pat<DWToSPExtractConv.El0US1,
- (f32 (XSCVUXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
- def : Pat<DWToSPExtractConv.El1US1,
- (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
- }
-
- // Instructions for converting float to i64 feeding a store.
- let Predicates = [NoP9Vector] in {
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8),
- (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8),
- (STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>;
- }
-
- // Instructions for converting float to i32 feeding a store.
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4),
- (STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
- (STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
-
- def : Pat<(v2i64 (smax v2i64:$src1, v2i64:$src2)),
- (v2i64 (VMAXSD (COPY_TO_REGCLASS $src1, VRRC),
- (COPY_TO_REGCLASS $src2, VRRC)))>;
- def : Pat<(v2i64 (umax v2i64:$src1, v2i64:$src2)),
- (v2i64 (VMAXUD (COPY_TO_REGCLASS $src1, VRRC),
- (COPY_TO_REGCLASS $src2, VRRC)))>;
- def : Pat<(v2i64 (smin v2i64:$src1, v2i64:$src2)),
- (v2i64 (VMINSD (COPY_TO_REGCLASS $src1, VRRC),
- (COPY_TO_REGCLASS $src2, VRRC)))>;
- def : Pat<(v2i64 (umin v2i64:$src1, v2i64:$src2)),
- (v2i64 (VMINUD (COPY_TO_REGCLASS $src1, VRRC),
- (COPY_TO_REGCLASS $src2, VRRC)))>;
-} // AddedComplexity = 400
-} // HasP8Vector
-
-let AddedComplexity = 400 in {
-let Predicates = [HasDirectMove] in {
+ let Predicates = [HasVSX, HasDirectMove] in {
// VSX direct move instructions
def MFVSRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsfrc:$XT),
"mfvsrd $rA, $XT", IIC_VecGeneral,
[(set i64:$rA, (PPCmfvsr f64:$XT))]>,
Requires<[In64BitMode]>;
- let isCodeGenOnly = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let isCodeGenOnly = 1, hasSideEffects = 1 in
def MFVRD : XX1_RS6_RD5_XO<31, 51, (outs g8rc:$rA), (ins vsrc:$XT),
"mfvsrd $rA, $XT", IIC_VecGeneral,
[]>,
@@ -1655,7 +1296,8 @@ let Predicates = [HasDirectMove] in {
def MFVSRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsfrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[(set i32:$rA, (PPCmfvsr f64:$XT))]>;
- let isCodeGenOnly = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let isCodeGenOnly = 1, hasSideEffects = 1 in
def MFVRWZ : XX1_RS6_RD5_XO<31, 115, (outs gprc:$rA), (ins vsrc:$XT),
"mfvsrwz $rA, $XT", IIC_VecGeneral,
[]>;
@@ -1663,7 +1305,8 @@ let Predicates = [HasDirectMove] in {
"mtvsrd $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsra i64:$rA))]>,
Requires<[In64BitMode]>;
- let isCodeGenOnly = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let isCodeGenOnly = 1, hasSideEffects = 1 in
def MTVRD : XX1_RS6_RD5_XO<31, 179, (outs vsrc:$XT), (ins g8rc:$rA),
"mtvsrd $XT, $rA", IIC_VecGeneral,
[]>,
@@ -1671,56 +1314,547 @@ let Predicates = [HasDirectMove] in {
def MTVSRWA : XX1_RS6_RD5_XO<31, 211, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwa $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsra i32:$rA))]>;
- let isCodeGenOnly = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let isCodeGenOnly = 1, hasSideEffects = 1 in
def MTVRWA : XX1_RS6_RD5_XO<31, 211, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrwa $XT, $rA", IIC_VecGeneral,
[]>;
def MTVSRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsfrc:$XT), (ins gprc:$rA),
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[(set f64:$XT, (PPCmtvsrz i32:$rA))]>;
- let isCodeGenOnly = 1 in
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let isCodeGenOnly = 1, hasSideEffects = 1 in
def MTVRWZ : XX1_RS6_RD5_XO<31, 243, (outs vsrc:$XT), (ins gprc:$rA),
"mtvsrwz $XT, $rA", IIC_VecGeneral,
[]>;
-} // HasDirectMove
+ } // HasDirectMove
-let Predicates = [IsISA3_0, HasDirectMove] in {
- def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
- "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
+} // HasVSX, HasP8Vector
- def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
- "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
- []>, Requires<[In64BitMode]>;
+let Predicates = [HasVSX, IsISA3_0, HasDirectMove] in {
+def MTVSRWS: XX1_RS6_RD5_XO<31, 403, (outs vsrc:$XT), (ins gprc:$rA),
+ "mtvsrws $XT, $rA", IIC_VecGeneral, []>;
- def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT),
- "mfvsrld $rA, $XT", IIC_VecGeneral,
- []>, Requires<[In64BitMode]>;
+def MTVSRDD: XX1Form<31, 435, (outs vsrc:$XT), (ins g8rc_nox0:$rA, g8rc:$rB),
+ "mtvsrdd $XT, $rA, $rB", IIC_VecGeneral,
+ []>, Requires<[In64BitMode]>;
-} // IsISA3_0, HasDirectMove
-} // AddedComplexity = 400
+def MFVSRLD: XX1_RS6_RD5_XO<31, 307, (outs g8rc:$rA), (ins vsrc:$XT),
+ "mfvsrld $rA, $XT", IIC_VecGeneral,
+ []>, Requires<[In64BitMode]>;
-// We want to parse this from asm, but we don't want to emit this as it would
-// be emitted with a VSX reg. So leave Emit = 0 here.
-def : InstAlias<"mfvrd $rA, $XT",
- (MFVRD g8rc:$rA, vrrc:$XT), 0>;
-def : InstAlias<"mffprd $rA, $src",
- (MFVSRD g8rc:$rA, f8rc:$src)>;
-def : InstAlias<"mtvrd $XT, $rA",
- (MTVRD vrrc:$XT, g8rc:$rA), 0>;
-def : InstAlias<"mtfprd $dst, $rA",
- (MTVSRD f8rc:$dst, g8rc:$rA)>;
-def : InstAlias<"mfvrwz $rA, $XT",
- (MFVRWZ gprc:$rA, vrrc:$XT), 0>;
-def : InstAlias<"mffprwz $rA, $src",
- (MFVSRWZ gprc:$rA, f8rc:$src)>;
-def : InstAlias<"mtvrwa $XT, $rA",
- (MTVRWA vrrc:$XT, gprc:$rA), 0>;
-def : InstAlias<"mtfprwa $dst, $rA",
- (MTVSRWA f8rc:$dst, gprc:$rA)>;
-def : InstAlias<"mtvrwz $XT, $rA",
- (MTVRWZ vrrc:$XT, gprc:$rA), 0>;
-def : InstAlias<"mtfprwz $dst, $rA",
- (MTVSRWZ f8rc:$dst, gprc:$rA)>;
+} // HasVSX, IsISA3_0, HasDirectMove
+
+let Predicates = [HasVSX, HasP9Vector] in {
+ // Quad-Precision Scalar Move Instructions:
+ // Copy Sign
+ def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp",
+ [(set f128:$vT,
+ (fcopysign f128:$vB, f128:$vA))]>;
+
+ // Absolute/Negative-Absolute/Negate
+ def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp",
+ [(set f128:$vT, (fabs f128:$vB))]>;
+ def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp",
+ [(set f128:$vT, (fneg (fabs f128:$vB)))]>;
+ def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp",
+ [(set f128:$vT, (fneg f128:$vB))]>;
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Scalar Floating-Point Arithmetic Instructions:
+
+ // Add/Divide/Multiply/Subtract
+ let mayRaiseFPException = 1 in {
+ let isCommutable = 1 in {
+ def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp",
+ [(set f128:$vT, (any_fadd f128:$vA, f128:$vB))]>;
+ def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
+ [(set f128:$vT, (any_fmul f128:$vA, f128:$vB))]>;
+ }
+ def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp" ,
+ [(set f128:$vT, (any_fsub f128:$vA, f128:$vB))]>;
+ def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
+ [(set f128:$vT, (any_fdiv f128:$vA, f128:$vB))]>;
+ // Square-Root
+ def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
+ [(set f128:$vT, (any_fsqrt f128:$vB))]>;
+ // (Negative) Multiply-{Add/Subtract}
+ def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
+ [(set f128:$vT,
+ (any_fma f128:$vA, f128:$vB, f128:$vTi))]>;
+ def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp" ,
+ [(set f128:$vT,
+ (any_fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi)))]>;
+ def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
+ [(set f128:$vT,
+ (fneg (any_fma f128:$vA, f128:$vB,
+ f128:$vTi)))]>;
+ def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
+ [(set f128:$vT,
+ (fneg (any_fma f128:$vA, f128:$vB,
+ (fneg f128:$vTi))))]>;
+
+ let isCommutable = 1 in {
+ def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo",
+ [(set f128:$vT,
+ (int_ppc_addf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo",
+ [(set f128:$vT,
+ (int_ppc_mulf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ }
+ def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo",
+ [(set f128:$vT,
+ (int_ppc_subf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo",
+ [(set f128:$vT,
+ (int_ppc_divf128_round_to_odd
+ f128:$vA, f128:$vB))]>;
+ def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo",
+ [(set f128:$vT,
+ (int_ppc_sqrtf128_round_to_odd f128:$vB))]>;
+
+
+ def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo",
+ [(set f128:$vT,
+ (int_ppc_fmaf128_round_to_odd
+ f128:$vA,f128:$vB,f128:$vTi))]>;
+
+ def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo" ,
+ [(set f128:$vT,
+ (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, (fneg f128:$vTi)))]>;
+ def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo",
+ [(set f128:$vT,
+ (fneg (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, f128:$vTi)))]>;
+ def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo",
+ [(set f128:$vT,
+ (fneg (int_ppc_fmaf128_round_to_odd
+ f128:$vA, f128:$vB, (fneg f128:$vTi))))]>;
+ } // mayRaiseFPException
+
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ // QP Compare Ordered/Unordered
+ let hasSideEffects = 1 in {
+ def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
+ def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
+
+ // DP/QP Compare Exponents
+ def XSCMPEXPDP : XX3Form_1<60, 59,
+ (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
+ "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
+ def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
+
+ // DP Compare ==, >=, >, !=
+ // Use vsrc for XT, because the entire register of XT is set.
+ // XT.dword[1] = 0x0000_0000_0000_0000
+ def XSCMPEQDP : XX3_XT5_XA5_XB5<60, 3, "xscmpeqdp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
+ IIC_FPCompare, []>;
+ }
+
+ //===--------------------------------------------------------------------===//
+ // Quad-Precision Floating-Point Conversion Instructions:
+
+ let mayRaiseFPException = 1 in {
+ // Convert DP -> QP
+ def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc,
+ [(set f128:$vT, (any_fpextend f64:$vB))]>;
+
+ // Round & Convert QP -> DP (dword[1] is set to zero)
+ def XSCVQPDP : X_VT5_XO5_VB5_VSFR<63, 20, 836, "xscvqpdp" , []>;
+ def XSCVQPDPO : X_VT5_XO5_VB5_VSFR_Ro<63, 20, 836, "xscvqpdpo",
+ [(set f64:$vT,
+ (int_ppc_truncf128_round_to_odd
+ f128:$vB))]>;
+ }
+
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
+ let hasSideEffects = 1 in {
+ def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
+ def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>;
+ def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
+ def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
+ }
+
+ // Convert (Un)Signed DWord -> QP.
+ def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
+ def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>;
+
+ // (Round &) Convert DP <-> HP
+ // Note: xscvdphp's src and dest registers both use the left 64 bits, so we use
+ // vsfrc for both. xscvhpdp's src only uses the left 16 bits,
+ // but we still use vsfrc for it.
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in {
+ def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
+ def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
+ }
+
+ // Vector HP -> SP
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in
+ def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>;
+ def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc,
+ [(set v4f32:$XT,
+ (int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
+
+ let mayRaiseFPException = 1 in {
+ // Round to Quad-Precision Integer [with Inexact]
+ def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
+ def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
+ }
+
+ // Round Quad-Precision to Double-Extended Precision (fp80)
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in
+ def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
+
+ //===--------------------------------------------------------------------===//
+ // Insert/Extract Instructions
+
+ // Insert Exponent DP/QP
+ // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in {
+ def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
+ "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
+ // vB NOTE: only vB.dword[0] is used, which is why we don't use the
+ // X_VT5_VA5_VB5 form
+ def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
+ "xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>;
+ }
+
+ // Extract Exponent/Significand DP/QP
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in {
+ def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
+ def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
+
+ def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>;
+ def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>;
+ }
+
+ // Vector Insert Word
+ // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
+ def XXINSERTW :
+ XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
+ (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
+ "xxinsertw $XT, $XB, $UIM", IIC_VecFP,
+ [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
+ imm32SExt16:$UIM))]>,
+ RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
+
+ // Vector Extract Unsigned Word
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in
+ def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
+ (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
+ "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
+
+ // Vector Insert Exponent DP/SP
+ def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
+ IIC_VecFP, [(set v2f64: $XT,(int_ppc_vsx_xviexpdp v2i64:$XA, v2i64:$XB))]>;
+ def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc,
+ IIC_VecFP, [(set v4f32: $XT,(int_ppc_vsx_xviexpsp v4i32:$XA, v4i32:$XB))]>;
+
+ // Vector Extract Exponent/Significand DP/SP
+ def XVXEXPDP : XX2_XT6_XO5_XB6<60, 0, 475, "xvxexpdp", vsrc,
+ [(set v2i64: $XT,
+ (int_ppc_vsx_xvxexpdp v2f64:$XB))]>;
+ def XVXEXPSP : XX2_XT6_XO5_XB6<60, 8, 475, "xvxexpsp", vsrc,
+ [(set v4i32: $XT,
+ (int_ppc_vsx_xvxexpsp v4f32:$XB))]>;
+ def XVXSIGDP : XX2_XT6_XO5_XB6<60, 1, 475, "xvxsigdp", vsrc,
+ [(set v2i64: $XT,
+ (int_ppc_vsx_xvxsigdp v2f64:$XB))]>;
+ def XVXSIGSP : XX2_XT6_XO5_XB6<60, 9, 475, "xvxsigsp", vsrc,
+ [(set v4i32: $XT,
+ (int_ppc_vsx_xvxsigsp v4f32:$XB))]>;
+
+ // Test Data Class SP/DP/QP
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in {
+ def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+ "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
+ def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
+ "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
+ def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708,
+ (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
+ "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
+ }
+
+ // Vector Test Data Class SP/DP
+ def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
+ (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+ "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
+ [(set v4i32: $XT,
+ (int_ppc_vsx_xvtstdcsp v4f32:$XB, timm:$DCMX))]>;
+ def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
+ (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
+ "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
+ [(set v2i64: $XT,
+ (int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>;
+
+ // Maximum/Minimum Type-C/Type-J DP
+ def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsfrc, vsfrc, vsfrc,
+ IIC_VecFP,
+ [(set f64:$XT, (PPCxsmaxc f64:$XA, f64:$XB))]>;
+ def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsfrc, vsfrc, vsfrc,
+ IIC_VecFP,
+ [(set f64:$XT, (PPCxsminc f64:$XA, f64:$XB))]>;
+
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in {
+ def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
+ IIC_VecFP, []>;
+ }
+
+ // Vector Byte-Reverse H/W/D/Q Word
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in
+ def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>;
+ def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc,
+ [(set v4i32:$XT, (bswap v4i32:$XB))]>;
+ def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc,
+ [(set v2i64:$XT, (bswap v2i64:$XB))]>;
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in
+ def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
+
+ // Vector Permute
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in {
+ def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
+ IIC_VecPerm, []>;
+ def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc,
+ IIC_VecPerm, []>;
+ }
+
+ // Vector Splat Immediate Byte
+ // FIXME: Setting the hasSideEffects flag here to match current behaviour.
+ let hasSideEffects = 1 in
+ def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
+ "xxspltib $XT, $IMM8", IIC_VecPerm, []>;
+
+ // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
+ // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
+ let mayLoad = 1, mayStore = 0 in {
+ // Load Vector
+ def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
+ "lxv $XT, $src", IIC_LdStLFD, []>;
+ // Load DWord
+ def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
+ "lxsd $vD, $src", IIC_LdStLFD, []>;
+ // Load SP from src, convert it to DP, and place in dword[0]
+ def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src),
+ "lxssp $vD, $src", IIC_LdStLFD, []>;
+
+ // Load as Integer Byte/Halfword & Zero Indexed
+ def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
+ [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
+ def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
+ [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;
+
+ // Load Vector Halfword*8/Byte*16 Indexed
+ def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
+ def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
+
+ // Load Vector Indexed
+ def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
+ [(set v2f64:$XT, (load xaddrX16:$src))]>;
+ // Load Vector (Left-justified) with Length
+ def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ "lxvl $XT, $src, $rB", IIC_LdStLoad,
+ [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>;
+ def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
+ "lxvll $XT, $src, $rB", IIC_LdStLoad,
+ [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>;
+
+ // Load Vector Word & Splat Indexed
+ def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
+ } // mayLoad
+
+ // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
+ // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
+ let mayStore = 1, mayLoad = 0 in {
+ // Store Vector
+ def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
+ "stxv $XT, $dst", IIC_LdStSTFD, []>;
+ // Store DWord
+ def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
+ "stxsd $vS, $dst", IIC_LdStSTFD, []>;
+ // Convert DP of dword[0] to SP, and store to dst
+ def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst),
+ "stxssp $vS, $dst", IIC_LdStSTFD, []>;
+
+ // Store as Integer Byte/Halfword Indexed
+ def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc,
+ [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
+ def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc,
+ [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
+ let isCodeGenOnly = 1 in {
+ def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsrc, []>;
+ def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsrc, []>;
+ }
+
+ // Store Vector Halfword*8/Byte*16 Indexed
+ def STXVH8X : X_XS6_RA5_RB5<31, 940, "stxvh8x" , vsrc, []>;
+ def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
+
+ // Store Vector Indexed
+ def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
+ [(store v2f64:$XT, xaddrX16:$dst)]>;
+
+ // Store Vector (Left-justified) with Length
+ def STXVL : XX1Form_memOp<31, 397, (outs),
+ (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvl $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst,
+ i64:$rB)]>;
+ def STXVLL : XX1Form_memOp<31, 429, (outs),
+ (ins vsrc:$XT, memr:$dst, g8rc:$rB),
+ "stxvll $XT, $dst, $rB", IIC_LdStLoad,
+ [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst,
+ i64:$rB)]>;
+ } // mayStore
+
+ def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src),
+ "#DFLOADf32",
+ [(set f32:$XT, (load iaddrX4:$src))]>;
+ def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src),
+ "#DFLOADf64",
+ [(set f64:$XT, (load iaddrX4:$src))]>;
+ def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst),
+ "#DFSTOREf32",
+ [(store f32:$XT, iaddrX4:$dst)]>;
+ def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
+ "#DFSTOREf64",
+ [(store f64:$XT, iaddrX4:$dst)]>;
+
+ let mayStore = 1 in {
+ def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
+ (ins spilltovsrrc:$XT, memrr:$dst),
+ "#SPILLTOVSR_STX", []>;
+ def SPILLTOVSR_ST : PPCPostRAExpPseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
+ "#SPILLTOVSR_ST", []>;
+ }
+ let mayLoad = 1 in {
+ def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
+ (ins memrr:$src),
+ "#SPILLTOVSR_LDX", []>;
+ def SPILLTOVSR_LD : PPCPostRAExpPseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
+ "#SPILLTOVSR_LD", []>;
+
+ }
+ } // HasP9Vector
+} // hasSideEffects = 0
+
+let PPC970_Single = 1, AddedComplexity = 400 in {
+
+ def SELECT_CC_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
+ (ins crrc:$cond, vsrc:$T, vsrc:$F, i32imm:$BROPC),
+ "#SELECT_CC_VSRC",
+ []>;
+ def SELECT_VSRC: PPCCustomInserterPseudo<(outs vsrc:$dst),
+ (ins crbitrc:$cond, vsrc:$T, vsrc:$F),
+ "#SELECT_VSRC",
+ [(set v2f64:$dst,
+ (select i1:$cond, v2f64:$T, v2f64:$F))]>;
+ def SELECT_CC_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
+ (ins crrc:$cond, f8rc:$T, f8rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_VSFRC",
+ []>;
+ def SELECT_VSFRC: PPCCustomInserterPseudo<(outs f8rc:$dst),
+ (ins crbitrc:$cond, f8rc:$T, f8rc:$F),
+ "#SELECT_VSFRC",
+ [(set f64:$dst,
+ (select i1:$cond, f64:$T, f64:$F))]>;
+ def SELECT_CC_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
+ (ins crrc:$cond, f4rc:$T, f4rc:$F,
+ i32imm:$BROPC), "#SELECT_CC_VSSRC",
+ []>;
+ def SELECT_VSSRC: PPCCustomInserterPseudo<(outs f4rc:$dst),
+ (ins crbitrc:$cond, f4rc:$T, f4rc:$F),
+ "#SELECT_VSSRC",
+ [(set f32:$dst,
+ (select i1:$cond, f32:$T, f32:$F))]>;
+}
+}
+
+//----------------------------- DAG Definitions ------------------------------//
+def FpMinMax {
+ dag F32Min = (COPY_TO_REGCLASS (XSMINDP (COPY_TO_REGCLASS $A, VSFRC),
+ (COPY_TO_REGCLASS $B, VSFRC)),
+ VSSRC);
+ dag F32Max = (COPY_TO_REGCLASS (XSMAXDP (COPY_TO_REGCLASS $A, VSFRC),
+ (COPY_TO_REGCLASS $B, VSFRC)),
+ VSSRC);
+}
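// A minimal sketch of how these fragments are consumed: the f32 operands are
// round-tripped through the double-precision XSMINDP/XSMAXDP, so the
// fminnum_ieee/fmaxnum_ieee selection patterns in this file look roughly like
//   def : Pat<(f32 (fminnum_ieee f32:$A, f32:$B)), (f32 FpMinMax.F32Min)>;
//   def : Pat<(f32 (fmaxnum_ieee f32:$A, f32:$B)), (f32 FpMinMax.F32Max)>;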
+
+def ScalarLoads {
+ dag Li8 = (i32 (extloadi8 xoaddr:$src));
+ dag ZELi8 = (i32 (zextloadi8 xoaddr:$src));
+ dag ZELi8i64 = (i64 (zextloadi8 xoaddr:$src));
+ dag SELi8 = (i32 (sext_inreg (extloadi8 xoaddr:$src), i8));
+ dag SELi8i64 = (i64 (sext_inreg (extloadi8 xoaddr:$src), i8));
+
+ dag Li16 = (i32 (extloadi16 xoaddr:$src));
+ dag ZELi16 = (i32 (zextloadi16 xoaddr:$src));
+ dag ZELi16i64 = (i64 (zextloadi16 xoaddr:$src));
+ dag SELi16 = (i32 (sextloadi16 xoaddr:$src));
+ dag SELi16i64 = (i64 (sextloadi16 xoaddr:$src));
+
+ dag Li32 = (i32 (load xoaddr:$src));
+}
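// These fragments just shorten patterns that fold an extending scalar load
// into a larger match; the xoaddr:$src operand captured inside a fragment
// remains visible to the pattern that splices it in. A hypothetical consumer
// (SomeInst is a placeholder, not a real instruction) would read:
//   def : Pat<(i32 ScalarLoads.ZELi8), (SomeInst xoaddr:$src)>;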
+
+def DWToSPExtractConv {
+ dag El0US1 = (f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 0))))));
+ dag El1US1 = (f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 1))))));
+ dag El0US2 = (f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 0))))));
+ dag El1US2 = (f32 (PPCfcfidus
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 1))))));
+ dag El0SS1 = (f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 0))))));
+ dag El1SS1 = (f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S1, 1))))));
+ dag El0SS2 = (f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 0))))));
+ dag El1SS2 = (f32 (PPCfcfids
+ (f64 (PPCmtvsra (i64 (vector_extract v2i64:$S2, 1))))));
+ dag BVU = (v4f32 (build_vector El0US1, El1US1, El0US2, El1US2));
+ dag BVS = (v4f32 (build_vector El0SS1, El1SS1, El0SS2, El1SS2));
+}
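// The El*/BV* fragments above describe "extract a doubleword of a v2i64 and
// convert it to single precision"; they are consumed by endian-specific
// patterns of the form (shown here in the big-endian flavour):
//   def : Pat<DWToSPExtractConv.El0SS1,
//             (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
//   def : Pat<DWToSPExtractConv.El1SS1,
//             (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
// i.e. element 0 converts directly from the source register, while element 1
// goes through a doubleword swap first.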
+
+def WToDPExtractConv {
+ dag El0S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 0))));
+ dag El1S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 1))));
+ dag El2S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 2))));
+ dag El3S = (f64 (PPCfcfid (PPCmtvsra (extractelt v4i32:$A, 3))));
+ dag El0U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 0))));
+ dag El1U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 1))));
+ dag El2U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 2))));
+ dag El3U = (f64 (PPCfcfidu (PPCmtvsrz (extractelt v4i32:$A, 3))));
+ dag BV02S = (v2f64 (build_vector El0S, El2S));
+ dag BV13S = (v2f64 (build_vector El1S, El3S));
+ dag BV02U = (v2f64 (build_vector El0U, El2U));
+ dag BV13U = (v2f64 (build_vector El1U, El3U));
+}
/* Direct moves of various widths from GPRs into VSRs. Each move lines
the value up into element 0 (both BE and LE). Namely, entities smaller than
@@ -2038,1789 +2172,11 @@ def VectorExtractions {
dag BE_VARIABLE_DOUBLE = (COPY_TO_REGCLASS BE_VDOUBLE_PERMUTE, VSRC);
}
-def NoP9Altivec : Predicate<"!PPCSubTarget->hasP9Altivec()">;
-let AddedComplexity = 400 in {
-// v4f32 scalar <-> vector conversions (BE)
-let Predicates = [IsBigEndian, HasP8Vector] in {
- def : Pat<(v4f32 (scalar_to_vector f32:$A)),
- (v4f32 (XSCVDPSPN $A))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 0)),
- (f32 (XSCVSPDPN $S))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 1)),
- (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 2)),
- (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 3)),
- (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
- def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
- (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
-} // IsBigEndian, HasP8Vector
-
-// Variable index vector_extract for v2f64 does not require P8Vector
-let Predicates = [IsBigEndian, HasVSX] in
- def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
- (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>;
-
-let Predicates = [IsBigEndian, HasDirectMove] in {
- // v16i8 scalar <-> vector conversions (BE)
- def : Pat<(v16i8 (scalar_to_vector i32:$A)),
- (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
- def : Pat<(v8i16 (scalar_to_vector i32:$A)),
- (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
- def : Pat<(v4i32 (scalar_to_vector i32:$A)),
- (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
- def : Pat<(v2i64 (scalar_to_vector i64:$A)),
- (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
-
- // v2i64 scalar <-> vector conversions (BE)
- def : Pat<(i64 (vector_extract v2i64:$S, 0)),
- (i64 VectorExtractions.LE_DWORD_1)>;
- def : Pat<(i64 (vector_extract v2i64:$S, 1)),
- (i64 VectorExtractions.LE_DWORD_0)>;
- def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
- (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
-} // IsBigEndian, HasDirectMove
-
-let Predicates = [IsBigEndian, HasDirectMove, NoP9Altivec] in {
- def : Pat<(i32 (vector_extract v16i8:$S, 0)),
- (i32 VectorExtractions.LE_BYTE_15)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 1)),
- (i32 VectorExtractions.LE_BYTE_14)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 2)),
- (i32 VectorExtractions.LE_BYTE_13)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 3)),
- (i32 VectorExtractions.LE_BYTE_12)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 4)),
- (i32 VectorExtractions.LE_BYTE_11)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 5)),
- (i32 VectorExtractions.LE_BYTE_10)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 6)),
- (i32 VectorExtractions.LE_BYTE_9)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 7)),
- (i32 VectorExtractions.LE_BYTE_8)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 8)),
- (i32 VectorExtractions.LE_BYTE_7)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 9)),
- (i32 VectorExtractions.LE_BYTE_6)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 10)),
- (i32 VectorExtractions.LE_BYTE_5)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 11)),
- (i32 VectorExtractions.LE_BYTE_4)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 12)),
- (i32 VectorExtractions.LE_BYTE_3)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 13)),
- (i32 VectorExtractions.LE_BYTE_2)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 14)),
- (i32 VectorExtractions.LE_BYTE_1)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 15)),
- (i32 VectorExtractions.LE_BYTE_0)>;
- def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
- (i32 VectorExtractions.BE_VARIABLE_BYTE)>;
-
- // v8i16 scalar <-> vector conversions (BE)
- def : Pat<(i32 (vector_extract v8i16:$S, 0)),
- (i32 VectorExtractions.LE_HALF_7)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 1)),
- (i32 VectorExtractions.LE_HALF_6)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 2)),
- (i32 VectorExtractions.LE_HALF_5)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 3)),
- (i32 VectorExtractions.LE_HALF_4)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 4)),
- (i32 VectorExtractions.LE_HALF_3)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 5)),
- (i32 VectorExtractions.LE_HALF_2)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 6)),
- (i32 VectorExtractions.LE_HALF_1)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 7)),
- (i32 VectorExtractions.LE_HALF_0)>;
- def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
- (i32 VectorExtractions.BE_VARIABLE_HALF)>;
-
- // v4i32 scalar <-> vector conversions (BE)
- def : Pat<(i32 (vector_extract v4i32:$S, 0)),
- (i32 VectorExtractions.LE_WORD_3)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 1)),
- (i32 VectorExtractions.LE_WORD_2)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 2)),
- (i32 VectorExtractions.LE_WORD_1)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 3)),
- (i32 VectorExtractions.LE_WORD_0)>;
- def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
- (i32 VectorExtractions.BE_VARIABLE_WORD)>;
-} // IsBigEndian, HasDirectMove, NoP9Altivec
-
-// v4f32 scalar <-> vector conversions (LE)
-let Predicates = [IsLittleEndian, HasP8Vector] in {
- def : Pat<(v4f32 (scalar_to_vector f32:$A)),
- (v4f32 (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 0)),
- (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 1)),
- (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 2)),
- (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
- def : Pat<(f32 (vector_extract v4f32:$S, 3)),
- (f32 (XSCVSPDPN $S))>;
- def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
- (f32 VectorExtractions.LE_VARIABLE_FLOAT)>;
-} // IsLittleEndian, HasP8Vector
-
-// Variable index vector_extract for v2f64 does not require P8Vector
-let Predicates = [IsLittleEndian, HasVSX] in
- def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
- (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
-
-def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
- (STXVD2X $rS, xoaddr:$dst)>;
-def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
- (STXVW4X $rS, xoaddr:$dst)>;
-def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
-def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
-
-// Variable index unsigned vector_extract on Power9
-let Predicates = [HasP9Altivec, IsLittleEndian] in {
- def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
- (VEXTUBRX $Idx, $S)>;
-
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
- (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
- (VEXTUHRX (LI8 0), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
- (VEXTUHRX (LI8 2), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
- (VEXTUHRX (LI8 4), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
- (VEXTUHRX (LI8 6), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
- (VEXTUHRX (LI8 8), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
- (VEXTUHRX (LI8 10), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
- (VEXTUHRX (LI8 12), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
- (VEXTUHRX (LI8 14), $S)>;
-
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
- (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>;
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
- (VEXTUWRX (LI8 0), $S)>;
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
- (VEXTUWRX (LI8 4), $S)>;
- // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 VectorExtractions.LE_WORD_2), sub_32)>;
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
- (VEXTUWRX (LI8 12), $S)>;
-
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
- (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>;
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
- (EXTSW (VEXTUWRX (LI8 0), $S))>;
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
- (EXTSW (VEXTUWRX (LI8 4), $S))>;
- // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
- (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 VectorExtractions.LE_WORD_2), sub_32))>;
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
- (EXTSW (VEXTUWRX (LI8 12), $S))>;
-
- def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 0)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 1)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 2)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 3)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 4)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 5)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 6)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 7)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 8)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 9)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 10)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 11)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 12)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 13)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 14)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 15)),
- (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>;
-
- def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX
- (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 0)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 1)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 2)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 3)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 4)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 5)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 6)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 7)),
- (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>;
-
- def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
- (i32 (EXTRACT_SUBREG (VEXTUWRX
- (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v4i32:$S, 0)),
- (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 0), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v4i32:$S, 1)),
- (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>;
- // For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
- def : Pat<(i32 (vector_extract v4i32:$S, 2)),
- (i32 VectorExtractions.LE_WORD_2)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 3)),
- (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>;
-}
-
-let Predicates = [HasP9Altivec, IsBigEndian] in {
- def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
- (VEXTUBLX $Idx, $S)>;
-
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
- (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
- (VEXTUHLX (LI8 0), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
- (VEXTUHLX (LI8 2), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
- (VEXTUHLX (LI8 4), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
- (VEXTUHLX (LI8 6), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
- (VEXTUHLX (LI8 8), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
- (VEXTUHLX (LI8 10), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
- (VEXTUHLX (LI8 12), $S)>;
- def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
- (VEXTUHLX (LI8 14), $S)>;
-
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
- (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>;
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
- (VEXTUWLX (LI8 0), $S)>;
-
- // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
- (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 VectorExtractions.LE_WORD_2), sub_32)>;
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
- (VEXTUWLX (LI8 8), $S)>;
- def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
- (VEXTUWLX (LI8 12), $S)>;
-
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
- (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>;
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
- (EXTSW (VEXTUWLX (LI8 0), $S))>;
- // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
- (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
- (i32 VectorExtractions.LE_WORD_2), sub_32))>;
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
- (EXTSW (VEXTUWLX (LI8 8), $S))>;
- def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
- (EXTSW (VEXTUWLX (LI8 12), $S))>;
-
- def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 0)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 1)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 2)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 3)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 4)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 5)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 6)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 7)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 8)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 9)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 10)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 11)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 12)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 13)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 14)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v16i8:$S, 15)),
- (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>;
-
- def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX
- (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 0)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 1)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 2)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 3)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 4)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 5)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 6)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v8i16:$S, 7)),
- (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>;
-
- def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
- (i32 (EXTRACT_SUBREG (VEXTUWLX
- (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v4i32:$S, 0)),
- (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>;
- // For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
- def : Pat<(i32 (vector_extract v4i32:$S, 1)),
- (i32 VectorExtractions.LE_WORD_2)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 2)),
- (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>;
- def : Pat<(i32 (vector_extract v4i32:$S, 3)),
- (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>;
-}
-
-let Predicates = [IsLittleEndian, HasDirectMove] in {
- // v16i8 scalar <-> vector conversions (LE)
- def : Pat<(v16i8 (scalar_to_vector i32:$A)),
- (v16i8 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
- def : Pat<(v8i16 (scalar_to_vector i32:$A)),
- (v8i16 (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC))>;
- def : Pat<(v4i32 (scalar_to_vector i32:$A)),
- (v4i32 MovesToVSR.LE_WORD_0)>;
- def : Pat<(v2i64 (scalar_to_vector i64:$A)),
- (v2i64 MovesToVSR.LE_DWORD_0)>;
- // v2i64 scalar <-> vector conversions (LE)
- def : Pat<(i64 (vector_extract v2i64:$S, 0)),
- (i64 VectorExtractions.LE_DWORD_0)>;
- def : Pat<(i64 (vector_extract v2i64:$S, 1)),
- (i64 VectorExtractions.LE_DWORD_1)>;
- def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
- (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
-} // IsLittleEndian, HasDirectMove
-
-let Predicates = [IsLittleEndian, HasDirectMove, NoP9Altivec] in {
- def : Pat<(i32 (vector_extract v16i8:$S, 0)),
- (i32 VectorExtractions.LE_BYTE_0)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 1)),
- (i32 VectorExtractions.LE_BYTE_1)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 2)),
- (i32 VectorExtractions.LE_BYTE_2)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 3)),
- (i32 VectorExtractions.LE_BYTE_3)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 4)),
- (i32 VectorExtractions.LE_BYTE_4)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 5)),
- (i32 VectorExtractions.LE_BYTE_5)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 6)),
- (i32 VectorExtractions.LE_BYTE_6)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 7)),
- (i32 VectorExtractions.LE_BYTE_7)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 8)),
- (i32 VectorExtractions.LE_BYTE_8)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 9)),
- (i32 VectorExtractions.LE_BYTE_9)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 10)),
- (i32 VectorExtractions.LE_BYTE_10)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 11)),
- (i32 VectorExtractions.LE_BYTE_11)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 12)),
- (i32 VectorExtractions.LE_BYTE_12)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 13)),
- (i32 VectorExtractions.LE_BYTE_13)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 14)),
- (i32 VectorExtractions.LE_BYTE_14)>;
- def : Pat<(i32 (vector_extract v16i8:$S, 15)),
- (i32 VectorExtractions.LE_BYTE_15)>;
- def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
- (i32 VectorExtractions.LE_VARIABLE_BYTE)>;
-
- // v8i16 scalar <-> vector conversions (LE)
- def : Pat<(i32 (vector_extract v8i16:$S, 0)),
- (i32 VectorExtractions.LE_HALF_0)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 1)),
- (i32 VectorExtractions.LE_HALF_1)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 2)),
- (i32 VectorExtractions.LE_HALF_2)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 3)),
- (i32 VectorExtractions.LE_HALF_3)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 4)),
- (i32 VectorExtractions.LE_HALF_4)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 5)),
- (i32 VectorExtractions.LE_HALF_5)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 6)),
- (i32 VectorExtractions.LE_HALF_6)>;
- def : Pat<(i32 (vector_extract v8i16:$S, 7)),
- (i32 VectorExtractions.LE_HALF_7)>;
- def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
- (i32 VectorExtractions.LE_VARIABLE_HALF)>;
-
- // v4i32 scalar <-> vector conversions (LE)
- def : Pat<(i32 (vector_extract v4i32:$S, 0)),
- (i32 VectorExtractions.LE_WORD_0)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 1)),
- (i32 VectorExtractions.LE_WORD_1)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 2)),
- (i32 VectorExtractions.LE_WORD_2)>;
- def : Pat<(i32 (vector_extract v4i32:$S, 3)),
- (i32 VectorExtractions.LE_WORD_3)>;
- def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
- (i32 VectorExtractions.LE_VARIABLE_WORD)>;
-} // IsLittleEndian, HasDirectMove, NoP9Altivec
-
-let Predicates = [HasDirectMove, HasVSX] in {
-// bitconvert f32 -> i32
-// (convert to 32-bit fp single, shift right 1 word, move to GPR)
-def : Pat<(i32 (bitconvert f32:$S)),
- (i32 (MFVSRWZ (EXTRACT_SUBREG
- (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
- sub_64)))>;
-// bitconvert i32 -> f32
-// (move to FPR, shift left 1 word, convert to 64-bit fp single)
-def : Pat<(f32 (bitconvert i32:$A)),
- (f32 (XSCVSPDPN
- (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>;
-
-// bitconvert f64 -> i64
-// (move to GPR, nothing else needed)
-def : Pat<(i64 (bitconvert f64:$S)),
- (i64 (MFVSRD $S))>;
-
-// bitconvert i64 -> f64
-// (move to FPR, nothing else needed)
-def : Pat<(f64 (bitconvert i64:$S)),
- (f64 (MTVSRD $S))>;
-
-// Rounding to integer.
-def : Pat<(i64 (lrint f64:$S)),
- (i64 (MFVSRD (FCTID $S)))>;
-def : Pat<(i64 (lrint f32:$S)),
- (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
-def : Pat<(i64 (llrint f64:$S)),
- (i64 (MFVSRD (FCTID $S)))>;
-def : Pat<(i64 (llrint f32:$S)),
- (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
-def : Pat<(i64 (lround f64:$S)),
- (i64 (MFVSRD (FCTID (XSRDPI $S))))>;
-def : Pat<(i64 (lround f32:$S)),
- (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
-def : Pat<(i64 (llround f64:$S)),
- (i64 (MFVSRD (FCTID (XSRDPI $S))))>;
-def : Pat<(i64 (llround f32:$S)),
- (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
-}
-
-let Predicates = [HasVSX] in {
-// Rounding for single precision.
-def : Pat<(f32 (fround f32:$S)),
- (f32 (COPY_TO_REGCLASS (XSRDPI
- (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f32 (fnearbyint f32:$S)),
- (f32 (COPY_TO_REGCLASS (XSRDPIC
- (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f32 (ffloor f32:$S)),
- (f32 (COPY_TO_REGCLASS (XSRDPIM
- (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f32 (fceil f32:$S)),
- (f32 (COPY_TO_REGCLASS (XSRDPIP
- (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-def : Pat<(f32 (ftrunc f32:$S)),
- (f32 (COPY_TO_REGCLASS (XSRDPIZ
- (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
-}
-
-// Materialize a zero-vector of long long
-def : Pat<(v2i64 immAllZerosV),
- (v2i64 (XXLXORz))>;
-}
-
def AlignValues {
dag F32_TO_BE_WORD1 = (v4f32 (XXSLDWI (XSCVDPSPN $B), (XSCVDPSPN $B), 3));
dag I32_TO_BE_WORD1 = (COPY_TO_REGCLASS (MTVSRWZ $B), VSRC);
}
-// The following VSX instructions were introduced in Power ISA 3.0
-def HasP9Vector : Predicate<"PPCSubTarget->hasP9Vector()">;
-let AddedComplexity = 400, Predicates = [HasP9Vector] in {
-
- // [PO VRT XO VRB XO /]
- class X_VT5_XO5_VB5<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
- list<dag> pattern>
- : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vrrc:$vB),
- !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
-
- // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
- class X_VT5_XO5_VB5_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
- list<dag> pattern>
- : X_VT5_XO5_VB5<opcode, xo2, xo, opc, pattern>, isRecordForm;
-
- // [PO VRT XO VRB XO /], but only the left 64 bits (or fewer) of VRB are used,
- // so we use a different operand class for VRB.
- class X_VT5_XO5_VB5_TyVB<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
- RegisterOperand vbtype, list<dag> pattern>
- : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vrrc:$vT), (ins vbtype:$vB),
- !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
-
- // [PO VRT XO VRB XO /]
- class X_VT5_XO5_VB5_VSFR<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
- list<dag> pattern>
- : X_RD5_XO5_RS5<opcode, xo2, xo, (outs vfrc:$vT), (ins vrrc:$vB),
- !strconcat(opc, " $vT, $vB"), IIC_VecFP, pattern>;
-
- // [PO VRT XO VRB XO RO], Round to Odd version of [PO VRT XO VRB XO /]
- class X_VT5_XO5_VB5_VSFR_Ro<bits<6> opcode, bits<5> xo2, bits<10> xo, string opc,
- list<dag> pattern>
- : X_VT5_XO5_VB5_VSFR<opcode, xo2, xo, opc, pattern>, isRecordForm;
-
- // [PO T XO B XO BX /]
- class XX2_RT5_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
- list<dag> pattern>
- : XX2_RD5_XO5_RS6<opcode, xo2, xo, (outs g8rc:$rT), (ins vsfrc:$XB),
- !strconcat(opc, " $rT, $XB"), IIC_VecFP, pattern>;
-
- // [PO T XO B XO BX TX]
- class XX2_XT6_XO5_XB6<bits<6> opcode, bits<5> xo2, bits<9> xo, string opc,
- RegisterOperand vtype, list<dag> pattern>
- : XX2_RD6_XO5_RS6<opcode, xo2, xo, (outs vtype:$XT), (ins vtype:$XB),
- !strconcat(opc, " $XT, $XB"), IIC_VecFP, pattern>;
-
- // [PO T A B XO AX BX TX], src and dest registers use different operand classes
- class XX3_XT5_XA5_XB5<bits<6> opcode, bits<8> xo, string opc,
- RegisterOperand xty, RegisterOperand aty, RegisterOperand bty,
- InstrItinClass itin, list<dag> pattern>
- : XX3Form<opcode, xo, (outs xty:$XT), (ins aty:$XA, bty:$XB),
- !strconcat(opc, " $XT, $XA, $XB"), itin, pattern>;
-
- // [PO VRT VRA VRB XO /]
- class X_VT5_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
- list<dag> pattern>
- : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vA, vrrc:$vB),
- !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>;
-
- // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
- class X_VT5_VA5_VB5_Ro<bits<6> opcode, bits<10> xo, string opc,
- list<dag> pattern>
- : X_VT5_VA5_VB5<opcode, xo, opc, pattern>, isRecordForm;
-
- // [PO VRT VRA VRB XO /]
- class X_VT5_VA5_VB5_FMA<bits<6> opcode, bits<10> xo, string opc,
- list<dag> pattern>
- : XForm_1<opcode, xo, (outs vrrc:$vT), (ins vrrc:$vTi, vrrc:$vA, vrrc:$vB),
- !strconcat(opc, " $vT, $vA, $vB"), IIC_VecFP, pattern>,
- RegConstraint<"$vTi = $vT">, NoEncode<"$vTi">;
-
- // [PO VRT VRA VRB XO RO], Round to Odd version of [PO VRT VRA VRB XO /]
- class X_VT5_VA5_VB5_FMA_Ro<bits<6> opcode, bits<10> xo, string opc,
- list<dag> pattern>
- : X_VT5_VA5_VB5_FMA<opcode, xo, opc, pattern>, isRecordForm;
-
- //===--------------------------------------------------------------------===//
- // Quad-Precision Scalar Move Instructions:
-
- // Copy Sign
- def XSCPSGNQP : X_VT5_VA5_VB5<63, 100, "xscpsgnqp",
- [(set f128:$vT,
- (fcopysign f128:$vB, f128:$vA))]>;
-
- // Absolute/Negative-Absolute/Negate
- def XSABSQP : X_VT5_XO5_VB5<63, 0, 804, "xsabsqp",
- [(set f128:$vT, (fabs f128:$vB))]>;
- def XSNABSQP : X_VT5_XO5_VB5<63, 8, 804, "xsnabsqp",
- [(set f128:$vT, (fneg (fabs f128:$vB)))]>;
- def XSNEGQP : X_VT5_XO5_VB5<63, 16, 804, "xsnegqp",
- [(set f128:$vT, (fneg f128:$vB))]>;
-
- //===--------------------------------------------------------------------===//
- // Quad-Precision Scalar Floating-Point Arithmetic Instructions:
-
- // Add/Divide/Multiply/Subtract
- let isCommutable = 1 in {
- def XSADDQP : X_VT5_VA5_VB5 <63, 4, "xsaddqp",
- [(set f128:$vT, (fadd f128:$vA, f128:$vB))]>;
- def XSMULQP : X_VT5_VA5_VB5 <63, 36, "xsmulqp",
- [(set f128:$vT, (fmul f128:$vA, f128:$vB))]>;
- }
- def XSSUBQP : X_VT5_VA5_VB5 <63, 516, "xssubqp",
- [(set f128:$vT, (fsub f128:$vA, f128:$vB))]>;
- def XSDIVQP : X_VT5_VA5_VB5 <63, 548, "xsdivqp",
- [(set f128:$vT, (fdiv f128:$vA, f128:$vB))]>;
- // Square-Root
- def XSSQRTQP : X_VT5_XO5_VB5 <63, 27, 804, "xssqrtqp",
- [(set f128:$vT, (fsqrt f128:$vB))]>;
- // (Negative) Multiply-{Add/Subtract}
- def XSMADDQP : X_VT5_VA5_VB5_FMA <63, 388, "xsmaddqp",
- [(set f128:$vT,
- (fma f128:$vA, f128:$vB,
- f128:$vTi))]>;
- def XSMSUBQP : X_VT5_VA5_VB5_FMA <63, 420, "xsmsubqp",
- [(set f128:$vT,
- (fma f128:$vA, f128:$vB,
- (fneg f128:$vTi)))]>;
- def XSNMADDQP : X_VT5_VA5_VB5_FMA <63, 452, "xsnmaddqp",
- [(set f128:$vT,
- (fneg (fma f128:$vA, f128:$vB,
- f128:$vTi)))]>;
- def XSNMSUBQP : X_VT5_VA5_VB5_FMA <63, 484, "xsnmsubqp",
- [(set f128:$vT,
- (fneg (fma f128:$vA, f128:$vB,
- (fneg f128:$vTi))))]>;
-
- let isCommutable = 1 in {
- def XSADDQPO : X_VT5_VA5_VB5_Ro<63, 4, "xsaddqpo",
- [(set f128:$vT,
- (int_ppc_addf128_round_to_odd
- f128:$vA, f128:$vB))]>;
- def XSMULQPO : X_VT5_VA5_VB5_Ro<63, 36, "xsmulqpo",
- [(set f128:$vT,
- (int_ppc_mulf128_round_to_odd
- f128:$vA, f128:$vB))]>;
- }
- def XSSUBQPO : X_VT5_VA5_VB5_Ro<63, 516, "xssubqpo",
- [(set f128:$vT,
- (int_ppc_subf128_round_to_odd
- f128:$vA, f128:$vB))]>;
- def XSDIVQPO : X_VT5_VA5_VB5_Ro<63, 548, "xsdivqpo",
- [(set f128:$vT,
- (int_ppc_divf128_round_to_odd
- f128:$vA, f128:$vB))]>;
- def XSSQRTQPO : X_VT5_XO5_VB5_Ro<63, 27, 804, "xssqrtqpo",
- [(set f128:$vT,
- (int_ppc_sqrtf128_round_to_odd f128:$vB))]>;
-
-
- def XSMADDQPO : X_VT5_VA5_VB5_FMA_Ro<63, 388, "xsmaddqpo",
- [(set f128:$vT,
- (int_ppc_fmaf128_round_to_odd
- f128:$vA,f128:$vB,f128:$vTi))]>;
-
- def XSMSUBQPO : X_VT5_VA5_VB5_FMA_Ro<63, 420, "xsmsubqpo",
- [(set f128:$vT,
- (int_ppc_fmaf128_round_to_odd
- f128:$vA, f128:$vB, (fneg f128:$vTi)))]>;
- def XSNMADDQPO: X_VT5_VA5_VB5_FMA_Ro<63, 452, "xsnmaddqpo",
- [(set f128:$vT,
- (fneg (int_ppc_fmaf128_round_to_odd
- f128:$vA, f128:$vB, f128:$vTi)))]>;
- def XSNMSUBQPO: X_VT5_VA5_VB5_FMA_Ro<63, 484, "xsnmsubqpo",
- [(set f128:$vT,
- (fneg (int_ppc_fmaf128_round_to_odd
- f128:$vA, f128:$vB, (fneg f128:$vTi))))]>;
-
- // Additional fnmsub patterns: -a*b + c == -(a*b - c)
- def : Pat<(fma (fneg f128:$A), f128:$B, f128:$C), (XSNMSUBQP $C, $A, $B)>;
- def : Pat<(fma f128:$A, (fneg f128:$B), f128:$C), (XSNMSUBQP $C, $A, $B)>;
-
- //===--------------------------------------------------------------------===//
- // Quad/Double-Precision Compare Instructions:
-
- // [PO BF // VRA VRB XO /]
- class X_BF3_VA5_VB5<bits<6> opcode, bits<10> xo, string opc,
- list<dag> pattern>
- : XForm_17<opcode, xo, (outs crrc:$crD), (ins vrrc:$VA, vrrc:$VB),
- !strconcat(opc, " $crD, $VA, $VB"), IIC_FPCompare> {
- let Pattern = pattern;
- }
-
- // QP Compare Ordered/Unordered
- def XSCMPOQP : X_BF3_VA5_VB5<63, 132, "xscmpoqp", []>;
- def XSCMPUQP : X_BF3_VA5_VB5<63, 644, "xscmpuqp", []>;
-
- // DP/QP Compare Exponents
- def XSCMPEXPDP : XX3Form_1<60, 59,
- (outs crrc:$crD), (ins vsfrc:$XA, vsfrc:$XB),
- "xscmpexpdp $crD, $XA, $XB", IIC_FPCompare, []>;
- def XSCMPEXPQP : X_BF3_VA5_VB5<63, 164, "xscmpexpqp", []>;
-
- // DP Compare ==, >=, >, !=
- // Use vsrc for XT because the entire XT register is set.
- // XT.dword[1] = 0x0000_0000_0000_0000
- def XSCMPEQDP : XX3_XT5_XA5_XB5<60, 3, "xscmpeqdp", vsrc, vsfrc, vsfrc,
- IIC_FPCompare, []>;
- def XSCMPGEDP : XX3_XT5_XA5_XB5<60, 19, "xscmpgedp", vsrc, vsfrc, vsfrc,
- IIC_FPCompare, []>;
- def XSCMPGTDP : XX3_XT5_XA5_XB5<60, 11, "xscmpgtdp", vsrc, vsfrc, vsfrc,
- IIC_FPCompare, []>;
-
- //===--------------------------------------------------------------------===//
- // Quad-Precision Floating-Point Conversion Instructions:
-
- // Convert DP -> QP
- def XSCVDPQP : X_VT5_XO5_VB5_TyVB<63, 22, 836, "xscvdpqp", vfrc,
- [(set f128:$vT, (fpextend f64:$vB))]>;
-
- // Round & Convert QP -> DP (dword[1] is set to zero)
- def XSCVQPDP : X_VT5_XO5_VB5_VSFR<63, 20, 836, "xscvqpdp" , []>;
- def XSCVQPDPO : X_VT5_XO5_VB5_VSFR_Ro<63, 20, 836, "xscvqpdpo",
- [(set f64:$vT,
- (int_ppc_truncf128_round_to_odd
- f128:$vB))]>;
-
- // Truncate & Convert QP -> (Un)Signed (D)Word (dword[1] is set to zero)
- def XSCVQPSDZ : X_VT5_XO5_VB5<63, 25, 836, "xscvqpsdz", []>;
- def XSCVQPSWZ : X_VT5_XO5_VB5<63, 9, 836, "xscvqpswz", []>;
- def XSCVQPUDZ : X_VT5_XO5_VB5<63, 17, 836, "xscvqpudz", []>;
- def XSCVQPUWZ : X_VT5_XO5_VB5<63, 1, 836, "xscvqpuwz", []>;
-
- // Convert (Un)Signed DWord -> QP.
- def XSCVSDQP : X_VT5_XO5_VB5_TyVB<63, 10, 836, "xscvsdqp", vfrc, []>;
- def : Pat<(f128 (sint_to_fp i64:$src)),
- (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
- def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))),
- (f128 (XSCVSDQP $src))>;
- def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))),
- (f128 (XSCVSDQP (VEXTSW2Ds $src)))>;
-
- def XSCVUDQP : X_VT5_XO5_VB5_TyVB<63, 2, 836, "xscvudqp", vfrc, []>;
- def : Pat<(f128 (uint_to_fp i64:$src)),
- (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
- def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))),
- (f128 (XSCVUDQP $src))>;
-
- // Convert (Un)Signed Word -> QP.
- def : Pat<(f128 (sint_to_fp i32:$src)),
- (f128 (XSCVSDQP (MTVSRWA $src)))>;
- def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))),
- (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
- def : Pat<(f128 (uint_to_fp i32:$src)),
- (f128 (XSCVUDQP (MTVSRWZ $src)))>;
- def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
- (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
-
- //===--------------------------------------------------------------------===//
- // Round to Floating-Point Integer Instructions
-
- // (Round &) Convert DP <-> HP
- // Note: xscvdphp's src and dest registers both use the left 64 bits, so we use
- // vsfrc for both. xscvhpdp's src only uses the left 16 bits, but we still use
- // vsfrc for it.
- def XSCVDPHP : XX2_XT6_XO5_XB6<60, 17, 347, "xscvdphp", vsfrc, []>;
- def XSCVHPDP : XX2_XT6_XO5_XB6<60, 16, 347, "xscvhpdp", vsfrc, []>;
-
- // Vector HP -> SP
- def XVCVHPSP : XX2_XT6_XO5_XB6<60, 24, 475, "xvcvhpsp", vsrc, []>;
- def XVCVSPHP : XX2_XT6_XO5_XB6<60, 25, 475, "xvcvsphp", vsrc,
- [(set v4f32:$XT,
- (int_ppc_vsx_xvcvsphp v4f32:$XB))]>;
-
- // Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
- // separate pattern so that it can convert the input register class from
- // VRRC(v8i16) to VSRC.
- def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
- (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
-
- class Z23_VT5_R1_VB5_RMC2_EX1<bits<6> opcode, bits<8> xo, bit ex, string opc,
- list<dag> pattern>
- : Z23Form_8<opcode, xo,
- (outs vrrc:$vT), (ins u1imm:$r, vrrc:$vB, u2imm:$rmc),
- !strconcat(opc, " $r, $vT, $vB, $rmc"), IIC_VecFP, pattern> {
- let RC = ex;
- }
-
- // Round to Quad-Precision Integer [with Inexact]
- def XSRQPI : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 0, "xsrqpi" , []>;
- def XSRQPIX : Z23_VT5_R1_VB5_RMC2_EX1<63, 5, 1, "xsrqpix", []>;
-
- // Use current rounding mode
- def : Pat<(f128 (fnearbyint f128:$vB)), (f128 (XSRQPI 0, $vB, 3))>;
- // Round to nearest, ties away from zero
- def : Pat<(f128 (fround f128:$vB)), (f128 (XSRQPI 0, $vB, 0))>;
- // Round towards Zero
- def : Pat<(f128 (ftrunc f128:$vB)), (f128 (XSRQPI 1, $vB, 1))>;
- // Round towards +Inf
- def : Pat<(f128 (fceil f128:$vB)), (f128 (XSRQPI 1, $vB, 2))>;
- // Round towards -Inf
- def : Pat<(f128 (ffloor f128:$vB)), (f128 (XSRQPI 1, $vB, 3))>;
-
- // Use current rounding mode, [with Inexact]
- def : Pat<(f128 (frint f128:$vB)), (f128 (XSRQPIX 0, $vB, 3))>;
-
- // Round Quad-Precision to Double-Extended Precision (fp80)
- def XSRQPXP : Z23_VT5_R1_VB5_RMC2_EX1<63, 37, 0, "xsrqpxp", []>;
-
- //===--------------------------------------------------------------------===//
- // Insert/Extract Instructions
-
- // Insert Exponent DP/QP
- // XT NOTE: XT.dword[1] = 0xUUUU_UUUU_UUUU_UUUU
- def XSIEXPDP : XX1Form <60, 918, (outs vsrc:$XT), (ins g8rc:$rA, g8rc:$rB),
- "xsiexpdp $XT, $rA, $rB", IIC_VecFP, []>;
- // vB NOTE: only vB.dword[0] is used, which is why we don't use the
- // X_VT5_VA5_VB5 form.
- def XSIEXPQP : XForm_18<63, 868, (outs vrrc:$vT), (ins vrrc:$vA, vsfrc:$vB),
- "xsiexpqp $vT, $vA, $vB", IIC_VecFP, []>;
-
- def : Pat<(f128 (int_ppc_scalar_insert_exp_qp f128:$vA, i64:$vB)),
- (f128 (XSIEXPQP $vA, (MTVSRD $vB)))>;
-
- // Extract Exponent/Significand DP/QP
- def XSXEXPDP : XX2_RT5_XO5_XB6<60, 0, 347, "xsxexpdp", []>;
- def XSXSIGDP : XX2_RT5_XO5_XB6<60, 1, 347, "xsxsigdp", []>;
-
- def XSXEXPQP : X_VT5_XO5_VB5 <63, 2, 804, "xsxexpqp", []>;
- def XSXSIGQP : X_VT5_XO5_VB5 <63, 18, 804, "xsxsigqp", []>;
-
- def : Pat<(i64 (int_ppc_scalar_extract_expq f128:$vA)),
- (i64 (MFVSRD (EXTRACT_SUBREG
- (v2i64 (XSXEXPQP $vA)), sub_64)))>;
-
- // Vector Insert Word
- // XB NOTE: Only XB.dword[1] is used, but we use vsrc on XB.
- def XXINSERTW :
- XX2_RD6_UIM5_RS6<60, 181, (outs vsrc:$XT),
- (ins vsrc:$XTi, vsrc:$XB, u4imm:$UIM),
- "xxinsertw $XT, $XB, $UIM", IIC_VecFP,
- [(set v4i32:$XT, (PPCvecinsert v4i32:$XTi, v4i32:$XB,
- imm32SExt16:$UIM))]>,
- RegConstraint<"$XTi = $XT">, NoEncode<"$XTi">;
-
- // Vector Extract Unsigned Word
- def XXEXTRACTUW : XX2_RD6_UIM5_RS6<60, 165,
- (outs vsfrc:$XT), (ins vsrc:$XB, u4imm:$UIMM),
- "xxextractuw $XT, $XB, $UIMM", IIC_VecFP, []>;
-
- // Vector Insert Exponent DP/SP
- def XVIEXPDP : XX3_XT5_XA5_XB5<60, 248, "xviexpdp", vsrc, vsrc, vsrc,
- IIC_VecFP, [(set v2f64: $XT,(int_ppc_vsx_xviexpdp v2i64:$XA, v2i64:$XB))]>;
- def XVIEXPSP : XX3_XT5_XA5_XB5<60, 216, "xviexpsp", vsrc, vsrc, vsrc,
- IIC_VecFP, [(set v4f32: $XT,(int_ppc_vsx_xviexpsp v4i32:$XA, v4i32:$XB))]>;
-
- // Vector Extract Exponent/Significand DP/SP
- def XVXEXPDP : XX2_XT6_XO5_XB6<60, 0, 475, "xvxexpdp", vsrc,
- [(set v2i64: $XT,
- (int_ppc_vsx_xvxexpdp v2f64:$XB))]>;
- def XVXEXPSP : XX2_XT6_XO5_XB6<60, 8, 475, "xvxexpsp", vsrc,
- [(set v4i32: $XT,
- (int_ppc_vsx_xvxexpsp v4f32:$XB))]>;
- def XVXSIGDP : XX2_XT6_XO5_XB6<60, 1, 475, "xvxsigdp", vsrc,
- [(set v2i64: $XT,
- (int_ppc_vsx_xvxsigdp v2f64:$XB))]>;
- def XVXSIGSP : XX2_XT6_XO5_XB6<60, 9, 475, "xvxsigsp", vsrc,
- [(set v4i32: $XT,
- (int_ppc_vsx_xvxsigsp v4f32:$XB))]>;
-
- let AddedComplexity = 400, Predicates = [HasP9Vector] in {
- // Extra patterns expanding to vector Extract Word/Insert Word
- def : Pat<(v4i32 (int_ppc_vsx_xxinsertw v4i32:$A, v2i64:$B, imm:$IMM)),
- (v4i32 (XXINSERTW $A, $B, imm:$IMM))>;
- def : Pat<(v2i64 (int_ppc_vsx_xxextractuw v2i64:$A, imm:$IMM)),
- (v2i64 (COPY_TO_REGCLASS (XXEXTRACTUW $A, imm:$IMM), VSRC))>;
- } // AddedComplexity = 400, HasP9Vector
-
- //===--------------------------------------------------------------------===//
-
- // Test Data Class SP/DP/QP
- def XSTSTDCSP : XX2_BF3_DCMX7_RS6<60, 298,
- (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
- "xststdcsp $BF, $XB, $DCMX", IIC_VecFP, []>;
- def XSTSTDCDP : XX2_BF3_DCMX7_RS6<60, 362,
- (outs crrc:$BF), (ins u7imm:$DCMX, vsfrc:$XB),
- "xststdcdp $BF, $XB, $DCMX", IIC_VecFP, []>;
- def XSTSTDCQP : X_BF3_DCMX7_RS5 <63, 708,
- (outs crrc:$BF), (ins u7imm:$DCMX, vrrc:$vB),
- "xststdcqp $BF, $vB, $DCMX", IIC_VecFP, []>;
-
- // Vector Test Data Class SP/DP
- def XVTSTDCSP : XX2_RD6_DCMX7_RS6<60, 13, 5,
- (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
- "xvtstdcsp $XT, $XB, $DCMX", IIC_VecFP,
- [(set v4i32: $XT,
- (int_ppc_vsx_xvtstdcsp v4f32:$XB, timm:$DCMX))]>;
- def XVTSTDCDP : XX2_RD6_DCMX7_RS6<60, 15, 5,
- (outs vsrc:$XT), (ins u7imm:$DCMX, vsrc:$XB),
- "xvtstdcdp $XT, $XB, $DCMX", IIC_VecFP,
- [(set v2i64: $XT,
- (int_ppc_vsx_xvtstdcdp v2f64:$XB, timm:$DCMX))]>;
-
- //===--------------------------------------------------------------------===//
-
- // Maximum/Minimum Type-C/Type-J DP
- def XSMAXCDP : XX3_XT5_XA5_XB5<60, 128, "xsmaxcdp", vsfrc, vsfrc, vsfrc,
- IIC_VecFP,
- [(set f64:$XT, (PPCxsmaxc f64:$XA, f64:$XB))]>;
- def XSMAXJDP : XX3_XT5_XA5_XB5<60, 144, "xsmaxjdp", vsrc, vsfrc, vsfrc,
- IIC_VecFP, []>;
- def XSMINCDP : XX3_XT5_XA5_XB5<60, 136, "xsmincdp", vsfrc, vsfrc, vsfrc,
- IIC_VecFP,
- [(set f64:$XT, (PPCxsminc f64:$XA, f64:$XB))]>;
- def XSMINJDP : XX3_XT5_XA5_XB5<60, 152, "xsminjdp", vsrc, vsfrc, vsfrc,
- IIC_VecFP, []>;
-
- //===--------------------------------------------------------------------===//
-
- // Vector Byte-Reverse H/W/D/Q Word
- def XXBRH : XX2_XT6_XO5_XB6<60, 7, 475, "xxbrh", vsrc, []>;
- def XXBRW : XX2_XT6_XO5_XB6<60, 15, 475, "xxbrw", vsrc,
- [(set v4i32:$XT, (bswap v4i32:$XB))]>;
- def XXBRD : XX2_XT6_XO5_XB6<60, 23, 475, "xxbrd", vsrc,
- [(set v2i64:$XT, (bswap v2i64:$XB))]>;
- def XXBRQ : XX2_XT6_XO5_XB6<60, 31, 475, "xxbrq", vsrc, []>;
-
- // Vector Reverse
- def : Pat<(v8i16 (bswap v8i16 :$A)),
- (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
- def : Pat<(v1i128 (bswap v1i128 :$A)),
- (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
-
- // Vector Permute
- def XXPERM : XX3_XT5_XA5_XB5<60, 26, "xxperm" , vsrc, vsrc, vsrc,
- IIC_VecPerm, []>;
- def XXPERMR : XX3_XT5_XA5_XB5<60, 58, "xxpermr", vsrc, vsrc, vsrc,
- IIC_VecPerm, []>;
-
- // Vector Splat Immediate Byte
- def XXSPLTIB : X_RD6_IMM8<60, 360, (outs vsrc:$XT), (ins u8imm:$IMM8),
- "xxspltib $XT, $IMM8", IIC_VecPerm, []>;
-
- //===--------------------------------------------------------------------===//
- // Vector/Scalar Load/Store Instructions
-
- // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
- // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
- let mayLoad = 1, mayStore = 0, hasSideEffects = 0 in {
- // Load Vector
- def LXV : DQ_RD6_RS5_DQ12<61, 1, (outs vsrc:$XT), (ins memrix16:$src),
- "lxv $XT, $src", IIC_LdStLFD, []>;
- // Load DWord
- def LXSD : DSForm_1<57, 2, (outs vfrc:$vD), (ins memrix:$src),
- "lxsd $vD, $src", IIC_LdStLFD, []>;
- // Load SP from src, convert it to DP, and place in dword[0]
- def LXSSP : DSForm_1<57, 3, (outs vfrc:$vD), (ins memrix:$src),
- "lxssp $vD, $src", IIC_LdStLFD, []>;
-
- // [PO T RA RB XO TX] is almost identical to [PO S RA RB XO SX], but with
- // different "out" and "in" dags.
- class X_XT6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
- RegisterOperand vtype, list<dag> pattern>
- : XX1Form_memOp<opcode, xo, (outs vtype:$XT), (ins memrr:$src),
- !strconcat(opc, " $XT, $src"), IIC_LdStLFD, pattern>;
-
- // Load as Integer Byte/Halfword & Zero Indexed
- def LXSIBZX : X_XT6_RA5_RB5<31, 781, "lxsibzx", vsfrc,
- [(set f64:$XT, (PPClxsizx xoaddr:$src, 1))]>;
- def LXSIHZX : X_XT6_RA5_RB5<31, 813, "lxsihzx", vsfrc,
- [(set f64:$XT, (PPClxsizx xoaddr:$src, 2))]>;
-
- // Load Vector Halfword*8/Byte*16 Indexed
- def LXVH8X : X_XT6_RA5_RB5<31, 812, "lxvh8x" , vsrc, []>;
- def LXVB16X : X_XT6_RA5_RB5<31, 876, "lxvb16x", vsrc, []>;
-
- // Load Vector Indexed
- def LXVX : X_XT6_RA5_RB5<31, 268, "lxvx" , vsrc,
- [(set v2f64:$XT, (load xaddrX16:$src))]>;
- // Load Vector (Left-justified) with Length
- def LXVL : XX1Form_memOp<31, 269, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
- "lxvl $XT, $src, $rB", IIC_LdStLoad,
- [(set v4i32:$XT, (int_ppc_vsx_lxvl addr:$src, i64:$rB))]>;
- def LXVLL : XX1Form_memOp<31,301, (outs vsrc:$XT), (ins memr:$src, g8rc:$rB),
- "lxvll $XT, $src, $rB", IIC_LdStLoad,
- [(set v4i32:$XT, (int_ppc_vsx_lxvll addr:$src, i64:$rB))]>;
-
- // Load Vector Word & Splat Indexed
- def LXVWSX : X_XT6_RA5_RB5<31, 364, "lxvwsx" , vsrc, []>;
- } // mayLoad
-
- // When adding new D-Form loads/stores, be sure to update the ImmToIdxMap in
- // PPCRegisterInfo::PPCRegisterInfo and maybe save yourself some debugging.
- let mayStore = 1, mayLoad = 0, hasSideEffects = 0 in {
- // Store Vector
- def STXV : DQ_RD6_RS5_DQ12<61, 5, (outs), (ins vsrc:$XT, memrix16:$dst),
- "stxv $XT, $dst", IIC_LdStSTFD, []>;
- // Store DWord
- def STXSD : DSForm_1<61, 2, (outs), (ins vfrc:$vS, memrix:$dst),
- "stxsd $vS, $dst", IIC_LdStSTFD, []>;
- // Convert DP of dword[0] to SP, and Store to dst
- def STXSSP : DSForm_1<61, 3, (outs), (ins vfrc:$vS, memrix:$dst),
- "stxssp $vS, $dst", IIC_LdStSTFD, []>;
-
- // [PO S RA RB XO SX]
- class X_XS6_RA5_RB5<bits<6> opcode, bits<10> xo, string opc,
- RegisterOperand vtype, list<dag> pattern>
- : XX1Form_memOp<opcode, xo, (outs), (ins vtype:$XT, memrr:$dst),
- !strconcat(opc, " $XT, $dst"), IIC_LdStSTFD, pattern>;
-
- // Store as Integer Byte/Halfword Indexed
- def STXSIBX : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsfrc,
- [(PPCstxsix f64:$XT, xoaddr:$dst, 1)]>;
- def STXSIHX : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsfrc,
- [(PPCstxsix f64:$XT, xoaddr:$dst, 2)]>;
- let isCodeGenOnly = 1 in {
- def STXSIBXv : X_XS6_RA5_RB5<31, 909, "stxsibx" , vsrc, []>;
- def STXSIHXv : X_XS6_RA5_RB5<31, 941, "stxsihx" , vsrc, []>;
- }
-
- // Store Vector Halfword*8/Byte*16 Indexed
- def STXVH8X : X_XS6_RA5_RB5<31, 940, "stxvh8x" , vsrc, []>;
- def STXVB16X : X_XS6_RA5_RB5<31, 1004, "stxvb16x", vsrc, []>;
-
- // Store Vector Indexed
- def STXVX : X_XS6_RA5_RB5<31, 396, "stxvx" , vsrc,
- [(store v2f64:$XT, xaddrX16:$dst)]>;
-
- // Store Vector (Left-justified) with Length
- def STXVL : XX1Form_memOp<31, 397, (outs),
- (ins vsrc:$XT, memr:$dst, g8rc:$rB),
- "stxvl $XT, $dst, $rB", IIC_LdStLoad,
- [(int_ppc_vsx_stxvl v4i32:$XT, addr:$dst,
- i64:$rB)]>;
- def STXVLL : XX1Form_memOp<31, 429, (outs),
- (ins vsrc:$XT, memr:$dst, g8rc:$rB),
- "stxvll $XT, $dst, $rB", IIC_LdStLoad,
- [(int_ppc_vsx_stxvll v4i32:$XT, addr:$dst,
- i64:$rB)]>;
- } // mayStore
-
- let Predicates = [IsLittleEndian] in {
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
- }
-
- let Predicates = [IsBigEndian] in {
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
- def: Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
- (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
- def: Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
- (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
- }
-
- // Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
- // of f64
- def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
- (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
- def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
- (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
-
- // Patterns for which instructions from ISA 3.0 are a better match
- let Predicates = [IsLittleEndian, HasP9Vector] in {
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
-
- def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)),
- (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>;
- def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst),
- (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
-
- def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)),
- (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>;
- def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst),
- (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
- } // IsLittleEndian, HasP9Vector
-
- let Predicates = [IsBigEndian, HasP9Vector] in {
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
- (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
- def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
- (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
- def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
- (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
- def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
- (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
- } // IsBigEndian, HasP9Vector
-
- // D-Form Load/Store
- def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
- def : Pat<(v4f32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
- def : Pat<(v2i64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
- def : Pat<(v2f64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
- def : Pat<(f128 (quadwOffsetLoad iaddrX16:$src)),
- (COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddrX16:$src)), (LXV memrix16:$src)>;
- def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddrX16:$src)), (LXV memrix16:$src)>;
-
- def : Pat<(quadwOffsetStore v4f32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(quadwOffsetStore v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(quadwOffsetStore v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(quadwOffsetStore f128:$rS, iaddrX16:$dst),
- (STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
- def : Pat<(quadwOffsetStore v2i64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddrX16:$dst),
- (STXV $rS, memrix16:$dst)>;
- def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddrX16:$dst),
- (STXV $rS, memrix16:$dst)>;
-
-
- def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
- def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)),
- (COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
- def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst),
- (STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
- def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
- def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
- (STXVX $rS, xoaddr:$dst)>;
-
- let AddedComplexity = 400 in {
- // LIWAX - This instruction is used for sign extending i32 -> i64.
- // LIWZX - This instruction will be emitted for i32, f32, and when
- // zero-extending i32 to i64 (zext i32 -> i64).
- let Predicates = [IsLittleEndian] in {
-
- def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
- (v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC), 2))>;
-
- def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
- (v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
-
- def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
- (v4i32 (XXPERMDIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
-
- def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
- (v4f32 (XXPERMDIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 2))>;
- }
-
- let Predicates = [IsBigEndian] in {
- def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
- (v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>;
-
- def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
- (v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>;
-
- def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
- (v4i32 (XXSLDWIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
-
- def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
- (v4f32 (XXSLDWIs
- (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
- }
-
- }
-
- // Build vectors from i8 loads
- def : Pat<(v16i8 (scalar_to_vector ScalarLoads.Li8)),
- (v16i8 (VSPLTBs 7, (LXSIBZX xoaddr:$src)))>;
- def : Pat<(v8i16 (scalar_to_vector ScalarLoads.ZELi8)),
- (v8i16 (VSPLTHs 3, (LXSIBZX xoaddr:$src)))>;
- def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi8)),
- (v4i32 (XXSPLTWs (LXSIBZX xoaddr:$src), 1))>;
- def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi8i64)),
- (v2i64 (XXPERMDIs (LXSIBZX xoaddr:$src), 0))>;
- def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi8)),
- (v4i32 (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1))>;
- def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi8i64)),
- (v2i64 (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0))>;
-
- // Build vectors from i16 loads
- def : Pat<(v8i16 (scalar_to_vector ScalarLoads.Li16)),
- (v8i16 (VSPLTHs 3, (LXSIHZX xoaddr:$src)))>;
- def : Pat<(v4i32 (scalar_to_vector ScalarLoads.ZELi16)),
- (v4i32 (XXSPLTWs (LXSIHZX xoaddr:$src), 1))>;
- def : Pat<(v2i64 (scalar_to_vector ScalarLoads.ZELi16i64)),
- (v2i64 (XXPERMDIs (LXSIHZX xoaddr:$src), 0))>;
- def : Pat<(v4i32 (scalar_to_vector ScalarLoads.SELi16)),
- (v4i32 (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1))>;
- def : Pat<(v2i64 (scalar_to_vector ScalarLoads.SELi16i64)),
- (v2i64 (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0))>;
-
- let Predicates = [IsBigEndian, HasP9Vector] in {
- // Scalar stores of i8
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
-
- // Scalar stores of i16
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
- } // IsBigEndian, HasP9Vector
-
- let Predicates = [IsLittleEndian, HasP9Vector] in {
- // Scalar stores of i8
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
- (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
-
- // Scalar stores of i16
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
- def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
- (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
- } // IsLittleEndian, HasP9Vector
-
-
- // Vector sign extensions
- def : Pat<(f64 (PPCVexts f64:$A, 1)),
- (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>;
- def : Pat<(f64 (PPCVexts f64:$A, 2)),
- (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
-
- def DFLOADf32 : PPCPostRAExpPseudo<(outs vssrc:$XT), (ins memrix:$src),
- "#DFLOADf32",
- [(set f32:$XT, (load iaddrX4:$src))]>;
- def DFLOADf64 : PPCPostRAExpPseudo<(outs vsfrc:$XT), (ins memrix:$src),
- "#DFLOADf64",
- [(set f64:$XT, (load iaddrX4:$src))]>;
- def DFSTOREf32 : PPCPostRAExpPseudo<(outs), (ins vssrc:$XT, memrix:$dst),
- "#DFSTOREf32",
- [(store f32:$XT, iaddrX4:$dst)]>;
- def DFSTOREf64 : PPCPostRAExpPseudo<(outs), (ins vsfrc:$XT, memrix:$dst),
- "#DFSTOREf64",
- [(store f64:$XT, iaddrX4:$dst)]>;
-
- def : Pat<(f64 (extloadf32 iaddrX4:$src)),
- (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$src), VSFRC)>;
- def : Pat<(f32 (fpround (f64 (extloadf32 iaddrX4:$src)))),
- (f32 (DFLOADf32 iaddrX4:$src))>;
-
- def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)),
- (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>;
- def : Pat<(v4f32 (PPCldvsxlh iaddrX4:$src)),
- (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC)>;
-
- let AddedComplexity = 400 in {
- // The following pseudoinstructions are used to ensure the utilization
- // of all 64 VSX registers.
- let Predicates = [IsLittleEndian, HasP9Vector] in {
- def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
- (v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>;
- def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
- (v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>;
-
- def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
- (v2f64 (XXPERMDIs
- (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC), 2))>;
- def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
- (v2f64 (XXPERMDIs
- (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC), 2))>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), iaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- iaddrX4:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
- } // IsLittleEndian, HasP9Vector
-
- let Predicates = [IsBigEndian, HasP9Vector] in {
- def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
- (v2i64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
- def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
- (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
-
- def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
- (v2f64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
- def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
- (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), xaddrX4:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), iaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
- sub_64), iaddrX4:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
- (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
- } // IsBigEndian, HasP9Vector
- }
-
- let Predicates = [IsBigEndian, HasP9Vector] in {
-
- // (Un)Signed DWord vector extract -> QP
- def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
- (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
- def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
- (f128 (XSCVSDQP
- (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
- def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
- (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
- def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
- (f128 (XSCVUDQP
- (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
-
- // (Un)Signed Word vector extract -> QP
- def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 1)))),
- (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
- foreach Idx = [0,2,3] in {
- def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
- (f128 (XSCVSDQP (EXTRACT_SUBREG
- (VEXTSW2D (VSPLTW Idx, $src)), sub_64)))>;
- }
- foreach Idx = 0-3 in {
- def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
- (f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>;
- }
-
- // (Un)Signed HWord vector extract -> QP
- foreach Idx = 0-7 in {
- def : Pat<(f128 (sint_to_fp
- (i32 (sext_inreg
- (vector_extract v8i16:$src, Idx), i16)))),
- (f128 (XSCVSDQP (EXTRACT_SUBREG
- (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
- sub_64)))>;
- // The SDAG adds the `and` since an `i16` is being extracted as an `i32`.
- def : Pat<(f128 (uint_to_fp
- (and (i32 (vector_extract v8i16:$src, Idx)), 65535))),
- (f128 (XSCVUDQP (EXTRACT_SUBREG
- (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
- }
-
- // (Un)Signed Byte vector extract -> QP
- foreach Idx = 0-15 in {
- def : Pat<(f128 (sint_to_fp
- (i32 (sext_inreg (vector_extract v16i8:$src, Idx),
- i8)))),
- (f128 (XSCVSDQP (EXTRACT_SUBREG
- (VEXTSB2D (VEXTRACTUB Idx, $src)), sub_64)))>;
- def : Pat<(f128 (uint_to_fp
- (and (i32 (vector_extract v16i8:$src, Idx)), 255))),
- (f128 (XSCVUDQP
- (EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
- }
-
-  // Unsigned int in vsx register -> QP
- def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
- (f128 (XSCVUDQP
- (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
- } // IsBigEndian, HasP9Vector
-
- let Predicates = [IsLittleEndian, HasP9Vector] in {
-
- // (Un)Signed DWord vector extract -> QP
- def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
- (f128 (XSCVSDQP
- (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
- def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
- (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
- def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
- (f128 (XSCVUDQP
- (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
- def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
- (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
-
- // (Un)Signed Word vector extract -> QP
- foreach Idx = [[0,3],[1,2],[3,0]] in {
- def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
- (f128 (XSCVSDQP (EXTRACT_SUBREG
- (VEXTSW2D (VSPLTW !head(!tail(Idx)), $src)),
- sub_64)))>;
- }
- def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 2)))),
- (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
-
- foreach Idx = [[0,12],[1,8],[2,4],[3,0]] in {
- def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
- (f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
- }
-
- // (Un)Signed HWord vector extract -> QP
-  // The nested foreach lists identify the vector element and the corresponding
-  // register byte location.
- foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
- def : Pat<(f128 (sint_to_fp
- (i32 (sext_inreg
- (vector_extract v8i16:$src, !head(Idx)), i16)))),
- (f128 (XSCVSDQP
- (EXTRACT_SUBREG (VEXTSH2D
- (VEXTRACTUH !head(!tail(Idx)), $src)),
- sub_64)))>;
- def : Pat<(f128 (uint_to_fp
- (and (i32 (vector_extract v8i16:$src, !head(Idx))),
- 65535))),
- (f128 (XSCVUDQP (EXTRACT_SUBREG
- (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
- }
-
- // (Un)Signed Byte vector extract -> QP
- foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
- [9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
- def : Pat<(f128 (sint_to_fp
- (i32 (sext_inreg
- (vector_extract v16i8:$src, !head(Idx)), i8)))),
- (f128 (XSCVSDQP
- (EXTRACT_SUBREG
- (VEXTSB2D (VEXTRACTUB !head(!tail(Idx)), $src)),
- sub_64)))>;
- def : Pat<(f128 (uint_to_fp
- (and (i32 (vector_extract v16i8:$src, !head(Idx))),
- 255))),
- (f128 (XSCVUDQP
- (EXTRACT_SUBREG
- (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
- }
-
-  // Unsigned int in vsx register -> QP
- def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
- (f128 (XSCVUDQP
- (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 8)))>;
- } // IsLittleEndian, HasP9Vector
-
- // Convert (Un)Signed DWord in memory -> QP
- def : Pat<(f128 (sint_to_fp (i64 (load xaddrX4:$src)))),
- (f128 (XSCVSDQP (LXSDX xaddrX4:$src)))>;
- def : Pat<(f128 (sint_to_fp (i64 (load iaddrX4:$src)))),
- (f128 (XSCVSDQP (LXSD iaddrX4:$src)))>;
- def : Pat<(f128 (uint_to_fp (i64 (load xaddrX4:$src)))),
- (f128 (XSCVUDQP (LXSDX xaddrX4:$src)))>;
- def : Pat<(f128 (uint_to_fp (i64 (load iaddrX4:$src)))),
- (f128 (XSCVUDQP (LXSD iaddrX4:$src)))>;
-
- // Convert Unsigned HWord in memory -> QP
- def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
- (f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
-
- // Convert Unsigned Byte in memory -> QP
- def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
- (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
-
- // Truncate & Convert QP -> (Un)Signed (D)Word.
- def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
- def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
- def : Pat<(i32 (fp_to_sint f128:$src)),
- (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
- def : Pat<(i32 (fp_to_uint f128:$src)),
- (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
-
- // Instructions for store(fptosi).
- // The 8-byte version is repeated here due to availability of D-Form STXSD.
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddrX4:$dst, 8),
- (STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
- xaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), iaddrX4:$dst, 8),
- (STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
- iaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
- (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2),
- (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
- (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddrX4:$dst, 8),
- (STXSDX (XSCVDPSXDS f64:$src), xaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), iaddrX4:$dst, 8),
- (STXSD (XSCVDPSXDS f64:$src), iaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
- (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1),
- (STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
-
- // Instructions for store(fptoui).
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddrX4:$dst, 8),
- (STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
- xaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), iaddrX4:$dst, 8),
- (STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
- iaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
- (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2),
- (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
- (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddrX4:$dst, 8),
- (STXSDX (XSCVDPUXDS f64:$src), xaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), iaddrX4:$dst, 8),
- (STXSD (XSCVDPUXDS f64:$src), iaddrX4:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
- (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
- def : Pat<(PPCstore_scal_int_from_vsr
- (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1),
- (STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
-
- // Round & Convert QP -> DP/SP
- def : Pat<(f64 (fpround f128:$src)), (f64 (XSCVQPDP $src))>;
- def : Pat<(f32 (fpround f128:$src)), (f32 (XSRSP (XSCVQPDPO $src)))>;
-
- // Convert SP -> QP
- def : Pat<(f128 (fpextend f32:$src)),
- (f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>;
-
- def : Pat<(f32 (PPCxsmaxc f32:$XA, f32:$XB)),
- (f32 (COPY_TO_REGCLASS (XSMAXCDP (COPY_TO_REGCLASS $XA, VSSRC),
- (COPY_TO_REGCLASS $XB, VSSRC)),
- VSSRC))>;
- def : Pat<(f32 (PPCxsminc f32:$XA, f32:$XB)),
- (f32 (COPY_TO_REGCLASS (XSMINCDP (COPY_TO_REGCLASS $XA, VSSRC),
- (COPY_TO_REGCLASS $XB, VSSRC)),
- VSSRC))>;
-
-} // end HasP9Vector, AddedComplexity
-
-let AddedComplexity = 400 in {
- let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsBigEndian] in {
- def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)),
- (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
- }
- let Predicates = [IsISA3_0, HasP9Vector, HasDirectMove, IsLittleEndian] in {
- def : Pat<(f128 (PPCbuild_fp128 i64:$rA, i64:$rB)),
- (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
- }
-}
-
-let Predicates = [HasP9Vector], hasSideEffects = 0 in {
- let mayStore = 1 in {
- def SPILLTOVSR_STX : PseudoXFormMemOp<(outs),
- (ins spilltovsrrc:$XT, memrr:$dst),
- "#SPILLTOVSR_STX", []>;
- def SPILLTOVSR_ST : PPCPostRAExpPseudo<(outs), (ins spilltovsrrc:$XT, memrix:$dst),
- "#SPILLTOVSR_ST", []>;
- }
- let mayLoad = 1 in {
- def SPILLTOVSR_LDX : PseudoXFormMemOp<(outs spilltovsrrc:$XT),
- (ins memrr:$src),
- "#SPILLTOVSR_LDX", []>;
- def SPILLTOVSR_LD : PPCPostRAExpPseudo<(outs spilltovsrrc:$XT), (ins memrix:$src),
- "#SPILLTOVSR_LD", []>;
-
- }
-}
// Integer extend helper dags 32 -> 64
def AnyExts {
dag A = (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $A, sub_32);
@@ -3830,10 +2186,10 @@ def AnyExts {
}
def DblToFlt {
- dag A0 = (f32 (fpround (f64 (extractelt v2f64:$A, 0))));
- dag A1 = (f32 (fpround (f64 (extractelt v2f64:$A, 1))));
- dag B0 = (f32 (fpround (f64 (extractelt v2f64:$B, 0))));
- dag B1 = (f32 (fpround (f64 (extractelt v2f64:$B, 1))));
+ dag A0 = (f32 (any_fpround (f64 (extractelt v2f64:$A, 0))));
+ dag A1 = (f32 (any_fpround (f64 (extractelt v2f64:$A, 1))));
+ dag B0 = (f32 (any_fpround (f64 (extractelt v2f64:$B, 0))));
+ dag B1 = (f32 (any_fpround (f64 (extractelt v2f64:$B, 1))));
}
def ExtDbl {
@@ -4024,397 +2380,2261 @@ def MrgWords {
dag CVCAU = (v4i32 (XVCVDPUXWS CA));
}
-// Patterns for BUILD_VECTOR nodes.
+//---------------------------- Anonymous Patterns ----------------------------//
+// Predicate combinations are kept in roughly chronological order in terms of
+// instruction availability in the architecture. For example, VSX came in with
+// ISA 2.06 (Power7). There have since been additions in ISA 2.07 (Power8) and
+// ISA 3.0 (Power9). However, the granularity of features on later subtargets
+// is finer for various reasons. For example, we have Power8Vector,
+// Power8Altivec, DirectMove that all came in with ISA 2.07. The situation is
+// similar with ISA 3.0 with Power9Vector, Power9Altivec, IsISA3_0. Then there
+// are orthogonal predicates such as endianness for which the order was
+// arbitrarily chosen to be Big, Little.
+//
+// Predicate combinations available:
+// [HasVSX]
+// [HasVSX, IsBigEndian]
+// [HasVSX, IsLittleEndian]
+// [HasVSX, NoP9Vector]
+// [HasVSX, HasOnlySwappingMemOps]
+// [HasVSX, HasOnlySwappingMemOps, IsBigEndian]
+// [HasVSX, HasP8Vector]
+// [HasVSX, HasP8Vector, IsBigEndian]
+// [HasVSX, HasP8Vector, IsLittleEndian]
+// [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian]
+// [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian]
+// [HasVSX, HasDirectMove]
+// [HasVSX, HasDirectMove, IsBigEndian]
+// [HasVSX, HasDirectMove, IsLittleEndian]
+// [HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian]
+// [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian]
+// [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian]
+// [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian]
+// [HasVSX, HasP9Vector]
+// [HasVSX, HasP9Vector, IsBigEndian]
+// [HasVSX, HasP9Vector, IsLittleEndian]
+// [HasVSX, HasP9Altivec]
+// [HasVSX, HasP9Altivec, IsBigEndian]
+// [HasVSX, HasP9Altivec, IsLittleEndian]
+// [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian]
+// [HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian]
+
let AddedComplexity = 400 in {
+// Valid for any VSX subtarget, regardless of endianness.
+let Predicates = [HasVSX] in {
+def : Pat<(v4i32 (vnot_ppc v4i32:$A)),
+ (v4i32 (XXLNOR $A, $A))>;
+def : Pat<(v4i32 (or (and (vnot_ppc v4i32:$C), v4i32:$A),
+ (and v4i32:$B, v4i32:$C))),
+ (v4i32 (XXSEL $A, $B, $C))>;
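+// XXSEL is a bitwise select: each result bit comes from $B where the
+// corresponding bit of the mask $C is set and from $A otherwise, which is
+// exactly the (A & ~C) | (B & C) dag matched above.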
- let Predicates = [HasVSX] in {
- // Build vectors of floating point converted to i32.
- def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A,
- DblToInt.A, DblToInt.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>;
- def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A,
- DblToUInt.A, DblToUInt.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>;
- def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
- (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC),
- (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>;
- def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
- (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC),
- (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
- def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
- def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1))>;
- def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
- (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
- def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
- (v2f64 (LXVDSX xoaddr:$A))>;
- def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
- (v2i64 (LXVDSX xoaddr:$A))>;
-
- // Build vectors of floating point converted to i64.
- def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
- (v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
- def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)),
- (v2i64 (XXPERMDIs
- (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
- def : Pat<(v2i64 (scalar_to_vector DblToLongLoad.A)),
- (v2i64 (XVCVDPSXDS (LXVDSX xoaddr:$A)))>;
- def : Pat<(v2i64 (scalar_to_vector DblToULongLoad.A)),
- (v2i64 (XVCVDPUXDS (LXVDSX xoaddr:$A)))>;
- }
+// Additional fnmsub patterns for the PPC-specific ISD opcode.
+def : Pat<(PPCfnmsub f64:$A, f64:$B, f64:$C),
+ (XSNMSUBADP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub f64:$A, f64:$B, f64:$C)),
+ (XSMSUBADP $C, $A, $B)>;
+def : Pat<(PPCfnmsub f64:$A, f64:$B, (fneg f64:$C)),
+ (XSNMADDADP $C, $A, $B)>;
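+// These mappings follow from PPCfnmsub(A, B, C) computing -(A * B - C):
+// negating the whole expression gives a plain msub, and negating C gives
+// -(A * B + C), i.e. an nmadd. The same algebra drives the v2f64 and v4f32
+// patterns below.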
- let Predicates = [HasVSX, NoP9Vector] in {
- // Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads).
- def : Pat<(v4i32 (scalar_to_vector DblToIntLoad.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>;
- def : Pat<(v4i32 (scalar_to_vector DblToUIntLoad.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1))>;
- def : Pat<(v2i64 (scalar_to_vector FltToLongLoad.A)),
- (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
- (XFLOADf32 xoaddr:$A), VSFRC)), 0))>;
- def : Pat<(v2i64 (scalar_to_vector FltToULongLoad.A)),
- (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
- (XFLOADf32 xoaddr:$A), VSFRC)), 0))>;
- }
+def : Pat<(PPCfnmsub v2f64:$A, v2f64:$B, v2f64:$C),
+ (XVNMSUBADP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub v2f64:$A, v2f64:$B, v2f64:$C)),
+ (XVMSUBADP $C, $A, $B)>;
+def : Pat<(PPCfnmsub v2f64:$A, v2f64:$B, (fneg v2f64:$C)),
+ (XVNMADDADP $C, $A, $B)>;
- let Predicates = [IsBigEndian, HasP8Vector] in {
- def : Pat<DWToSPExtractConv.BVU,
- (v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3),
- (XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3)))>;
- def : Pat<DWToSPExtractConv.BVS,
- (v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3),
- (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3)))>;
- def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
- def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-
- // Elements in a register on a BE system are in order <0, 1, 2, 3>.
- // The store instructions store the second word from the left.
- // So to align element zero, we need to modulo-left-shift by 3 words.
- // Similar logic applies for elements 2 and 3.
- foreach Idx = [ [0,3], [2,1], [3,2] ] in {
- def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
- def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
- }
- }
+def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C),
+ (XVNMSUBASP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub v4f32:$A, v4f32:$B, v4f32:$C)),
+ (XVMSUBASP $C, $A, $B)>;
+def : Pat<(PPCfnmsub v4f32:$A, v4f32:$B, (fneg v4f32:$C)),
+ (XVNMADDASP $C, $A, $B)>;
- let Predicates = [HasP8Vector, IsBigEndian, NoP9Vector] in {
- def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
- }
-
- // Big endian, available on all targets with VSX
- let Predicates = [IsBigEndian, HasVSX] in {
- def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
- (v2f64 (XXPERMDI
- (COPY_TO_REGCLASS $A, VSRC),
- (COPY_TO_REGCLASS $B, VSRC), 0))>;
- // Using VMRGEW to assemble the final vector would be a lower latency
- // solution. However, we choose to go with the slightly higher latency
- // XXPERMDI for 2 reasons:
- // 1. This is likely to occur in unrolled loops where regpressure is high,
- // so we want to use the latter as it has access to all 64 VSX registers.
- // 2. Using Altivec instructions in this sequence would likely cause the
- // allocation of Altivec registers even for the loads which in turn would
- // force the use of LXSIWZX for the loads, adding a cycle of latency to
- // each of the loads which would otherwise be able to use LFIWZX.
- def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
- (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B),
- (XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>;
- def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
- (VMRGEW MrgFP.AC, MrgFP.BD)>;
- def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
- DblToFlt.B0, DblToFlt.B1)),
- (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
-
- // Convert 4 doubles to a vector of ints.
- def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
- DblToInt.C, DblToInt.D)),
- (v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>;
- def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
- DblToUInt.C, DblToUInt.D)),
- (v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>;
- def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
- ExtDbl.B0S, ExtDbl.B1S)),
- (v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>;
- def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
- ExtDbl.B0U, ExtDbl.B1U)),
- (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>;
- }
+def : Pat<(v2f64 (bitconvert v4f32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v4i32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v8i16:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2f64 (bitconvert v16i8:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
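+// Bitcasts between the 128-bit vector types do not change any bits, so they
+// are expressed purely as register-class copies (VSRC <-> VRRC) that the
+// register allocator can normally coalesce away.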
- let Predicates = [IsLittleEndian, HasP8Vector] in {
- def : Pat<DWToSPExtractConv.BVU,
- (v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3),
- (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3)))>;
- def : Pat<DWToSPExtractConv.BVS,
- (v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3),
- (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3)))>;
- def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
- def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
-
- // Elements in a register on a LE system are in order <3, 2, 1, 0>.
- // The store instructions store the second word from the left.
- // So to align element 3, we need to modulo-left-shift by 3 words.
- // Similar logic applies for elements 0 and 1.
- foreach Idx = [ [0,2], [1,1], [3,3] ] in {
- def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
- def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
- (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
- sub_64), xoaddr:$src)>;
- }
- }
+def : Pat<(v4f32 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
- let Predicates = [HasP8Vector, IsLittleEndian, NoP9Vector] in {
- def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
- xoaddr:$src)>;
- def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
- def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
- (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
- }
-
- let Predicates = [IsLittleEndian, HasVSX] in {
- // Little endian, available on all targets with VSX
- def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
- (v2f64 (XXPERMDI
- (COPY_TO_REGCLASS $B, VSRC),
- (COPY_TO_REGCLASS $A, VSRC), 0))>;
- // Using VMRGEW to assemble the final vector would be a lower latency
- // solution. However, we choose to go with the slightly higher latency
- // XXPERMDI for 2 reasons:
- // 1. This is likely to occur in unrolled loops where regpressure is high,
- // so we want to use the latter as it has access to all 64 VSX registers.
- // 2. Using Altivec instructions in this sequence would likely cause the
- // allocation of Altivec registers even for the loads which in turn would
- // force the use of LXSIWZX for the loads, adding a cycle of latency to
- // each of the loads which would otherwise be able to use LFIWZX.
- def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
- (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C),
- (XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>;
- def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
- (VMRGEW MrgFP.AC, MrgFP.BD)>;
- def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
- DblToFlt.B0, DblToFlt.B1)),
- (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
-
- // Convert 4 doubles to a vector of ints.
- def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
- DblToInt.C, DblToInt.D)),
- (v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>;
- def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
- DblToUInt.C, DblToUInt.D)),
- (v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>;
- def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
- ExtDbl.B0S, ExtDbl.B1S)),
- (v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>;
- def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
- ExtDbl.B0U, ExtDbl.B1U)),
- (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>;
- }
+def : Pat<(v2i64 (bitconvert v4f32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v4i32:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v8i16:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
+def : Pat<(v2i64 (bitconvert v16i8:$A)),
+ (COPY_TO_REGCLASS $A, VSRC)>;
- let Predicates = [HasDirectMove] in {
- // Endianness-neutral constant splat on P8 and newer targets. The reason
- // for this pattern is that on targets with direct moves, we don't expand
- // BUILD_VECTOR nodes for v4i32.
- def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
- immSExt5NonZero:$A, immSExt5NonZero:$A)),
- (v4i32 (VSPLTISW imm:$A))>;
- }
+def : Pat<(v4f32 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
- let Predicates = [IsBigEndian, HasDirectMove, NoP9Vector] in {
- // Big endian integer vectors using direct moves.
- def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
- (v2i64 (XXPERMDI
- (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
- (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (XXPERMDI
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), VSRC),
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
- (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
- }
+def : Pat<(v2f64 (bitconvert v2i64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v2i64 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
- let Predicates = [IsLittleEndian, HasDirectMove, NoP9Vector] in {
- // Little endian integer vectors using direct moves.
- def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
- (v2i64 (XXPERMDI
- (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
- (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (XXPERMDI
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), VSRC),
- (COPY_TO_REGCLASS
- (MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), VSRC), 0)>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
- (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
- }
+def : Pat<(v2f64 (bitconvert v1i128:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v1i128 (bitconvert v2f64:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
- let Predicates = [HasP8Vector] in {
- def : Pat<(v1i128 (bitconvert (v16i8 immAllOnesV))),
- (v1i128 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
- def : Pat<(v2i64 (bitconvert (v16i8 immAllOnesV))),
- (v2i64 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
- def : Pat<(v8i16 (bitconvert (v16i8 immAllOnesV))),
- (v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
- def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))),
- (v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
- }
+def : Pat<(v2i64 (bitconvert f128:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v4i32 (bitconvert f128:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v8i16 (bitconvert f128:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
+def : Pat<(v16i8 (bitconvert f128:$A)),
+ (COPY_TO_REGCLASS $A, VRRC)>;
- let Predicates = [HasP9Vector] in {
- // Endianness-neutral patterns for const splats with ISA 3.0 instructions.
- def : Pat<(v4i32 (scalar_to_vector i32:$A)),
- (v4i32 (MTVSRWS $A))>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
- (v4i32 (MTVSRWS $A))>;
- def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
- immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)),
- (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
- def : Pat<(v4i32 (scalar_to_vector FltToIntLoad.A)),
- (v4i32 (XVCVSPSXWS (LXVWSX xoaddr:$A)))>;
- def : Pat<(v4i32 (scalar_to_vector FltToUIntLoad.A)),
- (v4i32 (XVCVSPUXWS (LXVWSX xoaddr:$A)))>;
- def : Pat<(v4i32 (scalar_to_vector DblToIntLoadP9.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>;
- def : Pat<(v4i32 (scalar_to_vector DblToUIntLoadP9.A)),
- (v4i32 (XXSPLTW (COPY_TO_REGCLASS
- (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1))>;
- def : Pat<(v2i64 (scalar_to_vector FltToLongLoadP9.A)),
- (v2i64 (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS
- (DFLOADf32 iaddrX4:$A),
- VSFRC)), 0))>;
- def : Pat<(v2i64 (scalar_to_vector FltToULongLoadP9.A)),
- (v2i64 (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS
- (DFLOADf32 iaddrX4:$A),
- VSFRC)), 0))>;
- def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
- (v4f32 (LXVWSX xoaddr:$A))>;
- def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
- (v4i32 (LXVWSX xoaddr:$A))>;
- }
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 0)),
+ (v2f64 (XVCVSXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCsvec2fp v4i32:$C, 1)),
+ (v2f64 (XVCVSXWDP (v2i64 (XXMRGLW $C, $C))))>;
- let Predicates = [IsISA3_0, HasDirectMove, IsBigEndian] in {
- def : Pat<(i64 (extractelt v2i64:$A, 1)),
- (i64 (MFVSRLD $A))>;
- // Better way to build integer vectors if we have MTVSRDD. Big endian.
- def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
- (v2i64 (MTVSRDD $rB, $rA))>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (MTVSRDD
- (RLDIMI AnyExts.B, AnyExts.A, 32, 0),
- (RLDIMI AnyExts.D, AnyExts.C, 32, 0))>;
- }
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 0)),
+ (v2f64 (XVCVUXWDP (v2i64 (XXMRGHW $C, $C))))>;
+def : Pat<(v2f64 (PPCuvec2fp v4i32:$C, 1)),
+ (v2f64 (XVCVUXWDP (v2i64 (XXMRGLW $C, $C))))>;
- let Predicates = [IsISA3_0, HasDirectMove, IsLittleEndian] in {
- def : Pat<(i64 (extractelt v2i64:$A, 0)),
- (i64 (MFVSRLD $A))>;
- // Better way to build integer vectors if we have MTVSRDD. Little endian.
- def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
- (v2i64 (MTVSRDD $rB, $rA))>;
- def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
- (MTVSRDD
- (RLDIMI AnyExts.C, AnyExts.D, 32, 0),
- (RLDIMI AnyExts.A, AnyExts.B, 32, 0))>;
- }
- // P9 Altivec instructions that can be used to build vectors.
-  // Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td lets them compete
-  // with the complexities of the existing build vector patterns in this file.
- let Predicates = [HasP9Altivec, IsLittleEndian] in {
- def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)),
- (v2i64 (VEXTSW2D $A))>;
- def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)),
- (v2i64 (VEXTSH2D $A))>;
- def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1,
- HWordToWord.LE_A2, HWordToWord.LE_A3)),
- (v4i32 (VEXTSH2W $A))>;
- def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1,
- ByteToWord.LE_A2, ByteToWord.LE_A3)),
- (v4i32 (VEXTSB2W $A))>;
- def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
- (v2i64 (VEXTSB2D $A))>;
- }
+def : Pat<(v2f64 (PPCfpexth v4f32:$C, 0)), (XVCVSPDP (XXMRGHW $C, $C))>;
+def : Pat<(v2f64 (PPCfpexth v4f32:$C, 1)), (XVCVSPDP (XXMRGLW $C, $C))>;
- let Predicates = [HasP9Altivec, IsBigEndian] in {
- def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)),
- (v2i64 (VEXTSW2D $A))>;
- def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)),
- (v2i64 (VEXTSH2D $A))>;
- def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1,
- HWordToWord.BE_A2, HWordToWord.BE_A3)),
- (v4i32 (VEXTSH2W $A))>;
- def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
- ByteToWord.BE_A2, ByteToWord.BE_A3)),
- (v4i32 (VEXTSB2W $A))>;
- def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
- (v2i64 (VEXTSB2D $A))>;
- }
+// Permutes.
+def : Pat<(v2f64 (PPCxxswapd v2f64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2i64 (PPCxxswapd v2i64:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4f32 (PPCxxswapd v4f32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v4i32 (PPCxxswapd v4i32:$src)), (XXPERMDI $src, $src, 2)>;
+def : Pat<(v2f64 (PPCswapNoChain v2f64:$src)), (XXPERMDI $src, $src, 2)>;
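+// With both doubleword selectors referring to the same source, XXPERMDI with
+// immediate 2 takes DW1 from the first operand and DW0 from the second, so a
+// single instruction swaps the two doublewords (the XXSWAPD extended mnemonic).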
- let Predicates = [HasP9Altivec] in {
- def: Pat<(v2i64 (PPCSExtVElems v16i8:$A)),
- (v2i64 (VEXTSB2D $A))>;
- def: Pat<(v2i64 (PPCSExtVElems v8i16:$A)),
- (v2i64 (VEXTSH2D $A))>;
- def: Pat<(v2i64 (PPCSExtVElems v4i32:$A)),
- (v2i64 (VEXTSW2D $A))>;
- def: Pat<(v4i32 (PPCSExtVElems v16i8:$A)),
- (v4i32 (VEXTSB2W $A))>;
- def: Pat<(v4i32 (PPCSExtVElems v8i16:$A)),
- (v4i32 (VEXTSH2W $A))>;
- }
+// PPCvecshl XT, XA, XA, 2 can be selected as either XXSLDWI XT,XA,XA,2 or
+// XXSWAPD XT,XA (i.e. XXPERMDI XT,XA,XA,2); the latter is more profitable.
+def : Pat<(v4i32 (PPCvecshl v4i32:$src, v4i32:$src, 2)),
+ (XXPERMDI $src, $src, 2)>;
+
+// Selects.
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLT)),
+ (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULT)),
+ (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETLE)),
+ (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETULE)),
+ (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETEQ)),
+ (SELECT_VSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGE)),
+ (SELECT_VSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGE)),
+ (SELECT_VSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETGT)),
+ (SELECT_VSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETUGT)),
+ (SELECT_VSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(v2f64 (selectcc i1:$lhs, i1:$rhs, v2f64:$tval, v2f64:$fval, SETNE)),
+ (SELECT_VSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLT)),
+ (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULT)),
+ (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETLE)),
+ (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETULE)),
+ (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETEQ)),
+ (SELECT_VSFRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGE)),
+ (SELECT_VSFRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGE)),
+ (SELECT_VSFRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETGT)),
+ (SELECT_VSFRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETUGT)),
+ (SELECT_VSFRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f64 (selectcc i1:$lhs, i1:$rhs, f64:$tval, f64:$fval, SETNE)),
+ (SELECT_VSFRC (CRXOR $lhs, $rhs), $tval, $fval)>;
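+// In the selectcc patterns above, the i1 operands are condition-register bits,
+// so each comparison collapses into one CR logical op (e.g. SETEQ -> CREQV,
+// SETNE -> CRXOR, SETLT -> CRANDC); the resulting bit then feeds a SELECT_VSRC
+// or SELECT_VSFRC pseudo that is expanded after instruction selection.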
+
+// Divides.
+def : Pat<(int_ppc_vsx_xvdivsp v4f32:$A, v4f32:$B),
+ (XVDIVSP $A, $B)>;
+def : Pat<(int_ppc_vsx_xvdivdp v2f64:$A, v2f64:$B),
+ (XVDIVDP $A, $B)>;
+
+// Reciprocal estimate
+def : Pat<(int_ppc_vsx_xvresp v4f32:$A),
+ (XVRESP $A)>;
+def : Pat<(int_ppc_vsx_xvredp v2f64:$A),
+ (XVREDP $A)>;
+
+// Recip. square root estimate
+def : Pat<(int_ppc_vsx_xvrsqrtesp v4f32:$A),
+ (XVRSQRTESP $A)>;
+def : Pat<(int_ppc_vsx_xvrsqrtedp v2f64:$A),
+ (XVRSQRTEDP $A)>;
+
+// Vector selection
+def : Pat<(v16i8 (vselect v16i8:$vA, v16i8:$vB, v16i8:$vC)),
+ (COPY_TO_REGCLASS
+ (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
+ (COPY_TO_REGCLASS $vB, VSRC),
+ (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
+def : Pat<(v8i16 (vselect v8i16:$vA, v8i16:$vB, v8i16:$vC)),
+ (COPY_TO_REGCLASS
+ (XXSEL (COPY_TO_REGCLASS $vC, VSRC),
+ (COPY_TO_REGCLASS $vB, VSRC),
+ (COPY_TO_REGCLASS $vA, VSRC)), VRRC)>;
+def : Pat<(vselect v4i32:$vA, v4i32:$vB, v4i32:$vC),
+ (XXSEL $vC, $vB, $vA)>;
+def : Pat<(vselect v2i64:$vA, v2i64:$vB, v2i64:$vC),
+ (XXSEL $vC, $vB, $vA)>;
+def : Pat<(vselect v4i32:$vA, v4f32:$vB, v4f32:$vC),
+ (XXSEL $vC, $vB, $vA)>;
+def : Pat<(vselect v2i64:$vA, v2f64:$vB, v2f64:$vC),
+ (XXSEL $vC, $vB, $vA)>;
+
+def : Pat<(v4f32 (any_fmaxnum v4f32:$src1, v4f32:$src2)),
+ (v4f32 (XVMAXSP $src1, $src2))>;
+def : Pat<(v4f32 (any_fminnum v4f32:$src1, v4f32:$src2)),
+ (v4f32 (XVMINSP $src1, $src2))>;
+def : Pat<(v2f64 (any_fmaxnum v2f64:$src1, v2f64:$src2)),
+ (v2f64 (XVMAXDP $src1, $src2))>;
+def : Pat<(v2f64 (any_fminnum v2f64:$src1, v2f64:$src2)),
+ (v2f64 (XVMINDP $src1, $src2))>;
+
+// f32 abs
+def : Pat<(f32 (fabs f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSABSDP
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+
+// f32 nabs
+def : Pat<(f32 (fneg (fabs f32:$S))),
+ (f32 (COPY_TO_REGCLASS (XSNABSDP
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+
+// f32 Min.
+def : Pat<(f32 (fminnum_ieee f32:$A, f32:$B)),
+ (f32 FpMinMax.F32Min)>;
+def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), f32:$B)),
+ (f32 FpMinMax.F32Min)>;
+def : Pat<(f32 (fminnum_ieee f32:$A, (fcanonicalize f32:$B))),
+ (f32 FpMinMax.F32Min)>;
+def : Pat<(f32 (fminnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))),
+ (f32 FpMinMax.F32Min)>;
+// f32 Max.
+def : Pat<(f32 (fmaxnum_ieee f32:$A, f32:$B)),
+ (f32 FpMinMax.F32Max)>;
+def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), f32:$B)),
+ (f32 FpMinMax.F32Max)>;
+def : Pat<(f32 (fmaxnum_ieee f32:$A, (fcanonicalize f32:$B))),
+ (f32 FpMinMax.F32Max)>;
+def : Pat<(f32 (fmaxnum_ieee (fcanonicalize f32:$A), (fcanonicalize f32:$B))),
+ (f32 FpMinMax.F32Max)>;
+
+// f64 Min.
+def : Pat<(f64 (fminnum_ieee f64:$A, f64:$B)),
+ (f64 (XSMINDP $A, $B))>;
+def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), f64:$B)),
+ (f64 (XSMINDP $A, $B))>;
+def : Pat<(f64 (fminnum_ieee f64:$A, (fcanonicalize f64:$B))),
+ (f64 (XSMINDP $A, $B))>;
+def : Pat<(f64 (fminnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
+ (f64 (XSMINDP $A, $B))>;
+// f64 Max.
+def : Pat<(f64 (fmaxnum_ieee f64:$A, f64:$B)),
+ (f64 (XSMAXDP $A, $B))>;
+def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), f64:$B)),
+ (f64 (XSMAXDP $A, $B))>;
+def : Pat<(f64 (fmaxnum_ieee f64:$A, (fcanonicalize f64:$B))),
+ (f64 (XSMAXDP $A, $B))>;
+def : Pat<(f64 (fmaxnum_ieee (fcanonicalize f64:$A), (fcanonicalize f64:$B))),
+ (f64 (XSMAXDP $A, $B))>;
+
+def : Pat<(int_ppc_vsx_stxvd2x_be v2f64:$rS, xoaddr:$dst),
+ (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x_be v4i32:$rS, xoaddr:$dst),
+ (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+// Rounding for single precision.
+def : Pat<(f32 (any_fround f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPI
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(f32 (any_fnearbyint f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPIC
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(f32 (any_ffloor f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPIM
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(f32 (any_fceil f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPIP
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(f32 (any_ftrunc f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPIZ
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(f32 (any_frint f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSRDPIC
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
+def : Pat<(v4f32 (frint v4f32:$S)), (v4f32 (XVRSPIC $S))>;
+
+// Rounding for double precision.
+def : Pat<(f64 (frint f64:$S)), (f64 (XSRDPIC $S))>;
+def : Pat<(v2f64 (frint v2f64:$S)), (v2f64 (XVRDPIC $S))>;
+
+// Materialize a zero-vector of long long
+def : Pat<(v2i64 immAllZerosV),
+ (v2i64 (XXLXORz))>;
+
+// Build vectors of floating point converted to i32.
+def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.A,
+ DblToInt.A, DblToInt.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS $A), VSRC), 1))>;
+def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.A,
+ DblToUInt.A, DblToUInt.A)),
+ (v4i32 (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS $A), VSRC), 1))>;
+def : Pat<(v2i64 (build_vector DblToLong.A, DblToLong.A)),
+ (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC),
+ (COPY_TO_REGCLASS (XSCVDPSXDS $A), VSRC), 0))>;
+def : Pat<(v2i64 (build_vector DblToULong.A, DblToULong.A)),
+ (v2i64 (XXPERMDI (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC),
+ (COPY_TO_REGCLASS (XSCVDPUXDS $A), VSRC), 0))>;
+defm : ScalToVecWPermute<
+ v4i32, FltToIntLoad.A,
+ (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1),
+ (COPY_TO_REGCLASS (XSCVDPSXWSs (XFLOADf32 xoaddr:$A)), VSRC)>;
+defm : ScalToVecWPermute<
+ v4i32, FltToUIntLoad.A,
+ (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC), 1),
+ (COPY_TO_REGCLASS (XSCVDPUXWSs (XFLOADf32 xoaddr:$A)), VSRC)>;
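+// ScalToVecWPermute is defined earlier in this file; each instantiation is
+// presumed to emit one pattern using the first output dag for the plain
+// scalar_to_vector form and one using the second for the case where a
+// following permute will reposition the element, so the cheaper form suffices.
+// See the multiclass definition for the exact nodes matched.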
+def : Pat<(v4f32 (build_vector f32:$A, f32:$A, f32:$A, f32:$A)),
+ (v4f32 (XXSPLTW (v4f32 (XSCVDPSPN $A)), 0))>;
+def : Pat<(v2f64 (PPCldsplat xoaddr:$A)),
+ (v2f64 (LXVDSX xoaddr:$A))>;
+def : Pat<(v2i64 (PPCldsplat xoaddr:$A)),
+ (v2i64 (LXVDSX xoaddr:$A))>;
+
+// Build vectors of floating point converted to i64.
+def : Pat<(v2i64 (build_vector FltToLong.A, FltToLong.A)),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (XSCVDPSXDSs $A), VSFRC), 0))>;
+def : Pat<(v2i64 (build_vector FltToULong.A, FltToULong.A)),
+ (v2i64 (XXPERMDIs
+ (COPY_TO_REGCLASS (XSCVDPUXDSs $A), VSFRC), 0))>;
+defm : ScalToVecWPermute<
+ v2i64, DblToLongLoad.A,
+ (XVCVDPSXDS (LXVDSX xoaddr:$A)), (XVCVDPSXDS (LXVDSX xoaddr:$A))>;
+defm : ScalToVecWPermute<
+ v2i64, DblToULongLoad.A,
+ (XVCVDPUXDS (LXVDSX xoaddr:$A)), (XVCVDPUXDS (LXVDSX xoaddr:$A))>;
+} // HasVSX
+
+// Any big endian VSX subtarget.
+let Predicates = [HasVSX, IsBigEndian] in {
+def : Pat<(v2f64 (scalar_to_vector f64:$A)),
+ (v2f64 (SUBREG_TO_REG (i64 1), $A, sub_64))>;
+
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
+ (f64 (EXTRACT_SUBREG $S, sub_64))>;
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
+ (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS $S, VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+
+def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+ (f64 VectorExtractions.BE_VARIABLE_DOUBLE)>;
+
+def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
+ (v2f64 (XXPERMDI
+ (COPY_TO_REGCLASS $A, VSRC),
+ (COPY_TO_REGCLASS $B, VSRC), 0))>;
+// Using VMRGEW to assemble the final vector would be a lower latency
+// solution. However, we choose to go with the slightly higher latency
+// XXPERMDI for 2 reasons:
+// 1. This is likely to occur in unrolled loops where register pressure is high,
+// so we want to use the latter as it has access to all 64 VSX registers.
+// 2. Using Altivec instructions in this sequence would likely cause the
+// allocation of Altivec registers even for the loads which in turn would
+// force the use of LXSIWZX for the loads, adding a cycle of latency to
+// each of the loads which would otherwise be able to use LFIWZX.
+def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
+ (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32A, MrgFP.LD32B),
+ (XXMRGHW MrgFP.LD32C, MrgFP.LD32D), 3))>;
+def : Pat<(v4f32 (build_vector f32:$A, f32:$B, f32:$C, f32:$D)),
+ (VMRGEW MrgFP.AC, MrgFP.BD)>;
+def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
+ DblToFlt.B0, DblToFlt.B1)),
+ (v4f32 (VMRGEW MrgFP.ABhToFlt, MrgFP.ABlToFlt))>;
+
+// Convert 4 doubles to a vector of ints.
+def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
+ DblToInt.C, DblToInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVACS, MrgWords.CVBDS))>;
+def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
+ DblToUInt.C, DblToUInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVACU, MrgWords.CVBDU))>;
+def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
+ ExtDbl.B0S, ExtDbl.B1S)),
+ (v4i32 (VMRGEW MrgWords.CVA0B0S, MrgWords.CVA1B1S))>;
+def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
+ ExtDbl.B0U, ExtDbl.B1U)),
+ (v4i32 (VMRGEW MrgWords.CVA0B0U, MrgWords.CVA1B1U))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 1))))),
+ (v2f64 (XVCVSPDP (XXMRGHW $A, $A)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 0))))),
+ (v2f64 (XXPERMDI (XVCVSPDP (XXMRGHW $A, $A)),
+ (XVCVSPDP (XXMRGHW $A, $A)), 2))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 2))))),
+ (v2f64 (XVCVSPDP $A))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 3))))),
+ (v2f64 (XVCVSPDP (XXSLDWI $A, $A, 3)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 2))),
+ (f64 (fpextend (extractelt v4f32:$A, 3))))),
+ (v2f64 (XVCVSPDP (XXMRGLW $A, $A)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+ (f64 (fpextend (extractelt v4f32:$A, 2))))),
+ (v2f64 (XXPERMDI (XVCVSPDP (XXMRGLW $A, $A)),
+ (XVCVSPDP (XXMRGLW $A, $A)), 2))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$B, 0))))),
+ (v2f64 (XVCVSPDP (XXPERMDI $A, $B, 0)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+ (f64 (fpextend (extractelt v4f32:$B, 3))))),
+ (v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $A, $B, 3),
+ (XXPERMDI $A, $B, 3), 1)))>;
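+// XVCVSPDP converts words 0 and 2 of its source, so the merges, shifts and
+// permutes above only serve to move the two requested single-precision
+// elements into those word positions before the conversion.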
+def : Pat<WToDPExtractConv.BV02S,
+ (v2f64 (XVCVSXWDP $A))>;
+def : Pat<WToDPExtractConv.BV13S,
+ (v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 3)))>;
+def : Pat<WToDPExtractConv.BV02U,
+ (v2f64 (XVCVUXWDP $A))>;
+def : Pat<WToDPExtractConv.BV13U,
+ (v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 3)))>;
+} // HasVSX, IsBigEndian
+
+// Any little endian VSX subtarget.
+let Predicates = [HasVSX, IsLittleEndian] in {
+defm : ScalToVecWPermute<v2f64, (f64 f64:$A),
+ (XXPERMDI (SUBREG_TO_REG (i64 1), $A, sub_64),
+ (SUBREG_TO_REG (i64 1), $A, sub_64), 0),
+ (SUBREG_TO_REG (i64 1), $A, sub_64)>;
+
+def : Pat<(f64 (extractelt v2f64:$S, 0)),
+ (f64 (EXTRACT_SUBREG (XXPERMDI $S, $S, 2), sub_64))>;
+def : Pat<(f64 (extractelt v2f64:$S, 1)),
+ (f64 (EXTRACT_SUBREG $S, sub_64))>;
+
+def : Pat<(v2f64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(PPCst_vec_be v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(v4f32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(PPCst_vec_be v4f32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(v2i64 (PPCld_vec_be xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+def : Pat<(PPCst_vec_be v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+def : Pat<(v4i32 (PPCld_vec_be xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+def : Pat<(PPCst_vec_be v4i32:$rS, xoaddr:$dst), (STXVW4X $rS, xoaddr:$dst)>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfid (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVSXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 0))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (XXPERMDI $S, $S, 2), VSFRC)))>;
+def : Pat<(f64 (PPCfcfidu (PPCmtvsra (i64 (vector_extract v2i64:$S, 1))))),
+ (f64 (XSCVUXDDP (COPY_TO_REGCLASS (f64 (COPY_TO_REGCLASS $S, VSRC)), VSFRC)))>;
+
+def : Pat<(f64 (vector_extract v2f64:$S, i64:$Idx)),
+ (f64 VectorExtractions.LE_VARIABLE_DOUBLE)>;
+
+// Little endian, available on all targets with VSX
+def : Pat<(v2f64 (build_vector f64:$A, f64:$B)),
+ (v2f64 (XXPERMDI
+ (COPY_TO_REGCLASS $B, VSRC),
+ (COPY_TO_REGCLASS $A, VSRC), 0))>;
+// Using VMRGEW to assemble the final vector would be a lower latency
+// solution. However, we choose to go with the slightly higher latency
+// XXPERMDI for 2 reasons:
+// 1. This is likely to occur in unrolled loops where register pressure is high,
+// so we want to use the latter as it has access to all 64 VSX registers.
+// 2. Using Altivec instructions in this sequence would likely cause the
+// allocation of Altivec registers even for the loads which in turn would
+// force the use of LXSIWZX for the loads, adding a cycle of latency to
+// each of the loads which would otherwise be able to use LFIWZX.
+def : Pat<(v4f32 (build_vector LoadFP.A, LoadFP.B, LoadFP.C, LoadFP.D)),
+ (v4f32 (XXPERMDI (XXMRGHW MrgFP.LD32D, MrgFP.LD32C),
+ (XXMRGHW MrgFP.LD32B, MrgFP.LD32A), 3))>;
+def : Pat<(v4f32 (build_vector f32:$D, f32:$C, f32:$B, f32:$A)),
+ (VMRGEW MrgFP.AC, MrgFP.BD)>;
+def : Pat<(v4f32 (build_vector DblToFlt.A0, DblToFlt.A1,
+ DblToFlt.B0, DblToFlt.B1)),
+ (v4f32 (VMRGEW MrgFP.BAhToFlt, MrgFP.BAlToFlt))>;
+
+// Convert 4 doubles to a vector of ints.
+def : Pat<(v4i32 (build_vector DblToInt.A, DblToInt.B,
+ DblToInt.C, DblToInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVDBS, MrgWords.CVCAS))>;
+def : Pat<(v4i32 (build_vector DblToUInt.A, DblToUInt.B,
+ DblToUInt.C, DblToUInt.D)),
+ (v4i32 (VMRGEW MrgWords.CVDBU, MrgWords.CVCAU))>;
+def : Pat<(v4i32 (build_vector ExtDbl.A0S, ExtDbl.A1S,
+ ExtDbl.B0S, ExtDbl.B1S)),
+ (v4i32 (VMRGEW MrgWords.CVB1A1S, MrgWords.CVB0A0S))>;
+def : Pat<(v4i32 (build_vector ExtDbl.A0U, ExtDbl.A1U,
+ ExtDbl.B0U, ExtDbl.B1U)),
+ (v4i32 (VMRGEW MrgWords.CVB1A1U, MrgWords.CVB0A0U))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 1))))),
+ (v2f64 (XVCVSPDP (XXMRGLW $A, $A)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 0))))),
+ (v2f64 (XXPERMDI (XVCVSPDP (XXMRGLW $A, $A)),
+ (XVCVSPDP (XXMRGLW $A, $A)), 2))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$A, 2))))),
+ (v2f64 (XVCVSPDP (XXSLDWI $A, $A, 1)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 1))),
+ (f64 (fpextend (extractelt v4f32:$A, 3))))),
+ (v2f64 (XVCVSPDP $A))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 2))),
+ (f64 (fpextend (extractelt v4f32:$A, 3))))),
+ (v2f64 (XVCVSPDP (XXMRGHW $A, $A)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+ (f64 (fpextend (extractelt v4f32:$A, 2))))),
+ (v2f64 (XXPERMDI (XVCVSPDP (XXMRGHW $A, $A)),
+ (XVCVSPDP (XXMRGHW $A, $A)), 2))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 0))),
+ (f64 (fpextend (extractelt v4f32:$B, 0))))),
+ (v2f64 (XVCVSPDP (XXSLDWI (XXPERMDI $B, $A, 3),
+ (XXPERMDI $B, $A, 3), 1)))>;
+def : Pat<(v2f64 (build_vector (f64 (fpextend (extractelt v4f32:$A, 3))),
+ (f64 (fpextend (extractelt v4f32:$B, 3))))),
+ (v2f64 (XVCVSPDP (XXPERMDI $B, $A, 0)))>;
+def : Pat<WToDPExtractConv.BV02S,
+ (v2f64 (XVCVSXWDP (XXSLDWI $A, $A, 1)))>;
+def : Pat<WToDPExtractConv.BV13S,
+ (v2f64 (XVCVSXWDP $A))>;
+def : Pat<WToDPExtractConv.BV02U,
+ (v2f64 (XVCVUXWDP (XXSLDWI $A, $A, 1)))>;
+def : Pat<WToDPExtractConv.BV13U,
+ (v2f64 (XVCVUXWDP $A))>;
+} // HasVSX, IsLittleEndian
+
+// Any pre-Power9 VSX subtarget.
+let Predicates = [HasVSX, NoP9Vector] in {
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xoaddr:$dst)>;
+
+// Load-and-splat with fp-to-int conversion (using X-Form VSX/FP loads).
+defm : ScalToVecWPermute<
+ v4i32, DblToIntLoad.A,
+ (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC), 1),
+ (COPY_TO_REGCLASS (XSCVDPSXWS (XFLOADf64 xoaddr:$A)), VSRC)>;
+defm : ScalToVecWPermute<
+ v4i32, DblToUIntLoad.A,
+ (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC), 1),
+ (COPY_TO_REGCLASS (XSCVDPUXWS (XFLOADf64 xoaddr:$A)), VSRC)>;
+defm : ScalToVecWPermute<
+ v2i64, FltToLongLoad.A,
+ (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A), VSFRC)), 0),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A),
+ VSFRC)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, FltToULongLoad.A,
+ (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A), VSFRC)), 0),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXDS (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$A),
+ VSFRC)), sub_64)>;
+} // HasVSX, NoP9Vector
+
+// Any VSX subtarget that only has loads and stores that load in big endian
+// order regardless of endianness. In practice, these are the pre-Power9
+// subtargets.
+let Predicates = [HasVSX, HasOnlySwappingMemOps] in {
+ def : Pat<(v2f64 (PPClxvd2x xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+
+ // Stores.
+ def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+ (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(PPCstxvd2x v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+} // HasVSX, HasOnlySwappingMemOps
+
+// Big endian VSX subtarget that only has loads and stores that always load
+// in big endian order, i.e. big endian pre-Power9 subtargets.
+let Predicates = [HasVSX, HasOnlySwappingMemOps, IsBigEndian] in {
+ def : Pat<(v2f64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v2i64 (load xoaddr:$src)), (LXVD2X xoaddr:$src)>;
+ def : Pat<(v4i32 (load xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVW4X xoaddr:$src)>;
+ def : Pat<(store v2f64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(store v2i64:$rS, xoaddr:$dst), (STXVD2X $rS, xoaddr:$dst)>;
+ def : Pat<(store v4i32:$XT, xoaddr:$dst), (STXVW4X $XT, xoaddr:$dst)>;
+ def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVW4X $rS, xoaddr:$dst)>;
+} // HasVSX, HasOnlySwappingMemOps, IsBigEndian
+
+// Any Power8 VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector] in {
+def : Pat<(int_ppc_vsx_xxleqv v4i32:$A, v4i32:$B),
+ (XXLEQV $A, $B)>;
+def : Pat<(f64 (extloadf32 xoaddr:$src)),
+ (COPY_TO_REGCLASS (XFLOADf32 xoaddr:$src), VSFRC)>;
+def : Pat<(f32 (fpround (f64 (extloadf32 xoaddr:$src)))),
+ (f32 (XFLOADf32 xoaddr:$src))>;
+def : Pat<(f64 (any_fpextend f32:$src)),
+ (COPY_TO_REGCLASS $src, VSFRC)>;
+
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLT)),
+ (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULT)),
+ (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETLE)),
+ (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETULE)),
+ (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETEQ)),
+ (SELECT_VSSRC (CREQV $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGE)),
+ (SELECT_VSSRC (CRORC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGE)),
+ (SELECT_VSSRC (CRORC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETGT)),
+ (SELECT_VSSRC (CRANDC $rhs, $lhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETUGT)),
+ (SELECT_VSSRC (CRANDC $lhs, $rhs), $tval, $fval)>;
+def : Pat<(f32 (selectcc i1:$lhs, i1:$rhs, f32:$tval, f32:$fval, SETNE)),
+ (SELECT_VSSRC (CRXOR $lhs, $rhs), $tval, $fval)>;
+
+// Additional fnmsub patterns for the PPC-specific PPCfnmsub ISD opcode.
+def : Pat<(PPCfnmsub f32:$A, f32:$B, f32:$C),
+ (XSNMSUBASP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub f32:$A, f32:$B, f32:$C)),
+ (XSMSUBASP $C, $A, $B)>;
+def : Pat<(PPCfnmsub f32:$A, f32:$B, (fneg f32:$C)),
+ (XSNMADDASP $C, $A, $B)>;
+
+// f32 negation.
+// Although XSNEGDP is available on P7, we only select it starting from P8 so
+// that FNMSUBS can still be selected for the fneg-fmsub pattern on P7. (The
+// VSX version, XSNMSUBASP, is only available from P8 onwards.)
+def : Pat<(f32 (fneg f32:$S)),
+ (f32 (COPY_TO_REGCLASS (XSNEGDP
+ (COPY_TO_REGCLASS $S, VSFRC)), VSSRC))>;
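+// Roughly, this keeps (fneg (fma f32:$A, f32:$B, (fneg f32:$C))) eligible for
+// a single FNMSUBS on P7, where the VSX form XSNMSUBASP does not exist.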
+
+// Instructions for converting float to i32 feeding a store.
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 4),
+ (STIWX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 4),
+ (STIWX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+
+def : Pat<(v2i64 (smax v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMAXSD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+def : Pat<(v2i64 (umax v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMAXUD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+def : Pat<(v2i64 (smin v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMINSD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+def : Pat<(v2i64 (umin v2i64:$src1, v2i64:$src2)),
+ (v2i64 (VMINUD (COPY_TO_REGCLASS $src1, VRRC),
+ (COPY_TO_REGCLASS $src2, VRRC)))>;
+
+def : Pat<(v1i128 (bitconvert (v16i8 immAllOnesV))),
+ (v1i128 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+def : Pat<(v2i64 (bitconvert (v16i8 immAllOnesV))),
+ (v2i64 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+def : Pat<(v8i16 (bitconvert (v16i8 immAllOnesV))),
+ (v8i16 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+def : Pat<(v16i8 (bitconvert (v16i8 immAllOnesV))),
+ (v16i8 (COPY_TO_REGCLASS(XXLEQVOnes), VSRC))>;
+} // HasVSX, HasP8Vector
+
+// Big endian Power8 VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector, IsBigEndian] in {
+def : Pat<DWToSPExtractConv.El0SS1,
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
+def : Pat<DWToSPExtractConv.El1SS1,
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
+def : Pat<DWToSPExtractConv.El0US1,
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS $S1, VSFRC)))>;
+def : Pat<DWToSPExtractConv.El1US1,
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
+
+// v4f32 scalar <-> vector conversions (BE)
+def : Pat<(v4f32 (scalar_to_vector f32:$A)),
+ (v4f32 (XSCVDPSPN $A))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+ (f32 (XSCVSPDPN $S))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+ (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.BE_VARIABLE_FLOAT)>;
+
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
+
+// LIWAX - This instruction is used for sign extending i32 -> i64.
+// LIWZX - This instruction will be emitted for i32, f32, and when
+// zero-extending i32 to i64 (zext i32 -> i64).
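+// In other words, an i32 load that feeds a vector value stays on the VSX side
+// as a single LIWAX/LIWZX plus a register-class copy or word rotate, rather
+// than being loaded into a GPR and transferred with a direct move.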
+def : Pat<(v2i64 (scalar_to_vector (i64 (sextloadi32 xoaddr:$src)))),
+ (v2i64 (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSRC))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (zextloadi32 xoaddr:$src)))),
+ (v2i64 (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC))>;
+def : Pat<(v4i32 (scalar_to_vector (i32 (load xoaddr:$src)))),
+ (v4i32 (XXSLDWIs
+ (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
+def : Pat<(v4f32 (scalar_to_vector (f32 (load xoaddr:$src)))),
+ (v4f32 (XXSLDWIs
+ (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSRC), 1))>;
+
+def : Pat<DWToSPExtractConv.BVU,
+ (v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3),
+ (XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3)))>;
+def : Pat<DWToSPExtractConv.BVS,
+ (v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3),
+ (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3)))>;
+def : Pat<(store (i32 (extractelt v4i32:$A, 1)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (f32 (extractelt v4f32:$A, 1)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+
+// Elements in a register on a BE system are in order <0, 1, 2, 3>.
+// The store instruction (STIWX) stores the second word from the left (word 1).
+// So to align element zero, we need to modulo-left-shift by 3 words.
+// Similar logic applies for elements 2 and 3.
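+// For example, element zero lives in word 0, and XXSLDWI $A, $A, 3 rotates
+// the register left by three words so that it lands in word 1, the word
+// STIWX stores; hence the [0,3] pair below.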
+foreach Idx = [ [0,3], [2,1], [3,2] ] in {
+ def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
}
+} // HasVSX, HasP8Vector, IsBigEndian
+
+// Little endian Power8 VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector, IsLittleEndian] in {
+def : Pat<DWToSPExtractConv.El0SS1,
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
+def : Pat<DWToSPExtractConv.El1SS1,
+ (f32 (XSCVSXDSP (COPY_TO_REGCLASS
+ (f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
+def : Pat<DWToSPExtractConv.El0US1,
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS (XXPERMDI $S1, $S1, 2), VSFRC)))>;
+def : Pat<DWToSPExtractConv.El1US1,
+ (f32 (XSCVUXDSP (COPY_TO_REGCLASS
+ (f64 (COPY_TO_REGCLASS $S1, VSRC)), VSFRC)))>;
-// Put this P9Altivec related definition here since it's possible to be
-// selected to VSX instruction xvnegsp, avoid possible undef.
-let Predicates = [HasP9Altivec] in {
+// v4f32 scalar <-> vector conversions (LE)
+ // The permuted version is no better than the version that puts the value
+ // into the right element because XSCVDPSPN is different from all the other
+ // instructions used for PPCSToV.
+ defm : ScalToVecWPermute<v4f32, (f32 f32:$A),
+ (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 1),
+ (XXSLDWI (XSCVDPSPN $A), (XSCVDPSPN $A), 3)>;
+def : Pat<(f32 (vector_extract v4f32:$S, 0)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 3)))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 1)),
+ (f32 (XSCVSPDPN (XXPERMDI $S, $S, 2)))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 2)),
+ (f32 (XSCVSPDPN (XXSLDWI $S, $S, 1)))>;
+def : Pat<(f32 (vector_extract v4f32:$S, 3)),
+ (f32 (XSCVSPDPN $S))>;
+def : Pat<(f32 (vector_extract v4f32:$S, i64:$Idx)),
+ (f32 VectorExtractions.LE_VARIABLE_FLOAT)>;
+
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 3))))>;
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 2))))>;
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 1))))>;
+def : Pat<(f32 (PPCfcfids (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
+ (f32 (XSCVSPDPN (XVCVSXWSP (XXSPLTW $A, 0))))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 0)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 3)), VSFRC))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 1)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 2)), VSFRC))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 2)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 1)), VSFRC))>;
+def : Pat<(f64 (PPCfcfid (f64 (PPCmtvsra (i32 (extractelt v4i32:$A, 3)))))),
+ (f64 (COPY_TO_REGCLASS (XVCVSXWDP (XXSPLTW $A, 0)), VSFRC))>;
+
+// LIWAX - This instruction is used for sign extending i32 -> i64.
+// LIWZX - This instruction will be emitted for i32, f32, and when
+// zero-extending i32 to i64 (zext i32 -> i64).
+defm : ScalToVecWPermute<
+ v2i64, (i64 (sextloadi32 xoaddr:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (LIWAX xoaddr:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (LIWAX xoaddr:$src), sub_64)>;
+
+defm : ScalToVecWPermute<
+ v2i64, (i64 (zextloadi32 xoaddr:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+
+defm : ScalToVecWPermute<
+ v4i32, (i32 (load xoaddr:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+
+defm : ScalToVecWPermute<
+ v4f32, (f32 (load xoaddr:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (LIWZX xoaddr:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (LIWZX xoaddr:$src), sub_64)>;
+
+def : Pat<DWToSPExtractConv.BVU,
+ (v4f32 (VPKUDUM (XXSLDWI (XVCVUXDSP $S2), (XVCVUXDSP $S2), 3),
+ (XXSLDWI (XVCVUXDSP $S1), (XVCVUXDSP $S1), 3)))>;
+def : Pat<DWToSPExtractConv.BVS,
+ (v4f32 (VPKUDUM (XXSLDWI (XVCVSXDSP $S2), (XVCVSXDSP $S2), 3),
+ (XXSLDWI (XVCVSXDSP $S1), (XVCVSXDSP $S1), 3)))>;
+def : Pat<(store (i32 (extractelt v4i32:$A, 2)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (f32 (extractelt v4f32:$A, 2)), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+
+// Elements in a register on a LE system are in order <3, 2, 1, 0>.
+// The store instruction (STIWX) stores the second word from the left (word 1).
+// So to align element 3, we need to modulo-left-shift by 3 words.
+// Similar logic applies for elements 0 and 1.
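+// For example, element 3 occupies word 0 of the register, and XXSLDWI
+// $A, $A, 3 rotates it into word 1, the word STIWX stores; hence the [3,3]
+// pair below. Element 2 already sits in word 1 and is handled by the plain
+// patterns above.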
+foreach Idx = [ [0,2], [1,1], [3,3] ] in {
+ def : Pat<(store (i32 (extractelt v4i32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+ def : Pat<(store (f32 (extractelt v4f32:$A, !head(Idx))), xoaddr:$src),
+ (STIWX (EXTRACT_SUBREG (XXSLDWI $A, $A, !head(!tail(Idx))),
+ sub_64), xoaddr:$src)>;
+}
+} // HasVSX, HasP8Vector, IsLittleEndian
+
+// Big endian pre-Power9 VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsBigEndian] in {
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+} // HasVSX, HasP8Vector, NoP9Vector, IsBigEndian
+
+// Little endian pre-Power9 VSX subtarget.
+let Predicates = [HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian] in {
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ xoaddr:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xoaddr:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xoaddr:$src)>;
+} // HasVSX, HasP8Vector, NoP9Vector, IsLittleEndian
+
+// Any VSX target with direct moves.
+let Predicates = [HasVSX, HasDirectMove] in {
+// bitconvert f32 -> i32
+// (convert to 32-bit fp single, shift right 1 word, move to GPR)
+def : Pat<(i32 (bitconvert f32:$S)),
+ (i32 (MFVSRWZ (EXTRACT_SUBREG
+ (XXSLDWI (XSCVDPSPN $S), (XSCVDPSPN $S), 3),
+ sub_64)))>;
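+// In the pattern above, XSCVDPSPN leaves the single-precision bit pattern in
+// word 0 of the VSR, the XXSLDWI by 3 rotates it into word 1, and MFVSRWZ
+// then reads that word into the GPR.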
+// bitconvert i32 -> f32
+// (move to FPR, shift left 1 word, convert to 64-bit fp single)
+def : Pat<(f32 (bitconvert i32:$A)),
+ (f32 (XSCVSPDPN
+ (XXSLDWI MovesToVSR.LE_WORD_1, MovesToVSR.LE_WORD_1, 1)))>;
- def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))),
- (v4i32 (VABSDUW $A, $B))>;
+// bitconvert f64 -> i64
+// (move to GPR, nothing else needed)
+def : Pat<(i64 (bitconvert f64:$S)),
+ (i64 (MFVSRD $S))>;
- def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))),
- (v8i16 (VABSDUH $A, $B))>;
+// bitconvert i64 -> f64
+// (move to FPR, nothing else needed)
+def : Pat<(f64 (bitconvert i64:$S)),
+ (f64 (MTVSRD $S))>;
- def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))),
- (v16i8 (VABSDUB $A, $B))>;
+// Rounding to integer.
+def : Pat<(i64 (lrint f64:$S)),
+ (i64 (MFVSRD (FCTID $S)))>;
+def : Pat<(i64 (lrint f32:$S)),
+ (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
+def : Pat<(i64 (llrint f64:$S)),
+ (i64 (MFVSRD (FCTID $S)))>;
+def : Pat<(i64 (llrint f32:$S)),
+ (i64 (MFVSRD (FCTID (COPY_TO_REGCLASS $S, F8RC))))>;
+def : Pat<(i64 (lround f64:$S)),
+ (i64 (MFVSRD (FCTID (XSRDPI $S))))>;
+def : Pat<(i64 (lround f32:$S)),
+ (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
+def : Pat<(i64 (llround f64:$S)),
+ (i64 (MFVSRD (FCTID (XSRDPI $S))))>;
+def : Pat<(i64 (llround f32:$S)),
+ (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>;
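+// Note: lrint/llrint use FCTID alone and therefore honour the current FPSCR
+// rounding mode, while lround/llround pre-round with XSRDPI (round to
+// nearest, ties away from zero) to match their C library semantics.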
+
+// Alternate patterns for PPCmtvsrz where the output is v8i16 or v16i8 instead
+// of f64
+def : Pat<(v8i16 (PPCmtvsrz i32:$A)),
+ (v8i16 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+def : Pat<(v16i8 (PPCmtvsrz i32:$A)),
+ (v16i8 (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64))>;
+
+// Endianness-neutral constant splat on P8 and newer targets. The reason
+// for this pattern is that on targets with direct moves, we don't expand
+// BUILD_VECTOR nodes for v4i32.
+def : Pat<(v4i32 (build_vector immSExt5NonZero:$A, immSExt5NonZero:$A,
+ immSExt5NonZero:$A, immSExt5NonZero:$A)),
+ (v4i32 (VSPLTISW imm:$A))>;
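+// For example, (build_vector 5, 5, 5, 5) is selected directly to VSPLTISW 5
+// instead of going through a constant-pool load.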
+} // HasVSX, HasDirectMove
+
+// Big endian VSX subtarget with direct moves.
+let Predicates = [HasVSX, HasDirectMove, IsBigEndian] in {
+// v16i8 scalar <-> vector conversions (BE)
+def : Pat<(v16i8 (scalar_to_vector i32:$A)),
+ (v16i8 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_BYTE_0, sub_64))>;
+def : Pat<(v8i16 (scalar_to_vector i32:$A)),
+ (v8i16 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_HALF_0, sub_64))>;
+def : Pat<(v4i32 (scalar_to_vector i32:$A)),
+ (v4i32 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_WORD_0, sub_64))>;
+def : Pat<(v2i64 (scalar_to_vector i64:$A)),
+ (v2i64 (SUBREG_TO_REG (i64 1), MovesToVSR.BE_DWORD_0, sub_64))>;
+
+// v2i64 scalar <-> vector conversions (BE)
+def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.BE_VARIABLE_DWORD)>;
+} // HasVSX, HasDirectMove, IsBigEndian
+
+// Little endian VSX subtarget with direct moves.
+let Predicates = [HasVSX, HasDirectMove, IsLittleEndian] in {
+ // v16i8 scalar <-> vector conversions (LE)
+ defm : ScalToVecWPermute<v16i8, (i32 i32:$A),
+ (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC),
+ (COPY_TO_REGCLASS MovesToVSR.LE_WORD_1, VSRC)>;
+ defm : ScalToVecWPermute<v8i16, (i32 i32:$A),
+ (COPY_TO_REGCLASS MovesToVSR.LE_WORD_0, VSRC),
+ (COPY_TO_REGCLASS MovesToVSR.LE_WORD_1, VSRC)>;
+ defm : ScalToVecWPermute<v4i32, (i32 i32:$A), MovesToVSR.LE_WORD_0,
+ (SUBREG_TO_REG (i64 1), (MTVSRWZ $A), sub_64)>;
+ defm : ScalToVecWPermute<v2i64, (i64 i64:$A), MovesToVSR.LE_DWORD_0,
+ MovesToVSR.LE_DWORD_1>;
+
+ // v2i64 scalar <-> vector conversions (LE)
+ def : Pat<(i64 (vector_extract v2i64:$S, 0)),
+ (i64 VectorExtractions.LE_DWORD_0)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, 1)),
+ (i64 VectorExtractions.LE_DWORD_1)>;
+ def : Pat<(i64 (vector_extract v2i64:$S, i64:$Idx)),
+ (i64 VectorExtractions.LE_VARIABLE_DWORD)>;
+} // HasVSX, HasDirectMove, IsLittleEndian
+
+// Big endian pre-P9 VSX subtarget with direct moves.
+let Predicates = [HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian] in {
+def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 VectorExtractions.LE_BYTE_15)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 VectorExtractions.LE_BYTE_14)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 VectorExtractions.LE_BYTE_13)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 VectorExtractions.LE_BYTE_12)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 VectorExtractions.LE_BYTE_11)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 VectorExtractions.LE_BYTE_10)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 VectorExtractions.LE_BYTE_9)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 VectorExtractions.LE_BYTE_8)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 VectorExtractions.LE_BYTE_7)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 VectorExtractions.LE_BYTE_6)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 VectorExtractions.LE_BYTE_5)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 VectorExtractions.LE_BYTE_4)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 VectorExtractions.LE_BYTE_3)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 VectorExtractions.LE_BYTE_2)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 VectorExtractions.LE_BYTE_1)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 VectorExtractions.LE_BYTE_0)>;
+def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_BYTE)>;
+
+// v8i16 scalar <-> vector conversions (BE)
+def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 VectorExtractions.LE_HALF_7)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 VectorExtractions.LE_HALF_6)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 VectorExtractions.LE_HALF_5)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 VectorExtractions.LE_HALF_4)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 VectorExtractions.LE_HALF_3)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 VectorExtractions.LE_HALF_2)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 VectorExtractions.LE_HALF_1)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 VectorExtractions.LE_HALF_0)>;
+def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_HALF)>;
+
+// v4i32 scalar <-> vector conversions (BE)
+def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 VectorExtractions.LE_WORD_3)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_1)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 VectorExtractions.LE_WORD_0)>;
+def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 VectorExtractions.BE_VARIABLE_WORD)>;
+} // HasVSX, HasDirectMove, NoP9Altivec, IsBigEndian
+
+// Little endian pre-P9 VSX subtarget with direct moves.
+let Predicates = [HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian] in {
+def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 VectorExtractions.LE_BYTE_0)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 VectorExtractions.LE_BYTE_1)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 VectorExtractions.LE_BYTE_2)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 VectorExtractions.LE_BYTE_3)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 VectorExtractions.LE_BYTE_4)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 VectorExtractions.LE_BYTE_5)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 VectorExtractions.LE_BYTE_6)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 VectorExtractions.LE_BYTE_7)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 VectorExtractions.LE_BYTE_8)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 VectorExtractions.LE_BYTE_9)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 VectorExtractions.LE_BYTE_10)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 VectorExtractions.LE_BYTE_11)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 VectorExtractions.LE_BYTE_12)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 VectorExtractions.LE_BYTE_13)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 VectorExtractions.LE_BYTE_14)>;
+def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 VectorExtractions.LE_BYTE_15)>;
+def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_BYTE)>;
+
+// v8i16 scalar <-> vector conversions (LE)
+def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 VectorExtractions.LE_HALF_0)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 VectorExtractions.LE_HALF_1)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 VectorExtractions.LE_HALF_2)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 VectorExtractions.LE_HALF_3)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 VectorExtractions.LE_HALF_4)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 VectorExtractions.LE_HALF_5)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 VectorExtractions.LE_HALF_6)>;
+def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+ (i32 VectorExtractions.LE_HALF_7)>;
+def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_HALF)>;
+
+// v4i32 scalar <-> vector conversions (LE)
+def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 VectorExtractions.LE_WORD_0)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_1)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 VectorExtractions.LE_WORD_3)>;
+def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 VectorExtractions.LE_VARIABLE_WORD)>;
+} // HasVSX, HasDirectMove, NoP9Altivec, IsLittleEndian
+
+// Big endian pre-Power9 VSX subtarget that has direct moves.
+let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsBigEndian] in {
+// Big endian integer vectors using direct moves.
+def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
+ (v2i64 (XXPERMDI
+ (COPY_TO_REGCLASS (MTVSRD $A), VSRC),
+ (COPY_TO_REGCLASS (MTVSRD $B), VSRC), 0))>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (XXPERMDI
+ (COPY_TO_REGCLASS
+ (MTVSRD (RLDIMI AnyExts.B, AnyExts.A, 32, 0)), VSRC),
+ (COPY_TO_REGCLASS
+ (MTVSRD (RLDIMI AnyExts.D, AnyExts.C, 32, 0)), VSRC), 0)>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+} // HasVSX, HasDirectMove, NoP9Vector, IsBigEndian
+
+// Little endian pre-Power9 VSX subtarget that has direct moves.
+let Predicates = [HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian] in {
+// Little endian integer vectors using direct moves.
+def : Pat<(v2i64 (build_vector i64:$A, i64:$B)),
+ (v2i64 (XXPERMDI
+ (COPY_TO_REGCLASS (MTVSRD $B), VSRC),
+ (COPY_TO_REGCLASS (MTVSRD $A), VSRC), 0))>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (XXPERMDI
+ (COPY_TO_REGCLASS
+ (MTVSRD (RLDIMI AnyExts.C, AnyExts.D, 32, 0)), VSRC),
+ (COPY_TO_REGCLASS
+ (MTVSRD (RLDIMI AnyExts.A, AnyExts.B, 32, 0)), VSRC), 0)>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (XXSPLTW (COPY_TO_REGCLASS (MTVSRWZ $A), VSRC), 1)>;
+} // HasVSX, HasDirectMove, NoP9Vector, IsLittleEndian
+
+// Any Power9 VSX subtarget.
+let Predicates = [HasVSX, HasP9Vector] in {
+// Additional fnmsub patterns for the PPC-specific PPCfnmsub ISD opcode.
+def : Pat<(PPCfnmsub f128:$A, f128:$B, f128:$C),
+ (XSNMSUBQP $C, $A, $B)>;
+def : Pat<(fneg (PPCfnmsub f128:$A, f128:$B, f128:$C)),
+ (XSMSUBQP $C, $A, $B)>;
+def : Pat<(PPCfnmsub f128:$A, f128:$B, (fneg f128:$C)),
+ (XSNMADDQP $C, $A, $B)>;
+
+def : Pat<(f128 (sint_to_fp i64:$src)),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+def : Pat<(f128 (sint_to_fp (i64 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVSDQP $src))>;
+def : Pat<(f128 (sint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVSDQP (VEXTSW2Ds $src)))>;
+def : Pat<(f128 (uint_to_fp i64:$src)),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+def : Pat<(f128 (uint_to_fp (i64 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP $src))>;
+
+// Convert (Un)Signed Word -> QP.
+def : Pat<(f128 (sint_to_fp i32:$src)),
+ (f128 (XSCVSDQP (MTVSRWA $src)))>;
+def : Pat<(f128 (sint_to_fp (i32 (load xoaddr:$src)))),
+ (f128 (XSCVSDQP (LIWAX xoaddr:$src)))>;
+def : Pat<(f128 (uint_to_fp i32:$src)),
+ (f128 (XSCVUDQP (MTVSRWZ $src)))>;
+def : Pat<(f128 (uint_to_fp (i32 (load xoaddr:$src)))),
+ (f128 (XSCVUDQP (LIWZX xoaddr:$src)))>;
+
+// Pattern for matching Vector HP -> Vector SP intrinsic. Defined as a
+// separate pattern so that it can convert the input register class from
+// VRRC(v8i16) to VSRC.
+def : Pat<(v4f32 (int_ppc_vsx_xvcvhpsp v8i16:$A)),
+ (v4f32 (XVCVHPSP (COPY_TO_REGCLASS $A, VSRC)))>;
+
+// Use current rounding mode
+def : Pat<(f128 (any_fnearbyint f128:$vB)), (f128 (XSRQPI 0, $vB, 3))>;
+// Round to nearest, ties away from zero
+def : Pat<(f128 (any_fround f128:$vB)), (f128 (XSRQPI 0, $vB, 0))>;
+// Round towards Zero
+def : Pat<(f128 (any_ftrunc f128:$vB)), (f128 (XSRQPI 1, $vB, 1))>;
+// Round towards +Inf
+def : Pat<(f128 (any_fceil f128:$vB)), (f128 (XSRQPI 1, $vB, 2))>;
+// Round towards -Inf
+def : Pat<(f128 (any_ffloor f128:$vB)), (f128 (XSRQPI 1, $vB, 3))>;
+// Use current rounding mode, signalling Inexact (XSRQPIX)
+def : Pat<(f128 (any_frint f128:$vB)), (f128 (XSRQPIX 0, $vB, 3))>;
+
+def : Pat<(f128 (int_ppc_scalar_insert_exp_qp f128:$vA, i64:$vB)),
+ (f128 (XSIEXPQP $vA, (MTVSRD $vB)))>;
+
+def : Pat<(i64 (int_ppc_scalar_extract_expq f128:$vA)),
+ (i64 (MFVSRD (EXTRACT_SUBREG
+ (v2i64 (XSXEXPQP $vA)), sub_64)))>;
+
+// Extra patterns expanding to vector Extract Word/Insert Word
+def : Pat<(v4i32 (int_ppc_vsx_xxinsertw v4i32:$A, v2i64:$B, imm:$IMM)),
+ (v4i32 (XXINSERTW $A, $B, imm:$IMM))>;
+def : Pat<(v2i64 (int_ppc_vsx_xxextractuw v2i64:$A, imm:$IMM)),
+ (v2i64 (COPY_TO_REGCLASS (XXEXTRACTUW $A, imm:$IMM), VSRC))>;
+
+// Vector Reverse
+def : Pat<(v8i16 (bswap v8i16 :$A)),
+ (v8i16 (COPY_TO_REGCLASS (XXBRH (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
+def : Pat<(v1i128 (bswap v1i128 :$A)),
+ (v1i128 (COPY_TO_REGCLASS (XXBRQ (COPY_TO_REGCLASS $A, VSRC)), VRRC))>;
+
+// D-Form Load/Store
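+// Loads/stores whose displacement is a multiple of 16 and fits the DQ field
+// (quadwOffsetLoad/quadwOffsetStore) use the D(Q)-Form LXV/STXV; all other
+// addresses fall through to the X-Form LXVX/STXVX patterns further down.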
+def : Pat<(v4i32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+def : Pat<(v4f32 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+def : Pat<(v2i64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+def : Pat<(v2f64 (quadwOffsetLoad iaddrX16:$src)), (LXV memrix16:$src)>;
+def : Pat<(f128 (quadwOffsetLoad iaddrX16:$src)),
+ (COPY_TO_REGCLASS (LXV memrix16:$src), VRRC)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x iaddrX16:$src)), (LXV memrix16:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x iaddrX16:$src)), (LXV memrix16:$src)>;
+
+def : Pat<(quadwOffsetStore v4f32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+def : Pat<(quadwOffsetStore v4i32:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+def : Pat<(quadwOffsetStore v2f64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+def : Pat<(quadwOffsetStore f128:$rS, iaddrX16:$dst),
+ (STXV (COPY_TO_REGCLASS $rS, VSRC), memrix16:$dst)>;
+def : Pat<(quadwOffsetStore v2i64:$rS, iaddrX16:$dst), (STXV $rS, memrix16:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, iaddrX16:$dst),
+ (STXV $rS, memrix16:$dst)>;
+def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, iaddrX16:$dst),
+ (STXV $rS, memrix16:$dst)>;
+
+def : Pat<(v2f64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+def : Pat<(v2i64 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+def : Pat<(v4f32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+def : Pat<(v4i32 (nonQuadwOffsetLoad xoaddr:$src)), (LXVX xoaddr:$src)>;
+def : Pat<(v4i32 (int_ppc_vsx_lxvw4x xoaddr:$src)), (LXVX xoaddr:$src)>;
+def : Pat<(v2f64 (int_ppc_vsx_lxvd2x xoaddr:$src)), (LXVX xoaddr:$src)>;
+def : Pat<(f128 (nonQuadwOffsetLoad xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVX xoaddr:$src), VRRC)>;
+def : Pat<(nonQuadwOffsetStore f128:$rS, xoaddr:$dst),
+ (STXVX (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
+def : Pat<(nonQuadwOffsetStore v2f64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+def : Pat<(nonQuadwOffsetStore v2i64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+def : Pat<(nonQuadwOffsetStore v4f32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+def : Pat<(nonQuadwOffsetStore v4i32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvw4x v4i32:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+def : Pat<(int_ppc_vsx_stxvd2x v2f64:$rS, xoaddr:$dst),
+ (STXVX $rS, xoaddr:$dst)>;
+
+// Build vectors from i8 loads
+defm : ScalToVecWPermute<v16i8, ScalarLoads.Li8,
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src)),
+ (VSPLTBs 7, (LXSIBZX xoaddr:$src))>;
+defm : ScalToVecWPermute<v8i16, ScalarLoads.ZELi8,
+ (VSPLTHs 3, (LXSIBZX xoaddr:$src)),
+ (VSPLTHs 3, (LXSIBZX xoaddr:$src))>;
+defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi8,
+ (XXSPLTWs (LXSIBZX xoaddr:$src), 1),
+ (XXSPLTWs (LXSIBZX xoaddr:$src), 1)>;
+defm : ScalToVecWPermute<v2i64, ScalarLoads.ZELi8i64,
+ (XXPERMDIs (LXSIBZX xoaddr:$src), 0),
+ (XXPERMDIs (LXSIBZX xoaddr:$src), 0)>;
+defm : ScalToVecWPermute<v4i32, ScalarLoads.SELi8,
+ (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1),
+ (XXSPLTWs (VEXTSB2Ws (LXSIBZX xoaddr:$src)), 1)>;
+defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi8i64,
+ (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0),
+ (XXPERMDIs (VEXTSB2Ds (LXSIBZX xoaddr:$src)), 0)>;
+
+// Build vectors from i16 loads
+defm : ScalToVecWPermute<v8i16, ScalarLoads.Li16,
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src)),
+ (VSPLTHs 3, (LXSIHZX xoaddr:$src))>;
+defm : ScalToVecWPermute<v4i32, ScalarLoads.ZELi16,
+ (XXSPLTWs (LXSIHZX xoaddr:$src), 1),
+ (XXSPLTWs (LXSIHZX xoaddr:$src), 1)>;
+defm : ScalToVecWPermute<v2i64, ScalarLoads.ZELi16i64,
+ (XXPERMDIs (LXSIHZX xoaddr:$src), 0),
+ (XXPERMDIs (LXSIHZX xoaddr:$src), 0)>;
+defm : ScalToVecWPermute<v4i32, ScalarLoads.SELi16,
+ (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1),
+ (XXSPLTWs (VEXTSH2Ws (LXSIHZX xoaddr:$src)), 1)>;
+defm : ScalToVecWPermute<v2i64, ScalarLoads.SELi16i64,
+ (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0),
+ (XXPERMDIs (VEXTSH2Ds (LXSIHZX xoaddr:$src)), 0)>;
+
+// Load/convert and convert/store patterns for f16.
+def : Pat<(f64 (extloadf16 xoaddr:$src)),
+ (f64 (XSCVHPDP (LXSIHZX xoaddr:$src)))>;
+def : Pat<(truncstoref16 f64:$src, xoaddr:$dst),
+ (STXSIHX (XSCVDPHP $src), xoaddr:$dst)>;
+def : Pat<(f32 (extloadf16 xoaddr:$src)),
+ (f32 (COPY_TO_REGCLASS (XSCVHPDP (LXSIHZX xoaddr:$src)), VSSRC))>;
+def : Pat<(truncstoref16 f32:$src, xoaddr:$dst),
+ (STXSIHX (XSCVDPHP (COPY_TO_REGCLASS $src, VSFRC)), xoaddr:$dst)>;
+def : Pat<(f64 (f16_to_fp i32:$A)),
+ (f64 (XSCVHPDP (MTVSRWZ $A)))>;
+def : Pat<(f32 (f16_to_fp i32:$A)),
+ (f32 (COPY_TO_REGCLASS (XSCVHPDP (MTVSRWZ $A)), VSSRC))>;
+def : Pat<(i32 (fp_to_f16 f32:$A)),
+ (i32 (MFVSRWZ (XSCVDPHP (COPY_TO_REGCLASS $A, VSFRC))))>;
+def : Pat<(i32 (fp_to_f16 f64:$A)), (i32 (MFVSRWZ (XSCVDPHP $A)))>;
+
+// Vector sign extensions
+def : Pat<(f64 (PPCVexts f64:$A, 1)),
+ (f64 (COPY_TO_REGCLASS (VEXTSB2Ds $A), VSFRC))>;
+def : Pat<(f64 (PPCVexts f64:$A, 2)),
+ (f64 (COPY_TO_REGCLASS (VEXTSH2Ds $A), VSFRC))>;
+
+def : Pat<(f64 (extloadf32 iaddrX4:$src)),
+ (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$src), VSFRC)>;
+def : Pat<(f32 (fpround (f64 (extloadf32 iaddrX4:$src)))),
+ (f32 (DFLOADf32 iaddrX4:$src))>;
+
+def : Pat<(v4f32 (PPCldvsxlh xaddr:$src)),
+ (COPY_TO_REGCLASS (XFLOADf64 xaddr:$src), VSRC)>;
+def : Pat<(v4f32 (PPCldvsxlh iaddrX4:$src)),
+ (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC)>;
+
+// Convert (Un)Signed DWord in memory -> QP
+def : Pat<(f128 (sint_to_fp (i64 (load xaddrX4:$src)))),
+ (f128 (XSCVSDQP (LXSDX xaddrX4:$src)))>;
+def : Pat<(f128 (sint_to_fp (i64 (load iaddrX4:$src)))),
+ (f128 (XSCVSDQP (LXSD iaddrX4:$src)))>;
+def : Pat<(f128 (uint_to_fp (i64 (load xaddrX4:$src)))),
+ (f128 (XSCVUDQP (LXSDX xaddrX4:$src)))>;
+def : Pat<(f128 (uint_to_fp (i64 (load iaddrX4:$src)))),
+ (f128 (XSCVUDQP (LXSD iaddrX4:$src)))>;
+
+// Convert Unsigned HWord in memory -> QP
+def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi16)),
+ (f128 (XSCVUDQP (LXSIHZX xaddr:$src)))>;
+
+// Convert Unsigned Byte in memory -> QP
+def : Pat<(f128 (uint_to_fp ScalarLoads.ZELi8)),
+ (f128 (XSCVUDQP (LXSIBZX xoaddr:$src)))>;
+
+// Truncate & Convert QP -> (Un)Signed (D)Word.
+def : Pat<(i64 (fp_to_sint f128:$src)), (i64 (MFVRD (XSCVQPSDZ $src)))>;
+def : Pat<(i64 (fp_to_uint f128:$src)), (i64 (MFVRD (XSCVQPUDZ $src)))>;
+def : Pat<(i32 (fp_to_sint f128:$src)),
+ (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC)))>;
+def : Pat<(i32 (fp_to_uint f128:$src)),
+ (i32 (MFVSRWZ (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC)))>;
+
+// Instructions for store(fptosi).
+// The 8-byte version is repeated here due to availability of D-Form STXSD.
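+// That is, addresses with a suitable 4-byte-aligned displacement (iaddrX4)
+// use the D-Form STXSD, while indexed addresses (xaddrX4) keep using the
+// X-Form STXSDX as on earlier subtargets.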
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xaddrX4:$dst, 8),
+ (STXSDX (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ xaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), iaddrX4:$dst, 8),
+ (STXSD (COPY_TO_REGCLASS (XSCVQPSDZ f128:$src), VFRC),
+ iaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f128:$src)), xoaddr:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPSWZ $src), VFRC), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xaddrX4:$dst, 8),
+ (STXSDX (XSCVDPSXDS f64:$src), xaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), iaddrX4:$dst, 8),
+ (STXSD (XSCVDPSXDS f64:$src), iaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 2),
+ (STXSIHX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_sint_in_vsr f64:$src)), xoaddr:$dst, 1),
+ (STXSIBX (XSCVDPSXWS f64:$src), xoaddr:$dst)>;
+
+// Instructions for store(fptoui).
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xaddrX4:$dst, 8),
+ (STXSDX (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ xaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), iaddrX4:$dst, 8),
+ (STXSD (COPY_TO_REGCLASS (XSCVQPUDZ f128:$src), VFRC),
+ iaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 4),
+ (STXSIWX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 2),
+ (STXSIHX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f128:$src)), xoaddr:$dst, 1),
+ (STXSIBX (COPY_TO_REGCLASS (XSCVQPUWZ $src), VFRC), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xaddrX4:$dst, 8),
+ (STXSDX (XSCVDPUXDS f64:$src), xaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), iaddrX4:$dst, 8),
+ (STXSD (XSCVDPUXDS f64:$src), iaddrX4:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 2),
+ (STXSIHX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+def : Pat<(PPCstore_scal_int_from_vsr
+ (f64 (PPCcv_fp_to_uint_in_vsr f64:$src)), xoaddr:$dst, 1),
+ (STXSIBX (XSCVDPUXWS f64:$src), xoaddr:$dst)>;
+
+// Round & Convert QP -> DP/SP
+def : Pat<(f64 (any_fpround f128:$src)), (f64 (XSCVQPDP $src))>;
+def : Pat<(f32 (any_fpround f128:$src)), (f32 (XSRSP (XSCVQPDPO $src)))>;
+
+// Convert SP -> QP
+def : Pat<(f128 (any_fpextend f32:$src)),
+ (f128 (XSCVDPQP (COPY_TO_REGCLASS $src, VFRC)))>;
+
+def : Pat<(f32 (PPCxsmaxc f32:$XA, f32:$XB)),
+ (f32 (COPY_TO_REGCLASS (XSMAXCDP (COPY_TO_REGCLASS $XA, VSSRC),
+ (COPY_TO_REGCLASS $XB, VSSRC)),
+ VSSRC))>;
+def : Pat<(f32 (PPCxsminc f32:$XA, f32:$XB)),
+ (f32 (COPY_TO_REGCLASS (XSMINCDP (COPY_TO_REGCLASS $XA, VSSRC),
+ (COPY_TO_REGCLASS $XB, VSSRC)),
+ VSSRC))>;
+
+// Endianness-neutral patterns for const splats with ISA 3.0 instructions.
+defm : ScalToVecWPermute<v4i32, (i32 i32:$A), (MTVSRWS $A), (MTVSRWS $A)>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$A, i32:$A, i32:$A)),
+ (v4i32 (MTVSRWS $A))>;
+def : Pat<(v16i8 (build_vector immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A,
+ immNonAllOneAnyExt8:$A, immNonAllOneAnyExt8:$A)),
+ (v16i8 (COPY_TO_REGCLASS (XXSPLTIB imm:$A), VSRC))>;
+defm : ScalToVecWPermute<v4i32, FltToIntLoad.A,
+ (XVCVSPSXWS (LXVWSX xoaddr:$A)),
+ (XVCVSPSXWS (LXVWSX xoaddr:$A))>;
+defm : ScalToVecWPermute<v4i32, FltToUIntLoad.A,
+ (XVCVSPUXWS (LXVWSX xoaddr:$A)),
+ (XVCVSPUXWS (LXVWSX xoaddr:$A))>;
+defm : ScalToVecWPermute<
+ v4i32, DblToIntLoadP9.A,
+ (XXSPLTW (COPY_TO_REGCLASS (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPSXWS (DFLOADf64 iaddrX4:$A)), sub_64)>;
+defm : ScalToVecWPermute<
+ v4i32, DblToUIntLoadP9.A,
+ (XXSPLTW (COPY_TO_REGCLASS (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), VSRC), 1),
+ (SUBREG_TO_REG (i64 1), (XSCVDPUXWS (DFLOADf64 iaddrX4:$A)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, FltToLongLoadP9.A,
+ (XXPERMDIs (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0),
+ (SUBREG_TO_REG
+ (i64 1),
+ (XSCVDPSXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, FltToULongLoadP9.A,
+ (XXPERMDIs (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), 0),
+ (SUBREG_TO_REG
+ (i64 1),
+ (XSCVDPUXDS (COPY_TO_REGCLASS (DFLOADf32 iaddrX4:$A), VSFRC)), sub_64)>;
+def : Pat<(v4f32 (PPCldsplat xoaddr:$A)),
+ (v4f32 (LXVWSX xoaddr:$A))>;
+def : Pat<(v4i32 (PPCldsplat xoaddr:$A)),
+ (v4i32 (LXVWSX xoaddr:$A))>;
+} // HasVSX, HasP9Vector
+
+// Big endian Power9 subtarget.
+let Predicates = [HasVSX, HasP9Vector, IsBigEndian] in {
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+
+// Scalar stores of i8
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
+
+// Scalar stores of i16
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
+
+def : Pat<(v2i64 (scalar_to_vector (i64 (load iaddrX4:$src)))),
+ (v2i64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
+def : Pat<(v2i64 (scalar_to_vector (i64 (load xaddrX4:$src)))),
+ (v2i64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
+
+def : Pat<(v2f64 (scalar_to_vector (f64 (load iaddrX4:$src)))),
+ (v2f64 (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSRC))>;
+def : Pat<(v2f64 (scalar_to_vector (f64 (load xaddrX4:$src)))),
+ (v2f64 (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSRC))>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), iaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), iaddrX4:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+
+// (Un)Signed DWord vector extract -> QP
+def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+
+// (Un)Signed Word vector extract -> QP
+def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 1)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
+foreach Idx = [0,2,3] in {
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSW2D (VSPLTW Idx, $src)), sub_64)))>;
+}
+foreach Idx = 0-3 in {
+ def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, Idx)))),
+ (f128 (XSCVUDQP (XXEXTRACTUW $src, !shl(Idx, 2))))>;
+}
+
+// (Un)Signed HWord vector extract -> QP
+foreach Idx = 0-7 in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, Idx), i16)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSH2D (VEXTRACTUH !add(Idx, Idx), $src)),
+ sub_64)))>;
+ // The SDAG adds the `and` since an `i16` is being extracted as an `i32`.
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v8i16:$src, Idx)), 65535))),
+ (f128 (XSCVUDQP (EXTRACT_SUBREG
+ (VEXTRACTUH !add(Idx, Idx), $src), sub_64)))>;
+}
+
+// (Un)Signed Byte vector extract -> QP
+foreach Idx = 0-15 in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg (vector_extract v16i8:$src, Idx),
+ i8)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSB2D (VEXTRACTUB Idx, $src)), sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v16i8:$src, Idx)), 255))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (VEXTRACTUB Idx, $src), sub_64)))>;
+}
+
+// Unsigned int in VSX register -> QP
+def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP
+ (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 4)))>;
+} // HasVSX, HasP9Vector, IsBigEndian
+
+// Little endian Power9 subtarget.
+let Predicates = [HasVSX, HasP9Vector, IsLittleEndian] in {
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 12)))>;
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 8)))>;
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 4)))>;
+def : Pat<(f32 (PPCfcfidus (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
+ (f32 (XSCVUXDSP (XXEXTRACTUW $A, 0)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 0)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 12)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 1)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 8)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 2)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 4)))>;
+def : Pat<(f64 (PPCfcfidu (f64 (PPCmtvsrz (i32 (extractelt v4i32:$A, 3)))))),
+ (f64 (XSCVUXDDP (XXEXTRACTUW $A, 0)))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 0)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 12))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 1)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 8))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 2)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 4))>;
+def : Pat<(v4i32 (insertelt v4i32:$A, i32:$B, 3)),
+ (v4i32 (XXINSERTW v4i32:$A, AlignValues.I32_TO_BE_WORD1, 0))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 0)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 12))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 1)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 8))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 2)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 4))>;
+def : Pat<(v4f32 (insertelt v4f32:$A, f32:$B, 3)),
+ (v4f32 (XXINSERTW v4f32:$A, AlignValues.F32_TO_BE_WORD1, 0))>;
+
+def : Pat<(v8i16 (PPCld_vec_be xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVH8X xoaddr:$src), VRRC)>;
+def : Pat<(PPCst_vec_be v8i16:$rS, xoaddr:$dst),
+ (STXVH8X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
+
+def : Pat<(v16i8 (PPCld_vec_be xoaddr:$src)),
+ (COPY_TO_REGCLASS (LXVB16X xoaddr:$src), VRRC)>;
+def : Pat<(PPCst_vec_be v16i8:$rS, xoaddr:$dst),
+ (STXVB16X (COPY_TO_REGCLASS $rS, VSRC), xoaddr:$dst)>;
+
+// Scalar stores of i8
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 0)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 1)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 7)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 2)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 3)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 5)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 4)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 5)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 3)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 6)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 7)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 1)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 8)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 9)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 15)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 10)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 11)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 13)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 12)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 13)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 11)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 14)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei8 (i32 (vector_extract v16i8:$S, 15)), xoaddr:$dst),
+ (STXSIBXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 9)), VSRC), xoaddr:$dst)>;
+
+// Scalar stores of i16
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 0)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 8)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 1)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 6)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 2)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 4)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 3)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 2)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 4)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS $S, VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 5)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 14)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 6)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 12)), VSRC), xoaddr:$dst)>;
+def : Pat<(truncstorei16 (i32 (vector_extract v8i16:$S, 7)), xoaddr:$dst),
+ (STXSIHXv (COPY_TO_REGCLASS (v16i8 (VSLDOI $S, $S, 10)), VSRC), xoaddr:$dst)>;
+
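Illustration (not part of the patch): the VSLDOI shift amounts in the two store tables above follow one rule — rotate the vector left by (8 - byte_offset) mod 16 bytes so the requested element lands in the lane the scalar store reads from. A minimal host-side sketch; `vsldoiShiftLE` is a hypothetical helper, and the constant 8 is taken from the tables rather than derived from the ISA here.

    #include <cassert>

    // Byte rotate amount fed to VSLDOI when storing element `Elt` of a vector
    // whose elements are `EltBytes` wide (1 for v16i8, 2 for v8i16), assuming
    // the little-endian layout used by the patterns above. A result of 0
    // corresponds to the table entries that use no VSLDOI at all.
    static unsigned vsldoiShiftLE(unsigned Elt, unsigned EltBytes) {
      unsigned ByteOff = Elt * EltBytes;
      return (16 + 8 - ByteOff) % 16;
    }

    int main() {
      assert(vsldoiShiftLE(0, 1) == 8);   // (VSLDOI $S, $S, 8)
      assert(vsldoiShiftLE(8, 1) == 0);   // element 8 is stored directly
      assert(vsldoiShiftLE(15, 1) == 9);  // (VSLDOI $S, $S, 9)
      assert(vsldoiShiftLE(5, 2) == 14);  // halfword 5 -> (VSLDOI $S, $S, 14)
      return 0;
    }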
+defm : ScalToVecWPermute<
+ v2i64, (i64 (load iaddrX4:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (DFLOADf64 iaddrX4:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v2i64, (i64 (load xaddrX4:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 xaddrX4:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v2f64, (f64 (load iaddrX4:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (DFLOADf64 iaddrX4:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (DFLOADf64 iaddrX4:$src), sub_64)>;
+defm : ScalToVecWPermute<
+ v2f64, (f64 (load xaddrX4:$src)),
+ (XXPERMDIs (COPY_TO_REGCLASS (XFLOADf64 xaddrX4:$src), VSFRC), 2),
+ (SUBREG_TO_REG (i64 1), (XFLOADf64 xaddrX4:$src), sub_64)>;
+
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), xaddrX4:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), xaddrX4:$src),
+ (XFSTOREf64 (EXTRACT_SUBREG $A, sub_64), xaddrX4:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2),
+ sub_64), iaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 0)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG (XXPERMDI $A, $A, 2), sub_64),
+ iaddrX4:$src)>;
+def : Pat<(store (i64 (extractelt v2i64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+def : Pat<(store (f64 (extractelt v2f64:$A, 1)), iaddrX4:$src),
+ (DFSTOREf64 (EXTRACT_SUBREG $A, sub_64), iaddrX4:$src)>;
+
+// (Un)Signed DWord vector extract -> QP
+def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+def : Pat<(f128 (sint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVSDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 0)))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG (XXPERMDI $src, $src, 3), sub_64)))>;
+def : Pat<(f128 (uint_to_fp (i64 (extractelt v2i64:$src, 1)))),
+ (f128 (XSCVUDQP (COPY_TO_REGCLASS $src, VFRC)))>;
+
+// (Un)Signed Word vector extract -> QP
+foreach Idx = [[0,3],[1,2],[3,0]] in {
+ def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG
+ (VEXTSW2D (VSPLTW !head(!tail(Idx)), $src)),
+ sub_64)))>;
+}
+def : Pat<(f128 (sint_to_fp (i32 (extractelt v4i32:$src, 2)))),
+ (f128 (XSCVSDQP (EXTRACT_SUBREG (VEXTSW2D $src), sub_64)))>;
- // As PPCVABSD description, the last operand indicates whether do the
- // sign bit flip.
- def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
- (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
+foreach Idx = [[0,12],[1,8],[2,4],[3,0]] in {
+ def : Pat<(f128 (uint_to_fp (i32 (extractelt v4i32:$src, !head(Idx))))),
+ (f128 (XSCVUDQP (XXEXTRACTUW $src, !head(!tail(Idx)))))>;
}
+
+// (Un)Signed HWord vector extract -> QP
+// The nested foreach lists identify the vector element and the corresponding
+// register byte location.
+foreach Idx = [[0,14],[1,12],[2,10],[3,8],[4,6],[5,4],[6,2],[7,0]] in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v8i16:$src, !head(Idx)), i16)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG (VEXTSH2D
+ (VEXTRACTUH !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v8i16:$src, !head(Idx))),
+ 65535))),
+ (f128 (XSCVUDQP (EXTRACT_SUBREG
+ (VEXTRACTUH !head(!tail(Idx)), $src), sub_64)))>;
+}
+
+// (Un)Signed Byte vector extract -> QP
+foreach Idx = [[0,15],[1,14],[2,13],[3,12],[4,11],[5,10],[6,9],[7,8],[8,7],
+ [9,6],[10,5],[11,4],[12,3],[13,2],[14,1],[15,0]] in {
+ def : Pat<(f128 (sint_to_fp
+ (i32 (sext_inreg
+ (vector_extract v16i8:$src, !head(Idx)), i8)))),
+ (f128 (XSCVSDQP
+ (EXTRACT_SUBREG
+ (VEXTSB2D (VEXTRACTUB !head(!tail(Idx)), $src)),
+ sub_64)))>;
+ def : Pat<(f128 (uint_to_fp
+ (and (i32 (vector_extract v16i8:$src, !head(Idx))),
+ 255))),
+ (f128 (XSCVUDQP
+ (EXTRACT_SUBREG
+ (VEXTRACTUB !head(!tail(Idx)), $src), sub_64)))>;
+}
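Illustration (not part of the patch): the [element, byte] pairs in the two foreach lists above reduce to simple formulas — byte = 14 - 2*element for v8i16 and byte = 15 - element for v16i8. A small sketch that reproduces the lists; the helper names are made up for this sketch.

    #include <cassert>

    // Little-endian element index -> register byte offset, as enumerated by
    // the halfword and byte lists above.
    static unsigned halfwordByteLE(unsigned Elt) { return 14 - 2 * Elt; }
    static unsigned byteByteLE(unsigned Elt) { return 15 - Elt; }

    int main() {
      assert(halfwordByteLE(0) == 14 && halfwordByteLE(7) == 0); // [0,14]..[7,0]
      assert(byteByteLE(0) == 15 && byteByteLE(15) == 0);        // [0,15]..[15,0]
      return 0;
    }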
+
+// Unsigned int in a VSX register -> QP
+def : Pat<(f128 (uint_to_fp (i32 (PPCmfvsr f64:$src)))),
+ (f128 (XSCVUDQP
+ (XXEXTRACTUW (SUBREG_TO_REG (i64 1), $src, sub_64), 8)))>;
+} // HasVSX, HasP9Vector, IsLittleEndian
+
+// Any Power9 VSX subtarget that supports Power9 Altivec.
+let Predicates = [HasVSX, HasP9Altivec] in {
+// Put this P9Altivec-related definition here, since it may be selected into
+// the VSX instruction xvnegsp, to avoid a possible undef.
+def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 0))),
+ (v4i32 (VABSDUW $A, $B))>;
+
+def : Pat<(v8i16 (PPCvabsd v8i16:$A, v8i16:$B, (i32 0))),
+ (v8i16 (VABSDUH $A, $B))>;
+
+def : Pat<(v16i8 (PPCvabsd v16i8:$A, v16i8:$B, (i32 0))),
+ (v16i8 (VABSDUB $A, $B))>;
+
+// As described for PPCvabsd, the last operand indicates whether to do the
+// sign bit flip.
+def : Pat<(v4i32 (PPCvabsd v4i32:$A, v4i32:$B, (i32 1))),
+ (v4i32 (VABSDUW (XVNEGSP $A), (XVNEGSP $B)))>;
+} // HasVSX, HasP9Altivec
+
+// Big endian Power9 VSX subtargets with P9 Altivec support.
+let Predicates = [HasVSX, HasP9Altivec, IsBigEndian] in {
+def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
+ (VEXTUBLX $Idx, $S)>;
+
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
+ (VEXTUHLX (RLWINM8 $Idx, 1, 28, 30), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
+ (VEXTUHLX (LI8 0), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
+ (VEXTUHLX (LI8 2), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
+ (VEXTUHLX (LI8 4), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
+ (VEXTUHLX (LI8 6), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
+ (VEXTUHLX (LI8 8), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
+ (VEXTUHLX (LI8 10), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
+ (VEXTUHLX (LI8 12), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
+ (VEXTUHLX (LI8 14), $S)>;
+
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S)>;
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
+ (VEXTUWLX (LI8 0), $S)>;
+
+// For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32)>;
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
+ (VEXTUWLX (LI8 8), $S)>;
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
+ (VEXTUWLX (LI8 12), $S)>;
+
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (EXTSW (VEXTUWLX (RLWINM8 $Idx, 2, 28, 29), $S))>;
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
+ (EXTSW (VEXTUWLX (LI8 0), $S))>;
+// For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
+ (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32))>;
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
+ (EXTSW (VEXTUWLX (LI8 8), $S))>;
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
+ (EXTSW (VEXTUWLX (LI8 12), $S))>;
+
+def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX $Idx, $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 0), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 1), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 2), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 3), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 4), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 5), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 6), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 7), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 8), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 9), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 10), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 11), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 12), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 13), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 14), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 (EXTRACT_SUBREG (VEXTUBLX (LI8 15), $S), sub_32))>;
+
+def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX
+ (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 0), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 2), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 4), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 6), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 8), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 10), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 12), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+          (i32 (EXTRACT_SUBREG (VEXTUHLX (LI8 14), $S), sub_32))>;
+
+def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX
+ (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 0), $S), sub_32))>;
+// For extracting BE word 1, MFVSRWZ is better than VEXTUWLX
+def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 8), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUWLX (LI8 12), $S), sub_32))>;
+
+// P9 Altivec instructions that can be used to build vectors.
+// Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td so they can
+// compete with the complexities of the existing build_vector patterns here.
+def : Pat<(v2i64 (build_vector WordToDWord.BE_A0, WordToDWord.BE_A1)),
+ (v2i64 (VEXTSW2D $A))>;
+def : Pat<(v2i64 (build_vector HWordToDWord.BE_A0, HWordToDWord.BE_A1)),
+ (v2i64 (VEXTSH2D $A))>;
+def : Pat<(v4i32 (build_vector HWordToWord.BE_A0, HWordToWord.BE_A1,
+ HWordToWord.BE_A2, HWordToWord.BE_A3)),
+ (v4i32 (VEXTSH2W $A))>;
+def : Pat<(v4i32 (build_vector ByteToWord.BE_A0, ByteToWord.BE_A1,
+ ByteToWord.BE_A2, ByteToWord.BE_A3)),
+ (v4i32 (VEXTSB2W $A))>;
+def : Pat<(v2i64 (build_vector ByteToDWord.BE_A0, ByteToDWord.BE_A1)),
+ (v2i64 (VEXTSB2D $A))>;
+} // HasVSX, HasP9Altivec, IsBigEndian
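Illustration (not part of the patch): in the variable-index extract patterns above, RLWINM8 turns the element index into a byte offset — shift left by 1 (halfwords) or 2 (words) and mask to the valid range, which is what the (SH, MB, ME) triples (1, 28, 30) and (2, 28, 29) encode. A standalone model of just that rotate-and-mask step; it ignores the full 64-bit semantics of RLWINM8 and assumes MB <= ME.

    #include <cassert>
    #include <cstdint>

    // Rough model of RLWINM8 rD, rS, SH, MB, ME as used above: rotate the low
    // 32 bits left by SH, then AND with the mask covering IBM-numbered bits
    // MB..ME (bit 0 is the MSB of the 32-bit field).
    static uint64_t rlwinmModel(uint64_t RS, unsigned SH, unsigned MB,
                                unsigned ME) {
      uint32_t W = static_cast<uint32_t>(RS);
      uint32_t Rot = SH ? ((W << SH) | (W >> (32 - SH))) : W;
      uint32_t Mask = 0;
      for (unsigned B = MB; B <= ME; ++B)
        Mask |= 1u << (31 - B);
      return Rot & Mask;
    }

    int main() {
      // Halfword extract: byte offset = Idx * 2, kept in 0..14.
      assert(rlwinmModel(5, 1, 28, 30) == 10);
      // Word extract: byte offset = Idx * 4, kept in 0..12.
      assert(rlwinmModel(3, 2, 28, 29) == 12);
      return 0;
    }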
+
+// Little endian Power9 VSX subtargets with P9 Altivec support.
+let Predicates = [HasVSX, HasP9Altivec, IsLittleEndian] in {
+def : Pat<(i64 (anyext (i32 (vector_extract v16i8:$S, i64:$Idx)))),
+ (VEXTUBRX $Idx, $S)>;
+
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, i64:$Idx)))),
+ (VEXTUHRX (RLWINM8 $Idx, 1, 28, 30), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 0)))),
+ (VEXTUHRX (LI8 0), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 1)))),
+ (VEXTUHRX (LI8 2), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 2)))),
+ (VEXTUHRX (LI8 4), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 3)))),
+ (VEXTUHRX (LI8 6), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 4)))),
+ (VEXTUHRX (LI8 8), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 5)))),
+ (VEXTUHRX (LI8 10), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 6)))),
+ (VEXTUHRX (LI8 12), $S)>;
+def : Pat<(i64 (anyext (i32 (vector_extract v8i16:$S, 7)))),
+ (VEXTUHRX (LI8 14), $S)>;
+
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S)>;
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 0)))),
+ (VEXTUWRX (LI8 0), $S)>;
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 1)))),
+ (VEXTUWRX (LI8 4), $S)>;
+// For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 2)))),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32)>;
+def : Pat<(i64 (zext (i32 (vector_extract v4i32:$S, 3)))),
+ (VEXTUWRX (LI8 12), $S)>;
+
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, i64:$Idx)))),
+ (EXTSW (VEXTUWRX (RLWINM8 $Idx, 2, 28, 29), $S))>;
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 0)))),
+ (EXTSW (VEXTUWRX (LI8 0), $S))>;
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 1)))),
+ (EXTSW (VEXTUWRX (LI8 4), $S))>;
+// For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 2)))),
+ (EXTSW (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (i32 VectorExtractions.LE_WORD_2), sub_32))>;
+def : Pat<(i64 (sext (i32 (vector_extract v4i32:$S, 3)))),
+ (EXTSW (VEXTUWRX (LI8 12), $S))>;
+
+def : Pat<(i32 (vector_extract v16i8:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX $Idx, $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 0), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 1), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 2), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 3), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 4), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 5), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 6), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 7)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 7), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 8)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 8), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 9)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 9), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 10)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 10), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 11)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 11), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 12)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 12), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 13)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 13), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 14)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 14), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v16i8:$S, 15)),
+ (i32 (EXTRACT_SUBREG (VEXTUBRX (LI8 15), $S), sub_32))>;
+
+def : Pat<(i32 (vector_extract v8i16:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX
+ (RLWINM8 $Idx, 1, 28, 30), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 0), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 2), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 2)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 4), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 6), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 4)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 8), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 5)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 10), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 6)),
+ (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 12), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v8i16:$S, 7)),
+          (i32 (EXTRACT_SUBREG (VEXTUHRX (LI8 14), $S), sub_32))>;
+
+def : Pat<(i32 (vector_extract v4i32:$S, i64:$Idx)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX
+ (RLWINM8 $Idx, 2, 28, 29), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v4i32:$S, 0)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 0), $S), sub_32))>;
+def : Pat<(i32 (vector_extract v4i32:$S, 1)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 4), $S), sub_32))>;
+// For extracting LE word 2, MFVSRWZ is better than VEXTUWRX
+def : Pat<(i32 (vector_extract v4i32:$S, 2)),
+ (i32 VectorExtractions.LE_WORD_2)>;
+def : Pat<(i32 (vector_extract v4i32:$S, 3)),
+ (i32 (EXTRACT_SUBREG (VEXTUWRX (LI8 12), $S), sub_32))>;
+
+// P9 Altivec instructions that can be used to build vectors.
+// Adding them to PPCInstrVSX.td rather than PPCAltivecVSX.td so they can
+// compete with the complexities of the existing build_vector patterns here.
+def : Pat<(v2i64 (build_vector WordToDWord.LE_A0, WordToDWord.LE_A1)),
+ (v2i64 (VEXTSW2D $A))>;
+def : Pat<(v2i64 (build_vector HWordToDWord.LE_A0, HWordToDWord.LE_A1)),
+ (v2i64 (VEXTSH2D $A))>;
+def : Pat<(v4i32 (build_vector HWordToWord.LE_A0, HWordToWord.LE_A1,
+ HWordToWord.LE_A2, HWordToWord.LE_A3)),
+ (v4i32 (VEXTSH2W $A))>;
+def : Pat<(v4i32 (build_vector ByteToWord.LE_A0, ByteToWord.LE_A1,
+ ByteToWord.LE_A2, ByteToWord.LE_A3)),
+ (v4i32 (VEXTSB2W $A))>;
+def : Pat<(v2i64 (build_vector ByteToDWord.LE_A0, ByteToDWord.LE_A1)),
+ (v2i64 (VEXTSB2D $A))>;
+} // HasVSX, HasP9Altivec, IsLittleEndian
+
+// Big endian VSX subtarget that supports additional direct moves from ISA3.0.
+let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsBigEndian] in {
+def : Pat<(i64 (extractelt v2i64:$A, 1)),
+ (i64 (MFVSRLD $A))>;
+// Better way to build integer vectors if we have MTVSRDD. Big endian.
+def : Pat<(v2i64 (build_vector i64:$rB, i64:$rA)),
+ (v2i64 (MTVSRDD $rB, $rA))>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (MTVSRDD
+ (RLDIMI AnyExts.B, AnyExts.A, 32, 0),
+ (RLDIMI AnyExts.D, AnyExts.C, 32, 0))>;
+
+def : Pat<(f128 (PPCbuild_fp128 i64:$rB, i64:$rA)),
+ (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
+} // HasVSX, IsISA3_0, HasDirectMove, IsBigEndian
+
+// Little endian VSX subtarget that supports direct moves from ISA3.0.
+let Predicates = [HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian] in {
+def : Pat<(i64 (extractelt v2i64:$A, 0)),
+ (i64 (MFVSRLD $A))>;
+// Better way to build integer vectors if we have MTVSRDD. Little endian.
+def : Pat<(v2i64 (build_vector i64:$rA, i64:$rB)),
+ (v2i64 (MTVSRDD $rB, $rA))>;
+def : Pat<(v4i32 (build_vector i32:$A, i32:$B, i32:$C, i32:$D)),
+ (MTVSRDD
+ (RLDIMI AnyExts.C, AnyExts.D, 32, 0),
+ (RLDIMI AnyExts.A, AnyExts.B, 32, 0))>;
+
+def : Pat<(f128 (PPCbuild_fp128 i64:$rA, i64:$rB)),
+ (f128 (COPY_TO_REGCLASS (MTVSRDD $rB, $rA), VRRC))>;
+} // HasVSX, IsISA3_0, HasDirectMove, IsLittleEndian
+} // AddedComplexity = 400
+
+//---------------------------- Instruction aliases ---------------------------//
+def : InstAlias<"xvmovdp $XT, $XB",
+ (XVCPSGNDP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+def : InstAlias<"xvmovsp $XT, $XB",
+ (XVCPSGNSP vsrc:$XT, vsrc:$XB, vsrc:$XB)>;
+
+def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 3)>;
+def : InstAlias<"xxmrghd $XT, $XA, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 0)>;
+def : InstAlias<"xxmrgld $XT, $XA, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XA, vsrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+ (XXPERMDI vsrc:$XT, vsrc:$XB, vsrc:$XB, 2)>;
+def : InstAlias<"xxspltd $XT, $XB, 0",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 0)>;
+def : InstAlias<"xxspltd $XT, $XB, 1",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 3)>;
+def : InstAlias<"xxswapd $XT, $XB",
+ (XXPERMDIs vsrc:$XT, vsfrc:$XB, 2)>;
+def : InstAlias<"mfvrd $rA, $XT",
+ (MFVRD g8rc:$rA, vrrc:$XT), 0>;
+def : InstAlias<"mffprd $rA, $src",
+ (MFVSRD g8rc:$rA, f8rc:$src)>;
+def : InstAlias<"mtvrd $XT, $rA",
+ (MTVRD vrrc:$XT, g8rc:$rA), 0>;
+def : InstAlias<"mtfprd $dst, $rA",
+ (MTVSRD f8rc:$dst, g8rc:$rA)>;
+def : InstAlias<"mfvrwz $rA, $XT",
+ (MFVRWZ gprc:$rA, vrrc:$XT), 0>;
+def : InstAlias<"mffprwz $rA, $src",
+ (MFVSRWZ gprc:$rA, f8rc:$src)>;
+def : InstAlias<"mtvrwa $XT, $rA",
+ (MTVRWA vrrc:$XT, gprc:$rA), 0>;
+def : InstAlias<"mtfprwa $dst, $rA",
+ (MTVSRWA f8rc:$dst, gprc:$rA)>;
+def : InstAlias<"mtvrwz $XT, $rA",
+ (MTVRWZ vrrc:$XT, gprc:$rA), 0>;
+def : InstAlias<"mtfprwz $dst, $rA",
+ (MTVSRWZ f8rc:$dst, gprc:$rA)>;
diff --git a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
index b761f337533b..a7546d2be5d8 100644
--- a/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLoopInstrFormPrep.cpp
@@ -53,7 +53,6 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -74,6 +73,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
#include <iterator>
#include <utility>
@@ -244,10 +244,10 @@ INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(PPCLoopInstrFormPrep, DEBUG_TYPE, name, false, false)
-static const std::string PHINodeNameSuffix = ".phi";
-static const std::string CastNodeNameSuffix = ".cast";
-static const std::string GEPNodeIncNameSuffix = ".inc";
-static const std::string GEPNodeOffNameSuffix = ".off";
+static constexpr StringRef PHINodeNameSuffix = ".phi";
+static constexpr StringRef CastNodeNameSuffix = ".cast";
+static constexpr StringRef GEPNodeIncNameSuffix = ".inc";
+static constexpr StringRef GEPNodeOffNameSuffix = ".off";
FunctionPass *llvm::createPPCLoopInstrFormPrepPass(PPCTargetMachine &TM) {
return new PPCLoopInstrFormPrep(TM);
@@ -263,7 +263,7 @@ static bool IsPtrInBounds(Value *BasePtr) {
return false;
}
-static std::string getInstrName(const Value *I, const std::string Suffix) {
+static std::string getInstrName(const Value *I, StringRef Suffix) {
assert(I && "Invalid paramater!");
if (I->hasName())
return (I->getName() + Suffix).str();
diff --git a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
index 83cca11b27a3..2b0e604e0ccd 100644
--- a/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
+++ b/llvm/lib/Target/PowerPC/PPCLowerMASSVEntries.cpp
@@ -54,6 +54,7 @@ private:
static StringRef getCPUSuffix(const PPCSubtarget *Subtarget);
static std::string createMASSVFuncName(Function &Func,
const PPCSubtarget *Subtarget);
+ bool handlePowSpecialCases(CallInst *CI, Function &Func, Module &M);
bool lowerMASSVCall(CallInst *CI, Function &Func, Module &M,
const PPCSubtarget *Subtarget);
};
@@ -96,6 +97,34 @@ PPCLowerMASSVEntries::createMASSVFuncName(Function &Func,
return MASSVEntryName;
}
+/// If the call has the required fast-math flags, this function rewrites it as
+/// an llvm.pow intrinsic call when the exponent is 0.25 or 0.75.
+bool PPCLowerMASSVEntries::handlePowSpecialCases(CallInst *CI, Function &Func,
+ Module &M) {
+ if (Func.getName() != "__powf4_massv" && Func.getName() != "__powd2_massv")
+ return false;
+
+ if (Constant *Exp = dyn_cast<Constant>(CI->getArgOperand(1)))
+ if (ConstantFP *CFP = dyn_cast<ConstantFP>(Exp->getSplatValue())) {
+      // If the exponent is 0.75 or 0.25, it is cheaper to turn the call into
+      // the pow intrinsic so that it can be optimized into a sequence of
+      // sqrt's.
+ if (!CI->hasNoInfs() || !CI->hasApproxFunc())
+ return false;
+
+ if (!CFP->isExactlyValue(0.75) && !CFP->isExactlyValue(0.25))
+ return false;
+
+ if (CFP->isExactlyValue(0.25) && !CI->hasNoSignedZeros())
+ return false;
+
+ CI->setCalledFunction(
+ Intrinsic::getDeclaration(&M, Intrinsic::pow, CI->getType()));
+ return true;
+ }
+
+ return false;
+}
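Illustration (not part of the patch): the gating above amounts to — afn and ninf are always required, the exponent must be exactly 0.25 or 0.75, and 0.25 additionally requires nsz. A distilled standalone sketch; the helper name and plain-bool interface are illustrative, the real code reads the flags off the CallInst.

    #include <cassert>

    // Decide whether a MASSV pow call with a constant splat exponent may be
    // rewritten as the llvm.pow intrinsic (so it can later expand via sqrt).
    static bool mayRewriteAsPowIntrinsic(double Exp, bool NoInfs,
                                         bool ApproxFunc, bool NoSignedZeros) {
      if (!NoInfs || !ApproxFunc)
        return false;
      if (Exp != 0.75 && Exp != 0.25)
        return false;
      if (Exp == 0.25 && !NoSignedZeros)
        return false;
      return true;
    }

    int main() {
      assert(mayRewriteAsPowIntrinsic(0.75, true, true, false));  // pow(x, 0.75)
      assert(!mayRewriteAsPowIntrinsic(0.25, true, true, false)); // needs nsz
      assert(mayRewriteAsPowIntrinsic(0.25, true, true, true));
      assert(!mayRewriteAsPowIntrinsic(0.5, true, true, true));   // not special
      return 0;
    }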
+
/// Lowers generic MASSV entries to PowerPC subtarget-specific MASSV entries.
/// e.g.: __sind2_massv --> __sind2_P9 for a Power9 subtarget.
/// Both function prototypes and their callsites are updated during lowering.
@@ -105,11 +134,14 @@ bool PPCLowerMASSVEntries::lowerMASSVCall(CallInst *CI, Function &Func,
if (CI->use_empty())
return false;
+ // Handling pow(x, 0.25), pow(x, 0.75), powf(x, 0.25), powf(x, 0.75)
+ if (handlePowSpecialCases(CI, Func, M))
+ return true;
+
std::string MASSVEntryName = createMASSVFuncName(Func, Subtarget);
FunctionCallee FCache = M.getOrInsertFunction(
MASSVEntryName, Func.getFunctionType(), Func.getAttributes());
- CallSite CS(CI);
CI->setCalledFunction(FCache);
return true;
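Illustration (not part of the patch): the renaming performed by createMASSVFuncName is, in effect, "drop the generic _massv suffix and append the subtarget suffix". A rough standalone sketch of that scheme; the helper name is hypothetical and the exact suffix handling is an assumption based on the example in the comment above.

    #include <cassert>
    #include <string>

    // Hypothetical sketch: drop the trailing "_massv" and append "_<suffix>".
    // The real pass also rewrites the callee while keeping the prototype and
    // attributes of the generic entry.
    static std::string massvNameForSubtarget(const std::string &Generic,
                                             const std::string &CPUSuffix) {
      const std::string Tag = "_massv";
      if (Generic.size() < Tag.size() ||
          Generic.compare(Generic.size() - Tag.size(), Tag.size(), Tag) != 0)
        return Generic; // not a generic MASSV entry; leave it alone
      return Generic.substr(0, Generic.size() - Tag.size()) + "_" + CPUSuffix;
    }

    int main() {
      assert(massvNameForSubtarget("__sind2_massv", "P9") == "__sind2_P9");
      assert(massvNameForSubtarget("__powf4_massv", "P8") == "__powf4_P8");
      return 0;
    }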
diff --git a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
index b6496f189a3a..236f98f32e18 100644
--- a/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMCInstLower.cpp
@@ -29,10 +29,6 @@
#include "llvm/Target/TargetLoweringObjectFile.h"
using namespace llvm;
-static MachineModuleInfoMachO &getMachOMMI(AsmPrinter &AP) {
- return AP.MMI->getObjFileInfo<MachineModuleInfoMachO>();
-}
-
static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
AsmPrinter &AP) {
const TargetMachine &TM = AP.TM;
@@ -41,13 +37,6 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
MCContext &Ctx = AP.OutContext;
SmallString<128> Name;
- StringRef Suffix;
- if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG)
- Suffix = "$non_lazy_ptr";
-
- if (!Suffix.empty())
- Name += DL.getPrivateGlobalPrefix();
-
if (!MO.isGlobal()) {
assert(MO.isSymbol() && "Isn't a symbol reference");
Mangler::getNameWithPrefix(Name, MO.getSymbolName(), DL);
@@ -56,30 +45,13 @@ static MCSymbol *GetSymbolFromOperand(const MachineOperand &MO,
TM.getNameWithPrefix(Name, GV, Mang);
}
- Name += Suffix;
MCSymbol *Sym = Ctx.getOrCreateSymbol(Name);
- // If the symbol reference is actually to a non_lazy_ptr, not to the symbol,
- // then add the suffix.
- if (MO.getTargetFlags() & PPCII::MO_NLP_FLAG) {
- MachineModuleInfoMachO &MachO = getMachOMMI(AP);
-
- MachineModuleInfoImpl::StubValueTy &StubSym = MachO.getGVStubEntry(Sym);
-
- if (!StubSym.getPointer()) {
- assert(MO.isGlobal() && "Extern symbol not handled yet");
- StubSym = MachineModuleInfoImpl::
- StubValueTy(AP.getSymbol(MO.getGlobal()),
- !MO.getGlobal()->hasInternalLinkage());
- }
- return Sym;
- }
-
return Sym;
}
static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
- AsmPrinter &Printer, bool IsDarwin) {
+ AsmPrinter &Printer) {
MCContext &Ctx = Printer.OutContext;
MCSymbolRefExpr::VariantKind RefKind = MCSymbolRefExpr::VK_None;
@@ -106,13 +78,30 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
break;
}
- if (MO.getTargetFlags() == PPCII::MO_PLT)
+ if (MO.getTargetFlags() == PPCII::MO_PLT)
RefKind = MCSymbolRefExpr::VK_PLT;
+ else if (MO.getTargetFlags() == PPCII::MO_PCREL_FLAG)
+ RefKind = MCSymbolRefExpr::VK_PCREL;
+ else if (MO.getTargetFlags() == (PPCII::MO_PCREL_FLAG | PPCII::MO_GOT_FLAG))
+ RefKind = MCSymbolRefExpr::VK_PPC_GOT_PCREL;
- const MachineFunction *MF = MO.getParent()->getParent()->getParent();
+ const MachineInstr *MI = MO.getParent();
+ const MachineFunction *MF = MI->getMF();
const Module *M = MF->getFunction().getParent();
const PPCSubtarget *Subtarget = &(MF->getSubtarget<PPCSubtarget>());
const TargetMachine &TM = Printer.TM;
+
+ unsigned MIOpcode = MI->getOpcode();
+ assert((Subtarget->isUsingPCRelativeCalls() || MIOpcode != PPC::BL8_NOTOC) &&
+ "BL8_NOTOC is only valid when using PC Relative Calls.");
+ if (Subtarget->isUsingPCRelativeCalls()) {
+ if (MIOpcode == PPC::TAILB || MIOpcode == PPC::TAILB8 ||
+ MIOpcode == PPC::TCRETURNdi || MIOpcode == PPC::TCRETURNdi8 ||
+ MIOpcode == PPC::BL8_NOTOC) {
+ RefKind = MCSymbolRefExpr::VK_PPC_NOTOC;
+ }
+ }
+
const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, RefKind, Ctx);
// If -msecure-plt -fPIC, add 32768 to symbol.
if (Subtarget->isSecurePlt() && TM.isPositionIndependent() &&
@@ -137,10 +126,10 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
// Add ha16() / lo16() markers if required.
switch (access) {
case PPCII::MO_LO:
- Expr = PPCMCExpr::createLo(Expr, IsDarwin, Ctx);
+ Expr = PPCMCExpr::createLo(Expr, Ctx);
break;
case PPCII::MO_HA:
- Expr = PPCMCExpr::createHa(Expr, IsDarwin, Ctx);
+ Expr = PPCMCExpr::createHa(Expr, Ctx);
break;
}
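Illustration (not part of the patch): the added flag handling selects the relocation variant from the operand's target flags, then forces @notoc for tail calls and BL8_NOTOC when the subtarget uses PC-relative calls. A standalone sketch of that selection order; the enum values are local stand-ins, not the real PPCII or MCSymbolRefExpr constants.

    #include <cassert>

    // Local stand-ins for the target flags and symbol-ref variant kinds.
    enum Flag : unsigned { FlagNone = 0, FlagPLT = 1, FlagPCRel = 2, FlagGOT = 4 };
    enum Variant { VK_None, VK_PLT, VK_PCREL, VK_GOT_PCREL, VK_NOTOC };

    // Mirrors the order above: flags pick the base variant, and tail calls /
    // BL8_NOTOC override it with @notoc when PC-relative calls are in use.
    static Variant pickVariant(unsigned Flags, bool UsingPCRelCalls,
                               bool IsNoTOCCallOpcode) {
      Variant V = VK_None;
      if (Flags == FlagPLT)
        V = VK_PLT;
      else if (Flags == FlagPCRel)
        V = VK_PCREL;
      else if (Flags == (FlagPCRel | FlagGOT))
        V = VK_GOT_PCREL;
      if (UsingPCRelCalls && IsNoTOCCallOpcode)
        V = VK_NOTOC;
      return V;
    }

    int main() {
      assert(pickVariant(FlagPCRel, false, false) == VK_PCREL);
      assert(pickVariant(FlagPCRel | FlagGOT, true, false) == VK_GOT_PCREL);
      assert(pickVariant(FlagNone, true, true) == VK_NOTOC);
      return 0;
    }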
@@ -148,20 +137,18 @@ static MCOperand GetSymbolRef(const MachineOperand &MO, const MCSymbol *Symbol,
}
void llvm::LowerPPCMachineInstrToMCInst(const MachineInstr *MI, MCInst &OutMI,
- AsmPrinter &AP, bool IsDarwin) {
+ AsmPrinter &AP) {
OutMI.setOpcode(MI->getOpcode());
for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) {
MCOperand MCOp;
- if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP,
- IsDarwin))
+ if (LowerPPCMachineOperandToMCOperand(MI->getOperand(i), MCOp, AP))
OutMI.addOperand(MCOp);
}
}
bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
- MCOperand &OutMO, AsmPrinter &AP,
- bool IsDarwin) {
+ MCOperand &OutMO, AsmPrinter &AP) {
switch (MO.getType()) {
default:
llvm_unreachable("unknown operand type");
@@ -170,6 +157,9 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
assert(MO.getReg() > PPC::NoRegister &&
MO.getReg() < PPC::NUM_TARGET_REGS &&
"Invalid register for this target!");
+ // Ignore all implicit register operands.
+ if (MO.isImplicit())
+ return false;
OutMO = MCOperand::createReg(MO.getReg());
return true;
case MachineOperand::MO_Immediate:
@@ -181,20 +171,20 @@ bool llvm::LowerPPCMachineOperandToMCOperand(const MachineOperand &MO,
return true;
case MachineOperand::MO_GlobalAddress:
case MachineOperand::MO_ExternalSymbol:
- OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP, IsDarwin);
+ OutMO = GetSymbolRef(MO, GetSymbolFromOperand(MO, AP), AP);
return true;
case MachineOperand::MO_JumpTableIndex:
- OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP, IsDarwin);
+ OutMO = GetSymbolRef(MO, AP.GetJTISymbol(MO.getIndex()), AP);
return true;
case MachineOperand::MO_ConstantPoolIndex:
- OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP, IsDarwin);
+ OutMO = GetSymbolRef(MO, AP.GetCPISymbol(MO.getIndex()), AP);
return true;
case MachineOperand::MO_BlockAddress:
- OutMO = GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP,
- IsDarwin);
+ OutMO =
+ GetSymbolRef(MO, AP.GetBlockAddressSymbol(MO.getBlockAddress()), AP);
return true;
case MachineOperand::MO_MCSymbol:
- OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP, IsDarwin);
+ OutMO = GetSymbolRef(MO, MO.getMCSymbol(), AP);
return true;
case MachineOperand::MO_RegisterMask:
return false;
diff --git a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
index 74192cb20cd0..d2aba6bd6e8d 100644
--- a/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMIPeephole.cpp
@@ -57,6 +57,8 @@ STATISTIC(NumRotatesCollapsed,
"Number of pairs of rotate left, clear left/right collapsed");
STATISTIC(NumEXTSWAndSLDICombined,
"Number of pairs of EXTSW and SLDI combined as EXTSWSLI");
+STATISTIC(NumLoadImmZeroFoldedAndRemoved,
+ "Number of LI(8) reg, 0 that are folded to r0 and removed");
static cl::opt<bool>
FixedPointRegToImm("ppc-reg-to-imm-fixed-point", cl::Hidden, cl::init(true),
@@ -124,9 +126,14 @@ public:
// Main entry point for this pass.
bool runOnMachineFunction(MachineFunction &MF) override {
+ initialize(MF);
+ // At this point, TOC pointer should not be used in a function that uses
+ // PC-Relative addressing.
+ assert((MF.getRegInfo().use_empty(PPC::X2) ||
+ !MF.getSubtarget<PPCSubtarget>().isUsingPCRelativeCalls()) &&
+ "TOC pointer used in a function using PC-Relative addressing!");
if (skipFunction(MF.getFunction()))
return false;
- initialize(MF);
return simplifyCode();
}
};
@@ -314,7 +321,22 @@ bool PPCMIPeephole::simplifyCode(void) {
default:
break;
-
+ case PPC::LI:
+ case PPC::LI8: {
+ // If we are materializing a zero, look for any use operands for which
+ // zero means immediate zero. All such operands can be replaced with
+ // PPC::ZERO.
+ if (!MI.getOperand(1).isImm() || MI.getOperand(1).getImm() != 0)
+ break;
+ unsigned MIDestReg = MI.getOperand(0).getReg();
+ for (MachineInstr& UseMI : MRI->use_instructions(MIDestReg))
+ Simplified |= TII->onlyFoldImmediate(UseMI, MI, MIDestReg);
+ if (MRI->use_nodbg_empty(MIDestReg)) {
+ ++NumLoadImmZeroFoldedAndRemoved;
+ ToErase = &MI;
+ }
+ break;
+ }
case PPC::STD: {
MachineFrameInfo &MFI = MF->getFrameInfo();
if (MFI.hasVarSizedObjects() ||
@@ -541,10 +563,12 @@ bool PPCMIPeephole::simplifyCode(void) {
if (!P1 || !P2)
break;
- // Remove the passed FRSP instruction if it only feeds this MI and
- // set any uses of that FRSP (in this MI) to the source of the FRSP.
+ // Remove the passed FRSP/XSRSP instruction if it only feeds this MI
+ // and set any uses of that FRSP/XSRSP (in this MI) to the source of
+ // the FRSP/XSRSP.
auto removeFRSPIfPossible = [&](MachineInstr *RoundInstr) {
- if (RoundInstr->getOpcode() == PPC::FRSP &&
+ unsigned Opc = RoundInstr->getOpcode();
+ if ((Opc == PPC::FRSP || Opc == PPC::XSRSP) &&
MRI->hasOneNonDBGUse(RoundInstr->getOperand(0).getReg())) {
Simplified = true;
Register ConvReg1 = RoundInstr->getOperand(1).getReg();
@@ -554,7 +578,7 @@ bool PPCMIPeephole::simplifyCode(void) {
if (Use.getOperand(i).isReg() &&
Use.getOperand(i).getReg() == FRSPDefines)
Use.getOperand(i).setReg(ConvReg1);
- LLVM_DEBUG(dbgs() << "Removing redundant FRSP:\n");
+ LLVM_DEBUG(dbgs() << "Removing redundant FRSP/XSRSP:\n");
LLVM_DEBUG(RoundInstr->dump());
LLVM_DEBUG(dbgs() << "As it feeds instruction:\n");
LLVM_DEBUG(MI.dump());
@@ -882,11 +906,6 @@ bool PPCMIPeephole::simplifyCode(void) {
// while in PowerPC ISA, lowerest bit is at index 63.
APInt MaskSrc =
APInt::getBitsSetWithWrap(32, 32 - MESrc - 1, 32 - MBSrc);
- // Current APInt::getBitsSetWithWrap sets all bits to 0 if loBit is
- // equal to highBit.
- // If MBSrc - MESrc == 1, we expect a full set mask instead of Null.
- if (SrcMaskFull && (MBSrc - MESrc == 1))
- MaskSrc.setAllBits();
APInt RotatedSrcMask = MaskSrc.rotl(SHMI);
APInt FinalMask = RotatedSrcMask & MaskMI;
@@ -1540,6 +1559,12 @@ bool PPCMIPeephole::emitRLDICWhenLoweringJumpTables(MachineInstr &MI) {
LLVM_DEBUG(dbgs() << "To: ");
LLVM_DEBUG(MI.dump());
NumRotatesCollapsed++;
+ // If SrcReg has no non-debug use it's safe to delete its def SrcMI.
+ if (MRI->use_nodbg_empty(SrcReg)) {
+ assert(!SrcMI->hasImplicitDef() &&
+ "Not expecting an implicit def with this instr.");
+ SrcMI->eraseFromParent();
+ }
return true;
}
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
index 2f65d6a2855b..daf88589bb52 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.cpp
@@ -10,48 +10,55 @@
#include "llvm/ADT/Twine.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/MC/MCContext.h"
+#include "llvm/Support/CommandLine.h"
using namespace llvm;
+static cl::opt<bool> PPCDisableNonVolatileCR(
+ "ppc-disable-non-volatile-cr",
+ cl::desc("Disable the use of non-volatile CR register fields"),
+ cl::init(false), cl::Hidden);
void PPCFunctionInfo::anchor() {}
+PPCFunctionInfo::PPCFunctionInfo(const MachineFunction &MF)
+ : DisableNonVolatileCR(PPCDisableNonVolatileCR) {}
-MCSymbol *PPCFunctionInfo::getPICOffsetSymbol() const {
+MCSymbol *PPCFunctionInfo::getPICOffsetSymbol(MachineFunction &MF) const {
const DataLayout &DL = MF.getDataLayout();
return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
Twine(MF.getFunctionNumber()) +
"$poff");
}
-MCSymbol *PPCFunctionInfo::getGlobalEPSymbol() const {
+MCSymbol *PPCFunctionInfo::getGlobalEPSymbol(MachineFunction &MF) const {
const DataLayout &DL = MF.getDataLayout();
return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
"func_gep" +
Twine(MF.getFunctionNumber()));
}
-MCSymbol *PPCFunctionInfo::getLocalEPSymbol() const {
+MCSymbol *PPCFunctionInfo::getLocalEPSymbol(MachineFunction &MF) const {
const DataLayout &DL = MF.getDataLayout();
return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
"func_lep" +
Twine(MF.getFunctionNumber()));
}
-MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol() const {
+MCSymbol *PPCFunctionInfo::getTOCOffsetSymbol(MachineFunction &MF) const {
const DataLayout &DL = MF.getDataLayout();
return MF.getContext().getOrCreateSymbol(Twine(DL.getPrivateGlobalPrefix()) +
"func_toc" +
Twine(MF.getFunctionNumber()));
}
-bool PPCFunctionInfo::isLiveInSExt(unsigned VReg) const {
- for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+bool PPCFunctionInfo::isLiveInSExt(Register VReg) const {
+ for (const std::pair<Register, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
if (LiveIn.first == VReg)
return LiveIn.second.isSExt();
return false;
}
-bool PPCFunctionInfo::isLiveInZExt(unsigned VReg) const {
- for (const std::pair<unsigned, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
+bool PPCFunctionInfo::isLiveInZExt(Register VReg) const {
+ for (const std::pair<Register, ISD::ArgFlagsTy> &LiveIn : LiveInAttrs)
if (LiveIn.first == VReg)
return LiveIn.second.isZExt();
return false;
diff --git a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
index 2b341b5952c8..29ca53e273d7 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineFunctionInfo.h
@@ -65,6 +65,10 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// SpillsCR - Indicates whether CR is spilled in the current function.
bool SpillsCR = false;
+  /// DisableNonVolatileCR - Indicates whether the use of non-volatile CR
+  /// register fields is disabled.
+ bool DisableNonVolatileCR = false;
+
/// Indicates whether VRSAVE is spilled in the current function.
bool SpillsVRSAVE = false;
@@ -112,24 +116,17 @@ class PPCFunctionInfo : public MachineFunctionInfo {
/// If any of CR[2-4] need to be saved in the prologue and restored in the
/// epilogue then they are added to this array. This is used for the
/// 64-bit SVR4 ABI.
- SmallVector<unsigned, 3> MustSaveCRs;
-
- /// Hold onto our MachineFunction context.
- MachineFunction &MF;
+ SmallVector<Register, 3> MustSaveCRs;
/// Whether this uses the PIC Base register or not.
bool UsesPICBase = false;
- /// True if this function has a subset of CSRs that is handled explicitly via
- /// copies
- bool IsSplitCSR = false;
-
/// We keep track attributes for each live-in virtual registers
/// to use SExt/ZExt flags in later optimization.
- std::vector<std::pair<unsigned, ISD::ArgFlagsTy>> LiveInAttrs;
+ std::vector<std::pair<Register, ISD::ArgFlagsTy>> LiveInAttrs;
public:
- explicit PPCFunctionInfo(MachineFunction &MF) : MF(MF) {}
+ explicit PPCFunctionInfo(const MachineFunction &MF);
int getFramePointerSaveIndex() const { return FramePointerSaveIndex; }
void setFramePointerSaveIndex(int Idx) { FramePointerSaveIndex = Idx; }
@@ -175,6 +172,9 @@ public:
void setSpillsCR() { SpillsCR = true; }
bool isCRSpilled() const { return SpillsCR; }
+ void setDisableNonVolatileCR() { DisableNonVolatileCR = true; }
+ bool isNonVolatileCRDisabled() const { return DisableNonVolatileCR; }
+
void setSpillsVRSAVE() { SpillsVRSAVE = true; }
bool isVRSAVESpilled() const { return SpillsVRSAVE; }
@@ -200,36 +200,33 @@ public:
void setVarArgsNumFPR(unsigned Num) { VarArgsNumFPR = Num; }
/// This function associates attributes for each live-in virtual register.
- void addLiveInAttr(unsigned VReg, ISD::ArgFlagsTy Flags) {
+ void addLiveInAttr(Register VReg, ISD::ArgFlagsTy Flags) {
LiveInAttrs.push_back(std::make_pair(VReg, Flags));
}
/// This function returns true if the specified vreg is
/// a live-in register and sign-extended.
- bool isLiveInSExt(unsigned VReg) const;
+ bool isLiveInSExt(Register VReg) const;
/// This function returns true if the specified vreg is
/// a live-in register and zero-extended.
- bool isLiveInZExt(unsigned VReg) const;
+ bool isLiveInZExt(Register VReg) const;
int getCRSpillFrameIndex() const { return CRSpillFrameIndex; }
void setCRSpillFrameIndex(int idx) { CRSpillFrameIndex = idx; }
- const SmallVectorImpl<unsigned> &
+ const SmallVectorImpl<Register> &
getMustSaveCRs() const { return MustSaveCRs; }
- void addMustSaveCR(unsigned Reg) { MustSaveCRs.push_back(Reg); }
+ void addMustSaveCR(Register Reg) { MustSaveCRs.push_back(Reg); }
void setUsesPICBase(bool uses) { UsesPICBase = uses; }
bool usesPICBase() const { return UsesPICBase; }
- bool isSplitCSR() const { return IsSplitCSR; }
- void setIsSplitCSR(bool s) { IsSplitCSR = s; }
-
- MCSymbol *getPICOffsetSymbol() const;
+ MCSymbol *getPICOffsetSymbol(MachineFunction &MF) const;
- MCSymbol *getGlobalEPSymbol() const;
- MCSymbol *getLocalEPSymbol() const;
- MCSymbol *getTOCOffsetSymbol() const;
+ MCSymbol *getGlobalEPSymbol(MachineFunction &MF) const;
+ MCSymbol *getLocalEPSymbol(MachineFunction &MF) const;
+ MCSymbol *getTOCOffsetSymbol(MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
index a38c8f475066..5649d7d13966 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
+++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.cpp
@@ -15,6 +15,16 @@ static cl::opt<bool>
DisableAddiLoadHeuristic("disable-ppc-sched-addi-load",
cl::desc("Disable scheduling addi instruction before"
"load for ppc"), cl::Hidden);
+static cl::opt<bool>
+ EnableAddiHeuristic("ppc-postra-bias-addi",
+                      cl::desc("Enable scheduling addi instruction as early "
+                               "as possible post ra"),
+ cl::Hidden, cl::init(true));
+
+static bool isADDIInstr(const GenericScheduler::SchedCandidate &Cand) {
+ return Cand.SU->getInstr()->getOpcode() == PPC::ADDI ||
+ Cand.SU->getInstr()->getOpcode() == PPC::ADDI8;
+}
bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
SchedCandidate &TryCand,
@@ -22,19 +32,13 @@ bool PPCPreRASchedStrategy::biasAddiLoadCandidate(SchedCandidate &Cand,
if (DisableAddiLoadHeuristic)
return false;
- auto isADDIInstr = [&] (const MachineInstr &Inst) {
- return Inst.getOpcode() == PPC::ADDI || Inst.getOpcode() == PPC::ADDI8;
- };
-
SchedCandidate &FirstCand = Zone.isTop() ? TryCand : Cand;
SchedCandidate &SecondCand = Zone.isTop() ? Cand : TryCand;
- if (isADDIInstr(*FirstCand.SU->getInstr()) &&
- SecondCand.SU->getInstr()->mayLoad()) {
+ if (isADDIInstr(FirstCand) && SecondCand.SU->getInstr()->mayLoad()) {
TryCand.Reason = Stall;
return true;
}
- if (FirstCand.SU->getInstr()->mayLoad() &&
- isADDIInstr(*SecondCand.SU->getInstr())) {
+ if (FirstCand.SU->getInstr()->mayLoad() && isADDIInstr(SecondCand)) {
TryCand.Reason = NoCand;
return true;
}
@@ -61,6 +65,38 @@ void PPCPreRASchedStrategy::tryCandidate(SchedCandidate &Cand,
return;
}
+bool PPCPostRASchedStrategy::biasAddiCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) const {
+ if (!EnableAddiHeuristic)
+ return false;
+
+ if (isADDIInstr(TryCand) && !isADDIInstr(Cand)) {
+ TryCand.Reason = Stall;
+ return true;
+ }
+ return false;
+}
+
+void PPCPostRASchedStrategy::tryCandidate(SchedCandidate &Cand,
+ SchedCandidate &TryCand) {
+ PostGenericScheduler::tryCandidate(Cand, TryCand);
+
+ if (!Cand.isValid())
+ return;
+
+  // Apply the PowerPC post-RA-specific heuristic only when TryCand isn't
+  // selected yet, or is selected merely by node order.
+ if (TryCand.Reason != NodeOrder && TryCand.Reason != NoCand)
+ return;
+
+  // There are some benefits to scheduling the ADDI as early as possible
+  // post-RA so it is not stalled by vector instructions that take up all the
+  // hardware units. ADDI is also commonly used to post-increment the loop
+  // induction variable, which matters for performance.
+ if (biasAddiCandidate(Cand, TryCand))
+ return;
+}
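Illustration (not part of the patch): folding tryCandidate and biasAddiCandidate together, the post-RA heuristic is — only when the generic scheduler is indifferent (NoCand or NodeOrder) does an ADDI win over a non-ADDI. A toy model of that rule; the types and names are illustrative, not the MachineScheduler API.

    #include <cassert>

    enum Reason { NoCand, Stall, NodeOrder };

    struct Cand {
      bool IsADDI;
      Reason Why;
    };

    // Returns true when TryCand should win because it is an ADDI and the
    // current candidate is not; the generic decision gate is folded in here.
    static bool biasAddi(const Cand &Current, Cand &TryCand) {
      if (TryCand.Why != NodeOrder && TryCand.Why != NoCand)
        return false; // generic scheduler already had a stronger reason
      if (TryCand.IsADDI && !Current.IsADDI) {
        TryCand.Why = Stall;
        return true;
      }
      return false;
    }

    int main() {
      Cand Cur{false, NodeOrder}, Try{true, NodeOrder};
      assert(biasAddi(Cur, Try) && Try.Why == Stall);
      Cand Try2{true, Stall}; // already preferred for another reason
      assert(!biasAddi(Cur, Try2));
      return 0;
    }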
+
void PPCPostRASchedStrategy::enterMBB(MachineBasicBlock *MBB) {
// Custom PPC PostRA specific behavior here.
PostGenericScheduler::enterMBB(MBB);
diff --git a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
index 93532d9545a6..a9734ca71859 100644
--- a/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
+++ b/llvm/lib/Target/PowerPC/PPCMachineScheduler.h
@@ -42,6 +42,9 @@ protected:
SUnit *pickNode(bool &IsTopNode) override;
void enterMBB(MachineBasicBlock *MBB) override;
void leaveMBB() override;
+
+ void tryCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) override;
+ bool biasAddiCandidate(SchedCandidate &Cand, SchedCandidate &TryCand) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
new file mode 100644
index 000000000000..815dfd1402f4
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.cpp
@@ -0,0 +1,203 @@
+//===- PPCMacroFusion.cpp - PowerPC Macro Fusion --------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the PowerPC implementation of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "PPC.h"
+#include "PPCSubtarget.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/CodeGen/MacroFusion.h"
+
+using namespace llvm;
+namespace {
+
+class FusionFeature {
+public:
+ typedef SmallDenseSet<unsigned> FusionOpSet;
+
+ enum FusionKind {
+ #define FUSION_KIND(KIND) FK_##KIND
+ #define FUSION_FEATURE(KIND, HAS_FEATURE, DEP_OP_IDX, OPSET1, OPSET2) \
+ FUSION_KIND(KIND),
+ #include "PPCMacroFusion.def"
+ FUSION_KIND(END)
+ };
+private:
+  // Each fusion feature is assigned one fusion kind. All the instructions
+  // with the same fusion kind have the same fusion characteristics.
+ FusionKind Kd;
+ // True if this feature is enabled.
+ bool Supported;
+ // li rx, si
+ // load rt, ra, rx
+  // The dependent operand index in the second op (load). A negative value
+  // means it could be any operand.
+ int DepOpIdx;
+ // The first fusion op set.
+ FusionOpSet OpSet1;
+ // The second fusion op set.
+ FusionOpSet OpSet2;
+public:
+ FusionFeature(FusionKind Kind, bool HasFeature, int Index,
+ const FusionOpSet &First, const FusionOpSet &Second) :
+ Kd(Kind), Supported(HasFeature), DepOpIdx(Index), OpSet1(First),
+ OpSet2(Second) {}
+
+ bool hasOp1(unsigned Opc) const { return OpSet1.count(Opc) != 0; }
+ bool hasOp2(unsigned Opc) const { return OpSet2.count(Opc) != 0; }
+ bool isSupported() const { return Supported; }
+ Optional<unsigned> depOpIdx() const {
+ if (DepOpIdx < 0)
+ return None;
+ return DepOpIdx;
+ }
+
+ FusionKind getKind() const { return Kd; }
+};
+
+static bool matchingRegOps(const MachineInstr &FirstMI,
+ int FirstMIOpIndex,
+ const MachineInstr &SecondMI,
+ int SecondMIOpIndex) {
+ const MachineOperand &Op1 = FirstMI.getOperand(FirstMIOpIndex);
+ const MachineOperand &Op2 = SecondMI.getOperand(SecondMIOpIndex);
+ if (!Op1.isReg() || !Op2.isReg())
+ return false;
+
+ return Op1.getReg() == Op2.getReg();
+}
+
+// Return true if the FirstMI meets the constraints of SecondMI according to
+// fusion specification.
+static bool checkOpConstraints(FusionFeature::FusionKind Kd,
+ const MachineInstr &FirstMI,
+ const MachineInstr &SecondMI) {
+ switch (Kd) {
+ // The hardware didn't require any specific check for the fused instructions'
+  // The hardware doesn't require any specific check for the fused
+  // instructions' operands. Therefore, return true to indicate that the pair
+  // is fusable.
+ // [addi rt,ra,si - lxvd2x xt,ra,rb] etc.
+ case FusionFeature::FK_AddiLoad: {
+ // lxvd2x(ra) cannot be zero
+ const MachineOperand &RA = SecondMI.getOperand(1);
+ if (!RA.isReg())
+ return true;
+
+ return Register::isVirtualRegister(RA.getReg()) ||
+ (RA.getReg() != PPC::ZERO && RA.getReg() != PPC::ZERO8);
+ }
+ // [addis rt,ra,si - ld rt,ds(ra)] etc.
+ case FusionFeature::FK_AddisLoad: {
+ const MachineOperand &RT = SecondMI.getOperand(0);
+ if (!RT.isReg())
+ return true;
+
+ // Only check it for non-virtual register.
+ if (!Register::isVirtualRegister(RT.getReg()))
+ // addis(rt) = ld(ra) = ld(rt)
+ // ld(rt) cannot be zero
+ if (!matchingRegOps(SecondMI, 0, SecondMI, 2) ||
+ (RT.getReg() == PPC::ZERO || RT.getReg() == PPC::ZERO8))
+ return false;
+
+ // addis(si) first 12 bits must be all 1s or all 0s
+ const MachineOperand &SI = FirstMI.getOperand(2);
+ if (!SI.isImm())
+ return true;
+ int64_t Imm = SI.getImm();
+ if (((Imm & 0xFFF0) != 0) && ((Imm & 0xFFF0) != 0xFFF0))
+ return false;
+
+ // If si = 1111111111110000 and the msb of the d/ds field of the load equals
+ // 1, then fusion does not occur.
+ if ((Imm & 0xFFF0) == 0xFFF0) {
+ const MachineOperand &D = SecondMI.getOperand(1);
+ if (!D.isImm())
+ return true;
+
+ // 14 bit for DS field, while 16 bit for D field.
+ int MSB = 15;
+ if (SecondMI.getOpcode() == PPC::LD)
+ MSB = 13;
+
+ return (D.getImm() & (1ULL << MSB)) == 0;
+ }
+ return true;
+ }
+ }
+
+ llvm_unreachable("All the cases should have been handled");
+ return true;
+}
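Illustration (not part of the patch): for FK_AddisLoad, the immediate checks above amount to — the upper 12 bits of the addis si field must be all zeros or all ones, and in the all-ones case the MSB of the load's displacement (bit 13 of the 14-bit DS field, bit 15 of the 16-bit D field) must be clear. A standalone sketch of just the immediate constraint; the helper is hypothetical and the register checks are omitted.

    #include <cassert>
    #include <cstdint>

    static bool addisLoadImmOK(int64_t AddisImm, int64_t LoadDisp,
                               bool IsDSForm) {
      int64_t Hi = AddisImm & 0xFFF0;
      if (Hi != 0 && Hi != 0xFFF0)
        return false;              // mixed upper bits: no fusion
      if (Hi == 0xFFF0) {
        int MSB = IsDSForm ? 13 : 15; // 14-bit DS field vs. 16-bit D field
        return (LoadDisp & (1LL << MSB)) == 0;
      }
      return true;
    }

    int main() {
      assert(addisLoadImmOK(0x0004, 0x7FFF, false)); // upper bits all zero
      assert(!addisLoadImmOK(0x0100, 0, false));     // mixed upper bits
      assert(addisLoadImmOK(0xFFF4, 0x0FFF, true));  // all ones, DS MSB clear
      assert(!addisLoadImmOK(0xFFF4, 0x2000, true)); // all ones, DS MSB set
      return 0;
    }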
+
+/// Check if the instr pair, FirstMI and SecondMI, should be fused together.
+/// Given SecondMI, when FirstMI is unspecified, then check if SecondMI may be
+/// part of a fused pair at all.
+static bool shouldScheduleAdjacent(const TargetInstrInfo &TII,
+ const TargetSubtargetInfo &TSI,
+ const MachineInstr *FirstMI,
+ const MachineInstr &SecondMI) {
+ // We use the PPC namespace to avoid the need to prefix opcodes with PPC:: in
+ // the def file.
+ using namespace PPC;
+
+ const PPCSubtarget &ST = static_cast<const PPCSubtarget&>(TSI);
+ static const FusionFeature FusionFeatures[] = {
+ #define FUSION_FEATURE(KIND, HAS_FEATURE, DEP_OP_IDX, OPSET1, OPSET2) { \
+ FusionFeature::FUSION_KIND(KIND), ST.HAS_FEATURE(), DEP_OP_IDX, { OPSET1 },\
+ { OPSET2 } },
+ #include "PPCMacroFusion.def"
+ };
+ #undef FUSION_KIND
+
+ for (auto &Feature : FusionFeatures) {
+ // Skip if the feature is not supported.
+ if (!Feature.isSupported())
+ continue;
+
+    // Only when SecondMI is fusable do we start looking for a fusable
+    // FirstMI.
+ if (Feature.hasOp2(SecondMI.getOpcode())) {
+      // If FirstMI == nullptr, we're only checking whether SecondMI can be
+      // fused at all.
+ if (!FirstMI)
+ return true;
+
+ // Checking if the FirstMI is fusable with the SecondMI.
+ if (!Feature.hasOp1(FirstMI->getOpcode()))
+ continue;
+
+ auto DepOpIdx = Feature.depOpIdx();
+ if (DepOpIdx.hasValue()) {
+ // Checking if the result of the FirstMI is the desired operand of the
+ // SecondMI if the DepOpIdx is set. Otherwise, ignore it.
+ if (!matchingRegOps(*FirstMI, 0, SecondMI, *DepOpIdx))
+ return false;
+ }
+
+ // Checking more on the instruction operands.
+ if (checkOpConstraints(Feature.getKind(), *FirstMI, SecondMI))
+ return true;
+ }
+ }
+
+ return false;
+}
+
+} // end anonymous namespace
+
+namespace llvm {
+
+std::unique_ptr<ScheduleDAGMutation> createPowerPCMacroFusionDAGMutation () {
+ return createMacroFusionDAGMutation(shouldScheduleAdjacent);
+}
+
+} // end namespace llvm
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.def b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
new file mode 100644
index 000000000000..c7e4e7c22e0a
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.def
@@ -0,0 +1,45 @@
+//=== ---- PPCMacroFusion.def - PowerPC MacroFusion Candidates --*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains descriptions of the macro-fusion pair for PowerPC.
+//
+//===----------------------------------------------------------------------===//
+
+// NOTE: NO INCLUDE GUARD DESIRED!
+
+#ifndef FUSION_FEATURE
+
+// Each FUSION_FEATURE is assigned one TYPE, and can be enabled/disabled
+// by HAS_FEATURE. The instruction pair is fusable only when the opcode
+// of the first instruction is in OPSET1 and the opcode of the second
+// instruction is in OPSET2. If DEP_OP_IDX >= 0, we also check that the result
+// of the first op is the operand of the second op at operand index DEP_OP_IDX.
+// We assume that the result of the first op is its operand zero.
+#define FUSION_FEATURE(TYPE, HAS_FEATURE, DEP_OP_IDX, OPSET1, OPSET2)
+
+#endif
+
+#ifndef FUSION_OP_SET
+#define FUSION_OP_SET(...) __VA_ARGS__
+#endif
+
+// Power8 User Manual Section 10.1.12, Instruction Fusion
+// {addi} followed by one of these {lxvd2x, lxvw4x, lxvdsx, lvebx, lvehx,
+// lvewx, lvx, lxsdx}
+FUSION_FEATURE(AddiLoad, hasAddiLoadFusion, 2, \
+ FUSION_OP_SET(ADDI, ADDI8, ADDItocL), \
+ FUSION_OP_SET(LXVD2X, LXVW4X, LXVDSX, LVEBX, LVEHX, LVEWX, \
+ LVX, LXSDX))
+
+// {addis} followed by one of these {ld, lbz, lhz, lwz}
+FUSION_FEATURE(AddisLoad, hasAddisLoadFusion, 2, \
+ FUSION_OP_SET(ADDIS, ADDIS8, ADDIStocHA8), \
+ FUSION_OP_SET(LD, LBZ, LBZ8, LHZ, LHZ8, LWZ, LWZ8))
+
+#undef FUSION_FEATURE
+#undef FUSION_OP_SET
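Illustration (not part of the patch): PPCMacroFusion.def is consumed twice via the X-macro idiom — once to build the FusionKind enum and once to build the FusionFeatures table in PPCMacroFusion.cpp. A self-contained sketch of the same idiom, with the feature list folded into a local macro instead of a separate .def file; the names and fields are simplified.

    #include <cstdio>

    // The "table" that a .def file would normally provide.
    #define MY_FUSION_FEATURES(X)                        \
      X(AddiLoad,  /*Supported=*/true,  /*DepOpIdx=*/2)  \
      X(AddisLoad, /*Supported=*/false, /*DepOpIdx=*/2)

    enum FusionKind {
    #define MAKE_ENUM(KIND, SUPPORTED, DEP) FK_##KIND,
      MY_FUSION_FEATURES(MAKE_ENUM)
    #undef MAKE_ENUM
      FK_END
    };

    struct FeatureInfo { FusionKind Kind; bool Supported; int DepOpIdx; };

    // Second expansion of the same list builds the lookup table.
    static const FeatureInfo Features[] = {
    #define MAKE_ROW(KIND, SUPPORTED, DEP) {FK_##KIND, SUPPORTED, DEP},
      MY_FUSION_FEATURES(MAKE_ROW)
    #undef MAKE_ROW
    };

    int main() {
      for (const FeatureInfo &F : Features)
        std::printf("kind=%d supported=%d dep=%d\n", static_cast<int>(F.Kind),
                    F.Supported ? 1 : 0, F.DepOpIdx);
      return 0;
    }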
diff --git a/llvm/lib/Target/PowerPC/PPCMacroFusion.h b/llvm/lib/Target/PowerPC/PPCMacroFusion.h
new file mode 100644
index 000000000000..91cbedf4558f
--- /dev/null
+++ b/llvm/lib/Target/PowerPC/PPCMacroFusion.h
@@ -0,0 +1,22 @@
+//===- PPCMacroFusion.h - PowerPC Macro Fusion ----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the PowerPC definition of the DAG scheduling
+/// mutation to pair instructions back to back.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+
+/// Note that you have to add:
+/// DAG.addMutation(createPowerPCMacroFusionDAGMutation());
+/// to PPCPassConfig::createMachineScheduler() to have an effect.
+std::unique_ptr<ScheduleDAGMutation> createPowerPCMacroFusionDAGMutation();
+} // llvm
diff --git a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
index a4b4bf2973d1..4ea714ff15f7 100644
--- a/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
+++ b/llvm/lib/Target/PowerPC/PPCPreEmitPeephole.cpp
@@ -109,6 +109,16 @@ namespace {
// Track the operand that kill Reg. We would unset the kill flag of
// the operand if there is a following redundant load immediate.
int KillIdx = AfterBBI->findRegisterUseOperandIdx(Reg, true, TRI);
+
+ // We can't just clear implicit kills, so if we encounter one, stop
+ // looking further.
+ if (KillIdx != -1 && AfterBBI->getOperand(KillIdx).isImplicit()) {
+ LLVM_DEBUG(dbgs()
+ << "Encountered an implicit kill, cannot proceed: ");
+ LLVM_DEBUG(AfterBBI->dump());
+ break;
+ }
+
if (KillIdx != -1) {
assert(!DeadOrKillToUnset && "Shouldn't kill same register twice");
DeadOrKillToUnset = &AfterBBI->getOperand(KillIdx);
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
index 01b97ba6ab20..35f5e1fbebcd 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.cpp
@@ -142,6 +142,8 @@ const MCPhysReg*
PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
if (MF->getFunction().getCallingConv() == CallingConv::AnyReg) {
+ if (!TM.isPPC64() && Subtarget.isAIXABI())
+ report_fatal_error("AnyReg unimplemented on 32-bit AIX.");
if (Subtarget.hasVSX())
return CSR_64_AllRegs_VSX_SaveList;
if (Subtarget.hasAltivec())
@@ -149,21 +151,20 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_64_AllRegs_SaveList;
}
- if (Subtarget.isDarwinABI())
- return TM.isPPC64()
- ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_SaveList
- : CSR_Darwin64_SaveList)
- : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_SaveList
- : CSR_Darwin32_SaveList);
-
- if (TM.isPPC64() && MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
- return CSR_SRV464_TLS_PE_SaveList;
-
// On PPC64, we might need to save r2 (but only if it is not reserved).
- bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2);
+ // We do not need to treat R2 as callee-saved when using PC-Relative calls
+ // because any direct uses of R2 will cause it to be reserved. If the function
+ // is a leaf or the only uses of R2 are implicit uses for calls, the calls
+ // will use the @notoc relocation which will cause this function to set the
+ // st_other bit to 1, thereby communicating to its caller that it arbitrarily
+ // clobbers the TOC.
+ bool SaveR2 = MF->getRegInfo().isAllocatable(PPC::X2) &&
+ !Subtarget.isUsingPCRelativeCalls();
// Cold calling convention CSRs.
if (MF->getFunction().getCallingConv() == CallingConv::Cold) {
+ if (Subtarget.isAIXABI())
+ report_fatal_error("Cold calling unimplemented on AIX.");
if (TM.isPPC64()) {
if (Subtarget.hasAltivec())
return SaveR2 ? CSR_SVR64_ColdCC_R2_Altivec_SaveList
@@ -181,12 +182,13 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
// Standard calling convention CSRs.
if (TM.isPPC64()) {
if (Subtarget.hasAltivec())
- return SaveR2 ? CSR_SVR464_R2_Altivec_SaveList
- : CSR_SVR464_Altivec_SaveList;
- return SaveR2 ? CSR_SVR464_R2_SaveList
- : CSR_SVR464_SaveList;
+ return SaveR2 ? CSR_PPC64_R2_Altivec_SaveList
+ : CSR_PPC64_Altivec_SaveList;
+ return SaveR2 ? CSR_PPC64_R2_SaveList : CSR_PPC64_SaveList;
}
// 32-bit targets.
+ if (Subtarget.isAIXABI())
+ return CSR_AIX32_SaveList;
if (Subtarget.hasAltivec())
return CSR_SVR432_Altivec_SaveList;
else if (Subtarget.hasSPE())
@@ -194,31 +196,6 @@ PPCRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
return CSR_SVR432_SaveList;
}
-const MCPhysReg *
-PPCRegisterInfo::getCalleeSavedRegsViaCopy(const MachineFunction *MF) const {
- assert(MF && "Invalid MachineFunction pointer.");
- const PPCSubtarget &Subtarget = MF->getSubtarget<PPCSubtarget>();
- if (Subtarget.isDarwinABI())
- return nullptr;
- if (!TM.isPPC64())
- return nullptr;
- if (MF->getFunction().getCallingConv() != CallingConv::CXX_FAST_TLS)
- return nullptr;
- if (!MF->getInfo<PPCFunctionInfo>()->isSplitCSR())
- return nullptr;
-
- // On PPC64, we might need to save r2 (but only if it is not reserved).
- bool SaveR2 = !getReservedRegs(*MF).test(PPC::X2);
- if (Subtarget.hasAltivec())
- return SaveR2
- ? CSR_SVR464_R2_Altivec_ViaCopy_SaveList
- : CSR_SVR464_Altivec_ViaCopy_SaveList;
- else
- return SaveR2
- ? CSR_SVR464_R2_ViaCopy_SaveList
- : CSR_SVR464_ViaCopy_SaveList;
-}
-
const uint32_t *
PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
@@ -231,14 +208,9 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
return CSR_64_AllRegs_RegMask;
}
- if (Subtarget.isDarwinABI())
- return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_Darwin64_Altivec_RegMask
- : CSR_Darwin64_RegMask)
- : (Subtarget.hasAltivec() ? CSR_Darwin32_Altivec_RegMask
- : CSR_Darwin32_RegMask);
if (Subtarget.isAIXABI()) {
assert(!Subtarget.hasAltivec() && "Altivec is not implemented on AIX yet.");
- return TM.isPPC64() ? CSR_AIX64_RegMask : CSR_AIX32_RegMask;
+ return TM.isPPC64() ? CSR_PPC64_RegMask : CSR_AIX32_RegMask;
}
if (CC == CallingConv::Cold) {
@@ -250,12 +222,12 @@ PPCRegisterInfo::getCallPreservedMask(const MachineFunction &MF,
: CSR_SVR32_ColdCC_RegMask));
}
- return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_SVR464_Altivec_RegMask
- : CSR_SVR464_RegMask)
- : (Subtarget.hasAltivec() ? CSR_SVR432_Altivec_RegMask
- : (Subtarget.hasSPE()
- ? CSR_SVR432_SPE_RegMask
- : CSR_SVR432_RegMask));
+ return TM.isPPC64() ? (Subtarget.hasAltivec() ? CSR_PPC64_Altivec_RegMask
+ : CSR_PPC64_RegMask)
+ : (Subtarget.hasAltivec()
+ ? CSR_SVR432_Altivec_RegMask
+ : (Subtarget.hasSPE() ? CSR_SVR432_SPE_RegMask
+ : CSR_SVR432_RegMask));
}
const uint32_t*
@@ -295,8 +267,7 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
markSuperRegs(Reserved, PPC::LR8);
markSuperRegs(Reserved, PPC::RM);
- if (!Subtarget.isDarwinABI() || !Subtarget.hasAltivec())
- markSuperRegs(Reserved, PPC::VRSAVE);
+ markSuperRegs(Reserved, PPC::VRSAVE);
// The SVR4 ABI reserves r2 and r13
if (Subtarget.isSVR4ABI()) {
@@ -369,7 +340,8 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co
int FrIdx = Info[i].getFrameIdx();
unsigned Reg = Info[i].getReg();
- unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(Reg);
+ const TargetRegisterClass *RC = getMinimalPhysRegClass(Reg);
+ unsigned Opcode = InstrInfo->getStoreOpcodeForSpill(RC);
if (!MFI.isFixedObjectIndex(FrIdx)) {
// This is not a fixed object. If it requires alignment then we may still
// need to use the XForm.
@@ -389,7 +361,7 @@ bool PPCRegisterInfo::requiresFrameIndexScavenging(const MachineFunction &MF) co
return false;
}
-bool PPCRegisterInfo::isCallerPreservedPhysReg(unsigned PhysReg,
+bool PPCRegisterInfo::isCallerPreservedPhysReg(MCRegister PhysReg,
const MachineFunction &MF) const {
assert(Register::isPhysicalRegister(PhysReg));
const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
@@ -508,15 +480,70 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Get the maximum call stack size.
unsigned maxCallFrameSize = MFI.getMaxCallFrameSize();
+ Align MaxAlign = MFI.getMaxAlign();
+ assert(isAligned(MaxAlign, maxCallFrameSize) &&
+ "Maximum call-frame size not sufficiently aligned");
+ (void)MaxAlign;
+
+ const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
+ const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
+ Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
+ bool KillNegSizeReg = MI.getOperand(1).isKill();
+ Register NegSizeReg = MI.getOperand(1).getReg();
+
+ prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, Reg);
+ // Grow the stack and update the stack pointer link, then determine the
+ // address of new allocated space.
+ if (LP64) {
+ BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
+ .addReg(Reg, RegState::Kill)
+ .addReg(PPC::X1)
+ .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
+ .addReg(PPC::X1)
+ .addImm(maxCallFrameSize);
+ } else {
+ BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
+ .addReg(Reg, RegState::Kill)
+ .addReg(PPC::R1)
+ .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
+ .addReg(PPC::R1)
+ .addImm(maxCallFrameSize);
+ }
+
+ // Discard the DYNALLOC instruction.
+ MBB.erase(II);
+}
+
+/// To accomplish dynamic stack allocation, we have to calculate the exact size
+/// to be subtracted from the stack pointer according to alignment information,
+/// and get the previous frame pointer.
+void PPCRegisterInfo::prepareDynamicAlloca(MachineBasicBlock::iterator II,
+ Register &NegSizeReg,
+ bool &KillNegSizeReg,
+ Register &FramePointer) const {
+ // Get the instruction.
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ // Get the frame info.
+ MachineFrameInfo &MFI = MF.getFrameInfo();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ // Determine whether 64-bit pointers are used.
+ bool LP64 = TM.isPPC64();
+ DebugLoc dl = MI.getDebugLoc();
// Get the total frame size.
unsigned FrameSize = MFI.getStackSize();
// Get stack alignments.
const PPCFrameLowering *TFI = getFrameLowering(MF);
- unsigned TargetAlign = TFI->getStackAlignment();
- unsigned MaxAlign = MFI.getMaxAlignment();
- assert((maxCallFrameSize & (MaxAlign-1)) == 0 &&
- "Maximum call-frame size not sufficiently aligned");
+ Align TargetAlign = TFI->getStackAlign();
+ Align MaxAlign = MFI.getMaxAlign();
// Determine the previous frame's address. If FrameSize can't be
// represented as 16 bits or we need special alignment, then we load the
@@ -526,32 +553,26 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Fortunately, a frame greater than 32K is rare.
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
- Register Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
if (MaxAlign < TargetAlign && isInt<16>(FrameSize)) {
if (LP64)
- BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), Reg)
- .addReg(PPC::X31)
- .addImm(FrameSize);
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), FramePointer)
+ .addReg(PPC::X31)
+ .addImm(FrameSize);
else
- BuildMI(MBB, II, dl, TII.get(PPC::ADDI), Reg)
- .addReg(PPC::R31)
- .addImm(FrameSize);
+ BuildMI(MBB, II, dl, TII.get(PPC::ADDI), FramePointer)
+ .addReg(PPC::R31)
+ .addImm(FrameSize);
} else if (LP64) {
- BuildMI(MBB, II, dl, TII.get(PPC::LD), Reg)
- .addImm(0)
- .addReg(PPC::X1);
+ BuildMI(MBB, II, dl, TII.get(PPC::LD), FramePointer)
+ .addImm(0)
+ .addReg(PPC::X1);
} else {
- BuildMI(MBB, II, dl, TII.get(PPC::LWZ), Reg)
- .addImm(0)
- .addReg(PPC::R1);
+ BuildMI(MBB, II, dl, TII.get(PPC::LWZ), FramePointer)
+ .addImm(0)
+ .addReg(PPC::R1);
}
-
- bool KillNegSizeReg = MI.getOperand(1).isKill();
- Register NegSizeReg = MI.getOperand(1).getReg();
-
- // Grow the stack and update the stack pointer link, then determine the
- // address of new allocated space.
+ // Determine the actual NegSizeReg according to alignment info.
if (LP64) {
if (MaxAlign > TargetAlign) {
unsigned UnalNegSizeReg = NegSizeReg;
@@ -560,23 +581,15 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Unfortunately, there is no andi, only andi., and we can't insert that
// here because we might clobber cr0 while it is live.
BuildMI(MBB, II, dl, TII.get(PPC::LI8), NegSizeReg)
- .addImm(~(MaxAlign-1));
+ .addImm(~(MaxAlign.value() - 1));
unsigned NegSizeReg1 = NegSizeReg;
NegSizeReg = MF.getRegInfo().createVirtualRegister(G8RC);
BuildMI(MBB, II, dl, TII.get(PPC::AND8), NegSizeReg)
- .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
- .addReg(NegSizeReg1, RegState::Kill);
+ .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+ .addReg(NegSizeReg1, RegState::Kill);
KillNegSizeReg = true;
}
-
- BuildMI(MBB, II, dl, TII.get(PPC::STDUX), PPC::X1)
- .addReg(Reg, RegState::Kill)
- .addReg(PPC::X1)
- .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
- BuildMI(MBB, II, dl, TII.get(PPC::ADDI8), MI.getOperand(0).getReg())
- .addReg(PPC::X1)
- .addImm(maxCallFrameSize);
} else {
if (MaxAlign > TargetAlign) {
unsigned UnalNegSizeReg = NegSizeReg;
@@ -585,26 +598,47 @@ void PPCRegisterInfo::lowerDynamicAlloc(MachineBasicBlock::iterator II) const {
// Unfortunately, there is no andi, only andi., and we can't insert that
// here because we might clobber cr0 while it is live.
BuildMI(MBB, II, dl, TII.get(PPC::LI), NegSizeReg)
- .addImm(~(MaxAlign-1));
+ .addImm(~(MaxAlign.value() - 1));
unsigned NegSizeReg1 = NegSizeReg;
NegSizeReg = MF.getRegInfo().createVirtualRegister(GPRC);
BuildMI(MBB, II, dl, TII.get(PPC::AND), NegSizeReg)
- .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
- .addReg(NegSizeReg1, RegState::Kill);
+ .addReg(UnalNegSizeReg, getKillRegState(KillNegSizeReg))
+ .addReg(NegSizeReg1, RegState::Kill);
KillNegSizeReg = true;
}
+ }
+}
- BuildMI(MBB, II, dl, TII.get(PPC::STWUX), PPC::R1)
- .addReg(Reg, RegState::Kill)
- .addReg(PPC::R1)
- .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
- BuildMI(MBB, II, dl, TII.get(PPC::ADDI), MI.getOperand(0).getReg())
- .addReg(PPC::R1)
- .addImm(maxCallFrameSize);
+void PPCRegisterInfo::lowerPrepareProbedAlloca(
+ MachineBasicBlock::iterator II) const {
+ MachineInstr &MI = *II;
+ // Get the instruction's basic block.
+ MachineBasicBlock &MBB = *MI.getParent();
+ // Get the basic block's function.
+ MachineFunction &MF = *MBB.getParent();
+ const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
+ // Get the instruction info.
+ const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
+ // Determine whether 64-bit pointers are used.
+ bool LP64 = TM.isPPC64();
+ DebugLoc dl = MI.getDebugLoc();
+ Register FramePointer = MI.getOperand(0).getReg();
+ Register FinalStackPtr = MI.getOperand(1).getReg();
+ bool KillNegSizeReg = MI.getOperand(2).isKill();
+ Register NegSizeReg = MI.getOperand(2).getReg();
+ prepareDynamicAlloca(II, NegSizeReg, KillNegSizeReg, FramePointer);
+ if (LP64) {
+ BuildMI(MBB, II, dl, TII.get(PPC::ADD8), FinalStackPtr)
+ .addReg(PPC::X1)
+ .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
+
+ } else {
+ BuildMI(MBB, II, dl, TII.get(PPC::ADD4), FinalStackPtr)
+ .addReg(PPC::R1)
+ .addReg(NegSizeReg, getKillRegState(KillNegSizeReg));
}
- // Discard the DYNALLOC instruction.
MBB.erase(II);
}
@@ -665,7 +699,7 @@ void PPCRegisterInfo::lowerCRSpilling(MachineBasicBlock::iterator II,
// If the saved register wasn't CR0, shift the bits left so that they are in
// CR0's slot.
if (SrcReg != PPC::CR0) {
- unsigned Reg1 = Reg;
+ Register Reg1 = Reg;
Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
// rlwinm rA, rA, ShiftBits, 0, 31.
@@ -710,7 +744,7 @@ void PPCRegisterInfo::lowerCRRestore(MachineBasicBlock::iterator II,
// If the reloaded register isn't CR0, shift the bits right so that they are
// in the right CR's slot.
if (DestReg != PPC::CR0) {
- unsigned Reg1 = Reg;
+ Register Reg1 = Reg;
Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
unsigned ShiftBits = getEncodingValue(DestReg)*4;
@@ -814,7 +848,7 @@ void PPCRegisterInfo::lowerCRBitSpilling(MachineBasicBlock::iterator II,
// If the saved register wasn't CR0LT, shift the bits left so that the bit
// to store is the first one. Mask all but that bit.
- unsigned Reg1 = Reg;
+ Register Reg1 = Reg;
Reg = MF.getRegInfo().createVirtualRegister(LP64 ? G8RC : GPRC);
// rlwinm rA, rA, ShiftBits, 0, 0.
@@ -940,20 +974,17 @@ void PPCRegisterInfo::lowerVRSAVERestore(MachineBasicBlock::iterator II,
}
bool PPCRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
- unsigned Reg, int &FrameIdx) const {
- const PPCSubtarget &Subtarget = MF.getSubtarget<PPCSubtarget>();
- // For the nonvolatile condition registers (CR2, CR3, CR4) in an SVR4
- // ABI, return true to prevent allocating an additional frame slot.
- // For 64-bit, the CR save area is at SP+8; the value of FrameIdx = 0
- // is arbitrary and will be subsequently ignored. For 32-bit, we have
- // previously created the stack slot if needed, so return its FrameIdx.
- if (Subtarget.isSVR4ABI() && PPC::CR2 <= Reg && Reg <= PPC::CR4) {
- if (TM.isPPC64())
- FrameIdx = 0;
- else {
- const PPCFunctionInfo *FI = MF.getInfo<PPCFunctionInfo>();
- FrameIdx = FI->getCRSpillFrameIndex();
- }
+ Register Reg, int &FrameIdx) const {
+ // For the nonvolatile condition registers (CR2, CR3, CR4) return true to
+ // prevent allocating an additional frame slot.
+  // For 64-bit ELF and AIX, the CR save area is in the linkage area at SP+8;
+  // for 32-bit AIX, the CR save area is in the linkage area at SP+4.
+  // We have created a FrameIndex for that spill slot to keep the
+  // CalleeSavedInfos valid.
+ // For 32-bit ELF, we have previously created the stack slot if needed, so
+ // return its FrameIdx.
+ if (PPC::CR2 <= Reg && Reg <= PPC::CR4) {
+ FrameIdx = MF.getInfo<PPCFunctionInfo>()->getCRSpillFrameIndex();
return true;
}
return false;
@@ -1051,6 +1082,13 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
return;
}
+ if (FPSI && FrameIndex == FPSI &&
+ (OpC == PPC::PREPARE_PROBED_ALLOCA_64 ||
+ OpC == PPC::PREPARE_PROBED_ALLOCA_32)) {
+ lowerPrepareProbedAlloca(II);
+ return;
+ }
+
// Special case for pseudo-ops SPILL_CR and RESTORE_CR, etc.
if (OpC == PPC::SPILL_CR) {
lowerCRSpilling(II, FrameIndex);
@@ -1122,7 +1160,7 @@ PPCRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
const TargetRegisterClass *G8RC = &PPC::G8RCRegClass;
const TargetRegisterClass *GPRC = &PPC::GPRCRegClass;
const TargetRegisterClass *RC = is64Bit ? G8RC : GPRC;
- unsigned SRegHi = MF.getRegInfo().createVirtualRegister(RC),
+ Register SRegHi = MF.getRegInfo().createVirtualRegister(RC),
SReg = MF.getRegInfo().createVirtualRegister(RC);
// Insert a set of rA with the full offset value before the ld, st, or add
@@ -1245,10 +1283,10 @@ needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const {
/// Insert defining instruction(s) for BaseReg to
/// be a pointer to FrameIdx at the beginning of the basic block.
-void PPCRegisterInfo::
-materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
- int64_t Offset) const {
+void PPCRegisterInfo::materializeFrameBaseRegister(MachineBasicBlock *MBB,
+ Register BaseReg,
+ int FrameIdx,
+ int64_t Offset) const {
unsigned ADDriOpc = TM.isPPC64() ? PPC::ADDI8 : PPC::ADDI;
MachineBasicBlock::iterator Ins = MBB->begin();
@@ -1267,7 +1305,7 @@ materializeFrameBaseRegister(MachineBasicBlock *MBB,
.addFrameIndex(FrameIdx).addImm(Offset);
}
-void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const {
unsigned FIOperandNum = 0;
while (!MI.getOperand(FIOperandNum).isFI()) {
@@ -1292,7 +1330,7 @@ void PPCRegisterInfo::resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
}
bool PPCRegisterInfo::isFrameOffsetLegal(const MachineInstr *MI,
- unsigned BaseReg,
+ Register BaseReg,
int64_t Offset) const {
unsigned FIOperandNum = 0;
while (!MI->getOperand(FIOperandNum).isFI()) {
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
index a5fbb0c6ec64..61acd955e1cb 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.h
@@ -84,7 +84,6 @@ public:
/// Code Generation virtual methods...
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
- const MCPhysReg *getCalleeSavedRegsViaCopy(const MachineFunction *MF) const;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override;
const uint32_t *getNoPreservedMask() const override;
@@ -92,7 +91,8 @@ public:
void adjustStackMapLiveOutMask(uint32_t *Mask) const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
- bool isCallerPreservedPhysReg(unsigned PhysReg, const MachineFunction &MF) const override;
+ bool isCallerPreservedPhysReg(MCRegister PhysReg,
+ const MachineFunction &MF) const override;
/// We require the register scavenger.
bool requiresRegisterScavenging(const MachineFunction &MF) const override {
@@ -101,16 +101,16 @@ public:
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
- return true;
- }
-
bool requiresVirtualBaseRegisters(const MachineFunction &MF) const override {
return true;
}
void lowerDynamicAlloc(MachineBasicBlock::iterator II) const;
void lowerDynamicAreaOffset(MachineBasicBlock::iterator II) const;
+ void prepareDynamicAlloca(MachineBasicBlock::iterator II,
+ Register &NegSizeReg, bool &KillNegSizeReg,
+ Register &FramePointer) const;
+ void lowerPrepareProbedAlloca(MachineBasicBlock::iterator II) const;
void lowerCRSpilling(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
void lowerCRRestore(MachineBasicBlock::iterator II,
@@ -124,7 +124,7 @@ public:
void lowerVRSAVERestore(MachineBasicBlock::iterator II,
unsigned FrameIndex) const;
- bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
int &FrameIdx) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj,
unsigned FIOperandNum,
@@ -132,12 +132,12 @@ public:
// Support for virtual base registers.
bool needsFrameBaseReg(MachineInstr *MI, int64_t Offset) const override;
- void materializeFrameBaseRegister(MachineBasicBlock *MBB,
- unsigned BaseReg, int FrameIdx,
+ void materializeFrameBaseRegister(MachineBasicBlock *MBB, Register BaseReg,
+ int FrameIdx,
int64_t Offset) const override;
- void resolveFrameIndex(MachineInstr &MI, unsigned BaseReg,
+ void resolveFrameIndex(MachineInstr &MI, Register BaseReg,
int64_t Offset) const override;
- bool isFrameOffsetLegal(const MachineInstr *MI, unsigned BaseReg,
+ bool isFrameOffsetLegal(const MachineInstr *MI, Register BaseReg,
int64_t Offset) const override;
// Debug information queries.
diff --git a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
index 4719e947b172..b45757c1acc5 100644
--- a/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
+++ b/llvm/lib/Target/PowerPC/PPCRegisterInfo.td
@@ -156,7 +156,7 @@ foreach Index = 32-63 in {
def VSX#Index : VSXReg<Index, "vs"#Index>;
}
-// The reprsentation of r0 when treated as the constant 0.
+// The representation of r0 when treated as the constant 0.
def ZERO : GPR<0, "0">, DwarfRegAlias<R0>;
def ZERO8 : GP8<ZERO, "0">, DwarfRegAlias<X0>;
@@ -363,11 +363,23 @@ def CRBITRC : RegisterClass<"PPC", [i1], 32,
CR1LT, CR1GT, CR1EQ, CR1UN,
CR0LT, CR0GT, CR0EQ, CR0UN)> {
let Size = 32;
+ let AltOrders = [(sub CRBITRC, CR2LT, CR2GT, CR2EQ, CR2UN, CR3LT, CR3GT,
+ CR3EQ, CR3UN, CR4LT, CR4GT, CR4EQ, CR4UN)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<PPCSubtarget>().isELFv2ABI() &&
+ MF.getInfo<PPCFunctionInfo>()->isNonVolatileCRDisabled();
+ }];
}
-def CRRC : RegisterClass<"PPC", [i32], 32, (add CR0, CR1, CR5, CR6,
- CR7, CR2, CR3, CR4)>;
-
+def CRRC : RegisterClass<"PPC", [i32], 32,
+ (add CR0, CR1, CR5, CR6,
+ CR7, CR2, CR3, CR4)> {
+ let AltOrders = [(sub CRRC, CR2, CR3, CR4)];
+ let AltOrderSelect = [{
+ return MF.getSubtarget<PPCSubtarget>().isELFv2ABI() &&
+ MF.getInfo<PPCFunctionInfo>()->isNonVolatileCRDisabled();
+ }];
+}
// The CTR registers are not allocatable because they're used by the
// decrement-and-branch instructions, and thus need to stay live across
// multiple basic blocks.
diff --git a/llvm/lib/Target/PowerPC/PPCScheduleP9.td b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
index 6a79cca89194..0a1ae7e55b3c 100644
--- a/llvm/lib/Target/PowerPC/PPCScheduleP9.td
+++ b/llvm/lib/Target/PowerPC/PPCScheduleP9.td
@@ -20,7 +20,7 @@ def P9Model : SchedMachineModel {
// Load latency is 4 or 5 cycles depending on the load. This latency assumes
// that we have a cache hit. For a cache miss the load latency will be more.
- // There are two instructions (lxvl, lxvll) that have a latencty of 6 cycles.
+ // There are two instructions (lxvl, lxvll) that have a latency of 6 cycles.
// However it is not worth bumping this value up to 6 when the vast majority
// of instructions are 4 or 5 cycles.
let LoadLatency = 5;
@@ -40,9 +40,11 @@ def P9Model : SchedMachineModel {
let CompleteModel = 1;
- // Do not support QPX (Quad Processing eXtension) or SPE (Signal Procesing
- // Engine) on Power 9.
- let UnsupportedFeatures = [HasQPX, HasSPE];
+ // Do not support QPX (Quad Processing eXtension), SPE (Signal Processing
+ // Engine), prefixed instructions on Power 9, PC relative mem ops, or
+ // instructions introduced in ISA 3.1.
+ let UnsupportedFeatures = [HasQPX, HasSPE, PrefixInstrs, PCRelativeMemops,
+ IsISA3_1];
}
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
index 0997f68bd999..3836cc960394 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.cpp
@@ -78,6 +78,9 @@ void PPCSubtarget::initializeEnvironment() {
HasP8Crypto = false;
HasP9Vector = false;
HasP9Altivec = false;
+ HasP10Vector = false;
+ HasPrefixInstrs = false;
+ HasPCRelativeMemops = false;
HasFCPSGN = false;
HasFSQRT = false;
HasFRE = false;
@@ -102,7 +105,6 @@ void PPCSubtarget::initializeEnvironment() {
FeatureMFTB = false;
AllowsUnalignedFPAccess = false;
DeprecatedDST = false;
- HasLazyResolverStubs = false;
HasICBT = false;
HasInvariantFunctionDescriptors = false;
HasPartwordAtomics = false;
@@ -110,19 +112,24 @@ void PPCSubtarget::initializeEnvironment() {
IsQPXStackUnaligned = false;
HasHTM = false;
HasFloat128 = false;
+ HasFusion = false;
+ HasAddiLoadFusion = false;
+ HasAddisLoadFusion = false;
IsISA3_0 = false;
+ IsISA3_1 = false;
UseLongCalls = false;
SecurePlt = false;
VectorsUseTwoUnits = false;
UsePPCPreRASchedStrategy = false;
UsePPCPostRASchedStrategy = false;
+ PredictableSelectIsExpensive = false;
HasPOPCNTD = POPCNTD_Unavailable;
}
void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
// Determine default and user specified characteristics
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty() || CPU == "generic") {
// If cross-compiling with -march=ppc64le without -mcpu
if (TargetTriple.getArch() == Triple::ppc64le)
@@ -144,10 +151,6 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
if (IsPPC64 && has64BitSupport())
Use64BitRegs = true;
- // Set up darwin-specific properties.
- if (isDarwin())
- HasLazyResolverStubs = true;
-
if ((TargetTriple.isOSFreeBSD() && TargetTriple.getOSMajorVersion() >= 13) ||
TargetTriple.isOSNetBSD() || TargetTriple.isOSOpenBSD() ||
TargetTriple.isMusl())
@@ -174,26 +177,10 @@ void PPCSubtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
IsLittleEndian = (TargetTriple.getArch() == Triple::ppc64le);
}
-/// Return true if accesses to the specified global have to go through a dyld
-/// lazy resolution stub. This means that an extra load is required to get the
-/// address of the global.
-bool PPCSubtarget::hasLazyResolverStub(const GlobalValue *GV) const {
- if (!HasLazyResolverStubs)
- return false;
- if (!TM.shouldAssumeDSOLocal(*GV->getParent(), GV))
- return true;
- // 32 bit macho has no relocation for a-b if a is undefined, even if b is in
- // the section that is being relocated. This means we have to use o load even
- // for GVs that are known to be local to the dso.
- if (GV->isDeclarationForLinker() || GV->hasCommonLinkage())
- return true;
- return false;
-}
-
bool PPCSubtarget::enableMachineScheduler() const { return true; }
bool PPCSubtarget::enableMachinePipeliner() const {
- return (CPUDirective == PPC::DIR_PWR9) && EnableMachinePipeliner;
+ return getSchedModel().hasInstrSchedModel() && EnableMachinePipeliner;
}
bool PPCSubtarget::useDFAforSMS() const { return false; }
@@ -243,3 +230,8 @@ bool PPCSubtarget::isGVIndirectSymbol(const GlobalValue *GV) const {
bool PPCSubtarget::isELFv2ABI() const { return TM.isELFv2ABI(); }
bool PPCSubtarget::isPPC64() const { return TM.isPPC64(); }
+
+bool PPCSubtarget::isUsingPCRelativeCalls() const {
+ return isPPC64() && hasPCRelativeMemops() && isELFv2ABI() &&
+ CodeModel::Medium == getTargetMachine().getCodeModel();
+}
diff --git a/llvm/lib/Target/PowerPC/PPCSubtarget.h b/llvm/lib/Target/PowerPC/PPCSubtarget.h
index 044e982740e9..ec329022c457 100644
--- a/llvm/lib/Target/PowerPC/PPCSubtarget.h
+++ b/llvm/lib/Target/PowerPC/PPCSubtarget.h
@@ -34,36 +34,36 @@ class StringRef;
namespace PPC {
// -m directive values.
- enum {
- DIR_NONE,
- DIR_32,
- DIR_440,
- DIR_601,
- DIR_602,
- DIR_603,
- DIR_7400,
- DIR_750,
- DIR_970,
- DIR_A2,
- DIR_E500,
- DIR_E500mc,
- DIR_E5500,
- DIR_PWR3,
- DIR_PWR4,
- DIR_PWR5,
- DIR_PWR5X,
- DIR_PWR6,
- DIR_PWR6X,
- DIR_PWR7,
- DIR_PWR8,
- DIR_PWR9,
- DIR_PWR_FUTURE,
- DIR_64
- };
+enum {
+ DIR_NONE,
+ DIR_32,
+ DIR_440,
+ DIR_601,
+ DIR_602,
+ DIR_603,
+ DIR_7400,
+ DIR_750,
+ DIR_970,
+ DIR_A2,
+ DIR_E500,
+ DIR_E500mc,
+ DIR_E5500,
+ DIR_PWR3,
+ DIR_PWR4,
+ DIR_PWR5,
+ DIR_PWR5X,
+ DIR_PWR6,
+ DIR_PWR6X,
+ DIR_PWR7,
+ DIR_PWR8,
+ DIR_PWR9,
+ DIR_PWR10,
+ DIR_PWR_FUTURE,
+ DIR_64
+};
}
class GlobalValue;
-class TargetMachine;
class PPCSubtarget : public PPCGenSubtargetInfo {
public:
@@ -105,6 +105,9 @@ protected:
bool HasP8Crypto;
bool HasP9Vector;
bool HasP9Altivec;
+ bool HasP10Vector;
+ bool HasPrefixInstrs;
+ bool HasPCRelativeMemops;
bool HasFCPSGN;
bool HasFSQRT;
bool HasFRE, HasFRES, HasFRSQRTE, HasFRSQRTES;
@@ -126,7 +129,6 @@ protected:
bool FeatureMFTB;
bool AllowsUnalignedFPAccess;
bool DeprecatedDST;
- bool HasLazyResolverStubs;
bool IsLittleEndian;
bool HasICBT;
bool HasInvariantFunctionDescriptors;
@@ -134,12 +136,17 @@ protected:
bool HasDirectMove;
bool HasHTM;
bool HasFloat128;
+ bool HasFusion;
+ bool HasAddiLoadFusion;
+ bool HasAddisLoadFusion;
bool IsISA3_0;
+ bool IsISA3_1;
bool UseLongCalls;
bool SecurePlt;
bool VectorsUseTwoUnits;
bool UsePPCPreRASchedStrategy;
bool UsePPCPostRASchedStrategy;
+ bool PredictableSelectIsExpensive;
POPCNTDKind HasPOPCNTD;
@@ -230,11 +237,6 @@ public:
/// the individual condition register bits.
bool useCRBits() const { return UseCRBits; }
- /// hasLazyResolverStub - Return true if accesses to the specified global have
- /// to go through a dyld lazy resolution stub. This means that an extra load
- /// is required to get the address of the global.
- bool hasLazyResolverStub(const GlobalValue *GV) const;
-
// isLittleEndian - True if generating little-endian code
bool isLittleEndian() const { return IsLittleEndian; }
@@ -261,6 +263,9 @@ public:
bool hasP8Crypto() const { return HasP8Crypto; }
bool hasP9Vector() const { return HasP9Vector; }
bool hasP9Altivec() const { return HasP9Altivec; }
+ bool hasP10Vector() const { return HasP10Vector; }
+ bool hasPrefixInstrs() const { return HasPrefixInstrs; }
+ bool hasPCRelativeMemops() const { return HasPCRelativeMemops; }
bool hasMFOCRF() const { return HasMFOCRF; }
bool hasISEL() const { return HasISEL; }
bool hasBPERMD() const { return HasBPERMD; }
@@ -294,16 +299,24 @@ public:
return Align(16);
}
- // DarwinABI has a 224-byte red zone. PPC32 SVR4ABI(Non-DarwinABI) has no
- // red zone and PPC64 SVR4ABI has a 288-byte red zone.
unsigned getRedZoneSize() const {
- return isDarwinABI() ? 224 : (isPPC64() ? 288 : 0);
+ if (isPPC64())
+ // 288 bytes = 18*8 (FPRs) + 18*8 (GPRs, GPR13 reserved)
+ return 288;
+
+ // AIX PPC32: 220 bytes = 18*8 (FPRs) + 19*4 (GPRs);
+ // PPC32 SVR4ABI has no redzone.
+ return isAIXABI() ? 220 : 0;
}
bool hasHTM() const { return HasHTM; }
bool hasFloat128() const { return HasFloat128; }
bool isISA3_0() const { return IsISA3_0; }
+ bool isISA3_1() const { return IsISA3_1; }
bool useLongCalls() const { return UseLongCalls; }
+ bool hasFusion() const { return HasFusion; }
+ bool hasAddiLoadFusion() const { return HasAddiLoadFusion; }
+ bool hasAddisLoadFusion() const { return HasAddisLoadFusion; }
bool needsSwapsForVSXMemOps() const {
return hasVSX() && isLittleEndian() && !hasP9Vector();
}
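// Illustrative check, not part of this patch: the red-zone sizes cited in
// getRedZoneSize() above follow directly from the register counts mentioned
// in its comments.
static_assert(18 * 8 + 18 * 8 == 288, "PPC64: 18 FPRs + 18 GPRs, 8 bytes each");
static_assert(18 * 8 + 19 * 4 == 220, "AIX PPC32: 18 8-byte FPRs + 19 4-byte GPRs");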
@@ -312,8 +325,6 @@ public:
const Triple &getTargetTriple() const { return TargetTriple; }
- /// isDarwin - True if this is any darwin platform.
- bool isDarwin() const { return TargetTriple.isMacOSX(); }
/// isBGQ - True if this is a BG/Q platform.
bool isBGQ() const { return TargetTriple.getVendor() == Triple::BGQ; }
@@ -321,13 +332,13 @@ public:
bool isTargetMachO() const { return TargetTriple.isOSBinFormatMachO(); }
bool isTargetLinux() const { return TargetTriple.isOSLinux(); }
- bool isDarwinABI() const { return isTargetMachO() || isDarwin(); }
bool isAIXABI() const { return TargetTriple.isOSAIX(); }
- bool isSVR4ABI() const { return !isDarwinABI() && !isAIXABI(); }
+ bool isSVR4ABI() const { return !isAIXABI(); }
bool isELFv2ABI() const;
bool is64BitELFABI() const { return isSVR4ABI() && isPPC64(); }
bool is32BitELFABI() const { return isSVR4ABI() && !isPPC64(); }
+ bool isUsingPCRelativeCalls() const;
/// Originally, this function return hasISEL(). Now we always enable it,
/// but may expand the ISEL instruction later.
@@ -389,6 +400,10 @@ public:
}
bool isXRaySupported() const override { return IsPPC64 && IsLittleEndian; }
+
+ bool isPredictableSelectIsExpensive() const {
+ return PredictableSelectIsExpensive;
+ }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
index 17e1196eea59..4b809e0c8553 100644
--- a/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTLSDynamicCall.cpp
@@ -78,9 +78,9 @@ protected:
Register OutReg = MI.getOperand(0).getReg();
Register InReg = MI.getOperand(1).getReg();
DebugLoc DL = MI.getDebugLoc();
- unsigned GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
+ Register GPR3 = Is64Bit ? PPC::X3 : PPC::R3;
unsigned Opc1, Opc2;
- const unsigned OrigRegs[] = {OutReg, InReg, GPR3};
+ const Register OrigRegs[] = {OutReg, InReg, GPR3};
switch (MI.getOpcode()) {
default:
diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
index 2caf4c99a1f8..f15f9c7f4942 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp
@@ -14,6 +14,7 @@
#include "MCTargetDesc/PPCMCTargetDesc.h"
#include "PPC.h"
#include "PPCMachineScheduler.h"
+#include "PPCMacroFusion.h"
#include "PPCSubtarget.h"
#include "PPCTargetObjectFile.h"
#include "PPCTargetTransformInfo.h"
@@ -158,7 +159,7 @@ static std::string getDataLayoutString(const Triple &T) {
static std::string computeFSAdditions(StringRef FS, CodeGenOpt::Level OL,
const Triple &TT) {
- std::string FullFS = FS;
+ std::string FullFS = std::string(FS);
// Make sure 64-bit features are available when CPUname is generic
if (TT.getArch() == Triple::ppc64 || TT.getArch() == Triple::ppc64le) {
@@ -223,6 +224,9 @@ static PPCTargetMachine::PPCABI computeTargetABI(const Triple &TT,
static Reloc::Model getEffectiveRelocModel(const Triple &TT,
Optional<Reloc::Model> RM) {
+ assert((!TT.isOSAIX() || !RM.hasValue() || *RM == Reloc::PIC_) &&
+ "Invalid relocation model for AIX.");
+
if (RM.hasValue())
return *RM;
@@ -230,8 +234,8 @@ static Reloc::Model getEffectiveRelocModel(const Triple &TT,
if (TT.isOSDarwin())
return Reloc::DynamicNoPIC;
- // Big Endian PPC is PIC by default.
- if (TT.getArch() == Triple::ppc64)
+ // Big Endian PPC and AIX default to PIC.
+ if (TT.getArch() == Triple::ppc64 || TT.isOSAIX())
return Reloc::PIC_;
// Rest are static by default.
@@ -272,6 +276,9 @@ static ScheduleDAGInstrs *createPPCMachineScheduler(MachineSchedContext *C) {
std::make_unique<GenericScheduler>(C));
// add DAG Mutations here.
DAG->addMutation(createCopyConstrainDAGMutation(DAG->TII, DAG->TRI));
+ if (ST.hasFusion())
+ DAG->addMutation(createPowerPCMacroFusionDAGMutation());
+
return DAG;
}
@@ -283,6 +290,8 @@ static ScheduleDAGInstrs *createPPCPostMachineScheduler(
std::make_unique<PPCPostRASchedStrategy>(C) :
std::make_unique<PostGenericScheduler>(C), true);
// add DAG Mutations here.
+ if (ST.hasFusion())
+ DAG->addMutation(createPowerPCMacroFusionDAGMutation());
return DAG;
}
@@ -495,7 +504,7 @@ void PPCPassConfig::addPreRegAlloc() {
// PPCTLSDynamicCallPass uses LiveIntervals which previously dependent on
// LiveVariables. This (unnecessary) dependency has been removed now,
// however a stage-2 clang build fails without LiveVariables computed here.
- addPass(&LiveVariablesID, false);
+ addPass(&LiveVariablesID);
addPass(createPPCTLSDynamicCallPass());
}
if (EnableExtraTOCRegDeps)
@@ -522,9 +531,9 @@ void PPCPassConfig::addPreEmitPass() {
addPass(createPPCExpandISELPass());
if (getOptLevel() != CodeGenOpt::None)
- addPass(createPPCEarlyReturnPass(), false);
+ addPass(createPPCEarlyReturnPass());
// Must run branch selection immediately preceding the asm printer.
- addPass(createPPCBranchSelectionPass(), false);
+ addPass(createPPCBranchSelectionPass());
}
TargetTransformInfo
diff --git a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
index e237fab1b267..d52c9f92bd1d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetObjectFile.cpp
@@ -7,6 +7,7 @@
//===----------------------------------------------------------------------===//
#include "PPCTargetObjectFile.h"
+#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
@@ -18,7 +19,6 @@ void
PPC64LinuxTargetObjectFile::
Initialize(MCContext &Ctx, const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
}
MCSection *PPC64LinuxTargetObjectFile::SelectSectionForGlobal(
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
index e05699cc95ec..53556ffc267d 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.cpp
@@ -33,6 +33,10 @@ EnablePPCColdCC("ppc-enable-coldcc", cl::Hidden, cl::init(false),
cl::desc("Enable using coldcc calling conv for cold "
"internal functions"));
+static cl::opt<bool>
+LsrNoInsnsCost("ppc-lsr-no-insns-cost", cl::Hidden, cl::init(false),
+ cl::desc("Do not add instruction count to lsr cost model"));
+
// The latency of mtctr is only justified if there are more than 4
// comparisons that will be removed as a result.
static cl::opt<unsigned>
@@ -55,9 +59,10 @@ PPCTTIImpl::getPopcntSupport(unsigned TyWidth) {
return TTI::PSK_Software;
}
-int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCost(Imm, Ty);
+ return BaseT::getIntImmCost(Imm, Ty, CostKind);
assert(Ty->isIntegerTy());
@@ -85,9 +90,10 @@ int PPCTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty);
+ return BaseT::getIntImmCostIntrin(IID, Idx, Imm, Ty, CostKind);
assert(Ty->isIntegerTy());
@@ -115,13 +121,14 @@ int PPCTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return PPCTTIImpl::getIntImmCost(Imm, Ty);
+ return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
if (DisablePPCConstHoist)
- return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty);
+ return BaseT::getIntImmCostInst(Opcode, Idx, Imm, Ty, CostKind);
assert(Ty->isIntegerTy());
@@ -199,22 +206,28 @@ int PPCTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
return TTI::TCC_Free;
}
- return PPCTTIImpl::getIntImmCost(Imm, Ty);
+ return PPCTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned PPCTTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
+unsigned
+PPCTTIImpl::getUserCost(const User *U, ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind) {
+ // We already implement getCastInstrCost and getMemoryOpCost where we perform
+ // the vector adjustment there.
+ if (isa<CastInst>(U) || isa<LoadInst>(U) || isa<StoreInst>(U))
+ return BaseT::getUserCost(U, Operands, CostKind);
+
if (U->getType()->isVectorTy()) {
// Instructions that need to be split should cost more.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, U->getType());
- return LT.first * BaseT::getUserCost(U, Operands);
+ return LT.first * BaseT::getUserCost(U, Operands, CostKind);
}
- return BaseT::getUserCost(U, Operands);
+ return BaseT::getUserCost(U, Operands, CostKind);
}
-bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
- TargetLibraryInfo *LibInfo) {
+bool PPCTTIImpl::mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
+ SmallPtrSetImpl<const Value *> &Visited) {
const PPCTargetMachine &TM = ST->getTargetMachine();
// Loop through the inline asm constraints and look for something that
@@ -233,8 +246,11 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
// Determining the address of a TLS variable results in a function call in
// certain TLS models.
- std::function<bool(const Value*)> memAddrUsesCTR =
- [&memAddrUsesCTR, &TM](const Value *MemAddr) -> bool {
+ std::function<bool(const Value *)> memAddrUsesCTR =
+ [&memAddrUsesCTR, &TM, &Visited](const Value *MemAddr) -> bool {
+ // No need to traverse again if we already checked this operand.
+ if (!Visited.insert(MemAddr).second)
+ return false;
const auto *GV = dyn_cast<GlobalValue>(MemAddr);
if (!GV) {
// Recurse to check for constants that refer to TLS global variables.
@@ -264,7 +280,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
J != JE; ++J) {
if (CallInst *CI = dyn_cast<CallInst>(J)) {
// Inline ASM is okay, unless it clobbers the ctr register.
- if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledValue())) {
+ if (InlineAsm *IA = dyn_cast<InlineAsm>(CI->getCalledOperand())) {
if (asmClobbersCTR(IA))
return true;
continue;
@@ -277,7 +293,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
if (F->getIntrinsicID() != Intrinsic::not_intrinsic) {
switch (F->getIntrinsicID()) {
default: continue;
- // If we have a call to ppc_is_decremented_ctr_nonzero, or ppc_mtctr
+ // If we have a call to loop_decrement or set_loop_iterations,
// we're definitely using CTR.
case Intrinsic::set_loop_iterations:
case Intrinsic::loop_decrement:
@@ -308,6 +324,7 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
return true;
else
continue; // ISD::FCOPYSIGN is never a library call.
+ case Intrinsic::fma: Opcode = ISD::FMA; break;
case Intrinsic::sqrt: Opcode = ISD::FSQRT; break;
case Intrinsic::floor: Opcode = ISD::FFLOOR; break;
case Intrinsic::ceil: Opcode = ISD::FCEIL; break;
@@ -412,8 +429,9 @@ bool PPCTTIImpl::mightUseCTR(BasicBlock *BB,
return true;
} else if (isa<BinaryOperator>(J) &&
- J->getType()->getScalarType()->isPPC_FP128Ty()) {
- // Most operations on ppc_f128 values become calls.
+ (J->getType()->getScalarType()->isFP128Ty() ||
+ J->getType()->getScalarType()->isPPC_FP128Ty())) {
+ // Most operations on f128 or ppc_f128 values become calls.
return true;
} else if (isa<UIToFPInst>(J) || isa<SIToFPInst>(J) ||
isa<FPToUIInst>(J) || isa<FPToSIInst>(J)) {
@@ -498,9 +516,10 @@ bool PPCTTIImpl::isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
// We don't want to spill/restore the counter register, and so we don't
// want to use the counter register if the loop contains calls.
+ SmallPtrSet<const Value *, 4> Visited;
for (Loop::block_iterator I = L->block_begin(), IE = L->block_end();
I != IE; ++I)
- if (mightUseCTR(*I, LibInfo))
+ if (mightUseCTR(*I, LibInfo, Visited))
return false;
SmallVector<BasicBlock*, 4> ExitingBlocks;
@@ -549,6 +568,10 @@ void PPCTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
BaseT::getUnrollingPreferences(L, SE, UP);
}
+void PPCTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
// This function returns true to allow using coldcc calling convention.
// Returning true results in coldcc being used for functions which are cold at
// all call sites when the callers of the functions are not calling any other
@@ -637,11 +660,12 @@ unsigned PPCTTIImpl::getCacheLineSize() const {
if (CacheLineSize.getNumOccurrences() > 0)
return CacheLineSize;
- // On P7, P8 or P9 we have a cache line size of 128.
+ // Starting with P7 we have a cache line size of 128.
unsigned Directive = ST->getCPUDirective();
// Assume that Future CPU has the same cache line size as the others.
if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
- Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+ Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+ Directive == PPC::DIR_PWR_FUTURE)
return 128;
// On other processors return a default of 64 bytes.
@@ -673,9 +697,11 @@ unsigned PPCTTIImpl::getMaxInterleaveFactor(unsigned VF) {
// For P7 and P8, floating-point instructions have a 6-cycle latency and
// there are two execution units, so unroll by 12x for latency hiding.
// FIXME: the same for P9 as previous gen until POWER9 scheduling is ready
+ // FIXME: the same for P10 as previous gen until POWER10 scheduling is ready
// Assume that future is the same as the others.
if (Directive == PPC::DIR_PWR7 || Directive == PPC::DIR_PWR8 ||
- Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR_FUTURE)
+ Directive == PPC::DIR_PWR9 || Directive == PPC::DIR_PWR10 ||
+ Directive == PPC::DIR_PWR_FUTURE)
return 12;
// For most things, modern systems have two execution units (and
@@ -711,6 +737,7 @@ int PPCTTIImpl::vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1,
}
int PPCTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
@@ -718,9 +745,15 @@ int PPCTTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
// Fallback to the default implementation.
- int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ int Cost = BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info,
Opd1PropInfo, Opd2PropInfo);
return vectorCostAdjustment(Cost, Opcode, Ty, nullptr);
}
@@ -739,17 +772,33 @@ int PPCTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
nullptr);
}
+int PPCTTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ // Branches are assumed to be predicted.
+ return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
+}
+
int PPCTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
assert(TLI->InstructionOpcodeToISD(Opcode) && "Invalid opcode");
- int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src);
- return vectorCostAdjustment(Cost, Opcode, Dst, Src);
+ int Cost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ Cost = vectorCostAdjustment(Cost, Opcode, Dst, Src);
+ // TODO: Allow non-throughput costs that aren't binary.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
}
int PPCTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
- int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ int Cost = BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost;
return vectorCostAdjustment(Cost, Opcode, ValTy, nullptr);
}
@@ -828,13 +877,22 @@ int PPCTTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
"Invalid Opcode");
- int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace);
+ int Cost = BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost;
+
Cost = vectorCostAdjustment(Cost, Opcode, Src, nullptr);
bool IsAltivecType = ST->hasAltivec() &&
@@ -856,7 +914,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Aligned loads and stores are easy.
unsigned SrcBytes = LT.second.getStoreSize();
- if (!SrcBytes || !Alignment || Alignment >= SrcBytes)
+ if (!SrcBytes || !Alignment || *Alignment >= SrcBytes)
return Cost;
// If we can use the permutation-based load sequence, then this is also
@@ -868,7 +926,7 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// longer true.
if (Opcode == Instruction::Load &&
((!ST->hasP8Vector() && IsAltivecType) || IsQPXType) &&
- Alignment >= LT.second.getScalarType().getStoreSize())
+ *Alignment >= LT.second.getScalarType().getStoreSize())
return Cost + LT.first; // Add the cost of the permutations.
// For VSX, we can do unaligned loads and stores on Altivec/VSX types. On the
@@ -893,22 +951,20 @@ int PPCTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// stores, loads are expanded using the vector-load + permutation sequence,
// which is much less expensive).
if (Src->isVectorTy() && Opcode == Instruction::Store)
- for (int i = 0, e = Src->getVectorNumElements(); i < e; ++i)
+ for (int i = 0, e = cast<FixedVectorType>(Src)->getNumElements(); i < e;
+ ++i)
Cost += getVectorInstrCost(Instruction::ExtractElement, Src, i);
return Cost;
}
-int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int PPCTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
@@ -919,7 +975,8 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// Firstly, the cost of load/store operation.
int Cost =
- getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace);
+ getMemoryOpCost(Opcode, VecTy, MaybeAlign(Alignment), AddressSpace,
+ CostKind);
// PPC, for both Altivec/VSX and QPX, support cheap arbitrary permutations
// (at least in the sense that there need only be one non-loop-invariant
@@ -931,18 +988,9 @@ int PPCTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
return Cost;
}
-unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF) {
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
-}
-
-unsigned PPCTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type*> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- if (ID == Intrinsic::bswap && ST->hasP9Vector())
- return TLI->getTypeLegalizationCost(DL, RetTy).first;
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys, FMF,
- ScalarizationCostPassed);
+unsigned PPCTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
@@ -967,3 +1015,16 @@ bool PPCTTIImpl::canSaveCmp(Loop *L, BranchInst **BI, ScalarEvolution *SE,
*BI = HWLoopInfo.ExitBranch;
return true;
}
+
+bool PPCTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2) {
+  // The PowerPC default behaviour here is "instruction count first priority".
+  // If LsrNoInsnsCost is set, fall back to the default implementation instead.
+ if (!LsrNoInsnsCost)
+ return std::tie(C1.Insns, C1.NumRegs, C1.AddRecCost, C1.NumIVMuls,
+ C1.NumBaseAdds, C1.ScaleCost, C1.ImmCost, C1.SetupCost) <
+ std::tie(C2.Insns, C2.NumRegs, C2.AddRecCost, C2.NumIVMuls,
+ C2.NumBaseAdds, C2.ScaleCost, C2.ImmCost, C2.SetupCost);
+ else
+ return TargetTransformInfoImplBase::isLSRCostLess(C1, C2);
+}
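// Illustrative sketch, not part of this patch: std::tie yields a lexicographic
// comparison, so placing Insns first makes instruction count the primary key.
// ToyCost and toyIsLess are stand-ins for TargetTransformInfo::LSRCost and the
// comparison above, used only to show the ordering.
#include <tuple>
struct ToyCost { unsigned Insns; unsigned NumRegs; };
static bool toyIsLess(const ToyCost &A, const ToyCost &B) {
  return std::tie(A.Insns, A.NumRegs) < std::tie(B.Insns, B.NumRegs);
}
// toyIsLess({4, 6}, {5, 2}) is true: 4 < 5 decides the comparison before
// NumRegs is even consulted, mirroring "instruction count first priority".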
diff --git a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
index 35388d14f606..d998521084e1 100644
--- a/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
+++ b/llvm/lib/Target/PowerPC/PPCTargetTransformInfo.h
@@ -33,7 +33,8 @@ class PPCTTIImpl : public BasicTTIImplBase<PPCTTIImpl> {
const PPCSubtarget *getST() const { return ST; }
const PPCTargetLowering *getTLI() const { return TLI; }
- bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo);
+ bool mightUseCTR(BasicBlock *BB, TargetLibraryInfo *LibInfo,
+ SmallPtrSetImpl<const Value *> &Visited);
public:
explicit PPCTTIImpl(const PPCTargetMachine *TM, const Function &F)
@@ -44,14 +45,16 @@ public:
/// @{
using BaseT::getIntImmCost;
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
- unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+ unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands,
+ TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
bool isHardwareLoopProfitable(Loop *L, ScalarEvolution &SE,
@@ -63,6 +66,10 @@ public:
TargetLibraryInfo *LibInfo);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+ bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
+ TargetTransformInfo::LSRCost &C2);
/// @}
@@ -87,6 +94,7 @@ public:
int vectorCostAdjustment(int Cost, unsigned Opcode, Type *Ty1, Type *Ty2);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
@@ -95,24 +103,24 @@ public:
const Instruction *CxtI = nullptr);
int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
+ int getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
- unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value*> Args, FastMathFlags FMF, unsigned VF);
- unsigned getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type*> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ unsigned getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
/// @}
};
diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
index 53562f42a184..407f980bd35e 100644
--- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
+++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp
@@ -10,10 +10,13 @@
#include "MCTargetDesc/RISCVMCExpr.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
#include "MCTargetDesc/RISCVTargetStreamer.h"
+#include "RISCVInstrInfo.h"
#include "TargetInfo/RISCVTargetInfo.h"
#include "Utils/RISCVBaseInfo.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/StringSwitch.h"
@@ -32,6 +35,7 @@
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/MathExtras.h"
+#include "llvm/Support/RISCVAttributes.h"
#include "llvm/Support/TargetRegistry.h"
#include <limits>
@@ -50,9 +54,16 @@ STATISTIC(RISCVNumInstrsCompressed,
namespace {
struct RISCVOperand;
+struct ParserOptionsSet {
+ bool IsPicEnabled;
+};
+
class RISCVAsmParser : public MCTargetAsmParser {
SmallVector<FeatureBitset, 4> FeatureBitStack;
+ SmallVector<ParserOptionsSet, 4> ParserOptionsStack;
+ ParserOptionsSet ParserOptions;
+
SMLoc getLoc() const { return getParser().getTok().getLoc(); }
bool isRV64() const { return getSTI().hasFeature(RISCV::Feature64Bit); }
bool isRV32E() const { return getSTI().hasFeature(RISCV::FeatureRV32E); }
@@ -74,6 +85,8 @@ class RISCVAsmParser : public MCTargetAsmParser {
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
@@ -118,6 +131,9 @@ class RISCVAsmParser : public MCTargetAsmParser {
// 'add' is an overloaded mnemonic.
bool checkPseudoAddTPRel(MCInst &Inst, OperandVector &Operands);
+ // Check instruction constraints.
+ bool validateInstruction(MCInst &Inst, OperandVector &Operands);
+
/// Helper for processing MC instructions that have been successfully matched
/// by MatchAndEmitInstruction. Modifications to the emitted instructions,
/// like the expansion of pseudo instructions (e.g., "li"), can be performed
@@ -138,11 +154,15 @@ class RISCVAsmParser : public MCTargetAsmParser {
OperandMatchResultTy parseOperandWithModifier(OperandVector &Operands);
OperandMatchResultTy parseBareSymbol(OperandVector &Operands);
OperandMatchResultTy parseCallSymbol(OperandVector &Operands);
+ OperandMatchResultTy parsePseudoJumpSymbol(OperandVector &Operands);
OperandMatchResultTy parseJALOffset(OperandVector &Operands);
+ OperandMatchResultTy parseVTypeI(OperandVector &Operands);
+ OperandMatchResultTy parseMaskReg(OperandVector &Operands);
bool parseOperand(OperandVector &Operands, StringRef Mnemonic);
bool parseDirectiveOption();
+ bool parseDirectiveAttribute();
void setFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (!(getSTI().getFeatureBits()[Feature])) {
@@ -152,6 +172,10 @@ class RISCVAsmParser : public MCTargetAsmParser {
}
}
+ bool getFeatureBits(uint64_t Feature) {
+ return getSTI().getFeatureBits()[Feature];
+ }
+
void clearFeatureBits(uint64_t Feature, StringRef FeatureString) {
if (getSTI().getFeatureBits()[Feature]) {
MCSubtargetInfo &STI = copySTI();
@@ -161,10 +185,15 @@ class RISCVAsmParser : public MCTargetAsmParser {
}
void pushFeatureBits() {
+ assert(FeatureBitStack.size() == ParserOptionsStack.size() &&
+ "These two stacks must be kept synchronized");
FeatureBitStack.push_back(getSTI().getFeatureBits());
+ ParserOptionsStack.push_back(ParserOptions);
}
bool popFeatureBits() {
+ assert(FeatureBitStack.size() == ParserOptionsStack.size() &&
+ "These two stacks must be kept synchronized");
if (FeatureBitStack.empty())
return true;
@@ -172,8 +201,13 @@ class RISCVAsmParser : public MCTargetAsmParser {
copySTI().setFeatureBits(FeatureBits);
setAvailableFeatures(ComputeAvailableFeatures(FeatureBits));
+ ParserOptions = ParserOptionsStack.pop_back_val();
+
return false;
}
+
+ std::unique_ptr<RISCVOperand> defaultMaskRegOp() const;
+
public:
enum RISCVMatchResultTy {
Match_Dummy = FIRST_TARGET_MATCH_RESULT_TY,
@@ -195,17 +229,21 @@ public:
Parser.addAliasForDirective(".dword", ".8byte");
setAvailableFeatures(ComputeAvailableFeatures(STI.getFeatureBits()));
- if (Options.ABIName.back() == 'f' &&
+ auto ABIName = StringRef(Options.ABIName);
+ if (ABIName.endswith("f") &&
!getSTI().getFeatureBits()[RISCV::FeatureStdExtF]) {
errs() << "Hard-float 'f' ABI can't be used for a target that "
"doesn't support the F instruction set extension (ignoring "
"target-abi)\n";
- } else if (Options.ABIName.back() == 'd' &&
+ } else if (ABIName.endswith("d") &&
!getSTI().getFeatureBits()[RISCV::FeatureStdExtD]) {
errs() << "Hard-float 'd' ABI can't be used for a target that "
"doesn't support the D instruction set extension (ignoring "
"target-abi)\n";
}
+
+ const MCObjectFileInfo *MOFI = Parser.getContext().getObjectFileInfo();
+ ParserOptions.IsPicEnabled = MOFI->isPositionIndependent();
}
};
@@ -217,7 +255,8 @@ struct RISCVOperand : public MCParsedAsmOperand {
Token,
Register,
Immediate,
- SystemRegister
+ SystemRegister,
+ VType,
} Kind;
bool IsRV64;
@@ -238,12 +277,32 @@ struct RISCVOperand : public MCParsedAsmOperand {
// e.g.: read/write or user/supervisor/machine privileges.
};
+ enum class VSEW {
+ SEW_8 = 0,
+ SEW_16,
+ SEW_32,
+ SEW_64,
+ SEW_128,
+ SEW_256,
+ SEW_512,
+ SEW_1024,
+ };
+
+ enum class VLMUL { LMUL_1 = 0, LMUL_2, LMUL_4, LMUL_8 };
+
+ struct VTypeOp {
+ VSEW Sew;
+ VLMUL Lmul;
+ unsigned Encoding;
+ };
+
SMLoc StartLoc, EndLoc;
union {
StringRef Tok;
RegOp Reg;
ImmOp Imm;
struct SysRegOp SysReg;
+ struct VTypeOp VType;
};
RISCVOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
@@ -267,14 +326,21 @@ public:
case KindTy::SystemRegister:
SysReg = o.SysReg;
break;
+ case KindTy::VType:
+ VType = o.VType;
+ break;
}
}
bool isToken() const override { return Kind == KindTy::Token; }
bool isReg() const override { return Kind == KindTy::Register; }
+ bool isV0Reg() const {
+ return Kind == KindTy::Register && Reg.RegNum == RISCV::V0;
+ }
bool isImm() const override { return Kind == KindTy::Immediate; }
bool isMem() const override { return false; }
bool isSystemRegister() const { return Kind == KindTy::SystemRegister; }
+ bool isVType() const { return Kind == KindTy::VType; }
bool isGPR() const {
return Kind == KindTy::Register &&
@@ -336,6 +402,16 @@ public:
VK == RISCVMCExpr::VK_RISCV_CALL_PLT);
}
+ bool isPseudoJumpSymbol() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ // Must be of 'immediate' type but not a constant.
+ if (!isImm() || evaluateConstantImm(getImm(), Imm, VK))
+ return false;
+ return RISCVAsmParser::classifySymbolRef(getImm(), VK, Imm) &&
+ VK == RISCVMCExpr::VK_RISCV_CALL;
+ }
+
bool isTPRelAddSymbol() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
@@ -348,6 +424,8 @@ public:
bool isCSRSystemRegister() const { return isSystemRegister(); }
+ bool isVTypeI() const { return isVType(); }
+
  /// Return true if the operand is valid for the fence instruction, e.g.
  /// ('iorw').
bool isFenceArg() const {
@@ -425,6 +503,17 @@ public:
return (isRV64() && isUInt<6>(Imm)) || isUInt<5>(Imm);
}
+ bool isUImmLog2XLenHalf() const {
+ int64_t Imm;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ if (!isImm())
+ return false;
+ if (!evaluateConstantImm(getImm(), Imm, VK) ||
+ VK != RISCVMCExpr::VK_RISCV_None)
+ return false;
+ return (isRV64() && isUInt<5>(Imm)) || isUInt<4>(Imm);
+ }
+
bool isUImm5() const {
int64_t Imm;
RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
@@ -444,6 +533,15 @@ public:
VK == RISCVMCExpr::VK_RISCV_None;
}
+ bool isSImm5() const {
+ if (!isImm())
+ return false;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ int64_t Imm;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && isInt<5>(Imm) && VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
bool isSImm6() const {
if (!isImm())
return false;
@@ -451,7 +549,7 @@ public:
int64_t Imm;
bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
return IsConstantImm && isInt<6>(Imm) &&
- VK == RISCVMCExpr::VK_RISCV_None;
+ VK == RISCVMCExpr::VK_RISCV_None;
}
bool isSImm6NonZero() const {
@@ -609,6 +707,16 @@ public:
return IsConstantImm && (Imm == 0) && VK == RISCVMCExpr::VK_RISCV_None;
}
+ bool isSImm5Plus1() const {
+ if (!isImm())
+ return false;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ int64_t Imm;
+ bool IsConstantImm = evaluateConstantImm(getImm(), Imm, VK);
+ return IsConstantImm && isInt<5>(Imm - 1) &&
+ VK == RISCVMCExpr::VK_RISCV_None;
+ }
+
/// getStartLoc - Gets location of the first token of this operand
SMLoc getStartLoc() const override { return StartLoc; }
/// getEndLoc - Gets location of the last token of this operand
@@ -636,6 +744,51 @@ public:
return Tok;
}
+ static StringRef getSEWStr(VSEW Sew) {
+ switch (Sew) {
+ case VSEW::SEW_8:
+ return "e8";
+ case VSEW::SEW_16:
+ return "e16";
+ case VSEW::SEW_32:
+ return "e32";
+ case VSEW::SEW_64:
+ return "e64";
+ case VSEW::SEW_128:
+ return "e128";
+ case VSEW::SEW_256:
+ return "e256";
+ case VSEW::SEW_512:
+ return "e512";
+ case VSEW::SEW_1024:
+ return "e1024";
+ }
+ return "";
+ }
+
+ static StringRef getLMULStr(VLMUL Lmul) {
+ switch (Lmul) {
+ case VLMUL::LMUL_1:
+ return "m1";
+ case VLMUL::LMUL_2:
+ return "m2";
+ case VLMUL::LMUL_4:
+ return "m4";
+ case VLMUL::LMUL_8:
+ return "m8";
+ }
+ return "";
+ }
+
+ StringRef getVType(SmallString<32> &Buf) const {
+ assert(Kind == KindTy::VType && "Invalid access!");
+ Buf.append(getSEWStr(VType.Sew));
+ Buf.append(",");
+ Buf.append(getLMULStr(VType.Lmul));
+
+ return Buf.str();
+ }
+
void print(raw_ostream &OS) const override {
switch (Kind) {
case KindTy::Immediate:
@@ -651,6 +804,10 @@ public:
case KindTy::SystemRegister:
OS << "<sysreg: " << getSysReg() << '>';
break;
+ case KindTy::VType:
+ SmallString<32> VTypeBuf;
+ OS << "<vtype: " << getVType(VTypeBuf) << '>';
+ break;
}
}
@@ -695,6 +852,20 @@ public:
return Op;
}
+ static std::unique_ptr<RISCVOperand> createVType(APInt Sew, APInt Lmul,
+ SMLoc S, bool IsRV64) {
+ auto Op = std::make_unique<RISCVOperand>(KindTy::VType);
+ Sew.ashrInPlace(3);
+ unsigned SewLog2 = Sew.logBase2();
+ unsigned LmulLog2 = Lmul.logBase2();
+ Op->VType.Sew = static_cast<VSEW>(SewLog2);
+ Op->VType.Lmul = static_cast<VLMUL>(LmulLog2);
+ Op->VType.Encoding = (SewLog2 << 2) | LmulLog2;
+ Op->StartLoc = S;
+ Op->IsRV64 = IsRV64;
+ return Op;
+ }
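A standalone sketch of the vtype packing performed in createVType above, with the arithmetic spelled out (the helper and the test values are illustrative, not part of the patch):

#include <cassert>

// SEW is first divided by 8, then both fields are reduced to their log2 and
// packed as (log2(SEW/8) << 2) | log2(LMUL), matching createVType.
static unsigned encodeVType(unsigned Sew, unsigned Lmul) {
  unsigned SewLog2 = 0, LmulLog2 = 0;
  for (unsigned V = Sew / 8; V > 1; V >>= 1)
    ++SewLog2;
  for (unsigned V = Lmul; V > 1; V >>= 1)
    ++LmulLog2;
  return (SewLog2 << 2) | LmulLog2;
}

int main() {
  assert(encodeVType(8, 1) == 0);    // e8,m1
  assert(encodeVType(32, 4) == 10);  // e32,m4 -> (2 << 2) | 2
  assert(encodeVType(64, 8) == 15);  // e64,m8 -> (3 << 2) | 3
  return 0;
}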
+
void addExpr(MCInst &Inst, const MCExpr *Expr) const {
assert(Expr && "Expr shouldn't be null!");
int64_t Imm = 0;
@@ -718,6 +889,16 @@ public:
addExpr(Inst, getImm());
}
+ void addSImm5Plus1Operands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ int64_t Imm = 0;
+ RISCVMCExpr::VariantKind VK = RISCVMCExpr::VK_RISCV_None;
+ bool IsConstant = evaluateConstantImm(getImm(), Imm, VK);
+ assert(IsConstant && "Expect constant value!");
+ (void)IsConstant;
+ Inst.addOperand(MCOperand::createImm(Imm - 1));
+ }
+
void addFenceArgOperands(MCInst &Inst, unsigned N) const {
assert(N == 1 && "Invalid number of operands!");
// isFenceArg has validated the operand, meaning this cast is safe
@@ -742,6 +923,11 @@ public:
Inst.addOperand(MCOperand::createImm(SysReg.Encoding));
}
+ void addVTypeIOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createImm(VType.Encoding));
+ }
+
// Returns the rounding mode represented by this RISCVOperand. Should only
// be called after checking isFRMArg.
RISCVFPRndMode::RoundingMode getRoundingMode() const {
@@ -819,6 +1005,8 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
default:
break;
case Match_Success:
+ if (validateInstruction(Inst, Operands))
+ return true;
return processInstruction(Inst, IDLoc, Operands, Out);
case Match_MissingFeature: {
assert(MissingFeatures.any() && "Unknown missing features!");
@@ -885,6 +1073,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
if (isRV64())
return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 6) - 1);
return generateImmOutOfRangeError(Operands, ErrorInfo, 1, (1 << 5) - 1);
+ case Match_InvalidUImmLog2XLenHalf:
+ if (isRV64())
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
+ return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 4) - 1);
case Match_InvalidUImm5:
return generateImmOutOfRangeError(Operands, ErrorInfo, 0, (1 << 5) - 1);
case Match_InvalidSImm6:
@@ -975,6 +1167,10 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, "operand must be a bare symbol name");
}
+ case Match_InvalidPseudoJumpSymbol: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be a valid jump target");
+ }
case Match_InvalidCallSymbol: {
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, "operand must be a bare symbol name");
@@ -983,6 +1179,20 @@ bool RISCVAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
return Error(ErrorLoc, "operand must be a symbol with %tprel_add modifier");
}
+ case Match_InvalidVTypeI: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc,
+ "operand must be e[8|16|32|64|128|256|512|1024],m[1|2|4|8]");
+ }
+ case Match_InvalidVMaskRegister: {
+ SMLoc ErrorLoc = ((RISCVOperand &)*Operands[ErrorInfo]).getStartLoc();
+ return Error(ErrorLoc, "operand must be v0.t");
+ }
+ case Match_InvalidSImm5Plus1: {
+ return generateImmOutOfRangeError(Operands, ErrorInfo, -(1 << 4) + 1,
+ (1 << 4),
+ "immediate must be in the range");
+ }
}
llvm_unreachable("Unknown match type detected!");
@@ -1009,17 +1219,25 @@ static bool matchRegisterNameHelper(bool IsRV32E, Register &RegNo,
bool RISCVAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ if (tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success)
+ return Error(StartLoc, "invalid register name");
+ return false;
+}
+
+OperandMatchResultTy RISCVAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
const AsmToken &Tok = getParser().getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
RegNo = 0;
StringRef Name = getLexer().getTok().getIdentifier();
- if (matchRegisterNameHelper(isRV32E(), (Register&)RegNo, Name))
- return Error(StartLoc, "invalid register name");
+ if (matchRegisterNameHelper(isRV32E(), (Register &)RegNo, Name))
+ return MatchOperand_NoMatch;
getParser().Lex(); // Eat identifier token.
- return false;
+ return MatchOperand_Success;
}
OperandMatchResultTy RISCVAsmParser::parseRegister(OperandVector &Operands,
@@ -1112,6 +1330,8 @@ RISCVAsmParser::parseCSRSystemRegister(OperandVector &Operands) {
return MatchOperand_ParseFail;
auto SysReg = RISCVSysReg::lookupSysRegByName(Identifier);
+ if (!SysReg)
+ SysReg = RISCVSysReg::lookupSysRegByAltName(Identifier);
// Accept a named Sys Reg if the required features are present.
if (SysReg) {
if (!SysReg->haveRequiredFeatures(getSTI().getFeatureBits())) {
@@ -1286,6 +1506,27 @@ OperandMatchResultTy RISCVAsmParser::parseCallSymbol(OperandVector &Operands) {
return MatchOperand_Success;
}
+OperandMatchResultTy
+RISCVAsmParser::parsePseudoJumpSymbol(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ const MCExpr *Res;
+
+ if (getParser().parseExpression(Res))
+ return MatchOperand_ParseFail;
+
+ if (Res->getKind() != MCExpr::ExprKind::SymbolRef ||
+ cast<MCSymbolRefExpr>(Res)->getKind() ==
+ MCSymbolRefExpr::VariantKind::VK_PLT) {
+ Error(S, "operand must be a valid jump target");
+ return MatchOperand_ParseFail;
+ }
+
+ Res = RISCVMCExpr::create(Res, RISCVMCExpr::VK_RISCV_CALL, getContext());
+ Operands.push_back(RISCVOperand::createImm(Res, S, E, isRV64()));
+ return MatchOperand_Success;
+}
+
OperandMatchResultTy RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
// Parsing jal operands is fiddly due to the `jal foo` and `jal ra, foo`
// both being acceptable forms. When parsing `jal ra, foo` this function
@@ -1303,6 +1544,74 @@ OperandMatchResultTy RISCVAsmParser::parseJALOffset(OperandVector &Operands) {
return parseImmediate(Operands);
}
+OperandMatchResultTy RISCVAsmParser::parseVTypeI(OperandVector &Operands) {
+ SMLoc S = getLoc();
+ if (getLexer().getKind() != AsmToken::Identifier)
+ return MatchOperand_NoMatch;
+
+ // Parse "e8,m1"
+ StringRef Name = getLexer().getTok().getIdentifier();
+ if (!Name.consume_front("e"))
+ return MatchOperand_NoMatch;
+ APInt Sew(16, Name, 10);
+ if (Sew != 8 && Sew != 16 && Sew != 32 && Sew != 64 && Sew != 128 &&
+ Sew != 256 && Sew != 512 && Sew != 1024)
+ return MatchOperand_NoMatch;
+ getLexer().Lex();
+
+ if (getLexer().getKind() == AsmToken::EndOfStatement) {
+ Operands.push_back(
+ RISCVOperand::createVType(Sew, APInt(16, 1), S, isRV64()));
+
+ return MatchOperand_Success;
+ }
+
+ if (!getLexer().is(AsmToken::Comma))
+ return MatchOperand_NoMatch;
+ getLexer().Lex();
+
+ Name = getLexer().getTok().getIdentifier();
+ if (!Name.consume_front("m"))
+ return MatchOperand_NoMatch;
+ APInt Lmul(16, Name, 10);
+ if (Lmul != 1 && Lmul != 2 && Lmul != 4 && Lmul != 8)
+ return MatchOperand_NoMatch;
+ getLexer().Lex();
+
+ if (getLexer().getKind() != AsmToken::EndOfStatement)
+ return MatchOperand_NoMatch;
+
+ Operands.push_back(RISCVOperand::createVType(Sew, Lmul, S, isRV64()));
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy RISCVAsmParser::parseMaskReg(OperandVector &Operands) {
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+ case AsmToken::Identifier:
+ StringRef Name = getLexer().getTok().getIdentifier();
+ if (!Name.consume_back(".t")) {
+ Error(getLoc(), "expected '.t' suffix");
+ return MatchOperand_ParseFail;
+ }
+ Register RegNo;
+ matchRegisterNameHelper(isRV32E(), RegNo, Name);
+
+ if (RegNo == RISCV::NoRegister)
+ return MatchOperand_NoMatch;
+ if (RegNo != RISCV::V0)
+ return MatchOperand_NoMatch;
+ SMLoc S = getLoc();
+ SMLoc E = SMLoc::getFromPointer(S.getPointer() - 1);
+ getLexer().Lex();
+ Operands.push_back(RISCVOperand::createReg(RegNo, S, E, isRV64()));
+ }
+
+ return MatchOperand_Success;
+}
+
OperandMatchResultTy
RISCVAsmParser::parseMemOpBaseReg(OperandVector &Operands) {
if (getLexer().isNot(AsmToken::LParen)) {
@@ -1532,6 +1841,8 @@ bool RISCVAsmParser::ParseDirective(AsmToken DirectiveID) {
if (IDVal == ".option")
return parseDirectiveOption();
+ else if (IDVal == ".attribute")
+ return parseDirectiveAttribute();
return true;
}
@@ -1598,6 +1909,30 @@ bool RISCVAsmParser::parseDirectiveOption() {
return false;
}
+ if (Option == "pic") {
+ getTargetStreamer().emitDirectiveOptionPIC();
+
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+
+ ParserOptions.IsPicEnabled = true;
+ return false;
+ }
+
+ if (Option == "nopic") {
+ getTargetStreamer().emitDirectiveOptionNoPIC();
+
+ Parser.Lex();
+ if (Parser.getTok().isNot(AsmToken::EndOfStatement))
+ return Error(Parser.getTok().getLoc(),
+ "unexpected token, expected end of statement");
+
+ ParserOptions.IsPicEnabled = false;
+ return false;
+ }
+
if (Option == "relax") {
getTargetStreamer().emitDirectiveOptionRelax();
@@ -1630,12 +1965,157 @@ bool RISCVAsmParser::parseDirectiveOption() {
return false;
}
+/// parseDirectiveAttribute
+/// ::= .attribute expression ',' ( expression | "string" )
+/// ::= .attribute identifier ',' ( expression | "string" )
+bool RISCVAsmParser::parseDirectiveAttribute() {
+ MCAsmParser &Parser = getParser();
+ int64_t Tag;
+ SMLoc TagLoc;
+ TagLoc = Parser.getTok().getLoc();
+ if (Parser.getTok().is(AsmToken::Identifier)) {
+ StringRef Name = Parser.getTok().getIdentifier();
+ Optional<unsigned> Ret =
+ ELFAttrs::attrTypeFromString(Name, RISCVAttrs::RISCVAttributeTags);
+ if (!Ret.hasValue()) {
+ Error(TagLoc, "attribute name not recognised: " + Name);
+ return false;
+ }
+ Tag = Ret.getValue();
+ Parser.Lex();
+ } else {
+ const MCExpr *AttrExpr;
+
+ TagLoc = Parser.getTok().getLoc();
+ if (Parser.parseExpression(AttrExpr))
+ return true;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(AttrExpr);
+ if (check(!CE, TagLoc, "expected numeric constant"))
+ return true;
+
+ Tag = CE->getValue();
+ }
+
+ if (Parser.parseToken(AsmToken::Comma, "comma expected"))
+ return true;
+
+ StringRef StringValue;
+ int64_t IntegerValue = 0;
+ bool IsIntegerValue = true;
+
+ // RISC-V attributes have a string value if the tag number is odd
+ // and an integer value if the tag number is even.
+ if (Tag % 2)
+ IsIntegerValue = false;
+
+ SMLoc ValueExprLoc = Parser.getTok().getLoc();
+ if (IsIntegerValue) {
+ const MCExpr *ValueExpr;
+ if (Parser.parseExpression(ValueExpr))
+ return true;
+
+ const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(ValueExpr);
+ if (!CE)
+ return Error(ValueExprLoc, "expected numeric constant");
+ IntegerValue = CE->getValue();
+ } else {
+ if (Parser.getTok().isNot(AsmToken::String))
+ return Error(Parser.getTok().getLoc(), "expected string constant");
+
+ StringValue = Parser.getTok().getStringContents();
+ Parser.Lex();
+ }
+
+ if (Parser.parseToken(AsmToken::EndOfStatement,
+ "unexpected token in '.attribute' directive"))
+ return true;
+
+ if (Tag == RISCVAttrs::ARCH) {
+ StringRef Arch = StringValue;
+ if (Arch.consume_front("rv32"))
+ clearFeatureBits(RISCV::Feature64Bit, "64bit");
+ else if (Arch.consume_front("rv64"))
+ setFeatureBits(RISCV::Feature64Bit, "64bit");
+ else
+ return Error(ValueExprLoc, "bad arch string " + Arch);
+
+ while (!Arch.empty()) {
+ if (Arch[0] == 'i')
+ clearFeatureBits(RISCV::FeatureRV32E, "e");
+ else if (Arch[0] == 'e')
+ setFeatureBits(RISCV::FeatureRV32E, "e");
+ else if (Arch[0] == 'g') {
+ clearFeatureBits(RISCV::FeatureRV32E, "e");
+ setFeatureBits(RISCV::FeatureStdExtM, "m");
+ setFeatureBits(RISCV::FeatureStdExtA, "a");
+ setFeatureBits(RISCV::FeatureStdExtF, "f");
+ setFeatureBits(RISCV::FeatureStdExtD, "d");
+ } else if (Arch[0] == 'm')
+ setFeatureBits(RISCV::FeatureStdExtM, "m");
+ else if (Arch[0] == 'a')
+ setFeatureBits(RISCV::FeatureStdExtA, "a");
+ else if (Arch[0] == 'f')
+ setFeatureBits(RISCV::FeatureStdExtF, "f");
+ else if (Arch[0] == 'd') {
+ setFeatureBits(RISCV::FeatureStdExtF, "f");
+ setFeatureBits(RISCV::FeatureStdExtD, "d");
+ } else if (Arch[0] == 'c') {
+ setFeatureBits(RISCV::FeatureStdExtC, "c");
+ } else
+ return Error(ValueExprLoc, "bad arch string " + Arch);
+
+ Arch = Arch.drop_front(1);
+ int major = 0;
+ int minor = 0;
+ Arch.consumeInteger(10, major);
+ Arch.consume_front("p");
+ Arch.consumeInteger(10, minor);
+ if (major != 0 || minor != 0) {
+ Arch = Arch.drop_until([](char c) { return c == '_' || c == '"'; });
+ Arch = Arch.drop_while([](char c) { return c == '_'; });
+ }
+ }
+ }
+
+ if (IsIntegerValue)
+ getTargetStreamer().emitAttribute(Tag, IntegerValue);
+ else {
+ if (Tag != RISCVAttrs::ARCH) {
+ getTargetStreamer().emitTextAttribute(Tag, StringValue);
+ } else {
+ std::string formalArchStr = "rv32";
+ if (getFeatureBits(RISCV::Feature64Bit))
+ formalArchStr = "rv64";
+ if (getFeatureBits(RISCV::FeatureRV32E))
+ formalArchStr = (Twine(formalArchStr) + "e1p9").str();
+ else
+ formalArchStr = (Twine(formalArchStr) + "i2p0").str();
+
+ if (getFeatureBits(RISCV::FeatureStdExtM))
+ formalArchStr = (Twine(formalArchStr) + "_m2p0").str();
+ if (getFeatureBits(RISCV::FeatureStdExtA))
+ formalArchStr = (Twine(formalArchStr) + "_a2p0").str();
+ if (getFeatureBits(RISCV::FeatureStdExtF))
+ formalArchStr = (Twine(formalArchStr) + "_f2p0").str();
+ if (getFeatureBits(RISCV::FeatureStdExtD))
+ formalArchStr = (Twine(formalArchStr) + "_d2p0").str();
+ if (getFeatureBits(RISCV::FeatureStdExtC))
+ formalArchStr = (Twine(formalArchStr) + "_c2p0").str();
+
+ getTargetStreamer().emitTextAttribute(Tag, formalArchStr);
+ }
+ }
+
+ return false;
+}
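The odd/even rule in the comment above determines whether the directive expects a string or an integer; a minimal sketch (the enum name and tag numbers are illustrative only):

#include <cassert>
#include <cstdint>

// Even-numbered tags carry a ULEB128 integer, odd-numbered tags carry a
// NUL-terminated string, as assumed by parseDirectiveAttribute.
enum class AttrValueKind { Integer, String };

static AttrValueKind valueKindForTag(int64_t Tag) {
  return (Tag % 2) ? AttrValueKind::String : AttrValueKind::Integer;
}

int main() {
  assert(valueKindForTag(4) == AttrValueKind::Integer); // even tag -> integer
  assert(valueKindForTag(5) == AttrValueKind::String);  // odd tag -> string
  return 0;
}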
+
void RISCVAsmParser::emitToStreamer(MCStreamer &S, const MCInst &Inst) {
MCInst CInst;
bool Res = compressInst(CInst, Inst, getSTI(), S.getContext());
if (Res)
++RISCVNumInstrsCompressed;
- S.EmitInstruction((Res ? CInst : Inst), getSTI());
+ S.emitInstruction((Res ? CInst : Inst), getSTI());
}
void RISCVAsmParser::emitLoadImm(Register DestReg, int64_t Value,
@@ -1671,7 +2151,7 @@ void RISCVAsmParser::emitAuipcInstPair(MCOperand DestReg, MCOperand TmpReg,
MCSymbol *TmpLabel = Ctx.createTempSymbol(
"pcrel_hi", /* AlwaysAddSuffix */ true, /* CanBeUnnamed */ false);
- Out.EmitLabel(TmpLabel);
+ Out.emitLabel(TmpLabel);
const RISCVMCExpr *SymbolHi = RISCVMCExpr::create(Symbol, VKHi, Ctx);
emitToStreamer(
@@ -1716,8 +2196,7 @@ void RISCVAsmParser::emitLoadAddress(MCInst &Inst, SMLoc IDLoc,
const MCExpr *Symbol = Inst.getOperand(1).getExpr();
unsigned SecondOpcode;
RISCVMCExpr::VariantKind VKHi;
- // FIXME: Should check .option (no)pic when implemented
- if (getContext().getObjectFileInfo()->isPositionIndependent()) {
+ if (ParserOptions.IsPicEnabled) {
SecondOpcode = isRV64() ? RISCV::LD : RISCV::LW;
VKHi = RISCVMCExpr::VK_RISCV_GOT_HI;
} else {
@@ -1788,6 +2267,89 @@ bool RISCVAsmParser::checkPseudoAddTPRel(MCInst &Inst,
return false;
}
+std::unique_ptr<RISCVOperand> RISCVAsmParser::defaultMaskRegOp() const {
+ return RISCVOperand::createReg(RISCV::NoRegister, llvm::SMLoc(),
+ llvm::SMLoc(), isRV64());
+}
+
+bool RISCVAsmParser::validateInstruction(MCInst &Inst,
+ OperandVector &Operands) {
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+ unsigned TargetFlags =
+ (MCID.TSFlags >> RISCV::ConstraintOffset) & RISCV::ConstraintMask;
+ if (TargetFlags == RISCV::NoConstraint)
+ return false;
+
+ unsigned DestReg = Inst.getOperand(0).getReg();
+ // Operands[1] will be the first operand, DestReg.
+ SMLoc Loc = Operands[1]->getStartLoc();
+ if ((TargetFlags == RISCV::WidenV) || (TargetFlags == RISCV::WidenW) ||
+ (TargetFlags == RISCV::SlideUp) || (TargetFlags == RISCV::Vrgather) ||
+ (TargetFlags == RISCV::Vcompress)) {
+ if (TargetFlags != RISCV::WidenW) {
+ unsigned Src2Reg = Inst.getOperand(1).getReg();
+ if (DestReg == Src2Reg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ if (TargetFlags == RISCV::WidenV) {
+        // Assume DestReg LMUL is at least 2 for widening/narrowing operations.
+ if (DestReg + 1 == Src2Reg)
+ return Error(Loc,
+ "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ }
+ }
+ if (Inst.getOperand(2).isReg()) {
+ unsigned Src1Reg = Inst.getOperand(2).getReg();
+ if (DestReg == Src1Reg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ if (TargetFlags == RISCV::WidenV || TargetFlags == RISCV::WidenW) {
+        // Assume DestReg LMUL is at least 2 for widening/narrowing operations.
+ if (DestReg + 1 == Src1Reg)
+ return Error(Loc,
+ "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ }
+ }
+ if (Inst.getNumOperands() == 4) {
+ unsigned MaskReg = Inst.getOperand(3).getReg();
+
+ if (DestReg == MaskReg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the mask register.");
+ }
+ } else if (TargetFlags == RISCV::Narrow) {
+ unsigned Src2Reg = Inst.getOperand(1).getReg();
+ if (DestReg == Src2Reg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the source vector register group.");
+    // Assume Src2Reg LMUL is at least 2 for widening/narrowing operations.
+ if (DestReg == Src2Reg + 1)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ } else if (TargetFlags == RISCV::WidenCvt || TargetFlags == RISCV::Iota) {
+ unsigned Src2Reg = Inst.getOperand(1).getReg();
+ if (DestReg == Src2Reg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ if (TargetFlags == RISCV::WidenCvt) {
+      // Assume DestReg LMUL is at least 2 for widening/narrowing operations.
+ if (DestReg + 1 == Src2Reg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the source vector register group.");
+ }
+ if (Inst.getNumOperands() == 3) {
+ unsigned MaskReg = Inst.getOperand(2).getReg();
+
+ if (DestReg == MaskReg)
+ return Error(Loc, "The destination vector register group cannot overlap"
+ " the mask register.");
+ }
+ }
+ return false;
+}
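A rough model of the overlap rule the widening cases above enforce, assuming the destination group spans at least {DestReg, DestReg + 1} (plain unsigned register numbers stand in for MCRegister ids):

#include <cassert>

// A single-width source register overlaps the widened destination group if it
// equals either register of the assumed two-register group.
static bool widenDestOverlapsSrc(unsigned DestReg, unsigned SrcReg) {
  return SrcReg == DestReg || SrcReg == DestReg + 1;
}

int main() {
  assert(widenDestOverlapsSrc(8, 8));   // same register
  assert(widenDestOverlapsSrc(8, 9));   // second register of the group
  assert(!widenDestOverlapsSrc(8, 10)); // disjoint
  return 0;
}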
+
bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc,
OperandVector &Operands,
MCStreamer &Out) {
diff --git a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
index 1461a40227bf..37edc19398a5 100644
--- a/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
+++ b/llvm/lib/Target/RISCV/Disassembler/RISCVDisassembler.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCDisassembler/MCDisassembler.h"
#include "llvm/MC/MCFixedLenDisassembler.h"
#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/Support/Endian.h"
@@ -31,10 +32,12 @@ typedef MCDisassembler::DecodeStatus DecodeStatus;
namespace {
class RISCVDisassembler : public MCDisassembler {
+ std::unique_ptr<MCInstrInfo const> const MCII;
public:
- RISCVDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
- : MCDisassembler(STI, Ctx) {}
+ RISCVDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
+ MCInstrInfo const *MCII)
+ : MCDisassembler(STI, Ctx), MCII(MCII) {}
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
@@ -45,7 +48,7 @@ public:
static MCDisassembler *createRISCVDisassembler(const Target &T,
const MCSubtargetInfo &STI,
MCContext &Ctx) {
- return new RISCVDisassembler(STI, Ctx);
+ return new RISCVDisassembler(STI, Ctx, T.createMCInstrInfo());
}
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVDisassembler() {
@@ -148,6 +151,33 @@ static DecodeStatus DecodeGPRCRegisterClass(MCInst &Inst, uint64_t RegNo,
return MCDisassembler::Success;
}
+static DecodeStatus DecodeVRRegisterClass(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo >= 32)
+ return MCDisassembler::Fail;
+
+ Register Reg = RISCV::V0 + RegNo;
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus decodeVMaskReg(MCInst &Inst, uint64_t RegNo,
+ uint64_t Address, const void *Decoder) {
+ Register Reg = RISCV::NoRegister;
+ switch (RegNo) {
+ default:
+ return MCDisassembler::Fail;
+ case 0:
+ Reg = RISCV::V0;
+ break;
+ case 1:
+ break;
+ }
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
// Add the implied SP operand for *SP compressed instructions. The SP operand
// isn't explicitly encoded in the instruction.
static void addImplySP(MCInst &Inst, int64_t Address, const void *Decoder) {
@@ -349,6 +379,19 @@ DecodeStatus RISCVDisassembler::getInstruction(MCInst &MI, uint64_t &Size,
}
}
+ if (STI.getFeatureBits()[RISCV::FeatureExtZbproposedc] &&
+ STI.getFeatureBits()[RISCV::FeatureStdExtC]) {
+ LLVM_DEBUG(
+        dbgs() << "Trying RVBC16 table (BitManip 16-bit Instruction):\n");
+ // Calling the auto-generated decoder function.
+ Result = decodeInstruction(DecoderTableRVBC16, MI, Insn, Address,
+ this, STI);
+ if (Result != MCDisassembler::Fail) {
+ Size = 2;
+ return Result;
+ }
+ }
+
LLVM_DEBUG(dbgs() << "Trying RISCV_C table (16-bit Instruction):\n");
// Calling the auto-generated decoder function.
Result = decodeInstruction(DecoderTable16, MI, Insn, Address, this, STI);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
index 5881a0a86ef7..bb1f1cc7f49a 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.cpp
@@ -9,6 +9,7 @@
#include "RISCVAsmBackend.h"
#include "RISCVMCExpr.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDirectives.h"
@@ -22,14 +23,75 @@
using namespace llvm;
+Optional<MCFixupKind> RISCVAsmBackend::getFixupKind(StringRef Name) const {
+ if (STI.getTargetTriple().isOSBinFormatELF()) {
+ unsigned Type;
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/RISCV.def"
+#undef ELF_RELOC
+ .Default(-1u);
+ if (Type != -1u)
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
+ }
+ return None;
+}
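A rough model of the lookup above: each ELF_RELOC(X, Y) entry in ELFRelocs/RISCV.def expands to a .Case(#X, Y), so a .reloc directive can name any RISC-V relocation directly. Only two entries are written out here and the numeric values are best treated as illustrative:

#include <cassert>
#include <string>

// Unknown names map to -1, mirroring the StringSwitch default of -1u that
// getFixupKind turns into None.
static int relocTypeFromName(const std::string &Name) {
  if (Name == "R_RISCV_32")
    return 1;
  if (Name == "R_RISCV_64")
    return 2;
  return -1;
}

int main() {
  assert(relocTypeFromName("R_RISCV_64") == 2);
  assert(relocTypeFromName("R_RISCV_BOGUS") == -1);
  return 0;
}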
+
+const MCFixupKindInfo &
+RISCVAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
+ const static MCFixupKindInfo Infos[] = {
+ // This table *must* be in the order that the fixup_* kinds are defined in
+ // RISCVFixupKinds.h.
+ //
+ // name offset bits flags
+ {"fixup_riscv_hi20", 12, 20, 0},
+ {"fixup_riscv_lo12_i", 20, 12, 0},
+ {"fixup_riscv_lo12_s", 0, 32, 0},
+ {"fixup_riscv_pcrel_hi20", 12, 20,
+ MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsTarget},
+ {"fixup_riscv_pcrel_lo12_i", 20, 12,
+ MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsTarget},
+ {"fixup_riscv_pcrel_lo12_s", 0, 32,
+ MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsTarget},
+ {"fixup_riscv_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_tprel_hi20", 12, 20, 0},
+ {"fixup_riscv_tprel_lo12_i", 20, 12, 0},
+ {"fixup_riscv_tprel_lo12_s", 0, 32, 0},
+ {"fixup_riscv_tprel_add", 0, 0, 0},
+ {"fixup_riscv_tls_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_tls_gd_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_call_plt", 0, 64, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_riscv_relax", 0, 0, 0},
+ {"fixup_riscv_align", 0, 0, 0}};
+ static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
+ "Not all fixup kinds added to Infos array");
+
+ // Fixup kinds from .reloc directive are like R_RISCV_NONE. They
+ // do not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+}
+
// If linker relaxation is enabled, or the relax option had previously been
// enabled, always emit relocations even if the fixup can be resolved. This is
// necessary for correctness as offsets may change during relaxation.
bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
const MCFixup &Fixup,
const MCValue &Target) {
- bool ShouldForce = false;
-
+ if (Fixup.getKind() >= FirstLiteralRelocationKind)
+ return true;
switch (Fixup.getTargetKind()) {
default:
break;
@@ -44,40 +106,9 @@ bool RISCVAsmBackend::shouldForceRelocation(const MCAssembler &Asm,
case RISCV::fixup_riscv_tls_got_hi20:
case RISCV::fixup_riscv_tls_gd_hi20:
return true;
- case RISCV::fixup_riscv_pcrel_lo12_i:
- case RISCV::fixup_riscv_pcrel_lo12_s:
- // For pcrel_lo12, force a relocation if the target of the corresponding
- // pcrel_hi20 is not in the same fragment.
- const MCFixup *T = cast<RISCVMCExpr>(Fixup.getValue())->getPCRelHiFixup();
- if (!T) {
- Asm.getContext().reportError(Fixup.getLoc(),
- "could not find corresponding %pcrel_hi");
- return false;
- }
-
- switch (T->getTargetKind()) {
- default:
- llvm_unreachable("Unexpected fixup kind for pcrel_lo12");
- break;
- case RISCV::fixup_riscv_got_hi20:
- case RISCV::fixup_riscv_tls_got_hi20:
- case RISCV::fixup_riscv_tls_gd_hi20:
- ShouldForce = true;
- break;
- case RISCV::fixup_riscv_pcrel_hi20: {
- MCFragment *TFragment = T->getValue()->findAssociatedFragment();
- MCFragment *FixupFragment = Fixup.getValue()->findAssociatedFragment();
- assert(FixupFragment && "We should have a fragment for this fixup");
- ShouldForce =
- !TFragment || TFragment->getParent() != FixupFragment->getParent();
- break;
- }
- }
- break;
}
- return ShouldForce || STI.getFeatureBits()[RISCV::FeatureRelax] ||
- ForceRelocs;
+ return STI.getFeatureBits()[RISCV::FeatureRelax] || ForceRelocs;
}
bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
@@ -108,10 +139,10 @@ bool RISCVAsmBackend::fixupNeedsRelaxationAdvanced(const MCFixup &Fixup,
}
}
-void RISCVAsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void RISCVAsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
  // TODO: replace this with a call to the auto-generated uncompressinstr() function.
+ MCInst Res;
switch (Inst.getOpcode()) {
default:
llvm_unreachable("Opcode not expected!");
@@ -142,6 +173,7 @@ void RISCVAsmBackend::relaxInstruction(const MCInst &Inst,
Res.addOperand(Inst.getOperand(0));
break;
}
+ Inst = std::move(Res);
}
// Given a compressed control flow instruction this function returns
@@ -284,13 +316,77 @@ static uint64_t adjustFixupValue(const MCFixup &Fixup, uint64_t Value,
}
}
+bool RISCVAsmBackend::evaluateTargetFixup(
+ const MCAssembler &Asm, const MCAsmLayout &Layout, const MCFixup &Fixup,
+ const MCFragment *DF, const MCValue &Target, uint64_t &Value,
+ bool &WasForced) {
+ const MCFixup *AUIPCFixup;
+ const MCFragment *AUIPCDF;
+ MCValue AUIPCTarget;
+ switch (Fixup.getTargetKind()) {
+ default:
+ llvm_unreachable("Unexpected fixup kind!");
+ case RISCV::fixup_riscv_pcrel_hi20:
+ AUIPCFixup = &Fixup;
+ AUIPCDF = DF;
+ AUIPCTarget = Target;
+ break;
+ case RISCV::fixup_riscv_pcrel_lo12_i:
+ case RISCV::fixup_riscv_pcrel_lo12_s: {
+ AUIPCFixup = cast<RISCVMCExpr>(Fixup.getValue())->getPCRelHiFixup(&AUIPCDF);
+ if (!AUIPCFixup) {
+ Asm.getContext().reportError(Fixup.getLoc(),
+ "could not find corresponding %pcrel_hi");
+ return true;
+ }
+
+ // MCAssembler::evaluateFixup will emit an error for this case when it sees
+ // the %pcrel_hi, so don't duplicate it when also seeing the %pcrel_lo.
+ const MCExpr *AUIPCExpr = AUIPCFixup->getValue();
+ if (!AUIPCExpr->evaluateAsRelocatable(AUIPCTarget, &Layout, AUIPCFixup))
+ return true;
+ break;
+ }
+ }
+
+ if (!AUIPCTarget.getSymA() || AUIPCTarget.getSymB())
+ return false;
+
+ const MCSymbolRefExpr *A = AUIPCTarget.getSymA();
+ const MCSymbol &SA = A->getSymbol();
+ if (A->getKind() != MCSymbolRefExpr::VK_None || SA.isUndefined())
+ return false;
+
+ auto *Writer = Asm.getWriterPtr();
+ if (!Writer)
+ return false;
+
+ bool IsResolved = Writer->isSymbolRefDifferenceFullyResolvedImpl(
+ Asm, SA, *AUIPCDF, false, true);
+ if (!IsResolved)
+ return false;
+
+ Value = Layout.getSymbolOffset(SA) + AUIPCTarget.getConstant();
+ Value -= Layout.getFragmentOffset(AUIPCDF) + AUIPCFixup->getOffset();
+
+ if (shouldForceRelocation(Asm, *AUIPCFixup, AUIPCTarget)) {
+ WasForced = true;
+ return false;
+ }
+
+ return true;
+}
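The value computed at the end of evaluateTargetFixup is the pc-relative distance from the auipc (the %pcrel_hi site) to the target symbol, with the addend folded in. A worked sketch under simplified section-relative offsets (the real code queries MCAsmLayout):

#include <cassert>
#include <cstdint>

static uint64_t pcrelHiValue(uint64_t TargetSymbolOffset, int64_t Addend,
                             uint64_t AuipcFragmentOffset,
                             uint64_t AuipcFixupOffset) {
  return TargetSymbolOffset + Addend -
         (AuipcFragmentOffset + AuipcFixupOffset);
}

int main() {
  // auipc at offset 0x10 of a fragment placed at 0x1000, target at 0x2000.
  assert(pcrelHiValue(0x2000, 0, 0x1000, 0x10) == 0xff0);
  return 0;
}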
+
void RISCVAsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target,
MutableArrayRef<char> Data, uint64_t Value,
bool IsResolved,
const MCSubtargetInfo *STI) const {
+ MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
MCContext &Ctx = Asm.getContext();
- MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+ MCFixupKindInfo Info = getFixupKindInfo(Kind);
if (!Value)
return; // Doesn't change encoding.
// Apply any target-specific value adjustments.
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
index 254249c87dc8..090132af3585 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVAsmBackend.h
@@ -65,6 +65,11 @@ public:
const MCAsmLayout &Layout,
MCAlignFragment &AF) override;
+ bool evaluateTargetFixup(const MCAssembler &Asm, const MCAsmLayout &Layout,
+ const MCFixup &Fixup, const MCFragment *DF,
+ const MCValue &Target, uint64_t &Value,
+ bool &WasForced) override;
+
void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target, MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
@@ -92,52 +97,16 @@ public:
return RISCV::NumTargetFixupKinds;
}
- const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
- const static MCFixupKindInfo Infos[] = {
- // This table *must* be in the order that the fixup_* kinds are defined in
- // RISCVFixupKinds.h.
- //
- // name offset bits flags
- { "fixup_riscv_hi20", 12, 20, 0 },
- { "fixup_riscv_lo12_i", 20, 12, 0 },
- { "fixup_riscv_lo12_s", 0, 32, 0 },
- { "fixup_riscv_pcrel_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_pcrel_lo12_i", 20, 12, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_pcrel_lo12_s", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_tprel_hi20", 12, 20, 0 },
- { "fixup_riscv_tprel_lo12_i", 20, 12, 0 },
- { "fixup_riscv_tprel_lo12_s", 0, 32, 0 },
- { "fixup_riscv_tprel_add", 0, 0, 0 },
- { "fixup_riscv_tls_got_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_tls_gd_hi20", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_jal", 12, 20, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_branch", 0, 32, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_rvc_jump", 2, 11, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_rvc_branch", 0, 16, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_call", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_call_plt", 0, 64, MCFixupKindInfo::FKF_IsPCRel },
- { "fixup_riscv_relax", 0, 0, 0 },
- { "fixup_riscv_align", 0, 0, 0 }
- };
- static_assert((array_lengthof(Infos)) == RISCV::NumTargetFixupKinds,
- "Not all fixup kinds added to Infos array");
-
- if (Kind < FirstTargetFixupKind)
- return MCAsmBackend::getFixupKindInfo(Kind);
-
- assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
- "Invalid kind!");
- return Infos[Kind - FirstTargetFixupKind];
- }
+ Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
bool mayNeedRelaxation(const MCInst &Inst,
const MCSubtargetInfo &STI) const override;
unsigned getRelaxedOpcode(unsigned Op) const;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
-
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
index 08b75795ed4b..b38ba2bff582 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFObjectWriter.cpp
@@ -52,6 +52,8 @@ unsigned RISCVELFObjectWriter::getRelocType(MCContext &Ctx,
const MCExpr *Expr = Fixup.getValue();
// Determine the type of the relocation
unsigned Kind = Fixup.getTargetKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
if (IsPCRel) {
switch (Kind) {
default:
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
index 40fa195f3790..079dc919928a 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.cpp
@@ -15,14 +15,18 @@
#include "RISCVMCTargetDesc.h"
#include "Utils/RISCVBaseInfo.h"
#include "llvm/BinaryFormat/ELF.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/Support/LEB128.h"
+#include "llvm/Support/RISCVAttributes.h"
using namespace llvm;
// This part is for ELF object output.
RISCVTargetELFStreamer::RISCVTargetELFStreamer(MCStreamer &S,
const MCSubtargetInfo &STI)
- : RISCVTargetStreamer(S) {
+ : RISCVTargetStreamer(S), CurrentVendor("riscv") {
MCAssembler &MCA = getStreamer().getAssembler();
const FeatureBitset &Features = STI.getFeatureBits();
auto &MAB = static_cast<RISCVAsmBackend &>(MCA.getBackend());
@@ -62,7 +66,104 @@ MCELFStreamer &RISCVTargetELFStreamer::getStreamer() {
void RISCVTargetELFStreamer::emitDirectiveOptionPush() {}
void RISCVTargetELFStreamer::emitDirectiveOptionPop() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionPIC() {}
+void RISCVTargetELFStreamer::emitDirectiveOptionNoPIC() {}
void RISCVTargetELFStreamer::emitDirectiveOptionRVC() {}
void RISCVTargetELFStreamer::emitDirectiveOptionNoRVC() {}
void RISCVTargetELFStreamer::emitDirectiveOptionRelax() {}
void RISCVTargetELFStreamer::emitDirectiveOptionNoRelax() {}
+
+void RISCVTargetELFStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+ setAttributeItem(Attribute, Value, /*OverwriteExisting=*/true);
+}
+
+void RISCVTargetELFStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef String) {
+ setAttributeItem(Attribute, String, /*OverwriteExisting=*/true);
+}
+
+void RISCVTargetELFStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {
+ setAttributeItems(Attribute, IntValue, StringValue,
+ /*OverwriteExisting=*/true);
+}
+
+void RISCVTargetELFStreamer::finishAttributeSection() {
+ if (Contents.empty())
+ return;
+
+ if (AttributeSection) {
+ Streamer.SwitchSection(AttributeSection);
+ } else {
+ MCAssembler &MCA = getStreamer().getAssembler();
+ AttributeSection = MCA.getContext().getELFSection(
+ ".riscv.attributes", ELF::SHT_RISCV_ATTRIBUTES, 0);
+ Streamer.SwitchSection(AttributeSection);
+
+ Streamer.emitInt8(ELFAttrs::Format_Version);
+ }
+
+ // Vendor size + Vendor name + '\0'
+ const size_t VendorHeaderSize = 4 + CurrentVendor.size() + 1;
+
+ // Tag + Tag Size
+ const size_t TagHeaderSize = 1 + 4;
+
+ const size_t ContentsSize = calculateContentSize();
+
+ Streamer.emitInt32(VendorHeaderSize + TagHeaderSize + ContentsSize);
+ Streamer.emitBytes(CurrentVendor);
+ Streamer.emitInt8(0); // '\0'
+
+ Streamer.emitInt8(ELFAttrs::File);
+ Streamer.emitInt32(TagHeaderSize + ContentsSize);
+
+  // The size has already been accounted for; now emit each field as its type
+  // (ULEB128 or string).
+ for (AttributeItem item : Contents) {
+ Streamer.emitULEB128IntValue(item.Tag);
+ switch (item.Type) {
+ default:
+ llvm_unreachable("Invalid attribute type");
+ case AttributeType::Numeric:
+ Streamer.emitULEB128IntValue(item.IntValue);
+ break;
+ case AttributeType::Text:
+ Streamer.emitBytes(item.StringValue);
+ Streamer.emitInt8(0); // '\0'
+ break;
+ case AttributeType::NumericAndText:
+ Streamer.emitULEB128IntValue(item.IntValue);
+ Streamer.emitBytes(item.StringValue);
+ Streamer.emitInt8(0); // '\0'
+ break;
+ }
+ }
+
+ Contents.clear();
+}
+
+size_t RISCVTargetELFStreamer::calculateContentSize() const {
+ size_t Result = 0;
+ for (AttributeItem item : Contents) {
+ switch (item.Type) {
+ case AttributeType::Hidden:
+ break;
+ case AttributeType::Numeric:
+ Result += getULEB128Size(item.Tag);
+ Result += getULEB128Size(item.IntValue);
+ break;
+ case AttributeType::Text:
+ Result += getULEB128Size(item.Tag);
+ Result += item.StringValue.size() + 1; // string + '\0'
+ break;
+ case AttributeType::NumericAndText:
+ Result += getULEB128Size(item.Tag);
+ Result += getULEB128Size(item.IntValue);
+      Result += item.StringValue.size() + 1; // string + '\0'
+ break;
+ }
+ }
+ return Result;
+}
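calculateContentSize leans on the ULEB128 size rule (one byte per 7 bits of payload); a small sketch mirroring llvm::getULEB128Size, written independently here:

#include <cassert>
#include <cstdint>

static unsigned uleb128Size(uint64_t Value) {
  unsigned Size = 0;
  do {
    Value >>= 7;
    ++Size;
  } while (Value != 0);
  return Size;
}

int main() {
  assert(uleb128Size(0) == 1);
  assert(uleb128Size(127) == 1);
  assert(uleb128Size(128) == 2);
  return 0;
}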
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
index 138df786eaf3..392c87054d43 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVELFStreamer.h
@@ -15,16 +15,94 @@
namespace llvm {
class RISCVTargetELFStreamer : public RISCVTargetStreamer {
+private:
+ enum class AttributeType { Hidden, Numeric, Text, NumericAndText };
+
+ struct AttributeItem {
+ AttributeType Type;
+ unsigned Tag;
+ unsigned IntValue;
+ std::string StringValue;
+ };
+
+ StringRef CurrentVendor;
+ SmallVector<AttributeItem, 64> Contents;
+
+ MCSection *AttributeSection = nullptr;
+
+ AttributeItem *getAttributeItem(unsigned Attribute) {
+ for (size_t i = 0; i < Contents.size(); ++i)
+ if (Contents[i].Tag == Attribute)
+ return &Contents[i];
+ return nullptr;
+ }
+
+ void setAttributeItem(unsigned Attribute, unsigned Value,
+ bool OverwriteExisting) {
+ // Look for existing attribute item.
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeType::Numeric;
+ Item->IntValue = Value;
+ return;
+ }
+
+ // Create new attribute item.
+ Contents.push_back({AttributeType::Numeric, Attribute, Value, ""});
+ }
+
+ void setAttributeItem(unsigned Attribute, StringRef Value,
+ bool OverwriteExisting) {
+ // Look for existing attribute item.
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeType::Text;
+ Item->StringValue = std::string(Value);
+ return;
+ }
+
+ // Create new attribute item.
+ Contents.push_back({AttributeType::Text, Attribute, 0, std::string(Value)});
+ }
+
+ void setAttributeItems(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue, bool OverwriteExisting) {
+ // Look for existing attribute item.
+ if (AttributeItem *Item = getAttributeItem(Attribute)) {
+ if (!OverwriteExisting)
+ return;
+ Item->Type = AttributeType::NumericAndText;
+ Item->IntValue = IntValue;
+ Item->StringValue = std::string(StringValue);
+ return;
+ }
+
+ // Create new attribute item.
+ Contents.push_back({AttributeType::NumericAndText, Attribute, IntValue,
+ std::string(StringValue)});
+ }
+
+ void emitAttribute(unsigned Attribute, unsigned Value) override;
+ void emitTextAttribute(unsigned Attribute, StringRef String) override;
+ void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue) override;
+ void finishAttributeSection() override;
+ size_t calculateContentSize() const;
+
public:
MCELFStreamer &getStreamer();
RISCVTargetELFStreamer(MCStreamer &S, const MCSubtargetInfo &STI);
- virtual void emitDirectiveOptionPush();
- virtual void emitDirectiveOptionPop();
- virtual void emitDirectiveOptionRVC();
- virtual void emitDirectiveOptionNoRVC();
- virtual void emitDirectiveOptionRelax();
- virtual void emitDirectiveOptionNoRelax();
+ void emitDirectiveOptionPush() override;
+ void emitDirectiveOptionPop() override;
+ void emitDirectiveOptionPIC() override;
+ void emitDirectiveOptionNoPIC() override;
+ void emitDirectiveOptionRVC() override;
+ void emitDirectiveOptionNoRVC() override;
+ void emitDirectiveOptionRelax() override;
+ void emitDirectiveOptionNoRelax() override;
};
}
#endif
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
index 22bb80ae34e2..eae3e13dbe40 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.cpp
@@ -73,7 +73,7 @@ void RISCVInstPrinter::printInst(const MCInst *MI, uint64_t Address,
Res = uncompressInst(UncompressedMI, *MI, MRI, STI);
if (Res)
NewMI = const_cast<MCInst *>(&UncompressedMI);
- if (NoAliases || !printAliasInstr(NewMI, STI, O))
+ if (NoAliases || !printAliasInstr(NewMI, Address, STI, O))
printInstruction(NewMI, Address, STI, O);
printAnnotation(O, Annot);
}
@@ -150,6 +150,39 @@ void RISCVInstPrinter::printAtomicMemOp(const MCInst *MI, unsigned OpNo,
return;
}
+void RISCVInstPrinter::printVTypeI(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ unsigned Imm = MI->getOperand(OpNo).getImm();
+ unsigned Sew = (Imm >> 2) & 0x7;
+ unsigned Lmul = Imm & 0x3;
+
+ Lmul = 0x1 << Lmul;
+ Sew = 0x1 << (Sew + 3);
+ O << "e" << Sew << ",m" << Lmul;
+}
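Decoding counterpart to the earlier encoding sketch; printVTypeI above unpacks the immediate the same way (standalone illustration, not part of the patch):

#include <cassert>

static void decodeVType(unsigned Imm, unsigned &Sew, unsigned &Lmul) {
  Sew = 8u << ((Imm >> 2) & 0x7);
  Lmul = 1u << (Imm & 0x3);
}

int main() {
  unsigned Sew, Lmul;
  decodeVType(10, Sew, Lmul); // 0b1010, as produced for "e32,m4"
  assert(Sew == 32 && Lmul == 4);
  return 0;
}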
+
+void RISCVInstPrinter::printVMaskReg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+
+ assert(MO.isReg() && "printVMaskReg can only print register operands");
+ if (MO.getReg() == RISCV::NoRegister)
+ return;
+ O << ", ";
+ printRegName(O, MO.getReg());
+ O << ".t";
+}
+
+void RISCVInstPrinter::printSImm5Plus1(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNo);
+
+ assert(MO.isImm() && "printSImm5Plus1 can only print constant operands");
+ O << MO.getImm() + 1;
+}
+
const char *RISCVInstPrinter::getRegisterName(unsigned RegNo) {
return getRegisterName(RegNo, ArchRegNames ? RISCV::NoRegAltName
: RISCV::ABIRegAltName);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
index aeb2ea636060..fdaa00c5f8eb 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVInstPrinter.h
@@ -17,7 +17,6 @@
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
-class MCOperand;
class RISCVInstPrinter : public MCInstPrinter {
public:
@@ -41,14 +40,20 @@ public:
raw_ostream &O);
void printAtomicMemOp(const MCInst *MI, unsigned OpNo,
const MCSubtargetInfo &STI, raw_ostream &O);
+ void printVTypeI(const MCInst *MI, unsigned OpNo, const MCSubtargetInfo &STI,
+ raw_ostream &O);
+ void printVMaskReg(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printSImm5Plus1(const MCInst *MI, unsigned OpNo,
+ const MCSubtargetInfo &STI, raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
static const char *getRegisterName(unsigned RegNo, unsigned AltIdx);
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
index de99960848a5..816206c477df 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCCodeEmitter.cpp
@@ -80,6 +80,10 @@ public:
unsigned getImmOpValue(const MCInst &MI, unsigned OpNo,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const;
+
+ unsigned getVMaskReg(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
};
} // end anonymous namespace
@@ -89,13 +93,14 @@ MCCodeEmitter *llvm::createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
return new RISCVMCCodeEmitter(Ctx, MCII);
}
-// Expand PseudoCALL(Reg) and PseudoTAIL to AUIPC and JALR with relocation
-// types. We expand PseudoCALL(Reg) and PseudoTAIL while encoding, meaning AUIPC
-// and JALR won't go through RISCV MC to MC compressed instruction
-// transformation. This is acceptable because AUIPC has no 16-bit form and
-// C_JALR have no immediate operand field. We let linker relaxation deal with
-// it. When linker relaxation enabled, AUIPC and JALR have chance relax to JAL.
-// If C extension is enabled, JAL has chance relax to C_JAL.
+// Expand PseudoCALL(Reg), PseudoTAIL and PseudoJump to AUIPC and JALR with
+// relocation types. We expand those pseudo-instructions while encoding them,
+// meaning AUIPC and JALR won't go through RISCV MC to MC compressed
+// instruction transformation. This is acceptable because AUIPC has no 16-bit
+// form and C_JALR has no immediate operand field. We let linker relaxation
+// deal with it. When linker relaxation is enabled, AUIPC and JALR have a
+// chance to relax to JAL.
+// If the C extension is enabled, JAL has a chance to relax to C_JAL.
void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
const MCSubtargetInfo &STI) const {
@@ -108,9 +113,12 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
} else if (MI.getOpcode() == RISCV::PseudoCALLReg) {
Func = MI.getOperand(1);
Ra = MI.getOperand(0).getReg();
- } else {
+ } else if (MI.getOpcode() == RISCV::PseudoCALL) {
Func = MI.getOperand(0);
Ra = RISCV::X1;
+ } else if (MI.getOpcode() == RISCV::PseudoJump) {
+ Func = MI.getOperand(1);
+ Ra = MI.getOperand(0).getReg();
}
uint32_t Binary;
@@ -125,8 +133,9 @@ void RISCVMCCodeEmitter::expandFunctionCall(const MCInst &MI, raw_ostream &OS,
Binary = getBinaryCodeForInstr(TmpInst, Fixups, STI);
support::endian::write(OS, Binary, support::little);
- if (MI.getOpcode() == RISCV::PseudoTAIL)
- // Emit JALR X0, X6, 0
+ if (MI.getOpcode() == RISCV::PseudoTAIL ||
+ MI.getOpcode() == RISCV::PseudoJump)
+ // Emit JALR X0, Ra, 0
TmpInst = MCInstBuilder(RISCV::JALR).addReg(RISCV::X0).addReg(Ra).addImm(0);
else
// Emit JALR Ra, Ra, 0
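For reference, a minimal sketch of the sequences this expansion produces, assuming a target symbol foo (register choices follow the code above; the real output carries a call-type fixup over the pair rather than a symbol operand):

    // PseudoCALL foo:
    //   auipc ra, foo        (call-type fixup covering both instructions)
    //   jalr  ra, ra, 0
    // PseudoTAIL foo (Ra = t1) and PseudoJump rd, foo (Ra = rd):
    //   auipc Ra, foo
    //   jalr  x0, Ra, 0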
@@ -180,9 +189,13 @@ void RISCVMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
// Get byte count of instruction.
unsigned Size = Desc.getSize();
+ // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded
+ // instructions for each pseudo, and must be updated when adding new pseudos
+ // or changing existing ones.
if (MI.getOpcode() == RISCV::PseudoCALLReg ||
MI.getOpcode() == RISCV::PseudoCALL ||
- MI.getOpcode() == RISCV::PseudoTAIL) {
+ MI.getOpcode() == RISCV::PseudoTAIL ||
+ MI.getOpcode() == RISCV::PseudoJump) {
expandFunctionCall(MI, OS, Fixups, STI);
MCNumEmitted += 2;
return;
@@ -368,4 +381,20 @@ unsigned RISCVMCCodeEmitter::getImmOpValue(const MCInst &MI, unsigned OpNo,
return 0;
}
+unsigned RISCVMCCodeEmitter::getVMaskReg(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ MCOperand MO = MI.getOperand(OpNo);
+ assert(MO.isReg() && "Expected a register.");
+
+ switch (MO.getReg()) {
+ default:
+ llvm_unreachable("Invalid mask register.");
+ case RISCV::V0:
+ return 0;
+ case RISCV::NoRegister:
+ return 1;
+ }
+}
+
#include "RISCVGenMCCodeEmitter.inc"
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
index 7aa9b5e7d683..2a6f372e50be 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.cpp
@@ -47,7 +47,7 @@ void RISCVMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
OS << ')';
}
-const MCFixup *RISCVMCExpr::getPCRelHiFixup() const {
+const MCFixup *RISCVMCExpr::getPCRelHiFixup(const MCFragment **DFOut) const {
MCValue AUIPCLoc;
if (!getSubExpr()->evaluateAsRelocatable(AUIPCLoc, nullptr, nullptr))
return nullptr;
@@ -81,6 +81,8 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup() const {
case RISCV::fixup_riscv_tls_got_hi20:
case RISCV::fixup_riscv_tls_gd_hi20:
case RISCV::fixup_riscv_pcrel_hi20:
+ if (DFOut)
+ *DFOut = DF;
return &F;
}
}
@@ -88,74 +90,9 @@ const MCFixup *RISCVMCExpr::getPCRelHiFixup() const {
return nullptr;
}
-bool RISCVMCExpr::evaluatePCRelLo(MCValue &Res, const MCAsmLayout *Layout,
- const MCFixup *Fixup) const {
- // VK_RISCV_PCREL_LO has to be handled specially. The MCExpr inside is
- // actually the location of a auipc instruction with a VK_RISCV_PCREL_HI fixup
- // pointing to the real target. We need to generate an MCValue in the form of
- // (<real target> + <offset from this fixup to the auipc fixup>). The Fixup
- // is pcrel relative to the VK_RISCV_PCREL_LO fixup, so we need to add the
- // offset to the VK_RISCV_PCREL_HI Fixup from VK_RISCV_PCREL_LO to correct.
-
- // Don't try to evaluate if the fixup will be forced as a relocation (e.g.
- // as linker relaxation is enabled). If we evaluated pcrel_lo in this case,
- // the modified fixup will be converted into a relocation that no longer
- // points to the pcrel_hi as the linker requires.
- auto &RAB =
- static_cast<RISCVAsmBackend &>(Layout->getAssembler().getBackend());
- if (RAB.willForceRelocations())
- return false;
-
- MCValue AUIPCLoc;
- if (!getSubExpr()->evaluateAsValue(AUIPCLoc, *Layout))
- return false;
-
- const MCSymbolRefExpr *AUIPCSRE = AUIPCLoc.getSymA();
- // Don't try to evaluate %pcrel_hi/%pcrel_lo pairs that cross fragment
- // boundries.
- if (!AUIPCSRE ||
- findAssociatedFragment() != AUIPCSRE->findAssociatedFragment())
- return false;
-
- const MCSymbol *AUIPCSymbol = &AUIPCSRE->getSymbol();
- if (!AUIPCSymbol)
- return false;
-
- const MCFixup *TargetFixup = getPCRelHiFixup();
- if (!TargetFixup)
- return false;
-
- if ((unsigned)TargetFixup->getKind() != RISCV::fixup_riscv_pcrel_hi20)
- return false;
-
- MCValue Target;
- if (!TargetFixup->getValue()->evaluateAsValue(Target, *Layout))
- return false;
-
- if (!Target.getSymA() || !Target.getSymA()->getSymbol().isInSection())
- return false;
-
- if (&Target.getSymA()->getSymbol().getSection() !=
- findAssociatedFragment()->getParent())
- return false;
-
- // We must use TargetFixup rather than AUIPCSymbol here. They will almost
- // always have the same offset, except for the case when AUIPCSymbol is at
- // the end of a fragment and the fixup comes from offset 0 in the next
- // fragment.
- uint64_t AUIPCOffset = TargetFixup->getOffset();
-
- Res = MCValue::get(Target.getSymA(), nullptr,
- Target.getConstant() + (Fixup->getOffset() - AUIPCOffset));
- return true;
-}
-
bool RISCVMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
const MCAsmLayout *Layout,
const MCFixup *Fixup) const {
- if (Kind == VK_RISCV_PCREL_LO && evaluatePCRelLo(Res, Layout, Fixup))
- return true;
-
if (!getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup))
return false;
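A minimal usage sketch of the new out-parameter, assuming Expr is a RISCVMCExpr of kind VK_RISCV_PCREL_LO (caller names are illustrative):

    // Recover the paired hi fixup and, optionally, the fragment containing it.
    const MCFragment *DF = nullptr;
    if (const MCFixup *HiFixup = Expr->getPCRelHiFixup(&DF)) {
      // HiFixup is the auipc's *_hi20 fixup; DF is the fragment holding it.
      // Passing nullptr instead of &DF preserves the previous behaviour.
    }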
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
index 921df376f3df..77038cee4e9d 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCExpr.h
@@ -19,7 +19,7 @@
namespace llvm {
class StringRef;
-class MCOperand;
+
class RISCVMCExpr : public MCTargetExpr {
public:
enum VariantKind {
@@ -46,9 +46,6 @@ private:
int64_t evaluateAsInt64(int64_t Value) const;
- bool evaluatePCRelLo(MCValue &Res, const MCAsmLayout *Layout,
- const MCFixup *Fixup) const;
-
explicit RISCVMCExpr(const MCExpr *Expr, VariantKind Kind)
: Expr(Expr), Kind(Kind) {}
@@ -61,11 +58,11 @@ public:
const MCExpr *getSubExpr() const { return Expr; }
/// Get the corresponding PC-relative HI fixup that a VK_RISCV_PCREL_LO
- /// points to.
+ /// points to, and optionally the fragment containing it.
///
/// \returns nullptr if this isn't a VK_RISCV_PCREL_LO pointing to a
/// known PC-relative HI fixup.
- const MCFixup *getPCRelHiFixup() const;
+ const MCFixup *getPCRelHiFixup(const MCFragment **DFOut) const;
void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
index c37482be3c2c..a474224e1a4e 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.cpp
@@ -20,6 +20,7 @@
#include "llvm/ADT/STLExtras.h"
#include "llvm/CodeGen/Register.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCInstrAnalysis.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/MC/MCStreamer.h"
@@ -56,7 +57,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI = new RISCVMCAsmInfo(TT);
Register SP = MRI.getDwarfRegNum(RISCV::X2, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, SP, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, SP, 0);
MAI->addInitialFrameState(Inst);
return MAI;
@@ -64,7 +65,7 @@ static MCAsmInfo *createRISCVMCAsmInfo(const MCRegisterInfo &MRI,
static MCSubtargetInfo *createRISCVMCSubtargetInfo(const Triple &TT,
StringRef CPU, StringRef FS) {
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = TT.isArch64Bit() ? "generic-rv64" : "generic-rv32";
return createRISCVMCSubtargetInfoImpl(TT, CPUName, FS);
@@ -93,6 +94,49 @@ static MCTargetStreamer *createRISCVAsmTargetStreamer(MCStreamer &S,
return new RISCVTargetAsmStreamer(S, OS);
}
+static MCTargetStreamer *createRISCVNullTargetStreamer(MCStreamer &S) {
+ return new RISCVTargetStreamer(S);
+}
+
+namespace {
+
+class RISCVMCInstrAnalysis : public MCInstrAnalysis {
+public:
+ explicit RISCVMCInstrAnalysis(const MCInstrInfo *Info)
+ : MCInstrAnalysis(Info) {}
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override {
+ if (isConditionalBranch(Inst)) {
+ int64_t Imm;
+ if (Size == 2)
+ Imm = Inst.getOperand(1).getImm();
+ else
+ Imm = Inst.getOperand(2).getImm();
+ Target = Addr + Imm;
+ return true;
+ }
+
+ if (Inst.getOpcode() == RISCV::C_JAL || Inst.getOpcode() == RISCV::C_J) {
+ Target = Addr + Inst.getOperand(0).getImm();
+ return true;
+ }
+
+ if (Inst.getOpcode() == RISCV::JAL) {
+ Target = Addr + Inst.getOperand(1).getImm();
+ return true;
+ }
+
+ return false;
+ }
+};
+
+} // end anonymous namespace
+
+static MCInstrAnalysis *createRISCVInstrAnalysis(const MCInstrInfo *Info) {
+ return new RISCVMCInstrAnalysis(Info);
+}
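A minimal sketch of how a client such as a disassembler might consult the new analysis (MIA, Inst, Addr and Size are assumed to come from the caller):

    uint64_t Target;
    if (MIA->evaluateBranch(Inst, Addr, Size, Target)) {
      // e.g. a 4-byte conditional branch at 0x1000 with immediate 0x20
      // yields Target == 0x1020; resolve it to a symbol here if desired.
    }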
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() {
for (Target *T : {&getTheRISCV32Target(), &getTheRISCV64Target()}) {
TargetRegistry::RegisterMCAsmInfo(*T, createRISCVMCAsmInfo);
@@ -104,8 +148,12 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVTargetMC() {
TargetRegistry::RegisterMCSubtargetInfo(*T, createRISCVMCSubtargetInfo);
TargetRegistry::RegisterObjectTargetStreamer(
*T, createRISCVObjectTargetStreamer);
+ TargetRegistry::RegisterMCInstrAnalysis(*T, createRISCVInstrAnalysis);
// Register the asm target streamer.
TargetRegistry::RegisterAsmTargetStreamer(*T, createRISCVAsmTargetStreamer);
+ // Register the null target streamer.
+ TargetRegistry::RegisterNullTargetStreamer(*T,
+ createRISCVNullTargetStreamer);
}
}
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
index b30997533ddf..5216a689715a 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVMCTargetDesc.h
@@ -26,11 +26,7 @@ class MCInstrInfo;
class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
-class StringRef;
class Target;
-class Triple;
-class raw_ostream;
-class raw_pwrite_stream;
MCCodeEmitter *createRISCVMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
index 913e1f744192..54a2fb288579 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.cpp
@@ -11,12 +11,59 @@
//===----------------------------------------------------------------------===//
#include "RISCVTargetStreamer.h"
+#include "RISCVSubtarget.h"
#include "llvm/Support/FormattedStream.h"
+#include "llvm/Support/RISCVAttributes.h"
using namespace llvm;
RISCVTargetStreamer::RISCVTargetStreamer(MCStreamer &S) : MCTargetStreamer(S) {}
+void RISCVTargetStreamer::finish() { finishAttributeSection(); }
+
+void RISCVTargetStreamer::emitDirectiveOptionPush() {}
+void RISCVTargetStreamer::emitDirectiveOptionPop() {}
+void RISCVTargetStreamer::emitDirectiveOptionPIC() {}
+void RISCVTargetStreamer::emitDirectiveOptionNoPIC() {}
+void RISCVTargetStreamer::emitDirectiveOptionRVC() {}
+void RISCVTargetStreamer::emitDirectiveOptionNoRVC() {}
+void RISCVTargetStreamer::emitDirectiveOptionRelax() {}
+void RISCVTargetStreamer::emitDirectiveOptionNoRelax() {}
+void RISCVTargetStreamer::emitAttribute(unsigned Attribute, unsigned Value) {}
+void RISCVTargetStreamer::finishAttributeSection() {}
+void RISCVTargetStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef String) {}
+void RISCVTargetStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {}
+
+void RISCVTargetStreamer::emitTargetAttributes(const MCSubtargetInfo &STI) {
+ if (STI.hasFeature(RISCV::FeatureRV32E))
+ emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_4);
+ else
+ emitAttribute(RISCVAttrs::STACK_ALIGN, RISCVAttrs::ALIGN_16);
+
+ std::string Arch = "rv32";
+ if (STI.hasFeature(RISCV::Feature64Bit))
+ Arch = "rv64";
+ if (STI.hasFeature(RISCV::FeatureRV32E))
+ Arch += "e1p9";
+ else
+ Arch += "i2p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtM))
+ Arch += "_m2p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtA))
+ Arch += "_a2p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtF))
+ Arch += "_f2p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtD))
+ Arch += "_d2p0";
+ if (STI.hasFeature(RISCV::FeatureStdExtC))
+ Arch += "_c2p0";
+
+ emitTextAttribute(RISCVAttrs::ARCH, Arch);
+}
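A worked example of the string built above: an RV64 subtarget with the M, A, F, D and C standard extensions enabled yields

    // Arch == "rv64i2p0_m2p0_a2p0_f2p0_d2p0_c2p0"
    // (an RV32E subtarget would instead start from "rv32e1p9")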
+
// This part is for ascii assembly output
RISCVTargetAsmStreamer::RISCVTargetAsmStreamer(MCStreamer &S,
formatted_raw_ostream &OS)
@@ -30,6 +77,14 @@ void RISCVTargetAsmStreamer::emitDirectiveOptionPop() {
OS << "\t.option\tpop\n";
}
+void RISCVTargetAsmStreamer::emitDirectiveOptionPIC() {
+ OS << "\t.option\tpic\n";
+}
+
+void RISCVTargetAsmStreamer::emitDirectiveOptionNoPIC() {
+ OS << "\t.option\tnopic\n";
+}
+
void RISCVTargetAsmStreamer::emitDirectiveOptionRVC() {
OS << "\t.option\trvc\n";
}
@@ -45,3 +100,18 @@ void RISCVTargetAsmStreamer::emitDirectiveOptionRelax() {
void RISCVTargetAsmStreamer::emitDirectiveOptionNoRelax() {
OS << "\t.option\tnorelax\n";
}
+
+void RISCVTargetAsmStreamer::emitAttribute(unsigned Attribute, unsigned Value) {
+ OS << "\t.attribute\t" << Attribute << ", " << Twine(Value) << "\n";
+}
+
+void RISCVTargetAsmStreamer::emitTextAttribute(unsigned Attribute,
+ StringRef String) {
+ OS << "\t.attribute\t" << Attribute << ", \"" << String << "\"\n";
+}
+
+void RISCVTargetAsmStreamer::emitIntTextAttribute(unsigned Attribute,
+ unsigned IntValue,
+ StringRef StringValue) {}
+
+void RISCVTargetAsmStreamer::finishAttributeSection() {}
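Given the printing logic above, a sketch of the directives these hooks emit (the tag numbers 4 and 5 are illustrative stand-ins for STACK_ALIGN and ARCH):

    // emitAttribute(4, 16)                -> "\t.attribute\t4, 16\n"
    // emitTextAttribute(5, "rv64i2p0...") -> "\t.attribute\t5, \"rv64i2p0...\"\n"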
diff --git a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
index 1becc134b2a2..32fa20f25d82 100644
--- a/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
+++ b/llvm/lib/Target/RISCV/MCTargetDesc/RISCVTargetStreamer.h
@@ -10,30 +10,49 @@
#define LLVM_LIB_TARGET_RISCV_RISCVTARGETSTREAMER_H
#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
namespace llvm {
class RISCVTargetStreamer : public MCTargetStreamer {
public:
RISCVTargetStreamer(MCStreamer &S);
-
- virtual void emitDirectiveOptionPush() = 0;
- virtual void emitDirectiveOptionPop() = 0;
- virtual void emitDirectiveOptionRVC() = 0;
- virtual void emitDirectiveOptionNoRVC() = 0;
- virtual void emitDirectiveOptionRelax() = 0;
- virtual void emitDirectiveOptionNoRelax() = 0;
+ void finish() override;
+
+ virtual void emitDirectiveOptionPush();
+ virtual void emitDirectiveOptionPop();
+ virtual void emitDirectiveOptionPIC();
+ virtual void emitDirectiveOptionNoPIC();
+ virtual void emitDirectiveOptionRVC();
+ virtual void emitDirectiveOptionNoRVC();
+ virtual void emitDirectiveOptionRelax();
+ virtual void emitDirectiveOptionNoRelax();
+ virtual void emitAttribute(unsigned Attribute, unsigned Value);
+ virtual void finishAttributeSection();
+ virtual void emitTextAttribute(unsigned Attribute, StringRef String);
+ virtual void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue);
+
+ void emitTargetAttributes(const MCSubtargetInfo &STI);
};
// This part is for ascii assembly output
class RISCVTargetAsmStreamer : public RISCVTargetStreamer {
formatted_raw_ostream &OS;
+ void finishAttributeSection() override;
+ void emitAttribute(unsigned Attribute, unsigned Value) override;
+ void emitTextAttribute(unsigned Attribute, StringRef String) override;
+ void emitIntTextAttribute(unsigned Attribute, unsigned IntValue,
+ StringRef StringValue) override;
+
public:
RISCVTargetAsmStreamer(MCStreamer &S, formatted_raw_ostream &OS);
void emitDirectiveOptionPush() override;
void emitDirectiveOptionPop() override;
+ void emitDirectiveOptionPIC() override;
+ void emitDirectiveOptionNoPIC() override;
void emitDirectiveOptionRVC() override;
void emitDirectiveOptionNoRVC() override;
void emitDirectiveOptionRelax() override;
diff --git a/llvm/lib/Target/RISCV/RISCV.h b/llvm/lib/Target/RISCV/RISCV.h
index f23f742a4782..9baa2cc2741a 100644
--- a/llvm/lib/Target/RISCV/RISCV.h
+++ b/llvm/lib/Target/RISCV/RISCV.h
@@ -43,6 +43,9 @@ void initializeRISCVMergeBaseOffsetOptPass(PassRegistry &);
FunctionPass *createRISCVExpandPseudoPass();
void initializeRISCVExpandPseudoPass(PassRegistry &);
+FunctionPass *createRISCVExpandAtomicPseudoPass();
+void initializeRISCVExpandAtomicPseudoPass(PassRegistry &);
+
InstructionSelector *createRISCVInstructionSelector(const RISCVTargetMachine &,
RISCVSubtarget &,
RISCVRegisterBankInfo &);
diff --git a/llvm/lib/Target/RISCV/RISCV.td b/llvm/lib/Target/RISCV/RISCV.td
index 82afa13aece3..f0583f691936 100644
--- a/llvm/lib/Target/RISCV/RISCV.td
+++ b/llvm/lib/Target/RISCV/RISCV.td
@@ -16,21 +16,21 @@ def FeatureStdExtM
: SubtargetFeature<"m", "HasStdExtM", "true",
"'M' (Integer Multiplication and Division)">;
def HasStdExtM : Predicate<"Subtarget->hasStdExtM()">,
- AssemblerPredicate<"FeatureStdExtM",
+ AssemblerPredicate<(all_of FeatureStdExtM),
"'M' (Integer Multiplication and Division)">;
def FeatureStdExtA
: SubtargetFeature<"a", "HasStdExtA", "true",
"'A' (Atomic Instructions)">;
def HasStdExtA : Predicate<"Subtarget->hasStdExtA()">,
- AssemblerPredicate<"FeatureStdExtA",
+ AssemblerPredicate<(all_of FeatureStdExtA),
"'A' (Atomic Instructions)">;
def FeatureStdExtF
: SubtargetFeature<"f", "HasStdExtF", "true",
"'F' (Single-Precision Floating-Point)">;
def HasStdExtF : Predicate<"Subtarget->hasStdExtF()">,
- AssemblerPredicate<"FeatureStdExtF",
+ AssemblerPredicate<(all_of FeatureStdExtF),
"'F' (Single-Precision Floating-Point)">;
def FeatureStdExtD
@@ -38,30 +38,130 @@ def FeatureStdExtD
"'D' (Double-Precision Floating-Point)",
[FeatureStdExtF]>;
def HasStdExtD : Predicate<"Subtarget->hasStdExtD()">,
- AssemblerPredicate<"FeatureStdExtD",
+ AssemblerPredicate<(all_of FeatureStdExtD),
"'D' (Double-Precision Floating-Point)">;
def FeatureStdExtC
: SubtargetFeature<"c", "HasStdExtC", "true",
"'C' (Compressed Instructions)">;
def HasStdExtC : Predicate<"Subtarget->hasStdExtC()">,
- AssemblerPredicate<"FeatureStdExtC",
+ AssemblerPredicate<(all_of FeatureStdExtC),
"'C' (Compressed Instructions)">;
-def FeatureRVCHints
- : SubtargetFeature<"rvc-hints", "EnableRVCHintInstrs", "true",
- "Enable RVC Hint Instructions.">;
+def FeatureExtZbb
+ : SubtargetFeature<"experimental-zbb", "HasStdExtZbb", "true",
+ "'Zbb' (Base 'B' Instructions)">;
+def HasStdExtZbb : Predicate<"Subtarget->hasStdExtZbb()">,
+ AssemblerPredicate<(all_of FeatureExtZbb),
+ "'Zbb' (Base 'B' Instructions)">;
+
+def FeatureExtZbc
+ : SubtargetFeature<"experimental-zbc", "HasStdExtZbc", "true",
+ "'Zbc' (Carry-Less 'B' Instructions)">;
+def HasStdExtZbc : Predicate<"Subtarget->hasStdExtZbc()">,
+ AssemblerPredicate<(all_of FeatureExtZbc),
+ "'Zbc' (Carry-Less 'B' Instructions)">;
+
+def FeatureExtZbe
+ : SubtargetFeature<"experimental-zbe", "HasStdExtZbe", "true",
+ "'Zbe' (Extract-Deposit 'B' Instructions)">;
+def HasStdExtZbe : Predicate<"Subtarget->hasStdExtZbe()">,
+ AssemblerPredicate<(all_of FeatureExtZbe),
+ "'Zbe' (Extract-Deposit 'B' Instructions)">;
+
+def FeatureExtZbf
+ : SubtargetFeature<"experimental-zbf", "HasStdExtZbf", "true",
+ "'Zbf' (Bit-Field 'B' Instructions)">;
+def HasStdExtZbf : Predicate<"Subtarget->hasStdExtZbf()">,
+ AssemblerPredicate<(all_of FeatureExtZbf),
+ "'Zbf' (Bit-Field 'B' Instructions)">;
+
+def FeatureExtZbm
+ : SubtargetFeature<"experimental-zbm", "HasStdExtZbm", "true",
+ "'Zbm' (Matrix 'B' Instructions)">;
+def HasStdExtZbm : Predicate<"Subtarget->hasStdExtZbm()">,
+ AssemblerPredicate<(all_of FeatureExtZbm),
+ "'Zbm' (Matrix 'B' Instructions)">;
+
+def FeatureExtZbp
+ : SubtargetFeature<"experimental-zbp", "HasStdExtZbp", "true",
+ "'Zbp' (Permutation 'B' Instructions)">;
+def HasStdExtZbp : Predicate<"Subtarget->hasStdExtZbp()">,
+ AssemblerPredicate<(all_of FeatureExtZbp),
+ "'Zbp' (Permutation 'B' Instructions)">;
+
+def FeatureExtZbr
+ : SubtargetFeature<"experimental-zbr", "HasStdExtZbr", "true",
+ "'Zbr' (Polynomial Reduction 'B' Instructions)">;
+def HasStdExtZbr : Predicate<"Subtarget->hasStdExtZbr()">,
+ AssemblerPredicate<(all_of FeatureExtZbr),
+ "'Zbr' (Polynomial Reduction 'B' Instructions)">;
+
+def FeatureExtZbs
+ : SubtargetFeature<"experimental-zbs", "HasStdExtZbs", "true",
+ "'Zbs' (Single-Bit 'B' Instructions)">;
+def HasStdExtZbs : Predicate<"Subtarget->hasStdExtZbs()">,
+ AssemblerPredicate<(all_of FeatureExtZbs),
+ "'Zbs' (Single-Bit 'B' Instructions)">;
+
+def FeatureExtZbt
+ : SubtargetFeature<"experimental-zbt", "HasStdExtZbt", "true",
+ "'Zbt' (Ternary 'B' Instructions)">;
+def HasStdExtZbt : Predicate<"Subtarget->hasStdExtZbt()">,
+ AssemblerPredicate<(all_of FeatureExtZbt),
+ "'Zbt' (Ternary 'B' Instructions)">;
+
+// Some instructions belong to both the basic and the permutation
+// subextensions. They should be enabled if either has been specified.
+def HasStdExtZbbOrZbp
+ : Predicate<"Subtarget->hasStdExtZbb() || Subtarget->hasStdExtZbp()">,
+ AssemblerPredicate<(any_of FeatureExtZbb, FeatureExtZbp)>;
+
+def FeatureExtZbproposedc
+ : SubtargetFeature<"experimental-zbproposedc", "HasStdExtZbproposedc", "true",
+ "'Zbproposedc' (Proposed Compressed 'B' Instructions)">;
+def HasStdExtZbproposedc : Predicate<"Subtarget->hasStdExtZbproposedc()">,
+ AssemblerPredicate<(all_of FeatureExtZbproposedc),
+ "'Zbproposedc' (Proposed Compressed 'B' Instructions)">;
+
+def FeatureStdExtB
+ : SubtargetFeature<"experimental-b", "HasStdExtB", "true",
+ "'B' (Bit Manipulation Instructions)",
+ [FeatureExtZbb,
+ FeatureExtZbc,
+ FeatureExtZbe,
+ FeatureExtZbf,
+ FeatureExtZbm,
+ FeatureExtZbp,
+ FeatureExtZbr,
+ FeatureExtZbs,
+ FeatureExtZbt]>;
+def HasStdExtB : Predicate<"Subtarget->hasStdExtB()">,
+ AssemblerPredicate<(all_of FeatureStdExtB),
+ "'B' (Bit Manipulation Instructions)">;
+
+def FeatureNoRVCHints
+ : SubtargetFeature<"no-rvc-hints", "EnableRVCHintInstrs", "false",
+ "Disable RVC Hint Instructions.">;
def HasRVCHints : Predicate<"Subtarget->enableRVCHintInstrs()">,
- AssemblerPredicate<"FeatureRVCHints",
- "RVC Hint Instructions">;
+ AssemblerPredicate<(all_of(not FeatureNoRVCHints)),
+ "RVC Hint Instructions">;
+
+def FeatureStdExtV
+ : SubtargetFeature<"experimental-v", "HasStdExtV", "true",
+ "'V' (Vector Instructions)",
+ [FeatureStdExtF]>;
+def HasStdExtV : Predicate<"Subtarget->hasStdExtV()">,
+ AssemblerPredicate<(all_of FeatureStdExtV),
+ "'V' (Vector Instructions)">;
def Feature64Bit
: SubtargetFeature<"64bit", "HasRV64", "true", "Implements RV64">;
def IsRV64 : Predicate<"Subtarget->is64Bit()">,
- AssemblerPredicate<"Feature64Bit",
+ AssemblerPredicate<(all_of Feature64Bit),
"RV64I Base Instruction Set">;
def IsRV32 : Predicate<"!Subtarget->is64Bit()">,
- AssemblerPredicate<"!Feature64Bit",
+ AssemblerPredicate<(all_of (not Feature64Bit)),
"RV32I Base Instruction Set">;
def RV64 : HwMode<"+64bit">;
@@ -71,7 +171,7 @@ def FeatureRV32E
: SubtargetFeature<"e", "IsRV32E", "true",
"Implements RV32E (provides 16 rather than 32 GPRs)">;
def IsRV32E : Predicate<"Subtarget->isRV32E()">,
- AssemblerPredicate<"FeatureRV32E">;
+ AssemblerPredicate<(all_of FeatureRV32E)>;
def FeatureRelax
: SubtargetFeature<"relax", "EnableLinkerRelax", "true",
@@ -82,6 +182,9 @@ foreach i = {1-31} in
SubtargetFeature<"reserve-x"#i, "UserReservedRegister[RISCV::X"#i#"]",
"true", "Reserve X"#i>;
+def FeatureSaveRestore : SubtargetFeature<"save-restore", "EnableSaveRestore",
+ "true", "Enable save/restore.">;
+
//===----------------------------------------------------------------------===//
// Named operands for CSR instructions.
//===----------------------------------------------------------------------===//
@@ -92,19 +195,26 @@ include "RISCVSystemOperands.td"
// Registers, calling conventions, instruction descriptions.
//===----------------------------------------------------------------------===//
+include "RISCVSchedule.td"
include "RISCVRegisterInfo.td"
include "RISCVCallingConv.td"
include "RISCVInstrInfo.td"
include "RISCVRegisterBanks.td"
+include "RISCVSchedRocket32.td"
+include "RISCVSchedRocket64.td"
//===----------------------------------------------------------------------===//
// RISC-V processors supported.
//===----------------------------------------------------------------------===//
-def : ProcessorModel<"generic-rv32", NoSchedModel, [FeatureRVCHints]>;
+def : ProcessorModel<"generic-rv32", NoSchedModel, []>;
+
+def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit]>;
+
+def : ProcessorModel<"rocket-rv32", Rocket32Model, []>;
+
+def : ProcessorModel<"rocket-rv64", Rocket64Model, [Feature64Bit]>;
-def : ProcessorModel<"generic-rv64", NoSchedModel, [Feature64Bit,
- FeatureRVCHints]>;
//===----------------------------------------------------------------------===//
// Define the RISC-V target.
diff --git a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
index f4aa28bcc0c1..8955994b1c2e 100644
--- a/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
+++ b/llvm/lib/Target/RISCV/RISCVAsmPrinter.cpp
@@ -11,9 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "RISCV.h"
#include "MCTargetDesc/RISCVInstPrinter.h"
#include "MCTargetDesc/RISCVMCExpr.h"
+#include "MCTargetDesc/RISCVTargetStreamer.h"
+#include "RISCV.h"
#include "RISCVTargetMachine.h"
#include "TargetInfo/RISCVTargetInfo.h"
#include "llvm/ADT/Statistic.h"
@@ -37,14 +38,18 @@ STATISTIC(RISCVNumInstrsCompressed,
namespace {
class RISCVAsmPrinter : public AsmPrinter {
+ const MCSubtargetInfo *STI;
+
public:
explicit RISCVAsmPrinter(TargetMachine &TM,
std::unique_ptr<MCStreamer> Streamer)
- : AsmPrinter(TM, std::move(Streamer)) {}
+ : AsmPrinter(TM, std::move(Streamer)), STI(TM.getMCSubtargetInfo()) {}
StringRef getPassName() const override { return "RISCV Assembly Printer"; }
- void EmitInstruction(const MachineInstr *MI) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ void emitInstruction(const MachineInstr *MI) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &OS) override;
@@ -59,6 +64,12 @@ public:
bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const {
return LowerRISCVMachineOperandToMCOperand(MO, MCOp, *this);
}
+
+ void emitStartOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
+
+private:
+ void emitAttributes();
};
}
@@ -66,8 +77,7 @@ public:
#include "RISCVGenCompressInstEmitter.inc"
void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
MCInst CInst;
- bool Res = compressInst(CInst, Inst, *TM.getMCSubtargetInfo(),
- OutStreamer->getContext());
+ bool Res = compressInst(CInst, Inst, *STI, OutStreamer->getContext());
if (Res)
++RISCVNumInstrsCompressed;
AsmPrinter::EmitToStreamer(*OutStreamer, Res ? CInst : Inst);
@@ -77,7 +87,7 @@ void RISCVAsmPrinter::EmitToStreamer(MCStreamer &S, const MCInst &Inst) {
// instructions) auto-generated.
#include "RISCVGenMCPseudoLowering.inc"
-void RISCVAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void RISCVAsmPrinter::emitInstruction(const MachineInstr *MI) {
// Do any auto-generated pseudo lowerings.
if (emitPseudoExpansionLowering(*OutStreamer, MI))
return;
@@ -154,6 +164,45 @@ bool RISCVAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return AsmPrinter::PrintAsmMemoryOperand(MI, OpNo, ExtraCode, OS);
}
+bool RISCVAsmPrinter::runOnMachineFunction(MachineFunction &MF) {
+ // Set the current MCSubtargetInfo to a copy which has the correct
+ // feature bits for the current MachineFunction
+ MCSubtargetInfo &NewSTI =
+ OutStreamer->getContext().getSubtargetCopy(*TM.getMCSubtargetInfo());
+ NewSTI.setFeatureBits(MF.getSubtarget().getFeatureBits());
+ STI = &NewSTI;
+
+ SetupMachineFunction(MF);
+ emitFunctionBody();
+ return false;
+}
+
+void RISCVAsmPrinter::emitStartOfAsmFile(Module &M) {
+ if (TM.getTargetTriple().isOSBinFormatELF())
+ emitAttributes();
+}
+
+void RISCVAsmPrinter::emitEndOfAsmFile(Module &M) {
+ RISCVTargetStreamer &RTS =
+ static_cast<RISCVTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+ if (TM.getTargetTriple().isOSBinFormatELF())
+ RTS.finishAttributeSection();
+}
+
+void RISCVAsmPrinter::emitAttributes() {
+ RISCVTargetStreamer &RTS =
+ static_cast<RISCVTargetStreamer &>(*OutStreamer->getTargetStreamer());
+
+ const Triple &TT = TM.getTargetTriple();
+ StringRef CPU = TM.getTargetCPU();
+ StringRef FS = TM.getTargetFeatureString();
+ const RISCVTargetMachine &RTM = static_cast<const RISCVTargetMachine &>(TM);
+ const RISCVSubtarget STI(TT, CPU, FS, /*ABIName=*/"", RTM);
+
+ RTS.emitTargetAttributes(STI);
+}
+
// Force static initialization.
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeRISCVAsmPrinter() {
RegisterAsmPrinter<RISCVAsmPrinter> X(getTheRISCV32Target());
diff --git a/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
new file mode 100644
index 000000000000..26ce16486bd9
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVExpandAtomicPseudoInsts.cpp
@@ -0,0 +1,618 @@
+//===-- RISCVExpandAtomicPseudoInsts.cpp - Expand atomic pseudo instrs. ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains a pass that expands atomic pseudo instructions into
+// target instructions. This pass should be run at the last possible moment,
+// so that other passes cannot break the requirements for forward progress
+// in the LR/SC block.
+//
+//===----------------------------------------------------------------------===//
+
+#include "RISCV.h"
+#include "RISCVInstrInfo.h"
+#include "RISCVTargetMachine.h"
+
+#include "llvm/CodeGen/LivePhysRegs.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+
+using namespace llvm;
+
+#define RISCV_EXPAND_ATOMIC_PSEUDO_NAME \
+ "RISCV atomic pseudo instruction expansion pass"
+
+namespace {
+
+class RISCVExpandAtomicPseudo : public MachineFunctionPass {
+public:
+ const RISCVInstrInfo *TII;
+ static char ID;
+
+ RISCVExpandAtomicPseudo() : MachineFunctionPass(ID) {
+ initializeRISCVExpandAtomicPseudoPass(*PassRegistry::getPassRegistry());
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return RISCV_EXPAND_ATOMIC_PSEUDO_NAME;
+ }
+
+private:
+ bool expandMBB(MachineBasicBlock &MBB);
+ bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandAtomicBinOp(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp,
+ bool IsMasked, int Width,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandAtomicMinMaxOp(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ AtomicRMWInst::BinOp, bool IsMasked, int Width,
+ MachineBasicBlock::iterator &NextMBBI);
+ bool expandAtomicCmpXchg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, bool IsMasked,
+ int Width, MachineBasicBlock::iterator &NextMBBI);
+};
+
+char RISCVExpandAtomicPseudo::ID = 0;
+
+bool RISCVExpandAtomicPseudo::runOnMachineFunction(MachineFunction &MF) {
+ TII = static_cast<const RISCVInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ bool Modified = false;
+ for (auto &MBB : MF)
+ Modified |= expandMBB(MBB);
+ return Modified;
+}
+
+bool RISCVExpandAtomicPseudo::expandMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ while (MBBI != E) {
+ MachineBasicBlock::iterator NMBBI = std::next(MBBI);
+ Modified |= expandMI(MBB, MBBI, NMBBI);
+ MBBI = NMBBI;
+ }
+
+ return Modified;
+}
+
+bool RISCVExpandAtomicPseudo::expandMI(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ MachineBasicBlock::iterator &NextMBBI) {
+ // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded
+ // instructions for each pseudo, and must be updated when adding new pseudos
+ // or changing existing ones.
+ switch (MBBI->getOpcode()) {
+ case RISCV::PseudoAtomicLoadNand32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32,
+ NextMBBI);
+ case RISCV::PseudoAtomicLoadNand64:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64,
+ NextMBBI);
+ case RISCV::PseudoMaskedAtomicSwap32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32,
+ NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadAdd32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, true, 32, NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadSub32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, true, 32, NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadNand32:
+ return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, true, 32,
+ NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadMax32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, true, 32,
+ NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadMin32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, true, 32,
+ NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadUMax32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, true, 32,
+ NextMBBI);
+ case RISCV::PseudoMaskedAtomicLoadUMin32:
+ return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, true, 32,
+ NextMBBI);
+ case RISCV::PseudoCmpXchg32:
+ return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI);
+ case RISCV::PseudoCmpXchg64:
+ return expandAtomicCmpXchg(MBB, MBBI, false, 64, NextMBBI);
+ case RISCV::PseudoMaskedCmpXchg32:
+ return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI);
+ }
+
+ return false;
+}
+
+static unsigned getLRForRMW32(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default:
+ llvm_unreachable("Unexpected AtomicOrdering");
+ case AtomicOrdering::Monotonic:
+ return RISCV::LR_W;
+ case AtomicOrdering::Acquire:
+ return RISCV::LR_W_AQ;
+ case AtomicOrdering::Release:
+ return RISCV::LR_W;
+ case AtomicOrdering::AcquireRelease:
+ return RISCV::LR_W_AQ;
+ case AtomicOrdering::SequentiallyConsistent:
+ return RISCV::LR_W_AQ_RL;
+ }
+}
+
+static unsigned getSCForRMW32(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default:
+ llvm_unreachable("Unexpected AtomicOrdering");
+ case AtomicOrdering::Monotonic:
+ return RISCV::SC_W;
+ case AtomicOrdering::Acquire:
+ return RISCV::SC_W;
+ case AtomicOrdering::Release:
+ return RISCV::SC_W_RL;
+ case AtomicOrdering::AcquireRelease:
+ return RISCV::SC_W_RL;
+ case AtomicOrdering::SequentiallyConsistent:
+ return RISCV::SC_W_AQ_RL;
+ }
+}
+
+static unsigned getLRForRMW64(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default:
+ llvm_unreachable("Unexpected AtomicOrdering");
+ case AtomicOrdering::Monotonic:
+ return RISCV::LR_D;
+ case AtomicOrdering::Acquire:
+ return RISCV::LR_D_AQ;
+ case AtomicOrdering::Release:
+ return RISCV::LR_D;
+ case AtomicOrdering::AcquireRelease:
+ return RISCV::LR_D_AQ;
+ case AtomicOrdering::SequentiallyConsistent:
+ return RISCV::LR_D_AQ_RL;
+ }
+}
+
+static unsigned getSCForRMW64(AtomicOrdering Ordering) {
+ switch (Ordering) {
+ default:
+ llvm_unreachable("Unexpected AtomicOrdering");
+ case AtomicOrdering::Monotonic:
+ return RISCV::SC_D;
+ case AtomicOrdering::Acquire:
+ return RISCV::SC_D;
+ case AtomicOrdering::Release:
+ return RISCV::SC_D_RL;
+ case AtomicOrdering::AcquireRelease:
+ return RISCV::SC_D_RL;
+ case AtomicOrdering::SequentiallyConsistent:
+ return RISCV::SC_D_AQ_RL;
+ }
+}
+
+static unsigned getLRForRMW(AtomicOrdering Ordering, int Width) {
+ if (Width == 32)
+ return getLRForRMW32(Ordering);
+ if (Width == 64)
+ return getLRForRMW64(Ordering);
+ llvm_unreachable("Unexpected LR width\n");
+}
+
+static unsigned getSCForRMW(AtomicOrdering Ordering, int Width) {
+ if (Width == 32)
+ return getSCForRMW32(Ordering);
+ if (Width == 64)
+ return getSCForRMW64(Ordering);
+ llvm_unreachable("Unexpected SC width\n");
+}
+
+static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
+ DebugLoc DL, MachineBasicBlock *ThisMBB,
+ MachineBasicBlock *LoopMBB,
+ MachineBasicBlock *DoneMBB,
+ AtomicRMWInst::BinOp BinOp, int Width) {
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register IncrReg = MI.getOperand(3).getReg();
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
+
+ // .loop:
+ // lr.[w|d] dest, (addr)
+ // binop scratch, dest, val
+ // sc.[w|d] scratch, scratch, (addr)
+ // bnez scratch, loop
+ BuildMI(LoopMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
+ .addReg(AddrReg);
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Nand:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ BuildMI(LoopMBB, DL, TII->get(RISCV::XORI), ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(-1);
+ break;
+ }
+ BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
+ .addReg(AddrReg)
+ .addReg(ScratchReg);
+ BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
+ .addReg(ScratchReg)
+ .addReg(RISCV::X0)
+ .addMBB(LoopMBB);
+}
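For instance, with the ordering tables above, PseudoAtomicLoadNand32 with seq_cst ordering lowers to roughly (register names symbolic):

    // .loop:
    //   lr.w.aqrl dest, (addr)
    //   and       scratch, dest, incr
    //   xori      scratch, scratch, -1
    //   sc.w.aqrl scratch, scratch, (addr)
    //   bnez      scratch, .loop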
+
+static void insertMaskedMerge(const RISCVInstrInfo *TII, DebugLoc DL,
+ MachineBasicBlock *MBB, Register DestReg,
+ Register OldValReg, Register NewValReg,
+ Register MaskReg, Register ScratchReg) {
+ assert(OldValReg != ScratchReg && "OldValReg and ScratchReg must be unique");
+ assert(OldValReg != MaskReg && "OldValReg and MaskReg must be unique");
+ assert(ScratchReg != MaskReg && "ScratchReg and MaskReg must be unique");
+
+ // We select bits from newval and oldval using:
+ // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
+ // r = oldval ^ ((oldval ^ newval) & masktargetdata);
+ BuildMI(MBB, DL, TII->get(RISCV::XOR), ScratchReg)
+ .addReg(OldValReg)
+ .addReg(NewValReg);
+ BuildMI(MBB, DL, TII->get(RISCV::AND), ScratchReg)
+ .addReg(ScratchReg)
+ .addReg(MaskReg);
+ BuildMI(MBB, DL, TII->get(RISCV::XOR), DestReg)
+ .addReg(OldValReg)
+ .addReg(ScratchReg);
+}
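A scalar C++ equivalent of the three instructions built above (an illustrative sketch, not part of the patch):

    // Bits where Mask is 1 are taken from NewVal; all other bits keep OldVal.
    static inline unsigned maskedMerge(unsigned OldVal, unsigned NewVal,
                                       unsigned Mask) {
      return OldVal ^ ((OldVal ^ NewVal) & Mask);
    }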
+
+static void doMaskedAtomicBinOpExpansion(
+ const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL,
+ MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB,
+ MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) {
+ assert(Width == 32 && "Should never need to expand masked 64-bit operations");
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register IncrReg = MI.getOperand(3).getReg();
+ Register MaskReg = MI.getOperand(4).getReg();
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI.getOperand(5).getImm());
+
+ // .loop:
+ // lr.w destreg, (alignedaddr)
+ // binop scratch, destreg, incr
+ // xor scratch, destreg, scratch
+ // and scratch, scratch, masktargetdata
+ // xor scratch, destreg, scratch
+ // sc.w scratch, scratch, (alignedaddr)
+ // bnez scratch, loop
+ BuildMI(LoopMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+ .addReg(AddrReg);
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Xchg:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::ADDI), ScratchReg)
+ .addReg(IncrReg)
+ .addImm(0);
+ break;
+ case AtomicRMWInst::Add:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::ADD), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::Sub:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::SUB), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ break;
+ case AtomicRMWInst::Nand:
+ BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg)
+ .addReg(DestReg)
+ .addReg(IncrReg);
+ BuildMI(LoopMBB, DL, TII->get(RISCV::XORI), ScratchReg)
+ .addReg(ScratchReg)
+ .addImm(-1);
+ break;
+ }
+
+ insertMaskedMerge(TII, DL, LoopMBB, ScratchReg, DestReg, ScratchReg, MaskReg,
+ ScratchReg);
+
+ BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
+ .addReg(AddrReg)
+ .addReg(ScratchReg);
+ BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
+ .addReg(ScratchReg)
+ .addReg(RISCV::X0)
+ .addMBB(LoopMBB);
+}
+
+bool RISCVExpandAtomicPseudo::expandAtomicBinOp(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
+ MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+
+ MachineFunction *MF = MBB.getParent();
+ auto LoopMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Insert new MBBs.
+ MF->insert(++MBB.getIterator(), LoopMBB);
+ MF->insert(++LoopMBB->getIterator(), DoneMBB);
+
+ // Set up successors and transfer remaining instructions to DoneMBB.
+ LoopMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(DoneMBB);
+ DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
+ DoneMBB->transferSuccessors(&MBB);
+ MBB.addSuccessor(LoopMBB);
+
+ if (!IsMasked)
+ doAtomicBinOpExpansion(TII, MI, DL, &MBB, LoopMBB, DoneMBB, BinOp, Width);
+ else
+ doMaskedAtomicBinOpExpansion(TII, MI, DL, &MBB, LoopMBB, DoneMBB, BinOp,
+ Width);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *LoopMBB);
+ computeAndAddLiveIns(LiveRegs, *DoneMBB);
+
+ return true;
+}
+
+static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL,
+ MachineBasicBlock *MBB, Register ValReg,
+ Register ShamtReg) {
+ BuildMI(MBB, DL, TII->get(RISCV::SLL), ValReg)
+ .addReg(ValReg)
+ .addReg(ShamtReg);
+ BuildMI(MBB, DL, TII->get(RISCV::SRA), ValReg)
+ .addReg(ValReg)
+ .addReg(ShamtReg);
+}
+
+bool RISCVExpandAtomicPseudo::expandAtomicMinMaxOp(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
+ MachineBasicBlock::iterator &NextMBBI) {
+ assert(IsMasked == true &&
+ "Should only need to expand masked atomic max/min");
+ assert(Width == 32 && "Should never need to expand masked 64-bit operations");
+
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Insert new MBBs.
+ MF->insert(++MBB.getIterator(), LoopHeadMBB);
+ MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB);
+ MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB);
+ MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
+
+ // Set up successors and transfer remaining instructions to DoneMBB.
+ LoopHeadMBB->addSuccessor(LoopIfBodyMBB);
+ LoopHeadMBB->addSuccessor(LoopTailMBB);
+ LoopIfBodyMBB->addSuccessor(LoopTailMBB);
+ LoopTailMBB->addSuccessor(LoopHeadMBB);
+ LoopTailMBB->addSuccessor(DoneMBB);
+ DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
+ DoneMBB->transferSuccessors(&MBB);
+ MBB.addSuccessor(LoopHeadMBB);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register Scratch1Reg = MI.getOperand(1).getReg();
+ Register Scratch2Reg = MI.getOperand(2).getReg();
+ Register AddrReg = MI.getOperand(3).getReg();
+ Register IncrReg = MI.getOperand(4).getReg();
+ Register MaskReg = MI.getOperand(5).getReg();
+ bool IsSigned = BinOp == AtomicRMWInst::Min || BinOp == AtomicRMWInst::Max;
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI.getOperand(IsSigned ? 7 : 6).getImm());
+
+ //
+ // .loophead:
+ // lr.w destreg, (alignedaddr)
+ // and scratch2, destreg, mask
+ // mv scratch1, destreg
+ // [sext scratch2 if signed min/max]
+ // ifnochangeneeded scratch2, incr, .looptail
+ BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
+ .addReg(AddrReg);
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), Scratch2Reg)
+ .addReg(DestReg)
+ .addReg(MaskReg);
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::ADDI), Scratch1Reg)
+ .addReg(DestReg)
+ .addImm(0);
+
+ switch (BinOp) {
+ default:
+ llvm_unreachable("Unexpected AtomicRMW BinOp");
+ case AtomicRMWInst::Max: {
+ insertSext(TII, DL, LoopHeadMBB, Scratch2Reg, MI.getOperand(6).getReg());
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE))
+ .addReg(Scratch2Reg)
+ .addReg(IncrReg)
+ .addMBB(LoopTailMBB);
+ break;
+ }
+ case AtomicRMWInst::Min: {
+ insertSext(TII, DL, LoopHeadMBB, Scratch2Reg, MI.getOperand(6).getReg());
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE))
+ .addReg(IncrReg)
+ .addReg(Scratch2Reg)
+ .addMBB(LoopTailMBB);
+ break;
+ }
+ case AtomicRMWInst::UMax:
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU))
+ .addReg(Scratch2Reg)
+ .addReg(IncrReg)
+ .addMBB(LoopTailMBB);
+ break;
+ case AtomicRMWInst::UMin:
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU))
+ .addReg(IncrReg)
+ .addReg(Scratch2Reg)
+ .addMBB(LoopTailMBB);
+ break;
+ }
+
+ // .loopifbody:
+ // xor scratch1, destreg, incr
+ // and scratch1, scratch1, mask
+ // xor scratch1, destreg, scratch1
+ insertMaskedMerge(TII, DL, LoopIfBodyMBB, Scratch1Reg, DestReg, IncrReg,
+ MaskReg, Scratch1Reg);
+
+ // .looptail:
+ // sc.w scratch1, scratch1, (addr)
+ // bnez scratch1, loop
+ BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), Scratch1Reg)
+ .addReg(AddrReg)
+ .addReg(Scratch1Reg);
+ BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
+ .addReg(Scratch1Reg)
+ .addReg(RISCV::X0)
+ .addMBB(LoopHeadMBB);
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
+ computeAndAddLiveIns(LiveRegs, *LoopIfBodyMBB);
+ computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
+ computeAndAddLiveIns(LiveRegs, *DoneMBB);
+
+ return true;
+}
+
+bool RISCVExpandAtomicPseudo::expandAtomicCmpXchg(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked,
+ int Width, MachineBasicBlock::iterator &NextMBBI) {
+ MachineInstr &MI = *MBBI;
+ DebugLoc DL = MI.getDebugLoc();
+ MachineFunction *MF = MBB.getParent();
+ auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+ auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
+
+ // Insert new MBBs.
+ MF->insert(++MBB.getIterator(), LoopHeadMBB);
+ MF->insert(++LoopHeadMBB->getIterator(), LoopTailMBB);
+ MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
+
+ // Set up successors and transfer remaining instructions to DoneMBB.
+ LoopHeadMBB->addSuccessor(LoopTailMBB);
+ LoopHeadMBB->addSuccessor(DoneMBB);
+ LoopTailMBB->addSuccessor(DoneMBB);
+ LoopTailMBB->addSuccessor(LoopHeadMBB);
+ DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
+ DoneMBB->transferSuccessors(&MBB);
+ MBB.addSuccessor(LoopHeadMBB);
+
+ Register DestReg = MI.getOperand(0).getReg();
+ Register ScratchReg = MI.getOperand(1).getReg();
+ Register AddrReg = MI.getOperand(2).getReg();
+ Register CmpValReg = MI.getOperand(3).getReg();
+ Register NewValReg = MI.getOperand(4).getReg();
+ AtomicOrdering Ordering =
+ static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
+
+ if (!IsMasked) {
+ // .loophead:
+ // lr.[w|d] dest, (addr)
+ // bne dest, cmpval, done
+ BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
+ .addReg(AddrReg);
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
+ .addReg(DestReg)
+ .addReg(CmpValReg)
+ .addMBB(DoneMBB);
+ // .looptail:
+ // sc.[w|d] scratch, newval, (addr)
+ // bnez scratch, loophead
+ BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
+ .addReg(AddrReg)
+ .addReg(NewValReg);
+ BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
+ .addReg(ScratchReg)
+ .addReg(RISCV::X0)
+ .addMBB(LoopHeadMBB);
+ } else {
+ // .loophead:
+ // lr.w dest, (addr)
+ // and scratch, dest, mask
+ // bne scratch, cmpval, done
+ Register MaskReg = MI.getOperand(5).getReg();
+ BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
+ .addReg(AddrReg);
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg)
+ .addReg(DestReg)
+ .addReg(MaskReg);
+ BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
+ .addReg(ScratchReg)
+ .addReg(CmpValReg)
+ .addMBB(DoneMBB);
+
+ // .looptail:
+ // xor scratch, dest, newval
+ // and scratch, scratch, mask
+ // xor scratch, dest, scratch
+ // sc.w scratch, scratch, (addr)
+ // bnez scratch, loophead
+ insertMaskedMerge(TII, DL, LoopTailMBB, ScratchReg, DestReg, NewValReg,
+ MaskReg, ScratchReg);
+ BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
+ .addReg(AddrReg)
+ .addReg(ScratchReg);
+ BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
+ .addReg(ScratchReg)
+ .addReg(RISCV::X0)
+ .addMBB(LoopHeadMBB);
+ }
+
+ NextMBBI = MBB.end();
+ MI.eraseFromParent();
+
+ LivePhysRegs LiveRegs;
+ computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
+ computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
+ computeAndAddLiveIns(LiveRegs, *DoneMBB);
+
+ return true;
+}
+
+} // end of anonymous namespace
+
+INITIALIZE_PASS(RISCVExpandAtomicPseudo, "riscv-expand-atomic-pseudo",
+ RISCV_EXPAND_ATOMIC_PSEUDO_NAME, false, false)
+
+namespace llvm {
+
+FunctionPass *createRISCVExpandAtomicPseudoPass() {
+ return new RISCVExpandAtomicPseudo();
+}
+
+} // end of namespace llvm
diff --git a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
index 84bce0f48562..504355fb8bf8 100644
--- a/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
+++ b/llvm/lib/Target/RISCV/RISCVExpandPseudoInsts.cpp
@@ -43,17 +43,6 @@ private:
bool expandMBB(MachineBasicBlock &MBB);
bool expandMI(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI);
- bool expandAtomicBinOp(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, AtomicRMWInst::BinOp,
- bool IsMasked, int Width,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandAtomicMinMaxOp(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- AtomicRMWInst::BinOp, bool IsMasked, int Width,
- MachineBasicBlock::iterator &NextMBBI);
- bool expandAtomicCmpXchg(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, bool IsMasked,
- int Width, MachineBasicBlock::iterator &NextMBBI);
bool expandAuipcInstPair(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI,
@@ -98,41 +87,10 @@ bool RISCVExpandPseudo::expandMBB(MachineBasicBlock &MBB) {
bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI) {
+ // RISCVInstrInfo::getInstSizeInBytes hard-codes the number of expanded
+ // instructions for each pseudo, and must be updated when adding new pseudos
+ // or changing existing ones.
switch (MBBI->getOpcode()) {
- case RISCV::PseudoAtomicLoadNand32:
- return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 32,
- NextMBBI);
- case RISCV::PseudoAtomicLoadNand64:
- return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, false, 64,
- NextMBBI);
- case RISCV::PseudoMaskedAtomicSwap32:
- return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Xchg, true, 32,
- NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadAdd32:
- return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Add, true, 32, NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadSub32:
- return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Sub, true, 32, NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadNand32:
- return expandAtomicBinOp(MBB, MBBI, AtomicRMWInst::Nand, true, 32,
- NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadMax32:
- return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Max, true, 32,
- NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadMin32:
- return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::Min, true, 32,
- NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadUMax32:
- return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMax, true, 32,
- NextMBBI);
- case RISCV::PseudoMaskedAtomicLoadUMin32:
- return expandAtomicMinMaxOp(MBB, MBBI, AtomicRMWInst::UMin, true, 32,
- NextMBBI);
- case RISCV::PseudoCmpXchg32:
- return expandAtomicCmpXchg(MBB, MBBI, false, 32, NextMBBI);
- case RISCV::PseudoCmpXchg64:
- return expandAtomicCmpXchg(MBB, MBBI, false, 64, NextMBBI);
- case RISCV::PseudoMaskedCmpXchg32:
- return expandAtomicCmpXchg(MBB, MBBI, true, 32, NextMBBI);
case RISCV::PseudoLLA:
return expandLoadLocalAddress(MBB, MBBI, NextMBBI);
case RISCV::PseudoLA:
@@ -146,481 +104,6 @@ bool RISCVExpandPseudo::expandMI(MachineBasicBlock &MBB,
return false;
}
-static unsigned getLRForRMW32(AtomicOrdering Ordering) {
- switch (Ordering) {
- default:
- llvm_unreachable("Unexpected AtomicOrdering");
- case AtomicOrdering::Monotonic:
- return RISCV::LR_W;
- case AtomicOrdering::Acquire:
- return RISCV::LR_W_AQ;
- case AtomicOrdering::Release:
- return RISCV::LR_W;
- case AtomicOrdering::AcquireRelease:
- return RISCV::LR_W_AQ;
- case AtomicOrdering::SequentiallyConsistent:
- return RISCV::LR_W_AQ_RL;
- }
-}
-
-static unsigned getSCForRMW32(AtomicOrdering Ordering) {
- switch (Ordering) {
- default:
- llvm_unreachable("Unexpected AtomicOrdering");
- case AtomicOrdering::Monotonic:
- return RISCV::SC_W;
- case AtomicOrdering::Acquire:
- return RISCV::SC_W;
- case AtomicOrdering::Release:
- return RISCV::SC_W_RL;
- case AtomicOrdering::AcquireRelease:
- return RISCV::SC_W_RL;
- case AtomicOrdering::SequentiallyConsistent:
- return RISCV::SC_W_AQ_RL;
- }
-}
-
-static unsigned getLRForRMW64(AtomicOrdering Ordering) {
- switch (Ordering) {
- default:
- llvm_unreachable("Unexpected AtomicOrdering");
- case AtomicOrdering::Monotonic:
- return RISCV::LR_D;
- case AtomicOrdering::Acquire:
- return RISCV::LR_D_AQ;
- case AtomicOrdering::Release:
- return RISCV::LR_D;
- case AtomicOrdering::AcquireRelease:
- return RISCV::LR_D_AQ;
- case AtomicOrdering::SequentiallyConsistent:
- return RISCV::LR_D_AQ_RL;
- }
-}
-
-static unsigned getSCForRMW64(AtomicOrdering Ordering) {
- switch (Ordering) {
- default:
- llvm_unreachable("Unexpected AtomicOrdering");
- case AtomicOrdering::Monotonic:
- return RISCV::SC_D;
- case AtomicOrdering::Acquire:
- return RISCV::SC_D;
- case AtomicOrdering::Release:
- return RISCV::SC_D_RL;
- case AtomicOrdering::AcquireRelease:
- return RISCV::SC_D_RL;
- case AtomicOrdering::SequentiallyConsistent:
- return RISCV::SC_D_AQ_RL;
- }
-}
-
-static unsigned getLRForRMW(AtomicOrdering Ordering, int Width) {
- if (Width == 32)
- return getLRForRMW32(Ordering);
- if (Width == 64)
- return getLRForRMW64(Ordering);
- llvm_unreachable("Unexpected LR width\n");
-}
-
-static unsigned getSCForRMW(AtomicOrdering Ordering, int Width) {
- if (Width == 32)
- return getSCForRMW32(Ordering);
- if (Width == 64)
- return getSCForRMW64(Ordering);
- llvm_unreachable("Unexpected SC width\n");
-}
-
-static void doAtomicBinOpExpansion(const RISCVInstrInfo *TII, MachineInstr &MI,
- DebugLoc DL, MachineBasicBlock *ThisMBB,
- MachineBasicBlock *LoopMBB,
- MachineBasicBlock *DoneMBB,
- AtomicRMWInst::BinOp BinOp, int Width) {
- Register DestReg = MI.getOperand(0).getReg();
- Register ScratchReg = MI.getOperand(1).getReg();
- Register AddrReg = MI.getOperand(2).getReg();
- Register IncrReg = MI.getOperand(3).getReg();
- AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI.getOperand(4).getImm());
-
- // .loop:
- // lr.[w|d] dest, (addr)
- // binop scratch, dest, val
- // sc.[w|d] scratch, scratch, (addr)
- // bnez scratch, loop
- BuildMI(LoopMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
- .addReg(AddrReg);
- switch (BinOp) {
- default:
- llvm_unreachable("Unexpected AtomicRMW BinOp");
- case AtomicRMWInst::Nand:
- BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg)
- .addReg(DestReg)
- .addReg(IncrReg);
- BuildMI(LoopMBB, DL, TII->get(RISCV::XORI), ScratchReg)
- .addReg(ScratchReg)
- .addImm(-1);
- break;
- }
- BuildMI(LoopMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
- .addReg(AddrReg)
- .addReg(ScratchReg);
- BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
- .addReg(ScratchReg)
- .addReg(RISCV::X0)
- .addMBB(LoopMBB);
-}
-
-static void insertMaskedMerge(const RISCVInstrInfo *TII, DebugLoc DL,
- MachineBasicBlock *MBB, Register DestReg,
- Register OldValReg, Register NewValReg,
- Register MaskReg, Register ScratchReg) {
- assert(OldValReg != ScratchReg && "OldValReg and ScratchReg must be unique");
- assert(OldValReg != MaskReg && "OldValReg and MaskReg must be unique");
- assert(ScratchReg != MaskReg && "ScratchReg and MaskReg must be unique");
-
- // We select bits from newval and oldval using:
- // https://graphics.stanford.edu/~seander/bithacks.html#MaskedMerge
- // r = oldval ^ ((oldval ^ newval) & masktargetdata);
- BuildMI(MBB, DL, TII->get(RISCV::XOR), ScratchReg)
- .addReg(OldValReg)
- .addReg(NewValReg);
- BuildMI(MBB, DL, TII->get(RISCV::AND), ScratchReg)
- .addReg(ScratchReg)
- .addReg(MaskReg);
- BuildMI(MBB, DL, TII->get(RISCV::XOR), DestReg)
- .addReg(OldValReg)
- .addReg(ScratchReg);
-}
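// A minimal standalone sketch (not part of this diff; the helper name is
// hypothetical) of the masked-merge identity that insertMaskedMerge emits,
// so the bit trick is easy to verify in isolation:
#include <cstdint>
static inline uint32_t maskedMerge(uint32_t OldVal, uint32_t NewVal,
                                   uint32_t Mask) {
  // Bits selected by Mask come from NewVal; every other bit keeps OldVal.
  return OldVal ^ ((OldVal ^ NewVal) & Mask);
}
// For example, maskedMerge(0xAABBCCDD, 0x11223344, 0x0000FF00) == 0xAABB33DD:
// only the byte covered by the mask is replaced.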
-
-static void doMaskedAtomicBinOpExpansion(
- const RISCVInstrInfo *TII, MachineInstr &MI, DebugLoc DL,
- MachineBasicBlock *ThisMBB, MachineBasicBlock *LoopMBB,
- MachineBasicBlock *DoneMBB, AtomicRMWInst::BinOp BinOp, int Width) {
- assert(Width == 32 && "Should never need to expand masked 64-bit operations");
- Register DestReg = MI.getOperand(0).getReg();
- Register ScratchReg = MI.getOperand(1).getReg();
- Register AddrReg = MI.getOperand(2).getReg();
- Register IncrReg = MI.getOperand(3).getReg();
- Register MaskReg = MI.getOperand(4).getReg();
- AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI.getOperand(5).getImm());
-
- // .loop:
- // lr.w destreg, (alignedaddr)
- // binop scratch, destreg, incr
- // xor scratch, destreg, scratch
- // and scratch, scratch, masktargetdata
- // xor scratch, destreg, scratch
- // sc.w scratch, scratch, (alignedaddr)
- // bnez scratch, loop
- BuildMI(LoopMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
- .addReg(AddrReg);
- switch (BinOp) {
- default:
- llvm_unreachable("Unexpected AtomicRMW BinOp");
- case AtomicRMWInst::Xchg:
- BuildMI(LoopMBB, DL, TII->get(RISCV::ADDI), ScratchReg)
- .addReg(IncrReg)
- .addImm(0);
- break;
- case AtomicRMWInst::Add:
- BuildMI(LoopMBB, DL, TII->get(RISCV::ADD), ScratchReg)
- .addReg(DestReg)
- .addReg(IncrReg);
- break;
- case AtomicRMWInst::Sub:
- BuildMI(LoopMBB, DL, TII->get(RISCV::SUB), ScratchReg)
- .addReg(DestReg)
- .addReg(IncrReg);
- break;
- case AtomicRMWInst::Nand:
- BuildMI(LoopMBB, DL, TII->get(RISCV::AND), ScratchReg)
- .addReg(DestReg)
- .addReg(IncrReg);
- BuildMI(LoopMBB, DL, TII->get(RISCV::XORI), ScratchReg)
- .addReg(ScratchReg)
- .addImm(-1);
- break;
- }
-
- insertMaskedMerge(TII, DL, LoopMBB, ScratchReg, DestReg, ScratchReg, MaskReg,
- ScratchReg);
-
- BuildMI(LoopMBB, DL, TII->get(getSCForRMW32(Ordering)), ScratchReg)
- .addReg(AddrReg)
- .addReg(ScratchReg);
- BuildMI(LoopMBB, DL, TII->get(RISCV::BNE))
- .addReg(ScratchReg)
- .addReg(RISCV::X0)
- .addMBB(LoopMBB);
-}
-
-bool RISCVExpandPseudo::expandAtomicBinOp(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
- MachineBasicBlock::iterator &NextMBBI) {
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
-
- MachineFunction *MF = MBB.getParent();
- auto LoopMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
-
- // Insert new MBBs.
- MF->insert(++MBB.getIterator(), LoopMBB);
- MF->insert(++LoopMBB->getIterator(), DoneMBB);
-
- // Set up successors and transfer remaining instructions to DoneMBB.
- LoopMBB->addSuccessor(LoopMBB);
- LoopMBB->addSuccessor(DoneMBB);
- DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
- DoneMBB->transferSuccessors(&MBB);
- MBB.addSuccessor(LoopMBB);
-
- if (!IsMasked)
- doAtomicBinOpExpansion(TII, MI, DL, &MBB, LoopMBB, DoneMBB, BinOp, Width);
- else
- doMaskedAtomicBinOpExpansion(TII, MI, DL, &MBB, LoopMBB, DoneMBB, BinOp,
- Width);
-
- NextMBBI = MBB.end();
- MI.eraseFromParent();
-
- LivePhysRegs LiveRegs;
- computeAndAddLiveIns(LiveRegs, *LoopMBB);
- computeAndAddLiveIns(LiveRegs, *DoneMBB);
-
- return true;
-}
-
-static void insertSext(const RISCVInstrInfo *TII, DebugLoc DL,
- MachineBasicBlock *MBB, Register ValReg,
- Register ShamtReg) {
- BuildMI(MBB, DL, TII->get(RISCV::SLL), ValReg)
- .addReg(ValReg)
- .addReg(ShamtReg);
- BuildMI(MBB, DL, TII->get(RISCV::SRA), ValReg)
- .addReg(ValReg)
- .addReg(ShamtReg);
-}
-
-bool RISCVExpandPseudo::expandAtomicMinMaxOp(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- AtomicRMWInst::BinOp BinOp, bool IsMasked, int Width,
- MachineBasicBlock::iterator &NextMBBI) {
- assert(IsMasked == true &&
- "Should only need to expand masked atomic max/min");
- assert(Width == 32 && "Should never need to expand masked 64-bit operations");
-
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- MachineFunction *MF = MBB.getParent();
- auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto LoopIfBodyMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
-
- // Insert new MBBs.
- MF->insert(++MBB.getIterator(), LoopHeadMBB);
- MF->insert(++LoopHeadMBB->getIterator(), LoopIfBodyMBB);
- MF->insert(++LoopIfBodyMBB->getIterator(), LoopTailMBB);
- MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
-
- // Set up successors and transfer remaining instructions to DoneMBB.
- LoopHeadMBB->addSuccessor(LoopIfBodyMBB);
- LoopHeadMBB->addSuccessor(LoopTailMBB);
- LoopIfBodyMBB->addSuccessor(LoopTailMBB);
- LoopTailMBB->addSuccessor(LoopHeadMBB);
- LoopTailMBB->addSuccessor(DoneMBB);
- DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
- DoneMBB->transferSuccessors(&MBB);
- MBB.addSuccessor(LoopHeadMBB);
-
- Register DestReg = MI.getOperand(0).getReg();
- Register Scratch1Reg = MI.getOperand(1).getReg();
- Register Scratch2Reg = MI.getOperand(2).getReg();
- Register AddrReg = MI.getOperand(3).getReg();
- Register IncrReg = MI.getOperand(4).getReg();
- Register MaskReg = MI.getOperand(5).getReg();
- bool IsSigned = BinOp == AtomicRMWInst::Min || BinOp == AtomicRMWInst::Max;
- AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI.getOperand(IsSigned ? 7 : 6).getImm());
-
- //
- // .loophead:
- // lr.w destreg, (alignedaddr)
- // and scratch2, destreg, mask
- // mv scratch1, destreg
- // [sext scratch2 if signed min/max]
- // ifnochangeneeded scratch2, incr, .looptail
- BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW32(Ordering)), DestReg)
- .addReg(AddrReg);
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), Scratch2Reg)
- .addReg(DestReg)
- .addReg(MaskReg);
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::ADDI), Scratch1Reg)
- .addReg(DestReg)
- .addImm(0);
-
- switch (BinOp) {
- default:
- llvm_unreachable("Unexpected AtomicRMW BinOp");
- case AtomicRMWInst::Max: {
- insertSext(TII, DL, LoopHeadMBB, Scratch2Reg, MI.getOperand(6).getReg());
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE))
- .addReg(Scratch2Reg)
- .addReg(IncrReg)
- .addMBB(LoopTailMBB);
- break;
- }
- case AtomicRMWInst::Min: {
- insertSext(TII, DL, LoopHeadMBB, Scratch2Reg, MI.getOperand(6).getReg());
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGE))
- .addReg(IncrReg)
- .addReg(Scratch2Reg)
- .addMBB(LoopTailMBB);
- break;
- }
- case AtomicRMWInst::UMax:
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU))
- .addReg(Scratch2Reg)
- .addReg(IncrReg)
- .addMBB(LoopTailMBB);
- break;
- case AtomicRMWInst::UMin:
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BGEU))
- .addReg(IncrReg)
- .addReg(Scratch2Reg)
- .addMBB(LoopTailMBB);
- break;
- }
-
- // .loopifbody:
- // xor scratch1, destreg, incr
- // and scratch1, scratch1, mask
- // xor scratch1, destreg, scratch1
- insertMaskedMerge(TII, DL, LoopIfBodyMBB, Scratch1Reg, DestReg, IncrReg,
- MaskReg, Scratch1Reg);
-
- // .looptail:
- // sc.w scratch1, scratch1, (addr)
- // bnez scratch1, loop
- BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW32(Ordering)), Scratch1Reg)
- .addReg(AddrReg)
- .addReg(Scratch1Reg);
- BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
- .addReg(Scratch1Reg)
- .addReg(RISCV::X0)
- .addMBB(LoopHeadMBB);
-
- NextMBBI = MBB.end();
- MI.eraseFromParent();
-
- LivePhysRegs LiveRegs;
- computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
- computeAndAddLiveIns(LiveRegs, *LoopIfBodyMBB);
- computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
- computeAndAddLiveIns(LiveRegs, *DoneMBB);
-
- return true;
-}
-
-bool RISCVExpandPseudo::expandAtomicCmpXchg(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, bool IsMasked,
- int Width, MachineBasicBlock::iterator &NextMBBI) {
- MachineInstr &MI = *MBBI;
- DebugLoc DL = MI.getDebugLoc();
- MachineFunction *MF = MBB.getParent();
- auto LoopHeadMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto LoopTailMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
- auto DoneMBB = MF->CreateMachineBasicBlock(MBB.getBasicBlock());
-
- // Insert new MBBs.
- MF->insert(++MBB.getIterator(), LoopHeadMBB);
- MF->insert(++LoopHeadMBB->getIterator(), LoopTailMBB);
- MF->insert(++LoopTailMBB->getIterator(), DoneMBB);
-
- // Set up successors and transfer remaining instructions to DoneMBB.
- LoopHeadMBB->addSuccessor(LoopTailMBB);
- LoopHeadMBB->addSuccessor(DoneMBB);
- LoopTailMBB->addSuccessor(DoneMBB);
- LoopTailMBB->addSuccessor(LoopHeadMBB);
- DoneMBB->splice(DoneMBB->end(), &MBB, MI, MBB.end());
- DoneMBB->transferSuccessors(&MBB);
- MBB.addSuccessor(LoopHeadMBB);
-
- Register DestReg = MI.getOperand(0).getReg();
- Register ScratchReg = MI.getOperand(1).getReg();
- Register AddrReg = MI.getOperand(2).getReg();
- Register CmpValReg = MI.getOperand(3).getReg();
- Register NewValReg = MI.getOperand(4).getReg();
- AtomicOrdering Ordering =
- static_cast<AtomicOrdering>(MI.getOperand(IsMasked ? 6 : 5).getImm());
-
- if (!IsMasked) {
- // .loophead:
- // lr.[w|d] dest, (addr)
- // bne dest, cmpval, done
- BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
- .addReg(AddrReg);
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
- .addReg(DestReg)
- .addReg(CmpValReg)
- .addMBB(DoneMBB);
- // .looptail:
- // sc.[w|d] scratch, newval, (addr)
- // bnez scratch, loophead
- BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
- .addReg(AddrReg)
- .addReg(NewValReg);
- BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
- .addReg(ScratchReg)
- .addReg(RISCV::X0)
- .addMBB(LoopHeadMBB);
- } else {
- // .loophead:
- // lr.w dest, (addr)
- // and scratch, dest, mask
- // bne scratch, cmpval, done
- Register MaskReg = MI.getOperand(5).getReg();
- BuildMI(LoopHeadMBB, DL, TII->get(getLRForRMW(Ordering, Width)), DestReg)
- .addReg(AddrReg);
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::AND), ScratchReg)
- .addReg(DestReg)
- .addReg(MaskReg);
- BuildMI(LoopHeadMBB, DL, TII->get(RISCV::BNE))
- .addReg(ScratchReg)
- .addReg(CmpValReg)
- .addMBB(DoneMBB);
-
- // .looptail:
- // xor scratch, dest, newval
- // and scratch, scratch, mask
- // xor scratch, dest, scratch
- // sc.w scratch, scratch, (addr)
- // bnez scratch, loophead
- insertMaskedMerge(TII, DL, LoopTailMBB, ScratchReg, DestReg, NewValReg,
- MaskReg, ScratchReg);
- BuildMI(LoopTailMBB, DL, TII->get(getSCForRMW(Ordering, Width)), ScratchReg)
- .addReg(AddrReg)
- .addReg(ScratchReg);
- BuildMI(LoopTailMBB, DL, TII->get(RISCV::BNE))
- .addReg(ScratchReg)
- .addReg(RISCV::X0)
- .addMBB(LoopHeadMBB);
- }
-
- NextMBBI = MBB.end();
- MI.eraseFromParent();
-
- LivePhysRegs LiveRegs;
- computeAndAddLiveIns(LiveRegs, *LoopHeadMBB);
- computeAndAddLiveIns(LiveRegs, *LoopTailMBB);
- computeAndAddLiveIns(LiveRegs, *DoneMBB);
-
- return true;
-}
-
bool RISCVExpandPseudo::expandAuipcInstPair(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
MachineBasicBlock::iterator &NextMBBI, unsigned FlagsHi,
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
index c60fc3fc6b42..43adc7426c79 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp
@@ -23,6 +23,100 @@
using namespace llvm;
+// Get the ID of the libcall used for spilling and restoring callee saved
+// registers. The ID is representative of the number of registers saved or
+// restored by the libcall, except it is zero-indexed - ID 0 corresponds to a
+// single register.
+static int getLibCallID(const MachineFunction &MF,
+ const std::vector<CalleeSavedInfo> &CSI) {
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+
+ if (CSI.empty() || !RVFI->useSaveRestoreLibCalls(MF))
+ return -1;
+
+ Register MaxReg = RISCV::NoRegister;
+ for (auto &CS : CSI)
+ // RISCVRegisterInfo::hasReservedSpillSlot assigns negative frame indexes to
+ // registers which can be saved by libcall.
+ if (CS.getFrameIdx() < 0)
+ MaxReg = std::max(MaxReg.id(), CS.getReg());
+
+ if (MaxReg == RISCV::NoRegister)
+ return -1;
+
+ switch (MaxReg) {
+ default:
+ llvm_unreachable("Something has gone wrong!");
+ case /*s11*/ RISCV::X27: return 12;
+ case /*s10*/ RISCV::X26: return 11;
+ case /*s9*/ RISCV::X25: return 10;
+ case /*s8*/ RISCV::X24: return 9;
+ case /*s7*/ RISCV::X23: return 8;
+ case /*s6*/ RISCV::X22: return 7;
+ case /*s5*/ RISCV::X21: return 6;
+ case /*s4*/ RISCV::X20: return 5;
+ case /*s3*/ RISCV::X19: return 4;
+ case /*s2*/ RISCV::X18: return 3;
+ case /*s1*/ RISCV::X9: return 2;
+ case /*s0*/ RISCV::X8: return 1;
+ case /*ra*/ RISCV::X1: return 0;
+ }
+}
+
+// Get the name of the libcall used for spilling callee saved registers.
+// If this function will not use save/restore libcalls, then return a nullptr.
+static const char *
+getSpillLibCallName(const MachineFunction &MF,
+ const std::vector<CalleeSavedInfo> &CSI) {
+ static const char *const SpillLibCalls[] = {
+ "__riscv_save_0",
+ "__riscv_save_1",
+ "__riscv_save_2",
+ "__riscv_save_3",
+ "__riscv_save_4",
+ "__riscv_save_5",
+ "__riscv_save_6",
+ "__riscv_save_7",
+ "__riscv_save_8",
+ "__riscv_save_9",
+ "__riscv_save_10",
+ "__riscv_save_11",
+ "__riscv_save_12"
+ };
+
+ int LibCallID = getLibCallID(MF, CSI);
+ if (LibCallID == -1)
+ return nullptr;
+ return SpillLibCalls[LibCallID];
+}
+
+// Get the name of the libcall used for restoring callee saved registers.
+// If this function will not use save/restore libcalls, then return a nullptr.
+static const char *
+getRestoreLibCallName(const MachineFunction &MF,
+ const std::vector<CalleeSavedInfo> &CSI) {
+ static const char *const RestoreLibCalls[] = {
+ "__riscv_restore_0",
+ "__riscv_restore_1",
+ "__riscv_restore_2",
+ "__riscv_restore_3",
+ "__riscv_restore_4",
+ "__riscv_restore_5",
+ "__riscv_restore_6",
+ "__riscv_restore_7",
+ "__riscv_restore_8",
+ "__riscv_restore_9",
+ "__riscv_restore_10",
+ "__riscv_restore_11",
+ "__riscv_restore_12"
+ };
+
+ int LibCallID = getLibCallID(MF, CSI);
+ if (LibCallID == -1)
+ return nullptr;
+ return RestoreLibCalls[LibCallID];
+}
+
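// Worked example for the two name tables above (illustrative): if the highest
// libcall-saved register in CSI is s1 (RISCV::X9), getLibCallID returns 2, so
// the prologue calls "__riscv_save_2" and the epilogue tail-calls
// "__riscv_restore_2"; per the zero-indexed scheme described above, ID 2
// corresponds to three saved registers (ra, s0 and s1), while ID 0 covers ra
// alone.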
bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
@@ -48,10 +142,10 @@ void RISCVFrameLowering::determineFrameLayout(MachineFunction &MF) const {
uint64_t FrameSize = MFI.getStackSize();
// Get the alignment.
- unsigned StackAlign = getStackAlignment();
+ Align StackAlign = getStackAlign();
if (RI->needsStackRealignment(MF)) {
- unsigned MaxStackAlign = std::max(StackAlign, MFI.getMaxAlignment());
- FrameSize += (MaxStackAlign - StackAlign);
+ Align MaxStackAlign = std::max(StackAlign, MFI.getMaxAlign());
+ FrameSize += (MaxStackAlign.value() - StackAlign.value());
StackAlign = MaxStackAlign;
}
@@ -105,6 +199,17 @@ static Register getFPReg(const RISCVSubtarget &STI) { return RISCV::X8; }
// Returns the register used to hold the stack pointer.
static Register getSPReg(const RISCVSubtarget &STI) { return RISCV::X2; }
+static SmallVector<CalleeSavedInfo, 8>
+getNonLibcallCSI(const std::vector<CalleeSavedInfo> &CSI) {
+ SmallVector<CalleeSavedInfo, 8> NonLibcallCSI;
+
+ for (auto &CS : CSI)
+ if (CS.getFrameIdx() >= 0)
+ NonLibcallCSI.push_back(CS);
+
+ return NonLibcallCSI;
+}
+
void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
MachineFrameInfo &MFI = MF.getFrameInfo();
@@ -117,6 +222,11 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
Register SPReg = getSPReg(STI);
Register BPReg = RISCVABI::getBPReg();
+ // Since spillCalleeSavedRegisters may have inserted a libcall, skip past
+ // any instructions marked as FrameSetup
+ while (MBBI != MBB.end() && MBBI->getFlag(MachineInstr::FrameSetup))
+ ++MBBI;
+
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
DebugLoc DL;
@@ -124,12 +234,38 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Determine the correct frame layout
determineFrameLayout(MF);
+ // If libcalls are used to spill and restore callee-saved registers, the frame
+ // has two sections; the opaque section managed by the libcalls, and the
+ // section managed by MachineFrameInfo which can also hold callee saved
+ // registers in fixed stack slots, both of which have negative frame indices.
+ // This gets even more complicated when incoming arguments are passed via the
+ // stack, as these too have negative frame indices. An example is detailed
+ // below:
+ //
+ // | incoming arg | <- FI[-3]
+ // | libcallspill |
+ // | calleespill | <- FI[-2]
+ // | calleespill | <- FI[-1]
+ // | this_frame | <- FI[0]
+ //
+ // For negative frame indices, the offset from the frame pointer will differ
+ // depending on which of these groups the frame index applies to.
+ // The following calculates the correct offset knowing the number of callee
+ // saved registers spilt by the two methods.
+ if (int LibCallRegs = getLibCallID(MF, MFI.getCalleeSavedInfo()) + 1) {
+ // Calculate the size of the frame managed by the libcall. The libcalls are
+ // implemented such that the stack will always be 16 byte aligned.
+ unsigned LibCallFrameSize = alignTo((STI.getXLen() / 8) * LibCallRegs, 16);
+ RVFI->setLibCallStackSize(LibCallFrameSize);
+ }
+
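// Illustrative arithmetic for the expression above: with all 13 libcall-saved
// registers (ra plus s0-s11) on RV64, LibCallRegs == 13 and XLen / 8 == 8, so
// LibCallFrameSize == alignTo(104, 16) == 112 bytes; the same set on RV32
// gives alignTo(52, 16) == 64 bytes. Both respect the libcalls' guarantee of a
// 16-byte aligned stack.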
// FIXME (note copied from Lanai): This appears to be overallocating. Needs
// investigation. Get the number of bytes to allocate from the FrameInfo.
uint64_t StackSize = MFI.getStackSize();
+ uint64_t RealStackSize = StackSize + RVFI->getLibCallStackSize();
// Early exit if there is no need to allocate on the stack
- if (StackSize == 0 && !MFI.adjustsStack())
+ if (RealStackSize == 0 && !MFI.adjustsStack())
return;
// If the stack pointer has been marked as reserved, then produce an error if
@@ -140,31 +276,42 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
uint64_t FirstSPAdjustAmount = getFirstSPAdjustAmount(MF);
// Split the SP adjustment to reduce the offsets of callee saved spill.
- if (FirstSPAdjustAmount)
+ if (FirstSPAdjustAmount) {
StackSize = FirstSPAdjustAmount;
+ RealStackSize = FirstSPAdjustAmount;
+ }
// Allocate space on the stack if necessary.
adjustReg(MBB, MBBI, DL, SPReg, SPReg, -StackSize, MachineInstr::FrameSetup);
- // Emit ".cfi_def_cfa_offset StackSize"
+ // Emit ".cfi_def_cfa_offset RealStackSize"
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -StackSize));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, RealStackSize));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
+ const auto &CSI = MFI.getCalleeSavedInfo();
+
// The frame pointer is callee-saved, and code has been generated for us to
// save it to the stack. We need to skip over the storing of callee-saved
// registers as the frame pointer must be modified after it has been saved
// to the stack, not before.
// FIXME: assumes exactly one instruction is used to save each callee-saved
// register.
- const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
- std::advance(MBBI, CSI.size());
+ std::advance(MBBI, getNonLibcallCSI(CSI).size());
// Iterate over list of callee-saved registers and emit .cfi_offset
// directives.
for (const auto &Entry : CSI) {
- int64_t Offset = MFI.getObjectOffset(Entry.getFrameIdx());
+ int FrameIdx = Entry.getFrameIdx();
+ int64_t Offset;
+ // Offsets for objects with fixed locations (i.e. those saved by libcall) are
+ // simply calculated from the frame index.
+ if (FrameIdx < 0)
+ Offset = FrameIdx * (int64_t) STI.getXLen() / 8;
+ else
+ Offset = MFI.getObjectOffset(Entry.getFrameIdx()) -
+ RVFI->getLibCallStackSize();
Register Reg = Entry.getReg();
unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createOffset(
nullptr, RI->getDwarfRegNum(Reg, true), Offset));
@@ -179,11 +326,12 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
MF.getFunction(), "Frame pointer required, but has been reserved."});
adjustReg(MBB, MBBI, DL, FPReg, SPReg,
- StackSize - RVFI->getVarArgsSaveSize(), MachineInstr::FrameSetup);
+ RealStackSize - RVFI->getVarArgsSaveSize(),
+ MachineInstr::FrameSetup);
- // Emit ".cfi_def_cfa $fp, 0"
- unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::createDefCfa(
- nullptr, RI->getDwarfRegNum(FPReg, true), 0));
+ // Emit ".cfi_def_cfa $fp, RVFI->getVarArgsSaveSize()"
+ unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa(
+ nullptr, RI->getDwarfRegNum(FPReg, true), RVFI->getVarArgsSaveSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -201,7 +349,7 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
if (!hasFP(MF)) {
// Emit ".cfi_def_cfa_offset StackSize"
unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, -MFI.getStackSize()));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, MFI.getStackSize()));
BuildMI(MBB, MBBI, DL, TII->get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
@@ -211,15 +359,15 @@ void RISCVFrameLowering::emitPrologue(MachineFunction &MF,
// Realign Stack
const RISCVRegisterInfo *RI = STI.getRegisterInfo();
if (RI->needsStackRealignment(MF)) {
- unsigned MaxAlignment = MFI.getMaxAlignment();
+ Align MaxAlignment = MFI.getMaxAlign();
const RISCVInstrInfo *TII = STI.getInstrInfo();
- if (isInt<12>(-(int)MaxAlignment)) {
+ if (isInt<12>(-(int)MaxAlignment.value())) {
BuildMI(MBB, MBBI, DL, TII->get(RISCV::ANDI), SPReg)
.addReg(SPReg)
- .addImm(-(int)MaxAlignment);
+ .addImm(-(int)MaxAlignment.value());
} else {
- unsigned ShiftAmount = countTrailingZeros(MaxAlignment);
+ unsigned ShiftAmount = Log2(MaxAlignment);
Register VR =
MF.getRegInfo().createVirtualRegister(&RISCV::GPRRegClass);
BuildMI(MBB, MBBI, DL, TII->get(RISCV::SRLI), VR)
@@ -264,15 +412,26 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
// last instruction.
if (!MBBI->isTerminator())
MBBI = std::next(MBBI);
+
+ // If callee-saved registers are saved via libcall, place stack adjustment
+ // before this call.
+ while (MBBI != MBB.begin() &&
+ std::prev(MBBI)->getFlag(MachineInstr::FrameDestroy))
+ --MBBI;
}
+ const auto &CSI = getNonLibcallCSI(MFI.getCalleeSavedInfo());
+
// Skip to before the restores of callee-saved registers
// FIXME: assumes exactly one instruction is used to restore each
// callee-saved register.
- auto LastFrameDestroy = std::prev(MBBI, MFI.getCalleeSavedInfo().size());
+ auto LastFrameDestroy = MBBI;
+ if (!CSI.empty())
+ LastFrameDestroy = std::prev(MBBI, CSI.size());
uint64_t StackSize = MFI.getStackSize();
- uint64_t FPOffset = StackSize - RVFI->getVarArgsSaveSize();
+ uint64_t RealStackSize = StackSize + RVFI->getLibCallStackSize();
+ uint64_t FPOffset = RealStackSize - RVFI->getVarArgsSaveSize();
// Restore the stack pointer using the value of the frame pointer. Only
// necessary if the stack pointer was modified, meaning the stack size is
@@ -302,7 +461,7 @@ void RISCVFrameLowering::emitEpilogue(MachineFunction &MF,
int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const TargetRegisterInfo *RI = MF.getSubtarget().getRegisterInfo();
const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
@@ -310,7 +469,7 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
// Callee-saved registers should be referenced relative to the stack
// pointer (positive offset), otherwise use the frame pointer (negative
// offset).
- const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
+ const auto &CSI = getNonLibcallCSI(MFI.getCalleeSavedInfo());
int MinCSFI = 0;
int MaxCSFI = -1;
@@ -330,7 +489,7 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
if (FirstSPAdjustAmount)
Offset += FirstSPAdjustAmount;
else
- Offset += MF.getFrameInfo().getStackSize();
+ Offset += MFI.getStackSize();
} else if (RI->needsStackRealignment(MF) && !MFI.isFixedObjectIndex(FI)) {
// If the stack was realigned, the frame pointer is set in order to allow
// SP to be restored, so we need another base register to record the stack
@@ -339,13 +498,20 @@ int RISCVFrameLowering::getFrameIndexReference(const MachineFunction &MF,
FrameReg = RISCVABI::getBPReg();
else
FrameReg = RISCV::X2;
- Offset += MF.getFrameInfo().getStackSize();
+ Offset += MFI.getStackSize();
+ if (FI < 0)
+ Offset += RVFI->getLibCallStackSize();
} else {
FrameReg = RI->getFrameRegister(MF);
- if (hasFP(MF))
+ if (hasFP(MF)) {
Offset += RVFI->getVarArgsSaveSize();
- else
- Offset += MF.getFrameInfo().getStackSize();
+ if (FI >= 0)
+ Offset -= RVFI->getLibCallStackSize();
+ } else {
+ Offset += MFI.getStackSize();
+ if (FI < 0)
+ Offset += RVFI->getLibCallStackSize();
+ }
}
return Offset;
}
@@ -407,8 +573,8 @@ void RISCVFrameLowering::processFunctionBeforeFrameFinalized(
// still needs an emergency spill slot for branch relaxation. This case
// would currently be missed.
if (!isInt<11>(MFI.estimateStackSize(MF))) {
- int RegScavFI = MFI.CreateStackObject(
- RegInfo->getSpillSize(*RC), RegInfo->getSpillAlignment(*RC), false);
+ int RegScavFI = MFI.CreateStackObject(RegInfo->getSpillSize(*RC),
+ RegInfo->getSpillAlign(*RC), false);
RS->addScavengingFrameIndex(RegScavFI);
}
}
@@ -461,16 +627,17 @@ MachineBasicBlock::iterator RISCVFrameLowering::eliminateCallFramePseudoInstr(
// add sp,sp,-64
uint64_t
RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const {
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const std::vector<CalleeSavedInfo> &CSI = MFI.getCalleeSavedInfo();
uint64_t StackSize = MFI.getStackSize();
- uint64_t StackAlign = getStackAlignment();
- // FIXME: Disable SplitSPAdjust if save-restore libcall enabled when the patch
- // landing. The callee saved registers will be pushed by the
- // save-restore libcalls, so we don't have to split the SP adjustment
- // in this case.
- //
+ // Disable SplitSPAdjust if save-restore libcalls are used. The callee-saved
+ // registers will be pushed by the save-restore libcalls, so we don't have to
+ // split the SP adjustment in this case.
+ if (RVFI->getLibCallStackSize())
+ return 0;
+
// Return the FirstSPAdjustAmount if the StackSize cannot fit in a signed
// 12-bit immediate and there is a callee-saved register that needs to be pushed.
if (!isInt<12>(StackSize) && (CSI.size() > 0)) {
@@ -480,7 +647,130 @@ RISCVFrameLowering::getFirstSPAdjustAmount(const MachineFunction &MF) const {
// load/store instruction and we have to stick with the stack alignment.
// 2048 is 16-byte aligned. The stack alignment for RV32 and RV64 is 16,
// and for RV32E it is 4. So (2048 - StackAlign) will satisfy the stack alignment.
- return 2048 - StackAlign;
+ return 2048 - getStackAlign().value();
}
return 0;
}
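// Worked example (illustrative numbers): for StackSize == 4096, which does not
// fit a signed 12-bit immediate, and 16-byte stack alignment, the prologue
// first drops SP by 2048 - 16 == 2032, spills the callee-saved registers at
// offsets that still fit the 12-bit load/store immediates, and then applies
// the remaining 4096 - 2032 == 2064 adjustment (reversed in the epilogue).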
+
+bool RISCVFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return true;
+
+ MachineFunction *MF = MBB.getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ if (MI != MBB.end() && !MI->isDebugInstr())
+ DL = MI->getDebugLoc();
+
+ const char *SpillLibCall = getSpillLibCallName(*MF, CSI);
+ if (SpillLibCall) {
+ // Add spill libcall via non-callee-saved register t0.
+ BuildMI(MBB, MI, DL, TII.get(RISCV::PseudoCALLReg), RISCV::X5)
+ .addExternalSymbol(SpillLibCall, RISCVII::MO_CALL)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // Add registers spilled in libcall as liveins.
+ for (auto &CS : CSI)
+ MBB.addLiveIn(CS.getReg());
+ }
+
+ // Manually spill values not spilled by libcall.
+ const auto &NonLibcallCSI = getNonLibcallCSI(CSI);
+ for (auto &CS : NonLibcallCSI) {
+ // Insert the spill to the stack frame.
+ Register Reg = CS.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.storeRegToStackSlot(MBB, MI, Reg, true, CS.getFrameIdx(), RC, TRI);
+ }
+
+ return true;
+}
+
+bool RISCVFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
+ if (CSI.empty())
+ return true;
+
+ MachineFunction *MF = MBB.getParent();
+ const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
+ DebugLoc DL;
+ if (MI != MBB.end() && !MI->isDebugInstr())
+ DL = MI->getDebugLoc();
+
+ // Manually restore values not restored by libcall. Insert in reverse order.
+ // loadRegFromStackSlot can insert multiple instructions.
+ const auto &NonLibcallCSI = getNonLibcallCSI(CSI);
+ for (auto &CS : reverse(NonLibcallCSI)) {
+ Register Reg = CS.getReg();
+ const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CS.getFrameIdx(), RC, TRI);
+ assert(MI != MBB.begin() && "loadRegFromStackSlot didn't insert any code!");
+ }
+
+ const char *RestoreLibCall = getRestoreLibCallName(*MF, CSI);
+ if (RestoreLibCall) {
+ // Add restore libcall via tail call.
+ MachineBasicBlock::iterator NewMI =
+ BuildMI(MBB, MI, DL, TII.get(RISCV::PseudoTAIL))
+ .addExternalSymbol(RestoreLibCall, RISCVII::MO_CALL)
+ .setMIFlag(MachineInstr::FrameDestroy);
+
+ // Remove trailing returns, since the terminator is now a tail call to the
+ // restore function.
+ if (MI != MBB.end() && MI->getOpcode() == RISCV::PseudoRET) {
+ NewMI->copyImplicitOps(*MF, *MI);
+ MI->eraseFromParent();
+ }
+ }
+
+ return true;
+}
+
+bool RISCVFrameLowering::canUseAsPrologue(const MachineBasicBlock &MBB) const {
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+ const MachineFunction *MF = MBB.getParent();
+ const auto *RVFI = MF->getInfo<RISCVMachineFunctionInfo>();
+
+ if (!RVFI->useSaveRestoreLibCalls(*MF))
+ return true;
+
+ // Inserting a call to a __riscv_save libcall requires the use of the register
+ // t0 (X5) to hold the return address. Therefore if this register is already
+ // used we can't insert the call.
+
+ RegScavenger RS;
+ RS.enterBasicBlock(*TmpMBB);
+ return !RS.isRegUsed(RISCV::X5);
+}
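// Illustrative cross-reference: spillCalleeSavedRegisters above emits the
// spill call as "PseudoCALLReg X5, __riscv_save_N", so the return address of
// that call lives in t0 rather than ra (ra is itself one of the registers the
// libcall saves). A prospective prologue block in which t0/X5 is already in
// use therefore cannot host the call, which is exactly what the RegScavenger
// check rejects.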
+
+bool RISCVFrameLowering::canUseAsEpilogue(const MachineBasicBlock &MBB) const {
+ const MachineFunction *MF = MBB.getParent();
+ MachineBasicBlock *TmpMBB = const_cast<MachineBasicBlock *>(&MBB);
+ const auto *RVFI = MF->getInfo<RISCVMachineFunctionInfo>();
+
+ if (!RVFI->useSaveRestoreLibCalls(*MF))
+ return true;
+
+ // Using the __riscv_restore libcalls to restore CSRs requires a tail call.
+ // This means if we still need to continue executing code within this function
+ // the restore cannot take place in this basic block.
+
+ if (MBB.succ_size() > 1)
+ return false;
+
+ MachineBasicBlock *SuccMBB =
+ MBB.succ_empty() ? TmpMBB->getFallThrough() : *MBB.succ_begin();
+
+ // Doing a tail call should be safe if there are no successors, because either
+ // we have a returning block or the end of the block is unreachable, so the
+ // restore will be eliminated regardless.
+ if (!SuccMBB)
+ return true;
+
+ // The successor can only contain a return, since we would effectively be
+ // replacing the successor with our own tail return at the end of our block.
+ return SuccMBB->isReturnBlock() && SuccMBB->size() == 1;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
index 3a16cf93cf10..1517c847a04c 100644
--- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h
@@ -30,7 +30,7 @@ public:
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS) const override;
@@ -46,12 +46,24 @@ public:
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
+ bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
// Get the first stack adjustment amount for SplitSPAdjust.
// Return 0 if we don't want to split the SP adjustment in prologue and
// epilogue.
uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const;
+ bool canUseAsPrologue(const MachineBasicBlock &MBB) const override;
+ bool canUseAsEpilogue(const MachineBasicBlock &MBB) const override;
+
protected:
const RISCVSubtarget &STI;
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
index f66d06c20e37..a0ae05081adc 100644
--- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp
@@ -10,55 +10,19 @@
//
//===----------------------------------------------------------------------===//
+#include "RISCVISelDAGToDAG.h"
#include "MCTargetDesc/RISCVMCTargetDesc.h"
-#include "RISCV.h"
-#include "RISCVTargetMachine.h"
#include "Utils/RISCVMatInt.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/SelectionDAGISel.h"
+#include "llvm/Support/Alignment.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+
using namespace llvm;
#define DEBUG_TYPE "riscv-isel"
-// RISCV-specific code to select RISCV machine instructions for
-// SelectionDAG operations.
-namespace {
-class RISCVDAGToDAGISel final : public SelectionDAGISel {
- const RISCVSubtarget *Subtarget = nullptr;
-
-public:
- explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine)
- : SelectionDAGISel(TargetMachine) {}
-
- StringRef getPassName() const override {
- return "RISCV DAG->DAG Pattern Instruction Selection";
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override {
- Subtarget = &MF.getSubtarget<RISCVSubtarget>();
- return SelectionDAGISel::runOnMachineFunction(MF);
- }
-
- void PostprocessISelDAG() override;
-
- void Select(SDNode *Node) override;
-
- bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
- std::vector<SDValue> &OutOps) override;
-
- bool SelectAddrFI(SDValue Addr, SDValue &Base);
-
-// Include the pieces autogenerated from the target description.
-#include "RISCVGenDAGISel.inc"
-
-private:
- void doPeepholeLoadStoreADDI();
-};
-}
-
void RISCVDAGToDAGISel::PostprocessISelDAG() {
doPeepholeLoadStoreADDI();
}
@@ -111,6 +75,30 @@ void RISCVDAGToDAGISel::Select(SDNode *Node) {
EVT VT = Node->getValueType(0);
switch (Opcode) {
+ case ISD::ADD: {
+ // Optimize (add r, imm) to (addi (addi r, imm0) imm1) if applicable. The
+ // immediate must be in specific ranges and have a single use.
+ if (auto *ConstOp = dyn_cast<ConstantSDNode>(Node->getOperand(1))) {
+ if (!(ConstOp->hasOneUse()))
+ break;
+ // The imm must be in range [-4096,-2049] or [2048,4094].
+ int64_t Imm = ConstOp->getSExtValue();
+ if (!(-4096 <= Imm && Imm <= -2049) && !(2048 <= Imm && Imm <= 4094))
+ break;
+ // Break the imm to imm0+imm1.
+ SDLoc DL(Node);
+ EVT VT = Node->getValueType(0);
+ const SDValue ImmOp0 = CurDAG->getTargetConstant(Imm - Imm / 2, DL, VT);
+ const SDValue ImmOp1 = CurDAG->getTargetConstant(Imm / 2, DL, VT);
+ auto *NodeAddi0 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
+ Node->getOperand(0), ImmOp0);
+ auto *NodeAddi1 = CurDAG->getMachineNode(RISCV::ADDI, DL, VT,
+ SDValue(NodeAddi0, 0), ImmOp1);
+ ReplaceNode(Node, NodeAddi1);
+ return;
+ }
+ break;
+ }
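// Worked example for the split above (illustrative numbers): Imm == 4000 lies
// in [2048, 4094], so ImmOp1 gets Imm / 2 == 2000 and ImmOp0 gets
// Imm - Imm / 2 == 2000, selecting (addi (addi r, 2000), 2000). For
// Imm == -3001 the halves are -1500 and -1501, i.e.
// (addi (addi r, -1501), -1500). Both halves always land in the signed 12-bit
// ADDI range [-2048, 2047], which is the point of the chosen immediate ranges.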
case ISD::Constant: {
auto ConstNode = cast<ConstantSDNode>(Node);
if (VT == XLenVT && ConstNode->isNullValue()) {
@@ -197,8 +185,9 @@ bool RISCVDAGToDAGISel::SelectAddrFI(SDValue Addr, SDValue &Base) {
}
// Merge an ADDI into the offset of a load/store instruction where possible.
-// (load (add base, off), 0) -> (load base, off)
-// (store val, (add base, off)) -> (store val, base, off)
+// (load (addi base, off1), off2) -> (load base, off1+off2)
+// (store val, (addi base, off1), off2) -> (store val, base, off1+off2)
+// This is possible when off1+off2 fits a 12-bit immediate.
void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
SelectionDAG::allnodes_iterator Position(CurDAG->getRoot().getNode());
++Position;
@@ -239,10 +228,7 @@ void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
break;
}
- // Currently, the load/store offset must be 0 to be considered for this
- // peephole optimisation.
- if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)) ||
- N->getConstantOperandVal(OffsetOpIdx) != 0)
+ if (!isa<ConstantSDNode>(N->getOperand(OffsetOpIdx)))
continue;
SDValue Base = N->getOperand(BaseOpIdx);
@@ -252,14 +238,39 @@ void RISCVDAGToDAGISel::doPeepholeLoadStoreADDI() {
continue;
SDValue ImmOperand = Base.getOperand(1);
+ uint64_t Offset2 = N->getConstantOperandVal(OffsetOpIdx);
if (auto Const = dyn_cast<ConstantSDNode>(ImmOperand)) {
- ImmOperand = CurDAG->getTargetConstant(
- Const->getSExtValue(), SDLoc(ImmOperand), ImmOperand.getValueType());
+ int64_t Offset1 = Const->getSExtValue();
+ int64_t CombinedOffset = Offset1 + Offset2;
+ if (!isInt<12>(CombinedOffset))
+ continue;
+ ImmOperand = CurDAG->getTargetConstant(CombinedOffset, SDLoc(ImmOperand),
+ ImmOperand.getValueType());
} else if (auto GA = dyn_cast<GlobalAddressSDNode>(ImmOperand)) {
+ // If the off1 in (addi base, off1) is a global variable's address (its
+ // low part, really), then we can rely on the alignment of that variable
+ // to provide a margin of safety before off1 can overflow the 12 bits.
+ // Check if off2 falls within that margin; if so off1+off2 can't overflow.
+ const DataLayout &DL = CurDAG->getDataLayout();
+ Align Alignment = GA->getGlobal()->getPointerAlignment(DL);
+ if (Offset2 != 0 && Alignment <= Offset2)
+ continue;
+ int64_t Offset1 = GA->getOffset();
+ int64_t CombinedOffset = Offset1 + Offset2;
ImmOperand = CurDAG->getTargetGlobalAddress(
GA->getGlobal(), SDLoc(ImmOperand), ImmOperand.getValueType(),
- GA->getOffset(), GA->getTargetFlags());
+ CombinedOffset, GA->getTargetFlags());
+ } else if (auto CP = dyn_cast<ConstantPoolSDNode>(ImmOperand)) {
+ // Ditto.
+ Align Alignment = CP->getAlign();
+ if (Offset2 != 0 && Alignment <= Offset2)
+ continue;
+ int64_t Offset1 = CP->getOffset();
+ int64_t CombinedOffset = Offset1 + Offset2;
+ ImmOperand = CurDAG->getTargetConstantPool(
+ CP->getConstVal(), ImmOperand.getValueType(), CP->getAlign(),
+ CombinedOffset, CP->getTargetFlags());
} else {
continue;
}
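// Worked example for the global-address case above (illustrative): a folded
// symbol with 8-byte alignment leaves a margin of up to 7 bytes before
// off1 + off2 can overflow the 12 bits consumed by the low-part relocation,
// so an off2 of 1..7 (or 0) is folded into the target global address, while
// an off2 of 8 or more fails the Alignment <= Offset2 test and the ADDI is
// left in place. The constant-pool case applies the same rule using the pool
// entry's alignment.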
diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
new file mode 100644
index 000000000000..dcf733ec3675
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.h
@@ -0,0 +1,56 @@
+//===---- RISCVISelDAGToDAG.h - A dag to dag inst selector for RISCV ------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines an instruction selector for the RISCV target.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_RISCV_RISCVISELDAGTODAG_H
+#define LLVM_LIB_TARGET_RISCV_RISCVISELDAGTODAG_H
+
+#include "RISCV.h"
+#include "RISCVTargetMachine.h"
+#include "llvm/CodeGen/SelectionDAGISel.h"
+
+// RISCV-specific code to select RISCV machine instructions for
+// SelectionDAG operations.
+namespace llvm {
+class RISCVDAGToDAGISel : public SelectionDAGISel {
+ const RISCVSubtarget *Subtarget = nullptr;
+
+public:
+ explicit RISCVDAGToDAGISel(RISCVTargetMachine &TargetMachine)
+ : SelectionDAGISel(TargetMachine) {}
+
+ StringRef getPassName() const override {
+ return "RISCV DAG->DAG Pattern Instruction Selection";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override {
+ Subtarget = &MF.getSubtarget<RISCVSubtarget>();
+ return SelectionDAGISel::runOnMachineFunction(MF);
+ }
+
+ void PostprocessISelDAG() override;
+
+ void Select(SDNode *Node) override;
+
+ bool SelectInlineAsmMemoryOperand(const SDValue &Op, unsigned ConstraintID,
+ std::vector<SDValue> &OutOps) override;
+
+ bool SelectAddrFI(SDValue Addr, SDValue &Base);
+
+// Include the pieces autogenerated from the target description.
+#include "RISCVGenDAGISel.inc"
+
+private:
+ void doPeepholeLoadStoreADDI();
+};
+}
+
+#endif
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 5a2cffbc824c..91fc69b5bc10 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -33,6 +33,7 @@
#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
@@ -197,6 +198,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f16, Expand);
}
+ if (Subtarget.is64Bit() &&
+ !(Subtarget.hasStdExtD() || Subtarget.hasStdExtF())) {
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::FP_TO_SINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::i32, Custom);
+ }
+
setOperationAction(ISD::GlobalAddress, XLenVT, Custom);
setOperationAction(ISD::BlockAddress, XLenVT, Custom);
setOperationAction(ISD::ConstantPool, XLenVT, Custom);
@@ -210,6 +219,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::TRAP, MVT::Other, Legal);
setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
+ setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
if (Subtarget.hasStdExtA()) {
setMaxAtomicSizeInBitsSupported(Subtarget.getXLen());
@@ -227,6 +237,12 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
// Effectively disable jump table generation.
setMinimumJumpTableEntries(INT_MAX);
+
+ // Jumps are expensive, compared to logic
+ setJumpIsExpensive();
+
+ // We can use any register for comparisons
+ setHasMultipleConditionRegisters();
}
EVT RISCVTargetLowering::getSetCCResultType(const DataLayout &DL, LLVMContext &,
@@ -336,6 +352,17 @@ bool RISCVTargetLowering::isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const {
return Subtarget.is64Bit() && SrcVT == MVT::i32 && DstVT == MVT::i64;
}
+bool RISCVTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
+ if (VT == MVT::f32 && !Subtarget.hasStdExtF())
+ return false;
+ if (VT == MVT::f64 && !Subtarget.hasStdExtD())
+ return false;
+ if (Imm.isNegZero())
+ return false;
+ return Imm.isZero();
+}
+
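// Illustrative note on the hook above: only +0.0 of a hardware-supported FP
// type is reported as a legal immediate, since it can be materialized by
// moving x0 into an FP register; -0.0 needs its sign bit set and is left to
// the ordinary constant lowering, as is every other FP constant.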
bool RISCVTargetLowering::hasBitPreservingFPLogic(EVT VT) const {
return (VT == MVT::f32 && Subtarget.hasStdExtF()) ||
(VT == MVT::f64 && Subtarget.hasStdExtD());
@@ -418,6 +445,8 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
SDValue FPConv = DAG.getNode(RISCVISD::FMV_W_X_RV64, DL, MVT::f32, NewOp0);
return FPConv;
}
+ case ISD::INTRINSIC_WO_CHAIN:
+ return LowerINTRINSIC_WO_CHAIN(Op, DAG);
}
}
@@ -434,7 +463,7 @@ static SDValue getTargetNode(BlockAddressSDNode *N, SDLoc DL, EVT Ty,
static SDValue getTargetNode(ConstantPoolSDNode *N, SDLoc DL, EVT Ty,
SelectionDAG &DAG, unsigned Flags) {
- return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlignment(),
+ return DAG.getTargetConstantPool(N->getConstVal(), Ty, N->getAlign(),
N->getOffset(), Flags);
}
@@ -821,6 +850,20 @@ SDValue RISCVTargetLowering::lowerShiftRightParts(SDValue Op, SelectionDAG &DAG,
return DAG.getMergeValues(Parts, DL);
}
+SDValue RISCVTargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
+ SelectionDAG &DAG) const {
+ unsigned IntNo = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
+ SDLoc DL(Op);
+ switch (IntNo) {
+ default:
+ return SDValue(); // Don't custom lower most intrinsics.
+ case Intrinsic::thread_pointer: {
+ EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ return DAG.getRegister(RISCV::X4, PtrVT);
+ }
+ }
+}
+
// Returns the opcode of the target-specific SDNode that implements the 32-bit
// form of the given Opcode.
static RISCVISD::NodeType getRISCVWOpcode(unsigned Opcode) {
@@ -876,6 +919,32 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
switch (N->getOpcode()) {
default:
llvm_unreachable("Don't know how to custom type legalize this operation!");
+ case ISD::STRICT_FP_TO_SINT:
+ case ISD::STRICT_FP_TO_UINT:
+ case ISD::FP_TO_SINT:
+ case ISD::FP_TO_UINT: {
+ bool IsStrict = N->isStrictFPOpcode();
+ assert(N->getValueType(0) == MVT::i32 && Subtarget.is64Bit() &&
+ "Unexpected custom legalisation");
+ SDValue Op0 = IsStrict ? N->getOperand(1) : N->getOperand(0);
+ RTLIB::Libcall LC;
+ if (N->getOpcode() == ISD::FP_TO_SINT ||
+ N->getOpcode() == ISD::STRICT_FP_TO_SINT)
+ LC = RTLIB::getFPTOSINT(Op0.getValueType(), N->getValueType(0));
+ else
+ LC = RTLIB::getFPTOUINT(Op0.getValueType(), N->getValueType(0));
+ MakeLibCallOptions CallOptions;
+ EVT OpVT = Op0.getValueType();
+ CallOptions.setTypeListBeforeSoften(OpVT, N->getValueType(0), true);
+ SDValue Chain = IsStrict ? N->getOperand(0) : SDValue();
+ SDValue Result;
+ std::tie(Result, Chain) =
+ makeLibCall(DAG, LC, N->getValueType(0), Op0, CallOptions, DL, Chain);
+ Results.push_back(Result);
+ if (IsStrict)
+ Results.push_back(Chain);
+ break;
+ }
case ISD::READCYCLECOUNTER: {
assert(!Subtarget.is64Bit() &&
"READCYCLECOUNTER only has custom type legalization on riscv32");
@@ -884,8 +953,8 @@ void RISCVTargetLowering::ReplaceNodeResults(SDNode *N,
SDValue RCW =
DAG.getNode(RISCVISD::READ_CYCLE_WIDE, DL, VTs, N->getOperand(0));
- Results.push_back(RCW);
- Results.push_back(RCW.getValue(1));
+ Results.push_back(
+ DAG.getNode(ISD::BUILD_PAIR, DL, MVT::i64, RCW, RCW.getValue(1)));
Results.push_back(RCW.getValue(2));
break;
}
@@ -1172,13 +1241,13 @@ static MachineBasicBlock *emitSplitF64Pseudo(MachineInstr &MI,
Register HiReg = MI.getOperand(1).getReg();
Register SrcReg = MI.getOperand(2).getReg();
const TargetRegisterClass *SrcRC = &RISCV::FPR64RegClass;
- int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
+ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
TII.storeRegToStackSlot(*BB, MI, SrcReg, MI.getOperand(2).isKill(), FI, SrcRC,
RI);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
- MachineMemOperand::MOLoad, 8, 8);
+ MachineMemOperand::MOLoad, 8, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::LW), LoReg)
.addFrameIndex(FI)
.addImm(0)
@@ -1204,11 +1273,11 @@ static MachineBasicBlock *emitBuildPairF64Pseudo(MachineInstr &MI,
Register LoReg = MI.getOperand(1).getReg();
Register HiReg = MI.getOperand(2).getReg();
const TargetRegisterClass *DstRC = &RISCV::FPR64RegClass;
- int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex();
+ int FI = MF.getInfo<RISCVMachineFunctionInfo>()->getMoveF64FrameIndex(MF);
MachineMemOperand *MMO =
MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, FI),
- MachineMemOperand::MOStore, 8, 8);
+ MachineMemOperand::MOStore, 8, Align(8));
BuildMI(*BB, MI, DL, TII.get(RISCV::SW))
.addReg(LoReg, getKillRegState(MI.getOperand(1).isKill()))
.addFrameIndex(FI)
@@ -1430,14 +1499,15 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
VA1.getLocVT(), CCValAssign::Full));
} else {
// Both halves must be passed on the stack, with proper alignment.
- unsigned StackAlign = std::max(XLenInBytes, ArgFlags1.getOrigAlign());
+ Align StackAlign =
+ std::max(Align(XLenInBytes), ArgFlags1.getNonZeroOrigAlign());
State.addLoc(
CCValAssign::getMem(VA1.getValNo(), VA1.getValVT(),
State.AllocateStack(XLenInBytes, StackAlign),
VA1.getLocVT(), CCValAssign::Full));
State.addLoc(CCValAssign::getMem(
- ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2,
- CCValAssign::Full));
+ ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
+ LocVT2, CCValAssign::Full));
return false;
}
@@ -1448,8 +1518,8 @@ static bool CC_RISCVAssign2XLen(unsigned XLen, CCState &State, CCValAssign VA1,
} else {
// The second half is passed via the stack, without additional alignment.
State.addLoc(CCValAssign::getMem(
- ValNo2, ValVT2, State.AllocateStack(XLenInBytes, XLenInBytes), LocVT2,
- CCValAssign::Full));
+ ValNo2, ValVT2, State.AllocateStack(XLenInBytes, Align(XLenInBytes)),
+ LocVT2, CCValAssign::Full));
}
return false;
@@ -1517,7 +1587,7 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
// original type is larger than 2*XLEN, so the register alignment rule does
// not apply.
unsigned TwoXLenInBytes = (2 * XLen) / 8;
- if (!IsFixed && ArgFlags.getOrigAlign() == TwoXLenInBytes &&
+ if (!IsFixed && ArgFlags.getNonZeroOrigAlign() == TwoXLenInBytes &&
DL.getTypeAllocSize(OrigTy) == TwoXLenInBytes) {
unsigned RegIdx = State.getFirstUnallocated(ArgGPRs);
// Skip 'odd' register if necessary.
@@ -1544,13 +1614,13 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
Register Reg = State.AllocateReg(ArgGPRs);
LocVT = MVT::i32;
if (!Reg) {
- unsigned StackOffset = State.AllocateStack(8, 8);
+ unsigned StackOffset = State.AllocateStack(8, Align(8));
State.addLoc(
CCValAssign::getMem(ValNo, ValVT, StackOffset, LocVT, LocInfo));
return false;
}
if (!State.AllocateReg(ArgGPRs))
- State.AllocateStack(4, 4);
+ State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
return false;
}
@@ -1590,7 +1660,8 @@ static bool CC_RISCV(const DataLayout &DL, RISCVABI::ABI ABI, unsigned ValNo,
Reg = State.AllocateReg(ArgFPR64s, ArgFPR32s);
else
Reg = State.AllocateReg(ArgGPRs);
- unsigned StackOffset = Reg ? 0 : State.AllocateStack(XLen / 8, XLen / 8);
+ unsigned StackOffset =
+ Reg ? 0 : State.AllocateStack(XLen / 8, Align(XLen / 8));
// If we reach this point and PendingLocs is non-empty, we must be at the
// end of a split argument that must be passed indirectly.
@@ -1645,7 +1716,7 @@ void RISCVTargetLowering::analyzeInputArgs(
RISCVABI::ABI ABI = MF.getSubtarget<RISCVSubtarget>().getTargetABI();
if (CC_RISCV(MF.getDataLayout(), ABI, i, ArgVT, ArgVT, CCValAssign::Full,
- ArgFlags, CCInfo, /*IsRet=*/true, IsRet, ArgTy)) {
+ ArgFlags, CCInfo, /*IsFixed=*/true, IsRet, ArgTy)) {
LLVM_DEBUG(dbgs() << "InputArg #" << i << " has unhandled type "
<< EVT(ArgVT).getEVTString() << '\n');
llvm_unreachable(nullptr);
@@ -1859,13 +1930,13 @@ static bool CC_RISCV_FastCC(unsigned ValNo, MVT ValVT, MVT LocVT,
}
if (LocVT == MVT::i32 || LocVT == MVT::f32) {
- unsigned Offset4 = State.AllocateStack(4, 4);
+ unsigned Offset4 = State.AllocateStack(4, Align(4));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset4, LocVT, LocInfo));
return false;
}
if (LocVT == MVT::i64 || LocVT == MVT::f64) {
- unsigned Offset5 = State.AllocateStack(8, 8);
+ unsigned Offset5 = State.AllocateStack(8, Align(8));
State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset5, LocVT, LocInfo));
return false;
}
@@ -2124,7 +2195,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall)
++NumTailCalls;
- else if (CLI.CS && CLI.CS.isMustTailCall())
+ else if (CLI.CB && CLI.CB->isMustTailCall())
report_fatal_error("failed to perform tail call elimination on a call "
"site marked musttail");
@@ -2140,17 +2211,17 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
- unsigned Align = Flags.getByValAlign();
+ Align Alignment = Flags.getNonZeroByValAlign();
- int FI = MF.getFrameInfo().CreateStackObject(Size, Align, /*isSS=*/false);
+ int FI =
+ MF.getFrameInfo().CreateStackObject(Size, Alignment, /*isSS=*/false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, DL, XLenVT);
- Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Align,
+ Chain = DAG.getMemcpy(Chain, DL, FIPtr, Arg, SizeNode, Alignment,
/*IsVolatile=*/false,
- /*AlwaysInline=*/false,
- IsTailCall, MachinePointerInfo(),
- MachinePointerInfo());
+ /*AlwaysInline=*/false, IsTailCall,
+ MachinePointerInfo(), MachinePointerInfo());
ByValArgs.push_back(FIPtr);
}
@@ -2325,6 +2396,7 @@ SDValue RISCVTargetLowering::LowerCall(CallLoweringInfo &CLI,
}
Chain = DAG.getNode(RISCVISD::CALL, DL, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
@@ -2494,6 +2566,10 @@ void RISCVTargetLowering::validateCCReservedRegs(
F, "Argument register required, but has been reserved."});
}
+bool RISCVTargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
+ return CI->isTailCall();
+}
+
const char *RISCVTargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((RISCVISD::NodeType)Opcode) {
case RISCVISD::FIRST_NUMBER:
@@ -2883,12 +2959,12 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
return Result;
}
-unsigned RISCVTargetLowering::getExceptionPointerRegister(
+Register RISCVTargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
return RISCV::X10;
}
-unsigned RISCVTargetLowering::getExceptionSelectorRegister(
+Register RISCVTargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
return RISCV::X11;
}
@@ -2903,6 +2979,26 @@ bool RISCVTargetLowering::shouldExtendTypeInLibCall(EVT Type) const {
return true;
}
+bool RISCVTargetLowering::decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const {
+ // Check integral scalar types.
+ if (VT.isScalarInteger()) {
+ // Do not perform the transformation on riscv32 with the M extension.
+ if (!Subtarget.is64Bit() && Subtarget.hasStdExtM())
+ return false;
+ if (auto *ConstNode = dyn_cast<ConstantSDNode>(C.getNode())) {
+ if (ConstNode->getAPIntValue().getBitWidth() > 8 * sizeof(int64_t))
+ return false;
+ int64_t Imm = ConstNode->getSExtValue();
+ if (isPowerOf2_64(Imm + 1) || isPowerOf2_64(Imm - 1) ||
+ isPowerOf2_64(1 - Imm) || isPowerOf2_64(-1 - Imm))
+ return true;
+ }
+ }
+
+ return false;
+}
+
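// Worked example for decomposeMulByConstant above (illustrative): constants
// one away from a power of two are cheap to expand without a hardware
// multiply, e.g. Imm == 9 satisfies isPowerOf2_64(Imm - 1), so x * 9 can
// become (x << 3) + x, and Imm == -7 satisfies isPowerOf2_64(1 - Imm), so
// x * -7 can become x - (x << 3). On riscv32 with the M extension the hook
// returns false and the plain mul is kept.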
#define GET_REGISTER_MATCHER
#include "RISCVGenAsmMatcher.inc"
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index b2ad75d67024..e420e879efc9 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -74,6 +74,8 @@ public:
bool isTruncateFree(EVT SrcVT, EVT DstVT) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
bool isSExtCheaperThanZExt(EVT SrcVT, EVT DstVT) const override;
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
bool hasBitPreservingFPLogic(EVT VT) const override;
@@ -114,6 +116,7 @@ public:
bool convertSetCCLogicToBitwiseLogic(EVT VT) const override {
return VT.isScalarInteger();
}
+ bool convertSelectOfConstantsToMath(EVT VT) const override { return true; }
bool shouldInsertFencesForAtomic(const Instruction *I) const override {
return isa<LoadInst>(I) || isa<StoreInst>(I);
@@ -127,6 +130,10 @@ public:
return ISD::SIGN_EXTEND;
}
+ ISD::NodeType getExtendForAtomicCmpSwapArg() const override {
+ return ISD::SIGN_EXTEND;
+ }
+
bool shouldExpandShift(SelectionDAG &DAG, SDNode *N) const override {
if (DAG.getMachineFunction().getFunction().hasMinSize())
return false;
@@ -137,12 +144,12 @@ public:
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
bool shouldExtendTypeInLibCall(EVT Type) const override;
@@ -154,13 +161,6 @@ public:
Register getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const override;
-private:
- void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
- const SmallVectorImpl<ISD::InputArg> &Ins,
- bool IsRet) const;
- void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
- const SmallVectorImpl<ISD::OutputArg> &Outs,
- bool IsRet, CallLoweringInfo *CLI) const;
// Lower incoming arguments, copy physregs into vregs
SDValue LowerFormalArguments(SDValue Chain, CallingConv::ID CallConv,
bool IsVarArg,
@@ -177,10 +177,38 @@ private:
SelectionDAG &DAG) const override;
SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
SmallVectorImpl<SDValue> &InVals) const override;
+
bool shouldConvertConstantLoadToIntImm(const APInt &Imm,
Type *Ty) const override {
return true;
}
+ bool mayBeEmittedAsTailCall(const CallInst *CI) const override;
+ bool shouldConsiderGEPOffsetSplit() const override { return true; }
+
+ bool decomposeMulByConstant(LLVMContext &Context, EVT VT,
+ SDValue C) const override;
+
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
+ Value *emitMaskedAtomicRMWIntrinsic(IRBuilder<> &Builder, AtomicRMWInst *AI,
+ Value *AlignedAddr, Value *Incr,
+ Value *Mask, Value *ShiftAmt,
+ AtomicOrdering Ord) const override;
+ TargetLowering::AtomicExpansionKind
+ shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
+ Value *emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder,
+ AtomicCmpXchgInst *CI,
+ Value *AlignedAddr, Value *CmpVal,
+ Value *NewVal, Value *Mask,
+ AtomicOrdering Ord) const override;
+
+private:
+ void analyzeInputArgs(MachineFunction &MF, CCState &CCInfo,
+ const SmallVectorImpl<ISD::InputArg> &Ins,
+ bool IsRet) const;
+ void analyzeOutputArgs(MachineFunction &MF, CCState &CCInfo,
+ const SmallVectorImpl<ISD::OutputArg> &Outs,
+ bool IsRet, CallLoweringInfo *CLI) const;
template <class NodeTy>
SDValue getAddr(NodeTy *N, SelectionDAG &DAG, bool IsLocal = true) const;
@@ -189,7 +217,6 @@ private:
bool UseGOT) const;
SDValue getDynamicTLSAddr(GlobalAddressSDNode *N, SelectionDAG &DAG) const;
- bool shouldConsiderGEPOffsetSplit() const override { return true; }
SDValue lowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerConstantPool(SDValue Op, SelectionDAG &DAG) const;
@@ -200,24 +227,12 @@ private:
SDValue lowerRETURNADDR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftLeftParts(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShiftRightParts(SDValue Op, SelectionDAG &DAG, bool IsSRA) const;
+ SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const;
bool isEligibleForTailCallOptimization(
CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
const SmallVector<CCValAssign, 16> &ArgLocs) const;
- TargetLowering::AtomicExpansionKind
- shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
- virtual Value *emitMaskedAtomicRMWIntrinsic(
- IRBuilder<> &Builder, AtomicRMWInst *AI, Value *AlignedAddr, Value *Incr,
- Value *Mask, Value *ShiftAmt, AtomicOrdering Ord) const override;
- TargetLowering::AtomicExpansionKind
- shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *CI) const override;
- virtual Value *
- emitMaskedAtomicCmpXchgIntrinsic(IRBuilder<> &Builder, AtomicCmpXchgInst *CI,
- Value *AlignedAddr, Value *CmpVal,
- Value *NewVal, Value *Mask,
- AtomicOrdering Ord) const override;
-
/// Generate error diagnostics if any register used by CC has been marked
/// reserved.
void validateCCReservedRegs(
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormats.td b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
index 7229ebfe1db0..a47945a6a515 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrFormats.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormats.td
@@ -49,6 +49,19 @@ def InstFormatCB : InstFormat<15>;
def InstFormatCJ : InstFormat<16>;
def InstFormatOther : InstFormat<17>;
+class RISCVVConstraint<bits<4> val> {
+ bits<4> Value = val;
+}
+def NoConstraint : RISCVVConstraint<0>;
+def WidenV : RISCVVConstraint<1>;
+def WidenW : RISCVVConstraint<2>;
+def WidenCvt : RISCVVConstraint<3>;
+def Narrow : RISCVVConstraint<4>;
+def Iota : RISCVVConstraint<5>;
+def SlideUp : RISCVVConstraint<6>;
+def Vrgather : RISCVVConstraint<7>;
+def Vcompress : RISCVVConstraint<8>;
+
// The following opcode names match those given in Table 19.1 in the
// RISC-V User-level ISA specification ("RISC-V base opcode map").
class RISCVOpcode<bits<7> val> {
@@ -71,6 +84,7 @@ def OPC_MSUB : RISCVOpcode<0b1000111>;
def OPC_NMSUB : RISCVOpcode<0b1001011>;
def OPC_NMADD : RISCVOpcode<0b1001111>;
def OPC_OP_FP : RISCVOpcode<0b1010011>;
+def OPC_OP_V : RISCVOpcode<0b1010111>;
def OPC_BRANCH : RISCVOpcode<0b1100011>;
def OPC_JALR : RISCVOpcode<0b1100111>;
def OPC_JAL : RISCVOpcode<0b1101111>;
@@ -99,11 +113,16 @@ class RVInst<dag outs, dag ins, string opcodestr, string argstr,
let Pattern = pattern;
let TSFlags{4-0} = format.Value;
+
+ // Defaults
+ RISCVVConstraint RVVConstraint = NoConstraint;
+ let TSFlags{8-5} = RVVConstraint.Value;
}
// Pseudo instructions
class Pseudo<dag outs, dag ins, list<dag> pattern, string opcodestr = "", string argstr = "">
- : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo> {
+ : RVInst<outs, ins, opcodestr, argstr, pattern, InstFormatPseudo>,
+ Sched<[]> {
let isPseudo = 1;
let isCodeGenOnly = 1;
}
diff --git a/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
new file mode 100644
index 000000000000..e5f154966ba6
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrFormatsV.td
@@ -0,0 +1,300 @@
+//===-- RISCVInstrFormatsV.td - RISCV V Instruction Formats --*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V V extension instruction formats.
+//
+//===----------------------------------------------------------------------===//
+
+class RISCVVFormat<bits<3> val> {
+ bits<3> Value = val;
+}
+def OPIVV : RISCVVFormat<0b000>;
+def OPFVV : RISCVVFormat<0b001>;
+def OPMVV : RISCVVFormat<0b010>;
+def OPIVI : RISCVVFormat<0b011>;
+def OPIVX : RISCVVFormat<0b100>;
+def OPFVF : RISCVVFormat<0b101>;
+def OPMVX : RISCVVFormat<0b110>;
+
+class RISCVMOP<bits<3> val> {
+ bits<3> Value = val;
+}
+def MOPLDUnitStrideU : RISCVMOP<0b000>;
+def MOPLDStridedU : RISCVMOP<0b010>;
+def MOPLDIndexedU : RISCVMOP<0b011>;
+def MOPLDUnitStrideS : RISCVMOP<0b100>;
+def MOPLDStridedS : RISCVMOP<0b110>;
+def MOPLDIndexedS : RISCVMOP<0b111>;
+
+def MOPSTUnitStride : RISCVMOP<0b000>;
+def MOPSTStrided : RISCVMOP<0b010>;
+def MOPSTIndexedOrder : RISCVMOP<0b011>;
+def MOPSTIndexedUnOrd : RISCVMOP<0b111>;
+
+class RISCVLSUMOP<bits<5> val> {
+ bits<5> Value = val;
+}
+def LUMOPUnitStride : RISCVLSUMOP<0b00000>;
+def LUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>;
+def LUMOPUnitStrideFF: RISCVLSUMOP<0b10000>;
+def SUMOPUnitStride : RISCVLSUMOP<0b00000>;
+def SUMOPUnitStrideWholeReg : RISCVLSUMOP<0b01000>;
+
+class RISCVWidth<bits<3> val> {
+ bits<3> Value = val;
+}
+def LSWidthVByte : RISCVWidth<0b000>;
+def LSWidthVHalf : RISCVWidth<0b101>;
+def LSWidthVWord : RISCVWidth<0b110>;
+def LSWidthVSEW : RISCVWidth<0b111>;
+
+class RVInstSetVLi<dag outs, dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatI> {
+ bits<5> rs1;
+ bits<5> rd;
+ bits<11> vtypei;
+
+ let Inst{31} = 0;
+ let Inst{30-20} = vtypei;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = 0b111;
+ let Inst{11-7} = rd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Defs = [VTYPE, VL];
+}
+
+class RVInstSetVL<dag outs, dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> rd;
+
+ let Inst{31} = 1;
+ let Inst{30-25} = 0b000000;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = 0b111;
+ let Inst{11-7} = rd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Defs = [VTYPE, VL];
+}
+
+class RVInstVV<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
+ string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> vs1;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-26} = funct6;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = vs1;
+ let Inst{14-12} = opv.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVX<bits<6> funct6, RISCVVFormat opv, dag outs, dag ins,
+ string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> rs1;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-26} = funct6;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = opv.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstV2<bits<6> funct6, bits<5> vs2, RISCVVFormat opv, dag outs, dag ins,
+ string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> rs1;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-26} = funct6;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = opv.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstIVI<bits<6> funct6, dag outs, dag ins, string opcodestr,
+ string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> imm;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-26} = funct6;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = imm;
+ let Inst{14-12} = 0b011;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstV<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, dag outs,
+ dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-26} = funct6;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = vs1;
+ let Inst{14-12} = opv.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_OP_V.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVLU<bits<3> nf, RISCVMOP mop, RISCVLSUMOP lumop,
+ RISCVWidth width, dag outs, dag ins, string opcodestr,
+ string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> rs1;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-29} = nf;
+ let Inst{28-26} = mop.Value;
+ let Inst{25} = vm;
+ let Inst{24-20} = lumop.Value;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_LOAD_FP.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVLS<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+ dag outs, dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-29} = nf;
+ let Inst{28-26} = mop.Value;
+ let Inst{25} = vm;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_LOAD_FP.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVLX<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+ dag outs, dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> rs1;
+ bits<5> vd;
+ bit vm;
+
+ let Inst{31-29} = nf;
+ let Inst{28-26} = mop.Value;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width.Value;
+ let Inst{11-7} = vd;
+ let Opcode = OPC_LOAD_FP.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVSU<bits<3> nf, RISCVMOP mop, RISCVLSUMOP sumop,
+ RISCVWidth width, dag outs, dag ins, string opcodestr,
+ string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> rs1;
+ bits<5> vs3;
+ bit vm;
+
+ let Inst{31-29} = nf;
+ let Inst{28-26} = mop.Value;
+ let Inst{25} = vm;
+ let Inst{24-20} = sumop.Value;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width.Value;
+ let Inst{11-7} = vs3;
+ let Opcode = OPC_STORE_FP.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVSS<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+ dag outs, dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> rs2;
+ bits<5> rs1;
+ bits<5> vs3;
+ bit vm;
+
+ let Inst{31-29} = nf;
+ let Inst{28-26} = mop.Value;
+ let Inst{25} = vm;
+ let Inst{24-20} = rs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width.Value;
+ let Inst{11-7} = vs3;
+ let Opcode = OPC_STORE_FP.Value;
+
+ let Uses = [VTYPE, VL];
+}
+
+class RVInstVSX<bits<3> nf, RISCVMOP mop, RISCVWidth width,
+ dag outs, dag ins, string opcodestr, string argstr>
+ : RVInst<outs, ins, opcodestr, argstr, [], InstFormatR> {
+ bits<5> vs2;
+ bits<5> rs1;
+ bits<5> vs3;
+ bit vm;
+
+ let Inst{31-29} = nf;
+ let Inst{28-26} = mop.Value;
+ let Inst{25} = vm;
+ let Inst{24-20} = vs2;
+ let Inst{19-15} = rs1;
+ let Inst{14-12} = width.Value;
+ let Inst{11-7} = vs3;
+ let Opcode = OPC_STORE_FP.Value;
+
+ let Uses = [VTYPE, VL];
+}
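[Editor's note] The OP-V arithmetic formats in this new file all share the same field layout (funct6 | vm | vs2 | vs1/rs1/imm | funct3 | vd | opcode). As an illustration only, here is a small C++ sketch that packs a 32-bit word the way RVInstVV lays it out; the field values in main are arbitrary samples, not a real instruction:

// Illustrative only: packs a word per RVInstVV (Inst{31-26}=funct6,
// Inst{25}=vm, Inst{24-20}=vs2, Inst{19-15}=vs1, Inst{14-12}=funct3,
// Inst{11-7}=vd, Inst{6-0}=opcode).
#include <cstdint>
#include <cstdio>

static uint32_t encodeRVInstVV(unsigned Funct6, unsigned VM, unsigned Vs2,
                               unsigned Vs1, unsigned Funct3, unsigned Vd,
                               unsigned Opcode) {
  return (Funct6 & 0x3f) << 26 | (VM & 0x1) << 25 | (Vs2 & 0x1f) << 20 |
         (Vs1 & 0x1f) << 15 | (Funct3 & 0x7) << 12 | (Vd & 0x1f) << 7 |
         (Opcode & 0x7f);
}

int main() {
  // OPC_OP_V is 0b1010111 per the opcode table; registers are sample values.
  uint32_t W = encodeRVInstVV(/*funct6=*/0b000000, /*vm=*/1, /*vs2=*/2,
                              /*vs1=*/3, /*funct3=*/0b000 /*OPIVV*/,
                              /*vd=*/1, /*opcode=*/0b1010111);
  std::printf("0x%08x\n", W);
  return 0;
}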
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
index 3b416ce3d3f4..d39ec505127c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp
@@ -24,14 +24,14 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
+using namespace llvm;
+
#define GEN_CHECK_COMPRESS_INSTR
#include "RISCVGenCompressInstEmitter.inc"
#define GET_INSTRINFO_CTOR_DTOR
#include "RISCVGenInstrInfo.inc"
-using namespace llvm;
-
RISCVInstrInfo::RISCVInstrInfo(RISCVSubtarget &STI)
: RISCVGenInstrInfo(RISCV::ADJCALLSTACKDOWN, RISCV::ADJCALLSTACKUP),
STI(STI) {}
@@ -76,10 +76,10 @@ unsigned RISCVInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
break;
}
- if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
- MI.getOperand(1).getImm() == 0) {
- FrameIndex = MI.getOperand(0).getIndex();
- return MI.getOperand(2).getReg();
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
}
return 0;
@@ -112,7 +112,7 @@ void RISCVInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned SrcReg, bool IsKill, int FI,
+ Register SrcReg, bool IsKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -139,7 +139,7 @@ void RISCVInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void RISCVInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DstReg, int FI,
+ Register DstReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -342,7 +342,7 @@ unsigned RISCVInstrInfo::insertBranch(
*BytesAdded = 0;
// Shouldn't be a fall through.
- assert(TBB && "InsertBranch must not be told to insert a fallthrough");
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
assert((Cond.size() == 3 || Cond.size() == 0) &&
"RISCV branch conditions have two components!");
@@ -471,14 +471,38 @@ unsigned RISCVInstrInfo::getInstSizeInBytes(const MachineInstr &MI) const {
case TargetOpcode::KILL:
case TargetOpcode::DBG_VALUE:
return 0;
+ // These values are determined based on RISCVExpandAtomicPseudoInsts,
+ // RISCVExpandPseudoInsts and RISCVMCCodeEmitter, depending on where the
+ // pseudos are expanded.
case RISCV::PseudoCALLReg:
case RISCV::PseudoCALL:
+ case RISCV::PseudoJump:
case RISCV::PseudoTAIL:
case RISCV::PseudoLLA:
case RISCV::PseudoLA:
case RISCV::PseudoLA_TLS_IE:
case RISCV::PseudoLA_TLS_GD:
return 8;
+ case RISCV::PseudoAtomicLoadNand32:
+ case RISCV::PseudoAtomicLoadNand64:
+ return 20;
+ case RISCV::PseudoMaskedAtomicSwap32:
+ case RISCV::PseudoMaskedAtomicLoadAdd32:
+ case RISCV::PseudoMaskedAtomicLoadSub32:
+ return 28;
+ case RISCV::PseudoMaskedAtomicLoadNand32:
+ return 32;
+ case RISCV::PseudoMaskedAtomicLoadMax32:
+ case RISCV::PseudoMaskedAtomicLoadMin32:
+ return 44;
+ case RISCV::PseudoMaskedAtomicLoadUMax32:
+ case RISCV::PseudoMaskedAtomicLoadUMin32:
+ return 36;
+ case RISCV::PseudoCmpXchg32:
+ case RISCV::PseudoCmpXchg64:
+ return 16;
+ case RISCV::PseudoMaskedCmpXchg32:
+ return 32;
case TargetOpcode::INLINEASM:
case TargetOpcode::INLINEASM_BR: {
const MachineFunction &MF = *MI.getParent()->getParent();
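[Editor's note] The size constants in the hunk above come from the pseudo expansions; assuming each instruction in an expanded sequence is an uncompressed 4-byte RV instruction, they are simply 4 bytes per expanded instruction. A rough sanity-check sketch under that assumption, reading the table backwards (instruction counts are derived from the table, not from the patch itself):

// Back-of-the-envelope check, assuming 4 bytes per expanded instruction.
#include <cassert>

constexpr unsigned expandedBytes(unsigned NumInsts) { return NumInsts * 4; }

int main() {
  assert(expandedBytes(5) == 20); // matches PseudoAtomicLoadNand32 above
  assert(expandedBytes(4) == 16); // matches PseudoCmpXchg32/64 above
  assert(expandedBytes(8) == 32); // matches PseudoMaskedCmpXchg32 above
  return 0;
}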
@@ -777,6 +801,8 @@ void RISCVInstrInfo::buildOutlinedFrame(
}
}
+ MBB.addLiveIn(RISCV::X5);
+
// Add in a return instruction to the end of the outlined frame.
MBB.insert(MBB.end(), BuildMI(MF, DebugLoc(), get(RISCV::JALR))
.addReg(RISCV::X0, RegState::Define)
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
index 625b61875133..21bc508cdc9c 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h
@@ -38,13 +38,13 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock::iterator MBBI, Register SrcReg,
bool IsKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, unsigned DstReg,
+ MachineBasicBlock::iterator MBBI, Register DstReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -133,5 +133,24 @@ public:
protected:
const RISCVSubtarget &STI;
};
-}
+
+namespace RISCV {
+// Match with the definitions in RISCVInstrFormatsV.td
+enum RVVConstraintType {
+ NoConstraint = 0,
+ WidenV = 1,
+ WidenW = 2,
+ WidenCvt = 3,
+ Narrow = 4,
+ Iota = 5,
+ SlideUp = 6,
+ Vrgather = 7,
+ Vcompress = 8,
+
+ ConstraintOffset = 5,
+ ConstraintMask = 0b1111
+};
+} // end namespace RISCV
+
+} // end namespace llvm
#endif
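[Editor's note] The TSFlags{8-5} assignment in RISCVInstrFormats.td and the ConstraintOffset/ConstraintMask values above are two halves of the same encoding. A hypothetical helper (not part of the patch) showing how a consumer could recover the constraint from an instruction's TSFlags:

// Hypothetical helper: extracts the RVVConstraintType packed into TSFlags{8-5}.
#include <cstdint>

namespace RISCV {
enum RVVConstraintType {
  NoConstraint = 0, WidenV = 1, WidenW = 2, WidenCvt = 3, Narrow = 4,
  Iota = 5, SlideUp = 6, Vrgather = 7, Vcompress = 8,
  ConstraintOffset = 5,
  ConstraintMask = 0b1111
};
} // namespace RISCV

static RISCV::RVVConstraintType getRVVConstraint(uint64_t TSFlags) {
  return static_cast<RISCV::RVVConstraintType>(
      (TSFlags >> RISCV::ConstraintOffset) & RISCV::ConstraintMask);
}

int main() {
  // Format 17 (InstFormatOther) in bits 4-0, WidenV (1) in bits 8-5.
  uint64_t TSFlags = (1u << 5) | 17u;
  return getRVVConstraint(TSFlags) == RISCV::WidenV ? 0 : 1;
}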
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.td b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
index 8e9ad4965583..b9483062ddeb 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.td
@@ -144,6 +144,20 @@ def simm12 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<12>(Imm);}]> {
let OperandNamespace = "RISCVOp";
}
+// A 12-bit signed immediate plus one, i.e. an immediate in the range [-2047, 2048].
+def simm12_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
+ [{return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;}]> {
+ let ParserMatchClass = SImmAsmOperand<12>;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeSImmOperand<12>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return (isInt<12>(Imm) && Imm != -2048) || Imm == 2048;
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
// A 13-bit signed immediate where the least significant bit is zero.
def simm13_lsb0 : Operand<OtherVT> {
let ParserMatchClass = SImmAsmOperand<13, "Lsb0">;
@@ -222,6 +236,18 @@ def call_symbol : Operand<XLenVT> {
let ParserMatchClass = CallSymbol;
}
+def PseudoJumpSymbol : AsmOperandClass {
+ let Name = "PseudoJumpSymbol";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidPseudoJumpSymbol";
+ let ParserMethod = "parsePseudoJumpSymbol";
+}
+
+// A bare symbol used for pseudo jumps only.
+def pseudo_jump_symbol : Operand<XLenVT> {
+ let ParserMatchClass = PseudoJumpSymbol;
+}
+
def TPRelAddSymbol : AsmOperandClass {
let Name = "TPRelAddSymbol";
let RenderMethod = "addImmOperands";
@@ -284,6 +310,12 @@ def HI20 : SDNodeXForm<imm, [{
SDLoc(N), N->getValueType(0));
}]>;
+// Return the negation of an immediate value.
+def NegImm : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(-N->getSExtValue(), SDLoc(N),
+ N->getValueType(0));
+}]>;
+
//===----------------------------------------------------------------------===//
// Instruction Formats
//===----------------------------------------------------------------------===//
@@ -298,7 +330,8 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class BranchCC_rri<bits<3> funct3, string opcodestr>
: RVInstB<funct3, OPC_BRANCH, (outs),
(ins GPR:$rs1, GPR:$rs2, simm13_lsb0:$imm12),
- opcodestr, "$rs1, $rs2, $imm12"> {
+ opcodestr, "$rs1, $rs2, $imm12">,
+ Sched<[WriteJmp, ReadJmp, ReadJmp]> {
let isBranch = 1;
let isTerminator = 1;
}
@@ -320,13 +353,15 @@ class Store_rri<bits<3> funct3, string opcodestr>
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class ALU_ri<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_OP_IMM, (outs GPR:$rd), (ins GPR:$rs1, simm12:$imm12),
- opcodestr, "$rd, $rs1, $imm12">;
+ opcodestr, "$rd, $rs1, $imm12">,
+ Sched<[WriteIALU, ReadIALU]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class Shift_ri<bit arithshift, bits<3> funct3, string opcodestr>
: RVInstIShift<arithshift, funct3, OPC_OP_IMM, (outs GPR:$rd),
(ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
- "$rd, $rs1, $shamt">;
+ "$rd, $rs1, $shamt">,
+ Sched<[WriteShift, ReadShift]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class ALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
@@ -336,19 +371,20 @@ class ALU_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
class CSR_ir<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_SYSTEM, (outs GPR:$rd), (ins csr_sysreg:$imm12, GPR:$rs1),
- opcodestr, "$rd, $imm12, $rs1">;
+ opcodestr, "$rd, $imm12, $rs1">, Sched<[WriteCSR, ReadCSR]>;
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
class CSR_ii<bits<3> funct3, string opcodestr>
: RVInstI<funct3, OPC_SYSTEM, (outs GPR:$rd),
(ins csr_sysreg:$imm12, uimm5:$rs1),
- opcodestr, "$rd, $imm12, $rs1">;
+ opcodestr, "$rd, $imm12, $rs1">, Sched<[WriteCSR]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class ShiftW_ri<bit arithshift, bits<3> funct3, string opcodestr>
: RVInstIShiftW<arithshift, funct3, OPC_OP_IMM_32, (outs GPR:$rd),
(ins GPR:$rs1, uimm5:$shamt), opcodestr,
- "$rd, $rs1, $shamt">;
+ "$rd, $rs1, $shamt">,
+ Sched<[WriteShift32, ReadShift32]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class ALUW_rr<bits<7> funct7, bits<3> funct3, string opcodestr>
@@ -367,19 +403,20 @@ class Priv<string opcodestr, bits<7> funct7>
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
let isReMaterializable = 1, isAsCheapAsAMove = 1 in
def LUI : RVInstU<OPC_LUI, (outs GPR:$rd), (ins uimm20_lui:$imm20),
- "lui", "$rd, $imm20">;
+ "lui", "$rd, $imm20">, Sched<[WriteIALU]>;
def AUIPC : RVInstU<OPC_AUIPC, (outs GPR:$rd), (ins uimm20_auipc:$imm20),
- "auipc", "$rd, $imm20">;
+ "auipc", "$rd, $imm20">, Sched<[WriteIALU]>;
let isCall = 1 in
def JAL : RVInstJ<OPC_JAL, (outs GPR:$rd), (ins simm21_lsb0_jal:$imm20),
- "jal", "$rd, $imm20">;
+ "jal", "$rd, $imm20">, Sched<[WriteJal]>;
let isCall = 1 in
def JALR : RVInstI<0b000, OPC_JALR, (outs GPR:$rd),
(ins GPR:$rs1, simm12:$imm12),
- "jalr", "$rd, ${imm12}(${rs1})">;
+ "jalr", "$rd, ${imm12}(${rs1})">,
+ Sched<[WriteJalr, ReadJalr]>;
} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
def BEQ : BranchCC_rri<0b000, "beq">;
@@ -389,15 +426,15 @@ def BGE : BranchCC_rri<0b101, "bge">;
def BLTU : BranchCC_rri<0b110, "bltu">;
def BGEU : BranchCC_rri<0b111, "bgeu">;
-def LB : Load_ri<0b000, "lb">;
-def LH : Load_ri<0b001, "lh">;
-def LW : Load_ri<0b010, "lw">;
-def LBU : Load_ri<0b100, "lbu">;
-def LHU : Load_ri<0b101, "lhu">;
+def LB : Load_ri<0b000, "lb">, Sched<[WriteLDB, ReadMemBase]>;
+def LH : Load_ri<0b001, "lh">, Sched<[WriteLDH, ReadMemBase]>;
+def LW : Load_ri<0b010, "lw">, Sched<[WriteLDW, ReadMemBase]>;
+def LBU : Load_ri<0b100, "lbu">, Sched<[WriteLDB, ReadMemBase]>;
+def LHU : Load_ri<0b101, "lhu">, Sched<[WriteLDH, ReadMemBase]>;
-def SB : Store_rri<0b000, "sb">;
-def SH : Store_rri<0b001, "sh">;
-def SW : Store_rri<0b010, "sw">;
+def SB : Store_rri<0b000, "sb">, Sched<[WriteSTB, ReadStoreData, ReadMemBase]>;
+def SH : Store_rri<0b001, "sh">, Sched<[WriteSTH, ReadStoreData, ReadMemBase]>;
+def SW : Store_rri<0b010, "sw">, Sched<[WriteSTW, ReadStoreData, ReadMemBase]>;
// ADDI isn't always rematerializable, but isReMaterializable will be used as
// a hint which is verified in isReallyTriviallyReMaterializable.
@@ -418,21 +455,21 @@ def SLLI : Shift_ri<0, 0b001, "slli">;
def SRLI : Shift_ri<0, 0b101, "srli">;
def SRAI : Shift_ri<1, 0b101, "srai">;
-def ADD : ALU_rr<0b0000000, 0b000, "add">;
-def SUB : ALU_rr<0b0100000, 0b000, "sub">;
-def SLL : ALU_rr<0b0000000, 0b001, "sll">;
-def SLT : ALU_rr<0b0000000, 0b010, "slt">;
-def SLTU : ALU_rr<0b0000000, 0b011, "sltu">;
-def XOR : ALU_rr<0b0000000, 0b100, "xor">;
-def SRL : ALU_rr<0b0000000, 0b101, "srl">;
-def SRA : ALU_rr<0b0100000, 0b101, "sra">;
-def OR : ALU_rr<0b0000000, 0b110, "or">;
-def AND : ALU_rr<0b0000000, 0b111, "and">;
+def ADD : ALU_rr<0b0000000, 0b000, "add">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SUB : ALU_rr<0b0100000, 0b000, "sub">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SLL : ALU_rr<0b0000000, 0b001, "sll">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SLT : ALU_rr<0b0000000, 0b010, "slt">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SLTU : ALU_rr<0b0000000, 0b011, "sltu">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def XOR : ALU_rr<0b0000000, 0b100, "xor">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SRL : ALU_rr<0b0000000, 0b101, "srl">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def SRA : ALU_rr<0b0100000, 0b101, "sra">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def OR : ALU_rr<0b0000000, 0b110, "or">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def AND : ALU_rr<0b0000000, 0b111, "and">, Sched<[WriteIALU, ReadIALU, ReadIALU]>;
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
def FENCE : RVInstI<0b000, OPC_MISC_MEM, (outs),
(ins fencearg:$pred, fencearg:$succ),
- "fence", "$pred, $succ"> {
+ "fence", "$pred, $succ">, Sched<[]> {
bits<4> pred;
bits<4> succ;
@@ -441,25 +478,26 @@ def FENCE : RVInstI<0b000, OPC_MISC_MEM, (outs),
let imm12 = {0b0000,pred,succ};
}
-def FENCE_TSO : RVInstI<0b000, OPC_MISC_MEM, (outs), (ins), "fence.tso", ""> {
+def FENCE_TSO : RVInstI<0b000, OPC_MISC_MEM, (outs), (ins), "fence.tso", "">, Sched<[]> {
let rs1 = 0;
let rd = 0;
let imm12 = {0b1000,0b0011,0b0011};
}
-def FENCE_I : RVInstI<0b001, OPC_MISC_MEM, (outs), (ins), "fence.i", ""> {
+def FENCE_I : RVInstI<0b001, OPC_MISC_MEM, (outs), (ins), "fence.i", "">, Sched<[]> {
let rs1 = 0;
let rd = 0;
let imm12 = 0;
}
-def ECALL : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ecall", ""> {
+def ECALL : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ecall", "">, Sched<[WriteJmp]> {
let rs1 = 0;
let rd = 0;
let imm12 = 0;
}
-def EBREAK : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ebreak", ""> {
+def EBREAK : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ebreak", "">,
+ Sched<[]> {
let rs1 = 0;
let rd = 0;
let imm12 = 1;
@@ -468,7 +506,8 @@ def EBREAK : RVInstI<0b000, OPC_SYSTEM, (outs), (ins), "ebreak", ""> {
// This is a de facto standard (as set by GNU binutils) 32-bit unimplemented
// instruction (i.e., it should always trap, if your implementation has invalid
// instruction traps).
-def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", ""> {
+def UNIMP : RVInstI<0b001, OPC_SYSTEM, (outs), (ins), "unimp", "">,
+ Sched<[]> {
let rs1 = 0;
let rd = 0;
let imm12 = 0b110000000000;
@@ -486,24 +525,30 @@ def CSRRCI : CSR_ii<0b111, "csrrci">;
/// RV64I instructions
let Predicates = [IsRV64] in {
-def LWU : Load_ri<0b110, "lwu">;
-def LD : Load_ri<0b011, "ld">;
-def SD : Store_rri<0b011, "sd">;
+def LWU : Load_ri<0b110, "lwu">, Sched<[WriteLDWU, ReadMemBase]>;
+def LD : Load_ri<0b011, "ld">, Sched<[WriteLDD, ReadMemBase]>;
+def SD : Store_rri<0b011, "sd">, Sched<[WriteSTD, ReadStoreData, ReadMemBase]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def ADDIW : RVInstI<0b000, OPC_OP_IMM_32, (outs GPR:$rd),
(ins GPR:$rs1, simm12:$imm12),
- "addiw", "$rd, $rs1, $imm12">;
+ "addiw", "$rd, $rs1, $imm12">,
+ Sched<[WriteIALU32, ReadIALU32]>;
def SLLIW : ShiftW_ri<0, 0b001, "slliw">;
def SRLIW : ShiftW_ri<0, 0b101, "srliw">;
def SRAIW : ShiftW_ri<1, 0b101, "sraiw">;
-def ADDW : ALUW_rr<0b0000000, 0b000, "addw">;
-def SUBW : ALUW_rr<0b0100000, 0b000, "subw">;
-def SLLW : ALUW_rr<0b0000000, 0b001, "sllw">;
-def SRLW : ALUW_rr<0b0000000, 0b101, "srlw">;
-def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">;
+def ADDW : ALUW_rr<0b0000000, 0b000, "addw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def SUBW : ALUW_rr<0b0100000, 0b000, "subw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def SLLW : ALUW_rr<0b0000000, 0b001, "sllw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def SRLW : ALUW_rr<0b0000000, 0b101, "srlw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
} // Predicates = [IsRV64]
//===----------------------------------------------------------------------===//
@@ -511,26 +556,26 @@ def SRAW : ALUW_rr<0b0100000, 0b101, "sraw">;
//===----------------------------------------------------------------------===//
let isBarrier = 1, isReturn = 1, isTerminator = 1 in {
-def URET : Priv<"uret", 0b0000000> {
+def URET : Priv<"uret", 0b0000000>, Sched<[]> {
let rd = 0;
let rs1 = 0;
let rs2 = 0b00010;
}
-def SRET : Priv<"sret", 0b0001000> {
+def SRET : Priv<"sret", 0b0001000>, Sched<[]> {
let rd = 0;
let rs1 = 0;
let rs2 = 0b00010;
}
-def MRET : Priv<"mret", 0b0011000> {
+def MRET : Priv<"mret", 0b0011000>, Sched<[]> {
let rd = 0;
let rs1 = 0;
let rs2 = 0b00010;
}
} // isBarrier = 1, isReturn = 1, isTerminator = 1
-def WFI : Priv<"wfi", 0b0001000> {
+def WFI : Priv<"wfi", 0b0001000>, Sched<[]> {
let rd = 0;
let rs1 = 0;
let rs2 = 0b00101;
@@ -539,9 +584,21 @@ def WFI : Priv<"wfi", 0b0001000> {
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
def SFENCE_VMA : RVInstR<0b0001001, 0b000, OPC_SYSTEM, (outs),
(ins GPR:$rs1, GPR:$rs2),
- "sfence.vma", "$rs1, $rs2"> {
+ "sfence.vma", "$rs1, $rs2">, Sched<[]> {
+ let rd = 0;
+}
+
+//===----------------------------------------------------------------------===//
+// Debug instructions
+//===----------------------------------------------------------------------===//
+
+let isBarrier = 1, isReturn = 1, isTerminator = 1 in {
+def DRET : Priv<"dret", 0b0111101>, Sched<[]> {
let rd = 0;
+ let rs1 = 0;
+ let rs2 = 0b10010;
}
+} // isBarrier = 1, isReturn = 1, isTerminator = 1
//===----------------------------------------------------------------------===//
// Assembler Pseudo Instructions (User-Level ISA, Version 2.2, Chapter 20)
@@ -739,7 +796,7 @@ def : MnemonicAlias<"sbreak", "ebreak">;
//
// Naming convention: For 'generic' pattern classes, we use the naming
// convention PatTy1Ty2. For pattern classes which offer a more complex
-// expension, prefix the class name, e.g. BccPat.
+// expansion, prefix the class name, e.g. BccPat.
//===----------------------------------------------------------------------===//
/// Generic pattern classes
@@ -832,12 +889,12 @@ def : PatGprSimm12<setult, SLTIU>;
// handled by a RISC-V instruction.
def : Pat<(seteq GPR:$rs1, 0), (SLTIU GPR:$rs1, 1)>;
def : Pat<(seteq GPR:$rs1, GPR:$rs2), (SLTIU (XOR GPR:$rs1, GPR:$rs2), 1)>;
-def : Pat<(seteq GPR:$rs1, simm12:$imm12),
- (SLTIU (XORI GPR:$rs1, simm12:$imm12), 1)>;
+def : Pat<(seteq GPR:$rs1, simm12_plus1:$imm12),
+ (SLTIU (ADDI GPR:$rs1, (NegImm simm12_plus1:$imm12)), 1)>;
def : Pat<(setne GPR:$rs1, 0), (SLTU X0, GPR:$rs1)>;
def : Pat<(setne GPR:$rs1, GPR:$rs2), (SLTU X0, (XOR GPR:$rs1, GPR:$rs2))>;
-def : Pat<(setne GPR:$rs1, simm12:$imm12),
- (SLTU X0, (XORI GPR:$rs1, simm12:$imm12))>;
+def : Pat<(setne GPR:$rs1, simm12_plus1:$imm12),
+ (SLTU X0, (ADDI GPR:$rs1, (NegImm simm12_plus1:$imm12)))>;
def : Pat<(setugt GPR:$rs1, GPR:$rs2), (SLTU GPR:$rs2, GPR:$rs1)>;
def : Pat<(setuge GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs1, GPR:$rs2), 1)>;
def : Pat<(setule GPR:$rs1, GPR:$rs2), (XORI (SLTU GPR:$rs2, GPR:$rs1), 1)>;
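[Editor's note] The seteq/setne patterns above switch from XORI to ADDI with the negated immediate, so the operand class must accept exactly those immediates whose negation fits ADDI's 12-bit signed field; that is why simm12_plus1 covers [-2047, 2048] rather than [-2048, 2047]. A small sketch of that equivalence (plain C++; isInt12 is a local stand-in for llvm::isInt<12>):

// simm12_plus1 accepts Imm iff -Imm is a legal simm12, so
// (seteq x, Imm) can become (sltiu (addi x, -Imm), 1).
#include <cassert>
#include <cstdint>

static bool isInt12(int64_t V) { return V >= -2048 && V <= 2047; }

// Mirrors the simm12_plus1 ImmLeaf predicate.
static bool isSImm12Plus1(int64_t Imm) {
  return (isInt12(Imm) && Imm != -2048) || Imm == 2048;
}

int main() {
  for (int64_t Imm = -5000; Imm <= 5000; ++Imm)
    assert(isSImm12Plus1(Imm) == isInt12(-Imm)); // both ranges are [-2047, 2048]
  return 0;
}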
@@ -955,6 +1012,12 @@ def : Pat<(riscv_tail (iPTR tglobaladdr:$dst)),
def : Pat<(riscv_tail (iPTR texternalsym:$dst)),
(PseudoTAIL texternalsym:$dst)>;
+let isCall = 0, isBarrier = 0, isCodeGenOnly = 0, hasSideEffects = 0,
+ mayStore = 0, mayLoad = 0 in
+def PseudoJump : Pseudo<(outs GPR:$rd), (ins pseudo_jump_symbol:$target), []> {
+ let AsmString = "jump\t$target, $rd";
+}
+
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0,
isAsmParserOnly = 1 in
def PseudoLLA : Pseudo<(outs GPR:$dst), (ins bare_symbol:$src), [],
@@ -1113,3 +1176,5 @@ include "RISCVInstrInfoA.td"
include "RISCVInstrInfoF.td"
include "RISCVInstrInfoD.td"
include "RISCVInstrInfoC.td"
+include "RISCVInstrInfoB.td"
+include "RISCVInstrInfoV.td"
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
index 7321f4bd9d2f..7fce37519b93 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoA.td
@@ -77,31 +77,51 @@ multiclass AtomicStPat<PatFrag StoreOp, RVInst Inst, RegisterClass StTy> {
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtA] in {
-defm LR_W : LR_r_aq_rl<0b010, "lr.w">;
-defm SC_W : AMO_rr_aq_rl<0b00011, 0b010, "sc.w">;
-defm AMOSWAP_W : AMO_rr_aq_rl<0b00001, 0b010, "amoswap.w">;
-defm AMOADD_W : AMO_rr_aq_rl<0b00000, 0b010, "amoadd.w">;
-defm AMOXOR_W : AMO_rr_aq_rl<0b00100, 0b010, "amoxor.w">;
-defm AMOAND_W : AMO_rr_aq_rl<0b01100, 0b010, "amoand.w">;
-defm AMOOR_W : AMO_rr_aq_rl<0b01000, 0b010, "amoor.w">;
-defm AMOMIN_W : AMO_rr_aq_rl<0b10000, 0b010, "amomin.w">;
-defm AMOMAX_W : AMO_rr_aq_rl<0b10100, 0b010, "amomax.w">;
-defm AMOMINU_W : AMO_rr_aq_rl<0b11000, 0b010, "amominu.w">;
-defm AMOMAXU_W : AMO_rr_aq_rl<0b11100, 0b010, "amomaxu.w">;
+defm LR_W : LR_r_aq_rl<0b010, "lr.w">, Sched<[WriteAtomicLDW, ReadAtomicLDW]>;
+defm SC_W : AMO_rr_aq_rl<0b00011, 0b010, "sc.w">,
+ Sched<[WriteAtomicSTW, ReadAtomicSTW, ReadAtomicSTW]>;
+defm AMOSWAP_W : AMO_rr_aq_rl<0b00001, 0b010, "amoswap.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOADD_W : AMO_rr_aq_rl<0b00000, 0b010, "amoadd.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOXOR_W : AMO_rr_aq_rl<0b00100, 0b010, "amoxor.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOAND_W : AMO_rr_aq_rl<0b01100, 0b010, "amoand.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOOR_W : AMO_rr_aq_rl<0b01000, 0b010, "amoor.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOMIN_W : AMO_rr_aq_rl<0b10000, 0b010, "amomin.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOMAX_W : AMO_rr_aq_rl<0b10100, 0b010, "amomax.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOMINU_W : AMO_rr_aq_rl<0b11000, 0b010, "amominu.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
+defm AMOMAXU_W : AMO_rr_aq_rl<0b11100, 0b010, "amomaxu.w">,
+ Sched<[WriteAtomicW, ReadAtomicWA, ReadAtomicWD]>;
} // Predicates = [HasStdExtA]
let Predicates = [HasStdExtA, IsRV64] in {
-defm LR_D : LR_r_aq_rl<0b011, "lr.d">;
-defm SC_D : AMO_rr_aq_rl<0b00011, 0b011, "sc.d">;
-defm AMOSWAP_D : AMO_rr_aq_rl<0b00001, 0b011, "amoswap.d">;
-defm AMOADD_D : AMO_rr_aq_rl<0b00000, 0b011, "amoadd.d">;
-defm AMOXOR_D : AMO_rr_aq_rl<0b00100, 0b011, "amoxor.d">;
-defm AMOAND_D : AMO_rr_aq_rl<0b01100, 0b011, "amoand.d">;
-defm AMOOR_D : AMO_rr_aq_rl<0b01000, 0b011, "amoor.d">;
-defm AMOMIN_D : AMO_rr_aq_rl<0b10000, 0b011, "amomin.d">;
-defm AMOMAX_D : AMO_rr_aq_rl<0b10100, 0b011, "amomax.d">;
-defm AMOMINU_D : AMO_rr_aq_rl<0b11000, 0b011, "amominu.d">;
-defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">;
+defm LR_D : LR_r_aq_rl<0b011, "lr.d">, Sched<[WriteAtomicLDD, ReadAtomicLDD]>;
+defm SC_D : AMO_rr_aq_rl<0b00011, 0b011, "sc.d">,
+ Sched<[WriteAtomicSTD, ReadAtomicSTD, ReadAtomicSTD]>;
+defm AMOSWAP_D : AMO_rr_aq_rl<0b00001, 0b011, "amoswap.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOADD_D : AMO_rr_aq_rl<0b00000, 0b011, "amoadd.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOXOR_D : AMO_rr_aq_rl<0b00100, 0b011, "amoxor.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOAND_D : AMO_rr_aq_rl<0b01100, 0b011, "amoand.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOOR_D : AMO_rr_aq_rl<0b01000, 0b011, "amoor.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOMIN_D : AMO_rr_aq_rl<0b10000, 0b011, "amomin.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOMAX_D : AMO_rr_aq_rl<0b10100, 0b011, "amomax.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOMINU_D : AMO_rr_aq_rl<0b11000, 0b011, "amominu.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
+defm AMOMAXU_D : AMO_rr_aq_rl<0b11100, 0b011, "amomaxu.d">,
+ Sched<[WriteAtomicD, ReadAtomicDA, ReadAtomicDD]>;
} // Predicates = [HasStdExtA, IsRV64]
//===----------------------------------------------------------------------===//
@@ -215,13 +235,13 @@ class PseudoMaskedAMOUMinUMax
class PseudoMaskedAMOPat<Intrinsic intrin, Pseudo AMOInst>
: Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering),
- (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, imm:$ordering)>;
+ (AMOInst GPR:$addr, GPR:$incr, GPR:$mask, timm:$ordering)>;
class PseudoMaskedAMOMinMaxPat<Intrinsic intrin, Pseudo AMOInst>
: Pat<(intrin GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt,
timm:$ordering),
(AMOInst GPR:$addr, GPR:$incr, GPR:$mask, GPR:$shiftamt,
- imm:$ordering)>;
+ timm:$ordering)>;
def PseudoMaskedAtomicSwap32 : PseudoMaskedAMO;
def : PseudoMaskedAMOPat<int_riscv_masked_atomicrmw_xchg_i32,
@@ -290,7 +310,7 @@ def PseudoMaskedCmpXchg32
def : Pat<(int_riscv_masked_cmpxchg_i32
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering),
(PseudoMaskedCmpXchg32
- GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
+ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>;
} // Predicates = [HasStdExtA]
@@ -367,5 +387,5 @@ defm : PseudoCmpXchgPat<"atomic_cmp_swap_64", PseudoCmpXchg64>;
def : Pat<(int_riscv_masked_cmpxchg_i64
GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering),
(PseudoMaskedCmpXchg32
- GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, imm:$ordering)>;
+ GPR:$addr, GPR:$cmpval, GPR:$newval, GPR:$mask, timm:$ordering)>;
} // Predicates = [HasStdExtA, IsRV64]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoB.td b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
new file mode 100644
index 000000000000..34a463626e29
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoB.td
@@ -0,0 +1,634 @@
+//===-- RISCVInstrInfoB.td - RISC-V 'B' instructions -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the RISC-V instructions from the standard 'B' Bitmanip
+// extension, version 0.92.
+// This version is still experimental as the 'B' extension hasn't been
+// ratified yet.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Operand definitions.
+//===----------------------------------------------------------------------===//
+
+def UImmLog2XLenHalfAsmOperand : AsmOperandClass {
+ let Name = "UImmLog2XLenHalf";
+ let RenderMethod = "addImmOperands";
+ let DiagnosticType = "InvalidUImmLog2XLenHalf";
+}
+
+def shfl_uimm : Operand<XLenVT>, ImmLeaf<XLenVT, [{
+ if (Subtarget->is64Bit())
+ return isUInt<5>(Imm);
+ return isUInt<4>(Imm);
+}]> {
+ let ParserMatchClass = UImmLog2XLenHalfAsmOperand;
+ let DecoderMethod = "decodeUImmOperand<5>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (!MCOp.evaluateAsConstantImm(Imm))
+ return false;
+ if (STI.getTargetTriple().isArch64Bit())
+ return isUInt<5>(Imm);
+ return isUInt<4>(Imm);
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction class templates
+//===----------------------------------------------------------------------===//
+
+// Some of these templates should be moved to RISCVInstrFormats.td once the B
+// extension has been ratified.
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBUnary<bits<7> funct7, bits<5> funct5, bits<3> funct3,
+ RISCVOpcode opcode, string opcodestr>
+ : RVInstR<funct7, funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1),
+ opcodestr, "$rd, $rs1"> {
+ let Inst{24-20} = funct5;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBALUW_ri<bits<3> funct3, string opcodestr>
+ : RVInstI<funct3, OPC_OP_IMM_32, (outs GPR:$rd),
+ (ins GPR:$rs1, simm12:$imm12), opcodestr, "$rd, $rs1, $imm12">;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBShift_ri<bits<5> funct5, bits<3> funct3, RISCVOpcode opcode,
+ string opcodestr>
+ : RVInstI<funct3, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, uimmlog2xlen:$shamt), opcodestr,
+ "$rd, $rs1, $shamt"> {
+ bits<6> shamt;
+
+ let Inst{31-27} = funct5;
+ // NOTE: the bit op(26)=1 is used to select funnel shifts. All other
+ // shift operations, and operations that live in the encoding space of
+ // the shifts (single-bit operations, grev, gorc), use op(26) = 0.
+ let Inst{26} = 0;
+ let Inst{25-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBShiftW_ri<bits<7> funct7, bits<3> funct3, RISCVOpcode opcode,
+ string opcodestr>
+ : RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, uimm5:$shamt),
+ opcodestr, "$rd, $rs1, $shamt"> {
+ bits<5> shamt;
+
+ let Inst{31-25} = funct7;
+ let Inst{24-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBShfl_ri<bits<6> funct6, bits<3> funct3, RISCVOpcode opcode,
+ string opcodestr>
+ : RVInstI<funct3, opcode, (outs GPR:$rd), (ins GPR:$rs1, shfl_uimm:$shamt),
+ opcodestr, "$rd, $rs1, $shamt"> {
+ bits<6> shamt;
+
+ let Inst{31-26} = funct6;
+ let Inst{25-20} = shamt;
+}
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBTernaryR<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
+ string opcodestr, string argstr>
+ : RVInstR4<funct2, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs2, GPR:$rs3), opcodestr, argstr> {
+ let Inst{14-12} = funct3_b;
+}
+
+// Currently used by FSRI only
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBTernaryImm6<bits<3> funct3_b, RISCVOpcode opcode,
+ string opcodestr, string argstr>
+ : RVInstR4<0b10, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs3, uimmlog2xlen:$shamt),
+ opcodestr, argstr> {
+ bits<6> shamt;
+
+ // NOTE: the first argument of RVInstR4 is hardcoded to 0b10, as for the
+ // other funnel shift instructions. The second bit of that argument is then
+ // overwritten by the shamt, as the encoding of this particular instruction
+ // requires. This yields op(26) = 1, as funnel shifts require, without
+ // adding a confusing extra argument to the instruction definition.
+ let Inst{25-20} = shamt;
+ let Inst{14-12} = funct3_b;
+}
+
+// Currently used by FSRIW only
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBTernaryImm5<bits<2> funct2, bits<3> funct3_b, RISCVOpcode opcode,
+ string opcodestr, string argstr>
+ : RVInstR4<funct2, opcode, (outs GPR:$rd),
+ (ins GPR:$rs1, GPR:$rs3, uimm5:$shamt), opcodestr, argstr> {
+ bits<5> shamt;
+
+ let Inst{24-20} = shamt;
+ let Inst{14-12} = funct3_b;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+def ANDN : ALU_rr<0b0100000, 0b111, "andn">, Sched<[]>;
+def ORN : ALU_rr<0b0100000, 0b110, "orn">, Sched<[]>;
+def XNOR : ALU_rr<0b0100000, 0b100, "xnor">, Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp]
+
+let Predicates = [HasStdExtZbb] in {
+def SLO : ALU_rr<0b0010000, 0b001, "slo">, Sched<[]>;
+def SRO : ALU_rr<0b0010000, 0b101, "sro">, Sched<[]>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+def ROL : ALU_rr<0b0110000, 0b001, "rol">, Sched<[]>;
+def ROR : ALU_rr<0b0110000, 0b101, "ror">, Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp]
+
+let Predicates = [HasStdExtZbs] in {
+def SBCLR : ALU_rr<0b0100100, 0b001, "sbclr">, Sched<[]>;
+def SBSET : ALU_rr<0b0010100, 0b001, "sbset">, Sched<[]>;
+def SBINV : ALU_rr<0b0110100, 0b001, "sbinv">, Sched<[]>;
+def SBEXT : ALU_rr<0b0100100, 0b101, "sbext">, Sched<[]>;
+} // Predicates = [HasStdExtZbs]
+
+let Predicates = [HasStdExtZbp] in {
+def GORC : ALU_rr<0b0010100, 0b101, "gorc">, Sched<[]>;
+def GREV : ALU_rr<0b0110100, 0b101, "grev">, Sched<[]>;
+} // Predicates = [HasStdExtZbp]
+
+let Predicates = [HasStdExtZbb] in {
+def SLOI : RVBShift_ri<0b00100, 0b001, OPC_OP_IMM, "sloi">, Sched<[]>;
+def SROI : RVBShift_ri<0b00100, 0b101, OPC_OP_IMM, "sroi">, Sched<[]>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbbOrZbp] in
+def RORI : RVBShift_ri<0b01100, 0b101, OPC_OP_IMM, "rori">, Sched<[]>;
+
+let Predicates = [HasStdExtZbs] in {
+def SBCLRI : RVBShift_ri<0b01001, 0b001, OPC_OP_IMM, "sbclri">, Sched<[]>;
+def SBSETI : RVBShift_ri<0b00101, 0b001, OPC_OP_IMM, "sbseti">, Sched<[]>;
+def SBINVI : RVBShift_ri<0b01101, 0b001, OPC_OP_IMM, "sbinvi">, Sched<[]>;
+def SBEXTI : RVBShift_ri<0b01001, 0b101, OPC_OP_IMM, "sbexti">, Sched<[]>;
+} // Predicates = [HasStdExtZbs]
+
+let Predicates = [HasStdExtZbp] in {
+def GREVI : RVBShift_ri<0b01101, 0b101, OPC_OP_IMM, "grevi">, Sched<[]>;
+def GORCI : RVBShift_ri<0b00101, 0b101, OPC_OP_IMM, "gorci">, Sched<[]>;
+} // Predicates = [HasStdExtZbp]
+
+let Predicates = [HasStdExtZbt] in {
+def CMIX : RVBTernaryR<0b11, 0b001, OPC_OP, "cmix", "$rd, $rs2, $rs1, $rs3">,
+ Sched<[]>;
+def CMOV : RVBTernaryR<0b11, 0b101, OPC_OP, "cmov", "$rd, $rs2, $rs1, $rs3">,
+ Sched<[]>;
+def FSL : RVBTernaryR<0b10, 0b001, OPC_OP, "fsl", "$rd, $rs1, $rs3, $rs2">,
+ Sched<[]>;
+def FSR : RVBTernaryR<0b10, 0b101, OPC_OP, "fsr", "$rd, $rs1, $rs3, $rs2">,
+ Sched<[]>;
+def FSRI : RVBTernaryImm6<0b101, OPC_OP_IMM, "fsri",
+ "$rd, $rs1, $rs3, $shamt">, Sched<[]>;
+} // Predicates = [HasStdExtZbt]
+
+let Predicates = [HasStdExtZbb] in {
+def CLZ : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0010011>, "clz">,
+ Sched<[]>;
+def CTZ : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0010011>, "ctz">,
+ Sched<[]>;
+def PCNT : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0010011>, "pcnt">,
+ Sched<[]>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbm, IsRV64] in
+def BMATFLIP : RVBUnary<0b0110000, 0b00011, 0b001, RISCVOpcode<0b0010011>,
+ "bmatflip">, Sched<[]>;
+
+let Predicates = [HasStdExtZbb] in {
+def SEXTB : RVBUnary<0b0110000, 0b00100, 0b001, RISCVOpcode<0b0010011>,
+ "sext.b">, Sched<[]>;
+def SEXTH : RVBUnary<0b0110000, 0b00101, 0b001, RISCVOpcode<0b0010011>,
+ "sext.h">, Sched<[]>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbr] in {
+def CRC32B : RVBUnary<0b0110000, 0b10000, 0b001, RISCVOpcode<0b0010011>,
+ "crc32.b">, Sched<[]>;
+def CRC32H : RVBUnary<0b0110000, 0b10001, 0b001, RISCVOpcode<0b0010011>,
+ "crc32.h">, Sched<[]>;
+def CRC32W : RVBUnary<0b0110000, 0b10010, 0b001, RISCVOpcode<0b0010011>,
+ "crc32.w">, Sched<[]>;
+} // Predicates = [HasStdExtZbr]
+
+let Predicates = [HasStdExtZbr, IsRV64] in
+def CRC32D : RVBUnary<0b0110000, 0b10011, 0b001, RISCVOpcode<0b0010011>,
+ "crc32.d">, Sched<[]>;
+
+let Predicates = [HasStdExtZbr] in {
+def CRC32CB : RVBUnary<0b0110000, 0b11000, 0b001, RISCVOpcode<0b0010011>,
+ "crc32c.b">, Sched<[]>;
+def CRC32CH : RVBUnary<0b0110000, 0b11001, 0b001, RISCVOpcode<0b0010011>,
+ "crc32c.h">, Sched<[]>;
+def CRC32CW : RVBUnary<0b0110000, 0b11010, 0b001, RISCVOpcode<0b0010011>,
+ "crc32c.w">, Sched<[]>;
+} // Predicates = [HasStdExtZbr]
+
+let Predicates = [HasStdExtZbr, IsRV64] in
+def CRC32CD : RVBUnary<0b0110000, 0b11011, 0b001, RISCVOpcode<0b0010011>,
+ "crc32c.d">, Sched<[]>;
+
+let Predicates = [HasStdExtZbc] in {
+def CLMUL : ALU_rr<0b0000101, 0b001, "clmul">, Sched<[]>;
+def CLMULR : ALU_rr<0b0000101, 0b010, "clmulr">, Sched<[]>;
+def CLMULH : ALU_rr<0b0000101, 0b011, "clmulh">, Sched<[]>;
+} // Predicates = [HasStdExtZbc]
+
+let Predicates = [HasStdExtZbb] in {
+def MIN : ALU_rr<0b0000101, 0b100, "min">, Sched<[]>;
+def MAX : ALU_rr<0b0000101, 0b101, "max">, Sched<[]>;
+def MINU : ALU_rr<0b0000101, 0b110, "minu">, Sched<[]>;
+def MAXU : ALU_rr<0b0000101, 0b111, "maxu">, Sched<[]>;
+} // Predicates = [HasStdExtZbb]
+
+let Predicates = [HasStdExtZbp] in {
+def SHFL : ALU_rr<0b0000100, 0b001, "shfl">, Sched<[]>;
+def UNSHFL : ALU_rr<0b0000100, 0b101, "unshfl">, Sched<[]>;
+} // Predicates = [HasStdExtZbp]
+
+let Predicates = [HasStdExtZbe] in {
+def BDEP : ALU_rr<0b0100100, 0b110, "bdep">, Sched<[]>;
+def BEXT : ALU_rr<0b0000100, 0b110, "bext">, Sched<[]>;
+} // Predicates = [HasStdExtZbe]
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+def PACK : ALU_rr<0b0000100, 0b100, "pack">, Sched<[]>;
+def PACKU : ALU_rr<0b0100100, 0b100, "packu">, Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp]
+
+let Predicates = [HasStdExtZbm, IsRV64] in {
+def BMATOR : ALU_rr<0b0000100, 0b011, "bmator">, Sched<[]>;
+def BMATXOR : ALU_rr<0b0100100, 0b011, "bmatxor">, Sched<[]>;
+} // Predicates = [HasStdExtZbm, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp] in
+def PACKH : ALU_rr<0b0000100, 0b111, "packh">, Sched<[]>;
+
+let Predicates = [HasStdExtZbf] in
+def BFP : ALU_rr<0b0100100, 0b111, "bfp">, Sched<[]>;
+
+let Predicates = [HasStdExtZbp] in {
+def SHFLI : RVBShfl_ri<0b000010, 0b001, OPC_OP_IMM, "shfli">, Sched<[]>;
+def UNSHFLI : RVBShfl_ri<0b000010, 0b101, OPC_OP_IMM, "unshfli">, Sched<[]>;
+} // Predicates = [HasStdExtZbp]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def ADDIWU : RVBALUW_ri<0b100, "addiwu">, Sched<[]>;
+def SLLIUW : RVBShift_ri<0b00001, 0b001, OPC_OP_IMM_32, "slliu.w">, Sched<[]>;
+def ADDWU : ALUW_rr<0b0000101, 0b000, "addwu">, Sched<[]>;
+def SUBWU : ALUW_rr<0b0100101, 0b000, "subwu">, Sched<[]>;
+def ADDUW : ALUW_rr<0b0000100, 0b000, "addu.w">, Sched<[]>;
+def SUBUW : ALUW_rr<0b0100100, 0b000, "subu.w">, Sched<[]>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def SLOW : ALUW_rr<0b0010000, 0b001, "slow">, Sched<[]>;
+def SROW : ALUW_rr<0b0010000, 0b101, "srow">, Sched<[]>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def ROLW : ALUW_rr<0b0110000, 0b001, "rolw">, Sched<[]>;
+def RORW : ALUW_rr<0b0110000, 0b101, "rorw">, Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+
+let Predicates = [HasStdExtZbs, IsRV64] in {
+def SBCLRW : ALUW_rr<0b0100100, 0b001, "sbclrw">, Sched<[]>;
+def SBSETW : ALUW_rr<0b0010100, 0b001, "sbsetw">, Sched<[]>;
+def SBINVW : ALUW_rr<0b0110100, 0b001, "sbinvw">, Sched<[]>;
+def SBEXTW : ALUW_rr<0b0100100, 0b101, "sbextw">, Sched<[]>;
+} // Predicates = [HasStdExtZbs, IsRV64]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def GORCW : ALUW_rr<0b0010100, 0b101, "gorcw">, Sched<[]>;
+def GREVW : ALUW_rr<0b0110100, 0b101, "grevw">, Sched<[]>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def SLOIW : RVBShiftW_ri<0b0010000, 0b001, OPC_OP_IMM_32, "sloiw">, Sched<[]>;
+def SROIW : RVBShiftW_ri<0b0010000, 0b101, OPC_OP_IMM_32, "sroiw">, Sched<[]>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in
+def RORIW : RVBShiftW_ri<0b0110000, 0b101, OPC_OP_IMM_32, "roriw">, Sched<[]>;
+
+let Predicates = [HasStdExtZbs, IsRV64] in {
+def SBCLRIW : RVBShiftW_ri<0b0100100, 0b001, OPC_OP_IMM_32, "sbclriw">,
+ Sched<[]>;
+def SBSETIW : RVBShiftW_ri<0b0010100, 0b001, OPC_OP_IMM_32, "sbsetiw">,
+ Sched<[]>;
+def SBINVIW : RVBShiftW_ri<0b0110100, 0b001, OPC_OP_IMM_32, "sbinviw">,
+ Sched<[]>;
+} // Predicates = [HasStdExtZbs, IsRV64]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def GORCIW : RVBShiftW_ri<0b0010100, 0b101, OPC_OP_IMM_32, "gorciw">, Sched<[]>;
+def GREVIW : RVBShiftW_ri<0b0110100, 0b101, OPC_OP_IMM_32, "greviw">, Sched<[]>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbt, IsRV64] in {
+def FSLW : RVBTernaryR<0b10, 0b001, OPC_OP_32,
+ "fslw", "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
+def FSRW : RVBTernaryR<0b10, 0b101, OPC_OP_32, "fsrw",
+ "$rd, $rs1, $rs3, $rs2">, Sched<[]>;
+def FSRIW : RVBTernaryImm5<0b10, 0b101, OPC_OP_IMM_32,
+ "fsriw", "$rd, $rs1, $rs3, $shamt">, Sched<[]>;
+} // Predicates = [HasStdExtZbt, IsRV64]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def CLZW : RVBUnary<0b0110000, 0b00000, 0b001, RISCVOpcode<0b0011011>,
+ "clzw">, Sched<[]>;
+def CTZW : RVBUnary<0b0110000, 0b00001, 0b001, RISCVOpcode<0b0011011>,
+ "ctzw">, Sched<[]>;
+def PCNTW : RVBUnary<0b0110000, 0b00010, 0b001, RISCVOpcode<0b0011011>,
+ "pcntw">, Sched<[]>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbc, IsRV64] in {
+def CLMULW : ALUW_rr<0b0000101, 0b001, "clmulw">, Sched<[]>;
+def CLMULRW : ALUW_rr<0b0000101, 0b010, "clmulrw">, Sched<[]>;
+def CLMULHW : ALUW_rr<0b0000101, 0b011, "clmulhw">, Sched<[]>;
+} // Predicates = [HasStdExtZbc, IsRV64]
+
+let Predicates = [HasStdExtZbp, IsRV64] in {
+def SHFLW : ALUW_rr<0b0000100, 0b001, "shflw">, Sched<[]>;
+def UNSHFLW : ALUW_rr<0b0000100, 0b101, "unshflw">, Sched<[]>;
+} // Predicates = [HasStdExtZbp, IsRV64]
+
+let Predicates = [HasStdExtZbe, IsRV64] in {
+def BDEPW : ALUW_rr<0b0100100, 0b110, "bdepw">, Sched<[]>;
+def BEXTW : ALUW_rr<0b0000100, 0b110, "bextw">, Sched<[]>;
+} // Predicates = [HasStdExtZbe, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def PACKW : ALUW_rr<0b0000100, 0b100, "packw">, Sched<[]>;
+def PACKUW : ALUW_rr<0b0100100, 0b100, "packuw">, Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
+
+let Predicates = [HasStdExtZbf, IsRV64] in
+def BFPW : ALUW_rr<0b0100100, 0b111, "bfpw">, Sched<[]>;
+
+//===----------------------------------------------------------------------===//
+// Future compressed instructions
+//===----------------------------------------------------------------------===//
+
+// The presence of these instructions in the B extension is purely experimental
+// and they should be moved to the C extension as soon as they are ratified.
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
+class RVBInstC<bits<2> funct2, string opcodestr>
+ : RVInst16<(outs GPRC:$rs_wb), (ins GPRC:$rs), opcodestr, "$rs", [],
+ InstFormatCR> {
+ bits<3> rs;
+ let Constraints = "$rs = $rs_wb";
+
+ let Inst{15-12} = 0b0110;
+ let Inst{11-10} = funct2;
+ let Inst{9-7} = rs;
+ let Inst{6-0} = 0b0000001;
+}
+
+// The namespace RVBC exists to avoid encoding conflicts with the compressed
+// instructions c.addi16sp and c.lui already implemented in the C extension.
+
+let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
+def C_NOT : RVBInstC<0b00, "c.not">, Sched<[]>;
+def C_NEG : RVBInstC<0b01, "c.neg">, Sched<[]>;
+} // DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtC]
+
+let DecoderNamespace = "RVBC", Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in
+def C_ZEXTW : RVBInstC<0b10, "c.zext.w">, Sched<[]>;
+
+//===----------------------------------------------------------------------===//
+// Pseudo Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtZbb, IsRV32] in {
+def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
+def : InstAlias<"zext.h $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
+} // Predicates = [HasStdExtZbb, IsRV32]
+
+let Predicates = [HasStdExtZbb, IsRV64] in {
+def : InstAlias<"zext.b $rd, $rs", (ANDI GPR:$rd, GPR:$rs, 0xFF)>;
+def : InstAlias<"zext.h $rd, $rs", (PACKW GPR:$rd, GPR:$rs, X0)>;
+def : InstAlias<"zext.w $rd, $rs", (PACK GPR:$rd, GPR:$rs, X0)>;
+} // Predicates = [HasStdExtZbb, IsRV64]
+
+let Predicates = [HasStdExtZbbOrZbp] in {
+def : InstAlias<"rev.p $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00001)>,
+ Sched<[]>;
+def : InstAlias<"rev2.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00010)>,
+ Sched<[]>;
+def : InstAlias<"rev.n $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00011)>,
+ Sched<[]>;
+def : InstAlias<"rev4.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00100)>,
+ Sched<[]>;
+def : InstAlias<"rev2.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00110)>,
+ Sched<[]>;
+def : InstAlias<"rev.b $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b00111)>,
+ Sched<[]>;
+def : InstAlias<"rev8.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01000)>,
+ Sched<[]>;
+def : InstAlias<"rev4.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01100)>,
+ Sched<[]>;
+def : InstAlias<"rev2.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01110)>,
+ Sched<[]>;
+def : InstAlias<"rev.h $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b01111)>,
+ Sched<[]>;
+
+def : InstAlias<"zip.n $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0001)>,
+ Sched<[]>;
+def : InstAlias<"unzip.n $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0001)>,
+ Sched<[]>;
+def : InstAlias<"zip2.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0010)>,
+ Sched<[]>;
+def : InstAlias<"unzip2.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0010)>,
+ Sched<[]>;
+def : InstAlias<"zip.b $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0011)>,
+ Sched<[]>;
+def : InstAlias<"unzip.b $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0011)>,
+ Sched<[]>;
+def : InstAlias<"zip4.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0100)>,
+ Sched<[]>;
+def : InstAlias<"unzip4.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0100)>,
+ Sched<[]>;
+def : InstAlias<"zip2.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0110)>,
+ Sched<[]>;
+def : InstAlias<"unzip2.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0110)>,
+ Sched<[]>;
+def : InstAlias<"zip.h $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b0111)>,
+ Sched<[]>;
+def : InstAlias<"unzip.h $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b0111)>,
+ Sched<[]>;
+
+def : InstAlias<"orc.p $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00001)>,
+ Sched<[]>;
+def : InstAlias<"orc2.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00010)>,
+ Sched<[]>;
+def : InstAlias<"orc.n $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00011)>,
+ Sched<[]>;
+def : InstAlias<"orc4.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00100)>,
+ Sched<[]>;
+def : InstAlias<"orc2.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00110)>,
+ Sched<[]>;
+def : InstAlias<"orc.b $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b00111)>,
+ Sched<[]>;
+def : InstAlias<"orc8.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01000)>,
+ Sched<[]>;
+def : InstAlias<"orc4.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01100)>,
+ Sched<[]>;
+def : InstAlias<"orc2.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01110)>,
+ Sched<[]>;
+def : InstAlias<"orc.h $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b01111)>,
+ Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV32] in {
+def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
+def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
+def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
+def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
+def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
+
+def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1000)>,
+ Sched<[]>;
+def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1000)>,
+ Sched<[]>;
+def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1100)>,
+ Sched<[]>;
+def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1100)>,
+ Sched<[]>;
+def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1110)>,
+ Sched<[]>;
+def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1110)>,
+ Sched<[]>;
+def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b1111)>,
+ Sched<[]>;
+def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b1111)>,
+ Sched<[]>;
+
+def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b10000)>, Sched<[]>;
+def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11000)>, Sched<[]>;
+def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11100)>, Sched<[]>;
+def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11110)>, Sched<[]>;
+def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b11111)>, Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV32]
+
+let Predicates = [HasStdExtZbbOrZbp, IsRV64] in {
+def : InstAlias<"rev16.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b010000)>,
+ Sched<[]>;
+def : InstAlias<"rev8.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011000)>,
+ Sched<[]>;
+def : InstAlias<"rev4.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011100)>,
+ Sched<[]>;
+def : InstAlias<"rev2.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011110)>,
+ Sched<[]>;
+def : InstAlias<"rev.w $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b011111)>,
+ Sched<[]>;
+def : InstAlias<"rev32 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b100000)>,
+ Sched<[]>;
+def : InstAlias<"rev16 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b110000)>,
+ Sched<[]>;
+def : InstAlias<"rev8 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111000)>,
+ Sched<[]>;
+def : InstAlias<"rev4 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111100)>,
+ Sched<[]>;
+def : InstAlias<"rev2 $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111110)>,
+ Sched<[]>;
+def : InstAlias<"rev $rd, $rs", (GREVI GPR:$rd, GPR:$rs, 0b111111)>,
+ Sched<[]>;
+
+def : InstAlias<"zip8.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01000)>,
+ Sched<[]>;
+def : InstAlias<"unzip8.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01000)>,
+ Sched<[]>;
+def : InstAlias<"zip4.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01100)>,
+ Sched<[]>;
+def : InstAlias<"unzip4.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01100)>,
+ Sched<[]>;
+def : InstAlias<"zip2.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01110)>,
+ Sched<[]>;
+def : InstAlias<"unzip2.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01110)>,
+ Sched<[]>;
+def : InstAlias<"zip.w $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b01111)>,
+ Sched<[]>;
+def : InstAlias<"unzip.w $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b01111)>,
+ Sched<[]>;
+def : InstAlias<"zip16 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b10000)>,
+ Sched<[]>;
+def : InstAlias<"unzip16 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b10000)>,
+ Sched<[]>;
+def : InstAlias<"zip8 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11000)>,
+ Sched<[]>;
+def : InstAlias<"unzip8 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11000)>,
+ Sched<[]>;
+def : InstAlias<"zip4 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11100)>,
+ Sched<[]>;
+def : InstAlias<"unzip4 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11100)>,
+ Sched<[]>;
+def : InstAlias<"zip2 $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11110)>,
+ Sched<[]>;
+def : InstAlias<"unzip2 $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11110)>,
+ Sched<[]>;
+def : InstAlias<"zip $rd, $rs", (SHFLI GPR:$rd, GPR:$rs, 0b11111)>,
+ Sched<[]>;
+def : InstAlias<"unzip $rd, $rs", (UNSHFLI GPR:$rd, GPR:$rs, 0b11111)>,
+ Sched<[]>;
+
+def : InstAlias<"orc16.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b010000)>,
+ Sched<[]>;
+def : InstAlias<"orc8.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011000)>,
+ Sched<[]>;
+def : InstAlias<"orc4.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011100)>,
+ Sched<[]>;
+def : InstAlias<"orc2.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011110)>,
+ Sched<[]>;
+def : InstAlias<"orc.w $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b011111)>,
+ Sched<[]>;
+def : InstAlias<"orc32 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b100000)>,
+ Sched<[]>;
+def : InstAlias<"orc16 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b110000)>,
+ Sched<[]>;
+def : InstAlias<"orc8 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111000)>,
+ Sched<[]>;
+def : InstAlias<"orc4 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111100)>,
+ Sched<[]>;
+def : InstAlias<"orc2 $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111110)>,
+ Sched<[]>;
+def : InstAlias<"orc $rd, $rs", (GORCI GPR:$rd, GPR:$rs, 0b111111)>,
+ Sched<[]>;
+} // Predicates = [HasStdExtZbbOrZbp, IsRV64]
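The rev*/orc* mnemonics in the alias blocks above are GREVI/GORCI with fixed shift-amount immediates (the zip*/unzip* forms map onto SHFLI/UNSHFLI in the same way); each set bit of the immediate swaps, or OR-combines, adjacent blocks of the matching size. A C sketch of the RV32 reference behaviour, adapted from the draft Bitmanip pseudocode and offered only as illustration (function names are the spec's reference names, not code from this patch):

#include <stdint.h>

/* grev32: each set bit in shamt swaps adjacent 1/2/4/8/16-bit blocks, so
   shamt 0b11000 (rev8 above) byte-reverses the word and 0b11111 (rev)
   bit-reverses it. */
static uint32_t grev32(uint32_t rs1, uint32_t shamt) {
  uint32_t x = rs1;
  if (shamt & 1)  x = ((x & 0x55555555) << 1)  | ((x & 0xAAAAAAAA) >> 1);
  if (shamt & 2)  x = ((x & 0x33333333) << 2)  | ((x & 0xCCCCCCCC) >> 2);
  if (shamt & 4)  x = ((x & 0x0F0F0F0F) << 4)  | ((x & 0xF0F0F0F0) >> 4);
  if (shamt & 8)  x = ((x & 0x00FF00FF) << 8)  | ((x & 0xFF00FF00) >> 8);
  if (shamt & 16) x = ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
  return x;
}

/* gorc32: same block structure, but ORs the swapped copy back in, so
   shamt 0b00111 (orc.b above) turns every nonzero byte into 0xFF. */
static uint32_t gorc32(uint32_t rs1, uint32_t shamt) {
  uint32_t x = rs1;
  if (shamt & 1)  x |= ((x & 0x55555555) << 1)  | ((x & 0xAAAAAAAA) >> 1);
  if (shamt & 2)  x |= ((x & 0x33333333) << 2)  | ((x & 0xCCCCCCCC) >> 2);
  if (shamt & 4)  x |= ((x & 0x0F0F0F0F) << 4)  | ((x & 0xF0F0F0F0) >> 4);
  if (shamt & 8)  x |= ((x & 0x00FF00FF) << 8)  | ((x & 0xFF00FF00) >> 8);
  if (shamt & 16) x |= ((x & 0x0000FFFF) << 16) | ((x & 0xFFFF0000) >> 16);
  return x;
}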
+
+//===----------------------------------------------------------------------===//
+// Compressed Instruction patterns
+//===----------------------------------------------------------------------===//
+let Predicates = [HasStdExtZbproposedc, HasStdExtC] in {
+def : CompressPat<(XORI GPRC:$rs1, GPRC:$rs1, -1),
+ (C_NOT GPRC:$rs1)>;
+def : CompressPat<(SUB GPRC:$rs1, X0, GPRC:$rs1),
+ (C_NEG GPRC:$rs1)>;
+} // Predicates = [HasStdExtZbproposedc, HasStdExtC]
+
+let Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64] in {
+def : CompressPat<(PACK GPRC:$rs1, GPRC:$rs1, X0),
+ (C_ZEXTW GPRC:$rs1)>;
+} // Predicates = [HasStdExtZbproposedc, HasStdExtZbbOrZbp, HasStdExtC, IsRV64]
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
index fa0050f107b2..f68767847ade 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td
@@ -282,7 +282,8 @@ let Predicates = [HasStdExtC] in {
let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [X2] in
def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
(ins SP:$rs1, uimm10_lsb00nonzero:$imm),
- "c.addi4spn", "$rd, $rs1, $imm"> {
+ "c.addi4spn", "$rd, $rs1, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
bits<5> rs1;
let Inst{12-11} = imm{5-4};
let Inst{10-7} = imm{9-6};
@@ -291,13 +292,15 @@ def C_ADDI4SPN : RVInst16CIW<0b000, 0b00, (outs GPRC:$rd),
}
let Predicates = [HasStdExtC, HasStdExtD] in
-def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000> {
+def C_FLD : CLoad_ri<0b001, "c.fld", FPR64C, uimm8_lsb000>,
+ Sched<[WriteFLD64, ReadMemBase]> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
}
-def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00> {
+def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00>,
+ Sched<[WriteLDW, ReadMemBase]> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
@@ -306,7 +309,8 @@ def C_LW : CLoad_ri<0b010, "c.lw", GPRC, uimm7_lsb00> {
let DecoderNamespace = "RISCV32Only_",
Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
-def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00> {
+def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00>,
+ Sched<[WriteFLD32, ReadMemBase]> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
@@ -314,20 +318,23 @@ def C_FLW : CLoad_ri<0b011, "c.flw", FPR32C, uimm7_lsb00> {
}
let Predicates = [HasStdExtC, IsRV64] in
-def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000> {
+def C_LD : CLoad_ri<0b011, "c.ld", GPRC, uimm8_lsb000>,
+ Sched<[WriteLDD, ReadMemBase]> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
}
let Predicates = [HasStdExtC, HasStdExtD] in
-def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000> {
+def C_FSD : CStore_rri<0b101, "c.fsd", FPR64C, uimm8_lsb000>,
+ Sched<[WriteFST64, ReadStoreData, ReadMemBase]> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
}
-def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00> {
+def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00>,
+ Sched<[WriteSTW, ReadStoreData, ReadMemBase]> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
@@ -336,7 +343,8 @@ def C_SW : CStore_rri<0b110, "c.sw", GPRC, uimm7_lsb00> {
let DecoderNamespace = "RISCV32Only_",
Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
-def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00> {
+def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00>,
+ Sched<[WriteFST32, ReadStoreData, ReadMemBase]> {
bits<7> imm;
let Inst{12-10} = imm{5-3};
let Inst{6} = imm{2};
@@ -344,14 +352,16 @@ def C_FSW : CStore_rri<0b111, "c.fsw", FPR32C, uimm7_lsb00> {
}
let Predicates = [HasStdExtC, IsRV64] in
-def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000> {
+def C_SD : CStore_rri<0b111, "c.sd", GPRC, uimm8_lsb000>,
+ Sched<[WriteSTD, ReadStoreData, ReadMemBase]> {
bits<8> imm;
let Inst{12-10} = imm{5-3};
let Inst{6-5} = imm{7-6};
}
let rd = 0, imm = 0, hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
-def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">
+def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">,
+ Sched<[WriteNop]>
{
let Inst{6-2} = 0;
}
@@ -359,7 +369,8 @@ def C_NOP : RVInst16CI<0b000, 0b01, (outs), (ins), "c.nop", "">
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, simm6nonzero:$imm),
- "c.addi", "$rd, $imm"> {
+ "c.addi", "$rd, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
@@ -367,7 +378,8 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb),
(ins GPRX0:$rd, immzero:$imm),
- "c.addi", "$rd, $imm"> {
+ "c.addi", "$rd, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let isAsmParserOnly = 1;
@@ -377,27 +389,30 @@ let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1,
DecoderNamespace = "RISCV32Only_", Defs = [X1],
Predicates = [HasStdExtC, IsRV32] in
def C_JAL : RVInst16CJ<0b001, 0b01, (outs), (ins simm12_lsb0:$offset),
- "c.jal", "$offset">;
+ "c.jal", "$offset">, Sched<[WriteJal]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
Predicates = [HasStdExtC, IsRV64] in
def C_ADDIW : RVInst16CI<0b001, 0b01, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, simm6:$imm),
- "c.addiw", "$rd, $imm"> {
+ "c.addiw", "$rd, $imm">,
+ Sched<[WriteIALU32, ReadIALU32]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_LI : RVInst16CI<0b010, 0b01, (outs GPRNoX0:$rd), (ins simm6:$imm),
- "c.li", "$rd, $imm"> {
+ "c.li", "$rd, $imm">,
+ Sched<[WriteIALU]> {
let Inst{6-2} = imm{4-0};
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
(ins SP:$rd, simm10_lsb0000nonzero:$imm),
- "c.addi16sp", "$rd, $imm"> {
+ "c.addi16sp", "$rd, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
let Inst{12} = imm{9};
let Inst{11-7} = 2;
@@ -410,78 +425,93 @@ def C_ADDI16SP : RVInst16CI<0b011, 0b01, (outs SP:$rd_wb),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_LUI : RVInst16CI<0b011, 0b01, (outs GPRNoX0X2:$rd),
(ins c_lui_imm:$imm),
- "c.lui", "$rd, $imm"> {
+ "c.lui", "$rd, $imm">,
+ Sched<[WriteIALU]> {
let Inst{6-2} = imm{4-0};
}
-def C_SRLI : Shift_right<0b00, "c.srli", GPRC, uimmlog2xlennonzero>;
-def C_SRAI : Shift_right<0b01, "c.srai", GPRC, uimmlog2xlennonzero>;
+def C_SRLI : Shift_right<0b00, "c.srli", GPRC, uimmlog2xlennonzero>,
+ Sched<[WriteShift, ReadShift]>;
+def C_SRAI : Shift_right<0b01, "c.srai", GPRC, uimmlog2xlennonzero>,
+ Sched<[WriteShift, ReadShift]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ANDI : RVInst16CB<0b100, 0b01, (outs GPRC:$rs1_wb), (ins GPRC:$rs1, simm6:$imm),
- "c.andi", "$rs1, $imm"> {
+ "c.andi", "$rs1, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rs1 = $rs1_wb";
let Inst{12} = imm{5};
let Inst{11-10} = 0b10;
let Inst{6-2} = imm{4-0};
}
-def C_SUB : CS_ALU<0b100011, 0b00, "c.sub", GPRC>;
-def C_XOR : CS_ALU<0b100011, 0b01, "c.xor", GPRC>;
-def C_OR : CS_ALU<0b100011, 0b10, "c.or" , GPRC>;
-def C_AND : CS_ALU<0b100011, 0b11, "c.and", GPRC>;
+def C_SUB : CS_ALU<0b100011, 0b00, "c.sub", GPRC>,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def C_XOR : CS_ALU<0b100011, 0b01, "c.xor", GPRC>,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def C_OR : CS_ALU<0b100011, 0b10, "c.or" , GPRC>,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
+def C_AND : CS_ALU<0b100011, 0b11, "c.and", GPRC>,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]>;
let Predicates = [HasStdExtC, IsRV64] in {
-def C_SUBW : CS_ALU<0b100111, 0b00, "c.subw", GPRC>;
-def C_ADDW : CS_ALU<0b100111, 0b01, "c.addw", GPRC>;
+def C_SUBW : CS_ALU<0b100111, 0b00, "c.subw", GPRC>,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
+def C_ADDW : CS_ALU<0b100111, 0b01, "c.addw", GPRC>,
+ Sched<[WriteIALU32, ReadIALU32, ReadIALU32]>;
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_J : RVInst16CJ<0b101, 0b01, (outs), (ins simm12_lsb0:$offset),
- "c.j", "$offset"> {
+ "c.j", "$offset">, Sched<[WriteJmp]> {
let isBranch = 1;
let isTerminator=1;
let isBarrier=1;
}
-def C_BEQZ : Bcz<0b110, "c.beqz", seteq, GPRC>;
-def C_BNEZ : Bcz<0b111, "c.bnez", setne, GPRC>;
+def C_BEQZ : Bcz<0b110, "c.beqz", seteq, GPRC>, Sched<[WriteJmp]>;
+def C_BNEZ : Bcz<0b111, "c.bnez", setne, GPRC>, Sched<[WriteJmp]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_SLLI : RVInst16CI<0b000, 0b10, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli" ,"$rd, $imm"> {
+ "c.slli" ,"$rd, $imm">,
+ Sched<[WriteShift, ReadShift]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
}
let Predicates = [HasStdExtC, HasStdExtD] in
-def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000> {
+def C_FLDSP : CStackLoad<0b001, "c.fldsp", FPR64, uimm9_lsb000>,
+ Sched<[WriteFLD64, ReadMemBase]> {
let Inst{6-5} = imm{4-3};
let Inst{4-2} = imm{8-6};
}
-def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00> {
+def C_LWSP : CStackLoad<0b010, "c.lwsp", GPRNoX0, uimm8_lsb00>,
+ Sched<[WriteLDW, ReadMemBase]> {
let Inst{6-4} = imm{4-2};
let Inst{3-2} = imm{7-6};
}
let DecoderNamespace = "RISCV32Only_",
Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
-def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00> {
+def C_FLWSP : CStackLoad<0b011, "c.flwsp", FPR32, uimm8_lsb00>,
+ Sched<[WriteFLD32, ReadMemBase]> {
let Inst{6-4} = imm{4-2};
let Inst{3-2} = imm{7-6};
}
let Predicates = [HasStdExtC, IsRV64] in
-def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000> {
+def C_LDSP : CStackLoad<0b011, "c.ldsp", GPRNoX0, uimm9_lsb000>,
+ Sched<[WriteLDD, ReadMemBase]> {
let Inst{6-5} = imm{4-3};
let Inst{4-2} = imm{8-6};
}
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1),
- "c.jr", "$rs1"> {
+ "c.jr", "$rs1">, Sched<[WriteJmpReg]> {
let isBranch = 1;
let isBarrier = 1;
let isTerminator = 1;
@@ -491,43 +521,49 @@ def C_JR : RVInst16CR<0b1000, 0b10, (outs), (ins GPRNoX0:$rs1),
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_MV : RVInst16CR<0b1000, 0b10, (outs GPRNoX0:$rs1), (ins GPRNoX0:$rs2),
- "c.mv", "$rs1, $rs2">;
+ "c.mv", "$rs1, $rs2">,
+ Sched<[WriteIALU, ReadIALU]>;
let rs1 = 0, rs2 = 0, hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
-def C_EBREAK : RVInst16CR<0b1001, 0b10, (outs), (ins), "c.ebreak", "">;
+def C_EBREAK : RVInst16CR<0b1001, 0b10, (outs), (ins), "c.ebreak", "">, Sched<[]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0,
isCall=1, Defs=[X1], rs2 = 0 in
def C_JALR : RVInst16CR<0b1001, 0b10, (outs), (ins GPRNoX0:$rs1),
- "c.jalr", "$rs1">;
+ "c.jalr", "$rs1">, Sched<[WriteJalr, ReadJalr]>;
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
def C_ADD : RVInst16CR<0b1001, 0b10, (outs GPRNoX0:$rs1_wb),
(ins GPRNoX0:$rs1, GPRNoX0:$rs2),
- "c.add", "$rs1, $rs2"> {
+ "c.add", "$rs1, $rs2">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]> {
let Constraints = "$rs1 = $rs1_wb";
}
let Predicates = [HasStdExtC, HasStdExtD] in
-def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000> {
+def C_FSDSP : CStackStore<0b101, "c.fsdsp", FPR64, uimm9_lsb000>,
+ Sched<[WriteFST64, ReadStoreData, ReadMemBase]> {
let Inst{12-10} = imm{5-3};
let Inst{9-7} = imm{8-6};
}
-def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00> {
+def C_SWSP : CStackStore<0b110, "c.swsp", GPR, uimm8_lsb00>,
+ Sched<[WriteSTW, ReadStoreData, ReadMemBase]> {
let Inst{12-9} = imm{5-2};
let Inst{8-7} = imm{7-6};
}
let DecoderNamespace = "RISCV32Only_",
Predicates = [HasStdExtC, HasStdExtF, IsRV32] in
-def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00> {
+def C_FSWSP : CStackStore<0b111, "c.fswsp", FPR32, uimm8_lsb00>,
+ Sched<[WriteFST32, ReadStoreData, ReadMemBase]> {
let Inst{12-9} = imm{5-2};
let Inst{8-7} = imm{7-6};
}
let Predicates = [HasStdExtC, IsRV64] in
-def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000> {
+def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000>,
+ Sched<[WriteSTD, ReadStoreData, ReadMemBase]> {
let Inst{12-10} = imm{5-3};
let Inst{9-7} = imm{8-6};
}
@@ -535,7 +571,8 @@ def C_SDSP : CStackStore<0b111, "c.sdsp", GPR, uimm9_lsb000> {
// The all zeros pattern isn't a valid RISC-V instruction. It's used by GNU
// binutils as 16-bit instruction known to be unimplemented (i.e., trapping).
let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in
-def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther> {
+def C_UNIMP : RVInst16<(outs), (ins), "c.unimp", "", [], InstFormatOther>,
+ Sched<[]> {
let Inst{15-0} = 0;
}
@@ -551,7 +588,7 @@ let Predicates = [HasStdExtC, HasRVCHints], hasSideEffects = 0, mayLoad = 0,
let rd = 0 in
def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm),
- "c.nop", "$imm"> {
+ "c.nop", "$imm">, Sched<[WriteNop]> {
let Inst{6-2} = imm{4-0};
let DecoderMethod = "decodeRVCInstrSImm";
}
@@ -559,7 +596,8 @@ def C_NOP_HINT : RVInst16CI<0b000, 0b01, (outs), (ins simm6nonzero:$imm),
// Just a different syntax for the c.nop hint: c.addi x0, simm6 vs c.nop simm6.
def C_ADDI_HINT_X0 : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb),
(ins GPRX0:$rd, simm6nonzero:$imm),
- "c.addi", "$rd, $imm"> {
+ "c.addi", "$rd, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
let isAsmParserOnly = 1;
@@ -567,14 +605,16 @@ def C_ADDI_HINT_X0 : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb),
def C_ADDI_HINT_IMM_ZERO : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb),
(ins GPRNoX0:$rd, immzero:$imm),
- "c.addi", "$rd, $imm"> {
+ "c.addi", "$rd, $imm">,
+ Sched<[WriteIALU, ReadIALU]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let isAsmParserOnly = 1;
}
def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm),
- "c.li", "$rd, $imm"> {
+ "c.li", "$rd, $imm">,
+ Sched<[WriteIALU]> {
let Inst{6-2} = imm{4-0};
let Inst{11-7} = 0;
let DecoderMethod = "decodeRVCInstrRdSImm";
@@ -582,14 +622,15 @@ def C_LI_HINT : RVInst16CI<0b010, 0b01, (outs GPRX0:$rd), (ins simm6:$imm),
def C_LUI_HINT : RVInst16CI<0b011, 0b01, (outs GPRX0:$rd),
(ins c_lui_imm:$imm),
- "c.lui", "$rd, $imm"> {
+ "c.lui", "$rd, $imm">,
+ Sched<[WriteIALU]> {
let Inst{6-2} = imm{4-0};
let Inst{11-7} = 0;
let DecoderMethod = "decodeRVCInstrRdSImm";
}
def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2),
- "c.mv", "$rs1, $rs2">
+ "c.mv", "$rs1, $rs2">, Sched<[WriteIALU, ReadIALU]>
{
let Inst{11-7} = 0;
let DecoderMethod = "decodeRVCInstrRdRs2";
@@ -597,7 +638,8 @@ def C_MV_HINT : RVInst16CR<0b1000, 0b10, (outs GPRX0:$rs1), (ins GPRNoX0:$rs2),
def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rs1_wb),
(ins GPRX0:$rs1, GPRNoX0:$rs2),
- "c.add", "$rs1, $rs2"> {
+ "c.add", "$rs1, $rs2">,
+ Sched<[WriteIALU, ReadIALU, ReadIALU]> {
let Constraints = "$rs1 = $rs1_wb";
let Inst{11-7} = 0;
let DecoderMethod = "decodeRVCInstrRdRs1Rs2";
@@ -605,7 +647,8 @@ def C_ADD_HINT : RVInst16CR<0b1001, 0b10, (outs GPRX0:$rs1_wb),
def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
(ins GPRX0:$rd, uimmlog2xlennonzero:$imm),
- "c.slli" ,"$rd, $imm"> {
+ "c.slli" ,"$rd, $imm">,
+ Sched<[WriteShift, ReadShift]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = imm{4-0};
let Inst{11-7} = 0;
@@ -613,7 +656,8 @@ def C_SLLI_HINT : RVInst16CI<0b000, 0b10, (outs GPRX0:$rd_wb),
}
def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
- "c.slli64" ,"$rd"> {
+ "c.slli64" ,"$rd">,
+ Sched<[WriteShift, ReadShift]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let Inst{12} = 0;
@@ -621,7 +665,8 @@ def C_SLLI64_HINT : RVInst16CI<0b000, 0b10, (outs GPR:$rd_wb), (ins GPR:$rd),
def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
(ins GPRC:$rd),
- "c.srli64", "$rd"> {
+ "c.srli64", "$rd">,
+ Sched<[WriteShift, ReadShift]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let Inst{11-10} = 0;
@@ -630,7 +675,8 @@ def C_SRLI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
def C_SRAI64_HINT : RVInst16CI<0b100, 0b01, (outs GPRC:$rd_wb),
(ins GPRC:$rd),
- "c.srai64", "$rd"> {
+ "c.srai64", "$rd">,
+ Sched<[WriteShift, ReadShift]> {
let Constraints = "$rd = $rd_wb";
let Inst{6-2} = 0;
let Inst{11-10} = 1;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
index b5343e8a8309..6c36f53cd563 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoD.td
@@ -57,7 +57,8 @@ class FPALUDDynFrmAlias<FPALUD_rr_frm Inst, string OpcodeStr>
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class FPCmpD_rr<bits<3> funct3, string opcodestr>
: RVInstR<0b1010001, funct3, OPC_OP_FP, (outs GPR:$rd),
- (ins FPR64:$rs1, FPR64:$rs2), opcodestr, "$rd, $rs1, $rs2">;
+ (ins FPR64:$rs1, FPR64:$rs2), opcodestr, "$rd, $rs1, $rs2">,
+ Sched<[WriteFCmp64, ReadFCmp64, ReadFCmp64]>;
//===----------------------------------------------------------------------===//
// Instructions
@@ -68,7 +69,8 @@ let Predicates = [HasStdExtD] in {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
def FLD : RVInstI<0b011, OPC_LOAD_FP, (outs FPR64:$rd),
(ins GPR:$rs1, simm12:$imm12),
- "fld", "$rd, ${imm12}(${rs1})">;
+ "fld", "$rd, ${imm12}(${rs1})">,
+ Sched<[WriteFLD64, ReadFMemBase]>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
@@ -76,43 +78,60 @@ def FLD : RVInstI<0b011, OPC_LOAD_FP, (outs FPR64:$rd),
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def FSD : RVInstS<0b011, OPC_STORE_FP, (outs),
(ins FPR64:$rs2, GPR:$rs1, simm12:$imm12),
- "fsd", "$rs2, ${imm12}(${rs1})">;
+ "fsd", "$rs2, ${imm12}(${rs1})">,
+ Sched<[WriteFST64, ReadStoreData, ReadFMemBase]>;
-def FMADD_D : FPFMAD_rrr_frm<OPC_MADD, "fmadd.d">;
+def FMADD_D : FPFMAD_rrr_frm<OPC_MADD, "fmadd.d">,
+ Sched<[WriteFMulAdd64, ReadFMulAdd64, ReadFMulAdd64, ReadFMulAdd64]>;
def : FPFMADDynFrmAlias<FMADD_D, "fmadd.d">;
-def FMSUB_D : FPFMAD_rrr_frm<OPC_MSUB, "fmsub.d">;
+def FMSUB_D : FPFMAD_rrr_frm<OPC_MSUB, "fmsub.d">,
+ Sched<[WriteFMulSub64, ReadFMulSub64, ReadFMulSub64, ReadFMulSub64]>;
def : FPFMADDynFrmAlias<FMSUB_D, "fmsub.d">;
-def FNMSUB_D : FPFMAD_rrr_frm<OPC_NMSUB, "fnmsub.d">;
+def FNMSUB_D : FPFMAD_rrr_frm<OPC_NMSUB, "fnmsub.d">,
+ Sched<[WriteFMulSub64, ReadFMulSub64, ReadFMulSub64, ReadFMulSub64]>;
def : FPFMADDynFrmAlias<FNMSUB_D, "fnmsub.d">;
-def FNMADD_D : FPFMAD_rrr_frm<OPC_NMADD, "fnmadd.d">;
+def FNMADD_D : FPFMAD_rrr_frm<OPC_NMADD, "fnmadd.d">,
+ Sched<[WriteFMulAdd64, ReadFMulAdd64, ReadFMulAdd64, ReadFMulAdd64]>;
def : FPFMADDynFrmAlias<FNMADD_D, "fnmadd.d">;
-def FADD_D : FPALUD_rr_frm<0b0000001, "fadd.d">;
+def FADD_D : FPALUD_rr_frm<0b0000001, "fadd.d">,
+ Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>;
def : FPALUDDynFrmAlias<FADD_D, "fadd.d">;
-def FSUB_D : FPALUD_rr_frm<0b0000101, "fsub.d">;
+def FSUB_D : FPALUD_rr_frm<0b0000101, "fsub.d">,
+ Sched<[WriteFALU64, ReadFALU64, ReadFALU64]>;
def : FPALUDDynFrmAlias<FSUB_D, "fsub.d">;
-def FMUL_D : FPALUD_rr_frm<0b0001001, "fmul.d">;
+def FMUL_D : FPALUD_rr_frm<0b0001001, "fmul.d">,
+ Sched<[WriteFMul64, ReadFMul64, ReadFMul64]>;
def : FPALUDDynFrmAlias<FMUL_D, "fmul.d">;
-def FDIV_D : FPALUD_rr_frm<0b0001101, "fdiv.d">;
+def FDIV_D : FPALUD_rr_frm<0b0001101, "fdiv.d">,
+ Sched<[WriteFDiv64, ReadFDiv64, ReadFDiv64]>;
def : FPALUDDynFrmAlias<FDIV_D, "fdiv.d">;
-def FSQRT_D : FPUnaryOp_r_frm<0b0101101, FPR64, FPR64, "fsqrt.d"> {
+def FSQRT_D : FPUnaryOp_r_frm<0b0101101, FPR64, FPR64, "fsqrt.d">,
+ Sched<[WriteFSqrt64, ReadFSqrt64]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FSQRT_D, "fsqrt.d", FPR64, FPR64>;
-def FSGNJ_D : FPALUD_rr<0b0010001, 0b000, "fsgnj.d">;
-def FSGNJN_D : FPALUD_rr<0b0010001, 0b001, "fsgnjn.d">;
-def FSGNJX_D : FPALUD_rr<0b0010001, 0b010, "fsgnjx.d">;
-def FMIN_D : FPALUD_rr<0b0010101, 0b000, "fmin.d">;
-def FMAX_D : FPALUD_rr<0b0010101, 0b001, "fmax.d">;
-
-def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, FPR32, FPR64, "fcvt.s.d"> {
+def FSGNJ_D : FPALUD_rr<0b0010001, 0b000, "fsgnj.d">,
+ Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>;
+def FSGNJN_D : FPALUD_rr<0b0010001, 0b001, "fsgnjn.d">,
+ Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>;
+def FSGNJX_D : FPALUD_rr<0b0010001, 0b010, "fsgnjx.d">,
+ Sched<[WriteFSGNJ64, ReadFSGNJ64, ReadFSGNJ64]>;
+def FMIN_D : FPALUD_rr<0b0010101, 0b000, "fmin.d">,
+ Sched<[WriteFMinMax64, ReadFMinMax64, ReadFMinMax64]>;
+def FMAX_D : FPALUD_rr<0b0010101, 0b001, "fmax.d">,
+ Sched<[WriteFMinMax64, ReadFMinMax64, ReadFMinMax64]>;
+
+def FCVT_S_D : FPUnaryOp_r_frm<0b0100000, FPR32, FPR64, "fcvt.s.d">,
+ Sched<[WriteFCvtF64ToF32, ReadFCvtF64ToF32]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_S_D, "fcvt.s.d", FPR32, FPR64>;
-def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR32, "fcvt.d.s"> {
+def FCVT_D_S : FPUnaryOp_r<0b0100001, 0b000, FPR64, FPR32, "fcvt.d.s">,
+ Sched<[WriteFCvtF32ToF64, ReadFCvtF32ToF64]> {
let rs2 = 0b00000;
}
@@ -120,55 +139,66 @@ def FEQ_D : FPCmpD_rr<0b010, "feq.d">;
def FLT_D : FPCmpD_rr<0b001, "flt.d">;
def FLE_D : FPCmpD_rr<0b000, "fle.d">;
-def FCLASS_D : FPUnaryOp_r<0b1110001, 0b001, GPR, FPR64, "fclass.d"> {
+def FCLASS_D : FPUnaryOp_r<0b1110001, 0b001, GPR, FPR64, "fclass.d">,
+ Sched<[WriteFClass64, ReadFClass64]> {
let rs2 = 0b00000;
}
-def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.w.d"> {
+def FCVT_W_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.w.d">,
+ Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FCVT_W_D, "fcvt.w.d", GPR, FPR64>;
-def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.wu.d"> {
+def FCVT_WU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.wu.d">,
+ Sched<[WriteFCvtF64ToI32, ReadFCvtF64ToI32]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_WU_D, "fcvt.wu.d", GPR, FPR64>;
-def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.w"> {
+def FCVT_D_W : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.w">,
+ Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]> {
let rs2 = 0b00000;
}
-def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.wu"> {
+def FCVT_D_WU : FPUnaryOp_r<0b1101001, 0b000, FPR64, GPR, "fcvt.d.wu">,
+ Sched<[WriteFCvtI32ToF64, ReadFCvtI32ToF64]> {
let rs2 = 0b00001;
}
} // Predicates = [HasStdExtD]
let Predicates = [HasStdExtD, IsRV64] in {
-def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.l.d"> {
+def FCVT_L_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.l.d">,
+ Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]> {
let rs2 = 0b00010;
}
def : FPUnaryOpDynFrmAlias<FCVT_L_D, "fcvt.l.d", GPR, FPR64>;
-def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.lu.d"> {
+def FCVT_LU_D : FPUnaryOp_r_frm<0b1100001, GPR, FPR64, "fcvt.lu.d">,
+ Sched<[WriteFCvtF64ToI64, ReadFCvtF64ToI64]> {
let rs2 = 0b00011;
}
def : FPUnaryOpDynFrmAlias<FCVT_LU_D, "fcvt.lu.d", GPR, FPR64>;
-def FMV_X_D : FPUnaryOp_r<0b1110001, 0b000, GPR, FPR64, "fmv.x.d"> {
+def FMV_X_D : FPUnaryOp_r<0b1110001, 0b000, GPR, FPR64, "fmv.x.d">,
+ Sched<[WriteFMovF64ToI64, ReadFMovF64ToI64]> {
let rs2 = 0b00000;
}
-def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.l"> {
+def FCVT_D_L : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.l">,
+ Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]> {
let rs2 = 0b00010;
}
def : FPUnaryOpDynFrmAlias<FCVT_D_L, "fcvt.d.l", FPR64, GPR>;
-def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.lu"> {
+def FCVT_D_LU : FPUnaryOp_r_frm<0b1101001, FPR64, GPR, "fcvt.d.lu">,
+ Sched<[WriteFCvtI64ToF64, ReadFCvtI64ToF64]> {
let rs2 = 0b00011;
}
def : FPUnaryOpDynFrmAlias<FCVT_D_LU, "fcvt.d.lu", FPR64, GPR>;
-def FMV_D_X : FPUnaryOp_r<0b1111001, 0b000, FPR64, GPR, "fmv.d.x"> {
+def FMV_D_X : FPUnaryOp_r<0b1111001, 0b000, FPR64, GPR, "fmv.d.x">,
+ Sched<[WriteFMovI64ToF64, ReadFMovI64ToF64]> {
let rs2 = 0b00000;
}
} // Predicates = [HasStdExtD, IsRV64]
@@ -276,11 +306,15 @@ def : PatFpr64Fpr64<setole, FLE_D>;
def : Pat<(seto FPR64:$rs1, FPR64:$rs2),
(AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
(FEQ_D FPR64:$rs2, FPR64:$rs2))>;
+def : Pat<(seto FPR64:$rs1, FPR64:$rs1),
+ (FEQ_D $rs1, $rs1)>;
def : Pat<(setuo FPR64:$rs1, FPR64:$rs2),
(SLTIU (AND (FEQ_D FPR64:$rs1, FPR64:$rs1),
(FEQ_D FPR64:$rs2, FPR64:$rs2)),
1)>;
+def : Pat<(setuo FPR64:$rs1, FPR64:$rs1),
+ (SLTIU (FEQ_D $rs1, $rs1), 1)>;
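The two same-register patterns added above (and their f32 twins later in the RISCVInstrInfoF.td hunk) rest on feq returning 1 exactly when its operand is not NaN, so comparing a value against itself needs a single feq. A small C sketch of the predicates being matched, with illustrative function names:

#include <stdbool.h>

/* seto: true iff neither operand is NaN. feq.d x, x is 1 exactly when x is
   not NaN, so ANDing two feq results gives the two-operand form and a single
   feq covers the same-register form matched above. */
static bool ordered(double a, double b)   { return (a == a) && (b == b); }
static bool ordered_same(double a)        { return a == a; }

/* setuo: true iff at least one operand is NaN; the SLTIU ..., 1 in the
   patterns is the unsigned "x < 1", i.e. a logical NOT of the 0/1 feq result. */
static bool unordered(double a, double b) { return !((a == a) && (b == b)); }
static bool unordered_same(double a)      { return !(a == a); }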
def Select_FPR64_Using_CC_GPR : SelectCC_rrirr<FPR64, GPR>;
@@ -309,6 +343,10 @@ def SplitF64Pseudo
} // Predicates = [HasStdExtD]
let Predicates = [HasStdExtD, IsRV32] in {
+
+/// Float constants
+def : Pat<(f64 (fpimm0)), (FCVT_D_W X0)>;
+
// double->[u]int. Round-to-zero must be used.
def : Pat<(fp_to_sint FPR64:$rs1), (FCVT_W_D FPR64:$rs1, 0b001)>;
def : Pat<(fp_to_uint FPR64:$rs1), (FCVT_WU_D FPR64:$rs1, 0b001)>;
@@ -319,6 +357,10 @@ def : Pat<(uint_to_fp GPR:$rs1), (FCVT_D_WU GPR:$rs1)>;
} // Predicates = [HasStdExtD, IsRV32]
let Predicates = [HasStdExtD, IsRV64] in {
+
+/// Float constants
+def : Pat<(f64 (fpimm0)), (FMV_D_X X0)>;
+
def : Pat<(bitconvert GPR:$rs1), (FMV_D_X GPR:$rs1)>;
def : Pat<(bitconvert FPR64:$rs1), (FMV_X_D FPR64:$rs1)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
index 3b73c865ea17..ce5c3abb6a06 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoF.td
@@ -93,7 +93,8 @@ class FPUnaryOpDynFrmAlias<FPUnaryOp_r_frm Inst, string OpcodeStr,
let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in
class FPCmpS_rr<bits<3> funct3, string opcodestr>
: RVInstR<0b1010000, funct3, OPC_OP_FP, (outs GPR:$rd),
- (ins FPR32:$rs1, FPR32:$rs2), opcodestr, "$rd, $rs1, $rs2">;
+ (ins FPR32:$rs1, FPR32:$rs2), opcodestr, "$rd, $rs1, $rs2">,
+ Sched<[WriteFCmp32, ReadFCmp32, ReadFCmp32]>;
//===----------------------------------------------------------------------===//
// Instructions
@@ -103,7 +104,8 @@ let Predicates = [HasStdExtF] in {
let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in
def FLW : RVInstI<0b010, OPC_LOAD_FP, (outs FPR32:$rd),
(ins GPR:$rs1, simm12:$imm12),
- "flw", "$rd, ${imm12}(${rs1})">;
+ "flw", "$rd, ${imm12}(${rs1})">,
+ Sched<[WriteFLD32, ReadFMemBase]>;
// Operands for stores are in the order srcreg, base, offset rather than
// reflecting the order these fields are specified in the instruction
@@ -111,48 +113,66 @@ def FLW : RVInstI<0b010, OPC_LOAD_FP, (outs FPR32:$rd),
let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in
def FSW : RVInstS<0b010, OPC_STORE_FP, (outs),
(ins FPR32:$rs2, GPR:$rs1, simm12:$imm12),
- "fsw", "$rs2, ${imm12}(${rs1})">;
+ "fsw", "$rs2, ${imm12}(${rs1})">,
+ Sched<[WriteFST32, ReadStoreData, ReadFMemBase]>;
-def FMADD_S : FPFMAS_rrr_frm<OPC_MADD, "fmadd.s">;
+def FMADD_S : FPFMAS_rrr_frm<OPC_MADD, "fmadd.s">,
+ Sched<[WriteFMulAdd32, ReadFMulAdd32, ReadFMulAdd32, ReadFMulAdd32]>;
def : FPFMASDynFrmAlias<FMADD_S, "fmadd.s">;
-def FMSUB_S : FPFMAS_rrr_frm<OPC_MSUB, "fmsub.s">;
+def FMSUB_S : FPFMAS_rrr_frm<OPC_MSUB, "fmsub.s">,
+ Sched<[WriteFMulSub32, ReadFMulSub32, ReadFMulSub32, ReadFMulSub32]>;
def : FPFMASDynFrmAlias<FMSUB_S, "fmsub.s">;
-def FNMSUB_S : FPFMAS_rrr_frm<OPC_NMSUB, "fnmsub.s">;
+def FNMSUB_S : FPFMAS_rrr_frm<OPC_NMSUB, "fnmsub.s">,
+ Sched<[WriteFMulSub32, ReadFMulSub32, ReadFMulSub32, ReadFMulSub32]>;
def : FPFMASDynFrmAlias<FNMSUB_S, "fnmsub.s">;
-def FNMADD_S : FPFMAS_rrr_frm<OPC_NMADD, "fnmadd.s">;
+def FNMADD_S : FPFMAS_rrr_frm<OPC_NMADD, "fnmadd.s">,
+ Sched<[WriteFMulAdd32, ReadFMulAdd32, ReadFMulAdd32, ReadFMulAdd32]>;
def : FPFMASDynFrmAlias<FNMADD_S, "fnmadd.s">;
-def FADD_S : FPALUS_rr_frm<0b0000000, "fadd.s">;
+def FADD_S : FPALUS_rr_frm<0b0000000, "fadd.s">,
+ Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>;
def : FPALUSDynFrmAlias<FADD_S, "fadd.s">;
-def FSUB_S : FPALUS_rr_frm<0b0000100, "fsub.s">;
+def FSUB_S : FPALUS_rr_frm<0b0000100, "fsub.s">,
+ Sched<[WriteFALU32, ReadFALU32, ReadFALU32]>;
def : FPALUSDynFrmAlias<FSUB_S, "fsub.s">;
-def FMUL_S : FPALUS_rr_frm<0b0001000, "fmul.s">;
+def FMUL_S : FPALUS_rr_frm<0b0001000, "fmul.s">,
+ Sched<[WriteFMul32, ReadFMul32, ReadFMul32]>;
def : FPALUSDynFrmAlias<FMUL_S, "fmul.s">;
-def FDIV_S : FPALUS_rr_frm<0b0001100, "fdiv.s">;
+def FDIV_S : FPALUS_rr_frm<0b0001100, "fdiv.s">,
+ Sched<[WriteFDiv32, ReadFDiv32, ReadFDiv32]>;
def : FPALUSDynFrmAlias<FDIV_S, "fdiv.s">;
-def FSQRT_S : FPUnaryOp_r_frm<0b0101100, FPR32, FPR32, "fsqrt.s"> {
+def FSQRT_S : FPUnaryOp_r_frm<0b0101100, FPR32, FPR32, "fsqrt.s">,
+ Sched<[WriteFSqrt32, ReadFSqrt32]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FSQRT_S, "fsqrt.s", FPR32, FPR32>;
-def FSGNJ_S : FPALUS_rr<0b0010000, 0b000, "fsgnj.s">;
-def FSGNJN_S : FPALUS_rr<0b0010000, 0b001, "fsgnjn.s">;
-def FSGNJX_S : FPALUS_rr<0b0010000, 0b010, "fsgnjx.s">;
-def FMIN_S : FPALUS_rr<0b0010100, 0b000, "fmin.s">;
-def FMAX_S : FPALUS_rr<0b0010100, 0b001, "fmax.s">;
-
-def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.w.s"> {
+def FSGNJ_S : FPALUS_rr<0b0010000, 0b000, "fsgnj.s">,
+ Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>;
+def FSGNJN_S : FPALUS_rr<0b0010000, 0b001, "fsgnjn.s">,
+ Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>;
+def FSGNJX_S : FPALUS_rr<0b0010000, 0b010, "fsgnjx.s">,
+ Sched<[WriteFSGNJ32, ReadFSGNJ32, ReadFSGNJ32]>;
+def FMIN_S : FPALUS_rr<0b0010100, 0b000, "fmin.s">,
+ Sched<[WriteFMinMax32, ReadFMinMax32, ReadFMinMax32]>;
+def FMAX_S : FPALUS_rr<0b0010100, 0b001, "fmax.s">,
+ Sched<[WriteFMinMax32, ReadFMinMax32, ReadFMinMax32]>;
+
+def FCVT_W_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.w.s">,
+ Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FCVT_W_S, "fcvt.w.s", GPR, FPR32>;
-def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.wu.s"> {
+def FCVT_WU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.wu.s">,
+ Sched<[WriteFCvtF32ToI32, ReadFCvtF32ToI32]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_WU_S, "fcvt.wu.s", GPR, FPR32>;
-def FMV_X_W : FPUnaryOp_r<0b1110000, 0b000, GPR, FPR32, "fmv.x.w"> {
+def FMV_X_W : FPUnaryOp_r<0b1110000, 0b000, GPR, FPR32, "fmv.x.w">,
+ Sched<[WriteFMovF32ToI32, ReadFMovF32ToI32]> {
let rs2 = 0b00000;
}
@@ -160,42 +180,50 @@ def FEQ_S : FPCmpS_rr<0b010, "feq.s">;
def FLT_S : FPCmpS_rr<0b001, "flt.s">;
def FLE_S : FPCmpS_rr<0b000, "fle.s">;
-def FCLASS_S : FPUnaryOp_r<0b1110000, 0b001, GPR, FPR32, "fclass.s"> {
+def FCLASS_S : FPUnaryOp_r<0b1110000, 0b001, GPR, FPR32, "fclass.s">,
+ Sched<[WriteFClass32, ReadFClass32]> {
let rs2 = 0b00000;
}
-def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.w"> {
+def FCVT_S_W : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.w">,
+ Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]> {
let rs2 = 0b00000;
}
def : FPUnaryOpDynFrmAlias<FCVT_S_W, "fcvt.s.w", FPR32, GPR>;
-def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.wu"> {
+def FCVT_S_WU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.wu">,
+ Sched<[WriteFCvtI32ToF32, ReadFCvtI32ToF32]> {
let rs2 = 0b00001;
}
def : FPUnaryOpDynFrmAlias<FCVT_S_WU, "fcvt.s.wu", FPR32, GPR>;
-def FMV_W_X : FPUnaryOp_r<0b1111000, 0b000, FPR32, GPR, "fmv.w.x"> {
+def FMV_W_X : FPUnaryOp_r<0b1111000, 0b000, FPR32, GPR, "fmv.w.x">,
+ Sched<[WriteFMovI32ToF32, ReadFMovI32ToF32]> {
let rs2 = 0b00000;
}
} // Predicates = [HasStdExtF]
let Predicates = [HasStdExtF, IsRV64] in {
-def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.l.s"> {
+def FCVT_L_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.l.s">,
+ Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]> {
let rs2 = 0b00010;
}
def : FPUnaryOpDynFrmAlias<FCVT_L_S, "fcvt.l.s", GPR, FPR32>;
-def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.lu.s"> {
+def FCVT_LU_S : FPUnaryOp_r_frm<0b1100000, GPR, FPR32, "fcvt.lu.s">,
+ Sched<[WriteFCvtF32ToI64, ReadFCvtF32ToI64]> {
let rs2 = 0b00011;
}
def : FPUnaryOpDynFrmAlias<FCVT_LU_S, "fcvt.lu.s", GPR, FPR32>;
-def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.l"> {
+def FCVT_S_L : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.l">,
+ Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]> {
let rs2 = 0b00010;
}
def : FPUnaryOpDynFrmAlias<FCVT_S_L, "fcvt.s.l", FPR32, GPR>;
-def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.lu"> {
+def FCVT_S_LU : FPUnaryOp_r_frm<0b1101000, FPR32, GPR, "fcvt.s.lu">,
+ Sched<[WriteFCvtI64ToF32, ReadFCvtI64ToF32]> {
let rs2 = 0b00011;
}
def : FPUnaryOpDynFrmAlias<FCVT_S_LU, "fcvt.s.lu", FPR32, GPR>;
@@ -258,6 +286,9 @@ def PseudoFSW : PseudoStore<"fsw", FPR32>;
// Pseudo-instructions and codegen patterns
//===----------------------------------------------------------------------===//
+/// Floating point constants
+def fpimm0 : PatLeaf<(fpimm), [{ return N->isExactlyValue(+0.0); }]>;
+
/// Generic pattern classes
class PatFpr32Fpr32<SDPatternOperator OpNode, RVInstR Inst>
: Pat<(OpNode FPR32:$rs1, FPR32:$rs2), (Inst $rs1, $rs2)>;
@@ -267,6 +298,9 @@ class PatFpr32Fpr32DynFrm<SDPatternOperator OpNode, RVInstRFrm Inst>
let Predicates = [HasStdExtF] in {
+/// Float constants
+def : Pat<(f32 (fpimm0)), (FMV_W_X X0)>;
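This fpimm0 pattern, like the f64 ones in the RISCVInstrInfoD.td hunk above, materializes +0.0 straight from x0 instead of loading it from memory, because IEEE +0.0 is the all-zeros bit pattern. A C sketch of the idea, with an illustrative function name:

#include <stdint.h>
#include <string.h>

/* +0.0f is the all-zeros bit pattern, so moving the integer zero held in x0
   into an FP register yields the constant with no load; the f64 patterns do
   the same from x0 via fcvt.d.w on RV32 and fmv.d.x on RV64. */
static float fpimm0_from_x0(void) {
  uint32_t bits = 0;               /* the value read from x0 */
  float f;
  memcpy(&f, &bits, sizeof f);
  return f;                        /* == +0.0f */
}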
+
/// Float conversion operations
// Moves (no conversion)
@@ -332,11 +366,15 @@ def : PatFpr32Fpr32<setole, FLE_S>;
def : Pat<(seto FPR32:$rs1, FPR32:$rs2),
(AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
(FEQ_S FPR32:$rs2, FPR32:$rs2))>;
+def : Pat<(seto FPR32:$rs1, FPR32:$rs1),
+ (FEQ_S $rs1, $rs1)>;
def : Pat<(setuo FPR32:$rs1, FPR32:$rs2),
(SLTIU (AND (FEQ_S FPR32:$rs1, FPR32:$rs1),
(FEQ_S FPR32:$rs2, FPR32:$rs2)),
1)>;
+def : Pat<(setuo FPR32:$rs1, FPR32:$rs1),
+ (SLTIU (FEQ_S $rs1, $rs1), 1)>;
def Select_FPR32_Using_CC_GPR : SelectCC_rrirr<FPR32, GPR>;
@@ -360,16 +398,6 @@ def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
} // Predicates = [HasStdExtF, IsRV32]
-let Predicates = [HasStdExtF, IsRV32] in {
-// FP->[u]int. Round-to-zero must be used
-def : Pat<(fp_to_sint FPR32:$rs1), (FCVT_W_S $rs1, 0b001)>;
-def : Pat<(fp_to_uint FPR32:$rs1), (FCVT_WU_S $rs1, 0b001)>;
-
-// [u]int->fp. Match GCC and default to using dynamic rounding mode.
-def : Pat<(sint_to_fp GPR:$rs1), (FCVT_S_W $rs1, 0b111)>;
-def : Pat<(uint_to_fp GPR:$rs1), (FCVT_S_WU $rs1, 0b111)>;
-} // Predicates = [HasStdExtF, IsRV32]
-
let Predicates = [HasStdExtF, IsRV64] in {
def : Pat<(riscv_fmv_w_x_rv64 GPR:$src), (FMV_W_X GPR:$src)>;
def : Pat<(riscv_fmv_x_anyextw_rv64 FPR32:$src), (FMV_X_W FPR32:$src)>;
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
index e75151ba99c7..987534aadd79 100644
--- a/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoM.td
@@ -24,22 +24,35 @@ def riscv_remuw : SDNode<"RISCVISD::REMUW", SDTIntBinOp>;
//===----------------------------------------------------------------------===//
let Predicates = [HasStdExtM] in {
-def MUL : ALU_rr<0b0000001, 0b000, "mul">;
-def MULH : ALU_rr<0b0000001, 0b001, "mulh">;
-def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">;
-def MULHU : ALU_rr<0b0000001, 0b011, "mulhu">;
-def DIV : ALU_rr<0b0000001, 0b100, "div">;
-def DIVU : ALU_rr<0b0000001, 0b101, "divu">;
-def REM : ALU_rr<0b0000001, 0b110, "rem">;
-def REMU : ALU_rr<0b0000001, 0b111, "remu">;
+def MUL : ALU_rr<0b0000001, 0b000, "mul">,
+ Sched<[WriteIMul, ReadIMul, ReadIMul]>;
+def MULH : ALU_rr<0b0000001, 0b001, "mulh">,
+ Sched<[WriteIMul, ReadIMul, ReadIMul]>;
+def MULHSU : ALU_rr<0b0000001, 0b010, "mulhsu">,
+ Sched<[WriteIMul, ReadIMul, ReadIMul]>;
+def MULHU : ALU_rr<0b0000001, 0b011, "mulhu">,
+ Sched<[WriteIMul, ReadIMul, ReadIMul]>;
+def DIV : ALU_rr<0b0000001, 0b100, "div">,
+ Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
+def DIVU : ALU_rr<0b0000001, 0b101, "divu">,
+ Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
+def REM : ALU_rr<0b0000001, 0b110, "rem">,
+ Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
+def REMU : ALU_rr<0b0000001, 0b111, "remu">,
+ Sched<[WriteIDiv, ReadIDiv, ReadIDiv]>;
} // Predicates = [HasStdExtM]
let Predicates = [HasStdExtM, IsRV64] in {
-def MULW : ALUW_rr<0b0000001, 0b000, "mulw">;
-def DIVW : ALUW_rr<0b0000001, 0b100, "divw">;
-def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">;
-def REMW : ALUW_rr<0b0000001, 0b110, "remw">;
-def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">;
+def MULW : ALUW_rr<0b0000001, 0b000, "mulw">,
+ Sched<[WriteIMul32, ReadIMul32, ReadIMul32]>;
+def DIVW : ALUW_rr<0b0000001, 0b100, "divw">,
+ Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>;
+def DIVUW : ALUW_rr<0b0000001, 0b101, "divuw">,
+ Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>;
+def REMW : ALUW_rr<0b0000001, 0b110, "remw">,
+ Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>;
+def REMUW : ALUW_rr<0b0000001, 0b111, "remuw">,
+ Sched<[WriteIDiv32, ReadIDiv32, ReadIDiv32]>;
} // Predicates = [HasStdExtM, IsRV64]
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoV.td b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
new file mode 100644
index 000000000000..1c7f53fecb8c
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVInstrInfoV.td
@@ -0,0 +1,873 @@
+//===-- RISCVInstrInfoV.td - RISC-V 'V' instructions -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// This file describes the RISC-V instructions from the standard 'V' Vector
+/// extension, version 0.8.
+/// This version is still experimental as the 'V' extension hasn't been
+/// ratified yet.
+///
+//===----------------------------------------------------------------------===//
+
+include "RISCVInstrFormatsV.td"
+
+//===----------------------------------------------------------------------===//
+// Operand and SDNode transformation definitions.
+//===----------------------------------------------------------------------===//
+
+def VTypeIAsmOperand : AsmOperandClass {
+ let Name = "VTypeI";
+ let ParserMethod = "parseVTypeI";
+ let DiagnosticType = "InvalidVTypeI";
+}
+
+def VTypeIOp : Operand<XLenVT> {
+ let ParserMatchClass = VTypeIAsmOperand;
+ let PrintMethod = "printVTypeI";
+ let DecoderMethod = "decodeUImmOperand<11>";
+}
+
+def VRegAsmOperand : AsmOperandClass {
+ let Name = "RVVRegOpOperand";
+ let RenderMethod = "addRegOperands";
+ let PredicateMethod = "isReg";
+ let ParserMethod = "parseRegister";
+}
+
+def VRegOp : RegisterOperand<VR> {
+ let ParserMatchClass = VRegAsmOperand;
+ let PrintMethod = "printOperand";
+}
+
+def VMaskAsmOperand : AsmOperandClass {
+ let Name = "RVVMaskRegOpOperand";
+ let RenderMethod = "addRegOperands";
+ let PredicateMethod = "isV0Reg";
+ let ParserMethod = "parseMaskReg";
+ let IsOptional = 1;
+ let DefaultMethod = "defaultMaskRegOp";
+ let DiagnosticType = "InvalidVMaskRegister";
+}
+
+def VMaskOp : RegisterOperand<VMV0> {
+ let ParserMatchClass = VMaskAsmOperand;
+ let PrintMethod = "printVMaskReg";
+ let EncoderMethod = "getVMaskReg";
+ let DecoderMethod = "decodeVMaskReg";
+}
+
+def simm5 : Operand<XLenVT>, ImmLeaf<XLenVT, [{return isInt<5>(Imm);}]> {
+ let ParserMatchClass = SImmAsmOperand<5>;
+ let EncoderMethod = "getImmOpValue";
+ let DecoderMethod = "decodeSImmOperand<5>";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isInt<5>(Imm);
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
+def SImm5Plus1AsmOperand : AsmOperandClass {
+ let Name = "SImm5Plus1";
+ let RenderMethod = "addSImm5Plus1Operands";
+ let DiagnosticType = "InvalidSImm5Plus1";
+}
+
+def simm5_plus1 : Operand<XLenVT>, ImmLeaf<XLenVT,
+ [{return isInt<5>(Imm - 1);}]> {
+ let ParserMatchClass = SImm5Plus1AsmOperand;
+ let PrintMethod = "printSImm5Plus1";
+ let MCOperandPredicate = [{
+ int64_t Imm;
+ if (MCOp.evaluateAsConstantImm(Imm))
+ return isInt<5>(Imm - 1);
+ return MCOp.isBareSymbolRef();
+ }];
+}
+
+//===----------------------------------------------------------------------===//
+// Instruction class templates
+//===----------------------------------------------------------------------===//
+
+let hasSideEffects = 0, mayLoad = 1, mayStore = 0 in {
+// load vd, (rs1), vm
+class VUnitStrideLoad<RISCVMOP mop, RISCVLSUMOP lumop, RISCVWidth width,
+ string opcodestr>
+ : RVInstVLU<0b000, mop, lumop, width, (outs VRegOp:$vd),
+ (ins GPR:$rs1, VMaskOp:$vm), opcodestr, "$vd, (${rs1})$vm">;
+
+// load vd, (rs1), rs2, vm
+class VStridedLoad<RISCVMOP mop, RISCVWidth width, string opcodestr>
+ : RVInstVLS<0b000, mop, width, (outs VRegOp:$vd),
+ (ins GPR:$rs1, GPR:$rs2, VMaskOp:$vm), opcodestr,
+ "$vd, (${rs1}), $rs2$vm">;
+
+// load vd, (rs1), vs2, vm
+class VIndexedLoad<RISCVMOP mop, RISCVWidth width, string opcodestr>
+ : RVInstVLX<0b000, mop, width, (outs VRegOp:$vd),
+ (ins GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm), opcodestr,
+ "$vd, (${rs1}), $vs2$vm">;
+
+// vl<nf>r.v vd, (rs1)
+class VWholeLoad<bits<3> nf, string opcodestr>
+ : RVInstVLU<nf, MOPLDUnitStrideU, LUMOPUnitStrideWholeReg,
+ LSWidthVSEW, (outs VRegOp:$vd), (ins GPR:$rs1),
+ opcodestr, "$vd, (${rs1})"> {
+ let vm = 1;
+ let Uses = [];
+}
+} // hasSideEffects = 0, mayLoad = 1, mayStore = 0
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 1 in {
+// store vd, vs3, (rs1), vm
+class VUnitStrideStore<RISCVMOP mop, RISCVLSUMOP sumop, RISCVWidth width,
+ string opcodestr>
+ : RVInstVSU<0b000, mop, sumop, width, (outs),
+ (ins VRegOp:$vs3, GPR:$rs1, VMaskOp:$vm), opcodestr,
+ "$vs3, (${rs1})$vm">;
+
+// store vd, vs3, (rs1), rs2, vm
+class VStridedStore<RISCVMOP mop, RISCVWidth width, string opcodestr>
+ : RVInstVSS<0b000, mop, width, (outs),
+ (ins VRegOp:$vs3, GPR:$rs1, GPR:$rs2, VMaskOp:$vm),
+ opcodestr, "$vs3, (${rs1}), $rs2$vm">;
+
+// store vd, vs3, (rs1), vs2, vm
+class VIndexedStore<RISCVMOP mop, RISCVWidth width, string opcodestr>
+ : RVInstVSX<0b000, mop, width, (outs),
+ (ins VRegOp:$vs3, GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm),
+ opcodestr, "$vs3, (${rs1}), $vs2$vm">;
+
+// vs<nf>r.v vd, (rs1)
+class VWholeStore<bits<3> nf, string opcodestr>
+ : RVInstVSU<nf, MOPSTUnitStride, SUMOPUnitStrideWholeReg,
+ LSWidthVSEW, (outs), (ins VRegOp:$vs3, GPR:$rs1),
+ opcodestr, "$vs3, (${rs1})"> {
+ let vm = 1;
+ let Uses = [];
+}
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 1
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// op vd, vs2, vs1, vm
+class VALUVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVV<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, VRegOp:$vs1, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2, $vs1$vm">;
+
+// op vd, vs2, vs1, v0 (without mask, use v0 as carry input)
+class VALUmVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVV<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, VRegOp:$vs1, VMV0:$v0),
+ opcodestr, "$vd, $vs2, $vs1, v0"> {
+ let vm = 0;
+}
+
+// op vd, vs1, vs2, vm (reverse the order of vs1 and vs2)
+class VALUrVV<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVV<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs1, VRegOp:$vs2, VMaskOp:$vm),
+ opcodestr, "$vd, $vs1, $vs2$vm">;
+
+// op vd, vs2, vs1
+class VALUVVNoVm<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVV<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, VRegOp:$vs1),
+ opcodestr, "$vd, $vs2, $vs1"> {
+ let vm = 1;
+}
+
+// op vd, vs2, rs1, vm
+class VALUVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, GPR:$rs1, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2, $rs1$vm">;
+
+// op vd, vs2, rs1, v0 (without mask, use v0 as carry input)
+class VALUmVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, GPR:$rs1, VMV0:$v0),
+ opcodestr, "$vd, $vs2, $rs1, v0"> {
+ let vm = 0;
+}
+
+// op vd, rs1, vs2, vm (reverse the order of rs1 and vs2)
+class VALUrVX<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VRegOp:$vd),
+ (ins GPR:$rs1, VRegOp:$vs2, VMaskOp:$vm),
+ opcodestr, "$vd, $rs1, $vs2$vm">;
+
+// op vd, vs2, rs1
+class VALUVXNoVm<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, GPR:$rs1),
+ opcodestr, "$vd, $vs2, $rs1"> {
+ let vm = 1;
+}
+
+// op vd, vs2, imm, vm
+class VALUVI<bits<6> funct6, string opcodestr, Operand optype = simm5>
+ : RVInstIVI<funct6, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, optype:$imm, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2, $imm$vm">;
+
+// op vd, vs2, imm, v0 (without mask, use v0 as carry input)
+class VALUmVI<bits<6> funct6, string opcodestr, Operand optype = simm5>
+ : RVInstIVI<funct6, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, optype:$imm, VMV0:$v0),
+ opcodestr, "$vd, $vs2, $imm, v0"> {
+ let vm = 0;
+}
+
+// op vd, vs2, imm
+class VALUVINoVm<bits<6> funct6, string opcodestr, Operand optype = simm5>
+ : RVInstIVI<funct6, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, optype:$imm),
+ opcodestr, "$vd, $vs2, $imm"> {
+ let vm = 1;
+}
+
+// op vd, vs2, rs1, vm (Float)
+class VALUVF<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, FPR32:$rs1, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2, $rs1$vm">;
+
+// op vd, rs1, vs2, vm (Float) (with mask, reverse the order of rs1 and vs2)
+class VALUrVF<bits<6> funct6, RISCVVFormat opv, string opcodestr>
+ : RVInstVX<funct6, opv, (outs VRegOp:$vd),
+ (ins FPR32:$rs1, VRegOp:$vs2, VMaskOp:$vm),
+ opcodestr, "$vd, $rs1, $vs2$vm">;
+
+// op vd, vs2, vm (the vs1 field is used as part of the instruction encoding)
+class VALUVs2<bits<6> funct6, bits<5> vs1, RISCVVFormat opv, string opcodestr>
+ : RVInstV<funct6, vs1, opv, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, VMaskOp:$vm),
+ opcodestr, "$vd, $vs2$vm">;
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
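As a reading aid for the VALUm* classes above ("use v0 as carry input"): those forms always write every element, and v0 supplies one carry/borrow bit per element rather than acting as an execution mask. A scalar C sketch of that shape, with element type, names and the packed mask layout as illustrative assumptions (not code from this patch):

#include <stdint.h>

/* Scalar model of a carry-in vector ALU op (the VALUmVV/VALUmVX/VALUmVI
   shapes): v0 provides a per-element carry bit and no element is skipped. */
static void carry_add_vvm(uint32_t *vd, const uint32_t *vs2, const uint32_t *vs1,
                          const uint8_t *v0_mask_bits, int vl) {
  for (int i = 0; i < vl; ++i) {
    uint32_t carry = (v0_mask_bits[i / 8] >> (i % 8)) & 1;
    vd[i] = vs2[i] + vs1[i] + carry;   /* always executed; the vm bit is 0 */
  }
}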
+
+//===----------------------------------------------------------------------===//
+// Combination of instruction classes.
+// Use these multiclasses to define instructions more easily.
+//===----------------------------------------------------------------------===//
+multiclass VALU_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>;
+}
+
+multiclass VALU_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALUr_IV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPIVV, opcodestr # "." # vw # "v">;
+ def X : VALUrVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALU_IV_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5, string vw = "v"> {
+ def X : VALUVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+ def I : VALUVI<funct6, opcodestr # "." # vw # "i", optype>;
+}
+
+multiclass VALU_IV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPIVV, opcodestr # ".vs">;
+}
+
+multiclass VALUr_IV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUrVX<funct6, OPIVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALU_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPMVV, opcodestr # "." # vw # "v">;
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALU_MV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPMVV, opcodestr # ".vs">;
+}
+
+multiclass VALU_MV_Mask<string opcodestr, bits<6> funct6, string vm = "v"> {
+ def M : VALUVVNoVm<funct6, OPMVV, opcodestr # "." # vm # "m">;
+}
+
+multiclass VALU_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALUr_MV_V_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPMVV, opcodestr # "." # vw # "v">;
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALUr_MV_X<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def X : VALUrVX<funct6, OPMVX, opcodestr # "." # vw # "x">;
+}
+
+multiclass VALU_MV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPMVV, opcodestr>;
+}
+
+multiclass VALUm_IV_V_X_I<string opcodestr, bits<6> funct6> {
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">;
+ def IM : VALUmVI<funct6, opcodestr # ".vim">;
+}
+
+multiclass VALUm_IV_V_X<string opcodestr, bits<6> funct6> {
+ def VM : VALUmVV<funct6, OPIVV, opcodestr # ".vvm">;
+ def XM : VALUmVX<funct6, OPIVX, opcodestr # ".vxm">;
+}
+
+multiclass VALUNoVm_IV_V_X_I<string opcodestr, bits<6> funct6, Operand optype = simm5> {
+ def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">;
+ def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">;
+ def I : VALUVINoVm<funct6, opcodestr # ".vi", optype>;
+}
+
+multiclass VALUNoVm_IV_V_X<string opcodestr, bits<6> funct6> {
+ def V : VALUVVNoVm<funct6, OPIVV, opcodestr # ".vv">;
+ def X : VALUVXNoVm<funct6, OPIVX, opcodestr # ".vx">;
+}
+
+multiclass VALU_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUVV<funct6, OPFVV, opcodestr # "." # vw # "v">;
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+}
+
+multiclass VALU_FV_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def F : VALUVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+}
+
+multiclass VALUr_FV_V_F<string opcodestr, bits<6> funct6, string vw = "v"> {
+ def V : VALUrVV<funct6, OPFVV, opcodestr # "." # vw # "v">;
+ def F : VALUrVF<funct6, OPFVF, opcodestr # "." # vw # "f">;
+}
+
+multiclass VALU_FV_V<string opcodestr, bits<6> funct6> {
+ def _VS : VALUVV<funct6, OPFVV, opcodestr # ".vs">;
+}
+
+multiclass VALU_FV_VS2<string opcodestr, bits<6> funct6, bits<5> vs1> {
+ def "" : VALUVs2<funct6, vs1, OPFVV, opcodestr>;
+}
+
+//===----------------------------------------------------------------------===//
+// Instructions
+//===----------------------------------------------------------------------===//
+
+let Predicates = [HasStdExtV] in {
+let hasSideEffects = 1, mayLoad = 0, mayStore = 0 in {
+def VSETVLI : RVInstSetVLi<(outs GPR:$rd), (ins GPR:$rs1, VTypeIOp:$vtypei),
+ "vsetvli", "$rd, $rs1, $vtypei">;
+
+def VSETVL : RVInstSetVL<(outs GPR:$rd), (ins GPR:$rs1, GPR:$rs2),
+ "vsetvl", "$rd, $rs1, $rs2">;
+} // hasSideEffects = 1, mayLoad = 0, mayStore = 0
+
+// Vector Unit-Stride Instructions
+def VLB_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStride, LSWidthVByte, "vlb.v">;
+def VLH_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStride, LSWidthVHalf, "vlh.v">;
+def VLW_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStride, LSWidthVWord, "vlw.v">;
+
+def VLBU_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVByte, "vlbu.v">;
+def VLHU_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVHalf, "vlhu.v">;
+def VLWU_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVWord, "vlwu.v">;
+
+def VLE_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStride, LSWidthVSEW, "vle.v">;
+
+def VLBFF_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStrideFF, LSWidthVByte, "vlbff.v">;
+def VLHFF_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStrideFF, LSWidthVHalf, "vlhff.v">;
+def VLWFF_V : VUnitStrideLoad<MOPLDUnitStrideS, LUMOPUnitStrideFF, LSWidthVWord, "vlwff.v">;
+
+def VLBUFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVByte, "vlbuff.v">;
+def VLHUFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVHalf, "vlhuff.v">;
+def VLWUFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVWord, "vlwuff.v">;
+
+def VLEFF_V : VUnitStrideLoad<MOPLDUnitStrideU, LUMOPUnitStrideFF, LSWidthVSEW, "vleff.v">;
+
+def VSB_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVByte, "vsb.v">;
+def VSH_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVHalf, "vsh.v">;
+def VSW_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVWord, "vsw.v">;
+
+def VSE_V : VUnitStrideStore<MOPSTUnitStride, SUMOPUnitStride, LSWidthVSEW, "vse.v">;
+
+// Vector Strided Instructions
+def VLSB_V : VStridedLoad<MOPLDStridedS, LSWidthVByte, "vlsb.v">;
+def VLSH_V : VStridedLoad<MOPLDStridedS, LSWidthVHalf, "vlsh.v">;
+def VLSW_V : VStridedLoad<MOPLDStridedS, LSWidthVWord, "vlsw.v">;
+
+def VLSBU_V : VStridedLoad<MOPLDStridedU, LSWidthVByte, "vlsbu.v">;
+def VLSHU_V : VStridedLoad<MOPLDStridedU, LSWidthVHalf, "vlshu.v">;
+def VLSWU_V : VStridedLoad<MOPLDStridedU, LSWidthVWord, "vlswu.v">;
+
+def VLSE_V : VStridedLoad<MOPLDStridedU, LSWidthVSEW, "vlse.v">;
+
+def VSSB_V : VStridedStore<MOPSTStrided, LSWidthVByte, "vssb.v">;
+def VSSH_V : VStridedStore<MOPSTStrided, LSWidthVHalf, "vssh.v">;
+def VSSW_V : VStridedStore<MOPSTStrided, LSWidthVWord, "vssw.v">;
+def VSSE_V : VStridedStore<MOPSTStrided, LSWidthVSEW, "vsse.v">;
+
+// Vector Indexed Instructions
+def VLXB_V : VIndexedLoad<MOPLDIndexedS, LSWidthVByte, "vlxb.v">;
+def VLXH_V : VIndexedLoad<MOPLDIndexedS, LSWidthVHalf, "vlxh.v">;
+def VLXW_V : VIndexedLoad<MOPLDIndexedS, LSWidthVWord, "vlxw.v">;
+
+def VLXBU_V : VIndexedLoad<MOPLDIndexedU, LSWidthVByte, "vlxbu.v">;
+def VLXHU_V : VIndexedLoad<MOPLDIndexedU, LSWidthVHalf, "vlxhu.v">;
+def VLXWU_V : VIndexedLoad<MOPLDIndexedU, LSWidthVWord, "vlxwu.v">;
+
+def VLXE_V : VIndexedLoad<MOPLDIndexedU, LSWidthVSEW, "vlxe.v">;
+
+def VSXB_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVByte, "vsxb.v">;
+def VSXH_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVHalf, "vsxh.v">;
+def VSXW_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVWord, "vsxw.v">;
+def VSXE_V : VIndexedStore<MOPSTIndexedOrder, LSWidthVSEW, "vsxe.v">;
+
+def VSUXB_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVByte, "vsuxb.v">;
+def VSUXH_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVHalf, "vsuxh.v">;
+def VSUXW_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVWord, "vsuxw.v">;
+def VSUXE_V : VIndexedStore<MOPSTIndexedUnOrd, LSWidthVSEW, "vsuxe.v">;
+
+def VL1R_V : VWholeLoad<0, "vl1r.v">;
+def VS1R_V : VWholeStore<0, "vs1r.v">;
+
+// Vector Single-Width Integer Add and Subtract
+defm VADD_V : VALU_IV_V_X_I<"vadd", 0b000000>;
+defm VSUB_V : VALU_IV_V_X<"vsub", 0b000010>;
+defm VRSUB_V : VALU_IV_X_I<"vrsub", 0b000011>;
+
+// Vector Widening Integer Add/Subtract
+// Refer to 11.2 Widening Vector Arithmetic Instructions
+// The destination vector register group cannot overlap a source vector
+// register group of a different element width (including the mask register
+// if masked), otherwise an illegal instruction exception is raised.
+let Constraints = "@earlyclobber $vd" in {
+let RVVConstraint = WidenV in {
+defm VWADDU_V : VALU_MV_V_X<"vwaddu", 0b110000>;
+defm VWSUBU_V : VALU_MV_V_X<"vwsubu", 0b110010>;
+defm VWADD_V : VALU_MV_V_X<"vwadd", 0b110001>;
+defm VWSUB_V : VALU_MV_V_X<"vwsub", 0b110011>;
+} // RVVConstraint = WidenV
+// Set earlyclobber for following instructions for second and mask operands.
+// This has the downside that the earlyclobber constraint is too coarse and
+// will impose unnecessary restrictions by not allowing the destination to
+// overlap with the first (wide) operand.
+let RVVConstraint = WidenW in {
+defm VWADDU_W : VALU_MV_V_X<"vwaddu", 0b110100, "w">;
+defm VWSUBU_W : VALU_MV_V_X<"vwsubu", 0b110110, "w">;
+defm VWADD_W : VALU_MV_V_X<"vwadd", 0b110101, "w">;
+defm VWSUB_W : VALU_MV_V_X<"vwsub", 0b110111, "w">;
+} // RVVConstraint = WidenW
+} // Constraints = "@earlyclobber $vd"
+
+def : InstAlias<"vwcvt.x.x.v $vd, $vs$vm",
+ (VWADD_VX VRegOp:$vd, VRegOp:$vs, X0, VMaskOp:$vm)>;
+def : InstAlias<"vwcvtu.x.x.v $vd, $vs$vm",
+ (VWADDU_VX VRegOp:$vd, VRegOp:$vs, X0, VMaskOp:$vm)>;
+
+// Vector Integer Add-with-Carry / Subtract-with-Borrow Instructions
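// Note that vmadc and vmsbc are each instantiated twice below: the VALUm_*
// multiclasses add the ".vvm"/".vxm"/".vim" spellings that take the v0
// carry/borrow operand, while the VALUNoVm_* multiclasses add the unmasked
// ".vv"/".vx"/".vi" spellings, so the resulting record names (VMADC_VVM vs.
// VMADC_VV, etc.) do not collide.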
+defm VADC_V : VALUm_IV_V_X_I<"vadc", 0b010000>;
+defm VMADC_V : VALUm_IV_V_X_I<"vmadc", 0b010001>;
+defm VMADC_V : VALUNoVm_IV_V_X_I<"vmadc", 0b010001>;
+defm VSBC_V : VALUm_IV_V_X<"vsbc", 0b010010>;
+defm VMSBC_V : VALUm_IV_V_X<"vmsbc", 0b010011>;
+defm VMSBC_V : VALUNoVm_IV_V_X<"vmsbc", 0b010011>;
+
+// Vector Bitwise Logical Instructions
+defm VAND_V : VALU_IV_V_X_I<"vand", 0b001001>;
+defm VOR_V : VALU_IV_V_X_I<"vor", 0b001010>;
+defm VXOR_V : VALU_IV_V_X_I<"vxor", 0b001011>;
+
+def : InstAlias<"vnot.v $vd, $vs$vm",
+ (VXOR_VI VRegOp:$vd, VRegOp:$vs, -1, VMaskOp:$vm)>;
+
+// Vector Single-Width Bit Shift Instructions
+defm VSLL_V : VALU_IV_V_X_I<"vsll", 0b100101, uimm5>;
+defm VSRL_V : VALU_IV_V_X_I<"vsrl", 0b101000, uimm5>;
+defm VSRA_V : VALU_IV_V_X_I<"vsra", 0b101001, uimm5>;
+
+// Vector Narrowing Integer Right Shift Instructions
+// Refer to 11.3. Narrowing Vector Arithmetic Instructions
+// The destination vector register group cannot overlap the first source
+// vector register group (specified by vs2). The destination vector register
+// group cannot overlap the mask register if used, unless LMUL=1.
+let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in {
+defm VNSRL_W : VALU_IV_V_X_I<"vnsrl", 0b101100, uimm5, "w">;
+defm VNSRA_W : VALU_IV_V_X_I<"vnsra", 0b101101, uimm5, "w">;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow
+
+// Vector Integer Comparison Instructions
+defm VMSEQ_V : VALU_IV_V_X_I<"vmseq", 0b011000>;
+defm VMSNE_V : VALU_IV_V_X_I<"vmsne", 0b011001>;
+defm VMSLTU_V : VALU_IV_V_X<"vmsltu", 0b011010>;
+defm VMSLT_V : VALU_IV_V_X<"vmslt", 0b011011>;
+defm VMSLEU_V : VALU_IV_V_X_I<"vmsleu", 0b011100>;
+defm VMSLE_V : VALU_IV_V_X_I<"vmsle", 0b011101>;
+defm VMSGTU_V : VALU_IV_X_I<"vmsgtu", 0b011110>;
+defm VMSGT_V : VALU_IV_X_I<"vmsgt", 0b011111>;
+
+def : InstAlias<"vmsgtu.vv $vd, $va, $vb$vm",
+ (VMSLTU_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+def : InstAlias<"vmsgt.vv $vd, $va, $vb$vm",
+ (VMSLT_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+def : InstAlias<"vmsgeu.vv $vd, $va, $vb$vm",
+ (VMSLEU_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+def : InstAlias<"vmsge.vv $vd, $va, $vb$vm",
+ (VMSLE_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+def : InstAlias<"vmsltu.vi $vd, $va, $imm$vm",
+ (VMSLEU_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
+ VMaskOp:$vm), 0>;
+def : InstAlias<"vmslt.vi $vd, $va, $imm$vm",
+ (VMSLE_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
+ VMaskOp:$vm), 0>;
+def : InstAlias<"vmsgeu.vi $vd, $va, $imm$vm",
+ (VMSGTU_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
+ VMaskOp:$vm), 0>;
+def : InstAlias<"vmsge.vi $vd, $va, $imm$vm",
+ (VMSGT_VI VRegOp:$vd, VRegOp:$va, simm5_plus1:$imm,
+ VMaskOp:$vm), 0>;
+
+// Vector Integer Min/Max Instructions
+defm VMINU_V : VALU_IV_V_X<"vminu", 0b000100>;
+defm VMIN_V : VALU_IV_V_X<"vmin", 0b000101>;
+defm VMAXU_V : VALU_IV_V_X<"vmaxu", 0b000110>;
+defm VMAX_V : VALU_IV_V_X<"vmax", 0b000111>;
+
+// Vector Single-Width Integer Multiply Instructions
+defm VMUL_V : VALU_MV_V_X<"vmul", 0b100101>;
+defm VMULH_V : VALU_MV_V_X<"vmulh", 0b100111>;
+defm VMULHU_V : VALU_MV_V_X<"vmulhu", 0b100100>;
+defm VMULHSU_V : VALU_MV_V_X<"vmulhsu", 0b100110>;
+
+// Vector Integer Divide Instructions
+defm VDIVU_V : VALU_MV_V_X<"vdivu", 0b100000>;
+defm VDIV_V : VALU_MV_V_X<"vdiv", 0b100001>;
+defm VREMU_V : VALU_MV_V_X<"vremu", 0b100010>;
+defm VREM_V : VALU_MV_V_X<"vrem", 0b100011>;
+
+// Vector Widening Integer Multiply Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
+defm VWMUL_V : VALU_MV_V_X<"vwmul", 0b111011>;
+defm VWMULU_V : VALU_MV_V_X<"vwmulu", 0b111000>;
+defm VWMULSU_V : VALU_MV_V_X<"vwmulsu", 0b111010>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
+
+// Vector Single-Width Integer Multiply-Add Instructions
+defm VMACC_V : VALUr_MV_V_X<"vmacc", 0b101101>;
+defm VNMSAC_V : VALUr_MV_V_X<"vnmsac", 0b101111>;
+defm VMADD_V : VALUr_MV_V_X<"vmadd", 0b101001>;
+defm VNMSUB_V : VALUr_MV_V_X<"vnmsub", 0b101011>;
+
+// Vector Widening Integer Multiply-Add Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
+defm VWMACCU_V : VALUr_MV_V_X<"vwmaccu", 0b111100>;
+defm VWMACC_V : VALUr_MV_V_X<"vwmacc", 0b111101>;
+defm VWMACCSU_V : VALUr_MV_V_X<"vwmaccsu", 0b111111>;
+defm VWMACCUS_V : VALUr_MV_X<"vwmaccus", 0b111110>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
+
+// Vector Integer Merge Instructions
+defm VMERGE_V : VALUm_IV_V_X_I<"vmerge", 0b010111>;
+
+// Vector Integer Move Instructions
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1 in {
+// op vd, vs1
+def VMV_V_V : RVInstVV<0b010111, OPIVV, (outs VRegOp:$vd),
+ (ins VRegOp:$vs1), "vmv.v.v", "$vd, $vs1">;
+// op vd, rs1
+def VMV_V_X : RVInstVX<0b010111, OPIVX, (outs VRegOp:$vd),
+ (ins GPR:$rs1), "vmv.v.x", "$vd, $rs1">;
+// op vd, imm
+def VMV_V_I : RVInstIVI<0b010111, (outs VRegOp:$vd),
+ (ins simm5:$imm), "vmv.v.i", "$vd, $imm">;
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vs2 = 0, vm = 1
+
+// Vector Fixed-Point Arithmetic Instructions
+defm VSADDU_V : VALU_IV_V_X_I<"vsaddu", 0b100000>;
+defm VSADD_V : VALU_IV_V_X_I<"vsadd", 0b100001>;
+defm VSSUBU_V : VALU_IV_V_X<"vssubu", 0b100010>;
+defm VSSUB_V : VALU_IV_V_X<"vssub", 0b100011>;
+
+// Vector Single-Width Averaging Add and Subtract
+defm VAADDU_V : VALU_MV_V_X<"vaaddu", 0b001000>;
+defm VAADD_V : VALU_MV_V_X<"vaadd", 0b001001>;
+defm VASUBU_V : VALU_MV_V_X<"vasubu", 0b001010>;
+defm VASUB_V : VALU_MV_V_X<"vasub", 0b001011>;
+
+// Vector Single-Width Fractional Multiply with Rounding and Saturation
+defm VSMUL_V : VALU_IV_V_X<"vsmul", 0b100111>;
+
+// Vector Single-Width Scaling Shift Instructions
+defm VSSRL_V : VALU_IV_V_X_I<"vssrl", 0b101010, uimm5>;
+defm VSSRA_V : VALU_IV_V_X_I<"vssra", 0b101011, uimm5>;
+
+// Vector Narrowing Fixed-Point Clip Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in {
+defm VNCLIPU_W : VALU_IV_V_X_I<"vnclipu", 0b101110, uimm5, "w">;
+defm VNCLIP_W : VALU_IV_V_X_I<"vnclip", 0b101111, uimm5, "w">;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow
+
+// Vector Single-Width Floating-Point Add/Subtract Instructions
+defm VFADD_V : VALU_FV_V_F<"vfadd", 0b000000>;
+defm VFSUB_V : VALU_FV_V_F<"vfsub", 0b000010>;
+defm VFRSUB_V : VALU_FV_F<"vfrsub", 0b100111>;
+
+// Vector Widening Floating-Point Add/Subtract Instructions
+let Constraints = "@earlyclobber $vd" in {
+let RVVConstraint = WidenV in {
+defm VFWADD_V : VALU_FV_V_F<"vfwadd", 0b110000>;
+defm VFWSUB_V : VALU_FV_V_F<"vfwsub", 0b110010>;
+} // RVVConstraint = WidenV
+// Set earlyclobber for following instructions for second and mask operands.
+// This has the downside that the earlyclobber constraint is too coarse and
+// will impose unnecessary restrictions by not allowing the destination to
+// overlap with the first (wide) operand.
+let RVVConstraint = WidenW in {
+defm VFWADD_W : VALU_FV_V_F<"vfwadd", 0b110100, "w">;
+defm VFWSUB_W : VALU_FV_V_F<"vfwsub", 0b110110, "w">;
+} // RVVConstraint = WidenW
+} // Constraints = "@earlyclobber $vd"
+
+// Vector Single-Width Floating-Point Multiply/Divide Instructions
+defm VFMUL_V : VALU_FV_V_F<"vfmul", 0b100100>;
+defm VFDIV_V : VALU_FV_V_F<"vfdiv", 0b100000>;
+defm VFRDIV_V : VALU_FV_F<"vfrdiv", 0b100001>;
+
+// Vector Widening Floating-Point Multiply
+let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
+defm VFWMUL_V : VALU_FV_V_F<"vfwmul", 0b111000>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
+
+// Vector Single-Width Floating-Point Fused Multiply-Add Instructions
+defm VFMACC_V : VALUr_FV_V_F<"vfmacc", 0b101100>;
+defm VFNMACC_V : VALUr_FV_V_F<"vfnmacc", 0b101101>;
+defm VFMSAC_V : VALUr_FV_V_F<"vfmsac", 0b101110>;
+defm VFNMSAC_V : VALUr_FV_V_F<"vfnmsac", 0b101111>;
+defm VFMADD_V : VALUr_FV_V_F<"vfmadd", 0b101000>;
+defm VFNMADD_V : VALUr_FV_V_F<"vfnmadd", 0b101001>;
+defm VFMSUB_V : VALUr_FV_V_F<"vfmsub", 0b101010>;
+defm VFNMSUB_V : VALUr_FV_V_F<"vfnmsub", 0b101011>;
+
+// Vector Widening Floating-Point Fused Multiply-Add Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = WidenV in {
+defm VFWMACC_V : VALUr_FV_V_F<"vfwmacc", 0b111100>;
+defm VFWNMACC_V : VALUr_FV_V_F<"vfwnmacc", 0b111101>;
+defm VFWMSAC_V : VALUr_FV_V_F<"vfwmsac", 0b111110>;
+defm VFWNMSAC_V : VALUr_FV_V_F<"vfwnmsac", 0b111111>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenV
+
+// Vector Floating-Point Square-Root Instruction
+defm VFSQRT_V : VALU_FV_VS2<"vfsqrt.v", 0b100011, 0b00000>;
+
+// Vector Floating-Point MIN/MAX Instructions
+defm VFMIN_V : VALU_FV_V_F<"vfmin", 0b000100>;
+defm VFMAX_V : VALU_FV_V_F<"vfmax", 0b000110>;
+
+// Vector Floating-Point Sign-Injection Instructions
+defm VFSGNJ_V : VALU_FV_V_F<"vfsgnj", 0b001000>;
+defm VFSGNJN_V : VALU_FV_V_F<"vfsgnjn", 0b001001>;
+defm VFSGNJX_V : VALU_FV_V_F<"vfsgnjx", 0b001010>;
+
+// Vector Floating-Point Compare Instructions
+defm VMFEQ_V : VALU_FV_V_F<"vmfeq", 0b011000>;
+defm VMFNE_V : VALU_FV_V_F<"vmfne", 0b011100>;
+defm VMFLT_V : VALU_FV_V_F<"vmflt", 0b011011>;
+defm VMFLE_V : VALU_FV_V_F<"vmfle", 0b011001>;
+defm VMFGT_V : VALU_FV_F<"vmfgt", 0b011101>;
+defm VMFGE_V : VALU_FV_F<"vmfge", 0b011111>;
+
+def : InstAlias<"vmfgt.vv $vd, $va, $vb$vm",
+ (VMFLT_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+def : InstAlias<"vmfge.vv $vd, $va, $vb$vm",
+ (VMFLE_VV VRegOp:$vd, VRegOp:$vb, VRegOp:$va, VMaskOp:$vm), 0>;
+
+// Vector Floating-Point Classify Instruction
+defm VFCLASS_V : VALU_FV_VS2<"vfclass.v", 0b100011, 0b10000>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// Vector Floating-Point Merge Instruction
+def VFMERGE_VFM : RVInstVX<0b010111, OPFVF, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2, FPR32:$rs1, VMV0:$v0),
+ "vfmerge.vfm", "$vd, $vs2, $rs1, v0"> {
+ let vm = 0;
+}
+
+// Vector Floating-Point Move Instruction
+def VFMV_V_F : RVInstVX<0b010111, OPFVF, (outs VRegOp:$vd),
+ (ins FPR32:$rs1), "vfmv.v.f", "$vd, $rs1"> {
+ let vs2 = 0;
+ let vm = 1;
+}
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
+// Single-Width Floating-Point/Integer Type-Convert Instructions
+defm VFCVT_XU_F_V : VALU_FV_VS2<"vfcvt.xu.f.v", 0b100010, 0b00000>;
+defm VFCVT_X_F_V : VALU_FV_VS2<"vfcvt.x.f.v", 0b100010, 0b00001>;
+defm VFCVT_F_XU_V : VALU_FV_VS2<"vfcvt.f.xu.v", 0b100010, 0b00010>;
+defm VFCVT_F_X_V : VALU_FV_VS2<"vfcvt.f.x.v", 0b100010, 0b00011>;
+
+// Widening Floating-Point/Integer Type-Convert Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt in {
+defm VFWCVT_XU_F_V : VALU_FV_VS2<"vfwcvt.xu.f.v", 0b100010, 0b01000>;
+defm VFWCVT_X_F_V : VALU_FV_VS2<"vfwcvt.x.f.v", 0b100010, 0b01001>;
+defm VFWCVT_F_XU_V : VALU_FV_VS2<"vfwcvt.f.xu.v", 0b100010, 0b01010>;
+defm VFWCVT_F_X_V : VALU_FV_VS2<"vfwcvt.f.x.v", 0b100010, 0b01011>;
+defm VFWCVT_F_F_V : VALU_FV_VS2<"vfwcvt.f.f.v", 0b100010, 0b01100>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = WidenCvt
+
+// Narrowing Floating-Point/Integer Type-Convert Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = Narrow in {
+defm VFNCVT_XU_F_W : VALU_FV_VS2<"vfncvt.xu.f.w", 0b100010, 0b10000>;
+defm VFNCVT_X_F_W : VALU_FV_VS2<"vfncvt.x.f.w", 0b100010, 0b10001>;
+defm VFNCVT_F_XU_W : VALU_FV_VS2<"vfncvt.f.xu.w", 0b100010, 0b10010>;
+defm VFNCVT_F_X_W : VALU_FV_VS2<"vfncvt.f.x.w", 0b100010, 0b10011>;
+defm VFNCVT_F_F_W : VALU_FV_VS2<"vfncvt.f.f.w", 0b100010, 0b10100>;
+defm VFNCVT_ROD_F_F_W : VALU_FV_VS2<"vfncvt.rod.f.f.w", 0b100010, 0b10101>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Narrow
+
+// Vector Single-Width Integer Reduction Instructions
+defm VREDSUM : VALU_MV_V<"vredsum", 0b000000>;
+defm VREDMAXU : VALU_MV_V<"vredmaxu", 0b000110>;
+defm VREDMAX : VALU_MV_V<"vredmax", 0b000111>;
+defm VREDMINU : VALU_MV_V<"vredminu", 0b000100>;
+defm VREDMIN : VALU_MV_V<"vredmin", 0b000101>;
+defm VREDAND : VALU_MV_V<"vredand", 0b000001>;
+defm VREDOR : VALU_MV_V<"vredor", 0b000010>;
+defm VREDXOR : VALU_MV_V<"vredxor", 0b000011>;
+
+// Vector Widening Integer Reduction Instructions
+let Constraints = "@earlyclobber $vd" in {
+// Set earlyclobber for following instructions for second and mask operands.
+// This has the downside that the earlyclobber constraint is too coarse and
+// will impose unnecessary restrictions by not allowing the destination to
+// overlap with the first (wide) operand.
+defm VWREDSUMU : VALU_IV_V<"vwredsumu", 0b110000>;
+defm VWREDSUM : VALU_IV_V<"vwredsum", 0b110001>;
+} // Constraints = "@earlyclobber $vd"
+
+// Vector Single-Width Floating-Point Reduction Instructions
+defm VFREDOSUM : VALU_FV_V<"vfredosum", 0b000011>;
+defm VFREDSUM : VALU_FV_V<"vfredsum", 0b000001>;
+defm VFREDMAX : VALU_FV_V<"vfredmax", 0b000111>;
+defm VFREDMIN : VALU_FV_V<"vfredmin", 0b000101>;
+
+// Vector Widening Floating-Point Reduction Instructions
+let Constraints = "@earlyclobber $vd" in {
+// Set earlyclobber for following instructions for second and mask operands.
+// This has the downside that the earlyclobber constraint is too coarse and
+// will impose unnecessary restrictions by not allowing the destination to
+// overlap with the first (wide) operand.
+defm VFWREDOSUM : VALU_FV_V<"vfwredosum", 0b110011>;
+defm VFWREDSUM : VALU_FV_V<"vfwredsum", 0b110001>;
+} // Constraints = "@earlyclobber $vd"
+
+// Vector Mask-Register Logical Instructions
+defm VMAND_M : VALU_MV_Mask<"vmand", 0b011001, "m">;
+defm VMNAND_M : VALU_MV_Mask<"vmnand", 0b011101, "m">;
+defm VMANDNOT_M : VALU_MV_Mask<"vmandnot", 0b011000, "m">;
+defm VMXOR_M : VALU_MV_Mask<"vmxor", 0b011011, "m">;
+defm VMOR_M : VALU_MV_Mask<"vmor", 0b011010, "m">;
+defm VMNOR_M : VALU_MV_Mask<"vmnor", 0b011110, "m">;
+defm VMORNOT_M : VALU_MV_Mask<"vmornot", 0b011100, "m">;
+defm VMXNOR_M : VALU_MV_Mask<"vmxnor", 0b011111, "m">;
+
+def : InstAlias<"vmcpy.m $vd, $vs",
+ (VMAND_MM VRegOp:$vd, VRegOp:$vs, VRegOp:$vs)>;
+def : InstAlias<"vmclr.m $vd",
+ (VMXOR_MM VRegOp:$vd, VRegOp:$vd, VRegOp:$vd)>;
+def : InstAlias<"vmset.m $vd",
+ (VMXNOR_MM VRegOp:$vd, VRegOp:$vd, VRegOp:$vd)>;
+def : InstAlias<"vmnot.m $vd, $vs",
+ (VMNAND_MM VRegOp:$vd, VRegOp:$vs, VRegOp:$vs)>;
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+// Vector mask population count vpopc
+def VPOPC_M : RVInstV<0b010000, 0b10000, OPMVV, (outs GPR:$vd),
+ (ins VRegOp:$vs2, VMaskOp:$vm),
+ "vpopc.m", "$vd, $vs2$vm">;
+
+// vfirst find-first-set mask bit
+def VFIRST_M : RVInstV<0b010000, 0b10001, OPMVV, (outs GPR:$vd),
+ (ins VRegOp:$vs2, VMaskOp:$vm),
+ "vfirst.m", "$vd, $vs2$vm">;
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
+// vmsbf.m set-before-first mask bit
+defm VMSBF_M : VALU_MV_VS2<"vmsbf.m", 0b010100, 0b00001>;
+
+// vmsif.m set-including-first mask bit
+defm VMSIF_M : VALU_MV_VS2<"vmsif.m", 0b010100, 0b00011>;
+
+// vmsof.m set-only-first mask bit
+defm VMSOF_M : VALU_MV_VS2<"vmsof.m", 0b010100, 0b00010>;
+
+// Vector Iota Instruction
+let Constraints = "@earlyclobber $vd", RVVConstraint = Iota in {
+defm VIOTA_M : VALU_MV_VS2<"viota.m", 0b010100, 0b10000>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Iota
+
+// Vector Element Index Instruction
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+def VID_V : RVInstV<0b010100, 0b10001, OPMVV, (outs VRegOp:$vd),
+ (ins VMaskOp:$vm), "vid.v", "$vd$vm"> {
+ let vs2 = 0;
+}
+
+// Integer Scalar Move Instructions
+let vm = 1 in {
+def VMV_X_S : RVInstV<0b010000, 0b00000, OPMVV, (outs GPR:$vd),
+ (ins VRegOp:$vs2), "vmv.x.s", "$vd, $vs2">;
+def VMV_S_X : RVInstV2<0b010000, 0b00000, OPMVX, (outs VRegOp:$vd),
+ (ins GPR:$rs1), "vmv.s.x", "$vd, $rs1">;
+
+}
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1 in {
+// Floating-Point Scalar Move Instructions
+def VFMV_F_S : RVInstV<0b010000, 0b00000, OPFVV, (outs FPR32:$vd),
+ (ins VRegOp:$vs2), "vfmv.f.s", "$vd, $vs2">;
+def VFMV_S_F : RVInstV2<0b010000, 0b00000, OPFVF, (outs VRegOp:$vd),
+ (ins FPR32:$rs1), "vfmv.s.f", "$vd, $rs1">;
+
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0, vm = 1
+
+// Vector Slide Instructions
+let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
+defm VSLIDEUP_V : VALU_IV_X_I<"vslideup", 0b001110, uimm5>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
+defm VSLIDEDOWN_V : VALU_IV_X_I<"vslidedown", 0b001111, uimm5>;
+
+let Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp in {
+defm VSLIDE1UP_V : VALU_MV_X<"vslide1up", 0b001110>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = SlideUp
+defm VSLIDE1DOWN_V : VALU_MV_X<"vslide1down", 0b001111>;
+
+// Vector Register Gather Instruction
+let Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather in {
+defm VRGATHER_V : VALU_IV_V_X_I<"vrgather", 0b001100, uimm5>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Vrgather
+
+// Vector Compress Instruction
+let Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress in {
+defm VCOMPRESS_V : VALU_MV_Mask<"vcompress", 0b010111>;
+} // Constraints = "@earlyclobber $vd", RVVConstraint = Vcompress
+
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in {
+foreach nf = [1, 2, 4, 8] in {
+ def VMV#nf#R_V : RVInstV<0b100111, !add(nf, -1), OPIVI, (outs VRegOp:$vd),
+ (ins VRegOp:$vs2), "vmv" # nf # "r.v",
+ "$vd, $vs2"> {
+ let Uses = [];
+ let vm = 1;
+ }
+}
+} // hasSideEffects = 0, mayLoad = 0, mayStore = 0
+} // Predicates = [HasStdExtV]
diff --git a/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp b/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
index 5bd09a546114..4d1f47da209d 100644
--- a/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
+++ b/llvm/lib/Target/RISCV/RISCVInstructionSelector.cpp
@@ -16,6 +16,7 @@
#include "RISCVTargetMachine.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
#include "llvm/CodeGen/GlobalISel/InstructionSelectorImpl.h"
+#include "llvm/IR/IntrinsicsRISCV.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "riscv-isel"
diff --git a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
index 585bff2bc20a..c379a8d8f0d6 100644
--- a/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVMachineFunctionInfo.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_RISCV_RISCVMACHINEFUNCTIONINFO_H
+#include "RISCVSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -22,7 +23,6 @@ namespace llvm {
/// and contains private RISCV-specific information for each MachineFunction.
class RISCVMachineFunctionInfo : public MachineFunctionInfo {
private:
- MachineFunction &MF;
/// FrameIndex for start of varargs area
int VarArgsFrameIndex = 0;
/// Size of the save area used for varargs
@@ -30,9 +30,11 @@ private:
/// FrameIndex used for transferring values between 64-bit FPRs and a pair
/// of 32-bit GPRs via the stack.
int MoveF64FrameIndex = -1;
+ /// Size of any opaque stack adjustment due to save/restore libcalls.
+ unsigned LibCallStackSize = 0;
public:
- RISCVMachineFunctionInfo(MachineFunction &MF) : MF(MF) {}
+ RISCVMachineFunctionInfo(const MachineFunction &MF) {}
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Index) { VarArgsFrameIndex = Index; }
@@ -40,11 +42,22 @@ public:
unsigned getVarArgsSaveSize() const { return VarArgsSaveSize; }
void setVarArgsSaveSize(int Size) { VarArgsSaveSize = Size; }
- int getMoveF64FrameIndex() {
+ int getMoveF64FrameIndex(MachineFunction &MF) {
if (MoveF64FrameIndex == -1)
- MoveF64FrameIndex = MF.getFrameInfo().CreateStackObject(8, 8, false);
+ MoveF64FrameIndex =
+ MF.getFrameInfo().CreateStackObject(8, Align(8), false);
return MoveF64FrameIndex;
}
+
+ unsigned getLibCallStackSize() const { return LibCallStackSize; }
+ void setLibCallStackSize(unsigned Size) { LibCallStackSize = Size; }
+
+ bool useSaveRestoreLibCalls(const MachineFunction &MF) const {
+ // We cannot use fixed locations for the callee saved spill slots if the
+ // function uses a varargs save area.
+ return MF.getSubtarget<RISCVSubtarget>().enableSaveRestore() &&
+ VarArgsSaveSize == 0 && !MF.getFrameInfo().hasTailCall();
+ }
};
} // end namespace llvm
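A minimal sketch of how a client might consult the new hook (shown here only as
an illustration; the corresponding RISCVFrameLowering changes are not part of
this hunk):

  // Assumes #include "RISCVMachineFunctionInfo.h" and the llvm namespace.
  static bool shouldUseSaveRestoreLibCalls(const MachineFunction &MF) {
    const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
    // False whenever save/restore is disabled on the subtarget, a varargs
    // save area exists, or the function contains a tail call (see above).
    return RVFI->useSaveRestoreLibCalls(MF);
  }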
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
index 1d41994ef1e3..cb7d55eb0f0c 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.cpp
@@ -12,6 +12,7 @@
#include "RISCVRegisterInfo.h"
#include "RISCV.h"
+#include "RISCVMachineFunctionInfo.h"
#include "RISCVSubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -34,6 +35,8 @@ static_assert(RISCV::F31_F == RISCV::F0_F + 31,
static_assert(RISCV::F1_D == RISCV::F0_D + 1, "Register list not consecutive");
static_assert(RISCV::F31_D == RISCV::F0_D + 31,
"Register list not consecutive");
+static_assert(RISCV::V1 == RISCV::V0 + 1, "Register list not consecutive");
+static_assert(RISCV::V31 == RISCV::V0 + 31, "Register list not consecutive");
RISCVRegisterInfo::RISCVRegisterInfo(unsigned HwMode)
: RISCVGenRegisterInfo(RISCV::X1, /*DwarfFlavour*/0, /*EHFlavor*/0,
@@ -91,11 +94,11 @@ BitVector RISCVRegisterInfo::getReservedRegs(const MachineFunction &MF) const {
}
bool RISCVRegisterInfo::isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const {
+ MCRegister PhysReg) const {
return !MF.getSubtarget<RISCVSubtarget>().isRegisterReservedByUser(PhysReg);
}
-bool RISCVRegisterInfo::isConstantPhysReg(unsigned PhysReg) const {
+bool RISCVRegisterInfo::isConstantPhysReg(MCRegister PhysReg) const {
return PhysReg == RISCV::X0;
}
@@ -103,6 +106,39 @@ const uint32_t *RISCVRegisterInfo::getNoPreservedMask() const {
return CSR_NoRegs_RegMask;
}
+// Frame indexes representing locations of CSRs which are given a fixed location
+// by save/restore libcalls.
+static const std::map<unsigned, int> FixedCSRFIMap = {
+ {/*ra*/ RISCV::X1, -1},
+ {/*s0*/ RISCV::X8, -2},
+ {/*s1*/ RISCV::X9, -3},
+ {/*s2*/ RISCV::X18, -4},
+ {/*s3*/ RISCV::X19, -5},
+ {/*s4*/ RISCV::X20, -6},
+ {/*s5*/ RISCV::X21, -7},
+ {/*s6*/ RISCV::X22, -8},
+ {/*s7*/ RISCV::X23, -9},
+ {/*s8*/ RISCV::X24, -10},
+ {/*s9*/ RISCV::X25, -11},
+ {/*s10*/ RISCV::X26, -12},
+ {/*s11*/ RISCV::X27, -13}
+};
+
+bool RISCVRegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
+ Register Reg,
+ int &FrameIdx) const {
+ const auto *RVFI = MF.getInfo<RISCVMachineFunctionInfo>();
+ if (!RVFI->useSaveRestoreLibCalls(MF))
+ return false;
+
+ auto FII = FixedCSRFIMap.find(Reg);
+ if (FII == FixedCSRFIMap.end())
+ return false;
+
+ FrameIdx = FII->second;
+ return true;
+}
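For context, the target-independent prologue/epilogue inserter is the expected
caller of this hook when assigning callee-saved spill slots; a simplified
fragment of that logic (not the exact upstream code; TRI, Reg, CS, MFI, Size
and Alignment stand for the surrounding loop's locals):

  int FrameIdx;
  if (TRI->hasReservedSpillSlot(MF, Reg, FrameIdx))
    CS.setFrameIdx(FrameIdx); // reuse the fixed (negative) index from the map above
  else
    CS.setFrameIdx(MFI.CreateStackObject(Size, Alignment, /*isSpillSlot=*/true));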
+
void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
int SPAdj, unsigned FIOperandNum,
RegScavenger *RS) const {
@@ -115,7 +151,7 @@ void RISCVRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
DebugLoc DL = MI.getDebugLoc();
int FrameIndex = MI.getOperand(FIOperandNum).getIndex();
- unsigned FrameReg;
+ Register FrameReg;
int Offset =
getFrameLowering(MF)->getFrameIndexReference(MF, FrameIndex, FrameReg) +
MI.getOperand(FIOperandNum + 1).getImm();
@@ -156,13 +192,6 @@ const uint32_t *
RISCVRegisterInfo::getCallPreservedMask(const MachineFunction & MF,
CallingConv::ID /*CC*/) const {
auto &Subtarget = MF.getSubtarget<RISCVSubtarget>();
- if (MF.getFunction().hasFnAttribute("interrupt")) {
- if (Subtarget.hasStdExtD())
- return CSR_XLEN_F64_Interrupt_RegMask;
- if (Subtarget.hasStdExtF())
- return CSR_XLEN_F32_Interrupt_RegMask;
- return CSR_Interrupt_RegMask;
- }
switch (Subtarget.getTargetABI()) {
default:
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
index 30b639517fde..ffbb60abf755 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.h
@@ -31,12 +31,15 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
BitVector getReservedRegs(const MachineFunction &MF) const override;
bool isAsmClobberable(const MachineFunction &MF,
- unsigned PhysReg) const override;
+ MCRegister PhysReg) const override;
- bool isConstantPhysReg(unsigned PhysReg) const override;
+ bool isConstantPhysReg(MCRegister PhysReg) const override;
const uint32_t *getNoPreservedMask() const override;
+ bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
+ int &FrameIdx) const override;
+
void eliminateFrameIndex(MachineBasicBlock::iterator MI, int SPAdj,
unsigned FIOperandNum,
RegScavenger *RS = nullptr) const override;
@@ -51,10 +54,6 @@ struct RISCVRegisterInfo : public RISCVGenRegisterInfo {
return true;
}
- bool trackLivenessAfterRegAlloc(const MachineFunction &) const override {
- return true;
- }
-
const TargetRegisterClass *
getPointerRegClass(const MachineFunction &MF,
unsigned Kind = 0) const override {
diff --git a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
index 82b37afd0805..7544b4b3b845 100644
--- a/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
+++ b/llvm/lib/Target/RISCV/RISCVRegisterInfo.td
@@ -33,7 +33,21 @@ class RISCVReg64<RISCVReg32 subreg> : Register<""> {
let AltNames = subreg.AltNames;
}
+class RISCVRegWithSubRegs<bits<5> Enc, string n, list<Register> subregs,
+ list<string> alt = []>
+ : RegisterWithSubRegs<n, subregs> {
+ let HWEncoding{4-0} = Enc;
+ let AltNames = alt;
+}
+
def ABIRegAltName : RegAltNameIndex;
+
+def sub_vrm2 : SubRegIndex<64, -1>;
+def sub_vrm2_hi : SubRegIndex<64, -1>;
+def sub_vrm4 : SubRegIndex<128, -1>;
+def sub_vrm4_hi : SubRegIndex<128, -1>;
+def sub_vrm8 : SubRegIndex<256, -1>;
+def sub_vrm8_hi : SubRegIndex<256, -1>;
} // Namespace = "RISCV"
// Integer registers
@@ -233,3 +247,88 @@ def FPR64C : RegisterClass<"RISCV", [f64], 64, (add
(sequence "F%u_D", 10, 15),
(sequence "F%u_D", 8, 9)
)>;
+
+// Vector registers
+let RegAltNameIndices = [ABIRegAltName] in {
+ foreach Index = 0-31 in {
+ def V#Index : RISCVReg<Index, "v"#Index, ["v"#Index]>, DwarfRegNum<[!add(Index, 64)]>;
+ }
+
+ foreach Index = [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22,
+ 24, 26, 28, 30] in {
+ def V#Index#M2 : RISCVRegWithSubRegs<Index, "v"#Index,
+ [!cast<Register>("V"#Index),
+ !cast<Register>("V"#!add(Index, 1))],
+ ["v"#Index]>,
+ DwarfRegAlias<!cast<Register>("V"#Index)> {
+ let SubRegIndices = [sub_vrm2, sub_vrm2_hi];
+ }
+ }
+
+ foreach Index = [0, 4, 8, 12, 16, 20, 24, 28] in {
+ def V#Index#M4 : RISCVRegWithSubRegs<Index, "v"#Index,
+ [!cast<Register>("V"#Index#"M2"),
+ !cast<Register>("V"#!add(Index, 2)#"M2")],
+ ["v"#Index]>,
+ DwarfRegAlias<!cast<Register>("V"#Index)> {
+ let SubRegIndices = [sub_vrm4, sub_vrm4_hi];
+ }
+ }
+
+ foreach Index = [0, 8, 16, 24] in {
+ def V#Index#M8 : RISCVRegWithSubRegs<Index, "v"#Index,
+ [!cast<Register>("V"#Index#"M4"),
+ !cast<Register>("V"#!add(Index, 4)#"M4")],
+ ["v"#Index]>,
+ DwarfRegAlias<!cast<Register>("V"#Index)> {
+ let SubRegIndices = [sub_vrm8, sub_vrm8_hi];
+ }
+ }
+
+ def VTYPE : RISCVReg<0, "vtype", ["vtype"]>;
+ def VL : RISCVReg<0, "vl", ["vl"]>;
+}
+
+class RegisterTypes<list<ValueType> reg_types> {
+ list<ValueType> types = reg_types;
+}
+
+// The order of registers represents the preferred allocation sequence,
+// meaning caller-save regs are listed before callee-save.
+def VR : RegisterClass<"RISCV", [nxv8i8, nxv4i16, nxv2i32, nxv1i64],
+ 64, (add
+ (sequence "V%u", 25, 31),
+ (sequence "V%u", 8, 24),
+ (sequence "V%u", 0, 7)
+ )> {
+ let Size = 64;
+}
+
+def VRM2 : RegisterClass<"RISCV", [nxv16i8, nxv8i16, nxv4i32, nxv2i64], 64,
+ (add V26M2, V28M2, V30M2, V8M2, V10M2, V12M2, V14M2, V16M2,
+ V18M2, V20M2, V22M2, V24M2, V0M2, V2M2, V4M2, V6M2)> {
+ let Size = 128;
+}
+
+def VRM4 : RegisterClass<"RISCV", [nxv32i8, nxv16i16, nxv8i32, nxv4i64], 64,
+ (add V28M4, V8M4, V12M4, V16M4, V20M4, V24M4, V0M4, V4M4)> {
+ let Size = 256;
+}
+
+def VRM8 : RegisterClass<"RISCV", [nxv32i16, nxv16i32, nxv8i64], 64,
+ (add V8M8, V16M8, V24M8, V0M8)> {
+ let Size = 512;
+}
+
+def VMaskVT : RegisterTypes<[nxv1i1, nxv2i1, nxv4i1, nxv8i1, nxv16i1, nxv32i1]>;
+
+def VM : RegisterClass<"RISCV", VMaskVT.types, 64, (add
+ (sequence "V%u", 25, 31),
+ (sequence "V%u", 8, 24),
+ (sequence "V%u", 0, 7))> {
+ let Size = 64;
+}
+
+def VMV0 : RegisterClass<"RISCV", VMaskVT.types, 64, (add V0)> {
+ let Size = 64;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket32.td b/llvm/lib/Target/RISCV/RISCVSchedRocket32.td
new file mode 100644
index 000000000000..305e2b9b5927
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedRocket32.td
@@ -0,0 +1,227 @@
+//==- RISCVSchedRocket32.td - Rocket Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedule.h for details.
+
+// Rocket machine model for scheduling and other instruction cost heuristics.
+def Rocket32Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Explicitly set to zero since Rocket is in-order.
+ let IssueWidth = 1; // 1 micro-op is dispatched per cycle.
+ let LoadLatency = 3;
+ let MispredictPenalty = 3;
+ let CompleteModel = 1;
+ let UnsupportedFeatures = [HasStdExtV];
+}
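// For reference, a SchedMachineModel takes effect once a processor definition
// binds to it, e.g. (sketch; the CPU name and feature list here are assumed):
//   def : ProcessorModel<"rocket-rv32", Rocket32Model, []>;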
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Model each pipeline as a ProcResource, using BufferSize = 0 since Rocket
+// is in-order.
+
+let BufferSize = 0 in {
+def Rocket32UnitALU : ProcResource<1>; // Int ALU
+def Rocket32UnitIMul : ProcResource<1>; // Int Multiply
+def Rocket32UnitMem : ProcResource<1>; // Load/Store
+def Rocket32UnitB : ProcResource<1>; // Branch
+
+def Rocket32UnitFPALU : ProcResource<1>; // FP ALU
+}
+
+let BufferSize = 1 in {
+def Rocket32UnitIDiv : ProcResource<1>; // Int Division
+def Rocket32UnitFPDivSqrt : ProcResource<1>; // FP Divide/Sqrt
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map to the ProcResources
+// and set the latency.
+
+let SchedModel = Rocket32Model in {
+
+def : WriteRes<WriteJmp, [Rocket32UnitB]>;
+def : WriteRes<WriteJal, [Rocket32UnitB]>;
+def : WriteRes<WriteJalr, [Rocket32UnitB]>;
+def : WriteRes<WriteJmpReg, [Rocket32UnitB]>;
+
+def : WriteRes<WriteIALU, [Rocket32UnitALU]>;
+def : WriteRes<WriteShift, [Rocket32UnitALU]>;
+
+// Multiplies on Rocket differ by implementation; this is a placeholder until
+// we can determine how to read the chosen configuration from the command line.
+def : WriteRes<WriteIMul, [Rocket32UnitIMul]> { let Latency = 4; }
+
+// 32-bit divides have a worst-case latency of 34 cycles.
+def : WriteRes<WriteIDiv, [Rocket32UnitIDiv]> {
+ let Latency = 34;
+ let ResourceCycles = [34];
+}
+
+// Memory
+def : WriteRes<WriteSTB, [Rocket32UnitMem]>;
+def : WriteRes<WriteSTH, [Rocket32UnitMem]>;
+def : WriteRes<WriteSTW, [Rocket32UnitMem]>;
+def : WriteRes<WriteFST32, [Rocket32UnitMem]>;
+def : WriteRes<WriteFST64, [Rocket32UnitMem]>;
+
+let Latency = 3 in {
+def : WriteRes<WriteLDB, [Rocket32UnitMem]>;
+def : WriteRes<WriteLDH, [Rocket32UnitMem]>;
+def : WriteRes<WriteCSR, [Rocket32UnitALU]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteLDW, [Rocket32UnitMem]>;
+def : WriteRes<WriteFLD32, [Rocket32UnitMem]>;
+def : WriteRes<WriteFLD64, [Rocket32UnitMem]>;
+
+def : WriteRes<WriteAtomicW, [Rocket32UnitMem]>;
+def : WriteRes<WriteAtomicLDW, [Rocket32UnitMem]>;
+}
+
+def : WriteRes<WriteAtomicSTW, [Rocket32UnitMem]>;
+
+// Most FP single precision operations are 4 cycles
+let Latency = 4 in {
+def : WriteRes<WriteFALU32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFSGNJ32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMinMax32, [Rocket32UnitFPALU]>;
+}
+
+// Most FP double precision operations are 6 cycles
+let Latency = 6 in {
+def : WriteRes<WriteFALU64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFSGNJ64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMinMax64, [Rocket32UnitFPALU]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteFCvtI32ToF32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCvtI32ToF64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToI32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToI32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToF64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToF32, [Rocket32UnitFPALU]>;
+
+def : WriteRes<WriteFClass32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFClass64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCmp32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFCmp64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMovF32ToI32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMovI32ToF32, [Rocket32UnitFPALU]>;
+}
+
+let Latency = 5 in {
+def : WriteRes<WriteFMul32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMulAdd32, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMulSub32, [Rocket32UnitFPALU]>;
+}
+
+let Latency = 7 in {
+def : WriteRes<WriteFMul64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMulAdd64, [Rocket32UnitFPALU]>;
+def : WriteRes<WriteFMulSub64, [Rocket32UnitFPALU]>;
+}
+
+// FP Divide unit on Rocket is not pipelined, so set resource cycles to latency
+let Latency = 20, ResourceCycles = [20] in {
+def : WriteRes<WriteFDiv32, [Rocket32UnitFPDivSqrt]>;
+def : WriteRes<WriteFDiv64, [Rocket32UnitFPDivSqrt]>;
+}
+
+// FP Sqrt unit on Rocket is not pipelined, so set resource cycles to latency
+def : WriteRes<WriteFSqrt32, [Rocket32UnitFPDivSqrt]> { let Latency = 20;
+ let ResourceCycles = [20];}
+def : WriteRes<WriteFSqrt64, [Rocket32UnitFPDivSqrt]> { let Latency = 25;
+ let ResourceCycles = [25];}
+
+def : WriteRes<WriteNop, []>;
+
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
+let Unsupported = 1 in {
+def : WriteRes<WriteIALU32, []>;
+def : WriteRes<WriteShift32, []>;
+def : WriteRes<WriteIMul32, []>;
+def : WriteRes<WriteIDiv32, []>;
+def : WriteRes<WriteSTD, []>;
+def : WriteRes<WriteLDWU, []>;
+def : WriteRes<WriteLDD, []>;
+def : WriteRes<WriteAtomicD, []>;
+def : WriteRes<WriteAtomicLDD, []>;
+def : WriteRes<WriteAtomicSTD, []>;
+def : WriteRes<WriteFCvtI64ToF32, []>;
+def : WriteRes<WriteFCvtI64ToF64, []>;
+def : WriteRes<WriteFCvtF64ToI64, []>;
+def : WriteRes<WriteFCvtF32ToI64, []>;
+def : WriteRes<WriteFMovI64ToF64, []>;
+def : WriteRes<WriteFMovF64ToI64, []>;
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types with cycles.
+// Dummy definitions for RocketCore.
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShift, 0>;
+def : ReadAdvance<ReadShift32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFALU32, 0>;
+def : ReadAdvance<ReadFALU64, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMulAdd32, 0>;
+def : ReadAdvance<ReadFMulSub32, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMulAdd64, 0>;
+def : ReadAdvance<ReadFMulSub64, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedRocket64.td b/llvm/lib/Target/RISCV/RISCVSchedRocket64.td
new file mode 100644
index 000000000000..e8514a275c45
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedRocket64.td
@@ -0,0 +1,228 @@
+//==- RISCVSchedRocket64.td - Rocket Scheduling Definitions -*- tablegen -*-=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// ===---------------------------------------------------------------------===//
+// The following definitions describe the simpler per-operand machine model.
+// This works with MachineScheduler. See MCSchedule.h for details.
+
+// Rocket machine model for scheduling and other instruction cost heuristics.
+def Rocket64Model : SchedMachineModel {
+ let MicroOpBufferSize = 0; // Explicitly set to zero since Rocket is in-order.
+ let IssueWidth = 1; // 1 micro-op is dispatched per cycle.
+ let LoadLatency = 3;
+ let MispredictPenalty = 3;
+ let UnsupportedFeatures = [HasStdExtV];
+}
+
+//===----------------------------------------------------------------------===//
+// Define each kind of processor resource and number available.
+
+// Model each pipeline as a ProcResource, using BufferSize = 0 since Rocket
+// is in-order.
+
+let BufferSize = 0 in {
+def Rocket64UnitALU : ProcResource<1>; // Int ALU
+def Rocket64UnitIMul : ProcResource<1>; // Int Multiply
+def Rocket64UnitMem : ProcResource<1>; // Load/Store
+def Rocket64UnitB : ProcResource<1>; // Branch
+
+def Rocket64UnitFPALU : ProcResource<1>; // FP ALU
+}
+
+let BufferSize = 1 in {
+def Rocket64UnitIDiv : ProcResource<1>; // Int Division
+def Rocket64UnitFPDivSqrt : ProcResource<1>; // FP Divide/Sqrt
+}
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedWrite types which both map to the ProcResources
+// and set the latency.
+
+let SchedModel = Rocket64Model in {
+
+def : WriteRes<WriteJmp, [Rocket64UnitB]>;
+def : WriteRes<WriteJal, [Rocket64UnitB]>;
+def : WriteRes<WriteJalr, [Rocket64UnitB]>;
+def : WriteRes<WriteJmpReg, [Rocket64UnitB]>;
+
+def : WriteRes<WriteIALU32, [Rocket64UnitALU]>;
+def : WriteRes<WriteIALU, [Rocket64UnitALU]>;
+def : WriteRes<WriteShift32, [Rocket64UnitALU]>;
+def : WriteRes<WriteShift, [Rocket64UnitALU]>;
+
+let Latency = 4 in {
+def : WriteRes<WriteIMul, [Rocket64UnitIMul]>;
+def : WriteRes<WriteIMul32, [Rocket64UnitIMul]>;
+}
+
+// Integer divide varies based on operand magnitude and sign; worst-case latency is 34 cycles.
+def : WriteRes<WriteIDiv32, [Rocket64UnitIDiv]> {
+ let Latency = 34;
+ let ResourceCycles = [34];
+}
+def : WriteRes<WriteIDiv, [Rocket64UnitIDiv]> {
+ let Latency = 33;
+ let ResourceCycles = [33];
+}
+
+// Memory
+def : WriteRes<WriteSTB, [Rocket64UnitMem]>;
+def : WriteRes<WriteSTH, [Rocket64UnitMem]>;
+def : WriteRes<WriteSTW, [Rocket64UnitMem]>;
+def : WriteRes<WriteSTD, [Rocket64UnitMem]>;
+def : WriteRes<WriteFST32, [Rocket64UnitMem]>;
+def : WriteRes<WriteFST64, [Rocket64UnitMem]>;
+
+let Latency = 3 in {
+def : WriteRes<WriteLDB, [Rocket64UnitMem]>;
+def : WriteRes<WriteLDH, [Rocket64UnitMem]>;
+def : WriteRes<WriteCSR, [Rocket64UnitALU]>;
+}
+
+let Latency = 2 in {
+def : WriteRes<WriteLDW, [Rocket64UnitMem]>;
+def : WriteRes<WriteLDWU, [Rocket64UnitMem]>;
+def : WriteRes<WriteLDD, [Rocket64UnitMem]>;
+def : WriteRes<WriteFLD32, [Rocket64UnitMem]>;
+def : WriteRes<WriteFLD64, [Rocket64UnitMem]>;
+
+def : WriteRes<WriteAtomicW, [Rocket64UnitMem]>;
+def : WriteRes<WriteAtomicD, [Rocket64UnitMem]>;
+
+def : WriteRes<WriteAtomicLDW, [Rocket64UnitMem]>;
+def : WriteRes<WriteAtomicLDD, [Rocket64UnitMem]>;
+}
+
+def : WriteRes<WriteAtomicSTW, [Rocket64UnitMem]>;
+def : WriteRes<WriteAtomicSTD, [Rocket64UnitMem]>;
+
+// Most FP single precision operations are 4 cycles
+let Latency = 4 in {
+def : WriteRes<WriteFALU32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFSGNJ32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMinMax32, [Rocket64UnitFPALU]>;
+}
+
+let Latency = 6 in {
+// Most FP double precision operations are 6 cycles
+def : WriteRes<WriteFALU64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFSGNJ64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMinMax64, [Rocket64UnitFPALU]>;
+}
+
+// Conversion instructions
+let Latency = 2 in {
+def : WriteRes<WriteFCvtI32ToF32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtI32ToF64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtI64ToF32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtI64ToF64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToI32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToI64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToI32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToI64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtF32ToF64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCvtF64ToF32, [Rocket64UnitFPALU]>;
+
+def : WriteRes<WriteFClass32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFClass64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCmp32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFCmp64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMovF32ToI32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMovI32ToF32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMovF64ToI64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMovI64ToF64, [Rocket64UnitFPALU]>;
+}
+
+let Latency = 5 in {
+def : WriteRes<WriteFMul32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMulAdd32, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMulSub32, [Rocket64UnitFPALU]>;
+}
+
+let Latency = 7 in {
+def : WriteRes<WriteFMul64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMulAdd64, [Rocket64UnitFPALU]>;
+def : WriteRes<WriteFMulSub64, [Rocket64UnitFPALU]>;
+}
+
+// FP Divide unit on Rocket is not pipelined, so set resource cycles to latency
+let Latency = 20, ResourceCycles = [20] in {
+def : WriteRes<WriteFDiv32, [Rocket64UnitFPDivSqrt]>;
+def : WriteRes<WriteFDiv64, [Rocket64UnitFPDivSqrt]>;
+}
+
+// FP Sqrt unit on Rocket is not pipelined, so set resource cycles to latency
+def : WriteRes<WriteFSqrt32, [Rocket64UnitFPDivSqrt]> { let Latency = 20;
+ let ResourceCycles = [20]; }
+def : WriteRes<WriteFSqrt64, [Rocket64UnitFPDivSqrt]> { let Latency = 25;
+ let ResourceCycles = [25]; }
+
+def : WriteRes<WriteNop, []>;
+
+def : InstRW<[WriteIALU], (instrs COPY)>;
+
+//===----------------------------------------------------------------------===//
+// Subtarget-specific SchedRead types with cycles.
+// Dummy definitions for RocketCore.
+def : ReadAdvance<ReadJmp, 0>;
+def : ReadAdvance<ReadJalr, 0>;
+def : ReadAdvance<ReadCSR, 0>;
+def : ReadAdvance<ReadStoreData, 0>;
+def : ReadAdvance<ReadMemBase, 0>;
+def : ReadAdvance<ReadIALU, 0>;
+def : ReadAdvance<ReadIALU32, 0>;
+def : ReadAdvance<ReadShift, 0>;
+def : ReadAdvance<ReadShift32, 0>;
+def : ReadAdvance<ReadIDiv, 0>;
+def : ReadAdvance<ReadIDiv32, 0>;
+def : ReadAdvance<ReadIMul, 0>;
+def : ReadAdvance<ReadIMul32, 0>;
+def : ReadAdvance<ReadAtomicWA, 0>;
+def : ReadAdvance<ReadAtomicWD, 0>;
+def : ReadAdvance<ReadAtomicDA, 0>;
+def : ReadAdvance<ReadAtomicDD, 0>;
+def : ReadAdvance<ReadAtomicLDW, 0>;
+def : ReadAdvance<ReadAtomicLDD, 0>;
+def : ReadAdvance<ReadAtomicSTW, 0>;
+def : ReadAdvance<ReadAtomicSTD, 0>;
+def : ReadAdvance<ReadFMemBase, 0>;
+def : ReadAdvance<ReadFALU32, 0>;
+def : ReadAdvance<ReadFALU64, 0>;
+def : ReadAdvance<ReadFMul32, 0>;
+def : ReadAdvance<ReadFMulAdd32, 0>;
+def : ReadAdvance<ReadFMulSub32, 0>;
+def : ReadAdvance<ReadFMul64, 0>;
+def : ReadAdvance<ReadFMulAdd64, 0>;
+def : ReadAdvance<ReadFMulSub64, 0>;
+def : ReadAdvance<ReadFDiv32, 0>;
+def : ReadAdvance<ReadFDiv64, 0>;
+def : ReadAdvance<ReadFSqrt32, 0>;
+def : ReadAdvance<ReadFSqrt64, 0>;
+def : ReadAdvance<ReadFCmp32, 0>;
+def : ReadAdvance<ReadFCmp64, 0>;
+def : ReadAdvance<ReadFSGNJ32, 0>;
+def : ReadAdvance<ReadFSGNJ64, 0>;
+def : ReadAdvance<ReadFMinMax32, 0>;
+def : ReadAdvance<ReadFMinMax64, 0>;
+def : ReadAdvance<ReadFCvtF32ToI32, 0>;
+def : ReadAdvance<ReadFCvtF32ToI64, 0>;
+def : ReadAdvance<ReadFCvtF64ToI32, 0>;
+def : ReadAdvance<ReadFCvtF64ToI64, 0>;
+def : ReadAdvance<ReadFCvtI32ToF32, 0>;
+def : ReadAdvance<ReadFCvtI32ToF64, 0>;
+def : ReadAdvance<ReadFCvtI64ToF32, 0>;
+def : ReadAdvance<ReadFCvtI64ToF64, 0>;
+def : ReadAdvance<ReadFCvtF32ToF64, 0>;
+def : ReadAdvance<ReadFCvtF64ToF32, 0>;
+def : ReadAdvance<ReadFMovF32ToI32, 0>;
+def : ReadAdvance<ReadFMovI32ToF32, 0>;
+def : ReadAdvance<ReadFMovF64ToI64, 0>;
+def : ReadAdvance<ReadFMovI64ToF64, 0>;
+def : ReadAdvance<ReadFClass32, 0>;
+def : ReadAdvance<ReadFClass64, 0>;
+}
diff --git a/llvm/lib/Target/RISCV/RISCVSchedule.td b/llvm/lib/Target/RISCV/RISCVSchedule.td
new file mode 100644
index 000000000000..bbcd03d46236
--- /dev/null
+++ b/llvm/lib/Target/RISCV/RISCVSchedule.td
@@ -0,0 +1,147 @@
+//===-- RISCVSchedule.td - RISCV Scheduling Definitions -------*- tablegen -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+/// Define scheduler resources associated with def operands.
+def WriteIALU : SchedWrite; // 32 or 64-bit integer ALU operations
+def WriteIALU32 : SchedWrite; // 32-bit integer ALU operations on RV64I
+def WriteShift32 : SchedWrite; // 32-bit shift operations on RV64Ix
+def WriteShift : SchedWrite; // 32 or 64-bit shift operations
+def WriteIDiv : SchedWrite; // 32-bit or 64-bit divide and remainder
+def WriteIDiv32 : SchedWrite; // 32-bit divide and remainder on RV64I
+def WriteIMul : SchedWrite; // 32-bit or 64-bit multiply
+def WriteIMul32 : SchedWrite; // 32-bit multiply on RV64I
+def WriteJmp : SchedWrite; // Jump
+def WriteJal : SchedWrite; // Jump and link
+def WriteJalr : SchedWrite; // Jump and link register
+def WriteJmpReg : SchedWrite; // Jump register
+def WriteNop : SchedWrite;
+def WriteLDB : SchedWrite; // Load byte
+def WriteLDH : SchedWrite; // Load half-word
+def WriteLDW : SchedWrite; // Load word
+def WriteLDWU : SchedWrite; // Load word unsigned
+def WriteLDD : SchedWrite; // Load double-word
+def WriteCSR : SchedWrite; // CSR instructions
+def WriteSTB : SchedWrite; // Store byte
+def WriteSTH : SchedWrite; // Store half-word
+def WriteSTW : SchedWrite; // Store word
+def WriteSTD : SchedWrite; // Store double-word
+def WriteAtomicW : SchedWrite; // Atomic memory operation, word size
+def WriteAtomicD : SchedWrite; // Atomic memory operation, double word size
+def WriteAtomicLDW : SchedWrite; // Atomic load word
+def WriteAtomicLDD : SchedWrite; // Atomic load double word
+def WriteAtomicSTW : SchedWrite; // Atomic store word
+def WriteAtomicSTD : SchedWrite; // Atomic store double word
+def WriteFALU32 : SchedWrite; // FP 32-bit computation
+def WriteFALU64 : SchedWrite; // FP 64-bit computation
+def WriteFMul32 : SchedWrite; // 32-bit floating point multiply
+def WriteFMulAdd32 : SchedWrite; // 32-bit floating point multiply add
+def WriteFMulSub32 : SchedWrite; // 32-bit floating point multiply sub
+def WriteFMul64 : SchedWrite; // 64-bit floating point multiply
+def WriteFMulAdd64 : SchedWrite; // 64-bit floating point multiply add
+def WriteFMulSub64 : SchedWrite; // 64-bit floating point multiply sub
+def WriteFDiv32 : SchedWrite; // 32-bit floating point divide
+def WriteFDiv64 : SchedWrite; // 64-bit floating point divide
+def WriteFSqrt32 : SchedWrite; // 32-bit floating point sqrt
+def WriteFSqrt64 : SchedWrite; // 64-bit floating point sqrt
+
+// Integer to float conversions
+def WriteFCvtI32ToF32 : SchedWrite;
+def WriteFCvtI32ToF64 : SchedWrite;
+def WriteFCvtI64ToF32 : SchedWrite; // RV64I only
+def WriteFCvtI64ToF64 : SchedWrite; // RV64I only
+
+// Float to integer conversions
+def WriteFCvtF32ToI32 : SchedWrite;
+def WriteFCvtF32ToI64 : SchedWrite; // RV64I only
+def WriteFCvtF64ToI32 : SchedWrite;
+def WriteFCvtF64ToI64 : SchedWrite; // RV64I only
+
+// Float to float conversions
+def WriteFCvtF32ToF64 : SchedWrite;
+def WriteFCvtF64ToF32 : SchedWrite;
+
+def WriteFConv32 : SchedWrite; // 32-bit floating point convert
+def WriteFConv64 : SchedWrite; // 64-bit floating point convert
+def WriteFClass32 : SchedWrite; // 32-bit floating point classify
+def WriteFClass64 : SchedWrite; // 64-bit floating point classify
+def WriteFCmp32 : SchedWrite; // 32-bit floating point compare
+def WriteFCmp64 : SchedWrite; // 64-bit floating point compare
+def WriteFSGNJ32 : SchedWrite; // 32-bit floating point sign-injection
+def WriteFSGNJ64 : SchedWrite; // 64-bit floating point sign-injection
+def WriteFMinMax32 : SchedWrite; // 32-bit floating point min or max
+def WriteFMinMax64 : SchedWrite; // 64-bit floating point min or max
+
+def WriteFMovF32ToI32 : SchedWrite;
+def WriteFMovI32ToF32 : SchedWrite;
+def WriteFMovF64ToI64 : SchedWrite; // RV64I only
+def WriteFMovI64ToF64 : SchedWrite; // RV64I only
+
+def WriteFMov32 : SchedWrite; // 32-bit floating point move
+def WriteFMov64 : SchedWrite; // 64-bit floating point move
+def WriteFLD32 : SchedWrite; // Floating point single-precision load
+def WriteFLD64 : SchedWrite; // Floating point double-precision load
+def WriteFST32 : SchedWrite; // Floating point single-precision store
+def WriteFST64 : SchedWrite; // Floating point double-precision store
+
+/// Define scheduler resources associated with use operands.
+def ReadJmp : SchedRead;
+def ReadJalr : SchedRead;
+def ReadCSR : SchedRead;
+def ReadMemBase : SchedRead;
+def ReadFMemBase : SchedRead;
+def ReadStoreData : SchedRead;
+def ReadIALU : SchedRead;
+def ReadIALU32 : SchedRead; // 32-bit integer ALU operations on RV64I
+def ReadShift : SchedRead;
+def ReadShift32 : SchedRead; // 32-bit shift operations on RV64I
+def ReadIDiv : SchedRead;
+def ReadIDiv32 : SchedRead;
+def ReadIMul : SchedRead;
+def ReadIMul32 : SchedRead;
+def ReadAtomicWA : SchedRead;
+def ReadAtomicWD : SchedRead;
+def ReadAtomicDA : SchedRead;
+def ReadAtomicDD : SchedRead;
+def ReadAtomicLDW : SchedRead; // Atomic load word
+def ReadAtomicLDD : SchedRead; // Atomic load double word
+def ReadAtomicSTW : SchedRead; // Atomic store word
+def ReadAtomicSTD : SchedRead; // Atomic store double word
+def ReadFALU32 : SchedRead; // FP 32-bit computation
+def ReadFALU64 : SchedRead; // FP 64-bit computation
+def ReadFMul32 : SchedRead; // 32-bit floating point multiply
+def ReadFMulAdd32 : SchedRead; // 32-bit floating point multiply add
+def ReadFMulSub32 : SchedRead; // 32-bit floating point multiply sub
+def ReadFMul64 : SchedRead; // 64-bit floating point multiply
+def ReadFMulAdd64 : SchedRead; // 64-bit floating point multiply add
+def ReadFMulSub64 : SchedRead; // 64-bit floating point multiply sub
+def ReadFDiv32 : SchedRead; // 32-bit floating point divide
+def ReadFDiv64 : SchedRead; // 64-bit floating point divide
+def ReadFSqrt32 : SchedRead; // 32-bit floating point sqrt
+def ReadFSqrt64 : SchedRead; // 64-bit floating point sqrt
+def ReadFCmp32 : SchedRead;
+def ReadFCmp64 : SchedRead;
+def ReadFSGNJ32 : SchedRead;
+def ReadFSGNJ64 : SchedRead;
+def ReadFMinMax32 : SchedRead;
+def ReadFMinMax64 : SchedRead;
+def ReadFCvtF32ToI32 : SchedRead;
+def ReadFCvtF32ToI64 : SchedRead;
+def ReadFCvtF64ToI32 : SchedRead;
+def ReadFCvtF64ToI64 : SchedRead;
+def ReadFCvtI32ToF32 : SchedRead;
+def ReadFCvtI32ToF64 : SchedRead;
+def ReadFCvtI64ToF32 : SchedRead;
+def ReadFCvtI64ToF64 : SchedRead;
+def ReadFMovF32ToI32 : SchedRead;
+def ReadFMovI32ToF32 : SchedRead;
+def ReadFMovF64ToI64 : SchedRead;
+def ReadFMovI64ToF64 : SchedRead;
+def ReadFCvtF32ToF64 : SchedRead;
+def ReadFCvtF64ToF32 : SchedRead;
+def ReadFClass32 : SchedRead;
+def ReadFClass64 : SchedRead;
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
index 83e7e2d52cc1..47a48c820a29 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.cpp
@@ -33,7 +33,7 @@ RISCVSubtarget &RISCVSubtarget::initializeSubtargetDependencies(
const Triple &TT, StringRef CPU, StringRef FS, StringRef ABIName) {
// Determine default and user-specified characteristics
bool Is64Bit = TT.isArch64Bit();
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = Is64Bit ? "generic-rv64" : "generic-rv32";
ParseSubtargetFeatures(CPUName, FS);
diff --git a/llvm/lib/Target/RISCV/RISCVSubtarget.h b/llvm/lib/Target/RISCV/RISCVSubtarget.h
index 605d4abcc9ae..fe1285f23b15 100644
--- a/llvm/lib/Target/RISCV/RISCVSubtarget.h
+++ b/llvm/lib/Target/RISCV/RISCVSubtarget.h
@@ -39,10 +39,23 @@ class RISCVSubtarget : public RISCVGenSubtargetInfo {
bool HasStdExtF = false;
bool HasStdExtD = false;
bool HasStdExtC = false;
+ bool HasStdExtB = false;
+ bool HasStdExtZbb = false;
+ bool HasStdExtZbc = false;
+ bool HasStdExtZbe = false;
+ bool HasStdExtZbf = false;
+ bool HasStdExtZbm = false;
+ bool HasStdExtZbp = false;
+ bool HasStdExtZbr = false;
+ bool HasStdExtZbs = false;
+ bool HasStdExtZbt = false;
+ bool HasStdExtZbproposedc = false;
+ bool HasStdExtV = false;
bool HasRV64 = false;
bool IsRV32E = false;
bool EnableLinkerRelax = false;
- bool EnableRVCHintInstrs = false;
+ bool EnableRVCHintInstrs = true;
+ bool EnableSaveRestore = false;
unsigned XLen = 32;
MVT XLenVT = MVT::i32;
RISCVABI::ABI TargetABI = RISCVABI::ABI_Unknown;
@@ -87,10 +100,23 @@ public:
bool hasStdExtF() const { return HasStdExtF; }
bool hasStdExtD() const { return HasStdExtD; }
bool hasStdExtC() const { return HasStdExtC; }
+ bool hasStdExtB() const { return HasStdExtB; }
+ bool hasStdExtZbb() const { return HasStdExtZbb; }
+ bool hasStdExtZbc() const { return HasStdExtZbc; }
+ bool hasStdExtZbe() const { return HasStdExtZbe; }
+ bool hasStdExtZbf() const { return HasStdExtZbf; }
+ bool hasStdExtZbm() const { return HasStdExtZbm; }
+ bool hasStdExtZbp() const { return HasStdExtZbp; }
+ bool hasStdExtZbr() const { return HasStdExtZbr; }
+ bool hasStdExtZbs() const { return HasStdExtZbs; }
+ bool hasStdExtZbt() const { return HasStdExtZbt; }
+ bool hasStdExtZbproposedc() const { return HasStdExtZbproposedc; }
+ bool hasStdExtV() const { return HasStdExtV; }
bool is64Bit() const { return HasRV64; }
bool isRV32E() const { return IsRV32E; }
bool enableLinkerRelax() const { return EnableLinkerRelax; }
bool enableRVCHintInstrs() const { return EnableRVCHintInstrs; }
+ bool enableSaveRestore() const { return EnableSaveRestore; }
MVT getXLenVT() const { return XLenVT; }
unsigned getXLen() const { return XLen; }
RISCVABI::ABI getTargetABI() const { return TargetABI; }
diff --git a/llvm/lib/Target/RISCV/RISCVSystemOperands.td b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
index a46a32c4e7f2..8e75647bd4a9 100644
--- a/llvm/lib/Target/RISCV/RISCVSystemOperands.td
+++ b/llvm/lib/Target/RISCV/RISCVSystemOperands.td
@@ -20,6 +20,8 @@ include "llvm/TableGen/SearchableTable.td"
class SysReg<string name, bits<12> op> {
string Name = name;
bits<12> Encoding = op;
+ // A maximum of one alias is supported right now.
+ string AltName = name;
// FIXME: add these additional fields when needed.
// Privilege Access: Read and Write = 0, 1, 2; Read-Only = 3.
// Privilege Mode: User = 0, System = 1 or Machine = 3.
@@ -36,7 +38,7 @@ class SysReg<string name, bits<12> op> {
def SysRegsList : GenericTable {
let FilterClass = "SysReg";
// FIXME: add "ReadWrite", "Mode", "Extra", "Number" fields when needed.
- let Fields = [ "Name", "Encoding", "FeaturesRequired", "isRV32Only" ];
+ let Fields = [ "Name", "Encoding", "AltName", "FeaturesRequired", "isRV32Only" ];
let PrimaryKey = [ "Encoding" ];
let PrimaryKeyName = "lookupSysRegByEncoding";
@@ -47,6 +49,11 @@ def lookupSysRegByName : SearchIndex {
let Key = [ "Name" ];
}
+def lookupSysRegByAltName : SearchIndex {
+ let Table = SysRegsList;
+ let Key = [ "AltName" ];
+}
+
// The following CSR encodings match those given in Tables 2.2,
// 2.3, 2.4 and 2.5 in the RISC-V Instruction Set Manual
// Volume II: Privileged Architecture.
@@ -303,6 +310,7 @@ def: SysReg<"mhpmcounter31h", 0xB9F>;
//===--------------------------
// Machine Counter Setup
//===--------------------------
+def : SysReg<"mcountinhibit", 0x320>;
def : SysReg<"mhpmevent3", 0x323>;
def : SysReg<"mhpmevent4", 0x324>;
def : SysReg<"mhpmevent5", 0x325>;
@@ -346,4 +354,19 @@ def : SysReg<"tdata3", 0x7A3>;
//===-----------------------------------------------
def : SysReg<"dcsr", 0x7B0>;
def : SysReg<"dpc", 0x7B1>;
-def : SysReg<"dscratch", 0x7B2>;
+
+// "dscratch" is an alternative name for "dscratch0"; the old name appeared
+// in earlier drafts of the RISC-V debug spec.
+let AltName = "dscratch" in
+def : SysReg<"dscratch0", 0x7B2>;
+def : SysReg<"dscratch1", 0x7B3>;
+
+//===-----------------------------------------------
+// User Vector CSRs
+//===-----------------------------------------------
+def : SysReg<"vstart", 0x008>;
+def : SysReg<"vxsat", 0x009>;
+def : SysReg<"vxrm", 0x00A>;
+def : SysReg<"vl", 0xC20>;
+def : SysReg<"vtype", 0xC21>;
+def : SysReg<"vlenb", 0xC22>;
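A minimal sketch (not part of the patch) of how the new alias support could be consumed on the C++ side, assuming the usual SearchableTable output, i.e. generated lookupSysRegByName/lookupSysRegByAltName functions returning const RISCVSysReg::SysReg *:

#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/StringRef.h"

// Hypothetical helper: map a textual CSR name (current or legacy spelling)
// to its 12-bit encoding, or return ~0u if the name is unknown.
static unsigned resolveCSREncoding(llvm::StringRef Name) {
  using namespace llvm::RISCVSysReg;
  const SysReg *Reg = lookupSysRegByName(Name);
  if (!Reg)
    Reg = lookupSysRegByAltName(Name); // e.g. legacy "dscratch" -> dscratch0 (0x7B2)
  return Reg ? Reg->Encoding : ~0u;
}
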
diff --git a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
index 2bb26988c7da..75683e2fd8e9 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetMachine.cpp
@@ -15,6 +15,7 @@
#include "RISCVTargetObjectFile.h"
#include "RISCVTargetTransformInfo.h"
#include "TargetInfo/RISCVTargetInfo.h"
+#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -89,8 +90,17 @@ RISCVTargetMachine::getSubtargetImpl(const Function &F) const {
// creation will depend on the TM and the code generation flags on the
// function that reside in TargetOptions.
resetTargetOptions(F);
- I = std::make_unique<RISCVSubtarget>(TargetTriple, CPU, FS,
- Options.MCOptions.getABIName(), *this);
+ auto ABIName = Options.MCOptions.getABIName();
+ if (const MDString *ModuleTargetABI = dyn_cast_or_null<MDString>(
+ F.getParent()->getModuleFlag("target-abi"))) {
+ auto TargetABI = RISCVABI::getTargetABI(ABIName);
+ if (TargetABI != RISCVABI::ABI_Unknown &&
+ ModuleTargetABI->getString() != ABIName) {
+ report_fatal_error("-target-abi option != target-abi module flag");
+ }
+ ABIName = ModuleTargetABI->getString();
+ }
+ I = std::make_unique<RISCVSubtarget>(TargetTriple, CPU, FS, ABIName, *this);
}
return I.get();
}
@@ -118,6 +128,7 @@ public:
bool addGlobalInstructionSelect() override;
void addPreEmitPass() override;
void addPreEmitPass2() override;
+ void addPreSched2() override;
void addPreRegAlloc() override;
};
}
@@ -157,13 +168,16 @@ bool RISCVPassConfig::addGlobalInstructionSelect() {
return false;
}
+void RISCVPassConfig::addPreSched2() {}
+
void RISCVPassConfig::addPreEmitPass() { addPass(&BranchRelaxationPassID); }
void RISCVPassConfig::addPreEmitPass2() {
+ addPass(createRISCVExpandPseudoPass());
// Schedule the expansion of AMOs at the last possible moment, avoiding the
// possibility for other passes to break the requirements for forward
// progress in the LR/SC block.
- addPass(createRISCVExpandPseudoPass());
+ addPass(createRISCVExpandAtomicPseudoPass());
}
void RISCVPassConfig::addPreRegAlloc() {
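The "target-abi" module flag read by the getSubtargetImpl() change above is produced by the frontend, not by this patch. A minimal sketch of the producer side, assuming the standard Module::addModuleFlag/MDString::get APIs:

#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Module.h"

// Hypothetical producer: record the ABI the module was built for so that the
// backend check above can reject a mismatching -target-abi option.
static void tagModuleABI(llvm::Module &M, llvm::StringRef ABIName) {
  M.addModuleFlag(llvm::Module::Error, "target-abi",
                  llvm::MDString::get(M.getContext(), ABIName));
}
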
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
index bbd45c970d3d..fba86b463764 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.cpp
@@ -17,7 +17,6 @@ using namespace llvm;
void RISCVELFTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
SmallDataSection = getContext().getELFSection(
".sdata", ELF::SHT_PROGBITS, ELF::SHF_WRITE | ELF::SHF_ALLOC);
@@ -105,10 +104,11 @@ bool RISCVELFTargetObjectFile::isConstantInSmallSection(
MCSection *RISCVELFTargetObjectFile::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
if (isConstantInSmallSection(DL, C))
return SmallDataSection;
// Otherwise, we work the same as ELF.
- return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C, Align);
+ return TargetLoweringObjectFileELF::getSectionForConstant(DL, Kind, C,
+ Alignment);
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
index b2daaaa9d364..830a7d813c15 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetObjectFile.h
@@ -12,7 +12,6 @@
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
namespace llvm {
-class RISCVTargetMachine;
/// This implementation is used for RISCV ELF targets.
class RISCVELFTargetObjectFile : public TargetLoweringObjectFileELF {
@@ -36,7 +35,7 @@ public:
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C,
- unsigned &Align) const override;
+ Align &Alignment) const override;
void getModuleMetadata(Module &M) override;
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
index 90fcd679c523..bd78f801c59a 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp
@@ -15,7 +15,8 @@ using namespace llvm;
#define DEBUG_TYPE "riscvtti"
-int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
"getIntImmCost can only estimate cost of materialising integers");
@@ -30,7 +31,7 @@ int RISCVTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+ Type *Ty, TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy() &&
"getIntImmCost can only estimate cost of materialising integers");
@@ -78,7 +79,7 @@ int RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &
}
// Otherwise, use the full materialisation cost.
- return getIntImmCost(Imm, Ty);
+ return getIntImmCost(Imm, Ty, CostKind);
}
// By default, prevent hoisting.
@@ -86,7 +87,8 @@ int RISCVTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &
}
int RISCVTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
// Prevent hoisting in unknown cases.
return TTI::TCC_Free;
}
diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
index d219ba81bb56..392700707760 100644
--- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
+++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.h
@@ -41,12 +41,13 @@ public:
: BaseT(TM, F.getParent()->getDataLayout()), ST(TM->getSubtargetImpl(F)),
TLI(ST->getTargetLowering()) {}
- int getIntImmCost(const APInt &Imm, Type *Ty);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
};
} // end namespace llvm
-#endif // LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H \ No newline at end of file
+#endif // LLVM_LIB_TARGET_RISCV_RISCVTARGETTRANSFORMINFO_H
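A minimal caller-side sketch (not part of the patch) for the reworked cost hooks, assuming the TargetTransformInfo facade threads the same TargetCostKind argument through to the implementation above:

#include "llvm/ADT/APInt.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Type.h"

// Hypothetical query: materialisation cost of a 64-bit immediate, using
// TCK_RecipThroughput to mirror the previous default behaviour.
static int immMaterialisationCost(const llvm::TargetTransformInfo &TTI,
                                  llvm::LLVMContext &Ctx, uint64_t Value) {
  llvm::APInt Imm(64, Value);
  llvm::Type *Int64Ty = llvm::Type::getInt64Ty(Ctx);
  return TTI.getIntImmCost(Imm, Int64Ty,
                           llvm::TargetTransformInfo::TCK_RecipThroughput);
}
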
diff --git a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
index 432ebb294d46..43b1f8b80c5f 100644
--- a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
+++ b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.cpp
@@ -12,16 +12,7 @@ namespace RISCVSysReg {
namespace RISCVABI {
ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
StringRef ABIName) {
- auto TargetABI = StringSwitch<ABI>(ABIName)
- .Case("ilp32", ABI_ILP32)
- .Case("ilp32f", ABI_ILP32F)
- .Case("ilp32d", ABI_ILP32D)
- .Case("ilp32e", ABI_ILP32E)
- .Case("lp64", ABI_LP64)
- .Case("lp64f", ABI_LP64F)
- .Case("lp64d", ABI_LP64D)
- .Default(ABI_Unknown);
-
+ auto TargetABI = getTargetABI(ABIName);
bool IsRV64 = TT.isArch64Bit();
bool IsRV32E = FeatureBits[RISCV::FeatureRV32E];
@@ -58,6 +49,19 @@ ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
return ABI_ILP32;
}
+ABI getTargetABI(StringRef ABIName) {
+ auto TargetABI = StringSwitch<ABI>(ABIName)
+ .Case("ilp32", ABI_ILP32)
+ .Case("ilp32f", ABI_ILP32F)
+ .Case("ilp32d", ABI_ILP32D)
+ .Case("ilp32e", ABI_ILP32E)
+ .Case("lp64", ABI_LP64)
+ .Case("lp64f", ABI_LP64F)
+ .Case("lp64d", ABI_LP64D)
+ .Default(ABI_Unknown);
+ return TargetABI;
+}
+
// To avoid the BP value being clobbered by a function call, we need to choose a
// callee saved register to save the value. RV32E only has X8 and X9 as callee
// saved registers and X8 will be used as fp. So we choose X9 as bp.
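A small usage sketch (not part of the patch) for the extracted helper: anything the StringSwitch above does not recognise falls through to ABI_Unknown, which callers such as the getSubtargetImpl() hunk earlier are expected to check.

#include "Utils/RISCVBaseInfo.h"
#include "llvm/ADT/StringRef.h"

// Hypothetical helper: returns true only for the ABI names the backend knows
// ("ilp32", "ilp32f", "ilp32d", "ilp32e", "lp64", "lp64f", "lp64d").
static bool isKnownRISCVABIName(llvm::StringRef Name) {
  return llvm::RISCVABI::getTargetABI(Name) != llvm::RISCVABI::ABI_Unknown;
}
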
diff --git a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h
index cf078df9609a..4e6cdd8606b1 100644
--- a/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h
+++ b/llvm/lib/Target/RISCV/Utils/RISCVBaseInfo.h
@@ -45,7 +45,7 @@ enum {
InstFormatCJ = 16,
InstFormatOther = 17,
- InstFormatMask = 31
+ InstFormatMask = 31,
};
// RISC-V Specific Machine Operand Flags
@@ -157,6 +157,7 @@ namespace RISCVSysReg {
struct SysReg {
const char *Name;
unsigned Encoding;
+ const char *AltName;
// FIXME: add these additional fields when needed.
// Privilege Access: Read, Write, Read-Only.
// unsigned ReadWrite;
@@ -202,6 +203,8 @@ enum ABI {
ABI computeTargetABI(const Triple &TT, FeatureBitset FeatureBits,
StringRef ABIName);
+ABI getTargetABI(StringRef ABIName);
+
// Returns the register used to hold the stack pointer after realignment.
Register getBPReg();
diff --git a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
index 2d3137f38821..16e159621672 100644
--- a/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
+++ b/llvm/lib/Target/Sparc/AsmParser/SparcAsmParser.cpp
@@ -68,6 +68,8 @@ class SparcAsmParser : public MCTargetAsmParser {
uint64_t &ErrorInfo,
bool MatchingInlineAsm) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool ParseDirective(AsmToken DirectiveID) override;
@@ -600,7 +602,7 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
}
for (const MCInst &I : Instructions) {
- Out.EmitInstruction(I, getSTI());
+ Out.emitInstruction(I, getSTI());
}
return false;
}
@@ -630,20 +632,29 @@ bool SparcAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
bool SparcAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
SMLoc &EndLoc) {
+ if (tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success)
+ return Error(StartLoc, "invalid register name");
+ return false;
+}
+
+OperandMatchResultTy SparcAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
const AsmToken &Tok = Parser.getTok();
StartLoc = Tok.getLoc();
EndLoc = Tok.getEndLoc();
RegNo = 0;
if (getLexer().getKind() != AsmToken::Percent)
- return false;
+ return MatchOperand_Success;
Parser.Lex();
unsigned regKind = SparcOperand::rk_None;
if (matchRegisterName(Tok, RegNo, regKind)) {
Parser.Lex();
- return false;
+ return MatchOperand_Success;
}
- return Error(StartLoc, "invalid register name");
+ getLexer().UnLex(Tok);
+ return MatchOperand_NoMatch;
}
static void applyMnemonicAliases(StringRef &Mnemonic,
diff --git a/llvm/lib/Target/Sparc/LeonFeatures.td b/llvm/lib/Target/Sparc/LeonFeatures.td
index e0ea4e9c7645..75273eff1868 100755
--- a/llvm/lib/Target/Sparc/LeonFeatures.td
+++ b/llvm/lib/Target/Sparc/LeonFeatures.td
@@ -16,9 +16,9 @@
// support for the casa instruction; for the leon3 subtarget only
def UMACSMACSupport : SubtargetFeature<
- "hasumacsmac",
- "HasUmacSmac",
- "true",
+ "hasumacsmac",
+ "HasUmacSmac",
+ "true",
"Enable UMAC and SMAC for LEON3 and LEON4 processors"
>;
@@ -30,9 +30,9 @@ def UMACSMACSupport : SubtargetFeature<
// support for the casa instruction; for the leon3 subtarget only
def LeonCASA : SubtargetFeature<
- "hasleoncasa",
- "HasLeonCasa",
- "true",
+ "hasleoncasa",
+ "HasLeonCasa",
+ "true",
"Enable CASA instruction for LEON3 and LEON4 processors"
>;
@@ -40,7 +40,7 @@ def InsertNOPLoad: SubtargetFeature<
"insertnopload",
"InsertNOPLoad",
"true",
- "LEON3 erratum fix: Insert a NOP instruction after every single-cycle load instruction when the next instruction is another load/store instruction"
+ "LEON3 erratum fix: Insert a NOP instruction after every single-cycle load instruction when the next instruction is another load/store instruction"
>;
def DetectRoundChange : SubtargetFeature<
@@ -55,7 +55,7 @@ def FixAllFDIVSQRT : SubtargetFeature<
"fixallfdivsqrt",
"FixAllFDIVSQRT",
"true",
- "LEON erratum fix: Fix FDIVS/FDIVD/FSQRTS/FSQRTD instructions with NOPs and floating-point store"
+ "LEON erratum fix: Fix FDIVS/FDIVD/FSQRTS/FSQRTD instructions with NOPs and floating-point store"
>;
def LeonCycleCounter
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
index 2e8fa0dbaf4c..83c44e0682ce 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcAsmBackend.cpp
@@ -15,6 +15,7 @@
#include "llvm/MC/MCObjectWriter.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
@@ -270,8 +271,8 @@ namespace {
llvm_unreachable("fixupNeedsRelaxation() unimplemented");
return false;
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
// FIXME.
llvm_unreachable("relaxInstruction() unimplemented");
}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
index 8a673de69911..f6728a070736 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.cpp
@@ -46,7 +46,8 @@ void SparcInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const
void SparcInstPrinter::printInst(const MCInst *MI, uint64_t Address,
StringRef Annot, const MCSubtargetInfo &STI,
raw_ostream &O) {
- if (!printAliasInstr(MI, STI, O) && !printSparcAliasInstr(MI, STI, O))
+ if (!printAliasInstr(MI, Address, STI, O) &&
+ !printSparcAliasInstr(MI, STI, O))
printInstruction(MI, Address, STI, O);
printAnnotation(O, Annot);
}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
index cb85fe98ed42..11587f165ef2 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcInstPrinter.h
@@ -33,10 +33,10 @@ public:
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address,
const MCSubtargetInfo &STI, raw_ostream &O);
- bool printAliasInstr(const MCInst *MI, const MCSubtargetInfo &STI,
- raw_ostream &O);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx,
+ bool printAliasInstr(const MCInst *MI, uint64_t Address,
+ const MCSubtargetInfo &STI, raw_ostream &O);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
const MCSubtargetInfo &STI, raw_ostream &O);
static const char *getRegisterName(unsigned RegNo);
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
index 1a2a040990ae..c5cc2ea34bb7 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCAsmInfo.cpp
@@ -42,8 +42,6 @@ SparcELFMCAsmInfo::SparcELFMCAsmInfo(const Triple &TheTriple) {
SunStyleELFSectionSwitchSyntax = true;
UsesELFSectionDirectiveForBSS = true;
-
- UseIntegratedAssembler = true;
}
const MCExpr*
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
index 7eb27f55baac..fb2bcdc6c91b 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp
@@ -37,7 +37,7 @@ static MCAsmInfo *createSparcMCAsmInfo(const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0);
MAI->addInitialFrameState(Inst);
return MAI;
}
@@ -47,7 +47,7 @@ static MCAsmInfo *createSparcV9MCAsmInfo(const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
MCAsmInfo *MAI = new SparcELFMCAsmInfo(TT);
unsigned Reg = MRI.getDwarfRegNum(SP::O6, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 2047);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 2047);
MAI->addInitialFrameState(Inst);
return MAI;
}
diff --git a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
index e5699bb1c133..f360946b9a79 100644
--- a/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
+++ b/llvm/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.h
@@ -27,10 +27,6 @@ class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
class Target;
-class Triple;
-class StringRef;
-class raw_pwrite_stream;
-class raw_ostream;
MCCodeEmitter *createSparcMCCodeEmitter(const MCInstrInfo &MCII,
const MCRegisterInfo &MRI,
diff --git a/llvm/lib/Target/Sparc/Sparc.h b/llvm/lib/Target/Sparc/Sparc.h
index 967c463f5281..aabc4f149829 100644
--- a/llvm/lib/Target/Sparc/Sparc.h
+++ b/llvm/lib/Target/Sparc/Sparc.h
@@ -21,7 +21,6 @@
namespace llvm {
class FunctionPass;
class SparcTargetMachine;
- class formatted_raw_ostream;
class AsmPrinter;
class MCInst;
class MachineInstr;
diff --git a/llvm/lib/Target/Sparc/Sparc.td b/llvm/lib/Target/Sparc/Sparc.td
index ca6147edc46b..da95602309a1 100644
--- a/llvm/lib/Target/Sparc/Sparc.td
+++ b/llvm/lib/Target/Sparc/Sparc.td
@@ -150,7 +150,7 @@ def : Processor<"ut699", LEON3Itineraries,
[FeatureLeon, InsertNOPLoad, FeatureNoFSMULD, FeatureNoFMULS, FixAllFDIVSQRT]>;
// LEON3 FT (GR712RC). Provides features for the GR712RC processor.
-// - covers all the erratum fixed for LEON3 and support for the CASA instruction.
+// - covers all the erratum fixed for LEON3 and support for the CASA instruction.
def : Processor<"gr712rc", LEON3Itineraries,
[FeatureLeon, LeonCASA]>;
@@ -158,9 +158,9 @@ def : Processor<"gr712rc", LEON3Itineraries,
def : Processor<"leon4", LEON4Itineraries,
[FeatureLeon, UMACSMACSupport, LeonCASA]>;
-// LEON 4 FT (GR740)
+// LEON 4 FT (GR740)
// TODO: Placeholder: processor-specific features will be added here soon.
-def : Processor<"gr740", LEON4Itineraries,
+def : Processor<"gr740", LEON4Itineraries,
[FeatureLeon, UMACSMACSupport, LeonCASA, LeonCycleCounter,
FeaturePWRPSR]>;
diff --git a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
index f0caf3bc284f..069e43c6f544 100644
--- a/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
+++ b/llvm/lib/Target/Sparc/SparcAsmPrinter.cpp
@@ -52,8 +52,8 @@ namespace {
void printMemOperand(const MachineInstr *MI, int opNum, raw_ostream &OS,
const char *Modifier = nullptr);
- void EmitFunctionBodyStart() override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitFunctionBodyStart() override;
+ void emitInstruction(const MachineInstr *MI) override;
static const char *getRegisterName(unsigned RegNo) {
return SparcInstPrinter::getRegisterName(RegNo);
@@ -108,7 +108,7 @@ static void EmitCall(MCStreamer &OutStreamer,
MCInst CallInst;
CallInst.setOpcode(SP::CALL);
CallInst.addOperand(Callee);
- OutStreamer.EmitInstruction(CallInst, STI);
+ OutStreamer.emitInstruction(CallInst, STI);
}
static void EmitSETHI(MCStreamer &OutStreamer,
@@ -119,7 +119,7 @@ static void EmitSETHI(MCStreamer &OutStreamer,
SETHIInst.setOpcode(SP::SETHIi);
SETHIInst.addOperand(RD);
SETHIInst.addOperand(Imm);
- OutStreamer.EmitInstruction(SETHIInst, STI);
+ OutStreamer.emitInstruction(SETHIInst, STI);
}
static void EmitBinary(MCStreamer &OutStreamer, unsigned Opcode,
@@ -131,7 +131,7 @@ static void EmitBinary(MCStreamer &OutStreamer, unsigned Opcode,
Inst.addOperand(RD);
Inst.addOperand(RS1);
Inst.addOperand(Src2);
- OutStreamer.EmitInstruction(Inst, STI);
+ OutStreamer.emitInstruction(Inst, STI);
}
static void EmitOR(MCStreamer &OutStreamer,
@@ -233,15 +233,15 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
// or <MO>, %lo(_GLOBAL_OFFSET_TABLE_+(<EndLabel>-<StartLabel>))), <MO>
// add <MO>, %o7, <MO>
- OutStreamer->EmitLabel(StartLabel);
+ OutStreamer->emitLabel(StartLabel);
MCOperand Callee = createPCXCallOP(EndLabel, OutContext);
EmitCall(*OutStreamer, Callee, STI);
- OutStreamer->EmitLabel(SethiLabel);
+ OutStreamer->emitLabel(SethiLabel);
MCOperand hiImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC22,
GOTLabel, StartLabel, SethiLabel,
OutContext);
EmitSETHI(*OutStreamer, hiImm, MCRegOP, STI);
- OutStreamer->EmitLabel(EndLabel);
+ OutStreamer->emitLabel(EndLabel);
MCOperand loImm = createPCXRelExprOp(SparcMCExpr::VK_Sparc_PC10,
GOTLabel, StartLabel, EndLabel,
OutContext);
@@ -249,8 +249,7 @@ void SparcAsmPrinter::LowerGETPCXAndEmitMCInsts(const MachineInstr *MI,
EmitADD(*OutStreamer, MCRegOP, RegO7, MCRegOP, STI);
}
-void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
-{
+void SparcAsmPrinter::emitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default: break;
@@ -270,7 +269,7 @@ void SparcAsmPrinter::EmitInstruction(const MachineInstr *MI)
} while ((++I != E) && I->isInsideBundle()); // Delay slot check.
}
-void SparcAsmPrinter::EmitFunctionBodyStart() {
+void SparcAsmPrinter::emitFunctionBodyStart() {
if (!MF->getSubtarget<SparcSubtarget>().is64Bit())
return;
diff --git a/llvm/lib/Target/Sparc/SparcCallingConv.td b/llvm/lib/Target/Sparc/SparcCallingConv.td
index 4be432211f1d..db540d6f0c42 100644
--- a/llvm/lib/Target/Sparc/SparcCallingConv.td
+++ b/llvm/lib/Target/Sparc/SparcCallingConv.td
@@ -67,7 +67,7 @@ def RetCC_Sparc32 : CallingConv<[
// bits of an integer register while the float goes in a floating point
// register.
//
-// The difference is encoded in LLVM IR using the inreg atttribute on function
+// The difference is encoded in LLVM IR using the inreg attribute on function
// arguments:
//
// C: void f(float, float);
diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
index 0f74f2bb344c..8d8424641cd9 100644
--- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp
@@ -104,7 +104,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
// rather than reporting an error, as would be sensible. This is
// poor, but fixing that bogosity is going to be a large project.
// For now, just see if it's lied, and report an error here.
- if (!NeedsStackRealignment && MFI.getMaxAlignment() > getStackAlignment())
+ if (!NeedsStackRealignment && MFI.getMaxAlign() > getStackAlign())
report_fatal_error("Function \"" + Twine(MF.getName()) + "\" required "
"stack re-alignment, but LLVM couldn't handle it "
"(probably because it has a dynamic alloca).");
@@ -146,9 +146,7 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
// Finally, ensure that the size is sufficiently aligned for the
// data on the stack.
- if (MFI.getMaxAlignment() > 0) {
- NumBytes = alignTo(NumBytes, MFI.getMaxAlignment());
- }
+ NumBytes = alignTo(NumBytes, MFI.getMaxAlign());
// Update stack size with corrected value.
MFI.setStackSize(NumBytes);
@@ -189,9 +187,10 @@ void SparcFrameLowering::emitPrologue(MachineFunction &MF,
regUnbiased = SP::O6;
// andn %regUnbiased, MaxAlign-1, %regUnbiased
- int MaxAlign = MFI.getMaxAlignment();
+ Align MaxAlign = MFI.getMaxAlign();
BuildMI(MBB, MBBI, dl, TII.get(SP::ANDNri), regUnbiased)
- .addReg(regUnbiased).addImm(MaxAlign - 1);
+ .addReg(regUnbiased)
+ .addImm(MaxAlign.value() - 1U);
if (Bias) {
// add %g1, -BIAS, %o6
@@ -258,9 +257,9 @@ bool SparcFrameLowering::hasFP(const MachineFunction &MF) const {
MFI.isFrameAddressTaken();
}
-
-int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+int SparcFrameLowering::getFrameIndexReference(const MachineFunction &MF,
+ int FI,
+ Register &FrameReg) const {
const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
const MachineFrameInfo &MFI = MF.getFrameInfo();
const SparcRegisterInfo *RegInfo = Subtarget.getRegisterInfo();
diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.h b/llvm/lib/Target/Sparc/SparcFrameLowering.h
index 8e6001da05db..3ec9dc8b85dd 100644
--- a/llvm/lib/Target/Sparc/SparcFrameLowering.h
+++ b/llvm/lib/Target/Sparc/SparcFrameLowering.h
@@ -39,7 +39,7 @@ public:
RegScavenger *RS = nullptr) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
/// targetHandlesStackFrameRounding - Returns true if the target is
/// responsible for rounding up the stack frame (probably at emitPrologue
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
index d853d0608519..116352e08382 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp
@@ -59,23 +59,21 @@ static bool CC_Sparc_Assign_Split_64(unsigned &ValNo, MVT &ValVT,
SP::I0, SP::I1, SP::I2, SP::I3, SP::I4, SP::I5
};
// Try to get first reg.
- if (unsigned Reg = State.AllocateReg(RegList)) {
+ if (Register Reg = State.AllocateReg(RegList)) {
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
} else {
// Assign whole thing in stack.
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(8,4),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(8, Align(4)), LocVT, LocInfo));
return true;
}
// Try to get second reg.
- if (unsigned Reg = State.AllocateReg(RegList))
+ if (Register Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
- State.addLoc(CCValAssign::getCustomMem(ValNo, ValVT,
- State.AllocateStack(4,4),
- LocVT, LocInfo));
+ State.addLoc(CCValAssign::getCustomMem(
+ ValNo, ValVT, State.AllocateStack(4, Align(4)), LocVT, LocInfo));
return true;
}
@@ -88,13 +86,13 @@ static bool CC_Sparc_Assign_Ret_Split_64(unsigned &ValNo, MVT &ValVT,
};
// Try to get first reg.
- if (unsigned Reg = State.AllocateReg(RegList))
+ if (Register Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
return false;
// Try to get second reg.
- if (unsigned Reg = State.AllocateReg(RegList))
+ if (Register Reg = State.AllocateReg(RegList))
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
else
return false;
@@ -112,7 +110,7 @@ static bool CC_Sparc64_Full(unsigned &ValNo, MVT &ValVT,
// Stack space is allocated for all arguments starting from [%fp+BIAS+128].
unsigned size = (LocVT == MVT::f128) ? 16 : 8;
- unsigned alignment = (LocVT == MVT::f128) ? 16 : 8;
+ Align alignment = (LocVT == MVT::f128) ? Align(16) : Align(8);
unsigned Offset = State.AllocateStack(size, alignment);
unsigned Reg = 0;
@@ -152,7 +150,7 @@ static bool CC_Sparc64_Half(unsigned &ValNo, MVT &ValVT,
MVT &LocVT, CCValAssign::LocInfo &LocInfo,
ISD::ArgFlagsTy &ArgFlags, CCState &State) {
assert(LocVT.getSizeInBits() == 32 && "Can't handle non-32 bits locations");
- unsigned Offset = State.AllocateStack(4, 4);
+ unsigned Offset = State.AllocateStack(4, Align(4));
if (LocVT == MVT::f32 && Offset < 16*8) {
// Promote floats to %f0-%f31.
@@ -266,7 +264,7 @@ SparcTargetLowering::LowerReturn_32(SDValue Chain, CallingConv::ID CallConv,
// If the function returns a struct, copy the SRetReturnReg to I0
if (MF.getFunction().hasStructRetAttr()) {
SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
- unsigned Reg = SFI->getSRetReturnReg();
+ Register Reg = SFI->getSRetReturnReg();
if (!Reg)
llvm_unreachable("sret virtual register not created in the entry block");
auto PtrVT = getPointerTy(DAG.getDataLayout());
@@ -431,7 +429,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32(
SDValue FIPtr = DAG.getFrameIndex(FrameIdx, MVT::i32);
LoVal = DAG.getLoad(MVT::i32, dl, Chain, FIPtr, MachinePointerInfo());
} else {
- unsigned loReg = MF.addLiveIn(NextVA.getLocReg(),
+ Register loReg = MF.addLiveIn(NextVA.getLocReg(),
&SP::IntRegsRegClass);
LoVal = DAG.getCopyFromReg(Chain, dl, loReg, MVT::i32);
}
@@ -522,7 +520,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_32(
if (MF.getFunction().hasStructRetAttr()) {
// Copy the SRet Argument to SRetReturnReg.
SparcMachineFunctionInfo *SFI = MF.getInfo<SparcMachineFunctionInfo>();
- unsigned Reg = SFI->getSRetReturnReg();
+ Register Reg = SFI->getSRetReturnReg();
if (!Reg) {
Reg = MF.getRegInfo().createVirtualRegister(&SP::IntRegsRegClass);
SFI->setSRetReturnReg(Reg);
@@ -597,7 +595,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_64(
// All integer register arguments are promoted by the caller to i64.
// Create a virtual register for the promoted live-in value.
- unsigned VReg = MF.addLiveIn(VA.getLocReg(),
+ Register VReg = MF.addLiveIn(VA.getLocReg(),
getRegClassFor(VA.getLocVT()));
SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
@@ -668,7 +666,7 @@ SDValue SparcTargetLowering::LowerFormalArguments_64(
// of how many arguments were actually passed.
SmallVector<SDValue, 8> OutChains;
for (; ArgOffset < 6*8; ArgOffset += 8) {
- unsigned VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
+ Register VReg = MF.addLiveIn(SP::I0 + ArgOffset/8, &SP::I64RegsRegClass);
SDValue VArg = DAG.getCopyFromReg(Chain, DL, VReg, MVT::i64);
int FI = MF.getFrameInfo().CreateFixedObject(8, ArgOffset + ArgArea, true);
auto PtrVT = getPointerTy(MF.getDataLayout());
@@ -692,9 +690,9 @@ SparcTargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
}
static bool hasReturnsTwiceAttr(SelectionDAG &DAG, SDValue Callee,
- ImmutableCallSite CS) {
- if (CS)
- return CS.hasFnAttr(Attribute::ReturnsTwice);
+ const CallBase *Call) {
+ if (Call)
+ return Call->hasFnAttr(Attribute::ReturnsTwice);
const Function *CalleeFn = nullptr;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee)) {
@@ -753,14 +751,14 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
SDValue Arg = OutVals[i];
unsigned Size = Flags.getByValSize();
- unsigned Align = Flags.getByValAlign();
+ Align Alignment = Flags.getNonZeroByValAlign();
if (Size > 0U) {
- int FI = MFI.CreateStackObject(Size, Align, false);
+ int FI = MFI.CreateStackObject(Size, Alignment, false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
SDValue SizeNode = DAG.getConstant(Size, dl, MVT::i32);
- Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Align,
+ Chain = DAG.getMemcpy(Chain, dl, FIPtr, Arg, SizeNode, Alignment,
false, // isVolatile,
(Size <= 32), // AlwaysInline if size <= 32,
false, // isTailCall
@@ -931,12 +929,12 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// stuck together.
SDValue InFlag;
for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
- unsigned Reg = toCallerWindow(RegsToPass[i].first);
+ Register Reg = toCallerWindow(RegsToPass[i].first);
Chain = DAG.getCopyToReg(Chain, dl, Reg, RegsToPass[i].second, InFlag);
InFlag = Chain.getValue(1);
}
- bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+ bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CB);
// If the callee is a GlobalAddress node (quite common, every direct call is)
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
@@ -1018,7 +1016,7 @@ SparcTargetLowering::LowerCall_32(TargetLowering::CallLoweringInfo &CLI,
// this table could be generated automatically from RegInfo.
Register SparcTargetLowering::getRegisterByName(const char* RegName, LLT VT,
const MachineFunction &MF) const {
- Register Reg = StringSwitch<unsigned>(RegName)
+ Register Reg = StringSwitch<Register>(RegName)
.Case("i0", SP::I0).Case("i1", SP::I1).Case("i2", SP::I2).Case("i3", SP::I3)
.Case("i4", SP::I4).Case("i5", SP::I5).Case("i6", SP::I6).Case("i7", SP::I7)
.Case("o0", SP::O0).Case("o1", SP::O1).Case("o2", SP::O2).Case("o3", SP::O3)
@@ -1060,7 +1058,7 @@ static void fixupVariableFloatArgs(SmallVectorImpl<CCValAssign> &ArgLocs,
CCValAssign NewVA;
// Determine the offset into the argument array.
- unsigned firstReg = (ValTy == MVT::f64) ? SP::D0 : SP::Q0;
+ Register firstReg = (ValTy == MVT::f64) ? SP::D0 : SP::Q0;
unsigned argSize = (ValTy == MVT::f64) ? 8 : 16;
unsigned Offset = argSize * (VA.getLocReg() - firstReg);
assert(Offset < 16*8 && "Offset out of range, bad register enum?");
@@ -1127,7 +1125,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Collect the set of registers to pass to the function and their values.
// This will be emitted as a sequence of CopyToReg nodes glued to the call
// instruction.
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
// Collect chains from all the memory operations that copy arguments to the
// stack. They must follow the stack pointer adjustment above and precede the
@@ -1243,7 +1241,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
// Likewise ExternalSymbol -> TargetExternalSymbol.
SDValue Callee = CLI.Callee;
- bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CS);
+ bool hasReturnsTwice = hasReturnsTwiceAttr(DAG, Callee, CLI.CB);
unsigned TF = isPositionIndependent() ? SparcMCExpr::VK_Sparc_WPLT30 : 0;
if (GlobalAddressSDNode *G = dyn_cast<GlobalAddressSDNode>(Callee))
Callee = DAG.getTargetGlobalAddress(G->getGlobal(), DL, PtrVT, 0, TF);
@@ -1292,7 +1290,7 @@ SparcTargetLowering::LowerCall_64(TargetLowering::CallLoweringInfo &CLI,
// Set inreg flag manually for codegen generated library calls that
// return float.
- if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CS)
+ if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
CLI.Ins[0].Flags.setInReg();
RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_Sparc64);
@@ -1467,6 +1465,7 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
// Turn FP extload into load/fpextend
for (MVT VT : MVT::fp_valuetypes()) {
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f32, Expand);
setLoadExtAction(ISD::EXTLOAD, VT, MVT::f64, Expand);
}
@@ -1476,6 +1475,8 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
// Turn FP truncstore into trunc + store.
+ setTruncStoreAction(MVT::f32, MVT::f16, Expand);
+ setTruncStoreAction(MVT::f64, MVT::f16, Expand);
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f32, Expand);
setTruncStoreAction(MVT::f128, MVT::f64, Expand);
@@ -1517,6 +1518,12 @@ SparcTargetLowering::SparcTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::UINT_TO_FP, MVT::i64, Custom);
+ // Lower f16 conversion operations into library calls
+ setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
+ setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
+
setOperationAction(ISD::BITCAST, MVT::f32, Expand);
setOperationAction(ISD::BITCAST, MVT::i32, Expand);
@@ -1906,10 +1913,8 @@ SDValue SparcTargetLowering::withTargetFlags(SDValue Op, unsigned TF,
GA->getOffset(), TF);
if (const ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(Op))
- return DAG.getTargetConstantPool(CP->getConstVal(),
- CP->getValueType(0),
- CP->getAlignment(),
- CP->getOffset(), TF);
+ return DAG.getTargetConstantPool(CP->getConstVal(), CP->getValueType(0),
+ CP->getAlign(), CP->getOffset(), TF);
if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
return DAG.getTargetBlockAddress(BA->getBlockAddress(),
@@ -2131,7 +2136,7 @@ SDValue SparcTargetLowering::LowerF128_LibCallArg(SDValue Chain,
if (ArgTy->isFP128Ty()) {
// Create a stack object and pass the pointer to the library function.
- int FI = MFI.CreateStackObject(16, 8, false);
+ int FI = MFI.CreateStackObject(16, Align(8), false);
SDValue FIPtr = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
Chain = DAG.getStore(Chain, DL, Entry.Node, FIPtr, MachinePointerInfo(),
/* Alignment = */ 8);
@@ -2162,7 +2167,7 @@ SparcTargetLowering::LowerF128Op(SDValue Op, SelectionDAG &DAG,
if (RetTy->isFP128Ty()) {
// Create a Stack Object to receive the return value of type f128.
ArgListEntry Entry;
- int RetFI = MFI.CreateStackObject(16, 8, false);
+ int RetFI = MFI.CreateStackObject(16, Align(8), false);
RetPtr = DAG.getFrameIndex(RetFI, PtrVT);
Entry.Node = RetPtr;
Entry.Ty = PointerType::getUnqual(RetTy);
@@ -2239,54 +2244,54 @@ SDValue SparcTargetLowering::LowerF128Compare(SDValue LHS, SDValue RHS,
switch(SPCC) {
default: {
- SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_UL : {
SDValue Mask = DAG.getConstant(1, DL, Result.getValueType());
Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
- SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_ULE: {
- SDValue RHS = DAG.getTargetConstant(2, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(2, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_UG : {
- SDValue RHS = DAG.getTargetConstant(1, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(1, DL, Result.getValueType());
SPCC = SPCC::ICC_G;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_UGE: {
- SDValue RHS = DAG.getTargetConstant(1, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(1, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_U : {
- SDValue RHS = DAG.getTargetConstant(3, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(3, DL, Result.getValueType());
SPCC = SPCC::ICC_E;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_O : {
- SDValue RHS = DAG.getTargetConstant(3, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(3, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_LG : {
SDValue Mask = DAG.getConstant(3, DL, Result.getValueType());
Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
- SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_NE;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
case SPCC::FCC_UE : {
SDValue Mask = DAG.getConstant(3, DL, Result.getValueType());
Result = DAG.getNode(ISD::AND, DL, Result.getValueType(), Result, Mask);
- SDValue RHS = DAG.getTargetConstant(0, DL, Result.getValueType());
+ SDValue RHS = DAG.getConstant(0, DL, Result.getValueType());
SPCC = SPCC::ICC_E;
return DAG.getNode(SPISD::CMPICC, DL, MVT::Glue, Result, RHS);
}
@@ -2544,15 +2549,16 @@ static SDValue LowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG,
const SparcSubtarget *Subtarget) {
SDValue Chain = Op.getOperand(0); // Legalize the chain.
SDValue Size = Op.getOperand(1); // Legalize the size.
- unsigned Align = cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue();
- unsigned StackAlign = Subtarget->getFrameLowering()->getStackAlignment();
+ MaybeAlign Alignment =
+ cast<ConstantSDNode>(Op.getOperand(2))->getMaybeAlignValue();
+ Align StackAlign = Subtarget->getFrameLowering()->getStackAlign();
EVT VT = Size->getValueType(0);
SDLoc dl(Op);
// TODO: implement over-aligned alloca. (Note: this also implies
// supporting over-aligned function frames + dynamic allocations,
// which currently isn't supported at all.)
- if (Align > StackAlign) {
+ if (Alignment && *Alignment > StackAlign) {
const MachineFunction &MF = DAG.getMachineFunction();
report_fatal_error("Function \"" + Twine(MF.getName()) + "\": "
"over-aligned dynamic alloca not supported.");
diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h
index 2838ca4bdc66..c6d0011b88a5 100644
--- a/llvm/lib/Target/Sparc/SparcISelLowering.h
+++ b/llvm/lib/Target/Sparc/SparcISelLowering.h
@@ -103,14 +103,14 @@ namespace llvm {
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
return SP::I0;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
return SP::I1;
}
diff --git a/llvm/lib/Target/Sparc/SparcInstrAliases.td b/llvm/lib/Target/Sparc/SparcInstrAliases.td
index d4d056ea0af6..4a0e8c856f27 100644
--- a/llvm/lib/Target/Sparc/SparcInstrAliases.td
+++ b/llvm/lib/Target/Sparc/SparcInstrAliases.td
@@ -281,7 +281,7 @@ defm : int_cond_alias<"pos", 0b1110>;
defm : int_cond_alias<"neg", 0b0110>;
defm : int_cond_alias<"vc", 0b1111>;
defm : int_cond_alias<"vs", 0b0111>;
-let EmitPriority = 0 in
+let EmitPriority = 0 in
{
defm : int_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
defm : int_cond_alias<"nz", 0b1001>; // same as ne
@@ -306,7 +306,7 @@ defm : fp_cond_alias<"uge", 0b1100>;
defm : fp_cond_alias<"le", 0b1101>;
defm : fp_cond_alias<"ule", 0b1110>;
defm : fp_cond_alias<"o", 0b1111>;
-let EmitPriority = 0 in
+let EmitPriority = 0 in
{
defm : fp_cond_alias<"", 0b1000>; // same as a; gnu asm, not in manual
defm : fp_cond_alias<"nz", 0b0001>; // same as ne
diff --git a/llvm/lib/Target/Sparc/SparcInstrFormats.td b/llvm/lib/Target/Sparc/SparcInstrFormats.td
index fbf08b49d60c..2d8f063f7ed1 100644
--- a/llvm/lib/Target/Sparc/SparcInstrFormats.td
+++ b/llvm/lib/Target/Sparc/SparcInstrFormats.td
@@ -24,7 +24,7 @@ class InstSP<dag outs, dag ins, string asmstr, list<dag> pattern,
let DecoderNamespace = "Sparc";
field bits<32> SoftFail = 0;
-
+
let Itinerary = itin;
}
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
index 31185aa508af..dc3a41c63098 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.cpp
@@ -393,7 +393,7 @@ void SparcInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void SparcInstrInfo::
storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill, int FI,
+ Register SrcReg, bool isKill, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -403,7 +403,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const MachineFrameInfo &MFI = MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
// On the order of operands here: think "[FrameIdx + 0] = SrcReg".
if (RC == &SP::I64RegsRegClass)
@@ -432,7 +432,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
void SparcInstrInfo::
loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
- unsigned DestReg, int FI,
+ Register DestReg, int FI,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL;
@@ -442,7 +442,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
const MachineFrameInfo &MFI = MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
if (RC == &SP::I64RegsRegClass)
BuildMI(MBB, I, DL, get(SP::LDXri), DestReg).addFrameIndex(FI).addImm(0)
@@ -468,11 +468,10 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
llvm_unreachable("Can't load this register from stack slot");
}
-unsigned SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const
-{
+Register SparcInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
SparcMachineFunctionInfo *SparcFI = MF->getInfo<SparcMachineFunctionInfo>();
- unsigned GlobalBaseReg = SparcFI->getGlobalBaseReg();
- if (GlobalBaseReg != 0)
+ Register GlobalBaseReg = SparcFI->getGlobalBaseReg();
+ if (GlobalBaseReg)
return GlobalBaseReg;
// Insert the set of GlobalBaseReg into the first MBB of the function
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.h b/llvm/lib/Target/Sparc/SparcInstrInfo.h
index f0b3dde6dec3..b25de8e5a690 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.h
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.h
@@ -86,17 +86,17 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
- unsigned getGlobalBaseReg(MachineFunction *MF) const;
+ Register getGlobalBaseReg(MachineFunction *MF) const;
// Lower pseudo instructions after register allocation.
bool expandPostRAPseudo(MachineInstr &MI) const override;
diff --git a/llvm/lib/Target/Sparc/SparcInstrInfo.td b/llvm/lib/Target/Sparc/SparcInstrInfo.td
index f26f4a1c1a84..8b01313c7911 100644
--- a/llvm/lib/Target/Sparc/SparcInstrInfo.td
+++ b/llvm/lib/Target/Sparc/SparcInstrInfo.td
@@ -27,12 +27,12 @@ def Is32Bit : Predicate<"!Subtarget->is64Bit()">;
def Is64Bit : Predicate<"Subtarget->is64Bit()">;
def UseSoftMulDiv : Predicate<"Subtarget->useSoftMulDiv()">,
- AssemblerPredicate<"FeatureSoftMulDiv">;
+ AssemblerPredicate<(all_of FeatureSoftMulDiv)>;
// HasV9 - This predicate is true when the target processor supports V9
// instructions. Note that the machine may be running in 32-bit mode.
def HasV9 : Predicate<"Subtarget->isV9()">,
- AssemblerPredicate<"FeatureV9">;
+ AssemblerPredicate<(all_of FeatureV9)>;
// HasNoV9 - This predicate is true when the target doesn't have V9
// instructions. Use of this is just a hack for the isel not having proper
@@ -41,11 +41,11 @@ def HasNoV9 : Predicate<"!Subtarget->isV9()">;
// HasVIS - This is true when the target processor has VIS extensions.
def HasVIS : Predicate<"Subtarget->isVIS()">,
- AssemblerPredicate<"FeatureVIS">;
+ AssemblerPredicate<(all_of FeatureVIS)>;
def HasVIS2 : Predicate<"Subtarget->isVIS2()">,
- AssemblerPredicate<"FeatureVIS2">;
+ AssemblerPredicate<(all_of FeatureVIS2)>;
def HasVIS3 : Predicate<"Subtarget->isVIS3()">,
- AssemblerPredicate<"FeatureVIS3">;
+ AssemblerPredicate<(all_of FeatureVIS3)>;
// HasHardQuad - This is true when the target processor supports quad floating
// point instructions.
@@ -58,7 +58,7 @@ def HasLeonCASA : Predicate<"Subtarget->hasLeonCasa()">;
// HasPWRPSR - This is true when the target processor supports partial
// writes to the PSR register that only affects the ET field.
def HasPWRPSR : Predicate<"Subtarget->hasPWRPSR()">,
- AssemblerPredicate<"FeaturePWRPSR">;
+ AssemblerPredicate<(all_of FeaturePWRPSR)>;
// HasUMAC_SMAC - This is true when the target processor supports the
// UMAC and SMAC instructions
@@ -529,7 +529,7 @@ let DecoderMethod = "DecodeLoadCP", Defs = [CPSR] in {
"ld [$addr], %csr", []>;
}
}
-
+
let DecoderMethod = "DecodeLoadFP" in
let Defs = [FSR] in {
let rd = 0 in {
@@ -571,12 +571,12 @@ let DecoderMethod = "DecodeStoreQFP" in
defm STQF : StoreA<"stq", 0b100110, 0b110110, store, QFPRegs, f128>,
Requires<[HasV9, HasHardQuad]>;
-let DecoderMethod = "DecodeStoreCP" in
- defm STC : Store<"st", 0b110100, store, CoprocRegs, i32>;
-
-let DecoderMethod = "DecodeStoreCPPair" in
+let DecoderMethod = "DecodeStoreCP" in
+ defm STC : Store<"st", 0b110100, store, CoprocRegs, i32>;
+
+let DecoderMethod = "DecodeStoreCPPair" in
defm STDC : Store<"std", 0b110111, store, CoprocPair, v2i32, IIC_std>;
-
+
let DecoderMethod = "DecodeStoreCP", rd = 0 in {
let Defs = [CPSR] in {
def STCSRrr : F3_1<3, 0b110101, (outs MEMrr:$addr), (ins),
@@ -897,7 +897,7 @@ def CBCOND : CPBranchSP<(ins brtarget:$imm22, CCOp:$cond),
[(SPbrfcc bb:$imm22, imm:$cond)]>;
def CBCONDA : CPBranchSPA<(ins brtarget:$imm22, CCOp:$cond),
"cb$cond,a $imm22", []>;
-
+
// Section B.24 - Call and Link Instruction, p. 125
// This is the only Format 1 instruction
let Uses = [O6],
@@ -1080,7 +1080,7 @@ let hasSideEffects = 1, rd = 0, rs1 = 0b01111, rs2 = 0 in
def STBAR : F3_1<2, 0b101000, (outs), (ins), "stbar", []>;
-// Section B.31 - Unimplmented Instruction
+// Section B.31 - Unimplemented Instruction
let rd = 0 in
def UNIMP : F2_1<0b000, (outs), (ins i32imm:$imm22),
"unimp $imm22", []>;
@@ -1186,7 +1186,7 @@ def FABSS : F3_3u<2, 0b110100, 0b000001001,
// Floating-point Square Root Instructions, p.145
// FSQRTS generates an erratum on LEON processors, so by disabling this instruction
// this will be promoted to use FSQRTD with doubles instead.
-let Predicates = [HasNoFdivSqrtFix] in
+let Predicates = [HasNoFdivSqrtFix] in
def FSQRTS : F3_3u<2, 0b110100, 0b000101001,
(outs FPRegs:$rd), (ins FPRegs:$rs2),
"fsqrts $rs2, $rd",
@@ -1515,8 +1515,8 @@ let Predicates = [HasV9], hasSideEffects = 1, rd = 0, rs1 = 0b01111 in
def MEMBARi : F3_2<2, 0b101000, (outs), (ins MembarTag:$simm13),
"membar $simm13", []>;
-// The CAS instruction, unlike other instructions, only comes in a
-// form which requires an ASI be provided. The ASI value hardcoded
+// The CAS instruction, unlike other instructions, only comes in a
+// form which requires an ASI be provided. The ASI value hardcoded
// here is ASI_PRIMARY, the default unprivileged ASI for SparcV9.
let Predicates = [HasV9], Constraints = "$swap = $rd", asi = 0b10000000 in
def CASrr: F3_1_asi<3, 0b111100,
@@ -1536,18 +1536,18 @@ let Predicates = [HasLeonCASA], Constraints = "$swap = $rd", asi = 0b00001010 in
"casa [$rs1] 10, $rs2, $rd",
[(set i32:$rd,
(atomic_cmp_swap_32 iPTR:$rs1, i32:$rs2, i32:$swap))]>;
-
+
// CASA supported on some LEON3 and all LEON4 processors. Same pattern as
// CASrr, above, but with a different ASI. This version is supported for
-// inline assembly lowering only.
+// inline assembly lowering only.
let Predicates = [HasLeonCASA], Constraints = "$swap = $rd" in
def CASArr: F3_1_asi<3, 0b111100,
(outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2,
IntRegs:$swap, i8imm:$asi),
"casa [$rs1] $asi, $rs2, $rd", []>;
-
+
// TODO: Add DAG sequence to lower these instructions. Currently, only provided
-// as inline assembler-supported instructions.
+// as inline assembler-supported instructions.
let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
def SMACrr : F3_1<2, 0b111111,
(outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
@@ -1558,12 +1558,12 @@ let Predicates = [HasUMAC_SMAC], Defs = [Y, ASR18], Uses = [Y, ASR18] in {
(outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
"smac $rs1, $simm13, $rd",
[], IIC_smac_umac>;
-
+
def UMACrr : F3_1<2, 0b111110,
(outs IntRegs:$rd), (ins IntRegs:$rs1, IntRegs:$rs2, ASRRegs:$asr18),
"umac $rs1, $rs2, $rd",
[], IIC_smac_umac>;
-
+
def UMACri : F3_2<2, 0b111110,
(outs IntRegs:$rd), (ins IntRegs:$rs1, simm13Op:$simm13, ASRRegs:$asr18),
"umac $rs1, $simm13, $rd",
diff --git a/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h b/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
index fe5705878693..d557c8ea22e2 100644
--- a/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
+++ b/llvm/lib/Target/Sparc/SparcMachineFunctionInfo.h
@@ -19,14 +19,14 @@ namespace llvm {
class SparcMachineFunctionInfo : public MachineFunctionInfo {
virtual void anchor();
private:
- unsigned GlobalBaseReg;
+ Register GlobalBaseReg;
/// VarArgsFrameOffset - Frame offset to start of varargs area.
int VarArgsFrameOffset;
/// SRetReturnReg - Holds the virtual register into which the sret
/// argument is passed.
- unsigned SRetReturnReg;
+ Register SRetReturnReg;
/// IsLeafProc - True if the function is a leaf procedure.
bool IsLeafProc;
@@ -38,14 +38,14 @@ namespace llvm {
: GlobalBaseReg(0), VarArgsFrameOffset(0), SRetReturnReg(0),
IsLeafProc(false) {}
- unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
- void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+ Register getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
- unsigned getSRetReturnReg() const { return SRetReturnReg; }
- void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+ Register getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; }
void setLeafProc(bool rhs) { IsLeafProc = rhs; }
bool isLeafProc() const { return IsLeafProc; }
diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
index 19a90e98db7e..990dbe23e7ac 100644
--- a/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
+++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.cpp
@@ -173,7 +173,7 @@ SparcRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
const SparcSubtarget &Subtarget = MF.getSubtarget<SparcSubtarget>();
const SparcFrameLowering *TFI = getFrameLowering(MF);
- unsigned FrameReg;
+ Register FrameReg;
int Offset;
Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
diff --git a/llvm/lib/Target/Sparc/SparcRegisterInfo.td b/llvm/lib/Target/Sparc/SparcRegisterInfo.td
index 98959d512955..8225bc21e8fe 100644
--- a/llvm/lib/Target/Sparc/SparcRegisterInfo.td
+++ b/llvm/lib/Target/Sparc/SparcRegisterInfo.td
@@ -359,14 +359,14 @@ let isAllocatable = 0 in {
// Ancillary state registers
def ASRRegs : RegisterClass<"SP", [i32], 32,
(add Y, (sequence "ASR%u", 1, 31))>;
-
+
// This register class should not be used to hold i64 values.
def CoprocRegs : RegisterClass<"SP", [i32], 32,
(add (sequence "C%u", 0, 31))>;
// Should be in the same order as CoprocRegs.
def CoprocPair : RegisterClass<"SP", [v2i32], 64,
- (add C0_C1, C2_C3, C4_C5, C6_C7,
+ (add C0_C1, C2_C3, C4_C5, C6_C7,
C8_C9, C10_C11, C12_C13, C14_C15,
C16_C17, C18_C19, C20_C21, C22_C23,
C24_C25, C26_C27, C28_C29, C30_C31)>;
diff --git a/llvm/lib/Target/Sparc/SparcSchedule.td b/llvm/lib/Target/Sparc/SparcSchedule.td
index 31e43c9bd95d..0f05372b7050 100755
--- a/llvm/lib/Target/Sparc/SparcSchedule.td
+++ b/llvm/lib/Target/Sparc/SparcSchedule.td
@@ -1,4 +1,4 @@
-//===-- SparcSchedule.td - Describe the Sparc Itineries ----*- tablegen -*-===//
+//===-- SparcSchedule.td - Describe the Sparc Itineraries ----*- tablegen -*-=//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
diff --git a/llvm/lib/Target/Sparc/SparcSubtarget.cpp b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
index 075a002a358d..dbc6cf8e5b86 100644
--- a/llvm/lib/Target/Sparc/SparcSubtarget.cpp
+++ b/llvm/lib/Target/Sparc/SparcSubtarget.cpp
@@ -50,7 +50,7 @@ SparcSubtarget &SparcSubtarget::initializeSubtargetDependencies(StringRef CPU,
HasLeonCycleCounter = false;
// Determine default and user specified characteristics
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = (Is64Bit) ? "v9" : "v8";
diff --git a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
index 76f387842f73..d48d94e2faf1 100644
--- a/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetMachine.cpp
@@ -99,7 +99,8 @@ SparcTargetMachine::SparcTargetMachine(
CM, getEffectiveRelocModel(RM), is64bit, JIT),
OL),
TLOF(std::make_unique<SparcELFTargetObjectFile>()),
- Subtarget(TT, CPU, FS, *this, is64bit), is64Bit(is64bit) {
+ Subtarget(TT, std::string(CPU), std::string(FS), *this, is64bit),
+ is64Bit(is64bit) {
initAsmInfo();
}
diff --git a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
index e6ad4d2d67aa..c03510fa090d 100644
--- a/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
+++ b/llvm/lib/Target/Sparc/SparcTargetObjectFile.cpp
@@ -11,13 +11,13 @@
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/CodeGen/MachineModuleInfoImpls.h"
#include "llvm/CodeGen/TargetLowering.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
void SparcELFTargetObjectFile::Initialize(MCContext &Ctx,
const TargetMachine &TM) {
TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
}
const MCExpr *SparcELFTargetObjectFile::getTTypeGlobalReference(
diff --git a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
index 607266d552a6..d5a3a19446c7 100644
--- a/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
+++ b/llvm/lib/Target/SystemZ/AsmParser/SystemZAsmParser.cpp
@@ -53,8 +53,6 @@ enum RegisterKind {
GRH32Reg,
GR64Reg,
GR128Reg,
- ADDR32Reg,
- ADDR64Reg,
FP32Reg,
FP64Reg,
FP128Reg,
@@ -109,7 +107,7 @@ private:
// Base + Disp + Index, where Base and Index are LLVM registers or 0.
// MemKind says what type of memory this is and RegKind says what type
- // the base register has (ADDR32Reg or ADDR64Reg). Length is the operand
+ // the base register has (GR32Reg or GR64Reg). Length is the operand
// length for D(L,B)-style operands, otherwise it is null.
struct MemOp {
unsigned Base : 12;
@@ -348,8 +346,8 @@ public:
bool isGRX32() const { return false; }
bool isGR64() const { return isReg(GR64Reg); }
bool isGR128() const { return isReg(GR128Reg); }
- bool isADDR32() const { return isReg(ADDR32Reg); }
- bool isADDR64() const { return isReg(ADDR64Reg); }
+ bool isADDR32() const { return isReg(GR32Reg); }
+ bool isADDR64() const { return isReg(GR64Reg); }
bool isADDR128() const { return false; }
bool isFP32() const { return isReg(FP32Reg); }
bool isFP64() const { return isReg(FP64Reg); }
@@ -361,16 +359,16 @@ public:
bool isAR32() const { return isReg(AR32Reg); }
bool isCR64() const { return isReg(CR64Reg); }
bool isAnyReg() const { return (isReg() || isImm(0, 15)); }
- bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, ADDR32Reg); }
- bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, ADDR32Reg); }
- bool isBDAddr64Disp12() const { return isMemDisp12(BDMem, ADDR64Reg); }
- bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, ADDR64Reg); }
- bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, ADDR64Reg); }
- bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, ADDR64Reg); }
- bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(ADDR64Reg); }
- bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(ADDR64Reg); }
- bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, ADDR64Reg); }
- bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, ADDR64Reg); }
+ bool isBDAddr32Disp12() const { return isMemDisp12(BDMem, GR32Reg); }
+ bool isBDAddr32Disp20() const { return isMemDisp20(BDMem, GR32Reg); }
+ bool isBDAddr64Disp12() const { return isMemDisp12(BDMem, GR64Reg); }
+ bool isBDAddr64Disp20() const { return isMemDisp20(BDMem, GR64Reg); }
+ bool isBDXAddr64Disp12() const { return isMemDisp12(BDXMem, GR64Reg); }
+ bool isBDXAddr64Disp20() const { return isMemDisp20(BDXMem, GR64Reg); }
+ bool isBDLAddr64Disp12Len4() const { return isMemDisp12Len4(GR64Reg); }
+ bool isBDLAddr64Disp12Len8() const { return isMemDisp12Len8(GR64Reg); }
+ bool isBDRAddr64Disp12() const { return isMemDisp12(BDRMem, GR64Reg); }
+ bool isBDVAddr64Disp12() const { return isMemDisp12(BDVMem, GR64Reg); }
bool isU1Imm() const { return isImm(0, 1); }
bool isU2Imm() const { return isImm(0, 3); }
bool isU3Imm() const { return isImm(0, 7); }
@@ -405,26 +403,24 @@ private:
SMLoc StartLoc, EndLoc;
};
- bool parseRegister(Register &Reg);
+ bool parseRegister(Register &Reg, bool RestoreOnFailure = false);
- bool parseRegister(Register &Reg, RegisterGroup Group, const unsigned *Regs,
- bool IsAddress = false);
+ bool parseIntegerRegister(Register &Reg, RegisterGroup Group);
OperandMatchResultTy parseRegister(OperandVector &Operands,
- RegisterGroup Group, const unsigned *Regs,
RegisterKind Kind);
OperandMatchResultTy parseAnyRegister(OperandVector &Operands);
- bool parseAddress(bool &HaveReg1, Register &Reg1,
- bool &HaveReg2, Register &Reg2,
- const MCExpr *&Disp, const MCExpr *&Length);
+ bool parseAddress(bool &HaveReg1, Register &Reg1, bool &HaveReg2,
+ Register &Reg2, const MCExpr *&Disp, const MCExpr *&Length,
+ bool HasLength = false, bool HasVectorIndex = false);
bool parseAddressRegister(Register &Reg);
bool ParseDirectiveInsn(SMLoc L);
OperandMatchResultTy parseAddress(OperandVector &Operands,
- MemoryKind MemKind, const unsigned *Regs,
+ MemoryKind MemKind,
RegisterKind RegKind);
OperandMatchResultTy parsePCRel(OperandVector &Operands, int64_t MinVal,
@@ -449,6 +445,10 @@ public:
// Override MCTargetAsmParser.
bool ParseDirective(AsmToken DirectiveID) override;
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
+ bool RestoreOnFailure);
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
SMLoc NameLoc, OperandVector &Operands) override;
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -458,76 +458,78 @@ public:
// Used by the TableGen code to parse particular operand types.
OperandMatchResultTy parseGR32(OperandVector &Operands) {
- return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, GR32Reg);
+ return parseRegister(Operands, GR32Reg);
}
OperandMatchResultTy parseGRH32(OperandVector &Operands) {
- return parseRegister(Operands, RegGR, SystemZMC::GRH32Regs, GRH32Reg);
+ return parseRegister(Operands, GRH32Reg);
}
OperandMatchResultTy parseGRX32(OperandVector &Operands) {
llvm_unreachable("GRX32 should only be used for pseudo instructions");
}
OperandMatchResultTy parseGR64(OperandVector &Operands) {
- return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, GR64Reg);
+ return parseRegister(Operands, GR64Reg);
}
OperandMatchResultTy parseGR128(OperandVector &Operands) {
- return parseRegister(Operands, RegGR, SystemZMC::GR128Regs, GR128Reg);
+ return parseRegister(Operands, GR128Reg);
}
OperandMatchResultTy parseADDR32(OperandVector &Operands) {
- return parseRegister(Operands, RegGR, SystemZMC::GR32Regs, ADDR32Reg);
+ // For the AsmParser, we will accept %r0 for ADDR32 as well.
+ return parseRegister(Operands, GR32Reg);
}
OperandMatchResultTy parseADDR64(OperandVector &Operands) {
- return parseRegister(Operands, RegGR, SystemZMC::GR64Regs, ADDR64Reg);
+ // For the AsmParser, we will accept %r0 for ADDR64 as well.
+ return parseRegister(Operands, GR64Reg);
}
OperandMatchResultTy parseADDR128(OperandVector &Operands) {
llvm_unreachable("Shouldn't be used as an operand");
}
OperandMatchResultTy parseFP32(OperandVector &Operands) {
- return parseRegister(Operands, RegFP, SystemZMC::FP32Regs, FP32Reg);
+ return parseRegister(Operands, FP32Reg);
}
OperandMatchResultTy parseFP64(OperandVector &Operands) {
- return parseRegister(Operands, RegFP, SystemZMC::FP64Regs, FP64Reg);
+ return parseRegister(Operands, FP64Reg);
}
OperandMatchResultTy parseFP128(OperandVector &Operands) {
- return parseRegister(Operands, RegFP, SystemZMC::FP128Regs, FP128Reg);
+ return parseRegister(Operands, FP128Reg);
}
OperandMatchResultTy parseVR32(OperandVector &Operands) {
- return parseRegister(Operands, RegV, SystemZMC::VR32Regs, VR32Reg);
+ return parseRegister(Operands, VR32Reg);
}
OperandMatchResultTy parseVR64(OperandVector &Operands) {
- return parseRegister(Operands, RegV, SystemZMC::VR64Regs, VR64Reg);
+ return parseRegister(Operands, VR64Reg);
}
OperandMatchResultTy parseVF128(OperandVector &Operands) {
llvm_unreachable("Shouldn't be used as an operand");
}
OperandMatchResultTy parseVR128(OperandVector &Operands) {
- return parseRegister(Operands, RegV, SystemZMC::VR128Regs, VR128Reg);
+ return parseRegister(Operands, VR128Reg);
}
OperandMatchResultTy parseAR32(OperandVector &Operands) {
- return parseRegister(Operands, RegAR, SystemZMC::AR32Regs, AR32Reg);
+ return parseRegister(Operands, AR32Reg);
}
OperandMatchResultTy parseCR64(OperandVector &Operands) {
- return parseRegister(Operands, RegCR, SystemZMC::CR64Regs, CR64Reg);
+ return parseRegister(Operands, CR64Reg);
}
OperandMatchResultTy parseAnyReg(OperandVector &Operands) {
return parseAnyRegister(Operands);
}
OperandMatchResultTy parseBDAddr32(OperandVector &Operands) {
- return parseAddress(Operands, BDMem, SystemZMC::GR32Regs, ADDR32Reg);
+ return parseAddress(Operands, BDMem, GR32Reg);
}
OperandMatchResultTy parseBDAddr64(OperandVector &Operands) {
- return parseAddress(Operands, BDMem, SystemZMC::GR64Regs, ADDR64Reg);
+ return parseAddress(Operands, BDMem, GR64Reg);
}
OperandMatchResultTy parseBDXAddr64(OperandVector &Operands) {
- return parseAddress(Operands, BDXMem, SystemZMC::GR64Regs, ADDR64Reg);
+ return parseAddress(Operands, BDXMem, GR64Reg);
}
OperandMatchResultTy parseBDLAddr64(OperandVector &Operands) {
- return parseAddress(Operands, BDLMem, SystemZMC::GR64Regs, ADDR64Reg);
+ return parseAddress(Operands, BDLMem, GR64Reg);
}
OperandMatchResultTy parseBDRAddr64(OperandVector &Operands) {
- return parseAddress(Operands, BDRMem, SystemZMC::GR64Regs, ADDR64Reg);
+ return parseAddress(Operands, BDRMem, GR64Reg);
}
OperandMatchResultTy parseBDVAddr64(OperandVector &Operands) {
- return parseAddress(Operands, BDVMem, SystemZMC::GR64Regs, ADDR64Reg);
+ return parseAddress(Operands, BDVMem, GR64Reg);
}
OperandMatchResultTy parsePCRel12(OperandVector &Operands) {
return parsePCRel(Operands, -(1LL << 12), (1LL << 12) - 1, false);
@@ -691,27 +693,37 @@ void SystemZOperand::print(raw_ostream &OS) const {
}
// Parse one register of the form %<prefix><number>.
-bool SystemZAsmParser::parseRegister(Register &Reg) {
+bool SystemZAsmParser::parseRegister(Register &Reg, bool RestoreOnFailure) {
Reg.StartLoc = Parser.getTok().getLoc();
// Eat the % prefix.
if (Parser.getTok().isNot(AsmToken::Percent))
return Error(Parser.getTok().getLoc(), "register expected");
+ const AsmToken &PercentTok = Parser.getTok();
Parser.Lex();
// Expect a register name.
- if (Parser.getTok().isNot(AsmToken::Identifier))
+ if (Parser.getTok().isNot(AsmToken::Identifier)) {
+ if (RestoreOnFailure)
+ getLexer().UnLex(PercentTok);
return Error(Reg.StartLoc, "invalid register");
+ }
// Check that there's a prefix.
StringRef Name = Parser.getTok().getString();
- if (Name.size() < 2)
+ if (Name.size() < 2) {
+ if (RestoreOnFailure)
+ getLexer().UnLex(PercentTok);
return Error(Reg.StartLoc, "invalid register");
+ }
char Prefix = Name[0];
// Treat the rest of the register name as a register number.
- if (Name.substr(1).getAsInteger(10, Reg.Num))
+ if (Name.substr(1).getAsInteger(10, Reg.Num)) {
+ if (RestoreOnFailure)
+ getLexer().UnLex(PercentTok);
return Error(Reg.StartLoc, "invalid register");
+ }
// Look for valid combinations of prefix and number.
if (Prefix == 'r' && Reg.Num < 16)
@@ -724,49 +736,102 @@ bool SystemZAsmParser::parseRegister(Register &Reg) {
Reg.Group = RegAR;
else if (Prefix == 'c' && Reg.Num < 16)
Reg.Group = RegCR;
- else
+ else {
+ if (RestoreOnFailure)
+ getLexer().UnLex(PercentTok);
return Error(Reg.StartLoc, "invalid register");
+ }
Reg.EndLoc = Parser.getTok().getLoc();
Parser.Lex();
return false;
}
-// Parse a register of group Group. If Regs is nonnull, use it to map
-// the raw register number to LLVM numbering, with zero entries
-// indicating an invalid register. IsAddress says whether the
-// register appears in an address context. Allow FP Group if expecting
-// RegV Group, since the f-prefix yields the FP group even while used
-// with vector instructions.
-bool SystemZAsmParser::parseRegister(Register &Reg, RegisterGroup Group,
- const unsigned *Regs, bool IsAddress) {
- if (parseRegister(Reg))
- return true;
- if (Reg.Group != Group && !(Reg.Group == RegFP && Group == RegV))
- return Error(Reg.StartLoc, "invalid operand for instruction");
- if (Regs && Regs[Reg.Num] == 0)
- return Error(Reg.StartLoc, "invalid register pair");
- if (Reg.Num == 0 && IsAddress)
- return Error(Reg.StartLoc, "%r0 used in an address");
- if (Regs)
- Reg.Num = Regs[Reg.Num];
- return false;
-}
-
-// Parse a register and add it to Operands. The other arguments are as above.
+// Parse a register of kind Kind and add it to Operands.
OperandMatchResultTy
-SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterGroup Group,
- const unsigned *Regs, RegisterKind Kind) {
- if (Parser.getTok().isNot(AsmToken::Percent))
+SystemZAsmParser::parseRegister(OperandVector &Operands, RegisterKind Kind) {
+ Register Reg;
+ RegisterGroup Group;
+ switch (Kind) {
+ case GR32Reg:
+ case GRH32Reg:
+ case GR64Reg:
+ case GR128Reg:
+ Group = RegGR;
+ break;
+ case FP32Reg:
+ case FP64Reg:
+ case FP128Reg:
+ Group = RegFP;
+ break;
+ case VR32Reg:
+ case VR64Reg:
+ case VR128Reg:
+ Group = RegV;
+ break;
+ case AR32Reg:
+ Group = RegAR;
+ break;
+ case CR64Reg:
+ Group = RegCR;
+ break;
+ }
+
+ // Handle register names of the form %<prefix><number>
+ if (Parser.getTok().is(AsmToken::Percent)) {
+ if (parseRegister(Reg))
+ return MatchOperand_ParseFail;
+
+ // Check the parsed register group "Reg.Group" with the expected "Group"
+ // Have to error out if user specified wrong prefix.
+ switch (Group) {
+ case RegGR:
+ case RegFP:
+ case RegAR:
+ case RegCR:
+ if (Group != Reg.Group) {
+ Error(Reg.StartLoc, "invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ case RegV:
+ if (Reg.Group != RegV && Reg.Group != RegFP) {
+ Error(Reg.StartLoc, "invalid operand for instruction");
+ return MatchOperand_ParseFail;
+ }
+ break;
+ }
+ } else if (Parser.getTok().is(AsmToken::Integer)) {
+ if (parseIntegerRegister(Reg, Group))
+ return MatchOperand_ParseFail;
+ }
+ // Otherwise we didn't match a register operand.
+ else
return MatchOperand_NoMatch;
- Register Reg;
- bool IsAddress = (Kind == ADDR32Reg || Kind == ADDR64Reg);
- if (parseRegister(Reg, Group, Regs, IsAddress))
+ // Determine the LLVM register number according to Kind.
+ const unsigned *Regs;
+ switch (Kind) {
+ case GR32Reg: Regs = SystemZMC::GR32Regs; break;
+ case GRH32Reg: Regs = SystemZMC::GRH32Regs; break;
+ case GR64Reg: Regs = SystemZMC::GR64Regs; break;
+ case GR128Reg: Regs = SystemZMC::GR128Regs; break;
+ case FP32Reg: Regs = SystemZMC::FP32Regs; break;
+ case FP64Reg: Regs = SystemZMC::FP64Regs; break;
+ case FP128Reg: Regs = SystemZMC::FP128Regs; break;
+ case VR32Reg: Regs = SystemZMC::VR32Regs; break;
+ case VR64Reg: Regs = SystemZMC::VR64Regs; break;
+ case VR128Reg: Regs = SystemZMC::VR128Regs; break;
+ case AR32Reg: Regs = SystemZMC::AR32Regs; break;
+ case CR64Reg: Regs = SystemZMC::CR64Regs; break;
+ }
+ if (Regs[Reg.Num] == 0) {
+ Error(Reg.StartLoc, "invalid register pair");
return MatchOperand_ParseFail;
+ }
- Operands.push_back(SystemZOperand::createReg(Kind, Reg.Num,
- Reg.StartLoc, Reg.EndLoc));
+ Operands.push_back(
+ SystemZOperand::createReg(Kind, Regs[Reg.Num], Reg.StartLoc, Reg.EndLoc));
return MatchOperand_Success;
}
@@ -831,11 +896,39 @@ SystemZAsmParser::parseAnyRegister(OperandVector &Operands) {
return MatchOperand_Success;
}
+bool SystemZAsmParser::parseIntegerRegister(Register &Reg,
+ RegisterGroup Group) {
+ Reg.StartLoc = Parser.getTok().getLoc();
+ // We have an integer token
+ const MCExpr *Register;
+ if (Parser.parseExpression(Register))
+ return true;
+
+ const auto *CE = dyn_cast<MCConstantExpr>(Register);
+ if (!CE)
+ return true;
+
+ int64_t MaxRegNum = (Group == RegV) ? 31 : 15;
+ int64_t Value = CE->getValue();
+ if (Value < 0 || Value > MaxRegNum) {
+ Error(Parser.getTok().getLoc(), "invalid register");
+ return true;
+ }
+
+ // Assign the Register Number
+ Reg.Num = (unsigned)Value;
+ Reg.Group = Group;
+ Reg.EndLoc = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+
+ // At this point, successfully parsed an integer register.
+ return false;
+}
+
// Parse a memory operand into Reg1, Reg2, Disp, and Length.
bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1,
bool &HaveReg2, Register &Reg2,
- const MCExpr *&Disp,
- const MCExpr *&Length) {
+ const MCExpr *&Disp, const MCExpr *&Length,
+ bool HasLength, bool HasVectorIndex) {
// Parse the displacement, which must always be present.
if (getParser().parseExpression(Disp))
return true;
@@ -844,6 +937,27 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1,
HaveReg1 = false;
HaveReg2 = false;
Length = nullptr;
+
+ // If we have a scenario as below:
+ // vgef %v0, 0(0), 0
+ // This is an example of a "BDVMem" instruction type.
+ //
+ // So when we parse this as an integer register, the register group
+ // needs to be tied to "RegV". Usually when the prefix is passed in
+ // as %<prefix><reg-number> it's easy to check which group it should belong to.
+ // However, if we're passing in just the integer, there's no real way to
+ // "check" what register group it should belong to.
+ //
+ // When the user passes in the register as an integer, the user assumes that
+ // the compiler is responsible for substituting it as the right kind of
+ // register. Whereas, when the user specifies a "prefix", the onus is on
+ // the user to make sure they pass in the right kind of register.
+ //
+ // The restriction only applies to the first Register (i.e. Reg1). Reg2 is
+ // always a general register. Reg1 should be of group RegV if "HasVectorIndex"
+ // (i.e. insn is of type BDVMem) is true.
+ RegisterGroup RegGroup = HasVectorIndex ? RegV : RegGR;
+
if (getLexer().is(AsmToken::LParen)) {
Parser.Lex();
@@ -852,18 +966,47 @@ bool SystemZAsmParser::parseAddress(bool &HaveReg1, Register &Reg1,
HaveReg1 = true;
if (parseRegister(Reg1))
return true;
+ }
+ // So if we have an integer as the first token in ([tok1], ..), it could:
+ // 1. Refer to a "Register" (i.e. the X, R, V fields in BD[X|R|V]Mem type of
+ // instructions)
+ // 2. Refer to a "Length" field (i.e. the L field in BDLMem type of instructions)
+ else if (getLexer().is(AsmToken::Integer)) {
+ if (HasLength) {
+ // The instruction has a "Length" field, so it is safe to parse the first
+ // token as the "Length" field.
+ if (getParser().parseExpression(Length))
+ return true;
+ } else {
+ // Otherwise, if the instruction has no "Length" field, parse the
+ // token as a "Register". We don't have to worry about whether the
+ // instruction is invalid here, because the caller will take care of
+ // error reporting.
+ HaveReg1 = true;
+ if (parseIntegerRegister(Reg1, RegGroup))
+ return true;
+ }
} else {
- // Parse the length.
- if (getParser().parseExpression(Length))
- return true;
+ // If it's not an integer or a percent token, and the instruction is
+ // reported to have a "Length" field, parse it as the "Length".
+ if (HasLength) {
+ if (getParser().parseExpression(Length))
+ return true;
+ }
}
// Check whether there's a second register.
if (getLexer().is(AsmToken::Comma)) {
Parser.Lex();
HaveReg2 = true;
- if (parseRegister(Reg2))
- return true;
+
+ if (getLexer().is(AsmToken::Integer)) {
+ if (parseIntegerRegister(Reg2, RegGR))
+ return true;
+ } else {
+ if (parseRegister(Reg2))
+ return true;
+ }
}
// Consume the closing bracket.
@@ -883,9 +1026,6 @@ SystemZAsmParser::parseAddressRegister(Register &Reg) {
} else if (Reg.Group != RegGR) {
Error(Reg.StartLoc, "invalid address register");
return true;
- } else if (Reg.Num == 0) {
- Error(Reg.StartLoc, "%r0 used in an address");
- return true;
}
return false;
}
@@ -894,16 +1034,27 @@ SystemZAsmParser::parseAddressRegister(Register &Reg) {
// are as above.
OperandMatchResultTy
SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
- const unsigned *Regs, RegisterKind RegKind) {
+ RegisterKind RegKind) {
SMLoc StartLoc = Parser.getTok().getLoc();
unsigned Base = 0, Index = 0, LengthReg = 0;
Register Reg1, Reg2;
bool HaveReg1, HaveReg2;
const MCExpr *Disp;
const MCExpr *Length;
- if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Disp, Length))
+
+ bool HasLength = (MemKind == BDLMem) ? true : false;
+ bool HasVectorIndex = (MemKind == BDVMem) ? true : false;
+ if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Disp, Length, HasLength,
+ HasVectorIndex))
return MatchOperand_ParseFail;
+ const unsigned *Regs;
+ switch (RegKind) {
+ case GR32Reg: Regs = SystemZMC::GR32Regs; break;
+ case GR64Reg: Regs = SystemZMC::GR64Regs; break;
+ default: llvm_unreachable("invalid RegKind");
+ }
+
switch (MemKind) {
case BDMem:
// If we have Reg1, it must be an address register.
@@ -912,11 +1063,7 @@ SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
return MatchOperand_ParseFail;
Base = Regs[Reg1.Num];
}
- // There must be no Reg2 or length.
- if (Length) {
- Error(StartLoc, "invalid use of length addressing");
- return MatchOperand_ParseFail;
- }
+ // There must be no Reg2.
if (HaveReg2) {
Error(StartLoc, "invalid use of indexed addressing");
return MatchOperand_ParseFail;
@@ -940,11 +1087,6 @@ SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
return MatchOperand_ParseFail;
Base = Regs[Reg2.Num];
}
- // There must be no length.
- if (Length) {
- Error(StartLoc, "invalid use of length addressing");
- return MatchOperand_ParseFail;
- }
break;
case BDLMem:
// If we have Reg2, it must be an address register.
@@ -977,11 +1119,6 @@ SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
return MatchOperand_ParseFail;
Base = Regs[Reg2.Num];
}
- // There must be no length.
- if (Length) {
- Error(StartLoc, "invalid use of length addressing");
- return MatchOperand_ParseFail;
- }
break;
case BDVMem:
// We must have Reg1, and it must be a vector register.
@@ -996,16 +1133,11 @@ SystemZAsmParser::parseAddress(OperandVector &Operands, MemoryKind MemKind,
return MatchOperand_ParseFail;
Base = Regs[Reg2.Num];
}
- // There must be no length.
- if (Length) {
- Error(StartLoc, "invalid use of length addressing");
- return MatchOperand_ParseFail;
- }
break;
}
SMLoc EndLoc =
- SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
Operands.push_back(SystemZOperand::createMem(MemKind, RegKind, Base, Disp,
Index, Length, LengthReg,
StartLoc, EndLoc));
@@ -1118,15 +1250,15 @@ bool SystemZAsmParser::ParseDirectiveInsn(SMLoc L) {
}
// Emit as a regular instruction.
- Parser.getStreamer().EmitInstruction(Inst, getSTI());
+ Parser.getStreamer().emitInstruction(Inst, getSTI());
return false;
}
bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
- SMLoc &EndLoc) {
+ SMLoc &EndLoc, bool RestoreOnFailure) {
Register Reg;
- if (parseRegister(Reg))
+ if (parseRegister(Reg, RestoreOnFailure))
return true;
if (Reg.Group == RegGR)
RegNo = SystemZMC::GR64Regs[Reg.Num];
@@ -1143,6 +1275,25 @@ bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
return false;
}
+bool SystemZAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+}
+
+OperandMatchResultTy SystemZAsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result =
+ ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ bool PendingErrors = getParser().hasPendingError();
+ getParser().clearPendingErrors();
+ if (PendingErrors)
+ return MatchOperand_ParseFail;
+ if (Result)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
bool SystemZAsmParser::ParseInstruction(ParseInstructionInfo &Info,
StringRef Name, SMLoc NameLoc,
OperandVector &Operands) {
@@ -1215,7 +1366,8 @@ bool SystemZAsmParser::parseOperand(OperandVector &Operands,
bool HaveReg1, HaveReg2;
const MCExpr *Expr;
const MCExpr *Length;
- if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Expr, Length))
+ if (parseAddress(HaveReg1, Reg1, HaveReg2, Reg2, Expr, Length,
+ /*HasLength*/ true, /*HasVectorIndex*/ true))
return true;
// If the register combination is not valid for any instruction, reject it.
// Otherwise, fall back to reporting an unrecognized instruction.
@@ -1252,7 +1404,7 @@ bool SystemZAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
switch (MatchResult) {
case Match_Success:
Inst.setLoc(IDLoc);
- Out.EmitInstruction(Inst, getSTI());
+ Out.emitInstruction(Inst, getSTI());
return false;
case Match_MissingFeature: {
@@ -1322,7 +1474,7 @@ SystemZAsmParser::parsePCRel(OperandVector &Operands, int64_t MinVal,
}
int64_t Value = CE->getValue();
MCSymbol *Sym = Ctx.createTempSymbol();
- Out.EmitLabel(Sym);
+ Out.emitLabel(Sym);
const MCExpr *Base = MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_None,
Ctx);
Expr = Value == 0 ? Base : MCBinaryExpr::createAdd(Base, Expr, Ctx);
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
index 5893b227c08c..fac363cae713 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.cpp
@@ -155,7 +155,8 @@ void SystemZInstPrinter::printPCRelOperand(const MCInst *MI, int OpNum,
MO.getExpr()->print(O, &MAI);
}
-void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI, int OpNum,
+void SystemZInstPrinter::printPCRelTLSOperand(const MCInst *MI,
+ uint64_t Address, int OpNum,
raw_ostream &O) {
// Output the PC-relative operand.
printPCRelOperand(MI, OpNum, O);
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
index 5628e9252f03..cfe1bd89c3eb 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZInstPrinter.h
@@ -46,6 +46,10 @@ public:
private:
// Print various types of operand.
void printOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printOperand(const MCInst *MI, uint64_t /*Address*/, unsigned OpNum,
+ raw_ostream &O) {
+ printOperand(MI, OpNum, O);
+ }
void printBDAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printBDXAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printBDLAddrOperand(const MCInst *MI, int OpNum, raw_ostream &O);
@@ -65,7 +69,12 @@ private:
void printU32ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printU48ImmOperand(const MCInst *MI, int OpNum, raw_ostream &O);
void printPCRelOperand(const MCInst *MI, int OpNum, raw_ostream &O);
- void printPCRelTLSOperand(const MCInst *MI, int OpNum, raw_ostream &O);
+ void printPCRelOperand(const MCInst *MI, uint64_t /*Address*/, int OpNum,
+ raw_ostream &O) {
+ printPCRelOperand(MI, OpNum, O);
+ }
+ void printPCRelTLSOperand(const MCInst *MI, uint64_t Address, int OpNum,
+ raw_ostream &O);
// Print the mnemonic for a condition-code mask ("ne", "lh", etc.)
// This forms part of the instruction name rather than the operand list.
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
index 23d8585095cc..e62f5040898f 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmBackend.cpp
@@ -63,10 +63,6 @@ public:
const MCAsmLayout &Layout) const override {
return false;
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {
- llvm_unreachable("SystemZ does do not have assembler relaxation");
- }
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
std::unique_ptr<MCObjectTargetWriter>
createObjectTargetWriter() const override {
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
index d6cdacfcab92..e540ff4e4811 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCAsmInfo.cpp
@@ -23,6 +23,4 @@ SystemZMCAsmInfo::SystemZMCAsmInfo(const Triple &TT) {
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
ExceptionsType = ExceptionHandling::DwarfCFI;
-
- UseIntegratedAssembler = true;
}
diff --git a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
index eb2112674a12..f2ef1ad6c698 100644
--- a/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
+++ b/llvm/lib/Target/SystemZ/MCTargetDesc/SystemZMCTargetDesc.cpp
@@ -150,10 +150,9 @@ static MCAsmInfo *createSystemZMCAsmInfo(const MCRegisterInfo &MRI,
const Triple &TT,
const MCTargetOptions &Options) {
MCAsmInfo *MAI = new SystemZMCAsmInfo(TT);
- MCCFIInstruction Inst =
- MCCFIInstruction::createDefCfa(nullptr,
- MRI.getDwarfRegNum(SystemZ::R15D, true),
- SystemZMC::CFAOffsetFromInitialSP);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
+ nullptr, MRI.getDwarfRegNum(SystemZ::R15D, true),
+ SystemZMC::CFAOffsetFromInitialSP);
MAI->addInitialFrameState(Inst);
return MAI;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZ.h b/llvm/lib/Target/SystemZ/SystemZ.h
index 0808160f627c..bedbd061ea5c 100644
--- a/llvm/lib/Target/SystemZ/SystemZ.h
+++ b/llvm/lib/Target/SystemZ/SystemZ.h
@@ -193,6 +193,7 @@ FunctionPass *createSystemZElimComparePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZShortenInstPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLongBranchPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZLDCleanupPass(SystemZTargetMachine &TM);
+FunctionPass *createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM);
FunctionPass *createSystemZPostRewritePass(SystemZTargetMachine &TM);
FunctionPass *createSystemZTDCPass();
} // end namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
index 67c4aa08f90d..4109bfc11337 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.cpp
@@ -92,9 +92,9 @@ static void lowerAlignmentHint(const MachineInstr *MI, MCInst &LoweredMI,
return;
const MachineMemOperand *MMO = *MI->memoperands_begin();
unsigned AlignmentHint = 0;
- if (MMO->getAlignment() >= 16)
+ if (MMO->getAlign() >= Align(16))
AlignmentHint = 4;
- else if (MMO->getAlignment() >= 8)
+ else if (MMO->getAlign() >= Align(8))
AlignmentHint = 3;
if (AlignmentHint == 0)
return;
@@ -124,7 +124,7 @@ static MCInst lowerSubvectorStore(const MachineInstr *MI, unsigned Opcode) {
.addImm(0);
}
-void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void SystemZAsmPrinter::emitInstruction(const MachineInstr *MI) {
SystemZMCInstLower Lower(MF->getContext(), *this);
MCInst LoweredMI;
switch (MI->getOpcode()) {
@@ -479,7 +479,7 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// that instead.
case SystemZ::Trap: {
MCSymbol *DotSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(DotSym);
+ OutStreamer->emitLabel(DotSym);
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
@@ -492,7 +492,7 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
// to the relative immediate field of the jump instruction. (eg. "jo .+2")
case SystemZ::CondTrap: {
MCSymbol *DotSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(DotSym);
+ OutStreamer->emitLabel(DotSym);
const MCSymbolRefExpr *Expr = MCSymbolRefExpr::create(DotSym, OutContext);
const MCConstantExpr *ConstExpr = MCConstantExpr::create(2, OutContext);
@@ -522,7 +522,6 @@ void SystemZAsmPrinter::EmitInstruction(const MachineInstr *MI) {
EmitToStreamer(*OutStreamer, LoweredMI);
}
-
// Emit the largest nop instruction smaller than or equal to NumBytes
// bytes. Return the size of nop emitted.
static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer,
@@ -532,22 +531,22 @@ static unsigned EmitNop(MCContext &OutContext, MCStreamer &OutStreamer,
return 0;
}
else if (NumBytes < 4) {
- OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BCRAsm)
- .addImm(0).addReg(SystemZ::R0D), STI);
+ OutStreamer.emitInstruction(
+ MCInstBuilder(SystemZ::BCRAsm).addImm(0).addReg(SystemZ::R0D), STI);
return 2;
}
else if (NumBytes < 6) {
- OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BCAsm)
- .addImm(0).addReg(0).addImm(0).addReg(0),
- STI);
+ OutStreamer.emitInstruction(
+ MCInstBuilder(SystemZ::BCAsm).addImm(0).addReg(0).addImm(0).addReg(0),
+ STI);
return 4;
}
else {
MCSymbol *DotSym = OutContext.createTempSymbol();
const MCSymbolRefExpr *Dot = MCSymbolRefExpr::create(DotSym, OutContext);
- OutStreamer.EmitLabel(DotSym);
- OutStreamer.EmitInstruction(MCInstBuilder(SystemZ::BRCLAsm)
- .addImm(0).addExpr(Dot), STI);
+ OutStreamer.emitLabel(DotSym);
+ OutStreamer.emitInstruction(
+ MCInstBuilder(SystemZ::BRCLAsm).addImm(0).addExpr(Dot), STI);
return 6;
}
}
@@ -560,9 +559,9 @@ void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
OutStreamer->PushSection();
OutStreamer->SwitchSection(
Ctx.getELFSection("__mcount_loc", ELF::SHT_PROGBITS, ELF::SHF_ALLOC));
- OutStreamer->EmitSymbolValue(DotSym, 8);
+ OutStreamer->emitSymbolValue(DotSym, 8);
OutStreamer->PopSection();
- OutStreamer->EmitLabel(DotSym);
+ OutStreamer->emitLabel(DotSym);
}
if (MF->getFunction().hasFnAttribute("mnop-mcount")) {
@@ -573,8 +572,9 @@ void SystemZAsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
MCSymbol *fentry = Ctx.getOrCreateSymbol("__fentry__");
const MCSymbolRefExpr *Op =
MCSymbolRefExpr::create(fentry, MCSymbolRefExpr::VK_PLT, Ctx);
- OutStreamer->EmitInstruction(MCInstBuilder(SystemZ::BRASL)
- .addReg(SystemZ::R0D).addExpr(Op), getSubtargetInfo());
+ OutStreamer->emitInstruction(
+ MCInstBuilder(SystemZ::BRASL).addReg(SystemZ::R0D).addExpr(Op),
+ getSubtargetInfo());
}
void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
@@ -585,7 +585,7 @@ void SystemZAsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
assert(NumNOPBytes % 2 == 0 && "Invalid number of NOP bytes requested!");
@@ -618,7 +618,7 @@ void SystemZAsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
SystemZMCInstLower &Lower) {
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers Opers(&MI);
@@ -685,8 +685,8 @@ getModifierVariantKind(SystemZCP::SystemZCPModifier Modifier) {
llvm_unreachable("Invalid SystemCPModifier!");
}
-void SystemZAsmPrinter::
-EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
+void SystemZAsmPrinter::emitMachineConstantPoolValue(
+ MachineConstantPoolValue *MCPV) {
auto *ZCPV = static_cast<SystemZConstantPoolValue*>(MCPV);
const MCExpr *Expr =
@@ -695,7 +695,7 @@ EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) {
OutContext);
uint64_t Size = getDataLayout().getTypeAllocSize(ZCPV->getType());
- OutStreamer->EmitValue(Expr, Size);
+ OutStreamer->emitValue(Expr, Size);
}
bool SystemZAsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -719,7 +719,7 @@ bool SystemZAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-void SystemZAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void SystemZAsmPrinter::emitEndOfAsmFile(Module &M) {
emitStackMaps(SM);
}
diff --git a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
index d01a17c2ebe2..2d7562c7238d 100644
--- a/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
+++ b/llvm/lib/Target/SystemZ/SystemZAsmPrinter.h
@@ -32,9 +32,9 @@ public:
// Override AsmPrinter.
StringRef getPassName() const override { return "SystemZ Assembly Printer"; }
- void EmitInstruction(const MachineInstr *MI) override;
- void EmitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitInstruction(const MachineInstr *MI) override;
+ void emitMachineConstantPoolValue(MachineConstantPoolValue *MCPV) override;
+ void emitEndOfAsmFile(Module &M) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
diff --git a/llvm/lib/Target/SystemZ/SystemZCallingConv.h b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
index 4432adc6a269..d4c7ce07420b 100644
--- a/llvm/lib/Target/SystemZ/SystemZCallingConv.h
+++ b/llvm/lib/Target/SystemZ/SystemZCallingConv.h
@@ -108,7 +108,7 @@ inline bool CC_SystemZ_I128Indirect(unsigned &ValNo, MVT &ValVT,
// the location (register or stack slot) for the indirect pointer.
// (This duplicates the usual i64 calling convention rules.)
unsigned Reg = State.AllocateReg(SystemZ::ArgGPRs);
- unsigned Offset = Reg ? 0 : State.AllocateStack(8, 8);
+ unsigned Offset = Reg ? 0 : State.AllocateStack(8, Align(8));
// Use that same location for all the pending parts.
for (auto &It : PendingMembers) {
diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
index ffeee4da95cc..86c6b2985385 100644
--- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.cpp
@@ -25,13 +25,12 @@ SystemZConstantPoolValue::Create(const GlobalValue *GV,
return new SystemZConstantPoolValue(GV, Modifier);
}
-int SystemZConstantPoolValue::
-getExistingMachineCPValue(MachineConstantPool *CP, unsigned Alignment) {
- unsigned AlignMask = Alignment - 1;
+int SystemZConstantPoolValue::getExistingMachineCPValue(MachineConstantPool *CP,
+ Align Alignment) {
const std::vector<MachineConstantPoolEntry> &Constants = CP->getConstants();
for (unsigned I = 0, E = Constants.size(); I != E; ++I) {
if (Constants[I].isMachineConstantPoolEntry() &&
- (Constants[I].getAlignment() & AlignMask) == 0) {
+ Constants[I].getAlign() >= Alignment) {
auto *ZCPV =
static_cast<SystemZConstantPoolValue *>(Constants[I].Val.MachineCPVal);
if (ZCPV->GV == GV && ZCPV->Modifier == Modifier)
diff --git a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
index 6cb7710abdfe..da610ab45070 100644
--- a/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
+++ b/llvm/lib/Target/SystemZ/SystemZConstantPoolValue.h
@@ -43,7 +43,7 @@ public:
// Override MachineConstantPoolValue.
int getExistingMachineCPValue(MachineConstantPool *CP,
- unsigned Alignment) override;
+ Align Alignment) override;
void addSelectionDAGCSEId(FoldingSetNodeID &ID) override;
void print(raw_ostream &O) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
new file mode 100644
index 000000000000..7d21d29d270e
--- /dev/null
+++ b/llvm/lib/Target/SystemZ/SystemZCopyPhysRegs.cpp
@@ -0,0 +1,120 @@
+//===--------- SystemZCopyPhysRegs.cpp - Handle phys reg copies -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass makes sure that a COPY of a physical register will be
+// implementable after register allocation in copyPhysReg() (this could be
+// done in EmitInstrWithCustomInserter() instead if COPY instructions were
+// passed to it).
+//
+//===----------------------------------------------------------------------===//
+
+#include "SystemZMachineFunctionInfo.h"
+#include "SystemZTargetMachine.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/CodeGen/TargetRegisterInfo.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define SYSTEMZ_COPYPHYSREGS_NAME "SystemZ Copy Physregs"
+
+namespace llvm {
+ void initializeSystemZCopyPhysRegsPass(PassRegistry&);
+}
+
+namespace {
+
+class SystemZCopyPhysRegs : public MachineFunctionPass {
+public:
+ static char ID;
+ SystemZCopyPhysRegs()
+ : MachineFunctionPass(ID), TII(nullptr), MRI(nullptr) {
+ initializeSystemZCopyPhysRegsPass(*PassRegistry::getPassRegistry());
+ }
+
+ StringRef getPassName() const override { return SYSTEMZ_COPYPHYSREGS_NAME; }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+
+private:
+
+ bool visitMBB(MachineBasicBlock &MBB);
+
+ const SystemZInstrInfo *TII;
+ MachineRegisterInfo *MRI;
+};
+
+char SystemZCopyPhysRegs::ID = 0;
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(SystemZCopyPhysRegs, "systemz-copy-physregs",
+ SYSTEMZ_COPYPHYSREGS_NAME, false, false)
+
+FunctionPass *llvm::createSystemZCopyPhysRegsPass(SystemZTargetMachine &TM) {
+ return new SystemZCopyPhysRegs();
+}
+
+void SystemZCopyPhysRegs::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+}
+
+bool SystemZCopyPhysRegs::visitMBB(MachineBasicBlock &MBB) {
+ bool Modified = false;
+
+ // Certain special registers can only be copied from a subset of the
+ // default register class of the type. It is therefore necessary to create
+ // the target copy instructions before regalloc instead of in copyPhysReg().
+ for (MachineBasicBlock::iterator MBBI = MBB.begin(), E = MBB.end();
+ MBBI != E; ) {
+ MachineInstr *MI = &*MBBI++;
+ if (!MI->isCopy())
+ continue;
+
+ DebugLoc DL = MI->getDebugLoc();
+ Register SrcReg = MI->getOperand(1).getReg();
+ Register DstReg = MI->getOperand(0).getReg();
+ if (DstReg.isVirtual() &&
+ (SrcReg == SystemZ::CC || SystemZ::AR32BitRegClass.contains(SrcReg))) {
+ Register Tmp = MRI->createVirtualRegister(&SystemZ::GR32BitRegClass);
+ if (SrcReg == SystemZ::CC)
+ BuildMI(MBB, MI, DL, TII->get(SystemZ::IPM), Tmp);
+ else
+ BuildMI(MBB, MI, DL, TII->get(SystemZ::EAR), Tmp).addReg(SrcReg);
+ MI->getOperand(1).setReg(Tmp);
+ Modified = true;
+ }
+ else if (SrcReg.isVirtual() &&
+ SystemZ::AR32BitRegClass.contains(DstReg)) {
+ Register Tmp = MRI->createVirtualRegister(&SystemZ::GR32BitRegClass);
+ MI->getOperand(0).setReg(Tmp);
+ BuildMI(MBB, MBBI, DL, TII->get(SystemZ::SAR), DstReg).addReg(Tmp);
+ Modified = true;
+ }
+ }
+
+ return Modified;
+}
+
+bool SystemZCopyPhysRegs::runOnMachineFunction(MachineFunction &F) {
+ TII = static_cast<const SystemZInstrInfo *>(F.getSubtarget().getInstrInfo());
+ MRI = &F.getRegInfo();
+
+ bool Modified = false;
+ for (auto &MBB : F)
+ Modified |= visitMBB(MBB);
+
+ return Modified;
+}
+
diff --git a/llvm/lib/Target/SystemZ/SystemZFeatures.td b/llvm/lib/Target/SystemZ/SystemZFeatures.td
index dae795e845b0..28f58cb310af 100644
--- a/llvm/lib/Target/SystemZ/SystemZFeatures.td
+++ b/llvm/lib/Target/SystemZ/SystemZFeatures.td
@@ -10,13 +10,13 @@
//
//===----------------------------------------------------------------------===//
-class SystemZFeature<string extname, string intname, string desc>
- : Predicate<"Subtarget->has"##intname##"()">,
- AssemblerPredicate<"Feature"##intname, extname>,
- SubtargetFeature<extname, "Has"##intname, "true", desc>;
+class SystemZFeature<string extname, string intname, dag featdag, string desc>
+ : Predicate<"Subtarget->has"#intname#"()">,
+ AssemblerPredicate<featdag, extname>,
+ SubtargetFeature<extname, "Has"#intname, "true", desc>;
class SystemZMissingFeature<string intname>
- : Predicate<"!Subtarget->has"##intname##"()">;
+ : Predicate<"!Subtarget->has"#intname#"()">;
class SystemZFeatureList<list<SystemZFeature> x> {
list<SystemZFeature> List = x;
@@ -25,6 +25,13 @@ class SystemZFeatureList<list<SystemZFeature> x> {
class SystemZFeatureAdd<list<SystemZFeature> x, list<SystemZFeature> y>
: SystemZFeatureList<!listconcat(x, y)>;
+// This feature is added as a subtarget feature whenever the function is
+// compiled to use soft-float.
+def FeatureSoftFloat : SystemZFeature<
+ "soft-float", "SoftFloat", (all_of FeatureSoftFloat),
+ "Use software emulation for floating point"
+>;
+
//===----------------------------------------------------------------------===//
//
// New features added in the Ninth Edition of the z/Architecture
@@ -32,54 +39,54 @@ class SystemZFeatureAdd<list<SystemZFeature> x, list<SystemZFeature> y>
//===----------------------------------------------------------------------===//
def FeatureDistinctOps : SystemZFeature<
- "distinct-ops", "DistinctOps",
+ "distinct-ops", "DistinctOps", (all_of FeatureDistinctOps),
"Assume that the distinct-operands facility is installed"
>;
def FeatureFastSerialization : SystemZFeature<
- "fast-serialization", "FastSerialization",
+ "fast-serialization", "FastSerialization", (all_of FeatureFastSerialization),
"Assume that the fast-serialization facility is installed"
>;
def FeatureFPExtension : SystemZFeature<
- "fp-extension", "FPExtension",
+ "fp-extension", "FPExtension", (all_of FeatureFPExtension),
"Assume that the floating-point extension facility is installed"
>;
def FeatureHighWord : SystemZFeature<
- "high-word", "HighWord",
+ "high-word", "HighWord", (all_of FeatureHighWord),
"Assume that the high-word facility is installed"
>;
def FeatureInterlockedAccess1 : SystemZFeature<
- "interlocked-access1", "InterlockedAccess1",
+ "interlocked-access1", "InterlockedAccess1", (all_of FeatureInterlockedAccess1),
"Assume that interlocked-access facility 1 is installed"
>;
def FeatureNoInterlockedAccess1 : SystemZMissingFeature<"InterlockedAccess1">;
def FeatureLoadStoreOnCond : SystemZFeature<
- "load-store-on-cond", "LoadStoreOnCond",
+ "load-store-on-cond", "LoadStoreOnCond", (all_of FeatureLoadStoreOnCond),
"Assume that the load/store-on-condition facility is installed"
>;
def FeatureNoLoadStoreOnCond : SystemZMissingFeature<"LoadStoreOnCond">;
def FeaturePopulationCount : SystemZFeature<
- "population-count", "PopulationCount",
+ "population-count", "PopulationCount", (all_of FeaturePopulationCount),
"Assume that the population-count facility is installed"
>;
def FeatureMessageSecurityAssist3 : SystemZFeature<
- "message-security-assist-extension3", "MessageSecurityAssist3",
+ "message-security-assist-extension3", "MessageSecurityAssist3", (all_of FeatureMessageSecurityAssist3),
"Assume that the message-security-assist extension facility 3 is installed"
>;
def FeatureMessageSecurityAssist4 : SystemZFeature<
- "message-security-assist-extension4", "MessageSecurityAssist4",
+ "message-security-assist-extension4", "MessageSecurityAssist4", (all_of FeatureMessageSecurityAssist4),
"Assume that the message-security-assist extension facility 4 is installed"
>;
def FeatureResetReferenceBitsMultiple : SystemZFeature<
- "reset-reference-bits-multiple", "ResetReferenceBitsMultiple",
+ "reset-reference-bits-multiple", "ResetReferenceBitsMultiple", (all_of FeatureResetReferenceBitsMultiple),
"Assume that the reset-reference-bits-multiple facility is installed"
>;
@@ -103,37 +110,37 @@ def Arch9NewFeatures : SystemZFeatureList<[
//===----------------------------------------------------------------------===//
def FeatureExecutionHint : SystemZFeature<
- "execution-hint", "ExecutionHint",
+ "execution-hint", "ExecutionHint", (all_of FeatureExecutionHint),
"Assume that the execution-hint facility is installed"
>;
def FeatureLoadAndTrap : SystemZFeature<
- "load-and-trap", "LoadAndTrap",
+ "load-and-trap", "LoadAndTrap", (all_of FeatureLoadAndTrap),
"Assume that the load-and-trap facility is installed"
>;
def FeatureMiscellaneousExtensions : SystemZFeature<
- "miscellaneous-extensions", "MiscellaneousExtensions",
+ "miscellaneous-extensions", "MiscellaneousExtensions", (all_of FeatureMiscellaneousExtensions),
"Assume that the miscellaneous-extensions facility is installed"
>;
def FeatureProcessorAssist : SystemZFeature<
- "processor-assist", "ProcessorAssist",
+ "processor-assist", "ProcessorAssist", (all_of FeatureProcessorAssist),
"Assume that the processor-assist facility is installed"
>;
def FeatureTransactionalExecution : SystemZFeature<
- "transactional-execution", "TransactionalExecution",
+ "transactional-execution", "TransactionalExecution", (all_of FeatureTransactionalExecution),
"Assume that the transactional-execution facility is installed"
>;
def FeatureDFPZonedConversion : SystemZFeature<
- "dfp-zoned-conversion", "DFPZonedConversion",
+ "dfp-zoned-conversion", "DFPZonedConversion", (all_of FeatureDFPZonedConversion),
"Assume that the DFP zoned-conversion facility is installed"
>;
def FeatureEnhancedDAT2 : SystemZFeature<
- "enhanced-dat-2", "EnhancedDAT2",
+ "enhanced-dat-2", "EnhancedDAT2", (all_of FeatureEnhancedDAT2),
"Assume that the enhanced-DAT facility 2 is installed"
>;
@@ -154,27 +161,27 @@ def Arch10NewFeatures : SystemZFeatureList<[
//===----------------------------------------------------------------------===//
def FeatureLoadAndZeroRightmostByte : SystemZFeature<
- "load-and-zero-rightmost-byte", "LoadAndZeroRightmostByte",
+ "load-and-zero-rightmost-byte", "LoadAndZeroRightmostByte", (all_of FeatureLoadAndZeroRightmostByte),
"Assume that the load-and-zero-rightmost-byte facility is installed"
>;
def FeatureLoadStoreOnCond2 : SystemZFeature<
- "load-store-on-cond-2", "LoadStoreOnCond2",
+ "load-store-on-cond-2", "LoadStoreOnCond2", (all_of FeatureLoadStoreOnCond2),
"Assume that the load/store-on-condition facility 2 is installed"
>;
def FeatureMessageSecurityAssist5 : SystemZFeature<
- "message-security-assist-extension5", "MessageSecurityAssist5",
+ "message-security-assist-extension5", "MessageSecurityAssist5", (all_of FeatureMessageSecurityAssist5),
"Assume that the message-security-assist extension facility 5 is installed"
>;
def FeatureDFPPackedConversion : SystemZFeature<
- "dfp-packed-conversion", "DFPPackedConversion",
+ "dfp-packed-conversion", "DFPPackedConversion", (all_of FeatureDFPPackedConversion),
"Assume that the DFP packed-conversion facility is installed"
>;
def FeatureVector : SystemZFeature<
- "vector", "Vector",
+ "vector", "Vector", (all_of FeatureVector),
"Assume that the vectory facility is installed"
>;
def FeatureNoVector : SystemZMissingFeature<"Vector">;
@@ -194,38 +201,38 @@ def Arch11NewFeatures : SystemZFeatureList<[
//===----------------------------------------------------------------------===//
def FeatureMiscellaneousExtensions2 : SystemZFeature<
- "miscellaneous-extensions-2", "MiscellaneousExtensions2",
+ "miscellaneous-extensions-2", "MiscellaneousExtensions2", (all_of FeatureMiscellaneousExtensions2),
"Assume that the miscellaneous-extensions facility 2 is installed"
>;
def FeatureGuardedStorage : SystemZFeature<
- "guarded-storage", "GuardedStorage",
+ "guarded-storage", "GuardedStorage", (all_of FeatureGuardedStorage),
"Assume that the guarded-storage facility is installed"
>;
def FeatureMessageSecurityAssist7 : SystemZFeature<
- "message-security-assist-extension7", "MessageSecurityAssist7",
+ "message-security-assist-extension7", "MessageSecurityAssist7", (all_of FeatureMessageSecurityAssist7),
"Assume that the message-security-assist extension facility 7 is installed"
>;
def FeatureMessageSecurityAssist8 : SystemZFeature<
- "message-security-assist-extension8", "MessageSecurityAssist8",
+ "message-security-assist-extension8", "MessageSecurityAssist8", (all_of FeatureMessageSecurityAssist8),
"Assume that the message-security-assist extension facility 8 is installed"
>;
def FeatureVectorEnhancements1 : SystemZFeature<
- "vector-enhancements-1", "VectorEnhancements1",
+ "vector-enhancements-1", "VectorEnhancements1", (all_of FeatureVectorEnhancements1),
"Assume that the vector enhancements facility 1 is installed"
>;
def FeatureNoVectorEnhancements1 : SystemZMissingFeature<"VectorEnhancements1">;
def FeatureVectorPackedDecimal : SystemZFeature<
- "vector-packed-decimal", "VectorPackedDecimal",
+ "vector-packed-decimal", "VectorPackedDecimal", (all_of FeatureVectorPackedDecimal),
"Assume that the vector packed decimal facility is installed"
>;
def FeatureInsertReferenceBitsMultiple : SystemZFeature<
- "insert-reference-bits-multiple", "InsertReferenceBitsMultiple",
+ "insert-reference-bits-multiple", "InsertReferenceBitsMultiple", (all_of FeatureInsertReferenceBitsMultiple),
"Assume that the insert-reference-bits-multiple facility is installed"
>;
@@ -246,32 +253,32 @@ def Arch12NewFeatures : SystemZFeatureList<[
//===----------------------------------------------------------------------===//
def FeatureMiscellaneousExtensions3 : SystemZFeature<
- "miscellaneous-extensions-3", "MiscellaneousExtensions3",
+ "miscellaneous-extensions-3", "MiscellaneousExtensions3", (all_of FeatureMiscellaneousExtensions3),
"Assume that the miscellaneous-extensions facility 3 is installed"
>;
def FeatureMessageSecurityAssist9 : SystemZFeature<
- "message-security-assist-extension9", "MessageSecurityAssist9",
+ "message-security-assist-extension9", "MessageSecurityAssist9", (all_of FeatureMessageSecurityAssist9),
"Assume that the message-security-assist extension facility 9 is installed"
>;
def FeatureVectorEnhancements2 : SystemZFeature<
- "vector-enhancements-2", "VectorEnhancements2",
+ "vector-enhancements-2", "VectorEnhancements2", (all_of FeatureVectorEnhancements2),
"Assume that the vector enhancements facility 2 is installed"
>;
def FeatureVectorPackedDecimalEnhancement : SystemZFeature<
- "vector-packed-decimal-enhancement", "VectorPackedDecimalEnhancement",
+ "vector-packed-decimal-enhancement", "VectorPackedDecimalEnhancement", (all_of FeatureVectorPackedDecimalEnhancement),
"Assume that the vector packed decimal enhancement facility is installed"
>;
def FeatureEnhancedSort : SystemZFeature<
- "enhanced-sort", "EnhancedSort",
+ "enhanced-sort", "EnhancedSort", (all_of FeatureEnhancedSort),
"Assume that the enhanced-sort facility is installed"
>;
def FeatureDeflateConversion : SystemZFeature<
- "deflate-conversion", "DeflateConversion",
+ "deflate-conversion", "DeflateConversion", (all_of FeatureDeflateConversion),
"Assume that the deflate-conversion facility is installed"
>;
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
index 3cdf6bf98ee0..985722fdcab4 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp
@@ -17,6 +17,7 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/RegisterScavenging.h"
#include "llvm/IR/Function.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -62,18 +63,6 @@ SystemZFrameLowering::SystemZFrameLowering()
RegSpillOffsets[SpillOffsetTable[I].Reg] = SpillOffsetTable[I].Offset;
}
-static bool usePackedStack(MachineFunction &MF) {
- bool HasPackedStackAttr = MF.getFunction().hasFnAttribute("packed-stack");
- bool IsVarArg = MF.getFunction().isVarArg();
- bool CallConv = MF.getFunction().getCallingConv() != CallingConv::GHC;
- bool BackChain = MF.getFunction().hasFnAttribute("backchain");
- bool FramAddressTaken = MF.getFrameInfo().isFrameAddressTaken();
- if (HasPackedStackAttr && BackChain)
- report_fatal_error("packed-stack with backchain is currently unsupported.");
- return HasPackedStackAttr && !IsVarArg && CallConv && !BackChain &&
- !FramAddressTaken;
-}
-
bool SystemZFrameLowering::
assignCalleeSavedSpillSlots(MachineFunction &MF,
const TargetRegisterInfo *TRI,
@@ -87,71 +76,44 @@ assignCalleeSavedSpillSlots(MachineFunction &MF,
unsigned LowGPR = 0;
unsigned HighGPR = SystemZ::R15D;
int StartSPOffset = SystemZMC::CallFrameSize;
- int CurrOffset;
- if (!usePackedStack(MF)) {
- for (auto &CS : CSI) {
- unsigned Reg = CS.getReg();
- int Offset = RegSpillOffsets[Reg];
- if (Offset) {
- if (SystemZ::GR64BitRegClass.contains(Reg) && StartSPOffset > Offset) {
- LowGPR = Reg;
- StartSPOffset = Offset;
- }
- Offset -= SystemZMC::CallFrameSize;
- int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset);
- CS.setFrameIdx(FrameIdx);
- } else
- CS.setFrameIdx(INT32_MAX);
- }
-
- // Save the range of call-saved registers, for use by the
- // prologue/epilogue inserters.
- ZFI->setRestoreGPRRegs(LowGPR, HighGPR, StartSPOffset);
- if (IsVarArg) {
- // Also save the GPR varargs, if any. R6D is call-saved, so would
- // already be included, but we also need to handle the call-clobbered
- // argument registers.
- unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
- if (FirstGPR < SystemZ::NumArgGPRs) {
- unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
- int Offset = RegSpillOffsets[Reg];
- if (StartSPOffset > Offset) {
- LowGPR = Reg; StartSPOffset = Offset;
- }
+ for (auto &CS : CSI) {
+ unsigned Reg = CS.getReg();
+ int Offset = getRegSpillOffset(MF, Reg);
+ if (Offset) {
+ if (SystemZ::GR64BitRegClass.contains(Reg) && StartSPOffset > Offset) {
+ LowGPR = Reg;
+ StartSPOffset = Offset;
}
- }
- ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset);
+ Offset -= SystemZMC::CallFrameSize;
+ int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset);
+ CS.setFrameIdx(FrameIdx);
+ } else
+ CS.setFrameIdx(INT32_MAX);
+ }
- CurrOffset = -SystemZMC::CallFrameSize;
- } else {
- // Packed stack: put all the GPRs at the top of the Register save area.
- uint32_t LowGR64Num = UINT32_MAX;
- for (auto &CS : CSI) {
- unsigned Reg = CS.getReg();
- if (SystemZ::GR64BitRegClass.contains(Reg)) {
- unsigned GR64Num = SystemZMC::getFirstReg(Reg);
- int Offset = -8 * (15 - GR64Num + 1);
- if (LowGR64Num > GR64Num) {
- LowGR64Num = GR64Num;
- StartSPOffset = SystemZMC::CallFrameSize + Offset;
- }
- int FrameIdx = MFFrame.CreateFixedSpillStackObject(8, Offset);
- CS.setFrameIdx(FrameIdx);
- } else
- CS.setFrameIdx(INT32_MAX);
+ // Save the range of call-saved registers, for use by the
+ // prologue/epilogue inserters.
+ ZFI->setRestoreGPRRegs(LowGPR, HighGPR, StartSPOffset);
+ if (IsVarArg) {
+ // Also save the GPR varargs, if any. R6D is call-saved, so would
+ // already be included, but we also need to handle the call-clobbered
+ // argument registers.
+ unsigned FirstGPR = ZFI->getVarArgsFirstGPR();
+ if (FirstGPR < SystemZ::NumArgGPRs) {
+ unsigned Reg = SystemZ::ArgGPRs[FirstGPR];
+ int Offset = getRegSpillOffset(MF, Reg);
+ if (StartSPOffset > Offset) {
+ LowGPR = Reg; StartSPOffset = Offset;
+ }
}
- if (LowGR64Num < UINT32_MAX)
- LowGPR = SystemZMC::GR64Regs[LowGR64Num];
-
- // Save the range of call-saved registers, for use by the
- // prologue/epilogue inserters.
- ZFI->setRestoreGPRRegs(LowGPR, HighGPR, StartSPOffset);
- ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset);
-
- CurrOffset = LowGPR ? -(SystemZMC::CallFrameSize - StartSPOffset) : 0;
}
+ ZFI->setSpillGPRRegs(LowGPR, HighGPR, StartSPOffset);
// Create fixed stack objects for the remaining registers.
+ int CurrOffset = -SystemZMC::CallFrameSize;
+ if (usePackedStack(MF))
+ CurrOffset += StartSPOffset;
+
for (auto &CS : CSI) {
if (CS.getFrameIdx() != INT32_MAX)
continue;
@@ -234,11 +196,9 @@ static void addSavedGPR(MachineBasicBlock &MBB, MachineInstrBuilder &MIB,
}
}
-bool SystemZFrameLowering::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool SystemZFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -296,11 +256,9 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool SystemZFrameLowering::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool SystemZFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
@@ -358,9 +316,10 @@ void SystemZFrameLowering::
processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
+ bool BackChain = MF.getFunction().hasFnAttribute("backchain");
- if (!usePackedStack(MF))
- // Always create the full incoming register save area.
+ if (!usePackedStack(MF) || BackChain)
+ // Create the incoming register save area.
getOrCreateFramePointerSaveIndex(MF);
// Get the size of our stack frame to be allocated ...
@@ -382,16 +341,15 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF,
// are outside the reach of an unsigned 12-bit displacement.
// Create 2 for the case where both addresses in an MVC are
// out of range.
- RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, 8, false));
- RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, 8, false));
+ RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, Align(8), false));
+ RS->addScavengingFrameIndex(MFFrame.CreateStackObject(8, Align(8), false));
}
}
// Emit instructions before MBBI (in MBB) to add NumBytes to Reg.
static void emitIncrement(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator &MBBI,
- const DebugLoc &DL,
- unsigned Reg, int64_t NumBytes,
+ MachineBasicBlock::iterator &MBBI, const DebugLoc &DL,
+ Register Reg, int64_t NumBytes,
const TargetInstrInfo *TII) {
while (NumBytes) {
unsigned Opcode;
@@ -416,12 +374,39 @@ static void emitIncrement(MachineBasicBlock &MBB,
}
}
+// Add CFI for the new CFA offset.
+static void buildCFAOffs(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, int Offset,
+ const SystemZInstrInfo *ZII) {
+ unsigned CFIIndex = MBB.getParent()->addFrameInst(
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+}
+
+// Add CFI for the new frame location.
+static void buildDefCFAReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, unsigned Reg,
+ const SystemZInstrInfo *ZII) {
+ MachineFunction &MF = *MBB.getParent();
+ MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ unsigned RegNum = MRI->getDwarfRegNum(Reg, true);
+ unsigned CFIIndex = MF.addFrameInst(
+ MCCFIInstruction::createDefCfaRegister(nullptr, RegNum));
+ BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
+ .addCFIIndex(CFIIndex);
+}
+
void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
+ const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
+ const SystemZTargetLowering &TLI = *STI.getTargetLowering();
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- auto *ZII =
- static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ auto *ZII = static_cast<const SystemZInstrInfo *>(STI.getInstrInfo());
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
MachineBasicBlock::iterator MBBI = MBB.begin();
MachineModuleInfo &MMI = MF.getMMI();
@@ -504,19 +489,31 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
// Allocate StackSize bytes.
int64_t Delta = -int64_t(StackSize);
- emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
-
- // Add CFI for the allocation.
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaOffset(nullptr, SPOffsetFromCFA + Delta));
- BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ const unsigned ProbeSize = TLI.getStackProbeSize(MF);
+ bool FreeProbe = (ZFI->getSpillGPRRegs().GPROffset &&
+ (ZFI->getSpillGPRRegs().GPROffset + StackSize) < ProbeSize);
+ if (!FreeProbe &&
+ MF.getSubtarget().getTargetLowering()->hasInlineStackProbe(MF)) {
+ // Stack probing may involve looping, but splitting the prologue block
+ // is not possible at this point since it would invalidate the
+ // SaveBlocks / RestoreBlocks sets of PEI in the single block function
+ // case. Build a pseudo to be handled later by inlineStackProbe().
+ BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::PROBED_STACKALLOC))
+ .addImm(StackSize);
+ }
+ else {
+ emitIncrement(MBB, MBBI, DL, SystemZ::R15D, Delta, ZII);
+ buildCFAOffs(MBB, MBBI, DL, SPOffsetFromCFA + Delta, ZII);
+ }
SPOffsetFromCFA += Delta;
- if (StoreBackchain)
+ if (StoreBackchain) {
+ // The back chain is stored topmost with packed-stack.
+ int Offset = usePackedStack(MF) ? SystemZMC::CallFrameSize - 8 : 0;
BuildMI(MBB, MBBI, DL, ZII->get(SystemZ::STG))
- .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D).addImm(0)
- .addReg(0);
+ .addReg(SystemZ::R1D, RegState::Kill).addReg(SystemZ::R15D)
+ .addImm(Offset).addReg(0);
+ }
}
if (HasFP) {
@@ -525,11 +522,7 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
.addReg(SystemZ::R15D);
// Add CFI for the new frame location.
- unsigned HardFP = MRI->getDwarfRegNum(SystemZ::R11D, true);
- unsigned CFIIndex = MF.addFrameInst(
- MCCFIInstruction::createDefCfaRegister(nullptr, HardFP));
- BuildMI(MBB, MBBI, DL, ZII->get(TargetOpcode::CFI_INSTRUCTION))
- .addCFIIndex(CFIIndex);
+ buildDefCFAReg(MBB, MBBI, DL, SystemZ::R11D, ZII);
// Mark the FramePtr as live at the beginning of every block except
// the entry block. (We'll have marked R11 as live on entry when
@@ -560,7 +553,7 @@ void SystemZFrameLowering::emitPrologue(MachineFunction &MF,
// Add CFI for this save.
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
int64_t Offset =
getFrameIndexReference(MF, Save.getFrameIdx(), IgnoredFrameReg);
@@ -622,6 +615,91 @@ void SystemZFrameLowering::emitEpilogue(MachineFunction &MF,
}
}
+void SystemZFrameLowering::inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const {
+ auto *ZII =
+ static_cast<const SystemZInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const SystemZSubtarget &STI = MF.getSubtarget<SystemZSubtarget>();
+ const SystemZTargetLowering &TLI = *STI.getTargetLowering();
+
+ MachineInstr *StackAllocMI = nullptr;
+ for (MachineInstr &MI : PrologMBB)
+ if (MI.getOpcode() == SystemZ::PROBED_STACKALLOC) {
+ StackAllocMI = &MI;
+ break;
+ }
+ if (StackAllocMI == nullptr)
+ return;
+ uint64_t StackSize = StackAllocMI->getOperand(0).getImm();
+ const unsigned ProbeSize = TLI.getStackProbeSize(MF);
+ uint64_t NumFullBlocks = StackSize / ProbeSize;
+ uint64_t Residual = StackSize % ProbeSize;
+ int64_t SPOffsetFromCFA = -SystemZMC::CFAOffsetFromInitialSP;
+ MachineBasicBlock *MBB = &PrologMBB;
+ MachineBasicBlock::iterator MBBI = StackAllocMI;
+ const DebugLoc DL = StackAllocMI->getDebugLoc();
+
+ // Allocate a block of Size bytes on the stack and probe it.
+ auto allocateAndProbe = [&](MachineBasicBlock &InsMBB,
+ MachineBasicBlock::iterator InsPt, unsigned Size,
+ bool EmitCFI) -> void {
+ emitIncrement(InsMBB, InsPt, DL, SystemZ::R15D, -int64_t(Size), ZII);
+ if (EmitCFI) {
+ SPOffsetFromCFA -= Size;
+ buildCFAOffs(InsMBB, InsPt, DL, SPOffsetFromCFA, ZII);
+ }
+ // Probe by means of a volatile compare.
+ MachineMemOperand *MMO = MF.getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
+ BuildMI(InsMBB, InsPt, DL, ZII->get(SystemZ::CG))
+ .addReg(SystemZ::R0D, RegState::Undef)
+ .addReg(SystemZ::R15D).addImm(Size - 8).addReg(0)
+ .addMemOperand(MMO);
+ };
+
+ if (NumFullBlocks < 3) {
+ // Emit unrolled probe statements.
+ for (unsigned int i = 0; i < NumFullBlocks; i++)
+ allocateAndProbe(*MBB, MBBI, ProbeSize, true/*EmitCFI*/);
+ } else {
+ // Emit a loop probing the pages.
+ uint64_t LoopAlloc = ProbeSize * NumFullBlocks;
+ SPOffsetFromCFA -= LoopAlloc;
+
+ BuildMI(*MBB, MBBI, DL, ZII->get(SystemZ::LGR), SystemZ::R1D)
+ .addReg(SystemZ::R15D);
+ buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R1D, ZII);
+ emitIncrement(*MBB, MBBI, DL, SystemZ::R1D, -int64_t(LoopAlloc), ZII);
+ buildCFAOffs(*MBB, MBBI, DL, -int64_t(SystemZMC::CallFrameSize + LoopAlloc),
+ ZII);
+
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MBBI, MBB);
+ MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(MBB);
+ MBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(LoopMBB);
+ LoopMBB->addSuccessor(DoneMBB);
+
+ MBB = LoopMBB;
+ allocateAndProbe(*MBB, MBB->end(), ProbeSize, false/*EmitCFI*/);
+ BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::CLGR))
+ .addReg(SystemZ::R15D).addReg(SystemZ::R1D);
+ BuildMI(*MBB, MBB->end(), DL, ZII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_GT).addMBB(MBB);
+
+ MBB = DoneMBB;
+ MBBI = DoneMBB->begin();
+ buildDefCFAReg(*MBB, MBBI, DL, SystemZ::R15D, ZII);
+
+ recomputeLiveIns(*DoneMBB);
+ recomputeLiveIns(*LoopMBB);
+ }
+
+ if (Residual)
+ allocateAndProbe(*MBB, MBBI, Residual, true/*EmitCFI*/);
+
+ StackAllocMI->eraseFromParent();
+}
+
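
For illustration, a minimal standalone sketch (not part of this patch) of how the probing above splits an allocation into probe-sized blocks plus a residual, assuming the default 4096-byte probe size from getStackProbeSize():

  #include <cstdint>
  #include <cstdio>

  int main() {
    const uint64_t ProbeSize = 4096;           // default probe size
    const uint64_t StackSize = 3 * 4096 + 200; // example frame size
    // Fewer than 3 full blocks are probed with unrolled allocations; 3 or
    // more are probed in a loop that uses R1 to hold the loop's end value.
    // The residual is always allocated and probed last.
    uint64_t NumFullBlocks = StackSize / ProbeSize; // 3 -> emitted as a loop
    uint64_t Residual = StackSize % ProbeSize;      // 200 -> one trailing probe
    std::printf("full blocks: %llu, residual: %llu bytes\n",
                (unsigned long long)NumFullBlocks,
                (unsigned long long)Residual);
    return 0;
  }
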
bool SystemZFrameLowering::hasFP(const MachineFunction &MF) const {
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
MF.getFrameInfo().hasVarSizedObjects() ||
@@ -639,7 +717,7 @@ SystemZFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
int SystemZFrameLowering::getFrameIndexReference(const MachineFunction &MF,
int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
// Our incoming SP is actually SystemZMC::CallFrameSize below the CFA, so
// add that difference here.
int64_t Offset =
@@ -664,14 +742,43 @@ eliminateCallFramePseudoInstr(MachineFunction &MF,
}
}
+unsigned SystemZFrameLowering::getRegSpillOffset(MachineFunction &MF,
+ Register Reg) const {
+ bool IsVarArg = MF.getFunction().isVarArg();
+ bool BackChain = MF.getFunction().hasFnAttribute("backchain");
+ bool SoftFloat = MF.getSubtarget<SystemZSubtarget>().hasSoftFloat();
+ unsigned Offset = RegSpillOffsets[Reg];
+ if (usePackedStack(MF) && !(IsVarArg && !SoftFloat)) {
+ if (SystemZ::GR64BitRegClass.contains(Reg))
+ // Put all GPRs at the top of the Register save area with packed
+ // stack. Make room for the backchain if needed.
+ Offset += BackChain ? 24 : 32;
+ else
+ Offset = 0;
+ }
+ return Offset;
+}
+
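
To make the packed-stack adjustment concrete, here is a small illustrative sketch, assuming the usual ELF ABI save slot for r15 at SP+120 and SystemZMC::CallFrameSize == 160 (assumed values, not taken from this patch):

  #include <cstdio>

  int main() {
    const unsigned CallFrameSize = 160; // assumed SystemZMC::CallFrameSize
    const unsigned R15AbiSlot = 120;    // assumed ABI slot for r15
    bool BackChain = true;              // "backchain" attribute present
    // With packed-stack, GPR slots move to the top of the register save
    // area: +32 normally, +24 when the topmost doubleword is reserved for
    // the back chain (which with packed-stack also implies soft-float).
    unsigned R15Packed = R15AbiSlot + (BackChain ? 24 : 32);
    std::printf("r15 packed slot: %u, back chain slot: %u\n",
                R15Packed, CallFrameSize - 8); // 144 and 152
    return 0;
  }
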
int SystemZFrameLowering::
getOrCreateFramePointerSaveIndex(MachineFunction &MF) const {
SystemZMachineFunctionInfo *ZFI = MF.getInfo<SystemZMachineFunctionInfo>();
int FI = ZFI->getFramePointerSaveIndex();
if (!FI) {
MachineFrameInfo &MFFrame = MF.getFrameInfo();
- FI = MFFrame.CreateFixedObject(8, -SystemZMC::CallFrameSize, false);
+ // The back chain is stored topmost with packed-stack.
+ int Offset = usePackedStack(MF) ? -8 : -SystemZMC::CallFrameSize;
+ FI = MFFrame.CreateFixedObject(8, Offset, false);
ZFI->setFramePointerSaveIndex(FI);
}
return FI;
}
+
+bool SystemZFrameLowering::usePackedStack(MachineFunction &MF) const {
+ bool HasPackedStackAttr = MF.getFunction().hasFnAttribute("packed-stack");
+ bool BackChain = MF.getFunction().hasFnAttribute("backchain");
+ bool SoftFloat = MF.getSubtarget<SystemZSubtarget>().hasSoftFloat();
+ if (HasPackedStackAttr && BackChain && !SoftFloat)
+ report_fatal_error("packed-stack + backchain + hard-float is unsupported.");
+ bool CallConv = MF.getFunction().getCallingConv() != CallingConv::GHC;
+ return HasPackedStackAttr && CallConv;
+}
diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
index 4189a92b8294..8752acc7e5ae 100644
--- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h
@@ -32,33 +32,36 @@ public:
RegScavenger *RS) const override;
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBII,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const
- override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBII,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
+ void inlineStackProbe(MachineFunction &MF,
+ MachineBasicBlock &PrologMBB) const override;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI) const override;
// Return the byte offset from the incoming stack pointer of Reg's
- // ABI-defined save slot. Return 0 if no slot is defined for Reg.
- unsigned getRegSpillOffset(unsigned Reg) const {
- return RegSpillOffsets[Reg];
- }
+ // ABI-defined save slot. Return 0 if no slot is defined for Reg. Adjust
+ // the offset in case MF has packed-stack.
+ unsigned getRegSpillOffset(MachineFunction &MF, Register Reg) const;
// Get or create the frame index of where the old frame pointer is stored.
int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const;
+
+ bool usePackedStack(MachineFunction &MF) const;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
index 3927a977e6fc..37328684399b 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelDAGToDAG.cpp
@@ -1456,7 +1456,8 @@ bool SystemZDAGToDAGISel::storeLoadCanUseBlockBinary(SDNode *N,
auto *StoreA = cast<StoreSDNode>(N);
auto *LoadA = cast<LoadSDNode>(StoreA->getValue().getOperand(1 - I));
auto *LoadB = cast<LoadSDNode>(StoreA->getValue().getOperand(I));
- return !LoadA->isVolatile() && canUseBlockOperation(StoreA, LoadB);
+ return !LoadA->isVolatile() && LoadA->getMemoryVT() == LoadB->getMemoryVT() &&
+ canUseBlockOperation(StoreA, LoadB);
}
void SystemZDAGToDAGISel::Select(SDNode *Node) {
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c73905d3357a..eb1e51341ec4 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -88,25 +88,27 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
else
addRegisterClass(MVT::i32, &SystemZ::GR32BitRegClass);
addRegisterClass(MVT::i64, &SystemZ::GR64BitRegClass);
- if (Subtarget.hasVector()) {
- addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
- addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
- } else {
- addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
- addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
- }
- if (Subtarget.hasVectorEnhancements1())
- addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
- else
- addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
+ if (!useSoftFloat()) {
+ if (Subtarget.hasVector()) {
+ addRegisterClass(MVT::f32, &SystemZ::VR32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::VR64BitRegClass);
+ } else {
+ addRegisterClass(MVT::f32, &SystemZ::FP32BitRegClass);
+ addRegisterClass(MVT::f64, &SystemZ::FP64BitRegClass);
+ }
+ if (Subtarget.hasVectorEnhancements1())
+ addRegisterClass(MVT::f128, &SystemZ::VR128BitRegClass);
+ else
+ addRegisterClass(MVT::f128, &SystemZ::FP128BitRegClass);
- if (Subtarget.hasVector()) {
- addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
- addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
- addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
- addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
- addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
- addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
+ if (Subtarget.hasVector()) {
+ addRegisterClass(MVT::v16i8, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v8i16, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v4i32, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v2i64, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v4f32, &SystemZ::VR128BitRegClass);
+ addRegisterClass(MVT::v2f64, &SystemZ::VR128BitRegClass);
+ }
}
// Compute derived properties from the register classes
@@ -639,12 +641,16 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
setTargetDAGCombine(ISD::FP_ROUND);
setTargetDAGCombine(ISD::STRICT_FP_ROUND);
setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::SINT_TO_FP);
+ setTargetDAGCombine(ISD::UINT_TO_FP);
setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
setTargetDAGCombine(ISD::BSWAP);
setTargetDAGCombine(ISD::SDIV);
setTargetDAGCombine(ISD::UDIV);
setTargetDAGCombine(ISD::SREM);
setTargetDAGCombine(ISD::UREM);
+ setTargetDAGCombine(ISD::INTRINSIC_VOID);
+ setTargetDAGCombine(ISD::INTRINSIC_W_CHAIN);
// Handle intrinsics.
setOperationAction(ISD::INTRINSIC_W_CHAIN, MVT::Other, Custom);
@@ -666,6 +672,10 @@ SystemZTargetLowering::SystemZTargetLowering(const TargetMachine &TM,
IsStrictFPEnabled = true;
}
+bool SystemZTargetLowering::useSoftFloat() const {
+ return Subtarget.hasSoftFloat();
+}
+
EVT SystemZTargetLowering::getSetCCResultType(const DataLayout &DL,
LLVMContext &, EVT VT) const {
if (!VT.isVector())
@@ -816,6 +826,15 @@ bool SystemZTargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
return SystemZVectorConstantInfo(Imm).isVectorConstantLegal(Subtarget);
}
+/// Returns true if stack probing through inline assembly is requested.
+bool SystemZTargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+ return false;
+}
+
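
A minimal sketch of the attribute this hook keys on, built with the LLVM C++ API (illustration only; frontends normally attach the attribute themselves, e.g. when stack-clash protection is requested):

  #include "llvm/IR/DerivedTypes.h"
  #include "llvm/IR/Function.h"
  #include "llvm/IR/LLVMContext.h"
  #include "llvm/IR/Module.h"
  using namespace llvm;

  int main() {
    LLVMContext Ctx;
    Module M("probe-demo", Ctx);
    FunctionType *FTy = FunctionType::get(Type::getVoidTy(Ctx), false);
    Function *F = Function::Create(FTy, Function::ExternalLinkage, "f", M);
    // Request inline stack probing for this function.
    F->addFnAttr("probe-stack", "inline-asm");
    // Mirrors the check done by hasInlineStackProbe() above.
    bool Inline = F->hasFnAttribute("probe-stack") &&
                  F->getFnAttribute("probe-stack").getValueAsString() ==
                      "inline-asm";
    return Inline ? 0 : 1;
  }
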
bool SystemZTargetLowering::isLegalICmpImmediate(int64_t Imm) const {
// We can use CGFI or CLGFI.
return isInt<32>(Imm) || isUInt<32>(Imm);
@@ -1123,12 +1142,14 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
return std::make_pair(0U, &SystemZ::GRH32BitRegClass);
case 'f': // Floating-point register
- if (VT == MVT::f64)
- return std::make_pair(0U, &SystemZ::FP64BitRegClass);
- else if (VT == MVT::f128)
- return std::make_pair(0U, &SystemZ::FP128BitRegClass);
- return std::make_pair(0U, &SystemZ::FP32BitRegClass);
-
+ if (!useSoftFloat()) {
+ if (VT == MVT::f64)
+ return std::make_pair(0U, &SystemZ::FP64BitRegClass);
+ else if (VT == MVT::f128)
+ return std::make_pair(0U, &SystemZ::FP128BitRegClass);
+ return std::make_pair(0U, &SystemZ::FP32BitRegClass);
+ }
+ break;
case 'v': // Vector register
if (Subtarget.hasVector()) {
if (VT == MVT::f32)
@@ -1156,6 +1177,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
SystemZMC::GR64Regs, 16);
}
if (Constraint[1] == 'f') {
+ if (useSoftFloat())
+ return std::make_pair(
+ 0u, static_cast<const TargetRegisterClass *>(nullptr));
if (VT == MVT::f32)
return parseRegisterNumber(Constraint, &SystemZ::FP32BitRegClass,
SystemZMC::FP32Regs, 16);
@@ -1166,6 +1190,9 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
SystemZMC::FP64Regs, 16);
}
if (Constraint[1] == 'v') {
+ if (!Subtarget.hasVector())
+ return std::make_pair(
+ 0u, static_cast<const TargetRegisterClass *>(nullptr));
if (VT == MVT::f32)
return parseRegisterNumber(Constraint, &SystemZ::VR32BitRegClass,
SystemZMC::VR32Regs, 32);
@@ -1179,6 +1206,19 @@ SystemZTargetLowering::getRegForInlineAsmConstraint(
return TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
}
+// FIXME? Maybe this could be a TableGen attribute on some registers and
+// this table could be generated automatically from RegInfo.
+Register SystemZTargetLowering::getRegisterByName(const char *RegName, LLT VT,
+ const MachineFunction &MF) const {
+
+ Register Reg = StringSwitch<Register>(RegName)
+ .Case("r15", SystemZ::R15D)
+ .Default(0);
+ if (Reg)
+ return Reg;
+ report_fatal_error("Invalid register name global variable");
+}
+
void SystemZTargetLowering::
LowerAsmOperandForConstraint(SDValue Op, std::string &Constraint,
std::vector<SDValue> &Ops,
@@ -1437,17 +1477,19 @@ SDValue SystemZTargetLowering::LowerFormalArguments(
// ...and a similar frame index for the caller-allocated save area
// that will be used to store the incoming registers.
- int64_t RegSaveOffset = -SystemZMC::CallFrameSize;
+ int64_t RegSaveOffset =
+ -SystemZMC::CallFrameSize + TFL->getRegSpillOffset(MF, SystemZ::R2D) - 16;
unsigned RegSaveIndex = MFI.CreateFixedObject(1, RegSaveOffset, true);
FuncInfo->setRegSaveFrameIndex(RegSaveIndex);
// Store the FPR varargs in the reserved frame slots. (We store the
// GPRs as part of the prologue.)
- if (NumFixedFPRs < SystemZ::NumArgFPRs) {
+ if (NumFixedFPRs < SystemZ::NumArgFPRs && !useSoftFloat()) {
SDValue MemOps[SystemZ::NumArgFPRs];
for (unsigned I = NumFixedFPRs; I < SystemZ::NumArgFPRs; ++I) {
- unsigned Offset = TFL->getRegSpillOffset(SystemZ::ArgFPRs[I]);
- int FI = MFI.CreateFixedObject(8, RegSaveOffset + Offset, true);
+ unsigned Offset = TFL->getRegSpillOffset(MF, SystemZ::ArgFPRs[I]);
+ int FI =
+ MFI.CreateFixedObject(8, -SystemZMC::CallFrameSize + Offset, true);
SDValue FIN = DAG.getFrameIndex(FI, getPointerTy(DAG.getDataLayout()));
unsigned VReg = MF.addLiveIn(SystemZ::ArgFPRs[I],
&SystemZ::FP64BitRegClass);
@@ -1633,6 +1675,7 @@ SystemZTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (IsTailCall)
return DAG.getNode(SystemZISD::SIBCALL, DL, NodeTys, Ops);
Chain = DAG.getNode(SystemZISD::CALL, DL, NodeTys, Ops);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
Glue = Chain.getValue(1);
// Mark the end of the call, which is glued to the call itself.
@@ -2020,8 +2063,9 @@ static void adjustSubwordCmp(SelectionDAG &DAG, const SDLoc &DL,
// We must have an 8- or 16-bit load.
auto *Load = cast<LoadSDNode>(C.Op0);
- unsigned NumBits = Load->getMemoryVT().getStoreSizeInBits();
- if (NumBits != 8 && NumBits != 16)
+ unsigned NumBits = Load->getMemoryVT().getSizeInBits();
+ if ((NumBits != 8 && NumBits != 16) ||
+ NumBits != Load->getMemoryVT().getStoreSizeInBits())
return;
// The load must be an extending one and the constant must be within the
@@ -2161,15 +2205,6 @@ static bool shouldSwapCmpOperands(const Comparison &C) {
return false;
}
-// Return a version of comparison CC mask CCMask in which the LT and GT
-// actions are swapped.
-static unsigned reverseCCMask(unsigned CCMask) {
- return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
- (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
- (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
- (CCMask & SystemZ::CCMASK_CMP_UO));
-}
-
// Check whether C tests for equality between X and Y and whether X - Y
// or Y - X is also computed. In that case it's better to compare the
// result of the subtraction against zero.
@@ -2205,7 +2240,7 @@ static void adjustForFNeg(Comparison &C) {
SDNode *N = *I;
if (N->getOpcode() == ISD::FNEG) {
C.Op0 = SDValue(N, 0);
- C.CCMask = reverseCCMask(C.CCMask);
+ C.CCMask = SystemZ::reverseCCMask(C.CCMask);
return;
}
}
@@ -2572,7 +2607,7 @@ static Comparison getCmp(SelectionDAG &DAG, SDValue CmpOp0, SDValue CmpOp1,
if (shouldSwapCmpOperands(C)) {
std::swap(C.Op0, C.Op1);
- C.CCMask = reverseCCMask(C.CCMask);
+ C.CCMask = SystemZ::reverseCCMask(C.CCMask);
}
adjustForTestUnderMask(DAG, DL, C);
@@ -3103,7 +3138,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SystemZConstantPoolValue *CPV =
SystemZConstantPoolValue::Create(GV, SystemZCP::TLSGD);
- Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+ Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
@@ -3118,7 +3153,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SystemZConstantPoolValue *CPV =
SystemZConstantPoolValue::Create(GV, SystemZCP::TLSLDM);
- Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+ Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
@@ -3136,7 +3171,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
// Add the per-symbol offset.
CPV = SystemZConstantPoolValue::Create(GV, SystemZCP::DTPOFF);
- SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, 8);
+ SDValue DTPOffset = DAG.getConstantPool(CPV, PtrVT, Align(8));
DTPOffset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), DTPOffset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
@@ -3161,7 +3196,7 @@ SDValue SystemZTargetLowering::lowerGlobalTLSAddress(GlobalAddressSDNode *Node,
SystemZConstantPoolValue *CPV =
SystemZConstantPoolValue::Create(GV, SystemZCP::NTPOFF);
- Offset = DAG.getConstantPool(CPV, PtrVT, 8);
+ Offset = DAG.getConstantPool(CPV, PtrVT, Align(8));
Offset = DAG.getLoad(
PtrVT, DL, DAG.getEntryNode(), Offset,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()));
@@ -3202,11 +3237,11 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
SDValue Result;
if (CP->isMachineConstantPoolEntry())
- Result = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment());
+ Result =
+ DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT, CP->getAlign());
else
- Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment(), CP->getOffset());
+ Result = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign(),
+ CP->getOffset());
// Use LARL to load the address of the constant pool entry.
return DAG.getNode(SystemZISD::PCREL_WRAPPER, DL, PtrVT, Result);
@@ -3214,6 +3249,8 @@ SDValue SystemZTargetLowering::lowerConstantPool(ConstantPoolSDNode *CP,
SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
SelectionDAG &DAG) const {
+ auto *TFL =
+ static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
MachineFunction &MF = DAG.getMachineFunction();
MachineFrameInfo &MFI = MF.getFrameInfo();
MFI.setFrameAddressIsTaken(true);
@@ -3222,9 +3259,12 @@ SDValue SystemZTargetLowering::lowerFRAMEADDR(SDValue Op,
unsigned Depth = cast<ConstantSDNode>(Op.getOperand(0))->getZExtValue();
EVT PtrVT = getPointerTy(DAG.getDataLayout());
+ // Return null if the back chain is not present.
+ bool HasBackChain = MF.getFunction().hasFnAttribute("backchain");
+ if (TFL->usePackedStack(MF) && !HasBackChain)
+ return DAG.getConstant(0, DL, PtrVT);
+
// By definition, the frame address is the address of the back chain.
- auto *TFL =
- static_cast<const SystemZFrameLowering *>(Subtarget.getFrameLowering());
int BackChainIdx = TFL->getOrCreateFramePointerSaveIndex(MF);
SDValue BackChain = DAG.getFrameIndex(BackChainIdx, PtrVT);
@@ -3355,9 +3395,9 @@ SDValue SystemZTargetLowering::lowerVACOPY(SDValue Op,
SDLoc DL(Op);
return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(32, DL),
- /*Align*/8, /*isVolatile*/false, /*AlwaysInline*/false,
- /*isTailCall*/false,
- MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
+ Align(8), /*isVolatile*/ false, /*AlwaysInline*/ false,
+ /*isTailCall*/ false, MachinePointerInfo(DstSV),
+ MachinePointerInfo(SrcSV));
}
SDValue SystemZTargetLowering::
@@ -3398,10 +3438,17 @@ lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(ExtraAlignSpace, DL, MVT::i64));
// Get the new stack pointer value.
- SDValue NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
-
- // Copy the new stack pointer back.
- Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+ SDValue NewSP;
+ if (hasInlineStackProbe(MF)) {
+ NewSP = DAG.getNode(SystemZISD::PROBED_ALLOCA, DL,
+ DAG.getVTList(MVT::i64, MVT::Other), Chain, OldSP, NeededSpace);
+ Chain = NewSP.getValue(1);
+ }
+ else {
+ NewSP = DAG.getNode(ISD::SUB, DL, MVT::i64, OldSP, NeededSpace);
+ // Copy the new stack pointer back.
+ Chain = DAG.getCopyToReg(Chain, DL, SPReg, NewSP);
+ }
// The allocated data lives above the 160 bytes allocated for the standard
// frame, plus any outgoing stack arguments. We don't know how much that
@@ -3995,7 +4042,7 @@ SDValue SystemZTargetLowering::lowerATOMIC_CMP_SWAP(SDValue Op,
}
MachineMemOperand::Flags
-SystemZTargetLowering::getMMOFlags(const Instruction &I) const {
+SystemZTargetLowering::getTargetMMOFlags(const Instruction &I) const {
// Because of how we convert atomic_load and atomic_store to normal loads and
// stores in the DAG, we need to ensure that the MMOs are marked volatile
// since DAGCombine hasn't been updated to account for atomic, but non
@@ -4362,7 +4409,7 @@ static bool getShuffleInput(const SmallVectorImpl<int> &Bytes, unsigned Start,
}
// Bytes is a VPERM-like permute vector, except that -1 is used for
-// undefined bytes. Return true if it can be performed using VSLDI.
+// undefined bytes. Return true if it can be performed using VSLDB.
// When returning true, set StartIndex to the shift amount and OpNo0
// and OpNo1 to the VPERM operands that should be used as the first
// and second shift operand respectively.
@@ -4420,23 +4467,86 @@ static SDValue getPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
return Op;
}
+static bool isZeroVector(SDValue N) {
+ if (N->getOpcode() == ISD::BITCAST)
+ N = N->getOperand(0);
+ if (N->getOpcode() == ISD::SPLAT_VECTOR)
+ if (auto *Op = dyn_cast<ConstantSDNode>(N->getOperand(0)))
+ return Op->getZExtValue() == 0;
+ return ISD::isBuildVectorAllZeros(N.getNode());
+}
+
+// Return the index of the zero/undef vector, or UINT32_MAX if not found.
+static uint32_t findZeroVectorIdx(SDValue *Ops, unsigned Num) {
+ for (unsigned I = 0; I < Num ; I++)
+ if (isZeroVector(Ops[I]))
+ return I;
+ return UINT32_MAX;
+}
+
// Bytes is a VPERM-like permute vector, except that -1 is used for
// undefined bytes. Implement it on operands Ops[0] and Ops[1] using
-// VSLDI or VPERM.
+// VSLDB or VPERM.
static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
SDValue *Ops,
const SmallVectorImpl<int> &Bytes) {
for (unsigned I = 0; I < 2; ++I)
Ops[I] = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Ops[I]);
- // First see whether VSLDI can be used.
+ // First see whether VSLDB can be used.
unsigned StartIndex, OpNo0, OpNo1;
if (isShlDoublePermute(Bytes, StartIndex, OpNo0, OpNo1))
return DAG.getNode(SystemZISD::SHL_DOUBLE, DL, MVT::v16i8, Ops[OpNo0],
Ops[OpNo1],
DAG.getTargetConstant(StartIndex, DL, MVT::i32));
- // Fall back on VPERM. Construct an SDNode for the permute vector.
+ // Fall back on VPERM. Construct an SDNode for the permute vector. Try to
+ // eliminate a zero vector by reusing any zero index in the permute vector.
+ unsigned ZeroVecIdx = findZeroVectorIdx(&Ops[0], 2);
+ if (ZeroVecIdx != UINT32_MAX) {
+ bool MaskFirst = true;
+ int ZeroIdx = -1;
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+ unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+ unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+ if (OpNo == ZeroVecIdx && I == 0) {
+ // If the first byte is zero, use mask as first operand.
+ ZeroIdx = 0;
+ break;
+ }
+ if (OpNo != ZeroVecIdx && Byte == 0) {
+ // If mask contains a zero, use it by placing that vector first.
+ ZeroIdx = I + SystemZ::VectorBytes;
+ MaskFirst = false;
+ break;
+ }
+ }
+ if (ZeroIdx != -1) {
+ SDValue IndexNodes[SystemZ::VectorBytes];
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I) {
+ if (Bytes[I] >= 0) {
+ unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+ unsigned Byte = unsigned(Bytes[I]) % SystemZ::VectorBytes;
+ if (OpNo == ZeroVecIdx)
+ IndexNodes[I] = DAG.getConstant(ZeroIdx, DL, MVT::i32);
+ else {
+ unsigned BIdx = MaskFirst ? Byte + SystemZ::VectorBytes : Byte;
+ IndexNodes[I] = DAG.getConstant(BIdx, DL, MVT::i32);
+ }
+ } else
+ IndexNodes[I] = DAG.getUNDEF(MVT::i32);
+ }
+ SDValue Mask = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
+ SDValue Src = ZeroVecIdx == 0 ? Ops[1] : Ops[0];
+ if (MaskFirst)
+ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Mask, Src,
+ Mask);
+ else
+ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Src, Mask,
+ Mask);
+ }
+ }
+
SDValue IndexNodes[SystemZ::VectorBytes];
for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
if (Bytes[I] >= 0)
@@ -4444,16 +4554,20 @@ static SDValue getGeneralPermuteNode(SelectionDAG &DAG, const SDLoc &DL,
else
IndexNodes[I] = DAG.getUNDEF(MVT::i32);
SDValue Op2 = DAG.getBuildVector(MVT::v16i8, DL, IndexNodes);
- return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0], Ops[1], Op2);
+ return DAG.getNode(SystemZISD::PERMUTE, DL, MVT::v16i8, Ops[0],
+ (!Ops[1].isUndef() ? Ops[1] : Ops[0]), Op2);
}
namespace {
// Describes a general N-operand vector shuffle.
struct GeneralShuffle {
- GeneralShuffle(EVT vt) : VT(vt) {}
+ GeneralShuffle(EVT vt) : VT(vt), UnpackFromEltSize(UINT_MAX) {}
void addUndef();
bool add(SDValue, unsigned);
SDValue getNode(SelectionDAG &, const SDLoc &);
+ void tryPrepareForUnpack();
+ bool unpackWasPrepared() { return UnpackFromEltSize <= 4; }
+ SDValue insertUnpackIfPrepared(SelectionDAG &DAG, const SDLoc &DL, SDValue Op);
// The operands of the shuffle.
SmallVector<SDValue, SystemZ::VectorBytes> Ops;
@@ -4465,6 +4579,9 @@ struct GeneralShuffle {
// The type of the shuffle result.
EVT VT;
+
+ // Holds a value of 1, 2 or 4 if a final unpack has been prepared for.
+ unsigned UnpackFromEltSize;
};
}
@@ -4547,6 +4664,9 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
if (Ops.size() == 0)
return DAG.getUNDEF(VT);
+ // Use a single unpack if possible as the last operation.
+ tryPrepareForUnpack();
+
// Make sure that there are at least two shuffle operands.
if (Ops.size() == 1)
Ops.push_back(DAG.getUNDEF(MVT::v16i8));
@@ -4612,13 +4732,117 @@ SDValue GeneralShuffle::getNode(SelectionDAG &DAG, const SDLoc &DL) {
// to VPERM.
unsigned OpNo0, OpNo1;
SDValue Op;
- if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
+ if (unpackWasPrepared() && Ops[1].isUndef())
+ Op = Ops[0];
+ else if (const Permute *P = matchPermute(Bytes, OpNo0, OpNo1))
Op = getPermuteNode(DAG, DL, *P, Ops[OpNo0], Ops[OpNo1]);
else
Op = getGeneralPermuteNode(DAG, DL, &Ops[0], Bytes);
+
+ Op = insertUnpackIfPrepared(DAG, DL, Op);
+
return DAG.getNode(ISD::BITCAST, DL, VT, Op);
}
+#ifndef NDEBUG
+static void dumpBytes(const SmallVectorImpl<int> &Bytes, std::string Msg) {
+ dbgs() << Msg.c_str() << " { ";
+ for (unsigned i = 0; i < Bytes.size(); i++)
+ dbgs() << Bytes[i] << " ";
+ dbgs() << "}\n";
+}
+#endif
+
+// If the Bytes vector matches an unpack operation, prepare to do the unpack
+// after all else by removing the zero vector and the effect of the unpack on
+// Bytes.
+void GeneralShuffle::tryPrepareForUnpack() {
+ uint32_t ZeroVecOpNo = findZeroVectorIdx(&Ops[0], Ops.size());
+ if (ZeroVecOpNo == UINT32_MAX || Ops.size() == 1)
+ return;
+
+ // Only do this if removing the zero vector reduces the depth, otherwise
+ // the critical path will increase with the final unpack.
+ if (Ops.size() > 2 &&
+ Log2_32_Ceil(Ops.size()) == Log2_32_Ceil(Ops.size() - 1))
+ return;
+
+ // Find an unpack that would allow removing the zero vector from Ops.
+ UnpackFromEltSize = 1;
+ for (; UnpackFromEltSize <= 4; UnpackFromEltSize *= 2) {
+ bool MatchUnpack = true;
+ SmallVector<int, SystemZ::VectorBytes> SrcBytes;
+ for (unsigned Elt = 0; Elt < SystemZ::VectorBytes; Elt++) {
+ unsigned ToEltSize = UnpackFromEltSize * 2;
+ bool IsZextByte = (Elt % ToEltSize) < UnpackFromEltSize;
+ if (!IsZextByte)
+ SrcBytes.push_back(Bytes[Elt]);
+ if (Bytes[Elt] != -1) {
+ unsigned OpNo = unsigned(Bytes[Elt]) / SystemZ::VectorBytes;
+ if (IsZextByte != (OpNo == ZeroVecOpNo)) {
+ MatchUnpack = false;
+ break;
+ }
+ }
+ }
+ if (MatchUnpack) {
+ if (Ops.size() == 2) {
+ // Don't use unpack if a single source operand needs rearrangement.
+ for (unsigned i = 0; i < SystemZ::VectorBytes / 2; i++)
+ if (SrcBytes[i] != -1 && SrcBytes[i] % 16 != int(i)) {
+ UnpackFromEltSize = UINT_MAX;
+ return;
+ }
+ }
+ break;
+ }
+ }
+ if (UnpackFromEltSize > 4)
+ return;
+
+ LLVM_DEBUG(dbgs() << "Preparing for final unpack of element size "
+ << UnpackFromEltSize << ". Zero vector is Op#" << ZeroVecOpNo
+ << ".\n";
+ dumpBytes(Bytes, "Original Bytes vector:"););
+
+ // Apply the unpack in reverse to the Bytes array.
+ unsigned B = 0;
+ for (unsigned Elt = 0; Elt < SystemZ::VectorBytes;) {
+ Elt += UnpackFromEltSize;
+ for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
+ Bytes[B] = Bytes[Elt];
+ }
+ while (B < SystemZ::VectorBytes)
+ Bytes[B++] = -1;
+
+ // Remove the zero vector from Ops
+ Ops.erase(&Ops[ZeroVecOpNo]);
+ for (unsigned I = 0; I < SystemZ::VectorBytes; ++I)
+ if (Bytes[I] >= 0) {
+ unsigned OpNo = unsigned(Bytes[I]) / SystemZ::VectorBytes;
+ if (OpNo > ZeroVecOpNo)
+ Bytes[I] -= SystemZ::VectorBytes;
+ }
+
+ LLVM_DEBUG(dumpBytes(Bytes, "Resulting Bytes vector, zero vector removed:");
+ dbgs() << "\n";);
+}
+
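
To see what the "apply the unpack in reverse" step does, here is a standalone sketch (illustration only) that runs the same rewrite on a byte pattern describing a byte-to-halfword zero extension, where Op#1 is the zero vector:

  #include <cstdio>
  #include <vector>

  int main() {
    const unsigned VectorBytes = 16;       // SystemZ::VectorBytes
    const unsigned UnpackFromEltSize = 1;  // byte -> halfword unpack
    // Bytes for a big-endian zero extension: each halfword is a zero byte
    // from Op#1 (indices 16..31) followed by a data byte from Op#0.
    std::vector<int> Bytes = {16, 0, 16, 1, 16, 2, 16, 3,
                              16, 4, 16, 5, 16, 6, 16, 7};
    // Same loop as in tryPrepareForUnpack(): drop the zero bytes so that
    // only the source bytes remain, packed at the front.
    unsigned B = 0;
    for (unsigned Elt = 0; Elt < VectorBytes;) {
      Elt += UnpackFromEltSize;
      for (unsigned i = 0; i < UnpackFromEltSize; i++, Elt++, B++)
        Bytes[B] = Bytes[Elt];
    }
    while (B < VectorBytes)
      Bytes[B++] = -1;
    // Prints: 0 1 2 3 4 5 6 7 -1 ... ; the zero vector is gone, and the
    // final UNPACKL_HIGH (v16i8 -> v8i16) reinstates the interleaved zeros.
    for (int Byte : Bytes)
      std::printf("%d ", Byte);
    std::printf("\n");
    return 0;
  }
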
+SDValue GeneralShuffle::insertUnpackIfPrepared(SelectionDAG &DAG,
+ const SDLoc &DL,
+ SDValue Op) {
+ if (!unpackWasPrepared())
+ return Op;
+ unsigned InBits = UnpackFromEltSize * 8;
+ EVT InVT = MVT::getVectorVT(MVT::getIntegerVT(InBits),
+ SystemZ::VectorBits / InBits);
+ SDValue PackedOp = DAG.getNode(ISD::BITCAST, DL, InVT, Op);
+ unsigned OutBits = InBits * 2;
+ EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(OutBits),
+ SystemZ::VectorBits / OutBits);
+ return DAG.getNode(SystemZISD::UNPACKL_HIGH, DL, OutVT, PackedOp);
+}
+
// Return true if the given BUILD_VECTOR is a scalar-to-vector conversion.
static bool isScalarToVector(SDValue Op) {
for (unsigned I = 1, E = Op.getNumOperands(); I != E; ++I)
@@ -5013,9 +5237,8 @@ SystemZTargetLowering::lowerEXTRACT_VECTOR_ELT(SDValue Op,
return DAG.getNode(ISD::BITCAST, DL, VT, Res);
}
-SDValue
-SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
- unsigned UnpackHigh) const {
+SDValue SystemZTargetLowering::
+lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
SDValue PackedOp = Op.getOperand(0);
EVT OutVT = Op.getValueType();
EVT InVT = PackedOp.getValueType();
@@ -5025,11 +5248,39 @@ SystemZTargetLowering::lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
FromBits *= 2;
EVT OutVT = MVT::getVectorVT(MVT::getIntegerVT(FromBits),
SystemZ::VectorBits / FromBits);
- PackedOp = DAG.getNode(UnpackHigh, SDLoc(PackedOp), OutVT, PackedOp);
+ PackedOp =
+ DAG.getNode(SystemZISD::UNPACK_HIGH, SDLoc(PackedOp), OutVT, PackedOp);
} while (FromBits != ToBits);
return PackedOp;
}
+// Lower a ZERO_EXTEND_VECTOR_INREG to a vector shuffle with a zero vector.
+SDValue SystemZTargetLowering::
+lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const {
+ SDValue PackedOp = Op.getOperand(0);
+ SDLoc DL(Op);
+ EVT OutVT = Op.getValueType();
+ EVT InVT = PackedOp.getValueType();
+ unsigned InNumElts = InVT.getVectorNumElements();
+ unsigned OutNumElts = OutVT.getVectorNumElements();
+ unsigned NumInPerOut = InNumElts / OutNumElts;
+
+ SDValue ZeroVec =
+ DAG.getSplatVector(InVT, DL, DAG.getConstant(0, DL, InVT.getScalarType()));
+
+ SmallVector<int, 16> Mask(InNumElts);
+ unsigned ZeroVecElt = InNumElts;
+ for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
+ unsigned MaskElt = PackedElt * NumInPerOut;
+ unsigned End = MaskElt + NumInPerOut - 1;
+ for (; MaskElt < End; MaskElt++)
+ Mask[MaskElt] = ZeroVecElt++;
+ Mask[MaskElt] = PackedElt;
+ }
+ SDValue Shuf = DAG.getVectorShuffle(InVT, DL, PackedOp, ZeroVec, Mask);
+ return DAG.getNode(ISD::BITCAST, DL, OutVT, Shuf);
+}
+
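
For illustration, the mask built above for a v16i8-to-v4i32 zero extension (InNumElts = 16, OutNumElts = 4, so NumInPerOut = 4) can be reproduced with this standalone sketch:

  #include <cstdio>

  int main() {
    const unsigned InNumElts = 16, OutNumElts = 4;
    const unsigned NumInPerOut = InNumElts / OutNumElts; // 4
    int Mask[16];
    // Same construction as lowerZERO_EXTEND_VECTOR_INREG(): each output
    // element takes NumInPerOut - 1 bytes from the zero vector (elements
    // InNumElts..2*InNumElts-1) followed by the packed element itself.
    unsigned ZeroVecElt = InNumElts;
    for (unsigned PackedElt = 0; PackedElt < OutNumElts; PackedElt++) {
      unsigned MaskElt = PackedElt * NumInPerOut;
      unsigned End = MaskElt + NumInPerOut - 1;
      for (; MaskElt < End; MaskElt++)
        Mask[MaskElt] = ZeroVecElt++;
      Mask[MaskElt] = PackedElt;
    }
    // Prints: 16 17 18 0 19 20 21 1 22 23 24 2 25 26 27 3
    for (unsigned I = 0; I < InNumElts; I++)
      std::printf("%d ", Mask[I]);
    std::printf("\n");
    return 0;
  }
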
SDValue SystemZTargetLowering::lowerShift(SDValue Op, SelectionDAG &DAG,
unsigned ByScalar) const {
// Look for cases where a vector shift can use the *_BY_SCALAR form.
@@ -5195,9 +5446,9 @@ SDValue SystemZTargetLowering::LowerOperation(SDValue Op,
case ISD::EXTRACT_VECTOR_ELT:
return lowerEXTRACT_VECTOR_ELT(Op, DAG);
case ISD::SIGN_EXTEND_VECTOR_INREG:
- return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACK_HIGH);
+ return lowerSIGN_EXTEND_VECTOR_INREG(Op, DAG);
case ISD::ZERO_EXTEND_VECTOR_INREG:
- return lowerExtendVectorInreg(Op, DAG, SystemZISD::UNPACKL_HIGH);
+ return lowerZERO_EXTEND_VECTOR_INREG(Op, DAG);
case ISD::SHL:
return lowerShift(Op, DAG, SystemZISD::VSHL_BY_SCALAR);
case ISD::SRL:
@@ -5315,6 +5566,7 @@ const char *SystemZTargetLowering::getTargetNodeName(unsigned Opcode) const {
OPCODE(BR_CCMASK);
OPCODE(SELECT_CCMASK);
OPCODE(ADJDYNALLOC);
+ OPCODE(PROBED_ALLOCA);
OPCODE(POPCNT);
OPCODE(SMUL_LOHI);
OPCODE(UMUL_LOHI);
@@ -6056,6 +6308,32 @@ SDValue SystemZTargetLowering::combineFP_EXTEND(
return SDValue();
}
+SDValue SystemZTargetLowering::combineINT_TO_FP(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ if (DCI.Level != BeforeLegalizeTypes)
+ return SDValue();
+ unsigned Opcode = N->getOpcode();
+ EVT OutVT = N->getValueType(0);
+ SelectionDAG &DAG = DCI.DAG;
+ SDValue Op = N->getOperand(0);
+ unsigned OutScalarBits = OutVT.getScalarSizeInBits();
+ unsigned InScalarBits = Op->getValueType(0).getScalarSizeInBits();
+
+ // Insert an extension before type-legalization to avoid scalarization, e.g.:
+ // v2f64 = uint_to_fp v2i16
+ // =>
+ // v2f64 = uint_to_fp (v2i64 zero_extend v2i16)
+ if (OutVT.isVector() && OutScalarBits > InScalarBits) {
+ MVT ExtVT = MVT::getVectorVT(MVT::getIntegerVT(OutVT.getScalarSizeInBits()),
+ OutVT.getVectorNumElements());
+ unsigned ExtOpcode =
+ (Opcode == ISD::UINT_TO_FP ? ISD::ZERO_EXTEND : ISD::SIGN_EXTEND);
+ SDValue ExtOp = DAG.getNode(ExtOpcode, SDLoc(N), ExtVT, Op);
+ return DAG.getNode(Opcode, SDLoc(N), OutVT, ExtOp);
+ }
+ return SDValue();
+}
+
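
As an illustration of the kind of source that reaches this combine, a hedged sketch using clang's vector extensions (assumes a SystemZ target with vector support, e.g. -march=z13; the names and types here are made up for the example):

  // Without the combine above, the v2i16 -> v2f64 conversion would be
  // scalarized during type legalization.
  typedef unsigned short v2i16 __attribute__((vector_size(4)));
  typedef double v2f64 __attribute__((vector_size(16)));

  v2f64 widen(v2i16 X) {
    // Becomes uint_to_fp (v2i64 zero_extend v2i16) after the combine.
    return __builtin_convertvector(X, v2f64);
  }

  int main() {
    v2i16 X = {1, 2};
    v2f64 Y = widen(X);
    return (int)Y[0] - 1; // 0
  }
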
SDValue SystemZTargetLowering::combineBSWAP(
SDNode *N, DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -6243,15 +6521,7 @@ static bool combineCCMask(SDValue &CCReg, int &CCValid, int &CCMask) {
return false;
// Compute the effective CC mask for the new branch or select.
- switch (CCMask) {
- case SystemZ::CCMASK_CMP_EQ: break;
- case SystemZ::CCMASK_CMP_NE: break;
- case SystemZ::CCMASK_CMP_LT: CCMask = SystemZ::CCMASK_CMP_GT; break;
- case SystemZ::CCMASK_CMP_GT: CCMask = SystemZ::CCMASK_CMP_LT; break;
- case SystemZ::CCMASK_CMP_LE: CCMask = SystemZ::CCMASK_CMP_GE; break;
- case SystemZ::CCMASK_CMP_GE: CCMask = SystemZ::CCMASK_CMP_LE; break;
- default: return false;
- }
+ CCMask = SystemZ::reverseCCMask(CCMask);
// Return the updated CCReg link.
CCReg = IPM->getOperand(0);
@@ -6367,6 +6637,34 @@ SDValue SystemZTargetLowering::combineIntDIVREM(
return SDValue();
}
+SDValue SystemZTargetLowering::combineINTRINSIC(
+ SDNode *N, DAGCombinerInfo &DCI) const {
+ SelectionDAG &DAG = DCI.DAG;
+
+ unsigned Id = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
+ switch (Id) {
+ // VECTOR LOAD (RIGHTMOST) WITH LENGTH with a length operand of 15
+ // or larger is simply a vector load.
+ case Intrinsic::s390_vll:
+ case Intrinsic::s390_vlrl:
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(2)))
+ if (C->getZExtValue() >= 15)
+ return DAG.getLoad(N->getValueType(0), SDLoc(N), N->getOperand(0),
+ N->getOperand(3), MachinePointerInfo());
+ break;
+ // Likewise for VECTOR STORE (RIGHTMOST) WITH LENGTH.
+ case Intrinsic::s390_vstl:
+ case Intrinsic::s390_vstrl:
+ if (auto *C = dyn_cast<ConstantSDNode>(N->getOperand(3)))
+ if (C->getZExtValue() >= 15)
+ return DAG.getStore(N->getOperand(0), SDLoc(N), N->getOperand(2),
+ N->getOperand(4), MachinePointerInfo());
+ break;
+ }
+
+ return SDValue();
+}
+
SDValue SystemZTargetLowering::unwrapAddress(SDValue N) const {
if (N->getOpcode() == SystemZISD::PCREL_WRAPPER)
return N->getOperand(0);
@@ -6391,6 +6689,8 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::FP_ROUND: return combineFP_ROUND(N, DCI);
case ISD::STRICT_FP_EXTEND:
case ISD::FP_EXTEND: return combineFP_EXTEND(N, DCI);
+ case ISD::SINT_TO_FP:
+ case ISD::UINT_TO_FP: return combineINT_TO_FP(N, DCI);
case ISD::BSWAP: return combineBSWAP(N, DCI);
case SystemZISD::BR_CCMASK: return combineBR_CCMASK(N, DCI);
case SystemZISD::SELECT_CCMASK: return combineSELECT_CCMASK(N, DCI);
@@ -6399,6 +6699,8 @@ SDValue SystemZTargetLowering::PerformDAGCombine(SDNode *N,
case ISD::UDIV:
case ISD::SREM:
case ISD::UREM: return combineIntDIVREM(N, DCI);
+ case ISD::INTRINSIC_W_CHAIN:
+ case ISD::INTRINSIC_VOID: return combineINTRINSIC(N, DCI);
}
return SDValue();
@@ -6580,7 +6882,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt SrcDemE = getDemandedSrcElements(Op, DemandedElts, 0);
Known = DAG.computeKnownBits(SrcOp, SrcDemE, Depth + 1);
if (IsLogical) {
- Known = Known.zext(BitWidth, true);
+ Known = Known.zext(BitWidth);
} else
Known = Known.sext(BitWidth);
break;
@@ -6609,7 +6911,7 @@ SystemZTargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
// Known has the width of the source operand(s). Adjust if needed to match
// the passed bitwidth.
if (Known.getBitWidth() != BitWidth)
- Known = Known.zextOrTrunc(BitWidth, false);
+ Known = Known.anyextOrTrunc(BitWidth);
}
static unsigned computeNumSignBitsBinOp(SDValue Op, const APInt &DemandedElts,
@@ -6690,38 +6992,29 @@ SystemZTargetLowering::ComputeNumSignBitsForTargetNode(
return 1;
}
+unsigned
+SystemZTargetLowering::getStackProbeSize(MachineFunction &MF) const {
+ const TargetFrameLowering *TFI = Subtarget.getFrameLowering();
+ unsigned StackAlign = TFI->getStackAlignment();
+ assert(StackAlign >=1 && isPowerOf2_32(StackAlign) &&
+ "Unexpected stack alignment");
+ // The default stack probe size is 4096 if the function has no
+ // stack-probe-size attribute.
+ unsigned StackProbeSize = 4096;
+ const Function &Fn = MF.getFunction();
+ if (Fn.hasFnAttribute("stack-probe-size"))
+ Fn.getFnAttribute("stack-probe-size")
+ .getValueAsString()
+ .getAsInteger(0, StackProbeSize);
+ // Round down to the stack alignment.
+ StackProbeSize &= ~(StackAlign - 1);
+ return StackProbeSize ? StackProbeSize : StackAlign;
+}
+
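In other words, getStackProbeSize starts from the 4096-byte default, lets a per-function "stack-probe-size" attribute override it, and then rounds down to a multiple of the (power-of-two) stack alignment, falling back to the alignment itself if the rounding yields zero. A standalone sketch of the rounding step; the function name and sample values are only illustrative:

#include <cassert>

// Round ProbeSize down to a multiple of the power-of-two StackAlign,
// never returning zero.
unsigned roundProbeSize(unsigned ProbeSize, unsigned StackAlign) {
  ProbeSize &= ~(StackAlign - 1);
  return ProbeSize ? ProbeSize : StackAlign;
}

int main() {
  assert(roundProbeSize(4096, 8) == 4096); // default is already aligned
  assert(roundProbeSize(1001, 8) == 1000); // rounded down to a multiple of 8
  assert(roundProbeSize(3, 8) == 8);       // too small: fall back to alignment
  return 0;
}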
//===----------------------------------------------------------------------===//
// Custom insertion
//===----------------------------------------------------------------------===//
-// Create a new basic block after MBB.
-static MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB) {
- MachineFunction &MF = *MBB->getParent();
- MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
- MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
- return NewMBB;
-}
-
-// Split MBB after MI and return the new block (the one that contains
-// instructions after MI).
-static MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
- MachineBasicBlock *MBB) {
- MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
- NewMBB->splice(NewMBB->begin(), MBB,
- std::next(MachineBasicBlock::iterator(MI)), MBB->end());
- NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
- return NewMBB;
-}
-
-// Split MBB before MI and return the new block (the one that contains MI).
-static MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
- MachineBasicBlock *MBB) {
- MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
- NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
- NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
- return NewMBB;
-}
-
// Force base value Base into a register before MI. Return the register.
static Register forceReg(MachineInstr &MI, MachineOperand &Base,
const SystemZInstrInfo *TII) {
@@ -6859,8 +7152,6 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
for (MachineBasicBlock::iterator NextMIIt =
std::next(MachineBasicBlock::iterator(MI));
NextMIIt != MBB->end(); ++NextMIIt) {
- if (NextMIIt->definesRegister(SystemZ::CC))
- break;
if (isSelectPseudo(*NextMIIt)) {
assert(NextMIIt->getOperand(3).getImm() == CCValid &&
"Bad CCValid operands since CC was not redefined.");
@@ -6871,6 +7162,9 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
}
break;
}
+ if (NextMIIt->definesRegister(SystemZ::CC) ||
+ NextMIIt->usesCustomInsertionHook())
+ break;
bool User = false;
for (auto SelMI : Selects)
if (NextMIIt->readsVirtualRegister(SelMI->getOperand(0).getReg())) {
@@ -6891,8 +7185,8 @@ SystemZTargetLowering::emitSelect(MachineInstr &MI,
bool CCKilled =
(LastMI->killsRegister(SystemZ::CC) || checkCCKill(*LastMI, MBB));
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockAfter(LastMI, MBB);
- MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *JoinMBB = SystemZ::splitBlockAfter(LastMI, MBB);
+ MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB);
// Unless CC was killed in the last Select instruction, mark it as
// live-in to both FalseMBB and JoinMBB.
@@ -6985,8 +7279,8 @@ MachineBasicBlock *SystemZTargetLowering::emitCondStore(MachineInstr &MI,
CCMask ^= CCValid;
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *JoinMBB = splitBlockBefore(MI, MBB);
- MachineBasicBlock *FalseMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *JoinMBB = SystemZ::splitBlockBefore(MI, MBB);
+ MachineBasicBlock *FalseMBB = SystemZ::emitBlockAfter(StartMBB);
// Unless CC was killed in the CondStore instruction, mark it as
// live-in to both FalseMBB and JoinMBB.
@@ -7069,8 +7363,8 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadBinary(
// Insert a basic block for the main loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
- MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
// StartMBB:
// ...
@@ -7187,10 +7481,10 @@ MachineBasicBlock *SystemZTargetLowering::emitAtomicLoadMinMax(
// Insert 3 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
- MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
- MachineBasicBlock *UseAltMBB = emitBlockAfter(LoopMBB);
- MachineBasicBlock *UpdateMBB = emitBlockAfter(UseAltMBB);
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+ MachineBasicBlock *UseAltMBB = SystemZ::emitBlockAfter(LoopMBB);
+ MachineBasicBlock *UpdateMBB = SystemZ::emitBlockAfter(UseAltMBB);
// StartMBB:
// ...
@@ -7298,9 +7592,9 @@ SystemZTargetLowering::emitAtomicCmpSwapW(MachineInstr &MI,
// Insert 2 basic blocks for the loop.
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
- MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
- MachineBasicBlock *SetMBB = emitBlockAfter(LoopMBB);
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+ MachineBasicBlock *SetMBB = SystemZ::emitBlockAfter(LoopMBB);
// StartMBB:
// ...
@@ -7460,7 +7754,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
// When generating more than one CLC, all but the last will need to
// branch to the end when a difference is found.
MachineBasicBlock *EndMBB = (Length > 256 && Opcode == SystemZ::CLC ?
- splitBlockAfter(MI, MBB) : nullptr);
+ SystemZ::splitBlockAfter(MI, MBB) : nullptr);
// Check for the loop form, in which operand 5 is the trip count.
if (MI.getNumExplicitOperands() > 5) {
@@ -7484,9 +7778,10 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
Register NextCountReg = MRI.createVirtualRegister(RC);
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
- MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
- MachineBasicBlock *NextMBB = (EndMBB ? emitBlockAfter(LoopMBB) : LoopMBB);
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
+ MachineBasicBlock *NextMBB =
+ (EndMBB ? SystemZ::emitBlockAfter(LoopMBB) : LoopMBB);
// StartMBB:
// # fall through to LoopMBB
@@ -7602,7 +7897,7 @@ MachineBasicBlock *SystemZTargetLowering::emitMemMemWrapper(
// If there's another CLC to go, branch to the end if a difference
// was found.
if (EndMBB && Length > 0) {
- MachineBasicBlock *NextMBB = splitBlockBefore(MI, MBB);
+ MachineBasicBlock *NextMBB = SystemZ::splitBlockBefore(MI, MBB);
BuildMI(MBB, DL, TII->get(SystemZ::BRC))
.addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_NE)
.addMBB(EndMBB);
@@ -7642,8 +7937,8 @@ MachineBasicBlock *SystemZTargetLowering::emitStringWrapper(
uint64_t End2Reg = MRI.createVirtualRegister(RC);
MachineBasicBlock *StartMBB = MBB;
- MachineBasicBlock *DoneMBB = splitBlockBefore(MI, MBB);
- MachineBasicBlock *LoopMBB = emitBlockAfter(StartMBB);
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockBefore(MI, MBB);
+ MachineBasicBlock *LoopMBB = SystemZ::emitBlockAfter(StartMBB);
// StartMBB:
// # fall through to LoopMMB
@@ -7754,6 +8049,97 @@ MachineBasicBlock *SystemZTargetLowering::emitLoadAndTestCmp0(
return MBB;
}
+MachineBasicBlock *SystemZTargetLowering::emitProbedAlloca(
+ MachineInstr &MI, MachineBasicBlock *MBB) const {
+ MachineFunction &MF = *MBB->getParent();
+ MachineRegisterInfo *MRI = &MF.getRegInfo();
+ const SystemZInstrInfo *TII =
+ static_cast<const SystemZInstrInfo *>(Subtarget.getInstrInfo());
+ DebugLoc DL = MI.getDebugLoc();
+ const unsigned ProbeSize = getStackProbeSize(MF);
+ Register DstReg = MI.getOperand(0).getReg();
+ Register SizeReg = MI.getOperand(2).getReg();
+
+ MachineBasicBlock *StartMBB = MBB;
+ MachineBasicBlock *DoneMBB = SystemZ::splitBlockAfter(MI, MBB);
+ MachineBasicBlock *LoopTestMBB = SystemZ::emitBlockAfter(StartMBB);
+ MachineBasicBlock *LoopBodyMBB = SystemZ::emitBlockAfter(LoopTestMBB);
+ MachineBasicBlock *TailTestMBB = SystemZ::emitBlockAfter(LoopBodyMBB);
+ MachineBasicBlock *TailMBB = SystemZ::emitBlockAfter(TailTestMBB);
+
+ MachineMemOperand *VolLdMMO = MF.getMachineMemOperand(MachinePointerInfo(),
+ MachineMemOperand::MOVolatile | MachineMemOperand::MOLoad, 8, Align(1));
+
+ Register PHIReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+ Register IncReg = MRI->createVirtualRegister(&SystemZ::ADDR64BitRegClass);
+
+ // LoopTestMBB
+ // BRC TailTestMBB
+ // # fallthrough to LoopBodyMBB
+ StartMBB->addSuccessor(LoopTestMBB);
+ MBB = LoopTestMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::PHI), PHIReg)
+ .addReg(SizeReg)
+ .addMBB(StartMBB)
+ .addReg(IncReg)
+ .addMBB(LoopBodyMBB);
+ BuildMI(MBB, DL, TII->get(SystemZ::CLGFI))
+ .addReg(PHIReg)
+ .addImm(ProbeSize);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_LT)
+ .addMBB(TailTestMBB);
+ MBB->addSuccessor(LoopBodyMBB);
+ MBB->addSuccessor(TailTestMBB);
+
+ // LoopBodyMBB: Allocate and probe by means of a volatile compare.
+ // J LoopTestMBB
+ MBB = LoopBodyMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), IncReg)
+ .addReg(PHIReg)
+ .addImm(ProbeSize);
+ BuildMI(MBB, DL, TII->get(SystemZ::SLGFI), SystemZ::R15D)
+ .addReg(SystemZ::R15D)
+ .addImm(ProbeSize);
+ BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
+ .addReg(SystemZ::R15D).addImm(ProbeSize - 8).addReg(0)
+ .setMemRefs(VolLdMMO);
+ BuildMI(MBB, DL, TII->get(SystemZ::J)).addMBB(LoopTestMBB);
+ MBB->addSuccessor(LoopTestMBB);
+
+ // TailTestMBB
+ // BRC DoneMBB
+ // # fallthrough to TailMBB
+ MBB = TailTestMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::CGHI))
+ .addReg(PHIReg)
+ .addImm(0);
+ BuildMI(MBB, DL, TII->get(SystemZ::BRC))
+ .addImm(SystemZ::CCMASK_ICMP).addImm(SystemZ::CCMASK_CMP_EQ)
+ .addMBB(DoneMBB);
+ MBB->addSuccessor(TailMBB);
+ MBB->addSuccessor(DoneMBB);
+
+ // TailMBB
+ // # fallthrough to DoneMBB
+ MBB = TailMBB;
+ BuildMI(MBB, DL, TII->get(SystemZ::SLGR), SystemZ::R15D)
+ .addReg(SystemZ::R15D)
+ .addReg(PHIReg);
+ BuildMI(MBB, DL, TII->get(SystemZ::CG)).addReg(SystemZ::R15D)
+ .addReg(SystemZ::R15D).addImm(-8).addReg(PHIReg)
+ .setMemRefs(VolLdMMO);
+ MBB->addSuccessor(DoneMBB);
+
+ // DoneMBB
+ MBB = DoneMBB;
+ BuildMI(*MBB, MBB->begin(), DL, TII->get(TargetOpcode::COPY), DstReg)
+ .addReg(SystemZ::R15D);
+
+ MI.eraseFromParent();
+ return DoneMBB;
+}
+
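For readability, the control flow assembled above corresponds to the following hedged pseudo-C sketch; StackPtr stands in for R15, probe() for the volatile compare that touches each newly allocated block, and all names are invented for illustration:

#include <cstdint>

static uint64_t StackPtr;                        // models R15
static void probe(uint64_t Addr) { (void)Addr; } // would touch memory at Addr

// Allocate Size bytes in ProbeSize chunks, touching each chunk as it is
// allocated so a guard page can never be skipped (stack clash protection).
uint64_t probedAlloca(uint64_t Size, uint64_t ProbeSize) {
  uint64_t Left = Size;
  while (Left >= ProbeSize) {          // LoopTestMBB
    Left -= ProbeSize;                 // LoopBodyMBB
    StackPtr -= ProbeSize;
    probe(StackPtr + ProbeSize - 8);   // volatile 8-byte access in the block
  }
  if (Left != 0) {                     // TailTestMBB falls through to TailMBB
    StackPtr -= Left;
    probe(StackPtr + Left - 8);
  }
  return StackPtr;                     // DoneMBB copies R15 into DstReg
}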
MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *MBB) const {
switch (MI.getOpcode()) {
@@ -8014,6 +8400,9 @@ MachineBasicBlock *SystemZTargetLowering::EmitInstrWithCustomInserter(
case SystemZ::LTXBRCompare_VecPseudo:
return emitLoadAndTestCmp0(MI, MBB, SystemZ::LTXBR);
+ case SystemZ::PROBED_ALLOCA:
+ return emitProbedAlloca(MI, MBB);
+
case TargetOpcode::STACKMAP:
case TargetOpcode::PATCHPOINT:
return emitPatchPoint(MI, MBB);
diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
index defcaa6eb6eb..27637762296a 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h
@@ -83,6 +83,10 @@ enum NodeType : unsigned {
// base of the dynamically-allocatable area.
ADJDYNALLOC,
+ // For allocating stack space when using stack clash protector.
+ // Allocation is performed by block, and each block is probed.
+ PROBED_ALLOCA,
+
// Count number of bits set in operand 0 per byte.
POPCNT,
@@ -393,6 +397,8 @@ public:
explicit SystemZTargetLowering(const TargetMachine &TM,
const SystemZSubtarget &STI);
+ bool useSoftFloat() const override;
+
// Override TargetLowering.
MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
return MVT::i32;
@@ -426,6 +432,7 @@ public:
EVT VT) const override;
bool isFPImmLegal(const APFloat &Imm, EVT VT,
bool ForCodeSize) const override;
+ bool hasInlineStackProbe(MachineFunction &MF) const override;
bool isLegalICmpImmediate(int64_t Imm) const override;
bool isLegalAddImmediate(int64_t Imm) const override;
bool isLegalAddressingMode(const DataLayout &DL, const AddrMode &AM, Type *Ty,
@@ -437,6 +444,14 @@ public:
bool *Fast) const override;
bool isTruncateFree(Type *, Type *) const override;
bool isTruncateFree(EVT, EVT) const override;
+
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override {
+ // Form add and sub with overflow intrinsics regardless of any extra
+ // users of the math result.
+ return VT == MVT::i32 || VT == MVT::i64;
+ }
+
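The hook above opts i32 and i64 into add/sub-with-overflow formation even when the sum itself has further uses, because the SystemZ add or subtract sets CC anyway. Purely as an illustration of the source pattern that benefits (using the clang/GCC __builtin_add_overflow builtin, which is not part of this patch):

#include <cstdint>

// Both the sum and the overflow flag are wanted; one CC-setting add can
// provide both, so forming the overflow intrinsic is still profitable.
bool addAndCheck(int64_t A, int64_t B, int64_t &Sum) {
  return __builtin_add_overflow(A, B, &Sum);
}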
const char *getTargetNodeName(unsigned Opcode) const override;
std::pair<unsigned, const TargetRegisterClass *>
getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
@@ -471,16 +486,19 @@ public:
return TargetLowering::getInlineAsmMemConstraint(ConstraintCode);
}
+ Register getRegisterByName(const char *RegName, LLT VT,
+ const MachineFunction &MF) const override;
+
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
return SystemZ::R6D;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
return SystemZ::R7D;
}
@@ -543,6 +561,8 @@ public:
return true;
}
+ unsigned getStackProbeSize(MachineFunction &MF) const;
+
private:
const SystemZSubtarget &Subtarget;
@@ -607,8 +627,8 @@ private:
SDValue lowerSCALAR_TO_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerEXTRACT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const;
- SDValue lowerExtendVectorInreg(SDValue Op, SelectionDAG &DAG,
- unsigned UnpackHigh) const;
+ SDValue lowerSIGN_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerZERO_EXTEND_VECTOR_INREG(SDValue Op, SelectionDAG &DAG) const;
SDValue lowerShift(SDValue Op, SelectionDAG &DAG, unsigned ByScalar) const;
bool canTreatAsByteVector(EVT VT) const;
@@ -629,11 +649,13 @@ private:
SDValue combineJOIN_DWORDS(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFP_ROUND(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineFP_EXTEND(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineINT_TO_FP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineBSWAP(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineBR_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineSELECT_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineGET_CCMASK(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue combineIntDIVREM(SDNode *N, DAGCombinerInfo &DCI) const;
+ SDValue combineINTRINSIC(SDNode *N, DAGCombinerInfo &DCI) const;
SDValue unwrapAddress(SDValue N) const override;
@@ -676,8 +698,11 @@ private:
MachineBasicBlock *emitLoadAndTestCmp0(MachineInstr &MI,
MachineBasicBlock *MBB,
unsigned Opcode) const;
+ MachineBasicBlock *emitProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const;
- MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+ MachineMemOperand::Flags
+ getTargetMMOFlags(const Instruction &I) const override;
const TargetRegisterClass *getRepRegClassFor(MVT VT) const override;
};
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h b/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
index ec7639e71f81..9fc786f92635 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrBuilder.h
@@ -17,7 +17,6 @@
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineMemOperand.h"
-#include "llvm/CodeGen/PseudoSourceValue.h"
namespace llvm {
@@ -36,7 +35,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI) {
int64_t Offset = 0;
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
- MFFrame.getObjectSize(FI), MFFrame.getObjectAlignment(FI));
+ MFFrame.getObjectSize(FI), MFFrame.getObjectAlign(FI));
return MIB.addFrameIndex(FI).addImm(Offset).addReg(0).addMemOperand(MMO);
}
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFP.td b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
index 6d03274fe8a6..337164d55e5f 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFP.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFP.td
@@ -438,8 +438,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
def ADBR : BinaryRRE<"adbr", 0xB31A, any_fadd, FP64, FP64>;
def AXBR : BinaryRRE<"axbr", 0xB34A, any_fadd, FP128, FP128>;
}
- def AEB : BinaryRXE<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
- def ADB : BinaryRXE<"adb", 0xED1A, any_fadd, FP64, load, 8>;
+ defm AEB : BinaryRXEAndPseudo<"aeb", 0xED0A, any_fadd, FP32, load, 4>;
+ defm ADB : BinaryRXEAndPseudo<"adb", 0xED1A, any_fadd, FP64, load, 8>;
}
// Subtraction.
@@ -449,8 +449,8 @@ let Uses = [FPC], mayRaiseFPException = 1,
def SDBR : BinaryRRE<"sdbr", 0xB31B, any_fsub, FP64, FP64>;
def SXBR : BinaryRRE<"sxbr", 0xB34B, any_fsub, FP128, FP128>;
- def SEB : BinaryRXE<"seb", 0xED0B, any_fsub, FP32, load, 4>;
- def SDB : BinaryRXE<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
+ defm SEB : BinaryRXEAndPseudo<"seb", 0xED0B, any_fsub, FP32, load, 4>;
+ defm SDB : BinaryRXEAndPseudo<"sdb", 0xED1B, any_fsub, FP64, load, 8>;
}
// Multiplication.
@@ -460,8 +460,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MDBR : BinaryRRE<"mdbr", 0xB31C, any_fmul, FP64, FP64>;
def MXBR : BinaryRRE<"mxbr", 0xB34C, any_fmul, FP128, FP128>;
}
- def MEEB : BinaryRXE<"meeb", 0xED17, any_fmul, FP32, load, 4>;
- def MDB : BinaryRXE<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
+ defm MEEB : BinaryRXEAndPseudo<"meeb", 0xED17, any_fmul, FP32, load, 4>;
+ defm MDB : BinaryRXEAndPseudo<"mdb", 0xED1C, any_fmul, FP64, load, 8>;
}
// f64 multiplication of two FP32 registers.
@@ -503,8 +503,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MAEBR : TernaryRRD<"maebr", 0xB30E, z_any_fma, FP32, FP32>;
def MADBR : TernaryRRD<"madbr", 0xB31E, z_any_fma, FP64, FP64>;
- def MAEB : TernaryRXF<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
- def MADB : TernaryRXF<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
+ defm MAEB : TernaryRXFAndPseudo<"maeb", 0xED0E, z_any_fma, FP32, FP32, load, 4>;
+ defm MADB : TernaryRXFAndPseudo<"madb", 0xED1E, z_any_fma, FP64, FP64, load, 8>;
}
// Fused multiply-subtract.
@@ -512,8 +512,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def MSEBR : TernaryRRD<"msebr", 0xB30F, z_any_fms, FP32, FP32>;
def MSDBR : TernaryRRD<"msdbr", 0xB31F, z_any_fms, FP64, FP64>;
- def MSEB : TernaryRXF<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
- def MSDB : TernaryRXF<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
+ defm MSEB : TernaryRXFAndPseudo<"mseb", 0xED0F, z_any_fms, FP32, FP32, load, 4>;
+ defm MSDB : TernaryRXFAndPseudo<"msdb", 0xED1F, z_any_fms, FP64, FP64, load, 8>;
}
// Division.
@@ -522,8 +522,8 @@ let Uses = [FPC], mayRaiseFPException = 1 in {
def DDBR : BinaryRRE<"ddbr", 0xB31D, any_fdiv, FP64, FP64>;
def DXBR : BinaryRRE<"dxbr", 0xB34D, any_fdiv, FP128, FP128>;
- def DEB : BinaryRXE<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
- def DDB : BinaryRXE<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
+ defm DEB : BinaryRXEAndPseudo<"deb", 0xED0D, any_fdiv, FP32, load, 4>;
+ defm DDB : BinaryRXEAndPseudo<"ddb", 0xED1D, any_fdiv, FP64, load, 8>;
}
// Divide to integer.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
index f064d33ac2f3..50f1e09c6ee5 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrFormats.td
@@ -2334,49 +2334,49 @@ class FixedCmpBranchRSYb<CondVariant V, string mnemonic, bits<16> opcode,
class BranchUnaryRI<string mnemonic, bits<12> opcode, RegisterOperand cls>
: InstRIb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget16:$RI2),
- mnemonic##"\t$R1, $RI2", []> {
+ mnemonic#"\t$R1, $RI2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class BranchUnaryRIL<string mnemonic, bits<12> opcode, RegisterOperand cls>
: InstRILb<opcode, (outs cls:$R1), (ins cls:$R1src, brtarget32:$RI2),
- mnemonic##"\t$R1, $RI2", []> {
+ mnemonic#"\t$R1, $RI2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class BranchUnaryRR<string mnemonic, bits<8> opcode, RegisterOperand cls>
: InstRR<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
- mnemonic##"\t$R1, $R2", []> {
+ mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class BranchUnaryRRE<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstRRE<opcode, (outs cls:$R1), (ins cls:$R1src, GR64:$R2),
- mnemonic##"\t$R1, $R2", []> {
+ mnemonic#"\t$R1, $R2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class BranchUnaryRX<string mnemonic, bits<8> opcode, RegisterOperand cls>
: InstRXa<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr12only:$XBD2),
- mnemonic##"\t$R1, $XBD2", []> {
+ mnemonic#"\t$R1, $XBD2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class BranchUnaryRXY<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstRXYa<opcode, (outs cls:$R1), (ins cls:$R1src, bdxaddr20only:$XBD2),
- mnemonic##"\t$R1, $XBD2", []> {
+ mnemonic#"\t$R1, $XBD2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls>
: InstRSI<opcode, (outs cls:$R1), (ins cls:$R1src, cls:$R3, brtarget16:$RI2),
- mnemonic##"\t$R1, $R3, $RI2", []> {
+ mnemonic#"\t$R1, $R3, $RI2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
@@ -2384,7 +2384,7 @@ class BranchBinaryRSI<string mnemonic, bits<8> opcode, RegisterOperand cls>
class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstRIEe<opcode, (outs cls:$R1),
(ins cls:$R1src, cls:$R3, brtarget16:$RI2),
- mnemonic##"\t$R1, $R3, $RI2", []> {
+ mnemonic#"\t$R1, $R3, $RI2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
@@ -2392,7 +2392,7 @@ class BranchBinaryRIEe<string mnemonic, bits<16> opcode, RegisterOperand cls>
class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
: InstRSa<opcode, (outs cls:$R1),
(ins cls:$R1src, cls:$R3, bdaddr12only:$BD2),
- mnemonic##"\t$R1, $R3, $BD2", []> {
+ mnemonic#"\t$R1, $R3, $BD2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
@@ -2400,7 +2400,7 @@ class BranchBinaryRS<string mnemonic, bits<8> opcode, RegisterOperand cls>
class BranchBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls>
: InstRSYa<opcode,
(outs cls:$R1), (ins cls:$R1src, cls:$R3, bdaddr20only:$BD2),
- mnemonic##"\t$R1, $R3, $BD2", []> {
+ mnemonic#"\t$R1, $R3, $BD2", []> {
let Constraints = "$R1 = $R1src";
let DisableEncoding = "$R1src";
}
@@ -2421,7 +2421,7 @@ class LoadMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
multiclass LoadMultipleRSPair<string mnemonic, bits<8> rsOpcode,
bits<16> rsyOpcode, RegisterOperand cls> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : LoadMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
let DispSize = "20" in
@@ -2487,7 +2487,7 @@ class StoreRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass StoreRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
SDPatternOperator operator, RegisterOperand cls,
bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : StoreRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
let DispSize = "20" in
@@ -2567,7 +2567,7 @@ class StoreMultipleRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
multiclass StoreMultipleRSPair<string mnemonic, bits<8> rsOpcode,
bits<16> rsyOpcode, RegisterOperand cls> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : StoreMultipleRS<mnemonic, rsOpcode, cls, bdaddr12pair>;
let DispSize = "20" in
@@ -2807,6 +2807,10 @@ class CondUnaryRSY<string mnemonic, bits<16> opcode,
let mayLoad = 1;
let AccessBytes = bytes;
let CCMaskLast = 1;
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let MemKey = mnemonic#cls;
+ let MemType = "target";
}
// Like CondUnaryRSY, but used for the raw assembly form. The condition-code
@@ -2884,7 +2888,7 @@ class UnaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass UnaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
SDPatternOperator operator, RegisterOperand cls,
bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : UnaryRX<mnemonic, rxOpcode, operator, cls, bytes, bdxaddr12pair>;
let DispSize = "20" in
@@ -2907,13 +2911,15 @@ class UnaryVRIaGeneric<string mnemonic, bits<16> opcode, ImmOpWithPattern imm>
class UnaryVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m4 = 0,
- bits<4> m5 = 0>
+ bits<4> m5 = 0, string fp_mnemonic = "">
: InstVRRa<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2),
mnemonic#"\t$V1, $V2",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2)))]> {
let M3 = type;
let M4 = m4;
let M5 = m5;
+ let OpKey = fp_mnemonic#!subst("VR", "FP", !cast<string>(tr1.op));
+ let OpType = "reg";
}
class UnaryVRRaGeneric<string mnemonic, bits<16> opcode, bits<4> m4 = 0,
@@ -2948,7 +2954,7 @@ multiclass UnaryExtraVRRaSPair<string mnemonic, bits<16> opcode,
def : InstAlias<mnemonic#"\t$V1, $V2",
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2, 0)>;
let Defs = [CC] in
- def S : UnaryVRRa<mnemonic##"s", opcode, operator_cc, tr1, tr2,
+ def S : UnaryVRRa<mnemonic#"s", opcode, operator_cc, tr1, tr2,
type, 0, 1>;
}
@@ -2992,17 +2998,17 @@ multiclass UnaryVRXAlign<string mnemonic, bits<16> opcode> {
class SideEffectBinaryRX<string mnemonic, bits<8> opcode,
RegisterOperand cls>
: InstRXa<opcode, (outs), (ins cls:$R1, bdxaddr12only:$XBD2),
- mnemonic##"\t$R1, $XBD2", []>;
+ mnemonic#"\t$R1, $XBD2", []>;
class SideEffectBinaryRXY<string mnemonic, bits<16> opcode,
RegisterOperand cls>
: InstRXYa<opcode, (outs), (ins cls:$R1, bdxaddr20only:$XBD2),
- mnemonic##"\t$R1, $XBD2", []>;
+ mnemonic#"\t$R1, $XBD2", []>;
class SideEffectBinaryRILPC<string mnemonic, bits<12> opcode,
RegisterOperand cls>
: InstRILb<opcode, (outs), (ins cls:$R1, pcrel32:$RI2),
- mnemonic##"\t$R1, $RI2", []> {
+ mnemonic#"\t$R1, $RI2", []> {
// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
// However, BDXs have two extra operands and are therefore 6 units more
// complex.
@@ -3045,16 +3051,16 @@ class SideEffectBinarySIL<string mnemonic, bits<16> opcode,
class SideEffectBinarySSa<string mnemonic, bits<8> opcode>
: InstSSa<opcode, (outs), (ins bdladdr12onlylen8:$BDL1, bdaddr12only:$BD2),
- mnemonic##"\t$BDL1, $BD2", []>;
+ mnemonic#"\t$BDL1, $BD2", []>;
class SideEffectBinarySSb<string mnemonic, bits<8> opcode>
: InstSSb<opcode,
(outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
- mnemonic##"\t$BDL1, $BDL2", []>;
+ mnemonic#"\t$BDL1, $BDL2", []>;
class SideEffectBinarySSf<string mnemonic, bits<8> opcode>
: InstSSf<opcode, (outs), (ins bdaddr12only:$BD1, bdladdr12onlylen8:$BDL2),
- mnemonic##"\t$BD1, $BDL2", []>;
+ mnemonic#"\t$BD1, $BDL2", []>;
class SideEffectBinarySSE<string mnemonic, bits<16> opcode>
: InstSSE<opcode, (outs), (ins bdaddr12only:$BD1, bdaddr12only:$BD2),
@@ -3211,6 +3217,8 @@ class CondBinaryRRF<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let CCMaskLast = 1;
let NumOpsKey = !subst("loc", "sel", mnemonic);
let NumOpsValue = "2";
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
}
// Like CondBinaryRRF, but used for the raw assembly form. The condition-code
@@ -3252,6 +3260,8 @@ class CondBinaryRRFa<string mnemonic, bits<16> opcode, RegisterOperand cls1,
let CCMaskLast = 1;
let NumOpsKey = mnemonic;
let NumOpsValue = "3";
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
}
// Like CondBinaryRRFa, but used for the raw assembly form. The condition-code
@@ -3299,7 +3309,7 @@ multiclass BinaryRIAndK<string mnemonic, bits<12> opcode1, bits<16> opcode2,
ImmOpWithPattern imm> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
- def K : BinaryRIE<mnemonic##"k", opcode2, operator, cls, imm>,
+ def K : BinaryRIE<mnemonic#"k", opcode2, operator, cls, imm>,
Requires<[FeatureDistinctOps]>;
let NumOpsValue = "2" in
def "" : BinaryRI<mnemonic, opcode1, operator, cls, imm>;
@@ -3376,7 +3386,7 @@ multiclass BinaryRSAndK<string mnemonic, bits<8> opcode1, bits<16> opcode2,
SDPatternOperator operator, RegisterOperand cls> {
let NumOpsKey = mnemonic in {
let NumOpsValue = "3" in
- def K : BinaryRSY<mnemonic##"k", opcode2, operator, cls>,
+ def K : BinaryRSY<mnemonic#"k", opcode2, operator, cls>,
Requires<[FeatureDistinctOps]>;
let NumOpsValue = "2" in
def "" : BinaryRS<mnemonic, opcode1, operator, cls>;
@@ -3448,7 +3458,7 @@ class BinaryRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass BinaryRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
bdxaddr12pair>;
@@ -3479,7 +3489,7 @@ class BinarySIY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass BinarySIPair<string mnemonic, bits<8> siOpcode,
bits<16> siyOpcode, SDPatternOperator operator,
Operand imm> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : BinarySI<mnemonic, siOpcode, operator, imm, bdaddr12pair>;
let DispSize = "20" in
@@ -3575,7 +3585,7 @@ multiclass BinaryVRRbSPair<string mnemonic, bits<16> opcode,
def "" : BinaryVRRb<mnemonic, opcode, operator, tr1, tr2, type,
!and (modifier, 14)>;
let Defs = [CC] in
- def S : BinaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+ def S : BinaryVRRb<mnemonic#"s", opcode, operator_cc, tr1, tr2, type,
!add (!and (modifier, 14), 1)>;
}
@@ -3604,7 +3614,7 @@ multiclass BinaryExtraVRRbSPair<string mnemonic, bits<16> opcode,
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, 0)>;
let Defs = [CC] in
- def S : BinaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type, 1>;
+ def S : BinaryVRRb<mnemonic#"s", opcode, operator_cc, tr1, tr2, type, 1>;
}
multiclass BinaryExtraVRRbSPairGeneric<string mnemonic, bits<16> opcode> {
@@ -3619,7 +3629,7 @@ multiclass BinaryExtraVRRbSPairGeneric<string mnemonic, bits<16> opcode> {
class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type = 0, bits<4> m5 = 0,
- bits<4> m6 = 0>
+ bits<4> m6 = 0, string fp_mnemonic = "">
: InstVRRc<opcode, (outs tr1.op:$V1), (ins tr2.op:$V2, tr2.op:$V3),
mnemonic#"\t$V1, $V2, $V3",
[(set (tr1.vt tr1.op:$V1), (operator (tr2.vt tr2.op:$V2),
@@ -3627,6 +3637,8 @@ class BinaryVRRc<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M4 = type;
let M5 = m5;
let M6 = m6;
+ let OpKey = fp_mnemonic#"MemFold"#!subst("VR", "FP", !cast<string>(tr1.op));
+ let OpType = "reg";
}
class BinaryVRRcGeneric<string mnemonic, bits<16> opcode, bits<4> m5 = 0,
@@ -3655,7 +3667,7 @@ multiclass BinaryVRRcSPair<string mnemonic, bits<16> opcode,
def "" : BinaryVRRc<mnemonic, opcode, operator, tr1, tr2, type,
m5, !and (modifier, 14)>;
let Defs = [CC] in
- def S : BinaryVRRc<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+ def S : BinaryVRRc<mnemonic#"s", opcode, operator_cc, tr1, tr2, type,
m5, !add (!and (modifier, 14), 1)>;
}
@@ -3752,7 +3764,7 @@ class StoreBinaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
multiclass StoreBinaryRSPair<string mnemonic, bits<8> rsOpcode,
bits<16> rsyOpcode, RegisterOperand cls,
bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : StoreBinaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
let DispSize = "20" in
@@ -3892,7 +3904,7 @@ class CompareRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass CompareRXPair<string mnemonic, bits<8> rxOpcode, bits<16> rxyOpcode,
SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : CompareRX<mnemonic, rxOpcode, operator, cls,
load, bytes, bdxaddr12pair>;
@@ -3920,7 +3932,7 @@ class CompareRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
RegisterOperand cls, bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : CompareRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
let DispSize = "20" in
@@ -3931,7 +3943,7 @@ multiclass CompareRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
class CompareSSb<string mnemonic, bits<8> opcode>
: InstSSb<opcode,
(outs), (ins bdladdr12onlylen4:$BDL1, bdladdr12onlylen4:$BDL2),
- mnemonic##"\t$BDL1, $BDL2", []> {
+ mnemonic#"\t$BDL1, $BDL2", []> {
let isCompare = 1;
let mayLoad = 1;
}
@@ -3978,7 +3990,7 @@ multiclass CompareSIPair<string mnemonic, bits<8> siOpcode, bits<16> siyOpcode,
}
class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- TypedReg tr, bits<4> type>
+ TypedReg tr, bits<4> type, string fp_mnemonic = "">
: InstVRRa<opcode, (outs), (ins tr.op:$V1, tr.op:$V2),
mnemonic#"\t$V1, $V2",
[(set CC, (operator (tr.vt tr.op:$V1), (tr.vt tr.op:$V2)))]> {
@@ -3986,6 +3998,8 @@ class CompareVRRa<string mnemonic, bits<16> opcode, SDPatternOperator operator,
let M3 = type;
let M4 = 0;
let M5 = 0;
+ let OpKey = fp_mnemonic#!subst("VR", "FP", !cast<string>(tr.op));
+ let OpType = "reg";
}
class CompareVRRaGeneric<string mnemonic, bits<16> opcode>
@@ -4043,7 +4057,7 @@ class TestVRRg<string mnemonic, bits<16> opcode>
class SideEffectTernarySSc<string mnemonic, bits<8> opcode>
: InstSSc<opcode, (outs), (ins bdladdr12onlylen4:$BDL1,
shift12only:$BD2, imm32zx4:$I3),
- mnemonic##"\t$BDL1, $BD2, $I3", []>;
+ mnemonic#"\t$BDL1, $BD2, $I3", []>;
class SideEffectTernaryRRFa<string mnemonic, bits<16> opcode,
RegisterOperand cls1, RegisterOperand cls2,
@@ -4179,7 +4193,7 @@ class TernaryRSY<string mnemonic, bits<16> opcode, RegisterOperand cls,
multiclass TernaryRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
RegisterOperand cls, bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : TernaryRS<mnemonic, rsOpcode, cls, bytes, bdaddr12pair>;
let DispSize = "20" in
@@ -4303,7 +4317,7 @@ multiclass TernaryOptVRRbSPair<string mnemonic, bits<16> opcode,
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, 0)>;
let Defs = [CC] in
- def S : TernaryVRRb<mnemonic##"s", opcode, operator_cc, tr1, tr2, type,
+ def S : TernaryVRRb<mnemonic#"s", opcode, operator_cc, tr1, tr2, type,
imm32zx4even_timm, !add(!and (modifier, 14), 1)>;
def : InstAlias<mnemonic#"s\t$V1, $V2, $V3",
(!cast<Instruction>(NAME#"S") tr1.op:$V1, tr2.op:$V2,
@@ -4371,7 +4385,7 @@ class TernaryVRRdGeneric<string mnemonic, bits<16> opcode>
}
// Ternary operation where the assembler mnemonic has an extra operand to
-// optionally allow specifiying arbitrary M6 values.
+// optionally allow specifying arbitrary M6 values.
multiclass TernaryExtraVRRd<string mnemonic, bits<16> opcode,
SDPatternOperator operator,
TypedReg tr1, TypedReg tr2, bits<4> type> {
@@ -4399,7 +4413,8 @@ multiclass TernaryExtraVRRdGeneric<string mnemonic, bits<16> opcode> {
}
class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
- TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0>
+ TypedReg tr1, TypedReg tr2, bits<4> m5 = 0, bits<4> type = 0,
+ string fp_mnemonic = "">
: InstVRRe<opcode, (outs tr1.op:$V1),
(ins tr2.op:$V2, tr2.op:$V3, tr1.op:$V4),
mnemonic#"\t$V1, $V2, $V3, $V4",
@@ -4408,6 +4423,8 @@ class TernaryVRRe<string mnemonic, bits<16> opcode, SDPatternOperator operator,
(tr1.vt tr1.op:$V4)))]> {
let M5 = m5;
let M6 = type;
+ let OpKey = fp_mnemonic#"MemFold"#!subst("VR", "FP", !cast<string>(tr1.op));
+ let OpType = "reg";
}
class TernaryVRReFloatGeneric<string mnemonic, bits<16> opcode>
@@ -4536,7 +4553,7 @@ multiclass QuaternaryOptVRRdSPair<string mnemonic, bits<16> opcode,
(!cast<Instruction>(NAME) tr1.op:$V1, tr2.op:$V2,
tr2.op:$V3, tr2.op:$V4, 0)>;
let Defs = [CC] in
- def S : QuaternaryVRRd<mnemonic##"s", opcode, operator_cc,
+ def S : QuaternaryVRRd<mnemonic#"s", opcode, operator_cc,
tr1, tr2, tr2, tr2, type,
imm32zx4even_timm, !add (!and (modifier, 14), 1)>;
def : InstAlias<mnemonic#"s\t$V1, $V2, $V3, $V4",
@@ -4630,7 +4647,7 @@ class CmpSwapRSY<string mnemonic, bits<16> opcode, SDPatternOperator operator,
multiclass CmpSwapRSPair<string mnemonic, bits<8> rsOpcode, bits<16> rsyOpcode,
SDPatternOperator operator, RegisterOperand cls> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
let DispSize = "12" in
def "" : CmpSwapRS<mnemonic, rsOpcode, operator, cls, bdaddr12pair>;
let DispSize = "20" in
@@ -4650,13 +4667,13 @@ class RotateSelectRIEf<string mnemonic, bits<16> opcode, RegisterOperand cls1,
class PrefetchRXY<string mnemonic, bits<16> opcode, SDPatternOperator operator>
: InstRXYb<opcode, (outs), (ins imm32zx4:$M1, bdxaddr20only:$XBD2),
- mnemonic##"\t$M1, $XBD2",
+ mnemonic#"\t$M1, $XBD2",
[(operator imm32zx4_timm:$M1, bdxaddr20only:$XBD2)]>;
class PrefetchRILPC<string mnemonic, bits<12> opcode,
SDPatternOperator operator>
: InstRILc<opcode, (outs), (ins imm32zx4_timm:$M1, pcrel32:$RI2),
- mnemonic##"\t$M1, $RI2",
+ mnemonic#"\t$M1, $RI2",
[(operator imm32zx4_timm:$M1, pcrel32:$RI2)]> {
// We want PC-relative addresses to be tried ahead of BD and BDX addresses.
// However, BDXs have two extra operands and are therefore 6 units more
@@ -4765,7 +4782,9 @@ multiclass BinaryRIAndKPseudo<string key, SDPatternOperator operator,
class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes,
AddressingMode mode>
: Pseudo<(outs cls:$R1), (ins cls:$R2, mode:$XBD2), []> {
- let OpKey = mnemonic#"rk"#cls;
+ let OpKey = !subst("mscrk", "msrkc",
+ !subst("msgcrk", "msgrkc",
+ mnemonic#"rk"#cls));
let OpType = "mem";
let MemKey = mnemonic#cls;
let MemType = "pseudo";
@@ -4775,6 +4794,40 @@ class MemFoldPseudo<string mnemonic, RegisterOperand cls, bits<5> bytes,
let hasNoSchedulingInfo = 1;
}
+// Same as MemFoldPseudo but for mapping a W... vector instruction
+class MemFoldPseudo_FP<string mnemonic, RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode>
+ : MemFoldPseudo<mnemonic, cls, bytes, mode> {
+ let OpKey = mnemonic#"r"#"MemFold"#cls;
+}
+
+class MemFoldPseudo_FPTern<string mnemonic, RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode>
+ : Pseudo<(outs cls:$R1), (ins cls:$R2, cls:$R3, mode:$XBD2), []> {
+ let OpKey = mnemonic#"r"#"MemFold"#cls;
+ let OpType = "mem";
+ let MemKey = mnemonic#cls;
+ let MemType = "pseudo";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let HasIndex = 1;
+ let hasNoSchedulingInfo = 1;
+}
+
+// Same as MemFoldPseudo but for Load On Condition with CC operands.
+class MemFoldPseudo_CondMove<string mnemonic, RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode>
+ : Pseudo<(outs cls:$R1),
+ (ins cls:$R2, mode:$XBD2, cond4:$valid, cond4:$M3), []> {
+ let OpKey = !subst("loc", "sel", mnemonic)#"r"#cls;
+ let OpType = "mem";
+ let MemKey = mnemonic#cls;
+ let MemType = "pseudo";
+ let mayLoad = 1;
+ let AccessBytes = bytes;
+ let hasNoSchedulingInfo = 1;
+}
+
// Like CompareRI, but expanded after RA depending on the choice of register.
class CompareRIPseudo<SDPatternOperator operator, RegisterOperand cls,
ImmOpWithPattern imm>
@@ -4813,6 +4866,8 @@ class CondBinaryRRFPseudo<string mnemonic, RegisterOperand cls1,
let CCMaskLast = 1;
let NumOpsKey = !subst("loc", "sel", mnemonic);
let NumOpsValue = "2";
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
}
// Like CondBinaryRRFa, but expanded after RA depending on the choice of
@@ -4826,6 +4881,8 @@ class CondBinaryRRFaPseudo<string mnemonic, RegisterOperand cls1,
let CCMaskLast = 1;
let NumOpsKey = mnemonic;
let NumOpsValue = "3";
+ let OpKey = mnemonic#cls1;
+ let OpType = "reg";
}
// Like CondBinaryRIE, but expanded after RA depending on the choice of
@@ -4842,8 +4899,9 @@ class CondBinaryRIEPseudo<RegisterOperand cls, ImmOpWithPattern imm>
// Like CondUnaryRSY, but expanded after RA depending on the choice of
// register.
-class CondUnaryRSYPseudo<SDPatternOperator operator, RegisterOperand cls,
- bits<5> bytes, AddressingMode mode = bdaddr20only>
+class CondUnaryRSYPseudo<string mnemonic, SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only>
: Pseudo<(outs cls:$R1),
(ins cls:$R1src, mode:$BD2, cond4:$valid, cond4:$R3),
[(set cls:$R1,
@@ -4854,6 +4912,10 @@ class CondUnaryRSYPseudo<SDPatternOperator operator, RegisterOperand cls,
let mayLoad = 1;
let AccessBytes = bytes;
let CCMaskLast = 1;
+ let OpKey = mnemonic#"r"#cls;
+ let OpType = "mem";
+ let MemKey = mnemonic#cls;
+ let MemType = "target";
}
// Like CondStoreRSY, but expanded after RA depending on the choice of
@@ -5039,7 +5101,6 @@ multiclass BinaryRXYAndPseudo<string mnemonic, bits<16> opcode,
SDPatternOperator operator, RegisterOperand cls,
SDPatternOperator load, bits<5> bytes,
AddressingMode mode = bdxaddr20only> {
-
def "" : BinaryRXY<mnemonic, opcode, operator, cls, load, bytes, mode> {
let MemKey = mnemonic#cls;
let MemType = "target";
@@ -5052,7 +5113,7 @@ multiclass BinaryRXPairAndPseudo<string mnemonic, bits<8> rxOpcode,
bits<16> rxyOpcode, SDPatternOperator operator,
RegisterOperand cls,
SDPatternOperator load, bits<5> bytes> {
- let DispKey = mnemonic ## #cls in {
+ let DispKey = mnemonic # cls in {
def "" : BinaryRX<mnemonic, rxOpcode, operator, cls, load, bytes,
bdxaddr12pair> {
let DispSize = "12";
@@ -5066,6 +5127,43 @@ multiclass BinaryRXPairAndPseudo<string mnemonic, bits<8> rxOpcode,
def _MemFoldPseudo : MemFoldPseudo<mnemonic, cls, bytes, bdxaddr12pair>;
}
+multiclass BinaryRXEAndPseudo<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, RegisterOperand cls,
+ SDPatternOperator load, bits<5> bytes> {
+ def "" : BinaryRXE<mnemonic, opcode, operator, cls, load, bytes> {
+ let MemKey = mnemonic#cls;
+ let MemType = "target";
+ }
+ def _MemFoldPseudo : MemFoldPseudo_FP<mnemonic, cls, bytes, bdxaddr12pair>;
+}
+
+multiclass TernaryRXFAndPseudo<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator, RegisterOperand cls1,
+ RegisterOperand cls2, SDPatternOperator load,
+ bits<5> bytes> {
+ def "" : TernaryRXF<mnemonic, opcode, operator, cls1, cls2, load, bytes> {
+ let MemKey = mnemonic#cls1;
+ let MemType = "target";
+ }
+ def _MemFoldPseudo : MemFoldPseudo_FPTern<mnemonic, cls1, bytes, bdxaddr12pair>;
+}
+
+multiclass CondUnaryRSYPairAndMemFold<string mnemonic, bits<16> opcode,
+ SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only> {
+ defm "" : CondUnaryRSYPair<mnemonic, opcode, operator, cls, bytes, mode>;
+ def _MemFoldPseudo : MemFoldPseudo_CondMove<mnemonic, cls, bytes, mode>;
+}
+
+multiclass CondUnaryRSYPseudoAndMemFold<string mnemonic,
+ SDPatternOperator operator,
+ RegisterOperand cls, bits<5> bytes,
+ AddressingMode mode = bdaddr20only> {
+ def "" : CondUnaryRSYPseudo<mnemonic, operator, cls, bytes, mode>;
+ def _MemFoldPseudo : MemFoldPseudo_CondMove<mnemonic, cls, bytes, mode>;
+}
+
// Define an instruction that operates on two fixed-length blocks of memory,
// and associated pseudo instructions for operating on blocks of any size.
// The Sequence form uses a straight-line sequence of instructions and
@@ -5086,7 +5184,7 @@ multiclass MemorySS<string mnemonic, bits<8> opcode,
}
}
-// The same, but setting a CC result as comparion operator.
+// The same, but setting a CC result as comparison operator.
multiclass CompareMemorySS<string mnemonic, bits<8> opcode,
SDPatternOperator sequence, SDPatternOperator loop> {
def "" : SideEffectBinarySSa<mnemonic, opcode>;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
index 97c8fa7aa32e..223cfcba2fac 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.cpp
@@ -513,8 +513,8 @@ unsigned SystemZInstrInfo::insertBranch(MachineBasicBlock &MBB,
return Count;
}
-bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask,
+bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &Mask,
int &Value) const {
assert(MI.isCompare() && "Caller should have checked for a comparison");
@@ -532,8 +532,9 @@ bool SystemZInstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
ArrayRef<MachineOperand> Pred,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles, int &TrueCycles,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles,
int &FalseCycles) const {
// Not all subtargets have LOCR instructions.
if (!STI.hasLoadStoreOnCond())
@@ -565,10 +566,10 @@ bool SystemZInstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
+ const DebugLoc &DL, Register DstReg,
ArrayRef<MachineOperand> Pred,
- unsigned TrueReg,
- unsigned FalseReg) const {
+ Register TrueReg,
+ Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterClass *RC = MRI.getRegClass(DstReg);
@@ -606,7 +607,7 @@ void SystemZInstrInfo::insertSelect(MachineBasicBlock &MBB,
}
bool SystemZInstrInfo::FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI,
- unsigned Reg,
+ Register Reg,
MachineRegisterInfo *MRI) const {
unsigned DefOpc = DefMI.getOpcode();
if (DefOpc != SystemZ::LHIMux && DefOpc != SystemZ::LHI &&
@@ -819,18 +820,11 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
return;
}
- // Move CC value from/to a GR32.
- if (SrcReg == SystemZ::CC) {
- auto MIB = BuildMI(MBB, MBBI, DL, get(SystemZ::IPM), DestReg);
- if (KillSrc) {
- const MachineFunction *MF = MBB.getParent();
- const TargetRegisterInfo *TRI = MF->getSubtarget().getRegisterInfo();
- MIB->addRegisterKilled(SrcReg, TRI);
- }
- return;
- }
+ // Move CC value from a GR32.
if (DestReg == SystemZ::CC) {
- BuildMI(MBB, MBBI, DL, get(SystemZ::TMLH))
+ unsigned Opcode =
+ SystemZ::GR32BitRegClass.contains(SrcReg) ? SystemZ::TMLH : SystemZ::TMHH;
+ BuildMI(MBB, MBBI, DL, get(Opcode))
.addReg(SrcReg, getKillRegState(KillSrc))
.addImm(3 << (SystemZ::IPM_CC - 16));
return;
@@ -855,12 +849,6 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
Opcode = SystemZ::VLR;
else if (SystemZ::AR32BitRegClass.contains(DestReg, SrcReg))
Opcode = SystemZ::CPYA;
- else if (SystemZ::AR32BitRegClass.contains(DestReg) &&
- SystemZ::GR32BitRegClass.contains(SrcReg))
- Opcode = SystemZ::SAR;
- else if (SystemZ::GR32BitRegClass.contains(DestReg) &&
- SystemZ::AR32BitRegClass.contains(SrcReg))
- Opcode = SystemZ::EAR;
else
llvm_unreachable("Impossible reg-to-reg copy");
@@ -869,7 +857,7 @@ void SystemZInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
}
void SystemZInstrInfo::storeRegToStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned SrcReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register SrcReg,
bool isKill, int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -884,7 +872,7 @@ void SystemZInstrInfo::storeRegToStackSlot(
}
void SystemZInstrInfo::loadRegFromStackSlot(
- MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, unsigned DestReg,
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, Register DestReg,
int FrameIdx, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBBI != MBB.end() ? MBBI->getDebugLoc() : DebugLoc();
@@ -1005,33 +993,36 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
MachineBasicBlock::iterator InsertPt, int FrameIndex,
LiveIntervals *LIS, VirtRegMap *VRM) const {
const TargetRegisterInfo *TRI = MF.getSubtarget().getRegisterInfo();
+ MachineRegisterInfo &MRI = MF.getRegInfo();
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
unsigned Opcode = MI.getOpcode();
+ // Check CC liveness if new instruction introduces a dead def of CC.
+ MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
+ SlotIndex MISlot = SlotIndex();
+ LiveRange *CCLiveRange = nullptr;
+ bool CCLiveAtMI = true;
+ if (LIS) {
+ MISlot = LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
+ CCLiveRange = &LIS->getRegUnit(*CCUnit);
+ CCLiveAtMI = CCLiveRange->liveAt(MISlot);
+ }
+ ++CCUnit;
+ assert(!CCUnit.isValid() && "CC only has one reg unit.");
+
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
- if (LIS != nullptr && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
+ if (!CCLiveAtMI && (Opcode == SystemZ::LA || Opcode == SystemZ::LAY) &&
isInt<8>(MI.getOperand(2).getImm()) && !MI.getOperand(3).getReg()) {
-
- // Check CC liveness, since new instruction introduces a dead
- // def of CC.
- MCRegUnitIterator CCUnit(SystemZ::CC, TRI);
- LiveRange &CCLiveRange = LIS->getRegUnit(*CCUnit);
- ++CCUnit;
- assert(!CCUnit.isValid() && "CC only has one reg unit.");
- SlotIndex MISlot =
- LIS->getSlotIndexes()->getInstructionIndex(MI).getRegSlot();
- if (!CCLiveRange.liveAt(MISlot)) {
- // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
- MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
- MI.getDebugLoc(), get(SystemZ::AGSI))
- .addFrameIndex(FrameIndex)
- .addImm(0)
- .addImm(MI.getOperand(2).getImm());
- BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
- CCLiveRange.createDeadDef(MISlot, LIS->getVNInfoAllocator());
- return BuiltMI;
- }
+ // LA(Y) %reg, CONST(%reg) -> AGSI %mem, CONST
+ MachineInstr *BuiltMI = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI.getDebugLoc(), get(SystemZ::AGSI))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(MI.getOperand(2).getImm());
+ BuiltMI->findRegisterDefOperand(SystemZ::CC)->setIsDead(true);
+ CCLiveRange->createDeadDef(MISlot, LIS->getVNInfoAllocator());
+ return BuiltMI;
}
return nullptr;
}
@@ -1090,6 +1081,32 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
return BuiltMI;
}
+ unsigned MemImmOpc = 0;
+ switch (Opcode) {
+ case SystemZ::LHIMux:
+ case SystemZ::LHI: MemImmOpc = SystemZ::MVHI; break;
+ case SystemZ::LGHI: MemImmOpc = SystemZ::MVGHI; break;
+ case SystemZ::CHIMux:
+ case SystemZ::CHI: MemImmOpc = SystemZ::CHSI; break;
+ case SystemZ::CGHI: MemImmOpc = SystemZ::CGHSI; break;
+ case SystemZ::CLFIMux:
+ case SystemZ::CLFI:
+ if (isUInt<16>(MI.getOperand(1).getImm()))
+ MemImmOpc = SystemZ::CLFHSI;
+ break;
+ case SystemZ::CLGFI:
+ if (isUInt<16>(MI.getOperand(1).getImm()))
+ MemImmOpc = SystemZ::CLGHSI;
+ break;
+ default: break;
+ }
+ if (MemImmOpc)
+ return BuildMI(*InsertPt->getParent(), InsertPt, MI.getDebugLoc(),
+ get(MemImmOpc))
+ .addFrameIndex(FrameIndex)
+ .addImm(0)
+ .addImm(MI.getOperand(1).getImm());
+
if (Opcode == SystemZ::LGDR || Opcode == SystemZ::LDGR) {
bool Op0IsGPR = (Opcode == SystemZ::LGDR);
bool Op1IsGPR = (Opcode == SystemZ::LDGR);
@@ -1159,57 +1176,144 @@ MachineInstr *SystemZInstrInfo::foldMemoryOperandImpl(
}
// If the spilled operand is the final one or the instruction is
- // commutable, try to change <INSN>R into <INSN>.
+ // commutable, try to change <INSN>R into <INSN>. Don't introduce a def of
+ // CC if it is live and MI does not define it.
unsigned NumOps = MI.getNumExplicitOperands();
int MemOpcode = SystemZ::getMemOpcode(Opcode);
+ if (MemOpcode == -1 ||
+ (CCLiveAtMI && !MI.definesRegister(SystemZ::CC) &&
+ get(MemOpcode).hasImplicitDefOfPhysReg(SystemZ::CC)))
+ return nullptr;
+
+ // Check if all other vregs have a usable allocation in the case of vector
+ // to FP conversion.
+ const MCInstrDesc &MCID = MI.getDesc();
+ for (unsigned I = 0, E = MCID.getNumOperands(); I != E; ++I) {
+ const MCOperandInfo &MCOI = MCID.OpInfo[I];
+ if (MCOI.OperandType != MCOI::OPERAND_REGISTER || I == OpNum)
+ continue;
+ const TargetRegisterClass *RC = TRI->getRegClass(MCOI.RegClass);
+ if (RC == &SystemZ::VR32BitRegClass || RC == &SystemZ::VR64BitRegClass) {
+ Register Reg = MI.getOperand(I).getReg();
+ Register PhysReg = Register::isVirtualRegister(Reg)
+ ? (VRM ? VRM->getPhys(Reg) : Register())
+ : Reg;
+ if (!PhysReg ||
+ !(SystemZ::FP32BitRegClass.contains(PhysReg) ||
+ SystemZ::FP64BitRegClass.contains(PhysReg) ||
+ SystemZ::VF128BitRegClass.contains(PhysReg)))
+ return nullptr;
+ }
+ }
+ // Fused multiply and add/sub need to have the same dst and accumulator reg.
+ bool FusedFPOp = (Opcode == SystemZ::WFMADB || Opcode == SystemZ::WFMASB ||
+ Opcode == SystemZ::WFMSDB || Opcode == SystemZ::WFMSSB);
+ if (FusedFPOp) {
+ Register DstReg = VRM->getPhys(MI.getOperand(0).getReg());
+ Register AccReg = VRM->getPhys(MI.getOperand(3).getReg());
+ if (OpNum == 0 || OpNum == 3 || DstReg != AccReg)
+ return nullptr;
+ }
+
+ // Try to swap compare operands if possible.
+ bool NeedsCommute = false;
+ if ((MI.getOpcode() == SystemZ::CR || MI.getOpcode() == SystemZ::CGR ||
+ MI.getOpcode() == SystemZ::CLR || MI.getOpcode() == SystemZ::CLGR ||
+ MI.getOpcode() == SystemZ::WFCDB || MI.getOpcode() == SystemZ::WFCSB ||
+ MI.getOpcode() == SystemZ::WFKDB || MI.getOpcode() == SystemZ::WFKSB) &&
+ OpNum == 0 && prepareCompareSwapOperands(MI))
+ NeedsCommute = true;
+
+ bool CCOperands = false;
+ if (MI.getOpcode() == SystemZ::LOCRMux || MI.getOpcode() == SystemZ::LOCGR ||
+ MI.getOpcode() == SystemZ::SELRMux || MI.getOpcode() == SystemZ::SELGR) {
+ assert(MI.getNumOperands() == 6 && NumOps == 5 &&
+ "LOCR/SELR instruction operands corrupt?");
+ NumOps -= 2;
+ CCOperands = true;
+ }
// See if this is a 3-address instruction that is convertible to 2-address
// and suitable for folding below. Only try this with virtual registers
// and a provided VRM (during regalloc).
- bool NeedsCommute = false;
- if (SystemZ::getTwoOperandOpcode(Opcode) != -1 && MemOpcode != -1) {
+ if (NumOps == 3 && SystemZ::getTargetMemOpcode(MemOpcode) != -1) {
if (VRM == nullptr)
- MemOpcode = -1;
+ return nullptr;
else {
- assert(NumOps == 3 && "Expected two source registers.");
Register DstReg = MI.getOperand(0).getReg();
Register DstPhys =
(Register::isVirtualRegister(DstReg) ? VRM->getPhys(DstReg) : DstReg);
Register SrcReg = (OpNum == 2 ? MI.getOperand(1).getReg()
: ((OpNum == 1 && MI.isCommutable())
? MI.getOperand(2).getReg()
- : Register()));
+ : Register()));
if (DstPhys && !SystemZ::GRH32BitRegClass.contains(DstPhys) && SrcReg &&
Register::isVirtualRegister(SrcReg) &&
DstPhys == VRM->getPhys(SrcReg))
NeedsCommute = (OpNum == 1);
else
- MemOpcode = -1;
+ return nullptr;
}
}
- if (MemOpcode >= 0) {
- if ((OpNum == NumOps - 1) || NeedsCommute) {
- const MCInstrDesc &MemDesc = get(MemOpcode);
- uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
- assert(AccessBytes != 0 && "Size of access should be known");
- assert(AccessBytes <= Size && "Access outside the frame index");
- uint64_t Offset = Size - AccessBytes;
- MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
- MI.getDebugLoc(), get(MemOpcode));
+ if ((OpNum == NumOps - 1) || NeedsCommute || FusedFPOp) {
+ const MCInstrDesc &MemDesc = get(MemOpcode);
+ uint64_t AccessBytes = SystemZII::getAccessSize(MemDesc.TSFlags);
+ assert(AccessBytes != 0 && "Size of access should be known");
+ assert(AccessBytes <= Size && "Access outside the frame index");
+ uint64_t Offset = Size - AccessBytes;
+ MachineInstrBuilder MIB = BuildMI(*InsertPt->getParent(), InsertPt,
+ MI.getDebugLoc(), get(MemOpcode));
+ if (MI.isCompare()) {
+ assert(NumOps == 2 && "Expected 2 register operands for a compare.");
+ MIB.add(MI.getOperand(NeedsCommute ? 1 : 0));
+ }
+ else if (FusedFPOp) {
+ MIB.add(MI.getOperand(0));
+ MIB.add(MI.getOperand(3));
+ MIB.add(MI.getOperand(OpNum == 1 ? 2 : 1));
+ }
+ else {
MIB.add(MI.getOperand(0));
if (NeedsCommute)
MIB.add(MI.getOperand(2));
else
for (unsigned I = 1; I < OpNum; ++I)
MIB.add(MI.getOperand(I));
- MIB.addFrameIndex(FrameIndex).addImm(Offset);
- if (MemDesc.TSFlags & SystemZII::HasIndex)
- MIB.addReg(0);
- transferDeadCC(&MI, MIB);
- transferMIFlag(&MI, MIB, MachineInstr::NoSWrap);
- return MIB;
}
+ MIB.addFrameIndex(FrameIndex).addImm(Offset);
+ if (MemDesc.TSFlags & SystemZII::HasIndex)
+ MIB.addReg(0);
+ if (CCOperands) {
+ unsigned CCValid = MI.getOperand(NumOps).getImm();
+ unsigned CCMask = MI.getOperand(NumOps + 1).getImm();
+ MIB.addImm(CCValid);
+ MIB.addImm(NeedsCommute ? CCMask ^ CCValid : CCMask);
+ }
+ if (MIB->definesRegister(SystemZ::CC) &&
+ (!MI.definesRegister(SystemZ::CC) ||
+ MI.registerDefIsDead(SystemZ::CC))) {
+ MIB->addRegisterDead(SystemZ::CC, TRI);
+ if (CCLiveRange)
+ CCLiveRange->createDeadDef(MISlot, LIS->getVNInfoAllocator());
+ }
+ // Constrain the register classes if converted from a vector opcode. The
+ // allocated regs are in an FP reg-class per previous check above.
+ for (const MachineOperand &MO : MIB->operands())
+ if (MO.isReg() && Register::isVirtualRegister(MO.getReg())) {
+ unsigned Reg = MO.getReg();
+ if (MRI.getRegClass(Reg) == &SystemZ::VR32BitRegClass)
+ MRI.setRegClass(Reg, &SystemZ::FP32BitRegClass);
+ else if (MRI.getRegClass(Reg) == &SystemZ::VR64BitRegClass)
+ MRI.setRegClass(Reg, &SystemZ::FP64BitRegClass);
+ else if (MRI.getRegClass(Reg) == &SystemZ::VR128BitRegClass)
+ MRI.setRegClass(Reg, &SystemZ::VF128BitRegClass);
+ }
+
+ transferDeadCC(&MI, MIB);
+ transferMIFlag(&MI, MIB, MachineInstr::NoSWrap);
+ transferMIFlag(&MI, MIB, MachineInstr::NoFPExcept);
+ return MIB;
}
return nullptr;
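Aside, not part of the patch: in the LOCRMux/SELRMux/LOCGR/SELGR folding above, a commuted fold emits CCMask ^ CCValid as the new CC mask. XOR with the set of valid CC bits is just the complement of the condition within the CC values that can actually occur, which the minimal standalone C++ sketch below checks. The mask values are made up for illustration and are not the real SystemZ encodings.

  #include <cassert>

  int main() {
    // Illustrative values only: three CC values are possible (CCValid) and
    // the original condition holds for two of them (CCMask).
    const unsigned CCValid = 0b1110;
    const unsigned CCMask = 0b0110;
    const unsigned Inverted = CCMask ^ CCValid; // flip the condition
    assert((Inverted & CCMask) == 0);           // disjoint from the old condition
    assert((Inverted | CCMask) == CCValid);     // together they cover all valid CCs
    return 0;
  }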
@@ -1718,6 +1822,80 @@ unsigned SystemZInstrInfo::getFusedCompare(unsigned Opcode,
return 0;
}
+bool SystemZInstrInfo::
+prepareCompareSwapOperands(MachineBasicBlock::iterator const MBBI) const {
+ assert(MBBI->isCompare() && MBBI->getOperand(0).isReg() &&
+ MBBI->getOperand(1).isReg() && !MBBI->mayLoad() &&
+ "Not a compare reg/reg.");
+
+ MachineBasicBlock *MBB = MBBI->getParent();
+ bool CCLive = true;
+ SmallVector<MachineInstr *, 4> CCUsers;
+ for (MachineBasicBlock::iterator Itr = std::next(MBBI);
+ Itr != MBB->end(); ++Itr) {
+ if (Itr->readsRegister(SystemZ::CC)) {
+ unsigned Flags = Itr->getDesc().TSFlags;
+ if ((Flags & SystemZII::CCMaskFirst) || (Flags & SystemZII::CCMaskLast))
+ CCUsers.push_back(&*Itr);
+ else
+ return false;
+ }
+ if (Itr->definesRegister(SystemZ::CC)) {
+ CCLive = false;
+ break;
+ }
+ }
+ if (CCLive) {
+ LivePhysRegs LiveRegs(*MBB->getParent()->getSubtarget().getRegisterInfo());
+ LiveRegs.addLiveOuts(*MBB);
+ if (LiveRegs.contains(SystemZ::CC))
+ return false;
+ }
+
+ // Update all CC users.
+ for (unsigned Idx = 0; Idx < CCUsers.size(); ++Idx) {
+ unsigned Flags = CCUsers[Idx]->getDesc().TSFlags;
+ unsigned FirstOpNum = ((Flags & SystemZII::CCMaskFirst) ?
+ 0 : CCUsers[Idx]->getNumExplicitOperands() - 2);
+ MachineOperand &CCMaskMO = CCUsers[Idx]->getOperand(FirstOpNum + 1);
+ unsigned NewCCMask = SystemZ::reverseCCMask(CCMaskMO.getImm());
+ CCMaskMO.setImm(NewCCMask);
+ }
+
+ return true;
+}
+
+unsigned SystemZ::reverseCCMask(unsigned CCMask) {
+ return ((CCMask & SystemZ::CCMASK_CMP_EQ) |
+ (CCMask & SystemZ::CCMASK_CMP_GT ? SystemZ::CCMASK_CMP_LT : 0) |
+ (CCMask & SystemZ::CCMASK_CMP_LT ? SystemZ::CCMASK_CMP_GT : 0) |
+ (CCMask & SystemZ::CCMASK_CMP_UO));
+}
+
+MachineBasicBlock *SystemZ::emitBlockAfter(MachineBasicBlock *MBB) {
+ MachineFunction &MF = *MBB->getParent();
+ MachineBasicBlock *NewMBB = MF.CreateMachineBasicBlock(MBB->getBasicBlock());
+ MF.insert(std::next(MachineFunction::iterator(MBB)), NewMBB);
+ return NewMBB;
+}
+
+MachineBasicBlock *SystemZ::splitBlockAfter(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB) {
+ MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+ NewMBB->splice(NewMBB->begin(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ return NewMBB;
+}
+
+MachineBasicBlock *SystemZ::splitBlockBefore(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB) {
+ MachineBasicBlock *NewMBB = emitBlockAfter(MBB);
+ NewMBB->splice(NewMBB->begin(), MBB, MI, MBB->end());
+ NewMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ return NewMBB;
+}
+
unsigned SystemZInstrInfo::getLoadAndTrap(unsigned Opcode) const {
if (!STI.hasLoadAndTrap())
return 0;
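Aside, not part of the patch: SystemZ::reverseCCMask, added above, is what lets prepareCompareSwapOperands keep CC users correct after a compare's operands are swapped: the EQ and UO outcomes are unchanged while LT and GT trade places, so only those two bits of each user's mask move. Below is a self-contained C++ sketch using illustrative bit values; the real CCMASK_CMP_* constants are defined elsewhere in the backend and are not reproduced here.

  #include <cassert>

  // Illustrative stand-ins for the CC mask bits of a compare.
  constexpr unsigned CMP_EQ = 8, CMP_LT = 4, CMP_GT = 2, CMP_UO = 1;

  // Same logic as the reverseCCMask added by the patch, restated over the
  // stand-in constants.
  unsigned reverseCCMaskDemo(unsigned CCMask) {
    return (CCMask & CMP_EQ) |
           ((CCMask & CMP_GT) ? CMP_LT : 0) |
           ((CCMask & CMP_LT) ? CMP_GT : 0) |
           (CCMask & CMP_UO);
  }

  int main() {
    // A user that acted on "a < b" must act on "b > a" after the swap.
    assert(reverseCCMaskDemo(CMP_LT) == CMP_GT);
    // "a <= b" (LT|EQ) becomes "b >= a" (GT|EQ); EQ is unaffected.
    assert(reverseCCMaskDemo(CMP_LT | CMP_EQ) == (CMP_GT | CMP_EQ));
    return 0;
  }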
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
index 8391970c7d9d..72dafc3c93c2 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.h
@@ -155,6 +155,20 @@ enum FusedCompareType {
namespace SystemZ {
int getTwoOperandOpcode(uint16_t Opcode);
int getTargetMemOpcode(uint16_t Opcode);
+
+// Return a version of comparison CC mask CCMask in which the LT and GT
+// actions are swapped.
+unsigned reverseCCMask(unsigned CCMask);
+
+// Create a new basic block after MBB.
+MachineBasicBlock *emitBlockAfter(MachineBasicBlock *MBB);
+// Split MBB after MI and return the new block (the one that contains
+// instructions after MI).
+MachineBasicBlock *splitBlockAfter(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB);
+// Split MBB before MI and return the new block (the one that contains MI).
+MachineBasicBlock *splitBlockBefore(MachineBasicBlock::iterator MI,
+ MachineBasicBlock *MBB);
}
class SystemZInstrInfo : public SystemZGenInstrInfo {
@@ -219,15 +233,16 @@ public:
MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &Mask, int &Value) const override;
- bool canInsertSelect(const MachineBasicBlock&, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int&, int&, int&) const override;
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &Mask, int &Value) const override;
+ bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
+ Register, Register, Register, int &, int &,
+ int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const override;
- bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, unsigned Reg,
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
+ bool FoldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, Register Reg,
MachineRegisterInfo *MRI) const override;
bool isPredicable(const MachineInstr &MI) const override;
bool isProfitableToIfCvt(MachineBasicBlock &MBB, unsigned NumCycles,
@@ -247,12 +262,12 @@ public:
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- unsigned DestReg, int FrameIdx,
+ Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
MachineInstr *convertToThreeAddress(MachineFunction::iterator &MFI,
@@ -313,6 +328,12 @@ public:
SystemZII::FusedCompareType Type,
const MachineInstr *MI = nullptr) const;
+ // Try to find all CC users of the compare instruction (MBBI) and update
+ // all of them to maintain equivalent behavior after swapping the compare
+ // operands. Return false if not all users can be conclusively found and
+ // handled. The compare instruction is *not* changed.
+ bool prepareCompareSwapOperands(MachineBasicBlock::iterator MBBI) const;
+
// If Opcode is a LOAD opcode for which an associated LOAD AND TRAP
// operation exists, return the opcode for the latter, otherwise return 0.
unsigned getLoadAndTrap(unsigned Opcode) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
index 9579dcc0d1b6..d5d56ecf6e47 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrInfo.td
@@ -29,6 +29,15 @@ let hasNoSchedulingInfo = 1, hasSideEffects = 1 in {
def ADJDYNALLOC : Pseudo<(outs GR64:$dst), (ins dynalloc12only:$src),
[(set GR64:$dst, dynalloc12only:$src)]>;
+let Defs = [R15D, CC], Uses = [R15D], hasNoSchedulingInfo = 1,
+ usesCustomInserter = 1 in
+ def PROBED_ALLOCA : Pseudo<(outs GR64:$dst),
+ (ins GR64:$oldSP, GR64:$space),
+ [(set GR64:$dst, (z_probed_alloca GR64:$oldSP, GR64:$space))]>;
+
+let Defs = [R1D, R15D, CC], Uses = [R15D], hasNoSchedulingInfo = 1,
+ hasSideEffects = 1 in
+ def PROBED_STACKALLOC : Pseudo<(outs), (ins i64imm:$stacksize), []>;
//===----------------------------------------------------------------------===//
// Branch instructions
@@ -492,7 +501,7 @@ let Predicates = [FeatureMiscellaneousExtensions3], Uses = [CC] in {
let isCommutable = 1 in {
// Expands to SELR or SELFHR or a branch-and-move sequence,
// depending on the choice of registers.
- def SELRMux : CondBinaryRRFaPseudo<"selrmux", GRX32, GRX32, GRX32>;
+ def SELRMux : CondBinaryRRFaPseudo<"MUXselr", GRX32, GRX32, GRX32>;
defm SELFHR : CondBinaryRRFaPair<"selfhr", 0xB9C0, GRH32, GRH32, GRH32>;
defm SELR : CondBinaryRRFaPair<"selr", 0xB9F0, GR32, GR32, GR32>;
defm SELGR : CondBinaryRRFaPair<"selgr", 0xB9E3, GR64, GR64, GR64>;
@@ -525,13 +534,13 @@ let Predicates = [FeatureLoadStoreOnCond2], Uses = [CC] in {
let isCommutable = 1 in {
// Expands to LOCR or LOCFHR or a branch-and-move sequence,
// depending on the choice of registers.
- def LOCRMux : CondBinaryRRFPseudo<"locrmux", GRX32, GRX32>;
+ def LOCRMux : CondBinaryRRFPseudo<"MUXlocr", GRX32, GRX32>;
defm LOCFHR : CondBinaryRRFPair<"locfhr", 0xB9E0, GRH32, GRH32>;
}
// Load on condition. Matched via DAG pattern.
// Expands to LOC or LOCFH, depending on the choice of register.
- def LOCMux : CondUnaryRSYPseudo<simple_load, GRX32, 4>;
+ defm LOCMux : CondUnaryRSYPseudoAndMemFold<"MUXloc", simple_load, GRX32, 4>;
defm LOCFH : CondUnaryRSYPair<"locfh", 0xEBE0, simple_load, GRH32, 4>;
// Store on condition. Expanded from CondStore* pseudos.
@@ -564,7 +573,7 @@ let Predicates = [FeatureLoadStoreOnCond], Uses = [CC] in {
// Load on condition. Matched via DAG pattern.
defm LOC : CondUnaryRSYPair<"loc", 0xEBF2, simple_load, GR32, 4>;
- defm LOCG : CondUnaryRSYPair<"locg", 0xEBE2, simple_load, GR64, 8>;
+ defm LOCG : CondUnaryRSYPairAndMemFold<"locg", 0xEBE2, simple_load, GR64, 8>;
// Store on condition. Expanded from CondStore* pseudos.
defm STOC : CondStoreRSYPair<"stoc", 0xEBF3, GR32, 4>;
@@ -1348,8 +1357,8 @@ def MSG : BinaryRXY<"msg", 0xE30C, mul, GR64, load, 8>;
// Multiplication of memory, setting the condition code.
let Predicates = [FeatureMiscellaneousExtensions2], Defs = [CC] in {
- def MSC : BinaryRXY<"msc", 0xE353, null_frag, GR32, load, 4>;
- def MSGC : BinaryRXY<"msgc", 0xE383, null_frag, GR64, load, 8>;
+ defm MSC : BinaryRXYAndPseudo<"msc", 0xE353, null_frag, GR32, load, 4>;
+ defm MSGC : BinaryRXYAndPseudo<"msgc", 0xE383, null_frag, GR64, load, 8>;
}
// Multiplication of a register, producing two results.
diff --git a/llvm/lib/Target/SystemZ/SystemZInstrVector.td b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
index c945122ee577..e73f1e429c3c 100644
--- a/llvm/lib/Target/SystemZ/SystemZInstrVector.td
+++ b/llvm/lib/Target/SystemZ/SystemZInstrVector.td
@@ -177,9 +177,13 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVectorPackedDecimal] in {
// Load rightmost with length. The number of loaded bytes is only known
- // at run time.
- def VLRL : BinaryVSI<"vlrl", 0xE635, int_s390_vlrl, 0>;
+ // at run time. Note that while the instruction will accept immediate
+ // lengths larger than 15 at runtime, those will always result in a trap,
+ // so we never emit them here.
+ def VLRL : BinaryVSI<"vlrl", 0xE635, null_frag, 0>;
def VLRLR : BinaryVRSd<"vlrlr", 0xE637, int_s390_vlrl, 0>;
+ def : Pat<(int_s390_vlrl imm32zx4:$len, bdaddr12only:$addr),
+ (VLRL bdaddr12only:$addr, imm32zx4:$len)>;
}
// Use replicating loads if we're inserting a single element into an
@@ -243,9 +247,13 @@ let Predicates = [FeatureVector] in {
let Predicates = [FeatureVectorPackedDecimal] in {
// Store rightmost with length. The number of stored bytes is only known
- // at run time.
- def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, int_s390_vstrl, 0>;
+ // at run time. Note that while the instruction will accept immediate
+ // lengths larger than 15 at runtime, those will always result in a trap,
+ // so we never emit them here.
+ def VSTRL : StoreLengthVSI<"vstrl", 0xE63D, null_frag, 0>;
def VSTRLR : StoreLengthVRSd<"vstrlr", 0xE63F, int_s390_vstrl, 0>;
+ def : Pat<(int_s390_vstrl VR128:$val, imm32zx4:$len, bdaddr12only:$addr),
+ (VSTRL VR128:$val, bdaddr12only:$addr, imm32zx4:$len)>;
}
//===----------------------------------------------------------------------===//
@@ -463,49 +471,56 @@ defm : GenericVectorOps<v2f64, v2i64>;
//===----------------------------------------------------------------------===//
let Predicates = [FeatureVector] in {
- // Add.
- def VA : BinaryVRRcGeneric<"va", 0xE7F3>;
- def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>;
- def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>;
- def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>;
- def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>;
- def VAQ : BinaryVRRc<"vaq", 0xE7F3, int_s390_vaq, v128q, v128q, 4>;
-
- // Add compute carry.
- def VACC : BinaryVRRcGeneric<"vacc", 0xE7F1>;
- def VACCB : BinaryVRRc<"vaccb", 0xE7F1, int_s390_vaccb, v128b, v128b, 0>;
- def VACCH : BinaryVRRc<"vacch", 0xE7F1, int_s390_vacch, v128h, v128h, 1>;
- def VACCF : BinaryVRRc<"vaccf", 0xE7F1, int_s390_vaccf, v128f, v128f, 2>;
- def VACCG : BinaryVRRc<"vaccg", 0xE7F1, int_s390_vaccg, v128g, v128g, 3>;
- def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, int_s390_vaccq, v128q, v128q, 4>;
-
- // Add with carry.
- def VAC : TernaryVRRdGeneric<"vac", 0xE7BB>;
- def VACQ : TernaryVRRd<"vacq", 0xE7BB, int_s390_vacq, v128q, v128q, 4>;
-
- // Add with carry compute carry.
- def VACCC : TernaryVRRdGeneric<"vaccc", 0xE7B9>;
- def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, int_s390_vacccq, v128q, v128q, 4>;
+ let isCommutable = 1 in {
+ // Add.
+ def VA : BinaryVRRcGeneric<"va", 0xE7F3>;
+ def VAB : BinaryVRRc<"vab", 0xE7F3, add, v128b, v128b, 0>;
+ def VAH : BinaryVRRc<"vah", 0xE7F3, add, v128h, v128h, 1>;
+ def VAF : BinaryVRRc<"vaf", 0xE7F3, add, v128f, v128f, 2>;
+ def VAG : BinaryVRRc<"vag", 0xE7F3, add, v128g, v128g, 3>;
+ def VAQ : BinaryVRRc<"vaq", 0xE7F3, int_s390_vaq, v128q, v128q, 4>;
+ }
+
+ let isCommutable = 1 in {
+ // Add compute carry.
+ def VACC : BinaryVRRcGeneric<"vacc", 0xE7F1>;
+ def VACCB : BinaryVRRc<"vaccb", 0xE7F1, int_s390_vaccb, v128b, v128b, 0>;
+ def VACCH : BinaryVRRc<"vacch", 0xE7F1, int_s390_vacch, v128h, v128h, 1>;
+ def VACCF : BinaryVRRc<"vaccf", 0xE7F1, int_s390_vaccf, v128f, v128f, 2>;
+ def VACCG : BinaryVRRc<"vaccg", 0xE7F1, int_s390_vaccg, v128g, v128g, 3>;
+ def VACCQ : BinaryVRRc<"vaccq", 0xE7F1, int_s390_vaccq, v128q, v128q, 4>;
+
+ // Add with carry.
+ def VAC : TernaryVRRdGeneric<"vac", 0xE7BB>;
+ def VACQ : TernaryVRRd<"vacq", 0xE7BB, int_s390_vacq, v128q, v128q, 4>;
+
+ // Add with carry compute carry.
+ def VACCC : TernaryVRRdGeneric<"vaccc", 0xE7B9>;
+ def VACCCQ : TernaryVRRd<"vacccq", 0xE7B9, int_s390_vacccq, v128q, v128q, 4>;
+ }
// And.
- def VN : BinaryVRRc<"vn", 0xE768, null_frag, v128any, v128any>;
+ let isCommutable = 1 in
+ def VN : BinaryVRRc<"vn", 0xE768, null_frag, v128any, v128any>;
// And with complement.
def VNC : BinaryVRRc<"vnc", 0xE769, null_frag, v128any, v128any>;
- // Average.
- def VAVG : BinaryVRRcGeneric<"vavg", 0xE7F2>;
- def VAVGB : BinaryVRRc<"vavgb", 0xE7F2, int_s390_vavgb, v128b, v128b, 0>;
- def VAVGH : BinaryVRRc<"vavgh", 0xE7F2, int_s390_vavgh, v128h, v128h, 1>;
- def VAVGF : BinaryVRRc<"vavgf", 0xE7F2, int_s390_vavgf, v128f, v128f, 2>;
- def VAVGG : BinaryVRRc<"vavgg", 0xE7F2, int_s390_vavgg, v128g, v128g, 3>;
-
- // Average logical.
- def VAVGL : BinaryVRRcGeneric<"vavgl", 0xE7F0>;
- def VAVGLB : BinaryVRRc<"vavglb", 0xE7F0, int_s390_vavglb, v128b, v128b, 0>;
- def VAVGLH : BinaryVRRc<"vavglh", 0xE7F0, int_s390_vavglh, v128h, v128h, 1>;
- def VAVGLF : BinaryVRRc<"vavglf", 0xE7F0, int_s390_vavglf, v128f, v128f, 2>;
- def VAVGLG : BinaryVRRc<"vavglg", 0xE7F0, int_s390_vavglg, v128g, v128g, 3>;
+ let isCommutable = 1 in {
+ // Average.
+ def VAVG : BinaryVRRcGeneric<"vavg", 0xE7F2>;
+ def VAVGB : BinaryVRRc<"vavgb", 0xE7F2, int_s390_vavgb, v128b, v128b, 0>;
+ def VAVGH : BinaryVRRc<"vavgh", 0xE7F2, int_s390_vavgh, v128h, v128h, 1>;
+ def VAVGF : BinaryVRRc<"vavgf", 0xE7F2, int_s390_vavgf, v128f, v128f, 2>;
+ def VAVGG : BinaryVRRc<"vavgg", 0xE7F2, int_s390_vavgg, v128g, v128g, 3>;
+
+ // Average logical.
+ def VAVGL : BinaryVRRcGeneric<"vavgl", 0xE7F0>;
+ def VAVGLB : BinaryVRRc<"vavglb", 0xE7F0, int_s390_vavglb, v128b, v128b, 0>;
+ def VAVGLH : BinaryVRRc<"vavglh", 0xE7F0, int_s390_vavglh, v128h, v128h, 1>;
+ def VAVGLF : BinaryVRRc<"vavglf", 0xE7F0, int_s390_vavglf, v128f, v128f, 2>;
+ def VAVGLG : BinaryVRRc<"vavglg", 0xE7F0, int_s390_vavglg, v128g, v128g, 3>;
+ }
// Checksum.
def VCKSM : BinaryVRRc<"vcksm", 0xE766, int_s390_vcksm, v128f, v128f>;
@@ -524,12 +539,14 @@ let Predicates = [FeatureVector] in {
def VCTZF : UnaryVRRa<"vctzf", 0xE752, cttz, v128f, v128f, 2>;
def VCTZG : UnaryVRRa<"vctzg", 0xE752, cttz, v128g, v128g, 3>;
- // Not exclusive or.
- let Predicates = [FeatureVectorEnhancements1] in
- def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>;
+ let isCommutable = 1 in {
+ // Not exclusive or.
+ let Predicates = [FeatureVectorEnhancements1] in
+ def VNX : BinaryVRRc<"vnx", 0xE76C, null_frag, v128any, v128any>;
- // Exclusive or.
- def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>;
+ // Exclusive or.
+ def VX : BinaryVRRc<"vx", 0xE76D, null_frag, v128any, v128any>;
+ }
// Galois field multiply sum.
def VGFM : BinaryVRRcGeneric<"vgfm", 0xE7B4>;
@@ -559,135 +576,145 @@ let Predicates = [FeatureVector] in {
def VLPF : UnaryVRRa<"vlpf", 0xE7DF, z_viabs32, v128f, v128f, 2>;
def VLPG : UnaryVRRa<"vlpg", 0xE7DF, z_viabs64, v128g, v128g, 3>;
- // Maximum.
- def VMX : BinaryVRRcGeneric<"vmx", 0xE7FF>;
- def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>;
- def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>;
- def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>;
- def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>;
-
- // Maximum logical.
- def VMXL : BinaryVRRcGeneric<"vmxl", 0xE7FD>;
- def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>;
- def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>;
- def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>;
- def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>;
+ let isCommutable = 1 in {
+ // Maximum.
+ def VMX : BinaryVRRcGeneric<"vmx", 0xE7FF>;
+ def VMXB : BinaryVRRc<"vmxb", 0xE7FF, null_frag, v128b, v128b, 0>;
+ def VMXH : BinaryVRRc<"vmxh", 0xE7FF, null_frag, v128h, v128h, 1>;
+ def VMXF : BinaryVRRc<"vmxf", 0xE7FF, null_frag, v128f, v128f, 2>;
+ def VMXG : BinaryVRRc<"vmxg", 0xE7FF, null_frag, v128g, v128g, 3>;
+
+ // Maximum logical.
+ def VMXL : BinaryVRRcGeneric<"vmxl", 0xE7FD>;
+ def VMXLB : BinaryVRRc<"vmxlb", 0xE7FD, null_frag, v128b, v128b, 0>;
+ def VMXLH : BinaryVRRc<"vmxlh", 0xE7FD, null_frag, v128h, v128h, 1>;
+ def VMXLF : BinaryVRRc<"vmxlf", 0xE7FD, null_frag, v128f, v128f, 2>;
+ def VMXLG : BinaryVRRc<"vmxlg", 0xE7FD, null_frag, v128g, v128g, 3>;
+ }
- // Minimum.
- def VMN : BinaryVRRcGeneric<"vmn", 0xE7FE>;
- def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>;
- def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>;
- def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>;
- def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>;
-
- // Minimum logical.
- def VMNL : BinaryVRRcGeneric<"vmnl", 0xE7FC>;
- def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>;
- def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>;
- def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>;
- def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>;
-
- // Multiply and add low.
- def VMAL : TernaryVRRdGeneric<"vmal", 0xE7AA>;
- def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>;
- def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>;
- def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>;
-
- // Multiply and add high.
- def VMAH : TernaryVRRdGeneric<"vmah", 0xE7AB>;
- def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>;
- def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>;
- def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>;
-
- // Multiply and add logical high.
- def VMALH : TernaryVRRdGeneric<"vmalh", 0xE7A9>;
- def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>;
- def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>;
- def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>;
-
- // Multiply and add even.
- def VMAE : TernaryVRRdGeneric<"vmae", 0xE7AE>;
- def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>;
- def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>;
- def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>;
-
- // Multiply and add logical even.
- def VMALE : TernaryVRRdGeneric<"vmale", 0xE7AC>;
- def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>;
- def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>;
- def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>;
-
- // Multiply and add odd.
- def VMAO : TernaryVRRdGeneric<"vmao", 0xE7AF>;
- def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>;
- def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>;
- def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>;
-
- // Multiply and add logical odd.
- def VMALO : TernaryVRRdGeneric<"vmalo", 0xE7AD>;
- def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>;
- def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>;
- def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>;
-
- // Multiply high.
- def VMH : BinaryVRRcGeneric<"vmh", 0xE7A3>;
- def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>;
- def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>;
- def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>;
-
- // Multiply logical high.
- def VMLH : BinaryVRRcGeneric<"vmlh", 0xE7A1>;
- def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>;
- def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>;
- def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>;
-
- // Multiply low.
- def VML : BinaryVRRcGeneric<"vml", 0xE7A2>;
- def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>;
- def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>;
- def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>;
-
- // Multiply even.
- def VME : BinaryVRRcGeneric<"vme", 0xE7A6>;
- def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>;
- def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>;
- def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>;
-
- // Multiply logical even.
- def VMLE : BinaryVRRcGeneric<"vmle", 0xE7A4>;
- def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>;
- def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>;
- def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>;
-
- // Multiply odd.
- def VMO : BinaryVRRcGeneric<"vmo", 0xE7A7>;
- def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>;
- def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>;
- def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>;
-
- // Multiply logical odd.
- def VMLO : BinaryVRRcGeneric<"vmlo", 0xE7A5>;
- def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>;
- def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
- def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+ let isCommutable = 1 in {
+ // Minimum.
+ def VMN : BinaryVRRcGeneric<"vmn", 0xE7FE>;
+ def VMNB : BinaryVRRc<"vmnb", 0xE7FE, null_frag, v128b, v128b, 0>;
+ def VMNH : BinaryVRRc<"vmnh", 0xE7FE, null_frag, v128h, v128h, 1>;
+ def VMNF : BinaryVRRc<"vmnf", 0xE7FE, null_frag, v128f, v128f, 2>;
+ def VMNG : BinaryVRRc<"vmng", 0xE7FE, null_frag, v128g, v128g, 3>;
+
+ // Minimum logical.
+ def VMNL : BinaryVRRcGeneric<"vmnl", 0xE7FC>;
+ def VMNLB : BinaryVRRc<"vmnlb", 0xE7FC, null_frag, v128b, v128b, 0>;
+ def VMNLH : BinaryVRRc<"vmnlh", 0xE7FC, null_frag, v128h, v128h, 1>;
+ def VMNLF : BinaryVRRc<"vmnlf", 0xE7FC, null_frag, v128f, v128f, 2>;
+ def VMNLG : BinaryVRRc<"vmnlg", 0xE7FC, null_frag, v128g, v128g, 3>;
+ }
+
+ let isCommutable = 1 in {
+ // Multiply and add low.
+ def VMAL : TernaryVRRdGeneric<"vmal", 0xE7AA>;
+ def VMALB : TernaryVRRd<"vmalb", 0xE7AA, z_muladd, v128b, v128b, 0>;
+ def VMALHW : TernaryVRRd<"vmalhw", 0xE7AA, z_muladd, v128h, v128h, 1>;
+ def VMALF : TernaryVRRd<"vmalf", 0xE7AA, z_muladd, v128f, v128f, 2>;
+
+ // Multiply and add high.
+ def VMAH : TernaryVRRdGeneric<"vmah", 0xE7AB>;
+ def VMAHB : TernaryVRRd<"vmahb", 0xE7AB, int_s390_vmahb, v128b, v128b, 0>;
+ def VMAHH : TernaryVRRd<"vmahh", 0xE7AB, int_s390_vmahh, v128h, v128h, 1>;
+ def VMAHF : TernaryVRRd<"vmahf", 0xE7AB, int_s390_vmahf, v128f, v128f, 2>;
+
+ // Multiply and add logical high.
+ def VMALH : TernaryVRRdGeneric<"vmalh", 0xE7A9>;
+ def VMALHB : TernaryVRRd<"vmalhb", 0xE7A9, int_s390_vmalhb, v128b, v128b, 0>;
+ def VMALHH : TernaryVRRd<"vmalhh", 0xE7A9, int_s390_vmalhh, v128h, v128h, 1>;
+ def VMALHF : TernaryVRRd<"vmalhf", 0xE7A9, int_s390_vmalhf, v128f, v128f, 2>;
+
+ // Multiply and add even.
+ def VMAE : TernaryVRRdGeneric<"vmae", 0xE7AE>;
+ def VMAEB : TernaryVRRd<"vmaeb", 0xE7AE, int_s390_vmaeb, v128h, v128b, 0>;
+ def VMAEH : TernaryVRRd<"vmaeh", 0xE7AE, int_s390_vmaeh, v128f, v128h, 1>;
+ def VMAEF : TernaryVRRd<"vmaef", 0xE7AE, int_s390_vmaef, v128g, v128f, 2>;
+
+ // Multiply and add logical even.
+ def VMALE : TernaryVRRdGeneric<"vmale", 0xE7AC>;
+ def VMALEB : TernaryVRRd<"vmaleb", 0xE7AC, int_s390_vmaleb, v128h, v128b, 0>;
+ def VMALEH : TernaryVRRd<"vmaleh", 0xE7AC, int_s390_vmaleh, v128f, v128h, 1>;
+ def VMALEF : TernaryVRRd<"vmalef", 0xE7AC, int_s390_vmalef, v128g, v128f, 2>;
+
+ // Multiply and add odd.
+ def VMAO : TernaryVRRdGeneric<"vmao", 0xE7AF>;
+ def VMAOB : TernaryVRRd<"vmaob", 0xE7AF, int_s390_vmaob, v128h, v128b, 0>;
+ def VMAOH : TernaryVRRd<"vmaoh", 0xE7AF, int_s390_vmaoh, v128f, v128h, 1>;
+ def VMAOF : TernaryVRRd<"vmaof", 0xE7AF, int_s390_vmaof, v128g, v128f, 2>;
+
+ // Multiply and add logical odd.
+ def VMALO : TernaryVRRdGeneric<"vmalo", 0xE7AD>;
+ def VMALOB : TernaryVRRd<"vmalob", 0xE7AD, int_s390_vmalob, v128h, v128b, 0>;
+ def VMALOH : TernaryVRRd<"vmaloh", 0xE7AD, int_s390_vmaloh, v128f, v128h, 1>;
+ def VMALOF : TernaryVRRd<"vmalof", 0xE7AD, int_s390_vmalof, v128g, v128f, 2>;
+ }
+
+ let isCommutable = 1 in {
+ // Multiply high.
+ def VMH : BinaryVRRcGeneric<"vmh", 0xE7A3>;
+ def VMHB : BinaryVRRc<"vmhb", 0xE7A3, int_s390_vmhb, v128b, v128b, 0>;
+ def VMHH : BinaryVRRc<"vmhh", 0xE7A3, int_s390_vmhh, v128h, v128h, 1>;
+ def VMHF : BinaryVRRc<"vmhf", 0xE7A3, int_s390_vmhf, v128f, v128f, 2>;
+
+ // Multiply logical high.
+ def VMLH : BinaryVRRcGeneric<"vmlh", 0xE7A1>;
+ def VMLHB : BinaryVRRc<"vmlhb", 0xE7A1, int_s390_vmlhb, v128b, v128b, 0>;
+ def VMLHH : BinaryVRRc<"vmlhh", 0xE7A1, int_s390_vmlhh, v128h, v128h, 1>;
+ def VMLHF : BinaryVRRc<"vmlhf", 0xE7A1, int_s390_vmlhf, v128f, v128f, 2>;
+
+ // Multiply low.
+ def VML : BinaryVRRcGeneric<"vml", 0xE7A2>;
+ def VMLB : BinaryVRRc<"vmlb", 0xE7A2, mul, v128b, v128b, 0>;
+ def VMLHW : BinaryVRRc<"vmlhw", 0xE7A2, mul, v128h, v128h, 1>;
+ def VMLF : BinaryVRRc<"vmlf", 0xE7A2, mul, v128f, v128f, 2>;
+
+ // Multiply even.
+ def VME : BinaryVRRcGeneric<"vme", 0xE7A6>;
+ def VMEB : BinaryVRRc<"vmeb", 0xE7A6, int_s390_vmeb, v128h, v128b, 0>;
+ def VMEH : BinaryVRRc<"vmeh", 0xE7A6, int_s390_vmeh, v128f, v128h, 1>;
+ def VMEF : BinaryVRRc<"vmef", 0xE7A6, int_s390_vmef, v128g, v128f, 2>;
+
+ // Multiply logical even.
+ def VMLE : BinaryVRRcGeneric<"vmle", 0xE7A4>;
+ def VMLEB : BinaryVRRc<"vmleb", 0xE7A4, int_s390_vmleb, v128h, v128b, 0>;
+ def VMLEH : BinaryVRRc<"vmleh", 0xE7A4, int_s390_vmleh, v128f, v128h, 1>;
+ def VMLEF : BinaryVRRc<"vmlef", 0xE7A4, int_s390_vmlef, v128g, v128f, 2>;
+
+ // Multiply odd.
+ def VMO : BinaryVRRcGeneric<"vmo", 0xE7A7>;
+ def VMOB : BinaryVRRc<"vmob", 0xE7A7, int_s390_vmob, v128h, v128b, 0>;
+ def VMOH : BinaryVRRc<"vmoh", 0xE7A7, int_s390_vmoh, v128f, v128h, 1>;
+ def VMOF : BinaryVRRc<"vmof", 0xE7A7, int_s390_vmof, v128g, v128f, 2>;
+
+ // Multiply logical odd.
+ def VMLO : BinaryVRRcGeneric<"vmlo", 0xE7A5>;
+ def VMLOB : BinaryVRRc<"vmlob", 0xE7A5, int_s390_vmlob, v128h, v128b, 0>;
+ def VMLOH : BinaryVRRc<"vmloh", 0xE7A5, int_s390_vmloh, v128f, v128h, 1>;
+ def VMLOF : BinaryVRRc<"vmlof", 0xE7A5, int_s390_vmlof, v128g, v128f, 2>;
+ }
// Multiply sum logical.
- let Predicates = [FeatureVectorEnhancements1] in {
+ let Predicates = [FeatureVectorEnhancements1], isCommutable = 1 in {
def VMSL : QuaternaryVRRdGeneric<"vmsl", 0xE7B8>;
def VMSLG : QuaternaryVRRd<"vmslg", 0xE7B8, int_s390_vmslg,
v128q, v128g, v128g, v128q, 3>;
}
// Nand.
- let Predicates = [FeatureVectorEnhancements1] in
+ let Predicates = [FeatureVectorEnhancements1], isCommutable = 1 in
def VNN : BinaryVRRc<"vnn", 0xE76E, null_frag, v128any, v128any>;
// Nor.
- def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>;
+ let isCommutable = 1 in
+ def VNO : BinaryVRRc<"vno", 0xE76B, null_frag, v128any, v128any>;
def : InstAlias<"vnot\t$V1, $V2", (VNO VR128:$V1, VR128:$V2, VR128:$V2), 0>;
// Or.
- def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>;
+ let isCommutable = 1 in
+ def VO : BinaryVRRc<"vo", 0xE76A, null_frag, v128any, v128any>;
// Or with complement.
let Predicates = [FeatureVectorEnhancements1] in
@@ -1017,13 +1044,15 @@ multiclass VectorRounding<Instruction insn, TypedReg tr> {
let Predicates = [FeatureVector] in {
// Add.
- let Uses = [FPC], mayRaiseFPException = 1 in {
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFA : BinaryVRRcFloatGeneric<"vfa", 0xE7E3>;
def VFADB : BinaryVRRc<"vfadb", 0xE7E3, any_fadd, v128db, v128db, 3, 0>;
- def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8>;
+ def WFADB : BinaryVRRc<"wfadb", 0xE7E3, any_fadd, v64db, v64db, 3, 8, 0,
+ "adbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFASB : BinaryVRRc<"vfasb", 0xE7E3, any_fadd, v128sb, v128sb, 2, 0>;
- def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8>;
+ def WFASB : BinaryVRRc<"wfasb", 0xE7E3, any_fadd, v32sb, v32sb, 2, 8, 0,
+ "aebr">;
def WFAXB : BinaryVRRc<"wfaxb", 0xE7E3, any_fadd, v128xb, v128xb, 4, 8>;
}
}
@@ -1104,10 +1133,12 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
def VFD : BinaryVRRcFloatGeneric<"vfd", 0xE7E5>;
def VFDDB : BinaryVRRc<"vfddb", 0xE7E5, any_fdiv, v128db, v128db, 3, 0>;
- def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, any_fdiv, v64db, v64db, 3, 8>;
+ def WFDDB : BinaryVRRc<"wfddb", 0xE7E5, any_fdiv, v64db, v64db, 3, 8, 0,
+ "ddbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFDSB : BinaryVRRc<"vfdsb", 0xE7E5, any_fdiv, v128sb, v128sb, 2, 0>;
- def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, any_fdiv, v32sb, v32sb, 2, 8>;
+ def WFDSB : BinaryVRRc<"wfdsb", 0xE7E5, any_fdiv, v32sb, v32sb, 2, 8, 0,
+ "debr">;
def WFDXB : BinaryVRRc<"wfdxb", 0xE7E5, any_fdiv, v128xb, v128xb, 4, 8>;
}
}
@@ -1135,7 +1166,8 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
def VLDE : UnaryVRRaFloatGeneric<"vlde", 0xE7C4>;
def VLDEB : UnaryVRRa<"vldeb", 0xE7C4, z_any_vextend, v128db, v128sb, 2, 0>;
- def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, any_fpextend, v64db, v32sb, 2, 8>;
+ def WLDEB : UnaryVRRa<"wldeb", 0xE7C4, any_fpextend, v64db, v32sb, 2, 8, 0,
+ "ldebr">;
}
let Predicates = [FeatureVectorEnhancements1] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
@@ -1178,7 +1210,7 @@ let Predicates = [FeatureVector] in {
def : FPMinMax<insn, any_fmaximum, tr, 1>;
}
let Predicates = [FeatureVectorEnhancements1] in {
- let Uses = [FPC], mayRaiseFPException = 1 in {
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFMAX : TernaryVRRcFloatGeneric<"vfmax", 0xE7EF>;
def VFMAXDB : TernaryVRRcFloat<"vfmaxdb", 0xE7EF, int_s390_vfmaxdb,
v128db, v128db, 3, 0>;
@@ -1204,7 +1236,7 @@ let Predicates = [FeatureVector] in {
def : FPMinMax<insn, any_fminimum, tr, 1>;
}
let Predicates = [FeatureVectorEnhancements1] in {
- let Uses = [FPC], mayRaiseFPException = 1 in {
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFMIN : TernaryVRRcFloatGeneric<"vfmin", 0xE7EE>;
def VFMINDB : TernaryVRRcFloat<"vfmindb", 0xE7EE, int_s390_vfmindb,
v128db, v128db, 3, 0>;
@@ -1225,43 +1257,49 @@ let Predicates = [FeatureVector] in {
}
// Multiply.
- let Uses = [FPC], mayRaiseFPException = 1 in {
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFM : BinaryVRRcFloatGeneric<"vfm", 0xE7E7>;
def VFMDB : BinaryVRRc<"vfmdb", 0xE7E7, any_fmul, v128db, v128db, 3, 0>;
- def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8>;
+ def WFMDB : BinaryVRRc<"wfmdb", 0xE7E7, any_fmul, v64db, v64db, 3, 8, 0,
+ "mdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFMSB : BinaryVRRc<"vfmsb", 0xE7E7, any_fmul, v128sb, v128sb, 2, 0>;
- def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8>;
+ def WFMSB : BinaryVRRc<"wfmsb", 0xE7E7, any_fmul, v32sb, v32sb, 2, 8, 0,
+ "meebr">;
def WFMXB : BinaryVRRc<"wfmxb", 0xE7E7, any_fmul, v128xb, v128xb, 4, 8>;
}
}
// Multiply and add.
- let Uses = [FPC], mayRaiseFPException = 1 in {
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFMA : TernaryVRReFloatGeneric<"vfma", 0xE78F>;
def VFMADB : TernaryVRRe<"vfmadb", 0xE78F, any_fma, v128db, v128db, 0, 3>;
- def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3>;
+ def WFMADB : TernaryVRRe<"wfmadb", 0xE78F, any_fma, v64db, v64db, 8, 3,
+ "madbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFMASB : TernaryVRRe<"vfmasb", 0xE78F, any_fma, v128sb, v128sb, 0, 2>;
- def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2>;
+ def WFMASB : TernaryVRRe<"wfmasb", 0xE78F, any_fma, v32sb, v32sb, 8, 2,
+ "maebr">;
def WFMAXB : TernaryVRRe<"wfmaxb", 0xE78F, any_fma, v128xb, v128xb, 8, 4>;
}
}
// Multiply and subtract.
- let Uses = [FPC], mayRaiseFPException = 1 in {
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1 in {
def VFMS : TernaryVRReFloatGeneric<"vfms", 0xE78E>;
def VFMSDB : TernaryVRRe<"vfmsdb", 0xE78E, any_fms, v128db, v128db, 0, 3>;
- def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, any_fms, v64db, v64db, 8, 3>;
+ def WFMSDB : TernaryVRRe<"wfmsdb", 0xE78E, any_fms, v64db, v64db, 8, 3,
+ "msdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFMSSB : TernaryVRRe<"vfmssb", 0xE78E, any_fms, v128sb, v128sb, 0, 2>;
- def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, any_fms, v32sb, v32sb, 8, 2>;
+ def WFMSSB : TernaryVRRe<"wfmssb", 0xE78E, any_fms, v32sb, v32sb, 8, 2,
+ "msebr">;
def WFMSXB : TernaryVRRe<"wfmsxb", 0xE78E, any_fms, v128xb, v128xb, 8, 4>;
}
}
// Negative multiply and add.
- let Uses = [FPC], mayRaiseFPException = 1,
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1,
Predicates = [FeatureVectorEnhancements1] in {
def VFNMA : TernaryVRReFloatGeneric<"vfnma", 0xE79F>;
def VFNMADB : TernaryVRRe<"vfnmadb", 0xE79F, any_fnma, v128db, v128db, 0, 3>;
@@ -1272,7 +1310,7 @@ let Predicates = [FeatureVector] in {
}
// Negative multiply and subtract.
- let Uses = [FPC], mayRaiseFPException = 1,
+ let Uses = [FPC], mayRaiseFPException = 1, isCommutable = 1,
Predicates = [FeatureVectorEnhancements1] in {
def VFNMS : TernaryVRReFloatGeneric<"vfnms", 0xE79E>;
def VFNMSDB : TernaryVRRe<"vfnmsdb", 0xE79E, any_fnms, v128db, v128db, 0, 3>;
@@ -1323,10 +1361,12 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
def VFSQ : UnaryVRRaFloatGeneric<"vfsq", 0xE7CE>;
def VFSQDB : UnaryVRRa<"vfsqdb", 0xE7CE, any_fsqrt, v128db, v128db, 3, 0>;
- def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, any_fsqrt, v64db, v64db, 3, 8>;
+ def WFSQDB : UnaryVRRa<"wfsqdb", 0xE7CE, any_fsqrt, v64db, v64db, 3, 8, 0,
+ "sqdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFSQSB : UnaryVRRa<"vfsqsb", 0xE7CE, any_fsqrt, v128sb, v128sb, 2, 0>;
- def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, any_fsqrt, v32sb, v32sb, 2, 8>;
+ def WFSQSB : UnaryVRRa<"wfsqsb", 0xE7CE, any_fsqrt, v32sb, v32sb, 2, 8, 0,
+ "sqebr">;
def WFSQXB : UnaryVRRa<"wfsqxb", 0xE7CE, any_fsqrt, v128xb, v128xb, 4, 8>;
}
}
@@ -1335,10 +1375,12 @@ let Predicates = [FeatureVector] in {
let Uses = [FPC], mayRaiseFPException = 1 in {
def VFS : BinaryVRRcFloatGeneric<"vfs", 0xE7E2>;
def VFSDB : BinaryVRRc<"vfsdb", 0xE7E2, any_fsub, v128db, v128db, 3, 0>;
- def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8>;
+ def WFSDB : BinaryVRRc<"wfsdb", 0xE7E2, any_fsub, v64db, v64db, 3, 8, 0,
+ "sdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
def VFSSB : BinaryVRRc<"vfssb", 0xE7E2, any_fsub, v128sb, v128sb, 2, 0>;
- def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8>;
+ def WFSSB : BinaryVRRc<"wfssb", 0xE7E2, any_fsub, v32sb, v32sb, 2, 8, 0,
+ "sebr">;
def WFSXB : BinaryVRRc<"wfsxb", 0xE7E2, any_fsub, v128xb, v128xb, 4, 8>;
}
}
@@ -1364,9 +1406,9 @@ let Predicates = [FeatureVector] in {
// Compare scalar.
let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def WFC : CompareVRRaFloatGeneric<"wfc", 0xE7CB>;
- def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_any_fcmp, v64db, 3>;
+ def WFCDB : CompareVRRa<"wfcdb", 0xE7CB, z_any_fcmp, v64db, 3, "cdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_any_fcmp, v32sb, 2>;
+ def WFCSB : CompareVRRa<"wfcsb", 0xE7CB, z_any_fcmp, v32sb, 2, "cebr">;
def WFCXB : CompareVRRa<"wfcxb", 0xE7CB, z_any_fcmp, v128xb, 4>;
}
}
@@ -1374,9 +1416,9 @@ let Predicates = [FeatureVector] in {
// Compare and signal scalar.
let Uses = [FPC], mayRaiseFPException = 1, Defs = [CC] in {
def WFK : CompareVRRaFloatGeneric<"wfk", 0xE7CA>;
- def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, z_strict_fcmps, v64db, 3>;
+ def WFKDB : CompareVRRa<"wfkdb", 0xE7CA, z_strict_fcmps, v64db, 3, "kdbr">;
let Predicates = [FeatureVectorEnhancements1] in {
- def WFKSB : CompareVRRa<"wfksb", 0xE7CA, z_strict_fcmps, v32sb, 2>;
+ def WFKSB : CompareVRRa<"wfksb", 0xE7CA, z_strict_fcmps, v32sb, 2, "kebr">;
def WFKXB : CompareVRRa<"wfkxb", 0xE7CA, z_strict_fcmps, v128xb, 4>;
}
}
@@ -1545,7 +1587,7 @@ def : VectorReplicateScalar<v16i8, VREPB, 7>;
def : VectorReplicateScalar<v8i16, VREPH, 3>;
def : VectorReplicateScalar<v4i32, VREPF, 1>;
-// i64 replications are just a single isntruction.
+// i64 replications are just a single instruction.
def : Pat<(v2i64 (z_replicate GR64:$scalar)),
(VLVGP GR64:$scalar, GR64:$scalar)>;
diff --git a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
index d1f6511ceea3..f755d5cd3d5b 100644
--- a/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZMachineFunctionInfo.h
@@ -29,8 +29,8 @@ class SystemZMachineFunctionInfo : public MachineFunctionInfo {
SystemZ::GPRRegs SpillGPRRegs;
SystemZ::GPRRegs RestoreGPRRegs;
- unsigned VarArgsFirstGPR;
- unsigned VarArgsFirstFPR;
+ Register VarArgsFirstGPR;
+ Register VarArgsFirstFPR;
unsigned VarArgsFrameIndex;
unsigned RegSaveFrameIndex;
int FramePointerSaveIndex;
@@ -47,7 +47,7 @@ public:
// this function and the SP offset for the STMG. These are 0 if no GPRs
// need to be saved or restored.
SystemZ::GPRRegs getSpillGPRRegs() const { return SpillGPRRegs; }
- void setSpillGPRRegs(unsigned Low, unsigned High, unsigned Offs) {
+ void setSpillGPRRegs(Register Low, Register High, unsigned Offs) {
SpillGPRRegs.LowGPR = Low;
SpillGPRRegs.HighGPR = High;
SpillGPRRegs.GPROffset = Offs;
@@ -57,7 +57,7 @@ public:
// this function and the SP offset for the LMG. These are 0 if no GPRs
// need to be saved or restored.
SystemZ::GPRRegs getRestoreGPRRegs() const { return RestoreGPRRegs; }
- void setRestoreGPRRegs(unsigned Low, unsigned High, unsigned Offs) {
+ void setRestoreGPRRegs(Register Low, Register High, unsigned Offs) {
RestoreGPRRegs.LowGPR = Low;
RestoreGPRRegs.HighGPR = High;
RestoreGPRRegs.GPROffset = Offs;
@@ -65,12 +65,12 @@ public:
// Get and set the number of fixed (as opposed to variable) arguments
// that are passed in GPRs to this function.
- unsigned getVarArgsFirstGPR() const { return VarArgsFirstGPR; }
- void setVarArgsFirstGPR(unsigned GPR) { VarArgsFirstGPR = GPR; }
+ Register getVarArgsFirstGPR() const { return VarArgsFirstGPR; }
+ void setVarArgsFirstGPR(Register GPR) { VarArgsFirstGPR = GPR; }
// Likewise FPRs.
- unsigned getVarArgsFirstFPR() const { return VarArgsFirstFPR; }
- void setVarArgsFirstFPR(unsigned FPR) { VarArgsFirstFPR = FPR; }
+ Register getVarArgsFirstFPR() const { return VarArgsFirstFPR; }
+ void setVarArgsFirstFPR(Register FPR) { VarArgsFirstFPR = FPR; }
// Get and set the frame index of the first stack vararg.
unsigned getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
diff --git a/llvm/lib/Target/SystemZ/SystemZOperands.td b/llvm/lib/Target/SystemZ/SystemZOperands.td
index bd40f6d7bf40..a883daad73e7 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperands.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperands.td
@@ -22,8 +22,8 @@ class ImmediateTLSAsmOperand<string name>
}
class ImmediateOp<ValueType vt, string asmop> : Operand<vt> {
- let PrintMethod = "print"##asmop##"Operand";
- let DecoderMethod = "decode"##asmop##"Operand";
+ let PrintMethod = "print"#asmop#"Operand";
+ let DecoderMethod = "decode"#asmop#"Operand";
let ParserMatchClass = !cast<AsmOperandClass>(asmop);
let OperandType = "OPERAND_IMMEDIATE";
}
@@ -52,14 +52,14 @@ multiclass Immediate<ValueType vt, code pred, SDNodeXForm xform, string asmop> {
// Constructs an asm operand for a PC-relative address. SIZE says how
// many bits there are.
-class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"##size> {
+class PCRelAsmOperand<string size> : ImmediateAsmOperand<"PCRel"#size> {
let PredicateMethod = "isImm";
- let ParserMethod = "parsePCRel"##size;
+ let ParserMethod = "parsePCRel"#size;
}
class PCRelTLSAsmOperand<string size>
- : ImmediateTLSAsmOperand<"PCRelTLS"##size> {
+ : ImmediateTLSAsmOperand<"PCRelTLS"#size> {
let PredicateMethod = "isImmTLS";
- let ParserMethod = "parsePCRelTLS"##size;
+ let ParserMethod = "parsePCRelTLS"#size;
}
// Constructs an operand for a PC-relative address with address type VT.
@@ -92,9 +92,9 @@ class PCRelAddress<ValueType vt, string self, AsmOperandClass asmop>
class AddressAsmOperand<string format, string bitsize, string dispsize,
string length = "">
: AsmOperandClass {
- let Name = format##bitsize##"Disp"##dispsize##length;
- let ParserMethod = "parse"##format##bitsize;
- let RenderMethod = "add"##format##"Operands";
+ let Name = format#bitsize#"Disp"#dispsize#length;
+ let ParserMethod = "parse"#format#bitsize;
+ let RenderMethod = "add"#format#"Operands";
}
// Constructs an instruction operand for an addressing mode. FORMAT,
@@ -103,15 +103,15 @@ class AddressAsmOperand<string format, string bitsize, string dispsize,
// (base register, displacement, etc.).
class AddressOperand<string bitsize, string dispsize, string length,
string format, dag operands>
- : Operand<!cast<ValueType>("i"##bitsize)> {
- let PrintMethod = "print"##format##"Operand";
- let EncoderMethod = "get"##format##dispsize##length##"Encoding";
+ : Operand<!cast<ValueType>("i"#bitsize)> {
+ let PrintMethod = "print"#format#"Operand";
+ let EncoderMethod = "get"#format#dispsize#length#"Encoding";
let DecoderMethod =
- "decode"##format##bitsize##"Disp"##dispsize##length##"Operand";
+ "decode"#format#bitsize#"Disp"#dispsize#length#"Operand";
let OperandType = "OPERAND_MEMORY";
let MIOperandInfo = operands;
let ParserMatchClass =
- !cast<AddressAsmOperand>(format##bitsize##"Disp"##dispsize##length);
+ !cast<AddressAsmOperand>(format#bitsize#"Disp"#dispsize#length);
}
// Constructs both a DAG pattern and instruction operand for an addressing mode.
@@ -126,45 +126,45 @@ class AddressOperand<string bitsize, string dispsize, string length,
class AddressingMode<string seltype, string bitsize, string dispsize,
string suffix, string length, int numops, string format,
dag operands>
- : ComplexPattern<!cast<ValueType>("i"##bitsize), numops,
- "select"##seltype##dispsize##suffix##length,
+ : ComplexPattern<!cast<ValueType>("i"#bitsize), numops,
+ "select"#seltype#dispsize#suffix#length,
[add, sub, or, frameindex, z_adjdynalloc]>,
AddressOperand<bitsize, dispsize, length, format, operands>;
// An addressing mode with a base and displacement but no index.
class BDMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 2, "BDAddr",
- (ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Operand>("disp"##dispsize##"imm"##bitsize))>;
+ (ops !cast<RegisterOperand>("ADDR"#bitsize),
+ !cast<Operand>("disp"#dispsize#"imm"#bitsize))>;
// An addressing mode with a base, displacement and index.
class BDXMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDXAddr",
- (ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Operand>("disp"##dispsize##"imm"##bitsize),
- !cast<RegisterOperand>("ADDR"##bitsize))>;
+ (ops !cast<RegisterOperand>("ADDR"#bitsize),
+ !cast<Operand>("disp"#dispsize#"imm"#bitsize),
+ !cast<RegisterOperand>("ADDR"#bitsize))>;
// A BDMode paired with an immediate length operand of LENSIZE bits.
class BDLMode<string type, string bitsize, string dispsize, string suffix,
string lensize>
- : AddressingMode<type, bitsize, dispsize, suffix, "Len"##lensize, 3,
+ : AddressingMode<type, bitsize, dispsize, suffix, "Len"#lensize, 3,
"BDLAddr",
- (ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Operand>("disp"##dispsize##"imm"##bitsize),
- !cast<Operand>("imm"##bitsize))>;
+ (ops !cast<RegisterOperand>("ADDR"#bitsize),
+ !cast<Operand>("disp"#dispsize#"imm"#bitsize),
+ !cast<Operand>("imm"#bitsize))>;
// A BDMode paired with a register length operand.
class BDRMode<string type, string bitsize, string dispsize, string suffix>
: AddressingMode<type, bitsize, dispsize, suffix, "", 3, "BDRAddr",
- (ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Operand>("disp"##dispsize##"imm"##bitsize),
- !cast<RegisterOperand>("GR"##bitsize))>;
+ (ops !cast<RegisterOperand>("ADDR"#bitsize),
+ !cast<Operand>("disp"#dispsize#"imm"#bitsize),
+ !cast<RegisterOperand>("GR"#bitsize))>;
// An addressing mode with a base, displacement and a vector index.
class BDVMode<string bitsize, string dispsize>
: AddressOperand<bitsize, dispsize, "", "BDVAddr",
- (ops !cast<RegisterOperand>("ADDR"##bitsize),
- !cast<Operand>("disp"##dispsize##"imm"##bitsize),
+ (ops !cast<RegisterOperand>("ADDR"#bitsize),
+ !cast<Operand>("disp"#dispsize#"imm"#bitsize),
!cast<RegisterOperand>("VR128"))>;
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/SystemZ/SystemZOperators.td b/llvm/lib/Target/SystemZ/SystemZOperators.td
index a6a72903e573..81af5fd854db 100644
--- a/llvm/lib/Target/SystemZ/SystemZOperators.td
+++ b/llvm/lib/Target/SystemZ/SystemZOperators.td
@@ -40,6 +40,10 @@ def SDT_ZWrapOffset : SDTypeProfile<1, 2,
SDTCisSameAs<0, 2>,
SDTCisPtrTy<0>]>;
def SDT_ZAdjDynAlloc : SDTypeProfile<1, 0, [SDTCisVT<0, i64>]>;
+def SDT_ZProbedAlloca : SDTypeProfile<1, 2,
+ [SDTCisSameAs<0, 1>,
+ SDTCisSameAs<0, 2>,
+ SDTCisPtrTy<0>]>;
def SDT_ZGR128Binary : SDTypeProfile<1, 2,
[SDTCisVT<0, untyped>,
SDTCisInt<1>,
@@ -269,6 +273,8 @@ def z_select_ccmask_1 : SDNode<"SystemZISD::SELECT_CCMASK",
SDT_ZSelectCCMask>;
def z_ipm_1 : SDNode<"SystemZISD::IPM", SDT_ZIPM>;
def z_adjdynalloc : SDNode<"SystemZISD::ADJDYNALLOC", SDT_ZAdjDynAlloc>;
+def z_probed_alloca : SDNode<"SystemZISD::PROBED_ALLOCA", SDT_ZProbedAlloca,
+ [SDNPHasChain]>;
def z_popcnt : SDNode<"SystemZISD::POPCNT", SDTIntUnaryOp>;
def z_smul_lohi : SDNode<"SystemZISD::SMUL_LOHI", SDT_ZGR128Binary>;
def z_umul_lohi : SDNode<"SystemZISD::UMUL_LOHI", SDT_ZGR128Binary>;
@@ -374,7 +380,7 @@ def z_vstrsz_cc : SDNode<"SystemZISD::VSTRSZ_CC",
def z_vftci : SDNode<"SystemZISD::VFTCI", SDT_ZVecBinaryConvIntCC>;
class AtomicWOp<string name, SDTypeProfile profile = SDT_ZAtomicLoadBinaryW>
- : SDNode<"SystemZISD::"##name, profile,
+ : SDNode<"SystemZISD::"#name, profile,
[SDNPHasChain, SDNPMayStore, SDNPMayLoad, SDNPMemOperand]>;
def z_atomic_swapw : AtomicWOp<"ATOMIC_SWAPW">;
diff --git a/llvm/lib/Target/SystemZ/SystemZPatterns.td b/llvm/lib/Target/SystemZ/SystemZPatterns.td
index 501a69488397..e3190eddb9f1 100644
--- a/llvm/lib/Target/SystemZ/SystemZPatterns.td
+++ b/llvm/lib/Target/SystemZ/SystemZPatterns.td
@@ -57,10 +57,10 @@ multiclass RMWIByte<SDPatternOperator operator, AddressingMode mode,
// The inserted operand is loaded using LOAD from an address of mode MODE.
multiclass InsertMem<string type, Instruction insn, RegisterOperand cls,
SDPatternOperator load, AddressingMode mode> {
- def : Pat<(!cast<SDPatternOperator>("or_as_"##type)
+ def : Pat<(!cast<SDPatternOperator>("or_as_"#type)
cls:$src1, (load mode:$src2)),
(insn cls:$src1, mode:$src2)>;
- def : Pat<(!cast<SDPatternOperator>("or_as_rev"##type)
+ def : Pat<(!cast<SDPatternOperator>("or_as_rev"#type)
(load mode:$src2), cls:$src1),
(insn cls:$src1, mode:$src2)>;
}
@@ -167,7 +167,7 @@ class FPConversion<Instruction insn, SDPatternOperator operator, TypedReg tr1,
: Pat<(tr1.vt (operator (tr2.vt tr2.op:$vec))),
(insn tr2.op:$vec, suppress, mode)>;
-// Use INSN to perform mininum/maximum operation OPERATOR on type TR.
+// Use INSN to perform minimum/maximum operation OPERATOR on type TR.
// FUNCTION is the type of minimum/maximum function to perform.
class FPMinMax<Instruction insn, SDPatternOperator operator, TypedReg tr,
bits<4> function>
diff --git a/llvm/lib/Target/SystemZ/SystemZProcessors.td b/llvm/lib/Target/SystemZ/SystemZProcessors.td
index af33a0300552..57c2411b8dcf 100644
--- a/llvm/lib/Target/SystemZ/SystemZProcessors.td
+++ b/llvm/lib/Target/SystemZ/SystemZProcessors.td
@@ -9,7 +9,7 @@
// Processor definitions.
//
// For compatibility with other compilers on the platform, each model can
-// be identifed either by the system name (e.g. z10) or the level of the
+// be identified either by the system name (e.g. z10) or the level of the
// architecture the model supports, as identified by the edition level
// of the z/Architecture Principles of Operation document (e.g. arch8).
//
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
index 0d5e7af92523..fe2aaca8429a 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.cpp
@@ -73,13 +73,10 @@ static void addHints(ArrayRef<MCPhysReg> Order,
Hints.push_back(Reg);
}
-bool
-SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
- SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
- const LiveRegMatrix *Matrix) const {
+bool SystemZRegisterInfo::getRegAllocationHints(
+ Register VirtReg, ArrayRef<MCPhysReg> Order,
+ SmallVectorImpl<MCPhysReg> &Hints, const MachineFunction &MF,
+ const VirtRegMap *VRM, const LiveRegMatrix *Matrix) const {
const MachineRegisterInfo *MRI = &MF.getRegInfo();
const SystemZSubtarget &Subtarget = MF.getSubtarget<SystemZSubtarget>();
const TargetRegisterInfo *TRI = Subtarget.getRegisterInfo();
@@ -134,11 +131,11 @@ SystemZRegisterInfo::getRegAllocationHints(unsigned VirtReg,
}
if (MRI->getRegClass(VirtReg) == &SystemZ::GRX32BitRegClass) {
- SmallVector<unsigned, 8> Worklist;
- SmallSet<unsigned, 4> DoneRegs;
+ SmallVector<Register, 8> Worklist;
+ SmallSet<Register, 4> DoneRegs;
Worklist.push_back(VirtReg);
while (Worklist.size()) {
- unsigned Reg = Worklist.pop_back_val();
+ Register Reg = Worklist.pop_back_val();
if (!DoneRegs.insert(Reg).second)
continue;
@@ -267,14 +264,14 @@ SystemZRegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator MI,
// Decompose the frame index into a base and offset.
int FrameIndex = MI->getOperand(FIOperandNum).getIndex();
- unsigned BasePtr;
+ Register BasePtr;
int64_t Offset = (TFI->getFrameIndexReference(MF, FrameIndex, BasePtr) +
MI->getOperand(FIOperandNum + 1).getImm());
// Special handling of dbg_value instructions.
if (MI->isDebugValue()) {
MI->getOperand(FIOperandNum).ChangeToRegister(BasePtr, /*isDef*/ false);
- MI->getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ MI->getDebugOffset().ChangeToImmediate(Offset);
return;
}
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
index 7044efef1ac6..9f2cca0c83f6 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.h
@@ -58,11 +58,9 @@ public:
const TargetRegisterClass *
getCrossCopyRegClass(const TargetRegisterClass *RC) const override;
- bool getRegAllocationHints(unsigned VirtReg,
- ArrayRef<MCPhysReg> Order,
+ bool getRegAllocationHints(Register VirtReg, ArrayRef<MCPhysReg> Order,
SmallVectorImpl<MCPhysReg> &Hints,
- const MachineFunction &MF,
- const VirtRegMap *VRM,
+ const MachineFunction &MF, const VirtRegMap *VRM,
const LiveRegMatrix *Matrix) const override;
// Override TargetRegisterInfo.h.
@@ -72,9 +70,6 @@ public:
bool requiresFrameIndexScavenging(const MachineFunction &MF) const override {
return true;
}
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override {
- return true;
- }
const MCPhysReg *getCalleeSavedRegs(const MachineFunction *MF) const override;
const uint32_t *getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const override;
diff --git a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
index 3567b0f3acf8..a85862e62749 100644
--- a/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
+++ b/llvm/lib/Target/SystemZ/SystemZRegisterInfo.td
@@ -35,15 +35,15 @@ multiclass SystemZRegClass<string name, list<ValueType> types, int size,
dag regList, bit allocatable = 1> {
def AsmOperand : AsmOperandClass {
let Name = name;
- let ParserMethod = "parse"##name;
+ let ParserMethod = "parse"#name;
let RenderMethod = "addRegOperands";
}
let isAllocatable = allocatable in
def Bit : RegisterClass<"SystemZ", types, size, regList> {
let Size = size;
}
- def "" : RegisterOperand<!cast<RegisterClass>(name##"Bit")> {
- let ParserMatchClass = !cast<AsmOperandClass>(name##"AsmOperand");
+ def "" : RegisterOperand<!cast<RegisterClass>(name#"Bit")> {
+ let ParserMatchClass = !cast<AsmOperandClass>(name#"AsmOperand");
}
}
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
index 47c925dcf730..6b4f35e5ba2b 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.cpp
@@ -47,7 +47,7 @@ static SDValue emitMemMem(SelectionDAG &DAG, const SDLoc &DL, unsigned Sequence,
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
if (IsVolatile)
return SDValue();
@@ -74,7 +74,7 @@ static SDValue memsetStore(SelectionDAG &DAG, const SDLoc &DL, SDValue Chain,
SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst,
- SDValue Byte, SDValue Size, unsigned Align, bool IsVolatile,
+ SDValue Byte, SDValue Size, Align Alignment, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const {
EVT PtrVT = Dst.getValueType();
@@ -97,20 +97,22 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
unsigned Size1 = Bytes == 16 ? 8 : 1 << findLastSet(Bytes);
unsigned Size2 = Bytes - Size1;
SDValue Chain1 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size1,
- Align, DstPtrInfo);
+ Alignment.value(), DstPtrInfo);
if (Size2 == 0)
return Chain1;
Dst = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(Size1, DL, PtrVT));
DstPtrInfo = DstPtrInfo.getWithOffset(Size1);
- SDValue Chain2 = memsetStore(DAG, DL, Chain, Dst, ByteVal, Size2,
- std::min(Align, Size1), DstPtrInfo);
+ SDValue Chain2 = memsetStore(
+ DAG, DL, Chain, Dst, ByteVal, Size2,
+ std::min((unsigned)Alignment.value(), Size1), DstPtrInfo);
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Chain1, Chain2);
}
} else {
// Handle one and two bytes using STC.
if (Bytes <= 2) {
- SDValue Chain1 = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
+ SDValue Chain1 =
+ DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
if (Bytes == 1)
return Chain1;
SDValue Dst2 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
@@ -131,7 +133,7 @@ SDValue SystemZSelectionDAGInfo::EmitTargetCodeForMemset(
// Copy the byte to the first location and then use MVC to copy
// it to the rest.
- Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Align);
+ Chain = DAG.getStore(Chain, DL, Byte, Dst, DstPtrInfo, Alignment);
SDValue DstPlus1 = DAG.getNode(ISD::ADD, DL, PtrVT, Dst,
DAG.getConstant(1, DL, PtrVT));
return emitMemMem(DAG, DL, SystemZISD::MVC, SystemZISD::MVC_LOOP,
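The memset hunk above splits a constant-size memset into two stores: Size1 is the largest power of two not exceeding Bytes (capped at 8 when Bytes is 16), Size2 is the remainder, and the second store's alignment is clamped to min(Alignment, Size1). A minimal standalone sketch of just that arithmetic, not using LLVM types; the local findLastSet below stands in for llvm::findLastSet:

#include <algorithm>
#include <cstdint>
#include <cstdio>

// Index of the highest set bit, e.g. findLastSet(12) == 3. Assumes V != 0.
static unsigned findLastSet(uint64_t V) {
  unsigned Idx = 0;
  while (V >>= 1)
    ++Idx;
  return Idx;
}

int main() {
  for (uint64_t Bytes : {3, 11, 16}) {
    unsigned Alignment = 8;
    // Mirror the split in EmitTargetCodeForMemset: a power-of-two sized
    // store first, then the remainder at a possibly reduced alignment.
    unsigned Size1 = Bytes == 16 ? 8 : 1u << findLastSet(Bytes);
    unsigned Size2 = Bytes - Size1;
    unsigned Align2 = std::min(Alignment, Size1);
    std::printf("Bytes=%llu -> Size1=%u Size2=%u Align2=%u\n",
                (unsigned long long)Bytes, Size1, Size2, Align2);
  }
}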
diff --git a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
index 7d63bae83cf3..a4a5b1fbdf90 100644
--- a/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZSelectionDAGInfo.h
@@ -25,14 +25,15 @@ public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &DL,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool IsVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool IsVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
SDValue Chain, SDValue Dst, SDValue Byte,
- SDValue Size, unsigned Align, bool IsVolatile,
+ SDValue Size, Align Alignment,
+ bool IsVolatile,
MachinePointerInfo DstPtrInfo) const override;
std::pair<SDValue, SDValue>
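The two SelectionDAGInfo files above replace the raw unsigned Align parameters with the llvm::Align value type, calling .value() only where a plain integer is still needed. A self-contained sketch of the idea with a simplified stand-in wrapper (the real llvm::Align lives in llvm/Support/Alignment.h and has a richer interface):

#include <cassert>
#include <cstdint>
#include <iostream>

// Simplified stand-in for llvm::Align: stores a power-of-two byte alignment
// and makes "alignment of 0" unrepresentable by construction.
class AlignSketch {
  uint64_t Value;
public:
  explicit AlignSketch(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
  uint64_t value() const { return Value; }
};

// An interface that used to take `unsigned Align` can take the wrapper
// directly; callers that still need an integer call value() explicitly.
static void emitStore(unsigned Bytes, AlignSketch Alignment) {
  std::cout << "store of " << Bytes << " bytes, align "
            << Alignment.value() << "\n";
}

int main() {
  emitStore(8, AlignSketch(4));   // fine
  // emitStore(8, AlignSketch(3)); // would assert: not a power of two
}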
diff --git a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
index f6184cec795a..3d27b70d6ef9 100644
--- a/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZShortenInst.cpp
@@ -46,6 +46,7 @@ private:
bool shortenOn001(MachineInstr &MI, unsigned Opcode);
bool shortenOn001AddCC(MachineInstr &MI, unsigned Opcode);
bool shortenFPConv(MachineInstr &MI, unsigned Opcode);
+ bool shortenFusedFPOp(MachineInstr &MI, unsigned Opcode);
const SystemZInstrInfo *TII;
const TargetRegisterInfo *TRI;
@@ -64,7 +65,7 @@ SystemZShortenInst::SystemZShortenInst(const SystemZTargetMachine &tm)
// Tie operands if MI has become a two-address instruction.
static void tieOpsIfNeeded(MachineInstr &MI) {
- if (MI.getDesc().getOperandConstraint(0, MCOI::TIED_TO) &&
+ if (MI.getDesc().getOperandConstraint(1, MCOI::TIED_TO) == 0 &&
!MI.getOperand(0).isTied())
MI.tieOperands(0, 1);
}
@@ -175,6 +176,32 @@ bool SystemZShortenInst::shortenFPConv(MachineInstr &MI, unsigned Opcode) {
return false;
}
+bool SystemZShortenInst::shortenFusedFPOp(MachineInstr &MI, unsigned Opcode) {
+ MachineOperand &DstMO = MI.getOperand(0);
+ MachineOperand &LHSMO = MI.getOperand(1);
+ MachineOperand &RHSMO = MI.getOperand(2);
+ MachineOperand &AccMO = MI.getOperand(3);
+ if (SystemZMC::getFirstReg(DstMO.getReg()) < 16 &&
+ SystemZMC::getFirstReg(LHSMO.getReg()) < 16 &&
+ SystemZMC::getFirstReg(RHSMO.getReg()) < 16 &&
+ SystemZMC::getFirstReg(AccMO.getReg()) < 16 &&
+ DstMO.getReg() == AccMO.getReg()) {
+ MachineOperand Lhs(LHSMO);
+ MachineOperand Rhs(RHSMO);
+ MachineOperand Src(AccMO);
+ MI.RemoveOperand(3);
+ MI.RemoveOperand(2);
+ MI.RemoveOperand(1);
+ MI.setDesc(TII->get(Opcode));
+ MachineInstrBuilder(*MI.getParent()->getParent(), &MI)
+ .add(Src)
+ .add(Lhs)
+ .add(Rhs);
+ return true;
+ }
+ return false;
+}
+
// Process all instructions in MBB. Return true if something changed.
bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
bool Changed = false;
@@ -235,6 +262,22 @@ bool SystemZShortenInst::processBlock(MachineBasicBlock &MBB) {
Changed |= shortenOn001(MI, SystemZ::MEEBR);
break;
+ case SystemZ::WFMADB:
+ Changed |= shortenFusedFPOp(MI, SystemZ::MADBR);
+ break;
+
+ case SystemZ::WFMASB:
+ Changed |= shortenFusedFPOp(MI, SystemZ::MAEBR);
+ break;
+
+ case SystemZ::WFMSDB:
+ Changed |= shortenFusedFPOp(MI, SystemZ::MSDBR);
+ break;
+
+ case SystemZ::WFMSSB:
+ Changed |= shortenFusedFPOp(MI, SystemZ::MSEBR);
+ break;
+
case SystemZ::WFLCDB:
Changed |= shortenOn01(MI, SystemZ::LCDFR);
break;
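shortenFusedFPOp above rewrites a vector-register fused multiply-add/subtract into the shorter form only when every operand maps to one of the first 16 FP registers and the destination already equals the accumulator, since the non-vector MADBR/MSDBR encodings can only address FP0-FP15 and reuse the accumulator as the result. A rough standalone sketch of just that guard, with hypothetical plain-integer "registers" instead of MachineOperands:

#include <cstdio>

// Hypothetical stand-ins: a "register" is just its FP register index here.
struct FusedOp {
  unsigned Dst, Lhs, Rhs, Acc;
};

// Mirror the guard in shortenFusedFPOp: all operands must live in the first
// 16 FP registers and the destination must already be the accumulator.
static bool canShorten(const FusedOp &Op) {
  return Op.Dst < 16 && Op.Lhs < 16 && Op.Rhs < 16 && Op.Acc < 16 &&
         Op.Dst == Op.Acc;
}

int main() {
  std::printf("%d\n", canShorten({1, 2, 3, 1}));   // 1: low regs, Dst == Acc
  std::printf("%d\n", canShorten({1, 2, 3, 4}));   // 0: Dst != Acc
  std::printf("%d\n", canShorten({1, 20, 3, 1}));  // 0: operand above FP15
}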
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
index 5e8af81842c4..68e0b7ae66a4 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.cpp
@@ -9,6 +9,7 @@
#include "SystemZSubtarget.h"
#include "MCTargetDesc/SystemZMCTargetDesc.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -28,11 +29,16 @@ void SystemZSubtarget::anchor() {}
SystemZSubtarget &
SystemZSubtarget::initializeSubtargetDependencies(StringRef CPU, StringRef FS) {
- std::string CPUName = CPU;
+ StringRef CPUName = CPU;
if (CPUName.empty())
CPUName = "generic";
// Parse features string.
ParseSubtargetFeatures(CPUName, FS);
+
+ // -msoft-float implies -mno-vx.
+ if (HasSoftFloat)
+ HasVector = false;
+
return *this;
}
@@ -57,7 +63,7 @@ SystemZSubtarget::SystemZSubtarget(const Triple &TT, const std::string &CPU,
HasInsertReferenceBitsMultiple(false),
HasMiscellaneousExtensions3(false), HasMessageSecurityAssist9(false),
HasVectorEnhancements2(false), HasVectorPackedDecimalEnhancement(false),
- HasEnhancedSort(false), HasDeflateConversion(false),
+ HasEnhancedSort(false), HasDeflateConversion(false), HasSoftFloat(false),
TargetTriple(TT), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
TLInfo(TM, *this), TSInfo(), FrameLowering() {}
@@ -68,9 +74,12 @@ bool SystemZSubtarget::enableSubRegLiveness() const {
bool SystemZSubtarget::isPC32DBLSymbol(const GlobalValue *GV,
CodeModel::Model CM) const {
- // PC32DBL accesses require the low bit to be clear. Note that a zero
- // value selects the default alignment and is therefore OK.
- if (GV->getAlignment() == 1)
+ // PC32DBL accesses require the low bit to be clear.
+ //
+ // FIXME: Explicitly check for functions: the datalayout is currently
+ // missing information about function pointers.
+ const DataLayout &DL = GV->getParent()->getDataLayout();
+ if (GV->getPointerAlignment(DL) == 1 && !GV->getValueType()->isFunctionTy())
return false;
// For the small model, all locally-binding symbols are in range.
diff --git a/llvm/lib/Target/SystemZ/SystemZSubtarget.h b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
index fa3f65d93c91..4b49c37fe4e6 100644
--- a/llvm/lib/Target/SystemZ/SystemZSubtarget.h
+++ b/llvm/lib/Target/SystemZ/SystemZSubtarget.h
@@ -68,6 +68,7 @@ protected:
bool HasVectorPackedDecimalEnhancement;
bool HasEnhancedSort;
bool HasDeflateConversion;
+ bool HasSoftFloat;
private:
Triple TargetTriple;
@@ -239,6 +240,9 @@ public:
// Return true if the target has the deflate-conversion facility.
bool hasDeflateConversion() const { return HasDeflateConversion; }
+ // Return true if soft float should be used.
+ bool hasSoftFloat() const { return HasSoftFloat; }
+
// Return true if GV can be accessed using LARL for reloc model RM
// and code model CM.
bool isPC32DBLSymbol(const GlobalValue *GV, CodeModel::Model CM) const;
diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp
index f103812eb096..7cb7dca2ea28 100644
--- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp
@@ -44,7 +44,9 @@
//===----------------------------------------------------------------------===//
#include "SystemZ.h"
+#include "SystemZSubtarget.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -53,6 +55,7 @@
#include "llvm/IR/IntrinsicsS390.h"
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
+#include "llvm/Target/TargetMachine.h"
#include <deque>
#include <set>
@@ -72,6 +75,11 @@ public:
}
bool runOnFunction(Function &F) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<TargetPassConfig>();
+ }
+
private:
// Maps seen instructions that can be mapped to a TDC, values are
// (TDC operand, TDC mask, worthy flag) triples.
@@ -310,6 +318,12 @@ void SystemZTDCPass::convertLogicOp(BinaryOperator &I) {
}
bool SystemZTDCPass::runOnFunction(Function &F) {
+ auto &TPC = getAnalysis<TargetPassConfig>();
+ if (TPC.getTM<TargetMachine>()
+ .getSubtarget<SystemZSubtarget>(F)
+ .hasSoftFloat())
+ return false;
+
ConvertedInsts.clear();
LogicOpsWorklist.clear();
PossibleJunk.clear();
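The TDC pass above now declares a dependency on TargetPassConfig so it can query the per-function SystemZSubtarget and bail out immediately under soft-float. A minimal sketch of that "check the subtarget, skip the pass" shape, using hypothetical simplified types rather than the real pass infrastructure:

#include <iostream>

// Hypothetical stand-in for the subtarget query used in runOnFunction.
struct SubtargetSketch {
  bool SoftFloat;
  bool hasSoftFloat() const { return SoftFloat; }
};

struct FunctionSketch {
  const char *Name;
  SubtargetSketch ST; // in LLVM this comes from TM.getSubtarget<...>(F)
};

// The pass body only runs when FP hardware is actually in use.
static bool runTDCOnFunction(const FunctionSketch &F) {
  if (F.ST.hasSoftFloat())
    return false; // nothing to do: no FP compares to turn into TDC
  std::cout << "scanning " << F.Name << " for TDC candidates\n";
  return true;    // pretend the pass changed something
}

int main() {
  runTDCOnFunction({"hard_fp_fn", {false}});
  runTDCOnFunction({"soft_fp_fn", {true}});
}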
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
index dfcdb5356485..3f467b200852 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp
@@ -40,8 +40,10 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) {
// This is the case by default if CPU is z13 or later, and can be
// overridden via "[+-]vector" feature string elements.
bool VectorABI = true;
+ bool SoftFloat = false;
if (CPU.empty() || CPU == "generic" ||
- CPU == "z10" || CPU == "z196" || CPU == "zEC12")
+ CPU == "z10" || CPU == "z196" || CPU == "zEC12" ||
+ CPU == "arch8" || CPU == "arch9" || CPU == "arch10")
VectorABI = false;
SmallVector<StringRef, 3> Features;
@@ -51,9 +53,13 @@ static bool UsesVectorABI(StringRef CPU, StringRef FS) {
VectorABI = true;
if (Feature == "-vector")
VectorABI = false;
+ if (Feature == "soft-float" || Feature == "+soft-float")
+ SoftFloat = true;
+ if (Feature == "-soft-float")
+ SoftFloat = false;
}
- return VectorABI;
+ return VectorABI && !SoftFloat;
}
static std::string computeDataLayout(const Triple &TT, StringRef CPU,
@@ -154,13 +160,46 @@ SystemZTargetMachine::SystemZTargetMachine(const Target &T, const Triple &TT,
getEffectiveRelocModel(RM),
getEffectiveSystemZCodeModel(CM, getEffectiveRelocModel(RM), JIT),
OL),
- TLOF(std::make_unique<TargetLoweringObjectFileELF>()),
- Subtarget(TT, CPU, FS, *this) {
+ TLOF(std::make_unique<TargetLoweringObjectFileELF>()) {
initAsmInfo();
}
SystemZTargetMachine::~SystemZTargetMachine() = default;
+const SystemZSubtarget *
+SystemZTargetMachine::getSubtargetImpl(const Function &F) const {
+ Attribute CPUAttr = F.getFnAttribute("target-cpu");
+ Attribute FSAttr = F.getFnAttribute("target-features");
+
+ std::string CPU = !CPUAttr.hasAttribute(Attribute::None)
+ ? CPUAttr.getValueAsString().str()
+ : TargetCPU;
+ std::string FS = !FSAttr.hasAttribute(Attribute::None)
+ ? FSAttr.getValueAsString().str()
+ : TargetFS;
+
+ // FIXME: This is related to the code below that resets the target options;
+ // we need to know whether or not the soft-float flag is set on the
+ // function so that we can enable it as a subtarget feature.
+ bool softFloat =
+ F.hasFnAttribute("use-soft-float") &&
+ F.getFnAttribute("use-soft-float").getValueAsString() == "true";
+
+ if (softFloat)
+ FS += FS.empty() ? "+soft-float" : ",+soft-float";
+
+ auto &I = SubtargetMap[CPU + FS];
+ if (!I) {
+ // This needs to be done before we create a new subtarget since any
+ // creation will depend on the TM and the code generation flags on the
+ // function that reside in TargetOptions.
+ resetTargetOptions(F);
+ I = std::make_unique<SystemZSubtarget>(TargetTriple, CPU, FS, *this);
+ }
+
+ return I.get();
+}
+
namespace {
/// SystemZ Code Generator Pass Configuration Options.
@@ -183,6 +222,7 @@ public:
void addIRPasses() override;
bool addInstSelector() override;
bool addILPOpts() override;
+ void addPreRegAlloc() override;
void addPostRewrite() override;
void addPostRegAlloc() override;
void addPreSched2() override;
@@ -214,6 +254,10 @@ bool SystemZPassConfig::addILPOpts() {
return true;
}
+void SystemZPassConfig::addPreRegAlloc() {
+ addPass(createSystemZCopyPhysRegsPass(getSystemZTargetMachine()));
+}
+
void SystemZPassConfig::addPostRewrite() {
addPass(createSystemZPostRewritePass(getSystemZTargetMachine()));
}
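The new getSubtargetImpl above caches one SystemZSubtarget per distinct CPU+features string and appends "+soft-float" to the key when the function carries use-soft-float="true". A standalone sketch of that memoization pattern, with std::map and plain strings standing in for StringMap and the real subtarget:

#include <iostream>
#include <map>
#include <memory>
#include <string>

// Hypothetical stand-in for the per-function-configured subtarget object.
struct SubtargetSketch {
  std::string CPU, Features;
};

class TargetMachineSketch {
  // Keyed by CPU+FS so functions with identical attributes share one object.
  mutable std::map<std::string, std::unique_ptr<SubtargetSketch>> SubtargetMap;

public:
  const SubtargetSketch &getSubtarget(std::string CPU, std::string FS,
                                      bool UseSoftFloat) const {
    if (UseSoftFloat)
      FS += FS.empty() ? "+soft-float" : ",+soft-float";
    auto &Entry = SubtargetMap[CPU + FS];
    if (!Entry) {
      std::cout << "creating subtarget for key '" << CPU + FS << "'\n";
      Entry = std::make_unique<SubtargetSketch>(SubtargetSketch{CPU, FS});
    }
    return *Entry;
  }
};

int main() {
  TargetMachineSketch TM;
  TM.getSubtarget("z14", "+vector", false); // created
  TM.getSubtarget("z14", "+vector", false); // cache hit, no message
  TM.getSubtarget("z14", "+vector", true);  // new key ending in ,+soft-float
}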
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.h b/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
index ac04a080f580..9ea03e104fc9 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.h
@@ -26,7 +26,8 @@ namespace llvm {
class SystemZTargetMachine : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
- SystemZSubtarget Subtarget;
+
+ mutable StringMap<std::unique_ptr<SystemZSubtarget>> SubtargetMap;
public:
SystemZTargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -35,11 +36,11 @@ public:
CodeGenOpt::Level OL, bool JIT);
~SystemZTargetMachine() override;
- const SystemZSubtarget *getSubtargetImpl() const { return &Subtarget; }
-
- const SystemZSubtarget *getSubtargetImpl(const Function &) const override {
- return &Subtarget;
- }
+ const SystemZSubtarget *getSubtargetImpl(const Function &) const override;
+ // DO NOT IMPLEMENT: There is no such thing as a valid default subtarget;
+ // subtargets are per-function entities based on the target-specific
+ // attributes of each function.
+ const SystemZSubtarget *getSubtargetImpl() const = delete;
// Override LLVMTargetMachine
TargetPassConfig *createPassConfig(PassManagerBase &PM) override;
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
index acec3c533585..864200e5f71c 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.cpp
@@ -30,7 +30,8 @@ using namespace llvm;
//
//===----------------------------------------------------------------------===//
-int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -63,7 +64,8 @@ int SystemZTTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -177,11 +179,12 @@ int SystemZTTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx,
break;
}
- return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+ return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -226,7 +229,7 @@ int SystemZTTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return SystemZTTIImpl::getIntImmCost(Imm, Ty);
+ return SystemZTTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
TargetTransformInfo::PopcntSupportKind
@@ -246,8 +249,7 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
for (auto &BB : L->blocks())
for (auto &I : *BB) {
if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
- ImmutableCallSite CS(&I);
- if (const Function *F = CS.getCalledFunction()) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (isLoweredToCall(F))
HasCall = true;
if (F->getIntrinsicID() == Intrinsic::memcpy ||
@@ -259,7 +261,8 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
}
if (isa<StoreInst>(&I)) {
Type *MemAccessTy = I.getOperand(0)->getType();
- NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0);
+ NumStores += getMemoryOpCost(Instruction::Store, MemAccessTy, None, 0,
+ TTI::TCK_RecipThroughput);
}
}
@@ -291,6 +294,10 @@ void SystemZTTIImpl::getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
UP.Force = true;
}
+void SystemZTTIImpl::getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP) {
+ BaseT::getPeelingPreferences(L, SE, PP);
+}
bool SystemZTTIImpl::isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2) {
@@ -323,6 +330,23 @@ unsigned SystemZTTIImpl::getRegisterBitWidth(bool Vector) const {
return 0;
}
+unsigned SystemZTTIImpl::getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const {
+ // Don't prefetch a loop with many far apart accesses.
+ if (NumPrefetches > 16)
+ return UINT_MAX;
+
+ // Emit prefetch instructions for smaller strides in cases where we think
+ // the hardware prefetcher might not be able to keep up.
+ if (NumStridedMemAccesses > 32 &&
+ NumStridedMemAccesses == NumMemAccesses && !HasCall)
+ return 1;
+
+ return ST->hasMiscellaneousExtensions3() ? 8192 : 2048;
+}
+
bool SystemZTTIImpl::hasDivRemOp(Type *DataType, bool IsSigned) {
EVT VT = TLI->getValueType(DL, DataType);
return (VT.isScalarInteger() && TLI->isTypeLegal(VT));
@@ -341,18 +365,25 @@ static unsigned getScalarSizeInBits(Type *Ty) {
// type until it is legal. This would e.g. return 4 for <6 x i64>, instead of
// 3.
static unsigned getNumVectorRegs(Type *Ty) {
- assert(Ty->isVectorTy() && "Expected vector type");
- unsigned WideBits = getScalarSizeInBits(Ty) * Ty->getVectorNumElements();
+ auto *VTy = cast<FixedVectorType>(Ty);
+ unsigned WideBits = getScalarSizeInBits(Ty) * VTy->getNumElements();
assert(WideBits > 0 && "Could not compute size of vector");
return ((WideBits % 128U) ? ((WideBits / 128U) + 1) : (WideBits / 128U));
}
int SystemZTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Op1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
+
// TODO: return a good value for BB-VECTORIZER that includes the
// immediate loads, which we do not want to count for the loop
// vectorizer, since they are hopefully hoisted out of the loop. This
@@ -391,10 +422,59 @@ int SystemZTTIImpl::getArithmeticInstrCost(
}
}
- if (Ty->isVectorTy()) {
- assert(ST->hasVector() &&
- "getArithmeticInstrCost() called with vector type.");
- unsigned VF = Ty->getVectorNumElements();
+ if (!Ty->isVectorTy()) {
+ // These FP operations are supported with a dedicated instruction for
+ // float, double and fp128 (base implementation assumes float generally
+ // costs 2).
+ if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
+ Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
+ return 1;
+
+ // There is no native support for FRem.
+ if (Opcode == Instruction::FRem)
+ return LIBCALL_COST;
+
+ // Give discount for some combined logical operations if supported.
+ if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
+ if (Opcode == Instruction::Xor) {
+ for (const Value *A : Args) {
+ if (const Instruction *I = dyn_cast<Instruction>(A))
+ if (I->hasOneUse() &&
+ (I->getOpcode() == Instruction::And ||
+ I->getOpcode() == Instruction::Or ||
+ I->getOpcode() == Instruction::Xor))
+ return 0;
+ }
+ }
+ else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
+ for (const Value *A : Args) {
+ if (const Instruction *I = dyn_cast<Instruction>(A))
+ if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
+ return 0;
+ }
+ }
+ }
+
+ // Or requires one instruction, although it has custom handling for i64.
+ if (Opcode == Instruction::Or)
+ return 1;
+
+ if (Opcode == Instruction::Xor && ScalarBits == 1) {
+ if (ST->hasLoadStoreOnCond2())
+ return 5; // 2 * (li 0; loc 1); xor
+ return 7; // 2 * ipm sequences ; xor ; shift ; compare
+ }
+
+ if (DivRemConstPow2)
+ return (SignedDivRem ? SDivPow2Cost : 1);
+ if (DivRemConst)
+ return DivMulSeqCost;
+ if (SignedDivRem || UnsignedDivRem)
+ return DivInstrCost;
+ }
+ else if (ST->hasVector()) {
+ auto *VTy = cast<FixedVectorType>(Ty);
+ unsigned VF = VTy->getNumElements();
unsigned NumVectors = getNumVectorRegs(Ty);
// These vector operations are custom handled, but are still supported
@@ -407,7 +487,7 @@ int SystemZTTIImpl::getArithmeticInstrCost(
if (DivRemConstPow2)
return (NumVectors * (SignedDivRem ? SDivPow2Cost : 1));
if (DivRemConst)
- return VF * DivMulSeqCost + getScalarizationOverhead(Ty, Args);
+ return VF * DivMulSeqCost + getScalarizationOverhead(VTy, Args);
if ((SignedDivRem || UnsignedDivRem) && VF > 4)
// Temporary hack: disable high vectorization factors with integer
// division/remainder, which will get scalarized and handled with
@@ -429,8 +509,8 @@ int SystemZTTIImpl::getArithmeticInstrCost(
// Return the cost of multiple scalar invocation plus the cost of
// inserting and extracting the values.
unsigned ScalarCost =
- getArithmeticInstrCost(Opcode, Ty->getScalarType());
- unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(Ty, Args);
+ getArithmeticInstrCost(Opcode, Ty->getScalarType(), CostKind);
+ unsigned Cost = (VF * ScalarCost) + getScalarizationOverhead(VTy, Args);
// FIXME: VF 2 for these FP operations are currently just as
// expensive as for VF 4.
if (VF == 2)
@@ -447,101 +527,51 @@ int SystemZTTIImpl::getArithmeticInstrCost(
// There is no native support for FRem.
if (Opcode == Instruction::FRem) {
- unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(Ty, Args);
+ unsigned Cost = (VF * LIBCALL_COST) + getScalarizationOverhead(VTy, Args);
// FIXME: VF 2 for float is currently just as expensive as for VF 4.
if (VF == 2 && ScalarBits == 32)
Cost *= 2;
return Cost;
}
}
- else { // Scalar:
- // These FP operations are supported with a dedicated instruction for
- // float, double and fp128 (base implementation assumes float generally
- // costs 2).
- if (Opcode == Instruction::FAdd || Opcode == Instruction::FSub ||
- Opcode == Instruction::FMul || Opcode == Instruction::FDiv)
- return 1;
-
- // There is no native support for FRem.
- if (Opcode == Instruction::FRem)
- return LIBCALL_COST;
-
- // Give discount for some combined logical operations if supported.
- if (Args.size() == 2 && ST->hasMiscellaneousExtensions3()) {
- if (Opcode == Instruction::Xor) {
- for (const Value *A : Args) {
- if (const Instruction *I = dyn_cast<Instruction>(A))
- if (I->hasOneUse() &&
- (I->getOpcode() == Instruction::And ||
- I->getOpcode() == Instruction::Or ||
- I->getOpcode() == Instruction::Xor))
- return 0;
- }
- }
- else if (Opcode == Instruction::Or || Opcode == Instruction::And) {
- for (const Value *A : Args) {
- if (const Instruction *I = dyn_cast<Instruction>(A))
- if (I->hasOneUse() && I->getOpcode() == Instruction::Xor)
- return 0;
- }
- }
- }
-
- // Or requires one instruction, although it has custom handling for i64.
- if (Opcode == Instruction::Or)
- return 1;
-
- if (Opcode == Instruction::Xor && ScalarBits == 1) {
- if (ST->hasLoadStoreOnCond2())
- return 5; // 2 * (li 0; loc 1); xor
- return 7; // 2 * ipm sequences ; xor ; shift ; compare
- }
-
- if (DivRemConstPow2)
- return (SignedDivRem ? SDivPow2Cost : 1);
- if (DivRemConst)
- return DivMulSeqCost;
- if (SignedDivRem || UnsignedDivRem)
- return DivInstrCost;
- }
// Fallback to the default implementation.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info,
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info,
Opd1PropInfo, Opd2PropInfo, Args, CxtI);
}
-int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
- assert (Tp->isVectorTy());
- assert (ST->hasVector() && "getShuffleCost() called.");
- unsigned NumVectors = getNumVectorRegs(Tp);
+int SystemZTTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp,
+ int Index, VectorType *SubTp) {
+ if (ST->hasVector()) {
+ unsigned NumVectors = getNumVectorRegs(Tp);
- // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
+ // TODO: Since fp32 is expanded, the shuffle cost should always be 0.
- // FP128 values are always in scalar registers, so there is no work
- // involved with a shuffle, except for broadcast. In that case register
- // moves are done with a single instruction per element.
- if (Tp->getScalarType()->isFP128Ty())
- return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
+ // FP128 values are always in scalar registers, so there is no work
+ // involved with a shuffle, except for broadcast. In that case register
+ // moves are done with a single instruction per element.
+ if (Tp->getScalarType()->isFP128Ty())
+ return (Kind == TargetTransformInfo::SK_Broadcast ? NumVectors - 1 : 0);
- switch (Kind) {
- case TargetTransformInfo::SK_ExtractSubvector:
- // ExtractSubvector Index indicates start offset.
+ switch (Kind) {
+ case TargetTransformInfo::SK_ExtractSubvector:
+ // ExtractSubvector Index indicates start offset.
- // Extracting a subvector from first index is a noop.
- return (Index == 0 ? 0 : NumVectors);
+ // Extracting a subvector from first index is a noop.
+ return (Index == 0 ? 0 : NumVectors);
- case TargetTransformInfo::SK_Broadcast:
- // Loop vectorizer calls here to figure out the extra cost of
- // broadcasting a loaded value to all elements of a vector. Since vlrep
- // loads and replicates with a single instruction, adjust the returned
- // value.
- return NumVectors - 1;
+ case TargetTransformInfo::SK_Broadcast:
+ // Loop vectorizer calls here to figure out the extra cost of
+ // broadcasting a loaded value to all elements of a vector. Since vlrep
+ // loads and replicates with a single instruction, adjust the returned
+ // value.
+ return NumVectors - 1;
- default:
+ default:
- // SystemZ supports single instruction permutation / replication.
- return NumVectors;
+ // SystemZ supports single instruction permutation / replication.
+ return NumVectors;
+ }
}
return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
@@ -564,8 +594,9 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
assert (SrcTy->isVectorTy() && DstTy->isVectorTy());
assert (SrcTy->getPrimitiveSizeInBits() > DstTy->getPrimitiveSizeInBits() &&
"Packing must reduce size of vector type.");
- assert (SrcTy->getVectorNumElements() == DstTy->getVectorNumElements() &&
- "Packing should not change number of elements.");
+ assert(cast<FixedVectorType>(SrcTy)->getNumElements() ==
+ cast<FixedVectorType>(DstTy)->getNumElements() &&
+ "Packing should not change number of elements.");
// TODO: Since fp32 is expanded, the extract cost should always be 0.
@@ -580,7 +611,7 @@ getVectorTruncCost(Type *SrcTy, Type *DstTy) {
unsigned Cost = 0;
unsigned Log2Diff = getElSizeLog2Diff(SrcTy, DstTy);
- unsigned VF = SrcTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcTy)->getNumElements();
for (unsigned P = 0; P < Log2Diff; ++P) {
if (NumParts > 1)
NumParts /= 2;
@@ -642,7 +673,7 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
// Return the potentially vectorized type based on 'I' and 'VF'. 'I' may
// be either scalar or already vectorized with a same or lesser VF.
Type *ElTy = OpTy->getScalarType();
- return VectorType::get(ElTy, VF);
+ return FixedVectorType::get(ElTy, VF);
}
return nullptr;
@@ -653,8 +684,8 @@ static Type *getCmpOpsType(const Instruction *I, unsigned VF = 1) {
unsigned SystemZTTIImpl::
getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
const Instruction *I) {
- assert (Dst->isVectorTy());
- unsigned VF = Dst->getVectorNumElements();
+ auto *DstVTy = cast<FixedVectorType>(Dst);
+ unsigned VF = DstVTy->getNumElements();
unsigned Cost = 0;
// If we know what the widths of the compared operands, get any cost of
// converting it to match Dst. Otherwise assume same widths.
@@ -668,14 +699,50 @@ getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
}
int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // FIXME: Can the logic below also be used for these cost kinds?
+ if (CostKind == TTI::TCK_CodeSize || CostKind == TTI::TCK_SizeAndLatency) {
+ int BaseCost = BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
+ return BaseCost == 0 ? BaseCost : 1;
+ }
+
unsigned DstScalarBits = Dst->getScalarSizeInBits();
unsigned SrcScalarBits = Src->getScalarSizeInBits();
- if (Src->isVectorTy()) {
- assert (ST->hasVector() && "getCastInstrCost() called with vector type.");
- assert (Dst->isVectorTy());
- unsigned VF = Src->getVectorNumElements();
+ if (!Src->isVectorTy()) {
+ assert (!Dst->isVectorTy());
+
+ if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
+ if (SrcScalarBits >= 32 ||
+ (I != nullptr && isa<LoadInst>(I->getOperand(0))))
+ return 1;
+ return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
+ }
+
+ if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
+ Src->isIntegerTy(1)) {
+ if (ST->hasLoadStoreOnCond2())
+ return 2; // li 0; loc 1
+
+ // This should be extension of a compare i1 result, which is done with
+ // ipm and a varying sequence of instructions.
+ unsigned Cost = 0;
+ if (Opcode == Instruction::SExt)
+ Cost = (DstScalarBits < 64 ? 3 : 4);
+ if (Opcode == Instruction::ZExt)
+ Cost = 3;
+ Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
+ if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
+ // If operands of an fp-type was compared, this costs +1.
+ Cost++;
+ return Cost;
+ }
+ }
+ else if (ST->hasVector()) {
+ auto *SrcVecTy = cast<FixedVectorType>(Src);
+ auto *DstVecTy = cast<FixedVectorType>(Dst);
+ unsigned VF = SrcVecTy->getNumElements();
unsigned NumDstVectors = getNumVectorRegs(Dst);
unsigned NumSrcVectors = getNumVectorRegs(Src);
@@ -720,7 +787,7 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// inserting and extracting the values. Base implementation does not
// realize float->int gets scalarized.
unsigned ScalarCost = getCastInstrCost(Opcode, Dst->getScalarType(),
- Src->getScalarType());
+ Src->getScalarType(), CostKind);
unsigned TotCost = VF * ScalarCost;
bool NeedsInserts = true, NeedsExtracts = true;
// FP128 registers do not get inserted or extracted.
@@ -731,8 +798,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
(Opcode == Instruction::FPToSI || Opcode == Instruction::FPToUI))
NeedsExtracts = false;
- TotCost += getScalarizationOverhead(Src, false, NeedsExtracts);
- TotCost += getScalarizationOverhead(Dst, NeedsInserts, false);
+ TotCost += getScalarizationOverhead(SrcVecTy, false, NeedsExtracts);
+ TotCost += getScalarizationOverhead(DstVecTy, NeedsInserts, false);
// FIXME: VF 2 for float<->i32 is currently just as expensive as for VF 4.
if (VF == 2 && SrcScalarBits == 32 && DstScalarBits == 32)
@@ -743,7 +810,8 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (Opcode == Instruction::FPTrunc) {
if (SrcScalarBits == 128) // fp128 -> double/float + inserts of elements.
- return VF /*ldxbr/lexbr*/ + getScalarizationOverhead(Dst, true, false);
+ return VF /*ldxbr/lexbr*/ +
+ getScalarizationOverhead(DstVecTy, true, false);
else // double -> float
return VF / 2 /*vledb*/ + std::max(1U, VF / 4 /*vperm*/);
}
@@ -756,40 +824,11 @@ int SystemZTTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
return VF * 2;
}
// -> fp128. VF * lxdb/lxeb + extraction of elements.
- return VF + getScalarizationOverhead(Src, false, true);
+ return VF + getScalarizationOverhead(SrcVecTy, false, true);
}
}
- else { // Scalar
- assert (!Dst->isVectorTy());
-
- if (Opcode == Instruction::SIToFP || Opcode == Instruction::UIToFP) {
- if (SrcScalarBits >= 32 ||
- (I != nullptr && isa<LoadInst>(I->getOperand(0))))
- return 1;
- return SrcScalarBits > 1 ? 2 /*i8/i16 extend*/ : 5 /*branch seq.*/;
- }
- if ((Opcode == Instruction::ZExt || Opcode == Instruction::SExt) &&
- Src->isIntegerTy(1)) {
- if (ST->hasLoadStoreOnCond2())
- return 2; // li 0; loc 1
-
- // This should be extension of a compare i1 result, which is done with
- // ipm and a varying sequence of instructions.
- unsigned Cost = 0;
- if (Opcode == Instruction::SExt)
- Cost = (DstScalarBits < 64 ? 3 : 4);
- if (Opcode == Instruction::ZExt)
- Cost = 3;
- Type *CmpOpTy = ((I != nullptr) ? getCmpOpsType(I) : nullptr);
- if (CmpOpTy != nullptr && CmpOpTy->isFloatingPointTy())
- // If operands of an fp-type was compared, this costs +1.
- Cost++;
- return Cost;
- }
- }
-
- return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+ return BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I);
}
// Scalar i8 / i16 operations will typically be made after first extending
@@ -805,10 +844,38 @@ static unsigned getOperandsExtensionCost(const Instruction *I) {
}
int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
- Type *CondTy, const Instruction *I) {
- if (ValTy->isVectorTy()) {
- assert (ST->hasVector() && "getCmpSelInstrCost() called with vector type.");
- unsigned VF = ValTy->getVectorNumElements();
+ Type *CondTy,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
+
+ if (!ValTy->isVectorTy()) {
+ switch (Opcode) {
+ case Instruction::ICmp: {
+ // A loaded value compared with 0 with multiple users becomes Load and
+ // Test. The load is then not foldable, so return 0 cost for the ICmp.
+ unsigned ScalarBits = ValTy->getScalarSizeInBits();
+ if (I != nullptr && ScalarBits >= 32)
+ if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
+ if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
+ if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
+ C->getZExtValue() == 0)
+ return 0;
+
+ unsigned Cost = 1;
+ if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
+ Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
+ return Cost;
+ }
+ case Instruction::Select:
+ if (ValTy->isFloatingPointTy())
+ return 4; // No load on condition for FP - costs a conditional jump.
+ return 1; // Load On Condition / Select Register.
+ }
+ }
+ else if (ST->hasVector()) {
+ unsigned VF = cast<FixedVectorType>(ValTy)->getNumElements();
// Called with a compare instruction.
if (Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) {
@@ -856,32 +923,8 @@ int SystemZTTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy,
return getNumVectorRegs(ValTy) /*vsel*/ + PackCost;
}
}
- else { // Scalar
- switch (Opcode) {
- case Instruction::ICmp: {
- // A loaded value compared with 0 with multiple users becomes Load and
- // Test. The load is then not foldable, so return 0 cost for the ICmp.
- unsigned ScalarBits = ValTy->getScalarSizeInBits();
- if (I != nullptr && ScalarBits >= 32)
- if (LoadInst *Ld = dyn_cast<LoadInst>(I->getOperand(0)))
- if (const ConstantInt *C = dyn_cast<ConstantInt>(I->getOperand(1)))
- if (!Ld->hasOneUse() && Ld->getParent() == I->getParent() &&
- C->getZExtValue() == 0)
- return 0;
-
- unsigned Cost = 1;
- if (ValTy->isIntegerTy() && ValTy->getScalarSizeInBits() <= 16)
- Cost += (I != nullptr ? getOperandsExtensionCost(I) : 2);
- return Cost;
- }
- case Instruction::Select:
- if (ValTy->isFloatingPointTy())
- return 4; // No load on condition for FP - costs a conditional jump.
- return 1; // Load On Condition / Select Register.
- }
- }
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, nullptr);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind);
}
int SystemZTTIImpl::
@@ -995,9 +1038,14 @@ static bool isBswapIntrinsicCall(const Value *V) {
int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
assert(!Src->isVoidTy() && "Invalid type");
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
if (!Src->isVectorTy() && Opcode == Instruction::Load && I != nullptr) {
// Store the load or its truncated or extended value in FoldedValue.
const Instruction *FoldedValue = nullptr;
@@ -1058,16 +1106,13 @@ int SystemZTTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// needed for using / defining the vector operands. The SystemZ version does
// roughly the same but bases the computations on vector permutations
// instead.
-int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int SystemZTTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
assert(isa<VectorType>(VecTy) &&
"Expect a vector type for interleaved memory op");
@@ -1075,7 +1120,7 @@ int SystemZTTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
// Return the ceiling of dividing A by B.
auto ceil = [](unsigned A, unsigned B) { return (A + B - 1) / B; };
- unsigned NumElts = VecTy->getVectorNumElements();
+ unsigned NumElts = cast<FixedVectorType>(VecTy)->getNumElements();
assert(Factor > 1 && NumElts % Factor == 0 && "Invalid interleave factor");
unsigned VF = NumElts / Factor;
unsigned NumEltsPerVecReg = (128U / getScalarSizeInBits(VecTy));
@@ -1125,22 +1170,10 @@ static int getVectorIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy) {
return -1;
}
-int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args,
- FastMathFlags FMF, unsigned VF) {
- int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
- if (Cost != -1)
- return Cost;
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Args, FMF, VF);
-}
-
-int SystemZTTIImpl::getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys,
- FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
- int Cost = getVectorIntrinsicInstrCost(ID, RetTy);
+int SystemZTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ int Cost = getVectorIntrinsicInstrCost(ICA.getID(), ICA.getReturnType());
if (Cost != -1)
return Cost;
- return BaseT::getIntrinsicInstrCost(ID, RetTy, Tys,
- FMF, ScalarizationCostPassed);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
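getMinPrefetchStride above encodes three cases: give up entirely above 16 prefetches, force a stride of 1 for loops whose more than 32 memory accesses are all strided and call-free, and otherwise require a large stride (8192 bytes with miscellaneous-extensions-3, 2048 without). A standalone restatement of the heuristic with a few sample inputs:

#include <climits>
#include <cstdio>

// Mirrors the new SystemZ getMinPrefetchStride heuristic.
static unsigned minPrefetchStride(unsigned NumMemAccesses,
                                  unsigned NumStridedMemAccesses,
                                  unsigned NumPrefetches, bool HasCall,
                                  bool HasMiscExt3) {
  if (NumPrefetches > 16)
    return UINT_MAX; // too many far-apart accesses: don't prefetch at all
  if (NumStridedMemAccesses > 32 && NumStridedMemAccesses == NumMemAccesses &&
      !HasCall)
    return 1;        // emit prefetches even for small strides
  return HasMiscExt3 ? 8192 : 2048;
}

int main() {
  std::printf("%u\n", minPrefetchStride(40, 40, 4, false, false));  // 1
  std::printf("%u\n", minPrefetchStride(40, 40, 20, false, false)); // UINT_MAX
  std::printf("%u\n", minPrefetchStride(10, 8, 4, false, true));    // 8192
}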
diff --git a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
index bc4d066881c1..7f8f7f6f923f 100644
--- a/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
+++ b/llvm/lib/Target/SystemZ/SystemZTargetTransformInfo.h
@@ -38,17 +38,21 @@ public:
unsigned getInliningThresholdMultiplier() { return 3; }
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
+ Type *Ty, TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
TTI::PopcntSupportKind getPopcntSupport(unsigned TyWidth);
void getUnrollingPreferences(Loop *L, ScalarEvolution &SE,
TTI::UnrollingPreferences &UP);
+ void getPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ TTI::PeelingPreferences &PP);
+
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
/// @}
@@ -60,8 +64,12 @@ public:
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getCacheLineSize() const override { return 256; }
- unsigned getPrefetchDistance() const override { return 2000; }
- unsigned getMinPrefetchStride() const override { return 2048; }
+ unsigned getPrefetchDistance() const override { return 4500; }
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) const override;
+ bool enableWritePrefetching() const override { return true; }
bool hasDivRemOp(Type *DataType, bool IsSigned);
bool prefersVectorizedAddressing() { return false; }
@@ -71,40 +79,39 @@ public:
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
unsigned getVectorTruncCost(Type *SrcTy, Type *DstTy);
unsigned getVectorBitmaskConversionCost(Type *SrcTy, Type *DstTy);
unsigned getBoolVecToIntConversionCost(unsigned Opcode, Type *Dst,
const Instruction *I);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
bool isFoldableLoad(const LoadInst *Ld, const Instruction *&FoldedValue);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
-
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
-
- int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF = 1);
- int getIntrinsicInstrCost(Intrinsic::ID ID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
+ unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
/// @}
};
diff --git a/llvm/lib/Target/Target.cpp b/llvm/lib/Target/Target.cpp
index 8a46c77492c5..7411687e2ca3 100644
--- a/llvm/lib/Target/Target.cpp
+++ b/llvm/lib/Target/Target.cpp
@@ -111,11 +111,11 @@ unsigned long long LLVMABISizeOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
}
unsigned LLVMABIAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
- return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+ return unwrap(TD)->getABITypeAlign(unwrap(Ty)).value();
}
unsigned LLVMCallFrameAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
- return unwrap(TD)->getABITypeAlignment(unwrap(Ty));
+ return unwrap(TD)->getABITypeAlign(unwrap(Ty)).value();
}
unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
@@ -124,7 +124,9 @@ unsigned LLVMPreferredAlignmentOfType(LLVMTargetDataRef TD, LLVMTypeRef Ty) {
unsigned LLVMPreferredAlignmentOfGlobal(LLVMTargetDataRef TD,
LLVMValueRef GlobalVar) {
- return unwrap(TD)->getPreferredAlignment(unwrap<GlobalVariable>(GlobalVar));
+ return unwrap(TD)
+ ->getPreferredAlign(unwrap<GlobalVariable>(GlobalVar))
+ .value();
}
unsigned LLVMElementAtOffset(LLVMTargetDataRef TD, LLVMTypeRef StructTy,
diff --git a/llvm/lib/Target/TargetLoweringObjectFile.cpp b/llvm/lib/Target/TargetLoweringObjectFile.cpp
index dcd3934de0fa..eea0aeea2c45 100644
--- a/llvm/lib/Target/TargetLoweringObjectFile.cpp
+++ b/llvm/lib/Target/TargetLoweringObjectFile.cpp
@@ -19,10 +19,12 @@
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/Mangler.h"
+#include "llvm/IR/Module.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/MC/SectionKind.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Target/TargetMachine.h"
@@ -38,11 +40,10 @@ using namespace llvm;
/// lowering implementations a chance to set up their default sections.
void TargetLoweringObjectFile::Initialize(MCContext &ctx,
const TargetMachine &TM) {
- Ctx = &ctx;
// `Initialize` can be called more than once.
delete Mang;
Mang = new Mangler();
- InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), *Ctx,
+ InitMCObjectFileInfo(TM.getTargetTriple(), TM.isPositionIndependent(), ctx,
TM.getCodeModel() == CodeModel::Large);
// Reset various EH DWARF encodings.
@@ -121,7 +122,7 @@ MCSymbol *TargetLoweringObjectFile::getSymbolWithGlobalValueBase(
NameStr += GV->getParent()->getDataLayout().getPrivateGlobalPrefix();
TM.getNameWithPrefix(NameStr, GV, *Mang);
NameStr.append(Suffix.begin(), Suffix.end());
- return Ctx->getOrCreateSymbol(NameStr);
+ return getContext().getOrCreateSymbol(NameStr);
}
MCSymbol *TargetLoweringObjectFile::getCFIPersonalitySymbol(
@@ -142,13 +143,17 @@ void TargetLoweringObjectFile::emitPersonalityValue(MCStreamer &Streamer,
/// may be overridden by the target implementation.
SectionKind TargetLoweringObjectFile::getKindForGlobal(const GlobalObject *GO,
const TargetMachine &TM){
- assert(!GO->isDeclaration() && !GO->hasAvailableExternallyLinkage() &&
+ assert(!GO->isDeclarationForLinker() &&
"Can only be used for global definitions");
// Functions are classified as text sections.
if (isa<Function>(GO))
return SectionKind::getText();
+ // Basic blocks are classified as text sections.
+ if (isa<BasicBlock>(GO))
+ return SectionKind::getText();
+
// Global variables require more detailed analysis.
const auto *GVar = cast<GlobalVariable>(GO);
@@ -268,12 +273,21 @@ MCSection *TargetLoweringObjectFile::SectionForGlobal(
return SelectSectionForGlobal(GO, Kind, TM);
}
+/// This method computes the appropriate section to emit the specified global
+/// variable or function definition. This should not be passed external (or
+/// available externally) globals.
+MCSection *
+TargetLoweringObjectFile::SectionForGlobal(const GlobalObject *GO,
+ const TargetMachine &TM) const {
+ return SectionForGlobal(GO, getKindForGlobal(GO, TM), TM);
+}
+
MCSection *TargetLoweringObjectFile::getSectionForJumpTable(
const Function &F, const TargetMachine &TM) const {
- unsigned Align = 0;
+ Align Alignment(1);
return getSectionForConstant(F.getParent()->getDataLayout(),
SectionKind::getReadOnly(), /*C=*/nullptr,
- Align);
+ Alignment);
}
bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
@@ -295,13 +309,19 @@ bool TargetLoweringObjectFile::shouldPutJumpTableInFunctionSection(
/// information, return a section that it should be placed in.
MCSection *TargetLoweringObjectFile::getSectionForConstant(
const DataLayout &DL, SectionKind Kind, const Constant *C,
- unsigned &Align) const {
+ Align &Alignment) const {
if (Kind.isReadOnly() && ReadOnlySection != nullptr)
return ReadOnlySection;
return DataSection;
}
+MCSection *TargetLoweringObjectFile::getSectionForMachineBasicBlock(
+ const Function &F, const MachineBasicBlock &MBB,
+ const TargetMachine &TM) const {
+ return nullptr;
+}
+
/// getTTypeGlobalReference - Return an MCExpr to use for a
/// reference to the specified global variable from exception
/// handling information.
@@ -327,7 +347,7 @@ getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
// Emit a label to the streamer for the current position. This gives us
// .-foo addressing.
MCSymbol *PCSym = getContext().createTempSymbol();
- Streamer.EmitLabel(PCSym);
+ Streamer.emitLabel(PCSym);
const MCExpr *PC = MCSymbolRefExpr::create(PCSym, getContext());
return MCBinaryExpr::createSub(Sym, PC, getContext());
}
@@ -337,7 +357,7 @@ getTTypeReference(const MCSymbolRefExpr *Sym, unsigned Encoding,
const MCExpr *TargetLoweringObjectFile::getDebugThreadLocalSymbol(const MCSymbol *Sym) const {
// FIXME: It's not clear what, if any, default this should have - perhaps a
// null return could mean 'no location' & we should just do that here.
- return MCSymbolRefExpr::create(Sym, *Ctx);
+ return MCSymbolRefExpr::create(Sym, getContext());
}
void TargetLoweringObjectFile::getNameWithPrefix(
diff --git a/llvm/lib/Target/TargetMachine.cpp b/llvm/lib/Target/TargetMachine.cpp
index 97a1eb2f190a..074e9fde79e6 100644
--- a/llvm/lib/Target/TargetMachine.cpp
+++ b/llvm/lib/Target/TargetMachine.cpp
@@ -34,10 +34,10 @@ using namespace llvm;
TargetMachine::TargetMachine(const Target &T, StringRef DataLayoutString,
const Triple &TT, StringRef CPU, StringRef FS,
const TargetOptions &Options)
- : TheTarget(T), DL(DataLayoutString), TargetTriple(TT), TargetCPU(CPU),
- TargetFS(FS), AsmInfo(nullptr), MRI(nullptr), MII(nullptr), STI(nullptr),
- RequireStructuredCFG(false), O0WantsFastISel(false),
- DefaultOptions(Options), Options(Options) {}
+ : TheTarget(T), DL(DataLayoutString), TargetTriple(TT),
+ TargetCPU(std::string(CPU)), TargetFS(std::string(FS)), AsmInfo(nullptr),
+ MRI(nullptr), MII(nullptr), STI(nullptr), RequireStructuredCFG(false),
+ O0WantsFastISel(false), DefaultOptions(Options), Options(Options) {}
TargetMachine::~TargetMachine() = default;
@@ -46,17 +46,17 @@ bool TargetMachine::isPositionIndependent() const {
}
/// Reset the target options based on the function's attributes.
+/// setFunctionAttributes should have made the raw attribute value consistent
+/// with the command line flag if used.
+//
// FIXME: This function needs to go away for a number of reasons:
// a) global state on the TargetMachine is terrible in general,
// b) these target options should be passed only on the function
// and not on the TargetMachine (via TargetOptions) at all.
void TargetMachine::resetTargetOptions(const Function &F) const {
-#define RESET_OPTION(X, Y) \
- do { \
- if (F.hasFnAttribute(Y)) \
- Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \
- else \
- Options.X = DefaultOptions.X; \
+#define RESET_OPTION(X, Y) \
+ do { \
+ Options.X = (F.getFnAttribute(Y).getValueAsString() == "true"); \
} while (0)
RESET_OPTION(UnsafeFPMath, "unsafe-fp-math");
@@ -193,6 +193,14 @@ bool TargetMachine::shouldAssumeDSOLocal(const Module &M,
// Check if we can use copy relocations.
if (!(GV && GV->isThreadLocal()) && RM == Reloc::Static)
return true;
+ } else if (TT.isOSBinFormatELF()) {
+ // If dso_local allows AsmPrinter::getSymbolPreferLocal to use a local
+ // alias, set the flag. We cannot set dso_local for other global values,
+ // because otherwise direct accesses to a probably interposable symbol (even
+ // if the codegen assumes not) will be rejected by the linker.
+ if (!GV || !GV->canBenefitFromLocalAlias())
+ return false;
+ return TT.isX86() && M.noSemanticInterposition();
}
// ELF & wasm support preemption of other symbols.
@@ -258,6 +266,10 @@ void TargetMachine::getNameWithPrefix(SmallVectorImpl<char> &Name,
MCSymbol *TargetMachine::getSymbol(const GlobalValue *GV) const {
const TargetLoweringObjectFile *TLOF = getObjFileLowering();
+ // XCOFF symbols could have a special naming convention.
+ if (MCSymbol *TargetSymbol = TLOF->getTargetSymbol(GV, *this))
+ return TargetSymbol;
+
SmallString<128> NameStr;
getNameWithPrefix(NameStr, GV, TLOF->getMangler());
return TLOF->getContext().getOrCreateSymbol(NameStr);
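After the RESET_OPTION change above, each per-function option is taken from the function attribute unconditionally, on the assumption that setFunctionAttributes already folded the command-line default into the attribute. A small sketch of that attribute-driven reset, with a plain map standing in for function attributes (the second attribute name is just illustrative):

#include <iostream>
#include <map>
#include <string>

struct OptionsSketch {
  bool UnsafeFPMath = false;
  bool NoInfsFPMath = false;
};

using AttrMap = std::map<std::string, std::string>;

// Each option is simply whatever the function attribute says; a missing
// attribute reads as "" and therefore as false, matching the new macro.
static void resetOptions(OptionsSketch &Opts, const AttrMap &FnAttrs) {
  auto get = [&](const char *Key) {
    auto It = FnAttrs.find(Key);
    return It != FnAttrs.end() ? It->second : std::string();
  };
  Opts.UnsafeFPMath = (get("unsafe-fp-math") == "true");
  Opts.NoInfsFPMath = (get("no-infs-fp-math") == "true");
}

int main() {
  OptionsSketch Opts;
  resetOptions(Opts, {{"unsafe-fp-math", "true"}});
  std::cout << Opts.UnsafeFPMath << " " << Opts.NoInfsFPMath << "\n"; // 1 0
}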
diff --git a/llvm/lib/Target/TargetMachineC.cpp b/llvm/lib/Target/TargetMachineC.cpp
index a38633e1f27e..60fe84cadacc 100644
--- a/llvm/lib/Target/TargetMachineC.cpp
+++ b/llvm/lib/Target/TargetMachineC.cpp
@@ -164,12 +164,12 @@ char* LLVMGetTargetMachineTriple(LLVMTargetMachineRef T) {
}
char* LLVMGetTargetMachineCPU(LLVMTargetMachineRef T) {
- std::string StringRep = unwrap(T)->getTargetCPU();
+ std::string StringRep = std::string(unwrap(T)->getTargetCPU());
return strdup(StringRep.c_str());
}
char* LLVMGetTargetMachineFeatureString(LLVMTargetMachineRef T) {
- std::string StringRep = unwrap(T)->getTargetFeatureString();
+ std::string StringRep = std::string(unwrap(T)->getTargetFeatureString());
return strdup(StringRep.c_str());
}
diff --git a/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
new file mode 100644
index 000000000000..7a899b4b38e2
--- /dev/null
+++ b/llvm/lib/Target/VE/AsmParser/VEAsmParser.cpp
@@ -0,0 +1,1454 @@
+//===-- VEAsmParser.cpp - Parse VE assembly to MCInst instructions --------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEMCExpr.h"
+#include "MCTargetDesc/VEMCTargetDesc.h"
+#include "TargetInfo/VETargetInfo.h"
+#include "VE.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCParser/MCAsmLexer.h"
+#include "llvm/MC/MCParser/MCAsmParser.h"
+#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
+#include "llvm/MC/MCParser/MCTargetAsmParser.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCStreamer.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Support/raw_ostream.h"
+#include <algorithm>
+#include <memory>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ve-asmparser"
+
+namespace {
+
+class VEOperand;
+
+class VEAsmParser : public MCTargetAsmParser {
+ MCAsmParser &Parser;
+
+ /// @name Auto-generated Match Functions
+ /// {
+
+#define GET_ASSEMBLER_HEADER
+#include "VEGenAsmMatcher.inc"
+
+ /// }
+
+ // public interface of the MCTargetAsmParser.
+ bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands, MCStreamer &Out,
+ uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) override;
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ int parseRegisterName(unsigned (*matchFn)(StringRef));
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
+ bool ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) override;
+ bool ParseDirective(AsmToken DirectiveID) override;
+
+ unsigned validateTargetOperandClass(MCParsedAsmOperand &Op,
+ unsigned Kind) override;
+
+ // Custom parse functions for VE specific operands.
+ OperandMatchResultTy parseMEMOperand(OperandVector &Operands);
+ OperandMatchResultTy parseMEMAsOperand(OperandVector &Operands);
+ OperandMatchResultTy parseCCOpOperand(OperandVector &Operands);
+ OperandMatchResultTy parseRDOpOperand(OperandVector &Operands);
+ OperandMatchResultTy parseMImmOperand(OperandVector &Operands);
+ OperandMatchResultTy parseOperand(OperandVector &Operands, StringRef Name);
+ OperandMatchResultTy parseVEAsmOperand(std::unique_ptr<VEOperand> &Operand);
+
+ // Helper function to parse expression with a symbol.
+ const MCExpr *extractModifierFromExpr(const MCExpr *E,
+ VEMCExpr::VariantKind &Variant);
+ const MCExpr *fixupVariantKind(const MCExpr *E);
+ bool parseExpression(const MCExpr *&EVal);
+
+ // Split the mnemonic, stripping conditional code and quantifiers.
+ StringRef splitMnemonic(StringRef Name, SMLoc NameLoc,
+ OperandVector *Operands);
+
+public:
+ VEAsmParser(const MCSubtargetInfo &sti, MCAsmParser &parser,
+ const MCInstrInfo &MII, const MCTargetOptions &Options)
+ : MCTargetAsmParser(Options, sti, MII), Parser(parser) {
+ // Initialize the set of available features.
+ setAvailableFeatures(ComputeAvailableFeatures(getSTI().getFeatureBits()));
+ }
+};
+
+} // end anonymous namespace
+
+static const MCPhysReg I32Regs[64] = {
+ VE::SW0, VE::SW1, VE::SW2, VE::SW3, VE::SW4, VE::SW5, VE::SW6,
+ VE::SW7, VE::SW8, VE::SW9, VE::SW10, VE::SW11, VE::SW12, VE::SW13,
+ VE::SW14, VE::SW15, VE::SW16, VE::SW17, VE::SW18, VE::SW19, VE::SW20,
+ VE::SW21, VE::SW22, VE::SW23, VE::SW24, VE::SW25, VE::SW26, VE::SW27,
+ VE::SW28, VE::SW29, VE::SW30, VE::SW31, VE::SW32, VE::SW33, VE::SW34,
+ VE::SW35, VE::SW36, VE::SW37, VE::SW38, VE::SW39, VE::SW40, VE::SW41,
+ VE::SW42, VE::SW43, VE::SW44, VE::SW45, VE::SW46, VE::SW47, VE::SW48,
+ VE::SW49, VE::SW50, VE::SW51, VE::SW52, VE::SW53, VE::SW54, VE::SW55,
+ VE::SW56, VE::SW57, VE::SW58, VE::SW59, VE::SW60, VE::SW61, VE::SW62,
+ VE::SW63};
+
+static const MCPhysReg F32Regs[64] = {
+ VE::SF0, VE::SF1, VE::SF2, VE::SF3, VE::SF4, VE::SF5, VE::SF6,
+ VE::SF7, VE::SF8, VE::SF9, VE::SF10, VE::SF11, VE::SF12, VE::SF13,
+ VE::SF14, VE::SF15, VE::SF16, VE::SF17, VE::SF18, VE::SF19, VE::SF20,
+ VE::SF21, VE::SF22, VE::SF23, VE::SF24, VE::SF25, VE::SF26, VE::SF27,
+ VE::SF28, VE::SF29, VE::SF30, VE::SF31, VE::SF32, VE::SF33, VE::SF34,
+ VE::SF35, VE::SF36, VE::SF37, VE::SF38, VE::SF39, VE::SF40, VE::SF41,
+ VE::SF42, VE::SF43, VE::SF44, VE::SF45, VE::SF46, VE::SF47, VE::SF48,
+ VE::SF49, VE::SF50, VE::SF51, VE::SF52, VE::SF53, VE::SF54, VE::SF55,
+ VE::SF56, VE::SF57, VE::SF58, VE::SF59, VE::SF60, VE::SF61, VE::SF62,
+ VE::SF63};
+
+static const MCPhysReg F128Regs[32] = {
+ VE::Q0, VE::Q1, VE::Q2, VE::Q3, VE::Q4, VE::Q5, VE::Q6, VE::Q7,
+ VE::Q8, VE::Q9, VE::Q10, VE::Q11, VE::Q12, VE::Q13, VE::Q14, VE::Q15,
+ VE::Q16, VE::Q17, VE::Q18, VE::Q19, VE::Q20, VE::Q21, VE::Q22, VE::Q23,
+ VE::Q24, VE::Q25, VE::Q26, VE::Q27, VE::Q28, VE::Q29, VE::Q30, VE::Q31};
+
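+// MISCRegs maps the miscellaneous-register numbers accepted in assembly
+// (parsed as immediates and converted in MorphToMISCReg below) to physical
+// registers; VE::NoRegister entries mark numbers with no matching register.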
+static const MCPhysReg MISCRegs[31] = {
+ VE::USRCC, VE::PSW, VE::SAR, VE::NoRegister,
+ VE::NoRegister, VE::NoRegister, VE::NoRegister, VE::PMMR,
+ VE::PMCR0, VE::PMCR1, VE::PMCR2, VE::PMCR3,
+ VE::NoRegister, VE::NoRegister, VE::NoRegister, VE::NoRegister,
+ VE::PMC0, VE::PMC1, VE::PMC2, VE::PMC3,
+ VE::PMC4, VE::PMC5, VE::PMC6, VE::PMC7,
+ VE::PMC8, VE::PMC9, VE::PMC10, VE::PMC11,
+ VE::PMC12, VE::PMC13, VE::PMC14};
+
+namespace {
+
+/// VEOperand - Instances of this class represent a parsed VE machine
+/// instruction.
+class VEOperand : public MCParsedAsmOperand {
+private:
+ enum KindTy {
+ k_Token,
+ k_Register,
+ k_Immediate,
+ // SX-Aurora ASX form is disp(index, base).
+ k_MemoryRegRegImm, // base=reg, index=reg, disp=imm
+ k_MemoryRegImmImm, // base=reg, index=imm, disp=imm
+ k_MemoryZeroRegImm, // base=0, index=reg, disp=imm
+ k_MemoryZeroImmImm, // base=0, index=imm, disp=imm
+ // SX-Aurora AS form is disp(base).
+ k_MemoryRegImm, // base=reg, disp=imm
+ k_MemoryZeroImm, // base=0, disp=imm
+ // Other special cases for Aurora VE
+ k_CCOp, // condition code
+ k_RDOp, // rounding mode
+    k_MImmOp, // Special immediate written as (m)0 or (m)1: a value built
+              // from a contiguous run of 0s or 1s.
+ } Kind;
+
+ SMLoc StartLoc, EndLoc;
+
+ struct Token {
+ const char *Data;
+ unsigned Length;
+ };
+
+ struct RegOp {
+ unsigned RegNum;
+ };
+
+ struct ImmOp {
+ const MCExpr *Val;
+ };
+
+ struct MemOp {
+ unsigned Base;
+ unsigned IndexReg;
+ const MCExpr *Index;
+ const MCExpr *Offset;
+ };
+
+ struct CCOp {
+ unsigned CCVal;
+ };
+
+ struct RDOp {
+ unsigned RDVal;
+ };
+
+ struct MImmOp {
+ const MCExpr *Val;
+ bool M0Flag;
+ };
+
+ union {
+ struct Token Tok;
+ struct RegOp Reg;
+ struct ImmOp Imm;
+ struct MemOp Mem;
+ struct CCOp CC;
+ struct RDOp RD;
+ struct MImmOp MImm;
+ };
+
+public:
+ VEOperand(KindTy K) : MCParsedAsmOperand(), Kind(K) {}
+
+ bool isToken() const override { return Kind == k_Token; }
+ bool isReg() const override { return Kind == k_Register; }
+ bool isImm() const override { return Kind == k_Immediate; }
+ bool isMem() const override {
+ return isMEMrri() || isMEMrii() || isMEMzri() || isMEMzii() || isMEMri() ||
+ isMEMzi();
+ }
+ bool isMEMrri() const { return Kind == k_MemoryRegRegImm; }
+ bool isMEMrii() const { return Kind == k_MemoryRegImmImm; }
+ bool isMEMzri() const { return Kind == k_MemoryZeroRegImm; }
+ bool isMEMzii() const { return Kind == k_MemoryZeroImmImm; }
+ bool isMEMri() const { return Kind == k_MemoryRegImm; }
+ bool isMEMzi() const { return Kind == k_MemoryZeroImm; }
+ bool isCCOp() const { return Kind == k_CCOp; }
+ bool isRDOp() const { return Kind == k_RDOp; }
+ bool isZero() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return Value == 0;
+ }
+ return false;
+ }
+ bool isUImm0to2() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return Value >= 0 && Value < 3;
+ }
+ return false;
+ }
+ bool isUImm1() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<1>(Value);
+ }
+ return false;
+ }
+ bool isUImm2() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<2>(Value);
+ }
+ return false;
+ }
+ bool isUImm3() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<3>(Value);
+ }
+ return false;
+ }
+ bool isUImm6() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<6>(Value);
+ }
+ return false;
+ }
+ bool isUImm7() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<7>(Value);
+ }
+ return false;
+ }
+ bool isSImm7() {
+ if (!isImm())
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(Imm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isInt<7>(Value);
+ }
+ return false;
+ }
+ bool isMImm() const {
+ if (Kind != k_MImmOp)
+ return false;
+
+ // Constant case
+ if (const auto *ConstExpr = dyn_cast<MCConstantExpr>(MImm.Val)) {
+ int64_t Value = ConstExpr->getValue();
+ return isUInt<6>(Value);
+ }
+ return false;
+ }
+
+ StringRef getToken() const {
+ assert(Kind == k_Token && "Invalid access!");
+ return StringRef(Tok.Data, Tok.Length);
+ }
+
+ unsigned getReg() const override {
+ assert((Kind == k_Register) && "Invalid access!");
+ return Reg.RegNum;
+ }
+
+ const MCExpr *getImm() const {
+ assert((Kind == k_Immediate) && "Invalid access!");
+ return Imm.Val;
+ }
+
+ unsigned getMemBase() const {
+ assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm ||
+ Kind == k_MemoryRegImm) &&
+ "Invalid access!");
+ return Mem.Base;
+ }
+
+ unsigned getMemIndexReg() const {
+ assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryZeroRegImm) &&
+ "Invalid access!");
+ return Mem.IndexReg;
+ }
+
+ const MCExpr *getMemIndex() const {
+ assert((Kind == k_MemoryRegImmImm || Kind == k_MemoryZeroImmImm) &&
+ "Invalid access!");
+ return Mem.Index;
+ }
+
+ const MCExpr *getMemOffset() const {
+ assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm ||
+ Kind == k_MemoryZeroImmImm || Kind == k_MemoryZeroRegImm ||
+ Kind == k_MemoryRegImm || Kind == k_MemoryZeroImm) &&
+ "Invalid access!");
+ return Mem.Offset;
+ }
+
+ void setMemOffset(const MCExpr *off) {
+ assert((Kind == k_MemoryRegRegImm || Kind == k_MemoryRegImmImm ||
+ Kind == k_MemoryZeroImmImm || Kind == k_MemoryZeroRegImm ||
+ Kind == k_MemoryRegImm || Kind == k_MemoryZeroImm) &&
+ "Invalid access!");
+ Mem.Offset = off;
+ }
+
+ unsigned getCCVal() const {
+ assert((Kind == k_CCOp) && "Invalid access!");
+ return CC.CCVal;
+ }
+
+ unsigned getRDVal() const {
+ assert((Kind == k_RDOp) && "Invalid access!");
+ return RD.RDVal;
+ }
+
+ const MCExpr *getMImmVal() const {
+ assert((Kind == k_MImmOp) && "Invalid access!");
+ return MImm.Val;
+ }
+ bool getM0Flag() const {
+ assert((Kind == k_MImmOp) && "Invalid access!");
+ return MImm.M0Flag;
+ }
+
+ /// getStartLoc - Get the location of the first token of this operand.
+ SMLoc getStartLoc() const override { return StartLoc; }
+ /// getEndLoc - Get the location of the last token of this operand.
+ SMLoc getEndLoc() const override { return EndLoc; }
+
+ void print(raw_ostream &OS) const override {
+ switch (Kind) {
+ case k_Token:
+ OS << "Token: " << getToken() << "\n";
+ break;
+ case k_Register:
+ OS << "Reg: #" << getReg() << "\n";
+ break;
+ case k_Immediate:
+ OS << "Imm: " << getImm() << "\n";
+ break;
+ case k_MemoryRegRegImm:
+ assert(getMemOffset() != nullptr);
+ OS << "Mem: #" << getMemBase() << "+#" << getMemIndexReg() << "+"
+ << *getMemOffset() << "\n";
+ break;
+ case k_MemoryRegImmImm:
+ assert(getMemIndex() != nullptr && getMemOffset() != nullptr);
+ OS << "Mem: #" << getMemBase() << "+" << *getMemIndex() << "+"
+ << *getMemOffset() << "\n";
+ break;
+ case k_MemoryZeroRegImm:
+ assert(getMemOffset() != nullptr);
+ OS << "Mem: 0+#" << getMemIndexReg() << "+" << *getMemOffset() << "\n";
+ break;
+ case k_MemoryZeroImmImm:
+ assert(getMemIndex() != nullptr && getMemOffset() != nullptr);
+ OS << "Mem: 0+" << *getMemIndex() << "+" << *getMemOffset() << "\n";
+ break;
+ case k_MemoryRegImm:
+ assert(getMemOffset() != nullptr);
+ OS << "Mem: #" << getMemBase() << "+" << *getMemOffset() << "\n";
+ break;
+ case k_MemoryZeroImm:
+ assert(getMemOffset() != nullptr);
+ OS << "Mem: 0+" << *getMemOffset() << "\n";
+ break;
+ case k_CCOp:
+ OS << "CCOp: " << getCCVal() << "\n";
+ break;
+ case k_RDOp:
+ OS << "RDOp: " << getRDVal() << "\n";
+ break;
+ case k_MImmOp:
+ OS << "MImm: (" << getMImmVal() << (getM0Flag() ? ")0" : ")1") << "\n";
+ break;
+ }
+ }
+
+ void addRegOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ Inst.addOperand(MCOperand::createReg(getReg()));
+ }
+
+ void addImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const MCExpr *Expr = getImm();
+ addExpr(Inst, Expr);
+ }
+
+ void addZeroOperands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addUImm0to2Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addUImm1Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addUImm2Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addUImm3Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addUImm6Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addUImm7Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addSImm7Operands(MCInst &Inst, unsigned N) const {
+ addImmOperands(Inst, N);
+ }
+
+ void addExpr(MCInst &Inst, const MCExpr *Expr) const {
+ // Add as immediate when possible. Null MCExpr = 0.
+ if (!Expr)
+ Inst.addOperand(MCOperand::createImm(0));
+ else if (const auto *CE = dyn_cast<MCConstantExpr>(Expr))
+ Inst.addOperand(MCOperand::createImm(CE->getValue()));
+ else
+ Inst.addOperand(MCOperand::createExpr(Expr));
+ }
+
+ void addMEMrriOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createReg(getMemBase()));
+ Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+ addExpr(Inst, getMemOffset());
+ }
+
+ void addMEMriiOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createReg(getMemBase()));
+ addExpr(Inst, getMemIndex());
+ addExpr(Inst, getMemOffset());
+ }
+
+ void addMEMzriOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(0));
+ Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
+ addExpr(Inst, getMemOffset());
+ }
+
+ void addMEMziiOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 3 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(0));
+ addExpr(Inst, getMemIndex());
+ addExpr(Inst, getMemOffset());
+ }
+
+ void addMEMriOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createReg(getMemBase()));
+ addExpr(Inst, getMemOffset());
+ }
+
+ void addMEMziOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 2 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(0));
+ addExpr(Inst, getMemOffset());
+ }
+
+ void addCCOpOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(getCCVal()));
+ }
+
+ void addRDOpOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+
+ Inst.addOperand(MCOperand::createImm(getRDVal()));
+ }
+
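+  // A note on the mimm encoding as implemented below: the matcher receives
+  // the operand as a plain immediate, (m)1 is encoded as the value m (0-63)
+  // and (m)0 as m + 64 (the M0 flag), so e.g. "(22)0" becomes 86.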
+ void addMImmOperands(MCInst &Inst, unsigned N) const {
+ assert(N == 1 && "Invalid number of operands!");
+ const auto *ConstExpr = dyn_cast<MCConstantExpr>(getMImmVal());
+ assert(ConstExpr && "Null operands!");
+ int64_t Value = ConstExpr->getValue();
+ if (getM0Flag())
+ Value += 64;
+ Inst.addOperand(MCOperand::createImm(Value));
+ }
+
+ static std::unique_ptr<VEOperand> CreateToken(StringRef Str, SMLoc S) {
+ auto Op = std::make_unique<VEOperand>(k_Token);
+ Op->Tok.Data = Str.data();
+ Op->Tok.Length = Str.size();
+ Op->StartLoc = S;
+ Op->EndLoc = S;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand> CreateReg(unsigned RegNum, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<VEOperand>(k_Register);
+ Op->Reg.RegNum = RegNum;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand> CreateImm(const MCExpr *Val, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<VEOperand>(k_Immediate);
+ Op->Imm.Val = Val;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand> CreateCCOp(unsigned CCVal, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<VEOperand>(k_CCOp);
+ Op->CC.CCVal = CCVal;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand> CreateRDOp(unsigned RDVal, SMLoc S,
+ SMLoc E) {
+ auto Op = std::make_unique<VEOperand>(k_RDOp);
+ Op->RD.RDVal = RDVal;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand> CreateMImm(const MCExpr *Val, bool Flag,
+ SMLoc S, SMLoc E) {
+ auto Op = std::make_unique<VEOperand>(k_MImmOp);
+ Op->MImm.Val = Val;
+ Op->MImm.M0Flag = Flag;
+ Op->StartLoc = S;
+ Op->EndLoc = E;
+ return Op;
+ }
+
+ static bool MorphToI32Reg(VEOperand &Op) {
+ unsigned Reg = Op.getReg();
+ unsigned regIdx = Reg - VE::SX0;
+ if (regIdx > 63)
+ return false;
+ Op.Reg.RegNum = I32Regs[regIdx];
+ return true;
+ }
+
+ static bool MorphToF32Reg(VEOperand &Op) {
+ unsigned Reg = Op.getReg();
+ unsigned regIdx = Reg - VE::SX0;
+ if (regIdx > 63)
+ return false;
+ Op.Reg.RegNum = F32Regs[regIdx];
+ return true;
+ }
+
+ static bool MorphToF128Reg(VEOperand &Op) {
+ unsigned Reg = Op.getReg();
+ unsigned regIdx = Reg - VE::SX0;
+ if (regIdx % 2 || regIdx > 63)
+ return false;
+ Op.Reg.RegNum = F128Regs[regIdx / 2];
+ return true;
+ }
+
+ static bool MorphToMISCReg(VEOperand &Op) {
+ const auto *ConstExpr = dyn_cast<MCConstantExpr>(Op.getImm());
+ if (!ConstExpr)
+ return false;
+ unsigned regIdx = ConstExpr->getValue();
+    // MISCRegs has 31 entries (indices 0-30); reject anything larger.
+    if (regIdx > 30 || MISCRegs[regIdx] == VE::NoRegister)
+ return false;
+ Op.Kind = k_Register;
+ Op.Reg.RegNum = MISCRegs[regIdx];
+ return true;
+ }
+
+ static std::unique_ptr<VEOperand>
+ MorphToMEMri(unsigned Base, std::unique_ptr<VEOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryRegImm;
+ Op->Mem.Base = Base;
+ Op->Mem.IndexReg = 0;
+ Op->Mem.Index = nullptr;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand>
+ MorphToMEMzi(std::unique_ptr<VEOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryZeroImm;
+ Op->Mem.Base = 0;
+ Op->Mem.IndexReg = 0;
+ Op->Mem.Index = nullptr;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand>
+ MorphToMEMrri(unsigned Base, unsigned Index, std::unique_ptr<VEOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryRegRegImm;
+ Op->Mem.Base = Base;
+ Op->Mem.IndexReg = Index;
+ Op->Mem.Index = nullptr;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand>
+ MorphToMEMrii(unsigned Base, const MCExpr *Index,
+ std::unique_ptr<VEOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryRegImmImm;
+ Op->Mem.Base = Base;
+ Op->Mem.IndexReg = 0;
+ Op->Mem.Index = Index;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand>
+ MorphToMEMzri(unsigned Index, std::unique_ptr<VEOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryZeroRegImm;
+ Op->Mem.Base = 0;
+ Op->Mem.IndexReg = Index;
+ Op->Mem.Index = nullptr;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+
+ static std::unique_ptr<VEOperand>
+ MorphToMEMzii(const MCExpr *Index, std::unique_ptr<VEOperand> Op) {
+ const MCExpr *Imm = Op->getImm();
+ Op->Kind = k_MemoryZeroImmImm;
+ Op->Mem.Base = 0;
+ Op->Mem.IndexReg = 0;
+ Op->Mem.Index = Index;
+ Op->Mem.Offset = Imm;
+ return Op;
+ }
+};
+
+} // end anonymous namespace
+
+bool VEAsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
+ OperandVector &Operands,
+ MCStreamer &Out, uint64_t &ErrorInfo,
+ bool MatchingInlineAsm) {
+ MCInst Inst;
+ unsigned MatchResult =
+ MatchInstructionImpl(Operands, Inst, ErrorInfo, MatchingInlineAsm);
+ switch (MatchResult) {
+ case Match_Success:
+ Inst.setLoc(IDLoc);
+ Out.emitInstruction(Inst, getSTI());
+ return false;
+
+ case Match_MissingFeature:
+ return Error(IDLoc,
+ "instruction requires a CPU feature not currently enabled");
+
+ case Match_InvalidOperand: {
+ SMLoc ErrorLoc = IDLoc;
+ if (ErrorInfo != ~0ULL) {
+ if (ErrorInfo >= Operands.size())
+ return Error(IDLoc, "too few operands for instruction");
+
+ ErrorLoc = ((VEOperand &)*Operands[ErrorInfo]).getStartLoc();
+ if (ErrorLoc == SMLoc())
+ ErrorLoc = IDLoc;
+ }
+
+ return Error(ErrorLoc, "invalid operand for instruction");
+ }
+ case Match_MnemonicFail:
+ return Error(IDLoc, "invalid instruction mnemonic");
+ }
+ llvm_unreachable("Implement any new match types added!");
+}
+
+bool VEAsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ if (tryParseRegister(RegNo, StartLoc, EndLoc) != MatchOperand_Success)
+ return Error(StartLoc, "invalid register name");
+ return false;
+}
+
+/// Parses a register name using a given matching function.
+/// Checks for lowercase or uppercase if necessary.
+int VEAsmParser::parseRegisterName(unsigned (*matchFn)(StringRef)) {
+ StringRef Name = Parser.getTok().getString();
+
+ int RegNum = matchFn(Name);
+
+  // GCC supports case-insensitive register names. All of the VE register
+  // names are lower case.
+ if (RegNum == VE::NoRegister) {
+ RegNum = matchFn(Name.lower());
+ }
+
+ return RegNum;
+}
+
+/// Maps from the set of all register names to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterName(StringRef Name);
+
+/// Maps from the set of all alternative register names to a register number.
+/// \note Generated by TableGen.
+static unsigned MatchRegisterAltName(StringRef Name);
+
+OperandMatchResultTy
+VEAsmParser::tryParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) {
+ const AsmToken Tok = Parser.getTok();
+ StartLoc = Tok.getLoc();
+ EndLoc = Tok.getEndLoc();
+ RegNo = 0;
+ if (getLexer().getKind() != AsmToken::Percent)
+ return MatchOperand_NoMatch;
+ Parser.Lex();
+
+ RegNo = parseRegisterName(&MatchRegisterName);
+ if (RegNo == VE::NoRegister)
+ RegNo = parseRegisterName(&MatchRegisterAltName);
+
+ if (RegNo != VE::NoRegister) {
+ Parser.Lex();
+ return MatchOperand_Success;
+ }
+
+ getLexer().UnLex(Tok);
+ return MatchOperand_NoMatch;
+}
+
+static StringRef parseCC(StringRef Name, unsigned Prefix, unsigned Suffix,
+ bool IntegerCC, bool OmitCC, SMLoc NameLoc,
+ OperandVector *Operands) {
+ // Parse instructions with a conditional code. For example, 'bne' is
+ // converted into two operands 'b' and 'ne'.
+ StringRef Cond = Name.slice(Prefix, Suffix);
+ VECC::CondCode CondCode =
+ IntegerCC ? stringToVEICondCode(Cond) : stringToVEFCondCode(Cond);
+
+  // If OmitCC is enabled, CC_AT and CC_AF are treated as part of the mnemonic.
+ if (CondCode != VECC::UNKNOWN &&
+ (!OmitCC || (CondCode != VECC::CC_AT && CondCode != VECC::CC_AF))) {
+ StringRef SuffixStr = Name.substr(Suffix);
+ // Push "b".
+ Name = Name.slice(0, Prefix);
+ Operands->push_back(VEOperand::CreateToken(Name, NameLoc));
+ // Push $cond part.
+ SMLoc CondLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Prefix);
+ SMLoc SuffixLoc = SMLoc::getFromPointer(NameLoc.getPointer() + Suffix);
+ Operands->push_back(VEOperand::CreateCCOp(CondCode, CondLoc, SuffixLoc));
+    // Push the suffix, e.g. ".l.t".
+ if (!SuffixStr.empty())
+ Operands->push_back(VEOperand::CreateToken(SuffixStr, SuffixLoc));
+ } else {
+ Operands->push_back(VEOperand::CreateToken(Name, NameLoc));
+ }
+ return Name;
+}
+
+static StringRef parseRD(StringRef Name, unsigned Prefix, SMLoc NameLoc,
+ OperandVector *Operands) {
+  // Parse instructions with a rounding mode. For example, 'cvt.w.d.sx.rz'
+  // is split into two operands 'cvt.w.d.sx' and '.rz'.
+ StringRef RD = Name.substr(Prefix);
+ VERD::RoundingMode RoundingMode = stringToVERD(RD);
+
+ if (RoundingMode != VERD::UNKNOWN) {
+ Name = Name.slice(0, Prefix);
+    // Push the base mnemonic, e.g. `cvt.w.d.sx`.
+ Operands->push_back(VEOperand::CreateToken(Name, NameLoc));
+ SMLoc SuffixLoc =
+ SMLoc::getFromPointer(NameLoc.getPointer() + (RD.data() - Name.data()));
+ SMLoc SuffixEnd =
+ SMLoc::getFromPointer(NameLoc.getPointer() + (RD.end() - Name.data()));
+    // Push the $round operand since the mnemonic carries a rounding mode.
+ Operands->push_back(
+ VEOperand::CreateRDOp(RoundingMode, SuffixLoc, SuffixEnd));
+ } else {
+ Operands->push_back(VEOperand::CreateToken(Name, NameLoc));
+ }
+ return Name;
+}
+
+// Split the mnemonic into the base mnemonic token plus, when present, a
+// condition-code or rounding-mode operand and the remaining suffix.
+StringRef VEAsmParser::splitMnemonic(StringRef Name, SMLoc NameLoc,
+ OperandVector *Operands) {
+ // Create the leading tokens for the mnemonic
+ StringRef Mnemonic = Name;
+
+ if (Name[0] == 'b') {
+ // Match b?? or br??.
+ size_t Start = 1;
+ size_t Next = Name.find('.');
+ // Adjust position of CondCode.
+ if (Name.size() > 1 && Name[1] == 'r')
+ Start = 2;
+ // Check suffix.
+ bool ICC = true;
+ if (Next + 1 < Name.size() &&
+ (Name[Next + 1] == 'd' || Name[Next + 1] == 's'))
+ ICC = false;
+ Mnemonic = parseCC(Name, Start, Next, ICC, true, NameLoc, Operands);
+ } else if (Name.startswith("cmov.l.") || Name.startswith("cmov.w.") ||
+ Name.startswith("cmov.d.") || Name.startswith("cmov.s.")) {
+ bool ICC = Name[5] == 'l' || Name[5] == 'w';
+ Mnemonic = parseCC(Name, 7, Name.size(), ICC, false, NameLoc, Operands);
+ } else if (Name.startswith("cvt.w.d.sx") || Name.startswith("cvt.w.d.zx") ||
+ Name.startswith("cvt.w.s.sx") || Name.startswith("cvt.w.s.zx")) {
+ Mnemonic = parseRD(Name, 10, NameLoc, Operands);
+ } else if (Name.startswith("cvt.l.d")) {
+ Mnemonic = parseRD(Name, 7, NameLoc, Operands);
+ } else {
+ Operands->push_back(VEOperand::CreateToken(Mnemonic, NameLoc));
+ }
+
+ return Mnemonic;
+}
+
+static void applyMnemonicAliases(StringRef &Mnemonic,
+ const FeatureBitset &Features,
+ unsigned VariantID);
+
+bool VEAsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
+ SMLoc NameLoc, OperandVector &Operands) {
+ // If the target architecture uses MnemonicAlias, call it here to parse
+ // operands correctly.
+ applyMnemonicAliases(Name, getAvailableFeatures(), 0);
+
+  // Split the name into the first token and the rest, e.g. "bgt.l.t" into
+  // "b", "gt", and ".l.t". We treat "b" as the mnemonic, "gt" as the first
+  // operand, and ".l.t" as the second operand.
+ StringRef Mnemonic = splitMnemonic(Name, NameLoc, &Operands);
+
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ // Read the first operand.
+ if (parseOperand(Operands, Mnemonic) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+
+ while (getLexer().is(AsmToken::Comma)) {
+ Parser.Lex(); // Eat the comma.
+ // Parse and remember the operand.
+ if (parseOperand(Operands, Mnemonic) != MatchOperand_Success) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+ }
+ }
+ if (getLexer().isNot(AsmToken::EndOfStatement)) {
+ SMLoc Loc = getLexer().getLoc();
+ return Error(Loc, "unexpected token");
+ }
+ Parser.Lex(); // Consume the EndOfStatement.
+ return false;
+}
+
+bool VEAsmParser::ParseDirective(AsmToken DirectiveID) {
+  // Let the MC layer handle other directives.
+ return true;
+}
+
+/// Extract \code @lo32/@hi32/etc \endcode modifier from expression.
+/// Recursively scan the expression and check for VK_VE_HI32/LO32/etc
+/// symbol variants. If all symbols with modifier use the same
+/// variant, return the corresponding VEMCExpr::VariantKind,
+/// and a modified expression using the default symbol variant.
+/// Otherwise, return NULL.
+const MCExpr *
+VEAsmParser::extractModifierFromExpr(const MCExpr *E,
+ VEMCExpr::VariantKind &Variant) {
+ MCContext &Context = getParser().getContext();
+ Variant = VEMCExpr::VK_VE_None;
+
+ switch (E->getKind()) {
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ return nullptr;
+
+ case MCExpr::SymbolRef: {
+ const MCSymbolRefExpr *SRE = cast<MCSymbolRefExpr>(E);
+
+ switch (SRE->getKind()) {
+ case MCSymbolRefExpr::VK_None:
+      // Use VK_VE_REFLONG for a symbol without modifiers.
+ Variant = VEMCExpr::VK_VE_REFLONG;
+ break;
+ case MCSymbolRefExpr::VK_VE_HI32:
+ Variant = VEMCExpr::VK_VE_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_LO32:
+ Variant = VEMCExpr::VK_VE_LO32;
+ break;
+ case MCSymbolRefExpr::VK_VE_PC_HI32:
+ Variant = VEMCExpr::VK_VE_PC_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_PC_LO32:
+ Variant = VEMCExpr::VK_VE_PC_LO32;
+ break;
+ case MCSymbolRefExpr::VK_VE_GOT_HI32:
+ Variant = VEMCExpr::VK_VE_GOT_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_GOT_LO32:
+ Variant = VEMCExpr::VK_VE_GOT_LO32;
+ break;
+ case MCSymbolRefExpr::VK_VE_GOTOFF_HI32:
+ Variant = VEMCExpr::VK_VE_GOTOFF_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_GOTOFF_LO32:
+ Variant = VEMCExpr::VK_VE_GOTOFF_LO32;
+ break;
+ case MCSymbolRefExpr::VK_VE_PLT_HI32:
+ Variant = VEMCExpr::VK_VE_PLT_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_PLT_LO32:
+ Variant = VEMCExpr::VK_VE_PLT_LO32;
+ break;
+ case MCSymbolRefExpr::VK_VE_TLS_GD_HI32:
+ Variant = VEMCExpr::VK_VE_TLS_GD_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_TLS_GD_LO32:
+ Variant = VEMCExpr::VK_VE_TLS_GD_LO32;
+ break;
+ case MCSymbolRefExpr::VK_VE_TPOFF_HI32:
+ Variant = VEMCExpr::VK_VE_TPOFF_HI32;
+ break;
+ case MCSymbolRefExpr::VK_VE_TPOFF_LO32:
+ Variant = VEMCExpr::VK_VE_TPOFF_LO32;
+ break;
+ default:
+ return nullptr;
+ }
+
+ return MCSymbolRefExpr::create(&SRE->getSymbol(), Context);
+ }
+
+ case MCExpr::Unary: {
+ const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+ const MCExpr *Sub = extractModifierFromExpr(UE->getSubExpr(), Variant);
+ if (!Sub)
+ return nullptr;
+ return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
+ }
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+ VEMCExpr::VariantKind LHSVariant, RHSVariant;
+ const MCExpr *LHS = extractModifierFromExpr(BE->getLHS(), LHSVariant);
+ const MCExpr *RHS = extractModifierFromExpr(BE->getRHS(), RHSVariant);
+
+ if (!LHS && !RHS)
+ return nullptr;
+
+ if (!LHS)
+ LHS = BE->getLHS();
+ if (!RHS)
+ RHS = BE->getRHS();
+
+ if (LHSVariant == VEMCExpr::VK_VE_None)
+ Variant = RHSVariant;
+ else if (RHSVariant == VEMCExpr::VK_VE_None)
+ Variant = LHSVariant;
+ else if (LHSVariant == RHSVariant)
+ Variant = LHSVariant;
+ else
+ return nullptr;
+
+ return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
+ }
+ }
+
+ llvm_unreachable("Invalid expression kind!");
+}
+
+const MCExpr *VEAsmParser::fixupVariantKind(const MCExpr *E) {
+ MCContext &Context = getParser().getContext();
+
+ switch (E->getKind()) {
+ case MCExpr::Target:
+ case MCExpr::Constant:
+ case MCExpr::SymbolRef:
+ return E;
+
+ case MCExpr::Unary: {
+ const MCUnaryExpr *UE = cast<MCUnaryExpr>(E);
+ const MCExpr *Sub = fixupVariantKind(UE->getSubExpr());
+ if (Sub == UE->getSubExpr())
+ return E;
+ return MCUnaryExpr::create(UE->getOpcode(), Sub, Context);
+ }
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(E);
+ const MCExpr *LHS = fixupVariantKind(BE->getLHS());
+ const MCExpr *RHS = fixupVariantKind(BE->getRHS());
+ if (LHS == BE->getLHS() && RHS == BE->getRHS())
+ return E;
+ return MCBinaryExpr::create(BE->getOpcode(), LHS, RHS, Context);
+ }
+ }
+
+ llvm_unreachable("Invalid expression kind!");
+}
+
+/// ParseExpression. This differs from the default "parseExpression" in that
+/// it handles modifiers.
+bool VEAsmParser::parseExpression(const MCExpr *&EVal) {
+ // Handle \code symbol @lo32/@hi32/etc \endcode.
+ if (getParser().parseExpression(EVal))
+ return true;
+
+  // Convert MCSymbolRefExprs with VK_* modifiers into a VEMCExpr with VK_VE_*.
+ EVal = fixupVariantKind(EVal);
+ VEMCExpr::VariantKind Variant;
+ const MCExpr *E = extractModifierFromExpr(EVal, Variant);
+ if (E)
+ EVal = VEMCExpr::create(Variant, E, getParser().getContext());
+
+ return false;
+}
+
+OperandMatchResultTy VEAsmParser::parseMEMOperand(OperandVector &Operands) {
+ LLVM_DEBUG(dbgs() << "parseMEMOperand\n");
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ SMLoc E = Tok.getEndLoc();
+ // Parse ASX format
+ // disp
+ // disp(, base)
+ // disp(index)
+ // disp(index, base)
+ // (, base)
+ // (index)
+ // (index, base)
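+  //
+  // For example (assuming standard VE assembly syntax), "8(%s1, %s2)" is
+  // parsed as disp=8, index=%s1, base=%s2.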
+
+ std::unique_ptr<VEOperand> Offset;
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ case AsmToken::Identifier: {
+ const MCExpr *EVal;
+ if (!parseExpression(EVal))
+ Offset = VEOperand::CreateImm(EVal, S, E);
+ else
+ return MatchOperand_NoMatch;
+ break;
+ }
+
+ case AsmToken::LParen:
+ // empty disp (= 0)
+ Offset =
+ VEOperand::CreateImm(MCConstantExpr::create(0, getContext()), S, E);
+ break;
+ }
+
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_ParseFail;
+
+ case AsmToken::EndOfStatement:
+ Operands.push_back(VEOperand::MorphToMEMzii(
+ MCConstantExpr::create(0, getContext()), std::move(Offset)));
+ return MatchOperand_Success;
+
+ case AsmToken::LParen:
+ Parser.Lex(); // Eat the (
+ break;
+ }
+
+ const MCExpr *IndexValue = nullptr;
+ unsigned IndexReg = 0;
+
+ switch (getLexer().getKind()) {
+ default:
+ if (ParseRegister(IndexReg, S, E))
+ return MatchOperand_ParseFail;
+ break;
+
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ if (getParser().parseExpression(IndexValue, E))
+ return MatchOperand_ParseFail;
+ break;
+
+ case AsmToken::Comma:
+ // empty index
+ IndexValue = MCConstantExpr::create(0, getContext());
+ break;
+ }
+
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_ParseFail;
+
+ case AsmToken::RParen:
+ Parser.Lex(); // Eat the )
+ Operands.push_back(
+ IndexValue ? VEOperand::MorphToMEMzii(IndexValue, std::move(Offset))
+ : VEOperand::MorphToMEMzri(IndexReg, std::move(Offset)));
+ return MatchOperand_Success;
+
+ case AsmToken::Comma:
+ Parser.Lex(); // Eat the ,
+ break;
+ }
+
+ unsigned BaseReg = 0;
+ if (ParseRegister(BaseReg, S, E))
+ return MatchOperand_ParseFail;
+
+ if (!Parser.getTok().is(AsmToken::RParen))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex(); // Eat the )
+ Operands.push_back(
+ IndexValue
+ ? VEOperand::MorphToMEMrii(BaseReg, IndexValue, std::move(Offset))
+ : VEOperand::MorphToMEMrri(BaseReg, IndexReg, std::move(Offset)));
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy VEAsmParser::parseMEMAsOperand(OperandVector &Operands) {
+ LLVM_DEBUG(dbgs() << "parseMEMAsOperand\n");
+ const AsmToken &Tok = Parser.getTok();
+ SMLoc S = Tok.getLoc();
+ SMLoc E = Tok.getEndLoc();
+ // Parse AS format
+ // disp
+ // disp(, base)
+ // disp(base)
+ // disp()
+ // (, base)
+ // (base)
+ // base
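+  //
+  // For example (assuming standard VE assembly syntax), "8(%s1)" is parsed
+  // as disp=8, base=%s1, and a bare "8" as disp=8 with no base.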
+
+ unsigned BaseReg = VE::NoRegister;
+ std::unique_ptr<VEOperand> Offset;
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_NoMatch;
+
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ case AsmToken::Identifier: {
+ const MCExpr *EVal;
+ if (!parseExpression(EVal))
+ Offset = VEOperand::CreateImm(EVal, S, E);
+ else
+ return MatchOperand_NoMatch;
+ break;
+ }
+
+ case AsmToken::Percent:
+ if (ParseRegister(BaseReg, S, E))
+ return MatchOperand_NoMatch;
+ Offset =
+ VEOperand::CreateImm(MCConstantExpr::create(0, getContext()), S, E);
+ break;
+
+ case AsmToken::LParen:
+ // empty disp (= 0)
+ Offset =
+ VEOperand::CreateImm(MCConstantExpr::create(0, getContext()), S, E);
+ break;
+ }
+
+ switch (getLexer().getKind()) {
+ default:
+ return MatchOperand_ParseFail;
+
+ case AsmToken::EndOfStatement:
+ case AsmToken::Comma:
+ Operands.push_back(BaseReg != VE::NoRegister
+ ? VEOperand::MorphToMEMri(BaseReg, std::move(Offset))
+ : VEOperand::MorphToMEMzi(std::move(Offset)));
+ return MatchOperand_Success;
+
+ case AsmToken::LParen:
+ if (BaseReg != VE::NoRegister)
+ return MatchOperand_ParseFail;
+ Parser.Lex(); // Eat the (
+ break;
+ }
+
+ switch (getLexer().getKind()) {
+ default:
+ if (ParseRegister(BaseReg, S, E))
+ return MatchOperand_ParseFail;
+ break;
+
+ case AsmToken::Comma:
+ Parser.Lex(); // Eat the ,
+ if (ParseRegister(BaseReg, S, E))
+ return MatchOperand_ParseFail;
+ break;
+
+ case AsmToken::RParen:
+ break;
+ }
+
+ if (!Parser.getTok().is(AsmToken::RParen))
+ return MatchOperand_ParseFail;
+
+ Parser.Lex(); // Eat the )
+ Operands.push_back(BaseReg != VE::NoRegister
+ ? VEOperand::MorphToMEMri(BaseReg, std::move(Offset))
+ : VEOperand::MorphToMEMzi(std::move(Offset)));
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy VEAsmParser::parseMImmOperand(OperandVector &Operands) {
+ LLVM_DEBUG(dbgs() << "parseMImmOperand\n");
+
+ // Parsing "(" + number + ")0/1"
+ const AsmToken Tok1 = Parser.getTok();
+ if (!Tok1.is(AsmToken::LParen))
+ return MatchOperand_NoMatch;
+
+ Parser.Lex(); // Eat the '('.
+
+ const AsmToken Tok2 = Parser.getTok();
+ SMLoc E;
+ const MCExpr *EVal;
+ if (!Tok2.is(AsmToken::Integer) || getParser().parseExpression(EVal, E)) {
+ getLexer().UnLex(Tok1);
+ return MatchOperand_NoMatch;
+ }
+
+ const AsmToken Tok3 = Parser.getTok();
+ if (!Tok3.is(AsmToken::RParen)) {
+ getLexer().UnLex(Tok2);
+ getLexer().UnLex(Tok1);
+ return MatchOperand_NoMatch;
+ }
+ Parser.Lex(); // Eat the ')'.
+
+ const AsmToken &Tok4 = Parser.getTok();
+ StringRef Suffix = Tok4.getString();
+ if (Suffix != "1" && Suffix != "0") {
+ getLexer().UnLex(Tok3);
+ getLexer().UnLex(Tok2);
+ getLexer().UnLex(Tok1);
+ return MatchOperand_NoMatch;
+ }
+ Parser.Lex(); // Eat the value.
+ SMLoc EndLoc = SMLoc::getFromPointer(Suffix.end());
+ Operands.push_back(
+ VEOperand::CreateMImm(EVal, Suffix == "0", Tok1.getLoc(), EndLoc));
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy VEAsmParser::parseOperand(OperandVector &Operands,
+ StringRef Mnemonic) {
+ LLVM_DEBUG(dbgs() << "parseOperand\n");
+ OperandMatchResultTy ResTy = MatchOperandParserImpl(Operands, Mnemonic);
+
+ // If there wasn't a custom match, try the generic matcher below. Otherwise,
+ // there was a match, but an error occurred, in which case, just return that
+ // the operand parsing failed.
+ if (ResTy == MatchOperand_Success || ResTy == MatchOperand_ParseFail)
+ return ResTy;
+
+ switch (getLexer().getKind()) {
+ case AsmToken::LParen:
+ // FIXME: Parsing "(" + %vreg + ", " + %vreg + ")"
+ // FALLTHROUGH
+ default: {
+ std::unique_ptr<VEOperand> Op;
+ ResTy = parseVEAsmOperand(Op);
+ if (ResTy != MatchOperand_Success || !Op)
+ return MatchOperand_ParseFail;
+
+ // Push the parsed operand into the list of operands
+ Operands.push_back(std::move(Op));
+
+ if (!Parser.getTok().is(AsmToken::LParen))
+ break;
+
+    // FIXME: Parsing %vec-reg + "(" + %scalar-reg/number + ")"
+ break;
+ }
+ }
+
+ return MatchOperand_Success;
+}
+
+OperandMatchResultTy
+VEAsmParser::parseVEAsmOperand(std::unique_ptr<VEOperand> &Op) {
+ LLVM_DEBUG(dbgs() << "parseVEAsmOperand\n");
+ SMLoc S = Parser.getTok().getLoc();
+ SMLoc E = SMLoc::getFromPointer(Parser.getTok().getLoc().getPointer() - 1);
+ const MCExpr *EVal;
+
+ Op = nullptr;
+ switch (getLexer().getKind()) {
+ default:
+ break;
+
+ case AsmToken::Percent:
+ unsigned RegNo;
+ if (tryParseRegister(RegNo, S, E) == MatchOperand_Success)
+ Op = VEOperand::CreateReg(RegNo, S, E);
+ break;
+
+ case AsmToken::Minus:
+ case AsmToken::Integer:
+ case AsmToken::Dot:
+ case AsmToken::Identifier:
+ if (!parseExpression(EVal))
+ Op = VEOperand::CreateImm(EVal, S, E);
+ break;
+ }
+ return (Op) ? MatchOperand_Success : MatchOperand_ParseFail;
+}
+
+// Force static initialization.
+extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeVEAsmParser() {
+ RegisterMCAsmParser<VEAsmParser> A(getTheVETarget());
+}
+
+#define GET_REGISTER_MATCHER
+#define GET_MATCHER_IMPLEMENTATION
+#include "VEGenAsmMatcher.inc"
+
+unsigned VEAsmParser::validateTargetOperandClass(MCParsedAsmOperand &GOp,
+ unsigned Kind) {
+ VEOperand &Op = (VEOperand &)GOp;
+
+  // VE uses the same register names for different register classes; for
+  // example, both F32 and I32 refer to "%s23". Morph the parsed register to
+  // the class the matcher expects before validation.
+ switch (Kind) {
+ default:
+ break;
+ case MCK_F32:
+ if (Op.isReg() && VEOperand::MorphToF32Reg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
+ case MCK_I32:
+ if (Op.isReg() && VEOperand::MorphToI32Reg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
+ case MCK_F128:
+ if (Op.isReg() && VEOperand::MorphToF128Reg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
+ case MCK_MISC:
+ if (Op.isImm() && VEOperand::MorphToMISCReg(Op))
+ return MCTargetAsmParser::Match_Success;
+ break;
+ }
+ return Match_InvalidOperand;
+}
diff --git a/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
new file mode 100644
index 000000000000..35885a4e3cae
--- /dev/null
+++ b/llvm/lib/Target/VE/Disassembler/VEDisassembler.cpp
@@ -0,0 +1,560 @@
+//===- VEDisassembler.cpp - Disassembler for VE -----------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file is part of the VE Disassembler.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEMCTargetDesc.h"
+#include "TargetInfo/VETargetInfo.h"
+#include "VE.h"
+#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCDisassembler/MCDisassembler.h"
+#include "llvm/MC/MCFixedLenDisassembler.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ve-disassembler"
+
+typedef MCDisassembler::DecodeStatus DecodeStatus;
+
+namespace {
+
+/// A disassembler class for VE.
+class VEDisassembler : public MCDisassembler {
+public:
+ VEDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx)
+ : MCDisassembler(STI, Ctx) {}
+ virtual ~VEDisassembler() {}
+
+ DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes, uint64_t Address,
+ raw_ostream &CStream) const override;
+};
+} // namespace
+
+static MCDisassembler *createVEDisassembler(const Target &T,
+ const MCSubtargetInfo &STI,
+ MCContext &Ctx) {
+ return new VEDisassembler(STI, Ctx);
+}
+
+extern "C" void LLVMInitializeVEDisassembler() {
+ // Register the disassembler.
+ TargetRegistry::RegisterMCDisassembler(getTheVETarget(),
+ createVEDisassembler);
+}
+
+static const unsigned I32RegDecoderTable[] = {
+ VE::SW0, VE::SW1, VE::SW2, VE::SW3, VE::SW4, VE::SW5, VE::SW6,
+ VE::SW7, VE::SW8, VE::SW9, VE::SW10, VE::SW11, VE::SW12, VE::SW13,
+ VE::SW14, VE::SW15, VE::SW16, VE::SW17, VE::SW18, VE::SW19, VE::SW20,
+ VE::SW21, VE::SW22, VE::SW23, VE::SW24, VE::SW25, VE::SW26, VE::SW27,
+ VE::SW28, VE::SW29, VE::SW30, VE::SW31, VE::SW32, VE::SW33, VE::SW34,
+ VE::SW35, VE::SW36, VE::SW37, VE::SW38, VE::SW39, VE::SW40, VE::SW41,
+ VE::SW42, VE::SW43, VE::SW44, VE::SW45, VE::SW46, VE::SW47, VE::SW48,
+ VE::SW49, VE::SW50, VE::SW51, VE::SW52, VE::SW53, VE::SW54, VE::SW55,
+ VE::SW56, VE::SW57, VE::SW58, VE::SW59, VE::SW60, VE::SW61, VE::SW62,
+ VE::SW63};
+
+static const unsigned I64RegDecoderTable[] = {
+ VE::SX0, VE::SX1, VE::SX2, VE::SX3, VE::SX4, VE::SX5, VE::SX6,
+ VE::SX7, VE::SX8, VE::SX9, VE::SX10, VE::SX11, VE::SX12, VE::SX13,
+ VE::SX14, VE::SX15, VE::SX16, VE::SX17, VE::SX18, VE::SX19, VE::SX20,
+ VE::SX21, VE::SX22, VE::SX23, VE::SX24, VE::SX25, VE::SX26, VE::SX27,
+ VE::SX28, VE::SX29, VE::SX30, VE::SX31, VE::SX32, VE::SX33, VE::SX34,
+ VE::SX35, VE::SX36, VE::SX37, VE::SX38, VE::SX39, VE::SX40, VE::SX41,
+ VE::SX42, VE::SX43, VE::SX44, VE::SX45, VE::SX46, VE::SX47, VE::SX48,
+ VE::SX49, VE::SX50, VE::SX51, VE::SX52, VE::SX53, VE::SX54, VE::SX55,
+ VE::SX56, VE::SX57, VE::SX58, VE::SX59, VE::SX60, VE::SX61, VE::SX62,
+ VE::SX63};
+
+static const unsigned F32RegDecoderTable[] = {
+ VE::SF0, VE::SF1, VE::SF2, VE::SF3, VE::SF4, VE::SF5, VE::SF6,
+ VE::SF7, VE::SF8, VE::SF9, VE::SF10, VE::SF11, VE::SF12, VE::SF13,
+ VE::SF14, VE::SF15, VE::SF16, VE::SF17, VE::SF18, VE::SF19, VE::SF20,
+ VE::SF21, VE::SF22, VE::SF23, VE::SF24, VE::SF25, VE::SF26, VE::SF27,
+ VE::SF28, VE::SF29, VE::SF30, VE::SF31, VE::SF32, VE::SF33, VE::SF34,
+ VE::SF35, VE::SF36, VE::SF37, VE::SF38, VE::SF39, VE::SF40, VE::SF41,
+ VE::SF42, VE::SF43, VE::SF44, VE::SF45, VE::SF46, VE::SF47, VE::SF48,
+ VE::SF49, VE::SF50, VE::SF51, VE::SF52, VE::SF53, VE::SF54, VE::SF55,
+ VE::SF56, VE::SF57, VE::SF58, VE::SF59, VE::SF60, VE::SF61, VE::SF62,
+ VE::SF63};
+
+static const unsigned F128RegDecoderTable[] = {
+ VE::Q0, VE::Q1, VE::Q2, VE::Q3, VE::Q4, VE::Q5, VE::Q6, VE::Q7,
+ VE::Q8, VE::Q9, VE::Q10, VE::Q11, VE::Q12, VE::Q13, VE::Q14, VE::Q15,
+ VE::Q16, VE::Q17, VE::Q18, VE::Q19, VE::Q20, VE::Q21, VE::Q22, VE::Q23,
+ VE::Q24, VE::Q25, VE::Q26, VE::Q27, VE::Q28, VE::Q29, VE::Q30, VE::Q31};
+
+static const unsigned MiscRegDecoderTable[] = {
+ VE::USRCC, VE::PSW, VE::SAR, VE::NoRegister,
+ VE::NoRegister, VE::NoRegister, VE::NoRegister, VE::PMMR,
+ VE::PMCR0, VE::PMCR1, VE::PMCR2, VE::PMCR3,
+ VE::NoRegister, VE::NoRegister, VE::NoRegister, VE::NoRegister,
+ VE::PMC0, VE::PMC1, VE::PMC2, VE::PMC3,
+ VE::PMC4, VE::PMC5, VE::PMC6, VE::PMC7,
+ VE::PMC8, VE::PMC9, VE::PMC10, VE::PMC11,
+ VE::PMC12, VE::PMC13, VE::PMC14};
+
+static DecodeStatus DecodeI32RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 63)
+ return MCDisassembler::Fail;
+ unsigned Reg = I32RegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeI64RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 63)
+ return MCDisassembler::Fail;
+ unsigned Reg = I64RegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeF32RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 63)
+ return MCDisassembler::Fail;
+ unsigned Reg = F32RegDecoderTable[RegNo];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeF128RegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo % 2 || RegNo > 63)
+ return MCDisassembler::Fail;
+ unsigned Reg = F128RegDecoderTable[RegNo / 2];
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMISCRegisterClass(MCInst &Inst, unsigned RegNo,
+ uint64_t Address,
+ const void *Decoder) {
+ if (RegNo > 30)
+ return MCDisassembler::Fail;
+ unsigned Reg = MiscRegDecoderTable[RegNo];
+ if (Reg == VE::NoRegister)
+ return MCDisassembler::Fail;
+ Inst.addOperand(MCOperand::createReg(Reg));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeASX(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeStoreI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTS1AMI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeTS1AMI32(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeCASI64(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeCASI32(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSIMM7(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeSIMM32(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeCCOperand(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeRDOperand(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder);
+static DecodeStatus DecodeBranchCondition(MCInst &Inst, uint64_t insn,
+ uint64_t Address,
+ const void *Decoder);
+static DecodeStatus DecodeBranchConditionAlways(MCInst &Inst, uint64_t insn,
+ uint64_t Address,
+ const void *Decoder);
+
+#include "VEGenDisassemblerTables.inc"
+
+/// Read eight bytes from the ArrayRef and return the 64-bit word.
+static DecodeStatus readInstruction64(ArrayRef<uint8_t> Bytes, uint64_t Address,
+ uint64_t &Size, uint64_t &Insn,
+ bool IsLittleEndian) {
+ // We want to read exactly 8 Bytes of data.
+ if (Bytes.size() < 8) {
+ Size = 0;
+ return MCDisassembler::Fail;
+ }
+
+ Insn = IsLittleEndian
+ ? ((uint64_t)Bytes[0] << 0) | ((uint64_t)Bytes[1] << 8) |
+ ((uint64_t)Bytes[2] << 16) | ((uint64_t)Bytes[3] << 24) |
+ ((uint64_t)Bytes[4] << 32) | ((uint64_t)Bytes[5] << 40) |
+ ((uint64_t)Bytes[6] << 48) | ((uint64_t)Bytes[7] << 56)
+ : ((uint64_t)Bytes[7] << 0) | ((uint64_t)Bytes[6] << 8) |
+ ((uint64_t)Bytes[5] << 16) | ((uint64_t)Bytes[4] << 24) |
+ ((uint64_t)Bytes[3] << 32) | ((uint64_t)Bytes[2] << 40) |
+ ((uint64_t)Bytes[1] << 48) | ((uint64_t)Bytes[0] << 56);
+
+ return MCDisassembler::Success;
+}
+
+DecodeStatus VEDisassembler::getInstruction(MCInst &Instr, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CStream) const {
+ uint64_t Insn;
+ bool isLittleEndian = getContext().getAsmInfo()->isLittleEndian();
+ DecodeStatus Result =
+ readInstruction64(Bytes, Address, Size, Insn, isLittleEndian);
+ if (Result == MCDisassembler::Fail)
+ return MCDisassembler::Fail;
+
+  // Call the auto-generated decoder function.
+
+ Result = decodeInstruction(DecoderTableVE64, Instr, Insn, Address, this, STI);
+
+ if (Result != MCDisassembler::Fail) {
+ Size = 8;
+ return Result;
+ }
+
+ return MCDisassembler::Fail;
+}
+
+typedef DecodeStatus (*DecodeFunc)(MCInst &MI, unsigned RegNo, uint64_t Address,
+ const void *Decoder);
+
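+// Decode the ASX operand fields of an instruction word.  As extracted below,
+// simm32 occupies bits 0-31, sz bits 32-38 (with its register flag cz in
+// bit 39), and sy bits 40-46 (with its register flag cy in bit 47).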
+static DecodeStatus DecodeASX(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ unsigned sy = fieldFromInstruction(insn, 40, 7);
+ bool cy = fieldFromInstruction(insn, 47, 1);
+ unsigned sz = fieldFromInstruction(insn, 32, 7);
+ bool cz = fieldFromInstruction(insn, 39, 1);
+ uint64_t simm32 = SignExtend64<32>(fieldFromInstruction(insn, 0, 32));
+ DecodeStatus status;
+
+ // Decode sz.
+ if (cz) {
+ status = DecodeI64RegisterClass(MI, sz, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ } else {
+ MI.addOperand(MCOperand::createImm(0));
+ }
+
+ // Decode sy.
+ if (cy) {
+ status = DecodeI64RegisterClass(MI, sy, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ } else {
+ MI.addOperand(MCOperand::createImm(SignExtend32<7>(sy)));
+ }
+
+ // Decode simm32.
+ MI.addOperand(MCOperand::createImm(simm32));
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeAS(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ unsigned sz = fieldFromInstruction(insn, 32, 7);
+ bool cz = fieldFromInstruction(insn, 39, 1);
+ uint64_t simm32 = SignExtend64<32>(fieldFromInstruction(insn, 0, 32));
+ DecodeStatus status;
+
+ // Decode sz.
+ if (cz) {
+ status = DecodeI64RegisterClass(MI, sz, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ } else {
+ MI.addOperand(MCOperand::createImm(0));
+ }
+
+ // Decode simm32.
+ MI.addOperand(MCOperand::createImm(simm32));
+
+ return MCDisassembler::Success;
+}
+
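+// Decode a load or store with an ASX memory operand.  For loads the sx
+// register is added first (the destination); for stores it is added after
+// the memory operand (the value being stored).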
+static DecodeStatus DecodeMem(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder, bool isLoad,
+ DecodeFunc DecodeSX) {
+ unsigned sx = fieldFromInstruction(insn, 48, 7);
+
+ DecodeStatus status;
+ if (isLoad) {
+ status = DecodeSX(MI, sx, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+
+ status = DecodeASX(MI, insn, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ if (!isLoad) {
+ status = DecodeSX(MI, sx, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeMemAS(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder, bool isLoad,
+ DecodeFunc DecodeSX) {
+ unsigned sx = fieldFromInstruction(insn, 48, 7);
+
+ DecodeStatus status;
+ if (isLoad) {
+ status = DecodeSX(MI, sx, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+
+ status = DecodeAS(MI, insn, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ if (!isLoad) {
+ status = DecodeSX(MI, sx, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ }
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeLoadI32(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true, DecodeI32RegisterClass);
+}
+
+static DecodeStatus DecodeStoreI32(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false, DecodeI32RegisterClass);
+}
+
+static DecodeStatus DecodeLoadI64(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true, DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeStoreI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false, DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeLoadF32(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true, DecodeF32RegisterClass);
+}
+
+static DecodeStatus DecodeStoreF32(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, false, DecodeF32RegisterClass);
+}
+
+static DecodeStatus DecodeLoadASI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMemAS(Inst, insn, Address, Decoder, true,
+ DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeStoreASI64(MCInst &Inst, uint64_t insn,
+ uint64_t Address, const void *Decoder) {
+ return DecodeMemAS(Inst, insn, Address, Decoder, false,
+ DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeCAS(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder, bool isImmOnly, bool isUImm,
+ DecodeFunc DecodeSX) {
+ unsigned sx = fieldFromInstruction(insn, 48, 7);
+ bool cy = fieldFromInstruction(insn, 47, 1);
+ unsigned sy = fieldFromInstruction(insn, 40, 7);
+
+ // Add $sx.
+ DecodeStatus status;
+ status = DecodeSX(MI, sx, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ // Add $disp($sz).
+ status = DecodeAS(MI, insn, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ // Add $sy.
+ if (cy && !isImmOnly) {
+ status = DecodeSX(MI, sy, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ } else {
+ if (isUImm)
+ MI.addOperand(MCOperand::createImm(sy));
+ else
+ MI.addOperand(MCOperand::createImm(SignExtend32<7>(sy)));
+ }
+
+ // Add $sd.
+ status = DecodeSX(MI, sx, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeTS1AMI64(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeCAS(MI, insn, Address, Decoder, false, true,
+ DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeTS1AMI32(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeCAS(MI, insn, Address, Decoder, false, true,
+ DecodeI32RegisterClass);
+}
+
+static DecodeStatus DecodeCASI64(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeCAS(MI, insn, Address, Decoder, false, false,
+ DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeCASI32(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeCAS(MI, insn, Address, Decoder, false, false,
+ DecodeI32RegisterClass);
+}
+
+static DecodeStatus DecodeCall(MCInst &Inst, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ return DecodeMem(Inst, insn, Address, Decoder, true, DecodeI64RegisterClass);
+}
+
+static DecodeStatus DecodeSIMM7(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ uint64_t tgt = SignExtend64<7>(insn);
+ MI.addOperand(MCOperand::createImm(tgt));
+ return MCDisassembler::Success;
+}
+
+static DecodeStatus DecodeSIMM32(MCInst &MI, uint64_t insn, uint64_t Address,
+ const void *Decoder) {
+ uint64_t tgt = SignExtend64<32>(insn);
+ MI.addOperand(MCOperand::createImm(tgt));
+ return MCDisassembler::Success;
+}
+
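+// Returns true when MI is one of the integer branch-on-condition opcodes
+// (the BCFW/BCFL and BRCFW/BRCFL families); DecodeCCOperand and
+// DecodeBranchCondition use this to pick the integer condition-code mapping
+// instead of the floating-point one.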
+static bool isIntegerBCKind(MCInst &MI) {
+
+#define BCm_kind(NAME) \
+ case NAME##rri: \
+ case NAME##rzi: \
+ case NAME##iri: \
+ case NAME##izi: \
+ case NAME##rri_nt: \
+ case NAME##rzi_nt: \
+ case NAME##iri_nt: \
+ case NAME##izi_nt: \
+ case NAME##rri_t: \
+ case NAME##rzi_t: \
+ case NAME##iri_t: \
+ case NAME##izi_t:
+
+#define BCRm_kind(NAME) \
+ case NAME##rr: \
+ case NAME##ir: \
+ case NAME##rr_nt: \
+ case NAME##ir_nt: \
+ case NAME##rr_t: \
+ case NAME##ir_t:
+
+ {
+ using namespace llvm::VE;
+ switch (MI.getOpcode()) {
+ BCm_kind(BCFL) BCm_kind(BCFW) BCRm_kind(BRCFL)
+ BCRm_kind(BRCFW) return true;
+ }
+ }
+#undef BCm_kind
+#undef BCRm_kind
+
+ return false;
+}
+
+// Decode CC Operand field.
+static DecodeStatus DecodeCCOperand(MCInst &MI, uint64_t cf, uint64_t Address,
+ const void *Decoder) {
+ MI.addOperand(MCOperand::createImm(VEValToCondCode(cf, isIntegerBCKind(MI))));
+ return MCDisassembler::Success;
+}
+
+// Decode RD Operand field.
+static DecodeStatus DecodeRDOperand(MCInst &MI, uint64_t cf, uint64_t Address,
+ const void *Decoder) {
+ MI.addOperand(MCOperand::createImm(VEValToRD(cf)));
+ return MCDisassembler::Success;
+}
+
+// Decode a branch-condition instruction and the CCOperand field within it.
+static DecodeStatus DecodeBranchCondition(MCInst &MI, uint64_t insn,
+ uint64_t Address,
+ const void *Decoder) {
+ unsigned cf = fieldFromInstruction(insn, 48, 4);
+ bool cy = fieldFromInstruction(insn, 47, 1);
+ unsigned sy = fieldFromInstruction(insn, 40, 7);
+
+ // Decode cf.
+ MI.addOperand(MCOperand::createImm(VEValToCondCode(cf, isIntegerBCKind(MI))));
+
+ // Decode sy.
+ DecodeStatus status;
+ if (cy) {
+ status = DecodeI64RegisterClass(MI, sy, Address, Decoder);
+ if (status != MCDisassembler::Success)
+ return status;
+ } else {
+ MI.addOperand(MCOperand::createImm(SignExtend32<7>(sy)));
+ }
+
+ // Decode MEMri.
+ return DecodeAS(MI, insn, Address, Decoder);
+}
+
+static DecodeStatus DecodeBranchConditionAlways(MCInst &MI, uint64_t insn,
+ uint64_t Address,
+ const void *Decoder) {
+ // Decode MEMri.
+ return DecodeAS(MI, insn, Address, Decoder);
+}
diff --git a/llvm/lib/Target/VE/InstPrinter/VEInstPrinter.cpp b/llvm/lib/Target/VE/InstPrinter/VEInstPrinter.cpp
deleted file mode 100644
index 4e7bcd36c32a..000000000000
--- a/llvm/lib/Target/VE/InstPrinter/VEInstPrinter.cpp
+++ /dev/null
@@ -1,118 +0,0 @@
-//===-- VEInstPrinter.cpp - Convert VE MCInst to assembly syntax -----------==//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This class prints an VE MCInst to a .s file.
-//
-//===----------------------------------------------------------------------===//
-
-#include "VEInstPrinter.h"
-#include "VE.h"
-#include "llvm/MC/MCExpr.h"
-#include "llvm/MC/MCInst.h"
-#include "llvm/MC/MCRegisterInfo.h"
-#include "llvm/MC/MCSubtargetInfo.h"
-#include "llvm/MC/MCSymbol.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "ve-asmprinter"
-
-// The generated AsmMatcher VEGenAsmWriter uses "VE" as the target
-// namespace.
-namespace llvm {
-namespace VE {
-using namespace VE;
-}
-} // namespace llvm
-
-#define GET_INSTRUCTION_NAME
-#define PRINT_ALIAS_INSTR
-#include "VEGenAsmWriter.inc"
-
-void VEInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
- OS << '%' << StringRef(getRegisterName(RegNo)).lower();
-}
-
-void VEInstPrinter::printInst(const MCInst *MI, uint64_t Address,
- StringRef Annot, const MCSubtargetInfo &STI,
- raw_ostream &OS) {
- if (!printAliasInstr(MI, STI, OS))
- printInstruction(MI, Address, STI, OS);
- printAnnotation(OS, Annot);
-}
-
-void VEInstPrinter::printOperand(const MCInst *MI, int opNum,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- const MCOperand &MO = MI->getOperand(opNum);
-
- if (MO.isReg()) {
- printRegName(O, MO.getReg());
- return;
- }
-
- if (MO.isImm()) {
- switch (MI->getOpcode()) {
- default:
- // Expects signed 32bit literals
- assert(isInt<32>(MO.getImm()) && "Immediate too large");
- int32_t TruncatedImm = static_cast<int32_t>(MO.getImm());
- O << TruncatedImm;
- return;
- }
- }
-
- assert(MO.isExpr() && "Unknown operand kind in printOperand");
- MO.getExpr()->print(O, &MAI);
-}
-
-void VEInstPrinter::printMemASXOperand(const MCInst *MI, int opNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O, const char *Modifier) {
- // If this is an ADD operand, emit it like normal operands.
- if (Modifier && !strcmp(Modifier, "arith")) {
- printOperand(MI, opNum, STI, O);
- O << ", ";
- printOperand(MI, opNum + 1, STI, O);
- return;
- }
-
- const MCOperand &MO = MI->getOperand(opNum + 1);
- if (!MO.isImm() || MO.getImm() != 0) {
- printOperand(MI, opNum + 1, STI, O);
- }
- O << "(,";
- printOperand(MI, opNum, STI, O);
- O << ")";
-}
-
-void VEInstPrinter::printMemASOperand(const MCInst *MI, int opNum,
- const MCSubtargetInfo &STI,
- raw_ostream &O, const char *Modifier) {
- // If this is an ADD operand, emit it like normal operands.
- if (Modifier && !strcmp(Modifier, "arith")) {
- printOperand(MI, opNum, STI, O);
- O << ", ";
- printOperand(MI, opNum + 1, STI, O);
- return;
- }
-
- const MCOperand &MO = MI->getOperand(opNum + 1);
- if (!MO.isImm() || MO.getImm() != 0) {
- printOperand(MI, opNum + 1, STI, O);
- }
- O << "(";
- printOperand(MI, opNum, STI, O);
- O << ")";
-}
-
-void VEInstPrinter::printCCOperand(const MCInst *MI, int opNum,
- const MCSubtargetInfo &STI, raw_ostream &O) {
- int CC = (int)MI->getOperand(opNum).getImm();
- O << VECondCodeToString((VECC::CondCodes)CC);
-}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
new file mode 100644
index 000000000000..9a6ae90b5c73
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEAsmBackend.cpp
@@ -0,0 +1,224 @@
+//===-- VEAsmBackend.cpp - VE Assembler Backend ---------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEFixupKinds.h"
+#include "MCTargetDesc/VEMCTargetDesc.h"
+#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixupKindInfo.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/TargetRegistry.h"
+
+using namespace llvm;
+
+static uint64_t adjustFixupValue(unsigned Kind, uint64_t Value) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ case FK_Data_2:
+ case FK_Data_4:
+ case FK_Data_8:
+ case FK_PCRel_1:
+ case FK_PCRel_2:
+ case FK_PCRel_4:
+ case FK_PCRel_8:
+ return Value;
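+  // The *_hi32 fixups encode the upper 32 bits of the value.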
+ case VE::fixup_ve_hi32:
+ case VE::fixup_ve_pc_hi32:
+ case VE::fixup_ve_got_hi32:
+ case VE::fixup_ve_gotoff_hi32:
+ case VE::fixup_ve_plt_hi32:
+ case VE::fixup_ve_tls_gd_hi32:
+ case VE::fixup_ve_tpoff_hi32:
+ return (Value >> 32) & 0xffffffff;
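+  // The reflong and *_lo32 fixups encode the lower 32 bits of the value.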
+ case VE::fixup_ve_reflong:
+ case VE::fixup_ve_lo32:
+ case VE::fixup_ve_pc_lo32:
+ case VE::fixup_ve_got_lo32:
+ case VE::fixup_ve_gotoff_lo32:
+ case VE::fixup_ve_plt_lo32:
+ case VE::fixup_ve_tls_gd_lo32:
+ case VE::fixup_ve_tpoff_lo32:
+ return Value & 0xffffffff;
+ }
+}
+
+/// getFixupKindNumBytes - The number of bytes the fixup may change.
+static unsigned getFixupKindNumBytes(unsigned Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unknown fixup kind!");
+ case FK_Data_1:
+ case FK_PCRel_1:
+ return 1;
+ case FK_Data_2:
+ case FK_PCRel_2:
+ return 2;
+ case FK_Data_4:
+ case FK_PCRel_4:
+ case VE::fixup_ve_reflong:
+ case VE::fixup_ve_hi32:
+ case VE::fixup_ve_lo32:
+ case VE::fixup_ve_pc_hi32:
+ case VE::fixup_ve_pc_lo32:
+ case VE::fixup_ve_got_hi32:
+ case VE::fixup_ve_got_lo32:
+ case VE::fixup_ve_gotoff_hi32:
+ case VE::fixup_ve_gotoff_lo32:
+ case VE::fixup_ve_plt_hi32:
+ case VE::fixup_ve_plt_lo32:
+ case VE::fixup_ve_tls_gd_hi32:
+ case VE::fixup_ve_tls_gd_lo32:
+ case VE::fixup_ve_tpoff_hi32:
+ case VE::fixup_ve_tpoff_lo32:
+ return 4;
+ case FK_Data_8:
+ case FK_PCRel_8:
+ return 8;
+ }
+}
+
+namespace {
+class VEAsmBackend : public MCAsmBackend {
+protected:
+ const Target &TheTarget;
+
+public:
+ VEAsmBackend(const Target &T) : MCAsmBackend(support::little), TheTarget(T) {}
+
+ unsigned getNumFixupKinds() const override { return VE::NumTargetFixupKinds; }
+
+ const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override {
+ const static MCFixupKindInfo Infos[VE::NumTargetFixupKinds] = {
+ // name, offset, bits, flags
+ {"fixup_ve_reflong", 0, 32, 0},
+ {"fixup_ve_hi32", 0, 32, 0},
+ {"fixup_ve_lo32", 0, 32, 0},
+ {"fixup_ve_pc_hi32", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_ve_pc_lo32", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
+ {"fixup_ve_got_hi32", 0, 32, 0},
+ {"fixup_ve_got_lo32", 0, 32, 0},
+ {"fixup_ve_gotoff_hi32", 0, 32, 0},
+ {"fixup_ve_gotoff_lo32", 0, 32, 0},
+ {"fixup_ve_plt_hi32", 0, 32, 0},
+ {"fixup_ve_plt_lo32", 0, 32, 0},
+ {"fixup_ve_tls_gd_hi32", 0, 32, 0},
+ {"fixup_ve_tls_gd_lo32", 0, 32, 0},
+ {"fixup_ve_tpoff_hi32", 0, 32, 0},
+ {"fixup_ve_tpoff_lo32", 0, 32, 0},
+ };
+
+ if (Kind < FirstTargetFixupKind)
+ return MCAsmBackend::getFixupKindInfo(Kind);
+
+ assert(unsigned(Kind - FirstTargetFixupKind) < getNumFixupKinds() &&
+ "Invalid kind!");
+ return Infos[Kind - FirstTargetFixupKind];
+ }
+
+ bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target) override {
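+    // TLS fixups are always emitted as relocations so that the linker can
+    // resolve the thread-local addresses.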
+ switch ((VE::Fixups)Fixup.getKind()) {
+ default:
+ return false;
+ case VE::fixup_ve_tls_gd_hi32:
+ case VE::fixup_ve_tls_gd_lo32:
+ case VE::fixup_ve_tpoff_hi32:
+ case VE::fixup_ve_tpoff_lo32:
+ return true;
+ }
+ }
+
+ bool mayNeedRelaxation(const MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+    // Not implemented yet. For example, a branch whose immediate is larger
+    // than SIMM32 would need to be relaxed into a longer sequence.
+ return false;
+ }
+
+ /// fixupNeedsRelaxation - Target specific predicate for whether a given
+ /// fixup requires the associated instruction to be relaxed.
+ bool fixupNeedsRelaxation(const MCFixup &Fixup, uint64_t Value,
+ const MCRelaxableFragment *DF,
+ const MCAsmLayout &Layout) const override {
+    // Not implemented yet. For example, a branch whose immediate is larger
+    // than SIMM32 would need to be relaxed into a longer sequence.
+ return false;
+ }
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override {
+ // Aurora VE doesn't support relaxInstruction yet.
+ llvm_unreachable("relaxInstruction() should not be called");
+ }
+
+ bool writeNopData(raw_ostream &OS, uint64_t Count) const override {
+ if ((Count % 8) != 0)
+ return false;
+
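+    // VE instructions are a fixed 8 bytes wide, so pad with whole NOP
+    // instructions.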
+ for (uint64_t i = 0; i < Count; i += 8)
+ support::endian::write<uint64_t>(OS, 0x7900000000000000ULL,
+ support::little);
+
+ return true;
+ }
+};
+
+class ELFVEAsmBackend : public VEAsmBackend {
+ Triple::OSType OSType;
+
+public:
+ ELFVEAsmBackend(const Target &T, Triple::OSType OSType)
+ : VEAsmBackend(T), OSType(OSType) {}
+
+ void applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
+ const MCValue &Target, MutableArrayRef<char> Data,
+ uint64_t Value, bool IsResolved,
+ const MCSubtargetInfo *STI) const override {
+ Value = adjustFixupValue(Fixup.getKind(), Value);
+ if (!Value)
+ return; // Doesn't change encoding.
+
+ MCFixupKindInfo Info = getFixupKindInfo(Fixup.getKind());
+
+ // Shift the value into position.
+ Value <<= Info.TargetOffset;
+
+ unsigned NumBytes = getFixupKindNumBytes(Fixup.getKind());
+ unsigned Offset = Fixup.getOffset();
+ assert(Offset + NumBytes <= Data.size() && "Invalid fixup offset!");
+ // For each byte of the fragment that the fixup touches, mask in the bits
+ // from the fixup value. The Value has been "split up" into the
+ // appropriate bitfields above.
+ for (unsigned i = 0; i != NumBytes; ++i) {
+ unsigned Idx = Endian == support::little ? i : (NumBytes - 1) - i;
+ Data[Offset + Idx] |= static_cast<uint8_t>((Value >> (i * 8)) & 0xff);
+ }
+ }
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ uint8_t OSABI = MCELFObjectTargetWriter::getOSABI(OSType);
+ return createVEELFObjectWriter(OSABI);
+ }
+};
+} // end anonymous namespace
+
+MCAsmBackend *llvm::createVEAsmBackend(const Target &T,
+ const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options) {
+ return new ELFVEAsmBackend(T, STI.getTargetTriple().getOS());
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
new file mode 100644
index 000000000000..741e8320a941
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEELFObjectWriter.cpp
@@ -0,0 +1,135 @@
+//===-- VEELFObjectWriter.cpp - VE ELF Writer -----------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VEFixupKinds.h"
+#include "VEMCExpr.h"
+#include "VEMCTargetDesc.h"
+#include "llvm/MC/MCELFObjectWriter.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCObjectWriter.h"
+#include "llvm/MC/MCValue.h"
+#include "llvm/Support/ErrorHandling.h"
+
+using namespace llvm;
+
+namespace {
+class VEELFObjectWriter : public MCELFObjectTargetWriter {
+public:
+ VEELFObjectWriter(uint8_t OSABI)
+ : MCELFObjectTargetWriter(/* Is64Bit */ true, OSABI, ELF::EM_VE,
+ /* HasRelocationAddend */ true) {}
+
+ ~VEELFObjectWriter() override {}
+
+protected:
+ unsigned getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup, bool IsPCRel) const override;
+
+ bool needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const override;
+};
+} // namespace
+
+unsigned VEELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
+ const MCFixup &Fixup,
+ bool IsPCRel) const {
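+  // An explicit @pc_lo modifier on the fixup expression always maps to
+  // R_VE_PC_LO32, regardless of how the fixup itself is classified.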
+ if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Fixup.getValue())) {
+ if (SExpr->getKind() == VEMCExpr::VK_VE_PC_LO32)
+ return ELF::R_VE_PC_LO32;
+ }
+
+ if (IsPCRel) {
+ switch (Fixup.getTargetKind()) {
+ default:
+ llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_PCRel_1:
+ llvm_unreachable("Unimplemented fixup fk_data_1 -> relocation");
+ case FK_PCRel_2:
+ llvm_unreachable("Unimplemented fixup fk_data_2 -> relocation");
+ // FIXME: relative kind?
+ case FK_PCRel_4:
+ return ELF::R_VE_REFLONG;
+ case FK_PCRel_8:
+ return ELF::R_VE_REFQUAD;
+ case VE::fixup_ve_pc_hi32:
+ return ELF::R_VE_PC_HI32;
+ case VE::fixup_ve_pc_lo32:
+ return ELF::R_VE_PC_LO32;
+ }
+ }
+
+ switch (Fixup.getTargetKind()) {
+ default:
+ llvm_unreachable("Unimplemented fixup -> relocation");
+ case FK_Data_1:
+ llvm_unreachable("Unimplemented fixup fk_data_1 -> relocation");
+ case FK_Data_2:
+ llvm_unreachable("Unimplemented fixup fk_data_2 -> relocation");
+ case FK_Data_4:
+ return ELF::R_VE_REFLONG;
+ case FK_Data_8:
+ return ELF::R_VE_REFQUAD;
+ case VE::fixup_ve_reflong:
+ return ELF::R_VE_REFLONG;
+ case VE::fixup_ve_hi32:
+ return ELF::R_VE_HI32;
+ case VE::fixup_ve_lo32:
+ return ELF::R_VE_LO32;
+ case VE::fixup_ve_pc_hi32:
+ llvm_unreachable("Unimplemented fixup pc_hi32 -> relocation");
+ case VE::fixup_ve_pc_lo32:
+ llvm_unreachable("Unimplemented fixup pc_lo32 -> relocation");
+ case VE::fixup_ve_got_hi32:
+ return ELF::R_VE_GOT_HI32;
+ case VE::fixup_ve_got_lo32:
+ return ELF::R_VE_GOT_LO32;
+ case VE::fixup_ve_gotoff_hi32:
+ return ELF::R_VE_GOTOFF_HI32;
+ case VE::fixup_ve_gotoff_lo32:
+ return ELF::R_VE_GOTOFF_LO32;
+ case VE::fixup_ve_plt_hi32:
+ return ELF::R_VE_PLT_HI32;
+ case VE::fixup_ve_plt_lo32:
+ return ELF::R_VE_PLT_LO32;
+ case VE::fixup_ve_tls_gd_hi32:
+ return ELF::R_VE_TLS_GD_HI32;
+ case VE::fixup_ve_tls_gd_lo32:
+ return ELF::R_VE_TLS_GD_LO32;
+ case VE::fixup_ve_tpoff_hi32:
+ return ELF::R_VE_TPOFF_HI32;
+ case VE::fixup_ve_tpoff_lo32:
+ return ELF::R_VE_TPOFF_LO32;
+ }
+
+ return ELF::R_VE_NONE;
+}
+
+bool VEELFObjectWriter::needsRelocateWithSymbol(const MCSymbol &Sym,
+ unsigned Type) const {
+ switch (Type) {
+ default:
+ return false;
+
+ // All relocations that use a GOT need a symbol, not an offset, as
+ // the offset of the symbol within the section is irrelevant to
+ // where the GOT entry is. Don't need to list all the TLS entries,
+ // as they're all marked as requiring a symbol anyways.
+ case ELF::R_VE_GOT_HI32:
+ case ELF::R_VE_GOT_LO32:
+ case ELF::R_VE_GOTOFF_HI32:
+ case ELF::R_VE_GOTOFF_LO32:
+ case ELF::R_VE_TLS_GD_HI32:
+ case ELF::R_VE_TLS_GD_LO32:
+ return true;
+ }
+}
+
+std::unique_ptr<MCObjectTargetWriter>
+llvm::createVEELFObjectWriter(uint8_t OSABI) {
+ return std::make_unique<VEELFObjectWriter>(OSABI);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h
new file mode 100644
index 000000000000..5d5dc1c5c891
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEFixupKinds.h
@@ -0,0 +1,61 @@
+//===-- VEFixupKinds.h - VE Specific Fixup Entries --------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_VE_MCTARGETDESC_VEFIXUPKINDS_H
+#define LLVM_LIB_TARGET_VE_MCTARGETDESC_VEFIXUPKINDS_H
+
+#include "llvm/MC/MCFixup.h"
+
+namespace llvm {
+namespace VE {
+enum Fixups {
+ /// fixup_ve_reflong - 32-bit fixup corresponding to foo
+ fixup_ve_reflong = FirstTargetFixupKind,
+
+ /// fixup_ve_hi32 - 32-bit fixup corresponding to foo@hi
+ fixup_ve_hi32,
+
+ /// fixup_ve_lo32 - 32-bit fixup corresponding to foo@lo
+ fixup_ve_lo32,
+
+ /// fixup_ve_pc_hi32 - 32-bit fixup corresponding to foo@pc_hi
+ fixup_ve_pc_hi32,
+
+ /// fixup_ve_pc_lo32 - 32-bit fixup corresponding to foo@pc_lo
+ fixup_ve_pc_lo32,
+
+ /// fixup_ve_got_hi32 - 32-bit fixup corresponding to foo@got_hi
+ fixup_ve_got_hi32,
+
+ /// fixup_ve_got_lo32 - 32-bit fixup corresponding to foo@got_lo
+ fixup_ve_got_lo32,
+
+ /// fixup_ve_gotoff_hi32 - 32-bit fixup corresponding to foo@gotoff_hi
+ fixup_ve_gotoff_hi32,
+
+ /// fixup_ve_gotoff_lo32 - 32-bit fixup corresponding to foo@gotoff_lo
+ fixup_ve_gotoff_lo32,
+
+  /// fixup_ve_plt_hi32/lo32 - 32-bit fixups corresponding to foo@plt_hi
+  /// and foo@plt_lo
+ fixup_ve_plt_hi32,
+ fixup_ve_plt_lo32,
+
+ /// fixups for Thread Local Storage
+ fixup_ve_tls_gd_hi32,
+ fixup_ve_tls_gd_lo32,
+ fixup_ve_tpoff_hi32,
+ fixup_ve_tpoff_lo32,
+
+ // Marker
+ LastTargetFixupKind,
+ NumTargetFixupKinds = LastTargetFixupKind - FirstTargetFixupKind
+};
+} // namespace VE
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
new file mode 100644
index 000000000000..1fe9423e01b8
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.cpp
@@ -0,0 +1,227 @@
+//===-- VEInstPrinter.cpp - Convert VE MCInst to assembly syntax -----------==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This class prints a VE MCInst to a .s file.
+//
+//===----------------------------------------------------------------------===//
+
+#include "VEInstPrinter.h"
+#include "VE.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "ve-asmprinter"
+
+// The generated AsmWriter VEGenAsmWriter uses "VE" as the target
+// namespace.
+namespace llvm {
+namespace VE {
+using namespace VE;
+}
+} // namespace llvm
+
+#define GET_INSTRUCTION_NAME
+#define PRINT_ALIAS_INSTR
+#include "VEGenAsmWriter.inc"
+
+void VEInstPrinter::printRegName(raw_ostream &OS, unsigned RegNo) const {
+  // Generic registers have identical register names across register classes.
+ unsigned AltIdx = VE::AsmName;
+  // Misc registers each have their own name, so don't use alt-names for them.
+ if (MRI.getRegClass(VE::MISCRegClassID).contains(RegNo))
+ AltIdx = VE::NoRegAltName;
+ OS << '%' << getRegisterName(RegNo, AltIdx);
+}
+
+void VEInstPrinter::printInst(const MCInst *MI, uint64_t Address,
+ StringRef Annot, const MCSubtargetInfo &STI,
+ raw_ostream &OS) {
+ if (!printAliasInstr(MI, Address, STI, OS))
+ printInstruction(MI, Address, STI, OS);
+ printAnnotation(OS, Annot);
+}
+
+void VEInstPrinter::printOperand(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ const MCOperand &MO = MI->getOperand(OpNum);
+
+ if (MO.isReg()) {
+ printRegName(O, MO.getReg());
+ return;
+ }
+
+ if (MO.isImm()) {
+ switch (MI->getOpcode()) {
+ default:
+ // Expects signed 32bit literals
+ int32_t TruncatedImm = static_cast<int32_t>(MO.getImm());
+ O << TruncatedImm;
+ return;
+ }
+ }
+
+ assert(MO.isExpr() && "Unknown operand kind in printOperand");
+ MO.getExpr()->print(O, &MAI);
+}
+
+void VEInstPrinter::printMemASXOperand(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier) {
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ printOperand(MI, OpNum, STI, O);
+ O << ", ";
+ printOperand(MI, OpNum + 1, STI, O);
+ return;
+ }
+
+ if (MI->getOperand(OpNum + 2).isImm() &&
+ MI->getOperand(OpNum + 2).getImm() == 0) {
+ // don't print "+0"
+ } else {
+ printOperand(MI, OpNum + 2, STI, O);
+ }
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0 &&
+ MI->getOperand(OpNum).isImm() && MI->getOperand(OpNum).getImm() == 0) {
+ if (MI->getOperand(OpNum + 2).isImm() &&
+ MI->getOperand(OpNum + 2).getImm() == 0) {
+ O << "0";
+ } else {
+ // don't print "+0,+0"
+ }
+ } else {
+ O << "(";
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0) {
+ // don't print "+0"
+ } else {
+ printOperand(MI, OpNum + 1, STI, O);
+ }
+ if (MI->getOperand(OpNum).isImm() && MI->getOperand(OpNum).getImm() == 0) {
+ // don't print "+0"
+ } else {
+ O << ", ";
+ printOperand(MI, OpNum, STI, O);
+ }
+ O << ")";
+ }
+}
+
+void VEInstPrinter::printMemASOperandASX(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier) {
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ printOperand(MI, OpNum, STI, O);
+ O << ", ";
+ printOperand(MI, OpNum + 1, STI, O);
+ return;
+ }
+
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0) {
+ // don't print "+0"
+ } else {
+ printOperand(MI, OpNum + 1, STI, O);
+ }
+ if (MI->getOperand(OpNum).isImm() && MI->getOperand(OpNum).getImm() == 0) {
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0) {
+ O << "0";
+ } else {
+ // don't print "(0)"
+ }
+ } else {
+ O << "(, ";
+ printOperand(MI, OpNum, STI, O);
+ O << ")";
+ }
+}
+
+void VEInstPrinter::printMemASOperandRRM(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier) {
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ printOperand(MI, OpNum, STI, O);
+ O << ", ";
+ printOperand(MI, OpNum + 1, STI, O);
+ return;
+ }
+
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0) {
+ // don't print "+0"
+ } else {
+ printOperand(MI, OpNum + 1, STI, O);
+ }
+ if (MI->getOperand(OpNum).isImm() && MI->getOperand(OpNum).getImm() == 0) {
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0) {
+ O << "0";
+ } else {
+ // don't print "(0)"
+ }
+ } else {
+ O << "(";
+ printOperand(MI, OpNum, STI, O);
+ O << ")";
+ }
+}
+
+void VEInstPrinter::printMemASOperandHM(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O, const char *Modifier) {
+ // If this is an ADD operand, emit it like normal operands.
+ if (Modifier && !strcmp(Modifier, "arith")) {
+ printOperand(MI, OpNum, STI, O);
+ O << ", ";
+ printOperand(MI, OpNum + 1, STI, O);
+ return;
+ }
+
+ if (MI->getOperand(OpNum + 1).isImm() &&
+ MI->getOperand(OpNum + 1).getImm() == 0) {
+ // don't print "+0"
+ } else {
+ printOperand(MI, OpNum + 1, STI, O);
+ }
+ O << "(";
+ if (MI->getOperand(OpNum).isReg())
+ printOperand(MI, OpNum, STI, O);
+ O << ")";
+}
+
+void VEInstPrinter::printMImmOperand(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI,
+ raw_ostream &O) {
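+  // The 7-bit MImm field encodes (m)1 (m leading ones) for values 0-63 and
+  // (m)0 (m leading zeros) for values 64-127.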
+ int MImm = (int)MI->getOperand(OpNum).getImm() & 0x7f;
+ if (MImm > 63)
+ O << "(" << MImm - 64 << ")0";
+ else
+ O << "(" << MImm << ")1";
+}
+
+void VEInstPrinter::printCCOperand(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ int CC = (int)MI->getOperand(OpNum).getImm();
+ O << VECondCodeToString((VECC::CondCode)CC);
+}
+
+void VEInstPrinter::printRDOperand(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &O) {
+ int RD = (int)MI->getOperand(OpNum).getImm();
+ O << VERDToString((VERD::RoundingMode)RD);
+}
diff --git a/llvm/lib/Target/VE/InstPrinter/VEInstPrinter.h b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h
index 05a53d59e878..657cc513b3c5 100644
--- a/llvm/lib/Target/VE/InstPrinter/VEInstPrinter.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEInstPrinter.h
@@ -13,6 +13,7 @@
#ifndef LLVM_LIB_TARGET_VE_INSTPRINTER_VEINSTPRINTER_H
#define LLVM_LIB_TARGET_VE_INSTPRINTER_VEINSTPRINTER_H
+#include "VEMCTargetDesc.h"
#include "llvm/MC/MCInstPrinter.h"
namespace llvm {
@@ -28,20 +29,32 @@ public:
const MCSubtargetInfo &STI, raw_ostream &OS) override;
// Autogenerated by tblgen.
- bool printAliasInstr(const MCInst *, const MCSubtargetInfo &, raw_ostream &);
+ bool printAliasInstr(const MCInst *, uint64_t Address,
+ const MCSubtargetInfo &, raw_ostream &);
void printInstruction(const MCInst *, uint64_t, const MCSubtargetInfo &,
raw_ostream &);
- static const char *getRegisterName(unsigned RegNo);
+ static const char *getRegisterName(unsigned RegNo,
+ unsigned AltIdx = VE::NoRegAltName);
- void printOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+ void printOperand(const MCInst *MI, int OpNum, const MCSubtargetInfo &STI,
raw_ostream &OS);
- void printMemASXOperand(const MCInst *MI, int opNum,
+ void printMemASXOperand(const MCInst *MI, int OpNum,
const MCSubtargetInfo &STI, raw_ostream &OS,
const char *Modifier = nullptr);
- void printMemASOperand(const MCInst *MI, int opNum,
- const MCSubtargetInfo &STI, raw_ostream &OS,
- const char *Modifier = nullptr);
- void printCCOperand(const MCInst *MI, int opNum, const MCSubtargetInfo &STI,
+ void printMemASOperandASX(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &OS,
+ const char *Modifier = nullptr);
+ void printMemASOperandRRM(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &OS,
+ const char *Modifier = nullptr);
+ void printMemASOperandHM(const MCInst *MI, int OpNum,
+ const MCSubtargetInfo &STI, raw_ostream &OS,
+ const char *Modifier = nullptr);
+ void printMImmOperand(const MCInst *MI, int OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &OS);
+ void printCCOperand(const MCInst *MI, int OpNum, const MCSubtargetInfo &STI,
+ raw_ostream &OS);
+ void printRDOperand(const MCInst *MI, int OpNum, const MCSubtargetInfo &STI,
raw_ostream &OS);
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
index 9f29fc092c69..76824335239b 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCAsmInfo.cpp
@@ -37,4 +37,5 @@ VEELFMCAsmInfo::VEELFMCAsmInfo(const Triple &TheTriple) {
UsesELFSectionDirectiveForBSS = true;
SupportsDebugInformation = true;
+ UseIntegratedAssembler = false;
}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
new file mode 100644
index 000000000000..d50d8fcae9da
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCCodeEmitter.cpp
@@ -0,0 +1,165 @@
+//===-- VEMCCodeEmitter.cpp - Convert VE code to machine code -------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements the VEMCCodeEmitter class.
+//
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/VEFixupKinds.h"
+#include "VE.h"
+#include "VEMCExpr.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCExpr.h"
+#include "llvm/MC/MCFixup.h"
+#include "llvm/MC/MCInst.h"
+#include "llvm/MC/MCInstrInfo.h"
+#include "llvm/MC/MCRegisterInfo.h"
+#include "llvm/MC/MCSubtargetInfo.h"
+#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/EndianStream.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <cassert>
+#include <cstdint>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "mccodeemitter"
+
+STATISTIC(MCNumEmitted, "Number of MC instructions emitted");
+
+namespace {
+
+class VEMCCodeEmitter : public MCCodeEmitter {
+ const MCInstrInfo &MCII;
+ MCContext &Ctx;
+
+public:
+ VEMCCodeEmitter(const MCInstrInfo &mcii, MCContext &ctx)
+ : MCII(mcii), Ctx(ctx) {}
+ VEMCCodeEmitter(const VEMCCodeEmitter &) = delete;
+ VEMCCodeEmitter &operator=(const VEMCCodeEmitter &) = delete;
+ ~VEMCCodeEmitter() override = default;
+
+ void encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const override;
+
+ // getBinaryCodeForInstr - TableGen'erated function for getting the
+ // binary encoding for an instruction.
+ uint64_t getBinaryCodeForInstr(const MCInst &MI,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ /// getMachineOpValue - Return binary encoding of operand. If the machine
+ /// operand requires relocation, record the relocation and return zero.
+ unsigned getMachineOpValue(const MCInst &MI, const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+ uint64_t getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getCCOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+ uint64_t getRDOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const;
+
+private:
+ FeatureBitset computeAvailableFeatures(const FeatureBitset &FB) const;
+ void
+ verifyInstructionPredicates(const MCInst &MI,
+ const FeatureBitset &AvailableFeatures) const;
+};
+
+} // end anonymous namespace
+
+void VEMCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ verifyInstructionPredicates(MI,
+ computeAvailableFeatures(STI.getFeatureBits()));
+
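+  // Every VE instruction is encoded as a single 64-bit little-endian word.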
+ uint64_t Bits = getBinaryCodeForInstr(MI, Fixups, STI);
+ support::endian::write<uint64_t>(OS, Bits, support::little);
+
+ ++MCNumEmitted; // Keep track of the # of mi's emitted.
+}
+
+unsigned VEMCCodeEmitter::getMachineOpValue(const MCInst &MI,
+ const MCOperand &MO,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ if (MO.isReg())
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg());
+
+ if (MO.isImm())
+ return MO.getImm();
+
+ assert(MO.isExpr());
+ const MCExpr *Expr = MO.getExpr();
+ if (const VEMCExpr *SExpr = dyn_cast<VEMCExpr>(Expr)) {
+ MCFixupKind Kind = (MCFixupKind)SExpr->getFixupKind();
+ Fixups.push_back(MCFixup::create(0, Expr, Kind));
+ return 0;
+ }
+
+ int64_t Res;
+ if (Expr->evaluateAsAbsolute(Res))
+ return Res;
+
+ llvm_unreachable("Unhandled expression!");
+ return 0;
+}
+
+uint64_t
+VEMCCodeEmitter::getBranchTargetOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isReg() || MO.isImm())
+ return getMachineOpValue(MI, MO, Fixups, STI);
+
+ Fixups.push_back(
+ MCFixup::create(0, MO.getExpr(), (MCFixupKind)VE::fixup_ve_pc_lo32));
+ return 0;
+}
+
+uint64_t VEMCCodeEmitter::getCCOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm())
+ return VECondCodeToVal(
+ static_cast<VECC::CondCode>(getMachineOpValue(MI, MO, Fixups, STI)));
+ return 0;
+}
+
+uint64_t VEMCCodeEmitter::getRDOpValue(const MCInst &MI, unsigned OpNo,
+ SmallVectorImpl<MCFixup> &Fixups,
+ const MCSubtargetInfo &STI) const {
+ const MCOperand &MO = MI.getOperand(OpNo);
+ if (MO.isImm())
+ return VERDToVal(static_cast<VERD::RoundingMode>(
+ getMachineOpValue(MI, MO, Fixups, STI)));
+ return 0;
+}
+
+#define ENABLE_INSTR_PREDICATE_VERIFIER
+#include "VEGenMCCodeEmitter.inc"
+
+MCCodeEmitter *llvm::createVEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI,
+ MCContext &Ctx) {
+ return new VEMCCodeEmitter(MCII, Ctx);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
new file mode 100644
index 000000000000..a3ce3b3309be
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.cpp
@@ -0,0 +1,225 @@
+//===-- VEMCExpr.cpp - VE specific MC expression classes ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file contains the implementation of the assembly expression modifiers
+// accepted by the VE architecture (e.g. "%hi", "%lo", ...).
+//
+//===----------------------------------------------------------------------===//
+
+#include "VEMCExpr.h"
+#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCContext.h"
+#include "llvm/MC/MCObjectStreamer.h"
+#include "llvm/MC/MCSymbolELF.h"
+#include "llvm/BinaryFormat/ELF.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "vemcexpr"
+
+const VEMCExpr *VEMCExpr::create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx) {
+ return new (Ctx) VEMCExpr(Kind, Expr);
+}
+
+void VEMCExpr::printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const {
+
+ bool closeParen = printVariantKind(OS, Kind);
+
+ const MCExpr *Expr = getSubExpr();
+ Expr->print(OS, MAI);
+
+ if (closeParen)
+ OS << ')';
+ printVariantKindSuffix(OS, Kind);
+}
+
+bool VEMCExpr::printVariantKind(raw_ostream &OS, VariantKind Kind) {
+ switch (Kind) {
+ case VK_VE_None:
+ case VK_VE_REFLONG:
+ return false;
+
+ case VK_VE_HI32:
+ case VK_VE_LO32:
+ case VK_VE_PC_HI32:
+ case VK_VE_PC_LO32:
+ case VK_VE_GOT_HI32:
+ case VK_VE_GOT_LO32:
+ case VK_VE_GOTOFF_HI32:
+ case VK_VE_GOTOFF_LO32:
+ case VK_VE_PLT_HI32:
+ case VK_VE_PLT_LO32:
+ case VK_VE_TLS_GD_HI32:
+ case VK_VE_TLS_GD_LO32:
+ case VK_VE_TPOFF_HI32:
+ case VK_VE_TPOFF_LO32:
+ // Use suffix for these variant kinds
+ return false;
+ }
+ return true;
+}
+
+void VEMCExpr::printVariantKindSuffix(raw_ostream &OS, VariantKind Kind) {
+ switch (Kind) {
+ case VK_VE_None:
+ case VK_VE_REFLONG:
+ break;
+ case VK_VE_HI32:
+ OS << "@hi";
+ break;
+ case VK_VE_LO32:
+ OS << "@lo";
+ break;
+ case VK_VE_PC_HI32:
+ OS << "@pc_hi";
+ break;
+ case VK_VE_PC_LO32:
+ OS << "@pc_lo";
+ break;
+ case VK_VE_GOT_HI32:
+ OS << "@got_hi";
+ break;
+ case VK_VE_GOT_LO32:
+ OS << "@got_lo";
+ break;
+ case VK_VE_GOTOFF_HI32:
+ OS << "@gotoff_hi";
+ break;
+ case VK_VE_GOTOFF_LO32:
+ OS << "@gotoff_lo";
+ break;
+ case VK_VE_PLT_HI32:
+ OS << "@plt_hi";
+ break;
+ case VK_VE_PLT_LO32:
+ OS << "@plt_lo";
+ break;
+ case VK_VE_TLS_GD_HI32:
+ OS << "@tls_gd_hi";
+ break;
+ case VK_VE_TLS_GD_LO32:
+ OS << "@tls_gd_lo";
+ break;
+ case VK_VE_TPOFF_HI32:
+ OS << "@tpoff_hi";
+ break;
+ case VK_VE_TPOFF_LO32:
+ OS << "@tpoff_lo";
+ break;
+ }
+}
+
+VEMCExpr::VariantKind VEMCExpr::parseVariantKind(StringRef name) {
+ return StringSwitch<VEMCExpr::VariantKind>(name)
+ .Case("hi", VK_VE_HI32)
+ .Case("lo", VK_VE_LO32)
+ .Case("pc_hi", VK_VE_PC_HI32)
+ .Case("pc_lo", VK_VE_PC_LO32)
+ .Case("got_hi", VK_VE_GOT_HI32)
+ .Case("got_lo", VK_VE_GOT_LO32)
+ .Case("gotoff_hi", VK_VE_GOTOFF_HI32)
+ .Case("gotoff_lo", VK_VE_GOTOFF_LO32)
+ .Case("plt_hi", VK_VE_PLT_HI32)
+ .Case("plt_lo", VK_VE_PLT_LO32)
+ .Case("tls_gd_hi", VK_VE_TLS_GD_HI32)
+ .Case("tls_gd_lo", VK_VE_TLS_GD_LO32)
+ .Case("tpoff_hi", VK_VE_TPOFF_HI32)
+ .Case("tpoff_lo", VK_VE_TPOFF_LO32)
+ .Default(VK_VE_None);
+}
+
+VE::Fixups VEMCExpr::getFixupKind(VEMCExpr::VariantKind Kind) {
+ switch (Kind) {
+ default:
+ llvm_unreachable("Unhandled VEMCExpr::VariantKind");
+ case VK_VE_REFLONG:
+ return VE::fixup_ve_reflong;
+ case VK_VE_HI32:
+ return VE::fixup_ve_hi32;
+ case VK_VE_LO32:
+ return VE::fixup_ve_lo32;
+ case VK_VE_PC_HI32:
+ return VE::fixup_ve_pc_hi32;
+ case VK_VE_PC_LO32:
+ return VE::fixup_ve_pc_lo32;
+ case VK_VE_GOT_HI32:
+ return VE::fixup_ve_got_hi32;
+ case VK_VE_GOT_LO32:
+ return VE::fixup_ve_got_lo32;
+ case VK_VE_GOTOFF_HI32:
+ return VE::fixup_ve_gotoff_hi32;
+ case VK_VE_GOTOFF_LO32:
+ return VE::fixup_ve_gotoff_lo32;
+ case VK_VE_PLT_HI32:
+ return VE::fixup_ve_plt_hi32;
+ case VK_VE_PLT_LO32:
+ return VE::fixup_ve_plt_lo32;
+ case VK_VE_TLS_GD_HI32:
+ return VE::fixup_ve_tls_gd_hi32;
+ case VK_VE_TLS_GD_LO32:
+ return VE::fixup_ve_tls_gd_lo32;
+ case VK_VE_TPOFF_HI32:
+ return VE::fixup_ve_tpoff_hi32;
+ case VK_VE_TPOFF_LO32:
+ return VE::fixup_ve_tpoff_lo32;
+ }
+}
+
+bool VEMCExpr::evaluateAsRelocatableImpl(MCValue &Res,
+ const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const {
+ return getSubExpr()->evaluateAsRelocatable(Res, Layout, Fixup);
+}
+
+static void fixELFSymbolsInTLSFixupsImpl(const MCExpr *Expr, MCAssembler &Asm) {
+ switch (Expr->getKind()) {
+ case MCExpr::Target:
+ llvm_unreachable("Can't handle nested target expr!");
+ break;
+
+ case MCExpr::Constant:
+ break;
+
+ case MCExpr::Binary: {
+ const MCBinaryExpr *BE = cast<MCBinaryExpr>(Expr);
+ fixELFSymbolsInTLSFixupsImpl(BE->getLHS(), Asm);
+ fixELFSymbolsInTLSFixupsImpl(BE->getRHS(), Asm);
+ break;
+ }
+
+ case MCExpr::SymbolRef: {
+ // We're known to be under a TLS fixup, so any symbol should be
+ // modified. There should be only one.
+ const MCSymbolRefExpr &SymRef = *cast<MCSymbolRefExpr>(Expr);
+ cast<MCSymbolELF>(SymRef.getSymbol()).setType(ELF::STT_TLS);
+ break;
+ }
+
+ case MCExpr::Unary:
+ fixELFSymbolsInTLSFixupsImpl(cast<MCUnaryExpr>(Expr)->getSubExpr(), Asm);
+ break;
+ }
+}
+
+void VEMCExpr::visitUsedExpr(MCStreamer &Streamer) const {
+ Streamer.visitUsedExpr(*getSubExpr());
+}
+
+void VEMCExpr::fixELFSymbolsInTLSFixups(MCAssembler &Asm) const {
+ switch (getKind()) {
+ default:
+ return;
+ case VK_VE_TLS_GD_HI32:
+ case VK_VE_TLS_GD_LO32:
+ case VK_VE_TPOFF_HI32:
+ case VK_VE_TPOFF_LO32:
+ break;
+ }
+ fixELFSymbolsInTLSFixupsImpl(getSubExpr(), Asm);
+}
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
new file mode 100644
index 000000000000..2b0c44576099
--- /dev/null
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCExpr.h
@@ -0,0 +1,95 @@
+//====- VEMCExpr.h - VE specific MC expression classes --------*- C++ -*-=====//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes VE-specific MCExprs, used for modifiers such as
+// "%hi", "%lo", etc.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_VE_MCTARGETDESC_VEMCEXPR_H
+#define LLVM_LIB_TARGET_VE_MCTARGETDESC_VEMCEXPR_H
+
+#include "VEFixupKinds.h"
+#include "llvm/MC/MCExpr.h"
+
+namespace llvm {
+
+class StringRef;
+class VEMCExpr : public MCTargetExpr {
+public:
+ enum VariantKind {
+ VK_VE_None,
+ VK_VE_REFLONG,
+ VK_VE_HI32,
+ VK_VE_LO32,
+ VK_VE_PC_HI32,
+ VK_VE_PC_LO32,
+ VK_VE_GOT_HI32,
+ VK_VE_GOT_LO32,
+ VK_VE_GOTOFF_HI32,
+ VK_VE_GOTOFF_LO32,
+ VK_VE_PLT_HI32,
+ VK_VE_PLT_LO32,
+ VK_VE_TLS_GD_HI32,
+ VK_VE_TLS_GD_LO32,
+ VK_VE_TPOFF_HI32,
+ VK_VE_TPOFF_LO32,
+ };
+
+private:
+ const VariantKind Kind;
+ const MCExpr *Expr;
+
+ explicit VEMCExpr(VariantKind Kind, const MCExpr *Expr)
+ : Kind(Kind), Expr(Expr) {}
+
+public:
+ /// @name Construction
+ /// @{
+
+ static const VEMCExpr *create(VariantKind Kind, const MCExpr *Expr,
+ MCContext &Ctx);
+ /// @}
+ /// @name Accessors
+ /// @{
+
+  /// getKind - Get the kind of this expression.
+ VariantKind getKind() const { return Kind; }
+
+ /// getSubExpr - Get the child of this expression.
+ const MCExpr *getSubExpr() const { return Expr; }
+
+ /// getFixupKind - Get the fixup kind of this expression.
+ VE::Fixups getFixupKind() const { return getFixupKind(Kind); }
+
+ /// @}
+ void printImpl(raw_ostream &OS, const MCAsmInfo *MAI) const override;
+ bool evaluateAsRelocatableImpl(MCValue &Res, const MCAsmLayout *Layout,
+ const MCFixup *Fixup) const override;
+ void visitUsedExpr(MCStreamer &Streamer) const override;
+ MCFragment *findAssociatedFragment() const override {
+ return getSubExpr()->findAssociatedFragment();
+ }
+
+ void fixELFSymbolsInTLSFixups(MCAssembler &Asm) const override;
+
+ static bool classof(const MCExpr *E) {
+ return E->getKind() == MCExpr::Target;
+ }
+
+ static bool classof(const VEMCExpr *) { return true; }
+
+ static VariantKind parseVariantKind(StringRef name);
+ static bool printVariantKind(raw_ostream &OS, VariantKind Kind);
+ static void printVariantKindSuffix(raw_ostream &OS, VariantKind Kind);
+ static VE::Fixups getFixupKind(VariantKind Kind);
+};
+
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
index b228617058a6..a39cffc8f4a6 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.cpp
@@ -11,7 +11,8 @@
//===----------------------------------------------------------------------===//
#include "VEMCTargetDesc.h"
-#include "InstPrinter/VEInstPrinter.h"
+#include "TargetInfo/VETargetInfo.h"
+#include "VEInstPrinter.h"
#include "VEMCAsmInfo.h"
#include "VETargetStreamer.h"
#include "llvm/MC/MCInstrInfo.h"
@@ -35,7 +36,7 @@ static MCAsmInfo *createVEMCAsmInfo(const MCRegisterInfo &MRI, const Triple &TT,
const MCTargetOptions &Options) {
MCAsmInfo *MAI = new VEELFMCAsmInfo(TT);
unsigned Reg = MRI.getDwarfRegNum(VE::SX11, true);
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, Reg, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, Reg, 0);
MAI->addInitialFrameState(Inst);
return MAI;
}
@@ -93,6 +94,12 @@ extern "C" void LLVMInitializeVETargetMC() {
// Register the MC subtarget info.
TargetRegistry::RegisterMCSubtargetInfo(*T, createVEMCSubtargetInfo);
+ // Register the MC Code Emitter.
+ TargetRegistry::RegisterMCCodeEmitter(*T, createVEMCCodeEmitter);
+
+ // Register the asm backend.
+ TargetRegistry::RegisterMCAsmBackend(*T, createVEAsmBackend);
+
// Register the object target streamer.
TargetRegistry::RegisterObjectTargetStreamer(*T,
createObjectTargetStreamer);
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
index 24a5c8209be2..7fb8a556aa74 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
+++ b/llvm/lib/Target/VE/MCTargetDesc/VEMCTargetDesc.h
@@ -22,7 +22,7 @@ class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
class MCInstrInfo;
-class MCObjectWriter;
+class MCObjectTargetWriter;
class MCRegisterInfo;
class MCSubtargetInfo;
class MCTargetOptions;
@@ -32,8 +32,12 @@ class StringRef;
class raw_pwrite_stream;
class raw_ostream;
-Target &getTheVETarget();
-
+MCCodeEmitter *createVEMCCodeEmitter(const MCInstrInfo &MCII,
+ const MCRegisterInfo &MRI, MCContext &Ctx);
+MCAsmBackend *createVEAsmBackend(const Target &T, const MCSubtargetInfo &STI,
+ const MCRegisterInfo &MRI,
+ const MCTargetOptions &Options);
+std::unique_ptr<MCObjectTargetWriter> createVEELFObjectWriter(uint8_t OSABI);
} // namespace llvm
// Defines symbolic names for VE registers. This defines a mapping from
diff --git a/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp b/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp
index dfe94bbaaa4b..29f5afb67ac1 100644
--- a/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp
+++ b/llvm/lib/Target/VE/MCTargetDesc/VETargetStreamer.cpp
@@ -11,7 +11,7 @@
//===----------------------------------------------------------------------===//
#include "VETargetStreamer.h"
-#include "InstPrinter/VEInstPrinter.h"
+#include "VEInstPrinter.h"
#include "llvm/Support/FormattedStream.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
index be68fe7d2429..65bd142fe0db 100644
--- a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
+++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.cpp
@@ -6,8 +6,7 @@
//
//===----------------------------------------------------------------------===//
-#include "VE.h"
-#include "llvm/IR/Module.h"
+#include "TargetInfo/VETargetInfo.h"
#include "llvm/Support/TargetRegistry.h"
using namespace llvm;
diff --git a/llvm/lib/Target/VE/TargetInfo/VETargetInfo.h b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.h
new file mode 100644
index 000000000000..7879e6f069a1
--- /dev/null
+++ b/llvm/lib/Target/VE/TargetInfo/VETargetInfo.h
@@ -0,0 +1,20 @@
+//===-- VETargetInfo.h - VE Target Implementation ---------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_VE_TARGETINFO_VETARGETINFO_H
+#define LLVM_LIB_TARGET_VE_TARGETINFO_VETARGETINFO_H
+
+namespace llvm {
+
+class Target;
+
+Target &getTheVETarget();
+
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_VE_TARGETINFO_VETARGETINFO_H
diff --git a/llvm/lib/Target/VE/VE.h b/llvm/lib/Target/VE/VE.h
index 9b61f2b63f36..7ed7797cbb83 100644
--- a/llvm/lib/Target/VE/VE.h
+++ b/llvm/lib/Target/VE/VE.h
@@ -15,6 +15,7 @@
#define LLVM_LIB_TARGET_VE_VE_H
#include "MCTargetDesc/VEMCTargetDesc.h"
+#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Target/TargetMachine.h"
@@ -37,36 +38,50 @@ namespace llvm {
// Enums corresponding to VE condition codes, both icc's and fcc's. These
// values must be kept in sync with the ones in the .td file.
namespace VECC {
-enum CondCodes {
+enum CondCode {
// Integer comparison
- CC_IG = 0, // Greater
- CC_IL = 1, // Less
+ CC_IG = 0, // Greater
+ CC_IL = 1, // Less
CC_INE = 2, // Not Equal
CC_IEQ = 3, // Equal
CC_IGE = 4, // Greater or Equal
CC_ILE = 5, // Less or Equal
// Floating point comparison
- CC_AF = 0 + 6, // Never
- CC_G = 1 + 6, // Greater
- CC_L = 2 + 6, // Less
- CC_NE = 3 + 6, // Not Equal
- CC_EQ = 4 + 6, // Equal
- CC_GE = 5 + 6, // Greater or Equal
- CC_LE = 6 + 6, // Less or Equal
- CC_NUM = 7 + 6, // Number
- CC_NAN = 8 + 6, // NaN
- CC_GNAN = 9 + 6, // Greater or NaN
- CC_LNAN = 10 + 6, // Less or NaN
+ CC_AF = 0 + 6, // Never
+ CC_G = 1 + 6, // Greater
+ CC_L = 2 + 6, // Less
+ CC_NE = 3 + 6, // Not Equal
+ CC_EQ = 4 + 6, // Equal
+ CC_GE = 5 + 6, // Greater or Equal
+ CC_LE = 6 + 6, // Less or Equal
+ CC_NUM = 7 + 6, // Number
+ CC_NAN = 8 + 6, // NaN
+ CC_GNAN = 9 + 6, // Greater or NaN
+ CC_LNAN = 10 + 6, // Less or NaN
CC_NENAN = 11 + 6, // Not Equal or NaN
CC_EQNAN = 12 + 6, // Equal or NaN
CC_GENAN = 13 + 6, // Greater or Equal or NaN
CC_LENAN = 14 + 6, // Less or Equal or NaN
- CC_AT = 15 + 6, // Always
+ CC_AT = 15 + 6, // Always
+ UNKNOWN
+};
+}
+// Enums corresponding to VE Rounding Mode. These values must be kept in
+// sync with the ones in the .td file.
+namespace VERD {
+enum RoundingMode {
+ RD_NONE = 0, // According to PSW
+ RD_RZ = 8, // Round toward Zero
+ RD_RP = 9, // Round toward Plus infinity
+ RD_RM = 10, // Round toward Minus infinity
+ RD_RN = 11, // Round to Nearest (ties to Even)
+ RD_RA = 12, // Round to Nearest (ties to Away)
+ UNKNOWN
};
}
-inline static const char *VECondCodeToString(VECC::CondCodes CC) {
+inline static const char *VECondCodeToString(VECC::CondCode CC) {
switch (CC) {
case VECC::CC_IG: return "gt";
case VECC::CC_IL: return "lt";
@@ -90,20 +105,252 @@ inline static const char *VECondCodeToString(VECC::CondCodes CC) {
case VECC::CC_GENAN: return "genan";
case VECC::CC_LENAN: return "lenan";
case VECC::CC_AT: return "at";
+ default:
+ llvm_unreachable("Invalid cond code");
+ }
+}
+
+inline static VECC::CondCode stringToVEICondCode(StringRef S) {
+ return StringSwitch<VECC::CondCode>(S)
+ .Case("gt", VECC::CC_IG)
+ .Case("lt", VECC::CC_IL)
+ .Case("ne", VECC::CC_INE)
+ .Case("eq", VECC::CC_IEQ)
+ .Case("ge", VECC::CC_IGE)
+ .Case("le", VECC::CC_ILE)
+ .Case("af", VECC::CC_AF)
+ .Case("at", VECC::CC_AT)
+ .Case("", VECC::CC_AT)
+ .Default(VECC::UNKNOWN);
+}
+
+inline static VECC::CondCode stringToVEFCondCode(StringRef S) {
+ return StringSwitch<VECC::CondCode>(S)
+ .Case("gt", VECC::CC_G)
+ .Case("lt", VECC::CC_L)
+ .Case("ne", VECC::CC_NE)
+ .Case("eq", VECC::CC_EQ)
+ .Case("ge", VECC::CC_GE)
+ .Case("le", VECC::CC_LE)
+ .Case("num", VECC::CC_NUM)
+ .Case("nan", VECC::CC_NAN)
+ .Case("gtnan", VECC::CC_GNAN)
+ .Case("ltnan", VECC::CC_LNAN)
+ .Case("nenan", VECC::CC_NENAN)
+ .Case("eqnan", VECC::CC_EQNAN)
+ .Case("genan", VECC::CC_GENAN)
+ .Case("lenan", VECC::CC_LENAN)
+ .Case("af", VECC::CC_AF)
+ .Case("at", VECC::CC_AT)
+ .Case("", VECC::CC_AT)
+ .Default(VECC::UNKNOWN);
+}
+
+inline static unsigned VECondCodeToVal(VECC::CondCode CC) {
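+  // Integer and floating-point condition codes share the same 4-bit hardware
+  // encoding; e.g. CC_IG and CC_G both encode to 1.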
+ switch (CC) {
+ case VECC::CC_IG:
+ return 1;
+ case VECC::CC_IL:
+ return 2;
+ case VECC::CC_INE:
+ return 3;
+ case VECC::CC_IEQ:
+ return 4;
+ case VECC::CC_IGE:
+ return 5;
+ case VECC::CC_ILE:
+ return 6;
+ case VECC::CC_AF:
+ return 0;
+ case VECC::CC_G:
+ return 1;
+ case VECC::CC_L:
+ return 2;
+ case VECC::CC_NE:
+ return 3;
+ case VECC::CC_EQ:
+ return 4;
+ case VECC::CC_GE:
+ return 5;
+ case VECC::CC_LE:
+ return 6;
+ case VECC::CC_NUM:
+ return 7;
+ case VECC::CC_NAN:
+ return 8;
+ case VECC::CC_GNAN:
+ return 9;
+ case VECC::CC_LNAN:
+ return 10;
+ case VECC::CC_NENAN:
+ return 11;
+ case VECC::CC_EQNAN:
+ return 12;
+ case VECC::CC_GENAN:
+ return 13;
+ case VECC::CC_LENAN:
+ return 14;
+ case VECC::CC_AT:
+ return 15;
+ default:
+ llvm_unreachable("Invalid cond code");
+ }
+}
+
+inline static VECC::CondCode VEValToCondCode(unsigned Val, bool IsInteger) {
+ if (IsInteger) {
+ switch (Val) {
+ case 0:
+ return VECC::CC_AF;
+ case 1:
+ return VECC::CC_IG;
+ case 2:
+ return VECC::CC_IL;
+ case 3:
+ return VECC::CC_INE;
+ case 4:
+ return VECC::CC_IEQ;
+ case 5:
+ return VECC::CC_IGE;
+ case 6:
+ return VECC::CC_ILE;
+ case 15:
+ return VECC::CC_AT;
+ }
+ } else {
+ switch (Val) {
+ case 0:
+ return VECC::CC_AF;
+ case 1:
+ return VECC::CC_G;
+ case 2:
+ return VECC::CC_L;
+ case 3:
+ return VECC::CC_NE;
+ case 4:
+ return VECC::CC_EQ;
+ case 5:
+ return VECC::CC_GE;
+ case 6:
+ return VECC::CC_LE;
+ case 7:
+ return VECC::CC_NUM;
+ case 8:
+ return VECC::CC_NAN;
+ case 9:
+ return VECC::CC_GNAN;
+ case 10:
+ return VECC::CC_LNAN;
+ case 11:
+ return VECC::CC_NENAN;
+ case 12:
+ return VECC::CC_EQNAN;
+ case 13:
+ return VECC::CC_GENAN;
+ case 14:
+ return VECC::CC_LENAN;
+ case 15:
+ return VECC::CC_AT;
+ }
}
llvm_unreachable("Invalid cond code");
}
-// Different to Hi_32/Lo_32 the HI32 and LO32 functions
-// preserve the correct numerical value
-// on the LLVM data type for MC immediates (int64_t).
-inline static int64_t HI32(int64_t imm) {
- return (int32_t)(imm >> 32);
+inline static const char *VERDToString(VERD::RoundingMode R) {
+ switch (R) {
+ case VERD::RD_NONE:
+ return "";
+ case VERD::RD_RZ:
+ return ".rz";
+ case VERD::RD_RP:
+ return ".rp";
+ case VERD::RD_RM:
+ return ".rm";
+ case VERD::RD_RN:
+ return ".rn";
+ case VERD::RD_RA:
+ return ".ra";
+ default:
+    llvm_unreachable("Invalid rounding mode");
+ }
}
-inline static int64_t LO32(int64_t imm) {
- return (int32_t)(imm);
+inline static VERD::RoundingMode stringToVERD(StringRef S) {
+ return StringSwitch<VERD::RoundingMode>(S)
+ .Case("", VERD::RD_NONE)
+ .Case(".rz", VERD::RD_RZ)
+ .Case(".rp", VERD::RD_RP)
+ .Case(".rm", VERD::RD_RM)
+ .Case(".rn", VERD::RD_RN)
+ .Case(".ra", VERD::RD_RA)
+ .Default(VERD::UNKNOWN);
}
+inline static unsigned VERDToVal(VERD::RoundingMode R) {
+ switch (R) {
+ case VERD::RD_NONE:
+ case VERD::RD_RZ:
+ case VERD::RD_RP:
+ case VERD::RD_RM:
+ case VERD::RD_RN:
+ case VERD::RD_RA:
+ return static_cast<unsigned>(R);
+ default:
+ break;
+ }
+  llvm_unreachable("Invalid rounding mode");
+}
+
+inline static VERD::RoundingMode VEValToRD(unsigned Val) {
+ switch (Val) {
+ case static_cast<unsigned>(VERD::RD_NONE):
+ return VERD::RD_NONE;
+ case static_cast<unsigned>(VERD::RD_RZ):
+ return VERD::RD_RZ;
+ case static_cast<unsigned>(VERD::RD_RP):
+ return VERD::RD_RP;
+ case static_cast<unsigned>(VERD::RD_RM):
+ return VERD::RD_RM;
+ case static_cast<unsigned>(VERD::RD_RN):
+ return VERD::RD_RN;
+ case static_cast<unsigned>(VERD::RD_RA):
+ return VERD::RD_RA;
+ default:
+ break;
+ }
+  llvm_unreachable("Invalid rounding mode");
+}
+
+// MImm - Special immediate value: a contiguous run of 1s that either starts
+// at the MSB ((m)1 form) or ends at the LSB ((m)0 form).
+// See VEInstrInfo.td for details.
+inline static bool isMImmVal(uint64_t Val) {
+ if (Val == 0) {
+ // (0)1 is 0
+ return true;
+ }
+ if (isMask_64(Val)) {
+ // (m)0 patterns
+ return true;
+ }
+ // (m)1 patterns
+  return (Val & (1ULL << 63)) && isShiftedMask_64(Val);
+}
+
+inline static bool isMImm32Val(uint32_t Val) {
+ if (Val == 0) {
+ // (0)1 is 0
+ return true;
+ }
+ if (isMask_32(Val)) {
+ // (m)0 patterns
+ return true;
+ }
+ // (m)1 patterns
+  return (Val & (1U << 31)) && isShiftedMask_32(Val);
+}
+
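+// Helpers for building the 7-bit MImm encoding: M0(m) encodes the (m)0 form
+// and M1(m) encodes the (m)1 form.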
+inline unsigned M0(unsigned Val) { return Val + 64; }
+inline unsigned M1(unsigned Val) { return Val; }
+
} // namespace llvm
#endif
diff --git a/llvm/lib/Target/VE/VE.td b/llvm/lib/Target/VE/VE.td
index 7404321b1a06..617a6ea458b6 100644
--- a/llvm/lib/Target/VE/VE.td
+++ b/llvm/lib/Target/VE/VE.td
@@ -29,6 +29,13 @@ include "VEInstrInfo.td"
def VEInstrInfo : InstrInfo;
+def VEAsmParser : AsmParser {
+ // Use both VE register name matcher to accept "S0~S63" register names
+ // and default register matcher to accept other registeres.
+ let AllowDuplicateRegisterNames = 1;
+ let ShouldEmitMatchRegisterAltName = 1;
+}
+
//===----------------------------------------------------------------------===//
// VE processors supported.
//===----------------------------------------------------------------------===//
@@ -51,6 +58,7 @@ def VEAsmWriter : AsmWriter {
def VE : Target {
// Pull in Instruction Info:
let InstructionSet = VEInstrInfo;
+ let AssemblyParsers = [VEAsmParser];
let AssemblyWriters = [VEAsmWriter];
let AllowRegisterRenaming = 1;
}
diff --git a/llvm/lib/Target/VE/VEAsmPrinter.cpp b/llvm/lib/Target/VE/VEAsmPrinter.cpp
index 918f2a1acdaf..86e3aa3d3fa1 100644
--- a/llvm/lib/Target/VE/VEAsmPrinter.cpp
+++ b/llvm/lib/Target/VE/VEAsmPrinter.cpp
@@ -11,8 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "InstPrinter/VEInstPrinter.h"
+#include "MCTargetDesc/VEInstPrinter.h"
+#include "MCTargetDesc/VEMCExpr.h"
#include "MCTargetDesc/VETargetStreamer.h"
+#include "TargetInfo/VETargetInfo.h"
#include "VE.h"
#include "VEInstrInfo.h"
#include "VETargetMachine.h"
@@ -46,7 +48,14 @@ public:
StringRef getPassName() const override { return "VE Assembly Printer"; }
- void EmitInstruction(const MachineInstr *MI) override;
+ void lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI);
+ void lowerGETFunPLTAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI);
+ void lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI);
+
+ void emitInstruction(const MachineInstr *MI) override;
static const char *getRegisterName(unsigned RegNo) {
return VEInstPrinter::getRegisterName(RegNo);
@@ -54,7 +63,265 @@ public:
};
} // end of anonymous namespace
-void VEAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+static MCOperand createVEMCOperand(VEMCExpr::VariantKind Kind, MCSymbol *Sym,
+ MCContext &OutContext) {
+ const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Sym, OutContext);
+ const VEMCExpr *expr = VEMCExpr::create(Kind, MCSym, OutContext);
+ return MCOperand::createExpr(expr);
+}
+
+static MCOperand createGOTRelExprOp(VEMCExpr::VariantKind Kind,
+ MCSymbol *GOTLabel, MCContext &OutContext) {
+ const MCSymbolRefExpr *GOT = MCSymbolRefExpr::create(GOTLabel, OutContext);
+ const VEMCExpr *expr = VEMCExpr::create(Kind, GOT, OutContext);
+ return MCOperand::createExpr(expr);
+}
+
+static void emitSIC(MCStreamer &OutStreamer, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ MCInst SICInst;
+ SICInst.setOpcode(VE::SIC);
+ SICInst.addOperand(RD);
+ OutStreamer.emitInstruction(SICInst, STI);
+}
+
+static void emitBSIC(MCStreamer &OutStreamer, MCOperand &R1, MCOperand &R2,
+ const MCSubtargetInfo &STI) {
+ MCInst BSICInst;
+ BSICInst.setOpcode(VE::BSICrii);
+ BSICInst.addOperand(R1);
+ BSICInst.addOperand(R2);
+ MCOperand czero = MCOperand::createImm(0);
+ BSICInst.addOperand(czero);
+ BSICInst.addOperand(czero);
+ OutStreamer.emitInstruction(BSICInst, STI);
+}
+
+static void emitLEAzzi(MCStreamer &OutStreamer, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ MCInst LEAInst;
+ LEAInst.setOpcode(VE::LEAzii);
+ LEAInst.addOperand(RD);
+ MCOperand CZero = MCOperand::createImm(0);
+ LEAInst.addOperand(CZero);
+ LEAInst.addOperand(CZero);
+ LEAInst.addOperand(Imm);
+ OutStreamer.emitInstruction(LEAInst, STI);
+}
+
+static void emitLEASLzzi(MCStreamer &OutStreamer, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ MCInst LEASLInst;
+ LEASLInst.setOpcode(VE::LEASLzii);
+ LEASLInst.addOperand(RD);
+ MCOperand CZero = MCOperand::createImm(0);
+ LEASLInst.addOperand(CZero);
+ LEASLInst.addOperand(CZero);
+ LEASLInst.addOperand(Imm);
+ OutStreamer.emitInstruction(LEASLInst, STI);
+}
+
+static void emitLEAzii(MCStreamer &OutStreamer, MCOperand &RS1, MCOperand &Imm,
+ MCOperand &RD, const MCSubtargetInfo &STI) {
+ MCInst LEAInst;
+ LEAInst.setOpcode(VE::LEAzii);
+ LEAInst.addOperand(RD);
+ MCOperand CZero = MCOperand::createImm(0);
+ LEAInst.addOperand(CZero);
+ LEAInst.addOperand(RS1);
+ LEAInst.addOperand(Imm);
+ OutStreamer.emitInstruction(LEAInst, STI);
+}
+
+static void emitLEASLrri(MCStreamer &OutStreamer, MCOperand &RS1,
+ MCOperand &RS2, MCOperand &Imm, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ MCInst LEASLInst;
+ LEASLInst.setOpcode(VE::LEASLrri);
+ LEASLInst.addOperand(RD);
+ LEASLInst.addOperand(RS1);
+ LEASLInst.addOperand(RS2);
+ LEASLInst.addOperand(Imm);
+ OutStreamer.emitInstruction(LEASLInst, STI);
+}
+
+static void emitBinary(MCStreamer &OutStreamer, unsigned Opcode, MCOperand &RS1,
+ MCOperand &Src2, MCOperand &RD,
+ const MCSubtargetInfo &STI) {
+ MCInst Inst;
+ Inst.setOpcode(Opcode);
+ Inst.addOperand(RD);
+ Inst.addOperand(RS1);
+ Inst.addOperand(Src2);
+ OutStreamer.emitInstruction(Inst, STI);
+}
+
+static void emitANDrm(MCStreamer &OutStreamer, MCOperand &RS1, MCOperand &Imm,
+ MCOperand &RD, const MCSubtargetInfo &STI) {
+ emitBinary(OutStreamer, VE::ANDrm, RS1, Imm, RD, STI);
+}
+
+static void emitHiLo(MCStreamer &OutStreamer, MCSymbol *GOTSym,
+ VEMCExpr::VariantKind HiKind, VEMCExpr::VariantKind LoKind,
+ MCOperand &RD, MCContext &OutContext,
+ const MCSubtargetInfo &STI) {
+
+ MCOperand hi = createVEMCOperand(HiKind, GOTSym, OutContext);
+ MCOperand lo = createVEMCOperand(LoKind, GOTSym, OutContext);
+ emitLEAzzi(OutStreamer, lo, RD, STI);
+ MCOperand M032 = MCOperand::createImm(M0(32));
+ emitANDrm(OutStreamer, RD, M032, RD, STI);
+ emitLEASLzzi(OutStreamer, hi, RD, STI);
+}
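A hedged arithmetic sketch of what the lea/and/lea.sl triple emitted above computes, assuming for illustration that the HI32/LO32 kinds resolve to the plain upper and lower 32 bits of the symbol address:

  #include <cstdint>
  // lea sign-extends the 32-bit lo part; the AND with (32)0 drops that sign
  // extension so lea.sl can add the hi part shifted left by 32 without a carry.
  uint64_t hiLoSketch(uint64_t Sym) {
    int64_t Rd = (int64_t)(int32_t)(Sym & 0xFFFFFFFFu);             // lea %rd, lo
    uint64_t Masked = (uint64_t)Rd & UINT64_C(0x00000000FFFFFFFF);  // and %rd, %rd, (32)0
    return Masked + ((Sym >> 32) << 32);                            // lea.sl %rd, hi(%rd)
  } // hiLoSketch(Sym) == Sym for every Sym.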
+
+void VEAsmPrinter::lowerGETGOTAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI) {
+ MCSymbol *GOTLabel =
+ OutContext.getOrCreateSymbol(Twine("_GLOBAL_OFFSET_TABLE_"));
+
+ const MachineOperand &MO = MI->getOperand(0);
+ MCOperand MCRegOP = MCOperand::createReg(MO.getReg());
+
+ if (!isPositionIndependent()) {
+ // Just load the address of GOT to MCRegOP.
+ switch (TM.getCodeModel()) {
+ default:
+ llvm_unreachable("Unsupported absolute code model");
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Large:
+ emitHiLo(*OutStreamer, GOTLabel, VEMCExpr::VK_VE_HI32,
+ VEMCExpr::VK_VE_LO32, MCRegOP, OutContext, STI);
+ break;
+ }
+ return;
+ }
+
+ MCOperand RegGOT = MCOperand::createReg(VE::SX15); // GOT
+ MCOperand RegPLT = MCOperand::createReg(VE::SX16); // PLT
+
+ // lea %got, _GLOBAL_OFFSET_TABLE_@PC_LO(-24)
+ // and %got, %got, (32)0
+ // sic %plt
+ // lea.sl %got, _GLOBAL_OFFSET_TABLE_@PC_HI(%got, %plt)
+ MCOperand cim24 = MCOperand::createImm(-24);
+ MCOperand loImm =
+ createGOTRelExprOp(VEMCExpr::VK_VE_PC_LO32, GOTLabel, OutContext);
+ emitLEAzii(*OutStreamer, cim24, loImm, MCRegOP, STI);
+ MCOperand M032 = MCOperand::createImm(M0(32));
+ emitANDrm(*OutStreamer, MCRegOP, M032, MCRegOP, STI);
+ emitSIC(*OutStreamer, RegPLT, STI);
+ MCOperand hiImm =
+ createGOTRelExprOp(VEMCExpr::VK_VE_PC_HI32, GOTLabel, OutContext);
+ emitLEASLrri(*OutStreamer, RegGOT, RegPLT, hiImm, MCRegOP, STI);
+}
+
+void VEAsmPrinter::lowerGETFunPLTAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI) {
+ const MachineOperand &MO = MI->getOperand(0);
+ MCOperand MCRegOP = MCOperand::createReg(MO.getReg());
+ const MachineOperand &Addr = MI->getOperand(1);
+ MCSymbol *AddrSym = nullptr;
+
+ switch (Addr.getType()) {
+ default:
+ llvm_unreachable("<unknown operand type>");
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ report_fatal_error("MBB is not supported yet");
+ return;
+ case MachineOperand::MO_ConstantPoolIndex:
+ report_fatal_error("ConstantPool is not supported yet");
+ return;
+ case MachineOperand::MO_ExternalSymbol:
+ AddrSym = GetExternalSymbolSymbol(Addr.getSymbolName());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ AddrSym = getSymbol(Addr.getGlobal());
+ break;
+ }
+
+ if (!isPositionIndependent()) {
+ llvm_unreachable("Unsupported use of %plt in non-PIC code");
+ return;
+ }
+
+ MCOperand RegPLT = MCOperand::createReg(VE::SX16); // PLT
+
+ // lea %dst, %plt_lo(func)(-24)
+ // and %dst, %dst, (32)0
+ // sic %plt ; FIXME: is it safe to use %plt here?
+ // lea.sl %dst, %plt_hi(func)(%dst, %plt)
+ MCOperand cim24 = MCOperand::createImm(-24);
+ MCOperand loImm =
+ createGOTRelExprOp(VEMCExpr::VK_VE_PLT_LO32, AddrSym, OutContext);
+ emitLEAzii(*OutStreamer, cim24, loImm, MCRegOP, STI);
+ MCOperand M032 = MCOperand::createImm(M0(32));
+ emitANDrm(*OutStreamer, MCRegOP, M032, MCRegOP, STI);
+ emitSIC(*OutStreamer, RegPLT, STI);
+ MCOperand hiImm =
+ createGOTRelExprOp(VEMCExpr::VK_VE_PLT_HI32, AddrSym, OutContext);
+ emitLEASLrri(*OutStreamer, MCRegOP, RegPLT, hiImm, MCRegOP, STI);
+}
+
+void VEAsmPrinter::lowerGETTLSAddrAndEmitMCInsts(const MachineInstr *MI,
+ const MCSubtargetInfo &STI) {
+ const MachineOperand &Addr = MI->getOperand(0);
+ MCSymbol *AddrSym = nullptr;
+
+ switch (Addr.getType()) {
+ default:
+ llvm_unreachable("<unknown operand type>");
+ return;
+ case MachineOperand::MO_MachineBasicBlock:
+ report_fatal_error("MBB is not supported yet");
+ return;
+ case MachineOperand::MO_ConstantPoolIndex:
+ report_fatal_error("ConstantPool is not supported yet");
+ return;
+ case MachineOperand::MO_ExternalSymbol:
+ AddrSym = GetExternalSymbolSymbol(Addr.getSymbolName());
+ break;
+ case MachineOperand::MO_GlobalAddress:
+ AddrSym = getSymbol(Addr.getGlobal());
+ break;
+ }
+
+ MCOperand RegLR = MCOperand::createReg(VE::SX10); // LR
+ MCOperand RegS0 = MCOperand::createReg(VE::SX0); // S0
+ MCOperand RegS12 = MCOperand::createReg(VE::SX12); // S12
+ MCSymbol *GetTLSLabel = OutContext.getOrCreateSymbol(Twine("__tls_get_addr"));
+
+ // lea %s0, sym@tls_gd_lo(-24)
+ // and %s0, %s0, (32)0
+ // sic %lr
+ // lea.sl %s0, sym@tls_gd_hi(%s0, %lr)
+ // lea %s12, __tls_get_addr@plt_lo(8)
+ // and %s12, %s12, (32)0
+ // lea.sl %s12, __tls_get_addr@plt_hi(%s12, %lr)
+ // bsic %lr, (, %s12)
+ MCOperand cim24 = MCOperand::createImm(-24);
+ MCOperand loImm =
+ createGOTRelExprOp(VEMCExpr::VK_VE_TLS_GD_LO32, AddrSym, OutContext);
+ emitLEAzii(*OutStreamer, cim24, loImm, RegS0, STI);
+ MCOperand M032 = MCOperand::createImm(M0(32));
+ emitANDrm(*OutStreamer, RegS0, M032, RegS0, STI);
+ emitSIC(*OutStreamer, RegLR, STI);
+ MCOperand hiImm =
+ createGOTRelExprOp(VEMCExpr::VK_VE_TLS_GD_HI32, AddrSym, OutContext);
+ emitLEASLrri(*OutStreamer, RegS0, RegLR, hiImm, RegS0, STI);
+ MCOperand ci8 = MCOperand::createImm(8);
+ MCOperand loImm2 =
+ createGOTRelExprOp(VEMCExpr::VK_VE_PLT_LO32, GetTLSLabel, OutContext);
+ emitLEAzii(*OutStreamer, ci8, loImm2, RegS12, STI);
+ emitANDrm(*OutStreamer, RegS12, M032, RegS12, STI);
+ MCOperand hiImm2 =
+ createGOTRelExprOp(VEMCExpr::VK_VE_PLT_HI32, GetTLSLabel, OutContext);
+ emitLEASLrri(*OutStreamer, RegS12, RegLR, hiImm2, RegS12, STI);
+ emitBSIC(*OutStreamer, RegLR, RegS12, STI);
+}
+
+void VEAsmPrinter::emitInstruction(const MachineInstr *MI) {
switch (MI->getOpcode()) {
default:
@@ -62,7 +329,17 @@ void VEAsmPrinter::EmitInstruction(const MachineInstr *MI) {
case TargetOpcode::DBG_VALUE:
// FIXME: Debug Value.
return;
+ case VE::GETGOT:
+ lowerGETGOTAndEmitMCInsts(MI, getSubtargetInfo());
+ return;
+ case VE::GETFUNPLT:
+ lowerGETFunPLTAndEmitMCInsts(MI, getSubtargetInfo());
+ return;
+ case VE::GETTLSADDR:
+ lowerGETTLSAddrAndEmitMCInsts(MI, getSubtargetInfo());
+ return;
}
+
MachineBasicBlock::const_instr_iterator I = MI->getIterator();
MachineBasicBlock::const_instr_iterator E = MI->getParent()->instr_end();
do {
diff --git a/llvm/lib/Target/VE/VECallingConv.td b/llvm/lib/Target/VE/VECallingConv.td
index 1a9097c79dd4..4f04dae884ab 100644
--- a/llvm/lib/Target/VE/VECallingConv.td
+++ b/llvm/lib/Target/VE/VECallingConv.td
@@ -13,7 +13,77 @@
//===----------------------------------------------------------------------===//
// Aurora VE
//===----------------------------------------------------------------------===//
+def CC_VE_C_Stack: CallingConv<[
+ // float --> need special handling like below.
+ // 0 4
+ // +------+------+
+ // | empty| float|
+ // +------+------+
+ CCIfType<[f32], CCCustom<"allocateFloat">>,
+
+ // All of the rest are assigned to the stack in 8-byte aligned units.
+ CCAssignToStack<0, 8>
+]>;
+
+def CC_VE : CallingConv<[
+ // All arguments get passed in generic registers if there is space.
+
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long --> generic integer 32 bit registers
+ CCIfType<[i32], CCAssignToRegWithShadow<
+ [SW0, SW1, SW2, SW3, SW4, SW5, SW6, SW7],
+ [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+
+ // float --> generic floating point 32 bit registers
+ CCIfType<[f32], CCAssignToRegWithShadow<
+ [SF0, SF1, SF2, SF3, SF4, SF5, SF6, SF7],
+ [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+
+ // long long/double --> generic 64 bit registers
+ CCIfType<[i64, f64],
+ CCAssignToReg<[SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+
+ // Alternatively, they are assigned to the stack in 8-byte aligned units.
+ CCDelegateTo<CC_VE_C_Stack>
+]>;
+
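A hedged example of how CC_VE above maps a simple prototype, assuming CCAssignToRegWithShadow keeps the three register sequences in lock-step (each assignment shadows the matching SX register); the authoritative mapping is whatever the generated VEGenCallingConv.inc produces.

  // Illustration only, not part of the patch.
  extern "C" long long f(int a, float b, double c, int d);
  // a -> %sw0 (shadows SX0), b -> %sf1 (shadows SX1), c -> %sx2, d -> %sw3;
  // arguments beyond the eight register slots fall through to CC_VE_C_Stack,
  // i.e. 8-byte aligned stack slots, and the i64 result returns in %sx0 (RetCC_VE).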
+// All arguments are passed on the stack for varargs functions or
+// non-prototyped functions.
+def CC_VE2 : CallingConv<[
+ // float --> need special handling like below.
+ // 0 4
+ // +------+------+
+ // | empty| float|
+ // +------+------+
+ CCIfType<[f32], CCCustom<"allocateFloat">>,
+
+ CCAssignToStack<0, 8>
+]>;
+
+def RetCC_VE : CallingConv<[
+ // Promote i1/i8/i16 arguments to i32.
+ CCIfType<[i1, i8, i16], CCPromoteToType<i32>>,
+
+ // bool, char, int, enum, long --> generic integer 32 bit registers
+ CCIfType<[i32], CCAssignToRegWithShadow<
+ [SW0, SW1, SW2, SW3, SW4, SW5, SW6, SW7],
+ [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+
+ // float --> generic floating point 32 bit registers
+ CCIfType<[f32], CCAssignToRegWithShadow<
+ [SF0, SF1, SF2, SF3, SF4, SF5, SF6, SF7],
+ [SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+
+ // long long/double --> generic 64 bit registers
+ CCIfType<[i64, f64],
+ CCAssignToReg<[SX0, SX1, SX2, SX3, SX4, SX5, SX6, SX7]>>,
+]>;
// Callee-saved registers
def CSR : CalleeSavedRegs<(add (sequence "SX%u", 18, 33))>;
def CSR_NoRegs : CalleeSavedRegs<(add)>;
+
+// PreserveAll (clobbers s62,s63) - used for ve_grow_stack
+def CSR_preserve_all : CalleeSavedRegs<(add (sequence "SX%u", 0, 61))>;
diff --git a/llvm/lib/Target/VE/VEFrameLowering.cpp b/llvm/lib/Target/VE/VEFrameLowering.cpp
index ef5b5f055911..8b10e6466123 100644
--- a/llvm/lib/Target/VE/VEFrameLowering.cpp
+++ b/llvm/lib/Target/VE/VEFrameLowering.cpp
@@ -12,6 +12,7 @@
#include "VEFrameLowering.h"
#include "VEInstrInfo.h"
+#include "VEMachineFunctionInfo.h"
#include "VESubtarget.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -29,12 +30,13 @@ using namespace llvm;
VEFrameLowering::VEFrameLowering(const VESubtarget &ST)
: TargetFrameLowering(TargetFrameLowering::StackGrowsDown, Align(16), 0,
- Align(16)) {}
+ Align(16)),
+ STI(ST) {}
void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- int NumBytes,
+ uint64_t NumBytes,
bool RequireFPUpdate) const {
DebugLoc dl;
@@ -46,24 +48,35 @@ void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
// st %lr, 8(,%sp)
// st %got, 24(,%sp)
// st %plt, 32(,%sp)
+ // st %s17, 40(,%sp) iff this function is using s17 as BP
// or %fp, 0, %sp
- BuildMI(MBB, MBBI, dl, TII.get(VE::STSri))
+ BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
.addReg(VE::SX11)
.addImm(0)
+ .addImm(0)
.addReg(VE::SX9);
- BuildMI(MBB, MBBI, dl, TII.get(VE::STSri))
+ BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
.addReg(VE::SX11)
+ .addImm(0)
.addImm(8)
.addReg(VE::SX10);
- BuildMI(MBB, MBBI, dl, TII.get(VE::STSri))
+ BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
.addReg(VE::SX11)
+ .addImm(0)
.addImm(24)
.addReg(VE::SX15);
- BuildMI(MBB, MBBI, dl, TII.get(VE::STSri))
+ BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
.addReg(VE::SX11)
+ .addImm(0)
.addImm(32)
.addReg(VE::SX16);
+ if (hasBP(MF))
+ BuildMI(MBB, MBBI, dl, TII.get(VE::STrii))
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(40)
+ .addReg(VE::SX17);
BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX9)
.addReg(VE::SX11)
.addImm(0);
@@ -72,7 +85,7 @@ void VEFrameLowering::emitPrologueInsns(MachineFunction &MF,
void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- int NumBytes,
+ uint64_t NumBytes,
bool RequireFPUpdate) const {
DebugLoc dl;
@@ -81,6 +94,7 @@ void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
// Insert following codes here as epilogue
//
// or %sp, 0, %fp
+ // ld %s17, 40(,%sp) iff this function is using s17 as BP
// ld %got, 32(,%sp)
// ld %plt, 24(,%sp)
// ld %lr, 8(,%sp)
@@ -89,30 +103,40 @@ void VEFrameLowering::emitEpilogueInsns(MachineFunction &MF,
BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX11)
.addReg(VE::SX9)
.addImm(0);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDSri), VE::SX16)
+ if (hasBP(MF))
+ BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX17)
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(40);
+ BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX16)
.addReg(VE::SX11)
+ .addImm(0)
.addImm(32);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDSri), VE::SX15)
+ BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX15)
.addReg(VE::SX11)
+ .addImm(0)
.addImm(24);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDSri), VE::SX10)
+ BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX10)
.addReg(VE::SX11)
+ .addImm(0)
.addImm(8);
- BuildMI(MBB, MBBI, dl, TII.get(VE::LDSri), VE::SX9)
+ BuildMI(MBB, MBBI, dl, TII.get(VE::LDrii), VE::SX9)
.addReg(VE::SX11)
+ .addImm(0)
.addImm(0);
}
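A hedged sketch of the fixed save slots the prologue and epilogue above store to and reload from, relative to the incoming %sp; the offsets are taken directly from the BuildMI calls, and offset 16 is simply not written by this patch.

  #include <cstdint>
  struct VEFrameHead {    // starts at %sp on entry
    uint64_t SavedFP;     // offset  0: caller %fp  (%s9)
    uint64_t SavedLR;     // offset  8: return addr (%s10)
    uint64_t Reserved16;  // offset 16: not written by this patch
    uint64_t SavedGOT;    // offset 24: %got (%s15)
    uint64_t SavedPLT;    // offset 32: %plt (%s16)
    uint64_t SavedBP;     // offset 40: %s17, only when used as base pointer
  };
  static_assert(sizeof(VEFrameHead) == 48, "six 8-byte save slots");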
void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- int NumBytes) const {
+ int64_t NumBytes,
+ MaybeAlign MaybeAlign) const {
DebugLoc dl;
const VEInstrInfo &TII =
*static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
if (NumBytes >= -64 && NumBytes < 63) {
- BuildMI(MBB, MBBI, dl, TII.get(VE::ADXri), VE::SX11)
+ BuildMI(MBB, MBBI, dl, TII.get(VE::ADDSLri), VE::SX11)
.addReg(VE::SX11)
.addImm(NumBytes);
return;
@@ -123,20 +147,28 @@ void VEFrameLowering::emitSPAdjustment(MachineFunction &MF,
// lea %s13,%lo(NumBytes)
// and %s13,%s13,(32)0
// lea.sl %sp,%hi(NumBytes)(%sp, %s13)
- BuildMI(MBB, MBBI, dl, TII.get(VE::LEAzzi), VE::SX13)
- .addImm(LO32(NumBytes));
- BuildMI(MBB, MBBI, dl, TII.get(VE::ANDrm0), VE::SX13)
+ BuildMI(MBB, MBBI, dl, TII.get(VE::LEAzii), VE::SX13)
+ .addImm(0)
+ .addImm(0)
+ .addImm(Lo_32(NumBytes));
+ BuildMI(MBB, MBBI, dl, TII.get(VE::ANDrm), VE::SX13)
.addReg(VE::SX13)
- .addImm(32);
+ .addImm(M0(32));
BuildMI(MBB, MBBI, dl, TII.get(VE::LEASLrri), VE::SX11)
.addReg(VE::SX11)
.addReg(VE::SX13)
- .addImm(HI32(NumBytes));
+ .addImm(Hi_32(NumBytes));
+
+ if (MaybeAlign) {
+ // and %sp, %sp, Align-1
+ BuildMI(MBB, MBBI, dl, TII.get(VE::ANDrm), VE::SX11)
+ .addReg(VE::SX11)
+ .addImm(M1(64 - Log2_64(MaybeAlign.valueOrOne().value())));
+ }
}
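A hedged arithmetic check of the realignment AND emitted above, assuming M1(64 - Log2_64(Align)) denotes that many leading one bits, as in VE.h:

  #include <cassert>
  #include <cstdint>
  void spAlignSketch() {
    uint64_t Alignment = 32;                   // example runtime alignment
    unsigned M = 64 - 5;                       // 64 - Log2_64(32) = 59 -> operand M1(59)
    uint64_t Mask = ~UINT64_C(0) << (64 - M);  // 59 leading ones: 0xFFFFFFFFFFFFFFE0
    assert(Mask == ~(Alignment - 1));          // AND-ing %sp rounds it down to 32 bytes
  }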
void VEFrameLowering::emitSPExtend(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- int NumBytes) const {
+ MachineBasicBlock::iterator MBBI) const {
DebugLoc dl;
const VEInstrInfo &TII =
*static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
@@ -175,11 +207,8 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
MachineBasicBlock &MBB) const {
assert(&MF.front() == &MBB && "Shrink-wrapping not yet supported");
MachineFrameInfo &MFI = MF.getFrameInfo();
- const VESubtarget &Subtarget = MF.getSubtarget<VESubtarget>();
- const VEInstrInfo &TII =
- *static_cast<const VEInstrInfo *>(Subtarget.getInstrInfo());
- const VERegisterInfo &RegInfo =
- *static_cast<const VERegisterInfo *>(Subtarget.getRegisterInfo());
+ const VEInstrInfo &TII = *STI.getInstrInfo();
+ const VERegisterInfo &RegInfo = *STI.getRegisterInfo();
MachineBasicBlock::iterator MBBI = MBB.begin();
// Debug location must be unknown since the first debug location is used
// to determine the end of the prologue.
@@ -191,39 +220,22 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
// rather than reporting an error, as would be sensible. This is
// poor, but fixing that bogosity is going to be a large project.
// For now, just see if it's lied, and report an error here.
- if (!NeedsStackRealignment && MFI.getMaxAlignment() > getStackAlignment())
+ if (!NeedsStackRealignment && MFI.getMaxAlign() > getStackAlign())
report_fatal_error("Function \"" + Twine(MF.getName()) +
"\" required "
"stack re-alignment, but LLVM couldn't handle it "
"(probably because it has a dynamic alloca).");
// Get the number of bytes to allocate from the FrameInfo
- int NumBytes = (int)MFI.getStackSize();
- // The VE ABI requires a reserved 176-byte area in the user's stack, starting
- // at %sp + 16. This is for the callee Register Save Area (RSA).
- //
- // We therefore need to add that offset to the total stack size
- // after all the stack objects are placed by
- // PrologEpilogInserter calculateFrameObjectOffsets. However, since the stack
- // needs to be aligned *after* the extra size is added, we need to disable
- // calculateFrameObjectOffsets's built-in stack alignment, by having
- // targetHandlesStackFrameRounding return true.
-
- // Add the extra call frame stack size, if needed. (This is the same
- // code as in PrologEpilogInserter, but also gets disabled by
- // targetHandlesStackFrameRounding)
- if (MFI.adjustsStack() && hasReservedCallFrame(MF))
- NumBytes += MFI.getMaxCallFrameSize();
-
- // Adds the VE subtarget-specific spill area to the stack
- // size. Also ensures target-required alignment.
- NumBytes = Subtarget.getAdjustedFrameSize(NumBytes);
+ uint64_t NumBytes = MFI.getStackSize();
+
+ // The VE ABI requires a reserved 176-byte area at the top of the stack,
+ // as described in VESubtarget.cpp, so adjust the frame size for it here.
+ NumBytes = STI.getAdjustedFrameSize(NumBytes);
// Finally, ensure that the size is sufficiently aligned for the
// data on the stack.
- if (MFI.getMaxAlignment() > 0) {
- NumBytes = alignTo(NumBytes, MFI.getMaxAlignment());
- }
+ NumBytes = alignTo(NumBytes, MFI.getMaxAlign());
// Update stack size with corrected value.
MFI.setStackSize(NumBytes);
@@ -232,16 +244,25 @@ void VEFrameLowering::emitPrologue(MachineFunction &MF,
emitPrologueInsns(MF, MBB, MBBI, NumBytes, true);
// Emit stack adjust instructions
- emitSPAdjustment(MF, MBB, MBBI, -NumBytes);
+ MaybeAlign RuntimeAlign =
+ NeedsStackRealignment ? MaybeAlign(MFI.getMaxAlign()) : None;
+ emitSPAdjustment(MF, MBB, MBBI, -(int64_t)NumBytes, RuntimeAlign);
+
+ if (hasBP(MF)) {
+ // Copy SP to BP.
+ BuildMI(MBB, MBBI, dl, TII.get(VE::ORri), VE::SX17)
+ .addReg(VE::SX11)
+ .addImm(0);
+ }
// Emit stack extend instructions
- emitSPExtend(MF, MBB, MBBI, -NumBytes);
+ emitSPExtend(MF, MBB, MBBI);
- unsigned regFP = RegInfo.getDwarfRegNum(VE::SX9, true);
+ Register RegFP = RegInfo.getDwarfRegNum(VE::SX9, true);
// Emit ".cfi_def_cfa_register 30".
unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, regFP));
+ MF.addFrameInst(MCCFIInstruction::createDefCfaRegister(nullptr, RegFP));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
@@ -256,7 +277,7 @@ MachineBasicBlock::iterator VEFrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
if (!hasReservedCallFrame(MF)) {
MachineInstr &MI = *I;
- int Size = MI.getOperand(0).getImm();
+ int64_t Size = MI.getOperand(0).getImm();
if (MI.getOpcode() == VE::ADJCALLSTACKDOWN)
Size = -Size;
@@ -272,20 +293,17 @@ void VEFrameLowering::emitEpilogue(MachineFunction &MF,
DebugLoc dl = MBBI->getDebugLoc();
MachineFrameInfo &MFI = MF.getFrameInfo();
- int NumBytes = (int)MFI.getStackSize();
+ uint64_t NumBytes = MFI.getStackSize();
// Emit Epilogue instructions to restore %lr
emitEpilogueInsns(MF, MBB, MBBI, NumBytes, true);
}
-bool VEFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
- // Reserve call frame if there are no variable sized objects on the stack.
- return !MF.getFrameInfo().hasVarSizedObjects();
-}
-
// hasFP - Return true if the specified function should have a dedicated frame
-// pointer register. This is true if the function has variable sized allocas or
-// if frame pointer elimination is disabled.
+// pointer register. This is true if the function has variable sized allocas
+// or if frame pointer elimination is disabled. For VE, we don't implement a
+// frame pointer eliminator yet, but we return false from this function so
+// that the generated code does not refer to %fp.
bool VEFrameLowering::hasFP(const MachineFunction &MF) const {
const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
@@ -295,13 +313,41 @@ bool VEFrameLowering::hasFP(const MachineFunction &MF) const {
MFI.isFrameAddressTaken();
}
+bool VEFrameLowering::hasBP(const MachineFunction &MF) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const TargetRegisterInfo *TRI = STI.getRegisterInfo();
+
+ return MFI.hasVarSizedObjects() && TRI->needsStackRealignment(MF);
+}
+
int VEFrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
- // Addressable stack objects are accessed using neg. offsets from
- // %fp, or positive offsets from %sp.
+ Register &FrameReg) const {
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const VERegisterInfo *RegInfo = STI.getRegisterInfo();
+ const VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ bool isFixed = MFI.isFixedObjectIndex(FI);
+
int64_t FrameOffset = MF.getFrameInfo().getObjectOffset(FI);
- FrameReg = VE::SX11; // %sp
- return FrameOffset + MF.getFrameInfo().getStackSize();
+
+ if (FuncInfo->isLeafProc()) {
+ // If this is a leaf procedure, all offsets need to be %sp-based,
+ // because we haven't caused %fp to actually point to our frame.
+ FrameReg = VE::SX11; // %sp
+ return FrameOffset + MF.getFrameInfo().getStackSize();
+ }
+ if (RegInfo->needsStackRealignment(MF) && !isFixed) {
+ // If there is dynamic stack realignment, all local object
+ // references need to be via %sp or %s17 (bp), to take account
+ // of the re-alignment.
+ if (hasBP(MF))
+ FrameReg = VE::SX17; // %bp
+ else
+ FrameReg = VE::SX11; // %sp
+ return FrameOffset + MF.getFrameInfo().getStackSize();
+ }
+ // Finally, default to using %fp.
+ FrameReg = RegInfo->getFrameRegister(MF);
+ return FrameOffset;
}
bool VEFrameLowering::isLeafProc(MachineFunction &MF) const {
@@ -321,5 +367,8 @@ void VEFrameLowering::determineCalleeSaves(MachineFunction &MF,
RegScavenger *RS) const {
TargetFrameLowering::determineCalleeSaves(MF, SavedRegs, RS);
- assert(isLeafProc(MF) && "TODO implement for non-leaf procs");
+ if (isLeafProc(MF)) {
+ VEMachineFunctionInfo *MFI = MF.getInfo<VEMachineFunctionInfo>();
+ MFI->setLeafProc(true);
+ }
}
diff --git a/llvm/lib/Target/VE/VEFrameLowering.h b/llvm/lib/Target/VE/VEFrameLowering.h
index 97e31d21aa43..b548d663c504 100644
--- a/llvm/lib/Target/VE/VEFrameLowering.h
+++ b/llvm/lib/Target/VE/VEFrameLowering.h
@@ -28,23 +28,28 @@ public:
void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override;
void emitPrologueInsns(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, int NumBytes,
+ MachineBasicBlock::iterator MBBI, uint64_t NumBytes,
bool RequireFPUpdate) const;
void emitEpilogueInsns(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, int NumBytes,
+ MachineBasicBlock::iterator MBBI, uint64_t NumBytes,
bool RequireFPUpdate) const;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator I) const override;
- bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ bool hasBP(const MachineFunction &MF) const;
bool hasFP(const MachineFunction &MF) const override;
+ // VE always reserves the argument space for call sites in the function
+ // immediately on entry to the current function.
+ bool hasReservedCallFrame(const MachineFunction &MF) const override {
+ return true;
+ }
void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs,
RegScavenger *RS = nullptr) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
const SpillSlot *
getCalleeSavedSpillSlots(unsigned &NumEntries) const override {
@@ -58,10 +63,8 @@ public:
return Offsets;
}
- /// targetHandlesStackFrameRounding - Returns true if the target is
- /// responsible for rounding up the stack frame (probably at emitPrologue
- /// time).
- bool targetHandlesStackFrameRounding() const override { return true; }
+protected:
+ const VESubtarget &STI;
private:
// Returns true if MF is a leaf procedure.
@@ -69,11 +72,12 @@ private:
// Emits code for adjusting SP in function prologue/epilogue.
void emitSPAdjustment(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, int NumBytes) const;
+ MachineBasicBlock::iterator MBBI, int64_t NumBytes,
+ MaybeAlign MayAlign = MaybeAlign()) const;
// Emits code for extending SP in function prologue/epilogue.
void emitSPExtend(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, int NumBytes) const;
+ MachineBasicBlock::iterator MBBI) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VEISelDAGToDAG.cpp b/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
index 43030993efb9..f3d067d55fdb 100644
--- a/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
+++ b/llvm/lib/Target/VE/VEISelDAGToDAG.cpp
@@ -23,6 +23,105 @@ using namespace llvm;
// Instruction Selector Implementation
//===----------------------------------------------------------------------===//
+/// Convert a DAG integer condition code to a VE ICC condition.
+inline static VECC::CondCode intCondCode2Icc(ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown integer condition code!");
+ case ISD::SETEQ:
+ return VECC::CC_IEQ;
+ case ISD::SETNE:
+ return VECC::CC_INE;
+ case ISD::SETLT:
+ return VECC::CC_IL;
+ case ISD::SETGT:
+ return VECC::CC_IG;
+ case ISD::SETLE:
+ return VECC::CC_ILE;
+ case ISD::SETGE:
+ return VECC::CC_IGE;
+ case ISD::SETULT:
+ return VECC::CC_IL;
+ case ISD::SETULE:
+ return VECC::CC_ILE;
+ case ISD::SETUGT:
+ return VECC::CC_IG;
+ case ISD::SETUGE:
+ return VECC::CC_IGE;
+ }
+}
+
+/// Convert a DAG floating point condition code to a VE FCC condition.
+inline static VECC::CondCode fpCondCode2Fcc(ISD::CondCode CC) {
+ switch (CC) {
+ default:
+ llvm_unreachable("Unknown fp condition code!");
+ case ISD::SETFALSE:
+ return VECC::CC_AF;
+ case ISD::SETEQ:
+ case ISD::SETOEQ:
+ return VECC::CC_EQ;
+ case ISD::SETNE:
+ case ISD::SETONE:
+ return VECC::CC_NE;
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ return VECC::CC_L;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ return VECC::CC_G;
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ return VECC::CC_LE;
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ return VECC::CC_GE;
+ case ISD::SETO:
+ return VECC::CC_NUM;
+ case ISD::SETUO:
+ return VECC::CC_NAN;
+ case ISD::SETUEQ:
+ return VECC::CC_EQNAN;
+ case ISD::SETUNE:
+ return VECC::CC_NENAN;
+ case ISD::SETULT:
+ return VECC::CC_LNAN;
+ case ISD::SETUGT:
+ return VECC::CC_GNAN;
+ case ISD::SETULE:
+ return VECC::CC_LENAN;
+ case ISD::SETUGE:
+ return VECC::CC_GENAN;
+ case ISD::SETTRUE:
+ return VECC::CC_AT;
+ }
+}
+
+/// getImmVal - get immediate representation of integer value
+inline static uint64_t getImmVal(const ConstantSDNode *N) {
+ return N->getSExtValue();
+}
+
+/// getFpImmVal - get immediate representation of floating point value
+inline static uint64_t getFpImmVal(const ConstantFPSDNode *N) {
+ const APInt &Imm = N->getValueAPF().bitcastToAPInt();
+ uint64_t Val = Imm.getZExtValue();
+ if (Imm.getBitWidth() == 32) {
+ // The immediate value of a float is placed in the higher bits on VE.
+ Val <<= 32;
+ }
+ return Val;
+}
+
+/// convMImmVal - Convert a mimm integer immediate value to target immediate.
+inline static uint64_t convMImmVal(uint64_t Val) {
+ if (Val == 0)
+ return 0; // (0)1
+ if (Val & (UINT64_C(1) << 63))
+ return countLeadingOnes(Val); // (m)1
+ return countLeadingZeros(Val) | 0x40; // (m)0
+}
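A hedged cross-check of convMImmVal against the M0/M1 helpers in VE.h: a "(m)0" value encodes as m | 0x40, matching M0(m), and a "(m)1" value encodes as m itself, matching M1(m). The sketch assumes convMImmVal above is in scope.

  #include <cassert>
  #include <cstdint>
  void convMImmSketch() {
    assert(convMImmVal(UINT64_C(0x00000000FFFFFFFF)) == 96); // (32)0 -> 32 | 0x40 == M0(32)
    assert(convMImmVal(UINT64_C(0xFFFFFFFF00000000)) == 32); // (32)1 -> 32        == M1(32)
    assert(convMImmVal(0) == 0);                             // (0)1  -> 0
  }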
+
//===--------------------------------------------------------------------===//
/// VEDAGToDAGISel - VE specific code to select VE machine
/// instructions for SelectionDAG operations.
@@ -43,15 +142,172 @@ public:
void Select(SDNode *N) override;
+ // Complex Pattern Selectors.
+ bool selectADDRrri(SDValue N, SDValue &Base, SDValue &Index, SDValue &Offset);
+ bool selectADDRrii(SDValue N, SDValue &Base, SDValue &Index, SDValue &Offset);
+ bool selectADDRzri(SDValue N, SDValue &Base, SDValue &Index, SDValue &Offset);
+ bool selectADDRzii(SDValue N, SDValue &Base, SDValue &Index, SDValue &Offset);
+ bool selectADDRri(SDValue N, SDValue &Base, SDValue &Offset);
+
StringRef getPassName() const override {
return "VE DAG->DAG Pattern Instruction Selection";
}
// Include the pieces autogenerated from the target description.
#include "VEGenDAGISel.inc"
+
+private:
+ SDNode *getGlobalBaseReg();
+
+ bool matchADDRrr(SDValue N, SDValue &Base, SDValue &Index);
+ bool matchADDRri(SDValue N, SDValue &Base, SDValue &Offset);
};
} // end anonymous namespace
+bool VEDAGToDAGISel::selectADDRrri(SDValue Addr, SDValue &Base, SDValue &Index,
+ SDValue &Offset) {
+ if (Addr.getOpcode() == ISD::FrameIndex)
+ return false;
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false; // direct calls.
+
+ SDValue LHS, RHS;
+ if (matchADDRri(Addr, LHS, RHS)) {
+ if (matchADDRrr(LHS, Base, Index)) {
+ Offset = RHS;
+ return true;
+ }
+ // Return false to try selectADDRrii.
+ return false;
+ }
+ if (matchADDRrr(Addr, LHS, RHS)) {
+ if (matchADDRri(RHS, Index, Offset)) {
+ Base = LHS;
+ return true;
+ }
+ if (matchADDRri(LHS, Base, Offset)) {
+ Index = RHS;
+ return true;
+ }
+ Base = LHS;
+ Index = RHS;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ return false; // Let the reg+imm(=0) pattern catch this!
+}
+
+bool VEDAGToDAGISel::selectADDRrii(SDValue Addr, SDValue &Base, SDValue &Index,
+ SDValue &Offset) {
+ if (matchADDRri(Addr, Base, Offset)) {
+ Index = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+
+ Base = Addr;
+ Index = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+}
+
+bool VEDAGToDAGISel::selectADDRzri(SDValue Addr, SDValue &Base, SDValue &Index,
+ SDValue &Offset) {
+ // Prefer ADDRrii.
+ return false;
+}
+
+bool VEDAGToDAGISel::selectADDRzii(SDValue Addr, SDValue &Base, SDValue &Index,
+ SDValue &Offset) {
+ if (isa<FrameIndexSDNode>(Addr)) {
+ return false;
+ }
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false; // direct calls.
+
+ if (ConstantSDNode *CN = dyn_cast<ConstantSDNode>(Addr)) {
+ if (isInt<32>(CN->getSExtValue())) {
+ Base = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ Index = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ Offset =
+ CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
+
+bool VEDAGToDAGISel::selectADDRri(SDValue Addr, SDValue &Base,
+ SDValue &Offset) {
+ if (matchADDRri(Addr, Base, Offset))
+ return true;
+
+ Base = Addr;
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+}
+
+bool VEDAGToDAGISel::matchADDRrr(SDValue Addr, SDValue &Base, SDValue &Index) {
+ if (isa<FrameIndexSDNode>(Addr))
+ return false;
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false; // direct calls.
+
+ if (Addr.getOpcode() == ISD::ADD) {
+ ; // Nothing to do here.
+ } else if (Addr.getOpcode() == ISD::OR) {
+ // We want to look through a transform in InstCombine and DAGCombiner that
+ // turns 'add' into 'or', so we can treat this 'or' exactly like an 'add'.
+ if (!CurDAG->haveNoCommonBitsSet(Addr.getOperand(0), Addr.getOperand(1)))
+ return false;
+ } else {
+ return false;
+ }
+
+ if (Addr.getOperand(0).getOpcode() == VEISD::Lo ||
+ Addr.getOperand(1).getOpcode() == VEISD::Lo)
+ return false; // Let the LEASL patterns catch this!
+
+ Base = Addr.getOperand(0);
+ Index = Addr.getOperand(1);
+ return true;
+}
+
+bool VEDAGToDAGISel::matchADDRri(SDValue Addr, SDValue &Base, SDValue &Offset) {
+ auto AddrTy = Addr->getValueType(0);
+ if (FrameIndexSDNode *FIN = dyn_cast<FrameIndexSDNode>(Addr)) {
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), AddrTy);
+ Offset = CurDAG->getTargetConstant(0, SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ if (Addr.getOpcode() == ISD::TargetExternalSymbol ||
+ Addr.getOpcode() == ISD::TargetGlobalAddress ||
+ Addr.getOpcode() == ISD::TargetGlobalTLSAddress)
+ return false; // direct calls.
+
+ if (CurDAG->isBaseWithConstantOffset(Addr)) {
+ ConstantSDNode *CN = cast<ConstantSDNode>(Addr.getOperand(1));
+ if (isInt<32>(CN->getSExtValue())) {
+ if (FrameIndexSDNode *FIN =
+ dyn_cast<FrameIndexSDNode>(Addr.getOperand(0))) {
+ // Constant offset from frame ref.
+ Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), AddrTy);
+ } else {
+ Base = Addr.getOperand(0);
+ }
+ Offset =
+ CurDAG->getTargetConstant(CN->getZExtValue(), SDLoc(Addr), MVT::i32);
+ return true;
+ }
+ }
+ return false;
+}
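A hedged summary of the address shapes the selectors above accept, derived from the matchers rather than from the .td patterns:

  // ADDRrri: (add (add base, index), simm32) -> {Base=base, Index=index, Offset=simm32}
  // ADDRrii: (add base, simm32)              -> {Base=base, Index=0,     Offset=simm32}
  //          plain base register             -> {Base=base, Index=0,     Offset=0}
  // ADDRzii: constant simm32 address         -> {Base=0,    Index=0,     Offset=simm32}
  // ADDRri : (add base, simm32) or base      -> {Base, Offset}, no index operand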
+
void VEDAGToDAGISel::Select(SDNode *N) {
SDLoc dl(N);
if (N->isMachineOpcode()) {
@@ -59,9 +315,22 @@ void VEDAGToDAGISel::Select(SDNode *N) {
return; // Already selected.
}
+ switch (N->getOpcode()) {
+ case VEISD::GLOBAL_BASE_REG:
+ ReplaceNode(N, getGlobalBaseReg());
+ return;
+ }
+
SelectCode(N);
}
+SDNode *VEDAGToDAGISel::getGlobalBaseReg() {
+ Register GlobalBaseReg = Subtarget->getInstrInfo()->getGlobalBaseReg(MF);
+ return CurDAG
+ ->getRegister(GlobalBaseReg, TLI->getPointerTy(CurDAG->getDataLayout()))
+ .getNode();
+}
+
/// createVEISelDag - This pass converts a legalized DAG into a
/// VE-specific DAG, ready for instruction scheduling.
///
diff --git a/llvm/lib/Target/VE/VEISelLowering.cpp b/llvm/lib/Target/VE/VEISelLowering.cpp
index aa6c3c08bd75..ab720545dd83 100644
--- a/llvm/lib/Target/VE/VEISelLowering.cpp
+++ b/llvm/lib/Target/VE/VEISelLowering.cpp
@@ -12,6 +12,8 @@
//===----------------------------------------------------------------------===//
#include "VEISelLowering.h"
+#include "MCTargetDesc/VEMCExpr.h"
+#include "VEMachineFunctionInfo.h"
#include "VERegisterInfo.h"
#include "VETargetMachine.h"
#include "llvm/ADT/StringSwitch.h"
@@ -36,14 +38,37 @@ using namespace llvm;
// Calling Convention Implementation
//===----------------------------------------------------------------------===//
+static bool allocateFloat(unsigned ValNo, MVT ValVT, MVT LocVT,
+ CCValAssign::LocInfo LocInfo,
+ ISD::ArgFlagsTy ArgFlags, CCState &State) {
+ switch (LocVT.SimpleTy) {
+ case MVT::f32: {
+ // Allocate stack like below
+ // 0 4
+ // +------+------+
+ // | empty| float|
+ // +------+------+
+ // Use align=8 for the dummy area to align the beginning of these two areas.
+ State.AllocateStack(4, Align(8)); // for empty area
+ // Use align=4 for the value so that it is placed just after the dummy area.
+ unsigned Offset = State.AllocateStack(4, Align(4)); // for float value area
+ State.addLoc(CCValAssign::getMem(ValNo, ValVT, Offset, LocVT, LocInfo));
+ return true;
+ }
+ default:
+ return false;
+ }
+}
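A hedged sketch of the 8-byte stack slot allocateFloat carves out for an f32 argument; the offset recorded in the CCValAssign points at the second half, matching the diagram above.

  #include <cstdint>
  struct F32StackSlot {
    uint32_t Padding; // bytes 0-3: the "empty" half of the 8-byte aligned slot
    float Value;      // bytes 4-7: the float itself (the offset passed to addLoc)
  };
  static_assert(sizeof(F32StackSlot) == 8, "one 8-byte slot per f32 stack argument");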
+
#include "VEGenCallingConv.inc"
bool VETargetLowering::CanLowerReturn(
CallingConv::ID CallConv, MachineFunction &MF, bool IsVarArg,
const SmallVectorImpl<ISD::OutputArg> &Outs, LLVMContext &Context) const {
- assert(!IsVarArg && "TODO implement var args");
- assert(Outs.empty() && "TODO implement return values");
- return true; // TODO support more than 'ret void'
+ CCAssignFn *RetCC = RetCC_VE;
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState CCInfo(CallConv, IsVarArg, MF, RVLocs, Context);
+ return CCInfo.CheckReturn(Outs, RetCC);
}
SDValue
@@ -52,12 +77,57 @@ VETargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals,
const SDLoc &DL, SelectionDAG &DAG) const {
- assert(!IsVarArg && "TODO implement var args");
- assert(Outs.empty() && "TODO implement return values");
- assert(OutVals.empty() && "TODO implement return values");
+ // CCValAssign - represent the assignment of the return value to locations.
+ SmallVector<CCValAssign, 16> RVLocs;
+
+ // CCState - Info about the registers and stack slot.
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+ // Analyze return values.
+ CCInfo.AnalyzeReturn(Outs, RetCC_VE);
+
+ SDValue Flag;
SmallVector<SDValue, 4> RetOps(1, Chain);
+
+ // Copy the result values into the output registers.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ assert(VA.isRegLoc() && "Can only return in registers!");
+ SDValue OutVal = OutVals[i];
+
+ // Integer return values must be sign or zero extended by the callee.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ OutVal = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), OutVal);
+ break;
+ case CCValAssign::ZExt:
+ OutVal = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), OutVal);
+ break;
+ case CCValAssign::AExt:
+ OutVal = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), OutVal);
+ break;
+ default:
+ llvm_unreachable("Unknown loc info!");
+ }
+
+ assert(!VA.needsCustom() && "Unexpected custom lowering");
+
+ Chain = DAG.getCopyToReg(Chain, DL, VA.getLocReg(), OutVal, Flag);
+
+ // Guarantee that all emitted copies are stuck together with flags.
+ Flag = Chain.getValue(1);
+ RetOps.push_back(DAG.getRegister(VA.getLocReg(), VA.getLocVT()));
+ }
+
RetOps[0] = Chain; // Update chain.
+
+ // Add the flag if we have it.
+ if (Flag.getNode())
+ RetOps.push_back(Flag);
+
return DAG.getNode(VEISD::RET_FLAG, DL, MVT::Other, RetOps);
}
@@ -65,8 +135,89 @@ SDValue VETargetLowering::LowerFormalArguments(
SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &DL,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
- assert(!IsVarArg && "TODO implement var args");
- assert(Ins.empty() && "TODO implement input arguments");
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Get the base offset of the incoming arguments stack space.
+ unsigned ArgsBaseOffset = 176;
+ // Get the size of the preserved arguments area
+ unsigned ArgsPreserved = 64;
+
+ // Analyze arguments according to CC_VE.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CallConv, IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ // Allocate the preserved area first.
+ CCInfo.AllocateStack(ArgsPreserved, Align(8));
+ // We already allocated the preserved area, so the stack offset computed
+ // by CC_VE would be correct now.
+ CCInfo.AnalyzeFormalArguments(Ins, CC_VE);
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ if (VA.isRegLoc()) {
+ // This argument is passed in a register.
+ // All integer register arguments are promoted by the caller to i64.
+
+ // Create a virtual register for the promoted live-in value.
+ unsigned VReg =
+ MF.addLiveIn(VA.getLocReg(), getRegClassFor(VA.getLocVT()));
+ SDValue Arg = DAG.getCopyFromReg(Chain, DL, VReg, VA.getLocVT());
+
+ // Get the high bits for i32 struct elements.
+ if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+ Arg = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), Arg,
+ DAG.getConstant(32, DL, MVT::i32));
+
+ // The caller promoted the argument, so insert an Assert?ext SDNode so we
+ // won't promote the value again in this function.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), Arg,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), Arg,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ default:
+ break;
+ }
+
+ // Truncate the register down to the argument type.
+ if (VA.isExtInLoc())
+ Arg = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), Arg);
+
+ InVals.push_back(Arg);
+ continue;
+ }
+
+ // The registers are exhausted. This argument was passed on the stack.
+ assert(VA.isMemLoc());
+ // CC_VE computes stack offsets relative to the
+ // beginning of the arguments area at %fp+176.
+ unsigned Offset = VA.getLocMemOffset() + ArgsBaseOffset;
+ unsigned ValSize = VA.getValVT().getSizeInBits() / 8;
+ int FI = MF.getFrameInfo().CreateFixedObject(ValSize, Offset, true);
+ InVals.push_back(
+ DAG.getLoad(VA.getValVT(), DL, Chain,
+ DAG.getFrameIndex(FI, getPointerTy(MF.getDataLayout())),
+ MachinePointerInfo::getFixedStack(MF, FI)));
+ }
+
+ if (!IsVarArg)
+ return Chain;
+
+ // This function takes variable arguments, some of which may have been passed
+ // in registers %s0-%s8.
+ //
+ // The va_start intrinsic needs to know the offset to the first variable
+ // argument.
+ // TODO: need to calculate offset correctly once we support f128.
+ unsigned ArgOffset = ArgLocs.size() * 8;
+ VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ // Skip the 176 bytes of register save area.
+ FuncInfo->setVarArgsFrameOffset(ArgOffset + ArgsBaseOffset);
+
return Chain;
}
@@ -78,7 +229,7 @@ Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
.Case("sp", VE::SX11) // Stack pointer
.Case("fp", VE::SX9) // Frame pointer
.Case("sl", VE::SX8) // Stack limit
- .Case("lr", VE::SX10) // Link regsiter
+ .Case("lr", VE::SX10) // Link register
.Case("tp", VE::SX14) // Thread pointer
.Case("outer", VE::SX12) // Outer regiser
.Case("info", VE::SX17) // Info area register
@@ -96,6 +247,314 @@ Register VETargetLowering::getRegisterByName(const char *RegName, LLT VT,
// TargetLowering Implementation
//===----------------------------------------------------------------------===//
+SDValue VETargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const {
+ SelectionDAG &DAG = CLI.DAG;
+ SDLoc DL = CLI.DL;
+ SDValue Chain = CLI.Chain;
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // VE target does not yet support tail call optimization.
+ CLI.IsTailCall = false;
+
+ // Get the base offset of the outgoing arguments stack space.
+ unsigned ArgsBaseOffset = 176;
+ // Get the size of the preserved arguments area
+ unsigned ArgsPreserved = 8 * 8u;
+
+ // Analyze operands of the call, assigning locations to each operand.
+ SmallVector<CCValAssign, 16> ArgLocs;
+ CCState CCInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), ArgLocs,
+ *DAG.getContext());
+ // Allocate the preserved area first.
+ CCInfo.AllocateStack(ArgsPreserved, Align(8));
+ // We already allocated the preserved area, so the stack offset computed
+ // by CC_VE would be correct now.
+ CCInfo.AnalyzeCallOperands(CLI.Outs, CC_VE);
+
+ // VE requires arguments to be passed in both registers and on the stack for
+ // varargs or non-prototyped functions.
+ bool UseBoth = CLI.IsVarArg;
+
+ // Analyze the operands again if the arguments need to be stored to BOTH
+ // registers and the stack.
+ SmallVector<CCValAssign, 16> ArgLocs2;
+ CCState CCInfo2(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(),
+ ArgLocs2, *DAG.getContext());
+ if (UseBoth)
+ CCInfo2.AnalyzeCallOperands(CLI.Outs, CC_VE2);
+
+ // Get the size of the outgoing arguments stack space requirement.
+ unsigned ArgsSize = CCInfo.getNextStackOffset();
+
+ // Keep stack frames 16-byte aligned.
+ ArgsSize = alignTo(ArgsSize, 16);
+
+ // Adjust the stack pointer to make room for the arguments.
+ // FIXME: Use hasReservedCallFrame to avoid %sp adjustments around all calls
+ // with more than 6 arguments.
+ Chain = DAG.getCALLSEQ_START(Chain, ArgsSize, 0, DL);
+
+ // Collect the set of registers to pass to the function and their values.
+ // This will be emitted as a sequence of CopyToReg nodes glued to the call
+ // instruction.
+ SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+
+ // Collect chains from all the memory operations that copy arguments to the
+ // stack. They must follow the stack pointer adjustment above and precede the
+ // call instruction itself.
+ SmallVector<SDValue, 8> MemOpChains;
+
+ // VE needs the address of the callee function in a register,
+ // so prepare to copy it to SX12 here.
+
+ // If the callee is a GlobalAddress node (quite common, every direct call is)
+ // turn it into a TargetGlobalAddress node so that legalize doesn't hack it.
+ // Likewise ExternalSymbol -> TargetExternalSymbol.
+ SDValue Callee = CLI.Callee;
+
+ bool IsPICCall = isPositionIndependent();
+
+ // PC-relative references to external symbols should go through $stub.
+ // If so, we need to prepare GlobalBaseReg first.
+ const TargetMachine &TM = DAG.getTarget();
+ const Module *Mod = DAG.getMachineFunction().getFunction().getParent();
+ const GlobalValue *GV = nullptr;
+ auto *CalleeG = dyn_cast<GlobalAddressSDNode>(Callee);
+ if (CalleeG)
+ GV = CalleeG->getGlobal();
+ bool Local = TM.shouldAssumeDSOLocal(*Mod, GV);
+ bool UsePlt = !Local;
+ MachineFunction &MF = DAG.getMachineFunction();
+
+ // Turn GlobalAddress/ExternalSymbol node into a value node
+ // containing the address of them here.
+ if (CalleeG) {
+ if (IsPICCall) {
+ if (UsePlt)
+ Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
+ Callee = DAG.getTargetGlobalAddress(GV, DL, PtrVT, 0, 0);
+ Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
+ } else {
+ Callee =
+ makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
+ }
+ } else if (ExternalSymbolSDNode *E = dyn_cast<ExternalSymbolSDNode>(Callee)) {
+ if (IsPICCall) {
+ if (UsePlt)
+ Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
+ Callee = DAG.getTargetExternalSymbol(E->getSymbol(), PtrVT, 0);
+ Callee = DAG.getNode(VEISD::GETFUNPLT, DL, PtrVT, Callee);
+ } else {
+ Callee =
+ makeHiLoPair(Callee, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
+ }
+ }
+
+ RegsToPass.push_back(std::make_pair(VE::SX12, Callee));
+
+ for (unsigned i = 0, e = ArgLocs.size(); i != e; ++i) {
+ CCValAssign &VA = ArgLocs[i];
+ SDValue Arg = CLI.OutVals[i];
+
+ // Promote the value if needed.
+ switch (VA.getLocInfo()) {
+ default:
+ llvm_unreachable("Unknown location info!");
+ case CCValAssign::Full:
+ break;
+ case CCValAssign::SExt:
+ Arg = DAG.getNode(ISD::SIGN_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::ZExt:
+ Arg = DAG.getNode(ISD::ZERO_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ case CCValAssign::AExt:
+ Arg = DAG.getNode(ISD::ANY_EXTEND, DL, VA.getLocVT(), Arg);
+ break;
+ }
+
+ if (VA.isRegLoc()) {
+ RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
+ if (!UseBoth)
+ continue;
+ VA = ArgLocs2[i];
+ }
+
+ assert(VA.isMemLoc());
+
+ // Create a store off the stack pointer for this argument.
+ SDValue StackPtr = DAG.getRegister(VE::SX11, PtrVT);
+ // The argument area starts at %fp+176 in the callee frame,
+ // %sp+176 in ours.
+ SDValue PtrOff =
+ DAG.getIntPtrConstant(VA.getLocMemOffset() + ArgsBaseOffset, DL);
+ PtrOff = DAG.getNode(ISD::ADD, DL, PtrVT, StackPtr, PtrOff);
+ MemOpChains.push_back(
+ DAG.getStore(Chain, DL, Arg, PtrOff, MachinePointerInfo()));
+ }
+
+ // Emit all stores, make sure they occur before the call.
+ if (!MemOpChains.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOpChains);
+
+ // Build a sequence of CopyToReg nodes glued together with token chain and
+ // glue operands which copy the outgoing args into registers. The InGlue is
+ // necessary since all emitted instructions must be stuck together in order
+ // to pass the live physical registers.
+ SDValue InGlue;
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i) {
+ Chain = DAG.getCopyToReg(Chain, DL, RegsToPass[i].first,
+ RegsToPass[i].second, InGlue);
+ InGlue = Chain.getValue(1);
+ }
+
+ // Build the operands for the call instruction itself.
+ SmallVector<SDValue, 8> Ops;
+ Ops.push_back(Chain);
+ for (unsigned i = 0, e = RegsToPass.size(); i != e; ++i)
+ Ops.push_back(DAG.getRegister(RegsToPass[i].first,
+ RegsToPass[i].second.getValueType()));
+
+ // Add a register mask operand representing the call-preserved registers.
+ const VERegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const uint32_t *Mask =
+ TRI->getCallPreservedMask(DAG.getMachineFunction(), CLI.CallConv);
+ assert(Mask && "Missing call preserved mask for calling convention");
+ Ops.push_back(DAG.getRegisterMask(Mask));
+
+ // Make sure the CopyToReg nodes are glued to the call instruction which
+ // consumes the registers.
+ if (InGlue.getNode())
+ Ops.push_back(InGlue);
+
+ // Now the call itself.
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ Chain = DAG.getNode(VEISD::CALL, DL, NodeTys, Ops);
+ InGlue = Chain.getValue(1);
+
+ // Revert the stack pointer immediately after the call.
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(ArgsSize, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), InGlue, DL);
+ InGlue = Chain.getValue(1);
+
+ // Now extract the return values. This is more or less the same as
+ // LowerFormalArguments.
+
+ // Assign locations to each value returned by this call.
+ SmallVector<CCValAssign, 16> RVLocs;
+ CCState RVInfo(CLI.CallConv, CLI.IsVarArg, DAG.getMachineFunction(), RVLocs,
+ *DAG.getContext());
+
+ // Set inreg flag manually for codegen generated library calls that
+ // return float.
+ if (CLI.Ins.size() == 1 && CLI.Ins[0].VT == MVT::f32 && !CLI.CB)
+ CLI.Ins[0].Flags.setInReg();
+
+ RVInfo.AnalyzeCallResult(CLI.Ins, RetCC_VE);
+
+ // Copy all of the result registers out of their specified physreg.
+ for (unsigned i = 0; i != RVLocs.size(); ++i) {
+ CCValAssign &VA = RVLocs[i];
+ unsigned Reg = VA.getLocReg();
+
+ // When returning 'inreg {i32, i32 }', two consecutive i32 arguments can
+ // reside in the same register in the high and low bits. Reuse the
+ // CopyFromReg previous node to avoid duplicate copies.
+ SDValue RV;
+ if (RegisterSDNode *SrcReg = dyn_cast<RegisterSDNode>(Chain.getOperand(1)))
+ if (SrcReg->getReg() == Reg && Chain->getOpcode() == ISD::CopyFromReg)
+ RV = Chain.getValue(0);
+
+ // But usually we'll create a new CopyFromReg for a different register.
+ if (!RV.getNode()) {
+ RV = DAG.getCopyFromReg(Chain, DL, Reg, RVLocs[i].getLocVT(), InGlue);
+ Chain = RV.getValue(1);
+ InGlue = Chain.getValue(2);
+ }
+
+ // Get the high bits for i32 struct elements.
+ if (VA.getValVT() == MVT::i32 && VA.needsCustom())
+ RV = DAG.getNode(ISD::SRL, DL, VA.getLocVT(), RV,
+ DAG.getConstant(32, DL, MVT::i32));
+
+ // The callee promoted the return value, so insert an Assert?ext SDNode so
+ // we won't promote the value again in this function.
+ switch (VA.getLocInfo()) {
+ case CCValAssign::SExt:
+ RV = DAG.getNode(ISD::AssertSext, DL, VA.getLocVT(), RV,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ case CCValAssign::ZExt:
+ RV = DAG.getNode(ISD::AssertZext, DL, VA.getLocVT(), RV,
+ DAG.getValueType(VA.getValVT()));
+ break;
+ default:
+ break;
+ }
+
+ // Truncate the register down to the return value type.
+ if (VA.isExtInLoc())
+ RV = DAG.getNode(ISD::TRUNCATE, DL, VA.getValVT(), RV);
+
+ InVals.push_back(RV);
+ }
+
+ return Chain;
+}
+
+/// isFPImmLegal - Returns true if the target can instruction select the
+/// specified FP immediate natively. If false, the legalizer will
+/// materialize the FP immediate as a load from a constant pool.
+bool VETargetLowering::isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const {
+ return VT == MVT::f32 || VT == MVT::f64;
+}
+
+/// Determine if the target supports unaligned memory accesses.
+///
+/// This function returns true if the target allows unaligned memory accesses
+/// of the specified type in the given address space. If true, it also returns
+/// whether the unaligned memory access is "fast" in the last argument by
+/// reference. This is used, for example, in situations where an array
+/// copy/move/set is converted to a sequence of store operations. Its use
+/// helps to ensure that such replacements don't generate code that causes an
+/// alignment error (trap) on the target machine.
+bool VETargetLowering::allowsMisalignedMemoryAccesses(EVT VT,
+ unsigned AddrSpace,
+ unsigned Align,
+ MachineMemOperand::Flags,
+ bool *Fast) const {
+ if (Fast) {
+ // It's always fast on VE.
+ *Fast = true;
+ }
+ return true;
+}
+
+bool VETargetLowering::hasAndNot(SDValue Y) const {
+ EVT VT = Y.getValueType();
+
+ // VE doesn't have a vector and-not instruction.
+ if (VT.isVector())
+ return false;
+
+ // VE allows different immediate values for X and Y where ~X & Y.
+ // Only simm7 works for X, and only mimm works for Y on VE. However, this
+ // function is used to check whether an immediate value is OK as both X and Y
+ // of an and-not instruction. Generating an additional instruction to
+ // materialize an immediate value is not worthwhile, since the purpose of this
+ // function is to convert a series of 3 instructions into another series of
+ // 3 instructions with better parallelism. Therefore, we return false
+ // for all immediate values for now.
+ // FIXME: Change hasAndNot function to have two operands to make it work
+ // correctly with Aurora VE.
+ if (isa<ConstantSDNode>(Y))
+ return false;
+
+ // It's ok for generic registers.
+ return true;
+}
+
VETargetLowering::VETargetLowering(const TargetMachine &TM,
const VESubtarget &STI)
: TargetLowering(TM), Subtarget(&STI) {
@@ -108,7 +567,87 @@ VETargetLowering::VETargetLowering(const TargetMachine &TM,
setBooleanVectorContents(ZeroOrOneBooleanContent);
// Set up the register classes.
+ addRegisterClass(MVT::i32, &VE::I32RegClass);
addRegisterClass(MVT::i64, &VE::I64RegClass);
+ addRegisterClass(MVT::f32, &VE::F32RegClass);
+ addRegisterClass(MVT::f64, &VE::I64RegClass);
+
+ /// Load & Store {
+ for (MVT FPVT : MVT::fp_valuetypes()) {
+ for (MVT OtherFPVT : MVT::fp_valuetypes()) {
+ // Turn FP extload into load/fpextend
+ setLoadExtAction(ISD::EXTLOAD, FPVT, OtherFPVT, Expand);
+
+ // Turn FP truncstore into trunc + store.
+ setTruncStoreAction(FPVT, OtherFPVT, Expand);
+ }
+ }
+
+ // VE doesn't have i1 sign extending load
+ for (MVT VT : MVT::integer_valuetypes()) {
+ setLoadExtAction(ISD::SEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::ZEXTLOAD, VT, MVT::i1, Promote);
+ setLoadExtAction(ISD::EXTLOAD, VT, MVT::i1, Promote);
+ setTruncStoreAction(VT, MVT::i1, Expand);
+ }
+ /// } Load & Store
+
+ // Custom legalize address nodes into LO/HI parts.
+ MVT PtrVT = MVT::getIntegerVT(TM.getPointerSizeInBits(0));
+ setOperationAction(ISD::BlockAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalAddress, PtrVT, Custom);
+ setOperationAction(ISD::GlobalTLSAddress, PtrVT, Custom);
+
+ /// VAARG handling {
+ setOperationAction(ISD::VASTART, MVT::Other, Custom);
+ // VAARG needs to be lowered to access with 8 bytes alignment.
+ setOperationAction(ISD::VAARG, MVT::Other, Custom);
+ // Use the default implementation.
+ setOperationAction(ISD::VACOPY, MVT::Other, Expand);
+ setOperationAction(ISD::VAEND, MVT::Other, Expand);
+ /// } VAARG handling
+
+ /// Stack {
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i32, Custom);
+ setOperationAction(ISD::DYNAMIC_STACKALLOC, MVT::i64, Custom);
+ /// } Stack
+
+ /// Int Ops {
+ for (MVT IntVT : {MVT::i32, MVT::i64}) {
+ // VE has no REM or DIVREM operations.
+ setOperationAction(ISD::UREM, IntVT, Expand);
+ setOperationAction(ISD::SREM, IntVT, Expand);
+ setOperationAction(ISD::SDIVREM, IntVT, Expand);
+ setOperationAction(ISD::UDIVREM, IntVT, Expand);
+
+ setOperationAction(ISD::CTTZ, IntVT, Expand);
+ setOperationAction(ISD::ROTL, IntVT, Expand);
+ setOperationAction(ISD::ROTR, IntVT, Expand);
+
+ // Use isel patterns for i32 and i64
+ setOperationAction(ISD::BSWAP, IntVT, Legal);
+ setOperationAction(ISD::CTLZ, IntVT, Legal);
+ setOperationAction(ISD::CTPOP, IntVT, Legal);
+
+ // Use isel patterns for i64, Promote i32
+ LegalizeAction Act = (IntVT == MVT::i32) ? Promote : Legal;
+ setOperationAction(ISD::BITREVERSE, IntVT, Act);
+ }
+ /// } Int Ops
+
+ /// Conversion {
+ // VE doesn't have instructions for fp<->uint, so let LLVM expand them.
+ setOperationAction(ISD::FP_TO_UINT, MVT::i32, Promote); // use i64
+ setOperationAction(ISD::UINT_TO_FP, MVT::i32, Promote); // use i64
+ setOperationAction(ISD::FP_TO_UINT, MVT::i64, Expand);
+ setOperationAction(ISD::UINT_TO_FP, MVT::i64, Expand);
+
+ // fp16 not supported
+ for (MVT FPVT : MVT::fp_valuetypes()) {
+ setOperationAction(ISD::FP16_TO_FP, FPVT, Expand);
+ setOperationAction(ISD::FP_TO_FP16, FPVT, Expand);
+ }
+ /// } Conversion
setStackPointerRegisterToSaveRestore(VE::SX11);
@@ -122,16 +661,316 @@ VETargetLowering::VETargetLowering(const TargetMachine &TM,
}
const char *VETargetLowering::getTargetNodeName(unsigned Opcode) const {
+#define TARGET_NODE_CASE(NAME) \
+ case VEISD::NAME: \
+ return "VEISD::" #NAME;
switch ((VEISD::NodeType)Opcode) {
case VEISD::FIRST_NUMBER:
break;
- case VEISD::RET_FLAG:
- return "VEISD::RET_FLAG";
+ TARGET_NODE_CASE(Lo)
+ TARGET_NODE_CASE(Hi)
+ TARGET_NODE_CASE(GETFUNPLT)
+ TARGET_NODE_CASE(GETSTACKTOP)
+ TARGET_NODE_CASE(GETTLSADDR)
+ TARGET_NODE_CASE(CALL)
+ TARGET_NODE_CASE(RET_FLAG)
+ TARGET_NODE_CASE(GLOBAL_BASE_REG)
}
+#undef TARGET_NODE_CASE
return nullptr;
}
EVT VETargetLowering::getSetCCResultType(const DataLayout &, LLVMContext &,
EVT VT) const {
- return MVT::i64;
+ return MVT::i32;
+}
+
+// Convert to a target node and set target flags.
+SDValue VETargetLowering::withTargetFlags(SDValue Op, unsigned TF,
+ SelectionDAG &DAG) const {
+ if (const GlobalAddressSDNode *GA = dyn_cast<GlobalAddressSDNode>(Op))
+ return DAG.getTargetGlobalAddress(GA->getGlobal(), SDLoc(GA),
+ GA->getValueType(0), GA->getOffset(), TF);
+
+ if (const BlockAddressSDNode *BA = dyn_cast<BlockAddressSDNode>(Op))
+ return DAG.getTargetBlockAddress(BA->getBlockAddress(), Op.getValueType(),
+ 0, TF);
+
+ if (const ExternalSymbolSDNode *ES = dyn_cast<ExternalSymbolSDNode>(Op))
+ return DAG.getTargetExternalSymbol(ES->getSymbol(), ES->getValueType(0),
+ TF);
+
+ llvm_unreachable("Unhandled address SDNode");
+}
+
+// Split Op into high and low parts according to HiTF and LoTF.
+// Return an ADD node combining the parts.
+SDValue VETargetLowering::makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
+ SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT VT = Op.getValueType();
+ SDValue Hi = DAG.getNode(VEISD::Hi, DL, VT, withTargetFlags(Op, HiTF, DAG));
+ SDValue Lo = DAG.getNode(VEISD::Lo, DL, VT, withTargetFlags(Op, LoTF, DAG));
+ return DAG.getNode(ISD::ADD, DL, VT, Hi, Lo);
+}
+
+// Build SDNodes for producing an address from a GlobalAddress, ConstantPool,
+// or ExternalSymbol SDNode.
+SDValue VETargetLowering::makeAddress(SDValue Op, SelectionDAG &DAG) const {
+ SDLoc DL(Op);
+ EVT PtrVT = Op.getValueType();
+
+ // Handle PIC mode first. VE needs a GOT load for every variable!
+ if (isPositionIndependent()) {
+ // GLOBAL_BASE_REG is codegen'ed with a call. Inform MFI that this
+ // function has calls.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setHasCalls(true);
+ auto GlobalN = dyn_cast<GlobalAddressSDNode>(Op);
+
+ if (isa<ConstantPoolSDNode>(Op) ||
+ (GlobalN && GlobalN->getGlobal()->hasLocalLinkage())) {
+ // Create the following instructions for local-linkage PIC code.
+ // lea %s35, %gotoff_lo(.LCPI0_0)
+ // and %s35, %s35, (32)0
+ // lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35)
+ // adds.l %s35, %s15, %s35 ; %s15 is GOT
+ // FIXME: use lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35, %s15)
+ SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOTOFF_HI32,
+ VEMCExpr::VK_VE_GOTOFF_LO32, DAG);
+ SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
+ return DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
+ }
+ // Create the following instructions for non-local-linkage PIC code.
+ // lea %s35, %got_lo(.LCPI0_0)
+ // and %s35, %s35, (32)0
+ // lea.sl %s35, %got_hi(.LCPI0_0)(%s35)
+ // adds.l %s35, %s15, %s35 ; %s15 is GOT
+ // ld %s35, (,%s35)
+ // FIXME: use lea.sl %s35, %gotoff_hi(.LCPI0_0)(%s35, %s15)
+ SDValue HiLo = makeHiLoPair(Op, VEMCExpr::VK_VE_GOT_HI32,
+ VEMCExpr::VK_VE_GOT_LO32, DAG);
+ SDValue GlobalBase = DAG.getNode(VEISD::GLOBAL_BASE_REG, DL, PtrVT);
+ SDValue AbsAddr = DAG.getNode(ISD::ADD, DL, PtrVT, GlobalBase, HiLo);
+ return DAG.getLoad(PtrVT, DL, DAG.getEntryNode(), AbsAddr,
+ MachinePointerInfo::getGOT(DAG.getMachineFunction()));
+ }
+
+ // This is one of the absolute code models.
+ switch (getTargetMachine().getCodeModel()) {
+ default:
+ llvm_unreachable("Unsupported absolute code model");
+ case CodeModel::Small:
+ case CodeModel::Medium:
+ case CodeModel::Large:
+ // abs64.
+ return makeHiLoPair(Op, VEMCExpr::VK_VE_HI32, VEMCExpr::VK_VE_LO32, DAG);
+ }
+}
+
+/// Custom Lower {
+
+SDValue VETargetLowering::LowerGlobalAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ return makeAddress(Op, DAG);
+}
+
+SDValue VETargetLowering::LowerBlockAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ return makeAddress(Op, DAG);
+}
+
+SDValue
+VETargetLowering::LowerToTLSGeneralDynamicModel(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDLoc dl(Op);
+
+ // Generate the following code:
+ // t1: ch,glue = callseq_start t0, 0, 0
+ // t2: i64,ch,glue = VEISD::GETTLSADDR t1, label, t1:1
+ // t3: ch,glue = callseq_end t2, 0, 0, t2:2
+ // t4: i64,ch,glue = CopyFromReg t3, Register:i64 $sx0, t3:1
+ SDValue Label = withTargetFlags(Op, 0, DAG);
+ EVT PtrVT = Op.getValueType();
+
+ // Lowering the machine ISD node will make sure everything ends up in
+ // the right location.
+ SDValue Chain = DAG.getEntryNode();
+ SDVTList NodeTys = DAG.getVTList(MVT::Other, MVT::Glue);
+ const uint32_t *Mask = Subtarget->getRegisterInfo()->getCallPreservedMask(
+ DAG.getMachineFunction(), CallingConv::C);
+ Chain = DAG.getCALLSEQ_START(Chain, 64, 0, dl);
+ SDValue Args[] = {Chain, Label, DAG.getRegisterMask(Mask), Chain.getValue(1)};
+ Chain = DAG.getNode(VEISD::GETTLSADDR, dl, NodeTys, Args);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(64, dl, true),
+ DAG.getIntPtrConstant(0, dl, true),
+ Chain.getValue(1), dl);
+ Chain = DAG.getCopyFromReg(Chain, dl, VE::SX0, PtrVT, Chain.getValue(1));
+
+ // GETTLSADDR will be codegen'ed as a call. Inform MFI that this function has calls.
+ MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
+ MFI.setHasCalls(true);
+
+ // Also generate code to prepare a GOT register if it is PIC.
+ if (isPositionIndependent()) {
+ MachineFunction &MF = DAG.getMachineFunction();
+ Subtarget->getInstrInfo()->getGlobalBaseReg(&MF);
+ }
+
+ return Chain;
+}
+
+SDValue VETargetLowering::LowerGlobalTLSAddress(SDValue Op,
+ SelectionDAG &DAG) const {
+ // The current implementation of nld (2.26) doesn't allow local exec model
+ // code described in VE-tls_v1.1.pdf (*1) as its input. Instead, we always
+ // generate the general dynamic model code sequence.
+ //
+ // *1: https://www.nec.com/en/global/prod/hpc/aurora/document/VE-tls_v1.1.pdf
+ return LowerToTLSGeneralDynamicModel(Op, DAG);
+}
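As a brief illustration (example code, not part of this patch), any source-level thread_local access takes the general dynamic path above; the resulting address is returned in %s0, following the CopyFromReg of VE::SX0 in LowerToTLSGeneralDynamicModel.

// Hypothetical translation unit: every access to Counter is lowered
// through the GETTLSADDR call sequence sketched above, with the address
// of the thread-local variable returned in %s0 (VE::SX0).
thread_local int Counter = 0;

int bumpCounter() {
  return ++Counter;
}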
+
+SDValue VETargetLowering::LowerVASTART(SDValue Op, SelectionDAG &DAG) const {
+ MachineFunction &MF = DAG.getMachineFunction();
+ VEMachineFunctionInfo *FuncInfo = MF.getInfo<VEMachineFunctionInfo>();
+ auto PtrVT = getPointerTy(DAG.getDataLayout());
+
+ // Need frame address to find the address of VarArgsFrameIndex.
+ MF.getFrameInfo().setFrameAddressIsTaken(true);
+
+ // vastart just stores the address of the VarArgsFrameIndex slot into the
+ // memory location argument.
+ SDLoc DL(Op);
+ SDValue Offset =
+ DAG.getNode(ISD::ADD, DL, PtrVT, DAG.getRegister(VE::SX9, PtrVT),
+ DAG.getIntPtrConstant(FuncInfo->getVarArgsFrameOffset(), DL));
+ const Value *SV = cast<SrcValueSDNode>(Op.getOperand(2))->getValue();
+ return DAG.getStore(Op.getOperand(0), DL, Offset, Op.getOperand(1),
+ MachinePointerInfo(SV));
+}
+
+SDValue VETargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
+ SDNode *Node = Op.getNode();
+ EVT VT = Node->getValueType(0);
+ SDValue InChain = Node->getOperand(0);
+ SDValue VAListPtr = Node->getOperand(1);
+ EVT PtrVT = VAListPtr.getValueType();
+ const Value *SV = cast<SrcValueSDNode>(Node->getOperand(2))->getValue();
+ SDLoc DL(Node);
+ SDValue VAList =
+ DAG.getLoad(PtrVT, DL, InChain, VAListPtr, MachinePointerInfo(SV));
+ SDValue Chain = VAList.getValue(1);
+ SDValue NextPtr;
+
+ if (VT == MVT::f32) {
+ // float --> needs the special handling shown below.
+ // 0 4
+ // +------+------+
+ // | empty| float|
+ // +------+------+
+ // Increment the pointer, VAList, by 8 to the next vaarg.
+ NextPtr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
+ // Then, adjust VAList.
+ unsigned InternalOffset = 4;
+ VAList = DAG.getNode(ISD::ADD, DL, PtrVT, VAList,
+ DAG.getConstant(InternalOffset, DL, PtrVT));
+ } else {
+ // Increment the pointer, VAList, by 8 to the next vaarg.
+ NextPtr =
+ DAG.getNode(ISD::ADD, DL, PtrVT, VAList, DAG.getIntPtrConstant(8, DL));
+ }
+
+ // Store the incremented VAList to the legalized pointer.
+ InChain = DAG.getStore(Chain, DL, NextPtr, VAListPtr, MachinePointerInfo(SV));
+
+ // Load the actual argument out of the pointer VAList.
+ // We can't count on greater alignment than the word size.
+ return DAG.getLoad(VT, DL, InChain, VAList, MachinePointerInfo(),
+ std::min(PtrVT.getSizeInBits(), VT.getSizeInBits()) / 8);
+}
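A minimal standalone sketch of the slot layout assumed above; the 8-byte vararg slot and the +4 adjustment come from the diagram and the code, but the helper functions themselves are illustrative only and are not VE API.

#include <cstdint>
#include <cstring>

// Each vararg occupies one 8-byte slot. An f32 is read from the upper
// half of its slot (offset 4), while the va_list pointer still advances
// by the full 8 bytes; 8-byte values use the whole slot.
static float readFloatVAArg(char *&VAList) {
  float F;
  std::memcpy(&F, VAList + 4, sizeof(F)); // value sits at slot offset 4
  VAList += 8;                            // NextPtr: advance a full slot
  return F;
}

static int64_t readI64VAArg(char *&VAList) {
  int64_t V;
  std::memcpy(&V, VAList, sizeof(V));     // 8-byte values use the whole slot
  VAList += 8;
  return V;
}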
+
+SDValue VETargetLowering::lowerDYNAMIC_STACKALLOC(SDValue Op,
+ SelectionDAG &DAG) const {
+ // Generate the following code:
+ // (void)__ve_grow_stack(size);
+ // ret = GETSTACKTOP; // pseudo instruction
+ SDLoc DL(Op);
+
+ // Get the inputs.
+ SDNode *Node = Op.getNode();
+ SDValue Chain = Op.getOperand(0);
+ SDValue Size = Op.getOperand(1);
+ MaybeAlign Alignment(Op.getConstantOperandVal(2));
+ EVT VT = Node->getValueType(0);
+
+ // Chain the dynamic stack allocation so that it doesn't modify the stack
+ // pointer when other instructions are using the stack.
+ Chain = DAG.getCALLSEQ_START(Chain, 0, 0, DL);
+
+ const TargetFrameLowering &TFI = *Subtarget->getFrameLowering();
+ Align StackAlign = TFI.getStackAlign();
+ bool NeedsAlign = Alignment.valueOrOne() > StackAlign;
+
+ // Prepare arguments
+ TargetLowering::ArgListTy Args;
+ TargetLowering::ArgListEntry Entry;
+ Entry.Node = Size;
+ Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ Args.push_back(Entry);
+ if (NeedsAlign) {
+ Entry.Node = DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT);
+ Entry.Ty = Entry.Node.getValueType().getTypeForEVT(*DAG.getContext());
+ Args.push_back(Entry);
+ }
+ Type *RetTy = Type::getVoidTy(*DAG.getContext());
+
+ EVT PtrVT = Op.getValueType();
+ SDValue Callee;
+ if (NeedsAlign) {
+ Callee = DAG.getTargetExternalSymbol("__ve_grow_stack_align", PtrVT, 0);
+ } else {
+ Callee = DAG.getTargetExternalSymbol("__ve_grow_stack", PtrVT, 0);
+ }
+
+ TargetLowering::CallLoweringInfo CLI(DAG);
+ CLI.setDebugLoc(DL)
+ .setChain(Chain)
+ .setCallee(CallingConv::PreserveAll, RetTy, Callee, std::move(Args))
+ .setDiscardResult(true);
+ std::pair<SDValue, SDValue> pair = LowerCallTo(CLI);
+ Chain = pair.second;
+ SDValue Result = DAG.getNode(VEISD::GETSTACKTOP, DL, VT, Chain);
+ if (NeedsAlign) {
+ Result = DAG.getNode(ISD::ADD, DL, VT, Result,
+ DAG.getConstant((Alignment->value() - 1ULL), DL, VT));
+ Result = DAG.getNode(ISD::AND, DL, VT, Result,
+ DAG.getConstant(~(Alignment->value() - 1ULL), DL, VT));
+ }
+ // Chain = Result.getValue(1);
+ Chain = DAG.getCALLSEQ_END(Chain, DAG.getIntPtrConstant(0, DL, true),
+ DAG.getIntPtrConstant(0, DL, true), SDValue(), DL);
+
+ SDValue Ops[2] = {Result, Chain};
+ return DAG.getMergeValues(Ops, DL);
+}
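For orientation, a hedged example of source code that reaches this lowering; the builtins are standard Clang/GCC builtins rather than anything VE-specific, and the 128-byte request is an arbitrary value assumed to exceed the VE stack alignment.

extern void use(void *);

// A plain dynamically sized alloca takes the __ve_grow_stack path.
void plainDynamicAlloc(unsigned N) {
  void *Buf = __builtin_alloca(N);
  use(Buf);
}

// An over-aligned alloca takes the __ve_grow_stack_align path and then
// rounds the GETSTACKTOP result up to the requested alignment.
void alignedDynamicAlloc(unsigned N) {
  void *Buf = __builtin_alloca_with_align(N, /*bits=*/1024); // 128 bytes
  use(Buf);
}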
+
+SDValue VETargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
+ switch (Op.getOpcode()) {
+ default:
+ llvm_unreachable("Should not custom lower this!");
+ case ISD::BlockAddress:
+ return LowerBlockAddress(Op, DAG);
+ case ISD::DYNAMIC_STACKALLOC:
+ return lowerDYNAMIC_STACKALLOC(Op, DAG);
+ case ISD::GlobalAddress:
+ return LowerGlobalAddress(Op, DAG);
+ case ISD::GlobalTLSAddress:
+ return LowerGlobalTLSAddress(Op, DAG);
+ case ISD::VASTART:
+ return LowerVASTART(Op, DAG);
+ case ISD::VAARG:
+ return LowerVAARG(Op, DAG);
+ }
}
+/// } Custom Lower
diff --git a/llvm/lib/Target/VE/VEISelLowering.h b/llvm/lib/Target/VE/VEISelLowering.h
index 39b3610a0c3a..4633220efaa1 100644
--- a/llvm/lib/Target/VE/VEISelLowering.h
+++ b/llvm/lib/Target/VE/VEISelLowering.h
@@ -23,7 +23,18 @@ class VESubtarget;
namespace VEISD {
enum NodeType : unsigned {
FIRST_NUMBER = ISD::BUILTIN_OP_END,
- RET_FLAG, // Return with a flag operand.
+
+ Hi,
+ Lo, // Hi/Lo operations, typically on a global address.
+
+ GETFUNPLT, // load function address through %plt instruction
+ GETTLSADDR, // load address for TLS access
+ GETSTACKTOP, // retrieve address of stack top (first address of
+ // locals and temporaries)
+
+ CALL, // A call instruction.
+ RET_FLAG, // Return with a flag operand.
+ GLOBAL_BASE_REG, // Global base reg for PIC.
};
}
@@ -34,6 +45,9 @@ public:
VETargetLowering(const TargetMachine &TM, const VESubtarget &STI);
const char *getTargetNodeName(unsigned Opcode) const override;
+ MVT getScalarShiftAmountTy(const DataLayout &, EVT) const override {
+ return MVT::i32;
+ }
Register getRegisterByName(const char *RegName, LLT VT,
const MachineFunction &MF) const override;
@@ -48,6 +62,9 @@ public:
const SDLoc &dl, SelectionDAG &DAG,
SmallVectorImpl<SDValue> &InVals) const override;
+ SDValue LowerCall(TargetLowering::CallLoweringInfo &CLI,
+ SmallVectorImpl<SDValue> &InVals) const override;
+
bool CanLowerReturn(CallingConv::ID CallConv, MachineFunction &MF,
bool isVarArg,
const SmallVectorImpl<ISD::OutputArg> &ArgsFlags,
@@ -56,6 +73,36 @@ public:
const SmallVectorImpl<ISD::OutputArg> &Outs,
const SmallVectorImpl<SDValue> &OutVals, const SDLoc &dl,
SelectionDAG &DAG) const override;
+
+ /// Custom Lower {
+ SDValue LowerOperation(SDValue Op, SelectionDAG &DAG) const override;
+
+ SDValue LowerVASTART(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerVAARG(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerGlobalTLSAddress(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerToTLSGeneralDynamicModel(SDValue Op, SelectionDAG &DAG) const;
+ SDValue lowerDYNAMIC_STACKALLOC(SDValue Op, SelectionDAG &DAG) const;
+ /// } Custom Lower
+
+ SDValue withTargetFlags(SDValue Op, unsigned TF, SelectionDAG &DAG) const;
+ SDValue makeHiLoPair(SDValue Op, unsigned HiTF, unsigned LoTF,
+ SelectionDAG &DAG) const;
+ SDValue makeAddress(SDValue Op, SelectionDAG &DAG) const;
+
+ bool isFPImmLegal(const APFloat &Imm, EVT VT,
+ bool ForCodeSize) const override;
+ /// Returns true if the target allows unaligned memory accesses of the
+ /// specified type.
+ bool allowsMisalignedMemoryAccesses(EVT VT, unsigned AS, unsigned Align,
+ MachineMemOperand::Flags Flags,
+ bool *Fast) const override;
+
+ // Block sdiv/udiv lowering for now.
+ bool isIntDivCheap(EVT VT, AttributeList Attr) const override { return true; }
+
+ bool hasAndNot(SDValue Y) const override;
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VEInstrFormats.td b/llvm/lib/Target/VE/VEInstrFormats.td
index a8d3e786ba89..0c02411ff916 100644
--- a/llvm/lib/Target/VE/VEInstrFormats.td
+++ b/llvm/lib/Target/VE/VEInstrFormats.td
@@ -6,6 +6,20 @@
//
//===----------------------------------------------------------------------===//
+// SX-Aurora is little endian, but instructions are encoded in a slightly
+// different manner. Therefore, we need to translate the position of each
+// bitfield described in the ISA documentation as shown below.
+//
+// ISA | InstrFormats.td
+// ---------------------------
+// 0-7 => 63-56
+// 8 => 55
+// 32-63 => 31-0
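A small sanity sketch of the mapping above (C++, for illustration only, not part of the TableGen file): every bit position i in the manual's numbering lands at bit 63 - i of the Inst field used by these classes.

constexpr unsigned isaBitToTdBit(unsigned IsaBit) { return 63 - IsaBit; }

static_assert(isaBitToTdBit(0) == 63 && isaBitToTdBit(7) == 56,
              "ISA bits 0-7 -> Inst{63-56} (the op field)");
static_assert(isaBitToTdBit(8) == 55, "ISA bit 8 -> Inst{55}");
static_assert(isaBitToTdBit(32) == 31 && isaBitToTdBit(63) == 0,
              "ISA bits 32-63 -> Inst{31-0} (e.g. the imm32 field)");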
+
+//===----------------------------------------------------------------------===//
+// Instruction Format
+//===----------------------------------------------------------------------===//
+
class InstVE<dag outs, dag ins, string asmstr, list<dag> pattern>
: Instruction {
field bits<64> Inst;
@@ -14,7 +28,7 @@ class InstVE<dag outs, dag ins, string asmstr, list<dag> pattern>
let Size = 8;
bits<8> op;
- let Inst{0-7} = op;
+ let Inst{63-56} = op;
dag OutOperandList = outs;
dag InOperandList = ins;
@@ -25,50 +39,154 @@ class InstVE<dag outs, dag ins, string asmstr, list<dag> pattern>
field bits<64> SoftFail = 0;
}
-class RM<bits<8>opVal, dag outs, dag ins, string asmstr, list<dag> pattern=[]>
+//-----------------------------------------------------------------------------
+// Section 5.1 RM Type
+//
+// RM type has sx, sy, sz, and imm32.
+// The effective address is generated by sz + sy + imm32.
+//-----------------------------------------------------------------------------
+
+class RM<bits<8>opVal, dag outs, dag ins, string asmstr, list<dag> pattern = []>
: InstVE<outs, ins, asmstr, pattern> {
bits<1> cx = 0;
bits<7> sx;
- bits<1> cy = 0;
+ bits<1> cy = 1;
+ bits<7> sz; // sz is defined prior to sy so that operands are assigned from sz
+ bits<7> sy;
+ bits<1> cz = 1;
+ bits<32> imm32;
+ let op = opVal;
+ let Inst{55} = cx;
+ let Inst{54-48} = sx;
+ let Inst{47} = cy;
+ let Inst{46-40} = sy;
+ let Inst{39} = cz;
+ let Inst{38-32} = sz;
+ let Inst{31-0} = imm32;
+}
+
+//-----------------------------------------------------------------------------
+// Section 5.2 RRM Type
+//
+// RRM type is identical to RM, but the effective address is generated
+// by sz + imm32. The sy field is used for other purposes.
+//-----------------------------------------------------------------------------
+
+class RRM<bits<8>opVal, dag outs, dag ins, string asmstr,
+ list<dag> pattern = []>
+ : RM<opVal, outs, ins, asmstr, pattern>;
+
+// RRMHM type is used to load/store host memory.
+// It is similar to RRM but does not use sy.
+class RRMHM<bits<8>opVal, dag outs, dag ins, string asmstr,
+ list<dag> pattern = []>
+ : RRM<opVal, outs, ins, asmstr, pattern> {
+ bits<2> ry = 0;
+ let cy = 0;
+ let sy{6-2} = 0;
+ let sy{1-0} = ry;
+}
+
+//-----------------------------------------------------------------------------
+// Section 5.3 CF Type
+//
+// CF type is used for control flow.
+//-----------------------------------------------------------------------------
+
+class CF<bits<8>opVal, dag outs, dag ins, string asmstr, list<dag> pattern = []>
+ : InstVE<outs, ins, asmstr, pattern> {
+ bits<1> cx = 0;
+ bits<1> cx2 = 0;
+ bits<2> bpf = 0;
+ bits<4> cf;
+ bits<1> cy = 1;
bits<7> sy;
- bits<1> cz = 0;
+ bits<1> cz = 1;
bits<7> sz;
- bits<32> imm32 = 0;
+ bits<32> imm32;
let op = opVal;
- let Inst{15} = cx;
- let Inst{14-8} = sx;
- let Inst{23} = cy;
- let Inst{22-16} = sy;
- let Inst{31} = cz;
- let Inst{30-24} = sz;
- let Inst{63-32} = imm32;
+ let Inst{55} = cx;
+ let Inst{54} = cx2;
+ let Inst{53-52} = bpf;
+ let Inst{51-48} = cf;
+ let Inst{47} = cy;
+ let Inst{46-40} = sy;
+ let Inst{39} = cz;
+ let Inst{38-32} = sz;
+ let Inst{31-0} = imm32;
}
-class RR<bits<8>opVal, dag outs, dag ins, string asmstr>
- : RM<opVal, outs, ins, asmstr> {
+//-----------------------------------------------------------------------------
+// Section 5.4 RR Type
+//
+// RR type is for generic arithmetic instructions.
+//-----------------------------------------------------------------------------
+
+class RR<bits<8>opVal, dag outs, dag ins, string asmstr, list<dag> pattern = []>
+ : InstVE<outs, ins, asmstr, pattern> {
+ bits<1> cx = 0;
+ bits<7> sx;
+ bits<1> cy = 1;
+ bits<7> sy;
+ bits<1> cz = 1;
+ bits<7> sz; // the m field is placed in the sz field
+ bits<8> vx = 0;
+ bits<8> vz = 0;
bits<1> cw = 0;
bits<1> cw2 = 0;
bits<4> cfw = 0;
- let imm32{0-23} = 0;
- let imm32{24} = cw;
- let imm32{25} = cw2;
- let imm32{26-27} = 0;
- let imm32{28-31} = cfw;
+ let op = opVal;
+ let Inst{55} = cx;
+ let Inst{54-48} = sx;
+ let Inst{47} = cy;
+ let Inst{46-40} = sy;
+ let Inst{39} = cz;
+ let Inst{38-32} = sz;
+ let Inst{31-24} = vx;
+ let Inst{23-16} = 0;
+ let Inst{15-8} = vz;
+ let Inst{7} = cw;
+ let Inst{6} = cw2;
+ let Inst{5-4} = 0;
+ let Inst{3-0} = cfw;
}
-class CF<bits<8>opVal, dag outs, dag ins, string asmstr, list<dag> pattern=[]>
- : RM<opVal, outs, ins, asmstr, pattern> {
- bits<1> cx2;
- bits<2> bpf;
- bits<4> cf;
- let cx = 0;
- let sx{6} = cx2;
- let sx{5-4} = bpf;
- let sx{3-0} = cf;
+// RRFENCE type is special RR type for a FENCE instruction.
+class RRFENCE<bits<8>opVal, dag outs, dag ins, string asmstr,
+ list<dag> pattern = []>
+ : InstVE<outs, ins, asmstr, pattern> {
+ bits<1> avo = 0;
+ bits<1> lf = 0;
+ bits<1> sf = 0;
+ bits<1> c2 = 0;
+ bits<1> c1 = 0;
+ bits<1> c0 = 0;
+ let op = opVal;
+ let Inst{55} = avo;
+ let Inst{54-50} = 0;
+ let Inst{49} = lf;
+ let Inst{48} = sf;
+ let Inst{47-43} = 0;
+ let Inst{42} = c2;
+ let Inst{41} = c1;
+ let Inst{40} = c0;
+ let Inst{39-0} = 0;
}
+//-----------------------------------------------------------------------------
+// Section 5.5 RW Type
+//-----------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------
+// Section 5.6 RVM Type
+//-----------------------------------------------------------------------------
+
+//-----------------------------------------------------------------------------
+// Section 5.7 RV Type
+//-----------------------------------------------------------------------------
+
// Pseudo instructions.
-class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern=[]>
+class Pseudo<dag outs, dag ins, string asmstr, list<dag> pattern = []>
: InstVE<outs, ins, asmstr, pattern> {
let isCodeGenOnly = 1;
let isPseudo = 1;
diff --git a/llvm/lib/Target/VE/VEInstrInfo.cpp b/llvm/lib/Target/VE/VEInstrInfo.cpp
index bc382dcef7c3..86b2ac2078b1 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.cpp
+++ b/llvm/lib/Target/VE/VEInstrInfo.cpp
@@ -12,6 +12,7 @@
#include "VEInstrInfo.h"
#include "VE.h"
+#include "VEMachineFunctionInfo.h"
#include "VESubtarget.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
@@ -24,7 +25,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/TargetRegistry.h"
-#define DEBUG_TYPE "ve"
+#define DEBUG_TYPE "ve-instr-info"
using namespace llvm;
@@ -35,8 +36,441 @@ using namespace llvm;
void VEInstrInfo::anchor() {}
VEInstrInfo::VEInstrInfo(VESubtarget &ST)
- : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI(),
- Subtarget(ST) {}
+ : VEGenInstrInfo(VE::ADJCALLSTACKDOWN, VE::ADJCALLSTACKUP), RI() {}
+
+static bool IsIntegerCC(unsigned CC) { return (CC < VECC::CC_AF); }
+
+static VECC::CondCode GetOppositeBranchCondition(VECC::CondCode CC) {
+ switch (CC) {
+ case VECC::CC_IG:
+ return VECC::CC_ILE;
+ case VECC::CC_IL:
+ return VECC::CC_IGE;
+ case VECC::CC_INE:
+ return VECC::CC_IEQ;
+ case VECC::CC_IEQ:
+ return VECC::CC_INE;
+ case VECC::CC_IGE:
+ return VECC::CC_IL;
+ case VECC::CC_ILE:
+ return VECC::CC_IG;
+ case VECC::CC_AF:
+ return VECC::CC_AT;
+ case VECC::CC_G:
+ return VECC::CC_LENAN;
+ case VECC::CC_L:
+ return VECC::CC_GENAN;
+ case VECC::CC_NE:
+ return VECC::CC_EQNAN;
+ case VECC::CC_EQ:
+ return VECC::CC_NENAN;
+ case VECC::CC_GE:
+ return VECC::CC_LNAN;
+ case VECC::CC_LE:
+ return VECC::CC_GNAN;
+ case VECC::CC_NUM:
+ return VECC::CC_NAN;
+ case VECC::CC_NAN:
+ return VECC::CC_NUM;
+ case VECC::CC_GNAN:
+ return VECC::CC_LE;
+ case VECC::CC_LNAN:
+ return VECC::CC_GE;
+ case VECC::CC_NENAN:
+ return VECC::CC_EQ;
+ case VECC::CC_EQNAN:
+ return VECC::CC_NE;
+ case VECC::CC_GENAN:
+ return VECC::CC_L;
+ case VECC::CC_LENAN:
+ return VECC::CC_G;
+ case VECC::CC_AT:
+ return VECC::CC_AF;
+ case VECC::UNKNOWN:
+ return VECC::UNKNOWN;
+ }
+ llvm_unreachable("Invalid cond code");
+}
+
+// Treat br.l [BRCF AT] as an unconditional branch.
+static bool isUncondBranchOpcode(int Opc) {
+ return Opc == VE::BRCFLa || Opc == VE::BRCFWa ||
+ Opc == VE::BRCFLa_nt || Opc == VE::BRCFWa_nt ||
+ Opc == VE::BRCFLa_t || Opc == VE::BRCFWa_t ||
+ Opc == VE::BRCFDa || Opc == VE::BRCFSa ||
+ Opc == VE::BRCFDa_nt || Opc == VE::BRCFSa_nt ||
+ Opc == VE::BRCFDa_t || Opc == VE::BRCFSa_t;
+}
+
+static bool isCondBranchOpcode(int Opc) {
+ return Opc == VE::BRCFLrr || Opc == VE::BRCFLir ||
+ Opc == VE::BRCFLrr_nt || Opc == VE::BRCFLir_nt ||
+ Opc == VE::BRCFLrr_t || Opc == VE::BRCFLir_t ||
+ Opc == VE::BRCFWrr || Opc == VE::BRCFWir ||
+ Opc == VE::BRCFWrr_nt || Opc == VE::BRCFWir_nt ||
+ Opc == VE::BRCFWrr_t || Opc == VE::BRCFWir_t ||
+ Opc == VE::BRCFDrr || Opc == VE::BRCFDir ||
+ Opc == VE::BRCFDrr_nt || Opc == VE::BRCFDir_nt ||
+ Opc == VE::BRCFDrr_t || Opc == VE::BRCFDir_t ||
+ Opc == VE::BRCFSrr || Opc == VE::BRCFSir ||
+ Opc == VE::BRCFSrr_nt || Opc == VE::BRCFSir_nt ||
+ Opc == VE::BRCFSrr_t || Opc == VE::BRCFSir_t;
+}
+
+static bool isIndirectBranchOpcode(int Opc) {
+ return Opc == VE::BCFLari || Opc == VE::BCFLari_nt || Opc == VE::BCFLari_t;
+}
+
+static void parseCondBranch(MachineInstr *LastInst, MachineBasicBlock *&Target,
+ SmallVectorImpl<MachineOperand> &Cond) {
+ Cond.push_back(MachineOperand::CreateImm(LastInst->getOperand(0).getImm()));
+ Cond.push_back(LastInst->getOperand(1));
+ Cond.push_back(LastInst->getOperand(2));
+ Target = LastInst->getOperand(3).getMBB();
+}
+
+bool VEInstrInfo::analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify) const {
+ MachineBasicBlock::iterator I = MBB.getLastNonDebugInstr();
+ if (I == MBB.end())
+ return false;
+
+ if (!isUnpredicatedTerminator(*I))
+ return false;
+
+ // Get the last instruction in the block.
+ MachineInstr *LastInst = &*I;
+ unsigned LastOpc = LastInst->getOpcode();
+
+ // If there is only one terminator instruction, process it.
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ if (isUncondBranchOpcode(LastOpc)) {
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ if (isCondBranchOpcode(LastOpc)) {
+ // Block ends with fall-through condbranch.
+ parseCondBranch(LastInst, TBB, Cond);
+ return false;
+ }
+ return true; // Can't handle indirect branch.
+ }
+
+ // Get the instruction before it if it is a terminator.
+ MachineInstr *SecondLastInst = &*I;
+ unsigned SecondLastOpc = SecondLastInst->getOpcode();
+
+ // If AllowModify is true and the block ends with two or more unconditional
+ // branches, delete all but the first unconditional branch.
+ if (AllowModify && isUncondBranchOpcode(LastOpc)) {
+ while (isUncondBranchOpcode(SecondLastOpc)) {
+ LastInst->eraseFromParent();
+ LastInst = SecondLastInst;
+ LastOpc = LastInst->getOpcode();
+ if (I == MBB.begin() || !isUnpredicatedTerminator(*--I)) {
+ // Return now; the only terminator is an unconditional branch.
+ TBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+ SecondLastInst = &*I;
+ SecondLastOpc = SecondLastInst->getOpcode();
+ }
+ }
+
+ // If there are three terminators, we don't know what sort of block this is.
+ if (SecondLastInst && I != MBB.begin() && isUnpredicatedTerminator(*--I))
+ return true;
+
+ // If the block ends with a B and a Bcc, handle it.
+ if (isCondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ parseCondBranch(SecondLastInst, TBB, Cond);
+ FBB = LastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // If the block ends with two unconditional branches, handle it. The second
+ // one is not executed.
+ if (isUncondBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ TBB = SecondLastInst->getOperand(0).getMBB();
+ return false;
+ }
+
+ // ...likewise if it ends with an indirect branch followed by an unconditional
+ // branch.
+ if (isIndirectBranchOpcode(SecondLastOpc) && isUncondBranchOpcode(LastOpc)) {
+ I = LastInst;
+ if (AllowModify)
+ I->eraseFromParent();
+ return true;
+ }
+
+ // Otherwise, can't handle this.
+ return true;
+}
+
+unsigned VEInstrInfo::insertBranch(MachineBasicBlock &MBB,
+ MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB,
+ ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL, int *BytesAdded) const {
+ assert(TBB && "insertBranch must not be told to insert a fallthrough");
+ assert((Cond.size() == 3 || Cond.size() == 0) &&
+ "VE branch conditions should have three component!");
+ assert(!BytesAdded && "code size not handled");
+ if (Cond.empty()) {
+ // Unconditional branch
+ assert(!FBB && "Unconditional branch with multiple successors!");
+ BuildMI(&MBB, DL, get(VE::BRCFLa_t))
+ .addMBB(TBB);
+ return 1;
+ }
+
+ // Conditional branch
+ // (BRCFir CC sy sz addr)
+ assert(Cond[0].isImm() && Cond[2].isReg() && "not implemented");
+
+ unsigned opc[2];
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ MachineFunction *MF = MBB.getParent();
+ const MachineRegisterInfo &MRI = MF->getRegInfo();
+ unsigned Reg = Cond[2].getReg();
+ if (IsIntegerCC(Cond[0].getImm())) {
+ if (TRI->getRegSizeInBits(Reg, MRI) == 32) {
+ opc[0] = VE::BRCFWir;
+ opc[1] = VE::BRCFWrr;
+ } else {
+ opc[0] = VE::BRCFLir;
+ opc[1] = VE::BRCFLrr;
+ }
+ } else {
+ if (TRI->getRegSizeInBits(Reg, MRI) == 32) {
+ opc[0] = VE::BRCFSir;
+ opc[1] = VE::BRCFSrr;
+ } else {
+ opc[0] = VE::BRCFDir;
+ opc[1] = VE::BRCFDrr;
+ }
+ }
+ if (Cond[1].isImm()) {
+ BuildMI(&MBB, DL, get(opc[0]))
+ .add(Cond[0]) // condition code
+ .add(Cond[1]) // lhs
+ .add(Cond[2]) // rhs
+ .addMBB(TBB);
+ } else {
+ BuildMI(&MBB, DL, get(opc[1]))
+ .add(Cond[0])
+ .add(Cond[1])
+ .add(Cond[2])
+ .addMBB(TBB);
+ }
+
+ if (!FBB)
+ return 1;
+
+ BuildMI(&MBB, DL, get(VE::BRCFLa_t))
+ .addMBB(FBB);
+ return 2;
+}
+
+unsigned VEInstrInfo::removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved) const {
+ assert(!BytesRemoved && "code size not handled");
+
+ MachineBasicBlock::iterator I = MBB.end();
+ unsigned Count = 0;
+ while (I != MBB.begin()) {
+ --I;
+
+ if (I->isDebugValue())
+ continue;
+
+ if (!isUncondBranchOpcode(I->getOpcode()) &&
+ !isCondBranchOpcode(I->getOpcode()))
+ break; // Not a branch
+
+ I->eraseFromParent();
+ I = MBB.end();
+ ++Count;
+ }
+ return Count;
+}
+
+bool VEInstrInfo::reverseBranchCondition(
+ SmallVectorImpl<MachineOperand> &Cond) const {
+ VECC::CondCode CC = static_cast<VECC::CondCode>(Cond[0].getImm());
+ Cond[0].setImm(GetOppositeBranchCondition(CC));
+ return false;
+}
+
+static bool IsAliasOfSX(Register Reg) {
+ return VE::I8RegClass.contains(Reg) || VE::I16RegClass.contains(Reg) ||
+ VE::I32RegClass.contains(Reg) || VE::I64RegClass.contains(Reg) ||
+ VE::F32RegClass.contains(Reg);
+}
+
+void VEInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I, const DebugLoc &DL,
+ MCRegister DestReg, MCRegister SrcReg,
+ bool KillSrc) const {
+
+ if (IsAliasOfSX(SrcReg) && IsAliasOfSX(DestReg)) {
+ BuildMI(MBB, I, DL, get(VE::ORri), DestReg)
+ .addReg(SrcReg, getKillRegState(KillSrc))
+ .addImm(0);
+ } else {
+ const TargetRegisterInfo *TRI = &getRegisterInfo();
+ dbgs() << "Impossible reg-to-reg copy from " << printReg(SrcReg, TRI)
+ << " to " << printReg(DestReg, TRI) << "\n";
+ llvm_unreachable("Impossible reg-to-reg copy");
+ }
+}
+
+/// isLoadFromStackSlot - If the specified machine instruction is a direct
+/// load from a stack slot, return the virtual or physical register number of
+/// the destination along with the FrameIndex of the loaded stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than loading from the stack slot.
+unsigned VEInstrInfo::isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ if (MI.getOpcode() == VE::LDrii || // I64
+ MI.getOpcode() == VE::LDLSXrii || // I32
+ MI.getOpcode() == VE::LDUrii // F32
+ ) {
+ if (MI.getOperand(1).isFI() && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0 && MI.getOperand(3).isImm() &&
+ MI.getOperand(3).getImm() == 0) {
+ FrameIndex = MI.getOperand(1).getIndex();
+ return MI.getOperand(0).getReg();
+ }
+ }
+ return 0;
+}
+
+/// isStoreToStackSlot - If the specified machine instruction is a direct
+/// store to a stack slot, return the virtual or physical register number of
+/// the source reg along with the FrameIndex of the stored stack slot. If
+/// not, return 0. This predicate must return 0 if the instruction has
+/// any side effects other than storing to the stack slot.
+unsigned VEInstrInfo::isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const {
+ if (MI.getOpcode() == VE::STrii || // I64
+ MI.getOpcode() == VE::STLrii || // I32
+ MI.getOpcode() == VE::STUrii // F32
+ ) {
+ if (MI.getOperand(0).isFI() && MI.getOperand(1).isImm() &&
+ MI.getOperand(1).getImm() == 0 && MI.getOperand(2).isImm() &&
+ MI.getOperand(2).getImm() == 0) {
+ FrameIndex = MI.getOperand(0).getIndex();
+ return MI.getOperand(3).getReg();
+ }
+ }
+ return 0;
+}
+
+void VEInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ Register SrcReg, bool isKill, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end())
+ DL = I->getDebugLoc();
+
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOStore,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+
+ // Note the operand order here: think "[FrameIdx + 0] = SrcReg".
+ if (RC == &VE::I64RegClass) {
+ BuildMI(MBB, I, DL, get(VE::STrii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+ } else if (RC == &VE::I32RegClass) {
+ BuildMI(MBB, I, DL, get(VE::STLrii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+ } else if (RC == &VE::F32RegClass) {
+ BuildMI(MBB, I, DL, get(VE::STUrii))
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addReg(SrcReg, getKillRegState(isKill))
+ .addMemOperand(MMO);
+ } else
+ report_fatal_error("Can't store this register to stack slot");
+}
+
+void VEInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator I,
+ Register DestReg, int FI,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const {
+ DebugLoc DL;
+ if (I != MBB.end())
+ DL = I->getDebugLoc();
+
+ MachineFunction *MF = MBB.getParent();
+ const MachineFrameInfo &MFI = MF->getFrameInfo();
+ MachineMemOperand *MMO = MF->getMachineMemOperand(
+ MachinePointerInfo::getFixedStack(*MF, FI), MachineMemOperand::MOLoad,
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
+
+ if (RC == &VE::I64RegClass) {
+ BuildMI(MBB, I, DL, get(VE::LDrii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
+ } else if (RC == &VE::I32RegClass) {
+ BuildMI(MBB, I, DL, get(VE::LDLSXrii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
+ } else if (RC == &VE::F32RegClass) {
+ BuildMI(MBB, I, DL, get(VE::LDUrii), DestReg)
+ .addFrameIndex(FI)
+ .addImm(0)
+ .addImm(0)
+ .addMemOperand(MMO);
+ } else
+ report_fatal_error("Can't load this register from stack slot");
+}
+
+Register VEInstrInfo::getGlobalBaseReg(MachineFunction *MF) const {
+ VEMachineFunctionInfo *VEFI = MF->getInfo<VEMachineFunctionInfo>();
+ Register GlobalBaseReg = VEFI->getGlobalBaseReg();
+ if (GlobalBaseReg != 0)
+ return GlobalBaseReg;
+
+ // We use %s15 (%got) as a global base register
+ GlobalBaseReg = VE::SX15;
+
+ // Insert a pseudo instruction that sets the GlobalBaseReg at the beginning
+ // of the first MBB of the function.
+ MachineBasicBlock &FirstMBB = MF->front();
+ MachineBasicBlock::iterator MBBI = FirstMBB.begin();
+ DebugLoc dl;
+ BuildMI(FirstMBB, MBBI, dl, get(VE::GETGOT), GlobalBaseReg);
+ VEFI->setGlobalBaseReg(GlobalBaseReg);
+ return GlobalBaseReg;
+}
bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
switch (MI.getOpcode()) {
@@ -47,6 +481,9 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
MI.eraseFromParent(); // The pseudo instruction is gone now.
return true;
}
+ case VE::GETSTACKTOP: {
+ return expandGetStackTopPseudo(MI);
+ }
}
return false;
}
@@ -54,8 +491,8 @@ bool VEInstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
bool VEInstrInfo::expandExtendStackPseudo(MachineInstr &MI) const {
MachineBasicBlock &MBB = *MI.getParent();
MachineFunction &MF = *MBB.getParent();
- const VEInstrInfo &TII =
- *static_cast<const VEInstrInfo *>(MF.getSubtarget().getInstrInfo());
+ const VESubtarget &STI = MF.getSubtarget<VESubtarget>();
+ const VEInstrInfo &TII = *STI.getInstrInfo();
DebugLoc dl = MBB.findDebugLoc(MI);
// Create following instructions and multiple basic blocks.
@@ -91,7 +528,7 @@ bool VEInstrInfo::expandExtendStackPseudo(MachineInstr &MI) const {
// Next, add the true and fallthrough blocks as its successors.
BB->addSuccessor(syscallMBB);
BB->addSuccessor(sinkMBB);
- BuildMI(BB, dl, TII.get(VE::BCRLrr))
+ BuildMI(BB, dl, TII.get(VE::BRCFLrr_t))
.addImm(VECC::CC_IGE)
.addReg(VE::SX11) // %sp
.addReg(VE::SX8) // %sl
@@ -102,23 +539,26 @@ bool VEInstrInfo::expandExtendStackPseudo(MachineInstr &MI) const {
// Update machine-CFG edges
BB->addSuccessor(sinkMBB);
- BuildMI(BB, dl, TII.get(VE::LDSri), VE::SX61)
+ BuildMI(BB, dl, TII.get(VE::LDrii), VE::SX61)
.addReg(VE::SX14)
+ .addImm(0)
.addImm(0x18);
BuildMI(BB, dl, TII.get(VE::ORri), VE::SX62)
.addReg(VE::SX0)
.addImm(0);
- BuildMI(BB, dl, TII.get(VE::LEAzzi), VE::SX63)
+ BuildMI(BB, dl, TII.get(VE::LEAzii), VE::SX63)
+ .addImm(0)
+ .addImm(0)
.addImm(0x13b);
- BuildMI(BB, dl, TII.get(VE::SHMri))
+ BuildMI(BB, dl, TII.get(VE::SHMLri))
.addReg(VE::SX61)
.addImm(0)
.addReg(VE::SX63);
- BuildMI(BB, dl, TII.get(VE::SHMri))
+ BuildMI(BB, dl, TII.get(VE::SHMLri))
.addReg(VE::SX61)
.addImm(8)
.addReg(VE::SX8);
- BuildMI(BB, dl, TII.get(VE::SHMri))
+ BuildMI(BB, dl, TII.get(VE::SHMLri))
.addReg(VE::SX61)
.addImm(16)
.addReg(VE::SX11);
@@ -131,3 +571,35 @@ bool VEInstrInfo::expandExtendStackPseudo(MachineInstr &MI) const {
MI.eraseFromParent(); // The pseudo instruction is gone now.
return true;
}
+
+bool VEInstrInfo::expandGetStackTopPseudo(MachineInstr &MI) const {
+ MachineBasicBlock *MBB = MI.getParent();
+ MachineFunction &MF = *MBB->getParent();
+ const VESubtarget &STI = MF.getSubtarget<VESubtarget>();
+ const VEInstrInfo &TII = *STI.getInstrInfo();
+ DebugLoc DL = MBB->findDebugLoc(MI);
+
+ // Create the following instruction:
+ //
+ // dst = %sp + target-specific frame size + the size of the parameter area
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+ const VEFrameLowering &TFL = *STI.getFrameLowering();
+
+ // The VE ABI requires a reserved area of 176 bytes at the top of the
+ // stack, as described in VESubtarget.cpp. So, we adjust for it here.
+ unsigned NumBytes = STI.getAdjustedFrameSize(0);
+
+ // Also add the size of the parameter area.
+ if (MFI.adjustsStack() && TFL.hasReservedCallFrame(MF))
+ NumBytes += MFI.getMaxCallFrameSize();
+
+ BuildMI(*MBB, MI, DL, TII.get(VE::LEArii))
+ .addDef(MI.getOperand(0).getReg())
+ .addReg(VE::SX11)
+ .addImm(0)
+ .addImm(NumBytes);
+
+ MI.eraseFromParent(); // The pseudo instruction is gone now.
+ return true;
+}
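As a rough worked example of the offset computed above, under stated assumptions only: getAdjustedFrameSize(0) is taken to be exactly the 176-byte reserved area, and the largest call frame in the function is taken to be 64 bytes.

constexpr unsigned ReservedArea = 176; // VE ABI reserved area (see comment above)
constexpr unsigned MaxCallFrame = 64;  // hypothetical example value
constexpr unsigned Offset = ReservedArea + MaxCallFrame;
static_assert(Offset == 240,
              "GETSTACKTOP then expands to roughly: lea %dst, 240(, %s11)");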
diff --git a/llvm/lib/Target/VE/VEInstrInfo.h b/llvm/lib/Target/VE/VEInstrInfo.h
index 6a26d0e95275..7b6662df1d60 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.h
+++ b/llvm/lib/Target/VE/VEInstrInfo.h
@@ -25,7 +25,6 @@ class VESubtarget;
class VEInstrInfo : public VEGenInstrInfo {
const VERegisterInfo RI;
- const VESubtarget &Subtarget;
virtual void anchor();
public:
@@ -37,10 +36,52 @@ public:
///
const VERegisterInfo &getRegisterInfo() const { return RI; }
+ /// Branch Analysis & Modification {
+ bool analyzeBranch(MachineBasicBlock &MBB, MachineBasicBlock *&TBB,
+ MachineBasicBlock *&FBB,
+ SmallVectorImpl<MachineOperand> &Cond,
+ bool AllowModify = false) const override;
+
+ unsigned removeBranch(MachineBasicBlock &MBB,
+ int *BytesRemoved = nullptr) const override;
+
+ unsigned insertBranch(MachineBasicBlock &MBB, MachineBasicBlock *TBB,
+ MachineBasicBlock *FBB, ArrayRef<MachineOperand> Cond,
+ const DebugLoc &DL,
+ int *BytesAdded = nullptr) const override;
+
+ bool
+ reverseBranchCondition(SmallVectorImpl<MachineOperand> &Cond) const override;
+ /// } Branch Analysis & Modification
+
+ void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
+ const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
+ bool KillSrc) const override;
+
+ /// Stack Spill & Reload {
+ unsigned isLoadFromStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ unsigned isStoreToStackSlot(const MachineInstr &MI,
+ int &FrameIndex) const override;
+ void storeRegToStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register SrcReg,
+ bool isKill, int FrameIndex,
+ const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+
+ void loadRegFromStackSlot(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, Register DestReg,
+ int FrameIndex, const TargetRegisterClass *RC,
+ const TargetRegisterInfo *TRI) const override;
+ /// } Stack Spill & Reload
+
+ Register getGlobalBaseReg(MachineFunction *MF) const;
+
// Lower pseudo instructions after register allocation.
bool expandPostRAPseudo(MachineInstr &MI) const override;
bool expandExtendStackPseudo(MachineInstr &MI) const;
+ bool expandGetStackTopPseudo(MachineInstr &MI) const;
};
} // namespace llvm
diff --git a/llvm/lib/Target/VE/VEInstrInfo.td b/llvm/lib/Target/VE/VEInstrInfo.td
index dc671aaa3f8d..8500f8ef1292 100644
--- a/llvm/lib/Target/VE/VEInstrInfo.td
+++ b/llvm/lib/Target/VE/VEInstrInfo.td
@@ -17,6 +17,94 @@
include "VEInstrFormats.td"
//===----------------------------------------------------------------------===//
+// Helper functions to retrieve target constants.
+//
+// VE instructions have space to hold the following immediates:
+// $sy has 7 bits to represent simm7, uimm7, simm7fp, or uimm7fp.
+// $sz also has 7 bits to represent mimm or mimmfp.
+// $disp has 32 bits to represent simm32.
+//
+// The mimm is a special immediate value: a sequential bit stream of 0s or 1s.
+// `(m)0`: Represents 0 sequence then 1 sequence like 0b00...0011...11,
+// where `m` is equal to the number of leading zeros.
+// `(m)1`: Represents 1 sequence then 0 sequence like 0b11...1100...00,
+// where `m` is equal to the number of leading ones.
+// Each bit of mimm's 7 bits is used as follows:
+// bit 6 : If `(m)0`, this bit is 1. Otherwise, this bit is 0.
+// bit 5-0: Represents the m (0-63).
+// Use `!add(m, 64)` to generate an immediate value in pattern matching.
+//
+// The floating-point immediate value is not a compacted encoding; it is
+// the plain integer bit representation, so it matches only rarely,
+// e.g. 0.0 (0x00000000) or -2.0 (0xC0000000=(2)1).
+//===----------------------------------------------------------------------===//
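A standalone C++ sketch of the mimm check and encoding described above; this illustrates the rule only and is not the in-tree isMImmVal/convMImmVal helpers, and edge cases such as (0)1 are ignored.

#include "llvm/Support/MathExtras.h"
using namespace llvm;

// An mimm is one contiguous run of ones anchored at bit 0 ("(m)0",
// m leading zeros) or at bit 63 ("(m)1", m leading ones).
static bool looksLikeMImm(uint64_t Val) {
  return isMask_64(Val) ||                       // (m)0: 0b00...0011...11
         (isShiftedMask_64(Val) && (Val >> 63)); // (m)1: 0b11...1100...00
}

// 7-bit encoding: bit 6 set for the (m)0 form, bits 5-0 hold m.
static unsigned encodeMImm(uint64_t Val) {
  if (isMask_64(Val))
    return 64 + countLeadingZeros(Val); // the `!add(m, 64)` form
  return countLeadingOnes(Val);
}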
+
+def ULO7 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue() & 0x7f,
+ SDLoc(N), MVT::i32);
+}]>;
+def LO7 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(SignExtend32(N->getSExtValue(), 7),
+ SDLoc(N), MVT::i32);
+}]>;
+def MIMM : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(convMImmVal(getImmVal(N)),
+ SDLoc(N), MVT::i32);
+}]>;
+def LO32 : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(Lo_32(N->getZExtValue()),
+ SDLoc(N), MVT::i32);
+}]>;
+def HI32 : SDNodeXForm<imm, [{
+ // Transformation function: shift the immediate value down into the low bits.
+ return CurDAG->getTargetConstant(Hi_32(N->getZExtValue()),
+ SDLoc(N), MVT::i32);
+}]>;
+
+def LO7FP : SDNodeXForm<fpimm, [{
+ uint64_t Val = getFpImmVal(N);
+ return CurDAG->getTargetConstant(SignExtend32(Val, 7), SDLoc(N), MVT::i32);
+}]>;
+def MIMMFP : SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstant(convMImmVal(getFpImmVal(N)),
+ SDLoc(N), MVT::i32);
+}]>;
+def LOFP32 : SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstant(Lo_32(getFpImmVal(N) & 0xffffffff),
+ SDLoc(N), MVT::i32);
+}]>;
+def HIFP32 : SDNodeXForm<fpimm, [{
+ return CurDAG->getTargetConstant(Hi_32(getFpImmVal(N)), SDLoc(N), MVT::i32);
+}]>;
+
+def icond2cc : SDNodeXForm<cond, [{
+ VECC::CondCode VECC = intCondCode2Icc(N->get());
+ return CurDAG->getTargetConstant(VECC, SDLoc(N), MVT::i32);
+}]>;
+
+def icond2ccSwap : SDNodeXForm<cond, [{
+ ISD::CondCode CC = getSetCCSwappedOperands(N->get());
+ VECC::CondCode VECC = intCondCode2Icc(CC);
+ return CurDAG->getTargetConstant(VECC, SDLoc(N), MVT::i32);
+}]>;
+
+def fcond2cc : SDNodeXForm<cond, [{
+ VECC::CondCode VECC = fpCondCode2Fcc(N->get());
+ return CurDAG->getTargetConstant(VECC, SDLoc(N), MVT::i32);
+}]>;
+
+def fcond2ccSwap : SDNodeXForm<cond, [{
+ ISD::CondCode CC = getSetCCSwappedOperands(N->get());
+ VECC::CondCode VECC = fpCondCode2Fcc(CC);
+ return CurDAG->getTargetConstant(VECC, SDLoc(N), MVT::i32);
+}]>;
+
+def CCOP : SDNodeXForm<imm, [{
+ return CurDAG->getTargetConstant(N->getZExtValue(),
+ SDLoc(N), MVT::i32);
+}]>;
+
+//===----------------------------------------------------------------------===//
// Feature predicates.
//===----------------------------------------------------------------------===//
@@ -24,42 +112,302 @@ include "VEInstrFormats.td"
// Instruction Pattern Stuff
//===----------------------------------------------------------------------===//
-def simm7 : PatLeaf<(imm), [{ return isInt<7>(N->getSExtValue()); }]>;
+// zero
+def ZeroAsmOperand : AsmOperandClass {
+ let Name = "Zero";
+}
+def zero : Operand<i32>, PatLeaf<(imm), [{
+ return N->getSExtValue() == 0; }]> {
+ let ParserMatchClass = ZeroAsmOperand;
+}
+
+// uimm0to2 - Special immediate value representing 0, 1, or 2.
+def UImm0to2AsmOperand : AsmOperandClass {
+ let Name = "UImm0to2";
+}
+def uimm0to2 : Operand<i32>, PatLeaf<(imm), [{
+ return N->getZExtValue() < 3; }], ULO7> {
+ let ParserMatchClass = UImm0to2AsmOperand;
+}
+
+// uimm1 - Generic immediate value.
+def UImm1AsmOperand : AsmOperandClass {
+ let Name = "UImm1";
+}
+def uimm1 : Operand<i32>, PatLeaf<(imm), [{
+ return isUInt<1>(N->getZExtValue()); }], ULO7> {
+ let ParserMatchClass = UImm1AsmOperand;
+}
+
+// uimm2 - Generic immediate value.
+def UImm2AsmOperand : AsmOperandClass {
+ let Name = "UImm2";
+}
+def uimm2 : Operand<i32>, PatLeaf<(imm), [{
+ return isUInt<2>(N->getZExtValue()); }], ULO7> {
+ let ParserMatchClass = UImm2AsmOperand;
+}
+
+// uimm3 - Generic immediate value.
+def UImm3AsmOperand : AsmOperandClass {
+ let Name = "UImm3";
+}
+def uimm3 : Operand<i32>, PatLeaf<(imm), [{
+ return isUInt<3>(N->getZExtValue()); }], ULO7> {
+ let ParserMatchClass = UImm3AsmOperand;
+}
+
+// uimm6 - Generic immediate value.
+def UImm6AsmOperand : AsmOperandClass {
+ let Name = "UImm6";
+}
+def uimm6 : Operand<i32>, PatLeaf<(imm), [{
+ return isUInt<6>(N->getZExtValue()); }], ULO7> {
+ let ParserMatchClass = UImm6AsmOperand;
+}
+
+// uimm7 - Generic immediate value.
+def UImm7AsmOperand : AsmOperandClass {
+ let Name = "UImm7";
+}
+def uimm7 : Operand<i32>, PatLeaf<(imm), [{
+ return isUInt<7>(N->getZExtValue()); }], ULO7> {
+ let ParserMatchClass = UImm7AsmOperand;
+}
+
+// simm7 - Generic immediate value.
+def SImm7AsmOperand : AsmOperandClass {
+ let Name = "SImm7";
+}
+def simm7 : Operand<i32>, PatLeaf<(imm), [{
+ return isInt<7>(N->getSExtValue()); }], LO7> {
+ let ParserMatchClass = SImm7AsmOperand;
+ let DecoderMethod = "DecodeSIMM7";
+}
+
+// mimm - Special immediate value: a sequential bit stream of 0s or 1s.
+def MImmAsmOperand : AsmOperandClass {
+ let Name = "MImm";
+ let ParserMethod = "parseMImmOperand";
+}
+def mimm : Operand<i32>, PatLeaf<(imm), [{
+ return isMImmVal(getImmVal(N)); }], MIMM> {
+ let ParserMatchClass = MImmAsmOperand;
+ let PrintMethod = "printMImmOperand";
+}
+
+// simm7fp - Generic fp immediate value.
+def simm7fp : Operand<i32>, PatLeaf<(fpimm), [{
+ return isInt<7>(getFpImmVal(N));
+ }], LO7FP> {
+ let ParserMatchClass = SImm7AsmOperand;
+ let DecoderMethod = "DecodeSIMM7";
+}
+
+// mimmfp - Special fp immediate value: a sequential bit stream of 0s or 1s.
+def mimmfp : Operand<i32>, PatLeaf<(fpimm), [{
+ return isMImmVal(getFpImmVal(N)); }], MIMMFP> {
+ let ParserMatchClass = MImmAsmOperand;
+ let PrintMethod = "printMImmOperand";
+}
+
+// mimmfp32 - 32-bit-wide mimmfp.
+// The float value is placed in the higher bits, so ignore the lower 32 bits.
+def mimmfp32 : Operand<i32>, PatLeaf<(fpimm), [{
+ return isMImm32Val(getFpImmVal(N) >> 32); }], MIMMFP> {
+ let ParserMatchClass = MImmAsmOperand;
+ let PrintMethod = "printMImmOperand";
+}
+
+// Other generic patterns to use in pattern matching.
def simm32 : PatLeaf<(imm), [{ return isInt<32>(N->getSExtValue()); }]>;
-def uimm6 : PatLeaf<(imm), [{ return isUInt<6>(N->getZExtValue()); }]>;
+def uimm32 : PatLeaf<(imm), [{ return isUInt<32>(N->getZExtValue()); }]>;
+def lomsbzero : PatLeaf<(imm), [{ return (N->getZExtValue() & 0x80000000)
+ == 0; }]>;
+def lozero : PatLeaf<(imm), [{ return (N->getZExtValue() & 0xffffffff)
+ == 0; }]>;
+def fplomsbzero : PatLeaf<(fpimm), [{ return (getFpImmVal(N) & 0x80000000)
+ == 0; }]>;
+def fplozero : PatLeaf<(fpimm), [{ return (getFpImmVal(N) & 0xffffffff)
+ == 0; }]>;
+
+def CCSIOp : PatLeaf<(cond), [{
+ switch (N->get()) {
+ default: return true;
+ case ISD::SETULT:
+ case ISD::SETULE:
+ case ISD::SETUGT:
+ case ISD::SETUGE: return false;
+ }
+}]>;
+
+def CCUIOp : PatLeaf<(cond), [{
+ switch (N->get()) {
+ default: return true;
+ case ISD::SETLT:
+ case ISD::SETLE:
+ case ISD::SETGT:
+ case ISD::SETGE: return false;
+ }
+}]>;
-// ASX format of memory address
-def MEMri : Operand<iPTR> {
+//===----------------------------------------------------------------------===//
+// Addressing modes.
+// SX-Aurora has following fields.
+// sz: register or 0
+// sy: register or immediate (-64 to 63)
+// disp: immediate (-2147483648 to 2147483647)
+//
+// There are two kinds of instruction.
+// ASX format uses sz + sy + disp.
+// AS format uses sz + disp.
+//
+// Moreover, there are four kinds of assembly instruction format.
+// ASX format uses "disp", "disp(, sz)", "disp(sy)", "disp(sy, sz)",
+// "(, sz)", "(sy)", or "(sy, sz)".
+// AS format uses "disp", "disp(, sz)", or "(, sz)" in general.
+// AS format in RRM format uses "disp", "disp(sz)", or "(sz)".
+// AS format in RRM format for host memory access uses "sz", "(sz)",
+// or "disp(sz)".
+//
+// They are defined below.
+//
+// ASX format:
+// MEMrri, MEMrii, MEMzri, MEMzii
+// AS format:
+// MEMriASX, MEMziASX : simple AS format
+// MEMriRRM, MEMziRRM : AS format in RRM format
+// MEMriHM, MEMziHM : AS format in RRM format for host memory access
+//===----------------------------------------------------------------------===//
+
+// DAG selections for both ASX and AS formats.
+def ADDRrri : ComplexPattern<iPTR, 3, "selectADDRrri", [frameindex], []>;
+def ADDRrii : ComplexPattern<iPTR, 3, "selectADDRrii", [frameindex], []>;
+def ADDRzri : ComplexPattern<iPTR, 3, "selectADDRzri", [], []>;
+def ADDRzii : ComplexPattern<iPTR, 3, "selectADDRzii", [], []>;
+def ADDRri : ComplexPattern<iPTR, 2, "selectADDRri", [frameindex], []>;
+def ADDRzi : ComplexPattern<iPTR, 2, "selectADDRzi", [], []>;
+
+// ASX format.
+def VEMEMrriAsmOperand : AsmOperandClass {
+ let Name = "MEMrri";
+ let ParserMethod = "parseMEMOperand";
+}
+def VEMEMriiAsmOperand : AsmOperandClass {
+ let Name = "MEMrii";
+ let ParserMethod = "parseMEMOperand";
+}
+def VEMEMzriAsmOperand : AsmOperandClass {
+ let Name = "MEMzri";
+ let ParserMethod = "parseMEMOperand";
+}
+def VEMEMziiAsmOperand : AsmOperandClass {
+ let Name = "MEMzii";
+ let ParserMethod = "parseMEMOperand";
+}
+
+// ASX format uses single assembly instruction format.
+def MEMrri : Operand<iPTR> {
+ let PrintMethod = "printMemASXOperand";
+ let MIOperandInfo = (ops ptr_rc, ptr_rc, i32imm);
+ let ParserMatchClass = VEMEMrriAsmOperand;
+}
+def MEMrii : Operand<iPTR> {
let PrintMethod = "printMemASXOperand";
- let MIOperandInfo = (ops ptr_rc, i64imm);
+ let MIOperandInfo = (ops ptr_rc, i32imm, i32imm);
+ let ParserMatchClass = VEMEMriiAsmOperand;
+}
+def MEMzri : Operand<iPTR> {
+ let PrintMethod = "printMemASXOperand";
+ let MIOperandInfo = (ops i32imm /* = 0 */, ptr_rc, i32imm);
+ let ParserMatchClass = VEMEMzriAsmOperand;
+}
+def MEMzii : Operand<iPTR> {
+ let PrintMethod = "printMemASXOperand";
+ let MIOperandInfo = (ops i32imm /* = 0 */, i32imm, i32imm);
+ let ParserMatchClass = VEMEMziiAsmOperand;
}
-// AS format of memory address
-def MEMASri : Operand<iPTR> {
- let PrintMethod = "printMemASOperand";
- let MIOperandInfo = (ops ptr_rc, i64imm);
+// AS format.
+def VEMEMriAsmOperand : AsmOperandClass {
+ let Name = "MEMri";
+ let ParserMethod = "parseMEMAsOperand";
+}
+def VEMEMziAsmOperand : AsmOperandClass {
+ let Name = "MEMzi";
+ let ParserMethod = "parseMEMAsOperand";
}
-// Branch targets have OtherVT type.
-def brtarget32 : Operand<OtherVT> {
- let EncoderMethod = "getBranchTarget32OpValue";
+// AS format uses multiple assembly instruction formats
+// 1. AS generic assembly instruction format:
+def MEMriASX : Operand<iPTR> {
+ let PrintMethod = "printMemASOperandASX";
+ let MIOperandInfo = (ops ptr_rc, i32imm);
+ let ParserMatchClass = VEMEMriAsmOperand;
+}
+def MEMziASX : Operand<iPTR> {
+ let PrintMethod = "printMemASOperandASX";
+ let MIOperandInfo = (ops i32imm /* = 0 */, i32imm);
+ let ParserMatchClass = VEMEMziAsmOperand;
}
-def simm7Op64 : Operand<i64> {
- let DecoderMethod = "DecodeSIMM7";
+// 2. AS RRM style assembly instruction format:
+def MEMriRRM : Operand<iPTR> {
+ let PrintMethod = "printMemASOperandRRM";
+ let MIOperandInfo = (ops ptr_rc, i32imm);
+ let ParserMatchClass = VEMEMriAsmOperand;
+}
+def MEMziRRM : Operand<iPTR> {
+ let PrintMethod = "printMemASOperandRRM";
+ let MIOperandInfo = (ops i32imm /* = 0 */, i32imm);
+ let ParserMatchClass = VEMEMziAsmOperand;
}
-def simm32Op64 : Operand<i64> {
- let DecoderMethod = "DecodeSIMM32";
+// 3. AS HM style assembly instruction format:
+def MEMriHM : Operand<iPTR> {
+ let PrintMethod = "printMemASOperandHM";
+ let MIOperandInfo = (ops ptr_rc, i32imm);
+ let ParserMatchClass = VEMEMriAsmOperand;
+}
+def MEMziHM : Operand<iPTR> {
+ let PrintMethod = "printMemASOperandHM";
+ let MIOperandInfo = (ops i32imm /* = 0 */, i32imm);
+ let ParserMatchClass = VEMEMziAsmOperand;
}
-def uimm6Op64 : Operand<i64> {
- let DecoderMethod = "DecodeUIMM6";
+//===----------------------------------------------------------------------===//
+// Other operands.
+//===----------------------------------------------------------------------===//
+
+// Branch targets have OtherVT type.
+def brtarget32 : Operand<OtherVT> {
+ let EncoderMethod = "getBranchTargetOpValue";
+ let DecoderMethod = "DecodeSIMM32";
}
// Operand for printing out a condition code.
-let PrintMethod = "printCCOperand" in
- def CCOp : Operand<i32>;
+def CCOpAsmOperand : AsmOperandClass { let Name = "CCOp"; }
+def CCOp : Operand<i32>, ImmLeaf<i32, [{
+ return Imm >= 0 && Imm < 22; }], CCOP> {
+ let PrintMethod = "printCCOperand";
+ let DecoderMethod = "DecodeCCOperand";
+ let EncoderMethod = "getCCOpValue";
+ let ParserMatchClass = CCOpAsmOperand;
+}
+
+// Operand for a rounding mode code.
+def RDOpAsmOperand : AsmOperandClass {
+ let Name = "RDOp";
+}
+def RDOp : Operand<i32> {
+ let PrintMethod = "printRDOperand";
+ let DecoderMethod = "DecodeRDOperand";
+ let EncoderMethod = "getRDOpValue";
+ let ParserMatchClass = RDOpAsmOperand;
+}
+
+def VEhi : SDNode<"VEISD::Hi", SDTIntUnaryOp>;
+def VElo : SDNode<"VEISD::Lo", SDTIntUnaryOp>;
// These are target-independent nodes, but have target-specific formats.
def SDT_SPCallSeqStart : SDCallSeqStart<[ SDTCisVT<0, i64>,
@@ -72,10 +420,29 @@ def callseq_start : SDNode<"ISD::CALLSEQ_START", SDT_SPCallSeqStart,
def callseq_end : SDNode<"ISD::CALLSEQ_END", SDT_SPCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-// def SDT_SPCall : SDTypeProfile<0, -1, [SDTCisVT<0, i64>]>;
+def SDT_SPCall : SDTypeProfile<0, -1, [SDTCisVT<0, i64>]>;
+def call : SDNode<"VEISD::CALL", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
def retflag : SDNode<"VEISD::RET_FLAG", SDTNone,
[SDNPHasChain, SDNPOptInGlue, SDNPVariadic]>;
+
+def getGOT : Operand<iPTR>;
+
+// GETFUNPLT for PIC
+def GetFunPLT : SDNode<"VEISD::GETFUNPLT", SDTIntUnaryOp>;
+
+// GETTLSADDR for TLS
+def GetTLSAddr : SDNode<"VEISD::GETTLSADDR", SDT_SPCall,
+ [SDNPHasChain, SDNPOptInGlue, SDNPOutGlue,
+ SDNPVariadic]>;
+
+// GETSTACKTOP
+def GetStackTop : SDNode<"VEISD::GETSTACKTOP", SDTNone,
+ [SDNPHasChain, SDNPSideEffect]>;
+
+
//===----------------------------------------------------------------------===//
// VE Flag Conditions
//===----------------------------------------------------------------------===//
@@ -107,168 +474,1243 @@ def CC_LENAN : CC_VAL<20>; // Less or Equal or NaN
def CC_AT : CC_VAL<21>; // Always true
//===----------------------------------------------------------------------===//
+// VE Rounding Mode
+//===----------------------------------------------------------------------===//
+
+// Note that these values must be kept in sync with the VERD::RoundingMode enum
+// values.
+class RD_VAL<int N> : PatLeaf<(i32 N)>;
+def RD_NONE : RD_VAL< 0>; // According to PSW
+def RD_RZ : RD_VAL< 8>; // Round toward Zero
+def RD_RP : RD_VAL< 9>; // Round toward Plus infinity
+def RD_RM : RD_VAL<10>; // Round toward Minus infinity
+def RD_RN : RD_VAL<11>; // Round to Nearest (ties to Even)
+def RD_RA : RD_VAL<12>; // Round to Nearest (ties to Away)
+
+//===----------------------------------------------------------------------===//
// VE Multiclasses for common instruction formats
//===----------------------------------------------------------------------===//
-multiclass RMm<string opcStr, bits<8>opc,
- RegisterClass RC, ValueType Ty, Operand immOp, Operand immOp2> {
- def rri : RM<
- opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, immOp2:$imm32),
- !strconcat(opcStr, " $sx, ${imm32}($sy, ${sz})")> {
- let cy = 1;
- let cz = 1;
- let hasSideEffects = 0;
+// Multiclass for generic RR type instructions
+let hasSideEffects = 0 in
+multiclass RRbm<string opcStr, bits<8>opc,
+ RegisterClass RCo, ValueType Tyo,
+ RegisterClass RCi, ValueType Tyi,
+ SDPatternOperator OpNode = null_frag,
+ Operand immOp = simm7, Operand mOp = mimm> {
+ def rr : RR<opc, (outs RCo:$sx), (ins RCi:$sy, RCi:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode Tyi:$sy, Tyi:$sz))]>;
+  // VE calculates (OpNode $sy, $sz), but llvm requires the immediate
+  // to be on the RHS, so we use the following definition.
+ let cy = 0 in
+ def ri : RR<opc, (outs RCo:$sx), (ins RCi:$sz, immOp:$sy),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode Tyi:$sz, (Tyi immOp:$sy)))]>;
+ let cz = 0 in
+ def rm : RR<opc, (outs RCo:$sx), (ins RCi:$sy, mOp:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode Tyi:$sy, (Tyi mOp:$sz)))]>;
+ let cy = 0, cz = 0 in
+ def im : RR<opc, (outs RCo:$sx), (ins immOp:$sy, mOp:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode (Tyi immOp:$sy), (Tyi mOp:$sz)))]>;
+}
+
+// Multiclass for non-commutative RR type instructions
+let hasSideEffects = 0 in
+multiclass RRNCbm<string opcStr, bits<8>opc,
+ RegisterClass RCo, ValueType Tyo,
+ RegisterClass RCi, ValueType Tyi,
+ SDPatternOperator OpNode = null_frag,
+ Operand immOp = simm7, Operand mOp = mimm> {
+ def rr : RR<opc, (outs RCo:$sx), (ins RCi:$sy, RCi:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode Tyi:$sy, Tyi:$sz))]>;
+ let cy = 0 in
+ def ir : RR<opc, (outs RCo:$sx), (ins immOp:$sy, RCi:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode (Tyi immOp:$sy), Tyi:$sz))]>;
+ let cz = 0 in
+ def rm : RR<opc, (outs RCo:$sx), (ins RCi:$sy, mOp:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode Tyi:$sy, (Tyi mOp:$sz)))]>;
+ let cy = 0, cz = 0 in
+ def im : RR<opc, (outs RCo:$sx), (ins immOp:$sy, mOp:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz"),
+ [(set Tyo:$sx, (OpNode (Tyi immOp:$sy), (Tyi mOp:$sz)))]>;
+}
+
+// Generic RR multiclass with 2 arguments.
+// e.g. ADDUL, ADDSWSX, ADDSWZX, etc.
+multiclass RRm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag,
+ Operand immOp = simm7, Operand mOp = mimm> :
+ RRbm<opcStr, opc, RC, Ty, RC, Ty, OpNode, immOp, mOp>;
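+// Note (illustrative expansion): an instantiation such as
+//   defm ADDSL : RRm<"adds.l", 0x59, I64, i64, add>;
+// (Section 8.4.3 below) produces ADDSLrr, ADDSLri, ADDSLrm, and ADDSLim,
+// all printed as "adds.l $sx, $sy, $sz" but taking register/register,
+// register/simm7, register/mimm, and simm7/mimm operands respectively.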
+
+// Generic RR multiclass for non-commutative instructions with 2 arguments.
+// e.g. SUBUL, SUBUW, SUBSWSX, etc.
+multiclass RRNCm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag,
+ Operand immOp = simm7, Operand mOp = mimm> :
+ RRNCbm<opcStr, opc, RC, Ty, RC, Ty, OpNode, immOp, mOp>;
+
+// Generic RR multiclass for floating point instructions with 2 arguments.
+// e.g. FADDD, FADDS, FSUBD, etc.
+multiclass RRFm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag,
+ Operand immOp = simm7fp, Operand mOp = mimmfp> :
+ RRNCbm<opcStr, opc, RC, Ty, RC, Ty, OpNode, immOp, mOp>;
+
+// Generic RR multiclass for shift instructions with 2 arguments.
+// e.g. SLL, SRL, SLAWSX, etc.
+let hasSideEffects = 0 in
+multiclass RRIm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ def rr : RR<opc, (outs RC:$sx), (ins RC:$sz, I32:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy"),
+ [(set Ty:$sx, (OpNode Ty:$sz, i32:$sy))]>;
+ let cz = 0 in
+ def mr : RR<opc, (outs RC:$sx), (ins mimm:$sz, I32:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy"),
+ [(set Ty:$sx, (OpNode (Ty mimm:$sz), i32:$sy))]>;
+ let cy = 0 in
+ def ri : RR<opc, (outs RC:$sx), (ins RC:$sz, uimm7:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy"),
+ [(set Ty:$sx, (OpNode Ty:$sz, (i32 uimm7:$sy)))]>;
+ let cy = 0, cz = 0 in
+ def mi : RR<opc, (outs RC:$sx), (ins mimm:$sz, uimm7:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy"),
+ [(set Ty:$sx, (OpNode (Ty mimm:$sz), (i32 uimm7:$sy)))]>;
+}
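+// Note (illustrative expansion): defm SLL : RRIm<"sll", 0x65, I64, i64, shl>;
+// (Section 8.6.1 below) yields SLLrr, SLLmr, SLLri, and SLLmi, where the
+// shift amount $sy is either an I32 register or a uimm7 immediate and the
+// shifted value $sz is either a register or an mimm constant.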
+
+// Special RR multiclass for the 128-bit shift left instruction.
+// e.g. SLD
+let Constraints = "$hi = $sx", DisableEncoding = "$hi", hasSideEffects = 0 in
+multiclass RRILDm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ def rrr : RR<opc, (outs RC:$sx), (ins RC:$hi, RC:$sz, I32:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cz = 0 in
+ def rmr : RR<opc, (outs RC:$sx), (ins RC:$hi, mimm:$sz, I32:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cy = 0 in
+ def rri : RR<opc, (outs RC:$sx), (ins RC:$hi, RC:$sz, uimm7:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cy = 0, cz = 0 in
+ def rmi : RR<opc, (outs RC:$sx), (ins RC:$hi, mimm:$sz, uimm7:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+}
+
+// Special RR multiclass for the 128-bit shift right instruction.
+// e.g. SRD
+let Constraints = "$low = $sx", DisableEncoding = "$low", hasSideEffects = 0 in
+multiclass RRIRDm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ def rrr : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$low, I32:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cz = 0 in
+ def mrr : RR<opc, (outs RC:$sx), (ins mimm:$sz, RC:$low, I32:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cy = 0 in
+ def rri : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$low, uimm7:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cy = 0, cz = 0 in
+ def mri : RR<opc, (outs RC:$sx), (ins mimm:$sz, RC:$low, uimm7:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+}
+
+// Generic RR multiclass with one argument.
+// e.g. LDZ, PCNT, and BRV
+let cy = 0, sy = 0, hasSideEffects = 0 in
+multiclass RRI1m<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ def r : RR<opc, (outs RC:$sx), (ins RC:$sz), !strconcat(opcStr, " $sx, $sz"),
+ [(set Ty:$sx, (OpNode Ty:$sz))]>;
+ let cz = 0 in
+ def m : RR<opc, (outs RC:$sx), (ins mimm:$sz),
+ !strconcat(opcStr, " $sx, $sz"),
+ [(set Ty:$sx, (OpNode (Ty mimm:$sz)))]>;
+}
+
+// Special RR multiclass for MRG instruction.
+// e.g. MRG
+let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0 in
+multiclass RRMRGm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty> {
+ def rr : RR<opc, (outs RC:$sx), (ins RC:$sy, RC:$sz, RC:$sd),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0 in
+ def ir : RR<opc, (outs RC:$sx), (ins simm7:$sy, RC:$sz, RC:$sd),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cz = 0 in
+ def rm : RR<opc, (outs RC:$sx), (ins RC:$sy, mimm:$sz, RC:$sd),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0, cz = 0 in
+ def im : RR<opc, (outs RC:$sx), (ins simm7:$sy, mimm:$sz, RC:$sd),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+}
+
+// Special RR multiclass for BSWP instruction.
+// e.g. BSWP
+let hasSideEffects = 0 in
+multiclass RRSWPm<string opcStr, bits<8>opc,
+ RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ let cy = 0 in
+ def ri : RR<opc, (outs RC:$sx), (ins RC:$sz, uimm1:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy"),
+ [(set Ty:$sx, (OpNode Ty:$sz, (i32 uimm1:$sy)))]>;
+ let cy = 0, cz = 0 in
+ def mi : RR<opc, (outs RC:$sx), (ins mimm:$sz, uimm1:$sy),
+ !strconcat(opcStr, " $sx, $sz, $sy"),
+ [(set Ty:$sx, (OpNode (Ty mimm:$sz), (i32 uimm1:$sy)))]>;
+}
+
+// Multiclass for CMOV instructions.
+// e.g. CMOVL, CMOVW, CMOVD, etc.
+let Constraints = "$sx = $sd", DisableEncoding = "$sd", hasSideEffects = 0,
+ cfw = ? in
+multiclass RRCMOVm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty> {
+ def rr : RR<opc, (outs I64:$sx), (ins CCOp:$cfw, RC:$sy, I64:$sz, I64:$sd),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cy = 0 in
+ def ir : RR<opc, (outs I64:$sx),
+ (ins CCOp:$cfw, simm7:$sy, I64:$sz, I64:$sd),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cz = 0 in
+ def rm : RR<opc, (outs I64:$sx),
+ (ins CCOp:$cfw, RC:$sy, mimm:$sz, I64:$sd),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+ let cy = 0, cz = 0 in
+ def im : RR<opc, (outs I64:$sx),
+ (ins CCOp:$cfw, simm7:$sy, mimm:$sz, I64:$sd),
+ !strconcat(opcStr, " $sx, $sz, $sy")>;
+}
+
+// Multiclass for floating point conversion instructions.
+// e.g. CVTWDSX, CVTWDZX, CVTWSSX, etc.
+// sz{3-0} = rounding mode
+let cz = 0, hasSideEffects = 0 in
+multiclass CVTRDm<string opcStr, bits<8> opc, RegisterClass RCo, ValueType Tyo,
+ RegisterClass RCi, ValueType Tyi> {
+ def r : RR<opc, (outs RCo:$sx), (ins RDOp:$rd, RCi:$sy),
+ !strconcat(opcStr, "${rd} $sx, $sy")> {
+ bits<4> rd;
+ let sz{5-4} = 0;
+ let sz{3-0} = rd;
}
- def zzi : RM<
- opc, (outs RC:$sx), (ins immOp2:$imm32),
- !strconcat(opcStr, " $sx, $imm32")> {
- let cy = 0;
- let sy = 0;
- let cz = 0;
- let sz = 0;
- let hasSideEffects = 0;
+ let cy = 0 in
+ def i : RR<opc, (outs RCo:$sx), (ins RDOp:$rd, simm7:$sy),
+ !strconcat(opcStr, "${rd} $sx, $sy")> {
+ bits<4> rd;
+ let sz{5-4} = 0;
+ let sz{3-0} = rd;
}
}
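+// Note (illustrative): ${rd} is an RDOp operand encoded into sz{3-0}; in
+// selection patterns it is supplied with one of the RD_* leaves above, e.g.
+// the fp_to_sint patterns near the end of this file use
+// (CVTWDSXr RD_RZ, $reg) to request round-toward-zero.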
-// Multiclass for RR type instructions
+// Multiclass for floating point conversion instructions.
+// e.g. CVTDW, CVTSW, CVTDL, etc.
+let cz = 0, sz = 0, hasSideEffects = 0 in
+multiclass CVTm<string opcStr, bits<8> opc, RegisterClass RCo, ValueType Tyo,
+ RegisterClass RCi, ValueType Tyi,
+ SDPatternOperator OpNode = null_frag> {
+ def r : RR<opc, (outs RCo:$sx), (ins RCi:$sy),
+ !strconcat(opcStr, " $sx, $sy"),
+ [(set Tyo:$sx, (OpNode Tyi:$sy))]>;
+ let cy = 0 in
+ def i : RR<opc, (outs RCo:$sx), (ins simm7:$sy),
+ !strconcat(opcStr, " $sx, $sy")>;
+}
-multiclass RRmrr<string opcStr, bits<8>opc,
- RegisterClass RCo, ValueType Tyo,
- RegisterClass RCi, ValueType Tyi> {
- def rr : RR<opc, (outs RCo:$sx), (ins RCi:$sy, RCi:$sz),
- !strconcat(opcStr, " $sx, $sy, $sz")>
- { let cy = 1; let cz = 1; let hasSideEffects = 0; }
+// Multiclass for PFCH instructions.
+// e.g. PFCH
+let sx = 0, hasSideEffects = 0 in
+multiclass PFCHm<string opcStr, bits<8>opc> {
+ def rri : RM<opc, (outs), (ins MEMrri:$addr), !strconcat(opcStr, " $addr"),
+ [(prefetch ADDRrri:$addr, imm, imm, (i32 1))]>;
+ let cy = 0 in
+ def rii : RM<opc, (outs), (ins MEMrii:$addr), !strconcat(opcStr, " $addr"),
+ [(prefetch ADDRrii:$addr, imm, imm, (i32 1))]>;
+ let cz = 0 in
+ def zri : RM<opc, (outs), (ins MEMzri:$addr), !strconcat(opcStr, " $addr"),
+ [(prefetch ADDRzri:$addr, imm, imm, (i32 1))]>;
+ let cy = 0, cz = 0 in
+ def zii : RM<opc, (outs), (ins MEMzii:$addr), !strconcat(opcStr, " $addr"),
+ [(prefetch ADDRzii:$addr, imm, imm, (i32 1))]>;
}
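+// Note (illustrative): the trailing (i32 1) in the patterns above is the
+// cache-type operand of ISD::PREFETCH, so only data-cache prefetches select
+// to PFCH; the rw and locality operands are matched as don't-cares.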
-multiclass RRmri<string opcStr, bits<8>opc,
- RegisterClass RCo, ValueType Tyo,
- RegisterClass RCi, ValueType Tyi, Operand immOp> {
- // VE calculates (OpNode $sy, $sz), but llvm requires to have immediate
- // in RHS, so we use following definition.
- def ri : RR<opc, (outs RCo:$sx), (ins RCi:$sz, immOp:$sy),
- !strconcat(opcStr, " $sx, $sy, $sz")>
- { let cy = 0; let cz = 1; let hasSideEffects = 0; }
+// Multiclass for CAS instructions.
+// e.g. TS1AML, TS1AMW, TS2AM, etc.
+let Constraints = "$dest = $sd", DisableEncoding = "$sd",
+ mayStore=1, mayLoad = 1, hasSideEffects = 0 in
+multiclass RRCAStgm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
+ Operand immOp, Operand MEM, Operand ADDR,
+ SDPatternOperator OpNode = null_frag> {
+ def r : RRM<opc, (outs RC:$dest), (ins MEM:$addr, RC:$sy, RC:$sd),
+ !strconcat(opcStr, " $dest, $addr, $sy"),
+ [(set Ty:$dest, (OpNode ADDR:$addr, Ty:$sy, Ty:$sd))]>;
+ let cy = 0 in
+ def i : RRM<opc, (outs RC:$dest), (ins MEM:$addr, immOp:$sy, RC:$sd),
+ !strconcat(opcStr, " $dest, $addr, $sy"),
+ [(set Ty:$dest, (OpNode ADDR:$addr, (Ty immOp:$sy), Ty:$sd))]>;
+}
+multiclass RRCASm<string opcStr, bits<8>opc, RegisterClass RC, ValueType Ty,
+ Operand immOp, SDPatternOperator OpNode = null_frag> {
+ defm ri : RRCAStgm<opcStr, opc, RC, Ty, immOp, MEMriRRM, ADDRri, OpNode>;
+ let cz = 0 in
+ defm zi : RRCAStgm<opcStr, opc, RC, Ty, immOp, MEMziRRM, ADDRzi, OpNode>;
}
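+// Note (illustrative): each RRCASm instantiation (TS1AM, TS2AM, TS3AM,
+// ATMAM, and CAS below) therefore covers four forms: base-register ("ri*")
+// vs. zero-base ("zi*") AS addressing, each with a register or immediate
+// $sy operand.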
-multiclass RRmiz<string opcStr, bits<8>opc,
- RegisterClass RCo, ValueType Tyo,
- RegisterClass RCi, ValueType Tyi, Operand immOp> {
- def zi : RR<opc, (outs RCo:$sx), (ins immOp:$sy),
- !strconcat(opcStr, " $sx, $sy")>
- { let cy = 0; let cz = 0; let sz = 0; let hasSideEffects = 0; }
+// Multiclass for branch instructions
+// e.g. BCFL, BCFW, BCFD, etc.
+let isBranch = 1, isTerminator = 1, isIndirectBranch = 1, hasSideEffects = 0 in
+multiclass BCbpfm<string opcStr, string cmpStr, bits<8> opc, dag cond,
+ Operand ADDR> {
+ let bpf = 0 /* NONE */ in
+ def "" : CF<opc, (outs), !con(cond, (ins ADDR:$addr)),
+ !strconcat(opcStr, " ", cmpStr, "$addr")>;
+  let bpf = 2 /* NOT TAKEN */ in
+ def _nt : CF<opc, (outs), !con(cond, (ins ADDR:$addr)),
+ !strconcat(opcStr, ".nt ", cmpStr, "$addr")>;
+  let bpf = 3 /* TAKEN */ in
+ def _t : CF<opc, (outs), !con(cond, (ins ADDR:$addr)),
+ !strconcat(opcStr, ".t ", cmpStr, "$addr")>;
+}
+multiclass BCtgm<string opcStr, string cmpStr, bits<8> opc, dag cond> {
+ defm ri : BCbpfm<opcStr, cmpStr, opc, cond, MEMriASX>;
+ let cz = 0 in defm zi : BCbpfm<opcStr, cmpStr, opc, cond, MEMziASX>;
+}
+multiclass BCm<string opcStr, string opcStrAt, string opcStrAf, bits<8> opc,
+ RegisterClass RC, Operand immOp> {
+ let DecoderMethod = "DecodeBranchCondition" in
+ defm r : BCtgm<opcStr, "$comp, ", opc, (ins CCOp:$cond, RC:$comp)>;
+ let DecoderMethod = "DecodeBranchCondition", cy = 0 in
+ defm i : BCtgm<opcStr, "$comp, ", opc, (ins CCOp:$cond, immOp:$comp)>;
+ let DecoderMethod = "DecodeBranchConditionAlways", cy = 0, sy = 0,
+ cf = 15 /* AT */, isBarrier = 1 in
+ defm a : BCtgm<opcStrAt, "", opc, (ins)>;
+ let DecoderMethod = "DecodeBranchConditionAlways", cy = 0, sy = 0,
+ cf = 0 /* AF */ in
+ defm na : BCtgm<opcStrAf, "", opc, (ins)>;
}
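+// Note (illustrative expansion): defm BCFL : BCm<...> (Section 8.8.1 below)
+// produces records such as BCFLrri (conditional, register comparand) and
+// BCFLari_t / BCFLazi_t (the unconditional "always" forms with branch
+// prediction set to taken), the latter two being used by the indirect
+// branch patterns later in this file.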
-multiclass RRNDmrm<string opcStr, bits<8>opc,
- RegisterClass RCo, ValueType Tyo,
- RegisterClass RCi, ValueType Tyi, Operand immOp2> {
- def rm0 : RR<opc, (outs RCo:$sx), (ins RCi:$sy, immOp2:$sz),
- !strconcat(opcStr, " $sx, $sy, (${sz})0")> {
- let cy = 1;
- let cz = 0;
- let sz{6} = 1;
- // (guess) tblgen conservatively assumes hasSideEffects when
- // it fails to infer from a pattern.
- let hasSideEffects = 0;
- }
+// Multiclass for relative branch instructions
+// e.g. BRCFL, BRCFW, BRCFD, etc.
+let isBranch = 1, isTerminator = 1, hasSideEffects = 0 in
+multiclass BCRbpfm<string opcStr, string cmpStr, bits<8> opc, dag cond> {
+ let bpf = 0 /* NONE */ in
+ def "" : CF<opc, (outs), !con(cond, (ins brtarget32:$imm32)),
+ !strconcat(opcStr, " ", cmpStr, "$imm32")>;
+  let bpf = 2 /* NOT TAKEN */ in
+ def _nt : CF<opc, (outs), !con(cond, (ins brtarget32:$imm32)),
+ !strconcat(opcStr, ".nt ", cmpStr, "$imm32")>;
+  let bpf = 3 /* TAKEN */ in
+ def _t : CF<opc, (outs), !con(cond, (ins brtarget32:$imm32)),
+ !strconcat(opcStr, ".t ", cmpStr, "$imm32")>;
+}
+multiclass BCRm<string opcStr, string opcStrAt, string opcStrAf, bits<8> opc,
+ RegisterClass RC, Operand immOp> {
+ defm rr : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, RC:$sy, RC:$sz)>;
+ let cy = 0 in
+ defm ir : BCRbpfm<opcStr, "$sy, $sz, ", opc, (ins CCOp:$cf, immOp:$sy, RC:$sz)>;
+ let cy = 0, sy = 0, cz = 0, sz = 0, cf = 15 /* AT */, isBarrier = 1 in
+ defm a : BCRbpfm<opcStrAt, "", opc, (ins)>;
+ let cy = 0, sy = 0, cz = 0, sz = 0, cf = 0 /* AF */ in
+ defm na : BCRbpfm<opcStrAf, "", opc, (ins)>;
}
-// Used by add, mul, div, and similar commutative instructions
-// The order of operands are "$sx, $sy, $sz"
+// Multiclass for communication register instructions.
+// e.g. LCR
+let hasSideEffects = 1 in
+multiclass LOADCRm<string opcStr, bits<8>opc, RegisterClass RC> {
+ def rr : RR<opc, (outs RC:$sx), (ins RC:$sz, RC:$sy),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0 in def ri : RR<opc, (outs RC:$sx), (ins RC:$sz, simm7:$sy),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cz = 0 in def zr : RR<opc, (outs RC:$sx), (ins zero:$sz, RC:$sy),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0, cz = 0 in
+ def zi : RR<opc, (outs RC:$sx), (ins zero:$sz, simm7:$sy),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+}
-multiclass RRm<string opcStr, bits<8>opc,
- RegisterClass RC, ValueType Ty, Operand immOp, Operand immOp2> :
- RRmrr<opcStr, opc, RC, Ty, RC, Ty>,
- RRmri<opcStr, opc, RC, Ty, RC, Ty, immOp>,
- RRmiz<opcStr, opc, RC, Ty, RC, Ty, immOp>,
- RRNDmrm<opcStr, opc, RC, Ty, RC, Ty, immOp2>;
-
-// Branch multiclass
-let isBranch = 1, isTerminator = 1, hasDelaySlot = 1 in
-multiclass BCRm<string opcStr, string opcStrAt, bits<8> opc,
- RegisterClass RC, ValueType Ty, Operand immOp, Operand immOp2> {
- def rr : CF<
- opc, (outs),
- (ins CCOp:$cf, RC:$sy, RC:$sz, brtarget32:$imm32),
- !strconcat(opcStr, " $sy, $sz, $imm32")> {
- let cy = 1;
- let cz = 1;
- let hasSideEffects = 0;
- }
+// Multiclass for communication register instructions.
+// e.g. SCR
+let hasSideEffects = 1 in
+multiclass STORECRm<string opcStr, bits<8>opc, RegisterClass RC> {
+ def rr : RR<opc, (outs), (ins RC:$sz, RC:$sy, RC:$sx),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0 in def ri : RR<opc, (outs), (ins RC:$sz, simm7:$sy, RC:$sx),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cz = 0 in def zr : RR<opc, (outs), (ins zero:$sz, RC:$sy, RC:$sx),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0, cz = 0 in
+ def zi : RR<opc, (outs), (ins zero:$sz, simm7:$sy, RC:$sx),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
}
+// Multiclass for communication register instructions.
+// e.g. FIDCR
+let cz = 0, hasSideEffects = 1 in
+multiclass FIDCRm<string opcStr, bits<8>opc, RegisterClass RC> {
+ def ri : RR<opc, (outs RC:$sx), (ins RC:$sy, uimm3:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+ let cy = 0 in def ii : RR<opc, (outs RC:$sx), (ins simm7:$sy, uimm3:$sz),
+ !strconcat(opcStr, " $sx, $sy, $sz")>;
+}
+
+// Multiclass for LHM instruction.
+let mayLoad = 1, hasSideEffects = 0 in
+multiclass LHMm<string opcStr, bits<8> opc, RegisterClass RC> {
+ def ri : RRMHM<opc, (outs RC:$dest), (ins MEMriHM:$addr),
+ !strconcat(opcStr, " $dest, $addr")>;
+ let cz = 0 in
+ def zi : RRMHM<opc, (outs RC:$dest), (ins MEMziHM:$addr),
+ !strconcat(opcStr, " $dest, $addr")>;
+}
+
+// Multiclass for SHM instruction.
+let mayStore = 1, hasSideEffects = 0 in
+multiclass SHMm<string opcStr, bits<8> opc, RegisterClass RC> {
+ def ri : RRMHM<opc, (outs), (ins MEMriHM:$addr, RC:$sx),
+ !strconcat(opcStr, " $sx, $addr")>;
+ let cz = 0 in
+ def zi : RRMHM<opc, (outs), (ins MEMziHM:$addr, RC:$sx),
+ !strconcat(opcStr, " $sx, $addr")>;
+}
//===----------------------------------------------------------------------===//
// Instructions
+//
+// Define all scalar instructions defined in the SX-Aurora TSUBASA
+// Architecture Guide here. For mnemonics, we use those defined in the
+// Vector Engine Assembly Language Reference Manual.
//===----------------------------------------------------------------------===//
-// LEA and LEASL instruction (load 32 bit imm to low or high part)
-let cx = 0 in
-defm LEA : RMm<"lea", 0x06, I64, i64, simm7Op64, simm32Op64>;
+//-----------------------------------------------------------------------------
+// Section 8.2 - Load/Store instructions
+//-----------------------------------------------------------------------------
+
+// Multiclass for generic RM instructions
+multiclass RMm<string opcStr, bits<8>opc, RegisterClass RC> {
+ def rri : RM<opc, (outs RC:$dest), (ins MEMrri:$addr),
+ !strconcat(opcStr, " $dest, $addr"), []>;
+ let cy = 0 in
+ def rii : RM<opc, (outs RC:$dest), (ins MEMrii:$addr),
+ !strconcat(opcStr, " $dest, $addr"), []>;
+ let cz = 0 in
+ def zri : RM<opc, (outs RC:$dest), (ins MEMzri:$addr),
+ !strconcat(opcStr, " $dest, $addr"), []>;
+ let cy = 0, cz = 0 in
+ def zii : RM<opc, (outs RC:$dest), (ins MEMzii:$addr),
+ !strconcat(opcStr, " $dest, $addr"), []>;
+}
+
+// Section 8.2.1 - LEA
+let cx = 0, DecoderMethod = "DecodeLoadI64" in
+defm LEA : RMm<"lea", 0x06, I64>;
+let cx = 1, DecoderMethod = "DecodeLoadI64" in
+defm LEASL : RMm<"lea.sl", 0x06, I64>;
+let cx = 0, DecoderMethod = "DecodeLoadI32", isCodeGenOnly = 1 in
+defm LEA32 : RMm<"lea", 0x06, I32>;
+
+def : Pat<(iPTR ADDRrri:$addr), (LEArri MEMrri:$addr)>;
+def : Pat<(iPTR ADDRrii:$addr), (LEArii MEMrii:$addr)>;
+def : Pat<(add I64:$base, simm32:$disp), (LEArii $base, 0, (LO32 $disp))>;
+def : Pat<(add I64:$base, lozero:$disp), (LEASLrii $base, 0, (HI32 $disp))>;
+def : Pat<(add I32:$base, simm32:$disp),
+ (LEA32rii (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $base, sub_i32), 0,
+ (LO32 $disp))>;
+
+def lea_add : PatFrags<(ops node:$base, node:$idx, node:$disp),
+ [(add (add node:$base, node:$idx), node:$disp),
+ (add (add node:$base, node:$disp), node:$idx)]>;
+def : Pat<(lea_add I64:$base, simm7:$idx, simm32:$disp),
+ (LEArii $base, (LO7 $idx), (LO32 $disp))>;
+def : Pat<(lea_add I64:$base, I64:$idx, simm32:$disp),
+ (LEArri $base, $idx, (LO32 $disp))>;
+def : Pat<(lea_add I64:$base, simm7:$idx, lozero:$disp),
+ (LEASLrii $base, (LO7 $idx), (HI32 $disp))>;
+def : Pat<(lea_add I64:$base, I64:$idx, lozero:$disp),
+ (LEASLrri $base, $idx, (HI32 $disp))>;
+
+// Multiclass for load instructions.
+let mayLoad = 1, hasSideEffects = 0 in
+multiclass LOADm<string opcStr, bits<8> opc, RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ def rri : RM<opc, (outs RC:$dest), (ins MEMrri:$addr),
+ !strconcat(opcStr, " $dest, $addr"),
+ [(set Ty:$dest, (OpNode ADDRrri:$addr))]>;
+ let cy = 0 in
+ def rii : RM<opc, (outs RC:$dest), (ins MEMrii:$addr),
+ !strconcat(opcStr, " $dest, $addr"),
+ [(set Ty:$dest, (OpNode ADDRrii:$addr))]>;
+ let cz = 0 in
+ def zri : RM<opc, (outs RC:$dest), (ins MEMzri:$addr),
+ !strconcat(opcStr, " $dest, $addr"),
+ [(set Ty:$dest, (OpNode ADDRzri:$addr))]>;
+ let cy = 0, cz = 0 in
+ def zii : RM<opc, (outs RC:$dest), (ins MEMzii:$addr),
+ !strconcat(opcStr, " $dest, $addr"),
+ [(set Ty:$dest, (OpNode ADDRzii:$addr))]>;
+}
+
+// Section 8.2.2 - LDS
+let DecoderMethod = "DecodeLoadI64" in
+defm LD : LOADm<"ld", 0x01, I64, i64, load>;
+def : Pat<(f64 (load ADDRrri:$addr)), (LDrri MEMrri:$addr)>;
+def : Pat<(f64 (load ADDRrii:$addr)), (LDrii MEMrii:$addr)>;
+def : Pat<(f64 (load ADDRzri:$addr)), (LDzri MEMzri:$addr)>;
+def : Pat<(f64 (load ADDRzii:$addr)), (LDzii MEMzii:$addr)>;
+
+// Section 8.2.3 - LDU
+let DecoderMethod = "DecodeLoadF32" in
+defm LDU : LOADm<"ldu", 0x02, F32, f32, load>;
+
+// Section 8.2.4 - LDL
+let DecoderMethod = "DecodeLoadI32" in
+defm LDLSX : LOADm<"ldl.sx", 0x03, I32, i32, load>;
+let cx = 1, DecoderMethod = "DecodeLoadI32" in
+defm LDLZX : LOADm<"ldl.zx", 0x03, I32, i32, load>;
+
+// Section 8.2.5 - LD2B
+let DecoderMethod = "DecodeLoadI32" in
+defm LD2BSX : LOADm<"ld2b.sx", 0x04, I32, i32, sextloadi16>;
+let cx = 1, DecoderMethod = "DecodeLoadI32" in
+defm LD2BZX : LOADm<"ld2b.zx", 0x04, I32, i32, zextloadi16>;
+
+// Section 8.2.6 - LD1B
+let DecoderMethod = "DecodeLoadI32" in
+defm LD1BSX : LOADm<"ld1b.sx", 0x05, I32, i32, sextloadi8>;
+let cx = 1, DecoderMethod = "DecodeLoadI32" in
+defm LD1BZX : LOADm<"ld1b.zx", 0x05, I32, i32, zextloadi8>;
+
+// Multiclass for store instructions.
+let mayStore = 1 in
+multiclass STOREm<string opcStr, bits<8> opc, RegisterClass RC, ValueType Ty,
+ SDPatternOperator OpNode = null_frag> {
+ def rri : RM<opc, (outs), (ins MEMrri:$addr, RC:$sx),
+ !strconcat(opcStr, " $sx, $addr"),
+ [(OpNode Ty:$sx, ADDRrri:$addr)]>;
+ let cy = 0 in
+ def rii : RM<opc, (outs), (ins MEMrii:$addr, RC:$sx),
+ !strconcat(opcStr, " $sx, $addr"),
+ [(OpNode Ty:$sx, ADDRrii:$addr)]>;
+ let cz = 0 in
+ def zri : RM<opc, (outs), (ins MEMzri:$addr, RC:$sx),
+ !strconcat(opcStr, " $sx, $addr"),
+ [(OpNode Ty:$sx, ADDRzri:$addr)]>;
+ let cy = 0, cz = 0 in
+ def zii : RM<opc, (outs), (ins MEMzii:$addr, RC:$sx),
+ !strconcat(opcStr, " $sx, $addr"),
+ [(OpNode Ty:$sx, ADDRzii:$addr)]>;
+}
+
+// Section 8.2.7 - STS
+let DecoderMethod = "DecodeStoreI64" in
+defm ST : STOREm<"st", 0x11, I64, i64, store>;
+def : Pat<(store f64:$src, ADDRrri:$addr), (STrri MEMrri:$addr, $src)>;
+def : Pat<(store f64:$src, ADDRrii:$addr), (STrii MEMrii:$addr, $src)>;
+def : Pat<(store f64:$src, ADDRzri:$addr), (STzri MEMzri:$addr, $src)>;
+def : Pat<(store f64:$src, ADDRzii:$addr), (STzii MEMzii:$addr, $src)>;
+
+// Section 8.2.8 - STU
+let DecoderMethod = "DecodeStoreF32" in
+defm STU : STOREm<"stu", 0x12, F32, f32, store>;
+
+// Section 8.2.9 - STL
+let DecoderMethod = "DecodeStoreI32" in
+defm STL : STOREm<"stl", 0x13, I32, i32, store>;
+
+// Section 8.2.10 - ST2B
+let DecoderMethod = "DecodeStoreI32" in
+defm ST2B : STOREm<"st2b", 0x14, I32, i32, truncstorei16>;
+
+// Section 8.2.11 - ST1B
+let DecoderMethod = "DecodeStoreI32" in
+defm ST1B : STOREm<"st1b", 0x15, I32, i32, truncstorei8>;
+
+// Section 8.2.12 - DLDS
+let DecoderMethod = "DecodeLoadI64" in
+defm DLD : LOADm<"dld", 0x09, I64, i64, load>;
+
+// Section 8.2.13 - DLDU
+let DecoderMethod = "DecodeLoadF32" in
+defm DLDU : LOADm<"dldu", 0x0a, F32, f32, load>;
+
+// Section 8.2.14 - DLDL
+let DecoderMethod = "DecodeLoadI32" in
+defm DLDLSX : LOADm<"dldl.sx", 0x0b, I32, i32, load>;
+let cx = 1, DecoderMethod = "DecodeLoadI32" in
+defm DLDLZX : LOADm<"dldl.zx", 0x0b, I32, i32, load>;
+
+// Section 8.2.15 - PFCH
+let DecoderMethod = "DecodeASX" in
+defm PFCH : PFCHm<"pfch", 0x0c>;
+
+// Section 8.2.16 - TS1AM (Test and Set 1 AM)
+let DecoderMethod = "DecodeTS1AMI64" in
+defm TS1AML : RRCASm<"ts1am.l", 0x42, I64, i64, uimm7>;
+let DecoderMethod = "DecodeTS1AMI32", cx = 1 in
+defm TS1AMW : RRCASm<"ts1am.w", 0x42, I32, i32, uimm7>;
+
+// Section 8.2.17 - TS2AM (Test and Set 2 AM)
+let DecoderMethod = "DecodeTS1AMI64" in
+defm TS2AM : RRCASm<"ts2am", 0x43, I64, i64, uimm7>;
+
+// Section 8.2.18 - TS3AM (Test and Set 3 AM)
+let DecoderMethod = "DecodeTS1AMI64" in
+defm TS3AM : RRCASm<"ts3am", 0x52, I64, i64, uimm1>;
+
+// Section 8.2.19 - ATMAM (Atomic AM)
+let DecoderMethod = "DecodeTS1AMI64" in
+defm ATMAM : RRCASm<"atmam", 0x53, I64, i64, uimm0to2>;
+
+// Section 8.2.20 - CAS (Compare and Swap)
+let DecoderMethod = "DecodeCASI64" in
+defm CASL : RRCASm<"cas.l", 0x62, I64, i64, simm7>;
+let DecoderMethod = "DecodeCASI32", cx = 1 in
+defm CASW : RRCASm<"cas.w", 0x62, I32, i32, simm7>;
+
+//-----------------------------------------------------------------------------
+// Section 8.3 - Transfer Control Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.3.1 - FENCE (Fence)
+let hasSideEffects = 1 in {
+ let avo = 1 in def FENCEI : RRFENCE<0x20, (outs), (ins), "fencei">;
+ def FENCEM : RRFENCE<0x20, (outs), (ins uimm2:$kind), "fencem $kind"> {
+ bits<2> kind;
+ let lf = kind{1};
+ let sf = kind{0};
+ }
+ def FENCEC : RRFENCE<0x20, (outs), (ins uimm3:$kind), "fencec $kind"> {
+ bits<3> kind;
+ let c2 = kind{2};
+ let c1 = kind{1};
+ let c0 = kind{0};
+ }
+}
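+// Note (illustrative encoding): "fencem 3" sets both lf (kind{1}) and sf
+// (kind{0}), and "fencec 7" sets c2, c1, and c0.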
+
+// Section 8.3.2 - SVOB (Set Vector Out-of-order memory access Boundary)
+let sx = 0, cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 1 in
+def SVOB : RR<0x30, (outs), (ins), "svob">;
+
+//-----------------------------------------------------------------------------
+// Section 8.4 - Fixed-point Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.4.1 - ADD (Add)
+defm ADDUL : RRm<"addu.l", 0x48, I64, i64>;
+let cx = 1 in defm ADDUW : RRm<"addu.w", 0x48, I32, i32>;
+
+// Section 8.4.2 - ADS (Add Single)
+defm ADDSWSX : RRm<"adds.w.sx", 0x4A, I32, i32, add>;
+let cx = 1 in defm ADDSWZX : RRm<"adds.w.zx", 0x4A, I32, i32>;
+
+// Section 8.4.3 - ADX (Add)
+defm ADDSL : RRm<"adds.l", 0x59, I64, i64, add>;
+
+// Section 8.4.4 - SUB (Subtract)
+defm SUBUL : RRNCm<"subu.l", 0x58, I64, i64>;
+let cx = 1 in defm SUBUW : RRNCm<"subu.w", 0x58, I32, i32>;
+
+// Section 8.4.5 - SBS (Subtract Single)
+defm SUBSWSX : RRNCm<"subs.w.sx", 0x5A, I32, i32, sub>;
+let cx = 1 in defm SUBSWZX : RRNCm<"subs.w.zx", 0x5A, I32, i32>;
+
+// Section 8.4.6 - SBX (Subtract)
+defm SUBSL : RRNCm<"subs.l", 0x5B, I64, i64, sub>;
+
+// Section 8.4.7 - MPY (Multiply)
+defm MULUL : RRm<"mulu.l", 0x49, I64, i64>;
+let cx = 1 in defm MULUW : RRm<"mulu.w", 0x49, I32, i32>;
+
+// Section 8.4.8 - MPS (Multiply Single)
+defm MULSWSX : RRm<"muls.w.sx", 0x4B, I32, i32, mul>;
+let cx = 1 in defm MULSWZX : RRm<"muls.w.zx", 0x4B, I32, i32>;
+
+// Section 8.4.9 - MPX (Multiply)
+defm MULSL : RRm<"muls.l", 0x6E, I64, i64, mul>;
+
+// Section 8.4.10 - MPD (Multiply)
+defm MULSLW : RRbm<"muls.l.w", 0x6B, I64, i64, I32, i32>;
+
+// Section 8.4.11 - DIV (Divide)
+defm DIVUL : RRNCm<"divu.l", 0x6F, I64, i64, udiv>;
+let cx = 1 in defm DIVUW : RRNCm<"divu.w", 0x6F, I32, i32, udiv>;
+
+// Section 8.4.12 - DVS (Divide Single)
+defm DIVSWSX : RRNCm<"divs.w.sx", 0x7B, I32, i32, sdiv>;
+let cx = 1 in defm DIVSWZX : RRNCm<"divs.w.zx", 0x7B, I32, i32>;
+
+// Section 8.4.13 - DVX (Divide)
+defm DIVSL : RRNCm<"divs.l", 0x7F, I64, i64, sdiv>;
+
+// Section 8.4.14 - CMP (Compare)
+defm CMPUL : RRNCm<"cmpu.l", 0x55, I64, i64>;
+let cx = 1 in defm CMPUW : RRNCm<"cmpu.w", 0x55, I32, i32>;
+
+// Section 8.4.15 - CPS (Compare Single)
+defm CMPSWSX : RRNCm<"cmps.w.sx", 0x7A, I32, i32>;
+let cx = 1 in defm CMPSWZX : RRNCm<"cmps.w.zx", 0x7A, I32, i32>;
+
+// Section 8.4.16 - CPX (Compare)
+defm CMPSL : RRNCm<"cmps.l", 0x6A, I64, i64>;
+
+// Section 8.4.17 - CMS (Compare and Select Maximum/Minimum Single)
+// cx: sx/zx, cw: max/min
+defm MAXSWSX : RRm<"maxs.w.sx", 0x78, I32, i32>;
+let cx = 1 in defm MAXSWZX : RRm<"maxs.w.zx", 0x78, I32, i32>;
+let cw = 1 in defm MINSWSX : RRm<"mins.w.sx", 0x78, I32, i32>;
+let cx = 1, cw = 1 in defm MINSWZX : RRm<"mins.w.zx", 0x78, I32, i32>;
+
+// Section 8.4.18 - CMX (Compare and Select Maximum/Minimum)
+defm MAXSL : RRm<"maxs.l", 0x68, I64, i64>;
+let cw = 1 in defm MINSL : RRm<"mins.l", 0x68, I64, i64>;
+
+//-----------------------------------------------------------------------------
+// Section 8.5 - Logical Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.5.1 - AND (AND)
+defm AND : RRm<"and", 0x44, I64, i64, and>;
+let isCodeGenOnly = 1 in defm AND32 : RRm<"and", 0x44, I32, i32, and>;
+
+// Section 8.5.2 - OR (OR)
+defm OR : RRm<"or", 0x45, I64, i64, or>;
+let isCodeGenOnly = 1 in defm OR32 : RRm<"or", 0x45, I32, i32, or>;
+
+// Section 8.5.3 - XOR (Exclusive OR)
+defm XOR : RRm<"xor", 0x46, I64, i64, xor>;
+let isCodeGenOnly = 1 in defm XOR32 : RRm<"xor", 0x46, I32, i32, xor>;
+
+// Section 8.5.4 - EQV (Equivalence)
+defm EQV : RRm<"eqv", 0x47, I64, i64>;
+
+// Section 8.5.5 - NND (Negate AND)
+def and_not : PatFrags<(ops node:$x, node:$y),
+ [(and (not node:$x), node:$y)]>;
+defm NND : RRNCm<"nnd", 0x54, I64, i64, and_not>;
+
+// Section 8.5.6 - MRG (Merge)
+defm MRG : RRMRGm<"mrg", 0x56, I64, i64>;
+
+// Section 8.5.7 - LDZ (Leading Zero Count)
+defm LDZ : RRI1m<"ldz", 0x67, I64, i64, ctlz>;
+
+// Section 8.5.8 - PCNT (Population Count)
+defm PCNT : RRI1m<"pcnt", 0x38, I64, i64, ctpop>;
+
+// Section 8.5.9 - BRV (Bit Reverse)
+defm BRV : RRI1m<"brv", 0x39, I64, i64, bitreverse>;
+
+// Section 8.5.10 - BSWP (Byte Swap)
+defm BSWP : RRSWPm<"bswp", 0x2B, I64, i64>;
+
+// Section 8.5.11 - CMOV (Conditional Move)
+let cw = 0, cw2 = 0 in defm CMOVL : RRCMOVm<"cmov.l.${cfw}", 0x3B, I64, i64>;
+let cw = 1, cw2 = 0 in defm CMOVW : RRCMOVm<"cmov.w.${cfw}", 0x3B, I32, i32>;
+let cw = 0, cw2 = 1 in defm CMOVD : RRCMOVm<"cmov.d.${cfw}", 0x3B, I64, f64>;
+let cw = 1, cw2 = 1 in defm CMOVS : RRCMOVm<"cmov.s.${cfw}", 0x3B, F32, f32>;
+def : MnemonicAlias<"cmov.l", "cmov.l.at">;
+def : MnemonicAlias<"cmov.w", "cmov.w.at">;
+def : MnemonicAlias<"cmov.d", "cmov.d.at">;
+def : MnemonicAlias<"cmov.s", "cmov.s.at">;
+
+//-----------------------------------------------------------------------------
+// Section 8.6 - Shift Operation Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.6.1 - SLL (Shift Left Logical)
+defm SLL : RRIm<"sll", 0x65, I64, i64, shl>;
+
+// Section 8.6.2 - SLD (Shift Left Double)
+defm SLD : RRILDm<"sld", 0x64, I64, i64>;
+
+// Section 8.6.3 - SRL (Shift Right Logical)
+defm SRL : RRIm<"srl", 0x75, I64, i64, srl>;
+
+// Section 8.6.4 - SRD (Shift Right Double)
+defm SRD : RRIRDm<"srd", 0x74, I64, i64>;
+
+// Section 8.6.5 - SLA (Shift Left Arithmetic)
+defm SLAWSX : RRIm<"sla.w.sx", 0x66, I32, i32, shl>;
+let cx = 1 in defm SLAWZX : RRIm<"sla.w.zx", 0x66, I32, i32>;
+
+// Section 8.6.6 - SLAX (Shift Left Arithmetic)
+defm SLAL : RRIm<"sla.l", 0x57, I64, i64>;
+
+// Section 8.6.7 - SRA (Shift Right Arithmetic)
+defm SRAWSX : RRIm<"sra.w.sx", 0x76, I32, i32, sra>;
+let cx = 1 in defm SRAWZX : RRIm<"sra.w.zx", 0x76, I32, i32>;
+
+// Section 8.6.8 - SRAX (Shift Right Arithmetic)
+defm SRAL : RRIm<"sra.l", 0x77, I64, i64, sra>;
+
+def : Pat<(i32 (srl i32:$src, (i32 simm7:$val))),
+ (EXTRACT_SUBREG (SRLri (ANDrm (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ $src, sub_i32), !add(32, 64)), imm:$val), sub_i32)>;
+def : Pat<(i32 (srl i32:$src, i32:$val)),
+ (EXTRACT_SUBREG (SRLrr (ANDrm (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ $src, sub_i32), !add(32, 64)), $val), sub_i32)>;
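+// Note (illustrative): !add(32, 64) is the m-form encoding of "(32)0" (cf.
+// the SETCC comments near the end of this file, where !add(63, 64) stands
+// for "(63)0"), i.e. a mask of 32 zero bits followed by 32 one bits, so the
+// ANDrm above zero-extends the 32-bit value before the 64-bit logical shift.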
+
+//-----------------------------------------------------------------------------
+// Section 8.7 - Floating-point Arithmetic Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.7.1 - FAD (Floating Add)
+defm FADDD : RRFm<"fadd.d", 0x4C, I64, f64, fadd>;
+let cx = 1 in
+defm FADDS : RRFm<"fadd.s", 0x4C, F32, f32, fadd, simm7fp, mimmfp32>;
+
+// Section 8.7.2 - FSB (Floating Subtract)
+defm FSUBD : RRFm<"fsub.d", 0x5C, I64, f64, fsub>;
+let cx = 1 in
+defm FSUBS : RRFm<"fsub.s", 0x5C, F32, f32, fsub, simm7fp, mimmfp32>;
+
+// Section 8.7.3 - FMP (Floating Multiply)
+defm FMULD : RRFm<"fmul.d", 0x4D, I64, f64, fmul>;
+let cx = 1 in
+defm FMULS : RRFm<"fmul.s", 0x4D, F32, f32, fmul, simm7fp, mimmfp32>;
+
+// Section 8.7.4 - FDV (Floating Divide)
+defm FDIVD : RRFm<"fdiv.d", 0x5D, I64, f64, fdiv>;
+let cx = 1 in
+defm FDIVS : RRFm<"fdiv.s", 0x5D, F32, f32, fdiv, simm7fp, mimmfp32>;
+
+// Section 8.7.5 - FCP (Floating Compare)
+defm FCMPD : RRFm<"fcmp.d", 0x7E, I64, f64>;
+let cx = 1 in
+defm FCMPS : RRFm<"fcmp.s", 0x7E, F32, f32, null_frag, simm7fp, mimmfp32>;
+
+// Section 8.7.6 - FCM (Floating Compare and Select Maximum/Minimum)
+// cx: double/float, cw: max/min
+let cw = 0, cx = 0 in
+defm FMAXD : RRFm<"fmax.d", 0x3E, I64, f64, fmaxnum>;
+let cw = 0, cx = 1 in
+defm FMAXS : RRFm<"fmax.s", 0x3E, F32, f32, fmaxnum, simm7fp, mimmfp32>;
+let cw = 1, cx = 0 in
+defm FMIND : RRFm<"fmin.d", 0x3E, I64, f64, fminnum>;
+let cw = 1, cx = 1 in
+defm FMINS : RRFm<"fmin.s", 0x3E, F32, f32, fminnum, simm7fp, mimmfp32>;
+
+// Section 8.7.7 - FAQ (Floating Add Quadruple)
+defm FADDQ : RRFm<"fadd.q", 0x6C, F128, f128>;
+
+// Section 8.7.8 - FSQ (Floating Subtract Quadruple)
+defm FSUBQ : RRFm<"fsub.q", 0x7C, F128, f128>;
+
+// Section 8.7.9 - FMQ (Floating Multiply Quadruple)
+defm FMULQ : RRFm<"fmul.q", 0x6D, F128, f128>;
+
+// Section 8.7.10 - FCQ (Floating Compare Quadruple)
+defm FCMPQ : RRNCbm<"fcmp.q", 0x7D, I64, f64, F128, f128, null_frag, simm7fp,
+ mimmfp>;
+
+// Section 8.7.11 - FIX (Convert to Fixed Point)
+// cx: double/float, cw: sx/zx, sz{3-0} = rounding mode
+let cx = 0, cw = 0 /* sign extend */ in
+defm CVTWDSX : CVTRDm<"cvt.w.d.sx", 0x4E, I32, i32, I64, f64>;
+let cx = 0, cw = 1 /* zero extend */ in
+defm CVTWDZX : CVTRDm<"cvt.w.d.zx", 0x4E, I32, i32, I64, f64>;
+let cx = 1, cw = 0 /* sign extend */ in
+defm CVTWSSX : CVTRDm<"cvt.w.s.sx", 0x4E, I32, i32, F32, f32>;
+let cx = 1, cw = 1 /* zero extend */ in
+defm CVTWSZX : CVTRDm<"cvt.w.s.zx", 0x4E, I32, i32, F32, f32>;
+
+// Section 8.7.12 - FIXX (Convert to Fixed Point)
+defm CVTLD : CVTRDm<"cvt.l.d", 0x4F, I64, i64, I64, f64>;
+
+// Section 8.7.13 - FLT (Convert to Floating Point)
+defm CVTDW : CVTm<"cvt.d.w", 0x5E, I64, f64, I32, i32, sint_to_fp>;
+let cx = 1 in
+defm CVTSW : CVTm<"cvt.s.w", 0x5E, F32, f32, I32, i32, sint_to_fp>;
+
+// Section 8.7.14 - FLTX (Convert to Floating Point)
+defm CVTDL : CVTm<"cvt.d.l", 0x5F, I64, f64, I64, i64, sint_to_fp>;
+
+// Section 8.7.15 - CVS (Convert to Single-format)
+defm CVTSD : CVTm<"cvt.s.d", 0x1F, F32, f32, I64, f64, fpround>;
+let cx = 1 in
+defm CVTSQ : CVTm<"cvt.s.q", 0x1F, F32, f32, F128, f128>;
+
+// Section 8.7.16 - CVD (Convert to Double-format)
+defm CVTDS : CVTm<"cvt.d.s", 0x0F, I64, f64, F32, f32, fpextend>;
+let cx = 1 in
+defm CVTDQ : CVTm<"cvt.d.q", 0x0F, I64, f64, F128, f128>;
+
+// Section 8.7.17 - CVQ (Convert to Quadruple-format)
+defm CVTQD : CVTm<"cvt.q.d", 0x2D, F128, f128, I64, f64>;
let cx = 1 in
-defm LEASL : RMm<"lea.sl", 0x06, I64, i64, simm7Op64, simm32Op64>;
+defm CVTQS : CVTm<"cvt.q.s", 0x2D, F128, f128, F32, f32>;
+
+//-----------------------------------------------------------------------------
+// Section 8.8 - Branch instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.8.1 - BC (Branch on Condition)
+defm BCFL : BCm<"b${cond}.l", "b.l", "baf.l", 0x19, I64, simm7>;
+
+// Indirect branch aliases
+def : Pat<(brind I64:$reg), (BCFLari_t $reg, 0)>;
+def : Pat<(brind tblockaddress:$imm), (BCFLazi_t 0, $imm)>;
+
+// Return instruction is a special case of jump.
+let Uses = [SX10], bpf = 3 /* TAKEN */, cf = 15 /* AT */, cy = 0, sy = 0,
+ sz = 10 /* SX10 */, imm32 = 0, isReturn = 1, isTerminator = 1,
+ isBarrier = 1, isCodeGenOnly = 1, hasSideEffects = 0 in
+def RET : CF<0x19, (outs), (ins), "b.l.t (, %s10)", [(retflag)]>;
+
+// Section 8.8.2 - BCS (Branch on Condition Single)
+defm BCFW : BCm<"b${cond}.w", "b.w", "baf.w", 0x1B, I32, simm7>;
+
+// Section 8.8.3 - BCF (Branch on Condition Floating Point)
+defm BCFD : BCm<"b${cond}.d", "b.d", "baf.d", 0x1C, I64, simm7fp>;
+let cx = 1 in
+defm BCFS : BCm<"b${cond}.s", "b.s", "baf.s", 0x1C, F32, simm7fp>;
+
+// Section 8.8.4 - BCR (Branch on Condition Relative)
+let cx = 0, cx2 = 0 in
+defm BRCFL : BCRm<"br${cf}.l", "br.l", "braf.l", 0x18, I64, simm7>;
+let cx = 1, cx2 = 0 in
+defm BRCFW : BCRm<"br${cf}.w", "br.w", "braf.w", 0x18, I32, simm7>;
+let cx = 0, cx2 = 1 in
+defm BRCFD : BCRm<"br${cf}.d", "br.d", "braf.d", 0x18, I64, simm7fp>;
+let cx = 1, cx2 = 1 in
+defm BRCFS : BCRm<"br${cf}.s", "br.s", "braf.s", 0x18, F32, simm7fp>;
+
+// Section 8.8.5 - BSIC (Branch and Save IC)
+let isCall = 1, hasSideEffects = 0, DecoderMethod = "DecodeCall" in
+defm BSIC : RMm<"bsic", 0x08, I64>;
+
+// Call instruction is a special case of BSIC.
+let Defs = [SX10], sx = 10 /* SX10 */, cy = 0, sy = 0, imm32 = 0,
+ isCall = 1, isCodeGenOnly = 1, hasSideEffects = 0 in
+def CALLr : RM<0x08, (outs), (ins I64:$sz, variable_ops),
+ "bsic %s10, (, $sz)", [(call i64:$sz)]>;
-// 5.3.2.2. Fixed-Point Arithmetic Operation Instructions
+//-----------------------------------------------------------------------------
+// Section 8.19 - Control Instructions
+//-----------------------------------------------------------------------------
-// ADX instruction
-let cx = 0 in
-defm ADX : RRm<"adds.l", 0x59, I64, i64, simm7Op64, uimm6Op64>;
+// Section 8.19.1 - SIC (Save Instruction Counter)
+let cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 1, Uses = [IC] in
+def SIC : RR<0x28, (outs I32:$sx), (ins), "sic $sx">;
-// 5.3.2.3. Logical Arithmetic Operation Instructions
+// Section 8.19.2 - LPM (Load Program Mode Flags)
+let sx = 0, cz = 0, sz = 0, hasSideEffects = 1, Defs = [PSW] in
+def LPM : RR<0x3a, (outs), (ins I64:$sy), "lpm $sy">;
-let cx = 0 in {
- defm AND : RRm<"and", 0x44, I64, i64, simm7Op64, uimm6Op64>;
- defm OR : RRm<"or", 0x45, I64, i64, simm7Op64, uimm6Op64>;
+// Section 8.19.3 - SPM (Save Program Mode Flags)
+let cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 1, Uses = [PSW] in
+def SPM : RR<0x2a, (outs I64:$sx), (ins), "spm $sx">;
+
+// Section 8.19.4 - LFR (Load Flag Register)
+let sx = 0, cz = 0, sz = 0, hasSideEffects = 1, Defs = [PSW] in {
+ def LFRr : RR<0x69, (outs), (ins I64:$sy), "lfr $sy">;
+ let cy = 0 in def LFRi : RR<0x69, (outs), (ins uimm6:$sy), "lfr $sy">;
}
-// Load and Store instructions
-// As 1st step, only uses sz and imm32 to represent $addr
-let mayLoad = 1, hasSideEffects = 0 in {
-let cy = 0, sy = 0, cz = 1 in {
-let cx = 0 in
-def LDSri : RM<
- 0x01, (outs I64:$sx), (ins MEMri:$addr),
- "ld $sx, $addr">;
+// Section 8.19.5 - SFR (Save Flag Register)
+let cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 1, Uses = [PSW] in
+def SFR : RR<0x29, (outs I64:$sx), (ins), "sfr $sx">;
+
+// Section 8.19.6 - SMIR (Save Miscellaneous Register)
+let cy = 0, cz = 0, sz = 0, hasSideEffects = 1 in {
+ def SMIR : RR<0x22, (outs I64:$sx), (ins MISC:$sy), "smir $sx, $sy">;
+}
+
+// Section 8.19.7 - NOP (No Operation)
+let sx = 0, cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 0 in
+def NOP : RR<0x79, (outs), (ins), "nop">;
+
+// Section 8.19.8 - MONC (Monitor Call)
+let sx = 0, cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 1 in {
+ def MONC : RR<0x3F, (outs), (ins), "monc">;
+ let cx = 1, isTrap = 1 in def MONCHDB : RR<0x3F, (outs), (ins), "monc.hdb">;
}
+
+// Section 8.19.9 - LCR (Load Communication Register)
+defm LCR : LOADCRm<"lcr", 0x40, I64>;
+
+// Section 8.19.10 - SCR (Save Communication Register)
+defm SCR : STORECRm<"scr", 0x50, I64>;
+
+// Section 8.19.11 - TSCR (Test & Set Communication Register)
+defm TSCR : LOADCRm<"tscr", 0x41, I64>;
+
+// Section 8.19.12 - FIDCR (Fetch & Increment/Decrement CR)
+defm FIDCR : FIDCRm<"fidcr", 0x51, I64>;
+
+//-----------------------------------------------------------------------------
+// Section 8.20 - Host Memory Access Instructions
+//-----------------------------------------------------------------------------
+
+// Section 8.20.1 - LHM (Load Host Memory)
+let ry = 3, DecoderMethod = "DecodeLoadASI64" in
+defm LHML : LHMm<"lhm.l", 0x21, I64>;
+let ry = 2, DecoderMethod = "DecodeLoadASI64" in
+defm LHMW : LHMm<"lhm.w", 0x21, I64>;
+let ry = 1, DecoderMethod = "DecodeLoadASI64" in
+defm LHMH : LHMm<"lhm.h", 0x21, I64>;
+let ry = 0, DecoderMethod = "DecodeLoadASI64" in
+defm LHMB : LHMm<"lhm.b", 0x21, I64>;
+
+// Section 8.20.2 - SHM (Store Host Memory)
+let ry = 3, DecoderMethod = "DecodeStoreASI64" in
+defm SHML : SHMm<"shm.l", 0x31, I64>;
+let ry = 2, DecoderMethod = "DecodeStoreASI64" in
+defm SHMW : SHMm<"shm.w", 0x31, I64>;
+let ry = 1, DecoderMethod = "DecodeStoreASI64" in
+defm SHMH : SHMm<"shm.h", 0x31, I64>;
+let ry = 0, DecoderMethod = "DecodeStoreASI64" in
+defm SHMB : SHMm<"shm.b", 0x31, I64>;
+
+//===----------------------------------------------------------------------===//
+// Instructions for CodeGenOnly
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// Pattern Matchings
+//===----------------------------------------------------------------------===//
+
+// Small immediates.
+def : Pat<(i32 simm7:$val), (OR32im (LO7 $val), 0)>;
+def : Pat<(i64 simm7:$val), (ORim (LO7 $val), 0)>;
+// Medium immediates.
+def : Pat<(i32 simm32:$val), (LEA32zii 0, 0, (LO32 $val))>;
+def : Pat<(i64 simm32:$val), (LEAzii 0, 0, (LO32 $val))>;
+def : Pat<(i64 uimm32:$val), (ANDrm (LEAzii 0, 0, (LO32 $val)), !add(32, 64))>;
+// Arbitrary immediates.
+def : Pat<(i64 lozero:$val),
+ (LEASLzii 0, 0, (HI32 imm:$val))>;
+def : Pat<(i64 lomsbzero:$val),
+ (LEASLrii (LEAzii 0, 0, (LO32 imm:$val)), 0, (HI32 imm:$val))>;
+def : Pat<(i64 imm:$val),
+ (LEASLrii (ANDrm (LEAzii 0, 0, (LO32 imm:$val)), !add(32, 64)), 0,
+ (HI32 imm:$val))>;
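+// Note (illustrative): materializing i64 0x123456789 uses the lomsbzero
+// pattern, LEAzii 0, 0, 0x23456789 followed by LEASLrii <lo>, 0, 1, because
+// bit 31 of the constant is clear; the generic pattern additionally masks
+// the low part with (32)0 via ANDrm so that a set bit 31 cannot leak into
+// the upper half before LEASLrii supplies the high 32 bits.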
+
+// floating point
+def : Pat<(f32 fpimm:$val),
+ (EXTRACT_SUBREG (LEASLzii 0, 0, (HIFP32 $val)), sub_f32)>;
+def : Pat<(f64 fplozero:$val),
+ (LEASLzii 0, 0, (HIFP32 $val))>;
+def : Pat<(f64 fplomsbzero:$val),
+ (LEASLrii (LEAzii 0, 0, (LOFP32 $val)), 0, (HIFP32 $val))>;
+def : Pat<(f64 fpimm:$val),
+ (LEASLrii (ANDrm (LEAzii 0, 0, (LOFP32 $val)), !add(32, 64)), 0,
+ (HIFP32 $val))>;
+
+// The same integer registers are used for i32 and i64 values.
+// When registers hold i32 values, the high bits are unused.
+
+// TODO Use standard expansion for shift-based lowering of sext_inreg
+
+// Cast to i1
+def : Pat<(sext_inreg I32:$src, i1),
+ (SRAWSXri (SLAWSXri $src, 31), 31)>;
+def : Pat<(sext_inreg I64:$src, i1),
+ (SRALri (SLLri $src, 63), 63)>;
+
+// Cast to i8
+def : Pat<(sext_inreg I32:$src, i8),
+ (SRAWSXri (SLAWSXri $src, 24), 24)>;
+def : Pat<(sext_inreg I64:$src, i8),
+ (SRALri (SLLri $src, 56), 56)>;
+def : Pat<(sext_inreg (i32 (trunc i64:$src)), i8),
+ (EXTRACT_SUBREG (SRALri (SLLri $src, 56), 56), sub_i32)>;
+def : Pat<(and (trunc i64:$src), 0xff),
+ (AND32rm (EXTRACT_SUBREG $src, sub_i32), !add(56, 64))>;
+
+// Cast to i16
+def : Pat<(sext_inreg I32:$src, i16),
+ (SRAWSXri (SLAWSXri $src, 16), 16)>;
+def : Pat<(sext_inreg I64:$src, i16),
+ (SRALri (SLLri $src, 48), 48)>;
+def : Pat<(sext_inreg (i32 (trunc i64:$src)), i16),
+ (EXTRACT_SUBREG (SRALri (SLLri $src, 48), 48), sub_i32)>;
+def : Pat<(and (trunc i64:$src), 0xffff),
+ (AND32rm (EXTRACT_SUBREG $src, sub_i32), !add(48, 64))>;
+
+// Cast to i32
+def : Pat<(i32 (trunc i64:$src)),
+ (ADDSWSXrm (EXTRACT_SUBREG $src, sub_i32), 0)>;
+def : Pat<(i32 (fp_to_sint I64:$reg)), (CVTWDSXr RD_RZ, $reg)>;
+def : Pat<(i32 (fp_to_sint F32:$reg)), (CVTWSSXr RD_RZ, $reg)>;
+
+// Cast to i64
+def : Pat<(sext_inreg I64:$src, i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ (ADDSWSXrm (EXTRACT_SUBREG $src, sub_i32), 0), sub_i32)>;
+def : Pat<(i64 (sext i32:$sy)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (ADDSWSXrm $sy, 0), sub_i32)>;
+def : Pat<(i64 (zext i32:$sy)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (ADDSWZXrm $sy, 0), sub_i32)>;
+def : Pat<(i64 (fp_to_sint f32:$sy)), (CVTLDr RD_RZ, (CVTDSr $sy))>;
+def : Pat<(i64 (fp_to_sint I64:$reg)), (CVTLDr RD_RZ, $reg)>;
+
+// Cast to f32
+def : Pat<(f32 (sint_to_fp i64:$sy)), (CVTSDr (CVTDLr i64:$sy))>;
+
+def : Pat<(i64 (anyext i32:$sy)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $sy, sub_i32)>;
+
+
+// extload, sextload and zextload stuff
+multiclass EXT64m<SDPatternOperator from,
+ SDPatternOperator torri,
+ SDPatternOperator torii,
+ SDPatternOperator tozri,
+ SDPatternOperator tozii> {
+ def : Pat<(i64 (from ADDRrri:$addr)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (torri MEMrri:$addr),
+ sub_i32)>;
+ def : Pat<(i64 (from ADDRrii:$addr)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (torii MEMrii:$addr),
+ sub_i32)>;
+ def : Pat<(i64 (from ADDRzri:$addr)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (tozri MEMzri:$addr),
+ sub_i32)>;
+ def : Pat<(i64 (from ADDRzii:$addr)),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), (tozii MEMzii:$addr),
+ sub_i32)>;
}
+defm : EXT64m<sextloadi8, LD1BSXrri, LD1BSXrii, LD1BSXzri, LD1BSXzii>;
+defm : EXT64m<zextloadi8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
+defm : EXT64m<extloadi8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
+defm : EXT64m<sextloadi16, LD2BSXrri, LD2BSXrii, LD2BSXzri, LD2BSXzii>;
+defm : EXT64m<zextloadi16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
+defm : EXT64m<extloadi16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
+defm : EXT64m<sextloadi32, LDLSXrri, LDLSXrii, LDLSXzri, LDLSXzii>;
+defm : EXT64m<zextloadi32, LDLZXrri, LDLZXrii, LDLZXzri, LDLZXzii>;
+defm : EXT64m<extloadi32, LDLSXrri, LDLSXrii, LDLSXzri, LDLSXzii>;
-let mayStore = 1, hasSideEffects = 0 in {
-let cx = 0, cy = 0, sy = 0, cz = 1 in {
-def STSri : RM<
- 0x11, (outs), (ins MEMri:$addr, I64:$sx),
- "st $sx, $addr">;
+// anyextload
+multiclass EXT32m<SDPatternOperator from,
+ SDPatternOperator torri,
+ SDPatternOperator torii,
+ SDPatternOperator tozri,
+ SDPatternOperator tozii> {
+ def : Pat<(from ADDRrri:$addr), (torri MEMrri:$addr)>;
+ def : Pat<(from ADDRrii:$addr), (torii MEMrii:$addr)>;
+ def : Pat<(from ADDRzri:$addr), (tozri MEMzri:$addr)>;
+ def : Pat<(from ADDRzii:$addr), (tozii MEMzii:$addr)>;
}
+defm : EXT32m<extloadi8, LD1BZXrri, LD1BZXrii, LD1BZXzri, LD1BZXzii>;
+defm : EXT32m<extloadi16, LD2BZXrri, LD2BZXrii, LD2BZXzri, LD2BZXzii>;
+
+// truncstore
+multiclass TRUNC64m<SDPatternOperator from,
+ SDPatternOperator torri,
+ SDPatternOperator torii,
+ SDPatternOperator tozri,
+ SDPatternOperator tozii> {
+ def : Pat<(from i64:$src, ADDRrri:$addr),
+ (torri MEMrri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+ def : Pat<(from i64:$src, ADDRrii:$addr),
+ (torii MEMrii:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+ def : Pat<(from i64:$src, ADDRzri:$addr),
+ (tozri MEMzri:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
+ def : Pat<(from i64:$src, ADDRzii:$addr),
+ (tozii MEMzii:$addr, (EXTRACT_SUBREG $src, sub_i32))>;
}
+defm : TRUNC64m<truncstorei8, ST1Brri, ST1Brii, ST1Bzri, ST1Bzii>;
+defm : TRUNC64m<truncstorei16, ST2Brri, ST2Brii, ST2Bzri, ST2Bzii>;
+defm : TRUNC64m<truncstorei32, STLrri, STLrii, STLzri, STLzii>;
+
+// Global address calculation and its optimization
+def : Pat<(VEhi tglobaladdr:$in), (LEASLzii 0, 0, tglobaladdr:$in)>;
+def : Pat<(VElo tglobaladdr:$in),
+ (ANDrm (LEAzii 0, 0, tglobaladdr:$in), !add(32, 64))>;
+def : Pat<(add (VEhi tglobaladdr:$in1), (VElo tglobaladdr:$in2)),
+ (LEASLrii (ANDrm (LEAzii 0, 0, tglobaladdr:$in2), !add(32, 64)), 0,
+ (tglobaladdr:$in1))>;
-// Return instruction is also a special case of jump.
-let cx = 0, cx2 = 0, bpf = 0 /* NONE */, cf = 15 /* AT */, cy = 0, sy = 0,
- cz = 1, sz = 0x10 /* SX10 */, imm32 = 0, Uses = [SX10],
- isReturn = 1, isTerminator = 1, hasDelaySlot = 1, isBarrier = 1,
- isCodeGenOnly = 1, hasSideEffects = 0 in
-def RET : CF<
- 0x19, (outs), (ins),
- "b.l (,%lr)",
- [(retflag)]>;
+// GlobalTLS address calculation and its optimization
+def : Pat<(VEhi tglobaltlsaddr:$in), (LEASLzii 0, 0, tglobaltlsaddr:$in)>;
+def : Pat<(VElo tglobaltlsaddr:$in),
+ (ANDrm (LEAzii 0, 0, tglobaltlsaddr:$in), !add(32, 64))>;
+def : Pat<(add (VEhi tglobaltlsaddr:$in1), (VElo tglobaltlsaddr:$in2)),
+ (LEASLrii (ANDrm (LEAzii 0, 0, tglobaltlsaddr:$in2), !add(32, 64)), 0,
+ (tglobaltlsaddr:$in1))>;
-// Branch instruction
-let cx = 0, cx2 = 0, bpf = 0 /* NONE */ in
-defm BCRL : BCRm<"br${cf}.l", "br.l", 0x18, I64, i64, simm7Op64, uimm6Op64>;
+// External symbol address calculation and its optimization
+def : Pat<(VEhi texternalsym:$in), (LEASLzii 0, 0, texternalsym:$in)>;
+def : Pat<(VElo texternalsym:$in),
+ (ANDrm (LEAzii 0, 0, texternalsym:$in), !add(32, 64))>;
+def : Pat<(add (VEhi texternalsym:$in1), (VElo texternalsym:$in2)),
+ (LEASLrii (ANDrm (LEAzii 0, 0, texternalsym:$in2), !add(32, 64)), 0,
+ (texternalsym:$in1))>;
-let cx = 0, cy = 0, cz = 1, hasSideEffects = 0 in {
-let sy = 3 in
-def SHMri : RM<
- 0x31, (outs), (ins MEMASri:$addr, I64:$sx),
- "shm.l $sx, $addr">;
+// Branches
+def : Pat<(br bb:$addr), (BRCFLa bb:$addr)>;
+
+// brcc
+// integer brcc
+multiclass BRCCIm<ValueType ty, SDPatternOperator BrOpNode1,
+ SDPatternOperator BrOpNode2,
+ SDPatternOperator CmpOpNode1,
+ SDPatternOperator CmpOpNode2> {
+ def : Pat<(brcc CCSIOp:$cond, ty:$l, simm7:$r, bb:$addr),
+ (BrOpNode2 (icond2ccSwap $cond), (LO7 $r), $l, bb:$addr)>;
+ def : Pat<(brcc CCSIOp:$cond, ty:$l, ty:$r, bb:$addr),
+ (BrOpNode1 (icond2cc $cond), $l, $r, bb:$addr)>;
+ def : Pat<(brcc CCUIOp:$cond, ty:$l, simm7:$r, bb:$addr),
+ (BrOpNode2 (icond2cc $cond), 0, (CmpOpNode2 (LO7 $r), $l),
+ bb:$addr)>;
+ def : Pat<(brcc CCUIOp:$cond, ty:$l, ty:$r, bb:$addr),
+ (BrOpNode2 (icond2cc $cond), 0, (CmpOpNode1 $r, $l), bb:$addr)>;
}
+defm : BRCCIm<i32, BRCFWrr, BRCFWir, CMPUWrr, CMPUWir>;
+defm : BRCCIm<i64, BRCFLrr, BRCFLir, CMPULrr, CMPULir>;
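+// Note (illustrative): signed comparisons map directly onto the branch's own
+// compare (icond2ccSwap flips the condition when the immediate is moved to
+// the first operand), while unsigned comparisons first materialize a
+// CMPUW/CMPUL result and branch on that result compared against zero.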
-let cx = 0, sx = 0, cy = 0, sy = 0, cz = 0, sz = 0, hasSideEffects = 0 in
-def MONC : RR<
- 0x3F, (outs), (ins),
- "monc">;
+// floating point brcc
+multiclass BRCCFm<ValueType ty, SDPatternOperator BrOpNode1,
+ SDPatternOperator BrOpNode2> {
+ def : Pat<(brcc cond:$cond, ty:$l, simm7fp:$r, bb:$addr),
+ (BrOpNode2 (fcond2ccSwap $cond), (LO7FP $r), $l, bb:$addr)>;
+ def : Pat<(brcc cond:$cond, ty:$l, ty:$r, bb:$addr),
+ (BrOpNode1 (fcond2cc $cond), $l, $r, bb:$addr)>;
+}
+defm : BRCCFm<f32, BRCFSrr, BRCFSir>;
+defm : BRCCFm<f64, BRCFDrr, BRCFDir>;
//===----------------------------------------------------------------------===//
// Pseudo Instructions
//===----------------------------------------------------------------------===//
+// GETGOT for PIC
+let Defs = [SX15 /* %got */, SX16 /* %plt */], hasSideEffects = 0 in {
+ def GETGOT : Pseudo<(outs getGOT:$getpcseq), (ins), "$getpcseq">;
+}
+
+// GETFUNPLT for PIC
+let hasSideEffects = 0 in
+def GETFUNPLT : Pseudo<(outs I64:$dst), (ins i64imm:$addr),
+ "$dst, $addr",
+ [(set iPTR:$dst, (GetFunPLT tglobaladdr:$addr))] >;
+
+def : Pat<(GetFunPLT tglobaladdr:$dst),
+ (GETFUNPLT tglobaladdr:$dst)>;
+def : Pat<(GetFunPLT texternalsym:$dst),
+ (GETFUNPLT texternalsym:$dst)>;
+
+// GETTLSADDR for TLS
+let Defs = [SX0, SX10, SX12], hasSideEffects = 0 in
+def GETTLSADDR : Pseudo<(outs), (ins i64imm:$addr),
+ "# GETTLSADDR $addr",
+ [(GetTLSAddr tglobaltlsaddr:$addr)] >;
+
+def : Pat<(GetTLSAddr tglobaltlsaddr:$dst),
+ (GETTLSADDR tglobaltlsaddr:$dst)>;
+
let Defs = [SX11], Uses = [SX11], hasSideEffects = 0 in {
def ADJCALLSTACKDOWN : Pseudo<(outs), (ins i64imm:$amt, i64imm:$amt2),
"# ADJCALLSTACKDOWN $amt, $amt2",
@@ -286,3 +1728,278 @@ let hasSideEffects = 0 in
def EXTEND_STACK_GUARD : Pseudo<(outs), (ins),
"# EXTEND STACK GUARD",
[]>;
+
+// Dynamic stack allocation yields a __llvm_grow_stack for VE targets.
+// These calls are needed to probe the stack when allocating beyond
+// %s8 (%sl, the stack limit).
+
+let Uses = [SX11], hasSideEffects = 1 in
+def GETSTACKTOP : Pseudo<(outs I64:$dst), (ins),
+ "# GET STACK TOP",
+ [(set iPTR:$dst, (GetStackTop))]>;
+// SETCC pattern matches
+//
+// CMP %tmp, lhs, rhs ; compare lhs and rhs
+// or %res, 0, (0)1 ; initialize by 0
+// CMOV %res, (63)0, %tmp ; set 1 if %tmp is true
+
+def : Pat<(i32 (setcc i64:$LHS, i64:$RHS, CCSIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVLrm (icond2cc $cond),
+ (CMPSLrr i64:$LHS, i64:$RHS),
+ !add(63, 64),
+ (ORim 0, 0)), sub_i32)>;
+
+def : Pat<(i32 (setcc i64:$LHS, i64:$RHS, CCUIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVLrm (icond2cc $cond),
+ (CMPULrr i64:$LHS, i64:$RHS),
+ !add(63, 64),
+ (ORim 0, 0)), sub_i32)>;
+
+def : Pat<(i32 (setcc i32:$LHS, i32:$RHS, CCSIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVWrm (icond2cc $cond),
+ (CMPSWSXrr i32:$LHS, i32:$RHS),
+ !add(63, 64),
+ (ORim 0, 0)), sub_i32)>;
+
+def : Pat<(i32 (setcc i32:$LHS, i32:$RHS, CCUIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVWrm (icond2cc $cond),
+ (CMPUWrr i32:$LHS, i32:$RHS),
+ !add(63, 64),
+ (ORim 0, 0)), sub_i32)>;
+
+def : Pat<(i32 (setcc f64:$LHS, f64:$RHS, cond:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVDrm (fcond2cc $cond),
+ (FCMPDrr f64:$LHS, f64:$RHS),
+ !add(63, 64),
+ (ORim 0, 0)), sub_i32)>;
+
+def : Pat<(i32 (setcc f32:$LHS, f32:$RHS, cond:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVSrm (fcond2cc $cond),
+ (FCMPSrr f32:$LHS, f32:$RHS),
+ !add(63, 64),
+ (ORim 0, 0)), sub_i32)>;
+
+// Special SELECTCC pattern matches
+// Use min/max for better performance.
+//
+// MAX/MIN %res, %lhs, %rhs
+
+def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOGT)),
+ (FMAXDrr $LHS, $RHS)>;
+def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOGT)),
+ (FMAXSrr $LHS, $RHS)>;
+def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETGT)),
+ (MAXSLrr $LHS, $RHS)>;
+def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETGT)),
+ (MAXSWSXrr $LHS, $RHS)>;
+def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOGE)),
+ (FMAXDrr $LHS, $RHS)>;
+def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOGE)),
+ (FMAXSrr $LHS, $RHS)>;
+def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETGE)),
+ (MAXSLrr $LHS, $RHS)>;
+def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETGE)),
+ (MAXSWSXrr $LHS, $RHS)>;
+
+def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOLT)),
+ (FMINDrr $LHS, $RHS)>;
+def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOLT)),
+ (FMINSrr $LHS, $RHS)>;
+def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETLT)),
+ (MINSLrr $LHS, $RHS)>;
+def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETLT)),
+ (MINSWSXrr $LHS, $RHS)>;
+def : Pat<(f64 (selectcc f64:$LHS, f64:$RHS, f64:$LHS, f64:$RHS, SETOLE)),
+ (FMINDrr $LHS, $RHS)>;
+def : Pat<(f32 (selectcc f32:$LHS, f32:$RHS, f32:$LHS, f32:$RHS, SETOLE)),
+ (FMINSrr $LHS, $RHS)>;
+def : Pat<(i64 (selectcc i64:$LHS, i64:$RHS, i64:$LHS, i64:$RHS, SETLE)),
+ (MINSLrr $LHS, $RHS)>;
+def : Pat<(i32 (selectcc i32:$LHS, i32:$RHS, i32:$LHS, i32:$RHS, SETLE)),
+ (MINSWSXrr $LHS, $RHS)>;
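For illustration (assumed source, not part of the patch), these are the shapes of select that the min/max patterns above are meant to catch, where the selected values are the compared values themselves:

// Illustrative only: this form lets the backend use MAX/MIN directly
// instead of a compare followed by a conditional move.
double pick_max_d(double a, double b) { return a > b ? a : b; }  // FMAXD shape
float  pick_min_f(float a, float b)   { return a < b ? a : b; }  // FMINS shape
long   pick_max_i(long a, long b)     { return a >= b ? a : b; } // MAXSL shape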
+
+// Generic SELECTCC pattern matches
+//
+// CMP %tmp, %l, %r ; compare %l and %r
+// or %res, %f, (0)1 ; initialize %res with %f
+// CMOV %res, %t, %tmp ; set %res to %t if %tmp is true
+
+// selectcc for i64 result
+def : Pat<(i64 (selectcc i32:$l, i32:$r, i64:$t, i64:$f, CCSIOp:$cond)),
+ (CMOVWrr (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i32:$l, i32:$r, i64:$t, i64:$f, CCUIOp:$cond)),
+ (CMOVWrr (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i64:$l, i64:$r, i64:$t, i64:$f, CCSIOp:$cond)),
+ (CMOVLrr (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc i64:$l, i64:$r, i64:$t, i64:$f, CCUIOp:$cond)),
+ (CMOVLrr (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc f32:$l, f32:$r, i64:$t, i64:$f, cond:$cond)),
+ (CMOVSrr (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
+def : Pat<(i64 (selectcc f64:$l, f64:$r, i64:$t, i64:$f, cond:$cond)),
+ (CMOVDrr (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
+
+// selectcc for i32 result
+def : Pat<(i32 (selectcc i32:$l, i32:$r, i32:$t, i32:$f, CCSIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVWrr (icond2cc $cond),
+ (CMPSWSXrr $l, $r),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+def : Pat<(i32 (selectcc i32:$l, i32:$r, i32:$t, i32:$f, CCUIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVWrr (icond2cc $cond),
+ (CMPUWrr $l, $r),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+def : Pat<(i32 (selectcc i64:$l, i64:$r, i32:$t, i32:$f, CCSIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVLrr (icond2cc $cond),
+ (CMPSLrr $l, $r),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+def : Pat<(i32 (selectcc i64:$l, i64:$r, i32:$t, i32:$f, CCUIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVLrr (icond2cc $cond),
+ (CMPULrr $l, $r),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+def : Pat<(i32 (selectcc f32:$l, f32:$r, i32:$t, i32:$f, cond:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVSrr (fcond2cc $cond),
+ (FCMPSrr $l, $r),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+def : Pat<(i32 (selectcc f64:$l, f64:$r, i32:$t, i32:$f, cond:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVDrr (fcond2cc $cond),
+ (FCMPDrr $l, $r),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+
+// selectcc for f64 result
+def : Pat<(f64 (selectcc i32:$l, i32:$r, f64:$t, f64:$f, CCSIOp:$cond)),
+ (CMOVWrr (icond2cc $cond), (CMPSWSXrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc i32:$l, i32:$r, f64:$t, f64:$f, CCUIOp:$cond)),
+ (CMOVWrr (icond2cc $cond), (CMPUWrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc i64:$l, i64:$r, f64:$t, f64:$f, CCSIOp:$cond)),
+ (CMOVLrr (icond2cc $cond), (CMPSLrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc i64:$l, i64:$r, f64:$t, f64:$f, CCUIOp:$cond)),
+ (CMOVLrr (icond2cc $cond), (CMPULrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc f32:$l, f32:$r, f64:$t, f64:$f, cond:$cond)),
+ (CMOVSrr (fcond2cc $cond), (FCMPSrr $l, $r), $t, $f)>;
+def : Pat<(f64 (selectcc f64:$l, f64:$r, f64:$t, f64:$f, cond:$cond)),
+ (CMOVDrr (fcond2cc $cond), (FCMPDrr $l, $r), $t, $f)>;
+
+// selectcc for f32 result
+def : Pat<(f32 (selectcc i32:$l, i32:$r, f32:$t, f32:$f, CCSIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVWrr (icond2cc $cond),
+ (CMPSWSXrr $l, $r),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
+def : Pat<(f32 (selectcc i32:$l, i32:$r, f32:$t, f32:$f, CCUIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVWrr (icond2cc $cond),
+ (CMPUWrr $l, $r),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
+def : Pat<(f32 (selectcc i64:$l, i64:$r, f32:$t, f32:$f, CCSIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVLrr (icond2cc $cond),
+ (CMPSLrr $l, $r),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
+def : Pat<(f32 (selectcc i64:$l, i64:$r, f32:$t, f32:$f, CCUIOp:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVLrr (icond2cc $cond),
+ (CMPULrr $l, $r),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
+def : Pat<(f32 (selectcc f32:$l, f32:$r, f32:$t, f32:$f, cond:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVSrr (fcond2cc $cond),
+ (FCMPSrr $l, $r),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
+def : Pat<(f32 (selectcc f64:$l, f64:$r, f32:$t, f32:$f, cond:$cond)),
+ (EXTRACT_SUBREG
+ (CMOVDrr (fcond2cc $cond),
+ (FCMPDrr $l, $r),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (f64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
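An illustrative (assumed) example of the general case, where the selected values are unrelated to the compared ones, so the compare + conditional-move sequence from the comment above is used instead of min/max:

// Illustrative only: the compared values and the selected values differ;
// 32-bit results additionally go through the sub-register insert/extract
// shown in the patterns above, since they occupy half of a 64-bit register.
int    pick_i(long a, long b, int t, int f)         { return a < b ? t : f; }
double pick_d(float a, float b, double t, double f) { return a == b ? t : f; }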
+
+// Generic SELECT pattern matches
+// Use cmov.w for all cases since %pred holds i32.
+//
+// CMOV.w.ne %res, %tval, %pred ; set %res to %tval if %pred is non-zero
+
+def : Pat<(i64 (select i32:$pred, i64:$t, i64:$f)),
+ (CMOVWrr CC_INE, $pred, $t, $f)>;
+
+def : Pat<(i32 (select i32:$pred, i32:$t, i32:$f)),
+ (EXTRACT_SUBREG
+ (CMOVWrr CC_INE, $pred,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_i32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_i32)),
+ sub_i32)>;
+
+def : Pat<(f64 (select i32:$pred, f64:$t, f64:$f)),
+ (CMOVWrr CC_INE, $pred, $t, $f)>;
+
+def : Pat<(f32 (select i32:$pred, f32:$t, f32:$f)),
+ (EXTRACT_SUBREG
+ (CMOVWrr CC_INE, $pred,
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $t, sub_f32),
+ (INSERT_SUBREG (i64 (IMPLICIT_DEF)), $f, sub_f32)),
+ sub_f32)>;
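And a short sketch (illustrative only) of a plain select on an already-computed predicate, which the cmov.w patterns above may handle uniformly for every result type:

// Illustrative only: the predicate is already an integer value, so a single
// cmov.w.ne against it suffices regardless of the selected value type.
long   sel_l(bool c, long t, long f)     { return c ? t : f; }
double sel_d(bool c, double t, double f) { return c ? t : f; }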
+
+// bitconvert
+def : Pat<(f64 (bitconvert i64:$src)), (COPY_TO_REGCLASS $src, I64)>;
+def : Pat<(i64 (bitconvert f64:$src)), (COPY_TO_REGCLASS $src, I64)>;
+
+def : Pat<(i32 (bitconvert f32:$op)),
+ (EXTRACT_SUBREG (SRALri (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ $op, sub_f32), 32), sub_i32)>;
+def : Pat<(f32 (bitconvert i32:$op)),
+ (EXTRACT_SUBREG (SLLri (INSERT_SUBREG (i64 (IMPLICIT_DEF)),
+ $op, sub_i32), 32), sub_f32)>;
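A hedged source-level sketch of the bit-pattern reinterpretations these patterns implement; the i32/f32 cases need the 32-bit shift above because, per the sub-register definitions in this patch, f32 occupies the upper half of a 64-bit register while i32 occupies the lower half:

#include <cstdint>
#include <cstring>

// Illustrative only: a memcpy-based bit cast is the usual source-level form;
// the f32<->i32 versions lower to the shift-based patterns above, while the
// f64<->i64 versions become plain register copies.
inline std::uint32_t bits_of(float f) {
  std::uint32_t u;
  std::memcpy(&u, &f, sizeof u);
  return u;
}
inline float float_of(std::uint32_t u) {
  float f;
  std::memcpy(&f, &u, sizeof f);
  return f;
}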
+
+// Bit operations pattern matching.
+def : Pat<(i32 (ctpop i32:$src)),
+ (EXTRACT_SUBREG (PCNTr (ANDrm (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), $src, sub_i32), !add(32, 64))), sub_i32)>;
+def : Pat<(i32 (ctlz i32:$src)),
+ (EXTRACT_SUBREG (LDZr (SLLri (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), $src, sub_i32), 32)), sub_i32)>;
+def : Pat<(i64 (bswap i64:$src)),
+ (BSWPri $src, 0)>;
+def : Pat<(i32 (bswap i32:$src)),
+ (EXTRACT_SUBREG (BSWPri (INSERT_SUBREG
+ (i64 (IMPLICIT_DEF)), $src, sub_i32), 1), sub_i32)>;
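For reference, an assumed illustration (GCC/Clang-style builtins) of the source forms these patterns cover; the i32 variants above widen the value into a 64-bit register before using the 64-bit PCNT/LDZ/BSWP instructions:

#include <cstdint>

// Illustrative only: population count, leading-zero count, and byte swap.
unsigned popcnt32(std::uint32_t x)    { return __builtin_popcount(x); }
unsigned lzcnt32(std::uint32_t x)     { return x ? __builtin_clz(x) : 32u; }
std::uint64_t swap64(std::uint64_t x) { return __builtin_bswap64(x); }
std::uint32_t swap32(std::uint32_t x) { return __builtin_bswap32(x); }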
+
+// Several special pattern matches to optimize code
+
+def : Pat<(i32 (and i32:$lhs, 0xff)),
+ (AND32rm $lhs, !add(56, 64))>;
+def : Pat<(i32 (and i32:$lhs, 0xffff)),
+ (AND32rm $lhs, !add(48, 64))>;
+def : Pat<(i32 (and i32:$lhs, 0xffffffff)),
+ (AND32rm $lhs, !add(32, 64))>;
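Finally, an illustrative example (assumed source) of the masks matched above; the immediates such as !add(56, 64) appear to encode VE's "(M)0" mask-immediate form selecting the low 8/16/32 bits, and that reading should be treated as an assumption:

#include <cstdint>

// Illustrative only: zero-extending masks as they typically appear in source.
std::uint32_t low8(std::uint32_t x)  { return x & 0xffu; }
std::uint32_t low16(std::uint32_t x) { return x & 0xffffu; }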
diff --git a/llvm/lib/Target/VE/VEMCInstLower.cpp b/llvm/lib/Target/VE/VEMCInstLower.cpp
index 6c8fc3536c34..9815610510e1 100644
--- a/llvm/lib/Target/VE/VEMCInstLower.cpp
+++ b/llvm/lib/Target/VE/VEMCInstLower.cpp
@@ -11,6 +11,7 @@
//
//===----------------------------------------------------------------------===//
+#include "MCTargetDesc/VEMCExpr.h"
#include "VE.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -27,9 +28,16 @@ using namespace llvm;
static MCOperand LowerSymbolOperand(const MachineInstr *MI,
const MachineOperand &MO,
const MCSymbol *Symbol, AsmPrinter &AP) {
+ VEMCExpr::VariantKind Kind = (VEMCExpr::VariantKind)MO.getTargetFlags();
- const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::create(Symbol, AP.OutContext);
- return MCOperand::createExpr(MCSym);
+ const MCExpr *Expr = MCSymbolRefExpr::create(Symbol, AP.OutContext);
+ // Add an offset iff MO is neither a jump table index nor a machine basic block.
+ if (!MO.isJTI() && !MO.isMBB() && MO.getOffset())
+ Expr = MCBinaryExpr::createAdd(
+ Expr, MCConstantExpr::create(MO.getOffset(), AP.OutContext),
+ AP.OutContext);
+ Expr = VEMCExpr::create(Kind, Expr, AP.OutContext);
+ return MCOperand::createExpr(Expr);
}
static MCOperand LowerOperand(const MachineInstr *MI, const MachineOperand &MO,
@@ -43,6 +51,11 @@ static MCOperand LowerOperand(const MachineInstr *MI, const MachineOperand &MO,
break;
return MCOperand::createReg(MO.getReg());
+ case MachineOperand::MO_ExternalSymbol:
+ return LowerSymbolOperand(
+ MI, MO, AP.GetExternalSymbolSymbol(MO.getSymbolName()), AP);
+ case MachineOperand::MO_GlobalAddress:
+ return LowerSymbolOperand(MI, MO, AP.getSymbol(MO.getGlobal()), AP);
case MachineOperand::MO_Immediate:
return MCOperand::createImm(MO.getImm());
diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp b/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp
new file mode 100644
index 000000000000..1addfc7174eb
--- /dev/null
+++ b/llvm/lib/Target/VE/VEMachineFunctionInfo.cpp
@@ -0,0 +1,13 @@
+//===-- VEMachineFunctionInfo.cpp - VE Machine Function Info --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "VEMachineFunctionInfo.h"
+
+using namespace llvm;
+
+void VEMachineFunctionInfo::anchor() {}
diff --git a/llvm/lib/Target/VE/VEMachineFunctionInfo.h b/llvm/lib/Target/VE/VEMachineFunctionInfo.h
new file mode 100644
index 000000000000..16b25fed3f11
--- /dev/null
+++ b/llvm/lib/Target/VE/VEMachineFunctionInfo.h
@@ -0,0 +1,48 @@
+//===- VEMachineFunctionInfo.h - VE Machine Function Info -------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file declares VE specific per-machine-function information.
+//
+//===----------------------------------------------------------------------===//
+#ifndef LLVM_LIB_TARGET_VE_VEMACHINEFUNCTIONINFO_H
+#define LLVM_LIB_TARGET_VE_VEMACHINEFUNCTIONINFO_H
+
+#include "llvm/CodeGen/MachineFunction.h"
+
+namespace llvm {
+
+class VEMachineFunctionInfo : public MachineFunctionInfo {
+ virtual void anchor();
+
+private:
+ Register GlobalBaseReg;
+
+ /// VarArgsFrameOffset - Frame offset to start of varargs area.
+ int VarArgsFrameOffset;
+
+ /// IsLeafProc - True if the function is a leaf procedure.
+ bool IsLeafProc;
+
+public:
+ VEMachineFunctionInfo()
+ : GlobalBaseReg(), VarArgsFrameOffset(0), IsLeafProc(false) {}
+ explicit VEMachineFunctionInfo(MachineFunction &MF)
+ : GlobalBaseReg(), VarArgsFrameOffset(0), IsLeafProc(false) {}
+
+ Register getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
+
+ int getVarArgsFrameOffset() const { return VarArgsFrameOffset; }
+ void setVarArgsFrameOffset(int Offset) { VarArgsFrameOffset = Offset; }
+
+ void setLeafProc(bool rhs) { IsLeafProc = rhs; }
+ bool isLeafProc() const { return IsLeafProc; }
+};
+} // namespace llvm
+
+#endif
diff --git a/llvm/lib/Target/VE/VERegisterInfo.cpp b/llvm/lib/Target/VE/VERegisterInfo.cpp
index e1ff614abc20..5783a8df69d2 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.cpp
+++ b/llvm/lib/Target/VE/VERegisterInfo.cpp
@@ -34,12 +34,22 @@ VERegisterInfo::VERegisterInfo() : VEGenRegisterInfo(VE::SX10) {}
const MCPhysReg *
VERegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const {
- return CSR_SaveList;
+ switch (MF->getFunction().getCallingConv()) {
+ default:
+ return CSR_SaveList;
+ case CallingConv::PreserveAll:
+ return CSR_preserve_all_SaveList;
+ }
}
const uint32_t *VERegisterInfo::getCallPreservedMask(const MachineFunction &MF,
CallingConv::ID CC) const {
- return CSR_RegMask;
+ switch (CC) {
+ default:
+ return CSR_RegMask;
+ case CallingConv::PreserveAll:
+ return CSR_preserve_all_RegMask;
+ }
}
const uint32_t *VERegisterInfo::getNoPreservedMask() const {
@@ -48,26 +58,34 @@ const uint32_t *VERegisterInfo::getNoPreservedMask() const {
BitVector VERegisterInfo::getReservedRegs(const MachineFunction &MF) const {
BitVector Reserved(getNumRegs());
- Reserved.set(VE::SX8); // stack limit
- Reserved.set(VE::SX9); // frame pointer
- Reserved.set(VE::SX10); // link register (return address)
- Reserved.set(VE::SX11); // stack pointer
- Reserved.set(VE::SX12); // outer register
- Reserved.set(VE::SX13); // id register for dynamic linker
-
- Reserved.set(VE::SX14); // thread pointer
- Reserved.set(VE::SX15); // global offset table register
- Reserved.set(VE::SX16); // procedure linkage table register
- Reserved.set(VE::SX17); // linkage-area register
-
- // sx18-sx33 are callee-saved registers
- // sx34-sx63 are temporary registers
+ const Register ReservedRegs[] = {
+ VE::SX8, // Stack limit
+ VE::SX9, // Frame pointer
+ VE::SX10, // Link register (return address)
+ VE::SX11, // Stack pointer
+
+ // FIXME: maybe not need to be reserved
+ VE::SX12, // Outer register
+ VE::SX13, // Id register for dynamic linker
+
+ VE::SX14, // Thread pointer
+ VE::SX15, // Global offset table register
+ VE::SX16, // Procedure linkage table register
+ VE::SX17, // Linkage-area register
+ // sx18-sx33 are callee-saved registers
+ // sx34-sx63 are temporary registers
+ };
+
+ for (auto R : ReservedRegs)
+ for (MCRegAliasIterator ItAlias(R, this, true); ItAlias.isValid();
+ ++ItAlias)
+ Reserved.set(*ItAlias);
return Reserved;
}
-bool VERegisterInfo::isConstantPhysReg(unsigned PhysReg) const { return false; }
+bool VERegisterInfo::isConstantPhysReg(MCRegister PhysReg) const { return false; }
const TargetRegisterClass *
VERegisterInfo::getPointerRegClass(const MachineFunction &MF,
@@ -77,12 +95,12 @@ VERegisterInfo::getPointerRegClass(const MachineFunction &MF,
static void replaceFI(MachineFunction &MF, MachineBasicBlock::iterator II,
MachineInstr &MI, const DebugLoc &dl,
- unsigned FIOperandNum, int Offset, unsigned FramePtr) {
+ unsigned FIOperandNum, int Offset, Register FrameReg) {
// Replace frame index with a frame pointer reference directly.
// VE has 32 bit offset field, so no need to expand a target instruction.
// Directly encode it.
- MI.getOperand(FIOperandNum).ChangeToRegister(FramePtr, false);
- MI.getOperand(FIOperandNum + 1).ChangeToImmediate(Offset);
+ MI.getOperand(FIOperandNum).ChangeToRegister(FrameReg, false);
+ MI.getOperand(FIOperandNum + 2).ChangeToImmediate(Offset);
}
void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
@@ -96,11 +114,11 @@ void VERegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
MachineFunction &MF = *MI.getParent()->getParent();
const VEFrameLowering *TFI = getFrameLowering(MF);
- unsigned FrameReg;
+ Register FrameReg;
int Offset;
Offset = TFI->getFrameIndexReference(MF, FrameIndex, FrameReg);
- Offset += MI.getOperand(FIOperandNum + 1).getImm();
+ Offset += MI.getOperand(FIOperandNum + 2).getImm();
replaceFI(MF, II, MI, dl, FIOperandNum, Offset, FrameReg);
}
diff --git a/llvm/lib/Target/VE/VERegisterInfo.h b/llvm/lib/Target/VE/VERegisterInfo.h
index 9cb475f5e174..9a32da16bea6 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.h
+++ b/llvm/lib/Target/VE/VERegisterInfo.h
@@ -30,7 +30,7 @@ public:
const uint32_t *getNoPreservedMask() const override;
BitVector getReservedRegs(const MachineFunction &MF) const override;
- bool isConstantPhysReg(unsigned PhysReg) const override;
+ bool isConstantPhysReg(MCRegister PhysReg) const override;
const TargetRegisterClass *getPointerRegClass(const MachineFunction &MF,
unsigned Kind) const override;
diff --git a/llvm/lib/Target/VE/VERegisterInfo.td b/llvm/lib/Target/VE/VERegisterInfo.td
index ef5b9c09705a..29708d35c730 100644
--- a/llvm/lib/Target/VE/VERegisterInfo.td
+++ b/llvm/lib/Target/VE/VERegisterInfo.td
@@ -10,28 +10,135 @@
// Declarations that describe the VE register file
//===----------------------------------------------------------------------===//
-class VEReg<bits<7> Enc, string n> : Register<n> {
+class VEReg<bits<7> enc, string n, list<Register> subregs = [],
+ list<string> altNames = [], list<Register> aliases = []>
+ : Register<n, altNames> {
let HWEncoding{15-7} = 0;
- let HWEncoding{6-0} = Enc;
+ let HWEncoding{6-0} = enc;
let Namespace = "VE";
-}
-
-// Registers are identified with 7-bit ID numbers.
-// R - 64-bit integer or floating-point registers
-class R<bits<7> Enc, string n, list<Register> subregs = [],
- list<Register> aliases = []>: VEReg<Enc, n> {
let SubRegs = subregs;
let Aliases = aliases;
}
+class VEMiscReg<bits<6> enc, string n>: Register<n> {
+ let HWEncoding{15-6} = 0;
+ let HWEncoding{5-0} = enc;
+ let Namespace = "VE";
+}
+
+let Namespace = "VE" in {
+ def sub_i8 : SubRegIndex<8, 56>; // Low 8 bit (56..63)
+ def sub_i16 : SubRegIndex<16, 48>; // Low 16 bit (48..63)
+ def sub_i32 : SubRegIndex<32, 32>; // Low 32 bit (32..63)
+ def sub_f32 : SubRegIndex<32>; // High 32 bit (0..31)
+ def sub_even : SubRegIndex<64>; // High 64 bit (0..63)
+ def sub_odd : SubRegIndex<64, 64>; // Low 64 bit (64..127)
+ def AsmName : RegAltNameIndex;
+}
+
+//-----------------------------------------------------------------------------
+// Miscellaneous Registers
+//-----------------------------------------------------------------------------
+
+def USRCC : VEMiscReg<0, "usrcc">; // User clock counter
+def PSW : VEMiscReg<1, "psw">; // Program status word
+def SAR : VEMiscReg<2, "sar">; // Store address register
+def PMMR : VEMiscReg<7, "pmmr">; // Performance monitor mode register
+
+// Performance monitor configuration registers
+foreach I = 0-3 in
+ def PMCR#I : VEMiscReg<!add(8,I), "pmcr"#I>;
+
+// Performance monitor counter
+foreach I = 0-14 in
+ def PMC#I : VEMiscReg<!add(16,I), "pmc"#I>;
+
+// Register classes.
+def MISC : RegisterClass<"VE", [i64], 64,
+ (add USRCC, PSW, SAR, PMMR,
+ (sequence "PMCR%u", 0, 3),
+ (sequence "PMC%u", 0, 14))>;
+
+//-----------------------------------------------------------------------------
+// Instruction Counter Register
+//-----------------------------------------------------------------------------
+
+def IC : VEMiscReg<62, "ic">;
+
+//-----------------------------------------------------------------------------
+// Generic Registers
+//-----------------------------------------------------------------------------
+
+let RegAltNameIndices = [AsmName] in {
+
+// Generic integer registers - 8 bits wide
+foreach I = 0-63 in
+ def SB#I : VEReg<I, "sb"#I, [], ["s"#I]>, DwarfRegNum<[I]>;
+
+// Generic integer registers - 16 bits wide
+let SubRegIndices = [sub_i8] in
+foreach I = 0-63 in
+ def SH#I : VEReg<I, "sh"#I, [!cast<VEReg>("SB"#I)], ["s"#I]>,
+ DwarfRegNum<[I]>;
+
+// Generic integer registers - 32 bits wide
+let SubRegIndices = [sub_i16] in
+foreach I = 0-63 in
+ def SW#I : VEReg<I, "sw"#I, [!cast<VEReg>("SH"#I)], ["s"#I]>,
+ DwarfRegNum<[I]>;
+
+// Generic floating point registers - 32 bits wide
+// NOTE: Mark SF#I as an alias of SW#I temporarily to avoid a register
+// allocation problem.
+foreach I = 0-63 in
+ def SF#I : VEReg<I, "sf"#I, [], ["s"#I], [!cast<VEReg>("SW"#I)]>,
+ DwarfRegNum<[I]>;
+
// Generic integer registers - 64 bits wide
+let SubRegIndices = [sub_i32, sub_f32], CoveredBySubRegs = 1 in
foreach I = 0-63 in
- def SX#I : R<I, "S"#I, []>,
- DwarfRegNum<[I]>;
+ def SX#I : VEReg<I, "s"#I, [!cast<VEReg>("SW"#I), !cast<VEReg>("SF"#I)],
+ ["s"#I]>, DwarfRegNum<[I]>;
+
+// Aliases of the S* registers, used to hold 128-bit values (long doubles).
+// Following foreach represents something like:
+// def Q0 : VEReg<0, "q0", [SX0, SX1], ["s0"]>;
+// def Q1 : VEReg<2, "q1", [SX2, SX3], ["s2"]>;
+// ...
+let SubRegIndices = [sub_even, sub_odd], CoveredBySubRegs = 1 in
+foreach I = 0-31 in
+ def Q#I : VEReg<!shl(I,1), "q"#I,
+ [!cast<VEReg>("SX"#!shl(I,1)),
+ !cast<VEReg>("SX"#!add(!shl(I,1),1))],
+ ["s"#!shl(I,1)]>;
+
+} // RegAltNameIndices = [AsmName]
// Register classes.
//
// The register order is defined in terms of the preferred
// allocation order.
-def I64 : RegisterClass<"VE", [i64], 64,
- (sequence "SX%u", 0, 63)>;
+def I8 : RegisterClass<"VE", [i8], 8,
+ (add (sequence "SB%u", 0, 7),
+ (sequence "SB%u", 34, 63),
+ (sequence "SB%u", 8, 33))>;
+def I16 : RegisterClass<"VE", [i16], 16,
+ (add (sequence "SH%u", 0, 7),
+ (sequence "SH%u", 34, 63),
+ (sequence "SH%u", 8, 33))>;
+def I32 : RegisterClass<"VE", [i32], 32,
+ (add (sequence "SW%u", 0, 7),
+ (sequence "SW%u", 34, 63),
+ (sequence "SW%u", 8, 33))>;
+def I64 : RegisterClass<"VE", [i64, f64], 64,
+ (add (sequence "SX%u", 0, 7),
+ (sequence "SX%u", 34, 63),
+ (sequence "SX%u", 8, 33))>;
+def F32 : RegisterClass<"VE", [f32], 32,
+ (add (sequence "SF%u", 0, 7),
+ (sequence "SF%u", 34, 63),
+ (sequence "SF%u", 8, 33))>;
+def F128 : RegisterClass<"VE", [f128], 128,
+ (add (sequence "Q%u", 0, 3),
+ (sequence "Q%u", 17, 31),
+ (sequence "Q%u", 4, 16))>;
diff --git a/llvm/lib/Target/VE/VESubtarget.cpp b/llvm/lib/Target/VE/VESubtarget.cpp
index 861e88cdb583..a0b78d95e3cf 100644
--- a/llvm/lib/Target/VE/VESubtarget.cpp
+++ b/llvm/lib/Target/VE/VESubtarget.cpp
@@ -28,7 +28,7 @@ void VESubtarget::anchor() {}
VESubtarget &VESubtarget::initializeSubtargetDependencies(StringRef CPU,
StringRef FS) {
// Determine default and user specified characteristics
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "ve";
diff --git a/llvm/lib/Target/VE/VESubtarget.h b/llvm/lib/Target/VE/VESubtarget.h
index e9637cc16023..f3a2c206162e 100644
--- a/llvm/lib/Target/VE/VESubtarget.h
+++ b/llvm/lib/Target/VE/VESubtarget.h
@@ -42,7 +42,7 @@ public:
const TargetMachine &TM);
const VEInstrInfo *getInstrInfo() const override { return &InstrInfo; }
- const TargetFrameLowering *getFrameLowering() const override {
+ const VEFrameLowering *getFrameLowering() const override {
return &FrameLowering;
}
const VERegisterInfo *getRegisterInfo() const override {
diff --git a/llvm/lib/Target/VE/VETargetMachine.cpp b/llvm/lib/Target/VE/VETargetMachine.cpp
index 46f5c0dc1805..08b55eebbc98 100644
--- a/llvm/lib/Target/VE/VETargetMachine.cpp
+++ b/llvm/lib/Target/VE/VETargetMachine.cpp
@@ -10,6 +10,7 @@
//===----------------------------------------------------------------------===//
#include "VETargetMachine.h"
+#include "TargetInfo/VETargetInfo.h"
#include "VE.h"
#include "VETargetTransformInfo.h"
#include "llvm/CodeGen/Passes.h"
@@ -40,8 +41,8 @@ static std::string computeDataLayout(const Triple &T) {
// VE supports 32 bit and 64 bits integer on registers
Ret += "-n32:64";
- // Stack alignment is 64 bits
- Ret += "-S64";
+ // Stack alignment is 128 bits
+ Ret += "-S128";
return Ret;
}
@@ -73,7 +74,8 @@ VETargetMachine::VETargetMachine(const Target &T, const Triple &TT,
: LLVMTargetMachine(T, computeDataLayout(TT), TT, CPU, FS, Options,
getEffectiveRelocModel(RM),
getEffectiveCodeModel(CM, CodeModel::Small), OL),
- TLOF(createTLOF()), Subtarget(TT, CPU, FS, *this) {
+ TLOF(createTLOF()),
+ Subtarget(TT, std::string(CPU), std::string(FS), *this) {
initAsmInfo();
}
diff --git a/llvm/lib/Target/VE/VETargetMachine.h b/llvm/lib/Target/VE/VETargetMachine.h
index 3191d59ec1c8..041d3b197ec3 100644
--- a/llvm/lib/Target/VE/VETargetMachine.h
+++ b/llvm/lib/Target/VE/VETargetMachine.h
@@ -50,6 +50,8 @@ public:
bool isMachineVerifierClean() const override { return false; }
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
+
+ unsigned getSjLjDataSize() const override { return 64; }
};
} // namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
index ea99cee3eb3b..e29d85d7588d 100644
--- a/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
+++ b/llvm/lib/Target/WebAssembly/AsmParser/WebAssemblyAsmParser.cpp
@@ -164,6 +164,7 @@ class WebAssemblyAsmParser final : public MCTargetAsmParser {
// Much like WebAssemblyAsmPrinter in the backend, we have to own these.
std::vector<std::unique_ptr<wasm::WasmSignature>> Signatures;
+ std::vector<std::unique_ptr<std::string>> Names;
// Order of labels, directives and instructions in a .s file have no
// syntactical enforcement. This class is a callback from the actual parser,
@@ -214,6 +215,11 @@ public:
SMLoc & /*EndLoc*/) override {
llvm_unreachable("ParseRegister is not implemented.");
}
+ OperandMatchResultTy tryParseRegister(unsigned & /*RegNo*/,
+ SMLoc & /*StartLoc*/,
+ SMLoc & /*EndLoc*/) override {
+ llvm_unreachable("tryParseRegister is not implemented.");
+ }
bool error(const Twine &Msg, const AsmToken &Tok) {
return Parser.Error(Tok.getLoc(), Msg + Tok.getString());
@@ -227,6 +233,12 @@ public:
Signatures.push_back(std::move(Sig));
}
+ StringRef storeName(StringRef Name) {
+ std::unique_ptr<std::string> N = std::make_unique<std::string>(Name);
+ Names.push_back(std::move(N));
+ return *Names.back();
+ }
+
std::pair<StringRef, StringRef> nestingString(NestingType NT) {
switch (NT) {
case Function:
@@ -310,6 +322,8 @@ public:
return wasm::ValType::V128;
if (Type == "exnref")
return wasm::ValType::EXNREF;
+ if (Type == "externref")
+ return wasm::ValType::EXTERNREF;
return Optional<wasm::ValType>();
}
@@ -430,7 +444,7 @@ public:
Name = StringRef(NameLoc.getPointer(), Name.size());
// WebAssembly has instructions with / in them, which AsmLexer parses
- // as seperate tokens, so if we find such tokens immediately adjacent (no
+ // as separate tokens, so if we find such tokens immediately adjacent (no
// whitespace), expand the name to include them:
for (;;) {
auto &Sep = Lexer.getTok();
@@ -688,7 +702,7 @@ public:
// WebAssemblyAsmPrinter::EmitFunctionBodyStart.
// TODO: would be good to factor this into a common function, but the
// assembler and backend really don't share any common code, and this code
- // parses the locals seperately.
+ // parses the locals separately.
auto SymName = expectIdent();
if (SymName.empty())
return true;
@@ -720,7 +734,7 @@ public:
return true;
auto ExportName = expectIdent();
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
- WasmSym->setExportName(ExportName);
+ WasmSym->setExportName(storeName(ExportName));
TOut.emitExportName(WasmSym, ExportName);
}
@@ -732,7 +746,7 @@ public:
return true;
auto ImportModule = expectIdent();
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
- WasmSym->setImportModule(ImportModule);
+ WasmSym->setImportModule(storeName(ImportModule));
TOut.emitImportModule(WasmSym, ImportModule);
}
@@ -744,7 +758,7 @@ public:
return true;
auto ImportName = expectIdent();
auto WasmSym = cast<MCSymbolWasm>(Ctx.getOrCreateSymbol(SymName));
- WasmSym->setImportName(ImportName);
+ WasmSym->setImportName(storeName(ImportName));
TOut.emitImportName(WasmSym, ImportName);
}
@@ -787,7 +801,7 @@ public:
return error("Cannot parse .int expression: ", Lexer.getTok());
size_t NumBits = 0;
DirectiveID.getString().drop_front(4).getAsInteger(10, NumBits);
- Out.EmitValue(Val, NumBits / 8, End);
+ Out.emitValue(Val, NumBits / 8, End);
return expect(AsmToken::EndOfStatement, "EOL");
}
@@ -796,7 +810,7 @@ public:
std::string S;
if (Parser.parseEscapedString(S))
return error("Cannot parse string constant: ", Lexer.getTok());
- Out.EmitBytes(StringRef(S.c_str(), S.length() + 1));
+ Out.emitBytes(StringRef(S.c_str(), S.length() + 1));
return expect(AsmToken::EndOfStatement, "EOL");
}
@@ -834,7 +848,17 @@ public:
if (Op0.getImm() == -1)
Op0.setImm(Align);
}
- Out.EmitInstruction(Inst, getSTI());
+ if (getSTI().getTargetTriple().isArch64Bit()) {
+ // Upgrade 32-bit loads/stores to 64-bit. These mostly differ by having
+ // an offset64 arg instead of offset32, but to the assembler matcher
+ // they're both immediates, so the matcher cannot select between them.
+ auto Opc64 = WebAssembly::getWasm64Opcode(
+ static_cast<uint16_t>(Inst.getOpcode()));
+ if (Opc64 >= 0) {
+ Inst.setOpcode(Opc64);
+ }
+ }
+ Out.emitInstruction(Inst, getSTI());
if (CurrentState == EndFunction) {
onEndOfFunction();
} else {
@@ -879,6 +903,9 @@ public:
auto SecName = ".text." + SymName;
auto WS = getContext().getWasmSection(SecName, SectionKind::getText());
getStreamer().SwitchSection(WS);
+ // Also generate DWARF for this section if requested.
+ if (getContext().getGenDwarfForAssembly())
+ getContext().addGenDwarfSection(WS);
}
void onEndOfFunction() {
@@ -886,7 +913,7 @@ public:
// user.
if (!LastFunctionLabel) return;
auto TempSym = getContext().createLinkerPrivateTempSymbol();
- getStreamer().EmitLabel(TempSym);
+ getStreamer().emitLabel(TempSym);
auto Start = MCSymbolRefExpr::create(LastFunctionLabel, getContext());
auto End = MCSymbolRefExpr::create(TempSym, getContext());
auto Expr =
diff --git a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
index a8cb5d18537c..42fa6d58fffd 100644
--- a/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
+++ b/llvm/lib/Target/WebAssembly/Disassembler/WebAssemblyDisassembler.cpp
@@ -46,9 +46,10 @@ class WebAssemblyDisassembler final : public MCDisassembler {
DecodeStatus getInstruction(MCInst &Instr, uint64_t &Size,
ArrayRef<uint8_t> Bytes, uint64_t Address,
raw_ostream &CStream) const override;
- DecodeStatus onSymbolStart(StringRef Name, uint64_t &Size,
- ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &CStream) const override;
+ Optional<DecodeStatus> onSymbolStart(SymbolInfoTy &Symbol, uint64_t &Size,
+ ArrayRef<uint8_t> Bytes,
+ uint64_t Address,
+ raw_ostream &CStream) const override;
public:
WebAssemblyDisassembler(const MCSubtargetInfo &STI, MCContext &Ctx,
@@ -120,29 +121,29 @@ bool parseImmediate(MCInst &MI, uint64_t &Size, ArrayRef<uint8_t> Bytes) {
return true;
}
-MCDisassembler::DecodeStatus WebAssemblyDisassembler::onSymbolStart(
- StringRef Name, uint64_t &Size, ArrayRef<uint8_t> Bytes, uint64_t Address,
- raw_ostream &CStream) const {
+Optional<MCDisassembler::DecodeStatus> WebAssemblyDisassembler::onSymbolStart(
+ SymbolInfoTy &Symbol, uint64_t &Size, ArrayRef<uint8_t> Bytes,
+ uint64_t Address, raw_ostream &CStream) const {
Size = 0;
if (Address == 0) {
// Start of a code section: we're parsing only the function count.
int64_t FunctionCount;
if (!nextLEB(FunctionCount, Bytes, Size, false))
- return MCDisassembler::Fail;
+ return None;
outs() << " # " << FunctionCount << " functions in section.";
} else {
// Parse the start of a single function.
int64_t BodySize, LocalEntryCount;
if (!nextLEB(BodySize, Bytes, Size, false) ||
!nextLEB(LocalEntryCount, Bytes, Size, false))
- return MCDisassembler::Fail;
+ return None;
if (LocalEntryCount) {
outs() << " .local ";
for (int64_t I = 0; I < LocalEntryCount; I++) {
int64_t Count, Type;
if (!nextLEB(Count, Bytes, Size, false) ||
!nextLEB(Type, Bytes, Size, false))
- return MCDisassembler::Fail;
+ return None;
for (int64_t J = 0; J < Count; J++) {
if (I || J)
outs() << ", ";
@@ -198,6 +199,7 @@ MCDisassembler::DecodeStatus WebAssemblyDisassembler::getInstruction(
case WebAssembly::OPERAND_GLOBAL:
case WebAssembly::OPERAND_FUNCTION32:
case WebAssembly::OPERAND_OFFSET32:
+ case WebAssembly::OPERAND_OFFSET64:
case WebAssembly::OPERAND_P2ALIGN:
case WebAssembly::OPERAND_TYPEINDEX:
case WebAssembly::OPERAND_EVENT:
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
index 8314de41021f..8ecd7c53621d 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyAsmBackend.cpp
@@ -64,9 +64,6 @@ public:
return false;
}
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override {}
-
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
@@ -80,6 +77,7 @@ WebAssemblyAsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"fixup_sleb128_i32", 0, 5 * 8, 0},
{"fixup_sleb128_i64", 0, 10 * 8, 0},
{"fixup_uleb128_i32", 0, 5 * 8, 0},
+ {"fixup_uleb128_i64", 0, 10 * 8, 0},
};
if (Kind < FirstTargetFixupKind)
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
index 33e8de282955..92708dadd3e0 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyFixupKinds.h
@@ -17,6 +17,7 @@ enum Fixups {
fixup_sleb128_i32 = FirstTargetFixupKind, // 32-bit signed
fixup_sleb128_i64, // 64-bit signed
fixup_uleb128_i32, // 32-bit unsigned
+ fixup_uleb128_i64, // 64-bit unsigned
// Marker
LastTargetFixupKind,
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
index b262e06e55e7..f60b5fcd14ec 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.cpp
@@ -54,17 +54,28 @@ void WebAssemblyInstPrinter::printInst(const MCInst *MI, uint64_t Address,
// Print any additional variadic operands.
const MCInstrDesc &Desc = MII.get(MI->getOpcode());
if (Desc.isVariadic()) {
- if (Desc.getNumOperands() == 0 && MI->getNumOperands() > 0)
+ if ((Desc.getNumOperands() == 0 && MI->getNumOperands() > 0) ||
+ Desc.variadicOpsAreDefs())
OS << "\t";
- for (auto I = Desc.getNumOperands(), E = MI->getNumOperands(); I < E; ++I) {
- // FIXME: For CALL_INDIRECT_VOID, don't print a leading comma, because
- // we have an extra flags operand which is not currently printed, for
- // compatiblity reasons.
- if (I != 0 && ((MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID &&
- MI->getOpcode() != WebAssembly::CALL_INDIRECT_VOID_S) ||
- I != Desc.getNumOperands()))
+ unsigned Start = Desc.getNumOperands();
+ unsigned NumVariadicDefs = 0;
+ if (Desc.variadicOpsAreDefs()) {
+ // The number of variadic defs is encoded in an immediate by MCInstLower
+ NumVariadicDefs = MI->getOperand(0).getImm();
+ Start = 1;
+ }
+ bool NeedsComma = Desc.getNumOperands() > 0 && !Desc.variadicOpsAreDefs();
+ for (auto I = Start, E = MI->getNumOperands(); I < E; ++I) {
+ if (MI->getOpcode() == WebAssembly::CALL_INDIRECT &&
+ I - Start == NumVariadicDefs) {
+ // Skip type and flags arguments when printing for tests
+ ++I;
+ continue;
+ }
+ if (NeedsComma)
OS << ", ";
- printOperand(MI, I, OS);
+ printOperand(MI, I, OS, I - Start < NumVariadicDefs);
+ NeedsComma = true;
}
}
@@ -207,20 +218,21 @@ static std::string toString(const APFloat &FP) {
}
void WebAssemblyInstPrinter::printOperand(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+ raw_ostream &O, bool IsVariadicDef) {
const MCOperand &Op = MI->getOperand(OpNo);
if (Op.isReg()) {
+ const MCInstrDesc &Desc = MII.get(MI->getOpcode());
unsigned WAReg = Op.getReg();
if (int(WAReg) >= 0)
printRegName(O, WAReg);
- else if (OpNo >= MII.get(MI->getOpcode()).getNumDefs())
+ else if (OpNo >= Desc.getNumDefs() && !IsVariadicDef)
O << "$pop" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
else if (WAReg != WebAssemblyFunctionInfo::UnusedReg)
O << "$push" << WebAssemblyFunctionInfo::getWARegStackId(WAReg);
else
O << "$drop";
// Add a '=' suffix if this is a def.
- if (OpNo < MII.get(MI->getOpcode()).getNumDefs())
+ if (OpNo < MII.get(MI->getOpcode()).getNumDefs() || IsVariadicDef)
O << '=';
} else if (Op.isImm()) {
O << Op.getImm();
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
index bee85507f044..1387a1928b3f 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyInstPrinter.h
@@ -41,7 +41,8 @@ public:
const MCSubtargetInfo &STI, raw_ostream &OS) override;
// Used by tblegen code.
- void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printOperand(const MCInst *MI, unsigned OpNo, raw_ostream &O,
+ bool IsVariadicDef = false);
void printBrList(const MCInst *MI, unsigned OpNo, raw_ostream &O);
void printWebAssemblyP2AlignOperand(const MCInst *MI, unsigned OpNo,
raw_ostream &O);
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
index 1a4c57e66d2f..dfed3451e45b 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCCodeEmitter.cpp
@@ -158,6 +158,10 @@ void WebAssemblyMCCodeEmitter::encodeInstruction(
case WebAssembly::OPERAND_EVENT:
FixupKind = MCFixupKind(WebAssembly::fixup_uleb128_i32);
break;
+ case WebAssembly::OPERAND_OFFSET64:
+ FixupKind = MCFixupKind(WebAssembly::fixup_uleb128_i64);
+ PaddedSize = 10;
+ break;
default:
llvm_unreachable("unexpected symbolic operand kind");
}
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
index b339860a381d..02b310628ee1 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyMCTargetDesc.h
@@ -24,14 +24,10 @@ namespace llvm {
class MCAsmBackend;
class MCCodeEmitter;
-class MCContext;
class MCInstrInfo;
class MCObjectTargetWriter;
-class MCSubtargetInfo;
class MVT;
-class Target;
class Triple;
-class raw_pwrite_stream;
MCCodeEmitter *createWebAssemblyMCCodeEmitter(const MCInstrInfo &MCII);
@@ -68,6 +64,8 @@ enum OperandType {
OPERAND_FUNCTION32,
/// 32-bit unsigned memory offsets.
OPERAND_OFFSET32,
+ /// 64-bit unsigned memory offsets.
+ OPERAND_OFFSET64,
/// p2align immediate for load and store address alignment.
OPERAND_P2ALIGN,
/// signature immediate for block/loop.
@@ -149,216 +147,121 @@ wasm::ValType toValType(const MVT &Ty);
/// Return the default p2align value for a load or store with the given opcode.
inline unsigned GetDefaultP2AlignAny(unsigned Opc) {
switch (Opc) {
- case WebAssembly::LOAD8_S_I32:
- case WebAssembly::LOAD8_S_I32_S:
- case WebAssembly::LOAD8_U_I32:
- case WebAssembly::LOAD8_U_I32_S:
- case WebAssembly::LOAD8_S_I64:
- case WebAssembly::LOAD8_S_I64_S:
- case WebAssembly::LOAD8_U_I64:
- case WebAssembly::LOAD8_U_I64_S:
- case WebAssembly::ATOMIC_LOAD8_U_I32:
- case WebAssembly::ATOMIC_LOAD8_U_I32_S:
- case WebAssembly::ATOMIC_LOAD8_U_I64:
- case WebAssembly::ATOMIC_LOAD8_U_I64_S:
- case WebAssembly::STORE8_I32:
- case WebAssembly::STORE8_I32_S:
- case WebAssembly::STORE8_I64:
- case WebAssembly::STORE8_I64_S:
- case WebAssembly::ATOMIC_STORE8_I32:
- case WebAssembly::ATOMIC_STORE8_I32_S:
- case WebAssembly::ATOMIC_STORE8_I64:
- case WebAssembly::ATOMIC_STORE8_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_ADD_I32:
- case WebAssembly::ATOMIC_RMW8_U_ADD_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_ADD_I64:
- case WebAssembly::ATOMIC_RMW8_U_ADD_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_SUB_I32:
- case WebAssembly::ATOMIC_RMW8_U_SUB_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_SUB_I64:
- case WebAssembly::ATOMIC_RMW8_U_SUB_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_AND_I32:
- case WebAssembly::ATOMIC_RMW8_U_AND_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_AND_I64:
- case WebAssembly::ATOMIC_RMW8_U_AND_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_OR_I32:
- case WebAssembly::ATOMIC_RMW8_U_OR_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_OR_I64:
- case WebAssembly::ATOMIC_RMW8_U_OR_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_XOR_I32:
- case WebAssembly::ATOMIC_RMW8_U_XOR_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_XOR_I64:
- case WebAssembly::ATOMIC_RMW8_U_XOR_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_XCHG_I32:
- case WebAssembly::ATOMIC_RMW8_U_XCHG_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_XCHG_I64:
- case WebAssembly::ATOMIC_RMW8_U_XCHG_I64_S:
- case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32:
- case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I32_S:
- case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW8_U_CMPXCHG_I64_S:
- case WebAssembly::LOAD_SPLAT_v8x16:
- case WebAssembly::LOAD_SPLAT_v8x16_S:
+#define WASM_LOAD_STORE(NAME) \
+ case WebAssembly::NAME##_A32: \
+ case WebAssembly::NAME##_A64: \
+ case WebAssembly::NAME##_A32_S: \
+ case WebAssembly::NAME##_A64_S:
+ WASM_LOAD_STORE(LOAD8_S_I32)
+ WASM_LOAD_STORE(LOAD8_U_I32)
+ WASM_LOAD_STORE(LOAD8_S_I64)
+ WASM_LOAD_STORE(LOAD8_U_I64)
+ WASM_LOAD_STORE(ATOMIC_LOAD8_U_I32)
+ WASM_LOAD_STORE(ATOMIC_LOAD8_U_I64)
+ WASM_LOAD_STORE(STORE8_I32)
+ WASM_LOAD_STORE(STORE8_I64)
+ WASM_LOAD_STORE(ATOMIC_STORE8_I32)
+ WASM_LOAD_STORE(ATOMIC_STORE8_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_ADD_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_ADD_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_SUB_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_SUB_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_AND_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_AND_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_OR_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_OR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_XOR_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_XOR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_XCHG_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_XCHG_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW8_U_CMPXCHG_I64)
+ WASM_LOAD_STORE(LOAD_SPLAT_v8x16)
return 0;
- case WebAssembly::LOAD16_S_I32:
- case WebAssembly::LOAD16_S_I32_S:
- case WebAssembly::LOAD16_U_I32:
- case WebAssembly::LOAD16_U_I32_S:
- case WebAssembly::LOAD16_S_I64:
- case WebAssembly::LOAD16_S_I64_S:
- case WebAssembly::LOAD16_U_I64:
- case WebAssembly::LOAD16_U_I64_S:
- case WebAssembly::ATOMIC_LOAD16_U_I32:
- case WebAssembly::ATOMIC_LOAD16_U_I32_S:
- case WebAssembly::ATOMIC_LOAD16_U_I64:
- case WebAssembly::ATOMIC_LOAD16_U_I64_S:
- case WebAssembly::STORE16_I32:
- case WebAssembly::STORE16_I32_S:
- case WebAssembly::STORE16_I64:
- case WebAssembly::STORE16_I64_S:
- case WebAssembly::ATOMIC_STORE16_I32:
- case WebAssembly::ATOMIC_STORE16_I32_S:
- case WebAssembly::ATOMIC_STORE16_I64:
- case WebAssembly::ATOMIC_STORE16_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_ADD_I32:
- case WebAssembly::ATOMIC_RMW16_U_ADD_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_ADD_I64:
- case WebAssembly::ATOMIC_RMW16_U_ADD_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_SUB_I32:
- case WebAssembly::ATOMIC_RMW16_U_SUB_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_SUB_I64:
- case WebAssembly::ATOMIC_RMW16_U_SUB_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_AND_I32:
- case WebAssembly::ATOMIC_RMW16_U_AND_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_AND_I64:
- case WebAssembly::ATOMIC_RMW16_U_AND_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_OR_I32:
- case WebAssembly::ATOMIC_RMW16_U_OR_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_OR_I64:
- case WebAssembly::ATOMIC_RMW16_U_OR_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_XOR_I32:
- case WebAssembly::ATOMIC_RMW16_U_XOR_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_XOR_I64:
- case WebAssembly::ATOMIC_RMW16_U_XOR_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_XCHG_I32:
- case WebAssembly::ATOMIC_RMW16_U_XCHG_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_XCHG_I64:
- case WebAssembly::ATOMIC_RMW16_U_XCHG_I64_S:
- case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32:
- case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I32_S:
- case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW16_U_CMPXCHG_I64_S:
- case WebAssembly::LOAD_SPLAT_v16x8:
- case WebAssembly::LOAD_SPLAT_v16x8_S:
+ WASM_LOAD_STORE(LOAD16_S_I32)
+ WASM_LOAD_STORE(LOAD16_U_I32)
+ WASM_LOAD_STORE(LOAD16_S_I64)
+ WASM_LOAD_STORE(LOAD16_U_I64)
+ WASM_LOAD_STORE(ATOMIC_LOAD16_U_I32)
+ WASM_LOAD_STORE(ATOMIC_LOAD16_U_I64)
+ WASM_LOAD_STORE(STORE16_I32)
+ WASM_LOAD_STORE(STORE16_I64)
+ WASM_LOAD_STORE(ATOMIC_STORE16_I32)
+ WASM_LOAD_STORE(ATOMIC_STORE16_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_ADD_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_ADD_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_SUB_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_SUB_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_AND_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_AND_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_OR_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_OR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_XOR_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_XOR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_XCHG_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_XCHG_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW16_U_CMPXCHG_I64)
+ WASM_LOAD_STORE(LOAD_SPLAT_v16x8)
return 1;
- case WebAssembly::LOAD_I32:
- case WebAssembly::LOAD_I32_S:
- case WebAssembly::LOAD_F32:
- case WebAssembly::LOAD_F32_S:
- case WebAssembly::STORE_I32:
- case WebAssembly::STORE_I32_S:
- case WebAssembly::STORE_F32:
- case WebAssembly::STORE_F32_S:
- case WebAssembly::LOAD32_S_I64:
- case WebAssembly::LOAD32_S_I64_S:
- case WebAssembly::LOAD32_U_I64:
- case WebAssembly::LOAD32_U_I64_S:
- case WebAssembly::STORE32_I64:
- case WebAssembly::STORE32_I64_S:
- case WebAssembly::ATOMIC_LOAD_I32:
- case WebAssembly::ATOMIC_LOAD_I32_S:
- case WebAssembly::ATOMIC_LOAD32_U_I64:
- case WebAssembly::ATOMIC_LOAD32_U_I64_S:
- case WebAssembly::ATOMIC_STORE_I32:
- case WebAssembly::ATOMIC_STORE_I32_S:
- case WebAssembly::ATOMIC_STORE32_I64:
- case WebAssembly::ATOMIC_STORE32_I64_S:
- case WebAssembly::ATOMIC_RMW_ADD_I32:
- case WebAssembly::ATOMIC_RMW_ADD_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_ADD_I64:
- case WebAssembly::ATOMIC_RMW32_U_ADD_I64_S:
- case WebAssembly::ATOMIC_RMW_SUB_I32:
- case WebAssembly::ATOMIC_RMW_SUB_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_SUB_I64:
- case WebAssembly::ATOMIC_RMW32_U_SUB_I64_S:
- case WebAssembly::ATOMIC_RMW_AND_I32:
- case WebAssembly::ATOMIC_RMW_AND_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_AND_I64:
- case WebAssembly::ATOMIC_RMW32_U_AND_I64_S:
- case WebAssembly::ATOMIC_RMW_OR_I32:
- case WebAssembly::ATOMIC_RMW_OR_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_OR_I64:
- case WebAssembly::ATOMIC_RMW32_U_OR_I64_S:
- case WebAssembly::ATOMIC_RMW_XOR_I32:
- case WebAssembly::ATOMIC_RMW_XOR_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_XOR_I64:
- case WebAssembly::ATOMIC_RMW32_U_XOR_I64_S:
- case WebAssembly::ATOMIC_RMW_XCHG_I32:
- case WebAssembly::ATOMIC_RMW_XCHG_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_XCHG_I64:
- case WebAssembly::ATOMIC_RMW32_U_XCHG_I64_S:
- case WebAssembly::ATOMIC_RMW_CMPXCHG_I32:
- case WebAssembly::ATOMIC_RMW_CMPXCHG_I32_S:
- case WebAssembly::ATOMIC_RMW32_U_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW32_U_CMPXCHG_I64_S:
- case WebAssembly::ATOMIC_NOTIFY:
- case WebAssembly::ATOMIC_NOTIFY_S:
- case WebAssembly::ATOMIC_WAIT_I32:
- case WebAssembly::ATOMIC_WAIT_I32_S:
- case WebAssembly::LOAD_SPLAT_v32x4:
- case WebAssembly::LOAD_SPLAT_v32x4_S:
+ WASM_LOAD_STORE(LOAD_I32)
+ WASM_LOAD_STORE(LOAD_F32)
+ WASM_LOAD_STORE(STORE_I32)
+ WASM_LOAD_STORE(STORE_F32)
+ WASM_LOAD_STORE(LOAD32_S_I64)
+ WASM_LOAD_STORE(LOAD32_U_I64)
+ WASM_LOAD_STORE(STORE32_I64)
+ WASM_LOAD_STORE(ATOMIC_LOAD_I32)
+ WASM_LOAD_STORE(ATOMIC_LOAD32_U_I64)
+ WASM_LOAD_STORE(ATOMIC_STORE_I32)
+ WASM_LOAD_STORE(ATOMIC_STORE32_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_ADD_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_ADD_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_SUB_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_SUB_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_AND_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_AND_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_OR_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_OR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_XOR_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_XOR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_XCHG_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_XCHG_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_CMPXCHG_I32)
+ WASM_LOAD_STORE(ATOMIC_RMW32_U_CMPXCHG_I64)
+ WASM_LOAD_STORE(ATOMIC_NOTIFY)
+ WASM_LOAD_STORE(ATOMIC_WAIT_I32)
+ WASM_LOAD_STORE(LOAD_SPLAT_v32x4)
return 2;
- case WebAssembly::LOAD_I64:
- case WebAssembly::LOAD_I64_S:
- case WebAssembly::LOAD_F64:
- case WebAssembly::LOAD_F64_S:
- case WebAssembly::STORE_I64:
- case WebAssembly::STORE_I64_S:
- case WebAssembly::STORE_F64:
- case WebAssembly::STORE_F64_S:
- case WebAssembly::ATOMIC_LOAD_I64:
- case WebAssembly::ATOMIC_LOAD_I64_S:
- case WebAssembly::ATOMIC_STORE_I64:
- case WebAssembly::ATOMIC_STORE_I64_S:
- case WebAssembly::ATOMIC_RMW_ADD_I64:
- case WebAssembly::ATOMIC_RMW_ADD_I64_S:
- case WebAssembly::ATOMIC_RMW_SUB_I64:
- case WebAssembly::ATOMIC_RMW_SUB_I64_S:
- case WebAssembly::ATOMIC_RMW_AND_I64:
- case WebAssembly::ATOMIC_RMW_AND_I64_S:
- case WebAssembly::ATOMIC_RMW_OR_I64:
- case WebAssembly::ATOMIC_RMW_OR_I64_S:
- case WebAssembly::ATOMIC_RMW_XOR_I64:
- case WebAssembly::ATOMIC_RMW_XOR_I64_S:
- case WebAssembly::ATOMIC_RMW_XCHG_I64:
- case WebAssembly::ATOMIC_RMW_XCHG_I64_S:
- case WebAssembly::ATOMIC_RMW_CMPXCHG_I64:
- case WebAssembly::ATOMIC_RMW_CMPXCHG_I64_S:
- case WebAssembly::ATOMIC_WAIT_I64:
- case WebAssembly::ATOMIC_WAIT_I64_S:
- case WebAssembly::LOAD_SPLAT_v64x2:
- case WebAssembly::LOAD_SPLAT_v64x2_S:
- case WebAssembly::LOAD_EXTEND_S_v8i16:
- case WebAssembly::LOAD_EXTEND_S_v8i16_S:
- case WebAssembly::LOAD_EXTEND_U_v8i16:
- case WebAssembly::LOAD_EXTEND_U_v8i16_S:
- case WebAssembly::LOAD_EXTEND_S_v4i32:
- case WebAssembly::LOAD_EXTEND_S_v4i32_S:
- case WebAssembly::LOAD_EXTEND_U_v4i32:
- case WebAssembly::LOAD_EXTEND_U_v4i32_S:
- case WebAssembly::LOAD_EXTEND_S_v2i64:
- case WebAssembly::LOAD_EXTEND_S_v2i64_S:
- case WebAssembly::LOAD_EXTEND_U_v2i64:
- case WebAssembly::LOAD_EXTEND_U_v2i64_S:
+ WASM_LOAD_STORE(LOAD_I64)
+ WASM_LOAD_STORE(LOAD_F64)
+ WASM_LOAD_STORE(STORE_I64)
+ WASM_LOAD_STORE(STORE_F64)
+ WASM_LOAD_STORE(ATOMIC_LOAD_I64)
+ WASM_LOAD_STORE(ATOMIC_STORE_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_ADD_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_SUB_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_AND_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_OR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_XOR_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_XCHG_I64)
+ WASM_LOAD_STORE(ATOMIC_RMW_CMPXCHG_I64)
+ WASM_LOAD_STORE(ATOMIC_WAIT_I64)
+ WASM_LOAD_STORE(LOAD_SPLAT_v64x2)
+ WASM_LOAD_STORE(LOAD_EXTEND_S_v8i16)
+ WASM_LOAD_STORE(LOAD_EXTEND_U_v8i16)
+ WASM_LOAD_STORE(LOAD_EXTEND_S_v4i32)
+ WASM_LOAD_STORE(LOAD_EXTEND_U_v4i32)
+ WASM_LOAD_STORE(LOAD_EXTEND_S_v2i64)
+ WASM_LOAD_STORE(LOAD_EXTEND_U_v2i64)
return 3;
- case WebAssembly::LOAD_V128:
- case WebAssembly::LOAD_V128_S:
- case WebAssembly::STORE_V128:
- case WebAssembly::STORE_V128_S:
+ WASM_LOAD_STORE(LOAD_V128)
+ WASM_LOAD_STORE(STORE_V128)
return 4;
default:
return -1;
}
+#undef WASM_LOAD_STORE
}
inline unsigned GetDefaultP2Align(unsigned Opc) {
@@ -441,30 +344,8 @@ inline bool isTee(unsigned Opc) {
inline bool isCallDirect(unsigned Opc) {
switch (Opc) {
- case WebAssembly::CALL_VOID:
- case WebAssembly::CALL_VOID_S:
- case WebAssembly::CALL_i32:
- case WebAssembly::CALL_i32_S:
- case WebAssembly::CALL_i64:
- case WebAssembly::CALL_i64_S:
- case WebAssembly::CALL_f32:
- case WebAssembly::CALL_f32_S:
- case WebAssembly::CALL_f64:
- case WebAssembly::CALL_f64_S:
- case WebAssembly::CALL_v16i8:
- case WebAssembly::CALL_v16i8_S:
- case WebAssembly::CALL_v8i16:
- case WebAssembly::CALL_v8i16_S:
- case WebAssembly::CALL_v4i32:
- case WebAssembly::CALL_v4i32_S:
- case WebAssembly::CALL_v2i64:
- case WebAssembly::CALL_v2i64_S:
- case WebAssembly::CALL_v4f32:
- case WebAssembly::CALL_v4f32_S:
- case WebAssembly::CALL_v2f64:
- case WebAssembly::CALL_v2f64_S:
- case WebAssembly::CALL_exnref:
- case WebAssembly::CALL_exnref_S:
+ case WebAssembly::CALL:
+ case WebAssembly::CALL_S:
case WebAssembly::RET_CALL:
case WebAssembly::RET_CALL_S:
return true;
@@ -475,30 +356,8 @@ inline bool isCallDirect(unsigned Opc) {
inline bool isCallIndirect(unsigned Opc) {
switch (Opc) {
- case WebAssembly::CALL_INDIRECT_VOID:
- case WebAssembly::CALL_INDIRECT_VOID_S:
- case WebAssembly::CALL_INDIRECT_i32:
- case WebAssembly::CALL_INDIRECT_i32_S:
- case WebAssembly::CALL_INDIRECT_i64:
- case WebAssembly::CALL_INDIRECT_i64_S:
- case WebAssembly::CALL_INDIRECT_f32:
- case WebAssembly::CALL_INDIRECT_f32_S:
- case WebAssembly::CALL_INDIRECT_f64:
- case WebAssembly::CALL_INDIRECT_f64_S:
- case WebAssembly::CALL_INDIRECT_v16i8:
- case WebAssembly::CALL_INDIRECT_v16i8_S:
- case WebAssembly::CALL_INDIRECT_v8i16:
- case WebAssembly::CALL_INDIRECT_v8i16_S:
- case WebAssembly::CALL_INDIRECT_v4i32:
- case WebAssembly::CALL_INDIRECT_v4i32_S:
- case WebAssembly::CALL_INDIRECT_v2i64:
- case WebAssembly::CALL_INDIRECT_v2i64_S:
- case WebAssembly::CALL_INDIRECT_v4f32:
- case WebAssembly::CALL_INDIRECT_v4f32_S:
- case WebAssembly::CALL_INDIRECT_v2f64:
- case WebAssembly::CALL_INDIRECT_v2f64_S:
- case WebAssembly::CALL_INDIRECT_exnref:
- case WebAssembly::CALL_INDIRECT_exnref_S:
+ case WebAssembly::CALL_INDIRECT:
+ case WebAssembly::CALL_INDIRECT_S:
case WebAssembly::RET_CALL_INDIRECT:
case WebAssembly::RET_CALL_INDIRECT_S:
return true;
@@ -507,66 +366,15 @@ inline bool isCallIndirect(unsigned Opc) {
}
}
-/// Returns the operand number of a callee, assuming the argument is a call
-/// instruction.
-inline unsigned getCalleeOpNo(unsigned Opc) {
- switch (Opc) {
- case WebAssembly::CALL_VOID:
- case WebAssembly::CALL_VOID_S:
- case WebAssembly::CALL_INDIRECT_VOID:
- case WebAssembly::CALL_INDIRECT_VOID_S:
- case WebAssembly::RET_CALL:
- case WebAssembly::RET_CALL_S:
- case WebAssembly::RET_CALL_INDIRECT:
- case WebAssembly::RET_CALL_INDIRECT_S:
- return 0;
- case WebAssembly::CALL_i32:
- case WebAssembly::CALL_i32_S:
- case WebAssembly::CALL_i64:
- case WebAssembly::CALL_i64_S:
- case WebAssembly::CALL_f32:
- case WebAssembly::CALL_f32_S:
- case WebAssembly::CALL_f64:
- case WebAssembly::CALL_f64_S:
- case WebAssembly::CALL_v16i8:
- case WebAssembly::CALL_v16i8_S:
- case WebAssembly::CALL_v8i16:
- case WebAssembly::CALL_v8i16_S:
- case WebAssembly::CALL_v4i32:
- case WebAssembly::CALL_v4i32_S:
- case WebAssembly::CALL_v2i64:
- case WebAssembly::CALL_v2i64_S:
- case WebAssembly::CALL_v4f32:
- case WebAssembly::CALL_v4f32_S:
- case WebAssembly::CALL_v2f64:
- case WebAssembly::CALL_v2f64_S:
- case WebAssembly::CALL_exnref:
- case WebAssembly::CALL_exnref_S:
- case WebAssembly::CALL_INDIRECT_i32:
- case WebAssembly::CALL_INDIRECT_i32_S:
- case WebAssembly::CALL_INDIRECT_i64:
- case WebAssembly::CALL_INDIRECT_i64_S:
- case WebAssembly::CALL_INDIRECT_f32:
- case WebAssembly::CALL_INDIRECT_f32_S:
- case WebAssembly::CALL_INDIRECT_f64:
- case WebAssembly::CALL_INDIRECT_f64_S:
- case WebAssembly::CALL_INDIRECT_v16i8:
- case WebAssembly::CALL_INDIRECT_v16i8_S:
- case WebAssembly::CALL_INDIRECT_v8i16:
- case WebAssembly::CALL_INDIRECT_v8i16_S:
- case WebAssembly::CALL_INDIRECT_v4i32:
- case WebAssembly::CALL_INDIRECT_v4i32_S:
- case WebAssembly::CALL_INDIRECT_v2i64:
- case WebAssembly::CALL_INDIRECT_v2i64_S:
- case WebAssembly::CALL_INDIRECT_v4f32:
- case WebAssembly::CALL_INDIRECT_v4f32_S:
- case WebAssembly::CALL_INDIRECT_v2f64:
- case WebAssembly::CALL_INDIRECT_v2f64_S:
- case WebAssembly::CALL_INDIRECT_exnref:
- case WebAssembly::CALL_INDIRECT_exnref_S:
- return 1;
+inline bool isBrTable(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::BR_TABLE_I32:
+ case WebAssembly::BR_TABLE_I32_S:
+ case WebAssembly::BR_TABLE_I64:
+ case WebAssembly::BR_TABLE_I64_S:
+ return true;
default:
- llvm_unreachable("Not a call instruction");
+ return false;
}
}
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
index 7c21ed5f974e..e954eeaebb14 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.cpp
@@ -28,7 +28,7 @@ WebAssemblyTargetStreamer::WebAssemblyTargetStreamer(MCStreamer &S)
: MCTargetStreamer(S) {}
void WebAssemblyTargetStreamer::emitValueType(wasm::ValType Type) {
- Streamer.EmitIntValue(uint8_t(Type), 1);
+ Streamer.emitIntValue(uint8_t(Type), 1);
}
WebAssemblyTargetAsmStreamer::WebAssemblyTargetAsmStreamer(
@@ -113,9 +113,9 @@ void WebAssemblyTargetWasmStreamer::emitLocal(ArrayRef<wasm::ValType> Types) {
++Grouped.back().second;
}
- Streamer.EmitULEB128IntValue(Grouped.size());
+ Streamer.emitULEB128IntValue(Grouped.size());
for (auto Pair : Grouped) {
- Streamer.EmitULEB128IntValue(Pair.second);
+ Streamer.emitULEB128IntValue(Pair.second);
emitValueType(Pair.first);
}
}
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
index 9aee1a06c956..d6fba05c9986 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyTargetStreamer.h
@@ -21,7 +21,6 @@
namespace llvm {
-class MCWasmStreamer;
class MCSymbolWasm;
/// WebAssembly-specific streamer interface, to implement support
diff --git a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
index e7a599e3e175..779e921c1d94 100644
--- a/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
+++ b/llvm/lib/Target/WebAssembly/MCTargetDesc/WebAssemblyWasmObjectWriter.cpp
@@ -78,7 +78,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
return wasm::R_WASM_TABLE_INDEX_REL_SLEB;
case MCSymbolRefExpr::VK_WASM_MBREL:
assert(SymA.isData());
- return wasm::R_WASM_MEMORY_ADDR_REL_SLEB;
+ return is64Bit() ? wasm::R_WASM_MEMORY_ADDR_REL_SLEB64
+ : wasm::R_WASM_MEMORY_ADDR_REL_SLEB;
case MCSymbolRefExpr::VK_WASM_TYPEINDEX:
return wasm::R_WASM_TYPE_INDEX_LEB;
default:
@@ -91,7 +92,8 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
return wasm::R_WASM_TABLE_INDEX_SLEB;
return wasm::R_WASM_MEMORY_ADDR_SLEB;
case WebAssembly::fixup_sleb128_i64:
- llvm_unreachable("fixup_sleb128_i64 not implemented yet");
+ assert(SymA.isData());
+ return wasm::R_WASM_MEMORY_ADDR_SLEB64;
case WebAssembly::fixup_uleb128_i32:
if (SymA.isGlobal())
return wasm::R_WASM_GLOBAL_INDEX_LEB;
@@ -100,9 +102,14 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
if (SymA.isEvent())
return wasm::R_WASM_EVENT_INDEX_LEB;
return wasm::R_WASM_MEMORY_ADDR_LEB;
+ case WebAssembly::fixup_uleb128_i64:
+ assert(SymA.isData());
+ return wasm::R_WASM_MEMORY_ADDR_LEB64;
case FK_Data_4:
if (SymA.isFunction())
return wasm::R_WASM_TABLE_INDEX_I32;
+ if (SymA.isGlobal())
+ return wasm::R_WASM_GLOBAL_INDEX_I32;
if (auto Section = static_cast<const MCSectionWasm *>(
getFixupSection(Fixup.getValue()))) {
if (Section->getKind().isText())
@@ -111,6 +118,9 @@ unsigned WebAssemblyWasmObjectWriter::getRelocType(const MCValue &Target,
return wasm::R_WASM_SECTION_OFFSET_I32;
}
return wasm::R_WASM_MEMORY_ADDR_I32;
+ case FK_Data_8:
+ assert(SymA.isData());
+ return wasm::R_WASM_MEMORY_ADDR_I64;
default:
llvm_unreachable("unimplemented fixup kind");
}
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
index 87317f8a7f1e..f9a96819905f 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.cpp
@@ -32,3 +32,9 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTargetInfo() {
RegisterTarget<Triple::wasm64> Y(getTheWebAssemblyTarget64(), "wasm64",
"WebAssembly 64-bit", "WebAssembly");
}
+
+// Defines llvm::WebAssembly::getWasm64Opcode and llvm::WebAssembly::getStackOpcode,
+// which have to be in a shared location between CodeGen and MC.
+#define GET_INSTRMAP_INFO 1
+#define GET_INSTRINFO_ENUM 1
+#include "WebAssemblyGenInstrInfo.inc"
diff --git a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
index a7427f78c72c..be7a632331c8 100644
--- a/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
+++ b/llvm/lib/Target/WebAssembly/TargetInfo/WebAssemblyTargetInfo.h
@@ -21,6 +21,13 @@ class Target;
Target &getTheWebAssemblyTarget32();
Target &getTheWebAssemblyTarget64();
+namespace WebAssembly {
+
+int getStackOpcode(unsigned short Opcode);
+int getWasm64Opcode(unsigned short Opcode);
+
+} // namespace WebAssembly
+
} // namespace llvm
#endif // LLVM_LIB_TARGET_WEBASSEMBLY_TARGETINFO_WEBASSEMBLYTARGETINFO_H
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.h b/llvm/lib/Target/WebAssembly/WebAssembly.h
index fcd48e0096b6..9ce02f7731e0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.h
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.h
@@ -44,6 +44,7 @@ FunctionPass *createWebAssemblyOptimizeLiveIntervals();
FunctionPass *createWebAssemblyMemIntrinsicResults();
FunctionPass *createWebAssemblyRegStackify();
FunctionPass *createWebAssemblyRegColoring();
+FunctionPass *createWebAssemblyFixBrTableDefaults();
FunctionPass *createWebAssemblyFixIrreducibleControlFlow();
FunctionPass *createWebAssemblyLateEHPrepare();
FunctionPass *createWebAssemblyCFGSort();
@@ -51,8 +52,8 @@ FunctionPass *createWebAssemblyCFGStackify();
FunctionPass *createWebAssemblyExplicitLocals();
FunctionPass *createWebAssemblyLowerBrUnless();
FunctionPass *createWebAssemblyRegNumbering();
+FunctionPass *createWebAssemblyDebugFixup();
FunctionPass *createWebAssemblyPeephole();
-FunctionPass *createWebAssemblyCallIndirectFixup();
// PassRegistry initialization declarations.
void initializeWebAssemblyAddMissingPrototypesPass(PassRegistry &);
@@ -68,6 +69,7 @@ void initializeWebAssemblyOptimizeLiveIntervalsPass(PassRegistry &);
void initializeWebAssemblyMemIntrinsicResultsPass(PassRegistry &);
void initializeWebAssemblyRegStackifyPass(PassRegistry &);
void initializeWebAssemblyRegColoringPass(PassRegistry &);
+void initializeWebAssemblyFixBrTableDefaultsPass(PassRegistry &);
void initializeWebAssemblyFixIrreducibleControlFlowPass(PassRegistry &);
void initializeWebAssemblyLateEHPreparePass(PassRegistry &);
void initializeWebAssemblyExceptionInfoPass(PassRegistry &);
@@ -76,11 +78,20 @@ void initializeWebAssemblyCFGStackifyPass(PassRegistry &);
void initializeWebAssemblyExplicitLocalsPass(PassRegistry &);
void initializeWebAssemblyLowerBrUnlessPass(PassRegistry &);
void initializeWebAssemblyRegNumberingPass(PassRegistry &);
+void initializeWebAssemblyDebugFixupPass(PassRegistry &);
void initializeWebAssemblyPeepholePass(PassRegistry &);
-void initializeWebAssemblyCallIndirectFixupPass(PassRegistry &);
namespace WebAssembly {
-enum TargetIndex { TI_LOCAL_START, TI_GLOBAL_START, TI_OPERAND_STACK_START };
+enum TargetIndex {
+ // Followed by a local index (ULEB).
+ TI_LOCAL,
+ // Followed by an absolute global index (ULEB). DEPRECATED.
+ TI_GLOBAL_FIXED,
+ TI_OPERAND_STACK,
+ // Followed by a compilation unit relative global index (uint32_t)
+ // that will have an associated relocation.
+ TI_GLOBAL_RELOC
+};
} // end namespace WebAssembly
} // end namespace llvm
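For reference, the new target indices are consumed later in this patch (WebAssemblyDebugValueManager and the new DebugFixup pass); the pattern looks like the following sketch, where the operands and index values come from those passes:

    // Rewrite a DBG_VALUE register operand into a target-index operand.
    MachineOperand &Op = DBI->getDebugOperand(0);
    Op.ChangeToTargetIndex(WebAssembly::TI_LOCAL, LocalId);       // wasm local
    // ...or, for a value living on the wasm operand stack:
    MO.ChangeToTargetIndex(WebAssembly::TI_OPERAND_STACK, Depth); // stack depth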
diff --git a/llvm/lib/Target/WebAssembly/WebAssembly.td b/llvm/lib/Target/WebAssembly/WebAssembly.td
index b0b8a9b996a3..2c18bf2c3abe 100644
--- a/llvm/lib/Target/WebAssembly/WebAssembly.td
+++ b/llvm/lib/Target/WebAssembly/WebAssembly.td
@@ -66,6 +66,10 @@ def FeatureMutableGlobals :
SubtargetFeature<"mutable-globals", "HasMutableGlobals", "true",
"Enable mutable globals">;
+def FeatureReferenceTypes :
+ SubtargetFeature<"reference-types", "HasReferenceTypes", "true",
+ "Enable reference types">;
+
//===----------------------------------------------------------------------===//
// Architectures.
//===----------------------------------------------------------------------===//
@@ -98,7 +102,8 @@ def : ProcessorModel<"generic", NoSchedModel, []>;
def : ProcessorModel<"bleeding-edge", NoSchedModel,
[FeatureSIMD128, FeatureAtomics,
FeatureNontrappingFPToInt, FeatureSignExt,
- FeatureMutableGlobals]>;
+ FeatureMutableGlobals, FeatureBulkMemory,
+ FeatureTailCall]>;
//===----------------------------------------------------------------------===//
// Target Declaration
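Usage note (an assumption about the existing llc flags, not something this patch changes): the expanded set is selected with -mcpu=bleeding-edge, and the new feature can be toggled individually with -mattr=+reference-types or -mattr=-reference-types, following the SubtargetFeature names defined above.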
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
index b7a701f15782..530a55cda0e5 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAddMissingPrototypes.cpp
@@ -132,7 +132,7 @@ bool WebAssemblyAddMissingPrototypes::runOnModule(Module &M) {
for (auto &Pair : Replacements) {
Function *OldF = Pair.first;
Function *NewF = Pair.second;
- std::string Name = OldF->getName();
+ std::string Name = std::string(OldF->getName());
M.getFunctionList().push_back(NewF);
OldF->replaceAllUsesWith(
ConstantExpr::getPointerBitCastOrAddrSpaceCast(NewF, OldF->getType()));
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
index adcb24b4be53..96fa13d30729 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.cpp
@@ -85,7 +85,7 @@ WebAssemblyTargetStreamer *WebAssemblyAsmPrinter::getTargetStreamer() {
// WebAssemblyAsmPrinter Implementation.
//===----------------------------------------------------------------------===//
-void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
+void WebAssemblyAsmPrinter::emitEndOfAsmFile(Module &M) {
for (auto &It : OutContext.getSymbols()) {
// Emit a .globaltype and .eventtype declaration.
auto Sym = cast<MCSymbolWasm>(It.getValue());
@@ -103,7 +103,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (F.isDeclarationForLinker()) {
SmallVector<MVT, 4> Results;
SmallVector<MVT, 4> Params;
- computeSignatureVTs(F.getFunctionType(), F, TM, Params, Results);
+ computeSignatureVTs(F.getFunctionType(), &F, F, TM, Params, Results);
auto *Sym = cast<MCSymbolWasm>(getSymbol(&F));
Sym->setType(wasm::WASM_SYMBOL_TYPE_FUNCTION);
if (!Sym->getSignature()) {
@@ -122,14 +122,14 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
F.hasFnAttribute("wasm-import-module")) {
StringRef Name =
F.getFnAttribute("wasm-import-module").getValueAsString();
- Sym->setImportModule(Name);
+ Sym->setImportModule(storeName(Name));
getTargetStreamer()->emitImportModule(Sym, Name);
}
if (TM.getTargetTriple().isOSBinFormatWasm() &&
F.hasFnAttribute("wasm-import-name")) {
StringRef Name =
F.getFnAttribute("wasm-import-name").getValueAsString();
- Sym->setImportName(Name);
+ Sym->setImportName(storeName(Name));
getTargetStreamer()->emitImportName(Sym, Name);
}
}
@@ -137,7 +137,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
if (F.hasFnAttribute("wasm-export-name")) {
auto *Sym = cast<MCSymbolWasm>(getSymbol(&F));
StringRef Name = F.getFnAttribute("wasm-export-name").getValueAsString();
- Sym->setExportName(Name);
+ Sym->setExportName(storeName(Name));
getTargetStreamer()->emitExportName(Sym, Name);
}
}
@@ -167,7 +167,7 @@ void WebAssemblyAsmPrinter::EmitEndOfAsmFile(Module &M) {
MCSectionWasm *MySection =
OutContext.getWasmSection(SectionName, SectionKind::getMetadata());
OutStreamer->SwitchSection(MySection);
- OutStreamer->EmitBytes(Contents->getString());
+ OutStreamer->emitBytes(Contents->getString());
OutStreamer->PopSection();
}
}
@@ -208,19 +208,19 @@ void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) {
".custom_section.producers", SectionKind::getMetadata());
OutStreamer->PushSection();
OutStreamer->SwitchSection(Producers);
- OutStreamer->EmitULEB128IntValue(FieldCount);
+ OutStreamer->emitULEB128IntValue(FieldCount);
for (auto &Producers : {std::make_pair("language", &Languages),
std::make_pair("processed-by", &Tools)}) {
if (Producers.second->empty())
continue;
- OutStreamer->EmitULEB128IntValue(strlen(Producers.first));
- OutStreamer->EmitBytes(Producers.first);
- OutStreamer->EmitULEB128IntValue(Producers.second->size());
+ OutStreamer->emitULEB128IntValue(strlen(Producers.first));
+ OutStreamer->emitBytes(Producers.first);
+ OutStreamer->emitULEB128IntValue(Producers.second->size());
for (auto &Producer : *Producers.second) {
- OutStreamer->EmitULEB128IntValue(Producer.first.size());
- OutStreamer->EmitBytes(Producer.first);
- OutStreamer->EmitULEB128IntValue(Producer.second.size());
- OutStreamer->EmitBytes(Producer.second);
+ OutStreamer->emitULEB128IntValue(Producer.first.size());
+ OutStreamer->emitBytes(Producer.first);
+ OutStreamer->emitULEB128IntValue(Producer.second.size());
+ OutStreamer->emitBytes(Producer.second);
}
}
OutStreamer->PopSection();
@@ -230,20 +230,20 @@ void WebAssemblyAsmPrinter::EmitProducerInfo(Module &M) {
void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) {
struct FeatureEntry {
uint8_t Prefix;
- StringRef Name;
+ std::string Name;
};
// Read target features and linkage policies from module metadata
SmallVector<FeatureEntry, 4> EmittedFeatures;
- for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) {
- std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str();
+ auto EmitFeature = [&](std::string Feature) {
+ std::string MDKey = (StringRef("wasm-feature-") + Feature).str();
Metadata *Policy = M.getModuleFlag(MDKey);
if (Policy == nullptr)
- continue;
+ return;
FeatureEntry Entry;
Entry.Prefix = 0;
- Entry.Name = KV.Key;
+ Entry.Name = Feature;
if (auto *MD = cast<ConstantAsMetadata>(Policy))
if (auto *I = cast<ConstantInt>(MD->getValue()))
@@ -253,10 +253,16 @@ void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) {
if (Entry.Prefix != wasm::WASM_FEATURE_PREFIX_USED &&
Entry.Prefix != wasm::WASM_FEATURE_PREFIX_REQUIRED &&
Entry.Prefix != wasm::WASM_FEATURE_PREFIX_DISALLOWED)
- continue;
+ return;
EmittedFeatures.push_back(Entry);
+ };
+
+ for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) {
+ EmitFeature(KV.Key);
}
+ // This pseudo-feature tells the linker whether shared memory would be safe.
+ EmitFeature("shared-mem");
if (EmittedFeatures.size() == 0)
return;
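A hedged sketch (an assumption about frontend usage, not code from this patch) of how a module M would opt into the pseudo-feature via the same "wasm-feature-"-prefixed module flags that EmitFeature reads back; the choice of prefix value here is illustrative:

    // Record the module's shared-memory policy; EmitTargetFeatures reads this
    // flag back and emits a "shared-mem" entry into the target features section.
    M.addModuleFlag(Module::Error, "wasm-feature-shared-mem",
                    wasm::WASM_FEATURE_PREFIX_USED);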
@@ -267,30 +273,31 @@ void WebAssemblyAsmPrinter::EmitTargetFeatures(Module &M) {
OutStreamer->PushSection();
OutStreamer->SwitchSection(FeaturesSection);
- OutStreamer->EmitULEB128IntValue(EmittedFeatures.size());
+ OutStreamer->emitULEB128IntValue(EmittedFeatures.size());
for (auto &F : EmittedFeatures) {
- OutStreamer->EmitIntValue(F.Prefix, 1);
- OutStreamer->EmitULEB128IntValue(F.Name.size());
- OutStreamer->EmitBytes(F.Name);
+ OutStreamer->emitIntValue(F.Prefix, 1);
+ OutStreamer->emitULEB128IntValue(F.Name.size());
+ OutStreamer->emitBytes(F.Name);
}
OutStreamer->PopSection();
}
-void WebAssemblyAsmPrinter::EmitConstantPool() {
+void WebAssemblyAsmPrinter::emitConstantPool() {
assert(MF->getConstantPool()->getConstants().empty() &&
"WebAssembly disables constant pools");
}
-void WebAssemblyAsmPrinter::EmitJumpTableInfo() {
+void WebAssemblyAsmPrinter::emitJumpTableInfo() {
// Nothing to do; jump tables are incorporated into the instruction stream.
}
-void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
+void WebAssemblyAsmPrinter::emitFunctionBodyStart() {
const Function &F = MF->getFunction();
SmallVector<MVT, 1> ResultVTs;
SmallVector<MVT, 4> ParamVTs;
- computeSignatureVTs(F.getFunctionType(), F, TM, ParamVTs, ResultVTs);
+ computeSignatureVTs(F.getFunctionType(), &F, F, TM, ParamVTs, ResultVTs);
+
auto Signature = signatureFromMVTs(ResultVTs, ParamVTs);
auto *WasmSym = cast<MCSymbolWasm>(CurrentFnSym);
WasmSym->setSignature(Signature.get());
@@ -312,10 +319,10 @@ void WebAssemblyAsmPrinter::EmitFunctionBodyStart() {
valTypesFromMVTs(MFI->getLocals(), Locals);
getTargetStreamer()->emitLocal(Locals);
- AsmPrinter::EmitFunctionBodyStart();
+ AsmPrinter::emitFunctionBodyStart();
}
-void WebAssemblyAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void WebAssemblyAsmPrinter::emitInstruction(const MachineInstr *MI) {
LLVM_DEBUG(dbgs() << "EmitInstruction: " << *MI << '\n');
switch (MI->getOpcode()) {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
index 4e55c81dec38..d9281568638d 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyAsmPrinter.h
@@ -16,9 +16,7 @@
#include "llvm/Target/TargetMachine.h"
namespace llvm {
-class MCSymbol;
class WebAssemblyTargetStreamer;
-class WebAssemblyMCInstLower;
class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
const WebAssemblySubtarget *Subtarget;
@@ -26,6 +24,13 @@ class LLVM_LIBRARY_VISIBILITY WebAssemblyAsmPrinter final : public AsmPrinter {
WebAssemblyFunctionInfo *MFI;
// TODO: Do the uniquing of Signatures here instead of ObjectFileWriter?
std::vector<std::unique_ptr<wasm::WasmSignature>> Signatures;
+ std::vector<std::unique_ptr<std::string>> Names;
+
+ StringRef storeName(StringRef Name) {
+ std::unique_ptr<std::string> N = std::make_unique<std::string>(Name);
+ Names.push_back(std::move(N));
+ return *Names.back();
+ }
public:
explicit WebAssemblyAsmPrinter(TargetMachine &TM,
@@ -57,13 +62,13 @@ public:
// AsmPrinter Implementation.
//===------------------------------------------------------------------===//
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
void EmitProducerInfo(Module &M);
void EmitTargetFeatures(Module &M);
- void EmitJumpTableInfo() override;
- void EmitConstantPool() override;
- void EmitFunctionBodyStart() override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitJumpTableInfo() override;
+ void emitConstantPool() override;
+ void emitFunctionBodyStart() override;
+ void emitInstruction(const MachineInstr *MI) override;
bool PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
const char *ExtraCode, raw_ostream &OS) override;
bool PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
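The new Names vector plus storeName pin copies of attribute strings for the lifetime of the printer, presumably because the MCSymbolWasm import/export setters retain only a StringRef. The corresponding call sites appear in the .cpp hunk above, e.g.:

    StringRef Name = F.getFnAttribute("wasm-import-name").getValueAsString();
    Sym->setImportName(storeName(Name)); // returns a StringRef into Names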
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
index c069af9eed62..8442b49e25f4 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGSort.cpp
@@ -79,7 +79,6 @@ template <> bool ConcreteRegion<MachineLoop>::isLoop() const { return true; }
class RegionInfo {
const MachineLoopInfo &MLI;
const WebAssemblyExceptionInfo &WEI;
- std::vector<const Region *> Regions;
DenseMap<const MachineLoop *, std::unique_ptr<Region>> LoopMap;
DenseMap<const WebAssemblyException *, std::unique_ptr<Region>> ExceptionMap;
@@ -93,7 +92,14 @@ public:
const auto *WE = WEI.getExceptionFor(MBB);
if (!ML && !WE)
return nullptr;
- if ((ML && !WE) || (ML && WE && ML->getNumBlocks() < WE->getNumBlocks())) {
+ // We determine the subregion relationship between two regions by domination
+ // of their headers, i.e., if region A's header dominates region B's header,
+ // B is a subregion of A. WebAssemblyException contains BBs in all its
+ // subregions (loops or exceptions), but MachineLoop may not, because
+ // MachineLoop does not contain BBs that don't have a path to its header even
+ // if they are dominated by its header. So here we should use
+ // WE->contains(ML->getHeader()) rather than ML->contains(WE->getHeader()).
+ if ((ML && !WE) || (ML && WE && WE->contains(ML->getHeader()))) {
// If the smallest region containing MBB is a loop
if (LoopMap.count(ML))
return LoopMap[ML].get();
@@ -152,9 +158,17 @@ static void maybeUpdateTerminator(MachineBasicBlock *MBB) {
AllAnalyzable &= Term.isBranch() && !Term.isIndirectBranch();
}
assert((AnyBarrier || AllAnalyzable) &&
- "AnalyzeBranch needs to analyze any block with a fallthrough");
+ "analyzeBranch needs to analyze any block with a fallthrough");
+
+ // Find the layout successor from the original block order.
+ MachineFunction *MF = MBB->getParent();
+ MachineBasicBlock *OriginalSuccessor =
+ unsigned(MBB->getNumber() + 1) < MF->getNumBlockIDs()
+ ? MF->getBlockNumbered(MBB->getNumber() + 1)
+ : nullptr;
+
if (AllAnalyzable)
- MBB->updateTerminator();
+ MBB->updateTerminator(OriginalSuccessor);
}
namespace {
@@ -241,9 +255,12 @@ struct Entry {
static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
const WebAssemblyExceptionInfo &WEI,
const MachineDominatorTree &MDT) {
+ // Remember original layout ordering, so we can update terminators after
+ // reordering to point to the original layout successor.
+ MF.RenumberBlocks();
+
// Prepare for a topological sort: Record the number of predecessors each
// block has, ignoring loop backedges.
- MF.RenumberBlocks();
SmallVector<unsigned, 16> NumPredsLeft(MF.getNumBlockIDs(), 0);
for (MachineBasicBlock &MBB : MF) {
unsigned N = MBB.pred_size();
@@ -368,6 +385,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
const Region *Region = RI.getRegionFor(&MBB);
if (Region && &MBB == Region->getHeader()) {
+ // Region header.
if (Region->isLoop()) {
// Loop header. The loop predecessor should be sorted above, and the
// other predecessors should be backedges below.
@@ -377,7 +395,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
"Loop header predecessors must be loop predecessors or "
"backedges");
} else {
- // Not a loop header. All predecessors should be sorted above.
+ // Exception header. All predecessors should be sorted above.
for (auto Pred : MBB.predecessors())
assert(Pred->getNumber() < MBB.getNumber() &&
"Non-loop-header predecessors should be topologically sorted");
@@ -386,7 +404,7 @@ static void sortBlocks(MachineFunction &MF, const MachineLoopInfo &MLI,
"Regions should be declared at most once.");
} else {
- // Not a loop header. All predecessors should be sorted above.
+ // Not a region header. All predecessors should be sorted above.
for (auto Pred : MBB.predecessors())
assert(Pred->getNumber() < MBB.getNumber() &&
"Non-loop-header predecessors should be topologically sorted");
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
index 7e867edaaa27..8cbfc98e8197 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyCFGStackify.cpp
@@ -31,6 +31,7 @@
#include "llvm/CodeGen/MachineInstrBuilder.h"
#include "llvm/CodeGen/MachineLoopInfo.h"
#include "llvm/MC/MCAsmInfo.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-cfg-stackify"
@@ -277,11 +278,19 @@ void WebAssemblyCFGStackify::placeBlockMarker(MachineBasicBlock &MBB) {
#endif
}
- // All previously inserted BLOCK/TRY markers should be after the BLOCK
- // because they are all nested blocks.
+ // If there is a previously placed BLOCK/TRY marker and its corresponding
+ // END marker is before the current BLOCK's END marker, the existing marker
+ // should be placed after this BLOCK. Otherwise it should be placed before
+ // this BLOCK marker.
if (MI.getOpcode() == WebAssembly::BLOCK ||
- MI.getOpcode() == WebAssembly::TRY)
- AfterSet.insert(&MI);
+ MI.getOpcode() == WebAssembly::TRY) {
+ if (BeginToEnd[&MI]->getParent()->getNumber() <= MBB.getNumber())
+ AfterSet.insert(&MI);
+#ifndef NDEBUG
+ else
+ BeforeSet.insert(&MI);
+#endif
+ }
#ifndef NDEBUG
// All END_(BLOCK|LOOP|TRY) markers should be before the BLOCK.
@@ -661,9 +670,28 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
MachineBasicBlock *EHPadLayoutPred = MBB.getPrevNode();
MachineBasicBlock *Cont = BeginToEnd[EHPadToTry[&MBB]]->getParent();
bool Analyzable = !TII.analyzeBranch(*EHPadLayoutPred, TBB, FBB, Cond);
+ // This condition means either
+ // 1. This BB ends with a single unconditional branch whose destination is
+ // Cont.
+ // 2. This BB ends with a conditional branch followed by an unconditional
+ // branch, and the unconditional branch's destination is Cont.
+ // In both cases, we want to remove the last (= unconditional) branch.
if (Analyzable && ((Cond.empty() && TBB && TBB == Cont) ||
- (!Cond.empty() && FBB && FBB == Cont)))
- TII.removeBranch(*EHPadLayoutPred);
+ (!Cond.empty() && FBB && FBB == Cont))) {
+ bool ErasedUncondBr = false;
+ (void)ErasedUncondBr;
+ for (auto I = EHPadLayoutPred->end(), E = EHPadLayoutPred->begin();
+ I != E; --I) {
+ auto PrevI = std::prev(I);
+ if (PrevI->isTerminator()) {
+ assert(PrevI->getOpcode() == WebAssembly::BR);
+ PrevI->eraseFromParent();
+ ErasedUncondBr = true;
+ break;
+ }
+ }
+ assert(ErasedUncondBr && "Unconditional branch not erased!");
+ }
}
// When there are block / end_block markers that overlap with try / end_try
@@ -705,12 +733,30 @@ void WebAssemblyCFGStackify::removeUnnecessaryInstrs(MachineFunction &MF) {
}
}
+// Get the appropriate copy opcode for the given register class.
+static unsigned getCopyOpcode(const TargetRegisterClass *RC) {
+ if (RC == &WebAssembly::I32RegClass)
+ return WebAssembly::COPY_I32;
+ if (RC == &WebAssembly::I64RegClass)
+ return WebAssembly::COPY_I64;
+ if (RC == &WebAssembly::F32RegClass)
+ return WebAssembly::COPY_F32;
+ if (RC == &WebAssembly::F64RegClass)
+ return WebAssembly::COPY_F64;
+ if (RC == &WebAssembly::V128RegClass)
+ return WebAssembly::COPY_V128;
+ if (RC == &WebAssembly::EXNREFRegClass)
+ return WebAssembly::COPY_EXNREF;
+ llvm_unreachable("Unexpected register class");
+}
+
// When MBB is split into MBB and Split, we should unstackify defs in MBB that
// have their uses in Split.
static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
MachineBasicBlock &Split,
WebAssemblyFunctionInfo &MFI,
- MachineRegisterInfo &MRI) {
+ MachineRegisterInfo &MRI,
+ const WebAssemblyInstrInfo &TII) {
for (auto &MI : Split) {
for (auto &MO : MI.explicit_uses()) {
if (!MO.isReg() || Register::isPhysicalRegister(MO.getReg()))
@@ -720,6 +766,47 @@ static void unstackifyVRegsUsedInSplitBB(MachineBasicBlock &MBB,
MFI.unstackifyVReg(MO.getReg());
}
}
+
+ // In RegStackify, when a register definition is used multiple times,
+ // Reg = INST ...
+ // INST ..., Reg, ...
+ // INST ..., Reg, ...
+ // INST ..., Reg, ...
+ //
+ // we introduce a TEE, which has the following form:
+ // DefReg = INST ...
+ // TeeReg, Reg = TEE_... DefReg
+ // INST ..., TeeReg, ...
+ // INST ..., Reg, ...
+ // INST ..., Reg, ...
+ // with DefReg and TeeReg stackified but Reg not stackified.
+ //
+ // But the invariant that TeeReg should be stackified can be violated while we
+ // unstackify registers in the split BB above. In this case, we convert TEEs
+ // into two COPYs, which will eventually be eliminated in ExplicitLocals.
+ // DefReg = INST ...
+ // TeeReg = COPY DefReg
+ // Reg = COPY DefReg
+ // INST ..., TeeReg, ...
+ // INST ..., Reg, ...
+ // INST ..., Reg, ...
+ for (auto I = MBB.begin(), E = MBB.end(); I != E;) {
+ MachineInstr &MI = *I++;
+ if (!WebAssembly::isTee(MI.getOpcode()))
+ continue;
+ Register TeeReg = MI.getOperand(0).getReg();
+ Register Reg = MI.getOperand(1).getReg();
+ Register DefReg = MI.getOperand(2).getReg();
+ if (!MFI.isVRegStackified(TeeReg)) {
+ // Now we are not using TEE anymore, so unstackify DefReg too
+ MFI.unstackifyVReg(DefReg);
+ unsigned CopyOpc = getCopyOpcode(MRI.getRegClass(DefReg));
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), TeeReg)
+ .addReg(DefReg);
+ BuildMI(MBB, &MI, MI.getDebugLoc(), TII.get(CopyOpc), Reg).addReg(DefReg);
+ MI.eraseFromParent();
+ }
+ }
}
bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
@@ -866,6 +953,10 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
// In new CFG, <destination to branch to, register containing exnref>
DenseMap<MachineBasicBlock *, unsigned> BrDestToExnReg;
+ // Destinations for branches that will be newly added, for which new
+ // BLOCK/END_BLOCK markers are necessary.
+ SmallVector<MachineBasicBlock *, 8> BrDests;
+
// Gather possibly throwing calls (i.e., previously invokes) whose current
// unwind destination is not the same as the original CFG.
for (auto &MBB : reverse(MF)) {
@@ -1036,7 +1127,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
BrDest->insert(BrDest->end(), EndTry->removeFromParent());
// Take out the handler body from EH pad to the new branch destination BB.
BrDest->splice(BrDest->end(), EHPad, SplitPos, EHPad->end());
- unstackifyVRegsUsedInSplitBB(*EHPad, *BrDest, MFI, MRI);
+ unstackifyVRegsUsedInSplitBB(*EHPad, *BrDest, MFI, MRI, TII);
// Fix predecessor-successor relationship.
BrDest->transferSuccessors(EHPad);
EHPad->addSuccessor(BrDest);
@@ -1075,6 +1166,7 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
? DebugLoc()
: EHPadLayoutPred->rbegin()->getDebugLoc();
BuildMI(EHPadLayoutPred, DL, TII.get(WebAssembly::BR)).addMBB(Cont);
+ BrDests.push_back(Cont);
}
}
@@ -1109,6 +1201,9 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
MachineInstr *RangeBegin = nullptr, *RangeEnd = nullptr;
std::tie(RangeBegin, RangeEnd) = Range;
auto *MBB = RangeBegin->getParent();
+ // Store the first function call from this range, because RangeBegin can
+ // be moved to point to an EH_LABEL before the call.
+ MachineInstr *RangeBeginCall = RangeBegin;
// Include possible EH_LABELs in the range
if (RangeBegin->getIterator() != MBB->begin() &&
@@ -1126,9 +1221,27 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
}
}
+ // The local expression tree before the first call of this range should go
+ // after the nested TRY.
+ SmallPtrSet<const MachineInstr *, 4> AfterSet;
+ AfterSet.insert(RangeBegin);
+ AfterSet.insert(RangeBeginCall);
+ for (auto I = MachineBasicBlock::iterator(RangeBeginCall),
+ E = MBB->begin();
+ I != E; --I) {
+ if (std::prev(I)->isDebugInstr() || std::prev(I)->isPosition())
+ continue;
+ if (WebAssembly::isChild(*std::prev(I), MFI))
+ AfterSet.insert(&*std::prev(I));
+ else
+ break;
+ }
+
// Create the nested try instruction.
+ auto InsertPos = getLatestInsertPos(
+ MBB, SmallPtrSet<const MachineInstr *, 4>(), AfterSet);
MachineInstr *NestedTry =
- BuildMI(*MBB, *RangeBegin, RangeBegin->getDebugLoc(),
+ BuildMI(*MBB, InsertPos, RangeBegin->getDebugLoc(),
TII.get(WebAssembly::TRY))
.addImm(int64_t(WebAssembly::BlockType::Void));
@@ -1152,13 +1265,21 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
// new nested continuation BB.
NestedCont->splice(NestedCont->end(), MBB,
std::next(RangeEnd->getIterator()), MBB->end());
- unstackifyVRegsUsedInSplitBB(*MBB, *NestedCont, MFI, MRI);
+ unstackifyVRegsUsedInSplitBB(*MBB, *NestedCont, MFI, MRI, TII);
registerTryScope(NestedTry, NestedEndTry, NestedEHPad);
// Fix predecessor-successor relationship.
NestedCont->transferSuccessors(MBB);
- if (EHPad)
+ if (EHPad) {
NestedCont->removeSuccessor(EHPad);
+ // If EHPad does not have any predecessors left after removing the
+ // NestedCont predecessor, remove its successor too, because this EHPad
+ // is not reachable from the entry BB anyway. We can't remove the EHPad BB
+ // itself because it can contain 'catch' or 'end', which are necessary for
+ // keeping the try-catch-end structure.
+ if (EHPad->pred_empty())
+ EHPad->removeSuccessor(BrDest);
+ }
MBB->addSuccessor(NestedEHPad);
MBB->addSuccessor(NestedCont);
NestedEHPad->addSuccessor(BrDest);
@@ -1190,10 +1311,14 @@ bool WebAssemblyCFGStackify::fixUnwindMismatches(MachineFunction &MF) {
// Recompute the dominator tree.
getAnalysis<MachineDominatorTree>().runOnMachineFunction(MF);
- // Place block markers for newly added branches.
- SmallVector <MachineBasicBlock *, 8> BrDests;
- for (auto &P : BrDestToTryRanges)
- BrDests.push_back(P.first);
+ // Place block markers for newly added branches, if necessary.
+
+ // If we've created an appendix BB and a branch to it, place a block/end_block
+ // marker for that. For some new branches, those branch destination BBs start
+ // with a hoisted end_try marker, so we don't need a new marker there.
+ if (AppendixBB)
+ BrDests.push_back(AppendixBB);
+
llvm::sort(BrDests,
[&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
auto ANum = A->getNumber();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
deleted file mode 100644
index 2537e6042b1e..000000000000
--- a/llvm/lib/Target/WebAssembly/WebAssemblyCallIndirectFixup.cpp
+++ /dev/null
@@ -1,150 +0,0 @@
-//===-- WebAssemblyCallIndirectFixup.cpp - Fix call_indirects -------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-/// \file
-/// This file converts pseudo call_indirect instructions into real
-/// call_indirects.
-///
-/// The order of arguments for a call_indirect is the arguments to the function
-/// call, followed by the function pointer. There's no natural way to express
-/// a machineinstr with varargs followed by one more arg, so we express it as
-/// the function pointer followed by varargs, then rewrite it here.
-///
-/// We need to rewrite the order of the arguments on the machineinstrs
-/// themselves so that register stackification knows the order they'll be
-/// executed in.
-///
-//===----------------------------------------------------------------------===//
-
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h" // for WebAssembly::ARGUMENT_*
-#include "WebAssembly.h"
-#include "WebAssemblyMachineFunctionInfo.h"
-#include "WebAssemblySubtarget.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/CodeGen/LiveIntervals.h"
-#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
-#include "llvm/CodeGen/MachineDominators.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-using namespace llvm;
-
-#define DEBUG_TYPE "wasm-call-indirect-fixup"
-
-namespace {
-class WebAssemblyCallIndirectFixup final : public MachineFunctionPass {
- StringRef getPassName() const override {
- return "WebAssembly CallIndirect Fixup";
- }
-
- bool runOnMachineFunction(MachineFunction &MF) override;
-
-public:
- static char ID; // Pass identification, replacement for typeid
- WebAssemblyCallIndirectFixup() : MachineFunctionPass(ID) {}
-};
-} // end anonymous namespace
-
-char WebAssemblyCallIndirectFixup::ID = 0;
-INITIALIZE_PASS(WebAssemblyCallIndirectFixup, DEBUG_TYPE,
- "Rewrite call_indirect argument orderings", false, false)
-
-FunctionPass *llvm::createWebAssemblyCallIndirectFixup() {
- return new WebAssemblyCallIndirectFixup();
-}
-
-static unsigned getNonPseudoCallIndirectOpcode(const MachineInstr &MI) {
- switch (MI.getOpcode()) {
- using namespace WebAssembly;
- case PCALL_INDIRECT_VOID:
- return CALL_INDIRECT_VOID;
- case PCALL_INDIRECT_i32:
- return CALL_INDIRECT_i32;
- case PCALL_INDIRECT_i64:
- return CALL_INDIRECT_i64;
- case PCALL_INDIRECT_f32:
- return CALL_INDIRECT_f32;
- case PCALL_INDIRECT_f64:
- return CALL_INDIRECT_f64;
- case PCALL_INDIRECT_v16i8:
- return CALL_INDIRECT_v16i8;
- case PCALL_INDIRECT_v8i16:
- return CALL_INDIRECT_v8i16;
- case PCALL_INDIRECT_v4i32:
- return CALL_INDIRECT_v4i32;
- case PCALL_INDIRECT_v2i64:
- return CALL_INDIRECT_v2i64;
- case PCALL_INDIRECT_v4f32:
- return CALL_INDIRECT_v4f32;
- case PCALL_INDIRECT_v2f64:
- return CALL_INDIRECT_v2f64;
- case PCALL_INDIRECT_exnref:
- return CALL_INDIRECT_exnref;
- case PRET_CALL_INDIRECT:
- return RET_CALL_INDIRECT;
- default:
- return INSTRUCTION_LIST_END;
- }
-}
-
-static bool isPseudoCallIndirect(const MachineInstr &MI) {
- return getNonPseudoCallIndirectOpcode(MI) !=
- WebAssembly::INSTRUCTION_LIST_END;
-}
-
-bool WebAssemblyCallIndirectFixup::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << "********** Fixing up CALL_INDIRECTs **********\n"
- << "********** Function: " << MF.getName() << '\n');
-
- bool Changed = false;
- const WebAssemblyInstrInfo *TII =
- MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
-
- for (MachineBasicBlock &MBB : MF) {
- for (MachineInstr &MI : MBB) {
- if (isPseudoCallIndirect(MI)) {
- LLVM_DEBUG(dbgs() << "Found call_indirect: " << MI << '\n');
-
- // Rewrite pseudo to non-pseudo
- const MCInstrDesc &Desc = TII->get(getNonPseudoCallIndirectOpcode(MI));
- MI.setDesc(Desc);
-
- // Rewrite argument order
- SmallVector<MachineOperand, 8> Ops;
-
- // Set up a placeholder for the type signature immediate.
- Ops.push_back(MachineOperand::CreateImm(0));
-
- // Set up the flags immediate, which currently has no defined flags
- // so it's always zero.
- Ops.push_back(MachineOperand::CreateImm(0));
-
- for (const MachineOperand &MO :
- make_range(MI.operands_begin() + MI.getDesc().getNumDefs() + 1,
- MI.operands_begin() + MI.getNumExplicitOperands()))
- Ops.push_back(MO);
- Ops.push_back(MI.getOperand(MI.getDesc().getNumDefs()));
-
- // Replace the instructions operands.
- while (MI.getNumOperands() > MI.getDesc().getNumDefs())
- MI.RemoveOperand(MI.getNumOperands() - 1);
- for (const MachineOperand &MO : Ops)
- MI.addOperand(MO);
-
- LLVM_DEBUG(dbgs() << " After transform: " << MI);
- Changed = true;
- }
- }
- }
-
- LLVM_DEBUG(dbgs() << "\nDone fixing up CALL_INDIRECTs\n\n");
-
- return Changed;
-}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
new file mode 100644
index 000000000000..655e30a29eff
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyDebugFixup.cpp
@@ -0,0 +1,138 @@
+//===-- WebAssemblyDebugFixup.cpp - Debug Fixup ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file
+/// Several prior passes may "stackify" registers, here we ensure any references
+/// in such registers in debug_value instructions become stack relative also.
+/// This is done in a separate pass such that not all previous passes need to
+/// track stack depth when values get stackified.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
+#include "WebAssemblySubtarget.h"
+#include "WebAssemblyUtilities.h"
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-debug-fixup"
+
+namespace {
+class WebAssemblyDebugFixup final : public MachineFunctionPass {
+ StringRef getPassName() const override { return "WebAssembly Debug Fixup"; }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyDebugFixup() : MachineFunctionPass(ID) {}
+};
+} // end anonymous namespace
+
+char WebAssemblyDebugFixup::ID = 0;
+INITIALIZE_PASS(
+ WebAssemblyDebugFixup, DEBUG_TYPE,
+ "Ensures debug_value's that have been stackified become stack relative",
+ false, false)
+
+FunctionPass *llvm::createWebAssemblyDebugFixup() {
+ return new WebAssemblyDebugFixup();
+}
+
+bool WebAssemblyDebugFixup::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** Debug Fixup **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+
+ struct StackElem {
+ unsigned Reg;
+ MachineInstr *DebugValue;
+ };
+ std::vector<StackElem> Stack;
+ for (MachineBasicBlock &MBB : MF) {
+ // We may insert into this list.
+ for (auto MII = MBB.begin(); MII != MBB.end(); ++MII) {
+ MachineInstr &MI = *MII;
+ if (MI.isDebugValue()) {
+ auto &MO = MI.getOperand(0);
+ // Also check if not a $noreg: likely a DBG_VALUE we just inserted.
+ if (MO.isReg() && MO.getReg().isValid() &&
+ MFI.isVRegStackified(MO.getReg())) {
+ // Found a DBG_VALUE with a stackified register that we will
+ // change into a stack operand.
+ // Search for the register rather than assuming it is on top (which it
+ // typically is if it appears right after the def), since
+ // DBG_VALUEs may shift under some circumstances.
+ for (auto &Elem : reverse(Stack)) {
+ if (MO.getReg() == Elem.Reg) {
+ auto Depth = static_cast<unsigned>(&Elem - &Stack[0]);
+ LLVM_DEBUG(dbgs() << "Debug Value VReg " << MO.getReg()
+ << " -> Stack Relative " << Depth << "\n");
+ MO.ChangeToTargetIndex(WebAssembly::TI_OPERAND_STACK, Depth);
+ // Save the DBG_VALUE instruction that defined this stackified
+ // variable since later we need it to construct another one on
+ // pop.
+ Elem.DebugValue = &MI;
+ break;
+ }
+ }
+ // If the Reg was not found, we have a DBG_VALUE outside of its
+ // def-use range, and we leave it unmodified as reg, which means
+ // it will be culled later.
+ }
+ } else {
+ // Track stack depth.
+ for (MachineOperand &MO : reverse(MI.explicit_uses())) {
+ if (MO.isReg() && MFI.isVRegStackified(MO.getReg())) {
+ auto Prev = Stack.back();
+ Stack.pop_back();
+ assert(Prev.Reg == MO.getReg() &&
+ "WebAssemblyDebugFixup: Pop: Register not matched!");
+ if (Prev.DebugValue) {
+ // This stackified reg is a variable that started life at
+ // Prev.DebugValue, so now that we're popping it we must insert
+ // a $noreg DBG_VALUE for the variable to end it, right after
+ // the current instruction.
+ BuildMI(*Prev.DebugValue->getParent(), std::next(MII),
+ Prev.DebugValue->getDebugLoc(), TII->get(WebAssembly::DBG_VALUE), false,
+ Register(), Prev.DebugValue->getOperand(2).getMetadata(),
+ Prev.DebugValue->getOperand(3).getMetadata());
+ }
+ }
+ }
+ for (MachineOperand &MO : MI.defs()) {
+ if (MO.isReg() && MFI.isVRegStackified(MO.getReg())) {
+ Stack.push_back({MO.getReg(), nullptr});
+ }
+ }
+ }
+ }
+ assert(Stack.empty() &&
+ "WebAssemblyDebugFixup: Stack not empty at end of basic block!");
+ }
+
+ return true;
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
index 114a50a3055d..159fb4c00ddc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyDebugValueManager.cpp
@@ -31,7 +31,7 @@ void WebAssemblyDebugValueManager::move(MachineInstr *Insert) {
void WebAssemblyDebugValueManager::updateReg(unsigned Reg) {
for (auto *DBI : DbgValues)
- DBI->getOperand(0).setReg(Reg);
+ DBI->getDebugOperand(0).setReg(Reg);
}
void WebAssemblyDebugValueManager::clone(MachineInstr *Insert,
@@ -40,14 +40,14 @@ void WebAssemblyDebugValueManager::clone(MachineInstr *Insert,
MachineFunction *MF = MBB->getParent();
for (MachineInstr *DBI : reverse(DbgValues)) {
MachineInstr *Clone = MF->CloneMachineInstr(DBI);
- Clone->getOperand(0).setReg(NewReg);
+ Clone->getDebugOperand(0).setReg(NewReg);
MBB->insert(Insert, Clone);
}
}
void WebAssemblyDebugValueManager::replaceWithLocal(unsigned LocalId) {
for (auto *DBI : DbgValues) {
- MachineOperand &Op = DBI->getOperand(0);
- Op.ChangeToTargetIndex(llvm::WebAssembly::TI_LOCAL_START, LocalId);
+ MachineOperand &Op = DBI->getDebugOperand(0);
+ Op.ChangeToTargetIndex(llvm::WebAssembly::TI_LOCAL, LocalId);
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
index a511b320b56b..c75de7aa207f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.cpp
@@ -46,14 +46,14 @@ bool WebAssemblyExceptionInfo::runOnMachineFunction(MachineFunction &MF) {
void WebAssemblyExceptionInfo::recalculate(
MachineDominatorTree &MDT, const MachineDominanceFrontier &MDF) {
// Postorder traversal of the dominator tree.
- SmallVector<WebAssemblyException *, 8> Exceptions;
+ SmallVector<std::unique_ptr<WebAssemblyException>, 8> Exceptions;
for (auto DomNode : post_order(&MDT)) {
MachineBasicBlock *EHPad = DomNode->getBlock();
if (!EHPad->isEHPad())
continue;
- auto *WE = new WebAssemblyException(EHPad);
- discoverAndMapException(WE, MDT, MDF);
- Exceptions.push_back(WE);
+ auto WE = std::make_unique<WebAssemblyException>(EHPad);
+ discoverAndMapException(WE.get(), MDT, MDF);
+ Exceptions.push_back(std::move(WE));
}
// Add BBs to exceptions
@@ -64,17 +64,21 @@ void WebAssemblyExceptionInfo::recalculate(
WE->addBlock(MBB);
}
+ SmallVector<WebAssemblyException*, 8> ExceptionPointers;
+ ExceptionPointers.reserve(Exceptions.size());
+
// Add subexceptions to exceptions
- for (auto *WE : Exceptions) {
+ for (auto &WE : Exceptions) {
+ ExceptionPointers.push_back(WE.get());
if (WE->getParentException())
- WE->getParentException()->getSubExceptions().push_back(WE);
+ WE->getParentException()->getSubExceptions().push_back(std::move(WE));
else
- addTopLevelException(WE);
+ addTopLevelException(std::move(WE));
}
// For convenience, Blocks and SubExceptions are inserted in postorder.
// Reverse the lists.
- for (auto *WE : Exceptions) {
+ for (auto *WE : ExceptionPointers) {
WE->reverseBlock();
std::reverse(WE->getSubExceptions().begin(), WE->getSubExceptions().end());
}
@@ -82,7 +86,6 @@ void WebAssemblyExceptionInfo::recalculate(
void WebAssemblyExceptionInfo::releaseMemory() {
BBMap.clear();
- DeleteContainerPointers(TopLevelExceptions);
TopLevelExceptions.clear();
}
@@ -181,6 +184,6 @@ raw_ostream &operator<<(raw_ostream &OS, const WebAssemblyException &WE) {
}
void WebAssemblyExceptionInfo::print(raw_ostream &OS, const Module *) const {
- for (auto *WE : TopLevelExceptions)
+ for (auto &WE : TopLevelExceptions)
WE->print(OS);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
index 9a90d7df7d47..50151ec8da5a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExceptionInfo.h
@@ -43,13 +43,12 @@ class WebAssemblyException {
MachineBasicBlock *EHPad = nullptr;
WebAssemblyException *ParentException = nullptr;
- std::vector<WebAssemblyException *> SubExceptions;
+ std::vector<std::unique_ptr<WebAssemblyException>> SubExceptions;
std::vector<MachineBasicBlock *> Blocks;
SmallPtrSet<const MachineBasicBlock *, 8> BlockSet;
public:
WebAssemblyException(MachineBasicBlock *EHPad) : EHPad(EHPad) {}
- ~WebAssemblyException() { DeleteContainerPointers(SubExceptions); }
WebAssemblyException(const WebAssemblyException &) = delete;
const WebAssemblyException &operator=(const WebAssemblyException &) = delete;
@@ -83,14 +82,16 @@ public:
unsigned getNumBlocks() const { return Blocks.size(); }
std::vector<MachineBasicBlock *> &getBlocksVector() { return Blocks; }
- const std::vector<WebAssemblyException *> &getSubExceptions() const {
+ const std::vector<std::unique_ptr<WebAssemblyException>> &getSubExceptions() const {
return SubExceptions;
}
- std::vector<WebAssemblyException *> &getSubExceptions() {
+ std::vector<std::unique_ptr<WebAssemblyException>> &getSubExceptions() {
return SubExceptions;
}
- void addSubException(WebAssemblyException *E) { SubExceptions.push_back(E); }
- using iterator = typename std::vector<WebAssemblyException *>::const_iterator;
+ void addSubException(std::unique_ptr<WebAssemblyException> E) {
+ SubExceptions.push_back(std::move(E));
+ }
+ using iterator = typename decltype(SubExceptions)::const_iterator;
iterator begin() const { return SubExceptions.begin(); }
iterator end() const { return SubExceptions.end(); }
@@ -117,7 +118,7 @@ raw_ostream &operator<<(raw_ostream &OS, const WebAssemblyException &WE);
class WebAssemblyExceptionInfo final : public MachineFunctionPass {
// Mapping of basic blocks to the innermost exception they occur in
DenseMap<const MachineBasicBlock *, WebAssemblyException *> BBMap;
- std::vector<WebAssemblyException *> TopLevelExceptions;
+ std::vector<std::unique_ptr<WebAssemblyException>> TopLevelExceptions;
void discoverAndMapException(WebAssemblyException *WE,
const MachineDominatorTree &MDT,
@@ -156,9 +157,9 @@ public:
BBMap[MBB] = WE;
}
- void addTopLevelException(WebAssemblyException *WE) {
+ void addTopLevelException(std::unique_ptr<WebAssemblyException> WE) {
assert(!WE->getParentException() && "Not a top level exception!");
- TopLevelExceptions.push_back(WE);
+ TopLevelExceptions.push_back(std::move(WE));
}
void print(raw_ostream &OS, const Module *M = nullptr) const override;
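A sketch of the resulting ownership flow (mirrors recalculate() in the .cpp hunk above; shown only as an illustration):

    auto WE = std::make_unique<WebAssemblyException>(EHPad);
    discoverAndMapException(WE.get(), MDT, MDF);
    if (WE->getParentException())
      WE->getParentException()->addSubException(std::move(WE));
    else
      addTopLevelException(std::move(WE));
    // No manual deletion is needed anymore; the former DeleteContainerPointers
    // calls and the ~WebAssemblyException destructor are gone.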
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
index acbd4c9921b0..55925bcbe771 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyExplicitLocals.cpp
@@ -31,16 +31,6 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-explicit-locals"
-// A command-line option to disable this pass, and keep implicit locals
-// for the purpose of testing with lit/llc ONLY.
-// This produces output which is not valid WebAssembly, and is not supported
-// by assemblers/disassemblers and other MC based tools.
-static cl::opt<bool> WasmDisableExplicitLocals(
- "wasm-disable-explicit-locals", cl::Hidden,
- cl::desc("WebAssembly: output implicit locals in"
- " instruction output for test purposes only."),
- cl::init(false));
-
namespace {
class WebAssemblyExplicitLocals final : public MachineFunctionPass {
StringRef getPassName() const override {
@@ -69,13 +59,28 @@ FunctionPass *llvm::createWebAssemblyExplicitLocals() {
return new WebAssemblyExplicitLocals();
}
+static void checkFrameBase(WebAssemblyFunctionInfo &MFI, unsigned Local,
+ unsigned Reg) {
+ // Mark a local for the frame base vreg.
+ if (MFI.isFrameBaseVirtual() && Reg == MFI.getFrameBaseVreg()) {
+ LLVM_DEBUG({
+ dbgs() << "Allocating local " << Local << "for VReg "
+ << Register::virtReg2Index(Reg) << '\n';
+ });
+ MFI.setFrameBaseLocal(Local);
+ }
+}
+
/// Return a local id number for the given register, assigning it a new one
/// if it doesn't yet have one.
static unsigned getLocalId(DenseMap<unsigned, unsigned> &Reg2Local,
- unsigned &CurLocal, unsigned Reg) {
+ WebAssemblyFunctionInfo &MFI, unsigned &CurLocal,
+ unsigned Reg) {
auto P = Reg2Local.insert(std::make_pair(Reg, CurLocal));
- if (P.second)
+ if (P.second) {
+ checkFrameBase(MFI, CurLocal, Reg);
++CurLocal;
+ }
return P.first->second;
}
@@ -168,11 +173,18 @@ static MVT typeForRegClass(const TargetRegisterClass *RC) {
/// start of the expression tree.
static MachineInstr *findStartOfTree(MachineOperand &MO,
MachineRegisterInfo &MRI,
- WebAssemblyFunctionInfo &MFI) {
+ const WebAssemblyFunctionInfo &MFI) {
Register Reg = MO.getReg();
assert(MFI.isVRegStackified(Reg));
MachineInstr *Def = MRI.getVRegDef(Reg);
+ // If this instruction has any non-stackified defs, it is the start of the tree.
+ for (auto DefReg : Def->defs()) {
+ if (!MFI.isVRegStackified(DefReg.getReg())) {
+ return Def;
+ }
+ }
+
// Find the first stackified use and proceed from there.
for (MachineOperand &DefMO : Def->explicit_uses()) {
if (!DefMO.isReg())
@@ -189,10 +201,6 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
"********** Function: "
<< MF.getName() << '\n');
- // Disable this pass if directed to do so.
- if (WasmDisableExplicitLocals)
- return false;
-
bool Changed = false;
MachineRegisterInfo &MRI = MF.getRegInfo();
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
@@ -210,7 +218,9 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
break;
Register Reg = MI.getOperand(0).getReg();
assert(!MFI.isVRegStackified(Reg));
- Reg2Local[Reg] = static_cast<unsigned>(MI.getOperand(1).getImm());
+ auto Local = static_cast<unsigned>(MI.getOperand(1).getImm());
+ Reg2Local[Reg] = Local;
+ checkFrameBase(MFI, Local, Reg);
MI.eraseFromParent();
Changed = true;
}
@@ -233,6 +243,12 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
if (MI.isDebugInstr() || MI.isLabel())
continue;
+ if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
+ MI.eraseFromParent();
+ Changed = true;
+ continue;
+ }
+
// Replace tee instructions with local.tee. The difference is that tee
// instructions have two defs, while local.tee instructions have one def
// and an index of a local to write to.
@@ -244,18 +260,18 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Stackify the input if it isn't stackified yet.
if (!MFI.isVRegStackified(OldReg)) {
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned LocalId = getLocalId(Reg2Local, MFI, CurLocal, OldReg);
Register NewReg = MRI.createVirtualRegister(RC);
unsigned Opc = getLocalGetOpcode(RC);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc), NewReg)
.addImm(LocalId);
MI.getOperand(2).setReg(NewReg);
- MFI.stackifyVReg(NewReg);
+ MFI.stackifyVReg(MRI, NewReg);
}
// Replace the TEE with a LOCAL_TEE.
unsigned LocalId =
- getLocalId(Reg2Local, CurLocal, MI.getOperand(1).getReg());
+ getLocalId(Reg2Local, MFI, CurLocal, MI.getOperand(1).getReg());
unsigned Opc = getLocalTeeOpcode(RC);
BuildMI(MBB, &MI, MI.getDebugLoc(), TII->get(Opc),
MI.getOperand(0).getReg())
@@ -269,20 +285,13 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
continue;
}
- // Insert local.sets for any defs that aren't stackified yet. Currently
- // we handle at most one def.
- assert(MI.getDesc().getNumDefs() <= 1);
- if (MI.getDesc().getNumDefs() == 1) {
- Register OldReg = MI.getOperand(0).getReg();
+ // Insert local.sets for any defs that aren't stackified yet.
+ for (auto &Def : MI.defs()) {
+ Register OldReg = Def.getReg();
if (!MFI.isVRegStackified(OldReg)) {
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
Register NewReg = MRI.createVirtualRegister(RC);
auto InsertPt = std::next(MI.getIterator());
- if (MI.getOpcode() == WebAssembly::IMPLICIT_DEF) {
- MI.eraseFromParent();
- Changed = true;
- continue;
- }
if (UseEmpty[Register::virtReg2Index(OldReg)]) {
unsigned Opc = getDropOpcode(RC);
MachineInstr *Drop =
@@ -290,8 +299,10 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
.addReg(NewReg);
// After the drop instruction, this reg operand will not be used
Drop->getOperand(0).setIsKill();
+ if (MFI.isFrameBaseVirtual() && OldReg == MFI.getFrameBaseVreg())
+ MFI.clearFrameBaseVreg();
} else {
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned LocalId = getLocalId(Reg2Local, MFI, CurLocal, OldReg);
unsigned Opc = getLocalSetOpcode(RC);
WebAssemblyDebugValueManager(&MI).replaceWithLocal(LocalId);
@@ -300,12 +311,12 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
.addImm(LocalId)
.addReg(NewReg);
}
- MI.getOperand(0).setReg(NewReg);
// This register operand of the original instruction is now being used
// by the inserted drop or local.set instruction, so make it not dead
// yet.
- MI.getOperand(0).setIsDead(false);
- MFI.stackifyVReg(NewReg);
+ Def.setReg(NewReg);
+ Def.setIsDead(false);
+ MFI.stackifyVReg(MRI, NewReg);
Changed = true;
}
}
@@ -323,7 +334,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// immediates.
if (MO.isDef()) {
assert(MI.isInlineAsm());
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned LocalId = getLocalId(Reg2Local, MFI, CurLocal, OldReg);
// If this register operand is tied to another operand, we can't
// change it to an immediate. Untie it first.
MI.untieRegOperand(MI.getOperandNo(&MO));
@@ -341,7 +352,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
// Our contract with inline asm register operands is to provide local
// indices as immediates.
if (MI.isInlineAsm()) {
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned LocalId = getLocalId(Reg2Local, MFI, CurLocal, OldReg);
// Untie it first if this reg operand is tied to another operand.
MI.untieRegOperand(MI.getOperandNo(&MO));
MO.ChangeToImmediate(LocalId);
@@ -349,7 +360,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
}
// Insert a local.get.
- unsigned LocalId = getLocalId(Reg2Local, CurLocal, OldReg);
+ unsigned LocalId = getLocalId(Reg2Local, MFI, CurLocal, OldReg);
const TargetRegisterClass *RC = MRI.getRegClass(OldReg);
Register NewReg = MRI.createVirtualRegister(RC);
unsigned Opc = getLocalGetOpcode(RC);
@@ -357,7 +368,7 @@ bool WebAssemblyExplicitLocals::runOnMachineFunction(MachineFunction &MF) {
BuildMI(MBB, InsertPt, MI.getDebugLoc(), TII->get(Opc), NewReg)
.addImm(LocalId);
MO.setReg(NewReg);
- MFI.stackifyVReg(NewReg);
+ MFI.stackifyVReg(MRI, NewReg);
Changed = true;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
index c932f985489a..8a0092a3f298 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFastISel.cpp
@@ -640,6 +640,9 @@ bool WebAssemblyFastISel::fastLowerArguments() {
if (F->isVarArg())
return false;
+ if (FuncInfo.Fn->getCallingConv() == CallingConv::Swift)
+ return false;
+
unsigned I = 0;
for (auto const &Arg : F->args()) {
const AttributeList &Attrs = F->getAttributes();
@@ -754,17 +757,18 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (Func && Func->isIntrinsic())
return false;
+ if (Call->getCallingConv() == CallingConv::Swift)
+ return false;
+
bool IsDirect = Func != nullptr;
- if (!IsDirect && isa<ConstantExpr>(Call->getCalledValue()))
+ if (!IsDirect && isa<ConstantExpr>(Call->getCalledOperand()))
return false;
FunctionType *FuncTy = Call->getFunctionType();
- unsigned Opc;
+ unsigned Opc = IsDirect ? WebAssembly::CALL : WebAssembly::CALL_INDIRECT;
bool IsVoid = FuncTy->getReturnType()->isVoidTy();
unsigned ResultReg;
- if (IsVoid) {
- Opc = IsDirect ? WebAssembly::CALL_VOID : WebAssembly::PCALL_INDIRECT_VOID;
- } else {
+ if (!IsVoid) {
if (!Subtarget->hasSIMD128() && Call->getType()->isVectorTy())
return false;
@@ -774,54 +778,36 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
case MVT::i8:
case MVT::i16:
case MVT::i32:
- Opc = IsDirect ? WebAssembly::CALL_i32 : WebAssembly::PCALL_INDIRECT_i32;
ResultReg = createResultReg(&WebAssembly::I32RegClass);
break;
case MVT::i64:
- Opc = IsDirect ? WebAssembly::CALL_i64 : WebAssembly::PCALL_INDIRECT_i64;
ResultReg = createResultReg(&WebAssembly::I64RegClass);
break;
case MVT::f32:
- Opc = IsDirect ? WebAssembly::CALL_f32 : WebAssembly::PCALL_INDIRECT_f32;
ResultReg = createResultReg(&WebAssembly::F32RegClass);
break;
case MVT::f64:
- Opc = IsDirect ? WebAssembly::CALL_f64 : WebAssembly::PCALL_INDIRECT_f64;
ResultReg = createResultReg(&WebAssembly::F64RegClass);
break;
case MVT::v16i8:
- Opc = IsDirect ? WebAssembly::CALL_v16i8
- : WebAssembly::PCALL_INDIRECT_v16i8;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
case MVT::v8i16:
- Opc = IsDirect ? WebAssembly::CALL_v8i16
- : WebAssembly::PCALL_INDIRECT_v8i16;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
case MVT::v4i32:
- Opc = IsDirect ? WebAssembly::CALL_v4i32
- : WebAssembly::PCALL_INDIRECT_v4i32;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
case MVT::v2i64:
- Opc = IsDirect ? WebAssembly::CALL_v2i64
- : WebAssembly::PCALL_INDIRECT_v2i64;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
case MVT::v4f32:
- Opc = IsDirect ? WebAssembly::CALL_v4f32
- : WebAssembly::PCALL_INDIRECT_v4f32;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
case MVT::v2f64:
- Opc = IsDirect ? WebAssembly::CALL_v2f64
- : WebAssembly::PCALL_INDIRECT_v2f64;
ResultReg = createResultReg(&WebAssembly::V128RegClass);
break;
case MVT::exnref:
- Opc = IsDirect ? WebAssembly::CALL_exnref
- : WebAssembly::PCALL_INDIRECT_exnref;
ResultReg = createResultReg(&WebAssembly::EXNREFRegClass);
break;
default:
@@ -861,7 +847,7 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
unsigned CalleeReg = 0;
if (!IsDirect) {
- CalleeReg = getRegForValue(Call->getCalledValue());
+ CalleeReg = getRegForValue(Call->getCalledOperand());
if (!CalleeReg)
return false;
}
@@ -871,14 +857,20 @@ bool WebAssemblyFastISel::selectCall(const Instruction *I) {
if (!IsVoid)
MIB.addReg(ResultReg, RegState::Define);
- if (IsDirect)
+ if (IsDirect) {
MIB.addGlobalAddress(Func);
- else
- MIB.addReg(CalleeReg);
+ } else {
+ // Add placeholders for the type index and immediate flags
+ MIB.addImm(0);
+ MIB.addImm(0);
+ }
for (unsigned ArgReg : Args)
MIB.addReg(ArgReg);
+ if (!IsDirect)
+ MIB.addReg(CalleeReg);
+
if (!IsVoid)
updateValueMap(Call, ResultReg);
return true;
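For indirect calls, the operand order assembled above is: the result register (if any), two immediate placeholders for the type index and flags, the argument registers, and the callee register last. A rough standalone sketch of that ordering using strings as stand-in operands (illustrative only, not the MachineInstrBuilder API):

#include <string>
#include <vector>

// Build the operand list for an indirect call: result defs first, then the
// placeholder immediates, then the arguments, with the callee at the end.
std::vector<std::string>
buildIndirectCallOperands(const std::vector<std::string> &Results,
                          const std::vector<std::string> &Args,
                          const std::string &CalleeReg) {
  std::vector<std::string> Ops(Results.begin(), Results.end());
  Ops.push_back("type_index=0"); // placeholder, fixed up later
  Ops.push_back("flags=0");      // placeholder, fixed up later
  Ops.insert(Ops.end(), Args.begin(), Args.end());
  Ops.push_back(CalleeReg);      // callee comes after the arguments
  return Ops;
}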
@@ -1168,30 +1160,31 @@ bool WebAssemblyFastISel::selectLoad(const Instruction *I) {
unsigned Opc;
const TargetRegisterClass *RC;
+ bool A64 = Subtarget->hasAddr64();
switch (getSimpleType(Load->getType())) {
case MVT::i1:
case MVT::i8:
- Opc = WebAssembly::LOAD8_U_I32;
+ Opc = A64 ? WebAssembly::LOAD8_U_I32_A64 : WebAssembly::LOAD8_U_I32_A32;
RC = &WebAssembly::I32RegClass;
break;
case MVT::i16:
- Opc = WebAssembly::LOAD16_U_I32;
+ Opc = A64 ? WebAssembly::LOAD16_U_I32_A64 : WebAssembly::LOAD16_U_I32_A32;
RC = &WebAssembly::I32RegClass;
break;
case MVT::i32:
- Opc = WebAssembly::LOAD_I32;
+ Opc = A64 ? WebAssembly::LOAD_I32_A64 : WebAssembly::LOAD_I32_A32;
RC = &WebAssembly::I32RegClass;
break;
case MVT::i64:
- Opc = WebAssembly::LOAD_I64;
+ Opc = A64 ? WebAssembly::LOAD_I64_A64 : WebAssembly::LOAD_I64_A32;
RC = &WebAssembly::I64RegClass;
break;
case MVT::f32:
- Opc = WebAssembly::LOAD_F32;
+ Opc = A64 ? WebAssembly::LOAD_F32_A64 : WebAssembly::LOAD_F32_A32;
RC = &WebAssembly::F32RegClass;
break;
case MVT::f64:
- Opc = WebAssembly::LOAD_F64;
+ Opc = A64 ? WebAssembly::LOAD_F64_A64 : WebAssembly::LOAD_F64_A32;
RC = &WebAssembly::F64RegClass;
break;
default:
@@ -1224,27 +1217,28 @@ bool WebAssemblyFastISel::selectStore(const Instruction *I) {
unsigned Opc;
bool VTIsi1 = false;
+ bool A64 = Subtarget->hasAddr64();
switch (getSimpleType(Store->getValueOperand()->getType())) {
case MVT::i1:
VTIsi1 = true;
LLVM_FALLTHROUGH;
case MVT::i8:
- Opc = WebAssembly::STORE8_I32;
+ Opc = A64 ? WebAssembly::STORE8_I32_A64 : WebAssembly::STORE8_I32_A32;
break;
case MVT::i16:
- Opc = WebAssembly::STORE16_I32;
+ Opc = A64 ? WebAssembly::STORE16_I32_A64 : WebAssembly::STORE16_I32_A32;
break;
case MVT::i32:
- Opc = WebAssembly::STORE_I32;
+ Opc = A64 ? WebAssembly::STORE_I32_A64 : WebAssembly::STORE_I32_A32;
break;
case MVT::i64:
- Opc = WebAssembly::STORE_I64;
+ Opc = A64 ? WebAssembly::STORE_I64_A64 : WebAssembly::STORE_I64_A32;
break;
case MVT::f32:
- Opc = WebAssembly::STORE_F32;
+ Opc = A64 ? WebAssembly::STORE_F32_A64 : WebAssembly::STORE_F32_A32;
break;
case MVT::f64:
- Opc = WebAssembly::STORE_F64;
+ Opc = A64 ? WebAssembly::STORE_F64_A64 : WebAssembly::STORE_F64_A32;
break;
default:
return false;
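Both selectLoad and selectStore now choose between _A32 and _A64 opcode variants depending on whether the subtarget uses 64-bit addresses. The selection shape, sketched with plain strings rather than the real opcode enumerators:

#include <string>

// Pick the address-size-specific variant of a memory opcode; for example
// selectMemOpcode("LOAD_I32", false) yields "LOAD_I32_A32".
std::string selectMemOpcode(const std::string &Base, bool HasAddr64) {
  return Base + (HasAddr64 ? "_A64" : "_A32");
}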
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
new file mode 100644
index 000000000000..7f805b34b499
--- /dev/null
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixBrTableDefaults.cpp
@@ -0,0 +1,155 @@
+//=- WebAssemblyFixBrTableDefaults.cpp - Fix br_table default branch targets -//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// \file This file implements a pass that eliminates redundant range checks
+/// guarding br_table instructions. Since jump tables on most targets cannot
+/// handle out of range indices, LLVM emits these checks before most jump
+/// tables. But br_table takes a default branch target as an argument, so it
+/// does not need the range checks.
+///
+//===----------------------------------------------------------------------===//
+
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/Pass.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "wasm-fix-br-table-defaults"
+
+namespace {
+
+class WebAssemblyFixBrTableDefaults final : public MachineFunctionPass {
+ StringRef getPassName() const override {
+ return "WebAssembly Fix br_table Defaults";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+public:
+ static char ID; // Pass identification, replacement for typeid
+ WebAssemblyFixBrTableDefaults() : MachineFunctionPass(ID) {}
+};
+
+char WebAssemblyFixBrTableDefaults::ID = 0;
+
+// `MI` is a br_table instruction with a dummy default target argument. This
+// function finds and adds the default target argument and removes any redundant
+// range check preceding the br_table. Returns the MBB that the br_table is
+// moved into so it can be removed from further consideration, or nullptr if the
+// br_table cannot be optimized.
+MachineBasicBlock *fixBrTable(MachineInstr &MI, MachineBasicBlock *MBB,
+ MachineFunction &MF) {
+ // Get the header block, which contains the redundant range check.
+ assert(MBB->pred_size() == 1 && "Expected a single guard predecessor");
+ auto *HeaderMBB = *MBB->pred_begin();
+
+ // Find the conditional jump to the default target. If it doesn't exist, the
+ // default target is unreachable anyway, so we can keep the existing dummy
+ // target.
+ MachineBasicBlock *TBB = nullptr, *FBB = nullptr;
+ SmallVector<MachineOperand, 2> Cond;
+ const auto &TII = *MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ bool Analyzed = !TII.analyzeBranch(*HeaderMBB, TBB, FBB, Cond);
+ assert(Analyzed && "Could not analyze jump header branches");
+ (void)Analyzed;
+
+  // Here are the possible outcomes. '_' is nullptr, 'J' is the jump table
+  // block (aka MBB), and 'D' is the default block.
+ //
+ // TBB | FBB | Meaning
+ // _ | _ | No default block, header falls through to jump table
+ // J | _ | No default block, header jumps to the jump table
+ // D | _ | Header jumps to the default and falls through to the jump table
+ // D | J | Header jumps to the default and also to the jump table
+ if (TBB && TBB != MBB) {
+ assert((FBB == nullptr || FBB == MBB) &&
+ "Expected jump or fallthrough to br_table block");
+ assert(Cond.size() == 2 && Cond[1].isReg() && "Unexpected condition info");
+
+ // If the range check checks an i64 value, we cannot optimize it out because
+ // the i64 index is truncated to an i32, making values over 2^32
+ // indistinguishable from small numbers. There are also other strange edge
+ // cases that can arise in practice that we don't want to reason about, so
+ // conservatively only perform the optimization if the range check is the
+ // normal case of an i32.gt_u.
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+ auto *RangeCheck = MRI.getVRegDef(Cond[1].getReg());
+ assert(RangeCheck != nullptr);
+ if (RangeCheck->getOpcode() != WebAssembly::GT_U_I32)
+ return nullptr;
+
+ // Remove the dummy default target and install the real one.
+ MI.RemoveOperand(MI.getNumExplicitOperands() - 1);
+ MI.addOperand(MF, MachineOperand::CreateMBB(TBB));
+ }
+
+ // Remove any branches from the header and splice in the jump table instead
+ TII.removeBranch(*HeaderMBB, nullptr);
+ HeaderMBB->splice(HeaderMBB->end(), MBB, MBB->begin(), MBB->end());
+
+ // Update CFG to skip the old jump table block. Remove shared successors
+ // before transferring to avoid duplicated successors.
+ HeaderMBB->removeSuccessor(MBB);
+ for (auto &Succ : MBB->successors())
+ if (HeaderMBB->isSuccessor(Succ))
+ HeaderMBB->removeSuccessor(Succ);
+ HeaderMBB->transferSuccessorsAndUpdatePHIs(MBB);
+
+ // Remove the old jump table block from the function
+ MF.erase(MBB);
+
+ return HeaderMBB;
+}
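The i32.gt_u restriction above guards against the truncation pitfall described in the comment: once a 64-bit index has been truncated to 32 bits, a huge value is indistinguishable from a small one, so the original range check cannot safely be folded into br_table's default. A self-contained illustration of the wrap-around (plain C++, independent of the pass):

#include <cassert>
#include <cstdint>

int truncationPitfallExample() {
  uint64_t HugeIndex = (1ull << 32) + 3;  // far outside any jump table
  auto Truncated = static_cast<uint32_t>(HugeIndex);
  assert(Truncated == 3);                 // now looks like a valid index
  return static_cast<int>(Truncated);
}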
+
+bool WebAssemblyFixBrTableDefaults::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "********** Fixing br_table Default Targets **********\n"
+ "********** Function: "
+ << MF.getName() << '\n');
+
+ bool Changed = false;
+ SmallPtrSet<MachineBasicBlock *, 16> MBBSet;
+ for (auto &MBB : MF)
+ MBBSet.insert(&MBB);
+
+ while (!MBBSet.empty()) {
+ MachineBasicBlock *MBB = *MBBSet.begin();
+ MBBSet.erase(MBB);
+ for (auto &MI : *MBB) {
+ if (WebAssembly::isBrTable(MI)) {
+ auto *Fixed = fixBrTable(MI, MBB, MF);
+ if (Fixed != nullptr) {
+ MBBSet.erase(Fixed);
+ Changed = true;
+ }
+ break;
+ }
+ }
+ }
+
+ if (Changed) {
+ // We rewrote part of the function; recompute relevant things.
+ MF.RenumberBlocks();
+ return true;
+ }
+
+ return false;
+}
+
+} // end anonymous namespace
+
+INITIALIZE_PASS(WebAssemblyFixBrTableDefaults, DEBUG_TYPE,
+ "Removes range checks and sets br_table default targets", false,
+ false)
+
+FunctionPass *llvm::createWebAssemblyFixBrTableDefaults() {
+ return new WebAssemblyFixBrTableDefaults();
+}
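runOnMachineFunction above drives the rewrite with a set-based worklist: every block starts in the set, and processing one block may retire a second one as well (the header that the br_table was spliced into). The same pattern in standalone form, over plain integers rather than machine basic blocks:

#include <functional>
#include <set>

// Drain a worklist where handling one item may also retire another item.
void drainWorklist(std::set<int> Work, const std::function<int(int)> &Process) {
  while (!Work.empty()) {
    int Item = *Work.begin();
    Work.erase(Work.begin());
    int AlsoDone = Process(Item); // may name another item that is now finished
    Work.erase(AlsoDone);         // erasing an absent key is a harmless no-op
  }
}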
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
index 6b1bbd7a2b07..7abb6fa8905c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixFunctionBitcasts.cpp
@@ -23,7 +23,6 @@
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -73,11 +72,11 @@ static void findUses(Value *V, Function &F,
else if (auto *A = dyn_cast<GlobalAlias>(U.getUser()))
findUses(A, F, Uses, ConstantBCs);
else if (U.get()->getType() != F.getType()) {
- CallSite CS(U.getUser());
- if (!CS)
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB)
// Skip uses that aren't immediately called
continue;
- Value *Callee = CS.getCalledValue();
+ Value *Callee = CB->getCalledOperand();
if (Callee != V)
// Skip calls where the function isn't the callee
continue;
@@ -244,6 +243,10 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
// Collect all the places that need wrappers.
for (Function &F : M) {
+    // Skip fixing when the function is swiftcc, because swiftcc allows
+    // bitcast type differences for swiftself and swifterror.
+ if (F.getCallingConv() == CallingConv::Swift)
+ continue;
findUses(&F, F, Uses, ConstantBCs);
// If we have a "main" function, and its type isn't
@@ -304,7 +307,7 @@ bool FixFunctionBitcasts::runOnModule(Module &M) {
if (CallMain) {
Main->setName("__original_main");
auto *MainWrapper =
- cast<Function>(CallMain->getCalledValue()->stripPointerCasts());
+ cast<Function>(CallMain->getCalledOperand()->stripPointerCasts());
delete CallMain;
if (Main->isDeclaration()) {
// The wrapper is not needed in this case as we don't need to export
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
index 157ea9d525c9..1ceae59dc993 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFixIrreducibleControlFlow.cpp
@@ -66,6 +66,17 @@ namespace {
using BlockVector = SmallVector<MachineBasicBlock *, 4>;
using BlockSet = SmallPtrSet<MachineBasicBlock *, 4>;
+static BlockVector getSortedEntries(const BlockSet &Entries) {
+ BlockVector SortedEntries(Entries.begin(), Entries.end());
+ llvm::sort(SortedEntries,
+ [](const MachineBasicBlock *A, const MachineBasicBlock *B) {
+ auto ANum = A->getNumber();
+ auto BNum = B->getNumber();
+ return ANum < BNum;
+ });
+ return SortedEntries;
+}
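getSortedEntries exists because iterating a SmallPtrSet directly depends on pointer values and is therefore not deterministic from run to run; sorting by block number gives a stable order. The same idea in standalone form (the Block struct and its field are illustrative):

#include <algorithm>
#include <unordered_set>
#include <vector>

struct Block { int Number; };

// Copy an unordered set of blocks into a vector sorted by a stable key so
// that every compilation visits them in the same order.
std::vector<Block *> sortedEntries(const std::unordered_set<Block *> &Entries) {
  std::vector<Block *> Sorted(Entries.begin(), Entries.end());
  std::sort(Sorted.begin(), Sorted.end(),
            [](const Block *A, const Block *B) { return A->Number < B->Number; });
  return Sorted;
}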
+
// Calculates reachability in a region. Ignores branches to blocks outside of
// the region, and ignores branches to the region entry (for the case where
// the region is the inner part of a loop).
@@ -241,7 +252,6 @@ public:
bool WebAssemblyFixIrreducibleControlFlow::processRegion(
MachineBasicBlock *Entry, BlockSet &Blocks, MachineFunction &MF) {
bool Changed = false;
-
// Remove irreducibility before processing child loops, which may take
// multiple iterations.
while (true) {
@@ -249,12 +259,18 @@ bool WebAssemblyFixIrreducibleControlFlow::processRegion(
bool FoundIrreducibility = false;
- for (auto *LoopEntry : Graph.getLoopEntries()) {
+ for (auto *LoopEntry : getSortedEntries(Graph.getLoopEntries())) {
// Find mutual entries - all entries which can reach this one, and
// are reached by it (that always includes LoopEntry itself). All mutual
// entries must be in the same loop, so if we have more than one, then we
// have irreducible control flow.
//
+ // (Note that we need to sort the entries here, as otherwise the order can
+ // matter: being mutual is a symmetric relationship, and each set of
+ // mutuals will be handled properly no matter which we see first. However,
+ // there can be multiple disjoint sets of mutuals, and which we process
+ // first changes the output.)
+ //
// Note that irreducibility may involve inner loops, e.g. imagine A
// starts one loop, and it has B inside it which starts an inner loop.
// If we add a branch from all the way on the outside to B, then in a
@@ -325,13 +341,7 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
assert(Entries.size() >= 2);
// Sort the entries to ensure a deterministic build.
- BlockVector SortedEntries(Entries.begin(), Entries.end());
- llvm::sort(SortedEntries,
- [&](const MachineBasicBlock *A, const MachineBasicBlock *B) {
- auto ANum = A->getNumber();
- auto BNum = B->getNumber();
- return ANum < BNum;
- });
+ BlockVector SortedEntries = getSortedEntries(Entries);
#ifndef NDEBUG
for (auto Block : SortedEntries)
@@ -403,31 +413,33 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
}
// Record if each entry has a layout predecessor. This map stores
- // <<Predecessor is within the loop?, loop entry>, layout predecessor>
- std::map<std::pair<bool, MachineBasicBlock *>, MachineBasicBlock *>
+ // <<loop entry, Predecessor is within the loop?>, layout predecessor>
+ DenseMap<PointerIntPair<MachineBasicBlock *, 1, bool>, MachineBasicBlock *>
EntryToLayoutPred;
- for (auto *Pred : AllPreds)
+ for (auto *Pred : AllPreds) {
+ bool PredInLoop = InLoop.count(Pred);
for (auto *Entry : Pred->successors())
if (Entries.count(Entry) && Pred->isLayoutSuccessor(Entry))
- EntryToLayoutPred[std::make_pair(InLoop.count(Pred), Entry)] = Pred;
+ EntryToLayoutPred[{Entry, PredInLoop}] = Pred;
+ }
// We need to create at most two routing blocks per entry: one for
// predecessors outside the loop and one for predecessors inside the loop.
// This map stores
- // <<Predecessor is within the loop?, loop entry>, routing block>
- std::map<std::pair<bool, MachineBasicBlock *>, MachineBasicBlock *> Map;
+ // <<loop entry, Predecessor is within the loop?>, routing block>
+ DenseMap<PointerIntPair<MachineBasicBlock *, 1, bool>, MachineBasicBlock *>
+ Map;
for (auto *Pred : AllPreds) {
bool PredInLoop = InLoop.count(Pred);
for (auto *Entry : Pred->successors()) {
- if (!Entries.count(Entry) ||
- Map.count(std::make_pair(InLoop.count(Pred), Entry)))
+ if (!Entries.count(Entry) || Map.count({Entry, PredInLoop}))
continue;
// If this entry has a layout predecessor and the current predecessor is not
// it, we'd rather create the routing block after that layout predecessor to
// save a branch.
- if (EntryToLayoutPred.count(std::make_pair(PredInLoop, Entry)) &&
- EntryToLayoutPred[std::make_pair(PredInLoop, Entry)] != Pred)
- continue;
+ if (auto *OtherPred = EntryToLayoutPred.lookup({Entry, PredInLoop}))
+ if (OtherPred != Pred)
+ continue;
// This is a successor we need to rewrite.
MachineBasicBlock *Routing = MF.CreateMachineBasicBlock();
@@ -443,7 +455,7 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
.addImm(Indices[Entry]);
BuildMI(Routing, DebugLoc(), TII.get(WebAssembly::BR)).addMBB(Dispatch);
Routing->addSuccessor(Dispatch);
- Map[std::make_pair(PredInLoop, Entry)] = Routing;
+ Map[{Entry, PredInLoop}] = Routing;
}
}
@@ -453,12 +465,12 @@ void WebAssemblyFixIrreducibleControlFlow::makeSingleEntryLoop(
for (MachineInstr &Term : Pred->terminators())
for (auto &Op : Term.explicit_uses())
if (Op.isMBB() && Indices.count(Op.getMBB()))
- Op.setMBB(Map[std::make_pair(PredInLoop, Op.getMBB())]);
+ Op.setMBB(Map[{Op.getMBB(), PredInLoop}]);
for (auto *Succ : Pred->successors()) {
if (!Entries.count(Succ))
continue;
- auto *Routing = Map[std::make_pair(PredInLoop, Succ)];
+ auto *Routing = Map[{Succ, PredInLoop}];
Pred->replaceSuccessor(Succ, Routing);
}
}
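The routing-block map in this hunk is now keyed by a (loop entry, predecessor-in-loop) pair packed into a PointerIntPair so it can serve as a DenseMap key. The lookup itself is just a two-part key with at most two routing blocks per entry; a plain-C++ sketch of the same shape, without the bit packing:

#include <map>
#include <utility>

struct Block {};

// Map (loop entry, whether the predecessor is inside the loop) to the routing
// block created for that combination; reuse it if it already exists.
using RoutingMap = std::map<std::pair<Block *, bool>, Block *>;

Block *getOrCreateRouting(RoutingMap &Map, Block *Entry, bool PredInLoop,
                          Block *NewRouting) {
  auto Key = std::make_pair(Entry, PredInLoop);
  auto It = Map.find(Key);
  if (It != Map.end())
    return It->second;   // a routing block was already created for this key
  Map[Key] = NewRouting;
  return NewRouting;
}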
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
index 71eeebfada4b..95669932e73f 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp
@@ -19,6 +19,7 @@
#include "WebAssemblyFrameLowering.h"
#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "WebAssembly.h"
#include "WebAssemblyInstrInfo.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
@@ -86,8 +87,8 @@ bool WebAssemblyFrameLowering::needsSPForLocalFrame(
}
// In functions with EH pads, we need to make a copy of the value of the
-// __stack_pointer global in SP32 register, in order to use it when restoring
-// __stack_pointer after an exception is caught.
+// __stack_pointer global in an SP32/64 register, in order to use it when
+// restoring __stack_pointer after an exception is caught.
bool WebAssemblyFrameLowering::needsPrologForEH(
const MachineFunction &MF) const {
auto EHType = MF.getTarget().getMCAsmInfo()->getExceptionHandlingType();
@@ -122,6 +123,57 @@ bool WebAssemblyFrameLowering::needsSPWriteback(
return needsSPForLocalFrame(MF) && !CanUseRedZone;
}
+unsigned WebAssemblyFrameLowering::getSPReg(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::SP64
+ : WebAssembly::SP32;
+}
+
+unsigned WebAssemblyFrameLowering::getFPReg(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::FP64
+ : WebAssembly::FP32;
+}
+
+unsigned
+WebAssemblyFrameLowering::getOpcConst(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::CONST_I64
+ : WebAssembly::CONST_I32;
+}
+
+unsigned WebAssemblyFrameLowering::getOpcAdd(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::ADD_I64
+ : WebAssembly::ADD_I32;
+}
+
+unsigned WebAssemblyFrameLowering::getOpcSub(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::SUB_I64
+ : WebAssembly::SUB_I32;
+}
+
+unsigned WebAssemblyFrameLowering::getOpcAnd(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::AND_I64
+ : WebAssembly::AND_I32;
+}
+
+unsigned
+WebAssemblyFrameLowering::getOpcGlobGet(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::GLOBAL_GET_I64
+ : WebAssembly::GLOBAL_GET_I32;
+}
+
+unsigned
+WebAssemblyFrameLowering::getOpcGlobSet(const MachineFunction &MF) {
+ return MF.getSubtarget<WebAssemblySubtarget>().hasAddr64()
+ ? WebAssembly::GLOBAL_SET_I64
+ : WebAssembly::GLOBAL_SET_I32;
+}
+
void WebAssemblyFrameLowering::writeSPToGlobal(
unsigned SrcReg, MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator &InsertStore, const DebugLoc &DL) const {
@@ -129,7 +181,8 @@ void WebAssemblyFrameLowering::writeSPToGlobal(
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- BuildMI(MBB, InsertStore, DL, TII->get(WebAssembly::GLOBAL_SET_I32))
+
+ BuildMI(MBB, InsertStore, DL, TII->get(getOpcGlobSet(MF)))
.addExternalSymbol(SPSymbol)
.addReg(SrcReg);
}
@@ -140,11 +193,12 @@ WebAssemblyFrameLowering::eliminateCallFramePseudoInstr(
MachineBasicBlock::iterator I) const {
assert(!I->getOperand(0).getImm() && (hasFP(MF) || hasBP(MF)) &&
"Call frame pseudos should only be used for dynamic stack adjustment");
- const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &ST = MF.getSubtarget<WebAssemblySubtarget>();
+ const auto *TII = ST.getInstrInfo();
if (I->getOpcode() == TII->getCallFrameDestroyOpcode() &&
needsSPWriteback(MF)) {
DebugLoc DL = I->getDebugLoc();
- writeSPToGlobal(WebAssembly::SP32, MF, MBB, I, DL);
+ writeSPToGlobal(getSPReg(MF), MF, MBB, I, DL);
}
return MBB.erase(I);
}
@@ -160,7 +214,8 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
return;
uint64_t StackSize = MFI.getStackSize();
- const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &ST = MF.getSubtarget<WebAssemblySubtarget>();
+ const auto *TII = ST.getInstrInfo();
auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.begin();
@@ -171,13 +226,13 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
- unsigned SPReg = WebAssembly::SP32;
+ unsigned SPReg = getSPReg(MF);
if (StackSize)
SPReg = MRI.createVirtualRegister(PtrRC);
const char *ES = "__stack_pointer";
auto *SPSymbol = MF.createExternalSymbolName(ES);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::GLOBAL_GET_I32), SPReg)
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcGlobGet(MF)), SPReg)
.addExternalSymbol(SPSymbol);
bool HasBP = hasBP(MF);
@@ -191,34 +246,30 @@ void WebAssemblyFrameLowering::emitPrologue(MachineFunction &MF,
if (StackSize) {
// Subtract the frame size
Register OffsetReg = MRI.createVirtualRegister(PtrRC);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcConst(MF)), OffsetReg)
.addImm(StackSize);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::SUB_I32),
- WebAssembly::SP32)
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcSub(MF)), getSPReg(MF))
.addReg(SPReg)
.addReg(OffsetReg);
}
if (HasBP) {
Register BitmaskReg = MRI.createVirtualRegister(PtrRC);
- unsigned Alignment = MFI.getMaxAlignment();
- assert((1u << countTrailingZeros(Alignment)) == Alignment &&
- "Alignment must be a power of 2");
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), BitmaskReg)
- .addImm((int)~(Alignment - 1));
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::AND_I32),
- WebAssembly::SP32)
- .addReg(WebAssembly::SP32)
+ Align Alignment = MFI.getMaxAlign();
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcConst(MF)), BitmaskReg)
+ .addImm((int64_t) ~(Alignment.value() - 1));
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcAnd(MF)), getSPReg(MF))
+ .addReg(getSPReg(MF))
.addReg(BitmaskReg);
}
if (hasFP(MF)) {
// Unlike most conventional targets (where FP points to the saved FP),
// FP points to the bottom of the fixed-size locals, so we can use positive
// offsets in load/store instructions.
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), WebAssembly::FP32)
- .addReg(WebAssembly::SP32);
+ BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::COPY), getFPReg(MF))
+ .addReg(getSPReg(MF));
}
if (StackSize && needsSPWriteback(MF)) {
- writeSPToGlobal(WebAssembly::SP32, MF, MBB, InsertPt, DL);
+ writeSPToGlobal(getSPReg(MF), MF, MBB, InsertPt, DL);
}
}
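The base-pointer block earlier in this hunk rounds the stack pointer down by emitting a CONST of ~(Alignment - 1) followed by an AND, relying on the alignment being a power of two (which Align guarantees). The same computation in isolation:

#include <cassert>
#include <cstdint>

// Round a stack pointer down to a power-of-two alignment; this is what the
// emitted CONST + AND pair computes at run time.
uint64_t alignStackDown(uint64_t SP, uint64_t Alignment) {
  assert(Alignment != 0 && (Alignment & (Alignment - 1)) == 0 &&
         "alignment must be a power of two");
  return SP & ~(Alignment - 1);
}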
@@ -227,7 +278,8 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
uint64_t StackSize = MF.getFrameInfo().getStackSize();
if (!needsSP(MF) || !needsSPWriteback(MF))
return;
- const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
+ auto &ST = MF.getSubtarget<WebAssemblySubtarget>();
+ const auto *TII = ST.getInstrInfo();
auto &MRI = MF.getRegInfo();
auto InsertPt = MBB.getFirstTerminator();
DebugLoc DL;
@@ -238,6 +290,7 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
// Restore the stack pointer. If we had fixed-size locals, add the offset
// subtracted in the prolog.
unsigned SPReg = 0;
+ unsigned SPFPReg = hasFP(MF) ? getFPReg(MF) : getSPReg(MF);
if (hasBP(MF)) {
auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
SPReg = FI->getBasePointerVreg();
@@ -245,17 +298,34 @@ void WebAssemblyFrameLowering::emitEpilogue(MachineFunction &MF,
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
Register OffsetReg = MRI.createVirtualRegister(PtrRC);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::CONST_I32), OffsetReg)
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcConst(MF)), OffsetReg)
.addImm(StackSize);
- // In the epilog we don't need to write the result back to the SP32 physreg
- // because it won't be used again. We can use a stackified register instead.
+ // In the epilog we don't need to write the result back to the SP32/64
+ // physreg because it won't be used again. We can use a stackified register
+ // instead.
SPReg = MRI.createVirtualRegister(PtrRC);
- BuildMI(MBB, InsertPt, DL, TII->get(WebAssembly::ADD_I32), SPReg)
- .addReg(hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32)
+ BuildMI(MBB, InsertPt, DL, TII->get(getOpcAdd(MF)), SPReg)
+ .addReg(SPFPReg)
.addReg(OffsetReg);
} else {
- SPReg = hasFP(MF) ? WebAssembly::FP32 : WebAssembly::SP32;
+ SPReg = SPFPReg;
}
writeSPToGlobal(SPReg, MF, MBB, InsertPt, DL);
}
+
+TargetFrameLowering::DwarfFrameBase
+WebAssemblyFrameLowering::getDwarfFrameBase(const MachineFunction &MF) const {
+ DwarfFrameBase Loc;
+ Loc.Kind = DwarfFrameBase::WasmFrameBase;
+ const WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
+ if (needsSP(MF) && MFI.isFrameBaseVirtual()) {
+ unsigned LocalNum = MFI.getFrameBaseLocal();
+ Loc.Location.WasmLoc = {WebAssembly::TI_LOCAL, LocalNum};
+ } else {
+ // TODO: This should work on a breakpoint at a function with no frame,
+ // but probably won't work for traversing up the stack.
+ Loc.Location.WasmLoc = {WebAssembly::TI_GLOBAL_RELOC, 0};
+ }
+ return Loc;
+}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
index fdc0f561dcd9..e16f639ff22b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h
@@ -18,7 +18,6 @@
#include "llvm/CodeGen/TargetFrameLowering.h"
namespace llvm {
-class MachineFrameInfo;
class WebAssemblyFrameLowering final : public TargetFrameLowering {
public:
@@ -44,6 +43,7 @@ public:
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
+ DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override;
bool needsPrologForEH(const MachineFunction &MF) const;
@@ -53,6 +53,15 @@ public:
MachineBasicBlock::iterator &InsertStore,
const DebugLoc &DL) const;
+ static unsigned getSPReg(const MachineFunction &MF);
+ static unsigned getFPReg(const MachineFunction &MF);
+ static unsigned getOpcConst(const MachineFunction &MF);
+ static unsigned getOpcAdd(const MachineFunction &MF);
+ static unsigned getOpcSub(const MachineFunction &MF);
+ static unsigned getOpcAnd(const MachineFunction &MF);
+ static unsigned getOpcGlobGet(const MachineFunction &MF);
+ static unsigned getOpcGlobSet(const MachineFunction &MF);
+
private:
bool hasBP(const MachineFunction &MF) const;
bool needsSPForLocalFrame(const MachineFunction &MF) const;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
index ba04fd4eb9dd..dee1c4e28149 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISD.def
@@ -13,8 +13,7 @@
// NOTE: NO INCLUDE GUARD DESIRED!
-HANDLE_NODETYPE(CALL1)
-HANDLE_NODETYPE(CALL0)
+HANDLE_NODETYPE(CALL)
HANDLE_NODETYPE(RET_CALL)
HANDLE_NODETYPE(RETURN)
HANDLE_NODETYPE(ARGUMENT)
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
index 531a07b829c8..d1a696f854f8 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelDAGToDAG.cpp
@@ -53,11 +53,6 @@ public:
Subtarget = &MF.getSubtarget<WebAssemblySubtarget>();
- // Wasm64 is not fully supported right now (and is not specified)
- if (Subtarget->hasAddr64())
- report_fatal_error(
- "64-bit WebAssembly (wasm64) is not currently supported");
-
return SelectionDAGISel::runOnMachineFunction(MF);
}
@@ -82,6 +77,13 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
return;
}
+ MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
+ auto GlobalGetIns = PtrVT == MVT::i64 ? WebAssembly::GLOBAL_GET_I64
+ : WebAssembly::GLOBAL_GET_I32;
+ auto ConstIns =
+ PtrVT == MVT::i64 ? WebAssembly::CONST_I64 : WebAssembly::CONST_I32;
+ auto AddIns = PtrVT == MVT::i64 ? WebAssembly::ADD_I64 : WebAssembly::ADD_I32;
+
// A few custom selection cases.
SDLoc DL(Node);
MachineFunction &MF = CurDAG->getMachineFunction();
@@ -145,20 +147,16 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
false);
}
- MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
- assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
-
SDValue TLSBaseSym = CurDAG->getTargetExternalSymbol("__tls_base", PtrVT);
SDValue TLSOffsetSym = CurDAG->getTargetGlobalAddress(
GA->getGlobal(), DL, PtrVT, GA->getOffset(), 0);
- MachineSDNode *TLSBase = CurDAG->getMachineNode(WebAssembly::GLOBAL_GET_I32,
- DL, MVT::i32, TLSBaseSym);
- MachineSDNode *TLSOffset = CurDAG->getMachineNode(
- WebAssembly::CONST_I32, DL, MVT::i32, TLSOffsetSym);
- MachineSDNode *TLSAddress =
- CurDAG->getMachineNode(WebAssembly::ADD_I32, DL, MVT::i32,
- SDValue(TLSBase, 0), SDValue(TLSOffset, 0));
+ MachineSDNode *TLSBase =
+ CurDAG->getMachineNode(GlobalGetIns, DL, PtrVT, TLSBaseSym);
+ MachineSDNode *TLSOffset =
+ CurDAG->getMachineNode(ConstIns, DL, PtrVT, TLSOffsetSym);
+ MachineSDNode *TLSAddress = CurDAG->getMachineNode(
+ AddIns, DL, PtrVT, SDValue(TLSBase, 0), SDValue(TLSOffset, 0));
ReplaceNode(Node, TLSAddress);
return;
}
@@ -167,22 +165,16 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(0))->getZExtValue();
switch (IntNo) {
case Intrinsic::wasm_tls_size: {
- MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
- assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
-
MachineSDNode *TLSSize = CurDAG->getMachineNode(
- WebAssembly::GLOBAL_GET_I32, DL, PtrVT,
- CurDAG->getTargetExternalSymbol("__tls_size", MVT::i32));
+ GlobalGetIns, DL, PtrVT,
+ CurDAG->getTargetExternalSymbol("__tls_size", PtrVT));
ReplaceNode(Node, TLSSize);
return;
}
case Intrinsic::wasm_tls_align: {
- MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
- assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
-
MachineSDNode *TLSAlign = CurDAG->getMachineNode(
- WebAssembly::GLOBAL_GET_I32, DL, PtrVT,
- CurDAG->getTargetExternalSymbol("__tls_align", MVT::i32));
+ GlobalGetIns, DL, PtrVT,
+ CurDAG->getTargetExternalSymbol("__tls_align", PtrVT));
ReplaceNode(Node, TLSAlign);
return;
}
@@ -193,11 +185,8 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
unsigned IntNo = cast<ConstantSDNode>(Node->getOperand(1))->getZExtValue();
switch (IntNo) {
case Intrinsic::wasm_tls_base: {
- MVT PtrVT = TLI->getPointerTy(CurDAG->getDataLayout());
- assert(PtrVT == MVT::i32 && "only wasm32 is supported for now");
-
MachineSDNode *TLSBase = CurDAG->getMachineNode(
- WebAssembly::GLOBAL_GET_I32, DL, MVT::i32, MVT::Other,
+ GlobalGetIns, DL, PtrVT, MVT::Other,
CurDAG->getTargetExternalSymbol("__tls_base", PtrVT),
Node->getOperand(0));
ReplaceNode(Node, TLSBase);
@@ -206,6 +195,35 @@ void WebAssemblyDAGToDAGISel::Select(SDNode *Node) {
}
break;
}
+ case WebAssemblyISD::CALL:
+ case WebAssemblyISD::RET_CALL: {
+ // CALL has both variable operands and variable results, but ISel only
+ // supports one or the other. Split calls into two nodes glued together, one
+ // for the operands and one for the results. These two nodes will be
+ // recombined in a custom inserter hook into a single MachineInstr.
+ SmallVector<SDValue, 16> Ops;
+ for (size_t i = 1; i < Node->getNumOperands(); ++i) {
+ SDValue Op = Node->getOperand(i);
+ if (i == 1 && Op->getOpcode() == WebAssemblyISD::Wrapper)
+ Op = Op->getOperand(0);
+ Ops.push_back(Op);
+ }
+
+ // Add the chain last
+ Ops.push_back(Node->getOperand(0));
+ MachineSDNode *CallParams =
+ CurDAG->getMachineNode(WebAssembly::CALL_PARAMS, DL, MVT::Glue, Ops);
+
+ unsigned Results = Node->getOpcode() == WebAssemblyISD::CALL
+ ? WebAssembly::CALL_RESULTS
+ : WebAssembly::RET_CALL_RESULTS;
+
+ SDValue Link(CallParams, 0);
+ MachineSDNode *CallResults =
+ CurDAG->getMachineNode(Results, DL, Node->getVTList(), Link);
+ ReplaceNode(Node, CallResults);
+ return;
+ }
default:
break;
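The CALL/RET_CALL case above works around ISel's inability to handle nodes that are variadic in both operands and results: the call is temporarily represented as a CALL_PARAMS record holding the operands (with the chain appended last) glued to a CALL_RESULTS record holding the results, and the two are merged back into one instruction later by the custom inserter. A rough standalone schematic of the split, with invented record types:

#include <string>
#include <utility>
#include <vector>

struct CallParamsRec { std::vector<std::string> Operands; }; // chain goes last
struct CallResultsRec { std::vector<std::string> Results; }; // glued to params

// Split one logical call into the two records used during selection.
std::pair<CallParamsRec, CallResultsRec>
splitCall(std::vector<std::string> Operands, std::vector<std::string> Results,
          std::string Chain) {
  CallParamsRec Params{std::move(Operands)};
  Params.Operands.push_back(std::move(Chain)); // the chain is added last
  CallResultsRec Res{std::move(Results)};
  return {std::move(Params), std::move(Res)};
}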
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index 5b177c0c5d9d..a9b9eceb4130 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -61,8 +61,6 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
addRegisterClass(MVT::v8i16, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v4i32, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v4f32, &WebAssembly::V128RegClass);
- }
- if (Subtarget->hasUnimplementedSIMD128()) {
addRegisterClass(MVT::v2i64, &WebAssembly::V128RegClass);
addRegisterClass(MVT::v2f64, &WebAssembly::V128RegClass);
}
@@ -116,97 +114,81 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
for (auto T : {MVT::i32, MVT::i64})
setOperationAction(Op, T, Expand);
if (Subtarget->hasSIMD128())
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
setOperationAction(Op, T, Expand);
- if (Subtarget->hasUnimplementedSIMD128())
- setOperationAction(Op, MVT::v2i64, Expand);
}
// SIMD-specific configuration
if (Subtarget->hasSIMD128()) {
+ // Hoist bitcasts out of shuffles
+ setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
+
// Support saturating add for i8x16 and i16x8
for (auto Op : {ISD::SADDSAT, ISD::UADDSAT})
for (auto T : {MVT::v16i8, MVT::v8i16})
setOperationAction(Op, T, Legal);
+ // Support integer abs
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ setOperationAction(ISD::ABS, T, Legal);
+
// Custom lower BUILD_VECTORs to minimize number of replace_lanes
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
+ MVT::v2f64})
setOperationAction(ISD::BUILD_VECTOR, T, Custom);
- if (Subtarget->hasUnimplementedSIMD128())
- for (auto T : {MVT::v2i64, MVT::v2f64})
- setOperationAction(ISD::BUILD_VECTOR, T, Custom);
// We have custom shuffle lowering to expose the shuffle mask
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
+ MVT::v2f64})
setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
- if (Subtarget->hasUnimplementedSIMD128())
- for (auto T: {MVT::v2i64, MVT::v2f64})
- setOperationAction(ISD::VECTOR_SHUFFLE, T, Custom);
// Custom lowering since wasm shifts must have a scalar shift amount
- for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL}) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ for (auto Op : {ISD::SHL, ISD::SRA, ISD::SRL})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
setOperationAction(Op, T, Custom);
- if (Subtarget->hasUnimplementedSIMD128())
- setOperationAction(Op, MVT::v2i64, Custom);
- }
// Custom lower lane accesses to expand out variable indices
- for (auto Op : {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT}) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ for (auto Op : {ISD::EXTRACT_VECTOR_ELT, ISD::INSERT_VECTOR_ELT})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
+ MVT::v2f64})
setOperationAction(Op, T, Custom);
- if (Subtarget->hasUnimplementedSIMD128())
- for (auto T : {MVT::v2i64, MVT::v2f64})
- setOperationAction(Op, T, Custom);
- }
- // There is no i64x2.mul instruction
- setOperationAction(ISD::MUL, MVT::v2i64, Expand);
+ // There is no i8x16.mul instruction
+ setOperationAction(ISD::MUL, MVT::v16i8, Expand);
// There are no vector select instructions
- for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT}) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32})
+ for (auto Op : {ISD::VSELECT, ISD::SELECT_CC, ISD::SELECT})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v4f32, MVT::v2i64,
+ MVT::v2f64})
setOperationAction(Op, T, Expand);
- if (Subtarget->hasUnimplementedSIMD128())
- for (auto T : {MVT::v2i64, MVT::v2f64})
- setOperationAction(Op, T, Expand);
- }
// Expand integer operations supported for scalars but not SIMD
for (auto Op : {ISD::CTLZ, ISD::CTTZ, ISD::CTPOP, ISD::SDIV, ISD::UDIV,
- ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR}) {
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ ISD::SREM, ISD::UREM, ISD::ROTL, ISD::ROTR})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32, MVT::v2i64})
setOperationAction(Op, T, Expand);
- if (Subtarget->hasUnimplementedSIMD128())
- setOperationAction(Op, MVT::v2i64, Expand);
- }
// But we do have integer min and max operations
- if (Subtarget->hasUnimplementedSIMD128()) {
- for (auto Op : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
- for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
- setOperationAction(Op, T, Legal);
- }
+ for (auto Op : {ISD::SMIN, ISD::SMAX, ISD::UMIN, ISD::UMAX})
+ for (auto T : {MVT::v16i8, MVT::v8i16, MVT::v4i32})
+ setOperationAction(Op, T, Legal);
// Expand float operations supported for scalars but not SIMD
for (auto Op : {ISD::FCEIL, ISD::FFLOOR, ISD::FTRUNC, ISD::FNEARBYINT,
ISD::FCOPYSIGN, ISD::FLOG, ISD::FLOG2, ISD::FLOG10,
- ISD::FEXP, ISD::FEXP2, ISD::FRINT}) {
- setOperationAction(Op, MVT::v4f32, Expand);
- if (Subtarget->hasUnimplementedSIMD128())
- setOperationAction(Op, MVT::v2f64, Expand);
- }
+ ISD::FEXP, ISD::FEXP2, ISD::FRINT})
+ for (auto T : {MVT::v4f32, MVT::v2f64})
+ setOperationAction(Op, T, Expand);
// Expand operations not supported for i64x2 vectors
- if (Subtarget->hasUnimplementedSIMD128())
- for (unsigned CC = 0; CC < ISD::SETCC_INVALID; ++CC)
- setCondCodeAction(static_cast<ISD::CondCode>(CC), MVT::v2i64, Custom);
-
- // Expand additional SIMD ops that V8 hasn't implemented yet
- if (!Subtarget->hasUnimplementedSIMD128()) {
- setOperationAction(ISD::FSQRT, MVT::v4f32, Expand);
- setOperationAction(ISD::FDIV, MVT::v4f32, Expand);
- }
+ for (unsigned CC = 0; CC < ISD::SETCC_INVALID; ++CC)
+ setCondCodeAction(static_cast<ISD::CondCode>(CC), MVT::v2i64, Custom);
+
+ // 64x2 conversions are not in the spec
+ for (auto Op :
+ {ISD::SINT_TO_FP, ISD::UINT_TO_FP, ISD::FP_TO_SINT, ISD::FP_TO_UINT})
+ for (auto T : {MVT::v2i64, MVT::v2f64})
+ setOperationAction(Op, T, Expand);
}
// As a special case, these operators use the type to mean the type to
@@ -227,6 +209,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
setOperationAction(ISD::DYNAMIC_STACKALLOC, MVTPtr, Expand);
setOperationAction(ISD::FrameIndex, MVT::i32, Custom);
+ setOperationAction(ISD::FrameIndex, MVT::i64, Custom);
setOperationAction(ISD::CopyToReg, MVT::Other, Custom);
// Expand these forms; we pattern-match the forms that we can handle in isel.
@@ -259,12 +242,10 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
}
}
// But some vector extending loads are legal
- if (Subtarget->hasUnimplementedSIMD128()) {
- for (auto Ext : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
- setLoadExtAction(Ext, MVT::v8i16, MVT::v8i8, Legal);
- setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
- setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
- }
+ for (auto Ext : {ISD::EXTLOAD, ISD::SEXTLOAD, ISD::ZEXTLOAD}) {
+ setLoadExtAction(Ext, MVT::v8i16, MVT::v8i8, Legal);
+ setLoadExtAction(Ext, MVT::v4i32, MVT::v4i16, Legal);
+ setLoadExtAction(Ext, MVT::v2i64, MVT::v2i32, Legal);
}
}
@@ -273,6 +254,7 @@ WebAssemblyTargetLowering::WebAssemblyTargetLowering(
// Trap lowers to wasm unreachable
setOperationAction(ISD::TRAP, MVT::Other, Legal);
+ setOperationAction(ISD::DEBUGTRAP, MVT::Other, Legal);
// Exception handling intrinsics
setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
@@ -434,6 +416,58 @@ static MachineBasicBlock *LowerFPToInt(MachineInstr &MI, DebugLoc DL,
return DoneMBB;
}
+static MachineBasicBlock *LowerCallResults(MachineInstr &CallResults,
+ DebugLoc DL, MachineBasicBlock *BB,
+ const TargetInstrInfo &TII) {
+ MachineInstr &CallParams = *CallResults.getPrevNode();
+ assert(CallParams.getOpcode() == WebAssembly::CALL_PARAMS);
+ assert(CallResults.getOpcode() == WebAssembly::CALL_RESULTS ||
+ CallResults.getOpcode() == WebAssembly::RET_CALL_RESULTS);
+
+ bool IsIndirect = CallParams.getOperand(0).isReg();
+ bool IsRetCall = CallResults.getOpcode() == WebAssembly::RET_CALL_RESULTS;
+
+ unsigned CallOp;
+ if (IsIndirect && IsRetCall) {
+ CallOp = WebAssembly::RET_CALL_INDIRECT;
+ } else if (IsIndirect) {
+ CallOp = WebAssembly::CALL_INDIRECT;
+ } else if (IsRetCall) {
+ CallOp = WebAssembly::RET_CALL;
+ } else {
+ CallOp = WebAssembly::CALL;
+ }
+
+ MachineFunction &MF = *BB->getParent();
+ const MCInstrDesc &MCID = TII.get(CallOp);
+ MachineInstrBuilder MIB(MF, MF.CreateMachineInstr(MCID, DL));
+
+ // Move the function pointer to the end of the arguments for indirect calls
+ if (IsIndirect) {
+ auto FnPtr = CallParams.getOperand(0);
+ CallParams.RemoveOperand(0);
+ CallParams.addOperand(FnPtr);
+ }
+
+ for (auto Def : CallResults.defs())
+ MIB.add(Def);
+
+ // Add placeholders for the type index and immediate flags
+ if (IsIndirect) {
+ MIB.addImm(0);
+ MIB.addImm(0);
+ }
+
+ for (auto Use : CallParams.uses())
+ MIB.add(Use);
+
+ BB->insert(CallResults.getIterator(), MIB);
+ CallParams.eraseFromParent();
+ CallResults.eraseFromParent();
+
+ return BB;
+}
+
MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter(
MachineInstr &MI, MachineBasicBlock *BB) const {
const TargetInstrInfo &TII = *Subtarget->getInstrInfo();
@@ -466,7 +500,9 @@ MachineBasicBlock *WebAssemblyTargetLowering::EmitInstrWithCustomInserter(
case WebAssembly::FP_TO_UINT_I64_F64:
return LowerFPToInt(MI, DL, BB, TII, true, true, true,
WebAssembly::I64_TRUNC_U_F64);
- llvm_unreachable("Unexpected instruction to emit with custom inserter");
+ case WebAssembly::CALL_RESULTS:
+ case WebAssembly::RET_CALL_RESULTS:
+ return LowerCallResults(MI, DL, BB, TII);
}
}
@@ -565,8 +601,6 @@ bool WebAssemblyTargetLowering::isIntDivCheap(EVT VT,
}
bool WebAssemblyTargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
- if (!Subtarget->hasUnimplementedSIMD128())
- return false;
MVT ExtT = ExtVal.getSimpleValueType();
MVT MemT = cast<LoadSDNode>(ExtVal->getOperand(0))->getSimpleValueType(0);
return (ExtT == MVT::v8i16 && MemT == MVT::v8i8) ||
@@ -580,7 +614,11 @@ EVT WebAssemblyTargetLowering::getSetCCResultType(const DataLayout &DL,
if (VT.isVector())
return VT.changeVectorElementTypeToInteger();
- return TargetLowering::getSetCCResultType(DL, C, VT);
+ // So far, all branch instructions in Wasm take an I32 condition.
+ // The default TargetLowering::getSetCCResultType returns the pointer size,
+ // which would be useful to reduce instruction counts when testing
+ // against 64-bit pointers/values if at some point Wasm supports that.
+ return EVT::getIntegerVT(C, 32);
}
bool WebAssemblyTargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
@@ -648,7 +686,8 @@ static bool callingConvSupported(CallingConv::ID CallConv) {
CallConv == CallingConv::PreserveMost ||
CallConv == CallingConv::PreserveAll ||
CallConv == CallingConv::CXX_FAST_TLS ||
- CallConv == CallingConv::WASM_EmscriptenInvoke;
+ CallConv == CallingConv::WASM_EmscriptenInvoke ||
+ CallConv == CallingConv::Swift;
}
SDValue
@@ -670,41 +709,57 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
fail(DL, DAG, "WebAssembly doesn't support patch point yet");
if (CLI.IsTailCall) {
- bool MustTail = CLI.CS && CLI.CS.isMustTailCall();
- if (Subtarget->hasTailCall() && !CLI.IsVarArg) {
- // Do not tail call unless caller and callee return types match
- const Function &F = MF.getFunction();
- const TargetMachine &TM = getTargetMachine();
- Type *RetTy = F.getReturnType();
- SmallVector<MVT, 4> CallerRetTys;
- SmallVector<MVT, 4> CalleeRetTys;
- computeLegalValueVTs(F, TM, RetTy, CallerRetTys);
- computeLegalValueVTs(F, TM, CLI.RetTy, CalleeRetTys);
- bool TypesMatch = CallerRetTys.size() == CalleeRetTys.size() &&
- std::equal(CallerRetTys.begin(), CallerRetTys.end(),
- CalleeRetTys.begin());
- if (!TypesMatch) {
- // musttail in this case would be an LLVM IR validation failure
- assert(!MustTail);
- CLI.IsTailCall = false;
- }
- } else {
+ auto NoTail = [&](const char *Msg) {
+ if (CLI.CB && CLI.CB->isMustTailCall())
+ fail(DL, DAG, Msg);
CLI.IsTailCall = false;
- if (MustTail) {
- if (CLI.IsVarArg) {
- // The return would pop the argument buffer
- fail(DL, DAG, "WebAssembly does not support varargs tail calls");
- } else {
- fail(DL, DAG, "WebAssembly 'tail-call' feature not enabled");
+ };
+
+ if (!Subtarget->hasTailCall())
+ NoTail("WebAssembly 'tail-call' feature not enabled");
+
+ // Varargs calls cannot be tail calls because the buffer is on the stack
+ if (CLI.IsVarArg)
+ NoTail("WebAssembly does not support varargs tail calls");
+
+ // Do not tail call unless caller and callee return types match
+ const Function &F = MF.getFunction();
+ const TargetMachine &TM = getTargetMachine();
+ Type *RetTy = F.getReturnType();
+ SmallVector<MVT, 4> CallerRetTys;
+ SmallVector<MVT, 4> CalleeRetTys;
+ computeLegalValueVTs(F, TM, RetTy, CallerRetTys);
+ computeLegalValueVTs(F, TM, CLI.RetTy, CalleeRetTys);
+ bool TypesMatch = CallerRetTys.size() == CalleeRetTys.size() &&
+ std::equal(CallerRetTys.begin(), CallerRetTys.end(),
+ CalleeRetTys.begin());
+ if (!TypesMatch)
+ NoTail("WebAssembly tail call requires caller and callee return types to "
+ "match");
+
+ // If pointers to local stack values are passed, we cannot tail call
+ if (CLI.CB) {
+ for (auto &Arg : CLI.CB->args()) {
+ Value *Val = Arg.get();
+ // Trace the value back through pointer operations
+ while (true) {
+ Value *Src = Val->stripPointerCastsAndAliases();
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Src))
+ Src = GEP->getPointerOperand();
+ if (Val == Src)
+ break;
+ Val = Src;
+ }
+ if (isa<AllocaInst>(Val)) {
+ NoTail(
+ "WebAssembly does not support tail calling with stack arguments");
+ break;
}
}
}
}
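The argument-tracing loop above repeatedly strips pointer casts, aliases, and GEPs until the value stops changing, then refuses the tail call if the walk bottoms out at an alloca, i.e. a pointer into the caller's frame. The fixpoint shape of that walk, sketched over a toy node type with invented names:

// Each value optionally records the value it was derived from (a cast or a
// GEP-like step); null means there is nothing further to strip.
struct TracedValue {
  TracedValue *Underlying = nullptr;
};

// Follow "underlying" links until the value no longer changes; the result is
// the object the pointer ultimately refers to.
TracedValue *traceUnderlyingObject(TracedValue *V) {
  while (true) {
    TracedValue *Src = V->Underlying ? V->Underlying : V;
    if (Src == V)
      break;
    V = Src;
  }
  return V;
}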
SmallVectorImpl<ISD::InputArg> &Ins = CLI.Ins;
- if (Ins.size() > 1)
- fail(DL, DAG, "WebAssembly doesn't support more than 1 returned value yet");
-
SmallVectorImpl<ISD::OutputArg> &Outs = CLI.Outs;
SmallVectorImpl<SDValue> &OutVals = CLI.OutVals;
@@ -717,10 +772,14 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
std::swap(OutVals[0], OutVals[1]);
}
+ bool HasSwiftSelfArg = false;
+ bool HasSwiftErrorArg = false;
unsigned NumFixedArgs = 0;
for (unsigned I = 0; I < Outs.size(); ++I) {
const ISD::OutputArg &Out = Outs[I];
SDValue &OutVal = OutVals[I];
+ HasSwiftSelfArg |= Out.Flags.isSwiftSelf();
+ HasSwiftErrorArg |= Out.Flags.isSwiftError();
if (Out.Flags.isNest())
fail(DL, DAG, "WebAssembly hasn't implemented nest arguments");
if (Out.Flags.isInAlloca())
@@ -732,13 +791,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (Out.Flags.isByVal() && Out.Flags.getByValSize() != 0) {
auto &MFI = MF.getFrameInfo();
int FI = MFI.CreateStackObject(Out.Flags.getByValSize(),
- Out.Flags.getByValAlign(),
+ Out.Flags.getNonZeroByValAlign(),
/*isSS=*/false);
SDValue SizeNode =
DAG.getConstant(Out.Flags.getByValSize(), DL, MVT::i32);
SDValue FINode = DAG.getFrameIndex(FI, getPointerTy(Layout));
Chain = DAG.getMemcpy(
- Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getByValAlign(),
+ Chain, DL, FINode, OutVal, SizeNode, Out.Flags.getNonZeroByValAlign(),
/*isVolatile*/ false, /*AlwaysInline=*/false,
/*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
OutVal = FINode;
@@ -750,6 +809,29 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
bool IsVarArg = CLI.IsVarArg;
auto PtrVT = getPointerTy(Layout);
+  // For swiftcc, emit additional swiftself and swifterror arguments if they
+  // are not already present. These additional arguments are also added to the
+  // callee signature; they are necessary so that caller and callee signatures
+  // match for indirect calls.
+ if (CallConv == CallingConv::Swift) {
+ if (!HasSwiftSelfArg) {
+ NumFixedArgs++;
+ ISD::OutputArg Arg;
+ Arg.Flags.setSwiftSelf();
+ CLI.Outs.push_back(Arg);
+ SDValue ArgVal = DAG.getUNDEF(PtrVT);
+ CLI.OutVals.push_back(ArgVal);
+ }
+ if (!HasSwiftErrorArg) {
+ NumFixedArgs++;
+ ISD::OutputArg Arg;
+ Arg.Flags.setSwiftError();
+ CLI.Outs.push_back(Arg);
+ SDValue ArgVal = DAG.getUNDEF(PtrVT);
+ CLI.OutVals.push_back(ArgVal);
+ }
+ }
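The intent of the block above is signature padding: a swiftcc caller and callee must agree on the presence of the swiftself and swifterror slots even when the source-level call omits them, otherwise indirect call signatures would not match. A standalone sketch of the padding over an illustrative argument-kind list:

#include <algorithm>
#include <vector>

enum class ArgKind { Normal, SwiftSelf, SwiftError };

// Ensure the argument list carries both swift-specific slots, appending
// placeholder entries when the call site did not provide them.
void padSwiftSignature(std::vector<ArgKind> &Args) {
  auto Has = [&](ArgKind K) {
    return std::find(Args.begin(), Args.end(), K) != Args.end();
  };
  if (!Has(ArgKind::SwiftSelf))
    Args.push_back(ArgKind::SwiftSelf);
  if (!Has(ArgKind::SwiftError))
    Args.push_back(ArgKind::SwiftError);
}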
+
// Analyze operands of the call, assigning locations to each operand.
SmallVector<CCValAssign, 16> ArgLocs;
CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
@@ -763,10 +845,10 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
EVT VT = Arg.getValueType();
assert(VT != MVT::iPTR && "Legalized args should be concrete");
Type *Ty = VT.getTypeForEVT(*DAG.getContext());
- unsigned Align = std::max(Out.Flags.getOrigAlign(),
- Layout.getABITypeAlignment(Ty));
- unsigned Offset = CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty),
- Align);
+ Align Alignment =
+ std::max(Out.Flags.getNonZeroOrigAlign(), Layout.getABITypeAlign(Ty));
+ unsigned Offset =
+ CCInfo.AllocateStack(Layout.getTypeAllocSize(Ty), Alignment);
CCInfo.addLoc(CCValAssign::getMem(ArgLocs.size(), VT.getSimpleVT(),
Offset, VT.getSimpleVT(),
CCValAssign::Full));
@@ -838,7 +920,7 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
if (In.Flags.isInConsecutiveRegsLast())
fail(DL, DAG,
"WebAssembly hasn't implemented cons regs last return values");
- // Ignore In.getOrigAlign() because all our arguments are passed in
+ // Ignore In.getNonZeroOrigAlign() because all our arguments are passed in
// registers.
InTys.push_back(In.VT);
}
@@ -851,17 +933,13 @@ WebAssemblyTargetLowering::LowerCall(CallLoweringInfo &CLI,
InTys.push_back(MVT::Other);
SDVTList InTyList = DAG.getVTList(InTys);
- SDValue Res =
- DAG.getNode(Ins.empty() ? WebAssemblyISD::CALL0 : WebAssemblyISD::CALL1,
- DL, InTyList, Ops);
- if (Ins.empty()) {
- Chain = Res;
- } else {
- InVals.push_back(Res);
- Chain = Res.getValue(1);
- }
+ SDValue Res = DAG.getNode(WebAssemblyISD::CALL, DL, InTyList, Ops);
- return Chain;
+ for (size_t I = 0; I < Ins.size(); ++I)
+ InVals.push_back(Res.getValue(I));
+
+ // Return the chain
+ return Res.getValue(Ins.size());
}
bool WebAssemblyTargetLowering::CanLowerReturn(
@@ -916,7 +994,11 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
// of the incoming values before they're represented by virtual registers.
MF.getRegInfo().addLiveIn(WebAssembly::ARGUMENTS);
+ bool HasSwiftErrorArg = false;
+ bool HasSwiftSelfArg = false;
for (const ISD::InputArg &In : Ins) {
+ HasSwiftSelfArg |= In.Flags.isSwiftSelf();
+ HasSwiftErrorArg |= In.Flags.isSwiftError();
if (In.Flags.isInAlloca())
fail(DL, DAG, "WebAssembly hasn't implemented inalloca arguments");
if (In.Flags.isNest())
@@ -925,7 +1007,7 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
fail(DL, DAG, "WebAssembly hasn't implemented cons regs arguments");
if (In.Flags.isInConsecutiveRegsLast())
fail(DL, DAG, "WebAssembly hasn't implemented cons regs last arguments");
- // Ignore In.getOrigAlign() because all our arguments are passed in
+ // Ignore In.getNonZeroOrigAlign() because all our arguments are passed in
// registers.
InVals.push_back(In.Used ? DAG.getNode(WebAssemblyISD::ARGUMENT, DL, In.VT,
DAG.getTargetConstant(InVals.size(),
@@ -936,6 +1018,19 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
MFI->addParam(In.VT);
}
+  // For swiftcc, emit additional swiftself and swifterror arguments if they
+  // are not already present. These additional arguments are also added to the
+  // callee signature; they are necessary so that caller and callee signatures
+  // match for indirect calls.
+ auto PtrVT = getPointerTy(MF.getDataLayout());
+ if (CallConv == CallingConv::Swift) {
+ if (!HasSwiftSelfArg) {
+ MFI->addParam(PtrVT);
+ }
+ if (!HasSwiftErrorArg) {
+ MFI->addParam(PtrVT);
+ }
+ }
// Varargs are copied into a buffer allocated by the caller, and a pointer to
// the buffer is passed as an argument.
if (IsVarArg) {
@@ -953,8 +1048,8 @@ SDValue WebAssemblyTargetLowering::LowerFormalArguments(
// Record the number and types of arguments and results.
SmallVector<MVT, 4> Params;
SmallVector<MVT, 4> Results;
- computeSignatureVTs(MF.getFunction().getFunctionType(), MF.getFunction(),
- DAG.getTarget(), Params, Results);
+ computeSignatureVTs(MF.getFunction().getFunctionType(), &MF.getFunction(),
+ MF.getFunction(), DAG.getTarget(), Params, Results);
for (MVT VT : Results)
MFI->addResult(VT);
// TODO: Use signatures in WebAssemblyMachineFunctionInfo too and unify
@@ -1190,11 +1285,10 @@ SDValue WebAssemblyTargetLowering::LowerBR_JT(SDValue Op,
for (auto MBB : MBBs)
Ops.push_back(DAG.getBasicBlock(MBB));
- // TODO: For now, we just pick something arbitrary for a default case for now.
- // We really want to sniff out the guard and put in the real default case (and
- // delete the guard).
- Ops.push_back(DAG.getBasicBlock(MBBs[0]));
-
+ // Add the first MBB as a dummy default target for now. This will be replaced
+ // with the proper default target (and the preceding range check eliminated)
+ // if possible by WebAssemblyFixBrTableDefaults.
+ Ops.push_back(DAG.getBasicBlock(*MBBs.begin()));
return DAG.getNode(WebAssemblyISD::BR_TABLE, DL, MVT::Other, Ops);
}
@@ -1262,6 +1356,24 @@ SDValue WebAssemblyTargetLowering::LowerIntrinsic(SDValue Op,
Op.getOperand(3) // thrown value
});
}
+
+ case Intrinsic::wasm_shuffle: {
+ // Drop in-chain and replace undefs, but otherwise pass through unchanged
+ SDValue Ops[18];
+ size_t OpIdx = 0;
+ Ops[OpIdx++] = Op.getOperand(1);
+ Ops[OpIdx++] = Op.getOperand(2);
+ while (OpIdx < 18) {
+ const SDValue &MaskIdx = Op.getOperand(OpIdx + 1);
+ if (MaskIdx.isUndef() ||
+ cast<ConstantSDNode>(MaskIdx.getNode())->getZExtValue() >= 32) {
+ Ops[OpIdx++] = DAG.getConstant(0, DL, MVT::i32);
+ } else {
+ Ops[OpIdx++] = MaskIdx;
+ }
+ }
+ return DAG.getNode(WebAssemblyISD::SHUFFLE, DL, Op.getValueType(), Ops);
+ }
}
}
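A standalone sketch (plain C++, not the SelectionDAG API) of the mask normalization performed by the wasm_shuffle lowering above: undef or out-of-range (>= 32) lane indices become 0, so the resulting node always carries 16 valid immediates. std::optional stands in for an undef operand.

// Standalone sketch of the shuffle-mask normalization above.
#include <array>
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <optional>

std::array<uint8_t, 16>
normalizeShuffleMask(const std::array<std::optional<uint8_t>, 16> &Mask) {
  std::array<uint8_t, 16> Out{};
  for (size_t I = 0; I < Mask.size(); ++I) {
    // std::nullopt models an undef mask operand.
    if (!Mask[I] || *Mask[I] >= 32)
      Out[I] = 0;
    else
      Out[I] = *Mask[I];
  }
  return Out;
}

int main() {
  std::array<std::optional<uint8_t>, 16> Mask{};
  Mask[0] = 31;
  Mask[1] = std::nullopt; // undef
  Mask[2] = 40;           // out of range
  auto Norm = normalizeShuffleMask(Mask);
  assert(Norm[0] == 31 && Norm[1] == 0 && Norm[2] == 0);
}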
@@ -1270,39 +1382,42 @@ WebAssemblyTargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
SelectionDAG &DAG) const {
SDLoc DL(Op);
// If sign extension operations are disabled, allow sext_inreg only if operand
- // is a vector extract. SIMD does not depend on sign extension operations, but
- // allowing sext_inreg in this context lets us have simple patterns to select
- // extract_lane_s instructions. Expanding sext_inreg everywhere would be
- // simpler in this file, but would necessitate large and brittle patterns to
- // undo the expansion and select extract_lane_s instructions.
+ // is a vector extract of an i8 or i16 lane. SIMD does not depend on sign
+ // extension operations, but allowing sext_inreg in this context lets us have
+ // simple patterns to select extract_lane_s instructions. Expanding sext_inreg
+ // everywhere would be simpler in this file, but would necessitate large and
+ // brittle patterns to undo the expansion and select extract_lane_s
+ // instructions.
assert(!Subtarget->hasSignExt() && Subtarget->hasSIMD128());
- if (Op.getOperand(0).getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
- const SDValue &Extract = Op.getOperand(0);
- MVT VecT = Extract.getOperand(0).getSimpleValueType();
- MVT ExtractedLaneT = static_cast<VTSDNode *>(Op.getOperand(1).getNode())
- ->getVT()
- .getSimpleVT();
- MVT ExtractedVecT =
- MVT::getVectorVT(ExtractedLaneT, 128 / ExtractedLaneT.getSizeInBits());
- if (ExtractedVecT == VecT)
- return Op;
- // Bitcast vector to appropriate type to ensure ISel pattern coverage
- const SDValue &Index = Extract.getOperand(1);
- unsigned IndexVal =
- static_cast<ConstantSDNode *>(Index.getNode())->getZExtValue();
- unsigned Scale =
- ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements();
- assert(Scale > 1);
- SDValue NewIndex =
- DAG.getConstant(IndexVal * Scale, DL, Index.getValueType());
- SDValue NewExtract = DAG.getNode(
- ISD::EXTRACT_VECTOR_ELT, DL, Extract.getValueType(),
- DAG.getBitcast(ExtractedVecT, Extract.getOperand(0)), NewIndex);
- return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(),
- NewExtract, Op.getOperand(1));
- }
- // Otherwise expand
- return SDValue();
+ if (Op.getOperand(0).getOpcode() != ISD::EXTRACT_VECTOR_ELT)
+ return SDValue();
+
+ const SDValue &Extract = Op.getOperand(0);
+ MVT VecT = Extract.getOperand(0).getSimpleValueType();
+ if (VecT.getVectorElementType().getSizeInBits() > 32)
+ return SDValue();
+ MVT ExtractedLaneT =
+ cast<VTSDNode>(Op.getOperand(1).getNode())->getVT().getSimpleVT();
+ MVT ExtractedVecT =
+ MVT::getVectorVT(ExtractedLaneT, 128 / ExtractedLaneT.getSizeInBits());
+ if (ExtractedVecT == VecT)
+ return Op;
+
+ // Bitcast vector to appropriate type to ensure ISel pattern coverage
+ const SDNode *Index = Extract.getOperand(1).getNode();
+ if (!isa<ConstantSDNode>(Index))
+ return SDValue();
+ unsigned IndexVal = cast<ConstantSDNode>(Index)->getZExtValue();
+ unsigned Scale =
+ ExtractedVecT.getVectorNumElements() / VecT.getVectorNumElements();
+ assert(Scale > 1);
+ SDValue NewIndex =
+ DAG.getConstant(IndexVal * Scale, DL, Index->getValueType(0));
+ SDValue NewExtract = DAG.getNode(
+ ISD::EXTRACT_VECTOR_ELT, DL, Extract.getValueType(),
+ DAG.getBitcast(ExtractedVecT, Extract.getOperand(0)), NewIndex);
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, Op.getValueType(), NewExtract,
+ Op.getOperand(1));
}
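A standalone sketch (plain C++, not the SelectionDAG API) of the lane-index arithmetic used above when the extract is re-expressed on a narrower-element vector so an extract_lane_s pattern can match: the index scales by the ratio of lane counts, assuming little-endian lane order as on WebAssembly.

// Standalone sketch of the IndexVal * Scale computation above.
#include <cassert>

unsigned scaleLaneIndex(unsigned IndexVal, unsigned SrcLanes,
                        unsigned DstLanes) {
  assert(DstLanes % SrcLanes == 0 && "bitcast must preserve total width");
  unsigned Scale = DstLanes / SrcLanes;
  assert(Scale > 1 && "only meaningful when narrowing the element type");
  return IndexVal * Scale;
}

int main() {
  // v4i32 bitcast to v8i16: lane 1 maps to lane 1 * (8 / 4) = 2.
  assert(scaleLaneIndex(1, 4, 8) == 2);
  // Lanes wider than 32 bits are rejected earlier in the lowering, so they
  // never reach this computation.
  return 0;
}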
SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
@@ -1311,7 +1426,7 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
const EVT VecT = Op.getValueType();
const EVT LaneT = Op.getOperand(0).getValueType();
const size_t Lanes = Op.getNumOperands();
- bool CanSwizzle = Subtarget->hasUnimplementedSIMD128() && VecT == MVT::v16i8;
+ bool CanSwizzle = VecT == MVT::v16i8;
// BUILD_VECTORs are lowered to the instruction that initializes the highest
// possible number of lanes at once followed by a sequence of replace_lane
@@ -1410,38 +1525,37 @@ SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,
// original instruction
std::function<bool(size_t, const SDValue &)> IsLaneConstructed;
SDValue Result;
- if (Subtarget->hasUnimplementedSIMD128()) {
- // Prefer swizzles over vector consts over splats
- if (NumSwizzleLanes >= NumSplatLanes &&
- NumSwizzleLanes >= NumConstantLanes) {
- Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc,
- SwizzleIndices);
- auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices);
- IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) {
- return Swizzled == GetSwizzleSrcs(I, Lane);
- };
- } else if (NumConstantLanes >= NumSplatLanes) {
- SmallVector<SDValue, 16> ConstLanes;
- for (const SDValue &Lane : Op->op_values()) {
- if (IsConstant(Lane)) {
- ConstLanes.push_back(Lane);
- } else if (LaneT.isFloatingPoint()) {
- ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
- } else {
- ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
- }
+ // Prefer swizzles over vector consts over splats
+ if (NumSwizzleLanes >= NumSplatLanes &&
+ (!Subtarget->hasUnimplementedSIMD128() ||
+ NumSwizzleLanes >= NumConstantLanes)) {
+ Result = DAG.getNode(WebAssemblyISD::SWIZZLE, DL, VecT, SwizzleSrc,
+ SwizzleIndices);
+ auto Swizzled = std::make_pair(SwizzleSrc, SwizzleIndices);
+ IsLaneConstructed = [&, Swizzled](size_t I, const SDValue &Lane) {
+ return Swizzled == GetSwizzleSrcs(I, Lane);
+ };
+ } else if (NumConstantLanes >= NumSplatLanes &&
+ Subtarget->hasUnimplementedSIMD128()) {
+ SmallVector<SDValue, 16> ConstLanes;
+ for (const SDValue &Lane : Op->op_values()) {
+ if (IsConstant(Lane)) {
+ ConstLanes.push_back(Lane);
+ } else if (LaneT.isFloatingPoint()) {
+ ConstLanes.push_back(DAG.getConstantFP(0, DL, LaneT));
+ } else {
+ ConstLanes.push_back(DAG.getConstant(0, DL, LaneT));
}
- Result = DAG.getBuildVector(VecT, DL, ConstLanes);
- IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
- return IsConstant(Lane);
- };
}
+ Result = DAG.getBuildVector(VecT, DL, ConstLanes);
+ IsLaneConstructed = [&](size_t _, const SDValue &Lane) {
+ return IsConstant(Lane);
+ };
}
if (!Result) {
// Use a splat, but possibly a load_splat
LoadSDNode *SplattedLoad;
- if (Subtarget->hasUnimplementedSIMD128() &&
- (SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) &&
+ if ((SplattedLoad = dyn_cast<LoadSDNode>(SplatValue)) &&
SplattedLoad->getMemoryVT() == VecT.getVectorElementType()) {
Result = DAG.getMemIntrinsicNode(
WebAssemblyISD::LOAD_SPLAT, DL, DAG.getVTList(VecT),
@@ -1502,7 +1616,6 @@ SDValue WebAssemblyTargetLowering::LowerSETCC(SDValue Op,
// expanding all i64x2 SETCC nodes, but that seems to expand f64x2 SETCC nodes
// (which return i64x2 results) as well. So instead we manually unroll i64x2
// comparisons here.
- assert(Subtarget->hasUnimplementedSIMD128());
assert(Op->getOperand(0)->getSimpleValueType(0) == MVT::v2i64);
SmallVector<SDValue, 2> LHS, RHS;
DAG.ExtractVectorElements(Op->getOperand(0), LHS);
@@ -1536,22 +1649,25 @@ static SDValue unrollVectorShift(SDValue Op, SelectionDAG &DAG) {
return DAG.UnrollVectorOp(Op.getNode());
// Otherwise mask the shift value to get proper semantics from 32-bit shift
SDLoc DL(Op);
- SDValue ShiftVal = Op.getOperand(1);
- uint64_t MaskVal = LaneT.getSizeInBits() - 1;
- SDValue MaskedShiftVal = DAG.getNode(
- ISD::AND, // mask opcode
- DL, ShiftVal.getValueType(), // masked value type
- ShiftVal, // original shift value operand
- DAG.getConstant(MaskVal, DL, ShiftVal.getValueType()) // mask operand
- );
-
- return DAG.UnrollVectorOp(
- DAG.getNode(Op.getOpcode(), // original shift opcode
- DL, Op.getValueType(), // original return type
- Op.getOperand(0), // original vector operand,
- MaskedShiftVal // new masked shift value operand
- )
- .getNode());
+ size_t NumLanes = Op.getSimpleValueType().getVectorNumElements();
+ SDValue Mask = DAG.getConstant(LaneT.getSizeInBits() - 1, DL, MVT::i32);
+ unsigned ShiftOpcode = Op.getOpcode();
+ SmallVector<SDValue, 16> ShiftedElements;
+ DAG.ExtractVectorElements(Op.getOperand(0), ShiftedElements, 0, 0, MVT::i32);
+ SmallVector<SDValue, 16> ShiftElements;
+ DAG.ExtractVectorElements(Op.getOperand(1), ShiftElements, 0, 0, MVT::i32);
+ SmallVector<SDValue, 16> UnrolledOps;
+ for (size_t i = 0; i < NumLanes; ++i) {
+ SDValue MaskedShiftValue =
+ DAG.getNode(ISD::AND, DL, MVT::i32, ShiftElements[i], Mask);
+ SDValue ShiftedValue = ShiftedElements[i];
+ if (ShiftOpcode == ISD::SRA)
+ ShiftedValue = DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, MVT::i32,
+ ShiftedValue, DAG.getValueType(LaneT));
+ UnrolledOps.push_back(
+ DAG.getNode(ShiftOpcode, DL, MVT::i32, ShiftedValue, MaskedShiftValue));
+ }
+ return DAG.getBuildVector(Op.getValueType(), DL, UnrolledOps);
}
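A standalone sketch (plain C++, not the SelectionDAG API) of the per-lane semantics the unrolled lowering above reproduces: the shift amount is masked to (lane bits - 1), and arithmetic right shifts sign-extend the lane into 32 bits first. i8 lanes are used here as the example.

// Standalone sketch of the unrolled per-lane arithmetic right shift above.
#include <cassert>
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<int32_t> unrolledShrS(const std::vector<int8_t> &Lanes,
                                  const std::vector<uint32_t> &Amounts) {
  std::vector<int32_t> Out;
  for (size_t I = 0; I < Lanes.size(); ++I) {
    uint32_t Masked = Amounts[I] & 7;   // LaneT.getSizeInBits() - 1 for i8
    int32_t Extended = Lanes[I];        // sign_extend_inreg for SRA
    Out.push_back(Extended >> Masked);  // 32-bit shift per lane
  }
  return Out;
}

int main() {
  // A shift amount of 9 behaves like 1 after masking, as in the lowering.
  auto R = unrolledShrS({-128, 64}, {9, 1});
  assert(R[0] == -64 && R[1] == 32);
}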
SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
@@ -1561,19 +1677,13 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
// Only manually lower vector shifts
assert(Op.getSimpleValueType().isVector());
- // Unroll non-splat vector shifts
- BuildVectorSDNode *ShiftVec;
- SDValue SplatVal;
- if (!(ShiftVec = dyn_cast<BuildVectorSDNode>(Op.getOperand(1).getNode())) ||
- !(SplatVal = ShiftVec->getSplatValue()))
+ auto ShiftVal = DAG.getSplatValue(Op.getOperand(1));
+ if (!ShiftVal)
return unrollVectorShift(Op, DAG);
- // All splats except i64x2 const splats are handled by patterns
- auto *SplatConst = dyn_cast<ConstantSDNode>(SplatVal);
- if (!SplatConst || Op.getSimpleValueType() != MVT::v2i64)
- return Op;
+ // Use anyext because none of the high bits can affect the shift
+ ShiftVal = DAG.getAnyExtOrTrunc(ShiftVal, DL, MVT::i32);
- // i64x2 const splats are custom lowered to avoid unnecessary wraps
unsigned Opcode;
switch (Op.getOpcode()) {
case ISD::SHL:
@@ -1588,11 +1698,45 @@ SDValue WebAssemblyTargetLowering::LowerShift(SDValue Op,
default:
llvm_unreachable("unexpected opcode");
}
- APInt Shift = SplatConst->getAPIntValue().zextOrTrunc(32);
- return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0),
- DAG.getConstant(Shift, DL, MVT::i32));
+
+ return DAG.getNode(Opcode, DL, Op.getValueType(), Op.getOperand(0), ShiftVal);
}
//===----------------------------------------------------------------------===//
-// WebAssembly Optimization Hooks
+// Custom DAG combine hooks
//===----------------------------------------------------------------------===//
+static SDValue
+performVECTOR_SHUFFLECombine(SDNode *N, TargetLowering::DAGCombinerInfo &DCI) {
+ auto &DAG = DCI.DAG;
+ auto Shuffle = cast<ShuffleVectorSDNode>(N);
+
+ // Hoist vector bitcasts that don't change the number of lanes out of unary
+ // shuffles, where they are less likely to get in the way of other combines.
+ // (shuffle (vNxT1 (bitcast (vNxT0 x))), undef, mask) ->
+ // (vNxT1 (bitcast (vNxT0 (shuffle x, undef, mask))))
+ SDValue Bitcast = N->getOperand(0);
+ if (Bitcast.getOpcode() != ISD::BITCAST)
+ return SDValue();
+ if (!N->getOperand(1).isUndef())
+ return SDValue();
+ SDValue CastOp = Bitcast.getOperand(0);
+ MVT SrcType = CastOp.getSimpleValueType();
+ MVT DstType = Bitcast.getSimpleValueType();
+ if (!SrcType.is128BitVector() ||
+ SrcType.getVectorNumElements() != DstType.getVectorNumElements())
+ return SDValue();
+ SDValue NewShuffle = DAG.getVectorShuffle(
+ SrcType, SDLoc(N), CastOp, DAG.getUNDEF(SrcType), Shuffle->getMask());
+ return DAG.getBitcast(DstType, NewShuffle);
+}
+
+SDValue
+WebAssemblyTargetLowering::PerformDAGCombine(SDNode *N,
+ DAGCombinerInfo &DCI) const {
+ switch (N->getOpcode()) {
+ default:
+ return SDValue();
+ case ISD::VECTOR_SHUFFLE:
+ return performVECTOR_SHUFFLECombine(N, DCI);
+ }
+}
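A standalone sketch (plain C++ arrays, not the SelectionDAG API) of why the combine above is sound: when a bitcast keeps the number of lanes, applying a unary shuffle before or after the reinterpretation produces the same bytes. Here 4 x uint32 reinterpreted as 4 x float stands in for the vNxT0/vNxT1 pair.

// Standalone sketch: shuffle(bitcast(x)) == bitcast(shuffle(x)) when the
// bitcast preserves the lane count.
#include <array>
#include <cassert>
#include <cstdint>
#include <cstring>

template <typename T>
std::array<T, 4> shuffle(const std::array<T, 4> &V,
                         const std::array<int, 4> &Mask) {
  std::array<T, 4> Out{};
  for (int I = 0; I < 4; ++I)
    Out[I] = V[Mask[I]];
  return Out;
}

int main() {
  std::array<uint32_t, 4> Ints{0x3f800000u, 0x40000000u, 0x40400000u,
                               0x40800000u};
  std::array<int, 4> Mask{3, 2, 1, 0};

  // shuffle(bitcast(x)): reinterpret first, then shuffle.
  std::array<float, 4> Floats{};
  std::memcpy(Floats.data(), Ints.data(), sizeof(Ints));
  auto A = shuffle(Floats, Mask);

  // bitcast(shuffle(x)): shuffle first, then reinterpret.
  auto ShuffledInts = shuffle(Ints, Mask);
  std::array<float, 4> B{};
  std::memcpy(B.data(), ShuffledInts.data(), sizeof(ShuffledInts));

  assert(std::memcmp(A.data(), B.data(), sizeof(A)) == 0);
}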
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
index 58e088a0ba50..b8e612377529 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.h
@@ -39,7 +39,6 @@ enum NodeType : unsigned {
} // end namespace WebAssemblyISD
class WebAssemblySubtarget;
-class WebAssemblyTargetMachine;
class WebAssemblyTargetLowering final : public TargetLowering {
public:
@@ -119,6 +118,11 @@ private:
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerAccessVectorElement(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerShift(SDValue Op, SelectionDAG &DAG) const;
+
+ // Custom DAG combine hooks
+ SDValue
+ PerformDAGCombine(SDNode *N,
+ TargetLowering::DAGCombinerInfo &DCI) const override;
};
namespace WebAssembly {
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
index a9a99d38f9f1..256b77e33db9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrAtomics.td
@@ -13,10 +13,11 @@
let UseNamedOperandTable = 1 in
multiclass ATOMIC_I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
- list<dag> pattern_r, string asmstr_r = "",
- string asmstr_s = "", bits<32> atomic_op = -1> {
+ list<dag> pattern_r, string asmstr_r,
+ string asmstr_s, bits<32> atomic_op,
+ string is64 = "false"> {
defm "" : I<oops_r, iops_r, oops_s, iops_s, pattern_r, asmstr_r, asmstr_s,
- !or(0xfe00, !and(0xff, atomic_op))>,
+ !or(0xfe00, !and(0xff, atomic_op)), is64>,
Requires<[HasAtomics]>;
}
@@ -32,85 +33,166 @@ multiclass ATOMIC_NRI<dag oops, dag iops, list<dag> pattern, string asmstr = "",
//===----------------------------------------------------------------------===//
let hasSideEffects = 1 in {
-defm ATOMIC_NOTIFY :
+defm ATOMIC_NOTIFY_A32 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$count),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
"atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
- "atomic.notify \t${off}${p2align}", 0x00>;
+ "atomic.notify \t${off}${p2align}", 0x00, "false">;
+defm ATOMIC_NOTIFY_A64 :
+ ATOMIC_I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr, I32:$count),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "atomic.notify \t$dst, ${off}(${addr})${p2align}, $count",
+ "atomic.notify \t${off}${p2align}", 0x00, "true">;
let mayLoad = 1 in {
-defm ATOMIC_WAIT_I32 :
+defm ATOMIC_WAIT_I32_A32 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, I32:$exp,
I64:$timeout),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
"i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i32.atomic.wait \t${off}${p2align}", 0x01>;
-defm ATOMIC_WAIT_I64 :
+ "i32.atomic.wait \t${off}${p2align}", 0x01, "false">;
+defm ATOMIC_WAIT_I32_A64 :
+ ATOMIC_I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr, I32:$exp,
+ I64:$timeout),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "i32.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "i32.atomic.wait \t${off}${p2align}", 0x01, "true">;
+defm ATOMIC_WAIT_I64_A32 :
ATOMIC_I<(outs I32:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, I64:$exp,
I64:$timeout),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
"i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
- "i64.atomic.wait \t${off}${p2align}", 0x02>;
+ "i64.atomic.wait \t${off}${p2align}", 0x02, "false">;
+defm ATOMIC_WAIT_I64_A64 :
+ ATOMIC_I<(outs I32:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr, I64:$exp,
+ I64:$timeout),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "i64.atomic.wait \t$dst, ${off}(${addr})${p2align}, $exp, $timeout",
+ "i64.atomic.wait \t${off}${p2align}", 0x02, "true">;
} // mayLoad = 1
} // hasSideEffects = 1
let Predicates = [HasAtomics] in {
// Select notifys with no constant offset.
-def NotifyPatNoOffset :
+def NotifyPatNoOffset_A32 :
Pat<(i32 (int_wasm_atomic_notify I32:$addr, I32:$count)),
- (ATOMIC_NOTIFY 0, 0, I32:$addr, I32:$count)>;
+ (ATOMIC_NOTIFY_A32 0, 0, I32:$addr, I32:$count)>,
+ Requires<[HasAddr32]>;
+def NotifyPatNoOffset_A64 :
+ Pat<(i32 (int_wasm_atomic_notify I64:$addr, I32:$count)),
+ (ATOMIC_NOTIFY_A64 0, 0, I64:$addr, I32:$count)>,
+ Requires<[HasAddr64]>;
// Select notifys with a constant offset.
// Pattern with address + immediate offset
-class NotifyPatImmOff<PatFrag operand> :
- Pat<(i32 (int_wasm_atomic_notify (operand I32:$addr, imm:$off), I32:$count)),
- (ATOMIC_NOTIFY 0, imm:$off, I32:$addr, I32:$count)>;
-def : NotifyPatImmOff<regPlusImm>;
-def : NotifyPatImmOff<or_is_add>;
+multiclass NotifyPatImmOff<PatFrag operand, string inst> {
+ def : Pat<(i32 (int_wasm_atomic_notify (operand I32:$addr, imm:$off),
+ I32:$count)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, I32:$count)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(i32 (int_wasm_atomic_notify (operand I64:$addr, imm:$off),
+ I32:$count)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, I32:$count)>,
+ Requires<[HasAddr64]>;
+}
+defm : NotifyPatImmOff<regPlusImm, "ATOMIC_NOTIFY">;
+defm : NotifyPatImmOff<or_is_add, "ATOMIC_NOTIFY">;
// Select notifys with just a constant offset.
-def NotifyPatOffsetOnly :
+def NotifyPatOffsetOnly_A32 :
+ Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)),
+ (ATOMIC_NOTIFY_A32 0, imm:$off, (CONST_I32 0), I32:$count)>,
+ Requires<[HasAddr32]>;
+def NotifyPatOffsetOnly_A64 :
Pat<(i32 (int_wasm_atomic_notify imm:$off, I32:$count)),
- (ATOMIC_NOTIFY 0, imm:$off, (CONST_I32 0), I32:$count)>;
+ (ATOMIC_NOTIFY_A64 0, imm:$off, (CONST_I64 0), I32:$count)>,
+ Requires<[HasAddr64]>;
-def NotifyPatGlobalAddrOffOnly :
+def NotifyPatGlobalAddrOffOnly_A32 :
Pat<(i32 (int_wasm_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
I32:$count)),
- (ATOMIC_NOTIFY 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>;
+ (ATOMIC_NOTIFY_A32 0, tglobaladdr:$off, (CONST_I32 0), I32:$count)>,
+ Requires<[HasAddr32]>;
+def NotifyPatGlobalAddrOffOnly_A64 :
+ Pat<(i32 (int_wasm_atomic_notify (WebAssemblywrapper tglobaladdr:$off),
+ I32:$count)),
+ (ATOMIC_NOTIFY_A64 0, tglobaladdr:$off, (CONST_I64 0), I32:$count)>,
+ Requires<[HasAddr64]>;
// Select waits with no constant offset.
-class WaitPatNoOffset<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)),
- (inst 0, 0, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatNoOffset<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatNoOffset<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
+multiclass WaitPatNoOffset<ValueType ty, Intrinsic kind,
+ string inst> {
+ def : Pat<(i32 (kind I32:$addr, ty:$exp, I64:$timeout)),
+ (!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$exp, I64:$timeout)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(i32 (kind I64:$addr, ty:$exp, I64:$timeout)),
+ (!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$exp, I64:$timeout)>,
+ Requires<[HasAddr64]>;
+}
+defm : WaitPatNoOffset<i32, int_wasm_atomic_wait_i32, "ATOMIC_WAIT_I32">;
+defm : WaitPatNoOffset<i64, int_wasm_atomic_wait_i64, "ATOMIC_WAIT_I64">;
// Select waits with a constant offset.
// Pattern with address + immediate offset
-class WaitPatImmOff<ValueType ty, Intrinsic kind, PatFrag operand, NI inst> :
- Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)),
- (inst 0, imm:$off, I32:$addr, ty:$exp, I64:$timeout)>;
-def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, regPlusImm, ATOMIC_WAIT_I32>;
-def : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add, ATOMIC_WAIT_I32>;
-def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm, ATOMIC_WAIT_I64>;
-def : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add, ATOMIC_WAIT_I64>;
-
-// Select wait_i32, ATOMIC_WAIT_I32s with just a constant offset.
-class WaitPatOffsetOnly<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
- (inst 0, imm:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
-def : WaitPatOffsetOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatOffsetOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
-
-class WaitPatGlobalAddrOffOnly<ValueType ty, Intrinsic kind, NI inst> :
- Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, I64:$timeout)),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, I64:$timeout)>;
-def : WaitPatGlobalAddrOffOnly<i32, int_wasm_atomic_wait_i32, ATOMIC_WAIT_I32>;
-def : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64, ATOMIC_WAIT_I64>;
+multiclass WaitPatImmOff<ValueType ty, Intrinsic kind, PatFrag operand,
+ string inst> {
+ def : Pat<(i32 (kind (operand I32:$addr, imm:$off), ty:$exp, I64:$timeout)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$exp,
+ I64:$timeout)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(i32 (kind (operand I64:$addr, imm:$off), ty:$exp, I64:$timeout)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$exp,
+ I64:$timeout)>,
+ Requires<[HasAddr64]>;
+}
+defm : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, regPlusImm,
+ "ATOMIC_WAIT_I32">;
+defm : WaitPatImmOff<i32, int_wasm_atomic_wait_i32, or_is_add,
+ "ATOMIC_WAIT_I32">;
+defm : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, regPlusImm,
+ "ATOMIC_WAIT_I64">;
+defm : WaitPatImmOff<i64, int_wasm_atomic_wait_i64, or_is_add,
+ "ATOMIC_WAIT_I64">;
+
+// Select waits with just a constant offset.
+multiclass WaitPatOffsetOnly<ValueType ty, Intrinsic kind, string inst> {
+ def : Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$exp,
+ I64:$timeout)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(i32 (kind imm:$off, ty:$exp, I64:$timeout)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$exp,
+ I64:$timeout)>,
+ Requires<[HasAddr64]>;
+}
+defm : WaitPatOffsetOnly<i32, int_wasm_atomic_wait_i32, "ATOMIC_WAIT_I32">;
+defm : WaitPatOffsetOnly<i64, int_wasm_atomic_wait_i64, "ATOMIC_WAIT_I64">;
+
+multiclass WaitPatGlobalAddrOffOnly<ValueType ty, Intrinsic kind, string inst> {
+ def : Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp,
+ I64:$timeout)),
+ (!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp,
+ I64:$timeout)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(i32 (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp,
+ I64:$timeout)),
+ (!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$exp,
+ I64:$timeout)>,
+ Requires<[HasAddr64]>;
+}
+defm : WaitPatGlobalAddrOffOnly<i32, int_wasm_atomic_wait_i32,
+ "ATOMIC_WAIT_I32">;
+defm : WaitPatGlobalAddrOffOnly<i64, int_wasm_atomic_wait_i64,
+ "ATOMIC_WAIT_I64">;
} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
@@ -131,8 +213,8 @@ defm ATOMIC_FENCE : ATOMIC_NRI<(outs), (ins i8imm:$flags), [], "atomic.fence",
//===----------------------------------------------------------------------===//
multiclass AtomicLoad<WebAssemblyRegClass rc, string name, int atomic_op> {
- defm "" : WebAssemblyLoad<rc, name, !or(0xfe00, !and(0xff, atomic_op))>,
- Requires<[HasAtomics]>;
+ defm "" : WebAssemblyLoad<rc, name, !or(0xfe00, !and(0xff, atomic_op)),
+ [HasAtomics]>;
}
defm ATOMIC_LOAD_I32 : AtomicLoad<I32, "i32.atomic.load", 0x10>;
@@ -140,23 +222,23 @@ defm ATOMIC_LOAD_I64 : AtomicLoad<I64, "i64.atomic.load", 0x11>;
// Select loads with no constant offset.
let Predicates = [HasAtomics] in {
-def : LoadPatNoOffset<i32, atomic_load_32, ATOMIC_LOAD_I32>;
-def : LoadPatNoOffset<i64, atomic_load_64, ATOMIC_LOAD_I64>;
+defm : LoadPatNoOffset<i32, atomic_load_32, "ATOMIC_LOAD_I32">;
+defm : LoadPatNoOffset<i64, atomic_load_64, "ATOMIC_LOAD_I64">;
// Select loads with a constant offset.
// Pattern with address + immediate offset
-def : LoadPatImmOff<i32, atomic_load_32, regPlusImm, ATOMIC_LOAD_I32>;
-def : LoadPatImmOff<i64, atomic_load_64, regPlusImm, ATOMIC_LOAD_I64>;
-def : LoadPatImmOff<i32, atomic_load_32, or_is_add, ATOMIC_LOAD_I32>;
-def : LoadPatImmOff<i64, atomic_load_64, or_is_add, ATOMIC_LOAD_I64>;
+defm : LoadPatImmOff<i32, atomic_load_32, regPlusImm, "ATOMIC_LOAD_I32">;
+defm : LoadPatImmOff<i64, atomic_load_64, regPlusImm, "ATOMIC_LOAD_I64">;
+defm : LoadPatImmOff<i32, atomic_load_32, or_is_add, "ATOMIC_LOAD_I32">;
+defm : LoadPatImmOff<i64, atomic_load_64, or_is_add, "ATOMIC_LOAD_I64">;
// Select loads with just a constant offset.
-def : LoadPatOffsetOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
-def : LoadPatOffsetOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
+defm : LoadPatOffsetOnly<i32, atomic_load_32, "ATOMIC_LOAD_I32">;
+defm : LoadPatOffsetOnly<i64, atomic_load_64, "ATOMIC_LOAD_I64">;
-def : LoadPatGlobalAddrOffOnly<i32, atomic_load_32, ATOMIC_LOAD_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, atomic_load_64, ATOMIC_LOAD_I64>;
+defm : LoadPatGlobalAddrOffOnly<i32, atomic_load_32, "ATOMIC_LOAD_I32">;
+defm : LoadPatGlobalAddrOffOnly<i64, atomic_load_64, "ATOMIC_LOAD_I64">;
} // Predicates = [HasAtomics]
@@ -205,62 +287,62 @@ def sext_aload_16_64 :
let Predicates = [HasAtomics] in {
// Select zero-extending loads with no constant offset.
-def : LoadPatNoOffset<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
-def : LoadPatNoOffset<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
-def : LoadPatNoOffset<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatNoOffset<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatNoOffset<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
+defm : LoadPatNoOffset<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatNoOffset<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatNoOffset<i64, zext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatNoOffset<i64, zext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
+defm : LoadPatNoOffset<i64, zext_aload_32_64, "ATOMIC_LOAD32_U_I64">;
// Select sign-extending loads with no constant offset
-def : LoadPatNoOffset<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatNoOffset<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatNoOffset<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatNoOffset<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+defm : LoadPatNoOffset<i32, atomic_load_8, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatNoOffset<i32, atomic_load_16, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatNoOffset<i64, sext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatNoOffset<i64, sext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
// 32->64 sext load gets selected as i32.atomic.load, i64.extend_i32_s
// Zero-extending loads with constant offset
-def : LoadPatImmOff<i32, zext_aload_8_32, regPlusImm, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_16_32, regPlusImm, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_8_32, or_is_add, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, zext_aload_16_32, or_is_add, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i64, zext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, zext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
-def : LoadPatImmOff<i64, zext_aload_32_64, regPlusImm, ATOMIC_LOAD32_U_I64>;
-def : LoadPatImmOff<i64, zext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, zext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
-def : LoadPatImmOff<i64, zext_aload_32_64, or_is_add, ATOMIC_LOAD32_U_I64>;
+defm : LoadPatImmOff<i32, zext_aload_8_32, regPlusImm, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, zext_aload_16_32, regPlusImm, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatImmOff<i32, zext_aload_8_32, or_is_add, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, zext_aload_16_32, or_is_add, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatImmOff<i64, zext_aload_8_64, regPlusImm, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, zext_aload_16_64, regPlusImm, "ATOMIC_LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, zext_aload_32_64, regPlusImm, "ATOMIC_LOAD32_U_I64">;
+defm : LoadPatImmOff<i64, zext_aload_8_64, or_is_add, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, zext_aload_16_64, or_is_add, "ATOMIC_LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, zext_aload_32_64, or_is_add, "ATOMIC_LOAD32_U_I64">;
// Sign-extending loads with constant offset
-def : LoadPatImmOff<i32, atomic_load_8, regPlusImm, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, atomic_load_16, regPlusImm, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i32, atomic_load_8, or_is_add, ATOMIC_LOAD8_U_I32>;
-def : LoadPatImmOff<i32, atomic_load_16, or_is_add, ATOMIC_LOAD16_U_I32>;
-def : LoadPatImmOff<i64, sext_aload_8_64, regPlusImm, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, sext_aload_16_64, regPlusImm, ATOMIC_LOAD16_U_I64>;
-def : LoadPatImmOff<i64, sext_aload_8_64, or_is_add, ATOMIC_LOAD8_U_I64>;
-def : LoadPatImmOff<i64, sext_aload_16_64, or_is_add, ATOMIC_LOAD16_U_I64>;
+defm : LoadPatImmOff<i32, atomic_load_8, regPlusImm, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, atomic_load_16, regPlusImm, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatImmOff<i32, atomic_load_8, or_is_add, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, atomic_load_16, or_is_add, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatImmOff<i64, sext_aload_8_64, regPlusImm, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, sext_aload_16_64, regPlusImm, "ATOMIC_LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, sext_aload_8_64, or_is_add, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, sext_aload_16_64, or_is_add, "ATOMIC_LOAD16_U_I64">;
// No 32->64 patterns, just use i32.atomic.load and i64.extend_s/i64
// Extending loads with just a constant offset
-def : LoadPatOffsetOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
-def : LoadPatOffsetOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
-def : LoadPatOffsetOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatOffsetOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatOffsetOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
-def : LoadPatOffsetOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatOffsetOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatOffsetOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatOffsetOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-
-def : LoadPatGlobalAddrOffOnly<i32, zext_aload_8_32, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, zext_aload_16_32, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, zext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, zext_aload_16_64, ATOMIC_LOAD16_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, zext_aload_32_64, ATOMIC_LOAD32_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i32, atomic_load_8, ATOMIC_LOAD8_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, atomic_load_16, ATOMIC_LOAD16_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, sext_aload_8_64, ATOMIC_LOAD8_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
+defm : LoadPatOffsetOnly<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatOffsetOnly<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatOffsetOnly<i64, zext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatOffsetOnly<i64, zext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
+defm : LoadPatOffsetOnly<i64, zext_aload_32_64, "ATOMIC_LOAD32_U_I64">;
+defm : LoadPatOffsetOnly<i32, atomic_load_8, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatOffsetOnly<i32, atomic_load_16, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatOffsetOnly<i64, sext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatOffsetOnly<i64, sext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
+
+defm : LoadPatGlobalAddrOffOnly<i32, zext_aload_8_32, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i32, zext_aload_16_32, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i64, zext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, zext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, zext_aload_32_64, "ATOMIC_LOAD32_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i32, atomic_load_8, "ATOMIC_LOAD8_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i32, atomic_load_16, "ATOMIC_LOAD16_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i64, sext_aload_8_64, "ATOMIC_LOAD8_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, "ATOMIC_LOAD16_U_I64">;
} // Predicates = [HasAtomics]
@@ -269,8 +351,8 @@ def : LoadPatGlobalAddrOffOnly<i64, sext_aload_16_64, ATOMIC_LOAD16_U_I64>;
//===----------------------------------------------------------------------===//
multiclass AtomicStore<WebAssemblyRegClass rc, string name, int atomic_op> {
- defm "" : WebAssemblyStore<rc, name, !or(0xfe00, !and(0xff, atomic_op))>,
- Requires<[HasAtomics]>;
+ defm "" : WebAssemblyStore<rc, name, !or(0xfe00, !and(0xff, atomic_op)),
+ [HasAtomics]>;
}
defm ATOMIC_STORE_I32 : AtomicStore<I32, "i32.atomic.store", 0x17>;
@@ -284,33 +366,54 @@ defm ATOMIC_STORE_I64 : AtomicStore<I64, "i64.atomic.store", 0x18>;
let Predicates = [HasAtomics] in {
// Select stores with no constant offset.
-class AStorePatNoOffset<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind I32:$addr, ty:$val), (inst 0, 0, I32:$addr, ty:$val)>;
-def : AStorePatNoOffset<i32, atomic_store_32, ATOMIC_STORE_I32>;
-def : AStorePatNoOffset<i64, atomic_store_64, ATOMIC_STORE_I64>;
+multiclass AStorePatNoOffset<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(kind I32:$addr, ty:$val),
+ (!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind I64:$addr, ty:$val),
+ (!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$val)>,
+ Requires<[HasAddr64]>;
+}
+defm : AStorePatNoOffset<i32, atomic_store_32, "ATOMIC_STORE_I32">;
+defm : AStorePatNoOffset<i64, atomic_store_64, "ATOMIC_STORE_I64">;
// Select stores with a constant offset.
// Pattern with address + immediate offset
-class AStorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
- Pat<(kind (operand I32:$addr, imm:$off), ty:$val),
- (inst 0, imm:$off, I32:$addr, ty:$val)>;
-def : AStorePatImmOff<i32, atomic_store_32, regPlusImm, ATOMIC_STORE_I32>;
-def : AStorePatImmOff<i64, atomic_store_64, regPlusImm, ATOMIC_STORE_I64>;
-def : AStorePatImmOff<i32, atomic_store_32, or_is_add, ATOMIC_STORE_I32>;
-def : AStorePatImmOff<i64, atomic_store_64, or_is_add, ATOMIC_STORE_I64>;
+multiclass AStorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
+ string inst> {
+ def : Pat<(kind (operand I32:$addr, imm:$off), ty:$val),
+ (!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind (operand I64:$addr, imm:$off), ty:$val),
+ (!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$val)>,
+ Requires<[HasAddr64]>;
+}
+defm : AStorePatImmOff<i32, atomic_store_32, regPlusImm, "ATOMIC_STORE_I32">;
+defm : AStorePatImmOff<i64, atomic_store_64, regPlusImm, "ATOMIC_STORE_I64">;
// Select stores with just a constant offset.
-class AStorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind imm:$off, ty:$val), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
-def : AStorePatOffsetOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
-def : AStorePatOffsetOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
-
-class AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
-def : AStorePatGlobalAddrOffOnly<i32, atomic_store_32, ATOMIC_STORE_I32>;
-def : AStorePatGlobalAddrOffOnly<i64, atomic_store_64, ATOMIC_STORE_I64>;
+multiclass AStorePatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(kind imm:$off, ty:$val),
+ (!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind imm:$off, ty:$val),
+ (!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$val)>,
+ Requires<[HasAddr64]>;
+}
+defm : AStorePatOffsetOnly<i32, atomic_store_32, "ATOMIC_STORE_I32">;
+defm : AStorePatOffsetOnly<i64, atomic_store_64, "ATOMIC_STORE_I64">;
+
+multiclass AStorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
+ (!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind (WebAssemblywrapper tglobaladdr:$off), ty:$val),
+ (!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$val)>,
+ Requires<[HasAddr64]>;
+}
+defm : AStorePatGlobalAddrOffOnly<i32, atomic_store_32, "ATOMIC_STORE_I32">;
+defm : AStorePatGlobalAddrOffOnly<i64, atomic_store_64, "ATOMIC_STORE_I64">;
} // Predicates = [HasAtomics]
@@ -336,36 +439,40 @@ def trunc_astore_32_64 : trunc_astore_64<atomic_store_32>;
let Predicates = [HasAtomics] in {
// Truncating stores with no constant offset
-def : AStorePatNoOffset<i32, atomic_store_8, ATOMIC_STORE8_I32>;
-def : AStorePatNoOffset<i32, atomic_store_16, ATOMIC_STORE16_I32>;
-def : AStorePatNoOffset<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
-def : AStorePatNoOffset<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
-def : AStorePatNoOffset<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+defm : AStorePatNoOffset<i32, atomic_store_8, "ATOMIC_STORE8_I32">;
+defm : AStorePatNoOffset<i32, atomic_store_16, "ATOMIC_STORE16_I32">;
+defm : AStorePatNoOffset<i64, trunc_astore_8_64, "ATOMIC_STORE8_I64">;
+defm : AStorePatNoOffset<i64, trunc_astore_16_64, "ATOMIC_STORE16_I64">;
+defm : AStorePatNoOffset<i64, trunc_astore_32_64, "ATOMIC_STORE32_I64">;
// Truncating stores with a constant offset
-def : AStorePatImmOff<i32, atomic_store_8, regPlusImm, ATOMIC_STORE8_I32>;
-def : AStorePatImmOff<i32, atomic_store_16, regPlusImm, ATOMIC_STORE16_I32>;
-def : AStorePatImmOff<i64, trunc_astore_8_64, regPlusImm, ATOMIC_STORE8_I64>;
-def : AStorePatImmOff<i64, trunc_astore_16_64, regPlusImm, ATOMIC_STORE16_I64>;
-def : AStorePatImmOff<i64, trunc_astore_32_64, regPlusImm, ATOMIC_STORE32_I64>;
-def : AStorePatImmOff<i32, atomic_store_8, or_is_add, ATOMIC_STORE8_I32>;
-def : AStorePatImmOff<i32, atomic_store_16, or_is_add, ATOMIC_STORE16_I32>;
-def : AStorePatImmOff<i64, trunc_astore_8_64, or_is_add, ATOMIC_STORE8_I64>;
-def : AStorePatImmOff<i64, trunc_astore_16_64, or_is_add, ATOMIC_STORE16_I64>;
-def : AStorePatImmOff<i64, trunc_astore_32_64, or_is_add, ATOMIC_STORE32_I64>;
+defm : AStorePatImmOff<i32, atomic_store_8, regPlusImm, "ATOMIC_STORE8_I32">;
+defm : AStorePatImmOff<i32, atomic_store_16, regPlusImm, "ATOMIC_STORE16_I32">;
+defm : AStorePatImmOff<i64, trunc_astore_8_64, regPlusImm, "ATOMIC_STORE8_I64">;
+defm : AStorePatImmOff<i64, trunc_astore_16_64, regPlusImm,
+ "ATOMIC_STORE16_I64">;
+defm : AStorePatImmOff<i64, trunc_astore_32_64, regPlusImm,
+ "ATOMIC_STORE32_I64">;
+defm : AStorePatImmOff<i32, atomic_store_8, or_is_add, "ATOMIC_STORE8_I32">;
+defm : AStorePatImmOff<i32, atomic_store_16, or_is_add, "ATOMIC_STORE16_I32">;
+defm : AStorePatImmOff<i64, trunc_astore_8_64, or_is_add, "ATOMIC_STORE8_I64">;
+defm : AStorePatImmOff<i64, trunc_astore_16_64, or_is_add,
+ "ATOMIC_STORE16_I64">;
+defm : AStorePatImmOff<i64, trunc_astore_32_64, or_is_add,
+ "ATOMIC_STORE32_I64">;
// Truncating stores with just a constant offset
-def : AStorePatOffsetOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
-def : AStorePatOffsetOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
-def : AStorePatOffsetOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
-def : AStorePatOffsetOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
-def : AStorePatOffsetOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
-
-def : AStorePatGlobalAddrOffOnly<i32, atomic_store_8, ATOMIC_STORE8_I32>;
-def : AStorePatGlobalAddrOffOnly<i32, atomic_store_16, ATOMIC_STORE16_I32>;
-def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_8_64, ATOMIC_STORE8_I64>;
-def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_16_64, ATOMIC_STORE16_I64>;
-def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
+defm : AStorePatOffsetOnly<i32, atomic_store_8, "ATOMIC_STORE8_I32">;
+defm : AStorePatOffsetOnly<i32, atomic_store_16, "ATOMIC_STORE16_I32">;
+defm : AStorePatOffsetOnly<i64, trunc_astore_8_64, "ATOMIC_STORE8_I64">;
+defm : AStorePatOffsetOnly<i64, trunc_astore_16_64, "ATOMIC_STORE16_I64">;
+defm : AStorePatOffsetOnly<i64, trunc_astore_32_64, "ATOMIC_STORE32_I64">;
+
+defm : AStorePatGlobalAddrOffOnly<i32, atomic_store_8, "ATOMIC_STORE8_I32">;
+defm : AStorePatGlobalAddrOffOnly<i32, atomic_store_16, "ATOMIC_STORE16_I32">;
+defm : AStorePatGlobalAddrOffOnly<i64, trunc_astore_8_64, "ATOMIC_STORE8_I64">;
+defm : AStorePatGlobalAddrOffOnly<i64, trunc_astore_16_64, "ATOMIC_STORE16_I64">;
+defm : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, "ATOMIC_STORE32_I64">;
} // Predicates = [HasAtomics]
@@ -375,12 +482,18 @@ def : AStorePatGlobalAddrOffOnly<i64, trunc_astore_32_64, ATOMIC_STORE32_I64>;
multiclass WebAssemblyBinRMW<WebAssemblyRegClass rc, string name,
int atomic_op> {
- defm "" :
+ defm "_A32" :
ATOMIC_I<(outs rc:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
!strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $val"),
- !strconcat(name, "\t${off}${p2align}"), atomic_op>;
+ !strconcat(name, "\t${off}${p2align}"), atomic_op, "false">;
+ defm "_A64" :
+ ATOMIC_I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr, rc:$val),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $val"),
+ !strconcat(name, "\t${off}${p2align}"), atomic_op, "true">;
}
defm ATOMIC_RMW_ADD_I32 : WebAssemblyBinRMW<I32, "i32.atomic.rmw.add", 0x1e>;
@@ -464,56 +577,78 @@ defm ATOMIC_RMW32_U_XCHG_I64 :
WebAssemblyBinRMW<I64, "i64.atomic.rmw32.xchg_u", 0x47>;
// Select binary RMWs with no constant offset.
-class BinRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind I32:$addr, ty:$val)), (inst 0, 0, I32:$addr, ty:$val)>;
+multiclass BinRMWPatNoOffset<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind I32:$addr, ty:$val)),
+ (!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind I64:$addr, ty:$val)),
+ (!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$val)>,
+ Requires<[HasAddr64]>;
+}
// Select binary RMWs with a constant offset.
// Pattern with address + immediate offset
-class BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
- Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)),
- (inst 0, imm:$off, I32:$addr, ty:$val)>;
+multiclass BinRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
+ string inst> {
+ def : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$val)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind (operand I64:$addr, imm:$off), ty:$val)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$val)>,
+ Requires<[HasAddr64]>;
+}
// Select binary RMWs with just a constant offset.
-class BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind imm:$off, ty:$val)),
- (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
+multiclass BinRMWPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind imm:$off, ty:$val)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind imm:$off, ty:$val)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$val)>,
+ Requires<[HasAddr64]>;
+}
-class BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>;
+multiclass BinRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
+ (!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$val)),
+ (!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$val)>,
+ Requires<[HasAddr64]>;
+}
// Patterns for various addressing modes.
-multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
- NI inst_64> {
- def : BinRMWPatNoOffset<i32, rmw_32, inst_32>;
- def : BinRMWPatNoOffset<i64, rmw_64, inst_64>;
+multiclass BinRMWPattern<PatFrag rmw_32, PatFrag rmw_64, string inst_32,
+ string inst_64> {
+ defm : BinRMWPatNoOffset<i32, rmw_32, inst_32>;
+ defm : BinRMWPatNoOffset<i64, rmw_64, inst_64>;
- def : BinRMWPatImmOff<i32, rmw_32, regPlusImm, inst_32>;
- def : BinRMWPatImmOff<i64, rmw_64, regPlusImm, inst_64>;
- def : BinRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
- def : BinRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
+ defm : BinRMWPatImmOff<i32, rmw_32, regPlusImm, inst_32>;
+ defm : BinRMWPatImmOff<i64, rmw_64, regPlusImm, inst_64>;
+ defm : BinRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
+ defm : BinRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
- def : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>;
- def : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>;
+ defm : BinRMWPatOffsetOnly<i32, rmw_32, inst_32>;
+ defm : BinRMWPatOffsetOnly<i64, rmw_64, inst_64>;
- def : BinRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
- def : BinRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
+ defm : BinRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
+ defm : BinRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
}
let Predicates = [HasAtomics] in {
-defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64, ATOMIC_RMW_ADD_I32,
- ATOMIC_RMW_ADD_I64>;
-defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64, ATOMIC_RMW_SUB_I32,
- ATOMIC_RMW_SUB_I64>;
-defm : BinRMWPattern<atomic_load_and_32, atomic_load_and_64, ATOMIC_RMW_AND_I32,
- ATOMIC_RMW_AND_I64>;
-defm : BinRMWPattern<atomic_load_or_32, atomic_load_or_64, ATOMIC_RMW_OR_I32,
- ATOMIC_RMW_OR_I64>;
-defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64, ATOMIC_RMW_XOR_I32,
- ATOMIC_RMW_XOR_I64>;
-defm : BinRMWPattern<atomic_swap_32, atomic_swap_64, ATOMIC_RMW_XCHG_I32,
- ATOMIC_RMW_XCHG_I64>;
+defm : BinRMWPattern<atomic_load_add_32, atomic_load_add_64,
+ "ATOMIC_RMW_ADD_I32", "ATOMIC_RMW_ADD_I64">;
+defm : BinRMWPattern<atomic_load_sub_32, atomic_load_sub_64,
+ "ATOMIC_RMW_SUB_I32", "ATOMIC_RMW_SUB_I64">;
+defm : BinRMWPattern<atomic_load_and_32, atomic_load_and_64,
+ "ATOMIC_RMW_AND_I32", "ATOMIC_RMW_AND_I64">;
+defm : BinRMWPattern<atomic_load_or_32, atomic_load_or_64,
+ "ATOMIC_RMW_OR_I32", "ATOMIC_RMW_OR_I64">;
+defm : BinRMWPattern<atomic_load_xor_32, atomic_load_xor_64,
+ "ATOMIC_RMW_XOR_I32", "ATOMIC_RMW_XOR_I64">;
+defm : BinRMWPattern<atomic_swap_32, atomic_swap_64,
+ "ATOMIC_RMW_XCHG_I32", "ATOMIC_RMW_XCHG_I64">;
} // Predicates = [HasAtomics]
// Truncating & zero-extending binary RMW patterns.
@@ -556,87 +691,93 @@ multiclass BinRMWTruncExtPattern<
PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
NI inst8_32, NI inst16_32, NI inst8_64, NI inst16_64, NI inst32_64> {
// Truncating-extending binary RMWs with no constant offset
- def : BinRMWPatNoOffset<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatNoOffset<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatNoOffset<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatNoOffset<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatNoOffset<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+ defm : BinRMWPatNoOffset<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ defm : BinRMWPatNoOffset<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ defm : BinRMWPatNoOffset<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ defm : BinRMWPatNoOffset<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ defm : BinRMWPatNoOffset<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
- def : BinRMWPatNoOffset<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatNoOffset<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatNoOffset<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatNoOffset<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ defm : BinRMWPatNoOffset<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ defm : BinRMWPatNoOffset<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ defm : BinRMWPatNoOffset<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ defm : BinRMWPatNoOffset<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
// Truncating-extending binary RMWs with a constant offset
- def : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
- def : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
- def : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
- def : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
- def : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, regPlusImm, inst32_64>;
- def : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
- def : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
- def : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
- def : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
- def : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, or_is_add, inst32_64>;
-
- def : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
- def : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
- def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
- def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
- def : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
- def : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
- def : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
- def : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+ defm : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ defm : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, regPlusImm,
+ inst16_32>;
+ defm : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ defm : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, regPlusImm,
+ inst16_64>;
+ defm : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, regPlusImm,
+ inst32_64>;
+ defm : BinRMWPatImmOff<i32, zext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ defm : BinRMWPatImmOff<i32, zext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ defm : BinRMWPatImmOff<i64, zext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ defm : BinRMWPatImmOff<i64, zext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+ defm : BinRMWPatImmOff<i64, zext_bin_rmw_32_64<rmw_32>, or_is_add, inst32_64>;
+
+ defm : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ defm : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, regPlusImm,
+ inst16_32>;
+ defm : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ defm : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, regPlusImm,
+ inst16_64>;
+ defm : BinRMWPatImmOff<i32, sext_bin_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ defm : BinRMWPatImmOff<i32, sext_bin_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ defm : BinRMWPatImmOff<i64, sext_bin_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ defm : BinRMWPatImmOff<i64, sext_bin_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
// Truncating-extending binary RMWs with just a constant offset
- def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatOffsetOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
-
- def : BinRMWPatOffsetOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatOffsetOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatOffsetOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatOffsetOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
-
- def : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
- def : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
-
- def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
- def : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
- def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
- def : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ defm : BinRMWPatOffsetOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ defm : BinRMWPatOffsetOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ defm : BinRMWPatOffsetOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ defm : BinRMWPatOffsetOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ defm : BinRMWPatOffsetOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ defm : BinRMWPatOffsetOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ defm : BinRMWPatOffsetOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ defm : BinRMWPatOffsetOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ defm : BinRMWPatOffsetOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
+
+ defm : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ defm : BinRMWPatGlobalAddrOffOnly<i32, zext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ defm : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ defm : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_16_64<rmw_16>, inst16_64>;
+ defm : BinRMWPatGlobalAddrOffOnly<i64, zext_bin_rmw_32_64<rmw_32>, inst32_64>;
+
+ defm : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_8_32<rmw_8>, inst8_32>;
+ defm : BinRMWPatGlobalAddrOffOnly<i32, sext_bin_rmw_16_32<rmw_16>, inst16_32>;
+ defm : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_8_64<rmw_8>, inst8_64>;
+ defm : BinRMWPatGlobalAddrOffOnly<i64, sext_bin_rmw_16_64<rmw_16>, inst16_64>;
}
let Predicates = [HasAtomics] in {
defm : BinRMWTruncExtPattern<
atomic_load_add_8, atomic_load_add_16, atomic_load_add_32, atomic_load_add_64,
- ATOMIC_RMW8_U_ADD_I32, ATOMIC_RMW16_U_ADD_I32,
- ATOMIC_RMW8_U_ADD_I64, ATOMIC_RMW16_U_ADD_I64, ATOMIC_RMW32_U_ADD_I64>;
+ "ATOMIC_RMW8_U_ADD_I32", "ATOMIC_RMW16_U_ADD_I32",
+ "ATOMIC_RMW8_U_ADD_I64", "ATOMIC_RMW16_U_ADD_I64", "ATOMIC_RMW32_U_ADD_I64">;
defm : BinRMWTruncExtPattern<
atomic_load_sub_8, atomic_load_sub_16, atomic_load_sub_32, atomic_load_sub_64,
- ATOMIC_RMW8_U_SUB_I32, ATOMIC_RMW16_U_SUB_I32,
- ATOMIC_RMW8_U_SUB_I64, ATOMIC_RMW16_U_SUB_I64, ATOMIC_RMW32_U_SUB_I64>;
+ "ATOMIC_RMW8_U_SUB_I32", "ATOMIC_RMW16_U_SUB_I32",
+ "ATOMIC_RMW8_U_SUB_I64", "ATOMIC_RMW16_U_SUB_I64", "ATOMIC_RMW32_U_SUB_I64">;
defm : BinRMWTruncExtPattern<
atomic_load_and_8, atomic_load_and_16, atomic_load_and_32, atomic_load_and_64,
- ATOMIC_RMW8_U_AND_I32, ATOMIC_RMW16_U_AND_I32,
- ATOMIC_RMW8_U_AND_I64, ATOMIC_RMW16_U_AND_I64, ATOMIC_RMW32_U_AND_I64>;
+ "ATOMIC_RMW8_U_AND_I32", "ATOMIC_RMW16_U_AND_I32",
+ "ATOMIC_RMW8_U_AND_I64", "ATOMIC_RMW16_U_AND_I64", "ATOMIC_RMW32_U_AND_I64">;
defm : BinRMWTruncExtPattern<
atomic_load_or_8, atomic_load_or_16, atomic_load_or_32, atomic_load_or_64,
- ATOMIC_RMW8_U_OR_I32, ATOMIC_RMW16_U_OR_I32,
- ATOMIC_RMW8_U_OR_I64, ATOMIC_RMW16_U_OR_I64, ATOMIC_RMW32_U_OR_I64>;
+ "ATOMIC_RMW8_U_OR_I32", "ATOMIC_RMW16_U_OR_I32",
+ "ATOMIC_RMW8_U_OR_I64", "ATOMIC_RMW16_U_OR_I64", "ATOMIC_RMW32_U_OR_I64">;
defm : BinRMWTruncExtPattern<
atomic_load_xor_8, atomic_load_xor_16, atomic_load_xor_32, atomic_load_xor_64,
- ATOMIC_RMW8_U_XOR_I32, ATOMIC_RMW16_U_XOR_I32,
- ATOMIC_RMW8_U_XOR_I64, ATOMIC_RMW16_U_XOR_I64, ATOMIC_RMW32_U_XOR_I64>;
+ "ATOMIC_RMW8_U_XOR_I32", "ATOMIC_RMW16_U_XOR_I32",
+ "ATOMIC_RMW8_U_XOR_I64", "ATOMIC_RMW16_U_XOR_I64", "ATOMIC_RMW32_U_XOR_I64">;
defm : BinRMWTruncExtPattern<
atomic_swap_8, atomic_swap_16, atomic_swap_32, atomic_swap_64,
- ATOMIC_RMW8_U_XCHG_I32, ATOMIC_RMW16_U_XCHG_I32,
- ATOMIC_RMW8_U_XCHG_I64, ATOMIC_RMW16_U_XCHG_I64, ATOMIC_RMW32_U_XCHG_I64>;
+ "ATOMIC_RMW8_U_XCHG_I32", "ATOMIC_RMW16_U_XCHG_I32",
+ "ATOMIC_RMW8_U_XCHG_I64", "ATOMIC_RMW16_U_XCHG_I64",
+ "ATOMIC_RMW32_U_XCHG_I64">;
} // Predicates = [HasAtomics]
//===----------------------------------------------------------------------===//
@@ -651,13 +792,20 @@ defm : BinRMWTruncExtPattern<
multiclass WebAssemblyTerRMW<WebAssemblyRegClass rc, string name,
int atomic_op> {
- defm "" :
+ defm "_A32" :
ATOMIC_I<(outs rc:$dst),
(ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$exp,
rc:$new_),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
!strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new_"),
- !strconcat(name, "\t${off}${p2align}"), atomic_op>;
+ !strconcat(name, "\t${off}${p2align}"), atomic_op, "false">;
+ defm "_A64" :
+ ATOMIC_I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr, rc:$exp,
+ rc:$new_),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ !strconcat(name, "\t$dst, ${off}(${addr})${p2align}, $exp, $new_"),
+ !strconcat(name, "\t${off}${p2align}"), atomic_op, "true">;
}
defm ATOMIC_RMW_CMPXCHG_I32 :
@@ -676,47 +824,70 @@ defm ATOMIC_RMW32_U_CMPXCHG_I64 :
WebAssemblyTerRMW<I64, "i64.atomic.rmw32.cmpxchg_u", 0x4e>;
// Select ternary RMWs with no constant offset.
-class TerRMWPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind I32:$addr, ty:$exp, ty:$new)),
- (inst 0, 0, I32:$addr, ty:$exp, ty:$new)>;
+multiclass TerRMWPatNoOffset<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind I32:$addr, ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A32) 0, 0, I32:$addr, ty:$exp, ty:$new)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind I64:$addr, ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A64) 0, 0, I64:$addr, ty:$exp, ty:$new)>,
+ Requires<[HasAddr64]>;
+}
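// For illustration, a single instantiation such as
//   defm : TerRMWPatNoOffset<i32, atomic_cmp_swap_32, "ATOMIC_RMW_CMPXCHG_I32">;
// is expected to expand to roughly
//   def : Pat<(i32 (atomic_cmp_swap_32 I32:$addr, i32:$exp, i32:$new)),
//             (ATOMIC_RMW_CMPXCHG_I32_A32 0, 0, I32:$addr, i32:$exp, i32:$new)>,
//         Requires<[HasAddr32]>;
// plus the analogous ATOMIC_RMW_CMPXCHG_I32_A64 pattern guarded by HasAddr64.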
// Select ternary RMWs with a constant offset.
// Pattern with address + immediate offset
-class TerRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
- Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$exp, ty:$new)),
- (inst 0, imm:$off, I32:$addr, ty:$exp, ty:$new)>;
+multiclass TerRMWPatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
+ string inst> {
+ def : Pat<(ty (kind (operand I32:$addr, imm:$off), ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, I32:$addr, ty:$exp, ty:$new)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind (operand I64:$addr, imm:$off), ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, I64:$addr, ty:$exp, ty:$new)>,
+ Requires<[HasAddr64]>;
+}
// Select ternary RMWs with just a constant offset.
-class TerRMWPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind imm:$off, ty:$exp, ty:$new)),
- (inst 0, imm:$off, (CONST_I32 0), ty:$exp, ty:$new)>;
+multiclass TerRMWPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind imm:$off, ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A32) 0, imm:$off, (CONST_I32 0), ty:$exp,
+ ty:$new)>;
+ def : Pat<(ty (kind imm:$off, ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A64) 0, imm:$off, (CONST_I64 0), ty:$exp,
+ ty:$new)>;
+}
-class TerRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp, ty:$new)>;
+multiclass TerRMWPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A32) 0, tglobaladdr:$off, (CONST_I32 0), ty:$exp,
+ ty:$new)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off), ty:$exp, ty:$new)),
+ (!cast<NI>(inst#_A64) 0, tglobaladdr:$off, (CONST_I64 0), ty:$exp,
+ ty:$new)>,
+ Requires<[HasAddr64]>;
+}
// Patterns for various addressing modes.
-multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, NI inst_32,
- NI inst_64> {
- def : TerRMWPatNoOffset<i32, rmw_32, inst_32>;
- def : TerRMWPatNoOffset<i64, rmw_64, inst_64>;
+multiclass TerRMWPattern<PatFrag rmw_32, PatFrag rmw_64, string inst_32,
+ string inst_64> {
+ defm : TerRMWPatNoOffset<i32, rmw_32, inst_32>;
+ defm : TerRMWPatNoOffset<i64, rmw_64, inst_64>;
- def : TerRMWPatImmOff<i32, rmw_32, regPlusImm, inst_32>;
- def : TerRMWPatImmOff<i64, rmw_64, regPlusImm, inst_64>;
- def : TerRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
- def : TerRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
+ defm : TerRMWPatImmOff<i32, rmw_32, regPlusImm, inst_32>;
+ defm : TerRMWPatImmOff<i64, rmw_64, regPlusImm, inst_64>;
+ defm : TerRMWPatImmOff<i32, rmw_32, or_is_add, inst_32>;
+ defm : TerRMWPatImmOff<i64, rmw_64, or_is_add, inst_64>;
- def : TerRMWPatOffsetOnly<i32, rmw_32, inst_32>;
- def : TerRMWPatOffsetOnly<i64, rmw_64, inst_64>;
+ defm : TerRMWPatOffsetOnly<i32, rmw_32, inst_32>;
+ defm : TerRMWPatOffsetOnly<i64, rmw_64, inst_64>;
- def : TerRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
- def : TerRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
+ defm : TerRMWPatGlobalAddrOffOnly<i32, rmw_32, inst_32>;
+ defm : TerRMWPatGlobalAddrOffOnly<i64, rmw_64, inst_64>;
}
let Predicates = [HasAtomics] in
defm : TerRMWPattern<atomic_cmp_swap_32, atomic_cmp_swap_64,
- ATOMIC_RMW_CMPXCHG_I32, ATOMIC_RMW_CMPXCHG_I64>;
+ "ATOMIC_RMW_CMPXCHG_I32", "ATOMIC_RMW_CMPXCHG_I64">;
// Truncating & zero-extending ternary RMW patterns.
// DAG legalization & optimization before instruction selection may introduce
@@ -759,67 +930,73 @@ class sext_ter_rmw_16_64<PatFrag kind> : sext_ter_rmw_8_64<kind>;
// Patterns for various addressing modes for truncating-extending ternary RMWs.
multiclass TerRMWTruncExtPattern<
PatFrag rmw_8, PatFrag rmw_16, PatFrag rmw_32, PatFrag rmw_64,
- NI inst8_32, NI inst16_32, NI inst8_64, NI inst16_64, NI inst32_64> {
+ string inst8_32, string inst16_32, string inst8_64, string inst16_64,
+ string inst32_64> {
// Truncating-extending ternary RMWs with no constant offset
- def : TerRMWPatNoOffset<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatNoOffset<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatNoOffset<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatNoOffset<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatNoOffset<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
+ defm : TerRMWPatNoOffset<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
+ defm : TerRMWPatNoOffset<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
+ defm : TerRMWPatNoOffset<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
+ defm : TerRMWPatNoOffset<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
+ defm : TerRMWPatNoOffset<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
- def : TerRMWPatNoOffset<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatNoOffset<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatNoOffset<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatNoOffset<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
+ defm : TerRMWPatNoOffset<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
+ defm : TerRMWPatNoOffset<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
+ defm : TerRMWPatNoOffset<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
+ defm : TerRMWPatNoOffset<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
// Truncating-extending ternary RMWs with a constant offset
- def : TerRMWPatImmOff<i32, zext_ter_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
- def : TerRMWPatImmOff<i32, zext_ter_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
- def : TerRMWPatImmOff<i64, zext_ter_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
- def : TerRMWPatImmOff<i64, zext_ter_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
- def : TerRMWPatImmOff<i64, zext_ter_rmw_32_64<rmw_32>, regPlusImm, inst32_64>;
- def : TerRMWPatImmOff<i32, zext_ter_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
- def : TerRMWPatImmOff<i32, zext_ter_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
- def : TerRMWPatImmOff<i64, zext_ter_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
- def : TerRMWPatImmOff<i64, zext_ter_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
- def : TerRMWPatImmOff<i64, zext_ter_rmw_32_64<rmw_32>, or_is_add, inst32_64>;
-
- def : TerRMWPatImmOff<i32, sext_ter_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
- def : TerRMWPatImmOff<i32, sext_ter_rmw_16_32<rmw_16>, regPlusImm, inst16_32>;
- def : TerRMWPatImmOff<i64, sext_ter_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
- def : TerRMWPatImmOff<i64, sext_ter_rmw_16_64<rmw_16>, regPlusImm, inst16_64>;
- def : TerRMWPatImmOff<i32, sext_ter_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
- def : TerRMWPatImmOff<i32, sext_ter_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
- def : TerRMWPatImmOff<i64, sext_ter_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
- def : TerRMWPatImmOff<i64, sext_ter_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+ defm : TerRMWPatImmOff<i32, zext_ter_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ defm : TerRMWPatImmOff<i32, zext_ter_rmw_16_32<rmw_16>, regPlusImm,
+ inst16_32>;
+ defm : TerRMWPatImmOff<i64, zext_ter_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ defm : TerRMWPatImmOff<i64, zext_ter_rmw_16_64<rmw_16>, regPlusImm,
+ inst16_64>;
+ defm : TerRMWPatImmOff<i64, zext_ter_rmw_32_64<rmw_32>, regPlusImm,
+ inst32_64>;
+ defm : TerRMWPatImmOff<i32, zext_ter_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ defm : TerRMWPatImmOff<i32, zext_ter_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ defm : TerRMWPatImmOff<i64, zext_ter_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ defm : TerRMWPatImmOff<i64, zext_ter_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
+ defm : TerRMWPatImmOff<i64, zext_ter_rmw_32_64<rmw_32>, or_is_add, inst32_64>;
+
+ defm : TerRMWPatImmOff<i32, sext_ter_rmw_8_32<rmw_8>, regPlusImm, inst8_32>;
+ defm : TerRMWPatImmOff<i32, sext_ter_rmw_16_32<rmw_16>, regPlusImm,
+ inst16_32>;
+ defm : TerRMWPatImmOff<i64, sext_ter_rmw_8_64<rmw_8>, regPlusImm, inst8_64>;
+ defm : TerRMWPatImmOff<i64, sext_ter_rmw_16_64<rmw_16>, regPlusImm,
+ inst16_64>;
+ defm : TerRMWPatImmOff<i32, sext_ter_rmw_8_32<rmw_8>, or_is_add, inst8_32>;
+ defm : TerRMWPatImmOff<i32, sext_ter_rmw_16_32<rmw_16>, or_is_add, inst16_32>;
+ defm : TerRMWPatImmOff<i64, sext_ter_rmw_8_64<rmw_8>, or_is_add, inst8_64>;
+ defm : TerRMWPatImmOff<i64, sext_ter_rmw_16_64<rmw_16>, or_is_add, inst16_64>;
// Truncating-extending ternary RMWs with just a constant offset
- def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatOffsetOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatOffsetOnly<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatOffsetOnly<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatOffsetOnly<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
-
- def : TerRMWPatOffsetOnly<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatOffsetOnly<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatOffsetOnly<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatOffsetOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
-
- def : TerRMWPatGlobalAddrOffOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatGlobalAddrOffOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatGlobalAddrOffOnly<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatGlobalAddrOffOnly<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
- def : TerRMWPatGlobalAddrOffOnly<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
-
- def : TerRMWPatGlobalAddrOffOnly<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
- def : TerRMWPatGlobalAddrOffOnly<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
- def : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
- def : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
+ defm : TerRMWPatOffsetOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
+ defm : TerRMWPatOffsetOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
+ defm : TerRMWPatOffsetOnly<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
+ defm : TerRMWPatOffsetOnly<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
+ defm : TerRMWPatOffsetOnly<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
+
+ defm : TerRMWPatOffsetOnly<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
+ defm : TerRMWPatOffsetOnly<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
+ defm : TerRMWPatOffsetOnly<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
+ defm : TerRMWPatOffsetOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
+
+ defm : TerRMWPatGlobalAddrOffOnly<i32, zext_ter_rmw_8_32<rmw_8>, inst8_32>;
+ defm : TerRMWPatGlobalAddrOffOnly<i32, zext_ter_rmw_16_32<rmw_16>, inst16_32>;
+ defm : TerRMWPatGlobalAddrOffOnly<i64, zext_ter_rmw_8_64<rmw_8>, inst8_64>;
+ defm : TerRMWPatGlobalAddrOffOnly<i64, zext_ter_rmw_16_64<rmw_16>, inst16_64>;
+ defm : TerRMWPatGlobalAddrOffOnly<i64, zext_ter_rmw_32_64<rmw_32>, inst32_64>;
+
+ defm : TerRMWPatGlobalAddrOffOnly<i32, sext_ter_rmw_8_32<rmw_8>, inst8_32>;
+ defm : TerRMWPatGlobalAddrOffOnly<i32, sext_ter_rmw_16_32<rmw_16>, inst16_32>;
+ defm : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_8_64<rmw_8>, inst8_64>;
+ defm : TerRMWPatGlobalAddrOffOnly<i64, sext_ter_rmw_16_64<rmw_16>, inst16_64>;
}
let Predicates = [HasAtomics] in
defm : TerRMWTruncExtPattern<
atomic_cmp_swap_8, atomic_cmp_swap_16, atomic_cmp_swap_32, atomic_cmp_swap_64,
- ATOMIC_RMW8_U_CMPXCHG_I32, ATOMIC_RMW16_U_CMPXCHG_I32,
- ATOMIC_RMW8_U_CMPXCHG_I64, ATOMIC_RMW16_U_CMPXCHG_I64,
- ATOMIC_RMW32_U_CMPXCHG_I64>;
+ "ATOMIC_RMW8_U_CMPXCHG_I32", "ATOMIC_RMW16_U_CMPXCHG_I32",
+ "ATOMIC_RMW8_U_CMPXCHG_I64", "ATOMIC_RMW16_U_CMPXCHG_I64",
+ "ATOMIC_RMW32_U_CMPXCHG_I64">;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
index 05735cf6d31f..3e9ef6fbc7ea 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrBulkMemory.td
@@ -33,39 +33,43 @@ def wasm_memset_t : SDTypeProfile<0, 4,
def wasm_memset : SDNode<"WebAssemblyISD::MEMORY_FILL", wasm_memset_t,
[SDNPHasChain, SDNPMayStore]>;
+multiclass BulkMemoryOps<WebAssemblyRegClass rc, string B> {
+
let mayStore = 1, hasSideEffects = 1 in
-defm MEMORY_INIT :
+defm MEMORY_INIT_A#B :
BULK_I<(outs),
- (ins i32imm_op:$seg, i32imm_op:$idx, I32:$dest,
- I32:$offset, I32:$size),
+ (ins i32imm_op:$seg, i32imm_op:$idx, rc:$dest,
+ rc:$offset, rc:$size),
(outs), (ins i32imm_op:$seg, i32imm_op:$idx),
- [(int_wasm_memory_init (i32 timm:$seg), (i32 timm:$idx), I32:$dest,
- I32:$offset, I32:$size
- )],
+ [],
"memory.init\t$seg, $idx, $dest, $offset, $size",
"memory.init\t$seg, $idx", 0x08>;
let hasSideEffects = 1 in
defm DATA_DROP :
BULK_I<(outs), (ins i32imm_op:$seg), (outs), (ins i32imm_op:$seg),
- [(int_wasm_data_drop (i32 timm:$seg))],
+ [],
"data.drop\t$seg", "data.drop\t$seg", 0x09>;
let mayLoad = 1, mayStore = 1 in
-defm MEMORY_COPY :
+defm MEMORY_COPY_A#B :
BULK_I<(outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx,
- I32:$dst, I32:$src, I32:$len),
+ rc:$dst, rc:$src, rc:$len),
(outs), (ins i32imm_op:$src_idx, i32imm_op:$dst_idx),
[(wasm_memcpy (i32 imm:$src_idx), (i32 imm:$dst_idx),
- I32:$dst, I32:$src, I32:$len
+ rc:$dst, rc:$src, rc:$len
)],
"memory.copy\t$src_idx, $dst_idx, $dst, $src, $len",
"memory.copy\t$src_idx, $dst_idx", 0x0a>;
let mayStore = 1 in
-defm MEMORY_FILL :
- BULK_I<(outs), (ins i32imm_op:$idx, I32:$dst, I32:$value, I32:$size),
+defm MEMORY_FILL_A#B :
+ BULK_I<(outs), (ins i32imm_op:$idx, rc:$dst, I32:$value, rc:$size),
(outs), (ins i32imm_op:$idx),
- [(wasm_memset (i32 imm:$idx), I32:$dst, I32:$value, I32:$size)],
+ [(wasm_memset (i32 imm:$idx), rc:$dst, I32:$value, rc:$size)],
"memory.fill\t$idx, $dst, $value, $size",
"memory.fill\t$idx", 0x0b>;
+}
+
+defm : BulkMemoryOps<I32, "32">;
+defm : BulkMemoryOps<I64, "64">;
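// As a sketch of the name concatenation above: with B = "32" and B = "64",
// these two instantiations are expected to yield MEMORY_INIT_A32,
// MEMORY_COPY_A32 and MEMORY_FILL_A32 alongside MEMORY_INIT_A64,
// MEMORY_COPY_A64 and MEMORY_FILL_A64, with the rc-typed operands becoming
// I32 and I64 respectively (the memory.fill value operand stays an I32 in
// both variants).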
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
index 703c15d58c93..b997c1c16fcb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrCall.td
@@ -23,155 +23,56 @@ defm ADJCALLSTACKUP : NRI<(outs), (ins i32imm:$amt, i32imm:$amt2),
[(WebAssemblycallseq_end timm:$amt, timm:$amt2)]>;
} // Uses = [SP32, SP64], Defs = [SP32, SP64], isCodeGenOnly = 1
-multiclass CALL<ValueType vt, WebAssemblyRegClass rt, string prefix,
- list<Predicate> preds = []> {
- defm CALL_#vt :
- I<(outs rt:$dst), (ins function32_op:$callee, variable_ops),
- (outs), (ins function32_op:$callee),
- [(set (vt rt:$dst), (WebAssemblycall1 (i32 imm:$callee)))],
- !strconcat(prefix, "call\t$dst, $callee"),
- !strconcat(prefix, "call\t$callee"),
- 0x10>,
- Requires<preds>;
- let isCodeGenOnly = 1 in
- defm PCALL_INDIRECT_#vt :
- I<(outs rt:$dst), (ins I32:$callee, variable_ops),
- (outs), (ins I32:$callee),
- [(set (vt rt:$dst), (WebAssemblycall1 I32:$callee))],
- "PSEUDO CALL INDIRECT\t$callee",
- "PSEUDO CALL INDIRECT\t$callee">,
- Requires<preds>;
+let Uses = [SP32, SP64], isCall = 1 in {
- defm CALL_INDIRECT_#vt :
- I<(outs rt:$dst),
- (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags),
- [],
- !strconcat(prefix, "call_indirect\t$dst"),
- !strconcat(prefix, "call_indirect\t$type"),
- 0x11>,
- Requires<preds>;
-}
+// CALL should both take variadic arguments and produce variadic results, but
+// this is not possible to model directly. Instead, we select calls to a
+// CALL_PARAMS taking variadic arguments linked with a CALL_RESULTS that handles
+// producing the call's variadic results. We recombine the two in a custom
+// inserter hook after DAG ISel, so passes over MachineInstrs will only ever
+// observe CALL nodes with all of the expected variadic uses and defs.
+let isPseudo = 1 in
+defm CALL_PARAMS :
+ I<(outs), (ins function32_op:$callee, variable_ops),
+ (outs), (ins function32_op:$callee), [],
+ "call_params\t$callee", "call_params\t$callee", -1>;
-let Uses = [SP32, SP64], isCall = 1 in {
-defm "" : CALL<i32, I32, "i32.">;
-defm "" : CALL<i64, I64, "i64.">;
-defm "" : CALL<f32, F32, "f32.">;
-defm "" : CALL<f64, F64, "f64.">;
-defm "" : CALL<exnref, EXNREF, "exnref.", [HasExceptionHandling]>;
-defm "" : CALL<v16i8, V128, "v128.", [HasSIMD128]>;
-defm "" : CALL<v8i16, V128, "v128.", [HasSIMD128]>;
-defm "" : CALL<v4i32, V128, "v128.", [HasSIMD128]>;
-defm "" : CALL<v2i64, V128, "v128.", [HasSIMD128]>;
-defm "" : CALL<v4f32, V128, "v128.", [HasSIMD128]>;
-defm "" : CALL<v2f64, V128, "v128.", [HasSIMD128]>;
+let variadicOpsAreDefs = 1, usesCustomInserter = 1, isPseudo = 1 in
+defm CALL_RESULTS :
+ I<(outs), (ins variable_ops), (outs), (ins), [],
+ "call_results", "call_results", -1>;
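// A rough sketch of the scheme described above (the exact operands, ordering,
// and linkage between the two instructions are illustrative only): a call
// producing two results is selected as a pair along the lines of
//   %r0:i32, %r1:i64 = CALL_RESULTS
//   CALL_PARAMS @callee, %arg0, %arg1
// which the custom inserter then recombines into a single CALL carrying both
// the variadic defs and the variadic uses.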
-let IsCanonical = 1 in {
-defm CALL_VOID :
+let variadicOpsAreDefs = 1, usesCustomInserter = 1, isPseudo = 1 in
+defm RET_CALL_RESULTS :
+ I<(outs), (ins variable_ops), (outs), (ins), [],
+ "return_call_results", "return_call_results", -1>;
+
+let variadicOpsAreDefs = 1 in
+defm CALL :
I<(outs), (ins function32_op:$callee, variable_ops),
- (outs), (ins function32_op:$callee),
- [(WebAssemblycall0 (i32 imm:$callee))],
- "call \t$callee", "call\t$callee", 0x10>;
+ (outs), (ins function32_op:$callee), [],
+ "call", "call\t$callee", 0x10>;
-let isReturn = 1 in
+let variadicOpsAreDefs = 1 in
+defm CALL_INDIRECT :
+ I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
+ (outs), (ins TypeIndex:$type, i32imm:$flags), [],
+ "call_indirect", "call_indirect\t$type", 0x11>;
+
+let isReturn = 1, isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in
defm RET_CALL :
I<(outs), (ins function32_op:$callee, variable_ops),
- (outs), (ins function32_op:$callee),
- [(WebAssemblyretcall (i32 imm:$callee))],
+ (outs), (ins function32_op:$callee), [],
"return_call \t$callee", "return_call\t$callee", 0x12>,
Requires<[HasTailCall]>;
-let isCodeGenOnly = 1 in
-defm PCALL_INDIRECT_VOID :
- I<(outs), (ins I32:$callee, variable_ops),
- (outs), (ins I32:$callee),
- [(WebAssemblycall0 I32:$callee)],
- "PSEUDO CALL INDIRECT\t$callee",
- "PSEUDO CALL INDIRECT\t$callee">;
-
-defm CALL_INDIRECT_VOID :
- I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags),
- [],
- "call_indirect\t", "call_indirect\t$type",
- 0x11>;
-
let isReturn = 1 in
defm RET_CALL_INDIRECT :
I<(outs), (ins TypeIndex:$type, i32imm:$flags, variable_ops),
- (outs), (ins TypeIndex:$type, i32imm:$flags),
- [],
+ (outs), (ins TypeIndex:$type, i32imm:$flags), [],
"return_call_indirect\t", "return_call_indirect\t$type",
0x13>,
Requires<[HasTailCall]>;
-let isCodeGenOnly = 1, isReturn = 1 in
-defm PRET_CALL_INDIRECT:
- I<(outs), (ins I32:$callee, variable_ops),
- (outs), (ins I32:$callee),
- [(WebAssemblyretcall I32:$callee)],
- "PSEUDO RET_CALL INDIRECT\t$callee",
- "PSEUDO RET_CALL INDIRECT\t$callee">,
- Requires<[HasTailCall]>;
-
-} // IsCanonical = 1
} // Uses = [SP32,SP64], isCall = 1
-
-// Patterns for matching a direct call to a global address.
-def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_i32 tglobaladdr:$callee)>;
-def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_i64 tglobaladdr:$callee)>;
-def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_f32 tglobaladdr:$callee)>;
-def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_f64 tglobaladdr:$callee)>;
-def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_v16i8 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_v8i16 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_v4i32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v2i64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_v2i64 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_v4f32 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v2f64 (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_v2f64 tglobaladdr:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(exnref (WebAssemblycall1 (WebAssemblywrapper tglobaladdr:$callee))),
- (CALL_exnref tglobaladdr:$callee)>,
- Requires<[HasExceptionHandling]>;
-def : Pat<(WebAssemblycall0 (WebAssemblywrapper tglobaladdr:$callee)),
- (CALL_VOID tglobaladdr:$callee)>;
-def : Pat<(WebAssemblyretcall (WebAssemblywrapper tglobaladdr:$callee)),
- (RET_CALL tglobaladdr:$callee)>, Requires<[HasTailCall]>;
-
-// Patterns for matching a direct call to an external symbol.
-def : Pat<(i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_i32 texternalsym:$callee)>;
-def : Pat<(i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_i64 texternalsym:$callee)>;
-def : Pat<(f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_f32 texternalsym:$callee)>;
-def : Pat<(f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_f64 texternalsym:$callee)>;
-def : Pat<(v16i8 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_v16i8 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v8i16 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_v8i16 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v4i32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_v4i32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v2i64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_v2i64 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v4f32 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_v4f32 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(v2f64 (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_v2f64 texternalsym:$callee)>, Requires<[HasSIMD128]>;
-def : Pat<(exnref (WebAssemblycall1 (WebAssemblywrapper texternalsym:$callee))),
- (CALL_exnref texternalsym:$callee)>,
- Requires<[HasExceptionHandling]>;
-def : Pat<(WebAssemblycall0 (WebAssemblywrapper texternalsym:$callee)),
- (CALL_VOID texternalsym:$callee)>;
-def : Pat<(WebAssemblyretcall (WebAssemblywrapper texternalsym:$callee)),
- (RET_CALL texternalsym:$callee)>, Requires<[HasTailCall]>;
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
index 1afc9a8790dc..171dd9a67beb 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrControl.td
@@ -40,21 +40,25 @@ def brlist : Operand<i32> {
let PrintMethod = "printBrList";
}
-// TODO: SelectionDAG's lowering insists on using a pointer as the index for
-// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
-// currently.
-let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
+// Duplicating a BR_TABLE is almost never a good idea. In particular, it can
+// lead to some nasty irreducibility due to tail merging when the br_table is in
+// a loop.
+let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1, isNotDuplicable = 1 in {
+
defm BR_TABLE_I32 : I<(outs), (ins I32:$index, variable_ops),
(outs), (ins brlist:$brl),
[(WebAssemblybr_table I32:$index)],
"br_table \t$index", "br_table \t$brl",
0x0e>;
+// TODO: SelectionDAG's lowering insists on using a pointer as the index for
+// jump tables, so in practice we don't ever use BR_TABLE_I64 in wasm32 mode
+// currently.
defm BR_TABLE_I64 : I<(outs), (ins I64:$index, variable_ops),
(outs), (ins brlist:$brl),
[(WebAssemblybr_table I64:$index)],
"br_table \t$index", "br_table \t$brl",
0x0e>;
-} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1, isNotDuplicable = 1
// This is technically a control-flow instruction, since all it affects is the
// IP.
@@ -85,8 +89,8 @@ defm END_FUNCTION : NRI<(outs), (ins), [], "end_function", 0x0b>;
} // Uses = [VALUE_STACK], Defs = [VALUE_STACK]
-let isTerminator = 1, hasCtrlDep = 1, isBarrier = 1 in {
-
+let hasCtrlDep = 1, isBarrier = 1 in {
+let isTerminator = 1 in {
let isReturn = 1 in {
defm RETURN : I<(outs), (ins variable_ops), (outs), (ins),
@@ -99,8 +103,21 @@ defm FALLTHROUGH_RETURN : I<(outs), (ins variable_ops), (outs), (ins), []>;
} // isReturn = 1
+let isTrap = 1 in
defm UNREACHABLE : NRI<(outs), (ins), [(trap)], "unreachable", 0x00>;
-} // isTerminator = 1, hasCtrlDep = 1, isBarrier = 1
+
+} // isTerminator = 1
+
+// debugtrap explicitly returns despite trapping because it is supposed to just
+// get the attention of the debugger. Unfortunately, because UNREACHABLE is a
+// terminator, lowering debugtrap to UNREACHABLE can create an invalid
+// MachineBasicBlock when there is additional code after it. Lower it to this
+// non-terminator version instead.
+// TODO: Actually execute the debugger statement when running on the Web
+let isTrap = 1 in
+defm DEBUG_UNREACHABLE : NRI<(outs), (ins), [(debugtrap)], "unreachable", 0x00>;
+
+} // hasCtrlDep = 1, isBarrier = 1
//===----------------------------------------------------------------------===//
// Exception handling instructions
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
index aff4d20d8d82..0a4289c4959b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrFormats.td
@@ -14,11 +14,13 @@
// WebAssembly Instruction Format.
// We instantiate 2 of these for every actual instruction (register based
// and stack based), see below.
-class WebAssemblyInst<bits<32> inst, string asmstr, string stack> : StackRel,
- Instruction {
+class WebAssemblyInst<bits<32> inst, string asmstr, string stack, string is64>
+ : StackRel, Wasm64Rel, Instruction {
bits<32> Inst = inst; // Instruction encoding.
string StackBased = stack;
string BaseName = NAME;
+ string IsWasm64 = is64;
+ string Wasm32Name = !subst("_A64", "_A32", NAME);
let Namespace = "WebAssembly";
let Pattern = [];
let AsmString = asmstr;
@@ -29,8 +31,8 @@ class WebAssemblyInst<bits<32> inst, string asmstr, string stack> : StackRel,
// Normal instructions. Default instantiation of a WebAssemblyInst.
class NI<dag oops, dag iops, list<dag> pattern, string stack,
- string asmstr = "", bits<32> inst = -1>
- : WebAssemblyInst<inst, asmstr, stack> {
+ string asmstr = "", bits<32> inst = -1, string is64 = "false">
+ : WebAssemblyInst<inst, asmstr, stack, is64> {
dag OutOperandList = oops;
dag InOperandList = iops;
let Pattern = pattern;
@@ -52,11 +54,11 @@ class NI<dag oops, dag iops, list<dag> pattern, string stack,
// there is always an equivalent pair of instructions.
multiclass I<dag oops_r, dag iops_r, dag oops_s, dag iops_s,
list<dag> pattern_r, string asmstr_r = "", string asmstr_s = "",
- bits<32> inst = -1> {
+ bits<32> inst = -1, string is64 = "false"> {
let isCodeGenOnly = 1 in
- def "" : NI<oops_r, iops_r, pattern_r, "false", asmstr_r, inst>;
+ def "" : NI<oops_r, iops_r, pattern_r, "false", asmstr_r, inst, is64>;
let BaseName = NAME in
- def _S : NI<oops_s, iops_s, [], "true", asmstr_s, inst>;
+ def _S : NI<oops_s, iops_s, [], "true", asmstr_s, inst, is64>;
}
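// For example, an instruction instantiated through this multiclass, say
// LOAD_I32_A32, is expected to come out as the register-based LOAD_I32_A32
// plus its stack-based twin LOAD_I32_A32_S, with both records sharing the same
// BaseName so the register/stack instruction mapping can relate the two.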
// For instructions that have no register ops, so both sets are the same.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
index 221dacaf821b..6fe1fd2b5c5a 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.cpp
@@ -235,8 +235,9 @@ bool WebAssemblyInstrInfo::reverseBranchCondition(
ArrayRef<std::pair<int, const char *>>
WebAssemblyInstrInfo::getSerializableTargetIndices() const {
static const std::pair<int, const char *> TargetIndices[] = {
- {WebAssembly::TI_LOCAL_START, "wasm-local-start"},
- {WebAssembly::TI_GLOBAL_START, "wasm-global-start"},
- {WebAssembly::TI_OPERAND_STACK_START, "wasm-operator-stack-start"}};
+ {WebAssembly::TI_LOCAL, "wasm-local"},
+ {WebAssembly::TI_GLOBAL_FIXED, "wasm-global-fixed"},
+ {WebAssembly::TI_OPERAND_STACK, "wasm-operand-stack"},
+ {WebAssembly::TI_GLOBAL_RELOC, "wasm-global-reloc"}};
return makeArrayRef(TargetIndices);
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
index 044901481381..5ff0d73534a6 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrInfo.td
@@ -24,43 +24,47 @@ def HasAddr64 : Predicate<"Subtarget->hasAddr64()">;
def HasSIMD128 :
Predicate<"Subtarget->hasSIMD128()">,
- AssemblerPredicate<"FeatureSIMD128", "simd128">;
+ AssemblerPredicate<(all_of FeatureSIMD128), "simd128">;
def HasUnimplementedSIMD128 :
Predicate<"Subtarget->hasUnimplementedSIMD128()">,
- AssemblerPredicate<"FeatureUnimplementedSIMD128", "unimplemented-simd128">;
+ AssemblerPredicate<(all_of FeatureUnimplementedSIMD128), "unimplemented-simd128">;
def HasAtomics :
Predicate<"Subtarget->hasAtomics()">,
- AssemblerPredicate<"FeatureAtomics", "atomics">;
+ AssemblerPredicate<(all_of FeatureAtomics), "atomics">;
def HasMultivalue :
Predicate<"Subtarget->hasMultivalue()">,
- AssemblerPredicate<"FeatureMultivalue", "multivalue">;
+ AssemblerPredicate<(all_of FeatureMultivalue), "multivalue">;
def HasNontrappingFPToInt :
Predicate<"Subtarget->hasNontrappingFPToInt()">,
- AssemblerPredicate<"FeatureNontrappingFPToInt", "nontrapping-fptoint">;
+ AssemblerPredicate<(all_of FeatureNontrappingFPToInt), "nontrapping-fptoint">;
def NotHasNontrappingFPToInt :
Predicate<"!Subtarget->hasNontrappingFPToInt()">,
- AssemblerPredicate<"!FeatureNontrappingFPToInt", "nontrapping-fptoint">;
+ AssemblerPredicate<(all_of (not FeatureNontrappingFPToInt)), "nontrapping-fptoint">;
def HasSignExt :
Predicate<"Subtarget->hasSignExt()">,
- AssemblerPredicate<"FeatureSignExt", "sign-ext">;
+ AssemblerPredicate<(all_of FeatureSignExt), "sign-ext">;
def HasTailCall :
Predicate<"Subtarget->hasTailCall()">,
- AssemblerPredicate<"FeatureTailCall", "tail-call">;
+ AssemblerPredicate<(all_of FeatureTailCall), "tail-call">;
def HasExceptionHandling :
Predicate<"Subtarget->hasExceptionHandling()">,
- AssemblerPredicate<"FeatureExceptionHandling", "exception-handling">;
+ AssemblerPredicate<(all_of FeatureExceptionHandling), "exception-handling">;
def HasBulkMemory :
Predicate<"Subtarget->hasBulkMemory()">,
- AssemblerPredicate<"FeatureBulkMemory", "bulk-memory">;
+ AssemblerPredicate<(all_of FeatureBulkMemory), "bulk-memory">;
+
+def HasReferenceTypes :
+ Predicate<"Subtarget->hasReferenceTypes()">,
+ AssemblerPredicate<(all_of FeatureReferenceTypes), "reference-types">;
//===----------------------------------------------------------------------===//
// WebAssembly-specific DAG Node Types.
@@ -91,15 +95,6 @@ def WebAssemblycallseq_start :
def WebAssemblycallseq_end :
SDNode<"ISD::CALLSEQ_END", SDT_WebAssemblyCallSeqEnd,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
-def WebAssemblycall0 : SDNode<"WebAssemblyISD::CALL0",
- SDT_WebAssemblyCall0,
- [SDNPHasChain, SDNPVariadic]>;
-def WebAssemblycall1 : SDNode<"WebAssemblyISD::CALL1",
- SDT_WebAssemblyCall1,
- [SDNPHasChain, SDNPVariadic]>;
-def WebAssemblyretcall : SDNode<"WebAssemblyISD::RET_CALL",
- SDT_WebAssemblyCall0,
- [SDNPHasChain, SDNPVariadic]>;
def WebAssemblybr_table : SDNode<"WebAssemblyISD::BR_TABLE",
SDT_WebAssemblyBrTable,
[SDNPHasChain, SDNPVariadic]>;
@@ -171,6 +166,9 @@ def function32_op : Operand<i32>;
let OperandType = "OPERAND_OFFSET32" in
def offset32_op : Operand<i32>;
+let OperandType = "OPERAND_OFFSET64" in
+def offset64_op : Operand<i64>;
+
let OperandType = "OPERAND_P2ALIGN" in {
def P2Align : Operand<i32> {
let PrintMethod = "printWebAssemblyP2AlignOperand";
@@ -205,6 +203,19 @@ def getStackOpcode : InstrMapping {
}
//===----------------------------------------------------------------------===//
+// WebAssembly 32 to 64-bit instruction mapping
+//===----------------------------------------------------------------------===//
+
+class Wasm64Rel;
+def getWasm64Opcode : InstrMapping {
+ let FilterClass = "Wasm64Rel";
+ let RowFields = ["Wasm32Name"];
+ let ColFields = ["IsWasm64"];
+ let KeyCol = ["false"];
+ let ValueCols = [["true"]];
+}
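// A minimal sketch of how a later pass might consume this mapping, assuming
// the TableGen-generated helper follows the same convention as getStackOpcode
// (living in the WebAssembly namespace and returning -1 when no 64-bit
// counterpart exists):
//   int Opc64 = WebAssembly::getWasm64Opcode(MI.getOpcode());
//   if (Opc64 != -1)
//     MI.setDesc(TII.get(Opc64));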
+
+//===----------------------------------------------------------------------===//
// WebAssembly Instruction Format Definitions.
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
index eba9b80d3286..b3c63cc1f884 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrMemory.td
@@ -12,7 +12,6 @@
//===----------------------------------------------------------------------===//
// TODO:
-// - HasAddr64
// - WebAssemblyTargetLowering having to do with atomics
// - Each has optional alignment.
@@ -41,181 +40,222 @@ def or_is_add : PatFrag<(ops node:$lhs, node:$rhs), (or node:$lhs, node:$rhs),[{
// offsets folded into them, so we can just use add.
// Defines atomic and non-atomic loads, regular and extending.
-multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode> {
- let mayLoad = 1, UseNamedOperandTable = 1 in
- defm "": I<(outs rc:$dst),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
- (outs), (ins P2Align:$p2align, offset32_op:$off),
- [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"),
- !strconcat(Name, "\t${off}${p2align}"), Opcode>;
+multiclass WebAssemblyLoad<WebAssemblyRegClass rc, string Name, int Opcode,
+ list<Predicate> reqs = []> {
+ let mayLoad = 1, UseNamedOperandTable = 1 in {
+ defm "_A32": I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs), (ins P2Align:$p2align, offset32_op:$off),
+ [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"),
+ !strconcat(Name, "\t${off}${p2align}"), Opcode, "false">,
+ Requires<reqs>;
+ defm "_A64": I<(outs rc:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off),
+ [], !strconcat(Name, "\t$dst, ${off}(${addr})${p2align}"),
+ !strconcat(Name, "\t${off}${p2align}"), Opcode, "true">,
+ Requires<reqs>;
+ }
}
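// So, for illustration, a load declared below as
//   defm LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28, []>;
// is expected to expand into LOAD_I32_A32 (I32 address, offset32_op offset)
// and LOAD_I32_A64 (I64 address, offset64_op offset), which is why the
// selection patterns below refer to instructions by name string and append
// "_A32" / "_A64" via !cast.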
// Basic load.
// FIXME: When we can break syntax compatibility, reorder the fields in the
// asmstrings to match the binary encoding.
-defm LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28>;
-defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29>;
-defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a>;
-defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b>;
+defm LOAD_I32 : WebAssemblyLoad<I32, "i32.load", 0x28, []>;
+defm LOAD_I64 : WebAssemblyLoad<I64, "i64.load", 0x29, []>;
+defm LOAD_F32 : WebAssemblyLoad<F32, "f32.load", 0x2a, []>;
+defm LOAD_F64 : WebAssemblyLoad<F64, "f64.load", 0x2b, []>;
// Select loads with no constant offset.
-class LoadPatNoOffset<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind I32:$addr)), (inst 0, 0, I32:$addr)>;
-
-def : LoadPatNoOffset<i32, load, LOAD_I32>;
-def : LoadPatNoOffset<i64, load, LOAD_I64>;
-def : LoadPatNoOffset<f32, load, LOAD_F32>;
-def : LoadPatNoOffset<f64, load, LOAD_F64>;
+multiclass LoadPatNoOffset<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind I32:$addr)), (!cast<NI>(inst # "_A32") 0, 0, I32:$addr)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind I64:$addr)), (!cast<NI>(inst # "_A64") 0, 0, I64:$addr)>,
+ Requires<[HasAddr64]>;
+}
+defm : LoadPatNoOffset<i32, load, "LOAD_I32">;
+defm : LoadPatNoOffset<i64, load, "LOAD_I64">;
+defm : LoadPatNoOffset<f32, load, "LOAD_F32">;
+defm : LoadPatNoOffset<f64, load, "LOAD_F64">;
// Select loads with a constant offset.
// Pattern with address + immediate offset
-class LoadPatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
- Pat<(ty (kind (operand I32:$addr, imm:$off))), (inst 0, imm:$off, I32:$addr)>;
-
-def : LoadPatImmOff<i32, load, regPlusImm, LOAD_I32>;
-def : LoadPatImmOff<i64, load, regPlusImm, LOAD_I64>;
-def : LoadPatImmOff<f32, load, regPlusImm, LOAD_F32>;
-def : LoadPatImmOff<f64, load, regPlusImm, LOAD_F64>;
-def : LoadPatImmOff<i32, load, or_is_add, LOAD_I32>;
-def : LoadPatImmOff<i64, load, or_is_add, LOAD_I64>;
-def : LoadPatImmOff<f32, load, or_is_add, LOAD_F32>;
-def : LoadPatImmOff<f64, load, or_is_add, LOAD_F64>;
+multiclass LoadPatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
+ string inst> {
+ def : Pat<(ty (kind (operand I32:$addr, imm:$off))),
+ (!cast<NI>(inst # "_A32") 0, imm:$off, I32:$addr)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind (operand I64:$addr, imm:$off))),
+ (!cast<NI>(inst # "_A64") 0, imm:$off, I64:$addr)>,
+ Requires<[HasAddr64]>;
+}
-// Select loads with just a constant offset.
-class LoadPatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind imm:$off)), (inst 0, imm:$off, (CONST_I32 0))>;
+defm : LoadPatImmOff<i32, load, regPlusImm, "LOAD_I32">;
+defm : LoadPatImmOff<i64, load, regPlusImm, "LOAD_I64">;
+defm : LoadPatImmOff<f32, load, regPlusImm, "LOAD_F32">;
+defm : LoadPatImmOff<f64, load, regPlusImm, "LOAD_F64">;
+defm : LoadPatImmOff<i32, load, or_is_add, "LOAD_I32">;
+defm : LoadPatImmOff<i64, load, or_is_add, "LOAD_I64">;
+defm : LoadPatImmOff<f32, load, or_is_add, "LOAD_F32">;
+defm : LoadPatImmOff<f64, load, or_is_add, "LOAD_F64">;
-def : LoadPatOffsetOnly<i32, load, LOAD_I32>;
-def : LoadPatOffsetOnly<i64, load, LOAD_I64>;
-def : LoadPatOffsetOnly<f32, load, LOAD_F32>;
-def : LoadPatOffsetOnly<f64, load, LOAD_F64>;
+// Select loads with just a constant offset.
+multiclass LoadPatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind imm:$off)),
+ (!cast<NI>(inst # "_A32") 0, imm:$off, (CONST_I32 0))>,
+ Requires<[HasAddr32]>;
+ def : Pat<(ty (kind imm:$off)),
+ (!cast<NI>(inst # "_A64") 0, imm:$off, (CONST_I64 0))>,
+ Requires<[HasAddr64]>;
+}
-class LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
- (inst 0, tglobaladdr:$off, (CONST_I32 0))>, Requires<[IsNotPIC]>;
+defm : LoadPatOffsetOnly<i32, load, "LOAD_I32">;
+defm : LoadPatOffsetOnly<i64, load, "LOAD_I64">;
+defm : LoadPatOffsetOnly<f32, load, "LOAD_F32">;
+defm : LoadPatOffsetOnly<f64, load, "LOAD_F64">;
+
+multiclass LoadPatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
+ (!cast<NI>(inst # "_A32") 0, tglobaladdr:$off, (CONST_I32 0))>,
+ Requires<[IsNotPIC, HasAddr32]>;
+ def : Pat<(ty (kind (WebAssemblywrapper tglobaladdr:$off))),
+ (!cast<NI>(inst # "_A64") 0, tglobaladdr:$off, (CONST_I64 0))>,
+ Requires<[IsNotPIC, HasAddr64]>;
+}
-def : LoadPatGlobalAddrOffOnly<i32, load, LOAD_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, load, LOAD_I64>;
-def : LoadPatGlobalAddrOffOnly<f32, load, LOAD_F32>;
-def : LoadPatGlobalAddrOffOnly<f64, load, LOAD_F64>;
+defm : LoadPatGlobalAddrOffOnly<i32, load, "LOAD_I32">;
+defm : LoadPatGlobalAddrOffOnly<i64, load, "LOAD_I64">;
+defm : LoadPatGlobalAddrOffOnly<f32, load, "LOAD_F32">;
+defm : LoadPatGlobalAddrOffOnly<f64, load, "LOAD_F64">;
// Extending load.
-defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c>;
-defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d>;
-defm LOAD16_S_I32 : WebAssemblyLoad<I32, "i32.load16_s", 0x2e>;
-defm LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.load16_u", 0x2f>;
-defm LOAD8_S_I64 : WebAssemblyLoad<I64, "i64.load8_s", 0x30>;
-defm LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.load8_u", 0x31>;
-defm LOAD16_S_I64 : WebAssemblyLoad<I64, "i64.load16_s", 0x32>;
-defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33>;
-defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34>;
-defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35>;
+defm LOAD8_S_I32 : WebAssemblyLoad<I32, "i32.load8_s", 0x2c, []>;
+defm LOAD8_U_I32 : WebAssemblyLoad<I32, "i32.load8_u", 0x2d, []>;
+defm LOAD16_S_I32 : WebAssemblyLoad<I32, "i32.load16_s", 0x2e, []>;
+defm LOAD16_U_I32 : WebAssemblyLoad<I32, "i32.load16_u", 0x2f, []>;
+defm LOAD8_S_I64 : WebAssemblyLoad<I64, "i64.load8_s", 0x30, []>;
+defm LOAD8_U_I64 : WebAssemblyLoad<I64, "i64.load8_u", 0x31, []>;
+defm LOAD16_S_I64 : WebAssemblyLoad<I64, "i64.load16_s", 0x32, []>;
+defm LOAD16_U_I64 : WebAssemblyLoad<I64, "i64.load16_u", 0x33, []>;
+defm LOAD32_S_I64 : WebAssemblyLoad<I64, "i64.load32_s", 0x34, []>;
+defm LOAD32_U_I64 : WebAssemblyLoad<I64, "i64.load32_u", 0x35, []>;
// Select extending loads with no constant offset.
-def : LoadPatNoOffset<i32, sextloadi8, LOAD8_S_I32>;
-def : LoadPatNoOffset<i32, zextloadi8, LOAD8_U_I32>;
-def : LoadPatNoOffset<i32, sextloadi16, LOAD16_S_I32>;
-def : LoadPatNoOffset<i32, zextloadi16, LOAD16_U_I32>;
-def : LoadPatNoOffset<i64, sextloadi8, LOAD8_S_I64>;
-def : LoadPatNoOffset<i64, zextloadi8, LOAD8_U_I64>;
-def : LoadPatNoOffset<i64, sextloadi16, LOAD16_S_I64>;
-def : LoadPatNoOffset<i64, zextloadi16, LOAD16_U_I64>;
-def : LoadPatNoOffset<i64, sextloadi32, LOAD32_S_I64>;
-def : LoadPatNoOffset<i64, zextloadi32, LOAD32_U_I64>;
+defm : LoadPatNoOffset<i32, sextloadi8, "LOAD8_S_I32">;
+defm : LoadPatNoOffset<i32, zextloadi8, "LOAD8_U_I32">;
+defm : LoadPatNoOffset<i32, sextloadi16, "LOAD16_S_I32">;
+defm : LoadPatNoOffset<i32, zextloadi16, "LOAD16_U_I32">;
+defm : LoadPatNoOffset<i64, sextloadi8, "LOAD8_S_I64">;
+defm : LoadPatNoOffset<i64, zextloadi8, "LOAD8_U_I64">;
+defm : LoadPatNoOffset<i64, sextloadi16, "LOAD16_S_I64">;
+defm : LoadPatNoOffset<i64, zextloadi16, "LOAD16_U_I64">;
+defm : LoadPatNoOffset<i64, sextloadi32, "LOAD32_S_I64">;
+defm : LoadPatNoOffset<i64, zextloadi32, "LOAD32_U_I64">;
// Select extending loads with a constant offset.
-def : LoadPatImmOff<i32, sextloadi8, regPlusImm, LOAD8_S_I32>;
-def : LoadPatImmOff<i32, zextloadi8, regPlusImm, LOAD8_U_I32>;
-def : LoadPatImmOff<i32, sextloadi16, regPlusImm, LOAD16_S_I32>;
-def : LoadPatImmOff<i32, zextloadi16, regPlusImm, LOAD16_U_I32>;
-def : LoadPatImmOff<i64, sextloadi8, regPlusImm, LOAD8_S_I64>;
-def : LoadPatImmOff<i64, zextloadi8, regPlusImm, LOAD8_U_I64>;
-def : LoadPatImmOff<i64, sextloadi16, regPlusImm, LOAD16_S_I64>;
-def : LoadPatImmOff<i64, zextloadi16, regPlusImm, LOAD16_U_I64>;
-def : LoadPatImmOff<i64, sextloadi32, regPlusImm, LOAD32_S_I64>;
-def : LoadPatImmOff<i64, zextloadi32, regPlusImm, LOAD32_U_I64>;
-
-def : LoadPatImmOff<i32, sextloadi8, or_is_add, LOAD8_S_I32>;
-def : LoadPatImmOff<i32, zextloadi8, or_is_add, LOAD8_U_I32>;
-def : LoadPatImmOff<i32, sextloadi16, or_is_add, LOAD16_S_I32>;
-def : LoadPatImmOff<i32, zextloadi16, or_is_add, LOAD16_U_I32>;
-def : LoadPatImmOff<i64, sextloadi8, or_is_add, LOAD8_S_I64>;
-def : LoadPatImmOff<i64, zextloadi8, or_is_add, LOAD8_U_I64>;
-def : LoadPatImmOff<i64, sextloadi16, or_is_add, LOAD16_S_I64>;
-def : LoadPatImmOff<i64, zextloadi16, or_is_add, LOAD16_U_I64>;
-def : LoadPatImmOff<i64, sextloadi32, or_is_add, LOAD32_S_I64>;
-def : LoadPatImmOff<i64, zextloadi32, or_is_add, LOAD32_U_I64>;
+defm : LoadPatImmOff<i32, sextloadi8, regPlusImm, "LOAD8_S_I32">;
+defm : LoadPatImmOff<i32, zextloadi8, regPlusImm, "LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, sextloadi16, regPlusImm, "LOAD16_S_I32">;
+defm : LoadPatImmOff<i32, zextloadi16, regPlusImm, "LOAD16_U_I32">;
+defm : LoadPatImmOff<i64, sextloadi8, regPlusImm, "LOAD8_S_I64">;
+defm : LoadPatImmOff<i64, zextloadi8, regPlusImm, "LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, sextloadi16, regPlusImm, "LOAD16_S_I64">;
+defm : LoadPatImmOff<i64, zextloadi16, regPlusImm, "LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, sextloadi32, regPlusImm, "LOAD32_S_I64">;
+defm : LoadPatImmOff<i64, zextloadi32, regPlusImm, "LOAD32_U_I64">;
+
+defm : LoadPatImmOff<i32, sextloadi8, or_is_add, "LOAD8_S_I32">;
+defm : LoadPatImmOff<i32, zextloadi8, or_is_add, "LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, sextloadi16, or_is_add, "LOAD16_S_I32">;
+defm : LoadPatImmOff<i32, zextloadi16, or_is_add, "LOAD16_U_I32">;
+defm : LoadPatImmOff<i64, sextloadi8, or_is_add, "LOAD8_S_I64">;
+defm : LoadPatImmOff<i64, zextloadi8, or_is_add, "LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, sextloadi16, or_is_add, "LOAD16_S_I64">;
+defm : LoadPatImmOff<i64, zextloadi16, or_is_add, "LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, sextloadi32, or_is_add, "LOAD32_S_I64">;
+defm : LoadPatImmOff<i64, zextloadi32, or_is_add, "LOAD32_U_I64">;
// Select extending loads with just a constant offset.
-def : LoadPatOffsetOnly<i32, sextloadi8, LOAD8_S_I32>;
-def : LoadPatOffsetOnly<i32, zextloadi8, LOAD8_U_I32>;
-def : LoadPatOffsetOnly<i32, sextloadi16, LOAD16_S_I32>;
-def : LoadPatOffsetOnly<i32, zextloadi16, LOAD16_U_I32>;
-
-def : LoadPatOffsetOnly<i64, sextloadi8, LOAD8_S_I64>;
-def : LoadPatOffsetOnly<i64, zextloadi8, LOAD8_U_I64>;
-def : LoadPatOffsetOnly<i64, sextloadi16, LOAD16_S_I64>;
-def : LoadPatOffsetOnly<i64, zextloadi16, LOAD16_U_I64>;
-def : LoadPatOffsetOnly<i64, sextloadi32, LOAD32_S_I64>;
-def : LoadPatOffsetOnly<i64, zextloadi32, LOAD32_U_I64>;
-
-def : LoadPatGlobalAddrOffOnly<i32, sextloadi8, LOAD8_S_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, zextloadi8, LOAD8_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, sextloadi16, LOAD16_S_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, zextloadi16, LOAD16_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, sextloadi8, LOAD8_S_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, zextloadi8, LOAD8_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, sextloadi16, LOAD16_S_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, zextloadi16, LOAD16_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, sextloadi32, LOAD32_S_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, zextloadi32, LOAD32_U_I64>;
+defm : LoadPatOffsetOnly<i32, sextloadi8, "LOAD8_S_I32">;
+defm : LoadPatOffsetOnly<i32, zextloadi8, "LOAD8_U_I32">;
+defm : LoadPatOffsetOnly<i32, sextloadi16, "LOAD16_S_I32">;
+defm : LoadPatOffsetOnly<i32, zextloadi16, "LOAD16_U_I32">;
+
+defm : LoadPatOffsetOnly<i64, sextloadi8, "LOAD8_S_I64">;
+defm : LoadPatOffsetOnly<i64, zextloadi8, "LOAD8_U_I64">;
+defm : LoadPatOffsetOnly<i64, sextloadi16, "LOAD16_S_I64">;
+defm : LoadPatOffsetOnly<i64, zextloadi16, "LOAD16_U_I64">;
+defm : LoadPatOffsetOnly<i64, sextloadi32, "LOAD32_S_I64">;
+defm : LoadPatOffsetOnly<i64, zextloadi32, "LOAD32_U_I64">;
+
+defm : LoadPatGlobalAddrOffOnly<i32, sextloadi8, "LOAD8_S_I32">;
+defm : LoadPatGlobalAddrOffOnly<i32, zextloadi8, "LOAD8_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i32, sextloadi16, "LOAD16_S_I32">;
+defm : LoadPatGlobalAddrOffOnly<i32, zextloadi16, "LOAD16_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i64, sextloadi8, "LOAD8_S_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, zextloadi8, "LOAD8_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, sextloadi16, "LOAD16_S_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, zextloadi16, "LOAD16_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, sextloadi32, "LOAD32_S_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, zextloadi32, "LOAD32_U_I64">;
// Resolve "don't care" extending loads to zero-extending loads. This is
// somewhat arbitrary, but zero-extending is conceptually simpler.
// Select "don't care" extending loads with no constant offset.
-def : LoadPatNoOffset<i32, extloadi8, LOAD8_U_I32>;
-def : LoadPatNoOffset<i32, extloadi16, LOAD16_U_I32>;
-def : LoadPatNoOffset<i64, extloadi8, LOAD8_U_I64>;
-def : LoadPatNoOffset<i64, extloadi16, LOAD16_U_I64>;
-def : LoadPatNoOffset<i64, extloadi32, LOAD32_U_I64>;
+defm : LoadPatNoOffset<i32, extloadi8, "LOAD8_U_I32">;
+defm : LoadPatNoOffset<i32, extloadi16, "LOAD16_U_I32">;
+defm : LoadPatNoOffset<i64, extloadi8, "LOAD8_U_I64">;
+defm : LoadPatNoOffset<i64, extloadi16, "LOAD16_U_I64">;
+defm : LoadPatNoOffset<i64, extloadi32, "LOAD32_U_I64">;
// Select "don't care" extending loads with a constant offset.
-def : LoadPatImmOff<i32, extloadi8, regPlusImm, LOAD8_U_I32>;
-def : LoadPatImmOff<i32, extloadi16, regPlusImm, LOAD16_U_I32>;
-def : LoadPatImmOff<i64, extloadi8, regPlusImm, LOAD8_U_I64>;
-def : LoadPatImmOff<i64, extloadi16, regPlusImm, LOAD16_U_I64>;
-def : LoadPatImmOff<i64, extloadi32, regPlusImm, LOAD32_U_I64>;
-def : LoadPatImmOff<i32, extloadi8, or_is_add, LOAD8_U_I32>;
-def : LoadPatImmOff<i32, extloadi16, or_is_add, LOAD16_U_I32>;
-def : LoadPatImmOff<i64, extloadi8, or_is_add, LOAD8_U_I64>;
-def : LoadPatImmOff<i64, extloadi16, or_is_add, LOAD16_U_I64>;
-def : LoadPatImmOff<i64, extloadi32, or_is_add, LOAD32_U_I64>;
+defm : LoadPatImmOff<i32, extloadi8, regPlusImm, "LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, extloadi16, regPlusImm, "LOAD16_U_I32">;
+defm : LoadPatImmOff<i64, extloadi8, regPlusImm, "LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, extloadi16, regPlusImm, "LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, extloadi32, regPlusImm, "LOAD32_U_I64">;
+defm : LoadPatImmOff<i32, extloadi8, or_is_add, "LOAD8_U_I32">;
+defm : LoadPatImmOff<i32, extloadi16, or_is_add, "LOAD16_U_I32">;
+defm : LoadPatImmOff<i64, extloadi8, or_is_add, "LOAD8_U_I64">;
+defm : LoadPatImmOff<i64, extloadi16, or_is_add, "LOAD16_U_I64">;
+defm : LoadPatImmOff<i64, extloadi32, or_is_add, "LOAD32_U_I64">;
// Select "don't care" extending loads with just a constant offset.
-def : LoadPatOffsetOnly<i32, extloadi8, LOAD8_U_I32>;
-def : LoadPatOffsetOnly<i32, extloadi16, LOAD16_U_I32>;
-def : LoadPatOffsetOnly<i64, extloadi8, LOAD8_U_I64>;
-def : LoadPatOffsetOnly<i64, extloadi16, LOAD16_U_I64>;
-def : LoadPatOffsetOnly<i64, extloadi32, LOAD32_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i32, extloadi8, LOAD8_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i32, extloadi16, LOAD16_U_I32>;
-def : LoadPatGlobalAddrOffOnly<i64, extloadi8, LOAD8_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, extloadi16, LOAD16_U_I64>;
-def : LoadPatGlobalAddrOffOnly<i64, extloadi32, LOAD32_U_I64>;
+defm : LoadPatOffsetOnly<i32, extloadi8, "LOAD8_U_I32">;
+defm : LoadPatOffsetOnly<i32, extloadi16, "LOAD16_U_I32">;
+defm : LoadPatOffsetOnly<i64, extloadi8, "LOAD8_U_I64">;
+defm : LoadPatOffsetOnly<i64, extloadi16, "LOAD16_U_I64">;
+defm : LoadPatOffsetOnly<i64, extloadi32, "LOAD32_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i32, extloadi8, "LOAD8_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i32, extloadi16, "LOAD16_U_I32">;
+defm : LoadPatGlobalAddrOffOnly<i64, extloadi8, "LOAD8_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, extloadi16, "LOAD16_U_I64">;
+defm : LoadPatGlobalAddrOffOnly<i64, extloadi32, "LOAD32_U_I64">;
// Defines atomic and non-atomic stores, regular and truncating
-multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode> {
+multiclass WebAssemblyStore<WebAssemblyRegClass rc, string Name, int Opcode,
+ list<Predicate> reqs = []> {
let mayStore = 1, UseNamedOperandTable = 1 in
- defm "" : I<(outs),
- (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
- (outs),
- (ins P2Align:$p2align, offset32_op:$off), [],
- !strconcat(Name, "\t${off}(${addr})${p2align}, $val"),
- !strconcat(Name, "\t${off}${p2align}"), Opcode>;
+ defm "_A32" : I<(outs),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr, rc:$val),
+ (outs),
+ (ins P2Align:$p2align, offset32_op:$off), [],
+ !strconcat(Name, "\t${off}(${addr})${p2align}, $val"),
+ !strconcat(Name, "\t${off}${p2align}"), Opcode, "false">,
+ Requires<reqs>;
+ let mayStore = 1, UseNamedOperandTable = 1 in
+ defm "_A64" : I<(outs),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr, rc:$val),
+ (outs),
+ (ins P2Align:$p2align, offset64_op:$off), [],
+ !strconcat(Name, "\t${off}(${addr})${p2align}, $val"),
+ !strconcat(Name, "\t${off}${p2align}"), Opcode, "true">,
+ Requires<reqs>;
}
+
// Basic store.
// Note: WebAssembly inverts SelectionDAG's usual operand order.
defm STORE_I32 : WebAssemblyStore<I32, "i32.store", 0x36>;
@@ -224,43 +264,68 @@ defm STORE_F32 : WebAssemblyStore<F32, "f32.store", 0x38>;
defm STORE_F64 : WebAssemblyStore<F64, "f64.store", 0x39>;
// Select stores with no constant offset.
-class StorePatNoOffset<ValueType ty, PatFrag node, NI inst> :
- Pat<(node ty:$val, I32:$addr), (inst 0, 0, I32:$addr, ty:$val)>;
+multiclass StorePatNoOffset<ValueType ty, PatFrag node, string inst> {
+ def : Pat<(node ty:$val, I32:$addr),
+ (!cast<NI>(inst # "_A32") 0, 0, I32:$addr, ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(node ty:$val, I64:$addr),
+ (!cast<NI>(inst # "_A64") 0, 0, I64:$addr, ty:$val)>,
+ Requires<[HasAddr64]>;
+}
-def : StorePatNoOffset<i32, store, STORE_I32>;
-def : StorePatNoOffset<i64, store, STORE_I64>;
-def : StorePatNoOffset<f32, store, STORE_F32>;
-def : StorePatNoOffset<f64, store, STORE_F64>;
+defm : StorePatNoOffset<i32, store, "STORE_I32">;
+defm : StorePatNoOffset<i64, store, "STORE_I64">;
+defm : StorePatNoOffset<f32, store, "STORE_F32">;
+defm : StorePatNoOffset<f64, store, "STORE_F64">;
// Select stores with a constant offset.
-class StorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand, NI inst> :
- Pat<(kind ty:$val, (operand I32:$addr, imm:$off)),
- (inst 0, imm:$off, I32:$addr, ty:$val)>;
-
-def : StorePatImmOff<i32, store, regPlusImm, STORE_I32>;
-def : StorePatImmOff<i64, store, regPlusImm, STORE_I64>;
-def : StorePatImmOff<f32, store, regPlusImm, STORE_F32>;
-def : StorePatImmOff<f64, store, regPlusImm, STORE_F64>;
-def : StorePatImmOff<i32, store, or_is_add, STORE_I32>;
-def : StorePatImmOff<i64, store, or_is_add, STORE_I64>;
-def : StorePatImmOff<f32, store, or_is_add, STORE_F32>;
-def : StorePatImmOff<f64, store, or_is_add, STORE_F64>;
+multiclass StorePatImmOff<ValueType ty, PatFrag kind, PatFrag operand,
+ string inst> {
+ def : Pat<(kind ty:$val, (operand I32:$addr, imm:$off)),
+ (!cast<NI>(inst # "_A32") 0, imm:$off, I32:$addr, ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind ty:$val, (operand I64:$addr, imm:$off)),
+ (!cast<NI>(inst # "_A64") 0, imm:$off, I64:$addr, ty:$val)>,
+ Requires<[HasAddr64]>;
+}
+
+defm : StorePatImmOff<i32, store, regPlusImm, "STORE_I32">;
+defm : StorePatImmOff<i64, store, regPlusImm, "STORE_I64">;
+defm : StorePatImmOff<f32, store, regPlusImm, "STORE_F32">;
+defm : StorePatImmOff<f64, store, regPlusImm, "STORE_F64">;
+defm : StorePatImmOff<i32, store, or_is_add, "STORE_I32">;
+defm : StorePatImmOff<i64, store, or_is_add, "STORE_I64">;
+defm : StorePatImmOff<f32, store, or_is_add, "STORE_F32">;
+defm : StorePatImmOff<f64, store, or_is_add, "STORE_F64">;
// Select stores with just a constant offset.
-class StorePatOffsetOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind ty:$val, imm:$off), (inst 0, imm:$off, (CONST_I32 0), ty:$val)>;
-def : StorePatOffsetOnly<i32, store, STORE_I32>;
-def : StorePatOffsetOnly<i64, store, STORE_I64>;
-def : StorePatOffsetOnly<f32, store, STORE_F32>;
-def : StorePatOffsetOnly<f64, store, STORE_F64>;
-
-class StorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, NI inst> :
- Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
- (inst 0, tglobaladdr:$off, (CONST_I32 0), ty:$val)>, Requires<[IsNotPIC]>;
-def : StorePatGlobalAddrOffOnly<i32, store, STORE_I32>;
-def : StorePatGlobalAddrOffOnly<i64, store, STORE_I64>;
-def : StorePatGlobalAddrOffOnly<f32, store, STORE_F32>;
-def : StorePatGlobalAddrOffOnly<f64, store, STORE_F64>;
+multiclass StorePatOffsetOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(kind ty:$val, imm:$off),
+ (!cast<NI>(inst # "_A32") 0, imm:$off, (CONST_I32 0), ty:$val)>,
+ Requires<[HasAddr32]>;
+ def : Pat<(kind ty:$val, imm:$off),
+ (!cast<NI>(inst # "_A64") 0, imm:$off, (CONST_I64 0), ty:$val)>,
+ Requires<[HasAddr64]>;
+}
+defm : StorePatOffsetOnly<i32, store, "STORE_I32">;
+defm : StorePatOffsetOnly<i64, store, "STORE_I64">;
+defm : StorePatOffsetOnly<f32, store, "STORE_F32">;
+defm : StorePatOffsetOnly<f64, store, "STORE_F64">;
+
+multiclass StorePatGlobalAddrOffOnly<ValueType ty, PatFrag kind, string inst> {
+ def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (!cast<NI>(inst # "_A32") 0, tglobaladdr:$off, (CONST_I32 0),
+ ty:$val)>,
+ Requires<[IsNotPIC, HasAddr32]>;
+ def : Pat<(kind ty:$val, (WebAssemblywrapper tglobaladdr:$off)),
+ (!cast<NI>(inst # "_A64") 0, tglobaladdr:$off, (CONST_I64 0),
+ ty:$val)>,
+ Requires<[IsNotPIC, HasAddr64]>;
+}
+defm : StorePatGlobalAddrOffOnly<i32, store, "STORE_I32">;
+defm : StorePatGlobalAddrOffOnly<i64, store, "STORE_I64">;
+defm : StorePatGlobalAddrOffOnly<f32, store, "STORE_F32">;
+defm : StorePatGlobalAddrOffOnly<f64, store, "STORE_F64">;
// Truncating store.
defm STORE8_I32 : WebAssemblyStore<I32, "i32.store8", 0x3a>;
@@ -270,51 +335,54 @@ defm STORE16_I64 : WebAssemblyStore<I64, "i64.store16", 0x3d>;
defm STORE32_I64 : WebAssemblyStore<I64, "i64.store32", 0x3e>;
// Select truncating stores with no constant offset.
-def : StorePatNoOffset<i32, truncstorei8, STORE8_I32>;
-def : StorePatNoOffset<i32, truncstorei16, STORE16_I32>;
-def : StorePatNoOffset<i64, truncstorei8, STORE8_I64>;
-def : StorePatNoOffset<i64, truncstorei16, STORE16_I64>;
-def : StorePatNoOffset<i64, truncstorei32, STORE32_I64>;
+defm : StorePatNoOffset<i32, truncstorei8, "STORE8_I32">;
+defm : StorePatNoOffset<i32, truncstorei16, "STORE16_I32">;
+defm : StorePatNoOffset<i64, truncstorei8, "STORE8_I64">;
+defm : StorePatNoOffset<i64, truncstorei16, "STORE16_I64">;
+defm : StorePatNoOffset<i64, truncstorei32, "STORE32_I64">;
// Select truncating stores with a constant offset.
-def : StorePatImmOff<i32, truncstorei8, regPlusImm, STORE8_I32>;
-def : StorePatImmOff<i32, truncstorei16, regPlusImm, STORE16_I32>;
-def : StorePatImmOff<i64, truncstorei8, regPlusImm, STORE8_I64>;
-def : StorePatImmOff<i64, truncstorei16, regPlusImm, STORE16_I64>;
-def : StorePatImmOff<i64, truncstorei32, regPlusImm, STORE32_I64>;
-def : StorePatImmOff<i32, truncstorei8, or_is_add, STORE8_I32>;
-def : StorePatImmOff<i32, truncstorei16, or_is_add, STORE16_I32>;
-def : StorePatImmOff<i64, truncstorei8, or_is_add, STORE8_I64>;
-def : StorePatImmOff<i64, truncstorei16, or_is_add, STORE16_I64>;
-def : StorePatImmOff<i64, truncstorei32, or_is_add, STORE32_I64>;
+defm : StorePatImmOff<i32, truncstorei8, regPlusImm, "STORE8_I32">;
+defm : StorePatImmOff<i32, truncstorei16, regPlusImm, "STORE16_I32">;
+defm : StorePatImmOff<i64, truncstorei8, regPlusImm, "STORE8_I64">;
+defm : StorePatImmOff<i64, truncstorei16, regPlusImm, "STORE16_I64">;
+defm : StorePatImmOff<i64, truncstorei32, regPlusImm, "STORE32_I64">;
+defm : StorePatImmOff<i32, truncstorei8, or_is_add, "STORE8_I32">;
+defm : StorePatImmOff<i32, truncstorei16, or_is_add, "STORE16_I32">;
+defm : StorePatImmOff<i64, truncstorei8, or_is_add, "STORE8_I64">;
+defm : StorePatImmOff<i64, truncstorei16, or_is_add, "STORE16_I64">;
+defm : StorePatImmOff<i64, truncstorei32, or_is_add, "STORE32_I64">;
// Select truncating stores with just a constant offset.
-def : StorePatOffsetOnly<i32, truncstorei8, STORE8_I32>;
-def : StorePatOffsetOnly<i32, truncstorei16, STORE16_I32>;
-def : StorePatOffsetOnly<i64, truncstorei8, STORE8_I64>;
-def : StorePatOffsetOnly<i64, truncstorei16, STORE16_I64>;
-def : StorePatOffsetOnly<i64, truncstorei32, STORE32_I64>;
-def : StorePatGlobalAddrOffOnly<i32, truncstorei8, STORE8_I32>;
-def : StorePatGlobalAddrOffOnly<i32, truncstorei16, STORE16_I32>;
-def : StorePatGlobalAddrOffOnly<i64, truncstorei8, STORE8_I64>;
-def : StorePatGlobalAddrOffOnly<i64, truncstorei16, STORE16_I64>;
-def : StorePatGlobalAddrOffOnly<i64, truncstorei32, STORE32_I64>;
-
+defm : StorePatOffsetOnly<i32, truncstorei8, "STORE8_I32">;
+defm : StorePatOffsetOnly<i32, truncstorei16, "STORE16_I32">;
+defm : StorePatOffsetOnly<i64, truncstorei8, "STORE8_I64">;
+defm : StorePatOffsetOnly<i64, truncstorei16, "STORE16_I64">;
+defm : StorePatOffsetOnly<i64, truncstorei32, "STORE32_I64">;
+defm : StorePatGlobalAddrOffOnly<i32, truncstorei8, "STORE8_I32">;
+defm : StorePatGlobalAddrOffOnly<i32, truncstorei16, "STORE16_I32">;
+defm : StorePatGlobalAddrOffOnly<i64, truncstorei8, "STORE8_I64">;
+defm : StorePatGlobalAddrOffOnly<i64, truncstorei16, "STORE16_I64">;
+defm : StorePatGlobalAddrOffOnly<i64, truncstorei32, "STORE32_I64">;
+
+multiclass MemoryOps<WebAssemblyRegClass rc, string B> {
// Current memory size.
-defm MEMORY_SIZE_I32 : I<(outs I32:$dst), (ins i32imm:$flags),
+defm MEMORY_SIZE_A#B : I<(outs rc:$dst), (ins i32imm:$flags),
(outs), (ins i32imm:$flags),
- [(set I32:$dst,
+ [(set rc:$dst,
(int_wasm_memory_size (i32 imm:$flags)))],
"memory.size\t$dst, $flags", "memory.size\t$flags",
- 0x3f>,
- Requires<[HasAddr32]>;
+ 0x3f>;
// Grow memory.
-defm MEMORY_GROW_I32 : I<(outs I32:$dst), (ins i32imm:$flags, I32:$delta),
+defm MEMORY_GROW_A#B : I<(outs rc:$dst), (ins i32imm:$flags, rc:$delta),
(outs), (ins i32imm:$flags),
- [(set I32:$dst,
+ [(set rc:$dst,
(int_wasm_memory_grow (i32 imm:$flags),
- I32:$delta))],
+ rc:$delta))],
"memory.grow\t$dst, $flags, $delta",
- "memory.grow\t$flags", 0x40>,
- Requires<[HasAddr32]>;
+ "memory.grow\t$flags", 0x40>;
+}
+
+defm : MemoryOps<I32, "32">;
+defm : MemoryOps<I64, "64">;
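(Editor's note, not part of the patch.) The net effect of the *_A32/_A64 multiclass split above is that each store pattern now resolves to one of two concrete instructions, gated by the HasAddr32/HasAddr64 predicates. A minimal standalone C++ sketch of that dispatch, with illustrative names only (not LLVM API):

  #include <string>

  // Models how a pattern keyed by a base name ("STORE_I32") expands to the
  // address-width-specific instruction, mirroring the multiclasses above.
  std::string selectStoreVariant(const std::string &BaseName, bool IsWasm64) {
    // HasAddr64 subtargets use i64 addresses; HasAddr32 subtargets use i32.
    return BaseName + (IsWasm64 ? "_A64" : "_A32");
  }
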
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
index afe89de60b36..14d723750f07 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrRef.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
///
/// \file
-/// WebAssembly refence type operand codegen constructs.
+/// WebAssembly reference type operand codegen constructs.
///
//===----------------------------------------------------------------------===//
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index 64033c993e3f..4f3da2f35c61 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -40,119 +40,150 @@ def LaneIdx#SIZE : ImmLeaf<i32, "return 0 <= Imm && Imm < "#SIZE#";">;
//===----------------------------------------------------------------------===//
// Load: v128.load
-let mayLoad = 1, UseNamedOperandTable = 1 in
-defm LOAD_V128 :
+let mayLoad = 1, UseNamedOperandTable = 1 in {
+defm LOAD_V128_A32 :
SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
"v128.load\t$dst, ${off}(${addr})$p2align",
"v128.load\t$off$p2align", 0>;
+defm LOAD_V128_A64 :
+ SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "v128.load\t$dst, ${off}(${addr})$p2align",
+ "v128.load\t$off$p2align", 0>;
+}
// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
-def : LoadPatNoOffset<vec_t, load, LOAD_V128>;
-def : LoadPatImmOff<vec_t, load, regPlusImm, LOAD_V128>;
-def : LoadPatImmOff<vec_t, load, or_is_add, LOAD_V128>;
-def : LoadPatOffsetOnly<vec_t, load, LOAD_V128>;
-def : LoadPatGlobalAddrOffOnly<vec_t, load, LOAD_V128>;
+defm : LoadPatNoOffset<vec_t, load, "LOAD_V128">;
+defm : LoadPatImmOff<vec_t, load, regPlusImm, "LOAD_V128">;
+defm : LoadPatImmOff<vec_t, load, or_is_add, "LOAD_V128">;
+defm : LoadPatOffsetOnly<vec_t, load, "LOAD_V128">;
+defm : LoadPatGlobalAddrOffOnly<vec_t, load, "LOAD_V128">;
}
// vNxM.load_splat
multiclass SIMDLoadSplat<string vec, bits<32> simdop> {
- let mayLoad = 1, UseNamedOperandTable = 1,
- Predicates = [HasUnimplementedSIMD128] in
- defm LOAD_SPLAT_#vec :
- SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
- (outs), (ins P2Align:$p2align, offset32_op:$off), [],
+ let mayLoad = 1, UseNamedOperandTable = 1 in {
+ defm LOAD_SPLAT_#vec#_A32 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ (outs),
+ (ins P2Align:$p2align, offset32_op:$off), [],
+ vec#".load_splat\t$dst, ${off}(${addr})$p2align",
+ vec#".load_splat\t$off$p2align", simdop>;
+ defm LOAD_SPLAT_#vec#_A64 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs),
+ (ins P2Align:$p2align, offset64_op:$off), [],
vec#".load_splat\t$dst, ${off}(${addr})$p2align",
vec#".load_splat\t$off$p2align", simdop>;
+ }
}
-defm "" : SIMDLoadSplat<"v8x16", 194>;
-defm "" : SIMDLoadSplat<"v16x8", 195>;
-defm "" : SIMDLoadSplat<"v32x4", 196>;
-defm "" : SIMDLoadSplat<"v64x2", 197>;
+defm "" : SIMDLoadSplat<"v8x16", 7>;
+defm "" : SIMDLoadSplat<"v16x8", 8>;
+defm "" : SIMDLoadSplat<"v32x4", 9>;
+defm "" : SIMDLoadSplat<"v64x2", 10>;
def wasm_load_splat_t : SDTypeProfile<1, 1, [SDTCisPtrTy<1>]>;
def wasm_load_splat : SDNode<"WebAssemblyISD::LOAD_SPLAT", wasm_load_splat_t,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def load_splat : PatFrag<(ops node:$addr), (wasm_load_splat node:$addr)>;
-let Predicates = [HasUnimplementedSIMD128] in
foreach args = [["v16i8", "v8x16"], ["v8i16", "v16x8"], ["v4i32", "v32x4"],
["v2i64", "v64x2"], ["v4f32", "v32x4"], ["v2f64", "v64x2"]] in {
-def : LoadPatNoOffset<!cast<ValueType>(args[0]),
- load_splat,
- !cast<NI>("LOAD_SPLAT_"#args[1])>;
-def : LoadPatImmOff<!cast<ValueType>(args[0]),
- load_splat,
- regPlusImm,
- !cast<NI>("LOAD_SPLAT_"#args[1])>;
-def : LoadPatImmOff<!cast<ValueType>(args[0]),
- load_splat,
- or_is_add,
- !cast<NI>("LOAD_SPLAT_"#args[1])>;
-def : LoadPatOffsetOnly<!cast<ValueType>(args[0]),
- load_splat,
- !cast<NI>("LOAD_SPLAT_"#args[1])>;
-def : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]),
- load_splat,
- !cast<NI>("LOAD_SPLAT_"#args[1])>;
+defm : LoadPatNoOffset<!cast<ValueType>(args[0]),
+ load_splat,
+ "LOAD_SPLAT_"#args[1]>;
+defm : LoadPatImmOff<!cast<ValueType>(args[0]),
+ load_splat,
+ regPlusImm,
+ "LOAD_SPLAT_"#args[1]>;
+defm : LoadPatImmOff<!cast<ValueType>(args[0]),
+ load_splat,
+ or_is_add,
+ "LOAD_SPLAT_"#args[1]>;
+defm : LoadPatOffsetOnly<!cast<ValueType>(args[0]),
+ load_splat,
+ "LOAD_SPLAT_"#args[1]>;
+defm : LoadPatGlobalAddrOffOnly<!cast<ValueType>(args[0]),
+ load_splat,
+ "LOAD_SPLAT_"#args[1]>;
}
// Load and extend
multiclass SIMDLoadExtend<ValueType vec_t, string name, bits<32> simdop> {
- let mayLoad = 1, UseNamedOperandTable = 1,
- Predicates = [HasUnimplementedSIMD128] in {
- defm LOAD_EXTEND_S_#vec_t :
- SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ let mayLoad = 1, UseNamedOperandTable = 1 in {
+ defm LOAD_EXTEND_S_#vec_t#_A32 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
name#"_s\t$dst, ${off}(${addr})$p2align",
name#"_s\t$off$p2align", simdop>;
- defm LOAD_EXTEND_U_#vec_t :
- SIMD_I<(outs V128:$dst), (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
+ defm LOAD_EXTEND_U_#vec_t#_A32 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset32_op:$off, I32:$addr),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
name#"_u\t$dst, ${off}(${addr})$p2align",
name#"_u\t$off$p2align", !add(simdop, 1)>;
+ defm LOAD_EXTEND_S_#vec_t#_A64 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ name#"_s\t$dst, ${off}(${addr})$p2align",
+ name#"_s\t$off$p2align", simdop>;
+ defm LOAD_EXTEND_U_#vec_t#_A64 :
+ SIMD_I<(outs V128:$dst),
+ (ins P2Align:$p2align, offset64_op:$off, I64:$addr),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ name#"_u\t$dst, ${off}(${addr})$p2align",
+ name#"_u\t$off$p2align", !add(simdop, 1)>;
}
}
-defm "" : SIMDLoadExtend<v8i16, "i16x8.load8x8", 210>;
-defm "" : SIMDLoadExtend<v4i32, "i32x4.load16x4", 212>;
-defm "" : SIMDLoadExtend<v2i64, "i64x2.load32x2", 214>;
+defm "" : SIMDLoadExtend<v8i16, "i16x8.load8x8", 1>;
+defm "" : SIMDLoadExtend<v4i32, "i32x4.load16x4", 3>;
+defm "" : SIMDLoadExtend<v2i64, "i64x2.load32x2", 5>;
-let Predicates = [HasUnimplementedSIMD128] in
foreach types = [[v8i16, i8], [v4i32, i16], [v2i64, i32]] in
foreach exts = [["sextloadv", "_S"],
["zextloadv", "_U"],
["extloadv", "_U"]] in {
-def : LoadPatNoOffset<types[0], !cast<PatFrag>(exts[0]#types[1]),
- !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
-def : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), regPlusImm,
- !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
-def : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), or_is_add,
- !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
-def : LoadPatOffsetOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
- !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
-def : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
- !cast<NI>("LOAD_EXTEND"#exts[1]#"_"#types[0])>;
+defm : LoadPatNoOffset<types[0], !cast<PatFrag>(exts[0]#types[1]),
+ "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
+defm : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), regPlusImm,
+ "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
+defm : LoadPatImmOff<types[0], !cast<PatFrag>(exts[0]#types[1]), or_is_add,
+ "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
+defm : LoadPatOffsetOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
+ "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
+defm : LoadPatGlobalAddrOffOnly<types[0], !cast<PatFrag>(exts[0]#types[1]),
+ "LOAD_EXTEND"#exts[1]#"_"#types[0]>;
}
// Store: v128.store
-let mayStore = 1, UseNamedOperandTable = 1 in
-defm STORE_V128 :
+let mayStore = 1, UseNamedOperandTable = 1 in {
+defm STORE_V128_A32 :
SIMD_I<(outs), (ins P2Align:$p2align, offset32_op:$off, I32:$addr, V128:$vec),
(outs), (ins P2Align:$p2align, offset32_op:$off), [],
"v128.store\t${off}(${addr})$p2align, $vec",
- "v128.store\t$off$p2align", 1>;
-
+ "v128.store\t$off$p2align", 11>;
+defm STORE_V128_A64 :
+ SIMD_I<(outs), (ins P2Align:$p2align, offset64_op:$off, I64:$addr, V128:$vec),
+ (outs), (ins P2Align:$p2align, offset64_op:$off), [],
+ "v128.store\t${off}(${addr})$p2align, $vec",
+ "v128.store\t$off$p2align", 11>;
+}
foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in {
// Def load and store patterns from WebAssemblyInstrMemory.td for vector types
-def : StorePatNoOffset<vec_t, store, STORE_V128>;
-def : StorePatImmOff<vec_t, store, regPlusImm, STORE_V128>;
-def : StorePatImmOff<vec_t, store, or_is_add, STORE_V128>;
-def : StorePatOffsetOnly<vec_t, store, STORE_V128>;
-def : StorePatGlobalAddrOffOnly<vec_t, store, STORE_V128>;
+defm : StorePatNoOffset<vec_t, store, "STORE_V128">;
+defm : StorePatImmOff<vec_t, store, regPlusImm, "STORE_V128">;
+defm : StorePatImmOff<vec_t, store, or_is_add, "STORE_V128">;
+defm : StorePatOffsetOnly<vec_t, store, "STORE_V128">;
+defm : StorePatGlobalAddrOffOnly<vec_t, store, "STORE_V128">;
}
//===----------------------------------------------------------------------===//
@@ -166,7 +197,7 @@ multiclass ConstVec<ValueType vec_t, dag ops, dag pat, string args> {
defm CONST_V128_#vec_t : SIMD_I<(outs V128:$dst), ops, (outs), ops,
[(set V128:$dst, (vec_t pat))],
"v128.const\t$dst, "#args,
- "v128.const\t"#args, 2>;
+ "v128.const\t"#args, 12>;
}
defm "" : ConstVec<v16i8,
@@ -244,7 +275,7 @@ defm SHUFFLE :
"v8x16.shuffle\t"#
"$m0, $m1, $m2, $m3, $m4, $m5, $m6, $m7, "#
"$m8, $m9, $mA, $mB, $mC, $mD, $mE, $mF",
- 3>;
+ 13>;
// Shuffles after custom lowering
def wasm_shuffle_t : SDTypeProfile<1, 18, []>;
@@ -273,12 +304,11 @@ def : Pat<(vec_t (wasm_shuffle (vec_t V128:$x), (vec_t V128:$y),
// Swizzle lanes: v8x16.swizzle
def wasm_swizzle_t : SDTypeProfile<1, 2, []>;
def wasm_swizzle : SDNode<"WebAssemblyISD::SWIZZLE", wasm_swizzle_t>;
-let Predicates = [HasUnimplementedSIMD128] in
defm SWIZZLE :
SIMD_I<(outs V128:$dst), (ins V128:$src, V128:$mask), (outs), (ins),
[(set (v16i8 V128:$dst),
(wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)))],
- "v8x16.swizzle\t$dst, $src, $mask", "v8x16.swizzle", 192>;
+ "v8x16.swizzle\t$dst, $src, $mask", "v8x16.swizzle", 14>;
def : Pat<(int_wasm_swizzle (v16i8 V128:$src), (v16i8 V128:$mask)),
(SWIZZLE V128:$src, V128:$mask)>;
@@ -298,19 +328,17 @@ def splat16 : PatFrag<(ops node:$x), (build_vector
multiclass Splat<ValueType vec_t, string vec, WebAssemblyRegClass reg_t,
PatFrag splat_pat, bits<32> simdop> {
- // Prefer splats over v128.const for const splats (65 is lowest that works)
- let AddedComplexity = 65 in
defm SPLAT_#vec_t : SIMD_I<(outs V128:$dst), (ins reg_t:$x), (outs), (ins),
[(set (vec_t V128:$dst), (splat_pat reg_t:$x))],
vec#".splat\t$dst, $x", vec#".splat", simdop>;
}
-defm "" : Splat<v16i8, "i8x16", I32, splat16, 4>;
-defm "" : Splat<v8i16, "i16x8", I32, splat8, 8>;
-defm "" : Splat<v4i32, "i32x4", I32, splat4, 12>;
-defm "" : Splat<v2i64, "i64x2", I64, splat2, 15>;
-defm "" : Splat<v4f32, "f32x4", F32, splat4, 18>;
-defm "" : Splat<v2f64, "f64x2", F64, splat2, 21>;
+defm "" : Splat<v16i8, "i8x16", I32, splat16, 15>;
+defm "" : Splat<v8i16, "i16x8", I32, splat8, 16>;
+defm "" : Splat<v4i32, "i32x4", I32, splat4, 17>;
+defm "" : Splat<v2i64, "i64x2", I64, splat2, 18>;
+defm "" : Splat<v4f32, "f32x4", F32, splat4, 19>;
+defm "" : Splat<v2f64, "f64x2", F64, splat2, 20>;
// scalar_to_vector leaves high lanes undefined, so can be a splat
class ScalarSplatPat<ValueType vec_t, ValueType lane_t,
@@ -330,82 +358,49 @@ def : ScalarSplatPat<v2f64, f64, F64>;
//===----------------------------------------------------------------------===//
// Extract lane as a scalar: extract_lane / extract_lane_s / extract_lane_u
-multiclass ExtractLane<ValueType vec_t, string vec, ImmLeaf imm_t,
- WebAssemblyRegClass reg_t, bits<32> simdop,
- string suffix = "", SDNode extract = vector_extract> {
+multiclass ExtractLane<ValueType vec_t, string vec, WebAssemblyRegClass reg_t,
+ bits<32> simdop, string suffix = ""> {
defm EXTRACT_LANE_#vec_t#suffix :
SIMD_I<(outs reg_t:$dst), (ins V128:$vec, vec_i8imm_op:$idx),
- (outs), (ins vec_i8imm_op:$idx),
- [(set reg_t:$dst, (extract (vec_t V128:$vec), (i32 imm_t:$idx)))],
+ (outs), (ins vec_i8imm_op:$idx), [],
vec#".extract_lane"#suffix#"\t$dst, $vec, $idx",
vec#".extract_lane"#suffix#"\t$idx", simdop>;
}
-multiclass ExtractPat<ValueType lane_t, int mask> {
- def _s : PatFrag<(ops node:$vec, node:$idx),
- (i32 (sext_inreg
- (i32 (vector_extract
- node:$vec,
- node:$idx
- )),
- lane_t
- ))>;
- def _u : PatFrag<(ops node:$vec, node:$idx),
- (i32 (and
- (i32 (vector_extract
- node:$vec,
- node:$idx
- )),
- (i32 mask)
- ))>;
-}
-
-defm extract_i8x16 : ExtractPat<i8, 0xff>;
-defm extract_i16x8 : ExtractPat<i16, 0xffff>;
-
-multiclass ExtractLaneExtended<string sign, bits<32> baseInst> {
- defm "" : ExtractLane<v16i8, "i8x16", LaneIdx16, I32, baseInst, sign,
- !cast<PatFrag>("extract_i8x16"#sign)>;
- defm "" : ExtractLane<v8i16, "i16x8", LaneIdx8, I32, !add(baseInst, 4), sign,
- !cast<PatFrag>("extract_i16x8"#sign)>;
-}
-
-defm "" : ExtractLaneExtended<"_s", 5>;
-let Predicates = [HasUnimplementedSIMD128] in
-defm "" : ExtractLaneExtended<"_u", 6>;
-defm "" : ExtractLane<v4i32, "i32x4", LaneIdx4, I32, 13>;
-defm "" : ExtractLane<v2i64, "i64x2", LaneIdx2, I64, 16>;
-defm "" : ExtractLane<v4f32, "f32x4", LaneIdx4, F32, 19>;
-defm "" : ExtractLane<v2f64, "f64x2", LaneIdx2, F64, 22>;
-
-// It would be more conventional to use unsigned extracts, but v8
-// doesn't implement them yet
-def : Pat<(i32 (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx))),
- (EXTRACT_LANE_v16i8_s V128:$vec, (i32 LaneIdx16:$idx))>;
-def : Pat<(i32 (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx))),
- (EXTRACT_LANE_v8i16_s V128:$vec, (i32 LaneIdx8:$idx))>;
-
-// Lower undef lane indices to zero
-def : Pat<(and (i32 (vector_extract (v16i8 V128:$vec), undef)), (i32 0xff)),
- (EXTRACT_LANE_v16i8_u V128:$vec, 0)>;
-def : Pat<(and (i32 (vector_extract (v8i16 V128:$vec), undef)), (i32 0xffff)),
- (EXTRACT_LANE_v8i16_u V128:$vec, 0)>;
-def : Pat<(i32 (vector_extract (v16i8 V128:$vec), undef)),
- (EXTRACT_LANE_v16i8_u V128:$vec, 0)>;
-def : Pat<(i32 (vector_extract (v8i16 V128:$vec), undef)),
- (EXTRACT_LANE_v8i16_u V128:$vec, 0)>;
-def : Pat<(sext_inreg (i32 (vector_extract (v16i8 V128:$vec), undef)), i8),
- (EXTRACT_LANE_v16i8_s V128:$vec, 0)>;
-def : Pat<(sext_inreg (i32 (vector_extract (v8i16 V128:$vec), undef)), i16),
- (EXTRACT_LANE_v8i16_s V128:$vec, 0)>;
-def : Pat<(vector_extract (v4i32 V128:$vec), undef),
- (EXTRACT_LANE_v4i32 V128:$vec, 0)>;
-def : Pat<(vector_extract (v2i64 V128:$vec), undef),
- (EXTRACT_LANE_v2i64 V128:$vec, 0)>;
-def : Pat<(vector_extract (v4f32 V128:$vec), undef),
- (EXTRACT_LANE_v4f32 V128:$vec, 0)>;
-def : Pat<(vector_extract (v2f64 V128:$vec), undef),
- (EXTRACT_LANE_v2f64 V128:$vec, 0)>;
+defm "" : ExtractLane<v16i8, "i8x16", I32, 21, "_s">;
+defm "" : ExtractLane<v16i8, "i8x16", I32, 22, "_u">;
+defm "" : ExtractLane<v8i16, "i16x8", I32, 24, "_s">;
+defm "" : ExtractLane<v8i16, "i16x8", I32, 25, "_u">;
+defm "" : ExtractLane<v4i32, "i32x4", I32, 27>;
+defm "" : ExtractLane<v2i64, "i64x2", I64, 29>;
+defm "" : ExtractLane<v4f32, "f32x4", F32, 31>;
+defm "" : ExtractLane<v2f64, "f64x2", F64, 33>;
+
+def : Pat<(vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx)),
+ (EXTRACT_LANE_v16i8_u V128:$vec, imm:$idx)>;
+def : Pat<(vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)),
+ (EXTRACT_LANE_v8i16_u V128:$vec, imm:$idx)>;
+def : Pat<(vector_extract (v4i32 V128:$vec), (i32 LaneIdx4:$idx)),
+ (EXTRACT_LANE_v4i32 V128:$vec, imm:$idx)>;
+def : Pat<(vector_extract (v4f32 V128:$vec), (i32 LaneIdx4:$idx)),
+ (EXTRACT_LANE_v4f32 V128:$vec, imm:$idx)>;
+def : Pat<(vector_extract (v2i64 V128:$vec), (i32 LaneIdx2:$idx)),
+ (EXTRACT_LANE_v2i64 V128:$vec, imm:$idx)>;
+def : Pat<(vector_extract (v2f64 V128:$vec), (i32 LaneIdx2:$idx)),
+ (EXTRACT_LANE_v2f64 V128:$vec, imm:$idx)>;
+
+def : Pat<
+ (sext_inreg (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx)), i8),
+ (EXTRACT_LANE_v16i8_s V128:$vec, imm:$idx)>;
+def : Pat<
+ (and (vector_extract (v16i8 V128:$vec), (i32 LaneIdx16:$idx)), (i32 0xff)),
+ (EXTRACT_LANE_v16i8_u V128:$vec, imm:$idx)>;
+def : Pat<
+ (sext_inreg (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)), i16),
+ (EXTRACT_LANE_v8i16_s V128:$vec, imm:$idx)>;
+def : Pat<
+ (and (vector_extract (v8i16 V128:$vec), (i32 LaneIdx8:$idx)), (i32 0xffff)),
+ (EXTRACT_LANE_v8i16_u V128:$vec, imm:$idx)>;
// Replace lane value: replace_lane
multiclass ReplaceLane<ValueType vec_t, string vec, ImmLeaf imm_t,
@@ -420,12 +415,12 @@ multiclass ReplaceLane<ValueType vec_t, string vec, ImmLeaf imm_t,
vec#".replace_lane\t$idx", simdop>;
}
-defm "" : ReplaceLane<v16i8, "i8x16", LaneIdx16, I32, i32, 7>;
-defm "" : ReplaceLane<v8i16, "i16x8", LaneIdx8, I32, i32, 11>;
-defm "" : ReplaceLane<v4i32, "i32x4", LaneIdx4, I32, i32, 14>;
-defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 17>;
-defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 20>;
-defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 23>;
+defm "" : ReplaceLane<v16i8, "i8x16", LaneIdx16, I32, i32, 23>;
+defm "" : ReplaceLane<v8i16, "i16x8", LaneIdx8, I32, i32, 26>;
+defm "" : ReplaceLane<v4i32, "i32x4", LaneIdx4, I32, i32, 28>;
+defm "" : ReplaceLane<v2i64, "i64x2", LaneIdx2, I64, i64, 30>;
+defm "" : ReplaceLane<v4f32, "f32x4", LaneIdx4, F32, f32, 32>;
+defm "" : ReplaceLane<v2f64, "f64x2", LaneIdx2, F64, f64, 34>;
// Lower undef lane indices to zero
def : Pat<(vector_insert (v16i8 V128:$vec), I32:$x, undef),
@@ -471,35 +466,35 @@ multiclass SIMDConditionFP<string name, CondCode cond, bits<32> baseInst> {
// Equality: eq
let isCommutable = 1 in {
-defm EQ : SIMDConditionInt<"eq", SETEQ, 24>;
-defm EQ : SIMDConditionFP<"eq", SETOEQ, 64>;
+defm EQ : SIMDConditionInt<"eq", SETEQ, 35>;
+defm EQ : SIMDConditionFP<"eq", SETOEQ, 65>;
} // isCommutable = 1
// Non-equality: ne
let isCommutable = 1 in {
-defm NE : SIMDConditionInt<"ne", SETNE, 25>;
-defm NE : SIMDConditionFP<"ne", SETUNE, 65>;
+defm NE : SIMDConditionInt<"ne", SETNE, 36>;
+defm NE : SIMDConditionFP<"ne", SETUNE, 66>;
} // isCommutable = 1
// Less than: lt_s / lt_u / lt
-defm LT_S : SIMDConditionInt<"lt_s", SETLT, 26>;
-defm LT_U : SIMDConditionInt<"lt_u", SETULT, 27>;
-defm LT : SIMDConditionFP<"lt", SETOLT, 66>;
+defm LT_S : SIMDConditionInt<"lt_s", SETLT, 37>;
+defm LT_U : SIMDConditionInt<"lt_u", SETULT, 38>;
+defm LT : SIMDConditionFP<"lt", SETOLT, 67>;
// Greater than: gt_s / gt_u / gt
-defm GT_S : SIMDConditionInt<"gt_s", SETGT, 28>;
-defm GT_U : SIMDConditionInt<"gt_u", SETUGT, 29>;
-defm GT : SIMDConditionFP<"gt", SETOGT, 67>;
+defm GT_S : SIMDConditionInt<"gt_s", SETGT, 39>;
+defm GT_U : SIMDConditionInt<"gt_u", SETUGT, 40>;
+defm GT : SIMDConditionFP<"gt", SETOGT, 68>;
// Less than or equal: le_s / le_u / le
-defm LE_S : SIMDConditionInt<"le_s", SETLE, 30>;
-defm LE_U : SIMDConditionInt<"le_u", SETULE, 31>;
-defm LE : SIMDConditionFP<"le", SETOLE, 68>;
+defm LE_S : SIMDConditionInt<"le_s", SETLE, 41>;
+defm LE_U : SIMDConditionInt<"le_u", SETULE, 42>;
+defm LE : SIMDConditionFP<"le", SETOLE, 69>;
// Greater than or equal: ge_s / ge_u / ge
-defm GE_S : SIMDConditionInt<"ge_s", SETGE, 32>;
-defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 33>;
-defm GE : SIMDConditionFP<"ge", SETOGE, 69>;
+defm GE_S : SIMDConditionInt<"ge_s", SETGE, 43>;
+defm GE_U : SIMDConditionInt<"ge_u", SETUGE, 44>;
+defm GE : SIMDConditionFP<"ge", SETOGE, 70>;
// Lower float comparisons that don't care about NaN to standard WebAssembly
// float comparisons. These instructions are generated with nnan and in the
@@ -548,19 +543,18 @@ multiclass SIMDUnary<ValueType vec_t, string vec, SDNode node, string name,
// Bitwise logic: v128.not
foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
-defm NOT: SIMDUnary<vec_t, "v128", vnot, "not", 76>;
+defm NOT: SIMDUnary<vec_t, "v128", vnot, "not", 77>;
// Bitwise logic: v128.and / v128.or / v128.xor
let isCommutable = 1 in {
-defm AND : SIMDBitwise<and, "and", 77>;
-defm OR : SIMDBitwise<or, "or", 78>;
-defm XOR : SIMDBitwise<xor, "xor", 79>;
+defm AND : SIMDBitwise<and, "and", 78>;
+defm OR : SIMDBitwise<or, "or", 80>;
+defm XOR : SIMDBitwise<xor, "xor", 81>;
} // isCommutable = 1
// Bitwise logic: v128.andnot
def andnot : PatFrag<(ops node:$left, node:$right), (and $left, (vnot $right))>;
-let Predicates = [HasUnimplementedSIMD128] in
-defm ANDNOT : SIMDBitwise<andnot, "andnot", 216>;
+defm ANDNOT : SIMDBitwise<andnot, "andnot", 79>;
// Bitwise select: v128.bitselect
foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
@@ -571,7 +565,7 @@ foreach vec_t = [v16i8, v8i16, v4i32, v2i64, v4f32, v2f64] in
(vec_t V128:$v1), (vec_t V128:$v2), (vec_t V128:$c)
))
)],
- "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 80>;
+ "v128.bitselect\t$dst, $v1, $v2, $c", "v128.bitselect", 82>;
// Bitselect is equivalent to (c & v1) | (~c & v2)
foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
@@ -586,9 +580,9 @@ foreach vec_t = [v16i8, v8i16, v4i32, v2i64] in
multiclass SIMDUnaryInt<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDUnary<v16i8, "i8x16", node, name, baseInst>;
- defm "" : SIMDUnary<v8i16, "i16x8", node, name, !add(baseInst, 17)>;
- defm "" : SIMDUnary<v4i32, "i32x4", node, name, !add(baseInst, 34)>;
- defm "" : SIMDUnary<v2i64, "i64x2", node, name, !add(baseInst, 51)>;
+ defm "" : SIMDUnary<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
+ defm "" : SIMDUnary<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
+ defm "" : SIMDUnary<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
}
multiclass SIMDReduceVec<ValueType vec_t, string vec, SDNode op, string name,
@@ -600,22 +594,25 @@ multiclass SIMDReduceVec<ValueType vec_t, string vec, SDNode op, string name,
multiclass SIMDReduce<SDNode op, string name, bits<32> baseInst> {
defm "" : SIMDReduceVec<v16i8, "i8x16", op, name, baseInst>;
- defm "" : SIMDReduceVec<v8i16, "i16x8", op, name, !add(baseInst, 17)>;
- defm "" : SIMDReduceVec<v4i32, "i32x4", op, name, !add(baseInst, 34)>;
- defm "" : SIMDReduceVec<v2i64, "i64x2", op, name, !add(baseInst, 51)>;
+ defm "" : SIMDReduceVec<v8i16, "i16x8", op, name, !add(baseInst, 32)>;
+ defm "" : SIMDReduceVec<v4i32, "i32x4", op, name, !add(baseInst, 64)>;
+ defm "" : SIMDReduceVec<v2i64, "i64x2", op, name, !add(baseInst, 96)>;
}
// Integer vector negation
def ivneg : PatFrag<(ops node:$in), (sub immAllZerosV, node:$in)>;
+// Integer absolute value: abs
+defm ABS : SIMDUnaryInt<abs, "abs", 96>;
+
// Integer negation: neg
-defm NEG : SIMDUnaryInt<ivneg, "neg", 81>;
+defm NEG : SIMDUnaryInt<ivneg, "neg", 97>;
// Any lane true: any_true
-defm ANYTRUE : SIMDReduce<int_wasm_anytrue, "any_true", 82>;
+defm ANYTRUE : SIMDReduce<int_wasm_anytrue, "any_true", 98>;
// All lanes true: all_true
-defm ALLTRUE : SIMDReduce<int_wasm_alltrue, "all_true", 83>;
+defm ALLTRUE : SIMDReduce<int_wasm_alltrue, "all_true", 99>;
// Reductions already return 0 or 1, so and 1, setne 0, and seteq 1
// can be folded out
@@ -639,109 +636,108 @@ def : Pat<(i32 (seteq
(i32 (!cast<NI>(reduction[1]#"_"#ty) (ty V128:$x)))>;
}
+multiclass SIMDBitmask<ValueType vec_t, string vec, bits<32> simdop> {
+ defm _#vec_t : SIMD_I<(outs I32:$dst), (ins V128:$vec), (outs), (ins),
+ [(set I32:$dst,
+ (i32 (int_wasm_bitmask (vec_t V128:$vec)))
+ )],
+ vec#".bitmask\t$dst, $vec", vec#".bitmask", simdop>;
+}
+
+defm BITMASK : SIMDBitmask<v16i8, "i8x16", 100>;
+defm BITMASK : SIMDBitmask<v8i16, "i16x8", 132>;
+defm BITMASK : SIMDBitmask<v4i32, "i32x4", 164>;
+
//===----------------------------------------------------------------------===//
// Bit shifts
//===----------------------------------------------------------------------===//
-multiclass SIMDShift<ValueType vec_t, string vec, SDNode node, dag shift_vec,
- string name, bits<32> simdop> {
+multiclass SIMDShift<ValueType vec_t, string vec, SDNode node, string name,
+ bits<32> simdop> {
defm _#vec_t : SIMD_I<(outs V128:$dst), (ins V128:$vec, I32:$x),
(outs), (ins),
- [(set (vec_t V128:$dst),
- (node V128:$vec, (vec_t shift_vec)))],
+ [(set (vec_t V128:$dst), (node V128:$vec, I32:$x))],
vec#"."#name#"\t$dst, $vec, $x", vec#"."#name, simdop>;
}
multiclass SIMDShiftInt<SDNode node, string name, bits<32> baseInst> {
- defm "" : SIMDShift<v16i8, "i8x16", node, (splat16 I32:$x), name, baseInst>;
- defm "" : SIMDShift<v8i16, "i16x8", node, (splat8 I32:$x), name,
- !add(baseInst, 17)>;
- defm "" : SIMDShift<v4i32, "i32x4", node, (splat4 I32:$x), name,
- !add(baseInst, 34)>;
- defm "" : SIMDShift<v2i64, "i64x2", node, (splat2 (i64 (zext I32:$x))),
- name, !add(baseInst, 51)>;
+ defm "" : SIMDShift<v16i8, "i8x16", node, name, baseInst>;
+ defm "" : SIMDShift<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
+ defm "" : SIMDShift<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
+ defm "" : SIMDShift<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
}
-// Left shift by scalar: shl
-defm SHL : SIMDShiftInt<shl, "shl", 84>;
-
-// Right shift by scalar: shr_s / shr_u
-defm SHR_S : SIMDShiftInt<sra, "shr_s", 85>;
-defm SHR_U : SIMDShiftInt<srl, "shr_u", 86>;
-
-// Truncate i64 shift operands to i32s, except if they are already i32s
-foreach shifts = [[shl, SHL_v2i64], [sra, SHR_S_v2i64], [srl, SHR_U_v2i64]] in {
-def : Pat<(v2i64 (shifts[0]
- (v2i64 V128:$vec),
- (v2i64 (splat2 (i64 (sext I32:$x))))
- )),
- (v2i64 (shifts[1] (v2i64 V128:$vec), (i32 I32:$x)))>;
-def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), (v2i64 (splat2 I64:$x)))),
- (v2i64 (shifts[1] (v2i64 V128:$vec), (I32_WRAP_I64 I64:$x)))>;
-}
-
-// 2xi64 shifts with constant shift amounts are custom lowered to avoid wrapping
+// WebAssembly SIMD shifts are nonstandard in that the shift amount is
+// an i32 rather than a vector, so they need custom nodes.
def wasm_shift_t : SDTypeProfile<1, 2,
[SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]
>;
def wasm_shl : SDNode<"WebAssemblyISD::VEC_SHL", wasm_shift_t>;
def wasm_shr_s : SDNode<"WebAssemblyISD::VEC_SHR_S", wasm_shift_t>;
def wasm_shr_u : SDNode<"WebAssemblyISD::VEC_SHR_U", wasm_shift_t>;
-foreach shifts = [[wasm_shl, SHL_v2i64],
- [wasm_shr_s, SHR_S_v2i64],
- [wasm_shr_u, SHR_U_v2i64]] in
-def : Pat<(v2i64 (shifts[0] (v2i64 V128:$vec), I32:$x)),
- (v2i64 (shifts[1] (v2i64 V128:$vec), I32:$x))>;
+
+// Left shift by scalar: shl
+defm SHL : SIMDShiftInt<wasm_shl, "shl", 107>;
+
+// Right shift by scalar: shr_s / shr_u
+defm SHR_S : SIMDShiftInt<wasm_shr_s, "shr_s", 108>;
+defm SHR_U : SIMDShiftInt<wasm_shr_u, "shr_u", 109>;
//===----------------------------------------------------------------------===//
// Integer binary arithmetic
//===----------------------------------------------------------------------===//
+multiclass SIMDBinaryIntNoI8x16<SDNode node, string name, bits<32> baseInst> {
+ defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
+ defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
+ defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
+}
+
multiclass SIMDBinaryIntSmall<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDBinary<v16i8, "i8x16", node, name, baseInst>;
- defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 17)>;
+ defm "" : SIMDBinary<v8i16, "i16x8", node, name, !add(baseInst, 32)>;
}
multiclass SIMDBinaryIntNoI64x2<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDBinaryIntSmall<node, name, baseInst>;
- defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 34)>;
+ defm "" : SIMDBinary<v4i32, "i32x4", node, name, !add(baseInst, 64)>;
}
multiclass SIMDBinaryInt<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDBinaryIntNoI64x2<node, name, baseInst>;
- defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 51)>;
+ defm "" : SIMDBinary<v2i64, "i64x2", node, name, !add(baseInst, 96)>;
}
// Integer addition: add / add_saturate_s / add_saturate_u
let isCommutable = 1 in {
-defm ADD : SIMDBinaryInt<add, "add", 87>;
-defm ADD_SAT_S : SIMDBinaryIntSmall<saddsat, "add_saturate_s", 88>;
-defm ADD_SAT_U : SIMDBinaryIntSmall<uaddsat, "add_saturate_u", 89>;
+defm ADD : SIMDBinaryInt<add, "add", 110>;
+defm ADD_SAT_S : SIMDBinaryIntSmall<saddsat, "add_saturate_s", 111>;
+defm ADD_SAT_U : SIMDBinaryIntSmall<uaddsat, "add_saturate_u", 112>;
} // isCommutable = 1
// Integer subtraction: sub / sub_saturate_s / sub_saturate_u
-defm SUB : SIMDBinaryInt<sub, "sub", 90>;
+defm SUB : SIMDBinaryInt<sub, "sub", 113>;
defm SUB_SAT_S :
- SIMDBinaryIntSmall<int_wasm_sub_saturate_signed, "sub_saturate_s", 91>;
+ SIMDBinaryIntSmall<int_wasm_sub_saturate_signed, "sub_saturate_s", 114>;
defm SUB_SAT_U :
- SIMDBinaryIntSmall<int_wasm_sub_saturate_unsigned, "sub_saturate_u", 92>;
+ SIMDBinaryIntSmall<int_wasm_sub_saturate_unsigned, "sub_saturate_u", 115>;
// Integer multiplication: mul
let isCommutable = 1 in
-defm MUL : SIMDBinaryIntNoI64x2<mul, "mul", 93>;
+defm MUL : SIMDBinaryIntNoI8x16<mul, "mul", 117>;
// Integer min_s / min_u / max_s / max_u
let isCommutable = 1 in {
-defm MIN_S : SIMDBinaryIntNoI64x2<smin, "min_s", 94>;
-defm MIN_U : SIMDBinaryIntNoI64x2<umin, "min_u", 95>;
-defm MAX_S : SIMDBinaryIntNoI64x2<smax, "max_s", 96>;
-defm MAX_U : SIMDBinaryIntNoI64x2<umax, "max_u", 97>;
+defm MIN_S : SIMDBinaryIntNoI64x2<smin, "min_s", 118>;
+defm MIN_U : SIMDBinaryIntNoI64x2<umin, "min_u", 119>;
+defm MAX_S : SIMDBinaryIntNoI64x2<smax, "max_s", 120>;
+defm MAX_U : SIMDBinaryIntNoI64x2<umax, "max_u", 121>;
} // isCommutable = 1
// Integer unsigned rounding average: avgr_u
-let isCommutable = 1, Predicates = [HasUnimplementedSIMD128] in {
-defm AVGR_U : SIMDBinary<v16i8, "i8x16", int_wasm_avgr_unsigned, "avgr_u", 217>;
-defm AVGR_U : SIMDBinary<v8i16, "i16x8", int_wasm_avgr_unsigned, "avgr_u", 218>;
+let isCommutable = 1 in {
+defm AVGR_U : SIMDBinary<v16i8, "i8x16", int_wasm_avgr_unsigned, "avgr_u", 123>;
+defm AVGR_U : SIMDBinary<v8i16, "i16x8", int_wasm_avgr_unsigned, "avgr_u", 155>;
}
def add_nuw : PatFrag<(ops node:$lhs, node:$rhs),
@@ -749,12 +745,12 @@ def add_nuw : PatFrag<(ops node:$lhs, node:$rhs),
"return N->getFlags().hasNoUnsignedWrap();">;
foreach nodes = [[v16i8, splat16], [v8i16, splat8]] in
-def : Pat<(srl
+def : Pat<(wasm_shr_u
(add_nuw
(add_nuw (nodes[0] V128:$lhs), (nodes[0] V128:$rhs)),
(nodes[1] (i32 1))
),
- (nodes[0] (nodes[1] (i32 1)))
+ (i32 1)
),
(!cast<NI>("AVGR_U_"#nodes[0]) V128:$lhs, V128:$rhs)>;
@@ -763,7 +759,7 @@ let isCommutable = 1 in
defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
[(set V128:$dst, (int_wasm_dot V128:$lhs, V128:$rhs))],
"i32x4.dot_i16x8_s\t$dst, $lhs, $rhs", "i32x4.dot_i16x8_s",
- 219>;
+ 180>;
//===----------------------------------------------------------------------===//
// Floating-point unary arithmetic
@@ -771,18 +767,27 @@ defm DOT : SIMD_I<(outs V128:$dst), (ins V128:$lhs, V128:$rhs), (outs), (ins),
multiclass SIMDUnaryFP<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDUnary<v4f32, "f32x4", node, name, baseInst>;
- defm "" : SIMDUnary<v2f64, "f64x2", node, name, !add(baseInst, 11)>;
+ defm "" : SIMDUnary<v2f64, "f64x2", node, name, !add(baseInst, 12)>;
}
// Absolute value: abs
-defm ABS : SIMDUnaryFP<fabs, "abs", 149>;
+defm ABS : SIMDUnaryFP<fabs, "abs", 224>;
// Negation: neg
-defm NEG : SIMDUnaryFP<fneg, "neg", 150>;
+defm NEG : SIMDUnaryFP<fneg, "neg", 225>;
// Square root: sqrt
-let Predicates = [HasUnimplementedSIMD128] in
-defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 151>;
+defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 227>;
+
+// Rounding: ceil, floor, trunc, nearest
+defm CEIL : SIMDUnary<v4f32, "f32x4", int_wasm_ceil, "ceil", 216>;
+defm FLOOR : SIMDUnary<v4f32, "f32x4", int_wasm_floor, "floor", 217>;
+defm TRUNC: SIMDUnary<v4f32, "f32x4", int_wasm_trunc, "trunc", 218>;
+defm NEAREST: SIMDUnary<v4f32, "f32x4", int_wasm_nearest, "nearest", 219>;
+defm CEIL : SIMDUnary<v2f64, "f64x2", int_wasm_ceil, "ceil", 220>;
+defm FLOOR : SIMDUnary<v2f64, "f64x2", int_wasm_floor, "floor", 221>;
+defm TRUNC: SIMDUnary<v2f64, "f64x2", int_wasm_trunc, "trunc", 222>;
+defm NEAREST: SIMDUnary<v2f64, "f64x2", int_wasm_nearest, "nearest", 223>;
//===----------------------------------------------------------------------===//
// Floating-point binary arithmetic
@@ -790,29 +795,34 @@ defm SQRT : SIMDUnaryFP<fsqrt, "sqrt", 151>;
multiclass SIMDBinaryFP<SDNode node, string name, bits<32> baseInst> {
defm "" : SIMDBinary<v4f32, "f32x4", node, name, baseInst>;
- defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 11)>;
+ defm "" : SIMDBinary<v2f64, "f64x2", node, name, !add(baseInst, 12)>;
}
// Addition: add
let isCommutable = 1 in
-defm ADD : SIMDBinaryFP<fadd, "add", 154>;
+defm ADD : SIMDBinaryFP<fadd, "add", 228>;
// Subtraction: sub
-defm SUB : SIMDBinaryFP<fsub, "sub", 155>;
+defm SUB : SIMDBinaryFP<fsub, "sub", 229>;
// Multiplication: mul
let isCommutable = 1 in
-defm MUL : SIMDBinaryFP<fmul, "mul", 156>;
+defm MUL : SIMDBinaryFP<fmul, "mul", 230>;
// Division: div
-let Predicates = [HasUnimplementedSIMD128] in
-defm DIV : SIMDBinaryFP<fdiv, "div", 157>;
+defm DIV : SIMDBinaryFP<fdiv, "div", 231>;
// NaN-propagating minimum: min
-defm MIN : SIMDBinaryFP<fminimum, "min", 158>;
+defm MIN : SIMDBinaryFP<fminimum, "min", 232>;
// NaN-propagating maximum: max
-defm MAX : SIMDBinaryFP<fmaximum, "max", 159>;
+defm MAX : SIMDBinaryFP<fmaximum, "max", 233>;
+
+// Pseudo-minimum: pmin
+defm PMIN : SIMDBinaryFP<int_wasm_pmin, "pmin", 234>;
+
+// Pseudo-maximum: pmax
+defm PMAX : SIMDBinaryFP<int_wasm_pmax, "pmax", 235>;
//===----------------------------------------------------------------------===//
// Conversions
@@ -826,17 +836,13 @@ multiclass SIMDConvert<ValueType vec_t, ValueType arg_t, SDNode op,
name#"\t$dst, $vec", name, simdop>;
}
-// Integer to floating point: convert
-defm "" : SIMDConvert<v4f32, v4i32, sint_to_fp, "f32x4.convert_i32x4_s", 175>;
-defm "" : SIMDConvert<v4f32, v4i32, uint_to_fp, "f32x4.convert_i32x4_u", 176>;
-defm "" : SIMDConvert<v2f64, v2i64, sint_to_fp, "f64x2.convert_i64x2_s", 177>;
-defm "" : SIMDConvert<v2f64, v2i64, uint_to_fp, "f64x2.convert_i64x2_u", 178>;
-
// Floating point to integer with saturation: trunc_sat
-defm "" : SIMDConvert<v4i32, v4f32, fp_to_sint, "i32x4.trunc_sat_f32x4_s", 171>;
-defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_f32x4_u", 172>;
-defm "" : SIMDConvert<v2i64, v2f64, fp_to_sint, "i64x2.trunc_sat_f64x2_s", 173>;
-defm "" : SIMDConvert<v2i64, v2f64, fp_to_uint, "i64x2.trunc_sat_f64x2_u", 174>;
+defm "" : SIMDConvert<v4i32, v4f32, fp_to_sint, "i32x4.trunc_sat_f32x4_s", 248>;
+defm "" : SIMDConvert<v4i32, v4f32, fp_to_uint, "i32x4.trunc_sat_f32x4_u", 249>;
+
+// Integer to floating point: convert
+defm "" : SIMDConvert<v4f32, v4i32, sint_to_fp, "f32x4.convert_i32x4_s", 250>;
+defm "" : SIMDConvert<v4f32, v4i32, uint_to_fp, "f32x4.convert_i32x4_u", 251>;
// Widening operations
multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg,
@@ -851,8 +857,8 @@ multiclass SIMDWiden<ValueType vec_t, string vec, ValueType arg_t, string arg,
vec#".widen_high_"#arg#"_u", !add(baseInst, 3)>;
}
-defm "" : SIMDWiden<v8i16, "i16x8", v16i8, "i8x16", 202>;
-defm "" : SIMDWiden<v4i32, "i32x4", v8i16, "i16x8", 206>;
+defm "" : SIMDWiden<v8i16, "i16x8", v16i8, "i8x16", 135>;
+defm "" : SIMDWiden<v4i32, "i32x4", v8i16, "i16x8", 167>;
// Narrowing operations
multiclass SIMDNarrow<ValueType vec_t, string vec, ValueType arg_t, string arg,
@@ -871,18 +877,14 @@ multiclass SIMDNarrow<ValueType vec_t, string vec, ValueType arg_t, string arg,
!add(baseInst, 1)>;
}
-defm "" : SIMDNarrow<v16i8, "i8x16", v8i16, "i16x8", 198>;
-defm "" : SIMDNarrow<v8i16, "i16x8", v4i32, "i32x4", 200>;
+defm "" : SIMDNarrow<v16i8, "i8x16", v8i16, "i16x8", 101>;
+defm "" : SIMDNarrow<v8i16, "i16x8", v4i32, "i32x4", 133>;
// Lower llvm.wasm.trunc.saturate.* to saturating instructions
def : Pat<(v4i32 (int_wasm_trunc_saturate_signed (v4f32 V128:$src))),
(fp_to_sint_v4i32_v4f32 (v4f32 V128:$src))>;
def : Pat<(v4i32 (int_wasm_trunc_saturate_unsigned (v4f32 V128:$src))),
(fp_to_uint_v4i32_v4f32 (v4f32 V128:$src))>;
-def : Pat<(v2i64 (int_wasm_trunc_saturate_signed (v2f64 V128:$src))),
- (fp_to_sint_v2i64_v2f64 (v2f64 V128:$src))>;
-def : Pat<(v2i64 (int_wasm_trunc_saturate_unsigned (v2f64 V128:$src))),
- (fp_to_uint_v2i64_v2f64 (v2f64 V128:$src))>;
// Bitcasts are nops
// Matching bitcast t1 to t1 causes strange errors, so avoid repeating types
@@ -914,5 +916,5 @@ multiclass SIMDQFM<ValueType vec_t, string vec, bits<32> baseInst> {
vec#".qfms\t$dst, $a, $b, $c", vec#".qfms", !add(baseInst, 1)>;
}
-defm "" : SIMDQFM<v4f32, "f32x4", 0x98>;
-defm "" : SIMDQFM<v2f64, "f64x2", 0xa3>;
+defm "" : SIMDQFM<v4f32, "f32x4", 252>;
+defm "" : SIMDQFM<v2f64, "f64x2", 254>;
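(Editor's note, not part of the patch.) As the "nonstandard" shift comment earlier in this file explains, WebAssembly SIMD shifts take a single i32 count rather than a per-lane vector, which is why the patterns switch to the custom VEC_SHL/VEC_SHR_S/VEC_SHR_U nodes. A small C++ model of the resulting semantics, assuming the proposal's count-modulo-lane-width rule (a sketch, not LLVM code):

  #include <array>
  #include <cstdint>

  // i8x16.shl: one scalar count shifts every lane; the count is taken modulo
  // the lane width, so there is no per-lane shift amount to select on.
  std::array<uint8_t, 16> shl_i8x16(std::array<uint8_t, 16> v, uint32_t count) {
    count %= 8;
    for (uint8_t &lane : v)
      lane = static_cast<uint8_t>(lane << count);
    return v;
  }
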
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
index 75d04252cbe9..346938daf1aa 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLateEHPrepare.cpp
@@ -20,6 +20,7 @@
#include "llvm/CodeGen/WasmEHFuncInfo.h"
#include "llvm/MC/MCAsmInfo.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
#define DEBUG_TYPE "wasm-late-eh-prepare"
@@ -31,12 +32,16 @@ class WebAssemblyLateEHPrepare final : public MachineFunctionPass {
}
bool runOnMachineFunction(MachineFunction &MF) override;
+ void recordCatchRetBBs(MachineFunction &MF);
bool addCatches(MachineFunction &MF);
bool replaceFuncletReturns(MachineFunction &MF);
bool removeUnnecessaryUnreachables(MachineFunction &MF);
bool addExceptionExtraction(MachineFunction &MF);
bool restoreStackPointer(MachineFunction &MF);
+ MachineBasicBlock *getMatchingEHPad(MachineInstr *MI);
+ SmallSet<MachineBasicBlock *, 8> CatchRetBBs;
+
public:
static char ID; // Pass identification, replacement for typeid
WebAssemblyLateEHPrepare() : MachineFunctionPass(ID) {}
@@ -57,7 +62,8 @@ FunctionPass *llvm::createWebAssemblyLateEHPrepare() {
// possible search paths should be the same.
// Returns nullptr in case it does not find any EH pad in the search, or finds
// multiple different EH pads.
-static MachineBasicBlock *getMatchingEHPad(MachineInstr *MI) {
+MachineBasicBlock *
+WebAssemblyLateEHPrepare::getMatchingEHPad(MachineInstr *MI) {
MachineFunction *MF = MI->getParent()->getParent();
SmallVector<MachineBasicBlock *, 2> WL;
SmallPtrSet<MachineBasicBlock *, 2> Visited;
@@ -76,7 +82,9 @@ static MachineBasicBlock *getMatchingEHPad(MachineInstr *MI) {
}
if (MBB == &MF->front())
return nullptr;
- WL.append(MBB->pred_begin(), MBB->pred_end());
+ for (auto *Pred : MBB->predecessors())
+ if (!CatchRetBBs.count(Pred)) // We don't go into child scopes
+ WL.push_back(Pred);
}
return EHPad;
}
@@ -110,6 +118,7 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
bool Changed = false;
if (MF.getFunction().hasPersonalityFn()) {
+ recordCatchRetBBs(MF);
Changed |= addCatches(MF);
Changed |= replaceFuncletReturns(MF);
}
@@ -121,6 +130,21 @@ bool WebAssemblyLateEHPrepare::runOnMachineFunction(MachineFunction &MF) {
return Changed;
}
+// Record which BBs end with a 'CATCHRET' instruction, because these will be
+// replaced with BRs later. This set of 'CATCHRET' BBs is needed by the
+// 'getMatchingEHPad' function.
+void WebAssemblyLateEHPrepare::recordCatchRetBBs(MachineFunction &MF) {
+ CatchRetBBs.clear();
+ for (auto &MBB : MF) {
+ auto Pos = MBB.getFirstTerminator();
+ if (Pos == MBB.end())
+ continue;
+ MachineInstr *TI = &*Pos;
+ if (TI->getOpcode() == WebAssembly::CATCHRET)
+ CatchRetBBs.insert(&MBB);
+ }
+}
+
// Add catch instruction to beginning of catchpads and cleanuppads.
bool WebAssemblyLateEHPrepare::addCatches(MachineFunction &MF) {
bool Changed = false;
@@ -343,7 +367,7 @@ bool WebAssemblyLateEHPrepare::addExceptionExtraction(MachineFunction &MF) {
"There is no __clang_call_terminate() function");
Register Reg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
BuildMI(ElseMBB, DL, TII.get(WebAssembly::CONST_I32), Reg).addImm(0);
- BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL_VOID))
+ BuildMI(ElseMBB, DL, TII.get(WebAssembly::CALL))
.addGlobalAddress(ClangCallTerminateFn)
.addReg(Reg);
BuildMI(ElseMBB, DL, TII.get(WebAssembly::UNREACHABLE));
@@ -384,8 +408,8 @@ bool WebAssemblyLateEHPrepare::restoreStackPointer(MachineFunction &MF) {
++InsertPos;
if (InsertPos->getOpcode() == WebAssembly::CATCH)
++InsertPos;
- FrameLowering->writeSPToGlobal(WebAssembly::SP32, MF, MBB, InsertPos,
- MBB.begin()->getDebugLoc());
+ FrameLowering->writeSPToGlobal(FrameLowering->getSPReg(MF), MF, MBB,
+ InsertPos, MBB.begin()->getDebugLoc());
}
return Changed;
}
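(Editor's note, not part of the patch.) The recordCatchRetBBs/getMatchingEHPad change above makes the backward search stop at blocks that originally ended in CATCHRET, so the walk never crosses into a nested (child) EH scope. A simplified, self-contained C++ sketch of that predecessor filter, using illustrative types rather than the LLVM classes:

  #include <set>
  #include <vector>

  struct Block { std::vector<Block *> Preds; };

  // Enqueue only predecessors that were not recorded as CATCHRET blocks,
  // mirroring the filtered worklist push in getMatchingEHPad above.
  void enqueuePreds(Block *MBB, const std::set<Block *> &CatchRetBBs,
                    std::vector<Block *> &Worklist) {
    for (Block *Pred : MBB->Preds)
      if (!CatchRetBBs.count(Pred)) // don't descend into child scopes
        Worklist.push_back(Pred);
  }
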
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
index 4314aa611549..01b3aa887738 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerBrUnless.cpp
@@ -191,7 +191,7 @@ bool WebAssemblyLowerBrUnless::runOnMachineFunction(MachineFunction &MF) {
Register Tmp = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
BuildMI(MBB, MI, MI->getDebugLoc(), TII.get(WebAssembly::EQZ_I32), Tmp)
.addReg(Cond);
- MFI.stackifyVReg(Tmp);
+ MFI.stackifyVReg(MRI, Tmp);
Cond = Tmp;
Inverted = true;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
index d1f3acbd221e..5fce4a600510 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerEmscriptenEHSjLj.cpp
@@ -208,7 +208,8 @@
///===----------------------------------------------------------------------===//
#include "WebAssembly.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/ADT/StringExtras.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/Support/CommandLine.h"
@@ -220,10 +221,10 @@ using namespace llvm;
#define DEBUG_TYPE "wasm-lower-em-ehsjlj"
static cl::list<std::string>
- EHWhitelist("emscripten-cxx-exceptions-whitelist",
+ EHAllowlist("emscripten-cxx-exceptions-allowed",
cl::desc("The list of function names in which Emscripten-style "
"exception handling is enabled (see emscripten "
- "EMSCRIPTEN_CATCHING_WHITELIST options)"),
+ "EMSCRIPTEN_CATCHING_ALLOWED options)"),
cl::CommaSeparated);
namespace {
@@ -247,8 +248,8 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
DenseMap<int, Function *> FindMatchingCatches;
// Map of <function signature string, invoke_ wrappers>
StringMap<Function *> InvokeWrappers;
- // Set of whitelisted function names for exception handling
- std::set<std::string> EHWhitelistSet;
+ // Set of allowed function names for exception handling
+ std::set<std::string> EHAllowlistSet;
StringRef getPassName() const override {
return "WebAssembly Lower Emscripten Exceptions";
@@ -258,13 +259,13 @@ class WebAssemblyLowerEmscriptenEHSjLj final : public ModulePass {
bool runSjLjOnFunction(Function &F);
Function *getFindMatchingCatch(Module &M, unsigned NumClauses);
- template <typename CallOrInvoke> Value *wrapInvoke(CallOrInvoke *CI);
- void wrapTestSetjmp(BasicBlock *BB, Instruction *InsertPt, Value *Threw,
+ Value *wrapInvoke(CallBase *CI);
+ void wrapTestSetjmp(BasicBlock *BB, DebugLoc DL, Value *Threw,
Value *SetjmpTable, Value *SetjmpTableSize, Value *&Label,
Value *&LongjmpResult, BasicBlock *&EndBB);
- template <typename CallOrInvoke> Function *getInvokeWrapper(CallOrInvoke *CI);
+ Function *getInvokeWrapper(CallBase *CI);
- bool areAllExceptionsAllowed() const { return EHWhitelistSet.empty(); }
+ bool areAllExceptionsAllowed() const { return EHAllowlistSet.empty(); }
bool canLongjmp(Module &M, const Value *Callee) const;
bool isEmAsmCall(Module &M, const Value *Callee) const;
@@ -275,7 +276,7 @@ public:
WebAssemblyLowerEmscriptenEHSjLj(bool EnableEH = true, bool EnableSjLj = true)
: ModulePass(ID), EnableEH(EnableEH), EnableSjLj(EnableSjLj) {
- EHWhitelistSet.insert(EHWhitelist.begin(), EHWhitelist.end());
+ EHAllowlistSet.insert(EHAllowlist.begin(), EHAllowlist.end());
}
bool runOnModule(Module &M) override;
@@ -337,13 +338,31 @@ static std::string getSignature(FunctionType *FTy) {
if (FTy->isVarArg())
OS << "_...";
Sig = OS.str();
- Sig.erase(remove_if(Sig, isspace), Sig.end());
+ Sig.erase(remove_if(Sig, isSpace), Sig.end());
// When s2wasm parses a .s file, a comma means the end of an argument. So a
// mangled function name can contain any character but a comma.
std::replace(Sig.begin(), Sig.end(), ',', '.');
return Sig;
}
+static Function *getEmscriptenFunction(FunctionType *Ty, const Twine &Name,
+ Module *M) {
+  Function *F = Function::Create(Ty, GlobalValue::ExternalLinkage, Name, M);
+ // Tell the linker that this function is expected to be imported from the
+ // 'env' module.
+ if (!F->hasFnAttribute("wasm-import-module")) {
+ llvm::AttrBuilder B;
+ B.addAttribute("wasm-import-module", "env");
+ F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+ }
+ if (!F->hasFnAttribute("wasm-import-name")) {
+ llvm::AttrBuilder B;
+ B.addAttribute("wasm-import-name", F->getName());
+ F->addAttributes(llvm::AttributeList::FunctionIndex, B);
+ }
+ return F;
+}
+
// Returns __cxa_find_matching_catch_N function, where N = NumClauses + 2.
// This is because a landingpad instruction contains two more arguments, a
// personality function and a cleanup bit, and __cxa_find_matching_catch_N
@@ -357,9 +376,8 @@ WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M,
PointerType *Int8PtrTy = Type::getInt8PtrTy(M.getContext());
SmallVector<Type *, 16> Args(NumClauses, Int8PtrTy);
FunctionType *FTy = FunctionType::get(Int8PtrTy, Args, false);
- Function *F = Function::Create(
- FTy, GlobalValue::ExternalLinkage,
- "__cxa_find_matching_catch_" + Twine(NumClauses + 2), &M);
+ Function *F = getEmscriptenFunction(
+ FTy, "__cxa_find_matching_catch_" + Twine(NumClauses + 2), &M);
FindMatchingCatches[NumClauses] = F;
return F;
}
@@ -371,15 +389,14 @@ WebAssemblyLowerEmscriptenEHSjLj::getFindMatchingCatch(Module &M,
// %__THREW__.val = __THREW__; __THREW__ = 0;
// Returns %__THREW__.val, which indicates whether an exception is thrown (or
// whether longjmp occurred), for future use.
-template <typename CallOrInvoke>
-Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
+Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallBase *CI) {
LLVMContext &C = CI->getModule()->getContext();
// If we are calling a function that is noreturn, we must remove that
// attribute. The code we insert here does expect it to return, after we
// catch the exception.
if (CI->doesNotReturn()) {
- if (auto *F = dyn_cast<Function>(CI->getCalledValue()))
+ if (auto *F = CI->getCalledFunction())
F->removeFnAttr(Attribute::NoReturn);
CI->removeAttribute(AttributeList::FunctionIndex, Attribute::NoReturn);
}
@@ -395,7 +412,7 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
SmallVector<Value *, 16> Args;
// Put the pointer to the callee as first argument, so it can be called
// within the invoke wrapper later
- Args.push_back(CI->getCalledValue());
+ Args.push_back(CI->getCalledOperand());
Args.append(CI->arg_begin(), CI->arg_end());
CallInst *NewCall = IRB.CreateCall(getInvokeWrapper(CI), Args);
NewCall->takeName(CI);
@@ -443,18 +460,10 @@ Value *WebAssemblyLowerEmscriptenEHSjLj::wrapInvoke(CallOrInvoke *CI) {
}
// Get matching invoke wrapper based on callee signature
-template <typename CallOrInvoke>
-Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) {
+Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallBase *CI) {
Module *M = CI->getModule();
SmallVector<Type *, 16> ArgTys;
- Value *Callee = CI->getCalledValue();
- FunctionType *CalleeFTy;
- if (auto *F = dyn_cast<Function>(Callee))
- CalleeFTy = F->getFunctionType();
- else {
- auto *CalleeTy = cast<PointerType>(Callee->getType())->getElementType();
- CalleeFTy = cast<FunctionType>(CalleeTy);
- }
+ FunctionType *CalleeFTy = CI->getFunctionType();
std::string Sig = getSignature(CalleeFTy);
if (InvokeWrappers.find(Sig) != InvokeWrappers.end())
@@ -467,8 +476,7 @@ Function *WebAssemblyLowerEmscriptenEHSjLj::getInvokeWrapper(CallOrInvoke *CI) {
FunctionType *FTy = FunctionType::get(CalleeFTy->getReturnType(), ArgTys,
CalleeFTy->isVarArg());
- Function *F =
- Function::Create(FTy, GlobalValue::ExternalLinkage, "__invoke_" + Sig, M);
+ Function *F = getEmscriptenFunction(FTy, "__invoke_" + Sig, M);
InvokeWrappers[Sig] = F;
return F;
}
@@ -538,13 +546,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::isEmAsmCall(Module &M,
// As output parameters, returns %label, %longjmp_result, and the BB the last
// instruction (%longjmp_result = ...) is in.
void WebAssemblyLowerEmscriptenEHSjLj::wrapTestSetjmp(
- BasicBlock *BB, Instruction *InsertPt, Value *Threw, Value *SetjmpTable,
+ BasicBlock *BB, DebugLoc DL, Value *Threw, Value *SetjmpTable,
Value *SetjmpTableSize, Value *&Label, Value *&LongjmpResult,
BasicBlock *&EndBB) {
Function *F = BB->getParent();
LLVMContext &C = BB->getModule()->getContext();
IRBuilder<> IRB(C);
- IRB.SetInsertPoint(InsertPt);
+ IRB.SetCurrentDebugLocation(DL);
// if (%__THREW__.val != 0 & threwValue != 0)
IRB.SetInsertPoint(BB);
@@ -639,12 +647,11 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
// exception handling and setjmp/longjmp handling
ThrewGV = getGlobalVariableI32(M, IRB, "__THREW__");
ThrewValueGV = getGlobalVariableI32(M, IRB, "__threwValue");
- GetTempRet0Func =
- Function::Create(FunctionType::get(IRB.getInt32Ty(), false),
- GlobalValue::ExternalLinkage, "getTempRet0", &M);
- SetTempRet0Func = Function::Create(
+ GetTempRet0Func = getEmscriptenFunction(
+ FunctionType::get(IRB.getInt32Ty(), false), "getTempRet0", &M);
+ SetTempRet0Func = getEmscriptenFunction(
FunctionType::get(IRB.getVoidTy(), IRB.getInt32Ty(), false),
- GlobalValue::ExternalLinkage, "setTempRet0", &M);
+ "setTempRet0", &M);
GetTempRet0Func->setDoesNotThrow();
SetTempRet0Func->setDoesNotThrow();
@@ -655,14 +662,12 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
// Register __resumeException function
FunctionType *ResumeFTy =
FunctionType::get(IRB.getVoidTy(), IRB.getInt8PtrTy(), false);
- ResumeF = Function::Create(ResumeFTy, GlobalValue::ExternalLinkage,
- "__resumeException", &M);
+ ResumeF = getEmscriptenFunction(ResumeFTy, "__resumeException", &M);
// Register llvm_eh_typeid_for function
FunctionType *EHTypeIDTy =
FunctionType::get(IRB.getInt32Ty(), IRB.getInt8PtrTy(), false);
- EHTypeIDF = Function::Create(EHTypeIDTy, GlobalValue::ExternalLinkage,
- "llvm_eh_typeid_for", &M);
+ EHTypeIDF = getEmscriptenFunction(EHTypeIDTy, "llvm_eh_typeid_for", &M);
for (Function &F : M) {
if (F.isDeclaration())
@@ -678,34 +683,30 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runOnModule(Module &M) {
if (LongjmpF) {
// Replace all uses of longjmp with emscripten_longjmp_jmpbuf, which is
// defined in JS code
- EmLongjmpJmpbufF = Function::Create(LongjmpF->getFunctionType(),
- GlobalValue::ExternalLinkage,
- "emscripten_longjmp_jmpbuf", &M);
-
+ EmLongjmpJmpbufF = getEmscriptenFunction(LongjmpF->getFunctionType(),
+ "emscripten_longjmp_jmpbuf", &M);
LongjmpF->replaceAllUsesWith(EmLongjmpJmpbufF);
}
if (SetjmpF) {
// Register saveSetjmp function
FunctionType *SetjmpFTy = SetjmpF->getFunctionType();
- SmallVector<Type *, 4> Params = {SetjmpFTy->getParamType(0),
- IRB.getInt32Ty(), Type::getInt32PtrTy(C),
- IRB.getInt32Ty()};
FunctionType *FTy =
- FunctionType::get(Type::getInt32PtrTy(C), Params, false);
- SaveSetjmpF =
- Function::Create(FTy, GlobalValue::ExternalLinkage, "saveSetjmp", &M);
+ FunctionType::get(Type::getInt32PtrTy(C),
+ {SetjmpFTy->getParamType(0), IRB.getInt32Ty(),
+ Type::getInt32PtrTy(C), IRB.getInt32Ty()},
+ false);
+ SaveSetjmpF = getEmscriptenFunction(FTy, "saveSetjmp", &M);
// Register testSetjmp function
- Params = {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()};
- FTy = FunctionType::get(IRB.getInt32Ty(), Params, false);
- TestSetjmpF =
- Function::Create(FTy, GlobalValue::ExternalLinkage, "testSetjmp", &M);
+ FTy = FunctionType::get(
+ IRB.getInt32Ty(),
+ {IRB.getInt32Ty(), Type::getInt32PtrTy(C), IRB.getInt32Ty()}, false);
+ TestSetjmpF = getEmscriptenFunction(FTy, "testSetjmp", &M);
FTy = FunctionType::get(IRB.getVoidTy(),
{IRB.getInt32Ty(), IRB.getInt32Ty()}, false);
- EmLongjmpF = Function::Create(FTy, GlobalValue::ExternalLinkage,
- "emscripten_longjmp", &M);
+ EmLongjmpF = getEmscriptenFunction(FTy, "emscripten_longjmp", &M);
  // Only traverse functions that use setjmp in order not to insert
// unnecessary prep / cleanup code in every function
@@ -744,17 +745,18 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
bool Changed = false;
SmallVector<Instruction *, 64> ToErase;
SmallPtrSet<LandingPadInst *, 32> LandingPads;
- bool AllowExceptions =
- areAllExceptionsAllowed() || EHWhitelistSet.count(F.getName());
+ bool AllowExceptions = areAllExceptionsAllowed() ||
+ EHAllowlistSet.count(std::string(F.getName()));
for (BasicBlock &BB : F) {
auto *II = dyn_cast<InvokeInst>(BB.getTerminator());
if (!II)
continue;
+ Changed = true;
LandingPads.insert(II->getLandingPadInst());
IRB.SetInsertPoint(II);
- bool NeedInvoke = AllowExceptions && canThrow(II->getCalledValue());
+ bool NeedInvoke = AllowExceptions && canThrow(II->getCalledOperand());
if (NeedInvoke) {
// Wrap invoke with invoke wrapper and generate preamble/postamble
Value *Threw = wrapInvoke(II);
@@ -769,7 +771,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
// call+branch
SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
CallInst *NewCall =
- IRB.CreateCall(II->getFunctionType(), II->getCalledValue(), Args);
+ IRB.CreateCall(II->getFunctionType(), II->getCalledOperand(), Args);
NewCall->takeName(II);
NewCall->setCallingConv(II->getCallingConv());
NewCall->setDebugLoc(II->getDebugLoc());
@@ -791,6 +793,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
auto *RI = dyn_cast<ResumeInst>(&I);
if (!RI)
continue;
+ Changed = true;
// Split the input into legal values
Value *Input = RI->getValue();
@@ -815,6 +818,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
continue;
if (Callee->getIntrinsicID() != Intrinsic::eh_typeid_for)
continue;
+ Changed = true;
IRB.SetInsertPoint(CI);
CallInst *NewCI =
@@ -830,7 +834,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
if (auto *LPI = dyn_cast<LandingPadInst>(I))
LandingPads.insert(LPI);
}
- Changed = !LandingPads.empty();
+ Changed |= !LandingPads.empty();
// Handle all the landingpad for this function together, as multiple invokes
// may share a single lp
@@ -871,6 +875,27 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runEHOnFunction(Function &F) {
return Changed;
}
+// This tries to get debug info from the instruction before which a new
+// instruction will be inserted, and if there's no debug info in that
+// instruction, tries to get the info instead from the previous instruction (if
+// any). If none of these has debug info and a DISubprogram is provided, it
+// creates dummy debug info with the first line of the function, because the
+// IR verifier requires that all inlinable call sites have debug info when
+// both the caller and callee have a DISubprogram. If none of these conditions
+// are met, returns an empty DebugLoc.
+static DebugLoc getOrCreateDebugLoc(const Instruction *InsertBefore,
+ DISubprogram *SP) {
+ assert(InsertBefore);
+ if (InsertBefore->getDebugLoc())
+ return InsertBefore->getDebugLoc();
+ const Instruction *Prev = InsertBefore->getPrevNode();
+ if (Prev && Prev->getDebugLoc())
+ return Prev->getDebugLoc();
+ if (SP)
+ return DILocation::get(SP->getContext(), SP->getLine(), 1, SP);
+ return DebugLoc();
+}
+
bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
Module &M = *F.getParent();
LLVMContext &C = F.getContext();
@@ -888,13 +913,22 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// this instruction to a constant 4, because this value will be used in
// SSAUpdater.AddAvailableValue(...) later.
BasicBlock &EntryBB = F.getEntryBlock();
+ DebugLoc FirstDL = getOrCreateDebugLoc(&*EntryBB.begin(), F.getSubprogram());
BinaryOperator *SetjmpTableSize = BinaryOperator::Create(
Instruction::Add, IRB.getInt32(4), IRB.getInt32(0), "setjmpTableSize",
&*EntryBB.getFirstInsertionPt());
+ SetjmpTableSize->setDebugLoc(FirstDL);
// setjmpTable = (int *) malloc(40);
Instruction *SetjmpTable = CallInst::CreateMalloc(
SetjmpTableSize, IRB.getInt32Ty(), IRB.getInt32Ty(), IRB.getInt32(40),
nullptr, nullptr, "setjmpTable");
+ SetjmpTable->setDebugLoc(FirstDL);
+ // CallInst::CreateMalloc may return a bitcast instruction if the result types
+ // mismatch. We need to set the debug loc for the original call too.
+ auto *MallocCall = SetjmpTable->stripPointerCasts();
+ if (auto *MallocCallI = dyn_cast<Instruction>(MallocCall)) {
+ MallocCallI->setDebugLoc(FirstDL);
+ }
// setjmpTable[0] = 0;
IRB.SetInsertPoint(SetjmpTableSize);
IRB.CreateStore(IRB.getInt32(0), SetjmpTable);
@@ -963,7 +997,7 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
if (!CI)
continue;
- const Value *Callee = CI->getCalledValue();
+ const Value *Callee = CI->getCalledOperand();
if (!canLongjmp(M, Callee))
continue;
if (isEmAsmCall(M, Callee))
@@ -1024,12 +1058,13 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
Value *Label = nullptr;
Value *LongjmpResult = nullptr;
BasicBlock *EndBB = nullptr;
- wrapTestSetjmp(BB, CI, Threw, SetjmpTable, SetjmpTableSize, Label,
- LongjmpResult, EndBB);
+ wrapTestSetjmp(BB, CI->getDebugLoc(), Threw, SetjmpTable, SetjmpTableSize,
+ Label, LongjmpResult, EndBB);
assert(Label && LongjmpResult && EndBB);
// Create switch instruction
IRB.SetInsertPoint(EndBB);
+ IRB.SetCurrentDebugLocation(EndBB->getInstList().back().getDebugLoc());
SwitchInst *SI = IRB.CreateSwitch(Label, Tail, SetjmpRetPHIs.size());
// -1 means no longjmp happened, continue normally (will hit the default
// switch case). 0 means a longjmp that is not ours to handle, needs a
@@ -1053,8 +1088,17 @@ bool WebAssemblyLowerEmscriptenEHSjLj::runSjLjOnFunction(Function &F) {
// Free setjmpTable buffer before each return instruction
for (BasicBlock &BB : F) {
Instruction *TI = BB.getTerminator();
- if (isa<ReturnInst>(TI))
- CallInst::CreateFree(SetjmpTable, TI);
+ if (isa<ReturnInst>(TI)) {
+ DebugLoc DL = getOrCreateDebugLoc(TI, F.getSubprogram());
+ auto *Free = CallInst::CreateFree(SetjmpTable, TI);
+ Free->setDebugLoc(DL);
+ // CallInst::CreateFree may create a bitcast instruction if its argument
+ // types mismatch. We need to set the debug loc for the bitcast too.
+ if (auto *FreeCallI = dyn_cast<CallInst>(Free)) {
+ if (auto *BitCastI = dyn_cast<BitCastInst>(FreeCallI->getArgOperand(0)))
+ BitCastI->setDebugLoc(DL);
+ }
+ }
}
// Every call to saveSetjmp can change setjmpTable and setjmpTableSize
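
The getOrCreateDebugLoc helper added above exists because, as its comment notes, the IR verifier rejects inlinable call sites without debug info when both caller and callee carry a DISubprogram, so every instruction this pass synthesizes (the setjmpTable malloc, the frees before returns) needs some location. A minimal usage sketch under that assumption; tagSynthesizedEntryInst is hypothetical, and only getOrCreateDebugLoc and the standard LLVM API calls come from the patch:

#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Hypothetical helper (illustration only, assumes getOrCreateDebugLoc from the
// hunk above is in scope): attach a debug location to an instruction newly
// inserted at the top of F, using the same fallback chain as the pass
// (insert point -> previous instruction -> first line of F's subprogram).
static void tagSynthesizedEntryInst(Function &F, Instruction *NewInst) {
  DebugLoc DL = getOrCreateDebugLoc(&*F.getEntryBlock().begin(),
                                    F.getSubprogram());
  NewInst->setDebugLoc(DL);
}
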
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
index 750b2233e67a..9ccbee819c35 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyLowerGlobalDtors.cpp
@@ -76,9 +76,13 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
!ETy->getTypeAtIndex(2U)->isPointerTy())
return false; // Not (int, ptr, ptr).
- // Collect the contents of @llvm.global_dtors, collated by priority and
- // associated symbol.
- std::map<uint16_t, MapVector<Constant *, std::vector<Constant *>>> DtorFuncs;
+ // Collect the contents of @llvm.global_dtors, ordered by priority. Within a
+ // priority, sequences of destructors with the same associated object are
+ // recorded so that we can register them as a group.
+ std::map<
+ uint16_t,
+ std::vector<std::pair<Constant *, std::vector<Constant *>>>
+ > DtorFuncs;
for (Value *O : InitList->operands()) {
auto *CS = dyn_cast<ConstantStruct>(O);
if (!CS)
@@ -96,7 +100,14 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
Constant *Associated = CS->getOperand(2);
Associated = cast<Constant>(Associated->stripPointerCasts());
- DtorFuncs[PriorityValue][Associated].push_back(DtorFunc);
+ auto &AtThisPriority = DtorFuncs[PriorityValue];
+ if (AtThisPriority.empty() || AtThisPriority.back().first != Associated) {
+ std::vector<Constant *> NewList;
+ NewList.push_back(DtorFunc);
+ AtThisPriority.push_back(std::make_pair(Associated, NewList));
+ } else {
+ AtThisPriority.back().second.push_back(DtorFunc);
+ }
}
if (DtorFuncs.empty())
return false;
@@ -131,14 +142,19 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
// first function with __cxa_atexit.
for (auto &PriorityAndMore : DtorFuncs) {
uint16_t Priority = PriorityAndMore.first;
- for (auto &AssociatedAndMore : PriorityAndMore.second) {
+ uint64_t Id = 0;
+ auto &AtThisPriority = PriorityAndMore.second;
+ for (auto &AssociatedAndMore : AtThisPriority) {
Constant *Associated = AssociatedAndMore.first;
+ auto ThisId = Id++;
Function *CallDtors = Function::Create(
AtExitFuncTy, Function::PrivateLinkage,
"call_dtors" +
(Priority != UINT16_MAX ? (Twine(".") + Twine(Priority))
: Twine()) +
+ (AtThisPriority.size() > 1 ? Twine("$") + Twine(ThisId)
+ : Twine()) +
(!Associated->isNullValue() ? (Twine(".") + Associated->getName())
: Twine()),
&M);
@@ -146,7 +162,7 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
FunctionType *VoidVoid = FunctionType::get(Type::getVoidTy(C),
/*isVarArg=*/false);
- for (auto Dtor : AssociatedAndMore.second)
+ for (auto Dtor : reverse(AssociatedAndMore.second))
CallInst::Create(VoidVoid, Dtor, "", BB);
ReturnInst::Create(C, BB);
@@ -155,6 +171,8 @@ bool LowerGlobalDtors::runOnModule(Module &M) {
"register_call_dtors" +
(Priority != UINT16_MAX ? (Twine(".") + Twine(Priority))
: Twine()) +
+ (AtThisPriority.size() > 1 ? Twine("$") + Twine(ThisId)
+ : Twine()) +
(!Associated->isNullValue() ? (Twine(".") + Associated->getName())
: Twine()),
&M);
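
To make the new DtorFuncs shape above concrete: within one priority, entries are kept in the order they appear in @llvm.global_dtors, and a new group is started whenever the associated object changes (each group's dtors are later emitted reversed, per the reverse() change above). A standalone sketch of that grouping on plain data; all names here are illustrative, not from the patch:

#include <cstdint>
#include <map>
#include <string>
#include <tuple>
#include <utility>
#include <vector>

// One group: an associated object name plus the dtors registered for it
// consecutively at this priority.
using Group = std::pair<std::string, std::vector<std::string>>;

static std::map<uint16_t, std::vector<Group>> groupDtors(
    const std::vector<std::tuple<uint16_t, std::string, std::string>> &In) {
  std::map<uint16_t, std::vector<Group>> Out;
  for (const auto &Entry : In) {
    uint16_t Priority = std::get<0>(Entry);
    const std::string &Associated = std::get<1>(Entry);
    const std::string &Dtor = std::get<2>(Entry);
    auto &AtThisPriority = Out[Priority];
    // Start a new group whenever the associated object changes.
    if (AtThisPriority.empty() || AtThisPriority.back().first != Associated)
      AtThisPriority.push_back({Associated, {Dtor}});
    else
      AtThisPriority.back().second.push_back(Dtor);
  }
  return Out;
}

For example, entries (P, A, f), (P, B, g), (P, A, h) at the same priority P produce three groups rather than two, which is why the generated call_dtors / register_call_dtors names above gain the $<id> suffix whenever a priority has more than one group.
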
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
index 59c10243c545..304dca2ebfe4 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMCInstLower.cpp
@@ -13,10 +13,11 @@
//===----------------------------------------------------------------------===//
#include "WebAssemblyMCInstLower.h"
+#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
+#include "TargetInfo/WebAssemblyTargetInfo.h"
#include "WebAssemblyAsmPrinter.h"
#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblyRuntimeLibcallSignatures.h"
-#include "MCTargetDesc/WebAssemblyMCTargetDesc.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/IR/Constants.h"
@@ -29,11 +30,6 @@
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
-// Defines llvm::WebAssembly::getStackOpcode to convert register instructions to
-// stack instructions
-#define GET_INSTRMAP_INFO 1
-#include "WebAssemblyGenInstrInfo.inc"
-
// This disables the removal of registers when lowering into MC, as required
// by some current tests.
cl::opt<bool>
@@ -56,7 +52,8 @@ WebAssemblyMCInstLower::GetGlobalAddressSymbol(const MachineOperand &MO) const {
SmallVector<MVT, 1> ResultMVTs;
SmallVector<MVT, 4> ParamMVTs;
- computeSignatureVTs(FuncTy, CurrentFunc, TM, ParamMVTs, ResultMVTs);
+ const auto *const F = dyn_cast<Function>(Global);
+ computeSignatureVTs(FuncTy, F, CurrentFunc, TM, ParamMVTs, ResultMVTs);
auto Signature = signatureFromMVTs(ResultMVTs, ParamMVTs);
WasmSym->setSignature(Signature.get());
@@ -84,8 +81,9 @@ MCSymbol *WebAssemblyMCInstLower::GetExternalSymbolSymbol(
strcmp(Name, "__stack_pointer") == 0 || strcmp(Name, "__tls_base") == 0;
WasmSym->setType(wasm::WASM_SYMBOL_TYPE_GLOBAL);
WasmSym->setGlobalType(wasm::WasmGlobalType{
- uint8_t(Subtarget.hasAddr64() ? wasm::WASM_TYPE_I64
- : wasm::WASM_TYPE_I32),
+ uint8_t(Subtarget.hasAddr64() && strcmp(Name, "__table_base") != 0
+ ? wasm::WASM_TYPE_I64
+ : wasm::WASM_TYPE_I32),
Mutable});
return WasmSym;
}
@@ -208,6 +206,7 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
OutMI.setOpcode(MI->getOpcode());
const MCInstrDesc &Desc = MI->getDesc();
+ unsigned NumVariadicDefs = MI->getNumExplicitDefs() - Desc.getNumDefs();
for (unsigned I = 0, E = MI->getNumOperands(); I != E; ++I) {
const MachineOperand &MO = MI->getOperand(I);
@@ -229,9 +228,10 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
MCOp = MCOperand::createReg(WAReg);
break;
}
- case MachineOperand::MO_Immediate:
- if (I < Desc.NumOperands) {
- const MCOperandInfo &Info = Desc.OpInfo[I];
+ case MachineOperand::MO_Immediate: {
+ unsigned DescIndex = I - NumVariadicDefs;
+ if (DescIndex < Desc.NumOperands) {
+ const MCOperandInfo &Info = Desc.OpInfo[DescIndex];
if (Info.OperandType == WebAssembly::OPERAND_TYPEINDEX) {
SmallVector<wasm::ValType, 4> Returns;
SmallVector<wasm::ValType, 4> Params;
@@ -270,6 +270,7 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
}
MCOp = MCOperand::createImm(MO.getImm());
break;
+ }
case MachineOperand::MO_FPImmediate: {
// TODO: MC converts all floating point immediate operands to double.
// This is fine for numeric values, but may cause NaNs to change bits.
@@ -306,13 +307,15 @@ void WebAssemblyMCInstLower::lower(const MachineInstr *MI,
if (!WasmKeepRegisters)
removeRegisterOperands(MI, OutMI);
+ else if (Desc.variadicOpsAreDefs())
+ OutMI.insert(OutMI.begin(), MCOperand::createImm(MI->getNumExplicitDefs()));
}
static void removeRegisterOperands(const MachineInstr *MI, MCInst &OutMI) {
// Remove all uses of stackified registers to bring the instruction format
  // into its final stack form used throughout MC, and transition opcodes to
// their _S variant.
- // We do this seperate from the above code that still may need these
+ // We do this separate from the above code that still may need these
// registers for e.g. call_indirect signatures.
// See comments in lib/Target/WebAssembly/WebAssemblyInstrFormats.td for
// details.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
index e4cc2389147b..adee2f0553f9 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.cpp
@@ -16,14 +16,15 @@
#include "WebAssemblyISelLowering.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/Analysis.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
WebAssemblyFunctionInfo::~WebAssemblyFunctionInfo() = default; // anchor.
-void WebAssemblyFunctionInfo::initWARegs() {
+void WebAssemblyFunctionInfo::initWARegs(MachineRegisterInfo &MRI) {
assert(WARegs.empty());
unsigned Reg = UnusedReg;
- WARegs.resize(MF.getRegInfo().getNumVirtRegs(), Reg);
+ WARegs.resize(MRI.getNumVirtRegs(), Reg);
}
void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM,
@@ -42,15 +43,17 @@ void llvm::computeLegalValueVTs(const Function &F, const TargetMachine &TM,
}
}
-void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F,
+void llvm::computeSignatureVTs(const FunctionType *Ty,
+ const Function *TargetFunc,
+ const Function &ContextFunc,
const TargetMachine &TM,
SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results) {
- computeLegalValueVTs(F, TM, Ty->getReturnType(), Results);
+ computeLegalValueVTs(ContextFunc, TM, Ty->getReturnType(), Results);
MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
if (Results.size() > 1 &&
- !TM.getSubtarget<WebAssemblySubtarget>(F).hasMultivalue()) {
+ !TM.getSubtarget<WebAssemblySubtarget>(ContextFunc).hasMultivalue()) {
// WebAssembly can't lower returns of multiple values without demoting to
// sret unless multivalue is enabled (see
// WebAssemblyTargetLowering::CanLowerReturn). So replace multiple return
@@ -60,9 +63,28 @@ void llvm::computeSignatureVTs(const FunctionType *Ty, const Function &F,
}
for (auto *Param : Ty->params())
- computeLegalValueVTs(F, TM, Param, Params);
+ computeLegalValueVTs(ContextFunc, TM, Param, Params);
if (Ty->isVarArg())
Params.push_back(PtrVT);
+
+ // For swiftcc, emit additional swiftself and swifterror parameters if they
+ // are not already present. The same additional parameters are passed on the
+ // caller side as well; they are necessary for the callee and caller
+ // signatures to match for indirect calls.
+
+ if (TargetFunc && TargetFunc->getCallingConv() == CallingConv::Swift) {
+ MVT PtrVT = MVT::getIntegerVT(TM.createDataLayout().getPointerSizeInBits());
+ bool HasSwiftErrorArg = false;
+ bool HasSwiftSelfArg = false;
+ for (const auto &Arg : TargetFunc->args()) {
+ HasSwiftErrorArg |= Arg.hasAttribute(Attribute::SwiftError);
+ HasSwiftSelfArg |= Arg.hasAttribute(Attribute::SwiftSelf);
+ }
+ if (!HasSwiftErrorArg)
+ Params.push_back(PtrVT);
+ if (!HasSwiftSelfArg)
+ Params.push_back(PtrVT);
+ }
}
void llvm::valTypesFromMVTs(const ArrayRef<MVT> &In,
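
As the comment in the hunk above says, the swiftcc padding is about keeping caller and callee wasm signatures identical for indirect calls: if one side declared an explicit swiftself or swifterror argument and the other did not, the two computed parameter lists would differ in length. A reduced sketch of just the padding step; padSwiftCCParams is a hypothetical name, and only the push order mirrors the patch:

#include "llvm/ADT/SmallVector.h"
#include "llvm/Support/MachineValueType.h"
using namespace llvm;

// Hypothetical helper (illustration only): append a pointer-sized placeholder
// for each of swifterror/swiftself that the source-level function does not
// declare, so callers and callees agree on the number of wasm params.
static void padSwiftCCParams(bool HasSwiftSelfArg, bool HasSwiftErrorArg,
                             MVT PtrVT, SmallVectorImpl<MVT> &Params) {
  if (!HasSwiftErrorArg)
    Params.push_back(PtrVT);
  if (!HasSwiftSelfArg)
    Params.push_back(PtrVT);
}
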
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
index 16e2f4392984..ca164fdd182c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMachineFunctionInfo.h
@@ -30,8 +30,6 @@ struct WebAssemblyFunctionInfo;
/// This class is derived from MachineFunctionInfo and contains private
/// WebAssembly-specific information for each MachineFunction.
class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
- MachineFunction &MF;
-
std::vector<MVT> Params;
std::vector<MVT> Results;
std::vector<MVT> Locals;
@@ -55,12 +53,18 @@ class WebAssemblyFunctionInfo final : public MachineFunctionInfo {
// A virtual register holding the base pointer for functions that have
// overaligned values on the user stack.
unsigned BasePtrVreg = -1U;
+ // A virtual register holding the frame base. This is either FP or SP
+ // after it has been replaced by a vreg
+ unsigned FrameBaseVreg = -1U;
+ // The local holding the frame base. This is either FP or SP
+ // after WebAssemblyExplicitLocals
+ unsigned FrameBaseLocal = -1U;
// Function properties.
bool CFGStackified = false;
public:
- explicit WebAssemblyFunctionInfo(MachineFunction &MF) : MF(MF) {}
+ explicit WebAssemblyFunctionInfo(MachineFunction &MF) {}
~WebAssemblyFunctionInfo() override;
void initializeBaseYamlFields(const yaml::WebAssemblyFunctionInfo &YamlMFI);
@@ -90,12 +94,25 @@ public:
assert(BasePtrVreg != -1U && "Base ptr vreg hasn't been set");
return BasePtrVreg;
}
+ void setFrameBaseVreg(unsigned Reg) { FrameBaseVreg = Reg; }
+ unsigned getFrameBaseVreg() const {
+ assert(FrameBaseVreg != -1U && "Frame base vreg hasn't been set");
+ return FrameBaseVreg;
+ }
+ void clearFrameBaseVreg() { FrameBaseVreg = -1U; }
+ // Return true if the frame base physreg has been replaced by a virtual reg.
+ bool isFrameBaseVirtual() const { return FrameBaseVreg != -1U; }
+ void setFrameBaseLocal(unsigned Local) { FrameBaseLocal = Local; }
+ unsigned getFrameBaseLocal() const {
+ assert(FrameBaseLocal != -1U && "Frame base local hasn't been set");
+ return FrameBaseLocal;
+ }
void setBasePointerVreg(unsigned Reg) { BasePtrVreg = Reg; }
static const unsigned UnusedReg = -1u;
- void stackifyVReg(unsigned VReg) {
- assert(MF.getRegInfo().getUniqueVRegDef(VReg));
+ void stackifyVReg(MachineRegisterInfo &MRI, unsigned VReg) {
+ assert(MRI.getUniqueVRegDef(VReg));
auto I = Register::virtReg2Index(VReg);
if (I >= VRegStackified.size())
VRegStackified.resize(I + 1);
@@ -113,7 +130,7 @@ public:
return VRegStackified.test(I);
}
- void initWARegs();
+ void initWARegs(MachineRegisterInfo &MRI);
void setWAReg(unsigned VReg, unsigned WAReg) {
assert(WAReg != UnusedReg);
auto I = Register::virtReg2Index(VReg);
@@ -140,9 +157,10 @@ void computeLegalValueVTs(const Function &F, const TargetMachine &TM, Type *Ty,
SmallVectorImpl<MVT> &ValueVTs);
// Compute the signature for a given FunctionType (Ty). Note that it's not the
-// signature for F (F is just used to get varous context)
-void computeSignatureVTs(const FunctionType *Ty, const Function &F,
- const TargetMachine &TM, SmallVectorImpl<MVT> &Params,
+// signature for ContextFunc (ContextFunc is just used to get various context)
+void computeSignatureVTs(const FunctionType *Ty, const Function *TargetFunc,
+ const Function &ContextFunc, const TargetMachine &TM,
+ SmallVectorImpl<MVT> &Params,
SmallVectorImpl<MVT> &Results);
void valTypesFromMVTs(const ArrayRef<MVT> &In,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
index ac428fcc826a..9aea65cba280 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyMemIntrinsicResults.cpp
@@ -201,8 +201,7 @@ bool WebAssemblyMemIntrinsicResults::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default:
break;
- case WebAssembly::CALL_i32:
- case WebAssembly::CALL_i64:
+ case WebAssembly::CALL:
Changed |= optimizeCall(MBB, MI, MRI, MDT, LIS, TLI, LibInfo);
break;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
index 0bd30791e57c..a2da0ea849e0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeLiveIntervals.cpp
@@ -20,6 +20,7 @@
//===----------------------------------------------------------------------===//
#include "WebAssembly.h"
+#include "WebAssemblyMachineFunctionInfo.h"
#include "WebAssemblySubtarget.h"
#include "llvm/CodeGen/LiveIntervals.h"
#include "llvm/CodeGen/MachineBlockFrequencyInfo.h"
@@ -82,10 +83,22 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
SmallVector<LiveInterval *, 4> SplitLIs;
for (unsigned I = 0, E = MRI.getNumVirtRegs(); I < E; ++I) {
unsigned Reg = Register::index2VirtReg(I);
+ auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+
if (MRI.reg_nodbg_empty(Reg))
continue;
LIS.splitSeparateComponents(LIS.getInterval(Reg), SplitLIs);
+ if (Reg == TRI.getFrameRegister(MF) && SplitLIs.size() > 0) {
+ // The live interval for the frame register was split, resulting in a new
+ // VReg. For now we only support debug info output for a single frame base
+ // value for the function, so just use the last one. It will certainly be
+ // wrong for some part of the function, but until we are able to track
+ // values through live-range splitting and stackification, it will have to
+ // do.
+ MF.getInfo<WebAssemblyFunctionInfo>()->setFrameBaseVreg(
+ SplitLIs.back()->reg);
+ }
SplitLIs.clear();
}
@@ -103,5 +116,5 @@ bool WebAssemblyOptimizeLiveIntervals::runOnMachineFunction(
}
}
- return false;
+ return true;
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
index 9b60596e42b4..96390de8f5e7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyOptimizeReturned.cpp
@@ -42,7 +42,7 @@ public:
static char ID;
OptimizeReturned() : FunctionPass(ID) {}
- void visitCallSite(CallSite CS);
+ void visitCallBase(CallBase &CB);
};
} // End anonymous namespace
@@ -55,17 +55,16 @@ FunctionPass *llvm::createWebAssemblyOptimizeReturned() {
return new OptimizeReturned();
}
-void OptimizeReturned::visitCallSite(CallSite CS) {
- for (unsigned I = 0, E = CS.getNumArgOperands(); I < E; ++I)
- if (CS.paramHasAttr(I, Attribute::Returned)) {
- Instruction *Inst = CS.getInstruction();
- Value *Arg = CS.getArgOperand(I);
+void OptimizeReturned::visitCallBase(CallBase &CB) {
+ for (unsigned I = 0, E = CB.getNumArgOperands(); I < E; ++I)
+ if (CB.paramHasAttr(I, Attribute::Returned)) {
+ Value *Arg = CB.getArgOperand(I);
// Ignore constants, globals, undef, etc.
if (isa<Constant>(Arg))
continue;
// Like replaceDominatedUsesWith but using Instruction/Use dominance.
- Arg->replaceUsesWithIf(Inst,
- [&](Use &U) { return DT->dominates(Inst, U); });
+ Arg->replaceUsesWithIf(&CB,
+ [&](Use &U) { return DT->dominates(&CB, U); });
}
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
index ea6cd09a604c..a587c9d23d2b 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyPeephole.cpp
@@ -66,7 +66,7 @@ static bool maybeRewriteToDrop(unsigned OldReg, unsigned NewReg,
Register NewReg = MRI.createVirtualRegister(MRI.getRegClass(OldReg));
MO.setReg(NewReg);
MO.setIsDead();
- MFI.stackifyVReg(NewReg);
+ MFI.stackifyVReg(MRI, NewReg);
}
return Changed;
}
@@ -121,7 +121,7 @@ static bool maybeRewriteToFallthrough(MachineInstr &MI, MachineBasicBlock &MBB,
BuildMI(MBB, MI, MI.getDebugLoc(), TII.get(CopyLocalOpc), NewReg)
.addReg(Reg);
MO.setReg(NewReg);
- MFI.stackifyVReg(NewReg);
+ MFI.stackifyVReg(MRI, NewReg);
}
}
@@ -149,8 +149,7 @@ bool WebAssemblyPeephole::runOnMachineFunction(MachineFunction &MF) {
switch (MI.getOpcode()) {
default:
break;
- case WebAssembly::CALL_i32:
- case WebAssembly::CALL_i64: {
+ case WebAssembly::CALL: {
MachineOperand &Op1 = MI.getOperand(1);
if (Op1.isSymbol()) {
StringRef Name(Op1.getSymbolName());
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
index 043b6f1b7d18..20fe2b2b7bfc 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegColoring.cpp
@@ -157,6 +157,9 @@ bool WebAssemblyRegColoring::runOnMachineFunction(MachineFunction &MF) {
Changed |= Old != New;
UsedColors.set(Color);
Assignments[Color].push_back(LI);
+ // If we reassigned the stack pointer, update the debug frame base info.
+ if (Old != New && MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Old)
+ MFI.setFrameBaseVreg(New);
LLVM_DEBUG(dbgs() << "Assigning vreg" << Register::virtReg2Index(LI->reg)
<< " to vreg" << Register::virtReg2Index(New) << "\n");
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
index 72e7a7cf5042..b655014f4a90 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegNumbering.cpp
@@ -66,7 +66,7 @@ bool WebAssemblyRegNumbering::runOnMachineFunction(MachineFunction &MF) {
WebAssemblyFunctionInfo &MFI = *MF.getInfo<WebAssemblyFunctionInfo>();
MachineRegisterInfo &MRI = MF.getRegInfo();
- MFI.initWARegs();
+ MFI.initWARegs(MRI);
// WebAssembly argument registers are in the same index space as local
// variables. Assign the numbers for them first.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
index 421d353a89e8..1d4e2e3a8f9e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegStackify.cpp
@@ -36,6 +36,7 @@
#include "llvm/CodeGen/Passes.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include <iterator>
using namespace llvm;
#define DEBUG_TYPE "wasm-reg-stackify"
@@ -120,6 +121,7 @@ static void convertImplicitDefToConstZero(MachineInstr *MI,
Type::getDoubleTy(MF.getFunction().getContext())));
MI->addOperand(MachineOperand::CreateFPImm(Val));
} else if (RegClass == &WebAssembly::V128RegClass) {
+ // TODO: Replace this with v128.const 0 once that is supported in V8
Register TempReg = MRI.createVirtualRegister(&WebAssembly::I32RegClass);
MI->setDesc(TII->get(WebAssembly::SPLAT_v4i32));
MI->addOperand(MachineOperand::CreateReg(TempReg, false));
@@ -135,12 +137,12 @@ static void convertImplicitDefToConstZero(MachineInstr *MI,
// Determine whether a call to the callee referenced by
// MI->getOperand(CalleeOpNo) reads memory, writes memory, and/or has side
// effects.
-static void queryCallee(const MachineInstr &MI, unsigned CalleeOpNo, bool &Read,
- bool &Write, bool &Effects, bool &StackPointer) {
+static void queryCallee(const MachineInstr &MI, bool &Read, bool &Write,
+ bool &Effects, bool &StackPointer) {
// All calls can use the stack pointer.
StackPointer = true;
- const MachineOperand &MO = MI.getOperand(CalleeOpNo);
+ const MachineOperand &MO = WebAssembly::getCalleeOp(MI);
if (MO.isGlobal()) {
const Constant *GV = MO.getGlobal();
if (const auto *GA = dyn_cast<GlobalAlias>(GV))
@@ -246,14 +248,14 @@ static void query(const MachineInstr &MI, AliasAnalysis &AA, bool &Read,
}
// Check for writes to __stack_pointer global.
- if (MI.getOpcode() == WebAssembly::GLOBAL_SET_I32 &&
+ if ((MI.getOpcode() == WebAssembly::GLOBAL_SET_I32 ||
+ MI.getOpcode() == WebAssembly::GLOBAL_SET_I64) &&
strcmp(MI.getOperand(0).getSymbolName(), "__stack_pointer") == 0)
StackPointer = true;
// Analyze calls.
if (MI.isCall()) {
- unsigned CalleeOpNo = WebAssembly::getCalleeOpNo(MI.getOpcode());
- queryCallee(MI, CalleeOpNo, Read, Write, Effects, StackPointer);
+ queryCallee(MI, Read, Write, Effects, StackPointer);
}
}
@@ -313,25 +315,59 @@ static bool hasOneUse(unsigned Reg, MachineInstr *Def, MachineRegisterInfo &MRI,
// walking the block.
// TODO: Compute memory dependencies in a way that uses AliasAnalysis to be
// more precise.
-static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
- AliasAnalysis &AA, const MachineRegisterInfo &MRI) {
- assert(Def->getParent() == Insert->getParent());
+static bool isSafeToMove(const MachineOperand *Def, const MachineOperand *Use,
+ const MachineInstr *Insert, AliasAnalysis &AA,
+ const WebAssemblyFunctionInfo &MFI,
+ const MachineRegisterInfo &MRI) {
+ const MachineInstr *DefI = Def->getParent();
+ const MachineInstr *UseI = Use->getParent();
+ assert(DefI->getParent() == Insert->getParent());
+ assert(UseI->getParent() == Insert->getParent());
+
+ // The first def of a multivalue instruction can be stackified by moving,
+ // since the later defs can always be placed into locals if necessary. Later
+ // defs can only be stackified if all previous defs are already stackified
+ // since ExplicitLocals will not know how to place a def in a local if a
+ // subsequent def is stackified. But only one def can be stackified by moving
+ // the instruction, so it must be the first one.
+ //
+ // TODO: This could be loosened to be the first *live* def, but care would
+ // have to be taken to ensure the drops of the initial dead defs can be
+ // placed. This would require checking that no previous defs are used in the
+ // same instruction as subsequent defs.
+ if (Def != DefI->defs().begin())
+ return false;
+
+ // If any subsequent def is used prior to the current value by the same
+ // instruction in which the current value is used, we cannot stackify.
+ // Stackifying in this case would require that def to move below the current
+ // def on the stack, which cannot be achieved, even with locals.
+ for (const auto &SubsequentDef : drop_begin(DefI->defs(), 1)) {
+ for (const auto &PriorUse : UseI->uses()) {
+ if (&PriorUse == Use)
+ break;
+ if (PriorUse.isReg() && SubsequentDef.getReg() == PriorUse.getReg())
+ return false;
+ }
+ }
+
+ // If moving is a semantic nop, it is always allowed
+ const MachineBasicBlock *MBB = DefI->getParent();
+ auto NextI = std::next(MachineBasicBlock::const_iterator(DefI));
+ for (auto E = MBB->end(); NextI != E && NextI->isDebugInstr(); ++NextI)
+ ;
+ if (NextI == Insert)
+ return true;
// 'catch' and 'extract_exception' should be the first instruction of a BB and
// cannot move.
- if (Def->getOpcode() == WebAssembly::CATCH ||
- Def->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32) {
- const MachineBasicBlock *MBB = Def->getParent();
- auto NextI = std::next(MachineBasicBlock::const_iterator(Def));
- for (auto E = MBB->end(); NextI != E && NextI->isDebugInstr(); ++NextI)
- ;
- if (NextI != Insert)
- return false;
- }
+ if (DefI->getOpcode() == WebAssembly::CATCH ||
+ DefI->getOpcode() == WebAssembly::EXTRACT_EXCEPTION_I32)
+ return false;
// Check for register dependencies.
SmallVector<unsigned, 4> MutableRegisters;
- for (const MachineOperand &MO : Def->operands()) {
+ for (const MachineOperand &MO : DefI->operands()) {
if (!MO.isReg() || MO.isUndef())
continue;
Register Reg = MO.getReg();
@@ -361,7 +397,7 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
}
bool Read = false, Write = false, Effects = false, StackPointer = false;
- query(*Def, AA, Read, Write, Effects, StackPointer);
+ query(*DefI, AA, Read, Write, Effects, StackPointer);
// If the instruction does not access memory and has no side effects, it has
// no additional dependencies.
@@ -369,8 +405,8 @@ static bool isSafeToMove(const MachineInstr *Def, const MachineInstr *Insert,
if (!Read && !Write && !Effects && !StackPointer && !HasMutableRegisters)
return true;
- // Scan through the intervening instructions between Def and Insert.
- MachineBasicBlock::const_iterator D(Def), I(Insert);
+ // Scan through the intervening instructions between DefI and Insert.
+ MachineBasicBlock::const_iterator D(DefI), I(Insert);
for (--I; I != D; --I) {
bool InterveningRead = false;
bool InterveningWrite = false;
@@ -495,7 +531,7 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
if (MRI.hasOneDef(Reg) && MRI.hasOneUse(Reg)) {
// No one else is using this register for anything so we can just stackify
// it in place.
- MFI.stackifyVReg(Reg);
+ MFI.stackifyVReg(MRI, Reg);
} else {
// The register may have unrelated uses or defs; create a new register for
// just our one def and use so that we can stackify it.
@@ -512,7 +548,7 @@ static MachineInstr *moveForSingleUse(unsigned Reg, MachineOperand &Op,
LIS.getInstructionIndex(*Op.getParent()).getRegSlot(),
/*RemoveDeadValNo=*/true);
- MFI.stackifyVReg(NewReg);
+ MFI.stackifyVReg(MRI, NewReg);
DefDIs.updateReg(NewReg);
@@ -541,7 +577,7 @@ static MachineInstr *rematerializeCheapDef(
MachineInstr *Clone = &*std::prev(Insert);
LIS.InsertMachineInstrInMaps(*Clone);
LIS.createAndComputeVirtRegInterval(NewReg);
- MFI.stackifyVReg(NewReg);
+ MFI.stackifyVReg(MRI, NewReg);
imposeStackOrdering(Clone);
LLVM_DEBUG(dbgs() << " - Cloned to "; Clone->dump());
@@ -632,8 +668,8 @@ static MachineInstr *moveAndTeeForMultiUse(
// Finish stackifying the new regs.
LIS.createAndComputeVirtRegInterval(TeeReg);
LIS.createAndComputeVirtRegInterval(DefReg);
- MFI.stackifyVReg(DefReg);
- MFI.stackifyVReg(TeeReg);
+ MFI.stackifyVReg(MRI, DefReg);
+ MFI.stackifyVReg(MRI, TeeReg);
imposeStackOrdering(Def);
imposeStackOrdering(Tee);
@@ -801,32 +837,32 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
CommutingState Commuting;
TreeWalkerState TreeWalker(Insert);
while (!TreeWalker.done()) {
- MachineOperand &Op = TreeWalker.pop();
+ MachineOperand &Use = TreeWalker.pop();
// We're only interested in explicit virtual register operands.
- if (!Op.isReg())
+ if (!Use.isReg())
continue;
- Register Reg = Op.getReg();
- assert(Op.isUse() && "explicit_uses() should only iterate over uses");
- assert(!Op.isImplicit() &&
+ Register Reg = Use.getReg();
+ assert(Use.isUse() && "explicit_uses() should only iterate over uses");
+ assert(!Use.isImplicit() &&
"explicit_uses() should only iterate over explicit operands");
if (Register::isPhysicalRegister(Reg))
continue;
// Identify the definition for this register at this point.
- MachineInstr *Def = getVRegDef(Reg, Insert, MRI, LIS);
- if (!Def)
+ MachineInstr *DefI = getVRegDef(Reg, Insert, MRI, LIS);
+ if (!DefI)
continue;
// Don't nest an INLINE_ASM def into anything, because we don't have
// constraints for $pop outputs.
- if (Def->isInlineAsm())
+ if (DefI->isInlineAsm())
continue;
// Argument instructions represent live-in registers and not real
// instructions.
- if (WebAssembly::isArgument(Def->getOpcode()))
+ if (WebAssembly::isArgument(DefI->getOpcode()))
continue;
// Currently catch's return value register cannot be stackified, because
@@ -843,28 +879,38 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
// register should be assigned to a local to be propagated across
// 'block' boundary now.
//
- // TODO Fix this once we support the multi-value proposal.
- if (Def->getOpcode() == WebAssembly::CATCH)
+ // TODO: Fix this once we support the multivalue blocks
+ if (DefI->getOpcode() == WebAssembly::CATCH)
continue;
+ MachineOperand *Def = DefI->findRegisterDefOperand(Reg);
+ assert(Def != nullptr);
+
// Decide which strategy to take. Prefer to move a single-use value
// over cloning it, and prefer cloning over introducing a tee.
// For moving, we require the def to be in the same block as the use;
// this makes things simpler (LiveIntervals' handleMove function only
// supports intra-block moves) and it's MachineSink's job to catch all
// the sinking opportunities anyway.
- bool SameBlock = Def->getParent() == &MBB;
- bool CanMove = SameBlock && isSafeToMove(Def, Insert, AA, MRI) &&
+ bool SameBlock = DefI->getParent() == &MBB;
+ bool CanMove = SameBlock &&
+ isSafeToMove(Def, &Use, Insert, AA, MFI, MRI) &&
!TreeWalker.isOnStack(Reg);
- if (CanMove && hasOneUse(Reg, Def, MRI, MDT, LIS)) {
- Insert = moveForSingleUse(Reg, Op, Def, MBB, Insert, LIS, MFI, MRI);
- } else if (shouldRematerialize(*Def, AA, TII)) {
+ if (CanMove && hasOneUse(Reg, DefI, MRI, MDT, LIS)) {
+ Insert = moveForSingleUse(Reg, Use, DefI, MBB, Insert, LIS, MFI, MRI);
+
+ // If we are removing the frame base reg completely, remove the debug
+ // info as well.
+ // TODO: Encode this properly as a stackified value.
+ if (MFI.isFrameBaseVirtual() && MFI.getFrameBaseVreg() == Reg)
+ MFI.clearFrameBaseVreg();
+ } else if (shouldRematerialize(*DefI, AA, TII)) {
Insert =
- rematerializeCheapDef(Reg, Op, *Def, MBB, Insert->getIterator(),
+ rematerializeCheapDef(Reg, Use, *DefI, MBB, Insert->getIterator(),
LIS, MFI, MRI, TII, TRI);
- } else if (CanMove &&
- oneUseDominatesOtherUses(Reg, Op, MBB, MRI, MDT, LIS, MFI)) {
- Insert = moveAndTeeForMultiUse(Reg, Op, Def, MBB, Insert, LIS, MFI,
+ } else if (CanMove && oneUseDominatesOtherUses(Reg, Use, MBB, MRI, MDT,
+ LIS, MFI)) {
+ Insert = moveAndTeeForMultiUse(Reg, Use, DefI, MBB, Insert, LIS, MFI,
MRI, TII);
} else {
// We failed to stackify the operand. If the problem was ordering
@@ -875,6 +921,25 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
continue;
}
+ // Stackifying a multivalue def may unlock in-place stackification of
+ // subsequent defs. TODO: Handle the case where the consecutive uses are
+ // not all in the same instruction.
+ auto *SubsequentDef = Insert->defs().begin();
+ auto *SubsequentUse = &Use;
+ while (SubsequentDef != Insert->defs().end() &&
+ SubsequentUse != Use.getParent()->uses().end()) {
+ if (!SubsequentDef->isReg() || !SubsequentUse->isReg())
+ break;
+ unsigned DefReg = SubsequentDef->getReg();
+ unsigned UseReg = SubsequentUse->getReg();
+ // TODO: This single-use restriction could be relaxed by using tees
+ if (DefReg != UseReg || !MRI.hasOneUse(DefReg))
+ break;
+ MFI.stackifyVReg(MRI, DefReg);
+ ++SubsequentDef;
+ ++SubsequentUse;
+ }
+
// If the instruction we just stackified is an IMPLICIT_DEF, convert it
// to a constant 0 so that the def is explicit, and the push/pop
// correspondence is maintained.
@@ -912,18 +977,20 @@ bool WebAssemblyRegStackify::runOnMachineFunction(MachineFunction &MF) {
for (MachineInstr &MI : MBB) {
if (MI.isDebugInstr())
continue;
- for (MachineOperand &MO : reverse(MI.explicit_operands())) {
+ for (MachineOperand &MO : reverse(MI.explicit_uses())) {
if (!MO.isReg())
continue;
Register Reg = MO.getReg();
-
- if (MFI.isVRegStackified(Reg)) {
- if (MO.isDef())
- Stack.push_back(Reg);
- else
- assert(Stack.pop_back_val() == Reg &&
- "Register stack pop should be paired with a push");
- }
+ if (MFI.isVRegStackified(Reg))
+ assert(Stack.pop_back_val() == Reg &&
+ "Register stack pop should be paired with a push");
+ }
+ for (MachineOperand &MO : MI.defs()) {
+ if (!MO.isReg())
+ continue;
+ Register Reg = MO.getReg();
+ if (MFI.isVRegStackified(Reg))
+ Stack.push_back(MO.getReg());
}
}
// TODO: Generalize this code to support keeping values on the stack across
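
The two multivalue checks added to isSafeToMove above can be restated on plain indices: only the first def of the defining instruction may be stackified by moving the instruction, and no def that comes after the candidate may be consumed by the user before the candidate's own use, because that operand order could not be realized on the value stack. A self-contained sketch of that rule; names and types are illustrative, not LLVM's:

#include <vector>

// DefIdx: position of the candidate def among the defining instruction's defs.
// UseIdx: position of the candidate's use among the user's use operands.
// UseRegs: the user's use operands (as register ids), in operand order.
// LaterDefRegs: register ids of the defs that come after the candidate.
static bool firstDefRuleAllowsMove(unsigned DefIdx, unsigned UseIdx,
                                   const std::vector<unsigned> &UseRegs,
                                   const std::vector<unsigned> &LaterDefRegs) {
  // Only the first def can be stackified by moving the whole instruction.
  if (DefIdx != 0)
    return false;
  // A later def must not be used by the same instruction before the candidate.
  for (unsigned Reg : LaterDefRegs)
    for (unsigned I = 0; I < UseIdx; ++I)
      if (UseRegs[I] == Reg)
        return false;
  return true;
}

In the pass itself this check walks MachineOperand def and use lists; the sketch only isolates the ordering argument the comments describe.
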
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
index 789a025794ea..130589c9df8c 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRegisterInfo.cpp
@@ -88,16 +88,17 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
// If this is an address being added to a constant, fold the frame offset
// into the constant.
- if (MI.getOpcode() == WebAssembly::ADD_I32) {
+ if (MI.getOpcode() == WebAssemblyFrameLowering::getOpcAdd(MF)) {
MachineOperand &OtherMO = MI.getOperand(3 - FIOperandNum);
if (OtherMO.isReg()) {
Register OtherMOReg = OtherMO.getReg();
if (Register::isVirtualRegister(OtherMOReg)) {
MachineInstr *Def = MF.getRegInfo().getUniqueVRegDef(OtherMOReg);
// TODO: For now we just opportunistically do this in the case where
- // the CONST_I32 happens to have exactly one def and one use. We
+ // the CONST_I32/64 happens to have exactly one def and one use. We
// should generalize this to optimize in more cases.
- if (Def && Def->getOpcode() == WebAssembly::CONST_I32 &&
+ if (Def && Def->getOpcode() ==
+ WebAssemblyFrameLowering::getOpcConst(MF) &&
MRI.hasOneNonDBGUse(Def->getOperand(0).getReg())) {
MachineOperand &ImmMO = Def->getOperand(1);
ImmMO.setImm(ImmMO.getImm() + uint32_t(FrameOffset));
@@ -109,20 +110,22 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
}
}
- // Otherwise create an i32.add SP, offset and make it the operand.
+ // Otherwise create an i32/64.add SP, offset and make it the operand.
const auto *TII = MF.getSubtarget<WebAssemblySubtarget>().getInstrInfo();
unsigned FIRegOperand = FrameRegister;
if (FrameOffset) {
- // Create i32.add SP, offset and make it the operand.
+ // Create i32/64.add SP, offset and make it the operand.
const TargetRegisterClass *PtrRC =
MRI.getTargetRegisterInfo()->getPointerRegClass(MF);
Register OffsetOp = MRI.createVirtualRegister(PtrRC);
- BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::CONST_I32),
+ BuildMI(MBB, *II, II->getDebugLoc(),
+ TII->get(WebAssemblyFrameLowering::getOpcConst(MF)),
OffsetOp)
.addImm(FrameOffset);
FIRegOperand = MRI.createVirtualRegister(PtrRC);
- BuildMI(MBB, *II, II->getDebugLoc(), TII->get(WebAssembly::ADD_I32),
+ BuildMI(MBB, *II, II->getDebugLoc(),
+ TII->get(WebAssemblyFrameLowering::getOpcAdd(MF)),
FIRegOperand)
.addReg(FrameRegister)
.addReg(OffsetOp);
@@ -132,6 +135,10 @@ void WebAssemblyRegisterInfo::eliminateFrameIndex(
Register
WebAssemblyRegisterInfo::getFrameRegister(const MachineFunction &MF) const {
+ // If the PReg has been replaced by a VReg, return that.
+ const auto &MFI = MF.getInfo<WebAssemblyFunctionInfo>();
+ if (MFI->isFrameBaseVirtual())
+ return MFI->getFrameBaseVreg();
static const unsigned Regs[2][2] = {
/* !isArch64Bit isArch64Bit */
/* !hasFP */ {WebAssembly::SP32, WebAssembly::SP64},
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
index 5eafd6c54e78..9f5d6b2a9a47 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyReplacePhysRegs.cpp
@@ -67,7 +67,7 @@ bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
});
MachineRegisterInfo &MRI = MF.getRegInfo();
- const auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
+ auto &TRI = *MF.getSubtarget<WebAssemblySubtarget>().getRegisterInfo();
bool Changed = false;
assert(!mustPreserveAnalysisID(LiveIntervalsID) &&
@@ -88,8 +88,18 @@ bool WebAssemblyReplacePhysRegs::runOnMachineFunction(MachineFunction &MF) {
for (auto I = MRI.reg_begin(PReg), E = MRI.reg_end(); I != E;) {
MachineOperand &MO = *I++;
if (!MO.isImplicit()) {
- if (VReg == WebAssembly::NoRegister)
+ if (VReg == WebAssembly::NoRegister) {
VReg = MRI.createVirtualRegister(RC);
+ if (PReg == TRI.getFrameRegister(MF)) {
+ auto FI = MF.getInfo<WebAssemblyFunctionInfo>();
+ assert(!FI->isFrameBaseVirtual());
+ FI->setFrameBaseVreg(VReg);
+ LLVM_DEBUG({
+ dbgs() << "replacing preg " << PReg << " with " << VReg << " ("
+ << Register::virtReg2Index(VReg) << ")\n";
+ });
+ }
+ }
MO.setReg(VReg);
if (MO.getParent()->isDebugValue())
MO.setIsDebug();
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
index c6cf7b6bc551..6456026f4ba7 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp
@@ -82,6 +82,7 @@ enum RuntimeLibcallSignature {
func_iPTR_i32,
func_iPTR_i64,
func_iPTR_i64_i64,
+ func_iPTR_i64_i64_i32,
func_iPTR_i64_i64_i64_i64,
func_iPTR_i64_i64_i64_i64_i64_i64,
i32_func_i64_i64,
@@ -173,10 +174,13 @@ struct RuntimeLibcallSignatureTable {
Table[RTLIB::FMA_F128] = func_iPTR_i64_i64_i64_i64_i64_i64;
Table[RTLIB::POWI_F32] = f32_func_f32_i32;
Table[RTLIB::POWI_F64] = f64_func_f64_i32;
- Table[RTLIB::POWI_F128] = func_iPTR_i64_i64_i64_i64;
+ Table[RTLIB::POWI_F128] = func_iPTR_i64_i64_i32;
Table[RTLIB::SQRT_F32] = f32_func_f32;
Table[RTLIB::SQRT_F64] = f64_func_f64;
Table[RTLIB::SQRT_F128] = func_iPTR_i64_i64;
+ Table[RTLIB::CBRT_F32] = f32_func_f32;
+ Table[RTLIB::CBRT_F64] = f64_func_f64;
+ Table[RTLIB::CBRT_F128] = func_iPTR_i64_i64;
Table[RTLIB::LOG_F32] = f32_func_f32;
Table[RTLIB::LOG_F64] = f64_func_f64;
Table[RTLIB::LOG_F128] = func_iPTR_i64_i64;
@@ -829,6 +833,12 @@ void llvm::getLibcallSignature(const WebAssemblySubtarget &Subtarget,
Params.push_back(wasm::ValType::I64);
Params.push_back(wasm::ValType::I64);
break;
+ case func_iPTR_i64_i64_i32:
+ Params.push_back(PtrTy);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I64);
+ Params.push_back(wasm::ValType::I32);
+ break;
case func_iPTR_i64_i64_i64_i64:
Params.push_back(PtrTy);
Params.push_back(wasm::ValType::I64);
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
index 890e4b8e4e2a..16e05150c64e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.cpp
@@ -20,40 +20,40 @@ WebAssemblySelectionDAGInfo::~WebAssemblySelectionDAGInfo() = default; // anchor
SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool IsVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool IsVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
- if (!DAG.getMachineFunction()
- .getSubtarget<WebAssemblySubtarget>()
- .hasBulkMemory())
+ auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>();
+ if (!ST.hasBulkMemory())
return SDValue();
SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32);
+ auto LenMVT = ST.hasAddr64() ? MVT::i64 : MVT::i32;
return DAG.getNode(WebAssemblyISD::MEMORY_COPY, DL, MVT::Other,
{Chain, MemIdx, MemIdx, Dst, Src,
- DAG.getZExtOrTrunc(Size, DL, MVT::i32)});
+ DAG.getZExtOrTrunc(Size, DL, LenMVT)});
}
SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemmove(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool IsVolatile,
+ SDValue Op3, Align Alignment, bool IsVolatile,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
- return EmitTargetCodeForMemcpy(DAG, DL, Chain, Op1, Op2, Op3, Align,
- IsVolatile, false, DstPtrInfo,
- SrcPtrInfo);
+ return EmitTargetCodeForMemcpy(DAG, DL, Chain, Op1, Op2, Op3,
+ Alignment, IsVolatile, false,
+ DstPtrInfo, SrcPtrInfo);
}
SDValue WebAssemblySelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &DL, SDValue Chain, SDValue Dst, SDValue Val,
- SDValue Size, unsigned Align, bool IsVolatile,
+ SDValue Size, Align Alignment, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const {
- if (!DAG.getMachineFunction()
- .getSubtarget<WebAssemblySubtarget>()
- .hasBulkMemory())
+ auto &ST = DAG.getMachineFunction().getSubtarget<WebAssemblySubtarget>();
+ if (!ST.hasBulkMemory())
return SDValue();
SDValue MemIdx = DAG.getConstant(0, DL, MVT::i32);
+ auto LenMVT = ST.hasAddr64() ? MVT::i64 : MVT::i32;
// Only low byte matters for val argument, so anyext the i8
return DAG.getNode(WebAssemblyISD::MEMORY_FILL, DL, MVT::Other, Chain, MemIdx,
Dst, DAG.getAnyExtOrTrunc(Val, DL, MVT::i32),
- DAG.getZExtOrTrunc(Size, DL, MVT::i32));
+ DAG.getZExtOrTrunc(Size, DL, LenMVT));
}
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
index 0b90ece27dff..f4d2132fd3af 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySelectionDAGInfo.h
@@ -24,18 +24,19 @@ public:
~WebAssemblySelectionDAGInfo() override;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool isVolatile,
+ SDValue Op3, Align Alignment, bool isVolatile,
bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
- SDValue EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl,
- SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool isVolatile,
- MachinePointerInfo DstPtrInfo,
- MachinePointerInfo SrcPtrInfo) const override;
+ SDValue
+ EmitTargetCodeForMemmove(SelectionDAG &DAG, const SDLoc &dl, SDValue Chain,
+ SDValue Op1, SDValue Op2, SDValue Op3,
+ Align Alignment, bool isVolatile,
+ MachinePointerInfo DstPtrInfo,
+ MachinePointerInfo SrcPtrInfo) const override;
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &DL,
SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool IsVolatile,
+ SDValue Op3, Align Alignment, bool IsVolatile,
MachinePointerInfo DstPtrInfo) const override;
};
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
index a249ccf17638..89ae45722e42 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySetP2AlignOperands.cpp
@@ -65,7 +65,7 @@ static void rewriteP2Align(MachineInstr &MI, unsigned OperandNo) {
assert(MI.getDesc().OpInfo[OperandNo].OperandType ==
WebAssembly::OPERAND_P2ALIGN &&
"Load and store instructions should have a p2align operand");
- uint64_t P2Align = Log2_64((*MI.memoperands_begin())->getAlignment());
+ uint64_t P2Align = Log2((*MI.memoperands_begin())->getAlign());
// WebAssembly does not currently support supernatural alignment.
P2Align = std::min(P2Align,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
index 196a74565285..cacf5ab078a0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.cpp
@@ -25,13 +25,15 @@ using namespace llvm;
#include "WebAssemblyGenSubtargetInfo.inc"
WebAssemblySubtarget &
-WebAssemblySubtarget::initializeSubtargetDependencies(StringRef FS) {
+WebAssemblySubtarget::initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS) {
// Determine default and user-specified characteristics
+ LLVM_DEBUG(llvm::dbgs() << "initializeSubtargetDependencies\n");
- if (CPUString.empty())
- CPUString = "generic";
+ if (CPU.empty())
+ CPU = "generic";
- ParseSubtargetFeatures(CPUString, FS);
+ ParseSubtargetFeatures(CPU, FS);
return *this;
}
@@ -39,10 +41,9 @@ WebAssemblySubtarget::WebAssemblySubtarget(const Triple &TT,
const std::string &CPU,
const std::string &FS,
const TargetMachine &TM)
- : WebAssemblyGenSubtargetInfo(TT, CPU, FS), CPUString(CPU),
- TargetTriple(TT), FrameLowering(),
- InstrInfo(initializeSubtargetDependencies(FS)), TSInfo(),
- TLInfo(TM, *this) {}
+ : WebAssemblyGenSubtargetInfo(TT, CPU, FS), TargetTriple(TT),
+ FrameLowering(), InstrInfo(initializeSubtargetDependencies(CPU, FS)),
+ TSInfo(), TLInfo(TM, *this) {}
bool WebAssemblySubtarget::enableAtomicExpand() const {
// If atomics are disabled, atomic ops are lowered instead of expanded
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
index 8db2120f9834..8b95a3ddb837 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblySubtarget.h
@@ -47,9 +47,7 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
bool HasMultivalue = false;
bool HasMutableGlobals = false;
bool HasTailCall = false;
-
- /// String name of used CPU.
- std::string CPUString;
+ bool HasReferenceTypes = false;
/// What processor and OS we're targeting.
Triple TargetTriple;
@@ -59,9 +57,8 @@ class WebAssemblySubtarget final : public WebAssemblyGenSubtargetInfo {
WebAssemblySelectionDAGInfo TSInfo;
WebAssemblyTargetLowering TLInfo;
- /// Initializes using CPUString and the passed in feature string so that we
- /// can use initializer lists for subtarget initialization.
- WebAssemblySubtarget &initializeSubtargetDependencies(StringRef FS);
+ WebAssemblySubtarget &initializeSubtargetDependencies(StringRef CPU,
+ StringRef FS);
public:
/// This constructor initializes the data members to match that
@@ -104,6 +101,7 @@ public:
bool hasMultivalue() const { return HasMultivalue; }
bool hasMutableGlobals() const { return HasMutableGlobals; }
bool hasTailCall() const { return HasTailCall; }
+ bool hasReferenceTypes() const { return HasReferenceTypes; }
/// Parses features string setting specified subtarget options. Definition of
/// function is auto generated by tblgen.
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
index 4291b48c16be..7bf655c925a4 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.cpp
@@ -45,6 +45,16 @@ static cl::opt<bool> EnableEmSjLj(
cl::desc("WebAssembly Emscripten-style setjmp/longjmp handling"),
cl::init(false));
+// A command-line option to keep implicit locals
+// for the purpose of testing with lit/llc ONLY.
+// This produces output which is not valid WebAssembly, and is not supported
+// by assemblers/disassemblers and other MC based tools.
+static cl::opt<bool> WasmDisableExplicitLocals(
+ "wasm-disable-explicit-locals", cl::Hidden,
+ cl::desc("WebAssembly: output implicit locals in"
+ " instruction output for test purposes only."),
+ cl::init(false));
+
extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
// Register the target.
RegisterTargetMachine<WebAssemblyTargetMachine> X(
@@ -75,8 +85,8 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeWebAssemblyTarget() {
initializeWebAssemblyExplicitLocalsPass(PR);
initializeWebAssemblyLowerBrUnlessPass(PR);
initializeWebAssemblyRegNumberingPass(PR);
+ initializeWebAssemblyDebugFixupPass(PR);
initializeWebAssemblyPeepholePass(PR);
- initializeWebAssemblyCallIndirectFixupPass(PR);
}
//===----------------------------------------------------------------------===//
@@ -210,8 +220,8 @@ private:
FeatureBitset coalesceFeatures(const Module &M) {
FeatureBitset Features =
WasmTM
- ->getSubtargetImpl(WasmTM->getTargetCPU(),
- WasmTM->getTargetFeatureString())
+ ->getSubtargetImpl(std::string(WasmTM->getTargetCPU()),
+ std::string(WasmTM->getTargetFeatureString()))
->getFeatureBits();
for (auto &F : M)
Features |= WasmTM->getSubtargetImpl(F)->getFeatureBits();
@@ -274,21 +284,22 @@ private:
void recordFeatures(Module &M, const FeatureBitset &Features, bool Stripped) {
for (const SubtargetFeatureKV &KV : WebAssemblyFeatureKV) {
- std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str();
- if (KV.Value == WebAssembly::FeatureAtomics && Stripped) {
- // "atomics" is special: code compiled without atomics may have had its
- // atomics lowered to nonatomic operations. In that case, atomics is
- // disallowed to prevent unsafe linking with atomics-enabled objects.
- assert(!Features[WebAssembly::FeatureAtomics] ||
- !Features[WebAssembly::FeatureBulkMemory]);
- M.addModuleFlag(Module::ModFlagBehavior::Error, MDKey,
- wasm::WASM_FEATURE_PREFIX_DISALLOWED);
- } else if (Features[KV.Value]) {
- // Otherwise features are marked Used or not mentioned
+ if (Features[KV.Value]) {
+ // Mark features as used
+ std::string MDKey = (StringRef("wasm-feature-") + KV.Key).str();
M.addModuleFlag(Module::ModFlagBehavior::Error, MDKey,
wasm::WASM_FEATURE_PREFIX_USED);
}
}
+ // Code compiled without atomics or bulk-memory may have had its atomics or
+ // thread-local data lowered to nonatomic operations or non-thread-local
+ // data. In that case, we mark the pseudo-feature "shared-mem" as disallowed
+ // to tell the linker that it would be unsafe to allow this code to be used
+ // in a module with shared memory.
+ if (Stripped) {
+ M.addModuleFlag(Module::ModFlagBehavior::Error, "wasm-feature-shared-mem",
+ wasm::WASM_FEATURE_PREFIX_DISALLOWED);
+ }
}
};
char CoalesceFeaturesAndStripAtomics::ID = 0;
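For illustration, a module flag recorded by recordFeatures() could be read back roughly as follows (sharedMemDisallowed is an assumed helper, not part of the patch; the real consumer of these flags is the object-writer/linker path):

  // Assumes "llvm/IR/Module.h", "llvm/BinaryFormat/Wasm.h", and
  // `using namespace llvm;` as in the surrounding code.
  static bool sharedMemDisallowed(const Module &M) {
    auto *Flag = mdconst::extract_or_null<ConstantInt>(
        M.getModuleFlag("wasm-feature-shared-mem"));
    return Flag &&
           Flag->getZExtValue() == wasm::WASM_FEATURE_PREFIX_DISALLOWED;
  }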
@@ -395,6 +406,10 @@ bool WebAssemblyPassConfig::addInstSelector() {
// it's inconvenient to collect. Collect it now, and update the immediate
// operands.
addPass(createWebAssemblySetP2AlignOperands());
+
+ // Eliminate range checks and add default targets to br_table instructions.
+ addPass(createWebAssemblyFixBrTableDefaults());
+
return false;
}
@@ -423,11 +438,6 @@ void WebAssemblyPassConfig::addPostRegAlloc() {
void WebAssemblyPassConfig::addPreEmitPass() {
TargetPassConfig::addPreEmitPass();
- // Rewrite pseudo call_indirect instructions as real instructions.
- // This needs to run before register stackification, because we change the
- // order of the arguments.
- addPass(createWebAssemblyCallIndirectFixup());
-
// Eliminate multiple-entry loops.
addPass(createWebAssemblyFixIrreducibleControlFlow());
@@ -472,7 +482,8 @@ void WebAssemblyPassConfig::addPreEmitPass() {
addPass(createWebAssemblyCFGStackify());
// Insert explicit local.get and local.set operators.
- addPass(createWebAssemblyExplicitLocals());
+ if (!WasmDisableExplicitLocals)
+ addPass(createWebAssemblyExplicitLocals());
// Lower br_unless into br_if.
addPass(createWebAssemblyLowerBrUnless());
@@ -483,6 +494,10 @@ void WebAssemblyPassConfig::addPreEmitPass() {
// Create a mapping from LLVM CodeGen virtual registers to wasm registers.
addPass(createWebAssemblyRegNumbering());
+
+ // Fix debug_values whose defs have been stackified.
+ if (!WasmDisableExplicitLocals)
+ addPass(createWebAssemblyDebugFixup());
}
yaml::MachineFunctionInfo *
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
index 850e6b9a9e9e..dd5b39773313 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetMachine.h
@@ -47,7 +47,7 @@ public:
TargetTransformInfo getTargetTransformInfo(const Function &F) override;
- bool usesPhysRegsForPEI() const override { return false; }
+ bool usesPhysRegsForValues() const override { return false; }
yaml::MachineFunctionInfo *createDefaultFuncInfoYAML() const override;
yaml::MachineFunctionInfo *
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
index ac8ad927d334..28703a2787e0 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.cpp
@@ -44,13 +44,14 @@ unsigned WebAssemblyTTIImpl::getRegisterBitWidth(bool Vector) const {
}
unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
- unsigned Opcode, Type *Ty, TTI::OperandValueKind Opd1Info,
+ unsigned Opcode, Type *Ty, TTI::TargetCostKind CostKind,
+ TTI::OperandValueKind Opd1Info,
TTI::OperandValueKind Opd2Info, TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo, ArrayRef<const Value *> Args,
const Instruction *CxtI) {
unsigned Cost = BasicTTIImplBase<WebAssemblyTTIImpl>::getArithmeticInstrCost(
- Opcode, Ty, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
+ Opcode, Ty, CostKind, Opd1Info, Opd2Info, Opd1PropInfo, Opd2PropInfo);
if (auto *VTy = dyn_cast<VectorType>(Ty)) {
switch (Opcode) {
@@ -62,10 +63,11 @@ unsigned WebAssemblyTTIImpl::getArithmeticInstrCost(
// approximation.
if (Opd2Info != TTI::OK_UniformValue &&
Opd2Info != TTI::OK_UniformConstantValue)
- Cost = VTy->getNumElements() *
- (TargetTransformInfo::TCC_Basic +
- getArithmeticInstrCost(Opcode, VTy->getElementType()) +
- TargetTransformInfo::TCC_Basic);
+ Cost =
+ cast<FixedVectorType>(VTy)->getNumElements() *
+ (TargetTransformInfo::TCC_Basic +
+ getArithmeticInstrCost(Opcode, VTy->getElementType(), CostKind) +
+ TargetTransformInfo::TCC_Basic);
break;
}
}
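A hedged caller-side sketch of the updated interface (vectorShiftCost is a hypothetical helper; only the extra CostKind parameter comes from this patch):

  static unsigned vectorShiftCost(WebAssemblyTTIImpl &TTI, Type *VecTy) {
    // The cost kind is now chosen explicitly at each query site; here we ask
    // for reciprocal throughput instead of the TCK_SizeAndLatency default.
    return TTI.getArithmeticInstrCost(Instruction::Shl, VecTy,
                                      TargetTransformInfo::TCK_RecipThroughput);
  }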
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
index 2731dda10bec..79588a9f5669 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyTargetTransformInfo.h
@@ -57,6 +57,7 @@ public:
unsigned getRegisterBitWidth(bool Vector) const;
unsigned getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
index a237da8154ab..bc2bb4fd6935 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.cpp
@@ -49,7 +49,7 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
if (!MI.isCall())
return false;
- const MachineOperand &MO = MI.getOperand(getCalleeOpNo(MI.getOpcode()));
+ const MachineOperand &MO = getCalleeOp(MI);
assert(MO.isGlobal() || MO.isSymbol());
if (MO.isSymbol()) {
@@ -79,3 +79,20 @@ bool WebAssembly::mayThrow(const MachineInstr &MI) {
// original LLVM IR? (Even when the callee may throw)
return true;
}
+
+const MachineOperand &WebAssembly::getCalleeOp(const MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case WebAssembly::CALL:
+ case WebAssembly::CALL_S:
+ case WebAssembly::RET_CALL:
+ case WebAssembly::RET_CALL_S:
+ return MI.getOperand(MI.getNumExplicitDefs());
+ case WebAssembly::CALL_INDIRECT:
+ case WebAssembly::CALL_INDIRECT_S:
+ case WebAssembly::RET_CALL_INDIRECT:
+ case WebAssembly::RET_CALL_INDIRECT_S:
+ return MI.getOperand(MI.getNumOperands() - 1);
+ default:
+ llvm_unreachable("Not a call instruction");
+ }
+}
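A small usage sketch for the new helper, in the spirit of the mayThrow() change above (callsNamedSymbol is a hypothetical function, not part of the patch):

  static bool callsNamedSymbol(const MachineInstr &MI, StringRef Name) {
    if (!MI.isCall())
      return false;
    // getCalleeOp() knows where the callee operand lives for each call
    // opcode, so callers no longer compute an operand index themselves.
    const MachineOperand &Callee = WebAssembly::getCalleeOp(MI);
    return Callee.isSymbol() && Name == Callee.getSymbolName();
  }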
diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
index 26cf84de89b9..4f0ed43a2481 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyUtilities.h
@@ -44,6 +44,10 @@ template <typename T> MachineBasicBlock *getBottom(const T *Unit) {
return Bottom;
}
+/// Returns the callee operand of MI, assuming the argument is a call
+/// instruction.
+const MachineOperand &getCalleeOp(const MachineInstr &MI);
+
} // end namespace WebAssembly
} // end namespace llvm
diff --git a/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt b/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
index 701b347bcbd7..c9f7574b9a41 100644
--- a/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
+++ b/llvm/lib/Target/WebAssembly/known_gcc_test_failures.txt
@@ -86,7 +86,6 @@ lifetime2.C # violates C++ DR1696
# WASI doesn't have setjmp.h yet
pr56982.c
-simd-2.C
# WASI doesn't have pthread.h yet
thread_local3.C
diff --git a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
index d37d812df485..a3014b2aba92 100644
--- a/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
+++ b/llvm/lib/Target/X86/AsmParser/X86AsmParser.cpp
@@ -31,6 +31,7 @@
#include "llvm/MC/MCStreamer.h"
#include "llvm/MC/MCSubtargetInfo.h"
#include "llvm/MC/MCSymbol.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/SourceMgr.h"
#include "llvm/Support/TargetRegistry.h"
#include "llvm/Support/raw_ostream.h"
@@ -39,6 +40,11 @@
using namespace llvm;
+static cl::opt<bool> LVIInlineAsmHardening(
+ "x86-experimental-lvi-inline-asm-hardening",
+ cl::desc("Harden inline assembly code that may be vulnerable to Load Value"
+ " Injection (LVI). This feature is experimental."), cl::Hidden);
+
static bool checkScale(unsigned Scale, StringRef &ErrMsg) {
if (Scale != 1 && Scale != 2 && Scale != 4 && Scale != 8) {
ErrMsg = "scale factor in address must be 1, 2, 4 or 8";
@@ -74,7 +80,7 @@ class X86AsmParser : public MCTargetAsmParser {
enum VEXEncoding {
VEXEncoding_Default,
- VEXEncoding_VEX2,
+ VEXEncoding_VEX,
VEXEncoding_VEX3,
VEXEncoding_EVEX,
};
@@ -326,6 +332,7 @@ private:
IES_PLUS,
IES_MINUS,
IES_OFFSET,
+ IES_CAST,
IES_NOT,
IES_MULTIPLY,
IES_DIVIDE,
@@ -352,6 +359,7 @@ private:
bool MemExpr;
bool OffsetOperator;
SMLoc OffsetOperatorLoc;
+ StringRef CurType;
bool setSymRef(const MCExpr *Val, StringRef ID, StringRef &ErrMsg) {
if (Sym) {
@@ -379,6 +387,7 @@ private:
unsigned getScale() { return Scale; }
const MCExpr *getSym() { return Sym; }
StringRef getSymName() { return SymName; }
+ StringRef getType() { return CurType; }
int64_t getImm() { return Imm + IC.execute(); }
bool isValidEndState() {
return State == IES_RBRAC || State == IES_INTEGER;
@@ -611,9 +620,9 @@ private:
}
bool onIdentifierExpr(const MCExpr *SymRef, StringRef SymRefName,
const InlineAsmIdentifierInfo &IDInfo,
- bool ParsingInlineAsm, StringRef &ErrMsg) {
+ bool ParsingMSInlineAsm, StringRef &ErrMsg) {
// InlineAsm: Treat an enum value as an integer
- if (ParsingInlineAsm)
+ if (ParsingMSInlineAsm)
if (IDInfo.isKind(InlineAsmIdentifierInfo::IK_EnumVal))
return onInteger(IDInfo.Enum.EnumVal, ErrMsg);
// Treat a symbolic constant like an integer
@@ -624,6 +633,7 @@ private:
default:
State = IES_ERROR;
break;
+ case IES_CAST:
case IES_PLUS:
case IES_MINUS:
case IES_NOT:
@@ -634,7 +644,7 @@ private:
MemExpr = true;
State = IES_INTEGER;
IC.pushOperand(IC_IMM);
- if (ParsingInlineAsm)
+ if (ParsingMSInlineAsm)
Info = IDInfo;
break;
}
@@ -736,6 +746,7 @@ private:
IC.pushOperator(IC_PLUS);
break;
case IES_INIT:
+ case IES_CAST:
assert(!BracCount && "BracCount should be zero on parsing's start");
State = IES_LBRAC;
break;
@@ -808,6 +819,7 @@ private:
case IES_INTEGER:
case IES_OFFSET:
case IES_REGISTER:
+ case IES_RBRAC:
case IES_RPAREN:
State = IES_RPAREN;
IC.pushOperator(IC_RPAREN);
@@ -815,7 +827,7 @@ private:
}
}
bool onOffset(const MCExpr *Val, SMLoc OffsetLoc, StringRef ID,
- const InlineAsmIdentifierInfo &IDInfo, bool ParsingInlineAsm,
+ const InlineAsmIdentifierInfo &IDInfo, bool ParsingMSInlineAsm,
StringRef &ErrMsg) {
PrevState = State;
switch (State) {
@@ -833,13 +845,26 @@ private:
// As we cannot yet resolve the actual value (offset), we retain
// the requested semantics by pushing a '0' to the operands stack
IC.pushOperand(IC_IMM);
- if (ParsingInlineAsm) {
+ if (ParsingMSInlineAsm) {
Info = IDInfo;
}
break;
}
return false;
}
+ void onCast(StringRef Type) {
+ PrevState = State;
+ switch (State) {
+ default:
+ State = IES_ERROR;
+ break;
+ case IES_LPAREN:
+ setType(Type);
+ State = IES_CAST;
+ break;
+ }
+ }
+ void setType(StringRef Type) { CurType = Type; }
};
bool Error(SMLoc L, const Twine &Msg, SMRange Range = None,
@@ -858,6 +883,11 @@ private:
return nullptr;
}
+ bool MatchRegisterByName(unsigned &RegNo, StringRef RegName, SMLoc StartLoc,
+ SMLoc EndLoc);
+ bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc,
+ bool RestoreOnFailure);
+
std::unique_ptr<X86Operand> DefaultMemSIOperand(SMLoc Loc);
std::unique_ptr<X86Operand> DefaultMemDIOperand(SMLoc Loc);
bool IsSIReg(unsigned Reg);
@@ -896,10 +926,10 @@ private:
bool ParseIntelMemoryOperandSize(unsigned &Size);
std::unique_ptr<X86Operand>
- CreateMemForInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
- unsigned IndexReg, unsigned Scale, SMLoc Start,
- SMLoc End, unsigned Size, StringRef Identifier,
- const InlineAsmIdentifierInfo &Info);
+ CreateMemForMSInlineAsm(unsigned SegReg, const MCExpr *Disp, unsigned BaseReg,
+ unsigned IndexReg, unsigned Scale, SMLoc Start,
+ SMLoc End, unsigned Size, StringRef Identifier,
+ const InlineAsmIdentifierInfo &Info);
bool parseDirectiveEven(SMLoc L);
bool ParseDirectiveCode(StringRef IDVal, SMLoc L);
@@ -927,9 +957,14 @@ private:
bool validateInstruction(MCInst &Inst, const OperandVector &Ops);
bool processInstruction(MCInst &Inst, const OperandVector &Ops);
- /// Wrapper around MCStreamer::EmitInstruction(). Possibly adds
+ // Load Value Injection (LVI) Mitigations for machine code
+ void emitWarningForSpecialLVIInstruction(SMLoc Loc);
+ void applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out);
+ void applyLVILoadHardeningMitigation(MCInst &Inst, MCStreamer &Out);
+
+ /// Wrapper around MCStreamer::emitInstruction(). Possibly adds
/// instrumentation around Inst.
- void EmitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
+ void emitInstruction(MCInst &Inst, OperandVector &Operands, MCStreamer &Out);
bool MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
OperandVector &Operands, MCStreamer &Out,
@@ -1023,6 +1058,8 @@ public:
}
bool ParseRegister(unsigned &RegNo, SMLoc &StartLoc, SMLoc &EndLoc) override;
+ OperandMatchResultTy tryParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) override;
bool parsePrimaryExpr(const MCExpr *&Res, SMLoc &EndLoc) override;
@@ -1129,36 +1166,21 @@ static bool CheckBaseRegAndIndexRegAndScale(unsigned BaseReg, unsigned IndexReg,
return checkScale(Scale, ErrMsg);
}
-bool X86AsmParser::ParseRegister(unsigned &RegNo,
- SMLoc &StartLoc, SMLoc &EndLoc) {
- MCAsmParser &Parser = getParser();
- RegNo = 0;
- const AsmToken &PercentTok = Parser.getTok();
- StartLoc = PercentTok.getLoc();
-
+bool X86AsmParser::MatchRegisterByName(unsigned &RegNo, StringRef RegName,
+ SMLoc StartLoc, SMLoc EndLoc) {
// If we encounter a %, ignore it. This code handles registers with and
// without the prefix, unprefixed registers can occur in cfi directives.
- if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent))
- Parser.Lex(); // Eat percent token.
+ RegName.consume_front("%");
- const AsmToken &Tok = Parser.getTok();
- EndLoc = Tok.getEndLoc();
-
- if (Tok.isNot(AsmToken::Identifier)) {
- if (isParsingIntelSyntax()) return true;
- return Error(StartLoc, "invalid register name",
- SMRange(StartLoc, EndLoc));
- }
-
- RegNo = MatchRegisterName(Tok.getString());
+ RegNo = MatchRegisterName(RegName);
// If the match failed, try the register name as lowercase.
if (RegNo == 0)
- RegNo = MatchRegisterName(Tok.getString().lower());
+ RegNo = MatchRegisterName(RegName.lower());
// The "flags" and "mxcsr" registers cannot be referenced directly.
// Treat it as an identifier instead.
- if (isParsingInlineAsm() && isParsingIntelSyntax() &&
+ if (isParsingMSInlineAsm() && isParsingIntelSyntax() &&
(RegNo == X86::EFLAGS || RegNo == X86::MXCSR))
RegNo = 0;
@@ -1172,27 +1194,137 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
X86MCRegisterClasses[X86::GR64RegClassID].contains(RegNo) ||
X86II::isX86_64NonExtLowByteReg(RegNo) ||
X86II::isX86_64ExtendedReg(RegNo)) {
- StringRef RegName = Tok.getString();
- Parser.Lex(); // Eat register name.
return Error(StartLoc,
"register %" + RegName + " is only available in 64-bit mode",
SMRange(StartLoc, EndLoc));
}
}
+ // If this is "db[0-15]", match it as an alias
+ // for dr[0-15].
+ if (RegNo == 0 && RegName.startswith("db")) {
+ if (RegName.size() == 3) {
+ switch (RegName[2]) {
+ case '0':
+ RegNo = X86::DR0;
+ break;
+ case '1':
+ RegNo = X86::DR1;
+ break;
+ case '2':
+ RegNo = X86::DR2;
+ break;
+ case '3':
+ RegNo = X86::DR3;
+ break;
+ case '4':
+ RegNo = X86::DR4;
+ break;
+ case '5':
+ RegNo = X86::DR5;
+ break;
+ case '6':
+ RegNo = X86::DR6;
+ break;
+ case '7':
+ RegNo = X86::DR7;
+ break;
+ case '8':
+ RegNo = X86::DR8;
+ break;
+ case '9':
+ RegNo = X86::DR9;
+ break;
+ }
+ } else if (RegName.size() == 4 && RegName[2] == '1') {
+ switch (RegName[3]) {
+ case '0':
+ RegNo = X86::DR10;
+ break;
+ case '1':
+ RegNo = X86::DR11;
+ break;
+ case '2':
+ RegNo = X86::DR12;
+ break;
+ case '3':
+ RegNo = X86::DR13;
+ break;
+ case '4':
+ RegNo = X86::DR14;
+ break;
+ case '5':
+ RegNo = X86::DR15;
+ break;
+ }
+ }
+ }
+
+ if (RegNo == 0) {
+ if (isParsingIntelSyntax())
+ return true;
+ return Error(StartLoc, "invalid register name", SMRange(StartLoc, EndLoc));
+ }
+ return false;
+}
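The explicit db<N> switch above could also be written as a small table lookup; the sketch below is an assumed-equivalent alternative (slightly more permissive, e.g. it would also accept a leading-zero spelling such as "db07"):

  // Hypothetical helper, not part of the patch: maps "db0".."db15" to
  // X86::DR0..X86::DR15 and returns X86::NoRegister otherwise.
  static unsigned matchDbAlias(StringRef RegName) {
    static const unsigned DRs[] = {
        X86::DR0,  X86::DR1,  X86::DR2,  X86::DR3,  X86::DR4,  X86::DR5,
        X86::DR6,  X86::DR7,  X86::DR8,  X86::DR9,  X86::DR10, X86::DR11,
        X86::DR12, X86::DR13, X86::DR14, X86::DR15};
    unsigned N;
    if (RegName.startswith("db") &&
        !RegName.drop_front(2).getAsInteger(10, N) && N < 16)
      return DRs[N];
    return X86::NoRegister;
  }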
+
+bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc, bool RestoreOnFailure) {
+ MCAsmParser &Parser = getParser();
+ MCAsmLexer &Lexer = getLexer();
+ RegNo = 0;
+
+ SmallVector<AsmToken, 5> Tokens;
+ auto OnFailure = [RestoreOnFailure, &Lexer, &Tokens]() {
+ if (RestoreOnFailure) {
+ while (!Tokens.empty()) {
+ Lexer.UnLex(Tokens.pop_back_val());
+ }
+ }
+ };
+
+ const AsmToken &PercentTok = Parser.getTok();
+ StartLoc = PercentTok.getLoc();
+
+ // If we encounter a %, ignore it. This code handles registers with and
+ // without the prefix, unprefixed registers can occur in cfi directives.
+ if (!isParsingIntelSyntax() && PercentTok.is(AsmToken::Percent)) {
+ Tokens.push_back(PercentTok);
+ Parser.Lex(); // Eat percent token.
+ }
+
+ const AsmToken &Tok = Parser.getTok();
+ EndLoc = Tok.getEndLoc();
+
+ if (Tok.isNot(AsmToken::Identifier)) {
+ OnFailure();
+ if (isParsingIntelSyntax()) return true;
+ return Error(StartLoc, "invalid register name",
+ SMRange(StartLoc, EndLoc));
+ }
+
+ if (MatchRegisterByName(RegNo, Tok.getString(), StartLoc, EndLoc)) {
+ OnFailure();
+ return true;
+ }
+
// Parse "%st" as "%st(0)" and "%st(1)", which is multiple tokens.
if (RegNo == X86::ST0) {
+ Tokens.push_back(Tok);
Parser.Lex(); // Eat 'st'
// Check to see if we have '(4)' after %st.
- if (getLexer().isNot(AsmToken::LParen))
+ if (Lexer.isNot(AsmToken::LParen))
return false;
// Lex the paren.
- getParser().Lex();
+ Tokens.push_back(Parser.getTok());
+ Parser.Lex();
const AsmToken &IntTok = Parser.getTok();
- if (IntTok.isNot(AsmToken::Integer))
+ if (IntTok.isNot(AsmToken::Integer)) {
+ OnFailure();
return Error(IntTok.getLoc(), "expected stack index");
+ }
switch (IntTok.getIntVal()) {
case 0: RegNo = X86::ST0; break;
case 1: RegNo = X86::ST1; break;
@@ -1202,11 +1334,18 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
case 5: RegNo = X86::ST5; break;
case 6: RegNo = X86::ST6; break;
case 7: RegNo = X86::ST7; break;
- default: return Error(IntTok.getLoc(), "invalid stack index");
+ default:
+ OnFailure();
+ return Error(IntTok.getLoc(), "invalid stack index");
}
- if (getParser().Lex().isNot(AsmToken::RParen))
+ // Lex IntTok
+ Tokens.push_back(IntTok);
+ Parser.Lex();
+ if (Lexer.isNot(AsmToken::RParen)) {
+ OnFailure();
return Error(Parser.getTok().getLoc(), "expected ')'");
+ }
EndLoc = Parser.getTok().getEndLoc();
Parser.Lex(); // Eat ')'
@@ -1215,41 +1354,8 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
EndLoc = Parser.getTok().getEndLoc();
- // If this is "db[0-15]", match it as an alias
- // for dr[0-15].
- if (RegNo == 0 && Tok.getString().startswith("db")) {
- if (Tok.getString().size() == 3) {
- switch (Tok.getString()[2]) {
- case '0': RegNo = X86::DR0; break;
- case '1': RegNo = X86::DR1; break;
- case '2': RegNo = X86::DR2; break;
- case '3': RegNo = X86::DR3; break;
- case '4': RegNo = X86::DR4; break;
- case '5': RegNo = X86::DR5; break;
- case '6': RegNo = X86::DR6; break;
- case '7': RegNo = X86::DR7; break;
- case '8': RegNo = X86::DR8; break;
- case '9': RegNo = X86::DR9; break;
- }
- } else if (Tok.getString().size() == 4 && Tok.getString()[2] == '1') {
- switch (Tok.getString()[3]) {
- case '0': RegNo = X86::DR10; break;
- case '1': RegNo = X86::DR11; break;
- case '2': RegNo = X86::DR12; break;
- case '3': RegNo = X86::DR13; break;
- case '4': RegNo = X86::DR14; break;
- case '5': RegNo = X86::DR15; break;
- }
- }
-
- if (RegNo != 0) {
- EndLoc = Parser.getTok().getEndLoc();
- Parser.Lex(); // Eat it.
- return false;
- }
- }
-
if (RegNo == 0) {
+ OnFailure();
if (isParsingIntelSyntax()) return true;
return Error(StartLoc, "invalid register name",
SMRange(StartLoc, EndLoc));
@@ -1259,6 +1365,25 @@ bool X86AsmParser::ParseRegister(unsigned &RegNo,
return false;
}
+bool X86AsmParser::ParseRegister(unsigned &RegNo, SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ return ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/false);
+}
+
+OperandMatchResultTy X86AsmParser::tryParseRegister(unsigned &RegNo,
+ SMLoc &StartLoc,
+ SMLoc &EndLoc) {
+ bool Result =
+ ParseRegister(RegNo, StartLoc, EndLoc, /*RestoreOnFailure=*/true);
+ bool PendingErrors = getParser().hasPendingError();
+ getParser().clearPendingErrors();
+ if (PendingErrors)
+ return MatchOperand_ParseFail;
+ if (Result)
+ return MatchOperand_NoMatch;
+ return MatchOperand_Success;
+}
+
std::unique_ptr<X86Operand> X86AsmParser::DefaultMemSIOperand(SMLoc Loc) {
bool Parse32 = is32BitMode() || Code16GCC;
unsigned Basereg = is64BitMode() ? X86::RSI : (Parse32 ? X86::ESI : X86::SI);
@@ -1405,7 +1530,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseOperand() {
return ParseATTOperand();
}
-std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
+std::unique_ptr<X86Operand> X86AsmParser::CreateMemForMSInlineAsm(
unsigned SegReg, const MCExpr *Disp, unsigned BaseReg, unsigned IndexReg,
unsigned Scale, SMLoc Start, SMLoc End, unsigned Size, StringRef Identifier,
const InlineAsmIdentifierInfo &Info) {
@@ -1445,8 +1570,9 @@ std::unique_ptr<X86Operand> X86AsmParser::CreateMemForInlineAsm(
} else {
BaseReg = BaseReg ? BaseReg : 1;
return X86Operand::CreateMem(getPointerWidth(), SegReg, Disp, BaseReg,
- IndexReg, Scale, Start, End, Size, Identifier,
- Decl, FrontendSize);
+ IndexReg, Scale, Start, End, Size,
+ /*DefaultBaseReg=*/X86::RIP, Identifier, Decl,
+ FrontendSize);
}
}
@@ -1483,7 +1609,7 @@ bool X86AsmParser::ParseIntelNamedOperator(StringRef Name,
return true;
StringRef ErrMsg;
ParseError =
- SM.onOffset(Val, OffsetLoc, ID, Info, isParsingInlineAsm(), ErrMsg);
+ SM.onOffset(Val, OffsetLoc, ID, Info, isParsingMSInlineAsm(), ErrMsg);
if (ParseError)
return Error(SMLoc::getFromPointer(Name.data()), ErrMsg);
} else {
@@ -1525,12 +1651,51 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
SMLoc IdentLoc = Tok.getLoc();
StringRef Identifier = Tok.getString();
UpdateLocLex = false;
- // Register
+ // (MASM only) <TYPE> PTR operator
+ if (Parser.isParsingMasm()) {
+ const AsmToken &NextTok = getLexer().peekTok();
+ if (NextTok.is(AsmToken::Identifier) &&
+ NextTok.getIdentifier().equals_lower("ptr")) {
+ SM.onCast(Identifier);
+ // Eat type and PTR.
+ consumeToken();
+ End = consumeToken();
+ break;
+ }
+ }
+ // Register, or (MASM only) <register>.<field>
unsigned Reg;
- if (Tok.is(AsmToken::Identifier) && !ParseRegister(Reg, IdentLoc, End)) {
- if (SM.onRegister(Reg, ErrMsg))
- return Error(Tok.getLoc(), ErrMsg);
- break;
+ if (Tok.is(AsmToken::Identifier)) {
+ if (!ParseRegister(Reg, IdentLoc, End, /*RestoreOnFailure=*/true)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ break;
+ }
+ if (Parser.isParsingMasm()) {
+ const std::pair<StringRef, StringRef> IDField =
+ Tok.getString().split('.');
+ const StringRef ID = IDField.first, Field = IDField.second;
+ SMLoc IDEndLoc = SMLoc::getFromPointer(ID.data() + ID.size());
+ if (!Field.empty() &&
+ !MatchRegisterByName(Reg, ID, IdentLoc, IDEndLoc)) {
+ if (SM.onRegister(Reg, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+
+ StringRef Type;
+ unsigned Offset = 0;
+ SMLoc FieldStartLoc = SMLoc::getFromPointer(Field.data());
+ if (Parser.lookUpField(Field, Type, Offset))
+ return Error(FieldStartLoc, "unknown offset");
+ else if (SM.onPlus(ErrMsg))
+ return Error(getTok().getLoc(), ErrMsg);
+ else if (SM.onInteger(Offset, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ SM.setType(Type);
+
+ End = consumeToken();
+ break;
+ }
+ }
}
// Operator synonymous ("not", "or" etc.)
bool ParseError = false;
@@ -1542,37 +1707,40 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
// Symbol reference, when parsing assembly content
InlineAsmIdentifierInfo Info;
const MCExpr *Val;
- if (!isParsingInlineAsm()) {
- if (getParser().parsePrimaryExpr(Val, End)) {
- return Error(Tok.getLoc(), "Unexpected identifier!");
- } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
- return Error(IdentLoc, ErrMsg);
- } else
+ if (isParsingMSInlineAsm() || Parser.isParsingMasm()) {
+ // MS Dot Operator expression
+ if (Identifier.count('.') &&
+ (PrevTK == AsmToken::RBrac || PrevTK == AsmToken::RParen)) {
+ if (ParseIntelDotOperator(SM, End))
+ return true;
break;
+ }
}
- // MS InlineAsm operators (TYPE/LENGTH/SIZE)
- if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
- if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
- if (SM.onInteger(Val, ErrMsg))
- return Error(IdentLoc, ErrMsg);
- } else
- return true;
- break;
- }
- // MS Dot Operator expression
- if (Identifier.count('.') && PrevTK == AsmToken::RBrac) {
- if (ParseIntelDotOperator(SM, End))
+ if (isParsingMSInlineAsm()) {
+ // MS InlineAsm operators (TYPE/LENGTH/SIZE)
+ if (unsigned OpKind = IdentifyIntelInlineAsmOperator(Identifier)) {
+ if (int64_t Val = ParseIntelInlineAsmOperator(OpKind)) {
+ if (SM.onInteger(Val, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
+ } else
+ return true;
+ break;
+ }
+ // MS InlineAsm identifier
+ // Call parseIdentifier() to combine @ with the identifier behind it.
+ if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
+ return Error(IdentLoc, "expected identifier");
+ if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
return true;
+ else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+ return Error(IdentLoc, ErrMsg);
break;
}
- // MS InlineAsm identifier
- // Call parseIdentifier() to combine @ with the identifier behind it.
- if (TK == AsmToken::At && Parser.parseIdentifier(Identifier))
- return Error(IdentLoc, "expected identifier");
- if (ParseIntelInlineAsmIdentifier(Val, Identifier, Info, false, End))
- return true;
- else if (SM.onIdentifierExpr(Val, Identifier, Info, true, ErrMsg))
+ if (getParser().parsePrimaryExpr(Val, End)) {
+ return Error(Tok.getLoc(), "Unexpected identifier!");
+ } else if (SM.onIdentifierExpr(Val, Identifier, Info, false, ErrMsg)) {
return Error(IdentLoc, ErrMsg);
+ }
break;
}
case AsmToken::Integer: {
@@ -1593,8 +1761,8 @@ bool X86AsmParser::ParseIntelExpression(IntelExprStateMachine &SM, SMLoc &End) {
return Error(Loc, "invalid reference to undefined symbol");
StringRef Identifier = Sym->getName();
InlineAsmIdentifierInfo Info;
- if (SM.onIdentifierExpr(Val, Identifier, Info,
- isParsingInlineAsm(), ErrMsg))
+ if (SM.onIdentifierExpr(Val, Identifier, Info, isParsingMSInlineAsm(),
+ ErrMsg))
return Error(Loc, ErrMsg);
End = consumeToken();
} else {
@@ -1688,7 +1856,7 @@ bool X86AsmParser::ParseIntelInlineAsmIdentifier(
const MCExpr *&Val, StringRef &Identifier, InlineAsmIdentifierInfo &Info,
bool IsUnevaluatedOperand, SMLoc &End, bool IsParsingOffsetOperator) {
MCAsmParser &Parser = getParser();
- assert(isParsingInlineAsm() && "Expected to be parsing inline assembly.");
+ assert(isParsingMSInlineAsm() && "Expected to be parsing inline assembly.");
Val = nullptr;
StringRef LineBuf(Identifier.data());
@@ -1777,9 +1945,11 @@ X86AsmParser::ParseRoundingModeOp(SMLoc Start) {
}
/// Parse the '.' operator.
-bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End) {
+bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM,
+ SMLoc &End) {
const AsmToken &Tok = getTok();
- unsigned Offset;
+ StringRef Type;
+ unsigned Offset = 0;
// Drop the optional '.'.
StringRef DotDispStr = Tok.getString();
@@ -1791,10 +1961,15 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End)
APInt DotDisp;
DotDispStr.getAsInteger(10, DotDisp);
Offset = DotDisp.getZExtValue();
- } else if (isParsingInlineAsm() && Tok.is(AsmToken::Identifier)) {
- std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
- if (SemaCallback->LookupInlineAsmField(BaseMember.first, BaseMember.second,
- Offset))
+ } else if ((isParsingMSInlineAsm() || getParser().isParsingMasm()) &&
+ Tok.is(AsmToken::Identifier)) {
+ const std::pair<StringRef, StringRef> BaseMember = DotDispStr.split('.');
+ const StringRef Base = BaseMember.first, Member = BaseMember.second;
+ if (getParser().lookUpField(SM.getType(), DotDispStr, Type, Offset) &&
+ getParser().lookUpField(SM.getSymName(), DotDispStr, Type, Offset) &&
+ getParser().lookUpField(DotDispStr, Type, Offset) &&
+ (!SemaCallback ||
+ SemaCallback->LookupInlineAsmField(Base, Member, Offset)))
return Error(Tok.getLoc(), "Unable to lookup field reference!");
} else
return Error(Tok.getLoc(), "Unexpected token type!");
@@ -1805,6 +1980,7 @@ bool X86AsmParser::ParseIntelDotOperator(IntelExprStateMachine &SM, SMLoc &End)
while (Tok.getLoc().getPointer() < DotExprEndLoc)
Lex();
SM.addImm(Offset);
+ SM.setType(Type);
return false;
}
@@ -1816,7 +1992,7 @@ bool X86AsmParser::ParseIntelOffsetOperator(const MCExpr *&Val, StringRef &ID,
// Eat offset, mark start of identifier.
SMLoc Start = Lex().getLoc();
ID = getTok().getString();
- if (!isParsingInlineAsm()) {
+ if (!isParsingMSInlineAsm()) {
if ((getTok().isNot(AsmToken::Identifier) &&
getTok().isNot(AsmToken::String)) ||
getParser().parsePrimaryExpr(Val, End))
@@ -1939,7 +2115,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
if (ParseIntelExpression(SM, End))
return nullptr;
- if (isParsingInlineAsm())
+ if (isParsingMSInlineAsm())
RewriteIntelExpression(SM, Start, Tok.getLoc());
int64_t Imm = SM.getImm();
@@ -1953,7 +2129,7 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
// RegNo != 0 specifies a valid segment register,
// and we are parsing a segment override
if (!SM.isMemExpr() && !RegNo) {
- if (isParsingInlineAsm() && SM.isOffsetOperator()) {
+ if (isParsingMSInlineAsm() && SM.isOffsetOperator()) {
const InlineAsmIdentifierInfo Info = SM.getIdentifierInfo();
if (Info.isKind(InlineAsmIdentifierInfo::IK_Var)) {
// Disp includes the address of a variable; make sure this is recorded
@@ -2005,10 +2181,18 @@ std::unique_ptr<X86Operand> X86AsmParser::ParseIntelOperand() {
CheckBaseRegAndIndexRegAndScale(BaseReg, IndexReg, Scale, is64BitMode(),
ErrMsg))
return ErrorOperand(Start, ErrMsg);
- if (isParsingInlineAsm())
- return CreateMemForInlineAsm(RegNo, Disp, BaseReg, IndexReg,
- Scale, Start, End, Size, SM.getSymName(),
- SM.getIdentifierInfo());
+ if (isParsingMSInlineAsm())
+ return CreateMemForMSInlineAsm(RegNo, Disp, BaseReg, IndexReg, Scale, Start,
+ End, Size, SM.getSymName(),
+ SM.getIdentifierInfo());
+
+ // When parsing x64 MS-style assembly, all memory operands default to
+ // RIP-relative when interpreted as non-absolute references.
+ if (Parser.isParsingMasm() && is64BitMode())
+ return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp, BaseReg,
+ IndexReg, Scale, Start, End, Size,
+ /*DefaultBaseReg=*/X86::RIP);
+
if (!(BaseReg || IndexReg || RegNo))
return X86Operand::CreateMem(getPointerWidth(), Disp, Start, End, Size);
return X86Operand::CreateMem(getPointerWidth(), RegNo, Disp,
@@ -2420,8 +2604,8 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
return Error(Parser.getTok().getLoc(), "Expected '}'");
Parser.Lex(); // Eat curly.
- if (Prefix == "vex2")
- ForcedVEXEncoding = VEXEncoding_VEX2;
+ if (Prefix == "vex" || Prefix == "vex2")
+ ForcedVEXEncoding = VEXEncoding_VEX;
else if (Prefix == "vex3")
ForcedVEXEncoding = VEXEncoding_VEX3;
else if (Prefix == "evex")
@@ -2711,7 +2895,7 @@ bool X86AsmParser::ParseInstruction(ParseInstructionInfo &Info, StringRef Name,
// In MS inline asm curly braces mark the beginning/end of a block,
// therefore they should be interpreted as end of statement
CurlyAsEndOfStatement =
- isParsingIntelSyntax() && isParsingInlineAsm() &&
+ isParsingIntelSyntax() && isParsingMSInlineAsm() &&
(getLexer().is(AsmToken::LCurly) || getLexer().is(AsmToken::RCurly));
if (getLexer().isNot(AsmToken::EndOfStatement) && !CurlyAsEndOfStatement)
return TokError("unexpected token in argument list");
@@ -3096,9 +3280,122 @@ bool X86AsmParser::validateInstruction(MCInst &Inst, const OperandVector &Ops) {
static const char *getSubtargetFeatureName(uint64_t Val);
-void X86AsmParser::EmitInstruction(MCInst &Inst, OperandVector &Operands,
+void X86AsmParser::emitWarningForSpecialLVIInstruction(SMLoc Loc) {
+ Warning(Loc, "Instruction may be vulnerable to LVI and "
+ "requires manual mitigation");
+ Note(SMLoc(), "See https://software.intel.com/"
+ "security-software-guidance/insights/"
+ "deep-dive-load-value-injection#specialinstructions"
+ " for more information");
+}
+
+/// RET instructions, and also instructions that call or jump indirectly through memory,
+/// combine a load and a branch within a single instruction. To mitigate these
+/// instructions against LVI, they must be decomposed into separate load and
+/// branch instructions, with an LFENCE in between. For more details, see:
+/// - X86LoadValueInjectionRetHardening.cpp
+/// - X86LoadValueInjectionIndirectThunks.cpp
+/// - https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection
+///
+/// Applies the mitigation, or emits a warning when manual mitigation is required.
+void X86AsmParser::applyLVICFIMitigation(MCInst &Inst, MCStreamer &Out) {
+ // Information on control-flow instructions that require manual mitigation can
+ // be found here:
+ // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+ switch (Inst.getOpcode()) {
+ case X86::RETW:
+ case X86::RETL:
+ case X86::RETQ:
+ case X86::RETIL:
+ case X86::RETIQ:
+ case X86::RETIW: {
+ MCInst ShlInst, FenceInst;
+ bool Parse32 = is32BitMode() || Code16GCC;
+ unsigned Basereg =
+ is64BitMode() ? X86::RSP : (Parse32 ? X86::ESP : X86::SP);
+ const MCExpr *Disp = MCConstantExpr::create(0, getContext());
+ auto ShlMemOp = X86Operand::CreateMem(getPointerWidth(), /*SegReg=*/0, Disp,
+ /*BaseReg=*/Basereg, /*IndexReg=*/0,
+ /*Scale=*/1, SMLoc{}, SMLoc{}, 0);
+ ShlInst.setOpcode(X86::SHL64mi);
+ ShlMemOp->addMemOperands(ShlInst, 5);
+ ShlInst.addOperand(MCOperand::createImm(0));
+ FenceInst.setOpcode(X86::LFENCE);
+ Out.emitInstruction(ShlInst, getSTI());
+ Out.emitInstruction(FenceInst, getSTI());
+ return;
+ }
+ case X86::JMP16m:
+ case X86::JMP32m:
+ case X86::JMP64m:
+ case X86::CALL16m:
+ case X86::CALL32m:
+ case X86::CALL64m:
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+}
+
+/// To mitigate LVI, every instruction that performs a load can be followed by
+/// an LFENCE instruction to squash any potential mis-speculation. There are
+/// some instructions that require additional considerations, and may require
+/// manual mitigation. For more details, see:
+/// https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection
+///
+/// Applies the mitigation, or emits a warning when manual mitigation is required.
+void X86AsmParser::applyLVILoadHardeningMitigation(MCInst &Inst,
+ MCStreamer &Out) {
+ auto Opcode = Inst.getOpcode();
+ auto Flags = Inst.getFlags();
+ if ((Flags & X86::IP_HAS_REPEAT) || (Flags & X86::IP_HAS_REPEAT_NE)) {
+ // Information on REP string instructions that require manual mitigation can
+ // be found here:
+ // https://software.intel.com/security-software-guidance/insights/deep-dive-load-value-injection#specialinstructions
+ switch (Opcode) {
+ case X86::CMPSB:
+ case X86::CMPSW:
+ case X86::CMPSL:
+ case X86::CMPSQ:
+ case X86::SCASB:
+ case X86::SCASW:
+ case X86::SCASL:
+ case X86::SCASQ:
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+ } else if (Opcode == X86::REP_PREFIX || Opcode == X86::REPNE_PREFIX) {
+ // If a REP instruction is found on its own line, it may or may not be
+ // followed by a vulnerable instruction. Emit a warning just in case.
+ emitWarningForSpecialLVIInstruction(Inst.getLoc());
+ return;
+ }
+
+ const MCInstrDesc &MCID = MII.get(Inst.getOpcode());
+
+ // Can't mitigate after terminators or calls. A control flow change may have
+ // already occurred.
+ if (MCID.isTerminator() || MCID.isCall())
+ return;
+
+ // LFENCE has the mayLoad property, don't double fence.
+ if (MCID.mayLoad() && Inst.getOpcode() != X86::LFENCE) {
+ MCInst FenceInst;
+ FenceInst.setOpcode(X86::LFENCE);
+ Out.emitInstruction(FenceInst, getSTI());
+ }
+}
+
+void X86AsmParser::emitInstruction(MCInst &Inst, OperandVector &Operands,
MCStreamer &Out) {
- Out.EmitInstruction(Inst, getSTI());
+ if (LVIInlineAsmHardening &&
+ getSTI().getFeatureBits()[X86::FeatureLVIControlFlowIntegrity])
+ applyLVICFIMitigation(Inst, Out);
+
+ Out.emitInstruction(Inst, getSTI());
+
+ if (LVIInlineAsmHardening &&
+ getSTI().getFeatureBits()[X86::FeatureLVILoadHardening])
+ applyLVILoadHardeningMitigation(Inst, Out);
}
bool X86AsmParser::MatchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode,
@@ -3133,7 +3430,7 @@ void X86AsmParser::MatchFPUWaitAlias(SMLoc IDLoc, X86Operand &Op,
Inst.setOpcode(X86::WAIT);
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Operands[0] = X86Operand::CreateToken(Repl, IDLoc);
}
}
@@ -3170,7 +3467,7 @@ unsigned X86AsmParser::checkTargetMatchPredicate(MCInst &Inst) {
(MCID.TSFlags & X86II::EncodingMask) != X86II::EVEX)
return Match_Unsupported;
- if ((ForcedVEXEncoding == VEXEncoding_VEX2 ||
+ if ((ForcedVEXEncoding == VEXEncoding_VEX ||
ForcedVEXEncoding == VEXEncoding_VEX3) &&
(MCID.TSFlags & X86II::EncodingMask) != X86II::VEX)
return Match_Unsupported;
@@ -3240,7 +3537,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
case Match_InvalidImmUnsignedi4: {
@@ -3282,20 +3579,47 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
// Otherwise, we assume that this may be an integer instruction, which comes
// in 8/16/32/64-bit forms using the b,w,l,q suffixes respectively.
const char *Suffixes = Base[0] != 'f' ? "bwlq" : "slt\0";
+ // MemSize corresponding to Suffixes. { 8, 16, 32, 64 } { 32, 64, 80, 0 }
+ const char *MemSize = Base[0] != 'f' ? "\x08\x10\x20\x40" : "\x20\x40\x50\0";
// Check for the various suffix matches.
uint64_t ErrorInfoIgnore;
FeatureBitset ErrorInfoMissingFeatures; // Init suppresses compiler warnings.
unsigned Match[4];
+ // Some instructions, like VPMULDQ, are NOT suffixed variants of another
+ // instruction (there is no VPMULD), so make sure the suffix matcher only
+ // accepts a memory variant whose size matches the suffix.
+ // FIXME: This flag is a workaround for legacy instructions that didn't
+ // declare a non-suffixed assembly variant.
+ bool HasVectorReg = false;
+ X86Operand *MemOp = nullptr;
+ for (const auto &Op : Operands) {
+ X86Operand *X86Op = static_cast<X86Operand *>(Op.get());
+ if (X86Op->isVectorReg())
+ HasVectorReg = true;
+ else if (X86Op->isMem()) {
+ MemOp = X86Op;
+ assert(MemOp->Mem.Size == 0 && "Memory size always 0 under ATT syntax");
+ // We have found an unqualified memory operand; break,
+ // since IA allows only one memory operand.
+ break;
+ }
+ }
+
for (unsigned I = 0, E = array_lengthof(Match); I != E; ++I) {
Tmp.back() = Suffixes[I];
- Match[I] = MatchInstruction(Operands, Inst, ErrorInfoIgnore,
- MissingFeatures, MatchingInlineAsm,
- isParsingIntelSyntax());
- // If this returned as a missing feature failure, remember that.
- if (Match[I] == Match_MissingFeature)
- ErrorInfoMissingFeatures = MissingFeatures;
+ if (MemOp && HasVectorReg)
+ MemOp->Mem.Size = MemSize[I];
+ Match[I] = Match_MnemonicFail;
+ if (MemOp || !HasVectorReg) {
+ Match[I] =
+ MatchInstruction(Operands, Inst, ErrorInfoIgnore, MissingFeatures,
+ MatchingInlineAsm, isParsingIntelSyntax());
+ // If this returned as a missing feature failure, remember that.
+ if (Match[I] == Match_MissingFeature)
+ ErrorInfoMissingFeatures = MissingFeatures;
+ }
}
// Restore the old token.
@@ -3309,7 +3633,7 @@ bool X86AsmParser::MatchAndEmitATTInstruction(SMLoc IDLoc, unsigned &Opcode,
if (NumSuccessfulMatches == 1) {
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
}
@@ -3562,7 +3886,7 @@ bool X86AsmParser::MatchAndEmitIntelInstruction(SMLoc IDLoc, unsigned &Opcode,
;
Inst.setLoc(IDLoc);
if (!MatchingInlineAsm)
- EmitInstruction(Inst, Operands, Out);
+ emitInstruction(Inst, Operands, Out);
Opcode = Inst.getOpcode();
return false;
} else if (NumSuccessfulMatches > 1) {
@@ -3684,9 +4008,9 @@ bool X86AsmParser::parseDirectiveEven(SMLoc L) {
Section = getStreamer().getCurrentSectionOnly();
}
if (Section->UseCodeAlign())
- getStreamer().EmitCodeAlignment(2, 0);
+ getStreamer().emitCodeAlignment(2, 0);
else
- getStreamer().EmitValueToAlignment(2, 0, 1, 0);
+ getStreamer().emitValueToAlignment(2, 0, 1, 0);
return false;
}
@@ -3699,7 +4023,7 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
Parser.Lex();
if (!is16BitMode()) {
SwitchMode(X86::Mode16Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
}
} else if (IDVal == ".code16gcc") {
// .code16gcc parses as if in 32-bit mode, but emits code in 16-bit mode.
@@ -3707,19 +4031,19 @@ bool X86AsmParser::ParseDirectiveCode(StringRef IDVal, SMLoc L) {
Code16GCC = true;
if (!is16BitMode()) {
SwitchMode(X86::Mode16Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code16);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code16);
}
} else if (IDVal == ".code32") {
Parser.Lex();
if (!is32BitMode()) {
SwitchMode(X86::Mode32Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code32);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code32);
}
} else if (IDVal == ".code64") {
Parser.Lex();
if (!is64BitMode()) {
SwitchMode(X86::Mode64Bit);
- getParser().getStreamer().EmitAssemblerFlag(MCAF_Code64);
+ getParser().getStreamer().emitAssemblerFlag(MCAF_Code64);
}
} else {
Error(L, "unknown directive " + IDVal);
diff --git a/llvm/lib/Target/X86/AsmParser/X86Operand.h b/llvm/lib/Target/X86/AsmParser/X86Operand.h
index d831a63b04ee..5cf4516ede97 100644
--- a/llvm/lib/Target/X86/AsmParser/X86Operand.h
+++ b/llvm/lib/Target/X86/AsmParser/X86Operand.h
@@ -17,9 +17,7 @@
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCParser/MCParsedAsmOperand.h"
-#include "llvm/MC/MCRegisterInfo.h"
#include "llvm/Support/Casting.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/SMLoc.h"
#include <cassert>
#include <memory>
@@ -60,6 +58,7 @@ struct X86Operand final : public MCParsedAsmOperand {
unsigned SegReg;
const MCExpr *Disp;
unsigned BaseReg;
+ unsigned DefaultBaseReg;
unsigned IndexReg;
unsigned Scale;
unsigned Size;
@@ -184,6 +183,10 @@ struct X86Operand final : public MCParsedAsmOperand {
assert(Kind == Memory && "Invalid access!");
return Mem.BaseReg;
}
+ unsigned getMemDefaultBaseReg() const {
+ assert(Kind == Memory && "Invalid access!");
+ return Mem.DefaultBaseReg;
+ }
unsigned getMemIndexReg() const {
assert(Kind == Memory && "Invalid access!");
return Mem.IndexReg;
@@ -312,6 +315,11 @@ struct X86Operand final : public MCParsedAsmOperand {
bool isMem512() const {
return Kind == Memory && (!Mem.Size || Mem.Size == 512);
}
+
+ bool isSibMem() const {
+ return isMem() && Mem.BaseReg != X86::RIP && Mem.BaseReg != X86::EIP;
+ }
+
bool isMemIndexReg(unsigned LowR, unsigned HighR) const {
assert(Kind == Memory && "Invalid access!");
return Mem.IndexReg >= LowR && Mem.IndexReg <= HighR;
@@ -458,6 +466,14 @@ struct X86Operand final : public MCParsedAsmOperand {
X86MCRegisterClasses[X86::GR64RegClassID].contains(getReg()));
}
+ bool isVectorReg() const {
+ return Kind == Register &&
+ (X86MCRegisterClasses[X86::VR64RegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR128XRegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR256XRegClassID].contains(getReg()) ||
+ X86MCRegisterClasses[X86::VR512RegClassID].contains(getReg()));
+ }
+
bool isVK1Pair() const {
return Kind == Register &&
X86MCRegisterClasses[X86::VK1RegClassID].contains(getReg());
@@ -540,7 +556,10 @@ struct X86Operand final : public MCParsedAsmOperand {
void addMemOperands(MCInst &Inst, unsigned N) const {
assert((N == 5) && "Invalid number of operands!");
- Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ if (getMemBaseReg())
+ Inst.addOperand(MCOperand::createReg(getMemBaseReg()));
+ else
+ Inst.addOperand(MCOperand::createReg(getMemDefaultBaseReg()));
Inst.addOperand(MCOperand::createImm(getMemScale()));
Inst.addOperand(MCOperand::createReg(getMemIndexReg()));
addExpr(Inst, getMemDisp());
@@ -633,6 +652,7 @@ struct X86Operand final : public MCParsedAsmOperand {
Res->Mem.SegReg = 0;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = 0;
+ Res->Mem.DefaultBaseReg = 0;
Res->Mem.IndexReg = 0;
Res->Mem.Scale = 1;
Res->Mem.Size = Size;
@@ -648,11 +668,14 @@ struct X86Operand final : public MCParsedAsmOperand {
static std::unique_ptr<X86Operand>
CreateMem(unsigned ModeSize, unsigned SegReg, const MCExpr *Disp,
unsigned BaseReg, unsigned IndexReg, unsigned Scale, SMLoc StartLoc,
- SMLoc EndLoc, unsigned Size = 0, StringRef SymName = StringRef(),
- void *OpDecl = nullptr, unsigned FrontendSize = 0) {
+ SMLoc EndLoc, unsigned Size = 0,
+ unsigned DefaultBaseReg = X86::NoRegister,
+ StringRef SymName = StringRef(), void *OpDecl = nullptr,
+ unsigned FrontendSize = 0) {
// We should never just have a displacement, that should be parsed as an
// absolute memory operand.
- assert((SegReg || BaseReg || IndexReg) && "Invalid memory operand!");
+ assert((SegReg || BaseReg || IndexReg || DefaultBaseReg) &&
+ "Invalid memory operand!");
// The scale should always be one of {1,2,4,8}.
assert(((Scale == 1 || Scale == 2 || Scale == 4 || Scale == 8)) &&
@@ -661,6 +684,7 @@ struct X86Operand final : public MCParsedAsmOperand {
Res->Mem.SegReg = SegReg;
Res->Mem.Disp = Disp;
Res->Mem.BaseReg = BaseReg;
+ Res->Mem.DefaultBaseReg = DefaultBaseReg;
Res->Mem.IndexReg = IndexReg;
Res->Mem.Scale = Scale;
Res->Mem.Size = Size;
diff --git a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
index ea8c606d1564..a7fa1eb9a5ee 100644
--- a/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
+++ b/llvm/lib/Target/X86/Disassembler/X86Disassembler.cpp
@@ -776,6 +776,10 @@ static int readModRM(struct InternalInstruction *insn) {
return prefix##_YMM0 + index; \
case TYPE_XMM: \
return prefix##_XMM0 + index; \
+ case TYPE_TMM: \
+ if (index > 7) \
+ *valid = 0; \
+ return prefix##_TMM0 + index; \
case TYPE_VK: \
index &= 0xf; \
if (index > 7) \
@@ -849,6 +853,7 @@ static int fixupReg(struct InternalInstruction *insn,
if (!valid)
return -1;
break;
+ case ENCODING_SIB:
CASE_ENCODING_RM:
if (insn->eaBase >= insn->eaRegBase) {
insn->eaBase = (EABase)fixupRMValue(
@@ -1533,6 +1538,15 @@ static int readOperands(struct InternalInstruction *insn) {
if (Op.encoding != ENCODING_REG && insn->eaDisplacement == EA_DISP_8)
insn->displacement *= 1 << (Op.encoding - ENCODING_VSIB);
break;
+ case ENCODING_SIB:
+ // Reject if SIB wasn't used.
+ if (insn->eaBase != EA_BASE_sib && insn->eaBase != EA_BASE_sib64)
+ return -1;
+ if (readModRM(insn))
+ return -1;
+ if (fixupReg(insn, &Op))
+ return -1;
+ break;
case ENCODING_REG:
CASE_ENCODING_RM:
if (readModRM(insn))
@@ -2006,9 +2020,11 @@ static bool translateRMRegister(MCInst &mcInst,
/// @param mcInst - The MCInst to append to.
/// @param insn - The instruction to extract Mod, R/M, and SIB fields
/// from.
+/// @param ForceSIB - The instruction must use SIB.
/// @return - 0 on success; nonzero otherwise
static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
- const MCDisassembler *Dis) {
+ const MCDisassembler *Dis,
+ bool ForceSIB = false) {
// Addresses in an MCInst are represented as five operands:
// 1. basereg (register) The R/M base, or (if there is a SIB) the
// SIB base
@@ -2067,11 +2083,12 @@ static bool translateRMMemory(MCInst &mcInst, InternalInstruction &insn,
// -Any base register used other than ESP/RSP/R12D/R12. Using these as a
// base always requires a SIB byte.
// -A scale other than 1 is used.
- if (insn.sibScale != 1 ||
- (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
- (insn.sibBase != SIB_BASE_NONE &&
- insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
- insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12)) {
+ if (!ForceSIB &&
+ (insn.sibScale != 1 ||
+ (insn.sibBase == SIB_BASE_NONE && insn.mode != MODE_64BIT) ||
+ (insn.sibBase != SIB_BASE_NONE &&
+ insn.sibBase != SIB_BASE_ESP && insn.sibBase != SIB_BASE_RSP &&
+ insn.sibBase != SIB_BASE_R12D && insn.sibBase != SIB_BASE_R12))) {
indexReg = MCOperand::createReg(insn.addressSize == 4 ? X86::EIZ :
X86::RIZ);
} else
@@ -2182,6 +2199,7 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_XMM:
case TYPE_YMM:
case TYPE_ZMM:
+ case TYPE_TMM:
case TYPE_VK_PAIR:
case TYPE_VK:
case TYPE_DEBUGREG:
@@ -2193,6 +2211,8 @@ static bool translateRM(MCInst &mcInst, const OperandSpecifier &operand,
case TYPE_MVSIBY:
case TYPE_MVSIBZ:
return translateRMMemory(mcInst, insn, Dis);
+ case TYPE_MSIB:
+ return translateRMMemory(mcInst, insn, Dis, true);
}
}
@@ -2242,6 +2262,7 @@ static bool translateOperand(MCInst &mcInst, const OperandSpecifier &operand,
return false;
case ENCODING_WRITEMASK:
return translateMaskRegister(mcInst, insn.writemask);
+ case ENCODING_SIB:
CASE_ENCODING_RM:
CASE_ENCODING_VSIB:
return translateRM(mcInst, operand, insn, Dis);
diff --git a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
index 147fe46d81b9..4318c17f03a0 100644
--- a/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
+++ b/llvm/lib/Target/X86/Disassembler/X86DisassemblerDecoder.h
@@ -19,9 +19,6 @@
#include "llvm/Support/X86DisassemblerDecoderCommon.h"
namespace llvm {
-
-class MCInstrInfo;
-
namespace X86Disassembler {
// Accessor functions for various fields of an Intel instruction
@@ -383,6 +380,17 @@ namespace X86Disassembler {
ENTRY(BND2) \
ENTRY(BND3)
+#undef REGS_TMM
+#define REGS_TMM \
+ ENTRY(TMM0) \
+ ENTRY(TMM1) \
+ ENTRY(TMM2) \
+ ENTRY(TMM3) \
+ ENTRY(TMM4) \
+ ENTRY(TMM5) \
+ ENTRY(TMM6) \
+ ENTRY(TMM7)
+
#define ALL_EA_BASES \
EA_BASES_16BIT \
EA_BASES_32BIT \
@@ -407,6 +415,7 @@ namespace X86Disassembler {
REGS_DEBUG \
REGS_CONTROL \
REGS_BOUND \
+ REGS_TMM \
ENTRY(RIP)
/// All possible values of the base field for effective-address
diff --git a/llvm/lib/Target/X86/ImmutableGraph.h b/llvm/lib/Target/X86/ImmutableGraph.h
new file mode 100644
index 000000000000..56738e9cfa73
--- /dev/null
+++ b/llvm/lib/Target/X86/ImmutableGraph.h
@@ -0,0 +1,445 @@
+//==========-- ImmutableGraph.h - A fast DAG implementation ---------=========//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: ImmutableGraph is a fast DAG implementation that cannot be
+/// modified, except by creating a new ImmutableGraph. ImmutableGraph is
+/// implemented as two arrays: one containing nodes, and one containing edges.
+/// The advantages to this implementation are two-fold:
+/// 1. Iteration and traversal operations benefit from cache locality.
+/// 2. Operations on sets of nodes/edges are efficient, and representations of
+/// those sets in memory are compact. For instance, a set of edges is
+/// implemented as a bit vector, wherein each bit corresponds to one edge in
+/// the edge array. This implies a lower bound of 64x spatial improvement
+/// over, e.g., an llvm::DenseSet or llvm::SmallSet. It also means that
+/// insert/erase/contains operations complete in negligible constant time:
+/// insert and erase require one load and one store, and contains requires
+/// just one load.
+///
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
+#define LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
+
+#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/GraphTraits.h"
+#include "llvm/ADT/STLExtras.h"
+#include <algorithm>
+#include <iterator>
+#include <utility>
+#include <vector>
+
+namespace llvm {
+
+template <typename NodeValueT, typename EdgeValueT> class ImmutableGraph {
+ using Traits = GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *>;
+ template <typename> friend class ImmutableGraphBuilder;
+
+public:
+ using node_value_type = NodeValueT;
+ using edge_value_type = EdgeValueT;
+ using size_type = int;
+ class Node;
+ class Edge {
+ friend class ImmutableGraph;
+ template <typename> friend class ImmutableGraphBuilder;
+
+ const Node *Dest;
+ edge_value_type Value;
+
+ public:
+ const Node *getDest() const { return Dest; }
+ const edge_value_type &getValue() const { return Value; }
+ };
+ class Node {
+ friend class ImmutableGraph;
+ template <typename> friend class ImmutableGraphBuilder;
+
+ const Edge *Edges;
+ node_value_type Value;
+
+ public:
+ const node_value_type &getValue() const { return Value; }
+
+ const Edge *edges_begin() const { return Edges; }
+ // Nodes are allocated sequentially. Edges for a node are stored together.
+ // The end of this Node's edges is the beginning of the next node's edges.
+ // An extra node was allocated to hold the end pointer for the last real
+ // node.
+ const Edge *edges_end() const { return (this + 1)->Edges; }
+ ArrayRef<Edge> edges() const {
+ return makeArrayRef(edges_begin(), edges_end());
+ }
+ };
+
+protected:
+ ImmutableGraph(std::unique_ptr<Node[]> Nodes, std::unique_ptr<Edge[]> Edges,
+ size_type NodesSize, size_type EdgesSize)
+ : Nodes(std::move(Nodes)), Edges(std::move(Edges)), NodesSize(NodesSize),
+ EdgesSize(EdgesSize) {}
+ ImmutableGraph(const ImmutableGraph &) = delete;
+ ImmutableGraph(ImmutableGraph &&) = delete;
+ ImmutableGraph &operator=(const ImmutableGraph &) = delete;
+ ImmutableGraph &operator=(ImmutableGraph &&) = delete;
+
+public:
+ ArrayRef<Node> nodes() const { return makeArrayRef(Nodes.get(), NodesSize); }
+ const Node *nodes_begin() const { return nodes().begin(); }
+ const Node *nodes_end() const { return nodes().end(); }
+
+ ArrayRef<Edge> edges() const { return makeArrayRef(Edges.get(), EdgesSize); }
+ const Edge *edges_begin() const { return edges().begin(); }
+ const Edge *edges_end() const { return edges().end(); }
+
+ size_type nodes_size() const { return NodesSize; }
+ size_type edges_size() const { return EdgesSize; }
+
+ // Node N must belong to this ImmutableGraph.
+ size_type getNodeIndex(const Node &N) const {
+ return std::distance(nodes_begin(), &N);
+ }
+ // Edge E must belong to this ImmutableGraph.
+ size_type getEdgeIndex(const Edge &E) const {
+ return std::distance(edges_begin(), &E);
+ }
+
+ // FIXME: Could NodeSet and EdgeSet be templated to share code?
+ class NodeSet {
+ const ImmutableGraph &G;
+ BitVector V;
+
+ public:
+ NodeSet(const ImmutableGraph &G, bool ContainsAll = false)
+ : G{G}, V{static_cast<unsigned>(G.nodes_size()), ContainsAll} {}
+ bool insert(const Node &N) {
+ size_type Idx = G.getNodeIndex(N);
+ bool AlreadyExists = V.test(Idx);
+ V.set(Idx);
+ return !AlreadyExists;
+ }
+ void erase(const Node &N) {
+ size_type Idx = G.getNodeIndex(N);
+ V.reset(Idx);
+ }
+ bool contains(const Node &N) const {
+ size_type Idx = G.getNodeIndex(N);
+ return V.test(Idx);
+ }
+ void clear() { V.reset(); }
+    bool empty() const { return V.none(); }
+ /// Return the number of elements in the set
+ size_type count() const { return V.count(); }
+ /// Return the size of the set's domain
+ size_type size() const { return V.size(); }
+ /// Set union
+ NodeSet &operator|=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V |= RHS.V;
+ return *this;
+ }
+ /// Set intersection
+ NodeSet &operator&=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V &= RHS.V;
+ return *this;
+ }
+    /// Set symmetric difference
+ NodeSet &operator^=(const NodeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V ^= RHS.V;
+ return *this;
+ }
+
+ using index_iterator = typename BitVector::const_set_bits_iterator;
+ index_iterator index_begin() const { return V.set_bits_begin(); }
+ index_iterator index_end() const { return V.set_bits_end(); }
+ void set(size_type Idx) { V.set(Idx); }
+ void reset(size_type Idx) { V.reset(Idx); }
+
+ class iterator {
+ const NodeSet &Set;
+ size_type Current;
+
+ void advance() {
+ assert(Current != -1);
+ Current = Set.V.find_next(Current);
+ }
+
+ public:
+ iterator(const NodeSet &Set, size_type Begin)
+ : Set{Set}, Current{Begin} {}
+ iterator operator++(int) {
+ iterator Tmp = *this;
+ advance();
+ return Tmp;
+ }
+ iterator &operator++() {
+ advance();
+ return *this;
+ }
+      const Node *operator*() const {
+        assert(Current != -1);
+        return Set.G.nodes_begin() + Current;
+      }
+ bool operator==(const iterator &other) const {
+ assert(&this->Set == &other.Set);
+ return this->Current == other.Current;
+ }
+ bool operator!=(const iterator &other) const { return !(*this == other); }
+ };
+
+ iterator begin() const { return iterator{*this, V.find_first()}; }
+ iterator end() const { return iterator{*this, -1}; }
+ };
+
+ class EdgeSet {
+ const ImmutableGraph &G;
+ BitVector V;
+
+ public:
+ EdgeSet(const ImmutableGraph &G, bool ContainsAll = false)
+ : G{G}, V{static_cast<unsigned>(G.edges_size()), ContainsAll} {}
+ bool insert(const Edge &E) {
+ size_type Idx = G.getEdgeIndex(E);
+ bool AlreadyExists = V.test(Idx);
+ V.set(Idx);
+ return !AlreadyExists;
+ }
+ void erase(const Edge &E) {
+ size_type Idx = G.getEdgeIndex(E);
+ V.reset(Idx);
+ }
+ bool contains(const Edge &E) const {
+ size_type Idx = G.getEdgeIndex(E);
+ return V.test(Idx);
+ }
+ void clear() { V.reset(); }
+ bool empty() const { return V.none(); }
+ /// Return the number of elements in the set
+ size_type count() const { return V.count(); }
+ /// Return the size of the set's domain
+ size_type size() const { return V.size(); }
+ /// Set union
+ EdgeSet &operator|=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V |= RHS.V;
+ return *this;
+ }
+ /// Set intersection
+ EdgeSet &operator&=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V &= RHS.V;
+ return *this;
+ }
+    /// Set symmetric difference
+ EdgeSet &operator^=(const EdgeSet &RHS) {
+ assert(&this->G == &RHS.G);
+ V ^= RHS.V;
+ return *this;
+ }
+
+ using index_iterator = typename BitVector::const_set_bits_iterator;
+ index_iterator index_begin() const { return V.set_bits_begin(); }
+ index_iterator index_end() const { return V.set_bits_end(); }
+ void set(size_type Idx) { V.set(Idx); }
+ void reset(size_type Idx) { V.reset(Idx); }
+
+ class iterator {
+ const EdgeSet &Set;
+ size_type Current;
+
+ void advance() {
+ assert(Current != -1);
+ Current = Set.V.find_next(Current);
+ }
+
+ public:
+ iterator(const EdgeSet &Set, size_type Begin)
+ : Set{Set}, Current{Begin} {}
+ iterator operator++(int) {
+ iterator Tmp = *this;
+ advance();
+ return Tmp;
+ }
+ iterator &operator++() {
+ advance();
+ return *this;
+ }
+      const Edge *operator*() const {
+        assert(Current != -1);
+        return Set.G.edges_begin() + Current;
+      }
+ bool operator==(const iterator &other) const {
+ assert(&this->Set == &other.Set);
+ return this->Current == other.Current;
+ }
+ bool operator!=(const iterator &other) const { return !(*this == other); }
+ };
+
+ iterator begin() const { return iterator{*this, V.find_first()}; }
+ iterator end() const { return iterator{*this, -1}; }
+ };
+
+private:
+ std::unique_ptr<Node[]> Nodes;
+ std::unique_ptr<Edge[]> Edges;
+ size_type NodesSize;
+ size_type EdgesSize;
+};
+
+template <typename GraphT> class ImmutableGraphBuilder {
+ using node_value_type = typename GraphT::node_value_type;
+ using edge_value_type = typename GraphT::edge_value_type;
+ static_assert(
+ std::is_base_of<ImmutableGraph<node_value_type, edge_value_type>,
+ GraphT>::value,
+ "Template argument to ImmutableGraphBuilder must derive from "
+ "ImmutableGraph<>");
+ using size_type = typename GraphT::size_type;
+ using NodeSet = typename GraphT::NodeSet;
+ using Node = typename GraphT::Node;
+ using EdgeSet = typename GraphT::EdgeSet;
+ using Edge = typename GraphT::Edge;
+ using BuilderEdge = std::pair<edge_value_type, size_type>;
+ using EdgeList = std::vector<BuilderEdge>;
+ using BuilderVertex = std::pair<node_value_type, EdgeList>;
+ using VertexVec = std::vector<BuilderVertex>;
+
+public:
+ using BuilderNodeRef = size_type;
+
+ BuilderNodeRef addVertex(const node_value_type &V) {
+ auto I = AdjList.emplace(AdjList.end(), V, EdgeList{});
+ return std::distance(AdjList.begin(), I);
+ }
+
+ void addEdge(const edge_value_type &E, BuilderNodeRef From,
+ BuilderNodeRef To) {
+ AdjList[From].second.emplace_back(E, To);
+ }
+
+ bool empty() const { return AdjList.empty(); }
+
+ template <typename... ArgT> std::unique_ptr<GraphT> get(ArgT &&... Args) {
+ size_type VertexSize = AdjList.size(), EdgeSize = 0;
+ for (const auto &V : AdjList) {
+ EdgeSize += V.second.size();
+ }
+ auto VertexArray =
+ std::make_unique<Node[]>(VertexSize + 1 /* terminator node */);
+ auto EdgeArray = std::make_unique<Edge[]>(EdgeSize);
+ size_type VI = 0, EI = 0;
+ for (; VI < VertexSize; ++VI) {
+ VertexArray[VI].Value = std::move(AdjList[VI].first);
+ VertexArray[VI].Edges = &EdgeArray[EI];
+ auto NumEdges = static_cast<size_type>(AdjList[VI].second.size());
+ for (size_type VEI = 0; VEI < NumEdges; ++VEI, ++EI) {
+ auto &E = AdjList[VI].second[VEI];
+ EdgeArray[EI].Value = std::move(E.first);
+ EdgeArray[EI].Dest = &VertexArray[E.second];
+ }
+ }
+ assert(VI == VertexSize && EI == EdgeSize && "ImmutableGraph malformed");
+ VertexArray[VI].Edges = &EdgeArray[EdgeSize]; // terminator node
+ return std::make_unique<GraphT>(std::move(VertexArray),
+ std::move(EdgeArray), VertexSize, EdgeSize,
+ std::forward<ArgT>(Args)...);
+ }
+
+ template <typename... ArgT>
+ static std::unique_ptr<GraphT> trim(const GraphT &G, const NodeSet &TrimNodes,
+ const EdgeSet &TrimEdges,
+ ArgT &&... Args) {
+ size_type NewVertexSize = G.nodes_size() - TrimNodes.count();
+ size_type NewEdgeSize = G.edges_size() - TrimEdges.count();
+ auto NewVertexArray =
+ std::make_unique<Node[]>(NewVertexSize + 1 /* terminator node */);
+ auto NewEdgeArray = std::make_unique<Edge[]>(NewEdgeSize);
+
+ // Walk the nodes and determine the new index for each node.
+ size_type NewNodeIndex = 0;
+ std::vector<size_type> RemappedNodeIndex(G.nodes_size());
+ for (const Node &N : G.nodes()) {
+ if (TrimNodes.contains(N))
+ continue;
+ RemappedNodeIndex[G.getNodeIndex(N)] = NewNodeIndex++;
+ }
+ assert(NewNodeIndex == NewVertexSize &&
+ "Should have assigned NewVertexSize indices");
+
+ size_type VertexI = 0, EdgeI = 0;
+ for (const Node &N : G.nodes()) {
+ if (TrimNodes.contains(N))
+ continue;
+ NewVertexArray[VertexI].Value = N.getValue();
+ NewVertexArray[VertexI].Edges = &NewEdgeArray[EdgeI];
+ for (const Edge &E : N.edges()) {
+ if (TrimEdges.contains(E))
+ continue;
+ NewEdgeArray[EdgeI].Value = E.getValue();
+ size_type DestIdx = G.getNodeIndex(*E.getDest());
+ size_type NewIdx = RemappedNodeIndex[DestIdx];
+ assert(NewIdx < NewVertexSize);
+ NewEdgeArray[EdgeI].Dest = &NewVertexArray[NewIdx];
+ ++EdgeI;
+ }
+ ++VertexI;
+ }
+ assert(VertexI == NewVertexSize && EdgeI == NewEdgeSize &&
+           "Trimmed ImmutableGraph malformed");
+ NewVertexArray[VertexI].Edges = &NewEdgeArray[NewEdgeSize]; // terminator
+ return std::make_unique<GraphT>(std::move(NewVertexArray),
+ std::move(NewEdgeArray), NewVertexSize,
+ NewEdgeSize, std::forward<ArgT>(Args)...);
+ }
+
+private:
+ VertexVec AdjList;
+};
+
+template <typename NodeValueT, typename EdgeValueT>
+struct GraphTraits<ImmutableGraph<NodeValueT, EdgeValueT> *> {
+ using GraphT = ImmutableGraph<NodeValueT, EdgeValueT>;
+ using NodeRef = typename GraphT::Node const *;
+ using EdgeRef = typename GraphT::Edge const &;
+
+ static NodeRef edge_dest(EdgeRef E) { return E.getDest(); }
+ using ChildIteratorType =
+ mapped_iterator<typename GraphT::Edge const *, decltype(&edge_dest)>;
+
+ static NodeRef getEntryNode(GraphT *G) { return G->nodes_begin(); }
+ static ChildIteratorType child_begin(NodeRef N) {
+ return {N->edges_begin(), &edge_dest};
+ }
+ static ChildIteratorType child_end(NodeRef N) {
+ return {N->edges_end(), &edge_dest};
+ }
+
+ static NodeRef getNode(typename GraphT::Node const &N) { return NodeRef{&N}; }
+ using nodes_iterator =
+ mapped_iterator<typename GraphT::Node const *, decltype(&getNode)>;
+ static nodes_iterator nodes_begin(GraphT *G) {
+ return {G->nodes_begin(), &getNode};
+ }
+ static nodes_iterator nodes_end(GraphT *G) {
+ return {G->nodes_end(), &getNode};
+ }
+
+ using ChildEdgeIteratorType = typename GraphT::Edge const *;
+
+ static ChildEdgeIteratorType child_edge_begin(NodeRef N) {
+ return N->edges_begin();
+ }
+ static ChildEdgeIteratorType child_edge_end(NodeRef N) {
+ return N->edges_end();
+ }
+ static typename GraphT::size_type size(GraphT *G) { return G->nodes_size(); }
+};
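+
+// Illustrative note (an assumption about intended use, not a requirement of
+// this header): the GraphTraits specialization above lets LLVM's generic
+// graph utilities traverse an ImmutableGraph, e.g. with
+// "llvm/ADT/DepthFirstIterator.h":
+//
+//   for (const auto *N : depth_first(GraphPtr)) // GraphPtr: ImmutableGraph<...> *
+//     use(N->getValue());                       // use() is a hypothetical callback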
+
+} // end namespace llvm
+
+#endif // LLVM_LIB_TARGET_X86_IMMUTABLEGRAPH_H
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
index 675a9c377b12..0134b4efce72 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.cpp
@@ -56,7 +56,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (MI->getOpcode() == X86::CALLpcrel32 &&
(STI.getFeatureBits()[X86::Mode64Bit])) {
OS << "\tcallq\t";
- printPCRelImm(MI, 0, OS);
+ printPCRelImm(MI, Address, 0, OS);
}
// data16 and data32 both have the same encoding of 0x66. While data32 is
// valid only in 16 bit systems, data16 is valid in the rest.
@@ -68,8 +68,7 @@ void X86ATTInstPrinter::printInst(const MCInst *MI, uint64_t Address,
OS << "\tdata32";
}
// Try to print any aliases first.
- else if (!printAliasInstr(MI, OS) &&
- !printVecCompareInstr(MI, OS))
+ else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
printInstruction(MI, Address, OS);
// Next always print the annotation.
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
index 3d5d384dc4a0..51ddae61d251 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ATTInstPrinter.h
@@ -30,9 +30,10 @@ public:
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &OS);
@@ -46,13 +47,6 @@ public:
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &OS);
void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
- void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
-
void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
printMemReference(MI, OpNo, O);
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
index dffda5217675..bf3b6bcb5463 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86AsmBackend.cpp
@@ -12,7 +12,9 @@
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/BinaryFormat/MachO.h"
#include "llvm/MC/MCAsmBackend.h"
+#include "llvm/MC/MCAsmLayout.h"
#include "llvm/MC/MCAssembler.h"
+#include "llvm/MC/MCCodeEmitter.h"
#include "llvm/MC/MCContext.h"
#include "llvm/MC/MCDwarf.h"
#include "llvm/MC/MCELFObjectWriter.h"
@@ -60,10 +62,9 @@ public:
else if (BranchType == "indirect")
addKind(X86::AlignBranchIndirect);
else {
- report_fatal_error(
- "'-x86-align-branch 'The branches's type is combination of jcc, "
- "fused, jmp, call, ret, indirect.(plus separated)",
- false);
+ errs() << "invalid argument " << BranchType.str()
+ << " to -x86-align-branch=; each element must be one of: fused, "
+ "jcc, jmp, call, ret, indirect.(plus separated)\n";
}
}
}
@@ -85,13 +86,14 @@ cl::opt<unsigned> X86AlignBranchBoundary(
cl::opt<X86AlignBranchKind, true, cl::parser<std::string>> X86AlignBranch(
"x86-align-branch",
- cl::desc("Specify types of branches to align (plus separated list of "
- "types). The branches's types are combination of jcc, fused, "
- "jmp, call, ret, indirect."),
- cl::value_desc("jcc indicates conditional jumps, fused indicates fused "
- "conditional jumps, jmp indicates unconditional jumps, call "
- "indicates direct and indirect calls, ret indicates rets, "
- "indirect indicates indirect jumps."),
+ cl::desc(
+ "Specify types of branches to align (plus separated list of types):"
+ "\njcc indicates conditional jumps"
+ "\nfused indicates fused conditional jumps"
+ "\njmp indicates direct unconditional jumps"
+ "\ncall indicates direct and indirect calls"
+ "\nret indicates rets"
+ "\nindirect indicates indirect unconditional jumps"),
cl::location(X86AlignBranchKindLoc));
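+
+// Example (illustrative): "-x86-align-branch-boundary=32 -x86-align-branch=fused+jcc+jmp"
+// asks the assembler to pad so that fused conditional branches, conditional
+// branches, and unconditional branches neither cross nor end on a 32-byte
+// boundary.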
cl::opt<bool> X86AlignBranchWithin32BBoundaries(
@@ -102,6 +104,18 @@ cl::opt<bool> X86AlignBranchWithin32BBoundaries(
"assumptions about labels corresponding to particular instructions, "
"and should be used with caution."));
+cl::opt<unsigned> X86PadMaxPrefixSize(
+ "x86-pad-max-prefix-size", cl::init(0),
+ cl::desc("Maximum number of prefixes to use for padding"));
+
+cl::opt<bool> X86PadForAlign(
+ "x86-pad-for-align", cl::init(true), cl::Hidden,
+ cl::desc("Pad previous instructions to implement align directives"));
+
+cl::opt<bool> X86PadForBranchAlign(
+ "x86-pad-for-branch-align", cl::init(true), cl::Hidden,
+ cl::desc("Pad previous instructions to implement branch alignment"));
+
class X86ELFObjectWriter : public MCELFObjectTargetWriter {
public:
X86ELFObjectWriter(bool is64Bit, uint8_t OSABI, uint16_t EMachine,
@@ -114,14 +128,18 @@ class X86AsmBackend : public MCAsmBackend {
std::unique_ptr<const MCInstrInfo> MCII;
X86AlignBranchKind AlignBranchType;
Align AlignBoundary;
+ unsigned TargetPrefixMax = 0;
- bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
-
- bool needAlign(MCObjectStreamer &OS) const;
- bool needAlignInst(const MCInst &Inst) const;
- MCBoundaryAlignFragment *
- getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const;
MCInst PrevInst;
+ MCBoundaryAlignFragment *PendingBA = nullptr;
+ std::pair<MCFragment *, size_t> PrevInstPosition;
+ bool CanPadInst;
+
+ uint8_t determinePaddingPrefix(const MCInst &Inst) const;
+ bool isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const;
+ bool needAlign(const MCInst &Inst) const;
+ bool canPadBranches(MCObjectStreamer &OS) const;
+ bool canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const;
public:
X86AsmBackend(const Target &T, const MCSubtargetInfo &STI)
@@ -142,11 +160,14 @@ public:
AlignBoundary = assumeAligned(X86AlignBranchBoundary);
if (X86AlignBranch.getNumOccurrences())
AlignBranchType = X86AlignBranchKindLoc;
+ if (X86PadMaxPrefixSize.getNumOccurrences())
+ TargetPrefixMax = X86PadMaxPrefixSize;
}
bool allowAutoPadding() const override;
- void alignBranchesBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
- void alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
+ bool allowEnhancedRelaxation() const override;
+ void emitInstructionBegin(MCObjectStreamer &OS, const MCInst &Inst) override;
+ void emitInstructionEnd(MCObjectStreamer &OS, const MCInst &Inst) override;
unsigned getNumFixupKinds() const override {
return X86::NumTargetFixupKinds;
@@ -155,7 +176,7 @@ public:
Optional<MCFixupKind> getFixupKind(StringRef Name) const override;
const MCFixupKindInfo &getFixupKindInfo(MCFixupKind Kind) const override;
-
+
bool shouldForceRelocation(const MCAssembler &Asm, const MCFixup &Fixup,
const MCValue &Target) override;
@@ -171,22 +192,34 @@ public:
const MCRelaxableFragment *DF,
const MCAsmLayout &Layout) const override;
- void relaxInstruction(const MCInst &Inst, const MCSubtargetInfo &STI,
- MCInst &Res) const override;
+ void relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const override;
+
+ bool padInstructionViaRelaxation(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ bool padInstructionViaPrefix(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ bool padInstructionEncoding(MCRelaxableFragment &RF, MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const;
+
+ void finishLayout(MCAssembler const &Asm, MCAsmLayout &Layout) const override;
bool writeNopData(raw_ostream &OS, uint64_t Count) const override;
};
} // end anonymous namespace
-static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool is16BitMode) {
+static unsigned getRelaxedOpcodeBranch(const MCInst &Inst, bool Is16BitMode) {
unsigned Op = Inst.getOpcode();
switch (Op) {
default:
return Op;
case X86::JCC_1:
- return (is16BitMode) ? X86::JCC_2 : X86::JCC_4;
+ return (Is16BitMode) ? X86::JCC_2 : X86::JCC_4;
case X86::JMP_1:
- return (is16BitMode) ? X86::JMP_2 : X86::JMP_4;
+ return (Is16BitMode) ? X86::JMP_2 : X86::JMP_4;
}
}
@@ -275,11 +308,11 @@ static unsigned getRelaxedOpcodeArith(const MCInst &Inst) {
}
}
-static unsigned getRelaxedOpcode(const MCInst &Inst, bool is16BitMode) {
+static unsigned getRelaxedOpcode(const MCInst &Inst, bool Is16BitMode) {
unsigned R = getRelaxedOpcodeArith(Inst);
if (R != Inst.getOpcode())
return R;
- return getRelaxedOpcodeBranch(Inst, is16BitMode);
+ return getRelaxedOpcodeBranch(Inst, Is16BitMode);
}
static X86::CondCode getCondFromBranch(const MCInst &MI,
@@ -316,6 +349,11 @@ static bool isRIPRelative(const MCInst &MI, const MCInstrInfo &MCII) {
return (BaseReg == X86::RIP);
}
+/// Check if the instruction is a prefix.
+static bool isPrefix(const MCInst &MI, const MCInstrInfo &MCII) {
+ return X86II::isPrefix(MCII.get(MI.getOpcode()).TSFlags);
+}
+
/// Check if the instruction is valid as the first instruction in macro fusion.
static bool isFirstMacroFusibleInst(const MCInst &Inst,
const MCInstrInfo &MCII) {
@@ -327,6 +365,69 @@ static bool isFirstMacroFusibleInst(const MCInst &Inst,
return FIK != X86::FirstMacroFusionInstKind::Invalid;
}
+/// X86 can reduce the number of NOP bytes required by padding instructions
+/// with prefixes, which can give better performance in some cases. Here, we
+/// determine which prefix is the most suitable.
+///
+/// If the instruction has a segment override prefix, use the existing one.
+/// If the target is 64-bit, use the CS.
+/// If the target is 32-bit,
+/// - If the instruction has a ESP/EBP base register, use SS.
+/// - Otherwise use DS.
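+///
+/// For example (an illustrative consequence of the rules above, not new
+/// behavior): in 32-bit mode, padding `movl %eax, (%esp)` with no explicit
+/// segment override uses 0x36 (SS) prefix bytes, while in 64-bit mode padded
+/// instructions use 0x2E (CS) prefix bytes absent an explicit override.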
+uint8_t X86AsmBackend::determinePaddingPrefix(const MCInst &Inst) const {
+ assert((STI.hasFeature(X86::Mode32Bit) || STI.hasFeature(X86::Mode64Bit)) &&
+ "Prefixes can be added only in 32-bit or 64-bit mode.");
+ const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ // Determine where the memory operand starts, if present.
+ int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
+ if (MemoryOperand != -1)
+ MemoryOperand += X86II::getOperandBias(Desc);
+
+ unsigned SegmentReg = 0;
+ if (MemoryOperand >= 0) {
+ // Check for explicit segment override on memory operand.
+ SegmentReg = Inst.getOperand(MemoryOperand + X86::AddrSegmentReg).getReg();
+ }
+
+ switch (TSFlags & X86II::FormMask) {
+ default:
+ break;
+ case X86II::RawFrmDstSrc: {
+ // Check segment override opcode prefix as needed (not for %ds).
+ if (Inst.getOperand(2).getReg() != X86::DS)
+ SegmentReg = Inst.getOperand(2).getReg();
+ break;
+ }
+ case X86II::RawFrmSrc: {
+ // Check segment override opcode prefix as needed (not for %ds).
+ if (Inst.getOperand(1).getReg() != X86::DS)
+ SegmentReg = Inst.getOperand(1).getReg();
+ break;
+ }
+ case X86II::RawFrmMemOffs: {
+ // Check segment override opcode prefix as needed.
+ SegmentReg = Inst.getOperand(1).getReg();
+ break;
+ }
+ }
+
+ if (SegmentReg != 0)
+ return X86::getSegmentOverridePrefixForReg(SegmentReg);
+
+ if (STI.hasFeature(X86::Mode64Bit))
+ return X86::CS_Encoding;
+
+ if (MemoryOperand >= 0) {
+ unsigned BaseRegNum = MemoryOperand + X86::AddrBaseReg;
+ unsigned BaseReg = Inst.getOperand(BaseRegNum).getReg();
+ if (BaseReg == X86::ESP || BaseReg == X86::EBP)
+ return X86::SS_Encoding;
+ }
+ return X86::DS_Encoding;
+}
+
/// Check if the two instructions will be macro-fused on the target cpu.
bool X86AsmBackend::isMacroFused(const MCInst &Cmp, const MCInst &Jcc) const {
const MCInstrDesc &InstDesc = MCII->get(Jcc.getOpcode());
@@ -355,19 +456,122 @@ static bool hasVariantSymbol(const MCInst &MI) {
}
bool X86AsmBackend::allowAutoPadding() const {
- return (AlignBoundary != Align::None() &&
- AlignBranchType != X86::AlignBranchNone);
+ return (AlignBoundary != Align(1) && AlignBranchType != X86::AlignBranchNone);
+}
+
+bool X86AsmBackend::allowEnhancedRelaxation() const {
+ return allowAutoPadding() && TargetPrefixMax != 0 && X86PadForBranchAlign;
+}
+
+/// X86 has certain instructions which enable interrupts exactly one
+/// instruction *after* the instruction which stores to SS. Return true if the
+/// given instruction has such an interrupt delay slot.
+static bool hasInterruptDelaySlot(const MCInst &Inst) {
+ switch (Inst.getOpcode()) {
+ case X86::POPSS16:
+ case X86::POPSS32:
+ case X86::STI:
+ return true;
+
+ case X86::MOV16sr:
+ case X86::MOV32sr:
+ case X86::MOV64sr:
+ case X86::MOV16sm:
+ if (Inst.getOperand(0).getReg() == X86::SS)
+ return true;
+ break;
+ }
+ return false;
+}
+
+/// Check if the instruction to be emitted is right after any data.
+static bool
+isRightAfterData(MCFragment *CurrentFragment,
+ const std::pair<MCFragment *, size_t> &PrevInstPosition) {
+ MCFragment *F = CurrentFragment;
+ // Empty data fragments may be created to prevent further data being
+ // added into the previous fragment, we need to skip them since they
+ // have no contents.
+  // Empty data fragments may be created to prevent further data being
+  // added into the previous fragment; we need to skip them since they
+  // have no contents.
+
+ // Since data is always emitted into a DataFragment, our check strategy is
+ // simple here.
+ // - If the fragment is a DataFragment
+ // - If it's not the fragment where the previous instruction is,
+ // returns true.
+ // - If it's the fragment holding the previous instruction but its
+  //       size changed since the previous instruction was emitted into
+ // it, returns true.
+ // - Otherwise returns false.
+ // - If the fragment is not a DataFragment, returns false.
+ if (auto *DF = dyn_cast_or_null<MCDataFragment>(F))
+ return DF != PrevInstPosition.first ||
+ DF->getContents().size() != PrevInstPosition.second;
+
+ return false;
+}
+
+/// \returns the fragment size if it has instructions, otherwise returns 0.
+static size_t getSizeForInstFragment(const MCFragment *F) {
+ if (!F || !F->hasInstructions())
+ return 0;
+ // MCEncodedFragmentWithContents being templated makes this tricky.
+ switch (F->getKind()) {
+ default:
+ llvm_unreachable("Unknown fragment with instructions!");
+ case MCFragment::FT_Data:
+ return cast<MCDataFragment>(*F).getContents().size();
+ case MCFragment::FT_Relaxable:
+ return cast<MCRelaxableFragment>(*F).getContents().size();
+ case MCFragment::FT_CompactEncodedInst:
+ return cast<MCCompactEncodedInstFragment>(*F).getContents().size();
+ }
+}
+
+/// Return true if we can insert NOP or prefixes automatically before the
+/// instruction to be emitted.
+bool X86AsmBackend::canPadInst(const MCInst &Inst, MCObjectStreamer &OS) const {
+ if (hasVariantSymbol(Inst))
+    // The linker may rewrite the instruction with a variant symbol operand
+    // (e.g. TLSCALL).
+ return false;
+
+ if (hasInterruptDelaySlot(PrevInst))
+    // If this instruction follows an interrupt-enabling instruction with a
+    // one-instruction delay, inserting a nop would change behavior.
+ return false;
+
+ if (isPrefix(PrevInst, *MCII))
+    // If this instruction follows a prefix, inserting a nop/prefix would
+    // change semantics.
+ return false;
+
+ if (isPrefix(Inst, *MCII))
+    // If this instruction is a prefix, inserting a prefix would change
+    // semantics.
+ return false;
+
+ if (isRightAfterData(OS.getCurrentFragment(), PrevInstPosition))
+    // If this instruction follows any data, there is no clear instruction
+    // boundary, and inserting a nop/prefix would change semantics.
+ return false;
+
+ return true;
}
-bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const {
+bool X86AsmBackend::canPadBranches(MCObjectStreamer &OS) const {
if (!OS.getAllowAutoPadding())
return false;
assert(allowAutoPadding() && "incorrect initialization!");
- MCAssembler &Assembler = OS.getAssembler();
- MCSection *Sec = OS.getCurrentSectionOnly();
+  // We only pad in the text section.
+ if (!OS.getCurrentSectionOnly()->getKind().isText())
+ return false;
+
// To be Done: Currently don't deal with Bundle cases.
- if (Assembler.isBundlingEnabled() && Sec->isBundleLocked())
+ if (OS.getAssembler().isBundlingEnabled())
return false;
// Branches only need to be aligned in 32-bit or 64-bit mode.
@@ -377,59 +581,42 @@ bool X86AsmBackend::needAlign(MCObjectStreamer &OS) const {
return true;
}
-/// Check if the instruction operand needs to be aligned. Padding is disabled
-/// before intruction which may be rewritten by linker(e.g. TLSCALL).
-bool X86AsmBackend::needAlignInst(const MCInst &Inst) const {
- // Linker may rewrite the instruction with variant symbol operand.
- if (hasVariantSymbol(Inst))
- return false;
-
- const MCInstrDesc &InstDesc = MCII->get(Inst.getOpcode());
- return (InstDesc.isConditionalBranch() &&
+/// Check if the instruction operand needs to be aligned.
+bool X86AsmBackend::needAlign(const MCInst &Inst) const {
+ const MCInstrDesc &Desc = MCII->get(Inst.getOpcode());
+ return (Desc.isConditionalBranch() &&
(AlignBranchType & X86::AlignBranchJcc)) ||
- (InstDesc.isUnconditionalBranch() &&
+ (Desc.isUnconditionalBranch() &&
(AlignBranchType & X86::AlignBranchJmp)) ||
- (InstDesc.isCall() &&
- (AlignBranchType & X86::AlignBranchCall)) ||
- (InstDesc.isReturn() &&
- (AlignBranchType & X86::AlignBranchRet)) ||
- (InstDesc.isIndirectBranch() &&
+ (Desc.isCall() && (AlignBranchType & X86::AlignBranchCall)) ||
+ (Desc.isReturn() && (AlignBranchType & X86::AlignBranchRet)) ||
+ (Desc.isIndirectBranch() &&
(AlignBranchType & X86::AlignBranchIndirect));
}
-static bool canReuseBoundaryAlignFragment(const MCBoundaryAlignFragment &F) {
- // If a MCBoundaryAlignFragment has not been used to emit NOP,we can reuse it.
- return !F.canEmitNops();
-}
+/// Insert BoundaryAlignFragment before instructions to align branches.
+void X86AsmBackend::emitInstructionBegin(MCObjectStreamer &OS,
+ const MCInst &Inst) {
+ CanPadInst = canPadInst(Inst, OS);
-MCBoundaryAlignFragment *
-X86AsmBackend::getOrCreateBoundaryAlignFragment(MCObjectStreamer &OS) const {
- auto *F = dyn_cast_or_null<MCBoundaryAlignFragment>(OS.getCurrentFragment());
- if (!F || !canReuseBoundaryAlignFragment(*F)) {
- F = new MCBoundaryAlignFragment(AlignBoundary);
- OS.insert(F);
- }
- return F;
-}
+ if (!canPadBranches(OS))
+ return;
+
+ if (!isMacroFused(PrevInst, Inst))
+    // Macro fusion did not actually happen; clear the pending fragment.
+ PendingBA = nullptr;
-/// Insert MCBoundaryAlignFragment before instructions to align branches.
-void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
- const MCInst &Inst) {
- if (!needAlign(OS))
+ if (!CanPadInst)
return;
- MCFragment *CF = OS.getCurrentFragment();
- bool NeedAlignFused = AlignBranchType & X86::AlignBranchFused;
- if (NeedAlignFused && isMacroFused(PrevInst, Inst) && CF) {
+ if (PendingBA && OS.getCurrentFragment()->getPrevNode() == PendingBA) {
// Macro fusion actually happens and there is no other fragment inserted
- // after the previous instruction. NOP can be emitted in PF to align fused
- // jcc.
- if (auto *PF =
- dyn_cast_or_null<MCBoundaryAlignFragment>(CF->getPrevNode())) {
- const_cast<MCBoundaryAlignFragment *>(PF)->setEmitNops(true);
- const_cast<MCBoundaryAlignFragment *>(PF)->setFused(true);
- }
- } else if (needAlignInst(Inst)) {
+ // after the previous instruction.
+ //
+    // Do nothing here since we already inserted a BoundaryAlign fragment
+    // when we saw the first instruction in the fused pair; we'll tie them
+    // together in emitInstructionEnd.
+ //
// Note: When there is at least one fragment, such as MCAlignFragment,
// inserted after the previous instruction, e.g.
//
@@ -441,34 +628,41 @@ void X86AsmBackend::alignBranchesBegin(MCObjectStreamer &OS,
//
// We will treat the JCC as a unfused branch although it may be fused
// with the CMP.
- auto *F = getOrCreateBoundaryAlignFragment(OS);
- F->setEmitNops(true);
- F->setFused(false);
- } else if (NeedAlignFused && isFirstMacroFusibleInst(Inst, *MCII)) {
- // We don't know if macro fusion happens until the reaching the next
- // instruction, so a place holder is put here if necessary.
- getOrCreateBoundaryAlignFragment(OS);
+ return;
}
- PrevInst = Inst;
+ if (needAlign(Inst) || ((AlignBranchType & X86::AlignBranchFused) &&
+ isFirstMacroFusibleInst(Inst, *MCII))) {
+    // If we meet an unfused branch or the first instruction in a fusible
+    // pair, insert a BoundaryAlign fragment.
+ OS.insert(PendingBA = new MCBoundaryAlignFragment(AlignBoundary));
+ }
}
-/// Insert a MCBoundaryAlignFragment to mark the end of the branch to be aligned
-/// if necessary.
-void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
- if (!needAlign(OS))
+/// Set the last fragment to be aligned for the BoundaryAlignFragment.
+void X86AsmBackend::emitInstructionEnd(MCObjectStreamer &OS,
+                                       const MCInst &Inst) {
+ PrevInst = Inst;
+ MCFragment *CF = OS.getCurrentFragment();
+ PrevInstPosition = std::make_pair(CF, getSizeForInstFragment(CF));
+ if (auto *F = dyn_cast_or_null<MCRelaxableFragment>(CF))
+ F->setAllowAutoPadding(CanPadInst);
+
+ if (!canPadBranches(OS))
return;
- // If the branch is emitted into a MCRelaxableFragment, we can determine the
- // size of the branch easily in MCAssembler::relaxBoundaryAlign. When the
- // branch is fused, the fused branch(macro fusion pair) must be emitted into
- // two fragments. Or when the branch is unfused, the branch must be emitted
- // into one fragment. The MCRelaxableFragment naturally marks the end of the
- // fused or unfused branch.
- // Otherwise, we need to insert a MCBoundaryAlignFragment to mark the end of
- // the branch. This MCBoundaryAlignFragment may be reused to emit NOP to align
- // other branch.
- if (needAlignInst(Inst) && !isa<MCRelaxableFragment>(OS.getCurrentFragment()))
- OS.insert(new MCBoundaryAlignFragment(AlignBoundary));
+
+ if (!needAlign(Inst) || !PendingBA)
+ return;
+
+  // Tie the aligned instructions into a pending BoundaryAlign.
+ PendingBA->setLastFragment(CF);
+ PendingBA = nullptr;
+
+ // We need to ensure that further data isn't added to the current
+ // DataFragment, so that we can get the size of instructions later in
+ // MCAssembler::relaxBoundaryAlign. The easiest way is to insert a new empty
+ // DataFragment.
+ if (isa_and_nonnull<MCDataFragment>(CF))
+ OS.insert(new MCDataFragment());
// Update the maximum alignment on the current section if necessary.
MCSection *Sec = OS.getCurrentSectionOnly();
@@ -478,13 +672,23 @@ void X86AsmBackend::alignBranchesEnd(MCObjectStreamer &OS, const MCInst &Inst) {
Optional<MCFixupKind> X86AsmBackend::getFixupKind(StringRef Name) const {
if (STI.getTargetTriple().isOSBinFormatELF()) {
+ unsigned Type;
if (STI.getTargetTriple().getArch() == Triple::x86_64) {
- if (Name == "R_X86_64_NONE")
- return FK_NONE;
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/x86_64.def"
+#undef ELF_RELOC
+ .Default(-1u);
} else {
- if (Name == "R_386_NONE")
- return FK_NONE;
+ Type = llvm::StringSwitch<unsigned>(Name)
+#define ELF_RELOC(X, Y) .Case(#X, Y)
+#include "llvm/BinaryFormat/ELFRelocs/i386.def"
+#undef ELF_RELOC
+ .Default(-1u);
}
+ if (Type == -1u)
+ return None;
+ return static_cast<MCFixupKind>(FirstLiteralRelocationKind + Type);
}
return MCAsmBackend::getFixupKind(Name);
}
@@ -502,6 +706,11 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
{"reloc_branch_4byte_pcrel", 0, 32, MCFixupKindInfo::FKF_IsPCRel},
};
+ // Fixup kinds from .reloc directive are like R_386_NONE/R_X86_64_NONE. They
+ // do not require any extra processing.
+ if (Kind >= FirstLiteralRelocationKind)
+ return MCAsmBackend::getFixupKindInfo(FK_NONE);
+
if (Kind < FirstTargetFixupKind)
return MCAsmBackend::getFixupKindInfo(Kind);
@@ -514,7 +723,7 @@ const MCFixupKindInfo &X86AsmBackend::getFixupKindInfo(MCFixupKind Kind) const {
bool X86AsmBackend::shouldForceRelocation(const MCAssembler &,
const MCFixup &Fixup,
const MCValue &) {
- return Fixup.getKind() == FK_NONE;
+ return Fixup.getKind() >= FirstLiteralRelocationKind;
}
static unsigned getFixupKindSize(unsigned Kind) {
@@ -556,7 +765,10 @@ void X86AsmBackend::applyFixup(const MCAssembler &Asm, const MCFixup &Fixup,
MutableArrayRef<char> Data,
uint64_t Value, bool IsResolved,
const MCSubtargetInfo *STI) const {
- unsigned Size = getFixupKindSize(Fixup.getKind());
+ unsigned Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return;
+ unsigned Size = getFixupKindSize(Kind);
assert(Fixup.getOffset() + Size <= Data.size() && "Invalid fixup offset!");
@@ -613,12 +825,11 @@ bool X86AsmBackend::fixupNeedsRelaxation(const MCFixup &Fixup,
// FIXME: Can tblgen help at all here to verify there aren't other instructions
// we can relax?
-void X86AsmBackend::relaxInstruction(const MCInst &Inst,
- const MCSubtargetInfo &STI,
- MCInst &Res) const {
+void X86AsmBackend::relaxInstruction(MCInst &Inst,
+ const MCSubtargetInfo &STI) const {
// The only relaxations X86 does is from a 1byte pcrel to a 4byte pcrel.
- bool is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
- unsigned RelaxedOp = getRelaxedOpcode(Inst, is16BitMode);
+ bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ unsigned RelaxedOp = getRelaxedOpcode(Inst, Is16BitMode);
if (RelaxedOp == Inst.getOpcode()) {
SmallString<256> Tmp;
@@ -628,8 +839,232 @@ void X86AsmBackend::relaxInstruction(const MCInst &Inst,
report_fatal_error("unexpected instruction to relax: " + OS.str());
}
- Res = Inst;
- Res.setOpcode(RelaxedOp);
+ Inst.setOpcode(RelaxedOp);
+}
+
+/// Return true if this instruction has been fully relaxed into its most
+/// general available form.
+static bool isFullyRelaxed(const MCRelaxableFragment &RF) {
+ auto &Inst = RF.getInst();
+ auto &STI = *RF.getSubtargetInfo();
+ bool Is16BitMode = STI.getFeatureBits()[X86::Mode16Bit];
+ return getRelaxedOpcode(Inst, Is16BitMode) == Inst.getOpcode();
+}
+
+bool X86AsmBackend::padInstructionViaPrefix(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ if (!RF.getAllowAutoPadding())
+ return false;
+  // If the instruction isn't fully relaxed, shifting it around might require a
+  // larger value for one of the fixups than can be encoded. The outer loop
+ // will also catch this before moving to the next instruction, but we need to
+ // prevent padding this single instruction as well.
+ if (!isFullyRelaxed(RF))
+ return false;
+
+ const unsigned OldSize = RF.getContents().size();
+ if (OldSize == 15)
+ return false;
+
+ const unsigned MaxPossiblePad = std::min(15 - OldSize, RemainingSize);
+ const unsigned RemainingPrefixSize = [&]() -> unsigned {
+ SmallString<15> Code;
+ raw_svector_ostream VecOS(Code);
+ Emitter.emitPrefix(RF.getInst(), VecOS, STI);
+ assert(Code.size() < 15 && "The number of prefixes must be less than 15.");
+
+    // TODO: It turns out we need a decent amount of plumbing for the
+    // target-specific bits to determine the number of prefixes it is safe to
+    // add. Various targets (older chips mostly, but also the Atom family)
+    // encounter decoder stalls with too many prefixes. For testing purposes,
+    // we set the value externally for the moment.
+ unsigned ExistingPrefixSize = Code.size();
+ if (TargetPrefixMax <= ExistingPrefixSize)
+ return 0;
+ return TargetPrefixMax - ExistingPrefixSize;
+ }();
+ const unsigned PrefixBytesToAdd =
+ std::min(MaxPossiblePad, RemainingPrefixSize);
+ if (PrefixBytesToAdd == 0)
+ return false;
+
+ const uint8_t Prefix = determinePaddingPrefix(RF.getInst());
+
+ SmallString<256> Code;
+ Code.append(PrefixBytesToAdd, Prefix);
+ Code.append(RF.getContents().begin(), RF.getContents().end());
+ RF.getContents() = Code;
+
+ // Adjust the fixups for the change in offsets
+ for (auto &F : RF.getFixups()) {
+ F.setOffset(F.getOffset() + PrefixBytesToAdd);
+ }
+
+ RemainingSize -= PrefixBytesToAdd;
+ return true;
+}
+
+bool X86AsmBackend::padInstructionViaRelaxation(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ if (isFullyRelaxed(RF))
+ // TODO: There are lots of other tricks we could apply for increasing
+ // encoding size without impacting performance.
+ return false;
+
+ MCInst Relaxed = RF.getInst();
+ relaxInstruction(Relaxed, *RF.getSubtargetInfo());
+
+ SmallVector<MCFixup, 4> Fixups;
+ SmallString<15> Code;
+ raw_svector_ostream VecOS(Code);
+ Emitter.encodeInstruction(Relaxed, VecOS, Fixups, *RF.getSubtargetInfo());
+ const unsigned OldSize = RF.getContents().size();
+ const unsigned NewSize = Code.size();
+ assert(NewSize >= OldSize && "size decrease during relaxation?");
+ unsigned Delta = NewSize - OldSize;
+ if (Delta > RemainingSize)
+ return false;
+ RF.setInst(Relaxed);
+ RF.getContents() = Code;
+ RF.getFixups() = Fixups;
+ RemainingSize -= Delta;
+ return true;
+}
+
+bool X86AsmBackend::padInstructionEncoding(MCRelaxableFragment &RF,
+ MCCodeEmitter &Emitter,
+ unsigned &RemainingSize) const {
+ bool Changed = false;
+ if (RemainingSize != 0)
+ Changed |= padInstructionViaRelaxation(RF, Emitter, RemainingSize);
+ if (RemainingSize != 0)
+ Changed |= padInstructionViaPrefix(RF, Emitter, RemainingSize);
+ return Changed;
+}
+
+void X86AsmBackend::finishLayout(MCAssembler const &Asm,
+ MCAsmLayout &Layout) const {
+ // See if we can further relax some instructions to cut down on the number of
+ // nop bytes required for code alignment. The actual win is in reducing
+ // instruction count, not number of bytes. Modern X86-64 can easily end up
+ // decode limited. It is often better to reduce the number of instructions
+ // (i.e. eliminate nops) even at the cost of increasing the size and
+ // complexity of others.
+ if (!X86PadForAlign && !X86PadForBranchAlign)
+ return;
+
+ DenseSet<MCFragment *> LabeledFragments;
+ for (const MCSymbol &S : Asm.symbols())
+ LabeledFragments.insert(S.getFragment(false));
+
+ for (MCSection &Sec : Asm) {
+ if (!Sec.getKind().isText())
+ continue;
+
+ SmallVector<MCRelaxableFragment *, 4> Relaxable;
+ for (MCSection::iterator I = Sec.begin(), IE = Sec.end(); I != IE; ++I) {
+ MCFragment &F = *I;
+
+ if (LabeledFragments.count(&F))
+ Relaxable.clear();
+
+ if (F.getKind() == MCFragment::FT_Data ||
+ F.getKind() == MCFragment::FT_CompactEncodedInst)
+ // Skip and ignore
+ continue;
+
+ if (F.getKind() == MCFragment::FT_Relaxable) {
+ auto &RF = cast<MCRelaxableFragment>(*I);
+ Relaxable.push_back(&RF);
+ continue;
+ }
+
+ auto canHandle = [](MCFragment &F) -> bool {
+ switch (F.getKind()) {
+ default:
+ return false;
+ case MCFragment::FT_Align:
+ return X86PadForAlign;
+ case MCFragment::FT_BoundaryAlign:
+ return X86PadForBranchAlign;
+ }
+ };
+ // For any unhandled kind, assume we can't change layout.
+ if (!canHandle(F)) {
+ Relaxable.clear();
+ continue;
+ }
+
+#ifndef NDEBUG
+ const uint64_t OrigOffset = Layout.getFragmentOffset(&F);
+#endif
+ const uint64_t OrigSize = Asm.computeFragmentSize(Layout, F);
+
+ // To keep the effects local, prefer to relax instructions closest to
+ // the align directive. This is purely about human understandability
+ // of the resulting code. If we later find a reason to expand
+ // particular instructions over others, we can adjust.
+ MCFragment *FirstChangedFragment = nullptr;
+ unsigned RemainingSize = OrigSize;
+ while (!Relaxable.empty() && RemainingSize != 0) {
+ auto &RF = *Relaxable.pop_back_val();
+        // Give the backend a chance to play any tricks it wishes to increase
+        // the encoding size of the given instruction. Target-independent code
+        // will try further relaxation, but targets may play further tricks.
+ if (padInstructionEncoding(RF, Asm.getEmitter(), RemainingSize))
+ FirstChangedFragment = &RF;
+
+ // If we have an instruction which hasn't been fully relaxed, we can't
+ // skip past it and insert bytes before it. Changing its starting
+ // offset might require a larger negative offset than it can encode.
+ // We don't need to worry about larger positive offsets as none of the
+ // possible offsets between this and our align are visible, and the
+ // ones afterwards aren't changing.
+ if (!isFullyRelaxed(RF))
+ break;
+ }
+ Relaxable.clear();
+
+ if (FirstChangedFragment) {
+        // Make sure the offsets for any fragments in the affected range get
+ // updated. Note that this (conservatively) invalidates the offsets of
+ // those following, but this is not required.
+ Layout.invalidateFragmentsFrom(FirstChangedFragment);
+ }
+
+      // BoundaryAlign explicitly tracks its size (unlike align)
+ if (F.getKind() == MCFragment::FT_BoundaryAlign)
+ cast<MCBoundaryAlignFragment>(F).setSize(RemainingSize);
+
+#ifndef NDEBUG
+ const uint64_t FinalOffset = Layout.getFragmentOffset(&F);
+ const uint64_t FinalSize = Asm.computeFragmentSize(Layout, F);
+ assert(OrigOffset + OrigSize == FinalOffset + FinalSize &&
+ "can't move start of next fragment!");
+ assert(FinalSize == RemainingSize && "inconsistent size computation?");
+#endif
+
+ // If we're looking at a boundary align, make sure we don't try to pad
+ // its target instructions for some following directive. Doing so would
+ // break the alignment of the current boundary align.
+ if (auto *BF = dyn_cast<MCBoundaryAlignFragment>(&F)) {
+ const MCFragment *LastFragment = BF->getLastFragment();
+ if (!LastFragment)
+ continue;
+ while (&*I != LastFragment)
+ ++I;
+ }
+ }
+ }
+
+ // The layout is done. Mark every fragment as valid.
+ for (unsigned int i = 0, n = Layout.getSectionOrder().size(); i != n; ++i) {
+ MCSection &Section = *Layout.getSectionOrder()[i];
+ Layout.getFragmentOffset(&*Section.getFragmentList().rbegin());
+ Asm.computeFragmentSize(Layout, *Section.getFragmentList().rbegin());
+ }
}
/// Write a sequence of optimal nops to the output, covering \p Count
@@ -661,7 +1096,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// This CPU doesn't support long nops. If needed add more.
// FIXME: We could generated something better than plain 0x90.
- if (!STI.getFeatureBits()[X86::FeatureNOPL]) {
+ if (!STI.hasFeature(X86::FeatureNOPL) && !STI.hasFeature(X86::Mode64Bit)) {
for (uint64_t i = 0; i < Count; ++i)
OS << '\x90';
return true;
@@ -670,7 +1105,7 @@ bool X86AsmBackend::writeNopData(raw_ostream &OS, uint64_t Count) const {
// 15-bytes is the longest single NOP instruction, but 10-bytes is
// commonly the longest that can be efficiently decoded.
uint64_t MaxNopLength = 10;
- if (STI.getFeatureBits()[X86::ProcIntelSLM])
+ if (STI.getFeatureBits()[X86::FeatureFast7ByteNOP])
MaxNopLength = 7;
else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
MaxNopLength = 15;
@@ -811,6 +1246,7 @@ class DarwinX86AsmBackend : public X86AsmBackend {
enum { CU_NUM_SAVED_REGS = 6 };
mutable unsigned SavedRegs[CU_NUM_SAVED_REGS];
+ Triple TT;
bool Is64Bit;
unsigned OffsetSize; ///< Offset of a "push" instruction.
@@ -838,10 +1274,140 @@ protected:
return 1;
}
+private:
+ /// Get the compact unwind number for a given register. The number
+ /// corresponds to the enum lists in compact_unwind_encoding.h.
+ int getCompactUnwindRegNum(unsigned Reg) const {
+ static const MCPhysReg CU32BitRegs[7] = {
+ X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
+ };
+ static const MCPhysReg CU64BitRegs[] = {
+ X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
+ };
+ const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
+ for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
+ if (*CURegs == Reg)
+ return Idx;
+
+ return -1;
+ }
+
+ /// Return the registers encoded for a compact encoding with a frame
+ /// pointer.
+ uint32_t encodeCompactUnwindRegistersWithFrame() const {
+ // Encode the registers in the order they were saved --- 3-bits per
+ // register. The list of saved registers is assumed to be in reverse
+ // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
+ uint32_t RegEnc = 0;
+ for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
+ unsigned Reg = SavedRegs[i];
+ if (Reg == 0) break;
+
+ int CURegNum = getCompactUnwindRegNum(Reg);
+ if (CURegNum == -1) return ~0U;
+
+ // Encode the 3-bit register number in order, skipping over 3-bits for
+ // each register.
+ RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
+ }
+
+ assert((RegEnc & 0x3FFFF) == RegEnc &&
+ "Invalid compact register encoding!");
+ return RegEnc;
+ }
+
+ /// Create the permutation encoding used with frameless stacks. It is
+ /// passed the number of registers to be saved and an array of the registers
+ /// saved.
+ uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
+ // The saved registers are numbered from 1 to 6. In order to encode the
+ // order in which they were saved, we re-number them according to their
+ // place in the register order. The re-numbering is relative to the last
+ // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
+ // that order:
+ //
+ // Orig Re-Num
+ // ---- ------
+ // 6 6
+ // 2 2
+ // 4 3
+ // 5 3
+ //
+ for (unsigned i = 0; i < RegCount; ++i) {
+ int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
+ if (CUReg == -1) return ~0U;
+ SavedRegs[i] = CUReg;
+ }
+
+ // Reverse the list.
+ std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
+
+ uint32_t RenumRegs[CU_NUM_SAVED_REGS];
+ for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
+ unsigned Countless = 0;
+ for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
+ if (SavedRegs[j] < SavedRegs[i])
+ ++Countless;
+
+ RenumRegs[i] = SavedRegs[i] - Countless - 1;
+ }
+
+ // Take the renumbered values and encode them into a 10-bit number.
+ uint32_t permutationEncoding = 0;
+ switch (RegCount) {
+ case 6:
+ permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
+ + 6 * RenumRegs[2] + 2 * RenumRegs[3]
+ + RenumRegs[4];
+ break;
+ case 5:
+ permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
+ + 6 * RenumRegs[3] + 2 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 4:
+ permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
+ + 3 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 3:
+ permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
+ + RenumRegs[5];
+ break;
+ case 2:
+ permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
+ break;
+ case 1:
+ permutationEncoding |= RenumRegs[5];
+ break;
+ }
+
+ assert((permutationEncoding & 0x3FF) == permutationEncoding &&
+ "Invalid compact register encoding!");
+ return permutationEncoding;
+ }
+
+public:
+ DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
+ const MCSubtargetInfo &STI)
+ : X86AsmBackend(T, STI), MRI(MRI), TT(STI.getTargetTriple()),
+ Is64Bit(TT.isArch64Bit()) {
+ memset(SavedRegs, 0, sizeof(SavedRegs));
+ OffsetSize = Is64Bit ? 8 : 4;
+ MoveInstrSize = Is64Bit ? 3 : 2;
+ StackDivide = Is64Bit ? 8 : 4;
+ }
+
+ std::unique_ptr<MCObjectTargetWriter>
+ createObjectTargetWriter() const override {
+ uint32_t CPUType = cantFail(MachO::getCPUType(TT));
+ uint32_t CPUSubType = cantFail(MachO::getCPUSubType(TT));
+ return createX86MachObjectWriter(Is64Bit, CPUType, CPUSubType);
+ }
+
/// Implementation of algorithm to generate the compact unwind encoding
/// for the CFI instructions.
uint32_t
- generateCompactUnwindEncodingImpl(ArrayRef<MCCFIInstruction> Instrs) const {
+ generateCompactUnwindEncoding(ArrayRef<MCCFIInstruction> Instrs) const override {
if (Instrs.empty()) return 0;
// Reset the saved registers.
@@ -904,7 +1470,7 @@ protected:
// L0:
// .cfi_def_cfa_offset 80
//
- StackSize = std::abs(Inst.getOffset()) / StackDivide;
+ StackSize = Inst.getOffset() / StackDivide;
++NumDefCFAOffsets;
break;
}
@@ -991,168 +1557,6 @@ protected:
return CompactUnwindEncoding;
}
-
-private:
- /// Get the compact unwind number for a given register. The number
- /// corresponds to the enum lists in compact_unwind_encoding.h.
- int getCompactUnwindRegNum(unsigned Reg) const {
- static const MCPhysReg CU32BitRegs[7] = {
- X86::EBX, X86::ECX, X86::EDX, X86::EDI, X86::ESI, X86::EBP, 0
- };
- static const MCPhysReg CU64BitRegs[] = {
- X86::RBX, X86::R12, X86::R13, X86::R14, X86::R15, X86::RBP, 0
- };
- const MCPhysReg *CURegs = Is64Bit ? CU64BitRegs : CU32BitRegs;
- for (int Idx = 1; *CURegs; ++CURegs, ++Idx)
- if (*CURegs == Reg)
- return Idx;
-
- return -1;
- }
-
- /// Return the registers encoded for a compact encoding with a frame
- /// pointer.
- uint32_t encodeCompactUnwindRegistersWithFrame() const {
- // Encode the registers in the order they were saved --- 3-bits per
- // register. The list of saved registers is assumed to be in reverse
- // order. The registers are numbered from 1 to CU_NUM_SAVED_REGS.
- uint32_t RegEnc = 0;
- for (int i = 0, Idx = 0; i != CU_NUM_SAVED_REGS; ++i) {
- unsigned Reg = SavedRegs[i];
- if (Reg == 0) break;
-
- int CURegNum = getCompactUnwindRegNum(Reg);
- if (CURegNum == -1) return ~0U;
-
- // Encode the 3-bit register number in order, skipping over 3-bits for
- // each register.
- RegEnc |= (CURegNum & 0x7) << (Idx++ * 3);
- }
-
- assert((RegEnc & 0x3FFFF) == RegEnc &&
- "Invalid compact register encoding!");
- return RegEnc;
- }
-
- /// Create the permutation encoding used with frameless stacks. It is
- /// passed the number of registers to be saved and an array of the registers
- /// saved.
- uint32_t encodeCompactUnwindRegistersWithoutFrame(unsigned RegCount) const {
- // The saved registers are numbered from 1 to 6. In order to encode the
- // order in which they were saved, we re-number them according to their
- // place in the register order. The re-numbering is relative to the last
- // re-numbered register. E.g., if we have registers {6, 2, 4, 5} saved in
- // that order:
- //
- // Orig Re-Num
- // ---- ------
- // 6 6
- // 2 2
- // 4 3
- // 5 3
- //
- for (unsigned i = 0; i < RegCount; ++i) {
- int CUReg = getCompactUnwindRegNum(SavedRegs[i]);
- if (CUReg == -1) return ~0U;
- SavedRegs[i] = CUReg;
- }
-
- // Reverse the list.
- std::reverse(&SavedRegs[0], &SavedRegs[CU_NUM_SAVED_REGS]);
-
- uint32_t RenumRegs[CU_NUM_SAVED_REGS];
- for (unsigned i = CU_NUM_SAVED_REGS - RegCount; i < CU_NUM_SAVED_REGS; ++i){
- unsigned Countless = 0;
- for (unsigned j = CU_NUM_SAVED_REGS - RegCount; j < i; ++j)
- if (SavedRegs[j] < SavedRegs[i])
- ++Countless;
-
- RenumRegs[i] = SavedRegs[i] - Countless - 1;
- }
-
- // Take the renumbered values and encode them into a 10-bit number.
- uint32_t permutationEncoding = 0;
- switch (RegCount) {
- case 6:
- permutationEncoding |= 120 * RenumRegs[0] + 24 * RenumRegs[1]
- + 6 * RenumRegs[2] + 2 * RenumRegs[3]
- + RenumRegs[4];
- break;
- case 5:
- permutationEncoding |= 120 * RenumRegs[1] + 24 * RenumRegs[2]
- + 6 * RenumRegs[3] + 2 * RenumRegs[4]
- + RenumRegs[5];
- break;
- case 4:
- permutationEncoding |= 60 * RenumRegs[2] + 12 * RenumRegs[3]
- + 3 * RenumRegs[4] + RenumRegs[5];
- break;
- case 3:
- permutationEncoding |= 20 * RenumRegs[3] + 4 * RenumRegs[4]
- + RenumRegs[5];
- break;
- case 2:
- permutationEncoding |= 5 * RenumRegs[4] + RenumRegs[5];
- break;
- case 1:
- permutationEncoding |= RenumRegs[5];
- break;
- }
-
- assert((permutationEncoding & 0x3FF) == permutationEncoding &&
- "Invalid compact register encoding!");
- return permutationEncoding;
- }
-
-public:
- DarwinX86AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI, bool Is64Bit)
- : X86AsmBackend(T, STI), MRI(MRI), Is64Bit(Is64Bit) {
- memset(SavedRegs, 0, sizeof(SavedRegs));
- OffsetSize = Is64Bit ? 8 : 4;
- MoveInstrSize = Is64Bit ? 3 : 2;
- StackDivide = Is64Bit ? 8 : 4;
- }
-};
-
-class DarwinX86_32AsmBackend : public DarwinX86AsmBackend {
-public:
- DarwinX86_32AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI)
- : DarwinX86AsmBackend(T, MRI, STI, false) {}
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- return createX86MachObjectWriter(/*Is64Bit=*/false,
- MachO::CPU_TYPE_I386,
- MachO::CPU_SUBTYPE_I386_ALL);
- }
-
- /// Generate the compact unwind encoding for the CFI instructions.
- uint32_t generateCompactUnwindEncoding(
- ArrayRef<MCCFIInstruction> Instrs) const override {
- return generateCompactUnwindEncodingImpl(Instrs);
- }
-};
-
-class DarwinX86_64AsmBackend : public DarwinX86AsmBackend {
- const MachO::CPUSubTypeX86 Subtype;
-public:
- DarwinX86_64AsmBackend(const Target &T, const MCRegisterInfo &MRI,
- const MCSubtargetInfo &STI, MachO::CPUSubTypeX86 st)
- : DarwinX86AsmBackend(T, MRI, STI, true), Subtype(st) {}
-
- std::unique_ptr<MCObjectTargetWriter>
- createObjectTargetWriter() const override {
- return createX86MachObjectWriter(/*Is64Bit=*/true, MachO::CPU_TYPE_X86_64,
- Subtype);
- }
-
- /// Generate the compact unwind encoding for the CFI instructions.
- uint32_t generateCompactUnwindEncoding(
- ArrayRef<MCCFIInstruction> Instrs) const override {
- return generateCompactUnwindEncodingImpl(Instrs);
- }
};
} // end anonymous namespace
@@ -1163,7 +1567,7 @@ MCAsmBackend *llvm::createX86_32AsmBackend(const Target &T,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
if (TheTriple.isOSBinFormatMachO())
- return new DarwinX86_32AsmBackend(T, MRI, STI);
+ return new DarwinX86AsmBackend(T, MRI, STI);
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
return new WindowsX86AsmBackend(T, false, STI);
@@ -1181,13 +1585,8 @@ MCAsmBackend *llvm::createX86_64AsmBackend(const Target &T,
const MCRegisterInfo &MRI,
const MCTargetOptions &Options) {
const Triple &TheTriple = STI.getTargetTriple();
- if (TheTriple.isOSBinFormatMachO()) {
- MachO::CPUSubTypeX86 CS =
- StringSwitch<MachO::CPUSubTypeX86>(TheTriple.getArchName())
- .Case("x86_64h", MachO::CPU_SUBTYPE_X86_64_H)
- .Default(MachO::CPU_SUBTYPE_X86_64_ALL);
- return new DarwinX86_64AsmBackend(T, MRI, STI, CS);
- }
+ if (TheTriple.isOSBinFormatMachO())
+ return new DarwinX86AsmBackend(T, MRI, STI);
if (TheTriple.isOSWindows() && TheTriple.isOSBinFormatCOFF())
return new WindowsX86AsmBackend(T, true, STI);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
index a4f8dd669e1e..79f07d3c7792 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86BaseInfo.h
@@ -91,7 +91,7 @@ namespace X86 {
COND_G = 15,
LAST_VALID_COND = COND_G,
- // Artificial condition codes. These are used by AnalyzeBranch
+ // Artificial condition codes. These are used by analyzeBranch
// to indicate a block terminated with two conditional branches that together
// form a compound condition. They occur in code using FCMP_OEQ or FCMP_UNE,
// which can't be represented on x86 with a single condition. These
@@ -356,6 +356,39 @@ namespace X86 {
AlignBranchRet = 1U << 4,
AlignBranchIndirect = 1U << 5
};
+
+ /// Defines the encoding values for segment override prefix.
+ enum EncodingOfSegmentOverridePrefix : uint8_t {
+ CS_Encoding = 0x2E,
+ DS_Encoding = 0x3E,
+ ES_Encoding = 0x26,
+ FS_Encoding = 0x64,
+ GS_Encoding = 0x65,
+ SS_Encoding = 0x36
+ };
+
+ /// Given a segment register, return the encoding of the segment override
+ /// prefix for it.
+ inline EncodingOfSegmentOverridePrefix
+ getSegmentOverridePrefixForReg(unsigned Reg) {
+ switch (Reg) {
+ default:
+ llvm_unreachable("Unknown segment register!");
+ case X86::CS:
+ return CS_Encoding;
+ case X86::DS:
+ return DS_Encoding;
+ case X86::ES:
+ return ES_Encoding;
+ case X86::FS:
+ return FS_Encoding;
+ case X86::GS:
+ return GS_Encoding;
+ case X86::SS:
+ return SS_Encoding;
+ }
+ }
+
} // end namespace X86;
/// X86II - This namespace holds all of the target specific flags that
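
As an aside to the new X86::getSegmentOverridePrefixForReg helper above, here is a minimal standalone C++ sketch (not part of the patch) that reproduces the same prefix table with stand-in register names instead of the X86:: enums; the encoding quoted in the comment is the usual FS-relative load.

#include <cassert>
#include <cstdint>

// Stand-in for the X86 segment registers; the real code uses X86::CS etc.
enum class Seg { CS, DS, ES, FS, GS, SS };

static uint8_t segmentOverridePrefix(Seg S) {
  switch (S) {
  case Seg::CS: return 0x2E;
  case Seg::DS: return 0x3E;
  case Seg::ES: return 0x26;
  case Seg::FS: return 0x64;
  case Seg::GS: return 0x65;
  case Seg::SS: return 0x36;
  }
  return 0;
}

int main() {
  // "movq %fs:0, %rax" encodes as 64 48 8b 04 25 00 00 00 00: the 0x64 FS
  // override prefix is emitted before the REX.W byte.
  assert(segmentOverridePrefix(Seg::FS) == 0x64);
  assert(segmentOverridePrefix(Seg::SS) == 0x36);
}
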
@@ -581,90 +614,107 @@ namespace X86II {
/// in the lower 4 bits of the opcode.
AddCCFrm = 9,
+ /// PrefixByte - This form is used for instructions that represent a prefix
+ /// byte like data16 or rep.
+ PrefixByte = 10,
+
/// MRM[0-7][rm] - These forms are used to represent instructions that use
/// a Mod/RM byte, and use the middle field to hold extended opcode
/// information. In the intel manual these are represented as /0, /1, ...
///
+ /// MRMr0 - Instructions operate on a register Reg/Opcode operand, not the r/m field.
+ MRMr0 = 21,
+
+ /// MRMSrcMemFSIB - Like MRMSrcMem, but forces use of the SIB field.
+ MRMSrcMemFSIB = 22,
+
+ /// MRMDestMemFSIB - Like MRMDestMem, but forces use of the SIB field.
+ MRMDestMemFSIB = 23,
+
/// MRMDestMem - This form is used for instructions that use the Mod/RM byte
/// to specify a destination, which in this case is memory.
///
- MRMDestMem = 32,
+ MRMDestMem = 24,
/// MRMSrcMem - This form is used for instructions that use the Mod/RM byte
/// to specify a source, which in this case is memory.
///
- MRMSrcMem = 33,
+ MRMSrcMem = 25,
/// MRMSrcMem4VOp3 - This form is used for instructions that encode
/// operand 3 with VEX.VVVV and load from memory.
///
- MRMSrcMem4VOp3 = 34,
+ MRMSrcMem4VOp3 = 26,
/// MRMSrcMemOp4 - This form is used for instructions that use the Mod/RM
/// byte to specify the fourth source, which in this case is memory.
///
- MRMSrcMemOp4 = 35,
+ MRMSrcMemOp4 = 27,
/// MRMSrcMemCC - This form is used for instructions that use the Mod/RM
/// byte to specify the operands and also encodes a condition code.
///
- MRMSrcMemCC = 36,
+ MRMSrcMemCC = 28,
/// MRMXmCC - This form is used for instructions that use the Mod/RM byte
/// to specify a memory source, but doesn't use the middle field, and also
/// encodes a condition code.
///
- MRMXmCC = 38,
+ MRMXmCC = 30,
/// MRMXm - This form is used for instructions that use the Mod/RM byte
/// to specify a memory source, but doesn't use the middle field.
///
- MRMXm = 39,
+ MRMXm = 31,
// Next, instructions that operate on a memory r/m operand...
- MRM0m = 40, MRM1m = 41, MRM2m = 42, MRM3m = 43, // Format /0 /1 /2 /3
- MRM4m = 44, MRM5m = 45, MRM6m = 46, MRM7m = 47, // Format /4 /5 /6 /7
+ MRM0m = 32, MRM1m = 33, MRM2m = 34, MRM3m = 35, // Format /0 /1 /2 /3
+ MRM4m = 36, MRM5m = 37, MRM6m = 38, MRM7m = 39, // Format /4 /5 /6 /7
/// MRMDestReg - This form is used for instructions that use the Mod/RM byte
/// to specify a destination, which in this case is a register.
///
- MRMDestReg = 48,
+ MRMDestReg = 40,
/// MRMSrcReg - This form is used for instructions that use the Mod/RM byte
/// to specify a source, which in this case is a register.
///
- MRMSrcReg = 49,
+ MRMSrcReg = 41,
/// MRMSrcReg4VOp3 - This form is used for instructions that encode
/// operand 3 with VEX.VVVV and do not load from memory.
///
- MRMSrcReg4VOp3 = 50,
+ MRMSrcReg4VOp3 = 42,
/// MRMSrcRegOp4 - This form is used for instructions that use the Mod/RM
/// byte to specify the fourth source, which in this case is a register.
///
- MRMSrcRegOp4 = 51,
+ MRMSrcRegOp4 = 43,
/// MRMSrcRegCC - This form is used for instructions that use the Mod/RM
/// byte to specify the operands and also encodes a condition code
///
- MRMSrcRegCC = 52,
+ MRMSrcRegCC = 44,
/// MRMXrCC - This form is used for instructions that use the Mod/RM byte
/// to specify a register source, but doesn't use the middle field, and also
/// encodes a condition code.
///
- MRMXrCC = 54,
+ MRMXrCC = 46,
/// MRMXr - This form is used for instructions that use the Mod/RM byte
/// to specify a register source, but doesn't use the middle field.
///
- MRMXr = 55,
+ MRMXr = 47,
// Instructions that operate on a register r/m operand...
- MRM0r = 56, MRM1r = 57, MRM2r = 58, MRM3r = 59, // Format /0 /1 /2 /3
- MRM4r = 60, MRM5r = 61, MRM6r = 62, MRM7r = 63, // Format /4 /5 /6 /7
+ MRM0r = 48, MRM1r = 49, MRM2r = 50, MRM3r = 51, // Format /0 /1 /2 /3
+ MRM4r = 52, MRM5r = 53, MRM6r = 54, MRM7r = 55, // Format /4 /5 /6 /7
+
+ // Instructions that have mod=11 and an opcode but ignore r/m.
+ MRM0X = 56, MRM1X = 57, MRM2X = 58, MRM3X = 59, // Format /0 /1 /2 /3
+ MRM4X = 60, MRM5X = 61, MRM6X = 62, MRM7X = 63, // Format /4 /5 /6 /7
/// MRM_XX - A mod/rm byte of exactly 0xXX.
MRM_C0 = 64, MRM_C1 = 65, MRM_C2 = 66, MRM_C3 = 67,
@@ -900,6 +950,16 @@ namespace X86II {
NOTRACK = 1ULL << NoTrackShift
};
+ /// \returns true if the instruction with the given TSFlags is a prefix.
+ inline bool isPrefix(uint64_t TSFlags) {
+ return (TSFlags & X86II::FormMask) == PrefixByte;
+ }
+
+ /// \returns true if the instruction with the given TSFlags is a pseudo.
+ inline bool isPseudo(uint64_t TSFlags) {
+ return (TSFlags & X86II::FormMask) == Pseudo;
+ }
+
/// \returns the "base" X86 opcode for the specified machine
/// instruction.
inline uint8_t getBaseOpcodeFor(uint64_t TSFlags) {
@@ -1028,10 +1088,13 @@ namespace X86II {
case X86II::RawFrmDst:
case X86II::RawFrmDstSrc:
case X86II::AddCCFrm:
+ case X86II::PrefixByte:
return -1;
case X86II::MRMDestMem:
+ case X86II::MRMDestMemFSIB:
return 0;
case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemFSIB:
// Start from 1, skip any registers encoded in VEX_VVVV or I8IMM, or a
// mask register.
return 1 + HasVEX_4V + HasEVEX_K;
@@ -1051,12 +1114,18 @@ namespace X86II {
case X86II::MRMSrcRegOp4:
case X86II::MRMSrcRegCC:
case X86II::MRMXrCC:
+ case X86II::MRMr0:
case X86II::MRMXr:
case X86II::MRM0r: case X86II::MRM1r:
case X86II::MRM2r: case X86II::MRM3r:
case X86II::MRM4r: case X86II::MRM5r:
case X86II::MRM6r: case X86II::MRM7r:
return -1;
+ case X86II::MRM0X: case X86II::MRM1X:
+ case X86II::MRM2X: case X86II::MRM3X:
+ case X86II::MRM4X: case X86II::MRM5X:
+ case X86II::MRM6X: case X86II::MRM7X:
+ return -1;
case X86II::MRMXmCC:
case X86II::MRMXm:
case X86II::MRM0m: case X86II::MRM1m:
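
A small standalone sketch (not from the patch) of the TSFlags form test that the new X86II::isPrefix and X86II::isPseudo helpers rely on; FormMask and the form values below are stand-ins with the same shape as the real enum, where the form occupies the low bits of TSFlags.

#include <cassert>
#include <cstdint>

namespace demo {
constexpr uint64_t FormMask = 127;   // form lives in the low bits of TSFlags
constexpr uint64_t Pseudo = 0;
constexpr uint64_t PrefixByte = 10;  // matches the value added above

inline bool isPrefix(uint64_t TSFlags) {
  return (TSFlags & FormMask) == PrefixByte;
}
inline bool isPseudo(uint64_t TSFlags) {
  return (TSFlags & FormMask) == Pseudo;
}
} // namespace demo

int main() {
  uint64_t TSFlags = demo::PrefixByte | (1ULL << 32); // form plus other flag bits
  assert(demo::isPrefix(TSFlags) && !demo::isPseudo(TSFlags));
}
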
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
index bd009da60851..292dd17e2f51 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ELFObjectWriter.cpp
@@ -317,8 +317,10 @@ static unsigned getRelocType32(MCContext &Ctx,
unsigned X86ELFObjectWriter::getRelocType(MCContext &Ctx, const MCValue &Target,
const MCFixup &Fixup,
bool IsPCRel) const {
- MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
MCFixupKind Kind = Fixup.getKind();
+ if (Kind >= FirstLiteralRelocationKind)
+ return Kind - FirstLiteralRelocationKind;
+ MCSymbolRefExpr::VariantKind Modifier = Target.getAccessVariant();
X86_64RelType Type = getType64(Kind, Modifier, IsPCRel);
if (getEMachine() == ELF::EM_X86_64)
return getRelocType64(Ctx, Fixup.getLoc(), Modifier, Type, IsPCRel, Kind);
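
The early return added here is the consumer side of literal relocation kinds (as used for .reloc directives): the fixup kind carries FirstLiteralRelocationKind plus the raw ELF type, and getRelocType only has to subtract the offset. A standalone sketch of that round trip, with a stand-in value for FirstLiteralRelocationKind:

#include <cassert>

constexpr unsigned FirstLiteralRelocationKind = 0x100000; // stand-in value
constexpr unsigned R_X86_64_PC32 = 2;                     // raw ELF reloc type

int main() {
  // A literal relocation request stores the raw type offset by the marker...
  unsigned Kind = FirstLiteralRelocationKind + R_X86_64_PC32;
  // ...and the writer recovers it exactly as the new code does.
  if (Kind >= FirstLiteralRelocationKind)
    Kind -= FirstLiteralRelocationKind;
  assert(Kind == R_X86_64_PC32);
}
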
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
index 73b1969b4e82..b51011e2c52f 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstComments.cpp
@@ -15,7 +15,7 @@
#include "X86ATTInstPrinter.h"
#include "X86BaseInfo.h"
#include "X86MCTargetDesc.h"
-#include "Utils/X86ShuffleDecode.h"
+#include "X86ShuffleDecode.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrInfo.h"
#include "llvm/Support/raw_ostream.h"
@@ -199,6 +199,40 @@ using namespace llvm;
CASE_AVX512_INS_COMMON(Inst##SD, Z, m_Int) \
CASE_AVX512_INS_COMMON(Inst##SS, Z, m_Int)
+#define CASE_FMA4(Inst, suf) \
+ CASE_AVX_INS_COMMON(Inst, 4, suf) \
+ CASE_AVX_INS_COMMON(Inst, 4Y, suf)
+
+#define CASE_FMA4_PACKED_RR(Inst) \
+ CASE_FMA4(Inst##PD, rr) \
+ CASE_FMA4(Inst##PS, rr)
+
+#define CASE_FMA4_PACKED_RM(Inst) \
+ CASE_FMA4(Inst##PD, rm) \
+ CASE_FMA4(Inst##PS, rm)
+
+#define CASE_FMA4_PACKED_MR(Inst) \
+ CASE_FMA4(Inst##PD, mr) \
+ CASE_FMA4(Inst##PS, mr)
+
+#define CASE_FMA4_SCALAR_RR(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rr) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rr) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rr_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rr_Int)
+
+#define CASE_FMA4_SCALAR_RM(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rm) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rm) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , rm_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , rm_Int)
+
+#define CASE_FMA4_SCALAR_MR(Inst) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , mr) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , mr) \
+ CASE_AVX_INS_COMMON(Inst##SD4, , mr_Int) \
+ CASE_AVX_INS_COMMON(Inst##SS4, , mr_Int)
+
static unsigned getVectorRegSize(unsigned RegNo) {
if (X86::ZMM0 <= RegNo && RegNo <= X86::ZMM31)
return 512;
@@ -247,14 +281,15 @@ static void printMasking(raw_ostream &OS, const MCInst *MI,
OS << " {z}";
}
-static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
+static bool printFMAComments(const MCInst *MI, raw_ostream &OS,
+ const MCInstrInfo &MCII) {
const char *Mul1Name = nullptr, *Mul2Name = nullptr, *AccName = nullptr;
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
bool Negate = false;
StringRef AccStr = "+";
- // The operands for FMA instructions without rounding fall into two forms.
+ // The operands for FMA3 instructions without rounding fall into two forms:
// dest, src1, src2, src3
// dest, src1, mask, src2, src3
// Where src3 is either a register or 5 memory address operands. So to find
// dest and src1 we can index from the front; to find src2 and src3 we index
// from the end, taking into account memory vs register form when finding
// src2.
// finding src2.
+ // The operands for FMA4 instructions:
+ // dest, src1, src2, src3
+ // Where src2 OR src3 is either a register or 5 memory address operands. So
+ // to find dest and src1 we can index from the front, src2 (reg/mem) follows
+ // and then src3 (reg) will be at the end.
+
switch (MI->getOpcode()) {
default:
return false;
+
+ CASE_FMA4_PACKED_RR(FMADD)
+ CASE_FMA4_SCALAR_RR(FMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMADD)
+ CASE_FMA4_SCALAR_RM(FMADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+ CASE_FMA4_PACKED_MR(FMADD)
+ CASE_FMA4_SCALAR_MR(FMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ break;
+
+ CASE_FMA4_PACKED_RR(FMSUB)
+ CASE_FMA4_SCALAR_RR(FMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMSUB)
+ CASE_FMA4_SCALAR_RM(FMSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+ CASE_FMA4_PACKED_MR(FMSUB)
+ CASE_FMA4_SCALAR_MR(FMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ break;
+
+ CASE_FMA4_PACKED_RR(FNMADD)
+ CASE_FMA4_SCALAR_RR(FNMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FNMADD)
+ CASE_FMA4_SCALAR_RM(FNMADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+ CASE_FMA4_PACKED_MR(FNMADD)
+ CASE_FMA4_SCALAR_MR(FNMADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ Negate = true;
+ break;
+
+ CASE_FMA4_PACKED_RR(FNMSUB)
+ CASE_FMA4_SCALAR_RR(FNMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FNMSUB)
+ CASE_FMA4_SCALAR_RM(FNMSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+ CASE_FMA4_PACKED_MR(FNMSUB)
+ CASE_FMA4_SCALAR_MR(FNMSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-";
+ Negate = true;
+ break;
+
+ CASE_FMA4_PACKED_RR(FMADDSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMADDSUB)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+ CASE_FMA4_PACKED_MR(FMADDSUB)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "+/-";
+ break;
+
+ CASE_FMA4_PACKED_RR(FMSUBADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ LLVM_FALLTHROUGH;
+ CASE_FMA4_PACKED_RM(FMSUBADD)
+ Mul2Name = getRegName(MI->getOperand(2).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+ CASE_FMA4_PACKED_MR(FMSUBADD)
+ AccName = getRegName(MI->getOperand(NumOperands - 1).getReg());
+ Mul1Name = getRegName(MI->getOperand(1).getReg());
+ AccStr = "-/+";
+ break;
+
CASE_FMA_PACKED_REG(FMADD132)
CASE_FMA_SCALAR_REG(FMADD132)
Mul2Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
@@ -476,8 +614,9 @@ static bool printFMA3Comments(const MCInst *MI, raw_ostream &OS) {
if (!Mul2Name) Mul2Name = "mem";
if (!AccName) AccName = "mem";
- OS << DestName << " = ";
- // TODO: Print masking information?
+ OS << DestName;
+ printMasking(OS, MI, MCII);
+ OS << " = ";
if (Negate)
OS << '-';
@@ -504,7 +643,7 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
unsigned NumOperands = MI->getNumOperands();
bool RegForm = false;
- if (printFMA3Comments(MI, OS))
+ if (printFMAComments(MI, OS, MCII))
return true;
switch (MI->getOpcode()) {
@@ -669,14 +808,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSLLDQri:
case X86::VPSLLDQri:
case X86::VPSLLDQYri:
- case X86::VPSLLDQZ128rr:
- case X86::VPSLLDQZ256rr:
- case X86::VPSLLDQZrr:
+ case X86::VPSLLDQZ128ri:
+ case X86::VPSLLDQZ256ri:
+ case X86::VPSLLDQZri:
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
- case X86::VPSLLDQZ128rm:
- case X86::VPSLLDQZ256rm:
- case X86::VPSLLDQZrm:
+ case X86::VPSLLDQZ128mi:
+ case X86::VPSLLDQZ256mi:
+ case X86::VPSLLDQZmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSLLDQMask(getRegOperandNumElts(MI, 8, 0),
@@ -687,14 +826,14 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
case X86::PSRLDQri:
case X86::VPSRLDQri:
case X86::VPSRLDQYri:
- case X86::VPSRLDQZ128rr:
- case X86::VPSRLDQZ256rr:
- case X86::VPSRLDQZrr:
+ case X86::VPSRLDQZ128ri:
+ case X86::VPSRLDQZ256ri:
+ case X86::VPSRLDQZri:
Src1Name = getRegName(MI->getOperand(1).getReg());
LLVM_FALLTHROUGH;
- case X86::VPSRLDQZ128rm:
- case X86::VPSRLDQZ256rm:
- case X86::VPSRLDQZrm:
+ case X86::VPSRLDQZ128mi:
+ case X86::VPSRLDQZ256mi:
+ case X86::VPSRLDQZmi:
DestName = getRegName(MI->getOperand(0).getReg());
if (MI->getOperand(NumOperands - 1).isImm())
DecodePSRLDQMask(getRegOperandNumElts(MI, 8, 0),
@@ -1178,28 +1317,28 @@ bool llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS,
DecodeSubVectorBroadcast(16, 8, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, r)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rr)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, m)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z128, rm)
DecodeSubVectorBroadcast(4, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, r)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, r)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rr)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rr)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, m)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, m)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z256, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z256, rm)
DecodeSubVectorBroadcast(8, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, r)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, r)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rr)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rr)
Src1Name = getRegName(MI->getOperand(NumOperands - 1).getReg());
LLVM_FALLTHROUGH;
- CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, m)
- CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, m)
+ CASE_AVX512_INS_COMMON(BROADCASTF32X2, Z, rm)
+ CASE_AVX512_INS_COMMON(BROADCASTI32X2, Z, rm)
DecodeSubVectorBroadcast(16, 2, ShuffleMask);
DestName = getRegName(MI->getOperand(0).getReg());
break;
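
For the printFMAComments change above, a standalone sketch (not LLVM API) of how the rewritten comment header is assembled: destination, the new {k}/{z} masking suffix, then "=" and an optional leading '-' for the negated forms; the multiply/accumulate tail is printed by code outside this hunk and is only hinted at here.

#include <iostream>
#include <string>

int main() {
  std::string DestName = "xmm0";
  std::string Masking = " {%k1}"; // what printMasking() would append
  bool Negate = true;             // e.g. an FNMADD form
  std::cout << DestName << Masking << " = ";
  if (Negate)
    std::cout << '-';
  std::cout << "...\n";           // mul1 * mul2, AccStr, acc per the later code
}
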
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
index a21555076976..33d70fdb1214 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.cpp
@@ -13,6 +13,7 @@
#include "X86InstPrinterCommon.h"
#include "X86BaseInfo.h"
+#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCExpr.h"
#include "llvm/MC/MCInst.h"
#include "llvm/MC/MCInstrDesc.h"
@@ -287,16 +288,23 @@ void X86InstPrinterCommon::printRoundingControl(const MCInst *MI, unsigned Op,
}
}
-/// printPCRelImm - This is used to print an immediate value that ends up
-/// being encoded as a pc-relative value (e.g. for jumps and calls). In
-/// Intel-style these print slightly differently than normal immediates.
-/// for example, a $ is not emitted.
-void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, unsigned OpNo,
- raw_ostream &O) {
+/// This is used to print an immediate value that ends up being encoded as a
+/// pc-relative value (e.g. for jumps and calls). In Intel-style these print slightly
+/// differently than normal immediates. For example, a $ is not emitted.
+///
+/// \p Address The address of the next instruction.
+/// \see MCInstPrinter::printInst
+void X86InstPrinterCommon::printPCRelImm(const MCInst *MI, uint64_t Address,
+ unsigned OpNo, raw_ostream &O) {
const MCOperand &Op = MI->getOperand(OpNo);
- if (Op.isImm())
- O << formatImm(Op.getImm());
- else {
+ if (Op.isImm()) {
+ if (PrintBranchImmAsAddress) {
+ uint64_t Target = Address + Op.getImm();
+ if (MAI.getCodePointerSize() == 4)
+ Target &= 0xffffffff;
+ O << formatHex(Target);
+ } else
+ O << formatImm(Op.getImm());
+ } else {
assert(Op.isExpr() && "unknown pcrel immediate operand");
// If a symbolic branch target was added as a constant expression then print
// that address in hex.
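
A standalone sketch of the branch-target arithmetic introduced here, assuming the documented meaning of Address as the address of the next instruction; the 32-bit truncation mirrors the MAI.getCodePointerSize() == 4 case.

#include <cassert>
#include <cstdint>

static uint64_t branchTarget(uint64_t NextInstAddr, int64_t Imm,
                             unsigned CodePointerSize) {
  uint64_t Target = NextInstAddr + Imm;
  if (CodePointerSize == 4)
    Target &= 0xffffffff;
  return Target;
}

int main() {
  assert(branchTarget(0x401005, 0x20, 8) == 0x401025);
  assert(branchTarget(0x10, -0x20, 4) == 0xfffffff0); // wraps to 32 bits
}
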
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
index 8e28f24b619a..bb12ede3b729 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86InstPrinterCommon.h
@@ -29,7 +29,9 @@ public:
void printVPCMPMnemonic(const MCInst *MI, raw_ostream &OS);
void printCMPMnemonic(const MCInst *MI, bool IsVCmp, raw_ostream &OS);
void printRoundingControl(const MCInst *MI, unsigned Op, raw_ostream &O);
- void printPCRelImm(const MCInst *MI, unsigned OpNo, raw_ostream &O);
+ void printPCRelImm(const MCInst *MI, uint64_t Address, unsigned OpNo,
+ raw_ostream &O);
+
protected:
void printInstFlags(const MCInst *MI, raw_ostream &O);
void printOptionalSegReg(const MCInst *MI, unsigned OpNo, raw_ostream &O);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
index f4bb0fbf62cd..d1eb4d09851d 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.cpp
@@ -45,8 +45,7 @@ void X86IntelInstPrinter::printInst(const MCInst *MI, uint64_t Address,
if (MI->getOpcode() == X86::DATA16_PREFIX &&
STI.getFeatureBits()[X86::Mode16Bit]) {
OS << "\tdata32";
- } else if (!printAliasInstr(MI, OS) &&
- !printVecCompareInstr(MI, OS))
+ } else if (!printAliasInstr(MI, Address, OS) && !printVecCompareInstr(MI, OS))
printInstruction(MI, Address, OS);
// Next always print the annotation.
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
index b409b20cbea8..82baf611df03 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86IntelInstPrinter.h
@@ -31,9 +31,10 @@ public:
// Autogenerated by tblgen, returns true if we successfully printed an
// alias.
- bool printAliasInstr(const MCInst *MI, raw_ostream &OS);
- void printCustomAliasOperand(const MCInst *MI, unsigned OpIdx,
- unsigned PrintMethodIdx, raw_ostream &O);
+ bool printAliasInstr(const MCInst *MI, uint64_t Address, raw_ostream &OS);
+ void printCustomAliasOperand(const MCInst *MI, uint64_t Address,
+ unsigned OpIdx, unsigned PrintMethodIdx,
+ raw_ostream &O);
// Autogenerated by tblgen.
void printInstruction(const MCInst *MI, uint64_t Address, raw_ostream &O);
@@ -47,14 +48,6 @@ public:
void printU8Imm(const MCInst *MI, unsigned Op, raw_ostream &O);
void printSTiRegOperand(const MCInst *MI, unsigned OpNo, raw_ostream &OS);
- void printanymem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
-
- void printopaquemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
- printMemReference(MI, OpNo, O);
- }
-
void printbytemem(const MCInst *MI, unsigned OpNo, raw_ostream &O) {
O << "byte ptr ";
printMemReference(MI, OpNo, O);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
index d986c829d98e..c294da6baffa 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.cpp
@@ -71,8 +71,6 @@ X86MCAsmInfoDarwin::X86MCAsmInfoDarwin(const Triple &T) {
// (actually, must, since otherwise the non-extern relocations we produce
// overwhelm ld64's tiny little mind and it fails).
DwarfFDESymbolsUseAbsDiff = true;
-
- UseIntegratedAssembler = true;
}
X86_64MCAsmInfoDarwin::X86_64MCAsmInfoDarwin(const Triple &Triple)
@@ -102,10 +100,6 @@ X86ELFMCAsmInfo::X86ELFMCAsmInfo(const Triple &T) {
// Exceptions handling
ExceptionsType = ExceptionHandling::DwarfCFI;
-
- // Always enable the integrated assembler by default.
- // Clang also enabled it when the OS is Solaris but that is redundant here.
- UseIntegratedAssembler = true;
}
const MCExpr *
@@ -141,8 +135,16 @@ X86MCAsmInfoMicrosoft::X86MCAsmInfoMicrosoft(const Triple &Triple) {
TextAlignFillValue = 0x90;
AllowAtInName = true;
+}
- UseIntegratedAssembler = true;
+void X86MCAsmInfoMicrosoftMASM::anchor() { }
+
+X86MCAsmInfoMicrosoftMASM::X86MCAsmInfoMicrosoftMASM(const Triple &Triple)
+ : X86MCAsmInfoMicrosoft(Triple) {
+ DollarIsPC = true;
+ SeparatorString = "\n";
+ CommentString = ";";
+ AllowSymbolAtNameStart = true;
}
void X86MCAsmInfoGNUCOFF::anchor() { }
@@ -164,6 +166,4 @@ X86MCAsmInfoGNUCOFF::X86MCAsmInfoGNUCOFF(const Triple &Triple) {
TextAlignFillValue = 0x90;
AllowAtInName = true;
-
- UseIntegratedAssembler = true;
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
index b2369647a40f..ce8e84fb96b9 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCAsmInfo.h
@@ -13,7 +13,6 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCASMINFO_H
-#include "llvm/MC/MCAsmInfo.h"
#include "llvm/MC/MCAsmInfoCOFF.h"
#include "llvm/MC/MCAsmInfoDarwin.h"
#include "llvm/MC/MCAsmInfoELF.h"
@@ -49,6 +48,13 @@ public:
explicit X86MCAsmInfoMicrosoft(const Triple &Triple);
};
+class X86MCAsmInfoMicrosoftMASM : public X86MCAsmInfoMicrosoft {
+ void anchor() override;
+
+public:
+ explicit X86MCAsmInfoMicrosoftMASM(const Triple &Triple);
+};
+
class X86MCAsmInfoGNUCOFF : public MCAsmInfoGNUCOFF {
void anchor() override;
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
index 54a293702bd0..7dea0760a831 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCCodeEmitter.cpp
@@ -55,83 +55,64 @@ public:
const MCSubtargetInfo &STI) const override;
private:
- unsigned getX86RegNum(const MCOperand &MO) const {
- return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
- }
+ unsigned getX86RegNum(const MCOperand &MO) const;
- unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const {
- return Ctx.getRegisterInfo()->getEncodingValue(
- MI.getOperand(OpNum).getReg());
- }
+ unsigned getX86RegEncoding(const MCInst &MI, unsigned OpNum) const;
/// \param MI a single low-level machine instruction.
/// \param OpNum the operand #.
/// \returns true if the OpNumth operand of MI requires a bit to be set in
/// the REX prefix.
- bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const {
- return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
- }
-
- void emitByte(uint8_t C, unsigned &CurByte, raw_ostream &OS) const {
- OS << (char)C;
- ++CurByte;
- }
-
- void emitConstant(uint64_t Val, unsigned Size, unsigned &CurByte,
- raw_ostream &OS) const {
- // Output the constant in little endian byte order.
- for (unsigned i = 0; i != Size; ++i) {
- emitByte(Val & 255, CurByte, OS);
- Val >>= 8;
- }
- }
+ bool isREXExtendedReg(const MCInst &MI, unsigned OpNum) const;
void emitImmediate(const MCOperand &Disp, SMLoc Loc, unsigned ImmSize,
- MCFixupKind FixupKind, unsigned &CurByte, raw_ostream &OS,
+ MCFixupKind FixupKind, uint64_t StartByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups, int ImmOffset = 0) const;
- static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
- assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
- return RM | (RegOpcode << 3) | (Mod << 6);
- }
-
void emitRegModRMByte(const MCOperand &ModRMReg, unsigned RegOpcodeFld,
- unsigned &CurByte, raw_ostream &OS) const {
- emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), CurByte, OS);
- }
+ raw_ostream &OS) const;
void emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
- unsigned &CurByte, raw_ostream &OS) const {
- // SIB byte is in the same format as the modRMByte.
- emitByte(modRMByte(SS, Index, Base), CurByte, OS);
- }
+ raw_ostream &OS) const;
void emitMemModRMByte(const MCInst &MI, unsigned Op, unsigned RegOpcodeField,
- uint64_t TSFlags, bool Rex, unsigned &CurByte,
+ uint64_t TSFlags, bool HasREX, uint64_t StartByte,
raw_ostream &OS, SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const;
+ const MCSubtargetInfo &STI,
+ bool ForceSIB = false) const;
- void emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp, unsigned &CurByte,
- bool &Rex, const MCInst &MI, const MCInstrDesc &Desc,
- const MCSubtargetInfo &STI, raw_ostream &OS) const;
+ bool emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
+ const MCSubtargetInfo &STI, raw_ostream &OS) const;
- void emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
- const MCInst &MI, const MCInstrDesc &Desc,
+ void emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
raw_ostream &OS) const;
- void emitSegmentOverridePrefix(unsigned &CurByte, unsigned SegOperand,
- const MCInst &MI, raw_ostream &OS) const;
+ void emitSegmentOverridePrefix(unsigned SegOperand, const MCInst &MI,
+ raw_ostream &OS) const;
- bool emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte, int MemOperand,
- const MCInst &MI, const MCInstrDesc &Desc,
+ bool emitOpcodePrefix(int MemOperand, const MCInst &MI,
const MCSubtargetInfo &STI, raw_ostream &OS) const;
- uint8_t determineREXPrefix(const MCInst &MI, uint64_t TSFlags, int MemOperand,
- const MCInstrDesc &Desc) const;
+ bool emitREXPrefix(int MemOperand, const MCInst &MI, raw_ostream &OS) const;
};
} // end anonymous namespace
+static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
+ assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
+ return RM | (RegOpcode << 3) | (Mod << 6);
+}
+
+static void emitByte(uint8_t C, raw_ostream &OS) { OS << static_cast<char>(C); }
+
+static void emitConstant(uint64_t Val, unsigned Size, raw_ostream &OS) {
+ // Output the constant in little endian byte order.
+ for (unsigned i = 0; i != Size; ++i) {
+ emitByte(Val & 255, OS);
+ Val >>= 8;
+ }
+}
+
/// \returns true if this signed displacement fits in an 8-bit sign-extended
/// field.
static bool isDisp8(int Value) { return Value == (int8_t)Value; }
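
A standalone check (not part of the patch) of the two helpers hoisted to file scope above: modRMByte packs mod/reg/rm as rm | reg<<3 | mod<<6, and emitConstant writes bytes in little-endian order; std::ostream stands in for raw_ostream.

#include <cassert>
#include <cstdint>
#include <ostream>
#include <sstream>
#include <string>

static uint8_t modRMByte(unsigned Mod, unsigned RegOpcode, unsigned RM) {
  assert(Mod < 4 && RegOpcode < 8 && RM < 8 && "ModRM Fields out of range!");
  return RM | (RegOpcode << 3) | (Mod << 6);
}

static void emitConstant(uint64_t Val, unsigned Size, std::ostream &OS) {
  // Output the constant in little-endian byte order.
  for (unsigned i = 0; i != Size; ++i) {
    OS << static_cast<char>(Val & 255);
    Val >>= 8;
  }
}

int main() {
  // mod=3 (register form), reg field /2, rm=1 -> 0b11'010'001 = 0xD1.
  assert(modRMByte(3, 2, 1) == 0xD1);

  std::ostringstream OS;
  emitConstant(0x12345678, 4, OS);
  assert(OS.str() == std::string("\x78\x56\x34\x12", 4)); // little endian
}
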
@@ -275,7 +256,8 @@ static bool hasSecRelSymbolRef(const MCExpr *Expr) {
static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) {
unsigned Opcode = MI.getOpcode();
const MCInstrDesc &Desc = MCII.get(Opcode);
- if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4) ||
+ if ((Opcode != X86::CALL64pcrel32 && Opcode != X86::JMP_4 &&
+ Opcode != X86::JCC_4) ||
getImmFixupKind(Desc.TSFlags) != FK_PCRel_4)
return false;
@@ -288,9 +270,27 @@ static bool isPCRel32Branch(const MCInst &MI, const MCInstrInfo &MCII) {
return Ref && Ref->getKind() == MCSymbolRefExpr::VK_None;
}
+unsigned X86MCCodeEmitter::getX86RegNum(const MCOperand &MO) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MO.getReg()) & 0x7;
+}
+
+unsigned X86MCCodeEmitter::getX86RegEncoding(const MCInst &MI,
+ unsigned OpNum) const {
+ return Ctx.getRegisterInfo()->getEncodingValue(MI.getOperand(OpNum).getReg());
+}
+
+/// \param MI a single low-level machine instruction.
+/// \param OpNum the operand #.
+/// \returns true if the OpNumth operand of MI requires a bit to be set in
+/// the REX prefix.
+bool X86MCCodeEmitter::isREXExtendedReg(const MCInst &MI,
+ unsigned OpNum) const {
+ return (getX86RegEncoding(MI, OpNum) >> 3) & 1;
+}
+
void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
unsigned Size, MCFixupKind FixupKind,
- unsigned &CurByte, raw_ostream &OS,
+ uint64_t StartByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
int ImmOffset) const {
const MCExpr *Expr = nullptr;
@@ -299,7 +299,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
// relocation, emit it now.
if (FixupKind != FK_PCRel_1 && FixupKind != FK_PCRel_2 &&
FixupKind != FK_PCRel_4) {
- emitConstant(DispOp.getImm() + ImmOffset, Size, CurByte, OS);
+ emitConstant(DispOp.getImm() + ImmOffset, Size, OS);
return;
}
Expr = MCConstantExpr::create(DispOp.getImm(), Ctx);
@@ -322,7 +322,7 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
}
if (Kind == GOT_Normal)
- ImmOffset = CurByte;
+ ImmOffset = static_cast<int>(OS.tell() - StartByte);
} else if (Expr->getKind() == MCExpr::SymbolRef) {
if (hasSecRelSymbolRef(Expr)) {
FixupKind = MCFixupKind(FK_SecRel_4);
@@ -361,16 +361,30 @@ void X86MCCodeEmitter::emitImmediate(const MCOperand &DispOp, SMLoc Loc,
Ctx);
// Emit a symbolic constant as a fixup and 4 zeros.
- Fixups.push_back(MCFixup::create(CurByte, Expr, FixupKind, Loc));
- emitConstant(0, Size, CurByte, OS);
+ Fixups.push_back(MCFixup::create(static_cast<uint32_t>(OS.tell() - StartByte),
+ Expr, FixupKind, Loc));
+ emitConstant(0, Size, OS);
+}
+
+void X86MCCodeEmitter::emitRegModRMByte(const MCOperand &ModRMReg,
+ unsigned RegOpcodeFld,
+ raw_ostream &OS) const {
+ emitByte(modRMByte(3, RegOpcodeFld, getX86RegNum(ModRMReg)), OS);
+}
+
+void X86MCCodeEmitter::emitSIBByte(unsigned SS, unsigned Index, unsigned Base,
+ raw_ostream &OS) const {
+ // SIB byte is in the same format as the modRMByte.
+ emitByte(modRMByte(SS, Index, Base), OS);
}
void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
unsigned RegOpcodeField,
- uint64_t TSFlags, bool Rex,
- unsigned &CurByte, raw_ostream &OS,
+ uint64_t TSFlags, bool HasREX,
+ uint64_t StartByte, raw_ostream &OS,
SmallVectorImpl<MCFixup> &Fixups,
- const MCSubtargetInfo &STI) const {
+ const MCSubtargetInfo &STI,
+ bool ForceSIB) const {
const MCOperand &Disp = MI.getOperand(Op + X86::AddrDisp);
const MCOperand &Base = MI.getOperand(Op + X86::AddrBaseReg);
const MCOperand &Scale = MI.getOperand(Op + X86::AddrScaleAmt);
@@ -383,8 +397,9 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
BaseReg == X86::EIP) { // [disp32+rIP] in X86-64 mode
assert(STI.hasFeature(X86::Mode64Bit) &&
"Rip-relative addressing requires 64-bit mode");
- assert(IndexReg.getReg() == 0 && "Invalid rip-relative address");
- emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS);
+ assert(IndexReg.getReg() == 0 && !ForceSIB &&
+ "Invalid rip-relative address");
+ emitByte(modRMByte(0, RegOpcodeField, 5), OS);
unsigned Opcode = MI.getOpcode();
// movq loads are handled with a special relocation form which allows the
@@ -395,7 +410,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
default:
return X86::reloc_riprel_4byte;
case X86::MOV64rm:
- assert(Rex);
+ assert(HasREX);
return X86::reloc_riprel_4byte_movq_load;
case X86::CALL64m:
case X86::JMP64m:
@@ -409,8 +424,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
case X86::SBB64rm:
case X86::SUB64rm:
case X86::XOR64rm:
- return Rex ? X86::reloc_riprel_4byte_relax_rex
- : X86::reloc_riprel_4byte_relax;
+ return HasREX ? X86::reloc_riprel_4byte_relax_rex
+ : X86::reloc_riprel_4byte_relax;
}
}();
@@ -425,7 +440,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
? X86II::getSizeOfImm(TSFlags)
: 0;
- emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS,
Fixups, -ImmSize);
return;
}
@@ -472,23 +487,23 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
if (Disp.isImm() && isDisp8(Disp.getImm())) {
if (Disp.getImm() == 0 && RMfield != 6) {
// There is no displacement; just the register.
- emitByte(modRMByte(0, RegOpcodeField, RMfield), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, RMfield), OS);
return;
}
// Use the [REG]+disp8 form, including for [BP] which cannot be encoded.
- emitByte(modRMByte(1, RegOpcodeField, RMfield), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ emitByte(modRMByte(1, RegOpcodeField, RMfield), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups);
return;
}
// This is the [REG]+disp16 case.
- emitByte(modRMByte(2, RegOpcodeField, RMfield), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, RMfield), OS);
} else {
// There is no BaseReg; this is the plain [disp16] case.
- emitByte(modRMByte(0, RegOpcodeField, 6), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 6), OS);
}
// Emit 16-bit displacement for plain disp16 or [REG]+disp16 cases.
- emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, CurByte, OS, Fixups);
+ emitImmediate(Disp, MI.getLoc(), 2, FK_Data_2, StartByte, OS, Fixups);
return;
}
@@ -498,7 +513,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// 2-7) and absolute references.
if ( // The SIB byte must be used if there is an index register.
- IndexReg.getReg() == 0 &&
+ !ForceSIB && IndexReg.getReg() == 0 &&
// The SIB byte must be used if the base is ESP/RSP/R12, all of which
// encode to an R/M value of 4, which indicates that a SIB byte is
// present.
@@ -508,8 +523,8 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
(!STI.hasFeature(X86::Mode64Bit) || BaseReg != 0)) {
if (BaseReg == 0) { // [disp32] in X86-32 mode
- emitByte(modRMByte(0, RegOpcodeField, 5), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, CurByte, OS, Fixups);
+ emitByte(modRMByte(0, RegOpcodeField, 5), OS);
+ emitImmediate(Disp, MI.getLoc(), 4, FK_Data_4, StartByte, OS, Fixups);
return;
}
@@ -519,7 +534,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// by emitting a displacement of 0 below.
if (BaseRegNo != N86::EBP) {
if (Disp.isImm() && Disp.getImm() == 0) {
- emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
return;
}
@@ -530,7 +545,7 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// This is exclusively used by call *a@tlscall(base). The relocation
// (R_386_TLSCALL or R_X86_64_TLSCALL) applies to the beginning.
Fixups.push_back(MCFixup::create(0, Sym, FK_NONE, MI.getLoc()));
- emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, BaseRegNo), OS);
return;
}
}
@@ -539,27 +554,27 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
// Otherwise, if the displacement fits in a byte, encode as [REG+disp8].
if (Disp.isImm()) {
if (!HasEVEX && isDisp8(Disp.getImm())) {
- emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups);
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups);
return;
}
// Try EVEX compressed 8-bit displacement first; if failed, fall back to
// 32-bit displacement.
int CDisp8 = 0;
if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
- emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), CurByte, OS);
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ emitByte(modRMByte(1, RegOpcodeField, BaseRegNo), OS);
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
CDisp8 - Disp.getImm());
return;
}
}
// Otherwise, emit the most general non-SIB encoding: [REG+disp32]
- emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, BaseRegNo), OS);
unsigned Opcode = MI.getOpcode();
unsigned FixupKind = Opcode == X86::MOV32rm ? X86::reloc_signed_4byte_relax
: X86::reloc_signed_4byte;
- emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), CurByte, OS,
+ emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(FixupKind), StartByte, OS,
Fixups);
return;
}
@@ -575,30 +590,30 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
if (BaseReg == 0) {
// If there is no base register, we emit the special case SIB byte with
// MOD=0, BASE=5, to JUST get the index, scale, and displacement.
- emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 4), OS);
ForceDisp32 = true;
} else if (!Disp.isImm()) {
// Emit the normal disp32 encoding.
- emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, 4), OS);
ForceDisp32 = true;
} else if (Disp.getImm() == 0 &&
// Base reg can't be anything that ends up with '5' as the base
// reg, it is the magic [*] nomenclature that indicates no base.
BaseRegNo != N86::EBP) {
// Emit no displacement ModR/M byte
- emitByte(modRMByte(0, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(0, RegOpcodeField, 4), OS);
} else if (!HasEVEX && isDisp8(Disp.getImm())) {
// Emit the disp8 encoding.
- emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(1, RegOpcodeField, 4), OS);
ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
} else if (HasEVEX && isCDisp8(TSFlags, Disp.getImm(), CDisp8)) {
// Emit the disp8 encoding.
- emitByte(modRMByte(1, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(1, RegOpcodeField, 4), OS);
ForceDisp8 = true; // Make sure to force 8 bit disp if Base=EBP
ImmOffset = CDisp8 - Disp.getImm();
} else {
// Emit the normal disp32 encoding.
- emitByte(modRMByte(2, RegOpcodeField, 4), CurByte, OS);
+ emitByte(modRMByte(2, RegOpcodeField, 4), OS);
}
// Calculate what the SS field value should be...
@@ -613,77 +628,78 @@ void X86MCCodeEmitter::emitMemModRMByte(const MCInst &MI, unsigned Op,
IndexRegNo = getX86RegNum(IndexReg);
else // Examples: [ESP+1*<noreg>+4] or [scaled idx]+disp32 (MOD=0,BASE=5)
IndexRegNo = 4;
- emitSIBByte(SS, IndexRegNo, 5, CurByte, OS);
+ emitSIBByte(SS, IndexRegNo, 5, OS);
} else {
unsigned IndexRegNo;
if (IndexReg.getReg())
IndexRegNo = getX86RegNum(IndexReg);
else
IndexRegNo = 4; // For example [ESP+1*<noreg>+4]
- emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), CurByte, OS);
+ emitSIBByte(SS, IndexRegNo, getX86RegNum(Base), OS);
}
// Do we need to output a displacement?
if (ForceDisp8)
- emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, CurByte, OS, Fixups,
+ emitImmediate(Disp, MI.getLoc(), 1, FK_Data_1, StartByte, OS, Fixups,
ImmOffset);
else if (ForceDisp32 || Disp.getImm() != 0)
emitImmediate(Disp, MI.getLoc(), 4, MCFixupKind(X86::reloc_signed_4byte),
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
}
-void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
- unsigned &CurByte, bool &Rex,
- const MCInst &MI, const MCInstrDesc &Desc,
- const MCSubtargetInfo &STI,
- raw_ostream &OS) const {
+/// Emit all instruction prefixes.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitPrefixImpl(unsigned &CurOp, const MCInst &MI,
+ const MCSubtargetInfo &STI,
+ raw_ostream &OS) const {
+ uint64_t TSFlags = MCII.get(MI.getOpcode()).TSFlags;
// Determine where the memory operand starts, if present.
int MemoryOperand = X86II::getMemoryOperandNo(TSFlags);
- if (MemoryOperand != -1)
- MemoryOperand += CurOp;
-
// Emit segment override opcode prefix as needed.
- if (MemoryOperand >= 0)
- emitSegmentOverridePrefix(CurByte, MemoryOperand + X86::AddrSegmentReg, MI,
- OS);
+ if (MemoryOperand != -1) {
+ MemoryOperand += CurOp;
+ emitSegmentOverridePrefix(MemoryOperand + X86::AddrSegmentReg, MI, OS);
+ }
// Emit the repeat opcode prefix as needed.
unsigned Flags = MI.getFlags();
if (TSFlags & X86II::REP || Flags & X86::IP_HAS_REPEAT)
- emitByte(0xF3, CurByte, OS);
+ emitByte(0xF3, OS);
if (Flags & X86::IP_HAS_REPEAT_NE)
- emitByte(0xF2, CurByte, OS);
+ emitByte(0xF2, OS);
// Emit the address size opcode prefix as needed.
- bool need_address_override;
+ bool NeedAddressOverride;
uint64_t AdSize = TSFlags & X86II::AdSizeMask;
if ((STI.hasFeature(X86::Mode16Bit) && AdSize == X86II::AdSize32) ||
(STI.hasFeature(X86::Mode32Bit) && AdSize == X86II::AdSize16) ||
(STI.hasFeature(X86::Mode64Bit) && AdSize == X86II::AdSize32)) {
- need_address_override = true;
+ NeedAddressOverride = true;
} else if (MemoryOperand < 0) {
- need_address_override = false;
+ NeedAddressOverride = false;
} else if (STI.hasFeature(X86::Mode64Bit)) {
assert(!is16BitMemOperand(MI, MemoryOperand, STI));
- need_address_override = is32BitMemOperand(MI, MemoryOperand);
+ NeedAddressOverride = is32BitMemOperand(MI, MemoryOperand);
} else if (STI.hasFeature(X86::Mode32Bit)) {
assert(!is64BitMemOperand(MI, MemoryOperand));
- need_address_override = is16BitMemOperand(MI, MemoryOperand, STI);
+ NeedAddressOverride = is16BitMemOperand(MI, MemoryOperand, STI);
} else {
assert(STI.hasFeature(X86::Mode16Bit));
assert(!is64BitMemOperand(MI, MemoryOperand));
- need_address_override = !is16BitMemOperand(MI, MemoryOperand, STI);
+ NeedAddressOverride = !is16BitMemOperand(MI, MemoryOperand, STI);
}
- if (need_address_override)
- emitByte(0x67, CurByte, OS);
+ if (NeedAddressOverride)
+ emitByte(0x67, OS);
// Encoding type for this instruction.
uint64_t Encoding = TSFlags & X86II::EncodingMask;
- if (Encoding == 0)
- Rex = emitOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, STI, OS);
+ bool HasREX = false;
+ if (Encoding)
+ emitVEXOpcodePrefix(MemoryOperand, MI, OS);
else
- emitVEXOpcodePrefix(TSFlags, CurByte, MemoryOperand, MI, Desc, OS);
+ HasREX = emitOpcodePrefix(MemoryOperand, MI, STI, OS);
uint64_t Form = TSFlags & X86II::FormMask;
switch (Form) {
@@ -697,11 +713,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
"SI and DI register sizes do not match");
// Emit segment override opcode prefix as needed (not for %ds).
if (MI.getOperand(2).getReg() != X86::DS)
- emitSegmentOverridePrefix(CurByte, 2, MI, OS);
+ emitSegmentOverridePrefix(2, MI, OS);
// Emit AdSize prefix as needed.
if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
(STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
- emitByte(0x67, CurByte, OS);
+ emitByte(0x67, OS);
CurOp += 3; // Consume operands.
break;
}
@@ -709,11 +725,11 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
unsigned siReg = MI.getOperand(0).getReg();
// Emit segment override opcode prefix as needed (not for %ds).
if (MI.getOperand(1).getReg() != X86::DS)
- emitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ emitSegmentOverridePrefix(1, MI, OS);
// Emit AdSize prefix as needed.
if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::ESI) ||
(STI.hasFeature(X86::Mode32Bit) && siReg == X86::SI))
- emitByte(0x67, CurByte, OS);
+ emitByte(0x67, OS);
CurOp += 2; // Consume operands.
break;
}
@@ -722,24 +738,26 @@ void X86MCCodeEmitter::emitPrefixImpl(uint64_t TSFlags, unsigned &CurOp,
// Emit AdSize prefix as needed.
if ((!STI.hasFeature(X86::Mode32Bit) && siReg == X86::EDI) ||
(STI.hasFeature(X86::Mode32Bit) && siReg == X86::DI))
- emitByte(0x67, CurByte, OS);
+ emitByte(0x67, OS);
++CurOp; // Consume operand.
break;
}
case X86II::RawFrmMemOffs: {
// Emit segment override opcode prefix as needed.
- emitSegmentOverridePrefix(CurByte, 1, MI, OS);
+ emitSegmentOverridePrefix(1, MI, OS);
break;
}
}
+
+ return HasREX;
}
-/// emitVEXOpcodePrefix - AVX instructions are encoded using a opcode prefix
-/// called VEX.
-void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
- int MemOperand, const MCInst &MI,
- const MCInstrDesc &Desc,
+/// AVX instructions are encoded using an opcode prefix called VEX.
+void X86MCCodeEmitter::emitVEXOpcodePrefix(int MemOperand, const MCInst &MI,
raw_ostream &OS) const {
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
assert(!(TSFlags & X86II::LOCK) && "Can't have LOCK VEX.");
uint64_t Encoding = TSFlags & X86II::EncodingMask;
@@ -868,8 +886,11 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
switch (TSFlags & X86II::FormMask) {
default:
llvm_unreachable("Unexpected form in emitVEXOpcodePrefix!");
+ case X86II::MRM_C0:
case X86II::RawFrm:
+ case X86II::PrefixByte:
break;
+ case X86II::MRMDestMemFSIB:
case X86II::MRMDestMem: {
// MRMDestMem instruction forms:
// MemAddr, src1(ModR/M)
@@ -900,6 +921,7 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EVEX_R2 = ~(RegEnc >> 4) & 1;
break;
}
+ case X86II::MRMSrcMemFSIB:
case X86II::MRMSrcMem: {
// MRMSrcMem instruction forms:
// src1(ModR/M), MemAddr
@@ -1081,6 +1103,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
EncodeRC = true;
break;
}
+ case X86II::MRMr0: {
+ // MRMr0 instruction forms:
+ // 11:rrr:000
+ // dst(ModR/M)
+ unsigned RegEnc = getX86RegEncoding(MI, CurOp++);
+ VEX_R = ~(RegEnc >> 3) & 1;
+ EVEX_R2 = ~(RegEnc >> 4) & 1;
+ break;
+ }
case X86II::MRM0r:
case X86II::MRM1r:
case X86II::MRM2r:
@@ -1127,15 +1158,15 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
// Can we use the 2 byte VEX prefix?
if (!(MI.getFlags() & X86::IP_USE_VEX3) && Encoding == X86II::VEX &&
VEX_B && VEX_X && !VEX_W && (VEX_5M == 1)) {
- emitByte(0xC5, CurByte, OS);
- emitByte(LastByte | (VEX_R << 7), CurByte, OS);
+ emitByte(0xC5, OS);
+ emitByte(LastByte | (VEX_R << 7), OS);
return;
}
// 3 byte VEX prefix
- emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, CurByte, OS);
- emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, CurByte, OS);
- emitByte(LastByte | (VEX_W << 7), CurByte, OS);
+ emitByte(Encoding == X86II::XOP ? 0x8F : 0xC4, OS);
+ emitByte(VEX_R << 7 | VEX_X << 6 | VEX_B << 5 | VEX_5M, OS);
+ emitByte(LastByte | (VEX_W << 7), OS);
} else {
assert(Encoding == X86II::EVEX && "unknown encoding!");
// EVEX opcode prefix can have 4 bytes
@@ -1146,144 +1177,137 @@ void X86MCCodeEmitter::emitVEXOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
assert((VEX_5M & 0x3) == VEX_5M &&
"More than 2 significant bits in VEX.m-mmmm fields for EVEX!");
- emitByte(0x62, CurByte, OS);
+ emitByte(0x62, OS);
emitByte((VEX_R << 7) | (VEX_X << 6) | (VEX_B << 5) | (EVEX_R2 << 4) |
VEX_5M,
- CurByte, OS);
- emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, CurByte,
OS);
+ emitByte((VEX_W << 7) | (VEX_4V << 3) | (EVEX_U << 2) | VEX_PP, OS);
if (EncodeRC)
emitByte((EVEX_z << 7) | (EVEX_rc << 5) | (EVEX_b << 4) | (EVEX_V2 << 3) |
EVEX_aaa,
- CurByte, OS);
+ OS);
else
emitByte((EVEX_z << 7) | (EVEX_L2 << 6) | (VEX_L << 5) | (EVEX_b << 4) |
(EVEX_V2 << 3) | EVEX_aaa,
- CurByte, OS);
+ OS);
}
}
-/// Determine if the MCInst has to be encoded with a X86-64 REX prefix which
-/// specifies 1) 64-bit instructions, 2) non-default operand size, and 3) use
-/// of X86-64 extended registers.
-uint8_t X86MCCodeEmitter::determineREXPrefix(const MCInst &MI, uint64_t TSFlags,
- int MemOperand,
- const MCInstrDesc &Desc) const {
- uint8_t REX = 0;
- bool UsesHighByteReg = false;
-
- if (TSFlags & X86II::REX_W)
- REX |= 1 << 3; // set REX.W
+/// Emit REX prefix which specifies
+/// 1) 64-bit instructions,
+/// 2) non-default operand size, and
+/// 3) use of X86-64 extended registers.
+///
+/// \returns true if REX prefix is used, otherwise returns false.
+bool X86MCCodeEmitter::emitREXPrefix(int MemOperand, const MCInst &MI,
+ raw_ostream &OS) const {
+ uint8_t REX = [&, MemOperand]() {
+ uint8_t REX = 0;
+ bool UsesHighByteReg = false;
+
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
+ if (TSFlags & X86II::REX_W)
+ REX |= 1 << 3; // set REX.W
+
+ if (MI.getNumOperands() == 0)
+ return REX;
+
+ unsigned NumOps = MI.getNumOperands();
+ unsigned CurOp = X86II::getOperandBias(Desc);
+
+ // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
+ for (unsigned i = CurOp; i != NumOps; ++i) {
+ const MCOperand &MO = MI.getOperand(i);
+ if (!MO.isReg())
+ continue;
+ unsigned Reg = MO.getReg();
+ if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
+ UsesHighByteReg = true;
+ if (X86II::isX86_64NonExtLowByteReg(Reg))
+ // FIXME: The caller of determineREXPrefix slaps this prefix onto
+ // anything that returns non-zero.
+ REX |= 0x40; // REX fixed encoding prefix
+ }
- if (MI.getNumOperands() == 0)
+ switch (TSFlags & X86II::FormMask) {
+ case X86II::AddRegFrm:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMSrcReg:
+ case X86II::MRMSrcRegCC:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMSrcMem:
+ case X86II::MRMSrcMemCC:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ break;
+ case X86II::MRMDestReg:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMem:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ CurOp += X86::AddrNumOperands;
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMXmCC:
+ case X86II::MRMXm:
+ case X86II::MRM0m:
+ case X86II::MRM1m:
+ case X86II::MRM2m:
+ case X86II::MRM3m:
+ case X86II::MRM4m:
+ case X86II::MRM5m:
+ case X86II::MRM6m:
+ case X86II::MRM7m:
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
+ REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
+ break;
+ case X86II::MRMXrCC:
+ case X86II::MRMXr:
+ case X86II::MRM0r:
+ case X86II::MRM1r:
+ case X86II::MRM2r:
+ case X86II::MRM3r:
+ case X86II::MRM4r:
+ case X86II::MRM5r:
+ case X86II::MRM6r:
+ case X86II::MRM7r:
+ REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
+ break;
+ case X86II::MRMr0:
+ REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
+ break;
+ case X86II::MRMDestMemFSIB:
+ llvm_unreachable("FSIB format never needs REX prefix!");
+ }
+ if (REX && UsesHighByteReg)
+ report_fatal_error(
+ "Cannot encode high byte register in REX-prefixed instruction");
return REX;
+ }();
- unsigned NumOps = MI.getNumOperands();
- unsigned CurOp = X86II::getOperandBias(Desc);
-
- // If it accesses SPL, BPL, SIL, or DIL, then it requires a 0x40 REX prefix.
- for (unsigned i = CurOp; i != NumOps; ++i) {
- const MCOperand &MO = MI.getOperand(i);
- if (!MO.isReg())
- continue;
- unsigned Reg = MO.getReg();
- if (Reg == X86::AH || Reg == X86::BH || Reg == X86::CH || Reg == X86::DH)
- UsesHighByteReg = true;
- if (X86II::isX86_64NonExtLowByteReg(Reg))
- // FIXME: The caller of determineREXPrefix slaps this prefix onto anything
- // that returns non-zero.
- REX |= 0x40; // REX fixed encoding prefix
- }
-
- switch (TSFlags & X86II::FormMask) {
- case X86II::AddRegFrm:
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- break;
- case X86II::MRMSrcReg:
- case X86II::MRMSrcRegCC:
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- break;
- case X86II::MRMSrcMem:
- case X86II::MRMSrcMemCC:
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
- CurOp += X86::AddrNumOperands;
- break;
- case X86II::MRMDestReg:
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- break;
- case X86II::MRMDestMem:
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
- CurOp += X86::AddrNumOperands;
- REX |= isREXExtendedReg(MI, CurOp++) << 2; // REX.R
- break;
- case X86II::MRMXmCC:
- case X86II::MRMXm:
- case X86II::MRM0m:
- case X86II::MRM1m:
- case X86II::MRM2m:
- case X86II::MRM3m:
- case X86II::MRM4m:
- case X86II::MRM5m:
- case X86II::MRM6m:
- case X86II::MRM7m:
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrBaseReg) << 0; // REX.B
- REX |= isREXExtendedReg(MI, MemOperand + X86::AddrIndexReg) << 1; // REX.X
- break;
- case X86II::MRMXrCC:
- case X86II::MRMXr:
- case X86II::MRM0r:
- case X86II::MRM1r:
- case X86II::MRM2r:
- case X86II::MRM3r:
- case X86II::MRM4r:
- case X86II::MRM5r:
- case X86II::MRM6r:
- case X86II::MRM7r:
- REX |= isREXExtendedReg(MI, CurOp++) << 0; // REX.B
- break;
- }
- if (REX && UsesHighByteReg)
- report_fatal_error(
- "Cannot encode high byte register in REX-prefixed instruction");
+ if (!REX)
+ return false;
- return REX;
+ emitByte(0x40 | REX, OS);
+ return true;
}
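
A standalone sketch of the REX byte the refactored emitREXPrefix builds: a fixed 0100 pattern in the high nibble plus the W/R/X/B flag bits, emitted only when any bit is set; the separate 0x40-only case for SPL/BPL/SIL/DIL access in the real code is not modeled here.

#include <cassert>
#include <cstdint>

static uint8_t rexByte(bool W, bool R, bool X, bool B) {
  uint8_t REX = (W << 3) | (R << 2) | (X << 1) | B;
  return REX ? (0x40 | REX) : 0; // 0 means "no REX prefix emitted"
}

int main() {
  assert(rexByte(true, false, false, false) == 0x48);  // REX.W only
  assert(rexByte(true, false, false, true) == 0x49);   // REX.W + REX.B
  assert(rexByte(false, false, false, false) == 0);    // no prefix needed
}
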
/// Emit segment override opcode prefix as needed.
-void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte,
- unsigned SegOperand,
+void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned SegOperand,
const MCInst &MI,
raw_ostream &OS) const {
// Check for explicit segment override on memory operand.
- switch (MI.getOperand(SegOperand).getReg()) {
- default:
- llvm_unreachable("Unknown segment register!");
- case 0:
- break;
- case X86::CS:
- emitByte(0x2E, CurByte, OS);
- break;
- case X86::SS:
- emitByte(0x36, CurByte, OS);
- break;
- case X86::DS:
- emitByte(0x3E, CurByte, OS);
- break;
- case X86::ES:
- emitByte(0x26, CurByte, OS);
- break;
- case X86::FS:
- emitByte(0x64, CurByte, OS);
- break;
- case X86::GS:
- emitByte(0x65, CurByte, OS);
- break;
- }
+ if (unsigned Reg = MI.getOperand(SegOperand).getReg())
+ emitByte(X86::getSegmentOverridePrefixForReg(Reg), OS);
}
/// Emit all instruction prefixes prior to the opcode.
@@ -1291,48 +1315,44 @@ void X86MCCodeEmitter::emitSegmentOverridePrefix(unsigned &CurByte,
/// \param MemOperand the operand # of the start of a memory operand if present.
/// If not present, it is -1.
///
-/// \returns true if a REX prefix was used.
-bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
- int MemOperand, const MCInst &MI,
- const MCInstrDesc &Desc,
+/// \returns true if a REX prefix is used, otherwise returns false.

+bool X86MCCodeEmitter::emitOpcodePrefix(int MemOperand, const MCInst &MI,
const MCSubtargetInfo &STI,
raw_ostream &OS) const {
- bool Ret = false;
+ const MCInstrDesc &Desc = MCII.get(MI.getOpcode());
+ uint64_t TSFlags = Desc.TSFlags;
+
// Emit the operand size opcode prefix as needed.
if ((TSFlags & X86II::OpSizeMask) ==
(STI.hasFeature(X86::Mode16Bit) ? X86II::OpSize32 : X86II::OpSize16))
- emitByte(0x66, CurByte, OS);
+ emitByte(0x66, OS);
// Emit the LOCK opcode prefix.
if (TSFlags & X86II::LOCK || MI.getFlags() & X86::IP_HAS_LOCK)
- emitByte(0xF0, CurByte, OS);
+ emitByte(0xF0, OS);
// Emit the NOTRACK opcode prefix.
if (TSFlags & X86II::NOTRACK || MI.getFlags() & X86::IP_HAS_NOTRACK)
- emitByte(0x3E, CurByte, OS);
+ emitByte(0x3E, OS);
switch (TSFlags & X86II::OpPrefixMask) {
case X86II::PD: // 66
- emitByte(0x66, CurByte, OS);
+ emitByte(0x66, OS);
break;
case X86II::XS: // F3
- emitByte(0xF3, CurByte, OS);
+ emitByte(0xF3, OS);
break;
case X86II::XD: // F2
- emitByte(0xF2, CurByte, OS);
+ emitByte(0xF2, OS);
break;
}
// Handle REX prefix.
- // FIXME: Can this come before F2 etc to simplify emission?
- if (STI.hasFeature(X86::Mode64Bit)) {
- if (uint8_t REX = determineREXPrefix(MI, TSFlags, MemOperand, Desc)) {
- emitByte(0x40 | REX, CurByte, OS);
- Ret = true;
- }
- } else {
- assert(!(TSFlags & X86II::REX_W) && "REX.W requires 64bit mode.");
- }
+ assert((STI.hasFeature(X86::Mode64Bit) || !(TSFlags & X86II::REX_W)) &&
+ "REX.W requires 64bit mode.");
+ bool HasREX = STI.hasFeature(X86::Mode64Bit)
+ ? emitREXPrefix(MemOperand, MI, OS)
+ : false;
// 0x0F escape code must be emitted just before the opcode.
switch (TSFlags & X86II::OpMapMask) {
@@ -1340,19 +1360,20 @@ bool X86MCCodeEmitter::emitOpcodePrefix(uint64_t TSFlags, unsigned &CurByte,
case X86II::T8: // 0F 38
case X86II::TA: // 0F 3A
case X86II::ThreeDNow: // 0F 0F, second 0F emitted by caller.
- emitByte(0x0F, CurByte, OS);
+ emitByte(0x0F, OS);
break;
}
switch (TSFlags & X86II::OpMapMask) {
case X86II::T8: // 0F 38
- emitByte(0x38, CurByte, OS);
+ emitByte(0x38, OS);
break;
case X86II::TA: // 0F 3A
- emitByte(0x3A, CurByte, OS);
+ emitByte(0x3A, OS);
break;
}
- return Ret;
+
+ return HasREX;
}
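As an illustrative data point for the ordering implemented above (mandatory prefix first, then the 0F 38 map escape, then opcode and ModRM), the SDM encoding of `crc32l %ecx, %eax` is:

#include <cstdint>
// F2 (XD) precedes the 0F 38 escape; F1 is the opcode and C1 the ModRM byte
// (mod=11, reg=EAX, r/m=ECX). Bytes taken from the Intel SDM, not this patch.
constexpr uint8_t Crc32EcxEax[] = {0xF2, 0x0F, 0x38, 0xF1, 0xC1};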
void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS,
@@ -1362,16 +1383,12 @@ void X86MCCodeEmitter::emitPrefix(const MCInst &MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
// Pseudo instructions don't get encoded.
- if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ if (X86II::isPseudo(TSFlags))
return;
unsigned CurOp = X86II::getOperandBias(Desc);
- // Keep track of the current byte being emitted.
- unsigned CurByte = 0;
-
- bool Rex = false;
- emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS);
+ emitPrefixImpl(CurOp, MI, STI, OS);
}
void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
@@ -1382,17 +1399,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
uint64_t TSFlags = Desc.TSFlags;
// Pseudo instructions don't get encoded.
- if ((TSFlags & X86II::FormMask) == X86II::Pseudo)
+ if (X86II::isPseudo(TSFlags))
return;
unsigned NumOps = Desc.getNumOperands();
unsigned CurOp = X86II::getOperandBias(Desc);
- // Keep track of the current byte being emitted.
- unsigned CurByte = 0;
+ uint64_t StartByte = OS.tell();
- bool Rex = false;
- emitPrefixImpl(TSFlags, CurOp, CurByte, Rex, MI, Desc, STI, OS);
+ bool HasREX = emitPrefixImpl(CurOp, MI, STI, OS);
// It uses the VEX.VVVV field?
bool HasVEX_4V = TSFlags & X86II::VEX_4V;
@@ -1422,7 +1437,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::RawFrmDstSrc:
case X86II::RawFrmSrc:
case X86II::RawFrmDst:
- emitByte(BaseOpcode, CurByte, OS);
+ case X86II::PrefixByte:
+ emitByte(BaseOpcode, OS);
break;
case X86II::AddCCFrm: {
// This will be added to the opcode in the fallthrough.
@@ -1431,47 +1447,47 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
--NumOps; // Drop the operand from the end.
LLVM_FALLTHROUGH;
case X86II::RawFrm:
- emitByte(BaseOpcode + OpcodeOffset, CurByte, OS);
+ emitByte(BaseOpcode + OpcodeOffset, OS);
if (!STI.hasFeature(X86::Mode64Bit) || !isPCRel32Branch(MI, MCII))
break;
const MCOperand &Op = MI.getOperand(CurOp++);
emitImmediate(Op, MI.getLoc(), X86II::getSizeOfImm(TSFlags),
- MCFixupKind(X86::reloc_branch_4byte_pcrel), CurByte, OS,
+ MCFixupKind(X86::reloc_branch_4byte_pcrel), StartByte, OS,
Fixups);
break;
}
case X86II::RawFrmMemOffs:
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
++CurOp; // skip segment operand
break;
case X86II::RawFrmImm8:
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
- emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, CurByte,
+ StartByte, OS, Fixups);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 1, FK_Data_1, StartByte,
OS, Fixups);
break;
case X86II::RawFrmImm16:
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
- emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, CurByte,
+ StartByte, OS, Fixups);
+ emitImmediate(MI.getOperand(CurOp++), MI.getLoc(), 2, FK_Data_2, StartByte,
OS, Fixups);
break;
case X86II::AddRegFrm:
- emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), CurByte, OS);
+ emitByte(BaseOpcode + getX86RegNum(MI.getOperand(CurOp++)), OS);
break;
case X86II::MRMDestReg: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
@@ -1481,12 +1497,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++SrcRegNum;
emitRegModRMByte(MI.getOperand(CurOp),
- getX86RegNum(MI.getOperand(SrcRegNum)), CurByte, OS);
+ getX86RegNum(MI.getOperand(SrcRegNum)), OS);
CurOp = SrcRegNum + 1;
break;
}
+ case X86II::MRMDestMemFSIB:
case X86II::MRMDestMem: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + X86::AddrNumOperands;
if (HasEVEX_K) // Skip writemask
@@ -1495,13 +1512,14 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V) // Skip 1st src (which is encoded in VEX_VVVV)
++SrcRegNum;
+ bool ForceSIB = (Form == X86II::MRMDestMemFSIB);
emitMemModRMByte(MI, CurOp, getX86RegNum(MI.getOperand(SrcRegNum)), TSFlags,
- Rex, CurByte, OS, Fixups, STI);
+ HasREX, StartByte, OS, Fixups, STI, ForceSIB);
CurOp = SrcRegNum + 1;
break;
}
case X86II::MRMSrcReg: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
if (HasEVEX_K) // Skip writemask
@@ -1511,7 +1529,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++SrcRegNum;
emitRegModRMByte(MI.getOperand(SrcRegNum),
- getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(CurOp)), OS);
CurOp = SrcRegNum + 1;
if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
@@ -1521,17 +1539,17 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
break;
}
case X86II::MRMSrcReg4VOp3: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
emitRegModRMByte(MI.getOperand(SrcRegNum),
- getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(CurOp)), OS);
CurOp = SrcRegNum + 1;
++CurOp; // Encoded in VEX.VVVV
break;
}
case X86II::MRMSrcRegOp4: {
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
unsigned SrcRegNum = CurOp + 1;
// Skip 1st src (which is encoded in VEX_VVVV)
@@ -1542,7 +1560,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
I8RegNum = getX86RegEncoding(MI, SrcRegNum++);
emitRegModRMByte(MI.getOperand(SrcRegNum),
- getX86RegNum(MI.getOperand(CurOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(CurOp)), OS);
CurOp = SrcRegNum + 1;
break;
}
@@ -1551,12 +1569,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned SecondOp = CurOp++;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
emitRegModRMByte(MI.getOperand(SecondOp),
- getX86RegNum(MI.getOperand(FirstOp)), CurByte, OS);
+ getX86RegNum(MI.getOperand(FirstOp)), OS);
break;
}
+ case X86II::MRMSrcMemFSIB:
case X86II::MRMSrcMem: {
unsigned FirstMemOp = CurOp + 1;
@@ -1566,10 +1585,11 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
if (HasVEX_4V)
++FirstMemOp; // Skip the register source (which is encoded in VEX_VVVV).
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
+ bool ForceSIB = (Form == X86II::MRMSrcMemFSIB);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI, ForceSIB);
CurOp = FirstMemOp + X86::AddrNumOperands;
if (HasVEX_I8Reg)
I8RegNum = getX86RegEncoding(MI, CurOp++);
@@ -1578,10 +1598,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRMSrcMem4VOp3: {
unsigned FirstMemOp = CurOp + 1;
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
++CurOp; // Encoded in VEX.VVVV.
break;
@@ -1595,10 +1615,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
assert(HasVEX_I8Reg && "MRMSrcRegOp4 should imply VEX_I8Reg");
I8RegNum = getX86RegEncoding(MI, FirstMemOp++);
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(CurOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
CurOp = FirstMemOp + X86::AddrNumOperands;
break;
}
@@ -1608,10 +1628,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
emitMemModRMByte(MI, FirstMemOp, getX86RegNum(MI.getOperand(RegOp)),
- TSFlags, Rex, CurByte, OS, Fixups, STI);
+ TSFlags, HasREX, StartByte, OS, Fixups, STI);
break;
}
@@ -1619,8 +1639,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
unsigned RegOp = CurOp++;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
- emitRegModRMByte(MI.getOperand(RegOp), 0, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
+ emitRegModRMByte(MI.getOperand(RegOp), 0, OS);
break;
}
@@ -1637,10 +1657,13 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitRegModRMByte(MI.getOperand(CurOp++),
- (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, CurByte,
- OS);
+ (Form == X86II::MRMXr) ? 0 : Form - X86II::MRM0r, OS);
+ break;
+ case X86II::MRMr0:
+ emitByte(BaseOpcode, OS);
+ emitByte(modRMByte(3, getX86RegNum(MI.getOperand(CurOp++)), 0), OS);
break;
case X86II::MRMXmCC: {
@@ -1648,9 +1671,10 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
CurOp = FirstMemOp + X86::AddrNumOperands;
unsigned CC = MI.getOperand(CurOp++).getImm();
- emitByte(BaseOpcode + CC, CurByte, OS);
+ emitByte(BaseOpcode + CC, OS);
- emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, Rex, CurByte, OS, Fixups, STI);
+ emitMemModRMByte(MI, FirstMemOp, 0, TSFlags, HasREX, StartByte, OS, Fixups,
+ STI);
break;
}
@@ -1667,13 +1691,25 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
++CurOp;
if (HasEVEX_K) // Skip writemask
++CurOp;
- emitByte(BaseOpcode, CurByte, OS);
+ emitByte(BaseOpcode, OS);
emitMemModRMByte(MI, CurOp,
(Form == X86II::MRMXm) ? 0 : Form - X86II::MRM0m, TSFlags,
- Rex, CurByte, OS, Fixups, STI);
+ HasREX, StartByte, OS, Fixups, STI);
CurOp += X86::AddrNumOperands;
break;
+ case X86II::MRM0X:
+ case X86II::MRM1X:
+ case X86II::MRM2X:
+ case X86II::MRM3X:
+ case X86II::MRM4X:
+ case X86II::MRM5X:
+ case X86II::MRM6X:
+ case X86II::MRM7X:
+ emitByte(BaseOpcode, OS);
+ emitByte(0xC0 + ((Form - X86II::MRM0X) << 3), OS);
+ break;
+
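A worked check of the ModRM byte computed for these new register-less forms (a standalone sketch, not part of the emitter):

#include <cstdint>
// mod = 0b11, reg = Form - MRM0X, rm = 0; e.g. MRM5X yields 0xC0 + (5 << 3).
constexpr uint8_t modRMForRegField(uint8_t RegField) {
  return 0xC0 + (RegField << 3);
}
static_assert(modRMForRegField(5) == 0xE8, "MRM5X encodes ModRM byte 0xE8");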
case X86II::MRM_C0:
case X86II::MRM_C1:
case X86II::MRM_C2:
@@ -1738,8 +1774,8 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
case X86II::MRM_FD:
case X86II::MRM_FE:
case X86II::MRM_FF:
- emitByte(BaseOpcode, CurByte, OS);
- emitByte(0xC0 + Form - X86II::MRM_C0, CurByte, OS);
+ emitByte(BaseOpcode, OS);
+ emitByte(0xC0 + Form - X86II::MRM_C0, OS);
break;
}
@@ -1754,7 +1790,7 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
I8RegNum |= Val;
}
emitImmediate(MCOperand::createImm(I8RegNum), MI.getLoc(), 1, FK_Data_1,
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
} else {
// If there is a remaining operand, it must be a trailing immediate. Emit it
// according to the right size for the instruction. Some instructions
@@ -1762,13 +1798,15 @@ void X86MCCodeEmitter::encodeInstruction(const MCInst &MI, raw_ostream &OS,
while (CurOp != NumOps && NumOps - CurOp <= 2) {
emitImmediate(MI.getOperand(CurOp++), MI.getLoc(),
X86II::getSizeOfImm(TSFlags), getImmFixupKind(TSFlags),
- CurByte, OS, Fixups);
+ StartByte, OS, Fixups);
}
}
if ((TSFlags & X86II::OpMapMask) == X86II::ThreeDNow)
- emitByte(X86II::getBaseOpcodeFor(TSFlags), CurByte, OS);
+ emitByte(X86II::getBaseOpcodeFor(TSFlags), OS);
+ assert(OS.tell() - StartByte <= 15 &&
+ "Instruction length must not exceed 15 bytes.");
#ifndef NDEBUG
// FIXME: Verify.
if (/*!Desc.isVariadic() &&*/ CurOp != NumOps) {
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 049a3a815984..81110ba666e9 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -30,10 +30,6 @@
#include "llvm/Support/Host.h"
#include "llvm/Support/TargetRegistry.h"
-#if _MSC_VER
-#include <intrin.h>
-#endif
-
using namespace llvm;
#define GET_REGINFO_MC_DESC
@@ -294,7 +290,7 @@ MCSubtargetInfo *X86_MC::createX86MCSubtargetInfo(const Triple &TT,
if (!FS.empty())
ArchFS = (Twine(ArchFS) + "," + FS).str();
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "generic";
@@ -335,7 +331,10 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
MAI = new X86ELFMCAsmInfo(TheTriple);
} else if (TheTriple.isWindowsMSVCEnvironment() ||
TheTriple.isWindowsCoreCLREnvironment()) {
- MAI = new X86MCAsmInfoMicrosoft(TheTriple);
+ if (Options.getAssemblyLanguage().equals_lower("masm"))
+ MAI = new X86MCAsmInfoMicrosoftMASM(TheTriple);
+ else
+ MAI = new X86MCAsmInfoMicrosoft(TheTriple);
} else if (TheTriple.isOSCygMing() ||
TheTriple.isWindowsItaniumEnvironment()) {
MAI = new X86MCAsmInfoGNUCOFF(TheTriple);
@@ -350,7 +349,7 @@ static MCAsmInfo *createX86MCAsmInfo(const MCRegisterInfo &MRI,
// Initial state of the frame pointer is esp+stackGrowth.
unsigned StackPtr = is64Bit ? X86::RSP : X86::ESP;
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(
nullptr, MRI.getDwarfRegNum(StackPtr, true), -stackGrowth);
MAI->addInitialFrameState(Inst);
@@ -401,6 +400,9 @@ public:
findPltEntries(uint64_t PltSectionVA, ArrayRef<uint8_t> PltContents,
uint64_t GotSectionVA,
const Triple &TargetTriple) const override;
+
+ bool evaluateBranch(const MCInst &Inst, uint64_t Addr, uint64_t Size,
+ uint64_t &Target) const override;
Optional<uint64_t> evaluateMemoryOperandAddress(const MCInst &Inst,
uint64_t Addr,
uint64_t Size) const override;
@@ -519,6 +521,15 @@ std::vector<std::pair<uint64_t, uint64_t>> X86MCInstrAnalysis::findPltEntries(
}
}
+bool X86MCInstrAnalysis::evaluateBranch(const MCInst &Inst, uint64_t Addr,
+ uint64_t Size, uint64_t &Target) const {
+ if (Inst.getNumOperands() == 0 ||
+ Info->get(Inst.getOpcode()).OpInfo[0].OperandType != MCOI::OPERAND_PCREL)
+ return false;
+ Target = Addr + Size + Inst.getOperand(0).getImm();
+ return true;
+}
+
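A quick sanity check of the new target arithmetic with illustrative values (not from the patch or its tests):

#include <cassert>
#include <cstdint>
int main() {
  // A 5-byte `jmp rel32` at 0x401000 with a PC-relative immediate of 0x20
  // branches to the address of the next instruction plus the displacement.
  uint64_t Addr = 0x401000, Size = 5;
  int64_t Imm = 0x20;
  uint64_t Target = Addr + Size + Imm;
  assert(Target == 0x401025);
  return 0;
}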
Optional<uint64_t> X86MCInstrAnalysis::evaluateMemoryOperandAddress(
const MCInst &Inst, uint64_t Addr, uint64_t Size) const {
const MCInstrDesc &MCID = Info->get(Inst.getOpcode());
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
index 0c789061f0e1..e8c72be1d9b6 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.h
@@ -13,27 +13,28 @@
#ifndef LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
#define LLVM_LIB_TARGET_X86_MCTARGETDESC_X86MCTARGETDESC_H
-#include "llvm/MC/MCRegister.h"
-#include "llvm/MC/MCStreamer.h"
-#include "llvm/Support/DataTypes.h"
+#include <memory>
#include <string>
namespace llvm {
+class formatted_raw_ostream;
class MCAsmBackend;
class MCCodeEmitter;
class MCContext;
+class MCInst;
+class MCInstPrinter;
class MCInstrInfo;
class MCObjectTargetWriter;
class MCObjectWriter;
+class MCRegister;
class MCRegisterInfo;
+class MCStreamer;
class MCSubtargetInfo;
-class MCRelocationInfo;
class MCTargetOptions;
+class MCTargetStreamer;
class Target;
class Triple;
class StringRef;
-class raw_ostream;
-class raw_pwrite_stream;
/// Flavour of dwarf regnumbers
///
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
index 48fd3e0b7ab9..62c1c399a606 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.cpp
@@ -12,7 +12,9 @@
//===----------------------------------------------------------------------===//
#include "X86ShuffleDecode.h"
+#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
@@ -141,9 +143,6 @@ void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
ShuffleMask.push_back(i + Imm);
}
-/// DecodePSHUFMask - This decodes the shuffle masks for pshufw, pshufd, and vpermilp*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
unsigned Size = NumElts * ScalarBits;
@@ -197,9 +196,6 @@ void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(h);
}
-/// DecodeSHUFPMask - This decodes the shuffle masks for shufp*. VT indicates
-/// the type of the vector allowing it to handle different datatypes and vector
-/// widths.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
unsigned Imm, SmallVectorImpl<int> &ShuffleMask) {
unsigned NumLaneElts = 128 / ScalarBits;
@@ -217,9 +213,6 @@ void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits,
}
}
-/// DecodeUNPCKHMask - This decodes the shuffle masks for unpckhps/unpckhpd
-/// and punpckh*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -236,9 +229,6 @@ void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
}
}
-/// DecodeUNPCKLMask - This decodes the shuffle masks for unpcklps/unpcklpd
-/// and punpckl*. VT indicates the type of the vector allowing it to handle
-/// different datatypes and vector widths.
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask) {
// Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate
@@ -255,13 +245,11 @@ void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
}
}
-/// Decodes a broadcast of the first element of a vector.
void DecodeVectorBroadcast(unsigned NumElts,
SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.append(NumElts, 0);
}
-/// Decodes a broadcast of a subvector to a larger vector type.
void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
SmallVectorImpl<int> &ShuffleMask) {
unsigned Scale = DstNumElts / SrcNumElts;
@@ -271,9 +259,6 @@ void DecodeSubVectorBroadcast(unsigned DstNumElts, unsigned SrcNumElts,
ShuffleMask.push_back(j);
}
-/// Decode a shuffle packed values at 128-bit granularity
-/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
-/// immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
@@ -374,7 +359,6 @@ void DecodeVPPERMMask(ArrayRef<uint64_t> RawMask, const APInt &UndefElts,
}
}
-/// DecodeVPERMMask - this decodes the shuffle masks for VPERMQ/VPERMPD.
void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask) {
for (unsigned l = 0; l != NumElts; l += 4)
@@ -384,32 +368,31 @@ void DecodeVPERMMask(unsigned NumElts, unsigned Imm,
void DecodeZeroExtendMask(unsigned SrcScalarBits, unsigned DstScalarBits,
unsigned NumDstElts, bool IsAnyExtend,
- SmallVectorImpl<int> &Mask) {
+ SmallVectorImpl<int> &ShuffleMask) {
unsigned Scale = DstScalarBits / SrcScalarBits;
assert(SrcScalarBits < DstScalarBits &&
"Expected zero extension mask to increase scalar size");
+ int Sentinel = IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero;
for (unsigned i = 0; i != NumDstElts; i++) {
- Mask.push_back(i);
- for (unsigned j = 1; j != Scale; j++)
- Mask.push_back(IsAnyExtend ? SM_SentinelUndef : SM_SentinelZero);
+ ShuffleMask.push_back(i);
+ ShuffleMask.append(Scale - 1, Sentinel);
}
}
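For example (illustrative values), decoding an 8-bit to 32-bit zero extension with four destination elements emits one source index followed by Scale - 1 zero sentinels per element:

#include <vector>
int main() {
  const int Z = -2;              // SM_SentinelZero, per X86ShuffleDecode.h
  const unsigned Scale = 32 / 8; // DstScalarBits / SrcScalarBits
  std::vector<int> Mask;
  for (unsigned i = 0; i != 4; ++i) {      // NumDstElts == 4
    Mask.push_back(static_cast<int>(i));
    Mask.insert(Mask.end(), Scale - 1, Z); // {0,Z,Z,Z, 1,Z,Z,Z, 2,Z,Z,Z, 3,Z,Z,Z}
  }
  return 0;
}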
void DecodeZeroMoveLowMask(unsigned NumElts,
SmallVectorImpl<int> &ShuffleMask) {
ShuffleMask.push_back(0);
- for (unsigned i = 1; i < NumElts; i++)
- ShuffleMask.push_back(SM_SentinelZero);
+ ShuffleMask.append(NumElts - 1, SM_SentinelZero);
}
void DecodeScalarMoveMask(unsigned NumElts, bool IsLoad,
- SmallVectorImpl<int> &Mask) {
+ SmallVectorImpl<int> &ShuffleMask) {
// First element comes from the first element of second source.
// Remaining elements: Load zero extends / Move copies from first source.
- Mask.push_back(NumElts);
+ ShuffleMask.push_back(NumElts);
for (unsigned i = 1; i < NumElts; i++)
- Mask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
+ ShuffleMask.push_back(IsLoad ? static_cast<int>(SM_SentinelZero) : i);
}
void DecodeEXTRQIMask(unsigned NumElts, unsigned EltSize, int Len, int Idx,
diff --git a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h
index f52785063071..4ef9959f7a27 100644
--- a/llvm/lib/Target/X86/Utils/X86ShuffleDecode.h
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86ShuffleDecode.h
@@ -14,15 +14,16 @@
#ifndef LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
#define LLVM_LIB_TARGET_X86_UTILS_X86SHUFFLEDECODE_H
-#include "llvm/ADT/APInt.h"
-#include "llvm/ADT/SmallVector.h"
+#include <cstdint>
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
//===----------------------------------------------------------------------===//
namespace llvm {
+class APInt;
template <typename T> class ArrayRef;
+template <typename T> class SmallVectorImpl;
enum { SM_SentinelUndef = -1, SM_SentinelZero = -2 };
@@ -61,20 +62,14 @@ void DecodeVALIGNMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufd/pshufw/vpermilpd/vpermilps.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshufhw.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFHWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for pshuflw.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
@@ -82,20 +77,14 @@ void DecodePSHUFLWMask(unsigned NumElts, unsigned Imm,
void DecodePSWAPMask(unsigned NumElts, SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for shufp*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodeSHUFPMask(unsigned NumElts, unsigned ScalarBits, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpckhps/unpckhpd and punpckh*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodeUNPCKHMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask);
/// Decodes the shuffle masks for unpcklps/unpcklpd and punpckl*.
-/// VT indicates the type of the vector allowing it to handle different
-/// datatypes and vector widths.
void DecodeUNPCKLMask(unsigned NumElts, unsigned ScalarBits,
SmallVectorImpl<int> &ShuffleMask);
@@ -119,6 +108,7 @@ void DecodeVPERM2X128Mask(unsigned NumElts, unsigned Imm,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a shuffle packed values at 128-bit granularity
+/// (SHUFF32x4/SHUFF64x2/SHUFI32x4/SHUFI64x2)
/// immediate mask into a shuffle mask.
void decodeVSHUF64x2FamilyMask(unsigned NumElts, unsigned ScalarSize,
unsigned Imm, SmallVectorImpl<int> &ShuffleMask);
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
index db624378d517..3bebcc24fd3a 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFStreamer.cpp
@@ -28,7 +28,7 @@ public:
void EmitWinEHHandlerData(SMLoc Loc) override;
void EmitWindowsUnwindTables() override;
void EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) override;
- void FinishImpl() override;
+ void finishImpl() override;
};
void X86WinCOFFStreamer::EmitWinEHHandlerData(SMLoc Loc) {
@@ -52,11 +52,11 @@ void X86WinCOFFStreamer::EmitCVFPOData(const MCSymbol *ProcSym, SMLoc Loc) {
XTS->emitFPOData(ProcSym, Loc);
}
-void X86WinCOFFStreamer::FinishImpl() {
- EmitFrames(nullptr);
+void X86WinCOFFStreamer::finishImpl() {
+ emitFrames(nullptr);
EmitWindowsUnwindTables();
- MCWinCOFFStreamer::FinishImpl();
+ MCWinCOFFStreamer::finishImpl();
}
}
diff --git a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
index d5494ef12370..11251fb2b2ba 100644
--- a/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
+++ b/llvm/lib/Target/X86/MCTargetDesc/X86WinCOFFTargetStreamer.cpp
@@ -159,7 +159,7 @@ bool X86WinCOFFTargetStreamer::checkInFPOPrologue(SMLoc L) {
MCSymbol *X86WinCOFFTargetStreamer::emitFPOLabel() {
MCSymbol *Label = getContext().createTempSymbol("cfi", true);
- getStreamer().EmitLabel(Label);
+ getStreamer().emitLabel(Label);
return Label;
}
@@ -372,13 +372,13 @@ void FPOStateMachine::emitFrameDataRecord(MCStreamer &OS, MCSymbol *Label) {
OS.emitAbsoluteSymbolDiff(Label, FPO->Begin, 4); // RvaStart
OS.emitAbsoluteSymbolDiff(FPO->End, Label, 4); // CodeSize
- OS.EmitIntValue(LocalSize, 4);
- OS.EmitIntValue(FPO->ParamsSize, 4);
- OS.EmitIntValue(MaxStackSize, 4);
- OS.EmitIntValue(FrameFuncStrTabOff, 4); // FrameFunc
+ OS.emitInt32(LocalSize);
+ OS.emitInt32(FPO->ParamsSize);
+ OS.emitInt32(MaxStackSize);
+ OS.emitInt32(FrameFuncStrTabOff); // FrameFunc
OS.emitAbsoluteSymbolDiff(FPO->PrologueEnd, Label, 2);
- OS.EmitIntValue(SavedRegSize, 2);
- OS.EmitIntValue(CurFlags, 4);
+ OS.emitInt16(SavedRegSize);
+ OS.emitInt32(CurFlags);
}
/// Compute and emit the real CodeView FrameData subsection.
@@ -398,12 +398,12 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
MCSymbol *FrameBegin = Ctx.createTempSymbol(),
*FrameEnd = Ctx.createTempSymbol();
- OS.EmitIntValue(unsigned(DebugSubsectionKind::FrameData), 4);
+ OS.emitInt32(unsigned(DebugSubsectionKind::FrameData));
OS.emitAbsoluteSymbolDiff(FrameEnd, FrameBegin, 4);
- OS.EmitLabel(FrameBegin);
+ OS.emitLabel(FrameBegin);
// Start with the RVA of the function in question.
- OS.EmitValue(MCSymbolRefExpr::create(FPO->Function,
+ OS.emitValue(MCSymbolRefExpr::create(FPO->Function,
MCSymbolRefExpr::VK_COFF_IMGREL32, Ctx),
4);
@@ -437,8 +437,8 @@ bool X86WinCOFFTargetStreamer::emitFPOData(const MCSymbol *ProcSym, SMLoc L) {
FSM.emitFrameDataRecord(OS, Inst.Label);
}
- OS.EmitValueToAlignment(4, 0);
- OS.EmitLabel(FrameEnd);
+ OS.emitValueToAlignment(4, 0);
+ OS.emitLabel(FrameEnd);
return false;
}
diff --git a/llvm/lib/Target/X86/X86.h b/llvm/lib/Target/X86/X86.h
index 0481a40d462a..91ba4e3d091e 100644
--- a/llvm/lib/Target/X86/X86.h
+++ b/llvm/lib/Target/X86/X86.h
@@ -19,9 +19,7 @@
namespace llvm {
class FunctionPass;
-class ImmutablePass;
class InstructionSelector;
-class ModulePass;
class PassRegistry;
class X86RegisterBankInfo;
class X86Subtarget;
@@ -120,7 +118,7 @@ FunctionPass *createX86DomainReassignmentPass();
FunctionPass *createX86EvexToVexInsts();
/// This pass creates the thunks for the retpoline feature.
-FunctionPass *createX86RetpolineThunksPass();
+FunctionPass *createX86IndirectThunksPass();
/// This pass ensures instructions featuring a memory operand
/// have distinctive <LineNumber, Discriminator> (with respect to eachother)
@@ -129,11 +127,23 @@ FunctionPass *createX86DiscriminateMemOpsPass();
/// This pass applies profiling information to insert cache prefetches.
FunctionPass *createX86InsertPrefetchPass();
+/// This pass inserts a wait instruction after X87 instructions which could
+/// raise fp exceptions when strict-fp is enabled.
+FunctionPass *createX86InsertX87waitPass();
+
+/// This pass optimizes arithmetic based on the knowledge that it is only used
+/// by a reduction sequence, and is therefore safe to reassociate in
+/// interesting ways.
+FunctionPass *createX86PartialReductionPass();
+
InstructionSelector *createX86InstructionSelector(const X86TargetMachine &TM,
X86Subtarget &,
X86RegisterBankInfo &);
+FunctionPass *createX86LoadValueInjectionLoadHardeningPass();
+FunctionPass *createX86LoadValueInjectionRetHardeningPass();
FunctionPass *createX86SpeculativeLoadHardeningPass();
+FunctionPass *createX86SpeculativeExecutionSideEffectSuppression();
void initializeEvexToVexInstPassPass(PassRegistry &);
void initializeFixupBWInstPassPass(PassRegistry &);
@@ -141,15 +151,21 @@ void initializeFixupLEAPassPass(PassRegistry &);
void initializeFPSPass(PassRegistry &);
void initializeWinEHStatePassPass(PassRegistry &);
void initializeX86AvoidSFBPassPass(PassRegistry &);
+void initializeX86AvoidTrailingCallPassPass(PassRegistry &);
void initializeX86CallFrameOptimizationPass(PassRegistry &);
void initializeX86CmovConverterPassPass(PassRegistry &);
void initializeX86CondBrFoldingPassPass(PassRegistry &);
void initializeX86DomainReassignmentPass(PassRegistry &);
void initializeX86ExecutionDomainFixPass(PassRegistry &);
void initializeX86ExpandPseudoPass(PassRegistry &);
+void initializeX86FixupSetCCPassPass(PassRegistry &);
void initializeX86FlagsCopyLoweringPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionLoadHardeningPassPass(PassRegistry &);
+void initializeX86LoadValueInjectionRetHardeningPassPass(PassRegistry &);
void initializeX86OptimizeLEAPassPass(PassRegistry &);
+void initializeX86PartialReductionPass(PassRegistry &);
void initializeX86SpeculativeLoadHardeningPassPass(PassRegistry &);
+void initializeX86SpeculativeExecutionSideEffectSuppressionPass(PassRegistry &);
namespace X86AS {
enum : unsigned {
diff --git a/llvm/lib/Target/X86/X86.td b/llvm/lib/Target/X86/X86.td
index a2b11d55f650..dc1ff72add49 100644
--- a/llvm/lib/Target/X86/X86.td
+++ b/llvm/lib/Target/X86/X86.td
@@ -52,13 +52,16 @@ def FeatureXSAVE : SubtargetFeature<"xsave", "HasXSAVE", "true",
"Support xsave instructions">;
def FeatureXSAVEOPT: SubtargetFeature<"xsaveopt", "HasXSAVEOPT", "true",
- "Support xsaveopt instructions">;
+ "Support xsaveopt instructions",
+ [FeatureXSAVE]>;
def FeatureXSAVEC : SubtargetFeature<"xsavec", "HasXSAVEC", "true",
- "Support xsavec instructions">;
+ "Support xsavec instructions",
+ [FeatureXSAVE]>;
def FeatureXSAVES : SubtargetFeature<"xsaves", "HasXSAVES", "true",
- "Support xsaves instructions">;
+ "Support xsaves instructions",
+ [FeatureXSAVE]>;
def FeatureSSE1 : SubtargetFeature<"sse", "X86SSELevel", "SSE1",
"Enable SSE instructions">;
@@ -246,6 +249,14 @@ def FeaturePTWRITE : SubtargetFeature<"ptwrite", "HasPTWRITE", "true",
// target-feature attribute.
def FeatureDeprecatedMPX : SubtargetFeature<"mpx", "DeprecatedHasMPX", "false",
"Deprecated. Support MPX instructions">;
+def FeatureAMXTILE : SubtargetFeature<"amx-tile", "HasAMXTILE", "true",
+ "Support AMX-TILE instructions">;
+def FeatureAMXINT8 : SubtargetFeature<"amx-int8", "HasAMXINT8", "true",
+ "Support AMX-INT8 instructions",
+ [FeatureAMXTILE]>;
+def FeatureAMXBF16 : SubtargetFeature<"amx-bf16", "HasAMXBF16", "true",
+ "Support AMX-BF16 instructions",
+ [FeatureAMXTILE]>;
def FeatureLEAForSP : SubtargetFeature<"lea-sp", "UseLeaForSP", "true",
"Use LEA for adjusting the stack pointer">;
def FeatureSlowDivide32 : SubtargetFeature<"idivl-to-divb",
@@ -273,6 +284,10 @@ def FeatureWAITPKG : SubtargetFeature<"waitpkg", "HasWAITPKG", "true",
"Wait and pause enhancements">;
def FeatureENQCMD : SubtargetFeature<"enqcmd", "HasENQCMD", "true",
"Has ENQCMD instructions">;
+def FeatureSERIALIZE : SubtargetFeature<"serialize", "HasSERIALIZE", "true",
+ "Has serialize instruction">;
+def FeatureTSXLDTRK : SubtargetFeature<"tsxldtrk", "HasTSXLDTRK", "true",
+ "Support TSXLDTRK instructions">;
// On some processors, instructions that implicitly take two memory operands are
// slow. In practice, this means that CALL, PUSH, and POP with memory operands
// should be avoided in favor of a MOV + register CALL/PUSH/POP.
@@ -329,6 +344,11 @@ def FeatureFastLZCNT
: SubtargetFeature<
"fast-lzcnt", "HasFastLZCNT", "true",
"LZCNT instructions are as fast as most simple integer ops">;
+// If the target can efficiently decode NOPs up to 7 bytes in length.
+def FeatureFast7ByteNOP
+ : SubtargetFeature<
+ "fast-7bytenop", "HasFast7ByteNOP", "true",
+ "Target can quickly decode up to 7 byte NOPs">;
// If the target can efficiently decode NOPs upto 11-bytes in length.
def FeatureFast11ByteNOP
: SubtargetFeature<
@@ -426,6 +446,31 @@ def FeatureRetpolineExternalThunk
"ourselves. Only has effect when combined with some other retpoline "
"feature", [FeatureRetpolineIndirectCalls]>;
+// Mitigate LVI attacks against indirect calls/branches and call returns
+def FeatureLVIControlFlowIntegrity
+ : SubtargetFeature<
+ "lvi-cfi", "UseLVIControlFlowIntegrity", "true",
+ "Prevent indirect calls/branches from using a memory operand, and "
+ "precede all indirect calls/branches from a register with an "
+ "LFENCE instruction to serialize control flow. Also decompose RET "
+ "instructions into a POP+LFENCE+JMP sequence.">;
+
+// Enable SESES to mitigate speculative execution attacks
+def FeatureSpeculativeExecutionSideEffectSuppression
+ : SubtargetFeature<
+ "seses", "UseSpeculativeExecutionSideEffectSuppression", "true",
+ "Prevent speculative execution side channel timing attacks by "
+ "inserting a speculation barrier before memory reads, memory writes, "
+ "and conditional branches. Implies LVI Control Flow integrity.",
+ [FeatureLVIControlFlowIntegrity]>;
+
+// Mitigate LVI attacks against data loads
+def FeatureLVILoadHardening
+ : SubtargetFeature<
+ "lvi-load-hardening", "UseLVILoadHardening", "true",
+ "Insert LFENCE instructions to prevent data speculatively injected "
+ "into loads from being used maliciously.">;
+
// Direct Move instructions.
def FeatureMOVDIRI : SubtargetFeature<"movdiri", "HasMOVDIRI", "true",
"Support movdiri instruction">;
@@ -546,7 +591,8 @@ def ProcessorFeatures {
FeatureSlow3OpsLEA,
FeatureFastScalarFSQRT,
FeatureFastSHLDRotate,
- FeatureMergeToThreeWayBranch];
+ FeatureMergeToThreeWayBranch,
+ FeatureFast15ByteNOP];
list<SubtargetFeature> SNBSpecificFeatures = [FeatureSlowUAMem32,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SNBInheritableFeatures =
@@ -728,6 +774,7 @@ def ProcessorFeatures {
list<SubtargetFeature> SLMSpecificFeatures = [ProcIntelSLM,
FeatureSlowDivide64,
FeatureSlowPMULLD,
+ FeatureFast7ByteNOP,
FeaturePOPCNTFalseDeps];
list<SubtargetFeature> SLMInheritableFeatures =
!listconcat(AtomInheritableFeatures, SLMAdditionalFeatures);
@@ -762,15 +809,13 @@ def ProcessorFeatures {
!listconcat(GLPInheritableFeatures, GLPSpecificFeatures);
// Tremont
- list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLDEMOTE,
- FeatureGFNI,
- FeatureMOVDIRI,
- FeatureMOVDIR64B,
- FeatureWAITPKG];
+ list<SubtargetFeature> TRMAdditionalFeatures = [FeatureCLWB,
+ FeatureGFNI];
list<SubtargetFeature> TRMSpecificFeatures = [FeatureUseGLMDivSqrtCosts];
+ list<SubtargetFeature> TRMInheritableFeatures =
+ !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures);
list<SubtargetFeature> TRMFeatures =
- !listconcat(GLPInheritableFeatures, TRMAdditionalFeatures,
- TRMSpecificFeatures);
+ !listconcat(TRMInheritableFeatures, TRMSpecificFeatures);
// Knights Landing
list<SubtargetFeature> KNLFeatures = [FeatureX87,
@@ -822,6 +867,7 @@ def ProcessorFeatures {
FeatureFXSR,
FeatureNOPL,
FeatureCMPXCHG16B,
+ FeaturePRFCHW,
FeatureLZCNT,
FeaturePOPCNT,
FeatureSlowSHLD,
@@ -917,6 +963,8 @@ def ProcessorFeatures {
// Excavator
list<SubtargetFeature> BdVer4AdditionalFeatures = [FeatureAVX2,
FeatureBMI2,
+ FeatureMOVBE,
+ FeatureRDRAND,
FeatureMWAITX];
list<SubtargetFeature> BdVer4InheritableFeatures =
!listconcat(BdVer3InheritableFeatures, BdVer4AdditionalFeatures);
@@ -977,7 +1025,7 @@ def ProcessorFeatures {
class Proc<string Name, list<SubtargetFeature> Features>
: ProcessorModel<Name, GenericModel, Features>;
-// NOTE: CMPXCHG8B is here for legacy compatbility so that it is only disabled
+// NOTE: CMPXCHG8B is here for legacy compatibility so that it is only disabled
// if i386/i486 is specifically requested.
def : Proc<"generic", [FeatureX87, FeatureSlowUAMem16,
FeatureCMPXCHG8B, FeatureInsertVZEROUPPER]>;
@@ -1240,6 +1288,7 @@ def : ProcessorModel<"x86-64", SandyBridgeModel, [
FeatureNOPL,
Feature64Bit,
FeatureSlow3OpsLEA,
+ FeatureSlowDivide64,
FeatureSlowIncDec,
FeatureMacroFusion,
FeatureInsertVZEROUPPER
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.cpp b/llvm/lib/Target/X86/X86AsmPrinter.cpp
index 39d16e7999cd..aa03217d155d 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.cpp
+++ b/llvm/lib/Target/X86/X86AsmPrinter.cpp
@@ -18,6 +18,7 @@
#include "TargetInfo/X86TargetInfo.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
+#include "X86Subtarget.h"
#include "llvm/BinaryFormat/COFF.h"
#include "llvm/BinaryFormat/ELF.h"
#include "llvm/CodeGen/MachineConstantPool.h"
@@ -40,6 +41,8 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MachineValueType.h"
#include "llvm/Support/TargetRegistry.h"
+#include "llvm/Target/TargetMachine.h"
+
using namespace llvm;
X86AsmPrinter::X86AsmPrinter(TargetMachine &TM,
@@ -76,7 +79,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
}
// Emit the rest of the function body.
- EmitFunctionBody();
+ emitFunctionBody();
// Emit the XRay table for this function.
emitXRayTable();
@@ -87,7 +90,7 @@ bool X86AsmPrinter::runOnMachineFunction(MachineFunction &MF) {
return false;
}
-void X86AsmPrinter::EmitFunctionBodyStart() {
+void X86AsmPrinter::emitFunctionBodyStart() {
if (EmitFPOData) {
if (auto *XTS =
static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
@@ -97,7 +100,7 @@ void X86AsmPrinter::EmitFunctionBodyStart() {
}
}
-void X86AsmPrinter::EmitFunctionBodyEnd() {
+void X86AsmPrinter::emitFunctionBodyEnd() {
if (EmitFPOData) {
if (auto *XTS =
static_cast<X86TargetStreamer *>(OutStreamer->getTargetStreamer()))
@@ -124,7 +127,7 @@ void X86AsmPrinter::PrintSymbolOperand(const MachineOperand &MO,
MO.getTargetFlags() == X86II::MO_DARWIN_NONLAZY_PIC_BASE)
GVSym = getSymbolWithGlobalValueBase(GV, "$non_lazy_ptr");
else
- GVSym = getSymbol(GV);
+ GVSym = getSymbolPreferLocal(*GV);
// Handle dllimport linkage.
if (MO.getTargetFlags() == X86II::MO_DLLIMPORT)
@@ -404,7 +407,7 @@ void X86AsmPrinter::PrintIntelMemReference(const MachineInstr *MI,
static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
char Mode, raw_ostream &O) {
Register Reg = MO.getReg();
- bool EmitPercent = true;
+ bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
if (!X86::GR8RegClass.contains(Reg) &&
!X86::GR16RegClass.contains(Reg) &&
@@ -443,6 +446,42 @@ static bool printAsmMRegister(X86AsmPrinter &P, const MachineOperand &MO,
return false;
}
+static bool printAsmVRegister(X86AsmPrinter &P, const MachineOperand &MO,
+ char Mode, raw_ostream &O) {
+ unsigned Reg = MO.getReg();
+ bool EmitPercent = MO.getParent()->getInlineAsmDialect() == InlineAsm::AD_ATT;
+
+ unsigned Index;
+ if (X86::VR128XRegClass.contains(Reg))
+ Index = Reg - X86::XMM0;
+ else if (X86::VR256XRegClass.contains(Reg))
+ Index = Reg - X86::YMM0;
+ else if (X86::VR512RegClass.contains(Reg))
+ Index = Reg - X86::ZMM0;
+ else
+ return true;
+
+ switch (Mode) {
+ default: // Unknown mode.
+ return true;
+ case 'x': // Print V4SFmode register
+ Reg = X86::XMM0 + Index;
+ break;
+ case 't': // Print V8SFmode register
+ Reg = X86::YMM0 + Index;
+ break;
+ case 'g': // Print V16SFmode register
+ Reg = X86::ZMM0 + Index;
+ break;
+ }
+
+ if (EmitPercent)
+ O << '%';
+
+ O << X86ATTInstPrinter::getRegisterName(Reg);
+ return false;
+}
+
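A hedged usage sketch for the new 'x'/'t'/'g' handling; it assumes GCC-style inline asm, an AVX-enabled build, and the usual "x" register constraint, none of which come from this patch:

#include <immintrin.h>
// With -mavx, the 't' modifier prints the YMM name of each vector operand;
// 'x' would force the XMM name and 'g' the ZMM name.
__m256 add_lanes(__m256 A, __m256 B) {
  __m256 R;
  asm("vaddps %t2, %t1, %t0" : "=x"(R) : "x"(A), "x"(B));
  return R;
}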
/// PrintAsmOperand - Print out an operand for an inline asm expression.
///
bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
@@ -517,6 +556,14 @@ bool X86AsmPrinter::PrintAsmOperand(const MachineInstr *MI, unsigned OpNo,
PrintOperand(MI, OpNo, O);
return false;
+ case 'x': // Print V4SFmode register
+ case 't': // Print V8SFmode register
+ case 'g': // Print V16SFmode register
+ if (MO.isReg())
+ return printAsmVRegister(*this, MO, ExtraCode[0], O);
+ PrintOperand(MI, OpNo, O);
+ return false;
+
case 'P': // This is the operand of a call, treat specially.
PrintPCRelImm(MI, OpNo, O);
return false;
@@ -575,7 +622,7 @@ bool X86AsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI, unsigned OpNo,
return false;
}
-void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
+void X86AsmPrinter::emitStartOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatELF()) {
@@ -597,17 +644,17 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
// Emitting note header.
int WordSize = TT.isArch64Bit() ? 8 : 4;
- EmitAlignment(WordSize == 4 ? Align(4) : Align(8));
- OutStreamer->EmitIntValue(4, 4 /*size*/); // data size for "GNU\0"
- OutStreamer->EmitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
- OutStreamer->EmitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
- OutStreamer->EmitBytes(StringRef("GNU", 4)); // note name
+ emitAlignment(WordSize == 4 ? Align(4) : Align(8));
+ OutStreamer->emitIntValue(4, 4 /*size*/); // data size for "GNU\0"
+ OutStreamer->emitIntValue(8 + WordSize, 4 /*size*/); // Elf_Prop size
+ OutStreamer->emitIntValue(ELF::NT_GNU_PROPERTY_TYPE_0, 4 /*size*/);
+ OutStreamer->emitBytes(StringRef("GNU", 4)); // note name
// Emitting an Elf_Prop for the CET properties.
- OutStreamer->EmitIntValue(ELF::GNU_PROPERTY_X86_FEATURE_1_AND, 4);
- OutStreamer->EmitIntValue(4, 4); // data size
- OutStreamer->EmitIntValue(FeatureFlagsAnd, 4); // data
- EmitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
+ OutStreamer->emitInt32(ELF::GNU_PROPERTY_X86_FEATURE_1_AND);
+ OutStreamer->emitInt32(4); // data size
+ OutStreamer->emitInt32(FeatureFlagsAnd); // data
+ emitAlignment(WordSize == 4 ? Align(4) : Align(8)); // padding
OutStreamer->endSection(Nt);
OutStreamer->SwitchSection(Cur);
@@ -639,30 +686,30 @@ void X86AsmPrinter::EmitStartOfAsmFile(Module &M) {
if (M.getModuleFlag("cfguard"))
Feat00Flags |= 0x800; // Object is CFG-aware.
- OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
- OutStreamer->EmitAssignment(
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->emitAssignment(
S, MCConstantExpr::create(Feat00Flags, MMI->getContext()));
}
- OutStreamer->EmitSyntaxDirective();
+ OutStreamer->emitSyntaxDirective();
// If this is not inline asm and we're in 16-bit
// mode prefix assembly with .code16.
bool is16 = TT.getEnvironment() == Triple::CODE16;
if (M.getModuleInlineAsm().empty() && is16)
- OutStreamer->EmitAssemblerFlag(MCAF_Code16);
+ OutStreamer->emitAssemblerFlag(MCAF_Code16);
}
static void
emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
MachineModuleInfoImpl::StubValueTy &MCSym) {
// L_foo$stub:
- OutStreamer.EmitLabel(StubLabel);
+ OutStreamer.emitLabel(StubLabel);
// .indirect_symbol _foo
- OutStreamer.EmitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
+ OutStreamer.emitSymbolAttribute(MCSym.getPointer(), MCSA_IndirectSymbol);
if (MCSym.getInt())
// External to current translation unit.
- OutStreamer.EmitIntValue(0, 4/*size*/);
+ OutStreamer.emitIntValue(0, 4/*size*/);
else
// Internal to current translation unit.
//
@@ -670,7 +717,7 @@ emitNonLazySymbolPointer(MCStreamer &OutStreamer, MCSymbol *StubLabel,
// pointers need to be indirect and pc-rel. We accomplish this by
// using NLPs; however, sometimes the types are local to the file.
// We need to fill in the value for the NLP in those cases.
- OutStreamer.EmitValue(
+ OutStreamer.emitValue(
MCSymbolRefExpr::create(MCSym.getPointer(), OutStreamer.getContext()),
4 /*size*/);
}
@@ -698,7 +745,7 @@ static void emitNonLazyStubs(MachineModuleInfo *MMI, MCStreamer &OutStreamer) {
}
}
-void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
+void X86AsmPrinter::emitEndOfAsmFile(Module &M) {
const Triple &TT = TM.getTargetTriple();
if (TT.isOSBinFormatMachO()) {
@@ -715,7 +762,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
// points). If this doesn't occur, the linker can safely perform dead code
// stripping. Since LLVM never generates code that does this, it is always
// safe to set.
- OutStreamer->EmitAssemblerFlag(MCAF_SubsectionsViaSymbols);
+ OutStreamer->emitAssemblerFlag(MCAF_SubsectionsViaSymbols);
} else if (TT.isOSBinFormatCOFF()) {
if (MMI->usesMSVCFloatingPoint()) {
// In Windows' libcmt.lib, there is a file which is linked in only if the
@@ -734,7 +781,7 @@ void X86AsmPrinter::EmitEndOfAsmFile(Module &M) {
StringRef SymbolName =
(TT.getArch() == Triple::x86) ? "__fltused" : "_fltused";
MCSymbol *S = MMI->getContext().getOrCreateSymbol(SymbolName);
- OutStreamer->EmitSymbolAttribute(S, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(S, MCSA_Global);
return;
}
emitStackMaps(SM);
diff --git a/llvm/lib/Target/X86/X86AsmPrinter.h b/llvm/lib/Target/X86/X86AsmPrinter.h
index ee79401dc80d..eb485fa2ecef 100644
--- a/llvm/lib/Target/X86/X86AsmPrinter.h
+++ b/llvm/lib/Target/X86/X86AsmPrinter.h
@@ -9,12 +9,9 @@
#ifndef LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
#define LLVM_LIB_TARGET_X86_X86ASMPRINTER_H
-#include "X86Subtarget.h"
#include "llvm/CodeGen/AsmPrinter.h"
#include "llvm/CodeGen/FaultMaps.h"
#include "llvm/CodeGen/StackMaps.h"
-#include "llvm/MC/MCCodeEmitter.h"
-#include "llvm/Target/TargetMachine.h"
// Implemented in X86MCInstLower.cpp
namespace {
@@ -22,8 +19,10 @@ namespace {
}
namespace llvm {
+class MCCodeEmitter;
class MCStreamer;
-class MCSymbol;
+class X86Subtarget;
+class TargetMachine;
class LLVM_LIBRARY_VISIBILITY X86AsmPrinter : public AsmPrinter {
const X86Subtarget *Subtarget = nullptr;
@@ -123,14 +122,14 @@ public:
const X86Subtarget &getSubtarget() const { return *Subtarget; }
- void EmitStartOfAsmFile(Module &M) override;
+ void emitStartOfAsmFile(Module &M) override;
- void EmitEndOfAsmFile(Module &M) override;
+ void emitEndOfAsmFile(Module &M) override;
- void EmitInstruction(const MachineInstr *MI) override;
+ void emitInstruction(const MachineInstr *MI) override;
- void EmitBasicBlockEnd(const MachineBasicBlock &MBB) override {
- AsmPrinter::EmitBasicBlockEnd(MBB);
+ void emitBasicBlockEnd(const MachineBasicBlock &MBB) override {
+ AsmPrinter::emitBasicBlockEnd(MBB);
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
}
@@ -147,8 +146,8 @@ public:
}
bool runOnMachineFunction(MachineFunction &F) override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
index 0f1d4b51062e..9f1fece1b9dd 100644
--- a/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
+++ b/llvm/lib/Target/X86/X86AvoidStoreForwardingBlocks.cpp
@@ -1,4 +1,4 @@
-//===- X86AvoidStoreForwardingBlockis.cpp - Avoid HW Store Forward Block --===//
+//===- X86AvoidStoreForwardingBlocks.cpp - Avoid HW Store Forward Block ---===//
//
// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
@@ -33,6 +33,7 @@
// transformation done here is correct regardless to other memory accesses.
//===----------------------------------------------------------------------===//
+#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/Analysis/AliasAnalysis.h"
@@ -287,7 +288,7 @@ static unsigned getYMMtoXMMStoreOpcode(unsigned StoreOpcode) {
return 0;
}
-static int getAddrOffset(MachineInstr *MI) {
+static int getAddrOffset(const MachineInstr *MI) {
const MCInstrDesc &Descl = MI->getDesc();
int AddrOffset = X86II::getMemoryOperandNo(Descl.TSFlags);
assert(AddrOffset != -1 && "Expected Memory Operand");
@@ -310,11 +311,11 @@ static MachineOperand &getDispOperand(MachineInstr *MI) {
// TODO: Consider expanding to other addressing modes in the future
static bool isRelevantAddressingMode(MachineInstr *MI) {
int AddrOffset = getAddrOffset(MI);
- MachineOperand &Base = getBaseOperand(MI);
- MachineOperand &Disp = getDispOperand(MI);
- MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
- MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
- MachineOperand &Segment = MI->getOperand(AddrOffset + X86::AddrSegmentReg);
+ const MachineOperand &Base = getBaseOperand(MI);
+ const MachineOperand &Disp = getDispOperand(MI);
+ const MachineOperand &Scale = MI->getOperand(AddrOffset + X86::AddrScaleAmt);
+ const MachineOperand &Index = MI->getOperand(AddrOffset + X86::AddrIndexReg);
+ const MachineOperand &Segment =
+ MI->getOperand(AddrOffset + X86::AddrSegmentReg);
if (!((Base.isReg() && Base.getReg() != X86::NoRegister) || Base.isFI()))
return false;
@@ -410,9 +411,8 @@ void X86AvoidSFBPass::buildCopy(MachineInstr *LoadInst, unsigned NLoadOpcode,
// If the load and store are consecutive, use the loadInst location to
// reduce register pressure.
MachineInstr *StInst = StoreInst;
- auto PrevInstrIt = skipDebugInstructionsBackward(
- std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
- MBB->instr_begin());
+ auto PrevInstrIt = prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
+ MBB->instr_begin());
if (PrevInstrIt.getNodePtr() == LoadInst)
StInst = LoadInst;
MachineInstr *NewStore =
@@ -498,9 +498,10 @@ void X86AvoidSFBPass::buildCopies(int Size, MachineInstr *LoadInst,
static void updateKillStatus(MachineInstr *LoadInst, MachineInstr *StoreInst) {
MachineOperand &LoadBase = getBaseOperand(LoadInst);
MachineOperand &StoreBase = getBaseOperand(StoreInst);
- auto StorePrevNonDbgInstr = skipDebugInstructionsBackward(
- std::prev(MachineBasicBlock::instr_iterator(StoreInst)),
- LoadInst->getParent()->instr_begin()).getNodePtr();
+ auto *StorePrevNonDbgInstr =
+ prev_nodbg(MachineBasicBlock::instr_iterator(StoreInst),
+ LoadInst->getParent()->instr_begin())
+ .getNodePtr();
if (LoadBase.isReg()) {
MachineInstr *LastLoad = LoadInst->getPrevNode();
// If the original load and store to xmm/ymm were consecutive
@@ -550,11 +551,8 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
if (StoreMI.getParent() == MI.getParent() &&
isPotentialBlockedMemCpyPair(MI.getOpcode(), StoreMI.getOpcode()) &&
isRelevantAddressingMode(&MI) &&
- isRelevantAddressingMode(&StoreMI)) {
- assert(MI.hasOneMemOperand() &&
- "Expected one memory operand for load instruction");
- assert(StoreMI.hasOneMemOperand() &&
- "Expected one memory operand for store instruction");
+ isRelevantAddressingMode(&StoreMI) &&
+ MI.hasOneMemOperand() && StoreMI.hasOneMemOperand()) {
if (!alias(**MI.memoperands_begin(), **StoreMI.memoperands_begin()))
BlockedLoadsStoresPairs.push_back(std::make_pair(&MI, &StoreMI));
}
@@ -563,7 +561,7 @@ void X86AvoidSFBPass::findPotentiallylBlockedCopies(MachineFunction &MF) {
}
unsigned X86AvoidSFBPass::getRegSizeInBytes(MachineInstr *LoadInst) {
- auto TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
+ const auto *TRC = TII->getRegClass(TII->get(LoadInst->getOpcode()), 0, TRI,
*LoadInst->getParent()->getParent());
return TRI->getRegSizeInBits(*TRC) / 8;
}
@@ -616,8 +614,8 @@ void X86AvoidSFBPass::breakBlockedCopies(
static bool hasSameBaseOpValue(MachineInstr *LoadInst,
MachineInstr *StoreInst) {
- MachineOperand &LoadBase = getBaseOperand(LoadInst);
- MachineOperand &StoreBase = getBaseOperand(StoreInst);
+ const MachineOperand &LoadBase = getBaseOperand(LoadInst);
+ const MachineOperand &StoreBase = getBaseOperand(StoreInst);
if (LoadBase.isReg() != StoreBase.isReg())
return false;
if (LoadBase.isReg())
@@ -691,13 +689,12 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
SmallVector<MachineInstr *, 2> PotentialBlockers =
findPotentialBlockers(LoadInst);
- for (auto PBInst : PotentialBlockers) {
+ for (auto *PBInst : PotentialBlockers) {
if (!isPotentialBlockingStoreInst(PBInst->getOpcode(),
LoadInst->getOpcode()) ||
- !isRelevantAddressingMode(PBInst))
+ !isRelevantAddressingMode(PBInst) || !PBInst->hasOneMemOperand())
continue;
int64_t PBstDispImm = getDispOperand(PBInst).getImm();
- assert(PBInst->hasOneMemOperand() && "Expected One Memory Operand");
unsigned PBstSize = (*PBInst->memoperands_begin())->getSize();
// This check doesn't cover all cases, but it will suffice for now.
// TODO: take branch probability into consideration, if the blocking
@@ -727,7 +724,7 @@ bool X86AvoidSFBPass::runOnMachineFunction(MachineFunction &MF) {
ForRemoval.push_back(LoadInst);
ForRemoval.push_back(StoreInst);
}
- for (auto RemovedInst : ForRemoval) {
+ for (auto *RemovedInst : ForRemoval) {
RemovedInst->eraseFromParent();
}
ForRemoval.clear();
diff --git a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
index fb4f9e2901dc..0899783d5f60 100644
--- a/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
+++ b/llvm/lib/Target/X86/X86AvoidTrailingCall.cpp
@@ -6,10 +6,29 @@
//
//===----------------------------------------------------------------------===//
//
-// The Windows x64 unwinder has trouble unwinding the stack when a return
-// address points to the end of the function. This pass maintains the invariant
-// that every return address is inside the bounds of its parent function or
-// funclet by inserting int3 if the last instruction would otherwise be a call.
+// The Windows x64 unwinder decodes the instruction stream during unwinding.
+// The unwinder decodes forward from the current PC to detect epilogue code
+// patterns.
+//
+// First, this means that there must be an instruction after every
+// call instruction for the unwinder to decode. LLVM must maintain the invariant
+// that the last instruction of a function or funclet is not a call, or the
+// unwinder may decode into the next function. Similarly, a call may not
+// immediately precede an epilogue code pattern. As of this writing, the
+// SEH_Epilogue pseudo instruction takes care of that.
+//
+// Second, all non-tail call jump targets must be within the *half-open*
+// interval of the bounds of the function. The unwinder distinguishes between
+// internal jump instructions and tail calls in an epilogue sequence by checking
+// the jump target against the function bounds from the .pdata section. This
+// means that the last regular MBB of an LLVM function must not be empty if
+// there are regular jumps targeting it.
+//
+// This pass upholds these invariants by ensuring that blocks at the end of a
+// function or funclet are a) not empty and b) do not end in a CALL instruction.
+//
+// Unwinder implementation for reference:
+// https://github.com/dotnet/coreclr/blob/a9f3fc16483eecfc47fb79c362811d870be02249/src/unwinder/amd64/unwinder_amd64.cpp#L1015
//
//===----------------------------------------------------------------------===//
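A minimal sketch of the first invariant above, assuming a Win64 target with unwind info; the function and callee names are hypothetical and not part of this patch. A call to a noreturn function at the end of a function body is the typical way a trailing CALL appears, and the pass would then append an int3 so the return address stays inside the function bounds:

// Hypothetical illustration: never_returns() is assumed to be [[noreturn]], so
// nothing has to follow the call, and the CALL can end up as the last
// instruction emitted for trailing_call(). This pass inserts an int3 after it.
[[noreturn]] void never_returns();

void trailing_call(int x) {
  if (x)
    never_returns();
}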
@@ -18,33 +37,35 @@
#include "X86Subtarget.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
-#define DEBUG_TYPE "x86-avoid-trailing-call"
+#define AVOIDCALL_DESC "X86 avoid trailing call pass"
+#define AVOIDCALL_NAME "x86-avoid-trailing-call"
+
+#define DEBUG_TYPE AVOIDCALL_NAME
using namespace llvm;
namespace {
-
class X86AvoidTrailingCallPass : public MachineFunctionPass {
public:
X86AvoidTrailingCallPass() : MachineFunctionPass(ID) {}
bool runOnMachineFunction(MachineFunction &MF) override;
-private:
- StringRef getPassName() const override {
- return "X86 avoid trailing call pass";
- }
static char ID;
+
+private:
+ StringRef getPassName() const override { return AVOIDCALL_DESC; }
};
+} // end anonymous namespace
char X86AvoidTrailingCallPass::ID = 0;
-} // end anonymous namespace
-
FunctionPass *llvm::createX86AvoidTrailingCallPass() {
return new X86AvoidTrailingCallPass();
}
+INITIALIZE_PASS(X86AvoidTrailingCallPass, AVOIDCALL_NAME, AVOIDCALL_DESC, false, false)
+
// A real instruction is a non-meta, non-pseudo instruction. Some pseudos
// expand to nothing, and some expand to code. This logic conservatively assumes
// they might expand to nothing.
@@ -62,6 +83,11 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
const X86InstrInfo &TII = *STI.getInstrInfo();
assert(STI.isTargetWin64() && "pass only runs on Win64");
+ // We don't need to worry about any of the invariants described above if there
+ // is no unwind info (CFI).
+ if (!MF.hasWinCFI())
+ return false;
+
// FIXME: Perhaps this pass should also replace SEH_Epilogue by inserting nops
// before epilogues.
@@ -73,33 +99,34 @@ bool X86AvoidTrailingCallPass::runOnMachineFunction(MachineFunction &MF) {
if (NextMBB && !NextMBB->isEHFuncletEntry())
continue;
- // Find the last real instruction in this block, or previous blocks if this
- // block is empty.
- MachineBasicBlock::reverse_iterator LastRealInstr;
- for (MachineBasicBlock &RMBB :
- make_range(MBB.getReverseIterator(), MF.rend())) {
- LastRealInstr = llvm::find_if(reverse(RMBB), isRealInstruction);
- if (LastRealInstr != RMBB.rend())
- break;
- }
-
- // Do nothing if this function or funclet has no instructions.
- if (LastRealInstr == MF.begin()->rend())
- continue;
+ // Find the last real instruction in this block.
+ auto LastRealInstr = llvm::find_if(reverse(MBB), isRealInstruction);
- // If this is a call instruction, insert int3 right after it with the same
- // DebugLoc. Convert back to a forward iterator and advance the insertion
- // position once.
- if (isCallInstruction(*LastRealInstr)) {
+ // If the block is empty or the last real instruction is a call instruction,
+ // insert an int3. If there is a call instruction, insert the int3 between
+ // the call and any labels or other meta instructions. If the block is
+ // empty, insert at block end.
+ bool IsEmpty = LastRealInstr == MBB.rend();
+ bool IsCall = !IsEmpty && isCallInstruction(*LastRealInstr);
+ if (IsEmpty || IsCall) {
LLVM_DEBUG({
- dbgs() << "inserting int3 after trailing call instruction:\n";
- LastRealInstr->dump();
- dbgs() << '\n';
+ if (IsCall) {
+ dbgs() << "inserting int3 after trailing call instruction:\n";
+ LastRealInstr->dump();
+ dbgs() << '\n';
+ } else {
+ dbgs() << "inserting int3 in trailing empty MBB:\n";
+ MBB.dump();
+ }
});
- MachineBasicBlock::iterator MBBI = std::next(LastRealInstr.getReverse());
- BuildMI(*LastRealInstr->getParent(), MBBI, LastRealInstr->getDebugLoc(),
- TII.get(X86::INT3));
+ MachineBasicBlock::iterator MBBI = MBB.end();
+ DebugLoc DL;
+ if (IsCall) {
+ MBBI = std::next(LastRealInstr.getReverse());
+ DL = LastRealInstr->getDebugLoc();
+ }
+ BuildMI(MBB, MBBI, DL, TII.get(X86::INT3));
Changed = true;
}
}
diff --git a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
index f8faa572dffc..caa1f7952475 100644
--- a/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
+++ b/llvm/lib/Target/X86/X86CallFrameOptimization.cpp
@@ -17,6 +17,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
#include "X86FrameLowering.h"
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
@@ -162,14 +163,13 @@ bool X86CallFrameOptimization::isLegal(MachineFunction &MF) {
// memory for arguments.
unsigned FrameSetupOpcode = TII->getCallFrameSetupOpcode();
unsigned FrameDestroyOpcode = TII->getCallFrameDestroyOpcode();
- bool UseStackProbe =
- !STI->getTargetLowering()->getStackProbeSymbolName(MF).empty();
+ bool EmitStackProbeCall = STI->getTargetLowering()->hasStackProbeSymbol(MF);
unsigned StackProbeSize = STI->getTargetLowering()->getStackProbeSize(MF);
for (MachineBasicBlock &BB : MF) {
bool InsideFrameSequence = false;
for (MachineInstr &MI : BB) {
if (MI.getOpcode() == FrameSetupOpcode) {
- if (TII->getFrameSize(MI) >= StackProbeSize && UseStackProbe)
+ if (TII->getFrameSize(MI) >= StackProbeSize && EmitStackProbeCall)
return false;
if (InsideFrameSequence)
return false;
@@ -199,7 +199,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
if (CannotReserveFrame)
return true;
- unsigned StackAlign = TFL->getStackAlignment();
+ Align StackAlign = TFL->getStackAlign();
int64_t Advantage = 0;
for (auto CC : CallSeqVector) {
@@ -222,7 +222,7 @@ bool X86CallFrameOptimization::isProfitable(MachineFunction &MF,
// We'll need an add after the call.
Advantage -= 3;
// If we have to realign the stack, we'll also need a sub before
- if (CC.ExpectedDist % StackAlign)
+ if (!isAligned(StackAlign, CC.ExpectedDist))
Advantage -= 3;
// Now, for each push, we save ~3 bytes. For small constants, we actually
// save more (up to 5 bytes), but 3 should be a good approximation.
@@ -531,6 +531,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
PushOpcode = Is64Bit ? X86::PUSH64i8 : X86::PUSH32i8;
}
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode)).add(PushOp);
+ Push->cloneMemRefs(MF, *Store);
break;
case X86::MOV32mr:
case X86::MOV64mr: {
@@ -550,7 +551,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
// If PUSHrmm is not slow on this target, try to fold the source of the
// push into the instruction.
- bool SlowPUSHrmm = STI->isAtom() || STI->isSLM();
+ bool SlowPUSHrmm = STI->slowTwoMemOps();
// Check that this is legal to fold. Right now, we're extremely
// conservative about that.
@@ -562,6 +563,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
unsigned NumOps = DefMov->getDesc().getNumOperands();
for (unsigned i = NumOps - X86::AddrNumOperands; i != NumOps; ++i)
Push->addOperand(DefMov->getOperand(i));
+ Push->cloneMergedMemRefs(MF, {&*DefMov, &*Store});
DefMov->eraseFromParent();
} else {
@@ -569,6 +571,7 @@ void X86CallFrameOptimization::adjustCallSequence(MachineFunction &MF,
Push = BuildMI(MBB, Context.Call, DL, TII->get(PushOpcode))
.addReg(Reg)
.getInstr();
+ Push->cloneMemRefs(MF, *Store);
}
break;
}
diff --git a/llvm/lib/Target/X86/X86CallLowering.cpp b/llvm/lib/Target/X86/X86CallLowering.cpp
index 57bf799cf89c..319dc9470604 100644
--- a/llvm/lib/Target/X86/X86CallLowering.cpp
+++ b/llvm/lib/Target/X86/X86CallLowering.cpp
@@ -108,17 +108,15 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
MachinePointerInfo &MPO) override {
LLT p0 = LLT::pointer(0, DL.getPointerSizeInBits(0));
LLT SType = LLT::scalar(DL.getPointerSizeInBits(0));
- Register SPReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildCopy(SPReg, STI.getRegisterInfo()->getStackRegister());
+ auto SPReg =
+ MIRBuilder.buildCopy(p0, STI.getRegisterInfo()->getStackRegister());
- Register OffsetReg = MRI.createGenericVirtualRegister(SType);
- MIRBuilder.buildConstant(OffsetReg, Offset);
+ auto OffsetReg = MIRBuilder.buildConstant(SType, Offset);
- Register AddrReg = MRI.createGenericVirtualRegister(p0);
- MIRBuilder.buildPtrAdd(AddrReg, SPReg, OffsetReg);
+ auto AddrReg = MIRBuilder.buildPtrAdd(p0, SPReg, OffsetReg);
MPO = MachinePointerInfo::getStack(MIRBuilder.getMF(), Offset);
- return AddrReg;
+ return AddrReg.getReg(0);
}
void assignValueToReg(Register ValVReg, Register PhysReg,
@@ -139,7 +137,7 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
if (PhysRegSize > ValSize && LocSize == ValSize) {
assert((PhysRegSize == 128 || PhysRegSize == 80) && "We expect that to be 128 bit");
auto MIB = MIRBuilder.buildAnyExt(LLT::scalar(PhysRegSize), ValVReg);
- ExtReg = MIB->getOperand(0).getReg();
+ ExtReg = MIB.getReg(0);
} else
ExtReg = extendRegister(ValVReg, VA);
@@ -148,10 +146,12 @@ struct OutgoingValueHandler : public CallLowering::ValueHandler {
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
+ MachineFunction &MF = MIRBuilder.getMF();
Register ExtReg = extendRegister(ValVReg, VA);
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
- MPO, MachineMemOperand::MOStore, VA.getLocVT().getStoreSize(),
- /* Alignment */ 1);
+
+ auto MMO = MF.getMachineMemOperand(MPO, MachineMemOperand::MOStore,
+ VA.getLocVT().getStoreSize(),
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildStore(ExtReg, Addr, *MMO);
}
@@ -240,17 +240,17 @@ struct IncomingValueHandler : public CallLowering::ValueHandler {
int FI = MFI.CreateFixedObject(Size, Offset, true);
MPO = MachinePointerInfo::getFixedStack(MIRBuilder.getMF(), FI);
- Register AddrReg = MRI.createGenericVirtualRegister(
- LLT::pointer(0, DL.getPointerSizeInBits(0)));
- MIRBuilder.buildFrameIndex(AddrReg, FI);
- return AddrReg;
+ return MIRBuilder
+ .buildFrameIndex(LLT::pointer(0, DL.getPointerSizeInBits(0)), FI)
+ .getReg(0);
}
void assignValueToAddress(Register ValVReg, Register Addr, uint64_t Size,
MachinePointerInfo &MPO, CCValAssign &VA) override {
- auto MMO = MIRBuilder.getMF().getMachineMemOperand(
+ MachineFunction &MF = MIRBuilder.getMF();
+ auto MMO = MF.getMachineMemOperand(
MPO, MachineMemOperand::MOLoad | MachineMemOperand::MOInvariant, Size,
- 1);
+ inferAlignFromPtrInfo(MF, MPO));
MIRBuilder.buildLoad(ValVReg, Addr, *MMO);
}
diff --git a/llvm/lib/Target/X86/X86CallLowering.h b/llvm/lib/Target/X86/X86CallLowering.h
index 444a0c7d0122..b5ea7782896b 100644
--- a/llvm/lib/Target/X86/X86CallLowering.h
+++ b/llvm/lib/Target/X86/X86CallLowering.h
@@ -14,12 +14,12 @@
#ifndef LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
#define LLVM_LIB_TARGET_X86_X86CALLLOWERING_H
-#include "llvm/ADT/ArrayRef.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
#include <functional>
namespace llvm {
+template <typename T> class ArrayRef;
class DataLayout;
class MachineRegisterInfo;
class X86TargetLowering;
diff --git a/llvm/lib/Target/X86/X86CallingConv.cpp b/llvm/lib/Target/X86/X86CallingConv.cpp
index aee344a26764..c899db60e016 100644
--- a/llvm/lib/Target/X86/X86CallingConv.cpp
+++ b/llvm/lib/Target/X86/X86CallingConv.cpp
@@ -60,7 +60,7 @@ static bool CC_X86_32_RegCall_Assign2Regs(unsigned &ValNo, MVT &ValVT,
State.addLoc(CCValAssign::getCustomReg(ValNo, ValVT, Reg, LocVT, LocInfo));
}
- // Successful in allocating regsiters - stop scanning next rules.
+ // Successful in allocating registers - stop scanning next rules.
return true;
}
@@ -166,7 +166,7 @@ static bool CC_X86_64_VectorCall(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
State.getMachineFunction().getSubtarget().getRegisterInfo();
if (TRI->regsOverlap(Reg, X86::XMM4) ||
TRI->regsOverlap(Reg, X86::XMM5))
- State.AllocateStack(8, 8);
+ State.AllocateStack(8, Align(8));
if (!ArgFlags.isHva()) {
State.addLoc(CCValAssign::getReg(ValNo, ValVT, Reg, LocVT, LocInfo));
@@ -281,7 +281,7 @@ static bool CC_X86_32_MCUInReg(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (UseRegs)
It.convertToReg(State.AllocateReg(RegList[FirstFree++]));
else
- It.convertToMem(State.AllocateStack(4, 4));
+ It.convertToMem(State.AllocateStack(4, Align(4)));
State.addLoc(It);
}
@@ -305,7 +305,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
if (ArgCount == 1 && ValNo == 0) {
// If we have one argument, the argument is five stack slots big, at fixed
// offset zero.
- Offset = State.AllocateStack(5 * SlotSize, 4);
+ Offset = State.AllocateStack(5 * SlotSize, Align(4));
} else if (ArgCount == 2 && ValNo == 0) {
// If we have two arguments, the stack slot is *after* the error code
// argument. Pretend it doesn't consume stack space, and account for it when
@@ -316,7 +316,7 @@ static bool CC_X86_Intr(unsigned &ValNo, MVT &ValVT, MVT &LocVT,
// appears first on the stack, and is then followed by the five slot
// interrupt struct.
Offset = 0;
- (void)State.AllocateStack(6 * SlotSize, 4);
+ (void)State.AllocateStack(6 * SlotSize, Align(4));
} else {
report_fatal_error("unsupported x86 interrupt prototype");
}
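A minimal sketch of the stack layout CC_X86_Intr models above for the two-argument (error code) form, assuming a 64-bit slot size; the struct name is illustrative only. The CPU pushes the error code last, so it appears first on the stack, followed by the five-slot interrupt frame:

#include <cstdint>

// Illustration only, lowest address first; these are the six slots that the
// AllocateStack(6 * SlotSize, Align(4)) call above accounts for.
struct HypotheticalIntrFrame64 {
  uint64_t ErrorCode; // pushed last by hardware, so first on the stack
  uint64_t RIP;       // start of the five-slot interrupt struct
  uint64_t CS;
  uint64_t RFLAGS;
  uint64_t RSP;
  uint64_t SS;
};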
diff --git a/llvm/lib/Target/X86/X86CallingConv.td b/llvm/lib/Target/X86/X86CallingConv.td
index db1aef2fd09d..802e694999b6 100644
--- a/llvm/lib/Target/X86/X86CallingConv.td
+++ b/llvm/lib/Target/X86/X86CallingConv.td
@@ -789,8 +789,9 @@ def CC_X86_32_Vector_Darwin : CallingConv<[
/// CC_X86_32_Common - In all X86-32 calling conventions, extra integers and FP
/// values are spilled on the stack.
def CC_X86_32_Common : CallingConv<[
- // Handles byval parameters.
+ // Handles byval/preallocated parameters.
CCIfByVal<CCPassByVal<4, 4>>,
+ CCIfPreallocated<CCPassByVal<4, 4>>,
// The first 3 float or double arguments, if marked 'inreg' and if the call
// is not a vararg call and if SSE2 is available, are passed in SSE registers.
@@ -1145,7 +1146,7 @@ def CSR_64_Intel_OCL_BI : CalleeSavedRegs<(add CSR_64,
def CSR_64_Intel_OCL_BI_AVX : CalleeSavedRegs<(add CSR_64,
(sequence "YMM%u", 8, 15))>;
-def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RDI, RSI, R14, R15,
+def CSR_64_Intel_OCL_BI_AVX512 : CalleeSavedRegs<(add RBX, RSI, R14, R15,
(sequence "ZMM%u", 16, 31),
K4, K5, K6, K7)>;
diff --git a/llvm/lib/Target/X86/X86CmovConversion.cpp b/llvm/lib/Target/X86/X86CmovConversion.cpp
index fe43bf4cbbce..fe5cb3ae2bf6 100644
--- a/llvm/lib/Target/X86/X86CmovConversion.cpp
+++ b/llvm/lib/Target/X86/X86CmovConversion.cpp
@@ -364,12 +364,13 @@ bool X86CmovConverterPass::collectCmovCandidates(
/// \param TrueOpDepth depth cost of CMOV true value operand.
/// \param FalseOpDepth depth cost of CMOV false value operand.
static unsigned getDepthOfOptCmov(unsigned TrueOpDepth, unsigned FalseOpDepth) {
- //===--------------------------------------------------------------------===//
- // With no info about branch weight, we assume 50% for each value operand.
- // Thus, depth of optimized CMOV instruction is the rounded up average of
- // its True-Operand-Value-Depth and False-Operand-Value-Depth.
- //===--------------------------------------------------------------------===//
- return (TrueOpDepth + FalseOpDepth + 1) / 2;
+ // The depth of the result after branch conversion is
+ // TrueOpDepth * TrueOpProbability + FalseOpDepth * FalseOpProbability.
+ // As we have no info about branch weight, we assume 75% for one and 25% for
+ // the other, and pick the result with the largest resulting depth.
+ return std::max(
+ divideCeil(TrueOpDepth * 3 + FalseOpDepth, 4),
+ divideCeil(FalseOpDepth * 3 + TrueOpDepth, 4));
}
bool X86CmovConverterPass::checkForProfitableCmovCandidates(
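A small standalone check of the arithmetic in getDepthOfOptCmov above, assuming example depths of 2 and 6; divideCeil is re-implemented locally so the snippet compiles on its own. The old rounded-up average gives 4, while the new 75/25 weighting gives max(ceil(12/4), ceil(20/4)) = 5:

#include <algorithm>
#include <cstdio>

// Local stand-in for llvm::divideCeil, for illustration only.
static unsigned divideCeilLocal(unsigned Num, unsigned Den) {
  return (Num + Den - 1) / Den;
}

int main() {
  unsigned TrueOpDepth = 2, FalseOpDepth = 6;             // assumed example depths
  unsigned Old = (TrueOpDepth + FalseOpDepth + 1) / 2;    // rounded-up average: 4
  unsigned New = std::max(divideCeilLocal(TrueOpDepth * 3 + FalseOpDepth, 4),
                          divideCeilLocal(FalseOpDepth * 3 + TrueOpDepth, 4)); // 5
  std::printf("old=%u new=%u\n", Old, New);
}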
diff --git a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
index 7051550d52e6..2ff8ee19561b 100644
--- a/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
+++ b/llvm/lib/Target/X86/X86DiscriminateMemOps.cpp
@@ -29,7 +29,7 @@ using namespace llvm;
static cl::opt<bool> EnableDiscriminateMemops(
DEBUG_TYPE, cl::init(false),
cl::desc("Generate unique debug info for each instruction with a memory "
- "operand. Should be enabled for profile-drived cache prefetching, "
+ "operand. Should be enabled for profile-driven cache prefetching, "
"both in the build of the binary being profiled, as well as in "
"the build of the binary consuming the profile."),
cl::Hidden);
diff --git a/llvm/lib/Target/X86/X86DomainReassignment.cpp b/llvm/lib/Target/X86/X86DomainReassignment.cpp
index 438b9fd8eebb..488ee51f1d89 100644
--- a/llvm/lib/Target/X86/X86DomainReassignment.cpp
+++ b/llvm/lib/Target/X86/X86DomainReassignment.cpp
@@ -283,7 +283,7 @@ public:
// A converter is identified by <destination domain, source opcode>
typedef std::pair<int, unsigned> InstrConverterBaseKeyTy;
-typedef DenseMap<InstrConverterBaseKeyTy, InstrConverterBase *>
+typedef DenseMap<InstrConverterBaseKeyTy, std::unique_ptr<InstrConverterBase>>
InstrConverterBaseMap;
/// A closure is a set of virtual registers representing all of the edges in
@@ -471,8 +471,8 @@ void X86DomainReassignment::encloseInstr(Closure &C, MachineInstr *MI) {
// instruction.
for (int i = 0; i != NumDomains; ++i) {
if (C.isLegal((RegDomain)i)) {
- InstrConverterBase *IC = Converters.lookup({i, MI->getOpcode()});
- if (!IC || !IC->isLegal(MI, TII))
+ auto I = Converters.find({i, MI->getOpcode()});
+ if (I == Converters.end() || !I->second->isLegal(MI, TII))
C.setIllegal((RegDomain)i);
}
}
@@ -484,8 +484,8 @@ double X86DomainReassignment::calculateCost(const Closure &C,
double Cost = 0.0;
for (auto *MI : C.instructions())
- Cost +=
- Converters.lookup({DstDomain, MI->getOpcode()})->getExtraCost(MI, MRI);
+ Cost += Converters.find({DstDomain, MI->getOpcode()})
+ ->second->getExtraCost(MI, MRI);
return Cost;
}
@@ -501,8 +501,8 @@ void X86DomainReassignment::reassign(const Closure &C, RegDomain Domain) const {
// appropriate converter.
SmallVector<MachineInstr *, 8> ToErase;
for (auto *MI : C.instructions())
- if (Converters.lookup({Domain, MI->getOpcode()})
- ->convertInstr(MI, TII, MRI))
+ if (Converters.find({Domain, MI->getOpcode()})
+ ->second->convertInstr(MI, TII, MRI))
ToErase.push_back(MI);
// Iterate all registers in the closure, replace them with registers in the
@@ -606,19 +606,21 @@ void X86DomainReassignment::buildClosure(Closure &C, unsigned Reg) {
void X86DomainReassignment::initConverters() {
Converters[{MaskDomain, TargetOpcode::PHI}] =
- new InstrIgnore(TargetOpcode::PHI);
+ std::make_unique<InstrIgnore>(TargetOpcode::PHI);
Converters[{MaskDomain, TargetOpcode::IMPLICIT_DEF}] =
- new InstrIgnore(TargetOpcode::IMPLICIT_DEF);
+ std::make_unique<InstrIgnore>(TargetOpcode::IMPLICIT_DEF);
Converters[{MaskDomain, TargetOpcode::INSERT_SUBREG}] =
- new InstrReplaceWithCopy(TargetOpcode::INSERT_SUBREG, 2);
+ std::make_unique<InstrReplaceWithCopy>(TargetOpcode::INSERT_SUBREG, 2);
Converters[{MaskDomain, TargetOpcode::COPY}] =
- new InstrCOPYReplacer(TargetOpcode::COPY, MaskDomain, TargetOpcode::COPY);
+ std::make_unique<InstrCOPYReplacer>(TargetOpcode::COPY, MaskDomain,
+ TargetOpcode::COPY);
auto createReplacerDstCOPY = [&](unsigned From, unsigned To) {
- Converters[{MaskDomain, From}] = new InstrReplacerDstCOPY(From, To);
+ Converters[{MaskDomain, From}] =
+ std::make_unique<InstrReplacerDstCOPY>(From, To);
};
createReplacerDstCOPY(X86::MOVZX32rm16, X86::KMOVWkm);
@@ -638,7 +640,7 @@ void X86DomainReassignment::initConverters() {
}
auto createReplacer = [&](unsigned From, unsigned To) {
- Converters[{MaskDomain, From}] = new InstrReplacer(From, To);
+ Converters[{MaskDomain, From}] = std::make_unique<InstrReplacer>(From, To);
};
createReplacer(X86::MOV16rm, X86::KMOVWkm);
@@ -779,8 +781,6 @@ bool X86DomainReassignment::runOnMachineFunction(MachineFunction &MF) {
}
}
- DeleteContainerSeconds(Converters);
-
LLVM_DEBUG(
dbgs() << "***** Machine Function after Domain Reassignment *****\n");
LLVM_DEBUG(MF.print(dbgs()));
diff --git a/llvm/lib/Target/X86/X86EvexToVex.cpp b/llvm/lib/Target/X86/X86EvexToVex.cpp
index f1cf9b94c9e5..540ad98b6d54 100755
--- a/llvm/lib/Target/X86/X86EvexToVex.cpp
+++ b/llvm/lib/Target/X86/X86EvexToVex.cpp
@@ -237,11 +237,9 @@ bool EvexToVexInstPass::CompressEvexToVexImpl(MachineInstr &MI) const {
// Make sure the tables are sorted.
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(X86EvexToVex128CompressTable),
- std::end(X86EvexToVex128CompressTable)) &&
+ assert(llvm::is_sorted(X86EvexToVex128CompressTable) &&
"X86EvexToVex128CompressTable is not sorted!");
- assert(std::is_sorted(std::begin(X86EvexToVex256CompressTable),
- std::end(X86EvexToVex256CompressTable)) &&
+ assert(llvm::is_sorted(X86EvexToVex256CompressTable) &&
"X86EvexToVex256CompressTable is not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
diff --git a/llvm/lib/Target/X86/X86ExpandPseudo.cpp b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
index d35d65914b34..c47ef4708e91 100644
--- a/llvm/lib/Target/X86/X86ExpandPseudo.cpp
+++ b/llvm/lib/Target/X86/X86ExpandPseudo.cpp
@@ -275,7 +275,10 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MachineInstr &NewMI = *std::prev(MBBI);
NewMI.copyImplicitOps(*MBBI->getParent()->getParent(), *MBBI);
- MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
+
+ // Update the call site info.
+ if (MBBI->isCandidateForCallSiteEntry())
+ MBB.getParent()->moveCallSiteInfo(&*MBBI, &NewMI);
// Delete the pseudo instruction TCRETURN.
MBB.erase(MBBI);
@@ -331,14 +334,6 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBB.erase(MBBI);
return true;
}
- case X86::EH_RESTORE: {
- // Restore ESP and EBP, and optionally ESI if required.
- bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(
- MBB.getParent()->getFunction().getPersonalityFn()));
- X86FL->restoreWin32EHStackPointers(MBB, MBBI, DL, /*RestoreSP=*/IsSEH);
- MBBI->eraseFromParent();
- return true;
- }
case X86::LCMPXCHG8B_SAVE_EBX:
case X86::LCMPXCHG16B_SAVE_RBX: {
// Perform the following transformation.
@@ -371,6 +366,82 @@ bool X86ExpandPseudo::ExpandMI(MachineBasicBlock &MBB,
MBBI->eraseFromParent();
return true;
}
+ // Loading/storing mask pairs requires two kmov operations. The second of
+ // these needs a 2-byte displacement relative to the specified address (with
+ // a 32-bit spill size). Mask pairs from 1-bit up to 16-bit masks all use the
+ // same spill size; they are all stored using MASKPAIR16STORE and loaded
+ // using MASKPAIR16LOAD.
+ //
+ // The displacement value might in theory wrap around, hence the asserts in
+ // both cases.
+ case X86::MASKPAIR16LOAD: {
+ int64_t Disp = MBBI->getOperand(1 + X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ Register Reg = MBBI->getOperand(0).getReg();
+ bool DstIsDead = MBBI->getOperand(0).isDead();
+ Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1);
+
+ auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm))
+ .addReg(Reg0, RegState::Define | getDeadRegState(DstIsDead));
+ auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWkm))
+ .addReg(Reg1, RegState::Define | getDeadRegState(DstIsDead));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(1 + i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + 2);
+ else
+ MIBHi.add(MBBI->getOperand(1 + i));
+ }
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2);
+ MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
+ case X86::MASKPAIR16STORE: {
+ int64_t Disp = MBBI->getOperand(X86::AddrDisp).getImm();
+ assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
+ Register Reg = MBBI->getOperand(X86::AddrNumOperands).getReg();
+ bool SrcIsKill = MBBI->getOperand(X86::AddrNumOperands).isKill();
+ Register Reg0 = TRI->getSubReg(Reg, X86::sub_mask_0);
+ Register Reg1 = TRI->getSubReg(Reg, X86::sub_mask_1);
+
+ auto MIBLo = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk));
+ auto MIBHi = BuildMI(MBB, MBBI, DL, TII->get(X86::KMOVWmk));
+
+ for (int i = 0; i < X86::AddrNumOperands; ++i) {
+ MIBLo.add(MBBI->getOperand(i));
+ if (i == X86::AddrDisp)
+ MIBHi.addImm(Disp + 2);
+ else
+ MIBHi.add(MBBI->getOperand(i));
+ }
+ MIBLo.addReg(Reg0, getKillRegState(SrcIsKill));
+ MIBHi.addReg(Reg1, getKillRegState(SrcIsKill));
+
+ // Split the memory operand, adjusting the offset and size for the halves.
+ MachineMemOperand *OldMMO = MBBI->memoperands().front();
+ MachineFunction *MF = MBB.getParent();
+ MachineMemOperand *MMOLo = MF->getMachineMemOperand(OldMMO, 0, 2);
+ MachineMemOperand *MMOHi = MF->getMachineMemOperand(OldMMO, 2, 2);
+
+ MIBLo.setMemRefs(MMOLo);
+ MIBHi.setMemRefs(MMOHi);
+
+ // Delete the pseudo.
+ MBB.erase(MBBI);
+ return true;
+ }
case TargetOpcode::ICALL_BRANCH_FUNNEL:
ExpandICallBranchFunnel(&MBB, MBBI);
return true;
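A quick sketch of the displacement and memory-operand arithmetic used by the MASKPAIR16LOAD/MASKPAIR16STORE expansions above, with an assumed displacement of 16 for the 32-bit spill slot: the low KMOVW addresses the slot directly, the high KMOVW addresses it at +2, and the original 4-byte memory operand is split into two 2-byte halves at offsets 0 and 2:

#include <cstdint>
#include <cstdio>

int main() {
  int64_t Disp = 16;               // assumed displacement of the 32-bit spill slot
  int64_t LoDisp = Disp;           // first KMOVW: low 16-bit mask
  int64_t HiDisp = Disp + 2;       // second KMOVW: high 16-bit mask
  unsigned LoOff = 0, HiOff = 2;   // offsets used to split the memory operand
  unsigned HalfSize = 2;           // each half is 2 bytes
  std::printf("lo: disp=%lld off=%u  hi: disp=%lld off=%u  size=%u\n",
              (long long)LoDisp, LoOff, (long long)HiDisp, HiOff, HalfSize);
}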
diff --git a/llvm/lib/Target/X86/X86FastISel.cpp b/llvm/lib/Target/X86/X86FastISel.cpp
index 1dbf40683564..b305940139c0 100644
--- a/llvm/lib/Target/X86/X86FastISel.cpp
+++ b/llvm/lib/Target/X86/X86FastISel.cpp
@@ -26,7 +26,6 @@
#include "llvm/CodeGen/MachineConstantPool.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineRegisterInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DebugInfo.h"
#include "llvm/IR/DerivedTypes.h"
@@ -498,7 +497,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, unsigned ValReg, bool ValIsKill,
default: return false;
case MVT::i1: {
// Mask out all but lowest bit.
- unsigned AndResult = createResultReg(&X86::GR8RegClass);
+ Register AndResult = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::AND8ri), AndResult)
.addReg(ValReg, getKillRegState(ValIsKill)).addImm(1);
@@ -691,7 +690,7 @@ bool X86FastISel::X86FastEmitStore(EVT VT, const Value *Val,
}
}
- unsigned ValReg = getRegForValue(Val);
+ Register ValReg = getRegForValue(Val);
if (ValReg == 0)
return false;
@@ -761,9 +760,9 @@ bool X86FastISel::handleConstantAddresses(const Value *V, X86AddressMode &AM) {
// Ok, we need to do a load from a stub. If we've already loaded from
// this stub, reuse the loaded pointer, otherwise emit the load now.
- DenseMap<const Value *, unsigned>::iterator I = LocalValueMap.find(V);
- unsigned LoadReg;
- if (I != LocalValueMap.end() && I->second != 0) {
+ DenseMap<const Value *, Register>::iterator I = LocalValueMap.find(V);
+ Register LoadReg;
+ if (I != LocalValueMap.end() && I->second) {
LoadReg = I->second;
} else {
// Issue load from stub.
@@ -1128,10 +1127,8 @@ bool X86FastISel::X86SelectStore(const Instruction *I) {
if (!isTypeLegal(Val->getType(), VT, /*AllowI1=*/true))
return false;
- unsigned Alignment = S->getAlignment();
- unsigned ABIAlignment = DL.getABITypeAlignment(Val->getType());
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = ABIAlignment;
+ Align Alignment = S->getAlign();
+ Align ABIAlignment = DL.getABITypeAlign(Val->getType());
bool Aligned = Alignment >= ABIAlignment;
X86AddressMode AM;
@@ -1196,7 +1193,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
const Value *RV = Ret->getOperand(0);
- unsigned Reg = getRegForValue(RV);
+ Register Reg = getRegForValue(RV);
if (Reg == 0)
return false;
@@ -1264,7 +1261,7 @@ bool X86FastISel::X86SelectRet(const Instruction *I) {
// We saved the argument into a virtual register in the entry block,
// so now we copy the value out and into %rax/%eax.
if (F.hasStructRetAttr() && CC != CallingConv::Swift) {
- unsigned Reg = X86MFInfo->getSRetReturnReg();
+ Register Reg = X86MFInfo->getSRetReturnReg();
assert(Reg &&
"SRetReturnReg should have been set in LowerFormalArguments()!");
unsigned RetReg = Subtarget->isTarget64BitLP64() ? X86::RAX : X86::EAX;
@@ -1322,14 +1319,9 @@ bool X86FastISel::X86SelectLoad(const Instruction *I) {
if (!X86SelectAddress(Ptr, AM))
return false;
- unsigned Alignment = LI->getAlignment();
- unsigned ABIAlignment = DL.getABITypeAlignment(LI->getType());
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = ABIAlignment;
-
unsigned ResultReg = 0;
if (!X86FastEmitLoad(VT, AM, createMachineMemOperandFor(LI), ResultReg,
- Alignment))
+ LI->getAlign().value()))
return false;
updateValueMap(I, ResultReg);
@@ -1392,7 +1384,7 @@ static unsigned X86ChooseCmpImmediateOpcode(EVT VT, const ConstantInt *RHSC) {
bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
const DebugLoc &CurDbgLoc) {
- unsigned Op0Reg = getRegForValue(Op0);
+ Register Op0Reg = getRegForValue(Op0);
if (Op0Reg == 0) return false;
// Handle 'null' like i32/i64 0.
@@ -1414,7 +1406,7 @@ bool X86FastISel::X86FastEmitCompare(const Value *Op0, const Value *Op1, EVT VT,
unsigned CompareOpc = X86ChooseCmpOpcode(VT, Subtarget);
if (CompareOpc == 0) return false;
- unsigned Op1Reg = getRegForValue(Op1);
+ Register Op1Reg = getRegForValue(Op1);
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, CurDbgLoc, TII.get(CompareOpc))
.addReg(Op0Reg)
@@ -1487,8 +1479,8 @@ bool X86FastISel::X86SelectCmp(const Instruction *I) {
if (!X86FastEmitCompare(LHS, RHS, VT, I->getDebugLoc()))
return false;
- unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
- unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg1 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg1).addImm(SETFOpc[0]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
@@ -1522,7 +1514,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
if (!TLI.isTypeLegal(DstVT))
return false;
- unsigned ResultReg = getRegForValue(I->getOperand(0));
+ Register ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
@@ -1548,7 +1540,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
default: llvm_unreachable("Unexpected zext to i64 source type");
}
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ Register Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(MovInst), Result32)
.addReg(ResultReg);
@@ -1559,7 +1551,7 @@ bool X86FastISel::X86SelectZExt(const Instruction *I) {
} else if (DstVT == MVT::i16) {
// i8->i16 doesn't exist in the autogenerated isel table. Need to zero
// extend to 32-bits and then extract down to 16-bits.
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ Register Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVZX32rr8),
Result32).addReg(ResultReg);
@@ -1581,7 +1573,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
if (!TLI.isTypeLegal(DstVT))
return false;
- unsigned ResultReg = getRegForValue(I->getOperand(0));
+ Register ResultReg = getRegForValue(I->getOperand(0));
if (ResultReg == 0)
return false;
@@ -1589,7 +1581,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
MVT SrcVT = TLI.getSimpleValueType(DL, I->getOperand(0)->getType());
if (SrcVT == MVT::i1) {
// Set the high bits to zero.
- unsigned ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
+ Register ZExtReg = fastEmitZExtFromI1(MVT::i8, ResultReg,
/*TODO: Kill=*/false);
if (ZExtReg == 0)
return false;
@@ -1605,7 +1597,7 @@ bool X86FastISel::X86SelectSExt(const Instruction *I) {
if (DstVT == MVT::i16) {
// i8->i16 doesn't exist in the autogenerated isel table. Need to sign
// extend to 32-bits and then extract down to 16-bits.
- unsigned Result32 = createResultReg(&X86::GR32RegClass);
+ Register Result32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOVSX32rr8),
Result32).addReg(ResultReg);
@@ -1720,7 +1712,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
case MVT::i64: TestOpc = X86::TEST64ri32; break;
}
if (TestOpc) {
- unsigned OpReg = getRegForValue(TI->getOperand(0));
+ Register OpReg = getRegForValue(TI->getOperand(0));
if (OpReg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TestOpc))
@@ -1742,7 +1734,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
} else if (foldX86XALUIntrinsic(CC, BI, BI->getCondition())) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
- unsigned TmpReg = getRegForValue(BI->getCondition());
+ Register TmpReg = getRegForValue(BI->getCondition());
if (TmpReg == 0)
return false;
@@ -1755,7 +1747,7 @@ bool X86FastISel::X86SelectBranch(const Instruction *I) {
// Otherwise do a clumsy setcc and re-test it.
// Note that i1 essentially gets ANY_EXTEND'ed to i8 where it isn't used
// in an explicit cast, so make sure to handle that correctly.
- unsigned OpReg = getRegForValue(BI->getCondition());
+ Register OpReg = getRegForValue(BI->getCondition());
if (OpReg == 0) return false;
// In case OpReg is a K register, COPY to a GPR
@@ -1824,10 +1816,10 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
if (!isTypeLegal(I->getType(), VT))
return false;
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ Register Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0) return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0) return false;
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpcode::COPY),
CReg).addReg(Op1Reg);
@@ -1839,7 +1831,7 @@ bool X86FastISel::X86SelectShift(const Instruction *I) {
TII.get(TargetOpcode::KILL), X86::CL)
.addReg(CReg, RegState::Kill);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(OpReg), ResultReg)
.addReg(Op0Reg);
updateValueMap(I, ResultReg);
@@ -1933,10 +1925,10 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
const DivRemEntry &TypeEntry = OpTable[TypeIndex];
const DivRemEntry::DivRemResult &OpEntry = TypeEntry.ResultTable[OpIndex];
- unsigned Op0Reg = getRegForValue(I->getOperand(0));
+ Register Op0Reg = getRegForValue(I->getOperand(0));
if (Op0Reg == 0)
return false;
- unsigned Op1Reg = getRegForValue(I->getOperand(1));
+ Register Op1Reg = getRegForValue(I->getOperand(1));
if (Op1Reg == 0)
return false;
@@ -1949,7 +1941,7 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(OpEntry.OpSignExtend));
else {
- unsigned Zero32 = createResultReg(&X86::GR32RegClass);
+ Register Zero32 = createResultReg(&X86::GR32RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(X86::MOV32r0), Zero32);
@@ -1986,8 +1978,8 @@ bool X86FastISel::X86SelectDivRem(const Instruction *I) {
if ((I->getOpcode() == Instruction::SRem ||
I->getOpcode() == Instruction::URem) &&
OpEntry.DivRemResultReg == X86::AH && Subtarget->is64Bit()) {
- unsigned SourceSuperReg = createResultReg(&X86::GR16RegClass);
- unsigned ResultSuperReg = createResultReg(&X86::GR16RegClass);
+ Register SourceSuperReg = createResultReg(&X86::GR16RegClass);
+ Register ResultSuperReg = createResultReg(&X86::GR16RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Copy), SourceSuperReg).addReg(X86::AX);
@@ -2066,15 +2058,15 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
return false;
if (SETFOpc) {
- unsigned FlagReg1 = createResultReg(&X86::GR8RegClass);
- unsigned FlagReg2 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg1 = createResultReg(&X86::GR8RegClass);
+ Register FlagReg2 = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg1).addImm(SETFOpc[0]);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
FlagReg2).addImm(SETFOpc[1]);
auto const &II = TII.get(SETFOpc[2]);
if (II.getNumDefs()) {
- unsigned TmpReg = createResultReg(&X86::GR8RegClass);
+ Register TmpReg = createResultReg(&X86::GR8RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, II, TmpReg)
.addReg(FlagReg2).addReg(FlagReg1);
} else {
@@ -2086,7 +2078,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
} else if (foldX86XALUIntrinsic(CC, I, Cond)) {
// Fake request the condition, otherwise the intrinsic might be completely
// optimized away.
- unsigned TmpReg = getRegForValue(Cond);
+ Register TmpReg = getRegForValue(Cond);
if (TmpReg == 0)
return false;
@@ -2099,7 +2091,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
// accurate. If we read more than the lsb, we may see non-zero values
// whereas lsb is zero. Therefore, we have to truncate Op0Reg to i1 for
// the select. This is achieved by performing TEST against 1.
- unsigned CondReg = getRegForValue(Cond);
+ Register CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
@@ -2122,10 +2114,10 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
if (!LHSReg || !RHSReg)
@@ -2133,7 +2125,7 @@ bool X86FastISel::X86FastEmitCMoveSelect(MVT RetVT, const Instruction *I) {
const TargetRegisterInfo &TRI = *Subtarget->getRegisterInfo();
unsigned Opc = X86::getCMovOpcode(TRI.getRegSizeInBits(*RC)/8);
- unsigned ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
+ Register ResultReg = fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
@@ -2182,19 +2174,19 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
- unsigned CmpLHSReg = getRegForValue(CmpLHS);
+ Register CmpLHSReg = getRegForValue(CmpLHS);
bool CmpLHSIsKill = hasTrivialKill(CmpLHS);
- unsigned CmpRHSReg = getRegForValue(CmpRHS);
+ Register CmpRHSReg = getRegForValue(CmpRHS);
bool CmpRHSIsKill = hasTrivialKill(CmpRHS);
- if (!LHSReg || !RHSReg || !CmpLHS || !CmpRHS)
+ if (!LHSReg || !RHSReg || !CmpLHSReg || !CmpRHSReg)
return false;
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
@@ -2207,12 +2199,12 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned CmpOpcode =
(RetVT == MVT::f32) ? X86::VCMPSSZrr : X86::VCMPSDZrr;
- unsigned CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, VK1, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
// Need an IMPLICIT_DEF for the input that is used to generate the upper
// bits of the result register since it's not based on any of the inputs.
- unsigned ImplicitDefReg = createResultReg(VR128X);
+ Register ImplicitDefReg = createResultReg(VR128X);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
@@ -2241,9 +2233,9 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
unsigned BlendOpcode =
(RetVT == MVT::f32) ? X86::VBLENDVPSrr : X86::VBLENDVPDrr;
- unsigned CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
+ Register CmpReg = fastEmitInst_rri(CmpOpcode, RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- unsigned VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
+ Register VBlendReg = fastEmitInst_rrr(BlendOpcode, VR128, RHSReg, RHSIsKill,
LHSReg, LHSIsKill, CmpReg, true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2263,13 +2255,13 @@ bool X86FastISel::X86FastEmitSSESelect(MVT RetVT, const Instruction *I) {
}
const TargetRegisterClass *VR128 = &X86::VR128RegClass;
- unsigned CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
+ Register CmpReg = fastEmitInst_rri(Opc[0], RC, CmpLHSReg, CmpLHSIsKill,
CmpRHSReg, CmpRHSIsKill, CC);
- unsigned AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
+ Register AndReg = fastEmitInst_rr(Opc[1], VR128, CmpReg, /*IsKill=*/false,
LHSReg, LHSIsKill);
- unsigned AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
+ Register AndNReg = fastEmitInst_rr(Opc[2], VR128, CmpReg, /*IsKill=*/true,
RHSReg, RHSIsKill);
- unsigned OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
+ Register OrReg = fastEmitInst_rr(Opc[3], VR128, AndNReg, /*IsKill=*/true,
AndReg, /*IsKill=*/true);
ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
@@ -2317,7 +2309,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
if (!X86FastEmitCompare(CmpLHS, CmpRHS, CmpVT, CI->getDebugLoc()))
return false;
} else {
- unsigned CondReg = getRegForValue(Cond);
+ Register CondReg = getRegForValue(Cond);
if (CondReg == 0)
return false;
bool CondIsKill = hasTrivialKill(Cond);
@@ -2340,10 +2332,10 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const Value *LHS = I->getOperand(1);
const Value *RHS = I->getOperand(2);
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
bool LHSIsKill = hasTrivialKill(LHS);
- unsigned RHSReg = getRegForValue(RHS);
+ Register RHSReg = getRegForValue(RHS);
bool RHSIsKill = hasTrivialKill(RHS);
if (!LHSReg || !RHSReg)
@@ -2351,7 +2343,7 @@ bool X86FastISel::X86FastEmitPseudoSelect(MVT RetVT, const Instruction *I) {
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- unsigned ResultReg =
+ Register ResultReg =
fastEmitInst_rri(Opc, RC, RHSReg, RHSIsKill, LHSReg, LHSIsKill, CC);
updateValueMap(I, ResultReg);
return true;
@@ -2373,12 +2365,12 @@ bool X86FastISel::X86SelectSelect(const Instruction *I) {
}
// No need for a select anymore - this is an unconditional move.
if (Opnd) {
- unsigned OpReg = getRegForValue(Opnd);
+ Register OpReg = getRegForValue(Opnd);
if (OpReg == 0)
return false;
bool OpIsKill = hasTrivialKill(Opnd);
const TargetRegisterClass *RC = TLI.getRegClassFor(RetVT);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(OpReg, getKillRegState(OpIsKill));
@@ -2419,7 +2411,7 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
return false;
// Select integer to float/double conversion.
- unsigned OpReg = getRegForValue(I->getOperand(0));
+ Register OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
@@ -2448,10 +2440,10 @@ bool X86FastISel::X86SelectIntToFP(const Instruction *I, bool IsSigned) {
MVT DstVT = TLI.getValueType(DL, I->getType()).getSimpleVT();
const TargetRegisterClass *RC = TLI.getRegClassFor(DstVT);
- unsigned ImplicitDefReg = createResultReg(RC);
+ Register ImplicitDefReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
- unsigned ResultReg =
+ Register ResultReg =
fastEmitInst_rr(Opcode, RC, ImplicitDefReg, true, OpReg, false);
updateValueMap(I, ResultReg);
return true;
@@ -2474,7 +2466,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
"Instruction must be an FPExt or FPTrunc!");
bool HasAVX = Subtarget->hasAVX();
- unsigned OpReg = getRegForValue(I->getOperand(0));
+ Register OpReg = getRegForValue(I->getOperand(0));
if (OpReg == 0)
return false;
@@ -2486,7 +2478,7 @@ bool X86FastISel::X86SelectFPExtOrFPTrunc(const Instruction *I,
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(TargetOpc),
ResultReg);
@@ -2537,7 +2529,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
if (!TLI.isTypeLegal(SrcVT))
return false;
- unsigned InputReg = getRegForValue(I->getOperand(0));
+ Register InputReg = getRegForValue(I->getOperand(0));
if (!InputReg)
// Unhandled operand. Halt "fast" selection and bail.
return false;
@@ -2549,7 +2541,7 @@ bool X86FastISel::X86SelectTrunc(const Instruction *I) {
}
// Issue an extract_subreg.
- unsigned ResultReg = fastEmitInst_extractsubreg(MVT::i8,
+ Register ResultReg = fastEmitInst_extractsubreg(MVT::i8,
InputReg, false,
X86::sub_8bit);
if (!ResultReg)
@@ -2608,7 +2600,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
const Value *Op = II->getArgOperand(0);
- unsigned InputReg = getRegForValue(Op);
+ Register InputReg = getRegForValue(Op);
if (InputReg == 0)
return false;
@@ -2632,12 +2624,15 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// used to provide rounding control: use MXCSR.RC, encoded as 0b100.
// It's consistent with the other FP instructions, which are usually
// controlled by MXCSR.
- InputReg = fastEmitInst_ri(X86::VCVTPS2PHrr, RC, InputReg, false, 4);
+ unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPS2PHZ128rr
+ : X86::VCVTPS2PHrr;
+ InputReg = fastEmitInst_ri(Opc, RC, InputReg, false, 4);
// Move the lower 32-bits of ResultReg to another register of class GR32.
+ Opc = Subtarget->hasAVX512() ? X86::VMOVPDI2DIZrr
+ : X86::VMOVPDI2DIrr;
ResultReg = createResultReg(&X86::GR32RegClass);
- BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
- TII.get(X86::VMOVPDI2DIrr), ResultReg)
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(InputReg, RegState::Kill);
// The result value is in the lower 16-bits of ResultReg.
@@ -2645,19 +2640,21 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
ResultReg = fastEmitInst_extractsubreg(MVT::i16, ResultReg, true, RegIdx);
} else {
assert(Op->getType()->isIntegerTy(16) && "Expected a 16-bit integer!");
- // Explicitly sign-extend the input to 32-bit.
- InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::SIGN_EXTEND, InputReg,
+ // Explicitly zero-extend the input to 32-bit.
+ InputReg = fastEmit_r(MVT::i16, MVT::i32, ISD::ZERO_EXTEND, InputReg,
/*Kill=*/false);
// The following SCALAR_TO_VECTOR will be expanded into a VMOVDI2PDIrr.
InputReg = fastEmit_r(MVT::i32, MVT::v4i32, ISD::SCALAR_TO_VECTOR,
InputReg, /*Kill=*/true);
- InputReg = fastEmitInst_r(X86::VCVTPH2PSrr, RC, InputReg, /*Kill=*/true);
+ unsigned Opc = Subtarget->hasVLX() ? X86::VCVTPH2PSZ128rr
+ : X86::VCVTPH2PSrr;
+ InputReg = fastEmitInst_r(Opc, RC, InputReg, /*Kill=*/true);
// The result value is in the lower 32-bits of ResultReg.
// Emit an explicit copy from register class VR128 to register class FR32.
- ResultReg = createResultReg(&X86::FR32RegClass);
+ ResultReg = createResultReg(TLI.getRegClassFor(MVT::f32));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(InputReg, RegState::Kill);
@@ -2700,7 +2697,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
// Always make a copy of the frame register to a vreg first, so that we
// never directly reference the frame register (the TwoAddressInstruction-
// Pass doesn't like that).
- unsigned SrcReg = createResultReg(RC);
+ Register SrcReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), SrcReg).addReg(FrameReg);
@@ -2830,7 +2827,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
}
const Value *SrcVal = II->getArgOperand(0);
- unsigned SrcReg = getRegForValue(SrcVal);
+ Register SrcReg = getRegForValue(SrcVal);
if (SrcReg == 0)
return false;
@@ -2843,7 +2840,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
TII.get(TargetOpcode::IMPLICIT_DEF), ImplicitDefReg);
}
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
MachineInstrBuilder MIB;
MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc),
ResultReg);
@@ -2903,7 +2900,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
BaseOpc = X86ISD::UMUL; CondCode = X86::COND_O; break;
}
- unsigned LHSReg = getRegForValue(LHS);
+ Register LHSReg = getRegForValue(LHS);
if (LHSReg == 0)
return false;
bool LHSIsKill = hasTrivialKill(LHS);
@@ -2974,7 +2971,7 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
return false;
// Assign to a GPR since the overflow return value is lowered to a SETcc.
- unsigned ResultReg2 = createResultReg(&X86::GR8RegClass);
+ Register ResultReg2 = createResultReg(&X86::GR8RegClass);
assert((ResultReg+1) == ResultReg2 && "Nonconsecutive result registers.");
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::SETCCr),
ResultReg2).addImm(CondCode);
@@ -3041,11 +3038,11 @@ bool X86FastISel::fastLowerIntrinsicCall(const IntrinsicInst *II) {
Op = IE->getOperand(0);
}
- unsigned Reg = getRegForValue(Op);
+ Register Reg = getRegForValue(Op);
if (Reg == 0)
return false;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg)
.addReg(Reg);
@@ -3139,11 +3136,11 @@ bool X86FastISel::fastLowerArguments() {
case MVT::f32: LLVM_FALLTHROUGH;
case MVT::f64: SrcReg = XMMArgRegs[FPRIdx++]; break;
}
- unsigned DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
+ Register DstReg = FuncInfo.MF->addLiveIn(SrcReg, RC);
// FIXME: Unfortunately it's necessary to emit a copy from the livein copy.
// Without this, EmitLiveInCopies may eliminate the livein if its only
// use is a bitcast (which isn't turned into an instruction).
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::COPY), ResultReg)
.addReg(DstReg, getKillRegState(true));
@@ -3154,7 +3151,7 @@ bool X86FastISel::fastLowerArguments() {
static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
CallingConv::ID CC,
- ImmutableCallSite *CS) {
+ const CallBase *CB) {
if (Subtarget->is64Bit())
return 0;
if (Subtarget->getTargetTriple().isOSMSVCRT())
@@ -3163,9 +3160,9 @@ static unsigned computeBytesPoppedByCalleeForSRet(const X86Subtarget *Subtarget,
CC == CallingConv::HiPE || CC == CallingConv::Tail)
return 0;
- if (CS)
- if (CS->arg_empty() || !CS->paramHasAttr(0, Attribute::StructRet) ||
- CS->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
+ if (CB)
+ if (CB->arg_empty() || !CB->paramHasAttr(0, Attribute::StructRet) ||
+ CB->paramHasAttr(0, Attribute::InReg) || Subtarget->isTargetMCU())
return 0;
return 4;
@@ -3186,14 +3183,12 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
bool Is64Bit = Subtarget->is64Bit();
bool IsWin64 = Subtarget->isCallingConvWin64(CC);
- const CallInst *CI =
- CLI.CS ? dyn_cast<CallInst>(CLI.CS->getInstruction()) : nullptr;
+ const CallInst *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *CalledFn = CI ? CI->getCalledFunction() : nullptr;
// Call / invoke instructions with NoCfCheck attribute require special
// handling.
- const auto *II =
- CLI.CS ? dyn_cast<InvokeInst>(CLI.CS->getInstruction()) : nullptr;
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
if ((CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck()))
return false;
@@ -3202,8 +3197,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
(CalledFn && CalledFn->hasFnAttribute("no_caller_saved_registers")))
return false;
- // Functions using retpoline for indirect calls need to use SDISel.
- if (Subtarget->useRetpolineIndirectCalls())
+ // Functions using thunks for indirect calls need to use SDISel.
+ if (Subtarget->useIndirectThunkCalls())
return false;
// Handle only C, fastcc, and webkit_js calling conventions for now.
@@ -3239,11 +3234,11 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
return false;
// Don't know about inalloca yet.
- if (CLI.CS && CLI.CS->hasInAllocaArgument())
+ if (CLI.CB && CLI.CB->hasInAllocaArgument())
return false;
for (auto Flag : CLI.OutFlags)
- if (Flag.isSwiftError())
+ if (Flag.isSwiftError() || Flag.isPreallocated())
return false;
SmallVector<MVT, 16> OutVTs;
@@ -3269,9 +3264,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
MVT VT;
auto *TI = dyn_cast<TruncInst>(Val);
unsigned ResultReg;
- if (TI && TI->getType()->isIntegerTy(1) && CLI.CS &&
- (TI->getParent() == CLI.CS->getInstruction()->getParent()) &&
- TI->hasOneUse()) {
+ if (TI && TI->getType()->isIntegerTy(1) && CLI.CB &&
+ (TI->getParent() == CLI.CB->getParent()) && TI->hasOneUse()) {
Value *PrevVal = TI->getOperand(0);
ResultReg = getRegForValue(PrevVal);
@@ -3284,7 +3278,8 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
ResultReg =
fastEmit_ri(VT, VT, ISD::AND, ResultReg, hasTrivialKill(PrevVal), 1);
} else {
- if (!isTypeLegal(Val->getType(), VT))
+ if (!isTypeLegal(Val->getType(), VT) ||
+ (VT.isVector() && VT.getVectorElementType() == MVT::i1))
return false;
ResultReg = getRegForValue(Val);
}
@@ -3302,7 +3297,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
// Allocate shadow area for Win64
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(OutVTs, OutFlags, CC_X86);
@@ -3406,7 +3401,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
TII.get(TargetOpcode::COPY), VA.getLocReg()).addReg(ArgReg);
OutRegs.push_back(VA.getLocReg());
} else {
- assert(VA.isMemLoc());
+ assert(VA.isMemLoc() && "Unknown value location!");
// Don't emit stores for undef values.
if (isa<UndefValue>(ArgVal))
@@ -3417,7 +3412,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
AM.Base.Reg = RegInfo->getStackRegister();
AM.Disp = LocMemOffset;
ISD::ArgFlagsTy Flags = OutFlags[VA.getValNo()];
- unsigned Alignment = DL.getABITypeAlignment(ArgVal->getType());
+ Align Alignment = DL.getABITypeAlign(ArgVal->getType());
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getStack(*FuncInfo.MF, LocMemOffset),
MachineMemOperand::MOStore, ArgVT.getStoreSize(), Alignment);
@@ -3537,7 +3532,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
X86::isCalleePop(CC, Subtarget->is64Bit(), IsVarArg,
TM.Options.GuaranteedTailCallOpt)
? NumBytes // Callee pops everything.
- : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CS);
+ : computeBytesPoppedByCalleeForSRet(Subtarget, CC, CLI.CB);
unsigned AdjStackUp = TII.getCallFrameDestroyOpcode();
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(AdjStackUp))
.addImm(NumBytes).addImm(NumBytesForCalleeToPop);
@@ -3549,7 +3544,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
CCRetInfo.AnalyzeCallResult(Ins, RetCC_X86);
// Copy all of the result registers out of their specified physreg.
- unsigned ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
+ Register ResultReg = FuncInfo.CreateRegs(CLI.RetTy);
for (unsigned i = 0; i != RVLocs.size(); ++i) {
CCValAssign &VA = RVLocs[i];
EVT CopyVT = VA.getValVT();
@@ -3582,7 +3577,7 @@ bool X86FastISel::fastLowerCall(CallLoweringInfo &CLI) {
EVT ResVT = VA.getValVT();
unsigned Opc = ResVT == MVT::f32 ? X86::ST_Fp80m32 : X86::ST_Fp80m64;
unsigned MemSize = ResVT.getSizeInBits()/8;
- int FI = MFI.CreateStackObject(MemSize, MemSize, false);
+ int FI = MFI.CreateStackObject(MemSize, Align(MemSize), false);
addFrameReference(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc)), FI)
.addReg(CopyReg);
@@ -3647,7 +3642,7 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
return X86SelectZExt(I);
if (DstVT.bitsLT(SrcVT))
return X86SelectTrunc(I);
- unsigned Reg = getRegForValue(I->getOperand(0));
+ Register Reg = getRegForValue(I->getOperand(0));
if (Reg == 0) return false;
updateValueMap(I, Reg);
return true;
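
Several hunks in this file also switch local "unsigned Reg" variables to Register. A hedged sketch of why that is a drop-in change (llvm/CodeGen/Register.h); the helper below is illustrative:

    #include "llvm/CodeGen/Register.h"
    using namespace llvm;

    bool isUsableVirtualReg(Register R) {
      // Register converts to and from unsigned implicitly, so "Reg == 0" and
      // "if (!Reg)" keep working, while adding convenient queries.
      if (!R)
        return false;
      return R.isVirtual();
    }
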
@@ -3668,13 +3663,18 @@ X86FastISel::fastSelectInstruction(const Instruction *I) {
DstVT.getVectorElementType() == MVT::i1)
return false;
- unsigned Reg = getRegForValue(I->getOperand(0));
- if (Reg == 0)
+ Register Reg = getRegForValue(I->getOperand(0));
+ if (!Reg)
return false;
- // No instruction is needed for conversion. Reuse the register used by
- // the fist operand.
- updateValueMap(I, Reg);
+ // Emit a reg-reg copy so we don't propagate cached known bits information
+ // with the wrong VT if we fall out of fast isel after selecting this.
+ const TargetRegisterClass *DstClass = TLI.getRegClassFor(DstVT);
+ Register ResultReg = createResultReg(DstClass);
+ BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
+ TII.get(TargetOpcode::COPY), ResultReg).addReg(Reg);
+
+ updateValueMap(I, ResultReg);
return true;
}
}
@@ -3688,7 +3688,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
uint64_t Imm = CI->getZExtValue();
if (Imm == 0) {
- unsigned SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
+ Register SrcReg = fastEmitInst_(X86::MOV32r0, &X86::GR32RegClass);
switch (VT.SimpleTy) {
default: llvm_unreachable("Unexpected value type");
case MVT::i1:
@@ -3701,7 +3701,7 @@ unsigned X86FastISel::X86MaterializeInt(const ConstantInt *CI, MVT VT) {
case MVT::i32:
return SrcReg;
case MVT::i64: {
- unsigned ResultReg = createResultReg(&X86::GR64RegClass);
+ Register ResultReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(TargetOpcode::SUBREG_TO_REG), ResultReg)
.addImm(0).addReg(SrcReg).addImm(X86::sub_32bit);
@@ -3769,11 +3769,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
}
// MachineConstantPool wants an explicit alignment.
- unsigned Align = DL.getPrefTypeAlignment(CFP->getType());
- if (Align == 0) {
- // Alignment of vector types. FIXME!
- Align = DL.getTypeAllocSize(CFP->getType());
- }
+ Align Alignment = DL.getPrefTypeAlign(CFP->getType());
// x86-32 PIC requires a PIC base register for constant pools.
unsigned PICBase = 0;
@@ -3786,11 +3782,12 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
PICBase = X86::RIP;
// Create the load from the constant pool.
- unsigned CPI = MCP.getConstantPoolIndex(CFP, Align);
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
+ unsigned CPI = MCP.getConstantPoolIndex(CFP, Alignment);
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT.SimpleTy));
- if (CM == CodeModel::Large) {
- unsigned AddrReg = createResultReg(&X86::GR64RegClass);
+ // Large code model only applies to 64-bit mode.
+ if (Subtarget->is64Bit() && CM == CodeModel::Large) {
+ Register AddrReg = createResultReg(&X86::GR64RegClass);
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(X86::MOV64ri),
AddrReg)
.addConstantPoolIndex(CPI, 0, OpFlag);
@@ -3799,7 +3796,7 @@ unsigned X86FastISel::X86MaterializeFP(const ConstantFP *CFP, MVT VT) {
addDirectMem(MIB, AddrReg);
MachineMemOperand *MMO = FuncInfo.MF->getMachineMemOperand(
MachinePointerInfo::getConstantPool(*FuncInfo.MF),
- MachineMemOperand::MOLoad, DL.getPointerSize(), Align);
+ MachineMemOperand::MOLoad, DL.getPointerSize(), Alignment);
MIB->addMemOperand(*FuncInfo.MF, MMO);
return ResultReg;
}
@@ -3824,7 +3821,7 @@ unsigned X86FastISel::X86MaterializeGV(const GlobalValue *GV, MVT VT) {
AM.IndexReg == 0 && AM.Disp == 0 && AM.GV == nullptr)
return AM.Base.Reg;
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
if (TM.getRelocationModel() == Reloc::Static &&
TLI.getPointerTy(DL) == MVT::i64) {
// The displacement code could be more than 32 bits away so we need to use
@@ -3883,7 +3880,7 @@ unsigned X86FastISel::fastMaterializeAlloca(const AllocaInst *C) {
? (Subtarget->isTarget64BitILP32() ? X86::LEA64_32r : X86::LEA32r)
: X86::LEA64r;
const TargetRegisterClass *RC = TLI.getRegClassFor(TLI.getPointerTy(DL));
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
addFullAddress(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc,
TII.get(Opc), ResultReg), AM);
return ResultReg;
@@ -3916,7 +3913,7 @@ unsigned X86FastISel::fastMaterializeFloatZero(const ConstantFP *CF) {
return 0;
}
- unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT));
+ Register ResultReg = createResultReg(TLI.getRegClassFor(VT));
BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DbgLoc, TII.get(Opc), ResultReg);
return ResultReg;
}
@@ -3932,16 +3929,12 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
const X86InstrInfo &XII = (const X86InstrInfo &)TII;
unsigned Size = DL.getTypeAllocSize(LI->getType());
- unsigned Alignment = LI->getAlignment();
-
- if (Alignment == 0) // Ensure that codegen never sees alignment 0
- Alignment = DL.getABITypeAlignment(LI->getType());
SmallVector<MachineOperand, 8> AddrOps;
AM.getFullAddress(AddrOps);
MachineInstr *Result = XII.foldMemoryOperandImpl(
- *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, Alignment,
+ *FuncInfo.MF, *MI, OpNo, AddrOps, FuncInfo.InsertPt, Size, LI->getAlign(),
/*AllowCommute=*/true);
if (!Result)
return false;
@@ -3958,7 +3951,7 @@ bool X86FastISel::tryToFoldLoadIntoMI(MachineInstr *MI, unsigned OpNo,
if (!MO.isReg() || MO.isDef() || MO.getReg() != AM.IndexReg)
continue;
// Found the index reg, now try to rewrite it.
- unsigned IndexReg = constrainOperandRegClass(Result->getDesc(),
+ Register IndexReg = constrainOperandRegClass(Result->getDesc(),
MO.getReg(), OperandNo);
if (IndexReg == MO.getReg())
continue;
@@ -3980,7 +3973,7 @@ unsigned X86FastISel::fastEmitInst_rrrr(unsigned MachineInstOpcode,
unsigned Op3, bool Op3IsKill) {
const MCInstrDesc &II = TII.get(MachineInstOpcode);
- unsigned ResultReg = createResultReg(RC);
+ Register ResultReg = createResultReg(RC);
Op0 = constrainOperandRegClass(II, Op0, II.getNumDefs());
Op1 = constrainOperandRegClass(II, Op1, II.getNumDefs() + 1);
Op2 = constrainOperandRegClass(II, Op2, II.getNumDefs() + 2);
diff --git a/llvm/lib/Target/X86/X86FixupBWInsts.cpp b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
index f8c4a2adb851..78de041329e2 100644
--- a/llvm/lib/Target/X86/X86FixupBWInsts.cpp
+++ b/llvm/lib/Target/X86/X86FixupBWInsts.cpp
@@ -350,7 +350,7 @@ MachineInstr *FixupBWInstPass::tryReplaceExtend(unsigned New32BitOpcode,
return nullptr;
// Don't interfere with formation of CBW instructions which should be a
- // shorter encoding than even the MOVSX32rr8. It's also immunte to partial
+ // shorter encoding than even the MOVSX32rr8. It's also immune to partial
// merge issues on Intel CPUs.
if (MI->getOpcode() == X86::MOVSX16rr8 &&
MI->getOperand(0).getReg() == X86::AX &&
diff --git a/llvm/lib/Target/X86/X86FixupLEAs.cpp b/llvm/lib/Target/X86/X86FixupLEAs.cpp
index 9ac401bb0253..424279038921 100644
--- a/llvm/lib/Target/X86/X86FixupLEAs.cpp
+++ b/llvm/lib/Target/X86/X86FixupLEAs.cpp
@@ -16,8 +16,11 @@
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/CodeGen/LazyMachineBlockFrequencyInfo.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineSizeOpts.h"
#include "llvm/CodeGen/Passes.h"
#include "llvm/CodeGen/TargetSchedule.h"
#include "llvm/Support/Debug.h"
@@ -111,6 +114,12 @@ public:
MachineFunctionProperties::Property::NoVRegs);
}
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<ProfileSummaryInfoWrapperPass>();
+ AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
+ }
+
private:
TargetSchedModel TSM;
const X86InstrInfo *TII = nullptr;
@@ -205,21 +214,27 @@ bool FixupLEAPass::runOnMachineFunction(MachineFunction &MF) {
TSM.init(&ST);
TII = ST.getInstrInfo();
TRI = ST.getRegisterInfo();
+ auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ auto *MBFI = (PSI && PSI->hasProfileSummary())
+ ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
+ : nullptr;
LLVM_DEBUG(dbgs() << "Start X86FixupLEAs\n";);
for (MachineBasicBlock &MBB : MF) {
// First pass. Try to remove or optimize existing LEAs.
+ bool OptIncDecPerBB =
+ OptIncDec || llvm::shouldOptimizeForSize(&MBB, PSI, MBFI);
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
if (!isLEA(I->getOpcode()))
continue;
- if (optTwoAddrLEA(I, MBB, OptIncDec, UseLEAForSP))
+ if (optTwoAddrLEA(I, MBB, OptIncDecPerBB, UseLEAForSP))
continue;
if (IsSlowLEA)
processInstructionForSlowLEA(I, MBB);
else if (IsSlow3OpsLEA)
- processInstrForSlow3OpLEA(I, MBB, OptIncDec);
+ processInstrForSlow3OpLEA(I, MBB, OptIncDecPerBB);
}
// Second pass for creating LEAs. This may reverse some of the
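
The FixupLEAs change above gates the INC/DEC rewrite per basic block on profile data: require ProfileSummaryInfoWrapperPass plus the lazy MBFI pass (which, as the name suggests, only computes block frequencies when getBFI() is called), then ask shouldOptimizeForSize() for each block. A condensed fragment of that recipe, assuming the analyses are declared exactly as in the getAnalysisUsage() hunk above:

    // Inside runOnMachineFunction(MF); mirrors the wiring shown above.
    auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
    auto *MBFI = PSI->hasProfileSummary()
                     ? &getAnalysis<LazyMachineBlockFrequencyInfoPass>().getBFI()
                     : nullptr;
    for (MachineBasicBlock &MBB : MF) {
      if (llvm::shouldOptimizeForSize(&MBB, PSI, MBFI)) {
        // Prefer the smaller encoding in blocks the profile says are cold.
      }
    }
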
diff --git a/llvm/lib/Target/X86/X86FixupSetCC.cpp b/llvm/lib/Target/X86/X86FixupSetCC.cpp
index 924f429fc138..09668d7c5468 100644
--- a/llvm/lib/Target/X86/X86FixupSetCC.cpp
+++ b/llvm/lib/Target/X86/X86FixupSetCC.cpp
@@ -36,6 +36,8 @@ STATISTIC(NumSubstZexts, "Number of setcc + zext pairs substituted");
namespace {
class X86FixupSetCCPass : public MachineFunctionPass {
public:
+ static char ID;
+
X86FixupSetCCPass() : MachineFunctionPass(ID) {}
StringRef getPassName() const override { return "X86 Fixup SetCC"; }
@@ -47,12 +49,12 @@ private:
const X86InstrInfo *TII = nullptr;
enum { SearchBound = 16 };
-
- static char ID;
};
+} // end anonymous namespace
char X86FixupSetCCPass::ID = 0;
-}
+
+INITIALIZE_PASS(X86FixupSetCCPass, DEBUG_TYPE, DEBUG_TYPE, false, false)
FunctionPass *llvm::createX86FixupSetCC() { return new X86FixupSetCCPass(); }
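
Moving "static char ID" into the public section is what allows the new INITIALIZE_PASS line to reference X86FixupSetCCPass::ID from outside the class. A minimal, hedged skeleton of that registration pattern (the pass name and strings below are made up):

    #include "llvm/CodeGen/MachineFunctionPass.h"
    using namespace llvm;

    #define DEBUG_TYPE "example-mf-pass"

    namespace {
    class ExampleMFPass : public MachineFunctionPass {
    public:
      static char ID;                  // public so INITIALIZE_PASS can take &ID
      ExampleMFPass() : MachineFunctionPass(ID) {}
      bool runOnMachineFunction(MachineFunction &MF) override { return false; }
    };
    } // end anonymous namespace

    char ExampleMFPass::ID = 0;

    INITIALIZE_PASS(ExampleMFPass, DEBUG_TYPE, "Example machine pass", false, false)
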
diff --git a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
index b1d2de29c896..831695dabcd8 100644
--- a/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
+++ b/llvm/lib/Target/X86/X86FlagsCopyLowering.cpp
@@ -124,10 +124,6 @@ private:
MachineInstr &JmpI, CondRegArray &CondRegs);
void rewriteCopy(MachineInstr &MI, MachineOperand &FlagUse,
MachineInstr &CopyDefI);
- void rewriteSetCarryExtended(MachineBasicBlock &TestMBB,
- MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, MachineInstr &SetBI,
- MachineOperand &FlagUse, CondRegArray &CondRegs);
void rewriteSetCC(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos, DebugLoc TestLoc,
MachineInstr &SetCCI, MachineOperand &FlagUse,
@@ -165,6 +161,7 @@ enum class FlagArithMnemonic {
RCL,
RCR,
SBB,
+ SETB,
};
} // namespace
@@ -235,6 +232,10 @@ static FlagArithMnemonic getMnemonicFromOpcode(unsigned Opcode) {
case X86::ADOX32rm:
case X86::ADOX64rm:
return FlagArithMnemonic::ADOX;
+
+ case X86::SETB_C32r:
+ case X86::SETB_C64r:
+ return FlagArithMnemonic::SETB;
}
}
@@ -638,24 +639,9 @@ bool X86FlagsCopyLoweringPass::runOnMachineFunction(MachineFunction &MF) {
// logic.
FlagsKilled = true;
- switch (MI.getOpcode()) {
- case X86::SETB_C8r:
- case X86::SETB_C16r:
- case X86::SETB_C32r:
- case X86::SETB_C64r:
- // Use custom lowering for arithmetic that is merely extending the
- // carry flag. We model this as the SETB_C* pseudo instructions.
- rewriteSetCarryExtended(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
- CondRegs);
- break;
-
- default:
- // Generically handle remaining uses as arithmetic instructions.
- rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
- CondRegs);
- break;
- }
- break;
+ // Generically handle remaining uses as arithmetic instructions.
+ rewriteArithmetic(*TestMBB, TestPos, TestLoc, MI, *FlagUse,
+ CondRegs);
}
// If this was the last use of the flags, we're done.
@@ -821,6 +807,7 @@ void X86FlagsCopyLoweringPass::rewriteArithmetic(
case FlagArithMnemonic::RCL:
case FlagArithMnemonic::RCR:
case FlagArithMnemonic::SBB:
+ case FlagArithMnemonic::SETB:
Cond = X86::COND_B; // CF == 1
// Set up an addend that when one is added will need a carry due to not
// having a higher bit available.
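
For context on the SETB case added above: SETB_C32r/SETB_C64r are the sign-extended form of SETB, i.e. pseudos that materialize 0 or all-ones from CF (typically lowered to "sbb reg, reg"), as the deleted comment below also notes. Classifying them with SBB as ordinary CF-consuming arithmetic is what lets the dedicated rewriteSetCarryExtended() path be removed. A one-line conceptual model, not the emitted code:

    // Conceptual effect of SETB_C32r: broadcast the carry flag into every bit.
    uint32_t setbC32(bool CarryFlag) { return CarryFlag ? 0xFFFFFFFFu : 0u; }
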
@@ -959,130 +946,6 @@ void X86FlagsCopyLoweringPass::rewriteCopy(MachineInstr &MI,
MI.eraseFromParent();
}
-void X86FlagsCopyLoweringPass::rewriteSetCarryExtended(
- MachineBasicBlock &TestMBB, MachineBasicBlock::iterator TestPos,
- DebugLoc TestLoc, MachineInstr &SetBI, MachineOperand &FlagUse,
- CondRegArray &CondRegs) {
- // This routine is only used to handle pseudos for setting a register to zero
- // or all ones based on CF. This is essentially the sign extended from 1-bit
- // form of SETB and modeled with the SETB_C* pseudos. They require special
- // handling as they aren't normal SETcc instructions and are lowered to an
- // EFLAGS clobbering operation (SBB typically). One simplifying aspect is that
- // they are only provided in reg-defining forms. A complicating factor is that
- // they can define many different register widths.
- assert(SetBI.getOperand(0).isReg() &&
- "Cannot have a non-register defined operand to this variant of SETB!");
-
- // Little helper to do the common final step of replacing the register def'ed
- // by this SETB instruction with a new register and removing the SETB
- // instruction.
- auto RewriteToReg = [&](unsigned Reg) {
- MRI->replaceRegWith(SetBI.getOperand(0).getReg(), Reg);
- SetBI.eraseFromParent();
- };
-
- // Grab the register class used for this particular instruction.
- auto &SetBRC = *MRI->getRegClass(SetBI.getOperand(0).getReg());
-
- MachineBasicBlock &MBB = *SetBI.getParent();
- auto SetPos = SetBI.getIterator();
- auto SetLoc = SetBI.getDebugLoc();
-
- auto AdjustReg = [&](unsigned Reg) {
- auto &OrigRC = *MRI->getRegClass(Reg);
- if (&OrigRC == &SetBRC)
- return Reg;
-
- unsigned NewReg;
-
- int OrigRegSize = TRI->getRegSizeInBits(OrigRC) / 8;
- int TargetRegSize = TRI->getRegSizeInBits(SetBRC) / 8;
- assert(OrigRegSize <= 8 && "No GPRs larger than 64-bits!");
- assert(TargetRegSize <= 8 && "No GPRs larger than 64-bits!");
- int SubRegIdx[] = {X86::NoSubRegister, X86::sub_8bit, X86::sub_16bit,
- X86::NoSubRegister, X86::sub_32bit};
-
- // If the original size is smaller than the target *and* is smaller than 4
- // bytes, we need to explicitly zero extend it. We always extend to 4-bytes
- // to maximize the chance of being able to CSE that operation and to avoid
- // partial dependency stalls extending to 2-bytes.
- if (OrigRegSize < TargetRegSize && OrigRegSize < 4) {
- NewReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOVZX32rr8), NewReg)
- .addReg(Reg);
- if (&SetBRC == &X86::GR32RegClass)
- return NewReg;
- Reg = NewReg;
- OrigRegSize = 4;
- }
-
- NewReg = MRI->createVirtualRegister(&SetBRC);
- if (OrigRegSize < TargetRegSize) {
- BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::SUBREG_TO_REG),
- NewReg)
- .addImm(0)
- .addReg(Reg)
- .addImm(SubRegIdx[OrigRegSize]);
- } else if (OrigRegSize > TargetRegSize) {
- if (TargetRegSize == 1 && !Subtarget->is64Bit()) {
- // Need to constrain the register class.
- MRI->constrainRegClass(Reg, &X86::GR32_ABCDRegClass);
- }
-
- BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY),
- NewReg)
- .addReg(Reg, 0, SubRegIdx[TargetRegSize]);
- } else {
- BuildMI(MBB, SetPos, SetLoc, TII->get(TargetOpcode::COPY), NewReg)
- .addReg(Reg);
- }
- return NewReg;
- };
-
- unsigned &CondReg = CondRegs[X86::COND_B];
- if (!CondReg)
- CondReg = promoteCondToReg(TestMBB, TestPos, TestLoc, X86::COND_B);
-
- // Adjust the condition to have the desired register width by zero-extending
- // as needed.
- // FIXME: We should use a better API to avoid the local reference and using a
- // different variable here.
- unsigned ExtCondReg = AdjustReg(CondReg);
-
- // Now we need to turn this into a bitmask. We do this by subtracting it from
- // zero.
- Register ZeroReg = MRI->createVirtualRegister(&X86::GR32RegClass);
- BuildMI(MBB, SetPos, SetLoc, TII->get(X86::MOV32r0), ZeroReg);
- ZeroReg = AdjustReg(ZeroReg);
-
- unsigned Sub;
- switch (SetBI.getOpcode()) {
- case X86::SETB_C8r:
- Sub = X86::SUB8rr;
- break;
-
- case X86::SETB_C16r:
- Sub = X86::SUB16rr;
- break;
-
- case X86::SETB_C32r:
- Sub = X86::SUB32rr;
- break;
-
- case X86::SETB_C64r:
- Sub = X86::SUB64rr;
- break;
-
- default:
- llvm_unreachable("Invalid SETB_C* opcode!");
- }
- Register ResultReg = MRI->createVirtualRegister(&SetBRC);
- BuildMI(MBB, SetPos, SetLoc, TII->get(Sub), ResultReg)
- .addReg(ZeroReg)
- .addReg(ExtCondReg);
- return RewriteToReg(ResultReg);
-}
-
void X86FlagsCopyLoweringPass::rewriteSetCC(MachineBasicBlock &TestMBB,
MachineBasicBlock::iterator TestPos,
DebugLoc TestLoc,
diff --git a/llvm/lib/Target/X86/X86FloatingPoint.cpp b/llvm/lib/Target/X86/X86FloatingPoint.cpp
index 13bbd6ccfce4..e6ee46957500 100644
--- a/llvm/lib/Target/X86/X86FloatingPoint.cpp
+++ b/llvm/lib/Target/X86/X86FloatingPoint.cpp
@@ -1364,6 +1364,9 @@ void FPS::handleTwoArgFP(MachineBasicBlock::iterator &I) {
MBB->remove(&*I++);
I = BuildMI(*MBB, I, dl, TII->get(Opcode)).addReg(getSTReg(NotTOS));
+ if (!MI.mayRaiseFPException())
+ I->setFlag(MachineInstr::MIFlag::NoFPExcept);
+
// If both operands are killed, pop one off of the stack in addition to
// overwriting the other one.
if (KillsOp0 && KillsOp1 && Op0 != Op1) {
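
The FP stackifier rebuilds the two-argument x87 instruction here, and MI flags are not carried over automatically by BuildMI, so the strict-FP property has to be re-applied by hand as the hunk does. A hedged fragment of the flag API involved (OrigMI/NewMI are assumed names):

    // NewMI was just created with BuildMI(); OrigMI is the instruction being
    // replaced. MIFlags are plain per-instruction bits:
    if (!OrigMI.mayRaiseFPException())
      NewMI->setFlag(MachineInstr::MIFlag::NoFPExcept);
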
diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp
index 799c1f5d1285..c7ca6fb2a4fc 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.cpp
+++ b/llvm/lib/Target/X86/X86FrameLowering.cpp
@@ -17,6 +17,7 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -32,6 +33,12 @@
#include "llvm/Target/TargetOptions.h"
#include <cstdlib>
+#define DEBUG_TYPE "x86-fl"
+
+STATISTIC(NumFrameLoopProbe, "Number of loop stack probes used in prologue");
+STATISTIC(NumFrameExtraProbe,
+ "Number of extra stack probes generated in prologue");
+
using namespace llvm;
X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
@@ -50,7 +57,8 @@ X86FrameLowering::X86FrameLowering(const X86Subtarget &STI,
bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
return !MF.getFrameInfo().hasVarSizedObjects() &&
- !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences();
+ !MF.getInfo<X86MachineFunctionInfo>()->getHasPushSequences() &&
+ !MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall();
}
/// canSimplifyCallFramePseudos - If there is a reserved call frame, the
@@ -60,6 +68,7 @@ bool X86FrameLowering::hasReservedCallFrame(const MachineFunction &MF) const {
bool
X86FrameLowering::canSimplifyCallFramePseudos(const MachineFunction &MF) const {
return hasReservedCallFrame(MF) ||
+ MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
(hasFP(MF) && !TRI->needsStackRealignment(MF)) ||
TRI->hasBasePointer(MF);
}
@@ -83,10 +92,10 @@ X86FrameLowering::needsFrameIndexResolution(const MachineFunction &MF) const {
bool X86FrameLowering::hasFP(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
return (MF.getTarget().Options.DisableFramePointerElim(MF) ||
- TRI->needsStackRealignment(MF) ||
- MFI.hasVarSizedObjects() ||
+ TRI->needsStackRealignment(MF) || MFI.hasVarSizedObjects() ||
MFI.isFrameAddressTaken() || MFI.hasOpaqueSPAdjustment() ||
MF.getInfo<X86MachineFunctionInfo>()->getForceFramePointer() ||
+ MF.getInfo<X86MachineFunctionInfo>()->hasPreallocatedCall() ||
MF.callsUnwindInit() || MF.hasEHFunclets() || MF.callsEHReturn() ||
MFI.hasStackMap() || MFI.hasPatchPoint() ||
MFI.hasCopyImplyingStackAdjustment());
@@ -257,7 +266,20 @@ void X86FrameLowering::emitSPUpdate(MachineBasicBlock &MBB,
uint64_t Chunk = (1LL << 31) - 1;
- if (Offset > Chunk) {
+ MachineFunction &MF = *MBB.getParent();
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const bool EmitInlineStackProbe = TLI.hasInlineStackProbe(MF);
+
+ // It's ok not to take large chunks into account when probing, as the
+ // allocation is split into smaller chunks anyway.
+ if (EmitInlineStackProbe && !InEpilogue) {
+
+ // This pseudo-instruction is going to be expanded, potentially using a
+ // loop, by inlineStackProbe().
+ BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING)).addImm(Offset);
+ return;
+ } else if (Offset > Chunk) {
// Rather than emit a long series of instructions for large offsets,
// load the offset into a register and do one sub/add
unsigned Reg = 0;
@@ -381,8 +403,8 @@ MachineInstrBuilder X86FrameLowering::BuildStackAdjustment(
} else {
bool IsSub = Offset < 0;
uint64_t AbsOffset = IsSub ? -Offset : Offset;
- unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
- : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
+ const unsigned Opc = IsSub ? getSUBriOpcode(Uses64BitFramePtr, AbsOffset)
+ : getADDriOpcode(Uses64BitFramePtr, AbsOffset);
MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
.addReg(StackPtr)
.addImm(AbsOffset);
@@ -457,9 +479,32 @@ void X86FrameLowering::BuildCFI(MachineBasicBlock &MBB,
.addCFIIndex(CFIIndex);
}
+/// Emits DWARF info specifying offsets of callee-saved registers and
+/// frame pointer. This is called only when basic block sections are enabled.
+void X86FrameLowering::emitCalleeSavedFrameMoves(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) const {
+ MachineFunction &MF = *MBB.getParent();
+ if (!hasFP(MF)) {
+ emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
+ return;
+ }
+ const MachineModuleInfo &MMI = MF.getMMI();
+ const MCRegisterInfo *MRI = MMI.getContext().getRegisterInfo();
+ const unsigned FramePtr = TRI->getFrameRegister(MF);
+ const unsigned MachineFramePtr =
+ STI.isTarget64BitILP32() ? unsigned(getX86SubSuperRegister(FramePtr, 64))
+ : FramePtr;
+ unsigned DwarfReg = MRI->getDwarfRegNum(MachineFramePtr, true);
+ // Offset = space for return address + size of the frame pointer itself.
+ unsigned Offset = (Is64Bit ? 8 : 4) + (Uses64BitFramePtr ? 8 : 4);
+ BuildCFI(MBB, MBBI, DebugLoc{},
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, -Offset));
+ emitCalleeSavedFrameMoves(MBB, MBBI, DebugLoc{}, true);
+}
+
void X86FrameLowering::emitCalleeSavedFrameMoves(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL) const {
+ const DebugLoc &DL, bool IsPrologue) const {
MachineFunction &MF = *MBB.getParent();
MachineFrameInfo &MFI = MF.getFrameInfo();
MachineModuleInfo &MMI = MF.getMMI();
@@ -474,10 +519,15 @@ void X86FrameLowering::emitCalleeSavedFrameMoves(
I = CSI.begin(), E = CSI.end(); I != E; ++I) {
int64_t Offset = MFI.getObjectOffset(I->getFrameIdx());
unsigned Reg = I->getReg();
-
unsigned DwarfReg = MRI->getDwarfRegNum(Reg, true);
- BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+
+ if (IsPrologue) {
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createOffset(nullptr, DwarfReg, Offset));
+ } else {
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::createRestore(nullptr, DwarfReg));
+ }
}
}
@@ -488,7 +538,8 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF,
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
if (STI.isTargetWindowsCoreCLR()) {
if (InProlog) {
- emitStackProbeInlineStub(MF, MBB, MBBI, DL, true);
+ BuildMI(MBB, MBBI, DL, TII.get(X86::STACKALLOC_W_PROBING))
+ .addImm(0 /* no explicit stack size */);
} else {
emitStackProbeInline(MF, MBB, MBBI, DL, false);
}
@@ -499,26 +550,13 @@ void X86FrameLowering::emitStackProbe(MachineFunction &MF,
void X86FrameLowering::inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const {
- const StringRef ChkStkStubSymbol = "__chkstk_stub";
- MachineInstr *ChkStkStub = nullptr;
-
- for (MachineInstr &MI : PrologMBB) {
- if (MI.isCall() && MI.getOperand(0).isSymbol() &&
- ChkStkStubSymbol == MI.getOperand(0).getSymbolName()) {
- ChkStkStub = &MI;
- break;
- }
- }
-
- if (ChkStkStub != nullptr) {
- assert(!ChkStkStub->isBundled() &&
- "Not expecting bundled instructions here");
- MachineBasicBlock::iterator MBBI = std::next(ChkStkStub->getIterator());
- assert(std::prev(MBBI) == ChkStkStub &&
- "MBBI expected after __chkstk_stub.");
- DebugLoc DL = PrologMBB.findDebugLoc(MBBI);
- emitStackProbeInline(MF, PrologMBB, MBBI, DL, true);
- ChkStkStub->eraseFromParent();
+ auto Where = llvm::find_if(PrologMBB, [](MachineInstr &MI) {
+ return MI.getOpcode() == X86::STACKALLOC_W_PROBING;
+ });
+ if (Where != PrologMBB.end()) {
+ DebugLoc DL = PrologMBB.findDebugLoc(Where);
+ emitStackProbeInline(MF, PrologMBB, Where, DL, true);
+ Where->eraseFromParent();
}
}
@@ -528,6 +566,167 @@ void X86FrameLowering::emitStackProbeInline(MachineFunction &MF,
const DebugLoc &DL,
bool InProlog) const {
const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ if (STI.isTargetWindowsCoreCLR() && STI.is64Bit())
+ emitStackProbeInlineWindowsCoreCLR64(MF, MBB, MBBI, DL, InProlog);
+ else
+ emitStackProbeInlineGeneric(MF, MBB, MBBI, DL, InProlog);
+}
+
+void X86FrameLowering::emitStackProbeInlineGeneric(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+ MachineInstr &AllocWithProbe = *MBBI;
+ uint64_t Offset = AllocWithProbe.getOperand(0).getImm();
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ assert(!(STI.is64Bit() && STI.isTargetWindowsCoreCLR()) &&
+ "different expansion expected for CoreCLR 64 bit");
+
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+ uint64_t ProbeChunk = StackProbeSize * 8;
+
+ // Synthesize a loop or unroll it, depending on the number of iterations.
+ if (Offset > ProbeChunk) {
+ emitStackProbeInlineGenericLoop(MF, MBB, MBBI, DL, Offset);
+ } else {
+ emitStackProbeInlineGenericBlock(MF, MBB, MBBI, DL, Offset);
+ }
+}
+
+void X86FrameLowering::emitStackProbeInlineGenericBlock(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ uint64_t Offset) const {
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+ uint64_t CurrentOffset = 0;
+ // Starts at 0 thanks to the return address already being saved on the stack.
+ uint64_t CurrentProbeOffset = 0;
+
+ // For the first N - 1 pages, just probe. I tried to take advantage of
+ // natural probes, but it implies much more logic and there were very few
+ // interesting natural probes to interleave.
+ while (CurrentOffset + StackProbeSize < Offset) {
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+
+ addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+ NumFrameExtraProbe++;
+ CurrentOffset += StackProbeSize;
+ CurrentProbeOffset += StackProbeSize;
+ }
+
+ uint64_t ChunkSize = Offset - CurrentOffset;
+ MachineInstr *MI = BuildMI(MBB, MBBI, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(ChunkSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ MI->getOperand(3).setIsDead(); // The EFLAGS implicit def is dead.
+}
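
Conceptually, the unrolled block above expands to the following (a plain C++ sketch of the emitted SUB/MOV sequence, not code from the patch; SP stands for the stack pointer, PageSize for the probe size):

    void probeUnrolled(char *&SP, uint64_t Offset, uint64_t PageSize) {
      uint64_t Done = 0;
      while (Done + PageSize < Offset) { // first N - 1 pages: allocate and touch
        SP -= PageSize;
        *reinterpret_cast<volatile char *>(SP) = 0;
        Done += PageSize;
      }
      SP -= Offset - Done;               // final chunk, no trailing probe needed
    }
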
+
+void X86FrameLowering::emitStackProbeInlineGenericLoop(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL,
+ uint64_t Offset) const {
+ assert(Offset && "null offset");
+
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
+ const X86TargetLowering &TLI = *STI.getTargetLowering();
+ const unsigned MovMIOpc = Is64Bit ? X86::MOV64mi32 : X86::MOV32mi;
+ const uint64_t StackProbeSize = TLI.getStackProbeSize(MF);
+
+ // Synthesize a loop
+ NumFrameLoopProbe++;
+ const BasicBlock *LLVM_BB = MBB.getBasicBlock();
+
+ MachineBasicBlock *testMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF.CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB.getIterator();
+ MF.insert(MBBIter, testMBB);
+ MF.insert(MBBIter, tailMBB);
+
+ Register FinalStackProbed = Uses64BitFramePtr ? X86::R11 : X86::R11D;
+ BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::COPY), FinalStackProbed)
+ .addReg(StackPtr)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // save loop bound
+ {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, Offset);
+ BuildMI(MBB, MBBI, DL, TII.get(Opc), FinalStackProbed)
+ .addReg(FinalStackProbed)
+ .addImm(Offset / StackProbeSize * StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // allocate a page
+ {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, StackProbeSize);
+ BuildMI(testMBB, DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(StackProbeSize)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // touch the page
+ addRegOffset(BuildMI(testMBB, DL, TII.get(MovMIOpc))
+ .setMIFlag(MachineInstr::FrameSetup),
+ StackPtr, false, 0)
+ .addImm(0)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // cmp with stack pointer bound
+ BuildMI(testMBB, DL, TII.get(Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(StackPtr)
+ .addReg(FinalStackProbed)
+ .setMIFlag(MachineInstr::FrameSetup);
+
+ // jump
+ BuildMI(testMBB, DL, TII.get(X86::JCC_1))
+ .addMBB(testMBB)
+ .addImm(X86::COND_NE)
+ .setMIFlag(MachineInstr::FrameSetup);
+ testMBB->addSuccessor(testMBB);
+ testMBB->addSuccessor(tailMBB);
+
+ // BB management
+ tailMBB->splice(tailMBB->end(), &MBB, MBBI, MBB.end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(&MBB);
+ MBB.addSuccessor(testMBB);
+
+ // handle tail
+ unsigned TailOffset = Offset % StackProbeSize;
+ if (TailOffset) {
+ const unsigned Opc = getSUBriOpcode(Uses64BitFramePtr, TailOffset);
+ BuildMI(*tailMBB, tailMBB->begin(), DL, TII.get(Opc), StackPtr)
+ .addReg(StackPtr)
+ .addImm(TailOffset)
+ .setMIFlag(MachineInstr::FrameSetup);
+ }
+
+ // Update Live In information
+ recomputeLiveIns(*testMBB);
+ recomputeLiveIns(*tailMBB);
+}
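
The loop variant above is, in the same spirit, roughly the following (sketch; it assumes Offset is at least one probe page, which the ProbeChunk check guarantees):

    void probeWithLoop(char *&SP, uint64_t Offset, uint64_t PageSize) {
      char *Bound = SP - (Offset / PageSize) * PageSize; // bound kept in R11 above
      do {                                               // testMBB
        SP -= PageSize;                                  // allocate a page
        *reinterpret_cast<volatile char *>(SP) = 0;      // touch the page
      } while (SP != Bound);                             // CMP + JNE
      SP -= Offset % PageSize;                           // tail handled in tailMBB
    }
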
+
+void X86FrameLowering::emitStackProbeInlineWindowsCoreCLR64(
+ MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
+ const X86Subtarget &STI = MF.getSubtarget<X86Subtarget>();
assert(STI.is64Bit() && "different expansion needed for 32 bit");
assert(STI.isTargetWindowsCoreCLR() && "custom expansion expects CoreCLR");
const TargetInstrInfo &TII = *STI.getInstrInfo();
@@ -765,10 +964,10 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
bool InProlog) const {
bool IsLargeCodeModel = MF.getTarget().getCodeModel() == CodeModel::Large;
- // FIXME: Add retpoline support and remove this.
- if (Is64Bit && IsLargeCodeModel && STI.useRetpolineIndirectCalls())
+ // FIXME: Add indirect thunk support and remove this.
+ if (Is64Bit && IsLargeCodeModel && STI.useIndirectThunkCalls())
report_fatal_error("Emitting stack probe calls on 64-bit with the large "
- "code model and retpoline not yet implemented.");
+ "code model and indirect thunks not yet implemented.");
unsigned CallOp;
if (Is64Bit)
@@ -821,16 +1020,6 @@ void X86FrameLowering::emitStackProbeCall(MachineFunction &MF,
}
}
-void X86FrameLowering::emitStackProbeInlineStub(
- MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI, const DebugLoc &DL, bool InProlog) const {
-
- assert(InProlog && "ChkStkStub called outside prolog!");
-
- BuildMI(MBB, MBBI, DL, TII.get(X86::CALLpcrel32))
- .addExternalSymbol("__chkstk_stub");
-}
-
static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// Win64 ABI has a less restrictive limitation of 240; 128 works equally well
// and might require smaller successive adjustments.
@@ -846,15 +1035,15 @@ static unsigned calculateSetFPREG(uint64_t SPAdjust) {
// go with the minimum SlotSize.
uint64_t X86FrameLowering::calculateMaxStackAlign(const MachineFunction &MF) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
- uint64_t MaxAlign = MFI.getMaxAlignment(); // Desired stack alignment.
- unsigned StackAlign = getStackAlignment();
+ Align MaxAlign = MFI.getMaxAlign(); // Desired stack alignment.
+ Align StackAlign = getStackAlign();
if (MF.getFunction().hasFnAttribute("stackrealign")) {
if (MFI.hasCalls())
MaxAlign = (StackAlign > MaxAlign) ? StackAlign : MaxAlign;
else if (MaxAlign < SlotSize)
- MaxAlign = SlotSize;
+ MaxAlign = Align(SlotSize);
}
- return MaxAlign;
+ return MaxAlign.value();
}
void X86FrameLowering::BuildStackAlignAND(MachineBasicBlock &MBB,
@@ -1014,7 +1203,8 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
X86FI->setCalleeSavedFrameSize(
X86FI->getCalleeSavedFrameSize() - TailCallReturnAddrDelta);
- bool UseStackProbe = !STI.getTargetLowering()->getStackProbeSymbolName(MF).empty();
+ const bool EmitStackProbeCall =
+ STI.getTargetLowering()->hasStackProbeSymbol(MF);
unsigned StackProbeSize = STI.getTargetLowering()->getStackProbeSize(MF);
// Re-align the stack on 64-bit if the x86-interrupt calling convention is
@@ -1032,11 +1222,10 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// pointer, calls, or dynamic alloca then we do not need to adjust the
// stack pointer (we fit in the Red Zone). We also check that we don't
// push and pop from the stack.
- if (has128ByteRedZone(MF) &&
- !TRI->needsStackRealignment(MF) &&
+ if (has128ByteRedZone(MF) && !TRI->needsStackRealignment(MF) &&
!MFI.hasVarSizedObjects() && // No dynamic alloca.
!MFI.adjustsStack() && // No calls.
- !UseStackProbe && // No stack probes.
+ !EmitStackProbeCall && // No stack probes.
!MFI.hasCopyImplyingStackAdjustment() && // Don't push and pop.
!MF.shouldSplitStack()) { // Regular stack
uint64_t MinSize = X86FI->getCalleeSavedFrameSize();
@@ -1115,7 +1304,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaOffset(nullptr, 2 * stackGrowth));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -2 * stackGrowth));
// Change the rule for the FramePtr to be an "offset" rule.
unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
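
The createDefCfaOffset-to-cfiDefCfaOffset renames in this file also flip the sign of the argument: the old helpers stored a negated offset, whereas the cfi-prefixed replacements take the offset exactly as it will appear in the emitted .cfi_def_cfa_offset directive (stackGrowth is -SlotSize on x86, so -2 * stackGrowth is the expected positive 16 on x86-64). For reference, a hedged fragment of what BuildCFI essentially wraps; MF, MBB, MBBI, DL and TII are assumed to be in scope:

    // Attach ".cfi_def_cfa_offset 16" at the current insertion point.
    unsigned CFIIndex =
        MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, 16));
    BuildMI(MBB, MBBI, DL, TII.get(TargetOpcode::CFI_INSTRUCTION))
        .addCFIIndex(CFIIndex);
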
@@ -1192,7 +1381,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// Define the current CFA rule to use the provided offset.
assert(StackSize);
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaOffset(nullptr, StackOffset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -StackOffset));
StackOffset += stackGrowth;
}
@@ -1237,7 +1426,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
uint64_t AlignedNumBytes = NumBytes;
if (IsWin64Prologue && !IsFunclet && TRI->needsStackRealignment(MF))
AlignedNumBytes = alignTo(AlignedNumBytes, MaxAlign);
- if (AlignedNumBytes >= StackProbeSize && UseStackProbe) {
+ if (AlignedNumBytes >= StackProbeSize && EmitStackProbeCall) {
assert(!X86FI->getUsesRedZone() &&
"The Red Zone is not accounted for in stack probes");
@@ -1323,17 +1512,17 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64rm), Establisher),
Establisher, false, PSPSlotOffset)
.addMemOperand(MF.getMachineMemOperand(
- NoInfo, MachineMemOperand::MOLoad, SlotSize, SlotSize));
+ NoInfo, MachineMemOperand::MOLoad, SlotSize, Align(SlotSize)));
;
// Save the root establisher back into the current funclet's (mostly
// empty) frame, in case a sub-funclet or the GC needs it.
addRegOffset(BuildMI(MBB, MBBI, DL, TII.get(X86::MOV64mr)), StackPtr,
false, PSPSlotOffset)
.addReg(Establisher)
- .addMemOperand(
- MF.getMachineMemOperand(NoInfo, MachineMemOperand::MOStore |
- MachineMemOperand::MOVolatile,
- SlotSize, SlotSize));
+ .addMemOperand(MF.getMachineMemOperand(
+ NoInfo,
+ MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
+ SlotSize, Align(SlotSize)));
}
SPOrEstablisher = Establisher;
} else {
@@ -1370,7 +1559,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// into the registration node so that the runtime will restore it for us.
if (!MBB.isCleanupFuncletEntry()) {
assert(Personality == EHPersonality::MSVC_CXX);
- unsigned FrameReg;
+ Register FrameReg;
int FI = MF.getWinEHFuncInfo()->EHRegNodeFrameIndex;
int64_t EHRegOffset = getFrameIndexReference(MF, FI, FrameReg);
// ESP is the first field, so no extra displacement is needed.
@@ -1389,7 +1578,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (unsigned Reg = TII.isStoreToStackSlot(FrameInstr, FI)) {
if (X86::FR64RegClass.contains(Reg)) {
int Offset;
- unsigned IgnoredFrameReg;
+ Register IgnoredFrameReg;
if (IsWin64Prologue && IsFunclet)
Offset = getWin64EHFrameIndexRef(MF, FI, IgnoredFrameReg);
else
@@ -1423,7 +1612,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
.addReg(StackPtr)
.addMemOperand(MF.getMachineMemOperand(
PSPInfo, MachineMemOperand::MOStore | MachineMemOperand::MOVolatile,
- SlotSize, SlotSize));
+ SlotSize, Align(SlotSize)));
}
// Realign stack after we spilled callee-saved registers (so that we'll be
@@ -1464,7 +1653,7 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
// it recovers the frame pointer from the base pointer rather than the
// other way around.
unsigned Opm = Uses64BitFramePtr ? X86::MOV64mr : X86::MOV32mr;
- unsigned UsedReg;
+ Register UsedReg;
int Offset =
getFrameIndexReference(MF, X86FI->getSEHFramePtrSaveIndex(), UsedReg);
assert(UsedReg == BasePtr);
@@ -1479,12 +1668,13 @@ void X86FrameLowering::emitPrologue(MachineFunction &MF,
if (!HasFP && NumBytes) {
// Define the current CFA rule to use the provided offset.
assert(StackSize);
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
- nullptr, -StackSize + stackGrowth));
+ BuildCFI(
+ MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, StackSize - stackGrowth));
}
// Emit DWARF info specifying the offsets of the callee-saved registers.
- emitCalleeSavedFrameMoves(MBB, MBBI, DL);
+ emitCalleeSavedFrameMoves(MBB, MBBI, DL, true);
}
// X86 Interrupt handling function cannot assume anything about the direction
@@ -1541,7 +1731,7 @@ static bool isFuncletReturnInstr(MachineInstr &MI) {
unsigned
X86FrameLowering::getPSPSlotOffsetFromSP(const MachineFunction &MF) const {
const WinEHFuncInfo &Info = *MF.getWinEHFuncInfo();
- unsigned SPReg;
+ Register SPReg;
int Offset = getFrameIndexReferencePreferSP(MF, Info.PSPSymFrameIdx, SPReg,
/*IgnoreSPUpdates*/ true);
assert(Offset >= 0 && SPReg == TRI->getStackRegister());
@@ -1573,7 +1763,7 @@ X86FrameLowering::getWinEHFuncletFrameSize(const MachineFunction &MF) const {
// RBP is not included in the callee saved register block. After pushing RBP,
// everything is 16 byte aligned. Everything we allocate before an outgoing
// call must also be 16 byte aligned.
- unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlignment());
+ unsigned FrameSizeMinusRBP = alignTo(CSSize + UsedSize, getStackAlign());
// Subtract out the size of the callee saved registers. This is how much stack
// each funclet will allocate.
return FrameSizeMinusRBP + XMMSize - CSSize;
@@ -1634,6 +1824,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
uint64_t SEHStackAllocAmt = NumBytes;
+ // AfterPop is the position to insert .cfi_restore.
+ MachineBasicBlock::iterator AfterPop = MBBI;
if (HasFP) {
// Pop EBP.
BuildMI(MBB, MBBI, DL, TII.get(Is64Bit ? X86::POP64r : X86::POP32r),
@@ -1642,8 +1834,15 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (NeedsDwarfCFI) {
unsigned DwarfStackPtr =
TRI->getDwarfRegNum(Is64Bit ? X86::RSP : X86::ESP, true);
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfa(
- nullptr, DwarfStackPtr, -SlotSize));
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfa(nullptr, DwarfStackPtr, SlotSize));
+ if (!MBB.succ_empty() && !MBB.isReturnBlock()) {
+ unsigned DwarfFramePtr = TRI->getDwarfRegNum(MachineFramePtr, true);
+ BuildCFI(MBB, AfterPop, DL,
+ MCCFIInstruction::createRestore(nullptr, DwarfFramePtr));
+ --MBBI;
+ --AfterPop;
+ }
--MBBI;
}
}
@@ -1711,8 +1910,8 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
emitSPUpdate(MBB, MBBI, DL, NumBytes, /*InEpilogue=*/true);
if (!hasFP(MF) && NeedsDwarfCFI) {
// Define the current CFA rule to use the provided offset.
- BuildCFI(MBB, MBBI, DL, MCCFIInstruction::createDefCfaOffset(
- nullptr, -CSSize - SlotSize));
+ BuildCFI(MBB, MBBI, DL,
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, CSSize + SlotSize));
}
--MBBI;
}
@@ -1738,11 +1937,18 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
if (Opc == X86::POP32r || Opc == X86::POP64r) {
Offset += SlotSize;
BuildCFI(MBB, MBBI, DL,
- MCCFIInstruction::createDefCfaOffset(nullptr, Offset));
+ MCCFIInstruction::cfiDefCfaOffset(nullptr, -Offset));
}
}
}
+ // Emit DWARF info specifying the restores of the callee-saved registers.
+ // If the epilogue block contains a return, or is any other block without
+ // successors, there is no need to generate .cfi_restore for callee-saved registers.
+ if (NeedsDwarfCFI && !MBB.succ_empty() && !MBB.isReturnBlock()) {
+ emitCalleeSavedFrameMoves(MBB, AfterPop, DL, false);
+ }
+
if (Terminator == MBB.end() || !isTailCallOpcode(Terminator->getOpcode())) {
// Add the return addr area delta back since we are not tail calling.
int Offset = -1 * X86FI->getTCReturnAddrDelta();
@@ -1756,7 +1962,7 @@ void X86FrameLowering::emitEpilogue(MachineFunction &MF,
}
int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const {
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
bool IsFixed = MFI.isFixedObjectIndex(FI);
@@ -1821,7 +2027,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
- assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
return Offset + StackSize;
}
} else if (TRI->needsStackRealignment(MF)) {
@@ -1829,7 +2035,7 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
// Skip the saved EBP.
return Offset + SlotSize + FPDelta;
} else {
- assert((-(Offset + StackSize)) % MFI.getObjectAlignment(FI) == 0);
+ assert(isAligned(MFI.getObjectAlign(FI), -(Offset + StackSize)));
return Offset + StackSize;
}
// FIXME: Support tail calls
@@ -1849,8 +2055,8 @@ int X86FrameLowering::getFrameIndexReference(const MachineFunction &MF, int FI,
return Offset + FPDelta;
}
-int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
- int FI, unsigned &FrameReg) const {
+int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
+ Register &FrameReg) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
const auto& WinEHXMMSlotInfo = X86FI->getWinEHXMMSlotInfo();
@@ -1860,21 +2066,21 @@ int X86FrameLowering::getWin64EHFrameIndexRef(const MachineFunction &MF,
return getFrameIndexReference(MF, FI, FrameReg);
FrameReg = TRI->getStackRegister();
- return alignDown(MFI.getMaxCallFrameSize(), getStackAlignment()) + it->second;
+ return alignDown(MFI.getMaxCallFrameSize(), getStackAlign().value()) +
+ it->second;
}
int X86FrameLowering::getFrameIndexReferenceSP(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
+ int FI, Register &FrameReg,
int Adjustment) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
FrameReg = TRI->getStackRegister();
return MFI.getObjectOffset(FI) - getOffsetOfLocalArea() + Adjustment;
}
-int
-X86FrameLowering::getFrameIndexReferencePreferSP(const MachineFunction &MF,
- int FI, unsigned &FrameReg,
- bool IgnoreSPUpdates) const {
+int X86FrameLowering::getFrameIndexReferencePreferSP(
+ const MachineFunction &MF, int FI, Register &FrameReg,
+ bool IgnoreSPUpdates) const {
const MachineFrameInfo &MFI = MF.getFrameInfo();
// Does not include any dynamic realign.
@@ -1985,7 +2191,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
if (this->TRI->hasBasePointer(MF)) {
// Allocate a spill slot for EBP if we have a base pointer and EH funclets.
if (MF.hasEHFunclets()) {
- int FI = MFI.CreateSpillStackObject(SlotSize, SlotSize);
+ int FI = MFI.CreateSpillStackObject(SlotSize, Align(SlotSize));
X86FI->setHasSEHFramePtrSave(true);
X86FI->setSEHFramePtrSaveIndex(FI);
}
@@ -2038,16 +2244,16 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg, VT);
unsigned Size = TRI->getSpillSize(*RC);
- unsigned Align = TRI->getSpillAlignment(*RC);
+ Align Alignment = TRI->getSpillAlign(*RC);
// ensure alignment
assert(SpillSlotOffset < 0 && "SpillSlotOffset should always < 0 on X86");
- SpillSlotOffset = -alignTo(-SpillSlotOffset, Align);
+ SpillSlotOffset = -alignTo(-SpillSlotOffset, Alignment);
// spill into slot
SpillSlotOffset -= Size;
int SlotIndex = MFI.CreateFixedSpillStackObject(Size, SpillSlotOffset);
CSI[i - 1].setFrameIdx(SlotIndex);
- MFI.ensureMaxAlignment(Align);
+ MFI.ensureMaxAlignment(Alignment);
// Save the start offset and size of XMM in stack frame for funclets.
if (X86::VR128RegClass.contains(Reg)) {
@@ -2061,8 +2267,7 @@ bool X86FrameLowering::assignCalleeSavedSpillSlots(
bool X86FrameLowering::spillCalleeSavedRegisters(
MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
DebugLoc DL = MBB.findDebugLoc(MI);
// Don't save CSRs in 32-bit EH funclets. The caller saves EBX, EBP, ESI, EDI
@@ -2161,10 +2366,9 @@ void X86FrameLowering::emitCatchRetReturnValue(MachineBasicBlock &MBB,
CatchRetTarget->setHasAddressTaken();
}
-bool X86FrameLowering::restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool X86FrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return false;
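
spillCalleeSavedRegisters() and restoreCalleeSavedRegisters() now take ArrayRef/MutableArrayRef instead of std::vector references; callers need no changes because both bind to a vector without copying. A small self-contained sketch with illustrative names:

    #include "llvm/ADT/ArrayRef.h"
    #include <vector>
    using namespace llvm;

    static int sumAll(ArrayRef<int> Values) {   // read-only, non-owning view
      int Total = 0;
      for (int V : Values)
        Total += V;
      return Total;
    }

    static void clearFirst(MutableArrayRef<int> Values) {
      if (!Values.empty())
        Values[0] = 0;                          // writes through to the vector
    }

    void demo() {
      std::vector<int> V = {1, 2, 3};
      (void)sumAll(V);  // implicit conversion from std::vector
      clearFirst(V);    // MutableArrayRef binds to the vector's storage too
    }
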
@@ -2493,9 +2697,9 @@ void X86FrameLowering::adjustForSegmentedStacks(
// is laid out within 2^31 bytes of each function body, but this seems
// to be sufficient for JIT.
// FIXME: Add retpoline support and remove the error here..
- if (STI.useRetpolineIndirectCalls())
+ if (STI.useIndirectThunkCalls())
report_fatal_error("Emitting morestack calls on 64-bit with the large "
- "code model and retpoline not yet implemented.");
+ "code model and thunks not yet implemented.");
BuildMI(allocMBB, DL, TII.get(X86::CALL64m))
.addReg(X86::RIP)
.addImm(0)
@@ -2799,6 +3003,12 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
I = MBB.erase(I);
auto InsertPos = skipDebugInstructionsForward(I, MBB.end());
+ // Try to avoid emitting dead SP adjustments if the block end is unreachable,
+ // typically because the function is marked noreturn (abort, throw,
+ // assert_fail, etc).
+ if (isDestroy && blockEndIsUnreachable(MBB, I))
+ return I;
+
if (!reserveCallFrame) {
// If the stack pointer can be changed after prologue, turn the
// adjcallstackup instruction into a 'sub ESP, <amt>' and the
@@ -2807,8 +3017,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned StackAlign = getStackAlignment();
- Amount = alignTo(Amount, StackAlign);
+ Amount = alignTo(Amount, getStackAlign());
const Function &F = MF.getFunction();
bool WindowsCFI = MF.getTarget().getMCAsmInfo()->usesWindowsCFI();
@@ -2881,13 +3090,7 @@ eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
return I;
}
- if (isDestroy && InternalAmt && !blockEndIsUnreachable(MBB, I)) {
- // If we are performing frame pointer elimination and if the callee pops
- // something off the stack pointer, add it back. We do this until we have
- // more advanced stack pointer tracking ability.
- // We are not tracking the stack pointer adjustment by the callee, so make
- // sure we restore the stack pointer immediately after the call, there may
- // be spill code inserted between the CALL and ADJCALLSTACKUP instructions.
+ if (InternalAmt) {
MachineBasicBlock::iterator CI = I;
MachineBasicBlock::iterator B = MBB.begin();
while (CI != B && !std::prev(CI)->isCall())
@@ -2964,7 +3167,7 @@ MachineBasicBlock::iterator X86FrameLowering::restoreWin32EHStackPointers(
.setMIFlag(MachineInstr::FrameSetup);
}
- unsigned UsedReg;
+ Register UsedReg;
int EHRegOffset = getFrameIndexReference(MF, FI, UsedReg);
int EndOffset = -EHRegOffset - EHRegSize;
FuncInfo.EHRegNodeEndOffset = EndOffset;
@@ -3003,8 +3206,8 @@ int X86FrameLowering::getInitialCFAOffset(const MachineFunction &MF) const {
return TRI->getSlotSize();
}
-unsigned X86FrameLowering::getInitialCFARegister(const MachineFunction &MF)
- const {
+Register
+X86FrameLowering::getInitialCFARegister(const MachineFunction &MF) const {
return TRI->getDwarfRegNum(StackPtr, true);
}
@@ -3014,7 +3217,7 @@ struct X86FrameSortingObject {
bool IsValid = false; // true if we care about this Object.
unsigned ObjectIndex = 0; // Index of Object into MFI list.
unsigned ObjectSize = 0; // Size of Object in bytes.
- unsigned ObjectAlignment = 1; // Alignment of Object in bytes.
+ Align ObjectAlignment = Align(1); // Alignment of Object in bytes.
unsigned ObjectNumUses = 0; // Object static number of uses.
};
@@ -3099,7 +3302,7 @@ void X86FrameLowering::orderFrameObjects(
for (auto &Obj : ObjectsToAllocate) {
SortingObjects[Obj].IsValid = true;
SortingObjects[Obj].ObjectIndex = Obj;
- SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlignment(Obj);
+ SortingObjects[Obj].ObjectAlignment = MFI.getObjectAlign(Obj);
// Set the size.
int ObjectSize = MFI.getObjectSize(Obj);
if (ObjectSize == 0)
@@ -3192,7 +3395,7 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
int FrameIndex = H.CatchObj.FrameIndex;
if (FrameIndex != INT_MAX) {
// Ensure alignment.
- unsigned Align = MFI.getObjectAlignment(FrameIndex);
+ unsigned Align = MFI.getObjectAlign(FrameIndex).value();
MinFixedObjOffset -= std::abs(MinFixedObjOffset) % Align;
MinFixedObjOffset -= MFI.getObjectSize(FrameIndex);
MFI.setObjectOffset(FrameIndex, MinFixedObjOffset);
@@ -3219,3 +3422,24 @@ void X86FrameLowering::processFunctionBeforeFrameFinalized(
UnwindHelpFI)
.addImm(-2);
}
+
+void X86FrameLowering::processFunctionBeforeFrameIndicesReplaced(
+ MachineFunction &MF, RegScavenger *RS) const {
+ if (STI.is32Bit() && MF.hasEHFunclets())
+ restoreWinEHStackPointersInParent(MF);
+}
+
+void X86FrameLowering::restoreWinEHStackPointersInParent(
+ MachineFunction &MF) const {
+ // 32-bit functions have to restore stack pointers when control is transferred
+ // back to the parent function. These blocks are identified as eh pads that
+ // are not funclet entries.
+ bool IsSEH = isAsynchronousEHPersonality(
+ classifyEHPersonality(MF.getFunction().getPersonalityFn()));
+ for (MachineBasicBlock &MBB : MF) {
+ bool NeedsRestore = MBB.isEHPad() && !MBB.isEHFuncletEntry();
+ if (NeedsRestore)
+ restoreWin32EHStackPointers(MBB, MBB.begin(), DebugLoc(),
+ /*RestoreSP=*/IsSEH);
+ }
+}
diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h
index 2103d6471ead..c0b4be95f88d 100644
--- a/llvm/lib/Target/X86/X86FrameLowering.h
+++ b/llvm/lib/Target/X86/X86FrameLowering.h
@@ -58,9 +58,14 @@ public:
void inlineStackProbe(MachineFunction &MF,
MachineBasicBlock &PrologMBB) const override;
+ void
+ emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI) const override;
+
void emitCalleeSavedFrameMoves(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL) const;
+ const DebugLoc &DL,
+ bool IsPrologue) const override;
/// emitProlog/emitEpilog - These methods insert prolog and epilog code into
/// the function.
@@ -83,13 +88,14 @@ public:
bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
+ ArrayRef<CalleeSavedInfo> CSI,
const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
bool hasFP(const MachineFunction &MF) const override;
bool hasReservedCallFrame(const MachineFunction &MF) const override;
@@ -97,14 +103,14 @@ public:
bool needsFrameIndexResolution(const MachineFunction &MF) const override;
int getFrameIndexReference(const MachineFunction &MF, int FI,
- unsigned &FrameReg) const override;
+ Register &FrameReg) const override;
- int getWin64EHFrameIndexRef(const MachineFunction &MF,
- int FI, unsigned &SPReg) const;
- int getFrameIndexReferenceSP(const MachineFunction &MF,
- int FI, unsigned &SPReg, int Adjustment) const;
+ int getWin64EHFrameIndexRef(const MachineFunction &MF, int FI,
+ Register &SPReg) const;
+ int getFrameIndexReferenceSP(const MachineFunction &MF, int FI,
+ Register &SPReg, int Adjustment) const;
int getFrameIndexReferencePreferSP(const MachineFunction &MF, int FI,
- unsigned &FrameReg,
+ Register &FrameReg,
bool IgnoreSPUpdates) const override;
MachineBasicBlock::iterator
@@ -116,6 +122,10 @@ public:
void processFunctionBeforeFrameFinalized(MachineFunction &MF,
RegScavenger *RS) const override;
+ void
+ processFunctionBeforeFrameIndicesReplaced(MachineFunction &MF,
+ RegScavenger *RS) const override;
+
/// Check the instruction before/after the passed instruction. If
/// it is an ADD/SUB/LEA instruction it is deleted argument and the
/// stack adjustment is returned as a positive value for ADD/LEA and
@@ -169,12 +179,14 @@ public:
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool RestoreSP = false) const;
+ void restoreWinEHStackPointersInParent(MachineFunction &MF) const;
+
int getInitialCFAOffset(const MachineFunction &MF) const override;
- unsigned getInitialCFARegister(const MachineFunction &MF) const override;
+ Register getInitialCFARegister(const MachineFunction &MF) const override;
/// Return true if the function has a redzone (accessible bytes past the
- /// frame of the top of stack function) as part of it's ABI.
+ /// frame of the top of stack function) as part of its ABI.
bool has128ByteRedZone(const MachineFunction& MF) const;
private:
@@ -189,11 +201,33 @@ private:
void emitStackProbeInline(MachineFunction &MF, MachineBasicBlock &MBB,
MachineBasicBlock::iterator MBBI,
const DebugLoc &DL, bool InProlog) const;
+ void emitStackProbeInlineWindowsCoreCLR64(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const;
+ void emitStackProbeInlineGeneric(MachineFunction &MF, MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL, bool InProlog) const;
+
+ void emitStackProbeInlineGenericBlock(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ uint64_t Offset) const;
+
+ void emitStackProbeInlineGenericLoop(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ uint64_t Offset) const;
/// Emit a stub to later inline the target stack probe.
- void emitStackProbeInlineStub(MachineFunction &MF, MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MBBI,
- const DebugLoc &DL, bool InProlog) const;
+ MachineInstr *emitStackProbeInlineStub(MachineFunction &MF,
+ MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MBBI,
+ const DebugLoc &DL,
+ bool InProlog) const;
/// Aligns the stack pointer by ANDing it with -MaxAlign.
void BuildStackAlignAND(MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
index bf33f399db28..3cd80cb04ab8 100644
--- a/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -17,8 +17,6 @@
#include "X86Subtarget.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/CodeGen/MachineFrameInfo.h"
-#include "llvm/CodeGen/MachineFunction.h"
#include "llvm/CodeGen/SelectionDAGISel.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/ConstantRange.h"
@@ -31,9 +29,6 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
-#include "llvm/Support/raw_ostream.h"
-#include "llvm/Target/TargetMachine.h"
-#include "llvm/Target/TargetOptions.h"
#include <stdint.h>
using namespace llvm;
@@ -45,6 +40,10 @@ static cl::opt<bool> AndImmShrink("x86-and-imm-shrink", cl::init(true),
cl::desc("Enable setting constant bits to reduce size of mask immediates"),
cl::Hidden);
+static cl::opt<bool> EnablePromoteAnyextLoad(
+ "x86-promote-anyext-load", cl::init(true),
+ cl::desc("Enable promoting aligned anyext load to wider load"), cl::Hidden);
+
//===----------------------------------------------------------------------===//
// Pattern Matcher Implementation
//===----------------------------------------------------------------------===//
@@ -72,14 +71,14 @@ namespace {
const char *ES;
MCSymbol *MCSym;
int JT;
- unsigned Align; // CP alignment.
+ Align Alignment; // CP alignment.
unsigned char SymbolFlags; // X86II::MO_*
bool NegateIndex = false;
X86ISelAddressMode()
: BaseType(RegBase), Base_FrameIndex(0), Scale(1), IndexReg(), Disp(0),
Segment(), GV(nullptr), CP(nullptr), BlockAddr(nullptr), ES(nullptr),
- MCSym(nullptr), JT(-1), Align(0), SymbolFlags(X86II::MO_NO_FLAG) {}
+ MCSym(nullptr), JT(-1), SymbolFlags(X86II::MO_NO_FLAG) {}
bool hasSymbolicDisplacement() const {
return GV != nullptr || CP != nullptr || ES != nullptr ||
@@ -145,7 +144,7 @@ namespace {
dbgs() << MCSym;
else
dbgs() << "nul";
- dbgs() << " JT" << JT << " Align" << Align << '\n';
+ dbgs() << " JT" << JT << " Align" << Alignment.value() << '\n';
}
#endif
};
@@ -161,10 +160,6 @@ namespace {
/// make the right decision when generating code for different targets.
const X86Subtarget *Subtarget;
- /// If true, selector should try to optimize for code size instead of
- /// performance.
- bool OptForSize;
-
/// If true, selector should try to optimize for minimum code size.
bool OptForMinSize;
@@ -173,7 +168,7 @@ namespace {
public:
explicit X86DAGToDAGISel(X86TargetMachine &tm, CodeGenOpt::Level OptLevel)
- : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr), OptForSize(false),
+ : SelectionDAGISel(tm, OptLevel), Subtarget(nullptr),
OptForMinSize(false), IndirectTlsSegRefs(false) {}
StringRef getPassName() const override {
@@ -187,16 +182,15 @@ namespace {
"indirect-tls-seg-refs");
// OptFor[Min]Size are used in pattern predicates that isel is matching.
- OptForSize = MF.getFunction().hasOptSize();
OptForMinSize = MF.getFunction().hasMinSize();
- assert((!OptForMinSize || OptForSize) &&
+ assert((!OptForMinSize || MF.getFunction().hasOptSize()) &&
"OptForMinSize implies OptForSize");
SelectionDAGISel::runOnMachineFunction(MF);
return true;
}
- void EmitFunctionEntryCode() override;
+ void emitFunctionEntryCode() override;
bool IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const override;
@@ -221,9 +215,9 @@ namespace {
bool selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index, SDValue &Disp,
- SDValue &Segment);
+ bool selectVectorAddr(MemSDNode *Parent, SDValue BasePtr, SDValue IndexOp,
+ SDValue ScaleOp, SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp, SDValue &Segment);
bool selectMOV64Imm32(SDValue N, SDValue &Imm);
bool selectLEAAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
@@ -234,11 +228,6 @@ namespace {
bool selectTLSADDRAddr(SDValue N, SDValue &Base,
SDValue &Scale, SDValue &Index, SDValue &Disp,
SDValue &Segment);
- bool selectScalarSSELoad(SDNode *Root, SDNode *Parent, SDValue N,
- SDValue &Base, SDValue &Scale,
- SDValue &Index, SDValue &Disp,
- SDValue &Segment,
- SDValue &NodeWithChain);
bool selectRelocImm(SDValue N, SDValue &Op);
bool tryFoldLoad(SDNode *Root, SDNode *P, SDValue N,
@@ -259,6 +248,8 @@ namespace {
SDValue &Index, SDValue &Disp,
SDValue &Segment);
+ bool isProfitableToFormMaskedOp(SDNode *N) const;
+
/// Implement addressing mode selection for inline asm expressions.
bool SelectInlineAsmMemoryOperand(const SDValue &Op,
unsigned ConstraintID,
@@ -300,8 +291,8 @@ namespace {
MVT::i32, AM.Disp,
AM.SymbolFlags);
else if (AM.CP)
- Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32,
- AM.Align, AM.Disp, AM.SymbolFlags);
+ Disp = CurDAG->getTargetConstantPool(AM.CP, MVT::i32, AM.Alignment,
+ AM.Disp, AM.SymbolFlags);
else if (AM.ES) {
assert(!AM.Disp && "Non-zero displacement is ignored with ES.");
Disp = CurDAG->getTargetExternalSymbol(AM.ES, MVT::i32, AM.SymbolFlags);
@@ -368,9 +359,10 @@ namespace {
if (User->getNumOperands() != 2)
continue;
- // If this can match to INC/DEC, don't count it as a use.
- if (User->getOpcode() == ISD::ADD &&
- (isOneConstant(SDValue(N, 0)) || isAllOnesConstant(SDValue(N, 0))))
+ // If this is a sign-extended 8-bit integer immediate used in an ALU
+ // instruction, there is probably an opcode encoding to save space.
+ auto *C = dyn_cast<ConstantSDNode>(N);
+ if (C && isInt<8>(C->getSExtValue()))
continue;
// Immediates that are used for offsets as part of stack
@@ -475,14 +467,6 @@ namespace {
bool isSExtAbsoluteSymbolRef(unsigned Width, SDNode *N) const;
- /// Returns whether this is a relocatable immediate in the range
- /// [-2^Width .. 2^Width-1].
- template <unsigned Width> bool isSExtRelocImm(SDNode *N) const {
- if (auto *CN = dyn_cast<ConstantSDNode>(N))
- return isInt<Width>(CN->getSExtValue());
- return isSExtAbsoluteSymbolRef(Width, N);
- }
-
// Indicates we should prefer to use a non-temporal load for this load.
bool useNonTemporalLoad(LoadSDNode *N) const {
if (!N->isNonTemporal())
@@ -513,8 +497,8 @@ namespace {
bool shrinkAndImmediate(SDNode *N);
bool isMaskZeroExtended(SDNode *N) const;
bool tryShiftAmountMod(SDNode *N);
- bool combineIncDecVector(SDNode *Node);
bool tryShrinkShlLogicImm(SDNode *N);
+ bool tryVPTERNLOG(SDNode *N);
bool tryVPTESTM(SDNode *Root, SDValue Setcc, SDValue Mask);
bool tryMatchBitSelect(SDNode *N);
@@ -581,12 +565,6 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if (!N.hasOneUse())
return false;
- // FIXME: Temporary hack to prevent strict floating point nodes from
- // folding into masked operations illegally.
- if (U == Root && Root->getOpcode() == ISD::VSELECT &&
- N.getOpcode() != ISD::LOAD && N.getOpcode() != X86ISD::VBROADCAST_LOAD)
- return false;
-
if (N.getOpcode() != ISD::LOAD)
return true;
@@ -650,6 +628,11 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
if ((U->getOpcode() == ISD::ADD || U->getOpcode() == ISD::SUB) &&
(-Imm->getAPIntValue()).isSignedIntN(8))
return false;
+
+ if ((U->getOpcode() == X86ISD::ADD || U->getOpcode() == X86ISD::SUB) &&
+ (-Imm->getAPIntValue()).isSignedIntN(8) &&
+ hasNoCarryFlagUses(SDValue(U, 1)))
+ return false;
}
// If the other operand is a TLS address, we should fold it instead.
@@ -724,6 +707,20 @@ X86DAGToDAGISel::IsProfitableToFold(SDValue N, SDNode *U, SDNode *Root) const {
return true;
}
+// Indicates it is profitable to form an AVX512 masked operation. Returning
+// false will favor a masked register-register move or a vblendm, and the
+// operation will be selected separately.
+bool X86DAGToDAGISel::isProfitableToFormMaskedOp(SDNode *N) const {
+ assert(
+ (N->getOpcode() == ISD::VSELECT || N->getOpcode() == X86ISD::SELECTS) &&
+ "Unexpected opcode!");
+
+ // If the operation has additional users, the operation will be duplicated.
+ // Check the use count to prevent that.
+ // FIXME: Are there cheap opcodes we might want to duplicate?
+ return N->getOperand(1).hasOneUse();
+}
+
/// Replace the original chain operand of the call with
/// load's chain operand and move load below the call's chain operand.
static void moveBelowOrigChain(SelectionDAG *CurDAG, SDValue Load,
@@ -799,6 +796,7 @@ static bool isCalleeLoad(SDValue Callee, SDValue &Chain, bool HasCallSeq) {
}
void X86DAGToDAGISel::PreprocessISelDAG() {
+ bool MadeChange = false;
for (SelectionDAG::allnodes_iterator I = CurDAG->allnodes_begin(),
E = CurDAG->allnodes_end(); I != E; ) {
SDNode *N = &*I++; // Preincrement iterator to avoid invalidation issues.
@@ -811,11 +809,111 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
+ /// Convert vector increment or decrement to sub/add with an all-ones
+ /// constant:
+ /// add X, <1, 1...> --> sub X, <-1, -1...>
+ /// sub X, <1, 1...> --> add X, <-1, -1...>
+ /// The all-ones vector constant can be materialized using a pcmpeq
+ /// instruction that is commonly recognized as an idiom (has no register
+ /// dependency), so that's better/smaller than loading a splat 1 constant.
+ if ((N->getOpcode() == ISD::ADD || N->getOpcode() == ISD::SUB) &&
+ N->getSimpleValueType(0).isVector()) {
+
+ APInt SplatVal;
+ if (X86::isConstantSplat(N->getOperand(1), SplatVal) &&
+ SplatVal.isOneValue()) {
+ SDLoc DL(N);
+
+ MVT VT = N->getSimpleValueType(0);
+ unsigned NumElts = VT.getSizeInBits() / 32;
+ SDValue AllOnes =
+ CurDAG->getAllOnesConstant(DL, MVT::getVectorVT(MVT::i32, NumElts));
+ AllOnes = CurDAG->getBitcast(VT, AllOnes);
+
+ unsigned NewOpcode = N->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
+ SDValue Res =
+ CurDAG->getNode(NewOpcode, DL, VT, N->getOperand(0), AllOnes);
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+ }
+
switch (N->getOpcode()) {
+ case X86ISD::VBROADCAST: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ SDLoc dl(N);
+ SDValue NarrowBCast =
+ CurDAG->getNode(X86ISD::VBROADCAST, dl, NarrowVT, N->getOperand(0));
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Res.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ break;
+ }
+ case X86ISD::VBROADCAST_LOAD: {
+ MVT VT = N->getSimpleValueType(0);
+ // Emulate v32i16/v64i8 broadcast without BWI.
+ if (!Subtarget->hasBWI() && (VT == MVT::v32i16 || VT == MVT::v64i8)) {
+ MVT NarrowVT = VT == MVT::v32i16 ? MVT::v16i16 : MVT::v32i8;
+ auto *MemNode = cast<MemSDNode>(N);
+ SDLoc dl(N);
+ SDVTList VTs = CurDAG->getVTList(NarrowVT, MVT::Other);
+ SDValue Ops[] = {MemNode->getChain(), MemNode->getBasePtr()};
+ SDValue NarrowBCast = CurDAG->getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, VTs, Ops, MemNode->getMemoryVT(),
+ MemNode->getMemOperand());
+ SDValue Res =
+ CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, CurDAG->getUNDEF(VT),
+ NarrowBCast, CurDAG->getIntPtrConstant(0, dl));
+ unsigned Index = VT == MVT::v32i16 ? 16 : 32;
+ Res = CurDAG->getNode(ISD::INSERT_SUBVECTOR, dl, VT, Res, NarrowBCast,
+ CurDAG->getIntPtrConstant(Index, dl));
+
+ --I;
+ SDValue To[] = {Res, NarrowBCast.getValue(1)};
+ CurDAG->ReplaceAllUsesWith(N, To);
+ ++I;
+ MadeChange = true;
+ continue;
+ }
+
+ break;
+ }
+ case ISD::VSELECT: {
+ // Replace VSELECT with non-mask conditions with BLENDV.
+ if (N->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
+ break;
+
+ assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
+ SDValue Blendv =
+ CurDAG->getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
+ N->getOperand(0), N->getOperand(1), N->getOperand(2));
+ --I;
+ CurDAG->ReplaceAllUsesWith(N, Blendv.getNode());
+ ++I;
+ MadeChange = true;
+ continue;
+ }
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND:
case ISD::FP_TO_SINT:
@@ -849,7 +947,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case ISD::SHL:
@@ -872,27 +970,33 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case ISD::ANY_EXTEND:
case ISD::ANY_EXTEND_VECTOR_INREG: {
// Replace vector any extend with the zero extend equivalents so we don't
// need 2 sets of patterns. Ignore vXi1 extensions.
- if (!N->getValueType(0).isVector() ||
- N->getOperand(0).getScalarValueSizeInBits() == 1)
+ if (!N->getValueType(0).isVector())
break;
- unsigned NewOpc = N->getOpcode() == ISD::ANY_EXTEND
- ? ISD::ZERO_EXTEND
- : ISD::ZERO_EXTEND_VECTOR_INREG;
+ unsigned NewOpc;
+ if (N->getOperand(0).getScalarValueSizeInBits() == 1) {
+ assert(N->getOpcode() == ISD::ANY_EXTEND &&
+ "Unexpected opcode for mask vector!");
+ NewOpc = ISD::SIGN_EXTEND;
+ } else {
+ NewOpc = N->getOpcode() == ISD::ANY_EXTEND
+ ? ISD::ZERO_EXTEND
+ : ISD::ZERO_EXTEND_VECTOR_INREG;
+ }
SDValue Res = CurDAG->getNode(NewOpc, SDLoc(N), N->getValueType(0),
N->getOperand(0));
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case ISD::FCEIL:
@@ -936,7 +1040,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesWith(N, Res.getNode());
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
case X86ISD::FANDN:
@@ -979,7 +1083,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
--I;
CurDAG->ReplaceAllUsesOfValueWith(SDValue(N, 0), Res);
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
continue;
}
}
@@ -987,7 +1091,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
if (OptLevel != CodeGenOpt::None &&
// Only do this when the target can fold the load into the call or
// jmp.
- !Subtarget->useRetpolineIndirectCalls() &&
+ !Subtarget->useIndirectThunkCalls() &&
((N->getOpcode() == X86ISD::CALL && !Subtarget->slowTwoMemOps()) ||
(N->getOpcode() == X86ISD::TC_RETURN &&
(Subtarget->is64Bit() ||
@@ -1018,6 +1122,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
continue;
moveBelowOrigChain(CurDAG, Load, SDValue(N, 0), Chain);
++NumLoadMoved;
+ MadeChange = true;
continue;
}
@@ -1064,14 +1169,17 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// operations. Based on this, decide what we want to do.
MVT MemVT = (N->getOpcode() == ISD::FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
- SDValue Store = CurDAG->getTruncStore(CurDAG->getEntryNode(), dl, N->getOperand(0),
- MemTmp, MachinePointerInfo(), MemVT);
- SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store, MemTmp,
- MachinePointerInfo(), MemVT);
+ SDValue Store = CurDAG->getTruncStore(
+ CurDAG->getEntryNode(), dl, N->getOperand(0), MemTmp, MPI, MemVT);
+ SDValue Result = CurDAG->getExtLoad(ISD::EXTLOAD, dl, DstVT, Store,
+ MemTmp, MPI, MemVT);
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
// extload we created. This will cause general havok on the dag because
@@ -1117,6 +1225,9 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// operations. Based on this, decide what we want to do.
MVT MemVT = (N->getOpcode() == ISD::STRICT_FP_ROUND) ? DstVT : SrcVT;
SDValue MemTmp = CurDAG->CreateStackTemporary(MemVT);
+ int SPFI = cast<FrameIndexSDNode>(MemTmp)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(CurDAG->getMachineFunction(), SPFI);
SDLoc dl(N);
// FIXME: optimize the case where the src/dest is a load or store?
@@ -1127,7 +1238,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
SDVTList VTs = CurDAG->getVTList(MVT::Other);
SDValue Ops[] = {N->getOperand(0), N->getOperand(1), MemTmp};
Store = CurDAG->getMemIntrinsicNode(X86ISD::FST, dl, VTs, Ops, MemVT,
- MachinePointerInfo(), 0,
+ MPI, /*Align*/ None,
MachineMemOperand::MOStore);
if (N->getFlags().hasNoFPExcept()) {
SDNodeFlags Flags = Store->getFlags();
@@ -1137,15 +1248,15 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
} else {
assert(SrcVT == MemVT && "Unexpected VT!");
Store = CurDAG->getStore(N->getOperand(0), dl, N->getOperand(1), MemTmp,
- MachinePointerInfo());
+ MPI);
}
if (!DstIsSSE) {
SDVTList VTs = CurDAG->getVTList(DstVT, MVT::Other);
SDValue Ops[] = {Store, MemTmp};
- Result = CurDAG->getMemIntrinsicNode(X86ISD::FLD, dl, VTs, Ops, MemVT,
- MachinePointerInfo(), 0,
- MachineMemOperand::MOLoad);
+ Result = CurDAG->getMemIntrinsicNode(
+ X86ISD::FLD, dl, VTs, Ops, MemVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
if (N->getFlags().hasNoFPExcept()) {
SDNodeFlags Flags = Result->getFlags();
Flags.setNoFPExcept(true);
@@ -1153,8 +1264,7 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
}
} else {
assert(DstVT == MemVT && "Unexpected VT!");
- Result =
- CurDAG->getLoad(DstVT, dl, Store, MemTmp, MachinePointerInfo());
+ Result = CurDAG->getLoad(DstVT, dl, Store, MemTmp, MPI);
}
// We're about to replace all uses of the FP_ROUND/FP_EXTEND with the
@@ -1171,13 +1281,12 @@ void X86DAGToDAGISel::PreprocessISelDAG() {
// Now that we did that, the node is dead. Increment the iterator to the
// next node to process, then delete N.
++I;
- CurDAG->DeleteNode(N);
+ MadeChange = true;
}
- // The load+call transform above can leave some dead nodes in the graph. Make
- // sure we remove them. Its possible some of the other transforms do to so
- // just remove dead nodes unconditionally.
- CurDAG->RemoveDeadNodes();
+ // Remove any dead nodes that may have been left behind.
+ if (MadeChange)
+ CurDAG->RemoveDeadNodes();
}
// Look for a redundant movzx/movsx that can occur after an 8-bit divrem.
@@ -1275,6 +1384,8 @@ void X86DAGToDAGISel::PostprocessISelDAG() {
And.getOperand(6) /* Chain */ };
MachineSDNode *Test = CurDAG->getMachineNode(NewOpc, SDLoc(N),
MVT::i32, MVT::Other, Ops);
+ CurDAG->setNodeMemRefs(
+ Test, cast<MachineSDNode>(And.getNode())->memoperands());
ReplaceUses(N, Test);
MadeChange = true;
continue;
@@ -1390,7 +1501,7 @@ void X86DAGToDAGISel::emitSpecialCodeForMain() {
}
}
-void X86DAGToDAGISel::EmitFunctionEntryCode() {
+void X86DAGToDAGISel::emitFunctionEntryCode() {
// If this is main, emit special code for main.
const Function &F = MF->getFunction();
if (F.hasExternalLinkage() && F.getName() == "main")
@@ -1409,18 +1520,20 @@ static bool isDispSafeForFrameIndex(int64_t Val) {
bool X86DAGToDAGISel::foldOffsetIntoAddress(uint64_t Offset,
X86ISelAddressMode &AM) {
- // If there's no offset to fold, we don't need to do any work.
- if (Offset == 0)
- return false;
+ // We may have already matched a displacement and the caller just added the
+ // symbolic displacement. So we still need to do the checks even if Offset
+ // is zero.
+
+ int64_t Val = AM.Disp + Offset;
// Cannot combine ExternalSymbol displacements with integer offsets.
- if (AM.ES || AM.MCSym)
+ if (Val != 0 && (AM.ES || AM.MCSym))
return true;
- int64_t Val = AM.Disp + Offset;
CodeModel::Model M = TM.getCodeModel();
if (Subtarget->is64Bit()) {
- if (!X86::isOffsetSuitableForCodeModel(Val, M,
+ if (Val != 0 &&
+ !X86::isOffsetSuitableForCodeModel(Val, M,
AM.hasSymbolicDisplacement()))
return true;
// In addition to the checks required for a register base, check that
@@ -1449,13 +1562,13 @@ bool X86DAGToDAGISel::matchLoadInAddress(LoadSDNode *N, X86ISelAddressMode &AM){
(Subtarget->isTargetGlibc() || Subtarget->isTargetAndroid() ||
Subtarget->isTargetFuchsia()))
switch (N->getPointerInfo().getAddrSpace()) {
- case 256:
+ case X86AS::GS:
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
return false;
- case 257:
+ case X86AS::FS:
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
return false;
- // Address space 258 is not handled here, because it is not used to
+ // Address space X86AS::SS is not handled here, because it is not used to
// address TLS areas.
}
@@ -1505,7 +1618,7 @@ bool X86DAGToDAGISel::matchWrapper(SDValue N, X86ISelAddressMode &AM) {
Offset = G->getOffset();
} else if (ConstantPoolSDNode *CP = dyn_cast<ConstantPoolSDNode>(N0)) {
AM.CP = CP->getConstVal();
- AM.Align = CP->getAlignment();
+ AM.Alignment = CP->getAlign();
AM.SymbolFlags = CP->getTargetFlags();
Offset = CP->getOffset();
} else if (ExternalSymbolSDNode *S = dyn_cast<ExternalSymbolSDNode>(N0)) {
@@ -1583,9 +1696,10 @@ bool X86DAGToDAGISel::matchAdd(SDValue &N, X86ISelAddressMode &AM,
return false;
AM = Backup;
- // Try again after commuting the operands.
- if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM, Depth+1) &&
- !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth+1))
+ // Try again after commuting the operands.
+ if (!matchAddressRecursively(Handle.getValue().getOperand(1), AM,
+ Depth + 1) &&
+ !matchAddressRecursively(Handle.getValue().getOperand(0), AM, Depth + 1))
return false;
AM = Backup;
@@ -1782,7 +1896,7 @@ static bool foldMaskAndShiftToScale(SelectionDAG &DAG, SDValue N,
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
- if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+ if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
// We also need to ensure that mask is a continuous run of bits.
if (countTrailingOnes(Mask >> MaskTZ) + MaskTZ + MaskLZ != 64) return true;
@@ -1877,7 +1991,7 @@ static bool foldMaskedShiftToBEXTR(SelectionDAG &DAG, SDValue N,
// There is nothing we can do here unless the mask is removing some bits.
// Also, the addressing mode can only represent shifts of 1, 2, or 3 bits.
- if (AMShiftAmt <= 0 || AMShiftAmt > 3) return true;
+ if (AMShiftAmt == 0 || AMShiftAmt > 3) return true;
MVT VT = N.getSimpleValueType();
SDLoc DL(N);
@@ -2280,15 +2394,16 @@ bool X86DAGToDAGISel::matchVectorAddress(SDValue N, X86ISelAddressMode &AM) {
return matchAddressBase(N, AM);
}
-bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment) {
+bool X86DAGToDAGISel::selectVectorAddr(MemSDNode *Parent, SDValue BasePtr,
+ SDValue IndexOp, SDValue ScaleOp,
+ SDValue &Base, SDValue &Scale,
+ SDValue &Index, SDValue &Disp,
+ SDValue &Segment) {
X86ISelAddressMode AM;
- auto *Mgs = cast<X86MaskedGatherScatterSDNode>(Parent);
- AM.IndexReg = Mgs->getIndex();
- AM.Scale = cast<ConstantSDNode>(Mgs->getScale())->getZExtValue();
+ AM.IndexReg = IndexOp;
+ AM.Scale = cast<ConstantSDNode>(ScaleOp)->getZExtValue();
- unsigned AddrSpace = cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
+ unsigned AddrSpace = Parent->getPointerInfo().getAddrSpace();
if (AddrSpace == X86AS::GS)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
if (AddrSpace == X86AS::FS)
@@ -2296,11 +2411,11 @@ bool X86DAGToDAGISel::selectVectorAddr(SDNode *Parent, SDValue N, SDValue &Base,
if (AddrSpace == X86AS::SS)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
- SDLoc DL(N);
- MVT VT = N.getSimpleValueType();
+ SDLoc DL(BasePtr);
+ MVT VT = BasePtr.getSimpleValueType();
// Try to match into the base and displacement fields.
- if (matchVectorAddress(N, AM))
+ if (matchVectorAddress(BasePtr, AM))
return false;
getAddressOperands(AM, DL, VT, Base, Scale, Index, Disp, Segment);
@@ -2331,12 +2446,11 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
Parent->getOpcode() != X86ISD::EH_SJLJ_LONGJMP) { // longjmp
unsigned AddrSpace =
cast<MemSDNode>(Parent)->getPointerInfo().getAddrSpace();
- // AddrSpace 256 -> GS, 257 -> FS, 258 -> SS.
- if (AddrSpace == 256)
+ if (AddrSpace == X86AS::GS)
AM.Segment = CurDAG->getRegister(X86::GS, MVT::i16);
- if (AddrSpace == 257)
+ if (AddrSpace == X86AS::FS)
AM.Segment = CurDAG->getRegister(X86::FS, MVT::i16);
- if (AddrSpace == 258)
+ if (AddrSpace == X86AS::SS)
AM.Segment = CurDAG->getRegister(X86::SS, MVT::i16);
}
@@ -2351,86 +2465,7 @@ bool X86DAGToDAGISel::selectAddr(SDNode *Parent, SDValue N, SDValue &Base,
return true;
}
-// We can only fold a load if all nodes between it and the root node have a
-// single use. If there are additional uses, we could end up duplicating the
-// load.
-static bool hasSingleUsesFromRoot(SDNode *Root, SDNode *User) {
- while (User != Root) {
- if (!User->hasOneUse())
- return false;
- User = *User->use_begin();
- }
-
- return true;
-}
-
-/// Match a scalar SSE load. In particular, we want to match a load whose top
-/// elements are either undef or zeros. The load flavor is derived from the
-/// type of N, which is either v4f32 or v2f64.
-///
-/// We also return:
-/// PatternChainNode: this is the matched node that has a chain input and
-/// output.
-bool X86DAGToDAGISel::selectScalarSSELoad(SDNode *Root, SDNode *Parent,
- SDValue N, SDValue &Base,
- SDValue &Scale, SDValue &Index,
- SDValue &Disp, SDValue &Segment,
- SDValue &PatternNodeWithChain) {
- if (!hasSingleUsesFromRoot(Root, Parent))
- return false;
-
- // We can allow a full vector load here since narrowing a load is ok unless
- // it's volatile or atomic.
- if (ISD::isNON_EXTLoad(N.getNode())) {
- LoadSDNode *LD = cast<LoadSDNode>(N);
- if (LD->isSimple() &&
- IsProfitableToFold(N, LD, Root) &&
- IsLegalToFold(N, Parent, Root, OptLevel)) {
- PatternNodeWithChain = N;
- return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
- // We can also match the special zero extended load opcode.
- if (N.getOpcode() == X86ISD::VZEXT_LOAD) {
- PatternNodeWithChain = N;
- if (IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, Parent, Root, OptLevel)) {
- auto *MI = cast<MemIntrinsicSDNode>(PatternNodeWithChain);
- return selectAddr(MI, MI->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
- // Need to make sure that the SCALAR_TO_VECTOR and load are both only used
- // once. Otherwise the load might get duplicated and the chain output of the
- // duplicate load will not be observed by all dependencies.
- if (N.getOpcode() == ISD::SCALAR_TO_VECTOR && N.getNode()->hasOneUse()) {
- PatternNodeWithChain = N.getOperand(0);
- if (ISD::isNON_EXTLoad(PatternNodeWithChain.getNode()) &&
- IsProfitableToFold(PatternNodeWithChain, N.getNode(), Root) &&
- IsLegalToFold(PatternNodeWithChain, N.getNode(), Root, OptLevel)) {
- LoadSDNode *LD = cast<LoadSDNode>(PatternNodeWithChain);
- return selectAddr(LD, LD->getBasePtr(), Base, Scale, Index, Disp,
- Segment);
- }
- }
-
- return false;
-}
-
-
bool X86DAGToDAGISel::selectMOV64Imm32(SDValue N, SDValue &Imm) {
- if (const ConstantSDNode *CN = dyn_cast<ConstantSDNode>(N)) {
- uint64_t ImmVal = CN->getZExtValue();
- if (!isUInt<32>(ImmVal))
- return false;
-
- Imm = CurDAG->getTargetConstant(ImmVal, SDLoc(N), MVT::i64);
- return true;
- }
-
// In static codegen with small code model, we can get the address of a label
// into a register with 'movl'
if (N->getOpcode() != X86ISD::Wrapper)
@@ -2604,12 +2639,6 @@ bool X86DAGToDAGISel::selectTLSADDRAddr(SDValue N, SDValue &Base,
}
bool X86DAGToDAGISel::selectRelocImm(SDValue N, SDValue &Op) {
- if (auto *CN = dyn_cast<ConstantSDNode>(N)) {
- Op = CurDAG->getTargetConstant(CN->getAPIntValue(), SDLoc(CN),
- N.getValueType());
- return true;
- }
-
// Keep track of the original value type and whether this value was
// truncated. If we see a truncation from pointer type to VT that truncates
// bits that are known to be zero, we can use a narrow reference.
@@ -3896,49 +3925,82 @@ bool X86DAGToDAGISel::tryShrinkShlLogicImm(SDNode *N) {
return true;
}
-/// Convert vector increment or decrement to sub/add with an all-ones constant:
-/// add X, <1, 1...> --> sub X, <-1, -1...>
-/// sub X, <1, 1...> --> add X, <-1, -1...>
-/// The all-ones vector constant can be materialized using a pcmpeq instruction
-/// that is commonly recognized as an idiom (has no register dependency), so
-/// that's better/smaller than loading a splat 1 constant.
-bool X86DAGToDAGISel::combineIncDecVector(SDNode *Node) {
- assert((Node->getOpcode() == ISD::ADD || Node->getOpcode() == ISD::SUB) &&
- "Unexpected opcode for increment/decrement transform");
-
- EVT VT = Node->getValueType(0);
- assert(VT.isVector() && "Should only be called for vectors.");
-
- SDValue X = Node->getOperand(0);
- SDValue OneVec = Node->getOperand(1);
+// Try to match two logic ops to a VPTERNLOG.
+// FIXME: Handle inverted inputs?
+// FIXME: Handle more complex patterns that use an operand more than once?
+bool X86DAGToDAGISel::tryVPTERNLOG(SDNode *N) {
+ MVT NVT = N->getSimpleValueType(0);
- APInt SplatVal;
- if (!X86::isConstantSplat(OneVec, SplatVal) || !SplatVal.isOneValue())
+ // Make sure we support VPTERNLOG.
+ if (!NVT.isVector() || !Subtarget->hasAVX512() ||
+ NVT.getVectorElementType() == MVT::i1)
return false;
- SDLoc DL(Node);
- SDValue OneConstant, AllOnesVec;
+ // We need VLX for 128/256-bit.
+ if (!(Subtarget->hasVLX() || NVT.is512BitVector()))
+ return false;
- APInt Ones = APInt::getAllOnesValue(32);
- assert(VT.getSizeInBits() % 32 == 0 &&
- "Expected bit count to be a multiple of 32");
- OneConstant = CurDAG->getConstant(Ones, DL, MVT::i32);
- insertDAGNode(*CurDAG, X, OneConstant);
+ unsigned Opc1 = N->getOpcode();
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
- unsigned NumElts = VT.getSizeInBits() / 32;
- assert(NumElts > 0 && "Expected to get non-empty vector.");
- AllOnesVec = CurDAG->getSplatBuildVector(MVT::getVectorVT(MVT::i32, NumElts),
- DL, OneConstant);
- insertDAGNode(*CurDAG, X, AllOnesVec);
+ auto isLogicOp = [](unsigned Opc) {
+ return Opc == ISD::AND || Opc == ISD::OR || Opc == ISD::XOR ||
+ Opc == X86ISD::ANDNP;
+ };
- AllOnesVec = CurDAG->getBitcast(VT, AllOnesVec);
- insertDAGNode(*CurDAG, X, AllOnesVec);
+ SDValue A, B, C;
+ unsigned Opc2;
+ if (isLogicOp(N1.getOpcode()) && N1.hasOneUse()) {
+ Opc2 = N1.getOpcode();
+ A = N0;
+ B = N1.getOperand(0);
+ C = N1.getOperand(1);
+ } else if (isLogicOp(N0.getOpcode()) && N0.hasOneUse()) {
+ Opc2 = N0.getOpcode();
+ A = N1;
+ B = N0.getOperand(0);
+ C = N0.getOperand(1);
+ } else
+ return false;
- unsigned NewOpcode = Node->getOpcode() == ISD::ADD ? ISD::SUB : ISD::ADD;
- SDValue NewNode = CurDAG->getNode(NewOpcode, DL, VT, X, AllOnesVec);
+ uint64_t Imm;
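+ // The 8-bit immediate is a truth table: bit ((A << 2) | (B << 1) | C) of the
+ // immediate gives the result for that combination of input bits. For example,
+ // A & (B & C) is true only when A = B = C = 1, selecting bit 7, i.e. 0x80.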
+ switch (Opc1) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0x80; break;
+ case ISD::OR: Imm = 0xe0; break;
+ case ISD::XOR: Imm = 0x60; break;
+ case X86ISD::ANDNP: Imm = 0x20; break;
+ }
+ break;
+ case ISD::OR:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0xf8; break;
+ case ISD::OR: Imm = 0xfe; break;
+ case ISD::XOR: Imm = 0xf6; break;
+ case X86ISD::ANDNP: Imm = 0xf2; break;
+ }
+ break;
+ case ISD::XOR:
+ switch (Opc2) {
+ default: llvm_unreachable("Unexpected opcode!");
+ case ISD::AND: Imm = 0x78; break;
+ case ISD::OR: Imm = 0x1e; break;
+ case ISD::XOR: Imm = 0x96; break;
+ case X86ISD::ANDNP: Imm = 0xd2; break;
+ }
+ break;
+ }
- ReplaceNode(Node, NewNode.getNode());
- SelectCode(NewNode.getNode());
+ SDLoc DL(N);
+ SDValue New = CurDAG->getNode(X86ISD::VPTERNLOG, DL, NVT, A, B, C,
+ CurDAG->getTargetConstant(Imm, DL, MVT::i8));
+ ReplaceNode(N, New.getNode());
+ SelectCode(New.getNode());
return true;
}
@@ -4014,159 +4076,50 @@ bool X86DAGToDAGISel::shrinkAndImmediate(SDNode *And) {
static unsigned getVPTESTMOpc(MVT TestVT, bool IsTestN, bool FoldedLoad,
bool FoldedBCast, bool Masked) {
- if (Masked) {
- if (FoldedLoad) {
- switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rmk : X86::VPTESTMBZ128rmk;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rmk : X86::VPTESTMWZ128rmk;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rmk : X86::VPTESTMDZ128rmk;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rmk : X86::VPTESTMQZ128rmk;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rmk : X86::VPTESTMBZ256rmk;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rmk : X86::VPTESTMWZ256rmk;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rmk : X86::VPTESTMDZ256rmk;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rmk : X86::VPTESTMQZ256rmk;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrmk : X86::VPTESTMBZrmk;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrmk : X86::VPTESTMWZrmk;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrmk : X86::VPTESTMDZrmk;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrmk : X86::VPTESTMQZrmk;
- }
- }
-
- if (FoldedBCast) {
- switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rmbk : X86::VPTESTMDZ128rmbk;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rmbk : X86::VPTESTMQZ128rmbk;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rmbk : X86::VPTESTMDZ256rmbk;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rmbk : X86::VPTESTMQZ256rmbk;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrmbk : X86::VPTESTMDZrmbk;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrmbk : X86::VPTESTMQZrmbk;
- }
- }
-
- switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rrk : X86::VPTESTMBZ128rrk;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rrk : X86::VPTESTMWZ128rrk;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rrk : X86::VPTESTMDZ128rrk;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rrk : X86::VPTESTMQZ128rrk;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rrk : X86::VPTESTMBZ256rrk;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rrk : X86::VPTESTMWZ256rrk;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rrk : X86::VPTESTMDZ256rrk;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rrk : X86::VPTESTMQZ256rrk;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrrk : X86::VPTESTMBZrrk;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrrk : X86::VPTESTMWZrrk;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrrk : X86::VPTESTMDZrrk;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrrk : X86::VPTESTMQZrrk;
- }
- }
+#define VPTESTM_CASE(VT, SUFFIX) \
+case MVT::VT: \
+ if (Masked) \
+ return IsTestN ? X86::VPTESTNM##SUFFIX##k: X86::VPTESTM##SUFFIX##k; \
+ return IsTestN ? X86::VPTESTNM##SUFFIX : X86::VPTESTM##SUFFIX;
+
+
+#define VPTESTM_BROADCAST_CASES(SUFFIX) \
+default: llvm_unreachable("Unexpected VT!"); \
+VPTESTM_CASE(v4i32, DZ128##SUFFIX) \
+VPTESTM_CASE(v2i64, QZ128##SUFFIX) \
+VPTESTM_CASE(v8i32, DZ256##SUFFIX) \
+VPTESTM_CASE(v4i64, QZ256##SUFFIX) \
+VPTESTM_CASE(v16i32, DZ##SUFFIX) \
+VPTESTM_CASE(v8i64, QZ##SUFFIX)
+
+#define VPTESTM_FULL_CASES(SUFFIX) \
+VPTESTM_BROADCAST_CASES(SUFFIX) \
+VPTESTM_CASE(v16i8, BZ128##SUFFIX) \
+VPTESTM_CASE(v8i16, WZ128##SUFFIX) \
+VPTESTM_CASE(v32i8, BZ256##SUFFIX) \
+VPTESTM_CASE(v16i16, WZ256##SUFFIX) \
+VPTESTM_CASE(v64i8, BZ##SUFFIX) \
+VPTESTM_CASE(v32i16, WZ##SUFFIX)
if (FoldedLoad) {
switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rm : X86::VPTESTMBZ128rm;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rm : X86::VPTESTMWZ128rm;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rm : X86::VPTESTMDZ128rm;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rm : X86::VPTESTMQZ128rm;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rm : X86::VPTESTMBZ256rm;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rm : X86::VPTESTMWZ256rm;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rm : X86::VPTESTMDZ256rm;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rm : X86::VPTESTMQZ256rm;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrm : X86::VPTESTMBZrm;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrm : X86::VPTESTMWZrm;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrm : X86::VPTESTMDZrm;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrm : X86::VPTESTMQZrm;
+ VPTESTM_FULL_CASES(rm)
}
}
if (FoldedBCast) {
switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rmb : X86::VPTESTMDZ128rmb;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rmb : X86::VPTESTMQZ128rmb;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rmb : X86::VPTESTMDZ256rmb;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rmb : X86::VPTESTMQZ256rmb;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrmb : X86::VPTESTMDZrmb;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrmb : X86::VPTESTMQZrmb;
+ VPTESTM_BROADCAST_CASES(rmb)
}
}
switch (TestVT.SimpleTy) {
- default: llvm_unreachable("Unexpected VT!");
- case MVT::v16i8:
- return IsTestN ? X86::VPTESTNMBZ128rr : X86::VPTESTMBZ128rr;
- case MVT::v8i16:
- return IsTestN ? X86::VPTESTNMWZ128rr : X86::VPTESTMWZ128rr;
- case MVT::v4i32:
- return IsTestN ? X86::VPTESTNMDZ128rr : X86::VPTESTMDZ128rr;
- case MVT::v2i64:
- return IsTestN ? X86::VPTESTNMQZ128rr : X86::VPTESTMQZ128rr;
- case MVT::v32i8:
- return IsTestN ? X86::VPTESTNMBZ256rr : X86::VPTESTMBZ256rr;
- case MVT::v16i16:
- return IsTestN ? X86::VPTESTNMWZ256rr : X86::VPTESTMWZ256rr;
- case MVT::v8i32:
- return IsTestN ? X86::VPTESTNMDZ256rr : X86::VPTESTMDZ256rr;
- case MVT::v4i64:
- return IsTestN ? X86::VPTESTNMQZ256rr : X86::VPTESTMQZ256rr;
- case MVT::v64i8:
- return IsTestN ? X86::VPTESTNMBZrr : X86::VPTESTMBZrr;
- case MVT::v32i16:
- return IsTestN ? X86::VPTESTNMWZrr : X86::VPTESTMWZrr;
- case MVT::v16i32:
- return IsTestN ? X86::VPTESTNMDZrr : X86::VPTESTMDZrr;
- case MVT::v8i64:
- return IsTestN ? X86::VPTESTNMQZrr : X86::VPTESTMQZrr;
+ VPTESTM_FULL_CASES(rr)
}
+
+#undef VPTESTM_FULL_CASES
+#undef VPTESTM_BROADCAST_CASES
+#undef VPTESTM_CASE
}
// Try to create VPTESTM instruction. If InMask is not null, it will be used
@@ -4477,8 +4430,39 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
break;
}
+ case Intrinsic::x86_tileloadd64:
+ case Intrinsic::x86_tileloaddt164:
+ case Intrinsic::x86_tilestored64: {
+ if (!Subtarget->hasAMXTILE())
+ break;
+ unsigned Opc;
+ switch (IntNo) {
+ default: llvm_unreachable("Unexpected intrinsic!");
+ case Intrinsic::x86_tileloadd64: Opc = X86::PTILELOADD; break;
+ case Intrinsic::x86_tileloaddt164: Opc = X86::PTILELOADDT1; break;
+ case Intrinsic::x86_tilestored64: Opc = X86::PTILESTORED; break;
+ }
+ // FIXME: Match displacement and scale.
+ unsigned TIndex = Node->getConstantOperandVal(2);
+ SDValue TReg = getI8Imm(TIndex, dl);
+ SDValue Base = Node->getOperand(3);
+ SDValue Scale = getI8Imm(1, dl);
+ SDValue Index = Node->getOperand(4);
+ SDValue Disp = CurDAG->getTargetConstant(0, dl, MVT::i32);
+ SDValue Segment = CurDAG->getRegister(0, MVT::i16);
+ SDValue Chain = Node->getOperand(0);
+ MachineSDNode *CNode;
+ if (Opc == X86::PTILESTORED) {
+ SDValue Ops[] = { Base, Scale, Index, Disp, Segment, TReg, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ } else {
+ SDValue Ops[] = { TReg, Base, Scale, Index, Disp, Segment, Chain };
+ CNode = CurDAG->getMachineNode(Opc, dl, MVT::Other, Ops);
+ }
+ ReplaceNode(Node, CNode);
+ return;
+ }
}
-
break;
}
case ISD::BRIND: {
@@ -4490,9 +4474,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
// Converts a 32-bit register to a 64-bit, zero-extended version of
// it. This is needed because x86-64 can do many things, but jmp %r32
// ain't one of them.
- const SDValue &Target = Node->getOperand(1);
- assert(Target.getSimpleValueType() == llvm::MVT::i32);
- SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, EVT(MVT::i64));
+ SDValue Target = Node->getOperand(1);
+ assert(Target.getValueType() == MVT::i32 && "Unexpected VT!");
+ SDValue ZextTarget = CurDAG->getZExtOrTrunc(Target, dl, MVT::i64);
SDValue Brind = CurDAG->getNode(ISD::BRIND, dl, MVT::Other,
Node->getOperand(0), ZextTarget);
ReplaceNode(Node, Brind.getNode());
@@ -4516,21 +4500,6 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
}
break;
- case ISD::VSELECT: {
- // Replace VSELECT with non-mask conditions with with BLENDV.
- if (Node->getOperand(0).getValueType().getVectorElementType() == MVT::i1)
- break;
-
- assert(Subtarget->hasSSE41() && "Expected SSE4.1 support!");
- SDValue Blendv = CurDAG->getNode(
- X86ISD::BLENDV, SDLoc(Node), Node->getValueType(0), Node->getOperand(0),
- Node->getOperand(1), Node->getOperand(2));
- ReplaceNode(Node, Blendv.getNode());
- SelectCode(Blendv.getNode());
- // We already called ReplaceUses.
- return;
- }
-
case ISD::SRL:
if (matchBitExtract(Node))
return;
@@ -4569,24 +4538,21 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
case ISD::XOR:
if (tryShrinkShlLogicImm(Node))
return;
-
if (Opcode == ISD::OR && tryMatchBitSelect(Node))
return;
+ if (tryVPTERNLOG(Node))
+ return;
LLVM_FALLTHROUGH;
case ISD::ADD:
case ISD::SUB: {
- if ((Opcode == ISD::ADD || Opcode == ISD::SUB) && NVT.isVector() &&
- combineIncDecVector(Node))
- return;
-
// Try to avoid folding immediates with multiple uses for optsize.
// This code tries to select to register form directly to avoid going
// through the isel table which might fold the immediate. We can't change
// the patterns on the add/sub/and/or/xor with immediate paterns in the
// tablegen files to check immediate use count without making the patterns
// unavailable to the fast-isel table.
- if (!OptForSize)
+ if (!CurDAG->shouldOptForSize())
break;
// Only handle i8/i16/i32/i64.
@@ -4720,7 +4686,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue Tmp0, Tmp1, Tmp2, Tmp3, Tmp4;
bool FoldedLoad = tryFoldLoad(Node, N1, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
- // Multiply is commmutative.
+ // Multiply is commutative.
if (!FoldedLoad) {
FoldedLoad = tryFoldLoad(Node, N0, Tmp0, Tmp1, Tmp2, Tmp3, Tmp4);
if (FoldedLoad)
@@ -4772,31 +4738,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N1 = Node->getOperand(1);
unsigned Opc, MOpc;
- bool isSigned = Opcode == ISD::SMUL_LOHI;
- if (!isSigned) {
- switch (NVT.SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
- case MVT::i32: Opc = X86::MUL32r; MOpc = X86::MUL32m; break;
- case MVT::i64: Opc = X86::MUL64r; MOpc = X86::MUL64m; break;
- }
- } else {
- switch (NVT.SimpleTy) {
- default: llvm_unreachable("Unsupported VT!");
- case MVT::i32: Opc = X86::IMUL32r; MOpc = X86::IMUL32m; break;
- case MVT::i64: Opc = X86::IMUL64r; MOpc = X86::IMUL64m; break;
- }
- }
-
- unsigned SrcReg, LoReg, HiReg;
- switch (Opc) {
- default: llvm_unreachable("Unknown MUL opcode!");
- case X86::IMUL32r:
- case X86::MUL32r:
- SrcReg = LoReg = X86::EAX; HiReg = X86::EDX;
+ unsigned LoReg, HiReg;
+ bool IsSigned = Opcode == ISD::SMUL_LOHI;
+ bool UseMULX = !IsSigned && Subtarget->hasBMI2();
+ bool UseMULXHi = UseMULX && SDValue(Node, 0).use_empty();
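+ // MULX (BMI2) reads one source implicitly from EDX/RDX, writes the high and
+ // low halves to explicit destinations, and does not modify EFLAGS. The
+ // Hi-only MULX forms are used when the low half of the result has no uses.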
+ switch (NVT.SimpleTy) {
+ default: llvm_unreachable("Unsupported VT!");
+ case MVT::i32:
+ Opc = UseMULXHi ? X86::MULX32Hrr :
+ UseMULX ? X86::MULX32rr :
+ IsSigned ? X86::IMUL32r : X86::MUL32r;
+ MOpc = UseMULXHi ? X86::MULX32Hrm :
+ UseMULX ? X86::MULX32rm :
+ IsSigned ? X86::IMUL32m : X86::MUL32m;
+ LoReg = UseMULX ? X86::EDX : X86::EAX;
+ HiReg = X86::EDX;
break;
- case X86::IMUL64r:
- case X86::MUL64r:
- SrcReg = LoReg = X86::RAX; HiReg = X86::RDX;
+ case MVT::i64:
+ Opc = UseMULXHi ? X86::MULX64Hrr :
+ UseMULX ? X86::MULX64rr :
+ IsSigned ? X86::IMUL64r : X86::MUL64r;
+ MOpc = UseMULXHi ? X86::MULX64Hrm :
+ UseMULX ? X86::MULX64rm :
+ IsSigned ? X86::IMUL64m : X86::MUL64m;
+ LoReg = UseMULX ? X86::RDX : X86::RAX;
+ HiReg = X86::RDX;
break;
}
@@ -4809,17 +4775,31 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
std::swap(N0, N1);
}
- SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, SrcReg,
+ SDValue InFlag = CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, LoReg,
N0, SDValue()).getValue(1);
+ SDValue ResHi, ResLo;
if (foldedLoad) {
SDValue Chain;
MachineSDNode *CNode = nullptr;
SDValue Ops[] = { Tmp0, Tmp1, Tmp2, Tmp3, Tmp4, N1.getOperand(0),
InFlag };
- SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
- CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
- Chain = SDValue(CNode, 0);
- InFlag = SDValue(CNode, 1);
+ if (UseMULXHi) {
+ SDVTList VTs = CurDAG->getVTList(NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ Chain = SDValue(CNode, 1);
+ } else if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT, MVT::Other);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ Chain = SDValue(CNode, 2);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Other, MVT::Glue);
+ CNode = CurDAG->getMachineNode(MOpc, dl, VTs, Ops);
+ Chain = SDValue(CNode, 0);
+ InFlag = SDValue(CNode, 1);
+ }
// Update the chain.
ReplaceUses(N1.getValue(1), Chain);
@@ -4827,27 +4807,42 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
SDValue Ops[] = { N1, InFlag };
- SDVTList VTs = CurDAG->getVTList(MVT::Glue);
- SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
- InFlag = SDValue(CNode, 0);
+ if (UseMULXHi) {
+ SDVTList VTs = CurDAG->getVTList(NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ } else if (UseMULX) {
+ SDVTList VTs = CurDAG->getVTList(NVT, NVT);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ ResHi = SDValue(CNode, 0);
+ ResLo = SDValue(CNode, 1);
+ } else {
+ SDVTList VTs = CurDAG->getVTList(MVT::Glue);
+ SDNode *CNode = CurDAG->getMachineNode(Opc, dl, VTs, Ops);
+ InFlag = SDValue(CNode, 0);
+ }
}
// Copy the low half of the result, if it is needed.
if (!SDValue(Node, 0).use_empty()) {
- assert(LoReg && "Register for low half is not defined!");
- SDValue ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
- NVT, InFlag);
- InFlag = ResLo.getValue(2);
+ if (!ResLo) {
+ assert(LoReg && "Register for low half is not defined!");
+ ResLo = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, LoReg,
+ NVT, InFlag);
+ InFlag = ResLo.getValue(2);
+ }
ReplaceUses(SDValue(Node, 0), ResLo);
LLVM_DEBUG(dbgs() << "=> "; ResLo.getNode()->dump(CurDAG);
dbgs() << '\n');
}
// Copy the high half of the result, if it is needed.
if (!SDValue(Node, 1).use_empty()) {
- assert(HiReg && "Register for high half is not defined!");
- SDValue ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
- NVT, InFlag);
- InFlag = ResHi.getValue(2);
+ if (!ResHi) {
+ assert(HiReg && "Register for high half is not defined!");
+ ResHi = CurDAG->getCopyFromReg(CurDAG->getEntryNode(), dl, HiReg,
+ NVT, InFlag);
+ InFlag = ResHi.getValue(2);
+ }
ReplaceUses(SDValue(Node, 1), ResHi);
LLVM_DEBUG(dbgs() << "=> "; ResHi.getNode()->dump(CurDAG);
dbgs() << '\n');
@@ -4862,23 +4857,23 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
- unsigned Opc, MOpc;
+ unsigned ROpc, MOpc;
bool isSigned = Opcode == ISD::SDIVREM;
if (!isSigned) {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::DIV8r; MOpc = X86::DIV8m; break;
- case MVT::i16: Opc = X86::DIV16r; MOpc = X86::DIV16m; break;
- case MVT::i32: Opc = X86::DIV32r; MOpc = X86::DIV32m; break;
- case MVT::i64: Opc = X86::DIV64r; MOpc = X86::DIV64m; break;
+ case MVT::i8: ROpc = X86::DIV8r; MOpc = X86::DIV8m; break;
+ case MVT::i16: ROpc = X86::DIV16r; MOpc = X86::DIV16m; break;
+ case MVT::i32: ROpc = X86::DIV32r; MOpc = X86::DIV32m; break;
+ case MVT::i64: ROpc = X86::DIV64r; MOpc = X86::DIV64m; break;
}
} else {
switch (NVT.SimpleTy) {
default: llvm_unreachable("Unsupported VT!");
- case MVT::i8: Opc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
- case MVT::i16: Opc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
- case MVT::i32: Opc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
- case MVT::i64: Opc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
+ case MVT::i8: ROpc = X86::IDIV8r; MOpc = X86::IDIV8m; break;
+ case MVT::i16: ROpc = X86::IDIV16r; MOpc = X86::IDIV16m; break;
+ case MVT::i32: ROpc = X86::IDIV32r; MOpc = X86::IDIV32m; break;
+ case MVT::i64: ROpc = X86::IDIV64r; MOpc = X86::IDIV64m; break;
}
}
@@ -4943,7 +4938,9 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
SDValue(CurDAG->getMachineNode(SExtOpcode, dl, MVT::Glue, InFlag),0);
} else {
// Zero out the high part, effectively zero extending the input.
- SDValue ClrNode = SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, NVT), 0);
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue ClrNode =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
switch (NVT.SimpleTy) {
case MVT::i16:
ClrNode =
@@ -4985,7 +4982,7 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
CurDAG->setNodeMemRefs(CNode, {cast<LoadSDNode>(N1)->getMemOperand()});
} else {
InFlag =
- SDValue(CurDAG->getMachineNode(Opc, dl, MVT::Glue, N1, InFlag), 0);
+ SDValue(CurDAG->getMachineNode(ROpc, dl, MVT::Glue, N1, InFlag), 0);
}
// Prevent use of AH in a REX instruction by explicitly copying it to
@@ -5034,6 +5031,77 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
return;
}
+ case X86ISD::FCMP:
+ case X86ISD::STRICT_FCMP:
+ case X86ISD::STRICT_FCMPS: {
+ bool IsStrictCmp = Node->getOpcode() == X86ISD::STRICT_FCMP ||
+ Node->getOpcode() == X86ISD::STRICT_FCMPS;
+ SDValue N0 = Node->getOperand(IsStrictCmp ? 1 : 0);
+ SDValue N1 = Node->getOperand(IsStrictCmp ? 2 : 1);
+
+ // Save the original VT of the compare.
+ MVT CmpVT = N0.getSimpleValueType();
+
+ // Floating point needs special handling if we don't have FCOMI.
+ if (Subtarget->hasCMov())
+ break;
+
+ bool IsSignaling = Node->getOpcode() == X86ISD::STRICT_FCMPS;
+
+ unsigned Opc;
+ switch (CmpVT.SimpleTy) {
+ default: llvm_unreachable("Unexpected type!");
+ case MVT::f32:
+ Opc = IsSignaling ? X86::COM_Fpr32 : X86::UCOM_Fpr32;
+ break;
+ case MVT::f64:
+ Opc = IsSignaling ? X86::COM_Fpr64 : X86::UCOM_Fpr64;
+ break;
+ case MVT::f80:
+ Opc = IsSignaling ? X86::COM_Fpr80 : X86::UCOM_Fpr80;
+ break;
+ }
+
+ SDValue Cmp;
+ SDValue Chain =
+ IsStrictCmp ? Node->getOperand(0) : CurDAG->getEntryNode();
+ if (IsStrictCmp) {
+ SDVTList VTs = CurDAG->getVTList(MVT::i16, MVT::Other);
+ Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {N0, N1, Chain}), 0);
+ Chain = Cmp.getValue(1);
+ } else {
+ Cmp = SDValue(CurDAG->getMachineNode(Opc, dl, MVT::i16, N0, N1), 0);
+ }
+
+ // Move FPSW to AX.
+ SDValue FPSW = CurDAG->getCopyToReg(Chain, dl, X86::FPSW, Cmp, SDValue());
+ Chain = FPSW;
+ SDValue FNSTSW =
+ SDValue(CurDAG->getMachineNode(X86::FNSTSW16r, dl, MVT::i16, FPSW,
+ FPSW.getValue(1)),
+ 0);
+
+ // Extract upper 8-bits of AX.
+ SDValue Extract =
+ CurDAG->getTargetExtractSubreg(X86::sub_8bit_hi, dl, MVT::i8, FNSTSW);
+
+ // Move AH into flags.
+ // Some 64-bit targets lack SAHF support, but they do support FCOMI.
+ assert(Subtarget->hasLAHFSAHF() &&
+ "Target doesn't support SAHF or FCOMI?");
+ SDValue AH = CurDAG->getCopyToReg(Chain, dl, X86::AH, Extract, SDValue());
+ Chain = AH;
+ SDValue SAHF = SDValue(
+ CurDAG->getMachineNode(X86::SAHF, dl, MVT::i32, AH.getValue(1)), 0);
+
+ if (IsStrictCmp)
+ ReplaceUses(SDValue(Node, 1), Chain);
+
+ ReplaceUses(SDValue(Node, 0), SAHF);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+
case X86ISD::CMP: {
SDValue N0 = Node->getOperand(0);
SDValue N1 = Node->getOperand(1);
@@ -5267,6 +5335,279 @@ void X86DAGToDAGISel::Select(SDNode *Node) {
if (foldLoadStoreIntoMemOperand(Node))
return;
break;
+
+ case X86ISD::SETCC_CARRY: {
+ // We have to do this manually because tblgen will put the eflags copy in
+ // the wrong place if we use an extract_subreg in the pattern.
+ MVT VT = Node->getSimpleValueType(0);
+
+ // Copy flags to the EFLAGS register and glue it to next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(1), SDValue());
+
+ // Create a 64-bit instruction if the result is 64-bits otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SETB_C64r : X86::SETB_C32r;
+ MVT SetVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ SDValue Result = SDValue(
+ CurDAG->getMachineNode(Opc, dl, SetVT, EFLAGS, EFLAGS.getValue(1)), 0);
+
+ // For less than 32-bits we need to extract from the 32-bit node.
+ if (VT == MVT::i8 || VT == MVT::i16) {
+ int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+ Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+ }
+
+ ReplaceUses(SDValue(Node, 0), Result);
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::SBB: {
+ if (isNullConstant(Node->getOperand(0)) &&
+ isNullConstant(Node->getOperand(1))) {
+ MVT VT = Node->getSimpleValueType(0);
+
+ // Create zero.
+ SDVTList VTs = CurDAG->getVTList(MVT::i32, MVT::i32);
+ SDValue Zero =
+ SDValue(CurDAG->getMachineNode(X86::MOV32r0, dl, VTs, None), 0);
+ if (VT == MVT::i64) {
+ Zero = SDValue(
+ CurDAG->getMachineNode(
+ TargetOpcode::SUBREG_TO_REG, dl, MVT::i64,
+ CurDAG->getTargetConstant(0, dl, MVT::i64), Zero,
+ CurDAG->getTargetConstant(X86::sub_32bit, dl, MVT::i32)),
+ 0);
+ }
+
+ // Copy flags to the EFLAGS register and glue it to next node.
+ SDValue EFLAGS =
+ CurDAG->getCopyToReg(CurDAG->getEntryNode(), dl, X86::EFLAGS,
+ Node->getOperand(2), SDValue());
+
+ // Create a 64-bit instruction if the result is 64-bits otherwise use the
+ // 32-bit version.
+ unsigned Opc = VT == MVT::i64 ? X86::SBB64rr : X86::SBB32rr;
+ MVT SBBVT = VT == MVT::i64 ? MVT::i64 : MVT::i32;
+ VTs = CurDAG->getVTList(SBBVT, MVT::i32);
+ SDValue Result =
+ SDValue(CurDAG->getMachineNode(Opc, dl, VTs, {Zero, Zero, EFLAGS,
+ EFLAGS.getValue(1)}),
+ 0);
+
+ // Replace the flag use.
+ ReplaceUses(SDValue(Node, 1), Result.getValue(1));
+
+ // Replace the result use.
+ if (!SDValue(Node, 0).use_empty()) {
+ // For less than 32-bits we need to extract from the 32-bit node.
+ if (VT == MVT::i8 || VT == MVT::i16) {
+ int SubIndex = VT == MVT::i16 ? X86::sub_16bit : X86::sub_8bit;
+ Result = CurDAG->getTargetExtractSubreg(SubIndex, dl, VT, Result);
+ }
+ ReplaceUses(SDValue(Node, 0), Result);
+ }
+
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ break;
+ }
+ case X86ISD::MGATHER: {
+ auto *Mgt = cast<X86MaskedGatherSDNode>(Node);
+ SDValue IndexOp = Mgt->getIndex();
+ SDValue Mask = Mgt->getMask();
+ MVT IndexVT = IndexOp.getSimpleValueType();
+ MVT ValueVT = Node->getSimpleValueType(0);
+ MVT MaskVT = Mask.getSimpleValueType();
+
+ // This is just to prevent crashes if the nodes are malformed somehow. We're
+ // otherwise only doing loose type checking here, based on what a type
+ // constraint would say, just like table-based isel.
+ if (!ValueVT.isVector() || !MaskVT.isVector())
+ break;
+
+ unsigned NumElts = ValueVT.getVectorNumElements();
+ MVT ValueSVT = ValueVT.getVectorElementType();
+
+ bool IsFP = ValueSVT.isFloatingPoint();
+ unsigned EltSize = ValueSVT.getSizeInBits();
+
+ unsigned Opc = 0;
+ bool AVX512Gather = MaskVT.getVectorElementType() == MVT::i1;
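+ // The mask element type tells us whether this is an AVX512 gather (k-register
+ // mask) or an AVX2 gather (vector mask); the two use different opcodes and
+ // operand orders below.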
+ if (AVX512Gather) {
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZ128rm : X86::VPGATHERDDZ128rm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZ256rm : X86::VPGATHERDDZ256rm;
+ else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSZrm : X86::VPGATHERDDZrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZ128rm : X86::VPGATHERDQZ128rm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZ256rm : X86::VPGATHERDQZ256rm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDZrm : X86::VPGATHERDQZrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZ128rm : X86::VPGATHERQDZ128rm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZ256rm : X86::VPGATHERQDZ256rm;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSZrm : X86::VPGATHERQDZrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZ128rm : X86::VPGATHERQQZ128rm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZ256rm : X86::VPGATHERQQZ256rm;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDZrm : X86::VPGATHERQQZrm;
+ } else {
+ assert(EVT(MaskVT) == EVT(ValueVT).changeVectorElementTypeToInteger() &&
+ "Unexpected mask VT!");
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSrm : X86::VPGATHERDDrm;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERDPSYrm : X86::VPGATHERDDYrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDrm : X86::VPGATHERDQrm;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERDPDYrm : X86::VPGATHERDQYrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSrm : X86::VPGATHERQDrm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VGATHERQPSYrm : X86::VPGATHERQDYrm;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDrm : X86::VPGATHERQQrm;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VGATHERQPDYrm : X86::VPGATHERQQYrm;
+ }
+
+ if (!Opc)
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectVectorAddr(Mgt, Mgt->getBasePtr(), IndexOp, Mgt->getScale(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ SDValue PassThru = Mgt->getPassThru();
+ SDValue Chain = Mgt->getChain();
+ // Gather instructions have a mask output not in the ISD node.
+ SDVTList VTs = CurDAG->getVTList(ValueVT, MaskVT, MVT::Other);
+
+ MachineSDNode *NewNode;
+ if (AVX512Gather) {
+ SDValue Ops[] = {PassThru, Mask, Base, Scale,
+ Index, Disp, Segment, Chain};
+ NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ } else {
+ SDValue Ops[] = {PassThru, Base, Scale, Index,
+ Disp, Segment, Mask, Chain};
+ NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ }
+ CurDAG->setNodeMemRefs(NewNode, {Mgt->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 0));
+ ReplaceUses(SDValue(Node, 1), SDValue(NewNode, 2));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case X86ISD::MSCATTER: {
+ auto *Sc = cast<X86MaskedScatterSDNode>(Node);
+ SDValue Value = Sc->getValue();
+ SDValue IndexOp = Sc->getIndex();
+ MVT IndexVT = IndexOp.getSimpleValueType();
+ MVT ValueVT = Value.getSimpleValueType();
+
+ // This is just to prevent crashes if the nodes are malformed somehow. We're
+ // otherwise only doing loose type checking in here based on what a type
+ // constraint would say, just like table-based isel.
+ if (!ValueVT.isVector())
+ break;
+
+ unsigned NumElts = ValueVT.getVectorNumElements();
+ MVT ValueSVT = ValueVT.getVectorElementType();
+
+ bool IsFP = ValueSVT.isFloatingPoint();
+ unsigned EltSize = ValueSVT.getSizeInBits();
+
+ unsigned Opc;
+ if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZ128mr : X86::VPSCATTERDDZ128mr;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZ256mr : X86::VPSCATTERDDZ256mr;
+ else if (IndexVT == MVT::v16i32 && NumElts == 16 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERDPSZmr : X86::VPSCATTERDDZmr;
+ else if (IndexVT == MVT::v4i32 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZ128mr : X86::VPSCATTERDQZ128mr;
+ else if (IndexVT == MVT::v4i32 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZ256mr : X86::VPSCATTERDQZ256mr;
+ else if (IndexVT == MVT::v8i32 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERDPDZmr : X86::VPSCATTERDQZmr;
+ else if (IndexVT == MVT::v2i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZ128mr : X86::VPSCATTERQDZ128mr;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZ256mr : X86::VPSCATTERQDZ256mr;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 32)
+ Opc = IsFP ? X86::VSCATTERQPSZmr : X86::VPSCATTERQDZmr;
+ else if (IndexVT == MVT::v2i64 && NumElts == 2 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZ128mr : X86::VPSCATTERQQZ128mr;
+ else if (IndexVT == MVT::v4i64 && NumElts == 4 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZ256mr : X86::VPSCATTERQQZ256mr;
+ else if (IndexVT == MVT::v8i64 && NumElts == 8 && EltSize == 64)
+ Opc = IsFP ? X86::VSCATTERQPDZmr : X86::VPSCATTERQQZmr;
+ else
+ break;
+
+ SDValue Base, Scale, Index, Disp, Segment;
+ if (!selectVectorAddr(Sc, Sc->getBasePtr(), IndexOp, Sc->getScale(),
+ Base, Scale, Index, Disp, Segment))
+ break;
+
+ SDValue Mask = Sc->getMask();
+ SDValue Chain = Sc->getChain();
+ // Scatter instructions have a mask output not in the ISD node.
+ SDVTList VTs = CurDAG->getVTList(Mask.getValueType(), MVT::Other);
+ SDValue Ops[] = {Base, Scale, Index, Disp, Segment, Mask, Value, Chain};
+
+ MachineSDNode *NewNode = CurDAG->getMachineNode(Opc, SDLoc(dl), VTs, Ops);
+ CurDAG->setNodeMemRefs(NewNode, {Sc->getMemOperand()});
+ ReplaceUses(SDValue(Node, 0), SDValue(NewNode, 1));
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
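Both of the cases above pick the machine opcode purely from the index type, the element count, the element width, and whether the data is floating point, then bind the memory operands through selectVectorAddr. As a reference for what the selected instructions compute per element, here is a scalar model of a masked gather and scatter, assuming float data, int indices, and unit scale (an illustrative sketch, not code from the patch):

    // Per-element semantics of a masked gather: active lanes load from
    // Base[Index[i]], inactive lanes keep the pass-through value.
    static void gatherModel(float *Dst, const float *Base, const int *Index,
                            const bool *Mask, const float *PassThru, int N) {
      for (int I = 0; I < N; ++I)
        Dst[I] = Mask[I] ? Base[Index[I]] : PassThru[I];
    }

    // Per-element semantics of a masked scatter: only active lanes store.
    static void scatterModel(float *Base, const float *Src, const int *Index,
                             const bool *Mask, int N) {
      for (int I = 0; I < N; ++I)
        if (Mask[I])
          Base[Index[I]] = Src[I];
    }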
+ case ISD::PREALLOCATED_SETUP: {
+ auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ auto CallId = MFI->getPreallocatedIdForCallSite(
+ cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+ SDValue Chain = Node->getOperand(0);
+ SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+ MachineSDNode *New = CurDAG->getMachineNode(
+ TargetOpcode::PREALLOCATED_SETUP, dl, MVT::Other, CallIdValue, Chain);
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Chain
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
+ case ISD::PREALLOCATED_ARG: {
+ auto *MFI = CurDAG->getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ auto CallId = MFI->getPreallocatedIdForCallSite(
+ cast<SrcValueSDNode>(Node->getOperand(1))->getValue());
+ SDValue Chain = Node->getOperand(0);
+ SDValue CallIdValue = CurDAG->getTargetConstant(CallId, dl, MVT::i32);
+ SDValue ArgIndex = Node->getOperand(2);
+ SDValue Ops[3];
+ Ops[0] = CallIdValue;
+ Ops[1] = ArgIndex;
+ Ops[2] = Chain;
+ MachineSDNode *New = CurDAG->getMachineNode(
+ TargetOpcode::PREALLOCATED_ARG, dl,
+ CurDAG->getVTList(TLI->getPointerTy(CurDAG->getDataLayout()),
+ MVT::Other),
+ Ops);
+ ReplaceUses(SDValue(Node, 0), SDValue(New, 0)); // Arg pointer
+ ReplaceUses(SDValue(Node, 1), SDValue(New, 1)); // Chain
+ CurDAG->RemoveDeadNode(Node);
+ return;
+ }
}
SelectCode(Node);
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 0f152968ddfd..450927aaf5cc 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -12,7 +12,8 @@
//===----------------------------------------------------------------------===//
#include "X86ISelLowering.h"
-#include "Utils/X86ShuffleDecode.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
+#include "X86.h"
#include "X86CallingConv.h"
#include "X86FrameLowering.h"
#include "X86InstrBuilder.h"
@@ -28,6 +29,7 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/CodeGen/IntrinsicLowering.h"
#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/MachineFunction.h"
@@ -37,7 +39,6 @@
#include "llvm/CodeGen/MachineRegisterInfo.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -75,13 +76,6 @@ static cl::opt<int> ExperimentalPrefLoopAlignment(
" of the loop header PC will be 0)."),
cl::Hidden);
-// Added in 10.0.
-static cl::opt<bool> EnableOldKNLABI(
- "x86-enable-old-knl-abi", cl::init(false),
- cl::desc("Enables passing v32i16 and v64i8 in 2 YMM registers instead of "
- "one ZMM register on AVX512F, but not AVX512BW targets."),
- cl::Hidden);
-
static cl::opt<bool> MulConstantOptimization(
"mul-constant-optimization", cl::init(true),
cl::desc("Replace 'mul x, Const' with more effective instructions like "
@@ -164,7 +158,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// If we don't have cmpxchg8b (meaning this is a 386/486), limit the atomic size
// to 32 bits so the AtomicExpandPass will expand it and we don't need cmpxchg8b.
- // FIXME: Should we be limitting the atomic size on other configs? Default is
+ // FIXME: Should we be limiting the atomic size on other configs? Default is
// 1024.
if (!Subtarget.hasCmpxchg8b())
setMaxAtomicSizeInBitsSupported(32);
@@ -190,12 +184,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::f64, MVT::f32, Expand);
// SETOEQ and SETUNE require checking two conditions.
- setCondCodeAction(ISD::SETOEQ, MVT::f32, Expand);
- setCondCodeAction(ISD::SETOEQ, MVT::f64, Expand);
- setCondCodeAction(ISD::SETOEQ, MVT::f80, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f32, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f64, Expand);
- setCondCodeAction(ISD::SETUNE, MVT::f80, Expand);
+ for (auto VT : {MVT::f32, MVT::f64, MVT::f80}) {
+ setCondCodeAction(ISD::SETOEQ, VT, Expand);
+ setCondCodeAction(ISD::SETUNE, VT, Expand);
+ }
// Integer absolute.
if (Subtarget.hasCMov()) {
@@ -206,10 +198,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// Funnel shifts.
for (auto ShiftOp : {ISD::FSHL, ISD::FSHR}) {
+ // For slow shld targets we only lower for code size.
+ LegalizeAction ShiftDoubleAction = Subtarget.isSHLDSlow() ? Custom : Legal;
+
+ setOperationAction(ShiftOp , MVT::i8 , Custom);
setOperationAction(ShiftOp , MVT::i16 , Custom);
- setOperationAction(ShiftOp , MVT::i32 , Custom);
+ setOperationAction(ShiftOp , MVT::i32 , ShiftDoubleAction);
if (Subtarget.is64Bit())
- setOperationAction(ShiftOp , MVT::i64 , Custom);
+ setOperationAction(ShiftOp , MVT::i64 , ShiftDoubleAction);
}
if (!Subtarget.useSoftFloat()) {
@@ -270,6 +266,16 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i32, Custom);
setOperationAction(ISD::FP_TO_UINT, MVT::i64, Custom);
setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::i64, Custom);
+
+ setOperationAction(ISD::LRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LRINT, MVT::f64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f32, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f64, Custom);
+
+ if (!Subtarget.is64Bit()) {
+ setOperationAction(ISD::LRINT, MVT::i64, Custom);
+ setOperationAction(ISD::LLRINT, MVT::i64, Custom);
+ }
}
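The ISD::LRINT/ISD::LLRINT nodes newly marked Custom here carry the semantics of the C lrint/llrint family: round to the nearest integer using the current rounding mode, then convert to a long/long long. A small reference snippet, purely to illustrate the node semantics rather than the X86 lowering itself:

    #include <cmath>

    // ISD::LRINT / ISD::LLRINT: round using the current FP rounding mode
    // (round-to-nearest-even by default), then convert to an integer type.
    long lrintF32(float X) { return std::lrint(X); }
    long long llrintF64(double X) { return std::llrint(X); }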
// Handle address space casts between mixed sized pointers.
@@ -347,34 +353,28 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationPromotedToType(ISD::CTLZ , MVT::i8 , MVT::i32);
setOperationPromotedToType(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , MVT::i32);
} else {
- setOperationAction(ISD::CTLZ , MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ , MVT::i32 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i8 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i16 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i32 , Custom);
- if (Subtarget.is64Bit()) {
- setOperationAction(ISD::CTLZ , MVT::i64 , Custom);
- setOperationAction(ISD::CTLZ_ZERO_UNDEF, MVT::i64, Custom);
+ for (auto VT : {MVT::i8, MVT::i16, MVT::i32, MVT::i64}) {
+ if (VT == MVT::i64 && !Subtarget.is64Bit())
+ continue;
+ setOperationAction(ISD::CTLZ , VT, Custom);
+ setOperationAction(ISD::CTLZ_ZERO_UNDEF, VT, Custom);
}
}
- // Special handling for half-precision floating point conversions.
- // If we don't have F16C support, then lower half float conversions
- // into library calls.
- if (Subtarget.useSoftFloat() || !Subtarget.hasF16C()) {
- setOperationAction(ISD::FP16_TO_FP, MVT::f32, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f32, Expand);
+ for (auto Op : {ISD::FP16_TO_FP, ISD::STRICT_FP16_TO_FP, ISD::FP_TO_FP16,
+ ISD::STRICT_FP_TO_FP16}) {
+ // Special handling for half-precision floating point conversions.
+ // If we don't have F16C support, then lower half float conversions
+ // into library calls.
+ setOperationAction(
+ Op, MVT::f32,
+ (!Subtarget.useSoftFloat() && Subtarget.hasF16C()) ? Custom : Expand);
+ // There's never any support for operations beyond MVT::f32.
+ setOperationAction(Op, MVT::f64, Expand);
+ setOperationAction(Op, MVT::f80, Expand);
+ setOperationAction(Op, MVT::f128, Expand);
}
- // There's never any support for operations beyond MVT::f32.
- setOperationAction(ISD::FP16_TO_FP, MVT::f64, Expand);
- setOperationAction(ISD::FP16_TO_FP, MVT::f80, Expand);
- setOperationAction(ISD::FP16_TO_FP, MVT::f128, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f64, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f80, Expand);
- setOperationAction(ISD::FP_TO_FP16, MVT::f128, Expand);
-
setLoadExtAction(ISD::EXTLOAD, MVT::f32, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f64, MVT::f16, Expand);
setLoadExtAction(ISD::EXTLOAD, MVT::f80, MVT::f16, Expand);
@@ -542,7 +542,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FGETSIGN, MVT::i64, Custom);
setOperationAction(ISD::FGETSIGN, MVT::i32, Custom);
- } else if (!useSoftFloat() && X86ScalarSSEf32 && (UseX87 || Is64Bit)) {
+ } else if (!Subtarget.useSoftFloat() && X86ScalarSSEf32 &&
+ (UseX87 || Is64Bit)) {
// Use SSE for f32, x87 for f64.
// Set up the FP register classes.
addRegisterClass(MVT::f32, &X86::FR32RegClass);
@@ -663,8 +664,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FMA, MVT::f80, Expand);
setOperationAction(ISD::LROUND, MVT::f80, Expand);
setOperationAction(ISD::LLROUND, MVT::f80, Expand);
- setOperationAction(ISD::LRINT, MVT::f80, Expand);
- setOperationAction(ISD::LLRINT, MVT::f80, Expand);
+ setOperationAction(ISD::LRINT, MVT::f80, Custom);
+ setOperationAction(ISD::LLRINT, MVT::f80, Custom);
// Handle constrained floating-point operations of scalar.
setOperationAction(ISD::STRICT_FADD , MVT::f80, Legal);
@@ -1038,8 +1039,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v4i32, Custom);
setOperationAction(ISD::ROTL, MVT::v8i16, Custom);
- // With AVX512, expanding (and promoting the shifts) is better.
- if (!Subtarget.hasAVX512())
+ // With 512-bit registers or AVX512VL+BW, expanding (and promoting the
+ // shifts) is better.
+ if (!Subtarget.useAVX512Regs() &&
+ !(Subtarget.hasBWI() && Subtarget.hasVLX()))
setOperationAction(ISD::ROTL, MVT::v16i8, Custom);
setOperationAction(ISD::STRICT_FSQRT, MVT::v2f64, Legal);
@@ -1078,6 +1081,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, RoundedTy, Legal);
setOperationAction(ISD::FNEARBYINT, RoundedTy, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, RoundedTy, Legal);
+
+ setOperationAction(ISD::FROUND, RoundedTy, Custom);
}
setOperationAction(ISD::SMAX, MVT::v16i8, Legal);
@@ -1170,6 +1175,9 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_FRINT, VT, Legal);
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
+
+ setOperationAction(ISD::FROUND, VT, Custom);
+
setOperationAction(ISD::FNEG, VT, Custom);
setOperationAction(ISD::FABS, VT, Custom);
setOperationAction(ISD::FCOPYSIGN, VT, Custom);
@@ -1221,7 +1229,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ROTL, MVT::v16i16, Custom);
// With BWI, expanding (and promoting the shifts) is better.
- if (!Subtarget.hasBWI())
+ if (!Subtarget.useBWIRegs())
setOperationAction(ISD::ROTL, MVT::v32i8, Custom);
setOperationAction(ISD::SELECT, MVT::v4f64, Custom);
@@ -1412,19 +1420,23 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::ANY_EXTEND, VT, Custom);
}
- for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
+ for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::ADD, VT, Custom);
setOperationAction(ISD::SUB, VT, Custom);
setOperationAction(ISD::MUL, VT, Custom);
+ setOperationAction(ISD::UADDSAT, VT, Custom);
+ setOperationAction(ISD::SADDSAT, VT, Custom);
+ setOperationAction(ISD::USUBSAT, VT, Custom);
+ setOperationAction(ISD::SSUBSAT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Expand);
+ }
+
+ for (auto VT : { MVT::v2i1, MVT::v4i1, MVT::v8i1, MVT::v16i1 }) {
setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::TRUNCATE, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Custom);
- setOperationAction(ISD::SADDSAT, VT, Custom);
- setOperationAction(ISD::USUBSAT, VT, Custom);
- setOperationAction(ISD::SSUBSAT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
@@ -1432,7 +1444,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Expand);
}
for (auto VT : { MVT::v1i1, MVT::v2i1, MVT::v4i1, MVT::v8i1 })
@@ -1443,10 +1454,14 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// elements. 512-bits can be disabled based on prefer-vector-width and
// required-vector-width function attributes.
if (!Subtarget.useSoftFloat() && Subtarget.useAVX512Regs()) {
+ bool HasBWI = Subtarget.hasBWI();
+
addRegisterClass(MVT::v16i32, &X86::VR512RegClass);
addRegisterClass(MVT::v16f32, &X86::VR512RegClass);
addRegisterClass(MVT::v8i64, &X86::VR512RegClass);
addRegisterClass(MVT::v8f64, &X86::VR512RegClass);
+ addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
+ addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
setLoadExtAction(ExtType, MVT::v16i32, MVT::v16i8, Legal);
@@ -1454,6 +1469,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i8, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i16, Legal);
setLoadExtAction(ExtType, MVT::v8i64, MVT::v8i32, Legal);
+ if (HasBWI)
+ setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
}
for (MVT VT : { MVT::v16f32, MVT::v8f64 }) {
@@ -1497,6 +1514,8 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v8i64, MVT::v8i32, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v16i32, MVT::v16i16, Legal);
+ if (HasBWI)
+ setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
// With 512-bit vectors and no VLX, we prefer to widen MLOAD/MSTORE
// to 512-bit rather than use the AVX2 instructions so that we can use
@@ -1509,19 +1528,26 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
}
}
- setOperationAction(ISD::TRUNCATE, MVT::v8i32, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v16i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v8i32, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i16, Legal);
+ setOperationAction(ISD::TRUNCATE, MVT::v32i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::TRUNCATE, MVT::v16i64, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v8i64, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v16i32, Custom);
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v8i64, Custom);
- // Need to custom widen this if we don't have AVX512BW.
- setOperationAction(ISD::ANY_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v8i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v8i8, Custom);
+ if (HasBWI) {
+ // Extends from v64i1 masks to 512-bit vectors.
+ setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
+ setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
+ }
for (auto VT : { MVT::v16f32, MVT::v8f64 }) {
setOperationAction(ISD::FFLOOR, VT, Legal);
@@ -1535,47 +1561,69 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::FNEARBYINT, VT, Legal);
setOperationAction(ISD::STRICT_FNEARBYINT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::FROUND, VT, Custom);
}
- // Without BWI we need to use custom lowering to handle MVT::v64i8 input.
- for (auto VT : {MVT::v16i32, MVT::v8i64, MVT::v64i8}) {
+ for (auto VT : {MVT::v32i16, MVT::v16i32, MVT::v8i64}) {
setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, VT, Custom);
setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8f64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v8i64, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16f32, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v16i32, Custom);
+ setOperationAction(ISD::ADD, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::ADD, MVT::v64i8, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SUB, MVT::v64i8, HasBWI ? Legal : Custom);
+
+ setOperationAction(ISD::MUL, MVT::v8i64, Custom);
+ setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MUL, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MUL, MVT::v64i8, Custom);
- setOperationAction(ISD::MUL, MVT::v8i64, Custom);
- setOperationAction(ISD::MUL, MVT::v16i32, Legal);
+ setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::MULHS, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHU, MVT::v32i16, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
+ setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v16i32, Custom);
- setOperationAction(ISD::MULHS, MVT::v16i32, Custom);
+ setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
+ setOperationAction(ISD::SRL, VT, Custom);
+ setOperationAction(ISD::SHL, VT, Custom);
+ setOperationAction(ISD::SRA, VT, Custom);
+ setOperationAction(ISD::SETCC, VT, Custom);
+
+ // The condition codes aren't legal in SSE/AVX and under AVX512 we use
+ // setcc all the way to isel and prefer SETGT in some isel patterns.
+ setCondCodeAction(ISD::SETLT, VT, Custom);
+ setCondCodeAction(ISD::SETLE, VT, Custom);
+ }
for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
setOperationAction(ISD::SMIN, VT, Legal);
setOperationAction(ISD::UMIN, VT, Legal);
setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
setOperationAction(ISD::CTPOP, VT, Custom);
setOperationAction(ISD::ROTL, VT, Custom);
setOperationAction(ISD::ROTR, VT, Custom);
- setOperationAction(ISD::SETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCC, VT, Custom);
setOperationAction(ISD::STRICT_FSETCCS, VT, Custom);
- setOperationAction(ISD::SELECT, VT, Custom);
+ }
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::ABS, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::CTPOP, VT, Subtarget.hasBITALG() ? Legal : Custom);
+ setOperationAction(ISD::CTLZ, VT, Custom);
+ setOperationAction(ISD::SMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMAX, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UMIN, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::UADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SADDSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::USUBSAT, VT, HasBWI ? Legal : Custom);
+ setOperationAction(ISD::SSUBSAT, VT, HasBWI ? Legal : Custom);
}
if (Subtarget.hasDQI()) {
@@ -1610,36 +1658,42 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
MVT::v8f32, MVT::v4f64 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Legal);
+ for (auto VT : { MVT::v64i8, MVT::v32i16, MVT::v16i32, MVT::v8i64,
+ MVT::v16f32, MVT::v8f64 }) {
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
+ setOperationAction(ISD::SELECT, VT, Custom);
+ setOperationAction(ISD::VSELECT, VT, Custom);
+ setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
+ setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
+ setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
+ setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
+ }
+
for (auto VT : { MVT::v16i32, MVT::v8i64, MVT::v16f32, MVT::v8f64 }) {
- setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, VT, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, VT, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, VT, Legal);
setOperationAction(ISD::MLOAD, VT, Legal);
setOperationAction(ISD::MSTORE, VT, Legal);
setOperationAction(ISD::MGATHER, VT, Custom);
setOperationAction(ISD::MSCATTER, VT, Custom);
}
- if (!Subtarget.hasBWI()) {
- // Need to custom split v32i16/v64i8 bitcasts.
- setOperationAction(ISD::BITCAST, MVT::v32i16, Custom);
- setOperationAction(ISD::BITCAST, MVT::v64i8, Custom);
-
- // Better to split these into two 256-bit ops.
- setOperationAction(ISD::BITREVERSE, MVT::v8i64, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v16i32, Custom);
+ if (HasBWI) {
+ for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
+ setOperationAction(ISD::MLOAD, VT, Legal);
+ setOperationAction(ISD::MSTORE, VT, Legal);
+ }
+ } else {
+ setOperationAction(ISD::STORE, MVT::v32i16, Custom);
+ setOperationAction(ISD::STORE, MVT::v64i8, Custom);
}
if (Subtarget.hasVBMI2()) {
- for (auto VT : { MVT::v16i32, MVT::v8i64 }) {
+ for (auto VT : { MVT::v32i16, MVT::v16i32, MVT::v8i64 }) {
setOperationAction(ISD::FSHL, VT, Custom);
setOperationAction(ISD::FSHR, VT, Custom);
}
}
- }// has AVX-512
+ }// useAVX512Regs
// This block controls legalization for operations that don't have
// pre-AVX512 equivalents. Without VLX we use 512-bit operations for
@@ -1667,6 +1721,19 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::STRICT_UINT_TO_FP, MVT::v4i32,
Subtarget.hasVLX() ? Legal : Custom);
+ if (Subtarget.hasDQI()) {
+ // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
+ // v2f32 UINT_TO_FP is already custom under SSE2.
+ assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
+ isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
+ "Unexpected operation action!");
+ // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
+ setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
+ setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
+ }
+
for (auto VT : { MVT::v2i64, MVT::v4i64 }) {
setOperationAction(ISD::SMAX, VT, Legal);
setOperationAction(ISD::UMAX, VT, Legal);
@@ -1746,12 +1813,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SELECT, VT, Custom);
setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
setOperationAction(ISD::VECTOR_SHUFFLE, VT, Custom);
+ setOperationAction(ISD::CONCAT_VECTORS, VT, Custom);
+ setOperationAction(ISD::INSERT_SUBVECTOR, VT, Custom);
}
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i1, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i1, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i1, Custom);
for (auto VT : { MVT::v16i1, MVT::v32i1 })
setOperationAction(ISD::EXTRACT_SUBVECTOR, VT, Custom);
@@ -1759,93 +1824,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setOperationAction(ISD::SIGN_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ZERO_EXTEND, MVT::v32i8, Custom);
setOperationAction(ISD::ANY_EXTEND, MVT::v32i8, Custom);
- }
-
- // This block controls legalization for v32i16 and v64i8. 512-bits can be
- // disabled based on prefer-vector-width and required-vector-width function
- // attributes.
- if (!Subtarget.useSoftFloat() && Subtarget.useBWIRegs()) {
- addRegisterClass(MVT::v32i16, &X86::VR512RegClass);
- addRegisterClass(MVT::v64i8, &X86::VR512RegClass);
-
- // Extends from v64i1 masks to 512-bit vectors.
- setOperationAction(ISD::SIGN_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v64i8, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v64i8, Custom);
-
- setOperationAction(ISD::MUL, MVT::v32i16, Legal);
- setOperationAction(ISD::MUL, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHS, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHU, MVT::v32i16, Legal);
- setOperationAction(ISD::MULHS, MVT::v64i8, Custom);
- setOperationAction(ISD::MULHU, MVT::v64i8, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v32i16, Custom);
- setOperationAction(ISD::CONCAT_VECTORS, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v32i16, Legal);
- setOperationAction(ISD::INSERT_SUBVECTOR, MVT::v64i8, Legal);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::EXTRACT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v32i16, Custom);
- setOperationAction(ISD::SCALAR_TO_VECTOR, MVT::v64i8, Custom);
- setOperationAction(ISD::SIGN_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::ANY_EXTEND, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v32i16, Custom);
- setOperationAction(ISD::VECTOR_SHUFFLE, MVT::v64i8, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v32i16, Custom);
- setOperationAction(ISD::INSERT_VECTOR_ELT, MVT::v64i8, Custom);
- setOperationAction(ISD::TRUNCATE, MVT::v32i8, Custom);
- setOperationAction(ISD::BITREVERSE, MVT::v64i8, Custom);
-
- setOperationAction(ISD::SIGN_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
- setOperationAction(ISD::ZERO_EXTEND_VECTOR_INREG, MVT::v32i16, Custom);
-
- setTruncStoreAction(MVT::v32i16, MVT::v32i8, Legal);
-
- for (auto VT : { MVT::v64i8, MVT::v32i16 }) {
- setOperationAction(ISD::BUILD_VECTOR, VT, Custom);
- setOperationAction(ISD::VSELECT, VT, Custom);
- setOperationAction(ISD::ABS, VT, Legal);
- setOperationAction(ISD::SRL, VT, Custom);
- setOperationAction(ISD::SHL, VT, Custom);
- setOperationAction(ISD::SRA, VT, Custom);
- setOperationAction(ISD::MLOAD, VT, Legal);
- setOperationAction(ISD::MSTORE, VT, Legal);
- setOperationAction(ISD::CTPOP, VT, Custom);
- setOperationAction(ISD::CTLZ, VT, Custom);
- setOperationAction(ISD::SMAX, VT, Legal);
- setOperationAction(ISD::UMAX, VT, Legal);
- setOperationAction(ISD::SMIN, VT, Legal);
- setOperationAction(ISD::UMIN, VT, Legal);
- setOperationAction(ISD::SETCC, VT, Custom);
- setOperationAction(ISD::UADDSAT, VT, Legal);
- setOperationAction(ISD::SADDSAT, VT, Legal);
- setOperationAction(ISD::USUBSAT, VT, Legal);
- setOperationAction(ISD::SSUBSAT, VT, Legal);
- setOperationAction(ISD::SELECT, VT, Custom);
-
- // The condition codes aren't legal in SSE/AVX and under AVX512 we use
- // setcc all the way to isel and prefer SETGT in some isel patterns.
- setCondCodeAction(ISD::SETLT, VT, Custom);
- setCondCodeAction(ISD::SETLE, VT, Custom);
- }
-
- for (auto ExtType : {ISD::ZEXTLOAD, ISD::SEXTLOAD}) {
- setLoadExtAction(ExtType, MVT::v32i16, MVT::v32i8, Legal);
- }
-
- if (Subtarget.hasBITALG()) {
- for (auto VT : { MVT::v64i8, MVT::v32i16 })
- setOperationAction(ISD::CTPOP, VT, Legal);
- }
- if (Subtarget.hasVBMI2()) {
- setOperationAction(ISD::FSHL, MVT::v32i16, Custom);
- setOperationAction(ISD::FSHR, MVT::v32i16, Custom);
- }
- }
-
- if (!Subtarget.useSoftFloat() && Subtarget.hasBWI()) {
for (auto VT : { MVT::v32i8, MVT::v16i8, MVT::v16i16, MVT::v8i16 }) {
setOperationAction(ISD::MLOAD, VT, Subtarget.hasVLX() ? Legal : Custom);
setOperationAction(ISD::MSTORE, VT, Subtarget.hasVLX() ? Legal : Custom);
@@ -1874,19 +1853,6 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTruncStoreAction(MVT::v4i32, MVT::v4i8, Legal);
setTruncStoreAction(MVT::v4i32, MVT::v4i16, Legal);
- if (Subtarget.hasDQI()) {
- // Fast v2f32 SINT_TO_FP( v2i64 ) custom conversion.
- // v2f32 UINT_TO_FP is already custom under SSE2.
- assert(isOperationCustom(ISD::UINT_TO_FP, MVT::v2f32) &&
- isOperationCustom(ISD::STRICT_UINT_TO_FP, MVT::v2f32) &&
- "Unexpected operation action!");
- // v2i64 FP_TO_S/UINT(v2f32) custom conversion.
- setOperationAction(ISD::FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::FP_TO_UINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_SINT, MVT::v2f32, Custom);
- setOperationAction(ISD::STRICT_FP_TO_UINT, MVT::v2f32, Custom);
- }
-
if (Subtarget.hasBWI()) {
setTruncStoreAction(MVT::v16i16, MVT::v16i8, Legal);
setTruncStoreAction(MVT::v8i16, MVT::v8i8, Legal);
@@ -1983,6 +1949,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
// We have target-specific dag combine patterns for the following nodes:
setTargetDAGCombine(ISD::VECTOR_SHUFFLE);
setTargetDAGCombine(ISD::SCALAR_TO_VECTOR);
+ setTargetDAGCombine(ISD::INSERT_VECTOR_ELT);
setTargetDAGCombine(ISD::EXTRACT_VECTOR_ELT);
setTargetDAGCombine(ISD::CONCAT_VECTORS);
setTargetDAGCombine(ISD::INSERT_SUBVECTOR);
@@ -2000,6 +1967,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::FSUB);
setTargetDAGCombine(ISD::FNEG);
setTargetDAGCombine(ISD::FMA);
+ setTargetDAGCombine(ISD::STRICT_FMA);
setTargetDAGCombine(ISD::FMINNUM);
setTargetDAGCombine(ISD::FMAXNUM);
setTargetDAGCombine(ISD::SUB);
@@ -2024,6 +1992,10 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
setTargetDAGCombine(ISD::XOR);
setTargetDAGCombine(ISD::MSCATTER);
setTargetDAGCombine(ISD::MGATHER);
+ setTargetDAGCombine(ISD::FP16_TO_FP);
+ setTargetDAGCombine(ISD::FP_EXTEND);
+ setTargetDAGCombine(ISD::STRICT_FP_EXTEND);
+ setTargetDAGCombine(ISD::FP_ROUND);
computeRegisterProperties(Subtarget.getRegisterInfo());
@@ -2075,7 +2047,8 @@ SDValue X86TargetLowering::emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val,
TargetLoweringBase::LegalizeTypeAction
X86TargetLowering::getPreferredVectorAction(MVT VT) const {
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
+ if ((VT == MVT::v32i1 || VT == MVT::v64i1) && Subtarget.hasAVX512() &&
+ !Subtarget.hasBWI())
return TypeSplitVector;
if (VT.getVectorNumElements() != 1 &&
@@ -2085,51 +2058,73 @@ X86TargetLowering::getPreferredVectorAction(MVT VT) const {
return TargetLoweringBase::getPreferredVectorAction(VT);
}
+static std::pair<MVT, unsigned>
+handleMaskRegisterForCallingConv(unsigned NumElts, CallingConv::ID CC,
+ const X86Subtarget &Subtarget) {
+ // v2i1/v4i1/v8i1/v16i1 all pass in xmm registers unless the calling
+ // convention is one that uses k registers.
+ if (NumElts == 2)
+ return {MVT::v2i64, 1};
+ if (NumElts == 4)
+ return {MVT::v4i32, 1};
+ if (NumElts == 8 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v8i16, 1};
+ if (NumElts == 16 && CC != CallingConv::X86_RegCall &&
+ CC != CallingConv::Intel_OCL_BI)
+ return {MVT::v16i8, 1};
+ // v32i1 passes in ymm unless we have BWI and the calling convention is
+ // regcall.
+ if (NumElts == 32 && (!Subtarget.hasBWI() || CC != CallingConv::X86_RegCall))
+ return {MVT::v32i8, 1};
+ // Split v64i1 vectors if we don't have v64i8 available.
+ if (NumElts == 64 && Subtarget.hasBWI() && CC != CallingConv::X86_RegCall) {
+ if (Subtarget.useAVX512Regs())
+ return {MVT::v64i8, 1};
+ return {MVT::v32i8, 2};
+ }
+
+ // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
+ if (!isPowerOf2_32(NumElts) || (NumElts == 64 && !Subtarget.hasBWI()) ||
+ NumElts > 64)
+ return {MVT::i8, NumElts};
+
+ return {MVT::INVALID_SIMPLE_VALUE_TYPE, 0};
+}
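Spot checks of the mapping above, read straight off its branches and assuming the C calling convention unless noted (listed here only as a worked example):

    // NumElts = 8,  with or without BWI       -> {MVT::v8i16, 1}
    // NumElts = 32                            -> {MVT::v32i8, 1}
    // NumElts = 64, no BWI                    -> {MVT::i8, 64}   (scalarized)
    // NumElts = 64, BWI, useAVX512Regs()      -> {MVT::v64i8, 1}
    // NumElts = 64, BWI, no 512-bit registers -> {MVT::v32i8, 2}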
+
MVT X86TargetLowering::getRegisterTypeForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return MVT::v32i8;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return MVT::i8;
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return MVT::v32i1;
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return MVT::v16i32;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return RegisterVT;
+ }
+
return TargetLowering::getRegisterTypeForCallingConv(Context, CC, VT);
}
unsigned X86TargetLowering::getNumRegistersForCallingConv(LLVMContext &Context,
CallingConv::ID CC,
EVT VT) const {
- // v32i1 vectors should be promoted to v32i8 to match avx2.
- if (VT == MVT::v32i1 && Subtarget.hasAVX512() && !Subtarget.hasBWI())
- return 1;
- // Break wide or odd vXi1 vectors into scalars to match avx2 behavior.
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
- Subtarget.hasAVX512() &&
- (!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI())))
- return VT.getVectorNumElements();
- // Split v64i1 vectors if we don't have v64i8 available.
- if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
- CC != CallingConv::X86_RegCall)
- return 2;
- // FIXME: Should we just make these types legal and custom split operations?
- if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !EnableOldKNLABI &&
- Subtarget.useAVX512Regs() && !Subtarget.hasBWI())
- return 1;
+ Subtarget.hasAVX512()) {
+ unsigned NumElts = VT.getVectorNumElements();
+
+ MVT RegisterVT;
+ unsigned NumRegisters;
+ std::tie(RegisterVT, NumRegisters) =
+ handleMaskRegisterForCallingConv(NumElts, CC, Subtarget);
+ if (RegisterVT != MVT::INVALID_SIMPLE_VALUE_TYPE)
+ return NumRegisters;
+ }
+
return TargetLowering::getNumRegistersForCallingConv(Context, CC, VT);
}
@@ -2140,8 +2135,8 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
Subtarget.hasAVX512() &&
(!isPowerOf2_32(VT.getVectorNumElements()) ||
- (VT.getVectorNumElements() > 16 && !Subtarget.hasBWI()) ||
- (VT.getVectorNumElements() > 64 && Subtarget.hasBWI()))) {
+ (VT.getVectorNumElements() == 64 && !Subtarget.hasBWI()) ||
+ VT.getVectorNumElements() > 64)) {
RegisterVT = MVT::i8;
IntermediateVT = MVT::i1;
NumIntermediates = VT.getVectorNumElements();
@@ -2151,7 +2146,7 @@ unsigned X86TargetLowering::getVectorTypeBreakdownForCallingConv(
// Split v64i1 vectors if we don't have v64i8 available.
if (VT == MVT::v64i1 && Subtarget.hasBWI() && !Subtarget.useAVX512Regs() &&
CC != CallingConv::X86_RegCall) {
- RegisterVT = MVT::v32i1;
+ RegisterVT = MVT::v32i8;
IntermediateVT = MVT::v32i1;
NumIntermediates = 2;
return 2;
@@ -2194,20 +2189,20 @@ EVT X86TargetLowering::getSetCCResultType(const DataLayout &DL,
/// Helper for getByValTypeAlignment to determine
/// the desired ByVal argument alignment.
-static void getMaxByValAlign(Type *Ty, unsigned &MaxAlign) {
+static void getMaxByValAlign(Type *Ty, Align &MaxAlign) {
if (MaxAlign == 16)
return;
if (VectorType *VTy = dyn_cast<VectorType>(Ty)) {
- if (VTy->getBitWidth() == 128)
- MaxAlign = 16;
+ if (VTy->getPrimitiveSizeInBits().getFixedSize() == 128)
+ MaxAlign = Align(16);
} else if (ArrayType *ATy = dyn_cast<ArrayType>(Ty)) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(ATy->getElementType(), EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
} else if (StructType *STy = dyn_cast<StructType>(Ty)) {
for (auto *EltTy : STy->elements()) {
- unsigned EltAlign = 0;
+ Align EltAlign;
getMaxByValAlign(EltTy, EltAlign);
if (EltAlign > MaxAlign)
MaxAlign = EltAlign;
@@ -2225,46 +2220,34 @@ unsigned X86TargetLowering::getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const {
if (Subtarget.is64Bit()) {
// Max of 8 and alignment of type.
- unsigned TyAlign = DL.getABITypeAlignment(Ty);
+ Align TyAlign = DL.getABITypeAlign(Ty);
if (TyAlign > 8)
- return TyAlign;
+ return TyAlign.value();
return 8;
}
- unsigned Align = 4;
+ Align Alignment(4);
if (Subtarget.hasSSE1())
- getMaxByValAlign(Ty, Align);
- return Align;
-}
-
-/// Returns the target specific optimal type for load
-/// and store operations as a result of memset, memcpy, and memmove
-/// lowering. If DstAlign is zero that means it's safe to destination
-/// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
-/// means there isn't a need to check it against alignment requirement,
-/// probably because the source does not need to be loaded. If 'IsMemset' is
-/// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
-/// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
-/// source is constant so it does not need to be loaded.
+ getMaxByValAlign(Ty, Alignment);
+ return Alignment.value();
+}
+
/// It returns EVT::Other if the type should be determined using generic
/// target-independent logic.
/// For vector ops we check that the overall size isn't larger than our
/// preferred vector width.
EVT X86TargetLowering::getOptimalMemOpType(
- uint64_t Size, unsigned DstAlign, unsigned SrcAlign, bool IsMemset,
- bool ZeroMemset, bool MemcpyStrSrc,
- const AttributeList &FuncAttributes) const {
+ const MemOp &Op, const AttributeList &FuncAttributes) const {
if (!FuncAttributes.hasFnAttribute(Attribute::NoImplicitFloat)) {
- if (Size >= 16 && (!Subtarget.isUnalignedMem16Slow() ||
- ((DstAlign == 0 || DstAlign >= 16) &&
- (SrcAlign == 0 || SrcAlign >= 16)))) {
+ if (Op.size() >= 16 &&
+ (!Subtarget.isUnalignedMem16Slow() || Op.isAligned(Align(16)))) {
// FIXME: Check if unaligned 64-byte accesses are slow.
- if (Size >= 64 && Subtarget.hasAVX512() &&
+ if (Op.size() >= 64 && Subtarget.hasAVX512() &&
(Subtarget.getPreferVectorWidth() >= 512)) {
return Subtarget.hasBWI() ? MVT::v64i8 : MVT::v16i32;
}
// FIXME: Check if unaligned 32-byte accesses are slow.
- if (Size >= 32 && Subtarget.hasAVX() &&
+ if (Op.size() >= 32 && Subtarget.hasAVX() &&
(Subtarget.getPreferVectorWidth() >= 256)) {
// Although this isn't a well-supported type for AVX1, we'll let
// legalization and shuffle lowering produce the optimal codegen. If we
@@ -2280,8 +2263,8 @@ EVT X86TargetLowering::getOptimalMemOpType(
if (Subtarget.hasSSE1() && (Subtarget.is64Bit() || Subtarget.hasX87()) &&
(Subtarget.getPreferVectorWidth() >= 128))
return MVT::v4f32;
- } else if ((!IsMemset || ZeroMemset) && !MemcpyStrSrc && Size >= 8 &&
- !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
+ } else if (((Op.isMemcpy() && !Op.isMemcpyStrSrc()) || Op.isZeroMemset()) &&
+ Op.size() >= 8 && !Subtarget.is64Bit() && Subtarget.hasSSE2()) {
// Do not use f64 to lower memcpy if source is string constant. It's
// better to use i32 to avoid the loads.
// Also, do not use f64 to lower memset unless this is a memset of zeros.
@@ -2294,7 +2277,7 @@ EVT X86TargetLowering::getOptimalMemOpType(
// This is a compromise. If we reach here, unaligned accesses may be slow on
// this target. However, creating smaller, aligned accesses could be even
// slower and would certainly be a lot more code.
- if (Subtarget.is64Bit() && Size >= 8)
+ if (Subtarget.is64Bit() && Op.size() >= 8)
return MVT::i64;
return MVT::i32;
}
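Worked outcomes of the MemOp-based logic above, assuming the function is not marked NoImplicitFloat and the access is either 16-byte aligned or unaligned 16-byte access is fast:

    // 64+ byte op, AVX512 + BWI, prefer-vector-width >= 512      -> MVT::v64i8
    // 64+ byte op, AVX512 without BWI, width >= 512              -> MVT::v16i32
    // 16+ byte op, SSE1-only target (x87 or 64-bit), width >= 128 -> MVT::v4f32
    // final fallback: 8+ bytes on a 64-bit target -> MVT::i64, otherwise MVT::i32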
@@ -2611,7 +2594,7 @@ static SDValue lowerMasksToReg(const SDValue &ValArg, const EVT &ValLoc,
/// Breaks v64i1 value into two registers and adds the new node to the DAG
static void Passv64i1ArgInRegs(
const SDLoc &Dl, SelectionDAG &DAG, SDValue &Arg,
- SmallVectorImpl<std::pair<unsigned, SDValue>> &RegsToPass, CCValAssign &VA,
+ SmallVectorImpl<std::pair<Register, SDValue>> &RegsToPass, CCValAssign &VA,
CCValAssign &NextVA, const X86Subtarget &Subtarget) {
assert(Subtarget.hasBWI() && "Expected AVX512BW target!");
assert(Subtarget.is32Bit() && "Expecting 32 bit target");
@@ -2656,14 +2639,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
CCState CCInfo(CallConv, isVarArg, MF, RVLocs, *DAG.getContext());
CCInfo.AnalyzeReturn(Outs, RetCC_X86);
- SDValue Flag;
- SmallVector<SDValue, 6> RetOps;
- RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
- // Operand #1 = Bytes To Pop
- RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
- MVT::i32));
-
- // Copy the result values into the output registers.
+ SmallVector<std::pair<Register, SDValue>, 4> RetVals;
for (unsigned I = 0, OutsIndex = 0, E = RVLocs.size(); I != E;
++I, ++OutsIndex) {
CCValAssign &VA = RVLocs[I];
@@ -2715,7 +2691,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// change the value to the FP stack register class.
if (isScalarFPTypeInSSEReg(VA.getValVT()))
ValToCopy = DAG.getNode(ISD::FP_EXTEND, dl, MVT::f80, ValToCopy);
- RetOps.push_back(ValToCopy);
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
// Don't emit a copytoreg.
continue;
}
@@ -2736,31 +2712,39 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
}
}
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
-
if (VA.needsCustom()) {
assert(VA.getValVT() == MVT::v64i1 &&
"Currently the only custom case is when we split v64i1 to 2 regs");
- Passv64i1ArgInRegs(dl, DAG, ValToCopy, RegsToPass, VA, RVLocs[++I],
+ Passv64i1ArgInRegs(dl, DAG, ValToCopy, RetVals, VA, RVLocs[++I],
Subtarget);
- assert(2 == RegsToPass.size() &&
- "Expecting two registers after Pass64BitArgInRegs");
-
// Add the second register to the CalleeSaveDisableRegs list.
if (ShouldDisableCalleeSavedRegister)
MF.getRegInfo().disableCalleeSavedRegister(RVLocs[I].getLocReg());
} else {
- RegsToPass.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
+ RetVals.push_back(std::make_pair(VA.getLocReg(), ValToCopy));
}
+ }
- // Add nodes to the DAG and add the values into the RetOps list
- for (auto &Reg : RegsToPass) {
- Chain = DAG.getCopyToReg(Chain, dl, Reg.first, Reg.second, Flag);
- Flag = Chain.getValue(1);
- RetOps.push_back(DAG.getRegister(Reg.first, Reg.second.getValueType()));
+ SDValue Flag;
+ SmallVector<SDValue, 6> RetOps;
+ RetOps.push_back(Chain); // Operand #0 = Chain (updated below)
+ // Operand #1 = Bytes To Pop
+ RetOps.push_back(DAG.getTargetConstant(FuncInfo->getBytesToPopOnReturn(), dl,
+ MVT::i32));
+
+ // Copy the result values into the output registers.
+ for (auto &RetVal : RetVals) {
+ if (RetVal.first == X86::FP0 || RetVal.first == X86::FP1) {
+ RetOps.push_back(RetVal.second);
+ continue; // Don't emit a copytoreg.
}
+
+ Chain = DAG.getCopyToReg(Chain, dl, RetVal.first, RetVal.second, Flag);
+ Flag = Chain.getValue(1);
+ RetOps.push_back(
+ DAG.getRegister(RetVal.first, RetVal.second.getValueType()));
}
// Swift calling convention does not require we copy the sret argument
@@ -2775,7 +2759,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// may not have an explicit sret argument. If FuncInfo.CanLowerReturn is
// false, then an sret argument may be implicitly inserted in the SelDAG. In
// either case FuncInfo->setSRetReturnReg() will have been called.
- if (unsigned SRetReg = FuncInfo->getSRetReturnReg()) {
+ if (Register SRetReg = FuncInfo->getSRetReturnReg()) {
// When we have both sret and another return value, we should use the
// original Chain stored in RetOps[0], instead of the current Chain updated
// in the above loop. If we only have sret, RetOps[0] equals to Chain.
@@ -2798,7 +2782,7 @@ X86TargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
SDValue Val = DAG.getCopyFromReg(RetOps[0], dl, SRetReg,
getPointerTy(MF.getDataLayout()));
- unsigned RetValReg
+ Register RetValReg
= (Subtarget.is64Bit() && !Subtarget.isTarget64BitILP32()) ?
X86::RAX : X86::EAX;
Chain = DAG.getCopyToReg(Chain, dl, RetValReg, Val, Flag);
@@ -2924,7 +2908,7 @@ static SDValue getv64i1Argument(CCValAssign &VA, CCValAssign &NextVA,
if (nullptr == InFlag) {
// When no physical register is present,
// create an intermediate virtual register.
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValueLo = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
Reg = MF.addLiveIn(NextVA.getLocReg(), RC);
ArgValueHi = DAG.getCopyFromReg(Root, Dl, Reg, MVT::i32);
@@ -3133,10 +3117,10 @@ static SDValue CreateCopyOfByValArgument(SDValue Src, SDValue Dst,
SelectionDAG &DAG, const SDLoc &dl) {
SDValue SizeNode = DAG.getConstant(Flags.getByValSize(), dl, MVT::i32);
- return DAG.getMemcpy(Chain, dl, Dst, Src, SizeNode, Flags.getByValAlign(),
- /*isVolatile*/false, /*AlwaysInline=*/true,
- /*isTailCall*/false,
- MachinePointerInfo(), MachinePointerInfo());
+ return DAG.getMemcpy(
+ Chain, dl, Dst, Src, SizeNode, Flags.getNonZeroByValAlign(),
+ /*isVolatile*/ false, /*AlwaysInline=*/true,
+ /*isTailCall*/ false, MachinePointerInfo(), MachinePointerInfo());
}
/// Return true if the calling convention is one that we can guarantee TCO for.
@@ -3176,8 +3160,7 @@ bool X86TargetLowering::mayBeEmittedAsTailCall(const CallInst *CI) const {
if (!CI->isTailCall())
return false;
- ImmutableCallSite CS(CI);
- CallingConv::ID CalleeCC = CS.getCallingConv();
+ CallingConv::ID CalleeCC = CI->getCallingConv();
if (!mayTailCallThisCC(CalleeCC))
return false;
@@ -3341,20 +3324,223 @@ static ArrayRef<MCPhysReg> get64BitArgumentXMMs(MachineFunction &MF,
#ifndef NDEBUG
static bool isSortedByValueNo(ArrayRef<CCValAssign> ArgLocs) {
- return std::is_sorted(ArgLocs.begin(), ArgLocs.end(),
- [](const CCValAssign &A, const CCValAssign &B) -> bool {
- return A.getValNo() < B.getValNo();
- });
+ return llvm::is_sorted(
+ ArgLocs, [](const CCValAssign &A, const CCValAssign &B) -> bool {
+ return A.getValNo() < B.getValNo();
+ });
}
#endif
+namespace {
+/// This is a helper class for lowering vararg parameters.
+class VarArgsLoweringHelper {
+public:
+ VarArgsLoweringHelper(X86MachineFunctionInfo *FuncInfo, const SDLoc &Loc,
+ SelectionDAG &DAG, const X86Subtarget &Subtarget,
+ CallingConv::ID CallConv, CCState &CCInfo)
+ : FuncInfo(FuncInfo), DL(Loc), DAG(DAG), Subtarget(Subtarget),
+ TheMachineFunction(DAG.getMachineFunction()),
+ TheFunction(TheMachineFunction.getFunction()),
+ FrameInfo(TheMachineFunction.getFrameInfo()),
+ FrameLowering(*Subtarget.getFrameLowering()),
+ TargLowering(DAG.getTargetLoweringInfo()), CallConv(CallConv),
+ CCInfo(CCInfo) {}
+
+ // Lower vararg parameters.
+ void lowerVarArgsParameters(SDValue &Chain, unsigned StackSize);
+
+private:
+ void createVarArgAreaAndStoreRegisters(SDValue &Chain, unsigned StackSize);
+
+ void forwardMustTailParameters(SDValue &Chain);
+
+ bool is64Bit() { return Subtarget.is64Bit(); }
+ bool isWin64() { return Subtarget.isCallingConvWin64(CallConv); }
+
+ X86MachineFunctionInfo *FuncInfo;
+ const SDLoc &DL;
+ SelectionDAG &DAG;
+ const X86Subtarget &Subtarget;
+ MachineFunction &TheMachineFunction;
+ const Function &TheFunction;
+ MachineFrameInfo &FrameInfo;
+ const TargetFrameLowering &FrameLowering;
+ const TargetLowering &TargLowering;
+ CallingConv::ID CallConv;
+ CCState &CCInfo;
+};
+} // namespace
+
+void VarArgsLoweringHelper::createVarArgAreaAndStoreRegisters(
+ SDValue &Chain, unsigned StackSize) {
+ // If the function takes a variable number of arguments, make a frame index for
+ // the start of the first vararg value... for expansion of llvm.va_start. We
+ // can skip this if there are no va_start calls.
+ if (is64Bit() || (CallConv != CallingConv::X86_FastCall &&
+ CallConv != CallingConv::X86_ThisCall)) {
+ FuncInfo->setVarArgsFrameIndex(
+ FrameInfo.CreateFixedObject(1, StackSize, true));
+ }
+
+ // Figure out if XMM registers are in use.
+ assert(!(Subtarget.useSoftFloat() &&
+ TheFunction.hasFnAttribute(Attribute::NoImplicitFloat)) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ // 64-bit calling conventions support varargs and register parameters, so we
+ // have to do extra work to spill them in the prologue.
+ if (is64Bit()) {
+ // Find the first unallocated argument registers.
+ ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
+ ArrayRef<MCPhysReg> ArgXMMs =
+ get64BitArgumentXMMs(TheMachineFunction, CallConv, Subtarget);
+ unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
+ unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
+
+ assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
+ "SSE register cannot be used when SSE is disabled!");
+
+ if (isWin64()) {
+ // Get to the caller-allocated home save location. Add 8 to account
+ // for the return address.
+ int HomeOffset = FrameLowering.getOffsetOfLocalArea() + 8;
+ FuncInfo->setRegSaveFrameIndex(
+ FrameInfo.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
+ // Fixup to set vararg frame on shadow area (4 x i64).
+ if (NumIntRegs < 4)
+ FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
+ } else {
+ // For X86-64, if there are vararg parameters that are passed via
+ // registers, then we must store them to their spots on the stack so
+ // they may be loaded by dereferencing the result of va_next.
+ FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
+ FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
+ FuncInfo->setRegSaveFrameIndex(FrameInfo.CreateStackObject(
+ ArgGPRs.size() * 8 + ArgXMMs.size() * 16, Align(16), false));
+ }
+
+ SmallVector<SDValue, 6> LiveGPRs;    // Live-in values of the GPR argument registers.
+ SmallVector<SDValue, 8> LiveXMMRegs; // Live-in values of the XMM argument registers.
+ SDValue ALVal;                       // Live-in value of %al, if applicable.
+
+ // Gather all the live in physical registers.
+ for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
+ Register GPR = TheMachineFunction.addLiveIn(Reg, &X86::GR64RegClass);
+ LiveGPRs.push_back(DAG.getCopyFromReg(Chain, DL, GPR, MVT::i64));
+ }
+ const auto &AvailableXmms = ArgXMMs.slice(NumXMMRegs);
+ if (!AvailableXmms.empty()) {
+ Register AL = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ ALVal = DAG.getCopyFromReg(Chain, DL, AL, MVT::i8);
+ for (MCPhysReg Reg : AvailableXmms) {
+ Register XMMReg = TheMachineFunction.addLiveIn(Reg, &X86::VR128RegClass);
+ LiveXMMRegs.push_back(
+ DAG.getCopyFromReg(Chain, DL, XMMReg, MVT::v4f32));
+ }
+ }
+
+ // Store the integer parameter registers.
+ SmallVector<SDValue, 8> MemOps;
+ SDValue RSFIN =
+ DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
+ TargLowering.getPointerTy(DAG.getDataLayout()));
+ unsigned Offset = FuncInfo->getVarArgsGPOffset();
+ for (SDValue Val : LiveGPRs) {
+ SDValue FIN = DAG.getNode(ISD::ADD, DL,
+ TargLowering.getPointerTy(DAG.getDataLayout()),
+ RSFIN, DAG.getIntPtrConstant(Offset, DL));
+ SDValue Store =
+ DAG.getStore(Val.getValue(1), DL, Val, FIN,
+ MachinePointerInfo::getFixedStack(
+ DAG.getMachineFunction(),
+ FuncInfo->getRegSaveFrameIndex(), Offset));
+ MemOps.push_back(Store);
+ Offset += 8;
+ }
+
+ // Now store the XMM (fp + vector) parameter registers.
+ if (!LiveXMMRegs.empty()) {
+ SmallVector<SDValue, 12> SaveXMMOps;
+ SaveXMMOps.push_back(Chain);
+ SaveXMMOps.push_back(ALVal);
+ SaveXMMOps.push_back(
+ DAG.getIntPtrConstant(FuncInfo->getRegSaveFrameIndex(), DL));
+ SaveXMMOps.push_back(
+ DAG.getIntPtrConstant(FuncInfo->getVarArgsFPOffset(), DL));
+ SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
+ LiveXMMRegs.end());
+ MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, DL,
+ MVT::Other, SaveXMMOps));
+ }
+
+ if (!MemOps.empty())
+ Chain = DAG.getNode(ISD::TokenFactor, DL, MVT::Other, MemOps);
+ }
+}
+
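The offsets set just above follow the SysV x86-64 va_list register save area layout: six 8-byte GPR slots followed by eight 16-byte XMM slots. As a rough standalone sketch of that arithmetic (plain C++, not LLVM API; the used-register counts are made-up example values, the register counts are the SysV ABI ones):

#include <cstdio>

int main() {
  const unsigned NumArgGPRs = 6; // rdi, rsi, rdx, rcx, r8, r9
  const unsigned NumArgXMMs = 8; // xmm0..xmm7
  unsigned NumIntRegsUsed = 2;   // assumed: fixed integer args before '...'
  unsigned NumXMMRegsUsed = 1;   // assumed: fixed FP/vector args before '...'

  unsigned VarArgsGPOffset = NumIntRegsUsed * 8;                   // 16
  unsigned VarArgsFPOffset = NumArgGPRs * 8 + NumXMMRegsUsed * 16; // 64
  unsigned RegSaveAreaSize = NumArgGPRs * 8 + NumArgXMMs * 16;     // 176

  std::printf("gp_offset=%u fp_offset=%u save_area=%u bytes\n",
              VarArgsGPOffset, VarArgsFPOffset, RegSaveAreaSize);
  return 0;
}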
+void VarArgsLoweringHelper::forwardMustTailParameters(SDValue &Chain) {
+ // Find the largest legal vector type.
+ MVT VecVT = MVT::Other;
+ // FIXME: Only some x86_32 calling conventions support AVX512.
+ if (Subtarget.useAVX512Regs() &&
+ (is64Bit() || (CallConv == CallingConv::X86_VectorCall ||
+ CallConv == CallingConv::Intel_OCL_BI)))
+ VecVT = MVT::v16f32;
+ else if (Subtarget.hasAVX())
+ VecVT = MVT::v8f32;
+ else if (Subtarget.hasSSE2())
+ VecVT = MVT::v4f32;
+
+ // We forward some GPRs and some vector types.
+ SmallVector<MVT, 2> RegParmTypes;
+ MVT IntVT = is64Bit() ? MVT::i64 : MVT::i32;
+ RegParmTypes.push_back(IntVT);
+ if (VecVT != MVT::Other)
+ RegParmTypes.push_back(VecVT);
+
+ // Compute the set of forwarded registers. The rest are scratch.
+ SmallVectorImpl<ForwardedRegister> &Forwards =
+ FuncInfo->getForwardedMustTailRegParms();
+ CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
+
+ // Forward AL for SysV x86_64 targets, since it is used for varargs.
+ if (is64Bit() && !isWin64() && !CCInfo.isAllocated(X86::AL)) {
+ Register ALVReg = TheMachineFunction.addLiveIn(X86::AL, &X86::GR8RegClass);
+ Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
+ }
+
+ // Copy all forwards from physical to virtual registers.
+ for (ForwardedRegister &FR : Forwards) {
+ // FIXME: Can we use a less constrained schedule?
+ SDValue RegVal = DAG.getCopyFromReg(Chain, DL, FR.VReg, FR.VT);
+ FR.VReg = TheMachineFunction.getRegInfo().createVirtualRegister(
+ TargLowering.getRegClassFor(FR.VT));
+ Chain = DAG.getCopyToReg(Chain, DL, FR.VReg, RegVal);
+ }
+}
+
+void VarArgsLoweringHelper::lowerVarArgsParameters(SDValue &Chain,
+ unsigned StackSize) {
+ // Set FrameIndex to the 0xAAAAAAA sentinel value to mark the unset state.
+ // If necessary, it will be updated with the correct value later.
+ FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
+ FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
+
+ if (FrameInfo.hasVAStart())
+ createVarArgAreaAndStoreRegisters(Chain, StackSize);
+
+ if (FrameInfo.hasMustTailInVarArgFunc())
+ forwardMustTailParameters(Chain);
+}
+
SDValue X86TargetLowering::LowerFormalArguments(
- SDValue Chain, CallingConv::ID CallConv, bool isVarArg,
+ SDValue Chain, CallingConv::ID CallConv, bool IsVarArg,
const SmallVectorImpl<ISD::InputArg> &Ins, const SDLoc &dl,
SelectionDAG &DAG, SmallVectorImpl<SDValue> &InVals) const {
MachineFunction &MF = DAG.getMachineFunction();
X86MachineFunctionInfo *FuncInfo = MF.getInfo<X86MachineFunctionInfo>();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
const Function &F = MF.getFunction();
if (F.hasExternalLinkage() && Subtarget.isTargetCygMing() &&
@@ -3366,16 +3552,16 @@ SDValue X86TargetLowering::LowerFormalArguments(
bool IsWin64 = Subtarget.isCallingConvWin64(CallConv);
assert(
- !(isVarArg && canGuaranteeTCO(CallConv)) &&
+ !(IsVarArg && canGuaranteeTCO(CallConv)) &&
"Var args not supported with calling conv' regcall, fastcc, ghc or hipe");
// Assign locations to all of the incoming arguments.
SmallVector<CCValAssign, 16> ArgLocs;
- CCState CCInfo(CallConv, isVarArg, MF, ArgLocs, *DAG.getContext());
+ CCState CCInfo(CallConv, IsVarArg, MF, ArgLocs, *DAG.getContext());
// Allocate shadow area for Win64.
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Ins, CC_X86);
@@ -3446,7 +3632,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
else
llvm_unreachable("Unknown argument type!");
- unsigned Reg = MF.addLiveIn(VA.getLocReg(), RC);
+ Register Reg = MF.addLiveIn(VA.getLocReg(), RC);
ArgValue = DAG.getCopyFromReg(Chain, dl, Reg, RegVT);
}
@@ -3500,7 +3686,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// the argument into a virtual register so that we can access it from the
// return points.
if (Ins[I].Flags.isSRet()) {
- unsigned Reg = FuncInfo->getSRetReturnReg();
+ Register Reg = FuncInfo->getSRetReturnReg();
if (!Reg) {
MVT PtrTy = getPointerTy(DAG.getDataLayout());
Reg = MF.getRegInfo().createVirtualRegister(getRegClassFor(PtrTy));
@@ -3518,147 +3704,12 @@ SDValue X86TargetLowering::LowerFormalArguments(
MF.getTarget().Options.GuaranteedTailCallOpt))
StackSize = GetAlignedArgumentStackSize(StackSize, DAG);
- // If the function takes variable number of arguments, make a frame index for
- // the start of the first vararg value... for expansion of llvm.va_start. We
- // can skip this if there are no va_start calls.
- if (MFI.hasVAStart() &&
- (Is64Bit || (CallConv != CallingConv::X86_FastCall &&
- CallConv != CallingConv::X86_ThisCall))) {
- FuncInfo->setVarArgsFrameIndex(MFI.CreateFixedObject(1, StackSize, true));
- }
-
- // Figure out if XMM registers are in use.
- assert(!(Subtarget.useSoftFloat() &&
- F.hasFnAttribute(Attribute::NoImplicitFloat)) &&
- "SSE register cannot be used when SSE is disabled!");
-
- // 64-bit calling conventions support varargs and register parameters, so we
- // have to do extra work to spill them in the prologue.
- if (Is64Bit && isVarArg && MFI.hasVAStart()) {
- // Find the first unallocated argument registers.
- ArrayRef<MCPhysReg> ArgGPRs = get64BitArgumentGPRs(CallConv, Subtarget);
- ArrayRef<MCPhysReg> ArgXMMs = get64BitArgumentXMMs(MF, CallConv, Subtarget);
- unsigned NumIntRegs = CCInfo.getFirstUnallocated(ArgGPRs);
- unsigned NumXMMRegs = CCInfo.getFirstUnallocated(ArgXMMs);
- assert(!(NumXMMRegs && !Subtarget.hasSSE1()) &&
- "SSE register cannot be used when SSE is disabled!");
-
- // Gather all the live in physical registers.
- SmallVector<SDValue, 6> LiveGPRs;
- SmallVector<SDValue, 8> LiveXMMRegs;
- SDValue ALVal;
- for (MCPhysReg Reg : ArgGPRs.slice(NumIntRegs)) {
- unsigned GPR = MF.addLiveIn(Reg, &X86::GR64RegClass);
- LiveGPRs.push_back(
- DAG.getCopyFromReg(Chain, dl, GPR, MVT::i64));
- }
- if (!ArgXMMs.empty()) {
- unsigned AL = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
- ALVal = DAG.getCopyFromReg(Chain, dl, AL, MVT::i8);
- for (MCPhysReg Reg : ArgXMMs.slice(NumXMMRegs)) {
- unsigned XMMReg = MF.addLiveIn(Reg, &X86::VR128RegClass);
- LiveXMMRegs.push_back(
- DAG.getCopyFromReg(Chain, dl, XMMReg, MVT::v4f32));
- }
- }
-
- if (IsWin64) {
- // Get to the caller-allocated home save location. Add 8 to account
- // for the return address.
- int HomeOffset = TFI.getOffsetOfLocalArea() + 8;
- FuncInfo->setRegSaveFrameIndex(
- MFI.CreateFixedObject(1, NumIntRegs * 8 + HomeOffset, false));
- // Fixup to set vararg frame on shadow area (4 x i64).
- if (NumIntRegs < 4)
- FuncInfo->setVarArgsFrameIndex(FuncInfo->getRegSaveFrameIndex());
- } else {
- // For X86-64, if there are vararg parameters that are passed via
- // registers, then we must store them to their spots on the stack so
- // they may be loaded by dereferencing the result of va_next.
- FuncInfo->setVarArgsGPOffset(NumIntRegs * 8);
- FuncInfo->setVarArgsFPOffset(ArgGPRs.size() * 8 + NumXMMRegs * 16);
- FuncInfo->setRegSaveFrameIndex(MFI.CreateStackObject(
- ArgGPRs.size() * 8 + ArgXMMs.size() * 16, 16, false));
- }
-
- // Store the integer parameter registers.
- SmallVector<SDValue, 8> MemOps;
- SDValue RSFIN = DAG.getFrameIndex(FuncInfo->getRegSaveFrameIndex(),
- getPointerTy(DAG.getDataLayout()));
- unsigned Offset = FuncInfo->getVarArgsGPOffset();
- for (SDValue Val : LiveGPRs) {
- SDValue FIN = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
- RSFIN, DAG.getIntPtrConstant(Offset, dl));
- SDValue Store =
- DAG.getStore(Val.getValue(1), dl, Val, FIN,
- MachinePointerInfo::getFixedStack(
- DAG.getMachineFunction(),
- FuncInfo->getRegSaveFrameIndex(), Offset));
- MemOps.push_back(Store);
- Offset += 8;
- }
-
- if (!ArgXMMs.empty() && NumXMMRegs != ArgXMMs.size()) {
- // Now store the XMM (fp + vector) parameter registers.
- SmallVector<SDValue, 12> SaveXMMOps;
- SaveXMMOps.push_back(Chain);
- SaveXMMOps.push_back(ALVal);
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getRegSaveFrameIndex(), dl));
- SaveXMMOps.push_back(DAG.getIntPtrConstant(
- FuncInfo->getVarArgsFPOffset(), dl));
- SaveXMMOps.insert(SaveXMMOps.end(), LiveXMMRegs.begin(),
- LiveXMMRegs.end());
- MemOps.push_back(DAG.getNode(X86ISD::VASTART_SAVE_XMM_REGS, dl,
- MVT::Other, SaveXMMOps));
- }
-
- if (!MemOps.empty())
- Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, MemOps);
- }
-
- if (isVarArg && MFI.hasMustTailInVarArgFunc()) {
- // Find the largest legal vector type.
- MVT VecVT = MVT::Other;
- // FIXME: Only some x86_32 calling conventions support AVX512.
- if (Subtarget.useAVX512Regs() &&
- (Is64Bit || (CallConv == CallingConv::X86_VectorCall ||
- CallConv == CallingConv::Intel_OCL_BI)))
- VecVT = MVT::v16f32;
- else if (Subtarget.hasAVX())
- VecVT = MVT::v8f32;
- else if (Subtarget.hasSSE2())
- VecVT = MVT::v4f32;
-
- // We forward some GPRs and some vector types.
- SmallVector<MVT, 2> RegParmTypes;
- MVT IntVT = Is64Bit ? MVT::i64 : MVT::i32;
- RegParmTypes.push_back(IntVT);
- if (VecVT != MVT::Other)
- RegParmTypes.push_back(VecVT);
-
- // Compute the set of forwarded registers. The rest are scratch.
- SmallVectorImpl<ForwardedRegister> &Forwards =
- FuncInfo->getForwardedMustTailRegParms();
- CCInfo.analyzeMustTailForwardedRegisters(Forwards, RegParmTypes, CC_X86);
-
- // Forward AL for SysV x86_64 targets, since it is used for varargs.
- if (Is64Bit && !IsWin64 && !CCInfo.isAllocated(X86::AL)) {
- unsigned ALVReg = MF.addLiveIn(X86::AL, &X86::GR8RegClass);
- Forwards.push_back(ForwardedRegister(ALVReg, X86::AL, MVT::i8));
- }
-
- // Copy all forwards from physical to virtual registers.
- for (ForwardedRegister &FR : Forwards) {
- // FIXME: Can we use a less constrained schedule?
- SDValue RegVal = DAG.getCopyFromReg(Chain, dl, FR.VReg, FR.VT);
- FR.VReg = MF.getRegInfo().createVirtualRegister(getRegClassFor(FR.VT));
- Chain = DAG.getCopyToReg(Chain, dl, FR.VReg, RegVal);
- }
- }
+ if (IsVarArg)
+ VarArgsLoweringHelper(FuncInfo, dl, DAG, Subtarget, CallConv, CCInfo)
+ .lowerVarArgsParameters(Chain, StackSize);
// Some CCs need callee pop.
- if (X86::isCalleePop(CallConv, Is64Bit, isVarArg,
+ if (X86::isCalleePop(CallConv, Is64Bit, IsVarArg,
MF.getTarget().Options.GuaranteedTailCallOpt)) {
FuncInfo->setBytesToPopOnReturn(StackSize); // Callee pops everything.
} else if (CallConv == CallingConv::X86_INTR && Ins.size() == 2) {
@@ -3677,10 +3728,6 @@ SDValue X86TargetLowering::LowerFormalArguments(
if (!Is64Bit) {
// RegSaveFrameIndex is X86-64 only.
FuncInfo->setRegSaveFrameIndex(0xAAAAAAA);
- if (CallConv == CallingConv::X86_FastCall ||
- CallConv == CallingConv::X86_ThisCall)
- // fastcc functions can't have varargs.
- FuncInfo->setVarArgsFrameIndex(0xAAAAAAA);
}
FuncInfo->setArgumentStackSize(StackSize);
@@ -3697,7 +3744,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
// same, so the size of funclets' (mostly empty) frames is dictated by
// how far this slot is from the bottom (since they allocate just enough
// space to accommodate holding this slot at the correct offset).
- int PSPSymFI = MFI.CreateStackObject(8, 8, /*isSS=*/false);
+ int PSPSymFI = MFI.CreateStackObject(8, Align(8), /*isSS=*/false);
EHInfo->PSPSymFrameIdx = PSPSymFI;
}
}
@@ -3705,7 +3752,7 @@ SDValue X86TargetLowering::LowerFormalArguments(
if (CallConv == CallingConv::X86_RegCall ||
F.hasFnAttribute("no_caller_saved_registers")) {
MachineRegisterInfo &MRI = MF.getRegInfo();
- for (std::pair<unsigned, unsigned> Pair : MRI.liveins())
+ for (std::pair<Register, Register> Pair : MRI.liveins())
MRI.disableCalleeSavedRegister(Pair.first);
}
@@ -3716,12 +3763,13 @@ SDValue X86TargetLowering::LowerMemOpCallTo(SDValue Chain, SDValue StackPtr,
SDValue Arg, const SDLoc &dl,
SelectionDAG &DAG,
const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const {
+ ISD::ArgFlagsTy Flags,
+ bool isByVal) const {
unsigned LocMemOffset = VA.getLocMemOffset();
SDValue PtrOff = DAG.getIntPtrConstant(LocMemOffset, dl);
PtrOff = DAG.getNode(ISD::ADD, dl, getPointerTy(DAG.getDataLayout()),
StackPtr, PtrOff);
- if (Flags.isByVal())
+ if (isByVal)
return CreateCopyOfByValArgument(Arg, PtrOff, Chain, Flags, DAG, dl);
return DAG.getStore(
@@ -3796,18 +3844,17 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
bool IsGuaranteeTCO = MF.getTarget().Options.GuaranteedTailCallOpt ||
CallConv == CallingConv::Tail;
X86MachineFunctionInfo *X86Info = MF.getInfo<X86MachineFunctionInfo>();
- const auto *CI = dyn_cast_or_null<CallInst>(CLI.CS.getInstruction());
+ const auto *CI = dyn_cast_or_null<CallInst>(CLI.CB);
const Function *Fn = CI ? CI->getCalledFunction() : nullptr;
bool HasNCSR = (CI && CI->hasFnAttr("no_caller_saved_registers")) ||
(Fn && Fn->hasFnAttribute("no_caller_saved_registers"));
- const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CS.getInstruction());
+ const auto *II = dyn_cast_or_null<InvokeInst>(CLI.CB);
bool HasNoCfCheck =
(CI && CI->doesNoCfCheck()) || (II && II->doesNoCfCheck());
const Module *M = MF.getMMI().getModule();
Metadata *IsCFProtectionSupported = M->getModuleFlag("cf-protection-branch");
MachineFunction::CallSiteInfo CSInfo;
-
if (CallConv == CallingConv::X86_INTR)
report_fatal_error("X86 interrupts may not be called directly");
@@ -3823,7 +3870,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
isTailCall = false;
}
- bool IsMustTail = CLI.CS && CLI.CS.isMustTailCall();
+ bool IsMustTail = CLI.CB && CLI.CB->isMustTailCall();
if (IsMustTail) {
// Force this to be a tail call. The verifier rules are enough to ensure
// that we can lower this successfully without moving the return address
@@ -3854,7 +3901,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// Allocate shadow area for Win64.
if (IsWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeArguments(Outs, CC_X86);
@@ -3900,6 +3947,21 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
if (ArgLocs.back().getLocMemOffset() != 0)
report_fatal_error("any parameter with the inalloca attribute must be "
"the only memory argument");
+ } else if (CLI.IsPreallocated) {
+ assert(ArgLocs.back().isMemLoc() &&
+ "cannot use preallocated attribute on a register "
+ "parameter");
+ SmallVector<size_t, 4> PreallocatedOffsets;
+ for (size_t i = 0; i < CLI.OutVals.size(); ++i) {
+ if (CLI.CB->paramHasAttr(i, Attribute::Preallocated)) {
+ PreallocatedOffsets.push_back(ArgLocs[i].getLocMemOffset());
+ }
+ }
+ auto *MFI = DAG.getMachineFunction().getInfo<X86MachineFunctionInfo>();
+ size_t PreallocatedId = MFI->getPreallocatedIdForCallSite(CLI.CB);
+ MFI->setPreallocatedStackSize(PreallocatedId, NumBytes);
+ MFI->setPreallocatedArgOffsets(PreallocatedId, PreallocatedOffsets);
+ NumBytesToPush = 0;
}
if (!IsSibcall && !IsMustTail)
@@ -3912,7 +3974,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = EmitTailCallLoadRetAddr(DAG, RetAddrFrIdx, Chain, isTailCall,
Is64Bit, FPDiff, dl);
- SmallVector<std::pair<unsigned, SDValue>, 8> RegsToPass;
+ SmallVector<std::pair<Register, SDValue>, 8> RegsToPass;
SmallVector<SDValue, 8> MemOpChains;
SDValue StackPtr;
@@ -3927,9 +3989,9 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
for (unsigned I = 0, OutIndex = 0, E = ArgLocs.size(); I != E;
++I, ++OutIndex) {
assert(OutIndex < Outs.size() && "Invalid Out index");
- // Skip inalloca arguments, they have already been written.
+ // Skip inalloca/preallocated arguments, they have already been written.
ISD::ArgFlagsTy Flags = Outs[OutIndex].Flags;
- if (Flags.isInAlloca())
+ if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
CCValAssign &VA = ArgLocs[I];
@@ -3968,8 +4030,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// the caller from seeing any modifications the callee may make
// as guaranteed by the `byval` attribute.
int FrameIdx = MF.getFrameInfo().CreateStackObject(
- Flags.getByValSize(), std::max(16, (int)Flags.getByValAlign()),
- false);
+ Flags.getByValSize(),
+ std::max(Align(16), Flags.getNonZeroByValAlign()), false);
SDValue StackSlot =
DAG.getFrameIndex(FrameIdx, getPointerTy(DAG.getDataLayout()));
Chain =
@@ -3998,12 +4060,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
} else if (VA.isRegLoc()) {
RegsToPass.push_back(std::make_pair(VA.getLocReg(), Arg));
const TargetOptions &Options = DAG.getTarget().Options;
- if (Options.EnableDebugEntryValues)
+ if (Options.EmitCallSiteInfo)
CSInfo.emplace_back(VA.getLocReg(), I);
if (isVarArg && IsWin64) {
// Win64 ABI requires argument XMM reg to be copied to the corresponding
// shadow reg if callee is a varargs function.
- unsigned ShadowReg = 0;
+ Register ShadowReg;
switch (VA.getLocReg()) {
case X86::XMM0: ShadowReg = X86::RCX; break;
case X86::XMM1: ShadowReg = X86::RDX; break;
@@ -4019,7 +4081,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
StackPtr = DAG.getCopyFromReg(Chain, dl, RegInfo->getStackRegister(),
getPointerTy(DAG.getDataLayout()));
MemOpChains.push_back(LowerMemOpCallTo(Chain, StackPtr, Arg,
- dl, DAG, VA, Flags));
+ dl, DAG, VA, Flags, isByVal));
}
}
@@ -4031,7 +4093,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// GOT pointer.
if (!isTailCall) {
RegsToPass.push_back(std::make_pair(
- unsigned(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
+ Register(X86::EBX), DAG.getNode(X86ISD::GlobalBaseReg, SDLoc(),
getPointerTy(DAG.getDataLayout()))));
} else {
// If we are tail calling and generating PIC/GOT style code load the
@@ -4069,8 +4131,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned NumXMMRegs = CCInfo.getFirstUnallocated(XMMArgRegs);
assert((Subtarget.hasSSE1() || !NumXMMRegs)
&& "SSE registers cannot be used when SSE is disabled");
-
- RegsToPass.push_back(std::make_pair(unsigned(X86::AL),
+ RegsToPass.push_back(std::make_pair(Register(X86::AL),
DAG.getConstant(NumXMMRegs, dl,
MVT::i8)));
}
@@ -4079,7 +4140,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
const auto &Forwards = X86Info->getForwardedMustTailRegParms();
for (const auto &F : Forwards) {
SDValue Val = DAG.getCopyFromReg(Chain, dl, F.VReg, F.VT);
- RegsToPass.push_back(std::make_pair(unsigned(F.PReg), Val));
+ RegsToPass.push_back(std::make_pair(F.PReg, Val));
}
}
@@ -4117,8 +4178,8 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
assert(VA.isMemLoc());
SDValue Arg = OutVals[OutsIndex];
ISD::ArgFlagsTy Flags = Outs[OutsIndex].Flags;
- // Skip inalloca arguments. They don't require any work.
- if (Flags.isInAlloca())
+ // Skip inalloca/preallocated arguments. They don't require any work.
+ if (Flags.isInAlloca() || Flags.isPreallocated())
continue;
// Create frame index.
int32_t Offset = VA.getLocMemOffset()+FPDiff;
@@ -4219,7 +4280,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// is thrown, the runtime will not restore CSRs.
// FIXME: Model this more precisely so that we can register allocate across
// the normal edge and spill and fill across the exceptional edge.
- if (!Is64Bit && CLI.CS && CLI.CS.isInvoke()) {
+ if (!Is64Bit && CLI.CB && isa<InvokeInst>(CLI.CB)) {
const Function &CallerFn = MF.getFunction();
EHPersonality Pers =
CallerFn.hasPersonalityFn()
@@ -4278,11 +4339,12 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
Chain = DAG.getNode(X86ISD::CALL, dl, NodeTys, Ops);
}
InFlag = Chain.getValue(1);
+ DAG.addNoMergeSiteInfo(Chain.getNode(), CLI.NoMerge);
DAG.addCallSiteInfo(Chain.getNode(), std::move(CSInfo));
// Save heapallocsite metadata.
- if (CLI.CS)
- if (MDNode *HeapAlloc = CLI.CS->getMetadata("heapallocsite"))
+ if (CLI.CB)
+ if (MDNode *HeapAlloc = CLI.CB->getMetadata("heapallocsite"))
DAG.addHeapAllocSite(Chain.getNode(), HeapAlloc);
// Create the CALLSEQ_END node.
@@ -4301,12 +4363,6 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
else
NumBytesForCalleeToPop = 0; // Callee pops nothing.
- if (CLI.DoesNotReturn && !getTargetMachine().Options.TrapUnreachable) {
- // No need to reset the stack after the call if the call doesn't return. To
- // make the MI verify, we'll pretend the callee does it for us.
- NumBytesForCalleeToPop = NumBytes;
- }
-
// Returns a flag for retval copy to use.
if (!IsSibcall) {
Chain = DAG.getCALLSEQ_END(Chain,
@@ -4337,7 +4393,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
// (within module) calls are supported at the moment.
// To keep the stack aligned according to platform abi the function
// GetAlignedArgumentStackSize ensures that argument delta is always multiples
-// of stack alignment. (Dynamic linkers need this - darwin's dyld for example)
+// of stack alignment. (Dynamic linkers need this - Darwin's dyld for example)
// If a tail called function callee has more arguments than the caller the
// caller needs to make sure that there is room to move the RETADDR to. This is
// achieved by reserving an area the size of the argument delta right after the
@@ -4359,7 +4415,7 @@ X86TargetLowering::LowerCall(TargetLowering::CallLoweringInfo &CLI,
unsigned
X86TargetLowering::GetAlignedArgumentStackSize(const unsigned StackSize,
SelectionDAG &DAG) const {
- const Align StackAlignment(Subtarget.getFrameLowering()->getStackAlignment());
+ const Align StackAlignment = Subtarget.getFrameLowering()->getStackAlign();
const uint64_t SlotSize = Subtarget.getRegisterInfo()->getSlotSize();
assert(StackSize % SlotSize == 0 &&
"StackSize must be a multiple of SlotSize");
@@ -4395,7 +4451,7 @@ bool MatchingStackOffset(SDValue Arg, unsigned Offset, ISD::ArgFlagsTy Flags,
int FI = INT_MAX;
if (Arg.getOpcode() == ISD::CopyFromReg) {
- unsigned VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
+ Register VR = cast<RegisterSDNode>(Arg.getOperand(1))->getReg();
if (!Register::isVirtualRegister(VR))
return false;
MachineInstr *Def = MRI->getVRegDef(VR);
@@ -4578,7 +4634,7 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
// Allocate shadow area for Win64
if (IsCalleeWin64)
- CCInfo.AllocateStack(32, 8);
+ CCInfo.AllocateStack(32, Align(8));
CCInfo.AnalyzeCallOperands(Outs, CC_X86);
StackArgsSize = CCInfo.getNextStackOffset();
@@ -4693,6 +4749,7 @@ static bool isTargetShuffle(unsigned Opcode) {
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -4739,6 +4796,13 @@ static bool isTargetShuffleVariableMask(unsigned Opcode) {
}
}
+static bool isTargetShuffleSplat(SDValue Op) {
+ unsigned Opcode = Op.getOpcode();
+ if (Opcode == ISD::EXTRACT_SUBVECTOR)
+ return isTargetShuffleSplat(Op.getOperand(0));
+ return Opcode == X86ISD::VBROADCAST || Opcode == X86ISD::VBROADCAST_LOAD;
+}
+
SDValue X86TargetLowering::getReturnAddressFrameIndex(SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
@@ -4972,7 +5036,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
ScalarVT = MVT::i32;
Info.memVT = MVT::getVectorVT(ScalarVT, VT.getVectorNumElements());
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -4985,7 +5049,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOLoad;
break;
}
@@ -4997,7 +5061,7 @@ bool X86TargetLowering::getTgtMemIntrinsic(IntrinsicInfo &Info,
unsigned NumElts = std::min(DataVT.getVectorNumElements(),
IndexVT.getVectorNumElements());
Info.memVT = MVT::getVectorVT(DataVT.getVectorElementType(), NumElts);
- Info.align = Align::None();
+ Info.align = Align(1);
Info.flags |= MachineMemOperand::MOStore;
break;
}
@@ -5146,7 +5210,8 @@ bool X86TargetLowering::shouldScalarizeBinop(SDValue VecOp) const {
return isOperationLegalOrCustomOrPromote(Opc, ScalarVT);
}
-bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT) const {
+bool X86TargetLowering::shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool) const {
// TODO: Allow vectors?
if (VT.isVector())
return false;
@@ -5374,6 +5439,19 @@ static bool isAnyInRange(ArrayRef<int> Mask, int Low, int Hi) {
return llvm::any_of(Mask, [Low, Hi](int M) { return isInRange(M, Low, Hi); });
}
+/// Return true if the value of any element in Mask is the zero sentinel value.
+static bool isAnyZero(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+}
+
+/// Return true if the value of any element in Mask is the zero or undef
+/// sentinel value.
+static bool isAnyZeroOrUndef(ArrayRef<int> Mask) {
+ return llvm::any_of(Mask, [](int M) {
+ return M == SM_SentinelZero || M == SM_SentinelUndef;
+ });
+}
+
/// Return true if Val is undef or if its value falls within the
/// specified range (L, H].
static bool isUndefOrInRange(int Val, int Low, int Hi) {
@@ -5511,6 +5589,36 @@ static bool canWidenShuffleElements(ArrayRef<int> Mask) {
return canWidenShuffleElements(Mask, WidenedMask);
}
+// Attempt to narrow/widen the shuffle mask until it matches the target number
+// of elements.
+static bool scaleShuffleElements(ArrayRef<int> Mask, unsigned NumDstElts,
+ SmallVectorImpl<int> &ScaledMask) {
+ unsigned NumSrcElts = Mask.size();
+ assert(((NumSrcElts % NumDstElts) == 0 || (NumDstElts % NumSrcElts) == 0) &&
+ "Illegal shuffle scale factor");
+
+ // Narrowing is guaranteed to work.
+ if (NumDstElts >= NumSrcElts) {
+ int Scale = NumDstElts / NumSrcElts;
+ llvm::narrowShuffleMaskElts(Scale, Mask, ScaledMask);
+ return true;
+ }
+
+ // We have to repeat the widening until we reach the target size, but we can
+ // split out the first widening as it sets up ScaledMask for us.
+ if (canWidenShuffleElements(Mask, ScaledMask)) {
+ while (ScaledMask.size() > NumDstElts) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(ScaledMask, WidenedMask))
+ return false;
+ ScaledMask = std::move(WidenedMask);
+ }
+ return true;
+ }
+
+ return false;
+}
+
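To illustrate the narrowing direction used above, here is a minimal standalone sketch of the assumed narrowShuffleMaskElts semantics (plain C++, not the LLVM helper; narrowMask is a hypothetical name): every mask entry becomes Scale entries at the finer granularity, and negative sentinels are replicated unchanged.

#include <cstdio>
#include <vector>

static std::vector<int> narrowMask(unsigned Scale,
                                   const std::vector<int> &Mask) {
  std::vector<int> Out;
  for (int M : Mask)
    for (unsigned i = 0; i != Scale; ++i)
      Out.push_back(M < 0 ? M : M * (int)Scale + (int)i); // keep sentinels
  return Out;
}

int main() {
  for (int M : narrowMask(2, {1, 0})) // a 2-element mask scaled to 4 elements
    std::printf("%d ", M);            // prints: 2 3 0 1
  std::printf("\n");
  return 0;
}

Widening is the inverse direction: it only succeeds when each adjacent group of entries refers to the same wide element, which is why the loop above may have to give up and return false.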
/// Returns true if Elt is a constant zero or a floating point constant +0.0.
bool X86::isZeroNode(SDValue Elt) {
return isNullConstant(Elt) || isNullFPConstant(Elt);
@@ -5725,7 +5833,7 @@ static SDValue widenSubVector(SDValue Vec, bool ZeroNewElements,
return widenSubVector(VT, Vec, ZeroNewElements, Subtarget, DAG, dl);
}
-// Helper function to collect subvector ops that are concated together,
+// Helper function to collect subvector ops that are concatenated together,
// either by ISD::CONCAT_VECTORS or a ISD::INSERT_SUBVECTOR series.
// The subvectors in Ops are guaranteed to be the same type.
static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
@@ -5736,8 +5844,7 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
return true;
}
- if (N->getOpcode() == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(2))) {
+ if (N->getOpcode() == ISD::INSERT_SUBVECTOR) {
SDValue Src = N->getOperand(0);
SDValue Sub = N->getOperand(1);
const APInt &Idx = N->getConstantOperandAPInt(2);
@@ -5746,19 +5853,93 @@ static bool collectConcatOps(SDNode *N, SmallVectorImpl<SDValue> &Ops) {
// TODO - Handle more general insert_subvector chains.
if (VT.getSizeInBits() == (SubVT.getSizeInBits() * 2) &&
- Idx == (VT.getVectorNumElements() / 2) &&
- Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
- Src.getOperand(1).getValueType() == SubVT &&
- isNullConstant(Src.getOperand(2))) {
- Ops.push_back(Src.getOperand(1));
- Ops.push_back(Sub);
- return true;
+ Idx == (VT.getVectorNumElements() / 2)) {
+ // insert_subvector(insert_subvector(undef, x, lo), y, hi)
+ if (Src.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Src.getOperand(1).getValueType() == SubVT &&
+ isNullConstant(Src.getOperand(2))) {
+ Ops.push_back(Src.getOperand(1));
+ Ops.push_back(Sub);
+ return true;
+ }
+ // insert_subvector(x, extract_subvector(x, lo), hi)
+ if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Sub.getOperand(0) == Src && isNullConstant(Sub.getOperand(1))) {
+ Ops.append(2, Sub);
+ return true;
+ }
}
}
return false;
}
+static std::pair<SDValue, SDValue> splitVector(SDValue Op, SelectionDAG &DAG,
+ const SDLoc &dl) {
+ EVT VT = Op.getValueType();
+ unsigned NumElems = VT.getVectorNumElements();
+ unsigned SizeInBits = VT.getSizeInBits();
+ assert((NumElems % 2) == 0 && (SizeInBits % 2) == 0 &&
+ "Can't split odd sized vector");
+
+ SDValue Lo = extractSubVector(Op, 0, DAG, dl, SizeInBits / 2);
+ SDValue Hi = extractSubVector(Op, NumElems / 2, DAG, dl, SizeInBits / 2);
+ return std::make_pair(Lo, Hi);
+}
+
+// Split a unary integer op into 2 half-sized ops.
+static SDValue splitVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Make sure we only try to split 256/512-bit types to avoid creating
+ // narrow vectors.
+ assert((Op.getOperand(0).getValueType().is256BitVector() ||
+ Op.getOperand(0).getValueType().is512BitVector()) &&
+ (VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+ assert(Op.getOperand(0).getValueType().getVectorNumElements() ==
+ VT.getVectorNumElements() &&
+ "Unexpected VTs!");
+
+ SDLoc dl(Op);
+
+ // Extract the Lo/Hi vectors
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Op.getOperand(0), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, Lo),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, Hi));
+}
+
+/// Break a binary integer operation into 2 half-sized ops and then
+/// concatenate the results back together.
+static SDValue splitVectorIntBinary(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
+
+ // Sanity check that all the types match.
+ assert(Op.getOperand(0).getValueType() == VT &&
+ Op.getOperand(1).getValueType() == VT && "Unexpected VTs!");
+ assert((VT.is256BitVector() || VT.is512BitVector()) && "Unsupported VT!");
+
+ SDLoc dl(Op);
+
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
+
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
+
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
+ DAG.getNode(Op.getOpcode(), dl, LoVT, LHS1, RHS1),
+ DAG.getNode(Op.getOpcode(), dl, HiVT, LHS2, RHS2));
+}
+
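Both helpers are instances of the same split-apply-concatenate pattern; a standalone sketch of that idea on plain arrays (illustrative only, no SelectionDAG types; splitBinary is a hypothetical name):

#include <cstdio>
#include <vector>

template <typename Fn>
static std::vector<int> splitBinary(const std::vector<int> &A,
                                    const std::vector<int> &B, Fn Op) {
  size_t Half = A.size() / 2;
  std::vector<int> R;
  for (size_t i = 0; i != Half; ++i)        // process the low half
    R.push_back(Op(A[i], B[i]));
  for (size_t i = Half; i != A.size(); ++i) // process the high half
    R.push_back(Op(A[i], B[i]));
  return R;                                 // "concatenated" result
}

int main() {
  std::vector<int> A{1, 2, 3, 4}, B{10, 20, 30, 40};
  for (int V : splitBinary(A, B, [](int X, int Y) { return X + Y; }))
    std::printf("%d ", V); // prints: 11 22 33 44
  std::printf("\n");
  return 0;
}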
// Helper for splitting operands of an operation to legal target size and
// apply a function on each part.
// Useful for operations that are available on SSE2 in 128-bit, on AVX2 in
@@ -5815,21 +5996,17 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
SDValue Vec = Op.getOperand(0);
SDValue SubVec = Op.getOperand(1);
SDValue Idx = Op.getOperand(2);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ unsigned IdxVal = Op.getConstantOperandVal(2);
// Inserting undef is a nop. We can just return the original vector.
if (SubVec.isUndef())
return Vec;
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0 && Vec.isUndef()) // the operation is legal
return Op;
MVT OpVT = Op.getSimpleValueType();
unsigned NumElems = OpVT.getVectorNumElements();
-
SDValue ZeroIdx = DAG.getIntPtrConstant(0, dl);
// Extend to natively supported kshift.
@@ -5849,7 +6026,6 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
MVT SubVecVT = SubVec.getSimpleValueType();
unsigned SubVecNumElems = SubVecVT.getVectorNumElements();
-
assert(IdxVal + SubVecNumElems <= NumElems &&
IdxVal % SubVecVT.getSizeInBits() == 0 &&
"Unexpected index value in INSERT_SUBVECTOR");
@@ -5900,7 +6076,7 @@ static SDValue insert1BitVector(SDValue Op, SelectionDAG &DAG,
DAG.getTargetConstant(IdxVal, dl, MVT::i8));
if (SubVecNumElems * 2 == NumElems) {
// Special case, use legal zero extending insert_subvector. This allows
- // isel to opimitize when bits are known zero.
+ // isel to optimize when bits are known zero.
Vec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, SubVecVT, Vec, ZeroIdx);
Vec = DAG.getNode(ISD::INSERT_SUBVECTOR, dl, WideOpVT,
DAG.getConstant(0, dl, WideOpVT),
@@ -6042,8 +6218,8 @@ static SDValue getExtendInVec(unsigned Opcode, const SDLoc &DL, EVT VT,
// Match (xor X, -1) -> X.
// Match extract_subvector(xor X, -1) -> extract_subvector(X).
// Match concat_vectors(xor X, -1, xor Y, -1) -> concat_vectors(X, Y).
-static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
- V = peekThroughBitcasts(V);
+static SDValue IsNOT(SDValue V, SelectionDAG &DAG, bool OneUse = false) {
+ V = OneUse ? peekThroughOneUseBitcasts(V) : peekThroughBitcasts(V);
if (V.getOpcode() == ISD::XOR &&
ISD::isBuildVectorAllOnes(V.getOperand(1).getNode()))
return V.getOperand(0);
@@ -6067,6 +6243,35 @@ static SDValue IsNOT(SDValue V, SelectionDAG &DAG) {
return SDValue();
}
+void llvm::createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo, bool Unary) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ int NumEltsInLane = 128 / VT.getScalarSizeInBits();
+ for (int i = 0; i < NumElts; ++i) {
+ unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
+ int Pos = (i % NumEltsInLane) / 2 + LaneStart;
+ Pos += (Unary ? 0 : NumElts * (i % 2));
+ Pos += (Lo ? 0 : NumEltsInLane / 2);
+ Mask.push_back(Pos);
+ }
+}
+
+/// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+/// imposed by AVX and specific to the unary pattern. Example:
+/// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+/// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+void llvm::createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
+ bool Lo) {
+ assert(Mask.empty() && "Expected an empty shuffle mask vector");
+ int NumElts = VT.getVectorNumElements();
+ for (int i = 0; i < NumElts; ++i) {
+ int Pos = i / 2;
+ Pos += (Lo ? 0 : NumElts / 2);
+ Mask.push_back(Pos);
+ }
+}
+
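To see what the lane limitation changes, a standalone sketch that reproduces both mask computations for the low-half, unary case (plain C++, element counts chosen to model a hypothetical 16 x i16 vector with two 128-bit lanes):

#include <cstdio>
#include <vector>

int main() {
  const int NumElts = 16, NumEltsInLane = 8; // assumed: v16i16, two lanes
  std::vector<int> Unpack, Splat2;
  for (int i = 0; i < NumElts; ++i) {
    int LaneStart = (i / NumEltsInLane) * NumEltsInLane;
    Unpack.push_back((i % NumEltsInLane) / 2 + LaneStart); // in-lane unpack
    Splat2.push_back(i / 2);                               // lane-free splat2
  }
  for (int M : Unpack)
    std::printf("%d ", M); // 0 0 1 1 2 2 3 3 8 8 9 9 10 10 11 11
  std::printf("\n");
  for (int M : Splat2)
    std::printf("%d ", M); // 0 0 1 1 2 2 3 3 4 4 5 5 6 6 7 7
  std::printf("\n");
  return 0;
}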
/// Returns a vector_shuffle node for an unpackl operation.
static SDValue getUnpackl(SelectionDAG &DAG, const SDLoc &dl, MVT VT,
SDValue V1, SDValue V2) {
@@ -6102,14 +6307,10 @@ static SDValue getShuffleVectorZeroOrUndef(SDValue V2, int Idx,
return DAG.getVectorShuffle(VT, SDLoc(V2), V1, V2, MaskVec);
}
-static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
- if (!Load || !ISD::isNormalLoad(Load))
- return nullptr;
-
- SDValue Ptr = Load->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
+static const Constant *getTargetConstantFromBasePtr(SDValue Ptr) {
+ if (Ptr.getOpcode() == X86ISD::Wrapper ||
+ Ptr.getOpcode() == X86ISD::WrapperRIP)
+ Ptr = Ptr.getOperand(0);
auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
if (!CNode || CNode->isMachineConstantPoolEntry() || CNode->getOffset() != 0)
@@ -6118,6 +6319,12 @@ static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
return CNode->getConstVal();
}
+static const Constant *getTargetConstantFromNode(LoadSDNode *Load) {
+ if (!Load || !ISD::isNormalLoad(Load))
+ return nullptr;
+ return getTargetConstantFromBasePtr(Load->getBasePtr());
+}
+
static const Constant *getTargetConstantFromNode(SDValue Op) {
Op = peekThroughBitcasts(Op);
return getTargetConstantFromNode(dyn_cast<LoadSDNode>(Op));
@@ -6298,23 +6505,6 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a broadcasted constant pool scalar.
- if (Op.getOpcode() == X86ISD::VBROADCAST &&
- EltSizeInBits <= VT.getScalarSizeInBits()) {
- if (auto *Broadcast = getTargetConstantFromNode(Op.getOperand(0))) {
- unsigned SrcEltSizeInBits = Broadcast->getType()->getScalarSizeInBits();
- unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
-
- APInt UndefSrcElts(NumSrcElts, 0);
- SmallVector<APInt, 64> SrcEltBits(1, APInt(SrcEltSizeInBits, 0));
- if (CollectConstantBits(Broadcast, SrcEltBits[0], UndefSrcElts, 0)) {
- if (UndefSrcElts[0])
- UndefSrcElts.setBits(0, NumSrcElts);
- SrcEltBits.append(NumSrcElts - 1, SrcEltBits[0]);
- return CastBitData(UndefSrcElts, SrcEltBits);
- }
- }
- }
-
if (Op.getOpcode() == X86ISD::VBROADCAST_LOAD &&
EltSizeInBits <= VT.getScalarSizeInBits()) {
auto *MemIntr = cast<MemIntrinsicSDNode>(Op);
@@ -6322,16 +6512,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
return false;
SDValue Ptr = MemIntr->getBasePtr();
- if (Ptr->getOpcode() == X86ISD::Wrapper ||
- Ptr->getOpcode() == X86ISD::WrapperRIP)
- Ptr = Ptr->getOperand(0);
-
- auto *CNode = dyn_cast<ConstantPoolSDNode>(Ptr);
- if (!CNode || CNode->isMachineConstantPoolEntry() ||
- CNode->getOffset() != 0)
- return false;
-
- if (const Constant *C = CNode->getConstVal()) {
+ if (const Constant *C = getTargetConstantFromBasePtr(Ptr)) {
unsigned SrcEltSizeInBits = C->getType()->getScalarSizeInBits();
unsigned NumSrcElts = SizeInBits / SrcEltSizeInBits;
@@ -6375,8 +6556,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Insert constant bits from a base and sub vector sources.
- if (Op.getOpcode() == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getOpcode() == ISD::INSERT_SUBVECTOR) {
// TODO - support insert_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
@@ -6398,8 +6578,7 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
}
// Extract constant bits from a subvector's source.
- if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(Op.getOperand(1))) {
+ if (Op.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
// TODO - support extract_subvector through bitcasts.
if (EltSizeInBits != VT.getScalarSizeInBits())
return false;
@@ -6468,11 +6647,12 @@ static bool getTargetConstantBitsFromNode(SDValue Op, unsigned EltSizeInBits,
namespace llvm {
namespace X86 {
-bool isConstantSplat(SDValue Op, APInt &SplatVal) {
+bool isConstantSplat(SDValue Op, APInt &SplatVal, bool AllowPartialUndefs) {
APInt UndefElts;
SmallVector<APInt, 16> EltBits;
if (getTargetConstantBitsFromNode(Op, Op.getScalarValueSizeInBits(),
- UndefElts, EltBits, true, false)) {
+ UndefElts, EltBits, true,
+ AllowPartialUndefs)) {
int SplatIndex = -1;
for (int i = 0, e = EltBits.size(); i != e; ++i) {
if (UndefElts[i])
@@ -6513,20 +6693,26 @@ static bool getTargetShuffleMaskIndices(SDValue MaskNode,
}
/// Create a shuffle mask that matches the PACKSS/PACKUS truncation.
+/// A multi-stage pack shuffle mask is created by specifying NumStages > 1.
/// Note: This ignores saturation, so inputs must be checked first.
static void createPackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask,
- bool Unary) {
+ bool Unary, unsigned NumStages = 1) {
assert(Mask.empty() && "Expected an empty shuffle mask vector");
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = VT.getSizeInBits() / 128;
unsigned NumEltsPerLane = 128 / VT.getScalarSizeInBits();
unsigned Offset = Unary ? 0 : NumElts;
+ unsigned Repetitions = 1u << (NumStages - 1);
+ unsigned Increment = 1u << NumStages;
+ assert((NumEltsPerLane >> NumStages) > 0 && "Illegal packing compaction");
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
- for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
- Mask.push_back(Elt + (Lane * NumEltsPerLane));
- for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += 2)
- Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane));
+ for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
+ Mask.push_back(Elt + (Lane * NumEltsPerLane) + Offset);
+ }
}
}
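For reference, the single-stage (NumStages == 1) mask for a hypothetical v16i8 result with binary inputs can be reproduced with a standalone sketch of the same loop structure (plain C++, not LLVM types; one 128-bit lane assumed):

#include <cstdio>

int main() {
  const unsigned NumElts = 16, NumEltsPerLane = 16; // one 128-bit lane of i8
  const unsigned Offset = NumElts;                  // binary: second input
  const unsigned NumStages = 1;
  const unsigned Repetitions = 1u << (NumStages - 1);
  const unsigned Increment = 1u << NumStages;
  for (unsigned Stage = 0; Stage != Repetitions; ++Stage) {
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
      std::printf("%u ", Elt);          // even elements of the first input
    for (unsigned Elt = 0; Elt != NumEltsPerLane; Elt += Increment)
      std::printf("%u ", Elt + Offset); // even elements of the second input
  }
  std::printf("\n"); // 0 2 4 6 8 10 12 14 16 18 20 22 24 26 28 30
  return 0;
}

Passing NumStages == 2 doubles Increment and Repetitions, modelling two consecutive pack stages that compact 32-bit elements down to 8 bits.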
@@ -6597,7 +6783,7 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
unsigned MaskEltSize = VT.getScalarSizeInBits();
SmallVector<uint64_t, 32> RawMask;
APInt RawUndefs;
- SDValue ImmN;
+ uint64_t ImmN;
assert(Mask.empty() && "getTargetShuffleMask expects an empty Mask vector");
assert(Ops.empty() && "getTargetShuffleMask expects an empty Ops vector");
@@ -6608,23 +6794,22 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::BLENDI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeBLENDMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeBLENDMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUFP:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeSHUFPMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeSHUFPMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::INSERTPS:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeINSERTPSMask(cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeINSERTPSMask(ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::EXTRQI:
@@ -6672,13 +6857,23 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
DecodeMOVLHPSMask(NumElems, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
+ case X86ISD::VALIGN:
+ assert((VT.getScalarType() == MVT::i32 || VT.getScalarType() == MVT::i64) &&
+ "Only 32-bit and 64-bit elements are supported!");
+ assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
+ assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVALIGNMask(NumElems, ImmN, Mask);
+ IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
+ Ops.push_back(N->getOperand(1));
+ Ops.push_back(N->getOperand(0));
+ break;
case X86ISD::PALIGNR:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePALIGNRMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePALIGNRMask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
Ops.push_back(N->getOperand(1));
Ops.push_back(N->getOperand(0));
@@ -6686,39 +6881,34 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VSHLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSLLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSLLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VSRLDQ:
assert(VT.getScalarType() == MVT::i8 && "Byte vector expected");
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSRLDQMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSRLDQMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFD:
case X86ISD::VPERMILPI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFHW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFHWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFHWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::PSHUFLW:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodePSHUFLWMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodePSHUFLWMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::VZEXT_MOVL:
@@ -6770,8 +6960,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
}
case X86ISD::VPERMI:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeVPERMMask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERMMask(NumElems, ImmN, Mask);
IsUnary = true;
break;
case X86ISD::MOVSS:
@@ -6783,17 +6973,15 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
case X86ISD::VPERM2X128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- DecodeVPERM2X128Mask(NumElems, cast<ConstantSDNode>(ImmN)->getZExtValue(),
- Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ DecodeVPERM2X128Mask(NumElems, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::SHUF128:
assert(N->getOperand(0).getValueType() == VT && "Unexpected value type");
assert(N->getOperand(1).getValueType() == VT && "Unexpected value type");
- ImmN = N->getOperand(N->getNumOperands() - 1);
- decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize,
- cast<ConstantSDNode>(ImmN)->getZExtValue(), Mask);
+ ImmN = N->getConstantOperandVal(N->getNumOperands() - 1);
+ decodeVSHUF64x2FamilyMask(NumElems, MaskEltSize, ImmN, Mask);
IsUnary = IsFakeUnary = N->getOperand(0) == N->getOperand(1);
break;
case X86ISD::MOVSLDUP:
@@ -6875,9 +7063,8 @@ static bool getTargetShuffleMask(SDNode *N, MVT VT, bool AllowSentinelZero,
return false;
// Check if we're getting a shuffle mask with zero'd elements.
- if (!AllowSentinelZero)
- if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
- return false;
+ if (!AllowSentinelZero && isAnyZero(Mask))
+ return false;
// If we have a fake unary shuffle, the shuffle mask is spread across two
// inputs that are actually the same node. Re-map the mask to always point
@@ -7060,6 +7247,20 @@ static bool getTargetShuffleAndZeroables(SDValue N, SmallVectorImpl<int> &Mask,
continue;
}
+ // INSERT_SUBVECTOR - to widen vectors we often insert them into UNDEF
+ // base vectors.
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = V.getOperand(0);
+ int NumVecElts = Vec.getValueType().getVectorNumElements();
+ if (Vec.isUndef() && Size == NumVecElts) {
+ int Idx = V.getConstantOperandVal(2);
+ int NumSubElts = V.getOperand(1).getValueType().getVectorNumElements();
+ if (M < Idx || (Idx + NumSubElts) <= M)
+ KnownUndef.setBit(i);
+ }
+ continue;
+ }
+
// Attempt to extract from the source's constant bits.
if (IsSrcConstant[SrcIdx]) {
if (UndefSrcElts[SrcIdx][M])
@@ -7111,7 +7312,7 @@ static void resolveZeroablesFromTargetShuffle(const SmallVectorImpl<int> &Mask,
// TODO: Use DemandedElts variant.
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts);
// Attempt to decode ops that could be represented as a shuffle mask.
@@ -7120,7 +7321,7 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SmallVectorImpl<int> &Mask,
SmallVectorImpl<SDValue> &Ops,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
Mask.clear();
Ops.clear();
@@ -7132,6 +7333,8 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
if ((NumBitsPerElt % 8) != 0 || (NumSizeInBits % 8) != 0)
return false;
assert(NumElts == DemandedElts.getBitWidth() && "Unexpected vector size");
+ unsigned NumSizeInBytes = NumSizeInBits / 8;
+ unsigned NumBytesPerElt = NumBitsPerElt / 8;
unsigned Opcode = N.getOpcode();
switch (Opcode) {
@@ -7179,8 +7382,6 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
DAG.computeKnownBits(N.getOperand(1), DemandedElts, Depth + 1);
if (Known0.One.isNullValue() && Known1.One.isNullValue()) {
bool IsByteMask = true;
- unsigned NumSizeInBytes = NumSizeInBits / 8;
- unsigned NumBytesPerElt = NumBitsPerElt / 8;
APInt ZeroMask = APInt::getNullValue(NumBytesPerElt);
APInt SelectMask = APInt::getNullValue(NumBytesPerElt);
for (unsigned i = 0; i != NumBytesPerElt && IsByteMask; ++i) {
@@ -7220,10 +7421,21 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
!getTargetShuffleInputs(N1, SrcInputs1, SrcMask1, DAG, Depth + 1,
true))
return false;
+
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(SrcInputs0, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+ if (llvm::any_of(SrcInputs1, [VT](SDValue Op) {
+ return VT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return false;
+
size_t MaskSize = std::max(SrcMask0.size(), SrcMask1.size());
SmallVector<int, 64> Mask0, Mask1;
- scaleShuffleMask<int>(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
- scaleShuffleMask<int>(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
+ narrowShuffleMaskElts(MaskSize / SrcMask0.size(), SrcMask0, Mask0);
+ narrowShuffleMaskElts(MaskSize / SrcMask1.size(), SrcMask1, Mask1);
for (size_t i = 0; i != MaskSize; ++i) {
if (Mask0[i] == SM_SentinelUndef && Mask1[i] == SM_SentinelUndef)
Mask.push_back(SM_SentinelUndef);
@@ -7245,14 +7457,12 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
SDValue Sub = N.getOperand(1);
EVT SubVT = Sub.getValueType();
unsigned NumSubElts = SubVT.getVectorNumElements();
- if (!isa<ConstantSDNode>(N.getOperand(2)) ||
- !N->isOnlyUserOf(Sub.getNode()))
+ if (!N->isOnlyUserOf(Sub.getNode()))
return false;
uint64_t InsertIdx = N.getConstantOperandVal(2);
// Handle INSERT_SUBVECTOR(SRC0, EXTRACT_SUBVECTOR(SRC1)).
if (Sub.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- Sub.getOperand(0).getValueType() == VT &&
- isa<ConstantSDNode>(Sub.getOperand(1))) {
+ Sub.getOperand(0).getValueType() == VT) {
uint64_t ExtractIdx = Sub.getConstantOperandVal(1);
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
@@ -7268,13 +7478,20 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
if (!getTargetShuffleInputs(peekThroughOneUseBitcasts(Sub), SubInputs,
SubMask, DAG, Depth + 1, ResolveKnownElts))
return false;
+
+ // Subvector shuffle inputs must not be larger than the subvector.
+ if (llvm::any_of(SubInputs, [SubVT](SDValue SubInput) {
+ return SubVT.getSizeInBits() < SubInput.getValueSizeInBits();
+ }))
+ return false;
+
if (SubMask.size() != NumSubElts) {
assert(((SubMask.size() % NumSubElts) == 0 ||
(NumSubElts % SubMask.size()) == 0) && "Illegal submask scale");
if ((NumSubElts % SubMask.size()) == 0) {
int Scale = NumSubElts / SubMask.size();
SmallVector<int,64> ScaledSubMask;
- scaleShuffleMask<int>(Scale, SubMask, ScaledSubMask);
+ narrowShuffleMaskElts(Scale, SubMask, ScaledSubMask);
SubMask = ScaledSubMask;
} else {
int Scale = SubMask.size() / NumSubElts;
@@ -7284,14 +7501,7 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
}
Ops.push_back(Src);
- for (SDValue &SubInput : SubInputs) {
- EVT SubSVT = SubInput.getValueType().getScalarType();
- EVT AltVT = EVT::getVectorVT(*DAG.getContext(), SubSVT,
- NumSizeInBits / SubSVT.getSizeInBits());
- Ops.push_back(DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), AltVT,
- DAG.getUNDEF(AltVT), SubInput,
- DAG.getIntPtrConstant(0, SDLoc(N))));
- }
+ Ops.append(SubInputs.begin(), SubInputs.end());
for (int i = 0; i != (int)NumElts; ++i)
Mask.push_back(i);
for (int i = 0; i != (int)NumSubElts; ++i) {
@@ -7304,75 +7514,83 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
}
return true;
}
- case ISD::SCALAR_TO_VECTOR: {
- // Match against a scalar_to_vector of an extract from a vector,
- // for PEXTRW/PEXTRB we must handle the implicit zext of the scalar.
- SDValue N0 = N.getOperand(0);
- SDValue SrcExtract;
+ case X86ISD::PINSRB:
+ case X86ISD::PINSRW:
+ case ISD::SCALAR_TO_VECTOR:
+ case ISD::INSERT_VECTOR_ELT: {
+ // Match against an insert_vector_elt/scalar_to_vector of an extract from a
+ // vector, for matching src/dst vector types.
+ SDValue Scl = N.getOperand(Opcode == ISD::SCALAR_TO_VECTOR ? 0 : 1);
+
+ unsigned DstIdx = 0;
+ if (Opcode != ISD::SCALAR_TO_VECTOR) {
+ // Check we have an in-range constant insertion index.
+ if (!isa<ConstantSDNode>(N.getOperand(2)) ||
+ N.getConstantOperandAPInt(2).uge(NumElts))
+ return false;
+ DstIdx = N.getConstantOperandVal(2);
- if ((N0.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
- N0.getOperand(0).getValueType() == VT) ||
- (N0.getOpcode() == X86ISD::PEXTRW &&
- N0.getOperand(0).getValueType() == MVT::v8i16) ||
- (N0.getOpcode() == X86ISD::PEXTRB &&
- N0.getOperand(0).getValueType() == MVT::v16i8)) {
- SrcExtract = N0;
+ // Attempt to recognise an INSERT*(VEC, 0, DstIdx) shuffle pattern.
+ if (X86::isZeroNode(Scl)) {
+ Ops.push_back(N.getOperand(0));
+ for (unsigned i = 0; i != NumElts; ++i)
+ Mask.push_back(i == DstIdx ? SM_SentinelZero : (int)i);
+ return true;
+ }
}
+ // Peek through trunc/aext/zext.
+ // TODO: aext shouldn't require SM_SentinelZero padding.
+ // TODO: handle shift of scalars.
+ unsigned MinBitsPerElt = Scl.getScalarValueSizeInBits();
+ while (Scl.getOpcode() == ISD::TRUNCATE ||
+ Scl.getOpcode() == ISD::ANY_EXTEND ||
+ Scl.getOpcode() == ISD::ZERO_EXTEND) {
+ Scl = Scl.getOperand(0);
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, Scl.getScalarValueSizeInBits());
+ }
+ if ((MinBitsPerElt % 8) != 0)
+ return false;
+
+ // Attempt to find the source vector the scalar was extracted from.
+ SDValue SrcExtract;
+ if ((Scl.getOpcode() == ISD::EXTRACT_VECTOR_ELT ||
+ Scl.getOpcode() == X86ISD::PEXTRW ||
+ Scl.getOpcode() == X86ISD::PEXTRB) &&
+ Scl.getOperand(0).getValueSizeInBits() == NumSizeInBits) {
+ SrcExtract = Scl;
+ }
if (!SrcExtract || !isa<ConstantSDNode>(SrcExtract.getOperand(1)))
return false;
SDValue SrcVec = SrcExtract.getOperand(0);
EVT SrcVT = SrcVec.getValueType();
- unsigned NumSrcElts = SrcVT.getVectorNumElements();
- unsigned NumZeros = (NumBitsPerElt / SrcVT.getScalarSizeInBits()) - 1;
-
- unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
- if (NumSrcElts <= SrcIdx)
+ if (!SrcVT.getScalarType().isByteSized())
return false;
-
- Ops.push_back(SrcVec);
- Mask.push_back(SrcIdx);
- Mask.append(NumZeros, SM_SentinelZero);
- Mask.append(NumSrcElts - Mask.size(), SM_SentinelUndef);
- return true;
- }
- case X86ISD::PINSRB:
- case X86ISD::PINSRW: {
- SDValue InVec = N.getOperand(0);
- SDValue InScl = N.getOperand(1);
- SDValue InIndex = N.getOperand(2);
- if (!isa<ConstantSDNode>(InIndex) ||
- cast<ConstantSDNode>(InIndex)->getAPIntValue().uge(NumElts))
- return false;
- uint64_t InIdx = N.getConstantOperandVal(2);
-
- // Attempt to recognise a PINSR*(VEC, 0, Idx) shuffle pattern.
- if (X86::isZeroNode(InScl)) {
- Ops.push_back(InVec);
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InIdx ? SM_SentinelZero : (int)i);
- return true;
+ unsigned SrcIdx = SrcExtract.getConstantOperandVal(1);
+ unsigned SrcByte = SrcIdx * (SrcVT.getScalarSizeInBits() / 8);
+ unsigned DstByte = DstIdx * NumBytesPerElt;
+ MinBitsPerElt =
+ std::min<unsigned>(MinBitsPerElt, SrcVT.getScalarSizeInBits());
+
+ // Create 'identity' byte level shuffle mask and then add inserted bytes.
+ if (Opcode == ISD::SCALAR_TO_VECTOR) {
+ Ops.push_back(SrcVec);
+ Mask.append(NumSizeInBytes, SM_SentinelUndef);
+ } else {
+ Ops.push_back(SrcVec);
+ Ops.push_back(N.getOperand(0));
+ for (int i = 0; i != (int)NumSizeInBytes; ++i)
+ Mask.push_back(NumSizeInBytes + i);
}
- // Attempt to recognise a PINSR*(PEXTR*) shuffle pattern.
- // TODO: Expand this to support INSERT_VECTOR_ELT/etc.
- unsigned ExOp =
- (X86ISD::PINSRB == Opcode ? X86ISD::PEXTRB : X86ISD::PEXTRW);
- if (InScl.getOpcode() != ExOp)
- return false;
-
- SDValue ExVec = InScl.getOperand(0);
- SDValue ExIndex = InScl.getOperand(1);
- if (!isa<ConstantSDNode>(ExIndex) ||
- cast<ConstantSDNode>(ExIndex)->getAPIntValue().uge(NumElts))
- return false;
- uint64_t ExIdx = InScl.getConstantOperandVal(1);
-
- Ops.push_back(InVec);
- Ops.push_back(ExVec);
- for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(i == InIdx ? NumElts + ExIdx : i);
+ unsigned MinBytesPerElts = MinBitsPerElt / 8;
+ MinBytesPerElts = std::min(MinBytesPerElts, NumBytesPerElt);
+ for (unsigned i = 0; i != MinBytesPerElts; ++i)
+ Mask[DstByte + i] = SrcByte + i;
+ for (unsigned i = MinBytesPerElts; i < NumBytesPerElt; ++i)
+ Mask[DstByte + i] = SM_SentinelZero;
return true;
}
case X86ISD::PACKSS:
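For the hunk above, the new INSERT_VECTOR_ELT / PINSR* handling builds a byte-level mask rather than an element-level one. Below is a minimal standalone sketch of that mask construction, in plain C++ with made-up example sizes (inserting a zero-extended i8 into element 3 of a v8i16); none of these values come from the patch itself.

// Standalone sketch, not part of the patch: byte-level mask for inserting a
// zero-extended i8 scalar into element 3 of a v8i16 base vector.
#include <cstdio>
#include <vector>

constexpr int SM_SentinelZero = -2; // same sentinel value used by the x86 backend

int main() {
  unsigned NumSizeInBytes = 16; // 128-bit shuffle
  unsigned NumBytesPerElt = 2;  // v8i16 destination elements
  unsigned DstIdx = 3;          // insertion index
  unsigned SrcByte = 4;         // byte offset of the extracted scalar
  unsigned MinBytesPerElts = 1; // only 8 bits of the scalar are meaningful
  unsigned DstByte = DstIdx * NumBytesPerElt;

  // Identity mask over the base vector; it is shuffle input 1, so its bytes
  // live at indices [NumSizeInBytes, 2 * NumSizeInBytes).
  std::vector<int> Mask;
  for (unsigned i = 0; i != NumSizeInBytes; ++i)
    Mask.push_back((int)(NumSizeInBytes + i));

  // Copy the inserted bytes from input 0 (the extract source), then zero the
  // rest of the destination element, matching the zero-extended scalar.
  for (unsigned i = 0; i != MinBytesPerElts; ++i)
    Mask[DstByte + i] = (int)(SrcByte + i);
  for (unsigned i = MinBytesPerElts; i != NumBytesPerElt; ++i)
    Mask[DstByte + i] = SM_SentinelZero;

  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}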
@@ -7412,6 +7630,23 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
createPackShuffleMask(VT, Mask, IsUnary);
return true;
}
+ case X86ISD::VTRUNC: {
+ SDValue Src = N.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ // Truncated source must be a simple vector.
+ if (!SrcVT.isSimple() || (SrcVT.getSizeInBits() % 128) != 0 ||
+ (SrcVT.getScalarSizeInBits() % 8) != 0)
+ return false;
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ unsigned NumBitsPerSrcElt = SrcVT.getScalarSizeInBits();
+ unsigned Scale = NumBitsPerSrcElt / NumBitsPerElt;
+ assert((NumBitsPerSrcElt % NumBitsPerElt) == 0 && "Illegal truncation");
+ for (unsigned i = 0; i != NumSrcElts; ++i)
+ Mask.push_back(i * Scale);
+ Mask.append(NumElts - NumSrcElts, SM_SentinelZero);
+ Ops.push_back(Src);
+ return true;
+ }
case X86ISD::VSHLI:
case X86ISD::VSRLI: {
uint64_t ShiftVal = N.getConstantOperandVal(1);
@@ -7426,40 +7661,43 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
break;
uint64_t ByteShift = ShiftVal / 8;
- unsigned NumBytes = NumSizeInBits / 8;
- unsigned NumBytesPerElt = NumBitsPerElt / 8;
Ops.push_back(N.getOperand(0));
// Clear mask to all zeros and insert the shifted byte indices.
- Mask.append(NumBytes, SM_SentinelZero);
+ Mask.append(NumSizeInBytes, SM_SentinelZero);
if (X86ISD::VSHLI == Opcode) {
- for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j] = i + j - ByteShift;
} else {
- for (unsigned i = 0; i != NumBytes; i += NumBytesPerElt)
+ for (unsigned i = 0; i != NumSizeInBytes; i += NumBytesPerElt)
for (unsigned j = ByteShift; j != NumBytesPerElt; ++j)
Mask[i + j - ByteShift] = i + j;
}
return true;
}
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI: {
+ // We can only decode 'whole byte' bit rotates as shuffles.
+ uint64_t RotateVal = N.getConstantOperandAPInt(1).urem(NumBitsPerElt);
+ if ((RotateVal % 8) != 0)
+ return false;
+ Ops.push_back(N.getOperand(0));
+ int Offset = RotateVal / 8;
+ Offset = (X86ISD::VROTLI == Opcode ? NumBytesPerElt - Offset : Offset);
+ for (int i = 0; i != (int)NumElts; ++i) {
+ int BaseIdx = i * NumBytesPerElt;
+ for (int j = 0; j != (int)NumBytesPerElt; ++j) {
+ Mask.push_back(BaseIdx + ((Offset + j) % NumBytesPerElt));
+ }
+ }
+ return true;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
- MVT SrcVT = Src.getSimpleValueType();
- if (!SrcVT.isVector())
+ if (!Src.getSimpleValueType().isVector())
return false;
-
- if (NumSizeInBits != SrcVT.getSizeInBits()) {
- assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
- "Illegal broadcast type");
- SrcVT = MVT::getVectorVT(SrcVT.getScalarType(),
- NumSizeInBits / SrcVT.getScalarSizeInBits());
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
- DAG.getUNDEF(SrcVT), Src,
- DAG.getIntPtrConstant(0, SDLoc(N)));
- }
-
Ops.push_back(Src);
Mask.append(NumElts, 0);
return true;
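The VROTLI/VROTRI case above only fires for whole-byte rotates. The throwaway snippet below (assumed example: v4i32 rotated left by 8 bits) reproduces the mask arithmetic and prints the resulting per-element byte permutation.

// Illustration only (values assumed): the byte mask the VROTLI case emits for
// a v4i32 rotate-left by 8 bits.
#include <cstdio>
#include <vector>

int main() {
  unsigned NumElts = 4, NumBytesPerElt = 4; // v4i32
  unsigned RotateVal = 8;                   // bits; must be a whole-byte amount
  bool IsROTL = true;

  int Offset = (int)(RotateVal / 8);
  Offset = IsROTL ? (int)NumBytesPerElt - Offset : Offset;

  std::vector<int> Mask;
  for (int i = 0; i != (int)NumElts; ++i) {
    int BaseIdx = i * (int)NumBytesPerElt;
    for (int j = 0; j != (int)NumBytesPerElt; ++j)
      Mask.push_back(BaseIdx + ((Offset + j) % (int)NumBytesPerElt));
  }

  // Prints 3 0 1 2  7 4 5 6  11 8 9 10  15 12 13 14.
  for (int M : Mask)
    std::printf("%d ", M);
  std::printf("\n");
}

For VROTRI the offset is used unflipped, so the same loop yields 1 2 3 0 per element instead.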
@@ -7476,22 +7714,10 @@ static bool getFauxShuffleMask(SDValue N, const APInt &DemandedElts,
(SrcVT.getScalarSizeInBits() % 8) != 0)
return false;
- unsigned NumSrcBitsPerElt = SrcVT.getScalarSizeInBits();
bool IsAnyExtend =
(ISD::ANY_EXTEND == Opcode || ISD::ANY_EXTEND_VECTOR_INREG == Opcode);
- DecodeZeroExtendMask(NumSrcBitsPerElt, NumBitsPerElt, NumElts, IsAnyExtend,
- Mask);
-
- if (NumSizeInBits != SrcVT.getSizeInBits()) {
- assert((NumSizeInBits % SrcVT.getSizeInBits()) == 0 &&
- "Illegal zero-extension type");
- SrcVT = MVT::getVectorVT(SrcVT.getSimpleVT().getScalarType(),
- NumSizeInBits / NumSrcBitsPerElt);
- Src = DAG.getNode(ISD::INSERT_SUBVECTOR, SDLoc(N), SrcVT,
- DAG.getUNDEF(SrcVT), Src,
- DAG.getIntPtrConstant(0, SDLoc(N)));
- }
-
+ DecodeZeroExtendMask(SrcVT.getScalarSizeInBits(), NumBitsPerElt, NumElts,
+ IsAnyExtend, Mask);
Ops.push_back(Src);
return true;
}
@@ -7549,7 +7775,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
APInt &KnownUndef, APInt &KnownZero,
- SelectionDAG &DAG, unsigned Depth,
+ const SelectionDAG &DAG, unsigned Depth,
bool ResolveKnownElts) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
@@ -7570,7 +7796,7 @@ static bool getTargetShuffleInputs(SDValue Op, const APInt &DemandedElts,
static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
SmallVectorImpl<int> &Mask,
- SelectionDAG &DAG, unsigned Depth = 0,
+ const SelectionDAG &DAG, unsigned Depth = 0,
bool ResolveKnownElts = true) {
EVT VT = Op.getValueType();
if (!VT.isSimple() || !VT.isVector())
@@ -7583,93 +7809,107 @@ static bool getTargetShuffleInputs(SDValue Op, SmallVectorImpl<SDValue> &Inputs,
KnownZero, DAG, Depth, ResolveKnownElts);
}
-/// Returns the scalar element that will make up the ith
+/// Returns the scalar element that will make up the i'th
/// element of the result of the vector shuffle.
-static SDValue getShuffleScalarElt(SDNode *N, unsigned Index, SelectionDAG &DAG,
- unsigned Depth) {
- if (Depth == 6)
- return SDValue(); // Limit search depth.
+static SDValue getShuffleScalarElt(SDValue Op, unsigned Index,
+ SelectionDAG &DAG, unsigned Depth) {
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue(); // Limit search depth.
- SDValue V = SDValue(N, 0);
- EVT VT = V.getValueType();
- unsigned Opcode = V.getOpcode();
+ EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned NumElems = VT.getVectorNumElements();
// Recurse into ISD::VECTOR_SHUFFLE node to find scalars.
- if (const ShuffleVectorSDNode *SV = dyn_cast<ShuffleVectorSDNode>(N)) {
+ if (auto *SV = dyn_cast<ShuffleVectorSDNode>(Op)) {
int Elt = SV->getMaskElt(Index);
if (Elt < 0)
return DAG.getUNDEF(VT.getVectorElementType());
- unsigned NumElems = VT.getVectorNumElements();
- SDValue NewV = (Elt < (int)NumElems) ? SV->getOperand(0)
- : SV->getOperand(1);
- return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG, Depth+1);
+ SDValue Src = (Elt < (int)NumElems) ? SV->getOperand(0) : SV->getOperand(1);
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into target specific vector shuffles to find scalars.
if (isTargetShuffle(Opcode)) {
- MVT ShufVT = V.getSimpleValueType();
+ MVT ShufVT = VT.getSimpleVT();
MVT ShufSVT = ShufVT.getVectorElementType();
int NumElems = (int)ShufVT.getVectorNumElements();
SmallVector<int, 16> ShuffleMask;
SmallVector<SDValue, 16> ShuffleOps;
bool IsUnary;
- if (!getTargetShuffleMask(N, ShufVT, true, ShuffleOps, ShuffleMask, IsUnary))
+ if (!getTargetShuffleMask(Op.getNode(), ShufVT, true, ShuffleOps,
+ ShuffleMask, IsUnary))
return SDValue();
int Elt = ShuffleMask[Index];
if (Elt == SM_SentinelZero)
- return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(N), ShufSVT)
- : DAG.getConstantFP(+0.0, SDLoc(N), ShufSVT);
+ return ShufSVT.isInteger() ? DAG.getConstant(0, SDLoc(Op), ShufSVT)
+ : DAG.getConstantFP(+0.0, SDLoc(Op), ShufSVT);
if (Elt == SM_SentinelUndef)
return DAG.getUNDEF(ShufSVT);
- assert(0 <= Elt && Elt < (2*NumElems) && "Shuffle index out of range");
- SDValue NewV = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
- return getShuffleScalarElt(NewV.getNode(), Elt % NumElems, DAG,
- Depth+1);
+ assert(0 <= Elt && Elt < (2 * NumElems) && "Shuffle index out of range");
+ SDValue Src = (Elt < NumElems) ? ShuffleOps[0] : ShuffleOps[1];
+ return getShuffleScalarElt(Src, Elt % NumElems, DAG, Depth + 1);
}
// Recurse into insert_subvector base/sub vector to find scalars.
- if (Opcode == ISD::INSERT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(2))) {
- SDValue Vec = N->getOperand(0);
- SDValue Sub = N->getOperand(1);
- EVT SubVT = Sub.getValueType();
- unsigned NumSubElts = SubVT.getVectorNumElements();
- uint64_t SubIdx = N->getConstantOperandVal(2);
+ if (Opcode == ISD::INSERT_SUBVECTOR) {
+ SDValue Vec = Op.getOperand(0);
+ SDValue Sub = Op.getOperand(1);
+ uint64_t SubIdx = Op.getConstantOperandVal(2);
+ unsigned NumSubElts = Sub.getValueType().getVectorNumElements();
if (SubIdx <= Index && Index < (SubIdx + NumSubElts))
- return getShuffleScalarElt(Sub.getNode(), Index - SubIdx, DAG, Depth + 1);
- return getShuffleScalarElt(Vec.getNode(), Index, DAG, Depth + 1);
+ return getShuffleScalarElt(Sub, Index - SubIdx, DAG, Depth + 1);
+ return getShuffleScalarElt(Vec, Index, DAG, Depth + 1);
+ }
+
+ // Recurse into concat_vectors sub vector to find scalars.
+ if (Opcode == ISD::CONCAT_VECTORS) {
+ EVT SubVT = Op.getOperand(0).getValueType();
+ unsigned NumSubElts = SubVT.getVectorNumElements();
+ uint64_t SubIdx = Index / NumSubElts;
+ uint64_t SubElt = Index % NumSubElts;
+ return getShuffleScalarElt(Op.getOperand(SubIdx), SubElt, DAG, Depth + 1);
}
// Recurse into extract_subvector src vector to find scalars.
- if (Opcode == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(N->getOperand(1))) {
- SDValue Src = N->getOperand(0);
- uint64_t SrcIdx = N->getConstantOperandVal(1);
- return getShuffleScalarElt(Src.getNode(), Index + SrcIdx, DAG, Depth + 1);
+ if (Opcode == ISD::EXTRACT_SUBVECTOR) {
+ SDValue Src = Op.getOperand(0);
+ uint64_t SrcIdx = Op.getConstantOperandVal(1);
+ return getShuffleScalarElt(Src, Index + SrcIdx, DAG, Depth + 1);
}
- // Actual nodes that may contain scalar elements
+ // We only peek through bitcasts of the same vector width.
if (Opcode == ISD::BITCAST) {
- V = V.getOperand(0);
- EVT SrcVT = V.getValueType();
- unsigned NumElems = VT.getVectorNumElements();
+ SDValue Src = Op.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() && SrcVT.getVectorNumElements() == NumElems)
+ return getShuffleScalarElt(Src, Index, DAG, Depth + 1);
+ return SDValue();
+ }
- if (!SrcVT.isVector() || SrcVT.getVectorNumElements() != NumElems)
- return SDValue();
+ // Actual nodes that may contain scalar elements
+
+ // For insert_vector_elt - either return the index matching scalar or recurse
+ // into the base vector.
+ if (Opcode == ISD::INSERT_VECTOR_ELT &&
+ isa<ConstantSDNode>(Op.getOperand(2))) {
+ if (Op.getConstantOperandAPInt(2) == Index)
+ return Op.getOperand(1);
+ return getShuffleScalarElt(Op.getOperand(0), Index, DAG, Depth + 1);
}
- if (V.getOpcode() == ISD::SCALAR_TO_VECTOR)
- return (Index == 0) ? V.getOperand(0)
+ if (Opcode == ISD::SCALAR_TO_VECTOR)
+ return (Index == 0) ? Op.getOperand(0)
: DAG.getUNDEF(VT.getVectorElementType());
- if (V.getOpcode() == ISD::BUILD_VECTOR)
- return V.getOperand(Index);
+ if (Opcode == ISD::BUILD_VECTOR)
+ return Op.getOperand(Index);
return SDValue();
}
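The new CONCAT_VECTORS case in getShuffleScalarElt is just an index split before recursing; a tiny illustration follows (hypothetical v4i32 operands, so NumSubElts == 4).

// Index split performed by the new CONCAT_VECTORS case (assumed example).
#include <cstdio>

int main() {
  unsigned NumSubElts = 4;
  for (unsigned Index = 0; Index != 8; ++Index) {
    unsigned SubIdx = Index / NumSubElts; // which concatenated operand to recurse into
    unsigned SubElt = Index % NumSubElts; // element within that operand
    std::printf("Index %u -> operand %u, element %u\n", Index, SubIdx, SubElt);
  }
}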
@@ -7762,10 +8002,11 @@ static SDValue LowerBuildVectorv16i8(SDValue Op, unsigned NonZeros,
Elt = NextElt;
}
- // If our first insertion is not the first index then insert into zero
- // vector to break any register dependency else use SCALAR_TO_VECTOR.
+ // If our first insertion is not the first index or zeros are needed, then
+ // insert into a zero vector. Otherwise, use SCALAR_TO_VECTOR (leaves high
+ // elements undefined).
if (!V) {
- if (i != 0)
+ if (i != 0 || NumZero)
V = getZeroVector(MVT::v8i16, Subtarget, DAG, dl);
else {
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4i32, Elt);
@@ -7964,11 +8205,12 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// FIXME: 256-bit vector instructions don't require a strict alignment,
// improve this code to support it better.
- unsigned RequiredAlign = VT.getSizeInBits()/8;
+ Align RequiredAlign(VT.getSizeInBits() / 8);
SDValue Chain = LD->getChain();
// Make sure the stack object alignment is at least 16 or 32.
MachineFrameInfo &MFI = DAG.getMachineFunction().getFrameInfo();
- if (DAG.InferPtrAlignment(Ptr) < RequiredAlign) {
+ MaybeAlign InferredAlign = DAG.InferPtrAlign(Ptr);
+ if (!InferredAlign || *InferredAlign < RequiredAlign) {
if (MFI.isFixedObjectIndex(FI)) {
// Can't change the alignment. FIXME: It's possible to compute
// the exact stack offset and reference FI + adjust offset instead.
@@ -7983,9 +8225,9 @@ static SDValue LowerAsSplatVectorLoad(SDValue SrcOp, MVT VT, const SDLoc &dl,
// Ptr + (Offset & ~15).
if (Offset < 0)
return SDValue();
- if ((Offset % RequiredAlign) & 3)
+ if ((Offset % RequiredAlign.value()) & 3)
return SDValue();
- int64_t StartOffset = Offset & ~int64_t(RequiredAlign - 1);
+ int64_t StartOffset = Offset & ~int64_t(RequiredAlign.value() - 1);
if (StartOffset) {
SDLoc DL(Ptr);
Ptr = DAG.getNode(ISD::ADD, DL, Ptr.getValueType(), Ptr,
@@ -8024,8 +8266,8 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
case ISD::SCALAR_TO_VECTOR:
return findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset);
case ISD::SRL:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
+ uint64_t Idx = IdxC->getZExtValue();
if ((Idx % 8) == 0 && findEltLoadSrc(Elt.getOperand(0), Ld, ByteOffset)) {
ByteOffset += Idx / 8;
return true;
@@ -8033,13 +8275,13 @@ static bool findEltLoadSrc(SDValue Elt, LoadSDNode *&Ld, int64_t &ByteOffset) {
}
break;
case ISD::EXTRACT_VECTOR_ELT:
- if (isa<ConstantSDNode>(Elt.getOperand(1))) {
+ if (auto *IdxC = dyn_cast<ConstantSDNode>(Elt.getOperand(1))) {
SDValue Src = Elt.getOperand(0);
unsigned SrcSizeInBits = Src.getScalarValueSizeInBits();
unsigned DstSizeInBits = Elt.getScalarValueSizeInBits();
if (DstSizeInBits == SrcSizeInBits && (SrcSizeInBits % 8) == 0 &&
findEltLoadSrc(Src, Ld, ByteOffset)) {
- uint64_t Idx = Elt.getConstantOperandVal(1);
+ uint64_t Idx = IdxC->getZExtValue();
ByteOffset += Idx * (SrcSizeInBits / 8);
return true;
}
@@ -8169,7 +8411,8 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
"Cannot merge volatile or atomic loads.");
SDValue NewLd =
DAG.getLoad(VT, DL, LDBase->getChain(), LDBase->getBasePtr(),
- LDBase->getPointerInfo(), LDBase->getAlignment(), MMOFlags);
+ LDBase->getPointerInfo(), LDBase->getOriginalAlign(),
+ MMOFlags);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, NewLd);
@@ -8247,14 +8490,16 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
MVT VecSVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(LoadSizeInBits)
: MVT::getIntegerVT(LoadSizeInBits);
MVT VecVT = MVT::getVectorVT(VecSVT, VT.getSizeInBits() / LoadSizeInBits);
+ // Allow v4f32 on SSE1 only targets.
+ // FIXME: Add more isel patterns so we can just use VT directly.
+ if (!Subtarget.hasSSE2() && VT == MVT::v4f32)
+ VecVT = MVT::v4f32;
if (TLI.isTypeLegal(VecVT)) {
SDVTList Tys = DAG.getVTList(VecVT, MVT::Other);
SDValue Ops[] = { LDBase->getChain(), LDBase->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT,
- LDBase->getPointerInfo(),
- LDBase->getAlignment(),
- MachineMemOperand::MOLoad);
+ SDValue ResNode = DAG.getMemIntrinsicNode(
+ X86ISD::VZEXT_LOAD, DL, Tys, Ops, VecSVT, LDBase->getPointerInfo(),
+ LDBase->getOriginalAlign(), MachineMemOperand::MOLoad);
for (auto *LD : Loads)
if (LD)
DAG.makeEquivalentMemoryOrdering(LD, ResNode);
@@ -8318,13 +8563,13 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, ArrayRef<SDValue> Elts,
// Combine a vector ops (shuffles etc.) that is equal to build_vector load1,
// load2, load3, load4, <0, 1, 2, 3> into a vector load if the load addresses
// are consecutive, non-overlapping, and in the right order.
-static SDValue combineToConsecutiveLoads(EVT VT, SDNode *N, const SDLoc &DL,
+static SDValue combineToConsecutiveLoads(EVT VT, SDValue Op, const SDLoc &DL,
SelectionDAG &DAG,
const X86Subtarget &Subtarget,
bool isAfterLegalize) {
SmallVector<SDValue, 64> Elts;
for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
- if (SDValue Elt = getShuffleScalarElt(N, i, DAG, 0)) {
+ if (SDValue Elt = getShuffleScalarElt(Op, i, DAG, 0)) {
Elts.push_back(Elt);
continue;
}
@@ -8439,7 +8684,7 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SDValue Ld = BVOp->getSplatValue(&UndefElements);
// Attempt to use VBROADCASTM
- // From this paterrn:
+ // From this pattern:
// a. t0 = (zext_i64 (bitcast_i8 v2i1 X))
// b. t1 = (build_vector t0 t0)
//
@@ -8486,8 +8731,8 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
LLVMContext *Ctx = DAG.getContext();
MVT PVT = TLI.getPointerTy(DAG.getDataLayout());
if (Subtarget.hasAVX()) {
- if (SplatBitSize <= 64 && Subtarget.hasAVX2() &&
- !(SplatBitSize == 64 && Subtarget.is32Bit())) {
+ if (SplatBitSize == 32 || SplatBitSize == 64 ||
+ (SplatBitSize < 32 && Subtarget.hasAVX2())) {
// Splatted value can fit in one INTEGER constant in constant pool.
// Load the constant and broadcast it.
MVT CVT = MVT::getIntegerVT(SplatBitSize);
@@ -8496,46 +8741,25 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
SDValue CP = DAG.getConstantPool(C, PVT);
unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
- MVT::getVectorVT(CVT, Repeat), Ld);
- return DAG.getBitcast(VT, Brdcst);
- } else if (SplatBitSize == 32 || SplatBitSize == 64) {
- // Splatted value can fit in one FLOAT constant in constant pool.
- // Load the constant and broadcast it.
- // AVX have support for 32 and 64 bit broadcast for floats only.
- // No 64bit integer in 32bit subtarget.
- MVT CVT = MVT::getFloatingPointVT(SplatBitSize);
- // Lower the splat via APFloat directly, to avoid any conversion.
- Constant *C =
- SplatBitSize == 32
- ? ConstantFP::get(*Ctx,
- APFloat(APFloat::IEEEsingle(), SplatValue))
- : ConstantFP::get(*Ctx,
- APFloat(APFloat::IEEEdouble(), SplatValue));
- SDValue CP = DAG.getConstantPool(C, PVT);
- unsigned Repeat = VT.getSizeInBits() / SplatBitSize;
-
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
- SDValue Brdcst = DAG.getNode(X86ISD::VBROADCAST, dl,
- MVT::getVectorVT(CVT, Repeat), Ld);
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ SDVTList Tys =
+ DAG.getVTList(MVT::getVectorVT(CVT, Repeat), MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ SDValue Brdcst = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT, MPI, Alignment,
+ MachineMemOperand::MOLoad);
return DAG.getBitcast(VT, Brdcst);
- } else if (SplatBitSize > 64) {
+ }
+ if (SplatBitSize > 64) {
// Load the vector of constants and broadcast it.
MVT CVT = VT.getScalarType();
Constant *VecC = getConstantVector(VT, SplatValue, SplatBitSize,
*Ctx);
SDValue VCP = DAG.getConstantPool(VecC, PVT);
unsigned NumElm = SplatBitSize / VT.getScalarSizeInBits();
- unsigned Alignment = cast<ConstantPoolSDNode>(VCP)->getAlignment();
+ Align Alignment = cast<ConstantPoolSDNode>(VCP)->getAlign();
Ld = DAG.getLoad(
MVT::getVectorVT(CVT, NumElm), dl, DAG.getEntryNode(), VCP,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
@@ -8560,10 +8784,12 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
bool ConstSplatVal =
(Ld.getOpcode() == ISD::Constant || Ld.getOpcode() == ISD::ConstantFP);
+ bool IsLoad = ISD::isNormalLoad(Ld.getNode());
// Make sure that all of the users of a non-constant load are from the
// BUILD_VECTOR node.
- if (!ConstSplatVal && !BVOp->isOnlyUserOf(Ld.getNode()))
+ // FIXME: Is the use count needed for the non-constant, non-load case?
+ if (!ConstSplatVal && !IsLoad && !BVOp->isOnlyUserOf(Ld.getNode()))
return SDValue();
unsigned ScalarSize = Ld.getValueSizeInBits();
@@ -8603,18 +8829,17 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
SDValue CP =
DAG.getConstantPool(C, TLI.getPointerTy(DAG.getDataLayout()));
- unsigned Alignment = cast<ConstantPoolSDNode>(CP)->getAlignment();
- Ld = DAG.getLoad(
- CVT, dl, DAG.getEntryNode(), CP,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- Alignment);
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {DAG.getEntryNode(), CP};
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ return DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops, CVT,
+ MPI, Alignment, MachineMemOperand::MOLoad);
}
}
- bool IsLoad = ISD::isNormalLoad(Ld.getNode());
-
// Handle AVX2 in-register broadcasts.
if (!IsLoad && Subtarget.hasInt256() &&
(ScalarSize == 32 || (IsGE256 && ScalarSize == 64)))
@@ -8624,15 +8849,34 @@ static SDValue lowerBuildVectorAsBroadcast(BuildVectorSDNode *BVOp,
if (!IsLoad)
return SDValue();
+ // Make sure the non-chain result is only used by this build vector.
+ if (!Ld->hasNUsesOfValue(NumElts - NumUndefElts, 0))
+ return SDValue();
+
if (ScalarSize == 32 || (IsGE256 && ScalarSize == 64) ||
- (Subtarget.hasVLX() && ScalarSize == 64))
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ (Subtarget.hasVLX() && ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
+ }
// The integer check is needed for the 64-bit into 128-bit so it doesn't match
// double since there is no vbroadcastsd xmm
- if (Subtarget.hasInt256() && Ld.getValueType().isInteger()) {
- if (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)
- return DAG.getNode(X86ISD::VBROADCAST, dl, VT, Ld);
+ if (Subtarget.hasInt256() && Ld.getValueType().isInteger() &&
+ (ScalarSize == 8 || ScalarSize == 16 || ScalarSize == 64)) {
+ auto *LN = cast<LoadSDNode>(Ld);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue BCast =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, dl, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BCast.getValue(1));
+ return BCast;
}
// Unsupported broadcast.
@@ -8746,20 +8990,6 @@ static SDValue buildFromShuffleMostly(SDValue Op, SelectionDAG &DAG) {
return NV;
}
-static SDValue ConvertI1VectorToInteger(SDValue Op, SelectionDAG &DAG) {
- assert(ISD::isBuildVectorOfConstantSDNodes(Op.getNode()) &&
- Op.getScalarValueSizeInBits() == 1 &&
- "Can not convert non-constant vector");
- uint64_t Immediate = 0;
- for (unsigned idx = 0, e = Op.getNumOperands(); idx < e; ++idx) {
- SDValue In = Op.getOperand(idx);
- if (!In.isUndef())
- Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
- }
- SDLoc dl(Op);
- MVT VT = MVT::getIntegerVT(std::max((int)Op.getValueSizeInBits(), 8));
- return DAG.getConstant(Immediate, dl, VT);
-}
// Lower BUILD_VECTOR operation for v8i1 and v16i1 types.
static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
@@ -8782,11 +9012,11 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
SDValue In = Op.getOperand(idx);
if (In.isUndef())
continue;
- if (!isa<ConstantSDNode>(In))
- NonConstIdx.push_back(idx);
- else {
- Immediate |= (cast<ConstantSDNode>(In)->getZExtValue() & 0x1) << idx;
+ if (auto *InC = dyn_cast<ConstantSDNode>(In)) {
+ Immediate |= (InC->getZExtValue() & 0x1) << idx;
HasConstElts = true;
+ } else {
+ NonConstIdx.push_back(idx);
}
if (SplatIdx < 0)
SplatIdx = idx;
@@ -8805,9 +9035,24 @@ static SDValue LowerBUILD_VECTORvXi1(SDValue Op, SelectionDAG &DAG,
if (Cond.getOpcode() != ISD::SETCC)
Cond = DAG.getNode(ISD::AND, dl, MVT::i8, Cond,
DAG.getConstant(1, dl, MVT::i8));
- return DAG.getSelect(dl, VT, Cond,
- DAG.getConstant(1, dl, VT),
- DAG.getConstant(0, dl, VT));
+
+ // Perform the select in the scalar domain so we can use cmov.
+ if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ SDValue Select = DAG.getSelect(dl, MVT::i32, Cond,
+ DAG.getAllOnesConstant(dl, MVT::i32),
+ DAG.getConstant(0, dl, MVT::i32));
+ Select = DAG.getBitcast(MVT::v32i1, Select);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Select, Select);
+ } else {
+ MVT ImmVT = MVT::getIntegerVT(std::max((unsigned)VT.getSizeInBits(), 8U));
+ SDValue Select = DAG.getSelect(dl, ImmVT, Cond,
+ DAG.getAllOnesConstant(dl, ImmVT),
+ DAG.getConstant(0, dl, ImmVT));
+ MVT VecVT = VT.getSizeInBits() >= 8 ? VT : MVT::v8i1;
+ Select = DAG.getBitcast(VecVT, Select);
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, Select,
+ DAG.getIntPtrConstant(0, dl));
+ }
}
// insert elements one by one
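The splat-SETCC path added above materializes the whole mask from one scalar select. The scalar model below (illustrative only, skipping the v64i1-on-32-bit special case) shows why selecting an all-ones immediate and reinterpreting it as i1 lanes reproduces the splat.

// Scalar model of the splat path (assumed v16i1 mask, condition known at
// runtime as a bool); not the real DAG lowering.
#include <cstdint>
#include <cstdio>

int main() {
  bool Cond = true;       // the splatted i1, e.g. a SETCC result
  unsigned NumLanes = 16; // v16i1
  uint32_t ImmBits = Cond ? ~0u : 0u; // select of all-ones vs zero immediate
  uint32_t MaskBits = ImmBits & (NumLanes < 32 ? (1u << NumLanes) - 1 : ~0u);
  for (unsigned i = 0; i != NumLanes; ++i)
    std::printf("%u", (MaskBits >> i) & 1); // every lane equals Cond
  std::printf("\n");
}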
@@ -8907,8 +9152,8 @@ static bool isHorizontalBinOpPart(const BuildVectorSDNode *N, unsigned Opcode,
if (!CanFold)
break;
- unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
- unsigned I1 = cast<ConstantSDNode>(Op1.getOperand(1))->getZExtValue();
+ unsigned I0 = Op0.getConstantOperandVal(1);
+ unsigned I1 = Op1.getConstantOperandVal(1);
if (i * 2 < NumElts) {
if (V0.isUndef()) {
@@ -9056,11 +9301,10 @@ static bool isAddSubOrSubAdd(const BuildVectorSDNode *BV,
if (Op0.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
Op1.getOpcode() != ISD::EXTRACT_VECTOR_ELT ||
!isa<ConstantSDNode>(Op0.getOperand(1)) ||
- !isa<ConstantSDNode>(Op1.getOperand(1)) ||
Op0.getOperand(1) != Op1.getOperand(1))
return false;
- unsigned I0 = cast<ConstantSDNode>(Op0.getOperand(1))->getZExtValue();
+ unsigned I0 = Op0.getConstantOperandVal(1);
if (I0 != i)
return false;
@@ -9445,6 +9689,9 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
return SDValue();
}
+static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
+ SelectionDAG &DAG);
+
/// If a BUILD_VECTOR's source elements all apply the same bit operation and
/// one of their operands is constant, lower to a pair of BUILD_VECTOR and
/// just apply the bit to the vectors.
@@ -9452,6 +9699,7 @@ static SDValue LowerToHorizontalOp(const BuildVectorSDNode *BV,
/// from this, but enough scalar bit operations are created from the later
/// legalization + scalarization stages to need basic support.
static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
+ const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
SDLoc DL(Op);
MVT VT = Op->getSimpleValueType(0);
@@ -9515,7 +9763,14 @@ static SDValue lowerBuildVectorToBitOp(BuildVectorSDNode *Op,
SDValue LHS = DAG.getBuildVector(VT, DL, LHSElts);
SDValue RHS = DAG.getBuildVector(VT, DL, RHSElts);
- return DAG.getNode(Opcode, DL, VT, LHS, RHS);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, LHS, RHS);
+
+ if (!IsShift)
+ return Res;
+
+ // Immediately lower the shift to ensure the constant build vector doesn't
+ // get converted to a constant pool before the shift is lowered.
+ return LowerShift(Res, Subtarget, DAG);
}
/// Create a vector constant without a load. SSE/AVX provide the bare minimum
@@ -9571,9 +9826,11 @@ static SDValue createVariablePermute(MVT VT, SDValue SrcVec, SDValue IndicesVec,
IndicesVT = EVT(VT).changeVectorElementTypeToInteger();
IndicesVec = widenSubVector(IndicesVT.getSimpleVT(), IndicesVec, false,
Subtarget, DAG, SDLoc(IndicesVec));
- return extractSubVector(
- createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget), 0,
- DAG, DL, SizeInBits);
+ SDValue NewSrcVec =
+ createVariablePermute(VT, SrcVec, IndicesVec, DL, DAG, Subtarget);
+ if (NewSrcVec)
+ return extractSubVector(NewSrcVec, 0, DAG, DL, SizeInBits);
+ return SDValue();
} else if (SrcVec.getValueSizeInBits() < SizeInBits) {
// Widen smaller SrcVec to match VT.
SrcVec = widenSubVector(VT, SrcVec, false, Subtarget, DAG, SDLoc(SrcVec));
@@ -9869,7 +10126,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
return HorizontalOp;
if (SDValue Broadcast = lowerBuildVectorAsBroadcast(BV, Subtarget, DAG))
return Broadcast;
- if (SDValue BitOp = lowerBuildVectorToBitOp(BV, DAG))
+ if (SDValue BitOp = lowerBuildVectorToBitOp(BV, Subtarget, DAG))
return BitOp;
unsigned EVTBits = EltVT.getSizeInBits();
@@ -9929,7 +10186,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
assert(!VarElt.getNode() && !InsIndex.getNode() &&
"Expected one variable element in this vector");
VarElt = Elt;
- InsIndex = DAG.getConstant(i, dl, getVectorIdxTy(DAG.getDataLayout()));
+ InsIndex = DAG.getVectorIdxConstant(i, dl);
}
}
Constant *CV = ConstantVector::get(ConstVecOps);
@@ -10929,6 +11186,71 @@ static SDValue lowerShuffleWithUNPCK(const SDLoc &DL, MVT VT,
return SDValue();
}
+/// Check if the mask can be mapped to a preliminary shuffle (vperm 64-bit)
+/// followed by a 256-bit unpack.
+static SDValue lowerShuffleWithUNPCK256(const SDLoc &DL, MVT VT,
+ ArrayRef<int> Mask, SDValue V1,
+ SDValue V2, SelectionDAG &DAG) {
+ SmallVector<int, 32> Unpckl, Unpckh;
+ createSplat2ShuffleMask(VT, Unpckl, /* Lo */ true);
+ createSplat2ShuffleMask(VT, Unpckh, /* Lo */ false);
+
+ unsigned UnpackOpcode;
+ if (isShuffleEquivalent(V1, V2, Mask, Unpckl))
+ UnpackOpcode = X86ISD::UNPCKL;
+ else if (isShuffleEquivalent(V1, V2, Mask, Unpckh))
+ UnpackOpcode = X86ISD::UNPCKH;
+ else
+ return SDValue();
+
+ // This is a "natural" unpack operation (rather than the 128-bit sectored
+ // operation implemented by AVX). We need to rearrange 64-bit chunks of the
+ // input in order to use the x86 instruction.
+ V1 = DAG.getVectorShuffle(MVT::v4f64, DL, DAG.getBitcast(MVT::v4f64, V1),
+ DAG.getUNDEF(MVT::v4f64), {0, 2, 1, 3});
+ V1 = DAG.getBitcast(VT, V1);
+ return DAG.getNode(UnpackOpcode, DL, VT, V1, V1);
+}
+
+// Check if the mask can be mapped to a TRUNCATE or VTRUNC, truncating the
+// source into the lower elements and zeroing the upper elements.
+// TODO: Merge with matchShuffleAsVPMOV.
+static bool matchShuffleAsVTRUNC(MVT &SrcVT, MVT &DstVT, MVT VT,
+ ArrayRef<int> Mask, const APInt &Zeroable,
+ const X86Subtarget &Subtarget) {
+ if (!VT.is512BitVector() && !Subtarget.hasVLX())
+ return false;
+
+ unsigned NumElts = Mask.size();
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ unsigned MaxScale = 64 / EltSizeInBits;
+
+ for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
+ unsigned SrcEltBits = EltSizeInBits * Scale;
+ if (SrcEltBits < 32 && !Subtarget.hasBWI())
+ continue;
+ unsigned NumSrcElts = NumElts / Scale;
+ if (!isSequentialOrUndefInRange(Mask, 0, NumSrcElts, 0, Scale))
+ continue;
+ unsigned UpperElts = NumElts - NumSrcElts;
+ if (!Zeroable.extractBits(UpperElts, NumSrcElts).isAllOnesValue())
+ continue;
+ SrcVT = MVT::getIntegerVT(EltSizeInBits * Scale);
+ SrcVT = MVT::getVectorVT(SrcVT, NumSrcElts);
+ DstVT = MVT::getIntegerVT(EltSizeInBits);
+ if ((NumSrcElts * EltSizeInBits) >= 128) {
+ // ISD::TRUNCATE
+ DstVT = MVT::getVectorVT(DstVT, NumSrcElts);
+ } else {
+ // X86ISD::VTRUNC
+ DstVT = MVT::getVectorVT(DstVT, 128 / EltSizeInBits);
+ }
+ return true;
+ }
+
+ return false;
+}
+
static bool matchShuffleAsVPMOV(ArrayRef<int> Mask, bool SwappedOps,
int Delta) {
int Size = (int)Mask.size();
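matchShuffleAsVTRUNC, added above, scans increasing truncation scales. A reduced standalone model follows (plain integers instead of MVTs and an APInt Zeroable, example mask chosen here, subtarget checks omitted).

// Reduced model of the scale scan (illustrative values only).
#include <cstdio>
#include <vector>

int main() {
  // v16i8 mask taking every other byte; -1 means undef in the lower half.
  std::vector<int> Mask = {0, 2, 4, 6, 8, 10, 12, 14,
                           -1, -1, -1, -1, -1, -1, -1, -1};
  std::vector<bool> Zeroable(16, false);
  for (unsigned i = 8; i != 16; ++i)
    Zeroable[i] = true; // upper elements known zero

  unsigned NumElts = (unsigned)Mask.size(), EltBits = 8, MaxScale = 64 / EltBits;
  for (unsigned Scale = 2; Scale <= MaxScale; Scale += Scale) {
    unsigned NumSrcElts = NumElts / Scale;
    bool Sequential = true;
    for (unsigned i = 0; i != NumSrcElts; ++i)
      Sequential &= (Mask[i] < 0 || Mask[i] == (int)(i * Scale));
    bool UpperZero = true;
    for (unsigned i = NumSrcElts; i != NumElts; ++i)
      UpperZero &= Zeroable[i];
    if (Sequential && UpperZero) {
      std::printf("matched: truncate v%ui%u -> v%ui%u, upper half zeroed\n",
                  NumSrcElts, EltBits * Scale, NumSrcElts, EltBits);
      break;
    }
  }
}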
@@ -11022,22 +11344,93 @@ static SDValue lowerShuffleWithVPMOV(const SDLoc &DL, ArrayRef<int> Mask,
return DAG.getNode(X86ISD::VTRUNC, DL, VT, Src);
}
+/// Check whether a compaction lowering can be done by dropping even
+/// elements and compute how many times even elements must be dropped.
+///
+/// This handles shuffles which take every Nth element where N is a power of
+/// two. Example shuffle masks:
+///
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
+/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
+/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
+/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
+/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
+/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
+///
+/// Any of these lanes can of course be undef.
+///
+/// This routine only supports N <= 3.
+/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
+/// for larger N.
+///
+/// \returns N above, or the number of times even elements must be dropped if
+/// there is such a number. Otherwise returns zero.
+static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
+ bool IsSingleInput) {
+ // The modulus for the shuffle vector entries is based on whether this is
+ // a single input or not.
+ int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
+ assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
+ "We should only be called with masks with a power-of-2 size!");
+
+ uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
+
+ // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
+ // and 2^3 simultaneously. This is because we may have ambiguity with
+ // partially undef inputs.
+ bool ViableForN[3] = {true, true, true};
+
+ for (int i = 0, e = Mask.size(); i < e; ++i) {
+ // Ignore undef lanes, we'll optimistically collapse them to the pattern we
+ // want.
+ if (Mask[i] < 0)
+ continue;
+
+ bool IsAnyViable = false;
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j]) {
+ uint64_t N = j + 1;
+
+ // The shuffle mask must be equal to (i * 2^N) % M.
+ if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
+ IsAnyViable = true;
+ else
+ ViableForN[j] = false;
+ }
+ // Early exit if we exhaust the possible powers of two.
+ if (!IsAnyViable)
+ break;
+ }
+
+ for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
+ if (ViableForN[j])
+ return j + 1;
+
+ // Return 0 as there is no viable power of two.
+ return 0;
+}
+
// X86 has dedicated pack instructions that can handle specific truncation
// operations: PACKSS and PACKUS.
+// Checks for compaction shuffle masks if MaxStages > 1.
+// TODO: Add support for matching multiple PACKSS/PACKUS stages.
static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
unsigned &PackOpcode, ArrayRef<int> TargetMask,
SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
+ const X86Subtarget &Subtarget,
+ unsigned MaxStages = 1) {
unsigned NumElts = VT.getVectorNumElements();
unsigned BitSize = VT.getScalarSizeInBits();
- MVT PackSVT = MVT::getIntegerVT(BitSize * 2);
- MVT PackVT = MVT::getVectorVT(PackSVT, NumElts / 2);
+ assert(0 < MaxStages && MaxStages <= 3 && (BitSize << MaxStages) <= 64 &&
+ "Illegal maximum compaction");
- auto MatchPACK = [&](SDValue N1, SDValue N2) {
+ auto MatchPACK = [&](SDValue N1, SDValue N2, MVT PackVT) {
+ unsigned NumSrcBits = PackVT.getScalarSizeInBits();
+ unsigned NumPackedBits = NumSrcBits - BitSize;
SDValue VV1 = DAG.getBitcast(PackVT, N1);
SDValue VV2 = DAG.getBitcast(PackVT, N2);
- if (Subtarget.hasSSE41() || PackSVT == MVT::i16) {
- APInt ZeroMask = APInt::getHighBitsSet(BitSize * 2, BitSize);
+ if (Subtarget.hasSSE41() || BitSize == 8) {
+ APInt ZeroMask = APInt::getHighBitsSet(NumSrcBits, NumPackedBits);
if ((N1.isUndef() || DAG.MaskedValueIsZero(VV1, ZeroMask)) &&
(N2.isUndef() || DAG.MaskedValueIsZero(VV2, ZeroMask))) {
V1 = VV1;
@@ -11047,8 +11440,8 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
return true;
}
}
- if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > BitSize) &&
- (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > BitSize)) {
+ if ((N1.isUndef() || DAG.ComputeNumSignBits(VV1) > NumPackedBits) &&
+ (N2.isUndef() || DAG.ComputeNumSignBits(VV2) > NumPackedBits)) {
V1 = VV1;
V2 = VV2;
SrcVT = PackVT;
@@ -11058,19 +11451,25 @@ static bool matchShuffleWithPACK(MVT VT, MVT &SrcVT, SDValue &V1, SDValue &V2,
return false;
};
- // Try binary shuffle.
- SmallVector<int, 32> BinaryMask;
- createPackShuffleMask(VT, BinaryMask, false);
- if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
- if (MatchPACK(V1, V2))
- return true;
+ // Attempt to match against wider and wider compaction patterns.
+ for (unsigned NumStages = 1; NumStages <= MaxStages; ++NumStages) {
+ MVT PackSVT = MVT::getIntegerVT(BitSize << NumStages);
+ MVT PackVT = MVT::getVectorVT(PackSVT, NumElts >> NumStages);
- // Try unary shuffle.
- SmallVector<int, 32> UnaryMask;
- createPackShuffleMask(VT, UnaryMask, true);
- if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
- if (MatchPACK(V1, V1))
- return true;
+ // Try binary shuffle.
+ SmallVector<int, 32> BinaryMask;
+ createPackShuffleMask(VT, BinaryMask, false, NumStages);
+ if (isTargetShuffleEquivalent(TargetMask, BinaryMask, V1, V2))
+ if (MatchPACK(V1, V2, PackVT))
+ return true;
+
+ // Try unary shuffle.
+ SmallVector<int, 32> UnaryMask;
+ createPackShuffleMask(VT, UnaryMask, true, NumStages);
+ if (isTargetShuffleEquivalent(TargetMask, UnaryMask, V1))
+ if (MatchPACK(V1, V1, PackVT))
+ return true;
+ }
return false;
}
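canLowerByDroppingEvenElements (moved verbatim in this hunk) is easy to sanity-check in isolation. The following plain C++ re-implementation of the stride test returns 2 for the single-input "N = 2" mask quoted in its comment block.

// Plain C++ re-implementation of the stride test (same logic, LLVM types
// removed); returns N, or 0 if no power-of-2 stride matches.
#include <cstdint>
#include <cstdio>
#include <vector>

static int droppingEvenElementsFactor(const std::vector<int> &Mask,
                                      bool IsSingleInput) {
  uint64_t ModMask = (uint64_t)Mask.size() * (IsSingleInput ? 1 : 2) - 1;
  bool ViableForN[3] = {true, true, true};
  for (size_t i = 0; i != Mask.size(); ++i) {
    if (Mask[i] < 0)
      continue; // undef lanes collapse to whichever pattern we want
    bool AnyViable = false;
    for (unsigned j = 0; j != 3; ++j)
      if (ViableForN[j]) {
        uint64_t N = j + 1;
        if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
          AnyViable = true;
        else
          ViableForN[j] = false;
      }
    if (!AnyViable)
      return 0;
  }
  for (unsigned j = 0; j != 3; ++j)
    if (ViableForN[j])
      return (int)j + 1;
  return 0;
}

int main() {
  // The single-input "N = 2" example mask: every 4th element, repeated.
  std::vector<int> M = {0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12};
  std::printf("N = %d\n", droppingEvenElementsFactor(M, /*IsSingleInput=*/true));
}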
@@ -11080,12 +11479,44 @@ static SDValue lowerShuffleWithPACK(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
const X86Subtarget &Subtarget) {
MVT PackVT;
unsigned PackOpcode;
- if (matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
- Subtarget))
- return DAG.getNode(PackOpcode, DL, VT, DAG.getBitcast(PackVT, V1),
- DAG.getBitcast(PackVT, V2));
+ unsigned SizeBits = VT.getSizeInBits();
+ unsigned EltBits = VT.getScalarSizeInBits();
+ unsigned MaxStages = Log2_32(64 / EltBits);
+ if (!matchShuffleWithPACK(VT, PackVT, V1, V2, PackOpcode, Mask, DAG,
+ Subtarget, MaxStages))
+ return SDValue();
- return SDValue();
+ unsigned CurrentEltBits = PackVT.getScalarSizeInBits();
+ unsigned NumStages = Log2_32(CurrentEltBits / EltBits);
+
+ // Don't lower multi-stage packs on AVX512, truncation is better.
+ if (NumStages != 1 && SizeBits == 128 && Subtarget.hasVLX())
+ return SDValue();
+
+ // Pack to the largest type possible:
+ // vXi64/vXi32 -> PACK*SDW and vXi16 -> PACK*SWB.
+ unsigned MaxPackBits = 16;
+ if (CurrentEltBits > 16 &&
+ (PackOpcode == X86ISD::PACKSS || Subtarget.hasSSE41()))
+ MaxPackBits = 32;
+
+ // Repeatedly pack down to the target size.
+ SDValue Res;
+ for (unsigned i = 0; i != NumStages; ++i) {
+ unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
+ unsigned NumSrcElts = SizeBits / SrcEltBits;
+ MVT SrcSVT = MVT::getIntegerVT(SrcEltBits);
+ MVT DstSVT = MVT::getIntegerVT(SrcEltBits / 2);
+ MVT SrcVT = MVT::getVectorVT(SrcSVT, NumSrcElts);
+ MVT DstVT = MVT::getVectorVT(DstSVT, NumSrcElts * 2);
+ Res = DAG.getNode(PackOpcode, DL, DstVT, DAG.getBitcast(SrcVT, V1),
+ DAG.getBitcast(SrcVT, V2));
+ V1 = V2 = Res;
+ CurrentEltBits /= 2;
+ }
+ assert(Res && Res.getValueType() == VT &&
+ "Failed to lower compaction shuffle");
+ return Res;
}
/// Try to emit a bitmask instruction for a shuffle.
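The staged loop in the reworked lowerShuffleWithPACK repeatedly halves the element width. This small sketch (made-up example: a compaction matched at v4i32 feeding a v16i8 shuffle, with PACKSS/PACKUSDW available so MaxPackBits is 32) prints the pack sequence the loop would emit.

// Sketch of the staged packing under the assumptions stated above.
#include <algorithm>
#include <cstdio>

int main() {
  unsigned SizeBits = 128, EltBits = 8; // target shuffle type: v16i8
  unsigned CurrentEltBits = 32;         // matched PackVT element width (v4i32)
  unsigned MaxPackBits = 32;

  unsigned NumStages = 0; // Log2_32(CurrentEltBits / EltBits)
  for (unsigned B = CurrentEltBits; B != EltBits; B /= 2)
    ++NumStages;

  for (unsigned i = 0; i != NumStages; ++i) {
    unsigned SrcEltBits = std::min(MaxPackBits, CurrentEltBits);
    unsigned NumSrcElts = SizeBits / SrcEltBits;
    std::printf("stage %u: PACK v%ui%u -> v%ui%u\n", i, NumSrcElts, SrcEltBits,
                NumSrcElts * 2, SrcEltBits / 2);
    CurrentEltBits /= 2;
  }
}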
@@ -11109,8 +11540,9 @@ static SDValue lowerShuffleAsBitMask(const SDLoc &DL, MVT VT, SDValue V1,
MVT LogicVT = VT;
if (EltVT == MVT::f32 || EltVT == MVT::f64) {
Zero = DAG.getConstantFP(0.0, DL, EltVT);
- AllOnes = DAG.getConstantFP(
- APFloat::getAllOnesValue(EltVT.getSizeInBits(), true), DL, EltVT);
+ APFloat AllOnesValue = APFloat::getAllOnesValue(
+ SelectionDAG::EVTToAPFloatSemantics(EltVT), EltVT.getSizeInBits());
+ AllOnes = DAG.getConstantFP(AllOnesValue, DL, EltVT);
LogicVT =
MVT::getVectorVT(EltVT == MVT::f64 ? MVT::i64 : MVT::i32, Mask.size());
} else {
@@ -11312,6 +11744,12 @@ static SDValue lowerShuffleAsBlend(const SDLoc &DL, MVT VT, SDValue V1,
return getVectorMaskingNode(V2, MaskNode, V1, Subtarget, DAG);
}
+ // If we have VPTERNLOG, we can use that as a bit blend.
+ if (Subtarget.hasVLX())
+ if (SDValue BitBlend =
+ lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return BitBlend;
+
// Scale the blend by the number of bytes per element.
int Scale = VT.getScalarSizeInBits() / 8;
@@ -11622,10 +12060,101 @@ static SDValue lowerShuffleAsDecomposedShuffleBlend(
return DAG.getVectorShuffle(VT, DL, V1, V2, BlendMask);
}
-/// Try to lower a vector shuffle as a rotation.
+/// Try to lower a vector shuffle as a bit rotation.
+///
+/// Look for a repeated rotation pattern in each sub group.
+/// Returns an ISD::ROTL element rotation amount or -1 on failure.
+static int matchShuffleAsBitRotate(ArrayRef<int> Mask, int NumSubElts) {
+ int NumElts = Mask.size();
+ assert((NumElts % NumSubElts) == 0 && "Illegal shuffle mask");
+
+ int RotateAmt = -1;
+ for (int i = 0; i != NumElts; i += NumSubElts) {
+ for (int j = 0; j != NumSubElts; ++j) {
+ int M = Mask[i + j];
+ if (M < 0)
+ continue;
+ if (!isInRange(M, i, i + NumSubElts))
+ return -1;
+ int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
+ if (0 <= RotateAmt && Offset != RotateAmt)
+ return -1;
+ RotateAmt = Offset;
+ }
+ }
+ return RotateAmt;
+}
+
+static int matchShuffleAsBitRotate(MVT &RotateVT, int EltSizeInBits,
+ const X86Subtarget &Subtarget,
+ ArrayRef<int> Mask) {
+ assert(!isNoopShuffleMask(Mask) && "We shouldn't lower no-op shuffles!");
+ assert(EltSizeInBits < 64 && "Can't rotate 64-bit integers");
+
+ // AVX512 only has vXi32/vXi64 rotates, so limit the rotation sub group size.
+ int MinSubElts = Subtarget.hasAVX512() ? std::max(32 / EltSizeInBits, 2) : 2;
+ int MaxSubElts = 64 / EltSizeInBits;
+ for (int NumSubElts = MinSubElts; NumSubElts <= MaxSubElts; NumSubElts *= 2) {
+ int RotateAmt = matchShuffleAsBitRotate(Mask, NumSubElts);
+ if (RotateAmt < 0)
+ continue;
+
+ int NumElts = Mask.size();
+ MVT RotateSVT = MVT::getIntegerVT(EltSizeInBits * NumSubElts);
+ RotateVT = MVT::getVectorVT(RotateSVT, NumElts / NumSubElts);
+ return RotateAmt * EltSizeInBits;
+ }
+
+ return -1;
+}
+
+/// Lower shuffle using X86ISD::VROTLI rotations.
+static SDValue lowerShuffleAsBitRotate(const SDLoc &DL, MVT VT, SDValue V1,
+ ArrayRef<int> Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG) {
+ // Only XOP + AVX512 targets have bit rotation instructions.
+ // If we at least have SSSE3 (PSHUFB) then we shouldn't attempt to use this.
+ bool IsLegal =
+ (VT.is128BitVector() && Subtarget.hasXOP()) || Subtarget.hasAVX512();
+ if (!IsLegal && Subtarget.hasSSE3())
+ return SDValue();
+
+ MVT RotateVT;
+ int RotateAmt = matchShuffleAsBitRotate(RotateVT, VT.getScalarSizeInBits(),
+ Subtarget, Mask);
+ if (RotateAmt < 0)
+ return SDValue();
+
+ // For pre-SSSE3 targets, if we are shuffling vXi8 elts then ISD::ROTL,
+ // expanded to OR(SRL,SHL), will be more efficient, but if they can
+ // widen to vXi16 or more then the existing lowering will be better.
+ if (!IsLegal) {
+ if ((RotateAmt % 16) == 0)
+ return SDValue();
+ // TODO: Use getTargetVShiftByConstNode.
+ unsigned ShlAmt = RotateAmt;
+ unsigned SrlAmt = RotateVT.getScalarSizeInBits() - RotateAmt;
+ V1 = DAG.getBitcast(RotateVT, V1);
+ SDValue SHL = DAG.getNode(X86ISD::VSHLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(ShlAmt, DL, MVT::i8));
+ SDValue SRL = DAG.getNode(X86ISD::VSRLI, DL, RotateVT, V1,
+ DAG.getTargetConstant(SrlAmt, DL, MVT::i8));
+ SDValue Rot = DAG.getNode(ISD::OR, DL, RotateVT, SHL, SRL);
+ return DAG.getBitcast(VT, Rot);
+ }
+
+ SDValue Rot =
+ DAG.getNode(X86ISD::VROTLI, DL, RotateVT, DAG.getBitcast(RotateVT, V1),
+ DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ return DAG.getBitcast(VT, Rot);
+}
+
+/// Try to match a vector shuffle as an element rotation.
///
/// This is used for support PALIGNR for SSSE3 or VALIGND/Q for AVX512.
-static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
+static int matchShuffleAsElementRotate(SDValue &V1, SDValue &V2,
+ ArrayRef<int> Mask) {
int NumElts = Mask.size();
// We need to detect various ways of spelling a rotation:
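The inner matcher of matchShuffleAsBitRotate reduces to the consistency check below. This standalone example (values assumed) reuses the v16i8 rotl-by-8 byte pattern from the VROTLI note earlier and should report a one-element rotation, which the caller then scales by the element bit width.

// Standalone version of the inner matcher; illustrative only.
#include <cstdio>
#include <vector>

static int matchGroupRotate(const std::vector<int> &Mask, int NumSubElts) {
  int RotateAmt = -1;
  for (int i = 0, e = (int)Mask.size(); i != e; i += NumSubElts) {
    for (int j = 0; j != NumSubElts; ++j) {
      int M = Mask[i + j];
      if (M < 0)
        continue; // undef lane matches any rotation
      if (M < i || M >= i + NumSubElts)
        return -1; // reference crosses the sub group
      int Offset = (NumSubElts - (M - (i + j))) % NumSubElts;
      if (0 <= RotateAmt && Offset != RotateAmt)
        return -1; // inconsistent rotation between lanes
      RotateAmt = Offset;
    }
  }
  return RotateAmt;
}

int main() {
  // Per-i32 rotl-by-8 byte pattern (same as the VROTLI example earlier).
  std::vector<int> Mask = {3, 0, 1, 2, 7, 4, 5, 6, 11, 8, 9, 10, 15, 12, 13, 14};
  std::printf("ROTL by %d sub-element(s)\n", matchGroupRotate(Mask, 4));
}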
@@ -11712,7 +12241,7 @@ static int matchShuffleAsRotate(SDValue &V1, SDValue &V2, ArrayRef<int> Mask) {
static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
ArrayRef<int> Mask) {
// Don't accept any shuffles with zero elements.
- if (any_of(Mask, [](int M) { return M == SM_SentinelZero; }))
+ if (isAnyZero(Mask))
return -1;
// PALIGNR works on 128-bit lanes.
@@ -11720,7 +12249,7 @@ static int matchShuffleAsByteRotate(MVT VT, SDValue &V1, SDValue &V2,
if (!is128BitLaneRepeatedShuffleMask(VT, Mask, RepeatedMask))
return -1;
- int Rotation = matchShuffleAsRotate(V1, V2, RepeatedMask);
+ int Rotation = matchShuffleAsElementRotate(V1, V2, RepeatedMask);
if (Rotation <= 0)
return -1;
@@ -11788,7 +12317,7 @@ static SDValue lowerShuffleAsByteRotate(const SDLoc &DL, MVT VT, SDValue V1,
/// elements, and takes the low elements as the result. Note that while this is
/// specified as a *right shift* because x86 is little-endian, it is a *left
/// rotate* of the vector lanes.
-static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
+static SDValue lowerShuffleAsVALIGN(const SDLoc &DL, MVT VT, SDValue V1,
SDValue V2, ArrayRef<int> Mask,
const X86Subtarget &Subtarget,
SelectionDAG &DAG) {
@@ -11800,7 +12329,7 @@ static SDValue lowerShuffleAsRotate(const SDLoc &DL, MVT VT, SDValue V1,
&& "VLX required for 128/256-bit vectors");
SDValue Lo = V1, Hi = V2;
- int Rotation = matchShuffleAsRotate(Lo, Hi, Mask);
+ int Rotation = matchShuffleAsElementRotate(Lo, Hi, Mask);
if (Rotation <= 0)
return SDValue();
@@ -12566,13 +13095,13 @@ static SDValue lowerShuffleAsTruncBroadcast(const SDLoc &DL, MVT VT, SDValue V0,
assert(Subtarget.hasAVX2() &&
"We can only lower integer broadcasts with AVX2!");
- EVT EltVT = VT.getVectorElementType();
- EVT V0VT = V0.getValueType();
+ MVT EltVT = VT.getVectorElementType();
+ MVT V0VT = V0.getSimpleValueType();
assert(VT.isInteger() && "Unexpected non-integer trunc broadcast!");
assert(V0VT.isVector() && "Unexpected non-vector vector-sized value!");
- EVT V0EltVT = V0VT.getVectorElementType();
+ MVT V0EltVT = V0VT.getVectorElementType();
if (!V0EltVT.isInteger())
return SDValue();
@@ -12636,7 +13165,7 @@ static bool isSingleSHUFPSMask(ArrayRef<int> Mask) {
static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
SDValue N1, ArrayRef<int> Mask,
SelectionDAG &DAG) {
- EVT VT = N0.getValueType();
+ MVT VT = N0.getSimpleValueType();
assert((VT.is128BitVector() &&
(VT.getScalarSizeInBits() == 32 || VT.getScalarSizeInBits() == 64)) &&
"VPERM* family of shuffles requires 32-bit or 64-bit elements");
@@ -12649,9 +13178,8 @@ static SDValue lowerShuffleOfExtractsAsVperm(const SDLoc &DL, SDValue N0,
return SDValue();
SDValue WideVec = N0.getOperand(0);
- EVT WideVT = WideVec.getValueType();
- if (!WideVT.is256BitVector() || !isa<ConstantSDNode>(N0.getOperand(1)) ||
- !isa<ConstantSDNode>(N1.getOperand(1)))
+ MVT WideVT = WideVec.getSimpleValueType();
+ if (!WideVT.is256BitVector())
return SDValue();
// Match extracts of each half of the wide source vector. Commute the shuffle
@@ -12699,7 +13227,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// With MOVDDUP (v2f64) we can broadcast from a register or a load, otherwise
// we can only broadcast from a register with AVX2.
- unsigned NumElts = Mask.size();
unsigned NumEltBits = VT.getScalarSizeInBits();
unsigned Opcode = (VT == MVT::v2f64 && !Subtarget.hasAVX2())
? X86ISD::MOVDDUP
@@ -12707,15 +13234,7 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
bool BroadcastFromReg = (Opcode == X86ISD::MOVDDUP) || Subtarget.hasAVX2();
// Check that the mask is a broadcast.
- int BroadcastIdx = -1;
- for (int i = 0; i != (int)NumElts; ++i) {
- SmallVector<int, 8> BroadcastMask(NumElts, i);
- if (isShuffleEquivalent(V1, V2, Mask, BroadcastMask)) {
- BroadcastIdx = i;
- break;
- }
- }
-
+ int BroadcastIdx = getSplatIndex(Mask);
if (BroadcastIdx < 0)
return SDValue();
assert(BroadcastIdx < (int)Mask.size() && "We only expect to be called with "
@@ -12724,6 +13243,8 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// Go up the chain of (vector) values to find a scalar load that we can
// combine with the broadcast.
+ // TODO: Combine this logic with findEltLoadSrc() used by
+ // EltsFromConsecutiveLoads().
int BitOffset = BroadcastIdx * NumEltBits;
SDValue V = V1;
for (;;) {
@@ -12739,14 +13260,19 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
BitOffset %= OpBitWidth;
continue;
}
+ case ISD::EXTRACT_SUBVECTOR: {
+ // The extraction index adds to the existing offset.
+ unsigned EltBitWidth = V.getScalarValueSizeInBits();
+ unsigned Idx = V.getConstantOperandVal(1);
+ unsigned BeginOffset = Idx * EltBitWidth;
+ BitOffset += BeginOffset;
+ V = V.getOperand(0);
+ continue;
+ }
case ISD::INSERT_SUBVECTOR: {
SDValue VOuter = V.getOperand(0), VInner = V.getOperand(1);
- auto ConstantIdx = dyn_cast<ConstantSDNode>(V.getOperand(2));
- if (!ConstantIdx)
- break;
-
int EltBitWidth = VOuter.getScalarValueSizeInBits();
- int Idx = (int)ConstantIdx->getZExtValue();
+ int Idx = (int)V.getConstantOperandVal(2);
int NumSubElts = (int)VInner.getSimpleValueType().getVectorNumElements();
int BeginOffset = Idx * EltBitWidth;
int EndOffset = BeginOffset + NumSubElts * EltBitWidth;
@@ -12777,8 +13303,6 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
DL, VT, V, BroadcastIdx, Subtarget, DAG))
return TruncBroadcast;
- MVT BroadcastVT = VT;
-
// Also check the simpler case, where we can directly reuse the scalar.
if (!BitCastSrc &&
((V.getOpcode() == ISD::BUILD_VECTOR && V.hasOneUse()) ||
@@ -12788,23 +13312,34 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
// If we can't broadcast from a register, check that the input is a load.
if (!BroadcastFromReg && !isShuffleFoldableLoad(V))
return SDValue();
- } else if (MayFoldLoad(V) && cast<LoadSDNode>(V)->isSimple()) {
- // 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && VT.getScalarType() == MVT::i64) {
- BroadcastVT = MVT::getVectorVT(MVT::f64, VT.getVectorNumElements());
- Opcode = (BroadcastVT.is128BitVector() && !Subtarget.hasAVX2())
- ? X86ISD::MOVDDUP
- : Opcode;
- }
+ } else if (ISD::isNormalLoad(V.getNode()) &&
+ cast<LoadSDNode>(V)->isSimple()) {
+ // We do not check for one-use of the vector load because a broadcast load
+ // is expected to be a win for code size, register pressure, and possibly
+ // uops even if the original vector load is not eliminated.
- // If we are broadcasting a load that is only used by the shuffle
- // then we can reduce the vector load to the broadcasted scalar load.
+ // Reduce the vector load and shuffle to a broadcasted scalar load.
LoadSDNode *Ld = cast<LoadSDNode>(V);
SDValue BaseAddr = Ld->getOperand(1);
- EVT SVT = BroadcastVT.getScalarType();
+ MVT SVT = VT.getScalarType();
unsigned Offset = BroadcastIdx * SVT.getStoreSize();
assert((int)(Offset * 8) == BitOffset && "Unexpected bit-offset");
SDValue NewAddr = DAG.getMemBasePlusOffset(BaseAddr, Offset, DL);
+
+ // Directly form VBROADCAST_LOAD if we're using VBROADCAST opcode rather
+ // than MOVDDUP.
+ // FIXME: Should we add VBROADCAST_LOAD isel patterns for pre-AVX?
+ if (Opcode == X86ISD::VBROADCAST) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {Ld->getChain(), NewAddr};
+ V = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, SVT,
+ DAG.getMachineFunction().getMachineMemOperand(
+ Ld->getMemOperand(), Offset, SVT.getStoreSize()));
+ DAG.makeEquivalentMemoryOrdering(Ld, V);
+ return DAG.getBitcast(VT, V);
+ }
+ assert(SVT == MVT::f64 && "Unexpected VT!");
V = DAG.getLoad(SVT, DL, Ld->getChain(), NewAddr,
DAG.getMachineFunction().getMachineMemOperand(
Ld->getMemOperand(), Offset, SVT.getStoreSize()));
@@ -12839,38 +13374,26 @@ static SDValue lowerShuffleAsBroadcast(const SDLoc &DL, MVT VT, SDValue V1,
V = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v2f64,
DAG.getBitcast(MVT::f64, V));
- // Bitcast back to the same scalar type as BroadcastVT.
- if (V.getValueType().getScalarType() != BroadcastVT.getScalarType()) {
- assert(NumEltBits == BroadcastVT.getScalarSizeInBits() &&
- "Unexpected vector element size");
- MVT ExtVT;
- if (V.getValueType().isVector()) {
- unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
- ExtVT = MVT::getVectorVT(BroadcastVT.getScalarType(), NumSrcElts);
- } else {
- ExtVT = BroadcastVT.getScalarType();
- }
- V = DAG.getBitcast(ExtVT, V);
- }
-
- // 32-bit targets need to load i64 as a f64 and then bitcast the result.
- if (!Subtarget.is64Bit() && V.getValueType() == MVT::i64) {
- V = DAG.getBitcast(MVT::f64, V);
- unsigned NumBroadcastElts = BroadcastVT.getVectorNumElements();
- BroadcastVT = MVT::getVectorVT(MVT::f64, NumBroadcastElts);
+ // If this is a scalar, do the broadcast on this type and bitcast.
+ if (!V.getValueType().isVector()) {
+ assert(V.getScalarValueSizeInBits() == NumEltBits &&
+ "Unexpected scalar size");
+ MVT BroadcastVT = MVT::getVectorVT(V.getSimpleValueType(),
+ VT.getVectorNumElements());
+ return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
}
// We only support broadcasting from 128-bit vectors to minimize the
// number of patterns we need to deal with in isel. So extract down to
// 128-bits, removing as many bitcasts as possible.
- if (V.getValueSizeInBits() > 128) {
- MVT ExtVT = V.getSimpleValueType().getScalarType();
- ExtVT = MVT::getVectorVT(ExtVT, 128 / ExtVT.getScalarSizeInBits());
+ if (V.getValueSizeInBits() > 128)
V = extract128BitVector(peekThroughBitcasts(V), 0, DAG, DL);
- V = DAG.getBitcast(ExtVT, V);
- }
- return DAG.getBitcast(VT, DAG.getNode(Opcode, DL, BroadcastVT, V));
+ // Otherwise cast V to a vector with the same element type as VT, but
+ // possibly narrower than VT. Then perform the broadcast.
+ unsigned NumSrcElts = V.getValueSizeInBits() / NumEltBits;
+ MVT CastVT = MVT::getVectorVT(VT.getVectorElementType(), NumSrcElts);
+ return DAG.getNode(Opcode, DL, VT, DAG.getBitcast(CastVT, V));
}
// Check for whether we can use INSERTPS to perform the shuffle. We only use
@@ -13259,7 +13782,7 @@ static SDValue lowerV2I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v2i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v2i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -13293,8 +13816,7 @@ static SDValue lowerShuffleWithSHUFPS(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
SDValue LowV = V1, HighV = V2;
- int NewMask[4] = {Mask[0], Mask[1], Mask[2], Mask[3]};
-
+ SmallVector<int, 4> NewMask(Mask.begin(), Mask.end());
int NumV2Elements = count_if(Mask, [](int M) { return M >= 4; });
if (NumV2Elements == 1) {
@@ -13548,7 +14070,7 @@ static SDValue lowerV4I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Its more profitable for pre-SSSE3 to use shuffles/unpacks.
if (Subtarget.hasSSSE3()) {
if (Subtarget.hasVLX())
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -14186,6 +14708,11 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v8i16, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
// Use dedicated unpack instructions for masks that match their pattern.
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v8i16, Mask, V1, V2, DAG))
return V;
@@ -14262,6 +14789,29 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
+ // Attempt to lower using compaction; SSE41 is necessary for PACKUSDW.
+ // We could use SIGN_EXTEND_INREG+PACKSSDW for older targets but this seems to
+ // be slower than a PSHUFLW+PSHUFHW+PSHUFD chain.
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, false);
+ if ((NumEvenDrops == 1 || NumEvenDrops == 2) && Subtarget.hasSSE41() &&
+ !Subtarget.hasVLX()) {
+ SmallVector<SDValue, 8> DWordClearOps(4, DAG.getConstant(0, DL, MVT::i32));
+ for (unsigned i = 0; i != 4; i += 1 << (NumEvenDrops - 1))
+ DWordClearOps[i] = DAG.getConstant(0xFFFF, DL, MVT::i32);
+ SDValue DWordClearMask = DAG.getBuildVector(MVT::v4i32, DL, DWordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V1),
+ DWordClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v4i32, DAG.getBitcast(MVT::v4i32, V2),
+ DWordClearMask);
+ // Now pack things back together.
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, V1, V2);
+ if (NumEvenDrops == 2) {
+ Result = DAG.getBitcast(MVT::v4i32, Result);
+ Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v8i16, Result, Result);
+ }
+ return Result;
+ }
+
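
As a standalone illustration of why the AND + PACKUSDW pair above implements the {0,2,4,6,8,10,12,14} compaction, here is a minimal scalar model (plain C++, not DAG code; all helper names are invented for the example):

#include <array>
#include <cassert>
#include <cstdint>

// Scalar model of PACKUSDW: pack two v4i32 into one v8i16, saturating each
// unsigned 32-bit lane down to 16 bits.
static std::array<uint16_t, 8> packusdw(const std::array<uint32_t, 4> &A,
                                        const std::array<uint32_t, 4> &B) {
  auto Sat = [](uint32_t V) { return uint16_t(V > 0xFFFFu ? 0xFFFFu : V); };
  std::array<uint16_t, 8> R{};
  for (int i = 0; i != 4; ++i) {
    R[i] = Sat(A[i]);
    R[i + 4] = Sat(B[i]);
  }
  return R;
}

// View a v8i16 as v4i32 (little endian), clear the high word of each dword
// (the AND with DWordClearMask), then pack. Because the high halves are zero
// the saturation never fires, so the result is words 0,2,4,6 of V1 followed
// by words 0,2,4,6 of V2 -- the NumEvenDrops == 1 mask {0,2,4,6,8,10,12,14}.
static std::array<uint16_t, 8> compact(const std::array<uint16_t, 8> &V1,
                                       const std::array<uint16_t, 8> &V2) {
  auto AsDWords = [](const std::array<uint16_t, 8> &V) {
    std::array<uint32_t, 4> D{};
    for (int i = 0; i != 4; ++i)
      D[i] = ((uint32_t(V[2 * i + 1]) << 16) | V[2 * i]) & 0x0000FFFFu;
    return D;
  };
  return packusdw(AsDWords(V1), AsDWords(V2));
}

int main() {
  std::array<uint16_t, 8> V1{0, 1, 2, 3, 4, 5, 6, 7};
  std::array<uint16_t, 8> V2{8, 9, 10, 11, 12, 13, 14, 15};
  std::array<uint16_t, 8> Expected{0, 2, 4, 6, 8, 10, 12, 14};
  assert(compact(V1, V2) == Expected);
  return 0;
}
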
// Try to lower by permuting the inputs into an unpack instruction.
if (SDValue Unpack = lowerShuffleAsPermuteAndUnpack(DL, MVT::v8i16, V1, V2,
Mask, Subtarget, DAG))
@@ -14281,72 +14831,6 @@ static SDValue lowerV8I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG);
}
-/// Check whether a compaction lowering can be done by dropping even
-/// elements and compute how many times even elements must be dropped.
-///
-/// This handles shuffles which take every Nth element where N is a power of
-/// two. Example shuffle masks:
-///
-/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 0, 2, 4, 6, 8, 10, 12, 14
-/// N = 1: 0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30
-/// N = 2: 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12, 0, 4, 8, 12
-/// N = 2: 0, 4, 8, 12, 16, 20, 24, 28, 0, 4, 8, 12, 16, 20, 24, 28
-/// N = 3: 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8, 0, 8
-/// N = 3: 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24, 0, 8, 16, 24
-///
-/// Any of these lanes can of course be undef.
-///
-/// This routine only supports N <= 3.
-/// FIXME: Evaluate whether either AVX or AVX-512 have any opportunities here
-/// for larger N.
-///
-/// \returns N above, or the number of times even elements must be dropped if
-/// there is such a number. Otherwise returns zero.
-static int canLowerByDroppingEvenElements(ArrayRef<int> Mask,
- bool IsSingleInput) {
- // The modulus for the shuffle vector entries is based on whether this is
- // a single input or not.
- int ShuffleModulus = Mask.size() * (IsSingleInput ? 1 : 2);
- assert(isPowerOf2_32((uint32_t)ShuffleModulus) &&
- "We should only be called with masks with a power-of-2 size!");
-
- uint64_t ModMask = (uint64_t)ShuffleModulus - 1;
-
- // We track whether the input is viable for all power-of-2 strides 2^1, 2^2,
- // and 2^3 simultaneously. This is because we may have ambiguity with
- // partially undef inputs.
- bool ViableForN[3] = {true, true, true};
-
- for (int i = 0, e = Mask.size(); i < e; ++i) {
- // Ignore undef lanes, we'll optimistically collapse them to the pattern we
- // want.
- if (Mask[i] < 0)
- continue;
-
- bool IsAnyViable = false;
- for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
- if (ViableForN[j]) {
- uint64_t N = j + 1;
-
- // The shuffle mask must be equal to (i * 2^N) % M.
- if ((uint64_t)Mask[i] == (((uint64_t)i << N) & ModMask))
- IsAnyViable = true;
- else
- ViableForN[j] = false;
- }
- // Early exit if we exhaust the possible powers of two.
- if (!IsAnyViable)
- break;
- }
-
- for (unsigned j = 0; j != array_lengthof(ViableForN); ++j)
- if (ViableForN[j])
- return j + 1;
-
- // Return 0 as there is no viable power of two.
- return 0;
-}
-
static SDValue lowerShuffleWithPERMV(const SDLoc &DL, MVT VT,
ArrayRef<int> Mask, SDValue V1,
SDValue V2, SelectionDAG &DAG) {
@@ -14410,6 +14894,11 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Mask, Subtarget, DAG))
return Broadcast;
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate = lowerShuffleAsBitRotate(DL, MVT::v16i8, V1, Mask,
+ Subtarget, DAG))
+ return Rotate;
+
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v16i8, Mask, V1, V2, DAG))
return V;
@@ -14524,6 +15013,10 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Zeroable, Subtarget, DAG))
return V;
+ // Check for compaction patterns.
+ bool IsSingleInput = V2.isUndef();
+ int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput);
+
// Check for SSSE3 which lets us lower all v16i8 shuffles much more directly
// with PSHUFB. It is important to do this before we attempt to generate any
// blends but after all of the single-input lowerings. If the single input
@@ -14534,10 +15027,13 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// and there are *very* few patterns that would actually be faster than the
// PSHUFB approach because of its ability to zero lanes.
//
+ // If the mask is a binary compaction, we can more efficiently perform this
+ // as a PACKUS(AND(),AND()) - which is quicker than UNPACK(PSHUFB(),PSHUFB()).
+ //
// FIXME: The only exceptions to the above are blends which are exact
// interleavings with direct instructions supporting them. We currently don't
// handle those well here.
- if (Subtarget.hasSSSE3()) {
+ if (Subtarget.hasSSSE3() && (IsSingleInput || NumEvenDrops != 1)) {
bool V1InUse = false;
bool V2InUse = false;
@@ -14595,8 +15091,7 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// We special case these as they can be particularly efficiently handled with
// the PACKUSB instruction on x86 and they show up in common patterns of
// rearranging bytes to truncate wide elements.
- bool IsSingleInput = V2.isUndef();
- if (int NumEvenDrops = canLowerByDroppingEvenElements(Mask, IsSingleInput)) {
+ if (NumEvenDrops) {
// NumEvenDrops is the power of two stride of the elements. Another way of
// thinking about it is that we need to drop the even elements this many
// times to get the original input.
@@ -14604,23 +15099,23 @@ static SDValue lowerV16I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// First we need to zero all the dropped bytes.
assert(NumEvenDrops <= 3 &&
"No support for dropping even elements more than 3 times.");
- SmallVector<SDValue, 16> ByteClearOps(16, DAG.getConstant(0, DL, MVT::i8));
- for (unsigned i = 0; i != 16; i += 1 << NumEvenDrops)
- ByteClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i8);
- SDValue ByteClearMask = DAG.getBuildVector(MVT::v16i8, DL, ByteClearOps);
- V1 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V1, ByteClearMask);
+ SmallVector<SDValue, 8> WordClearOps(8, DAG.getConstant(0, DL, MVT::i16));
+ for (unsigned i = 0; i != 8; i += 1 << (NumEvenDrops - 1))
+ WordClearOps[i] = DAG.getConstant(0xFF, DL, MVT::i16);
+ SDValue WordClearMask = DAG.getBuildVector(MVT::v8i16, DL, WordClearOps);
+ V1 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V1),
+ WordClearMask);
if (!IsSingleInput)
- V2 = DAG.getNode(ISD::AND, DL, MVT::v16i8, V2, ByteClearMask);
+ V2 = DAG.getNode(ISD::AND, DL, MVT::v8i16, DAG.getBitcast(MVT::v8i16, V2),
+ WordClearMask);
// Now pack things back together.
- V1 = DAG.getBitcast(MVT::v8i16, V1);
- V2 = IsSingleInput ? V1 : DAG.getBitcast(MVT::v8i16, V2);
- SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1, V2);
+ SDValue Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, V1,
+ IsSingleInput ? V1 : V2);
for (int i = 1; i < NumEvenDrops; ++i) {
Result = DAG.getBitcast(MVT::v8i16, Result);
Result = DAG.getNode(X86ISD::PACKUS, DL, MVT::v16i8, Result, Result);
}
-
return Result;
}
@@ -14725,37 +15220,13 @@ static SDValue splitAndLowerShuffle(const SDLoc &DL, MVT VT, SDValue V1,
int NumElements = VT.getVectorNumElements();
int SplitNumElements = NumElements / 2;
MVT ScalarVT = VT.getVectorElementType();
- MVT SplitVT = MVT::getVectorVT(ScalarVT, NumElements / 2);
+ MVT SplitVT = MVT::getVectorVT(ScalarVT, SplitNumElements);
- // Rather than splitting build-vectors, just build two narrower build
- // vectors. This helps shuffling with splats and zeros.
+ // Use splitVector/extractSubVector so that split build-vectors just build two
+ // narrower build vectors. This helps shuffling with splats and zeros.
auto SplitVector = [&](SDValue V) {
- V = peekThroughBitcasts(V);
-
- MVT OrigVT = V.getSimpleValueType();
- int OrigNumElements = OrigVT.getVectorNumElements();
- int OrigSplitNumElements = OrigNumElements / 2;
- MVT OrigScalarVT = OrigVT.getVectorElementType();
- MVT OrigSplitVT = MVT::getVectorVT(OrigScalarVT, OrigNumElements / 2);
-
SDValue LoV, HiV;
-
- auto *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV) {
- LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
- DAG.getIntPtrConstant(0, DL));
- HiV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, OrigSplitVT, V,
- DAG.getIntPtrConstant(OrigSplitNumElements, DL));
- } else {
-
- SmallVector<SDValue, 16> LoOps, HiOps;
- for (int i = 0; i < OrigSplitNumElements; ++i) {
- LoOps.push_back(BV->getOperand(i));
- HiOps.push_back(BV->getOperand(i + OrigSplitNumElements));
- }
- LoV = DAG.getBuildVector(OrigSplitVT, DL, LoOps);
- HiV = DAG.getBuildVector(OrigSplitVT, DL, HiOps);
- }
+ std::tie(LoV, HiV) = splitVector(peekThroughBitcasts(V), DAG, DL);
return std::make_pair(DAG.getBitcast(SplitVT, LoV),
DAG.getBitcast(SplitVT, HiV));
};
@@ -15963,7 +16434,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v4i64, Mask, RepeatedMask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask<int>(2, RepeatedMask, PSHUFDMask);
+ narrowShuffleMaskElts(2, RepeatedMask, PSHUFDMask);
return DAG.getBitcast(
MVT::v4i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v8i32,
@@ -15984,7 +16455,7 @@ static SDValue lowerV4I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or VEXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v4i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v4i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16085,13 +16556,14 @@ static SDValue lowerV8F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have a single input shuffle with different shuffle patterns in the
// two 128-bit lanes use the variable mask to VPERMILPS.
if (V2.isUndef()) {
- SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
- if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask))
+ if (!is128BitLaneCrossingShuffleMask(MVT::v8f32, Mask)) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMILPV, DL, MVT::v8f32, V1, VPermMask);
-
- if (Subtarget.hasAVX2())
+ }
+ if (Subtarget.hasAVX2()) {
+ SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8f32, VPermMask, V1);
-
+ }
// Otherwise, fall back.
return lowerShuffleAsLanePermuteAndShuffle(DL, MVT::v8f32, V1, V2, Mask,
DAG, Subtarget);
@@ -16190,7 +16662,7 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// If we have VLX support, we can use VALIGN or EXPAND.
if (Subtarget.hasVLX()) {
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16210,9 +16682,14 @@ static SDValue lowerV8I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
DL, MVT::v8i32, V1, V2, Mask, Subtarget, DAG))
return V;
- // If the shuffle patterns aren't repeated but it is a single input, directly
- // generate a cross-lane VPERMD instruction.
if (V2.isUndef()) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v8i32, Mask, V1, V2, DAG))
+ return V;
+
+ // If the shuffle patterns aren't repeated but it's a single input, directly
+ // generate a cross-lane VPERMD instruction.
SDValue VPermMask = getConstVector(Mask, MVT::v8i32, DAG, DL, true);
return DAG.getNode(X86ISD::VPERMV, DL, MVT::v8i32, VPermMask, V1);
}
@@ -16294,6 +16771,16 @@ static SDValue lowerV16I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return V;
if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v16i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v16i16, Mask, V1, V2, DAG))
+ return V;
+
// There are no generalized cross-lane shuffle operations available on i16
// element types.
if (is128BitLaneCrossingShuffleMask(MVT::v16i16, Mask)) {
@@ -16379,7 +16866,7 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i8, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Shift;
// Try to use byte rotation instructions.
@@ -16387,6 +16874,12 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
// Try to create an in-lane repeating shuffle mask and then shuffle the
// results into the target lanes.
if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
@@ -16396,6 +16889,11 @@ static SDValue lowerV32I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
// There are no generalized cross-lane shuffle operations available on i8
// element types.
if (V2.isUndef() && is128BitLaneCrossingShuffleMask(MVT::v32i8, Mask)) {
+ // Try to produce a fixed cross-128-bit lane permute followed by unpack
+ // because that should be faster than the variable permute alternatives.
+ if (SDValue V = lowerShuffleWithUNPCK256(DL, MVT::v32i8, Mask, V1, V2, DAG))
+ return V;
+
if (SDValue V = lowerShuffleAsLanePermuteAndPermute(
DL, MVT::v32i8, V1, V2, Mask, DAG, Subtarget))
return V;
@@ -16518,13 +17016,14 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
assert(VT.is512BitVector() && "Unexpected vector size for 512bit shuffle.");
// TODO - use Zeroable like we do for lowerV2X128VectorShuffle?
- SmallVector<int, 4> WidenedMask;
- if (!canWidenShuffleElements(Mask, WidenedMask))
+ SmallVector<int, 4> Widened128Mask;
+ if (!canWidenShuffleElements(Mask, Widened128Mask))
return SDValue();
+ assert(Widened128Mask.size() == 4 && "Shuffle widening mismatch");
// Try to use an insert into a zero vector.
- if (WidenedMask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
- (WidenedMask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
+ if (Widened128Mask[0] == 0 && (Zeroable & 0xf0) == 0xf0 &&
+ (Widened128Mask[1] == 1 || (Zeroable & 0x0c) == 0x0c)) {
unsigned NumElts = ((Zeroable & 0x0c) == 0x0c) ? 2 : 4;
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
SDValue LoV = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, V1,
@@ -16536,37 +17035,34 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Check for patterns which can be matched with a single insert of a 256-bit
// subvector.
- bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask,
- {0, 1, 2, 3, 0, 1, 2, 3});
- if (OnlyUsesV1 || isShuffleEquivalent(V1, V2, Mask,
- {0, 1, 2, 3, 8, 9, 10, 11})) {
+ bool OnlyUsesV1 = isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 0, 1, 2, 3});
+ if (OnlyUsesV1 ||
+ isShuffleEquivalent(V1, V2, Mask, {0, 1, 2, 3, 8, 9, 10, 11})) {
MVT SubVT = MVT::getVectorVT(VT.getVectorElementType(), 4);
- SDValue SubVec = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT,
- OnlyUsesV1 ? V1 : V2,
- DAG.getIntPtrConstant(0, DL));
+ SDValue SubVec =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, SubVT, OnlyUsesV1 ? V1 : V2,
+ DAG.getIntPtrConstant(0, DL));
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT, V1, SubVec,
DAG.getIntPtrConstant(4, DL));
}
- assert(WidenedMask.size() == 4);
-
// See if this is an insertion of the lower 128-bits of V2 into V1.
bool IsInsert = true;
int V2Index = -1;
for (int i = 0; i < 4; ++i) {
- assert(WidenedMask[i] >= -1);
- if (WidenedMask[i] < 0)
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
continue;
// Make sure all V1 subvectors are in place.
- if (WidenedMask[i] < 4) {
- if (WidenedMask[i] != i) {
+ if (Widened128Mask[i] < 4) {
+ if (Widened128Mask[i] != i) {
IsInsert = false;
break;
}
} else {
// Make sure we only have a single V2 index and it's the lowest 128-bits.
- if (V2Index >= 0 || WidenedMask[i] != 4) {
+ if (V2Index >= 0 || Widened128Mask[i] != 4) {
IsInsert = false;
break;
}
@@ -16580,16 +17076,26 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
return insert128BitVector(V1, Subvec, V2Index * 2, DAG, DL);
}
+ // See if we can widen to a 256-bit lane shuffle. We're going to lose 128-lane
+ // UNDEF info by lowering to X86ISD::SHUF128 anyway, so by widening where
+ // possible we at least ensure the lanes stay sequential to help later
+ // combines.
+ SmallVector<int, 2> Widened256Mask;
+ if (canWidenShuffleElements(Widened128Mask, Widened256Mask)) {
+ Widened128Mask.clear();
+ narrowShuffleMaskElts(2, Widened256Mask, Widened128Mask);
+ }
+
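
To make the widen-then-narrow step concrete, below is a toy model of the scale-2 behaviour assumed of canWidenShuffleElements/narrowShuffleMaskElts (illustrative stand-ins only; zero sentinels are ignored). Widening succeeds when each pair of entries is consecutive or undef, and narrowing back fills the undefs with sequential indices:

#include <cassert>
#include <vector>

// Toy canWidenShuffleElements for a scale of 2: each pair must look like
// (2k, 2k+1) with either side allowed to be undef (-1).
static bool widenByTwo(const std::vector<int> &M, std::vector<int> &Out) {
  Out.clear();
  for (size_t i = 0; i + 1 < M.size(); i += 2) {
    int A = M[i], B = M[i + 1];
    if (A < 0 && B < 0)
      Out.push_back(-1);
    else if (A >= 0 && (A % 2) == 0 && (B == A + 1 || B < 0))
      Out.push_back(A / 2);
    else if (B >= 0 && (B % 2) == 1 && (A == B - 1 || A < 0))
      Out.push_back(B / 2);
    else
      return false;
  }
  return true;
}

// Toy narrowShuffleMaskElts for a scale of 2: each wide index W expands to
// {2*W, 2*W+1}; undef stays undef.
static std::vector<int> narrowByTwo(const std::vector<int> &M) {
  std::vector<int> Out;
  for (int W : M) {
    Out.push_back(W < 0 ? -1 : 2 * W);
    Out.push_back(W < 0 ? -1 : 2 * W + 1);
  }
  return Out;
}

int main() {
  // 128-bit lane mask {2, -1, -1, -1}: lane 0 comes from lane 2, rest undef.
  std::vector<int> Mask128 = {2, -1, -1, -1}, Mask256;
  bool Widened = widenByTwo(Mask128, Mask256); // {1, -1}
  assert(Widened);
  std::vector<int> Refined = narrowByTwo(Mask256); // {2, 3, -1, -1}
  // The undef at position 1 became the sequential lane 3, which is what keeps
  // the lanes sequential for later combines.
  assert((Refined == std::vector<int>{2, 3, -1, -1}));
  return 0;
}
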
// Try to lower to vshuf64x2/vshuf32x4.
SDValue Ops[2] = {DAG.getUNDEF(VT), DAG.getUNDEF(VT)};
unsigned PermMask = 0;
// Ensure elements came from the same Op.
for (int i = 0; i < 4; ++i) {
- assert(WidenedMask[i] >= -1);
- if (WidenedMask[i] < 0)
+ assert(Widened128Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Widened128Mask[i] < 0)
continue;
- SDValue Op = WidenedMask[i] >= 4 ? V2 : V1;
+ SDValue Op = Widened128Mask[i] >= 4 ? V2 : V1;
unsigned OpIndex = i / 2;
if (Ops[OpIndex].isUndef())
Ops[OpIndex] = Op;
@@ -16598,7 +17104,7 @@ static SDValue lowerV4X128Shuffle(const SDLoc &DL, MVT VT, ArrayRef<int> Mask,
// Convert the 128-bit shuffle mask selection values into 128-bit selection
// bits defined by a vshuf64x2 instruction's immediate control byte.
- PermMask |= (WidenedMask[i] % 4) << (i * 2);
+ PermMask |= (Widened128Mask[i] % 4) << (i * 2);
}
return DAG.getNode(X86ISD::SHUF128, DL, VT, Ops[0], Ops[1],
@@ -16696,6 +17202,12 @@ static SDValue lowerV16F32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return lowerShuffleWithSHUFPS(DL, MVT::v16f32, RepeatedMask, V1, V2, DAG);
}
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16f32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have a single input shuffle with different shuffle patterns in the
// 128-bit lanes and don't lane cross, use variable mask VPERMILPS.
if (V2.isUndef() &&
@@ -16728,7 +17240,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
SmallVector<int, 2> Repeated128Mask;
if (is128BitLaneRepeatedShuffleMask(MVT::v8i64, Mask, Repeated128Mask)) {
SmallVector<int, 4> PSHUFDMask;
- scaleShuffleMask<int>(2, Repeated128Mask, PSHUFDMask);
+ narrowShuffleMaskElts(2, Repeated128Mask, PSHUFDMask);
return DAG.getBitcast(
MVT::v8i64,
DAG.getNode(X86ISD::PSHUFD, DL, MVT::v16i32,
@@ -16752,7 +17264,7 @@ static SDValue lowerV8I64Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v8i64, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v8i64, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16814,7 +17326,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Shift;
// Try to use VALIGN.
- if (SDValue Rotate = lowerShuffleAsRotate(DL, MVT::v16i32, V1, V2, Mask,
+ if (SDValue Rotate = lowerShuffleAsVALIGN(DL, MVT::v16i32, V1, V2, Mask,
Subtarget, DAG))
return Rotate;
@@ -16833,6 +17345,13 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
CastV1, CastV2, DAG);
return DAG.getBitcast(MVT::v16i32, ShufPS);
}
+
+ // Try to create an in-lane repeating shuffle mask and then shuffle the
+ // results into the target lanes.
+ if (SDValue V = lowerShuffleAsRepeatedMaskAndLanePermute(
+ DL, MVT::v16i32, V1, V2, Mask, Subtarget, DAG))
+ return V;
+
// If we have AVX512F support, we can use VEXPAND.
if (SDValue V = lowerShuffleToEXPAND(DL, MVT::v16i32, Zeroable, Mask, V1, V2,
DAG, Subtarget))
@@ -16841,6 +17360,7 @@ static SDValue lowerV16I32Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v16i32, V1, V2, Mask,
Zeroable, Subtarget, DAG))
return Blend;
+
return lowerShuffleWithPERMV(DL, MVT::v16i32, Mask, V1, V2, DAG);
}
@@ -16865,6 +17385,11 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
if (SDValue V = lowerShuffleWithUNPCK(DL, MVT::v32i16, Mask, V1, V2, DAG))
return V;
+ // Use dedicated pack instructions for masks that match their pattern.
+ if (SDValue V =
+ lowerShuffleWithPACK(DL, MVT::v32i16, Mask, V1, V2, DAG, Subtarget))
+ return V;
+
// Try to use shift instructions.
if (SDValue Shift = lowerShuffleAsShift(DL, MVT::v32i16, V1, V2, Mask,
Zeroable, Subtarget, DAG))
@@ -16876,18 +17401,23 @@ static SDValue lowerV32I16Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
return Rotate;
if (V2.isUndef()) {
+ // Try to use bit rotation instructions.
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v32i16, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
SmallVector<int, 8> RepeatedMask;
if (is128BitLaneRepeatedShuffleMask(MVT::v32i16, Mask, RepeatedMask)) {
// As this is a single-input shuffle, the repeated mask should be
// a strictly valid v8i16 mask that we can pass through to the v8i16
// lowering to handle even the v32 case.
- return lowerV8I16GeneralSingleInputShuffle(
- DL, MVT::v32i16, V1, RepeatedMask, Subtarget, DAG);
+ return lowerV8I16GeneralSingleInputShuffle(DL, MVT::v32i16, V1,
+ RepeatedMask, Subtarget, DAG);
}
}
if (SDValue Blend = lowerShuffleAsBlend(DL, MVT::v32i16, V1, V2, Mask,
- Zeroable, Subtarget, DAG))
+ Zeroable, Subtarget, DAG))
return Blend;
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v32i16, Mask, V1, V2,
@@ -16933,6 +17463,17 @@ static SDValue lowerV64I8Shuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Rotate;
+ // Try to use bit rotation instructions.
+ if (V2.isUndef())
+ if (SDValue Rotate =
+ lowerShuffleAsBitRotate(DL, MVT::v64i8, V1, Mask, Subtarget, DAG))
+ return Rotate;
+
+ // Lower as AND if possible.
+ if (SDValue Masked = lowerShuffleAsBitMask(DL, MVT::v64i8, V1, V2, Mask,
+ Zeroable, Subtarget, DAG))
+ return Masked;
+
if (SDValue PSHUFB = lowerShuffleWithPSHUFB(DL, MVT::v64i8, Mask, V1, V2,
Zeroable, Subtarget, DAG))
return PSHUFB;
@@ -16995,6 +17536,18 @@ static SDValue lower512BitShuffle(const SDLoc &DL, ArrayRef<int> Mask,
Subtarget, DAG))
return Broadcast;
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI()) {
+ // Try using bit ops for masking and blending before falling back to
+ // splitting.
+ if (SDValue V = lowerShuffleAsBitMask(DL, VT, V1, V2, Mask, Zeroable,
+ Subtarget, DAG))
+ return V;
+ if (SDValue V = lowerShuffleAsBitBlend(DL, VT, V1, V2, Mask, DAG))
+ return V;
+
+ return splitAndLowerShuffle(DL, VT, V1, V2, Mask, DAG);
+ }
+
// Dispatch to each element type for lowering. If we don't have support for
// specific element type shuffles at 512 bits, immediately split them and
// lower them. Each lowering routine of a given type is allowed to assume that
@@ -17477,6 +18030,10 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
unsigned EltSize = VT.getScalarSizeInBits();
unsigned NumElts = VT.getVectorNumElements();
+ // Expand v32i16/v64i8 without BWI.
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return SDValue();
+
// If the VSELECT is on a 512-bit type, we have to convert a non-i1 condition
// into an i1 condition so that we can use the mask-based 512-bit blend
// instructions.
@@ -17532,14 +18089,24 @@ SDValue X86TargetLowering::LowerVSELECT(SDValue Op, SelectionDAG &DAG) const {
static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
MVT VT = Op.getSimpleValueType();
+ SDValue Vec = Op.getOperand(0);
+ SDValue Idx = Op.getOperand(1);
+ assert(isa<ConstantSDNode>(Idx) && "Constant index expected");
SDLoc dl(Op);
- if (!Op.getOperand(0).getSimpleValueType().is128BitVector())
+ if (!Vec.getSimpleValueType().is128BitVector())
return SDValue();
if (VT.getSizeInBits() == 8) {
- SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ // If IdxVal is 0, it's cheaper to do a move instead of a pextrb, unless
+ // we're going to zero extend the register or fold the store.
+ if (llvm::isNullConstant(Idx) && !MayFoldIntoZeroExtend(Op) &&
+ !MayFoldIntoStore(Op))
+ return DAG.getNode(ISD::TRUNCATE, dl, MVT::i8,
+ DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
+ DAG.getBitcast(MVT::v4i32, Vec), Idx));
+
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRB, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -17552,22 +18119,17 @@ static SDValue LowerEXTRACT_VECTOR_ELT_SSE4(SDValue Op, SelectionDAG &DAG) {
if (!Op.hasOneUse())
return SDValue();
SDNode *User = *Op.getNode()->use_begin();
- if ((User->getOpcode() != ISD::STORE ||
- isNullConstant(Op.getOperand(1))) &&
+ if ((User->getOpcode() != ISD::STORE || isNullConstant(Idx)) &&
(User->getOpcode() != ISD::BITCAST ||
User->getValueType(0) != MVT::i32))
return SDValue();
SDValue Extract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
- DAG.getBitcast(MVT::v4i32, Op.getOperand(0)),
- Op.getOperand(1));
+ DAG.getBitcast(MVT::v4i32, Vec), Idx);
return DAG.getBitcast(MVT::f32, Extract);
}
- if (VT == MVT::i32 || VT == MVT::i64) {
- // ExtractPS/pextrq works with constant index.
- if (isa<ConstantSDNode>(Op.getOperand(1)))
+ if (VT == MVT::i32 || VT == MVT::i64)
return Op;
- }
return SDValue();
}
@@ -17580,6 +18142,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
SDLoc dl(Vec);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+ auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
MVT EltVT = Op.getSimpleValueType();
assert((VecVT.getVectorNumElements() <= 16 || Subtarget.hasBWI()) &&
@@ -17587,7 +18150,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
// variable index can't be handled in mask registers,
// extend vector to VR512/128
- if (!isa<ConstantSDNode>(Idx)) {
+ if (!IdxC) {
unsigned NumElts = VecVT.getVectorNumElements();
// Extending v8i1/v16i1 to 512-bit get better performance on KNL
// than extending to 128/256bit.
@@ -17598,7 +18161,7 @@ static SDValue ExtractBitFromMaskVector(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, dl, EltVT, Elt);
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = IdxC->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
@@ -17627,11 +18190,12 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
SDValue Vec = Op.getOperand(0);
MVT VecVT = Vec.getSimpleValueType();
SDValue Idx = Op.getOperand(1);
+ auto *IdxC = dyn_cast<ConstantSDNode>(Idx);
if (VecVT.getVectorElementType() == MVT::i1)
return ExtractBitFromMaskVector(Op, DAG, Subtarget);
- if (!isa<ConstantSDNode>(Idx)) {
+ if (!IdxC) {
// It's more profitable to go through memory (1 cycle throughput)
// than using a VMOVD + VPERMV/PSHUFB sequence (2/3 cycles throughput).
// The IACA tool was used to get the performance estimate.
@@ -17665,7 +18229,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
return SDValue();
}
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned IdxVal = IdxC->getZExtValue();
// If this is a 256-bit vector result, first extract the 128-bit vector and
// then extract the element from the 128-bit vector.
@@ -17697,9 +18261,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op,
DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i32,
DAG.getBitcast(MVT::v4i32, Vec), Idx));
- // Transform it so it match pextrw which produces a 32-bit result.
- SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32,
- Op.getOperand(0), Op.getOperand(1));
+ SDValue Extract = DAG.getNode(X86ISD::PEXTRW, dl, MVT::i32, Vec, Idx);
return DAG.getNode(ISD::TRUNCATE, dl, VT, Extract);
}
@@ -17789,9 +18351,7 @@ static SDValue InsertBitToMaskVector(SDValue Op, SelectionDAG &DAG,
// Copy into a k-register, extract to v1i1 and insert_subvector.
SDValue EltInVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v1i1, Elt);
-
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec,
- Op.getOperand(2));
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VecVT, Vec, EltInVec, Idx);
}
SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
@@ -17864,11 +18424,22 @@ SDValue X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op,
assert(VT.is128BitVector() && "Only 128-bit vector types should be left!");
// This will be just movd/movq/movss/movsd.
- if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode()) &&
- (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
- EltVT == MVT::i64)) {
- N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
- return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ if (IdxVal == 0 && ISD::isBuildVectorAllZeros(N0.getNode())) {
+ if (EltVT == MVT::i32 || EltVT == MVT::f32 || EltVT == MVT::f64 ||
+ EltVT == MVT::i64) {
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, N1);
+ return getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ }
+
+ // We can't directly insert an i8 or i16 into a vector, so zero extend
+ // it to i32 first.
+ if (EltVT == MVT::i16 || EltVT == MVT::i8) {
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, dl, MVT::i32, N1);
+ MVT ShufVT = MVT::getVectorVT(MVT::i32, VT.getSizeInBits()/32);
+ N1 = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, ShufVT, N1);
+ N1 = getShuffleVectorZeroOrUndef(N1, 0, true, Subtarget, DAG);
+ return DAG.getBitcast(VT, N1);
+ }
}
// Transform it so it matches pinsr{b,w}, which expects a GR32 as its second
@@ -17981,12 +18552,8 @@ static SDValue LowerEXTRACT_SUBVECTOR(SDValue Op, const X86Subtarget &Subtarget,
SDLoc dl(Op);
SDValue Vec = Op.getOperand(0);
- SDValue Idx = Op.getOperand(1);
-
- if (!isa<ConstantSDNode>(Idx))
- return SDValue();
+ uint64_t IdxVal = Op.getConstantOperandVal(1);
- unsigned IdxVal = cast<ConstantSDNode>(Idx)->getZExtValue();
if (IdxVal == 0) // the operation is legal
return Op;
@@ -18045,7 +18612,7 @@ X86TargetLowering::LowerConstantPool(SDValue Op, SelectionDAG &DAG) const {
auto PtrVT = getPointerTy(DAG.getDataLayout());
SDValue Result = DAG.getTargetConstantPool(
- CP->getConstVal(), PtrVT, CP->getAlignment(), CP->getOffset(), OpFlag);
+ CP->getConstVal(), PtrVT, CP->getAlign(), CP->getOffset(), OpFlag);
SDLoc DL(CP);
Result = DAG.getNode(getGlobalWrapperKind(), DL, PtrVT, Result);
// With PIC, the address is actually $g + Offset.
@@ -18554,25 +19121,47 @@ static SDValue LowerFunnelShift(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(IsFSHR ? X86ISD::VSHRDV : X86ISD::VSHLDV, DL, VT,
Op0, Op1, Amt);
}
-
- assert((VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
- "Unexpected funnel shift type!");
+ assert(
+ (VT == MVT::i8 || VT == MVT::i16 || VT == MVT::i32 || VT == MVT::i64) &&
+ "Unexpected funnel shift type!");
// Expand slow SHLD/SHRD cases if we are not optimizing for size.
bool OptForSize = DAG.shouldOptForSize();
- if (!OptForSize && Subtarget.isSHLDSlow())
- return SDValue();
+ bool ExpandFunnel = !OptForSize && Subtarget.isSHLDSlow();
- if (IsFSHR)
- std::swap(Op0, Op1);
+ // fshl(x,y,z) -> (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
+ // fshr(x,y,z) -> (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
+ if ((VT == MVT::i8 || (ExpandFunnel && VT == MVT::i16)) &&
+ !isa<ConstantSDNode>(Amt)) {
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
+ SDValue Mask = DAG.getConstant(EltSizeInBits - 1, DL, Amt.getValueType());
+ SDValue HiShift = DAG.getConstant(EltSizeInBits, DL, Amt.getValueType());
+ Op0 = DAG.getAnyExtOrTrunc(Op0, DL, MVT::i32);
+ Op1 = DAG.getZExtOrTrunc(Op1, DL, MVT::i32);
+ Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt, Mask);
+ SDValue Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Op0, HiShift);
+ Res = DAG.getNode(ISD::OR, DL, MVT::i32, Res, Op1);
+ if (IsFSHR) {
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, Amt);
+ } else {
+ Res = DAG.getNode(ISD::SHL, DL, MVT::i32, Res, Amt);
+ Res = DAG.getNode(ISD::SRL, DL, MVT::i32, Res, HiShift);
+ }
+ return DAG.getZExtOrTrunc(Res, DL, VT);
+ }
+
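
A scalar model of the two expansion formulas in the comment above (plain C++ on i8, helper names invented; not part of the lowering itself):

#include <cassert>
#include <cstdint>

// fshl(x,y,z): concatenate x:y into a wider register, shift left by z & 7,
// then take the high 8 bits -- i.e.
//   (((aext(x) << bw) | zext(y)) << (z & (bw-1))) >> bw.
static uint8_t fshl8(uint8_t X, uint8_t Y, uint8_t Z) {
  unsigned Amt = Z & 7;
  uint32_t Res = (uint32_t(X) << 8) | Y;
  return uint8_t((Res << Amt) >> 8);
}

// fshr(x,y,z): same concatenation, shifted right by z & 7 --
//   (((aext(x) << bw) | zext(y)) >> (z & (bw-1))).
static uint8_t fshr8(uint8_t X, uint8_t Y, uint8_t Z) {
  unsigned Amt = Z & 7;
  uint32_t Res = (uint32_t(X) << 8) | Y;
  return uint8_t(Res >> Amt);
}

int main() {
  // Reference semantics: fshl(a,b,s) = (a << s) | (b >> (8 - s)) for s != 0.
  assert(fshl8(0x81, 0x40, 1) == 0x02);
  assert(fshr8(0x81, 0x40, 1) == 0xA0);
  // A shift amount of 0 (mod 8) returns the first/second operand unchanged.
  assert(fshl8(0x81, 0x40, 8) == 0x81);
  assert(fshr8(0x81, 0x40, 8) == 0x40);
  return 0;
}
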
+ if (VT == MVT::i8 || ExpandFunnel)
+ return SDValue();
// i16 needs to modulo the shift amount, but i32/i64 have implicit modulo.
- if (VT == MVT::i16)
+ if (VT == MVT::i16) {
Amt = DAG.getNode(ISD::AND, DL, Amt.getValueType(), Amt,
DAG.getConstant(15, DL, Amt.getValueType()));
+ unsigned FSHOp = (IsFSHR ? X86ISD::FSHR : X86ISD::FSHL);
+ return DAG.getNode(FSHOp, DL, VT, Op0, Op1, Amt);
+ }
- unsigned SHDOp = (IsFSHR ? X86ISD::SHRD : X86ISD::SHLD);
- return DAG.getNode(SHDOp, DL, VT, Op0, Op1, Amt);
+ return Op;
}
// Try to use a packed vector operation to handle i64 on 32-bit targets when
@@ -18682,6 +19271,56 @@ static SDValue vectorizeExtractedCast(SDValue Cast, SelectionDAG &DAG,
DAG.getIntPtrConstant(0, DL));
}
+/// Given a scalar cast to FP with a cast to integer operand (almost an ftrunc),
+/// try to vectorize the cast ops. This will avoid an expensive round-trip
+/// between XMM and GPR.
+static SDValue lowerFPToIntToFP(SDValue CastToFP, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Allow FP_TO_UINT.
+ SDValue CastToInt = CastToFP.getOperand(0);
+ MVT VT = CastToFP.getSimpleValueType();
+ if (CastToInt.getOpcode() != ISD::FP_TO_SINT || VT.isVector())
+ return SDValue();
+
+ MVT IntVT = CastToInt.getSimpleValueType();
+ SDValue X = CastToInt.getOperand(0);
+ MVT SrcVT = X.getSimpleValueType();
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64)
+ return SDValue();
+
+ // See if we have 128-bit vector cast instructions for this type of cast.
+ // We need cvttps2dq/cvttpd2dq and cvtdq2ps/cvtdq2pd.
+ if (!Subtarget.hasSSE2() || (VT != MVT::f32 && VT != MVT::f64) ||
+ IntVT != MVT::i32)
+ return SDValue();
+
+ unsigned SrcSize = SrcVT.getSizeInBits();
+ unsigned IntSize = IntVT.getSizeInBits();
+ unsigned VTSize = VT.getSizeInBits();
+ MVT VecSrcVT = MVT::getVectorVT(SrcVT, 128 / SrcSize);
+ MVT VecIntVT = MVT::getVectorVT(IntVT, 128 / IntSize);
+ MVT VecVT = MVT::getVectorVT(VT, 128 / VTSize);
+
+ // We need target-specific opcodes if this is v2f64 -> v4i32 -> v2f64.
+ unsigned ToIntOpcode =
+ SrcSize != IntSize ? X86ISD::CVTTP2SI : (unsigned)ISD::FP_TO_SINT;
+ unsigned ToFPOpcode =
+ IntSize != VTSize ? X86ISD::CVTSI2P : (unsigned)ISD::SINT_TO_FP;
+
+ // sint_to_fp (fp_to_sint X) --> extelt (sint_to_fp (fp_to_sint (s2v X))), 0
+ //
+ // We are not defining the high elements (for example, zeroing them) because
+ // that could nullify any performance advantage that we hoped to gain from
+ // this vector op hack. We do not expect any adverse effects (like denorm
+ // penalties) with cast ops.
+ SDLoc DL(CastToFP);
+ SDValue ZeroIdx = DAG.getIntPtrConstant(0, DL);
+ SDValue VecX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecSrcVT, X);
+ SDValue VCastToInt = DAG.getNode(ToIntOpcode, DL, VecIntVT, VecX);
+ SDValue VCastToFP = DAG.getNode(ToFPOpcode, DL, VecVT, VCastToInt);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, VCastToFP, ZeroIdx);
+}
+
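
For reference, the round trip this transform targets can be kept entirely in an XMM register with plain SSE2 intrinsics; a minimal host-side sketch, assuming an SSE2-capable x86 target (function name invented):

#include <immintrin.h>

// Truncate a float to int and convert back to float without bouncing the
// value through a GPR: cvttps2dq followed by cvtdq2ps, using only lane 0.
static float trunc_via_int(float X) {
  __m128 V = _mm_set_ss(X);        // X in lane 0, upper lanes zero
  __m128i I = _mm_cvttps_epi32(V); // cvttps2dq (FP_TO_SINT)
  __m128 R = _mm_cvtepi32_ps(I);   // cvtdq2ps  (SINT_TO_FP)
  return _mm_cvtss_f32(R);         // read back lane 0
}
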
static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc DL(Op);
@@ -18739,15 +19378,15 @@ static SDValue lowerINT_TO_FP_vXi64(SDValue Op, SelectionDAG &DAG,
SmallVector<SDValue, 4> SignCvts(4);
SmallVector<SDValue, 4> Chains(4);
for (int i = 0; i != 4; ++i) {
- SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::i64, SignSrc,
DAG.getIntPtrConstant(i, DL));
if (IsStrict) {
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, DL, {MVT::f32, MVT::Other},
- {Op.getOperand(0), Src});
+ {Op.getOperand(0), Elt});
Chains[i] = SignCvts[i].getValue(1);
} else {
- SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Src);
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, DL, MVT::f32, Elt);
}
}
SDValue SignCvt = DAG.getBuildVector(VT, DL, SignCvts);
@@ -18784,6 +19423,9 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
if (SDValue Extract = vectorizeExtractedCast(Op, DAG, Subtarget))
return Extract;
+ if (SDValue R = lowerFPToIntToFP(Op, DAG, Subtarget))
+ return R;
+
if (SrcVT.isVector()) {
if (SrcVT == MVT::v2i32 && VT == MVT::v2f64) {
// Note: Since v2f64 is a legal type. We don't need to zero extend the
@@ -18832,21 +19474,23 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return LowerF128Call(Op, DAG, RTLIB::getSINTTOFP(SrcVT, VT));
SDValue ValueToStore = Src;
- if (SrcVT == MVT::i64 && UseSSEReg && !Subtarget.is64Bit())
+ if (SrcVT == MVT::i64 && Subtarget.hasSSE2() && !Subtarget.is64Bit())
// Bitcasting to f64 here allows us to do a single 64-bit store from
// an SSE register, avoiding the store forwarding penalty that would come
// with two 32-bit stores.
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
- unsigned Size = SrcVT.getSizeInBits()/8;
+ unsigned Size = SrcVT.getStoreSize();
+ Align Alignment(Size);
MachineFunction &MF = DAG.getMachineFunction();
auto PtrVT = getPointerTy(MF.getDataLayout());
- int SSFI = MF.getFrameInfo().CreateStackObject(Size, Size, false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(Size, Alignment, false);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
- Chain = DAG.getStore(
- Chain, dl, ValueToStore, StackSlot,
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
- std::pair<SDValue, SDValue> Tmp = BuildFILD(Op, SrcVT, Chain, StackSlot, DAG);
+ Chain = DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Alignment);
+ std::pair<SDValue, SDValue> Tmp =
+ BuildFILD(VT, SrcVT, dl, Chain, StackSlot, MPI, Alignment, DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -18854,58 +19498,40 @@ SDValue X86TargetLowering::LowerSINT_TO_FP(SDValue Op,
return Tmp.first;
}
-std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
- SDValue StackSlot,
- SelectionDAG &DAG) const {
+std::pair<SDValue, SDValue> X86TargetLowering::BuildFILD(
+ EVT DstVT, EVT SrcVT, const SDLoc &DL, SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo, Align Alignment, SelectionDAG &DAG) const {
// Build the FILD
- SDLoc DL(Op);
SDVTList Tys;
- bool useSSE = isScalarFPTypeInSSEReg(Op.getValueType());
+ bool useSSE = isScalarFPTypeInSSEReg(DstVT);
if (useSSE)
- Tys = DAG.getVTList(MVT::f64, MVT::Other, MVT::Glue);
+ Tys = DAG.getVTList(MVT::f80, MVT::Other);
else
- Tys = DAG.getVTList(Op.getValueType(), MVT::Other);
+ Tys = DAG.getVTList(DstVT, MVT::Other);
- unsigned ByteSize = SrcVT.getSizeInBits() / 8;
-
- FrameIndexSDNode *FI = dyn_cast<FrameIndexSDNode>(StackSlot);
- MachineMemOperand *LoadMMO;
- if (FI) {
- int SSFI = FI->getIndex();
- LoadMMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOLoad, ByteSize, ByteSize);
- } else {
- LoadMMO = cast<LoadSDNode>(StackSlot)->getMemOperand();
- StackSlot = StackSlot.getOperand(1);
- }
- SDValue FILDOps[] = {Chain, StackSlot};
+ SDValue FILDOps[] = {Chain, Pointer};
SDValue Result =
- DAG.getMemIntrinsicNode(useSSE ? X86ISD::FILD_FLAG : X86ISD::FILD, DL,
- Tys, FILDOps, SrcVT, LoadMMO);
+ DAG.getMemIntrinsicNode(X86ISD::FILD, DL, Tys, FILDOps, SrcVT, PtrInfo,
+ Alignment, MachineMemOperand::MOLoad);
Chain = Result.getValue(1);
if (useSSE) {
- SDValue InFlag = Result.getValue(2);
-
- // FIXME: Currently the FST is glued to the FILD_FLAG. This
- // shouldn't be necessary except that RFP cannot be live across
- // multiple blocks. When stackifier is fixed, they can be uncoupled.
MachineFunction &MF = DAG.getMachineFunction();
- unsigned SSFISize = Op.getValueSizeInBits() / 8;
- int SSFI = MF.getFrameInfo().CreateStackObject(SSFISize, SSFISize, false);
+ unsigned SSFISize = DstVT.getStoreSize();
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(SSFISize, Align(SSFISize), false);
auto PtrVT = getPointerTy(MF.getDataLayout());
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Tys = DAG.getVTList(MVT::Other);
- SDValue FSTOps[] = {Chain, Result, StackSlot, InFlag};
+ SDValue FSTOps[] = {Chain, Result, StackSlot};
MachineMemOperand *StoreMMO = DAG.getMachineFunction().getMachineMemOperand(
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOStore, SSFISize, SSFISize);
+ MachineMemOperand::MOStore, SSFISize, Align(SSFISize));
- Chain = DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps,
- Op.getValueType(), StoreMMO);
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FST, DL, Tys, FSTOps, DstVT, StoreMMO);
Result = DAG.getLoad(
- Op.getValueType(), DL, Chain, StackSlot,
+ DstVT, DL, Chain, StackSlot,
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI));
Chain = Result.getValue(1);
}
@@ -18948,7 +19574,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
static const uint32_t CV0[] = { 0x43300000, 0x45300000, 0, 0 };
Constant *C0 = ConstantDataVector::get(*Context, CV0);
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, 16);
+ SDValue CPIdx0 = DAG.getConstantPool(C0, PtrVT, Align(16));
SmallVector<Constant*,2> CV1;
CV1.push_back(
@@ -18958,7 +19584,7 @@ static SDValue LowerUINT_TO_FP_i64(SDValue Op, SelectionDAG &DAG,
ConstantFP::get(*Context, APFloat(APFloat::IEEEdouble(),
APInt(64, 0x4530000000000000ULL))));
Constant *C1 = ConstantVector::get(CV1);
- SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, 16);
+ SDValue CPIdx1 = DAG.getConstantPool(C1, PtrVT, Align(16));
// Load the 64-bit value into an XMM register.
SDValue XR1 =
@@ -19163,13 +19789,13 @@ static SDValue lowerUINT_TO_FP_vXi32(SDValue Op, SelectionDAG &DAG,
*DAG.getContext(),
APFloat(APFloat::IEEEdouble(), APInt(64, 0x4330000000000000ULL)));
auto PtrVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
- SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, /*Alignment*/ 8);
+ SDValue CPIdx = DAG.getConstantPool(Bias, PtrVT, Align(8));
SDVTList Tys = DAG.getVTList(MVT::v4f64, MVT::Other);
SDValue Ops[] = {DAG.getEntryNode(), CPIdx};
SDValue VBias = DAG.getMemIntrinsicNode(
X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
- MachinePointerInfo::getConstantPool(DAG.getMachineFunction()),
- /*Alignment*/ 8, MachineMemOperand::MOLoad);
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), Align(8),
+ MachineMemOperand::MOLoad);
SDValue Or = DAG.getNode(ISD::OR, DL, MVT::v4i64, ZExtIn,
DAG.getBitcast(MVT::v4i64, VBias));
@@ -19337,15 +19963,18 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
return SDValue();
// Make a 64-bit buffer, and use it to build an FILD.
- SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64);
+ SDValue StackSlot = DAG.CreateStackTemporary(MVT::i64, 8);
+ int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI);
if (SrcVT == MVT::i32) {
SDValue OffsetSlot = DAG.getMemBasePlusOffset(StackSlot, 4, dl);
SDValue Store1 =
- DAG.getStore(Chain, dl, Src, StackSlot, MachinePointerInfo());
+ DAG.getStore(Chain, dl, Src, StackSlot, MPI, 8 /*Align*/);
SDValue Store2 = DAG.getStore(Store1, dl, DAG.getConstant(0, dl, MVT::i32),
- OffsetSlot, MachinePointerInfo());
+ OffsetSlot, MPI.getWithOffset(4), 4);
std::pair<SDValue, SDValue> Tmp =
- BuildFILD(Op, MVT::i64, Store2, StackSlot, DAG);
+ BuildFILD(DstVT, MVT::i64, dl, Store2, StackSlot, MPI, Align(8), DAG);
if (IsStrict)
return DAG.getMergeValues({Tmp.first, Tmp.second}, dl);
@@ -19361,21 +19990,17 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
ValueToStore = DAG.getBitcast(MVT::f64, ValueToStore);
}
SDValue Store =
- DAG.getStore(Chain, dl, ValueToStore, StackSlot, MachinePointerInfo());
+ DAG.getStore(Chain, dl, ValueToStore, StackSlot, MPI, Align(8));
// For i64 source, we need to add the appropriate power of 2 if the input
// was negative. This is the same as the optimization in
// DAGTypeLegalizer::ExpandIntOp_UINT_TO_FP, and for it to be safe here,
// we must be careful to do the computation in x87 extended precision, not
// in SSE. (The generic code can't know it's OK to do this, or how to.)
- int SSFI = cast<FrameIndexSDNode>(StackSlot)->getIndex();
- MachineMemOperand *MMO = DAG.getMachineFunction().getMachineMemOperand(
- MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SSFI),
- MachineMemOperand::MOLoad, 8, 8);
-
SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Store, StackSlot };
- SDValue Fild = DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops,
- MVT::i64, MMO);
+ SDValue Fild =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, Ops, MVT::i64, MPI,
+ Align(8), MachineMemOperand::MOLoad);
Chain = Fild.getValue(1);
@@ -19388,6 +20013,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
APInt FF(64, 0x5F80000000000000ULL);
SDValue FudgePtr = DAG.getConstantPool(
ConstantInt::get(*DAG.getContext(), FF), PtrVT);
+ Align CPAlignment = cast<ConstantPoolSDNode>(FudgePtr)->getAlign();
// Get a pointer to FF if the sign bit was set, or to 0 otherwise.
SDValue Zero = DAG.getIntPtrConstant(0, dl);
@@ -19399,7 +20025,7 @@ SDValue X86TargetLowering::LowerUINT_TO_FP(SDValue Op,
SDValue Fudge = DAG.getExtLoad(
ISD::EXTLOAD, dl, MVT::f80, Chain, FudgePtr,
MachinePointerInfo::getConstantPool(DAG.getMachineFunction()), MVT::f32,
- /* Alignment = */ 4);
+ CPAlignment);
Chain = Fudge.getValue(1);
// Extend everything to 80 bits to force it to be done on x87.
// TODO: Are there any fast-math-flags to propagate here?
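
Numerically, the fudge-factor trick amounts to a signed conversion plus 2^64 whenever the sign bit was set; a minimal model using long double to stand in for the x87 extended precision mentioned above (helper name invented):

#include <cassert>
#include <cstdint>

// uint64 -> FP via a signed conversion (what FILD provides) plus a 2^64
// correction when the value was interpreted as negative. On typical x86
// Linux/macOS targets long double is the 80-bit x87 format, so both results
// below are exact.
static long double u64_to_fp(uint64_t U) {
  long double D = (long double)(int64_t)U; // signed conversion
  if ((int64_t)U < 0)
    D += 0x1p64L;                          // add back 2^64
  return D;
}

int main() {
  assert(u64_to_fp(0x8000000000000000ULL) == 0x1p63L);
  assert(u64_to_fp(0xFFFFFFFFFFFFFFFFULL) == 0x1p64L - 1.0L);
  return 0;
}
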
@@ -19462,7 +20088,8 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
// stack slot.
MachineFunction &MF = DAG.getMachineFunction();
unsigned MemSize = DstTy.getStoreSize();
- int SSFI = MF.getFrameInfo().CreateStackObject(MemSize, MemSize, false);
+ int SSFI =
+ MF.getFrameInfo().CreateStackObject(MemSize, Align(MemSize), false);
SDValue StackSlot = DAG.getFrameIndex(SSFI, PtrVT);
Chain = IsStrict ? Op.getOperand(0) : DAG.getEntryNode();
@@ -19537,20 +20164,20 @@ X86TargetLowering::FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG,
if (isScalarFPTypeInSSEReg(TheVT)) {
assert(DstTy == MVT::i64 && "Invalid FP_TO_SINT to lower!");
Chain = DAG.getStore(Chain, DL, Value, StackSlot, MPI);
- SDVTList Tys = DAG.getVTList(TheVT, MVT::Other);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Chain, StackSlot };
unsigned FLDSize = TheVT.getStoreSize();
assert(FLDSize <= MemSize && "Stack slot not big enough");
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPI, MachineMemOperand::MOLoad, FLDSize, FLDSize);
+ MPI, MachineMemOperand::MOLoad, FLDSize, Align(FLDSize));
Value = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, TheVT, MMO);
Chain = Value.getValue(1);
}
// Build the FP_TO_INT*_IN_MEM
MachineMemOperand *MMO = MF.getMachineMemOperand(
- MPI, MachineMemOperand::MOStore, MemSize, MemSize);
+ MPI, MachineMemOperand::MOStore, MemSize, Align(MemSize));
SDValue Ops[] = { Chain, Value, StackSlot };
SDValue FIST = DAG.getMemIntrinsicNode(X86ISD::FP_TO_INT_IN_MEM, DL,
DAG.getVTList(MVT::Other),
@@ -19590,14 +20217,9 @@ static SDValue LowerAVXExtend(SDValue Op, SelectionDAG &DAG,
unsigned ExtendInVecOpc = getOpcode_EXTEND_VECTOR_INREG(Opc);
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ExtendInVecOpc, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
@@ -19729,7 +20351,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
"Unexpected PACK opcode");
assert(DstVT.isVector() && "VT not a vector?");
- // Requires SSE2 but AVX512 has fast vector truncate.
+ // Requires SSE2 for PACKSS (SSE41 PACKUSDW is handled below).
if (!Subtarget.hasSSE2())
return SDValue();
@@ -19770,15 +20392,14 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
InVT = EVT::getVectorVT(Ctx, InVT, 128 / InVT.getSizeInBits());
OutVT = EVT::getVectorVT(Ctx, OutVT, 128 / OutVT.getSizeInBits());
In = DAG.getBitcast(InVT, In);
- SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, In);
+ SDValue Res = DAG.getNode(Opcode, DL, OutVT, In, DAG.getUNDEF(InVT));
Res = extractSubVector(Res, 0, DAG, DL, 64);
return DAG.getBitcast(DstVT, Res);
}
- // Extract lower/upper subvectors.
- unsigned NumSubElts = NumElems / 2;
- SDValue Lo = extractSubVector(In, 0 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
- SDValue Hi = extractSubVector(In, 1 * NumSubElts, DAG, DL, SrcSizeInBits / 2);
+ // Split lower/upper subvectors.
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(In, DAG, DL);
unsigned SubSizeInBits = SrcSizeInBits / 2;
InVT = EVT::getVectorVT(Ctx, InVT, SubSizeInBits / InVT.getSizeInBits());
@@ -19804,7 +20425,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Scale shuffle mask to avoid bitcasts and help ComputeNumSignBits.
SmallVector<int, 64> Mask;
int Scale = 64 / OutVT.getScalarSizeInBits();
- scaleShuffleMask<int>(Scale, ArrayRef<int>({ 0, 2, 1, 3 }), Mask);
+ narrowShuffleMaskElts(Scale, { 0, 2, 1, 3 }, Mask);
Res = DAG.getVectorShuffle(OutVT, DL, Res, Res, Mask);
if (DstVT.is256BitVector())
@@ -19818,7 +20439,7 @@ static SDValue truncateVectorWithPACK(unsigned Opcode, EVT DstVT, SDValue In,
// Recursively pack lower/upper subvectors, concat result and pack again.
assert(SrcSizeInBits >= 256 && "Expected 256-bit vector or greater");
- EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumSubElts);
+ EVT PackedVT = EVT::getVectorVT(Ctx, PackedSVT, NumElems / 2);
Lo = truncateVectorWithPACK(Opcode, PackedVT, Lo, DL, DAG, Subtarget);
Hi = truncateVectorWithPACK(Opcode, PackedVT, Hi, DL, DAG, Subtarget);
@@ -19865,17 +20486,22 @@ static SDValue LowerTruncateVecI1(SDValue Op, SelectionDAG &DAG,
// trying to avoid 512-bit vectors. If we are avoiding 512-bit vectors
// we need to split into two 8 element vectors which we can extend to v8i32,
// truncate and concat the results. There's an additional complication if
- // the original type is v16i8. In that case we can't split the v16i8 so
- // first we pre-extend it to v16i16 which we can split to v8i16, then extend
- // to v8i32, truncate that to v8i1 and concat the two halves.
+ // the original type is v16i8. In that case we can't split the v16i8
+ // directly, so we need to shuffle high elements to low and use
+ // sign_extend_vector_inreg.
if (NumElts == 16 && !Subtarget.canExtendTo512DQ()) {
+ SDValue Lo, Hi;
if (InVT == MVT::v16i8) {
- // First we need to sign extend up to 256-bits so we can split that.
- InVT = MVT::v16i16;
- In = DAG.getNode(ISD::SIGN_EXTEND, DL, InVT, In);
+ Lo = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, In);
+ Hi = DAG.getVectorShuffle(
+ InVT, DL, In, In,
+ {8, 9, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1});
+ Hi = DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, MVT::v8i32, Hi);
+ } else {
+ assert(InVT == MVT::v16i16 && "Unexpected VT!");
+ Lo = extract128BitVector(In, 0, DAG, DL);
+ Hi = extract128BitVector(In, 8, DAG, DL);
}
- SDValue Lo = extract128BitVector(In, 0, DAG, DL);
- SDValue Hi = extract128BitVector(In, 8, DAG, DL);
// We're split now, just emit two truncates and a concat. The two
// truncates will trigger legalization to come back to this function.
Lo = DAG.getNode(ISD::TRUNCATE, DL, MVT::v8i1, Lo);
@@ -19918,7 +20544,8 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
if (!TLI.isTypeLegal(InVT)) {
if ((InVT == MVT::v8i64 || InVT == MVT::v16i32 || InVT == MVT::v16i64) &&
VT.is128BitVector()) {
- assert(Subtarget.hasVLX() && "Unexpected subtarget!");
+ assert((InVT == MVT::v16i64 || Subtarget.hasVLX()) &&
+ "Unexpected subtarget!");
// The default behavior is to truncate one step, concatenate, and then
// truncate the remainder. We'd rather produce two 64-bit results and
// concatenate those.
@@ -19942,6 +20569,11 @@ SDValue X86TargetLowering::LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const {
// vpmovqb/w/d, vpmovdb/w, vpmovwb
if (Subtarget.hasAVX512()) {
+ if (InVT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(VT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
+ }
+
// word to byte only under BWI. Otherwise we have to promote to v16i32
// and then truncate that. But we should only do that if we haven't been
// asked to avoid 512-bit vectors. The actual promotion to v16i32 will be
@@ -20174,6 +20806,25 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
}
if (VT == MVT::v2i64 && SrcVT == MVT::v2f32) {
+ if (!Subtarget.hasVLX()) {
+ // Non-strict nodes without VLX can be widened to v4f32->v4i64 by type
+ // legalizer and then widened again by vector op legalization.
+ if (!IsStrict)
+ return SDValue();
+
+ SDValue Zero = DAG.getConstantFP(0.0, dl, MVT::v2f32);
+ SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8f32,
+ {Src, Zero, Zero, Zero});
+ Tmp = DAG.getNode(Op.getOpcode(), dl, {MVT::v8i64, MVT::Other},
+ {Op->getOperand(0), Tmp});
+ SDValue Chain = Tmp.getValue(1);
+ Tmp = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2i64, Tmp,
+ DAG.getIntPtrConstant(0, dl));
+ if (IsStrict)
+ return DAG.getMergeValues({Tmp, Chain}, dl);
+ return Tmp;
+ }
+
assert(Subtarget.hasDQI() && Subtarget.hasVLX() && "Requires AVX512DQVL");
SDValue Tmp = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
DAG.getUNDEF(MVT::v2f32));
@@ -20281,6 +20932,62 @@ SDValue X86TargetLowering::LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const {
llvm_unreachable("Expected FP_TO_INTHelper to handle all remaining cases.");
}
+SDValue X86TargetLowering::LowerLRINT_LLRINT(SDValue Op,
+ SelectionDAG &DAG) const {
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // If the source is in an SSE register, the node is Legal.
+ if (isScalarFPTypeInSSEReg(SrcVT))
+ return Op;
+
+ return LRINT_LLRINTHelper(Op.getNode(), DAG);
+}
+
+SDValue X86TargetLowering::LRINT_LLRINTHelper(SDNode *N,
+ SelectionDAG &DAG) const {
+ EVT DstVT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (SrcVT != MVT::f32 && SrcVT != MVT::f64 && SrcVT != MVT::f80) {
+ // f16 must be promoted before using the lowering in this routine.
+ // fp128 does not use this lowering.
+ return SDValue();
+ }
+
+ SDLoc DL(N);
+ SDValue Chain = DAG.getEntryNode();
+
+ bool UseSSE = isScalarFPTypeInSSEReg(SrcVT);
+
+ // If we're converting from SSE, the stack slot needs to hold both types.
+ // Otherwise it only needs to hold the DstVT.
+ EVT OtherVT = UseSSE ? SrcVT : DstVT;
+ SDValue StackPtr = DAG.CreateStackTemporary(DstVT, OtherVT);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+
+ if (UseSSE) {
+ assert(DstVT == MVT::i64 && "Invalid LRINT/LLRINT to lower!");
+ Chain = DAG.getStore(Chain, DL, Src, StackPtr, MPI);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue Ops[] = { Chain, StackPtr };
+
+ Src = DAG.getMemIntrinsicNode(X86ISD::FLD, DL, Tys, Ops, SrcVT, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Src.getValue(1);
+ }
+
+ SDValue StoreOps[] = { Chain, Src, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, DL, DAG.getVTList(MVT::Other),
+ StoreOps, DstVT, MPI, /*Align*/ None,
+ MachineMemOperand::MOStore);
+
+ return DAG.getLoad(DstVT, DL, Chain, StackPtr, MPI);
+}
+
SDValue X86TargetLowering::LowerFP_EXTEND(SDValue Op, SelectionDAG &DAG) const {
bool IsStrict = Op->isStrictFPOpcode();
@@ -20333,6 +21040,67 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
return Tmp.first;
}
+static SDValue LowerFP16_TO_FP(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::i16 && Op.getValueType() == MVT::f32 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v8i16,
+ DAG.getConstant(0, dl, MVT::v8i16), Src,
+ DAG.getIntPtrConstant(0, dl));
+
+ SDValue Chain;
+ if (IsStrict) {
+ Res = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {MVT::v4f32, MVT::Other},
+ {Op.getOperand(0), Res});
+ Chain = Res.getValue(1);
+ } else {
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
+static SDValue LowerFP_TO_FP16(SDValue Op, SelectionDAG &DAG) {
+ bool IsStrict = Op->isStrictFPOpcode();
+ SDValue Src = Op.getOperand(IsStrict ? 1 : 0);
+ assert(Src.getValueType() == MVT::f32 && Op.getValueType() == MVT::i16 &&
+ "Unexpected VT!");
+
+ SDLoc dl(Op);
+ SDValue Res, Chain;
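+  // Note: the immediate 4 passed to (STRICT_)CVTPS2PH below has bit 2 set,
+  // which tells the instruction to round using the current MXCSR rounding
+  // mode rather than a rounding control encoded in the immediate.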
+ if (IsStrict) {
+ Res = DAG.getNode(ISD::INSERT_VECTOR_ELT, dl, MVT::v4f32,
+ DAG.getConstantFP(0, dl, MVT::v4f32), Src,
+ DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getNode(
+ X86ISD::STRICT_CVTPS2PH, dl, {MVT::v8i16, MVT::Other},
+ {Op.getOperand(0), Res, DAG.getTargetConstant(4, dl, MVT::i32)});
+ Chain = Res.getValue(1);
+ } else {
+ // FIXME: Should we use zeros for upper elements for non-strict?
+ Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32, Src);
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ }
+
+ Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i16, Res,
+ DAG.getIntPtrConstant(0, dl));
+
+ if (IsStrict)
+ return DAG.getMergeValues({Res, Chain}, dl);
+
+ return Res;
+}
+
/// Depending on uarch and/or optimizing for size, we might prefer to use a
/// vector operation in place of the typical scalar operation.
static SDValue lowerAddSubToHorizontalOp(SDValue Op, SelectionDAG &DAG,
@@ -20413,6 +21181,30 @@ SDValue X86TargetLowering::lowerFaddFsub(SDValue Op, SelectionDAG &DAG) const {
return lowerAddSubToHorizontalOp(Op, DAG, Subtarget);
}
+/// ISD::FROUND is defined to round to nearest with ties rounding away from 0.
+/// This mode isn't supported in hardware on X86. But as long as we aren't
+/// compiling with trapping math, we can emulate this with
+/// trunc(X + copysign(nextafter(0.5, 0.0), X)).
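+/// For example, with X == 0.5f the adder is nextafter(0.5f, 0.0f) ==
+/// 0x1.fffffep-2f; the sum rounds up to 1.0f and truncation yields 1.0, as
+/// required for ties away from zero. Using 0.5 itself would instead round the
+/// largest float below 0.5 up to 1.0 before truncation, which would be wrong.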
+static SDValue LowerFROUND(SDValue Op, SelectionDAG &DAG) {
+ SDValue N0 = Op.getOperand(0);
+ SDLoc dl(Op);
+ MVT VT = Op.getSimpleValueType();
+
+ // N0 += copysign(nextafter(0.5, 0.0), N0)
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(VT);
+ bool Ignored;
+ APFloat Point5Pred = APFloat(0.5f);
+ Point5Pred.convert(Sem, APFloat::rmNearestTiesToEven, &Ignored);
+ Point5Pred.next(/*nextDown*/true);
+
+ SDValue Adder = DAG.getNode(ISD::FCOPYSIGN, dl, VT,
+ DAG.getConstantFP(Point5Pred, dl, VT), N0);
+ N0 = DAG.getNode(ISD::FADD, dl, VT, N0, Adder);
+
+  // Truncate the result to remove the fractional part.
+ return DAG.getNode(ISD::FTRUNC, dl, VT, N0);
+}
+
/// The only differences between FABS and FNEG are the mask and the logic op.
/// FNEG also has a folding opportunity for FNEG(FABS(x)).
static SDValue LowerFABSorFNEG(SDValue Op, SelectionDAG &DAG) {
@@ -20568,9 +21360,12 @@ static SDValue getSETCC(X86::CondCode Cond, SDValue EFLAGS, const SDLoc &dl,
}
/// Helper for matching OR(EXTRACTELT(X,0),OR(EXTRACTELT(X,1),...))
-/// style scalarized (associative) reduction patterns.
+/// style scalarized (associative) reduction patterns. Partial reductions
+/// are supported when the pointer SrcMask is non-null.
+/// TODO - move this to SelectionDAG?
static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
- SmallVectorImpl<SDValue> &SrcOps) {
+ SmallVectorImpl<SDValue> &SrcOps,
+ SmallVectorImpl<APInt> *SrcMask = nullptr) {
SmallVector<SDValue, 8> Opnds;
DenseMap<SDValue, APInt> SrcOpMap;
EVT VT = MVT::Other;
@@ -20598,8 +21393,8 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
return false;
// Quit if without a constant index.
- SDValue Idx = I->getOperand(1);
- if (!isa<ConstantSDNode>(Idx))
+ auto *Idx = dyn_cast<ConstantSDNode>(I->getOperand(1));
+ if (!Idx)
return false;
SDValue Src = I->getOperand(0);
@@ -20615,61 +21410,167 @@ static bool matchScalarReduction(SDValue Op, ISD::NodeType BinOp,
M = SrcOpMap.insert(std::make_pair(Src, EltCount)).first;
SrcOps.push_back(Src);
}
+
// Quit if element already used.
- unsigned CIdx = cast<ConstantSDNode>(Idx)->getZExtValue();
+ unsigned CIdx = Idx->getZExtValue();
if (M->second[CIdx])
return false;
M->second.setBit(CIdx);
}
- // Quit if not all elements are used.
- for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
- E = SrcOpMap.end();
- I != E; ++I) {
- if (!I->second.isAllOnesValue())
- return false;
+ if (SrcMask) {
+ // Collect the source partial masks.
+ for (SDValue &SrcOp : SrcOps)
+ SrcMask->push_back(SrcOpMap[SrcOp]);
+ } else {
+ // Quit if not all elements are used.
+ for (DenseMap<SDValue, APInt>::const_iterator I = SrcOpMap.begin(),
+ E = SrcOpMap.end();
+ I != E; ++I) {
+ if (!I->second.isAllOnesValue())
+ return false;
+ }
}
return true;
}
-// Check whether an OR'd tree is PTEST-able.
-static SDValue LowerVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+// Helper function for comparing all bits of a vector against zero.
+static SDValue LowerVectorAllZero(const SDLoc &DL, SDValue V, ISD::CondCode CC,
+ const APInt &Mask,
+ const X86Subtarget &Subtarget,
+ SelectionDAG &DAG, X86::CondCode &X86CC) {
+ EVT VT = V.getValueType();
+ assert(Mask.getBitWidth() == VT.getScalarSizeInBits() &&
+ "Element Mask vs Vector bitwidth mismatch");
+
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
+ X86CC = (CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE);
+
+ auto MaskBits = [&](SDValue Src) {
+ if (Mask.isAllOnesValue())
+ return Src;
+ EVT SrcVT = Src.getValueType();
+ SDValue MaskValue = DAG.getConstant(Mask, DL, SrcVT);
+ return DAG.getNode(ISD::AND, DL, SrcVT, Src, MaskValue);
+ };
+
+ // For sub-128-bit vector, cast to (legal) integer and compare with zero.
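+  // (e.g. a v4i16 input is bitcast to i64 and compared directly against 0).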
+ if (VT.getSizeInBits() < 128) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getSizeInBits());
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(IntVT))
+ return SDValue();
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getBitcast(IntVT, MaskBits(V)),
+ DAG.getConstant(0, DL, IntVT));
+ }
+
+ // Quit if not splittable to 128/256-bit vector.
+ if (!isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
+
+ // Split down to 128/256-bit vector.
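+  // (e.g. a v8i64 input is OR-reduced down to v4i64 with AVX, or to v2i64
+  // without it, before the final test).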
+ unsigned TestSize = Subtarget.hasAVX() ? 256 : 128;
+ while (VT.getSizeInBits() > TestSize) {
+ auto Split = DAG.SplitVector(V, DL);
+ VT = Split.first.getValueType();
+ V = DAG.getNode(ISD::OR, DL, VT, Split.first, Split.second);
+ }
+
+ bool UsePTEST = Subtarget.hasSSE41();
+ if (UsePTEST) {
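+    // PTEST sets ZF iff (V & V) == 0, i.e. iff every bit of V is zero, which
+    // matches the COND_E/COND_NE code selected above.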
+ MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ V = DAG.getBitcast(TestVT, MaskBits(V));
+ return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, V, V);
+ }
+
+ // Without PTEST, a masked v2i64 or-reduction is not faster than
+ // scalarization.
+ if (!Mask.isAllOnesValue() && VT.getScalarSizeInBits() > 32)
+ return SDValue();
+
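+  // Compare every byte of the (masked) vector against zero, gather the
+  // per-byte results with MOVMSK, and check that all 16 lanes compared equal
+  // (mask == 0xFFFF), which holds exactly when the whole vector is zero.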
+ V = DAG.getBitcast(MVT::v16i8, MaskBits(V));
+ V = DAG.getNode(X86ISD::PCMPEQ, DL, MVT::v16i8, V,
+ getZeroVector(MVT::v16i8, Subtarget, DAG, DL));
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, V);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, V,
+ DAG.getConstant(0xFFFF, DL, MVT::i32));
+}
+
+// Check whether an OR'd reduction tree is PTEST-able, or if we can fall back to
+// CMP(MOVMSK(PCMPEQB(X,0))).
+static SDValue MatchVectorAllZeroTest(SDValue Op, ISD::CondCode CC,
+ const SDLoc &DL,
const X86Subtarget &Subtarget,
SelectionDAG &DAG, SDValue &X86CC) {
- assert(Op.getOpcode() == ISD::OR && "Only check OR'd tree.");
+ assert((CC == ISD::SETEQ || CC == ISD::SETNE) && "Unsupported ISD::CondCode");
- if (!Subtarget.hasSSE41() || !Op->hasOneUse())
+ if (!Subtarget.hasSSE2() || !Op->hasOneUse())
return SDValue();
- SmallVector<SDValue, 8> VecIns;
- if (!matchScalarReduction(Op, ISD::OR, VecIns))
- return SDValue();
+ // Check whether we're masking/truncating an OR-reduction result, in which
+ // case track the masked bits.
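+  // Since (X0 | X1 | ...) & Mask == 0 iff (X0 & Mask) | (X1 & Mask) | ... == 0,
+  // the mask can later be applied to each source element individually.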
+ APInt Mask = APInt::getAllOnesValue(Op.getScalarValueSizeInBits());
+ switch (Op.getOpcode()) {
+ case ISD::TRUNCATE: {
+ SDValue Src = Op.getOperand(0);
+ Mask = APInt::getLowBitsSet(Src.getScalarValueSizeInBits(),
+ Op.getScalarValueSizeInBits());
+ Op = Src;
+ break;
+ }
+ case ISD::AND: {
+ if (auto *Cst = dyn_cast<ConstantSDNode>(Op.getOperand(1))) {
+ Mask = Cst->getAPIntValue();
+ Op = Op.getOperand(0);
+ }
+ break;
+ }
+ }
- // Quit if not 128/256-bit vector.
- EVT VT = VecIns[0].getValueType();
- if (!VT.is128BitVector() && !VT.is256BitVector())
- return SDValue();
+ SmallVector<SDValue, 8> VecIns;
+ if (Op.getOpcode() == ISD::OR && matchScalarReduction(Op, ISD::OR, VecIns)) {
+ EVT VT = VecIns[0].getValueType();
+ assert(llvm::all_of(VecIns,
+ [VT](SDValue V) { return VT == V.getValueType(); }) &&
+ "Reduction source vector mismatch");
+
+    // Quit if less than 128 bits or not splittable to a 128/256-bit vector.
+ if (VT.getSizeInBits() < 128 || !isPowerOf2_32(VT.getSizeInBits()))
+ return SDValue();
- SDLoc DL(Op);
- MVT TestVT = VT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ // If more than one full vector is evaluated, OR them first before PTEST.
+ for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1;
+ Slot += 2, e += 1) {
+ // Each iteration will OR 2 nodes and append the result until there is
+ // only 1 node left, i.e. the final OR'd value of all vectors.
+ SDValue LHS = VecIns[Slot];
+ SDValue RHS = VecIns[Slot + 1];
+ VecIns.push_back(DAG.getNode(ISD::OR, DL, VT, LHS, RHS));
+ }
- // Cast all vectors into TestVT for PTEST.
- for (unsigned i = 0, e = VecIns.size(); i < e; ++i)
- VecIns[i] = DAG.getBitcast(TestVT, VecIns[i]);
+ X86::CondCode CCode;
+ if (SDValue V = LowerVectorAllZero(DL, VecIns.back(), CC, Mask, Subtarget,
+ DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
- // If more than one full vector is evaluated, OR them first before PTEST.
- for (unsigned Slot = 0, e = VecIns.size(); e - Slot > 1; Slot += 2, e += 1) {
- // Each iteration will OR 2 nodes and append the result until there is only
- // 1 node left, i.e. the final OR'd value of all vectors.
- SDValue LHS = VecIns[Slot];
- SDValue RHS = VecIns[Slot + 1];
- VecIns.push_back(DAG.getNode(ISD::OR, DL, TestVT, LHS, RHS));
+ if (Op.getOpcode() == ISD::EXTRACT_VECTOR_ELT) {
+ ISD::NodeType BinOp;
+ if (SDValue Match =
+ DAG.matchBinOpReduction(Op.getNode(), BinOp, {ISD::OR})) {
+ X86::CondCode CCode;
+ if (SDValue V =
+ LowerVectorAllZero(DL, Match, CC, Mask, Subtarget, DAG, CCode)) {
+ X86CC = DAG.getTargetConstant(CCode, DL, MVT::i8);
+ return V;
+ }
+ }
}
- X86CC = DAG.getTargetConstant(CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE,
- DL, MVT::i8);
- return DAG.getNode(X86ISD::PTEST, DL, MVT::i32, VecIns.back(), VecIns.back());
+ return SDValue();
}
/// return true if \c Op has a use that doesn't just read flags.
@@ -20814,27 +21715,14 @@ static SDValue EmitTest(SDValue Op, unsigned X86CC, const SDLoc &dl,
/// Emit nodes that will be selected as "cmp Op0,Op1", or something
/// equivalent.
-static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
- unsigned X86CC, const SDLoc &dl,
- SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- SDValue Chain, bool IsSignaling) {
+static SDValue EmitCmp(SDValue Op0, SDValue Op1, unsigned X86CC,
+ const SDLoc &dl, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
if (isNullConstant(Op1))
- return std::make_pair(EmitTest(Op0, X86CC, dl, DAG, Subtarget), Chain);
+ return EmitTest(Op0, X86CC, dl, DAG, Subtarget);
EVT CmpVT = Op0.getValueType();
- if (CmpVT.isFloatingPoint()) {
- if (Chain) {
- SDValue Res =
- DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
- dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
- return std::make_pair(Res, Res.getValue(1));
- }
- return std::make_pair(DAG.getNode(X86ISD::CMP, dl, MVT::i32, Op0, Op1),
- SDValue());
- }
-
assert((CmpVT == MVT::i8 || CmpVT == MVT::i16 ||
CmpVT == MVT::i32 || CmpVT == MVT::i64) && "Unexpected VT!");
@@ -20884,40 +21772,28 @@ static std::pair<SDValue, SDValue> EmitCmp(SDValue Op0, SDValue Op1,
Op1 = DAG.getNode(ISD::TRUNCATE, dl, CmpVT, Op1);
}
+ // 0-x == y --> x+y == 0
+ // 0-x != y --> x+y != 0
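+  // The EFLAGS produced by ADD(x, y) already encode ZF for x + y == 0, so the
+  // flag result of the ADD can stand in for the CMP on equality tests.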
+ if (Op0.getOpcode() == ISD::SUB && isNullConstant(Op0.getOperand(0)) &&
+ Op0.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0.getOperand(1), Op1);
+ return Add.getValue(1);
+ }
+
+ // x == 0-y --> x+y == 0
+ // x != 0-y --> x+y != 0
+ if (Op1.getOpcode() == ISD::SUB && isNullConstant(Op1.getOperand(0)) &&
+ Op1.hasOneUse() && (X86CC == X86::COND_E || X86CC == X86::COND_NE)) {
+ SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
+ SDValue Add = DAG.getNode(X86ISD::ADD, dl, VTs, Op0, Op1.getOperand(1));
+ return Add.getValue(1);
+ }
+
// Use SUB instead of CMP to enable CSE between SUB and CMP.
SDVTList VTs = DAG.getVTList(CmpVT, MVT::i32);
SDValue Sub = DAG.getNode(X86ISD::SUB, dl, VTs, Op0, Op1);
- return std::make_pair(Sub.getValue(1), SDValue());
-}
-
-/// Convert a comparison if required by the subtarget.
-SDValue X86TargetLowering::ConvertCmpIfNecessary(SDValue Cmp,
- SelectionDAG &DAG) const {
- // If the subtarget does not support the FUCOMI instruction, floating-point
- // comparisons have to be converted.
- bool IsCmp = Cmp.getOpcode() == X86ISD::CMP;
- bool IsStrictCmp = Cmp.getOpcode() == X86ISD::STRICT_FCMP ||
- Cmp.getOpcode() == X86ISD::STRICT_FCMPS;
-
- if (Subtarget.hasCMov() || (!IsCmp && !IsStrictCmp) ||
- !Cmp.getOperand(IsStrictCmp ? 1 : 0).getValueType().isFloatingPoint() ||
- !Cmp.getOperand(IsStrictCmp ? 2 : 1).getValueType().isFloatingPoint())
- return Cmp;
-
- // The instruction selector will select an FUCOM instruction instead of
- // FUCOMI, which writes the comparison result to FPSW instead of EFLAGS. Hence
- // build an SDNode sequence that transfers the result from FPSW into EFLAGS:
- // (X86sahf (trunc (srl (X86fp_stsw (trunc (X86any_fcmp ...)), 8))))
- SDLoc dl(Cmp);
- SDValue TruncFPSW = DAG.getNode(ISD::TRUNCATE, dl, MVT::i16, Cmp);
- SDValue FNStSW = DAG.getNode(X86ISD::FNSTSW16r, dl, MVT::i16, TruncFPSW);
- SDValue Srl = DAG.getNode(ISD::SRL, dl, MVT::i16, FNStSW,
- DAG.getConstant(8, dl, MVT::i8));
- SDValue TruncSrl = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, Srl);
-
- // Some 64-bit targets lack SAHF support, but they do support FCOMI.
- assert(Subtarget.hasLAHFSAHF() && "Target doesn't support SAHF or FCOMI?");
- return DAG.getNode(X86ISD::SAHF, dl, MVT::i32, TruncSrl);
+ return Sub.getValue(1);
}
/// Check if replacement of SQRT with RSQRT should be disabled.
@@ -21056,7 +21932,7 @@ X86TargetLowering::BuildSDIVPow2(SDNode *N, const APInt &Divisor,
// Divide by pow2.
SDValue SRA =
- DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i64));
+ DAG.getNode(ISD::SRA, DL, VT, CMov, DAG.getConstant(Lg2, DL, MVT::i8));
// If we're dividing by a positive value, we're done. Otherwise, we must
// negate the result.
@@ -21211,32 +22087,30 @@ static unsigned translateX86FSETCC(ISD::CondCode SetCCOpcode, SDValue &Op0,
/// Break a VSETCC 256-bit integer VSETCC into two new 128 ones and then
/// concatenate the result back.
-static SDValue Lower256IntVSETCC(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
+static SDValue splitIntVSETCC(SDValue Op, SelectionDAG &DAG) {
+ EVT VT = Op.getValueType();
- assert(VT.is256BitVector() && Op.getOpcode() == ISD::SETCC &&
- "Unsupported value type for operation");
+ assert(Op.getOpcode() == ISD::SETCC && "Unsupported operation");
+ assert(Op.getOperand(0).getValueType().isInteger() &&
+ VT == Op.getOperand(0).getValueType() && "Unsupported VTs!");
- unsigned NumElems = VT.getVectorNumElements();
SDLoc dl(Op);
SDValue CC = Op.getOperand(2);
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
+ // Extract the LHS Lo/Hi vectors
+ SDValue LHS1, LHS2;
+ std::tie(LHS1, LHS2) = splitVector(Op.getOperand(0), DAG, dl);
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
+ // Extract the RHS Lo/Hi vectors
+ SDValue RHS1, RHS2;
+ std::tie(RHS1, RHS2) = splitVector(Op.getOperand(1), DAG, dl);
// Issue the operation on the smaller types and concatenate the result back
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1, CC),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2, CC));
+ DAG.getNode(ISD::SETCC, dl, LoVT, LHS1, RHS1, CC),
+ DAG.getNode(ISD::SETCC, dl, HiVT, LHS2, RHS2, CC));
}
static SDValue LowerIntVSETCC_AVX512(SDValue Op, SelectionDAG &DAG) {
@@ -21369,8 +22243,14 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
SDValue Chain = IsStrict ? Op.getOperand(0) : SDValue();
+  // If we have a strict compare with a vXi1 result and the input is 128/256
+  // bits, we can't use a masked compare unless we have VLX. If we use a wider
+  // compare like we do for non-strict, we might trigger spurious exceptions
+  // from the upper elements. Instead emit an AVX compare and convert to mask.
unsigned Opc;
- if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1) {
+ if (Subtarget.hasAVX512() && VT.getVectorElementType() == MVT::i1 &&
+ (!IsStrict || Subtarget.hasVLX() ||
+ Op0.getSimpleValueType().is512BitVector())) {
assert(VT.getVectorNumElements() <= 16);
Opc = IsStrict ? X86ISD::STRICT_CMPM : X86ISD::CMPM;
} else {
@@ -21466,10 +22346,19 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
Opc, dl, VT, Op0, Op1, DAG.getTargetConstant(SSECC, dl, MVT::i8));
}
- // If this is SSE/AVX CMPP, bitcast the result back to integer to match the
- // result type of SETCC. The bitcast is expected to be optimized away
- // during combining/isel.
- Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ if (VT.getSizeInBits() > Op.getSimpleValueType().getSizeInBits()) {
+ // We emitted a compare with an XMM/YMM result. Finish converting to a
+ // mask register using a vptestm.
+ EVT CastVT = EVT(VT).changeVectorElementTypeToInteger();
+ Cmp = DAG.getBitcast(CastVT, Cmp);
+ Cmp = DAG.getSetCC(dl, Op.getSimpleValueType(), Cmp,
+ DAG.getConstant(0, dl, CastVT), ISD::SETNE);
+ } else {
+ // If this is SSE/AVX CMPP, bitcast the result back to integer to match
+ // the result type of SETCC. The bitcast is expected to be optimized
+ // away during combining/isel.
+ Cmp = DAG.getBitcast(Op.getSimpleValueType(), Cmp);
+ }
if (IsStrict)
return DAG.getMergeValues({Cmp, Chain}, dl);
@@ -21563,7 +22452,12 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntVSETCC(Op, DAG);
+ return splitIntVSETCC(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8) {
+ assert(!Subtarget.hasBWI() && "Unexpected VT with AVX512BW!");
+ return splitIntVSETCC(Op, DAG);
+ }
// If this is a SETNE against the signed minimum value, change it to SETGT.
// If this is a SETNE against the signed maximum value, change it to SETLT.
@@ -21812,9 +22706,8 @@ static SDValue EmitAVX512Test(SDValue Op0, SDValue Op1, ISD::CondCode CC,
/// corresponding X86 condition code constant in X86CC.
SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
ISD::CondCode CC, const SDLoc &dl,
- SelectionDAG &DAG, SDValue &X86CC,
- SDValue &Chain,
- bool IsSignaling) const {
+ SelectionDAG &DAG,
+ SDValue &X86CC) const {
// Optimize to BT if possible.
// Lower (X & (1 << N)) == 0 to BT(X, N).
// Lower ((X >>u N) & 1) != 0 to BT(X, N).
@@ -21825,13 +22718,12 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
return BT;
}
- // Try to use PTEST for a tree ORs equality compared with 0.
+  // Try to use PTEST/PMOVMSKB for a tree of ORs equality-compared with 0.
// TODO: We could do AND tree with all 1s as well by using the C flag.
- if (Op0.getOpcode() == ISD::OR && isNullConstant(Op1) &&
- (CC == ISD::SETEQ || CC == ISD::SETNE)) {
- if (SDValue PTEST = LowerVectorAllZeroTest(Op0, CC, Subtarget, DAG, X86CC))
- return PTEST;
- }
+ if (isNullConstant(Op1) && (CC == ISD::SETEQ || CC == ISD::SETNE))
+ if (SDValue CmpZ =
+ MatchVectorAllZeroTest(Op0, CC, dl, Subtarget, DAG, X86CC))
+ return CmpZ;
// Try to lower using KORTEST or KTEST.
if (SDValue Test = EmitAVX512Test(Op0, Op1, CC, dl, DAG, Subtarget, X86CC))
@@ -21873,17 +22765,11 @@ SDValue X86TargetLowering::emitFlagsForSetcc(SDValue Op0, SDValue Op1,
}
}
- bool IsFP = Op1.getSimpleValueType().isFloatingPoint();
- X86::CondCode CondCode = TranslateX86CC(CC, dl, IsFP, Op0, Op1, DAG);
- if (CondCode == X86::COND_INVALID)
- return SDValue();
+ X86::CondCode CondCode =
+ TranslateX86CC(CC, dl, /*IsFP*/ false, Op0, Op1, DAG);
+ assert(CondCode != X86::COND_INVALID && "Unexpected condition code!");
- std::pair<SDValue, SDValue> Tmp =
- EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget, Chain, IsSignaling);
- SDValue EFLAGS = Tmp.first;
- if (Chain)
- Chain = Tmp.second;
- EFLAGS = ConvertCmpIfNecessary(EFLAGS, DAG);
+ SDValue EFLAGS = EmitCmp(Op0, Op1, CondCode, dl, DAG, Subtarget);
X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
return EFLAGS;
}
@@ -21920,18 +22806,32 @@ SDValue X86TargetLowering::LowerSETCC(SDValue Op, SelectionDAG &DAG) const {
}
}
- SDValue X86CC;
- SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC, Chain,
- Op.getOpcode() == ISD::STRICT_FSETCCS);
- if (!EFLAGS)
- return SDValue();
+ if (Op0.getSimpleValueType().isInteger()) {
+ SDValue X86CC;
+ SDValue EFLAGS = emitFlagsForSetcc(Op0, Op1, CC, dl, DAG, X86CC);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
+ }
- SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ // Handle floating point.
+ X86::CondCode CondCode = TranslateX86CC(CC, dl, /*IsFP*/ true, Op0, Op1, DAG);
+ if (CondCode == X86::COND_INVALID)
+ return SDValue();
- if (IsStrict)
- return DAG.getMergeValues({Res, Chain}, dl);
+ SDValue EFLAGS;
+ if (IsStrict) {
+ bool IsSignaling = Op.getOpcode() == ISD::STRICT_FSETCCS;
+ EFLAGS =
+ DAG.getNode(IsSignaling ? X86ISD::STRICT_FCMPS : X86ISD::STRICT_FCMP,
+ dl, {MVT::i32, MVT::Other}, {Chain, Op0, Op1});
+ Chain = EFLAGS.getValue(1);
+ } else {
+ EFLAGS = DAG.getNode(X86ISD::FCMP, dl, MVT::i32, Op0, Op1);
+ }
- return Res;
+ SDValue X86CC = DAG.getTargetConstant(CondCode, dl, MVT::i8);
+ SDValue Res = DAG.getNode(X86ISD::SETCC, dl, MVT::i8, X86CC, EFLAGS);
+ return IsStrict ? DAG.getMergeValues({Res, Chain}, dl) : Res;
}
SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const {
@@ -21946,9 +22846,8 @@ SDValue X86TargetLowering::LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const
// Recreate the carry if needed.
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getConstant(NegOne, DL, CarryVT));
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
SDVTList VTs = DAG.getVTList(LHS.getValueType(), MVT::i32);
SDValue Cmp = DAG.getNode(X86ISD::SBB, DL, VTs, LHS, RHS, Carry.getValue(1));
@@ -22024,7 +22923,7 @@ static SDValue LowerXALUO(SDValue Op, SelectionDAG &DAG) {
static bool isX86LogicalCmp(SDValue Op) {
unsigned Opc = Op.getOpcode();
if (Opc == X86ISD::CMP || Opc == X86ISD::COMI || Opc == X86ISD::UCOMI ||
- Opc == X86ISD::SAHF)
+ Opc == X86ISD::FCMP)
return true;
if (Op.getResNo() == 1 &&
(Opc == X86ISD::ADD || Opc == X86ISD::SUB || Opc == X86ISD::ADC ||
@@ -22057,9 +22956,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// Lower FP selects into a CMP/AND/ANDN/OR sequence when the necessary SSE ops
// are available or VBLENDV if AVX is available.
// Otherwise FP cmovs get lowered into a less efficient branch sequence later.
- if (Cond.getOpcode() == ISD::SETCC &&
- ((Subtarget.hasSSE2() && VT == MVT::f64) ||
- (Subtarget.hasSSE1() && VT == MVT::f32)) &&
+ if (Cond.getOpcode() == ISD::SETCC && isScalarFPTypeInSSEReg(VT) &&
VT == Cond.getOperand(0).getSimpleValueType() && Cond->hasOneUse()) {
SDValue CondOp0 = Cond.getOperand(0), CondOp1 = Cond.getOperand(1);
bool IsAlwaysSignaling;
@@ -22115,45 +23012,11 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
}
// AVX512 fallback is to lower selects of scalar floats to masked moves.
- if ((VT == MVT::f64 || VT == MVT::f32) && Subtarget.hasAVX512()) {
+ if (isScalarFPTypeInSSEReg(VT) && Subtarget.hasAVX512()) {
SDValue Cmp = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v1i1, Cond);
return DAG.getNode(X86ISD::SELECTS, DL, VT, Cmp, Op1, Op2);
}
- // For v64i1 without 64-bit support we need to split and rejoin.
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
- assert(Subtarget.hasBWI() && "Expected BWI to be legal");
- SDValue Op1Lo = extractSubVector(Op1, 0, DAG, DL, 32);
- SDValue Op2Lo = extractSubVector(Op2, 0, DAG, DL, 32);
- SDValue Op1Hi = extractSubVector(Op1, 32, DAG, DL, 32);
- SDValue Op2Hi = extractSubVector(Op2, 32, DAG, DL, 32);
- SDValue Lo = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Lo, Op2Lo);
- SDValue Hi = DAG.getSelect(DL, MVT::v32i1, Cond, Op1Hi, Op2Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Lo, Hi);
- }
-
- if (VT.isVector() && VT.getVectorElementType() == MVT::i1) {
- SDValue Op1Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op1.getNode()))
- Op1Scalar = ConvertI1VectorToInteger(Op1, DAG);
- else if (Op1.getOpcode() == ISD::BITCAST && Op1.getOperand(0))
- Op1Scalar = Op1.getOperand(0);
- SDValue Op2Scalar;
- if (ISD::isBuildVectorOfConstantSDNodes(Op2.getNode()))
- Op2Scalar = ConvertI1VectorToInteger(Op2, DAG);
- else if (Op2.getOpcode() == ISD::BITCAST && Op2.getOperand(0))
- Op2Scalar = Op2.getOperand(0);
- if (Op1Scalar.getNode() && Op2Scalar.getNode()) {
- SDValue newSelect = DAG.getSelect(DL, Op1Scalar.getValueType(), Cond,
- Op1Scalar, Op2Scalar);
- if (newSelect.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, newSelect);
- SDValue ExtVec = DAG.getBitcast(MVT::v8i1, newSelect);
- return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, ExtVec,
- DAG.getIntPtrConstant(0, DL));
- }
- }
-
if (Cond.getOpcode() == ISD::SETCC) {
if (SDValue NewCond = LowerSETCC(Cond, DAG)) {
Cond = NewCond;
@@ -22175,12 +23038,28 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Cond.getOperand(1).getOpcode() == X86ISD::CMP &&
isNullConstant(Cond.getOperand(1).getOperand(1))) {
SDValue Cmp = Cond.getOperand(1);
+ SDValue CmpOp0 = Cmp.getOperand(0);
unsigned CondCode = Cond.getConstantOperandVal(0);
- if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
+    // Special handling for the __builtin_ffs(X) - 1 pattern, which looks like
+    // (select (seteq X, 0), -1, (cttz_zero_undef X)). Disable the special
+    // handling below so the CMP with 0 is kept; it should later be removed by
+    // optimizeCompareInst, using the flags from the BSR/TZCNT emitted for the
+    // cttz_zero_undef.
+ auto MatchFFSMinus1 = [&](SDValue Op1, SDValue Op2) {
+ return (Op1.getOpcode() == ISD::CTTZ_ZERO_UNDEF && Op1.hasOneUse() &&
+ Op1.getOperand(0) == CmpOp0 && isAllOnesConstant(Op2));
+ };
+ if (Subtarget.hasCMov() && (VT == MVT::i32 || VT == MVT::i64) &&
+ ((CondCode == X86::COND_NE && MatchFFSMinus1(Op1, Op2)) ||
+ (CondCode == X86::COND_E && MatchFFSMinus1(Op2, Op1)))) {
+ // Keep Cmp.
+ } else if ((isAllOnesConstant(Op1) || isAllOnesConstant(Op2)) &&
(CondCode == X86::COND_E || CondCode == X86::COND_NE)) {
SDValue Y = isAllOnesConstant(Op2) ? Op1 : Op2;
- SDValue CmpOp0 = Cmp.getOperand(0);
+
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
+ SDVTList CmpVTs = DAG.getVTList(CmpOp0.getValueType(), MVT::i32);
// Apply further optimizations for special cases
// (select (x != 0), -1, 0) -> neg & sbb
@@ -22188,31 +23067,25 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
if (isNullConstant(Y) &&
(isAllOnesConstant(Op1) == (CondCode == X86::COND_NE))) {
SDValue Zero = DAG.getConstant(0, DL, CmpOp0.getValueType());
- SDValue CmpZero = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Zero, CmpOp0);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
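+        // NEG (i.e. 0 - x) sets CF exactly when x != 0, and SBB 0,0 then
+        // turns that carry into 0 or -1.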
+ SDValue Neg = DAG.getNode(X86ISD::SUB, DL, CmpVTs, Zero, CmpOp0);
Zero = DAG.getConstant(0, DL, Op.getValueType());
- return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, CmpZero);
+ return DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Neg.getValue(1));
}
- Cmp = DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ Cmp = DAG.getNode(X86ISD::SUB, DL, CmpVTs,
CmpOp0, DAG.getConstant(1, DL, CmpOp0.getValueType()));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::i32);
SDValue Zero = DAG.getConstant(0, DL, Op.getValueType());
SDValue Res = // Res = 0 or -1.
- DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp);
+ DAG.getNode(X86ISD::SBB, DL, VTs, Zero, Zero, Cmp.getValue(1));
if (isAllOnesConstant(Op1) != (CondCode == X86::COND_E))
Res = DAG.getNOT(DL, Res, Res.getValueType());
- if (!isNullConstant(Op2))
- Res = DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
- return Res;
+ return DAG.getNode(ISD::OR, DL, Res.getValueType(), Res, Y);
} else if (!Subtarget.hasCMov() && CondCode == X86::COND_E &&
Cmp.getOperand(0).getOpcode() == ISD::AND &&
isOneConstant(Cmp.getOperand(0).getOperand(1))) {
- SDValue CmpOp0 = Cmp.getOperand(0);
SDValue Src1, Src2;
// true if Op2 is XOR or OR operator and one of its operands
// is equal to Op1
@@ -22265,7 +23138,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
SDValue Cmp = Cond.getOperand(1);
bool IllegalFPCMov = false;
if (VT.isFloatingPoint() && !VT.isVector() &&
- !isScalarFPTypeInSSEReg(VT)) // FPStack?
+ !isScalarFPTypeInSSEReg(VT) && Subtarget.hasCMov()) // FPStack?
IllegalFPCMov = !hasFPCMov(cast<ConstantSDNode>(CC)->getSExtValue());
if ((isX86LogicalCmp(Cmp) && !IllegalFPCMov) ||
@@ -22311,7 +23184,6 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
// a >= b ? -1 : 0 -> RES = setcc_carry
// a >= b ? 0 : -1 -> RES = ~setcc_carry
if (Cond.getOpcode() == X86ISD::SUB) {
- Cond = ConvertCmpIfNecessary(Cond, DAG);
unsigned CondCode = cast<ConstantSDNode>(CC)->getZExtValue();
if ((CondCode == X86::COND_AE || CondCode == X86::COND_B) &&
@@ -22333,7 +23205,7 @@ SDValue X86TargetLowering::LowerSELECT(SDValue Op, SelectionDAG &DAG) const {
Op1.getOpcode() == ISD::TRUNCATE && Op2.getOpcode() == ISD::TRUNCATE) {
SDValue T1 = Op1.getOperand(0), T2 = Op2.getOperand(0);
if (T1.getValueType() == T2.getValueType() &&
- // Blacklist CopyFromReg to avoid partial register stalls.
+ // Exclude CopyFromReg to avoid partial register stalls.
T1.getOpcode() != ISD::CopyFromReg && T2.getOpcode()!=ISD::CopyFromReg){
SDValue Cmov = DAG.getNode(X86ISD::CMOV, DL, T1.getValueType(), T2, T1,
CC, Cond);
@@ -22570,14 +23442,9 @@ static SDValue LowerSIGN_EXTEND(SDValue Op, const X86Subtarget &Subtarget,
InVT.getVectorElementType() == MVT::i32) &&
"Unexpected element type");
- // Custom legalize v8i8->v8i64 on CPUs without avx512bw.
- if (InVT == MVT::v8i8) {
- if (VT != MVT::v8i64)
- return SDValue();
-
- In = DAG.getNode(ISD::CONCAT_VECTORS, SDLoc(Op),
- MVT::v16i8, In, DAG.getUNDEF(MVT::v8i8));
- return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, dl, VT, In);
+ if (VT == MVT::v32i16 && !Subtarget.hasBWI()) {
+ assert(InVT == MVT::v32i8 && "Unexpected VT!");
+ return splitVectorIntUnary(Op, DAG);
}
if (Subtarget.hasInt256())
@@ -22620,23 +23487,19 @@ static SDValue splitVectorStore(StoreSDNode *Store, SelectionDAG &DAG) {
if (!Store->isSimple())
return SDValue();
- EVT StoreVT = StoredVal.getValueType();
- unsigned NumElems = StoreVT.getVectorNumElements();
- unsigned HalfSize = StoredVal.getValueSizeInBits() / 2;
- unsigned HalfAlign = (128 == HalfSize ? 16 : 32);
-
SDLoc DL(Store);
- SDValue Value0 = extractSubVector(StoredVal, 0, DAG, DL, HalfSize);
- SDValue Value1 = extractSubVector(StoredVal, NumElems / 2, DAG, DL, HalfSize);
+ SDValue Value0, Value1;
+ std::tie(Value0, Value1) = splitVector(StoredVal, DAG, DL);
+ unsigned HalfOffset = Value0.getValueType().getStoreSize();
SDValue Ptr0 = Store->getBasePtr();
- SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfAlign, DL);
- unsigned Alignment = Store->getAlignment();
+ SDValue Ptr1 = DAG.getMemBasePlusOffset(Ptr0, HalfOffset, DL);
SDValue Ch0 =
DAG.getStore(Store->getChain(), DL, Value0, Ptr0, Store->getPointerInfo(),
- Alignment, Store->getMemOperand()->getFlags());
+ Store->getOriginalAlign(),
+ Store->getMemOperand()->getFlags());
SDValue Ch1 = DAG.getStore(Store->getChain(), DL, Value1, Ptr1,
- Store->getPointerInfo().getWithOffset(HalfAlign),
- MinAlign(Alignment, HalfAlign),
+ Store->getPointerInfo().getWithOffset(HalfOffset),
+ Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, DL, MVT::Other, Ch0, Ch1);
}
@@ -22659,7 +23522,6 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
MVT StoreSVT = StoreVT.getScalarType();
unsigned NumElems = StoreVT.getVectorNumElements();
unsigned ScalarSize = StoreSVT.getStoreSize();
- unsigned Alignment = Store->getAlignment();
SDLoc DL(Store);
SmallVector<SDValue, 4> Stores;
@@ -22670,7 +23532,7 @@ static SDValue scalarizeVectorStore(StoreSDNode *Store, MVT StoreVT,
DAG.getIntPtrConstant(i, DL));
SDValue Ch = DAG.getStore(Store->getChain(), DL, Scl, Ptr,
Store->getPointerInfo().getWithOffset(Offset),
- MinAlign(Alignment, Offset),
+ Store->getOriginalAlign(),
Store->getMemOperand()->getFlags());
Stores.push_back(Ch);
}
@@ -22699,7 +23561,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
StoredVal = DAG.getNode(ISD::TRUNCATE, dl, MVT::i8, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -22711,7 +23573,9 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
// and each half can execute independently. Some cores would split the op into
// halves anyway, so the concat (vinsertf128) is purely an extra op.
MVT StoreVT = StoredVal.getSimpleValueType();
- if (StoreVT.is256BitVector()) {
+ if (StoreVT.is256BitVector() ||
+ ((StoreVT == MVT::v32i16 || StoreVT == MVT::v64i8) &&
+ !Subtarget.hasBWI())) {
SmallVector<SDValue, 4> CatOps;
if (StoredVal.hasOneUse() && collectConcatOps(StoredVal.getNode(), CatOps))
return splitVectorStore(St, DAG);
@@ -22738,7 +23602,7 @@ static SDValue LowerStore(SDValue Op, const X86Subtarget &Subtarget,
DAG.getIntPtrConstant(0, dl));
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
assert(Subtarget.hasSSE1() && "Expected SSE");
@@ -22773,7 +23637,7 @@ static SDValue LowerLoad(SDValue Op, const X86Subtarget &Subtarget,
"Expected AVX512F without AVX512DQI");
SDValue NewLd = DAG.getLoad(MVT::i8, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
// Replace chain users with the new chain.
@@ -22801,163 +23665,44 @@ static bool isAndOrOfSetCCs(SDValue Op, unsigned &Opc) {
Op.getOperand(1).hasOneUse());
}
-/// Return true if node is an ISD::XOR of a X86ISD::SETCC and 1 and that the
-/// SETCC node has a single use.
-static bool isXor1OfSetCC(SDValue Op) {
- if (Op.getOpcode() != ISD::XOR)
- return false;
- if (isOneConstant(Op.getOperand(1)))
- return Op.getOperand(0).getOpcode() == X86ISD::SETCC &&
- Op.getOperand(0).hasOneUse();
- return false;
-}
-
SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
- bool addTest = true;
SDValue Chain = Op.getOperand(0);
SDValue Cond = Op.getOperand(1);
SDValue Dest = Op.getOperand(2);
SDLoc dl(Op);
- SDValue CC;
- bool Inverted = false;
- if (Cond.getOpcode() == ISD::SETCC) {
- // Check for setcc([su]{add,sub,mul}o == 0).
- if (cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
- isNullConstant(Cond.getOperand(1)) &&
- Cond.getOperand(0).getResNo() == 1 &&
- (Cond.getOperand(0).getOpcode() == ISD::SADDO ||
- Cond.getOperand(0).getOpcode() == ISD::UADDO ||
- Cond.getOperand(0).getOpcode() == ISD::SSUBO ||
- Cond.getOperand(0).getOpcode() == ISD::USUBO ||
- Cond.getOperand(0).getOpcode() == ISD::SMULO ||
- Cond.getOperand(0).getOpcode() == ISD::UMULO)) {
- Inverted = true;
- Cond = Cond.getOperand(0);
- } else {
- if (SDValue NewCond = LowerSETCC(Cond, DAG))
- Cond = NewCond;
- }
- }
-#if 0
- // FIXME: LowerXALUO doesn't handle these!!
- else if (Cond.getOpcode() == X86ISD::ADD ||
- Cond.getOpcode() == X86ISD::SUB ||
- Cond.getOpcode() == X86ISD::SMUL ||
- Cond.getOpcode() == X86ISD::UMUL)
- Cond = LowerXALUO(Cond, DAG);
-#endif
+ if (Cond.getOpcode() == ISD::SETCC &&
+ Cond.getOperand(0).getValueType() != MVT::f128) {
+ SDValue LHS = Cond.getOperand(0);
+ SDValue RHS = Cond.getOperand(1);
+ ISD::CondCode CC = cast<CondCodeSDNode>(Cond.getOperand(2))->get();
- // Look pass (and (setcc_carry (cmp ...)), 1).
- if (Cond.getOpcode() == ISD::AND &&
- Cond.getOperand(0).getOpcode() == X86ISD::SETCC_CARRY &&
- isOneConstant(Cond.getOperand(1)))
- Cond = Cond.getOperand(0);
+ // Special case for
+ // setcc([su]{add,sub,mul}o == 0)
+ // setcc([su]{add,sub,mul}o != 1)
+ if (ISD::isOverflowIntrOpRes(LHS) &&
+ (CC == ISD::SETEQ || CC == ISD::SETNE) &&
+ (isNullConstant(RHS) || isOneConstant(RHS))) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, LHS.getValue(0), DAG);
- // If condition flag is set by a X86ISD::CMP, then use it as the condition
- // setting operand in place of the X86ISD::SETCC.
- unsigned CondOpcode = Cond.getOpcode();
- if (CondOpcode == X86ISD::SETCC ||
- CondOpcode == X86ISD::SETCC_CARRY) {
- CC = Cond.getOperand(0);
+ if ((CC == ISD::SETEQ) == isNullConstant(RHS))
+ X86Cond = X86::GetOppositeBranchCondition(X86Cond);
- SDValue Cmp = Cond.getOperand(1);
- unsigned Opc = Cmp.getOpcode();
- // FIXME: WHY THE SPECIAL CASING OF LogicalCmp??
- if (isX86LogicalCmp(Cmp) || Opc == X86ISD::BT) {
- Cond = Cmp;
- addTest = false;
- } else {
- switch (cast<ConstantSDNode>(CC)->getZExtValue()) {
- default: break;
- case X86::COND_O:
- case X86::COND_B:
- // These can only come from an arithmetic instruction with overflow,
- // e.g. SADDO, UADDO.
- Cond = Cond.getOperand(1);
- addTest = false;
- break;
- }
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
- }
- CondOpcode = Cond.getOpcode();
- if (CondOpcode == ISD::UADDO || CondOpcode == ISD::SADDO ||
- CondOpcode == ISD::USUBO || CondOpcode == ISD::SSUBO ||
- CondOpcode == ISD::UMULO || CondOpcode == ISD::SMULO) {
- SDValue Value;
- X86::CondCode X86Cond;
- std::tie(Value, Cond) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- if (Inverted)
- X86Cond = X86::GetOppositeBranchCondition(X86Cond);
+ if (LHS.getSimpleValueType().isInteger()) {
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, CC, SDLoc(Cond), DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
+ }
- CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- addTest = false;
- } else {
- unsigned CondOpc;
- if (Cond.hasOneUse() && isAndOrOfSetCCs(Cond, CondOpc)) {
- SDValue Cmp = Cond.getOperand(0).getOperand(1);
- if (CondOpc == ISD::OR) {
- // Also, recognize the pattern generated by an FCMP_UNE. We can emit
- // two branches instead of an explicit OR instruction with a
- // separate test.
- if (Cmp == Cond.getOperand(1).getOperand(1) &&
- isX86LogicalCmp(Cmp)) {
- CC = Cond.getOperand(0).getOperand(0);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = Cond.getOperand(1).getOperand(0);
- Cond = Cmp;
- addTest = false;
- }
- } else { // ISD::AND
- // Also, recognize the pattern generated by an FCMP_OEQ. We can emit
- // two branches instead of an explicit AND instruction with a
- // separate test. However, we only do this if this block doesn't
- // have a fall-through edge, because this requires an explicit
- // jmp when the condition is false.
- if (Cmp == Cond.getOperand(1).getOperand(1) &&
- isX86LogicalCmp(Cmp) &&
- Op.getNode()->hasOneUse()) {
- X86::CondCode CCode0 =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode0 = X86::GetOppositeBranchCondition(CCode0);
- CC = DAG.getTargetConstant(CCode0, dl, MVT::i8);
- SDNode *User = *Op.getNode()->use_begin();
- // Look for an unconditional branch following this conditional branch.
- // We need this because we need to reverse the successors in order
- // to implement FCMP_OEQ.
- if (User->getOpcode() == ISD::BR) {
- SDValue FalseBB = User->getOperand(1);
- SDNode *NewBR =
- DAG.UpdateNodeOperands(User, User->getOperand(0), Dest);
- assert(NewBR == User);
- (void)NewBR;
- Dest = FalseBB;
-
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(), Chain,
- Dest, CC, Cmp);
- X86::CondCode CCode1 =
- (X86::CondCode)Cond.getOperand(1).getConstantOperandVal(0);
- CCode1 = X86::GetOppositeBranchCondition(CCode1);
- CC = DAG.getTargetConstant(CCode1, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
- }
- }
- }
- } else if (Cond.hasOneUse() && isXor1OfSetCC(Cond)) {
- // Recognize for xorb (setcc), 1 patterns. The xor inverts the condition.
- // It should be transformed during dag combiner except when the condition
- // is set by a arithmetics with overflow node.
- X86::CondCode CCode =
- (X86::CondCode)Cond.getOperand(0).getConstantOperandVal(0);
- CCode = X86::GetOppositeBranchCondition(CCode);
- CC = DAG.getTargetConstant(CCode, dl, MVT::i8);
- Cond = Cond.getOperand(0).getOperand(1);
- addTest = false;
- } else if (Cond.getOpcode() == ISD::SETCC &&
- cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETOEQ) {
+ if (CC == ISD::SETOEQ) {
// For FCMP_OEQ, we can emit
// two branches instead of an explicit AND instruction with a
// separate test. However, we only do this if this block doesn't
@@ -22976,59 +23721,65 @@ SDValue X86TargetLowering::LowerBRCOND(SDValue Op, SelectionDAG &DAG) const {
(void)NewBR;
Dest = FalseBB;
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- Cond.getOperand(0), Cond.getOperand(1));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
+ SDValue Cmp =
+ DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain = DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest,
+ CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
}
- } else if (Cond.getOpcode() == ISD::SETCC &&
- cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETUNE) {
+ } else if (CC == ISD::SETUNE) {
// For FCMP_UNE, we can emit
// two branches instead of an explicit OR instruction with a
// separate test.
- SDValue Cmp = DAG.getNode(X86ISD::CMP, dl, MVT::i32,
- Cond.getOperand(0), Cond.getOperand(1));
- Cmp = ConvertCmpIfNecessary(Cmp, DAG);
- CC = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
- Chain = DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cmp);
- CC = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
- Cond = Cmp;
- addTest = false;
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86::COND_NE, dl, MVT::i8);
+ Chain =
+ DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal, Cmp);
+ CCVal = DAG.getTargetConstant(X86::COND_P, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
+ } else {
+ X86::CondCode X86Cond =
+ TranslateX86CC(CC, dl, /*IsFP*/ true, LHS, RHS, DAG);
+ SDValue Cmp = DAG.getNode(X86ISD::FCMP, SDLoc(Cond), MVT::i32, LHS, RHS);
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Cmp);
}
}
- if (addTest) {
- // Look pass the truncate if the high bits are known zero.
- if (isTruncWithZeroHighBitsInput(Cond, DAG))
- Cond = Cond.getOperand(0);
+ if (ISD::isOverflowIntrOpRes(Cond)) {
+ SDValue Value, Overflow;
+ X86::CondCode X86Cond;
+ std::tie(Value, Overflow) = getX86XALUOOp(X86Cond, Cond.getValue(0), DAG);
- // We know the result of AND is compared against zero. Try to match
- // it to BT.
- if (Cond.getOpcode() == ISD::AND && Cond.hasOneUse()) {
- SDValue BTCC;
- if (SDValue BT = LowerAndToBT(Cond, ISD::SETNE, dl, DAG, BTCC)) {
- CC = BTCC;
- Cond = BT;
- addTest = false;
- }
- }
+ SDValue CCVal = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ Overflow);
}
- if (addTest) {
- X86::CondCode X86Cond = Inverted ? X86::COND_E : X86::COND_NE;
- CC = DAG.getTargetConstant(X86Cond, dl, MVT::i8);
- Cond = EmitTest(Cond, X86Cond, dl, DAG, Subtarget);
- }
- Cond = ConvertCmpIfNecessary(Cond, DAG);
- return DAG.getNode(X86ISD::BRCOND, dl, Op.getValueType(),
- Chain, Dest, CC, Cond);
+ // Look past the truncate if the high bits are known zero.
+ if (isTruncWithZeroHighBitsInput(Cond, DAG))
+ Cond = Cond.getOperand(0);
+
+ EVT CondVT = Cond.getValueType();
+
+ // Add an AND with 1 if we don't already have one.
+ if (!(Cond.getOpcode() == ISD::AND && isOneConstant(Cond.getOperand(1))))
+ Cond =
+ DAG.getNode(ISD::AND, dl, CondVT, Cond, DAG.getConstant(1, dl, CondVT));
+
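+  // Branch when the masked boolean is non-zero; emitFlagsForSetcc emits the
+  // flag-setting compare and the corresponding condition code for the SETNE
+  // against zero.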
+ SDValue LHS = Cond;
+ SDValue RHS = DAG.getConstant(0, dl, CondVT);
+
+ SDValue CCVal;
+ SDValue EFLAGS = emitFlagsForSetcc(LHS, RHS, ISD::SETNE, dl, DAG, CCVal);
+ return DAG.getNode(X86ISD::BRCOND, dl, MVT::Other, Chain, Dest, CCVal,
+ EFLAGS);
}
// Lower dynamic stack allocation to _alloca call for Cygwin/Mingw targets.
@@ -23041,9 +23792,9 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
SelectionDAG &DAG) const {
MachineFunction &MF = DAG.getMachineFunction();
bool SplitStack = MF.shouldSplitStack();
- bool EmitStackProbe = !getStackProbeSymbolName(MF).empty();
+ bool EmitStackProbeCall = hasStackProbeSymbol(MF);
bool Lower = (Subtarget.isOSWindows() && !Subtarget.isTargetMachO()) ||
- SplitStack || EmitStackProbe;
+ SplitStack || EmitStackProbeCall;
SDLoc dl(Op);
// Get the inputs.
@@ -23067,12 +23818,22 @@ X86TargetLowering::LowerDYNAMIC_STACKALLOC(SDValue Op,
assert(SPReg && "Target cannot require DYNAMIC_STACKALLOC expansion and"
" not tell us which reg is the stack pointer!");
- SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
- Chain = SP.getValue(1);
const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Align StackAlign(TFI.getStackAlignment());
- Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
- if (Alignment && Alignment > StackAlign)
+ const Align StackAlign = TFI.getStackAlign();
+ if (hasInlineStackProbe(MF)) {
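+    // Delegate the allocation to the PROBED_ALLOCA pseudo so that the
+    // requested size can be probed page by page as the stack pointer is
+    // moved.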
+ MachineRegisterInfo &MRI = MF.getRegInfo();
+
+ const TargetRegisterClass *AddrRegClass = getRegClassFor(SPTy);
+ Register Vreg = MRI.createVirtualRegister(AddrRegClass);
+ Chain = DAG.getCopyToReg(Chain, dl, Vreg, Size);
+ Result = DAG.getNode(X86ISD::PROBED_ALLOCA, dl, SPTy, Chain,
+ DAG.getRegister(Vreg, SPTy));
+ } else {
+ SDValue SP = DAG.getCopyFromReg(Chain, dl, SPReg, VT);
+ Chain = SP.getValue(1);
+ Result = DAG.getNode(ISD::SUB, dl, VT, SP, Size); // Value
+ }
+ if (Alignment && *Alignment > StackAlign)
Result =
DAG.getNode(ISD::AND, dl, VT, Result,
DAG.getConstant(~(Alignment->value() - 1ULL), dl, VT));
@@ -23203,14 +23964,13 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
// Decide which area this value should be read from.
// TODO: Implement the AMD64 ABI in its entirety. This simple
// selection mechanism works only for the basic types.
- if (ArgVT == MVT::f80) {
- llvm_unreachable("va_arg for f80 not yet implemented");
- } else if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
+ assert(ArgVT != MVT::f80 && "va_arg for f80 not yet implemented");
+ if (ArgVT.isFloatingPoint() && ArgSize <= 16 /*bytes*/) {
ArgMode = 2; // Argument passed in XMM register. Use fp_offset.
- } else if (ArgVT.isInteger() && ArgSize <= 32 /*bytes*/) {
- ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
} else {
- llvm_unreachable("Unhandled argument type in LowerVAARG");
+ assert(ArgVT.isInteger() && ArgSize <= 32 /*bytes*/ &&
+ "Unhandled argument type in LowerVAARG");
+ ArgMode = 1; // Argument passed in GPR64 register(s). Use gp_offset.
}
if (ArgMode == 2) {
@@ -23227,11 +23987,8 @@ SDValue X86TargetLowering::LowerVAARG(SDValue Op, SelectionDAG &DAG) const {
DAG.getConstant(Align, dl, MVT::i32)};
SDVTList VTs = DAG.getVTList(getPointerTy(DAG.getDataLayout()), MVT::Other);
SDValue VAARG = DAG.getMemIntrinsicNode(
- X86ISD::VAARG_64, dl,
- VTs, InstOps, MVT::i64,
- MachinePointerInfo(SV),
- /*Align=*/0,
- MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
+ X86ISD::VAARG_64, dl, VTs, InstOps, MVT::i64, MachinePointerInfo(SV),
+ /*Align=*/None, MachineMemOperand::MOLoad | MachineMemOperand::MOStore);
Chain = VAARG.getValue(1);
// Load the next argument and return it
@@ -23255,9 +24012,8 @@ static SDValue LowerVACOPY(SDValue Op, const X86Subtarget &Subtarget,
const Value *SrcSV = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
SDLoc DL(Op);
- return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr,
- DAG.getIntPtrConstant(24, DL), 8, /*isVolatile*/false,
- false, false,
+ return DAG.getMemcpy(Chain, DL, DstPtr, SrcPtr, DAG.getIntPtrConstant(24, DL),
+ Align(8), /*isVolatile*/ false, false, false,
MachinePointerInfo(DstSV), MachinePointerInfo(SrcSV));
}
@@ -23319,7 +24075,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
- Elts.push_back(CurrentOp);
+ // Must produce 0s in the correct bits.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -23331,7 +24088,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
- Elts.push_back(CurrentOp);
+ // Must produce 0s in the correct bits.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -23343,7 +24101,8 @@ static SDValue getTargetVShiftByConstNode(unsigned Opc, const SDLoc &dl, MVT VT,
for (unsigned i = 0; i != NumElts; ++i) {
SDValue CurrentOp = SrcOp->getOperand(i);
if (CurrentOp->isUndef()) {
- Elts.push_back(CurrentOp);
+      // All shifted-in bits must be the same, so use 0.
+ Elts.push_back(DAG.getConstant(0, dl, ElementType));
continue;
}
auto *ND = cast<ConstantSDNode>(CurrentOp);
@@ -24001,8 +24760,11 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
ISD::CondCode CC = (ISD::CondCode)IntrData->Opc1;
SDValue LHS = Op.getOperand(1);
SDValue RHS = Op.getOperand(2);
+ // Some conditions require the operands to be swapped.
+ if (CC == ISD::SETLT || CC == ISD::SETLE)
+ std::swap(LHS, RHS);
+
SDValue Comi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, LHS, RHS);
- SDValue InvComi = DAG.getNode(IntrData->Opc0, dl, MVT::i32, RHS, LHS);
SDValue SetCC;
switch (CC) {
case ISD::SETEQ: { // (ZF = 0 and PF = 0)
@@ -24018,18 +24780,14 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
break;
}
case ISD::SETGT: // (CF = 0 and ZF = 0)
+ case ISD::SETLT: { // Condition opposite to GT. Operands swapped above.
SetCC = getSETCC(X86::COND_A, Comi, dl, DAG);
break;
- case ISD::SETLT: { // The condition is opposite to GT. Swap the operands.
- SetCC = getSETCC(X86::COND_A, InvComi, dl, DAG);
- break;
}
case ISD::SETGE: // CF = 0
+ case ISD::SETLE: // Condition opposite to GE. Operands swapped above.
SetCC = getSETCC(X86::COND_AE, Comi, dl, DAG);
break;
- case ISD::SETLE: // The condition is opposite to GE. Swap the operands.
- SetCC = getSETCC(X86::COND_AE, InvComi, dl, DAG);
- break;
default:
llvm_unreachable("Unexpected illegal condition!");
}
@@ -24478,6 +25236,9 @@ SDValue X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op,
// Clamp out of bounds shift amounts since they will otherwise be masked
// to 8-bits which may make it no longer out of bounds.
unsigned ShiftAmount = C->getAPIntValue().getLimitedValue(255);
+ if (ShiftAmount == 0)
+ return Op.getOperand(1);
+
return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, DL, Op.getValueType(),
Op.getOperand(0), Op.getOperand(1),
DAG.getTargetConstant(ShiftAmount, DL, MVT::i32));
@@ -24537,19 +25298,23 @@ static SDValue getAVX2GatherNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
SDValue Scale = DAG.getTargetConstant(C->getZExtValue(), dl,
TLI.getPointerTy(DAG.getDataLayout()));
EVT MaskVT = Mask.getValueType().changeVectorElementTypeToInteger();
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
if (Src.isUndef() || ISD::isBuildVectorAllOnes(Mask.getNode()))
Src = getZeroVector(Op.getSimpleValueType(), Subtarget, DAG, dl);
+ // Cast mask to an integer type.
+ Mask = DAG.getBitcast(MaskVT, Mask);
+
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
@@ -24574,7 +25339,7 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
if (Mask.getValueType() != MaskVT)
Mask = getMaskNode(Mask, MaskVT, Subtarget, DAG, dl);
- SDVTList VTs = DAG.getVTList(Op.getValueType(), MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(Op.getValueType(), MVT::Other);
// If source is undef or we know it won't be used, use a zero vector
// to break register dependency.
// TODO: use undef instead and let BreakFalseDeps deal with it?
@@ -24584,9 +25349,10 @@ static SDValue getGatherNode(SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return DAG.getMergeValues({ Res, Res.getValue(2) }, dl);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MGATHER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return DAG.getMergeValues({Res, Res.getValue(1)}, dl);
}
static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -24612,11 +25378,12 @@ static SDValue getScatterNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
MemIntrinsicSDNode *MemIntr = cast<MemIntrinsicSDNode>(Op);
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, Base, Index, Scale};
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, MemIntr->getMemoryVT(), MemIntr->getMemOperand());
- return Res.getValue(1);
+ SDValue Res =
+ DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ MemIntr->getMemoryVT(), MemIntr->getMemOperand());
+ return Res;
}
static SDValue getPrefetchNode(unsigned Opc, SDValue Op, SelectionDAG &DAG,
@@ -24775,13 +25542,11 @@ static SDValue
EmitTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl, SDValue Val,
SDValue Ptr, EVT MemVT, MachineMemOperand *MMO,
SelectionDAG &DAG) {
-
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Undef = DAG.getUNDEF(Ptr.getValueType());
SDValue Ops[] = { Chain, Val, Ptr, Undef };
- return SignedSat ?
- DAG.getTargetMemSDNode<TruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
- DAG.getTargetMemSDNode<TruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+ unsigned Opc = SignedSat ? X86ISD::VTRUNCSTORES : X86ISD::VTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
/// Emit Masked Truncating Store with signed or unsigned saturation.
@@ -24789,12 +25554,10 @@ static SDValue
EmitMaskedTruncSStore(bool SignedSat, SDValue Chain, const SDLoc &Dl,
SDValue Val, SDValue Ptr, SDValue Mask, EVT MemVT,
MachineMemOperand *MMO, SelectionDAG &DAG) {
-
SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = { Chain, Val, Ptr, Mask };
- return SignedSat ?
- DAG.getTargetMemSDNode<MaskedTruncSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO) :
- DAG.getTargetMemSDNode<MaskedTruncUSStoreSDNode>(VTs, Ops, Dl, MemVT, MMO);
+ unsigned Opc = SignedSat ? X86ISD::VMTRUNCSTORES : X86ISD::VMTRUNCSTOREUS;
+ return DAG.getMemIntrinsicNode(Opc, Dl, VTs, Ops, MemVT, MMO);
}
static SDValue LowerINTRINSIC_W_CHAIN(SDValue Op, const X86Subtarget &Subtarget,
@@ -25144,7 +25907,7 @@ SDValue X86TargetLowering::LowerFRAME_TO_ARGS_OFFSET(SDValue Op,
return DAG.getIntPtrConstant(2 * RegInfo->getSlotSize(), SDLoc(Op));
}
-unsigned X86TargetLowering::getExceptionPointerRegister(
+Register X86TargetLowering::getExceptionPointerRegister(
const Constant *PersonalityFn) const {
if (classifyEHPersonality(PersonalityFn) == EHPersonality::CoreCLR)
return Subtarget.isTarget64BitLP64() ? X86::RDX : X86::EDX;
@@ -25152,7 +25915,7 @@ unsigned X86TargetLowering::getExceptionPointerRegister(
return Subtarget.isTarget64BitLP64() ? X86::RAX : X86::EAX;
}
-unsigned X86TargetLowering::getExceptionSelectorRegister(
+Register X86TargetLowering::getExceptionSelectorRegister(
const Constant *PersonalityFn) const {
// Funclet personalities don't use selectors (the runtime does the selection).
assert(!isFuncletEHPersonality(classifyEHPersonality(PersonalityFn)));
@@ -25176,7 +25939,7 @@ SDValue X86TargetLowering::LowerEH_RETURN(SDValue Op, SelectionDAG &DAG) const {
(FrameReg == X86::EBP && PtrVT == MVT::i32)) &&
"Invalid Frame Register!");
SDValue Frame = DAG.getCopyFromReg(DAG.getEntryNode(), dl, FrameReg, PtrVT);
- unsigned StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
+ Register StoreAddrReg = (PtrVT == MVT::i64) ? X86::RCX : X86::ECX;
SDValue StoreAddr = DAG.getNode(ISD::ADD, dl, PtrVT, Frame,
DAG.getIntPtrConstant(RegInfo->getSlotSize(),
@@ -25390,93 +26153,51 @@ SDValue X86TargetLowering::LowerFLT_ROUNDS_(SDValue Op,
2 Round to +inf
3 Round to -inf
- To perform the conversion, we do:
- (((((FPSR & 0x800) >> 11) | ((FPSR & 0x400) >> 9)) + 1) & 3)
+ To perform the conversion, we use a packed lookup table of the four 2-bit
+ values that we can index by FPSR[11:10]
+ 0x2d --> (0b00,10,11,01) --> (0,2,3,1) >> FPSR[11:10]
+
+ (0x2d >> ((FPSR & 0xc00) >> 9)) & 3
*/
MachineFunction &MF = DAG.getMachineFunction();
- const TargetFrameLowering &TFI = *Subtarget.getFrameLowering();
- const Align StackAlignment(TFI.getStackAlignment());
MVT VT = Op.getSimpleValueType();
SDLoc DL(Op);
// Save FP Control Word to stack slot
- int SSFI =
- MF.getFrameInfo().CreateStackObject(2, StackAlignment.value(), false);
+ int SSFI = MF.getFrameInfo().CreateStackObject(2, Align(2), false);
SDValue StackSlot =
DAG.getFrameIndex(SSFI, getPointerTy(DAG.getDataLayout()));
- MachineMemOperand *MMO =
- MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(MF, SSFI),
- MachineMemOperand::MOStore, 2, 2);
+ MachinePointerInfo MPI = MachinePointerInfo::getFixedStack(MF, SSFI);
- SDValue Ops[] = { DAG.getEntryNode(), StackSlot };
- SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
- DAG.getVTList(MVT::Other),
- Ops, MVT::i16, MMO);
+ SDValue Chain = Op.getOperand(0);
+ SDValue Ops[] = {Chain, StackSlot};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::FNSTCW16m, DL,
+ DAG.getVTList(MVT::Other), Ops, MVT::i16, MPI,
+ Align(2), MachineMemOperand::MOStore);
// Load FP Control Word from stack slot
- SDValue CWD =
- DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MachinePointerInfo());
+ SDValue CWD = DAG.getLoad(MVT::i16, DL, Chain, StackSlot, MPI, Align(2));
+ Chain = CWD.getValue(1);
- // Transform as necessary
- SDValue CWD1 =
- DAG.getNode(ISD::SRL, DL, MVT::i16,
- DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x800, DL, MVT::i16)),
- DAG.getConstant(11, DL, MVT::i8));
- SDValue CWD2 =
+ // Mask and turn the control bits into a shift for the lookup table.
+ SDValue Shift =
DAG.getNode(ISD::SRL, DL, MVT::i16,
DAG.getNode(ISD::AND, DL, MVT::i16,
- CWD, DAG.getConstant(0x400, DL, MVT::i16)),
+ CWD, DAG.getConstant(0xc00, DL, MVT::i16)),
DAG.getConstant(9, DL, MVT::i8));
+ Shift = DAG.getNode(ISD::TRUNCATE, DL, MVT::i8, Shift);
+ SDValue LUT = DAG.getConstant(0x2d, DL, MVT::i32);
SDValue RetVal =
- DAG.getNode(ISD::AND, DL, MVT::i16,
- DAG.getNode(ISD::ADD, DL, MVT::i16,
- DAG.getNode(ISD::OR, DL, MVT::i16, CWD1, CWD2),
- DAG.getConstant(1, DL, MVT::i16)),
- DAG.getConstant(3, DL, MVT::i16));
-
- return DAG.getNode((VT.getSizeInBits() < 16 ?
- ISD::TRUNCATE : ISD::ZERO_EXTEND), DL, VT, RetVal);
-}
-
-// Split an unary integer op into 2 half sized ops.
-static SDValue LowerVectorIntUnary(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
- unsigned NumElems = VT.getVectorNumElements();
- unsigned SizeInBits = VT.getSizeInBits();
- MVT EltVT = VT.getVectorElementType();
- SDValue Src = Op.getOperand(0);
- assert(EltVT == Src.getSimpleValueType().getVectorElementType() &&
- "Src and Op should have the same element type!");
+ DAG.getNode(ISD::AND, DL, MVT::i32,
+ DAG.getNode(ISD::SRL, DL, MVT::i32, LUT, Shift),
+ DAG.getConstant(3, DL, MVT::i32));
- // Extract the Lo/Hi vectors
- SDLoc dl(Op);
- SDValue Lo = extractSubVector(Src, 0, DAG, dl, SizeInBits / 2);
- SDValue Hi = extractSubVector(Src, NumElems / 2, DAG, dl, SizeInBits / 2);
+ RetVal = DAG.getZExtOrTrunc(RetVal, DL, VT);
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems / 2);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, Lo),
- DAG.getNode(Op.getOpcode(), dl, NewVT, Hi));
-}
-
-// Decompose 256-bit ops into smaller 128-bit ops.
-static SDValue Lower256IntUnary(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is256BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 256-bit vector integer operation");
- return LowerVectorIntUnary(Op, DAG);
-}
-
-// Decompose 512-bit ops into smaller 256-bit ops.
-static SDValue Lower512IntUnary(SDValue Op, SelectionDAG &DAG) {
- assert(Op.getSimpleValueType().is512BitVector() &&
- Op.getSimpleValueType().isInteger() &&
- "Only handle AVX 512-bit vector integer operation");
- return LowerVectorIntUnary(Op, DAG);
+ return DAG.getMergeValues({RetVal, Chain}, DL);
}
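
A minimal standalone C++ sketch of the packed-lookup-table conversion described in the comment at the top of this function; the fltRoundsFromFPSR helper and the test values are illustrative only, not part of the patch:

#include <cassert>
#include <cstdint>

// Map the x87 rounding-control bits FPSR[11:10] to the FLT_ROUNDS encoding
// using the packed 2-bit lookup table 0x2d, as the lowered DAG does.
static int fltRoundsFromFPSR(uint16_t FPSR) {
  unsigned Shift = (FPSR & 0xc00) >> 9; // RC field scaled to a bit offset (2*RC)
  return (0x2d >> Shift) & 3;
}

int main() {
  // RC=00 (nearest) -> 1, RC=01 (-inf) -> 3, RC=10 (+inf) -> 2, RC=11 (zero) -> 0
  assert(fltRoundsFromFPSR(0x0000) == 1);
  assert(fltRoundsFromFPSR(0x0400) == 3);
  assert(fltRoundsFromFPSR(0x0800) == 2);
  assert(fltRoundsFromFPSR(0x0c00) == 0);
  return 0;
}
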
/// Lower a vector CTLZ using native supported vector CTLZ instruction.
@@ -25499,7 +26220,7 @@ static SDValue LowerVectorCTLZ_AVX512CDI(SDValue Op, SelectionDAG &DAG,
// Split vector, its Lo and Hi parts will be handled in the next iteration.
if (NumElems > 16 ||
(NumElems == 16 && !Subtarget.canExtendTo512DQ()))
- return LowerVectorIntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
MVT NewVT = MVT::getVectorVT(MVT::i32, NumElems);
assert((NewVT.is256BitVector() || NewVT.is512BitVector()) &&
@@ -25609,11 +26330,11 @@ static SDValue LowerVectorCTLZ(SDValue Op, const SDLoc &DL,
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
- return Lower512IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
assert(Subtarget.hasSSSE3() && "Expected SSSE3 support for PSHUFB");
return LowerVectorCTLZInRegLUT(Op, DL, Subtarget, DAG);
@@ -25679,64 +26400,6 @@ static SDValue LowerCTTZ(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(X86ISD::CMOV, dl, VT, Ops);
}
-/// Break a 256-bit integer operation into two new 128-bit ones and then
-/// concatenate the result back.
-static SDValue split256IntArith(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- assert(VT.is256BitVector() && VT.isInteger() &&
- "Unsupported value type for operation");
-
- unsigned NumElems = VT.getVectorNumElements();
- SDLoc dl(Op);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract128BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract128BitVector(LHS, NumElems / 2, DAG, dl);
-
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract128BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract128BitVector(RHS, NumElems / 2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
-}
-
-/// Break a 512-bit integer operation into two new 256-bit ones and then
-/// concatenate the result back.
-static SDValue split512IntArith(SDValue Op, SelectionDAG &DAG) {
- MVT VT = Op.getSimpleValueType();
-
- assert(VT.is512BitVector() && VT.isInteger() &&
- "Unsupported value type for operation");
-
- unsigned NumElems = VT.getVectorNumElements();
- SDLoc dl(Op);
-
- // Extract the LHS vectors
- SDValue LHS = Op.getOperand(0);
- SDValue LHS1 = extract256BitVector(LHS, 0, DAG, dl);
- SDValue LHS2 = extract256BitVector(LHS, NumElems / 2, DAG, dl);
-
- // Extract the RHS vectors
- SDValue RHS = Op.getOperand(1);
- SDValue RHS1 = extract256BitVector(RHS, 0, DAG, dl);
- SDValue RHS2 = extract256BitVector(RHS, NumElems / 2, DAG, dl);
-
- MVT EltVT = VT.getVectorElementType();
- MVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
-
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT,
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, RHS1),
- DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, RHS2));
-}
-
static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
MVT VT = Op.getSimpleValueType();
@@ -25747,10 +26410,13 @@ static SDValue lowerAddSub(SDValue Op, SelectionDAG &DAG,
return DAG.getNode(ISD::XOR, SDLoc(Op), VT,
Op.getOperand(0), Op.getOperand(1));
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
@@ -25795,10 +26461,13 @@ static SDValue LowerADDSAT_SUBSAT(SDValue Op, SelectionDAG &DAG,
return SDValue();
}
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
+
assert(Op.getSimpleValueType().is256BitVector() &&
Op.getSimpleValueType().isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
}
static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
@@ -25828,9 +26497,12 @@ static SDValue LowerABS(SDValue Op, const X86Subtarget &Subtarget,
if (VT.is256BitVector() && !Subtarget.hasInt256()) {
assert(VT.isInteger() &&
"Only handle AVX 256-bit vector integer operation");
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
}
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
+
// Default to expand.
return SDValue();
}
@@ -25840,7 +26512,10 @@ static SDValue LowerMINMAX(SDValue Op, SelectionDAG &DAG) {
// For AVX1 cases, split to use legal ops (everything but v4i64).
if (VT.getScalarType() != MVT::i64 && VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
SDLoc DL(Op);
unsigned Opcode = Op.getOpcode();
@@ -25884,7 +26559,10 @@ static SDValue LowerMUL(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
SDValue A = Op.getOperand(0);
SDValue B = Op.getOperand(1);
@@ -26030,7 +26708,10 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if ((VT == MVT::v32i16 || VT == MVT::v64i8) && !Subtarget.hasBWI())
+ return splitVectorIntBinary(Op, DAG);
if (VT == MVT::v4i32 || VT == MVT::v8i32 || VT == MVT::v16i32) {
assert((VT == MVT::v4i32 && Subtarget.hasSSE2()) ||
@@ -26119,41 +26800,9 @@ static SDValue LowerMULH(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::TRUNCATE, dl, VT, Mul);
}
- // For signed 512-bit vectors, split into 256-bit vectors to allow the
- // sign-extension to occur.
- if (VT == MVT::v64i8 && IsSigned)
- return split512IntArith(Op, DAG);
-
- // Signed AVX2 implementation - extend xmm subvectors to ymm.
- if (VT == MVT::v32i8 && IsSigned) {
- MVT ExVT = MVT::v16i16;
- SDValue ALo = extract128BitVector(A, 0, DAG, dl);
- SDValue BLo = extract128BitVector(B, 0, DAG, dl);
- SDValue AHi = extract128BitVector(A, NumElts / 2, DAG, dl);
- SDValue BHi = extract128BitVector(B, NumElts / 2, DAG, dl);
- ALo = DAG.getNode(ExAVX, dl, ExVT, ALo);
- BLo = DAG.getNode(ExAVX, dl, ExVT, BLo);
- AHi = DAG.getNode(ExAVX, dl, ExVT, AHi);
- BHi = DAG.getNode(ExAVX, dl, ExVT, BHi);
- SDValue Lo = DAG.getNode(ISD::MUL, dl, ExVT, ALo, BLo);
- SDValue Hi = DAG.getNode(ISD::MUL, dl, ExVT, AHi, BHi);
- Lo = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Lo, 8, DAG);
- Hi = getTargetVShiftByConstNode(X86ISD::VSRLI, dl, ExVT, Hi, 8, DAG);
-
- // Bitcast back to VT and then pack all the even elements from Lo and Hi.
- // Shuffle lowering should turn this into PACKUS+PERMQ
- Lo = DAG.getBitcast(VT, Lo);
- Hi = DAG.getBitcast(VT, Hi);
- return DAG.getVectorShuffle(VT, dl, Lo, Hi,
- { 0, 2, 4, 6, 8, 10, 12, 14,
- 16, 18, 20, 22, 24, 26, 28, 30,
- 32, 34, 36, 38, 40, 42, 44, 46,
- 48, 50, 52, 54, 56, 58, 60, 62});
- }
-
- // For signed v16i8 and all unsigned vXi8 we will unpack the low and high
- // half of each 128 bit lane to widen to a vXi16 type. Do the multiplies,
- // shift the results and pack the half lane results back together.
+ // For vXi8 we will unpack the low and high half of each 128 bit lane to widen
+ // to a vXi16 type. Do the multiplies, shift the results and pack the half
+ // lane results back together.
MVT ExVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
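
A scalar C++ sketch of the per-element arithmetic behind the vXi8 widen-multiply-shift-pack comment above; mulhu8 is an illustrative helper (the signed path sign-extends to i16 instead), not code from the patch:

#include <cassert>
#include <cstdint>

// High half of an 8x8 unsigned multiply, obtained by widening to 16 bits,
// multiplying, and shifting -- the per-element effect of the vXi8 MULHU
// lowering (the signed form does the same after sign extension to i16).
static uint8_t mulhu8(uint8_t A, uint8_t B) {
  return static_cast<uint8_t>((uint16_t(A) * uint16_t(B)) >> 8);
}

int main() {
  assert(mulhu8(200, 100) == ((200 * 100) >> 8)); // 20000 >> 8 == 78
  assert(mulhu8(255, 255) == 254);                // 65025 >> 8
  return 0;
}
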
@@ -26267,9 +26916,12 @@ SDValue X86TargetLowering::LowerWin64_i128OP(SDValue Op, SelectionDAG &DAG) cons
assert(ArgVT.isInteger() && ArgVT.getSizeInBits() == 128 &&
"Unexpected argument type for lowering");
SDValue StackPtr = DAG.CreateStackTemporary(ArgVT, 16);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
Entry.Node = StackPtr;
InChain = DAG.getStore(InChain, dl, Op->getOperand(i), StackPtr,
- MachinePointerInfo(), /* Alignment = */ 16);
+ MPI, /* Alignment = */ 16);
Type *ArgTy = ArgVT.getTypeForEVT(*DAG.getContext());
Entry.Ty = PointerType::get(ArgTy,0);
Entry.IsSExt = false;
@@ -26410,7 +27062,7 @@ static SDValue LowerScalarImmediateShift(SDValue Op, SelectionDAG &DAG,
return ArithmeticShiftRight64(ShiftAmt);
if (VT == MVT::v16i8 || (Subtarget.hasInt256() && VT == MVT::v32i8) ||
- VT == MVT::v64i8) {
+ (Subtarget.hasBWI() && VT == MVT::v64i8)) {
unsigned NumElts = VT.getVectorNumElements();
MVT ShiftVT = MVT::getVectorVT(MVT::i16, NumElts / 2);
@@ -26856,8 +27508,8 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Constant ISD::SRA/SRL can be performed efficiently on vXi8 vectors as we
// extend to vXi16 to perform a MUL scale effectively as a MUL_LOHI.
if (ConstantAmt && (Opc == ISD::SRA || Opc == ISD::SRL) &&
- (VT == MVT::v16i8 || VT == MVT::v64i8 ||
- (VT == MVT::v32i8 && Subtarget.hasInt256())) &&
+ (VT == MVT::v16i8 || (VT == MVT::v32i8 && Subtarget.hasInt256()) ||
+ (VT == MVT::v64i8 && Subtarget.hasBWI())) &&
!Subtarget.hasXOP()) {
int NumElts = VT.getVectorNumElements();
SDValue Cst8 = DAG.getTargetConstant(8, dl, MVT::i8);
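
A scalar C++ sketch of the shift-via-multiply identity this block relies on: a constant logical or arithmetic right shift of a byte equals the high byte of the product with (1 << (8 - Amt)) after widening to i16. The helpers and the test loop are illustrative only:

#include <cassert>
#include <cstdint>

// Per-element model of the constant vXi8 shift lowering: widen the byte to
// i16, multiply by (1 << (8 - Amt)), and keep the high byte of the product.
static uint8_t srlViaMulHi(uint8_t X, unsigned Amt) {
  uint16_t Product = uint16_t(X) * uint16_t(1u << (8 - Amt));
  return static_cast<uint8_t>(Product >> 8); // == X >> Amt (logical)
}

static int8_t sraViaMulHi(int8_t X, unsigned Amt) {
  int16_t Product = int16_t(X) * int16_t(1 << (8 - Amt));
  return static_cast<int8_t>(Product >> 8);  // == X >> Amt (arithmetic)
}

int main() {
  for (unsigned Amt = 1; Amt < 8; ++Amt) {
    assert(srlViaMulHi(0xB7, Amt) == (0xB7u >> Amt));
    assert(sraViaMulHi(int8_t(-73), Amt) == int8_t(-73 >> Amt));
  }
  return 0;
}
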
@@ -26920,12 +27572,13 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
ISD::SETGT);
return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
} else if (Subtarget.hasSSE41()) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT, DAG.getSelect(dl, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, dl, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
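
A scalar model of the byte-wise sign-bit select that the PBLENDVB comments above describe, using the VSELECT-style operand order of the replaced getSelect call (a set sign bit picks the first value operand); signBitSelect and its test values are illustrative only:

#include <array>
#include <cassert>
#include <cstdint>

// Byte-wise blend driven purely by the sign bit of each selector byte:
// a set sign bit selects V0, a clear sign bit keeps V1.
static std::array<uint8_t, 16> signBitSelect(const std::array<uint8_t, 16> &Sel,
                                             const std::array<uint8_t, 16> &V0,
                                             const std::array<uint8_t, 16> &V1) {
  std::array<uint8_t, 16> R{};
  for (std::size_t I = 0; I != R.size(); ++I)
    R[I] = (Sel[I] & 0x80) ? V0[I] : V1[I];
  return R;
}

int main() {
  std::array<uint8_t, 16> Sel{}, V0{}, V1{};
  Sel[3] = 0x80; // only lane 3 has its sign bit set
  V0.fill(0xAA);
  V1.fill(0x55);
  auto R = signBitSelect(Sel, V0, V1);
  assert(R[3] == 0xAA && R[0] == 0x55);
  return 0;
}
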
@@ -27035,14 +27688,15 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
!ISD::isBuildVectorOfConstantSDNodes(Amt.getNode());
auto SignBitSelect = [&](SDValue Sel, SDValue V0, SDValue V1) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just on
+ // the sign bit.
if (UseSSE41) {
MVT ExtVT = MVT::getVectorVT(MVT::i8, VT.getVectorNumElements() * 2);
V0 = DAG.getBitcast(ExtVT, V0);
V1 = DAG.getBitcast(ExtVT, V1);
Sel = DAG.getBitcast(ExtVT, Sel);
- return DAG.getBitcast(VT, DAG.getSelect(dl, ExtVT, Sel, V0, V1));
+ return DAG.getBitcast(
+ VT, DAG.getNode(X86ISD::BLENDV, dl, ExtVT, Sel, V0, V1));
}
// On pre-SSE41 targets we splat the sign bit - a negative value will
// set all bits of the lanes to true and VSELECT uses that in
@@ -27093,7 +27747,10 @@ static SDValue LowerShift(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit shifts into 128-bit shifts.
if (VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
+
+ if (VT == MVT::v32i16 || VT == MVT::v64i8)
+ return splitVectorIntBinary(Op, DAG);
return SDValue();
}
@@ -27111,28 +27768,21 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
int NumElts = VT.getVectorNumElements();
// Check for constant splat rotation amount.
- APInt UndefElts;
- SmallVector<APInt, 32> EltBits;
- int CstSplatIndex = -1;
- if (getTargetConstantBitsFromNode(Amt, EltSizeInBits, UndefElts, EltBits))
- for (int i = 0; i != NumElts; ++i)
- if (!UndefElts[i]) {
- if (CstSplatIndex < 0 || EltBits[i] == EltBits[CstSplatIndex]) {
- CstSplatIndex = i;
- continue;
- }
- CstSplatIndex = -1;
- break;
- }
+ APInt CstSplatValue;
+ bool IsCstSplat = X86::isConstantSplat(Amt, CstSplatValue);
+
+ // Check for splat rotate by zero.
+ if (IsCstSplat && CstSplatValue.urem(EltSizeInBits) == 0)
+ return R;
// AVX512 implicitly uses modulo rotation amounts.
if (Subtarget.hasAVX512() && 32 <= EltSizeInBits) {
// Attempt to rotate by immediate.
- if (0 <= CstSplatIndex) {
- unsigned Op = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
- uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
- return DAG.getNode(Op, DL, VT, R,
- DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ if (IsCstSplat) {
+ unsigned RotOpc = (Opcode == ISD::ROTL ? X86ISD::VROTLI : X86ISD::VROTRI);
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
+ return DAG.getNode(RotOpc, DL, VT, R,
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Else, fall-back on VPROLV/VPRORV.
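
A scalar C++ sketch of the constant-splat handling above: the rotate amount is taken modulo the element width, so an amount that reduces to zero is a no-op (the early return of R). rotl32 is illustrative only:

#include <cassert>
#include <cstdint>

// Rotate-left of one 32-bit element with the implicit modulo semantics the
// immediate rotate forms rely on; RotAmt % 32 == 0 leaves the value as-is.
static uint32_t rotl32(uint32_t X, uint64_t RotAmt) {
  unsigned R = RotAmt % 32;
  if (R == 0)
    return X; // splat rotate by zero: return the input unchanged
  return (X << R) | (X >> (32 - R));
}

int main() {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0x12345678u, 32) == 0x12345678u); // amount taken modulo 32
  assert(rotl32(0x12345678u, 33) == rotl32(0x12345678u, 1));
  return 0;
}
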
@@ -27146,14 +27796,14 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// XOP implicitly uses modulo rotation amounts.
if (Subtarget.hasXOP()) {
if (VT.is256BitVector())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
assert(VT.is128BitVector() && "Only rotate 128-bit vectors!");
// Attempt to rotate by immediate.
- if (0 <= CstSplatIndex) {
- uint64_t RotateAmt = EltBits[CstSplatIndex].urem(EltSizeInBits);
+ if (IsCstSplat) {
+ uint64_t RotAmt = CstSplatValue.urem(EltSizeInBits);
return DAG.getNode(X86ISD::VROTLI, DL, VT, R,
- DAG.getTargetConstant(RotateAmt, DL, MVT::i8));
+ DAG.getTargetConstant(RotAmt, DL, MVT::i8));
}
// Use general rotate by variable (per-element).
@@ -27162,7 +27812,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
// Split 256-bit integers on pre-AVX2 targets.
if (VT.is256BitVector() && !Subtarget.hasAVX2())
- return split256IntArith(Op, DAG);
+ return splitVectorIntBinary(Op, DAG);
assert((VT == MVT::v4i32 || VT == MVT::v8i16 || VT == MVT::v16i8 ||
((VT == MVT::v8i32 || VT == MVT::v16i16 || VT == MVT::v32i8) &&
@@ -27170,7 +27820,7 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
"Only vXi32/vXi16/vXi8 vector rotates supported");
// Rotate by a uniform constant - expand back to shifts.
- if (0 <= CstSplatIndex)
+ if (IsCstSplat)
return SDValue();
bool IsSplatAmt = DAG.isSplatValue(Amt);
@@ -27186,12 +27836,13 @@ static SDValue LowerRotate(SDValue Op, const X86Subtarget &Subtarget,
auto SignBitSelect = [&](MVT SelVT, SDValue Sel, SDValue V0, SDValue V1) {
if (Subtarget.hasSSE41()) {
- // On SSE41 targets we make use of the fact that VSELECT lowers
- // to PBLENDVB which selects bytes based just on the sign bit.
+ // On SSE41 targets we can use PBLENDVB which selects bytes based just
+ // on the sign bit.
V0 = DAG.getBitcast(VT, V0);
V1 = DAG.getBitcast(VT, V1);
Sel = DAG.getBitcast(VT, Sel);
- return DAG.getBitcast(SelVT, DAG.getSelect(DL, VT, Sel, V0, V1));
+ return DAG.getBitcast(SelVT,
+ DAG.getNode(X86ISD::BLENDV, DL, VT, Sel, V0, V1));
}
// On pre-SSE41 targets we test for the sign bit by comparing to
// zero - a negative value will set all bits of the lanes to true
@@ -27303,15 +27954,14 @@ bool X86TargetLowering::needsCmpXchgNb(Type *MemType) const {
return false;
}
-// TODO: In 32-bit mode, use MOVLPS when SSE1 is available?
-// TODO: In 32-bit mode, use FISTP when X87 is available?
bool X86TargetLowering::shouldExpandAtomicStoreInIR(StoreInst *SI) const {
Type *MemType = SI->getValueOperand()->getType();
bool NoImplicitFloatOps =
SI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
- !Subtarget.useSoftFloat() && !NoImplicitFloatOps && Subtarget.hasSSE2())
+ !Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
return false;
return needsCmpXchgNb(MemType);
@@ -27330,7 +27980,7 @@ X86TargetLowering::shouldExpandAtomicLoadInIR(LoadInst *LI) const {
LI->getFunction()->hasFnAttribute(Attribute::NoImplicitFloat);
if (MemType->getPrimitiveSizeInBits() == 64 && !Subtarget.is64Bit() &&
!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- (Subtarget.hasSSE2() || Subtarget.hasX87()))
+ (Subtarget.hasSSE1() || Subtarget.hasX87()))
return AtomicExpansionKind::None;
return needsCmpXchgNb(MemType) ? AtomicExpansionKind::CmpXChg
@@ -27396,7 +28046,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
AI->use_empty())
return nullptr;
- auto Builder = IRBuilder<>(AI);
+ IRBuilder<> Builder(AI);
Module *M = Builder.GetInsertBlock()->getParent()->getParent();
auto SSID = AI->getSyncScopeID();
// We must restrict the ordering to avoid generating loads with Release or
@@ -27438,7 +28088,7 @@ X86TargetLowering::lowerIdempotentRMWIntoFencedLoad(AtomicRMWInst *AI) const {
// Finally we can emit the atomic load.
LoadInst *Loaded =
Builder.CreateAlignedLoad(AI->getType(), AI->getPointerOperand(),
- AI->getType()->getPrimitiveSizeInBits());
+ Align(AI->getType()->getPrimitiveSizeInBits()));
Loaded->setAtomic(Order, SSID);
AI->replaceAllUsesWith(Loaded);
AI->eraseFromParent();
@@ -27633,18 +28283,6 @@ static SDValue LowerBITCAST(SDValue Op, const X86Subtarget &Subtarget,
return DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v64i1, Lo, Hi);
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((SrcVT == MVT::v32i16 || SrcVT == MVT::v64i8) && DstVT.isVector() &&
- DAG.getTargetLoweringInfo().isTypeLegal(DstVT)) {
- SDLoc dl(Op);
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVector(Op.getOperand(0), dl);
- MVT CastVT = DstVT.getHalfNumVectorElementsVT();
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- return DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- }
-
// Use MOVMSK for vector to scalar conversion to prevent scalarization.
if ((SrcVT == MVT::v16i1 || SrcVT == MVT::v32i1) && DstVT.isScalarInteger()) {
assert(!Subtarget.hasAVX512() && "Should use K-registers with AVX512");
@@ -27828,11 +28466,11 @@ static SDValue LowerVectorCTPOP(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Decompose 512-bit ops into smaller 256-bit ops.
if (VT.is512BitVector() && !Subtarget.hasBWI())
- return Lower512IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// For element types greater than i8, do vXi8 pop counts and a bytesum.
if (VT.getScalarType() != MVT::i8) {
@@ -27876,7 +28514,7 @@ static SDValue LowerBITREVERSE_XOP(SDValue Op, SelectionDAG &DAG) {
// Decompose 256-bit ops into smaller 128-bit ops.
if (VT.is256BitVector())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
assert(VT.is128BitVector() &&
"Only 128-bit vector bitreverse lowering supported.");
@@ -27913,12 +28551,9 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
SDValue In = Op.getOperand(0);
SDLoc DL(Op);
- // Split v8i64/v16i32 without BWI so that we can still use the PSHUFB
- // lowering.
- if (VT == MVT::v8i64 || VT == MVT::v16i32) {
- assert(!Subtarget.hasBWI() && "BWI should Expand BITREVERSE");
- return Lower512IntUnary(Op, DAG);
- }
+ // Split v64i8 without BWI so that we can still use the PSHUFB lowering.
+ if (VT == MVT::v64i8 && !Subtarget.hasBWI())
+ return splitVectorIntUnary(Op, DAG);
unsigned NumElts = VT.getVectorNumElements();
assert(VT.getScalarType() == MVT::i8 &&
@@ -27926,7 +28561,7 @@ static SDValue LowerBITREVERSE(SDValue Op, const X86Subtarget &Subtarget,
// Decompose 256-bit ops into smaller 128-bit ops on pre-AVX2.
if (VT.is256BitVector() && !Subtarget.hasInt256())
- return Lower256IntUnary(Op, DAG);
+ return splitVectorIntUnary(Op, DAG);
// Perform BITREVERSE using PSHUFB lookups. Each byte is split into
// two nibbles and a PSHUFB lookup to find the bitreverse of each
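
A scalar C++ model of the nibble-table bit reversal the comment above describes; bitReverse8 and its test values are illustrative, while the real lowering applies the same table lane-wise with PSHUFB:

#include <cassert>
#include <cstdint>

// A 16-entry table gives the bit-reverse of each nibble; a byte is reversed
// by swapping its two reversed nibbles.
static uint8_t bitReverse8(uint8_t B) {
  static const uint8_t NibbleRev[16] = {0x0, 0x8, 0x4, 0xC, 0x2, 0xA, 0x6, 0xE,
                                        0x1, 0x9, 0x5, 0xD, 0x3, 0xB, 0x7, 0xF};
  return uint8_t((NibbleRev[B & 0xF] << 4) | NibbleRev[B >> 4]);
}

int main() {
  assert(bitReverse8(0x01) == 0x80);
  assert(bitReverse8(0xF0) == 0x0F);
  assert(bitReverse8(0xB4) == 0x2D);
  return 0;
}
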
@@ -28070,28 +28705,54 @@ static SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG,
return Op;
if (VT == MVT::i64 && !IsTypeLegal) {
- // For illegal i64 atomic_stores, we can try to use MOVQ if SSE2 is enabled.
- // FIXME: Use movlps with SSE1.
- // FIXME: Use fist with X87.
+ // For illegal i64 atomic_stores, we can try to use MOVQ or MOVLPS if SSE
+ // is enabled.
bool NoImplicitFloatOps =
DAG.getMachineFunction().getFunction().hasFnAttribute(
Attribute::NoImplicitFloat);
- if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps &&
- Subtarget.hasSSE2()) {
- SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
- Node->getOperand(2));
- SDVTList Tys = DAG.getVTList(MVT::Other);
- SDValue Ops[] = { Node->getChain(), SclToVec, Node->getBasePtr() };
- SDValue Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys,
- Ops, MVT::i64,
- Node->getMemOperand());
+ if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
+ SDValue Chain;
+ if (Subtarget.hasSSE1()) {
+ SDValue SclToVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v2i64,
+ Node->getOperand(2));
+ MVT StVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SclToVec = DAG.getBitcast(StVT, SclToVec);
+ SDVTList Tys = DAG.getVTList(MVT::Other);
+ SDValue Ops[] = {Node->getChain(), SclToVec, Node->getBasePtr()};
+ Chain = DAG.getMemIntrinsicNode(X86ISD::VEXTRACT_STORE, dl, Tys, Ops,
+ MVT::i64, Node->getMemOperand());
+ } else if (Subtarget.hasX87()) {
+ // First load this into an 80-bit X87 register using a stack temporary.
+ // This will put the whole integer into the significand.
+ SDValue StackPtr = DAG.CreateStackTemporary(MVT::i64);
+ int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
+ Chain =
+ DAG.getStore(Node->getChain(), dl, Node->getOperand(2), StackPtr,
+ MPI, /*Align*/ 0, MachineMemOperand::MOStore);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
+ SDValue LdOps[] = {Chain, StackPtr};
+ SDValue Value =
+ DAG.getMemIntrinsicNode(X86ISD::FILD, dl, Tys, LdOps, MVT::i64, MPI,
+ /*Align*/ None, MachineMemOperand::MOLoad);
+ Chain = Value.getValue(1);
+
+ // Now use an FIST to do the atomic store.
+ SDValue StoreOps[] = {Chain, Value, Node->getBasePtr()};
+ Chain =
+ DAG.getMemIntrinsicNode(X86ISD::FIST, dl, DAG.getVTList(MVT::Other),
+ StoreOps, MVT::i64, Node->getMemOperand());
+ }
- // If this is a sequentially consistent store, also emit an appropriate
- // barrier.
- if (IsSeqCst)
- Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
+ if (Chain) {
+ // If this is a sequentially consistent store, also emit an appropriate
+ // barrier.
+ if (IsSeqCst)
+ Chain = emitLockedStackOp(DAG, Subtarget, Chain, dl);
- return Chain;
+ return Chain;
+ }
}
}
@@ -28120,9 +28781,8 @@ static SDValue LowerADDSUBCARRY(SDValue Op, SelectionDAG &DAG) {
// Set the carry flag.
SDValue Carry = Op.getOperand(2);
EVT CarryVT = Carry.getValueType();
- APInt NegOne = APInt::getAllOnesValue(CarryVT.getScalarSizeInBits());
Carry = DAG.getNode(X86ISD::ADD, DL, DAG.getVTList(CarryVT, MVT::i32),
- Carry, DAG.getConstant(NegOne, DL, CarryVT));
+ Carry, DAG.getAllOnesConstant(DL, CarryVT));
unsigned Opc = Op.getOpcode() == ISD::ADDCARRY ? X86ISD::ADC : X86ISD::SBB;
SDValue Sum = DAG.getNode(Opc, DL, VTs, Op.getOperand(0),
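
A small arithmetic check of the idiom used here to raise the carry flag: adding all-ones (2^n - 1) to the incoming carry value wraps around, and hence would set CF, exactly when that value is nonzero (in particular when the boolean carry-in is 1). carryFlagFromValue is an illustrative model, not patch code:

#include <cassert>
#include <cstdint>

// Model of "ADD carry, all-ones": the unsigned add wraps (i.e. would set CF)
// iff the incoming carry value is nonzero, which is how the lowering turns
// the carry operand into a flag that ADC/SBB can consume.
static bool carryFlagFromValue(uint8_t Carry) {
  uint8_t Sum = uint8_t(Carry + 0xFF); // add all-ones
  return Sum < Carry;                  // unsigned wrap-around == carry out
}

int main() {
  assert(!carryFlagFromValue(0));
  assert(carryFlagFromValue(1));
  assert(carryFlagFromValue(0xFF));
  return 0;
}
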
@@ -28167,7 +28827,7 @@ static SDValue LowerFSINCOS(SDValue Op, const X86Subtarget &Subtarget,
DAG.getExternalSymbol(LibcallName, TLI.getPointerTy(DAG.getDataLayout()));
Type *RetTy = isF64 ? (Type *)StructType::get(ArgTy, ArgTy)
- : (Type *)VectorType::get(ArgTy, 4);
+ : (Type *)FixedVectorType::get(ArgTy, 4);
TargetLowering::CallLoweringInfo CLI(DAG);
CLI.setDebugLoc(dl)
@@ -28264,17 +28924,15 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT WideVT = TLI.getTypeToTransformTo(*DAG.getContext(), VT);
Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, WideVT, Src, DAG.getUNDEF(VT));
- SDVTList VTs = DAG.getVTList(MVT::v2i1, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
}
return SDValue();
}
MVT IndexVT = Index.getSimpleValueType();
- MVT MaskVT = Mask.getSimpleValueType();
// If the index is v2i32, we're being called by type legalization and we
// should just let the default handling take care of it.
@@ -28292,18 +28950,17 @@ static SDValue LowerMSCATTER(SDValue Op, const X86Subtarget &Subtarget,
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
- MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
Src = ExtendToType(Src, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
Mask = ExtendToType(Mask, MaskVT, DAG, true);
}
- SDVTList VTs = DAG.getVTList(MaskVT, MVT::Other);
+ SDVTList VTs = DAG.getVTList(MVT::Other);
SDValue Ops[] = {Chain, Src, Mask, BasePtr, Index, Scale};
- SDValue NewScatter = DAG.getTargetMemSDNode<X86MaskedScatterSDNode>(
- VTs, Ops, dl, N->getMemoryVT(), N->getMemOperand());
- return SDValue(NewScatter.getNode(), 1);
+ return DAG.getMemIntrinsicNode(X86ISD::MSCATTER, dl, VTs, Ops,
+ N->getMemoryVT(), N->getMemOperand());
}
static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
@@ -28329,8 +28986,7 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
N->getMemOperand(), N->getAddressingMode(), N->getExtensionType(),
N->isExpandingLoad());
// Emit a blend.
- SDValue Select = DAG.getNode(ISD::VSELECT, dl, MaskVT, Mask, NewLoad,
- PassThru);
+ SDValue Select = DAG.getNode(ISD::VSELECT, dl, VT, Mask, NewLoad, PassThru);
return DAG.getMergeValues({ Select, NewLoad.getValue(1) }, dl);
}
@@ -28366,10 +29022,10 @@ static SDValue LowerMLOAD(SDValue Op, const X86Subtarget &Subtarget,
PassThru, N->getMemoryVT(), N->getMemOperand(), N->getAddressingMode(),
N->getExtensionType(), N->isExpandingLoad());
- SDValue Exract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT,
- NewLoad.getValue(0),
- DAG.getIntPtrConstant(0, dl));
- SDValue RetOps[] = {Exract, NewLoad.getValue(1)};
+ SDValue Extract =
+ DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, VT, NewLoad.getValue(0),
+ DAG.getIntPtrConstant(0, dl));
+ SDValue RetOps[] = {Extract, NewLoad.getValue(1)};
return DAG.getMergeValues(RetOps, dl);
}
@@ -28427,7 +29083,6 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Mask = N->getMask();
SDValue PassThru = N->getPassThru();
MVT IndexVT = Index.getSimpleValueType();
- MVT MaskVT = Mask.getSimpleValueType();
assert(VT.getScalarSizeInBits() >= 32 && "Unsupported gather op");
@@ -28448,7 +29103,7 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
VT = MVT::getVectorVT(VT.getVectorElementType(), NumElts);
IndexVT = MVT::getVectorVT(IndexVT.getVectorElementType(), NumElts);
- MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
+ MVT MaskVT = MVT::getVectorVT(MVT::i1, NumElts);
PassThru = ExtendToType(PassThru, VT, DAG);
Index = ExtendToType(Index, IndexVT, DAG);
@@ -28457,12 +29112,12 @@ static SDValue LowerMGATHER(SDValue Op, const X86Subtarget &Subtarget,
SDValue Ops[] = { N->getChain(), PassThru, Mask, N->getBasePtr(), Index,
N->getScale() };
- SDValue NewGather = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(VT, MaskVT, MVT::Other), Ops, dl, N->getMemoryVT(),
+ SDValue NewGather = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(VT, MVT::Other), Ops, N->getMemoryVT(),
N->getMemOperand());
SDValue Extract = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, OrigVT,
NewGather, DAG.getIntPtrConstant(0, dl));
- return DAG.getMergeValues({Extract, NewGather.getValue(2)}, dl);
+ return DAG.getMergeValues({Extract, NewGather.getValue(1)}, dl);
}
static SDValue LowerADDRSPACECAST(SDValue Op, SelectionDAG &DAG) {
@@ -28528,6 +29183,20 @@ SDValue X86TargetLowering::LowerF128Call(SDValue Op, SelectionDAG &DAG,
return Tmp.first;
}
+// Custom split CVTPS2PH with wide types.
+static SDValue LowerCVTPS2PH(SDValue Op, SelectionDAG &DAG) {
+ SDLoc dl(Op);
+ EVT VT = Op.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(Op.getNode(), 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ SDValue RC = Op.getOperand(1);
+ Lo = DAG.getNode(X86ISD::CVTPS2PH, dl, LoVT, Lo, RC);
+ Hi = DAG.getNode(X86ISD::CVTPS2PH, dl, HiVT, Hi, RC);
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+}
+
/// Provide custom lowering hooks for some operations.
SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
switch (Op.getOpcode()) {
@@ -28581,14 +29250,21 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::STRICT_FP_EXTEND: return LowerFP_EXTEND(Op, DAG);
case ISD::FP_ROUND:
case ISD::STRICT_FP_ROUND: return LowerFP_ROUND(Op, DAG);
+ case ISD::FP16_TO_FP:
+ case ISD::STRICT_FP16_TO_FP: return LowerFP16_TO_FP(Op, DAG);
+ case ISD::FP_TO_FP16:
+ case ISD::STRICT_FP_TO_FP16: return LowerFP_TO_FP16(Op, DAG);
case ISD::LOAD: return LowerLoad(Op, Subtarget, DAG);
case ISD::STORE: return LowerStore(Op, Subtarget, DAG);
case ISD::FADD:
case ISD::FSUB: return lowerFaddFsub(Op, DAG);
+ case ISD::FROUND: return LowerFROUND(Op, DAG);
case ISD::FABS:
case ISD::FNEG: return LowerFABSorFNEG(Op, DAG);
case ISD::FCOPYSIGN: return LowerFCOPYSIGN(Op, DAG);
case ISD::FGETSIGN: return LowerFGETSIGN(Op, DAG);
+ case ISD::LRINT:
+ case ISD::LLRINT: return LowerLRINT_LLRINT(Op, DAG);
case ISD::SETCC:
case ISD::STRICT_FSETCC:
case ISD::STRICT_FSETCCS: return LowerSETCC(Op, DAG);
@@ -28656,8 +29332,8 @@ SDValue X86TargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const {
case ISD::MSCATTER: return LowerMSCATTER(Op, Subtarget, DAG);
case ISD::GC_TRANSITION_START:
case ISD::GC_TRANSITION_END: return LowerGC_TRANSITION(Op, DAG);
- case ISD::ADDRSPACECAST:
- return LowerADDRSPACECAST(Op, DAG);
+ case ISD::ADDRSPACECAST: return LowerADDRSPACECAST(Op, DAG);
+ case X86ISD::CVTPS2PH: return LowerCVTPS2PH(Op, DAG);
}
}
@@ -28703,6 +29379,35 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
N->dump(&DAG);
#endif
llvm_unreachable("Do not know how to custom type legalize this operation!");
+ case X86ISD::CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::CVTPH2PS, dl, LoVT, Lo);
+ Hi = DAG.getNode(X86ISD::CVTPH2PS, dl, HiVT, Hi);
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ return;
+ }
+ case X86ISD::STRICT_CVTPH2PS: {
+ EVT VT = N->getValueType(0);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 1);
+ EVT LoVT, HiVT;
+ std::tie(LoVT, HiVT) = DAG.GetSplitDestVTs(VT);
+ Lo = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {LoVT, MVT::Other},
+ {N->getOperand(0), Lo});
+ Hi = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {HiVT, MVT::Other},
+ {N->getOperand(0), Hi});
+ SDValue Chain = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
+ Lo.getValue(1), Hi.getValue(1));
+ SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Lo, Hi);
+ Results.push_back(Res);
+ Results.push_back(Chain);
+ return;
+ }
case ISD::CTPOP: {
assert(N->getValueType(0) == MVT::i64 && "Unexpected VT!");
// Use a v2i64 if possible.
@@ -28772,7 +29477,6 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ABS: {
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
assert(N->getValueType(0) == MVT::i64 &&
"Unexpected type (!= i64) on ABS.");
MVT HalfT = MVT::i32;
@@ -28785,15 +29489,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
DAG.getConstant(1, dl, HalfT));
Tmp = DAG.getNode(
ISD::SRA, dl, HalfT, Hi,
- DAG.getConstant(HalfT.getSizeInBits() - 1, dl,
- TLI.getShiftAmountTy(HalfT, DAG.getDataLayout())));
+ DAG.getShiftAmountConstant(HalfT.getSizeInBits() - 1, HalfT, dl));
Lo = DAG.getNode(ISD::UADDO, dl, VTList, Tmp, Lo);
Hi = DAG.getNode(ISD::ADDCARRY, dl, VTList, Tmp, Hi,
SDValue(Lo.getNode(), 1));
Hi = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Hi);
Lo = DAG.getNode(ISD::XOR, dl, HalfT, Tmp, Lo);
- Results.push_back(Lo);
- Results.push_back(Hi);
+ Results.push_back(DAG.getNode(ISD::BUILD_PAIR, dl, MVT::i64, Lo, Hi));
return;
}
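
A scalar C++ sketch of the split i64 ABS expansion above, assuming the usual identity abs(x) = (x + s) ^ s with s = x >> 63; the 32-bit halves mirror the UADDO/ADDCARRY/XOR sequence. abs64ViaHalves is illustrative only:

#include <cassert>
#include <cstdint>

// abs of an i64 split into 32-bit halves: splat the sign of Hi, add it with
// carry across both halves, then XOR both halves with the sign splat.
static uint64_t abs64ViaHalves(int64_t X) {
  uint32_t Lo = uint32_t(uint64_t(X));
  uint32_t Hi = uint32_t(uint64_t(X) >> 32);
  uint32_t Sign = uint32_t(int32_t(Hi) >> 31);     // 0 or 0xFFFFFFFF
  uint64_t LoSum = uint64_t(Lo) + Sign;            // UADDO
  uint32_t Carry = uint32_t(LoSum >> 32);
  uint32_t NewLo = uint32_t(LoSum) ^ Sign;         // XOR low half
  uint32_t NewHi = (Hi + Sign + Carry) ^ Sign;     // ADDCARRY then XOR
  return (uint64_t(NewHi) << 32) | NewLo;
}

int main() {
  assert(abs64ViaHalves(-5) == 5);
  assert(abs64ViaHalves(5) == 5);
  assert(abs64ViaHalves(INT64_MIN + 1) == uint64_t(INT64_MAX));
  return 0;
}
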
// We might have generated v2f32 FMIN/FMAX operations. Widen them to v4f32.
@@ -29145,6 +29847,13 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
return;
}
+ case ISD::LRINT:
+ case ISD::LLRINT: {
+ if (SDValue V = LRINT_LLRINTHelper(N, DAG))
+ Results.push_back(V);
+ return;
+ }
+
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
case ISD::UINT_TO_FP:
@@ -29182,14 +29891,14 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
SDValue SignSrc = DAG.getSelect(dl, SrcVT, IsNeg, Sign, Src);
SmallVector<SDValue, 4> SignCvts(4, DAG.getConstantFP(0.0, dl, MVT::f32));
for (int i = 0; i != 2; ++i) {
- SDValue Src = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
+ SDValue Elt = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64,
SignSrc, DAG.getIntPtrConstant(i, dl));
if (IsStrict)
SignCvts[i] =
DAG.getNode(ISD::STRICT_SINT_TO_FP, dl, {MVT::f32, MVT::Other},
- {N->getOperand(0), Src});
+ {N->getOperand(0), Elt});
else
- SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Src);
+ SignCvts[i] = DAG.getNode(ISD::SINT_TO_FP, dl, MVT::f32, Elt);
};
SDValue SignCvt = DAG.getBuildVector(MVT::v4f32, dl, SignCvts);
SDValue Slow, Chain;
@@ -29269,7 +29978,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Results.push_back(V.getValue(1));
return;
}
- case ISD::FP_EXTEND: {
+ case ISD::FP_EXTEND:
+ case ISD::STRICT_FP_EXTEND: {
// Right now, only MVT::v2f32 has OperationAction for FP_EXTEND.
// No other ValueType for FP_EXTEND should reach this point.
assert(N->getValueType(0) == MVT::v2f32 &&
@@ -29391,15 +30101,27 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
Attribute::NoImplicitFloat);
if (!Subtarget.useSoftFloat() && !NoImplicitFloatOps) {
auto *Node = cast<AtomicSDNode>(N);
- if (Subtarget.hasSSE2()) {
- // Use a VZEXT_LOAD which will be selected as MOVQ. Then extract the
- // lower 64-bits.
- SDVTList Tys = DAG.getVTList(MVT::v2i64, MVT::Other);
+ if (Subtarget.hasSSE1()) {
+ // Use a VZEXT_LOAD which will be selected as MOVQ or XORPS+MOVLPS.
+ // Then extract the lower 64-bits.
+ MVT LdVT = Subtarget.hasSSE2() ? MVT::v2i64 : MVT::v4f32;
+ SDVTList Tys = DAG.getVTList(LdVT, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
SDValue Ld = DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
MVT::i64, Node->getMemOperand());
- SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ if (Subtarget.hasSSE2()) {
+ SDValue Res = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::i64, Ld,
+ DAG.getIntPtrConstant(0, dl));
+ Results.push_back(Res);
+ Results.push_back(Ld.getValue(1));
+ return;
+ }
+ // We use an alternative sequence for SSE1 that extracts as v2f32 and
+ // then casts to i64. This avoids a 128-bit stack temporary being
+ // created by type legalization if we were to cast v4f32->v2i64.
+ SDValue Res = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Ld,
DAG.getIntPtrConstant(0, dl));
+ Res = DAG.getBitcast(MVT::i64, Res);
Results.push_back(Res);
Results.push_back(Ld.getValue(1));
return;
@@ -29407,14 +30129,12 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasX87()) {
// First load this into an 80-bit X87 register. This will put the whole
// integer into the significand.
- // FIXME: Do we need to glue? See FIXME comment in BuildFILD.
- SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other, MVT::Glue);
+ SDVTList Tys = DAG.getVTList(MVT::f80, MVT::Other);
SDValue Ops[] = { Node->getChain(), Node->getBasePtr() };
- SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD_FLAG,
+ SDValue Result = DAG.getMemIntrinsicNode(X86ISD::FILD,
dl, Tys, Ops, MVT::i64,
Node->getMemOperand());
SDValue Chain = Result.getValue(1);
- SDValue InFlag = Result.getValue(2);
// Now store the X87 register to a stack temporary and convert to i64.
// This store is not atomic and doesn't need to be.
@@ -29424,11 +30144,10 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
int SPFI = cast<FrameIndexSDNode>(StackPtr.getNode())->getIndex();
MachinePointerInfo MPI =
MachinePointerInfo::getFixedStack(DAG.getMachineFunction(), SPFI);
- SDValue StoreOps[] = { Chain, Result, StackPtr, InFlag };
- Chain = DAG.getMemIntrinsicNode(X86ISD::FIST, dl,
- DAG.getVTList(MVT::Other), StoreOps,
- MVT::i64, MPI, 0 /*Align*/,
- MachineMemOperand::MOStore);
+ SDValue StoreOps[] = { Chain, Result, StackPtr };
+ Chain = DAG.getMemIntrinsicNode(
+ X86ISD::FIST, dl, DAG.getVTList(MVT::Other), StoreOps, MVT::i64,
+ MPI, None /*Align*/, MachineMemOperand::MOStore);
// Finally load the value back from the stack temporary and return it.
// This load is not atomic and doesn't need to be.
@@ -29477,24 +30196,15 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
- // Custom splitting for BWI types when AVX512F is available but BWI isn't.
- if ((DstVT == MVT::v32i16 || DstVT == MVT::v64i8) &&
- SrcVT.isVector() && isTypeLegal(SrcVT)) {
- SDValue Lo, Hi;
- std::tie(Lo, Hi) = DAG.SplitVectorOperand(N, 0);
- MVT CastVT = (DstVT == MVT::v32i16) ? MVT::v16i16 : MVT::v32i8;
- Lo = DAG.getBitcast(CastVT, Lo);
- Hi = DAG.getBitcast(CastVT, Hi);
- SDValue Res = DAG.getNode(ISD::CONCAT_VECTORS, dl, DstVT, Lo, Hi);
- Results.push_back(Res);
- return;
- }
-
if (DstVT.isVector() && SrcVT == MVT::x86mmx) {
+ // FIXME: Use v4f32 for SSE1?
+ assert(Subtarget.hasSSE2() && "Requires SSE2");
assert(getTypeAction(*DAG.getContext(), DstVT) == TypeWidenVector &&
"Unexpected type action!");
EVT WideVT = getTypeToTransformTo(*DAG.getContext(), DstVT);
- SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, WideVT, N->getOperand(0));
+ SDValue Res = DAG.getNode(X86ISD::MOVQ2DQ, dl, MVT::v2i64,
+ N->getOperand(0));
+ Res = DAG.getBitcast(WideVT, Res);
Results.push_back(Res);
return;
}
@@ -29526,11 +30236,11 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
}
SDValue Ops[] = { Gather->getChain(), PassThru, Mask,
Gather->getBasePtr(), Index, Gather->getScale() };
- SDValue Res = DAG.getTargetMemSDNode<X86MaskedGatherSDNode>(
- DAG.getVTList(WideVT, Mask.getValueType(), MVT::Other), Ops, dl,
- Gather->getMemoryVT(), Gather->getMemOperand());
+ SDValue Res = DAG.getMemIntrinsicNode(
+ X86ISD::MGATHER, dl, DAG.getVTList(WideVT, MVT::Other), Ops,
+ Gather->getMemoryVT(), Gather->getMemOperand());
Results.push_back(Res);
- Results.push_back(Res.getValue(2));
+ Results.push_back(Res.getValue(1));
return;
}
return;
@@ -29549,7 +30259,7 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
if (Subtarget.hasSSE2()) {
MVT LdVT = Subtarget.is64Bit() && VT.isInteger() ? MVT::i64 : MVT::f64;
SDValue Res = DAG.getLoad(LdVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Ld->getAlignment(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue Chain = Res.getValue(1);
MVT VecVT = MVT::getVectorVT(LdVT, 2);
@@ -29570,25 +30280,8 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
return;
}
case ISD::ADDRSPACECAST: {
- SDValue Src = N->getOperand(0);
- EVT DstVT = N->getValueType(0);
- AddrSpaceCastSDNode *CastN = cast<AddrSpaceCastSDNode>(N);
- unsigned SrcAS = CastN->getSrcAddressSpace();
-
- assert(SrcAS != CastN->getDestAddressSpace() &&
- "addrspacecast must be between different address spaces");
-
- SDValue Res;
- if (SrcAS == X86AS::PTR32_UPTR && DstVT == MVT::i64)
- Res = DAG.getNode(ISD::ZERO_EXTEND, dl, DstVT, Src);
- else if (DstVT == MVT::i64)
- Res = DAG.getNode(ISD::SIGN_EXTEND, dl, DstVT, Src);
- else if (DstVT == MVT::i32)
- Res = DAG.getNode(ISD::TRUNCATE, dl, DstVT, Src);
- else
- report_fatal_error("Unrecognized addrspacecast type legalization");
-
- Results.push_back(Res);
+ SDValue V = LowerADDRSPACECAST(SDValue(N,0), DAG);
+ Results.push_back(V);
return;
}
}
@@ -29597,362 +30290,367 @@ void X86TargetLowering::ReplaceNodeResults(SDNode *N,
const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const {
switch ((X86ISD::NodeType)Opcode) {
case X86ISD::FIRST_NUMBER: break;
- case X86ISD::BSF: return "X86ISD::BSF";
- case X86ISD::BSR: return "X86ISD::BSR";
- case X86ISD::SHLD: return "X86ISD::SHLD";
- case X86ISD::SHRD: return "X86ISD::SHRD";
- case X86ISD::FAND: return "X86ISD::FAND";
- case X86ISD::FANDN: return "X86ISD::FANDN";
- case X86ISD::FOR: return "X86ISD::FOR";
- case X86ISD::FXOR: return "X86ISD::FXOR";
- case X86ISD::FILD: return "X86ISD::FILD";
- case X86ISD::FILD_FLAG: return "X86ISD::FILD_FLAG";
- case X86ISD::FIST: return "X86ISD::FIST";
- case X86ISD::FP_TO_INT_IN_MEM: return "X86ISD::FP_TO_INT_IN_MEM";
- case X86ISD::FLD: return "X86ISD::FLD";
- case X86ISD::FST: return "X86ISD::FST";
- case X86ISD::CALL: return "X86ISD::CALL";
- case X86ISD::BT: return "X86ISD::BT";
- case X86ISD::CMP: return "X86ISD::CMP";
- case X86ISD::STRICT_FCMP: return "X86ISD::STRICT_FCMP";
- case X86ISD::STRICT_FCMPS: return "X86ISD::STRICT_FCMPS";
- case X86ISD::COMI: return "X86ISD::COMI";
- case X86ISD::UCOMI: return "X86ISD::UCOMI";
- case X86ISD::CMPM: return "X86ISD::CMPM";
- case X86ISD::STRICT_CMPM: return "X86ISD::STRICT_CMPM";
- case X86ISD::CMPM_SAE: return "X86ISD::CMPM_SAE";
- case X86ISD::SETCC: return "X86ISD::SETCC";
- case X86ISD::SETCC_CARRY: return "X86ISD::SETCC_CARRY";
- case X86ISD::FSETCC: return "X86ISD::FSETCC";
- case X86ISD::FSETCCM: return "X86ISD::FSETCCM";
- case X86ISD::FSETCCM_SAE: return "X86ISD::FSETCCM_SAE";
- case X86ISD::CMOV: return "X86ISD::CMOV";
- case X86ISD::BRCOND: return "X86ISD::BRCOND";
- case X86ISD::RET_FLAG: return "X86ISD::RET_FLAG";
- case X86ISD::IRET: return "X86ISD::IRET";
- case X86ISD::REP_STOS: return "X86ISD::REP_STOS";
- case X86ISD::REP_MOVS: return "X86ISD::REP_MOVS";
- case X86ISD::GlobalBaseReg: return "X86ISD::GlobalBaseReg";
- case X86ISD::Wrapper: return "X86ISD::Wrapper";
- case X86ISD::WrapperRIP: return "X86ISD::WrapperRIP";
- case X86ISD::MOVQ2DQ: return "X86ISD::MOVQ2DQ";
- case X86ISD::MOVDQ2Q: return "X86ISD::MOVDQ2Q";
- case X86ISD::MMX_MOVD2W: return "X86ISD::MMX_MOVD2W";
- case X86ISD::MMX_MOVW2D: return "X86ISD::MMX_MOVW2D";
- case X86ISD::PEXTRB: return "X86ISD::PEXTRB";
- case X86ISD::PEXTRW: return "X86ISD::PEXTRW";
- case X86ISD::INSERTPS: return "X86ISD::INSERTPS";
- case X86ISD::PINSRB: return "X86ISD::PINSRB";
- case X86ISD::PINSRW: return "X86ISD::PINSRW";
- case X86ISD::PSHUFB: return "X86ISD::PSHUFB";
- case X86ISD::ANDNP: return "X86ISD::ANDNP";
- case X86ISD::BLENDI: return "X86ISD::BLENDI";
- case X86ISD::BLENDV: return "X86ISD::BLENDV";
- case X86ISD::HADD: return "X86ISD::HADD";
- case X86ISD::HSUB: return "X86ISD::HSUB";
- case X86ISD::FHADD: return "X86ISD::FHADD";
- case X86ISD::FHSUB: return "X86ISD::FHSUB";
- case X86ISD::CONFLICT: return "X86ISD::CONFLICT";
- case X86ISD::FMAX: return "X86ISD::FMAX";
- case X86ISD::FMAXS: return "X86ISD::FMAXS";
- case X86ISD::FMAX_SAE: return "X86ISD::FMAX_SAE";
- case X86ISD::FMAXS_SAE: return "X86ISD::FMAXS_SAE";
- case X86ISD::FMIN: return "X86ISD::FMIN";
- case X86ISD::FMINS: return "X86ISD::FMINS";
- case X86ISD::FMIN_SAE: return "X86ISD::FMIN_SAE";
- case X86ISD::FMINS_SAE: return "X86ISD::FMINS_SAE";
- case X86ISD::FMAXC: return "X86ISD::FMAXC";
- case X86ISD::FMINC: return "X86ISD::FMINC";
- case X86ISD::FRSQRT: return "X86ISD::FRSQRT";
- case X86ISD::FRCP: return "X86ISD::FRCP";
- case X86ISD::EXTRQI: return "X86ISD::EXTRQI";
- case X86ISD::INSERTQI: return "X86ISD::INSERTQI";
- case X86ISD::TLSADDR: return "X86ISD::TLSADDR";
- case X86ISD::TLSBASEADDR: return "X86ISD::TLSBASEADDR";
- case X86ISD::TLSCALL: return "X86ISD::TLSCALL";
- case X86ISD::EH_SJLJ_SETJMP: return "X86ISD::EH_SJLJ_SETJMP";
- case X86ISD::EH_SJLJ_LONGJMP: return "X86ISD::EH_SJLJ_LONGJMP";
- case X86ISD::EH_SJLJ_SETUP_DISPATCH:
- return "X86ISD::EH_SJLJ_SETUP_DISPATCH";
- case X86ISD::EH_RETURN: return "X86ISD::EH_RETURN";
- case X86ISD::TC_RETURN: return "X86ISD::TC_RETURN";
- case X86ISD::FNSTCW16m: return "X86ISD::FNSTCW16m";
- case X86ISD::FNSTSW16r: return "X86ISD::FNSTSW16r";
- case X86ISD::LCMPXCHG_DAG: return "X86ISD::LCMPXCHG_DAG";
- case X86ISD::LCMPXCHG8_DAG: return "X86ISD::LCMPXCHG8_DAG";
- case X86ISD::LCMPXCHG16_DAG: return "X86ISD::LCMPXCHG16_DAG";
- case X86ISD::LCMPXCHG8_SAVE_EBX_DAG:
- return "X86ISD::LCMPXCHG8_SAVE_EBX_DAG";
- case X86ISD::LCMPXCHG16_SAVE_RBX_DAG:
- return "X86ISD::LCMPXCHG16_SAVE_RBX_DAG";
- case X86ISD::LADD: return "X86ISD::LADD";
- case X86ISD::LSUB: return "X86ISD::LSUB";
- case X86ISD::LOR: return "X86ISD::LOR";
- case X86ISD::LXOR: return "X86ISD::LXOR";
- case X86ISD::LAND: return "X86ISD::LAND";
- case X86ISD::VZEXT_MOVL: return "X86ISD::VZEXT_MOVL";
- case X86ISD::VZEXT_LOAD: return "X86ISD::VZEXT_LOAD";
- case X86ISD::VEXTRACT_STORE: return "X86ISD::VEXTRACT_STORE";
- case X86ISD::VTRUNC: return "X86ISD::VTRUNC";
- case X86ISD::VTRUNCS: return "X86ISD::VTRUNCS";
- case X86ISD::VTRUNCUS: return "X86ISD::VTRUNCUS";
- case X86ISD::VMTRUNC: return "X86ISD::VMTRUNC";
- case X86ISD::VMTRUNCS: return "X86ISD::VMTRUNCS";
- case X86ISD::VMTRUNCUS: return "X86ISD::VMTRUNCUS";
- case X86ISD::VTRUNCSTORES: return "X86ISD::VTRUNCSTORES";
- case X86ISD::VTRUNCSTOREUS: return "X86ISD::VTRUNCSTOREUS";
- case X86ISD::VMTRUNCSTORES: return "X86ISD::VMTRUNCSTORES";
- case X86ISD::VMTRUNCSTOREUS: return "X86ISD::VMTRUNCSTOREUS";
- case X86ISD::VFPEXT: return "X86ISD::VFPEXT";
- case X86ISD::STRICT_VFPEXT: return "X86ISD::STRICT_VFPEXT";
- case X86ISD::VFPEXT_SAE: return "X86ISD::VFPEXT_SAE";
- case X86ISD::VFPEXTS: return "X86ISD::VFPEXTS";
- case X86ISD::VFPEXTS_SAE: return "X86ISD::VFPEXTS_SAE";
- case X86ISD::VFPROUND: return "X86ISD::VFPROUND";
- case X86ISD::STRICT_VFPROUND: return "X86ISD::STRICT_VFPROUND";
- case X86ISD::VMFPROUND: return "X86ISD::VMFPROUND";
- case X86ISD::VFPROUND_RND: return "X86ISD::VFPROUND_RND";
- case X86ISD::VFPROUNDS: return "X86ISD::VFPROUNDS";
- case X86ISD::VFPROUNDS_RND: return "X86ISD::VFPROUNDS_RND";
- case X86ISD::VSHLDQ: return "X86ISD::VSHLDQ";
- case X86ISD::VSRLDQ: return "X86ISD::VSRLDQ";
- case X86ISD::VSHL: return "X86ISD::VSHL";
- case X86ISD::VSRL: return "X86ISD::VSRL";
- case X86ISD::VSRA: return "X86ISD::VSRA";
- case X86ISD::VSHLI: return "X86ISD::VSHLI";
- case X86ISD::VSRLI: return "X86ISD::VSRLI";
- case X86ISD::VSRAI: return "X86ISD::VSRAI";
- case X86ISD::VSHLV: return "X86ISD::VSHLV";
- case X86ISD::VSRLV: return "X86ISD::VSRLV";
- case X86ISD::VSRAV: return "X86ISD::VSRAV";
- case X86ISD::VROTLI: return "X86ISD::VROTLI";
- case X86ISD::VROTRI: return "X86ISD::VROTRI";
- case X86ISD::VPPERM: return "X86ISD::VPPERM";
- case X86ISD::CMPP: return "X86ISD::CMPP";
- case X86ISD::STRICT_CMPP: return "X86ISD::STRICT_CMPP";
- case X86ISD::PCMPEQ: return "X86ISD::PCMPEQ";
- case X86ISD::PCMPGT: return "X86ISD::PCMPGT";
- case X86ISD::PHMINPOS: return "X86ISD::PHMINPOS";
- case X86ISD::ADD: return "X86ISD::ADD";
- case X86ISD::SUB: return "X86ISD::SUB";
- case X86ISD::ADC: return "X86ISD::ADC";
- case X86ISD::SBB: return "X86ISD::SBB";
- case X86ISD::SMUL: return "X86ISD::SMUL";
- case X86ISD::UMUL: return "X86ISD::UMUL";
- case X86ISD::OR: return "X86ISD::OR";
- case X86ISD::XOR: return "X86ISD::XOR";
- case X86ISD::AND: return "X86ISD::AND";
- case X86ISD::BEXTR: return "X86ISD::BEXTR";
- case X86ISD::BZHI: return "X86ISD::BZHI";
- case X86ISD::MUL_IMM: return "X86ISD::MUL_IMM";
- case X86ISD::MOVMSK: return "X86ISD::MOVMSK";
- case X86ISD::PTEST: return "X86ISD::PTEST";
- case X86ISD::TESTP: return "X86ISD::TESTP";
- case X86ISD::KORTEST: return "X86ISD::KORTEST";
- case X86ISD::KTEST: return "X86ISD::KTEST";
- case X86ISD::KADD: return "X86ISD::KADD";
- case X86ISD::KSHIFTL: return "X86ISD::KSHIFTL";
- case X86ISD::KSHIFTR: return "X86ISD::KSHIFTR";
- case X86ISD::PACKSS: return "X86ISD::PACKSS";
- case X86ISD::PACKUS: return "X86ISD::PACKUS";
- case X86ISD::PALIGNR: return "X86ISD::PALIGNR";
- case X86ISD::VALIGN: return "X86ISD::VALIGN";
- case X86ISD::VSHLD: return "X86ISD::VSHLD";
- case X86ISD::VSHRD: return "X86ISD::VSHRD";
- case X86ISD::VSHLDV: return "X86ISD::VSHLDV";
- case X86ISD::VSHRDV: return "X86ISD::VSHRDV";
- case X86ISD::PSHUFD: return "X86ISD::PSHUFD";
- case X86ISD::PSHUFHW: return "X86ISD::PSHUFHW";
- case X86ISD::PSHUFLW: return "X86ISD::PSHUFLW";
- case X86ISD::SHUFP: return "X86ISD::SHUFP";
- case X86ISD::SHUF128: return "X86ISD::SHUF128";
- case X86ISD::MOVLHPS: return "X86ISD::MOVLHPS";
- case X86ISD::MOVHLPS: return "X86ISD::MOVHLPS";
- case X86ISD::MOVDDUP: return "X86ISD::MOVDDUP";
- case X86ISD::MOVSHDUP: return "X86ISD::MOVSHDUP";
- case X86ISD::MOVSLDUP: return "X86ISD::MOVSLDUP";
- case X86ISD::MOVSD: return "X86ISD::MOVSD";
- case X86ISD::MOVSS: return "X86ISD::MOVSS";
- case X86ISD::UNPCKL: return "X86ISD::UNPCKL";
- case X86ISD::UNPCKH: return "X86ISD::UNPCKH";
- case X86ISD::VBROADCAST: return "X86ISD::VBROADCAST";
- case X86ISD::VBROADCAST_LOAD: return "X86ISD::VBROADCAST_LOAD";
- case X86ISD::VBROADCASTM: return "X86ISD::VBROADCASTM";
- case X86ISD::SUBV_BROADCAST: return "X86ISD::SUBV_BROADCAST";
- case X86ISD::VPERMILPV: return "X86ISD::VPERMILPV";
- case X86ISD::VPERMILPI: return "X86ISD::VPERMILPI";
- case X86ISD::VPERM2X128: return "X86ISD::VPERM2X128";
- case X86ISD::VPERMV: return "X86ISD::VPERMV";
- case X86ISD::VPERMV3: return "X86ISD::VPERMV3";
- case X86ISD::VPERMI: return "X86ISD::VPERMI";
- case X86ISD::VPTERNLOG: return "X86ISD::VPTERNLOG";
- case X86ISD::VFIXUPIMM: return "X86ISD::VFIXUPIMM";
- case X86ISD::VFIXUPIMM_SAE: return "X86ISD::VFIXUPIMM_SAE";
- case X86ISD::VFIXUPIMMS: return "X86ISD::VFIXUPIMMS";
- case X86ISD::VFIXUPIMMS_SAE: return "X86ISD::VFIXUPIMMS_SAE";
- case X86ISD::VRANGE: return "X86ISD::VRANGE";
- case X86ISD::VRANGE_SAE: return "X86ISD::VRANGE_SAE";
- case X86ISD::VRANGES: return "X86ISD::VRANGES";
- case X86ISD::VRANGES_SAE: return "X86ISD::VRANGES_SAE";
- case X86ISD::PMULUDQ: return "X86ISD::PMULUDQ";
- case X86ISD::PMULDQ: return "X86ISD::PMULDQ";
- case X86ISD::PSADBW: return "X86ISD::PSADBW";
- case X86ISD::DBPSADBW: return "X86ISD::DBPSADBW";
- case X86ISD::VASTART_SAVE_XMM_REGS: return "X86ISD::VASTART_SAVE_XMM_REGS";
- case X86ISD::VAARG_64: return "X86ISD::VAARG_64";
- case X86ISD::WIN_ALLOCA: return "X86ISD::WIN_ALLOCA";
- case X86ISD::MEMBARRIER: return "X86ISD::MEMBARRIER";
- case X86ISD::MFENCE: return "X86ISD::MFENCE";
- case X86ISD::SEG_ALLOCA: return "X86ISD::SEG_ALLOCA";
- case X86ISD::SAHF: return "X86ISD::SAHF";
- case X86ISD::RDRAND: return "X86ISD::RDRAND";
- case X86ISD::RDSEED: return "X86ISD::RDSEED";
- case X86ISD::RDPKRU: return "X86ISD::RDPKRU";
- case X86ISD::WRPKRU: return "X86ISD::WRPKRU";
- case X86ISD::VPMADDUBSW: return "X86ISD::VPMADDUBSW";
- case X86ISD::VPMADDWD: return "X86ISD::VPMADDWD";
- case X86ISD::VPSHA: return "X86ISD::VPSHA";
- case X86ISD::VPSHL: return "X86ISD::VPSHL";
- case X86ISD::VPCOM: return "X86ISD::VPCOM";
- case X86ISD::VPCOMU: return "X86ISD::VPCOMU";
- case X86ISD::VPERMIL2: return "X86ISD::VPERMIL2";
- case X86ISD::FMSUB: return "X86ISD::FMSUB";
- case X86ISD::FNMADD: return "X86ISD::FNMADD";
- case X86ISD::FNMSUB: return "X86ISD::FNMSUB";
- case X86ISD::FMADDSUB: return "X86ISD::FMADDSUB";
- case X86ISD::FMSUBADD: return "X86ISD::FMSUBADD";
- case X86ISD::FMADD_RND: return "X86ISD::FMADD_RND";
- case X86ISD::FNMADD_RND: return "X86ISD::FNMADD_RND";
- case X86ISD::FMSUB_RND: return "X86ISD::FMSUB_RND";
- case X86ISD::FNMSUB_RND: return "X86ISD::FNMSUB_RND";
- case X86ISD::FMADDSUB_RND: return "X86ISD::FMADDSUB_RND";
- case X86ISD::FMSUBADD_RND: return "X86ISD::FMSUBADD_RND";
- case X86ISD::VPMADD52H: return "X86ISD::VPMADD52H";
- case X86ISD::VPMADD52L: return "X86ISD::VPMADD52L";
- case X86ISD::VRNDSCALE: return "X86ISD::VRNDSCALE";
- case X86ISD::STRICT_VRNDSCALE: return "X86ISD::STRICT_VRNDSCALE";
- case X86ISD::VRNDSCALE_SAE: return "X86ISD::VRNDSCALE_SAE";
- case X86ISD::VRNDSCALES: return "X86ISD::VRNDSCALES";
- case X86ISD::VRNDSCALES_SAE: return "X86ISD::VRNDSCALES_SAE";
- case X86ISD::VREDUCE: return "X86ISD::VREDUCE";
- case X86ISD::VREDUCE_SAE: return "X86ISD::VREDUCE_SAE";
- case X86ISD::VREDUCES: return "X86ISD::VREDUCES";
- case X86ISD::VREDUCES_SAE: return "X86ISD::VREDUCES_SAE";
- case X86ISD::VGETMANT: return "X86ISD::VGETMANT";
- case X86ISD::VGETMANT_SAE: return "X86ISD::VGETMANT_SAE";
- case X86ISD::VGETMANTS: return "X86ISD::VGETMANTS";
- case X86ISD::VGETMANTS_SAE: return "X86ISD::VGETMANTS_SAE";
- case X86ISD::PCMPESTR: return "X86ISD::PCMPESTR";
- case X86ISD::PCMPISTR: return "X86ISD::PCMPISTR";
- case X86ISD::XTEST: return "X86ISD::XTEST";
- case X86ISD::COMPRESS: return "X86ISD::COMPRESS";
- case X86ISD::EXPAND: return "X86ISD::EXPAND";
- case X86ISD::SELECTS: return "X86ISD::SELECTS";
- case X86ISD::ADDSUB: return "X86ISD::ADDSUB";
- case X86ISD::RCP14: return "X86ISD::RCP14";
- case X86ISD::RCP14S: return "X86ISD::RCP14S";
- case X86ISD::RCP28: return "X86ISD::RCP28";
- case X86ISD::RCP28_SAE: return "X86ISD::RCP28_SAE";
- case X86ISD::RCP28S: return "X86ISD::RCP28S";
- case X86ISD::RCP28S_SAE: return "X86ISD::RCP28S_SAE";
- case X86ISD::EXP2: return "X86ISD::EXP2";
- case X86ISD::EXP2_SAE: return "X86ISD::EXP2_SAE";
- case X86ISD::RSQRT14: return "X86ISD::RSQRT14";
- case X86ISD::RSQRT14S: return "X86ISD::RSQRT14S";
- case X86ISD::RSQRT28: return "X86ISD::RSQRT28";
- case X86ISD::RSQRT28_SAE: return "X86ISD::RSQRT28_SAE";
- case X86ISD::RSQRT28S: return "X86ISD::RSQRT28S";
- case X86ISD::RSQRT28S_SAE: return "X86ISD::RSQRT28S_SAE";
- case X86ISD::FADD_RND: return "X86ISD::FADD_RND";
- case X86ISD::FADDS: return "X86ISD::FADDS";
- case X86ISD::FADDS_RND: return "X86ISD::FADDS_RND";
- case X86ISD::FSUB_RND: return "X86ISD::FSUB_RND";
- case X86ISD::FSUBS: return "X86ISD::FSUBS";
- case X86ISD::FSUBS_RND: return "X86ISD::FSUBS_RND";
- case X86ISD::FMUL_RND: return "X86ISD::FMUL_RND";
- case X86ISD::FMULS: return "X86ISD::FMULS";
- case X86ISD::FMULS_RND: return "X86ISD::FMULS_RND";
- case X86ISD::FDIV_RND: return "X86ISD::FDIV_RND";
- case X86ISD::FDIVS: return "X86ISD::FDIVS";
- case X86ISD::FDIVS_RND: return "X86ISD::FDIVS_RND";
- case X86ISD::FSQRT_RND: return "X86ISD::FSQRT_RND";
- case X86ISD::FSQRTS: return "X86ISD::FSQRTS";
- case X86ISD::FSQRTS_RND: return "X86ISD::FSQRTS_RND";
- case X86ISD::FGETEXP: return "X86ISD::FGETEXP";
- case X86ISD::FGETEXP_SAE: return "X86ISD::FGETEXP_SAE";
- case X86ISD::FGETEXPS: return "X86ISD::FGETEXPS";
- case X86ISD::FGETEXPS_SAE: return "X86ISD::FGETEXPS_SAE";
- case X86ISD::SCALEF: return "X86ISD::SCALEF";
- case X86ISD::SCALEF_RND: return "X86ISD::SCALEF_RND";
- case X86ISD::SCALEFS: return "X86ISD::SCALEFS";
- case X86ISD::SCALEFS_RND: return "X86ISD::SCALEFS_RND";
- case X86ISD::AVG: return "X86ISD::AVG";
- case X86ISD::MULHRS: return "X86ISD::MULHRS";
- case X86ISD::SINT_TO_FP_RND: return "X86ISD::SINT_TO_FP_RND";
- case X86ISD::UINT_TO_FP_RND: return "X86ISD::UINT_TO_FP_RND";
- case X86ISD::CVTTP2SI: return "X86ISD::CVTTP2SI";
- case X86ISD::CVTTP2UI: return "X86ISD::CVTTP2UI";
- case X86ISD::STRICT_CVTTP2SI: return "X86ISD::STRICT_CVTTP2SI";
- case X86ISD::STRICT_CVTTP2UI: return "X86ISD::STRICT_CVTTP2UI";
- case X86ISD::MCVTTP2SI: return "X86ISD::MCVTTP2SI";
- case X86ISD::MCVTTP2UI: return "X86ISD::MCVTTP2UI";
- case X86ISD::CVTTP2SI_SAE: return "X86ISD::CVTTP2SI_SAE";
- case X86ISD::CVTTP2UI_SAE: return "X86ISD::CVTTP2UI_SAE";
- case X86ISD::CVTTS2SI: return "X86ISD::CVTTS2SI";
- case X86ISD::CVTTS2UI: return "X86ISD::CVTTS2UI";
- case X86ISD::CVTTS2SI_SAE: return "X86ISD::CVTTS2SI_SAE";
- case X86ISD::CVTTS2UI_SAE: return "X86ISD::CVTTS2UI_SAE";
- case X86ISD::CVTSI2P: return "X86ISD::CVTSI2P";
- case X86ISD::CVTUI2P: return "X86ISD::CVTUI2P";
- case X86ISD::STRICT_CVTSI2P: return "X86ISD::STRICT_CVTSI2P";
- case X86ISD::STRICT_CVTUI2P: return "X86ISD::STRICT_CVTUI2P";
- case X86ISD::MCVTSI2P: return "X86ISD::MCVTSI2P";
- case X86ISD::MCVTUI2P: return "X86ISD::MCVTUI2P";
- case X86ISD::VFPCLASS: return "X86ISD::VFPCLASS";
- case X86ISD::VFPCLASSS: return "X86ISD::VFPCLASSS";
- case X86ISD::MULTISHIFT: return "X86ISD::MULTISHIFT";
- case X86ISD::SCALAR_SINT_TO_FP: return "X86ISD::SCALAR_SINT_TO_FP";
- case X86ISD::SCALAR_SINT_TO_FP_RND: return "X86ISD::SCALAR_SINT_TO_FP_RND";
- case X86ISD::SCALAR_UINT_TO_FP: return "X86ISD::SCALAR_UINT_TO_FP";
- case X86ISD::SCALAR_UINT_TO_FP_RND: return "X86ISD::SCALAR_UINT_TO_FP_RND";
- case X86ISD::CVTPS2PH: return "X86ISD::CVTPS2PH";
- case X86ISD::MCVTPS2PH: return "X86ISD::MCVTPS2PH";
- case X86ISD::CVTPH2PS: return "X86ISD::CVTPH2PS";
- case X86ISD::CVTPH2PS_SAE: return "X86ISD::CVTPH2PS_SAE";
- case X86ISD::CVTP2SI: return "X86ISD::CVTP2SI";
- case X86ISD::CVTP2UI: return "X86ISD::CVTP2UI";
- case X86ISD::MCVTP2SI: return "X86ISD::MCVTP2SI";
- case X86ISD::MCVTP2UI: return "X86ISD::MCVTP2UI";
- case X86ISD::CVTP2SI_RND: return "X86ISD::CVTP2SI_RND";
- case X86ISD::CVTP2UI_RND: return "X86ISD::CVTP2UI_RND";
- case X86ISD::CVTS2SI: return "X86ISD::CVTS2SI";
- case X86ISD::CVTS2UI: return "X86ISD::CVTS2UI";
- case X86ISD::CVTS2SI_RND: return "X86ISD::CVTS2SI_RND";
- case X86ISD::CVTS2UI_RND: return "X86ISD::CVTS2UI_RND";
- case X86ISD::CVTNE2PS2BF16: return "X86ISD::CVTNE2PS2BF16";
- case X86ISD::CVTNEPS2BF16: return "X86ISD::CVTNEPS2BF16";
- case X86ISD::MCVTNEPS2BF16: return "X86ISD::MCVTNEPS2BF16";
- case X86ISD::DPBF16PS: return "X86ISD::DPBF16PS";
- case X86ISD::LWPINS: return "X86ISD::LWPINS";
- case X86ISD::MGATHER: return "X86ISD::MGATHER";
- case X86ISD::MSCATTER: return "X86ISD::MSCATTER";
- case X86ISD::VPDPBUSD: return "X86ISD::VPDPBUSD";
- case X86ISD::VPDPBUSDS: return "X86ISD::VPDPBUSDS";
- case X86ISD::VPDPWSSD: return "X86ISD::VPDPWSSD";
- case X86ISD::VPDPWSSDS: return "X86ISD::VPDPWSSDS";
- case X86ISD::VPSHUFBITQMB: return "X86ISD::VPSHUFBITQMB";
- case X86ISD::GF2P8MULB: return "X86ISD::GF2P8MULB";
- case X86ISD::GF2P8AFFINEQB: return "X86ISD::GF2P8AFFINEQB";
- case X86ISD::GF2P8AFFINEINVQB: return "X86ISD::GF2P8AFFINEINVQB";
- case X86ISD::NT_CALL: return "X86ISD::NT_CALL";
- case X86ISD::NT_BRIND: return "X86ISD::NT_BRIND";
- case X86ISD::UMWAIT: return "X86ISD::UMWAIT";
- case X86ISD::TPAUSE: return "X86ISD::TPAUSE";
- case X86ISD::ENQCMD: return "X86ISD:ENQCMD";
- case X86ISD::ENQCMDS: return "X86ISD:ENQCMDS";
- case X86ISD::VP2INTERSECT: return "X86ISD::VP2INTERSECT";
+#define NODE_NAME_CASE(NODE) case X86ISD::NODE: return "X86ISD::" #NODE;
+ NODE_NAME_CASE(BSF)
+ NODE_NAME_CASE(BSR)
+ NODE_NAME_CASE(FSHL)
+ NODE_NAME_CASE(FSHR)
+ NODE_NAME_CASE(FAND)
+ NODE_NAME_CASE(FANDN)
+ NODE_NAME_CASE(FOR)
+ NODE_NAME_CASE(FXOR)
+ NODE_NAME_CASE(FILD)
+ NODE_NAME_CASE(FIST)
+ NODE_NAME_CASE(FP_TO_INT_IN_MEM)
+ NODE_NAME_CASE(FLD)
+ NODE_NAME_CASE(FST)
+ NODE_NAME_CASE(CALL)
+ NODE_NAME_CASE(BT)
+ NODE_NAME_CASE(CMP)
+ NODE_NAME_CASE(FCMP)
+ NODE_NAME_CASE(STRICT_FCMP)
+ NODE_NAME_CASE(STRICT_FCMPS)
+ NODE_NAME_CASE(COMI)
+ NODE_NAME_CASE(UCOMI)
+ NODE_NAME_CASE(CMPM)
+ NODE_NAME_CASE(STRICT_CMPM)
+ NODE_NAME_CASE(CMPM_SAE)
+ NODE_NAME_CASE(SETCC)
+ NODE_NAME_CASE(SETCC_CARRY)
+ NODE_NAME_CASE(FSETCC)
+ NODE_NAME_CASE(FSETCCM)
+ NODE_NAME_CASE(FSETCCM_SAE)
+ NODE_NAME_CASE(CMOV)
+ NODE_NAME_CASE(BRCOND)
+ NODE_NAME_CASE(RET_FLAG)
+ NODE_NAME_CASE(IRET)
+ NODE_NAME_CASE(REP_STOS)
+ NODE_NAME_CASE(REP_MOVS)
+ NODE_NAME_CASE(GlobalBaseReg)
+ NODE_NAME_CASE(Wrapper)
+ NODE_NAME_CASE(WrapperRIP)
+ NODE_NAME_CASE(MOVQ2DQ)
+ NODE_NAME_CASE(MOVDQ2Q)
+ NODE_NAME_CASE(MMX_MOVD2W)
+ NODE_NAME_CASE(MMX_MOVW2D)
+ NODE_NAME_CASE(PEXTRB)
+ NODE_NAME_CASE(PEXTRW)
+ NODE_NAME_CASE(INSERTPS)
+ NODE_NAME_CASE(PINSRB)
+ NODE_NAME_CASE(PINSRW)
+ NODE_NAME_CASE(PSHUFB)
+ NODE_NAME_CASE(ANDNP)
+ NODE_NAME_CASE(BLENDI)
+ NODE_NAME_CASE(BLENDV)
+ NODE_NAME_CASE(HADD)
+ NODE_NAME_CASE(HSUB)
+ NODE_NAME_CASE(FHADD)
+ NODE_NAME_CASE(FHSUB)
+ NODE_NAME_CASE(CONFLICT)
+ NODE_NAME_CASE(FMAX)
+ NODE_NAME_CASE(FMAXS)
+ NODE_NAME_CASE(FMAX_SAE)
+ NODE_NAME_CASE(FMAXS_SAE)
+ NODE_NAME_CASE(FMIN)
+ NODE_NAME_CASE(FMINS)
+ NODE_NAME_CASE(FMIN_SAE)
+ NODE_NAME_CASE(FMINS_SAE)
+ NODE_NAME_CASE(FMAXC)
+ NODE_NAME_CASE(FMINC)
+ NODE_NAME_CASE(FRSQRT)
+ NODE_NAME_CASE(FRCP)
+ NODE_NAME_CASE(EXTRQI)
+ NODE_NAME_CASE(INSERTQI)
+ NODE_NAME_CASE(TLSADDR)
+ NODE_NAME_CASE(TLSBASEADDR)
+ NODE_NAME_CASE(TLSCALL)
+ NODE_NAME_CASE(EH_SJLJ_SETJMP)
+ NODE_NAME_CASE(EH_SJLJ_LONGJMP)
+ NODE_NAME_CASE(EH_SJLJ_SETUP_DISPATCH)
+ NODE_NAME_CASE(EH_RETURN)
+ NODE_NAME_CASE(TC_RETURN)
+ NODE_NAME_CASE(FNSTCW16m)
+ NODE_NAME_CASE(LCMPXCHG_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_DAG)
+ NODE_NAME_CASE(LCMPXCHG8_SAVE_EBX_DAG)
+ NODE_NAME_CASE(LCMPXCHG16_SAVE_RBX_DAG)
+ NODE_NAME_CASE(LADD)
+ NODE_NAME_CASE(LSUB)
+ NODE_NAME_CASE(LOR)
+ NODE_NAME_CASE(LXOR)
+ NODE_NAME_CASE(LAND)
+ NODE_NAME_CASE(VZEXT_MOVL)
+ NODE_NAME_CASE(VZEXT_LOAD)
+ NODE_NAME_CASE(VEXTRACT_STORE)
+ NODE_NAME_CASE(VTRUNC)
+ NODE_NAME_CASE(VTRUNCS)
+ NODE_NAME_CASE(VTRUNCUS)
+ NODE_NAME_CASE(VMTRUNC)
+ NODE_NAME_CASE(VMTRUNCS)
+ NODE_NAME_CASE(VMTRUNCUS)
+ NODE_NAME_CASE(VTRUNCSTORES)
+ NODE_NAME_CASE(VTRUNCSTOREUS)
+ NODE_NAME_CASE(VMTRUNCSTORES)
+ NODE_NAME_CASE(VMTRUNCSTOREUS)
+ NODE_NAME_CASE(VFPEXT)
+ NODE_NAME_CASE(STRICT_VFPEXT)
+ NODE_NAME_CASE(VFPEXT_SAE)
+ NODE_NAME_CASE(VFPEXTS)
+ NODE_NAME_CASE(VFPEXTS_SAE)
+ NODE_NAME_CASE(VFPROUND)
+ NODE_NAME_CASE(STRICT_VFPROUND)
+ NODE_NAME_CASE(VMFPROUND)
+ NODE_NAME_CASE(VFPROUND_RND)
+ NODE_NAME_CASE(VFPROUNDS)
+ NODE_NAME_CASE(VFPROUNDS_RND)
+ NODE_NAME_CASE(VSHLDQ)
+ NODE_NAME_CASE(VSRLDQ)
+ NODE_NAME_CASE(VSHL)
+ NODE_NAME_CASE(VSRL)
+ NODE_NAME_CASE(VSRA)
+ NODE_NAME_CASE(VSHLI)
+ NODE_NAME_CASE(VSRLI)
+ NODE_NAME_CASE(VSRAI)
+ NODE_NAME_CASE(VSHLV)
+ NODE_NAME_CASE(VSRLV)
+ NODE_NAME_CASE(VSRAV)
+ NODE_NAME_CASE(VROTLI)
+ NODE_NAME_CASE(VROTRI)
+ NODE_NAME_CASE(VPPERM)
+ NODE_NAME_CASE(CMPP)
+ NODE_NAME_CASE(STRICT_CMPP)
+ NODE_NAME_CASE(PCMPEQ)
+ NODE_NAME_CASE(PCMPGT)
+ NODE_NAME_CASE(PHMINPOS)
+ NODE_NAME_CASE(ADD)
+ NODE_NAME_CASE(SUB)
+ NODE_NAME_CASE(ADC)
+ NODE_NAME_CASE(SBB)
+ NODE_NAME_CASE(SMUL)
+ NODE_NAME_CASE(UMUL)
+ NODE_NAME_CASE(OR)
+ NODE_NAME_CASE(XOR)
+ NODE_NAME_CASE(AND)
+ NODE_NAME_CASE(BEXTR)
+ NODE_NAME_CASE(BZHI)
+ NODE_NAME_CASE(PDEP)
+ NODE_NAME_CASE(PEXT)
+ NODE_NAME_CASE(MUL_IMM)
+ NODE_NAME_CASE(MOVMSK)
+ NODE_NAME_CASE(PTEST)
+ NODE_NAME_CASE(TESTP)
+ NODE_NAME_CASE(KORTEST)
+ NODE_NAME_CASE(KTEST)
+ NODE_NAME_CASE(KADD)
+ NODE_NAME_CASE(KSHIFTL)
+ NODE_NAME_CASE(KSHIFTR)
+ NODE_NAME_CASE(PACKSS)
+ NODE_NAME_CASE(PACKUS)
+ NODE_NAME_CASE(PALIGNR)
+ NODE_NAME_CASE(VALIGN)
+ NODE_NAME_CASE(VSHLD)
+ NODE_NAME_CASE(VSHRD)
+ NODE_NAME_CASE(VSHLDV)
+ NODE_NAME_CASE(VSHRDV)
+ NODE_NAME_CASE(PSHUFD)
+ NODE_NAME_CASE(PSHUFHW)
+ NODE_NAME_CASE(PSHUFLW)
+ NODE_NAME_CASE(SHUFP)
+ NODE_NAME_CASE(SHUF128)
+ NODE_NAME_CASE(MOVLHPS)
+ NODE_NAME_CASE(MOVHLPS)
+ NODE_NAME_CASE(MOVDDUP)
+ NODE_NAME_CASE(MOVSHDUP)
+ NODE_NAME_CASE(MOVSLDUP)
+ NODE_NAME_CASE(MOVSD)
+ NODE_NAME_CASE(MOVSS)
+ NODE_NAME_CASE(UNPCKL)
+ NODE_NAME_CASE(UNPCKH)
+ NODE_NAME_CASE(VBROADCAST)
+ NODE_NAME_CASE(VBROADCAST_LOAD)
+ NODE_NAME_CASE(VBROADCASTM)
+ NODE_NAME_CASE(SUBV_BROADCAST)
+ NODE_NAME_CASE(VPERMILPV)
+ NODE_NAME_CASE(VPERMILPI)
+ NODE_NAME_CASE(VPERM2X128)
+ NODE_NAME_CASE(VPERMV)
+ NODE_NAME_CASE(VPERMV3)
+ NODE_NAME_CASE(VPERMI)
+ NODE_NAME_CASE(VPTERNLOG)
+ NODE_NAME_CASE(VFIXUPIMM)
+ NODE_NAME_CASE(VFIXUPIMM_SAE)
+ NODE_NAME_CASE(VFIXUPIMMS)
+ NODE_NAME_CASE(VFIXUPIMMS_SAE)
+ NODE_NAME_CASE(VRANGE)
+ NODE_NAME_CASE(VRANGE_SAE)
+ NODE_NAME_CASE(VRANGES)
+ NODE_NAME_CASE(VRANGES_SAE)
+ NODE_NAME_CASE(PMULUDQ)
+ NODE_NAME_CASE(PMULDQ)
+ NODE_NAME_CASE(PSADBW)
+ NODE_NAME_CASE(DBPSADBW)
+ NODE_NAME_CASE(VASTART_SAVE_XMM_REGS)
+ NODE_NAME_CASE(VAARG_64)
+ NODE_NAME_CASE(WIN_ALLOCA)
+ NODE_NAME_CASE(MEMBARRIER)
+ NODE_NAME_CASE(MFENCE)
+ NODE_NAME_CASE(SEG_ALLOCA)
+ NODE_NAME_CASE(PROBED_ALLOCA)
+ NODE_NAME_CASE(RDRAND)
+ NODE_NAME_CASE(RDSEED)
+ NODE_NAME_CASE(RDPKRU)
+ NODE_NAME_CASE(WRPKRU)
+ NODE_NAME_CASE(VPMADDUBSW)
+ NODE_NAME_CASE(VPMADDWD)
+ NODE_NAME_CASE(VPSHA)
+ NODE_NAME_CASE(VPSHL)
+ NODE_NAME_CASE(VPCOM)
+ NODE_NAME_CASE(VPCOMU)
+ NODE_NAME_CASE(VPERMIL2)
+ NODE_NAME_CASE(FMSUB)
+ NODE_NAME_CASE(STRICT_FMSUB)
+ NODE_NAME_CASE(FNMADD)
+ NODE_NAME_CASE(STRICT_FNMADD)
+ NODE_NAME_CASE(FNMSUB)
+ NODE_NAME_CASE(STRICT_FNMSUB)
+ NODE_NAME_CASE(FMADDSUB)
+ NODE_NAME_CASE(FMSUBADD)
+ NODE_NAME_CASE(FMADD_RND)
+ NODE_NAME_CASE(FNMADD_RND)
+ NODE_NAME_CASE(FMSUB_RND)
+ NODE_NAME_CASE(FNMSUB_RND)
+ NODE_NAME_CASE(FMADDSUB_RND)
+ NODE_NAME_CASE(FMSUBADD_RND)
+ NODE_NAME_CASE(VPMADD52H)
+ NODE_NAME_CASE(VPMADD52L)
+ NODE_NAME_CASE(VRNDSCALE)
+ NODE_NAME_CASE(STRICT_VRNDSCALE)
+ NODE_NAME_CASE(VRNDSCALE_SAE)
+ NODE_NAME_CASE(VRNDSCALES)
+ NODE_NAME_CASE(VRNDSCALES_SAE)
+ NODE_NAME_CASE(VREDUCE)
+ NODE_NAME_CASE(VREDUCE_SAE)
+ NODE_NAME_CASE(VREDUCES)
+ NODE_NAME_CASE(VREDUCES_SAE)
+ NODE_NAME_CASE(VGETMANT)
+ NODE_NAME_CASE(VGETMANT_SAE)
+ NODE_NAME_CASE(VGETMANTS)
+ NODE_NAME_CASE(VGETMANTS_SAE)
+ NODE_NAME_CASE(PCMPESTR)
+ NODE_NAME_CASE(PCMPISTR)
+ NODE_NAME_CASE(XTEST)
+ NODE_NAME_CASE(COMPRESS)
+ NODE_NAME_CASE(EXPAND)
+ NODE_NAME_CASE(SELECTS)
+ NODE_NAME_CASE(ADDSUB)
+ NODE_NAME_CASE(RCP14)
+ NODE_NAME_CASE(RCP14S)
+ NODE_NAME_CASE(RCP28)
+ NODE_NAME_CASE(RCP28_SAE)
+ NODE_NAME_CASE(RCP28S)
+ NODE_NAME_CASE(RCP28S_SAE)
+ NODE_NAME_CASE(EXP2)
+ NODE_NAME_CASE(EXP2_SAE)
+ NODE_NAME_CASE(RSQRT14)
+ NODE_NAME_CASE(RSQRT14S)
+ NODE_NAME_CASE(RSQRT28)
+ NODE_NAME_CASE(RSQRT28_SAE)
+ NODE_NAME_CASE(RSQRT28S)
+ NODE_NAME_CASE(RSQRT28S_SAE)
+ NODE_NAME_CASE(FADD_RND)
+ NODE_NAME_CASE(FADDS)
+ NODE_NAME_CASE(FADDS_RND)
+ NODE_NAME_CASE(FSUB_RND)
+ NODE_NAME_CASE(FSUBS)
+ NODE_NAME_CASE(FSUBS_RND)
+ NODE_NAME_CASE(FMUL_RND)
+ NODE_NAME_CASE(FMULS)
+ NODE_NAME_CASE(FMULS_RND)
+ NODE_NAME_CASE(FDIV_RND)
+ NODE_NAME_CASE(FDIVS)
+ NODE_NAME_CASE(FDIVS_RND)
+ NODE_NAME_CASE(FSQRT_RND)
+ NODE_NAME_CASE(FSQRTS)
+ NODE_NAME_CASE(FSQRTS_RND)
+ NODE_NAME_CASE(FGETEXP)
+ NODE_NAME_CASE(FGETEXP_SAE)
+ NODE_NAME_CASE(FGETEXPS)
+ NODE_NAME_CASE(FGETEXPS_SAE)
+ NODE_NAME_CASE(SCALEF)
+ NODE_NAME_CASE(SCALEF_RND)
+ NODE_NAME_CASE(SCALEFS)
+ NODE_NAME_CASE(SCALEFS_RND)
+ NODE_NAME_CASE(AVG)
+ NODE_NAME_CASE(MULHRS)
+ NODE_NAME_CASE(SINT_TO_FP_RND)
+ NODE_NAME_CASE(UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTTP2SI)
+ NODE_NAME_CASE(CVTTP2UI)
+ NODE_NAME_CASE(STRICT_CVTTP2SI)
+ NODE_NAME_CASE(STRICT_CVTTP2UI)
+ NODE_NAME_CASE(MCVTTP2SI)
+ NODE_NAME_CASE(MCVTTP2UI)
+ NODE_NAME_CASE(CVTTP2SI_SAE)
+ NODE_NAME_CASE(CVTTP2UI_SAE)
+ NODE_NAME_CASE(CVTTS2SI)
+ NODE_NAME_CASE(CVTTS2UI)
+ NODE_NAME_CASE(CVTTS2SI_SAE)
+ NODE_NAME_CASE(CVTTS2UI_SAE)
+ NODE_NAME_CASE(CVTSI2P)
+ NODE_NAME_CASE(CVTUI2P)
+ NODE_NAME_CASE(STRICT_CVTSI2P)
+ NODE_NAME_CASE(STRICT_CVTUI2P)
+ NODE_NAME_CASE(MCVTSI2P)
+ NODE_NAME_CASE(MCVTUI2P)
+ NODE_NAME_CASE(VFPCLASS)
+ NODE_NAME_CASE(VFPCLASSS)
+ NODE_NAME_CASE(MULTISHIFT)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_SINT_TO_FP_RND)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP)
+ NODE_NAME_CASE(SCALAR_UINT_TO_FP_RND)
+ NODE_NAME_CASE(CVTPS2PH)
+ NODE_NAME_CASE(STRICT_CVTPS2PH)
+ NODE_NAME_CASE(MCVTPS2PH)
+ NODE_NAME_CASE(CVTPH2PS)
+ NODE_NAME_CASE(STRICT_CVTPH2PS)
+ NODE_NAME_CASE(CVTPH2PS_SAE)
+ NODE_NAME_CASE(CVTP2SI)
+ NODE_NAME_CASE(CVTP2UI)
+ NODE_NAME_CASE(MCVTP2SI)
+ NODE_NAME_CASE(MCVTP2UI)
+ NODE_NAME_CASE(CVTP2SI_RND)
+ NODE_NAME_CASE(CVTP2UI_RND)
+ NODE_NAME_CASE(CVTS2SI)
+ NODE_NAME_CASE(CVTS2UI)
+ NODE_NAME_CASE(CVTS2SI_RND)
+ NODE_NAME_CASE(CVTS2UI_RND)
+ NODE_NAME_CASE(CVTNE2PS2BF16)
+ NODE_NAME_CASE(CVTNEPS2BF16)
+ NODE_NAME_CASE(MCVTNEPS2BF16)
+ NODE_NAME_CASE(DPBF16PS)
+ NODE_NAME_CASE(LWPINS)
+ NODE_NAME_CASE(MGATHER)
+ NODE_NAME_CASE(MSCATTER)
+ NODE_NAME_CASE(VPDPBUSD)
+ NODE_NAME_CASE(VPDPBUSDS)
+ NODE_NAME_CASE(VPDPWSSD)
+ NODE_NAME_CASE(VPDPWSSDS)
+ NODE_NAME_CASE(VPSHUFBITQMB)
+ NODE_NAME_CASE(GF2P8MULB)
+ NODE_NAME_CASE(GF2P8AFFINEQB)
+ NODE_NAME_CASE(GF2P8AFFINEINVQB)
+ NODE_NAME_CASE(NT_CALL)
+ NODE_NAME_CASE(NT_BRIND)
+ NODE_NAME_CASE(UMWAIT)
+ NODE_NAME_CASE(TPAUSE)
+ NODE_NAME_CASE(ENQCMD)
+ NODE_NAME_CASE(ENQCMDS)
+ NODE_NAME_CASE(VP2INTERSECT)
}
return nullptr;
+#undef NODE_NAME_CASE
}
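The NODE_NAME_CASE macro above stringifies each enumerator, so the case label and the returned string are generated from a single token and can no longer drift apart the way the hand-written table could (note the single-colon "X86ISD:ENQCMD" strings in the removed lines). A minimal, self-contained sketch of the same pattern, using a made-up NodeKind enum rather than the real X86ISD opcodes:

#include <cstdio>

enum class NodeKind { BSF, BSR, FSHL, FSHR };

static const char *nodeName(NodeKind Kind) {
  switch (Kind) {
// Stringify the enumerator once; both the case and the string come from it.
#define NODE_NAME_CASE(NODE)                                                   \
  case NodeKind::NODE:                                                         \
    return "NodeKind::" #NODE;
    NODE_NAME_CASE(BSF)
    NODE_NAME_CASE(BSR)
    NODE_NAME_CASE(FSHL)
    NODE_NAME_CASE(FSHR)
#undef NODE_NAME_CASE
  }
  return nullptr;
}

int main() { std::printf("%s\n", nodeName(NodeKind::FSHL)); } // NodeKind::FSHL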
/// Return true if the addressing mode represented by AM is legal for this
@@ -30018,7 +30716,8 @@ bool X86TargetLowering::isVectorShiftByScalarCheap(Type *Ty) const {
return false;
// XOP has v16i8/v8i16/v4i32/v2i64 variable vector shifts.
- if (Subtarget.hasXOP() && Ty->getPrimitiveSizeInBits() == 128 &&
+ // Splitting for v32i8/v16i16 on XOP+AVX2 targets is still preferred.
+ if (Subtarget.hasXOP() &&
(Bits == 8 || Bits == 16 || Bits == 32 || Bits == 64))
return false;
@@ -30104,7 +30803,7 @@ bool X86TargetLowering::isLegalStoreImmediate(int64_t Imm) const {
}
bool X86TargetLowering::isTruncateFree(EVT VT1, EVT VT2) const {
- if (!VT1.isInteger() || !VT2.isInteger())
+ if (!VT1.isScalarInteger() || !VT2.isScalarInteger())
return false;
unsigned NumBits1 = VT1.getSizeInBits();
unsigned NumBits2 = VT2.getSizeInBits();
@@ -30145,6 +30844,39 @@ bool X86TargetLowering::isZExtFree(SDValue Val, EVT VT2) const {
return false;
}
+bool X86TargetLowering::shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const {
+ // A uniform shift amount in a vector shift or funnel shift may be much
+ // cheaper than a generic variable vector shift, so make that pattern visible
+ // to SDAG by sinking the shuffle instruction next to the shift.
+ int ShiftAmountOpNum = -1;
+ if (I->isShift())
+ ShiftAmountOpNum = 1;
+ else if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ if (II->getIntrinsicID() == Intrinsic::fshl ||
+ II->getIntrinsicID() == Intrinsic::fshr)
+ ShiftAmountOpNum = 2;
+ }
+
+ if (ShiftAmountOpNum == -1)
+ return false;
+
+ auto *Shuf = dyn_cast<ShuffleVectorInst>(I->getOperand(ShiftAmountOpNum));
+ if (Shuf && getSplatIndex(Shuf->getShuffleMask()) >= 0 &&
+ isVectorShiftByScalarCheap(I->getType())) {
+ Ops.push_back(&I->getOperandUse(ShiftAmountOpNum));
+ return true;
+ }
+
+ return false;
+}
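The hook above fires only when the shift amount of a vector shift or funnel shift is a splat shuffle: sinking that splat next to the shift lets instruction selection see a shift-by-uniform-amount and pick the cheaper vector-by-scalar form. A rough, hypothetical C++ illustration of the kind of source pattern that produces this shape after vectorization (not LLVM code):

#include <cstdint>

// Every lane is shifted by the same run-time amount (assumed < 32), so a
// vectorized version of this loop splats 'Amt' and feeds it to one vector shift.
void shiftAllByUniformAmount(uint32_t *Values, unsigned Count, uint32_t Amt) {
  for (unsigned I = 0; I != Count; ++I)
    Values[I] <<= Amt;
}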
+
+bool X86TargetLowering::shouldConvertPhiType(Type *From, Type *To) const {
+ if (!Subtarget.is64Bit())
+ return false;
+ return TargetLowering::shouldConvertPhiType(From, To);
+}
+
bool X86TargetLowering::isVectorLoadExtDesirable(SDValue ExtVal) const {
if (isa<MaskedLoadSDNode>(ExtVal.getOperand(0)))
return false;
@@ -30188,7 +30920,7 @@ bool X86TargetLowering::isNarrowingProfitable(EVT VT1, EVT VT2) const {
/// VECTOR_SHUFFLE operations, those with specific masks.
/// By default, if a target supports the VECTOR_SHUFFLE node, all mask values
/// are assumed to be legal.
-bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> M, EVT VT) const {
+bool X86TargetLowering::isShuffleMaskLegal(ArrayRef<int> Mask, EVT VT) const {
if (!VT.isSimple())
return false;
@@ -30218,8 +30950,8 @@ bool X86TargetLowering::isVectorClearMaskLegal(ArrayRef<int> Mask,
}
bool X86TargetLowering::areJTsAllowed(const Function *Fn) const {
- // If the subtarget is using retpolines, we need to not generate jump tables.
- if (Subtarget.useRetpolineIndirectBranches())
+ // If the subtarget is using thunks, we need to not generate jump tables.
+ if (Subtarget.useIndirectThunkBranches())
return false;
// Otherwise, fallback on the generic logic.
@@ -30333,7 +31065,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
MachineOperand &Segment = MI.getOperand(5);
unsigned ArgSize = MI.getOperand(6).getImm();
unsigned ArgMode = MI.getOperand(7).getImm();
- unsigned Align = MI.getOperand(8).getImm();
+ Align Alignment = Align(MI.getOperand(8).getImm());
MachineFunction *MF = MBB->getParent();
@@ -30373,7 +31105,7 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
/* Align ArgSize to a multiple of 8 */
unsigned ArgSizeA8 = (ArgSize + 7) & ~7;
- bool NeedsAlign = (Align > 8);
+ bool NeedsAlign = (Alignment > 8);
MachineBasicBlock *thisMBB = MBB;
MachineBasicBlock *overflowMBB;
@@ -30521,17 +31253,16 @@ X86TargetLowering::EmitVAARG64WithCustomInserter(MachineInstr &MI,
// to OverflowDestReg.
if (NeedsAlign) {
// Align the overflow address
- assert(isPowerOf2_32(Align) && "Alignment must be a power of 2");
Register TmpReg = MRI.createVirtualRegister(AddrRegClass);
// aligned_addr = (addr + (align-1)) & ~(align-1)
BuildMI(overflowMBB, DL, TII->get(X86::ADD64ri32), TmpReg)
- .addReg(OverflowAddrReg)
- .addImm(Align-1);
+ .addReg(OverflowAddrReg)
+ .addImm(Alignment.value() - 1);
BuildMI(overflowMBB, DL, TII->get(X86::AND64ri32), OverflowDestReg)
- .addReg(TmpReg)
- .addImm(~(uint64_t)(Align-1));
+ .addReg(TmpReg)
+ .addImm(~(uint64_t)(Alignment.value() - 1));
} else {
BuildMI(overflowMBB, DL, TII->get(TargetOpcode::COPY), OverflowDestReg)
.addReg(OverflowAddrReg);
@@ -30627,7 +31358,7 @@ MachineBasicBlock *X86TargetLowering::EmitVAStartSaveXMMRegsWithCustomInserter(
MachineMemOperand *MMO = F->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*F, RegSaveFrameIndex, Offset),
MachineMemOperand::MOStore,
- /*Size=*/16, /*Align=*/16);
+ /*Size=*/16, Align(16));
BuildMI(XMMSaveMBB, DL, TII->get(MOVOpc))
.addFrameIndex(RegSaveFrameIndex)
.addImm(/*Scale=*/1)
@@ -30694,11 +31425,13 @@ static bool isCMOVPseudo(MachineInstr &MI) {
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
@@ -30995,8 +31728,7 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
(NextMIIt->getOperand(3).getImm() == CC ||
NextMIIt->getOperand(3).getImm() == OppCC)) {
LastCMOV = &*NextMIIt;
- ++NextMIIt;
- NextMIIt = skipDebugInstructionsForward(NextMIIt, ThisMBB->end());
+ NextMIIt = next_nodbg(NextMIIt, ThisMBB->end());
}
}
@@ -31068,6 +31800,112 @@ X86TargetLowering::EmitLoweredSelect(MachineInstr &MI,
return SinkMBB;
}
+static unsigned getSUBriOpcode(bool IsLP64, int64_t Imm) {
+ if (IsLP64) {
+ if (isInt<8>(Imm))
+ return X86::SUB64ri8;
+ return X86::SUB64ri32;
+ } else {
+ if (isInt<8>(Imm))
+ return X86::SUB32ri8;
+ return X86::SUB32ri;
+ }
+}
+
+MachineBasicBlock *
+X86TargetLowering::EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *MBB) const {
+ MachineFunction *MF = MBB->getParent();
+ const TargetInstrInfo *TII = Subtarget.getInstrInfo();
+ const X86FrameLowering &TFI = *Subtarget.getFrameLowering();
+ DebugLoc DL = MI.getDebugLoc();
+ const BasicBlock *LLVM_BB = MBB->getBasicBlock();
+
+ const unsigned ProbeSize = getStackProbeSize(*MF);
+
+ MachineRegisterInfo &MRI = MF->getRegInfo();
+ MachineBasicBlock *testMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *tailMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+ MachineBasicBlock *blockMBB = MF->CreateMachineBasicBlock(LLVM_BB);
+
+ MachineFunction::iterator MBBIter = ++MBB->getIterator();
+ MF->insert(MBBIter, testMBB);
+ MF->insert(MBBIter, blockMBB);
+ MF->insert(MBBIter, tailMBB);
+
+ Register sizeVReg = MI.getOperand(1).getReg();
+
+ Register physSPReg = TFI.Uses64BitFramePtr ? X86::RSP : X86::ESP;
+
+ Register TmpStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+ Register FinalStackPtr = MRI.createVirtualRegister(
+ TFI.Uses64BitFramePtr ? &X86::GR64RegClass : &X86::GR32RegClass);
+
+ BuildMI(*MBB, {MI}, DL, TII->get(TargetOpcode::COPY), TmpStackPtr)
+ .addReg(physSPReg);
+ {
+ const unsigned Opc = TFI.Uses64BitFramePtr ? X86::SUB64rr : X86::SUB32rr;
+ BuildMI(*MBB, {MI}, DL, TII->get(Opc), FinalStackPtr)
+ .addReg(TmpStackPtr)
+ .addReg(sizeVReg);
+ }
+
+  // Test whether the stack pointer has reached the final (fully allocated) value.
+
+ BuildMI(testMBB, DL,
+ TII->get(TFI.Uses64BitFramePtr ? X86::CMP64rr : X86::CMP32rr))
+ .addReg(FinalStackPtr)
+ .addReg(physSPReg);
+
+ BuildMI(testMBB, DL, TII->get(X86::JCC_1))
+ .addMBB(tailMBB)
+ .addImm(X86::COND_L);
+ testMBB->addSuccessor(blockMBB);
+ testMBB->addSuccessor(tailMBB);
+
+  // Touch the block, then extend it. This is the opposite order from the
+  // static probe, where we allocate and then touch; doing it this way avoids
+  // having to probe the tail of the static alloca. The overall sequence is:
+  //
+  //   [free probe] -> [page alloc] -> [alloc probe] -> [tail alloc] ->
+  //     [dyn probe] -> [page alloc] -> [dyn probe] -> [tail alloc]
+  //
+  // with the probe/alloc steps repeating as needed. The property we want to
+  // enforce is that there is never more than one [page alloc] between two
+  // probes.
+
+ const unsigned MovMIOpc =
+ TFI.Uses64BitFramePtr ? X86::MOV64mi32 : X86::MOV32mi;
+ addRegOffset(BuildMI(blockMBB, DL, TII->get(MovMIOpc)), physSPReg, false, 0)
+ .addImm(0);
+
+ BuildMI(blockMBB, DL,
+ TII->get(getSUBriOpcode(TFI.Uses64BitFramePtr, ProbeSize)), physSPReg)
+ .addReg(physSPReg)
+ .addImm(ProbeSize);
+
+ BuildMI(blockMBB, DL, TII->get(X86::JMP_1)).addMBB(testMBB);
+ blockMBB->addSuccessor(testMBB);
+
+  // Replace the original instruction with the expected stack pointer.
+ BuildMI(tailMBB, DL, TII->get(TargetOpcode::COPY), MI.getOperand(0).getReg())
+ .addReg(FinalStackPtr);
+
+ tailMBB->splice(tailMBB->end(), MBB,
+ std::next(MachineBasicBlock::iterator(MI)), MBB->end());
+ tailMBB->transferSuccessorsAndUpdatePHIs(MBB);
+ MBB->addSuccessor(testMBB);
+
+ // Delete the original pseudo instruction.
+ MI.eraseFromParent();
+
+ // And we're done.
+ return tailMBB;
+}
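A loose C++ model of the three blocks created above — an illustrative sketch under simplifying assumptions, not the emitted MachineIR: testMBB compares the moving stack pointer against FinalStackPtr, blockMBB touches the newly exposed memory and drops the stack pointer by ProbeSize, and tailMBB hands FinalStackPtr back as the result.

#include <cstdint>

// Models the stack pointer as a plain integer and elides the page-touching
// store blockMBB emits; assumes Size and ProbeSize are small relative to SP.
uint64_t probedAllocaModel(uint64_t SP, uint64_t Size, uint64_t ProbeSize) {
  uint64_t FinalSP = SP - Size;   // FinalStackPtr = SP - sizeVReg
  while (SP > FinalSP) {          // testMBB: compare and branch to tailMBB
    // blockMBB: a zero would be stored at [SP] here to touch the page, then:
    SP -= ProbeSize;              // blockMBB: extend the stack by ProbeSize
  }
  return FinalSP;                 // tailMBB: the result is FinalStackPtr
}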
+
MachineBasicBlock *
X86TargetLowering::EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const {
@@ -31228,29 +32066,16 @@ X86TargetLowering::EmitLoweredCatchRet(MachineInstr &MI,
BB->addSuccessor(RestoreMBB);
MI.getOperand(0).setMBB(RestoreMBB);
+ // Marking this as an EH pad but not a funclet entry block causes PEI to
+ // restore stack pointers in the block.
+ RestoreMBB->setIsEHPad(true);
+
auto RestoreMBBI = RestoreMBB->begin();
- BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::EH_RESTORE));
BuildMI(*RestoreMBB, RestoreMBBI, DL, TII.get(X86::JMP_4)).addMBB(TargetMBB);
return BB;
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const {
- MachineFunction *MF = BB->getParent();
- const Constant *PerFn = MF->getFunction().getPersonalityFn();
- bool IsSEH = isAsynchronousEHPersonality(classifyEHPersonality(PerFn));
- // Only 32-bit SEH requires special handling for catchpad.
- if (IsSEH && Subtarget.is32Bit()) {
- const TargetInstrInfo &TII = *Subtarget.getInstrInfo();
- DebugLoc DL = MI.getDebugLoc();
- BuildMI(*BB, MI, DL, TII.get(X86::EH_RESTORE));
- }
- MI.eraseFromParent();
- return BB;
-}
-
-MachineBasicBlock *
X86TargetLowering::EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const {
// So, here we replace TLSADDR with the sequence:
@@ -31342,22 +32167,22 @@ X86TargetLowering::EmitLoweredTLSCall(MachineInstr &MI,
return BB;
}
-static unsigned getOpcodeForRetpoline(unsigned RPOpc) {
+static unsigned getOpcodeForIndirectThunk(unsigned RPOpc) {
switch (RPOpc) {
- case X86::RETPOLINE_CALL32:
+ case X86::INDIRECT_THUNK_CALL32:
return X86::CALLpcrel32;
- case X86::RETPOLINE_CALL64:
+ case X86::INDIRECT_THUNK_CALL64:
return X86::CALL64pcrel32;
- case X86::RETPOLINE_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN32:
return X86::TCRETURNdi;
- case X86::RETPOLINE_TCRETURN64:
+ case X86::INDIRECT_THUNK_TCRETURN64:
return X86::TCRETURNdi64;
}
- llvm_unreachable("not retpoline opcode");
+ llvm_unreachable("not indirect thunk opcode");
}
-static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
- unsigned Reg) {
+static const char *getIndirectThunkSymbol(const X86Subtarget &Subtarget,
+ unsigned Reg) {
if (Subtarget.useRetpolineExternalThunk()) {
// When using an external thunk for retpolines, we pick names that match the
// names GCC happens to use as well. This helps simplify the implementation
@@ -31389,39 +32214,48 @@ static const char *getRetpolineSymbol(const X86Subtarget &Subtarget,
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
return "__x86_indirect_thunk_r11";
}
+ llvm_unreachable("unexpected reg for external indirect thunk");
+ }
+
+ if (Subtarget.useRetpolineIndirectCalls() ||
+ Subtarget.useRetpolineIndirectBranches()) {
+ // When targeting an internal COMDAT thunk use an LLVM-specific name.
+ switch (Reg) {
+ case X86::EAX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_eax";
+ case X86::ECX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_ecx";
+ case X86::EDX:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edx";
+ case X86::EDI:
+ assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
+ return "__llvm_retpoline_edi";
+ case X86::R11:
+ assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
+ return "__llvm_retpoline_r11";
+ }
llvm_unreachable("unexpected reg for retpoline");
}
- // When targeting an internal COMDAT thunk use an LLVM-specific name.
- switch (Reg) {
- case X86::EAX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_eax";
- case X86::ECX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_ecx";
- case X86::EDX:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_edx";
- case X86::EDI:
- assert(!Subtarget.is64Bit() && "Should not be using a 32-bit thunk!");
- return "__llvm_retpoline_edi";
- case X86::R11:
+ if (Subtarget.useLVIControlFlowIntegrity()) {
assert(Subtarget.is64Bit() && "Should not be using a 64-bit thunk!");
- return "__llvm_retpoline_r11";
+ return "__llvm_lvi_thunk_r11";
}
- llvm_unreachable("unexpected reg for retpoline");
+ llvm_unreachable("getIndirectThunkSymbol() invoked without thunk feature");
}
MachineBasicBlock *
-X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
- MachineBasicBlock *BB) const {
+X86TargetLowering::EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const {
// Copy the virtual register into the R11 physical register and
// call the retpoline thunk.
DebugLoc DL = MI.getDebugLoc();
const X86InstrInfo *TII = Subtarget.getInstrInfo();
Register CalleeVReg = MI.getOperand(0).getReg();
- unsigned Opc = getOpcodeForRetpoline(MI.getOpcode());
+ unsigned Opc = getOpcodeForIndirectThunk(MI.getOpcode());
// Find an available scratch register to hold the callee. On 64-bit, we can
// just use R11, but we scan for uses anyway to ensure we don't generate
@@ -31455,7 +32289,7 @@ X86TargetLowering::EmitLoweredRetpoline(MachineInstr &MI,
report_fatal_error("calling convention incompatible with retpoline, no "
"available registers");
- const char *Symbol = getRetpolineSymbol(Subtarget, AvailableReg);
+ const char *Symbol = getIndirectThunkSymbol(Subtarget, AvailableReg);
BuildMI(*BB, MI, DL, TII->get(TargetOpcode::COPY), AvailableReg)
.addReg(CalleeVReg);
@@ -31743,12 +32577,17 @@ X86TargetLowering::emitLongJmpShadowStackFix(MachineInstr &MI,
MBB->addSuccessor(checkSspMBB);
// Initialize a register with zero.
- Register ZReg = MRI.createVirtualRegister(PtrRC);
- unsigned XorRROpc = (PVT == MVT::i64) ? X86::XOR64rr : X86::XOR32rr;
- BuildMI(checkSspMBB, DL, TII->get(XorRROpc))
- .addDef(ZReg)
- .addReg(ZReg, RegState::Undef)
- .addReg(ZReg, RegState::Undef);
+ Register ZReg = MRI.createVirtualRegister(&X86::GR32RegClass);
+ BuildMI(checkSspMBB, DL, TII->get(X86::MOV32r0), ZReg);
+
+ if (PVT == MVT::i64) {
+ Register TmpZReg = MRI.createVirtualRegister(PtrRC);
+ BuildMI(checkSspMBB, DL, TII->get(X86::SUBREG_TO_REG), TmpZReg)
+ .addImm(0)
+ .addReg(ZReg)
+ .addImm(X86::sub_32bit);
+ ZReg = TmpZReg;
+ }
// Read the current SSP Register value to the zeroed register.
Register SSPCopyReg = MRI.createVirtualRegister(PtrRC);
@@ -31877,7 +32716,7 @@ X86TargetLowering::emitEHSjLjLongJmp(MachineInstr &MI,
Register Tmp = MRI.createVirtualRegister(RC);
// Since FP is only updated here but NOT referenced, it's treated as GPR.
const X86RegisterInfo *RegInfo = Subtarget.getRegisterInfo();
- unsigned FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
+ Register FP = (PVT == MVT::i64) ? X86::RBP : X86::EBP;
Register SP = RegInfo->getStackRegister();
MachineInstrBuilder MIB;
@@ -32224,6 +33063,10 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
const TargetInstrInfo *TII = Subtarget.getInstrInfo();
DebugLoc DL = MI.getDebugLoc();
+ auto TMMImmToTMMReg = [](unsigned Imm) {
+ assert (Imm < 8 && "Illegal tmm index");
+ return X86::TMM0 + Imm;
+ };
switch (MI.getOpcode()) {
default: llvm_unreachable("Unexpected instr type to insert");
case X86::TLS_addr32:
@@ -32231,18 +33074,19 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::TLS_base_addr32:
case X86::TLS_base_addr64:
return EmitLoweredTLSAddr(MI, BB);
- case X86::RETPOLINE_CALL32:
- case X86::RETPOLINE_CALL64:
- case X86::RETPOLINE_TCRETURN32:
- case X86::RETPOLINE_TCRETURN64:
- return EmitLoweredRetpoline(MI, BB);
+ case X86::INDIRECT_THUNK_CALL32:
+ case X86::INDIRECT_THUNK_CALL64:
+ case X86::INDIRECT_THUNK_TCRETURN32:
+ case X86::INDIRECT_THUNK_TCRETURN64:
+ return EmitLoweredIndirectThunk(MI, BB);
case X86::CATCHRET:
return EmitLoweredCatchRet(MI, BB);
- case X86::CATCHPAD:
- return EmitLoweredCatchPad(MI, BB);
case X86::SEG_ALLOCA_32:
case X86::SEG_ALLOCA_64:
return EmitLoweredSegAlloca(MI, BB);
+ case X86::PROBED_ALLOCA_32:
+ case X86::PROBED_ALLOCA_64:
+ return EmitLoweredProbedAlloca(MI, BB);
case X86::TLSCall_32:
case X86::TLSCall_64:
return EmitLoweredTLSCall(MI, BB);
@@ -32256,11 +33100,13 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::CMOV_RFP32:
case X86::CMOV_RFP64:
case X86::CMOV_RFP80:
+ case X86::CMOV_VR64:
case X86::CMOV_VR128:
case X86::CMOV_VR128X:
case X86::CMOV_VR256:
case X86::CMOV_VR256X:
case X86::CMOV_VR512:
+ case X86::CMOV_VK1:
case X86::CMOV_VK2:
case X86::CMOV_VK4:
case X86::CMOV_VK8:
@@ -32315,7 +33161,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
case X86::FP80_TO_INT64_IN_MEM: {
// Change the floating point control register to use "round towards zero"
// mode when truncating to an integer value.
- int OrigCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int OrigCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL,
TII->get(X86::FNSTCW16m)), OrigCWFrameIdx);
@@ -32336,7 +33183,8 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
.addReg(NewCW, RegState::Kill, X86::sub_16bit);
// Prepare memory for FLDCW.
- int NewCWFrameIdx = MF->getFrameInfo().CreateStackObject(2, 2, false);
+ int NewCWFrameIdx =
+ MF->getFrameInfo().CreateStackObject(2, Align(2), false);
addFrameReference(BuildMI(*BB, MI, DL, TII->get(X86::MOV16mr)),
NewCWFrameIdx)
.addReg(NewCW16, RegState::Kill);
@@ -32471,6 +33319,97 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
BB->addLiveIn(BasePtr);
return BB;
}
+ case TargetOpcode::PREALLOCATED_SETUP: {
+ assert(Subtarget.is32Bit() && "preallocated only used in 32-bit");
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ MFI->setHasPreallocatedCall(true);
+ int64_t PreallocatedId = MI.getOperand(0).getImm();
+ size_t StackAdjustment = MFI->getPreallocatedStackSize(PreallocatedId);
+ assert(StackAdjustment != 0 && "0 stack adjustment");
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_SETUP stack adjustment "
+ << StackAdjustment << "\n");
+ BuildMI(*BB, MI, DL, TII->get(X86::SUB32ri), X86::ESP)
+ .addReg(X86::ESP)
+ .addImm(StackAdjustment);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case TargetOpcode::PREALLOCATED_ARG: {
+ assert(Subtarget.is32Bit() && "preallocated calls only used in 32-bit");
+ int64_t PreallocatedId = MI.getOperand(1).getImm();
+ int64_t ArgIdx = MI.getOperand(2).getImm();
+ auto MFI = MF->getInfo<X86MachineFunctionInfo>();
+ size_t ArgOffset = MFI->getPreallocatedArgOffsets(PreallocatedId)[ArgIdx];
+ LLVM_DEBUG(dbgs() << "PREALLOCATED_ARG arg index " << ArgIdx
+ << ", arg offset " << ArgOffset << "\n");
+ // stack pointer + offset
+ addRegOffset(
+ BuildMI(*BB, MI, DL, TII->get(X86::LEA32r), MI.getOperand(0).getReg()),
+ X86::ESP, false, ArgOffset);
+ MI.eraseFromParent();
+ return BB;
+ }
+ case X86::PTDPBSSD:
+ case X86::PTDPBSUD:
+ case X86::PTDPBUSD:
+ case X86::PTDPBUUD:
+ case X86::PTDPBF16PS: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTDPBSSD: Opc = X86::TDPBSSD; break;
+ case X86::PTDPBSUD: Opc = X86::TDPBSUD; break;
+ case X86::PTDPBUSD: Opc = X86::TDPBUSD; break;
+ case X86::PTDPBUUD: Opc = X86::TDPBUUD; break;
+ case X86::PTDPBF16PS: Opc = X86::TDPBF16PS; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Define);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(0).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(1).getImm()), RegState::Undef);
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(2).getImm()), RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILEZERO: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Imm = MI.getOperand(0).getImm();
+ BuildMI(*BB, MI, DL, TII->get(X86::TILEZERO), TMMImmToTMMReg(Imm));
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
+ case X86::PTILELOADD:
+ case X86::PTILELOADDT1:
+ case X86::PTILESTORED: {
+ const DebugLoc &DL = MI.getDebugLoc();
+ unsigned Opc;
+ switch (MI.getOpcode()) {
+ case X86::PTILELOADD: Opc = X86::TILELOADD; break;
+ case X86::PTILELOADDT1: Opc = X86::TILELOADDT1; break;
+ case X86::PTILESTORED: Opc = X86::TILESTORED; break;
+ }
+
+ MachineInstrBuilder MIB = BuildMI(*BB, MI, DL, TII->get(Opc));
+ unsigned CurOp = 0;
+ if (Opc != X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Define);
+
+ MIB.add(MI.getOperand(CurOp++)); // base
+ MIB.add(MI.getOperand(CurOp++)); // scale
+ MIB.add(MI.getOperand(CurOp++)); // index -- stride
+ MIB.add(MI.getOperand(CurOp++)); // displacement
+ MIB.add(MI.getOperand(CurOp++)); // segment
+
+ if (Opc == X86::TILESTORED)
+ MIB.addReg(TMMImmToTMMReg(MI.getOperand(CurOp++).getImm()),
+ RegState::Undef);
+
+ MI.eraseFromParent(); // The pseudo is gone now.
+ return BB;
+ }
}
}
@@ -32480,20 +33419,53 @@ X86TargetLowering::EmitInstrWithCustomInserter(MachineInstr &MI,
bool
X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
- const APInt &Demanded,
+ const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const {
- // Only optimize Ands to prevent shrinking a constant that could be
- // matched by movzx.
- if (Op.getOpcode() != ISD::AND)
- return false;
-
EVT VT = Op.getValueType();
+ unsigned Opcode = Op.getOpcode();
+ unsigned EltSize = VT.getScalarSizeInBits();
- // Ignore vectors.
- if (VT.isVector())
+ if (VT.isVector()) {
+    // If the constant is only all signbits in the active bits, then we should
+    // extend it to the entire constant to allow it to act as a boolean
+    // constant vector.
+ auto NeedsSignExtension = [&](SDValue V, unsigned ActiveBits) {
+ if (!ISD::isBuildVectorOfConstantSDNodes(V.getNode()))
+ return false;
+ for (unsigned i = 0, e = V.getNumOperands(); i != e; ++i) {
+ if (!DemandedElts[i] || V.getOperand(i).isUndef())
+ continue;
+ const APInt &Val = V.getConstantOperandAPInt(i);
+ if (Val.getBitWidth() > Val.getNumSignBits() &&
+ Val.trunc(ActiveBits).getNumSignBits() == ActiveBits)
+ return true;
+ }
+ return false;
+ };
+ // For vectors - if we have a constant, then try to sign extend.
+ // TODO: Handle AND/ANDN cases.
+ unsigned ActiveBits = DemandedBits.getActiveBits();
+ if (EltSize > ActiveBits && EltSize > 1 && isTypeLegal(VT) &&
+ (Opcode == ISD::OR || Opcode == ISD::XOR) &&
+ NeedsSignExtension(Op.getOperand(1), ActiveBits)) {
+ EVT ExtSVT = EVT::getIntegerVT(*TLO.DAG.getContext(), ActiveBits);
+ EVT ExtVT = EVT::getVectorVT(*TLO.DAG.getContext(), ExtSVT,
+ VT.getVectorNumElements());
+ SDValue NewC =
+ TLO.DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(Op), VT,
+ Op.getOperand(1), TLO.DAG.getValueType(ExtVT));
+ SDValue NewOp =
+ TLO.DAG.getNode(Opcode, SDLoc(Op), VT, Op.getOperand(0), NewC);
+ return TLO.CombineTo(Op, NewOp);
+ }
return false;
+ }
- unsigned Size = VT.getSizeInBits();
+ // Only optimize Ands to prevent shrinking a constant that could be
+ // matched by movzx.
+ if (Opcode != ISD::AND)
+ return false;
// Make sure the RHS really is a constant.
ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op.getOperand(1));
@@ -32503,7 +33475,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
const APInt &Mask = C->getAPIntValue();
// Clear all non-demanded bits initially.
- APInt ShrunkMask = Mask & Demanded;
+ APInt ShrunkMask = Mask & DemandedBits;
// Find the width of the shrunk mask.
unsigned Width = ShrunkMask.getActiveBits();
@@ -32515,10 +33487,10 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
// Find the next power of 2 width, rounding up to a byte.
Width = PowerOf2Ceil(std::max(Width, 8U));
// Truncate the width to size to handle illegal types.
- Width = std::min(Width, Size);
+ Width = std::min(Width, EltSize);
// Calculate a possible zero extend mask for this constant.
- APInt ZeroExtendMask = APInt::getLowBitsSet(Size, Width);
+ APInt ZeroExtendMask = APInt::getLowBitsSet(EltSize, Width);
// If we aren't changing the mask, just return true to keep it and prevent
// the caller from optimizing.
@@ -32527,7 +33499,7 @@ X86TargetLowering::targetShrinkDemandedConstant(SDValue Op,
// Make sure the new mask can be represented by a combination of mask bits
// and non-demanded bits.
- if (!ZeroExtendMask.isSubsetOf(Mask | ~Demanded))
+ if (!ZeroExtendMask.isSubsetOf(Mask | ~DemandedBits))
return false;
// Replace the constant with the zero extend mask.
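The scalar path above first masks the constant down to the demanded bits and then widens it back out to a byte-aligned zero-extension mask, so a constant such as 0x7fff can legally become 0xff and be matched by movzx. A small numeric model of that arithmetic on plain 64-bit integers (a sketch of the idea, not the APInt code; needs C++20 <bit>):

#include <algorithm>
#include <bit>
#include <cstdint>

// Returns the zero-extend style replacement mask, or the original mask when
// the transform would change which demanded bits survive the AND.
uint64_t shrinkAndMaskModel(uint64_t Mask, uint64_t Demanded, unsigned Size) {
  uint64_t Shrunk = Mask & Demanded;                   // drop non-demanded bits
  unsigned Width = 64 - std::countl_zero(Shrunk | 1);  // active bits (>= 1)
  Width = std::min<unsigned>(std::bit_ceil(std::max(Width, 8u)), Size);
  uint64_t ZExtMask = Width >= 64 ? ~0ull : (1ull << Width) - 1;
  // Legal only if every bit of the new mask is already set in Mask or is not
  // demanded by the caller.
  if ((ZExtMask & ~(Mask | ~Demanded)) == 0)
    return ZExtMask;            // e.g. Mask = 0x7fff, Demanded = 0xff -> 0xff
  return Mask;
}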
@@ -32543,6 +33515,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
const SelectionDAG &DAG,
unsigned Depth) const {
unsigned BitWidth = Known.getBitWidth();
+ unsigned NumElts = DemandedElts.getBitWidth();
unsigned Opc = Op.getOpcode();
EVT VT = Op.getValueType();
assert((Opc >= ISD::BUILTIN_OP_END ||
@@ -32570,7 +33543,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
APInt DemandedElt = APInt::getOneBitSet(SrcVT.getVectorNumElements(),
Op.getConstantOperandVal(1));
Known = DAG.computeKnownBits(Src, DemandedElt, Depth + 1);
- Known = Known.zextOrTrunc(BitWidth, false);
+ Known = Known.anyextOrTrunc(BitWidth);
Known.Zero.setBitsFrom(SrcVT.getScalarSizeInBits());
break;
}
@@ -32640,10 +33613,7 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known = DAG.computeKnownBits(Op.getOperand(1), DemandedElts, Depth + 1);
Known2 = DAG.computeKnownBits(Op.getOperand(0), DemandedElts, Depth + 1);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- Known.Zero &= Known2.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- Known.One |= Known2.One;
+ Known |= Known2;
break;
}
case X86ISD::PSADBW: {
@@ -32667,6 +33637,76 @@ void X86TargetLowering::computeKnownBitsForTargetNode(const SDValue Op,
Known.Zero &= Known2.Zero;
break;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ if (auto* Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ unsigned Shift = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 0);
+ unsigned Length = Cst1->getAPIntValue().extractBitsAsZExtValue(8, 8);
+
+ // If the length is 0, the result is 0.
+ if (Length == 0) {
+ Known.setAllZero();
+ break;
+ }
+
+ if ((Shift + Length) <= BitWidth) {
+ Known = DAG.computeKnownBits(Op0, Depth + 1);
+ Known = Known.extractBits(Length, Shift);
+ Known = Known.zextOrTrunc(BitWidth);
+ }
+ }
+ break;
+ }
+ case X86ISD::CVTSI2P:
+ case X86ISD::CVTUI2P:
+ case X86ISD::CVTP2SI:
+ case X86ISD::CVTP2UI:
+ case X86ISD::MCVTP2SI:
+ case X86ISD::MCVTP2UI:
+ case X86ISD::CVTTP2SI:
+ case X86ISD::CVTTP2UI:
+ case X86ISD::MCVTTP2SI:
+ case X86ISD::MCVTTP2UI:
+ case X86ISD::MCVTSI2P:
+ case X86ISD::MCVTUI2P:
+ case X86ISD::VFPROUND:
+ case X86ISD::VMFPROUND:
+ case X86ISD::CVTPS2PH:
+ case X86ISD::MCVTPS2PH: {
+ // Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(0).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::STRICT_CVTTP2SI:
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::STRICT_CVTSI2P:
+ case X86ISD::STRICT_CVTUI2P:
+ case X86ISD::STRICT_VFPROUND:
+ case X86ISD::STRICT_CVTPS2PH: {
+ // Strict Conversions - upper elements are known zero.
+ EVT SrcVT = Op.getOperand(1).getValueType();
+ if (SrcVT.isVector()) {
+ unsigned NumSrcElts = SrcVT.getVectorNumElements();
+ if (NumElts > NumSrcElts &&
+ DemandedElts.countTrailingZeros() >= NumSrcElts)
+ Known.setAllZero();
+ }
+ break;
+ }
+ case X86ISD::MOVQ2DQ: {
+ // Move from MMX to XMM. Upper half of XMM should be 0.
+ if (DemandedElts.countTrailingZeros() >= (NumElts / 2))
+ Known.setAllZero();
+ break;
+ }
}
// Handle target shuffles.
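The X86ISD::BEXTR case added above decodes the control operand as a start byte and a length byte, then extracts and zero-extends that bit field from the known bits of the source. A hedged scalar model of those semantics (an illustration, not the LLVM helper):

#include <algorithm>
#include <cstdint>

// Control byte 0 is the start bit, control byte 1 is the field length; the
// extracted field is zero-extended into the destination.
uint64_t bextrModel(uint64_t Src, uint64_t Control) {
  unsigned Start = Control & 0xff;
  unsigned Length = (Control >> 8) & 0xff;
  if (Length == 0)
    return 0;                   // matches the Known.setAllZero() shortcut
  if (Start >= 64)
    return 0;                   // all selected bits lie beyond the operand
  Length = std::min(Length, 64u - Start);
  uint64_t Field = Src >> Start;
  return Length >= 64 ? Field : Field & ((1ull << Length) - 1);
}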
@@ -32733,11 +33773,12 @@ unsigned X86TargetLowering::ComputeNumSignBitsForTargetNode(
return VTBits;
case X86ISD::VTRUNC: {
- // TODO: Add DemandedElts support.
SDValue Src = Op.getOperand(0);
- unsigned NumSrcBits = Src.getScalarValueSizeInBits();
+ MVT SrcVT = Src.getSimpleValueType();
+ unsigned NumSrcBits = SrcVT.getScalarSizeInBits();
assert(VTBits < NumSrcBits && "Illegal truncation input type");
- unsigned Tmp = DAG.ComputeNumSignBits(Src, Depth + 1);
+ APInt DemandedSrc = DemandedElts.zextOrTrunc(SrcVT.getVectorNumElements());
+ unsigned Tmp = DAG.ComputeNumSignBits(Src, DemandedSrc, Depth + 1);
if (Tmp > (NumSrcBits - VTBits))
return Tmp - (NumSrcBits - VTBits);
return 1;
@@ -32865,6 +33906,21 @@ SDValue X86TargetLowering::unwrapAddress(SDValue N) const {
return N;
}
+// Helper to look for a normal load that can be narrowed into a vzload with the
+// specified VT and memory VT. Returns SDValue() on failure.
+static SDValue narrowLoadToVZLoad(LoadSDNode *LN, MVT MemVT, MVT VT,
+ SelectionDAG &DAG) {
+ // Can't if the load is volatile or atomic.
+ if (!LN->isSimple())
+ return SDValue();
+
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ return DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, SDLoc(LN), Tys, Ops, MemVT,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+}
+
// Attempt to match a combined shuffle mask against supported unary shuffle
// instructions.
// TODO: Investigate sharing more of this with shuffle lowering.
@@ -33009,9 +34065,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
unsigned InputSizeInBits = MaskVT.getSizeInBits();
unsigned MaskScalarSizeInBits = InputSizeInBits / NumMaskElts;
MVT MaskEltVT = MVT::getIntegerVT(MaskScalarSizeInBits);
-
- bool ContainsZeros =
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ bool ContainsZeros = isAnyZero(Mask);
// Handle VPERMI/VPERMILPD vXi64/vXi64 patterns.
if (!ContainsZeros && MaskScalarSizeInBits == 64) {
@@ -33059,7 +34113,7 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
// Narrow the repeated mask to create 32-bit element permutes.
SmallVector<int, 4> WordMask = RepeatedMask;
if (MaskScalarSizeInBits == 64)
- scaleShuffleMask<int>(2, RepeatedMask, WordMask);
+ narrowShuffleMaskElts(2, RepeatedMask, WordMask);
Shuffle = (AllowIntDomain ? X86ISD::PSHUFD : X86ISD::VPERMILPI);
ShuffleVT = (AllowIntDomain ? MVT::i32 : MVT::f32);
@@ -33102,17 +34156,32 @@ static bool matchUnaryPermuteShuffle(MVT MaskVT, ArrayRef<int> Mask,
}
// Attempt to match against byte/bit shifts.
- // FIXME: Add 512-bit support.
- if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ if (AllowIntDomain &&
+ ((MaskVT.is128BitVector() && Subtarget.hasSSE2()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
int ShiftAmt = matchShuffleAsShift(ShuffleVT, Shuffle, MaskScalarSizeInBits,
Mask, 0, Zeroable, Subtarget);
- if (0 < ShiftAmt) {
+ if (0 < ShiftAmt && (!ShuffleVT.is512BitVector() || Subtarget.hasBWI() ||
+ 32 <= ShuffleVT.getScalarSizeInBits())) {
PermuteImm = (unsigned)ShiftAmt;
return true;
}
}
+ // Attempt to match against bit rotates.
+ if (!ContainsZeros && AllowIntDomain && MaskScalarSizeInBits < 64 &&
+ ((MaskVT.is128BitVector() && Subtarget.hasXOP()) ||
+ Subtarget.hasAVX512())) {
+ int RotateAmt = matchShuffleAsBitRotate(ShuffleVT, MaskScalarSizeInBits,
+ Subtarget, Mask);
+ if (0 < RotateAmt) {
+ Shuffle = X86ISD::VROTLI;
+ PermuteImm = (unsigned)RotateAmt;
+ return true;
+ }
+ }
+
return false;
}
@@ -33193,9 +34262,29 @@ static bool matchBinaryPermuteShuffle(
unsigned NumMaskElts = Mask.size();
unsigned EltSizeInBits = MaskVT.getScalarSizeInBits();
+ // Attempt to match against VALIGND/VALIGNQ rotate.
+ if (AllowIntDomain && (EltSizeInBits == 64 || EltSizeInBits == 32) &&
+ ((MaskVT.is128BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is256BitVector() && Subtarget.hasVLX()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasAVX512()))) {
+ if (!isAnyZero(Mask)) {
+ int Rotation = matchShuffleAsElementRotate(V1, V2, Mask);
+ if (0 < Rotation) {
+ Shuffle = X86ISD::VALIGN;
+ if (EltSizeInBits == 64)
+ ShuffleVT = MVT::getVectorVT(MVT::i64, MaskVT.getSizeInBits() / 64);
+ else
+ ShuffleVT = MVT::getVectorVT(MVT::i32, MaskVT.getSizeInBits() / 32);
+ PermuteImm = Rotation;
+ return true;
+ }
+ }
+ }
+
// Attempt to match against PALIGNR byte rotate.
if (AllowIntDomain && ((MaskVT.is128BitVector() && Subtarget.hasSSSE3()) ||
- (MaskVT.is256BitVector() && Subtarget.hasAVX2()))) {
+ (MaskVT.is256BitVector() && Subtarget.hasAVX2()) ||
+ (MaskVT.is512BitVector() && Subtarget.hasBWI()))) {
int ByteRotation = matchShuffleAsByteRotate(MaskVT, V1, V2, Mask);
if (0 < ByteRotation) {
Shuffle = X86ISD::PALIGNR;
@@ -33245,8 +34334,7 @@ static bool matchBinaryPermuteShuffle(
// Attempt to combine to INSERTPS, but only if it has elements that need to
// be set to zero.
if (AllowFloatDomain && EltSizeInBits == 32 && Subtarget.hasSSE41() &&
- MaskVT.is128BitVector() &&
- llvm::any_of(Mask, [](int M) { return M == SM_SentinelZero; }) &&
+ MaskVT.is128BitVector() && isAnyZero(Mask) &&
matchShuffleAsInsertPS(V1, V2, PermuteImm, Zeroable, Mask, DAG)) {
Shuffle = X86ISD::INSERTPS;
ShuffleVT = MVT::v4f32;
@@ -33374,6 +34462,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
return DAG.getBitcast(RootVT, V1);
}
+ bool OptForSize = DAG.shouldOptForSize();
unsigned RootSizeInBits = RootVT.getSizeInBits();
unsigned NumRootElts = RootVT.getVectorNumElements();
unsigned BaseMaskEltSizeInBits = RootSizeInBits / NumBaseMaskElts;
@@ -33384,11 +34473,21 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
// Don't combine if we are a AVX512/EVEX target and the mask element size
// is different from the root element size - this would prevent writemasks
// from being reused.
- // TODO - this currently prevents all lane shuffles from occurring.
- // TODO - check for writemasks usage instead of always preventing combining.
- // TODO - attempt to narrow Mask back to writemask size.
- bool IsEVEXShuffle =
- RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128);
+ bool IsMaskedShuffle = false;
+ if (RootSizeInBits == 512 || (Subtarget.hasVLX() && RootSizeInBits >= 128)) {
+ if (Root.hasOneUse() && Root->use_begin()->getOpcode() == ISD::VSELECT &&
+ Root->use_begin()->getOperand(0).getScalarValueSizeInBits() == 1) {
+ IsMaskedShuffle = true;
+ }
+ }
+
+  // If we are shuffling a broadcast (and not introducing zeros) then we can
+  // just use the broadcast directly. This works for smaller broadcast elements
+  // as well, since they already repeat across each mask element.
+ if (UnaryShuffle && isTargetShuffleSplat(V1) && !isAnyZero(BaseMask) &&
+ (BaseMaskEltSizeInBits % V1.getScalarValueSizeInBits()) == 0) {
+ return DAG.getBitcast(RootVT, V1);
+ }
// Attempt to match a subvector broadcast.
// shuffle(insert_subvector(undef, sub, 0), undef, 0, 0, 0, 0)
@@ -33408,27 +34507,138 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
- // TODO - handle 128/256-bit lane shuffles of 512-bit vectors.
+ // Handle 128/256-bit lane shuffles of 512-bit vectors.
+ if (RootVT.is512BitVector() &&
+ (NumBaseMaskElts == 2 || NumBaseMaskElts == 4)) {
+ MVT ShuffleVT = (FloatDomain ? MVT::v8f64 : MVT::v8i64);
+
+ // If the upper subvectors are zeroable, then an extract+insert is more
+ // optimal than using X86ISD::SHUF128. The insertion is free, even if it has
+ // to zero the upper subvectors.
+ if (isUndefOrZeroInRange(BaseMask, 1, NumBaseMaskElts - 1)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, NumBaseMaskElts) &&
+ "Unexpected lane shuffle");
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ unsigned SubIdx = BaseMask[0] * (8 / NumBaseMaskElts);
+ bool UseZero = isAnyZero(BaseMask);
+ Res = extractSubVector(Res, SubIdx, DAG, DL, BaseMaskEltSizeInBits);
+ Res = widenSubVector(Res, UseZero, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Narrow shuffle mask to v4x128.
+ SmallVector<int, 4> Mask;
+ assert((BaseMaskEltSizeInBits % 128) == 0 && "Illegal mask size");
+ narrowShuffleMaskElts(BaseMaskEltSizeInBits / 128, BaseMask, Mask);
+
+ // Try to lower to vshuf64x2/vshuf32x4.
+ auto MatchSHUF128 = [](MVT ShuffleVT, const SDLoc &DL, ArrayRef<int> Mask,
+ SDValue V1, SDValue V2, SelectionDAG &DAG) {
+ unsigned PermMask = 0;
+ // Ensure elements came from the same Op.
+ SDValue Ops[2] = {DAG.getUNDEF(ShuffleVT), DAG.getUNDEF(ShuffleVT)};
+ for (int i = 0; i < 4; ++i) {
+ assert(Mask[i] >= -1 && "Illegal shuffle sentinel value");
+ if (Mask[i] < 0)
+ continue;
+
+ SDValue Op = Mask[i] >= 4 ? V2 : V1;
+ unsigned OpIndex = i / 2;
+ if (Ops[OpIndex].isUndef())
+ Ops[OpIndex] = Op;
+ else if (Ops[OpIndex] != Op)
+ return SDValue();
+
+ // Convert the 128-bit shuffle mask selection values into 128-bit
+ // selection bits defined by a vshuf64x2 instruction's immediate control
+ // byte.
+ PermMask |= (Mask[i] % 4) << (i * 2);
+ }
+
+ return DAG.getNode(X86ISD::SHUF128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, Ops[0]),
+ DAG.getBitcast(ShuffleVT, Ops[1]),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ };
+
+ // FIXME: Is there a better way to do this? is256BitLaneRepeatedShuffleMask
+ // doesn't work because our mask is for 128 bits and we don't have an MVT
+ // to match that.
+ bool PreferPERMQ =
+ UnaryShuffle && isUndefOrInRange(Mask[0], 0, 2) &&
+ isUndefOrInRange(Mask[1], 0, 2) && isUndefOrInRange(Mask[2], 2, 4) &&
+ isUndefOrInRange(Mask[3], 2, 4) &&
+ (Mask[0] < 0 || Mask[2] < 0 || Mask[0] == (Mask[2] % 2)) &&
+ (Mask[1] < 0 || Mask[3] < 0 || Mask[1] == (Mask[3] % 2));
+
+ if (!isAnyZero(Mask) && !PreferPERMQ) {
+ if (SDValue V = MatchSHUF128(ShuffleVT, DL, Mask, V1, V2, DAG))
+ return DAG.getBitcast(RootVT, V);
+ }
+ }
// Handle 128-bit lane shuffles of 256-bit vectors.
- // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
- // we need to use the zeroing feature.
- // TODO - this should support binary shuffles.
- if (UnaryShuffle && RootVT.is256BitVector() && NumBaseMaskElts == 2 &&
- !(Subtarget.hasAVX2() && BaseMask[0] >= -1 && BaseMask[1] >= -1) &&
- !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0)) {
+ if (RootVT.is256BitVector() && NumBaseMaskElts == 2) {
+ MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
+
+ // If the upper half is zeroable, then an extract+insert is preferable to
+ // using X86ISD::VPERM2X128. The insertion is free, even if it has to zero
+ // the upper half.
+ if (isUndefOrZero(BaseMask[1])) {
+ if (Depth == 0 && Root.getOpcode() == ISD::INSERT_SUBVECTOR)
+ return SDValue(); // Nothing to do!
+ assert(isInRange(BaseMask[0], 0, 2) && "Unexpected lane shuffle");
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = extract128BitVector(Res, BaseMask[0] * 2, DAG, DL);
+ Res = widenSubVector(Res, BaseMask[1] == SM_SentinelZero, Subtarget, DAG,
+ DL, 256);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
if (Depth == 0 && Root.getOpcode() == X86ISD::VPERM2X128)
return SDValue(); // Nothing to do!
- MVT ShuffleVT = (FloatDomain ? MVT::v4f64 : MVT::v4i64);
- unsigned PermMask = 0;
- PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
- PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
-
- Res = DAG.getBitcast(ShuffleVT, V1);
- Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
- DAG.getUNDEF(ShuffleVT),
- DAG.getTargetConstant(PermMask, DL, MVT::i8));
- return DAG.getBitcast(RootVT, Res);
+
+ // If we have AVX2, prefer to use VPERMQ/VPERMPD for unary shuffles unless
+ // we need to use the zeroing feature.
+ // Prefer blends for sequential shuffles unless we are optimizing for size.
+ if (UnaryShuffle &&
+ !(Subtarget.hasAVX2() && isUndefOrInRange(BaseMask, 0, 2)) &&
+ (OptForSize || !isSequentialOrUndefOrZeroInRange(BaseMask, 0, 2, 0))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] < 0 ? 0x8 : (BaseMask[0] & 1)) << 0);
+ PermMask |= ((BaseMask[1] < 0 ? 0x8 : (BaseMask[1] & 1)) << 4);
+
+ Res = DAG.getBitcast(ShuffleVT, V1);
+ Res = DAG.getNode(X86ISD::VPERM2X128, DL, ShuffleVT, Res,
+ DAG.getUNDEF(ShuffleVT),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ if (Depth == 0 && Root.getOpcode() == X86ISD::SHUF128)
+ return SDValue(); // Nothing to do!
+
+ // TODO - handle AVX512VL cases with X86ISD::SHUF128.
+ if (!UnaryShuffle && !IsMaskedShuffle) {
+ assert(llvm::all_of(BaseMask, [](int M) { return 0 <= M && M < 4; }) &&
+ "Unexpected shuffle sentinel value");
+ // Prefer blends to X86ISD::VPERM2X128.
+ if (!((BaseMask[0] == 0 && BaseMask[1] == 3) ||
+ (BaseMask[0] == 2 && BaseMask[1] == 1))) {
+ unsigned PermMask = 0;
+ PermMask |= ((BaseMask[0] & 3) << 0);
+ PermMask |= ((BaseMask[1] & 3) << 4);
+
+ Res = DAG.getNode(
+ X86ISD::VPERM2X128, DL, ShuffleVT,
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[0], 0, 2) ? V1 : V2),
+ DAG.getBitcast(ShuffleVT, isInRange(BaseMask[1], 0, 2) ? V1 : V2),
+ DAG.getTargetConstant(PermMask, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
}
// For masks that have been widened to 128-bit elements or more,
@@ -33437,9 +34647,20 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (BaseMaskEltSizeInBits > 64) {
assert((BaseMaskEltSizeInBits % 64) == 0 && "Illegal mask size");
int MaskScale = BaseMaskEltSizeInBits / 64;
- scaleShuffleMask<int>(MaskScale, BaseMask, Mask);
+ narrowShuffleMaskElts(MaskScale, BaseMask, Mask);
} else {
- Mask = SmallVector<int, 64>(BaseMask.begin(), BaseMask.end());
+ Mask.assign(BaseMask.begin(), BaseMask.end());
+ }
+
+ // For masked shuffles, we're trying to match the root width for better
+ // writemask folding; attempt to scale the mask accordingly.
+ // TODO - variable shuffles might need this to be widened again.
+ if (IsMaskedShuffle && NumRootElts > Mask.size()) {
+ assert((NumRootElts % Mask.size()) == 0 && "Illegal mask size");
+ int MaskScale = NumRootElts / Mask.size();
+ SmallVector<int, 64> ScaledMask;
+ narrowShuffleMaskElts(MaskScale, Mask, ScaledMask);
+ Mask = std::move(ScaledMask);
}
unsigned NumMaskElts = Mask.size();
@@ -33472,26 +34693,11 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
APInt Zeroable = KnownUndef | KnownZero;
if (UnaryShuffle) {
- // If we are shuffling a X86ISD::VZEXT_LOAD then we can use the load
- // directly if we don't shuffle the lower element and we shuffle the upper
- // (zero) elements within themselves.
- if (V1.getOpcode() == X86ISD::VZEXT_LOAD &&
- (cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() %
- MaskEltSizeInBits) == 0) {
- unsigned Scale =
- cast<MemIntrinsicSDNode>(V1)->getMemoryVT().getScalarSizeInBits() /
- MaskEltSizeInBits;
- ArrayRef<int> HiMask(Mask.data() + Scale, NumMaskElts - Scale);
- if (isSequentialOrUndefInRange(Mask, 0, Scale, 0) &&
- isUndefOrZeroOrInRange(HiMask, Scale, NumMaskElts)) {
- return DAG.getBitcast(RootVT, V1);
- }
- }
-
// Attempt to match against broadcast-from-vector.
// Limit AVX1 to cases where we're loading+broadcasting a scalar element.
- if ((Subtarget.hasAVX2() || (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits))
- && (!IsEVEXShuffle || NumRootElts == NumMaskElts)) {
+ if ((Subtarget.hasAVX2() ||
+ (Subtarget.hasAVX() && 32 <= MaskEltSizeInBits)) &&
+ (!IsMaskedShuffle || NumRootElts == NumMaskElts)) {
SmallVector<int, 64> BroadcastMask(NumMaskElts, 0);
if (isTargetShuffleEquivalent(Mask, BroadcastMask)) {
if (V1.getValueType() == MaskVT &&
@@ -33517,7 +34723,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleSrcVT, NewV1);
@@ -33528,7 +34735,8 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
if (matchUnaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
AllowIntDomain, Subtarget, Shuffle, ShuffleVT,
PermuteImm) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle ||
+ (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
Res = DAG.getBitcast(ShuffleVT, V1);
@@ -33538,12 +34746,31 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Attempt to combine to INSERTPS, but only if the inserted element has come
+ // from a scalar.
+ // TODO: Handle other insertions here as well?
+ if (!UnaryShuffle && AllowFloatDomain && RootSizeInBits == 128 &&
+ MaskEltSizeInBits == 32 && Subtarget.hasSSE41() &&
+ !isTargetShuffleEquivalent(Mask, {4, 1, 2, 3})) {
+ SDValue SrcV1 = V1, SrcV2 = V2;
+ if (matchShuffleAsInsertPS(SrcV1, SrcV2, PermuteImm, Zeroable, Mask, DAG) &&
+ SrcV2.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (Depth == 0 && Root.getOpcode() == X86ISD::INSERTPS)
+ return SDValue(); // Nothing to do!
+ Res = DAG.getNode(X86ISD::INSERTPS, DL, MVT::v4f32,
+ DAG.getBitcast(MVT::v4f32, SrcV1),
+ DAG.getBitcast(MVT::v4f32, SrcV2),
+ DAG.getTargetConstant(PermuteImm, DL, MVT::i8));
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
SDValue NewV1 = V1; // Save operands in case early exit happens.
SDValue NewV2 = V2;
if (matchBinaryShuffle(MaskVT, Mask, AllowFloatDomain, AllowIntDomain, NewV1,
NewV2, DL, DAG, Subtarget, Shuffle, ShuffleSrcVT,
ShuffleVT, UnaryShuffle) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleSrcVT, NewV1);
@@ -33554,10 +34781,10 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
NewV1 = V1; // Save operands in case early exit happens.
NewV2 = V2;
- if (matchBinaryPermuteShuffle(
- MaskVT, Mask, Zeroable, AllowFloatDomain, AllowIntDomain, NewV1,
- NewV2, DL, DAG, Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
- (!IsEVEXShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
+ if (matchBinaryPermuteShuffle(MaskVT, Mask, Zeroable, AllowFloatDomain,
+ AllowIntDomain, NewV1, NewV2, DL, DAG,
+ Subtarget, Shuffle, ShuffleVT, PermuteImm) &&
+ (!IsMaskedShuffle || (NumRootElts == ShuffleVT.getVectorNumElements()))) {
if (Depth == 0 && Root.getOpcode() == Shuffle)
return SDValue(); // Nothing to do!
NewV1 = DAG.getBitcast(ShuffleVT, NewV1);
@@ -33597,6 +34824,44 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
}
}
+ // Match shuffle against TRUNCATE patterns.
+ if (AllowIntDomain && MaskEltSizeInBits < 64 && Subtarget.hasAVX512()) {
+ // Match against a VTRUNC instruction, accounting for src/dst sizes.
+ if (matchShuffleAsVTRUNC(ShuffleSrcVT, ShuffleVT, IntMaskVT, Mask, Zeroable,
+ Subtarget)) {
+ bool IsTRUNCATE = ShuffleVT.getVectorNumElements() ==
+ ShuffleSrcVT.getVectorNumElements();
+ unsigned Opc =
+ IsTRUNCATE ? (unsigned)ISD::TRUNCATE : (unsigned)X86ISD::VTRUNC;
+ if (Depth == 0 && Root.getOpcode() == Opc)
+ return SDValue(); // Nothing to do!
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ Res = DAG.getNode(Opc, DL, ShuffleVT, V1);
+ if (ShuffleVT.getSizeInBits() < RootSizeInBits)
+ Res = widenSubVector(Res, true, Subtarget, DAG, DL, RootSizeInBits);
+ return DAG.getBitcast(RootVT, Res);
+ }
+
+ // Do we need a more general binary truncation pattern?
+ if (RootSizeInBits < 512 &&
+ ((RootVT.is256BitVector() && Subtarget.useAVX512Regs()) ||
+ (RootVT.is128BitVector() && Subtarget.hasVLX())) &&
+ (MaskEltSizeInBits > 8 || Subtarget.hasBWI()) &&
+ isSequentialOrUndefInRange(Mask, 0, NumMaskElts, 0, 2)) {
+ if (Depth == 0 && Root.getOpcode() == ISD::TRUNCATE)
+ return SDValue(); // Nothing to do!
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts / 2);
+ V1 = DAG.getBitcast(ShuffleSrcVT, V1);
+ V2 = DAG.getBitcast(ShuffleSrcVT, V2);
+ ShuffleSrcVT = MVT::getIntegerVT(MaskEltSizeInBits * 2);
+ ShuffleSrcVT = MVT::getVectorVT(ShuffleSrcVT, NumMaskElts);
+ Res = DAG.getNode(ISD::CONCAT_VECTORS, DL, ShuffleSrcVT, V1, V2);
+ Res = DAG.getNode(ISD::TRUNCATE, DL, IntMaskVT, Res);
+ return DAG.getBitcast(RootVT, Res);
+ }
+ }
+
// Don't try to re-form single instruction chains under any circumstances now
// that we've done encoding canonicalization for them.
if (Depth < 1)
@@ -33606,8 +34871,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
int VariableShuffleDepth = Subtarget.hasFastVariableShuffle() ? 1 : 2;
AllowVariableMask &= (Depth >= VariableShuffleDepth) || HasVariableMask;
- bool MaskContainsZeros =
- any_of(Mask, [](int M) { return M == SM_SentinelZero; });
+ bool MaskContainsZeros = isAnyZero(Mask);
if (is128BitLaneCrossingShuffleMask(MaskVT, Mask)) {
// If we have a single input lane-crossing shuffle then lower to VPERMV.
@@ -33702,7 +34966,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
SDValue BitMask = getConstVector(EltBits, UndefElts, MaskVT, DAG, DL);
Res = DAG.getBitcast(MaskVT, V1);
unsigned AndOpcode =
- FloatDomain ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
+ MaskVT.isFloatingPoint() ? unsigned(X86ISD::FAND) : unsigned(ISD::AND);
Res = DAG.getNode(AndOpcode, DL, MaskVT, Res, BitMask);
return DAG.getBitcast(RootVT, Res);
}
@@ -33779,7 +35043,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
if (M == SM_SentinelZero) {
- PSHUFBMask.push_back(DAG.getConstant(255, DL, MVT::i8));
+ PSHUFBMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
@@ -33810,7 +35074,7 @@ static SDValue combineX86ShuffleChain(ArrayRef<SDValue> Inputs, SDValue Root,
continue;
}
if (M == SM_SentinelZero) {
- VPPERMMask.push_back(DAG.getConstant(128, DL, MVT::i8));
+ VPPERMMask.push_back(DAG.getConstant(0x80, DL, MVT::i8));
continue;
}
M = Ratio * M + i % Ratio;
@@ -33885,8 +35149,7 @@ static SDValue combineX86ShuffleChainWithExtract(
unsigned &Offset = Offsets[i];
Src = peekThroughBitcasts(Src);
EVT BaseVT = Src.getValueType();
- while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
- isa<ConstantSDNode>(Src.getOperand(1))) {
+ while (Src.getOpcode() == ISD::EXTRACT_SUBVECTOR) {
Offset += Src.getConstantOperandVal(1);
Src = Src.getOperand(0);
}
@@ -33998,6 +35261,7 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
return SDValue();
// Shuffle the constant bits according to the mask.
+ SDLoc DL(Root);
APInt UndefElts(NumMaskElts, 0);
APInt ZeroElts(NumMaskElts, 0);
APInt ConstantElts(NumMaskElts, 0);
@@ -34035,6 +35299,10 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
}
assert((UndefElts | ZeroElts | ConstantElts).isAllOnesValue());
+ // Attempt to create a zero vector.
+ if ((UndefElts | ZeroElts).isAllOnesValue())
+ return getZeroVector(Root.getSimpleValueType(), Subtarget, DAG, DL);
+
// Create the constant data.
MVT MaskSVT;
if (VT.isFloatingPoint() && (MaskSizeInBits == 32 || MaskSizeInBits == 64))
@@ -34043,8 +35311,9 @@ static SDValue combineX86ShufflesConstants(ArrayRef<SDValue> Ops,
MaskSVT = MVT::getIntegerVT(MaskSizeInBits);
MVT MaskVT = MVT::getVectorVT(MaskSVT, NumMaskElts);
+ if (!DAG.getTargetLoweringInfo().isTypeLegal(MaskVT))
+ return SDValue();
- SDLoc DL(Root);
SDValue CstOp = getConstVector(ConstantBitData, UndefElts, MaskVT, DAG, DL);
return DAG.getBitcast(VT, CstOp);
}
@@ -34103,7 +35372,8 @@ static SDValue combineX86ShufflesRecursively(
assert(Root.getSimpleValueType().isVector() &&
"Shuffles operate on vector types!");
- assert(VT.getSizeInBits() == Root.getSimpleValueType().getSizeInBits() &&
+ unsigned RootSizeInBits = Root.getSimpleValueType().getSizeInBits();
+ assert(VT.getSizeInBits() == RootSizeInBits &&
"Can only combine shuffles of the same vector register size.");
// Extract target shuffle mask and resolve sentinels and inputs.
@@ -34117,6 +35387,18 @@ static SDValue combineX86ShufflesRecursively(
OpZero, DAG, Depth, false))
return SDValue();
+ // Shuffle inputs must be the same size as the result; bail on any larger
+ // inputs and widen any smaller inputs.
+ if (llvm::any_of(OpInputs, [RootSizeInBits](SDValue Op) {
+ return Op.getValueSizeInBits() > RootSizeInBits;
+ }))
+ return SDValue();
+
+ for (SDValue &Op : OpInputs)
+ if (Op.getValueSizeInBits() < RootSizeInBits)
+ Op = widenSubVector(peekThroughOneUseBitcasts(Op), false, Subtarget, DAG,
+ SDLoc(Op), RootSizeInBits);
+
SmallVector<int, 64> Mask;
SmallVector<SDValue, 16> Ops;
@@ -34517,6 +35799,59 @@ combineRedundantDWordShuffle(SDValue N, MutableArrayRef<int> Mask,
return V;
}
+// Attempt to commute shufps LHS loads:
+// permilps(shufps(load(),x)) --> permilps(shufps(x,load()))
+static SDValue combineCommutableSHUFP(SDValue N, MVT VT, const SDLoc &DL,
+ SelectionDAG &DAG) {
+ // TODO: Add vXf64 support.
+ if (VT != MVT::v4f32 && VT != MVT::v8f32 && VT != MVT::v16f32)
+ return SDValue();
+
+ // SHUFP(LHS, RHS) -> SHUFP(RHS, LHS) iff LHS is foldable + RHS is not.
+ auto commuteSHUFP = [&VT, &DL, &DAG](SDValue Parent, SDValue V) {
+ if (V.getOpcode() != X86ISD::SHUFP || !Parent->isOnlyUserOf(V.getNode()))
+ return SDValue();
+ SDValue N0 = V.getOperand(0);
+ SDValue N1 = V.getOperand(1);
+ unsigned Imm = V.getConstantOperandVal(2);
+ if (!MayFoldLoad(peekThroughOneUseBitcasts(N0)) ||
+ MayFoldLoad(peekThroughOneUseBitcasts(N1)))
+ return SDValue();
+ Imm = ((Imm & 0x0F) << 4) | ((Imm & 0xF0) >> 4);
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N1, N0,
+ DAG.getTargetConstant(Imm, DL, MVT::i8));
+ };
+
+ switch (N.getOpcode()) {
+ case X86ISD::VPERMILPI:
+ if (SDValue NewSHUFP = commuteSHUFP(N, N.getOperand(0))) {
+ unsigned Imm = N.getConstantOperandVal(1);
+ return DAG.getNode(X86ISD::VPERMILPI, DL, VT, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ }
+ break;
+ case X86ISD::SHUFP: {
+ SDValue N0 = N.getOperand(0);
+ SDValue N1 = N.getOperand(1);
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (N0 == N1) {
+ if (SDValue NewSHUFP = commuteSHUFP(N, N0))
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xAA, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N0)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, NewSHUFP, N1,
+ DAG.getTargetConstant(Imm ^ 0x0A, DL, MVT::i8));
+ } else if (SDValue NewSHUFP = commuteSHUFP(N, N1)) {
+ return DAG.getNode(X86ISD::SHUFP, DL, VT, N0, NewSHUFP,
+ DAG.getTargetConstant(Imm ^ 0xA0, DL, MVT::i8));
+ }
+ break;
+ }
+ }
+
+ return SDValue();
+}
+
/// Try to combine x86 target specific shuffles.
static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
@@ -34526,35 +35861,105 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
SmallVector<int, 4> Mask;
unsigned Opcode = N.getOpcode();
+ bool IsUnary;
+ SmallVector<int, 64> TargetMask;
+ SmallVector<SDValue, 2> TargetOps;
+ if (isTargetShuffle(Opcode))
+ getTargetShuffleMask(N.getNode(), VT, true, TargetOps, TargetMask, IsUnary);
+
// Combine binary shuffle of 2 similar 'Horizontal' instructions into a
- // single instruction.
- if (VT.getScalarSizeInBits() == 64 &&
- (Opcode == X86ISD::MOVSD || Opcode == X86ISD::UNPCKH ||
- Opcode == X86ISD::UNPCKL)) {
- auto BC0 = peekThroughBitcasts(N.getOperand(0));
- auto BC1 = peekThroughBitcasts(N.getOperand(1));
- EVT VT0 = BC0.getValueType();
- EVT VT1 = BC1.getValueType();
- unsigned Opcode0 = BC0.getOpcode();
- unsigned Opcode1 = BC1.getOpcode();
- if (Opcode0 == Opcode1 && VT0 == VT1 &&
- (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
- Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB ||
- Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
- SDValue Lo, Hi;
- if (Opcode == X86ISD::MOVSD) {
- Lo = BC1.getOperand(0);
- Hi = BC0.getOperand(1);
- } else {
- Lo = BC0.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
- Hi = BC1.getOperand(Opcode == X86ISD::UNPCKH ? 1 : 0);
+ // single instruction. Attempt to match a v2X64 repeating shuffle pattern that
+ // represents the LHS/RHS inputs for the lower/upper halves.
+ SmallVector<int, 16> TargetMask128;
+ if (!TargetMask.empty() && 0 < TargetOps.size() && TargetOps.size() <= 2 &&
+ isRepeatedTargetShuffleMask(128, VT, TargetMask, TargetMask128)) {
+ SmallVector<int, 16> WidenedMask128 = TargetMask128;
+ while (WidenedMask128.size() > 2) {
+ SmallVector<int, 16> WidenedMask;
+ if (!canWidenShuffleElements(WidenedMask128, WidenedMask))
+ break;
+ WidenedMask128 = std::move(WidenedMask);
+ }
+ if (WidenedMask128.size() == 2) {
+ assert(isUndefOrZeroOrInRange(WidenedMask128, 0, 4) && "Illegal shuffle");
+ SDValue BC0 = peekThroughBitcasts(TargetOps.front());
+ SDValue BC1 = peekThroughBitcasts(TargetOps.back());
+ EVT VT0 = BC0.getValueType();
+ EVT VT1 = BC1.getValueType();
+ unsigned Opcode0 = BC0.getOpcode();
+ unsigned Opcode1 = BC1.getOpcode();
+ bool isHoriz = (Opcode0 == X86ISD::FHADD || Opcode0 == X86ISD::HADD ||
+ Opcode0 == X86ISD::FHSUB || Opcode0 == X86ISD::HSUB);
+ if (Opcode0 == Opcode1 && VT0 == VT1 &&
+ (isHoriz || Opcode0 == X86ISD::PACKSS || Opcode0 == X86ISD::PACKUS)) {
+ bool SingleOp = (TargetOps.size() == 1);
+ if (!isHoriz || shouldUseHorizontalOp(SingleOp, DAG, Subtarget)) {
+ SDValue Lo = isInRange(WidenedMask128[0], 0, 2) ? BC0 : BC1;
+ SDValue Hi = isInRange(WidenedMask128[1], 0, 2) ? BC0 : BC1;
+ Lo = Lo.getOperand(WidenedMask128[0] & 1);
+ Hi = Hi.getOperand(WidenedMask128[1] & 1);
+ if (SingleOp) {
+ MVT SrcVT = BC0.getOperand(0).getSimpleValueType();
+ SDValue Undef = DAG.getUNDEF(SrcVT);
+ SDValue Zero = getZeroVector(SrcVT, Subtarget, DAG, DL);
+ Lo = (WidenedMask128[0] == SM_SentinelZero ? Zero : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelZero ? Zero : Hi);
+ Lo = (WidenedMask128[0] == SM_SentinelUndef ? Undef : Lo);
+ Hi = (WidenedMask128[1] == SM_SentinelUndef ? Undef : Hi);
+ }
+ SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
+ return DAG.getBitcast(VT, Horiz);
+ }
}
- SDValue Horiz = DAG.getNode(Opcode0, DL, VT0, Lo, Hi);
- return DAG.getBitcast(VT, Horiz);
}
}
+ if (SDValue R = combineCommutableSHUFP(N, VT, DL, DAG))
+ return R;
+
+ // Canonicalize UNARYSHUFFLE(XOR(X,-1)) -> XOR(UNARYSHUFFLE(X),-1) to
+ // help expose the 'NOT' pattern further up the DAG.
+ // TODO: This might be beneficial for any binop with a 'splattable' operand.
switch (Opcode) {
+ case X86ISD::MOVDDUP:
+ case X86ISD::PSHUFD: {
+ SDValue Src = N.getOperand(0);
+ if (Src.hasOneUse() && Src.getValueType() == VT) {
+ if (SDValue Not = IsNOT(Src, DAG, /*OneUse*/ true)) {
+ Not = DAG.getBitcast(VT, Not);
+ Not = Opcode == X86ISD::MOVDDUP
+ ? DAG.getNode(Opcode, DL, VT, Not)
+ : DAG.getNode(Opcode, DL, VT, Not, N.getOperand(1));
+ EVT IntVT = Not.getValueType().changeTypeToInteger();
+ SDValue AllOnes = DAG.getConstant(-1, DL, IntVT);
+ Not = DAG.getBitcast(IntVT, Not);
+ Not = DAG.getNode(ISD::XOR, DL, IntVT, Not, AllOnes);
+ return DAG.getBitcast(VT, Not);
+ }
+ }
+ break;
+ }
+ }
+
+ // Handle specific target shuffles.
+ switch (Opcode) {
+ case X86ISD::MOVDDUP: {
+ SDValue Src = N.getOperand(0);
+ // Turn a 128-bit MOVDDUP of a full vector load into movddup+vzload.
+ if (VT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::f64, MVT::v2f64, DAG)) {
+ SDValue Movddup = DAG.getNode(X86ISD::MOVDDUP, DL, MVT::v2f64, VZLoad);
+ DCI.CombineTo(N.getNode(), Movddup);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
case X86ISD::VBROADCAST: {
SDValue Src = N.getOperand(0);
SDValue BC = peekThroughBitcasts(Src);
@@ -34580,7 +35985,8 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
// broadcast(bitcast(src)) -> bitcast(broadcast(src))
// 32-bit targets have to bitcast i64 to f64, so better to bitcast upward.
if (Src.getOpcode() == ISD::BITCAST &&
- SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits()) {
+ SrcVT.getScalarSizeInBits() == BCVT.getScalarSizeInBits() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(BCVT)) {
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), BCVT.getScalarType(),
VT.getVectorNumElements());
return DAG.getBitcast(VT, DAG.getNode(X86ISD::VBROADCAST, DL, NewVT, BC));
@@ -34627,6 +36033,190 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
return N; // Return N so it doesn't get rechecked!
}
+ // Due to isTypeDesirableForOp, we won't always shrink a load truncated to
+ // i16. So shrink it ourselves if we can make a broadcast_load.
+ if (SrcVT == MVT::i16 && Src.getOpcode() == ISD::TRUNCATE &&
+ Src.hasOneUse() && Src.getOperand(0).hasOneUse()) {
+ assert(Subtarget.hasAVX2() && "Expected AVX2");
+ SDValue TruncIn = Src.getOperand(0);
+
+ // If this is a truncate of a non extending load we can just narrow it to
+ // use a broadcast_load.
+ if (ISD::isNormalLoad(TruncIn.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn);
+ // Unless it's volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of an i16 extload, we can directly replace it.
+ if (ISD::isUNINDEXEDLoad(Src.getOperand(0).getNode()) &&
+ ISD::isEXTLoad(Src.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getOperand(0));
+ if (LN->getMemoryVT().getSizeInBits() == 16) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // If this is a truncate of a load that has been shifted right, we can
+ // offset the pointer and use a narrower load.
+ if (TruncIn.getOpcode() == ISD::SRL &&
+ TruncIn.getOperand(0).hasOneUse() &&
+ isa<ConstantSDNode>(TruncIn.getOperand(1)) &&
+ ISD::isNormalLoad(TruncIn.getOperand(0).getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(TruncIn.getOperand(0));
+ unsigned ShiftAmt = TruncIn.getConstantOperandVal(1);
+ // Make sure the shift amount and the load size are divisible by 16.
+ // Don't do this if the load is volatile or atomic.
+ if (ShiftAmt % 16 == 0 && TruncIn.getValueSizeInBits() % 16 == 0 &&
+ LN->isSimple()) {
+ unsigned Offset = ShiftAmt / 8;
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ptr = DAG.getMemBasePlusOffset(LN->getBasePtr(), Offset, DL);
+ SDValue Ops[] = { LN->getChain(), Ptr };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::i16,
+ LN->getPointerInfo().getWithOffset(Offset),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(Src.getNode());
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+ }
+
+ // vbroadcast(vzload X) -> vbroadcast_load X
+ if (Src.getOpcode() == X86ISD::VZEXT_LOAD && Src.hasOneUse()) {
+ MemSDNode *LN = cast<MemIntrinsicSDNode>(Src);
+ if (LN->getMemoryVT().getSizeInBits() == VT.getScalarSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ // vbroadcast(vector load X) -> vbroadcast_load
+ if (SrcVT == MVT::v2f64 && Src.hasOneUse() &&
+ ISD::isNormalLoad(Src.getNode())) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src);
+ // Unless the load is volatile or atomic.
+ if (LN->isSimple()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MVT::f64,
+ LN->getPointerInfo(), LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DCI.CombineTo(N.getNode(), BcastLd);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), BcastLd.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N; // Return N so it doesn't get rechecked!
+ }
+ }
+
+ return SDValue();
+ }
+ case X86ISD::VZEXT_MOVL: {
+ SDValue N0 = N.getOperand(0);
+
+ // If this is a vzmovl of a full vector load, replace it with a vzload, unless
+ // the load is volatile.
+ if (N0.hasOneUse() && ISD::isNormalLoad(N0.getNode())) {
+ auto *LN = cast<LoadSDNode>(N0);
+ if (SDValue VZLoad =
+ narrowLoadToVZLoad(LN, VT.getVectorElementType(), VT, DAG)) {
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // If this is a VZEXT_MOVL of a VBROADCAST_LOAD, we don't need the broadcast
+ // and can just use a VZEXT_LOAD.
+ // FIXME: Is there some way to do this with SimplifyDemandedVectorElts?
+ if (N0.hasOneUse() && N0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *LN = cast<MemSDNode>(N0);
+ if (VT.getScalarSizeInBits() == LN->getMemoryVT().getSizeInBits()) {
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {LN->getChain(), LN->getBasePtr()};
+ SDValue VZLoad =
+ DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, DL, Tys, Ops,
+ LN->getMemoryVT(), LN->getMemOperand());
+ DCI.CombineTo(N.getNode(), VZLoad);
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return N;
+ }
+ }
+
+ // Turn (v2i64 (vzext_movl (scalar_to_vector (i64 X)))) into
+ // (v2i64 (bitcast (v4i32 (vzext_movl (scalar_to_vector (i32 (trunc X)))))))
+ // if the upper bits of the i64 are zero.
+ if (N0.hasOneUse() && N0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
+ N0.getOperand(0).hasOneUse() &&
+ N0.getOperand(0).getValueType() == MVT::i64) {
+ SDValue In = N0.getOperand(0);
+ APInt Mask = APInt::getHighBitsSet(64, 32);
+ if (DAG.MaskedValueIsZero(In, Mask)) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, DL, MVT::i32, In);
+ MVT VecVT = MVT::getVectorVT(MVT::i32, VT.getVectorNumElements() * 2);
+ SDValue SclVec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VecVT, Trunc);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, DL, VecVT, SclVec);
+ return DAG.getBitcast(VT, Movl);
+ }
+ }
+
+ // Load a scalar integer constant directly to XMM instead of transferring an
+ // immediate value from a GPR.
+ // vzext_movl (scalar_to_vector C) --> load [C,0...]
+ if (N0.getOpcode() == ISD::SCALAR_TO_VECTOR) {
+ if (auto *C = dyn_cast<ConstantSDNode>(N0.getOperand(0))) {
+ // Create a vector constant - scalar constant followed by zeros.
+ EVT ScalarVT = N0.getOperand(0).getValueType();
+ Type *ScalarTy = ScalarVT.getTypeForEVT(*DAG.getContext());
+ unsigned NumElts = VT.getVectorNumElements();
+ Constant *Zero = ConstantInt::getNullValue(ScalarTy);
+ SmallVector<Constant *, 32> ConstantVec(NumElts, Zero);
+ ConstantVec[0] = const_cast<ConstantInt *>(C->getConstantIntValue());
+
+ // Load the vector constant from constant pool.
+ MVT PVT = DAG.getTargetLoweringInfo().getPointerTy(DAG.getDataLayout());
+ SDValue CP = DAG.getConstantPool(ConstantVector::get(ConstantVec), PVT);
+ MachinePointerInfo MPI =
+ MachinePointerInfo::getConstantPool(DAG.getMachineFunction());
+ Align Alignment = cast<ConstantPoolSDNode>(CP)->getAlign();
+ return DAG.getLoad(VT, DL, DAG.getEntryNode(), CP, MPI, Alignment,
+ MachineMemOperand::MOLoad);
+ }
+ }
+
return SDValue();
}
case X86ISD::BLENDI: {
@@ -34667,6 +36257,34 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
}
return SDValue();
}
+ case X86ISD::VPERM2X128: {
+ // If both 128-bit values were inserted into high halves of 256-bit values,
+ // the shuffle can be reduced to a concatenation of subvectors:
+ // vperm2x128 (ins ?, X, C1), (ins ?, Y, C2), 0x31 --> concat X, Y
+ // Note: We are only looking for the exact high/high shuffle mask because we
+ // expect to fold other similar patterns before creating this opcode.
+ SDValue Ins0 = peekThroughBitcasts(N.getOperand(0));
+ SDValue Ins1 = peekThroughBitcasts(N.getOperand(1));
+ unsigned Imm = N.getConstantOperandVal(2);
+ if (!(Imm == 0x31 &&
+ Ins0.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Ins1.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ Ins0.getValueType() == Ins1.getValueType()))
+ return SDValue();
+
+ SDValue X = Ins0.getOperand(1);
+ SDValue Y = Ins1.getOperand(1);
+ unsigned C1 = Ins0.getConstantOperandVal(2);
+ unsigned C2 = Ins1.getConstantOperandVal(2);
+ MVT SrcVT = X.getSimpleValueType();
+ unsigned SrcElts = SrcVT.getVectorNumElements();
+ if (SrcVT != Y.getSimpleValueType() || SrcVT.getSizeInBits() != 128 ||
+ C1 != SrcElts || C2 != SrcElts)
+ return SDValue();
+
+ return DAG.getBitcast(VT, DAG.getNode(ISD::CONCAT_VECTORS, DL,
+ Ins1.getValueType(), X, Y));
+ }
case X86ISD::PSHUFD:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFHW:
@@ -34706,8 +36324,7 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
assert(VT == MVT::v4f32 && "INSERTPS ValueType must be MVT::v4f32");
SDValue Op0 = N.getOperand(0);
SDValue Op1 = N.getOperand(1);
- SDValue Op2 = N.getOperand(2);
- unsigned InsertPSMask = cast<ConstantSDNode>(Op2)->getZExtValue();
+ unsigned InsertPSMask = N.getConstantOperandVal(2);
unsigned SrcIdx = (InsertPSMask >> 6) & 0x3;
unsigned DstIdx = (InsertPSMask >> 4) & 0x3;
unsigned ZeroMask = InsertPSMask & 0xF;
@@ -34847,9 +36464,9 @@ static SDValue combineTargetShuffle(SDValue N, SelectionDAG &DAG,
(V.getOpcode() == X86ISD::PSHUFLW ||
V.getOpcode() == X86ISD::PSHUFHW) &&
V.getOpcode() != N.getOpcode() &&
- V.hasOneUse()) {
+ V.hasOneUse() && V.getOperand(0).hasOneUse()) {
SDValue D = peekThroughOneUseBitcasts(V.getOperand(0));
- if (D.getOpcode() == X86ISD::PSHUFD && D.hasOneUse()) {
+ if (D.getOpcode() == X86ISD::PSHUFD) {
SmallVector<int, 4> VMask = getPSHUFShuffleMask(V);
SmallVector<int, 4> DMask = getPSHUFShuffleMask(D);
int NOffset = N.getOpcode() == X86ISD::PSHUFLW ? 0 : 4;
@@ -35248,7 +36865,8 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
}
// Attempt to combine into a vector load/broadcast.
- if (SDValue LD = combineToConsecutiveLoads(VT, N, dl, DAG, Subtarget, true))
+ if (SDValue LD = combineToConsecutiveLoads(VT, SDValue(N, 0), dl, DAG,
+ Subtarget, true))
return LD;
// For AVX2, we sometimes want to combine
@@ -35281,79 +36899,100 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue(N, 0);
}
- // Look for a v2i64/v2f64 VZEXT_MOVL of a node that already produces zeros
- // in the upper 64 bits.
- // TODO: Can we generalize this using computeKnownBits.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL &&
- (VT == MVT::v2f64 || VT == MVT::v2i64) &&
- N->getOperand(0).getOpcode() == ISD::BITCAST &&
- (N->getOperand(0).getOperand(0).getValueType() == MVT::v4f32 ||
- N->getOperand(0).getOperand(0).getValueType() == MVT::v4i32)) {
- SDValue In = N->getOperand(0).getOperand(0);
- switch (In.getOpcode()) {
- default:
- break;
- case X86ISD::CVTP2SI: case X86ISD::CVTP2UI:
- case X86ISD::MCVTP2SI: case X86ISD::MCVTP2UI:
- case X86ISD::CVTTP2SI: case X86ISD::CVTTP2UI:
- case X86ISD::MCVTTP2SI: case X86ISD::MCVTTP2UI:
- case X86ISD::CVTSI2P: case X86ISD::CVTUI2P:
- case X86ISD::MCVTSI2P: case X86ISD::MCVTUI2P:
- case X86ISD::VFPROUND: case X86ISD::VMFPROUND:
- if (In.getOperand(0).getValueType() == MVT::v2f64 ||
- In.getOperand(0).getValueType() == MVT::v2i64)
- return N->getOperand(0); // return the bitcast
- break;
- case X86ISD::STRICT_CVTTP2SI:
- case X86ISD::STRICT_CVTTP2UI:
- case X86ISD::STRICT_CVTSI2P:
- case X86ISD::STRICT_CVTUI2P:
- case X86ISD::STRICT_VFPROUND:
- if (In.getOperand(1).getValueType() == MVT::v2f64 ||
- In.getOperand(1).getValueType() == MVT::v2i64)
- return N->getOperand(0);
- break;
- }
- }
-
// Pull subvector inserts into undef through VZEXT_MOVL by making it an
// insert into a zero vector. This helps get VZEXT_MOVL closer to
// scalar_to_vectors where 256/512 are canonicalized to an insert and a
// 128-bit scalar_to_vector. This reduces the number of isel patterns.
if (N->getOpcode() == X86ISD::VZEXT_MOVL && !DCI.isBeforeLegalizeOps() &&
- N->getOperand(0).getOpcode() == ISD::INSERT_SUBVECTOR &&
- N->getOperand(0).hasOneUse() &&
- N->getOperand(0).getOperand(0).isUndef() &&
- isNullConstant(N->getOperand(0).getOperand(2))) {
- SDValue In = N->getOperand(0).getOperand(1);
- SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, In.getValueType(), In);
- return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
- getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
- Movl, N->getOperand(0).getOperand(2));
- }
-
- // If this a vzmovl of a full vector load, replace it with a vzload, unless
- // the load is volatile.
- if (N->getOpcode() == X86ISD::VZEXT_MOVL && N->getOperand(0).hasOneUse() &&
- ISD::isNormalLoad(N->getOperand(0).getNode())) {
- LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- if (LN->isSimple()) {
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops,
- VT.getVectorElementType(),
- LN->getPointerInfo(),
- LN->getAlignment(),
- MachineMemOperand::MOLoad);
- DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
- return VZLoad;
+ N->getOperand(0).hasOneUse()) {
+ SDValue V = peekThroughOneUseBitcasts(N->getOperand(0));
+
+ if (V.getOpcode() == ISD::INSERT_SUBVECTOR &&
+ V.getOperand(0).isUndef() && isNullConstant(V.getOperand(2))) {
+ SDValue In = V.getOperand(1);
+ MVT SubVT =
+ MVT::getVectorVT(VT.getSimpleVT().getVectorElementType(),
+ In.getValueSizeInBits() / VT.getScalarSizeInBits());
+ In = DAG.getBitcast(SubVT, In);
+ SDValue Movl = DAG.getNode(X86ISD::VZEXT_MOVL, dl, SubVT, In);
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, dl, VT,
+ getZeroVector(VT.getSimpleVT(), Subtarget, DAG, dl),
+ Movl, V.getOperand(2));
}
}
return SDValue();
}
+// Simplify variable target shuffle masks based on the demanded elements.
+// TODO: Handle DemandedBits in mask indices as well?
+bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
+ SDValue Op, const APInt &DemandedElts, unsigned MaskIndex,
+ TargetLowering::TargetLoweringOpt &TLO, unsigned Depth) const {
+ // If we're demanding all elements don't bother trying to simplify the mask.
+ unsigned NumElts = DemandedElts.getBitWidth();
+ if (DemandedElts.isAllOnesValue())
+ return false;
+
+ SDValue Mask = Op.getOperand(MaskIndex);
+ if (!Mask.hasOneUse())
+ return false;
+
+ // Attempt to generically simplify the variable shuffle mask.
+ APInt MaskUndef, MaskZero;
+ if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
+ Depth + 1))
+ return true;
+
+ // Attempt to extract+simplify a (constant pool load) shuffle mask.
+ // TODO: Support other types from getTargetShuffleMaskIndices?
+ SDValue BC = peekThroughOneUseBitcasts(Mask);
+ EVT BCVT = BC.getValueType();
+ auto *Load = dyn_cast<LoadSDNode>(BC);
+ if (!Load)
+ return false;
+
+ const Constant *C = getTargetConstantFromNode(Load);
+ if (!C)
+ return false;
+
+ Type *CTy = C->getType();
+ if (!CTy->isVectorTy() ||
+ CTy->getPrimitiveSizeInBits() != Mask.getValueSizeInBits())
+ return false;
+
+ // Handle scaling for i64 elements on 32-bit targets.
+ unsigned NumCstElts = cast<FixedVectorType>(CTy)->getNumElements();
+ if (NumCstElts != NumElts && NumCstElts != (NumElts * 2))
+ return false;
+ unsigned Scale = NumCstElts / NumElts;
+
+ // Simplify mask if we have an undemanded element that is not undef.
+ bool Simplified = false;
+ SmallVector<Constant *, 32> ConstVecOps;
+ for (unsigned i = 0; i != NumCstElts; ++i) {
+ Constant *Elt = C->getAggregateElement(i);
+ if (!DemandedElts[i / Scale] && !isa<UndefValue>(Elt)) {
+ ConstVecOps.push_back(UndefValue::get(Elt->getType()));
+ Simplified = true;
+ continue;
+ }
+ ConstVecOps.push_back(Elt);
+ }
+ if (!Simplified)
+ return false;
+
+ // Generate new constant pool entry + legalize immediately for the load.
+ SDLoc DL(Op);
+ SDValue CV = TLO.DAG.getConstantPool(ConstantVector::get(ConstVecOps), BCVT);
+ SDValue LegalCV = LowerConstantPool(CV, TLO.DAG);
+ SDValue NewMask = TLO.DAG.getLoad(
+ BCVT, DL, TLO.DAG.getEntryNode(), LegalCV,
+ MachinePointerInfo::getConstantPool(TLO.DAG.getMachineFunction()),
+ Load->getAlign());
+ return TLO.CombineTo(Mask, TLO.DAG.getBitcast(Mask.getValueType(), NewMask));
+}
+
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
SDValue Op, const APInt &DemandedElts, APInt &KnownUndef, APInt &KnownZero,
TargetLoweringOpt &TLO, unsigned Depth) const {
@@ -35523,12 +37162,10 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
// Aggressively peek through ops to get at the demanded elts.
// TODO - we should do this for all target/faux shuffles ops.
if (!DemandedElts.isAllOnesValue()) {
- APInt DemandedSrcBits =
- APInt::getAllOnesValue(N0.getScalarValueSizeInBits());
- SDValue NewN0 = SimplifyMultipleUseDemandedBits(
- N0, DemandedSrcBits, DemandedLHS, TLO.DAG, Depth + 1);
- SDValue NewN1 = SimplifyMultipleUseDemandedBits(
- N1, DemandedSrcBits, DemandedRHS, TLO.DAG, Depth + 1);
+ SDValue NewN0 = SimplifyMultipleUseDemandedVectorElts(N0, DemandedLHS,
+ TLO.DAG, Depth + 1);
+ SDValue NewN1 = SimplifyMultipleUseDemandedVectorElts(N1, DemandedRHS,
+ TLO.DAG, Depth + 1);
if (NewN0 || NewN1) {
NewN0 = NewN0 ? NewN0 : N0;
NewN1 = NewN1 ? NewN1 : N1;
@@ -35590,6 +37227,15 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
KnownUndef = LHSUndef & RHSUndef;
break;
}
+ case X86ISD::VZEXT_MOVL: {
+ // If upper demanded elements are already zero then we have nothing to do.
+ SDValue Src = Op.getOperand(0);
+ APInt DemandedUpperElts = DemandedElts;
+ DemandedUpperElts.clearLowBits(1);
+ if (TLO.DAG.computeKnownBits(Src, DemandedUpperElts, Depth + 1).isZero())
+ return TLO.CombineTo(Op, Src);
+ break;
+ }
case X86ISD::VBROADCAST: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -35607,36 +37253,32 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
if (SimplifyDemandedVectorElts(Src, SrcElts, SrcUndef, SrcZero, TLO,
Depth + 1))
return true;
+ // Aggressively peek through src to get at the demanded elt.
+ // TODO - we should do this for all target/faux shuffles ops.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedVectorElts(
+ Src, SrcElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
break;
}
- case X86ISD::VPERMV: {
- SDValue Mask = Op.getOperand(0);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 0, TLO,
+ Depth))
return true;
break;
- }
case X86ISD::PSHUFB:
case X86ISD::VPERMV3:
- case X86ISD::VPERMILPV: {
- SDValue Mask = Op.getOperand(1);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMILPV:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 1, TLO,
+ Depth))
return true;
break;
- }
case X86ISD::VPPERM:
- case X86ISD::VPERMIL2: {
- SDValue Mask = Op.getOperand(2);
- APInt MaskUndef, MaskZero;
- if (SimplifyDemandedVectorElts(Mask, DemandedElts, MaskUndef, MaskZero, TLO,
- Depth + 1))
+ case X86ISD::VPERMIL2:
+ if (SimplifyDemandedVectorEltsForTargetShuffle(Op, DemandedElts, 2, TLO,
+ Depth))
return true;
break;
}
- }
// For 256/512-bit ops that are 128/256-bit ops glued together, if we do not
// demand any of the high elements, then narrow the op to 128/256-bits: e.g.
@@ -35651,18 +37293,6 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
ExtSizeInBits = SizeInBits / 4;
switch (Opc) {
- // Zero upper elements.
- case X86ISD::VZEXT_MOVL: {
- SDLoc DL(Op);
- SDValue Ext0 =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue ExtOp =
- TLO.DAG.getNode(Opc, DL, Ext0.getValueType(), Ext0);
- SDValue UndefVec = TLO.DAG.getUNDEF(VT);
- SDValue Insert =
- insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
- return TLO.CombineTo(Op, Insert);
- }
// Subvector broadcast.
case X86ISD::SUBV_BROADCAST: {
SDLoc DL(Op);
@@ -35715,10 +37345,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
}
break;
}
- // Target Shuffles.
+ // Zero upper elements.
+ case X86ISD::VZEXT_MOVL:
+ // Target unary shuffles by immediate:
+ case X86ISD::PSHUFD:
+ case X86ISD::PSHUFLW:
+ case X86ISD::PSHUFHW:
+ case X86ISD::VPERMILPI:
+ // (Non-Lane Crossing) Target Shuffles.
+ case X86ISD::VPERMILPV:
+ case X86ISD::VPERMIL2:
case X86ISD::PSHUFB:
case X86ISD::UNPCKL:
case X86ISD::UNPCKH:
+ case X86ISD::BLENDI:
// Saturated Packs.
case X86ISD::PACKSS:
case X86ISD::PACKUS:
@@ -35728,14 +37368,20 @@ bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetNode(
case X86ISD::FHADD:
case X86ISD::FHSUB: {
SDLoc DL(Op);
+ SmallVector<SDValue, 4> Ops;
+ for (unsigned i = 0, e = Op.getNumOperands(); i != e; ++i) {
+ SDValue SrcOp = Op.getOperand(i);
+ EVT SrcVT = SrcOp.getValueType();
+ assert((!SrcVT.isVector() || SrcVT.getSizeInBits() == SizeInBits) &&
+ "Unsupported vector size");
+ Ops.push_back(SrcVT.isVector() ? extractSubVector(SrcOp, 0, TLO.DAG, DL,
+ ExtSizeInBits)
+ : SrcOp);
+ }
MVT ExtVT = VT.getSimpleVT();
ExtVT = MVT::getVectorVT(ExtVT.getScalarType(),
ExtSizeInBits / ExtVT.getScalarSizeInBits());
- SDValue Ext0 =
- extractSubVector(Op.getOperand(0), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue Ext1 =
- extractSubVector(Op.getOperand(1), 0, TLO.DAG, DL, ExtSizeInBits);
- SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ext0, Ext1);
+ SDValue ExtOp = TLO.DAG.getNode(Opc, DL, ExtVT, Ops);
SDValue UndefVec = TLO.DAG.getUNDEF(VT);
SDValue Insert =
insertSubVector(UndefVec, ExtOp, 0, TLO.DAG, DL, ExtSizeInBits);
@@ -35832,6 +37478,18 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
unsigned BitWidth = OriginalDemandedBits.getBitWidth();
unsigned Opc = Op.getOpcode();
switch(Opc) {
+ case X86ISD::VTRUNC: {
+ KnownBits KnownOp;
+ SDValue Src = Op.getOperand(0);
+ MVT SrcVT = Src.getSimpleValueType();
+
+ // Simplify the input, using demanded bit information.
+ APInt TruncMask = OriginalDemandedBits.zext(SrcVT.getScalarSizeInBits());
+ APInt DemandedElts = OriginalDemandedElts.trunc(SrcVT.getVectorNumElements());
+ if (SimplifyDemandedBits(Src, TruncMask, DemandedElts, KnownOp, TLO, Depth + 1))
+ return true;
+ break;
+ }
case X86ISD::PMULDQ:
case X86ISD::PMULUDQ: {
// PMULDQ/PMULUDQ only uses lower 32 bits from each vector element.
@@ -35888,6 +37546,14 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
}
+ // If we are only demanding sign bits then we can use the shift source directly.
+ unsigned NumSignBits =
+ TLO.DAG.ComputeNumSignBits(Op0, OriginalDemandedElts, Depth + 1);
+ unsigned UpperDemandedBits =
+ BitWidth - OriginalDemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return TLO.CombineTo(Op, Op0);
+
if (SimplifyDemandedBits(Op0, DemandedMask, OriginalDemandedElts, Known,
TLO, Depth + 1))
return true;
@@ -36001,7 +37667,7 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
return TLO.CombineTo(
Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, V, Op.getOperand(1)));
- Known = KnownVec.zext(BitWidth, true);
+ Known = KnownVec.zext(BitWidth);
return false;
}
break;
@@ -36054,6 +37720,17 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
if (SimplifyDemandedBits(Op.getOperand(1), SignMask, DemandedRHS,
KnownRHS, TLO, Depth + 1))
return true;
+
+ // Attempt to avoid multi-use ops if we don't need anything from them.
+ SDValue DemandedOp0 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(0), SignMask, DemandedLHS, TLO.DAG, Depth + 1);
+ SDValue DemandedOp1 = SimplifyMultipleUseDemandedBits(
+ Op.getOperand(1), SignMask, DemandedRHS, TLO.DAG, Depth + 1);
+ if (DemandedOp0 || DemandedOp1) {
+ SDValue Op0 = DemandedOp0 ? DemandedOp0 : Op.getOperand(0);
+ SDValue Op1 = DemandedOp1 ? DemandedOp1 : Op.getOperand(1);
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, Op0, Op1));
+ }
}
// TODO - add general PACKSS/PACKUS SimplifyDemandedBits support.
break;
@@ -36086,16 +37763,51 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
// MOVMSK only uses the MSB from each vector element.
KnownBits KnownSrc;
- if (SimplifyDemandedBits(Src, APInt::getSignMask(SrcBits), DemandedElts,
- KnownSrc, TLO, Depth + 1))
+ APInt DemandedSrcBits = APInt::getSignMask(SrcBits);
+ if (SimplifyDemandedBits(Src, DemandedSrcBits, DemandedElts, KnownSrc, TLO,
+ Depth + 1))
return true;
if (KnownSrc.One[SrcBits - 1])
Known.One.setLowBits(NumElts);
else if (KnownSrc.Zero[SrcBits - 1])
Known.Zero.setLowBits(NumElts);
+
+ // Attempt to avoid multi-use ops if we don't need anything from it.
+ if (SDValue NewSrc = SimplifyMultipleUseDemandedBits(
+ Src, DemandedSrcBits, DemandedElts, TLO.DAG, Depth + 1))
+ return TLO.CombineTo(Op, TLO.DAG.getNode(Opc, SDLoc(Op), VT, NewSrc));
return false;
}
+ case X86ISD::BEXTR: {
+ SDValue Op0 = Op.getOperand(0);
+ SDValue Op1 = Op.getOperand(1);
+
+ // Only bottom 16-bits of the control bits are required.
+ if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
+ // NOTE: SimplifyDemandedBits won't do this for constants.
+ const APInt &Val1 = Cst1->getAPIntValue();
+ APInt MaskedVal1 = Val1 & 0xFFFF;
+ if (MaskedVal1 != Val1) {
+ SDLoc DL(Op);
+ return TLO.CombineTo(
+ Op, TLO.DAG.getNode(X86ISD::BEXTR, DL, VT, Op0,
+ TLO.DAG.getConstant(MaskedVal1, DL, VT)));
+ }
+ }
+
+ KnownBits Known1;
+ APInt DemandedMask(APInt::getLowBitsSet(BitWidth, 16));
+ if (SimplifyDemandedBits(Op1, DemandedMask, Known1, TLO, Depth + 1))
+ return true;
+
+ // If the length is 0, replace with 0.
+ KnownBits LengthBits = Known1.extractBits(8, 8);
+ if (LengthBits.isZero())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, SDLoc(Op), VT));
+
+ break;
+ }
}
return TargetLowering::SimplifyDemandedBitsForTargetNode(
@@ -36119,8 +37831,26 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
if (CIdx && CIdx->getAPIntValue().ult(VecVT.getVectorNumElements()) &&
!DemandedElts[CIdx->getZExtValue()])
return Vec;
- break;
+ break;
}
+ case X86ISD::VSHLI: {
+ // If we are only demanding sign bits then we can use the shift source
+ // directly.
+ SDValue Op0 = Op.getOperand(0);
+ unsigned ShAmt = Op.getConstantOperandVal(1);
+ unsigned BitWidth = DemandedBits.getBitWidth();
+ unsigned NumSignBits = DAG.ComputeNumSignBits(Op0, DemandedElts, Depth + 1);
+ unsigned UpperDemandedBits = BitWidth - DemandedBits.countTrailingZeros();
+ if (NumSignBits > ShAmt && (NumSignBits - ShAmt) >= UpperDemandedBits)
+ return Op0;
+ break;
+ }
+ case X86ISD::VSRAI:
+ // iff we only need the sign bit then we can use the source directly.
+ // TODO: generalize where we only demand extended signbits.
+ if (DemandedBits.isSignMask())
+ return Op.getOperand(0);
+ break;
case X86ISD::PCMPGT:
// icmp sgt(0, R) == ashr(R, BitWidth-1).
// iff we only need the sign bit then we can use R directly.
@@ -36154,13 +37884,13 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
int M = ShuffleMask[i];
if (!DemandedElts[i] || ShuffleUndef[i])
continue;
- int Op = M / NumElts;
- int Index = M % NumElts;
- if (M < 0 || Index != i) {
+ int OpIdx = M / NumElts;
+ int EltIdx = M % NumElts;
+ if (M < 0 || EltIdx != i) {
IdentityOp.clearAllBits();
break;
}
- IdentityOp &= APInt::getOneBitSet(NumOps, Op);
+ IdentityOp &= APInt::getOneBitSet(NumOps, OpIdx);
if (IdentityOp == 0)
break;
}
@@ -36191,6 +37921,51 @@ static bool checkBitcastSrcVectorSize(SDValue Src, unsigned Size) {
return false;
}
+// Helper to flip between AND/OR/XOR opcodes and their X86ISD FP equivalents.
+static unsigned getAltBitOpcode(unsigned Opcode) {
+ switch(Opcode) {
+ case ISD::AND: return X86ISD::FAND;
+ case ISD::OR: return X86ISD::FOR;
+ case ISD::XOR: return X86ISD::FXOR;
+ case X86ISD::ANDNP: return X86ISD::FANDN;
+ }
+ llvm_unreachable("Unknown bitwise opcode");
+}
+
+// Helper to adjust v4i32 MOVMSK expansion to work with SSE1-only targets.
+static SDValue adjustBitcastSrcVectorSSE1(SelectionDAG &DAG, SDValue Src,
+ const SDLoc &DL) {
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT != MVT::v4i1)
+ return SDValue();
+
+ switch (Src.getOpcode()) {
+ case ISD::SETCC:
+ if (Src.getOperand(0).getValueType() == MVT::v4i32 &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT) {
+ SDValue Op0 = Src.getOperand(0);
+ if (ISD::isNormalLoad(Op0.getNode()))
+ return DAG.getBitcast(MVT::v4f32, Op0);
+ if (Op0.getOpcode() == ISD::BITCAST &&
+ Op0.getOperand(0).getValueType() == MVT::v4f32)
+ return Op0.getOperand(0);
+ }
+ break;
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR: {
+ SDValue Op0 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(0), DL);
+ SDValue Op1 = adjustBitcastSrcVectorSSE1(DAG, Src.getOperand(1), DL);
+ if (Op0 && Op1)
+ return DAG.getNode(getAltBitOpcode(Src.getOpcode()), DL, MVT::v4f32, Op0,
+ Op1);
+ break;
+ }
+ }
+ return SDValue();
+}
+
// Helper to push sign extension of vXi1 SETCC result through bitops.
static SDValue signExtendBitcastSrcVector(SelectionDAG &DAG, EVT SExtVT,
SDValue Src, const SDLoc &DL) {
@@ -36221,18 +37996,40 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
if (!SrcVT.isSimple() || SrcVT.getScalarType() != MVT::i1)
return SDValue();
+ // Recognize the IR pattern for the movmsk intrinsic under SSE1 before type
+ // legalization destroys the v4i32 type.
+ if (Subtarget.hasSSE1() && !Subtarget.hasSSE2()) {
+ if (SDValue V = adjustBitcastSrcVectorSSE1(DAG, Src, DL)) {
+ V = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32,
+ DAG.getBitcast(MVT::v4f32, V));
+ return DAG.getZExtOrTrunc(V, DL, VT);
+ }
+ }
+
// If the input is a truncate from v16i8 or v32i8 go ahead and use a
// movmskb even with avx512. This will be better than truncating to vXi1 and
// using a kmov. This can especially help KNL if the input is a v16i8/v32i8
// vpcmpeqb/vpcmpgtb.
- bool IsTruncated = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
- (Src.getOperand(0).getValueType() == MVT::v16i8 ||
- Src.getOperand(0).getValueType() == MVT::v32i8 ||
- Src.getOperand(0).getValueType() == MVT::v64i8);
+ bool PreferMovMsk = Src.getOpcode() == ISD::TRUNCATE && Src.hasOneUse() &&
+ (Src.getOperand(0).getValueType() == MVT::v16i8 ||
+ Src.getOperand(0).getValueType() == MVT::v32i8 ||
+ Src.getOperand(0).getValueType() == MVT::v64i8);
+
+ // Prefer movmsk for AVX512 for (bitcast (setlt X, 0)) which can be handled
+ // directly with vpmovmskb/vmovmskps/vmovmskpd.
+ if (Src.getOpcode() == ISD::SETCC && Src.hasOneUse() &&
+ cast<CondCodeSDNode>(Src.getOperand(2))->get() == ISD::SETLT &&
+ ISD::isBuildVectorAllZeros(Src.getOperand(1).getNode())) {
+ EVT CmpVT = Src.getOperand(0).getValueType();
+ EVT EltVT = CmpVT.getVectorElementType();
+ if (CmpVT.getSizeInBits() <= 256 &&
+ (EltVT == MVT::i8 || EltVT == MVT::i32 || EltVT == MVT::i64))
+ PreferMovMsk = true;
+ }
// With AVX512 vxi1 types are legal and we prefer using k-regs.
// MOVMSK is supported in SSE2 or later.
- if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !IsTruncated))
+ if (!Subtarget.hasSSE2() || (Subtarget.hasAVX512() && !PreferMovMsk))
return SDValue();
// There are MOVMSK flavors for types v16i8, v32i8, v4f32, v8f32, v4f64 and
@@ -36288,7 +38085,14 @@ static SDValue combineBitcastvxi1(SelectionDAG &DAG, EVT VT, SDValue Src,
case MVT::v64i1:
// If we have AVX512F but not AVX512BW, and the input is truncated from the
// v64i8 checked earlier, then split the input and make two pmovmskbs.
- if (Subtarget.hasAVX512() && !Subtarget.hasBWI()) {
+ if (Subtarget.hasAVX512()) {
+ if (Subtarget.hasBWI())
+ return SDValue();
+ SExtVT = MVT::v64i8;
+ break;
+ }
+ // Split if this is a <64 x i8> comparison result.
+ if (checkBitcastSrcVectorSize(Src, 512)) {
SExtVT = MVT::v64i8;
break;
}
@@ -36458,6 +38262,74 @@ static SDValue createMMXBuildVector(BuildVectorSDNode *BV, SelectionDAG &DAG,
return Ops[0];
}
+// Recursive function that attempts to determine whether a bool vector node
+// was originally a vector/float/double that got truncated/extended/bitcast
+// to/from a scalar integer. If so, replace the scalar ops with bool vector
+// equivalents back down the chain.
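+// Illustrative example (sketch): (v16i1 (bitcast (i16 (or (zext (i8 (bitcast
+// (v8i1 X)))), (i16 (bitcast (v16i1 Y))))))) can be rebuilt as an OR in the
+// mask domain, with X re-inserted as a subvector, avoiding the scalar round
+// trip.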
+static SDValue combineBitcastToBoolVector(EVT VT, SDValue V, SDLoc DL,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ unsigned Opc = V.getOpcode();
+ switch (Opc) {
+ case ISD::BITCAST: {
+ // Bitcast from a vector/float/double, we can cheaply bitcast to VT.
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (SrcVT.isVector() || SrcVT.isFloatingPoint())
+ return DAG.getBitcast(VT, Src);
+ break;
+ }
+ case ISD::TRUNCATE: {
+ // If we find a suitable source, a truncated scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT =
+ EVT::getVectorVT(*DAG.getContext(), MVT::i1, Src.getValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, N0,
+ DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::ANY_EXTEND:
+ case ISD::ZERO_EXTEND: {
+ // If we find a suitable source, an extended scalar becomes a subvector.
+ SDValue Src = V.getOperand(0);
+ EVT NewSrcVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1,
+ Src.getScalarValueSizeInBits());
+ if (TLI.isTypeLegal(NewSrcVT))
+ if (SDValue N0 =
+ combineBitcastToBoolVector(NewSrcVT, Src, DL, DAG, Subtarget))
+ return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VT,
+ Opc == ISD::ANY_EXTEND ? DAG.getUNDEF(VT)
+ : DAG.getConstant(0, DL, VT),
+ N0, DAG.getIntPtrConstant(0, DL));
+ break;
+ }
+ case ISD::OR: {
+ // If we find suitable sources, we can just move an OR to the vector domain.
+ SDValue Src0 = V.getOperand(0);
+ SDValue Src1 = V.getOperand(1);
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ if (SDValue N1 = combineBitcastToBoolVector(VT, Src1, DL, DAG, Subtarget))
+ return DAG.getNode(Opc, DL, VT, N0, N1);
+ break;
+ }
+ case ISD::SHL: {
+ // If we find a suitable source, a SHL becomes a KSHIFTL.
+ SDValue Src0 = V.getOperand(0);
+ if (auto *Amt = dyn_cast<ConstantSDNode>(V.getOperand(1)))
+ if (SDValue N0 = combineBitcastToBoolVector(VT, Src0, DL, DAG, Subtarget))
+ return DAG.getNode(
+ X86ISD::KSHIFTL, DL, VT, N0,
+ DAG.getTargetConstant(Amt->getZExtValue(), DL, MVT::i8));
+ break;
+ }
+ }
+ return SDValue();
+}
+
static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -36476,24 +38348,6 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineBitcastvxi1(DAG, VT, N0, dl, Subtarget))
return V;
- // Recognize the IR pattern for the movmsk intrinsic under SSE1 befoer type
- // legalization destroys the v4i32 type.
- if (Subtarget.hasSSE1() && !Subtarget.hasSSE2() && SrcVT == MVT::v4i1 &&
- VT.isScalarInteger() && N0.getOpcode() == ISD::SETCC &&
- N0.getOperand(0).getValueType() == MVT::v4i32 &&
- ISD::isBuildVectorAllZeros(N0.getOperand(1).getNode()) &&
- cast<CondCodeSDNode>(N0.getOperand(2))->get() == ISD::SETLT) {
- SDValue N00 = N0.getOperand(0);
- // Only do this if we can avoid scalarizing the input.
- if (ISD::isNormalLoad(N00.getNode()) ||
- (N00.getOpcode() == ISD::BITCAST &&
- N00.getOperand(0).getValueType() == MVT::v4f32)) {
- SDValue V = DAG.getNode(X86ISD::MOVMSK, dl, MVT::i32,
- DAG.getBitcast(MVT::v4f32, N00));
- return DAG.getZExtOrTrunc(V, dl, VT);
- }
- }
-
// If this is a bitcast between a MVT::v4i1/v2i1 and an illegal integer
// type, widen both sides to avoid a trip through memory.
if ((VT == MVT::v4i1 || VT == MVT::v2i1) && SrcVT.isScalarInteger() &&
@@ -36535,6 +38389,16 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
N0 = DAG.getBitcast(MVT::i8, N0);
return DAG.getNode(ISD::TRUNCATE, dl, VT, N0);
}
+ } else {
+ // If we're bitcasting from iX to vXi1, see if the integer originally
+ // began as a vXi1 and whether we can remove the bitcast entirely.
+ if (VT.isVector() && VT.getScalarType() == MVT::i1 &&
+ SrcVT.isScalarInteger() &&
+ DAG.getTargetLoweringInfo().isTypeLegal(VT)) {
+ if (SDValue V =
+ combineBitcastToBoolVector(VT, N0, SDLoc(N), DAG, Subtarget))
+ return V;
+ }
}
// Look for (i8 (bitcast (v8i1 (extract_subvector (v16i1 X), 0)))) and
@@ -36549,19 +38413,30 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT,
DAG.getBitcast(MVT::i16, N0.getOperand(0)));
- // Combine (bitcast (vbroadcast_load)) -> (vbroadcast_load). The memory VT
- // determines // the number of bits loaded. Remaining bits are zero.
+ // Canonicalize (bitcast (vbroadcast_load)) so that the output of the bitcast
+ // and the vbroadcast_load are both integer or both fp. In some cases this
+ // will remove the bitcast entirely.
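+ // Illustrative example (sketch): (v4f32 (bitcast (v4i32 (VBROADCAST_LOAD
+ // <i32 mem>)))) is rebuilt as (v4f32 (VBROADCAST_LOAD <f32 mem>)), making
+ // the bitcast disappear entirely.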
if (N0.getOpcode() == X86ISD::VBROADCAST_LOAD && N0.hasOneUse() &&
- VT.getScalarSizeInBits() == SrcVT.getScalarSizeInBits()) {
+ VT.isFloatingPoint() != SrcVT.isFloatingPoint() && VT.isVector()) {
auto *BCast = cast<MemIntrinsicSDNode>(N0);
- SDVTList Tys = DAG.getVTList(VT, MVT::Other);
- SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
- SDValue ResNode =
- DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
- VT.getVectorElementType(),
- BCast->getMemOperand());
- DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
- return ResNode;
+ unsigned SrcVTSize = SrcVT.getScalarSizeInBits();
+ unsigned MemSize = BCast->getMemoryVT().getScalarSizeInBits();
+ // Don't swap i8/i16 since we don't have fp types of that size.
+ if (MemSize >= 32) {
+ MVT MemVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(MemSize)
+ : MVT::getIntegerVT(MemSize);
+ MVT LoadVT = VT.isFloatingPoint() ? MVT::getFloatingPointVT(SrcVTSize)
+ : MVT::getIntegerVT(SrcVTSize);
+ LoadVT = MVT::getVectorVT(LoadVT, SrcVT.getVectorNumElements());
+
+ SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
+ SDValue Ops[] = { BCast->getChain(), BCast->getBasePtr() };
+ SDValue ResNode =
+ DAG.getMemIntrinsicNode(X86ISD::VBROADCAST_LOAD, SDLoc(N), Tys, Ops,
+ MemVT, BCast->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(BCast, 1), ResNode.getValue(1));
+ return DAG.getBitcast(VT, ResNode);
+ }
}
// Since MMX types are special and don't usually play with other vector types,
@@ -36648,6 +38523,47 @@ static SDValue combineBitcast(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, SDLoc(N0), VT);
}
+ // Look for MOVMSK that is maybe truncated and then bitcasted to vXi1.
+ // Turn it into a sign bit compare that produces a k-register. This avoids
+ // a trip through a GPR.
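+ // Illustrative example (sketch): (v8i1 (bitcast (i8 (trunc (MOVMSK
+ // (v8f32 X)))))) becomes (v8i1 (setcc (v8i32 (bitcast X)), zero, setlt)),
+ // keeping the mask in a k-register.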
+ if (Subtarget.hasAVX512() && SrcVT.isScalarInteger() &&
+ VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
+ isPowerOf2_32(VT.getVectorNumElements())) {
+ unsigned NumElts = VT.getVectorNumElements();
+ SDValue Src = N0;
+
+ // Peek through truncate.
+ if (N0.getOpcode() == ISD::TRUNCATE && N0.hasOneUse())
+ Src = N0.getOperand(0);
+
+ if (Src.getOpcode() == X86ISD::MOVMSK && Src.hasOneUse()) {
+ SDValue MovmskIn = Src.getOperand(0);
+ MVT MovmskVT = MovmskIn.getSimpleValueType();
+ unsigned MovMskElts = MovmskVT.getVectorNumElements();
+
+ // We allow extra bits of the movmsk to be used since they are known zero.
+ // We can't convert a VPMOVMSKB without avx512bw.
+ if (MovMskElts <= NumElts &&
+ (Subtarget.hasBWI() || MovmskVT.getVectorElementType() != MVT::i8)) {
+ EVT IntVT = EVT(MovmskVT).changeVectorElementTypeToInteger();
+ MovmskIn = DAG.getBitcast(IntVT, MovmskIn);
+ SDLoc dl(N);
+ MVT CmpVT = MVT::getVectorVT(MVT::i1, MovMskElts);
+ SDValue Cmp = DAG.getSetCC(dl, CmpVT, MovmskIn,
+ DAG.getConstant(0, dl, IntVT), ISD::SETLT);
+ if (EVT(CmpVT) == VT)
+ return Cmp;
+
+ // Pad with zeroes up to the original VT to replace the zeroes that were
+ // being used from the MOVMSK.
+ unsigned NumConcats = NumElts / MovMskElts;
+ SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, dl, CmpVT));
+ Ops[0] = Cmp;
+ return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, Ops);
+ }
+ }
+ }
+
// Try to remove bitcasts from input and output of mask arithmetic to
// remove GPR<->K-register crossings.
if (SDValue V = combineCastedMaskArithmetic(N, DAG, DCI, Subtarget))
@@ -36772,12 +38688,9 @@ static SDValue combineHorizontalMinMaxResult(SDNode *Extract, SelectionDAG &DAG,
// First, reduce the source down to 128-bit, applying BinOp to lo/hi.
while (SrcVT.getSizeInBits() > 128) {
- unsigned NumElts = SrcVT.getVectorNumElements();
- unsigned NumSubElts = NumElts / 2;
- SrcVT = EVT::getVectorVT(*DAG.getContext(), SrcSVT, NumSubElts);
- unsigned SubSizeInBits = SrcVT.getSizeInBits();
- SDValue Lo = extractSubVector(MinPos, 0, DAG, DL, SubSizeInBits);
- SDValue Hi = extractSubVector(MinPos, NumSubElts, DAG, DL, SubSizeInBits);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(MinPos, DAG, DL);
+ SrcVT = Lo.getValueType();
MinPos = DAG.getNode(BinOp, DL, SrcVT, Lo, Hi);
}
assert(((SrcVT == MVT::v8i16 && ExtractVT == MVT::i16) ||
@@ -36864,6 +38777,25 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
EVT MovmskVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
Movmsk = DAG.getBitcast(MovmskVT, Match);
} else {
+ // For all_of(setcc(vec,0,eq)) - avoid vXi64 comparisons if we don't have
+ // PCMPEQQ (SSE41+), use PCMPEQD instead.
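+ // Illustrative example (sketch): all_of((v2i64 setcc X, zero, eq)) is
+ // re-tested as all_of((v4i32 setcc (bitcast X), zero, eq)), which PCMPEQD
+ // can handle on pre-SSE41 targets.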
+ if (BinOp == ISD::AND && !Subtarget.hasSSE41() &&
+ Match.getOpcode() == ISD::SETCC &&
+ ISD::isBuildVectorAllZeros(Match.getOperand(1).getNode()) &&
+ cast<CondCodeSDNode>(Match.getOperand(2))->get() ==
+ ISD::CondCode::SETEQ) {
+ SDValue Vec = Match.getOperand(0);
+ if (Vec.getValueType().getScalarType() == MVT::i64 &&
+ (2 * NumElts) <= MaxElts) {
+ NumElts *= 2;
+ EVT CmpVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
+ MatchVT = EVT::getVectorVT(*DAG.getContext(), MVT::i1, NumElts);
+ Match = DAG.getSetCC(
+ DL, MatchVT, DAG.getBitcast(CmpVT, Match.getOperand(0)),
+ DAG.getBitcast(CmpVT, Match.getOperand(1)), ISD::CondCode::SETEQ);
+ }
+ }
+
// Use combineBitcastvxi1 to create the MOVMSK.
while (NumElts > MaxElts) {
SDValue Lo, Hi;
@@ -36878,10 +38810,7 @@ static SDValue combineHorizontalPredicateResult(SDNode *Extract,
return SDValue();
Movmsk = DAG.getZExtOrTrunc(Movmsk, DL, NumElts > 32 ? MVT::i64 : MVT::i32);
} else {
- // Bail with AVX512VL (which uses predicate registers).
- if (Subtarget.hasVLX())
- return SDValue();
-
+ // FIXME: Better handling of k-registers or 512-bit vectors?
unsigned MatchSizeInBits = Match.getValueSizeInBits();
if (!(MatchSizeInBits == 128 ||
(MatchSizeInBits == 256 && Subtarget.hasAVX())))
@@ -36958,21 +38887,14 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
if (!Subtarget.hasSSE2())
return SDValue();
- // Verify the type we're extracting from is any integer type above i16.
- EVT VT = Extract->getOperand(0).getValueType();
- if (!VT.isSimple() || !(VT.getVectorElementType().getSizeInBits() > 16))
+ EVT ExtractVT = Extract->getValueType(0);
+ // Verify the type we're extracting is either i32 or i64.
+ // FIXME: Could support other types, but this is what we have coverage for.
+ if (ExtractVT != MVT::i32 && ExtractVT != MVT::i64)
return SDValue();
- unsigned RegSize = 128;
- if (Subtarget.useBWIRegs())
- RegSize = 512;
- else if (Subtarget.hasAVX())
- RegSize = 256;
-
- // We handle upto v16i* for SSE2 / v32i* for AVX / v64i* for AVX512.
- // TODO: We should be able to handle larger vectors by splitting them before
- // feeding them into several SADs, and then reducing over those.
- if (RegSize / VT.getVectorNumElements() < 8)
+ EVT VT = Extract->getOperand(0).getValueType();
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
// Match shuffle + add pyramid.
@@ -36988,8 +38910,8 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// (extends the sign bit which is zero).
// So it is correct to skip the sign/zero extend instruction.
if (Root && (Root.getOpcode() == ISD::SIGN_EXTEND ||
- Root.getOpcode() == ISD::ZERO_EXTEND ||
- Root.getOpcode() == ISD::ANY_EXTEND))
+ Root.getOpcode() == ISD::ZERO_EXTEND ||
+ Root.getOpcode() == ISD::ANY_EXTEND))
Root = Root.getOperand(0);
// If there was a match, we want Root to be a select that is the root of an
@@ -37009,7 +38931,7 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
// If the original vector was wider than 8 elements, sum over the results
// in the SAD vector.
unsigned Stages = Log2_32(VT.getVectorNumElements());
- MVT SadVT = SAD.getSimpleValueType();
+ EVT SadVT = SAD.getValueType();
if (Stages > 3) {
unsigned SadElems = SadVT.getVectorNumElements();
@@ -37024,12 +38946,12 @@ static SDValue combineBasicSADPattern(SDNode *Extract, SelectionDAG &DAG,
}
}
- MVT Type = Extract->getSimpleValueType(0);
- unsigned TypeSizeInBits = Type.getSizeInBits();
- // Return the lowest TypeSizeInBits bits.
- MVT ResVT = MVT::getVectorVT(Type, SadVT.getSizeInBits() / TypeSizeInBits);
+ unsigned ExtractSizeInBits = ExtractVT.getSizeInBits();
+ // Return the lowest ExtractSizeInBits bits.
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), ExtractVT,
+ SadVT.getSizeInBits() / ExtractSizeInBits);
SAD = DAG.getBitcast(ResVT, SAD);
- return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, Type, SAD,
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, ExtractVT, SAD,
Extract->getOperand(1));
}
@@ -37048,19 +38970,34 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
EVT SrcVT = Src.getValueType();
EVT SrcSVT = SrcVT.getVectorElementType();
+ unsigned SrcEltBits = SrcSVT.getSizeInBits();
unsigned NumSrcElts = SrcVT.getVectorNumElements();
// Don't attempt this for boolean mask vectors or unknown extraction indices.
if (SrcSVT == MVT::i1 || !isa<ConstantSDNode>(Idx))
return SDValue();
+ const APInt &IdxC = N->getConstantOperandAPInt(1);
+ if (IdxC.uge(NumSrcElts))
+ return SDValue();
+
SDValue SrcBC = peekThroughBitcasts(Src);
- // Handle extract(broadcast(scalar_value)), it doesn't matter what index is.
+ // Handle extract(bitcast(broadcast(scalar_value))).
if (X86ISD::VBROADCAST == SrcBC.getOpcode()) {
SDValue SrcOp = SrcBC.getOperand(0);
- if (SrcOp.getValueSizeInBits() == VT.getSizeInBits())
- return DAG.getBitcast(VT, SrcOp);
+ EVT SrcOpVT = SrcOp.getValueType();
+ if (SrcOpVT.isScalarInteger() && VT.isInteger() &&
+ (SrcOpVT.getSizeInBits() % SrcEltBits) == 0) {
+ unsigned Scale = SrcOpVT.getSizeInBits() / SrcEltBits;
+ unsigned Offset = IdxC.urem(Scale) * SrcEltBits;
+ // TODO support non-zero offsets.
+ if (Offset == 0) {
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, SrcVT.getScalarType());
+ SrcOp = DAG.getZExtOrTrunc(SrcOp, dl, VT);
+ return SrcOp;
+ }
+ }
}
// If we're extracting a single element from a broadcast load and there are
@@ -37069,22 +39006,43 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
auto *MemIntr = cast<MemIntrinsicSDNode>(SrcBC);
unsigned SrcBCWidth = SrcBC.getScalarValueSizeInBits();
if (MemIntr->getMemoryVT().getSizeInBits() == SrcBCWidth &&
- VT.getSizeInBits() == SrcBCWidth) {
+ VT.getSizeInBits() == SrcBCWidth && SrcEltBits == SrcBCWidth) {
SDValue Load = DAG.getLoad(VT, dl, MemIntr->getChain(),
MemIntr->getBasePtr(),
MemIntr->getPointerInfo(),
- MemIntr->getAlignment(),
+ MemIntr->getOriginalAlign(),
MemIntr->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), Load.getValue(1));
return Load;
}
}
+ // Handle extract(bitcast(scalar_to_vector(scalar_value))) for integers.
+ // TODO: Move to DAGCombine?
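+ // Illustrative example (sketch): (i8 (extract_vector_elt (v8i8 (bitcast
+ // (v2i32 (scalar_to_vector (i32 X))))), 2)) becomes (i8 (trunc (srl X, 16))),
+ // avoiding the vector extract entirely.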
+ if (SrcBC.getOpcode() == ISD::SCALAR_TO_VECTOR && VT.isInteger() &&
+ SrcBC.getValueType().isInteger() &&
+ (SrcBC.getScalarValueSizeInBits() % SrcEltBits) == 0 &&
+ SrcBC.getScalarValueSizeInBits() ==
+ SrcBC.getOperand(0).getValueSizeInBits()) {
+ unsigned Scale = SrcBC.getScalarValueSizeInBits() / SrcEltBits;
+ if (IdxC.ult(Scale)) {
+ unsigned Offset = IdxC.getZExtValue() * SrcVT.getScalarSizeInBits();
+ SDValue Scl = SrcBC.getOperand(0);
+ EVT SclVT = Scl.getValueType();
+ if (Offset) {
+ Scl = DAG.getNode(ISD::SRL, dl, SclVT, Scl,
+ DAG.getShiftAmountConstant(Offset, SclVT, dl));
+ }
+ Scl = DAG.getZExtOrTrunc(Scl, dl, SrcVT.getScalarType());
+ Scl = DAG.getZExtOrTrunc(Scl, dl, VT);
+ return Scl;
+ }
+ }
+
// Handle extract(truncate(x)) for 0'th index.
// TODO: Treat this as a faux shuffle?
// TODO: When can we use this for general indices?
- if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() &&
- isNullConstant(Idx)) {
+ if (ISD::TRUNCATE == Src.getOpcode() && SrcVT.is128BitVector() && IdxC == 0) {
Src = extract128BitVector(Src.getOperand(0), 0, DAG, dl);
Src = DAG.getBitcast(SrcVT, Src);
return DAG.getNode(N->getOpcode(), dl, VT, Src, Idx);
@@ -37096,12 +39054,18 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (!getTargetShuffleInputs(SrcBC, Ops, Mask, DAG))
return SDValue();
+ // Shuffle inputs must be the same size as the result.
+ if (llvm::any_of(Ops, [SrcVT](SDValue Op) {
+ return SrcVT.getSizeInBits() != Op.getValueSizeInBits();
+ }))
+ return SDValue();
+
// Attempt to narrow/widen the shuffle mask to the correct size.
if (Mask.size() != NumSrcElts) {
if ((NumSrcElts % Mask.size()) == 0) {
SmallVector<int, 16> ScaledMask;
int Scale = NumSrcElts / Mask.size();
- scaleShuffleMask<int>(Scale, Mask, ScaledMask);
+ narrowShuffleMaskElts(Scale, Mask, ScaledMask);
Mask = std::move(ScaledMask);
} else if ((Mask.size() % NumSrcElts) == 0) {
// Simplify Mask based on demanded element.
@@ -37126,7 +39090,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if (Mask.size() != NumSrcElts)
return SDValue();
- int SrcIdx = Mask[N->getConstantOperandVal(1)];
+ int SrcIdx = Mask[IdxC.getZExtValue()];
// If the shuffle source element is undef/zero then we can just accept it.
if (SrcIdx == SM_SentinelUndef)
@@ -37153,8 +39117,7 @@ static SDValue combineExtractWithShuffle(SDNode *N, SelectionDAG &DAG,
if ((SrcVT == MVT::v8i16 && Subtarget.hasSSE2()) ||
(SrcVT == MVT::v16i8 && Subtarget.hasSSE41())) {
- assert(VT.getSizeInBits() >= SrcSVT.getSizeInBits() &&
- "Unexpected extraction type");
+ assert(VT.getSizeInBits() >= SrcEltBits && "Unexpected extraction type");
unsigned OpCode = (SrcVT == MVT::v8i16 ? X86ISD::PEXTRW : X86ISD::PEXTRB);
SrcOp = DAG.getBitcast(SrcVT, SrcOp);
SDValue ExtOp = DAG.getNode(OpCode, dl, MVT::i32, SrcOp,
@@ -37324,12 +39287,10 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
// vXi8 reduction - sum lo/hi halves then use PSADBW.
if (VT == MVT::i8) {
while (Rdx.getValueSizeInBits() > 128) {
- unsigned HalfSize = VecVT.getSizeInBits() / 2;
- unsigned HalfElts = VecVT.getVectorNumElements() / 2;
- SDValue Lo = extractSubVector(Rdx, 0, DAG, DL, HalfSize);
- SDValue Hi = extractSubVector(Rdx, HalfElts, DAG, DL, HalfSize);
- Rdx = DAG.getNode(ISD::ADD, DL, Lo.getValueType(), Lo, Hi);
- VecVT = Rdx.getValueType();
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = splitVector(Rdx, DAG, DL);
+ VecVT = Lo.getValueType();
+ Rdx = DAG.getNode(ISD::ADD, DL, VecVT, Lo, Hi);
}
assert(VecVT == MVT::v16i8 && "v16i8 reduction expected");
@@ -37344,8 +39305,7 @@ static SDValue combineReductionToHorizontal(SDNode *ExtElt, SelectionDAG &DAG,
}
// Only use (F)HADD opcodes if they aren't microcoded or minimizes codesize.
- bool OptForSize = DAG.getMachineFunction().getFunction().hasOptSize();
- if (!Subtarget.hasFastHorizontalOps() && !OptForSize)
+ if (!shouldUseHorizontalOp(true, DAG, Subtarget))
return SDValue();
unsigned HorizOpcode = Opc == ISD::ADD ? X86ISD::HADD : X86ISD::FHADD;
@@ -37477,11 +39437,21 @@ static SDValue combineExtractVectorElt(SDNode *N, SelectionDAG &DAG,
// Attempt to extract a i1 element by using MOVMSK to extract the signbits
// and then testing the relevant element.
+ //
+ // Note that we only combine extracts on the *same* result number, i.e.
+ // t0 = merge_values a0, a1, a2, a3
+ // i1 = extract_vector_elt t0, Constant:i64<2>
+ // i1 = extract_vector_elt t0, Constant:i64<3>
+ // but not
+ // i1 = extract_vector_elt t0:1, Constant:i64<2>
+ // since the latter would need its own MOVMSK.
if (CIdx && SrcVT.getScalarType() == MVT::i1) {
SmallVector<SDNode *, 16> BoolExtracts;
- auto IsBoolExtract = [&BoolExtracts](SDNode *Use) {
+ unsigned ResNo = InputVector.getResNo();
+ auto IsBoolExtract = [&BoolExtracts, &ResNo](SDNode *Use) {
if (Use->getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isa<ConstantSDNode>(Use->getOperand(1)) &&
+ Use->getOperand(0).getResNo() == ResNo &&
Use->getValueType(0) == MVT::i1) {
BoolExtracts.push_back(Use);
return true;
@@ -37530,8 +39500,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
assert(CondVT.isVector() && "Vector select expects a vector selector!");
- // Check if the first operand is all zeros and Cond type is vXi1.
- // This situation only applies to avx512.
// TODO: Use isNullOrNullSplat() to distinguish constants with undefs?
// TODO: Can we assert that both operands are not zeros (because that should
// get simplified at node creation time)?
@@ -37546,14 +39514,6 @@ combineVSelectWithAllOnesOrZeros(SDNode *N, SelectionDAG &DAG,
return DAG.getConstant(0, DL, VT);
}
- if (TValIsAllZeros && !FValIsAllZeros && Subtarget.hasAVX512() &&
- Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1) {
- // Invert the cond to not(cond) : xor(op,allones)=not(op)
- SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
- // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
- return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
- }
-
// To use the condition operand as a bitwise mask, it must have elements that
// are the same size as the select elements. Ie, the condition operand must
// have already been promoted from the IR select condition type <N x i1>.
@@ -37778,12 +39738,13 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
return true;
};
+ APInt DemandedBits(APInt::getSignMask(BitWidth));
+
if (OnlyUsedAsSelectCond(Cond)) {
- APInt DemandedMask(APInt::getSignMask(BitWidth));
KnownBits Known;
TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
!DCI.isBeforeLegalizeOps());
- if (!TLI.SimplifyDemandedBits(Cond, DemandedMask, Known, TLO, 0, true))
+ if (!TLI.SimplifyDemandedBits(Cond, DemandedBits, Known, TLO, 0, true))
return SDValue();
// If we changed the computation somewhere in the DAG, this change will
@@ -37805,15 +39766,9 @@ static SDValue combineVSelectToBLENDV(SDNode *N, SelectionDAG &DAG,
}
// Otherwise we can still at least try to simplify multiple use bits.
- APInt DemandedMask(APInt::getSignMask(BitWidth));
- APInt DemandedElts(APInt::getAllOnesValue(VT.getVectorNumElements()));
- KnownBits Known;
- TargetLowering::TargetLoweringOpt TLO(DAG, !DCI.isBeforeLegalize(),
- !DCI.isBeforeLegalizeOps());
- if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedMask,
- DemandedElts, DAG, 0))
- return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0),
- V, N->getOperand(1), N->getOperand(2));
+ if (SDValue V = TLI.SimplifyMultipleUseDemandedBits(Cond, DemandedBits, DAG))
+ return DAG.getNode(X86ISD::BLENDV, SDLoc(N), N->getValueType(0), V,
+ N->getOperand(1), N->getOperand(2));
return SDValue();
}
@@ -38297,6 +40252,19 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
}
}
+ // Check if the first operand is all zeros and Cond type is vXi1.
+ // If this is an avx512 target, we can improve the use of zero masking by
+ // swapping the operands and inverting the condition.
+ if (N->getOpcode() == ISD::VSELECT && Cond.hasOneUse() &&
+ Subtarget.hasAVX512() && CondVT.getVectorElementType() == MVT::i1 &&
+ ISD::isBuildVectorAllZeros(LHS.getNode()) &&
+ !ISD::isBuildVectorAllZeros(RHS.getNode())) {
+ // Invert the cond to not(cond) : xor(op,allones)=not(op)
+ SDValue CondNew = DAG.getNOT(DL, Cond, CondVT);
+ // Vselect cond, op1, op2 = Vselect not(cond), op2, op1
+ return DAG.getSelect(DL, VT, CondNew, RHS, LHS);
+ }
+
// Early exit check
if (!TLI.isTypeLegal(VT))
return SDValue();
@@ -38316,12 +40284,86 @@ static SDValue combineSelect(SDNode *N, SelectionDAG &DAG,
return DAG.getNode(N->getOpcode(), DL, VT,
DAG.getBitcast(CondVT, CondNot), RHS, LHS);
- // Custom action for SELECT MMX
- if (VT == MVT::x86mmx) {
- LHS = DAG.getBitcast(MVT::i64, LHS);
- RHS = DAG.getBitcast(MVT::i64, RHS);
- SDValue newSelect = DAG.getNode(ISD::SELECT, DL, MVT::i64, Cond, LHS, RHS);
- return DAG.getBitcast(VT, newSelect);
+ // Try to optimize vXi1 selects if both operands are either all constants or
+ // bitcasts from scalar integer type. In that case we can convert the operands
+ // to integer and use an integer select which will be converted to a CMOV.
+ // We need to take a little bit of care to avoid creating an i64 type after
+ // type legalization.
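+ // Illustrative example (sketch): (select Cond, (v16i1 (bitcast (i16 X))),
+ // (v16i1 all-zeros)) becomes (v16i1 (bitcast (select Cond, (i16 X), (i16 0)))),
+ // which can then lower as a scalar CMOV.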
+ if (N->getOpcode() == ISD::SELECT && VT.isVector() &&
+ VT.getVectorElementType() == MVT::i1 &&
+ (DCI.isBeforeLegalize() || (VT != MVT::v64i1 || Subtarget.is64Bit()))) {
+ EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), VT.getVectorNumElements());
+ bool LHSIsConst = ISD::isBuildVectorOfConstantSDNodes(LHS.getNode());
+ bool RHSIsConst = ISD::isBuildVectorOfConstantSDNodes(RHS.getNode());
+
+ if ((LHSIsConst ||
+ (LHS.getOpcode() == ISD::BITCAST &&
+ LHS.getOperand(0).getValueType() == IntVT)) &&
+ (RHSIsConst ||
+ (RHS.getOpcode() == ISD::BITCAST &&
+ RHS.getOperand(0).getValueType() == IntVT))) {
+ if (LHSIsConst)
+ LHS = combinevXi1ConstantToInteger(LHS, DAG);
+ else
+ LHS = LHS.getOperand(0);
+
+ if (RHSIsConst)
+ RHS = combinevXi1ConstantToInteger(RHS, DAG);
+ else
+ RHS = RHS.getOperand(0);
+
+ SDValue Select = DAG.getSelect(DL, IntVT, Cond, LHS, RHS);
+ return DAG.getBitcast(VT, Select);
+ }
+ }
+
+ // If this is "((X & C) == 0) ? Y : Z" and C is a constant mask vector of
+ // single bits, then invert the predicate and swap the select operands.
+ // This can lower using a vector shift bit-hack rather than mask and compare.
+ if (DCI.isBeforeLegalize() && !Subtarget.hasAVX512() &&
+ N->getOpcode() == ISD::VSELECT && Cond.getOpcode() == ISD::SETCC &&
+ Cond.hasOneUse() && CondVT.getVectorElementType() == MVT::i1 &&
+ Cond.getOperand(0).getOpcode() == ISD::AND &&
+ isNullOrNullSplat(Cond.getOperand(1)) &&
+ cast<CondCodeSDNode>(Cond.getOperand(2))->get() == ISD::SETEQ &&
+ Cond.getOperand(0).getValueType() == VT) {
+ // The 'and' mask must be composed of power-of-2 constants.
+ SDValue And = Cond.getOperand(0);
+ auto *C = isConstOrConstSplat(And.getOperand(1));
+ if (C && C->getAPIntValue().isPowerOf2()) {
+ // vselect (X & C == 0), LHS, RHS --> vselect (X & C != 0), RHS, LHS
+ SDValue NotCond =
+ DAG.getSetCC(DL, CondVT, And, Cond.getOperand(1), ISD::SETNE);
+ return DAG.getSelect(DL, VT, NotCond, RHS, LHS);
+ }
+
+ // If we have a non-splat but still powers-of-2 mask, AVX1 can use pmulld
+ // and AVX2 can use vpsllv{dq}. 8-bit lacks a proper shift or multiply.
+ // 16-bit lacks a proper blendv.
+ unsigned EltBitWidth = VT.getScalarSizeInBits();
+ bool CanShiftBlend =
+ TLI.isTypeLegal(VT) && ((Subtarget.hasAVX() && EltBitWidth == 32) ||
+ (Subtarget.hasAVX2() && EltBitWidth == 64) ||
+ (Subtarget.hasXOP()));
+ if (CanShiftBlend &&
+ ISD::matchUnaryPredicate(And.getOperand(1), [](ConstantSDNode *C) {
+ return C->getAPIntValue().isPowerOf2();
+ })) {
+ // Create a left-shift constant to get the mask bits over to the sign-bit.
+ SDValue Mask = And.getOperand(1);
+ SmallVector<int, 32> ShlVals;
+ for (unsigned i = 0, e = VT.getVectorNumElements(); i != e; ++i) {
+ auto *MaskVal = cast<ConstantSDNode>(Mask.getOperand(i));
+ ShlVals.push_back(EltBitWidth - 1 -
+ MaskVal->getAPIntValue().exactLogBase2());
+ }
+ // vsel ((X & C) == 0), LHS, RHS --> vsel ((shl X, C') < 0), RHS, LHS
+ SDValue ShlAmt = getConstVector(ShlVals, VT.getSimpleVT(), DAG, DL);
+ SDValue Shl = DAG.getNode(ISD::SHL, DL, VT, And.getOperand(0), ShlAmt);
+ SDValue NewCond =
+ DAG.getSetCC(DL, CondVT, Shl, Cond.getOperand(1), ISD::SETLT);
+ return DAG.getSelect(DL, VT, NewCond, RHS, LHS);
+ }
}
return SDValue();
@@ -38647,6 +40689,282 @@ static SDValue combineCarryThroughADD(SDValue EFLAGS, SelectionDAG &DAG) {
return SDValue();
}
+/// If we are inverting a PTEST/TESTP operand, attempt to adjust the CC
+/// to avoid the inversion.
+static SDValue combinePTESTCC(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // TODO: Handle X86ISD::KTEST/X86ISD::KORTEST.
+ if (EFLAGS.getOpcode() != X86ISD::PTEST &&
+ EFLAGS.getOpcode() != X86ISD::TESTP)
+ return SDValue();
+
+ // PTEST/TESTP sets EFLAGS as:
+ // TESTZ: ZF = (Op0 & Op1) == 0
+ // TESTC: CF = (~Op0 & Op1) == 0
+ // TESTNZC: ZF == 0 && CF == 0
+ EVT VT = EFLAGS.getValueType();
+ SDValue Op0 = EFLAGS.getOperand(0);
+ SDValue Op1 = EFLAGS.getOperand(1);
+ EVT OpVT = Op0.getValueType();
+
+ // TEST*(~X,Y) == TEST*(X,Y)
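+ // Illustrative example (sketch): with CC == COND_E, (PTEST (xor X, -1), Y)
+ // is rebuilt as (PTEST X, Y) with CC changed to COND_B, i.e. the TESTZ check
+ // becomes a TESTC check and the NOT is no longer needed.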
+ if (SDValue NotOp0 = IsNOT(Op0, DAG)) {
+ X86::CondCode InvCC;
+ switch (CC) {
+ case X86::COND_B:
+ // testc -> testz.
+ InvCC = X86::COND_E;
+ break;
+ case X86::COND_AE:
+ // !testc -> !testz.
+ InvCC = X86::COND_NE;
+ break;
+ case X86::COND_E:
+ // testz -> testc.
+ InvCC = X86::COND_B;
+ break;
+ case X86::COND_NE:
+ // !testz -> !testc.
+ InvCC = X86::COND_AE;
+ break;
+ case X86::COND_A:
+ case X86::COND_BE:
+ // testnzc -> testnzc (no change).
+ InvCC = CC;
+ break;
+ default:
+ InvCC = X86::COND_INVALID;
+ break;
+ }
+
+ if (InvCC != X86::COND_INVALID) {
+ CC = InvCC;
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp0), Op1);
+ }
+ }
+
+ if (CC == X86::COND_E || CC == X86::COND_NE) {
+ // TESTZ(X,~Y) == TESTC(Y,X)
+ if (SDValue NotOp1 = IsNOT(Op1, DAG)) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, NotOp1), Op0);
+ }
+
+ if (Op0 == Op1) {
+ SDValue BC = peekThroughBitcasts(Op0);
+ EVT BCVT = BC.getValueType();
+ assert(BCVT.isVector() && DAG.getTargetLoweringInfo().isTypeLegal(BCVT) &&
+ "Unexpected vector type");
+
+ // TESTZ(AND(X,Y),AND(X,Y)) == TESTZ(X,Y)
+ if (BC.getOpcode() == ISD::AND || BC.getOpcode() == X86ISD::FAND) {
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // TESTZ(AND(~X,Y),AND(~X,Y)) == TESTC(X,Y)
+ if (BC.getOpcode() == X86ISD::ANDNP || BC.getOpcode() == X86ISD::FANDN) {
+ CC = (CC == X86::COND_E ? X86::COND_B : X86::COND_AE);
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT,
+ DAG.getBitcast(OpVT, BC.getOperand(0)),
+ DAG.getBitcast(OpVT, BC.getOperand(1)));
+ }
+
+ // If every element is an all-sign value, see if we can use MOVMSK to
+ // more efficiently extract the sign bits and compare that.
+ // TODO: Handle TESTC with comparison inversion.
+ // TODO: Can we remove SimplifyMultipleUseDemandedBits and rely on
+ // MOVMSK combines to make sure its never worse than PTEST?
+ unsigned EltBits = BCVT.getScalarSizeInBits();
+ if (DAG.ComputeNumSignBits(BC) == EltBits) {
+ assert(VT == MVT::i32 && "Expected i32 EFLAGS comparison result");
+ APInt SignMask = APInt::getSignMask(EltBits);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (SDValue Res =
+ TLI.SimplifyMultipleUseDemandedBits(BC, SignMask, DAG)) {
+ // For vXi16 cases we need to use pmovmskb and extract every other
+ // sign bit.
+ SDLoc DL(EFLAGS);
+ if (EltBits == 16) {
+ MVT MovmskVT = BCVT.is128BitVector() ? MVT::v16i8 : MVT::v32i8;
+ Res = DAG.getBitcast(MovmskVT, Res);
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ Res = DAG.getNode(ISD::AND, DL, MVT::i32, Res,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ } else {
+ Res = getPMOVMSKB(DL, Res, DAG, Subtarget);
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Res,
+ DAG.getConstant(0, DL, MVT::i32));
+ }
+ }
+ }
+
+ // TESTZ(-1,X) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op0.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op1, Op1);
+
+ // TESTZ(X,-1) == TESTZ(X,X)
+ if (ISD::isBuildVectorAllOnes(Op1.getNode()))
+ return DAG.getNode(EFLAGS.getOpcode(), SDLoc(EFLAGS), VT, Op0, Op0);
+ }
+
+ return SDValue();
+}
+
+// Attempt to simplify the MOVMSK input based on the comparison type.
+static SDValue combineSetCCMOVMSK(SDValue EFLAGS, X86::CondCode &CC,
+ SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ // Handle eq/ne against zero (any_of).
+ // Handle eq/ne against -1 (all_of).
+ if (!(CC == X86::COND_E || CC == X86::COND_NE))
+ return SDValue();
+ if (EFLAGS.getValueType() != MVT::i32)
+ return SDValue();
+ unsigned CmpOpcode = EFLAGS.getOpcode();
+ if (CmpOpcode != X86ISD::CMP && CmpOpcode != X86ISD::SUB)
+ return SDValue();
+ auto *CmpConstant = dyn_cast<ConstantSDNode>(EFLAGS.getOperand(1));
+ if (!CmpConstant)
+ return SDValue();
+ const APInt &CmpVal = CmpConstant->getAPIntValue();
+
+ SDValue CmpOp = EFLAGS.getOperand(0);
+ unsigned CmpBits = CmpOp.getValueSizeInBits();
+ assert(CmpBits == CmpVal.getBitWidth() && "Value size mismatch");
+
+ // Peek through any truncate.
+ if (CmpOp.getOpcode() == ISD::TRUNCATE)
+ CmpOp = CmpOp.getOperand(0);
+
+ // Bail if we don't find a MOVMSK.
+ if (CmpOp.getOpcode() != X86ISD::MOVMSK)
+ return SDValue();
+
+ SDValue Vec = CmpOp.getOperand(0);
+ MVT VecVT = Vec.getSimpleValueType();
+ assert((VecVT.is128BitVector() || VecVT.is256BitVector()) &&
+ "Unexpected MOVMSK operand");
+ unsigned NumElts = VecVT.getVectorNumElements();
+ unsigned NumEltBits = VecVT.getScalarSizeInBits();
+
+ bool IsAnyOf = CmpOpcode == X86ISD::CMP && CmpVal.isNullValue();
+ bool IsAllOf = CmpOpcode == X86ISD::SUB && NumElts <= CmpBits &&
+ CmpVal.isMask(NumElts);
+ if (!IsAnyOf && !IsAllOf)
+ return SDValue();
+
+ // See if we can peek through to a vector with a wider element type, if the
+ // signbits extend down to all the sub-elements as well.
+ // Calling MOVMSK with the wider type, avoiding the bitcast, helps expose
+ // potential SimplifyDemandedBits/Elts cases.
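+ // Illustrative example (sketch): for an any_of test,
+ // (MOVMSK (v16i8 (bitcast (v4i32 C)))) == 0 can be tested as
+ // (MOVMSK (v4i32 C)) == 0 when each i32 lane's sign bit is known to extend
+ // down through all of its bytes.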
+ if (Vec.getOpcode() == ISD::BITCAST) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ MVT BCVT = BC.getSimpleValueType();
+ unsigned BCNumElts = BCVT.getVectorNumElements();
+ unsigned BCNumEltBits = BCVT.getScalarSizeInBits();
+ if ((BCNumEltBits == 32 || BCNumEltBits == 64) &&
+ BCNumEltBits > NumEltBits &&
+ DAG.ComputeNumSignBits(BC) > (BCNumEltBits - NumEltBits)) {
+ SDLoc DL(EFLAGS);
+ unsigned CmpMask = IsAnyOf ? 0 : ((1 << BCNumElts) - 1);
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32,
+ DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, BC),
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(PCMPEQ(X,0)) == -1 -> PTESTZ(X,X).
+ // MOVMSK(PCMPEQ(X,0)) != -1 -> !PTESTZ(X,X).
+ if (IsAllOf && Subtarget.hasSSE41()) {
+ SDValue BC = peekThroughBitcasts(Vec);
+ if (BC.getOpcode() == X86ISD::PCMPEQ &&
+ ISD::isBuildVectorAllZeros(BC.getOperand(1).getNode())) {
+ MVT TestVT = VecVT.is128BitVector() ? MVT::v2i64 : MVT::v4i64;
+ SDValue V = DAG.getBitcast(TestVT, BC.getOperand(0));
+ return DAG.getNode(X86ISD::PTEST, SDLoc(EFLAGS), MVT::i32, V, V);
+ }
+ }
+
+ // See if we can avoid a PACKSS by calling MOVMSK on the sources.
+ // For vXi16 cases we can use a v2Xi8 PMOVMSKB. We must mask away the
+ // low-byte sign bits prior to the comparison with zero unless we know that
+ // the vXi16 value splats its sign bit down into the lower i8 half.
+ // TODO: Handle all_of patterns.
+ if (Vec.getOpcode() == X86ISD::PACKSS && VecVT == MVT::v16i8) {
+ SDValue VecOp0 = Vec.getOperand(0);
+ SDValue VecOp1 = Vec.getOperand(1);
+ bool SignExt0 = DAG.ComputeNumSignBits(VecOp0) > 8;
+ bool SignExt1 = DAG.ComputeNumSignBits(VecOp1) > 8;
+ // PMOVMSKB(PACKSSBW(X, undef)) -> PMOVMSKB(BITCAST_v16i8(X)) & 0xAAAA.
+ if (IsAnyOf && CmpBits == 8 && VecOp1.isUndef()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v16i8, VecOp0);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result = DAG.getZExtOrTrunc(Result, DL, MVT::i16);
+ if (!SignExt0) {
+ Result = DAG.getNode(ISD::AND, DL, MVT::i16, Result,
+ DAG.getConstant(0xAAAA, DL, MVT::i16));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(0, DL, MVT::i16));
+ }
+ // PMOVMSKB(PACKSSBW(LO(X), HI(X)))
+ // -> PMOVMSKB(BITCAST_v32i8(X)) & 0xAAAAAAAA.
+ if (CmpBits == 16 && Subtarget.hasInt256() &&
+ VecOp0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ VecOp0.getOperand(0) == VecOp1.getOperand(0) &&
+ VecOp0.getConstantOperandAPInt(1) == 0 &&
+ VecOp1.getConstantOperandAPInt(1) == 8 &&
+ (IsAnyOf || (SignExt0 && SignExt1))) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(MVT::v32i8, VecOp0.getOperand(0));
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ unsigned CmpMask = IsAnyOf ? 0 : 0xFFFFFFFF;
+ if (!SignExt0 || !SignExt1) {
+ assert(IsAnyOf && "Only perform v16i16 signmasks for any_of patterns");
+ Result = DAG.getNode(ISD::AND, DL, MVT::i32, Result,
+ DAG.getConstant(0xAAAAAAAA, DL, MVT::i32));
+ }
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ DAG.getConstant(CmpMask, DL, MVT::i32));
+ }
+ }
+
+ // MOVMSK(SHUFFLE(X,u)) -> MOVMSK(X) iff every element is referenced.
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ if (NumElts == CmpBits &&
+ getTargetShuffleInputs(peekThroughBitcasts(Vec), ShuffleInputs,
+ ShuffleMask, DAG) &&
+ ShuffleInputs.size() == 1 && !isAnyZeroOrUndef(ShuffleMask) &&
+ ShuffleInputs[0].getValueSizeInBits() == VecVT.getSizeInBits()) {
+ unsigned NumShuffleElts = ShuffleMask.size();
+ APInt DemandedElts = APInt::getNullValue(NumShuffleElts);
+ for (int M : ShuffleMask) {
+ assert(0 <= M && M < (int)NumShuffleElts && "Bad unary shuffle index");
+ DemandedElts.setBit(M);
+ }
+ if (DemandedElts.isAllOnesValue()) {
+ SDLoc DL(EFLAGS);
+ SDValue Result = DAG.getBitcast(VecVT, ShuffleInputs[0]);
+ Result = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+ Result =
+ DAG.getZExtOrTrunc(Result, DL, EFLAGS.getOperand(0).getValueType());
+ return DAG.getNode(X86ISD::CMP, DL, MVT::i32, Result,
+ EFLAGS.getOperand(1));
+ }
+ }
+
+ return SDValue();
+}
+
/// Optimize an EFLAGS definition used according to the condition code \p CC
/// into a simpler EFLAGS value, potentially returning a new \p CC and replacing
/// uses of chain values.
@@ -38659,6 +40977,13 @@ static SDValue combineSetCCEFLAGS(SDValue EFLAGS, X86::CondCode &CC,
if (SDValue R = checkBoolTestSetCCCombine(EFLAGS, CC))
return R;
+
+ if (SDValue R = combinePTESTCC(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
+ if (SDValue R = combineSetCCMOVMSK(EFLAGS, CC, DAG, Subtarget))
+ return R;
+
return combineSetCCAtomicArith(EFLAGS, CC, DAG, Subtarget);
}
@@ -38680,7 +41005,10 @@ static SDValue combineCMov(SDNode *N, SelectionDAG &DAG,
// Try to simplify the EFLAGS and condition code operands.
// We can't always do this as FCMOV only supports a subset of X86 cond.
if (SDValue Flags = combineSetCCEFLAGS(Cond, CC, DAG, Subtarget)) {
- if (FalseOp.getValueType() != MVT::f80 || hasFPCMov(CC)) {
+ if (!(FalseOp.getValueType() == MVT::f80 ||
+ (FalseOp.getValueType() == MVT::f64 && !Subtarget.hasSSE2()) ||
+ (FalseOp.getValueType() == MVT::f32 && !Subtarget.hasSSE1())) ||
+ !Subtarget.hasCMov() || hasFPCMov(CC)) {
SDValue Ops[] = {FalseOp, TrueOp, DAG.getTargetConstant(CC, DL, MVT::i8),
Flags};
return DAG.getNode(X86ISD::CMOV, DL, N->getValueType(0), Ops);
@@ -38989,7 +41317,7 @@ static SDValue reduceVMULWidth(SDNode *N, SelectionDAG &DAG,
: ISD::SIGN_EXTEND,
DL, VT, MulLo);
- MVT ResVT = MVT::getVectorVT(MVT::i32, NumElts / 2);
+ EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts / 2);
// Generate the higher part of mul: pmulhw/pmulhuw. For MULU16/MULS16,
// the higher part is also needed.
SDValue MulHi =
@@ -39120,10 +41448,14 @@ static SDValue combineMulToPMADDWD(SDNode *N, SelectionDAG &DAG,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i32)
return SDValue();
- // Make sure the vXi16 type is legal. This covers the AVX512 without BWI case.
- // Also allow v2i32 if it will be widened.
+ // Make sure the type is legal or will be widened to a legal type.
+ if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ return SDValue();
+
MVT WVT = MVT::getVectorVT(MVT::i16, 2 * VT.getVectorNumElements());
- if (VT != MVT::v2i32 && !DAG.getTargetLoweringInfo().isTypeLegal(WVT))
+
+ // Without BWI, we would need to split v32i16.
+ if (WVT == MVT::v32i16 && !Subtarget.hasBWI())
return SDValue();
SDValue N0 = N->getOperand(0);
@@ -39340,6 +41672,64 @@ static SDValue combineMul(SDNode *N, SelectionDAG &DAG,
return NewMul;
}
+// Try to form a MULHU or MULHS node by looking for
+// (srl (mul ext, ext), 16)
+// TODO: This is X86 specific because we want to be able to handle wide types
+// before type legalization. But we can only do it if the vector will be
+// legalized via widening/splitting. Type legalization can't handle promotion
+// of a MULHU/MULHS. There isn't a way to convey this to the generic DAG
+// combiner.
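+// Illustrative example (sketch): (srl (mul (zext (v8i16 X)), (zext (v8i16 Y))),
+// 16) becomes (zext (mulhu X, Y)), which can then select to PMULHUW; the
+// signed case (sext + sra) maps to mulhs / PMULHW in the same way.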
+static SDValue combineShiftToPMULH(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ assert((N->getOpcode() == ISD::SRL || N->getOpcode() == ISD::SRA) &&
+ "SRL or SRA node is required here!");
+ SDLoc DL(N);
+
+ // Only do this with SSE4.1. On earlier targets reduceVMULWidth will expand
+ // the multiply.
+ if (!Subtarget.hasSSE41())
+ return SDValue();
+
+ // The operation feeding into the shift must be a multiply.
+ SDValue ShiftOperand = N->getOperand(0);
+ if (ShiftOperand.getOpcode() != ISD::MUL || !ShiftOperand.hasOneUse())
+ return SDValue();
+
+ // Input type should be at least vXi32.
+ EVT VT = N->getValueType(0);
+ if (!VT.isVector() || VT.getVectorElementType().getSizeInBits() < 32)
+ return SDValue();
+
+ // Need a shift by 16.
+ APInt ShiftAmt;
+ if (!ISD::isConstantSplatVector(N->getOperand(1).getNode(), ShiftAmt) ||
+ ShiftAmt != 16)
+ return SDValue();
+
+ SDValue LHS = ShiftOperand.getOperand(0);
+ SDValue RHS = ShiftOperand.getOperand(1);
+
+ unsigned ExtOpc = LHS.getOpcode();
+ if ((ExtOpc != ISD::SIGN_EXTEND && ExtOpc != ISD::ZERO_EXTEND) ||
+ RHS.getOpcode() != ExtOpc)
+ return SDValue();
+
+ // Peek through the extends.
+ LHS = LHS.getOperand(0);
+ RHS = RHS.getOperand(0);
+
+ // Ensure the input types match.
+ EVT MulVT = LHS.getValueType();
+ if (MulVT.getVectorElementType() != MVT::i16 || RHS.getValueType() != MulVT)
+ return SDValue();
+
+ unsigned Opc = ExtOpc == ISD::SIGN_EXTEND ? ISD::MULHS : ISD::MULHU;
+ SDValue Mulh = DAG.getNode(Opc, DL, MulVT, LHS, RHS);
+
+ ExtOpc = N->getOpcode() == ISD::SRA ? ISD::SIGN_EXTEND : ISD::ZERO_EXTEND;
+ return DAG.getNode(ExtOpc, DL, VT, Mulh);
+}
+
static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
@@ -39399,12 +41789,16 @@ static SDValue combineShiftLeft(SDNode *N, SelectionDAG &DAG) {
return SDValue();
}
-static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
unsigned Size = VT.getSizeInBits();
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
// fold (ashr (shl, a, [56,48,32,24,16]), SarConst)
// into (shl, (sext (a), [56,48,32,24,16] - SarConst)) or
// into (lshr, (sext (a), SarConst - [56,48,32,24,16]))
@@ -39453,11 +41847,15 @@ static SDValue combineShiftRightArithmetic(SDNode *N, SelectionDAG &DAG) {
}
static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
- TargetLowering::DAGCombinerInfo &DCI) {
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
EVT VT = N0.getValueType();
+ if (SDValue V = combineShiftToPMULH(N, DAG, Subtarget))
+ return V;
+
// Only do this on the last DAG combine as it can interfere with other
// combines.
if (!DCI.isAfterLegalizeDAG())
@@ -39501,16 +41899,92 @@ static SDValue combineShiftRightLogical(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+static SDValue combineVectorPackWithShuffle(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opcode = N->getOpcode();
+ assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
+ "Unexpected pack opcode");
+
+ EVT VT = N->getValueType(0);
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
+
+ // Attempt to fold PACK(LOSUBVECTOR(SHUFFLE(X)),HISUBVECTOR(SHUFFLE(X)))
+ // to SHUFFLE(PACK(LOSUBVECTOR(X),HISUBVECTOR(X))); this is mainly for
+ // truncation trees and helps us avoid lane-crossing shuffles.
+ // TODO: There's a lot more we can do for PACK/HADD style shuffle combines.
+ if (N0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N1.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ N0.getConstantOperandAPInt(1) == 0 &&
+ N1.getConstantOperandAPInt(1) == (NumDstElts / 2) &&
+ N0.getOperand(0) == N1.getOperand(0) && VT.is128BitVector() &&
+ N0.getOperand(0).getValueType().is256BitVector()) {
+ // TODO - support target/faux shuffles.
+ SDValue Vec = peekThroughBitcasts(N0.getOperand(0));
+ if (auto *SVN = dyn_cast<ShuffleVectorSDNode>(Vec)) {
+ // To keep the PACK LHS/RHS coherency, we must be able to scale the unary
+ // shuffle to a vXi64 width - we can probably relax this in the future.
+ SmallVector<int, 4> ShuffleMask;
+ if (SVN->getOperand(1).isUndef() &&
+ scaleShuffleElements(SVN->getMask(), 4, ShuffleMask)) {
+ SDLoc DL(N);
+ SDValue Lo, Hi;
+ std::tie(Lo, Hi) = DAG.SplitVector(SVN->getOperand(0), DL);
+ Lo = DAG.getBitcast(N0.getValueType(), Lo);
+ Hi = DAG.getBitcast(N1.getValueType(), Hi);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Lo, Hi);
+ Res = DAG.getBitcast(MVT::v4i32, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i32, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+
+ // Attempt to fold PACK(SHUFFLE(X,Y),SHUFFLE(X,Y)) -> SHUFFLE(PACK(X,Y)).
+ // TODO: Relax shuffle scaling to support sub-128-bit subvector shuffles.
+ if (VT.is256BitVector()) {
+ if (auto *SVN0 = dyn_cast<ShuffleVectorSDNode>(N0)) {
+ if (auto *SVN1 = dyn_cast<ShuffleVectorSDNode>(N1)) {
+ SmallVector<int, 2> ShuffleMask0, ShuffleMask1;
+ if (scaleShuffleElements(SVN0->getMask(), 2, ShuffleMask0) &&
+ scaleShuffleElements(SVN1->getMask(), 2, ShuffleMask1)) {
+ SDValue Op00 = SVN0->getOperand(0);
+ SDValue Op01 = SVN0->getOperand(1);
+ SDValue Op10 = SVN1->getOperand(0);
+ SDValue Op11 = SVN1->getOperand(1);
+ if ((Op00 == Op11) && (Op01 == Op10)) {
+ std::swap(Op10, Op11);
+ ShuffleVectorSDNode::commuteMask(ShuffleMask1);
+ }
+ if ((Op00 == Op10) && (Op01 == Op11)) {
+ SmallVector<int, 4> ShuffleMask;
+ ShuffleMask.append(ShuffleMask0.begin(), ShuffleMask0.end());
+ ShuffleMask.append(ShuffleMask1.begin(), ShuffleMask1.end());
+ SDLoc DL(N);
+ SDValue Res = DAG.getNode(Opcode, DL, VT, Op00, Op01);
+ Res = DAG.getBitcast(MVT::v4i64, Res);
+ Res = DAG.getVectorShuffle(MVT::v4i64, DL, Res, Res, ShuffleMask);
+ return DAG.getBitcast(VT, Res);
+ }
+ }
+ }
+ }
+ }
+
+ return SDValue();
+}
+
static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
unsigned Opcode = N->getOpcode();
assert((X86ISD::PACKSS == Opcode || X86ISD::PACKUS == Opcode) &&
- "Unexpected shift opcode");
+ "Unexpected pack opcode");
EVT VT = N->getValueType(0);
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
+ unsigned NumDstElts = VT.getVectorNumElements();
unsigned DstBitsPerElt = VT.getScalarSizeInBits();
unsigned SrcBitsPerElt = 2 * DstBitsPerElt;
assert(N0.getScalarValueSizeInBits() == SrcBitsPerElt &&
@@ -39527,7 +42001,6 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
getTargetConstantBitsFromNode(N0, SrcBitsPerElt, UndefElts0, EltBits0) &&
getTargetConstantBitsFromNode(N1, SrcBitsPerElt, UndefElts1, EltBits1)) {
unsigned NumLanes = VT.getSizeInBits() / 128;
- unsigned NumDstElts = VT.getVectorNumElements();
unsigned NumSrcElts = NumDstElts / 2;
unsigned NumDstEltsPerLane = NumDstElts / NumLanes;
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
@@ -39574,6 +42047,10 @@ static SDValue combineVectorPack(SDNode *N, SelectionDAG &DAG,
return getConstVector(Bits, Undefs, VT.getSimpleVT(), DAG, SDLoc(N));
}
+ // Try to fold PACK(SHUFFLE(),SHUFFLE()) -> SHUFFLE(PACK()).
+ if (SDValue V = combineVectorPackWithShuffle(N, DAG))
+ return V;
+
// Try to combine a PACKUSWB/PACKSSWB implemented truncate with a regular
// truncate to create a larger truncate.
if (Subtarget.hasAVX512() &&
@@ -39656,26 +42133,37 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
if (ShiftVal >= NumBitsPerElt) {
if (LogicalShift)
return DAG.getConstant(0, SDLoc(N), VT);
- else
- ShiftVal = NumBitsPerElt - 1;
+ ShiftVal = NumBitsPerElt - 1;
}
- // Shift N0 by zero -> N0.
+ // (shift X, 0) -> X
if (!ShiftVal)
return N0;
- // Shift zero -> zero.
+ // (shift 0, C) -> 0
if (ISD::isBuildVectorAllZeros(N0.getNode()))
+ // N0 is all zeros or undef. We guarantee that the bits shifted into the
+ // result are all zeros, not undef.
return DAG.getConstant(0, SDLoc(N), VT);
- // Fold (VSRAI (VSRAI X, C1), C2) --> (VSRAI X, (C1 + C2)) with (C1 + C2)
- // clamped to (NumBitsPerElt - 1).
- if (Opcode == X86ISD::VSRAI && N0.getOpcode() == X86ISD::VSRAI) {
+ // (VSRAI -1, C) -> -1
+ if (!LogicalShift && ISD::isBuildVectorAllOnes(N0.getNode()))
+ // N0 is all ones or undef. We guarantee that the bits shifted into the
+ // result are all ones, not undef.
+ return DAG.getConstant(-1, SDLoc(N), VT);
+
+ // (shift (shift X, C2), C1) -> (shift X, (C1 + C2))
+ if (Opcode == N0.getOpcode()) {
unsigned ShiftVal2 = cast<ConstantSDNode>(N0.getOperand(1))->getZExtValue();
unsigned NewShiftVal = ShiftVal + ShiftVal2;
- if (NewShiftVal >= NumBitsPerElt)
+ if (NewShiftVal >= NumBitsPerElt) {
+ // Out of range logical bit shifts are guaranteed to be zero.
+ // Out of range arithmetic bit shifts splat the sign bit.
+ if (LogicalShift)
+ return DAG.getConstant(0, SDLoc(N), VT);
NewShiftVal = NumBitsPerElt - 1;
- return DAG.getNode(X86ISD::VSRAI, SDLoc(N), VT, N0.getOperand(0),
+ }
+ return DAG.getNode(Opcode, SDLoc(N), VT, N0.getOperand(0),
DAG.getTargetConstant(NewShiftVal, SDLoc(N), MVT::i8));
}
@@ -39693,14 +42181,22 @@ static SDValue combineVectorShiftImm(SDNode *N, SelectionDAG &DAG,
getTargetConstantBitsFromNode(N0, NumBitsPerElt, UndefElts, EltBits)) {
assert(EltBits.size() == VT.getVectorNumElements() &&
"Unexpected shift value type");
- for (APInt &Elt : EltBits) {
- if (X86ISD::VSHLI == Opcode)
+ // Undef elements need to fold to 0. It's possible SimplifyDemandedBits
+ // created an undef input due to no input bits being demanded, but the user
+ // still expects 0 in the other bits.
+ for (unsigned i = 0, e = EltBits.size(); i != e; ++i) {
+ APInt &Elt = EltBits[i];
+ if (UndefElts[i])
+ Elt = 0;
+ else if (X86ISD::VSHLI == Opcode)
Elt <<= ShiftVal;
else if (X86ISD::VSRAI == Opcode)
Elt.ashrInPlace(ShiftVal);
else
Elt.lshrInPlace(ShiftVal);
}
+ // Reset undef elements since they were zeroed above.
+ UndefElts = 0;
return getConstVector(EltBits, UndefElts, VT.getSimpleVT(), DAG, SDLoc(N));
}
@@ -39717,19 +42213,24 @@ static SDValue combineVectorInsert(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
EVT VT = N->getValueType(0);
assert(((N->getOpcode() == X86ISD::PINSRB && VT == MVT::v16i8) ||
- (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16)) &&
+ (N->getOpcode() == X86ISD::PINSRW && VT == MVT::v8i16) ||
+ N->getOpcode() == ISD::INSERT_VECTOR_ELT) &&
"Unexpected vector insertion");
- unsigned NumBitsPerElt = VT.getScalarSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- if (TLI.SimplifyDemandedBits(SDValue(N, 0),
- APInt::getAllOnesValue(NumBitsPerElt), DCI))
- return SDValue(N, 0);
+ if (N->getOpcode() == X86ISD::PINSRB || N->getOpcode() == X86ISD::PINSRW) {
+ unsigned NumBitsPerElt = VT.getScalarSizeInBits();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0),
+ APInt::getAllOnesValue(NumBitsPerElt), DCI))
+ return SDValue(N, 0);
+ }
- // Attempt to combine PINSRB/PINSRW patterns to a shuffle.
- SDValue Op(N, 0);
- if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
- return Res;
+ // Attempt to combine insertion patterns to a shuffle.
+ if (VT.isSimple() && DCI.isAfterLegalizeDAG()) {
+ SDValue Op(N, 0);
+ if (SDValue Res = combineX86ShufflesRecursively(Op, DAG, Subtarget))
+ return Res;
+ }
return SDValue();
}
@@ -39752,7 +42253,7 @@ static SDValue combineCompareEqual(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
// The SETCCs should both refer to the same CMP.
- if (CMP0.getOpcode() != X86ISD::CMP || CMP0 != CMP1)
+ if (CMP0.getOpcode() != X86ISD::FCMP || CMP0 != CMP1)
return SDValue();
SDValue CMP00 = CMP0->getOperand(0);
@@ -39851,10 +42352,27 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
- if (SDValue Not = IsNOT(N0, DAG)) {
+ auto GetNot = [&VT, &DAG](SDValue V) {
+ // Basic X = NOT(Y) detection.
+ if (SDValue Not = IsNOT(V, DAG))
+ return Not;
+ // Fold BROADCAST(NOT(Y)) -> BROADCAST(Y).
+ if (V.getOpcode() == X86ISD::VBROADCAST) {
+ SDValue Src = V.getOperand(0);
+ EVT SrcVT = Src.getValueType();
+ if (!SrcVT.isVector())
+ return SDValue();
+ if (SDValue Not = IsNOT(Src, DAG))
+ return DAG.getNode(X86ISD::VBROADCAST, SDLoc(V), VT,
+ DAG.getBitcast(SrcVT, Not));
+ }
+ return SDValue();
+ };
+
+ if (SDValue Not = GetNot(N0)) {
X = Not;
Y = N1;
- } else if (SDValue Not = IsNOT(N1, DAG)) {
+ } else if (SDValue Not = GetNot(N1)) {
X = Not;
Y = N0;
} else
@@ -39865,6 +42383,65 @@ static SDValue combineANDXORWithAllOnesIntoANDNP(SDNode *N, SelectionDAG &DAG) {
return DAG.getNode(X86ISD::ANDNP, SDLoc(N), VT, X, Y);
}
+// Try to widen AND, OR and XOR nodes to VT in order to remove casts around
+// logical operations, like in the example below.
+// or (and (truncate x, truncate y)),
+// (xor (truncate z, build_vector (constants)))
+// Given a target type \p VT, we generate
+// or (and x, y), (xor z, zext(build_vector (constants)))
+// where x, y and z are of type \p VT. We can do so if each operand is either a
+// truncate from VT, a vector of constants, or can itself be recursively
+// promoted.
+static SDValue PromoteMaskArithmetic(SDNode *N, EVT VT, SelectionDAG &DAG,
+ unsigned Depth) {
+ // Limit recursion to avoid excessive compile times.
+ if (Depth >= SelectionDAG::MaxRecursionDepth)
+ return SDValue();
+
+ if (N->getOpcode() != ISD::XOR && N->getOpcode() != ISD::AND &&
+ N->getOpcode() != ISD::OR)
+ return SDValue();
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+ SDLoc DL(N);
+
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (!TLI.isOperationLegalOrPromote(N->getOpcode(), VT))
+ return SDValue();
+
+ if (SDValue NN0 = PromoteMaskArithmetic(N0.getNode(), VT, DAG, Depth + 1))
+ N0 = NN0;
+ else {
+ // The Left side has to be a trunc.
+ if (N0.getOpcode() != ISD::TRUNCATE)
+ return SDValue();
+
+ // The type of the truncated inputs.
+ if (N0.getOperand(0).getValueType() != VT)
+ return SDValue();
+
+ N0 = N0.getOperand(0);
+ }
+
+ if (SDValue NN1 = PromoteMaskArithmetic(N1.getNode(), VT, DAG, Depth + 1))
+ N1 = NN1;
+ else {
+ // The right side has to be a 'trunc' or a constant vector.
+ bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
+ N1.getOperand(0).getValueType() == VT;
+ if (!RHSTrunc && !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
+ return SDValue();
+
+ if (RHSTrunc)
+ N1 = N1.getOperand(0);
+ else
+ N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
+ }
+
+ return DAG.getNode(N->getOpcode(), DL, VT, N0, N1);
+}
+
// On AVX/AVX2 the type v8i1 is legalized to v8i16, which is an XMM sized
// register. In most cases we actually compare or select YMM-sized registers
// and mixing the two types creates horrible code. This method optimizes
@@ -39876,6 +42453,7 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
EVT VT = N->getValueType(0);
assert(VT.isVector() && "Expected vector type");
+ SDLoc DL(N);
assert((N->getOpcode() == ISD::ANY_EXTEND ||
N->getOpcode() == ISD::ZERO_EXTEND ||
N->getOpcode() == ISD::SIGN_EXTEND) && "Invalid Node");
@@ -39883,57 +42461,33 @@ static SDValue PromoteMaskArithmetic(SDNode *N, SelectionDAG &DAG,
SDValue Narrow = N->getOperand(0);
EVT NarrowVT = Narrow.getValueType();
- if (Narrow->getOpcode() != ISD::XOR &&
- Narrow->getOpcode() != ISD::AND &&
- Narrow->getOpcode() != ISD::OR)
- return SDValue();
-
- SDValue N0 = Narrow->getOperand(0);
- SDValue N1 = Narrow->getOperand(1);
- SDLoc DL(Narrow);
-
- // The Left side has to be a trunc.
- if (N0.getOpcode() != ISD::TRUNCATE)
- return SDValue();
-
- // The type of the truncated inputs.
- if (N0.getOperand(0).getValueType() != VT)
- return SDValue();
-
- // The right side has to be a 'trunc' or a constant vector.
- bool RHSTrunc = N1.getOpcode() == ISD::TRUNCATE &&
- N1.getOperand(0).getValueType() == VT;
- if (!RHSTrunc &&
- !ISD::isBuildVectorOfConstantSDNodes(N1.getNode()))
- return SDValue();
-
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrPromote(Narrow->getOpcode(), VT))
- return SDValue();
-
- // Set N0 and N1 to hold the inputs to the new wide operation.
- N0 = N0.getOperand(0);
- if (RHSTrunc)
- N1 = N1.getOperand(0);
- else
- N1 = DAG.getNode(ISD::ZERO_EXTEND, DL, VT, N1);
-
// Generate the wide operation.
- SDValue Op = DAG.getNode(Narrow->getOpcode(), DL, VT, N0, N1);
- unsigned Opcode = N->getOpcode();
- switch (Opcode) {
+ SDValue Op = PromoteMaskArithmetic(Narrow.getNode(), VT, DAG, 0);
+ if (!Op)
+ return SDValue();
+ switch (N->getOpcode()) {
default: llvm_unreachable("Unexpected opcode");
case ISD::ANY_EXTEND:
return Op;
case ISD::ZERO_EXTEND:
- return DAG.getZeroExtendInReg(Op, DL, NarrowVT.getScalarType());
+ return DAG.getZeroExtendInReg(Op, DL, NarrowVT);
case ISD::SIGN_EXTEND:
return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT,
Op, DAG.getValueType(NarrowVT));
}
}
+static unsigned convertIntLogicToFPLogicOpcode(unsigned Opcode) {
+ unsigned FPOpcode;
+ switch (Opcode) {
+ default: llvm_unreachable("Unexpected input node for FP logic conversion");
+ case ISD::AND: FPOpcode = X86ISD::FAND; break;
+ case ISD::OR: FPOpcode = X86ISD::FOR; break;
+ case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
+ }
+ return FPOpcode;
+}
+
/// If both input operands of a logic op are being cast from floating point
/// types, try to convert this into a floating point logic node to avoid
/// unnecessary moves from SSE to integer registers.
@@ -39958,18 +42512,45 @@ static SDValue convertIntLogicToFPLogic(SDNode *N, SelectionDAG &DAG,
(Subtarget.hasSSE2() && N00Type == MVT::f64)))
return SDValue();
- unsigned FPOpcode;
- switch (N->getOpcode()) {
- default: llvm_unreachable("Unexpected input node for FP logic conversion");
- case ISD::AND: FPOpcode = X86ISD::FAND; break;
- case ISD::OR: FPOpcode = X86ISD::FOR; break;
- case ISD::XOR: FPOpcode = X86ISD::FXOR; break;
- }
-
+ unsigned FPOpcode = convertIntLogicToFPLogicOpcode(N->getOpcode());
SDValue FPLogic = DAG.getNode(FPOpcode, DL, N00Type, N00, N10);
return DAG.getBitcast(VT, FPLogic);
}
+// Attempt to fold BITOP(MOVMSK(X),MOVMSK(Y)) -> MOVMSK(BITOP(X,Y))
+// to reduce XMM->GPR traffic.
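+// This trades two MOVMSKs plus a scalar bit op for one vector bit op plus a
+// single MOVMSK.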
+static SDValue combineBitOpWithMOVMSK(SDNode *N, SelectionDAG &DAG) {
+ unsigned Opc = N->getOpcode();
+ assert((Opc == ISD::OR || Opc == ISD::AND || Opc == ISD::XOR) &&
+ "Unexpected bit opcode");
+
+ SDValue N0 = N->getOperand(0);
+ SDValue N1 = N->getOperand(1);
+
+ // Both operands must be single use MOVMSK.
+ if (N0.getOpcode() != X86ISD::MOVMSK || !N0.hasOneUse() ||
+ N1.getOpcode() != X86ISD::MOVMSK || !N1.hasOneUse())
+ return SDValue();
+
+ SDValue Vec0 = N0.getOperand(0);
+ SDValue Vec1 = N1.getOperand(0);
+ EVT VecVT0 = Vec0.getValueType();
+ EVT VecVT1 = Vec1.getValueType();
+
+  // Both MOVMSK operands must be from vectors of the same size and same element
+  // size, but it's OK if one is fp and the other is int.
+ if (VecVT0.getSizeInBits() != VecVT1.getSizeInBits() ||
+ VecVT0.getScalarSizeInBits() != VecVT1.getScalarSizeInBits())
+ return SDValue();
+
+ SDLoc DL(N);
+ unsigned VecOpc =
+ VecVT0.isFloatingPoint() ? convertIntLogicToFPLogicOpcode(Opc) : Opc;
+ SDValue Result =
+ DAG.getNode(VecOpc, DL, VecVT0, Vec0, DAG.getBitcast(VecVT0, Vec1));
+ return DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Result);
+}
+
/// If this is a zero/all-bits result that is bitwise-anded with a low bits
/// mask (Mask == 1 for the x86 lowering of a SETCC + ZEXT), replace the 'and'
/// with a shift-right to eliminate loading the vector constant mask value.
@@ -40292,7 +42873,8 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::AND, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40302,9 +42884,11 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
- APInt AllBits = APInt::getAllOnesValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETEQ);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
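+        // For an AND reduction, all lanes covered by the partial mask must be
+        // set.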
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, PartialBits, ISD::SETEQ);
}
}
}
@@ -40312,6 +42896,9 @@ static SDValue combineAnd(SDNode *N, SelectionDAG &DAG,
if (SDValue V = combineScalarAndWithMaskSetcc(N, DAG, Subtarget))
return V;
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -40420,6 +43007,16 @@ static SDValue canonicalizeBitSelect(SDNode *N, SelectionDAG &DAG,
}
SDLoc DL(N);
+
+ if (UseVPTERNLOG) {
+ // Emit a VPTERNLOG node directly.
+ SDValue A = DAG.getBitcast(VT, N0.getOperand(1));
+ SDValue B = DAG.getBitcast(VT, N0.getOperand(0));
+ SDValue C = DAG.getBitcast(VT, N1.getOperand(0));
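+    // 0xCA is the ternary truth table for "A ? B : C", i.e. (A & B) | (~A & C).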
+ SDValue Imm = DAG.getTargetConstant(0xCA, DL, MVT::i8);
+ return DAG.getNode(X86ISD::VPTERNLOG, DL, VT, A, B, C, Imm);
+ }
+
SDValue X = N->getOperand(0);
SDValue Y =
DAG.getNode(X86ISD::ANDNP, DL, VT, DAG.getBitcast(VT, N0.getOperand(1)),
@@ -40503,6 +43100,10 @@ static SDValue combineLogicBlendIntoPBLENDV(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.hasSSE41())
return SDValue();
+ // If we have VPTERNLOG we should prefer that since PBLENDVB is multiple uops.
+ if (Subtarget.hasVLX())
+ return SDValue();
+
MVT BlendVT = VT.is256BitVector() ? MVT::v32i8 : MVT::v16i8;
X = DAG.getBitcast(BlendVT, X);
@@ -40619,139 +43220,6 @@ static SDValue combineOrCmpEqZeroToCtlzSrl(SDNode *N, SelectionDAG &DAG,
return Ret;
}
-static SDValue combineOrShiftToFunnelShift(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- assert(N->getOpcode() == ISD::OR && "Expected ISD::OR node");
- SDValue N0 = N->getOperand(0);
- SDValue N1 = N->getOperand(1);
- EVT VT = N->getValueType(0);
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
- if (!TLI.isOperationLegalOrCustom(ISD::FSHL, VT) ||
- !TLI.isOperationLegalOrCustom(ISD::FSHR, VT))
- return SDValue();
-
- // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
- bool OptForSize = DAG.shouldOptForSize();
- unsigned Bits = VT.getScalarSizeInBits();
-
- // SHLD/SHRD instructions have lower register pressure, but on some
- // platforms they have higher latency than the equivalent
- // series of shifts/or that would otherwise be generated.
- // Don't fold (or (x << c) | (y >> (64 - c))) if SHLD/SHRD instructions
- // have higher latencies and we are not optimizing for size.
- if (!OptForSize && Subtarget.isSHLDSlow())
- return SDValue();
-
- if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
- std::swap(N0, N1);
- if (N0.getOpcode() != ISD::SHL || N1.getOpcode() != ISD::SRL)
- return SDValue();
- if (!N0.hasOneUse() || !N1.hasOneUse())
- return SDValue();
-
- EVT ShiftVT = TLI.getShiftAmountTy(VT, DAG.getDataLayout());
-
- SDValue ShAmt0 = N0.getOperand(1);
- if (ShAmt0.getValueType() != ShiftVT)
- return SDValue();
- SDValue ShAmt1 = N1.getOperand(1);
- if (ShAmt1.getValueType() != ShiftVT)
- return SDValue();
-
- // Peek through any modulo shift masks.
- SDValue ShMsk0;
- if (ShAmt0.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt0.getOperand(1)) &&
- ShAmt0.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk0 = ShAmt0;
- ShAmt0 = ShAmt0.getOperand(0);
- }
- SDValue ShMsk1;
- if (ShAmt1.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt1.getOperand(1)) &&
- ShAmt1.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk1 = ShAmt1;
- ShAmt1 = ShAmt1.getOperand(0);
- }
-
- if (ShAmt0.getOpcode() == ISD::TRUNCATE)
- ShAmt0 = ShAmt0.getOperand(0);
- if (ShAmt1.getOpcode() == ISD::TRUNCATE)
- ShAmt1 = ShAmt1.getOperand(0);
-
- SDLoc DL(N);
- unsigned Opc = ISD::FSHL;
- SDValue Op0 = N0.getOperand(0);
- SDValue Op1 = N1.getOperand(0);
- if (ShAmt0.getOpcode() == ISD::SUB || ShAmt0.getOpcode() == ISD::XOR) {
- Opc = ISD::FSHR;
- std::swap(Op0, Op1);
- std::swap(ShAmt0, ShAmt1);
- std::swap(ShMsk0, ShMsk1);
- }
-
- auto GetFunnelShift = [&DAG, &DL, VT, Opc, &ShiftVT](SDValue Op0, SDValue Op1,
- SDValue Amt) {
- if (Opc == ISD::FSHR)
- std::swap(Op0, Op1);
- return DAG.getNode(Opc, DL, VT, Op0, Op1,
- DAG.getNode(ISD::TRUNCATE, DL, ShiftVT, Amt));
- };
-
- // OR( SHL( X, C ), SRL( Y, 32 - C ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, C ), SHL( Y, 32 - C ) ) -> FSHR( Y, X, C )
- // OR( SHL( X, C ), SRL( SRL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, C ), SHL( SHL( Y, 1 ), XOR( C, 31 ) ) ) -> FSHR( Y, X, C )
- // OR( SHL( X, AND( C, 31 ) ), SRL( Y, AND( 0 - C, 31 ) ) ) -> FSHL( X, Y, C )
- // OR( SRL( X, AND( C, 31 ) ), SHL( Y, AND( 0 - C, 31 ) ) ) -> FSHR( Y, X, C )
- if (ShAmt1.getOpcode() == ISD::SUB) {
- SDValue Sum = ShAmt1.getOperand(0);
- if (auto *SumC = dyn_cast<ConstantSDNode>(Sum)) {
- SDValue ShAmt1Op1 = ShAmt1.getOperand(1);
- if (ShAmt1Op1.getOpcode() == ISD::AND &&
- isa<ConstantSDNode>(ShAmt1Op1.getOperand(1)) &&
- ShAmt1Op1.getConstantOperandAPInt(1) == (Bits - 1)) {
- ShMsk1 = ShAmt1Op1;
- ShAmt1Op1 = ShAmt1Op1.getOperand(0);
- }
- if (ShAmt1Op1.getOpcode() == ISD::TRUNCATE)
- ShAmt1Op1 = ShAmt1Op1.getOperand(0);
- if ((SumC->getAPIntValue() == Bits ||
- (SumC->getAPIntValue() == 0 && ShMsk1)) &&
- ShAmt1Op1 == ShAmt0)
- return GetFunnelShift(Op0, Op1, ShAmt0);
- }
- } else if (auto *ShAmt1C = dyn_cast<ConstantSDNode>(ShAmt1)) {
- auto *ShAmt0C = dyn_cast<ConstantSDNode>(ShAmt0);
- if (ShAmt0C && (ShAmt0C->getSExtValue() + ShAmt1C->getSExtValue()) == Bits)
- return GetFunnelShift(Op0, Op1, ShAmt0);
- } else if (ShAmt1.getOpcode() == ISD::XOR) {
- SDValue Mask = ShAmt1.getOperand(1);
- if (auto *MaskC = dyn_cast<ConstantSDNode>(Mask)) {
- unsigned InnerShift = (ISD::FSHL == Opc ? ISD::SRL : ISD::SHL);
- SDValue ShAmt1Op0 = ShAmt1.getOperand(0);
- if (ShAmt1Op0.getOpcode() == ISD::TRUNCATE)
- ShAmt1Op0 = ShAmt1Op0.getOperand(0);
- if (MaskC->getSExtValue() == (Bits - 1) &&
- (ShAmt1Op0 == ShAmt0 || ShAmt1Op0 == ShMsk0)) {
- if (Op1.getOpcode() == InnerShift &&
- isa<ConstantSDNode>(Op1.getOperand(1)) &&
- Op1.getConstantOperandAPInt(1).isOneValue()) {
- return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
- }
- // Test for ADD( Y, Y ) as an equivalent to SHL( Y, 1 ).
- if (InnerShift == ISD::SHL && Op1.getOpcode() == ISD::ADD &&
- Op1.getOperand(0) == Op1.getOperand(1)) {
- return GetFunnelShift(Op0, Op1.getOperand(0), ShAmt0);
- }
- }
- }
- }
-
- return SDValue();
-}
-
static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
@@ -40771,7 +43239,8 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
// TODO: Support multiple SrcOps.
if (VT == MVT::i1) {
SmallVector<SDValue, 2> SrcOps;
- if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps) &&
+ SmallVector<APInt, 2> SrcPartials;
+ if (matchScalarReduction(SDValue(N, 0), ISD::OR, SrcOps, &SrcPartials) &&
SrcOps.size() == 1) {
SDLoc dl(N);
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -40781,13 +43250,19 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (!Mask && TLI.isTypeLegal(SrcOps[0].getValueType()))
Mask = DAG.getBitcast(MaskVT, SrcOps[0]);
if (Mask) {
- APInt AllBits = APInt::getNullValue(NumElts);
- return DAG.getSetCC(dl, MVT::i1, Mask,
- DAG.getConstant(AllBits, dl, MaskVT), ISD::SETNE);
+ assert(SrcPartials[0].getBitWidth() == NumElts &&
+ "Unexpected partial reduction mask");
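+        // For an OR reduction, it suffices that any lane covered by the
+        // partial mask is set.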
+ SDValue ZeroBits = DAG.getConstant(0, dl, MaskVT);
+ SDValue PartialBits = DAG.getConstant(SrcPartials[0], dl, MaskVT);
+ Mask = DAG.getNode(ISD::AND, dl, MaskVT, Mask, PartialBits);
+ return DAG.getSetCC(dl, MVT::i1, Mask, ZeroBits, ISD::SETNE);
}
}
}
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -40803,8 +43278,33 @@ static SDValue combineOr(SDNode *N, SelectionDAG &DAG,
if (SDValue R = combineLogicBlendIntoPBLENDV(N, DAG, Subtarget))
return R;
- if (SDValue R = combineOrShiftToFunnelShift(N, DAG, Subtarget))
- return R;
+ // Combine OR(X,KSHIFTL(Y,Elts/2)) -> CONCAT_VECTORS(X,Y) == KUNPCK(X,Y).
+ // Combine OR(KSHIFTL(X,Elts/2),Y) -> CONCAT_VECTORS(Y,X) == KUNPCK(Y,X).
+ // iff the upper elements of the non-shifted arg are zero.
+  // KUNPCK requires 16+ bool vector elements.
+ if (N0.getOpcode() == X86ISD::KSHIFTL || N1.getOpcode() == X86ISD::KSHIFTL) {
+ unsigned NumElts = VT.getVectorNumElements();
+ unsigned HalfElts = NumElts / 2;
+ APInt UpperElts = APInt::getHighBitsSet(NumElts, HalfElts);
+ if (NumElts >= 16 && N1.getOpcode() == X86ISD::KSHIFTL &&
+ N1.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N0, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N0, 0, DAG, dl, HalfElts),
+ extractSubVector(N1.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ if (NumElts >= 16 && N0.getOpcode() == X86ISD::KSHIFTL &&
+ N0.getConstantOperandAPInt(1) == HalfElts &&
+ DAG.MaskedValueIsZero(N1, APInt(1, 1), UpperElts)) {
+ SDLoc dl(N);
+ return DAG.getNode(
+ ISD::CONCAT_VECTORS, dl, VT,
+ extractSubVector(N1, 0, DAG, dl, HalfElts),
+ extractSubVector(N0.getOperand(0), 0, DAG, dl, HalfElts));
+ }
+ }
// Attempt to recursively combine an OR of shuffles.
if (VT.isVector() && (VT.getScalarSizeInBits() % 8) == 0) {
@@ -41153,18 +43653,9 @@ static SDValue detectAVGPattern(SDValue In, EVT VT, SelectionDAG &DAG,
// A lambda checking the given SDValue is a constant vector and each element
// is in the range [Min, Max].
auto IsConstVectorInRange = [](SDValue V, unsigned Min, unsigned Max) {
- BuildVectorSDNode *BV = dyn_cast<BuildVectorSDNode>(V);
- if (!BV || !BV->isConstant())
- return false;
- for (SDValue Op : V->ops()) {
- ConstantSDNode *C = dyn_cast<ConstantSDNode>(Op);
- if (!C)
- return false;
- const APInt &Val = C->getAPIntValue();
- if (Val.ult(Min) || Val.ugt(Max))
- return false;
- }
- return true;
+ return ISD::matchUnaryPredicate(V, [Min, Max](ConstantSDNode *C) {
+ return !(C->getAPIntValue().ult(Min) || C->getAPIntValue().ugt(Max));
+ });
};
// Check if each element of the vector is right-shifted by one.
@@ -41265,10 +43756,10 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
// pre-AVX2 targets as 32-byte loads will lower to regular temporal loads.
ISD::LoadExtType Ext = Ld->getExtensionType();
bool Fast;
- unsigned Alignment = Ld->getAlignment();
if (RegVT.is256BitVector() && !DCI.isBeforeLegalizeOps() &&
Ext == ISD::NON_EXTLOAD &&
- ((Ld->isNonTemporal() && !Subtarget.hasInt256() && Alignment >= 16) ||
+ ((Ld->isNonTemporal() && !Subtarget.hasInt256() &&
+ Ld->getAlignment() >= 16) ||
(TLI.allowsMemoryAccess(*DAG.getContext(), DAG.getDataLayout(), RegVT,
*Ld->getMemOperand(), &Fast) &&
!Fast))) {
@@ -41276,17 +43767,18 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
if (NumElems < 2)
return SDValue();
- unsigned HalfAlign = 16;
+ unsigned HalfOffset = 16;
SDValue Ptr1 = Ld->getBasePtr();
- SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfAlign, dl);
+ SDValue Ptr2 = DAG.getMemBasePlusOffset(Ptr1, HalfOffset, dl);
EVT HalfVT = EVT::getVectorVT(*DAG.getContext(), MemVT.getScalarType(),
NumElems / 2);
SDValue Load1 =
DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr1, Ld->getPointerInfo(),
- Alignment, Ld->getMemOperand()->getFlags());
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
SDValue Load2 = DAG.getLoad(HalfVT, dl, Ld->getChain(), Ptr2,
- Ld->getPointerInfo().getWithOffset(HalfAlign),
- MinAlign(Alignment, HalfAlign),
+ Ld->getPointerInfo().getWithOffset(HalfOffset),
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue TF = DAG.getNode(ISD::TokenFactor, dl, MVT::Other,
Load1.getValue(1), Load2.getValue(1));
@@ -41303,13 +43795,28 @@ static SDValue combineLoad(SDNode *N, SelectionDAG &DAG,
EVT IntVT = EVT::getIntegerVT(*DAG.getContext(), NumElts);
if (TLI.isTypeLegal(IntVT)) {
SDValue IntLoad = DAG.getLoad(IntVT, dl, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), Alignment,
+ Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
SDValue BoolVec = DAG.getBitcast(RegVT, IntLoad);
return DCI.CombineTo(N, BoolVec, IntLoad.getValue(1), true);
}
}
+ // Cast ptr32 and ptr64 pointers to the default address space before a load.
+ unsigned AddrSpace = Ld->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != Ld->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, Ld->getBasePtr(), AddrSpace, 0);
+ return DAG.getLoad(RegVT, dl, Ld->getChain(), Cast, Ld->getPointerInfo(),
+ Ld->getOriginalAlign(),
+ Ld->getMemOperand()->getFlags());
+ }
+ }
+
return SDValue();
}
@@ -41456,7 +43963,7 @@ combineMaskedLoadConstantMask(MaskedLoadSDNode *ML, SelectionDAG &DAG,
static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- MaskedLoadSDNode *Mld = cast<MaskedLoadSDNode>(N);
+ auto *Mld = cast<MaskedLoadSDNode>(N);
// TODO: Expanding load with constant mask may be optimized as well.
if (Mld->isExpandingLoad())
@@ -41465,12 +43972,33 @@ static SDValue combineMaskedLoad(SDNode *N, SelectionDAG &DAG,
if (Mld->getExtensionType() == ISD::NON_EXTLOAD) {
if (SDValue ScalarLoad = reduceMaskedLoadToScalarLoad(Mld, DAG, DCI))
return ScalarLoad;
+
// TODO: Do some AVX512 subsets benefit from this transform?
if (!Subtarget.hasAVX512())
if (SDValue Blend = combineMaskedLoadConstantMask(Mld, DAG, DCI))
return Blend;
}
+ // If the mask value has been legalized to a non-boolean vector, try to
+ // simplify ops leading up to it. We only demand the MSB of each lane.
+ SDValue Mask = Mld->getMask();
+ if (Mask.getScalarValueSizeInBits() != 1) {
+ EVT VT = Mld->getValueType(0);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
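+      // SimplifyDemandedBits may RAUW and delete N; only revisit it if it is
+      // still alive.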
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedLoad(
+ VT, SDLoc(N), Mld->getChain(), Mld->getBasePtr(), Mld->getOffset(),
+ NewMask, Mld->getPassThru(), Mld->getMemoryVT(), Mld->getMemOperand(),
+ Mld->getAddressingMode(), Mld->getExtensionType());
+ }
+
return SDValue();
}
@@ -41522,9 +44050,18 @@ static SDValue combineMaskedStore(SDNode *N, SelectionDAG &DAG,
// simplify ops leading up to it. We only demand the MSB of each lane.
SDValue Mask = Mst->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
- APInt DemandedMask(APInt::getSignMask(VT.getScalarSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ APInt DemandedBits(APInt::getSignMask(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(Mask, DemandedBits, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
+ if (SDValue NewMask =
+ TLI.SimplifyMultipleUseDemandedBits(Mask, DemandedBits, DAG))
+ return DAG.getMaskedStore(Mst->getChain(), SDLoc(N), Mst->getValue(),
+ Mst->getBasePtr(), Mst->getOffset(), NewMask,
+ Mst->getMemoryVT(), Mst->getMemOperand(),
+ Mst->getAddressingMode());
}
SDValue Value = Mst->getValue();
@@ -41546,7 +44083,6 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoreSDNode *St = cast<StoreSDNode>(N);
EVT StVT = St->getMemoryVT();
SDLoc dl(St);
- unsigned Alignment = St->getAlignment();
SDValue StoredVal = St->getValue();
EVT VT = StoredVal.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -41559,7 +44095,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal = DAG.getBitcast(NewVT, StoredVal);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41570,7 +44106,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
StoredVal.getOperand(0).getValueType() == MVT::i8) {
return DAG.getStore(St->getChain(), dl, StoredVal.getOperand(0),
St->getBasePtr(), St->getPointerInfo(),
- St->getAlignment(), St->getMemOperand()->getFlags());
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
}
// Widen v2i1/v4i1 stores to v8i1.
@@ -41581,7 +44118,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
Ops[0] = StoredVal;
StoredVal = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i1, Ops);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41590,7 +44127,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
VT == MVT::v64i1) && VT == StVT && TLI.isTypeLegal(VT) &&
ISD::isBuildVectorOfConstantSDNodes(StoredVal.getNode())) {
    // If it's a v64i1 store without 64-bit support, we need two stores.
- if (VT == MVT::v64i1 && !Subtarget.is64Bit()) {
+ if (!DCI.isBeforeLegalize() && VT == MVT::v64i1 && !Subtarget.is64Bit()) {
SDValue Lo = DAG.getBuildVector(MVT::v32i1, dl,
StoredVal->ops().slice(0, 32));
Lo = combinevXi1ConstantToInteger(Lo, DAG);
@@ -41603,18 +44140,19 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue Ch0 =
DAG.getStore(St->getChain(), dl, Lo, Ptr0, St->getPointerInfo(),
- Alignment, St->getMemOperand()->getFlags());
+ St->getOriginalAlign(),
+ St->getMemOperand()->getFlags());
SDValue Ch1 =
DAG.getStore(St->getChain(), dl, Hi, Ptr1,
St->getPointerInfo().getWithOffset(4),
- MinAlign(Alignment, 4U),
+ St->getOriginalAlign(),
St->getMemOperand()->getFlags());
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Ch0, Ch1);
}
StoredVal = combinevXi1ConstantToInteger(StoredVal, DAG);
return DAG.getStore(St->getChain(), dl, StoredVal, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
@@ -41633,7 +44171,8 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
}
// Split under-aligned vector non-temporal stores.
- if (St->isNonTemporal() && StVT == VT && Alignment < VT.getStoreSize()) {
+ if (St->isNonTemporal() && StVT == VT &&
+ St->getAlignment() < VT.getStoreSize()) {
// ZMM/YMM nt-stores - either it can be stored as a series of shorter
// vectors or the legalizer can scalarize it to use MOVNTI.
if (VT.is256BitVector() || VT.is512BitVector()) {
@@ -41687,7 +44226,7 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
if (SDValue Avg = detectAVGPattern(St->getValue(), St->getMemoryVT(), DAG,
Subtarget, dl))
return DAG.getStore(St->getChain(), dl, Avg, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
if (TLI.isTruncStoreLegal(VT, StVT)) {
@@ -41705,6 +44244,20 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+ // Cast ptr32 and ptr64 pointers to the default address space before a store.
+ unsigned AddrSpace = St->getAddressSpace();
+ if (AddrSpace == X86AS::PTR64 || AddrSpace == X86AS::PTR32_SPTR ||
+ AddrSpace == X86AS::PTR32_UPTR) {
+ MVT PtrVT = TLI.getPointerTy(DAG.getDataLayout());
+ if (PtrVT != St->getBasePtr().getSimpleValueType()) {
+ SDValue Cast =
+ DAG.getAddrSpaceCast(dl, PtrVT, St->getBasePtr(), AddrSpace, 0);
+ return DAG.getStore(St->getChain(), dl, StoredVal, Cast,
+ St->getPointerInfo(), St->getOriginalAlign(),
+ St->getMemOperand()->getFlags(), St->getAAInfo());
+ }
+ }
+
// Turn load->store of MMX types into GPR load/stores. This avoids clobbering
// the FP state in cases where an emms may be missing.
// A preferable solution to the general problem is to figure out the right
@@ -41759,13 +44312,38 @@ static SDValue combineStore(SDNode *N, SelectionDAG &DAG,
SDValue NewExtract = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f64,
BitCast, OldExtract.getOperand(1));
return DAG.getStore(St->getChain(), dl, NewExtract, St->getBasePtr(),
- St->getPointerInfo(), St->getAlignment(),
+ St->getPointerInfo(), St->getOriginalAlign(),
St->getMemOperand()->getFlags());
}
return SDValue();
}
+static SDValue combineVEXTRACT_STORE(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
+ const X86Subtarget &Subtarget) {
+ auto *St = cast<MemIntrinsicSDNode>(N);
+
+ SDValue StoredVal = N->getOperand(1);
+ MVT VT = StoredVal.getSimpleValueType();
+ EVT MemVT = St->getMemoryVT();
+
+ // Figure out which elements we demand.
+ unsigned StElts = MemVT.getSizeInBits() / VT.getScalarSizeInBits();
+ APInt DemandedElts = APInt::getLowBitsSet(VT.getVectorNumElements(), StElts);
+
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ if (TLI.SimplifyDemandedVectorElts(StoredVal, DemandedElts, KnownUndef,
+ KnownZero, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
/// Return 'true' if this vector operation is "horizontal"
/// and return the operands for the horizontal operation in LHS and RHS. A
/// horizontal operation performs the binary operation on successive elements
@@ -42002,17 +44580,6 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
// of one truncation.
// i.e. if one of the inputs will constant fold or the input is repeated.
switch (SrcOpcode) {
- case ISD::AND:
- case ISD::XOR:
- case ISD::OR: {
- SDValue Op0 = Src.getOperand(0);
- SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegalOrPromote(SrcOpcode, VT) &&
- (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
- return TruncateArithmetic(Op0, Op1);
- break;
- }
-
case ISD::MUL:
    // X86 is rubbish at scalar and vector i64 multiplies (until AVX512DQ) - it's
// better to truncate if we have the chance.
@@ -42021,21 +44588,15 @@ static SDValue combineTruncatedArithmetic(SDNode *N, SelectionDAG &DAG,
!TLI.isOperationLegal(SrcOpcode, SrcVT))
return TruncateArithmetic(Src.getOperand(0), Src.getOperand(1));
LLVM_FALLTHROUGH;
- case ISD::ADD: {
- SDValue Op0 = Src.getOperand(0);
- SDValue Op1 = Src.getOperand(1);
- if (TLI.isOperationLegal(SrcOpcode, VT) &&
- (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
- return TruncateArithmetic(Op0, Op1);
- break;
- }
+ case ISD::AND:
+ case ISD::XOR:
+ case ISD::OR:
+ case ISD::ADD:
case ISD::SUB: {
- // TODO: ISD::SUB We are conservative and require both sides to be freely
- // truncatable to avoid interfering with combineSubToSubus.
SDValue Op0 = Src.getOperand(0);
SDValue Op1 = Src.getOperand(1);
if (TLI.isOperationLegal(SrcOpcode, VT) &&
- (Op0 == Op1 || (IsFreeTruncation(Op0) && IsFreeTruncation(Op1))))
+ (Op0 == Op1 || IsFreeTruncation(Op0) || IsFreeTruncation(Op1)))
return TruncateArithmetic(Op0, Op1);
break;
}
@@ -42146,13 +44707,17 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
MVT InSVT = InVT.getScalarType();
// Check we have a truncation suited for PACKSS/PACKUS.
- if (!VT.is128BitVector() && !VT.is256BitVector())
+ if (!isPowerOf2_32(VT.getVectorNumElements()))
return SDValue();
if (SVT != MVT::i8 && SVT != MVT::i16 && SVT != MVT::i32)
return SDValue();
if (InSVT != MVT::i16 && InSVT != MVT::i32 && InSVT != MVT::i64)
return SDValue();
+ // Truncation to sub-128bit vXi32 can be better handled with shuffles.
+ if (SVT == MVT::i32 && VT.getSizeInBits() < 128)
+ return SDValue();
+
// AVX512 has fast truncate, but if the input is already going to be split,
// there's no harm in trying pack.
if (Subtarget.hasAVX512() &&
@@ -42173,6 +44738,13 @@ static SDValue combineVectorSignBitsTruncation(SDNode *N, const SDLoc &DL,
// Use PACKSS if the input has sign-bits that extend all the way to the
// packed/truncated value. e.g. Comparison result, sext_in_reg, etc.
unsigned NumSignBits = DAG.ComputeNumSignBits(In);
+
+ // Don't use PACKSS for vXi64 -> vXi32 truncations unless we're dealing with
+ // a sign splat. ComputeNumSignBits struggles to see through BITCASTs later
+ // on and combines/simplifications can't then use it.
+ if (SVT == MVT::i32 && NumSignBits != InSVT.getSizeInBits())
+ return SDValue();
+
if (NumSignBits > (InSVT.getSizeInBits() - NumPackedSignBits))
return truncateVectorWithPACK(X86ISD::PACKSS, VT, In, DL, DAG, Subtarget);
@@ -42201,9 +44773,9 @@ static SDValue combinePMULH(SDValue Src, EVT VT, const SDLoc &DL,
if (!VT.isVector() || VT.getVectorElementType() != MVT::i16)
return SDValue();
- // Input type should be vXi32.
+ // Input type should be at least vXi32.
EVT InVT = Src.getValueType();
- if (InVT.getVectorElementType() != MVT::i32)
+ if (InVT.getVectorElementType().getSizeInBits() < 32)
return SDValue();
// Need a shift by 16.
@@ -42412,7 +44984,8 @@ static SDValue combineTruncate(SDNode *N, SelectionDAG &DAG,
return combineVectorTruncation(N, DAG, Subtarget);
}
-static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
+static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
EVT VT = N->getValueType(0);
SDValue In = N->getOperand(0);
SDLoc DL(N);
@@ -42422,6 +44995,11 @@ static SDValue combineVTRUNC(SDNode *N, SelectionDAG &DAG) {
if (auto USatVal = detectUSatPattern(In, VT, DAG, DL))
return DAG.getNode(X86ISD::VTRUNCUS, DL, VT, USatVal);
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(VT.getScalarSizeInBits()));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
+ return SDValue(N, 0);
+
return SDValue();
}
@@ -42514,37 +45092,46 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
if (NegMul) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMADD: Opcode = ISD::FMA; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FNMADD; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMSUB: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FMSUB: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMADD: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FNMADD: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FMSUB; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FMSUB_RND; break;
}
}
if (NegAcc) {
switch (Opcode) {
default: llvm_unreachable("Unexpected opcode");
- case ISD::FMA: Opcode = X86ISD::FMSUB; break;
- case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
- case X86ISD::FMSUB: Opcode = ISD::FMA; break;
- case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
- case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
- case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
- case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
- case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
- case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
- case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
- case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
- case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
+ case ISD::FMA: Opcode = X86ISD::FMSUB; break;
+ case ISD::STRICT_FMA: Opcode = X86ISD::STRICT_FMSUB; break;
+ case X86ISD::FMADD_RND: Opcode = X86ISD::FMSUB_RND; break;
+ case X86ISD::FMSUB: Opcode = ISD::FMA; break;
+ case X86ISD::STRICT_FMSUB: Opcode = ISD::STRICT_FMA; break;
+ case X86ISD::FMSUB_RND: Opcode = X86ISD::FMADD_RND; break;
+ case X86ISD::FNMADD: Opcode = X86ISD::FNMSUB; break;
+ case X86ISD::STRICT_FNMADD: Opcode = X86ISD::STRICT_FNMSUB; break;
+ case X86ISD::FNMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
+ case X86ISD::FNMSUB: Opcode = X86ISD::FNMADD; break;
+ case X86ISD::STRICT_FNMSUB: Opcode = X86ISD::STRICT_FNMADD; break;
+ case X86ISD::FNMSUB_RND: Opcode = X86ISD::FNMADD_RND; break;
+ case X86ISD::FMADDSUB: Opcode = X86ISD::FMSUBADD; break;
+ case X86ISD::FMADDSUB_RND: Opcode = X86ISD::FMSUBADD_RND; break;
+ case X86ISD::FMSUBADD: Opcode = X86ISD::FMADDSUB; break;
+ case X86ISD::FMSUBADD_RND: Opcode = X86ISD::FMADDSUB_RND; break;
}
}
if (NegRes) {
switch (Opcode) {
+    // For accuracy reasons, we never combine fneg and fma under strict FP.
default: llvm_unreachable("Unexpected opcode");
case ISD::FMA: Opcode = X86ISD::FNMSUB; break;
case X86ISD::FMADD_RND: Opcode = X86ISD::FNMSUB_RND; break;
@@ -42562,18 +45149,20 @@ static unsigned negateFMAOpcode(unsigned Opcode, bool NegMul, bool NegAcc,
/// Do target-specific dag combines on floating point negations.
static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
EVT OrigVT = N->getValueType(0);
SDValue Arg = isFNEG(DAG, N);
if (!Arg)
return SDValue();
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
EVT VT = Arg.getValueType();
EVT SVT = VT.getScalarType();
SDLoc DL(N);
// Let legalize expand this if it isn't a legal type yet.
- if (!DAG.getTargetLoweringInfo().isTypeLegal(VT))
+ if (!TLI.isTypeLegal(VT))
return SDValue();
// If we're negating a FMUL node on a target with FMA, then we can avoid the
@@ -42587,80 +45176,25 @@ static SDValue combineFneg(SDNode *N, SelectionDAG &DAG,
return DAG.getBitcast(OrigVT, NewNode);
}
- // If we're negating an FMA node, then we can adjust the
- // instruction to include the extra negation.
- if (Arg.hasOneUse() && Subtarget.hasAnyFMA()) {
- switch (Arg.getOpcode()) {
- case ISD::FMA:
- case X86ISD::FMSUB:
- case X86ISD::FNMADD:
- case X86ISD::FNMSUB:
- case X86ISD::FMADD_RND:
- case X86ISD::FMSUB_RND:
- case X86ISD::FNMADD_RND:
- case X86ISD::FNMSUB_RND: {
- // We can't handle scalar intrinsic node here because it would only
- // invert one element and not the whole vector. But we could try to handle
- // a negation of the lower element only.
- unsigned NewOpcode = negateFMAOpcode(Arg.getOpcode(), false, false, true);
- return DAG.getBitcast(OrigVT, DAG.getNode(NewOpcode, DL, VT, Arg->ops()));
- }
- }
- }
+ bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
+ bool LegalOperations = !DCI.isBeforeLegalizeOps();
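+  // getNegatedExpression handles the FMA-style opcodes via the X86 override
+  // further below.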
+ if (SDValue NegArg =
+ TLI.getNegatedExpression(Arg, DAG, LegalOperations, CodeSize))
+ return DAG.getBitcast(OrigVT, NegArg);
return SDValue();
}
-char X86TargetLowering::isNegatibleForFree(SDValue Op, SelectionDAG &DAG,
- bool LegalOperations,
- bool ForCodeSize,
- unsigned Depth) const {
- // fneg patterns are removable even if they have multiple uses.
- if (isFNEG(DAG, Op.getNode(), Depth))
- return 2;
-
- // Don't recurse exponentially.
- if (Depth > SelectionDAG::MaxRecursionDepth)
- return 0;
-
- EVT VT = Op.getValueType();
- EVT SVT = VT.getScalarType();
- switch (Op.getOpcode()) {
- case ISD::FMA:
- case X86ISD::FMSUB:
- case X86ISD::FNMADD:
- case X86ISD::FNMSUB:
- case X86ISD::FMADD_RND:
- case X86ISD::FMSUB_RND:
- case X86ISD::FNMADD_RND:
- case X86ISD::FNMSUB_RND: {
- if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
- !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
- break;
-
- // This is always negatible for free but we might be able to remove some
- // extra operand negations as well.
- for (int i = 0; i != 3; ++i) {
- char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (V == 2)
- return V;
- }
- return 1;
- }
- }
-
- return TargetLowering::isNegatibleForFree(Op, DAG, LegalOperations,
- ForCodeSize, Depth);
-}
-
SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations,
bool ForCodeSize,
+ NegatibleCost &Cost,
unsigned Depth) const {
// fneg patterns are removable even if they have multiple uses.
- if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth))
+ if (SDValue Arg = isFNEG(DAG, Op.getNode(), Depth)) {
+ Cost = NegatibleCost::Cheaper;
return DAG.getBitcast(Op.getValueType(), Arg);
+ }
EVT VT = Op.getValueType();
EVT SVT = VT.getScalarType();
@@ -42675,35 +45209,41 @@ SDValue X86TargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG,
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB_RND: {
if (!Op.hasOneUse() || !Subtarget.hasAnyFMA() || !isTypeLegal(VT) ||
- !(SVT == MVT::f32 || SVT == MVT::f64) || !LegalOperations)
+ !(SVT == MVT::f32 || SVT == MVT::f64) ||
+ !isOperationLegal(ISD::FMA, VT))
break;
// This is always negatible for free but we might be able to remove some
// extra operand negations as well.
SmallVector<SDValue, 4> NewOps(Op.getNumOperands(), SDValue());
- for (int i = 0; i != 3; ++i) {
- char V = isNegatibleForFree(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- if (V == 2)
- NewOps[i] = getNegatedExpression(Op.getOperand(i), DAG, LegalOperations,
- ForCodeSize, Depth + 1);
- }
+ for (int i = 0; i != 3; ++i)
+ NewOps[i] = getCheaperNegatedExpression(
+ Op.getOperand(i), DAG, LegalOperations, ForCodeSize, Depth + 1);
bool NegA = !!NewOps[0];
bool NegB = !!NewOps[1];
bool NegC = !!NewOps[2];
unsigned NewOpc = negateFMAOpcode(Opc, NegA != NegB, NegC, true);
+ Cost = (NegA || NegB || NegC) ? NegatibleCost::Cheaper
+ : NegatibleCost::Neutral;
+
// Fill in the non-negated ops with the original values.
for (int i = 0, e = Op.getNumOperands(); i != e; ++i)
if (!NewOps[i])
NewOps[i] = Op.getOperand(i);
return DAG.getNode(NewOpc, SDLoc(Op), VT, NewOps);
}
+ case X86ISD::FRCP:
+ if (SDValue NegOp0 =
+ getNegatedExpression(Op.getOperand(0), DAG, LegalOperations,
+ ForCodeSize, Cost, Depth + 1))
+ return DAG.getNode(Opc, SDLoc(Op), VT, NegOp0);
+ break;
}
return TargetLowering::getNegatedExpression(Op, DAG, LegalOperations,
- ForCodeSize, Depth);
+ ForCodeSize, Cost, Depth);
}
static SDValue lowerX86FPLogicOp(SDNode *N, SelectionDAG &DAG,
@@ -42764,6 +45304,9 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue Cmp = foldVectorXorShiftIntoCmp(N, DAG, Subtarget))
return Cmp;
+ if (SDValue R = combineBitOpWithMOVMSK(N, DAG))
+ return R;
+
if (DCI.isBeforeLegalizeOps())
return SDValue();
@@ -42776,33 +45319,21 @@ static SDValue combineXor(SDNode *N, SelectionDAG &DAG,
if (SDValue FPLogic = convertIntLogicToFPLogic(N, DAG, Subtarget))
return FPLogic;
- return combineFneg(N, DAG, Subtarget);
+ return combineFneg(N, DAG, DCI, Subtarget);
}
static SDValue combineBEXTR(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
EVT VT = N->getValueType(0);
unsigned NumBits = VT.getSizeInBits();
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
-
// TODO - Constant Folding.
- if (auto *Cst1 = dyn_cast<ConstantSDNode>(Op1)) {
- // Reduce Cst1 to the bottom 16-bits.
- // NOTE: SimplifyDemandedBits won't do this for constants.
- const APInt &Val1 = Cst1->getAPIntValue();
- APInt MaskedVal1 = Val1 & 0xFFFF;
- if (MaskedVal1 != Val1)
- return DAG.getNode(X86ISD::BEXTR, SDLoc(N), VT, Op0,
- DAG.getConstant(MaskedVal1, SDLoc(N), VT));
- }
-
- // Only bottom 16-bits of the control bits are required.
- APInt DemandedMask(APInt::getLowBitsSet(NumBits, 16));
- if (TLI.SimplifyDemandedBits(Op1, DemandedMask, DCI))
+
+ // Simplify the inputs.
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
+ APInt DemandedMask(APInt::getAllOnesValue(NumBits));
+ if (TLI.SimplifyDemandedBits(SDValue(N, 0), DemandedMask, DCI))
return SDValue(N, 0);
return SDValue();
@@ -42893,6 +45424,7 @@ static SDValue combineFAndn(SDNode *N, SelectionDAG &DAG,
/// Do target-specific dag combines on X86ISD::FOR and X86ISD::FXOR nodes.
static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(N->getOpcode() == X86ISD::FOR || N->getOpcode() == X86ISD::FXOR);
@@ -42904,7 +45436,7 @@ static SDValue combineFOr(SDNode *N, SelectionDAG &DAG,
if (isNullFPScalarOrVectorConst(N->getOperand(1)))
return N->getOperand(0);
- if (SDValue NewVal = combineFneg(N, DAG, Subtarget))
+ if (SDValue NewVal = combineFneg(N, DAG, DCI, Subtarget))
return NewVal;
return lowerX86FPLogicOp(N, DAG, Subtarget);
@@ -43015,23 +45547,16 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(0));
- // Unless the load is volatile or atomic.
- if (LN->isSimple()) {
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getIntegerVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
- MVT MemVT = MVT::getIntegerVT(NumBits);
- MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
- SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
- LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
DAG.getBitcast(InVT, VZLoad));
DCI.CombineTo(N, Convert);
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
@@ -43041,33 +45566,33 @@ static SDValue combineX86INT_TO_FP(SDNode *N, SelectionDAG &DAG,
static SDValue combineCVTP2I_CVTTP2I(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- // FIXME: Handle strict fp nodes.
+ bool IsStrict = N->isTargetStrictFPOpcode();
EVT VT = N->getValueType(0);
// Convert a full vector load into vzload when not all bits are needed.
- SDValue In = N->getOperand(0);
+ SDValue In = N->getOperand(IsStrict ? 1 : 0);
MVT InVT = In.getSimpleValueType();
if (VT.getVectorNumElements() < InVT.getVectorNumElements() &&
ISD::isNormalLoad(In.getNode()) && In.hasOneUse()) {
assert(InVT.is128BitVector() && "Expected 128-bit input vector");
LoadSDNode *LN = cast<LoadSDNode>(In);
- // Unless the load is volatile or atomic.
- if (LN->isSimple()) {
+ unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
+ MVT MemVT = MVT::getFloatingPointVT(NumBits);
+ MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MemVT, LoadVT, DAG)) {
SDLoc dl(N);
- unsigned NumBits = InVT.getScalarSizeInBits() * VT.getVectorNumElements();
- MVT MemVT = MVT::getFloatingPointVT(NumBits);
- MVT LoadVT = MVT::getVectorVT(MemVT, 128 / NumBits);
- SDVTList Tys = DAG.getVTList(LoadVT, MVT::Other);
- SDValue Ops[] = { LN->getChain(), LN->getBasePtr() };
- SDValue VZLoad =
- DAG.getMemIntrinsicNode(X86ISD::VZEXT_LOAD, dl, Tys, Ops, MemVT,
- LN->getPointerInfo(),
- LN->getAlignment(),
- LN->getMemOperand()->getFlags());
- SDValue Convert = DAG.getNode(N->getOpcode(), dl, VT,
- DAG.getBitcast(InVT, VZLoad));
- DCI.CombineTo(N, Convert);
+ if (IsStrict) {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, {VT, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(InVT, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert =
+ DAG.getNode(N->getOpcode(), dl, VT, DAG.getBitcast(InVT, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
return SDValue(N, 0);
}
}
@@ -43106,14 +45631,58 @@ static SDValue combineAndnp(SDNode *N, SelectionDAG &DAG,
static SDValue combineBT(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
- SDValue N0 = N->getOperand(0);
SDValue N1 = N->getOperand(1);
// BT ignores high bits in the bit index operand.
unsigned BitWidth = N1.getValueSizeInBits();
APInt DemandedMask = APInt::getLowBitsSet(BitWidth, Log2_32(BitWidth));
- if (SDValue DemandedN1 = DAG.GetDemandedBits(N1, DemandedMask))
- return DAG.getNode(X86ISD::BT, SDLoc(N), MVT::i32, N0, DemandedN1);
+ if (DAG.getTargetLoweringInfo().SimplifyDemandedBits(N1, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ return SDValue();
+}
+
+static SDValue combineCVTPH2PS(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ bool IsStrict = N->getOpcode() == X86ISD::STRICT_CVTPH2PS;
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+
+ if (N->getValueType(0) == MVT::v4f32 && Src.getValueType() == MVT::v8i16) {
+ APInt KnownUndef, KnownZero;
+ const TargetLowering &TLI = DAG.getTargetLoweringInfo();
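+    // Only the low 4 of the 8 source i16 elements feed the v4f32 result.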
+ APInt DemandedElts = APInt::getLowBitsSet(8, 4);
+ if (TLI.SimplifyDemandedVectorElts(Src, DemandedElts, KnownUndef, KnownZero,
+ DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
+ return SDValue(N, 0);
+ }
+
+ // Convert a full vector load into vzload when not all bits are needed.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(N->getOperand(IsStrict ? 1 : 0));
+ if (SDValue VZLoad = narrowLoadToVZLoad(LN, MVT::i64, MVT::v2i64, DAG)) {
+ SDLoc dl(N);
+ if (IsStrict) {
+ SDValue Convert = DAG.getNode(
+ N->getOpcode(), dl, {MVT::v4f32, MVT::Other},
+ {N->getOperand(0), DAG.getBitcast(MVT::v8i16, VZLoad)});
+ DCI.CombineTo(N, Convert, Convert.getValue(1));
+ } else {
+ SDValue Convert = DAG.getNode(N->getOpcode(), dl, MVT::v4f32,
+ DAG.getBitcast(MVT::v8i16, VZLoad));
+ DCI.CombineTo(N, Convert);
+ }
+
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), VZLoad.getValue(1));
+ DCI.recursivelyDeleteUnusedNodes(LN);
+ return SDValue(N, 0);
+ }
+ }
+ }
return SDValue();
}
@@ -43199,7 +45768,7 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
//(sext_in_reg (v4i64 anyext (v4i32 x )), ExtraVT) ->
// (v4i64 sext (v4i32 sext_in_reg (v4i32 x , ExtraVT)))
if (VT == MVT::v4i64 && (N0.getOpcode() == ISD::ANY_EXTEND ||
- N0.getOpcode() == ISD::SIGN_EXTEND)) {
+ N0.getOpcode() == ISD::SIGN_EXTEND)) {
SDValue N00 = N0.getOperand(0);
// EXTLOAD has a better solution on AVX2,
@@ -43208,9 +45777,14 @@ static SDValue combineSignExtendInReg(SDNode *N, SelectionDAG &DAG,
if (!ISD::isNormalLoad(N00.getNode()))
return SDValue();
+  // Attempt to promote any comparison mask ops before the SIGN_EXTEND_INREG
+  // gets in the way.
+ if (SDValue Promote = PromoteMaskArithmetic(N0.getNode(), DAG, Subtarget))
+ return DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, VT, Promote, N1);
+
if (N00.getValueType() == MVT::v4i32 && ExtraVT.getSizeInBits() < 128) {
- SDValue Tmp = DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32,
- N00, N1);
+ SDValue Tmp =
+ DAG.getNode(ISD::SIGN_EXTEND_INREG, dl, MVT::v4i32, N00, N1);
return DAG.getNode(ISD::SIGN_EXTEND, dl, MVT::v4i64, Tmp);
}
}
@@ -43395,6 +45969,21 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
for (unsigned i = 0; i != Scale; ++i)
ShuffleMask.append(EltSizeInBits, i);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
+ } else if (Subtarget.hasAVX2() && NumElts < EltSizeInBits &&
+ (SclVT == MVT::i8 || SclVT == MVT::i16 || SclVT == MVT::i32)) {
+ // If we have register broadcast instructions, use the scalar size as the
+ // element type for the shuffle. Then cast to the wider element type. The
+ // widened bits won't be used, and this might allow the use of a broadcast
+ // load.
+ assert((EltSizeInBits % NumElts) == 0 && "Unexpected integer scale");
+ unsigned Scale = EltSizeInBits / NumElts;
+ EVT BroadcastVT =
+ EVT::getVectorVT(*DAG.getContext(), SclVT, NumElts * Scale);
+ Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, BroadcastVT, N00);
+ ShuffleMask.append(NumElts * Scale, 0);
+ Vec = DAG.getVectorShuffle(BroadcastVT, DL, Vec, Vec, ShuffleMask);
+ Vec = DAG.getBitcast(VT, Vec);
} else {
// For smaller scalar integers, we can simply any-extend it to the vector
// element size (we don't care about the upper bits) and broadcast it to all
@@ -43402,8 +45991,8 @@ combineToExtendBoolVectorInReg(SDNode *N, SelectionDAG &DAG,
SDValue Scl = DAG.getAnyExtOrTrunc(N00, DL, SVT);
Vec = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, VT, Scl);
ShuffleMask.append(NumElts, 0);
+ Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
}
- Vec = DAG.getVectorShuffle(VT, DL, Vec, Vec, ShuffleMask);
// Now, mask the relevant bit in each element.
SmallVector<SDValue, 32> Bits;
@@ -43448,7 +46037,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
  // We can only do this if the vector size is 256 bits or less.
unsigned Size = VT.getSizeInBits();
- if (Size > 256)
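+  // (If 512-bit registers aren't being used, the wider type will be split
+  //  anyway, so the combine below is still worthwhile.)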
+ if (Size > 256 && Subtarget.useAVX512Regs())
return SDValue();
// Don't fold if the condition code can't be handled by PCMPEQ/PCMPGT since
@@ -43466,7 +46055,7 @@ static SDValue combineExtSetcc(SDNode *N, SelectionDAG &DAG,
SDValue Res = DAG.getSetCC(dl, VT, N0.getOperand(0), N0.getOperand(1), CC);
if (N->getOpcode() == ISD::ZERO_EXTEND)
- Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType().getScalarType());
+ Res = DAG.getZeroExtendInReg(Res, dl, N0.getValueType());
return Res;
}
@@ -43479,6 +46068,23 @@ static SDValue combineSext(SDNode *N, SelectionDAG &DAG,
EVT InVT = N0.getValueType();
SDLoc DL(N);
+ // (i32 (sext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ if (!DCI.isBeforeLegalizeOps() &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, DL, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
+ }
+
+ return SDValue(N, 0);
+ }
+
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
return NewCMov;
@@ -43516,6 +46122,7 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
const X86Subtarget &Subtarget) {
SDLoc dl(N);
EVT VT = N->getValueType(0);
+ bool IsStrict = N->isStrictFPOpcode() || N->isTargetStrictFPOpcode();
// Let legalize expand this if it isn't a legal type yet.
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -43526,15 +46133,16 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if ((ScalarVT != MVT::f32 && ScalarVT != MVT::f64) || !Subtarget.hasAnyFMA())
return SDValue();
- SDValue A = N->getOperand(0);
- SDValue B = N->getOperand(1);
- SDValue C = N->getOperand(2);
+ SDValue A = N->getOperand(IsStrict ? 1 : 0);
+ SDValue B = N->getOperand(IsStrict ? 2 : 1);
+ SDValue C = N->getOperand(IsStrict ? 3 : 2);
auto invertIfNegative = [&DAG, &TLI, &DCI](SDValue &V) {
bool CodeSize = DAG.getMachineFunction().getFunction().hasOptSize();
bool LegalOperations = !DCI.isBeforeLegalizeOps();
- if (TLI.isNegatibleForFree(V, DAG, LegalOperations, CodeSize) == 2) {
- V = TLI.getNegatedExpression(V, DAG, LegalOperations, CodeSize);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(V, DAG, LegalOperations,
+ CodeSize)) {
+ V = NegV;
return true;
}
// Look through extract_vector_elts. If it comes from an FNEG, create a
@@ -43542,11 +46150,10 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
if (V.getOpcode() == ISD::EXTRACT_VECTOR_ELT &&
isNullConstant(V.getOperand(1))) {
SDValue Vec = V.getOperand(0);
- if (TLI.isNegatibleForFree(Vec, DAG, LegalOperations, CodeSize) == 2) {
- SDValue NegVal =
- TLI.getNegatedExpression(Vec, DAG, LegalOperations, CodeSize);
+ if (SDValue NegV = TLI.getCheaperNegatedExpression(
+ Vec, DAG, LegalOperations, CodeSize)) {
V = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, SDLoc(V), V.getValueType(),
- NegVal, V.getOperand(1));
+ NegV, V.getOperand(1));
return true;
}
}
@@ -43566,9 +46173,15 @@ static SDValue combineFMA(SDNode *N, SelectionDAG &DAG,
unsigned NewOpcode =
negateFMAOpcode(N->getOpcode(), NegA != NegB, NegC, false);
- if (N->getNumOperands() == 4)
- return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
- return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ if (IsStrict) {
+ assert(N->getNumOperands() == 4 && "Shouldn't be greater than 4");
+ return DAG.getNode(NewOpcode, dl, {VT, MVT::Other},
+ {N->getOperand(0), A, B, C});
+ } else {
+ if (N->getNumOperands() == 4)
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C, N->getOperand(3));
+ return DAG.getNode(NewOpcode, dl, VT, A, B, C);
+ }
}
// Combine FMADDSUB(A, B, FNEG(C)) -> FMSUBADD(A, B, C)
@@ -43582,10 +46195,11 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
bool LegalOperations = !DCI.isBeforeLegalizeOps();
SDValue N2 = N->getOperand(2);
- if (TLI.isNegatibleForFree(N2, DAG, LegalOperations, CodeSize) != 2)
- return SDValue();
- SDValue NegN2 = TLI.getNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ SDValue NegN2 =
+ TLI.getCheaperNegatedExpression(N2, DAG, LegalOperations, CodeSize);
+ if (!NegN2)
+ return SDValue();
unsigned NewOpcode = negateFMAOpcode(N->getOpcode(), false, true, false);
if (N->getNumOperands() == 4)
@@ -43598,38 +46212,26 @@ static SDValue combineFMADDSUB(SDNode *N, SelectionDAG &DAG,
static SDValue combineZext(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- // (i32 zext (and (i8 x86isd::setcc_carry), 1)) ->
- // (and (i32 x86isd::setcc_carry), 1)
- // This eliminates the zext. This transformation is necessary because
- // ISD::SETCC is always legalized to i8.
SDLoc dl(N);
SDValue N0 = N->getOperand(0);
EVT VT = N->getValueType(0);
- if (N0.getOpcode() == ISD::AND &&
- N0.hasOneUse() &&
- N0.getOperand(0).hasOneUse()) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- if (!isOneConstant(N0.getOperand(1)))
- return SDValue();
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, dl, VT));
+ // (i32 (aext (i8 (x86isd::setcc_carry)))) -> (i32 (x86isd::setcc_carry))
+ // FIXME: Is this needed? We don't seem to have any tests for it.
+ if (!DCI.isBeforeLegalizeOps() && N->getOpcode() == ISD::ANY_EXTEND &&
+ N0.getOpcode() == X86ISD::SETCC_CARRY) {
+ SDValue Setcc = DAG.getNode(X86ISD::SETCC_CARRY, dl, VT, N0->getOperand(0),
+ N0->getOperand(1));
+ bool ReplaceOtherUses = !N0.hasOneUse();
+ DCI.CombineTo(N, Setcc);
+ // Replace other uses with a truncate of the widened setcc_carry.
+ if (ReplaceOtherUses) {
+ SDValue Trunc = DAG.getNode(ISD::TRUNCATE, SDLoc(N0),
+ N0.getValueType(), Setcc);
+ DCI.CombineTo(N0.getNode(), Trunc);
}
- }
- if (N0.getOpcode() == ISD::TRUNCATE &&
- N0.hasOneUse() &&
- N0.getOperand(0).hasOneUse()) {
- SDValue N00 = N0.getOperand(0);
- if (N00.getOpcode() == X86ISD::SETCC_CARRY) {
- return DAG.getNode(ISD::AND, dl, VT,
- DAG.getNode(X86ISD::SETCC_CARRY, dl, VT,
- N00.getOperand(0), N00.getOperand(1)),
- DAG.getConstant(1, dl, VT));
- }
+ return SDValue(N, 0);
}
if (SDValue NewCMov = combineToExtendCMOV(N, DAG))
@@ -43742,13 +46344,12 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
EVT VT = SetCC->getValueType(0);
SDLoc DL(SetCC);
- bool HasAVX = Subtarget.hasAVX();
// Use XOR (plus OR) and PTEST after SSE4.1 for 128/256-bit operands.
// Use PCMPNEQ (plus OR) and KORTEST for 512-bit operands.
// Otherwise use PCMPEQ (plus AND) and mask testing.
if ((OpSize == 128 && Subtarget.hasSSE2()) ||
- (OpSize == 256 && HasAVX) ||
+ (OpSize == 256 && Subtarget.hasAVX()) ||
(OpSize == 512 && Subtarget.useAVX512Regs())) {
bool HasPT = Subtarget.hasSSE41();
@@ -43802,11 +46403,9 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
X = DAG.getBitcast(TmpCastVT, X);
if (!NeedZExt && !TmpZext)
return X;
- const TargetLowering &TLI = DAG.getTargetLoweringInfo();
- MVT VecIdxVT = TLI.getVectorIdxTy(DAG.getDataLayout());
return DAG.getNode(ISD::INSERT_SUBVECTOR, DL, VecVT,
DAG.getConstant(0, DL, VecVT), X,
- DAG.getConstant(0, DL, VecIdxVT));
+ DAG.getVectorIdxConstant(0, DL));
};
SDValue Cmp;
@@ -43839,17 +46438,16 @@ static SDValue combineVectorSizedSetCCEquality(SDNode *SetCC, SelectionDAG &DAG,
Cmp);
SDValue PT = DAG.getNode(X86ISD::PTEST, DL, MVT::i32, BCCmp, BCCmp);
X86::CondCode X86CC = CC == ISD::SETEQ ? X86::COND_E : X86::COND_NE;
- SDValue SetCC = getSETCC(X86CC, PT, DL, DAG);
- return DAG.getNode(ISD::TRUNCATE, DL, VT, SetCC.getValue(0));
+ SDValue X86SetCC = getSETCC(X86CC, PT, DL, DAG);
+ return DAG.getNode(ISD::TRUNCATE, DL, VT, X86SetCC.getValue(0));
}
// If all bytes match (bitmask is 0x(FFFF)FFFF), that's equality.
// setcc i128 X, Y, eq --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, eq
// setcc i128 X, Y, ne --> setcc (pmovmskb (pcmpeqb X, Y)), 0xFFFF, ne
- // setcc i256 X, Y, eq --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, eq
- // setcc i256 X, Y, ne --> setcc (vpmovmskb (vpcmpeqb X, Y)), 0xFFFFFFFF, ne
+ assert(Cmp.getValueType() == MVT::v16i8 &&
+ "Non 128-bit vector on pre-SSE41 target");
SDValue MovMsk = DAG.getNode(X86ISD::MOVMSK, DL, MVT::i32, Cmp);
- SDValue FFFFs = DAG.getConstant(OpSize == 128 ? 0xFFFF : 0xFFFFFFFF, DL,
- MVT::i32);
+ SDValue FFFFs = DAG.getConstant(0xFFFF, DL, MVT::i32);
return DAG.getSetCC(DL, VT, MovMsk, FFFFs, CC);
}
@@ -43866,23 +46464,16 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
SDLoc DL(N);
if (CC == ISD::SETNE || CC == ISD::SETEQ) {
- // 0-x == y --> x+y == 0
- // 0-x != y --> x+y != 0
- if (LHS.getOpcode() == ISD::SUB && isNullConstant(LHS.getOperand(0)) &&
- LHS.hasOneUse()) {
- SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, RHS, LHS.getOperand(1));
- return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
- }
- // x == 0-y --> x+y == 0
- // x != 0-y --> x+y != 0
- if (RHS.getOpcode() == ISD::SUB && isNullConstant(RHS.getOperand(0)) &&
- RHS.hasOneUse()) {
- SDValue Add = DAG.getNode(ISD::ADD, DL, OpVT, LHS, RHS.getOperand(1));
- return DAG.getSetCC(DL, VT, Add, DAG.getConstant(0, DL, OpVT), CC);
- }
-
if (SDValue V = combineVectorSizedSetCCEquality(N, DAG, Subtarget))
return V;
+
+ if (VT == MVT::i1 && isNullConstant(RHS)) {
+ SDValue X86CC;
+ if (SDValue V =
+ MatchVectorAllZeroTest(LHS, CC, DL, Subtarget, DAG, X86CC))
+ return DAG.getNode(ISD::TRUNCATE, DL, VT,
+ DAG.getNode(X86ISD::SETCC, DL, MVT::i8, X86CC, V));
+ }
}
if (VT.isVector() && VT.getVectorElementType() == MVT::i1 &&
@@ -43905,7 +46496,7 @@ static SDValue combineSetCC(SDNode *N, SelectionDAG &DAG,
if (IsSEXT0 && IsVZero1) {
assert(VT == Op0.getOperand(0).getValueType() &&
- "Uexpected operand type");
+ "Unexpected operand type");
if (TmpCC == ISD::SETGT)
return DAG.getConstant(0, DL, VT);
if (TmpCC == ISD::SETLE)
@@ -43995,20 +46586,43 @@ static SDValue combineX86GatherScatter(SDNode *N, SelectionDAG &DAG,
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
}
return SDValue();
}
+static SDValue rebuildGatherScatter(MaskedGatherScatterSDNode *GorS,
+ SDValue Index, SDValue Base, SDValue Scale,
+ SelectionDAG &DAG) {
+ SDLoc DL(GorS);
+
+ if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
+ SDValue Ops[] = { Gather->getChain(), Gather->getPassThru(),
+ Gather->getMask(), Base, Index, Scale } ;
+ return DAG.getMaskedGather(Gather->getVTList(),
+ Gather->getMemoryVT(), DL, Ops,
+ Gather->getMemOperand(),
+ Gather->getIndexType());
+ }
+ auto *Scatter = cast<MaskedScatterSDNode>(GorS);
+ SDValue Ops[] = { Scatter->getChain(), Scatter->getValue(),
+ Scatter->getMask(), Base, Index, Scale };
+ return DAG.getMaskedScatter(Scatter->getVTList(),
+ Scatter->getMemoryVT(), DL,
+ Ops, Scatter->getMemOperand(),
+ Scatter->getIndexType());
+}
+
static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI) {
SDLoc DL(N);
auto *GorS = cast<MaskedGatherScatterSDNode>(N);
- SDValue Chain = GorS->getChain();
SDValue Index = GorS->getIndex();
- SDValue Mask = GorS->getMask();
SDValue Base = GorS->getBasePtr();
SDValue Scale = GorS->getScale();
@@ -44028,21 +46642,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
@@ -44057,21 +46657,7 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
unsigned NumElts = Index.getValueType().getVectorNumElements();
EVT NewVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32, NumElts);
Index = DAG.getNode(ISD::TRUNCATE, DL, NewVT, Index);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
@@ -44084,30 +46670,20 @@ static SDValue combineGatherScatter(SDNode *N, SelectionDAG &DAG,
EVT IndexVT = EVT::getVectorVT(*DAG.getContext(), EltVT,
Index.getValueType().getVectorNumElements());
Index = DAG.getSExtOrTrunc(Index, DL, IndexVT);
- if (auto *Gather = dyn_cast<MaskedGatherSDNode>(GorS)) {
- SDValue Ops[] = { Chain, Gather->getPassThru(),
- Mask, Base, Index, Scale } ;
- return DAG.getMaskedGather(Gather->getVTList(),
- Gather->getMemoryVT(), DL, Ops,
- Gather->getMemOperand(),
- Gather->getIndexType());
- }
- auto *Scatter = cast<MaskedScatterSDNode>(GorS);
- SDValue Ops[] = { Chain, Scatter->getValue(),
- Mask, Base, Index, Scale };
- return DAG.getMaskedScatter(Scatter->getVTList(),
- Scatter->getMemoryVT(), DL,
- Ops, Scatter->getMemOperand(),
- Scatter->getIndexType());
+ return rebuildGatherScatter(GorS, Index, Base, Scale, DAG);
}
}
// With vector masks we only demand the upper bit of the mask.
+ SDValue Mask = GorS->getMask();
if (Mask.getScalarValueSizeInBits() != 1) {
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
APInt DemandedMask(APInt::getSignMask(Mask.getScalarValueSizeInBits()));
- if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI))
+ if (TLI.SimplifyDemandedBits(Mask, DemandedMask, DCI)) {
+ if (N->getOpcode() != ISD::DELETED_NODE)
+ DCI.AddToWorklist(N);
return SDValue(N, 0);
+ }
}
return SDValue();
@@ -44146,10 +46722,11 @@ static SDValue combineBrCond(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// TODO: Could we move this to DAGCombine?
static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
SelectionDAG &DAG) {
- // Take advantage of vector comparisons producing 0 or -1 in each lane to
- // optimize away operation when it's from a constant.
+ // Take advantage of vector comparisons (etc.) producing 0 or -1 in each lane
+ // to optimize away the operation when it's from a constant.
//
// The general transformation is:
// UNARYOP(AND(VECTOR_CMP(x,y), constant)) -->
@@ -44161,9 +46738,10 @@ static SDValue combineVectorCompareAndMaskUnaryOp(SDNode *N,
// aren't the same.
EVT VT = N->getValueType(0);
bool IsStrict = N->isStrictFPOpcode();
+ unsigned NumEltBits = VT.getScalarSizeInBits();
SDValue Op0 = N->getOperand(IsStrict ? 1 : 0);
- if (!VT.isVector() || Op0->getOpcode() != ISD::AND ||
- Op0->getOperand(0)->getOpcode() != ISD::SETCC ||
+ if (!VT.isVector() || Op0.getOpcode() != ISD::AND ||
+ DAG.ComputeNumSignBits(Op0.getOperand(0)) != NumEltBits ||
VT.getSizeInBits() != Op0.getValueSizeInBits())
return SDValue();
@@ -44336,7 +46914,6 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (!Subtarget.useSoftFloat() && Subtarget.hasX87() &&
Op0.getOpcode() == ISD::LOAD) {
LoadSDNode *Ld = cast<LoadSDNode>(Op0.getNode());
- EVT LdVT = Ld->getValueType(0);
// This transformation is not supported if the result type is f16 or f128.
if (VT == MVT::f16 || VT == MVT::f128)
@@ -44347,11 +46924,12 @@ static SDValue combineSIntToFP(SDNode *N, SelectionDAG &DAG,
if (Subtarget.hasDQI() && VT != MVT::f80)
return SDValue();
- if (Ld->isSimple() && !VT.isVector() &&
- ISD::isNON_EXTLoad(Op0.getNode()) && Op0.hasOneUse() &&
- !Subtarget.is64Bit() && LdVT == MVT::i64) {
- std::pair<SDValue, SDValue> Tmp = Subtarget.getTargetLowering()->BuildFILD(
- SDValue(N, 0), LdVT, Ld->getChain(), Op0, DAG);
+ if (Ld->isSimple() && !VT.isVector() && ISD::isNormalLoad(Op0.getNode()) &&
+ Op0.hasOneUse() && !Subtarget.is64Bit() && InVT == MVT::i64) {
+ std::pair<SDValue, SDValue> Tmp =
+ Subtarget.getTargetLowering()->BuildFILD(
+ VT, InVT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(),
+ Ld->getPointerInfo(), Ld->getOriginalAlign(), DAG);
DAG.ReplaceAllUsesOfValueWith(Op0.getValue(1), Tmp.second);
return Tmp.first;
}
@@ -44685,7 +47263,7 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
}
if (CC == X86::COND_A) {
- SDValue EFLAGS = Y->getOperand(1);
+ SDValue EFLAGS = Y.getOperand(1);
// Try to convert COND_A into COND_B in an attempt to facilitate
// materializing "setb reg".
//
@@ -44698,13 +47276,44 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
SDValue NewSub = DAG.getNode(X86ISD::SUB, SDLoc(EFLAGS),
EFLAGS.getNode()->getVTList(),
EFLAGS.getOperand(1), EFLAGS.getOperand(0));
- SDValue NewEFLAGS = SDValue(NewSub.getNode(), EFLAGS.getResNo());
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL,
DAG.getVTList(VT, MVT::i32), X,
DAG.getConstant(0, DL, VT), NewEFLAGS);
}
}
+ if (CC == X86::COND_AE) {
+ // X + SETAE --> sbb X, -1
+ // X - SETAE --> adc X, -1
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), Y.getOperand(1));
+ }
+
+ if (CC == X86::COND_BE) {
+ // X + SETBE --> sbb X, -1
+ // X - SETBE --> adc X, -1
+ SDValue EFLAGS = Y.getOperand(1);
+ // Try to convert COND_BE into COND_AE in an attempt to facilitate
+ // materializing "setae reg".
+ //
+ // Do not flip "e <= c", where "c" is a constant, because Cmp instruction
+ // cannot take an immediate as its first operand.
+ //
+ if (EFLAGS.getOpcode() == X86ISD::SUB && EFLAGS.getNode()->hasOneUse() &&
+ EFLAGS.getValueType().isInteger() &&
+ !isa<ConstantSDNode>(EFLAGS.getOperand(1))) {
+ SDValue NewSub = DAG.getNode(
+ X86ISD::SUB, SDLoc(EFLAGS), EFLAGS.getNode()->getVTList(),
+ EFLAGS.getOperand(1), EFLAGS.getOperand(0));
+ SDValue NewEFLAGS = NewSub.getValue(EFLAGS.getResNo());
+ return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL,
+ DAG.getVTList(VT, MVT::i32), X,
+ DAG.getConstant(-1, DL, VT), NewEFLAGS);
+ }
+ }
+
if (CC != X86::COND_E && CC != X86::COND_NE)
return SDValue();
@@ -44741,15 +47350,18 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
if ((IsSub && CC == X86::COND_E && ConstantX->isNullValue()) ||
(!IsSub && CC == X86::COND_NE && ConstantX->isAllOnesValue())) {
SDValue One = DAG.getConstant(1, DL, ZVT);
- SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
return DAG.getNode(X86ISD::SETCC_CARRY, DL, VT,
- DAG.getTargetConstant(X86::COND_B, DL, MVT::i8), Cmp1);
+ DAG.getTargetConstant(X86::COND_B, DL, MVT::i8),
+ Cmp1.getValue(1));
}
}
// (cmp Z, 1) sets the carry flag if Z is 0.
SDValue One = DAG.getConstant(1, DL, ZVT);
- SDValue Cmp1 = DAG.getNode(X86ISD::CMP, DL, MVT::i32, Z, One);
+ SDVTList X86SubVTs = DAG.getVTList(ZVT, MVT::i32);
+ SDValue Cmp1 = DAG.getNode(X86ISD::SUB, DL, X86SubVTs, Z, One);
// Add the flags type for ADC/SBB nodes.
SDVTList VTs = DAG.getVTList(VT, MVT::i32);
@@ -44758,151 +47370,12 @@ static SDValue combineAddOrSubToADCOrSBB(SDNode *N, SelectionDAG &DAG) {
// X + (Z != 0) --> add X, (zext(setne Z, 0)) --> sbb X, -1, (cmp Z, 1)
if (CC == X86::COND_NE)
return DAG.getNode(IsSub ? X86ISD::ADC : X86ISD::SBB, DL, VTs, X,
- DAG.getConstant(-1ULL, DL, VT), Cmp1);
+ DAG.getConstant(-1ULL, DL, VT), Cmp1.getValue(1));
// X - (Z == 0) --> sub X, (zext(sete Z, 0)) --> sbb X, 0, (cmp Z, 1)
// X + (Z == 0) --> add X, (zext(sete Z, 0)) --> adc X, 0, (cmp Z, 1)
return DAG.getNode(IsSub ? X86ISD::SBB : X86ISD::ADC, DL, VTs, X,
- DAG.getConstant(0, DL, VT), Cmp1);
-}
-
-static SDValue combineLoopMAddPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- EVT VT = N->getValueType(0);
-
- // If the vector size is less than 128, or greater than the supported RegSize,
- // do not use PMADD.
- if (!VT.isVector() || VT.getVectorNumElements() < 8)
- return SDValue();
-
- SDValue Op0 = N->getOperand(0);
- SDValue Op1 = N->getOperand(1);
-
- auto UsePMADDWD = [&](SDValue Op) {
- ShrinkMode Mode;
- return Op.getOpcode() == ISD::MUL &&
- canReduceVMulWidth(Op.getNode(), DAG, Mode) &&
- Mode != ShrinkMode::MULU16 &&
- (!Subtarget.hasSSE41() ||
- (Op->isOnlyUserOf(Op.getOperand(0).getNode()) &&
- Op->isOnlyUserOf(Op.getOperand(1).getNode())));
- };
-
- SDValue MulOp, OtherOp;
- if (UsePMADDWD(Op0)) {
- MulOp = Op0;
- OtherOp = Op1;
- } else if (UsePMADDWD(Op1)) {
- MulOp = Op1;
- OtherOp = Op0;
- } else
- return SDValue();
-
- SDLoc DL(N);
- EVT ReducedVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- VT.getVectorNumElements());
- EVT MAddVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
- VT.getVectorNumElements() / 2);
-
- // Shrink the operands of mul.
- SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(0));
- SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, ReducedVT, MulOp->getOperand(1));
-
- // Madd vector size is half of the original vector size
- auto PMADDWDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
- ArrayRef<SDValue> Ops) {
- MVT OpVT = MVT::getVectorVT(MVT::i32, Ops[0].getValueSizeInBits() / 32);
- return DAG.getNode(X86ISD::VPMADDWD, DL, OpVT, Ops);
- };
- SDValue Madd = SplitOpsAndApply(DAG, Subtarget, DL, MAddVT, { N0, N1 },
- PMADDWDBuilder);
- // Fill the rest of the output with 0
- SDValue Zero = DAG.getConstant(0, DL, Madd.getSimpleValueType());
- SDValue Concat = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Madd, Zero);
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Concat, OtherOp, Flags);
-}
-
-static SDValue combineLoopSADPattern(SDNode *N, SelectionDAG &DAG,
- const X86Subtarget &Subtarget) {
- if (!Subtarget.hasSSE2())
- return SDValue();
-
- SDLoc DL(N);
- EVT VT = N->getValueType(0);
-
- // TODO: There's nothing special about i32, any integer type above i16 should
- // work just as well.
- if (!VT.isVector() || !VT.isSimple() ||
- !(VT.getVectorElementType() == MVT::i32))
- return SDValue();
-
- unsigned RegSize = 128;
- if (Subtarget.useBWIRegs())
- RegSize = 512;
- else if (Subtarget.hasAVX())
- RegSize = 256;
-
- // We only handle v16i32 for SSE2 / v32i32 for AVX / v64i32 for AVX512.
- // TODO: We should be able to handle larger vectors by splitting them before
- // feeding them into several SADs, and then reducing over those.
- if (VT.getSizeInBits() / 4 > RegSize)
- return SDValue();
-
- // We know N is a reduction add. To match SAD, we need one of the operands to
- // be an ABS.
- SDValue AbsOp = N->getOperand(0);
- SDValue OtherOp = N->getOperand(1);
- if (AbsOp.getOpcode() != ISD::ABS)
- std::swap(AbsOp, OtherOp);
- if (AbsOp.getOpcode() != ISD::ABS)
- return SDValue();
-
- // Check whether we have an abs-diff pattern feeding into the select.
- SDValue SadOp0, SadOp1;
- if(!detectZextAbsDiff(AbsOp, SadOp0, SadOp1))
- return SDValue();
-
- // SAD pattern detected. Now build a SAD instruction and an addition for
- // reduction. Note that the number of elements of the result of SAD is less
- // than the number of elements of its input. Therefore, we could only update
- // part of elements in the reduction vector.
- SDValue Sad = createPSADBW(DAG, SadOp0, SadOp1, DL, Subtarget);
-
- // The output of PSADBW is a vector of i64.
- // We need to turn the vector of i64 into a vector of i32.
- // If the reduction vector is at least as wide as the psadbw result, just
- // bitcast. If it's narrower which can only occur for v2i32, bits 127:16 of
- // the PSADBW will be zero. If we promote/narrow vectors, truncate the v2i64
- // result to v2i32 which will be removed by type legalization. If we widen
- // narrow vectors then we bitcast to v4i32 and extract v2i32.
- MVT ResVT = MVT::getVectorVT(MVT::i32, Sad.getValueSizeInBits() / 32);
- Sad = DAG.getNode(ISD::BITCAST, DL, ResVT, Sad);
-
- if (VT.getSizeInBits() > ResVT.getSizeInBits()) {
- // Fill the upper elements with zero to match the add width.
- assert(VT.getSizeInBits() % ResVT.getSizeInBits() == 0 && "Unexpected VTs");
- unsigned NumConcats = VT.getSizeInBits() / ResVT.getSizeInBits();
- SmallVector<SDValue, 4> Ops(NumConcats, DAG.getConstant(0, DL, ResVT));
- Ops[0] = Sad;
- Sad = DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Ops);
- } else if (VT.getSizeInBits() < ResVT.getSizeInBits()) {
- Sad = DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, Sad,
- DAG.getIntPtrConstant(0, DL));
- }
-
- // Preserve the reduction flag on the ADD. We may need to revisit for the
- // other operand.
- SDNodeFlags Flags;
- Flags.setVectorReduction(true);
- return DAG.getNode(ISD::ADD, DL, VT, Sad, OtherOp, Flags);
+ DAG.getConstant(0, DL, VT), Cmp1.getValue(1));
}
static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
@@ -44994,30 +47467,25 @@ static SDValue matchPMADDWD(SelectionDAG &DAG, SDValue Op0, SDValue Op1,
Mode == ShrinkMode::MULU16)
return SDValue();
+ EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ VT.getVectorNumElements() * 2);
+ SDValue N0 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(0));
+ SDValue N1 = DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Mul.getOperand(1));
+
auto PMADDBuilder = [](SelectionDAG &DAG, const SDLoc &DL,
ArrayRef<SDValue> Ops) {
- // Shrink by adding truncate nodes and let DAGCombine fold with the
- // sources.
EVT InVT = Ops[0].getValueType();
- assert(InVT.getScalarType() == MVT::i32 &&
- "Unexpected scalar element type");
assert(InVT == Ops[1].getValueType() && "Operands' types mismatch");
EVT ResVT = EVT::getVectorVT(*DAG.getContext(), MVT::i32,
InVT.getVectorNumElements() / 2);
- EVT TruncVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
- InVT.getVectorNumElements());
- return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT,
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[0]),
- DAG.getNode(ISD::TRUNCATE, DL, TruncVT, Ops[1]));
+ return DAG.getNode(X86ISD::VPMADDWD, DL, ResVT, Ops[0], Ops[1]);
};
- return SplitOpsAndApply(DAG, Subtarget, DL, VT,
- { Mul.getOperand(0), Mul.getOperand(1) },
- PMADDBuilder);
+ return SplitOpsAndApply(DAG, Subtarget, DL, VT, { N0, N1 }, PMADDBuilder);
}
// Attempt to turn this pattern into PMADDWD.
-// (mul (add (sext (build_vector)), (sext (build_vector))),
-// (add (sext (build_vector)), (sext (build_vector)))
+// (add (mul (sext (build_vector)), (sext (build_vector))),
+// (mul (sext (build_vector)), (sext (build_vector)))
static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
const SDLoc &DL, EVT VT,
const X86Subtarget &Subtarget) {
@@ -45139,13 +47607,6 @@ static SDValue matchPMADDWD_2(SelectionDAG &DAG, SDValue N0, SDValue N1,
static SDValue combineAdd(SDNode *N, SelectionDAG &DAG,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
- const SDNodeFlags Flags = N->getFlags();
- if (Flags.hasVectorReduction()) {
- if (SDValue Sad = combineLoopSADPattern(N, DAG, Subtarget))
- return Sad;
- if (SDValue MAdd = combineLoopMAddPattern(N, DAG, Subtarget))
- return MAdd;
- }
EVT VT = N->getValueType(0);
SDValue Op0 = N->getOperand(0);
SDValue Op1 = N->getOperand(1);
@@ -45236,6 +47697,38 @@ static SDValue combineSubToSubus(SDNode *N, SelectionDAG &DAG,
SubusRHS = MinLHS;
else
return SDValue();
+ } else if (Op1.getOpcode() == ISD::TRUNCATE &&
+ Op1.getOperand(0).getOpcode() == ISD::UMIN &&
+ (EltVT == MVT::i8 || EltVT == MVT::i16)) {
+ // Special case where the UMIN has been truncated. Try to push the truncate
+ // further up. This is similar to the i32/i64 special processing.
+ SubusLHS = Op0;
+ SDValue MinLHS = Op1.getOperand(0).getOperand(0);
+ SDValue MinRHS = Op1.getOperand(0).getOperand(1);
+ EVT TruncVT = Op1.getOperand(0).getValueType();
+ if (!(Subtarget.hasSSSE3() && (TruncVT == MVT::v8i32 ||
+ TruncVT == MVT::v8i64)) &&
+ !(Subtarget.useBWIRegs() && (TruncVT == MVT::v16i32)))
+ return SDValue();
+ SDValue OpToSaturate;
+ if (MinLHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinLHS.getOperand(0) == Op0)
+ OpToSaturate = MinRHS;
+ else if (MinRHS.getOpcode() == ISD::ZERO_EXTEND &&
+ MinRHS.getOperand(0) == Op0)
+ OpToSaturate = MinLHS;
+ else
+ return SDValue();
+
+ // Saturate the non-extended input and then truncate it.
+ SDLoc DL(N);
+ SDValue SaturationConst =
+ DAG.getConstant(APInt::getLowBitsSet(TruncVT.getScalarSizeInBits(),
+ VT.getScalarSizeInBits()),
+ DL, TruncVT);
+ SDValue UMin = DAG.getNode(ISD::UMIN, DL, TruncVT, OpToSaturate,
+ SaturationConst);
+ SubusRHS = DAG.getNode(ISD::TRUNCATE, DL, VT, UMin);
} else
return SDValue();
@@ -45350,6 +47843,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
TargetLowering::DAGCombinerInfo &DCI,
const X86Subtarget &Subtarget) {
assert(Subtarget.hasAVX() && "AVX assumed for concat_vectors");
+ unsigned EltSizeInBits = VT.getScalarSizeInBits();
if (llvm::all_of(Ops, [](SDValue Op) { return Op.isUndef(); }))
return DAG.getUNDEF(VT);
@@ -45360,6 +47854,7 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return getZeroVector(VT, Subtarget, DAG, DL);
SDValue Op0 = Ops[0];
+ bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
// Fold subvector loads into one.
// If needed, look through bitcasts to get to the load.
@@ -45376,13 +47871,28 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
}
// Repeated subvectors.
- if (llvm::all_of(Ops, [Op0](SDValue Op) { return Op == Op0; })) {
+ if (IsSplat) {
// If this broadcast/subv_broadcast is inserted into both halves, use a
// larger broadcast/subv_broadcast.
if (Op0.getOpcode() == X86ISD::VBROADCAST ||
Op0.getOpcode() == X86ISD::SUBV_BROADCAST)
return DAG.getNode(Op0.getOpcode(), DL, VT, Op0.getOperand(0));
+ // If this broadcast_load is inserted into both halves, use a larger
+ // broadcast_load. Update other uses to use an extracted subvector.
+ if (Op0.getOpcode() == X86ISD::VBROADCAST_LOAD) {
+ auto *MemIntr = cast<MemIntrinsicSDNode>(Op0);
+ SDVTList Tys = DAG.getVTList(VT, MVT::Other);
+ SDValue Ops[] = {MemIntr->getChain(), MemIntr->getBasePtr()};
+ SDValue BcastLd = DAG.getMemIntrinsicNode(
+ X86ISD::VBROADCAST_LOAD, DL, Tys, Ops, MemIntr->getMemoryVT(),
+ MemIntr->getMemOperand());
+ DAG.ReplaceAllUsesOfValueWith(
+ Op0, extractSubVector(BcastLd, 0, DAG, DL, Op0.getValueSizeInBits()));
+ DAG.ReplaceAllUsesOfValueWith(SDValue(MemIntr, 1), BcastLd.getValue(1));
+ return BcastLd;
+ }
+
// concat_vectors(movddup(x),movddup(x)) -> broadcast(x)
if (Op0.getOpcode() == X86ISD::MOVDDUP && VT == MVT::v4f64 &&
(Subtarget.hasAVX2() || MayFoldLoad(Op0.getOperand(0))))
@@ -45394,12 +47904,19 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
// concat_vectors(scalar_to_vector(x),scalar_to_vector(x)) -> broadcast(x)
if (Op0.getOpcode() == ISD::SCALAR_TO_VECTOR &&
(Subtarget.hasAVX2() ||
- (VT.getScalarSizeInBits() >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
+ (EltSizeInBits >= 32 && MayFoldLoad(Op0.getOperand(0)))) &&
Op0.getOperand(0).getValueType() == VT.getScalarType())
return DAG.getNode(X86ISD::VBROADCAST, DL, VT, Op0.getOperand(0));
- }
- bool IsSplat = llvm::all_of(Ops, [&Op0](SDValue Op) { return Op == Op0; });
+ // concat_vectors(extract_subvector(broadcast(x)),
+ // extract_subvector(broadcast(x))) -> broadcast(x)
+ if (Op0.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
+ Op0.getOperand(0).getValueType() == VT) {
+ if (Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST ||
+ Op0.getOperand(0).getOpcode() == X86ISD::VBROADCAST_LOAD)
+ return Op0.getOperand(0);
+ }
+ }
// Repeated opcode.
// TODO - combineX86ShufflesRecursively should handle shuffle concatenation
@@ -45409,6 +47926,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
})) {
unsigned NumOps = Ops.size();
switch (Op0.getOpcode()) {
+ case X86ISD::SHUFP: {
+ // Add SHUFPD support if/when necessary.
+ if (!IsSplat && VT.getScalarType() == MVT::f32 &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op.getOperand(2) == Op0.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
+ }
case X86ISD::PSHUFHW:
case X86ISD::PSHUFLW:
case X86ISD::PSHUFD:
@@ -45435,8 +47970,42 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
return DAG.getBitcast(VT, Res);
}
break;
+ case X86ISD::VSHLI:
+ case X86ISD::VSRAI:
+ case X86ISD::VSRLI:
+ if (((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ (EltSizeInBits >= 32 || Subtarget.useBWIRegs()))) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::VPERMI:
+ case X86ISD::VROTLI:
+ case X86ISD::VROTRI:
+ if (VT.is512BitVector() && Subtarget.useAVX512Regs() &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(1) == Op.getOperand(1);
+ })) {
+ SmallVector<SDValue, 2> Src;
+ for (unsigned i = 0; i != NumOps; ++i)
+ Src.push_back(Ops[i].getOperand(0));
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, Src),
+ Op0.getOperand(1));
+ }
+ break;
+ case X86ISD::PACKSS:
case X86ISD::PACKUS:
- if (NumOps == 2 && VT.is256BitVector() && Subtarget.hasInt256()) {
+ if (!IsSplat && NumOps == 2 && VT.is256BitVector() &&
+ Subtarget.hasInt256()) {
SmallVector<SDValue, 2> LHS, RHS;
for (unsigned i = 0; i != NumOps; ++i) {
LHS.push_back(Ops[i].getOperand(0));
@@ -45450,6 +48019,24 @@ static SDValue combineConcatVectorOps(const SDLoc &DL, MVT VT,
DAG.getNode(ISD::CONCAT_VECTORS, DL, SrcVT, RHS));
}
break;
+ case X86ISD::PALIGNR:
+ if (!IsSplat &&
+ ((VT.is256BitVector() && Subtarget.hasInt256()) ||
+ (VT.is512BitVector() && Subtarget.useBWIRegs())) &&
+ llvm::all_of(Ops, [Op0](SDValue Op) {
+ return Op0.getOperand(2) == Op.getOperand(2);
+ })) {
+ SmallVector<SDValue, 2> LHS, RHS;
+ for (unsigned i = 0; i != NumOps; ++i) {
+ LHS.push_back(Ops[i].getOperand(0));
+ RHS.push_back(Ops[i].getOperand(1));
+ }
+ return DAG.getNode(Op0.getOpcode(), DL, VT,
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, LHS),
+ DAG.getNode(ISD::CONCAT_VECTORS, DL, VT, RHS),
+ Op0.getOperand(2));
+ }
+ break;
}
}
@@ -45539,7 +48126,8 @@ static SDValue combineInsertSubvector(SDNode *N, SelectionDAG &DAG,
// if the insert or extract can be represented with a subregister operation.
if (SubVec.getOpcode() == ISD::EXTRACT_SUBVECTOR &&
SubVec.getOperand(0).getSimpleValueType() == OpVT &&
- (IdxVal != 0 || !Vec.isUndef())) {
+ (IdxVal != 0 ||
+ !(Vec.isUndef() || ISD::isBuildVectorAllZeros(Vec.getNode())))) {
int ExtIdxVal = SubVec.getConstantOperandVal(1);
if (ExtIdxVal != 0) {
int VecNumElts = OpVT.getVectorNumElements();
@@ -45628,7 +48216,7 @@ static SDValue narrowExtractedVectorSelect(SDNode *Ext, SelectionDAG &DAG) {
unsigned SelElts = SelVT.getVectorNumElements();
unsigned CastedElts = WideVT.getVectorNumElements();
- unsigned ExtIdx = cast<ConstantSDNode>(Ext->getOperand(1))->getZExtValue();
+ unsigned ExtIdx = Ext->getConstantOperandVal(1);
if (SelElts % CastedElts == 0) {
// The select has the same or more (narrower) elements than the extract
// operand. The extraction index gets scaled by that factor.
@@ -45673,6 +48261,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
MVT VT = N->getSimpleValueType(0);
SDValue InVec = N->getOperand(0);
+ unsigned IdxVal = N->getConstantOperandVal(1);
SDValue InVecBC = peekThroughBitcasts(InVec);
EVT InVecVT = InVec.getValueType();
const TargetLowering &TLI = DAG.getTargetLoweringInfo();
@@ -45690,7 +48279,7 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (isConcatenatedNot(InVecBC.getOperand(0)) ||
isConcatenatedNot(InVecBC.getOperand(1))) {
// extract (and v4i64 X, (not (concat Y1, Y2))), n -> andnp v2i64 X(n), Y1
- SDValue Concat = split256IntArith(InVecBC, DAG);
+ SDValue Concat = splitVectorIntBinary(InVecBC, DAG);
return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT,
DAG.getBitcast(InVecVT, Concat), N->getOperand(1));
}
@@ -45702,8 +48291,6 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
if (SDValue V = narrowExtractedVectorSelect(N, DAG))
return V;
- unsigned IdxVal = cast<ConstantSDNode>(N->getOperand(1))->getZExtValue();
-
if (ISD::isBuildVectorAllZeros(InVec.getNode()))
return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
@@ -45753,6 +48340,43 @@ static SDValue combineExtractSubvector(SDNode *N, SelectionDAG &DAG,
}
}
+ // If we're extracting an upper subvector from a broadcast we should just
+ // extract the lowest subvector instead, which should allow
+ // SimplifyDemandedVectorElts to do more simplifications.
+ if (IdxVal != 0 && (InVec.getOpcode() == X86ISD::VBROADCAST ||
+ InVec.getOpcode() == X86ISD::VBROADCAST_LOAD))
+ return extractSubVector(InVec, 0, DAG, SDLoc(N), VT.getSizeInBits());
+
+ // If we're extracting a broadcasted subvector, just use the source.
+ if (InVec.getOpcode() == X86ISD::SUBV_BROADCAST &&
+ InVec.getOperand(0).getValueType() == VT)
+ return InVec.getOperand(0);
+
+ // Attempt to extract from the source of a shuffle vector.
+ if ((InVecVT.getSizeInBits() % VT.getSizeInBits()) == 0 &&
+ (IdxVal % VT.getVectorNumElements()) == 0) {
+ SmallVector<int, 32> ShuffleMask;
+ SmallVector<int, 32> ScaledMask;
+ SmallVector<SDValue, 2> ShuffleInputs;
+ unsigned NumSubVecs = InVecVT.getSizeInBits() / VT.getSizeInBits();
+ // Decode the shuffle mask and scale it so it's shuffling subvectors.
+ if (getTargetShuffleInputs(InVecBC, ShuffleInputs, ShuffleMask, DAG) &&
+ scaleShuffleElements(ShuffleMask, NumSubVecs, ScaledMask)) {
+ unsigned SubVecIdx = IdxVal / VT.getVectorNumElements();
+ if (ScaledMask[SubVecIdx] == SM_SentinelUndef)
+ return DAG.getUNDEF(VT);
+ if (ScaledMask[SubVecIdx] == SM_SentinelZero)
+ return getZeroVector(VT, Subtarget, DAG, SDLoc(N));
+ SDValue Src = ShuffleInputs[ScaledMask[SubVecIdx] / NumSubVecs];
+ if (Src.getValueSizeInBits() == InVecVT.getSizeInBits()) {
+ unsigned SrcSubVecIdx = ScaledMask[SubVecIdx] % NumSubVecs;
+ unsigned SrcEltIdx = SrcSubVecIdx * VT.getVectorNumElements();
+ return extractSubVector(DAG.getBitcast(InVecVT, Src), SrcEltIdx, DAG,
+ SDLoc(N), VT.getSizeInBits());
+ }
+ }
+ }
+
// If we're extracting the lowest subvector and we're the only user,
// we may be able to perform this with a smaller vector width.
if (IdxVal == 0 && InVec.hasOneUse()) {
@@ -45825,13 +48449,30 @@ static SDValue combineScalarToVector(SDNode *N, SelectionDAG &DAG) {
Src.getOperand(1));
// Reduce v2i64 to v4i32 if we don't need the upper bits.
- // TODO: Move to DAGCombine?
- if (VT == MVT::v2i64 && Src.getOpcode() == ISD::ANY_EXTEND &&
- Src.getValueType() == MVT::i64 && Src.hasOneUse() &&
- Src.getOperand(0).getScalarValueSizeInBits() <= 32)
- return DAG.getBitcast(
- VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
- DAG.getAnyExtOrTrunc(Src.getOperand(0), DL, MVT::i32)));
+ // TODO: Move to DAGCombine/SimplifyDemandedBits?
+ if (VT == MVT::v2i64 || VT == MVT::v2f64) {
+ auto IsAnyExt64 = [](SDValue Op) {
+ if (Op.getValueType() != MVT::i64 || !Op.hasOneUse())
+ return SDValue();
+ if (Op.getOpcode() == ISD::ANY_EXTEND &&
+ Op.getOperand(0).getScalarValueSizeInBits() <= 32)
+ return Op.getOperand(0);
+ if (auto *Ld = dyn_cast<LoadSDNode>(Op))
+ if (Ld->getExtensionType() == ISD::EXTLOAD &&
+ Ld->getMemoryVT().getScalarSizeInBits() <= 32)
+ return Op;
+ return SDValue();
+ };
+ if (SDValue ExtSrc = IsAnyExt64(peekThroughOneUseBitcasts(Src)))
+ return DAG.getBitcast(
+ VT, DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4i32,
+ DAG.getAnyExtOrTrunc(ExtSrc, DL, MVT::i32)));
+ }
+
+ // Combine (v2i64 (scalar_to_vector (i64 (bitconvert (mmx))))) to MOVQ2DQ.
+ if (VT == MVT::v2i64 && Src.getOpcode() == ISD::BITCAST &&
+ Src.getOperand(0).getValueType() == MVT::x86mmx)
+ return DAG.getNode(X86ISD::MOVQ2DQ, DL, VT, Src.getOperand(0));
return SDValue();
}
@@ -45902,13 +48543,16 @@ static SDValue combineExtInVec(SDNode *N, SelectionDAG &DAG,
auto *Ld = cast<LoadSDNode>(In);
if (Ld->isSimple()) {
MVT SVT = In.getSimpleValueType().getVectorElementType();
- ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG ? ISD::SEXTLOAD : ISD::ZEXTLOAD;
- EVT MemVT = EVT::getVectorVT(*DAG.getContext(), SVT,
- VT.getVectorNumElements());
+ ISD::LoadExtType Ext = N->getOpcode() == ISD::SIGN_EXTEND_VECTOR_INREG
+ ? ISD::SEXTLOAD
+ : ISD::ZEXTLOAD;
+ EVT MemVT =
+ EVT::getVectorVT(*DAG.getContext(), SVT, VT.getVectorNumElements());
if (TLI.isLoadExtLegal(Ext, VT, MemVT)) {
SDValue Load =
DAG.getExtLoad(Ext, SDLoc(N), VT, Ld->getChain(), Ld->getBasePtr(),
- Ld->getPointerInfo(), MemVT, Ld->getAlignment(),
+ Ld->getPointerInfo(), MemVT,
+ Ld->getOriginalAlign(),
Ld->getMemOperand()->getFlags());
DAG.ReplaceAllUsesOfValueWith(SDValue(Ld, 1), Load.getValue(1));
return Load;
@@ -45945,6 +48589,196 @@ static SDValue combineKSHIFT(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Optimize (fp16_to_fp (fp_to_fp16 X)) to VCVTPS2PH followed by VCVTPH2PS.
+// Done as a combine because the lowerings for fp16_to_fp and fp_to_fp16 produce
+// extra instructions between the conversion due to going to scalar and back.
+static SDValue combineFP16_TO_FP(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (Subtarget.useSoftFloat() || !Subtarget.hasF16C())
+ return SDValue();
+
+ if (N->getOperand(0).getOpcode() != ISD::FP_TO_FP16)
+ return SDValue();
+
+ if (N->getValueType(0) != MVT::f32 ||
+ N->getOperand(0).getOperand(0).getValueType() != MVT::f32)
+ return SDValue();
+
+ SDLoc dl(N);
+ SDValue Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, MVT::v4f32,
+ N->getOperand(0).getOperand(0));
+ Res = DAG.getNode(X86ISD::CVTPS2PH, dl, MVT::v8i16, Res,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+ Res = DAG.getNode(X86ISD::CVTPH2PS, dl, MVT::v4f32, Res);
+ return DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, MVT::f32, Res,
+ DAG.getIntPtrConstant(0, dl));
+}
+
+static SDValue combineFP_EXTEND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ bool IsStrict = N->isStrictFPOpcode();
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(IsStrict ? 1 : 0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!SrcVT.isVector() || SrcVT.getVectorElementType() != MVT::f16)
+ return SDValue();
+
+ if (VT.getVectorElementType() != MVT::f32 &&
+ VT.getVectorElementType() != MVT::f64)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Convert the input to vXi16.
+ EVT IntVT = SrcVT.changeVectorElementTypeToInteger();
+ Src = DAG.getBitcast(IntVT, Src);
+
+ // Widen to at least 8 input elements.
+ if (NumElts < 8) {
+ unsigned NumConcats = 8 / NumElts;
+ SDValue Fill = NumElts == 4 ? DAG.getUNDEF(IntVT)
+ : DAG.getConstant(0, dl, IntVT);
+ SmallVector<SDValue, 4> Ops(NumConcats, Fill);
+ Ops[0] = Src;
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v8i16, Ops);
+ }
+
+ // Destination is vXf32 with at least 4 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::f32,
+ std::max(4U, NumElts));
+ SDValue Cvt, Chain;
+ if (IsStrict) {
+ Cvt = DAG.getNode(X86ISD::STRICT_CVTPH2PS, dl, {CvtVT, MVT::Other},
+ {N->getOperand(0), Src});
+ Chain = Cvt.getValue(1);
+ } else {
+ Cvt = DAG.getNode(X86ISD::CVTPH2PS, dl, CvtVT, Src);
+ }
+
+ if (NumElts < 4) {
+ assert(NumElts == 2 && "Unexpected size");
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, MVT::v2f32, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ if (IsStrict) {
+ // Extend to the original VT if necessary.
+ if (Cvt.getValueType() != VT) {
+ Cvt = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {VT, MVT::Other},
+ {Chain, Cvt});
+ Chain = Cvt.getValue(1);
+ }
+ return DAG.getMergeValues({Cvt, Chain}, dl);
+ }
+
+ // Extend to the original VT if necessary.
+ return DAG.getNode(ISD::FP_EXTEND, dl, VT, Cvt);
+}
+
+// Try to find a larger VBROADCAST_LOAD that we can extract from. Limit this to
+// cases where the loads have the same input chain and the output chains are
+// unused. This avoids any memory ordering issues.
+static SDValue combineVBROADCAST_LOAD(SDNode *N, SelectionDAG &DAG,
+ TargetLowering::DAGCombinerInfo &DCI) {
+ // Only do this if the chain result is unused.
+ if (N->hasAnyUseOfValue(1))
+ return SDValue();
+
+ auto *MemIntrin = cast<MemIntrinsicSDNode>(N);
+
+ SDValue Ptr = MemIntrin->getBasePtr();
+ SDValue Chain = MemIntrin->getChain();
+ EVT VT = N->getSimpleValueType(0);
+ EVT MemVT = MemIntrin->getMemoryVT();
+
+ // Look at other users of our base pointer and try to find a wider broadcast.
+ // The input chain and the size of the memory VT must match.
+ for (SDNode *User : Ptr->uses())
+ if (User != N && User->getOpcode() == X86ISD::VBROADCAST_LOAD &&
+ cast<MemIntrinsicSDNode>(User)->getBasePtr() == Ptr &&
+ cast<MemIntrinsicSDNode>(User)->getChain() == Chain &&
+ cast<MemIntrinsicSDNode>(User)->getMemoryVT().getSizeInBits() ==
+ MemVT.getSizeInBits() &&
+ !User->hasAnyUseOfValue(1) &&
+ User->getValueSizeInBits(0) > VT.getSizeInBits()) {
+ SDValue Extract = extractSubVector(SDValue(User, 0), 0, DAG, SDLoc(N),
+ VT.getSizeInBits());
+ Extract = DAG.getBitcast(VT, Extract);
+ return DCI.CombineTo(N, Extract, SDValue(User, 1));
+ }
+
+ return SDValue();
+}
+
+static SDValue combineFP_ROUND(SDNode *N, SelectionDAG &DAG,
+ const X86Subtarget &Subtarget) {
+ if (!Subtarget.hasF16C() || Subtarget.useSoftFloat())
+ return SDValue();
+
+ EVT VT = N->getValueType(0);
+ SDValue Src = N->getOperand(0);
+ EVT SrcVT = Src.getValueType();
+
+ if (!VT.isVector() || VT.getVectorElementType() != MVT::f16 ||
+ SrcVT.getVectorElementType() != MVT::f32)
+ return SDValue();
+
+ unsigned NumElts = VT.getVectorNumElements();
+ if (NumElts == 1 || !isPowerOf2_32(NumElts))
+ return SDValue();
+
+ SDLoc dl(N);
+
+ // Widen to at least 4 input elements.
+ if (NumElts < 4)
+ Src = DAG.getNode(ISD::CONCAT_VECTORS, dl, MVT::v4f32, Src,
+ DAG.getConstantFP(0.0, dl, SrcVT));
+
+ // Destination is v8i16 with at least 8 elements.
+ EVT CvtVT = EVT::getVectorVT(*DAG.getContext(), MVT::i16,
+ std::max(8U, NumElts));
+ SDValue Cvt = DAG.getNode(X86ISD::CVTPS2PH, dl, CvtVT, Src,
+ DAG.getTargetConstant(4, dl, MVT::i32));
+
+ // Extract down to the real number of elements.
+ if (NumElts < 8) {
+ EVT IntVT = VT.changeVectorElementTypeToInteger();
+ Cvt = DAG.getNode(ISD::EXTRACT_SUBVECTOR, dl, IntVT, Cvt,
+ DAG.getIntPtrConstant(0, dl));
+ }
+
+ return DAG.getBitcast(VT, Cvt);
+}
+
+static SDValue combineMOVDQ2Q(SDNode *N, SelectionDAG &DAG) {
+ SDValue Src = N->getOperand(0);
+
+ // Turn MOVDQ2Q+simple_load into an mmx load.
+ if (ISD::isNormalLoad(Src.getNode()) && Src.hasOneUse()) {
+ LoadSDNode *LN = cast<LoadSDNode>(Src.getNode());
+
+ if (LN->isSimple()) {
+ SDValue NewLd = DAG.getLoad(MVT::x86mmx, SDLoc(N), LN->getChain(),
+ LN->getBasePtr(),
+ LN->getPointerInfo(),
+ LN->getOriginalAlign(),
+ LN->getMemOperand()->getFlags());
+ DAG.ReplaceAllUsesOfValueWith(SDValue(LN, 1), NewLd.getValue(1));
+ return NewLd;
+ }
+ }
+
+ return SDValue();
+}
+
SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
DAGCombinerInfo &DCI) const {
SelectionDAG &DAG = DCI.DAG;
@@ -45976,8 +48810,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::ADC: return combineADC(N, DAG, DCI);
case ISD::MUL: return combineMul(N, DAG, DCI, Subtarget);
case ISD::SHL: return combineShiftLeft(N, DAG);
- case ISD::SRA: return combineShiftRightArithmetic(N, DAG);
- case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI);
+ case ISD::SRA: return combineShiftRightArithmetic(N, DAG, Subtarget);
+ case ISD::SRL: return combineShiftRightLogical(N, DAG, DCI, Subtarget);
case ISD::AND: return combineAnd(N, DAG, DCI, Subtarget);
case ISD::OR: return combineOr(N, DAG, DCI, Subtarget);
case ISD::XOR: return combineXor(N, DAG, DCI, Subtarget);
@@ -45986,6 +48820,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::MLOAD: return combineMaskedLoad(N, DAG, DCI, Subtarget);
case ISD::STORE: return combineStore(N, DAG, DCI, Subtarget);
case ISD::MSTORE: return combineMaskedStore(N, DAG, DCI, Subtarget);
+ case X86ISD::VEXTRACT_STORE:
+ return combineVEXTRACT_STORE(N, DAG, DCI, Subtarget);
case ISD::SINT_TO_FP:
case ISD::STRICT_SINT_TO_FP:
return combineSIntToFP(N, DAG, DCI, Subtarget);
@@ -45994,14 +48830,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
return combineUIntToFP(N, DAG, Subtarget);
case ISD::FADD:
case ISD::FSUB: return combineFaddFsub(N, DAG, Subtarget);
- case ISD::FNEG: return combineFneg(N, DAG, Subtarget);
+ case ISD::FNEG: return combineFneg(N, DAG, DCI, Subtarget);
case ISD::TRUNCATE: return combineTruncate(N, DAG, Subtarget);
- case X86ISD::VTRUNC: return combineVTRUNC(N, DAG);
+ case X86ISD::VTRUNC: return combineVTRUNC(N, DAG, DCI);
case X86ISD::ANDNP: return combineAndnp(N, DAG, DCI, Subtarget);
case X86ISD::FAND: return combineFAnd(N, DAG, Subtarget);
case X86ISD::FANDN: return combineFAndn(N, DAG, Subtarget);
case X86ISD::FXOR:
- case X86ISD::FOR: return combineFOr(N, DAG, Subtarget);
+ case X86ISD::FOR: return combineFOr(N, DAG, DCI, Subtarget);
case X86ISD::FMIN:
case X86ISD::FMAX: return combineFMinFMax(N, DAG);
case ISD::FMINNUM:
@@ -46010,8 +48846,13 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::CVTUI2P: return combineX86INT_TO_FP(N, DAG, DCI);
case X86ISD::CVTP2SI:
case X86ISD::CVTP2UI:
+ case X86ISD::STRICT_CVTTP2SI:
case X86ISD::CVTTP2SI:
- case X86ISD::CVTTP2UI: return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTTP2UI:
+ case X86ISD::CVTTP2UI:
+ return combineCVTP2I_CVTTP2I(N, DAG, DCI);
+ case X86ISD::STRICT_CVTPH2PS:
+ case X86ISD::CVTPH2PS: return combineCVTPH2PS(N, DAG, DCI);
case X86ISD::BT: return combineBT(N, DAG, DCI);
case ISD::ANY_EXTEND:
case ISD::ZERO_EXTEND: return combineZext(N, DAG, DCI, Subtarget);
@@ -46034,12 +48875,14 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::VSRAI:
case X86ISD::VSRLI:
return combineVectorShiftImm(N, DAG, DCI, Subtarget);
+ case ISD::INSERT_VECTOR_ELT:
case X86ISD::PINSRB:
case X86ISD::PINSRW: return combineVectorInsert(N, DAG, DCI, Subtarget);
case X86ISD::SHUFP: // Handle all target specific shuffles
case X86ISD::INSERTPS:
case X86ISD::EXTRQI:
case X86ISD::INSERTQI:
+ case X86ISD::VALIGN:
case X86ISD::PALIGNR:
case X86ISD::VSHLDQ:
case X86ISD::VSRLDQ:
@@ -46071,12 +48914,16 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case ISD::VECTOR_SHUFFLE: return combineShuffle(N, DAG, DCI,Subtarget);
case X86ISD::FMADD_RND:
case X86ISD::FMSUB:
+ case X86ISD::STRICT_FMSUB:
case X86ISD::FMSUB_RND:
case X86ISD::FNMADD:
+ case X86ISD::STRICT_FNMADD:
case X86ISD::FNMADD_RND:
case X86ISD::FNMSUB:
+ case X86ISD::STRICT_FNMSUB:
case X86ISD::FNMSUB_RND:
- case ISD::FMA: return combineFMA(N, DAG, DCI, Subtarget);
+ case ISD::FMA:
+ case ISD::STRICT_FMA: return combineFMA(N, DAG, DCI, Subtarget);
case X86ISD::FMADDSUB_RND:
case X86ISD::FMSUBADD_RND:
case X86ISD::FMADDSUB:
@@ -46092,6 +48939,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N,
case X86ISD::PMULUDQ: return combinePMULDQ(N, DAG, DCI, Subtarget);
case X86ISD::KSHIFTL:
case X86ISD::KSHIFTR: return combineKSHIFT(N, DAG, DCI);
+ case ISD::FP16_TO_FP: return combineFP16_TO_FP(N, DAG, Subtarget);
+ case ISD::STRICT_FP_EXTEND:
+ case ISD::FP_EXTEND: return combineFP_EXTEND(N, DAG, Subtarget);
+ case ISD::FP_ROUND: return combineFP_ROUND(N, DAG, Subtarget);
+ case X86ISD::VBROADCAST_LOAD: return combineVBROADCAST_LOAD(N, DAG, DCI);
+ case X86ISD::MOVDQ2Q: return combineMOVDQ2Q(N, DAG);
}
return SDValue();
@@ -46240,27 +49093,6 @@ bool X86TargetLowering::IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const {
return true;
}
-bool X86TargetLowering::
- isDesirableToCombineBuildVectorToShuffleTruncate(
- ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const {
-
- assert(SrcVT.getVectorNumElements() == ShuffleMask.size() &&
- "Element count mismatch");
- assert(
- Subtarget.getTargetLowering()->isShuffleMaskLegal(ShuffleMask, SrcVT) &&
- "Shuffle Mask expected to be legal");
-
- // For 32-bit elements VPERMD is better than shuffle+truncate.
- // TODO: After we improve lowerBuildVector, add execption for VPERMW.
- if (SrcVT.getScalarSizeInBits() == 32 || !Subtarget.hasAVX2())
- return false;
-
- if (is128BitLaneCrossingShuffleMask(SrcVT.getSimpleVT(), ShuffleMask))
- return false;
-
- return true;
-}
-
//===----------------------------------------------------------------------===//
// X86 Inline Assembly Support
//===----------------------------------------------------------------------===//
@@ -46301,7 +49133,7 @@ static bool clobbersFlagRegisters(const SmallVector<StringRef, 4> &AsmPieces) {
}
bool X86TargetLowering::ExpandInlineAsm(CallInst *CI) const {
- InlineAsm *IA = cast<InlineAsm>(CI->getCalledValue());
+ InlineAsm *IA = cast<InlineAsm>(CI->getCalledOperand());
const std::string &AsmStr = IA->getAsmString();
@@ -46424,7 +49256,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
case 'y':
case 'x':
case 'v':
- case 'Y':
case 'l':
case 'k': // AVX512 masking registers.
return C_RegisterClass;
@@ -46461,7 +49292,6 @@ X86TargetLowering::getConstraintType(StringRef Constraint) const {
default:
break;
case 'z':
- case '0':
return C_Register;
case 'i':
case 'm':
@@ -46517,19 +49347,17 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
weight = CW_SpecificReg;
break;
- case 'Y': {
- unsigned Size = StringRef(constraint).size();
- // Pick 'i' as the next char as 'Yi' and 'Y' are synonymous, when matching 'Y'
- char NextChar = Size == 2 ? constraint[1] : 'i';
- if (Size > 2)
+ case 'Y':
+ if (StringRef(constraint).size() != 2)
break;
- switch (NextChar) {
+ switch (constraint[1]) {
default:
return CW_Invalid;
// XMM0
case 'z':
- case '0':
- if ((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1())
+ if (((type->getPrimitiveSizeInBits() == 128) && Subtarget.hasSSE1()) ||
+ ((type->getPrimitiveSizeInBits() == 256) && Subtarget.hasAVX()) ||
+ ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512()))
return CW_SpecificReg;
return CW_Invalid;
// Conditional OpMask regs (AVX512)
@@ -46542,7 +49370,7 @@ TargetLowering::ConstraintWeight
if (type->isX86_MMXTy() && Subtarget.hasMMX())
return weight;
return CW_Invalid;
- // Any SSE reg when ISA >= SSE2, same as 'Y'
+ // Any SSE reg when ISA >= SSE2, same as 'x'
case 'i':
case 't':
case '2':
@@ -46550,9 +49378,7 @@ TargetLowering::ConstraintWeight
return CW_Invalid;
break;
}
- // Fall through (handle "Y" constraint).
- LLVM_FALLTHROUGH;
- }
+ break;
case 'v':
if ((type->getPrimitiveSizeInBits() == 512) && Subtarget.hasAVX512())
weight = CW_Register;
@@ -46634,8 +49460,6 @@ LowerXConstraint(EVT ConstraintVT) const {
// FP X constraints get lowered to SSE1/2 registers if available, otherwise
// 'f' like normal targets.
if (ConstraintVT.isFloatingPoint()) {
- if (Subtarget.hasSSE2())
- return "Y";
if (Subtarget.hasSSE1())
return "x";
}
@@ -46884,26 +49708,26 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
break;
case 'q': // GENERAL_REGS in 64-bit mode, Q_REGS in 32-bit mode.
if (Subtarget.is64Bit()) {
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, &X86::GR32RegClass);
- if (VT == MVT::i16)
- return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8RegClass);
- if (VT == MVT::i64 || VT == MVT::f64)
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16RegClass);
+ if (VT == MVT::i32 || VT == MVT::f32)
+ return std::make_pair(0U, &X86::GR32RegClass);
+ if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64RegClass);
break;
}
LLVM_FALLTHROUGH;
// 32-bit fallthrough
case 'Q': // Q_REGS
- if (VT == MVT::i32 || VT == MVT::f32)
- return std::make_pair(0U, &X86::GR32_ABCDRegClass);
- if (VT == MVT::i16)
- return std::make_pair(0U, &X86::GR16_ABCDRegClass);
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_ABCD_LRegClass);
- if (VT == MVT::i64)
+ if (VT == MVT::i16)
+ return std::make_pair(0U, &X86::GR16_ABCDRegClass);
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
+ return std::make_pair(0U, &X86::GR32_ABCDRegClass);
+ if (VT != MVT::f80)
return std::make_pair(0U, &X86::GR64_ABCDRegClass);
break;
case 'r': // GENERAL_REGS
@@ -46914,15 +49738,19 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::GR16RegClass);
if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32RegClass);
- return std::make_pair(0U, &X86::GR64RegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64RegClass);
+ break;
case 'R': // LEGACY_REGS
if (VT == MVT::i8 || VT == MVT::i1)
return std::make_pair(0U, &X86::GR8_NOREXRegClass);
if (VT == MVT::i16)
return std::make_pair(0U, &X86::GR16_NOREXRegClass);
- if (VT == MVT::i32 || !Subtarget.is64Bit())
+ if (VT == MVT::i32 || VT == MVT::f32 || !Subtarget.is64Bit())
return std::make_pair(0U, &X86::GR32_NOREXRegClass);
- return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ if (VT != MVT::f80)
+ return std::make_pair(0U, &X86::GR64_NOREXRegClass);
+ break;
case 'f': // FP Stack registers.
// If SSE is enabled for this VT, use f80 to ensure the isel moves the
// value to the correct fpstack register class.
@@ -46930,13 +49758,12 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
return std::make_pair(0U, &X86::RFP32RegClass);
if (VT == MVT::f64 && !isScalarFPTypeInSSEReg(VT))
return std::make_pair(0U, &X86::RFP64RegClass);
- return std::make_pair(0U, &X86::RFP80RegClass);
+ if (VT == MVT::f32 || VT == MVT::f64 || VT == MVT::f80)
+ return std::make_pair(0U, &X86::RFP80RegClass);
+ break;
case 'y': // MMX_REGS if MMX allowed.
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
- case 'Y': // SSE_REGS if SSE2 allowed
- if (!Subtarget.hasSSE2()) break;
- LLVM_FALLTHROUGH;
case 'v':
case 'x': // SSE_REGS if SSE1 allowed or AVX_REGS if AVX allowed
if (!Subtarget.hasSSE1()) break;
@@ -46955,7 +49782,13 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (VConstraint && Subtarget.hasVLX())
return std::make_pair(0U, &X86::FR64XRegClass);
return std::make_pair(0U, &X86::FR64RegClass);
- // TODO: Handle i128 in FR128RegClass after it is tested well.
+ case MVT::i128:
+ if (Subtarget.is64Bit()) {
+ if (VConstraint && Subtarget.hasVLX())
+ return std::make_pair(0U, &X86::VR128XRegClass);
+ return std::make_pair(0U, &X86::VR128RegClass);
+ }
+ break;
// Vector types and fp128.
case MVT::f128:
case MVT::v16i8:
@@ -46979,6 +49812,8 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (Subtarget.hasAVX())
return std::make_pair(0U, &X86::VR256RegClass);
break;
+ case MVT::v64i8:
+ case MVT::v32i16:
case MVT::v8f64:
case MVT::v16f32:
case MVT::v16i32:
@@ -46997,14 +49832,50 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
case 'i':
case 't':
case '2':
- return getRegForInlineAsmConstraint(TRI, "Y", VT);
+ return getRegForInlineAsmConstraint(TRI, "x", VT);
case 'm':
if (!Subtarget.hasMMX()) break;
return std::make_pair(0U, &X86::VR64RegClass);
case 'z':
- case '0':
if (!Subtarget.hasSSE1()) break;
- return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ switch (VT.SimpleTy) {
+ default: break;
+ // Scalar SSE types.
+ case MVT::f32:
+ case MVT::i32:
+ return std::make_pair(X86::XMM0, &X86::FR32RegClass);
+ case MVT::f64:
+ case MVT::i64:
+ return std::make_pair(X86::XMM0, &X86::FR64RegClass);
+ case MVT::f128:
+ case MVT::v16i8:
+ case MVT::v8i16:
+ case MVT::v4i32:
+ case MVT::v2i64:
+ case MVT::v4f32:
+ case MVT::v2f64:
+ return std::make_pair(X86::XMM0, &X86::VR128RegClass);
+ // AVX types.
+ case MVT::v32i8:
+ case MVT::v16i16:
+ case MVT::v8i32:
+ case MVT::v4i64:
+ case MVT::v8f32:
+ case MVT::v4f64:
+ if (Subtarget.hasAVX())
+ return std::make_pair(X86::YMM0, &X86::VR256RegClass);
+ break;
+ case MVT::v64i8:
+ case MVT::v32i16:
+ case MVT::v8f64:
+ case MVT::v16f32:
+ case MVT::v16i32:
+ case MVT::v8i64:
+ if (Subtarget.hasAVX512())
+ return std::make_pair(X86::ZMM0, &X86::VR512_0_15RegClass);
+ break;
+ }
+ break;
case 'k':
// This register class doesn't allocate k0 for masked vector operation.
if (Subtarget.hasAVX512()) {
@@ -47030,7 +49901,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
// Use the default implementation in TargetLowering to convert the register
// constraint into a member of a register class.
- std::pair<unsigned, const TargetRegisterClass*> Res;
+ std::pair<Register, const TargetRegisterClass*> Res;
Res = TargetLowering::getRegForInlineAsmConstraint(TRI, Constraint, VT);
// Not found as a standard register?
@@ -47101,7 +49972,7 @@ X86TargetLowering::getRegForInlineAsmConstraint(const TargetRegisterInfo *TRI,
if (isGRClass(*Class)) {
unsigned Size = VT.getSizeInBits();
if (Size == 1) Size = 8;
- unsigned DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
+ Register DestReg = getX86SubSuperRegisterOrZero(Res.first, Size);
if (DestReg > 0) {
bool is64Bit = Subtarget.is64Bit();
const TargetRegisterClass *RC =
@@ -47217,8 +50088,7 @@ bool X86TargetLowering::isIntDivCheap(EVT VT, AttributeList Attr) const {
// integer division, leaving the division as-is is a loss even in terms of
// size, because it will have to be scalarized, while the alternative code
// sequence can be performed in vector form.
- bool OptSize =
- Attr.hasAttribute(AttributeList::FunctionIndex, Attribute::MinSize);
+ bool OptSize = Attr.hasFnAttribute(Attribute::MinSize);
return OptSize && !VT.isVector();
}
@@ -47275,10 +50145,35 @@ bool X86TargetLowering::supportSwiftError() const {
return Subtarget.is64Bit();
}
+/// Returns true if stack probing through a function call is requested.
+bool X86TargetLowering::hasStackProbeSymbol(MachineFunction &MF) const {
+ return !getStackProbeSymbolName(MF).empty();
+}
+
+/// Returns true if stack probing through inline assembly is requested.
+bool X86TargetLowering::hasInlineStackProbe(MachineFunction &MF) const {
+
+  // No inline stack probes for Windows; it has its own mechanism.
+ if (Subtarget.isOSWindows() ||
+ MF.getFunction().hasFnAttribute("no-stack-arg-probe"))
+ return false;
+
+ // If the function specifically requests inline stack probes, emit them.
+ if (MF.getFunction().hasFnAttribute("probe-stack"))
+ return MF.getFunction().getFnAttribute("probe-stack").getValueAsString() ==
+ "inline-asm";
+
+ return false;
+}
+
/// Returns the name of the symbol used to emit stack probes or the empty
/// string if not applicable.
StringRef
X86TargetLowering::getStackProbeSymbolName(MachineFunction &MF) const {
+  // Inline stack probes disable the stack probe call.
+ if (hasInlineStackProbe(MF))
+ return "";
+
// If the function specifically requests stack probes, emit them.
if (MF.getFunction().hasFnAttribute("probe-stack"))
return MF.getFunction().getFnAttribute("probe-stack").getValueAsString();
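
A sketch of how a front end might opt into the two probing modes above via the "probe-stack" attribute (assumed usage, not taken from the patch; the probe symbol name below is a placeholder, not a real runtime entry point):

    #include "llvm/IR/Function.h"

    // Request the inline-asm stack probe lowering for F.
    void useInlineProbes(llvm::Function &F) {
      F.addFnAttr("probe-stack", "inline-asm");
    }

    // Request probes via calls to a named probe routine instead.
    void useProbeCall(llvm::Function &F) {
      F.addFnAttr("probe-stack", "my_stack_probe"); // placeholder symbol
    }
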
diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h
index 3a17099da38f..7f3dc90a2d73 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.h
+++ b/llvm/lib/Target/X86/X86ISelLowering.h
@@ -14,8 +14,6 @@
#ifndef LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
#define LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
-#include "llvm/CodeGen/CallingConvLower.h"
-#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -24,680 +22,809 @@ namespace llvm {
namespace X86ISD {
// X86 Specific DAG Nodes
- enum NodeType : unsigned {
- // Start the numbering where the builtin ops leave off.
- FIRST_NUMBER = ISD::BUILTIN_OP_END,
-
- /// Bit scan forward.
- BSF,
- /// Bit scan reverse.
- BSR,
-
- /// Double shift instructions. These correspond to
- /// X86::SHLDxx and X86::SHRDxx instructions.
- SHLD,
- SHRD,
-
- /// Bitwise logical AND of floating point values. This corresponds
- /// to X86::ANDPS or X86::ANDPD.
- FAND,
-
- /// Bitwise logical OR of floating point values. This corresponds
- /// to X86::ORPS or X86::ORPD.
- FOR,
-
- /// Bitwise logical XOR of floating point values. This corresponds
- /// to X86::XORPS or X86::XORPD.
- FXOR,
-
- /// Bitwise logical ANDNOT of floating point values. This
- /// corresponds to X86::ANDNPS or X86::ANDNPD.
- FANDN,
-
- /// These operations represent an abstract X86 call
- /// instruction, which includes a bunch of information. In particular the
- /// operands of these node are:
- ///
- /// #0 - The incoming token chain
- /// #1 - The callee
- /// #2 - The number of arg bytes the caller pushes on the stack.
- /// #3 - The number of arg bytes the callee pops off the stack.
- /// #4 - The value to pass in AL/AX/EAX (optional)
- /// #5 - The value to pass in DL/DX/EDX (optional)
- ///
- /// The result values of these nodes are:
- ///
- /// #0 - The outgoing token chain
- /// #1 - The first register result value (optional)
- /// #2 - The second register result value (optional)
- ///
- CALL,
-
- /// Same as call except it adds the NoTrack prefix.
- NT_CALL,
-
- /// X86 compare and logical compare instructions.
- CMP, COMI, UCOMI,
-
- /// X86 bit-test instructions.
- BT,
-
- /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
- /// operand, usually produced by a CMP instruction.
- SETCC,
-
- /// X86 Select
- SELECTS,
-
- // Same as SETCC except it's materialized with a sbb and the value is all
- // one's or all zero's.
- SETCC_CARRY, // R = carry_bit ? ~0 : 0
-
- /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
- /// Operands are two FP values to compare; result is a mask of
- /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
- FSETCC,
-
- /// X86 FP SETCC, similar to above, but with output as an i1 mask and
- /// and a version with SAE.
- FSETCCM, FSETCCM_SAE,
-
- /// X86 conditional moves. Operand 0 and operand 1 are the two values
- /// to select from. Operand 2 is the condition code, and operand 3 is the
- /// flag operand produced by a CMP or TEST instruction.
- CMOV,
-
- /// X86 conditional branches. Operand 0 is the chain operand, operand 1
- /// is the block to branch if condition is true, operand 2 is the
- /// condition code, and operand 3 is the flag operand produced by a CMP
- /// or TEST instruction.
- BRCOND,
-
- /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
- /// operand 1 is the target address.
- NT_BRIND,
-
- /// Return with a flag operand. Operand 0 is the chain operand, operand
- /// 1 is the number of bytes of stack to pop.
- RET_FLAG,
-
- /// Return from interrupt. Operand 0 is the number of bytes to pop.
- IRET,
-
- /// Repeat fill, corresponds to X86::REP_STOSx.
- REP_STOS,
-
- /// Repeat move, corresponds to X86::REP_MOVSx.
- REP_MOVS,
-
- /// On Darwin, this node represents the result of the popl
- /// at function entry, used for PIC code.
- GlobalBaseReg,
-
- /// A wrapper node for TargetConstantPool, TargetJumpTable,
- /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
- /// MCSymbol and TargetBlockAddress.
- Wrapper,
-
- /// Special wrapper used under X86-64 PIC mode for RIP
- /// relative displacements.
- WrapperRIP,
-
- /// Copies a 64-bit value from an MMX vector to the low word
- /// of an XMM vector, with the high word zero filled.
- MOVQ2DQ,
-
- /// Copies a 64-bit value from the low word of an XMM vector
- /// to an MMX vector.
- MOVDQ2Q,
-
- /// Copies a 32-bit value from the low word of a MMX
- /// vector to a GPR.
- MMX_MOVD2W,
-
- /// Copies a GPR into the low 32-bit word of a MMX vector
- /// and zero out the high word.
- MMX_MOVW2D,
-
- /// Extract an 8-bit value from a vector and zero extend it to
- /// i32, corresponds to X86::PEXTRB.
- PEXTRB,
-
- /// Extract a 16-bit value from a vector and zero extend it to
- /// i32, corresponds to X86::PEXTRW.
- PEXTRW,
-
- /// Insert any element of a 4 x float vector into any element
- /// of a destination 4 x floatvector.
- INSERTPS,
-
- /// Insert the lower 8-bits of a 32-bit value to a vector,
- /// corresponds to X86::PINSRB.
- PINSRB,
-
- /// Insert the lower 16-bits of a 32-bit value to a vector,
- /// corresponds to X86::PINSRW.
- PINSRW,
-
- /// Shuffle 16 8-bit values within a vector.
- PSHUFB,
-
- /// Compute Sum of Absolute Differences.
- PSADBW,
- /// Compute Double Block Packed Sum-Absolute-Differences
- DBPSADBW,
-
- /// Bitwise Logical AND NOT of Packed FP values.
- ANDNP,
-
- /// Blend where the selector is an immediate.
- BLENDI,
-
- /// Dynamic (non-constant condition) vector blend where only the sign bits
- /// of the condition elements are used. This is used to enforce that the
- /// condition mask is not valid for generic VSELECT optimizations. This
- /// is also used to implement the intrinsics.
- /// Operands are in VSELECT order: MASK, TRUE, FALSE
- BLENDV,
-
- /// Combined add and sub on an FP vector.
- ADDSUB,
-
- // FP vector ops with rounding mode.
- FADD_RND, FADDS, FADDS_RND,
- FSUB_RND, FSUBS, FSUBS_RND,
- FMUL_RND, FMULS, FMULS_RND,
- FDIV_RND, FDIVS, FDIVS_RND,
- FMAX_SAE, FMAXS_SAE,
- FMIN_SAE, FMINS_SAE,
- FSQRT_RND, FSQRTS, FSQRTS_RND,
-
- // FP vector get exponent.
- FGETEXP, FGETEXP_SAE, FGETEXPS, FGETEXPS_SAE,
- // Extract Normalized Mantissas.
- VGETMANT, VGETMANT_SAE, VGETMANTS, VGETMANTS_SAE,
- // FP Scale.
- SCALEF, SCALEF_RND,
- SCALEFS, SCALEFS_RND,
-
- // Unsigned Integer average.
- AVG,
-
- /// Integer horizontal add/sub.
- HADD,
- HSUB,
-
- /// Floating point horizontal add/sub.
- FHADD,
- FHSUB,
-
- // Detect Conflicts Within a Vector
- CONFLICT,
-
- /// Floating point max and min.
- FMAX, FMIN,
-
- /// Commutative FMIN and FMAX.
- FMAXC, FMINC,
-
- /// Scalar intrinsic floating point max and min.
- FMAXS, FMINS,
-
- /// Floating point reciprocal-sqrt and reciprocal approximation.
- /// Note that these typically require refinement
- /// in order to obtain suitable precision.
- FRSQRT, FRCP,
-
- // AVX-512 reciprocal approximations with a little more precision.
- RSQRT14, RSQRT14S, RCP14, RCP14S,
-
- // Thread Local Storage.
- TLSADDR,
-
- // Thread Local Storage. A call to get the start address
- // of the TLS block for the current module.
- TLSBASEADDR,
-
- // Thread Local Storage. When calling to an OS provided
- // thunk at the address from an earlier relocation.
- TLSCALL,
+ enum NodeType : unsigned {
+ // Start the numbering where the builtin ops leave off.
+ FIRST_NUMBER = ISD::BUILTIN_OP_END,
+
+ /// Bit scan forward.
+ BSF,
+ /// Bit scan reverse.
+ BSR,
+
+ /// X86 funnel/double shift i16 instructions. These correspond to
+    /// X86::SHLDW and X86::SHRDW instructions, which have different
+    /// shift-amount modulo rules than generic funnel shifts.
+ /// NOTE: The operand order matches ISD::FSHL/FSHR not SHLD/SHRD.
+ FSHL,
+ FSHR,
+
+ /// Bitwise logical AND of floating point values. This corresponds
+ /// to X86::ANDPS or X86::ANDPD.
+ FAND,
+
+ /// Bitwise logical OR of floating point values. This corresponds
+ /// to X86::ORPS or X86::ORPD.
+ FOR,
+
+ /// Bitwise logical XOR of floating point values. This corresponds
+ /// to X86::XORPS or X86::XORPD.
+ FXOR,
+
+ /// Bitwise logical ANDNOT of floating point values. This
+ /// corresponds to X86::ANDNPS or X86::ANDNPD.
+ FANDN,
+
+ /// These operations represent an abstract X86 call
+ /// instruction, which includes a bunch of information. In particular the
+    /// operands of these nodes are:
+ ///
+ /// #0 - The incoming token chain
+ /// #1 - The callee
+ /// #2 - The number of arg bytes the caller pushes on the stack.
+ /// #3 - The number of arg bytes the callee pops off the stack.
+ /// #4 - The value to pass in AL/AX/EAX (optional)
+ /// #5 - The value to pass in DL/DX/EDX (optional)
+ ///
+ /// The result values of these nodes are:
+ ///
+ /// #0 - The outgoing token chain
+ /// #1 - The first register result value (optional)
+ /// #2 - The second register result value (optional)
+ ///
+ CALL,
- // Exception Handling helpers.
- EH_RETURN,
+ /// Same as call except it adds the NoTrack prefix.
+ NT_CALL,
- // SjLj exception handling setjmp.
- EH_SJLJ_SETJMP,
+ /// X86 compare and logical compare instructions.
+ CMP,
+ FCMP,
+ COMI,
+ UCOMI,
- // SjLj exception handling longjmp.
- EH_SJLJ_LONGJMP,
+ /// X86 bit-test instructions.
+ BT,
- // SjLj exception handling dispatch.
- EH_SJLJ_SETUP_DISPATCH,
+ /// X86 SetCC. Operand 0 is condition code, and operand 1 is the EFLAGS
+ /// operand, usually produced by a CMP instruction.
+ SETCC,
- /// Tail call return. See X86TargetLowering::LowerCall for
- /// the list of operands.
- TC_RETURN,
+ /// X86 Select
+ SELECTS,
- // Vector move to low scalar and zero higher vector elements.
- VZEXT_MOVL,
+ // Same as SETCC except it's materialized with a sbb and the value is all
+ // one's or all zero's.
+ SETCC_CARRY, // R = carry_bit ? ~0 : 0
- // Vector integer truncate.
- VTRUNC,
- // Vector integer truncate with unsigned/signed saturation.
- VTRUNCUS, VTRUNCS,
+ /// X86 FP SETCC, implemented with CMP{cc}SS/CMP{cc}SD.
+ /// Operands are two FP values to compare; result is a mask of
+ /// 0s or 1s. Generally DTRT for C/C++ with NaNs.
+ FSETCC,
- // Masked version of the above. Used when less than a 128-bit result is
- // produced since the mask only applies to the lower elements and can't
- // be represented by a select.
- // SRC, PASSTHRU, MASK
- VMTRUNC, VMTRUNCUS, VMTRUNCS,
-
- // Vector FP extend.
- VFPEXT, VFPEXT_SAE, VFPEXTS, VFPEXTS_SAE,
-
- // Vector FP round.
- VFPROUND, VFPROUND_RND, VFPROUNDS, VFPROUNDS_RND,
-
- // Masked version of above. Used for v2f64->v4f32.
- // SRC, PASSTHRU, MASK
- VMFPROUND,
-
- // 128-bit vector logical left / right shift
- VSHLDQ, VSRLDQ,
-
- // Vector shift elements
- VSHL, VSRL, VSRA,
-
- // Vector variable shift
- VSHLV, VSRLV, VSRAV,
-
- // Vector shift elements by immediate
- VSHLI, VSRLI, VSRAI,
-
- // Shifts of mask registers.
- KSHIFTL, KSHIFTR,
-
- // Bit rotate by immediate
- VROTLI, VROTRI,
-
- // Vector packed double/float comparison.
- CMPP,
-
- // Vector integer comparisons.
- PCMPEQ, PCMPGT,
-
- // v8i16 Horizontal minimum and position.
- PHMINPOS,
-
- MULTISHIFT,
-
- /// Vector comparison generating mask bits for fp and
- /// integer signed and unsigned data types.
- CMPM,
- // Vector comparison with SAE for FP values
- CMPM_SAE,
-
- // Arithmetic operations with FLAGS results.
- ADD, SUB, ADC, SBB, SMUL, UMUL,
- OR, XOR, AND,
-
- // Bit field extract.
- BEXTR,
-
- // Zero High Bits Starting with Specified Bit Position.
- BZHI,
-
- // X86-specific multiply by immediate.
- MUL_IMM,
-
- // Vector sign bit extraction.
- MOVMSK,
-
- // Vector bitwise comparisons.
- PTEST,
-
- // Vector packed fp sign bitwise comparisons.
- TESTP,
-
- // OR/AND test for masks.
- KORTEST,
- KTEST,
-
- // ADD for masks.
- KADD,
-
- // Several flavors of instructions with vector shuffle behaviors.
- // Saturated signed/unnsigned packing.
- PACKSS,
- PACKUS,
- // Intra-lane alignr.
- PALIGNR,
- // AVX512 inter-lane alignr.
- VALIGN,
- PSHUFD,
- PSHUFHW,
- PSHUFLW,
- SHUFP,
- // VBMI2 Concat & Shift.
- VSHLD,
- VSHRD,
- VSHLDV,
- VSHRDV,
- //Shuffle Packed Values at 128-bit granularity.
- SHUF128,
- MOVDDUP,
- MOVSHDUP,
- MOVSLDUP,
- MOVLHPS,
- MOVHLPS,
- MOVSD,
- MOVSS,
- UNPCKL,
- UNPCKH,
- VPERMILPV,
- VPERMILPI,
- VPERMI,
- VPERM2X128,
-
- // Variable Permute (VPERM).
- // Res = VPERMV MaskV, V0
- VPERMV,
-
- // 3-op Variable Permute (VPERMT2).
- // Res = VPERMV3 V0, MaskV, V1
- VPERMV3,
-
- // Bitwise ternary logic.
- VPTERNLOG,
- // Fix Up Special Packed Float32/64 values.
- VFIXUPIMM, VFIXUPIMM_SAE,
- VFIXUPIMMS, VFIXUPIMMS_SAE,
- // Range Restriction Calculation For Packed Pairs of Float32/64 values.
- VRANGE, VRANGE_SAE, VRANGES, VRANGES_SAE,
- // Reduce - Perform Reduction Transformation on scalar\packed FP.
- VREDUCE, VREDUCE_SAE, VREDUCES, VREDUCES_SAE,
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- // Also used by the legacy (V)ROUND intrinsics where we mask out the
- // scaling part of the immediate.
- VRNDSCALE, VRNDSCALE_SAE, VRNDSCALES, VRNDSCALES_SAE,
- // Tests Types Of a FP Values for packed types.
- VFPCLASS,
- // Tests Types Of a FP Values for scalar types.
- VFPCLASSS,
-
- // Broadcast (splat) scalar or element 0 of a vector. If the operand is
- // a vector, this node may change the vector length as part of the splat.
- VBROADCAST,
- // Broadcast mask to vector.
- VBROADCASTM,
- // Broadcast subvector to vector.
- SUBV_BROADCAST,
-
- /// SSE4A Extraction and Insertion.
- EXTRQI, INSERTQI,
-
- // XOP arithmetic/logical shifts.
- VPSHA, VPSHL,
- // XOP signed/unsigned integer comparisons.
- VPCOM, VPCOMU,
- // XOP packed permute bytes.
- VPPERM,
- // XOP two source permutation.
- VPERMIL2,
-
- // Vector multiply packed unsigned doubleword integers.
- PMULUDQ,
- // Vector multiply packed signed doubleword integers.
- PMULDQ,
- // Vector Multiply Packed UnsignedIntegers with Round and Scale.
- MULHRS,
-
- // Multiply and Add Packed Integers.
- VPMADDUBSW, VPMADDWD,
-
- // AVX512IFMA multiply and add.
- // NOTE: These are different than the instruction and perform
- // op0 x op1 + op2.
- VPMADD52L, VPMADD52H,
-
- // VNNI
- VPDPBUSD,
- VPDPBUSDS,
- VPDPWSSD,
- VPDPWSSDS,
-
- // FMA nodes.
- // We use the target independent ISD::FMA for the non-inverted case.
- FNMADD,
- FMSUB,
- FNMSUB,
- FMADDSUB,
- FMSUBADD,
-
- // FMA with rounding mode.
- FMADD_RND,
- FNMADD_RND,
- FMSUB_RND,
- FNMSUB_RND,
- FMADDSUB_RND,
- FMSUBADD_RND,
-
- // Compress and expand.
- COMPRESS,
- EXPAND,
-
- // Bits shuffle
- VPSHUFBITQMB,
-
- // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
- SINT_TO_FP_RND, UINT_TO_FP_RND,
- SCALAR_SINT_TO_FP, SCALAR_UINT_TO_FP,
- SCALAR_SINT_TO_FP_RND, SCALAR_UINT_TO_FP_RND,
-
- // Vector float/double to signed/unsigned integer.
- CVTP2SI, CVTP2UI, CVTP2SI_RND, CVTP2UI_RND,
- // Scalar float/double to signed/unsigned integer.
- CVTS2SI, CVTS2UI, CVTS2SI_RND, CVTS2UI_RND,
-
- // Vector float/double to signed/unsigned integer with truncation.
- CVTTP2SI, CVTTP2UI, CVTTP2SI_SAE, CVTTP2UI_SAE,
- // Scalar float/double to signed/unsigned integer with truncation.
- CVTTS2SI, CVTTS2UI, CVTTS2SI_SAE, CVTTS2UI_SAE,
-
- // Vector signed/unsigned integer to float/double.
- CVTSI2P, CVTUI2P,
-
- // Masked versions of above. Used for v2f64->v4f32.
- // SRC, PASSTHRU, MASK
- MCVTP2SI, MCVTP2UI, MCVTTP2SI, MCVTTP2UI,
- MCVTSI2P, MCVTUI2P,
-
- // Vector float to bfloat16.
- // Convert TWO packed single data to one packed BF16 data
- CVTNE2PS2BF16,
- // Convert packed single data to packed BF16 data
- CVTNEPS2BF16,
- // Masked version of above.
- // SRC, PASSTHRU, MASK
- MCVTNEPS2BF16,
-
- // Dot product of BF16 pairs to accumulated into
- // packed single precision.
- DPBF16PS,
-
- // Save xmm argument registers to the stack, according to %al. An operator
- // is needed so that this can be expanded with control flow.
- VASTART_SAVE_XMM_REGS,
-
- // Windows's _chkstk call to do stack probing.
- WIN_ALLOCA,
-
- // For allocating variable amounts of stack space when using
- // segmented stacks. Check if the current stacklet has enough space, and
- // falls back to heap allocation if not.
- SEG_ALLOCA,
-
- // Memory barriers.
- MEMBARRIER,
- MFENCE,
-
- // Store FP status word into i16 register.
- FNSTSW16r,
-
- // Store contents of %ah into %eflags.
- SAHF,
-
- // Get a random integer and indicate whether it is valid in CF.
- RDRAND,
-
- // Get a NIST SP800-90B & C compliant random integer and
- // indicate whether it is valid in CF.
- RDSEED,
-
- // Protection keys
- // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
- // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
- // value for ECX.
- RDPKRU, WRPKRU,
-
- // SSE42 string comparisons.
- // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
- // will emit one or two instructions based on which results are used. If
- // flags and index/mask this allows us to use a single instruction since
- // we won't have to pick and opcode for flags. Instead we can rely on the
- // DAG to CSE everything and decide at isel.
- PCMPISTR,
- PCMPESTR,
-
- // Test if in transactional execution.
- XTEST,
-
- // ERI instructions.
- RSQRT28, RSQRT28_SAE, RSQRT28S, RSQRT28S_SAE,
- RCP28, RCP28_SAE, RCP28S, RCP28S_SAE, EXP2, EXP2_SAE,
-
- // Conversions between float and half-float.
- CVTPS2PH, CVTPH2PS, CVTPH2PS_SAE,
-
- // Masked version of above.
- // SRC, RND, PASSTHRU, MASK
- MCVTPS2PH,
-
- // Galois Field Arithmetic Instructions
- GF2P8AFFINEINVQB, GF2P8AFFINEQB, GF2P8MULB,
-
- // LWP insert record.
- LWPINS,
-
- // User level wait
- UMWAIT, TPAUSE,
-
- // Enqueue Stores Instructions
- ENQCMD, ENQCMDS,
-
- // For avx512-vp2intersect
- VP2INTERSECT,
-
- /// X86 strict FP compare instructions.
- STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
- STRICT_FCMPS,
-
- // Vector packed double/float comparison.
- STRICT_CMPP,
-
- /// Vector comparison generating mask bits for fp and
- /// integer signed and unsigned data types.
- STRICT_CMPM,
-
- // Vector float/double to signed/unsigned integer with truncation.
- STRICT_CVTTP2SI, STRICT_CVTTP2UI,
-
- // Vector FP extend.
- STRICT_VFPEXT,
-
- // Vector FP round.
- STRICT_VFPROUND,
-
- // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
- // Also used by the legacy (V)ROUND intrinsics where we mask out the
- // scaling part of the immediate.
- STRICT_VRNDSCALE,
-
- // Vector signed/unsigned integer to float/double.
- STRICT_CVTSI2P, STRICT_CVTUI2P,
-
- // Compare and swap.
- LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
- LCMPXCHG8_DAG,
- LCMPXCHG16_DAG,
- LCMPXCHG8_SAVE_EBX_DAG,
- LCMPXCHG16_SAVE_RBX_DAG,
-
- /// LOCK-prefixed arithmetic read-modify-write instructions.
- /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
- LADD, LSUB, LOR, LXOR, LAND,
-
- // Load, scalar_to_vector, and zero extend.
- VZEXT_LOAD,
-
- // extract_vector_elt, store.
- VEXTRACT_STORE,
-
- // scalar broadcast from memory
- VBROADCAST_LOAD,
-
- // Store FP control world into i16 memory.
- FNSTCW16m,
-
- /// This instruction implements FP_TO_SINT with the
- /// integer destination in memory and a FP reg source. This corresponds
- /// to the X86::FIST*m instructions and the rounding mode change stuff. It
- /// has two inputs (token chain and address) and two outputs (int value
- /// and token chain). Memory VT specifies the type to store to.
- FP_TO_INT_IN_MEM,
-
- /// This instruction implements SINT_TO_FP with the
- /// integer source in memory and FP reg result. This corresponds to the
- /// X86::FILD*m instructions. It has two inputs (token chain and address)
- /// and two outputs (FP value and token chain). FILD_FLAG also produces a
- /// flag). The integer source type is specified by the memory VT.
- FILD,
- FILD_FLAG,
-
- /// This instruction implements a fp->int store from FP stack
- /// slots. This corresponds to the fist instruction. It takes a
- /// chain operand, value to store, address, and glue. The memory VT
- /// specifies the type to store as.
- FIST,
-
- /// This instruction implements an extending load to FP stack slots.
- /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
- /// operand, and ptr to load from. The memory VT specifies the type to
- /// load from.
- FLD,
+    /// X86 FP SETCC, similar to above, but with output as an i1 mask
+ /// and a version with SAE.
+ FSETCCM,
+ FSETCCM_SAE,
- /// This instruction implements a truncating store from FP stack
- /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
- /// chain operand, value to store, address, and glue. The memory VT
- /// specifies the type to store as.
- FST,
-
- /// This instruction grabs the address of the next argument
- /// from a va_list. (reads and modifies the va_list in memory)
- VAARG_64,
-
- // Vector truncating store with unsigned/signed saturation
- VTRUNCSTOREUS, VTRUNCSTORES,
- // Vector truncating masked store with unsigned/signed saturation
- VMTRUNCSTOREUS, VMTRUNCSTORES,
-
- // X86 specific gather and scatter
- MGATHER, MSCATTER,
-
- // WARNING: Do not add anything in the end unless you want the node to
- // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
- // opcodes will be thought as target memory ops!
- };
+ /// X86 conditional moves. Operand 0 and operand 1 are the two values
+ /// to select from. Operand 2 is the condition code, and operand 3 is the
+ /// flag operand produced by a CMP or TEST instruction.
+ CMOV,
+
+ /// X86 conditional branches. Operand 0 is the chain operand, operand 1
+ /// is the block to branch if condition is true, operand 2 is the
+ /// condition code, and operand 3 is the flag operand produced by a CMP
+ /// or TEST instruction.
+ BRCOND,
+
+ /// BRIND node with NoTrack prefix. Operand 0 is the chain operand and
+ /// operand 1 is the target address.
+ NT_BRIND,
+
+ /// Return with a flag operand. Operand 0 is the chain operand, operand
+ /// 1 is the number of bytes of stack to pop.
+ RET_FLAG,
+
+ /// Return from interrupt. Operand 0 is the number of bytes to pop.
+ IRET,
+
+ /// Repeat fill, corresponds to X86::REP_STOSx.
+ REP_STOS,
+
+ /// Repeat move, corresponds to X86::REP_MOVSx.
+ REP_MOVS,
+
+ /// On Darwin, this node represents the result of the popl
+ /// at function entry, used for PIC code.
+ GlobalBaseReg,
+
+ /// A wrapper node for TargetConstantPool, TargetJumpTable,
+ /// TargetExternalSymbol, TargetGlobalAddress, TargetGlobalTLSAddress,
+ /// MCSymbol and TargetBlockAddress.
+ Wrapper,
+
+ /// Special wrapper used under X86-64 PIC mode for RIP
+ /// relative displacements.
+ WrapperRIP,
+
+ /// Copies a 64-bit value from an MMX vector to the low word
+ /// of an XMM vector, with the high word zero filled.
+ MOVQ2DQ,
+
+ /// Copies a 64-bit value from the low word of an XMM vector
+ /// to an MMX vector.
+ MOVDQ2Q,
+
+ /// Copies a 32-bit value from the low word of a MMX
+ /// vector to a GPR.
+ MMX_MOVD2W,
+
+ /// Copies a GPR into the low 32-bit word of a MMX vector
+ /// and zero out the high word.
+ MMX_MOVW2D,
+
+ /// Extract an 8-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRB.
+ PEXTRB,
+
+ /// Extract a 16-bit value from a vector and zero extend it to
+ /// i32, corresponds to X86::PEXTRW.
+ PEXTRW,
+
+ /// Insert any element of a 4 x float vector into any element
+    /// of a destination 4 x float vector.
+ INSERTPS,
+
+ /// Insert the lower 8-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRB.
+ PINSRB,
+
+ /// Insert the lower 16-bits of a 32-bit value to a vector,
+ /// corresponds to X86::PINSRW.
+ PINSRW,
+
+ /// Shuffle 16 8-bit values within a vector.
+ PSHUFB,
+
+ /// Compute Sum of Absolute Differences.
+ PSADBW,
+ /// Compute Double Block Packed Sum-Absolute-Differences
+ DBPSADBW,
+
+ /// Bitwise Logical AND NOT of Packed FP values.
+ ANDNP,
+
+ /// Blend where the selector is an immediate.
+ BLENDI,
+
+ /// Dynamic (non-constant condition) vector blend where only the sign bits
+ /// of the condition elements are used. This is used to enforce that the
+ /// condition mask is not valid for generic VSELECT optimizations. This
+ /// is also used to implement the intrinsics.
+ /// Operands are in VSELECT order: MASK, TRUE, FALSE
+ BLENDV,
+
+ /// Combined add and sub on an FP vector.
+ ADDSUB,
+
+ // FP vector ops with rounding mode.
+ FADD_RND,
+ FADDS,
+ FADDS_RND,
+ FSUB_RND,
+ FSUBS,
+ FSUBS_RND,
+ FMUL_RND,
+ FMULS,
+ FMULS_RND,
+ FDIV_RND,
+ FDIVS,
+ FDIVS_RND,
+ FMAX_SAE,
+ FMAXS_SAE,
+ FMIN_SAE,
+ FMINS_SAE,
+ FSQRT_RND,
+ FSQRTS,
+ FSQRTS_RND,
+
+ // FP vector get exponent.
+ FGETEXP,
+ FGETEXP_SAE,
+ FGETEXPS,
+ FGETEXPS_SAE,
+ // Extract Normalized Mantissas.
+ VGETMANT,
+ VGETMANT_SAE,
+ VGETMANTS,
+ VGETMANTS_SAE,
+ // FP Scale.
+ SCALEF,
+ SCALEF_RND,
+ SCALEFS,
+ SCALEFS_RND,
+
+ // Unsigned Integer average.
+ AVG,
+
+ /// Integer horizontal add/sub.
+ HADD,
+ HSUB,
+
+ /// Floating point horizontal add/sub.
+ FHADD,
+ FHSUB,
+
+ // Detect Conflicts Within a Vector
+ CONFLICT,
+
+ /// Floating point max and min.
+ FMAX,
+ FMIN,
+
+ /// Commutative FMIN and FMAX.
+ FMAXC,
+ FMINC,
+
+ /// Scalar intrinsic floating point max and min.
+ FMAXS,
+ FMINS,
+
+ /// Floating point reciprocal-sqrt and reciprocal approximation.
+ /// Note that these typically require refinement
+ /// in order to obtain suitable precision.
+ FRSQRT,
+ FRCP,
+
+ // AVX-512 reciprocal approximations with a little more precision.
+ RSQRT14,
+ RSQRT14S,
+ RCP14,
+ RCP14S,
+
+ // Thread Local Storage.
+ TLSADDR,
+
+ // Thread Local Storage. A call to get the start address
+ // of the TLS block for the current module.
+ TLSBASEADDR,
+
+ // Thread Local Storage. When calling to an OS provided
+ // thunk at the address from an earlier relocation.
+ TLSCALL,
+
+ // Exception Handling helpers.
+ EH_RETURN,
+
+ // SjLj exception handling setjmp.
+ EH_SJLJ_SETJMP,
+
+ // SjLj exception handling longjmp.
+ EH_SJLJ_LONGJMP,
+
+ // SjLj exception handling dispatch.
+ EH_SJLJ_SETUP_DISPATCH,
+
+ /// Tail call return. See X86TargetLowering::LowerCall for
+ /// the list of operands.
+ TC_RETURN,
+
+ // Vector move to low scalar and zero higher vector elements.
+ VZEXT_MOVL,
+
+ // Vector integer truncate.
+ VTRUNC,
+ // Vector integer truncate with unsigned/signed saturation.
+ VTRUNCUS,
+ VTRUNCS,
+
+ // Masked version of the above. Used when less than a 128-bit result is
+ // produced since the mask only applies to the lower elements and can't
+ // be represented by a select.
+ // SRC, PASSTHRU, MASK
+ VMTRUNC,
+ VMTRUNCUS,
+ VMTRUNCS,
+
+ // Vector FP extend.
+ VFPEXT,
+ VFPEXT_SAE,
+ VFPEXTS,
+ VFPEXTS_SAE,
+
+ // Vector FP round.
+ VFPROUND,
+ VFPROUND_RND,
+ VFPROUNDS,
+ VFPROUNDS_RND,
+
+ // Masked version of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ VMFPROUND,
+
+ // 128-bit vector logical left / right shift
+ VSHLDQ,
+ VSRLDQ,
+
+ // Vector shift elements
+ VSHL,
+ VSRL,
+ VSRA,
+
+ // Vector variable shift
+ VSHLV,
+ VSRLV,
+ VSRAV,
+
+ // Vector shift elements by immediate
+ VSHLI,
+ VSRLI,
+ VSRAI,
+
+ // Shifts of mask registers.
+ KSHIFTL,
+ KSHIFTR,
+
+ // Bit rotate by immediate
+ VROTLI,
+ VROTRI,
+
+ // Vector packed double/float comparison.
+ CMPP,
+
+ // Vector integer comparisons.
+ PCMPEQ,
+ PCMPGT,
+
+ // v8i16 Horizontal minimum and position.
+ PHMINPOS,
+
+ MULTISHIFT,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ CMPM,
+ // Vector comparison with SAE for FP values
+ CMPM_SAE,
+
+ // Arithmetic operations with FLAGS results.
+ ADD,
+ SUB,
+ ADC,
+ SBB,
+ SMUL,
+ UMUL,
+ OR,
+ XOR,
+ AND,
+
+ // Bit field extract.
+ BEXTR,
+
+ // Zero High Bits Starting with Specified Bit Position.
+ BZHI,
+
+ // Parallel extract and deposit.
+ PDEP,
+ PEXT,
+
+ // X86-specific multiply by immediate.
+ MUL_IMM,
+
+ // Vector sign bit extraction.
+ MOVMSK,
+
+ // Vector bitwise comparisons.
+ PTEST,
+
+ // Vector packed fp sign bitwise comparisons.
+ TESTP,
+
+ // OR/AND test for masks.
+ KORTEST,
+ KTEST,
+
+ // ADD for masks.
+ KADD,
+
+ // Several flavors of instructions with vector shuffle behaviors.
+    // Saturated signed/unsigned packing.
+ PACKSS,
+ PACKUS,
+ // Intra-lane alignr.
+ PALIGNR,
+ // AVX512 inter-lane alignr.
+ VALIGN,
+ PSHUFD,
+ PSHUFHW,
+ PSHUFLW,
+ SHUFP,
+ // VBMI2 Concat & Shift.
+ VSHLD,
+ VSHRD,
+ VSHLDV,
+ VSHRDV,
+ // Shuffle Packed Values at 128-bit granularity.
+ SHUF128,
+ MOVDDUP,
+ MOVSHDUP,
+ MOVSLDUP,
+ MOVLHPS,
+ MOVHLPS,
+ MOVSD,
+ MOVSS,
+ UNPCKL,
+ UNPCKH,
+ VPERMILPV,
+ VPERMILPI,
+ VPERMI,
+ VPERM2X128,
+
+ // Variable Permute (VPERM).
+ // Res = VPERMV MaskV, V0
+ VPERMV,
+
+ // 3-op Variable Permute (VPERMT2).
+ // Res = VPERMV3 V0, MaskV, V1
+ VPERMV3,
+
+ // Bitwise ternary logic.
+ VPTERNLOG,
+ // Fix Up Special Packed Float32/64 values.
+ VFIXUPIMM,
+ VFIXUPIMM_SAE,
+ VFIXUPIMMS,
+ VFIXUPIMMS_SAE,
+ // Range Restriction Calculation For Packed Pairs of Float32/64 values.
+ VRANGE,
+ VRANGE_SAE,
+ VRANGES,
+ VRANGES_SAE,
+    // Reduce - Perform Reduction Transformation on scalar/packed FP.
+ VREDUCE,
+ VREDUCE_SAE,
+ VREDUCES,
+ VREDUCES_SAE,
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ VRNDSCALE,
+ VRNDSCALE_SAE,
+ VRNDSCALES,
+ VRNDSCALES_SAE,
+ // Tests Types Of a FP Values for packed types.
+ VFPCLASS,
+ // Tests Types Of a FP Values for scalar types.
+ VFPCLASSS,
+
+ // Broadcast (splat) scalar or element 0 of a vector. If the operand is
+ // a vector, this node may change the vector length as part of the splat.
+ VBROADCAST,
+ // Broadcast mask to vector.
+ VBROADCASTM,
+ // Broadcast subvector to vector.
+ SUBV_BROADCAST,
+
+ /// SSE4A Extraction and Insertion.
+ EXTRQI,
+ INSERTQI,
+
+ // XOP arithmetic/logical shifts.
+ VPSHA,
+ VPSHL,
+ // XOP signed/unsigned integer comparisons.
+ VPCOM,
+ VPCOMU,
+ // XOP packed permute bytes.
+ VPPERM,
+ // XOP two source permutation.
+ VPERMIL2,
+
+ // Vector multiply packed unsigned doubleword integers.
+ PMULUDQ,
+ // Vector multiply packed signed doubleword integers.
+ PMULDQ,
+    // Vector Multiply Packed Unsigned Integers with Round and Scale.
+ MULHRS,
+
+ // Multiply and Add Packed Integers.
+ VPMADDUBSW,
+ VPMADDWD,
+
+ // AVX512IFMA multiply and add.
+    // NOTE: These are different from the instructions and perform
+ // op0 x op1 + op2.
+ VPMADD52L,
+ VPMADD52H,
+
+ // VNNI
+ VPDPBUSD,
+ VPDPBUSDS,
+ VPDPWSSD,
+ VPDPWSSDS,
+
+ // FMA nodes.
+ // We use the target independent ISD::FMA for the non-inverted case.
+ FNMADD,
+ FMSUB,
+ FNMSUB,
+ FMADDSUB,
+ FMSUBADD,
+
+ // FMA with rounding mode.
+ FMADD_RND,
+ FNMADD_RND,
+ FMSUB_RND,
+ FNMSUB_RND,
+ FMADDSUB_RND,
+ FMSUBADD_RND,
+
+ // Compress and expand.
+ COMPRESS,
+ EXPAND,
+
+ // Bits shuffle
+ VPSHUFBITQMB,
+
+ // Convert Unsigned/Integer to Floating-Point Value with rounding mode.
+ SINT_TO_FP_RND,
+ UINT_TO_FP_RND,
+ SCALAR_SINT_TO_FP,
+ SCALAR_UINT_TO_FP,
+ SCALAR_SINT_TO_FP_RND,
+ SCALAR_UINT_TO_FP_RND,
+
+ // Vector float/double to signed/unsigned integer.
+ CVTP2SI,
+ CVTP2UI,
+ CVTP2SI_RND,
+ CVTP2UI_RND,
+ // Scalar float/double to signed/unsigned integer.
+ CVTS2SI,
+ CVTS2UI,
+ CVTS2SI_RND,
+ CVTS2UI_RND,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ CVTTP2SI,
+ CVTTP2UI,
+ CVTTP2SI_SAE,
+ CVTTP2UI_SAE,
+ // Scalar float/double to signed/unsigned integer with truncation.
+ CVTTS2SI,
+ CVTTS2UI,
+ CVTTS2SI_SAE,
+ CVTTS2UI_SAE,
+
+ // Vector signed/unsigned integer to float/double.
+ CVTSI2P,
+ CVTUI2P,
+
+ // Masked versions of above. Used for v2f64->v4f32.
+ // SRC, PASSTHRU, MASK
+ MCVTP2SI,
+ MCVTP2UI,
+ MCVTTP2SI,
+ MCVTTP2UI,
+ MCVTSI2P,
+ MCVTUI2P,
+
+ // Vector float to bfloat16.
+ // Convert TWO packed single data to one packed BF16 data
+ CVTNE2PS2BF16,
+ // Convert packed single data to packed BF16 data
+ CVTNEPS2BF16,
+ // Masked version of above.
+ // SRC, PASSTHRU, MASK
+ MCVTNEPS2BF16,
+
+    // Dot product of BF16 pairs accumulated into
+ // packed single precision.
+ DPBF16PS,
+
+ // Save xmm argument registers to the stack, according to %al. An operator
+ // is needed so that this can be expanded with control flow.
+ VASTART_SAVE_XMM_REGS,
+
+ // Windows's _chkstk call to do stack probing.
+ WIN_ALLOCA,
+
+ // For allocating variable amounts of stack space when using
+ // segmented stacks. Check if the current stacklet has enough space, and
+ // falls back to heap allocation if not.
+ SEG_ALLOCA,
+
+ // For allocating stack space when using stack clash protector.
+ // Allocation is performed by block, and each block is probed.
+ PROBED_ALLOCA,
+
+ // Memory barriers.
+ MEMBARRIER,
+ MFENCE,
+
+ // Get a random integer and indicate whether it is valid in CF.
+ RDRAND,
+
+ // Get a NIST SP800-90B & C compliant random integer and
+ // indicate whether it is valid in CF.
+ RDSEED,
+
+ // Protection keys
+ // RDPKRU - Operand 0 is chain. Operand 1 is value for ECX.
+ // WRPKRU - Operand 0 is chain. Operand 1 is value for EDX. Operand 2 is
+ // value for ECX.
+ RDPKRU,
+ WRPKRU,
+
+ // SSE42 string comparisons.
+ // These nodes produce 3 results, index, mask, and flags. X86ISelDAGToDAG
+ // will emit one or two instructions based on which results are used. If
+    // both flags and index/mask are used, we can use a single instruction since
+    // we won't have to pick an opcode for flags. Instead we can rely on the
+ // DAG to CSE everything and decide at isel.
+ PCMPISTR,
+ PCMPESTR,
+
+ // Test if in transactional execution.
+ XTEST,
+
+ // ERI instructions.
+ RSQRT28,
+ RSQRT28_SAE,
+ RSQRT28S,
+ RSQRT28S_SAE,
+ RCP28,
+ RCP28_SAE,
+ RCP28S,
+ RCP28S_SAE,
+ EXP2,
+ EXP2_SAE,
+
+ // Conversions between float and half-float.
+ CVTPS2PH,
+ CVTPH2PS,
+ CVTPH2PS_SAE,
+
+ // Masked version of above.
+ // SRC, RND, PASSTHRU, MASK
+ MCVTPS2PH,
+
+ // Galois Field Arithmetic Instructions
+ GF2P8AFFINEINVQB,
+ GF2P8AFFINEQB,
+ GF2P8MULB,
+
+ // LWP insert record.
+ LWPINS,
+
+ // User level wait
+ UMWAIT,
+ TPAUSE,
+
+ // Enqueue Stores Instructions
+ ENQCMD,
+ ENQCMDS,
+
+ // For avx512-vp2intersect
+ VP2INTERSECT,
+
+ /// X86 strict FP compare instructions.
+ STRICT_FCMP = ISD::FIRST_TARGET_STRICTFP_OPCODE,
+ STRICT_FCMPS,
+
+ // Vector packed double/float comparison.
+ STRICT_CMPP,
+
+ /// Vector comparison generating mask bits for fp and
+ /// integer signed and unsigned data types.
+ STRICT_CMPM,
+
+ // Vector float/double to signed/unsigned integer with truncation.
+ STRICT_CVTTP2SI,
+ STRICT_CVTTP2UI,
+
+ // Vector FP extend.
+ STRICT_VFPEXT,
+
+ // Vector FP round.
+ STRICT_VFPROUND,
+
+ // RndScale - Round FP Values To Include A Given Number Of Fraction Bits.
+ // Also used by the legacy (V)ROUND intrinsics where we mask out the
+ // scaling part of the immediate.
+ STRICT_VRNDSCALE,
+
+ // Vector signed/unsigned integer to float/double.
+ STRICT_CVTSI2P,
+ STRICT_CVTUI2P,
+
+ // Strict FMA nodes.
+ STRICT_FNMADD,
+ STRICT_FMSUB,
+ STRICT_FNMSUB,
+
+ // Conversions between float and half-float.
+ STRICT_CVTPS2PH,
+ STRICT_CVTPH2PS,
+
+ // Compare and swap.
+ LCMPXCHG_DAG = ISD::FIRST_TARGET_MEMORY_OPCODE,
+ LCMPXCHG8_DAG,
+ LCMPXCHG16_DAG,
+ LCMPXCHG8_SAVE_EBX_DAG,
+ LCMPXCHG16_SAVE_RBX_DAG,
+
+ /// LOCK-prefixed arithmetic read-modify-write instructions.
+ /// EFLAGS, OUTCHAIN = LADD(INCHAIN, PTR, RHS)
+ LADD,
+ LSUB,
+ LOR,
+ LXOR,
+ LAND,
+
+ // Load, scalar_to_vector, and zero extend.
+ VZEXT_LOAD,
+
+ // extract_vector_elt, store.
+ VEXTRACT_STORE,
+
+ // scalar broadcast from memory
+ VBROADCAST_LOAD,
+
+    // Store FP control word into i16 memory.
+ FNSTCW16m,
+
+ /// This instruction implements FP_TO_SINT with the
+ /// integer destination in memory and a FP reg source. This corresponds
+ /// to the X86::FIST*m instructions and the rounding mode change stuff. It
+ /// has two inputs (token chain and address) and two outputs (int value
+ /// and token chain). Memory VT specifies the type to store to.
+ FP_TO_INT_IN_MEM,
+
+ /// This instruction implements SINT_TO_FP with the
+ /// integer source in memory and FP reg result. This corresponds to the
+ /// X86::FILD*m instructions. It has two inputs (token chain and address)
+ /// and two outputs (FP value and token chain). The integer source type is
+ /// specified by the memory VT.
+ FILD,
+
+ /// This instruction implements a fp->int store from FP stack
+ /// slots. This corresponds to the fist instruction. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FIST,
+
+ /// This instruction implements an extending load to FP stack slots.
+ /// This corresponds to the X86::FLD32m / X86::FLD64m. It takes a chain
+ /// operand, and ptr to load from. The memory VT specifies the type to
+ /// load from.
+ FLD,
+
+ /// This instruction implements a truncating store from FP stack
+ /// slots. This corresponds to the X86::FST32m / X86::FST64m. It takes a
+ /// chain operand, value to store, address, and glue. The memory VT
+ /// specifies the type to store as.
+ FST,
+
+ /// This instruction grabs the address of the next argument
+ /// from a va_list. (reads and modifies the va_list in memory)
+ VAARG_64,
+
+ // Vector truncating store with unsigned/signed saturation
+ VTRUNCSTOREUS,
+ VTRUNCSTORES,
+ // Vector truncating masked store with unsigned/signed saturation
+ VMTRUNCSTOREUS,
+ VMTRUNCSTORES,
+
+ // X86 specific gather and scatter
+ MGATHER,
+ MSCATTER,
+
+ // WARNING: Do not add anything in the end unless you want the node to
+ // have memop! In fact, starting from FIRST_TARGET_MEMORY_OPCODE all
+ // opcodes will be thought as target memory ops!
+ };
} // end namespace X86ISD
/// Define some predicates that are used for node matching.
@@ -717,7 +844,10 @@ namespace llvm {
/// If Op is a constant whose elements are all the same constant or
/// undefined, return true and return the constant value in \p SplatVal.
- bool isConstantSplat(SDValue Op, APInt &SplatVal);
+ /// If we have undef bits that don't cover an entire element, we treat these
+ /// as zero if AllowPartialUndefs is set, else we fail and return false.
+ bool isConstantSplat(SDValue Op, APInt &SplatVal,
+ bool AllowPartialUndefs = true);
} // end namespace X86
//===--------------------------------------------------------------------===//
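
A small usage sketch of the extended X86::isConstantSplat signature (assumed call site inside the backend; the include is the local backend header):

    #include "X86ISelLowering.h"
    using namespace llvm;

    // Accept only splats whose undef bits, if any, cover whole elements.
    static bool getStrictSplat(SDValue Op, APInt &SplatVal) {
      // AllowPartialUndefs = false: partially undefined elements make the
      // query fail instead of being treated as zero.
      return X86::isConstantSplat(Op, SplatVal, /*AllowPartialUndefs=*/false);
    }
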
@@ -756,19 +886,7 @@ namespace llvm {
unsigned getByValTypeAlignment(Type *Ty,
const DataLayout &DL) const override;
- /// Returns the target specific optimal type for load
- /// and store operations as a result of memset, memcpy, and memmove
- /// lowering. If DstAlign is zero that means it's safe to destination
- /// alignment can satisfy any constraint. Similarly if SrcAlign is zero it
- /// means there isn't a need to check it against alignment requirement,
- /// probably because the source does not need to be loaded. If 'IsMemset' is
- /// true, that means it's expanding a memset. If 'ZeroMemset' is true, that
- /// means it's a memset of zero. 'MemcpyStrSrc' indicates whether the memcpy
- /// source is constant so it does not need to be loaded.
- /// It returns EVT::Other if the type should be determined using generic
- /// target-independent logic.
- EVT getOptimalMemOpType(uint64_t Size, unsigned DstAlign, unsigned SrcAlign,
- bool IsMemset, bool ZeroMemset, bool MemcpyStrSrc,
+ EVT getOptimalMemOpType(const MemOp &Op,
const AttributeList &FuncAttributes) const override;
/// Returns true if it's safe to use load / store of the
@@ -805,19 +923,6 @@ namespace llvm {
SDValue PerformDAGCombine(SDNode *N, DAGCombinerInfo &DCI) const override;
- // Return true if it is profitable to combine a BUILD_VECTOR with a
- // stride-pattern to a shuffle and a truncate.
- // Example of such a combine:
- // v4i32 build_vector((extract_elt V, 1),
- // (extract_elt V, 3),
- // (extract_elt V, 5),
- // (extract_elt V, 7))
- // -->
- // v4i32 truncate (bitcast (shuffle<1,u,3,u,4,u,5,u,6,u,7,u> V, u) to
- // v4i64)
- bool isDesirableToCombineBuildVectorToShuffleTruncate(
- ArrayRef<int> ShuffleMask, EVT SrcVT, EVT TruncVT) const override;
-
/// Return true if the target has native support for
/// the specified value type and it is 'desirable' to use the type for the
/// given node type. e.g. On x86 i16 is legal, but undesirable since i16
@@ -830,15 +935,12 @@ namespace llvm {
/// and some i16 instructions are slow.
bool IsDesirableToPromoteOp(SDValue Op, EVT &PVT) const override;
- /// Return 1 if we can compute the negated form of the specified expression
- /// for the same cost as the expression itself, or 2 if we can compute the
- /// negated form more cheaply than the expression itself. Else return 0.
- char isNegatibleForFree(SDValue Op, SelectionDAG &DAG, bool LegalOperations,
- bool ForCodeSize, unsigned Depth) const override;
-
- /// If isNegatibleForFree returns true, return the newly negated expression.
+    /// Return the newly negated expression if the cost is not expensive, and
+    /// set \p Cost to indicate whether the negation is cheaper than or
+    /// neutral to computing the original expression.
SDValue getNegatedExpression(SDValue Op, SelectionDAG &DAG,
bool LegalOperations, bool ForCodeSize,
+ NegatibleCost &Cost,
unsigned Depth) const override;
MachineBasicBlock *
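
A sketch of the caller pattern implied by the new signature (assumed; the NegatibleCost enumerator names follow TargetLowering and are not shown in this hunk):

    #include "llvm/CodeGen/SelectionDAG.h"
    #include "llvm/CodeGen/TargetLowering.h"
    using namespace llvm;

    // Use a negated form of Op only when it is strictly cheaper to compute.
    static SDValue tryCheaperNegation(const TargetLowering &TLI, SDValue Op,
                                      SelectionDAG &DAG, bool LegalOps,
                                      bool ForCodeSize) {
      using NC = TargetLowering::NegatibleCost;
      NC Cost = NC::Expensive;
      SDValue Neg =
          TLI.getNegatedExpression(Op, DAG, LegalOps, ForCodeSize, Cost);
      return (Neg && Cost == NC::Cheaper) ? Neg : SDValue();
    }
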
@@ -934,7 +1036,8 @@ namespace llvm {
EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Context,
EVT VT) const override;
- bool targetShrinkDemandedConstant(SDValue Op, const APInt &Demanded,
+ bool targetShrinkDemandedConstant(SDValue Op, const APInt &DemandedBits,
+ const APInt &DemandedElts,
TargetLoweringOpt &TLO) const override;
/// Determine which of the bits specified in Mask are known to be either
@@ -958,6 +1061,12 @@ namespace llvm {
TargetLoweringOpt &TLO,
unsigned Depth) const override;
+ bool SimplifyDemandedVectorEltsForTargetShuffle(SDValue Op,
+ const APInt &DemandedElts,
+ unsigned MaskIndex,
+ TargetLoweringOpt &TLO,
+ unsigned Depth) const;
+
bool SimplifyDemandedBitsForTargetNode(SDValue Op,
const APInt &DemandedBits,
const APInt &DemandedElts,
@@ -1047,6 +1156,8 @@ namespace llvm {
int getScalingFactorCost(const DataLayout &DL, const AddrMode &AM, Type *Ty,
unsigned AS) const override;
+ /// This is used to enable splatted operand transforms for vector shifts
+ /// and vector funnel shifts.
bool isVectorShiftByScalarCheap(Type *Ty) const override;
/// Add x86-specific opcodes to the default list.
@@ -1075,6 +1186,10 @@ namespace llvm {
bool isZExtFree(EVT VT1, EVT VT2) const override;
bool isZExtFree(SDValue Val, EVT VT2) const override;
+ bool shouldSinkOperands(Instruction *I,
+ SmallVectorImpl<Use *> &Ops) const override;
+ bool shouldConvertPhiType(Type *From, Type *To) const override;
+
/// Return true if folding a vector load into ExtVal (a sign, zero, or any
/// extend node) is profitable.
bool isVectorLoadExtDesirable(SDValue) const override;
@@ -1171,7 +1286,8 @@ namespace llvm {
/// Overflow nodes should get combined/lowered to optimal instructions
/// (they should allow eliminating explicit compares by getting flags from
/// math ops).
- bool shouldFormOverflowOp(unsigned Opcode, EVT VT) const override;
+ bool shouldFormOverflowOp(unsigned Opcode, EVT VT,
+ bool MathUsed) const override;
bool storeOfVectorConstantIsCheap(EVT MemVT, unsigned NumElem,
unsigned AddrSpace) const override {
@@ -1194,12 +1310,12 @@ namespace llvm {
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override;
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override;
virtual bool needsFixedCatchObjects() const override;
@@ -1227,8 +1343,10 @@ namespace llvm {
/// offset as appropriate.
Value *getSafeStackPointerLocation(IRBuilder<> &IRB) const override;
- std::pair<SDValue, SDValue> BuildFILD(SDValue Op, EVT SrcVT, SDValue Chain,
- SDValue StackSlot,
+ std::pair<SDValue, SDValue> BuildFILD(EVT DstVT, EVT SrcVT, const SDLoc &DL,
+ SDValue Chain, SDValue Pointer,
+ MachinePointerInfo PtrInfo,
+ Align Alignment,
SelectionDAG &DAG) const;
bool isNoopAddrSpaceCast(unsigned SrcAS, unsigned DestAS) const override;
@@ -1236,6 +1354,8 @@ namespace llvm {
/// Customize the preferred legalization strategy for certain types.
LegalizeTypeAction getPreferredVectorAction(MVT VT) const override;
+ bool softPromoteHalfType() const override { return true; }
+
MVT getRegisterTypeForCallingConv(LLVMContext &Context, CallingConv::ID CC,
EVT VT) const override;
@@ -1251,6 +1371,8 @@ namespace llvm {
bool supportSwiftError() const override;
+ bool hasStackProbeSymbol(MachineFunction &MF) const override;
+ bool hasInlineStackProbe(MachineFunction &MF) const override;
StringRef getStackProbeSymbolName(MachineFunction &MF) const override;
unsigned getStackProbeSize(MachineFunction &MF) const;
@@ -1314,7 +1436,7 @@ namespace llvm {
SDValue LowerMemOpCallTo(SDValue Chain, SDValue StackPtr, SDValue Arg,
const SDLoc &dl, SelectionDAG &DAG,
const CCValAssign &VA,
- ISD::ArgFlagsTy Flags) const;
+ ISD::ArgFlagsTy Flags, bool isByval) const;
// Call lowering helpers.
@@ -1340,8 +1462,9 @@ namespace llvm {
unsigned getAddressSpace(void) const;
- SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool isSigned,
+ SDValue FP_TO_INTHelper(SDValue Op, SelectionDAG &DAG, bool IsSigned,
SDValue &Chain) const;
+ SDValue LRINT_LLRINTHelper(SDNode *N, SelectionDAG &DAG) const;
SDValue LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerVSELECT(SDValue Op, SelectionDAG &DAG) const;
@@ -1365,8 +1488,8 @@ namespace llvm {
SDValue LowerUINT_TO_FP(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerTRUNCATE(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerFP_TO_INT(SDValue Op, SelectionDAG &DAG) const;
+ SDValue LowerLRINT_LLRINT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCC(SDValue Op, SelectionDAG &DAG) const;
- SDValue LowerSTRICT_FSETCC(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSETCCCARRY(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerSELECT(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerBRCOND(SDValue Op, SelectionDAG &DAG) const;
@@ -1431,7 +1554,7 @@ namespace llvm {
const MCPhysReg *getScratchRegisters(CallingConv::ID CC) const override;
TargetLoweringBase::AtomicExpansionKind
- shouldExpandAtomicLoadInIR(LoadInst *SI) const override;
+ shouldExpandAtomicLoadInIR(LoadInst *LI) const override;
bool shouldExpandAtomicStoreInIR(StoreInst *SI) const override;
TargetLoweringBase::AtomicExpansionKind
shouldExpandAtomicRMWInIR(AtomicRMWInst *AI) const override;
@@ -1464,26 +1587,23 @@ namespace llvm {
MachineBasicBlock *EmitLoweredSelect(MachineInstr &I,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredAtomicFP(MachineInstr &I,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *EmitLoweredCatchRet(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredCatchPad(MachineInstr &MI,
- MachineBasicBlock *BB) const;
-
MachineBasicBlock *EmitLoweredSegAlloca(MachineInstr &MI,
MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredProbedAlloca(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
+
MachineBasicBlock *EmitLoweredTLSAddr(MachineInstr &MI,
MachineBasicBlock *BB) const;
MachineBasicBlock *EmitLoweredTLSCall(MachineInstr &MI,
MachineBasicBlock *BB) const;
- MachineBasicBlock *EmitLoweredRetpoline(MachineInstr &MI,
- MachineBasicBlock *BB) const;
+ MachineBasicBlock *EmitLoweredIndirectThunk(MachineInstr &MI,
+ MachineBasicBlock *BB) const;
MachineBasicBlock *emitEHSjLjSetJmp(MachineInstr &MI,
MachineBasicBlock *MBB) const;
@@ -1497,32 +1617,25 @@ namespace llvm {
MachineBasicBlock *emitLongJmpShadowStackFix(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- MachineBasicBlock *emitFMA3Instr(MachineInstr &MI,
- MachineBasicBlock *MBB) const;
-
MachineBasicBlock *EmitSjLjDispatchBlock(MachineInstr &MI,
MachineBasicBlock *MBB) const;
- /// Convert a comparison if required by the subtarget.
- SDValue ConvertCmpIfNecessary(SDValue Cmp, SelectionDAG &DAG) const;
-
/// Emit flags for the given setcc condition and operands. Also returns the
/// corresponding X86 condition code constant in X86CC.
SDValue emitFlagsForSetcc(SDValue Op0, SDValue Op1, ISD::CondCode CC,
const SDLoc &dl, SelectionDAG &DAG,
- SDValue &X86CC, SDValue &Chain,
- bool IsSignaling) const;
+ SDValue &X86CC) const;
/// Check if replacement of SQRT with RSQRT should be disabled.
- bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override;
+ bool isFsqrtCheap(SDValue Op, SelectionDAG &DAG) const override;
/// Use rsqrt* to speed up sqrt calculations.
- SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ SDValue getSqrtEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
int &RefinementSteps, bool &UseOneConstNR,
bool Reciprocal) const override;
/// Use rcp* to speed up fdiv calculations.
- SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled,
+ SDValue getRecipEstimate(SDValue Op, SelectionDAG &DAG, int Enabled,
int &RefinementSteps) const override;
/// Reassociate floating point divisions into multiply by reciprocal.
@@ -1537,101 +1650,14 @@ namespace llvm {
const TargetLibraryInfo *libInfo);
} // end namespace X86
- // Base class for all X86 non-masked store operations.
- class X86StoreSDNode : public MemSDNode {
- public:
- X86StoreSDNode(unsigned Opcode, unsigned Order, const DebugLoc &dl,
- SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- :MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
- const SDValue &getValue() const { return getOperand(1); }
- const SDValue &getBasePtr() const { return getOperand(2); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VTRUNCSTORES ||
- N->getOpcode() == X86ISD::VTRUNCSTOREUS;
- }
- };
-
- // Base class for all X86 masked store operations.
- // The class has the same order of operands as MaskedStoreSDNode for
- // convenience.
- class X86MaskedStoreSDNode : public MemSDNode {
- public:
- X86MaskedStoreSDNode(unsigned Opcode, unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : MemSDNode(Opcode, Order, dl, VTs, MemVT, MMO) {}
-
- const SDValue &getValue() const { return getOperand(1); }
- const SDValue &getBasePtr() const { return getOperand(2); }
- const SDValue &getMask() const { return getOperand(3); }
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VMTRUNCSTORES ||
- N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
- }
- };
-
- // X86 Truncating Store with Signed saturation.
- class TruncSStoreSDNode : public X86StoreSDNode {
- public:
- TruncSStoreSDNode(unsigned Order, const DebugLoc &dl,
- SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
- : X86StoreSDNode(X86ISD::VTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VTRUNCSTORES;
- }
- };
-
- // X86 Truncating Store with Unsigned saturation.
- class TruncUSStoreSDNode : public X86StoreSDNode {
- public:
- TruncUSStoreSDNode(unsigned Order, const DebugLoc &dl,
- SDVTList VTs, EVT MemVT, MachineMemOperand *MMO)
- : X86StoreSDNode(X86ISD::VTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VTRUNCSTOREUS;
- }
- };
-
- // X86 Truncating Masked Store with Signed saturation.
- class MaskedTruncSStoreSDNode : public X86MaskedStoreSDNode {
- public:
- MaskedTruncSStoreSDNode(unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTORES, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VMTRUNCSTORES;
- }
- };
-
- // X86 Truncating Masked Store with Unsigned saturation.
- class MaskedTruncUSStoreSDNode : public X86MaskedStoreSDNode {
- public:
- MaskedTruncUSStoreSDNode(unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : X86MaskedStoreSDNode(X86ISD::VMTRUNCSTOREUS, Order, dl, VTs, MemVT, MMO) {}
-
- static bool classof(const SDNode *N) {
- return N->getOpcode() == X86ISD::VMTRUNCSTOREUS;
- }
- };
-
// X86 specific Gather/Scatter nodes.
// The class has the same order of operands as MaskedGatherScatterSDNode for
// convenience.
- class X86MaskedGatherScatterSDNode : public MemSDNode {
+ class X86MaskedGatherScatterSDNode : public MemIntrinsicSDNode {
public:
- X86MaskedGatherScatterSDNode(unsigned Opc, unsigned Order,
- const DebugLoc &dl, SDVTList VTs, EVT MemVT,
- MachineMemOperand *MMO)
- : MemSDNode(Opc, Order, dl, VTs, MemVT, MMO) {}
+    // This is intended as a utility and should never be directly created.
+ X86MaskedGatherScatterSDNode() = delete;
+ ~X86MaskedGatherScatterSDNode() = delete;
const SDValue &getBasePtr() const { return getOperand(3); }
const SDValue &getIndex() const { return getOperand(4); }
@@ -1646,11 +1672,6 @@ namespace llvm {
class X86MaskedGatherSDNode : public X86MaskedGatherScatterSDNode {
public:
- X86MaskedGatherSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
- EVT MemVT, MachineMemOperand *MMO)
- : X86MaskedGatherScatterSDNode(X86ISD::MGATHER, Order, dl, VTs, MemVT,
- MMO) {}
-
const SDValue &getPassThru() const { return getOperand(1); }
static bool classof(const SDNode *N) {
@@ -1660,11 +1681,6 @@ namespace llvm {
class X86MaskedScatterSDNode : public X86MaskedGatherScatterSDNode {
public:
- X86MaskedScatterSDNode(unsigned Order, const DebugLoc &dl, SDVTList VTs,
- EVT MemVT, MachineMemOperand *MMO)
- : X86MaskedGatherScatterSDNode(X86ISD::MSCATTER, Order, dl, VTs, MemVT,
- MMO) {}
-
const SDValue &getValue() const { return getOperand(1); }
static bool classof(const SDNode *N) {
@@ -1673,47 +1689,15 @@ namespace llvm {
};
/// Generate unpacklo/unpackhi shuffle mask.
- template <typename T = int>
- void createUnpackShuffleMask(MVT VT, SmallVectorImpl<T> &Mask, bool Lo,
- bool Unary) {
- assert(Mask.empty() && "Expected an empty shuffle mask vector");
- int NumElts = VT.getVectorNumElements();
- int NumEltsInLane = 128 / VT.getScalarSizeInBits();
- for (int i = 0; i < NumElts; ++i) {
- unsigned LaneStart = (i / NumEltsInLane) * NumEltsInLane;
- int Pos = (i % NumEltsInLane) / 2 + LaneStart;
- Pos += (Unary ? 0 : NumElts * (i % 2));
- Pos += (Lo ? 0 : NumEltsInLane / 2);
- Mask.push_back(Pos);
- }
- }
-
- /// Helper function to scale a shuffle or target shuffle mask, replacing each
- /// mask index with the scaled sequential indices for an equivalent narrowed
- /// mask. This is the reverse process to canWidenShuffleElements, but can
- /// always succeed.
- template <typename T>
- void scaleShuffleMask(size_t Scale, ArrayRef<T> Mask,
- SmallVectorImpl<T> &ScaledMask) {
- assert(0 < Scale && "Unexpected scaling factor");
- size_t NumElts = Mask.size();
- ScaledMask.assign(NumElts * Scale, -1);
-
- for (size_t i = 0; i != NumElts; ++i) {
- int M = Mask[i];
-
- // Repeat sentinel values in every mask element.
- if (M < 0) {
- for (size_t s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = M;
- continue;
- }
-
- // Scale mask element and increment across each mask element.
- for (size_t s = 0; s != Scale; ++s)
- ScaledMask[(Scale * i) + s] = (Scale * M) + s;
- }
- }
+ void createUnpackShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo,
+ bool Unary);
+
+ /// Similar to unpacklo/unpackhi, but without the 128-bit lane limitation
+ /// imposed by AVX and specific to the unary pattern. Example:
+ /// v8iX Lo --> <0, 0, 1, 1, 2, 2, 3, 3>
+ /// v8iX Hi --> <4, 4, 5, 5, 6, 6, 7, 7>
+ void createSplat2ShuffleMask(MVT VT, SmallVectorImpl<int> &Mask, bool Lo);
+
} // end namespace llvm
#endif // LLVM_LIB_TARGET_X86_X86ISELLOWERING_H
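For reference, a minimal standalone sketch of the splat2 mask pattern documented for createSplat2ShuffleMask above. The helper name splat2Mask and the use of std::vector are illustrative assumptions; the real implementation lives in X86ISelLowering.cpp and uses SmallVectorImpl.

#include <cassert>
#include <vector>

// Sketch only: build the unary splat2 mask described above, e.g. for 8
// elements Lo -> <0,0,1,1,2,2,3,3> and Hi -> <4,4,5,5,6,6,7,7>.
static std::vector<int> splat2Mask(unsigned NumElts, bool Lo) {
  assert(NumElts % 2 == 0 && "Expected an even element count");
  std::vector<int> Mask;
  unsigned Base = Lo ? 0 : NumElts / 2; // Lo half starts at 0, Hi at NumElts/2.
  for (unsigned i = 0; i != NumElts / 2; ++i) {
    Mask.push_back(Base + i);           // Each source element appears twice,
    Mask.push_back(Base + i);           // with no 128-bit lane restriction.
  }
  return Mask;
}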
diff --git a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
index 48d0d8a35704..1628f85da808 100644
--- a/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
+++ b/llvm/lib/Target/X86/X86IndirectBranchTracking.cpp
@@ -18,6 +18,7 @@
#include "X86.h"
#include "X86InstrInfo.h"
#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
@@ -91,9 +92,7 @@ static bool IsCallReturnTwice(llvm::MachineOperand &MOp) {
if (!CalleeFn)
return false;
AttributeList Attrs = CalleeFn->getAttributes();
- if (Attrs.hasAttribute(AttributeList::FunctionIndex, Attribute::ReturnsTwice))
- return true;
- return false;
+ return Attrs.hasFnAttribute(Attribute::ReturnsTwice);
}
bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
@@ -102,7 +101,16 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
// Check that the cf-protection-branch is enabled.
Metadata *isCFProtectionSupported =
MF.getMMI().getModule()->getModuleFlag("cf-protection-branch");
- if (!isCFProtectionSupported && !IndirectBranchTracking)
+ // NB: We need to enable IBT in jitted code if the JIT compiler is
+ // CET-enabled.
+ const X86TargetMachine *TM =
+ static_cast<const X86TargetMachine *>(&MF.getTarget());
+#ifdef __CET__
+ bool isJITwithCET = TM->isJIT();
+#else
+ bool isJITwithCET = false;
+#endif
+ if (!isCFProtectionSupported && !IndirectBranchTracking && !isJITwithCET)
return false;
// True if the current MF was changed and false otherwise.
@@ -111,10 +119,11 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
TII = SubTarget.getInstrInfo();
EndbrOpcode = SubTarget.is64Bit() ? X86::ENDBR64 : X86::ENDBR32;
- // Non-internal function or function whose address was taken, can be
- // accessed through indirect calls. Mark the first BB with ENDBR instruction
- // unless nocf_check attribute is used.
- if ((MF.getFunction().hasAddressTaken() ||
+ // A function using the large code model, a non-internal function, or a
+ // function whose address was taken can be reached through indirect calls.
+ // Mark the first BB with ENDBR unless the nocf_check attribute is used.
+ if ((TM->getCodeModel() == CodeModel::Large ||
+ MF.getFunction().hasAddressTaken() ||
!MF.getFunction().hasLocalLinkage()) &&
!MF.getFunction().doesNoCfCheck()) {
auto MBB = MF.begin();
@@ -128,10 +137,38 @@ bool X86IndirectBranchTrackingPass::runOnMachineFunction(MachineFunction &MF) {
Changed |= addENDBR(MBB, MBB.begin());
for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
- if (!I->isCall())
- continue;
- if (IsCallReturnTwice(I->getOperand(0)))
+ if (I->isCall() && IsCallReturnTwice(I->getOperand(0)))
+ Changed |= addENDBR(MBB, std::next(I));
+ }
+
+ // Exception handling may indirectly jump to a catch pad, so we should add
+ // ENDBR before catch pad instructions. For the SjLj exception model, a new
+ // BB (a new landing pad) is created that indirectly jumps to the old one.
+ if (TM->Options.ExceptionModel == ExceptionHandling::SjLj) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ // New Landingpad BB without EHLabel.
+ if (MBB.isEHPad()) {
+ if (I->isDebugInstr())
+ continue;
+ Changed |= addENDBR(MBB, I);
+ break;
+ } else if (I->isEHLabel()) {
+ // Old landing pad BB (no longer a landing pad) with
+ // the old "callee" EHLabel.
+ MCSymbol *Sym = I->getOperand(0).getMCSymbol();
+ if (!MF.hasCallSiteLandingPad(Sym))
+ continue;
+ Changed |= addENDBR(MBB, std::next(I));
+ break;
+ }
+ }
+ } else if (MBB.isEHPad()) {
+ for (MachineBasicBlock::iterator I = MBB.begin(); I != MBB.end(); ++I) {
+ if (!I->isEHLabel())
+ continue;
Changed |= addENDBR(MBB, std::next(I));
+ break;
+ }
}
}
return Changed;
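As a reading aid, a standalone sketch of the entry-block ENDBR decision made in the hunk above. The FunctionTraits struct and needsEntryEndbr helper are hypothetical stand-ins for the MachineFunction/Function queries used by the pass; this is not part of the patch.

// Sketch only: model of the "does the entry block need ENDBR?" predicate.
struct FunctionTraits {
  bool LargeCodeModel;  // TM->getCodeModel() == CodeModel::Large
  bool AddressTaken;    // MF.getFunction().hasAddressTaken()
  bool LocalLinkage;    // MF.getFunction().hasLocalLinkage()
  bool NoCfCheck;       // MF.getFunction().doesNoCfCheck()
};

static bool needsEntryEndbr(const FunctionTraits &F) {
  // Reachable via an indirect call unless nocf_check opts the function out.
  return (F.LargeCodeModel || F.AddressTaken || !F.LocalLinkage) &&
         !F.NoCfCheck;
}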
diff --git a/llvm/lib/Target/X86/X86IndirectThunks.cpp b/llvm/lib/Target/X86/X86IndirectThunks.cpp
new file mode 100644
index 000000000000..828887d96129
--- /dev/null
+++ b/llvm/lib/Target/X86/X86IndirectThunks.cpp
@@ -0,0 +1,270 @@
+//==- X86IndirectThunks.cpp - Construct indirect call/jump thunks for x86 --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// Pass that injects an MI thunk that is used to lower indirect calls in a way
+/// that prevents speculation on some x86 processors and can be used to mitigate
+/// security vulnerabilities due to targeted speculative execution and side
+/// channels such as CVE-2017-5715.
+///
+/// Currently supported thunks include:
+/// - Retpoline -- A RET-implemented trampoline that lowers indirect calls
+/// - LVI Thunk -- A CALL/JMP-implemented thunk that forces load serialization
+/// before making an indirect call/jump
+///
+/// Note that the reason that this is implemented as a MachineFunctionPass and
+/// not a ModulePass is that ModulePasses at this point in the LLVM X86 pipeline
+/// serialize all transformations, which can consume lots of memory.
+///
+/// TODO(chandlerc): All of this code could use better comments and
+/// documentation.
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/IndirectThunks.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineModuleInfo.h"
+#include "llvm/CodeGen/Passes.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Module.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-retpoline-thunks"
+
+static const char RetpolineNamePrefix[] = "__llvm_retpoline_";
+static const char R11RetpolineName[] = "__llvm_retpoline_r11";
+static const char EAXRetpolineName[] = "__llvm_retpoline_eax";
+static const char ECXRetpolineName[] = "__llvm_retpoline_ecx";
+static const char EDXRetpolineName[] = "__llvm_retpoline_edx";
+static const char EDIRetpolineName[] = "__llvm_retpoline_edi";
+
+static const char LVIThunkNamePrefix[] = "__llvm_lvi_thunk_";
+static const char R11LVIThunkName[] = "__llvm_lvi_thunk_r11";
+
+namespace {
+struct RetpolineThunkInserter : ThunkInserter<RetpolineThunkInserter> {
+ const char *getThunkPrefix() { return RetpolineNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ const auto &STI = MF.getSubtarget<X86Subtarget>();
+ return (STI.useRetpolineIndirectCalls() ||
+ STI.useRetpolineIndirectBranches()) &&
+ !STI.useRetpolineExternalThunk();
+ }
+ void insertThunks(MachineModuleInfo &MMI);
+ void populateThunk(MachineFunction &MF);
+};
+
+struct LVIThunkInserter : ThunkInserter<LVIThunkInserter> {
+ const char *getThunkPrefix() { return LVIThunkNamePrefix; }
+ bool mayUseThunk(const MachineFunction &MF) {
+ return MF.getSubtarget<X86Subtarget>().useLVIControlFlowIntegrity();
+ }
+ void insertThunks(MachineModuleInfo &MMI) {
+ createThunkFunction(MMI, R11LVIThunkName);
+ }
+ void populateThunk(MachineFunction &MF) {
+ assert(MF.size() == 1);
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ // This code mitigates LVI by replacing each indirect call/jump with a
+ // direct call/jump to a thunk that looks like:
+ // ```
+ // lfence
+ // jmpq *%r11
+ // ```
+ // This ensures that if the value in register %r11 was loaded from memory,
+ // then the value in %r11 is (architecturally) correct prior to the jump.
+ const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ BuildMI(&MF.front(), DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(&MF.front(), DebugLoc(), TII->get(X86::JMP64r)).addReg(X86::R11);
+ MF.front().addLiveIn(X86::R11);
+ return;
+ }
+};
+
+class X86IndirectThunks : public MachineFunctionPass {
+public:
+ static char ID;
+
+ X86IndirectThunks() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override { return "X86 Indirect Thunks"; }
+
+ bool doInitialization(Module &M) override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+private:
+ std::tuple<RetpolineThunkInserter, LVIThunkInserter> TIs;
+
+ // FIXME: When LLVM moves to C++17, these can become folds
+ template <typename... ThunkInserterT>
+ static void initTIs(Module &M,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ (void)std::initializer_list<int>{
+ (std::get<ThunkInserterT>(ThunkInserters).init(M), 0)...};
+ }
+ template <typename... ThunkInserterT>
+ static bool runTIs(MachineModuleInfo &MMI, MachineFunction &MF,
+ std::tuple<ThunkInserterT...> &ThunkInserters) {
+ bool Modified = false;
+ (void)std::initializer_list<int>{
+ Modified |= std::get<ThunkInserterT>(ThunkInserters).run(MMI, MF)...};
+ return Modified;
+ }
+};
+
+} // end anonymous namespace
+
+void RetpolineThunkInserter::insertThunks(MachineModuleInfo &MMI) {
+ if (MMI.getTarget().getTargetTriple().getArch() == Triple::x86_64)
+ createThunkFunction(MMI, R11RetpolineName);
+ else
+ for (StringRef Name : {EAXRetpolineName, ECXRetpolineName, EDXRetpolineName,
+ EDIRetpolineName})
+ createThunkFunction(MMI, Name);
+}
+
+void RetpolineThunkInserter::populateThunk(MachineFunction &MF) {
+ bool Is64Bit = MF.getTarget().getTargetTriple().getArch() == Triple::x86_64;
+ Register ThunkReg;
+ if (Is64Bit) {
+ assert(MF.getName() == "__llvm_retpoline_r11" &&
+ "Should only have an r11 thunk on 64-bit targets");
+
+ // __llvm_retpoline_r11:
+ // callq .Lr11_call_target
+ // .Lr11_capture_spec:
+ // pause
+ // lfence
+ // jmp .Lr11_capture_spec
+ // .align 16
+ // .Lr11_call_target:
+ // movq %r11, (%rsp)
+ // retq
+ ThunkReg = X86::R11;
+ } else {
+ // For 32-bit targets we need to emit a collection of thunks for various
+ // possible scratch registers as well as a fallback that uses EDI, which is
+ // normally callee saved.
+ // __llvm_retpoline_eax:
+ // calll .Leax_call_target
+ // .Leax_capture_spec:
+ // pause
+ // jmp .Leax_capture_spec
+ // .align 16
+ // .Leax_call_target:
+ // movl %eax, (%esp) # Clobber return addr
+ // retl
+ //
+ // __llvm_retpoline_ecx:
+ // ... # Same setup
+ // movl %ecx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edx:
+ // ... # Same setup
+ // movl %edx, (%esp)
+ // retl
+ //
+ // __llvm_retpoline_edi:
+ // ... # Same setup
+ // movl %edi, (%esp)
+ // retl
+ if (MF.getName() == EAXRetpolineName)
+ ThunkReg = X86::EAX;
+ else if (MF.getName() == ECXRetpolineName)
+ ThunkReg = X86::ECX;
+ else if (MF.getName() == EDXRetpolineName)
+ ThunkReg = X86::EDX;
+ else if (MF.getName() == EDIRetpolineName)
+ ThunkReg = X86::EDI;
+ else
+ llvm_unreachable("Invalid thunk name on x86-32!");
+ }
+
+ const TargetInstrInfo *TII = MF.getSubtarget<X86Subtarget>().getInstrInfo();
+ assert(MF.size() == 1);
+ MachineBasicBlock *Entry = &MF.front();
+ Entry->clear();
+
+ MachineBasicBlock *CaptureSpec =
+ MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MachineBasicBlock *CallTarget =
+ MF.CreateMachineBasicBlock(Entry->getBasicBlock());
+ MCSymbol *TargetSym = MF.getContext().createTempSymbol();
+ MF.push_back(CaptureSpec);
+ MF.push_back(CallTarget);
+
+ const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
+ const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
+
+ Entry->addLiveIn(ThunkReg);
+ BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
+
+ // The MIR verifier thinks that the CALL in the entry block will fall through
+ // to CaptureSpec, so mark it as the successor. Technically, CallTarget is
+ // the successor, but the MIR verifier doesn't know how to cope with that.
+ Entry->addSuccessor(CaptureSpec);
+
+ // In the capture loop for speculation, we want to stop the processor from
+ // speculating as fast as possible. On Intel processors, the PAUSE instruction
+ // will block speculation without consuming any execution resources. On AMD
+ // processors, the PAUSE instruction is (essentially) a nop, so we also use an
+ // LFENCE instruction which they have advised will stop speculation as well
+ // with minimal resource utilization. We still end the capture with a jump to
+ // form an infinite loop to fully guarantee that no matter what implementation
+ // of the x86 ISA, speculating this code path never escapes.
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
+ CaptureSpec->setHasAddressTaken();
+ CaptureSpec->addSuccessor(CaptureSpec);
+
+ CallTarget->addLiveIn(ThunkReg);
+ CallTarget->setHasAddressTaken();
+ CallTarget->setAlignment(Align(16));
+
+ // Insert return address clobber
+ const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
+ const Register SPReg = Is64Bit ? X86::RSP : X86::ESP;
+ addRegOffset(BuildMI(CallTarget, DebugLoc(), TII->get(MovOpc)), SPReg, false,
+ 0)
+ .addReg(ThunkReg);
+
+ CallTarget->back().setPreInstrSymbol(MF, TargetSym);
+ BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
+}
+
+FunctionPass *llvm::createX86IndirectThunksPass() {
+ return new X86IndirectThunks();
+}
+
+char X86IndirectThunks::ID = 0;
+
+bool X86IndirectThunks::doInitialization(Module &M) {
+ initTIs(M, TIs);
+ return false;
+}
+
+bool X86IndirectThunks::runOnMachineFunction(MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << getPassName() << '\n');
+ auto &MMI = getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
+ return runTIs(MMI, MF, TIs);
+}
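A possible C++17 shape for the initTIs/runTIs helpers hinted at by the FIXME in this file, shown as a hedged sketch. DemoInserter, initAll, and runAll are made-up names with a minimal init()/run() interface; this is not the LLVM implementation.

#include <tuple>

struct DemoInserter {             // Hypothetical stand-in for a ThunkInserter.
  void init() {}
  bool run() { return false; }
};

template <typename... InserterT>
void initAll(std::tuple<InserterT...> &Ts) {
  std::apply([](auto &...T) { (T.init(), ...); }, Ts); // fold over the comma op
}

template <typename... InserterT>
bool runAll(std::tuple<InserterT...> &Ts) {
  // Fold over | so every inserter runs (no short-circuiting), mirroring the
  // initializer_list trick used in the pass above.
  return std::apply([](auto &...T) { return (T.run() | ...); }, Ts);
}

Usage would look like: std::tuple<DemoInserter> Ts; initAll(Ts); bool Changed = runAll(Ts);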
diff --git a/llvm/lib/Target/X86/X86InsertPrefetch.cpp b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
index 2b1e3f23efd7..53925bbfd72f 100644
--- a/llvm/lib/Target/X86/X86InsertPrefetch.cpp
+++ b/llvm/lib/Target/X86/X86InsertPrefetch.cpp
@@ -173,7 +173,7 @@ bool X86InsertPrefetch::doInitialization(Module &M) {
void X86InsertPrefetch::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesAll();
- AU.addRequired<MachineModuleInfoWrapperPass>();
+ MachineFunctionPass::getAnalysisUsage(AU);
}
bool X86InsertPrefetch::runOnMachineFunction(MachineFunction &MF) {
diff --git a/llvm/lib/Target/X86/X86InsertWait.cpp b/llvm/lib/Target/X86/X86InsertWait.cpp
new file mode 100644
index 000000000000..a82d98d88b30
--- /dev/null
+++ b/llvm/lib/Target/X86/X86InsertWait.cpp
@@ -0,0 +1,151 @@
+//===- X86InsertWait.cpp - Insert WAIT for X87 instructions (strict FP) --===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the pass which inserts x86 WAIT instructions after
+// X87 instructions when strict floating point is enabled.
+//
+// The logic to insert a wait instruction after an X87 instruction is as below:
+// 1. If the X87 instruction neither raises a floating-point exception nor is
+//    a load/store instruction, or it is an X87 control instruction, don't
+//    insert a wait.
+// 2. If the instruction following the X87 instruction is an X87
+//    exception-synchronizing instruction, don't insert a wait.
+// 3. Otherwise, insert a wait instruction.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/Support/Debug.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-insert-wait"
+
+namespace {
+
+class WaitInsert : public MachineFunctionPass {
+public:
+ static char ID;
+
+ WaitInsert() : MachineFunctionPass(ID) {}
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ StringRef getPassName() const override {
+ return "X86 insert wait instruction";
+ }
+
+private:
+ const TargetInstrInfo *TII; // Machine instruction info.
+};
+
+} // namespace
+
+char WaitInsert::ID = 0;
+
+FunctionPass *llvm::createX86InsertX87waitPass() { return new WaitInsert(); }
+
+/// Return true if Reg is an X87 register.
+static bool isX87Reg(unsigned Reg) {
+ return (Reg == X86::FPCW || Reg == X86::FPSW ||
+ (Reg >= X86::ST0 && Reg <= X86::ST7));
+}
+
+/// Check if the instruction is an X87 instruction.
+static bool isX87Instruction(MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.operands()) {
+ if (!MO.isReg())
+ continue;
+ if (isX87Reg(MO.getReg()))
+ return true;
+ }
+ return false;
+}
+
+static bool isX87ControlInstruction(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ case X86::FNINIT:
+ case X86::FLDCW16m:
+ case X86::FNSTCW16m:
+ case X86::FNSTSW16r:
+ case X86::FNSTSWm:
+ case X86::FNCLEX:
+ case X86::FLDENVm:
+ case X86::FSTENVm:
+ case X86::FRSTORm:
+ case X86::FSAVEm:
+ case X86::FINCSTP:
+ case X86::FDECSTP:
+ case X86::FFREE:
+ case X86::FFREEP:
+ case X86::FNOP:
+ case X86::WAIT:
+ return true;
+ default:
+ return false;
+ }
+}
+
+static bool isX87NonWaitingControlInstruction(MachineInstr &MI) {
+ // A few special control instructions don't perform a wait operation.
+ switch (MI.getOpcode()) {
+ case X86::FNINIT:
+ case X86::FNSTSW16r:
+ case X86::FNSTSWm:
+ case X86::FNSTCW16m:
+ case X86::FNCLEX:
+ return true;
+ default:
+ return false;
+ }
+}
+
+bool WaitInsert::runOnMachineFunction(MachineFunction &MF) {
+ if (!MF.getFunction().hasFnAttribute(Attribute::StrictFP))
+ return false;
+
+ const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
+ TII = ST.getInstrInfo();
+ bool Changed = false;
+
+ for (MachineBasicBlock &MBB : MF) {
+ for (MachineBasicBlock::iterator MI = MBB.begin(); MI != MBB.end(); ++MI) {
+ // Skip non-X87 instructions.
+ if (!isX87Instruction(*MI))
+ continue;
+ // If the instruction neither raises a floating-point exception nor is
+ // a load/store instruction, or it is an X87 control instruction, do
+ // not insert a wait.
+ if (!(MI->mayRaiseFPException() || MI->mayLoadOrStore()) ||
+ isX87ControlInstruction(*MI))
+ continue;
+ // If the following instruction is an X87 instruction and isn't an X87
+ // non-waiting control instruction, we can omit inserting a wait instruction.
+ MachineBasicBlock::iterator AfterMI = std::next(MI);
+ if (AfterMI != MBB.end() && isX87Instruction(*AfterMI) &&
+ !isX87NonWaitingControlInstruction(*AfterMI))
+ continue;
+
+ BuildMI(MBB, AfterMI, MI->getDebugLoc(), TII->get(X86::WAIT));
+ LLVM_DEBUG(dbgs() << "\nInsert wait after:\t" << *MI);
+ // Skip over the newly inserted wait.
+ ++MI;
+ Changed = true;
+ }
+ }
+ return Changed;
+}
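To summarize the control flow above, a standalone model of the three insertion rules from this file's header comment. The InstrInfo struct and needsWaitAfter helper are illustrative assumptions; the pass itself queries MachineInstr operands and opcodes directly.

// Sketch only: model of the WAIT-insertion decision for one X87 instruction.
struct InstrInfo {
  bool IsX87;
  bool MayRaiseFPException;
  bool MayLoadOrStore;
  bool IsX87Control;            // e.g. FNINIT, FLDCW, WAIT, ...
  bool IsX87NonWaitingControl;  // FNINIT, FNSTSW, FNSTCW, FNCLEX
};

static bool needsWaitAfter(const InstrInfo &MI, const InstrInfo *Next) {
  if (!MI.IsX87)
    return false;
  // Rule 1: no FP exception and not a load/store, or an X87 control insn.
  if (!(MI.MayRaiseFPException || MI.MayLoadOrStore) || MI.IsX87Control)
    return false;
  // Rule 2: the next instruction already synchronizes X87 exceptions.
  if (Next && Next->IsX87 && !Next->IsX87NonWaitingControl)
    return false;
  // Rule 3: otherwise insert a WAIT.
  return true;
}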
diff --git a/llvm/lib/Target/X86/X86InstrAMX.td b/llvm/lib/Target/X86/X86InstrAMX.td
new file mode 100644
index 000000000000..e26dd5050a23
--- /dev/null
+++ b/llvm/lib/Target/X86/X86InstrAMX.td
@@ -0,0 +1,119 @@
+//===---- X86InstrAMX.td - AMX Instruction Set Extension --*- tablegen -*--===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file describes the instructions that make up the Intel AMX instruction
+// set.
+//
+//===----------------------------------------------------------------------===//
+
+//===----------------------------------------------------------------------===//
+// AMX instructions
+
+let Predicates = [HasAMXTILE, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def LDTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "ldtilecfg\t$src",
+ [(int_x86_ldtilecfg addr:$src)]>, VEX, T8PS;
+ def STTILECFG : I <0x49, MRM0m, (outs), (ins opaquemem:$src),
+ "sttilecfg\t$src",
+ [(int_x86_sttilecfg addr:$src)]>, VEX, T8PD;
+ def TILELOADD : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloadd\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8XD;
+ def TILELOADDT1 : I<0x4b, MRMSrcMemFSIB, (outs TILE:$dst),
+ (ins sibmem:$src),
+ "tileloaddt1\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8PD;
+ let Defs = [TMM0,TMM1,TMM2,TMM3,TMM4,TMM5,TMM6,TMM7] in
+ def TILERELEASE : I<0x49, MRM_C0, (outs), (ins),
+ "tilerelease", [(int_x86_tilerelease)]>, VEX, T8PS;
+ def TILESTORED : I<0x4b, MRMDestMemFSIB, (outs),
+ (ins sibmem:$dst, TILE:$src),
+ "tilestored\t{$src, $dst|$dst, $src}", []>,
+ VEX, T8XS;
+ def TILEZERO : I<0x49, MRMr0, (outs TILE:$dst), (ins),
+ "tilezero\t$dst", []>,
+ VEX, T8XD;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTILELOADD : PseudoI<(outs), (ins u8imm:$src1, sibmem:$src2), []>;
+ def PTILELOADDT1 : PseudoI<(outs), (ins u8imm:$src1,
+ sibmem:$src2), []>;
+ def PTILESTORED : PseudoI<(outs), (ins i8mem:$dst, u8imm:$src), []>;
+ def PTILEZERO : PseudoI<(outs), (ins u8imm:$src),
+ [(int_x86_tilezero imm:$src)]>;
+ }
+ } // SchedRW
+} // HasAMXTILE
+
+let Predicates = [HasAMXINT8, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in {
+ def TDPBSSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbssd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8XD;
+ def TDPBSUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbsud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8XS;
+ def TDPBUSD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbusd\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8PD;
+ def TDPBUUD : I<0x5e, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbuud\t{$src3, $src2, $dst|$dst, $src2, $src3}", []>,
+ VEX_4V, T8PS;
+ }
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBSSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbssd imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBSUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbsud imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBUSD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbusd imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ def PTDPBUUD : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbuud imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ }
+ }
+} // HasAMXINT8
+
+let Predicates = [HasAMXBF16, In64BitMode] in {
+ let SchedRW = [WriteSystem] in {
+ let Constraints = "$src1 = $dst" in
+ def TDPBF16PS : I<0x5c, MRMSrcReg4VOp3, (outs TILE:$dst),
+ (ins TILE:$src1, TILE:$src2, TILE:$src3),
+ "tdpbf16ps\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ []>, VEX_4V, T8XS;
+
+ let usesCustomInserter = 1 in {
+ // Pseudo instructions, using immediates instead of tile registers.
+ // To be translated to the actual instructions in X86ISelLowering.cpp
+ def PTDPBF16PS : PseudoI<(outs), (ins u8imm:$src1,
+ u8imm:$src2, u8imm:$src3),
+ [(int_x86_tdpbf16ps imm:$src1,
+ imm:$src2, imm:$src3)]>;
+ }
+ }
+} // HasAMXBF16
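For context, a sketch of how the tile instructions defined above are typically driven from C/C++ via the _tile_* intrinsics (immintrin.h, compiled with something like -mamx-tile -mamx-int8). The 64-byte palette-1 configuration layout shown is an assumption to verify against the ISA reference; none of this is part of the patch.

#include <immintrin.h>
#include <stdint.h>

struct TileConfig {               // Assumed 64-byte palette-1 layout.
  uint8_t Palette;
  uint8_t StartRow;
  uint8_t Reserved[14];
  uint16_t Colsb[16];             // bytes per row, per tile
  uint8_t Rows[16];               // rows, per tile
};

// Sketch only: 16x16 int32 accumulator += int8 tile product via tmm0..tmm2.
static void amxInt8Dot(const int8_t *A, const int8_t *B, int32_t *C) {
  TileConfig Cfg = {};
  Cfg.Palette = 1;
  Cfg.Rows[0] = 16; Cfg.Colsb[0] = 64;   // tmm0: 16x16 int32 accumulator
  Cfg.Rows[1] = 16; Cfg.Colsb[1] = 64;   // tmm1: 16x64 int8
  Cfg.Rows[2] = 16; Cfg.Colsb[2] = 64;   // tmm2: 16x64 int8 (packed operand)
  _tile_loadconfig(&Cfg);                // LDTILECFG
  _tile_zero(0);                         // TILEZERO tmm0
  _tile_loadd(1, A, 64);                 // TILELOADD, 64-byte row stride
  _tile_loadd(2, B, 64);
  _tile_dpbssd(0, 1, 2);                 // TDPBSSD: tmm0 += tmm1 * tmm2
  _tile_stored(0, C, 64);                // TILESTORED
  _tile_release();                       // TILERELEASE
}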
diff --git a/llvm/lib/Target/X86/X86InstrAVX512.td b/llvm/lib/Target/X86/X86InstrAVX512.td
index 32f012033fb0..a3ad0b1c8dd6 100644
--- a/llvm/lib/Target/X86/X86InstrAVX512.td
+++ b/llvm/lib/Target/X86/X86InstrAVX512.td
@@ -76,11 +76,11 @@ class X86VectorVTInfo<int numelts, ValueType eltvt, RegisterClass rc,
PatFrag ScalarLdFrag = !cast<PatFrag>("load" # EltVT);
PatFrag BroadcastLdFrag = !cast<PatFrag>("X86VBroadcastld" # EltSizeName);
- ComplexPattern ScalarIntMemCPat = !if (!eq (EltTypeName, "f32"),
- !cast<ComplexPattern>("sse_load_f32"),
- !if (!eq (EltTypeName, "f64"),
- !cast<ComplexPattern>("sse_load_f64"),
- ?));
+ PatFrags ScalarIntMemFrags = !if (!eq (EltTypeName, "f32"),
+ !cast<PatFrags>("sse_load_f32"),
+ !if (!eq (EltTypeName, "f64"),
+ !cast<PatFrags>("sse_load_f64"),
+ ?));
// The string to specify embedded broadcast in assembly.
string BroadcastStr = "{1to" # NumElts # "}";
@@ -169,6 +169,18 @@ def v16i1_info : X86KVectorVTInfo<VK16, VK16WM, v16i1>;
def v32i1_info : X86KVectorVTInfo<VK32, VK32WM, v32i1>;
def v64i1_info : X86KVectorVTInfo<VK64, VK64WM, v64i1>;
+// Used for matching masked operations. Ensures the operation part only has a
+// single use.
+def vselect_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2),
+ (vselect node:$mask, node:$src1, node:$src2), [{
+ return isProfitableToFormMaskedOp(N);
+}]>;
+
+def X86selects_mask : PatFrag<(ops node:$mask, node:$src1, node:$src2),
+ (X86selects node:$mask, node:$src1, node:$src2), [{
+ return isProfitableToFormMaskedOp(N);
+}]>;
+
// This multiclass generates the masking variants from the non-masking
// variant. It only provides the assembly pieces for the masking variants.
// It assumes custom ISel patterns for masking which can be provided as
@@ -220,7 +232,7 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskingRHS,
- SDNode Select = vselect,
+ SDPatternOperator Select = vselect_mask,
string MaskingConstraint = "",
bit IsCommutable = 0,
bit IsKCommutable = 0,
@@ -236,35 +248,36 @@ multiclass AVX512_maskable_common<bits<8> O, Format F, X86VectorVTInfo _,
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
-// perserved vector elements come from a new dummy input operand tied to $dst.
+// preserved vector elements come from a new dummy input operand tied to $dst.
// This version uses a separate dag for non-masking and masking.
multiclass AVX512_maskable_split<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS, dag MaskRHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
- SDNode Select = vselect> :
+ bit IsKZCommutable = IsCommutable> :
AVX512_maskable_custom<O, F, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
OpcodeStr, AttSrcAsm, IntelSrcAsm,
[(set _.RC:$dst, RHS)],
[(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
+ (vselect_mask _.KRCWM:$mask, MaskRHS, _.RC:$src0))],
[(set _.RC:$dst,
- (Select _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
- "$src0 = $dst", IsCommutable, IsKCommutable>;
+ (vselect_mask _.KRCWM:$mask, MaskRHS, _.ImmAllZerosV))],
+ "$src0 = $dst", IsCommutable, IsKCommutable,
+ IsKZCommutable>;
// This multiclass generates the unconditional/non-masking, the masking and
// the zero-masking variant of the vector instruction. In the masking case, the
-// perserved vector elements come from a new dummy input operand tied to $dst.
+// preserved vector elements come from a new dummy input operand tied to $dst.
multiclass AVX512_maskable<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins, string OpcodeStr,
string AttSrcAsm, string IntelSrcAsm,
dag RHS,
bit IsCommutable = 0, bit IsKCommutable = 0,
bit IsKZCommutable = IsCommutable,
- SDNode Select = vselect> :
+ SDPatternOperator Select = vselect_mask> :
AVX512_maskable_common<O, F, _, Outs, Ins,
!con((ins _.RC:$src0, _.KRCWM:$mask), Ins),
!con((ins _.KRCWM:$mask), Ins),
@@ -280,7 +293,7 @@ multiclass AVX512_maskable_scalar<bits<8> O, Format F, X86VectorVTInfo _,
string AttSrcAsm, string IntelSrcAsm,
dag RHS> :
AVX512_maskable<O, F, _, Outs, Ins, OpcodeStr, AttSrcAsm, IntelSrcAsm,
- RHS, 0, 0, 0, X86selects>;
+ RHS, 0, 0, 0, X86selects_mask>;
// Similar to AVX512_maskable but in this case one of the source operands
// ($src1) is already tied to $dst so we just use that for the preserved
@@ -292,7 +305,7 @@ multiclass AVX512_maskable_3src<bits<8> O, Format F, X86VectorVTInfo _,
dag RHS,
bit IsCommutable = 0,
bit IsKCommutable = 0,
- SDNode Select = vselect,
+ SDPatternOperator Select = vselect_mask,
bit MaskOnly = 0> :
AVX512_maskable_common<O, F, _, Outs,
!con((ins _.RC:$src1), NonTiedIns),
@@ -317,9 +330,9 @@ multiclass AVX512_maskable_3src_cast<bits<8> O, Format F, X86VectorVTInfo OutVT,
!con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
!con((ins InVT.RC:$src1, InVT.KRCWM:$mask), NonTiedIns),
OpcodeStr, AttSrcAsm, IntelSrcAsm, (null_frag),
- (vselect InVT.KRCWM:$mask, RHS,
+ (vselect_mask InVT.KRCWM:$mask, RHS,
(bitconvert InVT.RC:$src1)),
- vselect, "", IsCommutable>;
+ vselect_mask, "", IsCommutable>;
multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag NonTiedIns, string OpcodeStr,
@@ -330,7 +343,7 @@ multiclass AVX512_maskable_3src_scalar<bits<8> O, Format F, X86VectorVTInfo _,
bit MaskOnly = 0> :
AVX512_maskable_3src<O, F, _, Outs, NonTiedIns, OpcodeStr, AttSrcAsm,
IntelSrcAsm, RHS, IsCommutable, IsKCommutable,
- X86selects, MaskOnly>;
+ X86selects_mask, MaskOnly>;
multiclass AVX512_maskable_in_asm<bits<8> O, Format F, X86VectorVTInfo _,
dag Outs, dag Ins,
@@ -399,6 +412,36 @@ multiclass AVX512_maskable_cmp<bits<8> O, Format F, X86VectorVTInfo _,
OpcodeStr, AttSrcAsm, IntelSrcAsm, RHS,
(and _.KRCWM:$mask, RHS_su), IsCommutable>;
+// Used by conversion instructions.
+multiclass AVX512_maskable_cvt<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs,
+ dag Ins, dag MaskingIns, dag ZeroMaskingIns,
+ string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS, dag ZeroMaskingRHS> :
+ AVX512_maskable_custom<O, F, Outs, Ins, MaskingIns, ZeroMaskingIns, OpcodeStr,
+ AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst, MaskingRHS)],
+ [(set _.RC:$dst, ZeroMaskingRHS)],
+ "$src0 = $dst">;
+
+multiclass AVX512_maskable_fma<bits<8> O, Format F, X86VectorVTInfo _,
+ dag Outs, dag NonTiedIns, string OpcodeStr,
+ string AttSrcAsm, string IntelSrcAsm,
+ dag RHS, dag MaskingRHS, bit IsCommutable,
+ bit IsKCommutable> :
+ AVX512_maskable_custom<O, F, Outs,
+ !con((ins _.RC:$src1), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ !con((ins _.RC:$src1, _.KRCWM:$mask), NonTiedIns),
+ OpcodeStr, AttSrcAsm, IntelSrcAsm,
+ [(set _.RC:$dst, RHS)],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskingRHS, _.RC:$src1))],
+ [(set _.RC:$dst,
+ (vselect_mask _.KRCWM:$mask, MaskingRHS, _.ImmAllZerosV))],
+ "", IsCommutable, IsKCommutable>;
// Alias instruction that maps zero vector to pxor / xorp* for AVX-512.
// This is expanded by ExpandPostRAPseudos to an xorps / vxorps, and then
@@ -625,45 +668,45 @@ multiclass vinsert_for_mask_cast<string InstrStr, X86VectorVTInfo From,
list<Predicate> p> {
let Predicates = p in {
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT From.RC:$src2),
- (iPTR imm))),
- Cast.RC:$src0)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.RC:$src0)),
(!cast<Instruction>(InstrStr#"rrk")
Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT
- (bitconvert
- (From.LdFrag addr:$src2))),
- (iPTR imm))),
- Cast.RC:$src0)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT
+ (bitconvert
+ (From.LdFrag addr:$src2))),
+ (iPTR imm))),
+ Cast.RC:$src0)),
(!cast<Instruction>(InstrStr#"rmk")
Cast.RC:$src0, Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT From.RC:$src2),
- (iPTR imm))),
- Cast.ImmAllZerosV)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT From.RC:$src2),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#"rrkz")
Cast.KRCWM:$mask, To.RC:$src1, From.RC:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
def : Pat<(Cast.VT
- (vselect Cast.KRCWM:$mask,
- (bitconvert
- (vinsert_insert:$ins (To.VT To.RC:$src1),
- (From.VT (From.LdFrag addr:$src2)),
- (iPTR imm))),
- Cast.ImmAllZerosV)),
+ (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (vinsert_insert:$ins (To.VT To.RC:$src1),
+ (From.VT (From.LdFrag addr:$src2)),
+ (iPTR imm))),
+ Cast.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#"rmkz")
Cast.KRCWM:$mask, To.RC:$src1, addr:$src2,
(INSERT_get_vinsert_imm To.RC:$ins))>;
@@ -981,20 +1024,20 @@ multiclass vextract_for_mask_cast<string InstrStr, X86VectorVTInfo From,
SDNodeXForm EXTRACT_get_vextract_imm,
list<Predicate> p> {
let Predicates = p in {
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (vextract_extract:$ext
- (From.VT From.RC:$src), (iPTR imm)))),
- To.RC:$src0)),
+ def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ To.RC:$src0)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrk")
Cast.RC:$src0, Cast.KRCWM:$mask, From.RC:$src,
(EXTRACT_get_vextract_imm To.RC:$ext)))>;
- def : Pat<(Cast.VT (vselect Cast.KRCWM:$mask,
- (bitconvert
- (To.VT (vextract_extract:$ext
- (From.VT From.RC:$src), (iPTR imm)))),
- Cast.ImmAllZerosV)),
+ def : Pat<(Cast.VT (vselect_mask Cast.KRCWM:$mask,
+ (bitconvert
+ (To.VT (vextract_extract:$ext
+ (From.VT From.RC:$src), (iPTR imm)))),
+ Cast.ImmAllZerosV)),
(Cast.VT (!cast<Instruction>(InstrStr#"rrkz")
Cast.KRCWM:$mask, From.RC:$src,
(EXTRACT_get_vextract_imm To.RC:$ext)))>;
@@ -1101,18 +1144,18 @@ multiclass avx512_broadcast_scalar<bits<8> opc, string OpcodeStr,
string Name,
X86VectorVTInfo DestInfo, X86VectorVTInfo SrcInfo> {
def : Pat<(DestInfo.VT (X86VBroadcast SrcInfo.FRC:$src)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#r)
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rr)
(SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
- def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
- (X86VBroadcast SrcInfo.FRC:$src),
- DestInfo.RC:$src0)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#rk)
+ def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.RC:$src0)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rrk)
DestInfo.RC:$src0, DestInfo.KRCWM:$mask,
(SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
- def : Pat<(DestInfo.VT (vselect DestInfo.KRCWM:$mask,
- (X86VBroadcast SrcInfo.FRC:$src),
- DestInfo.ImmAllZerosV)),
- (!cast<Instruction>(Name#DestInfo.ZSuffix#rkz)
+ def : Pat<(DestInfo.VT (vselect_mask DestInfo.KRCWM:$mask,
+ (X86VBroadcast SrcInfo.FRC:$src),
+ DestInfo.ImmAllZerosV)),
+ (!cast<Instruction>(Name#DestInfo.ZSuffix#rrkz)
DestInfo.KRCWM:$mask, (SrcInfo.VT (COPY_TO_REGCLASS SrcInfo.FRC:$src, SrcInfo.RC)))>;
}
@@ -1128,83 +1171,83 @@ multiclass avx512_broadcast_rm_split<bits<8> opc, string OpcodeStr,
SDPatternOperator UnmaskedOp = X86VBroadcast,
SDPatternOperator UnmaskedBcastOp = SrcInfo.BroadcastLdFrag> {
let hasSideEffects = 0 in
- def r : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set MaskInfo.RC:$dst,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
- DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
- def rkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
- "${dst} {${mask}} {z}, $src}"),
- [(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
- MaskInfo.ImmAllZerosV))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
- let Constraints = "$src0 = $dst" in
- def rk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
- SrcInfo.RC:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
- "${dst} {${mask}}, $src}"),
+ def rr : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst), (ins SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
- MaskInfo.RC:$src0))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedOp (SrcInfo.VT SrcInfo.RC:$src))))))],
+ DestInfo.ExeDomain>, T8PD, EVEX, Sched<[SchedRR]>;
+ def rrkz : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ, Sched<[SchedRR]>;
+ let Constraints = "$src0 = $dst" in
+ def rrk : AVX512PI<opc, MRMSrcReg, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.RC:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (X86VBroadcast (SrcInfo.VT SrcInfo.RC:$src))))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K, Sched<[SchedRR]>;
let hasSideEffects = 0, mayLoad = 1 in
- def m : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
- (ins SrcInfo.ScalarMemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set MaskInfo.RC:$dst,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (UnmaskedBcastOp addr:$src)))))],
- DestInfo.ExeDomain>, T8PD, EVEX,
- EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
-
- def mkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
- "${dst} {${mask}} {z}, $src}"),
- [(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (SrcInfo.BroadcastLdFrag addr:$src)))),
- MaskInfo.ImmAllZerosV))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
- EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+ def rm : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (UnmaskedBcastOp addr:$src)))))],
+ DestInfo.ExeDomain>, T8PD, EVEX,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+
+ def rmkz : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.KRCWM:$mask, SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}} {z}|",
+ "${dst} {${mask}} {z}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.ImmAllZerosV))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_KZ,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
let Constraints = "$src0 = $dst",
isConvertibleToThreeAddress = IsConvertibleToThreeAddress in
- def mk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
- (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
- SrcInfo.ScalarMemOp:$src),
- !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
- "${dst} {${mask}}, $src}"),
- [(set MaskInfo.RC:$dst,
- (vselect MaskInfo.KRCWM:$mask,
- (MaskInfo.VT
- (bitconvert
- (DestInfo.VT
- (SrcInfo.BroadcastLdFrag addr:$src)))),
- MaskInfo.RC:$src0))],
- DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
- EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
+ def rmk : AVX512PI<opc, MRMSrcMem, (outs MaskInfo.RC:$dst),
+ (ins MaskInfo.RC:$src0, MaskInfo.KRCWM:$mask,
+ SrcInfo.ScalarMemOp:$src),
+ !strconcat(OpcodeStr, "\t{$src, ${dst} {${mask}}|",
+ "${dst} {${mask}}, $src}"),
+ [(set MaskInfo.RC:$dst,
+ (vselect_mask MaskInfo.KRCWM:$mask,
+ (MaskInfo.VT
+ (bitconvert
+ (DestInfo.VT
+ (SrcInfo.BroadcastLdFrag addr:$src)))),
+ MaskInfo.RC:$src0))],
+ DestInfo.ExeDomain>, T8PD, EVEX, EVEX_K,
+ EVEX_CD8<SrcInfo.EltSize, CD8VT1>, Sched<[SchedRM]>;
}
// Helper class to force mask and broadcast result to same type.
@@ -1267,35 +1310,38 @@ defm VBROADCASTSD : avx512_fp_broadcast_sd<0x19, "vbroadcastsd",
multiclass avx512_int_broadcast_reg<bits<8> opc, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC> {
+ // Fold with a mask even if it has multiple uses since it is cheap.
let ExeDomain = _.ExeDomain in
- defm r : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins SrcRC:$src),
- "vpbroadcast"##_.Suffix, "$src", "$src",
- (_.VT (OpNode SrcRC:$src))>, T8PD, EVEX,
- Sched<[SchedRR]>;
+ defm rr : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins SrcRC:$src),
+ "vpbroadcast"#_.Suffix, "$src", "$src",
+ (_.VT (OpNode SrcRC:$src)), /*IsCommutable*/0,
+ /*IsKCommutable*/0, /*IsKZCommutable*/0, vselect>,
+ T8PD, EVEX, Sched<[SchedRR]>;
}
multiclass avx512_int_broadcastbw_reg<bits<8> opc, string Name, SchedWrite SchedRR,
X86VectorVTInfo _, SDPatternOperator OpNode,
RegisterClass SrcRC, SubRegIndex Subreg> {
let hasSideEffects = 0, ExeDomain = _.ExeDomain in
- defm r : AVX512_maskable_custom<opc, MRMSrcReg,
- (outs _.RC:$dst), (ins GR32:$src),
- !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
- !con((ins _.KRCWM:$mask), (ins GR32:$src)),
- "vpbroadcast"##_.Suffix, "$src", "$src", [], [], [],
- "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
+ defm rr : AVX512_maskable_custom<opc, MRMSrcReg,
+ (outs _.RC:$dst), (ins GR32:$src),
+ !con((ins _.RC:$src0, _.KRCWM:$mask), (ins GR32:$src)),
+ !con((ins _.KRCWM:$mask), (ins GR32:$src)),
+ "vpbroadcast"#_.Suffix, "$src", "$src", [], [], [],
+ "$src0 = $dst">, T8PD, EVEX, Sched<[SchedRR]>;
def : Pat <(_.VT (OpNode SrcRC:$src)),
- (!cast<Instruction>(Name#r)
+ (!cast<Instruction>(Name#rr)
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
+ // Fold with a mask even if it has multiple uses since it is cheap.
def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.RC:$src0),
- (!cast<Instruction>(Name#rk) _.RC:$src0, _.KRCWM:$mask,
+ (!cast<Instruction>(Name#rrk) _.RC:$src0, _.KRCWM:$mask,
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
def : Pat <(vselect _.KRCWM:$mask, (_.VT (OpNode SrcRC:$src)), _.ImmAllZerosV),
- (!cast<Instruction>(Name#rkz) _.KRCWM:$mask,
+ (!cast<Instruction>(Name#rrkz) _.KRCWM:$mask,
(i32 (INSERT_SUBREG (i32 (IMPLICIT_DEF)), SrcRC:$src, Subreg)))>;
}
@@ -1392,72 +1438,6 @@ multiclass avx512_subvec_broadcast_rm_dq<bits<8> opc, string OpcodeStr,
AVX5128IBase, EVEX;
}
-let Predicates = [HasAVX512] in {
- // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v8i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQZm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v16i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDZm addr:$src)>;
-}
-
-let Predicates = [HasVLX] in {
- // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQZ128m addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQZ256m addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDZ128m addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDZ256m addr:$src)>;
-}
-let Predicates = [HasVLX, HasBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWZ256m addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWZ256m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWZ256m addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWZ128m addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWZ256m addr:$src)>;
-}
-let Predicates = [HasBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v32i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWZm addr:$src)>;
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWZm addr:$src)>;
- def : Pat<(v32i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWZm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v32i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWZm addr:$src)>;
-}
-
//===----------------------------------------------------------------------===//
// AVX-512 BROADCAST SUBVECTORS
//
@@ -1516,38 +1496,38 @@ def : Pat<(v64i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
(VBROADCASTI32X4rm addr:$src)>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (v16f32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X4rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- (v16i32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
(VBROADCASTI32X4rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI32X4rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
- (v8f64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X4rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv8f32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
- (v8i64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X4rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv8i32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI64X4rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
@@ -1569,21 +1549,21 @@ def : Pat<(v32i8 (X86SubVBroadcast (loadv16i8 addr:$src))),
(VBROADCASTI32X4Z256rm addr:$src)>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- (v8f32 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ (v8f32 immAllZerosV)),
(VBROADCASTF32X4Z256rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f32 (v4f64 (X86SubVBroadcast (loadv2f64 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTF32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- (v8i32 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ (v8i32 immAllZerosV)),
(VBROADCASTI32X4Z256rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i32 (v4i64 (X86SubVBroadcast (loadv2i64 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTI32X4Z256rmk VR256X:$src0, VK8WM:$mask, addr:$src)>;
@@ -1618,21 +1598,21 @@ defm VBROADCASTF64X2Z128 : avx512_subvec_broadcast_rm_dq<0x1a, "vbroadcastf64x2"
EVEX_V256, EVEX_CD8<64, CD8VT2>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (v4f64 immAllZerosV)),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (v4f64 immAllZerosV)),
(VBROADCASTF64X2Z128rmkz VK4WM:$mask, addr:$src)>;
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4f64 (v8f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTF64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (v4i64 immAllZerosV)),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (v4i64 immAllZerosV)),
(VBROADCASTI64X2Z128rmkz VK4WM:$mask, addr:$src)>;
-def : Pat<(vselect VK4WM:$mask,
- (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- VR256X:$src0),
+def : Pat<(vselect_mask VK4WM:$mask,
+ (bc_v4i64 (v8i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ VR256X:$src0),
(VBROADCASTI64X2Z128rmk VR256X:$src0, VK4WM:$mask, addr:$src)>;
}
@@ -1651,38 +1631,38 @@ defm VBROADCASTF32X8 : avx512_subvec_broadcast_rm_dq<0x1b, "vbroadcastf32x8",
EVEX_V512, EVEX_CD8<32, CD8VT8>;
// Patterns for selects of bitcasted operations.
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
- (v16f32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ (v16f32 immAllZerosV)),
(VBROADCASTF32X8rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16f32 (v8f64 (X86SubVBroadcast (loadv4f64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
- (v16i32 immAllZerosV)),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ (v16i32 immAllZerosV)),
(VBROADCASTI32X8rmkz VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK16WM:$mask,
- (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK16WM:$mask,
+ (bc_v16i32 (v8i64 (X86SubVBroadcast (loadv4i64 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI32X8rmk VR512:$src0, VK16WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- (v8f64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ (v8f64 immAllZerosV)),
(VBROADCASTF64X2rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8f64 (v16f32 (X86SubVBroadcast (loadv4f32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTF64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- (v8i64 immAllZerosV)),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ (v8i64 immAllZerosV)),
(VBROADCASTI64X2rmkz VK8WM:$mask, addr:$src)>;
-def : Pat<(vselect VK8WM:$mask,
- (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
- VR512:$src0),
+def : Pat<(vselect_mask VK8WM:$mask,
+ (bc_v8i64 (v16i32 (X86SubVBroadcast (loadv4i32 addr:$src)))),
+ VR512:$src0),
(VBROADCASTI64X2rmk VR512:$src0, VK8WM:$mask, addr:$src)>;
}
@@ -1836,24 +1816,27 @@ defm VPERMI2PD : avx512_perm_i_sizes<0x77, "vpermi2pd", WriteFVarShuffle256,
multiclass avx512_perm_i_lowering<string InstrStr, X86VectorVTInfo _,
X86VectorVTInfo IdxVT,
X86VectorVTInfo CastVT> {
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86VPermt2 (_.VT _.RC:$src2),
- (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))), _.RC:$src3),
- (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 (_.VT _.RC:$src2),
+ (IdxVT.VT (bitconvert
+ (CastVT.VT _.RC:$src1))),
+ _.RC:$src3),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rrk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86VPermt2 _.RC:$src2,
- (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
- (_.LdFrag addr:$src3)),
- (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert
+ (CastVT.VT _.RC:$src1))),
+ (_.LdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rmk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
- (X86VPermt2 _.RC:$src2,
- (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
- (_.BroadcastLdFrag addr:$src3)),
- (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
+ (X86VPermt2 _.RC:$src2,
+ (IdxVT.VT (bitconvert (CastVT.VT _.RC:$src1))),
+ (_.BroadcastLdFrag addr:$src3)),
+ (_.VT (bitconvert (CastVT.VT _.RC:$src1))))),
(!cast<Instruction>(InstrStr#"rmbk") _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3)>;
}
@@ -2085,9 +2068,9 @@ multiclass avx512_cmp_scalar<X86VectorVTInfo _, SDNode OpNode, SDNode OpNodeSAE,
(ins _.RC:$src1, _.IntScalarMemOp:$src2, u8imm:$cc),
"vcmp"#_.Suffix,
"$cc, $src2, $src1", "$src1, $src2, $cc",
- (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
timm:$cc),
- (OpNode_su (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2,
+ (OpNode_su (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2),
timm:$cc)>, EVEX_4V, VEX_LIG, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
@@ -2646,13 +2629,13 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
let Predicates = [prd], ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclasss (_.VT _.RC:$src1),
(i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclasss_su (_.VT _.RC:$src1),
@@ -2660,18 +2643,18 @@ multiclass avx512_scalar_fpclass<bits<8> opc, string OpcodeStr,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.IntScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,
- (X86Vfpclasss _.ScalarIntMemCPat:$src1,
- (i32 timm:$src2)))]>,
+ (X86Vfpclasss (_.ScalarIntMemFrags addr:$src1),
+ (i32 timm:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.IntScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
- (X86Vfpclasss_su _.ScalarIntMemCPat:$src1,
+ (X86Vfpclasss_su (_.ScalarIntMemFrags addr:$src1),
(i32 timm:$src2))))]>,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -2686,13 +2669,13 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
def rr : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass (_.VT _.RC:$src1),
(i32 timm:$src2)))]>,
Sched<[sched]>;
def rrk : AVX512<opc, MRMSrcReg, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#
+ OpcodeStr#_.Suffix#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask,
(X86Vfpclass_su (_.VT _.RC:$src1),
@@ -2700,7 +2683,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
EVEX_K, Sched<[sched]>;
def rm : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"{"#mem#"}"#
+ OpcodeStr#_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.LdFrag addr:$src1)),
@@ -2708,7 +2691,7 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix#"{"#mem#"}"#
+ OpcodeStr#_.Suffix#"{"#mem#"}"#
"\t{$src2, $src1, $dst {${mask}}|$dst {${mask}}, $src1, $src2}",
[(set _.KRC:$dst, (and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.LdFrag addr:$src1)),
@@ -2716,18 +2699,18 @@ multiclass avx512_vector_fpclass<bits<8> opc, string OpcodeStr,
EVEX_K, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmb : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
- _.BroadcastStr##", $dst|$dst, ${src1}"
- ##_.BroadcastStr##", $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
+ _.BroadcastStr#", $dst|$dst, ${src1}"
+ #_.BroadcastStr#", $src2}",
[(set _.KRC:$dst,(X86Vfpclass
(_.VT (_.BroadcastLdFrag addr:$src1)),
(i32 timm:$src2)))]>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
def rmbk : AVX512<opc, MRMSrcMem, (outs _.KRC:$dst),
(ins _.KRCWM:$mask, _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix##"\t{$src2, ${src1}"##
- _.BroadcastStr##", $dst {${mask}}|$dst {${mask}}, ${src1}"##
- _.BroadcastStr##", $src2}",
+ OpcodeStr#_.Suffix#"\t{$src2, ${src1}"#
+ _.BroadcastStr#", $dst {${mask}}|$dst {${mask}}, ${src1}"#
+ _.BroadcastStr#", $src2}",
[(set _.KRC:$dst,(and _.KRCWM:$mask, (X86Vfpclass_su
(_.VT (_.BroadcastLdFrag addr:$src1)),
(i32 timm:$src2))))]>,
@@ -2979,6 +2962,8 @@ def : Pat<(vnot VK4:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK4:$src, VK16)), VK4)>;
def : Pat<(vnot VK2:$src),
(COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK2:$src, VK16)), VK2)>;
+def : Pat<(vnot VK1:$src),
+          (COPY_TO_REGCLASS (KNOTWrr (COPY_TO_REGCLASS VK1:$src, VK16)), VK1)>;
// Mask binary operation
// - KAND, KANDN, KOR, KXNOR, KXOR
@@ -3008,8 +2993,6 @@ multiclass avx512_mask_binop_all<bits<8> opc, string OpcodeStr,
sched, HasBWI, IsCommutable>, VEX_4V, VEX_L, VEX_W, PS;
}
-def andn : PatFrag<(ops node:$i0, node:$i1), (and (not node:$i0), node:$i1)>;
-def xnor : PatFrag<(ops node:$i0, node:$i1), (not (xor node:$i0, node:$i1))>;
// These nodes use 'vnot' instead of 'not' to support vectors.
def vandn : PatFrag<(ops node:$i0, node:$i1), (and (vnot node:$i0), node:$i1)>;
def vxnor : PatFrag<(ops node:$i0, node:$i1), (vnot (xor node:$i0, node:$i1))>;
@@ -3022,7 +3005,7 @@ defm KXOR : avx512_mask_binop_all<0x47, "kxor", xor, SchedWriteVecLogic.XM
defm KANDN : avx512_mask_binop_all<0x42, "kandn", vandn, SchedWriteVecLogic.XMM, 0>;
defm KADD : avx512_mask_binop_all<0x4A, "kadd", X86kadd, SchedWriteVecLogic.XMM, 1, HasDQI>;
-multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
+multiclass avx512_binop_pat<SDPatternOperator VOpNode,
Instruction Inst> {
// With AVX512F, 8-bit mask is promoted to 16-bit mask,
// for the DQI set, this type is legal and KxxxB instruction is used
@@ -3033,25 +3016,25 @@ multiclass avx512_binop_pat<SDPatternOperator VOpNode, SDPatternOperator OpNode,
(COPY_TO_REGCLASS VK8:$src2, VK16)), VK8)>;
// All types smaller than 8 bits require conversion anyway
- def : Pat<(OpNode VK1:$src1, VK1:$src2),
+ def : Pat<(VOpNode VK1:$src1, VK1:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK1:$src1, VK16),
(COPY_TO_REGCLASS VK1:$src2, VK16)), VK1)>;
def : Pat<(VOpNode VK2:$src1, VK2:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK2:$src1, VK16),
- (COPY_TO_REGCLASS VK2:$src2, VK16)), VK1)>;
+ (COPY_TO_REGCLASS VK2:$src2, VK16)), VK2)>;
def : Pat<(VOpNode VK4:$src1, VK4:$src2),
(COPY_TO_REGCLASS (Inst
(COPY_TO_REGCLASS VK4:$src1, VK16),
- (COPY_TO_REGCLASS VK4:$src2, VK16)), VK1)>;
+ (COPY_TO_REGCLASS VK4:$src2, VK16)), VK4)>;
}
-defm : avx512_binop_pat<and, and, KANDWrr>;
-defm : avx512_binop_pat<vandn, andn, KANDNWrr>;
-defm : avx512_binop_pat<or, or, KORWrr>;
-defm : avx512_binop_pat<vxnor, xnor, KXNORWrr>;
-defm : avx512_binop_pat<xor, xor, KXORWrr>;
+defm : avx512_binop_pat<and, KANDWrr>;
+defm : avx512_binop_pat<vandn, KANDNWrr>;
+defm : avx512_binop_pat<or, KORWrr>;
+defm : avx512_binop_pat<vxnor, KXNORWrr>;
+defm : avx512_binop_pat<xor, KXORWrr>;
// Mask unpacking
multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
@@ -3065,7 +3048,7 @@ multiclass avx512_mask_unpck<string Suffix, X86KVectorVTInfo Dst,
VEX_4V, VEX_L, Sched<[sched]>;
def : Pat<(Dst.KVT (concat_vectors Src.KRC:$src1, Src.KRC:$src2)),
- (!cast<Instruction>(NAME##rr) Src.KRC:$src2, Src.KRC:$src1)>;
+ (!cast<Instruction>(NAME#rr) Src.KRC:$src2, Src.KRC:$src1)>;
}
}
@@ -3201,8 +3184,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
multiclass axv512_cmp_packed_cc_no_vlx_lowering<string InstStr,
X86VectorVTInfo Narrow,
X86VectorVTInfo Wide> {
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT Narrow.RC:$src2), timm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT Narrow.RC:$src2), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrri")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3219,8 +3202,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
timm:$cc), Narrow.KRC)>;
// Broadcast load.
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT Narrow.RC:$src1),
- (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT Narrow.RC:$src1),
+ (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3235,8 +3218,8 @@ def : Pat<(Narrow.KVT (and Narrow.KRC:$mask,
addr:$src2, timm:$cc), Narrow.KRC)>;
// Commuted with broadcast load.
-def : Pat<(Narrow.KVT (X86any_cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
- (Narrow.VT Narrow.RC:$src1), timm:$cc)),
+def : Pat<(Narrow.KVT (X86cmpm (Narrow.VT (Narrow.BroadcastLdFrag addr:$src2)),
+ (Narrow.VT Narrow.RC:$src1), timm:$cc)),
(COPY_TO_REGCLASS
(!cast<Instruction>(InstStr#"Zrmbi")
(Wide.VT (INSERT_SUBREG (IMPLICIT_DEF), Narrow.RC:$src1, Narrow.SubRegIdx)),
@@ -3301,7 +3284,7 @@ multiclass avx512_mask_setop<RegisterClass KRC, ValueType VT, PatFrag Val> {
let Predicates = [HasAVX512] in
let isReMaterializable = 1, isAsCheapAsAMove = 1, isPseudo = 1,
SchedRW = [WriteZero] in
- def #NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
+ def NAME# : I<0, Pseudo, (outs KRC:$dst), (ins), "",
[(set KRC:$dst, (VT Val))]>;
}
@@ -3409,7 +3392,7 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
!strconcat(OpcodeStr, "\t{$src1, ${dst} {${mask}}|",
"${dst} {${mask}}, $src1}"),
[(set _.RC:$dst, (_.VT
- (vselect _.KRCWM:$mask,
+ (vselect_mask _.KRCWM:$mask,
(_.VT (ld_frag addr:$src1)),
(_.VT _.RC:$src0))))], _.ExeDomain>,
EVEX, EVEX_K, Sched<[Sched.RM]>;
@@ -3418,18 +3401,18 @@ multiclass avx512_load<bits<8> opc, string OpcodeStr, string Name,
(ins _.KRCWM:$mask, _.MemOp:$src),
OpcodeStr #"\t{$src, ${dst} {${mask}} {z}|"#
"${dst} {${mask}} {z}, $src}",
- [(set _.RC:$dst, (_.VT (vselect _.KRCWM:$mask,
+ [(set _.RC:$dst, (_.VT (vselect_mask _.KRCWM:$mask,
(_.VT (ld_frag addr:$src)), _.ImmAllZerosV)))],
_.ExeDomain>, EVEX, EVEX_KZ, Sched<[Sched.RM]>;
}
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, _.ImmAllZerosV)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz) _.KRCWM:$mask, addr:$ptr)>;
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz) _.KRCWM:$mask, addr:$ptr)>;
def : Pat<(_.VT (mload addr:$ptr, _.KRCWM:$mask, (_.VT _.RC:$src0))),
- (!cast<Instruction>(Name#_.ZSuffix##rmk) _.RC:$src0,
+ (!cast<Instruction>(Name#_.ZSuffix#rmk) _.RC:$src0,
_.KRCWM:$mask, addr:$ptr)>;
}
@@ -4286,6 +4269,17 @@ def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), (f64 FR64X:$src0)))
def : Pat<(f64 (X86selects VK1WM:$mask, (loadf64 addr:$src), fp64imm0)),
(COPY_TO_REGCLASS (v2f64 (VMOVSDZrmkz VK1WM:$mask, addr:$src)), FR64X)>;
+
+def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 VR128X:$src2))),
+ (VMOVSSZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 VR128X:$src2))),
+ (VMOVSDZrrk VR128X:$src2, VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+
+def : Pat<(v4f32 (X86selects VK1WM:$mask, (v4f32 VR128X:$src1), (v4f32 immAllZerosV))),
+ (VMOVSSZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+def : Pat<(v2f64 (X86selects VK1WM:$mask, (v2f64 VR128X:$src1), (v2f64 immAllZerosV))),
+ (VMOVSDZrrkz VK1WM:$mask, VR128X:$src1, VR128X:$src1)>;
+
let hasSideEffects = 0, isCodeGenOnly = 1, ForceDisassemble = 1 in {
def VMOVSSZrr_REV: AVX512<0x11, MRMDestReg, (outs VR128X:$dst),
(ins VR128X:$src1, VR128X:$src2),
@@ -4439,8 +4433,6 @@ let Predicates = [HasAVX512] in {
(VMOV64toPQIZrr GR64:$src)>;
// AVX 128-bit movd/movq instruction write zeros in the high 128-bit part.
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIZrm addr:$src)>;
def : Pat<(v8i32 (X86vzload32 addr:$src)),
@@ -4624,8 +4616,8 @@ multiclass avx512_binop_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512_binop_rm<opc, OpcodeStr, OpNode, _, sched, IsCommutable> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1,
(_.BroadcastLdFrag addr:$src2)))>,
AVX512BIBase, EVEX_4V, EVEX_B,
@@ -4750,8 +4742,8 @@ multiclass avx512_binop_rm2<bits<8> opc, string OpcodeStr,
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Brdct.ScalarMemOp:$src2),
OpcodeStr,
- "${src2}"##_Brdct.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Brdct.BroadcastStr,
+ "${src2}"#_Brdct.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_Brdct.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Brdct.VT (_Brdct.BroadcastLdFrag addr:$src2)))))>,
AVX512BIBase, EVEX_4V, EVEX_B,
@@ -4822,8 +4814,8 @@ multiclass avx512_packs_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
defm rmb : AVX512_maskable<opc, MRMSrcMem, _Dst, (outs _Dst.RC:$dst),
(ins _Src.RC:$src1, _Src.ScalarMemOp:$src2),
OpcodeStr,
- "${src2}"##_Src.BroadcastStr##", $src1",
- "$src1, ${src2}"##_Src.BroadcastStr,
+ "${src2}"#_Src.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_Src.BroadcastStr,
(_Dst.VT (OpNode (_Src.VT _Src.RC:$src1), (bitconvert
(_Src.VT (_Src.BroadcastLdFrag addr:$src2)))))>,
EVEX_4V, EVEX_B, EVEX_CD8<_Src.EltSize, CD8VF>,
@@ -5159,26 +5151,26 @@ multiclass avx512_logical_lowering<string InstrStr, SDNode OpNode,
X86VectorVTInfo _,
X86VectorVTInfo IntInfo> {
// Masked register-register logical operations.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rrk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, _.RC:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1, _.RC:$src2))),
_.ImmAllZerosV)),
(!cast<Instruction>(InstrStr#rrkz) _.KRCWM:$mask, _.RC:$src1,
_.RC:$src2)>;
// Masked register-memory logical operations.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1,
(load addr:$src2)))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert (IntInfo.VT (OpNode _.RC:$src1,
(load addr:$src2)))),
_.ImmAllZerosV)),
@@ -5190,14 +5182,14 @@ multiclass avx512_logical_lowering_bcast<string InstrStr, SDNode OpNode,
X86VectorVTInfo _,
X86VectorVTInfo IntInfo> {
// Register-broadcast logical operations.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
(IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
_.RC:$src0)),
(!cast<Instruction>(InstrStr#rmbk) _.RC:$src0, _.KRCWM:$mask,
_.RC:$src1, addr:$src2)>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(bitconvert
(IntInfo.VT (OpNode _.RC:$src1,
(IntInfo.VT (IntInfo.BroadcastLdFrag addr:$src2))))),
@@ -5304,7 +5296,7 @@ multiclass avx512_fp_scalar<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2))>,
+ (_.ScalarIntMemFrags addr:$src2)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
let isCodeGenOnly = 1, Predicates = [HasAVX512] in {
def rr : I< opc, MRMSrcReg, (outs _.FRC:$dst),
@@ -5350,7 +5342,7 @@ multiclass avx512_fp_scalar_sae<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (VecNode _.RC:$src1,
- _.ScalarIntMemCPat:$src2))>,
+ (_.ScalarIntMemFrags addr:$src2)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let isCodeGenOnly = 1, Predicates = [HasAVX512],
@@ -5463,28 +5455,32 @@ defm VMAXCSDZ : avx512_comutable_binop_s<0x5F, "vmaxsd", f64x_info, X86fmaxc,
EVEX_CD8<64, CD8VT1>, SIMD_EXC;
multiclass avx512_fp_packed<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
X86VectorVTInfo _, X86FoldableSchedWrite sched,
bit IsCommutable,
bit IsKCommutable = IsCommutable> {
let ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ defm rr: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
+ (_.VT (OpNode _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src2)), IsCommutable,
IsKCommutable, IsKCommutable>,
EVEX_4V, Sched<[sched]>;
let mayLoad = 1 in {
- defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ defm rm: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
+ (OpNode _.RC:$src1, (_.LdFrag addr:$src2)),
+ (MaskOpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
- (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
+ defm rmb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
+ (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))),
+ (MaskOpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5496,7 +5492,7 @@ multiclass avx512_fp_round_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2, AVX512RC:$rc), OpcodeStr#_.Suffix,
"$rc, $src2, $src1", "$src1, $src2, $rc",
(_.VT (OpNodeRnd _.RC:$src1, _.RC:$src2, (i32 timm:$rc)))>,
EVEX_4V, EVEX_B, EVEX_RC, Sched<[sched]>;
@@ -5507,38 +5503,39 @@ multiclass avx512_fp_sae_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrb: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"{sae}, $src2, $src1", "$src1, $src2, {sae}",
(_.VT (OpNodeSAE _.RC:$src1, _.RC:$src2))>,
EVEX_4V, EVEX_B, Sched<[sched]>;
}
multiclass avx512_fp_binop_p<bits<8> opc, string OpcodeStr, SDPatternOperator OpNode,
+ SDPatternOperator MaskOpNode,
Predicate prd, X86SchedWriteSizes sched,
bit IsCommutable = 0,
bit IsPD128Commutable = IsCommutable> {
let Predicates = [prd] in {
- defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v16f32_info,
+ defm PSZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v16f32_info,
sched.PS.ZMM, IsCommutable>, EVEX_V512, PS,
EVEX_CD8<32, CD8VF>;
- defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f64_info,
+ defm PDZ : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f64_info,
sched.PD.ZMM, IsCommutable>, EVEX_V512, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
// Define only if AVX512VL feature is present.
let Predicates = [prd, HasVLX] in {
- defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f32x_info,
+ defm PSZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f32x_info,
sched.PS.XMM, IsCommutable>, EVEX_V128, PS,
EVEX_CD8<32, CD8VF>;
- defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v8f32x_info,
+ defm PSZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v8f32x_info,
sched.PS.YMM, IsCommutable>, EVEX_V256, PS,
EVEX_CD8<32, CD8VF>;
- defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, v2f64x_info,
+ defm PDZ128 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v2f64x_info,
sched.PD.XMM, IsPD128Commutable,
IsCommutable>, EVEX_V128, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
- defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, v4f64x_info,
+ defm PDZ256 : avx512_fp_packed<opc, OpcodeStr, OpNode, MaskOpNode, v4f64x_info,
sched.PD.YMM, IsCommutable>, EVEX_V256, PD, VEX_W,
EVEX_CD8<64, CD8VF>;
}
@@ -5566,38 +5563,38 @@ multiclass avx512_fp_binop_p_sae<bits<8> opc, string OpcodeStr, SDNode OpNodeRnd
EVEX_V512, PD, VEX_W,EVEX_CD8<64, CD8VF>;
}
-defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, HasAVX512,
+defm VADD : avx512_fp_binop_p<0x58, "vadd", any_fadd, fadd, HasAVX512,
SchedWriteFAddSizes, 1>,
avx512_fp_binop_p_round<0x58, "vadd", X86faddRnd, SchedWriteFAddSizes>;
-defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, HasAVX512,
+defm VMUL : avx512_fp_binop_p<0x59, "vmul", any_fmul, fmul, HasAVX512,
SchedWriteFMulSizes, 1>,
avx512_fp_binop_p_round<0x59, "vmul", X86fmulRnd, SchedWriteFMulSizes>;
-defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, HasAVX512,
+defm VSUB : avx512_fp_binop_p<0x5C, "vsub", any_fsub, fsub, HasAVX512,
SchedWriteFAddSizes>,
avx512_fp_binop_p_round<0x5C, "vsub", X86fsubRnd, SchedWriteFAddSizes>;
-defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, HasAVX512,
+defm VDIV : avx512_fp_binop_p<0x5E, "vdiv", any_fdiv, fdiv, HasAVX512,
SchedWriteFDivSizes>,
avx512_fp_binop_p_round<0x5E, "vdiv", X86fdivRnd, SchedWriteFDivSizes>;
-defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, HasAVX512,
+defm VMIN : avx512_fp_binop_p<0x5D, "vmin", X86fmin, X86fmin, HasAVX512,
SchedWriteFCmpSizes, 0>,
avx512_fp_binop_p_sae<0x5D, "vmin", X86fminSAE, SchedWriteFCmpSizes>;
-defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, HasAVX512,
+defm VMAX : avx512_fp_binop_p<0x5F, "vmax", X86fmax, X86fmax, HasAVX512,
SchedWriteFCmpSizes, 0>,
avx512_fp_binop_p_sae<0x5F, "vmax", X86fmaxSAE, SchedWriteFCmpSizes>;
let isCodeGenOnly = 1 in {
- defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, HasAVX512,
+ defm VMINC : avx512_fp_binop_p<0x5D, "vmin", X86fminc, X86fminc, HasAVX512,
SchedWriteFCmpSizes, 1>;
- defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, HasAVX512,
+ defm VMAXC : avx512_fp_binop_p<0x5F, "vmax", X86fmaxc, X86fmaxc, HasAVX512,
SchedWriteFCmpSizes, 1>;
}
let Uses = []<Register>, mayRaiseFPException = 0 in {
-defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, HasDQI,
+defm VAND : avx512_fp_binop_p<0x54, "vand", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
-defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, HasDQI,
+defm VANDN : avx512_fp_binop_p<0x55, "vandn", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 0>;
-defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, HasDQI,
+defm VOR : avx512_fp_binop_p<0x56, "vor", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
-defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, HasDQI,
+defm VXOR : avx512_fp_binop_p<0x57, "vxor", null_frag, null_frag, HasDQI,
SchedWriteFLogicSizes, 1>;
}
@@ -5605,19 +5602,19 @@ multiclass avx512_fp_scalef_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
EVEX_4V, Sched<[sched]>;
defm rm: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.MemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(OpNode _.RC:$src1, (_.LdFrag addr:$src2))>,
EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr##_.Suffix,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ (ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr#_.Suffix,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2)))>,
EVEX_4V, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -5627,14 +5624,14 @@ multiclass avx512_fp_scalef_scalar<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched, X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rr: AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.RC:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.RC:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode _.RC:$src1, _.RC:$src2))>,
Sched<[sched]>;
defm rm: AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr##_.Suffix,
+ (ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr#_.Suffix,
"$src2, $src1", "$src1, $src2",
- (OpNode _.RC:$src1, _.ScalarIntMemCPat:$src2)>,
+ (OpNode _.RC:$src1, (_.ScalarIntMemFrags addr:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -5648,11 +5645,11 @@ multiclass avx512_fp_scalef_all<bits<8> opc, bits<8> opcScaler, string OpcodeStr
avx512_fp_round_packed<opc, OpcodeStr, X86scalefRnd, sched.ZMM, v8f64_info>,
EVEX_V512, VEX_W, EVEX_CD8<64, CD8VF>;
defm SSZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f32x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"ss", f32x_info,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"ss", f32x_info,
X86scalefsRnd, sched.Scl>,
EVEX_4V, VEX_LIG, EVEX_CD8<32, CD8VT1>;
defm SDZ : avx512_fp_scalef_scalar<opcScaler, OpcodeStr, X86scalefs, sched.Scl, f64x_info>,
- avx512_fp_scalar_round<opcScaler, OpcodeStr##"sd", f64x_info,
+ avx512_fp_scalar_round<opcScaler, OpcodeStr#"sd", f64x_info,
X86scalefsRnd, sched.Scl>,
EVEX_4V, VEX_LIG, EVEX_CD8<64, CD8VT1>, VEX_W;
@@ -5679,7 +5676,7 @@ multiclass avx512_vptest<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _,
string Name> {
// NOTE: Patterns are omitted in favor of manual selection in X86ISelDAGToDAG.
- // There are just too many permuations due to commutability and bitcasts.
+ // There are just too many permutations due to commutability and bitcasts.
let ExeDomain = _.ExeDomain, hasSideEffects = 0 in {
defm rr : AVX512_maskable_cmp<opc, MRMSrcReg, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.RC:$src2), OpcodeStr,
@@ -5701,8 +5698,8 @@ multiclass avx512_vptest_mb<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
defm rmb : AVX512_maskable_cmp<opc, MRMSrcMem, _, (outs _.KRC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(null_frag), (null_frag)>,
EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -5790,7 +5787,7 @@ multiclass avx512_shift_rmbi<bits<8> opc, Format ImmFormM,
let ExeDomain = _.ExeDomain in
defm mbi : AVX512_maskable<opc, ImmFormM, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, u8imm:$src2), OpcodeStr,
- "$src2, ${src1}"##_.BroadcastStr, "${src1}"##_.BroadcastStr##", $src2",
+ "$src2, ${src1}"#_.BroadcastStr, "${src1}"#_.BroadcastStr#", $src2",
(_.VT (OpNode (_.BroadcastLdFrag addr:$src1), (i8 timm:$src2)))>,
EVEX_B, Sched<[sched.Folded]>;
}
@@ -5973,8 +5970,8 @@ multiclass avx512_var_shift_mb<bits<8> opc, string OpcodeStr, SDNode OpNode,
let ExeDomain = _.ExeDomain in
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src1, (_.VT (_.BroadcastLdFrag addr:$src2))))>,
AVX5128IBase, EVEX_B, EVEX_4V, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -6245,8 +6242,8 @@ multiclass avx512_permil_vec<bits<8> OpcVar, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmb: AVX512_maskable<OpcVar, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2), OpcodeStr,
- "${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr,
+ "${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr,
(_.VT (OpNode
_.RC:$src1,
(Ctrl.VT (Ctrl.BroadcastLdFrag addr:$src2))))>,
@@ -6370,9 +6367,6 @@ defm VMOVLPDZ128 : avx512_mov_hilo_packed<0x12, "vmovlpd", X86Movsd,
let Predicates = [HasAVX512] in {
// VMOVHPD patterns
- def : Pat<(v2f64 (X86Unpckl VR128X:$src1,
- (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
- (VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128X:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDZ128rm VR128X:$src1, addr:$src2)>;
@@ -6419,29 +6413,33 @@ let Predicates = [HasAVX512] in {
//
multiclass avx512_fma3p_213_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, _.RC:$src3)), 1, 1>,
AVX512FMA3Base, Sched<[sched]>;
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
+ (_.VT (OpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src1, (_.LdFrag addr:$src3))), 1, 0>,
AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
OpcodeStr, !strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr ),
(OpNode _.RC:$src2,
+ _.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))),
+ (MaskOpNode _.RC:$src2,
_.RC:$src1,(_.VT (_.BroadcastLdFrag addr:$src3))), 1, 0>,
- AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
+ AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6450,74 +6448,88 @@ multiclass avx512_fma3_213_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
- defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
+ (_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))),
(_.VT ( OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i32 timm:$rc))), 1, 1>,
AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_213_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, Suff>,
+ defm Z : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
avx512_fma3_213_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, Suff>,
+ defm Z256 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, Suff>,
+ defm Z128 : avx512_fma3p_213_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_213_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd> {
- defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f32_info, "PS">;
- defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f64_info, "PD">,
- VEX_W;
-}
-
-defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86Fmsub, X86FmsubRnd>;
-defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub, X86FmaddsubRnd>;
-defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86Fnmsub, X86FnmsubRnd>;
+ SDNode MaskOpNode, SDNode OpNodeRnd> {
+ defm PS : avx512_fma3p_213_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_213_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD213 : avx512_fma3p_213_f<0xA8, "vfmadd213", X86any_Fmadd,
+ X86Fmadd, X86FmaddRnd>;
+defm VFMSUB213 : avx512_fma3p_213_f<0xAA, "vfmsub213", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB213 : avx512_fma3p_213_f<0xA6, "vfmaddsub213", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD213 : avx512_fma3p_213_f<0xA7, "vfmsubadd213", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD213 : avx512_fma3p_213_f<0xAC, "vfnmadd213", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB213 : avx512_fma3p_213_f<0xAE, "vfnmsub213", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_231_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1,
- vselect, 1>, AVX512FMA3Base, Sched<[sched]>;
+ (null_frag),
+ (_.VT (MaskOpNode _.RC:$src2, _.RC:$src3, _.RC:$src1)), 1, 1>,
+ AVX512FMA3Base, Sched<[sched]>;
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
+ (_.VT (OpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)),
+ (_.VT (MaskOpNode _.RC:$src2, (_.LdFrag addr:$src3), _.RC:$src1)), 1, 0>,
AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
- OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr,
+ OpcodeStr, "${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr,
(_.VT (OpNode _.RC:$src2,
(_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
+ _.RC:$src1)),
+ (_.VT (MaskOpNode _.RC:$src2,
+ (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1)), 1, 0>, AVX512FMA3Base, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6527,77 +6539,89 @@ multiclass avx512_fma3_231_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
- defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
- 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ (null_frag),
+ (_.VT (OpNode _.RC:$src2, _.RC:$src3, _.RC:$src1, (i32 timm:$rc))),
+ 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_231_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, Suff>,
+ defm Z : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
avx512_fma3_231_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, Suff>,
+ defm Z256 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, Suff>,
+ defm Z128 : avx512_fma3p_231_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_231_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
- defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f32_info, "PS">;
- defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f64_info, "PD">,
- VEX_W;
-}
-
-defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86Fmsub, X86FmsubRnd>;
-defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub, X86FmaddsubRnd>;
-defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86Fnmsub, X86FnmsubRnd>;
+ SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_231_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_231_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD231 : avx512_fma3p_231_f<0xB8, "vfmadd231", X86any_Fmadd,
+ X86Fmadd, X86FmaddRnd>;
+defm VFMSUB231 : avx512_fma3p_231_f<0xBA, "vfmsub231", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB231 : avx512_fma3p_231_f<0xB6, "vfmaddsub231", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD231 : avx512_fma3p_231_f<0xB7, "vfmsubadd231", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD231 : avx512_fma3p_231_f<0xBC, "vfnmadd231", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB231 : avx512_fma3p_231_f<0xBE, "vfnmsub231", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
multiclass avx512_fma3p_132_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched,
+ SDNode MaskOpNode, X86FoldableSchedWrite sched,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1, vselect, 1>,
+ (null_frag),
+ (_.VT (MaskOpNode _.RC:$src1, _.RC:$src3, _.RC:$src2)), 1, 1>,
AVX512FMA3Base, Sched<[sched]>;
// Pattern is 312 order so that the load is in a different place from the
    // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
- defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
+ (_.VT (OpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode (_.LdFrag addr:$src3), _.RC:$src1, _.RC:$src2)), 1, 0>,
AVX512FMA3Base, Sched<[sched.Folded, sched.ReadAfterFold]>;
// Pattern is 312 order so that the load is in a different place from the
    // 213 and 231 patterns; this helps tablegen's duplicate pattern detection.
- defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm mb: AVX512_maskable_fma<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3),
- OpcodeStr, "${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr,
+ OpcodeStr, "${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr,
(_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
- _.RC:$src1, _.RC:$src2)), 1, 0>,
+ _.RC:$src1, _.RC:$src2)),
+ (_.VT (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src3)),
+ _.RC:$src1, _.RC:$src2)), 1, 0>,
AVX512FMA3Base, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -6607,49 +6631,57 @@ multiclass avx512_fma3_132_round<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86VectorVTInfo _, string Suff> {
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, hasSideEffects = 0,
Uses = [MXCSR] in
- defm rb: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rb: AVX512_maskable_fma<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, AVX512RC:$rc),
OpcodeStr, "$rc, $src3, $src2", "$src2, $src3, $rc",
- (_.VT ( OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
- 1, 1, vselect, 1>,
- AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
+ (null_frag),
+ (_.VT (OpNode _.RC:$src1, _.RC:$src3, _.RC:$src2, (i32 timm:$rc))),
+ 1, 1>, AVX512FMA3Base, EVEX_B, EVEX_RC, Sched<[sched]>;
}
multiclass avx512_fma3p_132_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched,
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched,
AVX512VLVectorVTInfo _, string Suff> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512, Suff>,
+ defm Z : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512, Suff>,
avx512_fma3_132_round<opc, OpcodeStr, OpNodeRnd, sched.ZMM,
_.info512, Suff>,
EVEX_V512, EVEX_CD8<_.info512.EltSize, CD8VF>;
}
let Predicates = [HasVLX, HasAVX512] in {
- defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256, Suff>,
+ defm Z256 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256, Suff>,
EVEX_V256, EVEX_CD8<_.info256.EltSize, CD8VF>;
- defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128, Suff>,
+ defm Z128 : avx512_fma3p_132_rm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128, Suff>,
EVEX_V128, EVEX_CD8<_.info128.EltSize, CD8VF>;
}
}
multiclass avx512_fma3p_132_f<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd > {
- defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f32_info, "PS">;
- defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, OpNodeRnd,
- SchedWriteFMA, avx512vl_f64_info, "PD">,
- VEX_W;
-}
-
-defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86Fmsub, X86FmsubRnd>;
-defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub, X86FmaddsubRnd>;
-defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd, X86FmsubaddRnd>;
-defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86Fnmsub, X86FnmsubRnd>;
+ SDNode MaskOpNode, SDNode OpNodeRnd > {
+ defm PS : avx512_fma3p_132_common<opc, OpcodeStr#"ps", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f32_info, "PS">;
+ defm PD : avx512_fma3p_132_common<opc, OpcodeStr#"pd", OpNode, MaskOpNode,
+ OpNodeRnd, SchedWriteFMA,
+ avx512vl_f64_info, "PD">, VEX_W;
+}
+
+defm VFMADD132 : avx512_fma3p_132_f<0x98, "vfmadd132", X86any_Fmadd,
+ X86Fmadd, X86FmaddRnd>;
+defm VFMSUB132 : avx512_fma3p_132_f<0x9A, "vfmsub132", X86any_Fmsub,
+ X86Fmsub, X86FmsubRnd>;
+defm VFMADDSUB132 : avx512_fma3p_132_f<0x96, "vfmaddsub132", X86Fmaddsub,
+ X86Fmaddsub, X86FmaddsubRnd>;
+defm VFMSUBADD132 : avx512_fma3p_132_f<0x97, "vfmsubadd132", X86Fmsubadd,
+ X86Fmsubadd, X86FmsubaddRnd>;
+defm VFNMADD132 : avx512_fma3p_132_f<0x9C, "vfnmadd132", X86any_Fnmadd,
+ X86Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB132 : avx512_fma3p_132_f<0x9E, "vfnmsub132", X86any_Fnmsub,
+ X86Fnmsub, X86FnmsubRnd>;
// Scalar FMA
multiclass avx512_fma3s_common<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
@@ -6742,11 +6774,12 @@ multiclass avx512_fma3s<bits<8> opc213, bits<8> opc231, bits<8> opc132,
}
defm VFMADD : avx512_fma3s<0xA9, 0xB9, 0x99, "vfmadd", X86any_Fmadd, X86FmaddRnd>;
-defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86Fmsub, X86FmsubRnd>;
-defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86Fnmadd, X86FnmaddRnd>;
-defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86Fnmsub, X86FnmsubRnd>;
+defm VFMSUB : avx512_fma3s<0xAB, 0xBB, 0x9B, "vfmsub", X86any_Fmsub, X86FmsubRnd>;
+defm VFNMADD : avx512_fma3s<0xAD, 0xBD, 0x9D, "vfnmadd", X86any_Fnmadd, X86FnmaddRnd>;
+defm VFNMSUB : avx512_fma3s<0xAF, 0xBF, 0x9F, "vfnmsub", X86any_Fnmsub, X86FnmsubRnd>;
-multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
+multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode MaskedOp,
+ SDNode RndOp, string Prefix,
string Suffix, SDNode Move,
X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
@@ -6788,8 +6821,8 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src3),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
@@ -6799,8 +6832,8 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
(_.ScalarLdFrag addr:$src3)),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
@@ -6809,18 +6842,18 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3), _.FRC:$src2),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"132"#Suffix#"Zm_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, _.FRC:$src3,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zr_Intk")
VR128X:$src1, VK1WM:$mask,
@@ -6828,19 +6861,19 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))))))),
(!cast<I>(Prefix#"231"#Suffix#"Zm_Intk")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src3),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src3),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zr_Intkz")
VR128X:$src1, VK1WM:$mask,
@@ -6848,9 +6881,9 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, _.FRC:$src3,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, _.FRC:$src3,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zr_Intkz")
VR128X:$src1, VK1WM:$mask,
@@ -6858,28 +6891,28 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)))>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2,
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src3)),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2,
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src3)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"213"#Suffix#"Zm_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2, (_.ScalarLdFrag addr:$src3)),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"132"#Suffix#"Zm_Intkz")
VR128X:$src1, VK1WM:$mask,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)), addr:$src3)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
- (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp _.FRC:$src2, (_.ScalarLdFrag addr:$src3),
+ (_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0)))),
(_.EltVT ZeroFP)))))),
(!cast<I>(Prefix#"231"#Suffix#"Zm_Intkz")
VR128X:$src1, VK1WM:$mask,
@@ -6903,7 +6936,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src3, (i32 timm:$rc)),
@@ -6914,7 +6947,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
(i32 timm:$rc)),
@@ -6925,7 +6958,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
_.FRC:$src3, (i32 timm:$rc)),
@@ -6936,7 +6969,7 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
(_.VT (COPY_TO_REGCLASS _.FRC:$src3, VR128X)), AVX512RC:$rc)>;
def : Pat<(_.VT (Move (_.VT VR128X:$src1), (_.VT (scalar_to_vector
- (X86selects VK1WM:$mask,
+ (X86selects_mask VK1WM:$mask,
(RndOp _.FRC:$src2, _.FRC:$src3,
(_.EltVT (extractelt (_.VT VR128X:$src1), (iPTR 0))),
(i32 timm:$rc)),
@@ -6948,23 +6981,23 @@ multiclass avx512_scalar_fma_patterns<SDNode Op, SDNode RndOp, string Prefix,
}
}
-defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SS",
- X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
+ "SS", X86Movss, v4f32x_info, fp32imm0>;
-defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86FmaddRnd, "VFMADD", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fmsub, X86FmsubRnd, "VFMSUB", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmadd, X86FnmaddRnd, "VFNMADD", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
-defm : avx512_scalar_fma_patterns<X86Fnmsub, X86FnmsubRnd, "VFNMSUB", "SD",
- X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmadd, X86Fmadd, X86FmaddRnd, "VFMADD",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fmsub, X86Fmsub, X86FmsubRnd, "VFMSUB",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmadd, X86Fnmadd, X86FnmaddRnd, "VFNMADD",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : avx512_scalar_fma_patterns<X86any_Fnmsub, X86Fnmsub, X86FnmsubRnd, "VFNMSUB",
+ "SD", X86Movsd, v2f64x_info, fp64imm0>;
//===----------------------------------------------------------------------===//
// AVX-512 Packed Multiply of Unsigned 52-bit Integers and Add the Low 52-bit IFMA
@@ -7194,7 +7227,7 @@ multiclass avx512_cvt_s_int_round<bits<8> opc, X86VectorVTInfo SrcVT,
def rm_Int : SI<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set DstVT.RC:$dst, (OpNode
- (SrcVT.VT SrcVT.ScalarIntMemCPat:$src)))]>,
+ (SrcVT.ScalarIntMemFrags addr:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
} // Predicates = [HasAVX512]
@@ -7233,6 +7266,45 @@ defm VCVTSD2USI64Z: avx512_cvt_s_int_round<0x79, f64x_info, i64x_info, X86cvts2u
X86cvts2usiRnd, WriteCvtSD2I, "cvtsd2usi", "{q}">,
XD, VEX_W, EVEX_CD8<64, CD8VT1>;
+multiclass avx512_cvt_s<bits<8> opc, string asm, X86VectorVTInfo SrcVT,
+ X86VectorVTInfo DstVT, SDNode OpNode,
+ X86FoldableSchedWrite sched,
+ string aliasStr> {
+ let Predicates = [HasAVX512], ExeDomain = SrcVT.ExeDomain in {
+ let isCodeGenOnly = 1 in {
+ def rr : AVX512<opc, MRMSrcReg, (outs DstVT.RC:$dst), (ins SrcVT.FRC:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode SrcVT.FRC:$src))]>,
+ EVEX, VEX_LIG, Sched<[sched]>, SIMD_EXC;
+ def rm : AVX512<opc, MRMSrcMem, (outs DstVT.RC:$dst), (ins SrcVT.ScalarMemOp:$src),
+ !strconcat(asm,"\t{$src, $dst|$dst, $src}"),
+ [(set DstVT.RC:$dst, (OpNode (SrcVT.ScalarLdFrag addr:$src)))]>,
+ EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
+ } // Predicates = [HasAVX512]
+}
+
+defm VCVTSS2SIZ: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i32x_info,
+ lrint, WriteCvtSS2I,
+ "{l}">, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSS2SI64Z: avx512_cvt_s<0x2D, "vcvtss2si", f32x_info, i64x_info,
+ llrint, WriteCvtSS2I,
+ "{q}">, VEX_W, XS, EVEX_CD8<32, CD8VT1>;
+defm VCVTSD2SIZ: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i32x_info,
+ lrint, WriteCvtSD2I,
+ "{l}">, XD, EVEX_CD8<64, CD8VT1>;
+defm VCVTSD2SI64Z: avx512_cvt_s<0x2D, "vcvtsd2si", f64x_info, i64x_info,
+ llrint, WriteCvtSD2I,
+ "{q}">, VEX_W, XD, EVEX_CD8<64, CD8VT1>;
+
+let Predicates = [HasAVX512] in {
+ def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64Zrr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64Zrm addr:$src)>;
+
+ def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64Zrr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64Zrm addr:$src)>;
+}
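+// lrint/llrint round using the current MXCSR rounding mode, which is exactly
+// what cvtss2si/cvtsd2si do, so the i64 results can reuse the 64-bit register
+// and memory forms defined above.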
+
// Patterns used for matching vcvtsi2s{s,d} intrinsic sequences from clang
// which produce unnecessary vmovs{s,d} instructions
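// (The incoming DAG is typically something like
//   (X86Movss $dst, (scalar_to_vector (sint_to_fp GR32:$src))),
//  which these patterns fold into the _Int form of vcvtsi2ss/vcvtsi2sd,
//  dropping the separate vmovss/vmovsd.)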
let Predicates = [HasAVX512] in {
@@ -7347,7 +7419,7 @@ let Predicates = [HasAVX512], ExeDomain = _SrcRC.ExeDomain in {
(ins _SrcRC.IntScalarMemOp:$src),
!strconcat(asm,"\t{$src, $dst|$dst, $src}"),
[(set _DstRC.RC:$dst,
- (OpNodeInt (_SrcRC.VT _SrcRC.ScalarIntMemCPat:$src)))]>,
+ (OpNodeInt (_SrcRC.ScalarIntMemFrags addr:$src)))]>,
EVEX, VEX_LIG, Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
} //HasAVX512
@@ -7404,7 +7476,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
(ins _.RC:$src1, _Src.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(_.VT (OpNode (_.VT _.RC:$src1),
- (_Src.VT _Src.ScalarIntMemCPat:$src2)))>,
+ (_Src.ScalarIntMemFrags addr:$src2)))>,
EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -7421,7 +7493,7 @@ multiclass avx512_cvt_fp_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _
}
}
-// Scalar Coversion with SAE - suppress all exceptions
+// Scalar Conversion with SAE - suppress all exceptions
multiclass avx512_cvt_fp_sae_scalar<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
@@ -7506,55 +7578,63 @@ def : Pat<(v2f64 (X86Movsd
//===----------------------------------------------------------------------===//
multiclass avx512_vcvt_fp<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86VectorVTInfo _Src, SDNode OpNode,
+ X86VectorVTInfo _Src, SDNode OpNode, SDNode MaskOpNode,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
RegisterClass MaskRC = _.KRCWM,
- dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
+ dag LdDAG = (_.VT (OpNode (_Src.VT (_Src.LdFrag addr:$src)))),
+ dag MaskLdDAG = (_.VT (MaskOpNode (_Src.VT (_Src.LdFrag addr:$src))))> {
let Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm rr : AVX512_maskable_common<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rr : AVX512_maskable_cvt<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _Src.RC:$src),
(ins _.RC:$src0, MaskRC:$mask, _Src.RC:$src),
(ins MaskRC:$mask, _Src.RC:$src),
OpcodeStr, "$src", "$src",
(_.VT (OpNode (_Src.VT _Src.RC:$src))),
- (vselect MaskRC:$mask,
- (_.VT (OpNode (_Src.VT _Src.RC:$src))),
- _.RC:$src0),
- vselect, "$src0 = $dst">,
+ (vselect_mask MaskRC:$mask,
+ (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))),
+ _.RC:$src0),
+ (vselect_mask MaskRC:$mask,
+ (_.VT (MaskOpNode (_Src.VT _Src.RC:$src))),
+ _.ImmAllZerosV)>,
EVEX, Sched<[sched]>;
- defm rm : AVX512_maskable_common<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm rm : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins MemOp:$src),
(ins _.RC:$src0, MaskRC:$mask, MemOp:$src),
(ins MaskRC:$mask, MemOp:$src),
OpcodeStr#Alias, "$src", "$src",
LdDAG,
- (vselect MaskRC:$mask, LdDAG, _.RC:$src0),
- vselect, "$src0 = $dst">,
+ (vselect_mask MaskRC:$mask, MaskLdDAG, _.RC:$src0),
+ (vselect_mask MaskRC:$mask, MaskLdDAG, _.ImmAllZerosV)>,
EVEX, Sched<[sched.Folded]>;
- defm rmb : AVX512_maskable_common<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm rmb : AVX512_maskable_cvt<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _Src.ScalarMemOp:$src),
(ins _.RC:$src0, MaskRC:$mask, _Src.ScalarMemOp:$src),
(ins MaskRC:$mask, _Src.ScalarMemOp:$src),
OpcodeStr,
- "${src}"##Broadcast, "${src}"##Broadcast,
+ "${src}"#Broadcast, "${src}"#Broadcast,
(_.VT (OpNode (_Src.VT
(_Src.BroadcastLdFrag addr:$src))
)),
- (vselect MaskRC:$mask,
- (_.VT
- (OpNode
- (_Src.VT
- (_Src.BroadcastLdFrag addr:$src)))),
- _.RC:$src0),
- vselect, "$src0 = $dst">,
+ (vselect_mask MaskRC:$mask,
+ (_.VT
+ (MaskOpNode
+ (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src)))),
+ _.RC:$src0),
+ (vselect_mask MaskRC:$mask,
+ (_.VT
+ (MaskOpNode
+ (_Src.VT
+ (_Src.BroadcastLdFrag addr:$src)))),
+ _.ImmAllZerosV)>,
EVEX, EVEX_B, Sched<[sched.Folded]>;
}
}
-// Coversion with SAE - suppress all exceptions
+// Conversion with SAE - suppress all exceptions
multiclass avx512_vcvt_fp_sae<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNodeSAE,
X86FoldableSchedWrite sched> {
@@ -7581,12 +7661,14 @@ multiclass avx512_vcvt_fp_rc<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
// Similar to avx512_vcvt_fp, but uses an extload for the memory form.
multiclass avx512_vcvt_fpextend<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
X86VectorVTInfo _Src, SDNode OpNode,
+ SDNode MaskOpNode,
X86FoldableSchedWrite sched,
string Broadcast = _.BroadcastStr,
string Alias = "", X86MemOperand MemOp = _Src.MemOp,
RegisterClass MaskRC = _.KRCWM>
- : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, sched, Broadcast, Alias,
- MemOp, MaskRC,
+ : avx512_vcvt_fp<opc, OpcodeStr, _, _Src, OpNode, MaskOpNode, sched, Broadcast,
+ Alias, MemOp, MaskRC,
+ (_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src)),
(_.VT (!cast<PatFrag>("extload"#_Src.VTName) addr:$src))>;
// Extend Float to Double
@@ -7594,69 +7676,72 @@ multiclass avx512_cvtps2pd<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fpextend<opc, OpcodeStr, v8f64_info, v8f32x_info,
- any_fpextend, sched.ZMM>,
+ any_fpextend, fpextend, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8f64_info, v8f32x_info,
X86vfpextSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fpextend<opc, OpcodeStr, v2f64x_info, v4f32x_info,
- X86any_vfpext, sched.XMM, "{1to2}", "", f64mem>, EVEX_V128;
- defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info, any_fpextend,
- sched.YMM>, EVEX_V256;
+ X86any_vfpext, X86vfpext, sched.XMM, "{1to2}",
+ "", f64mem>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fpextend<opc, OpcodeStr, v4f64x_info, v4f32x_info,
+ any_fpextend, fpextend, sched.YMM>, EVEX_V256;
}
}
// Truncate Double to Float
multiclass avx512_cvtpd2ps<bits<8> opc, string OpcodeStr, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info, X86any_vfpround, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8f64_info,
+ X86any_vfpround, X86vfpround, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8f64_info,
X86vfproundRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2f64x_info,
- null_frag, sched.XMM, "{1to2}", "{x}", f128mem, VK2WM>,
- EVEX_V128;
- defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info, X86any_vfpround,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}",
+ f128mem, VK2WM>, EVEX_V128;
+ defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4f64x_info,
+ X86any_vfpround, X86vfpround,
sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
@@ -7701,81 +7786,91 @@ let Predicates = [HasVLX] in {
// Convert Signed/Unsigned Doubleword to Double
let Uses = []<Register>, mayRaiseFPException = 0 in
multiclass avx512_cvtdq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNode128, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNode128,
+ SDNode MaskOpNode128,
+ X86SchedWriteWidths sched> {
// No rounding in this op
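// (i32 -> f64 conversion is exact, so no rounding-control variant is needed.)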
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i32x_info, OpNode,
- sched.ZMM>, EVEX_V512;
+ MaskOpNode, sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v4i32x_info,
- OpNode128, sched.XMM, "{1to2}", "", i64mem, VK2WM,
+ OpNode128, MaskOpNode128, sched.XMM, "{1to2}",
+ "", i64mem, VK2WM,
(v2f64 (OpNode128 (bc_v4i32
(v2i64
+ (scalar_to_vector (loadi64 addr:$src)))))),
+ (v2f64 (MaskOpNode128 (bc_v4i32
+ (v2i64
(scalar_to_vector (loadi64 addr:$src))))))>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Doubleword to Float
multiclass avx512_cvtdq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16f32_info, v16i32_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16f32_info, v16i32_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i32x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
+ SDNode MaskOpNode,
SDNode OpNodeSAE, X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Doubleword
multiclass avx512_cvtps2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i32_info, v16f32_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v16i32_info, v16f32_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f32x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Doubleword with truncation
multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeSAE, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeSAE,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeSAE, sched.ZMM>, EVEX_V512;
}
@@ -7785,50 +7880,50 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
@@ -7836,10 +7931,11 @@ multiclass avx512_cvttpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Double to Signed/Unsigned Doubleword
multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasAVX512] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i32x_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i32x_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -7849,48 +7945,48 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v2f64x_info,
- null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
+ null_frag, null_frag, sched.XMM, "{1to2}", "{x}", f128mem,
VK2WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i32x_info, v4f64x_info, OpNode,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256;
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, f64mem:$src), 0, "att">;
@@ -7898,61 +7994,65 @@ multiclass avx512_cvtpd2dq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Convert Double to Signed/Unsigned Quadword
multiclass avx512_cvtpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Double to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttpd2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v2f64x_info, OpNode,
- sched.XMM>, EVEX_V128;
+ MaskOpNode, sched.XMM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f64x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Double
multiclass avx512_cvtqq2pd<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f64_info, v8i64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f64_info, v8i64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasDQI, HasVLX] in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2f64x_info, v2i64x_info, OpNode,
- sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
+ MaskOpNode, sched.XMM>, EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f64x_info, v4i64x_info, OpNode,
- sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
+ MaskOpNode, sched.YMM>, EVEX_V256, NotEVEX2VEXConvertible;
}
}
// Convert Float to Signed/Unsigned Quadword
multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -7960,21 +8060,26 @@ multiclass avx512_cvtps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
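// (The 128-bit form converts only two f32 elements into two i64 results, so
//  the memory operand is a 64-bit load, hence f64mem and the {1to2} string.)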
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
(v2i64 (OpNode (bc_v4f32
(v2f64
+ (scalar_to_vector (loadf64 addr:$src)))))),
+ (v2i64 (MaskOpNode (bc_v4f32
+ (v2f64
(scalar_to_vector (loadf64 addr:$src))))))>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Float to Signed/Unsigned Quadword with truncation
multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
- defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode, sched.ZMM>,
+ defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8i64_info, v8f32x_info, OpNode,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_sae<opc, OpcodeStr, v8i64_info, v8f32x_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -7982,22 +8087,26 @@ multiclass avx512_cvttps2qq<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Explicitly specified broadcast string, since we take only 2 elements
// from v4f32x_info source
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v2i64x_info, v4f32x_info, OpNode,
- sched.XMM, "{1to2}", "", f64mem, VK2WM,
+ MaskOpNode, sched.XMM, "{1to2}", "", f64mem, VK2WM,
(v2i64 (OpNode (bc_v4f32
(v2f64
+ (scalar_to_vector (loadf64 addr:$src)))))),
+ (v2i64 (MaskOpNode (bc_v4f32
+ (v2f64
(scalar_to_vector (loadf64 addr:$src))))))>,
EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4i64x_info, v4f32x_info, OpNode,
- sched.YMM>, EVEX_V256;
+ MaskOpNode, sched.YMM>, EVEX_V256;
}
}
// Convert Signed/Unsigned Quadword to Float
multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
- SDNode OpNodeRnd, X86SchedWriteWidths sched> {
+ SDNode MaskOpNode, SDNode OpNodeRnd,
+ X86SchedWriteWidths sched> {
let Predicates = [HasDQI] in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v8f32x_info, v8i64_info, OpNode,
- sched.ZMM>,
+ MaskOpNode, sched.ZMM>,
avx512_vcvt_fp_rc<opc, OpcodeStr, v8f32x_info, v8i64_info,
OpNodeRnd, sched.ZMM>, EVEX_V512;
}
@@ -8007,152 +8116,159 @@ multiclass avx512_cvtqq2ps<bits<8> opc, string OpcodeStr, SDNode OpNode,
// dest type - 'v4i32x_info'. We also specify the broadcast string explicitly
// due to the same reason.
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v2i64x_info, null_frag,
- sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
+ null_frag, sched.XMM, "{1to2}", "{x}", i128mem, VK2WM>,
EVEX_V128, NotEVEX2VEXConvertible;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v4f32x_info, v4i64x_info, OpNode,
- sched.YMM, "{1to4}", "{y}">, EVEX_V256,
+ MaskOpNode, sched.YMM, "{1to4}", "{y}">, EVEX_V256,
NotEVEX2VEXConvertible;
}
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}}|$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z128rrk") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst {${mask}} {z}|$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z128rrkz") VR128X:$dst,
VK2WM:$mask, VR128X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst|$dst, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmb") VR128X:$dst,
i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbk") VR128X:$dst,
VK2WM:$mask, i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"x\t{${src}{1to2}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"x\t{${src}{1to2}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to2}}",
(!cast<Instruction>(NAME # "Z128rmbkz") VR128X:$dst,
VK2WM:$mask, i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
(!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}}|"
"$dst {${mask}}, $src}",
(!cast<Instruction>(NAME # "Z256rrk") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, $src}",
(!cast<Instruction>(NAME # "Z256rrkz") VR128X:$dst,
VK4WM:$mask, VR256X:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst|$dst, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmb") VR128X:$dst,
i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}}|"
"$dst {${mask}}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbk") VR128X:$dst,
VK4WM:$mask, i64mem:$src), 0, "att">;
- def : InstAlias<OpcodeStr##"y\t{${src}{1to4}, $dst {${mask}} {z}|"
+ def : InstAlias<OpcodeStr#"y\t{${src}{1to4}, $dst {${mask}} {z}|"
"$dst {${mask}} {z}, ${src}{1to4}}",
(!cast<Instruction>(NAME # "Z256rmbkz") VR128X:$dst,
VK4WM:$mask, i64mem:$src), 0, "att">;
}
-defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, X86any_VSintToFP,
+defm VCVTDQ2PD : avx512_cvtdq2pd<0xE6, "vcvtdq2pd", any_sint_to_fp, sint_to_fp,
+ X86any_VSintToFP, X86VSintToFP,
SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
-defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp,
+defm VCVTDQ2PS : avx512_cvtdq2ps<0x5B, "vcvtdq2ps", any_sint_to_fp, sint_to_fp,
X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPS2DQ : avx512_cvttps2dq<0x5B, "vcvttps2dq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPS2DQ>,
- XS, EVEX_CD8<32, CD8VF>;
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, XS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2DQ : avx512_cvttpd2dq<0xE6, "vcvttpd2dq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPD2DQ>,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPD2DQ>,
PD, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UDQ : avx512_cvttps2dq<0x78, "vcvttps2udq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PS,
- EVEX_CD8<32, CD8VF>;
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, PS, EVEX_CD8<32, CD8VF>;
defm VCVTTPD2UDQ : avx512_cvttpd2dq<0x78, "vcvttpd2udq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPD2DQ>,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPD2DQ>,
PS, VEX_W, EVEX_CD8<64, CD8VF>;
defm VCVTUDQ2PD : avx512_cvtdq2pd<0x7A, "vcvtudq2pd", any_uint_to_fp,
- X86any_VUintToFP, SchedWriteCvtDQ2PD>, XS,
- EVEX_CD8<32, CD8VH>;
+ uint_to_fp, X86any_VUintToFP, X86VUintToFP,
+ SchedWriteCvtDQ2PD>, XS, EVEX_CD8<32, CD8VH>;
defm VCVTUDQ2PS : avx512_cvtdq2ps<0x7A, "vcvtudq2ps", any_uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PS>, XD,
- EVEX_CD8<32, CD8VF>;
+ uint_to_fp, X86VUintToFpRnd,
+ SchedWriteCvtDQ2PS>, XD, EVEX_CD8<32, CD8VF>;
-defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int,
+defm VCVTPS2DQ : avx512_cvtps2dq<0x5B, "vcvtps2dq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VF>;
-defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int,
+defm VCVTPD2DQ : avx512_cvtpd2dq<0xE6, "vcvtpd2dq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, XD,
VEX_W, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt,
+defm VCVTPS2UDQ : avx512_cvtps2dq<0x79, "vcvtps2udq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>,
PS, EVEX_CD8<32, CD8VF>;
-defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt,
+defm VCVTPD2UDQ : avx512_cvtpd2dq<0x79, "vcvtpd2udq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PS, EVEX_CD8<64, CD8VF>;
-defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int,
+defm VCVTPD2QQ : avx512_cvtpd2qq<0x7B, "vcvtpd2qq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int,
+defm VCVTPS2QQ : avx512_cvtps2qq<0x7B, "vcvtps2qq", X86cvtp2Int, X86cvtp2Int,
X86cvtp2IntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
-defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt,
+defm VCVTPD2UQQ : avx512_cvtpd2qq<0x79, "vcvtpd2uqq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
-defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt,
+defm VCVTPS2UQQ : avx512_cvtps2qq<0x79, "vcvtps2uqq", X86cvtp2UInt, X86cvtp2UInt,
X86cvtp2UIntRnd, SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2QQ : avx512_cvttpd2qq<0x7A, "vcvttpd2qq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2QQ : avx512_cvttps2qq<0x7A, "vcvttps2qq", X86any_cvttp2si,
- X86cvttp2siSAE, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2si, X86cvttp2siSAE,
+ SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTTPD2UQQ : avx512_cvttpd2qq<0x78, "vcvttpd2uqq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPD2DQ>, VEX_W,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPD2DQ>, VEX_W,
PD, EVEX_CD8<64, CD8VF>;
defm VCVTTPS2UQQ : avx512_cvttps2qq<0x78, "vcvttps2uqq", X86any_cvttp2ui,
- X86cvttp2uiSAE, SchedWriteCvtPS2DQ>, PD,
+ X86cvttp2ui, X86cvttp2uiSAE,
+ SchedWriteCvtPS2DQ>, PD,
EVEX_CD8<32, CD8VH>;
defm VCVTQQ2PD : avx512_cvtqq2pd<0xE6, "vcvtqq2pd", any_sint_to_fp,
- X86VSintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
- EVEX_CD8<64, CD8VF>;
+ sint_to_fp, X86VSintToFpRnd,
+ SchedWriteCvtDQ2PD>, VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PD : avx512_cvtqq2pd<0x7A, "vcvtuqq2pd", any_uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PD>, VEX_W, XS,
- EVEX_CD8<64, CD8VF>;
+ uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PD>,
+ VEX_W, XS, EVEX_CD8<64, CD8VF>;
defm VCVTQQ2PS : avx512_cvtqq2ps<0x5B, "vcvtqq2ps", any_sint_to_fp,
- X86VSintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, PS,
- EVEX_CD8<64, CD8VF>;
+ sint_to_fp, X86VSintToFpRnd, SchedWriteCvtDQ2PS>,
+ VEX_W, PS, EVEX_CD8<64, CD8VF>;
defm VCVTUQQ2PS : avx512_cvtqq2ps<0x7A, "vcvtuqq2ps", any_uint_to_fp,
- X86VUintToFpRnd, SchedWriteCvtDQ2PS>, VEX_W, XD,
- EVEX_CD8<64, CD8VF>;
+ uint_to_fp, X86VUintToFpRnd, SchedWriteCvtDQ2PS>,
+ VEX_W, XD, EVEX_CD8<64, CD8VF>;
let Predicates = [HasVLX] in {
// Special patterns to allow use of X86mcvtp2Int for masking. Instruction
@@ -8275,70 +8391,70 @@ let Predicates = [HasVLX] in {
let Predicates = [HasDQI, HasVLX] in {
def : Pat<(v2i64 (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2QQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2Int (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTPS2UQQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvtp2UInt (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86any_cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2QQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTTPS2QQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2si (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTTPS2QQZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2i64 (X86any_cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src))))),
(VCVTTPS2UQQZ128rm addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTTPS2UQQZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2i64 (vselect VK2WM:$mask,
- (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
- v2i64x_info.ImmAllZerosV)),
+ def : Pat<(v2i64 (vselect_mask VK2WM:$mask,
+ (X86cvttp2ui (bc_v4f32 (v2f64 (X86vzload64 addr:$src)))),
+ v2i64x_info.ImmAllZerosV)),
(VCVTTPS2UQQZ128rmkz VK2WM:$mask, addr:$src)>;
}
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- v2f64x_info.ImmAllZerosV)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VSintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
(VCVTDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
def : Pat<(v2f64 (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src))))),
(VCVTUDQ2PDZ128rm addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- VR128X:$src0)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ VR128X:$src0)),
(VCVTUDQ2PDZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
- def : Pat<(v2f64 (vselect VK2WM:$mask,
- (X86any_VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
- v2f64x_info.ImmAllZerosV)),
+ def : Pat<(v2f64 (vselect_mask VK2WM:$mask,
+ (X86VUintToFP (bc_v4i32 (v2i64 (X86vzload64 addr:$src)))),
+ v2f64x_info.ImmAllZerosV)),
(VCVTUDQ2PDZ128rmkz VK2WM:$mask, addr:$src)>;
}
@@ -8408,16 +8524,17 @@ let Predicates = [HasDQI, HasVLX] in {
let Uses = [MXCSR], mayRaiseFPException = 1 in
multiclass avx512_cvtph2ps<X86VectorVTInfo _dest, X86VectorVTInfo _src,
- X86MemOperand x86memop, PatFrag ld_frag,
+ X86MemOperand x86memop, dag ld_dag,
X86FoldableSchedWrite sched> {
- defm rr : AVX512_maskable<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
+ defm rr : AVX512_maskable_split<0x13, MRMSrcReg, _dest ,(outs _dest.RC:$dst),
(ins _src.RC:$src), "vcvtph2ps", "$src", "$src",
+ (X86any_cvtph2ps (_src.VT _src.RC:$src)),
(X86cvtph2ps (_src.VT _src.RC:$src))>,
T8PD, Sched<[sched]>;
- defm rm : AVX512_maskable<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
+ defm rm : AVX512_maskable_split<0x13, MRMSrcMem, _dest, (outs _dest.RC:$dst),
(ins x86memop:$src), "vcvtph2ps", "$src", "$src",
- (X86cvtph2ps (_src.VT
- (ld_frag addr:$src)))>,
+ (X86any_cvtph2ps (_src.VT ld_dag)),
+ (X86cvtph2ps (_src.VT ld_dag))>,
T8PD, Sched<[sched.Folded]>;
}
@@ -8432,23 +8549,22 @@ multiclass avx512_cvtph2ps_sae<X86VectorVTInfo _dest, X86VectorVTInfo _src,
}
let Predicates = [HasAVX512] in
- defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem, load,
- WriteCvtPH2PSZ>,
+ defm VCVTPH2PSZ : avx512_cvtph2ps<v16f32_info, v16i16x_info, f256mem,
+ (load addr:$src), WriteCvtPH2PSZ>,
avx512_cvtph2ps_sae<v16f32_info, v16i16x_info, WriteCvtPH2PSZ>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
let Predicates = [HasVLX] in {
defm VCVTPH2PSZ256 : avx512_cvtph2ps<v8f32x_info, v8i16x_info, f128mem,
- load, WriteCvtPH2PSY>, EVEX, EVEX_V256,
+ (load addr:$src), WriteCvtPH2PSY>, EVEX, EVEX_V256,
EVEX_CD8<32, CD8VH>;
defm VCVTPH2PSZ128 : avx512_cvtph2ps<v4f32x_info, v8i16x_info, f64mem,
- load, WriteCvtPH2PS>, EVEX, EVEX_V128,
+ (bitconvert (v2i64 (X86vzload64 addr:$src))),
+ WriteCvtPH2PS>, EVEX, EVEX_V128,
EVEX_CD8<32, CD8VH>;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
- (VCVTPH2PSZ128rm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (v8i16 (bitconvert
+ def : Pat<(v4f32 (X86any_cvtph2ps (v8i16 (bitconvert
(v2i64 (scalar_to_vector (loadi64 addr:$src))))))),
(VCVTPH2PSZ128rm addr:$src)>;
}
@@ -8460,7 +8576,7 @@ let ExeDomain = GenericDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
(ins _src.RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set _dest.RC:$dst,
- (X86cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
+ (X86any_cvtps2ph (_src.VT _src.RC:$src1), (i32 timm:$src2)))]>,
Sched<[RR]>;
let Constraints = "$src0 = $dst" in
def rrk : AVX512AIi8<0x1D, MRMDestReg, (outs _dest.RC:$dst),
@@ -8505,54 +8621,35 @@ let Predicates = [HasAVX512] in {
WriteCvtPS2PHZ, WriteCvtPS2PHZSt>,
avx512_cvtps2ph_sae<v16i16x_info, v16f32_info, WriteCvtPS2PHZ>,
EVEX, EVEX_V512, EVEX_CD8<32, CD8VH>;
- let Predicates = [HasVLX] in {
- defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
- WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
- EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
- defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
- WriteCvtPS2PH, WriteCvtPS2PHSt>,
- EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
- }
+
+ def : Pat<(store (v16i16 (X86any_cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
+ (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
+}
+
+let Predicates = [HasVLX] in {
+ defm VCVTPS2PHZ256 : avx512_cvtps2ph<v8i16x_info, v8f32x_info, f128mem,
+ WriteCvtPS2PHY, WriteCvtPS2PHYSt>,
+ EVEX, EVEX_V256, EVEX_CD8<32, CD8VH>;
+ defm VCVTPS2PHZ128 : avx512_cvtps2ph<v8i16x_info, v4f32x_info, f64mem,
+ WriteCvtPS2PH, WriteCvtPS2PHSt>,
+ EVEX, EVEX_V128, EVEX_CD8<32, CD8VH>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
+ (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128X:$src1, timm:$src2))),
+ (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128X:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHZ128mr addr:$dst, VR128X:$src1, timm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
+ def : Pat<(store (v8i16 (X86any_cvtps2ph VR256X:$src1, timm:$src2)), addr:$dst),
(VCVTPS2PHZ256mr addr:$dst, VR256X:$src1, timm:$src2)>;
- def : Pat<(store (v16i16 (X86cvtps2ph VR512:$src1, timm:$src2)), addr:$dst),
- (VCVTPS2PHZmr addr:$dst, VR512:$src1, timm:$src2)>;
-}
-
-// Patterns for matching conversions from float to half-float and vice versa.
-let Predicates = [HasVLX] in {
- // Use MXCSR.RC for rounding instead of explicitly specifying the default
- // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
- // configurations we support (the default). However, falling back to MXCSR is
- // more consistent with other instructions, which are always controlled by it.
- // It's encoded as 0b100.
- def : Pat<(fp_to_f16 FR32X:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIZrr (v8i16 (VCVTPS2PHZ128rr
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
- (v8i16 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128X)))), FR32X)) >;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32X:$src))),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSZ128rr
- (v8i16 (VCVTPS2PHZ128rr
- (v4f32 (COPY_TO_REGCLASS FR32X:$src, VR128X)), 4)))), FR32X)) >;
}
// Unordered/Ordered scalar fp compare with SAE and set EFLAGS
multiclass avx512_ord_cmp_sae<bits<8> opc, X86VectorVTInfo _,
string OpcodeStr, Domain d,
- X86FoldableSchedWrite sched = WriteFCom> {
+ X86FoldableSchedWrite sched = WriteFComX> {
let hasSideEffects = 0, Uses = [MXCSR] in
def rrb: AVX512<opc, MRMSrcReg, (outs), (ins _.RC:$src1, _.RC:$src2),
!strconcat(OpcodeStr, "\t{{sae}, $src2, $src1|$src1, $src2, {sae}}"), []>,
@@ -8613,7 +8710,7 @@ multiclass avx512_fp14_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(OpNode (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2)>, EVEX_4V, VEX_LIG,
+ (_.ScalarIntMemFrags addr:$src2))>, EVEX_4V, VEX_LIG,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8646,7 +8743,7 @@ multiclass avx512_fp14_p<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
(OpNode (_.VT
(_.BroadcastLdFrag addr:$src)))>,
EVEX, T8PD, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -8701,7 +8798,7 @@ multiclass avx512_fp28_s<bits<8> opc, string OpcodeStr,X86VectorVTInfo _,
defm m : AVX512_maskable_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1), _.ScalarIntMemCPat:$src2)>,
+ (OpNode (_.VT _.RC:$src1), (_.ScalarIntMemFrags addr:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -8741,7 +8838,7 @@ multiclass avx512_fp28_p<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
defm mb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
(OpNode (_.VT
(_.BroadcastLdFrag addr:$src)))>,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -8811,20 +8908,21 @@ multiclass avx512_sqrt_packed_round<bits<8> opc, string OpcodeStr,
multiclass avx512_sqrt_packed<bits<8> opc, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm r: AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm r: AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src), OpcodeStr, "$src", "$src",
- (_.VT (any_fsqrt _.RC:$src))>, EVEX,
+ (_.VT (any_fsqrt _.RC:$src)),
+ (_.VT (fsqrt _.RC:$src))>, EVEX,
Sched<[sched]>;
- defm m: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm m: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src), OpcodeStr, "$src", "$src",
- (any_fsqrt (_.VT
- (bitconvert (_.LdFrag addr:$src))))>, EVEX,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm mb: AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ (any_fsqrt (_.VT (_.LdFrag addr:$src))),
+ (fsqrt (_.VT (_.LdFrag addr:$src)))>, EVEX,
+ Sched<[sched.Folded, sched.ReadAfterFold]>;
+ defm mb: AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src), OpcodeStr,
- "${src}"##_.BroadcastStr, "${src}"##_.BroadcastStr,
- (any_fsqrt (_.VT
- (_.BroadcastLdFrag addr:$src)))>,
+ "${src}"#_.BroadcastStr, "${src}"#_.BroadcastStr,
+ (any_fsqrt (_.VT (_.BroadcastLdFrag addr:$src))),
+ (fsqrt (_.VT (_.BroadcastLdFrag addr:$src)))>,
EVEX, EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -8879,7 +8977,7 @@ multiclass avx512_sqrt_scalar<bits<8> opc, string OpcodeStr, X86FoldableSchedWri
(ins _.RC:$src1, _.IntScalarMemOp:$src2), OpcodeStr,
"$src2, $src1", "$src1, $src2",
(X86fsqrts (_.VT _.RC:$src1),
- _.ScalarIntMemCPat:$src2)>,
+ (_.ScalarIntMemFrags addr:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let Uses = [MXCSR] in
defm rb_Int : AVX512_maskable_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
@@ -8952,7 +9050,7 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
OpcodeStr,
"$src3, $src2, $src1", "$src1, $src2, $src3",
(_.VT (X86RndScales _.RC:$src1,
- _.ScalarIntMemCPat:$src2, (i32 timm:$src3)))>,
+ (_.ScalarIntMemFrags addr:$src2), (i32 timm:$src3)))>,
Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
let isCodeGenOnly = 1, hasSideEffects = 0, Predicates = [HasAVX512] in {
@@ -8971,13 +9069,13 @@ multiclass avx512_rndscale_scalar<bits<8> opc, string OpcodeStr,
let Predicates = [HasAVX512] in {
def : Pat<(X86any_VRndScale _.FRC:$src1, timm:$src2),
- (_.EltVT (!cast<Instruction>(NAME##r) (_.EltVT (IMPLICIT_DEF)),
+ (_.EltVT (!cast<Instruction>(NAME#r) (_.EltVT (IMPLICIT_DEF)),
_.FRC:$src1, timm:$src2))>;
}
let Predicates = [HasAVX512, OptForSize] in {
def : Pat<(X86any_VRndScale (_.ScalarLdFrag addr:$src1), timm:$src2),
- (_.EltVT (!cast<Instruction>(NAME##m) (_.EltVT (IMPLICIT_DEF)),
+ (_.EltVT (!cast<Instruction>(NAME#m) (_.EltVT (IMPLICIT_DEF)),
addr:$src1, timm:$src2))>;
}
}
@@ -8996,13 +9094,13 @@ multiclass avx512_masked_scalar<SDNode OpNode, string OpcPrefix, SDNode Move,
dag Mask, X86VectorVTInfo _, PatLeaf ZeroFP,
dag OutMask, Predicate BasePredicate> {
let Predicates = [BasePredicate] in {
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
(extractelt _.VT:$dst, (iPTR 0))))),
(!cast<Instruction>("V"#OpcPrefix#r_Intk)
_.VT:$dst, OutMask, _.VT:$src2, _.VT:$src1)>;
- def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects Mask,
+ def : Pat<(Move _.VT:$src1, (scalar_to_vector (X86selects_mask Mask,
(OpNode (extractelt _.VT:$src2, (iPTR 0))),
ZeroFP))),
(!cast<Instruction>("V"#OpcPrefix#r_Intkz)
@@ -9026,14 +9124,14 @@ defm : avx512_masked_scalar<fsqrt, "SQRTSDZ", X86Movsd,
// same order as X86vmtrunc, X86vmtruncs, X86vmtruncus. This allows us to pass
// either to the multiclasses.
def select_trunc : PatFrag<(ops node:$src, node:$src0, node:$mask),
- (vselect node:$mask,
- (trunc node:$src), node:$src0)>;
+ (vselect_mask node:$mask,
+ (trunc node:$src), node:$src0)>;
def select_truncs : PatFrag<(ops node:$src, node:$src0, node:$mask),
- (vselect node:$mask,
- (X86vtruncs node:$src), node:$src0)>;
+ (vselect_mask node:$mask,
+ (X86vtruncs node:$src), node:$src0)>;
def select_truncus : PatFrag<(ops node:$src, node:$src0, node:$mask),
- (vselect node:$mask,
- (X86vtruncus node:$src), node:$src0)>;
+ (vselect_mask node:$mask,
+ (X86vtruncus node:$src), node:$src0)>;
multiclass avx512_trunc_common<bits<8> opc, string OpcodeStr, SDNode OpNode,
SDPatternOperator MaskNode,
@@ -9083,12 +9181,12 @@ multiclass avx512_trunc_mr_lowering<X86VectorVTInfo SrcInfo,
string Name> {
def : Pat<(truncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst),
- (!cast<Instruction>(Name#SrcInfo.ZSuffix##mr)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix#mr)
addr:$dst, SrcInfo.RC:$src)>;
def : Pat<(mtruncFrag (SrcInfo.VT SrcInfo.RC:$src), addr:$dst,
SrcInfo.KRCWM:$mask),
- (!cast<Instruction>(Name#SrcInfo.ZSuffix##mrk)
+ (!cast<Instruction>(Name#SrcInfo.ZSuffix#mrk)
addr:$dst, SrcInfo.KRCWM:$mask, SrcInfo.RC:$src)>;
}
@@ -9548,6 +9646,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
let Predicates = [HasVLX] in {
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDZ256rm) addr:$src)>;
@@ -9558,6 +9658,8 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQZ256rm) addr:$src)>;
}
@@ -9565,6 +9667,10 @@ multiclass AVX512_pmovx_patterns<string OpcPrefix, SDNode ExtOp,
let Predicates = [HasAVX512] in {
def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
+ def : Pat<(v8i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
+ (!cast<I>(OpcPrefix#BQZrm) addr:$src)>;
}
}
@@ -9586,54 +9692,49 @@ def: Pat<(v16i8 (trunc (loadv16i16 addr:$src))),
// FIXME: Improve scheduling of gather/scatter instructions.
multiclass avx512_gather<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag GatherNode,
- RegisterClass MaskRC = _.KRCWM> {
+ X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> {
let Constraints = "@earlyclobber $dst, $src1 = $dst, $mask = $mask_wb",
- ExeDomain = _.ExeDomain in
+ ExeDomain = _.ExeDomain, mayLoad = 1, hasSideEffects = 0 in
def rm : AVX5128I<opc, MRMSrcMem, (outs _.RC:$dst, MaskRC:$mask_wb),
(ins _.RC:$src1, MaskRC:$mask, memop:$src2),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src2, ${dst} {${mask}}|${dst} {${mask}}, $src2}"),
- [(set _.RC:$dst, MaskRC:$mask_wb,
- (GatherNode (_.VT _.RC:$src1), MaskRC:$mask,
- vectoraddr:$src2))]>, EVEX, EVEX_K,
- EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>, Sched<[WriteLoad]>;
}
multiclass avx512_gather_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512,
- vy512xmem, mgatherv8i32>, EVEX_V512, VEX_W;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info512,
- vz512mem, mgatherv8i64>, EVEX_V512, VEX_W;
+ defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512,
+ vy512xmem>, EVEX_V512, VEX_W;
+ defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info512,
+ vz512mem>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
- vx256xmem, mgatherv4i32>, EVEX_V256, VEX_W;
- defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info256,
- vy256xmem, mgatherv4i64>, EVEX_V256, VEX_W;
- defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mgatherv4i32>, EVEX_V128, VEX_W;
- defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx128xmem, mgatherv2i64>, EVEX_V128, VEX_W;
+ defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256,
+ vx256xmem>, EVEX_V256, VEX_W;
+ defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info256,
+ vy256xmem>, EVEX_V256, VEX_W;
+ defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+ defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
}
}
multiclass avx512_gather_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_gather<dopc, OpcodeStr##"d", _.info512, vz512mem,
- mgatherv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_gather<qopc, OpcodeStr##"q", _.info256, vz256mem,
- mgatherv8i64>, EVEX_V512;
+ defm NAME#D#SUFF#Z: avx512_gather<dopc, OpcodeStr#"d", _.info512, vz512mem>,
+ EVEX_V512;
+ defm NAME#Q#SUFF#Z: avx512_gather<qopc, OpcodeStr#"q", _.info256, vz256mem>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_gather<dopc, OpcodeStr##"d", _.info256,
- vy256xmem, mgatherv8i32>, EVEX_V256;
- defm NAME##Q##SUFF##Z256: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vy128xmem, mgatherv4i64>, EVEX_V256;
- defm NAME##D##SUFF##Z128: avx512_gather<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mgatherv4i32>, EVEX_V128;
- defm NAME##Q##SUFF##Z128: avx512_gather<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mgatherv2i64, VK2WM>,
- EVEX_V128;
+ defm NAME#D#SUFF#Z256: avx512_gather<dopc, OpcodeStr#"d", _.info256,
+ vy256xmem>, EVEX_V256;
+ defm NAME#Q#SUFF#Z256: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vy128xmem>, EVEX_V256;
+ defm NAME#D#SUFF#Z128: avx512_gather<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128;
+ defm NAME#Q#SUFF#Z128: avx512_gather<qopc, OpcodeStr#"q", _.info128,
+ vx64xmem, VK2WM>, EVEX_V128;
}
}
@@ -9645,55 +9746,52 @@ defm VPGATHER : avx512_gather_q_pd<0x90, 0x91, avx512vl_i64_info, "vpgather", "Q
avx512_gather_d_ps<0x90, 0x91, avx512vl_i32_info, "vpgather", "D">;
multiclass avx512_scatter<bits<8> opc, string OpcodeStr, X86VectorVTInfo _,
- X86MemOperand memop, PatFrag ScatterNode,
- RegisterClass MaskRC = _.KRCWM> {
+ X86MemOperand memop, RegisterClass MaskRC = _.KRCWM> {
-let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain in
+let mayStore = 1, Constraints = "$mask = $mask_wb", ExeDomain = _.ExeDomain,
+ hasSideEffects = 0 in
def mr : AVX5128I<opc, MRMDestMem, (outs MaskRC:$mask_wb),
(ins memop:$dst, MaskRC:$mask, _.RC:$src),
!strconcat(OpcodeStr#_.Suffix,
"\t{$src, ${dst} {${mask}}|${dst} {${mask}}, $src}"),
- [(set MaskRC:$mask_wb, (ScatterNode (_.VT _.RC:$src),
- MaskRC:$mask, vectoraddr:$dst))]>,
- EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
+ []>, EVEX, EVEX_K, EVEX_CD8<_.EltSize, CD8VT1>,
Sched<[WriteStore]>;
}
multiclass avx512_scatter_q_pd<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512,
- vy512xmem, mscatterv8i32>, EVEX_V512, VEX_W;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info512,
- vz512mem, mscatterv8i64>, EVEX_V512, VEX_W;
+ defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512,
+ vy512xmem>, EVEX_V512, VEX_W;
+ defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info512,
+ vz512mem>, EVEX_V512, VEX_W;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
- vx256xmem, mscatterv4i32>, EVEX_V256, VEX_W;
- defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info256,
- vy256xmem, mscatterv4i64>, EVEX_V256, VEX_W;
- defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mscatterv4i32>, EVEX_V128, VEX_W;
- defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx128xmem, mscatterv2i64>, EVEX_V128, VEX_W;
+ defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256,
+ vx256xmem>, EVEX_V256, VEX_W;
+ defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info256,
+ vy256xmem>, EVEX_V256, VEX_W;
+ defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
+ defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vx128xmem>, EVEX_V128, VEX_W;
}
}
multiclass avx512_scatter_d_ps<bits<8> dopc, bits<8> qopc,
AVX512VLVectorVTInfo _, string OpcodeStr, string SUFF> {
- defm NAME##D##SUFF##Z: avx512_scatter<dopc, OpcodeStr##"d", _.info512, vz512mem,
- mscatterv16i32>, EVEX_V512;
- defm NAME##Q##SUFF##Z: avx512_scatter<qopc, OpcodeStr##"q", _.info256, vz256mem,
- mscatterv8i64>, EVEX_V512;
+ defm NAME#D#SUFF#Z: avx512_scatter<dopc, OpcodeStr#"d", _.info512, vz512mem>,
+ EVEX_V512;
+ defm NAME#Q#SUFF#Z: avx512_scatter<qopc, OpcodeStr#"q", _.info256, vz256mem>,
+ EVEX_V512;
let Predicates = [HasVLX] in {
- defm NAME##D##SUFF##Z256: avx512_scatter<dopc, OpcodeStr##"d", _.info256,
- vy256xmem, mscatterv8i32>, EVEX_V256;
- defm NAME##Q##SUFF##Z256: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vy128xmem, mscatterv4i64>, EVEX_V256;
- defm NAME##D##SUFF##Z128: avx512_scatter<dopc, OpcodeStr##"d", _.info128,
- vx128xmem, mscatterv4i32>, EVEX_V128;
- defm NAME##Q##SUFF##Z128: avx512_scatter<qopc, OpcodeStr##"q", _.info128,
- vx64xmem, mscatterv2i64, VK2WM>,
- EVEX_V128;
+ defm NAME#D#SUFF#Z256: avx512_scatter<dopc, OpcodeStr#"d", _.info256,
+ vy256xmem>, EVEX_V256;
+ defm NAME#Q#SUFF#Z256: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vy128xmem>, EVEX_V256;
+ defm NAME#D#SUFF#Z128: avx512_scatter<dopc, OpcodeStr#"d", _.info128,
+ vx128xmem>, EVEX_V128;
+ defm NAME#Q#SUFF#Z128: avx512_scatter<qopc, OpcodeStr#"q", _.info128,
+ vx64xmem, VK2WM>, EVEX_V128;
}
}
@@ -9762,13 +9860,9 @@ defm VSCATTERPF1QPD: avx512_gather_scatter_prefetch<0xC7, MRM6m, "vscatterpf1qpd
multiclass cvt_by_vec_width<bits<8> opc, X86VectorVTInfo Vec, string OpcodeStr > {
def rr : AVX512XS8I<opc, MRMSrcReg, (outs Vec.RC:$dst), (ins Vec.KRC:$src),
- !strconcat(OpcodeStr##Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
+ !strconcat(OpcodeStr#Vec.Suffix, "\t{$src, $dst|$dst, $src}"),
[(set Vec.RC:$dst, (Vec.VT (sext Vec.KRC:$src)))]>,
EVEX, Sched<[WriteMove]>; // TODO - WriteVecTrunc?
-
-// Also need a pattern for anyextend.
-def : Pat<(Vec.VT (anyext Vec.KRC:$src)),
- (!cast<Instruction>(NAME#"rr") Vec.KRC:$src)>;
}
multiclass cvt_mask_by_elt_width<bits<8> opc, AVX512VLVectorVTInfo VTInfo,
@@ -9842,19 +9936,11 @@ let Predicates = [HasDQI, NoBWI] in {
(VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
def : Pat<(v16i16 (sext (v16i1 VK16:$src))),
(VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
-
- def : Pat<(v16i8 (anyext (v16i1 VK16:$src))),
- (VPMOVDBZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
- def : Pat<(v16i16 (anyext (v16i1 VK16:$src))),
- (VPMOVDWZrr (v16i32 (VPMOVM2DZrr VK16:$src)))>;
}
let Predicates = [HasDQI, NoBWI, HasVLX] in {
def : Pat<(v8i16 (sext (v8i1 VK8:$src))),
(VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
-
- def : Pat<(v8i16 (anyext (v8i1 VK8:$src))),
- (VPMOVDWZ256rr (v8i32 (VPMOVM2DZ256rr VK8:$src)))>;
}
//===----------------------------------------------------------------------===//
@@ -9885,14 +9971,14 @@ multiclass compress_by_vec_width_common<bits<8> opc, X86VectorVTInfo _,
multiclass compress_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(X86mCompressingStore (_.VT _.RC:$src), addr:$dst, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##mrk)
+ (!cast<Instruction>(Name#_.ZSuffix#mrk)
addr:$dst, _.KRCWM:$mask, _.RC:$src)>;
def : Pat<(X86compress (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ (!cast<Instruction>(Name#_.ZSuffix#rrk)
_.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
def : Pat<(X86compress (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rrkz)
_.KRCWM:$mask, _.RC:$src)>;
}
@@ -9940,23 +10026,23 @@ multiclass expand_by_vec_width<bits<8> opc, X86VectorVTInfo _,
multiclass expand_by_vec_width_lowering<X86VectorVTInfo _, string Name> {
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, undef)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask, _.ImmAllZerosV)),
- (!cast<Instruction>(Name#_.ZSuffix##rmkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rmkz)
_.KRCWM:$mask, addr:$src)>;
def : Pat<(_.VT (X86mExpandingLoad addr:$src, _.KRCWM:$mask,
(_.VT _.RC:$src0))),
- (!cast<Instruction>(Name#_.ZSuffix##rmk)
+ (!cast<Instruction>(Name#_.ZSuffix#rmk)
_.RC:$src0, _.KRCWM:$mask, addr:$src)>;
def : Pat<(X86expand (_.VT _.RC:$src), _.RC:$src0, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrk)
+ (!cast<Instruction>(Name#_.ZSuffix#rrk)
_.RC:$src0, _.KRCWM:$mask, _.RC:$src)>;
def : Pat<(X86expand (_.VT _.RC:$src), _.ImmAllZerosV, _.KRCWM:$mask),
- (!cast<Instruction>(Name#_.ZSuffix##rrkz)
+ (!cast<Instruction>(Name#_.ZSuffix#rrkz)
_.KRCWM:$mask, _.RC:$src)>;
}
@@ -9990,26 +10076,33 @@ defm VEXPANDPD : expand_by_elt_width <0x88, "vexpandpd", WriteVarShuffle256,
// op(mem_vec,imm)
// op(broadcast(eltVt),imm)
//all instruction created with FROUND_CURRENT
-multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
- X86FoldableSchedWrite sched, X86VectorVTInfo _> {
+multiclass avx512_unary_fp_packed_imm<bits<8> opc, string OpcodeStr,
+ SDNode OpNode, SDNode MaskOpNode,
+ X86FoldableSchedWrite sched,
+ X86VectorVTInfo _> {
let ExeDomain = _.ExeDomain, Uses = [MXCSR], mayRaiseFPException = 1 in {
- defm rri : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
+ defm rri : AVX512_maskable_split<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
- (OpNode (_.VT _.RC:$src1),
- (i32 timm:$src2))>, Sched<[sched]>;
- defm rmi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2",
+ (OpNode (_.VT _.RC:$src1), (i32 timm:$src2)),
+ (MaskOpNode (_.VT _.RC:$src1), (i32 timm:$src2))>,
+ Sched<[sched]>;
+ defm rmi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.MemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, $src1", "$src1, $src2",
+ OpcodeStr#_.Suffix, "$src2, $src1", "$src1, $src2",
(OpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
- (i32 timm:$src2))>,
+ (i32 timm:$src2)),
+ (MaskOpNode (_.VT (bitconvert (_.LdFrag addr:$src1))),
+ (i32 timm:$src2))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
- defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
+ defm rmbi : AVX512_maskable_split<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, ${src1}"##_.BroadcastStr,
- "${src1}"##_.BroadcastStr##", $src2",
+ OpcodeStr#_.Suffix, "$src2, ${src1}"#_.BroadcastStr,
+ "${src1}"#_.BroadcastStr#", $src2",
(OpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
- (i32 timm:$src2))>, EVEX_B,
+ (i32 timm:$src2)),
+ (MaskOpNode (_.VT (_.BroadcastLdFrag addr:$src1)),
+ (i32 timm:$src2))>, EVEX_B,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -10021,7 +10114,7 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
let ExeDomain = _.ExeDomain, Uses = [MXCSR] in
defm rrib : AVX512_maskable<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src1, i32u8imm:$src2),
- OpcodeStr##_.Suffix, "$src2, {sae}, $src1",
+ OpcodeStr#_.Suffix, "$src2, {sae}, $src1",
"$src1, {sae}, $src2",
(OpNode (_.VT _.RC:$src1),
(i32 timm:$src2))>,
@@ -10030,18 +10123,19 @@ multiclass avx512_unary_fp_sae_packed_imm<bits<8> opc, string OpcodeStr,
multiclass avx512_common_unary_fp_sae_packed_imm<string OpcodeStr,
AVX512VLVectorVTInfo _, bits<8> opc, SDNode OpNode,
- SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode MaskOpNode, SDNode OpNodeSAE, X86SchedWriteWidths sched,
+ Predicate prd>{
let Predicates = [prd] in {
- defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.ZMM,
- _.info512>,
+ defm Z : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.ZMM, _.info512>,
avx512_unary_fp_sae_packed_imm<opc, OpcodeStr, OpNodeSAE,
sched.ZMM, _.info512>, EVEX_V512;
}
let Predicates = [prd, HasVLX] in {
- defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.XMM,
- _.info128>, EVEX_V128;
- defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, sched.YMM,
- _.info256>, EVEX_V256;
+ defm Z128 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.XMM, _.info128>, EVEX_V128;
+ defm Z256 : avx512_unary_fp_packed_imm<opc, OpcodeStr, OpNode, MaskOpNode,
+ sched.YMM, _.info256>, EVEX_V256;
}
}
@@ -10068,8 +10162,8 @@ multiclass avx512_fp_packed_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, i32u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (_.BroadcastLdFrag addr:$src2)),
(i32 timm:$src3))>, EVEX_B,
@@ -10111,8 +10205,8 @@ multiclass avx512_3Op_imm8<bits<8> opc, string OpcodeStr, SDNode OpNode,
let ExeDomain = _.ExeDomain in
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(OpNode (_.VT _.RC:$src1),
(_.VT (_.BroadcastLdFrag addr:$src2)),
(i8 timm:$src3))>, EVEX_B,
@@ -10135,7 +10229,7 @@ multiclass avx512_fp_scalar_imm<bits<8> opc, string OpcodeStr, SDNode OpNode,
(ins _.RC:$src1, _.IntScalarMemOp:$src2, i32u8imm:$src3),
OpcodeStr, "$src3, $src2, $src1", "$src1, $src2, $src3",
(OpNode (_.VT _.RC:$src1),
- (_.VT _.ScalarIntMemCPat:$src2),
+ (_.ScalarIntMemFrags addr:$src2),
(i32 timm:$src3))>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -10228,24 +10322,26 @@ multiclass avx512_common_fp_sae_scalar_imm<string OpcodeStr,
multiclass avx512_common_unary_fp_sae_packed_imm_all<string OpcodeStr,
bits<8> opcPs, bits<8> opcPd, SDNode OpNode,
- SDNode OpNodeSAE, X86SchedWriteWidths sched, Predicate prd>{
+ SDNode MaskOpNode, SDNode OpNodeSAE,
+ X86SchedWriteWidths sched, Predicate prd>{
defm PS : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f32_info,
- opcPs, OpNode, OpNodeSAE, sched, prd>,
+ opcPs, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<32, CD8VF>;
defm PD : avx512_common_unary_fp_sae_packed_imm<OpcodeStr, avx512vl_f64_info,
- opcPd, OpNode, OpNodeSAE, sched, prd>,
+ opcPd, OpNode, MaskOpNode, OpNodeSAE, sched, prd>,
EVEX_CD8<64, CD8VF>, VEX_W;
}
defm VREDUCE : avx512_common_unary_fp_sae_packed_imm_all<"vreduce", 0x56, 0x56,
- X86VReduce, X86VReduceSAE, SchedWriteFRnd, HasDQI>,
- AVX512AIi8Base, EVEX;
+ X86VReduce, X86VReduce, X86VReduceSAE,
+ SchedWriteFRnd, HasDQI>, AVX512AIi8Base, EVEX;
defm VRNDSCALE : avx512_common_unary_fp_sae_packed_imm_all<"vrndscale", 0x08, 0x09,
- X86any_VRndScale, X86VRndScaleSAE, SchedWriteFRnd, HasAVX512>,
+ X86any_VRndScale, X86VRndScale, X86VRndScaleSAE,
+ SchedWriteFRnd, HasAVX512>,
AVX512AIi8Base, EVEX;
defm VGETMANT : avx512_common_unary_fp_sae_packed_imm_all<"vgetmant", 0x26, 0x26,
- X86VGetMant, X86VGetMantSAE, SchedWriteFRnd, HasAVX512>,
- AVX512AIi8Base, EVEX;
+ X86VGetMant, X86VGetMant, X86VGetMantSAE,
+ SchedWriteFRnd, HasAVX512>, AVX512AIi8Base, EVEX;
defm VRANGEPD : avx512_common_fp_sae_packed_imm<"vrangepd", avx512vl_f64_info,
0x50, X86VRange, X86VRangeSAE,
@@ -10302,8 +10398,8 @@ multiclass avx512_shuff_packed_128_common<bits<8> opc, string OpcodeStr,
EVEX2VEXOverride<EVEX2VEXOvrd#"rm">;
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(_.VT
(bitconvert
(CastInfo.VT
@@ -10391,8 +10487,8 @@ multiclass avx512_valign<bits<8> opc, string OpcodeStr,
defm rmbi : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src1, _.ScalarMemOp:$src2, u8imm:$src3),
- OpcodeStr, "$src3, ${src2}"##_.BroadcastStr##", $src1",
- "$src1, ${src2}"##_.BroadcastStr##", $src3",
+ OpcodeStr, "$src3, ${src2}"#_.BroadcastStr#", $src1",
+ "$src1, ${src2}"#_.BroadcastStr#", $src3",
(X86VAlign _.RC:$src1,
(_.VT (_.BroadcastLdFrag addr:$src2)),
(i8 timm:$src3))>, EVEX_B,
@@ -10441,40 +10537,40 @@ def ValigndImm8XForm : SDNodeXForm<timm, [{
multiclass avx512_vpalign_mask_lowering<string OpcodeStr, SDNode OpNode,
X86VectorVTInfo From, X86VectorVTInfo To,
SDNodeXForm ImmXForm> {
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1, From.RC:$src2,
- timm:$src3))),
- To.RC:$src0)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ timm:$src3))),
+ To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rrik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1, From.RC:$src2,
- timm:$src3))),
- To.ImmAllZerosV)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1, From.RC:$src2,
+ timm:$src3))),
+ To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rrikz") To.KRCWM:$mask,
To.RC:$src1, To.RC:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (From.LdFrag addr:$src2),
- timm:$src3))),
- To.RC:$src0)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (From.LdFrag addr:$src2),
+ timm:$src3))),
+ To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (From.LdFrag addr:$src2),
- timm:$src3))),
- To.ImmAllZerosV)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (From.LdFrag addr:$src2),
+ timm:$src3))),
+ To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
@@ -10491,24 +10587,24 @@ multiclass avx512_vpalign_mask_lowering_mb<string OpcodeStr, SDNode OpNode,
(!cast<Instruction>(OpcodeStr#"rmbi") To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (bitconvert
- (To.VT (To.BroadcastLdFrag addr:$src2))),
- timm:$src3))),
- To.RC:$src0)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
+ To.RC:$src0)),
(!cast<Instruction>(OpcodeStr#"rmbik") To.RC:$src0, To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
- def : Pat<(To.VT (vselect To.KRCWM:$mask,
- (bitconvert
- (From.VT (OpNode From.RC:$src1,
- (bitconvert
- (To.VT (To.BroadcastLdFrag addr:$src2))),
- timm:$src3))),
- To.ImmAllZerosV)),
+ def : Pat<(To.VT (vselect_mask To.KRCWM:$mask,
+ (bitconvert
+ (From.VT (OpNode From.RC:$src1,
+ (bitconvert
+ (To.VT (To.BroadcastLdFrag addr:$src2))),
+ timm:$src3))),
+ To.ImmAllZerosV)),
(!cast<Instruction>(OpcodeStr#"rmbikz") To.KRCWM:$mask,
To.RC:$src1, addr:$src2,
(ImmXForm timm:$src3))>;
@@ -10567,8 +10663,8 @@ multiclass avx512_unary_rmb<bits<8> opc, string OpcodeStr, SDNode OpNode,
avx512_unary_rm<opc, OpcodeStr, OpNode, sched, _> {
defm rmb : AVX512_maskable<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.ScalarMemOp:$src1), OpcodeStr,
- "${src1}"##_.BroadcastStr,
- "${src1}"##_.BroadcastStr,
+ "${src1}"#_.BroadcastStr,
+ "${src1}"#_.BroadcastStr,
(_.VT (OpNode (_.VT (_.BroadcastLdFrag addr:$src1))))>,
EVEX, AVX5128IBase, EVEX_B, EVEX_CD8<_.EltSize, CD8VF>,
Sched<[sched.Folded]>;
@@ -10751,32 +10847,14 @@ defm VMOVDDUP : avx512_movddup<0x12, "vmovddup", X86Movddup, SchedWriteFShuffle>
let Predicates = [HasVLX] in {
def : Pat<(v2f64 (X86VBroadcast f64:$src)),
(VMOVDDUPZ128rr (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- (VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
- (VMOVDDUPZ128rm addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
- (v2f64 VR128X:$src0)),
+def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ (v2f64 VR128X:$src0)),
(VMOVDDUPZ128rrk VR128X:$src0, VK2WM:$mask,
(v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
- immAllZerosV),
+def : Pat<(vselect_mask (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast f64:$src)),
+ immAllZerosV),
(VMOVDDUPZ128rrkz VK2WM:$mask, (v2f64 (COPY_TO_REGCLASS FR64X:$src, VR128X)))>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
- (v2f64 VR128X:$src0)),
- (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcastld64 addr:$src)),
- immAllZerosV),
- (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
-
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- (v2f64 VR128X:$src0)),
- (VMOVDDUPZ128rmk VR128X:$src0, VK2WM:$mask, addr:$src)>;
-def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- immAllZerosV),
- (VMOVDDUPZ128rmkz VK2WM:$mask, addr:$src)>;
}
//===----------------------------------------------------------------------===//
@@ -10784,9 +10862,9 @@ def : Pat<(vselect (v2i1 VK2WM:$mask), (v2f64 (X86VBroadcast (v2f64 (simple_load
//===----------------------------------------------------------------------===//
let Uses = []<Register>, mayRaiseFPException = 0 in {
-defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, HasAVX512,
+defm VUNPCKH : avx512_fp_binop_p<0x15, "vunpckh", X86Unpckh, X86Unpckh, HasAVX512,
SchedWriteFShuffleSizes, 0, 1>;
-defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, HasAVX512,
+defm VUNPCKL : avx512_fp_binop_p<0x14, "vunpckl", X86Unpckl, X86Unpckl, HasAVX512,
SchedWriteFShuffleSizes>;
}
@@ -10945,16 +11023,15 @@ defm VSHUFPD: avx512_shufp<"vshufpd", avx512vl_i64_info, avx512vl_f64_info>, PD,
// AVX-512 - Byte shift Left/Right
//===----------------------------------------------------------------------===//
-// FIXME: The SSE/AVX names are PSLLDQri etc. - should we add the i here as well?
multiclass avx512_shift_packed<bits<8> opc, SDNode OpNode, Format MRMr,
Format MRMm, string OpcodeStr,
X86FoldableSchedWrite sched, X86VectorVTInfo _>{
- def rr : AVX512<opc, MRMr,
+ def ri : AVX512<opc, MRMr,
(outs _.RC:$dst), (ins _.RC:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode _.RC:$src1, (i8 timm:$src2))))]>,
Sched<[sched]>;
- def rm : AVX512<opc, MRMm,
+ def mi : AVX512<opc, MRMm,
(outs _.RC:$dst), (ins _.MemOp:$src1, u8imm:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set _.RC:$dst,(_.VT (OpNode
@@ -11106,8 +11183,8 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, u8imm:$src4),
- OpcodeStr, "$src4, ${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ OpcodeStr, "$src4, ${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr#", $src4",
(OpNode (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_.VT (_.BroadcastLdFrag addr:$src3)),
@@ -11117,12 +11194,12 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
}// Constraints = "$src1 = $dst"
// Additional patterns for matching passthru operand in other positions.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src3, _.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, _.RC:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1, _.RC:$src3, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rrik) _.RC:$src1, _.KRCWM:$mask,
@@ -11141,13 +11218,13 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching zero masking with loads in other
// positions.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmikz) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 timm:$src4)),
_.ImmAllZerosV)),
@@ -11156,31 +11233,31 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching masked loads with different
// operand orders.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(bitconvert (_.LdFrag addr:$src3)), (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (bitconvert (_.LdFrag addr:$src3)),
_.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
@@ -11200,14 +11277,14 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching zero masking with broadcasts in other
// positions.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (_.BroadcastLdFrag addr:$src3),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.ImmAllZerosV)),
(!cast<Instruction>(Name#_.ZSuffix#rmbikz) _.RC:$src1,
_.KRCWM:$mask, _.RC:$src2, addr:$src3,
(VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1,
(_.BroadcastLdFrag addr:$src3),
_.RC:$src2, (i8 timm:$src4)),
@@ -11218,32 +11295,32 @@ multiclass avx512_ternlog<bits<8> opc, string OpcodeStr, SDNode OpNode,
// Additional patterns for matching masked broadcasts with different
// operand orders.
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src1, (_.BroadcastLdFrag addr:$src3),
_.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG132_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (_.BroadcastLdFrag addr:$src3),
_.RC:$src2, _.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG321_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2, _.RC:$src1,
(_.BroadcastLdFrag addr:$src3),
(i8 timm:$src4)), _.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG213_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode _.RC:$src2,
(_.BroadcastLdFrag addr:$src3),
_.RC:$src1, (i8 timm:$src4)),
_.RC:$src1)),
(!cast<Instruction>(Name#_.ZSuffix#rmbik) _.RC:$src1, _.KRCWM:$mask,
_.RC:$src2, addr:$src3, (VPTERNLOG231_imm8 timm:$src4))>;
- def : Pat<(_.VT (vselect _.KRCWM:$mask,
+ def : Pat<(_.VT (vselect_mask _.KRCWM:$mask,
(OpNode (_.BroadcastLdFrag addr:$src3),
_.RC:$src1, _.RC:$src2, (i8 timm:$src4)),
_.RC:$src1)),
@@ -11288,6 +11365,36 @@ let Predicates = [HasVLX] in {
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i8 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i8 (X86vpternlog VR128X:$src1,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2, VR128X:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZ128rri VR128X:$src1, VR128X:$src2, VR128X:$src3,
@@ -11305,6 +11412,66 @@ let Predicates = [HasVLX] in {
(VPTERNLOGQZ128rmi VR128X:$src1, VR128X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i16 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i16 (X86vpternlog VR128X:$src1,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v4i32 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v4i32 (X86vpternlog (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v4i32 (X86vpternlog VR128X:$src1,
+ (bitconvert (v2i64 (X86VBroadcastld64 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v2i64 (X86vpternlog VR128X:$src1, VR128X:$src2,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v2i64 (X86vpternlog (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, VR128X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v2i64 (X86vpternlog VR128X:$src1,
+ (bitconvert (v4i32 (X86VBroadcastld32 addr:$src3))),
+ VR128X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ128rmbi VR128X:$src1, VR128X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
@@ -11322,6 +11489,36 @@ let Predicates = [HasVLX] in {
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i8 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i8 (X86vpternlog VR256X:$src1,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2, VR256X:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZ256rri VR256X:$src1, VR256X:$src2, VR256X:$src3,
@@ -11338,6 +11535,66 @@ let Predicates = [HasVLX] in {
VR256X:$src2, (i8 timm:$src4))),
(VPTERNLOGQZ256rmi VR256X:$src1, VR256X:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i16 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i16 (X86vpternlog VR256X:$src1,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i32 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i32 (X86vpternlog (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i32 (X86vpternlog VR256X:$src1,
+ (bitconvert (v4i64 (X86VBroadcastld64 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v4i64 (X86vpternlog VR256X:$src1, VR256X:$src2,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v4i64 (X86vpternlog (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, VR256X:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v4i64 (X86vpternlog VR256X:$src1,
+ (bitconvert (v8i32 (X86VBroadcastld32 addr:$src3))),
+ VR256X:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZ256rmbi VR256X:$src1, VR256X:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
}
let Predicates = [HasAVX512] in {
@@ -11358,6 +11615,36 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v64i8 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v64i8 (X86vpternlog VR512:$src1,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2, VR512:$src3,
(i8 timm:$src4))),
(VPTERNLOGQZrri VR512:$src1, VR512:$src2, VR512:$src3,
@@ -11371,9 +11658,84 @@ let Predicates = [HasAVX512] in {
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
(VPTERNLOG321_imm8 timm:$src4))>;
def : Pat<(v32i16 (X86vpternlog VR512:$src1, (loadv32i16 addr:$src3),
- VR512:$src2, (i8 timm:$src4))),
+ VR512:$src2, (i8 timm:$src4))),
(VPTERNLOGQZrmi VR512:$src1, VR512:$src2, addr:$src3,
(VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v32i16 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v32i16 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v16i32 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v16i32 (X86vpternlog (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v16i32 (X86vpternlog VR512:$src1,
+ (bitconvert (v8i64 (X86VBroadcastld64 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGQZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
+
+ def : Pat<(v8i64 (X86vpternlog VR512:$src1, VR512:$src2,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ timm:$src4)>;
+ def : Pat<(v8i64 (X86vpternlog (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, VR512:$src1, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG321_imm8 timm:$src4))>;
+ def : Pat<(v8i64 (X86vpternlog VR512:$src1,
+ (bitconvert (v16i32 (X86VBroadcastld32 addr:$src3))),
+ VR512:$src2, (i8 timm:$src4))),
+ (VPTERNLOGDZrmbi VR512:$src1, VR512:$src2, addr:$src3,
+ (VPTERNLOG132_imm8 timm:$src4))>;
}
// Patterns to implement vnot using vpternlog instead of creating all ones
@@ -11484,14 +11846,14 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
Uses = [MXCSR], mayRaiseFPException = 1 in {
defm rri : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT _.RC:$src3),
(i32 timm:$src4))>, Sched<[sched]>;
defm rmi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.MemOp:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT (bitconvert (TblVT.LdFrag addr:$src3))),
@@ -11499,8 +11861,8 @@ multiclass avx512_fixupimm_packed<bits<8> opc, string OpcodeStr,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmbi : AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, ${src3}"##_.BroadcastStr##", $src2",
- "$src2, ${src3}"##_.BroadcastStr##", $src4",
+ OpcodeStr#_.Suffix, "$src4, ${src3}"#_.BroadcastStr#", $src2",
+ "$src2, ${src3}"#_.BroadcastStr#", $src4",
(X86VFixupimm (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(TblVT.VT (TblVT.BroadcastLdFrag addr:$src3)),
@@ -11516,7 +11878,7 @@ multiclass avx512_fixupimm_packed_sae<bits<8> opc, string OpcodeStr,
let Constraints = "$src1 = $dst", ExeDomain = _.ExeDomain, Uses = [MXCSR] in {
defm rrib : AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
(X86VFixupimmSAE (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
@@ -11533,7 +11895,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
ExeDomain = _.ExeDomain in {
defm rri : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT _src3VT.RC:$src3),
@@ -11541,7 +11903,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
let Uses = [MXCSR] in
defm rrib : AVX512_maskable_3src_scalar<opc, MRMSrcReg, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.RC:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, {sae}, $src3, $src2",
+ OpcodeStr#_.Suffix, "$src4, {sae}, $src3, $src2",
"$src2, $src3, {sae}, $src4",
(X86VFixupimmSAEs (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
@@ -11550,7 +11912,7 @@ multiclass avx512_fixupimm_scalar<bits<8> opc, string OpcodeStr,
EVEX_B, Sched<[sched.Folded, sched.ReadAfterFold]>;
defm rmi : AVX512_maskable_3src_scalar<opc, MRMSrcMem, _, (outs _.RC:$dst),
(ins _.RC:$src2, _.ScalarMemOp:$src3, i32u8imm:$src4),
- OpcodeStr##_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
+ OpcodeStr#_.Suffix, "$src4, $src3, $src2", "$src2, $src3, $src4",
(X86VFixupimms (_.VT _.RC:$src1),
(_.VT _.RC:$src2),
(_src3VT.VT (scalar_to_vector
@@ -11630,8 +11992,9 @@ defm VFIXUPIMMPD : avx512_fixupimm_packed_all<SchedWriteFAdd, avx512vl_f64_info,
// TODO: Some canonicalization in lowering would simplify the number of
// patterns we have to try to match.
-multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode MoveNode,
- X86VectorVTInfo _, PatLeaf ZeroFP> {
+multiclass AVX512_scalar_math_fp_patterns<SDNode Op, SDNode MaskedOp,
+ string OpcPrefix, SDNode MoveNode,
+ X86VectorVTInfo _, PatLeaf ZeroFP> {
let Predicates = [HasAVX512] in {
// extracted scalar math op with insert via movss
def : Pat<(MoveNode
@@ -11639,79 +12002,79 @@ multiclass AVX512_scalar_math_fp_patterns<SDNode Op, string OpcPrefix, SDNode Mo
(_.VT (scalar_to_vector
(Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
_.FRC:$src)))),
- (!cast<Instruction>("V"#OpcPrefix#Zrr_Int) _.VT:$dst,
+ (!cast<Instruction>("V"#OpcPrefix#"Zrr_Int") _.VT:$dst,
(_.VT (COPY_TO_REGCLASS _.FRC:$src, VR128X)))>;
def : Pat<(MoveNode
(_.VT VR128X:$dst),
(_.VT (scalar_to_vector
(Op (_.EltVT (extractelt (_.VT VR128X:$dst), (iPTR 0))),
(_.ScalarLdFrag addr:$src))))),
- (!cast<Instruction>("V"#OpcPrefix#Zrm_Int) _.VT:$dst, addr:$src)>;
+ (!cast<Instruction>("V"#OpcPrefix#"Zrm_Int") _.VT:$dst, addr:$src)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src2),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2),
_.FRC:$src0))),
- (!cast<Instruction>("V"#OpcPrefix#Zrr_Intk)
+ (!cast<Instruction>("V"#OpcPrefix#"Zrr_Intk")
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src2)),
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)),
_.FRC:$src0))),
- (!cast<Instruction>("V"#OpcPrefix#Zrm_Intk)
+ (!cast<Instruction>("V"#OpcPrefix#"Zrm_Intk")
(_.VT (COPY_TO_REGCLASS _.FRC:$src0, VR128X)),
VK1WM:$mask, _.VT:$src1, addr:$src2)>;
// extracted masked scalar math op with insert via movss
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- _.FRC:$src2), (_.EltVT ZeroFP)))),
- (!cast<I>("V"#OpcPrefix#Zrr_Intkz)
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ _.FRC:$src2), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#"Zrr_Intkz")
VK1WM:$mask, _.VT:$src1,
(_.VT (COPY_TO_REGCLASS _.FRC:$src2, VR128X)))>;
def : Pat<(MoveNode (_.VT VR128X:$src1),
(scalar_to_vector
- (X86selects VK1WM:$mask,
- (Op (_.EltVT
- (extractelt (_.VT VR128X:$src1), (iPTR 0))),
- (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
- (!cast<I>("V"#OpcPrefix#Zrm_Intkz) VK1WM:$mask, _.VT:$src1, addr:$src2)>;
+ (X86selects_mask VK1WM:$mask,
+ (MaskedOp (_.EltVT
+ (extractelt (_.VT VR128X:$src1), (iPTR 0))),
+ (_.ScalarLdFrag addr:$src2)), (_.EltVT ZeroFP)))),
+ (!cast<I>("V"#OpcPrefix#"Zrm_Intkz") VK1WM:$mask, _.VT:$src1, addr:$src2)>;
}
}
-defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSS", X86Movss, v4f32x_info, fp32imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSS", X86Movss, v4f32x_info, fp32imm0>;
-defm : AVX512_scalar_math_fp_patterns<fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
-defm : AVX512_scalar_math_fp_patterns<fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
-defm : AVX512_scalar_math_fp_patterns<fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
-defm : AVX512_scalar_math_fp_patterns<fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fadd, fadd, "ADDSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fsub, fsub, "SUBSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fmul, fmul, "MULSD", X86Movsd, v2f64x_info, fp64imm0>;
+defm : AVX512_scalar_math_fp_patterns<any_fdiv, fdiv, "DIVSD", X86Movsd, v2f64x_info, fp64imm0>;
multiclass AVX512_scalar_unary_math_patterns<SDNode OpNode, string OpcPrefix,
SDNode Move, X86VectorVTInfo _> {
let Predicates = [HasAVX512] in {
def : Pat<(_.VT (Move _.VT:$dst,
(scalar_to_vector (OpNode (extractelt _.VT:$src, 0))))),
- (!cast<Instruction>("V"#OpcPrefix#Zr_Int) _.VT:$dst, _.VT:$src)>;
+ (!cast<Instruction>("V"#OpcPrefix#"Zr_Int") _.VT:$dst, _.VT:$src)>;
}
}
-defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
-defm : AVX512_scalar_unary_math_patterns<fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSS", X86Movss, v4f32x_info>;
+defm : AVX512_scalar_unary_math_patterns<any_fsqrt, "SQRTSD", X86Movsd, v2f64x_info>;
//===----------------------------------------------------------------------===//
// AES instructions
@@ -11724,13 +12087,13 @@ multiclass avx512_vaes<bits<8> Op, string OpStr, string IntPrefix> {
loadv2i64, 0, VR128X, i128mem>,
EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V128, VEX_WIG;
defm Z256 : AESI_binop_rm_int<Op, OpStr,
- !cast<Intrinsic>(IntPrefix##"_256"),
+ !cast<Intrinsic>(IntPrefix#"_256"),
loadv4i64, 0, VR256X, i256mem>,
EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V256, VEX_WIG;
}
let Predicates = [HasAVX512, HasVAES] in
defm Z : AESI_binop_rm_int<Op, OpStr,
- !cast<Intrinsic>(IntPrefix##"_512"),
+ !cast<Intrinsic>(IntPrefix#"_512"),
loadv8i64, 0, VR512, i512mem>,
EVEX_4V, EVEX_CD8<64, CD8VF>, EVEX_V512, VEX_WIG;
}
@@ -11792,8 +12155,8 @@ multiclass VBMI2_shift_var_rmb<bits<8> Op, string OpStr, SDNode OpNode,
ExeDomain = VTI.ExeDomain in
defm mb: AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3), OpStr,
- "${src3}"##VTI.BroadcastStr##", $src2",
- "$src2, ${src3}"##VTI.BroadcastStr,
+ "${src3}"#VTI.BroadcastStr#", $src2",
+ "$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
AVX512FMA3Base, EVEX_B,
@@ -11827,22 +12190,22 @@ multiclass VBMI2_shift_var_rmb_common<bits<8> Op, string OpStr, SDNode OpNode,
}
multiclass VBMI2_shift_var<bits<8> wOp, bits<8> dqOp, string Prefix,
SDNode OpNode, X86SchedWriteWidths sched> {
- defm W : VBMI2_shift_var_rm_common<wOp, Prefix##"w", OpNode, sched,
+ defm W : VBMI2_shift_var_rm_common<wOp, Prefix#"w", OpNode, sched,
avx512vl_i16_info>, VEX_W, EVEX_CD8<16, CD8VF>;
- defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix##"d", OpNode, sched,
+ defm D : VBMI2_shift_var_rmb_common<dqOp, Prefix#"d", OpNode, sched,
avx512vl_i32_info>, EVEX_CD8<32, CD8VF>;
- defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix##"q", OpNode, sched,
+ defm Q : VBMI2_shift_var_rmb_common<dqOp, Prefix#"q", OpNode, sched,
avx512vl_i64_info>, VEX_W, EVEX_CD8<64, CD8VF>;
}
multiclass VBMI2_shift_imm<bits<8> wOp, bits<8> dqOp, string Prefix,
SDNode OpNode, X86SchedWriteWidths sched> {
- defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix##"w", sched,
+ defm W : avx512_common_3Op_rm_imm8<wOp, OpNode, Prefix#"w", sched,
avx512vl_i16_info, avx512vl_i16_info, HasVBMI2>,
VEX_W, EVEX_CD8<16, CD8VF>;
- defm D : avx512_common_3Op_imm8<Prefix##"d", avx512vl_i32_info, dqOp,
+ defm D : avx512_common_3Op_imm8<Prefix#"d", avx512vl_i32_info, dqOp,
OpNode, sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<32, CD8VF>;
- defm Q : avx512_common_3Op_imm8<Prefix##"q", avx512vl_i64_info, dqOp, OpNode,
+ defm Q : avx512_common_3Op_imm8<Prefix#"q", avx512vl_i64_info, dqOp, OpNode,
sched, HasVBMI2>, AVX512AIi8Base, EVEX_4V, EVEX_CD8<64, CD8VF>, VEX_W;
}
@@ -11890,8 +12253,8 @@ multiclass VNNI_rmb<bits<8> Op, string OpStr, SDNode OpNode,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb : AVX512_maskable_3src<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src2, VTI.ScalarMemOp:$src3),
- OpStr, "${src3}"##VTI.BroadcastStr##", $src2",
- "$src2, ${src3}"##VTI.BroadcastStr,
+ OpStr, "${src3}"#VTI.BroadcastStr#", $src2",
+ "$src2, ${src3}"#VTI.BroadcastStr,
(OpNode VTI.RC:$src1, VTI.RC:$src2,
(VTI.VT (VTI.BroadcastLdFrag addr:$src3)))>,
EVEX_4V, EVEX_CD8<32, CD8VF>, EVEX_B,
@@ -12027,8 +12390,8 @@ multiclass GF2P8AFFINE_avx512_rmb_imm<bits<8> Op, string OpStr, SDNode OpNode,
let ExeDomain = VTI.ExeDomain in
defm rmbi : AVX512_maskable<Op, MRMSrcMem, VTI, (outs VTI.RC:$dst),
(ins VTI.RC:$src1, VTI.ScalarMemOp:$src2, u8imm:$src3),
- OpStr, "$src3, ${src2}"##BcstVTI.BroadcastStr##", $src1",
- "$src1, ${src2}"##BcstVTI.BroadcastStr##", $src3",
+ OpStr, "$src3, ${src2}"#BcstVTI.BroadcastStr#", $src1",
+ "$src1, ${src2}"#BcstVTI.BroadcastStr#", $src3",
(OpNode (VTI.VT VTI.RC:$src1),
(bitconvert (BcstVTI.VT (X86VBroadcastld64 addr:$src2))),
(i8 timm:$src3))>, EVEX_B,
@@ -12184,41 +12547,44 @@ multiclass avx512_binop_all2<bits<8> opc, string OpcodeStr,
}
}
+let ExeDomain = SSEPackedSingle in
defm VCVTNE2PS2BF16 : avx512_binop_all2<0x72, "vcvtne2ps2bf16",
- SchedWriteCvtPD2PS, //FIXME: Shoulod be SchedWriteCvtPS2BF
+ SchedWriteCvtPD2PS, //FIXME: Should be SchedWriteCvtPS2BF
avx512vl_f32_info, avx512vl_i16_info,
X86cvtne2ps2bf16, HasBF16, 0>, T8XD;
// Truncate Float to BFloat16
multiclass avx512_cvtps2bf16<bits<8> opc, string OpcodeStr,
X86SchedWriteWidths sched> {
+ let ExeDomain = SSEPackedSingle in {
let Predicates = [HasBF16], Uses = []<Register>, mayRaiseFPException = 0 in {
defm Z : avx512_vcvt_fp<opc, OpcodeStr, v16i16x_info, v16f32_info,
- X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
+ X86cvtneps2bf16, X86cvtneps2bf16, sched.ZMM>, EVEX_V512;
}
let Predicates = [HasBF16, HasVLX] in {
let Uses = []<Register>, mayRaiseFPException = 0 in {
defm Z128 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v4f32x_info,
- null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
+ null_frag, null_frag, sched.XMM, "{1to4}", "{x}", f128mem,
VK4WM>, EVEX_V128;
defm Z256 : avx512_vcvt_fp<opc, OpcodeStr, v8i16x_info, v8f32x_info,
- X86cvtneps2bf16,
+ X86cvtneps2bf16, X86cvtneps2bf16,
sched.YMM, "{1to8}", "{y}">, EVEX_V256;
}
+ } // Predicates = [HasBF16, HasVLX]
+ } // ExeDomain = SSEPackedSingle
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
- VR128X:$src), 0>;
- def : InstAlias<OpcodeStr##"x\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
- f128mem:$src), 0, "intel">;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
- VR256X:$src), 0>;
- def : InstAlias<OpcodeStr##"y\t{$src, $dst|$dst, $src}",
- (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
- f256mem:$src), 0, "intel">;
- }
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rr") VR128X:$dst,
+ VR128X:$src), 0>;
+ def : InstAlias<OpcodeStr#"x\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z128rm") VR128X:$dst,
+ f128mem:$src), 0, "intel">;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rr") VR128X:$dst,
+ VR256X:$src), 0>;
+ def : InstAlias<OpcodeStr#"y\t{$src, $dst|$dst, $src}",
+ (!cast<Instruction>(NAME # "Z256rm") VR128X:$dst,
+ f256mem:$src), 0, "intel">;
}
defm VCVTNEPS2BF16 : avx512_cvtps2bf16<0x72, "vcvtneps2bf16",
@@ -12262,25 +12628,24 @@ multiclass avx512_dpbf16ps_rm<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86FoldableSchedWrite sched,
X86VectorVTInfo _, X86VectorVTInfo src_v> {
defm r: AVX512_maskable_3src<opc, MRMSrcReg, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.RC:$src3),
+ (ins src_v.RC:$src2, src_v.RC:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2, _.RC:$src3))>,
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2, src_v.RC:$src3))>,
EVEX_4V, Sched<[sched]>;
defm m: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.MemOp:$src3),
+ (ins src_v.RC:$src2, src_v.MemOp:$src3),
OpcodeStr, "$src3, $src2", "$src2, $src3",
- (_.VT (OpNode _.RC:$src1, _.RC:$src2,
- (src_v.VT (bitconvert
- (src_v.LdFrag addr:$src3)))))>, EVEX_4V,
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
+ (src_v.LdFrag addr:$src3)))>, EVEX_4V,
Sched<[sched.Folded, sched.ReadAfterFold]>;
defm mb: AVX512_maskable_3src<opc, MRMSrcMem, _, (outs _.RC:$dst),
- (ins _.RC:$src2, _.ScalarMemOp:$src3),
+ (ins src_v.RC:$src2, src_v.ScalarMemOp:$src3),
OpcodeStr,
!strconcat("${src3}", _.BroadcastStr,", $src2"),
!strconcat("$src2, ${src3}", _.BroadcastStr),
- (_.VT (OpNode _.RC:$src1, _.RC:$src2,
+ (_.VT (OpNode _.RC:$src1, src_v.RC:$src2,
(src_v.VT (src_v.BroadcastLdFrag addr:$src3))))>,
EVEX_B, EVEX_4V, Sched<[sched.Folded, sched.ReadAfterFold]>;
@@ -12302,6 +12667,7 @@ multiclass avx512_dpbf16ps_sizes<bits<8> opc, string OpcodeStr, SDNode OpNode,
}
}
+let ExeDomain = SSEPackedSingle in
defm VDPBF16PS : avx512_dpbf16ps_sizes<0x52, "vdpbf16ps", X86dpbf16ps, SchedWriteFMA,
avx512vl_f32_info, avx512vl_i32_info,
HasBF16>, T8XS, EVEX_CD8<32, CD8VF>;
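As a rough illustration of the data these BF16 patterns operate on (a sketch, not taken from the patch): bfloat16 keeps only the upper 16 bits of an IEEE-754 binary32, so conversion is a shift plus rounding, and vdpbf16ps accumulates pairwise bf16 products into f32 lanes. The helper names below are invented for illustration.

#include <cstdint>
#include <cstring>

// Round-to-nearest-even float -> bf16 truncation; NaN handling is simplified.
static uint16_t floatToBF16(float F) {
  uint32_t Bits;
  std::memcpy(&Bits, &F, sizeof(Bits));
  uint32_t Bias = 0x7FFF + ((Bits >> 16) & 1);
  return static_cast<uint16_t>((Bits + Bias) >> 16);
}

// bf16 -> float is exact: place the 16 bits in the high half of a binary32.
static float bf16ToFloat(uint16_t H) {
  uint32_t Bits = static_cast<uint32_t>(H) << 16;
  float F;
  std::memcpy(&F, &Bits, sizeof(F));
  return F;
}

// One lane of a vdpbf16ps-style dot product: Acc += a0*b0 + a1*b1 in f32.
static float dpbf16Lane(float Acc, uint16_t A0, uint16_t A1,
                        uint16_t B0, uint16_t B1) {
  return Acc + bf16ToFloat(A0) * bf16ToFloat(B0) +
               bf16ToFloat(A1) * bf16ToFloat(B1);
}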
diff --git a/llvm/lib/Target/X86/X86InstrArithmetic.td b/llvm/lib/Target/X86/X86InstrArithmetic.td
index 1e399a894490..f7f22285bd15 100644
--- a/llvm/lib/Target/X86/X86InstrArithmetic.td
+++ b/llvm/lib/Target/X86/X86InstrArithmetic.td
@@ -605,16 +605,16 @@ def invalid_node : SDNode<"<<invalid_node>>", SDTIntLeaf,[],"<<invalid_node>>">;
def Xi8 : X86TypeInfo<i8, "b", GR8, loadi8, i8mem,
- Imm8, i8imm, relocImm8_su, i8imm, invalid_node,
+ Imm8, i8imm, imm_su, i8imm, invalid_node,
0, OpSizeFixed, 0>;
def Xi16 : X86TypeInfo<i16, "w", GR16, loadi16, i16mem,
- Imm16, i16imm, relocImm16_su, i16i8imm, i16immSExt8_su,
+ Imm16, i16imm, imm_su, i16i8imm, i16immSExt8_su,
1, OpSize16, 0>;
def Xi32 : X86TypeInfo<i32, "l", GR32, loadi32, i32mem,
- Imm32, i32imm, relocImm32_su, i32i8imm, i32immSExt8_su,
+ Imm32, i32imm, imm_su, i32i8imm, i32immSExt8_su,
1, OpSize32, 0>;
def Xi64 : X86TypeInfo<i64, "q", GR64, loadi64, i64mem,
- Imm32S, i64i32imm, i64relocImmSExt32_su, i64i8imm, i64immSExt8_su,
+ Imm32S, i64i32imm, i64immSExt32_su, i64i8imm, i64immSExt8_su,
1, OpSizeFixed, 1>;
/// ITy - This instruction base class takes the type info for the instruction.
@@ -1217,6 +1217,146 @@ def : Pat<(store (X86adc_flag GR64:$src, (loadi64 addr:$dst), EFLAGS),
addr:$dst),
(ADC64mr addr:$dst, GR64:$src)>;
+// Patterns for basic arithmetic ops with relocImm for the immediate field.
+multiclass ArithBinOp_RF_relocImm_Pats<SDNode OpNodeFlag, SDNode OpNode> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(store (OpNode (load addr:$dst), relocImm8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i16relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), relocImm16_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i32relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), relocImm32_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt8_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNode (load addr:$dst), i64relocImmSExt32_su:$src), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>;
+}
+
+multiclass ArithBinOp_RFF_relocImm_Pats<SDNode OpNodeFlag> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2, EFLAGS),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"8mi") addr:$dst, relocImm8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i16relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi8") addr:$dst, i16relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm16_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"16mi") addr:$dst, relocImm16_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i32relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi8") addr:$dst, i32relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), relocImm32_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"32mi") addr:$dst, relocImm32_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt8_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi8") addr:$dst, i64relocImmSExt8_su:$src)>;
+ def : Pat<(store (OpNodeFlag (load addr:$dst), i64relocImmSExt32_su:$src, EFLAGS), addr:$dst),
+ (!cast<Instruction>(NAME#"64mi32") addr:$dst, i64relocImmSExt32_su:$src)>;
+}
+
+multiclass ArithBinOp_F_relocImm_Pats<SDNode OpNodeFlag> {
+ def : Pat<(OpNodeFlag GR8:$src1, relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8ri") GR8:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16ri8") GR16:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR16:$src1, relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16ri") GR16:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32ri8") GR32:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR32:$src1, relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32ri") GR32:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64ri8") GR64:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag GR64:$src1, i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64ri32") GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+ def : Pat<(OpNodeFlag (loadi8 addr:$src1), relocImm8_su:$src2),
+ (!cast<Instruction>(NAME#"8mi") addr:$src1, relocImm8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi16 addr:$src1), i16relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"16mi8") addr:$src1, i16relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi16 addr:$src1), relocImm16_su:$src2),
+ (!cast<Instruction>(NAME#"16mi") addr:$src1, relocImm16_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi32 addr:$src1), i32relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"32mi8") addr:$src1, i32relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi32 addr:$src1), relocImm32_su:$src2),
+ (!cast<Instruction>(NAME#"32mi") addr:$src1, relocImm32_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt8_su:$src2),
+ (!cast<Instruction>(NAME#"64mi8") addr:$src1, i64relocImmSExt8_su:$src2)>;
+ def : Pat<(OpNodeFlag (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
+ (!cast<Instruction>(NAME#"64mi32") addr:$src1, i64relocImmSExt32_su:$src2)>;
+}
+
+defm AND : ArithBinOp_RF_relocImm_Pats<X86and_flag, and>;
+defm OR : ArithBinOp_RF_relocImm_Pats<X86or_flag, or>;
+defm XOR : ArithBinOp_RF_relocImm_Pats<X86xor_flag, xor>;
+defm ADD : ArithBinOp_RF_relocImm_Pats<X86add_flag, add>;
+defm SUB : ArithBinOp_RF_relocImm_Pats<X86sub_flag, sub>;
+
+defm ADC : ArithBinOp_RFF_relocImm_Pats<X86adc_flag>;
+defm SBB : ArithBinOp_RFF_relocImm_Pats<X86sbb_flag>;
+
+defm CMP : ArithBinOp_F_relocImm_Pats<X86cmp>;
+
+// ADC is commutable, but we can't indicate that to tablegen. So manually
+// reverse the operands.
+def : Pat<(X86adc_flag GR8:$src1, relocImm8_su:$src2, EFLAGS),
+ (ADC8ri relocImm8_su:$src2, GR8:$src1)>;
+def : Pat<(X86adc_flag i16relocImmSExt8_su:$src2, GR16:$src1, EFLAGS),
+ (ADC16ri8 GR16:$src1, i16relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag relocImm16_su:$src2, GR16:$src1, EFLAGS),
+ (ADC16ri GR16:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86adc_flag i32relocImmSExt8_su:$src2, GR32:$src1, EFLAGS),
+ (ADC32ri8 GR32:$src1, i32relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag relocImm32_su:$src2, GR32:$src1, EFLAGS),
+ (ADC32ri GR32:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86adc_flag i64relocImmSExt8_su:$src2, GR64:$src1, EFLAGS),
+ (ADC64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
+def : Pat<(X86adc_flag i64relocImmSExt32_su:$src2, GR64:$src1, EFLAGS),
+ (ADC64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+def : Pat<(store (X86adc_flag relocImm8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC8mi addr:$dst, relocImm8_su:$src)>;
+def : Pat<(store (X86adc_flag i16relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC16mi8 addr:$dst, i16relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag relocImm16_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC16mi addr:$dst, relocImm16_su:$src)>;
+def : Pat<(store (X86adc_flag i32relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC32mi8 addr:$dst, i32relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag relocImm32_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC32mi addr:$dst, relocImm32_su:$src)>;
+def : Pat<(store (X86adc_flag i64relocImmSExt8_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC64mi8 addr:$dst, i64relocImmSExt8_su:$src)>;
+def : Pat<(store (X86adc_flag i64relocImmSExt32_su:$src, (load addr:$dst), EFLAGS), addr:$dst),
+ (ADC64mi32 addr:$dst, i64relocImmSExt32_su:$src)>;
+
//===----------------------------------------------------------------------===//
// Semantically, test instructions are similar to AND, except they don't
// generate a result. From an encoding perspective, they are very different:
@@ -1247,7 +1387,6 @@ let isCompare = 1 in {
def TEST8ri : BinOpRI_F<0xF6, "test", Xi8 , X86testpat, MRM0r>;
def TEST16ri : BinOpRI_F<0xF6, "test", Xi16, X86testpat, MRM0r>;
def TEST32ri : BinOpRI_F<0xF6, "test", Xi32, X86testpat, MRM0r>;
- let Predicates = [In64BitMode] in
def TEST64ri32 : BinOpRI_F<0xF6, "test", Xi64, X86testpat, MRM0r>;
def TEST8mi : BinOpMI_F<0xF6, "test", Xi8 , X86testpat, MRM0m>;
@@ -1267,6 +1406,25 @@ let isCompare = 1 in {
"{$src, %rax|rax, $src}">;
} // isCompare
+// Patterns to match a relocImm into the immediate field.
+def : Pat<(X86testpat GR8:$src1, relocImm8_su:$src2),
+ (TEST8ri GR8:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86testpat GR16:$src1, relocImm16_su:$src2),
+ (TEST16ri GR16:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86testpat GR32:$src1, relocImm32_su:$src2),
+ (TEST32ri GR32:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86testpat GR64:$src1, i64relocImmSExt32_su:$src2),
+ (TEST64ri32 GR64:$src1, i64relocImmSExt32_su:$src2)>;
+
+def : Pat<(X86testpat (loadi8 addr:$src1), relocImm8_su:$src2),
+ (TEST8mi addr:$src1, relocImm8_su:$src2)>;
+def : Pat<(X86testpat (loadi16 addr:$src1), relocImm16_su:$src2),
+ (TEST16mi addr:$src1, relocImm16_su:$src2)>;
+def : Pat<(X86testpat (loadi32 addr:$src1), relocImm32_su:$src2),
+ (TEST32mi addr:$src1, relocImm32_su:$src2)>;
+def : Pat<(X86testpat (loadi64 addr:$src1), i64relocImmSExt32_su:$src2),
+ (TEST64mi32 addr:$src1, i64relocImmSExt32_su:$src2)>;
+
//===----------------------------------------------------------------------===//
// ANDN Instruction
//
@@ -1306,7 +1464,6 @@ let Predicates = [HasBMI], AddedComplexity = -6 in {
multiclass bmi_mulx<string mnemonic, RegisterClass RC, X86MemOperand x86memop,
X86FoldableSchedWrite sched> {
let hasSideEffects = 0 in {
- let isCommutable = 1 in
def rr : I<0xF6, MRMSrcReg, (outs RC:$dst1, RC:$dst2), (ins RC:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
[]>, T8XD, VEX_4V, Sched<[sched, WriteIMulH]>;
@@ -1314,7 +1471,17 @@ let hasSideEffects = 0 in {
let mayLoad = 1 in
def rm : I<0xF6, MRMSrcMem, (outs RC:$dst1, RC:$dst2), (ins x86memop:$src),
!strconcat(mnemonic, "\t{$src, $dst2, $dst1|$dst1, $dst2, $src}"),
+
[]>, T8XD, VEX_4V, Sched<[sched.Folded, WriteIMulH]>;
+
+ // Pseudo instructions to be used when the low result isn't used. The
+ // instruction is defined to keep the high if both destinations are the same.
+ def Hrr : PseudoI<(outs RC:$dst), (ins RC:$src),
+ []>, Sched<[sched]>;
+
+ let mayLoad = 1 in
+ def Hrm : PseudoI<(outs RC:$dst), (ins x86memop:$src),
+ []>, Sched<[sched.Folded]>;
}
}
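To make the new Hrr/Hrm pseudos easier to read: MULX performs a widening unsigned multiply without updating EFLAGS and writes both halves of the product; the pseudos cover the case where only the high half is consumed. A small portable C++ sketch (32x32 -> 64 to avoid 128-bit types; names are illustrative only):

#include <cstdint>

struct MulxResult {
  uint32_t Lo; // what the first MULX destination receives
  uint32_t Hi; // what the second MULX destination receives
};

// Flag-less widening multiply, as MULX32rr computes it.
static MulxResult mulx32(uint32_t A, uint32_t B) {
  uint64_t Product = static_cast<uint64_t>(A) * B;
  return {static_cast<uint32_t>(Product), static_cast<uint32_t>(Product >> 32)};
}

// The *Hrr pseudo models this case: keep only the high half of the product.
static uint32_t mulx32High(uint32_t A, uint32_t B) { return mulx32(A, B).Hi; }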
diff --git a/llvm/lib/Target/X86/X86InstrBuilder.h b/llvm/lib/Target/X86/X86InstrBuilder.h
index aa45e9b191c1..07079ef87fd4 100644
--- a/llvm/lib/Target/X86/X86InstrBuilder.h
+++ b/llvm/lib/Target/X86/X86InstrBuilder.h
@@ -207,7 +207,7 @@ addFrameReference(const MachineInstrBuilder &MIB, int FI, int Offset = 0) {
Flags |= MachineMemOperand::MOStore;
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getFixedStack(MF, FI, Offset), Flags,
- MFI.getObjectSize(FI), MFI.getObjectAlignment(FI));
+ MFI.getObjectSize(FI), MFI.getObjectAlign(FI));
return addOffset(MIB.addFrameIndex(FI), Offset)
.addMemOperand(MMO);
}
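The getObjectAlignment -> getObjectAlign change swaps a raw unsigned for LLVM's Align type. A minimal sketch of the idea behind such a wrapper (this is not LLVM's actual class, only the invariant it enforces):

#include <cassert>
#include <cstdint>

// A toy Align-like wrapper: the stored value is a non-zero power of two by
// construction, so callers can no longer pass 0 or a bogus alignment.
class AlignSketch {
  uint64_t Value;

public:
  explicit AlignSketch(uint64_t V) : Value(V) {
    assert(V != 0 && (V & (V - 1)) == 0 && "alignment must be a power of two");
  }
  uint64_t value() const { return Value; }
};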
diff --git a/llvm/lib/Target/X86/X86InstrCompiler.td b/llvm/lib/Target/X86/X86InstrCompiler.td
index 78d8dd3c0d03..4df93fb2ed60 100644
--- a/llvm/lib/Target/X86/X86InstrCompiler.td
+++ b/llvm/lib/Target/X86/X86InstrCompiler.td
@@ -111,8 +111,30 @@ def SEG_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
[(set GR64:$dst,
(X86SegAlloca GR64:$size))]>,
Requires<[In64BitMode]>;
+
+// To protect against stack clash, dynamic allocation should perform a memory
+// probe at each page.
+
+let Defs = [EAX, ESP, EFLAGS], Uses = [ESP] in
+def PROBED_ALLOCA_32 : I<0, Pseudo, (outs GR32:$dst), (ins GR32:$size),
+ "# variable sized alloca with probing",
+ [(set GR32:$dst,
+ (X86ProbedAlloca GR32:$size))]>,
+ Requires<[NotLP64]>;
+
+let Defs = [RAX, RSP, EFLAGS], Uses = [RSP] in
+def PROBED_ALLOCA_64 : I<0, Pseudo, (outs GR64:$dst), (ins GR64:$size),
+ "# variable sized alloca with probing",
+ [(set GR64:$dst,
+ (X86ProbedAlloca GR64:$size))]>,
+ Requires<[In64BitMode]>;
}
+let hasNoSchedulingInfo = 1 in
+def STACKALLOC_W_PROBING : I<0, Pseudo, (outs), (ins i64imm:$stacksize),
+ "# fixed size alloca with probing",
+ []>;
+
// Dynamic stack allocation yields a _chkstk or _alloca call for all Windows
// targets. These calls are needed to probe the stack when allocating more than
// 4k bytes in one go. Touching the stack at 4K increments is necessary to
@@ -177,18 +199,6 @@ let isTerminator = 1, hasSideEffects = 1, isBarrier = 1, hasCtrlDep = 1,
[(catchret bb:$dst, bb:$from)]>;
}
-let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1,
- usesCustomInserter = 1 in
-def CATCHPAD : I<0, Pseudo, (outs), (ins), "# CATCHPAD", [(catchpad)]>;
-
-// This instruction is responsible for re-establishing stack pointers after an
-// exception has been caught and we are rejoining normal control flow in the
-// parent function or funclet. It generally sets ESP and EBP, and optionally
-// ESI. It is only needed for 32-bit WinEH, as the runtime restores CSRs for us
-// elsewhere.
-let hasSideEffects = 1, hasCtrlDep = 1, isCodeGenOnly = 1 in
-def EH_RESTORE : I<0, Pseudo, (outs), (ins), "# EH_RESTORE", []>;
-
let hasSideEffects = 1, isBarrier = 1, isCodeGenOnly = 1,
usesCustomInserter = 1 in {
def EH_SjLj_SetJmp32 : I<0, Pseudo, (outs GR32:$dst), (ins i32mem:$buf),
@@ -308,69 +318,26 @@ def MOV64ImmSExti8 : I<0, Pseudo, (outs GR64:$dst), (ins i64i8imm:$src), "",
// Materialize i64 constant where top 32-bits are zero. This could theoretically
// use MOV32ri with a SUBREG_TO_REG to represent the zero-extension, however
// that would make it more difficult to rematerialize.
-let isReMaterializable = 1, isAsCheapAsAMove = 1,
- isPseudo = 1, hasSideEffects = 0, SchedRW = [WriteMove] in
-def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "", []>;
-
-// This 64-bit pseudo-move can be used for both a 64-bit constant that is
-// actually the zero-extension of a 32-bit constant and for labels in the
-// x86-64 small code model.
-def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [imm, X86Wrapper]>;
-
+let AddedComplexity = 1, isReMaterializable = 1, isAsCheapAsAMove = 1,
+ isPseudo = 1, SchedRW = [WriteMove] in
+def MOV32ri64 : I<0, Pseudo, (outs GR64:$dst), (ins i64i32imm:$src), "",
+ [(set GR64:$dst, i64immZExt32:$src)]>;
+
+// This 64-bit pseudo-move can also be used for labels in the x86-64 small code
+// model.
+def mov64imm32 : ComplexPattern<i64, 1, "selectMOV64Imm32", [X86Wrapper]>;
def : Pat<(i64 mov64imm32:$src), (MOV32ri64 mov64imm32:$src)>;
// Use sbb to materialize carry bit.
-let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteALU] in {
+let Uses = [EFLAGS], Defs = [EFLAGS], isPseudo = 1, SchedRW = [WriteADC],
+ hasSideEffects = 0 in {
// FIXME: These are pseudo ops that should be replaced with Pat<> patterns.
// However, Pat<> can't replicate the destination reg into the inputs of the
// result.
-def SETB_C8r : I<0, Pseudo, (outs GR8:$dst), (ins), "",
- [(set GR8:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C16r : I<0, Pseudo, (outs GR16:$dst), (ins), "",
- [(set GR16:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "",
- [(set GR32:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
-def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "",
- [(set GR64:$dst, (X86setcc_c X86_COND_B, EFLAGS))]>;
+def SETB_C32r : I<0, Pseudo, (outs GR32:$dst), (ins), "", []>;
+def SETB_C64r : I<0, Pseudo, (outs GR64:$dst), (ins), "", []>;
} // isCodeGenOnly
-
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i64 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C64r)>;
-
-def : Pat<(i16 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i64 (sext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C64r)>;
-
-// We canonicalize 'setb' to "(and (sbb reg,reg), 1)" on the hope that the and
-// will be eliminated and that the sbb can be extended up to a wider type. When
-// this happens, it is great. However, if we are left with an 8-bit sbb and an
-// and, we might as well just match it as a setb.
-def : Pat<(and (i8 (X86setcc_c X86_COND_B, EFLAGS)), 1),
- (SETCCr (i8 2))>;
-
-// Patterns to give priority when both inputs are zero so that we don't use
-// an immediate for the RHS.
-// TODO: Should we use a 32-bit sbb for 8/16 to push the extract_subreg out?
-def : Pat<(X86sbb_flag (i8 0), (i8 0), EFLAGS),
- (SBB8rr (EXTRACT_SUBREG (MOV32r0), sub_8bit),
- (EXTRACT_SUBREG (MOV32r0), sub_8bit))>;
-def : Pat<(X86sbb_flag (i16 0), (i16 0), EFLAGS),
- (SBB16rr (EXTRACT_SUBREG (MOV32r0), sub_16bit),
- (EXTRACT_SUBREG (MOV32r0), sub_16bit))>;
-def : Pat<(X86sbb_flag (i32 0), (i32 0), EFLAGS),
- (SBB32rr (MOV32r0), (MOV32r0))>;
-def : Pat<(X86sbb_flag (i64 0), (i64 0), EFLAGS),
- (SBB64rr (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit),
- (SUBREG_TO_REG (i64 0), (MOV32r0), sub_32bit))>;
-
//===----------------------------------------------------------------------===//
// String Pseudo Instructions
//
@@ -568,10 +535,13 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _RFP80 : CMOVrr_PSEUDO<RFP80, f80>;
- let Predicates = [NoAVX512] in {
+ let Predicates = [HasMMX] in
+ defm _VR64 : CMOVrr_PSEUDO<VR64, x86mmx>;
+
+ let Predicates = [HasSSE1,NoAVX512] in
defm _FR32 : CMOVrr_PSEUDO<FR32, f32>;
+ let Predicates = [HasSSE2,NoAVX512] in
defm _FR64 : CMOVrr_PSEUDO<FR64, f64>;
- }
let Predicates = [HasAVX512] in {
defm _FR32X : CMOVrr_PSEUDO<FR32X, f32>;
defm _FR64X : CMOVrr_PSEUDO<FR64X, f64>;
@@ -585,6 +555,7 @@ let usesCustomInserter = 1, hasNoSchedulingInfo = 1, Uses = [EFLAGS] in {
defm _VR256X : CMOVrr_PSEUDO<VR256X, v4i64>;
}
defm _VR512 : CMOVrr_PSEUDO<VR512, v8i64>;
+ defm _VK1 : CMOVrr_PSEUDO<VK1, v1i1>;
defm _VK2 : CMOVrr_PSEUDO<VK2, v2i1>;
defm _VK4 : CMOVrr_PSEUDO<VK4, v4i1>;
defm _VK8 : CMOVrr_PSEUDO<VK8, v8i1>;
@@ -880,7 +851,7 @@ defm LCMPXCHG8B : LCMPXCHG_UnOp<0xC7, MRM1m, "cmpxchg8b", X86cas8, i64mem>;
// it. In other words, the register will not fix the clobbering of
// RBX that will happen when setting the arguments for the instruction.
//
-// Unlike the actual related instuction, we mark that this one
+// Unlike the actual related instruction, we mark that this one
// defines EBX (instead of using EBX).
// The rationale is that we will define RBX during the expansion of
// the pseudo. The argument feeding EBX is ebx_input.
@@ -1213,14 +1184,14 @@ def X86tcret_6regs : PatFrag<(ops node:$ptr, node:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[Not64BitMode, NotUseRetpolineIndirectCalls]>;
+ Requires<[Not64BitMode, NotUseIndirectThunkCalls]>;
// FIXME: This is disabled for 32-bit PIC mode because the global base
// register which is part of the address mode may be assigned a
// callee-saved register.
def : Pat<(X86tcret (load addr:$dst), imm:$off),
(TCRETURNmi addr:$dst, imm:$off)>,
- Requires<[Not64BitMode, IsNotPIC, NotUseRetpolineIndirectCalls]>;
+ Requires<[Not64BitMode, IsNotPIC, NotUseIndirectThunkCalls]>;
def : Pat<(X86tcret (i32 tglobaladdr:$dst), imm:$off),
(TCRETURNdi tglobaladdr:$dst, imm:$off)>,
@@ -1232,21 +1203,21 @@ def : Pat<(X86tcret (i32 texternalsym:$dst), imm:$off),
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
(TCRETURNri64 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
// Don't fold loads into X86tcret requiring more than 6 regs.
// There wouldn't be enough scratch registers for base+index.
def : Pat<(X86tcret_6regs (load addr:$dst), imm:$off),
(TCRETURNmi64 addr:$dst, imm:$off)>,
- Requires<[In64BitMode, NotUseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode, NotUseIndirectThunkCalls]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (RETPOLINE_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[In64BitMode, UseRetpolineIndirectCalls]>;
+ (INDIRECT_THUNK_TCRETURN64 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[In64BitMode, UseIndirectThunkCalls]>;
def : Pat<(X86tcret ptr_rc_tailcall:$dst, imm:$off),
- (RETPOLINE_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
- Requires<[Not64BitMode, UseRetpolineIndirectCalls]>;
+ (INDIRECT_THUNK_TCRETURN32 ptr_rc_tailcall:$dst, imm:$off)>,
+ Requires<[Not64BitMode, UseIndirectThunkCalls]>;
def : Pat<(X86tcret (i64 tglobaladdr:$dst), imm:$off),
(TCRETURNdi64 tglobaladdr:$dst, imm:$off)>,
@@ -1815,21 +1786,24 @@ multiclass MaskedRotateAmountPats<SDNode frag, string name> {
defm : MaskedRotateAmountPats<rotl, "ROL">;
defm : MaskedRotateAmountPats<rotr, "ROR">;
-// Double shift amount is implicitly masked.
-multiclass MaskedDoubleShiftAmountPats<SDNode frag, string name> {
- // (shift x (and y, 31)) ==> (shift x, y)
- def : Pat<(frag GR16:$src1, GR16:$src2, (shiftMask32 CL)),
- (!cast<Instruction>(name # "16rrCL") GR16:$src1, GR16:$src2)>;
- def : Pat<(frag GR32:$src1, GR32:$src2, (shiftMask32 CL)),
- (!cast<Instruction>(name # "32rrCL") GR32:$src1, GR32:$src2)>;
-
- // (shift x (and y, 63)) ==> (shift x, y)
- def : Pat<(frag GR64:$src1, GR64:$src2, (shiftMask32 CL)),
- (!cast<Instruction>(name # "64rrCL") GR64:$src1, GR64:$src2)>;
-}
-
-defm : MaskedDoubleShiftAmountPats<X86shld, "SHLD">;
-defm : MaskedDoubleShiftAmountPats<X86shrd, "SHRD">;
+// Double "funnel" shift amount is implicitly masked.
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y) (NOTE: modulo32)
+def : Pat<(X86fshl GR16:$src1, GR16:$src2, (shiftMask32 CL)),
+ (SHLD16rrCL GR16:$src1, GR16:$src2)>;
+def : Pat<(X86fshr GR16:$src2, GR16:$src1, (shiftMask32 CL)),
+ (SHRD16rrCL GR16:$src1, GR16:$src2)>;
+
+// (fshl/fshr x (and y, 31)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR32:$src1, GR32:$src2, (shiftMask32 CL)),
+ (SHLD32rrCL GR32:$src1, GR32:$src2)>;
+def : Pat<(fshr GR32:$src2, GR32:$src1, (shiftMask32 CL)),
+ (SHRD32rrCL GR32:$src1, GR32:$src2)>;
+
+// (fshl/fshr x (and y, 63)) ==> (fshl/fshr x, y)
+def : Pat<(fshl GR64:$src1, GR64:$src2, (shiftMask64 CL)),
+ (SHLD64rrCL GR64:$src1, GR64:$src2)>;
+def : Pat<(fshr GR64:$src2, GR64:$src1, (shiftMask64 CL)),
+ (SHRD64rrCL GR64:$src1, GR64:$src2)>;
let Predicates = [HasBMI2] in {
let AddedComplexity = 1 in {
@@ -1919,15 +1893,6 @@ defm : one_bit_patterns<GR16, i16, BTR16rr, BTS16rr, BTC16rr, shiftMask16>;
defm : one_bit_patterns<GR32, i32, BTR32rr, BTS32rr, BTC32rr, shiftMask32>;
defm : one_bit_patterns<GR64, i64, BTR64rr, BTS64rr, BTC64rr, shiftMask64>;
-
-// (anyext (setcc_carry)) -> (setcc_carry)
-def : Pat<(i16 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C16r)>;
-def : Pat<(i32 (anyext (i8 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-def : Pat<(i32 (anyext (i16 (X86setcc_c X86_COND_B, EFLAGS)))),
- (SETB_C32r)>;
-
//===----------------------------------------------------------------------===//
// EFLAGS-defining Patterns
//===----------------------------------------------------------------------===//
@@ -1999,10 +1964,6 @@ def : Pat<(X86sub_flag 0, GR16:$src), (NEG16r GR16:$src)>;
def : Pat<(X86sub_flag 0, GR32:$src), (NEG32r GR32:$src)>;
def : Pat<(X86sub_flag 0, GR64:$src), (NEG64r GR64:$src)>;
-// sub reg, relocImm
-def : Pat<(X86sub_flag GR64:$src1, i64relocImmSExt8_su:$src2),
- (SUB64ri8 GR64:$src1, i64relocImmSExt8_su:$src2)>;
-
// mul reg, reg
def : Pat<(mul GR16:$src1, GR16:$src2),
(IMUL16rr GR16:$src1, GR16:$src2)>;
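The SHLD/SHRD funnel-shift patterns above drop an explicit (and amt, 63) because the instruction already masks its count; the 16-bit forms go through X86fshl/X86fshr since their count is masked modulo 32, as the NOTE says. A sketch of the 64-bit semantics being matched (helper name is illustrative):

#include <cstdint>

// fshl(Hi, Lo, Amt): shift the 128-bit concatenation Hi:Lo left by Amt and
// keep the upper 64 bits. The count is only observed modulo 64, which is why
// an explicit "and Amt, 63" in the DAG is redundant.
static uint64_t fshl64(uint64_t Hi, uint64_t Lo, unsigned Amt) {
  Amt &= 63;
  if (Amt == 0)
    return Hi; // shifting right by 64 - 0 would be undefined behaviour in C++
  return (Hi << Amt) | (Lo >> (64 - Amt));
}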
diff --git a/llvm/lib/Target/X86/X86InstrControl.td b/llvm/lib/Target/X86/X86InstrControl.td
index 32faeb1a86f2..4f7867744017 100644
--- a/llvm/lib/Target/X86/X86InstrControl.td
+++ b/llvm/lib/Target/X86/X86InstrControl.td
@@ -193,14 +193,16 @@ let isBranch = 1, isTerminator = 1, isBarrier = 1, isIndirectBranch = 1 in {
"ljmp{l}\t$seg, $off", []>,
OpSize32, Sched<[WriteJump]>;
}
- def FARJMP64 : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
- "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
-
- let AsmVariantName = "att" in
- def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
- "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
- def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
- "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ let mayLoad = 1 in {
+ def FARJMP64m : RI<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{q}\t{*}$dst", []>, Sched<[WriteJump]>, Requires<[In64BitMode]>;
+
+ let AsmVariantName = "att" in
+ def FARJMP16m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "ljmp{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARJMP32m : I<0xFF, MRM5m, (outs), (ins opaquemem:$dst),
+ "{l}jmp{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ }
}
// Loop instructions
@@ -237,13 +239,13 @@ let isCall = 1 in
Sched<[WriteJumpLd]>;
def CALL32r : I<0xFF, MRM2r, (outs), (ins GR32:$dst),
"call{l}\t{*}$dst", [(X86call GR32:$dst)]>, OpSize32,
- Requires<[Not64BitMode,NotUseRetpolineIndirectCalls]>,
+ Requires<[Not64BitMode,NotUseIndirectThunkCalls]>,
Sched<[WriteJump]>;
def CALL32m : I<0xFF, MRM2m, (outs), (ins i32mem:$dst),
"call{l}\t{*}$dst", [(X86call (loadi32 addr:$dst))]>,
OpSize32,
Requires<[Not64BitMode,FavorMemIndirectCall,
- NotUseRetpolineIndirectCalls]>,
+ NotUseIndirectThunkCalls]>,
Sched<[WriteJumpLd]>;
// Non-tracking calls for IBT, use with caution.
@@ -275,10 +277,12 @@ let isCall = 1 in
OpSize32, Sched<[WriteJump]>;
}
- def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
- "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
- def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
- "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ let mayLoad = 1 in {
+ def FARCALL16m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "lcall{w}\t{*}$dst", []>, OpSize16, Sched<[WriteJumpLd]>;
+ def FARCALL32m : I<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ "{l}call{l}\t{*}$dst", []>, OpSize32, Sched<[WriteJumpLd]>;
+ }
}
@@ -334,11 +338,11 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode]>;
def CALL64r : I<0xFF, MRM2r, (outs), (ins GR64:$dst),
"call{q}\t{*}$dst", [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,NotUseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode,NotUseIndirectThunkCalls]>;
def CALL64m : I<0xFF, MRM2m, (outs), (ins i64mem:$dst),
"call{q}\t{*}$dst", [(X86call (loadi64 addr:$dst))]>,
Requires<[In64BitMode,FavorMemIndirectCall,
- NotUseRetpolineIndirectCalls]>;
+ NotUseIndirectThunkCalls]>;
// Non-tracking calls for IBT, use with caution.
let isCodeGenOnly = 1 in {
@@ -351,7 +355,8 @@ let isCall = 1, Uses = [RSP, SSP], SchedRW = [WriteJump] in {
Requires<[In64BitMode,FavorMemIndirectCall]>, NOTRACK;
}
- def FARCALL64 : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
+ let mayLoad = 1 in
+ def FARCALL64m : RI<0xFF, MRM3m, (outs), (ins opaquemem:$dst),
"lcall{q}\t{*}$dst", []>;
}
@@ -393,19 +398,19 @@ let isPseudo = 1, isCall = 1, isCodeGenOnly = 1,
Uses = [RSP, SSP],
usesCustomInserter = 1,
SchedRW = [WriteJump] in {
- def RETPOLINE_CALL32 :
+ def INDIRECT_THUNK_CALL32 :
PseudoI<(outs), (ins GR32:$dst), [(X86call GR32:$dst)]>,
- Requires<[Not64BitMode,UseRetpolineIndirectCalls]>;
+ Requires<[Not64BitMode,UseIndirectThunkCalls]>;
- def RETPOLINE_CALL64 :
+ def INDIRECT_THUNK_CALL64 :
PseudoI<(outs), (ins GR64:$dst), [(X86call GR64:$dst)]>,
- Requires<[In64BitMode,UseRetpolineIndirectCalls]>;
+ Requires<[In64BitMode,UseIndirectThunkCalls]>;
- // Retpoline variant of indirect tail calls.
+ // Indirect thunk variant of indirect tail calls.
let isTerminator = 1, isReturn = 1, isBarrier = 1 in {
- def RETPOLINE_TCRETURN64 :
+ def INDIRECT_THUNK_TCRETURN64 :
PseudoI<(outs), (ins GR64:$dst, i32imm:$offset), []>;
- def RETPOLINE_TCRETURN32 :
+ def INDIRECT_THUNK_TCRETURN32 :
PseudoI<(outs), (ins GR32:$dst, i32imm:$offset), []>;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrFMA.td b/llvm/lib/Target/X86/X86InstrFMA.td
index 9e43a532a3f8..4dbd6bb8cd7e 100644
--- a/llvm/lib/Target/X86/X86InstrFMA.td
+++ b/llvm/lib/Target/X86/X86InstrFMA.td
@@ -126,7 +126,7 @@ let ExeDomain = SSEPackedSingle in {
loadv4f32, loadv8f32, X86any_Fmadd, v4f32, v8f32,
SchedWriteFMA>;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "ps", "PS",
- loadv4f32, loadv8f32, X86Fmsub, v4f32, v8f32,
+ loadv4f32, loadv8f32, X86any_Fmsub, v4f32, v8f32,
SchedWriteFMA>;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "ps", "PS",
loadv4f32, loadv8f32, X86Fmaddsub, v4f32, v8f32,
@@ -141,7 +141,7 @@ let ExeDomain = SSEPackedDouble in {
loadv2f64, loadv4f64, X86any_Fmadd, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMSUB : fma3p_forms<0x9A, 0xAA, 0xBA, "vfmsub", "pd", "PD",
- loadv2f64, loadv4f64, X86Fmsub, v2f64,
+ loadv2f64, loadv4f64, X86any_Fmsub, v2f64,
v4f64, SchedWriteFMA>, VEX_W;
defm VFMADDSUB : fma3p_forms<0x96, 0xA6, 0xB6, "vfmaddsub", "pd", "PD",
loadv2f64, loadv4f64, X86Fmaddsub,
@@ -154,19 +154,19 @@ let ExeDomain = SSEPackedDouble in {
// Fused Negative Multiply-Add
let ExeDomain = SSEPackedSingle in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmadd, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmadd, v4f32, v8f32, SchedWriteFMA>;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "ps", "PS", loadv4f32,
- loadv8f32, X86Fnmsub, v4f32, v8f32, SchedWriteFMA>;
+ loadv8f32, X86any_Fnmsub, v4f32, v8f32, SchedWriteFMA>;
}
let ExeDomain = SSEPackedDouble in {
defm VFNMADD : fma3p_forms<0x9C, 0xAC, 0xBC, "vfnmadd", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmadd, v2f64, v4f64, SchedWriteFMA>, VEX_W;
defm VFNMSUB : fma3p_forms<0x9E, 0xAE, 0xBE, "vfnmsub", "pd", "PD", loadv2f64,
- loadv4f64, X86Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
+ loadv4f64, X86any_Fnmsub, v2f64, v4f64, SchedWriteFMA>, VEX_W;
}
// All source register operands of FMA opcodes defined in fma3s_rm multiclass
-// can be commuted. In many cases such commute transformation requres an opcode
+// can be commuted. In many cases such commute transformation requires an opcode
// adjustment, for example, commuting the operands 1 and 2 in FMA*132 form
// would require an opcode change to FMA*231:
// FMA*132* reg1, reg2, reg3; // reg1 * reg3 + reg2;
@@ -283,7 +283,7 @@ multiclass fma3s_rm_int<bits<8> opc, string OpcodeStr,
[]>, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
}
-// The FMA 213 form is created for lowering of scalar FMA intrinscis
+// The FMA 213 form is created for lowering of scalar FMA intrinsics
// to machine instructions.
// The FMA 132 form can trivially be get by commuting the 2nd and 3rd operands
// of FMA 213 form.
@@ -321,12 +321,12 @@ multiclass fma3s<bits<8> opc132, bits<8> opc213, bits<8> opc231,
defm VFMADD : fma3s<0x99, 0xA9, 0xB9, "vfmadd", X86any_Fmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86Fmsub,
+defm VFMSUB : fma3s<0x9B, 0xAB, 0xBB, "vfmsub", X86any_Fmsub,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86Fnmadd,
+defm VFNMADD : fma3s<0x9D, 0xAD, 0xBD, "vfnmadd", X86any_Fnmadd,
SchedWriteFMA.Scl>, VEX_LIG;
-defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86Fnmsub,
+defm VFNMSUB : fma3s<0x9F, 0xAF, 0xBF, "vfnmsub", X86any_Fnmsub,
SchedWriteFMA.Scl>, VEX_LIG;
multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
@@ -373,14 +373,14 @@ multiclass scalar_fma_patterns<SDNode Op, string Prefix, string Suffix,
}
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SS", X86Movss, v4f32, f32, FR32, loadf32>;
defm : scalar_fma_patterns<X86any_Fmadd, "VFMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
-defm : scalar_fma_patterns<X86Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fmsub, "VFMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmadd, "VFNMADD", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
+defm : scalar_fma_patterns<X86any_Fnmsub, "VFNMSUB", "SD", X86Movsd, v2f64, f64, FR64, loadf64>;
//===----------------------------------------------------------------------===//
// FMA4 - AMD 4 operand Fused Multiply-Add instructions
@@ -542,26 +542,26 @@ let ExeDomain = SSEPackedSingle in {
SchedWriteFMA.Scl>,
fma4s_int<0x6A, "vfmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
- defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86Fmsub, loadf32,
+ defm VFMSUBSS4 : fma4s<0x6E, "vfmsubss", FR32, f32mem, f32, X86any_Fmsub, loadf32,
SchedWriteFMA.Scl>,
fma4s_int<0x6E, "vfmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMADDSS4 : fma4s<0x7A, "vfnmaddss", FR32, f32mem, f32,
- X86Fnmadd, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7A, "vfnmaddss", ssmem, v4f32,
SchedWriteFMA.Scl>;
defm VFNMSUBSS4 : fma4s<0x7E, "vfnmsubss", FR32, f32mem, f32,
- X86Fnmsub, loadf32, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf32, SchedWriteFMA.Scl>,
fma4s_int<0x7E, "vfnmsubss", ssmem, v4f32,
SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPS4 : fma4p<0x68, "vfmaddps", X86any_Fmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86Fmsub, v4f32, v8f32,
+ defm VFMSUBPS4 : fma4p<0x6C, "vfmsubps", X86any_Fmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86Fnmadd, v4f32, v8f32,
+ defm VFNMADDPS4 : fma4p<0x78, "vfnmaddps", X86any_Fnmadd, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
- defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86Fnmsub, v4f32, v8f32,
+ defm VFNMSUBPS4 : fma4p<0x7C, "vfnmsubps", X86any_Fnmsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
defm VFMADDSUBPS4 : fma4p<0x5C, "vfmaddsubps", X86Fmaddsub, v4f32, v8f32,
loadv4f32, loadv8f32, SchedWriteFMA>;
@@ -575,26 +575,26 @@ let ExeDomain = SSEPackedDouble in {
SchedWriteFMA.Scl>,
fma4s_int<0x6B, "vfmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
- defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86Fmsub, loadf64,
+ defm VFMSUBSD4 : fma4s<0x6F, "vfmsubsd", FR64, f64mem, f64, X86any_Fmsub, loadf64,
SchedWriteFMA.Scl>,
fma4s_int<0x6F, "vfmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMADDSD4 : fma4s<0x7B, "vfnmaddsd", FR64, f64mem, f64,
- X86Fnmadd, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmadd, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7B, "vfnmaddsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
defm VFNMSUBSD4 : fma4s<0x7F, "vfnmsubsd", FR64, f64mem, f64,
- X86Fnmsub, loadf64, SchedWriteFMA.Scl>,
+ X86any_Fnmsub, loadf64, SchedWriteFMA.Scl>,
fma4s_int<0x7F, "vfnmsubsd", sdmem, v2f64,
SchedWriteFMA.Scl>;
// Packed Instructions
defm VFMADDPD4 : fma4p<0x69, "vfmaddpd", X86any_Fmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86Fmsub, v2f64, v4f64,
+ defm VFMSUBPD4 : fma4p<0x6D, "vfmsubpd", X86any_Fmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86Fnmadd, v2f64, v4f64,
+ defm VFNMADDPD4 : fma4p<0x79, "vfnmaddpd", X86any_Fnmadd, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
- defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86Fnmsub, v2f64, v4f64,
+ defm VFNMSUBPD4 : fma4p<0x7D, "vfnmsubpd", X86any_Fnmsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
defm VFMADDSUBPD4 : fma4p<0x5D, "vfmaddsubpd", X86Fmaddsub, v2f64, v4f64,
loadv2f64, loadv4f64, SchedWriteFMA>;
@@ -630,11 +630,11 @@ multiclass scalar_fma4_patterns<SDNode Op, string Name,
}
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSS4", v4f32, f32, FR32, loadf32>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSS4", v4f32, f32, FR32, loadf32>;
defm : scalar_fma4_patterns<X86any_Fmadd, "VFMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
-defm : scalar_fma4_patterns<X86Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fmsub, "VFMSUBSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmadd, "VFNMADDSD4", v2f64, f64, FR64, loadf64>;
+defm : scalar_fma4_patterns<X86any_Fnmsub, "VFNMSUBSD4", v2f64, f64, FR64, loadf64>;
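For reference while reading the scalar FMA pattern changes above, the four flavours differ only in which operands are negated, and each is a single-rounding operation, which std::fma also provides. A sketch (function names are illustrative):

#include <cmath>

static double fmadd(double A, double B, double C)  { return std::fma(A, B, C); }   //  A*B + C
static double fmsub(double A, double B, double C)  { return std::fma(A, B, -C); }  //  A*B - C
static double fnmadd(double A, double B, double C) { return std::fma(-A, B, C); }  // -(A*B) + C
static double fnmsub(double A, double B, double C) { return std::fma(-A, B, -C); } // -(A*B) - C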
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
index 25bbdddb7a21..6d803e931b68 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.cpp
@@ -116,11 +116,8 @@ static void verifyTables() {
#ifndef NDEBUG
static std::atomic<bool> TableChecked(false);
if (!TableChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(Groups), std::end(Groups)) &&
- std::is_sorted(std::begin(RoundGroups), std::end(RoundGroups)) &&
- std::is_sorted(std::begin(BroadcastGroups),
- std::end(BroadcastGroups)) &&
- "FMA3 tables not sorted!");
+ assert(llvm::is_sorted(Groups) && llvm::is_sorted(RoundGroups) &&
+ llvm::is_sorted(BroadcastGroups) && "FMA3 tables not sorted!");
TableChecked.store(true, std::memory_order_relaxed);
}
#endif
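The verifyTables() change above relies on llvm::is_sorted, a range helper over std::is_sorted. A standalone sketch of the same check (isSortedRange and the sample table are made up for illustration):

#include <algorithm>
#include <cassert>
#include <iterator>

template <typename Range> static bool isSortedRange(const Range &R) {
  return std::is_sorted(std::begin(R), std::end(R));
}

static void verifyTableSketch() {
  static const int Table[] = {1, 2, 3, 5, 8}; // stand-in for an FMA3 group table
  assert(isSortedRange(Table) && "table not sorted!");
  (void)Table; // keep release builds warning-free when assert compiles away
}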
diff --git a/llvm/lib/Target/X86/X86InstrFMA3Info.h b/llvm/lib/Target/X86/X86InstrFMA3Info.h
index 7fa6f5917862..ce0a7cc7f82e 100644
--- a/llvm/lib/Target/X86/X86InstrFMA3Info.h
+++ b/llvm/lib/Target/X86/X86InstrFMA3Info.h
@@ -14,11 +14,7 @@
#ifndef LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
#define LLVM_LIB_TARGET_X86_UTILS_X86INSTRFMA3INFO_H
-#include "X86.h"
-#include "llvm/ADT/DenseMap.h"
-#include <cassert>
#include <cstdint>
-#include <set>
namespace llvm {
diff --git a/llvm/lib/Target/X86/X86InstrFPStack.td b/llvm/lib/Target/X86/X86InstrFPStack.td
index 1830262205c6..67dcb8d00ea5 100644
--- a/llvm/lib/Target/X86/X86InstrFPStack.td
+++ b/llvm/lib/Target/X86/X86InstrFPStack.td
@@ -22,24 +22,17 @@ def SDTX86Fst : SDTypeProfile<0, 2, [SDTCisFP<0>,
SDTCisPtrTy<1>]>;
def SDTX86Fild : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
def SDTX86Fist : SDTypeProfile<0, 2, [SDTCisFP<0>, SDTCisPtrTy<1>]>;
-def SDTX86Fnstsw : SDTypeProfile<1, 1, [SDTCisVT<0, i16>, SDTCisVT<1, i16>]>;
def SDTX86CwdStore : SDTypeProfile<0, 1, [SDTCisPtrTy<0>]>;
def X86fld : SDNode<"X86ISD::FLD", SDTX86Fld,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
def X86fst : SDNode<"X86ISD::FST", SDTX86Fst,
- [SDNPHasChain, SDNPOptInGlue, SDNPMayStore,
- SDNPMemOperand]>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fild : SDNode<"X86ISD::FILD", SDTX86Fild,
[SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-def X86fildflag : SDNode<"X86ISD::FILD_FLAG", SDTX86Fild,
- [SDNPHasChain, SDNPOutGlue, SDNPMayLoad,
- SDNPMemOperand]>;
def X86fist : SDNode<"X86ISD::FIST", SDTX86Fist,
- [SDNPHasChain, SDNPOptInGlue, SDNPMayStore,
- SDNPMemOperand]>;
-def X86fp_stsw : SDNode<"X86ISD::FNSTSW16r", SDTX86Fnstsw>;
+ [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_to_mem : SDNode<"X86ISD::FP_TO_INT_IN_MEM", SDTX86Fst,
[SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
def X86fp_cwd_get16 : SDNode<"X86ISD::FNSTCW16m", SDTX86CwdStore,
@@ -79,8 +72,9 @@ def X86fild64 : PatFrag<(ops node:$ptr), (X86fild node:$ptr), [{
return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
}]>;
-def X86fildflag64 : PatFrag<(ops node:$ptr), (X86fildflag node:$ptr), [{
- return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i64;
+def X86fist32 : PatFrag<(ops node:$val, node:$ptr),
+ (X86fist node:$val, node:$ptr), [{
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT() == MVT::i32;
}]>;
def X86fist64 : PatFrag<(ops node:$val, node:$ptr),
@@ -292,7 +286,7 @@ defm MUL : FPBinary_rr<any_fmul>;
defm DIV : FPBinary_rr<any_fdiv>;
}
-// Sets the scheduling resources for the actual NAME#_F<size>m defintions.
+// Sets the scheduling resources for the actual NAME#_F<size>m definitions.
let SchedRW = [WriteFAddLd] in {
defm ADD : FPBinary<any_fadd, MRM0m, "add">;
defm SUB : FPBinary<any_fsub, MRM4m, "sub">;
@@ -381,7 +375,8 @@ def TST_F : FPI<0xD9, MRM_E4, (outs), (ins), "ftst">;
// Versions of FP instructions that take a single memory operand. Added for the
// disassembler; remove as they are included with patterns elsewhere.
-let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1 in {
+let SchedRW = [WriteFComLd], Uses = [FPCW], mayRaiseFPException = 1,
+ mayLoad = 1 in {
def FCOM32m : FPI<0xD8, MRM2m, (outs), (ins f32mem:$src), "fcom{s}\t$src">;
def FCOMP32m : FPI<0xD8, MRM3m, (outs), (ins f32mem:$src), "fcomp{s}\t$src">;
@@ -396,21 +391,22 @@ def FICOMP32m: FPI<0xDA, MRM3m, (outs), (ins i32mem:$src), "ficomp{l}\t$src">;
} // SchedRW
let SchedRW = [WriteMicrocoded] in {
-let Defs = [FPSW, FPCW] in {
+let Defs = [FPSW, FPCW], mayLoad = 1 in {
def FLDENVm : FPI<0xD9, MRM4m, (outs), (ins f32mem:$src), "fldenv\t$src">;
def FRSTORm : FPI<0xDD, MRM4m, (outs), (ins f32mem:$dst), "frstor\t$dst">;
}
-let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW] in {
+let Defs = [FPSW, FPCW], Uses = [FPSW, FPCW], mayStore = 1 in {
def FSTENVm : FPI<0xD9, MRM6m, (outs), (ins f32mem:$dst), "fnstenv\t$dst">;
def FSAVEm : FPI<0xDD, MRM6m, (outs), (ins f32mem:$dst), "fnsave\t$dst">;
}
-let Uses = [FPSW] in
+let Uses = [FPSW], mayStore = 1 in
def FNSTSWm : FPI<0xDD, MRM7m, (outs), (ins i16mem:$dst), "fnstsw\t$dst">;
+let mayLoad = 1 in
def FBLDm : FPI<0xDF, MRM4m, (outs), (ins f80mem:$src), "fbld\t$src">;
-let Uses = [FPCW] ,mayRaiseFPException = 1 in
+let Uses = [FPCW] ,mayRaiseFPException = 1, mayStore = 1 in
def FBSTPm : FPI<0xDF, MRM6m, (outs), (ins f80mem:$dst), "fbstp\t$dst">;
} // SchedRW
@@ -534,14 +530,20 @@ def ST_FpP80m : FpI_<(outs), (ins f80mem:$op, RFP80:$src), OneArgFP,
let mayStore = 1, hasSideEffects = 0 in {
def IST_Fp16m32 : FpIf32<(outs), (ins i16mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP, []>;
-def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP, []>;
+def IST_Fp32m32 : FpIf32<(outs), (ins i32mem:$op, RFP32:$src), OneArgFP,
+ [(X86fist32 RFP32:$src, addr:$op)]>;
+def IST_Fp64m32 : FpIf32<(outs), (ins i64mem:$op, RFP32:$src), OneArgFP,
+ [(X86fist64 RFP32:$src, addr:$op)]>;
def IST_Fp16m64 : FpIf64<(outs), (ins i16mem:$op, RFP64:$src), OneArgFP, []>;
-def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP, []>;
-def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP, []>;
+def IST_Fp32m64 : FpIf64<(outs), (ins i32mem:$op, RFP64:$src), OneArgFP,
+ [(X86fist32 RFP64:$src, addr:$op)]>;
+def IST_Fp64m64 : FpIf64<(outs), (ins i64mem:$op, RFP64:$src), OneArgFP,
+ [(X86fist64 RFP64:$src, addr:$op)]>;
def IST_Fp16m80 : FpI_<(outs), (ins i16mem:$op, RFP80:$src), OneArgFP, []>;
-def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP, []>;
-def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP, []>;
+def IST_Fp32m80 : FpI_<(outs), (ins i32mem:$op, RFP80:$src), OneArgFP,
+ [(X86fist32 RFP80:$src, addr:$op)]>;
+def IST_Fp64m80 : FpI_<(outs), (ins i64mem:$op, RFP80:$src), OneArgFP,
+ [(X86fist64 RFP80:$src, addr:$op)]>;
} // mayStore
} // SchedRW, Uses = [FPCW]
@@ -601,6 +603,7 @@ let SchedRW = [WriteMove], Uses = [FPCW] in {
def LD_Frr : FPI<0xD9, MRM0r, (outs), (ins RSTi:$op), "fld\t$op">;
def ST_Frr : FPI<0xDD, MRM2r, (outs), (ins RSTi:$op), "fst\t$op">;
def ST_FPrr : FPI<0xDD, MRM3r, (outs), (ins RSTi:$op), "fstp\t$op">;
+let mayRaiseFPException = 0 in
def XCH_F : FPI<0xD9, MRM1r, (outs), (ins RSTi:$op), "fxch\t$op">;
}
@@ -620,13 +623,13 @@ def LD_Fp180 : FpI_<(outs RFP80:$dst), (ins), ZeroArgFP,
[(set RFP80:$dst, fpimm1)]>;
}
-let SchedRW = [WriteFLD0], Uses = [FPCW] in
+let SchedRW = [WriteFLD0], Uses = [FPCW], mayRaiseFPException = 0 in
def LD_F0 : FPI<0xD9, MRM_EE, (outs), (ins), "fldz">;
-let SchedRW = [WriteFLD1], Uses = [FPCW] in
+let SchedRW = [WriteFLD1], Uses = [FPCW], mayRaiseFPException = 0 in
def LD_F1 : FPI<0xD9, MRM_E8, (outs), (ins), "fld1">;
-let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW] in {
+let SchedRW = [WriteFLDC], Defs = [FPSW], Uses = [FPCW], mayRaiseFPException = 0 in {
def FLDL2T : I<0xD9, MRM_E9, (outs), (ins), "fldl2t", []>;
def FLDL2E : I<0xD9, MRM_EA, (outs), (ins), "fldl2e", []>;
def FLDPI : I<0xD9, MRM_EB, (outs), (ins), "fldpi", []>;
@@ -635,25 +638,19 @@ def FLDLN2 : I<0xD9, MRM_ED, (outs), (ins), "fldln2", []>;
} // SchedRW
// Floating point compares.
-let SchedRW = [WriteFCom], Uses = [FPCW] in {
-def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set FPSW, (trunc (X86any_fcmp RFP32:$lhs, RFP32:$rhs)))]>;
-def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set FPSW, (trunc (X86any_fcmp RFP64:$lhs, RFP64:$rhs)))]>;
-def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set FPSW, (trunc (X86any_fcmp RFP80:$lhs, RFP80:$rhs)))]>;
-def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
- [(set FPSW, (trunc (X86strict_fcmps RFP32:$lhs, RFP32:$rhs)))]>;
-def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP,
- [(set FPSW, (trunc (X86strict_fcmps RFP64:$lhs, RFP64:$rhs)))]>;
-def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP,
- [(set FPSW, (trunc (X86strict_fcmps RFP80:$lhs, RFP80:$rhs)))]>;
+let SchedRW = [WriteFCom], Uses = [FPCW], hasSideEffects = 0 in {
+def UCOM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>;
+def UCOM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>;
+def UCOM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>;
+def COM_Fpr32 : FpIf32<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP, []>;
+def COM_Fpr64 : FpIf64<(outs), (ins RFP64:$lhs, RFP64:$rhs), CompareFP, []>;
+def COM_Fpr80 : FpI_ <(outs), (ins RFP80:$lhs, RFP80:$rhs), CompareFP, []>;
} // SchedRW
} // mayRaiseFPException = 1
let SchedRW = [WriteFCom], mayRaiseFPException = 1 in {
// CC = ST(0) cmp ST(i)
-let Defs = [EFLAGS, FPCW], Uses = [FPCW] in {
+let Defs = [EFLAGS, FPSW], Uses = [FPCW] in {
def UCOM_FpIr32: FpI_<(outs), (ins RFP32:$lhs, RFP32:$rhs), CompareFP,
[(set EFLAGS, (X86any_fcmp RFP32:$lhs, RFP32:$rhs))]>,
Requires<[FPStackf32, HasCMov]>;
@@ -698,10 +695,9 @@ def COM_FIPr : FPI<0xDF, MRM6r, (outs), (ins RSTi:$reg),
// Floating point flag ops.
let SchedRW = [WriteALU] in {
-let Defs = [AX, FPSW], Uses = [FPSW] in
+let Defs = [AX, FPSW], Uses = [FPSW], hasSideEffects = 0 in
def FNSTSW16r : I<0xDF, MRM_E0, // AX = fp flags
- (outs), (ins), "fnstsw\t{%ax|ax}",
- [(set AX, (X86fp_stsw FPSW))]>;
+ (outs), (ins), "fnstsw\t{%ax|ax}", []>;
let Defs = [FPSW], Uses = [FPCW] in
def FNSTCW16m : I<0xD9, MRM7m, // [mem16] = X87 control word
(outs), (ins i16mem:$dst), "fnstcw\t$dst",
@@ -754,20 +750,20 @@ def FCOMPP : I<0xDE, MRM_D9, (outs), (ins), "fcompp", []>;
let Uses = [FPSW, FPCW] in {
def FXSAVE : I<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
- "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, TB,
+ "fxsave\t$dst", [(int_x86_fxsave addr:$dst)]>, PS,
Requires<[HasFXSR]>;
def FXSAVE64 : RI<0xAE, MRM0m, (outs), (ins opaquemem:$dst),
"fxsave64\t$dst", [(int_x86_fxsave64 addr:$dst)]>,
- TB, Requires<[HasFXSR, In64BitMode]>;
+ PS, Requires<[HasFXSR, In64BitMode]>;
} // Uses = [FPSW, FPCW]
let Defs = [FPSW, FPCW] in {
def FXRSTOR : I<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor\t$src", [(int_x86_fxrstor addr:$src)]>,
- TB, Requires<[HasFXSR]>;
+ PS, Requires<[HasFXSR]>;
def FXRSTOR64 : RI<0xAE, MRM1m, (outs), (ins opaquemem:$src),
"fxrstor64\t$src", [(int_x86_fxrstor64 addr:$src)]>,
- TB, Requires<[HasFXSR, In64BitMode]>;
+ PS, Requires<[HasFXSR, In64BitMode]>;
} // Defs = [FPSW, FPCW]
} // SchedRW
@@ -799,13 +795,6 @@ def : Pat<(f64 fpimmneg1), (CHS_Fp64 (LD_Fp164))>, Requires<[FPStackf64]>;
def : Pat<(f80 fpimmneg0), (CHS_Fp80 (LD_Fp080))>;
def : Pat<(f80 fpimmneg1), (CHS_Fp80 (LD_Fp180))>;
-// Used to conv. i64 to f64 since there isn't a SSE version.
-def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m64 addr:$src)>;
-
-// Used to conv. between f80 and i64 for i64 atomic loads.
-def : Pat<(X86fildflag64 addr:$src), (ILD_Fp64m80 addr:$src)>;
-def : Pat<(X86fist64 RFP80:$src, addr:$op), (IST_Fp64m80 addr:$op, RFP80:$src)>;
-
// FP extensions map onto simple pseudo-value conversions if they are to/from
// the FP stack.
def : Pat<(f64 (any_fpextend RFP32:$src)), (COPY_TO_REGCLASS RFP32:$src, RFP64)>,
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.cpp b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
index f3b286e0375c..e16382e956c5 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.cpp
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.cpp
@@ -486,7 +486,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::CVTPD2PSrr, X86::CVTPD2PSrm, TB_ALIGN_16 },
{ X86::CVTPS2DQrr, X86::CVTPS2DQrm, TB_ALIGN_16 },
{ X86::CVTPS2PDrr, X86::CVTPS2PDrm, TB_NO_REVERSE },
+ { X86::CVTSD2SI64rr, X86::CVTSD2SI64rm, 0 },
{ X86::CVTSD2SI64rr_Int, X86::CVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSD2SIrr, X86::CVTSD2SIrm, 0 },
{ X86::CVTSD2SIrr_Int, X86::CVTSD2SIrm_Int, TB_NO_REVERSE },
{ X86::CVTSD2SSrr, X86::CVTSD2SSrm, 0 },
{ X86::CVTSI2SDrr, X86::CVTSI2SDrm, 0 },
@@ -494,7 +496,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::CVTSI642SDrr, X86::CVTSI642SDrm, 0 },
{ X86::CVTSI642SSrr, X86::CVTSI642SSrm, 0 },
{ X86::CVTSS2SDrr, X86::CVTSS2SDrm, 0 },
+ { X86::CVTSS2SI64rr, X86::CVTSS2SI64rm, 0 },
{ X86::CVTSS2SI64rr_Int, X86::CVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::CVTSS2SIrr, X86::CVTSS2SIrm, 0 },
{ X86::CVTSS2SIrr_Int, X86::CVTSS2SIrm_Int, TB_NO_REVERSE },
{ X86::CVTTPD2DQrr, X86::CVTTPD2DQrm, TB_ALIGN_16 },
{ X86::CVTTPS2DQrr, X86::CVTTPS2DQrm, TB_ALIGN_16 },
@@ -627,18 +631,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::UCOMISSrr_Int, X86::UCOMISSrm_Int, TB_NO_REVERSE },
{ X86::VAESIMCrr, X86::VAESIMCrm, 0 },
{ X86::VAESKEYGENASSIST128rr,X86::VAESKEYGENASSIST128rm,0 },
- { X86::VBROADCASTF32X2Z256r, X86::VBROADCASTF32X2Z256m, TB_NO_REVERSE },
- { X86::VBROADCASTF32X2Zr, X86::VBROADCASTF32X2Zm, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z128r, X86::VBROADCASTI32X2Z128m, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z256r, X86::VBROADCASTI32X2Z256m, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Zr, X86::VBROADCASTI32X2Zm, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Z256rr,X86::VBROADCASTF32X2Z256rm,TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrr, X86::VBROADCASTF32X2Zrm, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rr,X86::VBROADCASTI32X2Z128rm,TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rr,X86::VBROADCASTI32X2Z256rm,TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrr, X86::VBROADCASTI32X2Zrm, TB_NO_REVERSE },
{ X86::VBROADCASTSDYrr, X86::VBROADCASTSDYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rr, X86::VBROADCASTSDZ256rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSYrr, X86::VBROADCASTSSYrm, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256m, TB_NO_REVERSE },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rr, X86::VBROADCASTSSZ128rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rr, X86::VBROADCASTSSZ256rm, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrm, TB_NO_REVERSE },
{ X86::VBROADCASTSSrr, X86::VBROADCASTSSrm, TB_NO_REVERSE },
{ X86::VCOMISDZrr, X86::VCOMISDZrm, 0 },
{ X86::VCOMISDZrr_Int, X86::VCOMISDZrm_Int, TB_NO_REVERSE },
@@ -710,15 +714,23 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VCVTQQ2PSZ128rr, X86::VCVTQQ2PSZ128rm, 0 },
{ X86::VCVTQQ2PSZ256rr, X86::VCVTQQ2PSZ256rm, 0 },
{ X86::VCVTQQ2PSZrr, X86::VCVTQQ2PSZrm, 0 },
+ { X86::VCVTSD2SI64Zrr, X86::VCVTSD2SI64Zrm, 0 },
{ X86::VCVTSD2SI64Zrr_Int, X86::VCVTSD2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SI64rr, X86::VCVTSD2SI64rm, 0 },
{ X86::VCVTSD2SI64rr_Int, X86::VCVTSD2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIZrr, X86::VCVTSD2SIZrm, 0 },
{ X86::VCVTSD2SIZrr_Int, X86::VCVTSD2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSD2SIrr, X86::VCVTSD2SIrm, 0 },
{ X86::VCVTSD2SIrr_Int, X86::VCVTSD2SIrm_Int, TB_NO_REVERSE },
{ X86::VCVTSD2USI64Zrr_Int, X86::VCVTSD2USI64Zrm_Int, TB_NO_REVERSE },
{ X86::VCVTSD2USIZrr_Int, X86::VCVTSD2USIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64Zrr, X86::VCVTSS2SI64Zrm, 0 },
{ X86::VCVTSS2SI64Zrr_Int, X86::VCVTSS2SI64Zrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SI64rr, X86::VCVTSS2SI64rm, 0 },
{ X86::VCVTSS2SI64rr_Int, X86::VCVTSS2SI64rm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIZrr, X86::VCVTSS2SIZrm, 0 },
{ X86::VCVTSS2SIZrr_Int, X86::VCVTSS2SIZrm_Int, TB_NO_REVERSE },
+ { X86::VCVTSS2SIrr, X86::VCVTSS2SIrm, 0 },
{ X86::VCVTSS2SIrr_Int, X86::VCVTSS2SIrm_Int, TB_NO_REVERSE },
{ X86::VCVTSS2USI64Zrr_Int, X86::VCVTSS2USI64Zrm_Int, TB_NO_REVERSE },
{ X86::VCVTSS2USIZrr_Int, X86::VCVTSS2USIZrm_Int, TB_NO_REVERSE },
@@ -906,24 +918,24 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VPABSWZrr, X86::VPABSWZrm, 0 },
{ X86::VPABSWrr, X86::VPABSWrm, 0 },
{ X86::VPBROADCASTBYrr, X86::VPBROADCASTBYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ128r, X86::VPBROADCASTBZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ256r, X86::VPBROADCASTBZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTBZr, X86::VPBROADCASTBZm, TB_NO_REVERSE },
- { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rr, X86::VPBROADCASTBZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rr, X86::VPBROADCASTBZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrr, X86::VPBROADCASTBZrm, TB_NO_REVERSE },
+ { X86::VPBROADCASTBrr, X86::VPBROADCASTBrm, TB_NO_REVERSE },
{ X86::VPBROADCASTDYrr, X86::VPBROADCASTDYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ128r, X86::VPBROADCASTDZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ256r, X86::VPBROADCASTDZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTDZr, X86::VPBROADCASTDZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rr, X86::VPBROADCASTDZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rr, X86::VPBROADCASTDZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrr, X86::VPBROADCASTDZrm, TB_NO_REVERSE },
{ X86::VPBROADCASTDrr, X86::VPBROADCASTDrm, TB_NO_REVERSE },
{ X86::VPBROADCASTQYrr, X86::VPBROADCASTQYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ128r, X86::VPBROADCASTQZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ256r, X86::VPBROADCASTQZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTQZr, X86::VPBROADCASTQZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rr, X86::VPBROADCASTQZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rr, X86::VPBROADCASTQZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrr, X86::VPBROADCASTQZrm, TB_NO_REVERSE },
{ X86::VPBROADCASTQrr, X86::VPBROADCASTQrm, TB_NO_REVERSE },
{ X86::VPBROADCASTWYrr, X86::VPBROADCASTWYrm, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ128r, X86::VPBROADCASTWZ128m, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ256r, X86::VPBROADCASTWZ256m, TB_NO_REVERSE },
- { X86::VPBROADCASTWZr, X86::VPBROADCASTWZm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rr, X86::VPBROADCASTWZ128rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rr, X86::VPBROADCASTWZ256rm, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrr, X86::VPBROADCASTWZrm, TB_NO_REVERSE },
{ X86::VPBROADCASTWrr, X86::VPBROADCASTWrm, TB_NO_REVERSE },
{ X86::VPCMPESTRIrr, X86::VPCMPESTRIrm, 0 },
{ X86::VPCMPESTRMrr, X86::VPCMPESTRMrm, 0 },
@@ -1100,9 +1112,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VPSHUFLWZ256ri, X86::VPSHUFLWZ256mi, 0 },
{ X86::VPSHUFLWZri, X86::VPSHUFLWZmi, 0 },
{ X86::VPSHUFLWri, X86::VPSHUFLWmi, 0 },
- { X86::VPSLLDQZ128rr, X86::VPSLLDQZ128rm, 0 },
- { X86::VPSLLDQZ256rr, X86::VPSLLDQZ256rm, 0 },
- { X86::VPSLLDQZrr, X86::VPSLLDQZrm, 0 },
+ { X86::VPSLLDQZ128ri, X86::VPSLLDQZ128mi, 0 },
+ { X86::VPSLLDQZ256ri, X86::VPSLLDQZ256mi, 0 },
+ { X86::VPSLLDQZri, X86::VPSLLDQZmi, 0 },
{ X86::VPSLLDZ128ri, X86::VPSLLDZ128mi, 0 },
{ X86::VPSLLDZ256ri, X86::VPSLLDZ256mi, 0 },
{ X86::VPSLLDZri, X86::VPSLLDZmi, 0 },
@@ -1121,9 +1133,9 @@ static const X86MemoryFoldTableEntry MemoryFoldTable1[] = {
{ X86::VPSRAWZ128ri, X86::VPSRAWZ128mi, 0 },
{ X86::VPSRAWZ256ri, X86::VPSRAWZ256mi, 0 },
{ X86::VPSRAWZri, X86::VPSRAWZmi, 0 },
- { X86::VPSRLDQZ128rr, X86::VPSRLDQZ128rm, 0 },
- { X86::VPSRLDQZ256rr, X86::VPSRLDQZ256rm, 0 },
- { X86::VPSRLDQZrr, X86::VPSRLDQZrm, 0 },
+ { X86::VPSRLDQZ128ri, X86::VPSRLDQZ128mi, 0 },
+ { X86::VPSRLDQZ256ri, X86::VPSRLDQZ256mi, 0 },
+ { X86::VPSRLDQZri, X86::VPSRLDQZmi, 0 },
{ X86::VPSRLDZ128ri, X86::VPSRLDZ128mi, 0 },
{ X86::VPSRLDZ256ri, X86::VPSRLDZ256mi, 0 },
{ X86::VPSRLDZri, X86::VPSRLDZmi, 0 },
@@ -1609,16 +1621,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VBLENDVPDrr, X86::VBLENDVPDrm, 0 },
{ X86::VBLENDVPSYrr, X86::VBLENDVPSYrm, 0 },
{ X86::VBLENDVPSrr, X86::VBLENDVPSrm, 0 },
- { X86::VBROADCASTF32X2Z256rkz, X86::VBROADCASTF32X2Z256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTF32X2Zrkz, X86::VBROADCASTF32X2Zmkz, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z128rkz, X86::VBROADCASTI32X2Z128mkz, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z256rkz, X86::VBROADCASTI32X2Z256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Zrkz, X86::VBROADCASTI32X2Zmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rkz, X86::VBROADCASTSDZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrkz, X86::VBROADCASTSDZmkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rkz, X86::VBROADCASTSSZ128mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rkz, X86::VBROADCASTSSZ256mkz, TB_NO_REVERSE },
- { X86::VBROADCASTSSZrkz, X86::VBROADCASTSSZmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Z256rrkz, X86::VBROADCASTF32X2Z256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrrkz, X86::VBROADCASTF32X2Zrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rrkz, X86::VBROADCASTI32X2Z128rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rrkz, X86::VBROADCASTI32X2Z256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrrkz, X86::VBROADCASTI32X2Zrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rrkz, X86::VBROADCASTSDZ256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrrkz, X86::VBROADCASTSDZrmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rrkz, X86::VBROADCASTSSZ128rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rrkz, X86::VBROADCASTSSZ256rmkz, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrrkz, X86::VBROADCASTSSZrmkz, TB_NO_REVERSE },
{ X86::VCMPPDYrri, X86::VCMPPDYrmi, 0 },
{ X86::VCMPPDZ128rri, X86::VCMPPDZ128rmi, 0 },
{ X86::VCMPPDZ256rri, X86::VCMPPDZ256rmi, 0 },
@@ -2153,18 +2165,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable2[] = {
{ X86::VPBLENDVBrr, X86::VPBLENDVBrm, 0 },
{ X86::VPBLENDWYrri, X86::VPBLENDWYrmi, 0 },
{ X86::VPBLENDWrri, X86::VPBLENDWrmi, 0 },
- { X86::VPBROADCASTBZ128rkz, X86::VPBROADCASTBZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ256rkz, X86::VPBROADCASTBZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTBZrkz, X86::VPBROADCASTBZmkz, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ128rkz, X86::VPBROADCASTDZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ256rkz, X86::VPBROADCASTDZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTDZrkz, X86::VPBROADCASTDZmkz, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ128rkz, X86::VPBROADCASTQZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ256rkz, X86::VPBROADCASTQZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTQZrkz, X86::VPBROADCASTQZmkz, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ128rkz, X86::VPBROADCASTWZ128mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ256rkz, X86::VPBROADCASTWZ256mkz, TB_NO_REVERSE },
- { X86::VPBROADCASTWZrkz, X86::VPBROADCASTWZmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rrkz, X86::VPBROADCASTBZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rrkz, X86::VPBROADCASTBZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrrkz, X86::VPBROADCASTBZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rrkz, X86::VPBROADCASTDZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rrkz, X86::VPBROADCASTDZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrrkz, X86::VPBROADCASTDZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rrkz, X86::VPBROADCASTQZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rrkz, X86::VPBROADCASTQZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrrkz, X86::VPBROADCASTQZrmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rrkz, X86::VPBROADCASTWZ128rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rrkz, X86::VPBROADCASTWZ256rmkz, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrrkz, X86::VPBROADCASTWZrmkz, TB_NO_REVERSE },
{ X86::VPCLMULQDQYrr, X86::VPCLMULQDQYrm, 0 },
{ X86::VPCLMULQDQZ128rr, X86::VPCLMULQDQZ128rm, 0 },
{ X86::VPCLMULQDQZ256rr, X86::VPCLMULQDQZ256rm, 0 },
@@ -3010,16 +3022,16 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VBLENDMPSZ128rrk, X86::VBLENDMPSZ128rmk, 0 },
{ X86::VBLENDMPSZ256rrk, X86::VBLENDMPSZ256rmk, 0 },
{ X86::VBLENDMPSZrrk, X86::VBLENDMPSZrmk, 0 },
- { X86::VBROADCASTF32X2Z256rk, X86::VBROADCASTF32X2Z256mk, TB_NO_REVERSE },
- { X86::VBROADCASTF32X2Zrk, X86::VBROADCASTF32X2Zmk, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z128rk, X86::VBROADCASTI32X2Z128mk, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Z256rk, X86::VBROADCASTI32X2Z256mk, TB_NO_REVERSE },
- { X86::VBROADCASTI32X2Zrk, X86::VBROADCASTI32X2Zmk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZ256rk, X86::VBROADCASTSDZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSDZrk, X86::VBROADCASTSDZmk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ128rk, X86::VBROADCASTSSZ128mk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZ256rk, X86::VBROADCASTSSZ256mk, TB_NO_REVERSE },
- { X86::VBROADCASTSSZrk, X86::VBROADCASTSSZmk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Z256rrk, X86::VBROADCASTF32X2Z256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTF32X2Zrrk, X86::VBROADCASTF32X2Zrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z128rrk, X86::VBROADCASTI32X2Z128rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Z256rrk, X86::VBROADCASTI32X2Z256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTI32X2Zrrk, X86::VBROADCASTI32X2Zrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZ256rrk, X86::VBROADCASTSDZ256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSDZrrk, X86::VBROADCASTSDZrmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ128rrk, X86::VBROADCASTSSZ128rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZ256rrk, X86::VBROADCASTSSZ256rmk, TB_NO_REVERSE },
+ { X86::VBROADCASTSSZrrk, X86::VBROADCASTSSZrmk, TB_NO_REVERSE },
{ X86::VCMPPDZ128rrik, X86::VCMPPDZ128rmik, 0 },
{ X86::VCMPPDZ256rrik, X86::VCMPPDZ256rmik, 0 },
{ X86::VCMPPDZrrik, X86::VCMPPDZrmik, 0 },
@@ -3662,18 +3674,18 @@ static const X86MemoryFoldTableEntry MemoryFoldTable3[] = {
{ X86::VPBLENDMWZ128rrk, X86::VPBLENDMWZ128rmk, 0 },
{ X86::VPBLENDMWZ256rrk, X86::VPBLENDMWZ256rmk, 0 },
{ X86::VPBLENDMWZrrk, X86::VPBLENDMWZrmk, 0 },
- { X86::VPBROADCASTBZ128rk, X86::VPBROADCASTBZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTBZ256rk, X86::VPBROADCASTBZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTBZrk, X86::VPBROADCASTBZmk, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ128rk, X86::VPBROADCASTDZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTDZ256rk, X86::VPBROADCASTDZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTDZrk, X86::VPBROADCASTDZmk, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ128rk, X86::VPBROADCASTQZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTQZ256rk, X86::VPBROADCASTQZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTQZrk, X86::VPBROADCASTQZmk, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ128rk, X86::VPBROADCASTWZ128mk, TB_NO_REVERSE },
- { X86::VPBROADCASTWZ256rk, X86::VPBROADCASTWZ256mk, TB_NO_REVERSE },
- { X86::VPBROADCASTWZrk, X86::VPBROADCASTWZmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ128rrk, X86::VPBROADCASTBZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZ256rrk, X86::VPBROADCASTBZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTBZrrk, X86::VPBROADCASTBZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ128rrk, X86::VPBROADCASTDZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZ256rrk, X86::VPBROADCASTDZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTDZrrk, X86::VPBROADCASTDZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ128rrk, X86::VPBROADCASTQZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZ256rrk, X86::VPBROADCASTQZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTQZrrk, X86::VPBROADCASTQZrmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ128rrk, X86::VPBROADCASTWZ128rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZ256rrk, X86::VPBROADCASTWZ256rmk, TB_NO_REVERSE },
+ { X86::VPBROADCASTWZrrk, X86::VPBROADCASTWZrmk, TB_NO_REVERSE },
{ X86::VPCMOVYrrr, X86::VPCMOVYrrm, 0 },
{ X86::VPCMOVrrr, X86::VPCMOVrrm, 0 },
{ X86::VPCMPBZ128rrik, X86::VPCMPBZ128rmik, 0 },
@@ -5509,6 +5521,12 @@ static const X86MemoryFoldTableEntry BroadcastFoldTable3[] = {
{ X86::VFNMSUB231PSZ128r, X86::VFNMSUB231PSZ128mb, TB_BCAST_SS },
{ X86::VFNMSUB231PSZ256r, X86::VFNMSUB231PSZ256mb, TB_BCAST_SS },
{ X86::VFNMSUB231PSZr, X86::VFNMSUB231PSZmb, TB_BCAST_SS },
+ { X86::VPTERNLOGDZ128rri, X86::VPTERNLOGDZ128rmbi, TB_BCAST_D },
+ { X86::VPTERNLOGDZ256rri, X86::VPTERNLOGDZ256rmbi, TB_BCAST_D },
+ { X86::VPTERNLOGDZrri, X86::VPTERNLOGDZrmbi, TB_BCAST_D },
+ { X86::VPTERNLOGQZ128rri, X86::VPTERNLOGQZ128rmbi, TB_BCAST_Q },
+ { X86::VPTERNLOGQZ256rri, X86::VPTERNLOGQZ256rmbi, TB_BCAST_Q },
+ { X86::VPTERNLOGQZrri, X86::VPTERNLOGQZrmbi, TB_BCAST_Q },
};
static const X86MemoryFoldTableEntry *
@@ -5517,53 +5535,45 @@ lookupFoldTableImpl(ArrayRef<X86MemoryFoldTableEntry> Table, unsigned RegOp) {
// Make sure the tables are sorted.
static std::atomic<bool> FoldTablesChecked(false);
if (!FoldTablesChecked.load(std::memory_order_relaxed)) {
- assert(std::is_sorted(std::begin(MemoryFoldTable2Addr),
- std::end(MemoryFoldTable2Addr)) &&
+ assert(llvm::is_sorted(MemoryFoldTable2Addr) &&
std::adjacent_find(std::begin(MemoryFoldTable2Addr),
std::end(MemoryFoldTable2Addr)) ==
- std::end(MemoryFoldTable2Addr) &&
+ std::end(MemoryFoldTable2Addr) &&
"MemoryFoldTable2Addr is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable0),
- std::end(MemoryFoldTable0)) &&
+ assert(llvm::is_sorted(MemoryFoldTable0) &&
std::adjacent_find(std::begin(MemoryFoldTable0),
std::end(MemoryFoldTable0)) ==
- std::end(MemoryFoldTable0) &&
+ std::end(MemoryFoldTable0) &&
"MemoryFoldTable0 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable1),
- std::end(MemoryFoldTable1)) &&
+ assert(llvm::is_sorted(MemoryFoldTable1) &&
std::adjacent_find(std::begin(MemoryFoldTable1),
std::end(MemoryFoldTable1)) ==
- std::end(MemoryFoldTable1) &&
+ std::end(MemoryFoldTable1) &&
"MemoryFoldTable1 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable2),
- std::end(MemoryFoldTable2)) &&
+ assert(llvm::is_sorted(MemoryFoldTable2) &&
std::adjacent_find(std::begin(MemoryFoldTable2),
std::end(MemoryFoldTable2)) ==
- std::end(MemoryFoldTable2) &&
+ std::end(MemoryFoldTable2) &&
"MemoryFoldTable2 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable3),
- std::end(MemoryFoldTable3)) &&
+ assert(llvm::is_sorted(MemoryFoldTable3) &&
std::adjacent_find(std::begin(MemoryFoldTable3),
std::end(MemoryFoldTable3)) ==
- std::end(MemoryFoldTable3) &&
+ std::end(MemoryFoldTable3) &&
"MemoryFoldTable3 is not sorted and unique!");
- assert(std::is_sorted(std::begin(MemoryFoldTable4),
- std::end(MemoryFoldTable4)) &&
+ assert(llvm::is_sorted(MemoryFoldTable4) &&
std::adjacent_find(std::begin(MemoryFoldTable4),
std::end(MemoryFoldTable4)) ==
- std::end(MemoryFoldTable4) &&
+ std::end(MemoryFoldTable4) &&
"MemoryFoldTable4 is not sorted and unique!");
- assert(std::is_sorted(std::begin(BroadcastFoldTable2),
- std::end(BroadcastFoldTable2)) &&
+ assert(llvm::is_sorted(BroadcastFoldTable2) &&
std::adjacent_find(std::begin(BroadcastFoldTable2),
std::end(BroadcastFoldTable2)) ==
- std::end(BroadcastFoldTable2) &&
+ std::end(BroadcastFoldTable2) &&
"BroadcastFoldTable2 is not sorted and unique!");
- assert(std::is_sorted(std::begin(BroadcastFoldTable3),
- std::end(BroadcastFoldTable3)) &&
+ assert(llvm::is_sorted(BroadcastFoldTable3) &&
std::adjacent_find(std::begin(BroadcastFoldTable3),
std::end(BroadcastFoldTable3)) ==
- std::end(BroadcastFoldTable3) &&
+ std::end(BroadcastFoldTable3) &&
"BroadcastFoldTable3 is not sorted and unique!");
FoldTablesChecked.store(true, std::memory_order_relaxed);
}
@@ -5639,7 +5649,7 @@ struct X86MemUnfoldTable {
addTableEntry(Entry, TB_INDEX_2 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
for (const X86MemoryFoldTableEntry &Entry : BroadcastFoldTable3)
- // Index 2, folded broadcast
+ // Index 3, folded broadcast
addTableEntry(Entry, TB_INDEX_3 | TB_FOLDED_LOAD | TB_FOLDED_BCAST);
// Sort the memory->reg unfold table.
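Editor's note: the assertions rewritten in this file exist because lookupFoldTableImpl binary-searches each fold table by the register-form opcode, which is only valid when the table is sorted and duplicate-free. Below is a minimal standalone sketch of that lookup pattern, with a hypothetical FoldEntry layout and made-up opcode numbers, and plain std::is_sorted in place of llvm::is_sorted; it is an illustration, not the real implementation.

#include <algorithm>
#include <atomic>
#include <cassert>
#include <cstdint>
#include <cstdio>
#include <iterator>

// Hypothetical, simplified stand-in for X86MemoryFoldTableEntry: keyed by the
// register-form opcode so the table can be binary searched.
struct FoldEntry {
  uint16_t RegOp;
  uint16_t MemOp;
  uint16_t Flags;
  bool operator<(const FoldEntry &RHS) const { return RegOp < RHS.RegOp; }
  bool operator==(const FoldEntry &RHS) const { return RegOp == RHS.RegOp; }
};

// Hypothetical table contents; the real code keeps one table per fold arity.
static const FoldEntry Table[] = {{10, 110, 0}, {20, 120, 0}, {30, 130, 0}};

static const FoldEntry *lookupFold(uint16_t RegOp) {
  // One-time sanity check behind a relaxed atomic, as in the patch: the
  // binary search below is only correct if the table is sorted and unique.
  static std::atomic<bool> Checked(false);
  if (!Checked.load(std::memory_order_relaxed)) {
    assert(std::is_sorted(std::begin(Table), std::end(Table)) &&
           std::adjacent_find(std::begin(Table), std::end(Table)) ==
               std::end(Table) &&
           "fold table is not sorted and unique!");
    Checked.store(true, std::memory_order_relaxed);
  }
  FoldEntry Key = {RegOp, 0, 0};
  const FoldEntry *I =
      std::lower_bound(std::begin(Table), std::end(Table), Key);
  return (I != std::end(Table) && I->RegOp == RegOp) ? I : nullptr;
}

int main() {
  if (const FoldEntry *E = lookupFold(20))
    std::printf("reg op %u folds to mem op %u\n", unsigned(E->RegOp),
                unsigned(E->MemOp));
  return 0;
}

The relaxed atomic only guards against repeating the O(n) debug check; the tables themselves are immutable, so no stronger ordering is needed.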
diff --git a/llvm/lib/Target/X86/X86InstrFoldTables.h b/llvm/lib/Target/X86/X86InstrFoldTables.h
index 7dc236a0d7e4..b7aca27ab2bb 100644
--- a/llvm/lib/Target/X86/X86InstrFoldTables.h
+++ b/llvm/lib/Target/X86/X86InstrFoldTables.h
@@ -13,7 +13,7 @@
#ifndef LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
#define LLVM_LIB_TARGET_X86_X86INSTRFOLDTABLES_H
-#include "llvm/Support/DataTypes.h"
+#include <cstdint>
namespace llvm {
diff --git a/llvm/lib/Target/X86/X86InstrFormats.td b/llvm/lib/Target/X86/X86InstrFormats.td
index 2f797fcfb8de..d7752e656b55 100644
--- a/llvm/lib/Target/X86/X86InstrFormats.td
+++ b/llvm/lib/Target/X86/X86InstrFormats.td
@@ -27,26 +27,33 @@ def RawFrmDstSrc : Format<6>;
def RawFrmImm8 : Format<7>;
def RawFrmImm16 : Format<8>;
def AddCCFrm : Format<9>;
-def MRMDestMem : Format<32>;
-def MRMSrcMem : Format<33>;
-def MRMSrcMem4VOp3 : Format<34>;
-def MRMSrcMemOp4 : Format<35>;
-def MRMSrcMemCC : Format<36>;
-def MRMXmCC: Format<38>;
-def MRMXm : Format<39>;
-def MRM0m : Format<40>; def MRM1m : Format<41>; def MRM2m : Format<42>;
-def MRM3m : Format<43>; def MRM4m : Format<44>; def MRM5m : Format<45>;
-def MRM6m : Format<46>; def MRM7m : Format<47>;
-def MRMDestReg : Format<48>;
-def MRMSrcReg : Format<49>;
-def MRMSrcReg4VOp3 : Format<50>;
-def MRMSrcRegOp4 : Format<51>;
-def MRMSrcRegCC : Format<52>;
-def MRMXrCC: Format<54>;
-def MRMXr : Format<55>;
-def MRM0r : Format<56>; def MRM1r : Format<57>; def MRM2r : Format<58>;
-def MRM3r : Format<59>; def MRM4r : Format<60>; def MRM5r : Format<61>;
-def MRM6r : Format<62>; def MRM7r : Format<63>;
+def PrefixByte : Format<10>;
+def MRMr0 : Format<21>;
+def MRMSrcMemFSIB : Format<22>;
+def MRMDestMemFSIB : Format<23>;
+def MRMDestMem : Format<24>;
+def MRMSrcMem : Format<25>;
+def MRMSrcMem4VOp3 : Format<26>;
+def MRMSrcMemOp4 : Format<27>;
+def MRMSrcMemCC : Format<28>;
+def MRMXmCC: Format<30>;
+def MRMXm : Format<31>;
+def MRM0m : Format<32>; def MRM1m : Format<33>; def MRM2m : Format<34>;
+def MRM3m : Format<35>; def MRM4m : Format<36>; def MRM5m : Format<37>;
+def MRM6m : Format<38>; def MRM7m : Format<39>;
+def MRMDestReg : Format<40>;
+def MRMSrcReg : Format<41>;
+def MRMSrcReg4VOp3 : Format<42>;
+def MRMSrcRegOp4 : Format<43>;
+def MRMSrcRegCC : Format<44>;
+def MRMXrCC: Format<46>;
+def MRMXr : Format<47>;
+def MRM0r : Format<48>; def MRM1r : Format<49>; def MRM2r : Format<50>;
+def MRM3r : Format<51>; def MRM4r : Format<52>; def MRM5r : Format<53>;
+def MRM6r : Format<54>; def MRM7r : Format<55>;
+def MRM0X : Format<56>; def MRM1X : Format<57>; def MRM2X : Format<58>;
+def MRM3X : Format<59>; def MRM4X : Format<60>; def MRM5X : Format<61>;
+def MRM6X : Format<62>; def MRM7X : Format<63>;
def MRM_C0 : Format<64>; def MRM_C1 : Format<65>; def MRM_C2 : Format<66>;
def MRM_C3 : Format<67>; def MRM_C4 : Format<68>; def MRM_C5 : Format<69>;
def MRM_C6 : Format<70>; def MRM_C7 : Format<71>; def MRM_C8 : Format<72>;
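Editor's note: the renumbering above makes room for PrefixByte, MRMr0 and the new FSIB memory formats, shifting every later Format<N> value, and the Format encoding consumed on the C++ side has to move in step with the .td values. A rough compile-time sketch of that invariant, with illustrative enumerator names only (not the real X86 format enum):

#include <cstdint>

// Illustrative mirror of the idea: the Format<N> numbers in the .td file and
// the matching C++-side enumeration must be renumbered together whenever a
// new format such as MRMSrcMemFSIB is inserted.
namespace fmt {
enum Format : uint8_t {
  PrefixByte = 10,
  MRMSrcMemFSIB = 22,
  MRMDestMem = 24,
  MRMSrcMem = 25,
  MRM0m = 32,
  MRM0r = 48,
  MRM0X = 56,
  MRM_C0 = 64,
};
// Cheap compile-time guards against the two tables drifting apart.
static_assert(MRM0r - MRM0m == 16, "register forms follow the memory forms");
static_assert(MRM_C0 == 64, "fixed-ModRM block must keep starting at 64");
} // namespace fmt

int main() { return fmt::MRMSrcMem; }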
diff --git a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
index 3250123e5aa6..f3f7d17d9b3c 100644
--- a/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
+++ b/llvm/lib/Target/X86/X86InstrFragmentsSIMD.td
@@ -59,9 +59,13 @@ def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>;
def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>;
def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>;
def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>;
-def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>;
-def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>;
-def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+def X86comi : SDNode<"X86ISD::COMI", SDTX86FCmp>;
+def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86FCmp>;
+
+def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<0, 1>,
+ SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def X86cmps : SDNode<"X86ISD::FSETCC", SDTX86Cmps>;
+
def X86pshufb : SDNode<"X86ISD::PSHUFB",
SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i8>, SDTCisSameAs<0,1>,
SDTCisSameAs<0,2>]>>;
@@ -535,8 +539,20 @@ def X86any_Fmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
[(X86strict_Fmadd node:$src1, node:$src2, node:$src3),
(X86Fmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fnmadd : SDNode<"X86ISD::FNMADD", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmadd : SDNode<"X86ISD::STRICT_FNMADD", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmadd : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmadd node:$src1, node:$src2, node:$src3),
+ (X86Fnmadd node:$src1, node:$src2, node:$src3)]>;
def X86Fmsub : SDNode<"X86ISD::FMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fmsub : SDNode<"X86ISD::STRICT_FMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fmsub node:$src1, node:$src2, node:$src3),
+ (X86Fmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fnmsub : SDNode<"X86ISD::FNMSUB", SDTFPTernaryOp, [SDNPCommutative]>;
+def X86strict_Fnmsub : SDNode<"X86ISD::STRICT_FNMSUB", SDTFPTernaryOp, [SDNPCommutative, SDNPHasChain]>;
+def X86any_Fnmsub : PatFrags<(ops node:$src1, node:$src2, node:$src3),
+ [(X86strict_Fnmsub node:$src1, node:$src2, node:$src3),
+ (X86Fnmsub node:$src1, node:$src2, node:$src3)]>;
def X86Fmaddsub : SDNode<"X86ISD::FMADDSUB", SDTFPTernaryOp, [SDNPCommutative]>;
def X86Fmsubadd : SDNode<"X86ISD::FMSUBADD", SDTFPTernaryOp, [SDNPCommutative]>;
@@ -709,19 +725,27 @@ def X86mcvtp2UInt : SDNode<"X86ISD::MCVTP2UI", SDTMFloatToInt>;
def X86mcvttp2si : SDNode<"X86ISD::MCVTTP2SI", SDTMFloatToInt>;
def X86mcvttp2ui : SDNode<"X86ISD::MCVTTP2UI", SDTMFloatToInt>;
+def SDTcvtph2ps : SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
+ SDTCVecEltisVT<1, i16>]>;
+def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS", SDTcvtph2ps>;
+def X86strict_cvtph2ps : SDNode<"X86ISD::STRICT_CVTPH2PS", SDTcvtph2ps,
+ [SDNPHasChain]>;
+def X86any_cvtph2ps : PatFrags<(ops node:$src),
+ [(X86strict_cvtph2ps node:$src),
+ (X86cvtph2ps node:$src)]>;
+
+def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE", SDTcvtph2ps>;
+
+def SDTcvtps2ph : SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisVT<2, i32>]>;
+def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH", SDTcvtps2ph>;
+def X86strict_cvtps2ph : SDNode<"X86ISD::STRICT_CVTPS2PH", SDTcvtps2ph,
+ [SDNPHasChain]>;
+def X86any_cvtps2ph : PatFrags<(ops node:$src1, node:$src2),
+ [(X86strict_cvtps2ph node:$src1, node:$src2),
+ (X86cvtps2ph node:$src1, node:$src2)]>;
-def X86cvtph2ps : SDNode<"X86ISD::CVTPH2PS",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, i16>]> >;
-
-def X86cvtph2psSAE : SDNode<"X86ISD::CVTPH2PS_SAE",
- SDTypeProfile<1, 1, [SDTCVecEltisVT<0, f32>,
- SDTCVecEltisVT<1, i16>]> >;
-
-def X86cvtps2ph : SDNode<"X86ISD::CVTPS2PH",
- SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
- SDTCVecEltisVT<1, f32>,
- SDTCisVT<2, i32>]> >;
def X86mcvtps2ph : SDNode<"X86ISD::MCVTPS2PH",
SDTypeProfile<1, 4, [SDTCVecEltisVT<0, i16>,
SDTCVecEltisVT<1, f32>,
@@ -741,7 +765,9 @@ def X86vfproundRnd: SDNode<"X86ISD::VFPROUND_RND",
// cvt fp to bfloat16
def X86cvtne2ps2bf16 : SDNode<"X86ISD::CVTNE2PS2BF16",
- SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisVec<1>,
+ SDTypeProfile<1, 2, [SDTCVecEltisVT<0, i16>,
+ SDTCVecEltisVT<1, f32>,
+ SDTCisSameSizeAs<0,1>,
SDTCisSameAs<1,2>]>>;
def X86mcvtneps2bf16 : SDNode<"X86ISD::MCVTNEPS2BF16",
SDTypeProfile<1, 3, [SDTCVecEltisVT<0, i16>,
@@ -768,23 +794,6 @@ def SDTX86MaskedStore: SDTypeProfile<0, 3, [ // masked store
]>;
//===----------------------------------------------------------------------===//
-// SSE Complex Patterns
-//===----------------------------------------------------------------------===//
-
-// These are 'extloads' from a scalar to the low element of a vector, zeroing
-// the top elements. These are used for the SSE 'ss' and 'sd' instruction
-// forms.
-def sse_load_f32 : ComplexPattern<v4f32, 5, "selectScalarSSELoad", [],
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot, SDNPWantParent]>;
-def sse_load_f64 : ComplexPattern<v2f64, 5, "selectScalarSSELoad", [],
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand,
- SDNPWantRoot, SDNPWantParent]>;
-
-def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
-def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
-
-//===----------------------------------------------------------------------===//
// SSE pattern fragments
//===----------------------------------------------------------------------===//
@@ -895,89 +904,6 @@ def memopv4i32 : PatFrag<(ops node:$ptr), (v4i32 (memop node:$ptr))>;
def memopv8i16 : PatFrag<(ops node:$ptr), (v8i16 (memop node:$ptr))>;
def memopv16i8 : PatFrag<(ops node:$ptr), (v16i8 (memop node:$ptr))>;
-def X86masked_gather : SDNode<"X86ISD::MGATHER",
- SDTypeProfile<2, 3, [SDTCisVec<0>,
- SDTCisVec<1>, SDTCisInt<1>,
- SDTCisSameAs<0, 2>,
- SDTCisSameAs<1, 3>,
- SDTCisPtrTy<4>]>,
- [SDNPHasChain, SDNPMayLoad, SDNPMemOperand]>;
-
-def X86masked_scatter : SDNode<"X86ISD::MSCATTER",
- SDTypeProfile<1, 3, [SDTCisVec<0>, SDTCisVec<1>,
- SDTCisSameAs<0, 2>,
- SDTCVecEltisVT<0, i1>,
- SDTCisPtrTy<3>]>,
- [SDNPHasChain, SDNPMayStore, SDNPMemOperand]>;
-
-def mgatherv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v4i32;
-}]>;
-
-def mgatherv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v8i32;
-}]>;
-
-def mgatherv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v2i64;
-}]>;
-def mgatherv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v4i64;
-}]>;
-def mgatherv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v8i64;
-}]>;
-def mgatherv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_gather node:$src1, node:$src2, node:$src3) , [{
- X86MaskedGatherSDNode *Mgt = cast<X86MaskedGatherSDNode>(N);
- return Mgt->getIndex().getValueType() == MVT::v16i32;
-}]>;
-
-def mscatterv2i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v2i64;
-}]>;
-
-def mscatterv4i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v4i32;
-}]>;
-
-def mscatterv4i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v4i64;
-}]>;
-
-def mscatterv8i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v8i32;
-}]>;
-
-def mscatterv8i64 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v8i64;
-}]>;
-def mscatterv16i32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
- (X86masked_scatter node:$src1, node:$src2, node:$src3) , [{
- X86MaskedScatterSDNode *Sc = cast<X86MaskedScatterSDNode>(N);
- return Sc->getIndex().getValueType() == MVT::v16i32;
-}]>;
-
// 128-bit bitconvert pattern fragments
def bc_v4f32 : PatFrag<(ops node:$in), (v4f32 (bitconvert node:$in))>;
def bc_v2f64 : PatFrag<(ops node:$in), (v2f64 (bitconvert node:$in))>;
@@ -1037,6 +963,23 @@ def X86VBroadcastld64 : PatFrag<(ops node:$src),
return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getStoreSize() == 8;
}]>;
+// Scalar SSE intrinsic fragments to match several different types of loads.
+// Used by scalar SSE intrinsic instructions which have 128 bit types, but
+// only load a single element.
+// FIXME: We should add more canonicalizing in DAGCombine. Particularly removing
+// the simple_load case.
+def sse_load_f32 : PatFrags<(ops node:$ptr),
+ [(v4f32 (simple_load node:$ptr)),
+ (v4f32 (X86vzload32 node:$ptr)),
+ (v4f32 (scalar_to_vector (loadf32 node:$ptr)))]>;
+def sse_load_f64 : PatFrags<(ops node:$ptr),
+ [(v2f64 (simple_load node:$ptr)),
+ (v2f64 (X86vzload64 node:$ptr)),
+ (v2f64 (scalar_to_vector (loadf64 node:$ptr)))]>;
+
+def ssmem : X86MemOperand<"printdwordmem", X86Mem32AsmOperand>;
+def sdmem : X86MemOperand<"printqwordmem", X86Mem64AsmOperand>;
+
def fp32imm0 : PatLeaf<(f32 fpimm), [{
return N->isExactlyValue(+0.0);
@@ -1185,60 +1128,60 @@ def X86MTruncUSStore : SDNode<"X86ISD::VMTRUNCSTOREUS", SDTX86MaskedStore,
def truncstore_s_vi8 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncSStore node:$val, node:$ptr), [{
- return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def truncstore_us_vi8 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncUSStore node:$val, node:$ptr), [{
- return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def truncstore_s_vi16 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncSStore node:$val, node:$ptr), [{
- return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def truncstore_us_vi16 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncUSStore node:$val, node:$ptr), [{
- return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def truncstore_s_vi32 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncSStore node:$val, node:$ptr), [{
- return cast<TruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
def truncstore_us_vi32 : PatFrag<(ops node:$val, node:$ptr),
(X86TruncUSStore node:$val, node:$ptr), [{
- return cast<TruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
def masked_truncstore_s_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def masked_truncstore_us_vi8 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i8;
}]>;
def masked_truncstore_s_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def masked_truncstore_us_vi16 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i16;
}]>;
def masked_truncstore_s_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
def masked_truncstore_us_vi32 : PatFrag<(ops node:$src1, node:$src2, node:$src3),
(X86MTruncUSStore node:$src1, node:$src2, node:$src3), [{
- return cast<MaskedTruncUSStoreSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
+ return cast<MemIntrinsicSDNode>(N)->getMemoryVT().getScalarType() == MVT::i32;
}]>;
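Editor's note: the PatFrag predicates above now cast to MemIntrinsicSDNode instead of the per-flavor TruncSStoreSDNode / MaskedTruncUSStoreSDNode classes; only getMemoryVT() is consulted, so the common base class is sufficient. Below is a minimal standalone sketch of that pattern using hypothetical stand-in types, not the real SDNode hierarchy:

#include <cstdio>

// Miniature, hypothetical version of the hierarchy touched above: the memory
// VT getter lives on the common base, so a predicate that only inspects it
// can take the base type instead of each concrete store node class.
enum class VT { i8, i16, i32 };

struct MemIntrinsicNode {                      // stands in for MemIntrinsicSDNode
  VT MemVT = VT::i32;
  VT getMemoryVT() const { return MemVT; }
};
struct TruncSStoreNode : MemIntrinsicNode {};  // stands in for TruncSStoreSDNode
struct TruncUSStoreNode : MemIntrinsicNode {}; // stands in for TruncUSStoreSDNode

// Mirrors the rewritten predicates: only the scalar memory VT matters, so one
// check works for every truncating-store flavor.
static bool isTruncStoreVi8(const MemIntrinsicNode *N) {
  return N->getMemoryVT() == VT::i8;
}

int main() {
  TruncSStoreNode S;
  S.MemVT = VT::i8;
  TruncUSStoreNode U;
  U.MemVT = VT::i16;
  std::printf("%d %d\n", isTruncStoreVi8(&S), isTruncStoreVi8(&U));
  return 0;
}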
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 245346d82731..42c111173570 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -88,7 +88,7 @@ X86InstrInfo::X86InstrInfo(X86Subtarget &STI)
bool
X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
- unsigned &SrcReg, unsigned &DstReg,
+ Register &SrcReg, Register &DstReg,
unsigned &SubIdx) const {
switch (MI.getOpcode()) {
default: break;
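Editor's note: the unsigned-to-Register signature change above recurs throughout this file (reMaterialize, canInsertSelect, insertSelect below). As a hedged, standalone sketch of why such a migration stays source compatible, here is a hypothetical mini Register wrapper; llvm::Register itself has more helpers and a defined virtual/physical encoding that this does not model.

#include <cstdio>

// A thin wrapper that still converts to and from unsigned, so existing call
// sites keep compiling while the parameter gains a distinct type.
class Register {
  unsigned Reg = 0;

public:
  Register() = default;
  Register(unsigned R) : Reg(R) {}           // implicit from a raw encoding
  operator unsigned() const { return Reg; }  // implicit back to unsigned
};

// Mirrors the new out-parameter style of isCoalescableExtInstr().
static void pickSourceReg(Register &SrcReg) { SrcReg = Register(3); }

int main() {
  Register Src;
  pickSourceReg(Src);
  unsigned Raw = Src;  // still usable where a plain unsigned is expected
  std::printf("src reg = %u\n", Raw);
  return 0;
}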
@@ -135,13 +135,497 @@ X86InstrInfo::isCoalescableExtInstr(const MachineInstr &MI,
return false;
}
+bool X86InstrInfo::isDataInvariant(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the instruction is not data invariant.
+ return false;
+
+ // Some target-independent operations that trivially lower to data-invariant
+ // instructions.
+ case TargetOpcode::COPY:
+ case TargetOpcode::INSERT_SUBREG:
+ case TargetOpcode::SUBREG_TO_REG:
+ return true;
+
+ // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+ // However, they set flags and are perhaps the most surprisingly constant
+ // time operations so we call them out here separately.
+ case X86::IMUL16rr:
+ case X86::IMUL16rri8:
+ case X86::IMUL16rri:
+ case X86::IMUL32rr:
+ case X86::IMUL32rri8:
+ case X86::IMUL32rri:
+ case X86::IMUL64rr:
+ case X86::IMUL64rri32:
+ case X86::IMUL64rri8:
+
+ // Bit scanning and counting instructions that are somewhat surprisingly
+ // constant time as they scan across bits and do other fairly complex
+ // operations like popcnt, but are believed to be constant time on x86.
+ // However, these set flags.
+ case X86::BSF16rr:
+ case X86::BSF32rr:
+ case X86::BSF64rr:
+ case X86::BSR16rr:
+ case X86::BSR32rr:
+ case X86::BSR64rr:
+ case X86::LZCNT16rr:
+ case X86::LZCNT32rr:
+ case X86::LZCNT64rr:
+ case X86::POPCNT16rr:
+ case X86::POPCNT32rr:
+ case X86::POPCNT64rr:
+ case X86::TZCNT16rr:
+ case X86::TZCNT32rr:
+ case X86::TZCNT64rr:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rr:
+ case X86::BLCFILL64rr:
+ case X86::BLCI32rr:
+ case X86::BLCI64rr:
+ case X86::BLCIC32rr:
+ case X86::BLCIC64rr:
+ case X86::BLCMSK32rr:
+ case X86::BLCMSK64rr:
+ case X86::BLCS32rr:
+ case X86::BLCS64rr:
+ case X86::BLSFILL32rr:
+ case X86::BLSFILL64rr:
+ case X86::BLSI32rr:
+ case X86::BLSI64rr:
+ case X86::BLSIC32rr:
+ case X86::BLSIC64rr:
+ case X86::BLSMSK32rr:
+ case X86::BLSMSK64rr:
+ case X86::BLSR32rr:
+ case X86::BLSR64rr:
+ case X86::TZMSK32rr:
+ case X86::TZMSK64rr:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rr:
+ case X86::BEXTR64rr:
+ case X86::BEXTRI32ri:
+ case X86::BEXTRI64ri:
+ case X86::BZHI32rr:
+ case X86::BZHI64rr:
+
+ // Shift and rotate.
+ case X86::ROL8r1:
+ case X86::ROL16r1:
+ case X86::ROL32r1:
+ case X86::ROL64r1:
+ case X86::ROL8rCL:
+ case X86::ROL16rCL:
+ case X86::ROL32rCL:
+ case X86::ROL64rCL:
+ case X86::ROL8ri:
+ case X86::ROL16ri:
+ case X86::ROL32ri:
+ case X86::ROL64ri:
+ case X86::ROR8r1:
+ case X86::ROR16r1:
+ case X86::ROR32r1:
+ case X86::ROR64r1:
+ case X86::ROR8rCL:
+ case X86::ROR16rCL:
+ case X86::ROR32rCL:
+ case X86::ROR64rCL:
+ case X86::ROR8ri:
+ case X86::ROR16ri:
+ case X86::ROR32ri:
+ case X86::ROR64ri:
+ case X86::SAR8r1:
+ case X86::SAR16r1:
+ case X86::SAR32r1:
+ case X86::SAR64r1:
+ case X86::SAR8rCL:
+ case X86::SAR16rCL:
+ case X86::SAR32rCL:
+ case X86::SAR64rCL:
+ case X86::SAR8ri:
+ case X86::SAR16ri:
+ case X86::SAR32ri:
+ case X86::SAR64ri:
+ case X86::SHL8r1:
+ case X86::SHL16r1:
+ case X86::SHL32r1:
+ case X86::SHL64r1:
+ case X86::SHL8rCL:
+ case X86::SHL16rCL:
+ case X86::SHL32rCL:
+ case X86::SHL64rCL:
+ case X86::SHL8ri:
+ case X86::SHL16ri:
+ case X86::SHL32ri:
+ case X86::SHL64ri:
+ case X86::SHR8r1:
+ case X86::SHR16r1:
+ case X86::SHR32r1:
+ case X86::SHR64r1:
+ case X86::SHR8rCL:
+ case X86::SHR16rCL:
+ case X86::SHR32rCL:
+ case X86::SHR64rCL:
+ case X86::SHR8ri:
+ case X86::SHR16ri:
+ case X86::SHR32ri:
+ case X86::SHR64ri:
+ case X86::SHLD16rrCL:
+ case X86::SHLD32rrCL:
+ case X86::SHLD64rrCL:
+ case X86::SHLD16rri8:
+ case X86::SHLD32rri8:
+ case X86::SHLD64rri8:
+ case X86::SHRD16rrCL:
+ case X86::SHRD32rrCL:
+ case X86::SHRD64rrCL:
+ case X86::SHRD16rri8:
+ case X86::SHRD32rri8:
+ case X86::SHRD64rri8:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rr:
+ case X86::ADC8ri:
+ case X86::ADC16rr:
+ case X86::ADC16ri:
+ case X86::ADC16ri8:
+ case X86::ADC32rr:
+ case X86::ADC32ri:
+ case X86::ADC32ri8:
+ case X86::ADC64rr:
+ case X86::ADC64ri8:
+ case X86::ADC64ri32:
+ case X86::ADD8rr:
+ case X86::ADD8ri:
+ case X86::ADD16rr:
+ case X86::ADD16ri:
+ case X86::ADD16ri8:
+ case X86::ADD32rr:
+ case X86::ADD32ri:
+ case X86::ADD32ri8:
+ case X86::ADD64rr:
+ case X86::ADD64ri8:
+ case X86::ADD64ri32:
+ case X86::AND8rr:
+ case X86::AND8ri:
+ case X86::AND16rr:
+ case X86::AND16ri:
+ case X86::AND16ri8:
+ case X86::AND32rr:
+ case X86::AND32ri:
+ case X86::AND32ri8:
+ case X86::AND64rr:
+ case X86::AND64ri8:
+ case X86::AND64ri32:
+ case X86::OR8rr:
+ case X86::OR8ri:
+ case X86::OR16rr:
+ case X86::OR16ri:
+ case X86::OR16ri8:
+ case X86::OR32rr:
+ case X86::OR32ri:
+ case X86::OR32ri8:
+ case X86::OR64rr:
+ case X86::OR64ri8:
+ case X86::OR64ri32:
+ case X86::SBB8rr:
+ case X86::SBB8ri:
+ case X86::SBB16rr:
+ case X86::SBB16ri:
+ case X86::SBB16ri8:
+ case X86::SBB32rr:
+ case X86::SBB32ri:
+ case X86::SBB32ri8:
+ case X86::SBB64rr:
+ case X86::SBB64ri8:
+ case X86::SBB64ri32:
+ case X86::SUB8rr:
+ case X86::SUB8ri:
+ case X86::SUB16rr:
+ case X86::SUB16ri:
+ case X86::SUB16ri8:
+ case X86::SUB32rr:
+ case X86::SUB32ri:
+ case X86::SUB32ri8:
+ case X86::SUB64rr:
+ case X86::SUB64ri8:
+ case X86::SUB64ri32:
+ case X86::XOR8rr:
+ case X86::XOR8ri:
+ case X86::XOR16rr:
+ case X86::XOR16ri:
+ case X86::XOR16ri8:
+ case X86::XOR32rr:
+ case X86::XOR32ri:
+ case X86::XOR32ri8:
+ case X86::XOR64rr:
+ case X86::XOR64ri8:
+ case X86::XOR64ri32:
+ // Arithmetic with just 32-bit and 64-bit variants and no immediates.
+ case X86::ADCX32rr:
+ case X86::ADCX64rr:
+ case X86::ADOX32rr:
+ case X86::ADOX64rr:
+ case X86::ANDN32rr:
+ case X86::ANDN64rr:
+ // Unary arithmetic operations.
+ case X86::DEC8r:
+ case X86::DEC16r:
+ case X86::DEC32r:
+ case X86::DEC64r:
+ case X86::INC8r:
+ case X86::INC16r:
+ case X86::INC32r:
+ case X86::INC64r:
+ case X86::NEG8r:
+ case X86::NEG16r:
+ case X86::NEG32r:
+ case X86::NEG64r:
+
+ // Unlike other arithmetic, NOT doesn't set EFLAGS.
+ case X86::NOT8r:
+ case X86::NOT16r:
+ case X86::NOT32r:
+ case X86::NOT64r:
+
+ // Various move instructions used to zero or sign extend things. Note that we
+ // intentionally don't support the _NOREX variants as we can't handle that
+ // register constraint anyways.
+ case X86::MOVSX16rr8:
+ case X86::MOVSX32rr8:
+ case X86::MOVSX32rr16:
+ case X86::MOVSX64rr8:
+ case X86::MOVSX64rr16:
+ case X86::MOVSX64rr32:
+ case X86::MOVZX16rr8:
+ case X86::MOVZX32rr8:
+ case X86::MOVZX32rr16:
+ case X86::MOVZX64rr8:
+ case X86::MOVZX64rr16:
+ case X86::MOV32rr:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32ri:
+ case X86::RORX64ri:
+ case X86::SARX32rr:
+ case X86::SARX64rr:
+ case X86::SHLX32rr:
+ case X86::SHLX64rr:
+ case X86::SHRX32rr:
+ case X86::SHRX64rr:
+
+ // LEA doesn't actually access memory, and its arithmetic is constant time.
+ case X86::LEA16r:
+ case X86::LEA32r:
+ case X86::LEA64_32r:
+ case X86::LEA64r:
+ return true;
+ }
+}
+
+bool X86InstrInfo::isDataInvariantLoad(MachineInstr &MI) {
+ switch (MI.getOpcode()) {
+ default:
+ // By default, assume that the load will immediately leak.
+ return false;
+
+ // On x86 it is believed that imul is constant time w.r.t. the loaded data.
+ // However, they set flags and are perhaps the most surprisingly constant
+ // time operations so we call them out here separately.
+ case X86::IMUL16rm:
+ case X86::IMUL16rmi8:
+ case X86::IMUL16rmi:
+ case X86::IMUL32rm:
+ case X86::IMUL32rmi8:
+ case X86::IMUL32rmi:
+ case X86::IMUL64rm:
+ case X86::IMUL64rmi32:
+ case X86::IMUL64rmi8:
+
+ // Bit scanning and counting instructions that are somewhat surprisingly
+ // constant time as they scan across bits and do other fairly complex
+ // operations like popcnt, but are believed to be constant time on x86.
+ // However, these set flags.
+ case X86::BSF16rm:
+ case X86::BSF32rm:
+ case X86::BSF64rm:
+ case X86::BSR16rm:
+ case X86::BSR32rm:
+ case X86::BSR64rm:
+ case X86::LZCNT16rm:
+ case X86::LZCNT32rm:
+ case X86::LZCNT64rm:
+ case X86::POPCNT16rm:
+ case X86::POPCNT32rm:
+ case X86::POPCNT64rm:
+ case X86::TZCNT16rm:
+ case X86::TZCNT32rm:
+ case X86::TZCNT64rm:
+
+ // Bit manipulation instructions are effectively combinations of basic
+ // arithmetic ops, and should still execute in constant time. These also
+ // set flags.
+ case X86::BLCFILL32rm:
+ case X86::BLCFILL64rm:
+ case X86::BLCI32rm:
+ case X86::BLCI64rm:
+ case X86::BLCIC32rm:
+ case X86::BLCIC64rm:
+ case X86::BLCMSK32rm:
+ case X86::BLCMSK64rm:
+ case X86::BLCS32rm:
+ case X86::BLCS64rm:
+ case X86::BLSFILL32rm:
+ case X86::BLSFILL64rm:
+ case X86::BLSI32rm:
+ case X86::BLSI64rm:
+ case X86::BLSIC32rm:
+ case X86::BLSIC64rm:
+ case X86::BLSMSK32rm:
+ case X86::BLSMSK64rm:
+ case X86::BLSR32rm:
+ case X86::BLSR64rm:
+ case X86::TZMSK32rm:
+ case X86::TZMSK64rm:
+
+ // Bit extracting and clearing instructions should execute in constant time,
+ // and set flags.
+ case X86::BEXTR32rm:
+ case X86::BEXTR64rm:
+ case X86::BEXTRI32mi:
+ case X86::BEXTRI64mi:
+ case X86::BZHI32rm:
+ case X86::BZHI64rm:
+
+ // Basic arithmetic is constant time on the input but does set flags.
+ case X86::ADC8rm:
+ case X86::ADC16rm:
+ case X86::ADC32rm:
+ case X86::ADC64rm:
+ case X86::ADCX32rm:
+ case X86::ADCX64rm:
+ case X86::ADD8rm:
+ case X86::ADD16rm:
+ case X86::ADD32rm:
+ case X86::ADD64rm:
+ case X86::ADOX32rm:
+ case X86::ADOX64rm:
+ case X86::AND8rm:
+ case X86::AND16rm:
+ case X86::AND32rm:
+ case X86::AND64rm:
+ case X86::ANDN32rm:
+ case X86::ANDN64rm:
+ case X86::OR8rm:
+ case X86::OR16rm:
+ case X86::OR32rm:
+ case X86::OR64rm:
+ case X86::SBB8rm:
+ case X86::SBB16rm:
+ case X86::SBB32rm:
+ case X86::SBB64rm:
+ case X86::SUB8rm:
+ case X86::SUB16rm:
+ case X86::SUB32rm:
+ case X86::SUB64rm:
+ case X86::XOR8rm:
+ case X86::XOR16rm:
+ case X86::XOR32rm:
+ case X86::XOR64rm:
+
+ // Integer multiply w/o affecting flags is still believed to be constant
+ // time on x86. Called out separately as this is among the most surprising
+ // instructions to exhibit that behavior.
+ case X86::MULX32rm:
+ case X86::MULX64rm:
+
+ // Arithmetic instructions that are both constant time and don't set flags.
+ case X86::RORX32mi:
+ case X86::RORX64mi:
+ case X86::SARX32rm:
+ case X86::SARX64rm:
+ case X86::SHLX32rm:
+ case X86::SHLX64rm:
+ case X86::SHRX32rm:
+ case X86::SHRX64rm:
+
+ // Conversions are believed to be constant time and don't set flags.
+ case X86::CVTTSD2SI64rm:
+ case X86::VCVTTSD2SI64rm:
+ case X86::VCVTTSD2SI64Zrm:
+ case X86::CVTTSD2SIrm:
+ case X86::VCVTTSD2SIrm:
+ case X86::VCVTTSD2SIZrm:
+ case X86::CVTTSS2SI64rm:
+ case X86::VCVTTSS2SI64rm:
+ case X86::VCVTTSS2SI64Zrm:
+ case X86::CVTTSS2SIrm:
+ case X86::VCVTTSS2SIrm:
+ case X86::VCVTTSS2SIZrm:
+ case X86::CVTSI2SDrm:
+ case X86::VCVTSI2SDrm:
+ case X86::VCVTSI2SDZrm:
+ case X86::CVTSI2SSrm:
+ case X86::VCVTSI2SSrm:
+ case X86::VCVTSI2SSZrm:
+ case X86::CVTSI642SDrm:
+ case X86::VCVTSI642SDrm:
+ case X86::VCVTSI642SDZrm:
+ case X86::CVTSI642SSrm:
+ case X86::VCVTSI642SSrm:
+ case X86::VCVTSI642SSZrm:
+ case X86::CVTSS2SDrm:
+ case X86::VCVTSS2SDrm:
+ case X86::VCVTSS2SDZrm:
+ case X86::CVTSD2SSrm:
+ case X86::VCVTSD2SSrm:
+ case X86::VCVTSD2SSZrm:
+ // AVX512 added unsigned integer conversions.
+ case X86::VCVTTSD2USI64Zrm:
+ case X86::VCVTTSD2USIZrm:
+ case X86::VCVTTSS2USI64Zrm:
+ case X86::VCVTTSS2USIZrm:
+ case X86::VCVTUSI2SDZrm:
+ case X86::VCVTUSI642SDZrm:
+ case X86::VCVTUSI2SSZrm:
+ case X86::VCVTUSI642SSZrm:
+
+ // Loads to register don't set flags.
+ case X86::MOV8rm:
+ case X86::MOV8rm_NOREX:
+ case X86::MOV16rm:
+ case X86::MOV32rm:
+ case X86::MOV64rm:
+ case X86::MOVSX16rm8:
+ case X86::MOVSX32rm16:
+ case X86::MOVSX32rm8:
+ case X86::MOVSX32rm8_NOREX:
+ case X86::MOVSX64rm16:
+ case X86::MOVSX64rm32:
+ case X86::MOVSX64rm8:
+ case X86::MOVZX16rm8:
+ case X86::MOVZX32rm16:
+ case X86::MOVZX32rm8:
+ case X86::MOVZX32rm8_NOREX:
+ case X86::MOVZX64rm16:
+ case X86::MOVZX64rm8:
+ return true;
+ }
+}
+
int X86InstrInfo::getSPAdjust(const MachineInstr &MI) const {
const MachineFunction *MF = MI.getParent()->getParent();
const TargetFrameLowering *TFI = MF->getSubtarget().getFrameLowering();
if (isFrameInstr(MI)) {
- unsigned StackAlign = TFI->getStackAlignment();
- int SPAdj = alignTo(getFrameSize(MI), StackAlign);
+ int SPAdj = alignTo(getFrameSize(MI), TFI->getStackAlign());
SPAdj -= getFrameAdjustment(MI);
if (!isFrameSetup(MI))
SPAdj = -SPAdj;
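Editor's note: the isDataInvariant and isDataInvariantLoad predicates added earlier in this hunk are allow-lists, apparently intended for hardening passes such as speculative load hardening: any opcode not explicitly listed is conservatively assumed to have data-dependent timing. A minimal standalone sketch of the same switch-with-fallthrough shape, using a hypothetical Opcode enum rather than real X86:: opcode values:

#include <cstdio>

// Hypothetical opcodes standing in for X86::* values; the real predicates
// above switch over MachineInstr::getOpcode() instead.
enum Opcode { Add32rr, Imul32rr, Lea64r, Div32r, Mov32rm };

// Same shape as the new isDataInvariant(): group the opcodes believed to be
// constant time w.r.t. their inputs and fall through to one `return true`;
// everything else is conservatively treated as potentially data dependent.
static bool isDataInvariant(Opcode Op) {
  switch (Op) {
  default:
    return false;
  case Add32rr:   // basic ALU op: constant time, but sets flags
  case Imul32rr:  // integer multiply: believed constant time on x86
  case Lea64r:    // address arithmetic, no memory access
    return true;
  }
}

int main() {
  const Opcode Ops[] = {Add32rr, Div32r, Lea64r};
  for (Opcode Op : Ops)
    std::printf("opcode %d data-invariant: %s\n", int(Op),
                isDataInvariant(Op) ? "yes" : "no");
  return 0;
}

Keeping the default case as "not invariant" means a newly added opcode can never silently widen the allow-list; it must be reviewed and listed explicitly.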
@@ -639,7 +1123,7 @@ bool X86InstrInfo::isReallyTriviallyReMaterializable(const MachineInstr &MI,
void X86InstrInfo::reMaterialize(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const {
bool ClobbersEFLAGS = Orig.modifiesRegister(X86::EFLAGS, &TRI);
@@ -1182,61 +1666,61 @@ X86InstrInfo::convertToThreeAddress(MachineFunction::iterator &MFI,
case X86::VMOVUPSZ128rmk: case X86::VMOVAPSZ128rmk:
case X86::VMOVUPSZ256rmk: case X86::VMOVAPSZ256rmk:
case X86::VMOVUPSZrmk: case X86::VMOVAPSZrmk:
- case X86::VBROADCASTSDZ256mk:
- case X86::VBROADCASTSDZmk:
- case X86::VBROADCASTSSZ128mk:
- case X86::VBROADCASTSSZ256mk:
- case X86::VBROADCASTSSZmk:
- case X86::VPBROADCASTDZ128mk:
- case X86::VPBROADCASTDZ256mk:
- case X86::VPBROADCASTDZmk:
- case X86::VPBROADCASTQZ128mk:
- case X86::VPBROADCASTQZ256mk:
- case X86::VPBROADCASTQZmk: {
+ case X86::VBROADCASTSDZ256rmk:
+ case X86::VBROADCASTSDZrmk:
+ case X86::VBROADCASTSSZ128rmk:
+ case X86::VBROADCASTSSZ256rmk:
+ case X86::VBROADCASTSSZrmk:
+ case X86::VPBROADCASTDZ128rmk:
+ case X86::VPBROADCASTDZ256rmk:
+ case X86::VPBROADCASTDZrmk:
+ case X86::VPBROADCASTQZ128rmk:
+ case X86::VPBROADCASTQZ256rmk:
+ case X86::VPBROADCASTQZrmk: {
unsigned Opc;
switch (MIOpc) {
default: llvm_unreachable("Unreachable!");
- case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
- case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
- case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
- case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
- case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
- case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
- case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
- case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
- case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
- case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
- case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
- case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
- case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
- case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
- case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
- case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
- case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
- case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
- case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
- case X86::VBROADCASTSDZ256mk: Opc = X86::VBLENDMPDZ256rmbk; break;
- case X86::VBROADCASTSDZmk: Opc = X86::VBLENDMPDZrmbk; break;
- case X86::VBROADCASTSSZ128mk: Opc = X86::VBLENDMPSZ128rmbk; break;
- case X86::VBROADCASTSSZ256mk: Opc = X86::VBLENDMPSZ256rmbk; break;
- case X86::VBROADCASTSSZmk: Opc = X86::VBLENDMPSZrmbk; break;
- case X86::VPBROADCASTDZ128mk: Opc = X86::VPBLENDMDZ128rmbk; break;
- case X86::VPBROADCASTDZ256mk: Opc = X86::VPBLENDMDZ256rmbk; break;
- case X86::VPBROADCASTDZmk: Opc = X86::VPBLENDMDZrmbk; break;
- case X86::VPBROADCASTQZ128mk: Opc = X86::VPBLENDMQZ128rmbk; break;
- case X86::VPBROADCASTQZ256mk: Opc = X86::VPBLENDMQZ256rmbk; break;
- case X86::VPBROADCASTQZmk: Opc = X86::VPBLENDMQZrmbk; break;
+ case X86::VMOVDQU8Z128rmk: Opc = X86::VPBLENDMBZ128rmk; break;
+ case X86::VMOVDQU8Z256rmk: Opc = X86::VPBLENDMBZ256rmk; break;
+ case X86::VMOVDQU8Zrmk: Opc = X86::VPBLENDMBZrmk; break;
+ case X86::VMOVDQU16Z128rmk: Opc = X86::VPBLENDMWZ128rmk; break;
+ case X86::VMOVDQU16Z256rmk: Opc = X86::VPBLENDMWZ256rmk; break;
+ case X86::VMOVDQU16Zrmk: Opc = X86::VPBLENDMWZrmk; break;
+ case X86::VMOVDQU32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQU32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQU32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQU64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQU64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQU64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVUPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVUPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVUPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVUPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVUPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVUPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VMOVDQA32Z128rmk: Opc = X86::VPBLENDMDZ128rmk; break;
+ case X86::VMOVDQA32Z256rmk: Opc = X86::VPBLENDMDZ256rmk; break;
+ case X86::VMOVDQA32Zrmk: Opc = X86::VPBLENDMDZrmk; break;
+ case X86::VMOVDQA64Z128rmk: Opc = X86::VPBLENDMQZ128rmk; break;
+ case X86::VMOVDQA64Z256rmk: Opc = X86::VPBLENDMQZ256rmk; break;
+ case X86::VMOVDQA64Zrmk: Opc = X86::VPBLENDMQZrmk; break;
+ case X86::VMOVAPDZ128rmk: Opc = X86::VBLENDMPDZ128rmk; break;
+ case X86::VMOVAPDZ256rmk: Opc = X86::VBLENDMPDZ256rmk; break;
+ case X86::VMOVAPDZrmk: Opc = X86::VBLENDMPDZrmk; break;
+ case X86::VMOVAPSZ128rmk: Opc = X86::VBLENDMPSZ128rmk; break;
+ case X86::VMOVAPSZ256rmk: Opc = X86::VBLENDMPSZ256rmk; break;
+ case X86::VMOVAPSZrmk: Opc = X86::VBLENDMPSZrmk; break;
+ case X86::VBROADCASTSDZ256rmk: Opc = X86::VBLENDMPDZ256rmbk; break;
+ case X86::VBROADCASTSDZrmk: Opc = X86::VBLENDMPDZrmbk; break;
+ case X86::VBROADCASTSSZ128rmk: Opc = X86::VBLENDMPSZ128rmbk; break;
+ case X86::VBROADCASTSSZ256rmk: Opc = X86::VBLENDMPSZ256rmbk; break;
+ case X86::VBROADCASTSSZrmk: Opc = X86::VBLENDMPSZrmbk; break;
+ case X86::VPBROADCASTDZ128rmk: Opc = X86::VPBLENDMDZ128rmbk; break;
+ case X86::VPBROADCASTDZ256rmk: Opc = X86::VPBLENDMDZ256rmbk; break;
+ case X86::VPBROADCASTDZrmk: Opc = X86::VPBLENDMDZrmbk; break;
+ case X86::VPBROADCASTQZ128rmk: Opc = X86::VPBLENDMQZ128rmbk; break;
+ case X86::VPBROADCASTQZ256rmk: Opc = X86::VPBLENDMQZ256rmbk; break;
+ case X86::VPBROADCASTQZrmk: Opc = X86::VPBLENDMQZrmbk; break;
}
NewMI = BuildMI(MF, MI.getDebugLoc(), get(Opc))
@@ -1883,7 +2367,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
unsigned KMaskOp = -1U;
if (X86II::isKMasked(TSFlags)) {
// For k-zero-masked operations it is Ok to commute the first vector
- // operand.
+ // operand. Unless this is an intrinsic instruction.
// For regular k-masked operations a conservative choice is done as the
// elements of the first vector operand, for which the corresponding bit
// in the k-mask operand is set to 0, are copied to the result of the
@@ -1902,7 +2386,7 @@ X86InstrInfo::findThreeSrcCommutedOpIndices(const MachineInstr &MI,
// The operand with index = 1 is used as a source for those elements for
// which the corresponding bit in the k-mask is set to 0.
- if (X86II::isKMergeMasked(TSFlags))
+ if (X86II::isKMergeMasked(TSFlags) || IsIntrinsic)
FirstCommutableVecOp = 3;
LastCommutableVecOp++;
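For orientation, a minimal caller sketch (an illustration under stated assumptions, not code from this patch): generic passes reach the logic above through the TargetInstrInfo commutation hooks, so the merge-masked/intrinsic restriction simply shows up in which operand indices get offered. `TII` and `MI` are assumed to be the usual instruction-info pointer and a candidate MachineInstr.

unsigned Idx1 = TargetInstrInfo::CommuteAnyOperandIndex;
unsigned Idx2 = TargetInstrInfo::CommuteAnyOperandIndex;
if (TII->findCommutedOpIndices(MI, Idx1, Idx2)) {
  // For k-merge-masked (and, after this change, intrinsic) forms the returned
  // indices skip operand 1, whose masked-off elements feed the result.
  TII->commuteInstruction(MI, /*NewMI=*/false, Idx1, Idx2);
}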
@@ -2379,17 +2863,6 @@ unsigned X86::getSwappedVCMPImm(unsigned Imm) {
return Imm;
}
-bool X86InstrInfo::isUnpredicatedTerminator(const MachineInstr &MI) const {
- if (!MI.isTerminator()) return false;
-
- // Conditional branch is a special case.
- if (MI.isBranch() && !MI.isBarrier())
- return true;
- if (!MI.isPredicable())
- return true;
- return !isPredicated(MI);
-}
-
bool X86InstrInfo::isUnconditionalTailCall(const MachineInstr &MI) const {
switch (MI.getOpcode()) {
case X86::TCRETURNdi:
@@ -2826,11 +3299,11 @@ unsigned X86InstrInfo::insertBranch(MachineBasicBlock &MBB,
return Count;
}
-bool X86InstrInfo::
-canInsertSelect(const MachineBasicBlock &MBB,
- ArrayRef<MachineOperand> Cond,
- unsigned TrueReg, unsigned FalseReg,
- int &CondCycles, int &TrueCycles, int &FalseCycles) const {
+bool X86InstrInfo::canInsertSelect(const MachineBasicBlock &MBB,
+ ArrayRef<MachineOperand> Cond,
+ Register DstReg, Register TrueReg,
+ Register FalseReg, int &CondCycles,
+ int &TrueCycles, int &FalseCycles) const {
// Not all subtargets have cmov instructions.
if (!Subtarget.hasCMov())
return false;
@@ -2865,9 +3338,9 @@ canInsertSelect(const MachineBasicBlock &MBB,
void X86InstrInfo::insertSelect(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const {
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const {
MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo();
const TargetRegisterInfo &TRI = *MRI.getTargetRegisterInfo();
const TargetRegisterClass &RC = *MRI.getRegClass(DstReg);
@@ -3189,8 +3662,9 @@ static unsigned getLoadStoreRegOpcode(unsigned Reg,
}
}
-bool X86InstrInfo::getMemOperandWithOffset(
- const MachineInstr &MemOp, const MachineOperand *&BaseOp, int64_t &Offset,
+bool X86InstrInfo::getMemOperandsWithOffsetWidth(
+ const MachineInstr &MemOp, SmallVectorImpl<const MachineOperand *> &BaseOps,
+ int64_t &Offset, bool &OffsetIsScalable, unsigned &Width,
const TargetRegisterInfo *TRI) const {
const MCInstrDesc &Desc = MemOp.getDesc();
int MemRefBegin = X86II::getMemoryOperandNo(Desc.TSFlags);
@@ -3199,7 +3673,8 @@ bool X86InstrInfo::getMemOperandWithOffset(
MemRefBegin += X86II::getOperandBias(Desc);
- BaseOp = &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
+ const MachineOperand *BaseOp =
+ &MemOp.getOperand(MemRefBegin + X86::AddrBaseReg);
if (!BaseOp->isReg()) // Can be an MO_FrameIndex
return false;
@@ -3221,6 +3696,13 @@ bool X86InstrInfo::getMemOperandWithOffset(
if (!BaseOp->isReg())
return false;
+ OffsetIsScalable = false;
+ // FIXME: Relying on memoperands() may not be the right thing to do here.
+ // Check with X86 maintainers, and fix it accordingly. For now, it is OK,
+ // since there is no use of `Width` in the X86 back-end at the moment.
+ Width =
+ !MemOp.memoperands_empty() ? MemOp.memoperands().front()->getSize() : 0;
+ BaseOps.push_back(BaseOp);
return true;
}
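For context, a hedged sketch of how a client (for example, load/store clustering in the machine scheduler) might consume the widened hook; `TII`, `TRI`, and `MemMI` are assumed to come from ordinary pass boilerplate, and the zero-width fallback mirrors the FIXME above.

SmallVector<const MachineOperand *, 1> BaseOps; // llvm/ADT/SmallVector.h
int64_t Offset = 0;
bool OffsetIsScalable = false;
unsigned Width = 0;
if (TII->getMemOperandsWithOffsetWidth(MemMI, BaseOps, Offset, OffsetIsScalable,
                                       Width, TRI)) {
  // On X86 a single base operand is returned, offsets are never scalable, and
  // Width is 0 whenever the instruction carries no memoperand.
  assert(BaseOps.size() == 1 && !OffsetIsScalable);
}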
@@ -3241,7 +3723,7 @@ static unsigned getLoadRegOpcode(unsigned DestReg,
void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIdx,
+ Register SrcReg, bool isKill, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
@@ -3249,7 +3731,7 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
"Stack slot too small for store");
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getStoreRegOpcode(SrcReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc)), FrameIdx)
@@ -3258,20 +3740,20 @@ void X86InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void X86InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIdx,
+ Register DestReg, int FrameIdx,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const {
const MachineFunction &MF = *MBB.getParent();
unsigned Alignment = std::max<uint32_t>(TRI->getSpillSize(*RC), 16);
bool isAligned =
- (Subtarget.getFrameLowering()->getStackAlignment() >= Alignment) ||
+ (Subtarget.getFrameLowering()->getStackAlign() >= Alignment) ||
RI.canRealignStack(MF);
unsigned Opc = getLoadRegOpcode(DestReg, RC, isAligned, Subtarget);
addFrameReference(BuildMI(MBB, MI, DebugLoc(), get(Opc), DestReg), FrameIdx);
}
-bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const {
switch (MI.getOpcode()) {
default: break;
@@ -3358,7 +3840,7 @@ bool X86InstrInfo::analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
/// SrcReg, SrcRegs: register operands for FlagI.
/// ImmValue: immediate for FlagI if it takes an immediate.
inline static bool isRedundantFlagInstr(const MachineInstr &FlagI,
- unsigned SrcReg, unsigned SrcReg2,
+ Register SrcReg, Register SrcReg2,
int ImmMask, int ImmValue,
const MachineInstr &OI) {
if (((FlagI.getOpcode() == X86::CMP64rr && OI.getOpcode() == X86::SUB64rr) ||
@@ -3547,8 +4029,8 @@ static X86::CondCode isUseDefConvertible(const MachineInstr &MI) {
/// Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
-bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask,
+bool X86InstrInfo::optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask,
int CmpValue,
const MachineRegisterInfo *MRI) const {
// Check whether we can replace SUB with CMP.
@@ -3875,15 +4357,15 @@ MachineInstr *X86InstrInfo::optimizeLoadInstr(MachineInstr &MI,
static bool Expand2AddrUndef(MachineInstrBuilder &MIB,
const MCInstrDesc &Desc) {
assert(Desc.getNumOperands() == 3 && "Expected two-addr instruction.");
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
MIB->setDesc(Desc);
// MachineInstr::addOperand() will insert explicit operands before any
// implicit operands.
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef);
// But we don't trust that.
- assert(MIB->getOperand(1).getReg() == Reg &&
- MIB->getOperand(2).getReg() == Reg && "Misplaced operand");
+ assert(MIB.getReg(1) == Reg &&
+ MIB.getReg(2) == Reg && "Misplaced operand");
return true;
}
@@ -3905,7 +4387,7 @@ static bool expandMOV32r1(MachineInstrBuilder &MIB, const TargetInstrInfo &TII,
bool MinusOne) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
// Insert the XOR.
BuildMI(MBB, MIB.getInstr(), DL, TII.get(X86::XOR32rr), Reg)
@@ -3949,13 +4431,15 @@ static bool ExpandMOVImmSExti8(MachineInstrBuilder &MIB,
BuildMI(MBB, I, DL, TII.get(X86::PUSH64i8)).addImm(Imm);
MIB->setDesc(TII.get(X86::POP64r));
MIB->getOperand(0)
- .setReg(getX86SubSuperRegister(MIB->getOperand(0).getReg(), 64));
+ .setReg(getX86SubSuperRegister(MIB.getReg(0), 64));
} else {
assert(MIB->getOpcode() == X86::MOV32ImmSExti8);
StackAdjustment = 4;
BuildMI(MBB, I, DL, TII.get(X86::PUSH32i8)).addImm(Imm);
MIB->setDesc(TII.get(X86::POP32r));
}
+ MIB->RemoveOperand(1);
+ MIB->addImplicitDefUseOperands(*MBB.getParent());
// Build CFI if necessary.
MachineFunction &MF = *MBB.getParent();
@@ -3979,14 +4463,14 @@ static void expandLoadStackGuard(MachineInstrBuilder &MIB,
const TargetInstrInfo &TII) {
MachineBasicBlock &MBB = *MIB->getParent();
DebugLoc DL = MIB->getDebugLoc();
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
const GlobalValue *GV =
cast<GlobalValue>((*MIB->memoperands_begin())->getValue());
auto Flags = MachineMemOperand::MOLoad |
MachineMemOperand::MODereferenceable |
MachineMemOperand::MOInvariant;
MachineMemOperand *MMO = MBB.getParent()->getMachineMemOperand(
- MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, 8);
+ MachinePointerInfo::getGOT(*MBB.getParent()), Flags, 8, Align(8));
MachineBasicBlock::iterator I = MIB.getInstr();
BuildMI(MBB, I, DL, TII.get(X86::MOV64rm), Reg).addReg(X86::RIP).addImm(1)
@@ -4017,7 +4501,7 @@ static bool expandNOVLXLoad(MachineInstrBuilder &MIB,
const MCInstrDesc &LoadDesc,
const MCInstrDesc &BroadcastDesc,
unsigned SubIdx) {
- Register DestReg = MIB->getOperand(0).getReg();
+ Register DestReg = MIB.getReg(0);
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(DestReg) < 16) {
// We can use a normal VEX encoded load.
@@ -4040,7 +4524,7 @@ static bool expandNOVLXStore(MachineInstrBuilder &MIB,
const MCInstrDesc &StoreDesc,
const MCInstrDesc &ExtractDesc,
unsigned SubIdx) {
- Register SrcReg = MIB->getOperand(X86::AddrNumOperands).getReg();
+ Register SrcReg = MIB.getReg(X86::AddrNumOperands);
// Check if DestReg is XMM16-31 or YMM16-31.
if (TRI->getEncodingValue(SrcReg) < 16) {
// We can use a normal VEX encoded store.
@@ -4063,7 +4547,7 @@ static bool expandSHXDROT(MachineInstrBuilder &MIB, const MCInstrDesc &Desc) {
// Temporarily remove the immediate so we can add another source register.
MIB->RemoveOperand(2);
// Add the register. Don't copy the kill flag if there is one.
- MIB.addReg(MIB->getOperand(1).getReg(),
+ MIB.addReg(MIB.getReg(1),
getUndefRegState(MIB->getOperand(1).isUndef()));
// Add back the immediate.
MIB.addImm(ShiftAmt);
@@ -4083,10 +4567,6 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::MOV32ImmSExti8:
case X86::MOV64ImmSExti8:
return ExpandMOVImmSExti8(MIB, *this, Subtarget);
- case X86::SETB_C8r:
- return Expand2AddrUndef(MIB, get(X86::SBB8rr));
- case X86::SETB_C16r:
- return Expand2AddrUndef(MIB, get(X86::SBB16rr));
case X86::SETB_C32r:
return Expand2AddrUndef(MIB, get(X86::SBB32rr));
case X86::SETB_C64r:
@@ -4101,7 +4581,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX_SET0: {
assert(HasAVX && "AVX not supported");
const TargetRegisterInfo *TRI = &getRegisterInfo();
- Register SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB.getReg(0);
Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
MIB->getOperand(0).setReg(XReg);
Expand2AddrUndef(MIB, get(X86::VXORPSrr));
@@ -4113,7 +4593,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX512_FsFLD0SD:
case X86::AVX512_FsFLD0F128: {
bool HasVLX = Subtarget.hasVLX();
- Register SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB.getReg(0);
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16)
return Expand2AddrUndef(MIB,
@@ -4127,7 +4607,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX512_256_SET0:
case X86::AVX512_512_SET0: {
bool HasVLX = Subtarget.hasVLX();
- Register SrcReg = MIB->getOperand(0).getReg();
+ Register SrcReg = MIB.getReg(0);
const TargetRegisterInfo *TRI = &getRegisterInfo();
if (HasVLX || TRI->getEncodingValue(SrcReg) < 16) {
Register XReg = TRI->getSubReg(SrcReg, X86::sub_xmm);
@@ -4150,14 +4630,14 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
case X86::AVX2_SETALLONES:
return Expand2AddrUndef(MIB, get(X86::VPCMPEQDYrr));
case X86::AVX1_SETALLONES: {
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
// VCMPPSYrri with an immediate 0xf should produce VCMPTRUEPS.
MIB->setDesc(get(X86::VCMPPSYrri));
MIB.addReg(Reg, RegState::Undef).addReg(Reg, RegState::Undef).addImm(0xf);
return true;
}
case X86::AVX512_512_SETALLONES: {
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
MIB->setDesc(get(X86::VPTERNLOGDZrri));
// VPTERNLOGD needs 3 register inputs and an immediate.
// 0xff will return 1s for any input.
@@ -4167,8 +4647,8 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
}
case X86::AVX512_512_SEXT_MASK_32:
case X86::AVX512_512_SEXT_MASK_64: {
- Register Reg = MIB->getOperand(0).getReg();
- Register MaskReg = MIB->getOperand(1).getReg();
+ Register Reg = MIB.getReg(0);
+ Register MaskReg = MIB.getReg(1);
unsigned MaskState = getRegState(MIB->getOperand(1));
unsigned Opc = (MI.getOpcode() == X86::AVX512_512_SEXT_MASK_64) ?
X86::VPTERNLOGQZrrikz : X86::VPTERNLOGDZrrikz;
@@ -4205,7 +4685,7 @@ bool X86InstrInfo::expandPostRAPseudo(MachineInstr &MI) const {
return expandNOVLXStore(MIB, &getRegisterInfo(), get(X86::VMOVUPSYmr),
get(X86::VEXTRACTF64x4Zmr), X86::sub_ymm);
case X86::MOV32ri64: {
- Register Reg = MIB->getOperand(0).getReg();
+ Register Reg = MIB.getReg(0);
Register Reg32 = RI.getSubReg(Reg, X86::sub_32bit);
MI.setDesc(get(X86::MOV32ri));
MIB->getOperand(0).setReg(Reg32);
@@ -4358,11 +4838,105 @@ unsigned X86InstrInfo::getPartialRegUpdateClearance(
// Return true for any instruction the copies the high bits of the first source
// operand into the unused high bits of the destination operand.
-static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
+// Also returns true for instructions that have two inputs where one may
+// be undef and we want it to use the same register as the other input.
+static bool hasUndefRegUpdate(unsigned Opcode, unsigned OpNum,
bool ForLoadFold = false) {
// Set the OpNum parameter to the first source operand.
- OpNum = 1;
switch (Opcode) {
+ case X86::MMX_PUNPCKHBWirr:
+ case X86::MMX_PUNPCKHWDirr:
+ case X86::MMX_PUNPCKHDQirr:
+ case X86::MMX_PUNPCKLBWirr:
+ case X86::MMX_PUNPCKLWDirr:
+ case X86::MMX_PUNPCKLDQirr:
+ case X86::MOVHLPSrr:
+ case X86::PACKSSWBrr:
+ case X86::PACKUSWBrr:
+ case X86::PACKSSDWrr:
+ case X86::PACKUSDWrr:
+ case X86::PUNPCKHBWrr:
+ case X86::PUNPCKLBWrr:
+ case X86::PUNPCKHWDrr:
+ case X86::PUNPCKLWDrr:
+ case X86::PUNPCKHDQrr:
+ case X86::PUNPCKLDQrr:
+ case X86::PUNPCKHQDQrr:
+ case X86::PUNPCKLQDQrr:
+ case X86::SHUFPDrri:
+ case X86::SHUFPSrri:
+ // These instructions are sometimes used with an undef first or second
+ // source. Return true here so BreakFalseDeps will assign this source to the
+ // same register as the first source to avoid a false dependency.
+ // Operand 1 of these instructions is tied so they're separate from their
+ // VEX counterparts.
+ return OpNum == 2 && !ForLoadFold;
+
+ case X86::VMOVLHPSrr:
+ case X86::VMOVLHPSZrr:
+ case X86::VPACKSSWBrr:
+ case X86::VPACKUSWBrr:
+ case X86::VPACKSSDWrr:
+ case X86::VPACKUSDWrr:
+ case X86::VPACKSSWBZ128rr:
+ case X86::VPACKUSWBZ128rr:
+ case X86::VPACKSSDWZ128rr:
+ case X86::VPACKUSDWZ128rr:
+ case X86::VPERM2F128rr:
+ case X86::VPERM2I128rr:
+ case X86::VSHUFF32X4Z256rri:
+ case X86::VSHUFF32X4Zrri:
+ case X86::VSHUFF64X2Z256rri:
+ case X86::VSHUFF64X2Zrri:
+ case X86::VSHUFI32X4Z256rri:
+ case X86::VSHUFI32X4Zrri:
+ case X86::VSHUFI64X2Z256rri:
+ case X86::VSHUFI64X2Zrri:
+ case X86::VPUNPCKHBWrr:
+ case X86::VPUNPCKLBWrr:
+ case X86::VPUNPCKHBWYrr:
+ case X86::VPUNPCKLBWYrr:
+ case X86::VPUNPCKHBWZ128rr:
+ case X86::VPUNPCKLBWZ128rr:
+ case X86::VPUNPCKHBWZ256rr:
+ case X86::VPUNPCKLBWZ256rr:
+ case X86::VPUNPCKHBWZrr:
+ case X86::VPUNPCKLBWZrr:
+ case X86::VPUNPCKHWDrr:
+ case X86::VPUNPCKLWDrr:
+ case X86::VPUNPCKHWDYrr:
+ case X86::VPUNPCKLWDYrr:
+ case X86::VPUNPCKHWDZ128rr:
+ case X86::VPUNPCKLWDZ128rr:
+ case X86::VPUNPCKHWDZ256rr:
+ case X86::VPUNPCKLWDZ256rr:
+ case X86::VPUNPCKHWDZrr:
+ case X86::VPUNPCKLWDZrr:
+ case X86::VPUNPCKHDQrr:
+ case X86::VPUNPCKLDQrr:
+ case X86::VPUNPCKHDQYrr:
+ case X86::VPUNPCKLDQYrr:
+ case X86::VPUNPCKHDQZ128rr:
+ case X86::VPUNPCKLDQZ128rr:
+ case X86::VPUNPCKHDQZ256rr:
+ case X86::VPUNPCKLDQZ256rr:
+ case X86::VPUNPCKHDQZrr:
+ case X86::VPUNPCKLDQZrr:
+ case X86::VPUNPCKHQDQrr:
+ case X86::VPUNPCKLQDQrr:
+ case X86::VPUNPCKHQDQYrr:
+ case X86::VPUNPCKLQDQYrr:
+ case X86::VPUNPCKHQDQZ128rr:
+ case X86::VPUNPCKLQDQZ128rr:
+ case X86::VPUNPCKHQDQZ256rr:
+ case X86::VPUNPCKLQDQZ256rr:
+ case X86::VPUNPCKHQDQZrr:
+ case X86::VPUNPCKLQDQZrr:
+ // These instructions are sometimes used with an undef first or second
+ // source. Return true here so BreakFalseDeps will assign this source to the
+ // same register as the other source to avoid a false dependency.
+ return (OpNum == 1 || OpNum == 2) && !ForLoadFold;
+
case X86::VCVTSI2SSrr:
case X86::VCVTSI2SSrm:
case X86::VCVTSI2SSrr_Int:
@@ -4420,7 +4994,7 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VCVTUSI642SDZrm_Int:
// Load folding won't affect the undef register update since the input is
// a GPR.
- return !ForLoadFold;
+ return OpNum == 1 && !ForLoadFold;
case X86::VCVTSD2SSrr:
case X86::VCVTSD2SSrm:
case X86::VCVTSD2SSrr_Int:
@@ -4519,15 +5093,13 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
case X86::VSQRTSDZrb_Int:
case X86::VSQRTSDZm:
case X86::VSQRTSDZm_Int:
- return true;
+ return OpNum == 1;
case X86::VMOVSSZrrk:
case X86::VMOVSDZrrk:
- OpNum = 3;
- return true;
+ return OpNum == 3 && !ForLoadFold;
case X86::VMOVSSZrrkz:
case X86::VMOVSDZrrkz:
- OpNum = 2;
- return true;
+ return OpNum == 2 && !ForLoadFold;
}
return false;
@@ -4550,13 +5122,17 @@ static bool hasUndefRegUpdate(unsigned Opcode, unsigned &OpNum,
unsigned
X86InstrInfo::getUndefRegClearance(const MachineInstr &MI, unsigned &OpNum,
const TargetRegisterInfo *TRI) const {
- if (!hasUndefRegUpdate(MI.getOpcode(), OpNum))
- return 0;
-
- const MachineOperand &MO = MI.getOperand(OpNum);
- if (MO.isUndef() && Register::isPhysicalRegister(MO.getReg())) {
- return UndefRegClearance;
+ for (unsigned i = MI.getNumExplicitDefs(), e = MI.getNumExplicitOperands();
+ i != e; ++i) {
+ const MachineOperand &MO = MI.getOperand(i);
+ if (MO.isReg() && MO.isUndef() &&
+ Register::isPhysicalRegister(MO.getReg()) &&
+ hasUndefRegUpdate(MI.getOpcode(), i)) {
+ OpNum = i;
+ return UndefRegClearance;
+ }
}
+
return 0;
}
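A rough consumer-side sketch (BreakFalseDeps does something along these lines; the bookkeeping is simplified and `clearanceIsTooSmall` is an invented placeholder): the loop above lets the hook report whichever explicit undef physreg use needs its dependency broken.

unsigned OpNum = 0;
if (unsigned Clearance = TII->getUndefRegClearance(MI, OpNum, TRI)) {
  // OpNum now indexes the undef use that would otherwise create a false
  // dependency on the register's previous writer.
  if (clearanceIsTooSmall(MI, OpNum, Clearance)) // placeholder for the real check
    TII->breakPartialRegDependency(MI, OpNum, TRI);
}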
@@ -4727,7 +5303,7 @@ static MachineInstr *MakeM0Inst(const TargetInstrInfo &TII, unsigned Opcode,
MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Align) const {
+ unsigned Size, Align Alignment) const {
switch (MI.getOpcode()) {
case X86::INSERTPSrr:
case X86::VINSERTPSrr:
@@ -4743,7 +5319,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if ((Size == 0 || Size >= 16) && RCSize >= 16 && 4 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(4)) {
int PtrOffset = SrcIdx * 4;
unsigned NewImm = (DstIdx << 4) | ZMask;
unsigned NewOpCode =
@@ -4767,7 +5343,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if ((Size == 0 || Size >= 16) && RCSize >= 16 && 8 <= Align) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment >= Align(8)) {
unsigned NewOpCode =
(MI.getOpcode() == X86::VMOVHLPSZrr) ? X86::VMOVLPSZ128rm :
(MI.getOpcode() == X86::VMOVHLPSrr) ? X86::VMOVLPSrm :
@@ -4786,7 +5362,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
const TargetRegisterClass *RC = getRegClass(MI.getDesc(), OpNum, &RI, MF);
unsigned RCSize = TRI.getRegSizeInBits(*RC) / 8;
- if ((Size == 0 || Size >= 16) && RCSize >= 16 && Align < 16) {
+ if ((Size == 0 || Size >= 16) && RCSize >= 16 && Alignment < Align(16)) {
MachineInstr *NewMI =
FuseInst(MF, X86::MOVHPDrm, OpNum, MOs, InsertPt, MI, *this);
return NewMI;
@@ -4800,8 +5376,7 @@ MachineInstr *X86InstrInfo::foldMemoryOperandCustom(
static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
MachineInstr &MI) {
- unsigned Ignored;
- if (!hasUndefRegUpdate(MI.getOpcode(), Ignored, /*ForLoadFold*/true) ||
+ if (!hasUndefRegUpdate(MI.getOpcode(), 1, /*ForLoadFold*/true) ||
!MI.getOperand(1).isReg())
return false;
@@ -4818,11 +5393,10 @@ static bool shouldPreventUndefRegUpdateMemFold(MachineFunction &MF,
return VRegDef && VRegDef->isImplicitDef();
}
-
MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineFunction &MF, MachineInstr &MI, unsigned OpNum,
ArrayRef<MachineOperand> MOs, MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Align, bool AllowCommute) const {
+ unsigned Size, Align Alignment, bool AllowCommute) const {
bool isSlowTwoMemOps = Subtarget.slowTwoMemOps();
bool isTwoAddrFold = false;
@@ -4862,8 +5436,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
MachineInstr *NewMI = nullptr;
// Attempt to fold any custom cases we have.
- if (MachineInstr *CustomMI =
- foldMemoryOperandCustom(MF, MI, OpNum, MOs, InsertPt, Size, Align))
+ if (MachineInstr *CustomMI = foldMemoryOperandCustom(
+ MF, MI, OpNum, MOs, InsertPt, Size, Alignment))
return CustomMI;
const X86MemoryFoldTableEntry *I = nullptr;
@@ -4890,9 +5464,9 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
if (I != nullptr) {
unsigned Opcode = I->DstOp;
- unsigned MinAlign = (I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT;
- MinAlign = MinAlign ? 1 << (MinAlign - 1) : 0;
- if (Align < MinAlign)
+ MaybeAlign MinAlign =
+ decodeMaybeAlign((I->Flags & TB_ALIGN_MASK) >> TB_ALIGN_SHIFT);
+ if (MinAlign && Alignment < *MinAlign)
return nullptr;
bool NarrowToMOV32rm = false;
if (Size) {
@@ -4967,8 +5541,8 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
}
// Attempt to fold with the commuted version of the instruction.
- NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt,
- Size, Align, /*AllowCommute=*/false);
+ NewMI = foldMemoryOperandImpl(MF, MI, CommuteOpIdx2, MOs, InsertPt, Size,
+ Alignment, /*AllowCommute=*/false);
if (NewMI)
return NewMI;
@@ -5022,12 +5596,12 @@ X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, MachineInstr &MI,
const MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = MFI.getObjectSize(FrameIndex);
- unsigned Alignment = MFI.getObjectAlignment(FrameIndex);
+ Align Alignment = MFI.getObjectAlign(FrameIndex);
// If the function stack isn't realigned we don't want to fold instructions
// that need increased alignment.
if (!RI.needsStackRealignment(MF))
Alignment =
- std::min(Alignment, Subtarget.getFrameLowering()->getStackAlignment());
+ std::min(Alignment, Subtarget.getFrameLowering()->getStackAlign());
if (Ops.size() == 2 && Ops[0] == 0 && Ops[1] == 1) {
unsigned NewOpc = 0;
unsigned RCSize = 0;
@@ -5085,12 +5659,31 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// destination register is wider than 32 bits (4 bytes), and its user
// instruction isn't scalar (SS).
switch (UserOpc) {
+ case X86::CVTSS2SDrr_Int:
+ case X86::VCVTSS2SDrr_Int:
+ case X86::VCVTSS2SDZrr_Int:
+ case X86::VCVTSS2SDZrr_Intk:
+ case X86::VCVTSS2SDZrr_Intkz:
+ case X86::CVTSS2SIrr_Int: case X86::CVTSS2SI64rr_Int:
+ case X86::VCVTSS2SIrr_Int: case X86::VCVTSS2SI64rr_Int:
+ case X86::VCVTSS2SIZrr_Int: case X86::VCVTSS2SI64Zrr_Int:
+ case X86::CVTTSS2SIrr_Int: case X86::CVTTSS2SI64rr_Int:
+ case X86::VCVTTSS2SIrr_Int: case X86::VCVTTSS2SI64rr_Int:
+ case X86::VCVTTSS2SIZrr_Int: case X86::VCVTTSS2SI64Zrr_Int:
+ case X86::VCVTSS2USIZrr_Int: case X86::VCVTSS2USI64Zrr_Int:
+ case X86::VCVTTSS2USIZrr_Int: case X86::VCVTTSS2USI64Zrr_Int:
+ case X86::RCPSSr_Int: case X86::VRCPSSr_Int:
+ case X86::RSQRTSSr_Int: case X86::VRSQRTSSr_Int:
+ case X86::ROUNDSSr_Int: case X86::VROUNDSSr_Int:
+ case X86::COMISSrr_Int: case X86::VCOMISSrr_Int: case X86::VCOMISSZrr_Int:
+ case X86::UCOMISSrr_Int:case X86::VUCOMISSrr_Int:case X86::VUCOMISSZrr_Int:
case X86::ADDSSrr_Int: case X86::VADDSSrr_Int: case X86::VADDSSZrr_Int:
case X86::CMPSSrr_Int: case X86::VCMPSSrr_Int: case X86::VCMPSSZrr_Int:
case X86::DIVSSrr_Int: case X86::VDIVSSrr_Int: case X86::VDIVSSZrr_Int:
case X86::MAXSSrr_Int: case X86::VMAXSSrr_Int: case X86::VMAXSSZrr_Int:
case X86::MINSSrr_Int: case X86::VMINSSrr_Int: case X86::VMINSSZrr_Int:
case X86::MULSSrr_Int: case X86::VMULSSrr_Int: case X86::VMULSSZrr_Int:
+ case X86::SQRTSSr_Int: case X86::VSQRTSSr_Int: case X86::VSQRTSSZr_Int:
case X86::SUBSSrr_Int: case X86::VSUBSSrr_Int: case X86::VSUBSSZrr_Int:
case X86::VADDSSZrr_Intk: case X86::VADDSSZrr_Intkz:
case X86::VCMPSSZrr_Intk:
@@ -5098,6 +5691,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VMAXSSZrr_Intk: case X86::VMAXSSZrr_Intkz:
case X86::VMINSSZrr_Intk: case X86::VMINSSZrr_Intkz:
case X86::VMULSSZrr_Intk: case X86::VMULSSZrr_Intkz:
+ case X86::VSQRTSSZr_Intk: case X86::VSQRTSSZr_Intkz:
case X86::VSUBSSZrr_Intk: case X86::VSUBSSZrr_Intkz:
case X86::VFMADDSS4rr_Int: case X86::VFNMADDSS4rr_Int:
case X86::VFMSUBSS4rr_Int: case X86::VFNMSUBSS4rr_Int:
@@ -5125,6 +5719,41 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SSZr_Intkz: case X86::VFNMSUB132SSZr_Intkz:
case X86::VFMSUB213SSZr_Intkz: case X86::VFNMSUB213SSZr_Intkz:
case X86::VFMSUB231SSZr_Intkz: case X86::VFNMSUB231SSZr_Intkz:
+ case X86::VFIXUPIMMSSZrri:
+ case X86::VFIXUPIMMSSZrrik:
+ case X86::VFIXUPIMMSSZrrikz:
+ case X86::VFPCLASSSSZrr:
+ case X86::VFPCLASSSSZrrk:
+ case X86::VGETEXPSSZr:
+ case X86::VGETEXPSSZrk:
+ case X86::VGETEXPSSZrkz:
+ case X86::VGETMANTSSZrri:
+ case X86::VGETMANTSSZrrik:
+ case X86::VGETMANTSSZrrikz:
+ case X86::VRANGESSZrri:
+ case X86::VRANGESSZrrik:
+ case X86::VRANGESSZrrikz:
+ case X86::VRCP14SSZrr:
+ case X86::VRCP14SSZrrk:
+ case X86::VRCP14SSZrrkz:
+ case X86::VRCP28SSZr:
+ case X86::VRCP28SSZrk:
+ case X86::VRCP28SSZrkz:
+ case X86::VREDUCESSZrri:
+ case X86::VREDUCESSZrrik:
+ case X86::VREDUCESSZrrikz:
+ case X86::VRNDSCALESSZr_Int:
+ case X86::VRNDSCALESSZr_Intk:
+ case X86::VRNDSCALESSZr_Intkz:
+ case X86::VRSQRT14SSZrr:
+ case X86::VRSQRT14SSZrrk:
+ case X86::VRSQRT14SSZrrkz:
+ case X86::VRSQRT28SSZr:
+ case X86::VRSQRT28SSZrk:
+ case X86::VRSQRT28SSZrkz:
+ case X86::VSCALEFSSZrr:
+ case X86::VSCALEFSSZrrk:
+ case X86::VSCALEFSSZrrkz:
return false;
default:
return true;
@@ -5139,12 +5768,29 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
// destination register is wider than 64 bits (8 bytes), and its user
// instruction isn't scalar (SD).
switch (UserOpc) {
+ case X86::CVTSD2SSrr_Int:
+ case X86::VCVTSD2SSrr_Int:
+ case X86::VCVTSD2SSZrr_Int:
+ case X86::VCVTSD2SSZrr_Intk:
+ case X86::VCVTSD2SSZrr_Intkz:
+ case X86::CVTSD2SIrr_Int: case X86::CVTSD2SI64rr_Int:
+ case X86::VCVTSD2SIrr_Int: case X86::VCVTSD2SI64rr_Int:
+ case X86::VCVTSD2SIZrr_Int: case X86::VCVTSD2SI64Zrr_Int:
+ case X86::CVTTSD2SIrr_Int: case X86::CVTTSD2SI64rr_Int:
+ case X86::VCVTTSD2SIrr_Int: case X86::VCVTTSD2SI64rr_Int:
+ case X86::VCVTTSD2SIZrr_Int: case X86::VCVTTSD2SI64Zrr_Int:
+ case X86::VCVTSD2USIZrr_Int: case X86::VCVTSD2USI64Zrr_Int:
+ case X86::VCVTTSD2USIZrr_Int: case X86::VCVTTSD2USI64Zrr_Int:
+ case X86::ROUNDSDr_Int: case X86::VROUNDSDr_Int:
+ case X86::COMISDrr_Int: case X86::VCOMISDrr_Int: case X86::VCOMISDZrr_Int:
+ case X86::UCOMISDrr_Int:case X86::VUCOMISDrr_Int:case X86::VUCOMISDZrr_Int:
case X86::ADDSDrr_Int: case X86::VADDSDrr_Int: case X86::VADDSDZrr_Int:
case X86::CMPSDrr_Int: case X86::VCMPSDrr_Int: case X86::VCMPSDZrr_Int:
case X86::DIVSDrr_Int: case X86::VDIVSDrr_Int: case X86::VDIVSDZrr_Int:
case X86::MAXSDrr_Int: case X86::VMAXSDrr_Int: case X86::VMAXSDZrr_Int:
case X86::MINSDrr_Int: case X86::VMINSDrr_Int: case X86::VMINSDZrr_Int:
case X86::MULSDrr_Int: case X86::VMULSDrr_Int: case X86::VMULSDZrr_Int:
+ case X86::SQRTSDr_Int: case X86::VSQRTSDr_Int: case X86::VSQRTSDZr_Int:
case X86::SUBSDrr_Int: case X86::VSUBSDrr_Int: case X86::VSUBSDZrr_Int:
case X86::VADDSDZrr_Intk: case X86::VADDSDZrr_Intkz:
case X86::VCMPSDZrr_Intk:
@@ -5152,6 +5798,7 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VMAXSDZrr_Intk: case X86::VMAXSDZrr_Intkz:
case X86::VMINSDZrr_Intk: case X86::VMINSDZrr_Intkz:
case X86::VMULSDZrr_Intk: case X86::VMULSDZrr_Intkz:
+ case X86::VSQRTSDZr_Intk: case X86::VSQRTSDZr_Intkz:
case X86::VSUBSDZrr_Intk: case X86::VSUBSDZrr_Intkz:
case X86::VFMADDSD4rr_Int: case X86::VFNMADDSD4rr_Int:
case X86::VFMSUBSD4rr_Int: case X86::VFNMSUBSD4rr_Int:
@@ -5179,6 +5826,41 @@ static bool isNonFoldablePartialRegisterLoad(const MachineInstr &LoadMI,
case X86::VFMSUB132SDZr_Intkz: case X86::VFNMSUB132SDZr_Intkz:
case X86::VFMSUB213SDZr_Intkz: case X86::VFNMSUB213SDZr_Intkz:
case X86::VFMSUB231SDZr_Intkz: case X86::VFNMSUB231SDZr_Intkz:
+ case X86::VFIXUPIMMSDZrri:
+ case X86::VFIXUPIMMSDZrrik:
+ case X86::VFIXUPIMMSDZrrikz:
+ case X86::VFPCLASSSDZrr:
+ case X86::VFPCLASSSDZrrk:
+ case X86::VGETEXPSDZr:
+ case X86::VGETEXPSDZrk:
+ case X86::VGETEXPSDZrkz:
+ case X86::VGETMANTSDZrri:
+ case X86::VGETMANTSDZrrik:
+ case X86::VGETMANTSDZrrikz:
+ case X86::VRANGESDZrri:
+ case X86::VRANGESDZrrik:
+ case X86::VRANGESDZrrikz:
+ case X86::VRCP14SDZrr:
+ case X86::VRCP14SDZrrk:
+ case X86::VRCP14SDZrrkz:
+ case X86::VRCP28SDZr:
+ case X86::VRCP28SDZrk:
+ case X86::VRCP28SDZrkz:
+ case X86::VREDUCESDZrri:
+ case X86::VREDUCESDZrrik:
+ case X86::VREDUCESDZrrikz:
+ case X86::VRNDSCALESDZr_Int:
+ case X86::VRNDSCALESDZr_Intk:
+ case X86::VRNDSCALESDZr_Intkz:
+ case X86::VRSQRT14SDZrr:
+ case X86::VRSQRT14SDZrrk:
+ case X86::VRSQRT14SDZrrkz:
+ case X86::VRSQRT28SDZr:
+ case X86::VRSQRT28SDZrk:
+ case X86::VRSQRT28SDZrkz:
+ case X86::VSCALEFSDZrr:
+ case X86::VSCALEFSDZrrk:
+ case X86::VSCALEFSDZrrkz:
return false;
default:
return true;
@@ -5219,36 +5901,36 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
return nullptr;
// Determine the alignment of the load.
- unsigned Alignment = 0;
+ Align Alignment;
if (LoadMI.hasOneMemOperand())
- Alignment = (*LoadMI.memoperands_begin())->getAlignment();
+ Alignment = (*LoadMI.memoperands_begin())->getAlign();
else
switch (LoadMI.getOpcode()) {
case X86::AVX512_512_SET0:
case X86::AVX512_512_SETALLONES:
- Alignment = 64;
+ Alignment = Align(64);
break;
case X86::AVX2_SETALLONES:
case X86::AVX1_SETALLONES:
case X86::AVX_SET0:
case X86::AVX512_256_SET0:
- Alignment = 32;
+ Alignment = Align(32);
break;
case X86::V_SET0:
case X86::V_SETALLONES:
case X86::AVX512_128_SET0:
case X86::FsFLD0F128:
case X86::AVX512_FsFLD0F128:
- Alignment = 16;
+ Alignment = Align(16);
break;
case X86::MMX_SET0:
case X86::FsFLD0SD:
case X86::AVX512_FsFLD0SD:
- Alignment = 8;
+ Alignment = Align(8);
break;
case X86::FsFLD0SS:
case X86::AVX512_FsFLD0SS:
- Alignment = 4;
+ Alignment = Align(4);
break;
default:
return nullptr;
@@ -5323,14 +6005,18 @@ MachineInstr *X86InstrInfo::foldMemoryOperandImpl(
else if (Opc == X86::FsFLD0F128 || Opc == X86::AVX512_FsFLD0F128)
Ty = Type::getFP128Ty(MF.getFunction().getContext());
else if (Opc == X86::AVX512_512_SET0 || Opc == X86::AVX512_512_SETALLONES)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),16);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 16);
else if (Opc == X86::AVX2_SETALLONES || Opc == X86::AVX_SET0 ||
Opc == X86::AVX512_256_SET0 || Opc == X86::AVX1_SETALLONES)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 8);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 8);
else if (Opc == X86::MMX_SET0)
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 2);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 2);
else
- Ty = VectorType::get(Type::getInt32Ty(MF.getFunction().getContext()), 4);
+ Ty = FixedVectorType::get(Type::getInt32Ty(MF.getFunction().getContext()),
+ 4);
bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX2_SETALLONES ||
Opc == X86::AVX512_512_SETALLONES ||
@@ -5416,33 +6102,33 @@ static unsigned getBroadcastOpcode(const X86MemoryFoldTableEntry *I,
case TB_BCAST_D:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
- case 16: return X86::VPBROADCASTDZ128m;
- case 32: return X86::VPBROADCASTDZ256m;
- case 64: return X86::VPBROADCASTDZm;
+ case 16: return X86::VPBROADCASTDZ128rm;
+ case 32: return X86::VPBROADCASTDZ256rm;
+ case 64: return X86::VPBROADCASTDZrm;
}
break;
case TB_BCAST_Q:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
- case 16: return X86::VPBROADCASTQZ128m;
- case 32: return X86::VPBROADCASTQZ256m;
- case 64: return X86::VPBROADCASTQZm;
+ case 16: return X86::VPBROADCASTQZ128rm;
+ case 32: return X86::VPBROADCASTQZ256rm;
+ case 64: return X86::VPBROADCASTQZrm;
}
break;
case TB_BCAST_SS:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
- case 16: return X86::VBROADCASTSSZ128m;
- case 32: return X86::VBROADCASTSSZ256m;
- case 64: return X86::VBROADCASTSSZm;
+ case 16: return X86::VBROADCASTSSZ128rm;
+ case 32: return X86::VBROADCASTSSZ256rm;
+ case 64: return X86::VBROADCASTSSZrm;
}
break;
case TB_BCAST_SD:
switch (SpillSize) {
default: llvm_unreachable("Unknown spill size");
case 16: return X86::VMOVDDUPZ128rm;
- case 32: return X86::VBROADCASTSDZ256m;
- case 64: return X86::VBROADCASTSDZm;
+ case 32: return X86::VBROADCASTSDZ256rm;
+ case 64: return X86::VBROADCASTSDZrm;
}
break;
}
@@ -5502,7 +6188,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
Opc = getLoadRegOpcode(Reg, RC, isAligned, Subtarget);
}
@@ -5579,7 +6265,7 @@ bool X86InstrInfo::unfoldMemoryOperand(
const TargetRegisterClass *DstRC = getRegClass(MCID, 0, &RI, MF);
auto MMOs = extractStoreMMOs(MI.memoperands(), MF);
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*DstRC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
unsigned Opc = getStoreRegOpcode(Reg, DstRC, isAligned, Subtarget);
DebugLoc DL;
MachineInstrBuilder MIB = BuildMI(MF, DL, get(Opc));
@@ -5646,7 +6332,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
Opc = getBroadcastOpcode(I, RC, Subtarget);
} else {
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
Opc = getLoadRegOpcode(0, RC, isAligned, Subtarget);
}
@@ -5712,7 +6398,7 @@ X86InstrInfo::unfoldMemoryOperand(SelectionDAG &DAG, SDNode *N,
// FIXME: If a VR128 can have size 32, we should be checking if a 32-byte
// memory access is slow above.
unsigned Alignment = std::max<uint32_t>(TRI.getSpillSize(*RC), 16);
- bool isAligned = !MMOs.empty() && MMOs.front()->getAlignment() >= Alignment;
+ bool isAligned = !MMOs.empty() && MMOs.front()->getAlign() >= Alignment;
SDNode *Store =
DAG.getMachineNode(getStoreRegOpcode(0, DstRC, isAligned, Subtarget),
dl, MVT::Other, AddrOps);
@@ -6122,18 +6808,18 @@ static const uint16_t ReplaceableInstrs[][3] = {
{ X86::VMOVSDZrm_alt, X86::VMOVSDZrm_alt, X86::VMOVQI2PQIZrm },
{ X86::VMOVSSZrm, X86::VMOVSSZrm, X86::VMOVDI2PDIZrm },
{ X86::VMOVSSZrm_alt, X86::VMOVSSZrm_alt, X86::VMOVDI2PDIZrm },
- { X86::VBROADCASTSSZ128r, X86::VBROADCASTSSZ128r, X86::VPBROADCASTDZ128r },
- { X86::VBROADCASTSSZ128m, X86::VBROADCASTSSZ128m, X86::VPBROADCASTDZ128m },
- { X86::VBROADCASTSSZ256r, X86::VBROADCASTSSZ256r, X86::VPBROADCASTDZ256r },
- { X86::VBROADCASTSSZ256m, X86::VBROADCASTSSZ256m, X86::VPBROADCASTDZ256m },
- { X86::VBROADCASTSSZr, X86::VBROADCASTSSZr, X86::VPBROADCASTDZr },
- { X86::VBROADCASTSSZm, X86::VBROADCASTSSZm, X86::VPBROADCASTDZm },
- { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128r },
- { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128m },
- { X86::VBROADCASTSDZ256r, X86::VBROADCASTSDZ256r, X86::VPBROADCASTQZ256r },
- { X86::VBROADCASTSDZ256m, X86::VBROADCASTSDZ256m, X86::VPBROADCASTQZ256m },
- { X86::VBROADCASTSDZr, X86::VBROADCASTSDZr, X86::VPBROADCASTQZr },
- { X86::VBROADCASTSDZm, X86::VBROADCASTSDZm, X86::VPBROADCASTQZm },
+ { X86::VBROADCASTSSZ128rr,X86::VBROADCASTSSZ128rr,X86::VPBROADCASTDZ128rr },
+ { X86::VBROADCASTSSZ128rm,X86::VBROADCASTSSZ128rm,X86::VPBROADCASTDZ128rm },
+ { X86::VBROADCASTSSZ256rr,X86::VBROADCASTSSZ256rr,X86::VPBROADCASTDZ256rr },
+ { X86::VBROADCASTSSZ256rm,X86::VBROADCASTSSZ256rm,X86::VPBROADCASTDZ256rm },
+ { X86::VBROADCASTSSZrr, X86::VBROADCASTSSZrr, X86::VPBROADCASTDZrr },
+ { X86::VBROADCASTSSZrm, X86::VBROADCASTSSZrm, X86::VPBROADCASTDZrm },
+ { X86::VMOVDDUPZ128rr, X86::VMOVDDUPZ128rr, X86::VPBROADCASTQZ128rr },
+ { X86::VMOVDDUPZ128rm, X86::VMOVDDUPZ128rm, X86::VPBROADCASTQZ128rm },
+ { X86::VBROADCASTSDZ256rr,X86::VBROADCASTSDZ256rr,X86::VPBROADCASTQZ256rr },
+ { X86::VBROADCASTSDZ256rm,X86::VBROADCASTSDZ256rm,X86::VPBROADCASTQZ256rm },
+ { X86::VBROADCASTSDZrr, X86::VBROADCASTSDZrr, X86::VPBROADCASTQZrr },
+ { X86::VBROADCASTSDZrm, X86::VBROADCASTSDZrm, X86::VPBROADCASTQZrm },
{ X86::VINSERTF32x4Zrr, X86::VINSERTF32x4Zrr, X86::VINSERTI32x4Zrr },
{ X86::VINSERTF32x4Zrm, X86::VINSERTF32x4Zrm, X86::VINSERTI32x4Zrm },
{ X86::VINSERTF32x8Zrr, X86::VINSERTF32x8Zrr, X86::VINSERTI32x8Zrr },
@@ -6893,7 +7579,7 @@ void X86InstrInfo::setExecutionDomain(MachineInstr &MI, unsigned Domain) const {
assert((Subtarget.hasDQI() || Domain >= 3) && "Requires AVX-512DQ");
table = lookupAVX512(MI.getOpcode(), dom, ReplaceableInstrsAVX512DQ);
// Don't change integer Q instructions to D instructions and
- // use D intructions if we started with a PS instruction.
+ // use D instructions if we started with a PS instruction.
if (table && Domain == 3 && (dom == 1 || table[3] == MI.getOpcode()))
Domain = 4;
}
@@ -7550,7 +8236,8 @@ bool X86InstrInfo::isAssociativeAndCommutative(const MachineInstr &Inst) const {
case X86::VMULSSrr:
case X86::VMULSDZrr:
case X86::VMULSSZrr:
- return Inst.getParent()->getParent()->getTarget().Options.UnsafeFPMath;
+ return Inst.getFlag(MachineInstr::MIFlag::FmReassoc) &&
+ Inst.getFlag(MachineInstr::MIFlag::FmNsz);
default:
return false;
}
@@ -7677,6 +8364,10 @@ X86InstrInfo::describeLoadedValue(const MachineInstr &MI, Register Reg) const {
return ParamLoadedValue(*Op, Expr);
}
+ case X86::MOV8ri:
+ case X86::MOV16ri:
+ // TODO: Handle MOV8ri and MOV16ri.
+ return None;
case X86::MOV32ri:
case X86::MOV64ri:
case X86::MOV64ri32:
@@ -7736,6 +8427,20 @@ void X86InstrInfo::setSpecialOperandAttr(MachineInstr &OldMI1,
MachineInstr &OldMI2,
MachineInstr &NewMI1,
MachineInstr &NewMI2) const {
+ // Propagate FP flags from the original instructions.
+ // But clear poison-generating flags because those may not be valid now.
+ // TODO: There should be a helper function for copying only fast-math-flags.
+ uint16_t IntersectedFlags = OldMI1.getFlags() & OldMI2.getFlags();
+ NewMI1.setFlags(IntersectedFlags);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI1.clearFlag(MachineInstr::MIFlag::IsExact);
+
+ NewMI2.setFlags(IntersectedFlags);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoSWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::NoUWrap);
+ NewMI2.clearFlag(MachineInstr::MIFlag::IsExact);
+
// Integer instructions may define an implicit EFLAGS dest register operand.
MachineOperand *OldFlagDef1 = OldMI1.findRegisterDefOperand(X86::EFLAGS);
MachineOperand *OldFlagDef2 = OldMI2.findRegisterDefOperand(X86::EFLAGS);
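The TODO above asks for a helper that copies only the fast-math flags; a hedged sketch of what such a helper could look like follows (the name `copyFastMathMIFlags` is invented here and is not part of the MachineInstr API).

static void copyFastMathMIFlags(const MachineInstr &From, MachineInstr &To) {
  const unsigned FMF =
      MachineInstr::MIFlag::FmNoNans | MachineInstr::MIFlag::FmNoInfs |
      MachineInstr::MIFlag::FmNsz | MachineInstr::MIFlag::FmArcp |
      MachineInstr::MIFlag::FmContract | MachineInstr::MIFlag::FmAfn |
      MachineInstr::MIFlag::FmReassoc;
  // Transfer only the FP flags; wrap/exact flags are left untouched.
  To.setFlags(To.getFlags() | (From.getFlags() & FMF));
}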
@@ -7955,8 +8660,7 @@ namespace {
}
// Visit the children of this block in the dominator tree.
- for (MachineDomTreeNode::iterator I = Node->begin(), E = Node->end();
- I != E; ++I) {
+ for (auto I = Node->begin(), E = Node->end(); I != E; ++I) {
Changed |= VisitNode(*I, TLSBaseAddrReg);
}
@@ -8071,6 +8775,35 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
return Sum + 1;
});
+ // We check to see if CFI instructions are present, and if they are,
+ // we find the number of CFI instructions in the candidate sequence.
+ unsigned CFICount = 0;
+ MachineBasicBlock::iterator MBBI = RepeatedSequenceLocs[0].front();
+ for (unsigned Loc = RepeatedSequenceLocs[0].getStartIdx();
+ Loc < RepeatedSequenceLocs[0].getEndIdx() + 1; Loc++) {
+ const std::vector<MCCFIInstruction> &CFIInstructions =
+ RepeatedSequenceLocs[0].getMF()->getFrameInstructions();
+ if (MBBI->isCFIInstruction()) {
+ unsigned CFIIndex = MBBI->getOperand(0).getCFIIndex();
+ MCCFIInstruction CFI = CFIInstructions[CFIIndex];
+ CFICount++;
+ }
+ MBBI++;
+ }
+
+ // We compare the number of found CFI Instructions to the number of CFI
+ // instructions in the parent function for each candidate. We must check this
+ // since if we outline one of the CFI instructions in a function, we have to
+ // outline them all for correctness. If we do not, the address offsets will be
+ // incorrect between the two sections of the program.
+ for (outliner::Candidate &C : RepeatedSequenceLocs) {
+ std::vector<MCCFIInstruction> CFIInstructions =
+ C.getMF()->getFrameInstructions();
+
+ if (CFICount > 0 && CFICount != CFIInstructions.size())
+ return outliner::OutlinedFunction();
+ }
+
// FIXME: Use real size in bytes for call and ret instructions.
if (RepeatedSequenceLocs[0].back()->isTerminator()) {
for (outliner::Candidate &C : RepeatedSequenceLocs)
@@ -8082,6 +8815,9 @@ outliner::OutlinedFunction X86InstrInfo::getOutliningCandidateInfo(
);
}
+ if (CFICount > 0)
+ return outliner::OutlinedFunction();
+
for (outliner::Candidate &C : RepeatedSequenceLocs)
C.setCallInfo(MachineOutlinerDefault, 1);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 1d2da5305357..89f2ff118c37 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -24,8 +24,6 @@
#include "X86GenInstrInfo.inc"
namespace llvm {
-class MachineInstrBuilder;
-class X86RegisterInfo;
class X86Subtarget;
namespace X86 {
@@ -180,8 +178,37 @@ public:
/// true, then it's expected the pre-extension value is available as a subreg
/// of the result register. This also returns the sub-register index in
/// SubIdx.
- bool isCoalescableExtInstr(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &DstReg, unsigned &SubIdx) const override;
+ bool isCoalescableExtInstr(const MachineInstr &MI, Register &SrcReg,
+ Register &DstReg, unsigned &SubIdx) const override;
+
+ /// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value of any of its register operands
+ ///
+ /// Instructions are considered data invariant even if they set EFLAGS.
+ ///
+ /// A classical example of something that is inherently not data invariant is
+ /// an indirect jump -- the destination is loaded into icache based on the
+ /// bits set in the jump destination register.
+ ///
+ /// FIXME: This should become part of our instruction tables.
+ static bool isDataInvariant(MachineInstr &MI);
+
+ /// Returns true if the instruction has no behavior (specified or otherwise)
+ /// that is based on the value loaded from memory or the value of any
+ /// non-address register operands.
+ ///
+ /// For example, an instruction whose latency depends on the particular bits
+ /// set in any of its registers *or* on any of the bits loaded from memory is
+ /// not data invariant.
+ ///
+ /// Instructions are considered data invariant even if they set EFLAGS.
+ ///
+ /// A classical example of something that is inherently not data invariant is
+ /// an indirect jump -- the destination is loaded into icache based on the
+ /// bits set in the jump destination register.
+ ///
+ /// FIXME: This should become part of our instruction tables.
+ static bool isDataInvariantLoad(MachineInstr &MI);
unsigned isLoadFromStackSlot(const MachineInstr &MI,
int &FrameIndex) const override;
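A hedged usage sketch for the two new predicates (not code from this patch; `mayNeedHardening` is an invented name): a mitigation pass such as speculative load hardening can consult them to skip instructions whose timing and behavior cannot depend on the speculatively loaded values.

static bool mayNeedHardening(MachineInstr &MI) {
  if (MI.mayLoad())
    return !X86InstrInfo::isDataInvariantLoad(MI);
  return !X86InstrInfo::isDataInvariant(MI);
}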
@@ -208,7 +235,7 @@ public:
bool isReallyTriviallyReMaterializable(const MachineInstr &MI,
AAResults *AA) const override;
void reMaterialize(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- unsigned DestReg, unsigned SubIdx,
+ Register DestReg, unsigned SubIdx,
const MachineInstr &Orig,
const TargetRegisterInfo &TRI) const override;
@@ -278,7 +305,6 @@ public:
const X86InstrFMA3Group &FMA3Group) const;
// Branch analysis.
- bool isUnpredicatedTerminator(const MachineInstr &MI) const override;
bool isUnconditionalTailCall(const MachineInstr &MI) const override;
bool canMakeTailCallConditional(SmallVectorImpl<MachineOperand> &Cond,
const MachineInstr &TailCall) const override;
@@ -291,10 +317,11 @@ public:
SmallVectorImpl<MachineOperand> &Cond,
bool AllowModify) const override;
- bool getMemOperandWithOffset(const MachineInstr &LdSt,
- const MachineOperand *&BaseOp,
- int64_t &Offset,
- const TargetRegisterInfo *TRI) const override;
+ bool getMemOperandsWithOffsetWidth(
+ const MachineInstr &LdSt,
+ SmallVectorImpl<const MachineOperand *> &BaseOps, int64_t &Offset,
+ bool &OffsetIsScalable, unsigned &Width,
+ const TargetRegisterInfo *TRI) const override;
bool analyzeBranchPredicate(MachineBasicBlock &MBB,
TargetInstrInfo::MachineBranchPredicate &MBP,
bool AllowModify = false) const override;
@@ -306,22 +333,23 @@ public:
const DebugLoc &DL,
int *BytesAdded = nullptr) const override;
bool canInsertSelect(const MachineBasicBlock &, ArrayRef<MachineOperand> Cond,
- unsigned, unsigned, int &, int &, int &) const override;
+ Register, Register, Register, int &, int &,
+ int &) const override;
void insertSelect(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
- const DebugLoc &DL, unsigned DstReg,
- ArrayRef<MachineOperand> Cond, unsigned TrueReg,
- unsigned FalseReg) const override;
+ const DebugLoc &DL, Register DstReg,
+ ArrayRef<MachineOperand> Cond, Register TrueReg,
+ Register FalseReg) const override;
void copyPhysReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg,
bool KillSrc) const override;
void storeRegToStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned SrcReg,
+ MachineBasicBlock::iterator MI, Register SrcReg,
bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI, unsigned DestReg,
+ MachineBasicBlock::iterator MI, Register DestReg,
int FrameIndex, const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
@@ -443,7 +471,7 @@ public:
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Alignment,
+ unsigned Size, Align Alignment,
bool AllowCommute) const;
bool isHighLatencyDef(int opc) const override;
@@ -469,15 +497,15 @@ public:
/// in SrcReg and SrcReg2 if having two register operands, and the value it
/// compares against in CmpValue. Return true if the comparison instruction
/// can be analyzed.
- bool analyzeCompare(const MachineInstr &MI, unsigned &SrcReg,
- unsigned &SrcReg2, int &CmpMask,
+ bool analyzeCompare(const MachineInstr &MI, Register &SrcReg,
+ Register &SrcReg2, int &CmpMask,
int &CmpValue) const override;
/// optimizeCompareInstr - Check if there exists an earlier instruction that
/// operates on the same source operands and sets flags in the same way as
/// Compare; remove Compare if possible.
- bool optimizeCompareInstr(MachineInstr &CmpInstr, unsigned SrcReg,
- unsigned SrcReg2, int CmpMask, int CmpValue,
+ bool optimizeCompareInstr(MachineInstr &CmpInstr, Register SrcReg,
+ Register SrcReg2, int CmpMask, int CmpValue,
const MachineRegisterInfo *MRI) const override;
/// optimizeLoadInstr - Try to remove the load by folding it to a register
@@ -563,7 +591,7 @@ private:
unsigned OpNum,
ArrayRef<MachineOperand> MOs,
MachineBasicBlock::iterator InsertPt,
- unsigned Size, unsigned Align) const;
+ unsigned Size, Align Alignment) const;
/// isFrameOperand - Return true and the FrameIndex if the specified
/// operand and follow operands form a reference to the stack frame.
diff --git a/llvm/lib/Target/X86/X86InstrInfo.td b/llvm/lib/Target/X86/X86InstrInfo.td
index ca5425e8b89f..23841c3d7e50 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.td
+++ b/llvm/lib/Target/X86/X86InstrInfo.td
@@ -16,10 +16,10 @@
// X86 specific DAG Nodes.
//
-def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisSameAs<1, 2>]>;
-
-def SDTX86Cmps : SDTypeProfile<1, 3, [SDTCisFP<0>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
-//def SDTX86Cmpss : SDTypeProfile<1, 3, [SDTCisVT<0, f32>, SDTCisSameAs<1, 2>, SDTCisVT<3, i8>]>;
+def SDTX86CmpTest : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisInt<1>,
+ SDTCisSameAs<1, 2>]>;
+def SDTX86FCmp : SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisFP<1>,
+ SDTCisSameAs<1, 2>]>;
def SDTX86Cmov : SDTypeProfile<1, 4,
[SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>,
@@ -121,6 +121,8 @@ def SDT_X86WIN_ALLOCA : SDTypeProfile<0, 1, [SDTCisVT<0, iPTR>]>;
def SDT_X86SEG_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+def SDT_X86PROBED_ALLOCA : SDTypeProfile<1, 1, [SDTCisVT<0, iPTR>, SDTCisVT<1, iPTR>]>;
+
def SDT_X86EHRET : SDTypeProfile<0, 1, [SDTCisInt<0>]>;
def SDT_X86TCRET : SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisVT<1, i32>]>;
@@ -138,12 +140,13 @@ def X86MFence : SDNode<"X86ISD::MFENCE", SDT_X86MEMBARRIER,
def X86bsf : SDNode<"X86ISD::BSF", SDTUnaryArithWithFlags>;
def X86bsr : SDNode<"X86ISD::BSR", SDTUnaryArithWithFlags>;
-def X86shld : SDNode<"X86ISD::SHLD", SDTIntShiftDOp>;
-def X86shrd : SDNode<"X86ISD::SHRD", SDTIntShiftDOp>;
+def X86fshl : SDNode<"X86ISD::FSHL", SDTIntShiftDOp>;
+def X86fshr : SDNode<"X86ISD::FSHR", SDTIntShiftDOp>;
def X86cmp : SDNode<"X86ISD::CMP" , SDTX86CmpTest>;
-def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86CmpTest, [SDNPHasChain]>;
-def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86CmpTest, [SDNPHasChain]>;
+def X86fcmp : SDNode<"X86ISD::FCMP", SDTX86FCmp>;
+def X86strict_fcmp : SDNode<"X86ISD::STRICT_FCMP", SDTX86FCmp, [SDNPHasChain]>;
+def X86strict_fcmps : SDNode<"X86ISD::STRICT_FCMPS", SDTX86FCmp, [SDNPHasChain]>;
def X86bt : SDNode<"X86ISD::BT", SDTX86CmpTest>;
def X86cmov : SDNode<"X86ISD::CMOV", SDTX86Cmov>;
@@ -152,8 +155,6 @@ def X86brcond : SDNode<"X86ISD::BRCOND", SDTX86BrCond,
def X86setcc : SDNode<"X86ISD::SETCC", SDTX86SetCC>;
def X86setcc_c : SDNode<"X86ISD::SETCC_CARRY", SDTX86SetCC_C>;
-def X86sahf : SDNode<"X86ISD::SAHF", SDTX86sahf>;
-
def X86rdrand : SDNode<"X86ISD::RDRAND", SDTX86rdrand,
[SDNPHasChain, SDNPSideEffect]>;
@@ -286,6 +287,9 @@ def X86bextr : SDNode<"X86ISD::BEXTR", SDTIntBinOp>;
def X86bzhi : SDNode<"X86ISD::BZHI", SDTIntBinOp>;
+def X86pdep : SDNode<"X86ISD::PDEP", SDTIntBinOp>;
+def X86pext : SDNode<"X86ISD::PEXT", SDTIntBinOp>;
+
def X86mul_imm : SDNode<"X86ISD::MUL_IMM", SDTIntBinOp>;
def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
@@ -294,6 +298,9 @@ def X86WinAlloca : SDNode<"X86ISD::WIN_ALLOCA", SDT_X86WIN_ALLOCA,
def X86SegAlloca : SDNode<"X86ISD::SEG_ALLOCA", SDT_X86SEG_ALLOCA,
[SDNPHasChain]>;
+def X86ProbedAlloca : SDNode<"X86ISD::PROBED_ALLOCA", SDT_X86PROBED_ALLOCA,
+ [SDNPHasChain]>;
+
def X86TLSCall : SDNode<"X86ISD::TLSCALL", SDT_X86TLSCALL,
[SDNPHasChain, SDNPOptInGlue, SDNPOutGlue]>;
@@ -354,6 +361,8 @@ let RenderMethod = "addMemOperands", SuperClasses = [X86MemAsmOperand] in {
def X86Mem512_RC256XOperand : AsmOperandClass { let Name = "Mem512_RC256X"; }
def X86Mem256_RC512Operand : AsmOperandClass { let Name = "Mem256_RC512"; }
def X86Mem512_RC512Operand : AsmOperandClass { let Name = "Mem512_RC512"; }
+
+ def X86SibMemOperand : AsmOperandClass { let Name = "SibMem"; }
}
def X86AbsMemAsmOperand : AsmOperandClass {
@@ -376,14 +385,16 @@ class X86VMemOperand<RegisterClass RC, string printMethod,
let MIOperandInfo = (ops ptr_rc, i8imm, RC, i32imm, SEGMENT_REG);
}
-def anymem : X86MemOperand<"printanymem">;
+def anymem : X86MemOperand<"printMemReference">;
def X86any_fcmp : PatFrags<(ops node:$lhs, node:$rhs),
[(X86strict_fcmp node:$lhs, node:$rhs),
- (X86cmp node:$lhs, node:$rhs)]>;
+ (X86fcmp node:$lhs, node:$rhs)]>;
// FIXME: Right now we allow any size during parsing, but we might want to
// restrict to only unsized memory.
-def opaquemem : X86MemOperand<"printopaquemem">;
+def opaquemem : X86MemOperand<"printMemReference">;
+
+def sibmem: X86MemOperand<"printMemReference", X86SibMemOperand>;
def i8mem : X86MemOperand<"printbytemem", X86Mem8AsmOperand>;
def i16mem : X86MemOperand<"printwordmem", X86Mem16AsmOperand>;
@@ -757,14 +768,14 @@ def i64u8imm : Operand<i64> {
}
def lea64_32mem : Operand<i32> {
- let PrintMethod = "printanymem";
+ let PrintMethod = "printMemReference";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
// Memory operands that use 64-bit pointers in both ILP32 and LP64.
def lea64mem : Operand<i64> {
- let PrintMethod = "printanymem";
+ let PrintMethod = "printMemReference";
let MIOperandInfo = (ops GR64, i8imm, GR64_NOSP, i32imm, SEGMENT_REG);
let ParserMatchClass = X86MemAsmOperand;
}
@@ -830,11 +841,10 @@ def tls64baseaddr : ComplexPattern<i64, 5, "selectTLSADDRAddr",
def vectoraddr : ComplexPattern<iPTR, 5, "selectVectorAddr", [],[SDNPWantParent]>;
-// A relocatable immediate is either an immediate operand or an operand that can
-// be relocated by the linker to an immediate, such as a regular symbol in
-// non-PIC code.
-def relocImm : ComplexPattern<iAny, 1, "selectRelocImm", [imm, X86Wrapper], [],
- 0>;
+// A relocatable immediate is an operand that can be relocated by the linker to
+// an immediate, such as a regular symbol in non-PIC code.
+def relocImm : ComplexPattern<iAny, 1, "selectRelocImm",
+ [X86Wrapper], [], 0>;
//===----------------------------------------------------------------------===//
// X86 Instruction Predicate Definitions.
@@ -922,11 +932,10 @@ def HasRTM : Predicate<"Subtarget->hasRTM()">;
def HasADX : Predicate<"Subtarget->hasADX()">;
def HasSHA : Predicate<"Subtarget->hasSHA()">;
def HasSGX : Predicate<"Subtarget->hasSGX()">;
-def HasPRFCHW : Predicate<"Subtarget->hasPRFCHW()">;
def HasRDSEED : Predicate<"Subtarget->hasRDSEED()">;
def HasSSEPrefetch : Predicate<"Subtarget->hasSSEPrefetch()">;
def NoSSEPrefetch : Predicate<"!Subtarget->hasSSEPrefetch()">;
-def HasPrefetchW : Predicate<"Subtarget->hasPRFCHW()">;
+def HasPrefetchW : Predicate<"Subtarget->hasPrefetchW()">;
def HasPREFETCHWT1 : Predicate<"Subtarget->hasPREFETCHWT1()">;
def HasLAHFSAHF : Predicate<"Subtarget->hasLAHFSAHF()">;
def HasMWAITX : Predicate<"Subtarget->hasMWAITX()">;
@@ -948,18 +957,23 @@ def HasCmpxchg8b : Predicate<"Subtarget->hasCmpxchg8b()">;
def HasCmpxchg16b: Predicate<"Subtarget->hasCmpxchg16b()">;
def HasPCONFIG : Predicate<"Subtarget->hasPCONFIG()">;
def HasENQCMD : Predicate<"Subtarget->hasENQCMD()">;
+def HasSERIALIZE : Predicate<"Subtarget->hasSERIALIZE()">;
+def HasTSXLDTRK : Predicate<"Subtarget->hasTSXLDTRK()">;
+def HasAMXTILE : Predicate<"Subtarget->hasAMXTILE()">;
+def HasAMXBF16 : Predicate<"Subtarget->hasAMXBF16()">;
+def HasAMXINT8 : Predicate<"Subtarget->hasAMXINT8()">;
def Not64BitMode : Predicate<"!Subtarget->is64Bit()">,
- AssemblerPredicate<"!Mode64Bit", "Not 64-bit mode">;
+ AssemblerPredicate<(all_of (not Mode64Bit)), "Not 64-bit mode">;
def In64BitMode : Predicate<"Subtarget->is64Bit()">,
- AssemblerPredicate<"Mode64Bit", "64-bit mode">;
+ AssemblerPredicate<(all_of Mode64Bit), "64-bit mode">;
def IsLP64 : Predicate<"Subtarget->isTarget64BitLP64()">;
def NotLP64 : Predicate<"!Subtarget->isTarget64BitLP64()">;
def In16BitMode : Predicate<"Subtarget->is16Bit()">,
- AssemblerPredicate<"Mode16Bit", "16-bit mode">;
+ AssemblerPredicate<(all_of Mode16Bit), "16-bit mode">;
def Not16BitMode : Predicate<"!Subtarget->is16Bit()">,
- AssemblerPredicate<"!Mode16Bit", "Not 16-bit mode">;
+ AssemblerPredicate<(all_of (not Mode16Bit)), "Not 16-bit mode">;
def In32BitMode : Predicate<"Subtarget->is32Bit()">,
- AssemblerPredicate<"Mode32Bit", "32-bit mode">;
+ AssemblerPredicate<(all_of Mode32Bit), "32-bit mode">;
def IsWin64 : Predicate<"Subtarget->isTargetWin64()">;
def NotWin64 : Predicate<"!Subtarget->isTargetWin64()">;
def NotWin64WithoutFP : Predicate<"!Subtarget->isTargetWin64() ||"
@@ -996,8 +1010,8 @@ def HasFastLZCNT : Predicate<"Subtarget->hasFastLZCNT()">;
def HasFastSHLDRotate : Predicate<"Subtarget->hasFastSHLDRotate()">;
def HasERMSB : Predicate<"Subtarget->hasERMSB()">;
def HasMFence : Predicate<"Subtarget->hasMFence()">;
-def UseRetpolineIndirectCalls : Predicate<"Subtarget->useRetpolineIndirectCalls()">;
-def NotUseRetpolineIndirectCalls : Predicate<"!Subtarget->useRetpolineIndirectCalls()">;
+def UseIndirectThunkCalls : Predicate<"Subtarget->useIndirectThunkCalls()">;
+def NotUseIndirectThunkCalls : Predicate<"!Subtarget->useIndirectThunkCalls()">;
//===----------------------------------------------------------------------===//
// X86 Instruction Format Definitions.
@@ -1033,13 +1047,17 @@ def i32immSExt8 : ImmLeaf<i32, [{ return isInt<8>(Imm); }]>;
def i64immSExt8 : ImmLeaf<i64, [{ return isInt<8>(Imm); }]>;
def i64immSExt32 : ImmLeaf<i64, [{ return isInt<32>(Imm); }]>;
-// FIXME: Ideally we would just replace the above i*immSExt* matchers with
-// relocImm-based matchers, but then FastISel would be unable to use them.
+def i16relocImmSExt8 : PatLeaf<(i16 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
+def i32relocImmSExt8 : PatLeaf<(i32 relocImm), [{
+ return isSExtAbsoluteSymbolRef(8, N);
+}]>;
def i64relocImmSExt8 : PatLeaf<(i64 relocImm), [{
- return isSExtRelocImm<8>(N);
+ return isSExtAbsoluteSymbolRef(8, N);
}]>;
def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
- return isSExtRelocImm<32>(N);
+ return isSExtAbsoluteSymbolRef(32, N);
}]>;
// If we have multiple users of an immediate, it's much smaller to reuse
@@ -1059,6 +1077,13 @@ def i64relocImmSExt32 : PatLeaf<(i64 relocImm), [{
// Eventually, it would be nice to allow ConstantHoisting to merge constants
// globally for potentially added savings.
//
+def imm_su : PatLeaf<(imm), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt32_su : PatLeaf<(i64immSExt32), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+
def relocImm8_su : PatLeaf<(i8 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -1069,20 +1094,26 @@ def relocImm32_su : PatLeaf<(i32 relocImm), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
+def i16relocImmSExt8_su : PatLeaf<(i16relocImmSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+def i32relocImmSExt8_su : PatLeaf<(i32relocImmSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
+def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i64relocImmSExt8_su : PatLeaf<(i64relocImmSExt8), [{
+def i16immSExt8_su : PatLeaf<(i16immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
-def i64relocImmSExt32_su : PatLeaf<(i64relocImmSExt32), [{
+def i32immSExt8_su : PatLeaf<(i32immSExt8), [{
+ return !shouldAvoidImmediateInstFormsForSize(N);
+}]>;
+def i64immSExt8_su : PatLeaf<(i64immSExt8), [{
return !shouldAvoidImmediateInstFormsForSize(N);
}]>;
@@ -1113,7 +1144,7 @@ def loadi16 : PatFrag<(ops node:$ptr), (i16 (unindexedload node:$ptr)), [{
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD)
return true;
- if (ExtType == ISD::EXTLOAD)
+ if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad)
return LD->getAlignment() >= 2 && LD->isSimple();
return false;
}]>;
@@ -1123,7 +1154,7 @@ def loadi32 : PatFrag<(ops node:$ptr), (i32 (unindexedload node:$ptr)), [{
ISD::LoadExtType ExtType = LD->getExtensionType();
if (ExtType == ISD::NON_EXTLOAD)
return true;
- if (ExtType == ISD::EXTLOAD)
+ if (ExtType == ISD::EXTLOAD && EnablePromoteAnyextLoad)
return LD->getAlignment() >= 4 && LD->isSimple();
return false;
}]>;
@@ -1550,7 +1581,7 @@ def MOV16ri : Ii16<0xB8, AddRegFrm, (outs GR16:$dst), (ins i16imm:$src),
[(set GR16:$dst, imm:$src)]>, OpSize16;
def MOV32ri : Ii32<0xB8, AddRegFrm, (outs GR32:$dst), (ins i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(set GR32:$dst, relocImm:$src)]>, OpSize32;
+ [(set GR32:$dst, imm:$src)]>, OpSize32;
def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
[(set GR64:$dst, i64immSExt32:$src)]>;
@@ -1558,7 +1589,7 @@ def MOV64ri32 : RIi32S<0xC7, MRM0r, (outs GR64:$dst), (ins i64i32imm:$src),
let isReMaterializable = 1, isMoveImm = 1 in {
def MOV64ri : RIi64<0xB8, AddRegFrm, (outs GR64:$dst), (ins i64imm:$src),
"movabs{q}\t{$src, $dst|$dst, $src}",
- [(set GR64:$dst, relocImm:$src)]>;
+ [(set GR64:$dst, imm:$src)]>;
}
// Longer forms that use a ModR/M byte. Needed for disassembler
@@ -1578,19 +1609,31 @@ def MOV32ri_alt : Ii32<0xC7, MRM0r, (outs GR32:$dst), (ins i32imm:$src),
let SchedRW = [WriteStore] in {
def MOV8mi : Ii8 <0xC6, MRM0m, (outs), (ins i8mem :$dst, i8imm :$src),
"mov{b}\t{$src, $dst|$dst, $src}",
- [(store (i8 relocImm8_su:$src), addr:$dst)]>;
+ [(store (i8 imm_su:$src), addr:$dst)]>;
def MOV16mi : Ii16<0xC7, MRM0m, (outs), (ins i16mem:$dst, i16imm:$src),
"mov{w}\t{$src, $dst|$dst, $src}",
- [(store (i16 relocImm16_su:$src), addr:$dst)]>, OpSize16;
+ [(store (i16 imm_su:$src), addr:$dst)]>, OpSize16;
def MOV32mi : Ii32<0xC7, MRM0m, (outs), (ins i32mem:$dst, i32imm:$src),
"mov{l}\t{$src, $dst|$dst, $src}",
- [(store (i32 relocImm32_su:$src), addr:$dst)]>, OpSize32;
+ [(store (i32 imm_su:$src), addr:$dst)]>, OpSize32;
def MOV64mi32 : RIi32S<0xC7, MRM0m, (outs), (ins i64mem:$dst, i64i32imm:$src),
"mov{q}\t{$src, $dst|$dst, $src}",
- [(store i64relocImmSExt32_su:$src, addr:$dst)]>,
+ [(store i64immSExt32_su:$src, addr:$dst)]>,
Requires<[In64BitMode]>;
} // SchedRW
+def : Pat<(i32 relocImm:$src), (MOV32ri relocImm:$src)>;
+def : Pat<(i64 relocImm:$src), (MOV64ri relocImm:$src)>;
+
+def : Pat<(store (i8 relocImm8_su:$src), addr:$dst),
+ (MOV8mi addr:$dst, relocImm8_su:$src)>;
+def : Pat<(store (i16 relocImm16_su:$src), addr:$dst),
+ (MOV16mi addr:$dst, relocImm16_su:$src)>;
+def : Pat<(store (i32 relocImm32_su:$src), addr:$dst),
+ (MOV32mi addr:$dst, relocImm32_su:$src)>;
+def : Pat<(store (i64 i64relocImmSExt32_su:$src), addr:$dst),
+ (MOV64mi32 addr:$dst, i64immSExt32_su:$src)>;
+
let hasSideEffects = 0 in {
/// Memory offset versions of moves. The immediate is an address mode sized
@@ -1787,9 +1830,8 @@ def MOV8rm_NOREX : I<0x8A, MRMSrcMem,
// Condition code ops, incl. set if equal/not equal/...
let SchedRW = [WriteLAHFSAHF] in {
-let Defs = [EFLAGS], Uses = [AH] in
-def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf",
- [(set EFLAGS, (X86sahf AH))]>,
+let Defs = [EFLAGS], Uses = [AH], hasSideEffects = 0 in
+def SAHF : I<0x9E, RawFrm, (outs), (ins), "sahf", []>, // flags = AH
Requires<[HasLAHFSAHF]>;
let Defs = [AH], Uses = [EFLAGS], hasSideEffects = 0 in
def LAHF : I<0x9F, RawFrm, (outs), (ins), "lahf", []>, // AH = flags
@@ -2163,24 +2205,24 @@ def CMPXCHG16B : RI<0xC7, MRM1m, (outs), (ins i128mem:$dst),
// Lock instruction prefix
let SchedRW = [WriteMicrocoded] in
-def LOCK_PREFIX : I<0xF0, RawFrm, (outs), (ins), "lock", []>;
+def LOCK_PREFIX : I<0xF0, PrefixByte, (outs), (ins), "lock", []>;
let SchedRW = [WriteNop] in {
// Rex64 instruction prefix
-def REX64_PREFIX : I<0x48, RawFrm, (outs), (ins), "rex64", []>,
+def REX64_PREFIX : I<0x48, PrefixByte, (outs), (ins), "rex64", []>,
Requires<[In64BitMode]>;
// Data16 instruction prefix
-def DATA16_PREFIX : I<0x66, RawFrm, (outs), (ins), "data16", []>;
+def DATA16_PREFIX : I<0x66, PrefixByte, (outs), (ins), "data16", []>;
} // SchedRW
// Repeat string operation instruction prefixes
let Defs = [ECX], Uses = [ECX,DF], SchedRW = [WriteMicrocoded] in {
// Repeat (used with INS, OUTS, MOVS, LODS and STOS)
-def REP_PREFIX : I<0xF3, RawFrm, (outs), (ins), "rep", []>;
+def REP_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "rep", []>;
// Repeat while not equal (used with CMPS and SCAS)
-def REPNE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "repne", []>;
+def REPNE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "repne", []>;
}
// String manipulation instructions
@@ -2581,27 +2623,27 @@ let Predicates = [HasBMI2, NoTBM] in {
}
multiclass bmi_pdep_pext<string mnemonic, RegisterClass RC,
- X86MemOperand x86memop, Intrinsic Int,
+ X86MemOperand x86memop, SDNode OpNode,
PatFrag ld_frag> {
def rr : I<0xF5, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, RC:$src2))]>,
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2))]>,
VEX_4V, Sched<[WriteALU]>;
def rm : I<0xF5, MRMSrcMem, (outs RC:$dst), (ins RC:$src1, x86memop:$src2),
!strconcat(mnemonic, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
- [(set RC:$dst, (Int RC:$src1, (ld_frag addr:$src2)))]>,
+ [(set RC:$dst, (OpNode RC:$src1, (ld_frag addr:$src2)))]>,
VEX_4V, Sched<[WriteALU.Folded, WriteALU.ReadAfterFold]>;
}
let Predicates = [HasBMI2] in {
defm PDEP32 : bmi_pdep_pext<"pdep{l}", GR32, i32mem,
- int_x86_bmi_pdep_32, loadi32>, T8XD;
+ X86pdep, loadi32>, T8XD;
defm PDEP64 : bmi_pdep_pext<"pdep{q}", GR64, i64mem,
- int_x86_bmi_pdep_64, loadi64>, T8XD, VEX_W;
+ X86pdep, loadi64>, T8XD, VEX_W;
defm PEXT32 : bmi_pdep_pext<"pext{l}", GR32, i32mem,
- int_x86_bmi_pext_32, loadi32>, T8XS;
+ X86pext, loadi32>, T8XS;
defm PEXT64 : bmi_pdep_pext<"pext{q}", GR64, i64mem,
- int_x86_bmi_pext_64, loadi64>, T8XS, VEX_W;
+ X86pext, loadi64>, T8XS, VEX_W;
}
//===----------------------------------------------------------------------===//
@@ -2785,11 +2827,11 @@ let SchedRW = [WriteStore] in {
def MOVDIRI32 : I<0xF9, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore32 addr:$dst, GR32:$src)]>,
- T8, Requires<[HasMOVDIRI]>;
+ T8PS, Requires<[HasMOVDIRI]>;
def MOVDIRI64 : RI<0xF9, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src),
"movdiri\t{$src, $dst|$dst, $src}",
[(int_x86_directstore64 addr:$dst, GR64:$src)]>,
- T8, Requires<[In64BitMode, HasMOVDIRI]>;
+ T8PS, Requires<[In64BitMode, HasMOVDIRI]>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -2856,6 +2898,23 @@ def : InstAlias<"clzero\t{%eax|eax}", (CLZERO32r)>, Requires<[Not64BitMode]>;
def : InstAlias<"clzero\t{%rax|rax}", (CLZERO64r)>, Requires<[In64BitMode]>;
//===----------------------------------------------------------------------===//
+// SERIALIZE Instruction
+//
+def SERIALIZE : I<0x01, MRM_E8, (outs), (ins), "serialize",
+ [(int_x86_serialize)]>, PS,
+ Requires<[HasSERIALIZE]>;
+
+//===----------------------------------------------------------------------===//
+// TSXLDTRK - TSX Suspend Load Address Tracking
+//
+let Predicates = [HasTSXLDTRK] in {
+ def XSUSLDTRK : I<0x01, MRM_E8, (outs), (ins), "xsusldtrk",
+ [(int_x86_xsusldtrk)]>, XD;
+ def XRESLDTRK : I<0x01, MRM_E9, (outs), (ins), "xresldtrk",
+ [(int_x86_xresldtrk)]>, XD;
+}
+
+//===----------------------------------------------------------------------===//
// Pattern fragments to auto generate TBM instructions.
//===----------------------------------------------------------------------===//
@@ -2913,6 +2972,11 @@ let Predicates = [HasTBM] in {
(TZMSK64rr GR64:$src)>;
// Patterns to match flag producing ops.
+ def : Pat<(and_flag_nocf GR32:$src, (add GR32:$src, 1)),
+ (BLCFILL32rr GR32:$src)>;
+ def : Pat<(and_flag_nocf GR64:$src, (add GR64:$src, 1)),
+ (BLCFILL64rr GR64:$src)>;
+
def : Pat<(or_flag_nocf GR32:$src, (not (add GR32:$src, 1))),
(BLCI32rr GR32:$src)>;
def : Pat<(or_flag_nocf GR64:$src, (not (add GR64:$src, 1))),
@@ -2974,7 +3038,7 @@ def CLWB : I<0xAE, MRM6m, (outs), (ins i8mem:$src), "clwb\t$src",
let Predicates = [HasCLDEMOTE], SchedRW = [WriteLoad] in
def CLDEMOTE : I<0x1C, MRM0m, (outs), (ins i8mem:$src), "cldemote\t$src",
- [(int_x86_cldemote addr:$src)]>, TB;
+ [(int_x86_cldemote addr:$src)]>, PS;
//===----------------------------------------------------------------------===//
// Subsystems.
@@ -3013,6 +3077,9 @@ include "X86InstrSVM.td"
include "X86InstrTSX.td"
include "X86InstrSGX.td"
+// AMX instructions
+include "X86InstrAMX.td"
+
// System instructions.
include "X86InstrSystem.td"
@@ -3108,6 +3175,9 @@ def : MnemonicAlias<"smovl", "movsl", "att">;
def : MnemonicAlias<"smovq", "movsq", "att">;
def : MnemonicAlias<"ud2a", "ud2", "att">;
+def : MnemonicAlias<"ud2bw", "ud1w", "att">;
+def : MnemonicAlias<"ud2bl", "ud1l", "att">;
+def : MnemonicAlias<"ud2bq", "ud1q", "att">;
def : MnemonicAlias<"verrw", "verr", "att">;
// MS recognizes 'xacquire'/'xrelease' as 'acquire'/'release'
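For readers following the immediate-operand rework in the X86InstrInfo.td hunks above: the patch layers two checks, isSExtAbsoluteSymbolRef() for absolute-symbol relocations that fit the given width and shouldAvoidImmediateInstFormsForSize() for the optimize-for-size gate, into separate PatLeaf records. The sketch below shows how the two predicates compose into a single leaf; the record name is hypothetical and the def is illustrative only, not part of the patch.

def i32relocImmSExt8_demo : PatLeaf<(i32 relocImm), [{
  // Accept only absolute-symbol references that fit a sign-extended 8-bit
  // immediate and are still worth emitting as an immediate form when the
  // function is being optimized for size.
  return isSExtAbsoluteSymbolRef(8, N) &&
         !shouldAvoidImmediateInstFormsForSize(N);
}]>;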
diff --git a/llvm/lib/Target/X86/X86InstrMMX.td b/llvm/lib/Target/X86/X86InstrMMX.td
index 0f4d4d764cc9..49940204c25a 100644
--- a/llvm/lib/Target/X86/X86InstrMMX.td
+++ b/llvm/lib/Target/X86/X86InstrMMX.td
@@ -24,8 +24,9 @@
// We set canFoldAsLoad because this can be converted to a constant-pool
// load of an all-zeros value if folding it would be beneficial.
let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1,
- isPseudo = 1, SchedRW = [WriteZero] in {
-def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "", []>;
+ isPseudo = 1, SchedRW = [WriteZero], Predicates = [HasMMX] in {
+def MMX_SET0 : I<0, Pseudo, (outs VR64:$dst), (ins), "",
+ [(set VR64:$dst, (x86mmx (MMX_X86movw2d (i32 0))))]>;
}
let Constraints = "$src1 = $dst" in {
@@ -43,8 +44,7 @@ let Constraints = "$src1 = $dst" in {
def irm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, OType:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -60,8 +60,7 @@ let Constraints = "$src1 = $dst" in {
def rm : MMXI<opc, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
- [(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
def ri : MMXIi8<opc2, ImmForm, (outs VR64:$dst),
(ins VR64:$src1, i32u8imm:$src2),
@@ -81,8 +80,7 @@ multiclass SS3I_unop_rm_int_mm<bits<8> opc, string OpcodeStr,
def rm : MMXSS38I<opc, MRMSrcMem, (outs VR64:$dst), (ins i64mem:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR64:$dst,
- (IntId64 (bitconvert (load_mmx addr:$src))))]>,
+ [(set VR64:$dst, (IntId64 (load_mmx addr:$src)))]>,
Sched<[sched.Folded]>;
}
@@ -101,8 +99,7 @@ multiclass SS3I_binop_rm_int_mm<bits<8> opc, string OpcodeStr,
(ins VR64:$src1, i64mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"),
[(set VR64:$dst,
- (IntId64 VR64:$src1,
- (bitconvert (load_mmx addr:$src2))))]>,
+ (IntId64 VR64:$src1, (load_mmx addr:$src2)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -118,8 +115,8 @@ multiclass ssse3_palign_mm<string asm, Intrinsic IntId,
def rmi : MMXSS3AI<0x0F, MRMSrcMem, (outs VR64:$dst),
(ins VR64:$src1, i64mem:$src2, u8imm:$src3),
!strconcat(asm, "\t{$src3, $src2, $dst|$dst, $src2, $src3}"),
- [(set VR64:$dst, (IntId VR64:$src1,
- (bitconvert (load_mmx addr:$src2)), (i8 timm:$src3)))]>,
+ [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2),
+ (i8 timm:$src3)))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -164,23 +161,14 @@ def MMX_EMMS : MMXI<0x77, RawFrm, (outs), (ins), "emms", [(int_x86_mmx_emms)]>;
def MMX_MOVD64rr : MMXI<0x6E, MRMSrcReg, (outs VR64:$dst), (ins GR32:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector GR32:$src)))]>,
+ (x86mmx (MMX_X86movw2d GR32:$src)))]>,
Sched<[WriteVecMoveFromGpr]>;
def MMX_MOVD64rm : MMXI<0x6E, MRMSrcMem, (outs VR64:$dst), (ins i32mem:$src),
"movd\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (scalar_to_vector (loadi32 addr:$src))))]>,
+ (x86mmx (MMX_X86movw2d (loadi32 addr:$src))))]>,
Sched<[WriteVecLoad]>;
-let Predicates = [HasMMX] in {
- def : Pat<(x86mmx (MMX_X86movw2d GR32:$src)),
- (MMX_MOVD64rr GR32:$src)>;
- def : Pat<(x86mmx (MMX_X86movw2d (i32 0))),
- (MMX_SET0)>;
- def : Pat<(x86mmx (MMX_X86movw2d (loadi32 addr:$src))),
- (MMX_MOVD64rm addr:$src)>;
-}
-
let mayStore = 1 in
def MMX_MOVD64mr : MMXI<0x7E, MRMDestMem, (outs), (ins i32mem:$dst, VR64:$src),
"movd\t{$src, $dst|$dst, $src}", []>,
@@ -240,20 +228,21 @@ def MMX_MOVQ64mr : MMXI<0x7F, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
"movq\t{$src, $dst|$dst, $src}",
[(store (x86mmx VR64:$src), addr:$dst)]>;
+def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
+ [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
+def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
+ [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
+
let SchedRW = [SchedWriteVecMoveLS.XMM.RR] in {
def MMX_MOVDQ2Qrr : MMXSDIi8<0xD6, MRMSrcReg, (outs VR64:$dst),
(ins VR128:$src), "movdq2q\t{$src, $dst|$dst, $src}",
[(set VR64:$dst,
- (x86mmx (bitconvert
- (i64 (extractelt (v2i64 VR128:$src),
- (iPTR 0))))))]>;
+ (x86mmx (MMX_X86movdq2q VR128:$src)))]>;
def MMX_MOVQ2DQrr : MMXS2SIi8<0xD6, MRMSrcReg, (outs VR128:$dst),
(ins VR64:$src), "movq2dq\t{$src, $dst|$dst, $src}",
[(set VR128:$dst,
- (v2i64
- (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src))))))]>;
+ (v2i64 (MMX_X86movq2dq VR64:$src)))]>;
let isCodeGenOnly = 1, hasSideEffects = 1 in {
def MMX_MOVQ2FR64rr: MMXS2SIi8<0xD6, MRMSrcReg, (outs FR64:$dst),
@@ -272,14 +261,6 @@ def MMX_MOVNTQmr : MMXI<0xE7, MRMDestMem, (outs), (ins i64mem:$dst, VR64:$src),
[(int_x86_mmx_movnt_dq addr:$dst, VR64:$src)]>,
Sched<[SchedWriteVecMoveLSNT.MMX.MR]>;
-let Predicates = [HasMMX] in {
- // movd to MMX register zero-extends
- def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector GR32:$src)))),
- (MMX_MOVD64rr GR32:$src)>;
- def : Pat<(x86mmx (X86vzmovl (x86mmx (scalar_to_vector (loadi32 addr:$src))))),
- (MMX_MOVD64rm addr:$src)>;
-}
-
// Arithmetic Instructions
defm MMX_PABSB : SS3I_unop_rm_int_mm<0x1C, "pabsb", int_x86_ssse3_pabs_b,
SchedWriteVecALU.MMX>;
@@ -566,27 +547,6 @@ def MMX_PMOVMSKBrr : MMXI<0xD7, MRMSrcReg, (outs GR32orGR64:$dst),
(int_x86_mmx_pmovmskb VR64:$src))]>,
Sched<[WriteMMXMOVMSK]>;
-// MMX to XMM for vector types
-def MMX_X86movq2dq : SDNode<"X86ISD::MOVQ2DQ", SDTypeProfile<1, 1,
- [SDTCisVT<0, v2i64>, SDTCisVT<1, x86mmx>]>>;
-
-def : Pat<(v2i64 (MMX_X86movq2dq VR64:$src)),
- (v2i64 (MMX_MOVQ2DQrr VR64:$src))>;
-
-// Low word of XMM to MMX.
-def MMX_X86movdq2q : SDNode<"X86ISD::MOVDQ2Q", SDTypeProfile<1, 1,
- [SDTCisVT<0, x86mmx>, SDTCisVT<1, v2i64>]>>;
-
-def : Pat<(x86mmx (MMX_X86movdq2q VR128:$src)),
- (x86mmx (MMX_MOVDQ2Qrr VR128:$src))>;
-
-def : Pat<(x86mmx (MMX_X86movdq2q (v2i64 (simple_load addr:$src)))),
- (x86mmx (MMX_MOVQ64rm addr:$src))>;
-
-def : Pat<(v2i64 (X86vzmovl (scalar_to_vector
- (i64 (bitconvert (x86mmx VR64:$src)))))),
- (MMX_MOVQ2DQrr VR64:$src)>;
-
// Misc.
let SchedRW = [SchedWriteShuffle.MMX] in {
let Uses = [EDI], Predicates = [HasMMX, HasSSE1,Not64BitMode] in
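The recurring change in the X86InstrMMX.td hunks above is the removal of the bitconvert wrapper around folded loads, presumably possible because load_mmx now yields an x86mmx-typed value directly, so the extra node never appears in the DAG being matched. The contrast below is illustrative only (IntId and addr are the multiclass parameters used above), not new content from the patch.

// Folded-load operand before this change:
//   [(set VR64:$dst, (IntId VR64:$src1, (bitconvert (load_mmx addr:$src2))))]
// Folded-load operand after this change:
//   [(set VR64:$dst, (IntId VR64:$src1, (load_mmx addr:$src2)))]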
diff --git a/llvm/lib/Target/X86/X86InstrSGX.td b/llvm/lib/Target/X86/X86InstrSGX.td
index 747f5aa86653..6439f717accb 100644
--- a/llvm/lib/Target/X86/X86InstrSGX.td
+++ b/llvm/lib/Target/X86/X86InstrSGX.td
@@ -17,13 +17,13 @@
let SchedRW = [WriteSystem], Predicates = [HasSGX] in {
// ENCLS - Execute an Enclave System Function of Specified Leaf Number
def ENCLS : I<0x01, MRM_CF, (outs), (ins),
- "encls", []>, TB;
+ "encls", []>, PS;
// ENCLU - Execute an Enclave User Function of Specified Leaf Number
def ENCLU : I<0x01, MRM_D7, (outs), (ins),
- "enclu", []>, TB;
+ "enclu", []>, PS;
// ENCLV - Execute an Enclave VMM Function of Specified Leaf Number
def ENCLV : I<0x01, MRM_C0, (outs), (ins),
- "enclv", []>, TB;
+ "enclv", []>, PS;
} // SchedRW
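The TB to PS swap in the SGX defs matches the T8 to T8PS, TA to TAPS and TB to PS changes elsewhere in this diff: the prefix-qualified classes appear to make the "no mandatory 66/F2/F3 prefix" requirement explicit in the instruction's encoding bits, where the bare opcode-map classes left it unspecified. Spelling contrast only, condensed from the ENCLS def in the hunk above; this is not additional patch content.

//   before: def ENCLS : I<0x01, MRM_CF, (outs), (ins), "encls", []>, TB;
//   after:  def ENCLS : I<0x01, MRM_CF, (outs), (ins), "encls", []>, PS;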
diff --git a/llvm/lib/Target/X86/X86InstrSSE.td b/llvm/lib/Target/X86/X86InstrSSE.td
index c45f342ed75b..c3c9f22381f8 100644
--- a/llvm/lib/Target/X86/X86InstrSSE.td
+++ b/llvm/lib/Target/X86/X86InstrSSE.td
@@ -43,7 +43,7 @@ let isCodeGenOnly = 1 in {
multiclass sse12_fp_scalar_int<bits<8> opc, string OpcodeStr,
SDPatternOperator OpNode, RegisterClass RC,
ValueType VT, string asm, Operand memopr,
- ComplexPattern mem_cpat, Domain d,
+ PatFrags mem_frags, Domain d,
X86FoldableSchedWrite sched, bit Is2Addr = 1> {
let hasSideEffects = 0 in {
def rr_Int : SI_Int<opc, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2),
@@ -57,7 +57,7 @@ let hasSideEffects = 0 in {
!if(Is2Addr,
!strconcat(asm, "\t{$src2, $dst|$dst, $src2}"),
!strconcat(asm, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")),
- [(set RC:$dst, (VT (OpNode RC:$src1, mem_cpat:$src2)))], d>,
+ [(set RC:$dst, (VT (OpNode RC:$src1, (mem_frags addr:$src2))))], d>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
}
@@ -720,11 +720,7 @@ def MOVHPDmr : PDI<0x17, MRMDestMem, (outs), (ins f64mem:$dst, VR128:$src),
} // SchedRW
let Predicates = [UseAVX] in {
- // Also handle an i64 load because that may get selected as a faster way to
- // load the data.
- def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
- (VMOVHPDrm VR128:$src1, addr:$src2)>;
+ // MOVHPD patterns
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(VMOVHPDrm VR128:$src1, addr:$src2)>;
@@ -754,12 +750,6 @@ let Predicates = [UseSSE1] in {
let Predicates = [UseSSE2] in {
// MOVHPD patterns
-
- // Also handle an i64 load because that may get selected as a faster way to
- // load the data.
- def : Pat<(v2f64 (X86Unpckl VR128:$src1,
- (bc_v2f64 (v2i64 (scalar_to_vector (loadi64 addr:$src2)))))),
- (MOVHPDrm VR128:$src1, addr:$src2)>;
def : Pat<(v2f64 (X86Unpckl VR128:$src1, (X86vzload64 addr:$src2))),
(MOVHPDrm VR128:$src1, addr:$src2)>;
@@ -884,6 +874,23 @@ defm VCVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf6
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>,
XD, VEX, VEX_W, VEX_LIG;
+
+defm VCVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_LIG;
+defm VCVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>,
+ XS, VEX, VEX_W, VEX_LIG;
+defm VCVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_LIG;
+defm VCVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>,
+ XD, VEX, VEX_W, VEX_LIG;
}
// The assembler can recognize rr 64-bit instructions by seeing a rxx
@@ -923,6 +930,12 @@ let Predicates = [UseAVX] in {
(VCVTSI2SDrr (f64 (IMPLICIT_DEF)), GR32:$src)>;
def : Pat<(f64 (any_sint_to_fp GR64:$src)),
(VCVTSI642SDrr (f64 (IMPLICIT_DEF)), GR64:$src)>;
+
+ def : Pat<(i64 (lrint FR32:$src)), (VCVTSS2SI64rr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (VCVTSS2SI64rm addr:$src)>;
+
+ def : Pat<(i64 (lrint FR64:$src)), (VCVTSD2SI64rr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (VCVTSD2SI64rm addr:$src)>;
}
let isCodeGenOnly = 1 in {
@@ -938,6 +951,20 @@ defm CVTTSD2SI : sse12_cvt_s<0x2C, FR64, GR32, any_fp_to_sint, f64mem, loadf64,
defm CVTTSD2SI64 : sse12_cvt_s<0x2C, FR64, GR64, any_fp_to_sint, f64mem, loadf64,
"cvttsd2si", "cvttsd2si",
WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+
+defm CVTSS2SI : sse12_cvt_s<0x2D, FR32, GR32, lrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, SIMD_EXC;
+defm CVTSS2SI64 : sse12_cvt_s<0x2D, FR32, GR64, llrint, f32mem, loadf32,
+ "cvtss2si", "cvtss2si",
+ WriteCvtSS2I, SSEPackedSingle>, XS, REX_W, SIMD_EXC;
+defm CVTSD2SI : sse12_cvt_s<0x2D, FR64, GR32, lrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, SIMD_EXC;
+defm CVTSD2SI64 : sse12_cvt_s<0x2D, FR64, GR64, llrint, f64mem, loadf64,
+ "cvtsd2si", "cvtsd2si",
+ WriteCvtSD2I, SSEPackedDouble>, XD, REX_W, SIMD_EXC;
+
defm CVTSI2SS : sse12_cvt_s<0x2A, GR32, FR32, any_sint_to_fp, i32mem, loadi32,
"cvtsi2ss", "cvtsi2ss{l}",
WriteCvtI2SS, SSEPackedSingle, ReadInt2Fpu>, XS, SIMD_EXC;
@@ -952,12 +979,22 @@ defm CVTSI642SD : sse12_cvt_s<0x2A, GR64, FR64, any_sint_to_fp, i64mem, loadi64,
WriteCvtI2SD, SSEPackedDouble, ReadInt2Fpu>, XD, REX_W, SIMD_EXC;
} // isCodeGenOnly = 1
+let Predicates = [UseSSE1] in {
+ def : Pat<(i64 (lrint FR32:$src)), (CVTSS2SI64rr FR32:$src)>;
+ def : Pat<(i64 (lrint (loadf32 addr:$src))), (CVTSS2SI64rm addr:$src)>;
+}
+
+let Predicates = [UseSSE2] in {
+ def : Pat<(i64 (lrint FR64:$src)), (CVTSD2SI64rr FR64:$src)>;
+ def : Pat<(i64 (lrint (loadf64 addr:$src))), (CVTSD2SI64rm addr:$src)>;
+}
+
// Conversion Instructions Intrinsics - Match intrinsics which expect MM
// and/or XMM operand(s).
multiclass sse12_cvt_sint<bits<8> opc, RegisterClass SrcRC, RegisterClass DstRC,
ValueType DstVT, ValueType SrcVT, SDNode OpNode,
- Operand memop, ComplexPattern mem_cpat, string asm,
+ Operand memop, PatFrags mem_frags, string asm,
X86FoldableSchedWrite sched, Domain d> {
let ExeDomain = d in {
def rr_Int : SI<opc, MRMSrcReg, (outs DstRC:$dst), (ins SrcRC:$src),
@@ -966,7 +1003,7 @@ let ExeDomain = d in {
Sched<[sched]>;
def rm_Int : SI<opc, MRMSrcMem, (outs DstRC:$dst), (ins memop:$src),
!strconcat(asm, "\t{$src, $dst|$dst, $src}"),
- [(set DstRC:$dst, (DstVT (OpNode (SrcVT mem_cpat:$src))))]>,
+ [(set DstRC:$dst, (DstVT (OpNode (SrcVT (mem_frags addr:$src)))))]>,
Sched<[sched.Folded]>;
}
}
@@ -1247,7 +1284,7 @@ def VCVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"vcvtsd2ss\t{$src2, $src1, $dst|$dst, $src1, $src2}",
[(set VR128:$dst,
- (v4f32 (X86frounds VR128:$src1, sse_load_f64:$src2)))]>,
+ (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
XD, VEX_4V, VEX_LIG, VEX_WIG, Requires<[UseAVX]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
let Constraints = "$src1 = $dst" in {
@@ -1261,7 +1298,7 @@ def CVTSD2SSrm_Int: I<0x5A, MRMSrcMem,
(outs VR128:$dst), (ins VR128:$src1, sdmem:$src2),
"cvtsd2ss\t{$src2, $dst|$dst, $src2}",
[(set VR128:$dst,
- (v4f32 (X86frounds VR128:$src1,sse_load_f64:$src2)))]>,
+ (v4f32 (X86frounds VR128:$src1, (sse_load_f64 addr:$src2))))]>,
XD, Requires<[UseSSE2]>,
Sched<[WriteCvtSD2SS.Folded, WriteCvtSD2SS.ReadAfterFold]>;
}
@@ -1745,124 +1782,94 @@ def CVTPD2PSrm : PDI<0x5A, MRMSrcMem, (outs VR128:$dst), (ins f128mem:$src),
// sse12_cmp_scalar - sse 1 & 2 compare scalar instructions
multiclass sse12_cmp_scalar<RegisterClass RC, X86MemOperand x86memop,
- SDNode OpNode, ValueType VT,
+ Operand memop, SDNode OpNode, ValueType VT,
PatFrag ld_frag, string asm,
- X86FoldableSchedWrite sched> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
- let isCommutable = 1 in
- def rr : SIi8<0xC2, MRMSrcReg,
- (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1), RC:$src2, timm:$cc))]>,
- Sched<[sched]>;
- def rm : SIi8<0xC2, MRMSrcMem,
- (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
- [(set RC:$dst, (OpNode (VT RC:$src1),
- (ld_frag addr:$src2), timm:$cc))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
-}
-
-let isCodeGenOnly = 1 in {
- let ExeDomain = SSEPackedSingle in
- defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
- "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS, VEX_4V, VEX_LIG, VEX_WIG;
- let ExeDomain = SSEPackedDouble in
- defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
- "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
-
- let Constraints = "$src1 = $dst" in {
- let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar<FR32, f32mem, X86cmps, f32, loadf32,
- "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PS.Scl>, XS;
- let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar<FR64, f64mem, X86cmps, f64, loadf64,
- "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
- SchedWriteFCmpSizes.PD.Scl>, XD;
- }
-}
-
-multiclass sse12_cmp_scalar_int<Operand memop,
- Intrinsic Int, string asm, X86FoldableSchedWrite sched,
- ComplexPattern mem_cpat> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
+ X86FoldableSchedWrite sched,
+ PatFrags mem_frags> {
def rr_Int : SIi8<0xC2, MRMSrcReg, (outs VR128:$dst),
- (ins VR128:$src1, VR128:$src, u8imm:$cc), asm,
- [(set VR128:$dst, (Int VR128:$src1,
- VR128:$src, timm:$cc))]>,
- Sched<[sched]>;
-let mayLoad = 1 in
+ (ins VR128:$src1, VR128:$src2, u8imm:$cc), asm,
+ [(set VR128:$dst, (OpNode (VT VR128:$src1),
+ VR128:$src2, timm:$cc))]>,
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
def rm_Int : SIi8<0xC2, MRMSrcMem, (outs VR128:$dst),
- (ins VR128:$src1, memop:$src, u8imm:$cc), asm,
- [(set VR128:$dst, (Int VR128:$src1,
- mem_cpat:$src, timm:$cc))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+ (ins VR128:$src1, memop:$src2, u8imm:$cc), asm,
+ [(set VR128:$dst, (OpNode (VT VR128:$src1),
+ (mem_frags addr:$src2), timm:$cc))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+
+ let isCodeGenOnly = 1 in {
+ let isCommutable = 1 in
+ def rr : SIi8<0xC2, MRMSrcReg,
+ (outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (OpNode RC:$src1, RC:$src2, timm:$cc))]>,
+ Sched<[sched]>, SIMD_EXC;
+ def rm : SIi8<0xC2, MRMSrcMem,
+ (outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
+ [(set RC:$dst, (OpNode RC:$src1,
+ (ld_frag addr:$src2), timm:$cc))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
+ }
}
-// Aliases to match intrinsics which expect XMM operand(s).
let ExeDomain = SSEPackedSingle in
-defm VCMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
- "cmpss\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
- XS, VEX_4V, VEX_LIG, VEX_WIG;
+defm VCMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+ "cmpss\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>,
+ XS, VEX_4V, VEX_LIG, VEX_WIG;
let ExeDomain = SSEPackedDouble in
-defm VCMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
- "cmpsd\t{$cc, $src, $src1, $dst|$dst, $src1, $src, $cc}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
- XD, VEX_4V, VEX_LIG, VEX_WIG;
+defm VCMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+ "cmpsd\t{$cc, $src2, $src1, $dst|$dst, $src1, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>,
+ XD, VEX_4V, VEX_LIG, VEX_WIG;
+
let Constraints = "$src1 = $dst" in {
let ExeDomain = SSEPackedSingle in
- defm CMPSS : sse12_cmp_scalar_int<ssmem, int_x86_sse_cmp_ss,
- "cmpss\t{$cc, $src, $dst|$dst, $src, $cc}",
- SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
+ defm CMPSS : sse12_cmp_scalar<FR32, f32mem, ssmem, X86cmps, v4f32, loadf32,
+ "cmpss\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PS.Scl, sse_load_f32>, XS;
let ExeDomain = SSEPackedDouble in
- defm CMPSD : sse12_cmp_scalar_int<sdmem, int_x86_sse2_cmp_sd,
- "cmpsd\t{$cc, $src, $dst|$dst, $src, $cc}",
- SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
+ defm CMPSD : sse12_cmp_scalar<FR64, f64mem, sdmem, X86cmps, v2f64, loadf64,
+ "cmpsd\t{$cc, $src2, $dst|$dst, $src2, $cc}",
+ SchedWriteFCmpSizes.PD.Scl, sse_load_f64>, XD;
}
-
// sse12_ord_cmp - Unordered/Ordered scalar fp compare and set EFLAGS
multiclass sse12_ord_cmp<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, X86MemOperand x86memop,
PatFrag ld_frag, string OpcodeStr, Domain d,
- X86FoldableSchedWrite sched = WriteFCom> {
-let hasSideEffects = 0, Uses = [MXCSR], mayRaiseFPException = 1,
- ExeDomain = d in {
+ X86FoldableSchedWrite sched = WriteFComX> {
+ let ExeDomain = d in {
def rr: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
- Sched<[sched]>;
-let mayLoad = 1 in
+ Sched<[sched]>, SIMD_EXC;
+ let mayLoad = 1 in
def rm: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, x86memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
(ld_frag addr:$src2)))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
// sse12_ord_cmp_int - Intrinsic version of sse12_ord_cmp
multiclass sse12_ord_cmp_int<bits<8> opc, RegisterClass RC, SDNode OpNode,
ValueType vt, Operand memop,
- ComplexPattern mem_cpat, string OpcodeStr,
+ PatFrags mem_frags, string OpcodeStr,
Domain d,
- X86FoldableSchedWrite sched = WriteFCom> {
-let Uses = [MXCSR], mayRaiseFPException = 1, ExeDomain = d in {
+ X86FoldableSchedWrite sched = WriteFComX> {
+let ExeDomain = d in {
def rr_Int: SI<opc, MRMSrcReg, (outs), (ins RC:$src1, RC:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1), RC:$src2))]>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
let mayLoad = 1 in
def rm_Int: SI<opc, MRMSrcMem, (outs), (ins RC:$src1, memop:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1|$src1, $src2}"),
[(set EFLAGS, (OpNode (vt RC:$src1),
- mem_cpat:$src2))]>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
+ (mem_frags addr:$src2)))]>,
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
}
@@ -1914,18 +1921,16 @@ multiclass sse12_cmp_packed<RegisterClass RC, X86MemOperand x86memop,
ValueType VT, string asm,
X86FoldableSchedWrite sched,
Domain d, PatFrag ld_frag> {
-let Uses = [MXCSR], mayRaiseFPException = 1 in {
let isCommutable = 1 in
def rri : PIi8<0xC2, MRMSrcReg,
(outs RC:$dst), (ins RC:$src1, RC:$src2, u8imm:$cc), asm,
[(set RC:$dst, (VT (X86any_cmpp RC:$src1, RC:$src2, timm:$cc)))], d>,
- Sched<[sched]>;
+ Sched<[sched]>, SIMD_EXC;
def rmi : PIi8<0xC2, MRMSrcMem,
(outs RC:$dst), (ins RC:$src1, x86memop:$src2, u8imm:$cc), asm,
[(set RC:$dst,
(VT (X86any_cmpp RC:$src1, (ld_frag addr:$src2), timm:$cc)))], d>,
- Sched<[sched.Folded, sched.ReadAfterFold]>;
-}
+ Sched<[sched.Folded, sched.ReadAfterFold]>, SIMD_EXC;
}
defm VCMPPS : sse12_cmp_packed<VR128, f128mem, v4f32,
@@ -2812,7 +2817,7 @@ multiclass sse_fp_unop_s<bits<8> opc, string OpcodeStr, RegisterClass RC,
}
multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
- ComplexPattern int_cpat, Intrinsic Intr,
+ PatFrags mem_frags, Intrinsic Intr,
Predicate target, string Suffix> {
let Predicates = [target] in {
// These are unary operations, but they are modeled as having 2 source operands
@@ -2828,13 +2833,13 @@ multiclass sse_fp_unop_s_intr<RegisterClass RC, ValueType vt,
// which has a clobber before the rcp, vs.
// rcpss mem, %xmm0
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
+ def : Pat<(Intr (mem_frags addr:$src2)),
(!cast<Instruction>(NAME#m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
}
-multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int_cpat,
+multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, PatFrags mem_frags,
Intrinsic Intr, Predicate target> {
let Predicates = [target] in {
def : Pat<(Intr VR128:$src),
@@ -2842,7 +2847,7 @@ multiclass avx_fp_unop_s_intr<RegisterClass RC, ValueType vt, ComplexPattern int
VR128:$src)>;
}
let Predicates = [target, OptForSize] in {
- def : Pat<(Intr int_cpat:$src2),
+ def : Pat<(Intr (mem_frags addr:$src2)),
(!cast<Instruction>(NAME#m_Int)
(vt (IMPLICIT_DEF)), addr:$src2)>;
}
@@ -2968,28 +2973,28 @@ let Predicates = [HasAVX, NoVLX] in {
multiclass sse1_fp_unop_s_intr<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
defm SS : sse_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
UseSSE1, "SS">, XS;
defm V#NAME#SS : avx_fp_unop_s_intr<FR32, v4f32, sse_load_f32,
- !cast<Intrinsic>("int_x86_sse_"##OpcodeStr##_ss),
+ !cast<Intrinsic>("int_x86_sse_"#OpcodeStr#_ss),
AVXTarget>,
XS, VEX_4V, VEX_LIG, VEX_WIG, NotMemoryFoldable;
}
multiclass sse1_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SS : sse_fp_unop_s<opc, OpcodeStr##ss, FR32, f32, f32mem,
+ defm SS : sse_fp_unop_s<opc, OpcodeStr#ss, FR32, f32, f32mem,
ssmem, OpNode, SSEPackedSingle, sched.Scl, UseSSE1>, XS;
- defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr##ss, FR32, f32,
+ defm V#NAME#SS : avx_fp_unop_s<opc, "v"#OpcodeStr#ss, FR32, f32,
f32mem, ssmem, OpNode, SSEPackedSingle, sched.Scl, AVXTarget>,
XS, VEX_4V, VEX_LIG, VEX_WIG;
}
multiclass sse2_fp_unop_s<bits<8> opc, string OpcodeStr, SDNode OpNode,
X86SchedWriteWidths sched, Predicate AVXTarget> {
- defm SD : sse_fp_unop_s<opc, OpcodeStr##sd, FR64, f64, f64mem,
+ defm SD : sse_fp_unop_s<opc, OpcodeStr#sd, FR64, f64, f64mem,
sdmem, OpNode, SSEPackedDouble, sched.Scl, UseSSE2>, XD;
- defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr##sd, FR64, f64,
+ defm V#NAME#SD : avx_fp_unop_s<opc, "v"#OpcodeStr#sd, FR64, f64,
f64mem, sdmem, OpNode, SSEPackedDouble, sched.Scl, AVXTarget>,
XD, VEX_4V, VEX_LIG, VEX_WIG;
}
@@ -3185,13 +3190,13 @@ def PAUSE : I<0x90, RawFrm, (outs), (ins),
let SchedRW = [WriteFence] in {
// Load, store, and memory fence
-// TODO: As with mfence, we may want to ease the availablity of sfence/lfence
+// TODO: As with mfence, we may want to ease the availability of sfence/lfence
// to include any 64-bit target.
-def SFENCE : I<0xAE, MRM_F8, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
+def SFENCE : I<0xAE, MRM7X, (outs), (ins), "sfence", [(int_x86_sse_sfence)]>,
PS, Requires<[HasSSE1]>;
-def LFENCE : I<0xAE, MRM_E8, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
+def LFENCE : I<0xAE, MRM5X, (outs), (ins), "lfence", [(int_x86_sse2_lfence)]>,
PS, Requires<[HasSSE2]>;
-def MFENCE : I<0xAE, MRM_F0, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
+def MFENCE : I<0xAE, MRM6X, (outs), (ins), "mfence", [(int_x86_sse2_mfence)]>,
PS, Requires<[HasMFence]>;
} // SchedRW
@@ -3213,11 +3218,11 @@ def VSTMXCSR : VPSI<0xAE, MRM3m, (outs), (ins i32mem:$dst),
let mayLoad=1, hasSideEffects=1 in
def LDMXCSR : I<0xAE, MRM2m, (outs), (ins i32mem:$src),
"ldmxcsr\t$src", [(int_x86_sse_ldmxcsr addr:$src)]>,
- TB, Sched<[WriteLDMXCSR]>;
+ PS, Sched<[WriteLDMXCSR]>;
let mayStore=1, hasSideEffects=1 in
def STMXCSR : I<0xAE, MRM3m, (outs), (ins i32mem:$dst),
"stmxcsr\t$dst", [(int_x86_sse_stmxcsr addr:$dst)]>,
- TB, Sched<[WriteSTMXCSR]>;
+ PS, Sched<[WriteSTMXCSR]>;
//===---------------------------------------------------------------------===//
// SSE2 - Move Aligned/Unaligned Packed Integer Instructions
@@ -4185,8 +4190,6 @@ let Predicates = [UseAVX] in {
// AVX 128-bit movd/movq instructions write zeros in the high 128-bit part.
// These instructions also write zeros in the high part of a 256-bit register.
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (VMOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(VMOVDI2PDIrm addr:$src)>;
def : Pat<(v8i32 (X86vzload32 addr:$src)),
@@ -4199,8 +4202,6 @@ let Predicates = [UseSSE2] in {
def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector GR64:$src)))),
(MOV64toPQIrr GR64:$src)>;
- def : Pat<(v2i64 (X86vzmovl (v2i64 (scalar_to_vector (zextloadi64i32 addr:$src))))),
- (MOVDI2PDIrm addr:$src)>;
def : Pat<(v4i32 (X86vzload32 addr:$src)),
(MOVDI2PDIrm addr:$src)>;
}
@@ -4429,16 +4430,11 @@ defm MOVDDUP : sse3_replicate_dfp<"movddup", SchedWriteFShuffle>;
let Predicates = [HasAVX, NoVLX] in {
- def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
- (VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(VMOVDDUPrm addr:$src)>, Requires<[HasAVX]>;
}
let Predicates = [UseSSE3] in {
- // No need for aligned memory as this only loads 64-bits.
- def : Pat<(X86Movddup (v2f64 (simple_load addr:$src))),
- (MOVDDUPrm addr:$src)>;
def : Pat<(X86Movddup (v2f64 (X86vzload64 addr:$src))),
(MOVDDUPrm addr:$src)>;
}
@@ -5022,7 +5018,9 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
- def : Pat<(v8i32 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
+ def : Pat<(v8i32 (InVecOp (bc_v16i8 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#BDYrm) addr:$src)>;
def : Pat<(v4i64 (ExtOp (loadv4i32 addr:$src))),
@@ -5030,12 +5028,14 @@ multiclass SS41I_pmovx_avx2_patterns<string OpcPrefix, string ExtTy,
def : Pat<(v4i64 (InVecOp (bc_v16i8 (v4i32 (scalar_to_vector (loadi32 addr:$src)))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v16i8 (X86vzload64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v16i8 (v2i64 (X86vzload32 addr:$src))))),
(!cast<I>(OpcPrefix#BQYrm) addr:$src)>;
def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
- def : Pat<(v4i64 (InVecOp (v8i16 (X86vzload64 addr:$src)))),
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2f64 (scalar_to_vector (loadf64 addr:$src)))))),
+ (!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
+ def : Pat<(v4i64 (InVecOp (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(!cast<I>(OpcPrefix#WQYrm) addr:$src)>;
}
}
@@ -5499,7 +5499,7 @@ let ExeDomain = SSEPackedSingle in {
!strconcat(OpcodeStr,
"ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f32:$src2, timm:$src3))]>,
+ (OpNode VR128:$src1, (sse_load_f32 addr:$src2), timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedSingle, isCodeGenOnly = 1
@@ -5522,7 +5522,7 @@ let ExeDomain = SSEPackedDouble in {
!strconcat(OpcodeStr,
"sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}")),
[(set VR128:$dst,
- (OpNode VR128:$src1, sse_load_f64:$src2, timm:$src3))]>,
+ (OpNode VR128:$src1, (sse_load_f64 addr:$src2), timm:$src3))]>,
Sched<[sched.Folded, sched.ReadAfterFold]>;
} // ExeDomain = SSEPackedDouble, isCodeGenOnly = 1
}
@@ -6623,7 +6623,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
[!if(UsesXMM0,
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2, XMM0)),
(set VR128:$dst, (IntId VR128:$src1, VR128:$src2)))]>,
- T8, Sched<[sched]>;
+ T8PS, Sched<[sched]>;
def rm : I<Opc, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
@@ -6634,7 +6634,7 @@ multiclass SHAI_binop<bits<8> Opc, string OpcodeStr, Intrinsic IntId,
(set VR128:$dst, (IntId VR128:$src1,
(memop addr:$src2), XMM0)),
(set VR128:$dst, (IntId VR128:$src1,
- (memop addr:$src2))))]>, T8,
+ (memop addr:$src2))))]>, T8PS,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -6644,7 +6644,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
"sha1rnds4\t{$src3, $src2, $dst|$dst, $src2, $src3}",
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1, VR128:$src2,
- (i8 timm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TAPS,
Sched<[SchedWriteVecIMul.XMM]>;
def SHA1RNDS4rmi : Ii8<0xCC, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2, u8imm:$src3),
@@ -6652,7 +6652,7 @@ let Constraints = "$src1 = $dst", Predicates = [HasSHA] in {
[(set VR128:$dst,
(int_x86_sha1rnds4 VR128:$src1,
(memop addr:$src2),
- (i8 timm:$src3)))]>, TA,
+ (i8 timm:$src3)))]>, TAPS,
Sched<[SchedWriteVecIMul.XMM.Folded,
SchedWriteVecIMul.XMM.ReadAfterFold]>;
@@ -6687,7 +6687,7 @@ multiclass AESI_binop_rm_int<bits<8> opc, string OpcodeStr,
Intrinsic IntId, PatFrag ld_frag,
bit Is2Addr = 0, RegisterClass RC = VR128,
X86MemOperand MemOp = i128mem> {
- let AsmString = OpcodeStr##
+ let AsmString = OpcodeStr#
!if(Is2Addr, "\t{$src2, $dst|$dst, $src2}",
"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
def rr : AES8I<opc, MRMSrcReg, (outs RC:$dst),
@@ -6874,10 +6874,10 @@ defm VPCLMULQDQY : vpclmulqdq<VR256, i256mem, load,
multiclass vpclmulqdq_aliases_impl<string InstStr, RegisterClass RC,
X86MemOperand MemOp, string Hi, string Lo> {
- def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(InstStr # "rr") RC:$dst, RC:$src1, RC:$src2,
!add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
- def : InstAlias<"vpclmul"##Hi##Lo##"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
+ def : InstAlias<"vpclmul"#Hi#Lo#"dq\t{$src2, $src1, $dst|$dst, $src1, $src2}",
(!cast<Instruction>(InstStr # "rm") RC:$dst, RC:$src1, MemOp:$src2,
!add(!shl(!eq(Lo,"hq"),4),!eq(Hi,"hq"))), 0>;
}
@@ -7290,13 +7290,12 @@ multiclass f16c_ph2ps<RegisterClass RC, X86MemOperand x86memop,
X86FoldableSchedWrite sched> {
def rr : I<0x13, MRMSrcReg, (outs RC:$dst), (ins VR128:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (X86cvtph2ps VR128:$src))]>,
+ [(set RC:$dst, (X86any_cvtph2ps VR128:$src))]>,
T8PD, VEX, Sched<[sched]>;
let hasSideEffects = 0, mayLoad = 1 in
def rm : I<0x13, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src),
"vcvtph2ps\t{$src, $dst|$dst, $src}",
- [(set RC:$dst, (X86cvtph2ps (loadv8i16 addr:$src)))]>,
- T8PD, VEX, Sched<[sched.Folded]>;
+ []>, T8PD, VEX, Sched<[sched.Folded]>;
}
multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
@@ -7304,7 +7303,7 @@ multiclass f16c_ps2ph<RegisterClass RC, X86MemOperand x86memop,
def rr : Ii8<0x1D, MRMDestReg, (outs VR128:$dst),
(ins RC:$src1, i32u8imm:$src2),
"vcvtps2ph\t{$src2, $src1, $dst|$dst, $src1, $src2}",
- [(set VR128:$dst, (X86cvtps2ph RC:$src1, timm:$src2))]>,
+ [(set VR128:$dst, (X86any_cvtps2ph RC:$src1, timm:$src2))]>,
TAPD, VEX, Sched<[RR]>;
let hasSideEffects = 0, mayStore = 1 in
def mr : Ii8<0x1D, MRMDestMem, (outs),
@@ -7322,44 +7321,26 @@ let Predicates = [HasF16C, NoVLX] in {
WriteCvtPS2PHYSt>, VEX_L, SIMD_EXC;
// Pattern match vcvtph2ps of a scalar i64 load.
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
+ def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16 (v2i64 (X86vzload64 addr:$src))))),
(VCVTPH2PSrm addr:$src)>;
- def : Pat<(v4f32 (X86cvtph2ps (bc_v8i16
+ def : Pat<(v4f32 (X86any_cvtph2ps (bc_v8i16
(v2i64 (scalar_to_vector (loadi64 addr:$src)))))),
(VCVTPH2PSrm addr:$src)>;
+ def : Pat<(v8f32 (X86any_cvtph2ps (loadv8i16 addr:$src))),
+ (VCVTPH2PSYrm addr:$src)>;
def : Pat<(store (f64 (extractelt
- (bc_v2f64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
+ (bc_v2f64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
def : Pat<(store (i64 (extractelt
- (bc_v2i64 (v8i16 (X86cvtps2ph VR128:$src1, timm:$src2))),
+ (bc_v2i64 (v8i16 (X86any_cvtps2ph VR128:$src1, timm:$src2))),
(iPTR 0))), addr:$dst),
(VCVTPS2PHmr addr:$dst, VR128:$src1, timm:$src2)>;
- def : Pat<(store (v8i16 (X86cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
+ def : Pat<(store (v8i16 (X86any_cvtps2ph VR256:$src1, timm:$src2)), addr:$dst),
(VCVTPS2PHYmr addr:$dst, VR256:$src1, timm:$src2)>;
}
-// Patterns for matching conversions from float to half-float and vice versa.
-let Predicates = [HasF16C, NoVLX] in {
- // Use MXCSR.RC for rounding instead of explicitly specifying the default
- // rounding mode (Nearest-Even, encoded as 0). Both are equivalent in the
- // configurations we support (the default). However, falling back to MXCSR is
- // more consistent with other instructions, which are always controlled by it.
- // It's encoded as 0b100.
- def : Pat<(fp_to_f16 FR32:$src),
- (i16 (EXTRACT_SUBREG (VMOVPDI2DIrr (v8i16 (VCVTPS2PHrr
- (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4))), sub_16bit))>;
-
- def : Pat<(f16_to_fp GR16:$src),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
- (v4i32 (COPY_TO_REGCLASS (MOVSX32rr16 GR16:$src), VR128)))), FR32)) >;
-
- def : Pat<(f16_to_fp (i16 (fp_to_f16 FR32:$src))),
- (f32 (COPY_TO_REGCLASS (v4f32 (VCVTPH2PSrr
- (v8i16 (VCVTPS2PHrr (v4f32 (COPY_TO_REGCLASS FR32:$src, VR128)), 4)))), FR32)) >;
-}
-
//===----------------------------------------------------------------------===//
// AVX2 Instructions
//===----------------------------------------------------------------------===//
@@ -7415,7 +7396,7 @@ def : Pat<(X86Blendi (loadv2i64 addr:$src2), VR128:$src1, timm:$src3),
// For insertion into the zero index (low half) of a 256-bit vector, it is
// more efficient to generate a blend with immediate instead of an insert*128.
-// NOTE: We're using FP instructions here, but exeuction domain fixing should
+// NOTE: We're using FP instructions here, but execution domain fixing should
// take care of using integer instructions when profitable.
let Predicates = [HasAVX] in {
def : Pat<(insert_subvector (v8i32 VR256:$src1), (v4i32 VR128:$src2), (iPTR 0)),
@@ -7496,46 +7477,6 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, X86VBroadcastl
v2i64, v4i64, NoVLX>;
let Predicates = [HasAVX2, NoVLX] in {
- // 32-bit targets will fail to load a i64 directly but can use ZEXT_LOAD.
- def : Pat<(v2i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQrm addr:$src)>;
- def : Pat<(v4i64 (X86VBroadcast (v2i64 (X86vzload64 addr:$src)))),
- (VPBROADCASTQYrm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8/i16.
- def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDrm addr:$src)>;
- def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))),
- (VPBROADCASTDYrm addr:$src)>;
-}
-let Predicates = [HasAVX2, NoVLX_Or_NoBWI] in {
- // loadi16 is tricky to fold, because !isTypeDesirableForOp, justifiably.
- // This means we'll encounter truncated i32 loads; match that here.
- def : Pat<(v8i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (i16 (trunc (i32 (load addr:$src)))))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v8i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (extloadi16 addr:$src)))))),
- (VPBROADCASTWYrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast
- (i16 (trunc (i32 (zextloadi16 addr:$src)))))),
- (VPBROADCASTWYrm addr:$src)>;
-
- // FIXME this is to handle aligned extloads from i8.
- def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWrm addr:$src)>;
- def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))),
- (VPBROADCASTWYrm addr:$src)>;
-}
-
-let Predicates = [HasAVX2, NoVLX] in {
// Provide fallback in case the load node that is used in the patterns above
// is used by additional users, which prevents the pattern selection.
def : Pat<(v4f32 (X86VBroadcast FR32:$src)),
@@ -7597,10 +7538,6 @@ let Predicates = [HasAVX, NoVLX] in {
def : Pat<(v2f64 (X86VBroadcast v2f64:$src)),
(VMOVDDUPrr VR128:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (simple_load addr:$src)))),
- (VMOVDDUPrm addr:$src)>;
- def : Pat<(v2f64 (X86VBroadcast (v2f64 (X86vzload64 addr:$src)))),
- (VMOVDDUPrm addr:$src)>;
}
let Predicates = [HasAVX1Only] in {
@@ -7760,39 +7697,43 @@ let Predicates = [HasAVX2, NoVLX] in {
//
multiclass avx2_pmovmask<string OpcodeStr,
Intrinsic IntLd128, Intrinsic IntLd256,
- Intrinsic IntSt128, Intrinsic IntSt256> {
+ Intrinsic IntSt128, Intrinsic IntSt256,
+ X86SchedWriteMaskMove schedX,
+ X86SchedWriteMaskMove schedY> {
def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst),
(ins VR128:$src1, i128mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR128:$dst, (IntLd128 addr:$src2, VR128:$src1))]>,
- VEX_4V, Sched<[WriteVecMaskedLoad]>;
+ VEX_4V, Sched<[schedX.RM]>;
def Yrm : AVX28I<0x8c, MRMSrcMem, (outs VR256:$dst),
(ins VR256:$src1, i256mem:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(set VR256:$dst, (IntLd256 addr:$src2, VR256:$src1))]>,
- VEX_4V, VEX_L, Sched<[WriteVecMaskedLoadY]>;
+ VEX_4V, VEX_L, Sched<[schedY.RM]>;
def mr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i128mem:$dst, VR128:$src1, VR128:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt128 addr:$dst, VR128:$src1, VR128:$src2)]>,
- VEX_4V, Sched<[WriteVecMaskedStore]>;
+ VEX_4V, Sched<[schedX.MR]>;
def Ymr : AVX28I<0x8e, MRMDestMem, (outs),
(ins i256mem:$dst, VR256:$src1, VR256:$src2),
!strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"),
[(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>,
- VEX_4V, VEX_L, Sched<[WriteVecMaskedStoreY]>;
+ VEX_4V, VEX_L, Sched<[schedY.MR]>;
}
defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd",
int_x86_avx2_maskload_d,
int_x86_avx2_maskload_d_256,
int_x86_avx2_maskstore_d,
- int_x86_avx2_maskstore_d_256>;
+ int_x86_avx2_maskstore_d_256,
+ WriteVecMaskMove32, WriteVecMaskMove32Y>;
defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq",
int_x86_avx2_maskload_q,
int_x86_avx2_maskload_q_256,
int_x86_avx2_maskstore_q,
- int_x86_avx2_maskstore_q_256>, VEX_W;
+ int_x86_avx2_maskstore_q_256,
+ WriteVecMaskMove64, WriteVecMaskMove64Y>, VEX_W;
multiclass maskmov_lowering<string InstrStr, RegisterClass RC, ValueType VT,
ValueType MaskVT> {
@@ -7905,57 +7846,48 @@ let Predicates = [HasAVX2, NoVLX] in {
// FIXME: Improve scheduling of gather instructions.
multiclass avx2_gather<bits<8> opc, string OpcodeStr, ValueType VTx,
- ValueType VTy, PatFrag GatherNode128,
- PatFrag GatherNode256, RegisterClass RC256,
+ ValueType VTy, RegisterClass RC256,
X86MemOperand memop128, X86MemOperand memop256,
ValueType MTx = VTx, ValueType MTy = VTy> {
+let mayLoad = 1, hasSideEffects = 0 in {
def rm : AVX28I<opc, MRMSrcMem4VOp3, (outs VR128:$dst, VR128:$mask_wb),
(ins VR128:$src1, memop128:$src2, VR128:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- [(set (VTx VR128:$dst), (MTx VR128:$mask_wb),
- (GatherNode128 VR128:$src1, VR128:$mask,
- vectoraddr:$src2))]>,
- VEX, Sched<[WriteLoad]>;
+ []>, VEX, Sched<[WriteLoad]>;
def Yrm : AVX28I<opc, MRMSrcMem4VOp3, (outs RC256:$dst, RC256:$mask_wb),
(ins RC256:$src1, memop256:$src2, RC256:$mask),
!strconcat(OpcodeStr,
"\t{$mask, $src2, $dst|$dst, $src2, $mask}"),
- [(set (VTy RC256:$dst), (MTy RC256:$mask_wb),
- (GatherNode256 RC256:$src1, RC256:$mask,
- vectoraddr:$src2))]>,
- VEX, VEX_L, Sched<[WriteLoad]>;
+ []>, VEX, VEX_L, Sched<[WriteLoad]>;
+}
}
let Predicates = [HasAVX2] in {
let mayLoad = 1, hasSideEffects = 0, Constraints
= "@earlyclobber $dst,@earlyclobber $mask_wb, $src1 = $dst, $mask = $mask_wb"
in {
- defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64, mgatherv4i32,
- mgatherv4i32, VR256, vx128mem, vx256mem>, VEX_W;
- defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64, mgatherv2i64,
- mgatherv4i64, VR256, vx128mem, vy256mem>, VEX_W;
- defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32, mgatherv4i32,
- mgatherv8i32, VR256, vx128mem, vy256mem>;
- defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32, mgatherv2i64,
- mgatherv4i64, VR128, vx64mem, vy128mem>;
+ defm VPGATHERDQ : avx2_gather<0x90, "vpgatherdq", v2i64, v4i64,
+ VR256, vx128mem, vx256mem>, VEX_W;
+ defm VPGATHERQQ : avx2_gather<0x91, "vpgatherqq", v2i64, v4i64,
+ VR256, vx128mem, vy256mem>, VEX_W;
+ defm VPGATHERDD : avx2_gather<0x90, "vpgatherdd", v4i32, v8i32,
+ VR256, vx128mem, vy256mem>;
+ defm VPGATHERQD : avx2_gather<0x91, "vpgatherqd", v4i32, v4i32,
+ VR128, vx64mem, vy128mem>;
let ExeDomain = SSEPackedDouble in {
- defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64, mgatherv4i32,
- mgatherv4i32, VR256, vx128mem, vx256mem,
- v2i64, v4i64>, VEX_W;
- defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64, mgatherv2i64,
- mgatherv4i64, VR256, vx128mem, vy256mem,
- v2i64, v4i64>, VEX_W;
+ defm VGATHERDPD : avx2_gather<0x92, "vgatherdpd", v2f64, v4f64,
+ VR256, vx128mem, vx256mem, v2i64, v4i64>, VEX_W;
+ defm VGATHERQPD : avx2_gather<0x93, "vgatherqpd", v2f64, v4f64,
+ VR256, vx128mem, vy256mem, v2i64, v4i64>, VEX_W;
}
let ExeDomain = SSEPackedSingle in {
- defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32, mgatherv4i32,
- mgatherv8i32, VR256, vx128mem, vy256mem,
- v4i32, v8i32>;
- defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32, mgatherv2i64,
- mgatherv4i64, VR128, vx64mem, vy128mem,
- v4i32, v4i32>;
+ defm VGATHERDPS : avx2_gather<0x92, "vgatherdps", v4f32, v8f32,
+ VR256, vx128mem, vy256mem, v4i32, v8i32>;
+ defm VGATHERQPS : avx2_gather<0x93, "vgatherqps", v4f32, v4f32,
+ VR128, vx64mem, vy128mem, v4i32, v4i32>;
}
}
}
@@ -7969,8 +7901,8 @@ multiclass GF2P8MULB_rm<string OpcodeStr, ValueType OpVT,
X86MemOperand X86MemOp, bit Is2Addr = 0> {
let ExeDomain = SSEPackedInt,
AsmString = !if(Is2Addr,
- OpcodeStr##"\t{$src2, $dst|$dst, $src2}",
- OpcodeStr##"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
+ OpcodeStr#"\t{$src2, $dst|$dst, $src2}",
+ OpcodeStr#"\t{$src2, $src1, $dst|$dst, $src1, $src2}") in {
let isCommutable = 1 in
def rr : PDI<0xCF, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, RC:$src2), "",
[(set RC:$dst, (OpVT (X86GF2P8mulb RC:$src1, RC:$src2)))]>,
@@ -7987,8 +7919,8 @@ multiclass GF2P8AFFINE_rmi<bits<8> Op, string OpStr, ValueType OpVT,
SDNode OpNode, RegisterClass RC, PatFrag MemOpFrag,
X86MemOperand X86MemOp, bit Is2Addr = 0> {
let AsmString = !if(Is2Addr,
- OpStr##"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- OpStr##"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
+ OpStr#"\t{$src3, $src2, $dst|$dst, $src2, $src3}",
+ OpStr#"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}") in {
def rri : Ii8<Op, MRMSrcReg, (outs RC:$dst),
(ins RC:$src1, RC:$src2, u8imm:$src3), "",
[(set RC:$dst, (OpVT (OpNode RC:$src1, RC:$src2, timm:$src3)))],
@@ -8008,9 +7940,9 @@ multiclass GF2P8AFFINE_common<bits<8> Op, string OpStr, SDNode OpNode> {
defm NAME : GF2P8AFFINE_rmi<Op, OpStr, v16i8, OpNode,
VR128, load, i128mem, 1>;
let Predicates = [HasGFNI, HasAVX, NoVLX_Or_NoBWI] in {
- defm V##NAME : GF2P8AFFINE_rmi<Op, "v"##OpStr, v16i8, OpNode, VR128,
+ defm V#NAME : GF2P8AFFINE_rmi<Op, "v"#OpStr, v16i8, OpNode, VR128,
load, i128mem>, VEX_4V, VEX_W;
- defm V##NAME##Y : GF2P8AFFINE_rmi<Op, "v"##OpStr, v32i8, OpNode, VR256,
+ defm V#NAME#Y : GF2P8AFFINE_rmi<Op, "v"#OpStr, v32i8, OpNode, VR256,
load, i256mem>, VEX_4V, VEX_L, VEX_W;
}
}
diff --git a/llvm/lib/Target/X86/X86InstrShiftRotate.td b/llvm/lib/Target/X86/X86InstrShiftRotate.td
index 9d974b716dda..823ff78b9903 100644
--- a/llvm/lib/Target/X86/X86InstrShiftRotate.td
+++ b/llvm/lib/Target/X86/X86InstrShiftRotate.td
@@ -472,19 +472,19 @@ def ROL64rCL : RI<0xD3, MRM0r, (outs GR64:$dst), (ins GR64:$src1),
def ROL8ri : Ii8<0xC0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"rol{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotl GR8:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR8:$dst, (rotl GR8:$src1, (i8 imm:$src2)))]>;
def ROL16ri : Ii8<0xC1, MRM0r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"rol{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotl GR16:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR16:$dst, (rotl GR16:$src1, (i8 imm:$src2)))]>,
OpSize16;
def ROL32ri : Ii8<0xC1, MRM0r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"rol{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotl GR32:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR32:$dst, (rotl GR32:$src1, (i8 imm:$src2)))]>,
OpSize32;
def ROL64ri : RIi8<0xC1, MRM0r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"rol{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotl GR64:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR64:$dst, (rotl GR64:$src1, (i8 imm:$src2)))]>;
// Rotate by 1
def ROL8r1 : I<0xD0, MRM0r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -570,19 +570,19 @@ def ROR64rCL : RI<0xD3, MRM1r, (outs GR64:$dst), (ins GR64:$src1),
def ROR8ri : Ii8<0xC0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1, u8imm:$src2),
"ror{b}\t{$src2, $dst|$dst, $src2}",
- [(set GR8:$dst, (rotr GR8:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR8:$dst, (rotr GR8:$src1, (i8 imm:$src2)))]>;
def ROR16ri : Ii8<0xC1, MRM1r, (outs GR16:$dst), (ins GR16:$src1, u8imm:$src2),
"ror{w}\t{$src2, $dst|$dst, $src2}",
- [(set GR16:$dst, (rotr GR16:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR16:$dst, (rotr GR16:$src1, (i8 imm:$src2)))]>,
OpSize16;
def ROR32ri : Ii8<0xC1, MRM1r, (outs GR32:$dst), (ins GR32:$src1, u8imm:$src2),
"ror{l}\t{$src2, $dst|$dst, $src2}",
- [(set GR32:$dst, (rotr GR32:$src1, (i8 relocImm:$src2)))]>,
+ [(set GR32:$dst, (rotr GR32:$src1, (i8 imm:$src2)))]>,
OpSize32;
def ROR64ri : RIi8<0xC1, MRM1r, (outs GR64:$dst),
(ins GR64:$src1, u8imm:$src2),
"ror{q}\t{$src2, $dst|$dst, $src2}",
- [(set GR64:$dst, (rotr GR64:$src1, (i8 relocImm:$src2)))]>;
+ [(set GR64:$dst, (rotr GR64:$src1, (i8 imm:$src2)))]>;
// Rotate by 1
def ROR8r1 : I<0xD0, MRM1r, (outs GR8 :$dst), (ins GR8 :$src1),
@@ -661,32 +661,32 @@ let Uses = [CL], SchedRW = [WriteSHDrrcl] in {
def SHLD16rrCL : I<0xA5, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2, CL))]>,
+ [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2, CL))]>,
TB, OpSize16;
def SHRD16rrCL : I<0xAD, MRMDestReg, (outs GR16:$dst),
(ins GR16:$src1, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2, CL))]>,
+ [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1, CL))]>,
TB, OpSize16;
def SHLD32rrCL : I<0xA5, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2, CL))]>,
+ [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2, CL))]>,
TB, OpSize32;
def SHRD32rrCL : I<0xAD, MRMDestReg, (outs GR32:$dst),
(ins GR32:$src1, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2, CL))]>,
+ [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1, CL))]>,
TB, OpSize32;
def SHLD64rrCL : RI<0xA5, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2, CL))]>,
+ [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2, CL))]>,
TB;
def SHRD64rrCL : RI<0xAD, MRMDestReg, (outs GR64:$dst),
(ins GR64:$src1, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2, CL))]>,
+ [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1, CL))]>,
TB;
} // SchedRW
@@ -695,42 +695,42 @@ def SHLD16rri8 : Ii8<0xA4, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR16:$dst, (X86shld GR16:$src1, GR16:$src2,
+ [(set GR16:$dst, (X86fshl GR16:$src1, GR16:$src2,
(i8 imm:$src3)))]>,
TB, OpSize16;
def SHRD16rri8 : Ii8<0xAC, MRMDestReg,
(outs GR16:$dst),
(ins GR16:$src1, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR16:$dst, (X86shrd GR16:$src1, GR16:$src2,
+ [(set GR16:$dst, (X86fshr GR16:$src2, GR16:$src1,
(i8 imm:$src3)))]>,
TB, OpSize16;
def SHLD32rri8 : Ii8<0xA4, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR32:$dst, (X86shld GR32:$src1, GR32:$src2,
+ [(set GR32:$dst, (fshl GR32:$src1, GR32:$src2,
(i8 imm:$src3)))]>,
TB, OpSize32;
def SHRD32rri8 : Ii8<0xAC, MRMDestReg,
(outs GR32:$dst),
(ins GR32:$src1, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR32:$dst, (X86shrd GR32:$src1, GR32:$src2,
+ [(set GR32:$dst, (fshr GR32:$src2, GR32:$src1,
(i8 imm:$src3)))]>,
TB, OpSize32;
def SHLD64rri8 : RIi8<0xA4, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR64:$dst, (X86shld GR64:$src1, GR64:$src2,
+ [(set GR64:$dst, (fshl GR64:$src1, GR64:$src2,
(i8 imm:$src3)))]>,
TB;
def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
(outs GR64:$dst),
(ins GR64:$src1, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(set GR64:$dst, (X86shrd GR64:$src1, GR64:$src2,
+ [(set GR64:$dst, (fshr GR64:$src2, GR64:$src1,
(i8 imm:$src3)))]>,
TB;
} // SchedRW
@@ -739,70 +739,70 @@ def SHRD64rri8 : RIi8<0xAC, MRMDestReg,
let Uses = [CL], SchedRW = [WriteSHDmrcl] in {
def SHLD16mrCL : I<0xA5, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shld{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shld (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)]>, TB, OpSize16;
+ [(store (X86fshl (loadi16 addr:$dst), GR16:$src2, CL),
+ addr:$dst)]>, TB, OpSize16;
def SHRD16mrCL : I<0xAD, MRMDestMem, (outs), (ins i16mem:$dst, GR16:$src2),
"shrd{w}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shrd (loadi16 addr:$dst), GR16:$src2, CL),
- addr:$dst)]>, TB, OpSize16;
+ [(store (X86fshr GR16:$src2, (loadi16 addr:$dst), CL),
+ addr:$dst)]>, TB, OpSize16;
def SHLD32mrCL : I<0xA5, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shld{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shld (loadi32 addr:$dst), GR32:$src2, CL),
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2, CL),
addr:$dst)]>, TB, OpSize32;
def SHRD32mrCL : I<0xAD, MRMDestMem, (outs), (ins i32mem:$dst, GR32:$src2),
"shrd{l}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shrd (loadi32 addr:$dst), GR32:$src2, CL),
- addr:$dst)]>, TB, OpSize32;
+ [(store (fshr GR32:$src2, (loadi32 addr:$dst), CL),
+ addr:$dst)]>, TB, OpSize32;
def SHLD64mrCL : RI<0xA5, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shld{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shld (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)]>, TB;
+ [(store (fshl (loadi64 addr:$dst), GR64:$src2, CL),
+ addr:$dst)]>, TB;
def SHRD64mrCL : RI<0xAD, MRMDestMem, (outs), (ins i64mem:$dst, GR64:$src2),
"shrd{q}\t{%cl, $src2, $dst|$dst, $src2, cl}",
- [(store (X86shrd (loadi64 addr:$dst), GR64:$src2, CL),
- addr:$dst)]>, TB;
+ [(store (fshr GR64:$src2, (loadi64 addr:$dst), CL),
+ addr:$dst)]>, TB;
} // SchedRW
let SchedRW = [WriteSHDmri] in {
def SHLD16mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shld{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shld (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (X86fshl (loadi16 addr:$dst), GR16:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHRD16mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i16mem:$dst, GR16:$src2, u8imm:$src3),
"shrd{w}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shrd (loadi16 addr:$dst), GR16:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (X86fshr GR16:$src2, (loadi16 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize16;
def SHLD32mri8 : Ii8<0xA4, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shld{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shld (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshl (loadi32 addr:$dst), GR32:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHRD32mri8 : Ii8<0xAC, MRMDestMem,
(outs), (ins i32mem:$dst, GR32:$src2, u8imm:$src3),
"shrd{l}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shrd (loadi32 addr:$dst), GR32:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshr GR32:$src2, (loadi32 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
TB, OpSize32;
def SHLD64mri8 : RIi8<0xA4, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shld{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shld (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshl (loadi64 addr:$dst), GR64:$src2,
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
def SHRD64mri8 : RIi8<0xAC, MRMDestMem,
(outs), (ins i64mem:$dst, GR64:$src2, u8imm:$src3),
"shrd{q}\t{$src3, $src2, $dst|$dst, $src2, $src3}",
- [(store (X86shrd (loadi64 addr:$dst), GR64:$src2,
- (i8 imm:$src3)), addr:$dst)]>,
+ [(store (fshr GR64:$src2, (loadi64 addr:$dst),
+ (i8 imm:$src3)), addr:$dst)]>,
TB;
} // SchedRW
@@ -1013,3 +1013,21 @@ let Predicates = [HasBMI2] in {
(INSERT_SUBREG
(i64 (IMPLICIT_DEF)), GR8:$src2, sub_8bit))>;
}
+
+def : Pat<(rotl GR8:$src1, (i8 relocImm:$src2)),
+ (ROL8ri GR8:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR16:$src1, (i8 relocImm:$src2)),
+ (ROL16ri GR16:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR32:$src1, (i8 relocImm:$src2)),
+ (ROL32ri GR32:$src1, relocImm:$src2)>;
+def : Pat<(rotl GR64:$src1, (i8 relocImm:$src2)),
+ (ROL64ri GR64:$src1, relocImm:$src2)>;
+
+def : Pat<(rotr GR8:$src1, (i8 relocImm:$src2)),
+ (ROR8ri GR8:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR16:$src1, (i8 relocImm:$src2)),
+ (ROR16ri GR16:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR32:$src1, (i8 relocImm:$src2)),
+ (ROR32ri GR32:$src1, relocImm:$src2)>;
+def : Pat<(rotr GR64:$src1, (i8 relocImm:$src2)),
+ (ROR64ri GR64:$src1, relocImm:$src2)>;
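The operand swap in the new SHRD patterns above is a direct consequence of the funnel-shift semantics: fshr(Hi, Lo, Amt) treats its first operand as the high half of the concatenation and shifts right, whereas SHRD shifts $src1 right and fills from $src2 on the left, so it maps to fshr($src2, $src1, Amt); SHLD keeps its order because it maps to fshl($src1, $src2, Amt). The following standalone C++ sketch (illustration only, not part of the patch; the helper names are made up) checks that mapping for the 32-bit case:

// Standalone illustration only; these helpers are not LLVM APIs.
#include <cassert>
#include <cstdint>
#include <cstdio>

// LLVM-style funnel shifts on 32 bits, amount restricted to 1..31 so that
// the (32 - Amt) shifts below stay well defined.
static uint32_t fshl32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  assert(Amt > 0 && Amt < 32);
  return (Hi << Amt) | (Lo >> (32 - Amt));
}
static uint32_t fshr32(uint32_t Hi, uint32_t Lo, unsigned Amt) {
  assert(Amt > 0 && Amt < 32);
  return (Lo >> Amt) | (Hi << (32 - Amt));
}

// Reference semantics of the instructions: the destination is $src1 and the
// bits shifted in come from $src2.
static uint32_t shld32(uint32_t Src1, uint32_t Src2, unsigned Amt) {
  return (Src1 << Amt) | (Src2 >> (32 - Amt));
}
static uint32_t shrd32(uint32_t Src1, uint32_t Src2, unsigned Amt) {
  return (Src1 >> Amt) | (Src2 << (32 - Amt));
}

int main() {
  const uint32_t A = 0x12345678, B = 0x9ABCDEF0;
  for (unsigned Amt = 1; Amt < 32; ++Amt) {
    assert(shld32(A, B, Amt) == fshl32(A, B, Amt)); // SHLD == fshl($src1, $src2)
    assert(shrd32(A, B, Amt) == fshr32(B, A, Amt)); // SHRD == fshr($src2, $src1)
  }
  std::puts("SHLD/SHRD match fshl/fshr with the swapped-operand patterns");
  return 0;
}

The same reasoning applies to the memory forms above, where the loaded value plays the role of $src1.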
diff --git a/llvm/lib/Target/X86/X86InstrSystem.td b/llvm/lib/Target/X86/X86InstrSystem.td
index 7f41feb6c0d9..c23bc7ebbf70 100644
--- a/llvm/lib/Target/X86/X86InstrSystem.td
+++ b/llvm/lib/Target/X86/X86InstrSystem.td
@@ -23,7 +23,20 @@ def RDTSCP : I<0x01, MRM_F9, (outs), (ins), "rdtscp", []>, TB;
let mayLoad = 1, mayStore = 0, hasSideEffects = 1, isTrap = 1 in {
def TRAP : I<0x0B, RawFrm, (outs), (ins), "ud2", [(trap)]>, TB;
- def UD2B : I<0xB9, RawFrm, (outs), (ins), "ud2b", []>, TB;
+
+ def UD1Wm : I<0xB9, MRMSrcMem, (outs), (ins GR16:$src1, i16mem:$src2),
+ "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ def UD1Lm : I<0xB9, MRMSrcMem, (outs), (ins GR32:$src1, i32mem:$src2),
+ "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ def UD1Qm : RI<0xB9, MRMSrcMem, (outs), (ins GR64:$src1, i64mem:$src2),
+ "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
+
+ def UD1Wr : I<0xB9, MRMSrcReg, (outs), (ins GR16:$src1, GR16:$src2),
+ "ud1{w} {$src2, $src1|$src1, $src2}", []>, TB, OpSize16;
+ def UD1Lr : I<0xB9, MRMSrcReg, (outs), (ins GR32:$src1, GR32:$src2),
+ "ud1{l} {$src2, $src1|$src1, $src2}", []>, TB, OpSize32;
+ def UD1Qr : RI<0xB9, MRMSrcReg, (outs), (ins GR64:$src1, GR64:$src2),
+ "ud1{q} {$src2, $src1|$src1, $src2}", []>, TB;
}
def HLT : I<0xF4, RawFrm, (outs), (ins), "hlt", []>;
@@ -149,12 +162,12 @@ def MOV64cr : I<0x22, MRMSrcReg, (outs CONTROL_REG:$dst), (ins GR64:$src),
// Segment override instruction prefixes
let SchedRW = [WriteNop] in {
-def CS_PREFIX : I<0x2E, RawFrm, (outs), (ins), "cs", []>;
-def SS_PREFIX : I<0x36, RawFrm, (outs), (ins), "ss", []>;
-def DS_PREFIX : I<0x3E, RawFrm, (outs), (ins), "ds", []>;
-def ES_PREFIX : I<0x26, RawFrm, (outs), (ins), "es", []>;
-def FS_PREFIX : I<0x64, RawFrm, (outs), (ins), "fs", []>;
-def GS_PREFIX : I<0x65, RawFrm, (outs), (ins), "gs", []>;
+def CS_PREFIX : I<0x2E, PrefixByte, (outs), (ins), "cs", []>;
+def SS_PREFIX : I<0x36, PrefixByte, (outs), (ins), "ss", []>;
+def DS_PREFIX : I<0x3E, PrefixByte, (outs), (ins), "ds", []>;
+def ES_PREFIX : I<0x26, PrefixByte, (outs), (ins), "es", []>;
+def FS_PREFIX : I<0x64, PrefixByte, (outs), (ins), "fs", []>;
+def GS_PREFIX : I<0x65, PrefixByte, (outs), (ins), "gs", []>;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -512,12 +525,12 @@ let SchedRW = [WriteSystem] in {
let SchedRW = [WriteSystem] in {
let Predicates = [HasXSAVE] in {
let Defs = [EDX, EAX], Uses = [ECX] in
- def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, TB;
+ def XGETBV : I<0x01, MRM_D0, (outs), (ins), "xgetbv", []>, PS;
let Uses = [EDX, EAX, ECX] in
def XSETBV : I<0x01, MRM_D1, (outs), (ins),
"xsetbv",
- [(int_x86_xsetbv ECX, EDX, EAX)]>, TB;
+ [(int_x86_xsetbv ECX, EDX, EAX)]>, PS;
} // HasXSAVE
@@ -542,47 +555,47 @@ def XSAVEOPT64 : RI<0xAE, MRM6m, (outs), (ins opaquemem:$dst),
[(int_x86_xsaveopt64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEOPT, In64BitMode]>;
def XSAVEC : I<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec\t$dst",
- [(int_x86_xsavec addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC]>;
+ [(int_x86_xsavec addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC]>;
def XSAVEC64 : RI<0xC7, MRM4m, (outs), (ins opaquemem:$dst),
"xsavec64\t$dst",
- [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVEC, In64BitMode]>;
+ [(int_x86_xsavec64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVEC, In64BitMode]>;
def XSAVES : I<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves\t$dst",
- [(int_x86_xsaves addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
+ [(int_x86_xsaves addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
def XSAVES64 : RI<0xC7, MRM5m, (outs), (ins opaquemem:$dst),
"xsaves64\t$dst",
- [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVE, In64BitMode]>;
+ [(int_x86_xsaves64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVE, In64BitMode]>;
def XRSTORS : I<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors\t$dst",
- [(int_x86_xrstors addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES]>;
+ [(int_x86_xrstors addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES]>;
def XRSTORS64 : RI<0xC7, MRM3m, (outs), (ins opaquemem:$dst),
"xrstors64\t$dst",
- [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, TB, Requires<[HasXSAVES, In64BitMode]>;
+ [(int_x86_xrstors64 addr:$dst, EDX, EAX)]>, PS, Requires<[HasXSAVES, In64BitMode]>;
} // Uses
} // SchedRW
//===----------------------------------------------------------------------===//
// VIA PadLock crypto instructions
let Defs = [RAX, RDI], Uses = [RDX, RDI], SchedRW = [WriteSystem] in
- def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB;
+ def XSTORE : I<0xa7, MRM_C0, (outs), (ins), "xstore", []>, TB, REP;
def : InstAlias<"xstorerng", (XSTORE)>;
let SchedRW = [WriteSystem] in {
let Defs = [RSI, RDI], Uses = [RBX, RDX, RSI, RDI] in {
- def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB;
- def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB;
- def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB;
- def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB;
- def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB;
+ def XCRYPTECB : I<0xa7, MRM_C8, (outs), (ins), "xcryptecb", []>, TB, REP;
+ def XCRYPTCBC : I<0xa7, MRM_D0, (outs), (ins), "xcryptcbc", []>, TB, REP;
+ def XCRYPTCTR : I<0xa7, MRM_D8, (outs), (ins), "xcryptctr", []>, TB, REP;
+ def XCRYPTCFB : I<0xa7, MRM_E0, (outs), (ins), "xcryptcfb", []>, TB, REP;
+ def XCRYPTOFB : I<0xa7, MRM_E8, (outs), (ins), "xcryptofb", []>, TB, REP;
}
let Defs = [RAX, RSI, RDI], Uses = [RAX, RSI, RDI] in {
- def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB;
- def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB;
+ def XSHA1 : I<0xa6, MRM_C8, (outs), (ins), "xsha1", []>, TB, REP;
+ def XSHA256 : I<0xa6, MRM_D0, (outs), (ins), "xsha256", []>, TB, REP;
}
let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
- def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB;
+ def MONTMUL : I<0xa6, MRM_C0, (outs), (ins), "montmul", []>, TB, REP;
} // SchedRW
//==-----------------------------------------------------------------------===//
@@ -590,10 +603,10 @@ let Defs = [RAX, RDX, RSI], Uses = [RAX, RSI] in
let SchedRW = [WriteSystem] in {
let Defs = [EAX, EDX], Uses = [ECX] in
def RDPKRUr : I<0x01, MRM_EE, (outs), (ins), "rdpkru",
- [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, TB;
+ [(set EAX, (X86rdpkru ECX)), (implicit EDX)]>, PS;
let Uses = [EAX, ECX, EDX] in
def WRPKRUr : I<0x01, MRM_EF, (outs), (ins), "wrpkru",
- [(X86wrpkru EAX, EDX, ECX)]>, TB;
+ [(X86wrpkru EAX, EDX, ECX)]>, PS;
} // SchedRW
//===----------------------------------------------------------------------===//
@@ -653,15 +666,15 @@ let Predicates = [In64BitMode, HasINVPCID] in {
//===----------------------------------------------------------------------===//
// SMAP Instruction
let Defs = [EFLAGS], SchedRW = [WriteSystem] in {
- def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, TB;
- def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, TB;
+ def CLAC : I<0x01, MRM_CA, (outs), (ins), "clac", []>, PS;
+ def STAC : I<0x01, MRM_CB, (outs), (ins), "stac", []>, PS;
}
//===----------------------------------------------------------------------===//
// SMX Instruction
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX] in {
- def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, TB;
+ def GETSEC : I<0x37, RawFrm, (outs), (ins), "getsec", []>, PS;
} // Uses, Defs
} // SchedRW
@@ -729,6 +742,6 @@ def PTWRITE64r : RI<0xAE, MRM4r, (outs), (ins GR64:$dst),
let SchedRW = [WriteSystem] in {
let Uses = [RAX, RBX, RCX, RDX], Defs = [RAX, RBX, RCX, RDX, EFLAGS] in
- def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, TB,
+ def PCONFIG : I<0x01, MRM_C5, (outs), (ins), "pconfig", []>, PS,
Requires<[HasPCONFIG]>;
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrTSX.td b/llvm/lib/Target/X86/X86InstrTSX.td
index 41b839425ccd..28563eeb4484 100644
--- a/llvm/lib/Target/X86/X86InstrTSX.td
+++ b/llvm/lib/Target/X86/X86InstrTSX.td
@@ -37,11 +37,11 @@ def XABORT_DEF : I<0, Pseudo, (outs), (ins), "# XABORT DEF", []>;
}
def XEND : I<0x01, MRM_D5, (outs), (ins),
- "xend", [(int_x86_xend)]>, TB, Requires<[HasRTM]>;
+ "xend", [(int_x86_xend)]>, PS, Requires<[HasRTM]>;
let Defs = [EFLAGS] in
def XTEST : I<0x01, MRM_D6, (outs), (ins),
- "xtest", [(set EFLAGS, (X86xtest))]>, TB, Requires<[HasRTM]>;
+ "xtest", [(set EFLAGS, (X86xtest))]>, PS, Requires<[HasRTM]>;
def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
"xabort\t$imm",
@@ -52,8 +52,8 @@ def XABORT : Ii8<0xc6, MRM_F8, (outs), (ins i8imm:$imm),
let SchedRW = [WriteSystem] in {
let isAsmParserOnly = 1 in {
-def XACQUIRE_PREFIX : I<0xF2, RawFrm, (outs), (ins), "xacquire", []>;
-def XRELEASE_PREFIX : I<0xF3, RawFrm, (outs), (ins), "xrelease", []>;
+def XACQUIRE_PREFIX : I<0xF2, PrefixByte, (outs), (ins), "xacquire", []>;
+def XRELEASE_PREFIX : I<0xF3, PrefixByte, (outs), (ins), "xrelease", []>;
}
} // SchedRW
diff --git a/llvm/lib/Target/X86/X86InstrVMX.td b/llvm/lib/Target/X86/X86InstrVMX.td
index 37bc4ce2e053..d204a33358ea 100644
--- a/llvm/lib/Target/X86/X86InstrVMX.td
+++ b/llvm/lib/Target/X86/X86InstrVMX.td
@@ -37,7 +37,7 @@ def VMCLEARm : I<0xC7, MRM6m, (outs), (ins i64mem:$vmcs),
"vmclear\t$vmcs", []>, PD;
// OF 01 D4
-def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, TB;
+def VMFUNC : I<0x01, MRM_D4, (outs), (ins), "vmfunc", []>, PS;
// 0F 01 C2
def VMLAUNCH : I<0x01, MRM_C2, (outs), (ins), "vmlaunch", []>, TB;
diff --git a/llvm/lib/Target/X86/X86InstrXOP.td b/llvm/lib/Target/X86/X86InstrXOP.td
index 229af366d940..a5976b7d2d74 100644
--- a/llvm/lib/Target/X86/X86InstrXOP.td
+++ b/llvm/lib/Target/X86/X86InstrXOP.td
@@ -40,14 +40,14 @@ let ExeDomain = SSEPackedInt in {
// Scalar load 2 addr operand instructions
multiclass xop2opsld<bits<8> opc, string OpcodeStr, Intrinsic Int,
- Operand memop, ComplexPattern mem_cpat,
+ Operand memop, PatFrags mem_frags,
X86FoldableSchedWrite sched> {
def rr : IXOP<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
[(set VR128:$dst, (Int VR128:$src))]>, XOP, Sched<[sched]>;
def rm : IXOP<opc, MRMSrcMem, (outs VR128:$dst), (ins memop:$src),
!strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"),
- [(set VR128:$dst, (Int mem_cpat:$src))]>, XOP,
+ [(set VR128:$dst, (Int (mem_frags addr:$src)))]>, XOP,
Sched<[sched.Folded, sched.ReadAfterFold]>;
}
@@ -335,13 +335,13 @@ multiclass xop4op_int<bits<8> opc, string OpcodeStr, RegisterClass RC,
[(set RC:$dst, (VT (or (and RC:$src3, RC:$src1),
(X86andnp RC:$src3, RC:$src2))))]>, XOP_4V,
Sched<[sched]>;
- // FIXME: This pattern can't match.
+ // FIXME: We can't write a pattern for this in tablegen.
+ let hasSideEffects = 0, mayLoad = 1 in
def rrm : IXOPi8Reg<opc, MRMSrcMemOp4, (outs RC:$dst),
(ins RC:$src1, RC:$src2, x86memop:$src3),
!strconcat(OpcodeStr,
"\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"),
- [(set RC:$dst, (VT (or (and (load addr:$src3), RC:$src1),
- (X86andnp (load addr:$src3), RC:$src2))))]>,
+ []>,
XOP_4V, VEX_W, Sched<[sched.Folded, sched.ReadAfterFold, sched.ReadAfterFold]>;
def rmr : IXOPi8Reg<opc, MRMSrcMem, (outs RC:$dst),
(ins RC:$src1, x86memop:$src2, RC:$src3),
@@ -383,13 +383,13 @@ let Predicates = [HasXOP] in {
(VPCMOVrrr VR128:$src1, VR128:$src2, VR128:$src3)>;
def : Pat<(or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, (bc_v16i8 (loadv2i64 addr:$src2)))),
+ (X86andnp VR128:$src3, (loadv16i8 addr:$src2))),
(VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
def : Pat<(or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, (bc_v8i16 (loadv2i64 addr:$src2)))),
+ (X86andnp VR128:$src3, (loadv8i16 addr:$src2))),
(VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
def : Pat<(or (and VR128:$src3, VR128:$src1),
- (X86andnp VR128:$src3, (bc_v4i32 (loadv2i64 addr:$src2)))),
+ (X86andnp VR128:$src3, (loadv4i32 addr:$src2))),
(VPCMOVrmr VR128:$src1, addr:$src2, VR128:$src3)>;
def : Pat<(v32i8 (or (and VR256:$src3, VR256:$src1),
@@ -403,13 +403,13 @@ let Predicates = [HasXOP] in {
(VPCMOVYrrr VR256:$src1, VR256:$src2, VR256:$src3)>;
def : Pat<(or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, (bc_v32i8 (loadv4i64 addr:$src2)))),
+ (X86andnp VR256:$src3, (loadv32i8 addr:$src2))),
(VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
def : Pat<(or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, (bc_v16i16 (loadv4i64 addr:$src2)))),
+ (X86andnp VR256:$src3, (loadv16i16 addr:$src2))),
(VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
def : Pat<(or (and VR256:$src3, VR256:$src1),
- (X86andnp VR256:$src3, (bc_v8i32 (loadv4i64 addr:$src2)))),
+ (X86andnp VR256:$src3, (loadv8i32 addr:$src2))),
(VPCMOVYrmr VR256:$src1, addr:$src2, VR256:$src3)>;
}
diff --git a/llvm/lib/Target/X86/X86InstructionSelector.cpp b/llvm/lib/Target/X86/X86InstructionSelector.cpp
index 3f9d626ff912..60fb4d2ef4bf 100644
--- a/llvm/lib/Target/X86/X86InstructionSelector.cpp
+++ b/llvm/lib/Target/X86/X86InstructionSelector.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "MCTargetDesc/X86BaseInfo.h"
+#include "X86.h"
#include "X86InstrBuilder.h"
#include "X86InstrInfo.h"
#include "X86RegisterBankInfo.h"
@@ -71,7 +72,7 @@ private:
// TODO: remove after supported by Tablegen-erated instruction selection.
unsigned getLoadStoreOp(const LLT &Ty, const RegisterBank &RB, unsigned Opc,
- uint64_t Alignment) const;
+ Align Alignment) const;
bool selectLoadStoreOp(MachineInstr &I, MachineRegisterInfo &MRI,
MachineFunction &MF) const;
@@ -394,7 +395,7 @@ bool X86InstructionSelector::select(MachineInstr &I) {
unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
const RegisterBank &RB,
unsigned Opc,
- uint64_t Alignment) const {
+ Align Alignment) const {
bool Isload = (Opc == TargetOpcode::G_LOAD);
bool HasAVX = STI.hasAVX();
bool HasAVX512 = STI.hasAVX512();
@@ -427,7 +428,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
HasAVX ? X86::VMOVSDmr :
X86::MOVSDmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 128) {
- if (Alignment >= 16)
+ if (Alignment >= Align(16))
return Isload ? (HasVLX ? X86::VMOVAPSZ128rm
: HasAVX512
? X86::VMOVAPSZ128rm_NOVLX
@@ -446,7 +447,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
? X86::VMOVUPSZ128mr_NOVLX
: HasAVX ? X86::VMOVUPSmr : X86::MOVUPSmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 256) {
- if (Alignment >= 32)
+ if (Alignment >= Align(32))
return Isload ? (HasVLX ? X86::VMOVAPSZ256rm
: HasAVX512 ? X86::VMOVAPSZ256rm_NOVLX
: X86::VMOVAPSYrm)
@@ -461,7 +462,7 @@ unsigned X86InstructionSelector::getLoadStoreOp(const LLT &Ty,
: HasAVX512 ? X86::VMOVUPSZ256mr_NOVLX
: X86::VMOVUPSYmr);
} else if (Ty.isVector() && Ty.getSizeInBits() == 512) {
- if (Alignment >= 64)
+ if (Alignment >= Align(64))
return Isload ? X86::VMOVAPSZrm : X86::VMOVAPSZmr;
else
return Isload ? X86::VMOVUPSZrm : X86::VMOVUPSZmr;
@@ -520,13 +521,13 @@ bool X86InstructionSelector::selectLoadStoreOp(MachineInstr &I,
LLVM_DEBUG(dbgs() << "Atomic ordering not supported yet\n");
return false;
}
- if (MemOp.getAlignment() < Ty.getSizeInBits()/8) {
+ if (MemOp.getAlign() < Ty.getSizeInBits() / 8) {
LLVM_DEBUG(dbgs() << "Unaligned atomics not supported yet\n");
return false;
}
}
- unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlignment());
+ unsigned NewOpc = getLoadStoreOp(Ty, RB, Opc, MemOp.getAlign());
if (NewOpc == Opc)
return false;
@@ -1435,14 +1436,15 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
const Register DstReg = I.getOperand(0).getReg();
const LLT DstTy = MRI.getType(DstReg);
const RegisterBank &RegBank = *RBI.getRegBank(DstReg, MRI, TRI);
- unsigned Align = DstTy.getSizeInBits();
+ Align Alignment = Align(DstTy.getSizeInBytes());
const DebugLoc &DbgLoc = I.getDebugLoc();
- unsigned Opc = getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Align);
+ unsigned Opc =
+ getLoadStoreOp(DstTy, RegBank, TargetOpcode::G_LOAD, Alignment);
// Create the load from the constant pool.
const ConstantFP *CFP = I.getOperand(1).getFPImm();
- unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Align);
+ unsigned CPI = MF.getConstantPool()->getConstantPoolIndex(CFP, Alignment);
MachineInstr *LoadInst = nullptr;
unsigned char OpFlag = STI.classifyLocalReference(nullptr);
@@ -1456,7 +1458,7 @@ bool X86InstructionSelector::materializeFP(MachineInstr &I,
MachineMemOperand *MMO = MF.getMachineMemOperand(
MachinePointerInfo::getConstantPool(MF), MachineMemOperand::MOLoad,
- MF.getDataLayout().getPointerSize(), Align);
+ MF.getDataLayout().getPointerSize(), Alignment);
LoadInst =
addDirectMem(BuildMI(*I.getParent(), I, DbgLoc, TII.get(Opc), DstReg),
diff --git a/llvm/lib/Target/X86/X86InterleavedAccess.cpp b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
index 8f74a8fe041d..a19e12766e10 100644
--- a/llvm/lib/Target/X86/X86InterleavedAccess.cpp
+++ b/llvm/lib/Target/X86/X86InterleavedAccess.cpp
@@ -69,7 +69,7 @@ class X86InterleavedAccessGroup {
/// Breaks down a vector \p 'Inst' of N elements into \p NumSubVectors
/// sub vectors of type \p T. Returns the sub-vectors in \p DecomposedVectors.
- void decompose(Instruction *Inst, unsigned NumSubVectors, VectorType *T,
+ void decompose(Instruction *Inst, unsigned NumSubVectors, FixedVectorType *T,
SmallVectorImpl<Instruction *> &DecomposedVectors);
/// Performs matrix transposition on a 4x4 matrix \p InputVectors and
@@ -127,7 +127,7 @@ public:
bool X86InterleavedAccessGroup::isSupported() const {
VectorType *ShuffleVecTy = Shuffles[0]->getType();
- Type *ShuffleEltTy = ShuffleVecTy->getVectorElementType();
+ Type *ShuffleEltTy = ShuffleVecTy->getElementType();
unsigned ShuffleElemSize = DL.getTypeSizeInBits(ShuffleEltTy);
unsigned WideInstSize;
@@ -150,7 +150,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
// We support shuffles that represent stride 4 for byte type with size of
// WideInstSize.
if (ShuffleElemSize == 64 && WideInstSize == 1024 && Factor == 4)
- return true;
+ return true;
if (ShuffleElemSize == 8 && isa<StoreInst>(Inst) && Factor == 4 &&
(WideInstSize == 256 || WideInstSize == 512 || WideInstSize == 1024 ||
@@ -165,7 +165,7 @@ bool X86InterleavedAccessGroup::isSupported() const {
}
void X86InterleavedAccessGroup::decompose(
- Instruction *VecInst, unsigned NumSubVectors, VectorType *SubVecTy,
+ Instruction *VecInst, unsigned NumSubVectors, FixedVectorType *SubVecTy,
SmallVectorImpl<Instruction *> &DecomposedVectors) {
assert((isa<LoadInst>(VecInst) || isa<ShuffleVectorInst>(VecInst)) &&
"Expected Load or Shuffle");
@@ -186,8 +186,8 @@ void X86InterleavedAccessGroup::decompose(
DecomposedVectors.push_back(
cast<ShuffleVectorInst>(Builder.CreateShuffleVector(
Op0, Op1,
- createSequentialMask(Builder, Indices[i],
- SubVecTy->getVectorNumElements(), 0))));
+ createSequentialMask(Indices[i], SubVecTy->getNumElements(),
+ 0))));
return;
}
@@ -201,7 +201,7 @@ void X86InterleavedAccessGroup::decompose(
// [0,1...,VF/2-1,VF/2+VF,VF/2+VF+1,...,2VF-1]
unsigned VecLength = DL.getTypeSizeInBits(VecWidth);
if (VecLength == 768 || VecLength == 1536) {
- VecBaseTy = VectorType::get(Type::getInt8Ty(LI->getContext()), 16);
+ VecBaseTy = FixedVectorType::get(Type::getInt8Ty(LI->getContext()), 16);
VecBasePtrTy = VecBaseTy->getPointerTo(LI->getPointerAddressSpace());
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
NumLoads = NumSubVectors * (VecLength / 384);
@@ -211,13 +211,20 @@ void X86InterleavedAccessGroup::decompose(
VecBasePtr = Builder.CreateBitCast(LI->getPointerOperand(), VecBasePtrTy);
}
// Generate N loads of T type.
+ assert(VecBaseTy->getPrimitiveSizeInBits().isByteSized() &&
+ "VecBaseTy's size must be a multiple of 8");
+ const Align FirstAlignment = LI->getAlign();
+ const Align SubsequentAlignment = commonAlignment(
+ FirstAlignment, VecBaseTy->getPrimitiveSizeInBits().getFixedSize() / 8);
+ Align Alignment = FirstAlignment;
for (unsigned i = 0; i < NumLoads; i++) {
// TODO: Support inbounds GEP.
Value *NewBasePtr =
Builder.CreateGEP(VecBaseTy, VecBasePtr, Builder.getInt32(i));
Instruction *NewLoad =
- Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, LI->getAlignment());
+ Builder.CreateAlignedLoad(VecBaseTy, NewBasePtr, Alignment);
DecomposedVectors.push_back(NewLoad);
+ Alignment = SubsequentAlignment;
}
}
@@ -229,11 +236,11 @@ static MVT scaleVectorType(MVT VT) {
VT.getVectorNumElements() / 2);
}
-static uint32_t Concat[] = {
- 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
- 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
- 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
- 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63 };
+static constexpr int Concat[] = {
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63};
// genShuffleBland - Creates a shuffle according to two vectors. This function
// only works on instructions with lanes inside 256-bit registers. According to
@@ -251,11 +258,11 @@ static uint32_t Concat[] = {
// By computing the shuffle on a sequence of 16 elements (one lane) and adding
// the correct offset, we create a vpsuffed + blend sequence between two
// shuffles.
-static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
- SmallVectorImpl<uint32_t> &Out, int LowOffset,
- int HighOffset) {
+static void genShuffleBland(MVT VT, ArrayRef<int> Mask,
+ SmallVectorImpl<int> &Out, int LowOffset,
+ int HighOffset) {
assert(VT.getSizeInBits() >= 256 &&
- "This function doesn't accept width smaller then 256");
+ "This function doesn't accept width smaller then 256");
unsigned NumOfElm = VT.getVectorNumElements();
for (unsigned i = 0; i < Mask.size(); i++)
Out.push_back(Mask[i] + LowOffset);
@@ -282,36 +289,35 @@ static void genShuffleBland(MVT VT, ArrayRef<uint32_t> Mask,
// Invec[2] - |2|5|8|11| TransposedMatrix[2] - |8|9|10|11|
static void reorderSubVector(MVT VT, SmallVectorImpl<Value *> &TransposedMatrix,
- ArrayRef<Value *> Vec, ArrayRef<uint32_t> VPShuf,
- unsigned VecElems, unsigned Stride,
- IRBuilder<> Builder) {
+ ArrayRef<Value *> Vec, ArrayRef<int> VPShuf,
+ unsigned VecElems, unsigned Stride,
+ IRBuilder<> &Builder) {
if (VecElems == 16) {
for (unsigned i = 0; i < Stride; i++)
TransposedMatrix[i] = Builder.CreateShuffleVector(
- Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
+ Vec[i], UndefValue::get(Vec[i]->getType()), VPShuf);
return;
}
- SmallVector<uint32_t, 32> OptimizeShuf;
+ SmallVector<int, 32> OptimizeShuf;
Value *Temp[8];
for (unsigned i = 0; i < (VecElems / 16) * Stride; i += 2) {
genShuffleBland(VT, VPShuf, OptimizeShuf, (i / Stride) * 16,
- (i + 1) / Stride * 16);
+ (i + 1) / Stride * 16);
Temp[i / 2] = Builder.CreateShuffleVector(
- Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
+ Vec[i % Stride], Vec[(i + 1) % Stride], OptimizeShuf);
OptimizeShuf.clear();
}
if (VecElems == 32) {
std::copy(Temp, Temp + Stride, TransposedMatrix.begin());
return;
- }
- else
+ } else
for (unsigned i = 0; i < Stride; i++)
TransposedMatrix[i] =
- Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
+ Builder.CreateShuffleVector(Temp[2 * i], Temp[2 * i + 1], Concat);
}
void X86InterleavedAccessGroup::interleave8bitStride4VF8(
@@ -325,19 +331,19 @@ void X86InterleavedAccessGroup::interleave8bitStride4VF8(
MVT VT = MVT::v8i16;
TransposedMatrix.resize(2);
- SmallVector<uint32_t, 16> MaskLow;
- SmallVector<uint32_t, 32> MaskLowTemp1, MaskLowWord;
- SmallVector<uint32_t, 32> MaskHighTemp1, MaskHighWord;
+ SmallVector<int, 16> MaskLow;
+ SmallVector<int, 32> MaskLowTemp1, MaskLowWord;
+ SmallVector<int, 32> MaskHighTemp1, MaskHighWord;
for (unsigned i = 0; i < 8; ++i) {
MaskLow.push_back(i);
MaskLow.push_back(i + 8);
}
- createUnpackShuffleMask<uint32_t>(VT, MaskLowTemp1, true, false);
- createUnpackShuffleMask<uint32_t>(VT, MaskHighTemp1, false, false);
- scaleShuffleMask<uint32_t>(2, MaskHighTemp1, MaskHighWord);
- scaleShuffleMask<uint32_t>(2, MaskLowTemp1, MaskLowWord);
+ createUnpackShuffleMask(VT, MaskLowTemp1, true, false);
+ createUnpackShuffleMask(VT, MaskHighTemp1, false, false);
+ narrowShuffleMaskElts(2, MaskHighTemp1, MaskHighWord);
+ narrowShuffleMaskElts(2, MaskLowTemp1, MaskLowWord);
// IntrVec1Low = c0 m0 c1 m1 c2 m2 c3 m3 c4 m4 c5 m5 c6 m6 c7 m7
// IntrVec2Low = y0 k0 y1 k1 y2 k2 y3 k3 y4 k4 y5 k5 y6 k6 y7 k7
Value *IntrVec1Low =
@@ -367,25 +373,25 @@ void X86InterleavedAccessGroup::interleave8bitStride4(
MVT HalfVT = scaleVectorType(VT);
TransposedMatrix.resize(4);
- SmallVector<uint32_t, 32> MaskHigh;
- SmallVector<uint32_t, 32> MaskLow;
- SmallVector<uint32_t, 32> LowHighMask[2];
- SmallVector<uint32_t, 32> MaskHighTemp;
- SmallVector<uint32_t, 32> MaskLowTemp;
+ SmallVector<int, 32> MaskHigh;
+ SmallVector<int, 32> MaskLow;
+ SmallVector<int, 32> LowHighMask[2];
+ SmallVector<int, 32> MaskHighTemp;
+ SmallVector<int, 32> MaskLowTemp;
// MaskHighTemp and MaskLowTemp built in the vpunpckhbw and vpunpcklbw X86
// shuffle pattern.
- createUnpackShuffleMask<uint32_t>(VT, MaskLow, true, false);
- createUnpackShuffleMask<uint32_t>(VT, MaskHigh, false, false);
+ createUnpackShuffleMask(VT, MaskLow, true, false);
+ createUnpackShuffleMask(VT, MaskHigh, false, false);
// MaskHighTemp1 and MaskLowTemp1 built in the vpunpckhdw and vpunpckldw X86
// shuffle pattern.
- createUnpackShuffleMask<uint32_t>(HalfVT, MaskLowTemp, true, false);
- createUnpackShuffleMask<uint32_t>(HalfVT, MaskHighTemp, false, false);
- scaleShuffleMask<uint32_t>(2, MaskLowTemp, LowHighMask[0]);
- scaleShuffleMask<uint32_t>(2, MaskHighTemp, LowHighMask[1]);
+ createUnpackShuffleMask(HalfVT, MaskLowTemp, true, false);
+ createUnpackShuffleMask(HalfVT, MaskHighTemp, false, false);
+ narrowShuffleMaskElts(2, MaskLowTemp, LowHighMask[0]);
+ narrowShuffleMaskElts(2, MaskHighTemp, LowHighMask[1]);
// IntrVec1Low = c0 m0 c1 m1 ... c7 m7 | c16 m16 c17 m17 ... c23 m23
// IntrVec1High = c8 m8 c9 m9 ... c15 m15 | c24 m24 c25 m25 ... c31 m31
@@ -433,7 +439,7 @@ void X86InterleavedAccessGroup::interleave8bitStride4(
// For example shuffle pattern for VF 16 register size 256 -> lanes = 2
// {<[0|3|6|1|4|7|2|5]-[8|11|14|9|12|15|10|13]>}
static void createShuffleStride(MVT VT, int Stride,
- SmallVectorImpl<uint32_t> &Mask) {
+ SmallVectorImpl<int> &Mask) {
int VectorSize = VT.getSizeInBits();
int VF = VT.getVectorNumElements();
int LaneCount = std::max(VectorSize / 128, 1);
@@ -446,7 +452,7 @@ static void createShuffleStride(MVT VT, int Stride,
// inside mask a shuffleMask. A mask contains exactly 3 groups, where
// each group is a monotonically increasing sequence with stride 3.
// For example shuffleMask {0,3,6,1,4,7,2,5} => {3,3,2}
-static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
+static void setGroupSize(MVT VT, SmallVectorImpl<int> &SizeInfo) {
int VectorSize = VT.getSizeInBits();
int VF = VT.getVectorNumElements() / std::max(VectorSize / 128, 1);
for (int i = 0, FirstGroupElement = 0; i < 3; i++) {
@@ -470,7 +476,7 @@ static void setGroupSize(MVT VT, SmallVectorImpl<uint32_t> &SizeInfo) {
// direction of the alignment. (false - align to the "right" side while true -
// align to the "left" side)
static void DecodePALIGNRMask(MVT VT, unsigned Imm,
- SmallVectorImpl<uint32_t> &ShuffleMask,
+ SmallVectorImpl<int> &ShuffleMask,
bool AlignDirection = true, bool Unary = false) {
unsigned NumElts = VT.getVectorNumElements();
unsigned NumLanes = std::max((int)VT.getSizeInBits() / 128, 1);
@@ -519,7 +525,7 @@ static void DecodePALIGNRMask(MVT VT, unsigned Imm,
// Invec[2] - |8|9|10|11| Vec[2] - |2|5|8|11|
static void concatSubVector(Value **Vec, ArrayRef<Instruction *> InVec,
- unsigned VecElems, IRBuilder<> Builder) {
+ unsigned VecElems, IRBuilder<> &Builder) {
if (VecElems == 16) {
for (int i = 0; i < 3; i++)
Vec[i] = InVec[i];
@@ -547,11 +553,11 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// Matrix[2]= b5 c5 a6 b6 c6 a7 b7 c7
TransposedMatrix.resize(3);
- SmallVector<uint32_t, 32> VPShuf;
- SmallVector<uint32_t, 32> VPAlign[2];
- SmallVector<uint32_t, 32> VPAlign2;
- SmallVector<uint32_t, 32> VPAlign3;
- SmallVector<uint32_t, 3> GroupSize;
+ SmallVector<int, 32> VPShuf;
+ SmallVector<int, 32> VPAlign[2];
+ SmallVector<int, 32> VPAlign2;
+ SmallVector<int, 32> VPAlign3;
+ SmallVector<int, 3> GroupSize;
Value *Vec[6], *TempVector[3];
MVT VT = MVT::getVT(Shuffles[0]->getType());
@@ -605,8 +611,8 @@ void X86InterleavedAccessGroup::deinterleave8bitStride3(
// group2Shuffle reorders the shuffle stride back into continuous order.
// For example, for VF16 with Mask1 = {0,3,6,9,12,15,2,5,8,11,14,1,4,7,10,13} =>
// MaskResult = {0,11,6,1,12,7,2,13,8,3,14,9,4,15,10,5}.
-static void group2Shuffle(MVT VT, SmallVectorImpl<uint32_t> &Mask,
- SmallVectorImpl<uint32_t> &Output) {
+static void group2Shuffle(MVT VT, SmallVectorImpl<int> &Mask,
+ SmallVectorImpl<int> &Output) {
int IndexGroup[3] = {0, 0, 0};
int Index = 0;
int VectorWidth = VT.getSizeInBits();
@@ -633,11 +639,11 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
// Matrix[2]= c0 c1 c2 c3 c3 a7 b7 c7
TransposedMatrix.resize(3);
- SmallVector<uint32_t, 3> GroupSize;
- SmallVector<uint32_t, 32> VPShuf;
- SmallVector<uint32_t, 32> VPAlign[3];
- SmallVector<uint32_t, 32> VPAlign2;
- SmallVector<uint32_t, 32> VPAlign3;
+ SmallVector<int, 3> GroupSize;
+ SmallVector<int, 32> VPShuf;
+ SmallVector<int, 32> VPAlign[3];
+ SmallVector<int, 32> VPAlign2;
+ SmallVector<int, 32> VPAlign3;
Value *Vec[3], *TempVector[3];
MVT VT = MVT::getVectorVT(MVT::i8, VecElems);
@@ -682,7 +688,7 @@ void X86InterleavedAccessGroup::interleave8bitStride3(
unsigned NumOfElm = VT.getVectorNumElements();
group2Shuffle(VT, GroupSize, VPShuf);
- reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm,3, Builder);
+ reorderSubVector(VT, TransposedMatrix, Vec, VPShuf, NumOfElm, 3, Builder);
}
void X86InterleavedAccessGroup::transpose_4x4(
@@ -692,25 +698,25 @@ void X86InterleavedAccessGroup::transpose_4x4(
TransposedMatrix.resize(4);
// dst = src1[0,1],src2[0,1]
- uint32_t IntMask1[] = {0, 1, 4, 5};
- ArrayRef<uint32_t> Mask = makeArrayRef(IntMask1, 4);
+ static constexpr int IntMask1[] = {0, 1, 4, 5};
+ ArrayRef<int> Mask = makeArrayRef(IntMask1, 4);
Value *IntrVec1 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
Value *IntrVec2 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
// dst = src1[2,3],src2[2,3]
- uint32_t IntMask2[] = {2, 3, 6, 7};
+ static constexpr int IntMask2[] = {2, 3, 6, 7};
Mask = makeArrayRef(IntMask2, 4);
Value *IntrVec3 = Builder.CreateShuffleVector(Matrix[0], Matrix[2], Mask);
Value *IntrVec4 = Builder.CreateShuffleVector(Matrix[1], Matrix[3], Mask);
// dst = src1[0],src2[0],src1[2],src2[2]
- uint32_t IntMask3[] = {0, 4, 2, 6};
+ static constexpr int IntMask3[] = {0, 4, 2, 6};
Mask = makeArrayRef(IntMask3, 4);
TransposedMatrix[0] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
TransposedMatrix[2] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
// dst = src1[1],src2[1],src1[3],src2[3]
- uint32_t IntMask4[] = {1, 5, 3, 7};
+ static constexpr int IntMask4[] = {1, 5, 3, 7};
Mask = makeArrayRef(IntMask4, 4);
TransposedMatrix[1] = Builder.CreateShuffleVector(IntrVec1, IntrVec2, Mask);
TransposedMatrix[3] = Builder.CreateShuffleVector(IntrVec3, IntrVec4, Mask);
@@ -721,14 +727,14 @@ void X86InterleavedAccessGroup::transpose_4x4(
bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
SmallVector<Instruction *, 4> DecomposedVectors;
SmallVector<Value *, 4> TransposedVectors;
- VectorType *ShuffleTy = Shuffles[0]->getType();
+ auto *ShuffleTy = cast<FixedVectorType>(Shuffles[0]->getType());
if (isa<LoadInst>(Inst)) {
// Try to generate target-sized register(/instruction).
decompose(Inst, Factor, ShuffleTy, DecomposedVectors);
- Type *ShuffleEltTy = Inst->getType();
- unsigned NumSubVecElems = ShuffleEltTy->getVectorNumElements() / Factor;
+ auto *ShuffleEltTy = cast<FixedVectorType>(Inst->getType());
+ unsigned NumSubVecElems = ShuffleEltTy->getNumElements() / Factor;
// Perform matrix-transposition in order to compute interleaved
// results by generating some sort of (optimized) target-specific
// instructions.
@@ -756,13 +762,14 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
return true;
}
- Type *ShuffleEltTy = ShuffleTy->getVectorElementType();
- unsigned NumSubVecElems = ShuffleTy->getVectorNumElements() / Factor;
+ Type *ShuffleEltTy = ShuffleTy->getElementType();
+ unsigned NumSubVecElems = ShuffleTy->getNumElements() / Factor;
// Lower the interleaved stores:
// 1. Decompose the interleaved wide shuffle into individual shuffle
// vectors.
- decompose(Shuffles[0], Factor, VectorType::get(ShuffleEltTy, NumSubVecElems),
+ decompose(Shuffles[0], Factor,
+ FixedVectorType::get(ShuffleEltTy, NumSubVecElems),
DecomposedVectors);
// 2. Transpose the interleaved-vectors into vectors of contiguous
@@ -793,8 +800,7 @@ bool X86InterleavedAccessGroup::lowerIntoOptimizedSequence() {
// 4. Generate a store instruction for wide-vec.
StoreInst *SI = cast<StoreInst>(Inst);
- Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(),
- SI->getAlignment());
+ Builder.CreateAlignedStore(WideVec, SI->getPointerOperand(), SI->getAlign());
return true;
}
@@ -826,7 +832,8 @@ bool X86TargetLowering::lowerInterleavedStore(StoreInst *SI,
assert(Factor >= 2 && Factor <= getMaxSupportedInterleaveFactor() &&
"Invalid interleave factor");
- assert(SVI->getType()->getVectorNumElements() % Factor == 0 &&
+ assert(cast<FixedVectorType>(SVI->getType())->getNumElements() % Factor ==
+ 0 &&
"Invalid interleaved store");
// Holds the indices of SVI that correspond to the starting index of each
diff --git a/llvm/lib/Target/X86/X86IntrinsicsInfo.h b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
index 40bf28df3b90..1c10c07abeee 100644
--- a/llvm/lib/Target/X86/X86IntrinsicsInfo.h
+++ b/llvm/lib/Target/X86/X86IntrinsicsInfo.h
@@ -679,8 +679,8 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
X86_INTRINSIC_DATA(avx512_mask_pmovs_qb_512, TRUNCATE_TO_REG,
X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
- X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, INTR_TYPE_1OP_MASK,
- X86ISD::VTRUNCS, 0),
+ X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_128, TRUNCATE_TO_REG,
+ X86ISD::VTRUNCS, X86ISD::VMTRUNCS),
X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_256, INTR_TYPE_1OP_MASK,
X86ISD::VTRUNCS, 0),
X86_INTRINSIC_DATA(avx512_mask_pmovs_qd_512, INTR_TYPE_1OP_MASK,
@@ -783,10 +783,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86ISD::FSUBS, X86ISD::FSUBS_RND),
X86_INTRINSIC_DATA(avx512_mask_sub_ss_round, INTR_TYPE_SCALAR_MASK,
X86ISD::FSUBS, X86ISD::FSUBS_RND),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_128, INTR_TYPE_1OP_MASK,
- X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_256, INTR_TYPE_1OP_MASK,
- X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(avx512_mask_vcvtph2ps_512, INTR_TYPE_1OP_MASK_SAE,
X86ISD::CVTPH2PS, X86ISD::CVTPH2PS_SAE),
X86_INTRINSIC_DATA(avx512_mask_vcvtps2ph_128, CVTPS2PH_MASK,
@@ -997,7 +993,16 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(bmi_bextr_64, INTR_TYPE_2OP, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(bmi_bzhi_32, INTR_TYPE_2OP, X86ISD::BZHI, 0),
X86_INTRINSIC_DATA(bmi_bzhi_64, INTR_TYPE_2OP, X86ISD::BZHI, 0),
+ X86_INTRINSIC_DATA(bmi_pdep_32, INTR_TYPE_2OP, X86ISD::PDEP, 0),
+ X86_INTRINSIC_DATA(bmi_pdep_64, INTR_TYPE_2OP, X86ISD::PDEP, 0),
+ X86_INTRINSIC_DATA(bmi_pext_32, INTR_TYPE_2OP, X86ISD::PEXT, 0),
+ X86_INTRINSIC_DATA(bmi_pext_64, INTR_TYPE_2OP, X86ISD::PEXT, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_pd_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
+ X86_INTRINSIC_DATA(fma_vfmaddsub_ps_256, INTR_TYPE_3OP, X86ISD::FMADDSUB, 0),
X86_INTRINSIC_DATA(sse_cmp_ps, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(sse_cmp_ss, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
X86_INTRINSIC_DATA(sse_comieq_ss, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse_comige_ss, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse_comigt_ss, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1022,6 +1027,7 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(sse_ucomilt_ss, COMI, X86ISD::UCOMI, ISD::SETLT),
X86_INTRINSIC_DATA(sse_ucomineq_ss, COMI, X86ISD::UCOMI, ISD::SETNE),
X86_INTRINSIC_DATA(sse2_cmp_pd, INTR_TYPE_3OP, X86ISD::CMPP, 0),
+ X86_INTRINSIC_DATA(sse2_cmp_sd, INTR_TYPE_3OP, X86ISD::FSETCC, 0),
X86_INTRINSIC_DATA(sse2_comieq_sd, COMI, X86ISD::COMI, ISD::SETEQ),
X86_INTRINSIC_DATA(sse2_comige_sd, COMI, X86ISD::COMI, ISD::SETGE),
X86_INTRINSIC_DATA(sse2_comigt_sd, COMI, X86ISD::COMI, ISD::SETGT),
@@ -1104,8 +1110,6 @@ static const IntrinsicData IntrinsicsWithoutChain[] = {
X86_INTRINSIC_DATA(subborrow_64, ADX, X86ISD::SBB, X86ISD::SUB),
X86_INTRINSIC_DATA(tbm_bextri_u32, BEXTRI, X86ISD::BEXTR, 0),
X86_INTRINSIC_DATA(tbm_bextri_u64, BEXTRI, X86ISD::BEXTR, 0),
- X86_INTRINSIC_DATA(vcvtph2ps_128, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
- X86_INTRINSIC_DATA(vcvtph2ps_256, INTR_TYPE_1OP, X86ISD::CVTPH2PS, 0),
X86_INTRINSIC_DATA(vcvtps2ph_128, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
X86_INTRINSIC_DATA(vcvtps2ph_256, INTR_TYPE_2OP, X86ISD::CVTPS2PH, 0),
@@ -1157,10 +1161,8 @@ static const IntrinsicData* getIntrinsicWithoutChain(unsigned IntNo) {
}
static void verifyIntrinsicTables() {
- assert(std::is_sorted(std::begin(IntrinsicsWithoutChain),
- std::end(IntrinsicsWithoutChain)) &&
- std::is_sorted(std::begin(IntrinsicsWithChain),
- std::end(IntrinsicsWithChain)) &&
+ assert(llvm::is_sorted(IntrinsicsWithoutChain) &&
+ llvm::is_sorted(IntrinsicsWithChain) &&
"Intrinsic data tables should be sorted by Intrinsic ID");
assert((std::adjacent_find(std::begin(IntrinsicsWithoutChain),
std::end(IntrinsicsWithoutChain)) ==
diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.cpp b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
index da53d6420021..84f560f2f9ee 100644
--- a/llvm/lib/Target/X86/X86LegalizerInfo.cpp
+++ b/llvm/lib/Target/X86/X86LegalizerInfo.cpp
@@ -85,14 +85,14 @@ X86LegalizerInfo::X86LegalizerInfo(const X86Subtarget &STI,
verify(*STI.getInstrInfo());
}
-bool X86LegalizerInfo::legalizeIntrinsic(MachineInstr &MI,
- MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const {
+bool X86LegalizerInfo::legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const {
+ MachineIRBuilder &MIRBuilder = Helper.MIRBuilder;
switch (MI.getIntrinsicID()) {
case Intrinsic::memcpy:
case Intrinsic::memset:
case Intrinsic::memmove:
- if (createMemLibcall(MIRBuilder, MRI, MI) ==
+ if (createMemLibcall(MIRBuilder, *MIRBuilder.getMRI(), MI) ==
LegalizerHelper::UnableToLegalize)
return false;
MI.eraseFromParent();
diff --git a/llvm/lib/Target/X86/X86LegalizerInfo.h b/llvm/lib/Target/X86/X86LegalizerInfo.h
index 7a0f13fb5ae6..72d25096d72b 100644
--- a/llvm/lib/Target/X86/X86LegalizerInfo.h
+++ b/llvm/lib/Target/X86/X86LegalizerInfo.h
@@ -32,8 +32,8 @@ private:
public:
X86LegalizerInfo(const X86Subtarget &STI, const X86TargetMachine &TM);
- bool legalizeIntrinsic(MachineInstr &MI, MachineRegisterInfo &MRI,
- MachineIRBuilder &MIRBuilder) const override;
+ bool legalizeIntrinsic(LegalizerHelper &Helper,
+ MachineInstr &MI) const override;
private:
void setLegalizerInfo32bit();
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
new file mode 100644
index 000000000000..50f8b3477acc
--- /dev/null
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionLoadHardening.cpp
@@ -0,0 +1,824 @@
+//==-- X86LoadValueInjectionLoadHardening.cpp - LVI load hardening for x86 --=//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: This pass finds Load Value Injection (LVI) gadgets consisting
+/// of a load from memory (i.e., SOURCE), and any operation that may transmit
+/// the value loaded from memory over a covert channel, or use the value loaded
+/// from memory to determine a branch/call target (i.e., SINK). After finding
+/// all such gadgets in a given function, the pass minimally inserts LFENCE
+/// instructions in such a manner that the following property is satisfied: for
+/// all SOURCE+SINK pairs, all paths in the CFG from SOURCE to SINK contain at
+/// least one LFENCE instruction. The algorithm that implements this minimal
+/// insertion is influenced by an academic paper that minimally inserts memory
+/// fences for high-performance concurrent programs:
+/// http://www.cs.ucr.edu/~lesani/companion/oopsla15/OOPSLA15.pdf
+/// The algorithm implemented in this pass is as follows:
+/// 1. Build a condensed CFG (i.e., a GadgetGraph) consisting only of the
+/// following components:
+/// - SOURCE instructions (also includes function arguments)
+/// - SINK instructions
+/// - Basic block entry points
+/// - Basic block terminators
+/// - LFENCE instructions
+/// 2. Analyze the GadgetGraph to determine which SOURCE+SINK pairs (i.e.,
+/// gadgets) are already mitigated by existing LFENCEs. If all gadgets have been
+/// mitigated, go to step 6.
+/// 3. Use a heuristic or plugin to approximate minimal LFENCE insertion.
+/// 4. Insert one LFENCE along each CFG edge that was cut in step 3.
+/// 5. Go to step 2.
+/// 6. If any LFENCEs were inserted, return `true` from runOnMachineFunction()
+/// to tell LLVM that the function was modified.
+///
+//===----------------------------------------------------------------------===//
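// A minimal standalone sketch (illustration only, not part of this patch) of
// the fixed-point loop in steps 2-5 above. It uses made-up SketchNode/SketchEdge
// types in place of the real GadgetGraph, and replaces the greedy/plugin cut
// with the crudest possible choice: fence every outgoing edge of a SOURCE that
// can still reach a SINK without crossing a fence.

#include <cstdio>
#include <queue>
#include <vector>

struct SketchEdge { int Dst; bool Fenced; };
struct SketchNode { std::vector<SketchEdge> Succs; bool IsSource; bool IsSink; };

// A SOURCE->SINK gadget is mitigated iff every path between the two crosses
// at least one fenced edge.
static bool reachesUnfenced(const std::vector<SketchNode> &G, int From, int To) {
  std::vector<bool> Seen(G.size(), false);
  std::queue<int> Work;
  Work.push(From);
  Seen[From] = true;
  while (!Work.empty()) {
    int N = Work.front();
    Work.pop();
    if (N == To)
      return true;
    for (const SketchEdge &E : G[N].Succs)
      if (!E.Fenced && !Seen[E.Dst]) {
        Seen[E.Dst] = true;
        Work.push(E.Dst);
      }
  }
  return false;
}

// Steps 2-5: keep cutting edges until no SOURCE reaches a SINK unfenced.
static int hardenSketch(std::vector<SketchNode> &G) {
  int FencesInserted = 0;
  for (bool Changed = true; Changed;) {
    Changed = false;
    for (size_t S = 0; S < G.size(); ++S) {
      if (!G[S].IsSource)
        continue;
      for (size_t K = 0; K < G.size(); ++K) {
        if (S == K || !G[K].IsSink || !reachesUnfenced(G, (int)S, (int)K))
          continue;
        for (SketchEdge &E : G[S].Succs)
          if (!E.Fenced) {
            E.Fenced = true; // stands in for inserting an LFENCE on this edge
            ++FencesInserted;
          }
        Changed = true;
      }
    }
  }
  return FencesInserted;
}

int main() {
  // source(0) -> 1 -> sink(2), plus a bypassing edge 0 -> 2.
  std::vector<SketchNode> G(3);
  G[0].IsSource = true;
  G[2].IsSink = true;
  G[0].Succs = {{1, false}, {2, false}};
  G[1].Succs = {{2, false}};
  std::printf("fences inserted: %d\n", hardenSketch(G));
  return 0;
}

// The real pass differs in the cut it picks: a (near-)minimal set of CFG
// edges, computed greedily or by the library named via -x86-lvi-load-opt-plugin,
// which is what keeps the number of inserted LFENCEs low.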
+
+#include "ImmutableGraph.h"
+#include "X86.h"
+#include "X86Subtarget.h"
+#include "X86TargetMachine.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/SmallSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineDominanceFrontier.h"
+#include "llvm/CodeGen/MachineDominators.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/CodeGen/MachineLoopInfo.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/RDFGraph.h"
+#include "llvm/CodeGen/RDFLiveness.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DOTGraphTraits.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Support/DynamicLibrary.h"
+#include "llvm/Support/GraphWriter.h"
+#include "llvm/Support/raw_ostream.h"
+
+using namespace llvm;
+
+#define PASS_KEY "x86-lvi-load"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
+STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
+STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
+ "were deployed");
+STATISTIC(NumGadgets, "Number of LVI gadgets detected during analysis");
+
+static cl::opt<std::string> OptimizePluginPath(
+ PASS_KEY "-opt-plugin",
+ cl::desc("Specify a plugin to optimize LFENCE insertion"), cl::Hidden);
+
+static cl::opt<bool> NoConditionalBranches(
+ PASS_KEY "-no-cbranch",
+ cl::desc("Don't treat conditional branches as disclosure gadgets. This "
+ "may improve performance, at the cost of security."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDot(
+ PASS_KEY "-dot",
+ cl::desc(
+ "For each function, emit a dot graph depicting potential LVI gadgets"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDotOnly(
+ PASS_KEY "-dot-only",
+ cl::desc("For each function, emit a dot graph depicting potential LVI "
+ "gadgets, and do not insert any fences"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> EmitDotVerify(
+ PASS_KEY "-dot-verify",
+ cl::desc("For each function, emit a dot graph to stdout depicting "
+ "potential LVI gadgets, used for testing purposes only"),
+ cl::init(false), cl::Hidden);
+
+static llvm::sys::DynamicLibrary OptimizeDL;
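+// The optimization plugin, if one is given, must export a symbol named
+// "optimize_cut" with the signature below. The graph is passed in a CSR-like
+// form (see hardenLoadsWithPlugin): nodes[i] is the index into `edges` of
+// node i's first outgoing edge, with a terminator entry nodes[nodes_size] ==
+// edges_size; edges[j] is the destination node index of edge j, and
+// edge_values[j] is that edge's value (loop depth for CFG edges,
+// GadgetEdgeSentinel for gadget edges). The plugin reports its cut by setting
+// cut_edges[j] to a nonzero value for every edge j that should be cut.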
+typedef int (*OptimizeCutT)(unsigned int *nodes, unsigned int nodes_size,
+ unsigned int *edges, int *edge_values,
+ int *cut_edges /* out */, unsigned int edges_size);
+static OptimizeCutT OptimizeCut = nullptr;
+
+namespace {
+
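+// A MachineGadgetGraph is an ImmutableGraph whose nodes are the "interesting"
+// machine instructions described in the header comment (gadget sources and
+// sinks, basic block entry points and terminators, and LFENCEs), plus a
+// sentinel node standing in for the function's arguments. CFG edges are
+// weighted by loop depth, so that edges inside loops are more expensive to
+// cut; gadget edges carry GadgetEdgeSentinel instead.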
+struct MachineGadgetGraph : ImmutableGraph<MachineInstr *, int> {
+ static constexpr int GadgetEdgeSentinel = -1;
+ static constexpr MachineInstr *const ArgNodeSentinel = nullptr;
+
+ using GraphT = ImmutableGraph<MachineInstr *, int>;
+ using Node = typename GraphT::Node;
+ using Edge = typename GraphT::Edge;
+ using size_type = typename GraphT::size_type;
+ MachineGadgetGraph(std::unique_ptr<Node[]> Nodes,
+ std::unique_ptr<Edge[]> Edges, size_type NodesSize,
+ size_type EdgesSize, int NumFences = 0, int NumGadgets = 0)
+ : GraphT(std::move(Nodes), std::move(Edges), NodesSize, EdgesSize),
+ NumFences(NumFences), NumGadgets(NumGadgets) {}
+ static inline bool isCFGEdge(const Edge &E) {
+ return E.getValue() != GadgetEdgeSentinel;
+ }
+ static inline bool isGadgetEdge(const Edge &E) {
+ return E.getValue() == GadgetEdgeSentinel;
+ }
+ int NumFences;
+ int NumGadgets;
+};
+
+class X86LoadValueInjectionLoadHardeningPass : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionLoadHardeningPass() : MachineFunctionPass(ID) {}
+
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Load Hardening";
+ }
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+
+private:
+ using GraphBuilder = ImmutableGraphBuilder<MachineGadgetGraph>;
+ using EdgeSet = MachineGadgetGraph::EdgeSet;
+ using NodeSet = MachineGadgetGraph::NodeSet;
+ using Gadget = std::pair<MachineInstr *, MachineInstr *>;
+
+ const X86Subtarget *STI;
+ const TargetInstrInfo *TII;
+ const TargetRegisterInfo *TRI;
+
+ std::unique_ptr<MachineGadgetGraph>
+ getGadgetGraph(MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) const;
+ int hardenLoadsWithPlugin(MachineFunction &MF,
+ std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int hardenLoadsWithGreedyHeuristic(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const;
+ int elimMitigatedEdgesAndNodes(MachineGadgetGraph &G,
+ EdgeSet &ElimEdges /* in, out */,
+ NodeSet &ElimNodes /* in, out */) const;
+ std::unique_ptr<MachineGadgetGraph>
+ trimMitigatedEdges(std::unique_ptr<MachineGadgetGraph> Graph) const;
+ void findAndCutEdges(MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* out */) const;
+ int insertFences(MachineFunction &MF, MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* in, out */) const;
+ bool instrUsesRegToAccessMemory(const MachineInstr &I, unsigned Reg) const;
+ bool instrUsesRegToBranch(const MachineInstr &I, unsigned Reg) const;
+ inline bool isFence(const MachineInstr *MI) const {
+ return MI && (MI->getOpcode() == X86::LFENCE ||
+ (STI->useLVIControlFlowIntegrity() && MI->isCall()));
+ }
+};
+
+} // end anonymous namespace
+
+namespace llvm {
+
+template <>
+struct GraphTraits<MachineGadgetGraph *>
+ : GraphTraits<ImmutableGraph<MachineInstr *, int> *> {};
+
+template <>
+struct DOTGraphTraits<MachineGadgetGraph *> : DefaultDOTGraphTraits {
+ using GraphType = MachineGadgetGraph;
+ using Traits = llvm::GraphTraits<GraphType *>;
+ using NodeRef = typename Traits::NodeRef;
+ using EdgeRef = typename Traits::EdgeRef;
+ using ChildIteratorType = typename Traits::ChildIteratorType;
+ using ChildEdgeIteratorType = typename Traits::ChildEdgeIteratorType;
+
+ DOTGraphTraits(bool isSimple = false) : DefaultDOTGraphTraits(isSimple) {}
+
+ std::string getNodeLabel(NodeRef Node, GraphType *) {
+ if (Node->getValue() == MachineGadgetGraph::ArgNodeSentinel)
+ return "ARGS";
+
+ std::string Str;
+ raw_string_ostream OS(Str);
+ OS << *Node->getValue();
+ return OS.str();
+ }
+
+ static std::string getNodeAttributes(NodeRef Node, GraphType *) {
+ MachineInstr *MI = Node->getValue();
+ if (MI == MachineGadgetGraph::ArgNodeSentinel)
+ return "color = blue";
+ if (MI->getOpcode() == X86::LFENCE)
+ return "color = green";
+ return "";
+ }
+
+ static std::string getEdgeAttributes(NodeRef, ChildIteratorType E,
+ GraphType *) {
+ int EdgeVal = (*E.getCurrent()).getValue();
+ return EdgeVal >= 0 ? "label = " + std::to_string(EdgeVal)
+ : "color = red, style = \"dashed\"";
+ }
+};
+
+} // end namespace llvm
+
+constexpr MachineInstr *MachineGadgetGraph::ArgNodeSentinel;
+constexpr int MachineGadgetGraph::GadgetEdgeSentinel;
+
+char X86LoadValueInjectionLoadHardeningPass::ID = 0;
+
+void X86LoadValueInjectionLoadHardeningPass::getAnalysisUsage(
+ AnalysisUsage &AU) const {
+ MachineFunctionPass::getAnalysisUsage(AU);
+ AU.addRequired<MachineLoopInfo>();
+ AU.addRequired<MachineDominatorTree>();
+ AU.addRequired<MachineDominanceFrontier>();
+ AU.setPreservesCFG();
+}
+
+static void WriteGadgetGraph(raw_ostream &OS, MachineFunction &MF,
+ MachineGadgetGraph *G) {
+ WriteGraph(OS, G, /*ShortNames*/ false,
+ "Speculative gadgets for \"" + MF.getName() + "\" function");
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ STI = &MF.getSubtarget<X86Subtarget>();
+ if (!STI->useLVILoadHardening())
+ return false;
+
+ // FIXME: support 32-bit
+ if (!STI->is64Bit())
+ report_fatal_error("LVI load hardening is only supported on 64-bit", false);
+
+ // Don't skip functions with the "optnone" attribute, but do participate in
+ // opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ ++NumFunctionsConsidered;
+ TII = STI->getInstrInfo();
+ TRI = STI->getRegisterInfo();
+ LLVM_DEBUG(dbgs() << "Building gadget graph...\n");
+ const auto &MLI = getAnalysis<MachineLoopInfo>();
+ const auto &MDT = getAnalysis<MachineDominatorTree>();
+ const auto &MDF = getAnalysis<MachineDominanceFrontier>();
+ std::unique_ptr<MachineGadgetGraph> Graph = getGadgetGraph(MF, MLI, MDT, MDF);
+ LLVM_DEBUG(dbgs() << "Building gadget graph... Done\n");
+ if (Graph == nullptr)
+ return false; // didn't find any gadgets
+
+ if (EmitDotVerify) {
+ WriteGadgetGraph(outs(), MF, Graph.get());
+ return false;
+ }
+
+ if (EmitDot || EmitDotOnly) {
+ LLVM_DEBUG(dbgs() << "Emitting gadget graph...\n");
+ std::error_code FileError;
+ std::string FileName = "lvi.";
+ FileName += MF.getName();
+ FileName += ".dot";
+ raw_fd_ostream FileOut(FileName, FileError);
+ if (FileError)
+ errs() << FileError.message();
+ WriteGadgetGraph(FileOut, MF, Graph.get());
+ FileOut.close();
+ LLVM_DEBUG(dbgs() << "Emitting gadget graph... Done\n");
+ if (EmitDotOnly)
+ return false;
+ }
+
+ int FencesInserted;
+ if (!OptimizePluginPath.empty()) {
+ if (!OptimizeDL.isValid()) {
+ std::string ErrorMsg;
+ OptimizeDL = llvm::sys::DynamicLibrary::getPermanentLibrary(
+ OptimizePluginPath.c_str(), &ErrorMsg);
+ if (!ErrorMsg.empty())
+ report_fatal_error("Failed to load opt plugin: \"" + ErrorMsg + '\"');
+ OptimizeCut = (OptimizeCutT)OptimizeDL.getAddressOfSymbol("optimize_cut");
+ if (!OptimizeCut)
+ report_fatal_error("Invalid optimization plugin");
+ }
+ FencesInserted = hardenLoadsWithPlugin(MF, std::move(Graph));
+ } else { // Use the default greedy heuristic
+ FencesInserted = hardenLoadsWithGreedyHeuristic(MF, std::move(Graph));
+ }
+
+ if (FencesInserted > 0)
+ ++NumFunctionsMitigated;
+ NumFences += FencesInserted;
+ return (FencesInserted > 0);
+}
+
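+// Build the gadget graph for MF: use the RDF framework to walk def-use chains
+// from every load (and from the function's arguments) to any instruction that
+// transmits the loaded value, record each such SOURCE+SINK pair as a gadget
+// edge, and then overlay a condensed CFG whose edges are weighted by loop
+// depth. Returns nullptr if no gadgets are found.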
+std::unique_ptr<MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::getGadgetGraph(
+ MachineFunction &MF, const MachineLoopInfo &MLI,
+ const MachineDominatorTree &MDT,
+ const MachineDominanceFrontier &MDF) const {
+ using namespace rdf;
+
+ // Build the Register Dataflow Graph using the RDF framework
+ TargetOperandInfo TOI{*TII};
+ DataFlowGraph DFG{MF, *TII, *TRI, MDT, MDF, TOI};
+ DFG.build();
+ Liveness L{MF.getRegInfo(), DFG};
+ L.computePhiInfo();
+
+ GraphBuilder Builder;
+ using GraphIter = typename GraphBuilder::BuilderNodeRef;
+ DenseMap<MachineInstr *, GraphIter> NodeMap;
+ int FenceCount = 0, GadgetCount = 0;
+ auto MaybeAddNode = [&NodeMap, &Builder](MachineInstr *MI) {
+ auto Ref = NodeMap.find(MI);
+ if (Ref == NodeMap.end()) {
+ auto I = Builder.addVertex(MI);
+ NodeMap[MI] = I;
+ return std::pair<GraphIter, bool>{I, true};
+ }
+ return std::pair<GraphIter, bool>{Ref->getSecond(), false};
+ };
+
+ // The `Transmitters` map memoizes transmitters found for each def. If a def
+ // has not yet been analyzed, then it will not appear in the map. If a def
+ // has been analyzed and was determined not to have any transmitters, then
+ // its list of transmitters will be empty.
+ DenseMap<NodeId, std::vector<NodeId>> Transmitters;
+
+ // Analyze all machine instructions to find gadgets and LFENCEs, adding
+ // each interesting value to `Nodes`
+ auto AnalyzeDef = [&](NodeAddr<DefNode *> SourceDef) {
+ SmallSet<NodeId, 8> UsesVisited, DefsVisited;
+ std::function<void(NodeAddr<DefNode *>)> AnalyzeDefUseChain =
+ [&](NodeAddr<DefNode *> Def) {
+ if (Transmitters.find(Def.Id) != Transmitters.end())
+ return; // Already analyzed `Def`
+
+ // Use RDF to find all the uses of `Def`
+ rdf::NodeSet Uses;
+ RegisterRef DefReg = DFG.getPRI().normalize(Def.Addr->getRegRef(DFG));
+ for (auto UseID : L.getAllReachedUses(DefReg, Def)) {
+ auto Use = DFG.addr<UseNode *>(UseID);
+ if (Use.Addr->getFlags() & NodeAttrs::PhiRef) { // phi node
+ NodeAddr<PhiNode *> Phi = Use.Addr->getOwner(DFG);
+ for (auto I : L.getRealUses(Phi.Id)) {
+ if (DFG.getPRI().alias(RegisterRef(I.first), DefReg)) {
+ for (auto UA : I.second)
+ Uses.emplace(UA.first);
+ }
+ }
+ } else { // not a phi node
+ Uses.emplace(UseID);
+ }
+ }
+
+ // For each use of `Def`, we want to know whether:
+ // (1) The use can leak the Def'ed value,
+ // (2) The use can further propagate the Def'ed value to more defs
+ for (auto UseID : Uses) {
+ if (!UsesVisited.insert(UseID).second)
+ continue; // Already visited this use of `Def`
+
+ auto Use = DFG.addr<UseNode *>(UseID);
+ assert(!(Use.Addr->getFlags() & NodeAttrs::PhiRef));
+ MachineOperand &UseMO = Use.Addr->getOp();
+ MachineInstr &UseMI = *UseMO.getParent();
+ assert(UseMO.isReg());
+
+ // We naively assume that an instruction propagates any loaded
+ // uses to all defs unless the instruction is a call, in which
+ // case all arguments will be treated as gadget sources during
+ // analysis of the callee function.
+ if (UseMI.isCall())
+ continue;
+
+ // Check whether this use can transmit (leak) its value.
+ if (instrUsesRegToAccessMemory(UseMI, UseMO.getReg()) ||
+ (!NoConditionalBranches &&
+ instrUsesRegToBranch(UseMI, UseMO.getReg()))) {
+ Transmitters[Def.Id].push_back(Use.Addr->getOwner(DFG).Id);
+ if (UseMI.mayLoad())
+ continue; // Found a transmitting load -- no need to continue
+ // traversing its defs (i.e., this load will become
+ // a new gadget source anyway).
+ }
+
+ // Check whether the use propagates to more defs.
+ NodeAddr<InstrNode *> Owner{Use.Addr->getOwner(DFG)};
+ rdf::NodeList AnalyzedChildDefs;
+ for (auto &ChildDef :
+ Owner.Addr->members_if(DataFlowGraph::IsDef, DFG)) {
+ if (!DefsVisited.insert(ChildDef.Id).second)
+ continue; // Already visited this def
+ if (Def.Addr->getAttrs() & NodeAttrs::Dead)
+ continue;
+ if (Def.Id == ChildDef.Id)
+ continue; // `Def` uses itself (e.g., increment loop counter)
+
+ AnalyzeDefUseChain(ChildDef);
+
+ // `Def` inherits all of its child defs' transmitters.
+ for (auto TransmitterId : Transmitters[ChildDef.Id])
+ Transmitters[Def.Id].push_back(TransmitterId);
+ }
+ }
+
+ // Note that this statement adds `Def.Id` to the map if no
+ // transmitters were found for `Def`.
+ auto &DefTransmitters = Transmitters[Def.Id];
+
+ // Remove duplicate transmitters
+ llvm::sort(DefTransmitters);
+ DefTransmitters.erase(
+ std::unique(DefTransmitters.begin(), DefTransmitters.end()),
+ DefTransmitters.end());
+ };
+
+ // Find all of the transmitters
+ AnalyzeDefUseChain(SourceDef);
+ auto &SourceDefTransmitters = Transmitters[SourceDef.Id];
+ if (SourceDefTransmitters.empty())
+ return; // No transmitters for `SourceDef`
+
+ MachineInstr *Source = SourceDef.Addr->getFlags() & NodeAttrs::PhiRef
+ ? MachineGadgetGraph::ArgNodeSentinel
+ : SourceDef.Addr->getOp().getParent();
+ auto GadgetSource = MaybeAddNode(Source);
+ // Each transmitter is a sink for `SourceDef`.
+ for (auto TransmitterId : SourceDefTransmitters) {
+ MachineInstr *Sink = DFG.addr<StmtNode *>(TransmitterId).Addr->getCode();
+ auto GadgetSink = MaybeAddNode(Sink);
+ // Add the gadget edge to the graph.
+ Builder.addEdge(MachineGadgetGraph::GadgetEdgeSentinel,
+ GadgetSource.first, GadgetSink.first);
+ ++GadgetCount;
+ }
+ };
+
+ LLVM_DEBUG(dbgs() << "Analyzing def-use chains to find gadgets\n");
+ // Analyze function arguments
+ NodeAddr<BlockNode *> EntryBlock = DFG.getFunc().Addr->getEntryBlock(DFG);
+ for (NodeAddr<PhiNode *> ArgPhi :
+ EntryBlock.Addr->members_if(DataFlowGraph::IsPhi, DFG)) {
+ NodeList Defs = ArgPhi.Addr->members_if(DataFlowGraph::IsDef, DFG);
+ llvm::for_each(Defs, AnalyzeDef);
+ }
+ // Analyze every instruction in MF
+ for (NodeAddr<BlockNode *> BA : DFG.getFunc().Addr->members(DFG)) {
+ for (NodeAddr<StmtNode *> SA :
+ BA.Addr->members_if(DataFlowGraph::IsCode<NodeAttrs::Stmt>, DFG)) {
+ MachineInstr *MI = SA.Addr->getCode();
+ if (isFence(MI)) {
+ MaybeAddNode(MI);
+ ++FenceCount;
+ } else if (MI->mayLoad()) {
+ NodeList Defs = SA.Addr->members_if(DataFlowGraph::IsDef, DFG);
+ llvm::for_each(Defs, AnalyzeDef);
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "Found " << FenceCount << " fences\n");
+ LLVM_DEBUG(dbgs() << "Found " << GadgetCount << " gadgets\n");
+ if (GadgetCount == 0)
+ return nullptr;
+ NumGadgets += GadgetCount;
+
+ // Traverse CFG to build the rest of the graph
+ SmallSet<MachineBasicBlock *, 8> BlocksVisited;
+ std::function<void(MachineBasicBlock *, GraphIter, unsigned)> TraverseCFG =
+ [&](MachineBasicBlock *MBB, GraphIter GI, unsigned ParentDepth) {
+ unsigned LoopDepth = MLI.getLoopDepth(MBB);
+ if (!MBB->empty()) {
+ // Always add the first instruction in each block
+ auto NI = MBB->begin();
+ auto BeginBB = MaybeAddNode(&*NI);
+ Builder.addEdge(ParentDepth, GI, BeginBB.first);
+ if (!BlocksVisited.insert(MBB).second)
+ return;
+
+ // Add any instructions within the block that are gadget components
+ GI = BeginBB.first;
+ while (++NI != MBB->end()) {
+ auto Ref = NodeMap.find(&*NI);
+ if (Ref != NodeMap.end()) {
+ Builder.addEdge(LoopDepth, GI, Ref->getSecond());
+ GI = Ref->getSecond();
+ }
+ }
+
+ // Always add the terminator instruction, if one exists
+ auto T = MBB->getFirstTerminator();
+ if (T != MBB->end()) {
+ auto EndBB = MaybeAddNode(&*T);
+ if (EndBB.second)
+ Builder.addEdge(LoopDepth, GI, EndBB.first);
+ GI = EndBB.first;
+ }
+ }
+ for (MachineBasicBlock *Succ : MBB->successors())
+ TraverseCFG(Succ, GI, LoopDepth);
+ };
+ // ArgNodeSentinel is a pseudo-instruction that represents MF args in the
+ // GadgetGraph
+ GraphIter ArgNode = MaybeAddNode(MachineGadgetGraph::ArgNodeSentinel).first;
+ TraverseCFG(&MF.front(), ArgNode, 0);
+ std::unique_ptr<MachineGadgetGraph> G{Builder.get(FenceCount, GadgetCount)};
+ LLVM_DEBUG(dbgs() << "Found " << G->nodes_size() << " nodes\n");
+ return G;
+}
+
+// Returns the number of remaining gadget edges that could not be eliminated
+int X86LoadValueInjectionLoadHardeningPass::elimMitigatedEdgesAndNodes(
+ MachineGadgetGraph &G, MachineGadgetGraph::EdgeSet &ElimEdges /* in, out */,
+ MachineGadgetGraph::NodeSet &ElimNodes /* in, out */) const {
+ if (G.NumFences > 0) {
+ // Eliminate fences, along with the CFG edges that enter and leave each
+ // fence, since such edges are trivially mitigated.
+ for (const auto &E : G.edges()) {
+ const MachineGadgetGraph::Node *Dest = E.getDest();
+ if (isFence(Dest->getValue())) {
+ ElimNodes.insert(*Dest);
+ ElimEdges.insert(E);
+ for (const auto &DE : Dest->edges())
+ ElimEdges.insert(DE);
+ }
+ }
+ }
+
+ // Find and eliminate gadget edges that have been mitigated.
+ int MitigatedGadgets = 0, RemainingGadgets = 0;
+ MachineGadgetGraph::NodeSet ReachableNodes{G};
+ for (const auto &RootN : G.nodes()) {
+ if (llvm::none_of(RootN.edges(), MachineGadgetGraph::isGadgetEdge))
+ continue; // skip this node if it isn't a gadget source
+
+ // Find all of the nodes that are CFG-reachable from RootN using DFS
+ ReachableNodes.clear();
+ std::function<void(const MachineGadgetGraph::Node *, bool)>
+ FindReachableNodes =
+ [&](const MachineGadgetGraph::Node *N, bool FirstNode) {
+ if (!FirstNode)
+ ReachableNodes.insert(*N);
+ for (const auto &E : N->edges()) {
+ const MachineGadgetGraph::Node *Dest = E.getDest();
+ if (MachineGadgetGraph::isCFGEdge(E) &&
+ !ElimEdges.contains(E) && !ReachableNodes.contains(*Dest))
+ FindReachableNodes(Dest, false);
+ }
+ };
+ FindReachableNodes(&RootN, true);
+
+ // Any gadget whose sink is unreachable has been mitigated
+ for (const auto &E : RootN.edges()) {
+ if (MachineGadgetGraph::isGadgetEdge(E)) {
+ if (ReachableNodes.contains(*E.getDest())) {
+ // This gadget's sink is reachable
+ ++RemainingGadgets;
+ } else { // This gadget's sink is unreachable, and therefore mitigated
+ ++MitigatedGadgets;
+ ElimEdges.insert(E);
+ }
+ }
+ }
+ }
+ return RemainingGadgets;
+}
+
+std::unique_ptr<MachineGadgetGraph>
+X86LoadValueInjectionLoadHardeningPass::trimMitigatedEdges(
+ std::unique_ptr<MachineGadgetGraph> Graph) const {
+ MachineGadgetGraph::NodeSet ElimNodes{*Graph};
+ MachineGadgetGraph::EdgeSet ElimEdges{*Graph};
+ int RemainingGadgets =
+ elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes);
+ if (ElimEdges.empty() && ElimNodes.empty()) {
+ Graph->NumFences = 0;
+ Graph->NumGadgets = RemainingGadgets;
+ } else {
+ Graph = GraphBuilder::trim(*Graph, ElimNodes, ElimEdges, 0 /* NumFences */,
+ RemainingGadgets);
+ }
+ return Graph;
+}
+
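+// Mitigate the remaining gadgets with the help of the plugin: flatten the
+// gadget graph into the array form expected by optimize_cut, insert an LFENCE
+// along each edge the plugin chooses to cut, trim the portions of the graph
+// that are now mitigated, and repeat until no gadgets remain. Returns the
+// number of fences inserted.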
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithPlugin(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
+ int FencesInserted = 0;
+
+ do {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ if (Graph->NumGadgets == 0)
+ break;
+
+ LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+ EdgeSet CutEdges{*Graph};
+ auto Nodes = std::make_unique<unsigned int[]>(Graph->nodes_size() +
+ 1 /* terminator node */);
+ auto Edges = std::make_unique<unsigned int[]>(Graph->edges_size());
+ auto EdgeCuts = std::make_unique<int[]>(Graph->edges_size());
+ auto EdgeValues = std::make_unique<int[]>(Graph->edges_size());
+ for (const auto &N : Graph->nodes()) {
+ Nodes[Graph->getNodeIndex(N)] = Graph->getEdgeIndex(*N.edges_begin());
+ }
+ Nodes[Graph->nodes_size()] = Graph->edges_size(); // terminator node
+ for (const auto &E : Graph->edges()) {
+ Edges[Graph->getEdgeIndex(E)] = Graph->getNodeIndex(*E.getDest());
+ EdgeValues[Graph->getEdgeIndex(E)] = E.getValue();
+ }
+ OptimizeCut(Nodes.get(), Graph->nodes_size(), Edges.get(), EdgeValues.get(),
+ EdgeCuts.get(), Graph->edges_size());
+ for (int I = 0; I < Graph->edges_size(); ++I)
+ if (EdgeCuts[I])
+ CutEdges.set(I);
+ LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+ LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+ FencesInserted += insertFences(MF, *Graph, CutEdges);
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+
+ Graph = GraphBuilder::trim(*Graph, MachineGadgetGraph::NodeSet{*Graph},
+ CutEdges);
+ } while (true);
+
+ return FencesInserted;
+}
+
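+// Mitigate gadgets without a plugin: repeatedly cut the cheapest CFG edge that
+// is guaranteed to mitigate at least one gadget (an edge leaving a gadget
+// source or entering a gadget sink) until every gadget is mitigated, then
+// insert an LFENCE along each cut edge. Returns the number of fences inserted.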
+int X86LoadValueInjectionLoadHardeningPass::hardenLoadsWithGreedyHeuristic(
+ MachineFunction &MF, std::unique_ptr<MachineGadgetGraph> Graph) const {
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths...\n");
+ Graph = trimMitigatedEdges(std::move(Graph));
+ LLVM_DEBUG(dbgs() << "Eliminating mitigated paths... Done\n");
+ if (Graph->NumGadgets == 0)
+ return 0;
+
+ LLVM_DEBUG(dbgs() << "Cutting edges...\n");
+ MachineGadgetGraph::NodeSet ElimNodes{*Graph};
+ MachineGadgetGraph::EdgeSet ElimEdges{*Graph}, CutEdges{*Graph};
+ auto IsCFGEdge = [&ElimEdges, &CutEdges](const MachineGadgetGraph::Edge &E) {
+ return !ElimEdges.contains(E) && !CutEdges.contains(E) &&
+ MachineGadgetGraph::isCFGEdge(E);
+ };
+ auto IsGadgetEdge = [&ElimEdges,
+ &CutEdges](const MachineGadgetGraph::Edge &E) {
+ return !ElimEdges.contains(E) && !CutEdges.contains(E) &&
+ MachineGadgetGraph::isGadgetEdge(E);
+ };
+
+ // FIXME: this is O(E^2); we could probably do better.
+ do {
+ // Find the cheapest CFG edge that will eliminate a gadget (by being
+ // egress from a SOURCE node or ingress to a SINK node), and cut it.
+ const MachineGadgetGraph::Edge *CheapestSoFar = nullptr;
+
+ // First, collect all gadget source and sink nodes.
+ MachineGadgetGraph::NodeSet GadgetSources{*Graph}, GadgetSinks{*Graph};
+ for (const auto &N : Graph->nodes()) {
+ if (ElimNodes.contains(N))
+ continue;
+ for (const auto &E : N.edges()) {
+ if (IsGadgetEdge(E)) {
+ GadgetSources.insert(N);
+ GadgetSinks.insert(*E.getDest());
+ }
+ }
+ }
+
+ // Next, look for the cheapest CFG edge which, when cut, is guaranteed to
+ // mitigate at least one gadget by either:
+ // (a) being egress from a gadget source, or
+ // (b) being ingress to a gadget sink.
+ for (const auto &N : Graph->nodes()) {
+ if (ElimNodes.contains(N))
+ continue;
+ for (const auto &E : N.edges()) {
+ if (IsCFGEdge(E)) {
+ if (GadgetSources.contains(N) || GadgetSinks.contains(*E.getDest())) {
+ if (!CheapestSoFar || E.getValue() < CheapestSoFar->getValue())
+ CheapestSoFar = &E;
+ }
+ }
+ }
+ }
+
+ assert(CheapestSoFar && "Failed to cut an edge");
+ CutEdges.insert(*CheapestSoFar);
+ ElimEdges.insert(*CheapestSoFar);
+ } while (elimMitigatedEdgesAndNodes(*Graph, ElimEdges, ElimNodes));
+ LLVM_DEBUG(dbgs() << "Cutting edges... Done\n");
+ LLVM_DEBUG(dbgs() << "Cut " << CutEdges.count() << " edges\n");
+
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs...\n");
+ int FencesInserted = insertFences(MF, *Graph, CutEdges);
+ LLVM_DEBUG(dbgs() << "Inserting LFENCEs... Done\n");
+ LLVM_DEBUG(dbgs() << "Inserted " << FencesInserted << " fences\n");
+
+ return FencesInserted;
+}
+
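+// Insert one LFENCE along each cut edge: the fence is placed immediately after
+// the edge's source instruction, or immediately before it if that instruction
+// is a branch, or at the start of the entry block if the source is the
+// argument sentinel. Insertions that would produce two adjacent fences are
+// skipped. Returns the number of fences actually inserted.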
+int X86LoadValueInjectionLoadHardeningPass::insertFences(
+ MachineFunction &MF, MachineGadgetGraph &G,
+ EdgeSet &CutEdges /* in, out */) const {
+ int FencesInserted = 0;
+ for (const auto &N : G.nodes()) {
+ for (const auto &E : N.edges()) {
+ if (CutEdges.contains(E)) {
+ MachineInstr *MI = N.getValue(), *Prev;
+ MachineBasicBlock *MBB; // Insert an LFENCE in this MBB
+ MachineBasicBlock::iterator InsertionPt; // ...at this point
+ if (MI == MachineGadgetGraph::ArgNodeSentinel) {
+ // insert LFENCE at beginning of entry block
+ MBB = &MF.front();
+ InsertionPt = MBB->begin();
+ Prev = nullptr;
+ } else if (MI->isBranch()) { // insert the LFENCE before the branch
+ MBB = MI->getParent();
+ InsertionPt = MI;
+ Prev = MI->getPrevNode();
+ // Remove all egress CFG edges from this branch because the inserted
+ // LFENCE prevents gadgets from crossing the branch.
+ for (const auto &E : N.edges()) {
+ if (MachineGadgetGraph::isCFGEdge(E))
+ CutEdges.insert(E);
+ }
+ } else { // insert the LFENCE after the instruction
+ MBB = MI->getParent();
+ InsertionPt = MI->getNextNode() ? MI->getNextNode() : MBB->end();
+ Prev = InsertionPt == MBB->end()
+ ? (MBB->empty() ? nullptr : &MBB->back())
+ : InsertionPt->getPrevNode();
+ }
+ // Ensure this insertion is not redundant (two LFENCEs in sequence).
+ if ((InsertionPt == MBB->end() || !isFence(&*InsertionPt)) &&
+ (!Prev || !isFence(Prev))) {
+ BuildMI(*MBB, InsertionPt, DebugLoc(), TII->get(X86::LFENCE));
+ ++FencesInserted;
+ }
+ }
+ }
+ }
+ return FencesInserted;
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToAccessMemory(
+ const MachineInstr &MI, unsigned Reg) const {
+ if (!MI.mayLoadOrStore() || MI.getOpcode() == X86::MFENCE ||
+ MI.getOpcode() == X86::SFENCE || MI.getOpcode() == X86::LFENCE)
+ return false;
+
+ // FIXME: This does not handle pseudo loading instructions like TCRETURN*
+ const MCInstrDesc &Desc = MI.getDesc();
+ int MemRefBeginIdx = X86II::getMemoryOperandNo(Desc.TSFlags);
+ if (MemRefBeginIdx < 0) {
+ LLVM_DEBUG(dbgs() << "Warning: unable to obtain memory operand for loading "
+ "instruction:\n";
+ MI.print(dbgs()); dbgs() << '\n';);
+ return false;
+ }
+ MemRefBeginIdx += X86II::getOperandBias(Desc);
+
+ const MachineOperand &BaseMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrBaseReg);
+ const MachineOperand &IndexMO =
+ MI.getOperand(MemRefBeginIdx + X86::AddrIndexReg);
+ return (BaseMO.isReg() && BaseMO.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(BaseMO.getReg(), Reg)) ||
+ (IndexMO.isReg() && IndexMO.getReg() != X86::NoRegister &&
+ TRI->regsOverlap(IndexMO.getReg(), Reg));
+}
+
+bool X86LoadValueInjectionLoadHardeningPass::instrUsesRegToBranch(
+ const MachineInstr &MI, unsigned Reg) const {
+ if (!MI.isConditionalBranch())
+ return false;
+ for (const MachineOperand &Use : MI.uses())
+ if (Use.isReg() && Use.getReg() == Reg)
+ return true;
+ return false;
+}
+
+INITIALIZE_PASS_BEGIN(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+INITIALIZE_PASS_DEPENDENCY(MachineLoopInfo)
+INITIALIZE_PASS_DEPENDENCY(MachineDominatorTree)
+INITIALIZE_PASS_DEPENDENCY(MachineDominanceFrontier)
+INITIALIZE_PASS_END(X86LoadValueInjectionLoadHardeningPass, PASS_KEY,
+ "X86 LVI load hardening", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionLoadHardeningPass() {
+ return new X86LoadValueInjectionLoadHardeningPass();
+}
diff --git a/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
new file mode 100644
index 000000000000..6e1134a25950
--- /dev/null
+++ b/llvm/lib/Target/X86/X86LoadValueInjectionRetHardening.cpp
@@ -0,0 +1,143 @@
+//===-- X86LoadValueInjectionRetHardening.cpp - LVI RET hardening for x86 --==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+///
+/// Description: Replaces every `ret` instruction with the sequence:
+/// ```
+/// pop <scratch-reg>
+/// lfence
+/// jmp *<scratch-reg>
+/// ```
+/// where `<scratch-reg>` is some available scratch register, according to the
+/// calling convention of the function being mitigated.
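+///
+/// For example, if `rcx` happens to be selected as the scratch register, a
+/// plain `ret` is rewritten as:
+/// ```
+/// pop %rcx
+/// lfence
+/// jmp *%rcx
+/// ```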
+///
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrBuilder.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/IR/Function.h"
+#include "llvm/Support/Debug.h"
+#include <bitset>
+
+using namespace llvm;
+
+#define PASS_KEY "x86-lvi-ret"
+#define DEBUG_TYPE PASS_KEY
+
+STATISTIC(NumFences, "Number of LFENCEs inserted for LVI mitigation");
+STATISTIC(NumFunctionsConsidered, "Number of functions analyzed");
+STATISTIC(NumFunctionsMitigated, "Number of functions for which mitigations "
+ "were deployed");
+
+namespace {
+
+class X86LoadValueInjectionRetHardeningPass : public MachineFunctionPass {
+public:
+ X86LoadValueInjectionRetHardeningPass() : MachineFunctionPass(ID) {}
+ StringRef getPassName() const override {
+ return "X86 Load Value Injection (LVI) Ret-Hardening";
+ }
+ bool runOnMachineFunction(MachineFunction &MF) override;
+
+ static char ID;
+};
+
+} // end anonymous namespace
+
+char X86LoadValueInjectionRetHardeningPass::ID = 0;
+
+bool X86LoadValueInjectionRetHardeningPass::runOnMachineFunction(
+ MachineFunction &MF) {
+ LLVM_DEBUG(dbgs() << "***** " << getPassName() << " : " << MF.getName()
+ << " *****\n");
+ const X86Subtarget *Subtarget = &MF.getSubtarget<X86Subtarget>();
+ if (!Subtarget->useLVIControlFlowIntegrity() || !Subtarget->is64Bit())
+ return false; // FIXME: support 32-bit
+
+ // Don't skip functions with the "optnone" attribute, but do participate in
+ // opt-bisect.
+ const Function &F = MF.getFunction();
+ if (!F.hasOptNone() && skipFunction(F))
+ return false;
+
+ ++NumFunctionsConsidered;
+ const X86RegisterInfo *TRI = Subtarget->getRegisterInfo();
+ const X86InstrInfo *TII = Subtarget->getInstrInfo();
+ unsigned ClobberReg = X86::NoRegister;
+ std::bitset<X86::NUM_TARGET_REGS> UnclobberableGR64s;
+ UnclobberableGR64s.set(X86::RSP); // can't clobber stack pointer
+ UnclobberableGR64s.set(X86::RIP); // can't clobber instruction pointer
+ UnclobberableGR64s.set(X86::RAX); // used for function return
+ UnclobberableGR64s.set(X86::RDX); // used for function return
+
+ // We also cannot clobber any register that the function's calling convention
+ // requires us to preserve (i.e., its callee-saved registers).
+ for (const MCPhysReg *PR = TRI->getCalleeSavedRegs(&MF); auto Reg = *PR; ++PR)
+ UnclobberableGR64s.set(Reg);
+ for (auto &Reg : X86::GR64RegClass) {
+ if (!UnclobberableGR64s.test(Reg)) {
+ ClobberReg = Reg;
+ break;
+ }
+ }
+
+ if (ClobberReg != X86::NoRegister) {
+ LLVM_DEBUG(dbgs() << "Selected register "
+ << Subtarget->getRegisterInfo()->getRegAsmName(ClobberReg)
+ << " to clobber\n");
+ } else {
+ LLVM_DEBUG(dbgs() << "Could not find a register to clobber\n");
+ }
+
+ bool Modified = false;
+ for (auto &MBB : MF) {
+ if (MBB.empty())
+ continue;
+
+ MachineInstr &MI = MBB.back();
+ if (MI.getOpcode() != X86::RETQ)
+ continue;
+
+ if (ClobberReg != X86::NoRegister) {
+ MBB.erase_instr(&MI);
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::POP64r))
+ .addReg(ClobberReg, RegState::Define)
+ .setMIFlag(MachineInstr::FrameDestroy);
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::LFENCE));
+ BuildMI(MBB, MBB.end(), DebugLoc(), TII->get(X86::JMP64r))
+ .addReg(ClobberReg);
+ } else {
+ // In case there is no available scratch register, we can still read from
+ // RSP to assert that RSP points to a valid page. The write to RSP is
+ // also helpful because it verifies that the stack's write permissions
+ // are intact.
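+ // The resulting sequence is:
+ // shlq $0, (%rsp)
+ // lfence
+ // retq (the original ret is left in place)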
+ MachineInstr *Fence = BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
+ addRegOffset(BuildMI(MBB, Fence, DebugLoc(), TII->get(X86::SHL64mi)),
+ X86::RSP, false, 0)
+ .addImm(0)
+ ->addRegisterDead(X86::EFLAGS, TRI);
+ }
+
+ ++NumFences;
+ Modified = true;
+ }
+
+ if (Modified)
+ ++NumFunctionsMitigated;
+ return Modified;
+}
+
+INITIALIZE_PASS(X86LoadValueInjectionRetHardeningPass, PASS_KEY,
+ "X86 LVI ret hardener", false, false)
+
+FunctionPass *llvm::createX86LoadValueInjectionRetHardeningPass() {
+ return new X86LoadValueInjectionRetHardeningPass();
+}
diff --git a/llvm/lib/Target/X86/X86MCInstLower.cpp b/llvm/lib/Target/X86/X86MCInstLower.cpp
index 2fc9a2af01d7..9ce2a4637e2e 100644
--- a/llvm/lib/Target/X86/X86MCInstLower.cpp
+++ b/llvm/lib/Target/X86/X86MCInstLower.cpp
@@ -14,11 +14,12 @@
#include "MCTargetDesc/X86ATTInstPrinter.h"
#include "MCTargetDesc/X86BaseInfo.h"
#include "MCTargetDesc/X86InstComments.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
#include "MCTargetDesc/X86TargetStreamer.h"
-#include "Utils/X86ShuffleDecode.h"
#include "X86AsmPrinter.h"
#include "X86RegisterInfo.h"
#include "X86ShuffleDecodeConstantPool.h"
+#include "X86Subtarget.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/iterator_range.h"
@@ -43,6 +44,7 @@
#include "llvm/MC/MCSymbol.h"
#include "llvm/MC/MCSymbolELF.h"
#include "llvm/Target/TargetLoweringObjectFile.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
@@ -72,9 +74,30 @@ private:
} // end anonymous namespace
+/// A RAII helper which defines a region of instructions which can't have
+/// padding added between them for correctness.
+struct NoAutoPaddingScope {
+ MCStreamer &OS;
+ const bool OldAllowAutoPadding;
+ NoAutoPaddingScope(MCStreamer &OS)
+ : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
+ changeAndComment(false);
+ }
+ ~NoAutoPaddingScope() { changeAndComment(OldAllowAutoPadding); }
+ void changeAndComment(bool b) {
+ if (b == OS.getAllowAutoPadding())
+ return;
+ OS.setAllowAutoPadding(b);
+ if (b)
+ OS.emitRawComment("autopadding");
+ else
+ OS.emitRawComment("noautopadding");
+ }
+};
+
// Emit a minimal sequence of nops spanning NumBytes bytes.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
- const MCSubtargetInfo &STI);
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget);
void X86AsmPrinter::StackMapShadowTracker::count(MCInst &Inst,
const MCSubtargetInfo &STI,
@@ -94,13 +117,13 @@ void X86AsmPrinter::StackMapShadowTracker::emitShadowPadding(
MCStreamer &OutStreamer, const MCSubtargetInfo &STI) {
if (InShadow && CurrentShadowSize < RequiredShadowSize) {
InShadow = false;
- EmitNops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
- MF->getSubtarget<X86Subtarget>().is64Bit(), STI);
+ emitX86Nops(OutStreamer, RequiredShadowSize - CurrentShadowSize,
+ &MF->getSubtarget<X86Subtarget>());
}
}
void X86AsmPrinter::EmitAndCountInstruction(MCInst &Inst) {
- OutStreamer->EmitInstruction(Inst, getSubtargetInfo());
+ OutStreamer->emitInstruction(Inst, getSubtargetInfo());
SMShadowTracker.count(Inst, getSubtargetInfo(), CodeEmitter.get());
}
@@ -116,6 +139,10 @@ MachineModuleInfoMachO &X86MCInstLower::getMachOMMI() const {
/// GetSymbolFromOperand - Lower an MO_GlobalAddress or MO_ExternalSymbol
/// operand to an MCSymbol.
MCSymbol *X86MCInstLower::GetSymbolFromOperand(const MachineOperand &MO) const {
+ const Triple &TT = TM.getTargetTriple();
+ if (MO.isGlobal() && TT.isOSBinFormatELF())
+ return AsmPrinter.getSymbolPreferLocal(*MO.getGlobal());
+
const DataLayout &DL = MF.getDataLayout();
assert((MO.isGlobal() || MO.isSymbol() || MO.isMBB()) &&
"Isn't a symbol reference");
@@ -272,7 +299,7 @@ MCOperand X86MCInstLower::LowerSymbolOperand(const MachineOperand &MO,
// local labels. This is only safe when the symbols are in the same
// section so we are restricting it to jumptable references.
MCSymbol *Label = Ctx.createTempSymbol();
- AsmPrinter.OutStreamer->EmitAssignment(Label, Expr);
+ AsmPrinter.OutStreamer->emitAssignment(Label, Expr);
Expr = MCSymbolRefExpr::create(Label, Ctx);
}
break;
@@ -482,6 +509,26 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
"LEA has segment specified!");
break;
+ case X86::MULX32Hrr:
+ case X86::MULX32Hrm:
+ case X86::MULX64Hrr:
+ case X86::MULX64Hrm: {
+ // Turn into regular MULX by duplicating the destination.
+ unsigned NewOpc;
+ switch (OutMI.getOpcode()) {
+ default: llvm_unreachable("Invalid opcode");
+ case X86::MULX32Hrr: NewOpc = X86::MULX32rr; break;
+ case X86::MULX32Hrm: NewOpc = X86::MULX32rm; break;
+ case X86::MULX64Hrr: NewOpc = X86::MULX64rr; break;
+ case X86::MULX64Hrm: NewOpc = X86::MULX64rm; break;
+ }
+ OutMI.setOpcode(NewOpc);
+ // Duplicate the destination.
+ unsigned DestReg = OutMI.getOperand(0).getReg();
+ OutMI.insert(OutMI.begin(), MCOperand::createReg(DestReg));
+ break;
+ }
+
// Commute operands to get a smaller encoding by using VEX.R instead of VEX.B
// if one of the registers is extended, but other isn't.
case X86::VMOVZPQILo2PQIrr:
@@ -929,6 +976,7 @@ void X86MCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const {
void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
const MachineInstr &MI) {
+ NoAutoPaddingScope NoPadScope(*OutStreamer);
bool Is64Bits = MI.getOpcode() == X86::TLS_addr64 ||
MI.getOpcode() == X86::TLS_base_addr64;
MCContext &Ctx = OutStreamer->getContext();
@@ -1034,29 +1082,26 @@ void X86AsmPrinter::LowerTlsAddr(X86MCInstLower &MCInstLowering,
/// Return the longest nop which can be efficiently decoded for the given
 /// target cpu. 15 bytes is the longest single NOP instruction, but some
/// platforms can't decode the longest forms efficiently.
-static unsigned MaxLongNopLength(const MCSubtargetInfo &STI) {
- uint64_t MaxNopLength = 10;
- if (STI.getFeatureBits()[X86::ProcIntelSLM])
- MaxNopLength = 7;
- else if (STI.getFeatureBits()[X86::FeatureFast15ByteNOP])
- MaxNopLength = 15;
- else if (STI.getFeatureBits()[X86::FeatureFast11ByteNOP])
- MaxNopLength = 11;
- return MaxNopLength;
+static unsigned maxLongNopLength(const X86Subtarget *Subtarget) {
+ if (Subtarget->getFeatureBits()[X86::ProcIntelSLM])
+ return 7;
+ if (Subtarget->getFeatureBits()[X86::FeatureFast15ByteNOP])
+ return 15;
+ if (Subtarget->getFeatureBits()[X86::FeatureFast11ByteNOP])
+ return 11;
+ if (Subtarget->getFeatureBits()[X86::FeatureNOPL] || Subtarget->is64Bit())
+ return 10;
+ if (Subtarget->is32Bit())
+ return 2;
+ return 1;
}
/// Emit the largest nop instruction smaller than or equal to \p NumBytes
/// bytes. Return the size of nop emitted.
-static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
- const MCSubtargetInfo &STI) {
- if (!Is64Bit) {
- // TODO Do additional checking if the CPU supports multi-byte nops.
- OS.EmitInstruction(MCInstBuilder(X86::NOOP), STI);
- return 1;
- }
-
+static unsigned emitNop(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget) {
// Cap a single nop emission at the profitable value for the target
- NumBytes = std::min(NumBytes, MaxLongNopLength(STI));
+ NumBytes = std::min(NumBytes, maxLongNopLength(Subtarget));
unsigned NopSize;
unsigned Opc, BaseReg, ScaleVal, IndexReg, Displacement, SegmentReg;
@@ -1125,25 +1170,26 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
unsigned NumPrefixes = std::min(NumBytes - NopSize, 5U);
NopSize += NumPrefixes;
for (unsigned i = 0; i != NumPrefixes; ++i)
- OS.EmitBytes("\x66");
+ OS.emitBytes("\x66");
switch (Opc) {
default: llvm_unreachable("Unexpected opcode");
case X86::NOOP:
- OS.EmitInstruction(MCInstBuilder(Opc), STI);
+ OS.emitInstruction(MCInstBuilder(Opc), *Subtarget);
break;
case X86::XCHG16ar:
- OS.EmitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX), STI);
+ OS.emitInstruction(MCInstBuilder(Opc).addReg(X86::AX).addReg(X86::AX),
+ *Subtarget);
break;
case X86::NOOPL:
case X86::NOOPW:
- OS.EmitInstruction(MCInstBuilder(Opc)
+ OS.emitInstruction(MCInstBuilder(Opc)
.addReg(BaseReg)
.addImm(ScaleVal)
.addReg(IndexReg)
.addImm(Displacement)
.addReg(SegmentReg),
- STI);
+ *Subtarget);
break;
}
assert(NopSize <= NumBytes && "We overemitted?");
@@ -1151,39 +1197,16 @@ static unsigned EmitNop(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
}
/// Emit the optimal amount of multi-byte nops on X86.
-static void EmitNops(MCStreamer &OS, unsigned NumBytes, bool Is64Bit,
- const MCSubtargetInfo &STI) {
+static void emitX86Nops(MCStreamer &OS, unsigned NumBytes,
+ const X86Subtarget *Subtarget) {
unsigned NopsToEmit = NumBytes;
(void)NopsToEmit;
while (NumBytes) {
- NumBytes -= EmitNop(OS, NumBytes, Is64Bit, STI);
+ NumBytes -= emitNop(OS, NumBytes, Subtarget);
assert(NopsToEmit >= NumBytes && "Emitted more than I asked for!");
}
}
-/// A RAII helper which defines a region of instructions which can't have
-/// padding added between them for correctness.
-struct NoAutoPaddingScope {
- MCStreamer &OS;
- const bool OldAllowAutoPadding;
- NoAutoPaddingScope(MCStreamer &OS)
- : OS(OS), OldAllowAutoPadding(OS.getAllowAutoPadding()) {
- changeAndComment(false);
- }
- ~NoAutoPaddingScope() {
- changeAndComment(OldAllowAutoPadding);
- }
- void changeAndComment(bool b) {
- if (b == OS.getAllowAutoPadding())
- return;
- OS.setAllowAutoPadding(b);
- if (b)
- OS.emitRawComment("autopadding");
- else
- OS.emitRawComment("noautopadding");
- }
-};
-
void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
X86MCInstLower &MCIL) {
assert(Subtarget->is64Bit() && "Statepoint currently only supports X86-64");
@@ -1192,8 +1215,7 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
StatepointOpers SOpers(&MI);
if (unsigned PatchBytes = SOpers.getNumPatchBytes()) {
- EmitNops(*OutStreamer, PatchBytes, Subtarget->is64Bit(),
- getSubtargetInfo());
+ emitX86Nops(*OutStreamer, PatchBytes, Subtarget);
} else {
// Lower call target and choose correct opcode
const MachineOperand &CallTarget = SOpers.getCallTarget();
@@ -1220,8 +1242,8 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
break;
case MachineOperand::MO_Register:
// FIXME: Add retpoline support and remove this.
- if (Subtarget->useRetpolineIndirectCalls())
- report_fatal_error("Lowering register statepoints with retpoline not "
+ if (Subtarget->useIndirectThunkCalls())
+ report_fatal_error("Lowering register statepoints with thunks not "
"yet implemented.");
CallTargetMCOp = MCOperand::createReg(CallTarget.getReg());
CallOpcode = X86::CALL64r;
@@ -1235,14 +1257,14 @@ void X86AsmPrinter::LowerSTATEPOINT(const MachineInstr &MI,
MCInst CallInst;
CallInst.setOpcode(CallOpcode);
CallInst.addOperand(CallTargetMCOp);
- OutStreamer->EmitInstruction(CallInst, getSubtargetInfo());
+ OutStreamer->emitInstruction(CallInst, getSubtargetInfo());
}
// Record our statepoint node in the same section used by STACKMAP
// and PATCHPOINT
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordStatepoint(*MILabel, MI);
}
@@ -1262,7 +1284,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
auto &Ctx = OutStreamer->getContext();
MCSymbol *FaultingLabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(FaultingLabel);
+ OutStreamer->emitLabel(FaultingLabel);
assert(FK < FaultMaps::FaultKindMax && "Invalid Faulting Kind!");
FM.recordFaultingOp(FK, FaultingLabel, HandlerLabel);
@@ -1280,7 +1302,7 @@ void X86AsmPrinter::LowerFAULTING_OP(const MachineInstr &FaultingMI,
MI.addOperand(MaybeOperand.getValue());
OutStreamer->AddComment("on-fault: " + HandlerLabel->getName());
- OutStreamer->EmitInstruction(MI, getSubtargetInfo());
+ OutStreamer->emitInstruction(MI, getSubtargetInfo());
}
void X86AsmPrinter::LowerFENTRY_CALL(const MachineInstr &MI,
@@ -1317,7 +1339,17 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
CodeEmitter->encodeInstruction(MCI, VecOS, Fixups, getSubtargetInfo());
if (Code.size() < MinSize) {
- if (MinSize == 2 && Opcode == X86::PUSH64r) {
+ if (MinSize == 2 && Subtarget->is32Bit() &&
+ Subtarget->isTargetWindowsMSVC() &&
+ (Subtarget->getCPU().empty() || Subtarget->getCPU() == "pentium3")) {
+ // For compatibility reasons, when targeting MSVC, it is important to
+ // generate a 'legacy' NOP in the form of an 8B FF MOV EDI, EDI. Some tools
+ // rely specifically on this pattern to be able to patch a function.
+ // This is only for 32-bit targets, when using /arch:IA32 or /arch:SSE.
+ OutStreamer->emitInstruction(
+ MCInstBuilder(X86::MOV32rr_REV).addReg(X86::EDI).addReg(X86::EDI),
+ *Subtarget);
+ } else if (MinSize == 2 && Opcode == X86::PUSH64r) {
// This is an optimization that lets us get away without emitting a nop in
// many cases.
//
@@ -1325,14 +1357,13 @@ void X86AsmPrinter::LowerPATCHABLE_OP(const MachineInstr &MI,
// bytes too, so the check on MinSize is important.
MCI.setOpcode(X86::PUSH64rmr);
} else {
- unsigned NopSize = EmitNop(*OutStreamer, MinSize, Subtarget->is64Bit(),
- getSubtargetInfo());
+ unsigned NopSize = emitNop(*OutStreamer, MinSize, Subtarget);
assert(NopSize == MinSize && "Could not implement MinSize!");
(void)NopSize;
}
}
- OutStreamer->EmitInstruction(MCI, getSubtargetInfo());
+ OutStreamer->emitInstruction(MCI, getSubtargetInfo());
}
// Lower a stackmap of the form:
@@ -1342,7 +1373,7 @@ void X86AsmPrinter::LowerSTACKMAP(const MachineInstr &MI) {
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordStackMap(*MILabel, MI);
unsigned NumShadowBytes = MI.getOperand(1).getImm();
@@ -1361,7 +1392,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
auto &Ctx = OutStreamer->getContext();
MCSymbol *MILabel = Ctx.createTempSymbol();
- OutStreamer->EmitLabel(MILabel);
+ OutStreamer->emitLabel(MILabel);
SM.recordPatchPoint(*MILabel, MI);
PatchPointOpers opers(&MI);
@@ -1399,9 +1430,9 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::MOV64ri).addReg(ScratchReg).addOperand(CalleeMCOp));
// FIXME: Add retpoline support and remove this.
- if (Subtarget->useRetpolineIndirectCalls())
+ if (Subtarget->useIndirectThunkCalls())
report_fatal_error(
- "Lowering patchpoint with retpoline not yet implemented.");
+ "Lowering patchpoint with thunks not yet implemented.");
EmitAndCountInstruction(MCInstBuilder(X86::CALL64r).addReg(ScratchReg));
}
@@ -1410,8 +1441,7 @@ void X86AsmPrinter::LowerPATCHPOINT(const MachineInstr &MI,
assert(NumBytes >= EncodedBytes &&
"Patchpoint can't request size less than the length of a call.");
- EmitNops(*OutStreamer, NumBytes - EncodedBytes, Subtarget->is64Bit(),
- getSubtargetInfo());
+ emitX86Nops(*OutStreamer, NumBytes - EncodedBytes, Subtarget);
}
void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
@@ -1442,13 +1472,13 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_event_sled_", true);
OutStreamer->AddComment("# XRay Custom Event Log");
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
 // FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBinaryData("\xeb\x0f");
+ OutStreamer->emitBinaryData("\xeb\x0f");
// The default C calling convention will place two arguments into %rcx and
// %rdx -- so we only work with those.
@@ -1471,7 +1501,7 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
- EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 4, Subtarget);
}
}
@@ -1500,14 +1530,14 @@ void X86AsmPrinter::LowerPATCHABLE_EVENT_CALL(const MachineInstr &MI,
if (UsedMask[I])
EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
- EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 1, Subtarget);
OutStreamer->AddComment("xray custom event end.");
- // Record the sled version. Older versions of this sled were spelled
- // differently, so we let the runtime handle the different offsets we're
- // using.
- recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 1);
+ // Record the sled version. Version 0 of this sled was spelled differently, so
+ // we let the runtime handle the different offsets we're using. Version 2
+ // changed the absolute address to a PC-relative address.
+ recordSled(CurSled, MI, SledKind::CUSTOM_EVENT, 2);
}
void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
@@ -1538,13 +1568,13 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
// First we emit the label and the jump.
auto CurSled = OutContext.createTempSymbol("xray_typed_event_sled_", true);
OutStreamer->AddComment("# XRay Typed Event Log");
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
 // FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBinaryData("\xeb\x14");
+ OutStreamer->emitBinaryData("\xeb\x14");
// An x86-64 convention may place three arguments into %rcx, %rdx, and R8,
// so we'll work with those. Or we may be called via SystemV, in which case
@@ -1569,7 +1599,7 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
EmitAndCountInstruction(
MCInstBuilder(X86::PUSH64r).addReg(DestRegs[I]));
} else {
- EmitNops(*OutStreamer, 4, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 4, Subtarget);
}
}
@@ -1603,12 +1633,12 @@ void X86AsmPrinter::LowerPATCHABLE_TYPED_EVENT_CALL(const MachineInstr &MI,
if (UsedMask[I])
EmitAndCountInstruction(MCInstBuilder(X86::POP64r).addReg(DestRegs[I]));
else
- EmitNops(*OutStreamer, 1, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 1, Subtarget);
OutStreamer->AddComment("xray typed event end.");
// Record the sled version.
- recordSled(CurSled, MI, SledKind::TYPED_EVENT, 0);
+ recordSled(CurSled, MI, SledKind::TYPED_EVENT, 2);
}
void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
@@ -1623,7 +1653,7 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
.getValueAsString()
.getAsInteger(10, Num))
return;
- EmitNops(*OutStreamer, Num, Subtarget->is64Bit(), getSubtargetInfo());
+ emitX86Nops(*OutStreamer, Num, Subtarget);
return;
}
// We want to emit the following pattern:
@@ -1640,15 +1670,15 @@ void X86AsmPrinter::LowerPATCHABLE_FUNCTION_ENTER(const MachineInstr &MI,
// call <relative offset, 32-bits> // 5 bytes
//
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
 // FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBytes("\xeb\x09");
- EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
- recordSled(CurSled, MI, SledKind::FUNCTION_ENTER);
+ OutStreamer->emitBytes("\xeb\x09");
+ emitX86Nops(*OutStreamer, 9, Subtarget);
+ recordSled(CurSled, MI, SledKind::FUNCTION_ENTER, 2);
}
void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
@@ -1670,17 +1700,17 @@ void X86AsmPrinter::LowerPATCHABLE_RET(const MachineInstr &MI,
//
// This just makes sure that the alignment for the next instruction is 2.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
unsigned OpCode = MI.getOperand(0).getImm();
MCInst Ret;
Ret.setOpcode(OpCode);
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
Ret.addOperand(MaybeOperand.getValue());
- OutStreamer->EmitInstruction(Ret, getSubtargetInfo());
- EmitNops(*OutStreamer, 10, Subtarget->is64Bit(), getSubtargetInfo());
- recordSled(CurSled, MI, SledKind::FUNCTION_EXIT);
+ OutStreamer->emitInstruction(Ret, getSubtargetInfo());
+ emitX86Nops(*OutStreamer, 10, Subtarget);
+ recordSled(CurSled, MI, SledKind::FUNCTION_EXIT, 2);
}
void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
@@ -1694,17 +1724,17 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
// the PATCHABLE_FUNCTION_ENTER case, followed by the lowering of the actual
// tail call much like how we have it in PATCHABLE_RET.
auto CurSled = OutContext.createTempSymbol("xray_sled_", true);
- OutStreamer->EmitCodeAlignment(2);
- OutStreamer->EmitLabel(CurSled);
+ OutStreamer->emitCodeAlignment(2);
+ OutStreamer->emitLabel(CurSled);
auto Target = OutContext.createTempSymbol();
// Use a two-byte `jmp`. This version of JMP takes an 8-bit relative offset as
// an operand (computed as an offset from the jmp instruction).
 // FIXME: Find another less hacky way to force the relative jump.
- OutStreamer->EmitBytes("\xeb\x09");
- EmitNops(*OutStreamer, 9, Subtarget->is64Bit(), getSubtargetInfo());
- OutStreamer->EmitLabel(Target);
- recordSled(CurSled, MI, SledKind::TAIL_CALL);
+ OutStreamer->emitBytes("\xeb\x09");
+ emitX86Nops(*OutStreamer, 9, Subtarget);
+ OutStreamer->emitLabel(Target);
+ recordSled(CurSled, MI, SledKind::TAIL_CALL, 2);
unsigned OpCode = MI.getOperand(0).getImm();
OpCode = convertTailJumpOpcode(OpCode);
@@ -1717,7 +1747,7 @@ void X86AsmPrinter::LowerPATCHABLE_TAIL_CALL(const MachineInstr &MI,
for (auto &MO : make_range(MI.operands_begin() + 1, MI.operands_end()))
if (auto MaybeOperand = MCIL.LowerMachineOperand(&MI, MO))
TC.addOperand(MaybeOperand.getValue());
- OutStreamer->EmitInstruction(TC, getSubtargetInfo());
+ OutStreamer->emitInstruction(TC, getSubtargetInfo());
}
// Returns instruction preceding MBBI in MachineFunction.
@@ -1961,281 +1991,9 @@ static unsigned getRegisterWidth(const MCOperandInfo &Info) {
llvm_unreachable("Unknown register class!");
}
-void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
- X86MCInstLower MCInstLowering(*MF, *this);
- const X86RegisterInfo *RI =
- MF->getSubtarget<X86Subtarget>().getRegisterInfo();
-
- // Add a comment about EVEX-2-VEX compression for AVX-512 instrs that
- // are compressed from EVEX encoding to VEX encoding.
- if (TM.Options.MCOptions.ShowMCEncoding) {
- if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
- OutStreamer->AddComment("EVEX TO VEX Compression ", false);
- }
-
+static void addConstantComments(const MachineInstr *MI,
+ MCStreamer &OutStreamer) {
switch (MI->getOpcode()) {
- case TargetOpcode::DBG_VALUE:
- llvm_unreachable("Should be handled target independently");
-
- // Emit nothing here but a comment if we can.
- case X86::Int_MemBarrier:
- OutStreamer->emitRawComment("MEMBARRIER");
- return;
-
- case X86::EH_RETURN:
- case X86::EH_RETURN64: {
- // Lower these as normal, but add some comments.
- Register Reg = MI->getOperand(0).getReg();
- OutStreamer->AddComment(StringRef("eh_return, addr: %") +
- X86ATTInstPrinter::getRegisterName(Reg));
- break;
- }
- case X86::CLEANUPRET: {
- // Lower these as normal, but add some comments.
- OutStreamer->AddComment("CLEANUPRET");
- break;
- }
-
- case X86::CATCHRET: {
- // Lower these as normal, but add some comments.
- OutStreamer->AddComment("CATCHRET");
- break;
- }
-
- case X86::TAILJMPr:
- case X86::TAILJMPm:
- case X86::TAILJMPd:
- case X86::TAILJMPd_CC:
- case X86::TAILJMPr64:
- case X86::TAILJMPm64:
- case X86::TAILJMPd64:
- case X86::TAILJMPd64_CC:
- case X86::TAILJMPr64_REX:
- case X86::TAILJMPm64_REX:
- // Lower these as normal, but add some comments.
- OutStreamer->AddComment("TAILCALL");
- break;
-
- case X86::TLS_addr32:
- case X86::TLS_addr64:
- case X86::TLS_base_addr32:
- case X86::TLS_base_addr64:
- return LowerTlsAddr(MCInstLowering, *MI);
-
- // Loading/storing mask pairs requires two kmov operations. The second one of these
- // needs a 2 byte displacement relative to the specified address (with 32 bit spill
- // size). The pairs of 1bit masks up to 16 bit masks all use the same spill size,
- // they all are stored using MASKPAIR16STORE, loaded using MASKPAIR16LOAD.
- //
- // The displacement value might wrap around in theory, thus the asserts in both
- // cases.
- case X86::MASKPAIR16LOAD: {
- int64_t Disp = MI->getOperand(1 + X86::AddrDisp).getImm();
- assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- Register Reg = MI->getOperand(0).getReg();
- Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
-
- // Load the first mask register
- MCInstBuilder MIB = MCInstBuilder(X86::KMOVWkm);
- MIB.addReg(Reg0);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
- MIB.addOperand(Op.getValue());
- }
- EmitAndCountInstruction(MIB);
-
- // Load the second mask register of the pair
- MIB = MCInstBuilder(X86::KMOVWkm);
- MIB.addReg(Reg1);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp) {
- MIB.addImm(Disp + 2);
- } else {
- auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(1 + i));
- MIB.addOperand(Op.getValue());
- }
- }
- EmitAndCountInstruction(MIB);
- return;
- }
-
- case X86::MASKPAIR16STORE: {
- int64_t Disp = MI->getOperand(X86::AddrDisp).getImm();
- assert(Disp >= 0 && Disp <= INT32_MAX - 2 && "Unexpected displacement");
- Register Reg = MI->getOperand(X86::AddrNumOperands).getReg();
- Register Reg0 = RI->getSubReg(Reg, X86::sub_mask_0);
- Register Reg1 = RI->getSubReg(Reg, X86::sub_mask_1);
-
- // Store the first mask register
- MCInstBuilder MIB = MCInstBuilder(X86::KMOVWmk);
- for (int i = 0; i < X86::AddrNumOperands; ++i)
- MIB.addOperand(MCInstLowering.LowerMachineOperand(MI, MI->getOperand(i)).getValue());
- MIB.addReg(Reg0);
- EmitAndCountInstruction(MIB);
-
- // Store the second mask register of the pair
- MIB = MCInstBuilder(X86::KMOVWmk);
- for (int i = 0; i < X86::AddrNumOperands; ++i) {
- if (i == X86::AddrDisp) {
- MIB.addImm(Disp + 2);
- } else {
- auto Op = MCInstLowering.LowerMachineOperand(MI, MI->getOperand(0 + i));
- MIB.addOperand(Op.getValue());
- }
- }
- MIB.addReg(Reg1);
- EmitAndCountInstruction(MIB);
- return;
- }
-
- case X86::MOVPC32r: {
- // This is a pseudo op for a two instruction sequence with a label, which
- // looks like:
- // call "L1$pb"
- // "L1$pb":
- // popl %esi
-
- // Emit the call.
- MCSymbol *PICBase = MF->getPICBaseSymbol();
- // FIXME: We would like an efficient form for this, so we don't have to do a
- // lot of extra uniquing.
- EmitAndCountInstruction(
- MCInstBuilder(X86::CALLpcrel32)
- .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
-
- const X86FrameLowering *FrameLowering =
- MF->getSubtarget<X86Subtarget>().getFrameLowering();
- bool hasFP = FrameLowering->hasFP(*MF);
-
- // TODO: This is needed only if we require precise CFA.
- bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
- !OutStreamer->getDwarfFrameInfos().back().End;
-
- int stackGrowth = -RI->getSlotSize();
-
- if (HasActiveDwarfFrame && !hasFP) {
- OutStreamer->EmitCFIAdjustCfaOffset(-stackGrowth);
- }
-
- // Emit the label.
- OutStreamer->EmitLabel(PICBase);
-
- // popl $reg
- EmitAndCountInstruction(
- MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
-
- if (HasActiveDwarfFrame && !hasFP) {
- OutStreamer->EmitCFIAdjustCfaOffset(stackGrowth);
- }
- return;
- }
-
- case X86::ADD32ri: {
- // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
- if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
- break;
-
- // Okay, we have something like:
- // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
-
- // For this, we want to print something like:
- // MYGLOBAL + (. - PICBASE)
- // However, we can't generate a ".", so just emit a new label here and refer
- // to it.
- MCSymbol *DotSym = OutContext.createTempSymbol();
- OutStreamer->EmitLabel(DotSym);
-
- // Now that we have emitted the label, lower the complex operand expression.
- MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
-
- const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
- const MCExpr *PICBase =
- MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
- DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
-
- DotExpr = MCBinaryExpr::createAdd(
- MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
-
- EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
- .addReg(MI->getOperand(0).getReg())
- .addReg(MI->getOperand(1).getReg())
- .addExpr(DotExpr));
- return;
- }
- case TargetOpcode::STATEPOINT:
- return LowerSTATEPOINT(*MI, MCInstLowering);
-
- case TargetOpcode::FAULTING_OP:
- return LowerFAULTING_OP(*MI, MCInstLowering);
-
- case TargetOpcode::FENTRY_CALL:
- return LowerFENTRY_CALL(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_OP:
- return LowerPATCHABLE_OP(*MI, MCInstLowering);
-
- case TargetOpcode::STACKMAP:
- return LowerSTACKMAP(*MI);
-
- case TargetOpcode::PATCHPOINT:
- return LowerPATCHPOINT(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
- return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_RET:
- return LowerPATCHABLE_RET(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_TAIL_CALL:
- return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_EVENT_CALL:
- return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
-
- case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
- return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
-
- case X86::MORESTACK_RET:
- EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
- return;
-
- case X86::MORESTACK_RET_RESTORE_R10:
- // Return, then restore R10.
- EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
- EmitAndCountInstruction(
- MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
- return;
-
- case X86::SEH_PushReg:
- case X86::SEH_SaveReg:
- case X86::SEH_SaveXMM:
- case X86::SEH_StackAlloc:
- case X86::SEH_StackAlign:
- case X86::SEH_SetFrame:
- case X86::SEH_PushFrame:
- case X86::SEH_EndPrologue:
- EmitSEHInstruction(MI);
- return;
-
- case X86::SEH_Epilogue: {
- assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
- MachineBasicBlock::const_iterator MBBI(MI);
- // Check if preceded by a call and emit nop if so.
- for (MBBI = PrevCrossBBInst(MBBI);
- MBBI != MachineBasicBlock::const_iterator();
- MBBI = PrevCrossBBInst(MBBI)) {
- // Conservatively assume that pseudo instructions don't emit code and keep
- // looking for a call. We may emit an unnecessary nop in some cases.
- if (!MBBI->isPseudo()) {
- if (MBBI->isCall())
- EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
- break;
- }
- }
- return;
- }
-
// Lower PSHUFB and VPERMILP normally but add a comment if we can find
// a constant shuffle mask. We won't be able to do this at the MC layer
// because the mask isn't an immediate.
@@ -2251,30 +2009,19 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPSHUFBZrm:
case X86::VPSHUFBZrmk:
case X86::VPSHUFBZrmkz: {
- if (!OutStreamer->isVerboseAsm())
- break;
- unsigned SrcIdx, MaskIdx;
- switch (MI->getOpcode()) {
- default: llvm_unreachable("Invalid opcode");
- case X86::PSHUFBrm:
- case X86::VPSHUFBrm:
- case X86::VPSHUFBYrm:
- case X86::VPSHUFBZ128rm:
- case X86::VPSHUFBZ256rm:
- case X86::VPSHUFBZrm:
- SrcIdx = 1; MaskIdx = 5; break;
- case X86::VPSHUFBZ128rmkz:
- case X86::VPSHUFBZ256rmkz:
- case X86::VPSHUFBZrmkz:
- SrcIdx = 2; MaskIdx = 6; break;
- case X86::VPSHUFBZ128rmk:
- case X86::VPSHUFBZ256rmk:
- case X86::VPSHUFBZrmk:
- SrcIdx = 3; MaskIdx = 7; break;
+ unsigned SrcIdx = 1;
+ if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+ // Skip mask operand.
+ ++SrcIdx;
+ if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+ // Skip passthru operand.
+ ++SrcIdx;
+ }
}
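+    // The mask is loaded from the constant pool through the displacement
+    // operand of the trailing memory reference. For PSHUFBrm the operands are
+    //   dst(0), src(1), base, scale, index, disp, segment
+    // so the mask reference is operand SrcIdx + 1 + X86::AddrDisp.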
+ unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
- assert(MI->getNumOperands() >= 6 &&
- "We should always have at least 6 operands!");
+ assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
@@ -2282,7 +2029,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 64> Mask;
DecodePSHUFBMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -2309,9 +2056,6 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMILPDZrm:
case X86::VPERMILPDZrmk:
case X86::VPERMILPDZrmkz: {
- if (!OutStreamer->isVerboseAsm())
- break;
- unsigned SrcIdx, MaskIdx;
unsigned ElSize;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
@@ -2320,33 +2064,42 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMILPSZ128rm:
case X86::VPERMILPSZ256rm:
case X86::VPERMILPSZrm:
- SrcIdx = 1; MaskIdx = 5; ElSize = 32; break;
case X86::VPERMILPSZ128rmkz:
case X86::VPERMILPSZ256rmkz:
case X86::VPERMILPSZrmkz:
- SrcIdx = 2; MaskIdx = 6; ElSize = 32; break;
case X86::VPERMILPSZ128rmk:
case X86::VPERMILPSZ256rmk:
case X86::VPERMILPSZrmk:
- SrcIdx = 3; MaskIdx = 7; ElSize = 32; break;
+ ElSize = 32;
+ break;
case X86::VPERMILPDrm:
case X86::VPERMILPDYrm:
case X86::VPERMILPDZ128rm:
case X86::VPERMILPDZ256rm:
case X86::VPERMILPDZrm:
- SrcIdx = 1; MaskIdx = 5; ElSize = 64; break;
case X86::VPERMILPDZ128rmkz:
case X86::VPERMILPDZ256rmkz:
case X86::VPERMILPDZrmkz:
- SrcIdx = 2; MaskIdx = 6; ElSize = 64; break;
case X86::VPERMILPDZ128rmk:
case X86::VPERMILPDZ256rmk:
case X86::VPERMILPDZrmk:
- SrcIdx = 3; MaskIdx = 7; ElSize = 64; break;
+ ElSize = 64;
+ break;
+ }
+
+ unsigned SrcIdx = 1;
+ if (X86II::isKMasked(MI->getDesc().TSFlags)) {
+ // Skip mask operand.
+ ++SrcIdx;
+ if (X86II::isKMergeMasked(MI->getDesc().TSFlags)) {
+ // Skip passthru operand.
+ ++SrcIdx;
+ }
}
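+    // As above, the mask constant lives in the displacement operand of the
+    // memory reference that follows the source operand(s).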
+ unsigned MaskIdx = SrcIdx + 1 + X86::AddrDisp;
- assert(MI->getNumOperands() >= 6 &&
- "We should always have at least 6 operands!");
+ assert(MI->getNumOperands() >= (SrcIdx + 1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
const MachineOperand &MaskOp = MI->getOperand(MaskIdx);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
@@ -2354,7 +2107,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
SmallVector<int, 16> Mask;
DecodeVPERMILPMask(C, ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, SrcIdx, SrcIdx, Mask));
}
break;
}
@@ -2363,10 +2116,8 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMIL2PSrm:
case X86::VPERMIL2PDYrm:
case X86::VPERMIL2PSYrm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- assert(MI->getNumOperands() >= 8 &&
- "We should always have at least 8 operands!");
+ assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands + 1) &&
+ "Unexpected number of operands!");
const MachineOperand &CtrlOp = MI->getOperand(MI->getNumOperands() - 1);
if (!CtrlOp.isImm())
@@ -2379,47 +2130,43 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VPERMIL2PDrm: case X86::VPERMIL2PDYrm: ElSize = 64; break;
}
- const MachineOperand &MaskOp = MI->getOperand(6);
+ const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPERMIL2PMask(C, (unsigned)CtrlOp.getImm(), ElSize, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::VPPERMrrm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- assert(MI->getNumOperands() >= 7 &&
- "We should always have at least 7 operands!");
+ assert(MI->getNumOperands() >= (3 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
- const MachineOperand &MaskOp = MI->getOperand(6);
+ const MachineOperand &MaskOp = MI->getOperand(3 + X86::AddrDisp);
if (auto *C = getConstantFromPool(*MI, MaskOp)) {
unsigned Width = getRegisterWidth(MI->getDesc().OpInfo[0]);
SmallVector<int, 16> Mask;
DecodeVPPERMMask(C, Width, Mask);
if (!Mask.empty())
- OutStreamer->AddComment(getShuffleComment(MI, 1, 2, Mask));
+ OutStreamer.AddComment(getShuffleComment(MI, 1, 2, Mask));
}
break;
}
case X86::MMX_MOVQ64rm: {
- if (!OutStreamer->isVerboseAsm())
- break;
- if (MI->getNumOperands() <= 4)
- break;
- if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ assert(MI->getNumOperands() == (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
std::string Comment;
raw_string_ostream CS(Comment);
const MachineOperand &DstOp = MI->getOperand(0);
CS << X86ATTInstPrinter::getRegisterName(DstOp.getReg()) << " = ";
if (auto *CF = dyn_cast<ConstantFP>(C)) {
CS << "0x" << CF->getValueAPF().bitcastToAPInt().toString(16, false);
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
}
}
break;
@@ -2470,11 +2217,9 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
case X86::VBROADCASTI64X2Z128rm:
case X86::VBROADCASTI64X2rm:
case X86::VBROADCASTI64X4rm:
- if (!OutStreamer->isVerboseAsm())
- break;
- if (MI->getNumOperands() <= 4)
- break;
- if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
int NumLanes = 1;
// Override NumLanes for the broadcast instructions.
switch (MI->getOpcode()) {
@@ -2516,7 +2261,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << "]";
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
} else if (auto *CV = dyn_cast<ConstantVector>(C)) {
CS << "<";
for (int l = 0; l != NumLanes; ++l) {
@@ -2528,80 +2273,79 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
}
}
CS << ">";
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
}
}
break;
+
case X86::MOVDDUPrm:
case X86::VMOVDDUPrm:
case X86::VMOVDDUPZ128rm:
case X86::VBROADCASTSSrm:
case X86::VBROADCASTSSYrm:
- case X86::VBROADCASTSSZ128m:
- case X86::VBROADCASTSSZ256m:
- case X86::VBROADCASTSSZm:
+ case X86::VBROADCASTSSZ128rm:
+ case X86::VBROADCASTSSZ256rm:
+ case X86::VBROADCASTSSZrm:
case X86::VBROADCASTSDYrm:
- case X86::VBROADCASTSDZ256m:
- case X86::VBROADCASTSDZm:
+ case X86::VBROADCASTSDZ256rm:
+ case X86::VBROADCASTSDZrm:
case X86::VPBROADCASTBrm:
case X86::VPBROADCASTBYrm:
- case X86::VPBROADCASTBZ128m:
- case X86::VPBROADCASTBZ256m:
- case X86::VPBROADCASTBZm:
+ case X86::VPBROADCASTBZ128rm:
+ case X86::VPBROADCASTBZ256rm:
+ case X86::VPBROADCASTBZrm:
case X86::VPBROADCASTDrm:
case X86::VPBROADCASTDYrm:
- case X86::VPBROADCASTDZ128m:
- case X86::VPBROADCASTDZ256m:
- case X86::VPBROADCASTDZm:
+ case X86::VPBROADCASTDZ128rm:
+ case X86::VPBROADCASTDZ256rm:
+ case X86::VPBROADCASTDZrm:
case X86::VPBROADCASTQrm:
case X86::VPBROADCASTQYrm:
- case X86::VPBROADCASTQZ128m:
- case X86::VPBROADCASTQZ256m:
- case X86::VPBROADCASTQZm:
+ case X86::VPBROADCASTQZ128rm:
+ case X86::VPBROADCASTQZ256rm:
+ case X86::VPBROADCASTQZrm:
case X86::VPBROADCASTWrm:
case X86::VPBROADCASTWYrm:
- case X86::VPBROADCASTWZ128m:
- case X86::VPBROADCASTWZ256m:
- case X86::VPBROADCASTWZm:
- if (!OutStreamer->isVerboseAsm())
- break;
- if (MI->getNumOperands() <= 4)
- break;
- if (auto *C = getConstantFromPool(*MI, MI->getOperand(4))) {
+ case X86::VPBROADCASTWZ128rm:
+ case X86::VPBROADCASTWZ256rm:
+ case X86::VPBROADCASTWZrm:
+ assert(MI->getNumOperands() >= (1 + X86::AddrNumOperands) &&
+ "Unexpected number of operands!");
+ if (auto *C = getConstantFromPool(*MI, MI->getOperand(1 + X86::AddrDisp))) {
int NumElts;
switch (MI->getOpcode()) {
default: llvm_unreachable("Invalid opcode");
- case X86::MOVDDUPrm: NumElts = 2; break;
- case X86::VMOVDDUPrm: NumElts = 2; break;
- case X86::VMOVDDUPZ128rm: NumElts = 2; break;
- case X86::VBROADCASTSSrm: NumElts = 4; break;
- case X86::VBROADCASTSSYrm: NumElts = 8; break;
- case X86::VBROADCASTSSZ128m: NumElts = 4; break;
- case X86::VBROADCASTSSZ256m: NumElts = 8; break;
- case X86::VBROADCASTSSZm: NumElts = 16; break;
- case X86::VBROADCASTSDYrm: NumElts = 4; break;
- case X86::VBROADCASTSDZ256m: NumElts = 4; break;
- case X86::VBROADCASTSDZm: NumElts = 8; break;
- case X86::VPBROADCASTBrm: NumElts = 16; break;
- case X86::VPBROADCASTBYrm: NumElts = 32; break;
- case X86::VPBROADCASTBZ128m: NumElts = 16; break;
- case X86::VPBROADCASTBZ256m: NumElts = 32; break;
- case X86::VPBROADCASTBZm: NumElts = 64; break;
- case X86::VPBROADCASTDrm: NumElts = 4; break;
- case X86::VPBROADCASTDYrm: NumElts = 8; break;
- case X86::VPBROADCASTDZ128m: NumElts = 4; break;
- case X86::VPBROADCASTDZ256m: NumElts = 8; break;
- case X86::VPBROADCASTDZm: NumElts = 16; break;
- case X86::VPBROADCASTQrm: NumElts = 2; break;
- case X86::VPBROADCASTQYrm: NumElts = 4; break;
- case X86::VPBROADCASTQZ128m: NumElts = 2; break;
- case X86::VPBROADCASTQZ256m: NumElts = 4; break;
- case X86::VPBROADCASTQZm: NumElts = 8; break;
- case X86::VPBROADCASTWrm: NumElts = 8; break;
- case X86::VPBROADCASTWYrm: NumElts = 16; break;
- case X86::VPBROADCASTWZ128m: NumElts = 8; break;
- case X86::VPBROADCASTWZ256m: NumElts = 16; break;
- case X86::VPBROADCASTWZm: NumElts = 32; break;
+ case X86::MOVDDUPrm: NumElts = 2; break;
+ case X86::VMOVDDUPrm: NumElts = 2; break;
+ case X86::VMOVDDUPZ128rm: NumElts = 2; break;
+ case X86::VBROADCASTSSrm: NumElts = 4; break;
+ case X86::VBROADCASTSSYrm: NumElts = 8; break;
+ case X86::VBROADCASTSSZ128rm: NumElts = 4; break;
+ case X86::VBROADCASTSSZ256rm: NumElts = 8; break;
+ case X86::VBROADCASTSSZrm: NumElts = 16; break;
+ case X86::VBROADCASTSDYrm: NumElts = 4; break;
+ case X86::VBROADCASTSDZ256rm: NumElts = 4; break;
+ case X86::VBROADCASTSDZrm: NumElts = 8; break;
+ case X86::VPBROADCASTBrm: NumElts = 16; break;
+ case X86::VPBROADCASTBYrm: NumElts = 32; break;
+ case X86::VPBROADCASTBZ128rm: NumElts = 16; break;
+ case X86::VPBROADCASTBZ256rm: NumElts = 32; break;
+ case X86::VPBROADCASTBZrm: NumElts = 64; break;
+ case X86::VPBROADCASTDrm: NumElts = 4; break;
+ case X86::VPBROADCASTDYrm: NumElts = 8; break;
+ case X86::VPBROADCASTDZ128rm: NumElts = 4; break;
+ case X86::VPBROADCASTDZ256rm: NumElts = 8; break;
+ case X86::VPBROADCASTDZrm: NumElts = 16; break;
+ case X86::VPBROADCASTQrm: NumElts = 2; break;
+ case X86::VPBROADCASTQYrm: NumElts = 4; break;
+ case X86::VPBROADCASTQZ128rm: NumElts = 2; break;
+ case X86::VPBROADCASTQZ256rm: NumElts = 4; break;
+ case X86::VPBROADCASTQZrm: NumElts = 8; break;
+ case X86::VPBROADCASTWrm: NumElts = 8; break;
+ case X86::VPBROADCASTWYrm: NumElts = 16; break;
+ case X86::VPBROADCASTWZ128rm: NumElts = 8; break;
+ case X86::VPBROADCASTWZ256rm: NumElts = 16; break;
+ case X86::VPBROADCASTWZrm: NumElts = 32; break;
}
std::string Comment;
@@ -2615,8 +2359,241 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
printConstant(C, CS);
}
CS << "]";
- OutStreamer->AddComment(CS.str());
+ OutStreamer.AddComment(CS.str());
+ }
+ }
+}
+
+void X86AsmPrinter::emitInstruction(const MachineInstr *MI) {
+ X86MCInstLower MCInstLowering(*MF, *this);
+ const X86RegisterInfo *RI =
+ MF->getSubtarget<X86Subtarget>().getRegisterInfo();
+
+  // Add a comment for AVX-512 instructions that were compressed from an EVEX
+  // encoding to a VEX encoding.
+ if (TM.Options.MCOptions.ShowMCEncoding) {
+ if (MI->getAsmPrinterFlags() & X86::AC_EVEX_2_VEX)
+ OutStreamer->AddComment("EVEX TO VEX Compression ", false);
+ }
+
+ // Add comments for values loaded from constant pool.
+ if (OutStreamer->isVerboseAsm())
+ addConstantComments(MI, *OutStreamer);
+
+ switch (MI->getOpcode()) {
+ case TargetOpcode::DBG_VALUE:
+ llvm_unreachable("Should be handled target independently");
+
+ // Emit nothing here but a comment if we can.
+ case X86::Int_MemBarrier:
+ OutStreamer->emitRawComment("MEMBARRIER");
+ return;
+
+ case X86::EH_RETURN:
+ case X86::EH_RETURN64: {
+ // Lower these as normal, but add some comments.
+ Register Reg = MI->getOperand(0).getReg();
+ OutStreamer->AddComment(StringRef("eh_return, addr: %") +
+ X86ATTInstPrinter::getRegisterName(Reg));
+ break;
+ }
+ case X86::CLEANUPRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CLEANUPRET");
+ break;
+ }
+
+ case X86::CATCHRET: {
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("CATCHRET");
+ break;
+ }
+
+ case X86::ENDBR32:
+ case X86::ENDBR64: {
+ // CurrentPatchableFunctionEntrySym can be CurrentFnBegin only for
+ // -fpatchable-function-entry=N,0. The entry MBB is guaranteed to be
+ // non-empty. If MI is the initial ENDBR, place the
+ // __patchable_function_entries label after ENDBR.
+ if (CurrentPatchableFunctionEntrySym &&
+ CurrentPatchableFunctionEntrySym == CurrentFnBegin &&
+ MI == &MF->front().front()) {
+ MCInst Inst;
+ MCInstLowering.Lower(MI, Inst);
+ EmitAndCountInstruction(Inst);
+ CurrentPatchableFunctionEntrySym = createTempSymbol("patch");
+ OutStreamer->emitLabel(CurrentPatchableFunctionEntrySym);
+ return;
+ }
+ break;
+ }
+
+ case X86::TAILJMPr:
+ case X86::TAILJMPm:
+ case X86::TAILJMPd:
+ case X86::TAILJMPd_CC:
+ case X86::TAILJMPr64:
+ case X86::TAILJMPm64:
+ case X86::TAILJMPd64:
+ case X86::TAILJMPd64_CC:
+ case X86::TAILJMPr64_REX:
+ case X86::TAILJMPm64_REX:
+ // Lower these as normal, but add some comments.
+ OutStreamer->AddComment("TAILCALL");
+ break;
+
+ case X86::TLS_addr32:
+ case X86::TLS_addr64:
+ case X86::TLS_base_addr32:
+ case X86::TLS_base_addr64:
+ return LowerTlsAddr(MCInstLowering, *MI);
+
+ case X86::MOVPC32r: {
+ // This is a pseudo op for a two instruction sequence with a label, which
+ // looks like:
+ // call "L1$pb"
+ // "L1$pb":
+ // popl %esi
+
+ // Emit the call.
+ MCSymbol *PICBase = MF->getPICBaseSymbol();
+ // FIXME: We would like an efficient form for this, so we don't have to do a
+ // lot of extra uniquing.
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::CALLpcrel32)
+ .addExpr(MCSymbolRefExpr::create(PICBase, OutContext)));
+
+ const X86FrameLowering *FrameLowering =
+ MF->getSubtarget<X86Subtarget>().getFrameLowering();
+ bool hasFP = FrameLowering->hasFP(*MF);
+
+ // TODO: This is needed only if we require precise CFA.
+ bool HasActiveDwarfFrame = OutStreamer->getNumFrameInfos() &&
+ !OutStreamer->getDwarfFrameInfos().back().End;
+
+ int stackGrowth = -RI->getSlotSize();
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->emitCFIAdjustCfaOffset(-stackGrowth);
+ }
+
+ // Emit the label.
+ OutStreamer->emitLabel(PICBase);
+
+ // popl $reg
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::POP32r).addReg(MI->getOperand(0).getReg()));
+
+ if (HasActiveDwarfFrame && !hasFP) {
+ OutStreamer->emitCFIAdjustCfaOffset(stackGrowth);
+ }
+ return;
+ }
+
+ case X86::ADD32ri: {
+ // Lower the MO_GOT_ABSOLUTE_ADDRESS form of ADD32ri.
+ if (MI->getOperand(2).getTargetFlags() != X86II::MO_GOT_ABSOLUTE_ADDRESS)
+ break;
+
+ // Okay, we have something like:
+ // EAX = ADD32ri EAX, MO_GOT_ABSOLUTE_ADDRESS(@MYGLOBAL)
+
+ // For this, we want to print something like:
+ // MYGLOBAL + (. - PICBASE)
+ // However, we can't generate a ".", so just emit a new label here and refer
+ // to it.
+ MCSymbol *DotSym = OutContext.createTempSymbol();
+ OutStreamer->emitLabel(DotSym);
+
+ // Now that we have emitted the label, lower the complex operand expression.
+ MCSymbol *OpSym = MCInstLowering.GetSymbolFromOperand(MI->getOperand(2));
+
+ const MCExpr *DotExpr = MCSymbolRefExpr::create(DotSym, OutContext);
+ const MCExpr *PICBase =
+ MCSymbolRefExpr::create(MF->getPICBaseSymbol(), OutContext);
+ DotExpr = MCBinaryExpr::createSub(DotExpr, PICBase, OutContext);
+
+ DotExpr = MCBinaryExpr::createAdd(
+ MCSymbolRefExpr::create(OpSym, OutContext), DotExpr, OutContext);
+
+ EmitAndCountInstruction(MCInstBuilder(X86::ADD32ri)
+ .addReg(MI->getOperand(0).getReg())
+ .addReg(MI->getOperand(1).getReg())
+ .addExpr(DotExpr));
+ return;
+ }
+ case TargetOpcode::STATEPOINT:
+ return LowerSTATEPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::FAULTING_OP:
+ return LowerFAULTING_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::FENTRY_CALL:
+ return LowerFENTRY_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_OP:
+ return LowerPATCHABLE_OP(*MI, MCInstLowering);
+
+ case TargetOpcode::STACKMAP:
+ return LowerSTACKMAP(*MI);
+
+ case TargetOpcode::PATCHPOINT:
+ return LowerPATCHPOINT(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_FUNCTION_ENTER:
+ return LowerPATCHABLE_FUNCTION_ENTER(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_RET:
+ return LowerPATCHABLE_RET(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TAIL_CALL:
+ return LowerPATCHABLE_TAIL_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_EVENT_CALL:
+ return LowerPATCHABLE_EVENT_CALL(*MI, MCInstLowering);
+
+ case TargetOpcode::PATCHABLE_TYPED_EVENT_CALL:
+ return LowerPATCHABLE_TYPED_EVENT_CALL(*MI, MCInstLowering);
+
+ case X86::MORESTACK_RET:
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ return;
+
+ case X86::MORESTACK_RET_RESTORE_R10:
+ // Return, then restore R10.
+ EmitAndCountInstruction(MCInstBuilder(getRetOpcode(*Subtarget)));
+ EmitAndCountInstruction(
+ MCInstBuilder(X86::MOV64rr).addReg(X86::R10).addReg(X86::RAX));
+ return;
+
+ case X86::SEH_PushReg:
+ case X86::SEH_SaveReg:
+ case X86::SEH_SaveXMM:
+ case X86::SEH_StackAlloc:
+ case X86::SEH_StackAlign:
+ case X86::SEH_SetFrame:
+ case X86::SEH_PushFrame:
+ case X86::SEH_EndPrologue:
+ EmitSEHInstruction(MI);
+ return;
+
+ case X86::SEH_Epilogue: {
+ assert(MF->hasWinCFI() && "SEH_ instruction in function without WinCFI?");
+ MachineBasicBlock::const_iterator MBBI(MI);
+ // Check if preceded by a call and emit nop if so.
+ for (MBBI = PrevCrossBBInst(MBBI);
+ MBBI != MachineBasicBlock::const_iterator();
+ MBBI = PrevCrossBBInst(MBBI)) {
+ // Conservatively assume that pseudo instructions don't emit code and keep
+ // looking for a call. We may emit an unnecessary nop in some cases.
+ if (!MBBI->isPseudo()) {
+ if (MBBI->isCall())
+ EmitAndCountInstruction(MCInstBuilder(X86::NOOP));
+ break;
+ }
}
+ return;
+ }
}
MCInst TmpInst;
@@ -2633,7 +2610,7 @@ void X86AsmPrinter::EmitInstruction(const MachineInstr *MI) {
// after it.
SMShadowTracker.emitShadowPadding(*OutStreamer, getSubtargetInfo());
// Then emit the call
- OutStreamer->EmitInstruction(TmpInst, getSubtargetInfo());
+ OutStreamer->emitInstruction(TmpInst, getSubtargetInfo());
return;
}
diff --git a/llvm/lib/Target/X86/X86MachineFunctionInfo.h b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
index 5cb80a082b56..eedad952c3b9 100644
--- a/llvm/lib/Target/X86/X86MachineFunctionInfo.h
+++ b/llvm/lib/Target/X86/X86MachineFunctionInfo.h
@@ -13,9 +13,10 @@
#ifndef LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
#define LLVM_LIB_TARGET_X86_X86MACHINEFUNCTIONINFO_H
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/CodeGen/CallingConvLower.h"
#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/Support/MachineValueType.h"
namespace llvm {
@@ -62,12 +63,12 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// SRetReturnReg - Some subtargets require that sret lowering includes
/// returning the value of the returned struct in a register. This field
/// holds the virtual register into which the sret argument is passed.
- unsigned SRetReturnReg = 0;
+ Register SRetReturnReg;
/// GlobalBaseReg - keeps track of the virtual register initialized for
/// use as the global base register. This is used for PIC in some PIC
/// relocation models.
- unsigned GlobalBaseReg = 0;
+ Register GlobalBaseReg;
/// VarArgsFrameIndex - FrameIndex for start of varargs area.
int VarArgsFrameIndex = 0;
@@ -104,6 +105,13 @@ class X86MachineFunctionInfo : public MachineFunctionInfo {
/// True if this function has WIN_ALLOCA instructions.
bool HasWinAlloca = false;
+ /// True if this function has any preallocated calls.
+ bool HasPreallocatedCall = false;
+
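+  /// Bookkeeping for "preallocated" calls: a dense id per call site, the
+  /// stack size reserved for each call, and the offsets of its arguments
+  /// within that reservation.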
+ ValueMap<const Value *, size_t> PreallocatedIds;
+ SmallVector<size_t, 0> PreallocatedStackSizes;
+ SmallVector<SmallVector<size_t, 4>, 0> PreallocatedArgOffsets;
+
private:
/// ForwardedMustTailRegParms - A list of virtual and physical registers
/// that must be forwarded to every musttail call.
@@ -143,11 +151,11 @@ public:
int getTCReturnAddrDelta() const { return TailCallReturnAddrDelta; }
void setTCReturnAddrDelta(int delta) {TailCallReturnAddrDelta = delta;}
- unsigned getSRetReturnReg() const { return SRetReturnReg; }
- void setSRetReturnReg(unsigned Reg) { SRetReturnReg = Reg; }
+ Register getSRetReturnReg() const { return SRetReturnReg; }
+ void setSRetReturnReg(Register Reg) { SRetReturnReg = Reg; }
- unsigned getGlobalBaseReg() const { return GlobalBaseReg; }
- void setGlobalBaseReg(unsigned Reg) { GlobalBaseReg = Reg; }
+ Register getGlobalBaseReg() const { return GlobalBaseReg; }
+ void setGlobalBaseReg(Register Reg) { GlobalBaseReg = Reg; }
int getVarArgsFrameIndex() const { return VarArgsFrameIndex; }
void setVarArgsFrameIndex(int Idx) { VarArgsFrameIndex = Idx; }
@@ -185,6 +193,36 @@ public:
bool hasWinAlloca() const { return HasWinAlloca; }
void setHasWinAlloca(bool v) { HasWinAlloca = v; }
+
+ bool hasPreallocatedCall() const { return HasPreallocatedCall; }
+ void setHasPreallocatedCall(bool v) { HasPreallocatedCall = v; }
+
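+  // Returns a stable, dense id for the given preallocated call site, creating
+  // the bookkeeping entries on first use.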
+ size_t getPreallocatedIdForCallSite(const Value *CS) {
+ auto Insert = PreallocatedIds.insert({CS, PreallocatedIds.size()});
+ if (Insert.second) {
+ PreallocatedStackSizes.push_back(0);
+ PreallocatedArgOffsets.emplace_back();
+ }
+ return Insert.first->second;
+ }
+
+ void setPreallocatedStackSize(size_t Id, size_t StackSize) {
+ PreallocatedStackSizes[Id] = StackSize;
+ }
+
+ size_t getPreallocatedStackSize(const size_t Id) {
+ assert(PreallocatedStackSizes[Id] != 0 && "stack size not set");
+ return PreallocatedStackSizes[Id];
+ }
+
+ void setPreallocatedArgOffsets(size_t Id, ArrayRef<size_t> AO) {
+ PreallocatedArgOffsets[Id].assign(AO.begin(), AO.end());
+ }
+
+ const ArrayRef<size_t> getPreallocatedArgOffsets(const size_t Id) {
+ assert(!PreallocatedArgOffsets[Id].empty() && "arg offsets not set");
+ return PreallocatedArgOffsets[Id];
+ }
};
} // End llvm namespace
diff --git a/llvm/lib/Target/X86/X86MacroFusion.cpp b/llvm/lib/Target/X86/X86MacroFusion.cpp
index b19d1263e0c9..425054cfdd92 100644
--- a/llvm/lib/Target/X86/X86MacroFusion.cpp
+++ b/llvm/lib/Target/X86/X86MacroFusion.cpp
@@ -11,8 +11,8 @@
//
//===----------------------------------------------------------------------===//
-#include "MCTargetDesc/X86BaseInfo.h"
#include "X86MacroFusion.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86Subtarget.h"
#include "llvm/CodeGen/MacroFusion.h"
#include "llvm/CodeGen/TargetInstrInfo.h"
diff --git a/llvm/lib/Target/X86/X86MacroFusion.h b/llvm/lib/Target/X86/X86MacroFusion.h
index d4ae54f657a5..05388b275ca3 100644
--- a/llvm/lib/Target/X86/X86MacroFusion.h
+++ b/llvm/lib/Target/X86/X86MacroFusion.h
@@ -14,10 +14,12 @@
#ifndef LLVM_LIB_TARGET_X86_X86MACROFUSION_H
#define LLVM_LIB_TARGET_X86_X86MACROFUSION_H
-#include "llvm/CodeGen/MachineScheduler.h"
+#include <memory>
namespace llvm {
+class ScheduleDAGMutation;
+
/// Note that you have to add:
/// DAG.addMutation(createX86MacroFusionDAGMutation());
/// to X86PassConfig::createMachineScheduler() to have an effect.
diff --git a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
index 0c791b6674dc..c8899a85118e 100644
--- a/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
+++ b/llvm/lib/Target/X86/X86OptimizeLEAs.cpp
@@ -578,7 +578,7 @@ bool X86OptimizeLEAPass::removeRedundantAddrCalc(MemOpMap &LEAs) {
MachineInstr *X86OptimizeLEAPass::replaceDebugValue(MachineInstr &MI,
unsigned VReg,
int64_t AddrDispShift) {
- DIExpression *Expr = const_cast<DIExpression *>(MI.getDebugExpression());
+ const DIExpression *Expr = MI.getDebugExpression();
if (AddrDispShift != 0)
Expr = DIExpression::prepend(Expr, DIExpression::StackValue, AddrDispShift);
diff --git a/llvm/lib/Target/X86/X86PadShortFunction.cpp b/llvm/lib/Target/X86/X86PadShortFunction.cpp
index 4c6bd0ccc2cd..ec81b07f9e5f 100644
--- a/llvm/lib/Target/X86/X86PadShortFunction.cpp
+++ b/llvm/lib/Target/X86/X86PadShortFunction.cpp
@@ -58,6 +58,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<LazyMachineBlockFrequencyInfoPass>();
+ AU.addPreserved<LazyMachineBlockFrequencyInfoPass>();
MachineFunctionPass::getAnalysisUsage(AU);
}
diff --git a/llvm/lib/Target/X86/X86PartialReduction.cpp b/llvm/lib/Target/X86/X86PartialReduction.cpp
new file mode 100644
index 000000000000..8784a3df1773
--- /dev/null
+++ b/llvm/lib/Target/X86/X86PartialReduction.cpp
@@ -0,0 +1,490 @@
+//===-- X86PartialReduction.cpp -------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass looks for add instructions used by a horizontal reduction to see
+// if we might be able to use pmaddwd or psadbw. Some cases of this require
+// cross basic block knowledge and can't be done in SelectionDAG.
+//
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/CodeGen/TargetPassConfig.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicsX86.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Operator.h"
+#include "llvm/Pass.h"
+#include "X86TargetMachine.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-partial-reduction"
+
+namespace {
+
+class X86PartialReduction : public FunctionPass {
+ const DataLayout *DL;
+ const X86Subtarget *ST;
+
+public:
+ static char ID; // Pass identification, replacement for typeid.
+
+ X86PartialReduction() : FunctionPass(ID) { }
+
+ bool runOnFunction(Function &Fn) override;
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ }
+
+ StringRef getPassName() const override {
+ return "X86 Partial Reduction";
+ }
+
+private:
+ bool tryMAddReplacement(Instruction *Op);
+ bool trySADReplacement(Instruction *Op);
+};
+}
+
+FunctionPass *llvm::createX86PartialReductionPass() {
+ return new X86PartialReduction();
+}
+
+char X86PartialReduction::ID = 0;
+
+INITIALIZE_PASS(X86PartialReduction, DEBUG_TYPE,
+ "X86 Partial Reduction", false, false)
+
+bool X86PartialReduction::tryMAddReplacement(Instruction *Op) {
+ if (!ST->hasSSE2())
+ return false;
+
+ // Need at least 8 elements.
+ if (cast<FixedVectorType>(Op->getType())->getNumElements() < 8)
+ return false;
+
+ // Element type should be i32.
+ if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
+ return false;
+
+ auto *Mul = dyn_cast<BinaryOperator>(Op);
+ if (!Mul || Mul->getOpcode() != Instruction::Mul)
+ return false;
+
+ Value *LHS = Mul->getOperand(0);
+ Value *RHS = Mul->getOperand(1);
+
+  // LHS and RHS should only be used once, or twice if they are the same value.
+  // Only check this when SSE4.1 is enabled and we have zext/sext instructions;
+  // otherwise we use punpck to emulate zero extend in stages, and the truncates
+  // we need to do likely won't introduce new instructions in that case.
+ if (ST->hasSSE41()) {
+ if (LHS == RHS) {
+ if (!isa<Constant>(LHS) && !LHS->hasNUses(2))
+ return false;
+ } else {
+ if (!isa<Constant>(LHS) && !LHS->hasOneUse())
+ return false;
+ if (!isa<Constant>(RHS) && !RHS->hasOneUse())
+ return false;
+ }
+ }
+
+ auto CanShrinkOp = [&](Value *Op) {
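+    // Truncation of Op is considered free if it is a constant, or a sext/zext
+    // from at most 16 bits defined in the same block as the multiply.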
+ auto IsFreeTruncation = [&](Value *Op) {
+ if (auto *Cast = dyn_cast<CastInst>(Op)) {
+ if (Cast->getParent() == Mul->getParent() &&
+ (Cast->getOpcode() == Instruction::SExt ||
+ Cast->getOpcode() == Instruction::ZExt) &&
+ Cast->getOperand(0)->getType()->getScalarSizeInBits() <= 16)
+ return true;
+ }
+
+ return isa<Constant>(Op);
+ };
+
+ // If the operation can be freely truncated and has enough sign bits we
+ // can shrink.
+ if (IsFreeTruncation(Op) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
+ return true;
+
+ // SelectionDAG has limited support for truncating through an add or sub if
+ // the inputs are freely truncatable.
+ if (auto *BO = dyn_cast<BinaryOperator>(Op)) {
+ if (BO->getParent() == Mul->getParent() &&
+ IsFreeTruncation(BO->getOperand(0)) &&
+ IsFreeTruncation(BO->getOperand(1)) &&
+ ComputeNumSignBits(Op, *DL, 0, nullptr, Mul) > 16)
+ return true;
+ }
+
+ return false;
+ };
+
+ // Both Ops need to be shrinkable.
+ if (!CanShrinkOp(LHS) && !CanShrinkOp(RHS))
+ return false;
+
+ IRBuilder<> Builder(Mul);
+
+ auto *MulTy = cast<FixedVectorType>(Op->getType());
+ unsigned NumElts = MulTy->getNumElements();
+
+ // Extract even elements and odd elements and add them together. This will
+ // be pattern matched by SelectionDAG to pmaddwd. This instruction will be
+ // half the original width.
+ SmallVector<int, 16> EvenMask(NumElts / 2);
+ SmallVector<int, 16> OddMask(NumElts / 2);
+ for (int i = 0, e = NumElts / 2; i != e; ++i) {
+ EvenMask[i] = i * 2;
+ OddMask[i] = i * 2 + 1;
+ }
+ // Creating a new mul so the replaceAllUsesWith below doesn't replace the
+ // uses in the shuffles we're creating.
+ Value *NewMul = Builder.CreateMul(Mul->getOperand(0), Mul->getOperand(1));
+ Value *EvenElts = Builder.CreateShuffleVector(NewMul, NewMul, EvenMask);
+ Value *OddElts = Builder.CreateShuffleVector(NewMul, NewMul, OddMask);
+ Value *MAdd = Builder.CreateAdd(EvenElts, OddElts);
+
+ // Concatenate zeroes to extend back to the original type.
+ SmallVector<int, 32> ConcatMask(NumElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Value *Zero = Constant::getNullValue(MAdd->getType());
+ Value *Concat = Builder.CreateShuffleVector(MAdd, Zero, ConcatMask);
+
+ Mul->replaceAllUsesWith(Concat);
+ Mul->eraseFromParent();
+
+ return true;
+}
+
+bool X86PartialReduction::trySADReplacement(Instruction *Op) {
+ if (!ST->hasSSE2())
+ return false;
+
+  // TODO: There's nothing special about i32; any integer type above i16
+  // should work just as well.
+ if (!cast<VectorType>(Op->getType())->getElementType()->isIntegerTy(32))
+ return false;
+
+ // Operand should be a select.
+ auto *SI = dyn_cast<SelectInst>(Op);
+ if (!SI)
+ return false;
+
+ // Select needs to implement absolute value.
+ Value *LHS, *RHS;
+ auto SPR = matchSelectPattern(SI, LHS, RHS);
+ if (SPR.Flavor != SPF_ABS)
+ return false;
+
+ // Need a subtract of two values.
+ auto *Sub = dyn_cast<BinaryOperator>(LHS);
+ if (!Sub || Sub->getOpcode() != Instruction::Sub)
+ return false;
+
+ // Look for zero extend from i8.
+ auto getZeroExtendedVal = [](Value *Op) -> Value * {
+ if (auto *ZExt = dyn_cast<ZExtInst>(Op))
+ if (cast<VectorType>(ZExt->getOperand(0)->getType())
+ ->getElementType()
+ ->isIntegerTy(8))
+ return ZExt->getOperand(0);
+
+ return nullptr;
+ };
+
+ // Both operands of the subtract should be extends from vXi8.
+ Value *Op0 = getZeroExtendedVal(Sub->getOperand(0));
+ Value *Op1 = getZeroExtendedVal(Sub->getOperand(1));
+ if (!Op0 || !Op1)
+ return false;
+
+ IRBuilder<> Builder(SI);
+
+ auto *OpTy = cast<FixedVectorType>(Op->getType());
+ unsigned NumElts = OpTy->getNumElements();
+
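+  // Pick the widest PSADBW variant that both the subtarget and the input
+  // width allow.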
+ unsigned IntrinsicNumElts;
+ Intrinsic::ID IID;
+ if (ST->hasBWI() && NumElts >= 64) {
+ IID = Intrinsic::x86_avx512_psad_bw_512;
+ IntrinsicNumElts = 64;
+ } else if (ST->hasAVX2() && NumElts >= 32) {
+ IID = Intrinsic::x86_avx2_psad_bw;
+ IntrinsicNumElts = 32;
+ } else {
+ IID = Intrinsic::x86_sse2_psad_bw;
+ IntrinsicNumElts = 16;
+ }
+
+ Function *PSADBWFn = Intrinsic::getDeclaration(SI->getModule(), IID);
+
+ if (NumElts < 16) {
+ // Pad input with zeroes.
+ SmallVector<int, 32> ConcatMask(16);
+ for (unsigned i = 0; i != NumElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = NumElts; i != 16; ++i)
+ ConcatMask[i] = (i % NumElts) + NumElts;
+
+ Value *Zero = Constant::getNullValue(Op0->getType());
+ Op0 = Builder.CreateShuffleVector(Op0, Zero, ConcatMask);
+ Op1 = Builder.CreateShuffleVector(Op1, Zero, ConcatMask);
+ NumElts = 16;
+ }
+
+  // The PSADBW intrinsics produce vXi64 results; bitcast them to vXi32.
+ auto *I32Ty =
+ FixedVectorType::get(Builder.getInt32Ty(), IntrinsicNumElts / 4);
+
+ assert(NumElts % IntrinsicNumElts == 0 && "Unexpected number of elements!");
+ unsigned NumSplits = NumElts / IntrinsicNumElts;
+
+ // First collect the pieces we need.
+ SmallVector<Value *, 4> Ops(NumSplits);
+ for (unsigned i = 0; i != NumSplits; ++i) {
+ SmallVector<int, 64> ExtractMask(IntrinsicNumElts);
+ std::iota(ExtractMask.begin(), ExtractMask.end(), i * IntrinsicNumElts);
+ Value *ExtractOp0 = Builder.CreateShuffleVector(Op0, Op0, ExtractMask);
+ Value *ExtractOp1 = Builder.CreateShuffleVector(Op1, Op0, ExtractMask);
+ Ops[i] = Builder.CreateCall(PSADBWFn, {ExtractOp0, ExtractOp1});
+ Ops[i] = Builder.CreateBitCast(Ops[i], I32Ty);
+ }
+
+ assert(isPowerOf2_32(NumSplits) && "Expected power of 2 splits");
+ unsigned Stages = Log2_32(NumSplits);
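+  // Concatenate the per-chunk PSADBW results pairwise until a single vector
+  // remains.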
+ for (unsigned s = Stages; s > 0; --s) {
+ unsigned NumConcatElts =
+ cast<FixedVectorType>(Ops[0]->getType())->getNumElements() * 2;
+ for (unsigned i = 0; i != 1U << (s - 1); ++i) {
+ SmallVector<int, 64> ConcatMask(NumConcatElts);
+ std::iota(ConcatMask.begin(), ConcatMask.end(), 0);
+ Ops[i] = Builder.CreateShuffleVector(Ops[i*2], Ops[i*2+1], ConcatMask);
+ }
+ }
+
+ // At this point the final value should be in Ops[0]. Now we need to adjust
+ // it to the final original type.
+ NumElts = cast<FixedVectorType>(OpTy)->getNumElements();
+ if (NumElts == 2) {
+ // Extract down to 2 elements.
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Ops[0], ArrayRef<int>{0, 1});
+ } else if (NumElts >= 8) {
+ SmallVector<int, 32> ConcatMask(NumElts);
+ unsigned SubElts =
+ cast<FixedVectorType>(Ops[0]->getType())->getNumElements();
+ for (unsigned i = 0; i != SubElts; ++i)
+ ConcatMask[i] = i;
+ for (unsigned i = SubElts; i != NumElts; ++i)
+ ConcatMask[i] = (i % SubElts) + SubElts;
+
+ Value *Zero = Constant::getNullValue(Ops[0]->getType());
+ Ops[0] = Builder.CreateShuffleVector(Ops[0], Zero, ConcatMask);
+ }
+
+ SI->replaceAllUsesWith(Ops[0]);
+ SI->eraseFromParent();
+
+ return true;
+}
+
+// Walk backwards from the ExtractElementInst and determine if it is the end of
+// a horizontal reduction. Return the input to the reduction if we find one.
+static Value *matchAddReduction(const ExtractElementInst &EE) {
+ // Make sure we're extracting index 0.
+ auto *Index = dyn_cast<ConstantInt>(EE.getIndexOperand());
+ if (!Index || !Index->isNullValue())
+ return nullptr;
+
+ const auto *BO = dyn_cast<BinaryOperator>(EE.getVectorOperand());
+ if (!BO || BO->getOpcode() != Instruction::Add || !BO->hasOneUse())
+ return nullptr;
+
+ unsigned NumElems = cast<FixedVectorType>(BO->getType())->getNumElements();
+ // Ensure the reduction size is a power of 2.
+ if (!isPowerOf2_32(NumElems))
+ return nullptr;
+
+ const Value *Op = BO;
+ unsigned Stages = Log2_32(NumElems);
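+  // Walk the shuffle/add reduction pyramid from the final add outwards,
+  // verifying the expected shuffle mask at each stage.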
+ for (unsigned i = 0; i != Stages; ++i) {
+ const auto *BO = dyn_cast<BinaryOperator>(Op);
+ if (!BO || BO->getOpcode() != Instruction::Add)
+ return nullptr;
+
+ // If this isn't the first add, then it should only have 2 users, the
+ // shuffle and another add which we checked in the previous iteration.
+ if (i != 0 && !BO->hasNUses(2))
+ return nullptr;
+
+ Value *LHS = BO->getOperand(0);
+ Value *RHS = BO->getOperand(1);
+
+ auto *Shuffle = dyn_cast<ShuffleVectorInst>(LHS);
+ if (Shuffle) {
+ Op = RHS;
+ } else {
+ Shuffle = dyn_cast<ShuffleVectorInst>(RHS);
+ Op = LHS;
+ }
+
+ // The first operand of the shuffle should be the same as the other operand
+ // of the bin op.
+ if (!Shuffle || Shuffle->getOperand(0) != Op)
+ return nullptr;
+
+ // Verify the shuffle has the expected (at this stage of the pyramid) mask.
+ unsigned MaskEnd = 1 << i;
+ for (unsigned Index = 0; Index < MaskEnd; ++Index)
+ if (Shuffle->getMaskValue(Index) != (int)(MaskEnd + Index))
+ return nullptr;
+ }
+
+ return const_cast<Value *>(Op);
+}
+
+// See if this BO is reachable from this PHI by walking forward through
+// single-use BinaryOperators with the same opcode. If we get back to BO, we
+// know we've found a loop and it is safe to step through this add to find
+// more leaves.
+static bool isReachableFromPHI(PHINode *Phi, BinaryOperator *BO) {
+ // The PHI itself should only have one use.
+ if (!Phi->hasOneUse())
+ return false;
+
+ Instruction *U = cast<Instruction>(*Phi->user_begin());
+ if (U == BO)
+ return true;
+
+ while (U->hasOneUse() && U->getOpcode() == BO->getOpcode())
+ U = cast<Instruction>(*U->user_begin());
+
+ return U == BO;
+}
+
+// Collect all the leaves of the tree of adds that feeds into the horizontal
+// reduction. Root is the Value that is used by the horizontal reduction.
+// We look through single use phis, single use adds, or adds that are used by
+// a phi that forms a loop with the add.
+static void collectLeaves(Value *Root, SmallVectorImpl<Instruction *> &Leaves) {
+ SmallPtrSet<Value *, 8> Visited;
+ SmallVector<Value *, 8> Worklist;
+ Worklist.push_back(Root);
+
+ while (!Worklist.empty()) {
+ Value *V = Worklist.pop_back_val();
+ if (!Visited.insert(V).second)
+ continue;
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+      // A PHI node should have a single use, unless it is the root node, in
+      // which case it has 2 uses.
+ if (!PN->hasNUses(PN == Root ? 2 : 1))
+ break;
+
+ // Push incoming values to the worklist.
+ for (Value *InV : PN->incoming_values())
+ Worklist.push_back(InV);
+
+ continue;
+ }
+
+ if (auto *BO = dyn_cast<BinaryOperator>(V)) {
+ if (BO->getOpcode() == Instruction::Add) {
+ // Simple case. Single use, just push its operands to the worklist.
+ if (BO->hasNUses(BO == Root ? 2 : 1)) {
+ for (Value *Op : BO->operands())
+ Worklist.push_back(Op);
+ continue;
+ }
+
+        // If there is an additional use, make sure it is an unvisited PHI
+        // that gets us back to this node.
+ if (BO->hasNUses(BO == Root ? 3 : 2)) {
+ PHINode *PN = nullptr;
+ for (auto *U : Root->users())
+ if (auto *P = dyn_cast<PHINode>(U))
+ if (!Visited.count(P))
+ PN = P;
+
+ // If we didn't find a 2-input PHI then this isn't a case we can
+ // handle.
+ if (!PN || PN->getNumIncomingValues() != 2)
+ continue;
+
+ // Walk forward from this phi to see if it reaches back to this add.
+ if (!isReachableFromPHI(PN, BO))
+ continue;
+
+ // The phi forms a loop with this Add, push its operands.
+ for (Value *Op : BO->operands())
+ Worklist.push_back(Op);
+ }
+ }
+ }
+
+ // Not an add or phi, make it a leaf.
+ if (auto *I = dyn_cast<Instruction>(V)) {
+ if (!V->hasNUses(I == Root ? 2 : 1))
+ continue;
+
+ // Add this as a leaf.
+ Leaves.push_back(I);
+ }
+ }
+}
+
+bool X86PartialReduction::runOnFunction(Function &F) {
+ if (skipFunction(F))
+ return false;
+
+ auto *TPC = getAnalysisIfAvailable<TargetPassConfig>();
+ if (!TPC)
+ return false;
+
+ auto &TM = TPC->getTM<X86TargetMachine>();
+ ST = TM.getSubtargetImpl(F);
+
+ DL = &F.getParent()->getDataLayout();
+
+ bool MadeChange = false;
+ for (auto &BB : F) {
+ for (auto &I : BB) {
+ auto *EE = dyn_cast<ExtractElementInst>(&I);
+ if (!EE)
+ continue;
+
+ // First find a reduction tree.
+ // FIXME: Do we need to handle other opcodes than Add?
+ Value *Root = matchAddReduction(*EE);
+ if (!Root)
+ continue;
+
+ SmallVector<Instruction *, 8> Leaves;
+ collectLeaves(Root, Leaves);
+
+ for (Instruction *I : Leaves) {
+ if (tryMAddReplacement(I)) {
+ MadeChange = true;
+ continue;
+ }
+
+ // Don't do SAD matching on the root node. SelectionDAG already
+ // has support for that and currently generates better code.
+ if (I != Root && trySADReplacement(I))
+ MadeChange = true;
+ }
+ }
+ }
+
+ return MadeChange;
+}
diff --git a/llvm/lib/Target/X86/X86PfmCounters.td b/llvm/lib/Target/X86/X86PfmCounters.td
index 93238983afa2..833013fb69f3 100644
--- a/llvm/lib/Target/X86/X86PfmCounters.td
+++ b/llvm/lib/Target/X86/X86PfmCounters.td
@@ -223,3 +223,13 @@ def ZnVer1PfmCounters : ProcPfmCounters {
];
}
def : PfmCountersBinding<"znver1", ZnVer1PfmCounters>;
+
+def ZnVer2PfmCounters : ProcPfmCounters {
+ let CycleCounter = PfmCounter<"cycles_not_in_halt">;
+ let UopsCounter = PfmCounter<"retired_uops">;
+ let IssueCounters = [
+ PfmIssueCounter<"Zn2AGU", "ls_dispatch:ld_dispatch + ls_dispatch:store_dispatch">,
+ PfmIssueCounter<"Zn2Divider", "div_op_count">
+ ];
+}
+def : PfmCountersBinding<"znver2", ZnVer2PfmCounters>;
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.cpp b/llvm/lib/Target/X86/X86RegisterInfo.cpp
index f69626b2622e..f456728cf47b 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.cpp
+++ b/llvm/lib/Target/X86/X86RegisterInfo.cpp
@@ -72,12 +72,6 @@ X86RegisterInfo::X86RegisterInfo(const Triple &TT)
}
}
-bool
-X86RegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- // ExecutionDomainFix, BreakFalseDeps and PostRAScheduler require liveness.
- return true;
-}
-
int
X86RegisterInfo::getSEHRegNum(unsigned i) const {
return getEncodingValue(i);
@@ -633,18 +627,22 @@ static bool CantUseSP(const MachineFrameInfo &MFI) {
}
bool X86RegisterInfo::hasBasePointer(const MachineFunction &MF) const {
- const MachineFrameInfo &MFI = MF.getFrameInfo();
-
- if (!EnableBasePointer)
- return false;
-
- // When we need stack realignment, we can't address the stack from the frame
- // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
- // can't address variables from the stack pointer. MS inline asm can
- // reference locals while also adjusting the stack pointer. When we can't
- // use both the SP and the FP, we need a separate base pointer register.
- bool CantUseFP = needsStackRealignment(MF);
- return CantUseFP && CantUseSP(MFI);
+ const X86MachineFunctionInfo *X86FI = MF.getInfo<X86MachineFunctionInfo>();
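+  // Preallocated call setup reserves stack space ahead of the call, so the
+  // stack pointer cannot reliably address locals; force a base pointer.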
+ if (X86FI->hasPreallocatedCall())
+ return true;
+
+ const MachineFrameInfo &MFI = MF.getFrameInfo();
+
+ if (!EnableBasePointer)
+ return false;
+
+ // When we need stack realignment, we can't address the stack from the frame
+ // pointer. When we have dynamic allocas or stack-adjusting inline asm, we
+ // can't address variables from the stack pointer. MS inline asm can
+ // reference locals while also adjusting the stack pointer. When we can't
+ // use both the SP and the FP, we need a separate base pointer register.
+ bool CantUseFP = needsStackRealignment(MF);
+ return CantUseFP && CantUseSP(MFI);
}
bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
@@ -667,7 +665,7 @@ bool X86RegisterInfo::canRealignStack(const MachineFunction &MF) const {
}
bool X86RegisterInfo::hasReservedSpillSlot(const MachineFunction &MF,
- unsigned Reg, int &FrameIdx) const {
+ Register Reg, int &FrameIdx) const {
// Since X86 defines assignCalleeSavedSpillSlots which always return true
// this function neither used nor tested.
llvm_unreachable("Unused function on X86. Otherwise need a test case.");
@@ -728,7 +726,7 @@ X86RegisterInfo::eliminateFrameIndex(MachineBasicBlock::iterator II,
// Determine base register and offset.
int FIOffset;
- unsigned BasePtr;
+ Register BasePtr;
if (MI.isReturn()) {
assert((!needsStackRealignment(MF) ||
MF.getFrameInfo().isFixedObjectIndex(FrameIndex)) &&
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.h b/llvm/lib/Target/X86/X86RegisterInfo.h
index b82920898069..3435c0a10b04 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.h
+++ b/llvm/lib/Target/X86/X86RegisterInfo.h
@@ -54,10 +54,6 @@ public:
// FIXME: This should be tablegen'd like getDwarfRegNum is
int getSEHRegNum(unsigned i) const;
- /// Code Generation virtual methods...
- ///
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
/// getMatchingSuperRegClass - Return a subclass of the specified register
/// class A so that each register in it has a sub-register of the
/// specified sub-register index which is in the specified register class B.
@@ -125,7 +121,7 @@ public:
bool canRealignStack(const MachineFunction &MF) const override;
- bool hasReservedSpillSlot(const MachineFunction &MF, unsigned Reg,
+ bool hasReservedSpillSlot(const MachineFunction &MF, Register Reg,
int &FrameIdx) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator MI,
diff --git a/llvm/lib/Target/X86/X86RegisterInfo.td b/llvm/lib/Target/X86/X86RegisterInfo.td
index 3cfaf714e93e..8de5b94bbffa 100644
--- a/llvm/lib/Target/X86/X86RegisterInfo.td
+++ b/llvm/lib/Target/X86/X86RegisterInfo.td
@@ -265,6 +265,16 @@ let SubRegIndices = [sub_ymm] in {
}
}
+// Tile "registers".
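+// These model the AMX TMM0-TMM7 tile registers, each holding 1KiB of data.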
+def TMM0: X86Reg<"tmm0", 0>;
+def TMM1: X86Reg<"tmm1", 1>;
+def TMM2: X86Reg<"tmm2", 2>;
+def TMM3: X86Reg<"tmm3", 3>;
+def TMM4: X86Reg<"tmm4", 4>;
+def TMM5: X86Reg<"tmm5", 5>;
+def TMM6: X86Reg<"tmm6", 6>;
+def TMM7: X86Reg<"tmm7", 7>;
+
// Mask Registers, used by AVX-512 instructions.
def K0 : X86Reg<"k0", 0>, DwarfRegNum<[118, 93, 93]>;
def K1 : X86Reg<"k1", 1>, DwarfRegNum<[119, 94, 94]>;
@@ -498,7 +508,7 @@ def GR64_NOREX_NOSP : RegisterClass<"X86", [i64], 64,
// which we do not have right now.
def LOW32_ADDR_ACCESS : RegisterClass<"X86", [i32], 32, (add GR32, RIP)>;
-// When RBP is used as a base pointer in a 32-bit addresses environement,
+// When RBP is used as a base pointer in a 32-bit address environment,
// this is also safe to use the full register to access addresses.
// Since RBP will never be spilled, stick to a 32 alignment to save
// on memory consumption.
@@ -621,3 +631,8 @@ def VK64WM : RegisterClass<"X86", [v64i1], 64, (add VK32WM)> {let Size = 64;}
// Bound registers
def BNDR : RegisterClass<"X86", [v2i64], 128, (sequence "BND%u", 0, 3)>;
+
+// Tiles
+let isAllocatable = 0 in
+def TILE : RegisterClass<"X86", [untyped], 0,
+ (sequence "TMM%u", 0, 7)> {let Size = 8192;}
diff --git a/llvm/lib/Target/X86/X86RetpolineThunks.cpp b/llvm/lib/Target/X86/X86RetpolineThunks.cpp
deleted file mode 100644
index 9085d7f068ac..000000000000
--- a/llvm/lib/Target/X86/X86RetpolineThunks.cpp
+++ /dev/null
@@ -1,286 +0,0 @@
-//======- X86RetpolineThunks.cpp - Construct retpoline thunks for x86 --=====//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-/// \file
-///
-/// Pass that injects an MI thunk implementing a "retpoline". This is
-/// a RET-implemented trampoline that is used to lower indirect calls in a way
-/// that prevents speculation on some x86 processors and can be used to mitigate
-/// security vulnerabilities due to targeted speculative execution and side
-/// channels such as CVE-2017-5715.
-///
-/// TODO(chandlerc): All of this code could use better comments and
-/// documentation.
-///
-//===----------------------------------------------------------------------===//
-
-#include "X86.h"
-#include "X86InstrBuilder.h"
-#include "X86Subtarget.h"
-#include "llvm/CodeGen/MachineFunction.h"
-#include "llvm/CodeGen/MachineInstrBuilder.h"
-#include "llvm/CodeGen/MachineModuleInfo.h"
-#include "llvm/CodeGen/Passes.h"
-#include "llvm/CodeGen/TargetPassConfig.h"
-#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/Instructions.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
-
-using namespace llvm;
-
-#define DEBUG_TYPE "x86-retpoline-thunks"
-
-static const char ThunkNamePrefix[] = "__llvm_retpoline_";
-static const char R11ThunkName[] = "__llvm_retpoline_r11";
-static const char EAXThunkName[] = "__llvm_retpoline_eax";
-static const char ECXThunkName[] = "__llvm_retpoline_ecx";
-static const char EDXThunkName[] = "__llvm_retpoline_edx";
-static const char EDIThunkName[] = "__llvm_retpoline_edi";
-
-namespace {
-class X86RetpolineThunks : public MachineFunctionPass {
-public:
- static char ID;
-
- X86RetpolineThunks() : MachineFunctionPass(ID) {}
-
- StringRef getPassName() const override { return "X86 Retpoline Thunks"; }
-
- bool doInitialization(Module &M) override;
- bool runOnMachineFunction(MachineFunction &F) override;
-
- void getAnalysisUsage(AnalysisUsage &AU) const override {
- MachineFunctionPass::getAnalysisUsage(AU);
- AU.addRequired<MachineModuleInfoWrapperPass>();
- AU.addPreserved<MachineModuleInfoWrapperPass>();
- }
-
-private:
- MachineModuleInfo *MMI = nullptr;
- const TargetMachine *TM = nullptr;
- bool Is64Bit = false;
- const X86Subtarget *STI = nullptr;
- const X86InstrInfo *TII = nullptr;
-
- bool InsertedThunks = false;
-
- void createThunkFunction(Module &M, StringRef Name);
- void insertRegReturnAddrClobber(MachineBasicBlock &MBB, unsigned Reg);
- void populateThunk(MachineFunction &MF, unsigned Reg);
-};
-
-} // end anonymous namespace
-
-FunctionPass *llvm::createX86RetpolineThunksPass() {
- return new X86RetpolineThunks();
-}
-
-char X86RetpolineThunks::ID = 0;
-
-bool X86RetpolineThunks::doInitialization(Module &M) {
- InsertedThunks = false;
- return false;
-}
-
-bool X86RetpolineThunks::runOnMachineFunction(MachineFunction &MF) {
- LLVM_DEBUG(dbgs() << getPassName() << '\n');
-
- TM = &MF.getTarget();;
- STI = &MF.getSubtarget<X86Subtarget>();
- TII = STI->getInstrInfo();
- Is64Bit = TM->getTargetTriple().getArch() == Triple::x86_64;
-
- MMI = &getAnalysis<MachineModuleInfoWrapperPass>().getMMI();
- Module &M = const_cast<Module &>(*MMI->getModule());
-
- // If this function is not a thunk, check to see if we need to insert
- // a thunk.
- if (!MF.getName().startswith(ThunkNamePrefix)) {
- // If we've already inserted a thunk, nothing else to do.
- if (InsertedThunks)
- return false;
-
- // Only add a thunk if one of the functions has the retpoline feature
- // enabled in its subtarget, and doesn't enable external thunks.
- // FIXME: Conditionalize on indirect calls so we don't emit a thunk when
- // nothing will end up calling it.
- // FIXME: It's a little silly to look at every function just to enumerate
- // the subtargets, but eventually we'll want to look at them for indirect
- // calls, so maybe this is OK.
- if ((!STI->useRetpolineIndirectCalls() &&
- !STI->useRetpolineIndirectBranches()) ||
- STI->useRetpolineExternalThunk())
- return false;
-
- // Otherwise, we need to insert the thunk.
- // WARNING: This is not really a well behaving thing to do in a function
- // pass. We extract the module and insert a new function (and machine
- // function) directly into the module.
- if (Is64Bit)
- createThunkFunction(M, R11ThunkName);
- else
- for (StringRef Name :
- {EAXThunkName, ECXThunkName, EDXThunkName, EDIThunkName})
- createThunkFunction(M, Name);
- InsertedThunks = true;
- return true;
- }
-
- // If this *is* a thunk function, we need to populate it with the correct MI.
- if (Is64Bit) {
- assert(MF.getName() == "__llvm_retpoline_r11" &&
- "Should only have an r11 thunk on 64-bit targets");
-
- // __llvm_retpoline_r11:
- // callq .Lr11_call_target
- // .Lr11_capture_spec:
- // pause
- // lfence
- // jmp .Lr11_capture_spec
- // .align 16
- // .Lr11_call_target:
- // movq %r11, (%rsp)
- // retq
- populateThunk(MF, X86::R11);
- } else {
- // For 32-bit targets we need to emit a collection of thunks for various
- // possible scratch registers as well as a fallback that uses EDI, which is
- // normally callee saved.
- // __llvm_retpoline_eax:
- // calll .Leax_call_target
- // .Leax_capture_spec:
- // pause
- // jmp .Leax_capture_spec
- // .align 16
- // .Leax_call_target:
- // movl %eax, (%esp) # Clobber return addr
- // retl
- //
- // __llvm_retpoline_ecx:
- // ... # Same setup
- // movl %ecx, (%esp)
- // retl
- //
- // __llvm_retpoline_edx:
- // ... # Same setup
- // movl %edx, (%esp)
- // retl
- //
- // __llvm_retpoline_edi:
- // ... # Same setup
- // movl %edi, (%esp)
- // retl
- if (MF.getName() == EAXThunkName)
- populateThunk(MF, X86::EAX);
- else if (MF.getName() == ECXThunkName)
- populateThunk(MF, X86::ECX);
- else if (MF.getName() == EDXThunkName)
- populateThunk(MF, X86::EDX);
- else if (MF.getName() == EDIThunkName)
- populateThunk(MF, X86::EDI);
- else
- llvm_unreachable("Invalid thunk name on x86-32!");
- }
-
- return true;
-}
-
-void X86RetpolineThunks::createThunkFunction(Module &M, StringRef Name) {
- assert(Name.startswith(ThunkNamePrefix) &&
- "Created a thunk with an unexpected prefix!");
-
- LLVMContext &Ctx = M.getContext();
- auto Type = FunctionType::get(Type::getVoidTy(Ctx), false);
- Function *F =
- Function::Create(Type, GlobalValue::LinkOnceODRLinkage, Name, &M);
- F->setVisibility(GlobalValue::HiddenVisibility);
- F->setComdat(M.getOrInsertComdat(Name));
-
- // Add Attributes so that we don't create a frame, unwind information, or
- // inline.
- AttrBuilder B;
- B.addAttribute(llvm::Attribute::NoUnwind);
- B.addAttribute(llvm::Attribute::Naked);
- F->addAttributes(llvm::AttributeList::FunctionIndex, B);
-
- // Populate our function a bit so that we can verify.
- BasicBlock *Entry = BasicBlock::Create(Ctx, "entry", F);
- IRBuilder<> Builder(Entry);
-
- Builder.CreateRetVoid();
-
- // MachineFunctions/MachineBasicBlocks aren't created automatically for the
- // IR-level constructs we already made. Create them and insert them into the
- // module.
- MachineFunction &MF = MMI->getOrCreateMachineFunction(*F);
- MachineBasicBlock *EntryMBB = MF.CreateMachineBasicBlock(Entry);
-
- // Insert EntryMBB into MF. It's not in the module until we do this.
- MF.insert(MF.end(), EntryMBB);
-}
-
-void X86RetpolineThunks::insertRegReturnAddrClobber(MachineBasicBlock &MBB,
- unsigned Reg) {
- const unsigned MovOpc = Is64Bit ? X86::MOV64mr : X86::MOV32mr;
- const unsigned SPReg = Is64Bit ? X86::RSP : X86::ESP;
- addRegOffset(BuildMI(&MBB, DebugLoc(), TII->get(MovOpc)), SPReg, false, 0)
- .addReg(Reg);
-}
-
-void X86RetpolineThunks::populateThunk(MachineFunction &MF,
- unsigned Reg) {
- // Set MF properties. We never use vregs...
- MF.getProperties().set(MachineFunctionProperties::Property::NoVRegs);
-
- // Grab the entry MBB and erase any other blocks. O0 codegen appears to
- // generate two bbs for the entry block.
- MachineBasicBlock *Entry = &MF.front();
- Entry->clear();
- while (MF.size() > 1)
- MF.erase(std::next(MF.begin()));
-
- MachineBasicBlock *CaptureSpec = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
- MachineBasicBlock *CallTarget = MF.CreateMachineBasicBlock(Entry->getBasicBlock());
- MCSymbol *TargetSym = MF.getContext().createTempSymbol();
- MF.push_back(CaptureSpec);
- MF.push_back(CallTarget);
-
- const unsigned CallOpc = Is64Bit ? X86::CALL64pcrel32 : X86::CALLpcrel32;
- const unsigned RetOpc = Is64Bit ? X86::RETQ : X86::RETL;
-
- Entry->addLiveIn(Reg);
- BuildMI(Entry, DebugLoc(), TII->get(CallOpc)).addSym(TargetSym);
-
- // The MIR verifier thinks that the CALL in the entry block will fall through
- // to CaptureSpec, so mark it as the successor. Technically, CaptureTarget is
- // the successor, but the MIR verifier doesn't know how to cope with that.
- Entry->addSuccessor(CaptureSpec);
-
- // In the capture loop for speculation, we want to stop the processor from
- // speculating as fast as possible. On Intel processors, the PAUSE instruction
- // will block speculation without consuming any execution resources. On AMD
- // processors, the PAUSE instruction is (essentially) a nop, so we also use an
- // LFENCE instruction which they have advised will stop speculation as well
- // with minimal resource utilization. We still end the capture with a jump to
- // form an infinite loop to fully guarantee that no matter what implementation
- // of the x86 ISA, speculating this code path never escapes.
- BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::PAUSE));
- BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::LFENCE));
- BuildMI(CaptureSpec, DebugLoc(), TII->get(X86::JMP_1)).addMBB(CaptureSpec);
- CaptureSpec->setHasAddressTaken();
- CaptureSpec->addSuccessor(CaptureSpec);
-
- CallTarget->addLiveIn(Reg);
- CallTarget->setHasAddressTaken();
- CallTarget->setAlignment(Align(16));
- insertRegReturnAddrClobber(*CallTarget, Reg);
- CallTarget->back().setPreInstrSymbol(MF, TargetSym);
- BuildMI(CallTarget, DebugLoc(), TII->get(RetOpc));
-}
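(Editor's sketch, not part of this patch.) For readers who have not seen retpolines in action, the snippet below shows the kind of indirect call that the thunks deleted above are meant to intercept on x86-64. Only the thunk name __llvm_retpoline_r11 and the use of %r11 as the call-target register are taken from the code and comments above; the compiler flag is an assumption about how such a build might be requested.

// retpoline_sketch.cpp -- hypothetical example; assumes retpolines are
// requested at compile time (e.g. something like `clang++ -O2 -mretpoline`).
using Callback = int (*)(int);

int invoke(Callback CB, int X) {
  // Without retpolines this compiles to a plain indirect call (e.g. `callq *%r11`).
  // With retpolines enabled, the call target is moved into %r11 and the
  // compiler emits `callq __llvm_retpoline_r11` instead; the thunk body
  // (the call/pause/lfence capture loop, then overwriting the return address
  // with %r11 and returning) is exactly the layout documented in the pass above.
  return CB(X);
}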
diff --git a/llvm/lib/Target/X86/X86SchedBroadwell.td b/llvm/lib/Target/X86/X86SchedBroadwell.td
index 9b1fcaa8a13d..4aea7bc253bb 100755
--- a/llvm/lib/Target/X86/X86SchedBroadwell.td
+++ b/llvm/lib/Target/X86/X86SchedBroadwell.td
@@ -260,7 +260,8 @@ defm : BWWriteResPair<WriteFCmp64X, [BWPort1], 3, [1], 1, 5>; // Floating point
defm : BWWriteResPair<WriteFCmp64Y, [BWPort1], 3, [1], 1, 6>; // Floating point double compare (YMM/ZMM).
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
-defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags.
+defm : BWWriteResPair<WriteFCom, [BWPort1], 3>; // Floating point compare to flags (X87).
+defm : BWWriteResPair<WriteFComX, [BWPort1], 3>; // Floating point compare to flags (SSE).
defm : BWWriteResPair<WriteFMul, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication.
defm : BWWriteResPair<WriteFMulX, [BWPort01], 3, [1], 1, 5>; // Floating point multiplication (XMM).
@@ -351,8 +352,10 @@ defm : X86WriteRes<WriteVecStoreX, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [BWPort237,BWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [BWPort237,BWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [BWPort0,BWPort4,BWPort237,BWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteVecMove, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [BWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [BWPort015], 1, [1], 1>;
@@ -986,7 +989,7 @@ def BWWriteResGroup62 : SchedWriteRes<[BWPort6,BWPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[BWWriteResGroup62], (instrs FARJMP64)>;
+def: InstRW<[BWWriteResGroup62], (instrs FARJMP64m)>;
def: InstRW<[BWWriteResGroup62], (instregex "JMP(16|32|64)m")>;
def BWWriteResGroup64 : SchedWriteRes<[BWPort23,BWPort15]> {
@@ -1127,7 +1130,7 @@ def BWWriteResGroup89 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[BWWriteResGroup89], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[BWWriteResGroup89], (instrs FARCALL64)>;
+def: InstRW<[BWWriteResGroup89], (instrs FARCALL64m)>;
def BWWriteResGroup90 : SchedWriteRes<[BWPort6,BWPort06,BWPort15,BWPort0156]> {
let Latency = 7;
@@ -1479,54 +1482,42 @@ def BWWriteResGroup182 : SchedWriteRes<[BWPort0,BWPort1,BWPort23]> {
def: InstRW<[BWWriteResGroup182], (instregex "DIVR_FI(16|32)m")>;
def BWWriteResGroup183_1 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 22;
+ let Latency = 17;
let NumMicroOps = 7;
let ResourceCycles = [1,3,2,1];
}
-def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERQPDrm)>;
+def: InstRW<[BWWriteResGroup183_1], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm)>;
def BWWriteResGroup183_2 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 23;
+ let Latency = 18;
let NumMicroOps = 9;
let ResourceCycles = [1,3,4,1];
}
-def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERQPDYrm)>;
+def: InstRW<[BWWriteResGroup183_2], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERQPDYrm, VPGATHERQQYrm)>;
def BWWriteResGroup183_3 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 24;
+ let Latency = 19;
let NumMicroOps = 9;
let ResourceCycles = [1,5,2,1];
}
-def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSYrm)>;
+def: InstRW<[BWWriteResGroup183_3], (instrs VGATHERQPSrm, VPGATHERQDrm)>;
def BWWriteResGroup183_4 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 25;
- let NumMicroOps = 7;
- let ResourceCycles = [1,3,2,1];
+ let Latency = 19;
+ let NumMicroOps = 10;
+ let ResourceCycles = [1,4,4,1];
}
-def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPDrm,
- VGATHERDPSrm)>;
+def: InstRW<[BWWriteResGroup183_4], (instrs VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
def BWWriteResGroup183_5 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 26;
- let NumMicroOps = 9;
- let ResourceCycles = [1,5,2,1];
-}
-def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPDYrm)>;
-
-def BWWriteResGroup183_6 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 26;
+ let Latency = 21;
let NumMicroOps = 14;
let ResourceCycles = [1,4,8,1];
}
-def: InstRW<[BWWriteResGroup183_6], (instrs VGATHERDPSYrm)>;
-
-def BWWriteResGroup183_7 : SchedWriteRes<[BWPort4, BWPort5, BWPort23, BWPort0156]> {
- let Latency = 27;
- let NumMicroOps = 9;
- let ResourceCycles = [1,5,2,1];
-}
-def: InstRW<[BWWriteResGroup183_7], (instrs VGATHERQPSrm)>;
+def: InstRW<[BWWriteResGroup183_5], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
def BWWriteResGroup185 : SchedWriteRes<[BWPort4,BWPort6,BWPort23,BWPort237,BWPort0156]> {
let Latency = 29;
@@ -1604,7 +1595,7 @@ def: InstRW<[BWWriteResGroup202], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Haswell and Broadwell Pipeline" > "Register allocation and
diff --git a/llvm/lib/Target/X86/X86SchedHaswell.td b/llvm/lib/Target/X86/X86SchedHaswell.td
index 06f417501b21..746dbaeca189 100644
--- a/llvm/lib/Target/X86/X86SchedHaswell.td
+++ b/llvm/lib/Target/X86/X86SchedHaswell.td
@@ -261,6 +261,7 @@ defm : HWWriteResPair<WriteFCmp64Y, [HWPort1], 3, [1], 1, 7>;
defm : HWWriteResPair<WriteFCmp64Z, [HWPort1], 3, [1], 1, 7>; // Unsupported = 1
defm : HWWriteResPair<WriteFCom, [HWPort1], 3>;
+defm : HWWriteResPair<WriteFComX, [HWPort1], 3>;
defm : HWWriteResPair<WriteFMul, [HWPort01], 5, [1], 1, 5>;
defm : HWWriteResPair<WriteFMulX, [HWPort01], 5, [1], 1, 6>;
@@ -391,8 +392,10 @@ defm : X86WriteRes<WriteVecStoreX, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [HWPort237,HWPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [HWPort237,HWPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [HWPort0,HWPort4,HWPort237,HWPort15], 5, [1,1,1,1], 4>;
defm : X86WriteRes<WriteVecMove, [HWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [HWPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [HWPort015], 1, [1], 1>;
@@ -996,7 +999,7 @@ def HWWriteResGroup14 : SchedWriteRes<[HWPort6,HWPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[HWWriteResGroup14], (instrs FARJMP64)>;
+def: InstRW<[HWWriteResGroup14], (instrs FARJMP64m)>;
def: InstRW<[HWWriteResGroup14], (instregex "JMP(16|32|64)m")>;
def HWWriteResGroup16 : SchedWriteRes<[HWPort23,HWPort15]> {
@@ -1205,7 +1208,7 @@ def HWWriteResGroup48 : SchedWriteRes<[HWPort4,HWPort6,HWPort23,HWPort237,HWPort
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[HWWriteResGroup48], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[HWWriteResGroup48], (instrs FARCALL64)>;
+def: InstRW<[HWWriteResGroup48], (instrs FARCALL64m)>;
def HWWriteResGroup50 : SchedWriteRes<[HWPort1]> {
let Latency = 3;
@@ -1784,80 +1787,60 @@ def HWWriteResGroup183 : SchedWriteRes<[HWPort0,HWPort1,HWPort4,HWPort5,HWPort6,
}
def: InstRW<[HWWriteResGroup183], (instrs FSTENVm)>;
-def HWWriteResGroup184 : SchedWriteRes<[HWPort0, HWPort5, HWPort15, HWPort015, HWPort06, HWPort23]> {
- let Latency = 26;
+def HWWriteResGroup184 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 14;
let NumMicroOps = 12;
- let ResourceCycles = [2,2,1,3,2,2];
-}
-def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm,
- VPGATHERDQrm,
- VPGATHERDDrm)>;
-
-def HWWriteResGroup185 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 24;
- let NumMicroOps = 22;
- let ResourceCycles = [5,3,4,1,5,4];
+ let ResourceCycles = [2,2,2,1,3,2];
}
-def: InstRW<[HWWriteResGroup185], (instrs VGATHERQPDYrm,
- VPGATHERQQYrm)>;
+def: InstRW<[HWWriteResGroup184], (instrs VGATHERDPDrm, VPGATHERDQrm)>;
-def HWWriteResGroup186 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 28;
- let NumMicroOps = 22;
- let ResourceCycles = [5,3,4,1,5,4];
-}
-def: InstRW<[HWWriteResGroup186], (instrs VPGATHERQDYrm)>;
-
-def HWWriteResGroup187 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 25;
- let NumMicroOps = 22;
- let ResourceCycles = [5,3,4,1,5,4];
+def HWWriteResGroup185 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 20;
+ let ResourceCycles = [3,3,4,1,5,4];
}
-def: InstRW<[HWWriteResGroup187], (instrs VPGATHERQDrm)>;
+def: InstRW<[HWWriteResGroup185], (instrs VGATHERDPDYrm, VPGATHERDQYrm)>;
-def HWWriteResGroup188 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 27;
+def HWWriteResGroup186 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 16;
let NumMicroOps = 20;
let ResourceCycles = [3,3,4,1,5,4];
}
-def: InstRW<[HWWriteResGroup188], (instrs VGATHERDPDYrm,
- VPGATHERDQYrm)>;
+def: InstRW<[HWWriteResGroup186], (instrs VGATHERDPSrm, VPGATHERDDrm)>;
-def HWWriteResGroup189 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 27;
+def HWWriteResGroup187 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 22;
let NumMicroOps = 34;
let ResourceCycles = [5,3,8,1,9,8];
}
-def: InstRW<[HWWriteResGroup189], (instrs VGATHERDPSYrm,
- VPGATHERDDYrm)>;
+def: InstRW<[HWWriteResGroup187], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
-def HWWriteResGroup190 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 23;
+def HWWriteResGroup188 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 15;
let NumMicroOps = 14;
let ResourceCycles = [3,3,2,1,3,2];
}
-def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPDrm,
- VPGATHERQQrm)>;
+def: InstRW<[HWWriteResGroup188], (instrs VGATHERQPDrm, VPGATHERQQrm)>;
-def HWWriteResGroup191 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 28;
- let NumMicroOps = 15;
- let ResourceCycles = [3,3,2,1,4,2];
+def HWWriteResGroup189 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 17;
+ let NumMicroOps = 22;
+ let ResourceCycles = [5,3,4,1,5,4];
}
-def: InstRW<[HWWriteResGroup191], (instrs VGATHERQPSYrm)>;
+def: InstRW<[HWWriteResGroup189], (instrs VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
-def HWWriteResGroup192 : SchedWriteRes<[HWPort0, HWPort5, HWPort06, HWPort15, HWPort015, HWPort23]> {
- let Latency = 25;
+def HWWriteResGroup190 : SchedWriteRes<[HWPort0,HWPort5,HWPort06,HWPort15,HWPort015,HWPort23]> {
+ let Latency = 16;
let NumMicroOps = 15;
let ResourceCycles = [3,3,2,1,4,2];
}
-def: InstRW<[HWWriteResGroup192], (instrs VGATHERQPSrm,
- VGATHERDPSrm)>;
+def: InstRW<[HWWriteResGroup190], (instrs VGATHERQPSrm, VPGATHERQDrm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Haswell and Broadwell Pipeline" > "Register allocation and
diff --git a/llvm/lib/Target/X86/X86SchedSandyBridge.td b/llvm/lib/Target/X86/X86SchedSandyBridge.td
index 26d4d8fa3549..ac32f1b19990 100644
--- a/llvm/lib/Target/X86/X86SchedSandyBridge.td
+++ b/llvm/lib/Target/X86/X86SchedSandyBridge.td
@@ -238,6 +238,7 @@ defm : SBWriteResPair<WriteFCmp64Y, [SBPort1], 3, [1], 1, 7>;
defm : SBWriteResPair<WriteFCmp64Z, [SBPort1], 3, [1], 1, 7>; // Unsupported = 1
defm : SBWriteResPair<WriteFCom, [SBPort1], 3>;
+defm : SBWriteResPair<WriteFComX, [SBPort1], 3>;
defm : SBWriteResPair<WriteFMul, [SBPort0], 5, [1], 1, 6>;
defm : SBWriteResPair<WriteFMulX, [SBPort0], 5, [1], 1, 6>;
@@ -366,8 +367,10 @@ defm : X86WriteRes<WriteVecStoreX, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteVecStoreY, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteVecStoreNT, [SBPort23,SBPort4], 1, [1,1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [SBPort23,SBPort4], 1, [1,1], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SBPort4,SBPort01,SBPort23], 5, [1,1,1], 3>;
defm : X86WriteRes<WriteVecMove, [SBPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SBPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SBPort05], 1, [1], 1>;
@@ -481,7 +484,7 @@ def : WriteRes<WritePCmpEStrM, [SBPort015]> {
let ResourceCycles = [8];
}
def : WriteRes<WritePCmpEStrMLd, [SBPort015, SBPort23]> {
- let Latency = 11;
+ let Latency = 17;
let ResourceCycles = [7, 1];
}
@@ -503,7 +506,7 @@ def : WriteRes<WritePCmpEStrI, [SBPort015]> {
let ResourceCycles = [8];
}
def : WriteRes<WritePCmpEStrILd, [SBPort015, SBPort23]> {
- let Latency = 4;
+ let Latency = 10;
let ResourceCycles = [7, 1];
}
@@ -541,7 +544,7 @@ def : WriteRes<WriteAESKeyGen, [SBPort015]> {
let ResourceCycles = [11];
}
def : WriteRes<WriteAESKeyGenLd, [SBPort015, SBPort23]> {
- let Latency = 8;
+ let Latency = 14;
let ResourceCycles = [10, 1];
}
@@ -551,7 +554,7 @@ def : WriteRes<WriteCLMul, [SBPort015]> {
let ResourceCycles = [18];
}
def : WriteRes<WriteCLMulLd, [SBPort015, SBPort23]> {
- let Latency = 14;
+ let Latency = 20;
let ResourceCycles = [17, 1];
}
@@ -881,7 +884,7 @@ def SBWriteResGroup64 : SchedWriteRes<[SBPort5,SBPort01,SBPort23]> {
let NumMicroOps = 3;
let ResourceCycles = [1,1,1];
}
-def: InstRW<[SBWriteResGroup64], (instrs FARJMP64)>;
+def: InstRW<[SBWriteResGroup64], (instrs FARJMP64m)>;
def SBWriteResGroup66 : SchedWriteRes<[SBPort0,SBPort4,SBPort23]> {
let Latency = 7;
@@ -967,7 +970,7 @@ def SBWriteResGroup87 : SchedWriteRes<[SBPort4,SBPort5,SBPort01,SBPort23]> {
let NumMicroOps = 5;
let ResourceCycles = [1,1,1,2];
}
-def: InstRW<[SBWriteResGroup87], (instrs FARCALL64)>;
+def: InstRW<[SBWriteResGroup87], (instrs FARCALL64m)>;
def SBWriteResGroup93 : SchedWriteRes<[SBPort0,SBPort1,SBPort23]> {
let Latency = 9;
@@ -1105,7 +1108,7 @@ def: InstRW<[SBWriteResGroupVzeroupper], (instrs VZEROUPPER)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Sandy Bridge and Ivy Bridge Pipeline" > "Register allocation and
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeClient.td b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
index 9a511ecc0071..0599564765da 100644
--- a/llvm/lib/Target/X86/X86SchedSkylakeClient.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeClient.td
@@ -255,7 +255,8 @@ defm : SKLWriteResPair<WriteFCmp64X, [SKLPort01], 4, [1], 1, 6>;
defm : SKLWriteResPair<WriteFCmp64Y, [SKLPort01], 4, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
-defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags.
+defm : SKLWriteResPair<WriteFCom, [SKLPort0], 2>; // Floating point compare to flags (X87).
+defm : SKLWriteResPair<WriteFComX, [SKLPort0], 2>; // Floating point compare to flags (SSE).
defm : SKLWriteResPair<WriteFMul, [SKLPort01], 4, [1], 1, 5>; // Floating point multiplication.
defm : SKLWriteResPair<WriteFMulX, [SKLPort01], 4, [1], 1, 6>;
@@ -342,8 +343,10 @@ defm : X86WriteRes<WriteVecStoreX, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [SKLPort237,SKLPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [SKLPort237,SKLPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [SKLPort237,SKLPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SKLPort237,SKLPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SKLPort237,SKLPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [SKLPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SKLPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SKLPort015], 1, [1], 1>;
@@ -361,9 +364,9 @@ defm : X86WriteResPairUnsupported<WriteVecLogicZ>;
defm : SKLWriteResPair<WriteVecTest, [SKLPort0,SKLPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
defm : SKLWriteResPair<WriteVecTestY, [SKLPort0,SKLPort5], 3, [1,1], 2, 7>;
defm : X86WriteResPairUnsupported<WriteVecTestZ>;
-defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 4, [1], 1, 5>; // Vector integer multiply.
-defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 4, [1], 1, 6>;
-defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 4, [1], 1, 7>;
+defm : SKLWriteResPair<WriteVecIMul, [SKLPort0] , 5, [1], 1, 5>; // Vector integer multiply.
+defm : SKLWriteResPair<WriteVecIMulX, [SKLPort01], 5, [1], 1, 6>;
+defm : SKLWriteResPair<WriteVecIMulY, [SKLPort01], 5, [1], 1, 7>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : SKLWriteResPair<WritePMULLD, [SKLPort01], 10, [2], 2, 6>; // Vector PMULLD.
defm : SKLWriteResPair<WritePMULLDY, [SKLPort01], 10, [2], 2, 7>;
@@ -1012,7 +1015,7 @@ def SKLWriteResGroup72 : SchedWriteRes<[SKLPort6,SKLPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64)>;
+def: InstRW<[SKLWriteResGroup72], (instrs FARJMP64m)>;
def: InstRW<[SKLWriteResGroup72], (instregex "JMP(16|32|64)m")>;
def SKLWriteResGroup75 : SchedWriteRes<[SKLPort23,SKLPort15]> {
@@ -1193,7 +1196,7 @@ def SKLWriteResGroup102 : SchedWriteRes<[SKLPort4,SKLPort6,SKLPort23,SKLPort237,
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[SKLWriteResGroup102], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64)>;
+def: InstRW<[SKLWriteResGroup102], (instrs FARCALL64m)>;
def SKLWriteResGroup103 : SchedWriteRes<[SKLPort6,SKLPort06,SKLPort15,SKLPort0156]> {
let Latency = 7;
@@ -1592,33 +1595,31 @@ def SKLWriteResGroup196 : SchedWriteRes<[SKLPort0,SKLPort23]> {
}
def: InstRW<[SKLWriteResGroup196], (instregex "DIV_F(32|64)m")>;
-def SKLWriteResGroup196_1 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
- let Latency = 22;
- let NumMicroOps = 5;
+def SKLWriteResGroupVEX2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKLWriteResGroup196_1], (instrs VGATHERDPSrm,
- VGATHERDPDrm,
- VGATHERQPDrm,
- VGATHERQPSrm,
- VPGATHERDDrm,
- VPGATHERDQrm,
- VPGATHERQDrm,
- VPGATHERQQrm)>;
+def: InstRW<[SKLWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
-def SKLWriteResGroup196_2 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
- let Latency = 25;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
+def SKLWriteResGroupVEX4 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 20;
+  let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKLWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def SKLWriteResGroupVEX8 : SchedWriteRes<[SKLPort0, SKLPort23, SKLPort5, SKLPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
}
-def: InstRW<[SKLWriteResGroup196_2], (instrs VGATHERDPSYrm,
- VGATHERQPDYrm,
- VGATHERQPSYrm,
- VPGATHERDDYrm,
- VPGATHERDQYrm,
- VPGATHERQDYrm,
- VPGATHERQQYrm,
- VGATHERDPDYrm)>;
+def: InstRW<[SKLWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
def SKLWriteResGroup198 : SchedWriteRes<[SKLPort0,SKLPort4,SKLPort5,SKLPort23,SKLPort237,SKLPort06,SKLPort0156]> {
let Latency = 23;
@@ -1745,7 +1746,7 @@ def: InstRW<[SKLWriteResGroup223], (instrs FSTENVm)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Skylake Pipeline" > "Register allocation and renaming".
diff --git a/llvm/lib/Target/X86/X86SchedSkylakeServer.td b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
index a8c65435ab9b..7fc96d1eda89 100755
--- a/llvm/lib/Target/X86/X86SchedSkylakeServer.td
+++ b/llvm/lib/Target/X86/X86SchedSkylakeServer.td
@@ -255,7 +255,8 @@ defm : SKXWriteResPair<WriteFCmp64X, [SKXPort01], 4, [1], 1, 6>;
defm : SKXWriteResPair<WriteFCmp64Y, [SKXPort01], 4, [1], 1, 7>;
defm : SKXWriteResPair<WriteFCmp64Z, [SKXPort05], 4, [1], 1, 7>;
-defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags.
+defm : SKXWriteResPair<WriteFCom, [SKXPort0], 2>; // Floating point compare to flags (X87).
+defm : SKXWriteResPair<WriteFComX, [SKXPort0], 2>; // Floating point compare to flags (SSE).
defm : SKXWriteResPair<WriteFMul, [SKXPort01], 4, [1], 1, 5>; // Floating point multiplication.
defm : SKXWriteResPair<WriteFMulX, [SKXPort01], 4, [1], 1, 6>;
@@ -342,8 +343,10 @@ defm : X86WriteRes<WriteVecStoreX, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNT, [SKXPort237,SKXPort4], 1, [1,1], 2>;
defm : X86WriteRes<WriteVecStoreNTY, [SKXPort237,SKXPort4], 1, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStore, [SKXPort237,SKXPort0], 2, [1,1], 2>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [SKXPort237,SKXPort0], 2, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [SKXPort237,SKXPort0], 2, [1,1], 2>;
defm : X86WriteRes<WriteVecMove, [SKXPort05], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [SKXPort015], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [SKXPort015], 1, [1], 1>;
@@ -361,10 +364,10 @@ defm : SKXWriteResPair<WriteVecLogicZ,[SKXPort05], 1, [1], 1, 7>;
defm : SKXWriteResPair<WriteVecTest, [SKXPort0,SKXPort5], 3, [1,1], 2, 6>; // Vector integer TEST instructions.
defm : SKXWriteResPair<WriteVecTestY, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
defm : SKXWriteResPair<WriteVecTestZ, [SKXPort0,SKXPort5], 3, [1,1], 2, 7>;
-defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 4, [1], 1, 5>; // Vector integer multiply.
-defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 4, [1], 1, 6>;
-defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 4, [1], 1, 7>;
-defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 4, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMul, [SKXPort0], 5, [1], 1, 5>; // Vector integer multiply.
+defm : SKXWriteResPair<WriteVecIMulX, [SKXPort01], 5, [1], 1, 6>;
+defm : SKXWriteResPair<WriteVecIMulY, [SKXPort01], 5, [1], 1, 7>;
+defm : SKXWriteResPair<WriteVecIMulZ, [SKXPort05], 5, [1], 1, 7>;
defm : SKXWriteResPair<WritePMULLD, [SKXPort01], 10, [2], 2, 6>; // Vector PMULLD.
defm : SKXWriteResPair<WritePMULLDY, [SKXPort01], 10, [2], 2, 7>;
defm : SKXWriteResPair<WritePMULLDZ, [SKXPort05], 10, [2], 2, 7>;
@@ -619,6 +622,8 @@ def: InstRW<[SKXWriteResGroup1], (instregex "KAND(B|D|Q|W)rr",
"KOR(B|D|Q|W)rr",
"KXNOR(B|D|Q|W)rr",
"KXOR(B|D|Q|W)rr",
+ "KSET0(B|D|Q|W)", // Same as KXOR
+ "KSET1(B|D|Q|W)", // Same as KXNOR
"MMX_PADDS(B|W)irr",
"MMX_PADDUS(B|W)irr",
"MMX_PAVG(B|W)irr",
@@ -814,19 +819,26 @@ def SKXWriteResGroup32 : SchedWriteRes<[SKXPort5]> {
}
def: InstRW<[SKXWriteResGroup32], (instrs VPSADBWZrr)>; // TODO: 512-bit ops require ports 0/1 to be joined.
def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0)",
- "KADD(B|D|Q|W)rr",
+ "VALIGND(Z|Z128|Z256)rri",
+ "VALIGNQ(Z|Z128|Z256)rri",
+ "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
+ "VPBROADCAST(B|W)rr",
+ "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr")>;
+
+def SKXWriteResGroup33 : SchedWriteRes<[SKXPort5]> {
+ let Latency = 4;
+ let NumMicroOps = 1;
+ let ResourceCycles = [1];
+}
+def: InstRW<[SKXWriteResGroup33], (instregex "KADD(B|D|Q|W)rr",
"KSHIFTL(B|D|Q|W)ri",
"KSHIFTR(B|D|Q|W)ri",
"KUNPCK(BW|DQ|WD)rr",
- "VALIGND(Z|Z128|Z256)rri",
- "VALIGNQ(Z|Z128|Z256)rri",
"VCMPPD(Z|Z128|Z256)rri",
"VCMPPS(Z|Z128|Z256)rri",
"VCMP(SD|SS)Zrr",
- "VDBPSADBWZrri", // TODO: 512-bit ops require ports 0/1 to be joined.
"VFPCLASS(PD|PS)(Z|Z128|Z256)rr",
"VFPCLASS(SD|SS)Zrr",
- "VPBROADCAST(B|W)rr",
"VPCMPB(Z|Z128|Z256)rri",
"VPCMPD(Z|Z128|Z256)rri",
"VPCMPEQ(B|D|Q|W)(Z|Z128|Z256)rr",
@@ -834,7 +846,6 @@ def: InstRW<[SKXWriteResGroup32], (instregex "(ADD|SUB|SUBR)_(FPrST0|FST0r|FrST0
"VPCMPQ(Z|Z128|Z256)rri",
"VPCMPU(B|D|Q|W)(Z|Z128|Z256)rri",
"VPCMPW(Z|Z128|Z256)rri",
- "VP(MAX|MIN)(S|U)Q(Z|Z128|Z256)rr",
"VPTEST(N?)M(B|D|Q|W)(Z|Z128|Z256)rr")>;
def SKXWriteResGroup34 : SchedWriteRes<[SKXPort0,SKXPort0156]> {
@@ -1171,7 +1182,7 @@ def SKXWriteResGroup76 : SchedWriteRes<[SKXPort6,SKXPort23]> {
let NumMicroOps = 2;
let ResourceCycles = [1,1];
}
-def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64)>;
+def: InstRW<[SKXWriteResGroup76], (instrs FARJMP64m)>;
def: InstRW<[SKXWriteResGroup76], (instregex "JMP(16|32|64)m")>;
def SKXWriteResGroup79 : SchedWriteRes<[SKXPort23,SKXPort15]> {
@@ -1331,8 +1342,8 @@ def: InstRW<[SKXWriteResGroup95], (instrs VMOVNTDQAZ128rm,
def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
(instregex "VBLENDMPDZ128rm(b?)",
"VBLENDMPSZ128rm(b?)",
- "VBROADCASTI32X2Z128m(b?)",
- "VBROADCASTSSZ128m(b?)",
+ "VBROADCASTI32X2Z128rm(b?)",
+ "VBROADCASTSSZ128rm(b?)",
"VINSERT(F|I)128rm",
"VMOVAPDZ128rm(b?)",
"VMOVAPSZ128rm(b?)",
@@ -1350,8 +1361,8 @@ def: InstRW<[SKXWriteResGroup95, ReadAfterVecXLd],
"VPADD(B|D|Q|W)Z128rm(b?)",
"(V?)PADD(B|D|Q|W)rm",
"VPBLENDM(B|D|Q|W)Z128rm(b?)",
- "VPBROADCASTDZ128m(b?)",
- "VPBROADCASTQZ128m(b?)",
+ "VPBROADCASTDZ128rm(b?)",
+ "VPBROADCASTQZ128rm(b?)",
"VPSUB(B|D|Q|W)Z128rm(b?)",
"(V?)PSUB(B|D|Q|W)rm",
"VPTERNLOGDZ128rm(b?)i",
@@ -1456,7 +1467,7 @@ def SKXWriteResGroup109 : SchedWriteRes<[SKXPort4,SKXPort6,SKXPort23,SKXPort237,
let ResourceCycles = [1,1,1,1,1];
}
def: InstRW<[SKXWriteResGroup109], (instregex "CALL(16|32|64)m")>;
-def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64)>;
+def: InstRW<[SKXWriteResGroup109], (instrs FARCALL64m)>;
def SKXWriteResGroup110 : SchedWriteRes<[SKXPort0,SKXPort4,SKXPort237,SKXPort0156]> {
let Latency = 7;
@@ -1516,9 +1527,8 @@ def SKXWriteResGroup119 : SchedWriteRes<[SKXPort5,SKXPort23]> {
let ResourceCycles = [1,1];
}
def: InstRW<[SKXWriteResGroup119], (instregex "FCOM(P?)(32|64)m",
- "VFPCLASSSDZrm(b?)",
- "VPBROADCASTB(Z|Z256)m(b?)",
- "VPBROADCASTW(Z|Z256)m(b?)")>;
+ "VPBROADCASTB(Z|Z256)rm(b?)",
+ "VPBROADCASTW(Z|Z256)rm(b?)")>;
def: InstRW<[SKXWriteResGroup119], (instrs VPBROADCASTBYrm,
VPBROADCASTWYrm,
VPMOVSXBDYrm,
@@ -1535,24 +1545,24 @@ def: InstRW<[SKXWriteResGroup121], (instrs VMOVNTDQAZ256rm,
def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
(instregex "VBLENDMPD(Z|Z256)rm(b?)",
"VBLENDMPS(Z|Z256)rm(b?)",
- "VBROADCASTF32X2Z256m(b?)",
- "VBROADCASTF32X2Zm(b?)",
+ "VBROADCASTF32X2Z256rm(b?)",
+ "VBROADCASTF32X2Zrm(b?)",
"VBROADCASTF32X4Z256rm(b?)",
"VBROADCASTF32X4rm(b?)",
"VBROADCASTF32X8rm(b?)",
"VBROADCASTF64X2Z128rm(b?)",
"VBROADCASTF64X2rm(b?)",
"VBROADCASTF64X4rm(b?)",
- "VBROADCASTI32X2Z256m(b?)",
- "VBROADCASTI32X2Zm(b?)",
+ "VBROADCASTI32X2Z256rm(b?)",
+ "VBROADCASTI32X2Zrm(b?)",
"VBROADCASTI32X4Z256rm(b?)",
"VBROADCASTI32X4rm(b?)",
"VBROADCASTI32X8rm(b?)",
"VBROADCASTI64X2Z128rm(b?)",
"VBROADCASTI64X2rm(b?)",
"VBROADCASTI64X4rm(b?)",
- "VBROADCASTSD(Z|Z256)m(b?)",
- "VBROADCASTSS(Z|Z256)m(b?)",
+ "VBROADCASTSD(Z|Z256)rm(b?)",
+ "VBROADCASTSS(Z|Z256)rm(b?)",
"VINSERTF32x4(Z|Z256)rm(b?)",
"VINSERTF32x8Zrm(b?)",
"VINSERTF64x2(Z|Z256)rm(b?)",
@@ -1577,8 +1587,8 @@ def: InstRW<[SKXWriteResGroup121, ReadAfterVecYLd],
"VPADD(B|D|Q|W)Yrm",
"VPADD(B|D|Q|W)(Z|Z256)rm(b?)",
"VPBLENDM(B|D|Q|W)(Z|Z256)rm(b?)",
- "VPBROADCASTD(Z|Z256)m(b?)",
- "VPBROADCASTQ(Z|Z256)m(b?)",
+ "VPBROADCASTD(Z|Z256)rm(b?)",
+ "VPBROADCASTQ(Z|Z256)rm(b?)",
"VPSUB(B|D|Q|W)Yrm",
"VPSUB(B|D|Q|W)(Z|Z256)rm(b?)",
"VPTERNLOGD(Z|Z256)rm(b?)i",
@@ -1667,17 +1677,9 @@ def: InstRW<[SKXWriteResGroup136], (instrs VPMOVSXBWYrm,
VPMOVSXWDYrm,
VPMOVZXWDYrm)>;
def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
- "VCMP(PD|PS)Z128rm(b?)i",
- "VCMP(SD|SS)Zrm",
+ "VFPCLASSSDZrm(b?)",
"VFPCLASSSSZrm(b?)",
- "VPCMPBZ128rmi(b?)",
- "VPCMPDZ128rmi(b?)",
- "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
- "VPCMPGT(B|D|Q|W)Z128rm(b?)",
"(V?)PCMPGTQrm",
- "VPCMPQZ128rmi(b?)",
- "VPCMPU(B|D|Q|W)Z128rmi(b?)",
- "VPCMPWZ128rmi(b?)",
"VPERMI2D128rm(b?)",
"VPERMI2PD128rm(b?)",
"VPERMI2PS128rm(b?)",
@@ -1701,15 +1703,32 @@ def: InstRW<[SKXWriteResGroup136], (instregex "VALIGN(D|Q)Z128rm(b?)i",
"VPMOVZXBWZ128rm(b?)",
"VPMOVZXDQZ128rm(b?)",
"VPMOVZXWDZ128rm(b?)",
- "VPMOVZXWQZ128rm(b?)",
- "VPTESTMBZ128rm(b?)",
- "VPTESTMDZ128rm(b?)",
- "VPTESTMQZ128rm(b?)",
- "VPTESTMWZ128rm(b?)",
- "VPTESTNMBZ128rm(b?)",
- "VPTESTNMDZ128rm(b?)",
- "VPTESTNMQZ128rm(b?)",
- "VPTESTNMWZ128rm(b?)")>;
+ "VPMOVZXWQZ128rm(b?)")>;
+
+def SKXWriteResGroup136_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 10;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup136_2], (instregex "VCMP(PD|PS)Z128rm(b?)i",
+ "VCMP(SD|SS)Zrm",
+ "VFPCLASSPDZ128rm(b?)",
+ "VFPCLASSPSZ128rm(b?)",
+ "VPCMPBZ128rmi(b?)",
+ "VPCMPDZ128rmi(b?)",
+ "VPCMPEQ(B|D|Q|W)Z128rm(b?)",
+ "VPCMPGT(B|D|Q|W)Z128rm(b?)",
+ "VPCMPQZ128rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z128rmi(b?)",
+ "VPCMPWZ128rmi(b?)",
+ "VPTESTMBZ128rm(b?)",
+ "VPTESTMDZ128rm(b?)",
+ "VPTESTMQZ128rm(b?)",
+ "VPTESTMWZ128rm(b?)",
+ "VPTESTNMBZ128rm(b?)",
+ "VPTESTNMDZ128rm(b?)",
+ "VPTESTNMQZ128rm(b?)",
+ "VPTESTNMWZ128rm(b?)")>;
def SKXWriteResGroup137 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 9;
@@ -1745,30 +1764,38 @@ def: InstRW<[SKXWriteResGroup148], (instregex "(ADD|SUB|SUBR)_F(32|64)m",
"ILD_F(16|32|64)m",
"VALIGND(Z|Z256)rm(b?)i",
"VALIGNQ(Z|Z256)rm(b?)i",
- "VCMPPD(Z|Z256)rm(b?)i",
- "VCMPPS(Z|Z256)rm(b?)i",
- "VPCMPB(Z|Z256)rmi(b?)",
- "VPCMPD(Z|Z256)rmi(b?)",
- "VPCMPEQB(Z|Z256)rm(b?)",
- "VPCMPEQD(Z|Z256)rm(b?)",
- "VPCMPEQQ(Z|Z256)rm(b?)",
- "VPCMPEQW(Z|Z256)rm(b?)",
- "VPCMPGTB(Z|Z256)rm(b?)",
- "VPCMPGTD(Z|Z256)rm(b?)",
- "VPCMPGTQ(Z|Z256)rm(b?)",
- "VPCMPGTW(Z|Z256)rm(b?)",
- "VPCMPQ(Z|Z256)rmi(b?)",
- "VPCMPU(B|D|Q|W)Z256rmi(b?)",
- "VPCMPU(B|D|Q|W)Zrmi(b?)",
- "VPCMPW(Z|Z256)rmi(b?)",
"VPMAXSQ(Z|Z256)rm(b?)",
"VPMAXUQ(Z|Z256)rm(b?)",
"VPMINSQ(Z|Z256)rm(b?)",
- "VPMINUQ(Z|Z256)rm(b?)",
- "VPTESTM(B|D|Q|W)Z256rm(b?)",
- "VPTESTM(B|D|Q|W)Zrm(b?)",
- "VPTESTNM(B|D|Q|W)Z256rm(b?)",
- "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
+ "VPMINUQ(Z|Z256)rm(b?)")>;
+
+def SKXWriteResGroup148_2 : SchedWriteRes<[SKXPort5,SKXPort23]> {
+ let Latency = 11;
+ let NumMicroOps = 2;
+ let ResourceCycles = [1,1];
+}
+def: InstRW<[SKXWriteResGroup148_2], (instregex "VCMPPD(Z|Z256)rm(b?)i",
+ "VCMPPS(Z|Z256)rm(b?)i",
+ "VFPCLASSPD(Z|Z256)rm(b?)",
+ "VFPCLASSPS(Z|Z256)rm(b?)",
+ "VPCMPB(Z|Z256)rmi(b?)",
+ "VPCMPD(Z|Z256)rmi(b?)",
+ "VPCMPEQB(Z|Z256)rm(b?)",
+ "VPCMPEQD(Z|Z256)rm(b?)",
+ "VPCMPEQQ(Z|Z256)rm(b?)",
+ "VPCMPEQW(Z|Z256)rm(b?)",
+ "VPCMPGTB(Z|Z256)rm(b?)",
+ "VPCMPGTD(Z|Z256)rm(b?)",
+ "VPCMPGTQ(Z|Z256)rm(b?)",
+ "VPCMPGTW(Z|Z256)rm(b?)",
+ "VPCMPQ(Z|Z256)rmi(b?)",
+ "VPCMPU(B|D|Q|W)Z256rmi(b?)",
+ "VPCMPU(B|D|Q|W)Zrmi(b?)",
+ "VPCMPW(Z|Z256)rmi(b?)",
+ "VPTESTM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTM(B|D|Q|W)Zrm(b?)",
+ "VPTESTNM(B|D|Q|W)Z256rm(b?)",
+ "VPTESTNM(B|D|Q|W)Zrm(b?)")>;
def SKXWriteResGroup149 : SchedWriteRes<[SKXPort23,SKXPort015]> {
let Latency = 10;
@@ -1938,14 +1965,14 @@ def SKXWriteResGroup171 : SchedWriteRes<[SKXPort06,SKXPort0156]> {
def: InstRW<[SKXWriteResGroup171], (instrs LOOPE, LOOPNE)>;
def SKXWriteResGroup174 : SchedWriteRes<[SKXPort01]> {
- let Latency = 12;
+ let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
def: InstRW<[SKXWriteResGroup174], (instregex "VPMULLQ(Z128|Z256)rr")>;
def SKXWriteResGroup174z : SchedWriteRes<[SKXPort05]> {
- let Latency = 12;
+ let Latency = 15;
let NumMicroOps = 3;
let ResourceCycles = [3];
}
@@ -2106,8 +2133,8 @@ def SKXWriteResGroup202 : SchedWriteRes<[SKXPort0,SKXPort1,SKXPort5,SKXPort6,SKX
}
def: InstRW<[SKXWriteResGroup202], (instrs XCH_F)>;
-def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 18;
+def SKXWriteResGroup205 : SchedWriteRes<[SKXPort23,SKXPort01]> {
+ let Latency = 21;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
@@ -2134,21 +2161,19 @@ def SKXWriteResGroup209 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
}
def : SchedAlias<WriteFDiv64Ld, SKXWriteResGroup209>; // TODO - convert to ZnWriteResFpuPair
-def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort015]> {
- let Latency = 19;
+def SKXWriteResGroup211 : SchedWriteRes<[SKXPort23,SKXPort01]> {
+ let Latency = 22;
let NumMicroOps = 4;
let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)",
- "VPMULLQZrm(b?)")>;
+def: InstRW<[SKXWriteResGroup211], (instregex "VPMULLQZ256rm(b?)")>;
-def SKXWriteResGroup214 : SchedWriteRes<[]> {
- let Latency = 20;
- let NumMicroOps = 0;
+def SKXWriteResGroup211_1 : SchedWriteRes<[SKXPort23,SKXPort05]> {
+ let Latency = 22;
+ let NumMicroOps = 4;
+ let ResourceCycles = [1,3];
}
-def: InstRW<[SKXWriteResGroup214], (instrs VGATHERDPSZ128rm,
- VGATHERQPSZrm,
- VPGATHERDDZ128rm)>;
+def: InstRW<[SKXWriteResGroup211_1], (instregex "VPMULLQZrm(b?)")>;
def SKXWriteResGroup215 : SchedWriteRes<[SKXPort0]> {
let Latency = 20;
@@ -2164,15 +2189,41 @@ def SKXWriteResGroup216 : SchedWriteRes<[SKXPort0,SKXPort23,SKXFPDivider]> {
}
def : SchedAlias<WriteFDiv64XLd, SKXWriteResGroup216>; // TODO - convert to ZnWriteResFpuPair
-def SKXWriteResGroup218 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 20;
- let NumMicroOps = 5;
+def SKXWriteGatherEVEX2 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 17;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKXWriteResGroup218], (instrs VGATHERQPSZ128rm,
- VGATHERQPSZ256rm,
- VPGATHERQDZ128rm,
- VPGATHERQDZ256rm)>;
+def: InstRW<[SKXWriteGatherEVEX2], (instrs VGATHERQPSZ128rm, VPGATHERQDZ128rm,
+ VGATHERDPDZ128rm, VPGATHERDQZ128rm,
+ VGATHERQPDZ128rm, VPGATHERQQZ128rm)>;
+
+def SKXWriteGatherEVEX4 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 19;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX4], (instrs VGATHERQPSZ256rm, VPGATHERQDZ256rm,
+ VGATHERQPDZ256rm, VPGATHERQQZ256rm,
+ VGATHERDPSZ128rm, VPGATHERDDZ128rm,
+ VGATHERDPDZ256rm, VPGATHERDQZ256rm)>;
+
+def SKXWriteGatherEVEX8 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 21;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX8], (instrs VGATHERDPSZ256rm, VPGATHERDDZ256rm,
+ VGATHERDPDZrm, VPGATHERDQZrm,
+ VGATHERQPDZrm, VPGATHERQQZrm,
+ VGATHERQPSZrm, VPGATHERQDZrm)>;
+
+def SKXWriteGatherEVEX16 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
+ let Latency = 25;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,16,1,1];
+}
+def: InstRW<[SKXWriteGatherEVEX16], (instrs VGATHERDPSZrm, VPGATHERDDZrm)>;
def SKXWriteResGroup219 : SchedWriteRes<[SKXPort4,SKXPort5,SKXPort6,SKXPort23,SKXPort237,SKXPort06,SKXPort0156]> {
let Latency = 20;
@@ -2202,57 +2253,31 @@ def SKXWriteResGroup223 : SchedWriteRes<[SKXPort0,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup223], (instregex "DIV_F(32|64)m")>;
-def SKXWriteResGroup224 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 22;
- let NumMicroOps = 5;
+def SKXWriteResGroupVEX2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 18;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
let ResourceCycles = [1,2,1,1];
}
-def: InstRW<[SKXWriteResGroup224], (instrs VGATHERDPDZ128rm,
- VGATHERQPDZ128rm,
- VPGATHERDQZ128rm,
- VPGATHERQQZ128rm)>;
+def: InstRW<[SKXWriteResGroupVEX2], (instrs VGATHERDPDrm, VPGATHERDQrm,
+ VGATHERQPDrm, VPGATHERQQrm,
+ VGATHERQPSrm, VPGATHERQDrm)>;
-def SKXWriteResGroup224_2 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
- let Latency = 22;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
+def SKXWriteResGroupVEX4 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 20;
+  let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,4,1,1];
}
-def: InstRW<[SKXWriteResGroup224_2], (instrs VGATHERDPSrm,
- VGATHERDPDrm,
- VGATHERQPDrm,
- VGATHERQPSrm,
- VPGATHERDDrm,
- VPGATHERDQrm,
- VPGATHERQDrm,
- VPGATHERQQrm,
- VPGATHERDDrm,
- VPGATHERQDrm,
- VPGATHERDQrm,
- VPGATHERQQrm,
- VGATHERDPSrm,
- VGATHERQPSrm,
- VGATHERDPDrm,
- VGATHERQPDrm)>;
-
-def SKXWriteResGroup224_3 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
- let Latency = 25;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
+def: InstRW<[SKXWriteResGroupVEX4], (instrs VGATHERDPDYrm, VPGATHERDQYrm,
+ VGATHERDPSrm, VPGATHERDDrm,
+ VGATHERQPDYrm, VPGATHERQQYrm,
+ VGATHERQPSYrm, VPGATHERQDYrm)>;
+
+def SKXWriteResGroupVEX8 : SchedWriteRes<[SKXPort0, SKXPort23, SKXPort5, SKXPort015]> {
+ let Latency = 22;
+ let NumMicroOps = 5; // 2 uops perform multiple loads
+ let ResourceCycles = [1,8,1,1];
}
-def: InstRW<[SKXWriteResGroup224_3], (instrs VGATHERDPSYrm,
- VGATHERQPDYrm,
- VGATHERQPSYrm,
- VPGATHERDDYrm,
- VPGATHERDQYrm,
- VPGATHERQDYrm,
- VPGATHERQQYrm,
- VPGATHERDDYrm,
- VPGATHERQDYrm,
- VPGATHERDQYrm,
- VPGATHERQQYrm,
- VGATHERDPSYrm,
- VGATHERQPSYrm,
- VGATHERDPDYrm)>;
+def: InstRW<[SKXWriteResGroupVEX8], (instrs VGATHERDPSYrm, VPGATHERDDYrm)>;
def SKXWriteResGroup225 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort015]> {
let Latency = 22;
@@ -2276,27 +2301,6 @@ def SKXWriteResGroup233 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup233], (instregex "DIV_FI(16|32)m")>;
-def SKXWriteResGroup234 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 25;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup234], (instrs VGATHERDPDZ256rm,
- VGATHERQPDZ256rm,
- VPGATHERDQZ256rm,
- VPGATHERQDZrm,
- VPGATHERQQZ256rm)>;
-
-def SKXWriteResGroup238 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 26;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup238], (instrs VGATHERDPDZrm,
- VGATHERQPDZrm,
- VPGATHERDQZrm,
- VPGATHERQQZrm)>;
-
def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
let Latency = 27;
let NumMicroOps = 2;
@@ -2304,14 +2308,6 @@ def SKXWriteResGroup239 : SchedWriteRes<[SKXPort0,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup239], (instregex "DIVR_F(32|64)m")>;
-def SKXWriteResGroup240 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 27;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup240], (instrs VGATHERDPSZ256rm,
- VPGATHERDDZ256rm)>;
-
def SKXWriteResGroup242 : SchedWriteRes<[SKXPort5,SKXPort01,SKXPort23,SKXPort015]> {
let Latency = 29;
let NumMicroOps = 15;
@@ -2326,14 +2322,6 @@ def SKXWriteResGroup243 : SchedWriteRes<[SKXPort0,SKXPort5,SKXPort23]> {
}
def: InstRW<[SKXWriteResGroup243], (instregex "DIVR_FI(16|32)m")>;
-def SKXWriteResGroup245 : SchedWriteRes<[SKXPort0,SKXPort23,SKXPort015,SKXPort0156]> {
- let Latency = 30;
- let NumMicroOps = 5;
- let ResourceCycles = [1,2,1,1];
-}
-def: InstRW<[SKXWriteResGroup245], (instrs VGATHERDPSZrm,
- VPGATHERDDZrm)>;
-
def SKXWriteResGroup247 : SchedWriteRes<[SKXPort5,SKXPort6,SKXPort23,SKXPort06,SKXPort0156]> {
let Latency = 35;
let NumMicroOps = 23;
@@ -2461,7 +2449,7 @@ def: InstRW<[SKXWriteResGroup267], (instrs PAUSE)>;
def: InstRW<[WriteZero], (instrs CLC)>;
-// Intruction variants handled by the renamer. These might not need execution
+// Instruction variants handled by the renamer. These might not need execution
// ports in certain conditions.
// See Agner's Fog "The microarchitecture of Intel, AMD and VIA CPUs",
// section "Skylake Pipeline" > "Register allocation and renaming".
diff --git a/llvm/lib/Target/X86/X86Schedule.td b/llvm/lib/Target/X86/X86Schedule.td
index 95f710061aeb..f204d6622119 100644
--- a/llvm/lib/Target/X86/X86Schedule.td
+++ b/llvm/lib/Target/X86/X86Schedule.td
@@ -250,7 +250,8 @@ defm WriteFCmp64 : X86SchedWritePair<ReadAfterVecLd>; // Floating point double
defm WriteFCmp64X : X86SchedWritePair<ReadAfterVecXLd>; // Floating point double compare (XMM).
defm WriteFCmp64Y : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (YMM).
defm WriteFCmp64Z : X86SchedWritePair<ReadAfterVecYLd>; // Floating point double compare (ZMM).
-defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags.
+defm WriteFCom : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (X87).
+defm WriteFComX : X86SchedWritePair<ReadAfterVecLd>; // Floating point compare to flags (SSE).
defm WriteFMul : X86SchedWritePair<ReadAfterVecLd>; // Floating point multiplication.
defm WriteFMulX : X86SchedWritePair<ReadAfterVecXLd>; // Floating point multiplication (XMM).
defm WriteFMulY : X86SchedWritePair<ReadAfterVecYLd>; // Floating point multiplication (YMM).
@@ -340,8 +341,10 @@ def WriteVecStoreX : SchedWrite;
def WriteVecStoreY : SchedWrite;
def WriteVecStoreNT : SchedWrite;
def WriteVecStoreNTY : SchedWrite;
-def WriteVecMaskedStore : SchedWrite;
-def WriteVecMaskedStoreY : SchedWrite;
+def WriteVecMaskedStore32 : SchedWrite;
+def WriteVecMaskedStore64 : SchedWrite;
+def WriteVecMaskedStore32Y : SchedWrite;
+def WriteVecMaskedStore64Y : SchedWrite;
def WriteVecMove : SchedWrite;
def WriteVecMoveX : SchedWrite;
def WriteVecMoveY : SchedWrite;
@@ -549,6 +552,14 @@ def WriteFMaskMove32Y
: X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore32Y>;
def WriteFMaskMove64Y
: X86SchedWriteMaskMove<WriteFMaskedLoadY, WriteFMaskedStore64Y>;
+def WriteVecMaskMove32
+ : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore32>;
+def WriteVecMaskMove64
+ : X86SchedWriteMaskMove<WriteVecMaskedLoad, WriteVecMaskedStore64>;
+def WriteVecMaskMove32Y
+ : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore32Y>;
+def WriteVecMaskMove64Y
+ : X86SchedWriteMaskMove<WriteVecMaskedLoadY, WriteVecMaskedStore64Y>;
// Vector width wrappers.
def SchedWriteFAdd
diff --git a/llvm/lib/Target/X86/X86ScheduleAtom.td b/llvm/lib/Target/X86/X86ScheduleAtom.td
index b0153ca9da36..b90baf6c16b1 100644
--- a/llvm/lib/Target/X86/X86ScheduleAtom.td
+++ b/llvm/lib/Target/X86/X86ScheduleAtom.td
@@ -244,6 +244,7 @@ defm : AtomWriteResPair<WriteFCmp64X, [AtomPort01], [AtomPort01], 6, 7,
defm : X86WriteResPairUnsupported<WriteFCmp64Y>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : AtomWriteResPair<WriteFCom, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
+defm : AtomWriteResPair<WriteFComX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : AtomWriteResPair<WriteFMul, [AtomPort0], [AtomPort0], 4, 4, [4], [4]>;
defm : AtomWriteResPair<WriteFMulX, [AtomPort0], [AtomPort0], 5, 5, [5], [5]>;
defm : X86WriteResPairUnsupported<WriteFMulY>;
@@ -368,8 +369,10 @@ def : WriteRes<WriteVecStoreX, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteVecStoreY>;
def : WriteRes<WriteVecStoreNT, [AtomPort0]>;
defm : X86WriteResUnsupported<WriteVecStoreNTY>;
-def : WriteRes<WriteVecMaskedStore, [AtomPort0]>;
-defm : X86WriteResUnsupported<WriteVecMaskedStoreY>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
def : WriteRes<WriteVecMove, [AtomPort0]>;
def : WriteRes<WriteVecMoveX, [AtomPort01]>;
diff --git a/llvm/lib/Target/X86/X86ScheduleBdVer2.td b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
index d7aea3cf4e9d..0a201bc74a48 100644
--- a/llvm/lib/Target/X86/X86ScheduleBdVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBdVer2.td
@@ -545,8 +545,40 @@ def PdWriteBTSRm : SchedWriteRes<[PdEX01, PdLoad]> {
def : SchedAlias<WriteBitTestSetRegRMW, PdWriteBTSRm>;
// This is for simple LEAs with one or two input operands.
-// FIXME: SAGU 3-operand LEA
-def : WriteRes<WriteLEA, [PdEX01]> { let NumMicroOps = 2; }
+def : WriteRes<WriteLEA, [PdEX01]> { let ResourceCycles = [2]; }
+
+// This write is used for slow LEA instructions.
+def PdWrite3OpsLEA : SchedWriteRes<[PdEX01]> {
+ let Latency = 2;
+ let ResourceCycles = [2];
+}
+
+// On Piledriver, a slow LEA is either a 3Ops LEA (base, index, offset),
+// or an LEA with a `Scale` value different than 1.
+def PdSlowLEAPredicate : MCSchedPredicate<
+ CheckAny<[
+ // A 3-operand LEA (base, index, offset).
+ IsThreeOperandsLEAFn,
+ // An LEA with a "Scale" different than 1.
+ CheckAll<[
+ CheckIsImmOperand<2>,
+ CheckNot<CheckImmOperand<2, 1>>
+ ]>
+ ]>
+>;
+
+def PdWriteLEA : SchedWriteVariant<[
+ SchedVar<PdSlowLEAPredicate, [PdWrite3OpsLEA]>,
+ SchedVar<NoSchedPred, [WriteLEA]>
+]>;
+
+def : InstRW<[PdWriteLEA], (instrs LEA32r, LEA64r, LEA64_32r)>;
+
+def PdWriteLEA16r : SchedWriteRes<[PdEX01]> {
+ let ResourceCycles = [3];
+ let NumMicroOps = 2;
+}
+def : InstRW<[PdWriteLEA16r], (instrs LEA16r)>;
// Bit counts.
defm : PdWriteResExPair<WriteBSF, [PdEX01], 3, [6], 6, 2>;
@@ -766,6 +798,7 @@ defm : PdWriteResYMMPair<WriteFCmp64Y, [PdFPU0, PdFPFMA], 2, [1, 2]>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : PdWriteResXMMPair<WriteFCom, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
+defm : PdWriteResXMMPair<WriteFComX, [PdFPU0, PdFPFMA, PdEX0], 1, [], 2>;
def PdWriteFCOMPm : SchedWriteRes<[PdFPU1, PdFPFMA]> {
let Latency = 6;
@@ -1060,8 +1093,10 @@ def : InstRW<[PdWriteVMOVDQUYmr], (instrs VMOVDQUYmr)>;
defm : PdWriteRes<WriteVecStoreNT, [PdStore, PdFPU1, PdFPSTO], 2>;
defm : PdWriteRes<WriteVecStoreNTY, [PdStore, PdFPU1, PdFPSTO], 2, [2, 2, 2], 4>;
-defm : PdWriteRes<WriteVecMaskedStore, [PdStore, PdFPU01, PdFPMAL], 6, [1, 1, 4]>;
-defm : PdWriteRes<WriteVecMaskedStoreY, [PdStore, PdFPU01, PdFPMAL], 6, [2, 2, 4], 2>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
defm : PdWriteRes<WriteVecMove, [PdFPU01, PdFPMAL], 2>;
defm : PdWriteRes<WriteVecMoveX, [PdFPU01, PdFPMAL], 1, [1, 2]>;
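A quick illustration of the Piledriver slow-LEA split added earlier in this file (PdSlowLEAPredicate / PdWrite3OpsLEA). This is an editor's sketch rather than part of the patch: the inline-asm wrappers and register choices are arbitrary, and only the classification itself (one or two input operands versus three operands or a scale other than 1) comes from the comments in the hunk above.

// lea_sketch.cpp -- hypothetical example of the two LEA classes.
#include <cstdint>

int64_t fast_lea(int64_t Base) {
  int64_t Out;
  // Base + displacement, scale 1: a "simple" LEA, still scheduled as WriteLEA.
  asm("leaq 16(%1), %0" : "=r"(Out) : "r"(Base));
  return Out;
}

int64_t slow_lea(int64_t Base, int64_t Index) {
  int64_t Out;
  // Base + scaled index + displacement: three operands and a scale of 4,
  // so PdSlowLEAPredicate matches and the LEA maps to PdWrite3OpsLEA.
  asm("leaq 16(%1,%2,4), %0" : "=r"(Out) : "r"(Base), "r"(Index));
  return Out;
}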
diff --git a/llvm/lib/Target/X86/X86ScheduleBtVer2.td b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
index d0421d94ee05..13b6eed5126d 100644
--- a/llvm/lib/Target/X86/X86ScheduleBtVer2.td
+++ b/llvm/lib/Target/X86/X86ScheduleBtVer2.td
@@ -541,6 +541,7 @@ defm : JWriteResFpuPair<WriteFCmp64X, [JFPU0, JFPA], 2>;
defm : JWriteResYMMPair<WriteFCmp64Y, [JFPU0, JFPA], 2, [2,2], 2>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : JWriteResFpuPair<WriteFCom, [JFPU0, JFPA, JALU0], 3>;
+defm : JWriteResFpuPair<WriteFComX, [JFPU0, JFPA, JALU0], 3>;
defm : JWriteResFpuPair<WriteFMul, [JFPU1, JFPM], 2>;
defm : JWriteResFpuPair<WriteFMulX, [JFPU1, JFPM], 2>;
defm : JWriteResYMMPair<WriteFMulY, [JFPU1, JFPM], 2, [2,2], 2>;
@@ -669,8 +670,10 @@ defm : X86WriteRes<WriteVecStoreX, [JSAGU, JFPU1, JSTC], 1, [1, 1, 1],
defm : X86WriteRes<WriteVecStoreY, [JSAGU, JFPU1, JSTC], 1, [2, 2, 2], 2>;
defm : X86WriteRes<WriteVecStoreNT, [JSAGU, JFPU1, JSTC], 2, [1, 1, 1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [JSAGU, JFPU1, JSTC], 2, [2, 2, 2], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [JSAGU, JFPU01, JVALU], 6, [1, 1, 4], 1>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [JSAGU, JFPU01, JVALU], 6, [2, 2, 4], 2>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore32Y>;
+defm : X86WriteResUnsupported<WriteVecMaskedStore64Y>;
defm : X86WriteRes<WriteVecMove, [JFPU01, JVALU], 1, [1, 1], 1>;
defm : X86WriteRes<WriteVecMoveX, [JFPU01, JVALU], 1, [1, 1], 1>;
diff --git a/llvm/lib/Target/X86/X86ScheduleSLM.td b/llvm/lib/Target/X86/X86ScheduleSLM.td
index dcd155ea0e0e..3d53ef104ed6 100644
--- a/llvm/lib/Target/X86/X86ScheduleSLM.td
+++ b/llvm/lib/Target/X86/X86ScheduleSLM.td
@@ -214,6 +214,7 @@ defm : SLMWriteResPair<WriteFCmp64X, [SLM_FPC_RSV1], 3>;
defm : SLMWriteResPair<WriteFCmp64Y, [SLM_FPC_RSV1], 3>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : SLMWriteResPair<WriteFCom, [SLM_FPC_RSV1], 3>;
+defm : SLMWriteResPair<WriteFComX, [SLM_FPC_RSV1], 3>;
defm : SLMWriteResPair<WriteFMul, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
defm : SLMWriteResPair<WriteFMulX, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
defm : SLMWriteResPair<WriteFMulY, [SLM_FPC_RSV0, SLMFPMultiplier], 5, [1,2]>;
@@ -310,8 +311,10 @@ def : WriteRes<WriteVecStoreX, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecStoreY, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecStoreNT, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecStoreNTY, [SLM_MEC_RSV]>;
-def : WriteRes<WriteVecMaskedStore, [SLM_MEC_RSV]>;
-def : WriteRes<WriteVecMaskedStoreY, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore32, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore32Y, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore64, [SLM_MEC_RSV]>;
+def : WriteRes<WriteVecMaskedStore64Y, [SLM_MEC_RSV]>;
def : WriteRes<WriteVecMove, [SLM_FPC_RSV01]>;
def : WriteRes<WriteVecMoveX, [SLM_FPC_RSV01]>;
def : WriteRes<WriteVecMoveY, [SLM_FPC_RSV01]>;
@@ -390,44 +393,15 @@ defm : X86WriteResPairUnsupported<WritePHAddZ>;
// String instructions.
// Packed Compare Implicit Length Strings, Return Mask
-def : WriteRes<WritePCmpIStrM, [SLM_FPC_RSV0]> {
- let Latency = 13;
- let ResourceCycles = [13];
-}
-def : WriteRes<WritePCmpIStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 13;
- let ResourceCycles = [13, 1];
-}
+defm : SLMWriteResPair<WritePCmpIStrM, [SLM_FPC_RSV0], 13, [13]>;
// Packed Compare Explicit Length Strings, Return Mask
-def : WriteRes<WritePCmpEStrM, [SLM_FPC_RSV0]> {
- let Latency = 17;
- let ResourceCycles = [17];
-}
-def : WriteRes<WritePCmpEStrMLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 17;
- let ResourceCycles = [17, 1];
-}
-
+defm : SLMWriteResPair<WritePCmpEStrM, [SLM_FPC_RSV0], 17, [17]>;
// Packed Compare Implicit Length Strings, Return Index
-def : WriteRes<WritePCmpIStrI, [SLM_FPC_RSV0]> {
- let Latency = 17;
- let ResourceCycles = [17];
-}
-def : WriteRes<WritePCmpIStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 17;
- let ResourceCycles = [17, 1];
-}
+defm : SLMWriteResPair<WritePCmpIStrI, [SLM_FPC_RSV0], 17, [17]>;
// Packed Compare Explicit Length Strings, Return Index
-def : WriteRes<WritePCmpEStrI, [SLM_FPC_RSV0]> {
- let Latency = 21;
- let ResourceCycles = [21];
-}
-def : WriteRes<WritePCmpEStrILd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 21;
- let ResourceCycles = [21, 1];
-}
+defm : SLMWriteResPair<WritePCmpEStrI, [SLM_FPC_RSV0], 21, [21]>;
// MOVMSK Instructions.
def : WriteRes<WriteFMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
@@ -436,42 +410,12 @@ def : WriteRes<WriteVecMOVMSKY, [SLM_FPC_RSV1]> { let Latency = 4; }
def : WriteRes<WriteMMXMOVMSK, [SLM_FPC_RSV1]> { let Latency = 4; }
// AES Instructions.
-def : WriteRes<WriteAESDecEnc, [SLM_FPC_RSV0]> {
- let Latency = 8;
- let ResourceCycles = [5];
-}
-def : WriteRes<WriteAESDecEncLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 8;
- let ResourceCycles = [5, 1];
-}
-
-def : WriteRes<WriteAESIMC, [SLM_FPC_RSV0]> {
- let Latency = 8;
- let ResourceCycles = [5];
-}
-def : WriteRes<WriteAESIMCLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 8;
- let ResourceCycles = [5, 1];
-}
-
-def : WriteRes<WriteAESKeyGen, [SLM_FPC_RSV0]> {
- let Latency = 8;
- let ResourceCycles = [5];
-}
-def : WriteRes<WriteAESKeyGenLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 8;
- let ResourceCycles = [5, 1];
-}
+defm : SLMWriteResPair<WriteAESDecEnc, [SLM_FPC_RSV0], 8, [5]>;
+defm : SLMWriteResPair<WriteAESIMC, [SLM_FPC_RSV0], 8, [5]>;
+defm : SLMWriteResPair<WriteAESKeyGen, [SLM_FPC_RSV0], 8, [5]>;
// Carry-less multiplication instructions.
-def : WriteRes<WriteCLMul, [SLM_FPC_RSV0]> {
- let Latency = 10;
- let ResourceCycles = [10];
-}
-def : WriteRes<WriteCLMulLd, [SLM_FPC_RSV0, SLM_MEC_RSV]> {
- let Latency = 10;
- let ResourceCycles = [10, 1];
-}
+defm : SLMWriteResPair<WriteCLMul, [SLM_FPC_RSV0], 10, [10]>;
def : WriteRes<WriteSystem, [SLM_FPC_RSV0]> { let Latency = 100; }
def : WriteRes<WriteMicrocoded, [SLM_FPC_RSV0]> { let Latency = 100; }
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver1.td b/llvm/lib/Target/X86/X86ScheduleZnver1.td
index 06201f4a3a84..fe09d6f85221 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver1.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver1.td
@@ -295,6 +295,7 @@ defm : ZnWriteResFpuPair<WriteFCmp64X, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WriteFCmp64Y, [ZnFPU0], 3>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : ZnWriteResFpuPair<WriteFCom, [ZnFPU0], 3>;
+defm : ZnWriteResFpuPair<WriteFComX, [ZnFPU0], 3>;
defm : ZnWriteResFpuPair<WriteFBlend, [ZnFPU01], 1>;
defm : ZnWriteResFpuPair<WriteFBlendY, [ZnFPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
@@ -387,8 +388,10 @@ defm : X86WriteRes<WriteVecStoreX, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreY, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNT, [ZnAGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [ZnAGU], 1, [1], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [ZnAGU,ZnFPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [ZnAGU,ZnFPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteVecMove, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [ZnFPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [ZnFPU], 2, [1], 2>;
diff --git a/llvm/lib/Target/X86/X86ScheduleZnver2.td b/llvm/lib/Target/X86/X86ScheduleZnver2.td
index 4537d9cc7956..48da0d6329b1 100644
--- a/llvm/lib/Target/X86/X86ScheduleZnver2.td
+++ b/llvm/lib/Target/X86/X86ScheduleZnver2.td
@@ -187,7 +187,7 @@ defm : Zn2WriteResPair<WriteIMul8, [Zn2ALU1, Zn2Multiplier], 4>;
defm : X86WriteRes<WriteBSWAP32, [Zn2ALU], 1, [4], 1>;
defm : X86WriteRes<WriteBSWAP64, [Zn2ALU], 1, [4], 1>;
-defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 1, [1], 1>;
+defm : X86WriteRes<WriteCMPXCHG, [Zn2ALU], 3, [1], 1>;
defm : X86WriteRes<WriteCMPXCHGRMW,[Zn2ALU,Zn2AGU], 8, [1,1], 5>;
defm : X86WriteRes<WriteXCHG, [Zn2ALU], 1, [2], 2>;
@@ -216,7 +216,7 @@ defm : X86WriteRes<WriteBitTestSet, [Zn2ALU], 2, [1], 2>;
// Bit counts.
defm : Zn2WriteResPair<WriteBSF, [Zn2ALU], 3>;
-defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 3>;
+defm : Zn2WriteResPair<WriteBSR, [Zn2ALU], 4>;
defm : Zn2WriteResPair<WriteLZCNT, [Zn2ALU], 1>;
defm : Zn2WriteResPair<WriteTZCNT, [Zn2ALU], 2>;
defm : Zn2WriteResPair<WritePOPCNT, [Zn2ALU], 1>;
@@ -272,15 +272,16 @@ defm : Zn2WriteResFpuPair<WriteFAdd64, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFAdd64X, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFAdd64Y, [Zn2FPU0], 3>;
defm : X86WriteResPairUnsupported<WriteFAdd64Z>;
-defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmp, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpX, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmpY, [Zn2FPU0], 1>;
defm : X86WriteResPairUnsupported<WriteFCmpZ>;
-defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 3>;
-defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFCmp64, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64X, [Zn2FPU0], 1>;
+defm : Zn2WriteResFpuPair<WriteFCmp64Y, [Zn2FPU0], 1>;
defm : X86WriteResPairUnsupported<WriteFCmp64Z>;
defm : Zn2WriteResFpuPair<WriteFCom, [Zn2FPU0], 3>;
+defm : Zn2WriteResFpuPair<WriteFComX, [Zn2FPU0], 3>;
defm : Zn2WriteResFpuPair<WriteFBlend, [Zn2FPU01], 1>;
defm : Zn2WriteResFpuPair<WriteFBlendY, [Zn2FPU01], 1>;
defm : X86WriteResPairUnsupported<WriteFBlendZ>;
@@ -313,8 +314,8 @@ defm : Zn2WriteResFpuPair<WriteFDiv64, [Zn2FPU3], 15>;
defm : Zn2WriteResFpuPair<WriteFDiv64X, [Zn2FPU3], 15>;
defm : X86WriteResPairUnsupported<WriteFDiv64Z>;
defm : Zn2WriteResFpuPair<WriteFSign, [Zn2FPU3], 2>;
-defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 4, [1], 1, 7, 0>;
-defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 4, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRnd, [Zn2FPU3], 3, [1], 1, 7, 0>;
+defm : Zn2WriteResFpuPair<WriteFRndY, [Zn2FPU3], 3, [1], 1, 7, 0>;
defm : X86WriteResPairUnsupported<WriteFRndZ>;
defm : Zn2WriteResFpuPair<WriteFLogic, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteFLogicY, [Zn2FPU], 1>;
@@ -325,16 +326,16 @@ defm : X86WriteResPairUnsupported<WriteFTestZ>;
defm : Zn2WriteResFpuPair<WriteFShuffle, [Zn2FPU12], 1>;
defm : Zn2WriteResFpuPair<WriteFShuffleY, [Zn2FPU12], 1>;
defm : X86WriteResPairUnsupported<WriteFShuffleZ>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 1>;
-defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffle, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteFVarShuffleY,[Zn2FPU12], 3>;
defm : X86WriteResPairUnsupported<WriteFVarShuffleZ>;
defm : Zn2WriteResFpuPair<WriteFMul, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : Zn2WriteResFpuPair<WriteFMulX, [Zn2FPU01], 3, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMulY, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WriteFMulZ>;
defm : Zn2WriteResFpuPair<WriteFMul64, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : Zn2WriteResFpuPair<WriteFMul64X, [Zn2FPU01], 3, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 4, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WriteFMul64Y, [Zn2FPU01], 3, [1], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WriteFMul64Z>;
defm : Zn2WriteResFpuPair<WriteFMA, [Zn2FPU03], 5>;
defm : Zn2WriteResFpuPair<WriteFMAX, [Zn2FPU03], 5>;
@@ -369,8 +370,10 @@ defm : X86WriteRes<WriteVecStoreX, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreY, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNT, [Zn2AGU], 1, [1], 1>;
defm : X86WriteRes<WriteVecStoreNTY, [Zn2AGU], 1, [1], 1>;
-defm : X86WriteRes<WriteVecMaskedStore, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
-defm : X86WriteRes<WriteVecMaskedStoreY, [Zn2AGU,Zn2FPU01], 5, [1,1], 2>;
+defm : X86WriteRes<WriteVecMaskedStore32, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore32Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
+defm : X86WriteRes<WriteVecMaskedStore64, [Zn2AGU,Zn2FPU01], 4, [1,1], 1>;
+defm : X86WriteRes<WriteVecMaskedStore64Y, [Zn2AGU,Zn2FPU01], 5, [1,2], 2>;
defm : X86WriteRes<WriteVecMove, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveX, [Zn2FPU], 1, [1], 1>;
defm : X86WriteRes<WriteVecMoveY, [Zn2FPU], 2, [1], 2>;
@@ -380,7 +383,7 @@ defm : X86WriteRes<WriteEMMS, [Zn2FPU], 2, [1], 1>;
defm : Zn2WriteResFpuPair<WriteVecShift, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecShiftX, [Zn2FPU2], 1>;
-defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 2>;
+defm : Zn2WriteResFpuPair<WriteVecShiftY, [Zn2FPU2], 1>;
defm : X86WriteResPairUnsupported<WriteVecShiftZ>;
defm : Zn2WriteResFpuPair<WriteVecShiftImm, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteVecShiftImmX, [Zn2FPU], 1>;
@@ -402,7 +405,7 @@ defm : Zn2WriteResFpuPair<WriteVecIMulX, [Zn2FPU0], 4>;
defm : Zn2WriteResFpuPair<WriteVecIMulY, [Zn2FPU0], 4>;
defm : X86WriteResPairUnsupported<WriteVecIMulZ>;
defm : Zn2WriteResFpuPair<WritePMULLD, [Zn2FPU0], 4, [1], 1, 7, 1>;
-defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 3, [1], 1, 7, 1>;
+defm : Zn2WriteResFpuPair<WritePMULLDY, [Zn2FPU0], 4, [1], 1, 7, 1>;
defm : X86WriteResPairUnsupported<WritePMULLDZ>;
defm : Zn2WriteResFpuPair<WriteShuffle, [Zn2FPU], 1>;
defm : Zn2WriteResFpuPair<WriteShuffleX, [Zn2FPU], 1>;
@@ -424,8 +427,8 @@ defm : X86WriteResPairUnsupported<WritePSADBWZ>;
defm : Zn2WriteResFpuPair<WritePHMINPOS, [Zn2FPU0], 4>;
// Vector Shift Operations
-defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 1>;
-defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 1>;
+defm : Zn2WriteResFpuPair<WriteVarVecShift, [Zn2FPU12], 3>;
+defm : Zn2WriteResFpuPair<WriteVarVecShiftY, [Zn2FPU12], 3>;
defm : X86WriteResPairUnsupported<WriteVarVecShiftZ>;
// Vector insert/extract operations.
@@ -469,6 +472,12 @@ defm : Zn2WriteResFpuPair<WriteFVarShuffle256, [Zn2FPU], 100>;
def Zn2WriteMicrocoded : SchedWriteRes<[]> {
let Latency = 100;
}
+defm : Zn2WriteResPair<WriteDPPS, [], 15>;
+defm : Zn2WriteResPair<WriteFHAdd, [], 7>;
+defm : Zn2WriteResPair<WriteFHAddY, [], 7>;
+defm : Zn2WriteResPair<WritePHAdd, [], 3>;
+defm : Zn2WriteResPair<WritePHAddX, [], 3>;
+defm : Zn2WriteResPair<WritePHAddY, [], 3>;
def : SchedAlias<WriteMicrocoded, Zn2WriteMicrocoded>;
def : SchedAlias<WriteFCMOV, Zn2WriteMicrocoded>;
@@ -517,14 +526,14 @@ def Zn2WriteXCHG : SchedWriteRes<[Zn2ALU]> {
let NumMicroOps = 2;
}
-def : InstRW<[Zn2WriteXCHG], (instregex "XCHG(8|16|32|64)rr", "XCHG(16|32|64)ar")>;
+def : InstRW<[Zn2WriteXCHG], (instregex "^XCHG(8|16|32|64)rr", "^XCHG(16|32|64)ar")>;
// r,m.
def Zn2WriteXCHGrm : SchedWriteRes<[Zn2AGU, Zn2ALU]> {
let Latency = 5;
let NumMicroOps = 2;
}
-def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "XCHG(8|16|32|64)rm")>;
+def : InstRW<[Zn2WriteXCHGrm, ReadAfterLd], (instregex "^XCHG(8|16|32|64)rm")>;
def : InstRW<[WriteMicrocoded], (instrs XLAT)>;
@@ -594,8 +603,11 @@ def : InstRW<[WriteALULd],
def Zn2WriteMul16 : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
let Latency = 3;
}
+def Zn2WriteMul16Imm : SchedWriteRes<[Zn2ALU1, Zn2Multiplier]> {
+ let Latency = 4;
+}
def : SchedAlias<WriteIMul16, Zn2WriteMul16>;
-def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16>;
+def : SchedAlias<WriteIMul16Imm, Zn2WriteMul16Imm>;
def : SchedAlias<WriteIMul16Reg, Zn2WriteMul16>;
// m16.
@@ -1001,6 +1013,7 @@ def : InstRW<[WriteMicrocoded], (instrs FNINIT)>;
// mm <- mm.
def Zn2WriteFPU12 : SchedWriteRes<[Zn2FPU12]> ;
def Zn2WriteFPU12Y : SchedWriteRes<[Zn2FPU12]> {
+ let Latency = 4;
let NumMicroOps = 2;
}
def Zn2WriteFPU12m : SchedWriteRes<[Zn2AGU, Zn2FPU12]> ;
@@ -1109,15 +1122,6 @@ def : InstRW<[WriteMicrocoded], (instregex "VPGATHER(Q|D)(Q|D)(Y?)rm")>;
//-- Arithmetic instructions --//
-// HADD, HSUB PS/PD
-// PHADD|PHSUB (S) W/D.
-def : SchedAlias<WritePHAdd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddX, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddXLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddY, Zn2WriteMicrocoded>;
-def : SchedAlias<WritePHAddYLd, Zn2WriteMicrocoded>;
-
// PCMPGTQ.
def Zn2WritePCMPGTQr : SchedWriteRes<[Zn2FPU03]>;
def : InstRW<[Zn2WritePCMPGTQr], (instregex "(V?)PCMPGTQ(Y?)rr")>;
@@ -1137,8 +1141,12 @@ def : InstRW<[Zn2WritePCMPGTQYm], (instrs VPCMPGTQYrm)>;
// PSLL,PSRL,PSRA W/D/Q.
// x,x / v,v,x.
-def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> ;
-def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> ;
+def Zn2WritePShift : SchedWriteRes<[Zn2FPU2]> {
+ let Latency = 3;
+}
+def Zn2WritePShiftY : SchedWriteRes<[Zn2FPU2]> {
+ let Latency = 3;
+}
// PSLL,PSRL DQ.
def : InstRW<[Zn2WritePShift], (instregex "(V?)PS(R|L)LDQri")>;
@@ -1280,7 +1288,7 @@ def Zn2WriteCVTDQ2PDr: SchedWriteRes<[Zn2FPU12,Zn2FPU3]> {
}
// CVTDQ2PD.
// x,x.
-def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2PDrr")>;
+def : InstRW<[Zn2WriteCVTDQ2PDr], (instregex "(V)?CVTDQ2P(D|S)rr")>;
// Same as xmm
// y,x.
@@ -1290,9 +1298,9 @@ def : InstRW<[Zn2WriteCVTDQ2PDr], (instrs VCVTDQ2PSYrr)>;
def Zn2WriteCVTPD2DQr: SchedWriteRes<[Zn2FPU12, Zn2FPU3]> {
let Latency = 3;
}
-// CVT(T)PD2DQ.
+// CVT(T)P(D|S)2DQ.
// x,x.
-def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)PD2DQrr")>;
+def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)P(D|S)2DQrr")>;
def Zn2WriteCVTPD2DQLd: SchedWriteRes<[Zn2AGU,Zn2FPU12,Zn2FPU3]> {
let Latency = 10;
@@ -1322,7 +1330,7 @@ def : InstRW<[Zn2WriteCVTPS2PDr], (instrs MMX_CVTPI2PDirr)>;
def : InstRW<[Zn2WriteCVTPS2PIr], (instregex "MMX_CVT(T?)PD2PIirr")>;
def Zn2WriteCVSTSI2SSr: SchedWriteRes<[Zn2FPU3]> {
- let Latency = 4;
+ let Latency = 3;
}
// same as CVTPD2DQr
@@ -1334,7 +1342,7 @@ def : InstRW<[Zn2WriteCVTPD2DQr], (instregex "(V?)CVT(T?)SS2SI(64)?rr")>;
def : InstRW<[Zn2WriteCVTPD2DQLd], (instregex "(V?)CVT(T?)SS2SI(64)?rm")>;
def Zn2WriteCVSTSI2SDr: SchedWriteRes<[Zn2FPU013, Zn2FPU3]> {
- let Latency = 4;
+ let Latency = 3;
}
// CVTSI2SD.
// x,r32/64.
@@ -1376,7 +1384,7 @@ defm : X86WriteResUnsupported<WriteCvtPH2PSZLd>;
//-- SSE4A instructions --//
// EXTRQ
def Zn2WriteEXTRQ: SchedWriteRes<[Zn2FPU12, Zn2FPU2]> {
- let Latency = 2;
+ let Latency = 3;
}
def : InstRW<[Zn2WriteEXTRQ], (instregex "EXTRQ")>;
@@ -1448,12 +1456,6 @@ def : InstRW<[Zn2WriteSHA256RNDS2Ld], (instregex "SHA256RNDS2rm")>;
//-- Arithmetic instructions --//
-// HADD, HSUB PS/PD
-def : SchedAlias<WriteFHAdd, Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddLd, Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddY, Zn2WriteMicrocoded>;
-def : SchedAlias<WriteFHAddYLd, Zn2WriteMicrocoded>;
-
// VDIVPS.
// TODO - convert to Zn2WriteResFpuPair
// y,y,y.
@@ -1490,11 +1492,9 @@ def : SchedAlias<WriteFDiv64YLd, Zn2WriteVDIVPDYLd>;
// DPPS.
// x,x,i / v,v,v,i.
-def : SchedAlias<WriteDPPS, Zn2WriteMicrocoded>;
def : SchedAlias<WriteDPPSY, Zn2WriteMicrocoded>;
// x,m,i / v,v,m,i.
-def : SchedAlias<WriteDPPSLd, Zn2WriteMicrocoded>;
def : SchedAlias<WriteDPPSYLd,Zn2WriteMicrocoded>;
// DPPD.
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
index 1ae8df977f83..ce8d1d464da9 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.cpp
@@ -15,6 +15,7 @@
#include "X86InstrInfo.h"
#include "X86RegisterInfo.h"
#include "X86Subtarget.h"
+#include "llvm/CodeGen/MachineFrameInfo.h"
#include "llvm/CodeGen/SelectionDAG.h"
#include "llvm/CodeGen/TargetLowering.h"
#include "llvm/IR/DerivedTypes.h"
@@ -45,7 +46,7 @@ bool X86SelectionDAGInfo::isBaseRegConflictPossible(
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Val,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment, bool isVolatile,
MachinePointerInfo DstPtrInfo) const {
ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size);
const X86Subtarget &Subtarget =
@@ -65,7 +66,7 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
// If not DWORD aligned or size is more than the threshold, call the library.
// The libc version is likely to be faster for these cases. It can use the
// address value and run time information about the CPU.
- if ((Align & 3) != 0 || !ConstantSize ||
+ if (Alignment < Align(4) || !ConstantSize ||
ConstantSize->getZExtValue() > Subtarget.getMaxInlineSizeThreshold()) {
// Check to see if there is a specialized entry-point for memory zeroing.
ConstantSDNode *ValC = dyn_cast<ConstantSDNode>(Val);
@@ -111,28 +112,27 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
uint64_t Val = ValC->getZExtValue() & 255;
// If the value is a constant, then we can potentially use larger sets.
- switch (Align & 3) {
- case 2: // WORD aligned
- AVT = MVT::i16;
- ValReg = X86::AX;
- Val = (Val << 8) | Val;
- break;
- case 0: // DWORD aligned
+ if (Alignment > Align(2)) {
+ // DWORD aligned
AVT = MVT::i32;
ValReg = X86::EAX;
Val = (Val << 8) | Val;
Val = (Val << 16) | Val;
- if (Subtarget.is64Bit() && ((Align & 0x7) == 0)) { // QWORD aligned
+ if (Subtarget.is64Bit() && Alignment > Align(8)) { // QWORD aligned
AVT = MVT::i64;
ValReg = X86::RAX;
Val = (Val << 32) | Val;
}
- break;
- default: // Byte aligned
+ } else if (Alignment == Align(2)) {
+ // WORD aligned
+ AVT = MVT::i16;
+ ValReg = X86::AX;
+ Val = (Val << 8) | Val;
+ } else {
+ // Byte aligned
AVT = MVT::i8;
ValReg = X86::AL;
Count = DAG.getIntPtrConstant(SizeVal, dl);
- break;
}
if (AVT.bitsGT(MVT::i8)) {
@@ -169,13 +169,12 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemset(
EVT AddrVT = Dst.getValueType();
EVT SizeVT = Size.getValueType();
- Chain = DAG.getMemset(Chain, dl,
- DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
- DAG.getConstant(Offset, dl, AddrVT)),
- Val,
- DAG.getConstant(BytesLeft, dl, SizeVT),
- Align, isVolatile, false,
- DstPtrInfo.getWithOffset(Offset));
+ Chain =
+ DAG.getMemset(Chain, dl,
+ DAG.getNode(ISD::ADD, dl, AddrVT, Dst,
+ DAG.getConstant(Offset, dl, AddrVT)),
+ Val, DAG.getConstant(BytesLeft, dl, SizeVT), Alignment,
+ isVolatile, false, DstPtrInfo.getWithOffset(Offset));
}
// TODO: Use a Tokenfactor, as in memcpy, instead of a single chain.
@@ -283,7 +282,7 @@ static SDValue emitConstantSizeRepmov(
Chain, dl,
DAG.getNode(ISD::ADD, dl, DstVT, Dst, DAG.getConstant(Offset, dl, DstVT)),
DAG.getNode(ISD::ADD, dl, SrcVT, Src, DAG.getConstant(Offset, dl, SrcVT)),
- DAG.getConstant(BytesLeft, dl, SizeVT), Align, isVolatile,
+ DAG.getConstant(BytesLeft, dl, SizeVT), llvm::Align(Align), isVolatile,
/*AlwaysInline*/ true, /*isTailCall*/ false,
DstPtrInfo.getWithOffset(Offset), SrcPtrInfo.getWithOffset(Offset)));
return DAG.getNode(ISD::TokenFactor, dl, MVT::Other, Results);
@@ -291,7 +290,7 @@ static SDValue emitConstantSizeRepmov(
SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
// If to a segment-relative address space, use the default lowering.
if (DstPtrInfo.getAddrSpace() >= 256 || SrcPtrInfo.getAddrSpace() >= 256)
@@ -309,10 +308,10 @@ SDValue X86SelectionDAGInfo::EmitTargetCodeForMemcpy(
/// Handle constant sizes,
if (ConstantSDNode *ConstantSize = dyn_cast<ConstantSDNode>(Size))
- return emitConstantSizeRepmov(DAG, Subtarget, dl, Chain, Dst, Src,
- ConstantSize->getZExtValue(),
- Size.getValueType(), Align, isVolatile,
- AlwaysInline, DstPtrInfo, SrcPtrInfo);
+ return emitConstantSizeRepmov(
+ DAG, Subtarget, dl, Chain, Dst, Src, ConstantSize->getZExtValue(),
+ Size.getValueType(), Alignment.value(), isVolatile, AlwaysInline,
+ DstPtrInfo, SrcPtrInfo);
return SDValue();
}
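The memset hunk above replaces the old `switch (Align & 3)` with comparisons on the llvm::Align type while keeping the byte-splatting logic. A minimal sketch of just that value computation, using plain integers instead of SelectionDAG nodes (the helper name is made up; Align values are powers of two, so `> 2` means at least 4-byte aligned):

#include <cstdint>

// Splat a byte to the widest store the alignment allows, mirroring the
// comparisons in the rewritten EmitTargetCodeForMemset above.
static uint64_t splatMemsetValue(uint8_t Byte, uint64_t Alignment,
                                 bool Is64Bit, unsigned &StoreBytes) {
  uint64_t Val = Byte;
  if (Alignment > 2) {                 // DWORD-aligned path
    Val = (Val << 8) | Val;            // 0xAB     -> 0xABAB
    Val = (Val << 16) | Val;           // 0xABAB   -> 0xABABABAB
    StoreBytes = 4;
    if (Is64Bit && Alignment > 8) {    // QWORD path
      Val = (Val << 32) | Val;         // -> 0xABABABABABABABAB
      StoreBytes = 8;
    }
  } else if (Alignment == 2) {         // WORD-aligned path
    Val = (Val << 8) | Val;            // 0xAB -> 0xABAB
    StoreBytes = 2;
  } else {                             // byte-aligned path
    StoreBytes = 1;
  }
  return Val;
}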
diff --git a/llvm/lib/Target/X86/X86SelectionDAGInfo.h b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
index 0f2d979f91e3..dac62973636c 100644
--- a/llvm/lib/Target/X86/X86SelectionDAGInfo.h
+++ b/llvm/lib/Target/X86/X86SelectionDAGInfo.h
@@ -14,14 +14,9 @@
#define LLVM_LIB_TARGET_X86_X86SELECTIONDAGINFO_H
#include "llvm/CodeGen/SelectionDAGTargetInfo.h"
-#include "llvm/MC/MCRegisterInfo.h"
namespace llvm {
-class X86TargetLowering;
-class X86TargetMachine;
-class X86Subtarget;
-
class X86SelectionDAGInfo : public SelectionDAGTargetInfo {
/// Returns true if it is possible for the base register to conflict with the
/// given set of clobbers for a memory intrinsic.
@@ -33,13 +28,14 @@ public:
SDValue EmitTargetCodeForMemset(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
+ SDValue Size, Align Alignment,
+ bool isVolatile,
MachinePointerInfo DstPtrInfo) const override;
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile,
- bool AlwaysInline,
+ SDValue Size, Align Alignment,
+ bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
};
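The header change above completes the migration from `unsigned Align` parameters to the llvm::Align type used in the .cpp hunks. A short sketch of the operations the patch relies on, restricted to the forms that actually appear in the hunks (the helper names are illustrative):

#include "llvm/Support/Alignment.h"
#include <cstdint>

using llvm::Align;

// Align wraps a power-of-two byte alignment. These helpers only use the
// operations visible above: ordered comparison against a constructed Align,
// and .value() to recover the raw byte count.
static bool belowDwordAlignment(Align A) { return A < Align(4); }
static bool atLeastDwordAlignment(Align A) { return A > Align(2); }
static uint64_t rawAlignment(Align A) { return A.value(); }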
diff --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
index a202fc63637b..de528299654c 100644
--- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
+++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.cpp
@@ -11,8 +11,10 @@
//
//===----------------------------------------------------------------------===//
-#include "Utils/X86ShuffleDecode.h"
+#include "X86ShuffleDecodeConstantPool.h"
+#include "MCTargetDesc/X86ShuffleDecode.h"
#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Constants.h"
//===----------------------------------------------------------------------===//
@@ -34,17 +36,17 @@ static bool extractConstantMask(const Constant *C, unsigned MaskEltSizeInBits,
//
// <4 x i32> <i32 -2147483648, i32 -2147483648,
// i32 -2147483648, i32 -2147483648>
- Type *CstTy = C->getType();
- if (!CstTy->isVectorTy())
+ auto *CstTy = dyn_cast<FixedVectorType>(C->getType());
+ if (!CstTy)
return false;
- Type *CstEltTy = CstTy->getVectorElementType();
+ Type *CstEltTy = CstTy->getElementType();
if (!CstEltTy->isIntegerTy())
return false;
unsigned CstSizeInBits = CstTy->getPrimitiveSizeInBits();
unsigned CstEltSizeInBits = CstTy->getScalarSizeInBits();
- unsigned NumCstElts = CstTy->getVectorNumElements();
+ unsigned NumCstElts = CstTy->getNumElements();
assert((CstSizeInBits % MaskEltSizeInBits) == 0 &&
"Unaligned shuffle mask size");
@@ -185,13 +187,12 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
}
void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
- unsigned Width,
- SmallVectorImpl<int> &ShuffleMask) {
+ unsigned Width, SmallVectorImpl<int> &ShuffleMask) {
Type *MaskTy = C->getType();
unsigned MaskTySize = MaskTy->getPrimitiveSizeInBits();
(void)MaskTySize;
- assert((MaskTySize == 128 || MaskTySize == 256) &&
- Width >= MaskTySize && "Unexpected vector size.");
+ assert((MaskTySize == 128 || MaskTySize == 256) && Width >= MaskTySize &&
+ "Unexpected vector size.");
// The shuffle mask requires elements the same size as the target.
APInt UndefElts;
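The constant-pool decoder above now queries the constant's type through FixedVectorType instead of the removed Type::getVector* helpers. A minimal sketch of that query pattern (the helper name is made up; only the LLVM calls visible in the hunk are used):

#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"

using namespace llvm;

// Bail out unless C is a fixed-width integer vector, then read its shape
// directly from FixedVectorType, as extractConstantMask does above.
static bool getIntVectorShape(const Constant *C, unsigned &NumElts,
                              unsigned &EltSizeInBits) {
  auto *VecTy = dyn_cast<FixedVectorType>(C->getType());
  if (!VecTy || !VecTy->getElementType()->isIntegerTy())
    return false;
  NumElts = VecTy->getNumElements();
  EltSizeInBits = VecTy->getScalarSizeInBits();
  return true;
}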
diff --git a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
index 296341517579..51229a69a626 100644
--- a/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
+++ b/llvm/lib/Target/X86/X86ShuffleDecodeConstantPool.h
@@ -14,15 +14,13 @@
#ifndef LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
#define LLVM_LIB_TARGET_X86_X86SHUFFLEDECODECONSTANTPOOL_H
-#include "llvm/ADT/SmallVector.h"
-
//===----------------------------------------------------------------------===//
// Vector Mask Decoding
//===----------------------------------------------------------------------===//
namespace llvm {
class Constant;
-class MVT;
+template <typename T> class SmallVectorImpl;
/// Decode a PSHUFB mask from an IR-level vector constant.
void DecodePSHUFBMask(const Constant *C, unsigned Width,
@@ -33,9 +31,8 @@ void DecodeVPERMILPMask(const Constant *C, unsigned ElSize, unsigned Width,
SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPERMILP2 variable mask from an IR-level vector constant.
-void DecodeVPERMIL2PMask(const Constant *C, unsigned MatchImm, unsigned ElSize,
- unsigned Width,
- SmallVectorImpl<int> &ShuffleMask);
+void DecodeVPERMIL2PMask(const Constant *C, unsigned M2Z, unsigned ElSize,
+ unsigned Width, SmallVectorImpl<int> &ShuffleMask);
/// Decode a VPPERM variable mask from an IR-level vector constant.
void DecodeVPPERMMask(const Constant *C, unsigned Width,
diff --git a/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp b/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
new file mode 100644
index 000000000000..7e91c37367d2
--- /dev/null
+++ b/llvm/lib/Target/X86/X86SpeculativeExecutionSideEffectSuppression.cpp
@@ -0,0 +1,181 @@
+//===-- X86SpeculativeExecutionSideEffectSuppression.cpp ------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file contains the X86 implementation of the speculative execution side
+/// effect suppression mitigation.
+///
+/// This must be used with the -mlvi-cfi flag in order to mitigate indirect
+/// branches and returns.
+//===----------------------------------------------------------------------===//
+
+#include "X86.h"
+#include "X86InstrInfo.h"
+#include "X86Subtarget.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/CodeGen/MachineFunction.h"
+#include "llvm/CodeGen/MachineFunctionPass.h"
+#include "llvm/CodeGen/MachineInstrBuilder.h"
+#include "llvm/Pass.h"
+#include "llvm/Target/TargetMachine.h"
+using namespace llvm;
+
+#define DEBUG_TYPE "x86-seses"
+
+STATISTIC(NumLFENCEsInserted, "Number of lfence instructions inserted");
+
+static cl::opt<bool> EnableSpeculativeExecutionSideEffectSuppression(
+ "x86-seses-enable-without-lvi-cfi",
+ cl::desc("Force enable speculative execution side effect suppression. "
+ "(Note: User must pass -mlvi-cfi in order to mitigate indirect "
+ "branches and returns.)"),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> OneLFENCEPerBasicBlock(
+ "x86-seses-one-lfence-per-bb",
+ cl::desc(
+ "Omit all lfences other than the first to be placed in a basic block."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool> OnlyLFENCENonConst(
+ "x86-seses-only-lfence-non-const",
+ cl::desc("Only lfence before groups of terminators where at least one "
+ "branch instruction has an input to the addressing mode that is a "
+ "register other than %rip."),
+ cl::init(false), cl::Hidden);
+
+static cl::opt<bool>
+ OmitBranchLFENCEs("x86-seses-omit-branch-lfences",
+ cl::desc("Omit all lfences before branch instructions."),
+ cl::init(false), cl::Hidden);
+
+namespace {
+
+class X86SpeculativeExecutionSideEffectSuppression
+ : public MachineFunctionPass {
+public:
+ X86SpeculativeExecutionSideEffectSuppression() : MachineFunctionPass(ID) {}
+
+ static char ID;
+ StringRef getPassName() const override {
+ return "X86 Speculative Execution Side Effect Suppression";
+ }
+
+ bool runOnMachineFunction(MachineFunction &MF) override;
+};
+} // namespace
+
+char X86SpeculativeExecutionSideEffectSuppression::ID = 0;
+
+// This function returns whether the passed instruction uses a memory addressing
+// mode that is constant. We treat all memory addressing modes that read
+// from a register that is not %rip as non-constant. Note that the use
+// of the EFLAGS register results in an addressing mode being considered
+// non-constant; therefore, all JCC instructions will return false from this
+// function since one of their operands will always be the EFLAGS register.
+static bool hasConstantAddressingMode(const MachineInstr &MI) {
+ for (const MachineOperand &MO : MI.uses())
+ if (MO.isReg() && X86::RIP != MO.getReg())
+ return false;
+ return true;
+}
+
+bool X86SpeculativeExecutionSideEffectSuppression::runOnMachineFunction(
+ MachineFunction &MF) {
+
+ const auto &OptLevel = MF.getTarget().getOptLevel();
+ const X86Subtarget &Subtarget = MF.getSubtarget<X86Subtarget>();
+
+ // Check whether SESES needs to run as the fallback for LVI at O0, whether the
+ // user explicitly passed an SESES flag, or whether the SESES target feature
+ // was set.
+ if (!EnableSpeculativeExecutionSideEffectSuppression &&
+ !(Subtarget.useLVILoadHardening() && OptLevel == CodeGenOpt::None) &&
+ !Subtarget.useSpeculativeExecutionSideEffectSuppression())
+ return false;
+
+ LLVM_DEBUG(dbgs() << "********** " << getPassName() << " : " << MF.getName()
+ << " **********\n");
+ bool Modified = false;
+ const X86InstrInfo *TII = Subtarget.getInstrInfo();
+ for (MachineBasicBlock &MBB : MF) {
+ MachineInstr *FirstTerminator = nullptr;
+ // Keep track of whether the previous instruction was an LFENCE to avoid
+ // adding redundant LFENCEs.
+ bool PrevInstIsLFENCE = false;
+ for (auto &MI : MBB) {
+
+ if (MI.getOpcode() == X86::LFENCE) {
+ PrevInstIsLFENCE = true;
+ continue;
+ }
+ // We want to put an LFENCE before any instruction that
+ // may load or store. This LFENCE is intended to avoid leaking any secret
+ // data due to a given load or store. This closes the cache and memory
+ // timing side channels. We will treat terminators that load
+ // or store separately.
+ if (MI.mayLoadOrStore() && !MI.isTerminator()) {
+ if (!PrevInstIsLFENCE) {
+ BuildMI(MBB, MI, DebugLoc(), TII->get(X86::LFENCE));
+ NumLFENCEsInserted++;
+ Modified = true;
+ }
+ if (OneLFENCEPerBasicBlock)
+ break;
+ }
+ // The following section inserts an LFENCE before groups of terminators
+ // that include branches. This closes the branch prediction side channels,
+ // since the LFENCEs placed with this logic prevent code from executing
+ // after misspeculation.
+
+ // Keep track of the first terminator in a basic block: if we need to
+ // LFENCE the terminators in this basic block, the LFENCE must be added
+ // before the first terminator (as opposed to before the terminator that
+ // indicates an LFENCE is required). One reason this is necessary is that
+ // X86InstrInfo::analyzeBranch assumes all terminators are grouped
+ // together and terminates its analysis once the first non-terminator
+ // instruction is found.
+ if (MI.isTerminator() && FirstTerminator == nullptr)
+ FirstTerminator = &MI;
+
+ // Look for branch instructions that will require an LFENCE to be put
+ // before this basic block's terminators.
+ if (!MI.isBranch() || OmitBranchLFENCEs) {
+ // This isn't a branch or we're not putting LFENCEs before branches.
+ PrevInstIsLFENCE = false;
+ continue;
+ }
+
+ if (OnlyLFENCENonConst && hasConstantAddressingMode(MI)) {
+ // This is a branch, but it only has constant addressing mode and we're
+ // not adding LFENCEs before such branches.
+ PrevInstIsLFENCE = false;
+ continue;
+ }
+
+ // This branch requires adding an LFENCE.
+ if (!PrevInstIsLFENCE) {
+ BuildMI(MBB, FirstTerminator, DebugLoc(), TII->get(X86::LFENCE));
+ NumLFENCEsInserted++;
+ Modified = true;
+ }
+ break;
+ }
+ }
+
+ return Modified;
+}
+
+FunctionPass *llvm::createX86SpeculativeExecutionSideEffectSuppression() {
+ return new X86SpeculativeExecutionSideEffectSuppression();
+}
+
+INITIALIZE_PASS(X86SpeculativeExecutionSideEffectSuppression, "x86-seses",
+ "X86 Speculative Execution Side Effect Suppression", false,
+ false)
diff --git a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
index 9aa47c532e82..fe5b9a05f811 100644
--- a/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
+++ b/llvm/lib/Target/X86/X86SpeculativeLoadHardening.cpp
@@ -53,6 +53,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Target/TargetMachine.h"
#include <algorithm>
#include <cassert>
#include <iterator>
@@ -872,10 +873,10 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
case X86::FARCALL16m:
case X86::FARCALL32m:
- case X86::FARCALL64:
+ case X86::FARCALL64m:
case X86::FARJMP16m:
case X86::FARJMP32m:
- case X86::FARJMP64:
+ case X86::FARJMP64m:
// We cannot mitigate far jumps or calls, but we also don't expect them
// to be vulnerable to Spectre v1.2 style attacks.
continue;
@@ -920,6 +921,11 @@ void X86SpeculativeLoadHardeningPass::unfoldCallAndJumpLoads(
// Now stitch the new instructions into place and erase the old one.
for (auto *NewMI : NewMIs)
MBB.insert(MI.getIterator(), NewMI);
+
+ // Update the call site info.
+ if (MI.isCandidateForCallSiteEntry())
+ MF.eraseCallSiteInfo(&MI);
+
MI.eraseFromParent();
LLVM_DEBUG({
dbgs() << "Unfolded load successfully into:\n";
@@ -993,7 +999,7 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
case X86::FARJMP16m:
case X86::FARJMP32m:
- case X86::FARJMP64:
+ case X86::FARJMP64m:
// We cannot mitigate far jumps or calls, but we also don't expect them
// to be vulnerable to Spectre v1.2 or v2 (self trained) style attacks.
continue;
@@ -1195,394 +1201,13 @@ X86SpeculativeLoadHardeningPass::tracePredStateThroughIndirectBranches(
return CMovs;
}
-/// Returns true if the instruction has no behavior (specified or otherwise)
-/// that is based on the value of any of its register operands
-///
-/// A classical example of something that is inherently not data invariant is an
-/// indirect jump -- the destination is loaded into icache based on the bits set
-/// in the jump destination register.
-///
-/// FIXME: This should become part of our instruction tables.
-static bool isDataInvariant(MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- // By default, assume that the instruction is not data invariant.
- return false;
-
- // Some target-independent operations that trivially lower to data-invariant
- // instructions.
- case TargetOpcode::COPY:
- case TargetOpcode::INSERT_SUBREG:
- case TargetOpcode::SUBREG_TO_REG:
- return true;
-
- // On x86 it is believed that imul is constant time w.r.t. the loaded data.
- // However, they set flags and are perhaps the most surprisingly constant
- // time operations so we call them out here separately.
- case X86::IMUL16rr:
- case X86::IMUL16rri8:
- case X86::IMUL16rri:
- case X86::IMUL32rr:
- case X86::IMUL32rri8:
- case X86::IMUL32rri:
- case X86::IMUL64rr:
- case X86::IMUL64rri32:
- case X86::IMUL64rri8:
-
- // Bit scanning and counting instructions that are somewhat surprisingly
- // constant time as they scan across bits and do other fairly complex
- // operations like popcnt, but are believed to be constant time on x86.
- // However, these set flags.
- case X86::BSF16rr:
- case X86::BSF32rr:
- case X86::BSF64rr:
- case X86::BSR16rr:
- case X86::BSR32rr:
- case X86::BSR64rr:
- case X86::LZCNT16rr:
- case X86::LZCNT32rr:
- case X86::LZCNT64rr:
- case X86::POPCNT16rr:
- case X86::POPCNT32rr:
- case X86::POPCNT64rr:
- case X86::TZCNT16rr:
- case X86::TZCNT32rr:
- case X86::TZCNT64rr:
-
- // Bit manipulation instructions are effectively combinations of basic
- // arithmetic ops, and should still execute in constant time. These also
- // set flags.
- case X86::BLCFILL32rr:
- case X86::BLCFILL64rr:
- case X86::BLCI32rr:
- case X86::BLCI64rr:
- case X86::BLCIC32rr:
- case X86::BLCIC64rr:
- case X86::BLCMSK32rr:
- case X86::BLCMSK64rr:
- case X86::BLCS32rr:
- case X86::BLCS64rr:
- case X86::BLSFILL32rr:
- case X86::BLSFILL64rr:
- case X86::BLSI32rr:
- case X86::BLSI64rr:
- case X86::BLSIC32rr:
- case X86::BLSIC64rr:
- case X86::BLSMSK32rr:
- case X86::BLSMSK64rr:
- case X86::BLSR32rr:
- case X86::BLSR64rr:
- case X86::TZMSK32rr:
- case X86::TZMSK64rr:
-
- // Bit extracting and clearing instructions should execute in constant time,
- // and set flags.
- case X86::BEXTR32rr:
- case X86::BEXTR64rr:
- case X86::BEXTRI32ri:
- case X86::BEXTRI64ri:
- case X86::BZHI32rr:
- case X86::BZHI64rr:
-
- // Shift and rotate.
- case X86::ROL8r1: case X86::ROL16r1: case X86::ROL32r1: case X86::ROL64r1:
- case X86::ROL8rCL: case X86::ROL16rCL: case X86::ROL32rCL: case X86::ROL64rCL:
- case X86::ROL8ri: case X86::ROL16ri: case X86::ROL32ri: case X86::ROL64ri:
- case X86::ROR8r1: case X86::ROR16r1: case X86::ROR32r1: case X86::ROR64r1:
- case X86::ROR8rCL: case X86::ROR16rCL: case X86::ROR32rCL: case X86::ROR64rCL:
- case X86::ROR8ri: case X86::ROR16ri: case X86::ROR32ri: case X86::ROR64ri:
- case X86::SAR8r1: case X86::SAR16r1: case X86::SAR32r1: case X86::SAR64r1:
- case X86::SAR8rCL: case X86::SAR16rCL: case X86::SAR32rCL: case X86::SAR64rCL:
- case X86::SAR8ri: case X86::SAR16ri: case X86::SAR32ri: case X86::SAR64ri:
- case X86::SHL8r1: case X86::SHL16r1: case X86::SHL32r1: case X86::SHL64r1:
- case X86::SHL8rCL: case X86::SHL16rCL: case X86::SHL32rCL: case X86::SHL64rCL:
- case X86::SHL8ri: case X86::SHL16ri: case X86::SHL32ri: case X86::SHL64ri:
- case X86::SHR8r1: case X86::SHR16r1: case X86::SHR32r1: case X86::SHR64r1:
- case X86::SHR8rCL: case X86::SHR16rCL: case X86::SHR32rCL: case X86::SHR64rCL:
- case X86::SHR8ri: case X86::SHR16ri: case X86::SHR32ri: case X86::SHR64ri:
- case X86::SHLD16rrCL: case X86::SHLD32rrCL: case X86::SHLD64rrCL:
- case X86::SHLD16rri8: case X86::SHLD32rri8: case X86::SHLD64rri8:
- case X86::SHRD16rrCL: case X86::SHRD32rrCL: case X86::SHRD64rrCL:
- case X86::SHRD16rri8: case X86::SHRD32rri8: case X86::SHRD64rri8:
-
- // Basic arithmetic is constant time on the input but does set flags.
- case X86::ADC8rr: case X86::ADC8ri:
- case X86::ADC16rr: case X86::ADC16ri: case X86::ADC16ri8:
- case X86::ADC32rr: case X86::ADC32ri: case X86::ADC32ri8:
- case X86::ADC64rr: case X86::ADC64ri8: case X86::ADC64ri32:
- case X86::ADD8rr: case X86::ADD8ri:
- case X86::ADD16rr: case X86::ADD16ri: case X86::ADD16ri8:
- case X86::ADD32rr: case X86::ADD32ri: case X86::ADD32ri8:
- case X86::ADD64rr: case X86::ADD64ri8: case X86::ADD64ri32:
- case X86::AND8rr: case X86::AND8ri:
- case X86::AND16rr: case X86::AND16ri: case X86::AND16ri8:
- case X86::AND32rr: case X86::AND32ri: case X86::AND32ri8:
- case X86::AND64rr: case X86::AND64ri8: case X86::AND64ri32:
- case X86::OR8rr: case X86::OR8ri:
- case X86::OR16rr: case X86::OR16ri: case X86::OR16ri8:
- case X86::OR32rr: case X86::OR32ri: case X86::OR32ri8:
- case X86::OR64rr: case X86::OR64ri8: case X86::OR64ri32:
- case X86::SBB8rr: case X86::SBB8ri:
- case X86::SBB16rr: case X86::SBB16ri: case X86::SBB16ri8:
- case X86::SBB32rr: case X86::SBB32ri: case X86::SBB32ri8:
- case X86::SBB64rr: case X86::SBB64ri8: case X86::SBB64ri32:
- case X86::SUB8rr: case X86::SUB8ri:
- case X86::SUB16rr: case X86::SUB16ri: case X86::SUB16ri8:
- case X86::SUB32rr: case X86::SUB32ri: case X86::SUB32ri8:
- case X86::SUB64rr: case X86::SUB64ri8: case X86::SUB64ri32:
- case X86::XOR8rr: case X86::XOR8ri:
- case X86::XOR16rr: case X86::XOR16ri: case X86::XOR16ri8:
- case X86::XOR32rr: case X86::XOR32ri: case X86::XOR32ri8:
- case X86::XOR64rr: case X86::XOR64ri8: case X86::XOR64ri32:
- // Arithmetic with just 32-bit and 64-bit variants and no immediates.
- case X86::ADCX32rr: case X86::ADCX64rr:
- case X86::ADOX32rr: case X86::ADOX64rr:
- case X86::ANDN32rr: case X86::ANDN64rr:
- // Unary arithmetic operations.
- case X86::DEC8r: case X86::DEC16r: case X86::DEC32r: case X86::DEC64r:
- case X86::INC8r: case X86::INC16r: case X86::INC32r: case X86::INC64r:
- case X86::NEG8r: case X86::NEG16r: case X86::NEG32r: case X86::NEG64r:
- // Check whether the EFLAGS implicit-def is dead. We assume that this will
- // always find the implicit-def because this code should only be reached
- // for instructions that do in fact implicitly def this.
- if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
- // If we would clobber EFLAGS that are used, just bail for now.
- LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
- MI.dump(); dbgs() << "\n");
- return false;
- }
-
- // Otherwise, fallthrough to handle these the same as instructions that
- // don't set EFLAGS.
- LLVM_FALLTHROUGH;
-
- // Unlike other arithmetic, NOT doesn't set EFLAGS.
- case X86::NOT8r: case X86::NOT16r: case X86::NOT32r: case X86::NOT64r:
-
- // Various move instructions used to zero or sign extend things. Note that we
- // intentionally don't support the _NOREX variants as we can't handle that
- // register constraint anyways.
- case X86::MOVSX16rr8:
- case X86::MOVSX32rr8: case X86::MOVSX32rr16:
- case X86::MOVSX64rr8: case X86::MOVSX64rr16: case X86::MOVSX64rr32:
- case X86::MOVZX16rr8:
- case X86::MOVZX32rr8: case X86::MOVZX32rr16:
- case X86::MOVZX64rr8: case X86::MOVZX64rr16:
- case X86::MOV32rr:
-
- // Arithmetic instructions that are both constant time and don't set flags.
- case X86::RORX32ri:
- case X86::RORX64ri:
- case X86::SARX32rr:
- case X86::SARX64rr:
- case X86::SHLX32rr:
- case X86::SHLX64rr:
- case X86::SHRX32rr:
- case X86::SHRX64rr:
-
- // LEA doesn't actually access memory, and its arithmetic is constant time.
- case X86::LEA16r:
- case X86::LEA32r:
- case X86::LEA64_32r:
- case X86::LEA64r:
- return true;
- }
-}
-
-/// Returns true if the instruction has no behavior (specified or otherwise)
-/// that is based on the value loaded from memory or the value of any
-/// non-address register operands.
-///
-/// For example, if the latency of the instruction is dependent on the
-/// particular bits set in any of the registers *or* any of the bits loaded from
-/// memory.
-///
-/// A classical example of something that is inherently not data invariant is an
-/// indirect jump -- the destination is loaded into icache based on the bits set
-/// in the jump destination register.
-///
-/// FIXME: This should become part of our instruction tables.
-static bool isDataInvariantLoad(MachineInstr &MI) {
- switch (MI.getOpcode()) {
- default:
- // By default, assume that the load will immediately leak.
- return false;
-
- // On x86 it is believed that imul is constant time w.r.t. the loaded data.
- // However, they set flags and are perhaps the most surprisingly constant
- // time operations so we call them out here separately.
- case X86::IMUL16rm:
- case X86::IMUL16rmi8:
- case X86::IMUL16rmi:
- case X86::IMUL32rm:
- case X86::IMUL32rmi8:
- case X86::IMUL32rmi:
- case X86::IMUL64rm:
- case X86::IMUL64rmi32:
- case X86::IMUL64rmi8:
-
- // Bit scanning and counting instructions that are somewhat surprisingly
- // constant time as they scan across bits and do other fairly complex
- // operations like popcnt, but are believed to be constant time on x86.
- // However, these set flags.
- case X86::BSF16rm:
- case X86::BSF32rm:
- case X86::BSF64rm:
- case X86::BSR16rm:
- case X86::BSR32rm:
- case X86::BSR64rm:
- case X86::LZCNT16rm:
- case X86::LZCNT32rm:
- case X86::LZCNT64rm:
- case X86::POPCNT16rm:
- case X86::POPCNT32rm:
- case X86::POPCNT64rm:
- case X86::TZCNT16rm:
- case X86::TZCNT32rm:
- case X86::TZCNT64rm:
-
- // Bit manipulation instructions are effectively combinations of basic
- // arithmetic ops, and should still execute in constant time. These also
- // set flags.
- case X86::BLCFILL32rm:
- case X86::BLCFILL64rm:
- case X86::BLCI32rm:
- case X86::BLCI64rm:
- case X86::BLCIC32rm:
- case X86::BLCIC64rm:
- case X86::BLCMSK32rm:
- case X86::BLCMSK64rm:
- case X86::BLCS32rm:
- case X86::BLCS64rm:
- case X86::BLSFILL32rm:
- case X86::BLSFILL64rm:
- case X86::BLSI32rm:
- case X86::BLSI64rm:
- case X86::BLSIC32rm:
- case X86::BLSIC64rm:
- case X86::BLSMSK32rm:
- case X86::BLSMSK64rm:
- case X86::BLSR32rm:
- case X86::BLSR64rm:
- case X86::TZMSK32rm:
- case X86::TZMSK64rm:
-
- // Bit extracting and clearing instructions should execute in constant time,
- // and set flags.
- case X86::BEXTR32rm:
- case X86::BEXTR64rm:
- case X86::BEXTRI32mi:
- case X86::BEXTRI64mi:
- case X86::BZHI32rm:
- case X86::BZHI64rm:
-
- // Basic arithmetic is constant time on the input but does set flags.
- case X86::ADC8rm:
- case X86::ADC16rm:
- case X86::ADC32rm:
- case X86::ADC64rm:
- case X86::ADCX32rm:
- case X86::ADCX64rm:
- case X86::ADD8rm:
- case X86::ADD16rm:
- case X86::ADD32rm:
- case X86::ADD64rm:
- case X86::ADOX32rm:
- case X86::ADOX64rm:
- case X86::AND8rm:
- case X86::AND16rm:
- case X86::AND32rm:
- case X86::AND64rm:
- case X86::ANDN32rm:
- case X86::ANDN64rm:
- case X86::OR8rm:
- case X86::OR16rm:
- case X86::OR32rm:
- case X86::OR64rm:
- case X86::SBB8rm:
- case X86::SBB16rm:
- case X86::SBB32rm:
- case X86::SBB64rm:
- case X86::SUB8rm:
- case X86::SUB16rm:
- case X86::SUB32rm:
- case X86::SUB64rm:
- case X86::XOR8rm:
- case X86::XOR16rm:
- case X86::XOR32rm:
- case X86::XOR64rm:
- // Check whether the EFLAGS implicit-def is dead. We assume that this will
- // always find the implicit-def because this code should only be reached
- // for instructions that do in fact implicitly def this.
- if (!MI.findRegisterDefOperand(X86::EFLAGS)->isDead()) {
- // If we would clobber EFLAGS that are used, just bail for now.
- LLVM_DEBUG(dbgs() << " Unable to harden post-load due to EFLAGS: ";
- MI.dump(); dbgs() << "\n");
- return false;
- }
-
- // Otherwise, fallthrough to handle these the same as instructions that
- // don't set EFLAGS.
- LLVM_FALLTHROUGH;
-
- // Integer multiply w/o affecting flags is still believed to be constant
- // time on x86. Called out separately as this is among the most surprising
- // instructions to exhibit that behavior.
- case X86::MULX32rm:
- case X86::MULX64rm:
-
- // Arithmetic instructions that are both constant time and don't set flags.
- case X86::RORX32mi:
- case X86::RORX64mi:
- case X86::SARX32rm:
- case X86::SARX64rm:
- case X86::SHLX32rm:
- case X86::SHLX64rm:
- case X86::SHRX32rm:
- case X86::SHRX64rm:
-
- // Conversions are believed to be constant time and don't set flags.
- case X86::CVTTSD2SI64rm: case X86::VCVTTSD2SI64rm: case X86::VCVTTSD2SI64Zrm:
- case X86::CVTTSD2SIrm: case X86::VCVTTSD2SIrm: case X86::VCVTTSD2SIZrm:
- case X86::CVTTSS2SI64rm: case X86::VCVTTSS2SI64rm: case X86::VCVTTSS2SI64Zrm:
- case X86::CVTTSS2SIrm: case X86::VCVTTSS2SIrm: case X86::VCVTTSS2SIZrm:
- case X86::CVTSI2SDrm: case X86::VCVTSI2SDrm: case X86::VCVTSI2SDZrm:
- case X86::CVTSI2SSrm: case X86::VCVTSI2SSrm: case X86::VCVTSI2SSZrm:
- case X86::CVTSI642SDrm: case X86::VCVTSI642SDrm: case X86::VCVTSI642SDZrm:
- case X86::CVTSI642SSrm: case X86::VCVTSI642SSrm: case X86::VCVTSI642SSZrm:
- case X86::CVTSS2SDrm: case X86::VCVTSS2SDrm: case X86::VCVTSS2SDZrm:
- case X86::CVTSD2SSrm: case X86::VCVTSD2SSrm: case X86::VCVTSD2SSZrm:
- // AVX512 added unsigned integer conversions.
- case X86::VCVTTSD2USI64Zrm:
- case X86::VCVTTSD2USIZrm:
- case X86::VCVTTSS2USI64Zrm:
- case X86::VCVTTSS2USIZrm:
- case X86::VCVTUSI2SDZrm:
- case X86::VCVTUSI642SDZrm:
- case X86::VCVTUSI2SSZrm:
- case X86::VCVTUSI642SSZrm:
-
- // Loads to register don't set flags.
- case X86::MOV8rm:
- case X86::MOV8rm_NOREX:
- case X86::MOV16rm:
- case X86::MOV32rm:
- case X86::MOV64rm:
- case X86::MOVSX16rm8:
- case X86::MOVSX32rm16:
- case X86::MOVSX32rm8:
- case X86::MOVSX32rm8_NOREX:
- case X86::MOVSX64rm16:
- case X86::MOVSX64rm32:
- case X86::MOVSX64rm8:
- case X86::MOVZX16rm8:
- case X86::MOVZX32rm16:
- case X86::MOVZX32rm8:
- case X86::MOVZX32rm8_NOREX:
- case X86::MOVZX64rm16:
- case X86::MOVZX64rm8:
- return true;
+// Returns true if MI has EFLAGS as a register def operand and that def is
+// live; otherwise returns false.
+static bool isEFLAGSDefLive(const MachineInstr &MI) {
+ if (const MachineOperand *DefOp = MI.findRegisterDefOperand(X86::EFLAGS)) {
+ return !DefOp->isDead();
}
+ return false;
}
static bool isEFLAGSLive(MachineBasicBlock &MBB, MachineBasicBlock::iterator I,
@@ -1740,8 +1365,9 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
// address registers, queue it up to be hardened post-load. Notably,
// even once hardened this won't introduce a useful dependency that
// could prune out subsequent loads.
- if (EnablePostLoadHardening && isDataInvariantLoad(MI) &&
- MI.getDesc().getNumDefs() == 1 && MI.getOperand(0).isReg() &&
+ if (EnablePostLoadHardening && X86InstrInfo::isDataInvariantLoad(MI) &&
+ !isEFLAGSDefLive(MI) && MI.getDesc().getNumDefs() == 1 &&
+ MI.getOperand(0).isReg() &&
canHardenRegister(MI.getOperand(0).getReg()) &&
!HardenedAddrRegs.count(BaseReg) &&
!HardenedAddrRegs.count(IndexReg)) {
@@ -1795,9 +1421,10 @@ void X86SpeculativeLoadHardeningPass::tracePredStateThroughBlocksAndHarden(
if (HardenPostLoad.erase(&MI)) {
assert(!MI.isCall() && "Must not try to post-load harden a call!");
- // If this is a data-invariant load, we want to try and sink any
- // hardening as far as possible.
- if (isDataInvariantLoad(MI)) {
+ // If this is a data-invariant load and there is no EFLAGS
+ // interference, we want to try and sink any hardening as far as
+ // possible.
+ if (X86InstrInfo::isDataInvariantLoad(MI) && !isEFLAGSDefLive(MI)) {
// Sink the instruction we'll need to harden as far as we can down
// the graph.
MachineInstr *SunkMI = sinkPostLoadHardenedInst(MI, HardenPostLoad);
@@ -2085,9 +1712,9 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
// Broadcast our state into a vector register.
Register VStateReg = MRI->createVirtualRegister(OpRC);
- unsigned BroadcastOp =
- Is128Bit ? X86::VPBROADCASTQrZ128r
- : Is256Bit ? X86::VPBROADCASTQrZ256r : X86::VPBROADCASTQrZr;
+ unsigned BroadcastOp = Is128Bit ? X86::VPBROADCASTQrZ128rr
+ : Is256Bit ? X86::VPBROADCASTQrZ256rr
+ : X86::VPBROADCASTQrZrr;
auto BroadcastI =
BuildMI(MBB, InsertPt, Loc, TII->get(BroadcastOp), VStateReg)
.addReg(StateReg);
@@ -2147,8 +1774,11 @@ void X86SpeculativeLoadHardeningPass::hardenLoadAddr(
MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
MachineInstr &InitialMI, SmallPtrSetImpl<MachineInstr *> &HardenedInstrs) {
- assert(isDataInvariantLoad(InitialMI) &&
+ assert(X86InstrInfo::isDataInvariantLoad(InitialMI) &&
"Cannot get here with a non-invariant load!");
+ assert(!isEFLAGSDefLive(InitialMI) &&
+ "Cannot get here with a data invariant load "
+ "that interferes with EFLAGS!");
// See if we can sink hardening the loaded value.
auto SinkCheckToSingleUse =
@@ -2160,14 +1790,14 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// own.
MachineInstr *SingleUseMI = nullptr;
for (MachineInstr &UseMI : MRI->use_instructions(DefReg)) {
- // If we're already going to harden this use, it is data invariant and
- // within our block.
+ // If we're already going to harden this use, it is data invariant, it
+ // does not interfere with EFLAGS, and within our block.
if (HardenedInstrs.count(&UseMI)) {
- if (!isDataInvariantLoad(UseMI)) {
+ if (!X86InstrInfo::isDataInvariantLoad(UseMI) || isEFLAGSDefLive(UseMI)) {
// If we've already decided to harden a non-load, we must have sunk
// some other post-load hardened instruction to it and it must itself
// be data-invariant.
- assert(isDataInvariant(UseMI) &&
+ assert(X86InstrInfo::isDataInvariant(UseMI) &&
"Data variant instruction being hardened!");
continue;
}
@@ -2199,7 +1829,8 @@ MachineInstr *X86SpeculativeLoadHardeningPass::sinkPostLoadHardenedInst(
// If this single use isn't data invariant, isn't in this block, or has
// interfering EFLAGS, we can't sink the hardening to it.
- if (!isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent())
+ if (!X86InstrInfo::isDataInvariant(UseMI) || UseMI.getParent() != MI.getParent() ||
+ isEFLAGSDefLive(UseMI))
return {};
// If this instruction defines multiple registers bail as we won't harden
@@ -2590,10 +2221,10 @@ void X86SpeculativeLoadHardeningPass::hardenIndirectCallOrJumpInstr(
switch (MI.getOpcode()) {
case X86::FARCALL16m:
case X86::FARCALL32m:
- case X86::FARCALL64:
+ case X86::FARCALL64m:
case X86::FARJMP16m:
case X86::FARJMP32m:
- case X86::FARJMP64:
+ case X86::FARJMP64m:
// We don't need to harden either far calls or far jumps as they are
// safe from Spectre.
return;
diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
index 75c3a70b430a..975cbabb30fd 100644
--- a/llvm/lib/Target/X86/X86Subtarget.cpp
+++ b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -10,14 +10,13 @@
//
//===----------------------------------------------------------------------===//
+#include "X86Subtarget.h"
+#include "MCTargetDesc/X86BaseInfo.h"
#include "X86.h"
-
#include "X86CallLowering.h"
#include "X86LegalizerInfo.h"
#include "X86MacroFusion.h"
#include "X86RegisterBankInfo.h"
-#include "X86Subtarget.h"
-#include "MCTargetDesc/X86BaseInfo.h"
#include "X86TargetMachine.h"
#include "llvm/ADT/Triple.h"
#include "llvm/CodeGen/GlobalISel/CallLowering.h"
@@ -89,7 +88,9 @@ X86Subtarget::classifyLocalReference(const GlobalValue *GV) const {
// Medium is a hybrid: RIP-rel for code, GOTOFF for DSO local data.
case CodeModel::Medium:
- if (isa<Function>(GV))
+ // Constant pool and jump table handling pass a nullptr to this
+ // function so we need to use isa_and_nonnull.
+ if (isa_and_nonnull<Function>(GV))
return X86II::MO_NO_FLAG; // All code is RIP-relative
return X86II::MO_GOTOFF; // Local symbols use GOTOFF.
}
@@ -227,11 +228,11 @@ bool X86Subtarget::isLegalToCallImmediateAddr() const {
}
void X86Subtarget::initSubtargetFeatures(StringRef CPU, StringRef FS) {
- std::string CPUName = CPU;
+ std::string CPUName = std::string(CPU);
if (CPUName.empty())
CPUName = "generic";
- std::string FullFS = FS;
+ std::string FullFS = std::string(FS);
if (In64BitMode) {
// SSE2 should default to enabled in 64-bit mode, but can be turned off
// explicitly.
@@ -379,3 +380,7 @@ void X86Subtarget::getPostRAMutations(
std::vector<std::unique_ptr<ScheduleDAGMutation>> &Mutations) const {
Mutations.push_back(createX86MacroFusionDAGMutation());
}
+
+bool X86Subtarget::isPositionIndependent() const {
+ return TM.isPositionIndependent();
+}
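The classifyLocalReference change above switches to isa_and_nonnull because constant-pool and jump-table references reach it with a null GlobalValue. A small sketch of that pattern (illustrative function, not part of the patch):

#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// isa<> asserts on a null pointer; isa_and_nonnull<> simply returns false,
// which is what a possibly-null GV coming from constant-pool or jump-table
// lowering needs.
static bool isCodeReference(const GlobalValue *GV) {
  return isa_and_nonnull<Function>(GV);
}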
diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
index f4e8d30328ca..de45d357e3c2 100644
--- a/llvm/lib/Target/X86/X86Subtarget.h
+++ b/llvm/lib/Target/X86/X86Subtarget.h
@@ -17,15 +17,9 @@
#include "X86ISelLowering.h"
#include "X86InstrInfo.h"
#include "X86SelectionDAGInfo.h"
-#include "llvm/ADT/StringRef.h"
#include "llvm/ADT/Triple.h"
-#include "llvm/CodeGen/GlobalISel/CallLowering.h"
-#include "llvm/CodeGen/GlobalISel/InstructionSelector.h"
-#include "llvm/CodeGen/GlobalISel/LegalizerInfo.h"
-#include "llvm/CodeGen/GlobalISel/RegisterBankInfo.h"
#include "llvm/CodeGen/TargetSubtargetInfo.h"
#include "llvm/IR/CallingConv.h"
-#include "llvm/Target/TargetMachine.h"
#include <climits>
#include <memory>
@@ -34,7 +28,13 @@
namespace llvm {
+class CallLowering;
class GlobalValue;
+class InstructionSelector;
+class LegalizerInfo;
+class RegisterBankInfo;
+class StringRef;
+class TargetMachine;
/// The X86 backend supports a number of different styles of PIC.
///
@@ -258,6 +258,10 @@ protected:
bool InsertVZEROUPPER = false;
/// True if there is no performance penalty for writing NOPs with up to
+ /// 7 bytes.
+ bool HasFast7ByteNOP = false;
+
+ /// True if there is no performance penalty for writing NOPs with up to
/// 11 bytes.
bool HasFast11ByteNOP = false;
@@ -393,6 +397,17 @@ protected:
/// Processor supports PCONFIG instruction
bool HasPCONFIG = false;
+ /// Processor supports SERIALIZE instruction
+ bool HasSERIALIZE = false;
+
+ /// Processor supports TSXLDTRK instruction
+ bool HasTSXLDTRK = false;
+
+ /// Processor has AMX support
+ bool HasAMXTILE = false;
+ bool HasAMXBF16 = false;
+ bool HasAMXINT8 = false;
+
/// Processor has a single uop BEXTR implementation.
bool HasFastBEXTR = false;
@@ -421,6 +436,19 @@ protected:
/// than emitting one inside the compiler.
bool UseRetpolineExternalThunk = false;
+ /// Prevent generation of indirect call/branch instructions from memory,
+ /// and force all indirect call/branch instructions from a register to be
+ /// preceded by an LFENCE. Also decompose RET instructions into a
+ /// POP+LFENCE+JMP sequence.
+ bool UseLVIControlFlowIntegrity = false;
+
+ /// Enable Speculative Execution Side Effect Suppression
+ bool UseSpeculativeExecutionSideEffectSuppression = false;
+
+ /// Insert LFENCE instructions to prevent data speculatively injected into
+ /// loads from being used maliciously.
+ bool UseLVILoadHardening = false;
+
/// Use software floating point for code generation.
bool UseSoftFloat = false;
@@ -627,8 +655,15 @@ public:
bool hasRTM() const { return HasRTM; }
bool hasADX() const { return HasADX; }
bool hasSHA() const { return HasSHA; }
- bool hasPRFCHW() const { return HasPRFCHW || HasPREFETCHWT1; }
+ bool hasPRFCHW() const { return HasPRFCHW; }
bool hasPREFETCHWT1() const { return HasPREFETCHWT1; }
+ bool hasPrefetchW() const {
+ // The PREFETCHW instruction was added with 3DNow but later CPUs gave it
+ // its own CPUID bit as part of deprecating 3DNow. Intel eventually added
+ // it and KNL has another that prefetches to L2 cache. We assume the
+ // L1 version exists if the L2 version does.
+ return has3DNow() || hasPRFCHW() || hasPREFETCHWT1();
+ }
bool hasSSEPrefetch() const {
// We implicitly enable these when we have a write prefix supporting cache
// level OR if we have prfchw, but don't already have a read prefetch from
@@ -702,13 +737,34 @@ public:
bool threewayBranchProfitable() const { return ThreewayBranchProfitable; }
bool hasINVPCID() const { return HasINVPCID; }
bool hasENQCMD() const { return HasENQCMD; }
+ bool hasSERIALIZE() const { return HasSERIALIZE; }
+ bool hasTSXLDTRK() const { return HasTSXLDTRK; }
bool useRetpolineIndirectCalls() const { return UseRetpolineIndirectCalls; }
bool useRetpolineIndirectBranches() const {
return UseRetpolineIndirectBranches;
}
+ bool hasAMXTILE() const { return HasAMXTILE; }
+ bool hasAMXBF16() const { return HasAMXBF16; }
+ bool hasAMXINT8() const { return HasAMXINT8; }
bool useRetpolineExternalThunk() const { return UseRetpolineExternalThunk; }
+
+ // These are generic getters that OR together all of the thunk types
+ // supported by the subtarget. Therefore useIndirectThunk*() will return true
+ // if any of the corresponding thunk features is enabled.
+ bool useIndirectThunkCalls() const {
+ return useRetpolineIndirectCalls() || useLVIControlFlowIntegrity();
+ }
+ bool useIndirectThunkBranches() const {
+ return useRetpolineIndirectBranches() || useLVIControlFlowIntegrity();
+ }
+
bool preferMaskRegisters() const { return PreferMaskRegisters; }
bool useGLMDivSqrtCosts() const { return UseGLMDivSqrtCosts; }
+ bool useLVIControlFlowIntegrity() const { return UseLVIControlFlowIntegrity; }
+ bool useLVILoadHardening() const { return UseLVILoadHardening; }
+ bool useSpeculativeExecutionSideEffectSuppression() const {
+ return UseSpeculativeExecutionSideEffectSuppression;
+ }
unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
@@ -806,7 +862,7 @@ public:
return PICStyle == PICStyles::Style::StubPIC;
}
- bool isPositionIndependent() const { return TM.isPositionIndependent(); }
+ bool isPositionIndependent() const;
bool isCallingConvWin64(CallingConv::ID CC) const {
switch (CC) {
@@ -853,10 +909,10 @@ public:
/// Return true if the subtarget allows calls to immediate address.
bool isLegalToCallImmediateAddr() const;
- /// If we are using retpolines, we need to expand indirectbr to avoid it
+ /// If we are using indirect thunks, we need to expand indirectbr to avoid it
/// lowering to an actual indirect jump.
bool enableIndirectBrExpand() const override {
- return useRetpolineIndirectBranches();
+ return useIndirectThunkBranches();
}
/// Enable the MachineScheduler pass for all X86 subtargets.
diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
index 8c696e9adbed..7344116e14af 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.cpp
+++ b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -73,16 +73,22 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializeX86Target() {
initializeEvexToVexInstPassPass(PR);
initializeFixupLEAPassPass(PR);
initializeFPSPass(PR);
+ initializeX86FixupSetCCPassPass(PR);
initializeX86CallFrameOptimizationPass(PR);
initializeX86CmovConverterPassPass(PR);
initializeX86ExpandPseudoPass(PR);
initializeX86ExecutionDomainFixPass(PR);
initializeX86DomainReassignmentPass(PR);
initializeX86AvoidSFBPassPass(PR);
+ initializeX86AvoidTrailingCallPassPass(PR);
initializeX86SpeculativeLoadHardeningPassPass(PR);
+ initializeX86SpeculativeExecutionSideEffectSuppressionPass(PR);
initializeX86FlagsCopyLoweringPassPass(PR);
initializeX86CondBrFoldingPassPass(PR);
+ initializeX86LoadValueInjectionLoadHardeningPassPass(PR);
+ initializeX86LoadValueInjectionRetHardeningPassPass(PR);
initializeX86OptimizeLEAPassPass(PR);
+ initializeX86PartialReductionPass(PR);
}
static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
@@ -92,19 +98,9 @@ static std::unique_ptr<TargetLoweringObjectFile> createTLOF(const Triple &TT) {
return std::make_unique<TargetLoweringObjectFileMachO>();
}
- if (TT.isOSFreeBSD())
- return std::make_unique<X86FreeBSDTargetObjectFile>();
- if (TT.isOSLinux() || TT.isOSNaCl() || TT.isOSIAMCU())
- return std::make_unique<X86LinuxNaClTargetObjectFile>();
- if (TT.isOSSolaris())
- return std::make_unique<X86SolarisTargetObjectFile>();
- if (TT.isOSFuchsia())
- return std::make_unique<X86FuchsiaTargetObjectFile>();
- if (TT.isOSBinFormatELF())
- return std::make_unique<X86ELFTargetObjectFile>();
if (TT.isOSBinFormatCOFF())
return std::make_unique<TargetLoweringObjectFileCOFF>();
- llvm_unreachable("unknown subtarget type");
+ return std::make_unique<X86ELFTargetObjectFile>();
}
static std::string computeDataLayout(const Triple &TT) {
@@ -222,7 +218,7 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
getEffectiveRelocModel(TT, JIT, RM),
getEffectiveX86CodeModel(CM, JIT, TT.getArch() == Triple::x86_64),
OL),
- TLOF(createTLOF(getTargetTriple())) {
+ TLOF(createTLOF(getTargetTriple())), IsJIT(JIT) {
// On PS4, the "return address" of a 'noreturn' call must still be within
// the calling function, and TrapUnreachable is an easy way to get that.
if (TT.isPS4() || TT.isOSBinFormatMachO()) {
@@ -232,6 +228,9 @@ X86TargetMachine::X86TargetMachine(const Target &T, const Triple &TT,
setMachineOutliner(true);
+ // x86 supports debug entry values.
+ setSupportsDebugEntryValues(true);
+
initAsmInfo();
}
@@ -315,14 +314,6 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
}
//===----------------------------------------------------------------------===//
-// Command line options for x86
-//===----------------------------------------------------------------------===//
-static cl::opt<bool>
-UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
- cl::desc("Minimize AVX to SSE transition penalty"),
- cl::init(true));
-
-//===----------------------------------------------------------------------===//
// X86 TTI query.
//===----------------------------------------------------------------------===//
@@ -406,8 +397,10 @@ void X86PassConfig::addIRPasses() {
TargetPassConfig::addIRPasses();
- if (TM->getOptLevel() != CodeGenOpt::None)
+ if (TM->getOptLevel() != CodeGenOpt::None) {
addPass(createInterleavedAccessPass());
+ addPass(createX86PartialReductionPass());
+ }
// Add passes that handle indirect branch removal and insertion of a retpoline
// thunk. These will be a no-op unless a function subtarget has the retpoline
@@ -496,6 +489,12 @@ void X86PassConfig::addMachineSSAOptimization() {
void X86PassConfig::addPostRegAlloc() {
addPass(createX86FloatingPointStackifierPass());
+ // When -O0 is enabled, the Load Value Injection Hardening pass will fall back
+ // to using the Speculative Execution Side Effect Suppression pass for
+ // mitigation. This avoids slowdowns due to the analyses needed by the
+ // LVI load hardening pass when compiling at -O0.
+ if (getOptLevel() != CodeGenOpt::None)
+ addPass(createX86LoadValueInjectionLoadHardeningPass());
}
void X86PassConfig::addPreSched2() { addPass(createX86ExpandPseudoPass()); }
@@ -508,24 +507,34 @@ void X86PassConfig::addPreEmitPass() {
addPass(createX86IndirectBranchTrackingPass());
- if (UseVZeroUpper)
- addPass(createX86IssueVZeroUpperPass());
+ addPass(createX86IssueVZeroUpperPass());
if (getOptLevel() != CodeGenOpt::None) {
addPass(createX86FixupBWInsts());
addPass(createX86PadShortFunctions());
addPass(createX86FixupLEAs());
- addPass(createX86EvexToVexInsts());
}
+ addPass(createX86EvexToVexInsts());
addPass(createX86DiscriminateMemOpsPass());
addPass(createX86InsertPrefetchPass());
+ addPass(createX86InsertX87waitPass());
}
void X86PassConfig::addPreEmitPass2() {
const Triple &TT = TM->getTargetTriple();
const MCAsmInfo *MAI = TM->getMCAsmInfo();
- addPass(createX86RetpolineThunksPass());
+ // The X86 Speculative Execution Side Effect Suppression pass must run after
+ // all passes that modify the control flow graph, so it is scheduled right
+ // before the X86 indirect thunks pass. It must run after CFG modifications
+ // because the model of LFENCE in LLVM still has to be updated
+ // (FIXME: https://bugs.llvm.org/show_bug.cgi?id=45167). The placement of this
+ // pass was hand-checked to ensure that subsequent passes do not move code
+ // around the LFENCEs in a way that would hurt its correctness, and it has
+ // been validated by hand inspection of the codegen output.
+ addPass(createX86SpeculativeExecutionSideEffectSuppression());
+ addPass(createX86IndirectThunksPass());
// Insert extra int3 instructions after trailing call instructions to avoid
// issues in the unwinder.
@@ -542,6 +551,7 @@ void X86PassConfig::addPreEmitPass2() {
// Identify valid longjmp targets for Windows Control Flow Guard.
if (TT.isOSWindows())
addPass(createCFGuardLongjmpPass());
+ addPass(createX86LoadValueInjectionRetHardeningPass());
}
std::unique_ptr<CSEConfigBase> X86PassConfig::getCSEConfig() const {
diff --git a/llvm/lib/Target/X86/X86TargetMachine.h b/llvm/lib/Target/X86/X86TargetMachine.h
index ec3db7b1e9e8..8d98474a39c0 100644
--- a/llvm/lib/Target/X86/X86TargetMachine.h
+++ b/llvm/lib/Target/X86/X86TargetMachine.h
@@ -23,13 +23,13 @@
namespace llvm {
class StringRef;
-class X86Subtarget;
-class X86RegisterBankInfo;
class TargetTransformInfo;
class X86TargetMachine final : public LLVMTargetMachine {
std::unique_ptr<TargetLoweringObjectFile> TLOF;
mutable StringMap<std::unique_ptr<X86Subtarget>> SubtargetMap;
+ // True if this is used in JIT.
+ bool IsJIT;
public:
X86TargetMachine(const Target &T, const Triple &TT, StringRef CPU,
@@ -52,6 +52,8 @@ public:
TargetLoweringObjectFile *getObjFileLowering() const override {
return TLOF.get();
}
+
+ bool isJIT() const { return IsJIT; }
};
} // end namespace llvm
diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.cpp b/llvm/lib/Target/X86/X86TargetObjectFile.cpp
index 44185957686b..2b48baccc01f 100644
--- a/llvm/lib/Target/X86/X86TargetObjectFile.cpp
+++ b/llvm/lib/Target/X86/X86TargetObjectFile.cpp
@@ -18,6 +18,7 @@
#include "llvm/MC/MCSectionCOFF.h"
#include "llvm/MC/MCSectionELF.h"
#include "llvm/MC/MCValue.h"
+#include "llvm/Target/TargetMachine.h"
using namespace llvm;
using namespace dwarf;
@@ -63,30 +64,3 @@ const MCExpr *X86ELFTargetObjectFile::getDebugThreadLocalSymbol(
const MCSymbol *Sym) const {
return MCSymbolRefExpr::create(Sym, MCSymbolRefExpr::VK_DTPOFF, getContext());
}
-
-void
-X86FreeBSDTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
-
-void
-X86FuchsiaTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
-
-void
-X86LinuxNaClTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
-
-void X86SolarisTargetObjectFile::Initialize(MCContext &Ctx,
- const TargetMachine &TM) {
- TargetLoweringObjectFileELF::Initialize(Ctx, TM);
- InitializeELF(TM.Options.UseInitArray);
-}
diff --git a/llvm/lib/Target/X86/X86TargetObjectFile.h b/llvm/lib/Target/X86/X86TargetObjectFile.h
index 1fd0bbf56b19..acea772eb036 100644
--- a/llvm/lib/Target/X86/X86TargetObjectFile.h
+++ b/llvm/lib/Target/X86/X86TargetObjectFile.h
@@ -10,7 +10,6 @@
#define LLVM_LIB_TARGET_X86_X86TARGETOBJECTFILE_H
#include "llvm/CodeGen/TargetLoweringObjectFileImpl.h"
-#include "llvm/Target/TargetLoweringObjectFile.h"
namespace llvm {
@@ -44,33 +43,10 @@ namespace llvm {
X86ELFTargetObjectFile() {
PLTRelativeVariantKind = MCSymbolRefExpr::VK_PLT;
}
-
/// Describe a TLS variable address within debug info.
const MCExpr *getDebugThreadLocalSymbol(const MCSymbol *Sym) const override;
};
- /// X86FreeBSDTargetObjectFile - This implementation is used for FreeBSD
- /// on x86 and x86-64.
- class X86FreeBSDTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
- /// This implementation is used for Fuchsia on x86-64.
- class X86FuchsiaTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
- /// X86LinuxNaClTargetObjectFile - This implementation is used for linux and
- /// Native Client on x86 and x86-64.
- class X86LinuxNaClTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
- /// This implementation is used for Solaris on x86/x86-64.
- class X86SolarisTargetObjectFile : public X86ELFTargetObjectFile {
- void Initialize(MCContext &Ctx, const TargetMachine &TM) override;
- };
-
} // end namespace llvm
#endif
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index b754836ea517..cc18e55656ef 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -170,12 +170,18 @@ unsigned X86TTIImpl::getMaxInterleaveFactor(unsigned VF) {
}
int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind,
TTI::OperandValueKind Op1Info,
TTI::OperandValueKind Op2Info,
TTI::OperandValueProperties Opd1PropInfo,
TTI::OperandValueProperties Opd2PropInfo,
ArrayRef<const Value *> Args,
const Instruction *CxtI) {
+ // TODO: Handle more cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info,
+ Op2Info, Opd1PropInfo,
+ Opd2PropInfo, Args, CxtI);
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
@@ -256,20 +262,25 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// The OperandValue properties may not be the same as that of the previous
// operation; conservatively assume OP_None.
int Cost =
- 2 * getArithmeticInstrCost(Instruction::AShr, Ty, Op1Info, Op2Info,
+ 2 * getArithmeticInstrCost(Instruction::AShr, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ Cost += getArithmeticInstrCost(Instruction::LShr, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
- Cost += getArithmeticInstrCost(Instruction::Add, Ty, Op1Info, Op2Info,
+ Cost += getArithmeticInstrCost(Instruction::Add, Ty, CostKind, Op1Info,
+ Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
if (ISD == ISD::SREM) {
// For SREM: (X % C) is the equivalent of (X - (X/C)*C)
- Cost += getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info);
- Cost += getArithmeticInstrCost(Instruction::Sub, Ty, Op1Info, Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Mul, Ty, CostKind, Op1Info,
+ Op2Info);
+ Cost += getArithmeticInstrCost(Instruction::Sub, Ty, CostKind, Op1Info,
+ Op2Info);
}
return Cost;
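As a quick sanity check of the SREM identity used above, X % C == X - (X / C) * C holds for C++'s truncating signed division (illustrative values, not taken from the patch):

  #include <cassert>

  int main() {
    int X = -17, C = 5;
    int Q = X / C;        // -3: C++ integer division truncates toward zero
    int R = X - Q * C;    // -17 - (-15) = -2
    assert(R == X % C);   // matches -17 % 5 == -2
    return 0;
  }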
@@ -277,12 +288,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
// Vector unsigned division/remainder will be simplified to shifts/masks.
if (ISD == ISD::UDIV)
- return getArithmeticInstrCost(Instruction::LShr, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::LShr, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
else // UREM
- return getArithmeticInstrCost(Instruction::And, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::And, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
}
@@ -304,6 +317,10 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v2i64, 1 },
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+
+ { ISD::SHL, MVT::v64i8, 4 }, // psllw + pand.
+ { ISD::SRL, MVT::v64i8, 4 }, // psrlw + pand.
+ { ISD::SRA, MVT::v64i8, 8 }, // psrlw, pand, pxor, psubb.
};
if (Op2Info == TargetTransformInfo::OK_UniformConstantValue &&
@@ -370,6 +387,14 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SREM, MVT::v16i32, 17 }, // vpmuldq+mul+sub sequence
{ ISD::UDIV, MVT::v16i32, 15 }, // vpmuludq sequence
{ ISD::UREM, MVT::v16i32, 17 }, // vpmuludq+mul+sub sequence
+ { ISD::SDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::SREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v64i8, 28 }, // 4*ext+4*pmulhw sequence
+ { ISD::UREM, MVT::v64i8, 32 }, // 4*ext+4*pmulhw+mul+sub sequence
+ { ISD::SDIV, MVT::v32i16, 12 }, // 2*vpmulhw sequence
+ { ISD::SREM, MVT::v32i16, 16 }, // 2*vpmulhw+mul+sub sequence
+ { ISD::UDIV, MVT::v32i16, 12 }, // 2*vpmulhuw sequence
+ { ISD::UREM, MVT::v32i16, 16 }, // 2*vpmulhuw+mul+sub sequence
};
if ((Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
@@ -446,11 +471,32 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
}
+ static const CostTblEntry AVX512BWShiftCostTable[] = {
+ { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
+
+ { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
+ { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
+ { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
+ };
+
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWShiftCostTable, ISD, LT.second))
+ return LT.first * Entry->Cost;
+
static const CostTblEntry AVX2UniformCostTable[] = {
// Uniform splats are cheaper for the following instructions.
{ ISD::SHL, MVT::v16i16, 1 }, // psllw.
{ ISD::SRL, MVT::v16i16, 1 }, // psrlw.
{ ISD::SRA, MVT::v16i16, 1 }, // psraw.
+ { ISD::SHL, MVT::v32i16, 2 }, // 2*psllw.
+ { ISD::SRL, MVT::v32i16, 2 }, // 2*psrlw.
+ { ISD::SRA, MVT::v32i16, 2 }, // 2*psraw.
};
if (ST->hasAVX2() &&
@@ -495,18 +541,6 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
return LT.first * Entry->Cost;
static const CostTblEntry AVX512BWCostTable[] = {
- { ISD::SHL, MVT::v8i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v8i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v8i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v16i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v16i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v16i16, 1 }, // vpsravw
-
- { ISD::SHL, MVT::v32i16, 1 }, // vpsllvw
- { ISD::SRL, MVT::v32i16, 1 }, // vpsrlvw
- { ISD::SRA, MVT::v32i16, 1 }, // vpsravw
-
{ ISD::SHL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRL, MVT::v64i8, 11 }, // vpblendvb sequence.
{ ISD::SRA, MVT::v64i8, 24 }, // vpblendvb sequence.
@@ -533,6 +567,7 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRA, MVT::v4i64, 1 },
{ ISD::SRA, MVT::v8i64, 1 },
+ { ISD::MUL, MVT::v64i8, 26 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v32i8, 13 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i8, 5 }, // extend/pmullw/trunc sequence.
{ ISD::MUL, MVT::v16i32, 1 }, // pmulld (Skylake from agner.org)
@@ -568,6 +603,18 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
{ ISD::SRL, MVT::v4i64, 1 },
};
+ if (ST->hasAVX512()) {
+ if (ISD == ISD::SHL && LT.second == MVT::v32i16 &&
+ (Op2Info == TargetTransformInfo::OK_UniformConstantValue ||
+ Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
+ // On AVX512, a packed v32i16 shift left by a constant build_vector
+ // is lowered into a vector multiply (vpmullw).
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
+ TargetTransformInfo::OP_None,
+ TargetTransformInfo::OP_None);
+ }
+
// Look for AVX2 lowering tricks.
if (ST->hasAVX2()) {
if (ISD == ISD::SHL && LT.second == MVT::v16i16 &&
@@ -575,7 +622,8 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
Op2Info == TargetTransformInfo::OK_NonUniformConstantValue))
// On AVX2, a packed v16i16 shift left by a constant build_vector
// is lowered into a vector multiply (vpmullw).
- return getArithmeticInstrCost(Instruction::Mul, Ty, Op1Info, Op2Info,
+ return getArithmeticInstrCost(Instruction::Mul, Ty, CostKind,
+ Op1Info, Op2Info,
TargetTransformInfo::OP_None,
TargetTransformInfo::OP_None);
@@ -667,13 +715,19 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
static const CostTblEntry AVX2CostTable[] = {
{ ISD::SHL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SHL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SHL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SHL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRL, MVT::v32i8, 11 }, // vpblendvb sequence.
+ { ISD::SRL, MVT::v64i8, 22 }, // 2*vpblendvb sequence.
{ ISD::SRL, MVT::v16i16, 10 }, // extend/vpsrlvd/pack sequence.
+ { ISD::SRL, MVT::v32i16, 20 }, // 2*extend/vpsrlvd/pack sequence.
{ ISD::SRA, MVT::v32i8, 24 }, // vpblendvb sequence.
+ { ISD::SRA, MVT::v64i8, 48 }, // 2*vpblendvb sequence.
{ ISD::SRA, MVT::v16i16, 10 }, // extend/vpsravd/pack sequence.
+ { ISD::SRA, MVT::v32i16, 20 }, // 2*extend/vpsravd/pack sequence.
{ ISD::SRA, MVT::v2i64, 4 }, // srl/xor/sub sequence.
{ ISD::SRA, MVT::v4i64, 4 }, // srl/xor/sub sequence.
@@ -877,20 +931,20 @@ int X86TTIImpl::getArithmeticInstrCost(unsigned Opcode, Type *Ty,
if (LT.second.isVector() && (ISD == ISD::SDIV || ISD == ISD::SREM ||
ISD == ISD::UDIV || ISD == ISD::UREM)) {
int ScalarCost = getArithmeticInstrCost(
- Opcode, Ty->getScalarType(), Op1Info, Op2Info,
+ Opcode, Ty->getScalarType(), CostKind, Op1Info, Op2Info,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
return 20 * LT.first * LT.second.getVectorNumElements() * ScalarCost;
}
// Fallback to the default implementation.
- return BaseT::getArithmeticInstrCost(Opcode, Ty, Op1Info, Op2Info);
+ return BaseT::getArithmeticInstrCost(Opcode, Ty, CostKind, Op1Info, Op2Info);
}
-int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
- Type *SubTp) {
+int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, VectorType *BaseTp,
+ int Index, VectorType *SubTp) {
// 64-bit packed float vectors (v2f32) are widened to type v4f32.
// 64-bit packed integer vectors (v2i32) are widened to type v4i32.
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Tp);
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, BaseTp);
// Treat Transpose as 2-op shuffles - there's no difference in lowering.
if (Kind == TTI::SK_Transpose)
@@ -919,19 +973,19 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
// FIXME: Remove some of the alignment restrictions.
// FIXME: We can use permq for 64-bit or larger extracts from 256-bit
// vectors.
- int OrigSubElts = SubTp->getVectorNumElements();
- if (NumSubElts > OrigSubElts &&
- (Index % OrigSubElts) == 0 && (NumSubElts % OrigSubElts) == 0 &&
+ int OrigSubElts = cast<FixedVectorType>(SubTp)->getNumElements();
+ if (NumSubElts > OrigSubElts && (Index % OrigSubElts) == 0 &&
+ (NumSubElts % OrigSubElts) == 0 &&
LT.second.getVectorElementType() ==
- SubLT.second.getVectorElementType() &&
+ SubLT.second.getVectorElementType() &&
LT.second.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits()) {
+ BaseTp->getElementType()->getPrimitiveSizeInBits()) {
assert(NumElts >= NumSubElts && NumElts > OrigSubElts &&
"Unexpected number of elements!");
- Type *VecTy = VectorType::get(Tp->getVectorElementType(),
- LT.second.getVectorNumElements());
- Type *SubTy = VectorType::get(Tp->getVectorElementType(),
- SubLT.second.getVectorNumElements());
+ auto *VecTy = FixedVectorType::get(BaseTp->getElementType(),
+ LT.second.getVectorNumElements());
+ auto *SubTy = FixedVectorType::get(BaseTp->getElementType(),
+ SubLT.second.getVectorNumElements());
int ExtractIndex = alignDown((Index % NumElts), NumSubElts);
int ExtractCost = getShuffleCost(TTI::SK_ExtractSubvector, VecTy,
ExtractIndex, SubTy);
@@ -949,6 +1003,42 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
}
}
+ // Handle some common (illegal) sub-vector types as they are often very cheap
+ // to shuffle even on targets without PSHUFB.
+ EVT VT = TLI->getValueType(DL, BaseTp);
+ if (VT.isSimple() && VT.isVector() && VT.getSizeInBits() < 128 &&
+ !ST->hasSSSE3()) {
+ static const CostTblEntry SSE2SubVectorShuffleTbl[] = {
+ {TTI::SK_Broadcast, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Broadcast, MVT::v8i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v4i8, 2}, // punpck/pshuflw
+ {TTI::SK_Broadcast, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_Reverse, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_Reverse, MVT::v4i8, 3}, // punpck/pshuflw/packus
+ {TTI::SK_Reverse, MVT::v2i8, 1}, // punpck
+
+ {TTI::SK_PermuteTwoSrc, MVT::v4i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i16, 2}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v8i8, 7}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v4i8, 4}, // punpck/pshuflw
+ {TTI::SK_PermuteTwoSrc, MVT::v2i8, 2}, // punpck
+
+ {TTI::SK_PermuteSingleSrc, MVT::v4i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i16, 1}, // pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v8i8, 5}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v4i8, 3}, // punpck/pshuflw
+ {TTI::SK_PermuteSingleSrc, MVT::v2i8, 1}, // punpck
+ };
+
+ if (ST->hasSSE2())
+ if (const auto *Entry =
+ CostTableLookup(SSE2SubVectorShuffleTbl, Kind, VT.getSimpleVT()))
+ return Entry->Cost;
+ }
+
// We are going to permute multiple sources and the result will be in multiple
// destinations. Providing an accurate cost only for splits where the element
// type remains the same.
@@ -956,25 +1046,26 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
MVT LegalVT = LT.second;
if (LegalVT.isVector() &&
LegalVT.getVectorElementType().getSizeInBits() ==
- Tp->getVectorElementType()->getPrimitiveSizeInBits() &&
- LegalVT.getVectorNumElements() < Tp->getVectorNumElements()) {
+ BaseTp->getElementType()->getPrimitiveSizeInBits() &&
+ LegalVT.getVectorNumElements() <
+ cast<FixedVectorType>(BaseTp)->getNumElements()) {
- unsigned VecTySize = DL.getTypeStoreSize(Tp);
+ unsigned VecTySize = DL.getTypeStoreSize(BaseTp);
unsigned LegalVTSize = LegalVT.getStoreSize();
// Number of source vectors after legalization:
unsigned NumOfSrcs = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Number of destination vectors after legalization:
unsigned NumOfDests = LT.first;
- Type *SingleOpTy = VectorType::get(Tp->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleOpTy = FixedVectorType::get(BaseTp->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;
return NumOfShuffles *
getShuffleCost(TTI::SK_PermuteTwoSrc, SingleOpTy, 0, nullptr);
}
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
}
// For 2-input shuffles, we must account for splitting the 2 inputs into many.
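A worked instance of the split-cost formula above, under assumed numbers (a 512-bit v16i32 source shuffled on a target whose widest legal vector is 256 bits, so the type legalizer splits it and LT.first is taken to be 2):

  // Hypothetical values; only the formula is taken from the code above.
  constexpr unsigned VecTySize   = 64;  // store size of v16i32 in bytes
  constexpr unsigned LegalVTSize = 32;  // store size of the legal v8i32 in bytes
  constexpr unsigned NumOfSrcs   = (VecTySize + LegalVTSize - 1) / LegalVTSize; // 2
  constexpr unsigned NumOfDests  = 2;   // LT.first after legalization (assumed)
  constexpr unsigned NumOfShuffles = (NumOfSrcs - 1) * NumOfDests;              // 2
  // Total cost = NumOfShuffles * getShuffleCost(SK_PermuteTwoSrc, v8i32, ...).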
@@ -992,9 +1083,9 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 1}, // vpermb
{TTI::SK_PermuteSingleSrc, MVT::v32i8, 1}, // vpermb
- {TTI::SK_PermuteTwoSrc, MVT::v64i8, 1}, // vpermt2b
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, 1}, // vpermt2b
- {TTI::SK_PermuteTwoSrc, MVT::v16i8, 1} // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v32i8, 2}, // vpermt2b
+ {TTI::SK_PermuteTwoSrc, MVT::v16i8, 2} // vpermt2b
};
if (ST->hasVBMI())
@@ -1006,22 +1097,18 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
{TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
- {TTI::SK_Reverse, MVT::v32i16, 1}, // vpermw
- {TTI::SK_Reverse, MVT::v16i16, 1}, // vpermw
+ {TTI::SK_Reverse, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_Reverse, MVT::v16i16, 2}, // vpermw
{TTI::SK_Reverse, MVT::v64i8, 2}, // pshufb + vshufi64x2
- {TTI::SK_PermuteSingleSrc, MVT::v32i16, 1}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v16i16, 1}, // vpermw
- {TTI::SK_PermuteSingleSrc, MVT::v8i16, 1}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 2}, // vpermw
+ {TTI::SK_PermuteSingleSrc, MVT::v16i16, 2}, // vpermw
{TTI::SK_PermuteSingleSrc, MVT::v64i8, 8}, // extend to v32i16
- {TTI::SK_PermuteSingleSrc, MVT::v32i8, 3}, // vpermw + zext/trunc
- {TTI::SK_PermuteTwoSrc, MVT::v32i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v16i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v8i16, 1}, // vpermt2w
- {TTI::SK_PermuteTwoSrc, MVT::v32i8, 3}, // zext + vpermt2w + trunc
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v16i16, 2}, // vpermt2w
+ {TTI::SK_PermuteTwoSrc, MVT::v8i16, 2}, // vpermt2w
{TTI::SK_PermuteTwoSrc, MVT::v64i8, 19}, // 6 * v32i8 + 1
- {TTI::SK_PermuteTwoSrc, MVT::v16i8, 3} // zext + vpermt2w + trunc
};
if (ST->hasBWI())
@@ -1034,6 +1121,8 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_Broadcast, MVT::v16f32, 1}, // vbroadcastps
{TTI::SK_Broadcast, MVT::v8i64, 1}, // vpbroadcastq
{TTI::SK_Broadcast, MVT::v16i32, 1}, // vpbroadcastd
+ {TTI::SK_Broadcast, MVT::v32i16, 1}, // vpbroadcastw
+ {TTI::SK_Broadcast, MVT::v64i8, 1}, // vpbroadcastb
{TTI::SK_Reverse, MVT::v8f64, 1}, // vpermpd
{TTI::SK_Reverse, MVT::v16f32, 1}, // vpermps
@@ -1065,7 +1154,14 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
{TTI::SK_PermuteTwoSrc, MVT::v2f64, 1}, // vpermt2pd
{TTI::SK_PermuteTwoSrc, MVT::v4f32, 1}, // vpermt2ps
{TTI::SK_PermuteTwoSrc, MVT::v2i64, 1}, // vpermt2q
- {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1} // vpermt2d
+ {TTI::SK_PermuteTwoSrc, MVT::v4i32, 1}, // vpermt2d
+
+ // FIXME: This just applies the type legalization cost rules above
+ // assuming these completely split.
+ {TTI::SK_PermuteSingleSrc, MVT::v32i16, 14},
+ {TTI::SK_PermuteSingleSrc, MVT::v64i8, 14},
+ {TTI::SK_PermuteTwoSrc, MVT::v32i16, 42},
+ {TTI::SK_PermuteTwoSrc, MVT::v64i8, 42},
};
if (ST->hasAVX512())
@@ -1267,14 +1363,22 @@ int X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index,
if (const auto *Entry = CostTableLookup(SSE1ShuffleTbl, Kind, LT.second))
return LT.first * Entry->Cost;
- return BaseT::getShuffleCost(Kind, Tp, Index, SubTp);
+ return BaseT::getShuffleCost(Kind, BaseTp, Index, SubTp);
}
int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
int ISD = TLI->InstructionOpcodeToISD(Opcode);
assert(ISD && "Invalid opcode");
+ // TODO: Allow non-throughput costs that aren't binary.
+ auto AdjustCost = [&CostKind](int Cost) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Cost == 0 ? 0 : 1;
+ return Cost;
+ };
+
// FIXME: Need a better design of the cost table to handle non-simple types of
// potential massive combinations (elem_num x src_type x dst_type).
@@ -1283,6 +1387,11 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 1 },
// Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
@@ -1290,42 +1399,45 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v64i8, MVT::v64i1, 1 },
- // Mask zero extend is a load + broadcast.
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i1, 2 },
{ ISD::ZERO_EXTEND, MVT::v64i8, MVT::v64i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // widen to zmm
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i16, 2 },
+ { ISD::TRUNCATE, MVT::v64i1, MVT::v64i8, 2 },
};
static const TypeConversionCostTblEntry AVX512DQConversionTbl[] = {
- { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
- { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
- { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
- { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 1 },
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_SINT, MVT::v8i64, MVT::v8f64, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
- { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i64, MVT::v8f64, 1 },
};
@@ -1337,14 +1449,70 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v16f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 1 },
- { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 1 },
- { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 1 },
- { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // zmm vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // zmm vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v16i16, MVT::v16i32, 2 },
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 2 },
{ ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 1 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // zmm vpmovqd
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 5 },// 2*vpmovqd+concat+vpmovdb
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 }, // extend to v16i32
+ { ISD::TRUNCATE, MVT::v32i8, MVT::v32i16, 8 },
+
+ // Sign extend is zmm vpternlogd+vptruncdb.
+ // Zero extend is zmm broadcast load+vptruncdw.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 4 },
+
+ // Sign extend is zmm vpternlogd+vptruncdw.
+ // Zero extend is zmm vpternlogd+vptruncdw+vpsrlw.
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 3 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // zmm vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // zmm vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // zmm vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // zmm vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // zmm vpternlogq+psrlq
+
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i1, 2 }, // vpternlogq+psrlq
- // v16i1 -> v16i32 - load + broadcast
- { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
- { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i1, 2 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 1 },
@@ -1356,6 +1524,9 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i64, MVT::v8i32, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+ { ISD::ZERO_EXTEND, MVT::v32i16, MVT::v32i8, 3 }, // FIXME: May not be right
+
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::SINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
{ ISD::SINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
@@ -1367,44 +1538,163 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i1, 4 },
{ ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i1, 3 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
+ { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
+ { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i8, MVT::v16f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v16i16, MVT::v16f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 3 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512BWVLConversionTbl[] {
+ // Mask sign extend has an instruction.
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::SIGN_EXTEND, MVT::v32i8, MVT::v32i1, 1 },
+
+ // Mask zero extend is a sext + shift.
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 2 },
+ { ISD::ZERO_EXTEND, MVT::v32i8, MVT::v32i1, 2 },
+
+ { ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 2 },
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 2 }, // vpsllw+vptestmb
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 2 }, // vpsllw+vptestmw
+ { ISD::TRUNCATE, MVT::v32i1, MVT::v32i8, 2 }, // vpsllw+vptestmb
+ };
+
+ static const TypeConversionCostTblEntry AVX512DQVLConversionTbl[] = {
+ { ISD::SINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i64, 1 },
+ { ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_SINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_SINT, MVT::v4i64, MVT::v4f64, 1 },
+
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i64, MVT::v2f64, 1 },
+ { ISD::FP_TO_UINT, MVT::v4i64, MVT::v4f64, 1 },
+ };
+
+ static const TypeConversionCostTblEntry AVX512VLConversionTbl[] = {
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 3 }, // sext+vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i8, 8 }, // split+2*v8i8
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i16, 3 }, // sext+vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 8 }, // split+2*v8i16
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 }, // vpslld+vptestmd
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 2 }, // vpsllq+vptestmq
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 1 }, // vpmovqd
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw+vpacksswb
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw+vpackuswb
+ { ISD::SIGN_EXTEND, MVT::v2i8, MVT::v2i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v2i8, MVT::v2i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v4i8, MVT::v4i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v4i8, MVT::v4i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v8i8, MVT::v8i1, 5 },
+ { ISD::ZERO_EXTEND, MVT::v8i8, MVT::v8i1, 6 },
+ { ISD::SIGN_EXTEND, MVT::v16i8, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i8, MVT::v16i1, 12 },
+
+ // sign extend is vpcmpeq+maskedmove+vpmovdw
+ // zero extend is vpcmpeq+maskedmove+vpmovdw+vpsrlw
+ { ISD::SIGN_EXTEND, MVT::v2i16, MVT::v2i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v2i16, MVT::v2i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v8i16, MVT::v8i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v8i16, MVT::v8i1, 5 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 10 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 12 },
+
+ { ISD::SIGN_EXTEND, MVT::v2i32, MVT::v2i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v2i32, MVT::v2i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v4i32, MVT::v4i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v4i32, MVT::v4i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i1, 1 }, // vpternlogd
+ { ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i1, 2 }, // vpternlogd+psrld
+ { ISD::SIGN_EXTEND, MVT::v2i64, MVT::v2i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v2i64, MVT::v2i1, 2 }, // vpternlogq+psrlq
+ { ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i1, 1 }, // vpternlogq
+ { ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i1, 2 }, // vpternlogq+psrlq
+
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i8, 2 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i8, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i8, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i8, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i16, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i16, 2 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i16, 2 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i16, 2 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i32, 2 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i32, 1 },
{ ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i32, 1 },
- { ISD::UINT_TO_FP, MVT::v16f32, MVT::v16i32, 1 },
{ ISD::UINT_TO_FP, MVT::v2f32, MVT::v2i64, 5 },
- { ISD::UINT_TO_FP, MVT::v8f32, MVT::v8i64, 26 },
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 5 },
{ ISD::UINT_TO_FP, MVT::v4f64, MVT::v4i64, 5 },
- { ISD::UINT_TO_FP, MVT::v8f64, MVT::v8i64, 5 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 1 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 1 },
+
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 3 },
+
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 1 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 1 },
{ ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f32, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f32, 1 },
+ { ISD::FP_TO_UINT, MVT::v2i32, MVT::v2f64, 1 },
{ ISD::FP_TO_UINT, MVT::v4i32, MVT::v4f64, 1 },
{ ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f32, 1 },
- { ISD::FP_TO_UINT, MVT::v8i32, MVT::v8f64, 1 },
- { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f64, 2 },
- { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f64, 2 },
- { ISD::FP_TO_UINT, MVT::v16i32, MVT::v16f32, 1 },
- { ISD::FP_TO_UINT, MVT::v16i16, MVT::v16f32, 2 },
- { ISD::FP_TO_UINT, MVT::v16i8, MVT::v16f32, 2 },
};
static const TypeConversionCostTblEntry AVX2ConversionTbl[] = {
@@ -1416,6 +1706,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 1 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 1 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 1 },
@@ -1424,13 +1716,16 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i16, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 1 },
+ { ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+ { ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 3 },
+
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 2 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 2 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 2 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 4 },
{ ISD::FP_EXTEND, MVT::v8f64, MVT::v8f32, 3 },
{ ISD::FP_ROUND, MVT::v8f32, MVT::v8f64, 3 },
@@ -1447,6 +1742,8 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v8i32, MVT::v8i8, 4 },
+ { ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
+ { ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i1, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::ZERO_EXTEND, MVT::v16i16, MVT::v16i8, 4 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i16, 4 },
@@ -1456,15 +1753,21 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 4 },
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i32, 5 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i16, 4 },
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v16i1, MVT::v16i64, 11 },
+
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 4 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i32, 4 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i32, 5 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i64, 4 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i64, 4 },
- { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 4 },
+ { ISD::TRUNCATE, MVT::v4i32, MVT::v4i64, 2 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i64, 11 },
{ ISD::TRUNCATE, MVT::v8i16, MVT::v8i64, 9 },
- { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 9 },
+ { ISD::TRUNCATE, MVT::v8i32, MVT::v8i64, 3 },
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i64, 11 },
{ ISD::SINT_TO_FP, MVT::v4f32, MVT::v4i1, 3 },
@@ -1503,8 +1806,15 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
{ ISD::SINT_TO_FP, MVT::v4f64, MVT::v4i64, 13 },
- { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 1 },
- { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 7 },
+ { ISD::FP_TO_SINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v8i16, MVT::v8f32, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v8i8, MVT::v8f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v8i16, MVT::v8f32, 3 },
// This node is expanded into scalarized operations but BasicTTI is overly
// optimistic estimating its cost. It computes 3 per element (one
// vector-extract, one scalar conversion and one vector-insert). The
@@ -1544,7 +1854,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
{ ISD::SIGN_EXTEND, MVT::v16i32, MVT::v16i16, 4 },
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 },
+ // These truncates end up widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 1 }, // PMOVXZBQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 1 }, // PMOVXZWQ
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 1 }, // PMOVXZBD
+
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 1 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 1 },
{ ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 1 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 1 },
@@ -1555,6 +1871,13 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 4 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 4 },
+
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 3 },
+
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 3 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 2 },
};
static const TypeConversionCostTblEntry SSE2ConversionTbl[] = {
@@ -1580,16 +1903,26 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::UINT_TO_FP, MVT::v2f64, MVT::v2i64, 6 },
{ ISD::UINT_TO_FP, MVT::v4f32, MVT::v2i64, 15 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_SINT, MVT::v4i8, MVT::v4f32, 3 },
{ ISD::FP_TO_SINT, MVT::v4i16, MVT::v4f32, 2 },
{ ISD::FP_TO_SINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_SINT, MVT::v2i8, MVT::v2f64, 4 },
- { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 3 },
+ { ISD::FP_TO_SINT, MVT::v2i32, MVT::v2f64, 1 },
{ ISD::UINT_TO_FP, MVT::f32, MVT::i64, 6 },
{ ISD::UINT_TO_FP, MVT::f64, MVT::i64, 6 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f32, 4 },
{ ISD::FP_TO_UINT, MVT::i64, MVT::f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f32, 4 },
+ { ISD::FP_TO_UINT, MVT::v2i8, MVT::v2f64, 4 },
+ { ISD::FP_TO_UINT, MVT::v4i8, MVT::v4f32, 3 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f32, 2 },
+ { ISD::FP_TO_UINT, MVT::v2i16, MVT::v2f64, 2 },
+ { ISD::FP_TO_UINT, MVT::v4i16, MVT::v4f32, 4 },
{ ISD::ZERO_EXTEND, MVT::v4i16, MVT::v4i8, 1 },
{ ISD::SIGN_EXTEND, MVT::v4i16, MVT::v4i8, 6 },
@@ -1616,11 +1949,19 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
{ ISD::ZERO_EXTEND, MVT::v4i64, MVT::v4i32, 3 },
{ ISD::SIGN_EXTEND, MVT::v4i64, MVT::v4i32, 5 },
+ // These truncates are really widening elements.
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i32, 1 }, // PSHUFD
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i16, 2 }, // PUNPCKLWD+DQ
+ { ISD::TRUNCATE, MVT::v2i1, MVT::v2i8, 3 }, // PUNPCKLBW+WD+PSHUFD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i16, 1 }, // PUNPCKLWD
+ { ISD::TRUNCATE, MVT::v4i1, MVT::v4i8, 2 }, // PUNPCKLBW+WD
+ { ISD::TRUNCATE, MVT::v8i1, MVT::v8i8, 1 }, // PUNPCKLBW
+
{ ISD::TRUNCATE, MVT::v2i8, MVT::v2i16, 2 }, // PAND+PACKUSWB
- { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 4 },
- { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 },
+ { ISD::TRUNCATE, MVT::v4i8, MVT::v4i16, 2 }, // PAND+PACKUSWB
+ { ISD::TRUNCATE, MVT::v8i8, MVT::v8i16, 2 }, // PAND+PACKUSWB
{ ISD::TRUNCATE, MVT::v16i8, MVT::v16i16, 3 },
- { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+3*PACKUSWB
+ { ISD::TRUNCATE, MVT::v2i8, MVT::v2i32, 3 }, // PAND+2*PACKUSWB
{ ISD::TRUNCATE, MVT::v2i16, MVT::v2i32, 1 },
{ ISD::TRUNCATE, MVT::v4i8, MVT::v4i32, 3 },
{ ISD::TRUNCATE, MVT::v4i16, MVT::v4i32, 3 },
@@ -1639,7 +1980,7 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
if (ST->hasSSE2() && !ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
LTDest.second, LTSrc.second))
- return LTSrc.first * Entry->Cost;
+ return AdjustCost(LTSrc.first * Entry->Cost);
}
EVT SrcTy = TLI->getValueType(DL, Src);
@@ -1647,61 +1988,77 @@ int X86TTIImpl::getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
// The function getSimpleVT only handles simple value types.
if (!SrcTy.isSimple() || !DstTy.isSimple())
- return BaseT::getCastInstrCost(Opcode, Dst, Src);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind));
MVT SimpleSrcTy = SrcTy.getSimpleVT();
MVT SimpleDstTy = DstTy.getSimpleVT();
- // Make sure that neither type is going to be split before using the
- // AVX512 tables. This handles -mprefer-vector-width=256
- // with -min-legal-vector-width<=256
- if (TLI->getTypeAction(SimpleSrcTy) != TargetLowering::TypeSplitVector &&
- TLI->getTypeAction(SimpleDstTy) != TargetLowering::TypeSplitVector) {
+ if (ST->useAVX512Regs()) {
if (ST->hasBWI())
if (const auto *Entry = ConvertCostTableLookup(AVX512BWConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
if (ST->hasDQI())
if (const auto *Entry = ConvertCostTableLookup(AVX512DQConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
if (ST->hasAVX512())
if (const auto *Entry = ConvertCostTableLookup(AVX512FConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
+ if (ST->hasBWI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512BWVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasDQI())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512DQVLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
+ if (ST->hasAVX512())
+ if (const auto *Entry = ConvertCostTableLookup(AVX512VLConversionTbl, ISD,
+ SimpleDstTy, SimpleSrcTy))
+ return AdjustCost(Entry->Cost);
+
if (ST->hasAVX2()) {
if (const auto *Entry = ConvertCostTableLookup(AVX2ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasAVX()) {
if (const auto *Entry = ConvertCostTableLookup(AVXConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasSSE41()) {
if (const auto *Entry = ConvertCostTableLookup(SSE41ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
if (ST->hasSSE2()) {
if (const auto *Entry = ConvertCostTableLookup(SSE2ConversionTbl, ISD,
SimpleDstTy, SimpleSrcTy))
- return Entry->Cost;
+ return AdjustCost(Entry->Cost);
}
- return BaseT::getCastInstrCost(Opcode, Dst, Src, I);
+ return AdjustCost(BaseT::getCastInstrCost(Opcode, Dst, Src, CostKind, I));
}
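As an aside on the pattern the conversion hunks above follow: each feature level owns its own static table, the most specific table is consulted first (BWI, then DQI, then AVX512, down to SSE2), and every hit is wrapped in the AdjustCost helper introduced earlier in the patch (not shown in this excerpt) so non-throughput cost kinds can be handled uniformly. The sketch below is a standalone illustration of that cascade with made-up enum values and an arbitrary pair of costs; it is not LLVM's CostTblEntry/ConvertCostTableLookup API.

#include <cstdio>
#include <optional>

// Simplified stand-ins for CostTblEntry-style rows; not LLVM's real types.
enum class Op { FpToSint, FpToUint };
enum class Conv { v8i8_from_v8f32, v4i8_from_v4f64 };
struct ConvEntry { Op Opcode; Conv Types; int Cost; };

// First match wins, so more specific feature tables must be consulted first.
static std::optional<int> lookup(const ConvEntry *Tbl, unsigned N, Op O,
                                 Conv C) {
  for (unsigned I = 0; I != N; ++I)
    if (Tbl[I].Opcode == O && Tbl[I].Types == C)
      return Tbl[I].Cost;
  return std::nullopt;
}

int main() {
  // Costs here are arbitrary illustrative values, not rows from the patch.
  static const ConvEntry Avx512Tbl[] = {{Op::FpToSint, Conv::v8i8_from_v8f32, 4}};
  static const ConvEntry Sse2Tbl[]   = {{Op::FpToSint, Conv::v8i8_from_v8f32, 7}};
  bool HasAvx512 = true;

  // Feature-ordered cascade, like the hasBWI/hasDQI/hasAVX512/.../hasSSE2
  // chain above: take the first table that knows this conversion.
  std::optional<int> Cost;
  if (HasAvx512)
    Cost = lookup(Avx512Tbl, 1, Op::FpToSint, Conv::v8i8_from_v8f32);
  if (!Cost)
    Cost = lookup(Sse2Tbl, 1, Op::FpToSint, Conv::v8i8_from_v8f32);
  std::printf("cost = %d\n", Cost.value_or(-1));
}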
int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
@@ -1774,6 +2131,12 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
{ ISD::SELECT, MVT::v16i32, 1 },
{ ISD::SELECT, MVT::v8f64, 1 },
{ ISD::SELECT, MVT::v16f32, 1 },
+
+ { ISD::SETCC, MVT::v32i16, 2 }, // FIXME: should probably be 4
+ { ISD::SETCC, MVT::v64i8, 2 }, // FIXME: should probably be 4
+
+ { ISD::SELECT, MVT::v32i16, 2 }, // FIXME: should be 3
+ { ISD::SELECT, MVT::v64i8, 2 }, // FIXME: should be 3
};
static const CostTblEntry AVX2CostTbl[] = {
@@ -1878,14 +2241,14 @@ int X86TTIImpl::getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * (ExtraCost + Entry->Cost);
- return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, I);
+ return BaseT::getCmpSelInstrCost(Opcode, ValTy, CondTy, CostKind, I);
}
unsigned X86TTIImpl::getAtomicMemIntrinsicMaxElementSize() const { return 16; }
-int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed) {
+int X86TTIImpl::getTypeBasedIntrinsicInstrCost(
+ const IntrinsicCostAttributes &ICA, TTI::TargetCostKind CostKind) {
+
// Costs should match the codegen from:
// BITREVERSE: llvm\test\CodeGen\X86\vector-bitreverse.ll
// BSWAP: llvm\test\CodeGen\X86\bswap-vector.ll
@@ -1935,12 +2298,20 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::BITREVERSE, MVT::v8i64, 36 },
{ ISD::BITREVERSE, MVT::v16i32, 24 },
+ { ISD::BITREVERSE, MVT::v32i16, 10 },
+ { ISD::BITREVERSE, MVT::v64i8, 10 },
{ ISD::CTLZ, MVT::v8i64, 29 },
{ ISD::CTLZ, MVT::v16i32, 35 },
+ { ISD::CTLZ, MVT::v32i16, 28 },
+ { ISD::CTLZ, MVT::v64i8, 18 },
{ ISD::CTPOP, MVT::v8i64, 16 },
{ ISD::CTPOP, MVT::v16i32, 24 },
+ { ISD::CTPOP, MVT::v32i16, 18 },
+ { ISD::CTPOP, MVT::v64i8, 12 },
{ ISD::CTTZ, MVT::v8i64, 20 },
{ ISD::CTTZ, MVT::v16i32, 28 },
+ { ISD::CTTZ, MVT::v32i16, 24 },
+ { ISD::CTTZ, MVT::v64i8, 18 },
{ ISD::USUBSAT, MVT::v16i32, 2 }, // pmaxud + psubd
{ ISD::USUBSAT, MVT::v2i64, 2 }, // pmaxuq + psubq
{ ISD::USUBSAT, MVT::v4i64, 2 }, // pmaxuq + psubq
@@ -1949,6 +2320,22 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDSAT, MVT::v2i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v4i64, 3 }, // not + pminuq + paddq
{ ISD::UADDSAT, MVT::v8i64, 3 }, // not + pminuq + paddq
+ { ISD::SADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::SSUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::UADDSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v32i16, 2 }, // FIXME: include split
+ { ISD::USUBSAT, MVT::v64i8, 2 }, // FIXME: include split
+ { ISD::FMAXNUM, MVT::f32, 2 },
+ { ISD::FMAXNUM, MVT::v4f32, 2 },
+ { ISD::FMAXNUM, MVT::v8f32, 2 },
+ { ISD::FMAXNUM, MVT::v16f32, 2 },
+ { ISD::FMAXNUM, MVT::f64, 2 },
+ { ISD::FMAXNUM, MVT::v2f64, 2 },
+ { ISD::FMAXNUM, MVT::v4f64, 2 },
+ { ISD::FMAXNUM, MVT::v8f64, 2 },
};
static const CostTblEntry XOPCostTbl[] = {
{ ISD::BITREVERSE, MVT::v4i64, 4 },
@@ -2031,6 +2418,12 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::USUBSAT, MVT::v16i16, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v32i8, 4 }, // 2 x 128-bit Op + extract/insert
{ ISD::USUBSAT, MVT::v8i32, 6 }, // 2 x 128-bit Op + extract/insert
+ { ISD::FMAXNUM, MVT::f32, 3 },
+ { ISD::FMAXNUM, MVT::v4f32, 3 },
+ { ISD::FMAXNUM, MVT::v8f32, 5 },
+ { ISD::FMAXNUM, MVT::f64, 3 },
+ { ISD::FMAXNUM, MVT::v2f64, 3 },
+ { ISD::FMAXNUM, MVT::v4f64, 5 },
{ ISD::FSQRT, MVT::f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 14 }, // SNB from http://www.agner.org/
{ ISD::FSQRT, MVT::v8f32, 28 }, // SNB from http://www.agner.org/
@@ -2105,13 +2498,25 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDSAT, MVT::v16i8, 1 },
{ ISD::USUBSAT, MVT::v8i16, 1 },
{ ISD::USUBSAT, MVT::v16i8, 1 },
+ { ISD::FMAXNUM, MVT::f64, 4 },
+ { ISD::FMAXNUM, MVT::v2f64, 4 },
{ ISD::FSQRT, MVT::f64, 32 }, // Nehalem from http://www.agner.org/
{ ISD::FSQRT, MVT::v2f64, 32 }, // Nehalem from http://www.agner.org/
};
static const CostTblEntry SSE1CostTbl[] = {
+ { ISD::FMAXNUM, MVT::f32, 4 },
+ { ISD::FMAXNUM, MVT::v4f32, 4 },
{ ISD::FSQRT, MVT::f32, 28 }, // Pentium III from http://www.agner.org/
{ ISD::FSQRT, MVT::v4f32, 56 }, // Pentium III from http://www.agner.org/
};
+ static const CostTblEntry BMI64CostTbl[] = { // 64-bit targets
+ { ISD::CTTZ, MVT::i64, 1 },
+ };
+ static const CostTblEntry BMI32CostTbl[] = { // 32 or 64-bit targets
+ { ISD::CTTZ, MVT::i32, 1 },
+ { ISD::CTTZ, MVT::i16, 1 },
+ { ISD::CTTZ, MVT::i8, 1 },
+ };
static const CostTblEntry LZCNT64CostTbl[] = { // 64-bit targets
{ ISD::CTLZ, MVT::i64, 1 },
};
@@ -2131,6 +2536,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
static const CostTblEntry X64CostTbl[] = { // 64-bit targets
{ ISD::BITREVERSE, MVT::i64, 14 },
{ ISD::CTLZ, MVT::i64, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i64, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i64, 10 },
{ ISD::SADDO, MVT::i64, 1 },
{ ISD::UADDO, MVT::i64, 1 },
@@ -2142,6 +2548,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::CTLZ, MVT::i32, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i16, 4 }, // BSR+XOR or BSR+XOR+CMOV
{ ISD::CTLZ, MVT::i8, 4 }, // BSR+XOR or BSR+XOR+CMOV
+ { ISD::CTTZ, MVT::i32, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i16, 3 }, // TEST+BSF+CMOV/BRANCH
+ { ISD::CTTZ, MVT::i8, 3 }, // TEST+BSF+CMOV/BRANCH
{ ISD::CTPOP, MVT::i32, 8 },
{ ISD::CTPOP, MVT::i16, 9 },
{ ISD::CTPOP, MVT::i8, 7 },
@@ -2153,7 +2562,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::UADDO, MVT::i8, 1 },
};
+ Type *RetTy = ICA.getReturnType();
Type *OpTy = RetTy;
+ Intrinsic::ID IID = ICA.getID();
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -2173,6 +2584,11 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
case Intrinsic::cttz:
ISD = ISD::CTTZ;
break;
+ case Intrinsic::maxnum:
+ case Intrinsic::minnum:
+ // FMINNUM has the same costs, so don't duplicate.
+ ISD = ISD::FMAXNUM;
+ break;
case Intrinsic::sadd_sat:
ISD = ISD::SADDSAT;
break;
@@ -2256,6 +2672,15 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
return LT.first * Entry->Cost;
+ if (ST->hasBMI()) {
+ if (ST->is64Bit())
+ if (const auto *Entry = CostTableLookup(BMI64CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (const auto *Entry = CostTableLookup(BMI32CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+ }
+
if (ST->hasLZCNT()) {
if (ST->is64Bit())
if (const auto *Entry = CostTableLookup(LZCNT64CostTbl, ISD, MTy))
@@ -2284,12 +2709,17 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * Entry->Cost;
}
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Tys, FMF, ScalarizationCostPassed);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
-int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF) {
+int X86TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
+
+ if (ICA.isTypeBasedOnly())
+ return getTypeBasedIntrinsicInstrCost(ICA, CostKind);
+
static const CostTblEntry AVX512CostTbl[] = {
{ ISD::ROTL, MVT::v8i64, 1 },
{ ISD::ROTL, MVT::v4i64, 1 },
@@ -2340,6 +2770,9 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
{ ISD::FSHL, MVT::i8, 4 }
};
+ Intrinsic::ID IID = ICA.getID();
+ Type *RetTy = ICA.getReturnType();
+ const SmallVectorImpl<const Value *> &Args = ICA.getArgs();
unsigned ISD = ISD::DELETED_NODE;
switch (IID) {
default:
@@ -2379,7 +2812,7 @@ int X86TTIImpl::getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
return LT.first * Entry->Cost;
}
- return BaseT::getIntrinsicInstrCost(IID, RetTy, Args, FMF, VF);
+ return BaseT::getIntrinsicInstrCost(ICA, CostKind);
}
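The value-based entry point above now does little more than dispatch: anything other than reciprocal throughput is deferred to the base implementation, attribute sets with no argument values go to the type-based path, and only then are the ROTL/FSHL-style tables consulted. A hypothetical stand-in for that dispatch, which collapses the table lookup and its own fallback into a single TableCost parameter:

#include <cstdio>

enum class CostKind { RecipThroughput, Latency, CodeSize, SizeAndLatency };

// Mirrors the two early-outs added above; the concrete costs are parameters
// only so the control flow stays visible.
int intrinsicCost(CostKind Kind, bool TypeBasedOnly, int TableCost,
                  int TypeBasedCost, int BaseCost) {
  if (Kind != CostKind::RecipThroughput)
    return BaseCost;      // defer non-throughput kinds to the base implementation
  if (TypeBasedOnly)
    return TypeBasedCost; // no argument values: use the type-only path
  return TableCost;       // otherwise consult the value-based tables
}

int main() {
  std::printf("%d\n", intrinsicCost(CostKind::RecipThroughput,
                                    /*TypeBasedOnly=*/false, 1, 2, 3));
}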
int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
@@ -2391,10 +2824,11 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
};
assert(Val->isVectorTy() && "This must be a vector type");
-
Type *ScalarType = Val->getScalarType();
+ int RegisterFileMoveCost = 0;
- if (Index != -1U) {
+ if (Index != -1U && (Opcode == Instruction::ExtractElement ||
+ Opcode == Instruction::InsertElement)) {
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Val);
@@ -2403,17 +2837,32 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
return 0;
// The type may be split. Normalize the index to the new type.
- unsigned Width = LT.second.getVectorNumElements();
- Index = Index % Width;
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned SubNumElts = NumElts;
+ Index = Index % NumElts;
+
+ // For >128-bit vectors, we need to extract higher 128-bit subvectors.
+ // For inserts, we also need to insert the subvector back.
+ if (LT.second.getSizeInBits() > 128) {
+ assert((LT.second.getSizeInBits() % 128) == 0 && "Illegal vector");
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ SubNumElts = NumElts / NumSubVecs;
+ if (SubNumElts <= Index) {
+ RegisterFileMoveCost += (Opcode == Instruction::InsertElement ? 2 : 1);
+ Index %= SubNumElts;
+ }
+ }
if (Index == 0) {
// Floating point scalars are already located in index #0.
+ // Many insertions to #0 can fold away for scalar fp-ops, so let's assume
+ // that holds for all of them.
if (ScalarType->isFloatingPointTy())
- return 0;
+ return RegisterFileMoveCost;
- // Assume movd/movq XMM <-> GPR is relatively cheap on all targets.
- if (ScalarType->isIntegerTy())
- return 1;
+ // Assume movd/movq XMM -> GPR is relatively cheap on all targets.
+ if (ScalarType->isIntegerTy() && Opcode == Instruction::ExtractElement)
+ return 1 + RegisterFileMoveCost;
}
int ISD = TLI->InstructionOpcodeToISD(Opcode);
@@ -2421,24 +2870,124 @@ int X86TTIImpl::getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index) {
MVT MScalarTy = LT.second.getScalarType();
if (ST->isSLM())
if (auto *Entry = CostTableLookup(SLMCostTbl, ISD, MScalarTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost + RegisterFileMoveCost;
+
+ // Assume pinsr/pextr XMM <-> GPR is relatively cheap on all targets.
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()))
+ return 1 + RegisterFileMoveCost;
+
+ // Assume insertps is relatively cheap on all targets.
+ if (MScalarTy == MVT::f32 && ST->hasSSE41() &&
+ Opcode == Instruction::InsertElement)
+ return 1 + RegisterFileMoveCost;
+
+ // For extractions we just need to shuffle the element to index 0, which
+ // should be very cheap (assume cost = 1). For insertions we need to shuffle
+ // the elements to its destination. In both cases we must handle the
+ // subvector move(s).
+ // If the vector type is already less than 128-bits then don't reduce it.
+ // TODO: Under what circumstances should we shuffle using the full width?
+ int ShuffleCost = 1;
+ if (Opcode == Instruction::InsertElement) {
+ auto *SubTy = cast<VectorType>(Val);
+ EVT VT = TLI->getValueType(DL, Val);
+ if (VT.getScalarType() != MScalarTy || VT.getSizeInBits() >= 128)
+ SubTy = FixedVectorType::get(ScalarType, SubNumElts);
+ ShuffleCost = getShuffleCost(TTI::SK_PermuteTwoSrc, SubTy, 0, SubTy);
+ }
+ int IntOrFpCost = ScalarType->isFloatingPointTy() ? 0 : 1;
+ return ShuffleCost + IntOrFpCost + RegisterFileMoveCost;
}
// Add to the base cost if we know that the extracted element of a vector is
// destined to be moved to and used in the integer register file.
- int RegisterFileMoveCost = 0;
if (Opcode == Instruction::ExtractElement && ScalarType->isPointerTy())
- RegisterFileMoveCost = 1;
+ RegisterFileMoveCost += 1;
return BaseT::getVectorInstrCost(Opcode, Val, Index) + RegisterFileMoveCost;
}
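For the >128-bit insert/extract handling above, the element index is first folded into a 128-bit lane, and the lane number decides how many register-file moves to charge on top of the shuffle cost. A minimal standalone sketch, assuming a legal power-of-two vector and the flat move costs used above (one per extract, two per insert):

#include <cassert>
#include <cstdio>

struct ElemCost { unsigned SubIndex; unsigned IndexInSub; int RegFileMoves; };

ElemCost normalizeIndex(unsigned VecBits, unsigned NumElts, unsigned Index,
                        bool IsInsert) {
  assert(VecBits % 128 == 0 && Index < NumElts && "expect a legal wide vector");
  unsigned NumSubVecs = VecBits / 128;        // e.g. 256-bit v8i32 -> 2 lanes
  unsigned SubNumElts = NumElts / NumSubVecs; // elements per 128-bit lane
  ElemCost R{Index / SubNumElts, Index % SubNumElts, 0};
  // Upper lanes need the 128-bit subvector extracted first; inserts must also
  // put the subvector back, hence one extra move.
  if (R.SubIndex != 0)
    R.RegFileMoves = IsInsert ? 2 : 1;
  return R;
}

int main() {
  // Element 5 of a 256-bit v8i32 lives in lane 1 at local index 1; inserting
  // there is charged two register-file moves on top of the shuffle cost.
  ElemCost C = normalizeIndex(256, 8, 5, /*IsInsert=*/true);
  std::printf("lane=%u local=%u moves=%d\n", C.SubIndex, C.IndexInSub,
              C.RegFileMoves);
}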
+unsigned X86TTIImpl::getScalarizationOverhead(VectorType *Ty,
+ const APInt &DemandedElts,
+ bool Insert, bool Extract) {
+ unsigned Cost = 0;
+
+ // For insertions, an ISD::BUILD_VECTOR style vector initialization can be much
+ // cheaper than an accumulation of ISD::INSERT_VECTOR_ELT.
+ if (Insert) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
+ MVT MScalarTy = LT.second.getScalarType();
+
+ if ((MScalarTy == MVT::i16 && ST->hasSSE2()) ||
+ (MScalarTy.isInteger() && ST->hasSSE41()) ||
+ (MScalarTy == MVT::f32 && ST->hasSSE41())) {
+ // For types we can insert directly, insertion into 128-bit sub vectors is
+ // cheap, followed by a cheap chain of concatenations.
+ if (LT.second.getSizeInBits() <= 128) {
+ Cost +=
+ BaseT::getScalarizationOverhead(Ty, DemandedElts, Insert, false);
+ } else {
+ unsigned NumSubVecs = LT.second.getSizeInBits() / 128;
+ Cost += (PowerOf2Ceil(NumSubVecs) - 1) * LT.first;
+ Cost += DemandedElts.countPopulation();
+
+ // For vXf32 cases, insertion into the 0'th index in each v4f32
+ // 128-bit vector is free.
+ // NOTE: This assumes legalization widens vXf32 vectors.
+ if (MScalarTy == MVT::f32)
+ for (unsigned i = 0, e = cast<FixedVectorType>(Ty)->getNumElements();
+ i < e; i += 4)
+ if (DemandedElts[i])
+ Cost--;
+ }
+ } else if (LT.second.isVector()) {
+ // Without fast insertion, we need to use MOVD/MOVQ to pass each demanded
+ // integer element as a SCALAR_TO_VECTOR, then we build the vector as a
+ // series of UNPCK followed by CONCAT_VECTORS - all of these can be
+ // considered cheap.
+ if (Ty->isIntOrIntVectorTy())
+ Cost += DemandedElts.countPopulation();
+
+ // Get the smaller of the legalized or original pow2-extended number of
+ // vector elements, which represents the number of unpacks we'll end up
+ // performing.
+ unsigned NumElts = LT.second.getVectorNumElements();
+ unsigned Pow2Elts =
+ PowerOf2Ceil(cast<FixedVectorType>(Ty)->getNumElements());
+ Cost += (std::min<unsigned>(NumElts, Pow2Elts) - 1) * LT.first;
+ }
+ }
+
+ // TODO: Use default extraction for now, but we should investigate extending this
+ // to handle repeated subvector extraction.
+ if (Extract)
+ Cost += BaseT::getScalarizationOverhead(Ty, DemandedElts, false, Extract);
+
+ return Cost;
+}
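The wide-vector insertion branch above amounts to a small closed-form formula: a cheap chain of 128-bit concatenations plus one cheap insert per demanded element, with lane-0 f32 inserts refunded. A standalone arithmetic sketch of that formula (C++20 <bit>), assuming LT.first == 1, i.e. the legal type is not split further:

#include <bit>      // std::bit_ceil, std::popcount (C++20)
#include <cstdint>
#include <cstdio>

// Demanded elements are modelled as a plain 64-bit mask instead of an APInt.
unsigned buildVectorCost(unsigned VecBits, unsigned NumElts,
                         std::uint64_t DemandedElts, bool IsF32) {
  unsigned NumSubVecs = VecBits / 128;
  // Cheap chain of 128-bit concatenations (LT.first assumed to be 1)...
  unsigned Cost = std::bit_ceil(NumSubVecs) - 1;
  // ...plus one cheap insert per demanded element.
  Cost += static_cast<unsigned>(std::popcount(DemandedElts));
  // Insertion into element 0 of each 128-bit f32 lane is free.
  if (IsF32)
    for (unsigned I = 0; I < NumElts; I += 4)
      if (DemandedElts & (std::uint64_t{1} << I))
        --Cost;
  return Cost;
}

int main() {
  // All 8 lanes of a 256-bit v8f32: 1 concat + 8 inserts - 2 free lane-0 = 7.
  std::printf("%u\n", buildVectorCost(256, 8, 0xFF, /*IsF32=*/true));
}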
+
int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
MaybeAlign Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
const Instruction *I) {
+ // TODO: Handle other cost kinds.
+ if (CostKind != TTI::TCK_RecipThroughput) {
+ if (isa_and_nonnull<StoreInst>(I)) {
+ Value *Ptr = I->getOperand(1);
+ // Store instruction with index and scale costs 2 Uops.
+ // Check the preceding GEP to identify non-const indices.
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
+ if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
+ return TTI::TCC_Basic * 2;
+ }
+ }
+ return TTI::TCC_Basic;
+ }
+
// Handle non-power-of-two vectors such as <3 x float>
- if (VectorType *VTy = dyn_cast<VectorType>(Src)) {
- unsigned NumElem = VTy->getVectorNumElements();
+ if (auto *VTy = dyn_cast<FixedVectorType>(Src)) {
+ unsigned NumElem = VTy->getNumElements();
// Handle a few common cases:
// <3 x float>
@@ -2453,14 +3002,21 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
// Assume that all other non-power-of-two numbers are scalarized.
if (!isPowerOf2_32(NumElem)) {
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
int Cost = BaseT::getMemoryOpCost(Opcode, VTy->getScalarType(), Alignment,
- AddressSpace);
- int SplitCost = getScalarizationOverhead(Src, Opcode == Instruction::Load,
+ AddressSpace, CostKind);
+ int SplitCost = getScalarizationOverhead(VTy, DemandedElts,
+ Opcode == Instruction::Load,
Opcode == Instruction::Store);
return NumElem * Cost + SplitCost;
}
}
+ // Type legalization can't handle structs
+ if (TLI->getValueType(DL, Src, true) == MVT::Other)
+ return BaseT::getMemoryOpCost(Opcode, Src, Alignment, AddressSpace,
+ CostKind);
+
// Legalize the type.
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Src);
assert((Opcode == Instruction::Load || Opcode == Instruction::Store) &&
@@ -2478,33 +3034,36 @@ int X86TTIImpl::getMemoryOpCost(unsigned Opcode, Type *Src,
}
int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
- unsigned Alignment,
- unsigned AddressSpace) {
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind) {
bool IsLoad = (Instruction::Load == Opcode);
bool IsStore = (Instruction::Store == Opcode);
- VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy);
+ auto *SrcVTy = dyn_cast<FixedVectorType>(SrcTy);
if (!SrcVTy)
// For a scalar, take the regular cost without the mask
- return getMemoryOpCost(Opcode, SrcTy, MaybeAlign(Alignment), AddressSpace);
+ return getMemoryOpCost(Opcode, SrcTy, Alignment, AddressSpace, CostKind);
- unsigned NumElem = SrcVTy->getVectorNumElements();
- VectorType *MaskTy =
- VectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
- if ((IsLoad && !isLegalMaskedLoad(SrcVTy, MaybeAlign(Alignment))) ||
- (IsStore && !isLegalMaskedStore(SrcVTy, MaybeAlign(Alignment))) ||
+ unsigned NumElem = SrcVTy->getNumElements();
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt8Ty(SrcVTy->getContext()), NumElem);
+ if ((IsLoad && !isLegalMaskedLoad(SrcVTy, Alignment)) ||
+ (IsStore && !isLegalMaskedStore(SrcVTy, Alignment)) ||
!isPowerOf2_32(NumElem)) {
// Scalarization
- int MaskSplitCost = getScalarizationOverhead(MaskTy, false, true);
+ APInt DemandedElts = APInt::getAllOnesValue(NumElem);
+ int MaskSplitCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost = getCmpSelInstrCost(
- Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr);
- int BranchCost = getCFInstrCost(Instruction::Br);
+ Instruction::ICmp, Type::getInt8Ty(SrcVTy->getContext()), nullptr,
+ CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
int MaskCmpCost = NumElem * (BranchCost + ScalarCompareCost);
-
- int ValueSplitCost = getScalarizationOverhead(SrcVTy, IsLoad, IsStore);
+ int ValueSplitCost =
+ getScalarizationOverhead(SrcVTy, DemandedElts, IsLoad, IsStore);
int MemopCost =
NumElem * BaseT::getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ Alignment, AddressSpace, CostKind);
return MemopCost + ValueSplitCost + MaskSplitCost + MaskCmpCost;
}
@@ -2519,8 +3078,8 @@ int X86TTIImpl::getMaskedMemoryOpCost(unsigned Opcode, Type *SrcTy,
getShuffleCost(TTI::SK_PermuteTwoSrc, MaskTy, 0, nullptr);
else if (LT.second.getVectorNumElements() > NumElem) {
- VectorType *NewMaskTy = VectorType::get(MaskTy->getVectorElementType(),
- LT.second.getVectorNumElements());
+ auto *NewMaskTy = FixedVectorType::get(MaskTy->getElementType(),
+ LT.second.getVectorNumElements());
// Expanding requires filling the mask with zeroes
Cost += getShuffleCost(TTI::SK_InsertSubvector, NewMaskTy, 0, MaskTy);
}
@@ -2558,41 +3117,16 @@ int X86TTIImpl::getAddressComputationCost(Type *Ty, ScalarEvolution *SE,
return BaseT::getAddressComputationCost(Ty, SE, Ptr);
}
-int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
- bool IsPairwise) {
+int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, VectorType *ValTy,
+ bool IsPairwise,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise, CostKind);
+
// We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
// and use that as the cost.
- static const CostTblEntry SLMCostTblPairWise[] = {
- { ISD::FADD, MVT::v2f64, 3 },
- { ISD::ADD, MVT::v2i64, 5 },
- };
-
- static const CostTblEntry SSE2CostTblPairWise[] = {
- { ISD::FADD, MVT::v2f64, 2 },
- { ISD::FADD, MVT::v4f32, 4 },
- { ISD::ADD, MVT::v2i64, 2 }, // The data reported by the IACA tool is "1.6".
- { ISD::ADD, MVT::v2i32, 2 }, // FIXME: chosen to be less than v4i32.
- { ISD::ADD, MVT::v4i32, 3 }, // The data reported by the IACA tool is "3.5".
- { ISD::ADD, MVT::v2i16, 3 }, // FIXME: chosen to be less than v4i16
- { ISD::ADD, MVT::v4i16, 4 }, // FIXME: chosen to be less than v8i16
- { ISD::ADD, MVT::v8i16, 5 },
- { ISD::ADD, MVT::v2i8, 2 },
- { ISD::ADD, MVT::v4i8, 2 },
- { ISD::ADD, MVT::v8i8, 2 },
- { ISD::ADD, MVT::v16i8, 3 },
- };
-
- static const CostTblEntry AVX1CostTblPairWise[] = {
- { ISD::FADD, MVT::v4f64, 5 },
- { ISD::FADD, MVT::v8f32, 7 },
- { ISD::ADD, MVT::v2i64, 1 }, // The data reported by the IACA tool is "1.5".
- { ISD::ADD, MVT::v4i64, 5 }, // The data reported by the IACA tool is "4.8".
- { ISD::ADD, MVT::v8i32, 5 },
- { ISD::ADD, MVT::v16i16, 6 },
- { ISD::ADD, MVT::v32i8, 4 },
- };
-
static const CostTblEntry SLMCostTblNoPairWise[] = {
{ ISD::FADD, MVT::v2f64, 3 },
{ ISD::ADD, MVT::v2i64, 5 },
@@ -2633,66 +3167,49 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
EVT VT = TLI->getValueType(DL, ValTy);
if (VT.isSimple()) {
MVT MTy = VT.getSimpleVT();
- if (IsPairwise) {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
- return Entry->Cost;
-
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return Entry->Cost;
-
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return Entry->Cost;
- } else {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return Entry->Cost;
- }
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
}
std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
MVT MTy = LT.second;
- if (IsPairwise) {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
- } else {
- if (ST->isSLM())
- if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->isSLM())
+ if (const auto *Entry = CostTableLookup(SLMCostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
- }
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return ArithmeticCost + Entry->Cost;
// FIXME: These assume a naive kshift+binop lowering, which is probably
// conservative in most cases.
- // FIXME: This doesn't cost large types like v128i1 correctly.
static const CostTblEntry AVX512BoolReduction[] = {
{ ISD::AND, MVT::v2i1, 3 },
{ ISD::AND, MVT::v4i1, 5 },
@@ -2738,252 +3255,408 @@ int X86TTIImpl::getArithmeticReductionCost(unsigned Opcode, Type *ValTy,
};
// Handle bool allof/anyof patterns.
- if (!IsPairwise && ValTy->getVectorElementType()->isIntegerTy(1)) {
+ if (ValVTy->getElementType()->isIntegerTy(1)) {
+ unsigned ArithmeticCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ auto *SingleOpTy = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ArithmeticCost = getArithmeticInstrCost(Opcode, SingleOpTy, CostKind);
+ ArithmeticCost *= LT.first - 1;
+ }
+
if (ST->hasAVX512())
if (const auto *Entry = CostTableLookup(AVX512BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasAVX2())
if (const auto *Entry = CostTableLookup(AVX2BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2BoolReduction, ISD, MTy))
- return LT.first * Entry->Cost;
+ return ArithmeticCost + Entry->Cost;
+
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+ }
+
+ unsigned NumVecElts = ValVTy->getNumElements();
+ unsigned ScalarSize = ValVTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(NumVecElts) || ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getArithmeticReductionCost(Opcode, ValVTy, IsPairwise,
+ CostKind);
+
+ unsigned ReductionCost = 0;
+
+ auto *Ty = ValVTy;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 arithmetic ops.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ ReductionCost = getArithmeticInstrCost(Opcode, Ty, CostKind);
+ ReductionCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
+ }
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValVTy->getContext()), 2);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt64Ty(ValVTy->getContext()), 2);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValVTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getFloatTy(ValVTy->getContext()), 4);
+ else
+ ShufTy =
+ FixedVectorType::get(Type::getInt32Ty(ValVTy->getContext()), 4);
+ ReductionCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValVTy->getContext(), Size), 128 / Size);
+ ReductionCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ ReductionCost += getArithmeticInstrCost(Opcode, Ty, CostKind);
}
- return BaseT::getArithmeticReductionCost(Opcode, ValTy, IsPairwise);
+ // Add the final extract element to the cost.
+ return ReductionCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
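The reduction loop above builds a log2 tree: each step halves the live element count, pays one shuffle whose flavour depends on the width still in flight, and one binary op combining the halves; a final extractelement closes it out. The sketch below only reproduces that step structure; real costs come from getShuffleCost / getArithmeticInstrCost and are not modelled here.

#include <cstdio>

// Reports the shuffle flavour chosen at each halving step of the tree.
int reductionSteps(unsigned NumElts, unsigned ScalarBits) {
  int Steps = 0;
  while (NumElts > 1) {
    unsigned Size = NumElts * ScalarBits; // width still left to reduce
    NumElts /= 2;
    if (Size > 128)
      std::puts("extract the upper subvector");
    else if (Size == 128)
      std::puts("permute as v2i64/v2f64");
    else if (Size == 64)
      std::puts("shuffle as v4i32/v4f32");
    else
      std::puts("shift the 128-bit register by an immediate");
    std::puts("  then one binary op combining the two halves");
    ++Steps;
  }
  std::puts("finally, extract element 0");
  return Steps;
}

int main() { std::printf("steps = %d\n", reductionSteps(8, 32)); } // v8i32 add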
-int X86TTIImpl::getMinMaxReductionCost(Type *ValTy, Type *CondTy,
- bool IsPairwise, bool IsUnsigned) {
- std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+int X86TTIImpl::getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned) {
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, Ty);
MVT MTy = LT.second;
int ISD;
- if (ValTy->isIntOrIntVectorTy()) {
+ if (Ty->isIntOrIntVectorTy()) {
ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
} else {
- assert(ValTy->isFPOrFPVectorTy() &&
+ assert(Ty->isFPOrFPVectorTy() &&
"Expected float point or integer vector type.");
ISD = ISD::FMINNUM;
}
- // We use the Intel Architecture Code Analyzer(IACA) to measure the throughput
- // and make it as the cost.
+ static const CostTblEntry SSE1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v4f32, 1},
+ };
- static const CostTblEntry SSE1CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 4},
- };
-
- static const CostTblEntry SSE2CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v2f64, 3},
- {ISD::SMIN, MVT::v2i64, 6},
- {ISD::UMIN, MVT::v2i64, 8},
- {ISD::SMIN, MVT::v4i32, 6},
- {ISD::UMIN, MVT::v4i32, 8},
- {ISD::SMIN, MVT::v8i16, 4},
- {ISD::UMIN, MVT::v8i16, 6},
- {ISD::SMIN, MVT::v16i8, 8},
- {ISD::UMIN, MVT::v16i8, 6},
- };
-
- static const CostTblEntry SSE41CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 2},
- {ISD::SMIN, MVT::v2i64, 9},
- {ISD::UMIN, MVT::v2i64,10},
- {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v8i16, 2},
- {ISD::UMIN, MVT::v8i16, 2},
- {ISD::SMIN, MVT::v16i8, 3},
- {ISD::UMIN, MVT::v16i8, 3},
- };
-
- static const CostTblEntry SSE42CostTblPairWise[] = {
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 8}, // The data reported by the IACA is "8.6"
- };
-
- static const CostTblEntry AVX1CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 1},
- {ISD::FMINNUM, MVT::v4f64, 1},
- {ISD::FMINNUM, MVT::v8f32, 2},
- {ISD::SMIN, MVT::v2i64, 3},
- {ISD::UMIN, MVT::v2i64, 3},
- {ISD::SMIN, MVT::v4i32, 1},
- {ISD::UMIN, MVT::v4i32, 1},
- {ISD::SMIN, MVT::v8i16, 1},
- {ISD::UMIN, MVT::v8i16, 1},
- {ISD::SMIN, MVT::v16i8, 2},
- {ISD::UMIN, MVT::v16i8, 2},
- {ISD::SMIN, MVT::v4i64, 7},
- {ISD::UMIN, MVT::v4i64, 7},
- {ISD::SMIN, MVT::v8i32, 3},
- {ISD::UMIN, MVT::v8i32, 3},
- {ISD::SMIN, MVT::v16i16, 3},
- {ISD::UMIN, MVT::v16i16, 3},
- {ISD::SMIN, MVT::v32i8, 3},
- {ISD::UMIN, MVT::v32i8, 3},
- };
-
- static const CostTblEntry AVX2CostTblPairWise[] = {
- {ISD::SMIN, MVT::v4i64, 2},
- {ISD::UMIN, MVT::v4i64, 2},
- {ISD::SMIN, MVT::v8i32, 1},
- {ISD::UMIN, MVT::v8i32, 1},
- {ISD::SMIN, MVT::v16i16, 1},
- {ISD::UMIN, MVT::v16i16, 1},
- {ISD::SMIN, MVT::v32i8, 2},
- {ISD::UMIN, MVT::v32i8, 2},
- };
-
- static const CostTblEntry AVX512CostTblPairWise[] = {
- {ISD::FMINNUM, MVT::v8f64, 1},
- {ISD::FMINNUM, MVT::v16f32, 2},
- {ISD::SMIN, MVT::v8i64, 2},
- {ISD::UMIN, MVT::v8i64, 2},
- {ISD::SMIN, MVT::v16i32, 1},
- {ISD::UMIN, MVT::v16i32, 1},
- };
-
- static const CostTblEntry SSE1CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 4},
+ static const CostTblEntry SSE2CostTbl[] = {
+ {ISD::FMINNUM, MVT::v2f64, 1},
+ {ISD::SMIN, MVT::v8i16, 1},
+ {ISD::UMIN, MVT::v16i8, 1},
};
- static const CostTblEntry SSE2CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v2f64, 3},
- {ISD::SMIN, MVT::v2i64, 6},
- {ISD::UMIN, MVT::v2i64, 8},
- {ISD::SMIN, MVT::v4i32, 6},
- {ISD::UMIN, MVT::v4i32, 8},
- {ISD::SMIN, MVT::v8i16, 4},
- {ISD::UMIN, MVT::v8i16, 6},
- {ISD::SMIN, MVT::v16i8, 8},
- {ISD::UMIN, MVT::v16i8, 6},
+ static const CostTblEntry SSE41CostTbl[] = {
+ {ISD::SMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v4i32, 1},
+ {ISD::UMIN, MVT::v8i16, 1},
+ {ISD::SMIN, MVT::v16i8, 1},
};
- static const CostTblEntry SSE41CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 3},
- {ISD::SMIN, MVT::v2i64, 9},
- {ISD::UMIN, MVT::v2i64,11},
- {ISD::SMIN, MVT::v4i32, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v4i32, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v8i16, 1}, // The data reported by the IACA is "1.5"
- {ISD::UMIN, MVT::v8i16, 2}, // The data reported by the IACA is "1.8"
- {ISD::SMIN, MVT::v16i8, 3},
- {ISD::UMIN, MVT::v16i8, 3},
+ static const CostTblEntry SSE42CostTbl[] = {
+ {ISD::UMIN, MVT::v2i64, 3}, // xor+pcmpgtq+blendvpd
};
- static const CostTblEntry SSE42CostTblNoPairWise[] = {
- {ISD::SMIN, MVT::v2i64, 7}, // The data reported by the IACA is "6.8"
- {ISD::UMIN, MVT::v2i64, 9}, // The data reported by the IACA is "8.6"
+ static const CostTblEntry AVX1CostTbl[] = {
+ {ISD::FMINNUM, MVT::v8f32, 1},
+ {ISD::FMINNUM, MVT::v4f64, 1},
+ {ISD::SMIN, MVT::v8i32, 3},
+ {ISD::UMIN, MVT::v8i32, 3},
+ {ISD::SMIN, MVT::v16i16, 3},
+ {ISD::UMIN, MVT::v16i16, 3},
+ {ISD::SMIN, MVT::v32i8, 3},
+ {ISD::UMIN, MVT::v32i8, 3},
};
- static const CostTblEntry AVX1CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v4f32, 1},
- {ISD::FMINNUM, MVT::v4f64, 1},
- {ISD::FMINNUM, MVT::v8f32, 1},
- {ISD::SMIN, MVT::v2i64, 3},
- {ISD::UMIN, MVT::v2i64, 3},
- {ISD::SMIN, MVT::v4i32, 1},
- {ISD::UMIN, MVT::v4i32, 1},
- {ISD::SMIN, MVT::v8i16, 1},
- {ISD::UMIN, MVT::v8i16, 1},
- {ISD::SMIN, MVT::v16i8, 2},
- {ISD::UMIN, MVT::v16i8, 2},
- {ISD::SMIN, MVT::v4i64, 7},
- {ISD::UMIN, MVT::v4i64, 7},
- {ISD::SMIN, MVT::v8i32, 2},
- {ISD::UMIN, MVT::v8i32, 2},
- {ISD::SMIN, MVT::v16i16, 2},
- {ISD::UMIN, MVT::v16i16, 2},
- {ISD::SMIN, MVT::v32i8, 2},
- {ISD::UMIN, MVT::v32i8, 2},
- };
-
- static const CostTblEntry AVX2CostTblNoPairWise[] = {
- {ISD::SMIN, MVT::v4i64, 1},
- {ISD::UMIN, MVT::v4i64, 1},
- {ISD::SMIN, MVT::v8i32, 1},
- {ISD::UMIN, MVT::v8i32, 1},
- {ISD::SMIN, MVT::v16i16, 1},
- {ISD::UMIN, MVT::v16i16, 1},
- {ISD::SMIN, MVT::v32i8, 1},
- {ISD::UMIN, MVT::v32i8, 1},
- };
-
- static const CostTblEntry AVX512CostTblNoPairWise[] = {
- {ISD::FMINNUM, MVT::v8f64, 1},
- {ISD::FMINNUM, MVT::v16f32, 2},
- {ISD::SMIN, MVT::v8i64, 1},
- {ISD::UMIN, MVT::v8i64, 1},
- {ISD::SMIN, MVT::v16i32, 1},
- {ISD::UMIN, MVT::v16i32, 1},
- };
-
- if (IsPairwise) {
- if (ST->hasAVX512())
- if (const auto *Entry = CostTableLookup(AVX512CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX2CostTbl[] = {
+ {ISD::SMIN, MVT::v8i32, 1},
+ {ISD::UMIN, MVT::v8i32, 1},
+ {ISD::SMIN, MVT::v16i16, 1},
+ {ISD::UMIN, MVT::v16i16, 1},
+ {ISD::SMIN, MVT::v32i8, 1},
+ {ISD::UMIN, MVT::v32i8, 1},
+ };
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512CostTbl[] = {
+ {ISD::FMINNUM, MVT::v16f32, 1},
+ {ISD::FMINNUM, MVT::v8f64, 1},
+ {ISD::SMIN, MVT::v2i64, 1},
+ {ISD::UMIN, MVT::v2i64, 1},
+ {ISD::SMIN, MVT::v4i64, 1},
+ {ISD::UMIN, MVT::v4i64, 1},
+ {ISD::SMIN, MVT::v8i64, 1},
+ {ISD::UMIN, MVT::v8i64, 1},
+ {ISD::SMIN, MVT::v16i32, 1},
+ {ISD::UMIN, MVT::v16i32, 1},
+ };
- if (ST->hasAVX())
- if (const auto *Entry = CostTableLookup(AVX1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ static const CostTblEntry AVX512BWCostTbl[] = {
+ {ISD::SMIN, MVT::v32i16, 1},
+ {ISD::UMIN, MVT::v32i16, 1},
+ {ISD::SMIN, MVT::v64i8, 1},
+ {ISD::UMIN, MVT::v64i8, 1},
+ };
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ // If we have a native MIN/MAX instruction for this type, use it.
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE41())
- if (const auto *Entry = CostTableLookup(SSE41CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX512())
+ if (const auto *Entry = CostTableLookup(AVX512CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE2())
- if (const auto *Entry = CostTableLookup(SSE2CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX2())
+ if (const auto *Entry = CostTableLookup(AVX2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTblPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE42())
+ if (const auto *Entry = CostTableLookup(SSE42CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ if (ST->hasSSE1())
+ if (const auto *Entry = CostTableLookup(SSE1CostTbl, ISD, MTy))
+ return LT.first * Entry->Cost;
+
+ unsigned CmpOpcode;
+ if (Ty->isFPOrFPVectorTy()) {
+ CmpOpcode = Instruction::FCmp;
} else {
- if (ST->hasAVX512())
- if (const auto *Entry =
- CostTableLookup(AVX512CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ assert(Ty->isIntOrIntVectorTy() &&
+ "expecting floating point or integer type for min/max reduction");
+ CmpOpcode = Instruction::ICmp;
+ }
- if (ST->hasAVX2())
- if (const auto *Entry = CostTableLookup(AVX2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+ // Otherwise fall back to cmp+select.
+ return getCmpSelInstrCost(CmpOpcode, Ty, CondTy, CostKind) +
+ getCmpSelInstrCost(Instruction::Select, Ty, CondTy, CostKind);
+}
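When none of the tables match, getMinMaxCost above falls back to pricing the operation as a compare followed by a select. A minimal standalone sketch of that decision; the numbers in main are illustrative only, not values taken from the tables.

#include <cstdio>
#include <optional>

// NativeEntryCost models a cost-table hit; SplitFactor models LT.first.
int minMaxCost(std::optional<int> NativeEntryCost, int SplitFactor, int CmpCost,
               int SelectCost) {
  if (NativeEntryCost)
    return SplitFactor * *NativeEntryCost; // LT.first * Entry->Cost
  return CmpCost + SelectCost;             // no native op: cmp + select
}

int main() {
  // Pretend this type has no table entry and its compare and select cost
  // 3 and 1 respectively.
  std::printf("%d\n", minMaxCost(std::nullopt, 1, 3, 1));
}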
+
+int X86TTIImpl::getMinMaxReductionCost(VectorType *ValTy, VectorType *CondTy,
+ bool IsPairwise, bool IsUnsigned,
+ TTI::TargetCostKind CostKind) {
+ // Just use the default implementation for pair reductions.
+ if (IsPairwise)
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ std::pair<int, MVT> LT = TLI->getTypeLegalizationCost(DL, ValTy);
+
+ MVT MTy = LT.second;
+
+ int ISD;
+ if (ValTy->isIntOrIntVectorTy()) {
+ ISD = IsUnsigned ? ISD::UMIN : ISD::SMIN;
+ } else {
+ assert(ValTy->isFPOrFPVectorTy() &&
+ "Expected float point or integer vector type.");
+ ISD = ISD::FMINNUM;
+ }
+
+ // We use the Intel Architecture Code Analyzer (IACA) to measure the throughput
+ // and use that as the cost.
+
+ static const CostTblEntry SSE2CostTblNoPairWise[] = {
+ {ISD::UMIN, MVT::v2i16, 5}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v4i16, 7}, // need pxors to use pminsw/pmaxsw
+ {ISD::UMIN, MVT::v8i16, 9}, // need pxors to use pminsw/pmaxsw
+ };
+
+ static const CostTblEntry SSE41CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v2i16, 3}, // same as sse2
+ {ISD::SMIN, MVT::v4i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v2i16, 5}, // same as sse2
+ {ISD::UMIN, MVT::v4i16, 7}, // same as sse2
+ {ISD::SMIN, MVT::v8i16, 4}, // phminposuw+xor
+ {ISD::UMIN, MVT::v8i16, 4}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v2i8, 3}, // pminsb
+ {ISD::SMIN, MVT::v4i8, 5}, // pminsb
+ {ISD::SMIN, MVT::v8i8, 7}, // pminsb
+ {ISD::SMIN, MVT::v16i8, 6},
+ {ISD::UMIN, MVT::v2i8, 3}, // same as sse2
+ {ISD::UMIN, MVT::v4i8, 5}, // same as sse2
+ {ISD::UMIN, MVT::v8i8, 7}, // same as sse2
+ {ISD::UMIN, MVT::v16i8, 6}, // FIXME: umin is cheaper than umax
+ };
+
+ static const CostTblEntry AVX1CostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v16i16, 6},
+ {ISD::UMIN, MVT::v16i16, 6}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v32i8, 8},
+ {ISD::UMIN, MVT::v32i8, 8},
+ };
+
+ static const CostTblEntry AVX512BWCostTblNoPairWise[] = {
+ {ISD::SMIN, MVT::v32i16, 8},
+ {ISD::UMIN, MVT::v32i16, 8}, // FIXME: umin is cheaper than umax
+ {ISD::SMIN, MVT::v64i8, 10},
+ {ISD::UMIN, MVT::v64i8, 10},
+ };
+
+ // Before legalizing the type, give a chance to look up illegal narrow types
+ // in the table.
+ // FIXME: Is there a better way to do this?
+ EVT VT = TLI->getValueType(DL, ValTy);
+ if (VT.isSimple()) {
+ MVT MTy = VT.getSimpleVT();
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return Entry->Cost;
if (ST->hasAVX())
if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
-
- if (ST->hasSSE42())
- if (const auto *Entry = CostTableLookup(SSE42CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
if (ST->hasSSE41())
if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
if (ST->hasSSE2())
if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ return Entry->Cost;
+ }
- if (ST->hasSSE1())
- if (const auto *Entry = CostTableLookup(SSE1CostTblNoPairWise, ISD, MTy))
- return LT.first * Entry->Cost;
+ auto *ValVTy = cast<FixedVectorType>(ValTy);
+ unsigned NumVecElts = ValVTy->getNumElements();
+
+ auto *Ty = ValVTy;
+ unsigned MinMaxCost = 0;
+ if (LT.first != 1 && MTy.isVector() &&
+ MTy.getVectorNumElements() < ValVTy->getNumElements()) {
+ // Type needs to be split. We need LT.first - 1 operations.
+ Ty = FixedVectorType::get(ValVTy->getElementType(),
+ MTy.getVectorNumElements());
+ auto *SubCondTy = FixedVectorType::get(CondTy->getElementType(),
+ MTy.getVectorNumElements());
+ MinMaxCost = getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ MinMaxCost *= LT.first - 1;
+ NumVecElts = MTy.getVectorNumElements();
}
- return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned);
+ if (ST->hasBWI())
+ if (const auto *Entry = CostTableLookup(AVX512BWCostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasAVX())
+ if (const auto *Entry = CostTableLookup(AVX1CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE41())
+ if (const auto *Entry = CostTableLookup(SSE41CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ if (ST->hasSSE2())
+ if (const auto *Entry = CostTableLookup(SSE2CostTblNoPairWise, ISD, MTy))
+ return MinMaxCost + Entry->Cost;
+
+ unsigned ScalarSize = ValTy->getScalarSizeInBits();
+
+ // Special case power of 2 reductions where the scalar type isn't changed
+ // by type legalization.
+ if (!isPowerOf2_32(ValVTy->getNumElements()) ||
+ ScalarSize != MTy.getScalarSizeInBits())
+ return BaseT::getMinMaxReductionCost(ValTy, CondTy, IsPairwise, IsUnsigned,
+ CostKind);
+
+ // Now handle reduction with the legal type, taking into account size changes
+ // at each level.
+ while (NumVecElts > 1) {
+ // Determine the size of the remaining vector we need to reduce.
+ unsigned Size = NumVecElts * ScalarSize;
+ NumVecElts /= 2;
+ // If we're reducing from 256/512 bits, use an extract_subvector.
+ if (Size > 128) {
+ auto *SubTy = FixedVectorType::get(ValVTy->getElementType(), NumVecElts);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_ExtractSubvector, Ty, NumVecElts, SubTy);
+ Ty = SubTy;
+ } else if (Size == 128) {
+ // Reducing from 128 bits is a permute of v2f64/v2i64.
+ VectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy =
+ FixedVectorType::get(Type::getDoubleTy(ValTy->getContext()), 2);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt64Ty(ValTy->getContext()), 2);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else if (Size == 64) {
+ // Reducing from 64 bits is a shuffle of v4f32/v4i32.
+ FixedVectorType *ShufTy;
+ if (ValTy->isFloatingPointTy())
+ ShufTy = FixedVectorType::get(Type::getFloatTy(ValTy->getContext()), 4);
+ else
+ ShufTy = FixedVectorType::get(Type::getInt32Ty(ValTy->getContext()), 4);
+ MinMaxCost +=
+ getShuffleCost(TTI::SK_PermuteSingleSrc, ShufTy, 0, nullptr);
+ } else {
+ // Reducing from smaller size is a shift by immediate.
+ auto *ShiftTy = FixedVectorType::get(
+ Type::getIntNTy(ValTy->getContext(), Size), 128 / Size);
+ MinMaxCost += getArithmeticInstrCost(
+ Instruction::LShr, ShiftTy, TTI::TCK_RecipThroughput,
+ TargetTransformInfo::OK_AnyValue,
+ TargetTransformInfo::OK_UniformConstantValue,
+ TargetTransformInfo::OP_None, TargetTransformInfo::OP_None);
+ }
+
+ // Add the arithmetic op for this level.
+ auto *SubCondTy =
+ FixedVectorType::get(CondTy->getElementType(), Ty->getNumElements());
+ MinMaxCost += getMinMaxCost(Ty, SubCondTy, IsUnsigned);
+ }
+
+ // Add the final extract element to the cost.
+ return MinMaxCost + getVectorInstrCost(Instruction::ExtractElement, Ty, 0);
}
/// Calculate the cost of materializing a 64-bit value. This helper
@@ -2999,7 +3672,8 @@ int X86TTIImpl::getIntImmCost(int64_t Val) {
return 2 * TTI::TCC_Basic;
}
-int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
+int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3034,7 +3708,7 @@ int X86TTIImpl::getIntImmCost(const APInt &Imm, Type *Ty) {
}
int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm,
- Type *Ty) {
+ Type *Ty, TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3121,17 +3795,18 @@ int X86TTIImpl::getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Im
if (Idx == ImmIdx) {
int NumConstants = divideCeil(BitSize, 64);
- int Cost = X86TTIImpl::getIntImmCost(Imm, Ty);
+ int Cost = X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
return (Cost <= NumConstants * TTI::TCC_Basic)
? static_cast<int>(TTI::TCC_Free)
: Cost;
}
- return X86TTIImpl::getIntImmCost(Imm, Ty);
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
- const APInt &Imm, Type *Ty) {
+ const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind) {
assert(Ty->isIntegerTy());
unsigned BitSize = Ty->getPrimitiveSizeInBits();
@@ -3162,52 +3837,45 @@ int X86TTIImpl::getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx,
return TTI::TCC_Free;
break;
}
- return X86TTIImpl::getIntImmCost(Imm, Ty);
+ return X86TTIImpl::getIntImmCost(Imm, Ty, CostKind);
}
-unsigned X86TTIImpl::getUserCost(const User *U,
- ArrayRef<const Value *> Operands) {
- if (isa<StoreInst>(U)) {
- Value *Ptr = U->getOperand(1);
- // Store instruction with index and scale costs 2 Uops.
- // Check the preceding GEP to identify non-const indices.
- if (auto GEP = dyn_cast<GetElementPtrInst>(Ptr)) {
- if (!all_of(GEP->indices(), [](Value *V) { return isa<Constant>(V); }))
- return TTI::TCC_Basic * 2;
- }
- return TTI::TCC_Basic;
- }
- return BaseT::getUserCost(U, Operands);
+unsigned
+X86TTIImpl::getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind) {
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return Opcode == Instruction::PHI ? 0 : 1;
+ // Branches are assumed to be predicted.
+ return CostKind == TTI::TCK_RecipThroughput ? 0 : 1;
}
// Return an average cost of Gather / Scatter instruction, maybe improved later
-int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
- unsigned Alignment, unsigned AddressSpace) {
+int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace) {
assert(isa<VectorType>(SrcVTy) && "Unexpected type in getGSVectorCost");
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
// Try to reduce index size from 64 bit (default for GEP)
// to 32. It is essential for VF 16. If the index can't be reduced to 32, the
// operation will use 16 x 64 indices which do not fit in a zmm and needs
// to split. Also check that the base pointer is the same for all lanes,
// and that there's at most one variable index.
- auto getIndexSizeInBits = [](Value *Ptr, const DataLayout& DL) {
+ auto getIndexSizeInBits = [](const Value *Ptr, const DataLayout &DL) {
unsigned IndexSize = DL.getPointerSizeInBits();
- GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
+ const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Ptr);
if (IndexSize < 64 || !GEP)
return IndexSize;
unsigned NumOfVarIndices = 0;
- Value *Ptrs = GEP->getPointerOperand();
+ const Value *Ptrs = GEP->getPointerOperand();
if (Ptrs->getType()->isVectorTy() && !getSplatValue(Ptrs))
return IndexSize;
for (unsigned i = 1; i < GEP->getNumOperands(); ++i) {
if (isa<Constant>(GEP->getOperand(i)))
continue;
Type *IndxTy = GEP->getOperand(i)->getType();
- if (IndxTy->isVectorTy())
- IndxTy = IndxTy->getVectorElementType();
+ if (auto *IndexVTy = dyn_cast<VectorType>(IndxTy))
+ IndxTy = IndexVTy->getElementType();
if ((IndxTy->getPrimitiveSizeInBits() == 64 &&
!isa<SExtInst>(GEP->getOperand(i))) ||
++NumOfVarIndices > 1)
@@ -3216,21 +3884,21 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
return (unsigned)32;
};
-
// Trying to reduce IndexSize to 32 bits for vector 16.
// By default the IndexSize is equal to pointer size.
unsigned IndexSize = (ST->hasAVX512() && VF >= 16)
? getIndexSizeInBits(Ptr, DL)
: DL.getPointerSizeInBits();
- Type *IndexVTy = VectorType::get(IntegerType::get(SrcVTy->getContext(),
- IndexSize), VF);
+ auto *IndexVTy = FixedVectorType::get(
+ IntegerType::get(SrcVTy->getContext(), IndexSize), VF);
std::pair<int, MVT> IdxsLT = TLI->getTypeLegalizationCost(DL, IndexVTy);
std::pair<int, MVT> SrcLT = TLI->getTypeLegalizationCost(DL, SrcVTy);
int SplitFactor = std::max(IdxsLT.first, SrcLT.first);
if (SplitFactor > 1) {
// Handle splitting of vector of pointers
- Type *SplitSrcTy = VectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
+ auto *SplitSrcTy =
+ FixedVectorType::get(SrcVTy->getScalarType(), VF / SplitFactor);
return SplitFactor * getGSVectorCost(Opcode, SplitSrcTy, Ptr, Alignment,
AddressSpace);
}
@@ -3241,7 +3909,8 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
? ST->getGatherOverhead()
: ST->getScatterOverhead();
return GSOverhead + VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ TTI::TCK_RecipThroughput);
}
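getGSVectorCost above reduces to: split if the legalized index or data type does not fit, otherwise charge a fixed gather/scatter overhead plus one scalar memory op per lane. A rough standalone sketch under that reading; the overhead and scalar-load values in main are placeholders, not real subtarget numbers.

#include <cstdio>

// SplitFactor models max(IdxsLT.first, SrcLT.first); the overhead parameter
// models the subtarget's getGatherOverhead()/getScatterOverhead() value.
int gsVectorCost(int SplitFactor, int GatherScatterOverhead, unsigned VF,
                 int ScalarMemOpCost) {
  if (SplitFactor > 1)
    // The vector is split and each piece is costed recursively.
    return SplitFactor * gsVectorCost(1, GatherScatterOverhead,
                                      VF / SplitFactor, ScalarMemOpCost);
  return GatherScatterOverhead + static_cast<int>(VF) * ScalarMemOpCost;
}

int main() {
  // Placeholder numbers: a 16-lane gather, overhead 2, unit-cost scalar loads.
  std::printf("%d\n", gsVectorCost(/*SplitFactor=*/1, 2, 16, 1));
}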
/// Return the cost of full scalarization of gather / scatter operation.
@@ -3253,25 +3922,29 @@ int X86TTIImpl::getGSVectorCost(unsigned Opcode, Type *SrcVTy, Value *Ptr,
/// AddressSpace - pointer[s] address space.
///
int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
- bool VariableMask, unsigned Alignment,
+ bool VariableMask, Align Alignment,
unsigned AddressSpace) {
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
+ APInt DemandedElts = APInt::getAllOnesValue(VF);
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
int MaskUnpackCost = 0;
if (VariableMask) {
- VectorType *MaskTy =
- VectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
- MaskUnpackCost = getScalarizationOverhead(MaskTy, false, true);
+ auto *MaskTy =
+ FixedVectorType::get(Type::getInt1Ty(SrcVTy->getContext()), VF);
+ MaskUnpackCost =
+ getScalarizationOverhead(MaskTy, DemandedElts, false, true);
int ScalarCompareCost =
getCmpSelInstrCost(Instruction::ICmp, Type::getInt1Ty(SrcVTy->getContext()),
- nullptr);
- int BranchCost = getCFInstrCost(Instruction::Br);
+ nullptr, CostKind);
+ int BranchCost = getCFInstrCost(Instruction::Br, CostKind);
MaskUnpackCost += VF * (BranchCost + ScalarCompareCost);
}
// The cost of the scalar loads/stores.
int MemoryOpCost = VF * getMemoryOpCost(Opcode, SrcVTy->getScalarType(),
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
int InsertExtractCost = 0;
if (Opcode == Instruction::Load)
@@ -3290,21 +3963,28 @@ int X86TTIImpl::getGSScalarCost(unsigned Opcode, Type *SrcVTy,
/// Calculate the cost of Gather / Scatter operation
int X86TTIImpl::getGatherScatterOpCost(unsigned Opcode, Type *SrcVTy,
- Value *Ptr, bool VariableMask,
- unsigned Alignment) {
+ const Value *Ptr, bool VariableMask,
+ Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr) {
+
+ if (CostKind != TTI::TCK_RecipThroughput)
+ return 1;
+
assert(SrcVTy->isVectorTy() && "Unexpected data type for Gather/Scatter");
- unsigned VF = SrcVTy->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(SrcVTy)->getNumElements();
PointerType *PtrTy = dyn_cast<PointerType>(Ptr->getType());
if (!PtrTy && Ptr->getType()->isVectorTy())
- PtrTy = dyn_cast<PointerType>(Ptr->getType()->getVectorElementType());
+ PtrTy = dyn_cast<PointerType>(
+ cast<VectorType>(Ptr->getType())->getElementType());
assert(PtrTy && "Unexpected type for Ptr argument");
unsigned AddressSpace = PtrTy->getAddressSpace();
bool Scalarize = false;
if ((Opcode == Instruction::Load &&
- !isLegalMaskedGather(SrcVTy, MaybeAlign(Alignment))) ||
+ !isLegalMaskedGather(SrcVTy, Align(Alignment))) ||
(Opcode == Instruction::Store &&
- !isLegalMaskedScatter(SrcVTy, MaybeAlign(Alignment))))
+ !isLegalMaskedScatter(SrcVTy, Align(Alignment))))
Scalarize = true;
// Gather / Scatter for vector 2 is not profitable on KNL / SKX
// Vector-4 of gather/scatter instruction does not exist on KNL.
@@ -3337,12 +4017,13 @@ bool X86TTIImpl::canMacroFuseCmp() {
return ST->hasMacroFusion() || ST->hasBranchFusion();
}
-bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, Align Alignment) {
if (!ST->hasAVX())
return false;
// The backend can't handle a single element vector.
- if (isa<VectorType>(DataTy) && DataTy->getVectorNumElements() == 1)
+ if (isa<VectorType>(DataTy) &&
+ cast<FixedVectorType>(DataTy)->getNumElements() == 1)
return false;
Type *ScalarTy = DataTy->getScalarType();
@@ -3360,7 +4041,7 @@ bool X86TTIImpl::isLegalMaskedLoad(Type *DataTy, MaybeAlign Alignment) {
((IntWidth == 8 || IntWidth == 16) && ST->hasBWI());
}
-bool X86TTIImpl::isLegalMaskedStore(Type *DataType, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedStore(Type *DataType, Align Alignment) {
return isLegalMaskedLoad(DataType, Alignment);
}
@@ -3407,10 +4088,10 @@ bool X86TTIImpl::isLegalMaskedExpandLoad(Type *DataTy) {
return false;
// The backend can't handle a single element vector.
- if (DataTy->getVectorNumElements() == 1)
+ if (cast<FixedVectorType>(DataTy)->getNumElements() == 1)
return false;
- Type *ScalarTy = DataTy->getVectorElementType();
+ Type *ScalarTy = cast<VectorType>(DataTy)->getElementType();
if (ScalarTy->isFloatTy() || ScalarTy->isDoubleTy())
return true;
@@ -3427,7 +4108,7 @@ bool X86TTIImpl::isLegalMaskedCompressStore(Type *DataTy) {
return isLegalMaskedExpandLoad(DataTy);
}
-bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, Align Alignment) {
// Some CPUs have better gather performance than others.
// TODO: Remove the explicit ST->hasAVX512()? That would mean we would only
// enable gather with a -march.
@@ -3446,8 +4127,8 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
// In this case we can reject non-power-of-2 vectors.
// We also reject single element vectors as the type legalizer can't
// scalarize them.
- if (isa<VectorType>(DataTy)) {
- unsigned NumElts = DataTy->getVectorNumElements();
+ if (auto *DataVTy = dyn_cast<FixedVectorType>(DataTy)) {
+ unsigned NumElts = DataVTy->getNumElements();
if (NumElts == 1 || !isPowerOf2_32(NumElts))
return false;
}
@@ -3465,7 +4146,7 @@ bool X86TTIImpl::isLegalMaskedGather(Type *DataTy, MaybeAlign Alignment) {
return IntWidth == 32 || IntWidth == 64;
}
-bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+bool X86TTIImpl::isLegalMaskedScatter(Type *DataType, Align Alignment) {
// AVX2 doesn't support scatter
if (!ST->hasAVX512())
return false;
@@ -3505,11 +4186,22 @@ bool X86TTIImpl::areFunctionArgsABICompatible(
// If we get here, we know the target features match. If one function
// considers 512-bit vectors legal and the other does not, consider them
// incompatible.
- // FIXME Look at the arguments and only consider 512 bit or larger vectors?
const TargetMachine &TM = getTLI()->getTargetMachine();
- return TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
- TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs();
+ if (TM.getSubtarget<X86Subtarget>(*Caller).useAVX512Regs() ==
+ TM.getSubtarget<X86Subtarget>(*Callee).useAVX512Regs())
+ return true;
+
+ // Consider the arguments compatible if they aren't vectors or aggregates.
+ // FIXME: Look at the size of vectors.
+ // FIXME: Look at the element types of aggregates to see if there are vectors.
+ // FIXME: The API of this function seems intended to allow arguments
+ // to be removed from the set, but the caller doesn't check if the set
+ // becomes empty so that may not work in practice.
+ return llvm::none_of(Args, [](Argument *A) {
+ auto *EltTy = cast<PointerType>(A->getType())->getElementType();
+ return EltTy->isVectorTy() || EltTy->isAggregateType();
+ });
}
X86TTIImpl::TTI::MemCmpExpansionOptions
@@ -3517,6 +4209,8 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
TTI::MemCmpExpansionOptions Options;
Options.MaxNumLoads = TLI->getMaxExpandSizeMemcmp(OptSize);
Options.NumLoadsPerBlock = 2;
+ // All GPR and vector loads can be unaligned.
+ Options.AllowOverlappingLoads = true;
if (IsZeroCmp) {
// Only enable vector loads for equality comparison. Right now the vector
// version is not as fast for three way compare (see #33329).
@@ -3524,8 +4218,6 @@ X86TTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
if (PreferredWidth >= 512 && ST->hasAVX512()) Options.LoadSizes.push_back(64);
if (PreferredWidth >= 256 && ST->hasAVX()) Options.LoadSizes.push_back(32);
if (PreferredWidth >= 128 && ST->hasSSE2()) Options.LoadSizes.push_back(16);
- // All GPR and vector loads can be unaligned.
- Options.AllowOverlappingLoads = true;
}
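(Illustrative note: AllowOverlappingLoads lets the target-independent memcmp expansion cover a length with fewer, overlapping loads. For example, a 31-byte equality compare with 16-byte loads available can be lowered as two overlapping 16-byte loads per buffer, at offsets 0 and 15, rather than a 16+8+4+2+1 chain; the exact sequence is chosen from the LoadSizes list built above.)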
if (ST->is64Bit()) {
Options.LoadSizes.push_back(8);
@@ -3555,24 +4247,22 @@ bool X86TTIImpl::enableInterleavedAccessVectorization() {
// computing the cost using a generic formula as a function of generic
// shuffles. We therefore use a lookup table instead, filled according to
// the instruction sequences that codegen currently generates.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
// We currently support only fully-interleaved groups, with no gaps.
// TODO: Also support strided loads (interleaved groups with gaps).
if (Indices.size() && Indices.size() != Factor)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
// VecTy for interleave memop is <VF*Factor x Elt>.
// So, for VF=4, Interleave Factor = 3, Element type = i32 we have
@@ -3584,10 +4274,11 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
// (see MachineValueType.h::getVectorVT()).
if (!LegalVT.isVector())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
- unsigned VF = VecTy->getVectorNumElements() / Factor;
- Type *ScalarTy = VecTy->getVectorElementType();
+ unsigned VF = VecTy->getNumElements() / Factor;
+ Type *ScalarTy = VecTy->getElementType();
// Calculate the number of memory operations (NumOfMemOps) required
// to load/store the VecTy.
@@ -3596,16 +4287,18 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
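(Worked example, assuming the legal type on AVX2 is v8i32: for VF=4, Factor=3 and i32 elements, VecTy is <12 x i32>, so VecTySize = 48 bytes, LegalVTSize = 32 bytes, and NumOfMemOps = (48 + 32 - 1) / 32 = 2.)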
// Get the cost of one memory operation.
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
- VectorType *VT = VectorType::get(ScalarTy, VF);
+ auto *VT = FixedVectorType::get(ScalarTy, VF);
EVT ETy = TLI->getValueType(DL, VT);
if (!ETy.isSimple())
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace,
+ CostKind);
// TODO: Complete for other data-types and strides.
// Each combination of Stride, ElementTy and VF results in a different
@@ -3664,24 +4357,21 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
}
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace);
+ Alignment, AddressSpace, CostKind);
}
// Get an estimate for interleaved load/store operations and strided loads.
// \p Indices contains indices for strided load.
// \p Factor - the factor of interleaving.
// AVX-512 provides 3-src shuffles that significantly reduce the cost.
-int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind, bool UseMaskForCond, bool UseMaskForGaps) {
if (UseMaskForCond || UseMaskForGaps)
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
// VecTy for interleave memop is <VF*Factor x Elt>.
@@ -3696,12 +4386,13 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfMemOps = (VecTySize + LegalVTSize - 1) / LegalVTSize;
// Get the cost of one memory operation.
- Type *SingleMemOpTy = VectorType::get(VecTy->getVectorElementType(),
- LegalVT.getVectorNumElements());
+ auto *SingleMemOpTy = FixedVectorType::get(VecTy->getElementType(),
+ LegalVT.getVectorNumElements());
unsigned MemOpCost = getMemoryOpCost(Opcode, SingleMemOpTy,
- MaybeAlign(Alignment), AddressSpace);
+ MaybeAlign(Alignment), AddressSpace,
+ CostKind);
- unsigned VF = VecTy->getVectorNumElements() / Factor;
+ unsigned VF = VecTy->getNumElements() / Factor;
MVT VT = MVT::getVectorVT(MVT::getVT(VecTy->getScalarType()), VF);
if (Opcode == Instruction::Load) {
@@ -3733,8 +4424,8 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
unsigned NumOfLoadsInInterleaveGrp =
Indices.size() ? Indices.size() : Factor;
- Type *ResultTy = VectorType::get(VecTy->getVectorElementType(),
- VecTy->getVectorNumElements() / Factor);
+ auto *ResultTy = FixedVectorType::get(VecTy->getElementType(),
+ VecTy->getNumElements() / Factor);
unsigned NumOfResults =
getTLI()->getTypeLegalizationCost(DL, ResultTy).first *
NumOfLoadsInInterleaveGrp;
@@ -3796,15 +4487,12 @@ int X86TTIImpl::getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
return Cost;
}
-int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor,
- ArrayRef<unsigned> Indices,
- unsigned Alignment,
- unsigned AddressSpace,
- bool UseMaskForCond,
- bool UseMaskForGaps) {
+int X86TTIImpl::getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace, TTI::TargetCostKind CostKind,
+ bool UseMaskForCond, bool UseMaskForGaps) {
auto isSupportedOnAVX512 = [](Type *VecTy, bool HasBW) {
- Type *EltTy = VecTy->getVectorElementType();
+ Type *EltTy = cast<VectorType>(VecTy)->getElementType();
if (EltTy->isFloatTy() || EltTy->isDoubleTy() || EltTy->isIntegerTy(64) ||
EltTy->isIntegerTy(32) || EltTy->isPointerTy())
return true;
@@ -3813,15 +4501,15 @@ int X86TTIImpl::getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
return false;
};
if (ST->hasAVX512() && isSupportedOnAVX512(VecTy, ST->hasBWI()))
- return getInterleavedMemoryOpCostAVX512(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond, UseMaskForGaps);
+ return getInterleavedMemoryOpCostAVX512(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
if (ST->hasAVX2())
- return getInterleavedMemoryOpCostAVX2(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
- UseMaskForCond, UseMaskForGaps);
+ return getInterleavedMemoryOpCostAVX2(
+ Opcode, cast<FixedVectorType>(VecTy), Factor, Indices, Alignment,
+ AddressSpace, CostKind, UseMaskForCond, UseMaskForGaps);
return BaseT::getInterleavedMemoryOpCost(Opcode, VecTy, Factor, Indices,
- Alignment, AddressSpace,
+ Alignment, AddressSpace, CostKind,
UseMaskForCond, UseMaskForGaps);
}
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.h b/llvm/lib/Target/X86/X86TargetTransformInfo.h
index b9c2dbd78058..d462e1f96ca2 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.h
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.h
@@ -16,11 +16,9 @@
#ifndef LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
#define LLVM_LIB_TARGET_X86_X86TARGETTRANSFORMINFO_H
-#include "X86.h"
#include "X86TargetMachine.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/CodeGen/BasicTTIImpl.h"
-#include "llvm/CodeGen/TargetLowering.h"
namespace llvm {
@@ -107,9 +105,9 @@ public:
/// \name Cache TTI Implementation
/// @{
llvm::Optional<unsigned> getCacheSize(
- TargetTransformInfo::CacheLevel Level) const;
+ TargetTransformInfo::CacheLevel Level) const override;
llvm::Optional<unsigned> getCacheAssociativity(
- TargetTransformInfo::CacheLevel Level) const;
+ TargetTransformInfo::CacheLevel Level) const override;
/// @}
/// \name Vector TTI Implementations
@@ -121,76 +119,90 @@ public:
unsigned getMaxInterleaveFactor(unsigned VF);
int getArithmeticInstrCost(
unsigned Opcode, Type *Ty,
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput,
TTI::OperandValueKind Opd1Info = TTI::OK_AnyValue,
TTI::OperandValueKind Opd2Info = TTI::OK_AnyValue,
TTI::OperandValueProperties Opd1PropInfo = TTI::OP_None,
TTI::OperandValueProperties Opd2PropInfo = TTI::OP_None,
ArrayRef<const Value *> Args = ArrayRef<const Value *>(),
const Instruction *CxtI = nullptr);
- int getShuffleCost(TTI::ShuffleKind Kind, Type *Tp, int Index, Type *SubTp);
+ int getShuffleCost(TTI::ShuffleKind Kind, VectorType *Tp, int Index,
+ VectorType *SubTp);
int getCastInstrCost(unsigned Opcode, Type *Dst, Type *Src,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getCmpSelInstrCost(unsigned Opcode, Type *ValTy, Type *CondTy,
+ TTI::TargetCostKind CostKind,
const Instruction *I = nullptr);
int getVectorInstrCost(unsigned Opcode, Type *Val, unsigned Index);
+ unsigned getScalarizationOverhead(VectorType *Ty, const APInt &DemandedElts,
+ bool Insert, bool Extract);
int getMemoryOpCost(unsigned Opcode, Type *Src, MaybeAlign Alignment,
- unsigned AddressSpace, const Instruction *I = nullptr);
- int getMaskedMemoryOpCost(unsigned Opcode, Type *Src, unsigned Alignment,
- unsigned AddressSpace);
- int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, Value *Ptr,
- bool VariableMask, unsigned Alignment);
+ unsigned AddressSpace,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I = nullptr);
+ int getMaskedMemoryOpCost(
+ unsigned Opcode, Type *Src, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+ int getGatherScatterOpCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ bool VariableMask, Align Alignment,
+ TTI::TargetCostKind CostKind,
+ const Instruction *I);
int getAddressComputationCost(Type *PtrTy, ScalarEvolution *SE,
const SCEV *Ptr);
unsigned getAtomicMemIntrinsicMaxElementSize() const;
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Type *> Tys, FastMathFlags FMF,
- unsigned ScalarizationCostPassed = UINT_MAX);
- int getIntrinsicInstrCost(Intrinsic::ID IID, Type *RetTy,
- ArrayRef<Value *> Args, FastMathFlags FMF,
- unsigned VF = 1);
-
- int getArithmeticReductionCost(unsigned Opcode, Type *Ty,
- bool IsPairwiseForm);
-
- int getMinMaxReductionCost(Type *Ty, Type *CondTy, bool IsPairwiseForm,
- bool IsUnsigned);
-
- int getInterleavedMemoryOpCost(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
- int getInterleavedMemoryOpCostAVX512(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
- int getInterleavedMemoryOpCostAVX2(unsigned Opcode, Type *VecTy,
- unsigned Factor, ArrayRef<unsigned> Indices,
- unsigned Alignment, unsigned AddressSpace,
- bool UseMaskForCond = false,
- bool UseMaskForGaps = false);
+ int getTypeBasedIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+ int getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
+ TTI::TargetCostKind CostKind);
+
+ int getArithmeticReductionCost(unsigned Opcode, VectorType *Ty,
+ bool IsPairwiseForm,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency);
+
+ int getMinMaxCost(Type *Ty, Type *CondTy, bool IsUnsigned);
+
+ int getMinMaxReductionCost(VectorType *Ty, VectorType *CondTy,
+ bool IsPairwiseForm, bool IsUnsigned,
+ TTI::TargetCostKind CostKind);
+
+ int getInterleavedMemoryOpCost(
+ unsigned Opcode, Type *VecTy, unsigned Factor, ArrayRef<unsigned> Indices,
+ Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCostAVX512(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
+ int getInterleavedMemoryOpCostAVX2(
+ unsigned Opcode, FixedVectorType *VecTy, unsigned Factor,
+ ArrayRef<unsigned> Indices, Align Alignment, unsigned AddressSpace,
+ TTI::TargetCostKind CostKind = TTI::TCK_SizeAndLatency,
+ bool UseMaskForCond = false, bool UseMaskForGaps = false);
int getIntImmCost(int64_t);
- int getIntImmCost(const APInt &Imm, Type *Ty);
+ int getIntImmCost(const APInt &Imm, Type *Ty, TTI::TargetCostKind CostKind);
- unsigned getUserCost(const User *U, ArrayRef<const Value *> Operands);
+ unsigned getCFInstrCost(unsigned Opcode, TTI::TargetCostKind CostKind);
- int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty);
+ int getIntImmCostInst(unsigned Opcode, unsigned Idx, const APInt &Imm, Type *Ty,
+ TTI::TargetCostKind CostKind);
int getIntImmCostIntrin(Intrinsic::ID IID, unsigned Idx, const APInt &Imm,
- Type *Ty);
+ Type *Ty, TTI::TargetCostKind CostKind);
bool isLSRCostLess(TargetTransformInfo::LSRCost &C1,
TargetTransformInfo::LSRCost &C2);
bool canMacroFuseCmp();
- bool isLegalMaskedLoad(Type *DataType, MaybeAlign Alignment);
- bool isLegalMaskedStore(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedLoad(Type *DataType, Align Alignment);
+ bool isLegalMaskedStore(Type *DataType, Align Alignment);
bool isLegalNTLoad(Type *DataType, Align Alignment);
bool isLegalNTStore(Type *DataType, Align Alignment);
- bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment);
- bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment);
+ bool isLegalMaskedGather(Type *DataType, Align Alignment);
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment);
bool isLegalMaskedExpandLoad(Type *DataType);
bool isLegalMaskedCompressStore(Type *DataType);
bool hasDivRemOp(Type *DataType, bool IsSigned);
@@ -203,11 +215,20 @@ public:
TTI::MemCmpExpansionOptions enableMemCmpExpansion(bool OptSize,
bool IsZeroCmp) const;
bool enableInterleavedAccessVectorization();
+
+ /// Allow vectorizers to form reduction intrinsics in IR. The IR is expanded
+ /// into shuffles and vector math/logic by the backend
+ /// (see TTI::shouldExpandReduction)
+ bool useReductionIntrinsic(unsigned Opcode, Type *Ty,
+ TTI::ReductionFlags Flags) const {
+ return true;
+ }
+
private:
int getGSScalarCost(unsigned Opcode, Type *DataTy, bool VariableMask,
- unsigned Alignment, unsigned AddressSpace);
- int getGSVectorCost(unsigned Opcode, Type *DataTy, Value *Ptr,
- unsigned Alignment, unsigned AddressSpace);
+ Align Alignment, unsigned AddressSpace);
+ int getGSVectorCost(unsigned Opcode, Type *DataTy, const Value *Ptr,
+ Align Alignment, unsigned AddressSpace);
/// @}
};
diff --git a/llvm/lib/Target/X86/X86VZeroUpper.cpp b/llvm/lib/Target/X86/X86VZeroUpper.cpp
index 7a8308ef1ba9..c188c7443625 100644
--- a/llvm/lib/Target/X86/X86VZeroUpper.cpp
+++ b/llvm/lib/Target/X86/X86VZeroUpper.cpp
@@ -39,6 +39,11 @@ using namespace llvm;
#define DEBUG_TYPE "x86-vzeroupper"
+static cl::opt<bool>
+UseVZeroUpper("x86-use-vzeroupper", cl::Hidden,
+ cl::desc("Minimize AVX to SSE transition penalty"),
+ cl::init(true));
+
STATISTIC(NumVZU, "Number of vzeroupper instructions inserted");
namespace {
@@ -278,6 +283,9 @@ void VZeroUpperInserter::processBasicBlock(MachineBasicBlock &MBB) {
/// Loop over all of the basic blocks, inserting vzeroupper instructions before
/// function calls.
bool VZeroUpperInserter::runOnMachineFunction(MachineFunction &MF) {
+ if (!UseVZeroUpper)
+ return false;
+
const X86Subtarget &ST = MF.getSubtarget<X86Subtarget>();
if (!ST.hasAVX() || !ST.insertVZEROUPPER())
return false;
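(Usage note, an assumption about typical invocation: the new option is cl::Hidden and defaults to true, so behavior is unchanged unless it is disabled explicitly, e.g. by passing -x86-use-vzeroupper=false to llc, or via -mllvm -x86-use-vzeroupper=false through clang.)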
diff --git a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp b/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
index 42e8fba2201e..72593afb2258 100644
--- a/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
+++ b/llvm/lib/Target/X86/X86WinAllocaExpander.cpp
@@ -19,6 +19,7 @@
#include "X86InstrInfo.h"
#include "X86MachineFunctionInfo.h"
#include "X86Subtarget.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/CodeGen/MachineFunctionPass.h"
#include "llvm/CodeGen/MachineInstrBuilder.h"
diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp
index 78d3f6460189..8627bbbf18d2 100644
--- a/llvm/lib/Target/X86/X86WinEHState.cpp
+++ b/llvm/lib/Target/X86/X86WinEHState.cpp
@@ -19,7 +19,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/CodeGen/MachineModuleInfo.h"
#include "llvm/CodeGen/WinEHFuncInfo.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/IR/CFG.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -67,13 +67,13 @@ private:
Function *generateLSDAInEAXThunk(Function *ParentFunc);
- bool isStateStoreNeeded(EHPersonality Personality, CallSite CS);
- void rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F, CallSite CS,
- Value *State);
+ bool isStateStoreNeeded(EHPersonality Personality, CallBase &Call);
+ void rewriteSetJmpCall(IRBuilder<> &Builder, Function &F, CallBase &Call,
+ Value *State);
int getBaseStateForBB(DenseMap<BasicBlock *, ColorVector> &BlockColors,
WinEHFuncInfo &FuncInfo, BasicBlock *BB);
- int getStateForCallSite(DenseMap<BasicBlock *, ColorVector> &BlockColors,
- WinEHFuncInfo &FuncInfo, CallSite CS);
+ int getStateForCall(DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ WinEHFuncInfo &FuncInfo, CallBase &Call);
// Module-level type getters.
Type *getEHLinkRegistrationType();
@@ -455,16 +455,14 @@ void WinEHStatePass::unlinkExceptionRegistration(IRBuilder<> &Builder) {
// The idea behind _setjmp3 is that it takes an optional number of personality
// specific parameters to indicate how to restore the personality-specific frame
// state when longjmp is initiated. Typically, the current TryLevel is saved.
-void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
- CallSite CS, Value *State) {
+void WinEHStatePass::rewriteSetJmpCall(IRBuilder<> &Builder, Function &F,
+ CallBase &Call, Value *State) {
// Don't rewrite calls with a weird number of arguments.
- if (CS.getNumArgOperands() != 2)
+ if (Call.getNumArgOperands() != 2)
return;
- Instruction *Inst = CS.getInstruction();
-
SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ Call.getOperandBundlesAsDefs(OpBundles);
SmallVector<Value *, 3> OptionalArgs;
if (Personality == EHPersonality::MSVC_CXX) {
@@ -482,29 +480,27 @@ void WinEHStatePass::rewriteSetJmpCallSite(IRBuilder<> &Builder, Function &F,
SmallVector<Value *, 5> Args;
Args.push_back(
- Builder.CreateBitCast(CS.getArgOperand(0), Builder.getInt8PtrTy()));
+ Builder.CreateBitCast(Call.getArgOperand(0), Builder.getInt8PtrTy()));
Args.push_back(Builder.getInt32(OptionalArgs.size()));
Args.append(OptionalArgs.begin(), OptionalArgs.end());
- CallSite NewCS;
- if (CS.isCall()) {
- auto *CI = cast<CallInst>(Inst);
+ CallBase *NewCall;
+ if (auto *CI = dyn_cast<CallInst>(&Call)) {
CallInst *NewCI = Builder.CreateCall(SetJmp3, Args, OpBundles);
NewCI->setTailCallKind(CI->getTailCallKind());
- NewCS = NewCI;
+ NewCall = NewCI;
} else {
- auto *II = cast<InvokeInst>(Inst);
- NewCS = Builder.CreateInvoke(
+ auto *II = cast<InvokeInst>(&Call);
+ NewCall = Builder.CreateInvoke(
SetJmp3, II->getNormalDest(), II->getUnwindDest(), Args, OpBundles);
}
- NewCS.setCallingConv(CS.getCallingConv());
- NewCS.setAttributes(CS.getAttributes());
- NewCS->setDebugLoc(CS->getDebugLoc());
-
- Instruction *NewInst = NewCS.getInstruction();
- NewInst->takeName(Inst);
- Inst->replaceAllUsesWith(NewInst);
- Inst->eraseFromParent();
+ NewCall->setCallingConv(Call.getCallingConv());
+ NewCall->setAttributes(Call.getAttributes());
+ NewCall->setDebugLoc(Call.getDebugLoc());
+
+ NewCall->takeName(&Call);
+ Call.replaceAllUsesWith(NewCall);
+ Call.eraseFromParent();
}
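(Sketch, not part of the patch: the rewrite above emits a call of the shape _setjmp3(buf, N, extra0, ..., extraN-1), where buf is the first original argument cast to i8*, N is OptionalArgs.size(), and the extras are the personality-specific state values collected above. A plausible C prototype for the CRT helper, stated here as an assumption, is int __cdecl _setjmp3(void *JmpBuf, int Count, ...);)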
// Figure out what state we should assign to calls in this block.
@@ -527,17 +523,17 @@ int WinEHStatePass::getBaseStateForBB(
}
// Calculate the state a call-site is in.
-int WinEHStatePass::getStateForCallSite(
+int WinEHStatePass::getStateForCall(
DenseMap<BasicBlock *, ColorVector> &BlockColors, WinEHFuncInfo &FuncInfo,
- CallSite CS) {
- if (auto *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ CallBase &Call) {
+ if (auto *II = dyn_cast<InvokeInst>(&Call)) {
// Look up the state number of the EH pad this unwinds to.
assert(FuncInfo.InvokeStateMap.count(II) && "invoke has no state!");
return FuncInfo.InvokeStateMap[II];
}
// Possibly throwing call instructions have no actions to take after
// an unwind. Ensure they are in the -1 state.
- return getBaseStateForBB(BlockColors, FuncInfo, CS.getParent());
+ return getBaseStateForBB(BlockColors, FuncInfo, Call.getParent());
}
// Calculate the intersection of all the FinalStates for a BasicBlock's
@@ -618,16 +614,13 @@ static int getSuccState(DenseMap<BasicBlock *, int> &InitialStates, Function &F,
}
bool WinEHStatePass::isStateStoreNeeded(EHPersonality Personality,
- CallSite CS) {
- if (!CS)
- return false;
-
+ CallBase &Call) {
// If the function touches memory, it needs a state store.
if (isAsynchronousEHPersonality(Personality))
- return !CS.doesNotAccessMemory();
+ return !Call.doesNotAccessMemory();
// If the function throws, it needs a state store.
- return !CS.doesNotThrow();
+ return !Call.doesNotThrow();
}
void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
@@ -672,11 +665,11 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
if (&F.getEntryBlock() == BB)
InitialState = FinalState = ParentBaseState;
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!isStateStoreNeeded(Personality, CS))
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call || !isStateStoreNeeded(Personality, *Call))
continue;
- int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ int State = getStateForCall(BlockColors, FuncInfo, *Call);
if (InitialState == OverdefinedState)
InitialState = State;
FinalState = State;
@@ -739,11 +732,11 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
<< " PrevState=" << PrevState << '\n');
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!isStateStoreNeeded(Personality, CS))
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call || !isStateStoreNeeded(Personality, *Call))
continue;
- int State = getStateForCallSite(BlockColors, FuncInfo, CS);
+ int State = getStateForCall(BlockColors, FuncInfo, *Call);
if (State != PrevState)
insertStateNumberStore(&I, State);
PrevState = State;
@@ -756,35 +749,35 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) {
insertStateNumberStore(BB->getTerminator(), EndState->second);
}
- SmallVector<CallSite, 1> SetJmp3CallSites;
+ SmallVector<CallBase *, 1> SetJmp3Calls;
for (BasicBlock *BB : RPOT) {
for (Instruction &I : *BB) {
- CallSite CS(&I);
- if (!CS)
+ auto *Call = dyn_cast<CallBase>(&I);
+ if (!Call)
continue;
- if (CS.getCalledValue()->stripPointerCasts() !=
+ if (Call->getCalledOperand()->stripPointerCasts() !=
SetJmp3.getCallee()->stripPointerCasts())
continue;
- SetJmp3CallSites.push_back(CS);
+ SetJmp3Calls.push_back(Call);
}
}
- for (CallSite CS : SetJmp3CallSites) {
- auto &BBColors = BlockColors[CS->getParent()];
+ for (CallBase *Call : SetJmp3Calls) {
+ auto &BBColors = BlockColors[Call->getParent()];
BasicBlock *FuncletEntryBB = BBColors.front();
bool InCleanup = isa<CleanupPadInst>(FuncletEntryBB->getFirstNonPHI());
- IRBuilder<> Builder(CS.getInstruction());
+ IRBuilder<> Builder(Call);
Value *State;
if (InCleanup) {
Value *StateField = Builder.CreateStructGEP(RegNode->getAllocatedType(),
RegNode, StateFieldIndex);
State = Builder.CreateLoad(Builder.getInt32Ty(), StateField);
} else {
- State = Builder.getInt32(getStateForCallSite(BlockColors, FuncInfo, CS));
+ State = Builder.getInt32(getStateForCall(BlockColors, FuncInfo, *Call));
}
- rewriteSetJmpCallSite(Builder, F, CS, State);
+ rewriteSetJmpCall(Builder, F, *Call, State);
}
}
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
index ae19e2a78eec..4c1c87cc1e68 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCAsmInfo.cpp
@@ -28,5 +28,7 @@ XCoreMCAsmInfo::XCoreMCAsmInfo(const Triple &TT) {
// Debug
ExceptionsType = ExceptionHandling::DwarfCFI;
DwarfRegNumForCFI = true;
+
+ UseIntegratedAssembler = false;
}
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
index 46ebccee521e..4de252548961 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp
@@ -60,7 +60,7 @@ static MCAsmInfo *createXCoreMCAsmInfo(const MCRegisterInfo &MRI,
MCAsmInfo *MAI = new XCoreMCAsmInfo(TT);
// Initial state of the frame pointer is SP.
- MCCFIInstruction Inst = MCCFIInstruction::createDefCfa(nullptr, XCore::SP, 0);
+ MCCFIInstruction Inst = MCCFIInstruction::cfiDefCfa(nullptr, XCore::SP, 0);
MAI->addInitialFrameState(Inst);
return MAI;
diff --git a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
index 3e56302f4add..096b22415a22 100644
--- a/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
+++ b/llvm/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.h
@@ -13,12 +13,6 @@
#ifndef LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
#define LLVM_LIB_TARGET_XCORE_MCTARGETDESC_XCOREMCTARGETDESC_H
-namespace llvm {
-
-class Target;
-
-} // end namespace llvm
-
// Defines symbolic names for XCore registers. This defines a mapping from
// register name to register number.
//
diff --git a/llvm/lib/Target/XCore/XCore.h b/llvm/lib/Target/XCore/XCore.h
index b7b86be9ab51..d31c34910ef6 100644
--- a/llvm/lib/Target/XCore/XCore.h
+++ b/llvm/lib/Target/XCore/XCore.h
@@ -22,7 +22,6 @@ namespace llvm {
class ModulePass;
class TargetMachine;
class XCoreTargetMachine;
- class formatted_raw_ostream;
void initializeXCoreLowerThreadLocalPass(PassRegistry &p);
diff --git a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
index 35dc56e90419..4ea775305e12 100644
--- a/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
+++ b/llvm/lib/Target/XCore/XCoreAsmPrinter.cpp
@@ -72,12 +72,12 @@ namespace {
const char *ExtraCode, raw_ostream &O) override;
void emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV);
- void EmitGlobalVariable(const GlobalVariable *GV) override;
+ void emitGlobalVariable(const GlobalVariable *GV) override;
- void EmitFunctionEntryLabel() override;
- void EmitInstruction(const MachineInstr *MI) override;
- void EmitFunctionBodyStart() override;
- void EmitFunctionBodyEnd() override;
+ void emitFunctionEntryLabel() override;
+ void emitInstruction(const MachineInstr *MI) override;
+ void emitFunctionBodyStart() override;
+ void emitFunctionBodyEnd() override;
};
} // end of anonymous namespace
@@ -93,21 +93,20 @@ void XCoreAsmPrinter::emitArrayBound(MCSymbol *Sym, const GlobalVariable *GV) {
MCSymbol *SymGlob = OutContext.getOrCreateSymbol(
Twine(Sym->getName() + StringRef(".globound")));
- OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Global);
- OutStreamer->EmitAssignment(SymGlob,
+ OutStreamer->emitSymbolAttribute(SymGlob, MCSA_Global);
+ OutStreamer->emitAssignment(SymGlob,
MCConstantExpr::create(ATy->getNumElements(),
OutContext));
if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
GV->hasCommonLinkage()) {
- OutStreamer->EmitSymbolAttribute(SymGlob, MCSA_Weak);
+ OutStreamer->emitSymbolAttribute(SymGlob, MCSA_Weak);
}
}
}
-void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
+void XCoreAsmPrinter::emitGlobalVariable(const GlobalVariable *GV) {
// Check to see if this is a special global used by LLVM; if so, emit it.
- if (!GV->hasInitializer() ||
- EmitSpecialLLVMGlobal(GV))
+ if (!GV->hasInitializer() || emitSpecialLLVMGlobal(GV))
return;
const DataLayout &DL = getDataLayout();
@@ -130,11 +129,11 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
case GlobalValue::ExternalLinkage:
case GlobalValue::CommonLinkage:
emitArrayBound(GVSym, GV);
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Global);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Global);
if (GV->hasWeakLinkage() || GV->hasLinkOnceLinkage() ||
GV->hasCommonLinkage())
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_Weak);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_Weak);
LLVM_FALLTHROUGH;
case GlobalValue::InternalLinkage:
case GlobalValue::PrivateLinkage:
@@ -143,43 +142,43 @@ void XCoreAsmPrinter::EmitGlobalVariable(const GlobalVariable *GV) {
llvm_unreachable("Unknown linkage type!");
}
- EmitAlignment(std::max(Alignment, Align(4)), GV);
+ emitAlignment(std::max(Alignment, Align(4)), GV);
if (GV->isThreadLocal()) {
report_fatal_error("TLS is not supported by this target!");
}
unsigned Size = DL.getTypeAllocSize(C->getType());
if (MAI->hasDotTypeDotSizeDirective()) {
- OutStreamer->EmitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
+ OutStreamer->emitSymbolAttribute(GVSym, MCSA_ELF_TypeObject);
OutStreamer->emitELFSize(GVSym, MCConstantExpr::create(Size, OutContext));
}
- OutStreamer->EmitLabel(GVSym);
+ OutStreamer->emitLabel(GVSym);
- EmitGlobalConstant(DL, C);
+ emitGlobalConstant(DL, C);
// The ABI requires that unsigned scalar types smaller than 32 bits
// are padded to 32 bits.
if (Size < 4)
- OutStreamer->EmitZeros(4 - Size);
+ OutStreamer->emitZeros(4 - Size);
// Mark the end of the global
getTargetStreamer().emitCCBottomData(GVSym->getName());
}
-void XCoreAsmPrinter::EmitFunctionBodyStart() {
+void XCoreAsmPrinter::emitFunctionBodyStart() {
MCInstLowering.Initialize(&MF->getContext());
}
/// emitFunctionBodyEnd - Targets can override this to emit stuff after
/// the last basic block in the function.
-void XCoreAsmPrinter::EmitFunctionBodyEnd() {
+void XCoreAsmPrinter::emitFunctionBodyEnd() {
// Emit function end directives
getTargetStreamer().emitCCBottomFunction(CurrentFnSym->getName());
}
-void XCoreAsmPrinter::EmitFunctionEntryLabel() {
+void XCoreAsmPrinter::emitFunctionEntryLabel() {
// Mark the start of the function
getTargetStreamer().emitCCTopFunction(CurrentFnSym->getName());
- OutStreamer->EmitLabel(CurrentFnSym);
+ OutStreamer->emitLabel(CurrentFnSym);
}
void XCoreAsmPrinter::
@@ -256,7 +255,7 @@ bool XCoreAsmPrinter::PrintAsmMemoryOperand(const MachineInstr *MI,
return false;
}
-void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
+void XCoreAsmPrinter::emitInstruction(const MachineInstr *MI) {
SmallString<128> Str;
raw_svector_ostream O(Str);
@@ -268,7 +267,7 @@ void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
O << "\tmov "
<< XCoreInstPrinter::getRegisterName(MI->getOperand(0).getReg()) << ", "
<< XCoreInstPrinter::getRegisterName(MI->getOperand(1).getReg());
- OutStreamer->EmitRawText(O.str());
+ OutStreamer->emitRawText(O.str());
return;
}
break;
@@ -281,7 +280,7 @@ void XCoreAsmPrinter::EmitInstruction(const MachineInstr *MI) {
else
printInlineJT32(MI, 0, O);
O << '\n';
- OutStreamer->EmitRawText(O.str());
+ OutStreamer->emitRawText(O.str());
return;
}
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
index fd8b37e26e47..27ac6a4d1439 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp
@@ -73,7 +73,7 @@ static void EmitDefCfaOffset(MachineBasicBlock &MBB,
int Offset) {
MachineFunction &MF = *MBB.getParent();
unsigned CFIIndex =
- MF.addFrameInst(MCCFIInstruction::createDefCfaOffset(nullptr, -Offset));
+ MF.addFrameInst(MCCFIInstruction::cfiDefCfaOffset(nullptr, Offset));
BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::CFI_INSTRUCTION))
.addCFIIndex(CFIIndex);
}
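(Note on the dropped negation, my reading of the new MC API: cfiDefCfaOffset takes the CFA offset directly, while the old createDefCfaOffset negated its argument internally, so callers had to pass -Offset. For a CFA offset of 8 the call changes from createDefCfaOffset(nullptr, -8) to cfiDefCfaOffset(nullptr, 8).)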
@@ -179,7 +179,7 @@ static MachineMemOperand *getFrameIndexMMO(MachineBasicBlock &MBB,
const MachineFrameInfo &MFI = MF->getFrameInfo();
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FrameIndex), flags,
- MFI.getObjectSize(FrameIndex), MFI.getObjectAlignment(FrameIndex));
+ MFI.getObjectSize(FrameIndex), MFI.getObjectAlign(FrameIndex));
return MMO;
}
@@ -233,9 +233,9 @@ void XCoreFrameLowering::emitPrologue(MachineFunction &MF,
// to determine the end of the prologue.
DebugLoc dl;
- if (MFI.getMaxAlignment() > getStackAlignment())
- report_fatal_error("emitPrologue unsupported alignment: "
- + Twine(MFI.getMaxAlignment()));
+ if (MFI.getMaxAlign() > getStackAlign())
+ report_fatal_error("emitPrologue unsupported alignment: " +
+ Twine(MFI.getMaxAlign().value()));
const AttributeList &PAL = MF.getFunction().getAttributes();
if (PAL.hasAttrSomewhere(Attribute::Nest))
@@ -412,11 +412,9 @@ void XCoreFrameLowering::emitEpilogue(MachineFunction &MF,
} // else Don't erase the return instruction.
}
-bool XCoreFrameLowering::
-spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const {
+bool XCoreFrameLowering::spillCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
if (CSI.empty())
return true;
@@ -429,8 +427,7 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
if (MI != MBB.end() && !MI->isDebugInstr())
DL = MI->getDebugLoc();
- for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
- it != CSI.end(); ++it) {
+ for (auto it = CSI.begin(); it != CSI.end(); ++it) {
unsigned Reg = it->getReg();
assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
"LR & FP are always handled in emitPrologue");
@@ -448,25 +445,22 @@ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
return true;
}
-bool XCoreFrameLowering::
-restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const{
+bool XCoreFrameLowering::restoreCalleeSavedRegisters(
+ MachineBasicBlock &MBB, MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI, const TargetRegisterInfo *TRI) const {
MachineFunction *MF = MBB.getParent();
const TargetInstrInfo &TII = *MF->getSubtarget().getInstrInfo();
bool AtStart = MI == MBB.begin();
MachineBasicBlock::iterator BeforeI = MI;
if (!AtStart)
--BeforeI;
- for (std::vector<CalleeSavedInfo>::const_iterator it = CSI.begin();
- it != CSI.end(); ++it) {
- unsigned Reg = it->getReg();
+ for (const CalleeSavedInfo &CSR : CSI) {
+ unsigned Reg = CSR.getReg();
assert(Reg != XCore::LR && !(Reg == XCore::R10 && hasFP(*MF)) &&
"LR & FP are always handled in emitEpilogue");
const TargetRegisterClass *RC = TRI->getMinimalPhysRegClass(Reg);
- TII.loadRegFromStackSlot(MBB, MI, Reg, it->getFrameIdx(), RC, TRI);
+ TII.loadRegFromStackSlot(MBB, MI, Reg, CSR.getFrameIdx(), RC, TRI);
assert(MI != MBB.begin() &&
"loadRegFromStackSlot didn't insert any code!");
// Insert in reverse order. loadRegFromStackSlot can insert multiple
@@ -496,8 +490,7 @@ MachineBasicBlock::iterator XCoreFrameLowering::eliminateCallFramePseudoInstr(
// We need to keep the stack aligned properly. To do this, we round the
// amount of space needed for the outgoing arguments up to the next
// alignment boundary.
- unsigned Align = getStackAlignment();
- Amount = (Amount+Align-1)/Align*Align;
+ Amount = alignTo(Amount, getStackAlign());
assert(Amount%4 == 0);
Amount /= 4;
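(Worked example: with XCore's 4-byte stack alignment, an outgoing-argument Amount of 10 bytes becomes alignTo(10, 4) = 12, and the divide by 4 then yields 3 stack words.)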
@@ -582,9 +575,9 @@ processFunctionBeforeFrameFinalized(MachineFunction &MF,
// When using SP for large frames, we may need 2 scratch registers.
// When using FP, for large or small frames, we may need 1 scratch register.
unsigned Size = TRI.getSpillSize(RC);
- unsigned Align = TRI.getSpillAlignment(RC);
+ Align Alignment = TRI.getSpillAlign(RC);
if (XFI->isLargeFrame(MF) || hasFP(MF))
- RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Alignment, false));
if (XFI->isLargeFrame(MF) && !hasFP(MF))
- RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Align, false));
+ RS->addScavengingFrameIndex(MFI.CreateStackObject(Size, Alignment, false));
}
diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.h b/llvm/lib/Target/XCore/XCoreFrameLowering.h
index 95c3a2973033..a914d82e1989 100644
--- a/llvm/lib/Target/XCore/XCoreFrameLowering.h
+++ b/llvm/lib/Target/XCore/XCoreFrameLowering.h
@@ -31,14 +31,16 @@ namespace llvm {
void emitEpilogue(MachineFunction &MF,
MachineBasicBlock &MBB) const override;
- bool spillCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- const std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
- bool restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
- MachineBasicBlock::iterator MI,
- std::vector<CalleeSavedInfo> &CSI,
- const TargetRegisterInfo *TRI) const override;
+ bool
+ spillCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ ArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
+ bool
+ restoreCalleeSavedRegisters(MachineBasicBlock &MBB,
+ MachineBasicBlock::iterator MI,
+ MutableArrayRef<CalleeSavedInfo> CSI,
+ const TargetRegisterInfo *TRI) const override;
MachineBasicBlock::iterator
eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB,
diff --git a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
index b1f9717fbddc..b300697cc5ae 100644
--- a/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelDAGToDAG.cpp
@@ -152,7 +152,7 @@ void XCoreDAGToDAGISel::Select(SDNode *N) {
CurDAG->getEntryNode());
MachineMemOperand *MemOp =
MF->getMachineMemOperand(MachinePointerInfo::getConstantPool(*MF),
- MachineMemOperand::MOLoad, 4, 4);
+ MachineMemOperand::MOLoad, 4, Align(4));
CurDAG->setNodeMemRefs(cast<MachineSDNode>(node), {MemOp});
ReplaceNode(N, node);
return;
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.cpp b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
index 70770f4c8e7c..c32653137a10 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.cpp
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.cpp
@@ -107,6 +107,7 @@ XCoreTargetLowering::XCoreTargetLowering(const TargetMachine &TM,
setOperationAction(ISD::CTPOP, MVT::i32, Expand);
setOperationAction(ISD::ROTL , MVT::i32, Expand);
setOperationAction(ISD::ROTR , MVT::i32, Expand);
+ setOperationAction(ISD::BITREVERSE , MVT::i32, Legal);
setOperationAction(ISD::TRAP, MVT::Other, Legal);
@@ -328,10 +329,10 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const
SDValue Res;
if (CP->isMachineConstantPoolEntry()) {
Res = DAG.getTargetConstantPool(CP->getMachineCPVal(), PtrVT,
- CP->getAlignment(), CP->getOffset());
+ CP->getAlign(), CP->getOffset());
} else {
- Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT,
- CP->getAlignment(), CP->getOffset());
+ Res = DAG.getTargetConstantPool(CP->getConstVal(), PtrVT, CP->getAlign(),
+ CP->getOffset());
}
return DAG.getNode(XCoreISD::CPRelativeWrapper, dl, MVT::i32, Res);
}
@@ -434,7 +435,7 @@ SDValue XCoreTargetLowering::LowerLOAD(SDValue Op, SelectionDAG &DAG) const {
Offset, DAG);
}
if (TLI.isGAPlusOffset(BasePtr.getNode(), GV, Offset) &&
- MinAlign(GV->getAlignment(), 4) == 4) {
+ GV->getPointerAlignment(DAG.getDataLayout()) >= 4) {
SDValue NewBasePtr = DAG.getGlobalAddress(GV, DL,
BasePtr->getValueType(0));
return lowerLoadWordFromAlignedBasePlusOffset(DL, Chain, NewBasePtr,
@@ -996,7 +997,7 @@ LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const {
}
MachineMemOperand::Flags
-XCoreTargetLowering::getMMOFlags(const Instruction &I) const {
+XCoreTargetLowering::getTargetMMOFlags(const Instruction &I) const {
// Because of how we convert atomic_load and atomic_store to normal loads and
// stores in the DAG, we need to ensure that the MMOs are marked volatile
// since DAGCombine hasn't been updated to account for atomic, but non
@@ -1118,7 +1119,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo(
// The ABI dictates there should be one stack slot available to the callee
// on function entry (for saving lr).
- CCInfo.AllocateStack(4, 4);
+ CCInfo.AllocateStack(4, Align(4));
CCInfo.AnalyzeCallOperands(Outs, CC_XCore);
@@ -1126,7 +1127,7 @@ SDValue XCoreTargetLowering::LowerCCCCallTo(
// Analyze return values to determine the number of bytes of stack required.
CCState RetCCInfo(CallConv, isVarArg, DAG.getMachineFunction(), RVLocs,
*DAG.getContext());
- RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), 4);
+ RetCCInfo.AllocateStack(CCInfo.getNextStackOffset(), Align(4));
RetCCInfo.AnalyzeCallResult(Ins, RetCC_XCore);
// Get a count of how many bytes are to be pushed on the stack.
@@ -1391,16 +1392,16 @@ SDValue XCoreTargetLowering::LowerCCCArguments(
ArgDI != ArgDE; ++ArgDI) {
if (ArgDI->Flags.isByVal() && ArgDI->Flags.getByValSize()) {
unsigned Size = ArgDI->Flags.getByValSize();
- unsigned Align = std::max(StackSlotSize, ArgDI->Flags.getByValAlign());
+ Align Alignment =
+ std::max(Align(StackSlotSize), ArgDI->Flags.getNonZeroByValAlign());
// Create a new object on the stack and copy the pointee into it.
- int FI = MFI.CreateStackObject(Size, Align, false);
+ int FI = MFI.CreateStackObject(Size, Alignment, false);
SDValue FIN = DAG.getFrameIndex(FI, MVT::i32);
InVals.push_back(FIN);
- MemOps.push_back(DAG.getMemcpy(Chain, dl, FIN, ArgDI->SDV,
- DAG.getConstant(Size, dl, MVT::i32),
- Align, false, false, false,
- MachinePointerInfo(),
- MachinePointerInfo()));
+ MemOps.push_back(DAG.getMemcpy(
+ Chain, dl, FIN, ArgDI->SDV, DAG.getConstant(Size, dl, MVT::i32),
+ Alignment, false, false, false, MachinePointerInfo(),
+ MachinePointerInfo()));
} else {
InVals.push_back(ArgDI->SDV);
}
@@ -1454,7 +1455,7 @@ XCoreTargetLowering::LowerReturn(SDValue Chain, CallingConv::ID CallConv,
// Analyze return values.
if (!isVarArg)
- CCInfo.AllocateStack(XFI->getReturnStackOffset(), 4);
+ CCInfo.AllocateStack(XFI->getReturnStackOffset(), Align(4));
CCInfo.AnalyzeReturn(Outs, RetCC_XCore);
@@ -1800,11 +1801,10 @@ SDValue XCoreTargetLowering::PerformDAGCombine(SDNode *N,
!LD->isVolatile() && !LD->isIndexed() &&
Chain.reachesChainWithoutSideEffects(SDValue(LD, 1))) {
bool isTail = isInTailCallPosition(DAG, ST, Chain);
- return DAG.getMemmove(Chain, dl, ST->getBasePtr(),
- LD->getBasePtr(),
- DAG.getConstant(StoreBits/8, dl, MVT::i32),
- Alignment, false, isTail, ST->getPointerInfo(),
- LD->getPointerInfo());
+ return DAG.getMemmove(Chain, dl, ST->getBasePtr(), LD->getBasePtr(),
+ DAG.getConstant(StoreBits / 8, dl, MVT::i32),
+ Align(Alignment), false, isTail,
+ ST->getPointerInfo(), LD->getPointerInfo());
}
}
break;
diff --git a/llvm/lib/Target/XCore/XCoreISelLowering.h b/llvm/lib/Target/XCore/XCoreISelLowering.h
index b4f25feda7fe..45c21fbf2b74 100644
--- a/llvm/lib/Target/XCore/XCoreISelLowering.h
+++ b/llvm/lib/Target/XCore/XCoreISelLowering.h
@@ -22,7 +22,6 @@ namespace llvm {
// Forward declarations
class XCoreSubtarget;
- class XCoreTargetMachine;
namespace XCoreISD {
enum NodeType : unsigned {
@@ -127,14 +126,14 @@ namespace llvm {
/// If a physical register, this returns the register that receives the
/// exception address on entry to an EH pad.
- unsigned
+ Register
getExceptionPointerRegister(const Constant *PersonalityFn) const override {
return XCore::R0;
}
/// If a physical register, this returns the register that receives the
/// exception typeid on entry to a landing pad.
- unsigned
+ Register
getExceptionSelectorRegister(const Constant *PersonalityFn) const override {
return XCore::R1;
}
@@ -188,7 +187,8 @@ namespace llvm {
SDValue LowerATOMIC_LOAD(SDValue Op, SelectionDAG &DAG) const;
SDValue LowerATOMIC_STORE(SDValue Op, SelectionDAG &DAG) const;
- MachineMemOperand::Flags getMMOFlags(const Instruction &I) const override;
+ MachineMemOperand::Flags getTargetMMOFlags(
+ const Instruction &I) const override;
// Inline asm support
std::pair<unsigned, const TargetRegisterClass *>
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
index db44a56be538..1b21e1ce195b 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.cpp
@@ -163,7 +163,7 @@ static inline XCore::CondCode GetOppositeBranchCondition(XCore::CondCode CC)
}
}
-/// AnalyzeBranch - Analyze the branching code at the end of MBB, returning
+/// analyzeBranch - Analyze the branching code at the end of MBB, returning
/// true if it cannot be understood (e.g. it's a switch dispatch or isn't
/// implemented for a target). Upon success, this returns false and returns
/// with the following information in various cases:
@@ -357,7 +357,7 @@ void XCoreInstrInfo::copyPhysReg(MachineBasicBlock &MBB,
void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned SrcReg, bool isKill,
+ Register SrcReg, bool isKill,
int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const
@@ -370,7 +370,7 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FrameIndex),
MachineMemOperand::MOStore, MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MFI.getObjectAlign(FrameIndex));
BuildMI(MBB, I, DL, get(XCore::STWFI))
.addReg(SrcReg, getKillRegState(isKill))
.addFrameIndex(FrameIndex)
@@ -380,7 +380,7 @@ void XCoreInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB,
void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator I,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const
{
@@ -392,7 +392,7 @@ void XCoreInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineMemOperand *MMO = MF->getMachineMemOperand(
MachinePointerInfo::getFixedStack(*MF, FrameIndex),
MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIndex),
- MFI.getObjectAlignment(FrameIndex));
+ MFI.getObjectAlign(FrameIndex));
BuildMI(MBB, I, DL, get(XCore::LDWFI), DestReg)
.addFrameIndex(FrameIndex)
.addImm(0)
@@ -443,7 +443,7 @@ MachineBasicBlock::iterator XCoreInstrInfo::loadImmediate(
MachineConstantPool *ConstantPool = MBB.getParent()->getConstantPool();
const Constant *C = ConstantInt::get(
Type::getInt32Ty(MBB.getParent()->getFunction().getContext()), Value);
- unsigned Idx = ConstantPool->getConstantPoolIndex(C, 4);
+ unsigned Idx = ConstantPool->getConstantPoolIndex(C, Align(4));
return BuildMI(MBB, MI, dl, get(XCore::LDWCP_lru6), Reg)
.addConstantPoolIndex(Idx)
.getInstr();
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.h b/llvm/lib/Target/XCore/XCoreInstrInfo.h
index 057fb763efbf..1fbb293bde60 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.h
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.h
@@ -68,13 +68,13 @@ public:
void storeRegToStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned SrcReg, bool isKill, int FrameIndex,
+ Register SrcReg, bool isKill, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
void loadRegFromStackSlot(MachineBasicBlock &MBB,
MachineBasicBlock::iterator MI,
- unsigned DestReg, int FrameIndex,
+ Register DestReg, int FrameIndex,
const TargetRegisterClass *RC,
const TargetRegisterInfo *TRI) const override;
diff --git a/llvm/lib/Target/XCore/XCoreInstrInfo.td b/llvm/lib/Target/XCore/XCoreInstrInfo.td
index 18f02e1d80f0..aa3739d0335e 100644
--- a/llvm/lib/Target/XCore/XCoreInstrInfo.td
+++ b/llvm/lib/Target/XCore/XCoreInstrInfo.td
@@ -535,7 +535,7 @@ let hasSideEffects = 0, isReMaterializable = 1 in
def LDAWDP_ru6: _FRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
"ldaw $a, dp[$b]", []>;
-let isReMaterializable = 1 in
+let isReMaterializable = 1 in
def LDAWDP_lru6: _FLRU6<0b011000, (outs RRegs:$a), (ins i32imm:$b),
"ldaw $a, dp[$b]",
[(set RRegs:$a, (dprelwrapper tglobaladdr:$b))]>;
@@ -974,17 +974,17 @@ def SETDP_1r : _F1R<0b001100, (outs), (ins GRRegs:$a), "set dp, $a", []>;
let hasSideEffects=0 in
def SETCP_1r : _F1R<0b001101, (outs), (ins GRRegs:$a), "set cp, $a", []>;
-let hasCtrlDep = 1 in
+let hasCtrlDep = 1 in
def ECALLT_1r : _F1R<0b010011, (outs), (ins GRRegs:$a),
"ecallt $a",
[]>;
-let hasCtrlDep = 1 in
+let hasCtrlDep = 1 in
def ECALLF_1r : _F1R<0b010010, (outs), (ins GRRegs:$a),
"ecallf $a",
[]>;
-let isCall=1,
+let isCall=1,
// All calls clobber the link register and the non-callee-saved registers:
Defs = [R0, R1, R2, R3, R11, LR], Uses = [SP] in {
def BLA_1r : _F1R<0b001000, (outs), (ins GRRegs:$a),
@@ -1141,7 +1141,7 @@ def : Pat<(truncstorei8 GRRegs:$val, (add GRRegs:$addr, GRRegs:$offset)),
(ST8_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
def : Pat<(truncstorei8 GRRegs:$val, GRRegs:$addr),
(ST8_l3r GRRegs:$val, GRRegs:$addr, (LDC_ru6 0))>;
-
+
def : Pat<(truncstorei16 GRRegs:$val, (lda16f GRRegs:$addr, GRRegs:$offset)),
(ST16_l3r GRRegs:$val, GRRegs:$addr, GRRegs:$offset)>;
def : Pat<(truncstorei16 GRRegs:$val, GRRegs:$addr),
@@ -1154,6 +1154,9 @@ def : Pat<(store GRRegs:$val, (add GRRegs:$addr, immUs4:$offset)),
def : Pat<(store GRRegs:$val, GRRegs:$addr),
(STW_2rus GRRegs:$val, GRRegs:$addr, 0)>;
+/// bitrev
+def : Pat<(bitreverse GRRegs:$src), (BITREV_l2r GRRegs:$src)>;
+
/// cttz
def : Pat<(cttz GRRegs:$src), (CLZ_l2r (BITREV_l2r GRRegs:$src))>;
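(Worked example of the identity the existing cttz pattern relies on: cttz(x) == clz(bitreverse(x)) for 32-bit x. With x = 0x00000008, bitreverse(x) = 0x10000000 and clz(0x10000000) = 3 = cttz(0x00000008). The new pattern above simply selects BITREV_l2r directly for the bitreverse node.)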
diff --git a/llvm/lib/Target/XCore/XCoreMCInstLower.h b/llvm/lib/Target/XCore/XCoreMCInstLower.h
index 0eaa84ef736b..efb359cc57e1 100644
--- a/llvm/lib/Target/XCore/XCoreMCInstLower.h
+++ b/llvm/lib/Target/XCore/XCoreMCInstLower.h
@@ -8,6 +8,7 @@
#ifndef LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
#define LLVM_LIB_TARGET_XCORE_XCOREMCINSTLOWER_H
+
#include "llvm/CodeGen/MachineOperand.h"
#include "llvm/Support/Compiler.h"
@@ -16,8 +17,6 @@ namespace llvm {
class MCInst;
class MCOperand;
class MachineInstr;
- class MachineFunction;
- class Mangler;
class AsmPrinter;
/// This class is used to lower a MachineInstr into an MCInst.
diff --git a/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp b/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
index 0b4fcffbc655..ec44d2899dd5 100644
--- a/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreMachineFunctionInfo.cpp
@@ -43,7 +43,7 @@ int XCoreFunctionInfo::createLRSpillSlot(MachineFunction &MF) {
LRSpillSlot = MFI.CreateFixedObject(TRI.getSpillSize(RC), 0, true);
} else {
LRSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC),
- TRI.getSpillAlignment(RC), true);
+ TRI.getSpillAlign(RC), true);
}
LRSpillSlotSet = true;
return LRSpillSlot;
@@ -56,8 +56,8 @@ int XCoreFunctionInfo::createFPSpillSlot(MachineFunction &MF) {
const TargetRegisterClass &RC = XCore::GRRegsRegClass;
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
- FPSpillSlot = MFI.CreateStackObject(TRI.getSpillSize(RC),
- TRI.getSpillAlignment(RC), true);
+ FPSpillSlot =
+ MFI.CreateStackObject(TRI.getSpillSize(RC), TRI.getSpillAlign(RC), true);
FPSpillSlotSet = true;
return FPSpillSlot;
}
@@ -70,9 +70,9 @@ const int* XCoreFunctionInfo::createEHSpillSlot(MachineFunction &MF) {
const TargetRegisterInfo &TRI = *MF.getSubtarget().getRegisterInfo();
MachineFrameInfo &MFI = MF.getFrameInfo();
unsigned Size = TRI.getSpillSize(RC);
- unsigned Align = TRI.getSpillAlignment(RC);
- EHSpillSlot[0] = MFI.CreateStackObject(Size, Align, true);
- EHSpillSlot[1] = MFI.CreateStackObject(Size, Align, true);
+ Align Alignment = TRI.getSpillAlign(RC);
+ EHSpillSlot[0] = MFI.CreateStackObject(Size, Alignment, true);
+ EHSpillSlot[1] = MFI.CreateStackObject(Size, Alignment, true);
EHSpillSlotSet = true;
return EHSpillSlot;
}
diff --git a/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp b/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
index 56fed26ebd7b..6799823f6fcb 100644
--- a/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreRegisterInfo.cpp
@@ -246,11 +246,6 @@ XCoreRegisterInfo::requiresRegisterScavenging(const MachineFunction &MF) const {
}
bool
-XCoreRegisterInfo::trackLivenessAfterRegAlloc(const MachineFunction &MF) const {
- return true;
-}
-
-bool
XCoreRegisterInfo::useFPForScavengingIndex(const MachineFunction &MF) const {
return false;
}
diff --git a/llvm/lib/Target/XCore/XCoreRegisterInfo.h b/llvm/lib/Target/XCore/XCoreRegisterInfo.h
index 35a42e1a1457..f1eec7bc87b4 100644
--- a/llvm/lib/Target/XCore/XCoreRegisterInfo.h
+++ b/llvm/lib/Target/XCore/XCoreRegisterInfo.h
@@ -20,8 +20,6 @@
namespace llvm {
-class TargetInstrInfo;
-
struct XCoreRegisterInfo : public XCoreGenRegisterInfo {
public:
XCoreRegisterInfo();
@@ -34,8 +32,6 @@ public:
bool requiresRegisterScavenging(const MachineFunction &MF) const override;
- bool trackLivenessAfterRegAlloc(const MachineFunction &MF) const override;
-
bool useFPForScavengingIndex(const MachineFunction &MF) const override;
void eliminateFrameIndex(MachineBasicBlock::iterator II,
diff --git a/llvm/lib/Target/XCore/XCoreRegisterInfo.td b/llvm/lib/Target/XCore/XCoreRegisterInfo.td
index d9502939bae3..82f61d5865ab 100644
--- a/llvm/lib/Target/XCore/XCoreRegisterInfo.td
+++ b/llvm/lib/Target/XCore/XCoreRegisterInfo.td
@@ -7,7 +7,7 @@
//===----------------------------------------------------------------------===//
//===----------------------------------------------------------------------===//
-// Declarations that describe the XCore register file
+// Declarations that describe the XCore register file
//===----------------------------------------------------------------------===//
class XCoreReg<string n> : Register<n> {
@@ -24,17 +24,17 @@ class Ri<bits<4> num, string n> : XCoreReg<n> {
// CPU registers
def R0 : Ri< 0, "r0">, DwarfRegNum<[0]>;
def R1 : Ri< 1, "r1">, DwarfRegNum<[1]>;
-def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
+def R2 : Ri< 2, "r2">, DwarfRegNum<[2]>;
def R3 : Ri< 3, "r3">, DwarfRegNum<[3]>;
def R4 : Ri< 4, "r4">, DwarfRegNum<[4]>;
-def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
+def R5 : Ri< 5, "r5">, DwarfRegNum<[5]>;
def R6 : Ri< 6, "r6">, DwarfRegNum<[6]>;
def R7 : Ri< 7, "r7">, DwarfRegNum<[7]>;
def R8 : Ri< 8, "r8">, DwarfRegNum<[8]>;
-def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
+def R9 : Ri< 9, "r9">, DwarfRegNum<[9]>;
def R10 : Ri<10, "r10">, DwarfRegNum<[10]>;
def R11 : Ri<11, "r11">, DwarfRegNum<[11]>;
-def CP : Ri<12, "cp">, DwarfRegNum<[12]>;
+def CP : Ri<12, "cp">, DwarfRegNum<[12]>;
def DP : Ri<13, "dp">, DwarfRegNum<[13]>;
def SP : Ri<14, "sp">, DwarfRegNum<[14]>;
def LR : Ri<15, "lr">, DwarfRegNum<[15]>;
diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
index c86756e345a9..0d097076348c 100644
--- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
+++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.cpp
@@ -17,11 +17,11 @@ using namespace llvm;
SDValue XCoreSelectionDAGInfo::EmitTargetCodeForMemcpy(
SelectionDAG &DAG, const SDLoc &dl, SDValue Chain, SDValue Dst, SDValue Src,
- SDValue Size, unsigned Align, bool isVolatile, bool AlwaysInline,
+ SDValue Size, Align Alignment, bool isVolatile, bool AlwaysInline,
MachinePointerInfo DstPtrInfo, MachinePointerInfo SrcPtrInfo) const {
unsigned SizeBitWidth = Size.getValueSizeInBits();
// Call __memcpy_4 if the src, dst and size are all 4 byte aligned.
- if (!AlwaysInline && (Align & 3) == 0 &&
+ if (!AlwaysInline && Alignment >= Align(4) &&
DAG.MaskedValueIsZero(Size, APInt(SizeBitWidth, 3))) {
const TargetLowering &TLI = *DAG.getSubtarget().getTargetLowering();
TargetLowering::ArgListTy Args;
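The memcpy hook now takes an llvm::Align, so the divisibility test `(Align & 3) == 0` becomes the ordering test `Alignment >= Align(4)`. A small sketch, assuming only llvm/Support/Alignment.h, of why the two are equivalent for power-of-two alignments:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  using llvm::Align;
  // Align values are always powers of two, so "value is a multiple of 4"
  // is the same condition as "at least Align(4)".
  assert(Align(4) >= Align(4));
  assert(Align(16) >= Align(4));
  assert(!(Align(2) >= Align(4)));
  assert(Align(16).value() % 4 == 0);
  return 0;
}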
diff --git a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
index 5dcef08391c9..2abf52677978 100644
--- a/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
+++ b/llvm/lib/Target/XCore/XCoreSelectionDAGInfo.h
@@ -17,13 +17,11 @@
namespace llvm {
-class XCoreTargetMachine;
-
class XCoreSelectionDAGInfo : public SelectionDAGTargetInfo {
public:
SDValue EmitTargetCodeForMemcpy(SelectionDAG &DAG, const SDLoc &dl,
SDValue Chain, SDValue Op1, SDValue Op2,
- SDValue Op3, unsigned Align, bool isVolatile,
+ SDValue Op3, Align Alignment, bool isVolatile,
bool AlwaysInline,
MachinePointerInfo DstPtrInfo,
MachinePointerInfo SrcPtrInfo) const override;
diff --git a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
index 736bc4148a19..1eea1e37c253 100644
--- a/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
+++ b/llvm/lib/Target/XCore/XCoreTargetMachine.cpp
@@ -54,7 +54,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, const Triple &TT,
TT, CPU, FS, Options, getEffectiveRelocModel(RM),
getEffectiveXCoreCodeModel(CM), OL),
TLOF(std::make_unique<XCoreTargetObjectFile>()),
- Subtarget(TT, CPU, FS, *this) {
+ Subtarget(TT, std::string(CPU), std::string(FS), *this) {
initAsmInfo();
}
diff --git a/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp b/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
index fe743b28b4b4..9fec74a372fb 100644
--- a/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
+++ b/llvm/lib/Target/XCore/XCoreTargetObjectFile.cpp
@@ -140,10 +140,9 @@ MCSection *XCoreTargetObjectFile::SelectSectionForGlobal(
report_fatal_error("Target does not support TLS or Common sections");
}
-MCSection *XCoreTargetObjectFile::getSectionForConstant(const DataLayout &DL,
- SectionKind Kind,
- const Constant *C,
- unsigned &Align) const {
+MCSection *XCoreTargetObjectFile::getSectionForConstant(
+ const DataLayout &DL, SectionKind Kind, const Constant *C,
+ Align &Alignment) const {
if (Kind.isMergeableConst4()) return MergeableConst4Section;
if (Kind.isMergeableConst8()) return MergeableConst8Section;
if (Kind.isMergeableConst16()) return MergeableConst16Section;
diff --git a/llvm/lib/Target/XCore/XCoreTargetObjectFile.h b/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
index fd172c55919f..73cc6686d775 100644
--- a/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
+++ b/llvm/lib/Target/XCore/XCoreTargetObjectFile.h
@@ -32,7 +32,7 @@ static const unsigned CodeModelLargeSize = 256;
MCSection *getSectionForConstant(const DataLayout &DL, SectionKind Kind,
const Constant *C,
- unsigned &Align) const override;
+ Align &Alignment) const override;
};
} // end namespace llvm
diff --git a/llvm/lib/TextAPI/MachO/Architecture.cpp b/llvm/lib/TextAPI/MachO/Architecture.cpp
index 699fb5f4587a..0c5988030336 100644
--- a/llvm/lib/TextAPI/MachO/Architecture.cpp
+++ b/llvm/lib/TextAPI/MachO/Architecture.cpp
@@ -12,13 +12,16 @@
#include "llvm/TextAPI/MachO/Architecture.h"
#include "llvm/ADT/StringSwitch.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/BinaryFormat/MachO.h"
+#include "llvm/Support/raw_ostream.h"
+#include "llvm/TextAPI/MachO/ArchitectureSet.h"
namespace llvm {
namespace MachO {
Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType) {
-#define ARCHINFO(Arch, Type, Subtype) \
+#define ARCHINFO(Arch, Type, Subtype, NumBits) \
if (CPUType == (Type) && \
(CPUSubType & ~MachO::CPU_SUBTYPE_MASK) == (Subtype)) \
return AK_##Arch;
@@ -30,7 +33,7 @@ Architecture getArchitectureFromCpuType(uint32_t CPUType, uint32_t CPUSubType) {
Architecture getArchitectureFromName(StringRef Name) {
return StringSwitch<Architecture>(Name)
-#define ARCHINFO(Arch, Type, Subtype) .Case(#Arch, AK_##Arch)
+#define ARCHINFO(Arch, Type, Subtype, NumBits) .Case(#Arch, AK_##Arch)
#include "llvm/TextAPI/MachO/Architecture.def"
#undef ARCHINFO
.Default(AK_unknown);
@@ -38,7 +41,7 @@ Architecture getArchitectureFromName(StringRef Name) {
StringRef getArchitectureName(Architecture Arch) {
switch (Arch) {
-#define ARCHINFO(Arch, Type, Subtype) \
+#define ARCHINFO(Arch, Type, Subtype, NumBits) \
case AK_##Arch: \
return #Arch;
#include "llvm/TextAPI/MachO/Architecture.def"
@@ -54,7 +57,7 @@ StringRef getArchitectureName(Architecture Arch) {
std::pair<uint32_t, uint32_t> getCPUTypeFromArchitecture(Architecture Arch) {
switch (Arch) {
-#define ARCHINFO(Arch, Type, Subtype) \
+#define ARCHINFO(Arch, Type, Subtype, NumBits) \
case AK_##Arch: \
return std::make_pair(Type, Subtype);
#include "llvm/TextAPI/MachO/Architecture.def"
@@ -72,6 +75,20 @@ Architecture mapToArchitecture(const Triple &Target) {
return getArchitectureFromName(Target.getArchName());
}
+bool is64Bit(Architecture Arch) {
+ switch (Arch) {
+#define ARCHINFO(Arch, Type, Subtype, NumBits) \
+ case AK_##Arch: \
+ return NumBits == 64;
+#include "llvm/TextAPI/MachO/Architecture.def"
+#undef ARCHINFO
+ case AK_unknown:
+ return false;
+ }
+
+ llvm_unreachable("Fully handled switch case above.");
+}
+
raw_ostream &operator<<(raw_ostream &OS, Architecture Arch) {
OS << getArchitectureName(Arch);
return OS;
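is64Bit() is generated by the same ARCHINFO X-macro that drives the other helpers; every entry in Architecture.def now carries a NumBits column, and each entry expands into one case. A self-contained sketch of the pattern with a hypothetical two-entry table (MY_ARCHINFO and its values are invented for illustration):

#include <cassert>
#include <cstdint>

// Hypothetical stand-in for llvm/TextAPI/MachO/Architecture.def entries.
#define MY_ARCHINFO(X)                                                         \
  X(i386, 7, 3, 32)                                                            \
  X(x86_64, 0x01000007, 3, 64)

enum Architecture : uint8_t {
#define ENUM_CASE(Arch, Type, Subtype, NumBits) AK_##Arch,
  MY_ARCHINFO(ENUM_CASE)
#undef ENUM_CASE
  AK_unknown
};

// Mirrors the shape of the new is64Bit(): one case per table entry,
// returning whether that entry's NumBits field is 64.
static bool is64Bit(Architecture Arch) {
  switch (Arch) {
#define BITS_CASE(Arch, Type, Subtype, NumBits)                                \
  case AK_##Arch:                                                              \
    return NumBits == 64;
  MY_ARCHINFO(BITS_CASE)
#undef BITS_CASE
  case AK_unknown:
    return false;
  }
  return false;
}

int main() {
  assert(!is64Bit(AK_i386));
  assert(is64Bit(AK_x86_64));
  assert(!is64Bit(AK_unknown));
  return 0;
}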
diff --git a/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp b/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp
index c589671199b7..f665706fad81 100644
--- a/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp
+++ b/llvm/lib/TextAPI/MachO/ArchitectureSet.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/TextAPI/MachO/ArchitectureSet.h"
+#include "llvm/Support/raw_ostream.h"
namespace llvm {
namespace MachO {
@@ -40,7 +41,7 @@ ArchitectureSet::operator std::string() const {
std::string result;
auto size = count();
for (auto arch : *this) {
- result.append(getArchitectureName(arch));
+ result.append(std::string(getArchitectureName(arch)));
size -= 1;
if (size)
result.append(" ");
diff --git a/llvm/lib/TextAPI/MachO/InterfaceFile.cpp b/llvm/lib/TextAPI/MachO/InterfaceFile.cpp
index c40a952a6a8b..64d2c3e865ab 100644
--- a/llvm/lib/TextAPI/MachO/InterfaceFile.cpp
+++ b/llvm/lib/TextAPI/MachO/InterfaceFile.cpp
@@ -14,9 +14,10 @@
#include <iomanip>
#include <sstream>
-namespace llvm {
-namespace MachO {
-namespace detail {
+using namespace llvm;
+using namespace llvm::MachO;
+
+namespace {
template <typename C>
typename C::iterator addEntry(C &Container, StringRef InstallName) {
auto I = partition_point(Container, [=](const InterfaceFileRef &O) {
@@ -39,21 +40,21 @@ typename C::iterator addEntry(C &Container, const Target &Target_) {
return Container.insert(Iter, Target_);
}
-} // end namespace detail.
+} // end namespace
void InterfaceFileRef::addTarget(const Target &Target) {
- detail::addEntry(Targets, Target);
+ addEntry(Targets, Target);
}
void InterfaceFile::addAllowableClient(StringRef InstallName,
const Target &Target) {
- auto Client = detail::addEntry(AllowableClients, InstallName);
+ auto Client = addEntry(AllowableClients, InstallName);
Client->addTarget(Target);
}
void InterfaceFile::addReexportedLibrary(StringRef InstallName,
const Target &Target) {
- auto Lib = detail::addEntry(ReexportedLibraries, InstallName);
+ auto Lib = addEntry(ReexportedLibraries, InstallName);
Lib->addTarget(Target);
}
@@ -63,11 +64,11 @@ void InterfaceFile::addParentUmbrella(const Target &Target_, StringRef Parent) {
Target RHS) { return LHS.first < RHS; });
if ((Iter != ParentUmbrellas.end()) && !(Target_ < Iter->first)) {
- Iter->second = Parent;
+ Iter->second = std::string(Parent);
return;
}
- ParentUmbrellas.emplace(Iter, Target_, Parent);
+ ParentUmbrellas.emplace(Iter, Target_, std::string(Parent));
return;
}
@@ -77,11 +78,11 @@ void InterfaceFile::addUUID(const Target &Target_, StringRef UUID) {
Target RHS) { return LHS.first < RHS; });
if ((Iter != UUIDs.end()) && !(Target_ < Iter->first)) {
- Iter->second = UUID;
+ Iter->second = std::string(UUID);
return;
}
- UUIDs.emplace(Iter, Target_, UUID);
+ UUIDs.emplace(Iter, Target_, std::string(UUID));
return;
}
@@ -97,7 +98,7 @@ void InterfaceFile::addUUID(const Target &Target, uint8_t UUID[16]) {
}
void InterfaceFile::addTarget(const Target &Target) {
- detail::addEntry(Targets, Target);
+ addEntry(Targets, Target);
}
InterfaceFile::const_filtered_target_range
@@ -118,6 +119,3 @@ void InterfaceFile::addSymbol(SymbolKind Kind, StringRef Name,
for (const auto &Target : Targets)
result.first->second->addTarget(Target);
}
-
-} // end namespace MachO.
-} // end namespace llvm.
diff --git a/llvm/lib/TextAPI/MachO/TextAPIContext.h b/llvm/lib/TextAPI/MachO/TextAPIContext.h
index 3df40f09f7f7..217d1f5400ee 100644
--- a/llvm/lib/TextAPI/MachO/TextAPIContext.h
+++ b/llvm/lib/TextAPI/MachO/TextAPIContext.h
@@ -13,7 +13,6 @@
#ifndef LLVM_TEXTAPI_MACHO_CONTEXT_H
#define LLVM_TEXTAPI_MACHO_CONTEXT_H
-#include "llvm/Support/MemoryBuffer.h"
#include <string>
namespace llvm {
diff --git a/llvm/lib/TextAPI/MachO/TextStub.cpp b/llvm/lib/TextAPI/MachO/TextStub.cpp
index 0584e43d5893..141f897fb564 100644
--- a/llvm/lib/TextAPI/MachO/TextStub.cpp
+++ b/llvm/lib/TextAPI/MachO/TextStub.cpp
@@ -412,8 +412,10 @@ template <> struct ScalarTraits<Target> {
static StringRef input(StringRef Scalar, void *, Target &Value) {
auto Result = Target::create(Scalar);
- if (!Result)
- return toString(Result.takeError());
+ if (!Result) {
+ consumeError(Result.takeError());
+ return "unparsable target";
+ }
Value = *Result;
if (Value.Arch == AK_unknown)
@@ -450,10 +452,8 @@ template <> struct MappingTraits<const InterfaceFile *> {
if (File->isInstallAPI())
Flags |= TBDFlags::InstallAPI;
- for (const auto &Iter : File->umbrellas()) {
- ParentUmbrella = Iter.second;
- break;
- }
+ if (!File->umbrellas().empty())
+ ParentUmbrella = File->umbrellas().begin()->second;
std::set<ArchitectureSet> ArchSet;
for (const auto &Library : File->allowableClients())
@@ -959,7 +959,8 @@ template <> struct MappingTraits<const InterfaceFile *> {
for (auto &sym : CurrentSection.WeakSymbols)
File->addSymbol(SymbolKind::GlobalSymbol, sym,
- CurrentSection.Targets);
+ CurrentSection.Targets, SymbolFlags::WeakDefined);
+
for (auto &sym : CurrentSection.TlvSymbols)
File->addSymbol(SymbolKind::GlobalSymbol, sym,
CurrentSection.Targets,
@@ -1088,8 +1089,8 @@ struct DocumentListTraits<std::vector<const MachO::InterfaceFile *>> {
};
} // end namespace yaml.
+} // namespace llvm
-namespace MachO {
static void DiagHandler(const SMDiagnostic &Diag, void *Context) {
auto *File = static_cast<TextAPIContext *>(Context);
SmallString<1024> Message;
@@ -1107,7 +1108,7 @@ static void DiagHandler(const SMDiagnostic &Diag, void *Context) {
Expected<std::unique_ptr<InterfaceFile>>
TextAPIReader::get(MemoryBufferRef InputBuffer) {
TextAPIContext Ctx;
- Ctx.Path = InputBuffer.getBufferIdentifier();
+ Ctx.Path = std::string(InputBuffer.getBufferIdentifier());
yaml::Input YAMLIn(InputBuffer.getBuffer(), &Ctx, DiagHandler, &Ctx);
// Fill vector with interface file objects created by parsing the YAML file.
@@ -1119,6 +1120,10 @@ TextAPIReader::get(MemoryBufferRef InputBuffer) {
auto File = std::unique_ptr<InterfaceFile>(
const_cast<InterfaceFile *>(Files.front()));
+ for (auto Iter = std::next(Files.begin()); Iter != Files.end(); ++Iter)
+ File->addDocument(
+ std::shared_ptr<InterfaceFile>(const_cast<InterfaceFile *>(*Iter)));
+
if (YAMLIn.error())
return make_error<StringError>(Ctx.ErrorMessage, YAMLIn.error());
@@ -1127,18 +1132,18 @@ TextAPIReader::get(MemoryBufferRef InputBuffer) {
Error TextAPIWriter::writeToStream(raw_ostream &OS, const InterfaceFile &File) {
TextAPIContext Ctx;
- Ctx.Path = File.getPath();
+ Ctx.Path = std::string(File.getPath());
Ctx.FileKind = File.getFileType();
llvm::yaml::Output YAMLOut(OS, &Ctx, /*WrapColumn=*/80);
std::vector<const InterfaceFile *> Files;
Files.emplace_back(&File);
+ for (auto Document : File.documents())
+ Files.emplace_back(Document.get());
+
// Stream out yaml.
YAMLOut << Files;
return Error::success();
}
-
-} // end namespace MachO.
-} // end namespace llvm.
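With the namespace and document-handling changes above, extra YAML documents parsed by TextAPIReader are attached to the first InterfaceFile and streamed back out by TextAPIWriter. A hedged sketch of a round trip using only the entry points that appear in this diff (header paths assume the llvm/TextAPI/MachO layout used here):

#include "llvm/Support/Error.h"
#include "llvm/Support/MemoryBuffer.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/TextAPI/MachO/InterfaceFile.h"
#include "llvm/TextAPI/MachO/TextAPIReader.h"
#include "llvm/TextAPI/MachO/TextAPIWriter.h"

using namespace llvm;
using namespace llvm::MachO;

// Parse a TBD buffer and serialize it back; additional documents in the
// input survive the trip because the reader now calls addDocument() and
// the writer emits File.documents() as extra YAML documents.
static Error roundTrip(MemoryBufferRef Input, raw_ostream &OS) {
  Expected<std::unique_ptr<InterfaceFile>> File = TextAPIReader::get(Input);
  if (!File)
    return File.takeError();
  return TextAPIWriter::writeToStream(OS, **File);
}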
diff --git a/llvm/lib/TextAPI/MachO/TextStubCommon.cpp b/llvm/lib/TextAPI/MachO/TextStubCommon.cpp
index 183c5d5a93b0..4a82df6beac1 100644
--- a/llvm/lib/TextAPI/MachO/TextStubCommon.cpp
+++ b/llvm/lib/TextAPI/MachO/TextStubCommon.cpp
@@ -12,6 +12,7 @@
#include "TextStubCommon.h"
#include "TextAPIContext.h"
+#include "llvm/ADT/StringSwitch.h"
using namespace llvm::MachO;
@@ -62,18 +63,27 @@ void ScalarTraits<PlatformSet>::output(const PlatformSet &Values, void *IO,
case PlatformKind::macOS:
OS << "macosx";
break;
+ case PlatformKind::iOSSimulator:
+ LLVM_FALLTHROUGH;
case PlatformKind::iOS:
OS << "ios";
break;
+ case PlatformKind::watchOSSimulator:
+ LLVM_FALLTHROUGH;
case PlatformKind::watchOS:
OS << "watchos";
break;
+ case PlatformKind::tvOSSimulator:
+ LLVM_FALLTHROUGH;
case PlatformKind::tvOS:
OS << "tvos";
break;
case PlatformKind::bridgeOS:
OS << "bridgeos";
break;
+ case PlatformKind::macCatalyst:
+ OS << "iosmac";
+ break;
}
}
@@ -119,7 +129,7 @@ QuotingType ScalarTraits<PlatformSet>::mustQuote(StringRef) {
void ScalarBitSetTraits<ArchitectureSet>::bitset(IO &IO,
ArchitectureSet &Archs) {
-#define ARCHINFO(arch, type, subtype) \
+#define ARCHINFO(arch, type, subtype, numbits) \
IO.bitSetCase(Archs, #arch, 1U << static_cast<int>(AK_##arch));
#include "llvm/TextAPI/MachO/Architecture.def"
#undef ARCHINFO
@@ -212,7 +222,7 @@ StringRef ScalarTraits<UUID>::input(StringRef Scalar, void *, UUID &Value) {
auto UUID = Split.second.trim();
if (UUID.empty())
return "invalid uuid string pair";
- Value.second = UUID;
+ Value.second = std::string(UUID);
Value.first = Target{getArchitectureFromName(Arch), PlatformKind::unknown};
return {};
}
diff --git a/llvm/lib/TextAPI/MachO/TextStubCommon.h b/llvm/lib/TextAPI/MachO/TextStubCommon.h
index a558cbcec9fb..f2cda50e297d 100644
--- a/llvm/lib/TextAPI/MachO/TextStubCommon.h
+++ b/llvm/lib/TextAPI/MachO/TextStubCommon.h
@@ -14,7 +14,6 @@
#define LLVM_TEXTAPI_TEXT_STUB_COMMON_H
#include "llvm/ADT/StringRef.h"
-#include "llvm/ADT/StringSwitch.h"
#include "llvm/Support/YAMLTraits.h"
#include "llvm/TextAPI/MachO/Architecture.h"
#include "llvm/TextAPI/MachO/ArchitectureSet.h"
diff --git a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
index 19f253be7952..a4fd62e5557f 100644
--- a/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-dlltool/DlltoolDriver.cpp
@@ -146,7 +146,7 @@ int llvm::dlltoolDriverMain(llvm::ArrayRef<const char *> ArgsArr) {
return 1;
}
- std::string Path = Args.getLastArgValue(OPT_l);
+ std::string Path = std::string(Args.getLastArgValue(OPT_l));
// If ExtName is set (if the "ExtName = Name" syntax was used), overwrite
// Name with ExtName and clear ExtName. When only creating an import
diff --git a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
index 286191abff20..c40901255424 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
+++ b/llvm/lib/ToolDrivers/llvm-lib/LibDriver.cpp
@@ -59,13 +59,10 @@ public:
}
-static std::string getOutputPath(opt::InputArgList *Args,
- const NewArchiveMember &FirstMember) {
- if (auto *Arg = Args->getLastArg(OPT_out))
- return Arg->getValue();
+static std::string getDefaultOutputPath(const NewArchiveMember &FirstMember) {
SmallString<128> Val = StringRef(FirstMember.Buf->getBufferIdentifier());
sys::path::replace_extension(Val, ".lib");
- return Val.str();
+ return std::string(Val.str());
}
static std::vector<StringRef> getSearchPaths(opt::InputArgList *Args,
@@ -96,7 +93,7 @@ static std::string findInputFile(StringRef File, ArrayRef<StringRef> Paths) {
SmallString<128> Path = Dir;
sys::path::append(Path, File);
if (sys::fs::exists(Path))
- return Path.str().str();
+ return std::string(Path);
}
return "";
}
@@ -144,14 +141,14 @@ static void doList(opt::InputArgList& Args) {
static COFF::MachineTypes getCOFFFileMachine(MemoryBufferRef MB) {
std::error_code EC;
- object::COFFObjectFile Obj(MB, EC);
- if (EC) {
+ auto Obj = object::COFFObjectFile::create(MB);
+ if (!Obj) {
llvm::errs() << MB.getBufferIdentifier()
- << ": failed to open: " << EC.message() << '\n';
+ << ": failed to open: " << Obj.takeError() << '\n';
exit(1);
}
- uint16_t Machine = Obj.getMachine();
+ uint16_t Machine = (*Obj)->getMachine();
if (Machine != COFF::IMAGE_FILE_MACHINE_I386 &&
Machine != COFF::IMAGE_FILE_MACHINE_AMD64 &&
Machine != COFF::IMAGE_FILE_MACHINE_ARMNT &&
@@ -292,8 +289,9 @@ int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) {
return 0;
}
- // If no input files, silently do nothing to match lib.exe.
- if (!Args.hasArgNoClaim(OPT_INPUT))
+ // If no input files and not told otherwise, silently do nothing to match
+ // lib.exe
+ if (!Args.hasArgNoClaim(OPT_INPUT) && !Args.hasArg(OPT_llvmlibempty))
return 0;
if (Args.hasArg(OPT_lst)) {
@@ -352,7 +350,15 @@ int llvm::libDriverMain(ArrayRef<const char *> ArgsArr) {
}
// Create an archive file.
- std::string OutputPath = getOutputPath(&Args, Members[0]);
+ std::string OutputPath;
+ if (auto *Arg = Args.getLastArg(OPT_out)) {
+ OutputPath = Arg->getValue();
+ } else if (!Members.empty()) {
+ OutputPath = getDefaultOutputPath(Members[0]);
+ } else {
+ llvm::errs() << "no output path given, and cannot infer with no inputs\n";
+ return 1;
+ }
// llvm-lib uses relative paths for both regular and thin archives, unlike
// standard GNU ar, which only uses relative paths for thin archives and
// basenames for regular archives.
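The driver now resolves the output path inline: an explicit /out: argument wins, otherwise the first member's name with its extension swapped to .lib, otherwise an error. A standalone sketch of the fallback step, assuming only LLVM's path utilities (defaultLibName is an invented name):

#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Support/Path.h"
#include <cassert>
#include <string>

// Mirror of getDefaultOutputPath(): keep the member's path and replace the
// extension with .lib (the extension is appended if the name has none).
static std::string defaultLibName(llvm::StringRef FirstMemberPath) {
  llvm::SmallString<128> Val(FirstMemberPath);
  llvm::sys::path::replace_extension(Val, ".lib");
  return std::string(Val.str());
}

int main() {
  assert(defaultLibName("obj/foo.obj") == "obj/foo.lib");
  assert(defaultLibName("bar") == "bar.lib");
  return 0;
}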
diff --git a/llvm/lib/ToolDrivers/llvm-lib/Options.td b/llvm/lib/ToolDrivers/llvm-lib/Options.td
index 7863196126a8..5891e238a328 100644
--- a/llvm/lib/ToolDrivers/llvm-lib/Options.td
+++ b/llvm/lib/ToolDrivers/llvm-lib/Options.td
@@ -18,6 +18,9 @@ def out : P<"out", "Path to file to write output">;
def llvmlibthin : F<"llvmlibthin">,
HelpText<"Make .lib point to .obj files instead of copying their contents">;
+def llvmlibempty : F<"llvmlibempty">,
+ HelpText<"When given no contents, produce an empty .lib file">;
+
def machine: P<"machine", "Specify target platform">;
def help : F<"help">;
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 59b94567a9c2..d315c7f13ac2 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -16,6 +16,7 @@
#include "AggressiveInstCombineInternal.h"
#include "llvm-c/Initialization.h"
#include "llvm-c/Transforms/AggressiveInstCombine.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
@@ -28,11 +29,17 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Transforms/Utils/Local.h"
+
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "aggressive-instcombine"
+STATISTIC(NumAnyOrAllBitsSet, "Number of any/all-bits-set patterns folded");
+STATISTIC(NumGuardedRotates,
+ "Number of guarded rotates transformed into funnel shifts");
+STATISTIC(NumPopCountRecognized, "Number of popcount idioms recognized");
+
namespace {
/// Contains expression pattern combiner logic.
/// This class provides both the logic to combine expression patterns and
@@ -148,6 +155,7 @@ static bool foldGuardedRotateToFunnelShift(Instruction &I) {
IRBuilder<> Builder(PhiBB, PhiBB->getFirstInsertionPt());
Function *F = Intrinsic::getDeclaration(Phi.getModule(), IID, Phi.getType());
Phi.replaceAllUsesWith(Builder.CreateCall(F, {RotSrc, RotSrc, RotAmt}));
+ ++NumGuardedRotates;
return true;
}
@@ -248,6 +256,7 @@ static bool foldAnyOrAllBitsSet(Instruction &I) {
: Builder.CreateIsNotNull(And);
Value *Zext = Builder.CreateZExt(Cmp, I.getType());
I.replaceAllUsesWith(Zext);
+ ++NumAnyOrAllBitsSet;
return true;
}
@@ -308,6 +317,7 @@ static bool tryToRecognizePopCount(Instruction &I) {
Function *Func = Intrinsic::getDeclaration(
I.getModule(), Intrinsic::ctpop, I.getType());
I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
+ ++NumPopCountRecognized;
return true;
}
}
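The counters added above use LLVM's STATISTIC machinery: each one is keyed by the current DEBUG_TYPE and printed when -stats is enabled. A minimal sketch of the idiom (the pass name and counter are hypothetical):

#include "llvm/ADT/Statistic.h"

// STATISTIC requires DEBUG_TYPE to be defined first; the counter shows up
// as "<DEBUG_TYPE> - <description>" in the -stats report.
#define DEBUG_TYPE "my-example-pass"
STATISTIC(NumThingsFolded, "Number of things folded");

// Bump the counter at each successful transform, as ++NumGuardedRotates
// does above.
static void recordFold() { ++NumThingsFolded; }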
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
index 44e1c45664e7..42bcadfc7dcd 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombineInternal.h
@@ -16,14 +16,8 @@
#define LLVM_LIB_TRANSFORMS_AGGRESSIVEINSTCOMBINE_COMBINEINTERNAL_H
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/Analysis/AliasAnalysis.h"
-#include "llvm/Analysis/BasicAliasAnalysis.h"
-#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/GlobalsModRef.h"
-#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/DataLayout.h"
-#include "llvm/Pass.h"
+#include "llvm/ADT/SmallVector.h"
+
using namespace llvm;
//===----------------------------------------------------------------------===//
@@ -47,7 +41,12 @@ using namespace llvm;
namespace llvm {
class DataLayout;
class DominatorTree;
+ class Function;
+ class Instruction;
class TargetLibraryInfo;
+ class TruncInst;
+ class Type;
+ class Value;
class TruncInstCombine {
TargetLibraryInfo &TLI;
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
index 7c5767912fd3..5cd40c66227f 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/TruncInstCombine.cpp
@@ -25,17 +25,25 @@
//===----------------------------------------------------------------------===//
#include "AggressiveInstCombineInternal.h"
-#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Instruction.h"
#include "llvm/IR/IRBuilder.h"
+
using namespace llvm;
#define DEBUG_TYPE "aggressive-instcombine"
+STATISTIC(
+ NumDAGsReduced,
+ "Number of truncations eliminated by reducing bit width of expression DAG");
+STATISTIC(NumInstrsReduced,
+ "Number of instructions whose bit width was reduced");
+
/// Given an instruction and a container, it fills all the relevant operands of
/// that instruction, with respect to the Trunc expression dag optimization.

static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
@@ -56,6 +64,10 @@ static void getRelevantOperands(Instruction *I, SmallVectorImpl<Value *> &Ops) {
Ops.push_back(I->getOperand(0));
Ops.push_back(I->getOperand(1));
break;
+ case Instruction::Select:
+ Ops.push_back(I->getOperand(1));
+ Ops.push_back(I->getOperand(2));
+ break;
default:
llvm_unreachable("Unreachable!");
}
@@ -114,7 +126,8 @@ bool TruncInstCombine::buildTruncExpressionDag() {
case Instruction::Mul:
case Instruction::And:
case Instruction::Or:
- case Instruction::Xor: {
+ case Instruction::Xor:
+ case Instruction::Select: {
SmallVector<Value *, 2> Operands;
getRelevantOperands(I, Operands);
for (Value *Operand : Operands)
@@ -123,7 +136,7 @@ bool TruncInstCombine::buildTruncExpressionDag() {
}
default:
// TODO: Can handle more cases here:
- // 1. select, shufflevector, extractelement, insertelement
+ // 1. shufflevector, extractelement, insertelement
// 2. udiv, urem
// 3. shl, lshr, ashr
// 4. phi node(and loop handling)
@@ -194,7 +207,7 @@ unsigned TruncInstCombine::getMinBitWidth() {
unsigned IOpBitwidth = InstInfoMap.lookup(IOp).ValidBitWidth;
if (IOpBitwidth >= ValidBitWidth)
continue;
- InstInfoMap[IOp].ValidBitWidth = std::max(ValidBitWidth, IOpBitwidth);
+ InstInfoMap[IOp].ValidBitWidth = ValidBitWidth;
Worklist.push_back(IOp);
}
}
@@ -276,8 +289,10 @@ Type *TruncInstCombine::getBestTruncatedType() {
/// version of \p Ty, otherwise return \p Ty.
static Type *getReducedType(Value *V, Type *Ty) {
assert(Ty && !Ty->isVectorTy() && "Expect Scalar Type");
- if (auto *VTy = dyn_cast<VectorType>(V->getType()))
- return VectorType::get(Ty, VTy->getNumElements());
+ if (auto *VTy = dyn_cast<VectorType>(V->getType())) {
+ // FIXME: should this handle scalable vectors?
+ return FixedVectorType::get(Ty, VTy->getNumElements());
+ }
return Ty;
}
@@ -286,9 +301,7 @@ Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
if (auto *C = dyn_cast<Constant>(V)) {
C = ConstantExpr::getIntegerCast(C, Ty, false);
// If we got a constantexpr back, try to simplify it with DL info.
- if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
- C = FoldedC;
- return C;
+ return ConstantFoldConstant(C, DL, &TLI);
}
auto *I = cast<Instruction>(V);
@@ -298,6 +311,7 @@ Value *TruncInstCombine::getReducedOperand(Value *V, Type *SclTy) {
}
void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
+ NumInstrsReduced += InstInfoMap.size();
for (auto &Itr : InstInfoMap) { // Forward
Instruction *I = Itr.first;
TruncInstCombine::Info &NodeInfo = Itr.second;
@@ -351,6 +365,13 @@ void TruncInstCombine::ReduceExpressionDag(Type *SclTy) {
Res = Builder.CreateBinOp((Instruction::BinaryOps)Opc, LHS, RHS);
break;
}
+ case Instruction::Select: {
+ Value *Op0 = I->getOperand(0);
+ Value *LHS = getReducedOperand(I->getOperand(1), SclTy);
+ Value *RHS = getReducedOperand(I->getOperand(2), SclTy);
+ Res = Builder.CreateSelect(Op0, LHS, RHS);
+ break;
+ }
default:
llvm_unreachable("Unhandled instruction");
}
@@ -409,6 +430,7 @@ bool TruncInstCombine::run(Function &F) {
"dominated by: "
<< CurrentTruncInst << '\n');
ReduceExpressionDag(NewDstSclTy);
+ ++NumDAGsReduced;
MadeIRChange = true;
}
}
diff --git a/llvm/lib/Transforms/CFGuard/CFGuard.cpp b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
index 7c5e90cb53cd..96c083a144b2 100644
--- a/llvm/lib/Transforms/CFGuard/CFGuard.cpp
+++ b/llvm/lib/Transforms/CFGuard/CFGuard.cpp
@@ -204,14 +204,9 @@ void CFGuard::insertCFGuardDispatch(CallBase *CB) {
Bundles.emplace_back("cfguardtarget", CalledOperand);
// Create a copy of the call/invoke instruction and add the new bundle.
- CallBase *NewCB;
- if (CallInst *CI = dyn_cast<CallInst>(CB)) {
- NewCB = CallInst::Create(CI, Bundles, CB);
- } else {
- assert(isa<InvokeInst>(CB) && "Unknown indirect call type");
- InvokeInst *II = cast<InvokeInst>(CB);
- NewCB = llvm::InvokeInst::Create(II, Bundles, CB);
- }
+ assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
+ "Unknown indirect call type");
+ CallBase *NewCB = CallBase::Create(CB, Bundles, CB);
// Change the target of the call to be the guard dispatch function.
NewCB->setCalledOperand(GuardDispatchLoad);
@@ -302,4 +297,4 @@ FunctionPass *llvm::createCFGuardCheckPass() {
FunctionPass *llvm::createCFGuardDispatchPass() {
return new CFGuard(CFGuard::CF_Dispatch);
-}
\ No newline at end of file
+}
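The rewrite above replaces the CallInst/InvokeInst special-casing with the CallBase::Create overload that clones either kind and attaches extra operand bundles. A hedged sketch of that pattern in isolation (addBundle and its parameters are invented; the caller is still responsible for rewiring and erasing the old call):

#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Clone a call or invoke with one extra operand bundle appended, the way
// insertCFGuardDispatch() now builds its "cfguardtarget" bundle.
static CallBase *addBundle(CallBase *CB, StringRef Tag, Value *Arg) {
  SmallVector<OperandBundleDef, 2> Bundles;
  CB->getOperandBundlesAsDefs(Bundles); // keep any existing bundles
  Bundles.emplace_back(std::string(Tag), Arg);
  // Creates a CallInst or InvokeInst matching *CB, inserted before CB.
  return CallBase::Create(CB, Bundles, CB);
}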
diff --git a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
index c2dbd6f41642..233eae37c497 100644
--- a/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroCleanup.cpp
@@ -5,9 +5,8 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// This pass lowers all remaining coroutine intrinsics.
-//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Coroutines/CoroCleanup.h"
#include "CoroInternal.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
@@ -90,12 +89,26 @@ bool Lowerer::lowerRemainingCoroIntrinsics(Function &F) {
// After replacement were made we can cleanup the function body a little.
simplifyCFG(F);
}
+
return Changed;
}
-//===----------------------------------------------------------------------===//
-// Top Level Driver
-//===----------------------------------------------------------------------===//
+static bool declaresCoroCleanupIntrinsics(const Module &M) {
+ return coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin",
+ "llvm.coro.subfn.addr", "llvm.coro.free",
+ "llvm.coro.id", "llvm.coro.id.retcon",
+ "llvm.coro.id.retcon.once"});
+}
+
+PreservedAnalyses CoroCleanupPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ auto &M = *F.getParent();
+ if (!declaresCoroCleanupIntrinsics(M) ||
+ !Lowerer(M).lowerRemainingCoroIntrinsics(F))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
namespace {
@@ -111,10 +124,7 @@ struct CoroCleanupLegacy : FunctionPass {
// This pass has work to do only if we find intrinsics we are going to lower
// in the module.
bool doInitialization(Module &M) override {
- if (coro::declaresIntrinsics(M, {"llvm.coro.alloc", "llvm.coro.begin",
- "llvm.coro.subfn.addr", "llvm.coro.free",
- "llvm.coro.id", "llvm.coro.id.retcon",
- "llvm.coro.id.retcon.once"}))
+ if (declaresCoroCleanupIntrinsics(M))
L = std::make_unique<Lowerer>(M);
return false;
}
diff --git a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
index e73fb9eeb1e9..242e6c3f6b23 100644
--- a/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroEarly.cpp
@@ -5,13 +5,9 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// This pass lowers coroutine intrinsics that hide the details of the exact
-// calling convention for coroutine resume and destroy functions and details of
-// the structure of the coroutine frame.
-//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Coroutines/CoroEarly.h"
#include "CoroInternal.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Module.h"
@@ -28,7 +24,7 @@ class Lowerer : public coro::LowererBase {
PointerType *const AnyResumeFnPtrTy;
Constant *NoopCoro = nullptr;
- void lowerResumeOrDestroy(CallSite CS, CoroSubFnInst::ResumeKind);
+ void lowerResumeOrDestroy(CallBase &CB, CoroSubFnInst::ResumeKind);
void lowerCoroPromise(CoroPromiseInst *Intrin);
void lowerCoroDone(IntrinsicInst *II);
void lowerCoroNoop(IntrinsicInst *II);
@@ -47,12 +43,11 @@ public:
// an address returned by coro.subfn.addr intrinsic. This is done so that
// CGPassManager recognizes devirtualization when CoroElide pass replaces a call
// to coro.subfn.addr with an appropriate function address.
-void Lowerer::lowerResumeOrDestroy(CallSite CS,
+void Lowerer::lowerResumeOrDestroy(CallBase &CB,
CoroSubFnInst::ResumeKind Index) {
- Value *ResumeAddr =
- makeSubFnCall(CS.getArgOperand(0), Index, CS.getInstruction());
- CS.setCalledFunction(ResumeAddr);
- CS.setCallingConv(CallingConv::Fast);
+ Value *ResumeAddr = makeSubFnCall(CB.getArgOperand(0), Index, &CB);
+ CB.setCalledOperand(ResumeAddr);
+ CB.setCallingConv(CallingConv::Fast);
}
// Coroutine promise field is always at the fixed offset from the beginning of
@@ -64,14 +59,14 @@ void Lowerer::lowerResumeOrDestroy(CallSite CS,
// TODO: Handle the case when coroutine promise alloca has align override.
void Lowerer::lowerCoroPromise(CoroPromiseInst *Intrin) {
Value *Operand = Intrin->getArgOperand(0);
- unsigned Alignement = Intrin->getAlignment();
+ Align Alignment = Intrin->getAlignment();
Type *Int8Ty = Builder.getInt8Ty();
auto *SampleStruct =
StructType::get(Context, {AnyResumeFnPtrTy, AnyResumeFnPtrTy, Int8Ty});
const DataLayout &DL = TheModule.getDataLayout();
int64_t Offset = alignTo(
- DL.getStructLayout(SampleStruct)->getElementOffset(2), Alignement);
+ DL.getStructLayout(SampleStruct)->getElementOffset(2), Alignment);
if (Intrin->isFromPromise())
Offset = -Offset;
@@ -98,7 +93,7 @@ void Lowerer::lowerCoroDone(IntrinsicInst *II) {
Builder.SetInsertPoint(II);
auto *BCI = Builder.CreateBitCast(Operand, FramePtrTy);
- auto *Load = Builder.CreateLoad(BCI);
+ auto *Load = Builder.CreateLoad(FrameTy, BCI);
auto *Cond = Builder.CreateICmpEQ(Load, NullPtr);
II->replaceAllUsesWith(Cond);
@@ -156,8 +151,8 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
SmallVector<CoroFreeInst *, 4> CoroFrees;
for (auto IB = inst_begin(F), IE = inst_end(F); IB != IE;) {
Instruction &I = *IB++;
- if (auto CS = CallSite(&I)) {
- switch (CS.getIntrinsicID()) {
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ switch (CB->getIntrinsicID()) {
default:
continue;
case Intrinsic::coro_free:
@@ -167,13 +162,13 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
// Make sure that final suspend point is not duplicated as CoroSplit
// pass expects that there is at most one final suspend point.
if (cast<CoroSuspendInst>(&I)->isFinal())
- CS.setCannotDuplicate();
+ CB->setCannotDuplicate();
break;
case Intrinsic::coro_end:
// Make sure that fallthrough coro.end is not duplicated as CoroSplit
// pass expects that there is at most one fallthrough coro.end.
if (cast<CoroEndInst>(&I)->isFallthrough())
- CS.setCannotDuplicate();
+ CB->setCannotDuplicate();
break;
case Intrinsic::coro_noop:
lowerCoroNoop(cast<IntrinsicInst>(&I));
@@ -195,10 +190,10 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT);
break;
case Intrinsic::coro_resume:
- lowerResumeOrDestroy(CS, CoroSubFnInst::ResumeIndex);
+ lowerResumeOrDestroy(*CB, CoroSubFnInst::ResumeIndex);
break;
case Intrinsic::coro_destroy:
- lowerResumeOrDestroy(CS, CoroSubFnInst::DestroyIndex);
+ lowerResumeOrDestroy(*CB, CoroSubFnInst::DestroyIndex);
break;
case Intrinsic::coro_promise:
lowerCoroPromise(cast<CoroPromiseInst>(&I));
@@ -219,9 +214,23 @@ bool Lowerer::lowerEarlyIntrinsics(Function &F) {
return Changed;
}
-//===----------------------------------------------------------------------===//
-// Top Level Driver
-//===----------------------------------------------------------------------===//
+static bool declaresCoroEarlyIntrinsics(const Module &M) {
+ return coro::declaresIntrinsics(
+ M, {"llvm.coro.id", "llvm.coro.id.retcon", "llvm.coro.id.retcon.once",
+ "llvm.coro.destroy", "llvm.coro.done", "llvm.coro.end",
+ "llvm.coro.noop", "llvm.coro.free", "llvm.coro.promise",
+ "llvm.coro.resume", "llvm.coro.suspend"});
+}
+
+PreservedAnalyses CoroEarlyPass::run(Function &F, FunctionAnalysisManager &) {
+ Module &M = *F.getParent();
+ if (!declaresCoroEarlyIntrinsics(M) || !Lowerer(M).lowerEarlyIntrinsics(F))
+ return PreservedAnalyses::all();
+
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ return PA;
+}
namespace {
@@ -236,17 +245,7 @@ struct CoroEarlyLegacy : public FunctionPass {
// This pass has work to do only if we find intrinsics we are going to lower
// in the module.
bool doInitialization(Module &M) override {
- if (coro::declaresIntrinsics(M, {"llvm.coro.id",
- "llvm.coro.id.retcon",
- "llvm.coro.id.retcon.once",
- "llvm.coro.destroy",
- "llvm.coro.done",
- "llvm.coro.end",
- "llvm.coro.noop",
- "llvm.coro.free",
- "llvm.coro.promise",
- "llvm.coro.resume",
- "llvm.coro.suspend"}))
+ if (declaresCoroEarlyIntrinsics(M))
L = std::make_unique<Lowerer>(M);
return false;
}
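CoroEarlyPass::run() above follows the usual new-pass-manager shape: return PreservedAnalyses::all() when nothing changed, otherwise report only what is known to survive. A hedged, self-contained sketch of that shape for a hypothetical pass (ExamplePass is not part of this patch):

#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"

namespace {
// Hypothetical function pass showing the idiom used by CoroEarlyPass::run():
// cheap early exit preserving everything, otherwise declare that the CFG is
// intact so analyses that depend only on it stay valid.
struct ExamplePass : llvm::PassInfoMixin<ExamplePass> {
  static bool transform(llvm::Function &) { return false; } // placeholder

  llvm::PreservedAnalyses run(llvm::Function &F,
                              llvm::FunctionAnalysisManager &) {
    bool Changed = transform(F);
    if (!Changed)
      return llvm::PreservedAnalyses::all();
    llvm::PreservedAnalyses PA;
    PA.preserveSet<llvm::CFGAnalyses>();
    return PA;
  }
};
} // namespace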
diff --git a/llvm/lib/Transforms/Coroutines/CoroElide.cpp b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
index 23d22e23861a..9d364b3097c1 100644
--- a/llvm/lib/Transforms/Coroutines/CoroElide.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroElide.cpp
@@ -5,12 +5,10 @@
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//===----------------------------------------------------------------------===//
-// This pass replaces dynamic allocation of coroutine frame with alloca and
-// replaces calls to llvm.coro.resume and llvm.coro.destroy with direct calls
-// to coroutine sub-functions.
-//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Coroutines/CoroElide.h"
#include "CoroInternal.h"
+#include "llvm/ADT/DenseMap.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/IR/Dominators.h"
@@ -30,14 +28,19 @@ struct Lowerer : coro::LowererBase {
SmallVector<CoroBeginInst *, 1> CoroBegins;
SmallVector<CoroAllocInst *, 1> CoroAllocs;
SmallVector<CoroSubFnInst *, 4> ResumeAddr;
- SmallVector<CoroSubFnInst *, 4> DestroyAddr;
+ DenseMap<CoroBeginInst *, SmallVector<CoroSubFnInst *, 4>> DestroyAddr;
SmallVector<CoroFreeInst *, 1> CoroFrees;
+ SmallPtrSet<const SwitchInst *, 4> CoroSuspendSwitches;
Lowerer(Module &M) : LowererBase(M) {}
- void elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA);
+ void elideHeapAllocations(Function *F, uint64_t FrameSize, Align FrameAlign,
+ AAResults &AA);
bool shouldElide(Function *F, DominatorTree &DT) const;
+ void collectPostSplitCoroIds(Function *F);
bool processCoroId(CoroIdInst *, AAResults &AA, DominatorTree &DT);
+ bool hasEscapePath(const CoroBeginInst *,
+ const SmallPtrSetImpl<BasicBlock *> &) const;
};
} // end anonymous namespace
@@ -90,10 +93,23 @@ static void removeTailCallAttribute(AllocaInst *Frame, AAResults &AA) {
}
}
-// Given a resume function @f.resume(%f.frame* %frame), returns %f.frame type.
-static Type *getFrameType(Function *Resume) {
- auto *ArgType = Resume->arg_begin()->getType();
- return cast<PointerType>(ArgType)->getElementType();
+// Given a resume function @f.resume(%f.frame* %frame), returns the size
+// and expected alignment of %f.frame type.
+static std::pair<uint64_t, Align> getFrameLayout(Function *Resume) {
+ // Prefer to pull information from the function attributes.
+ auto Size = Resume->getParamDereferenceableBytes(0);
+ auto Align = Resume->getParamAlign(0);
+
+ // If those aren't given, extract them from the type.
+ if (Size == 0 || !Align) {
+ auto *FrameTy = Resume->arg_begin()->getType()->getPointerElementType();
+
+ const DataLayout &DL = Resume->getParent()->getDataLayout();
+ if (!Size) Size = DL.getTypeAllocSize(FrameTy);
+ if (!Align) Align = DL.getABITypeAlign(FrameTy);
+ }
+
+ return std::make_pair(Size, *Align);
}
// Finds first non alloca instruction in the entry block of a function.
@@ -106,8 +122,9 @@ static Instruction *getFirstNonAllocaInTheEntryBlock(Function *F) {
// To elide heap allocations we need to suppress code blocks guarded by
// llvm.coro.alloc and llvm.coro.free instructions.
-void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
- LLVMContext &C = FrameTy->getContext();
+void Lowerer::elideHeapAllocations(Function *F, uint64_t FrameSize,
+ Align FrameAlign, AAResults &AA) {
+ LLVMContext &C = F->getContext();
auto *InsertPt =
getFirstNonAllocaInTheEntryBlock(CoroIds.front()->getFunction());
@@ -128,7 +145,9 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
// here. Possibly we will need to do a mini SROA here and break the coroutine
// frame into individual AllocaInst recreating the original alignment.
const DataLayout &DL = F->getParent()->getDataLayout();
+ auto FrameTy = ArrayType::get(Type::getInt8Ty(C), FrameSize);
auto *Frame = new AllocaInst(FrameTy, DL.getAllocaAddrSpace(), "", InsertPt);
+ Frame->setAlignment(FrameAlign);
auto *FrameVoidPtr =
new BitCastInst(Frame, Type::getInt8PtrTy(C), "vFrame", InsertPt);
@@ -142,44 +161,92 @@ void Lowerer::elideHeapAllocations(Function *F, Type *FrameTy, AAResults &AA) {
removeTailCallAttribute(Frame, AA);
}
+bool Lowerer::hasEscapePath(const CoroBeginInst *CB,
+ const SmallPtrSetImpl<BasicBlock *> &TIs) const {
+ const auto &It = DestroyAddr.find(CB);
+ assert(It != DestroyAddr.end());
+
+ // Limit the number of blocks we visit.
+ unsigned Limit = 32 * (1 + It->second.size());
+
+ SmallVector<const BasicBlock *, 32> Worklist;
+ Worklist.push_back(CB->getParent());
+
+ SmallPtrSet<const BasicBlock *, 32> Visited;
+ // Treat the basic blocks containing coro.destroy as visited, so that we
+ // skip any path that passes through a coro.destroy.
+ for (auto *DA : It->second)
+ Visited.insert(DA->getParent());
+
+ do {
+ const auto *BB = Worklist.pop_back_val();
+ if (!Visited.insert(BB).second)
+ continue;
+ if (TIs.count(BB))
+ return true;
+
+ // Conservatively say that there is potentially a path.
+ if (!--Limit)
+ return true;
+
+ auto TI = BB->getTerminator();
+ // Although the default destination of coro.suspend switches is the suspend
+ // block, which means an escape path to a normal terminator, it is reasonable
+ // to skip it since the coroutine frame doesn't change outside the coroutine
+ // body.
+ if (isa<SwitchInst>(TI) &&
+ CoroSuspendSwitches.count(cast<SwitchInst>(TI))) {
+ Worklist.push_back(cast<SwitchInst>(TI)->getSuccessor(1));
+ Worklist.push_back(cast<SwitchInst>(TI)->getSuccessor(2));
+ } else
+ Worklist.append(succ_begin(BB), succ_end(BB));
+
+ } while (!Worklist.empty());
+
+ // We have exhausted all possible paths and are certain that coro.begin
+ // cannot reach any of the terminators.
+ return false;
+}
+
bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
// If no CoroAllocs, we cannot suppress allocation, so elision is not
// possible.
if (CoroAllocs.empty())
return false;
- // Check that for every coro.begin there is a coro.destroy directly
- // referencing the SSA value of that coro.begin along a non-exceptional path.
+ // Check that for every coro.begin there is at least one coro.destroy directly
+ // referencing the SSA value of that coro.begin along each
+ // non-exceptional path.
// If the value escaped, then coro.destroy would have been referencing a
// memory location storing that value and not the virtual register.
+ SmallPtrSet<BasicBlock *, 8> Terminators;
// First gather all of the non-exceptional terminators for the function.
- SmallPtrSet<Instruction *, 8> Terminators;
- for (BasicBlock &B : *F) {
- auto *TI = B.getTerminator();
- if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() &&
- !isa<UnreachableInst>(TI))
- Terminators.insert(TI);
- }
+ // Consider the final coro.suspend as the real terminator when the current
+ // function is a coroutine.
+ for (BasicBlock &B : *F) {
+ auto *TI = B.getTerminator();
+ if (TI->getNumSuccessors() == 0 && !TI->isExceptionalTerminator() &&
+ !isa<UnreachableInst>(TI))
+ Terminators.insert(&B);
+ }
// Filter out the coro.destroy that lie along exceptional paths.
- SmallPtrSet<CoroSubFnInst *, 4> DAs;
- for (CoroSubFnInst *DA : DestroyAddr) {
- for (Instruction *TI : Terminators) {
- if (DT.dominates(DA, TI)) {
- DAs.insert(DA);
- break;
+ SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
+ for (auto &It : DestroyAddr) {
+ for (Instruction *DA : It.second) {
+ for (BasicBlock *TI : Terminators) {
+ if (DT.dominates(DA, TI->getTerminator())) {
+ ReferencedCoroBegins.insert(It.first);
+ break;
+ }
}
}
- }
- // Find all the coro.begin referenced by coro.destroy along happy paths.
- SmallPtrSet<CoroBeginInst *, 8> ReferencedCoroBegins;
- for (CoroSubFnInst *DA : DAs) {
- if (auto *CB = dyn_cast<CoroBeginInst>(DA->getFrame()))
- ReferencedCoroBegins.insert(CB);
- else
- return false;
+ // Check whether there is any path from coro.begin to the terminators that
+ // does not pass through any of the coro.destroys.
+ if (!ReferencedCoroBegins.count(It.first) &&
+ !hasEscapePath(It.first, Terminators))
+ ReferencedCoroBegins.insert(It.first);
}
// If size of the set is the same as total number of coro.begin, that means we
@@ -188,6 +255,30 @@ bool Lowerer::shouldElide(Function *F, DominatorTree &DT) const {
return ReferencedCoroBegins.size() == CoroBegins.size();
}
+void Lowerer::collectPostSplitCoroIds(Function *F) {
+ CoroIds.clear();
+ CoroSuspendSwitches.clear();
+ for (auto &I : instructions(F)) {
+ if (auto *CII = dyn_cast<CoroIdInst>(&I))
+ if (CII->getInfo().isPostSplit())
+ // If it is the coroutine itself, don't touch it.
+ if (CII->getCoroutine() != CII->getFunction())
+ CoroIds.push_back(CII);
+
+ // Consider a case like:
+ // %0 = call i8 @llvm.coro.suspend(...)
+ // switch i8 %0, label %suspend [i8 0, label %resume
+ // i8 1, label %cleanup]
+ // and collect the SwitchInsts which are used by escape analysis later.
+ if (auto *CSI = dyn_cast<CoroSuspendInst>(&I))
+ if (CSI->hasOneUse() && isa<SwitchInst>(CSI->use_begin()->getUser())) {
+ SwitchInst *SWI = cast<SwitchInst>(CSI->use_begin()->getUser());
+ if (SWI->getNumCases() == 2)
+ CoroSuspendSwitches.insert(SWI);
+ }
+ }
+}
+
bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
DominatorTree &DT) {
CoroBegins.clear();
@@ -218,7 +309,7 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
ResumeAddr.push_back(II);
break;
case CoroSubFnInst::DestroyIndex:
- DestroyAddr.push_back(II);
+ DestroyAddr[CB].push_back(II);
break;
default:
llvm_unreachable("unexpected coro.subfn.addr constant");
@@ -241,11 +332,13 @@ bool Lowerer::processCoroId(CoroIdInst *CoroId, AAResults &AA,
Resumers,
ShouldElide ? CoroSubFnInst::CleanupIndex : CoroSubFnInst::DestroyIndex);
- replaceWithConstant(DestroyAddrConstant, DestroyAddr);
+ for (auto &It : DestroyAddr)
+ replaceWithConstant(DestroyAddrConstant, It.second);
if (ShouldElide) {
- auto *FrameTy = getFrameType(cast<Function>(ResumeAddrConstant));
- elideHeapAllocations(CoroId->getFunction(), FrameTy, AA);
+ auto FrameSizeAndAlign = getFrameLayout(cast<Function>(ResumeAddrConstant));
+ elideHeapAllocations(CoroId->getFunction(), FrameSizeAndAlign.first,
+ FrameSizeAndAlign.second, AA);
coro::replaceCoroFree(CoroId, /*Elide=*/true);
}
@@ -272,9 +365,31 @@ static bool replaceDevirtTrigger(Function &F) {
return true;
}
-//===----------------------------------------------------------------------===//
-// Top Level Driver
-//===----------------------------------------------------------------------===//
+static bool declaresCoroElideIntrinsics(Module &M) {
+ return coro::declaresIntrinsics(M, {"llvm.coro.id"});
+}
+
+PreservedAnalyses CoroElidePass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &M = *F.getParent();
+ if (!declaresCoroElideIntrinsics(M))
+ return PreservedAnalyses::all();
+
+ Lowerer L(M);
+ L.CoroIds.clear();
+ L.collectPostSplitCoroIds(&F);
+ // If we did not find any coro.id, there is nothing to do.
+ if (L.CoroIds.empty())
+ return PreservedAnalyses::all();
+
+ AAResults &AA = AM.getResult<AAManager>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = false;
+ for (auto *CII : L.CoroIds)
+ Changed |= L.processCoroId(CII, AA, DT);
+
+ return Changed ? PreservedAnalyses::none() : PreservedAnalyses::all();
+}
namespace {
struct CoroElideLegacy : FunctionPass {
@@ -286,7 +401,7 @@ struct CoroElideLegacy : FunctionPass {
std::unique_ptr<Lowerer> L;
bool doInitialization(Module &M) override {
- if (coro::declaresIntrinsics(M, {"llvm.coro.id"}))
+ if (declaresCoroElideIntrinsics(M))
L = std::make_unique<Lowerer>(M);
return false;
}
@@ -301,15 +416,7 @@ struct CoroElideLegacy : FunctionPass {
Changed = replaceDevirtTrigger(F);
L->CoroIds.clear();
-
- // Collect all PostSplit coro.ids.
- for (auto &I : instructions(F))
- if (auto *CII = dyn_cast<CoroIdInst>(&I))
- if (CII->getInfo().isPostSplit())
- // If it is the coroutine itself, don't touch it.
- if (CII->getCoroutine() != CII->getFunction())
- L->CoroIds.push_back(CII);
-
+ L->collectPostSplitCoroIds(&F);
// If we did not find any coro.id, there is nothing to do.
if (L->CoroIds.empty())
return Changed;
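hasEscapePath() above is a bounded worklist walk over the CFG: the coro.destroy blocks are pre-marked as visited, and the walk conservatively answers "escapes" when a non-exceptional terminator is reached or the visit budget runs out. A generic, self-contained sketch of the same shape (mayReach and its graph types are invented for illustration):

#include <unordered_set>
#include <vector>

using Node = int;
using Graph = std::vector<std::vector<Node>>; // adjacency list

// Bounded reachability in the spirit of hasEscapePath(): can Start reach a
// Target node without passing through a Blocked node, visiting at most
// Limit nodes? Exhausting the budget answers true (assume a path exists).
static bool mayReach(const Graph &G, Node Start,
                     const std::unordered_set<Node> &Targets,
                     const std::unordered_set<Node> &Blocked,
                     unsigned Limit) {
  std::vector<Node> Worklist{Start};
  std::unordered_set<Node> Visited(Blocked); // blocked nodes count as seen
  while (!Worklist.empty()) {
    Node N = Worklist.back();
    Worklist.pop_back();
    if (!Visited.insert(N).second)
      continue;
    if (Targets.count(N))
      return true;
    if (--Limit == 0)
      return true; // out of budget: conservatively report a path
    for (Node Succ : G[N])
      Worklist.push_back(Succ);
  }
  return false; // every path hits a blocked node first
}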
diff --git a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
index 2c42cf8a6d25..f55501a05d85 100644
--- a/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroFrame.cpp
@@ -18,18 +18,22 @@
#include "CoroInternal.h"
#include "llvm/ADT/BitVector.h"
+#include "llvm/ADT/SmallString.h"
#include "llvm/Analysis/PtrUseVisitor.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/CFG.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/circular_raw_ostream.h"
+#include "llvm/Support/OptimizedStructLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
+#include <algorithm>
using namespace llvm;
@@ -105,7 +109,6 @@ struct SuspendCrossingInfo {
size_t const DefIndex = Mapping.blockToIndex(DefBB);
size_t const UseIndex = Mapping.blockToIndex(UseBB);
- assert(Block[UseIndex].Consumes[DefIndex] && "use must consume def");
bool const Result = Block[UseIndex].Kills[DefIndex];
LLVM_DEBUG(dbgs() << UseBB->getName() << " => " << DefBB->getName()
<< " answer is " << Result << "\n");
@@ -338,52 +341,182 @@ namespace {
// coroutine frame and if the alignment specified on the Alloca instruction
// differs from the natural alignment of the alloca type we will need to insert
// padding.
-struct PaddingCalculator {
+class FrameTypeBuilder {
+ struct Field {
+ uint64_t Size;
+ uint64_t Offset;
+ Spill *ForSpill;
+ Type *Ty;
+ unsigned FieldIndex;
+ Align Alignment;
+ Align TyAlignment;
+ };
+
const DataLayout &DL;
LLVMContext &Context;
- unsigned StructSize = 0;
+ uint64_t StructSize = 0;
+ Align StructAlign;
+ bool IsFinished = false;
- PaddingCalculator(LLVMContext &Context, DataLayout const &DL)
- : DL(DL), Context(Context) {}
+ SmallVector<Field, 8> Fields;
+ DenseMap<Value*, unsigned> FieldIndexByKey;
- // Replicate the logic from IR/DataLayout.cpp to match field offset
- // computation for LLVM structs.
- void addType(Type *Ty) {
- unsigned TyAlign = DL.getABITypeAlignment(Ty);
- if ((StructSize & (TyAlign - 1)) != 0)
- StructSize = alignTo(StructSize, TyAlign);
+public:
+ FrameTypeBuilder(LLVMContext &Context, DataLayout const &DL)
+ : DL(DL), Context(Context) {}
- StructSize += DL.getTypeAllocSize(Ty); // Consume space for this data item.
- }
+ class FieldId {
+ size_t Value;
+ explicit FieldId(size_t Value) : Value(Value) {}
+
+ friend class FrameTypeBuilder;
+ };
+
+ /// Add a field to this structure for the storage of an `alloca`
+ /// instruction.
+ FieldId addFieldForAlloca(AllocaInst *AI, Spill *ForSpill = nullptr,
+ bool IsHeader = false) {
+ Type *Ty = AI->getAllocatedType();
+
+ // Make an array type if this is a static array allocation.
+ if (AI->isArrayAllocation()) {
+ if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
+ Ty = ArrayType::get(Ty, CI->getValue().getZExtValue());
+ else
+ report_fatal_error("Coroutines cannot handle non static allocas yet");
+ }
- void addTypes(SmallVectorImpl<Type *> const &Types) {
- for (auto *Ty : Types)
- addType(Ty);
+ return addField(Ty, AI->getAlign(), ForSpill, IsHeader);
}
- unsigned computePadding(Type *Ty, unsigned ForcedAlignment) {
- unsigned TyAlign = DL.getABITypeAlignment(Ty);
- auto Natural = alignTo(StructSize, TyAlign);
- auto Forced = alignTo(StructSize, ForcedAlignment);
+ /// Add a field to this structure.
+ FieldId addField(Type *Ty, MaybeAlign FieldAlignment,
+ Spill *ForSpill = nullptr,
+ bool IsHeader = false) {
+ assert(!IsFinished && "adding fields to a finished builder");
+ assert(Ty && "must provide a type for a field");
- // Return how many bytes of padding we need to insert.
- if (Natural != Forced)
- return std::max(Natural, Forced) - StructSize;
+ // The field size is always the alloc size of the type.
+ uint64_t FieldSize = DL.getTypeAllocSize(Ty);
- // Rely on natural alignment.
- return 0;
+ // The field alignment might not be the type alignment, but we need
+ // to remember the type alignment anyway to build the type.
+ Align TyAlignment = DL.getABITypeAlign(Ty);
+ if (!FieldAlignment) FieldAlignment = TyAlignment;
+
+ // Lay out header fields immediately.
+ uint64_t Offset;
+ if (IsHeader) {
+ Offset = alignTo(StructSize, FieldAlignment);
+ StructSize = Offset + FieldSize;
+
+ // Everything else has a flexible offset.
+ } else {
+ Offset = OptimizedStructLayoutField::FlexibleOffset;
+ }
+
+ Fields.push_back({FieldSize, Offset, ForSpill, Ty, 0,
+ *FieldAlignment, TyAlignment});
+ return FieldId(Fields.size() - 1);
+ }
+
+ /// Finish the layout and set the body on the given type.
+ void finish(StructType *Ty);
+
+ uint64_t getStructSize() const {
+ assert(IsFinished && "not yet finished!");
+ return StructSize;
}
- // If padding required, return the padding field type to insert.
- ArrayType *getPaddingType(Type *Ty, unsigned ForcedAlignment) {
- if (auto Padding = computePadding(Ty, ForcedAlignment))
- return ArrayType::get(Type::getInt8Ty(Context), Padding);
+ Align getStructAlign() const {
+ assert(IsFinished && "not yet finished!");
+ return StructAlign;
+ }
- return nullptr;
+ unsigned getFieldIndex(FieldId Id) const {
+ assert(IsFinished && "not yet finished!");
+ return Fields[Id.Value].FieldIndex;
}
};
} // namespace
+void FrameTypeBuilder::finish(StructType *Ty) {
+ assert(!IsFinished && "already finished!");
+
+ // Prepare the optimal-layout field array.
+ // The Id in the layout field is a pointer to our Field for it.
+ SmallVector<OptimizedStructLayoutField, 8> LayoutFields;
+ LayoutFields.reserve(Fields.size());
+ for (auto &Field : Fields) {
+ LayoutFields.emplace_back(&Field, Field.Size, Field.Alignment,
+ Field.Offset);
+ }
+
+ // Perform layout.
+ auto SizeAndAlign = performOptimizedStructLayout(LayoutFields);
+ StructSize = SizeAndAlign.first;
+ StructAlign = SizeAndAlign.second;
+
+ auto getField = [](const OptimizedStructLayoutField &LayoutField) -> Field & {
+ return *static_cast<Field *>(const_cast<void*>(LayoutField.Id));
+ };
+
+ // We need to produce a packed struct type if there's a field whose
+ // assigned offset isn't a multiple of its natural type alignment.
+ bool Packed = [&] {
+ for (auto &LayoutField : LayoutFields) {
+ auto &F = getField(LayoutField);
+ if (!isAligned(F.TyAlignment, LayoutField.Offset))
+ return true;
+ }
+ return false;
+ }();
+
+ // Build the struct body.
+ SmallVector<Type*, 16> FieldTypes;
+ FieldTypes.reserve(LayoutFields.size() * 3 / 2);
+ uint64_t LastOffset = 0;
+ for (auto &LayoutField : LayoutFields) {
+ auto &F = getField(LayoutField);
+
+ auto Offset = LayoutField.Offset;
+
+ // Add a padding field if there's a padding gap and we're either
+ // building a packed struct or the padding gap is more than we'd
+ // get from aligning to the field type's natural alignment.
+ assert(Offset >= LastOffset);
+ if (Offset != LastOffset) {
+ if (Packed || alignTo(LastOffset, F.TyAlignment) != Offset)
+ FieldTypes.push_back(ArrayType::get(Type::getInt8Ty(Context),
+ Offset - LastOffset));
+ }
+
+ // Record the layout information into both the Field and the
+ // original Spill, if there is one.
+ F.Offset = Offset;
+ F.FieldIndex = FieldTypes.size();
+ if (F.ForSpill) {
+ F.ForSpill->setFieldIndex(F.FieldIndex);
+ }
+
+ FieldTypes.push_back(F.Ty);
+ LastOffset = Offset + F.Size;
+ }
+
+ Ty->setBody(FieldTypes, Packed);
+
+#ifndef NDEBUG
+ // Check that the IR layout matches the offsets we expect.
+ auto Layout = DL.getStructLayout(Ty);
+ for (auto &F : Fields) {
+ assert(Ty->getElementType(F.FieldIndex) == F.Ty);
+ assert(Layout->getElementOffset(F.FieldIndex) == F.Offset);
+ }
+#endif
+
+ IsFinished = true;
+}
+
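As a rough illustration of how the builder above is meant to be driven (a hedged sketch, not code from this patch; `Ctx` and `DL` are assumed to come from the module being transformed):

    // Sketch: two fixed header fields plus one flexibly-placed field,
    // then query the finished layout.
    FrameTypeBuilder B(Ctx, DL);
    StructType *FrameTy = StructType::create(Ctx, "example.Frame");
    Type *I8Ptr = Type::getInt8PtrTy(Ctx);                    // stand-in for the fn ptr fields
    B.addField(I8Ptr, None, nullptr, /*IsHeader=*/true);      // resume slot
    B.addField(I8Ptr, None, nullptr, /*IsHeader=*/true);      // destroy slot
    auto IndexId = B.addField(Type::getIntNTy(Ctx, 2), None); // suspend index, flexible offset
    B.finish(FrameTy);                                        // runs the optimized struct layout
    unsigned IndexField = B.getFieldIndex(IndexId);
    uint64_t FrameSize  = B.getStructSize();
    Align    FrameAlign = B.getStructAlign();
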
// Build a struct that will keep state for an active coroutine.
// struct f.frame {
// ResumeFnTy ResumeFnAddr;
@@ -396,13 +529,17 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
SpillInfo &Spills) {
LLVMContext &C = F.getContext();
const DataLayout &DL = F.getParent()->getDataLayout();
- PaddingCalculator Padder(C, DL);
- SmallString<32> Name(F.getName());
- Name.append(".Frame");
- StructType *FrameTy = StructType::create(C, Name);
- SmallVector<Type *, 8> Types;
+ StructType *FrameTy = [&] {
+ SmallString<32> Name(F.getName());
+ Name.append(".Frame");
+ return StructType::create(C, Name);
+ }();
+
+ FrameTypeBuilder B(C, DL);
AllocaInst *PromiseAlloca = Shape.getPromiseAlloca();
+ Optional<FrameTypeBuilder::FieldId> PromiseFieldId;
+ Optional<FrameTypeBuilder::FieldId> SwitchIndexFieldId;
if (Shape.ABI == coro::ABI::Switch) {
auto *FramePtrTy = FrameTy->getPointerTo();
@@ -410,74 +547,74 @@ static StructType *buildFrameType(Function &F, coro::Shape &Shape,
/*IsVarArg=*/false);
auto *FnPtrTy = FnTy->getPointerTo();
- // Figure out how wide should be an integer type storing the suspend index.
+ // Add header fields for the resume and destroy functions.
+ // We can rely on these being perfectly packed.
+ B.addField(FnPtrTy, None, nullptr, /*header*/ true);
+ B.addField(FnPtrTy, None, nullptr, /*header*/ true);
+
+ // Add a header field for the promise if there is one.
+ if (PromiseAlloca) {
+ PromiseFieldId =
+ B.addFieldForAlloca(PromiseAlloca, nullptr, /*header*/ true);
+ }
+
+ // Add a field to store the suspend index. This doesn't need to
+ // be in the header.
unsigned IndexBits = std::max(1U, Log2_64_Ceil(Shape.CoroSuspends.size()));
- Type *PromiseType = PromiseAlloca
- ? PromiseAlloca->getType()->getElementType()
- : Type::getInt1Ty(C);
Type *IndexType = Type::getIntNTy(C, IndexBits);
- Types.push_back(FnPtrTy);
- Types.push_back(FnPtrTy);
- Types.push_back(PromiseType);
- Types.push_back(IndexType);
+
+ SwitchIndexFieldId = B.addField(IndexType, None);
} else {
assert(PromiseAlloca == nullptr && "lowering doesn't support promises");
}
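For a sense of scale (illustrative arithmetic only, using the `Log2_64_Ceil` computation above):

    // 1 suspend  -> max(1, Log2_64_Ceil(1)) = max(1, 0) = 1 bit  (i1)
    // 2 suspends -> max(1, Log2_64_Ceil(2)) = max(1, 1) = 1 bit  (i1)
    // 5 suspends -> max(1, Log2_64_Ceil(5)) = max(1, 3) = 3 bits (i3)
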
Value *CurrentDef = nullptr;
- Padder.addTypes(Types);
-
// Create an entry for every spilled value.
for (auto &S : Spills) {
+ // We can have multiple entries in Spills for a single value, but
+ // they should form a contiguous run. Ignore all but the first.
if (CurrentDef == S.def())
continue;
CurrentDef = S.def();
- // PromiseAlloca was already added to Types array earlier.
- if (CurrentDef == PromiseAlloca)
- continue;
- uint64_t Count = 1;
- Type *Ty = nullptr;
+ assert(CurrentDef != PromiseAlloca &&
+ "recorded spill use of promise alloca?");
+
if (auto *AI = dyn_cast<AllocaInst>(CurrentDef)) {
- Ty = AI->getAllocatedType();
- if (unsigned AllocaAlignment = AI->getAlignment()) {
- // If alignment is specified in alloca, see if we need to insert extra
- // padding.
- if (auto PaddingTy = Padder.getPaddingType(Ty, AllocaAlignment)) {
- Types.push_back(PaddingTy);
- Padder.addType(PaddingTy);
- }
- }
- if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
- Count = CI->getValue().getZExtValue();
- else
- report_fatal_error("Coroutines cannot handle non static allocas yet");
+ B.addFieldForAlloca(AI, &S);
} else {
- Ty = CurrentDef->getType();
+ Type *Ty = CurrentDef->getType();
+ B.addField(Ty, None, &S);
}
- S.setFieldIndex(Types.size());
- if (Count == 1)
- Types.push_back(Ty);
- else
- Types.push_back(ArrayType::get(Ty, Count));
- Padder.addType(Ty);
}
- FrameTy->setBody(Types);
+
+ B.finish(FrameTy);
+ Shape.FrameAlign = B.getStructAlign();
+ Shape.FrameSize = B.getStructSize();
switch (Shape.ABI) {
+ // In the switch ABI, remember the field indices for the promise and
+ // switch-index fields.
case coro::ABI::Switch:
+ Shape.SwitchLowering.IndexField =
+ B.getFieldIndex(*SwitchIndexFieldId);
+ Shape.SwitchLowering.PromiseField =
+ (PromiseAlloca ? B.getFieldIndex(*PromiseFieldId) : 0);
+
+ // Also round the frame size up to a multiple of its alignment, as is
+ // generally expected in C/C++.
+ Shape.FrameSize = alignTo(Shape.FrameSize, Shape.FrameAlign);
break;
- // Remember whether the frame is inline in the storage.
+ // In the retcon ABI, remember whether the frame is inline in the storage.
case coro::ABI::Retcon:
case coro::ABI::RetconOnce: {
- auto &Layout = F.getParent()->getDataLayout();
auto Id = Shape.getRetconCoroId();
Shape.RetconLowering.IsFrameInlineInStorage
- = (Layout.getTypeAllocSize(FrameTy) <= Id->getStorageSize() &&
- Layout.getABITypeAlignment(FrameTy) <= Id->getStorageAlignment());
+ = (B.getStructSize() <= Id->getStorageSize() &&
+ B.getStructAlign() <= Id->getStorageAlignment());
break;
}
}
@@ -606,10 +743,12 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
// we remember allocas and their indices to be handled once we processed
// all the spills.
SmallVector<std::pair<AllocaInst *, unsigned>, 4> Allocas;
- // Promise alloca (if present) has a fixed field number.
+
+ // Promise alloca (if present) doesn't show up in the spills and has a
+ // special field number.
if (auto *PromiseAlloca = Shape.getPromiseAlloca()) {
assert(Shape.ABI == coro::ABI::Switch);
- Allocas.emplace_back(PromiseAlloca, coro::Shape::SwitchFieldIndex::Promise);
+ Allocas.emplace_back(PromiseAlloca, Shape.getPromiseField());
}
// Create a GEP with the given index into the coroutine frame for the original
@@ -636,12 +775,12 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
};
// Create a load instruction to reload the spilled value from the coroutine
- // frame.
- auto CreateReload = [&](Instruction *InsertBefore) {
+ // frame. Populates the provided Value reference with the frame GEP.
+ auto CreateReload = [&](Instruction *InsertBefore, Value *&G) {
assert(Index != InvalidFieldIndex && "accessing unassigned field number");
Builder.SetInsertPoint(InsertBefore);
- auto *G = GetFramePointer(Index, CurrentValue);
+ G = GetFramePointer(Index, CurrentValue);
G->setName(CurrentValue->getName() + Twine(".reload.addr"));
return isa<AllocaInst>(CurrentValue)
@@ -650,6 +789,7 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
CurrentValue->getName() + Twine(".reload"));
};
+ Value *GEP = nullptr, *CurrentGEP = nullptr;
for (auto const &E : Spills) {
// If we have not seen the value, generate a spill.
if (CurrentValue != E.def()) {
@@ -722,7 +862,7 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
// If we have not seen the use block, generate a reload in it.
if (CurrentBlock != E.userBlock()) {
CurrentBlock = E.userBlock();
- CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt());
+ CurrentReload = CreateReload(&*CurrentBlock->getFirstInsertionPt(), GEP);
}
// If we have a single edge PHINode, remove it and replace it with a reload
@@ -736,6 +876,19 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
continue;
}
+ // If we have not seen this GEP instruction, migrate any dbg.declare from
+ // the alloca to it.
+ if (CurrentGEP != GEP) {
+ CurrentGEP = GEP;
+ TinyPtrVector<DbgDeclareInst *> DIs = FindDbgDeclareUses(CurrentValue);
+ if (!DIs.empty())
+ DIBuilder(*CurrentBlock->getParent()->getParent(),
+ /*AllowUnresolved*/ false)
+ .insertDeclare(CurrentGEP, DIs.front()->getVariable(),
+ DIs.front()->getExpression(),
+ DIs.front()->getDebugLoc(), DIs.front());
+ }
+
// Replace all uses of CurrentValue in the current instruction with reload.
E.user()->replaceUsesOfWith(CurrentValue, CurrentReload);
}
@@ -746,14 +899,38 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
FramePtrBB->splitBasicBlock(FramePtr->getNextNode(), "AllocaSpillBB");
SpillBlock->splitBasicBlock(&SpillBlock->front(), "PostSpill");
Shape.AllocaSpillBlock = SpillBlock;
- // If we found any allocas, replace all of their remaining uses with Geps.
- // Note: we cannot do it indiscriminately as some of the uses may not be
- // dominated by CoroBegin.
+
+ // retcon and retcon.once lowering assumes all uses have been sunk.
+ if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce) {
+ // If we found any allocas, replace all of their remaining uses with Geps.
+ Builder.SetInsertPoint(&SpillBlock->front());
+ for (auto &P : Allocas) {
+ auto *G = GetFramePointer(P.second, P.first);
+
+ // We are not using ReplaceInstWithInst(P.first, cast<Instruction>(G))
+ // here, as we are changing location of the instruction.
+ G->takeName(P.first);
+ P.first->replaceAllUsesWith(G);
+ P.first->eraseFromParent();
+ }
+ return FramePtr;
+ }
+
+ // If we found any allocas, replace all of their remaining uses with GEP
+ // instructions. Because new dbg.declare intrinsics have been created for
+ // these allocas, we also delete the original dbg.declare intrinsics and
+ // replace the remaining debug uses with undef.
+ // Note: We cannot replace the alloca with GEP instructions indiscriminately,
+ // as some of the uses may not be dominated by CoroBegin.
bool MightNeedToCopy = false;
Builder.SetInsertPoint(&Shape.AllocaSpillBlock->front());
SmallVector<Instruction *, 4> UsersToUpdate;
for (auto &P : Allocas) {
AllocaInst *const A = P.first;
+
+ for (auto *DI : FindDbgDeclareUses(A))
+ DI->eraseFromParent();
+ replaceDbgUsesWithUndef(A);
+
UsersToUpdate.clear();
for (User *U : A->users()) {
auto *I = cast<Instruction>(U);
@@ -784,7 +961,7 @@ static Instruction *insertSpills(const SpillInfo &Spills, coro::Shape &Shape) {
"Coroutines cannot handle copying of array allocas yet");
auto *G = GetFramePointer(P.second, A);
- auto *Value = Builder.CreateLoad(A);
+ auto *Value = Builder.CreateLoad(A->getAllocatedType(), A);
Builder.CreateStore(Value, G);
}
}
@@ -1106,7 +1283,7 @@ static void lowerLocalAllocas(ArrayRef<CoroAllocaAllocInst*> LocalAllocas,
// Allocate memory.
auto Alloca = Builder.CreateAlloca(Builder.getInt8Ty(), AI->getSize());
- Alloca->setAlignment(MaybeAlign(AI->getAlignment()));
+ Alloca->setAlignment(Align(AI->getAlignment()));
for (auto U : AI->users()) {
// Replace gets with the allocation.
@@ -1166,7 +1343,7 @@ static Value *emitGetSwiftErrorValue(IRBuilder<> &Builder, Type *ValueTy,
auto FnTy = FunctionType::get(ValueTy, {}, false);
auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());
- auto Call = Builder.CreateCall(Fn, {});
+ auto Call = Builder.CreateCall(FnTy, Fn, {});
Shape.SwiftErrorOps.push_back(Call);
return Call;
@@ -1182,7 +1359,7 @@ static Value *emitSetSwiftErrorValue(IRBuilder<> &Builder, Value *V,
{V->getType()}, false);
auto Fn = ConstantPointerNull::get(FnTy->getPointerTo());
- auto Call = Builder.CreateCall(Fn, { V });
+ auto Call = Builder.CreateCall(FnTy, Fn, { V });
Shape.SwiftErrorOps.push_back(Call);
return Call;
@@ -1322,11 +1499,125 @@ static void eliminateSwiftError(Function &F, coro::Shape &Shape) {
}
}
-void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
- // Lower coro.dbg.declare to coro.dbg.value, since we are going to rewrite
- // access to local variables.
- LowerDbgDeclare(F);
+/// retcon and retcon.once conventions assume that all spill uses can be sunk
+/// after the coro.begin intrinsic.
+static void sinkSpillUsesAfterCoroBegin(Function &F, const SpillInfo &Spills,
+ CoroBeginInst *CoroBegin) {
+ DominatorTree Dom(F);
+
+ SmallSetVector<Instruction *, 32> ToMove;
+ SmallVector<Instruction *, 32> Worklist;
+
+ // Collect all users that precede coro.begin.
+ for (auto const &Entry : Spills) {
+ auto *SpillDef = Entry.def();
+ for (User *U : SpillDef->users()) {
+ auto Inst = cast<Instruction>(U);
+ if (Inst->getParent() != CoroBegin->getParent() ||
+ Dom.dominates(CoroBegin, Inst))
+ continue;
+ if (ToMove.insert(Inst))
+ Worklist.push_back(Inst);
+ }
+ }
+ // Recursively collect users before coro.begin.
+ while (!Worklist.empty()) {
+ auto *Def = Worklist.back();
+ Worklist.pop_back();
+ for (User *U : Def->users()) {
+ auto Inst = cast<Instruction>(U);
+ if (Dom.dominates(CoroBegin, Inst))
+ continue;
+ if (ToMove.insert(Inst))
+ Worklist.push_back(Inst);
+ }
+ }
+
+ // Sort by dominance.
+ SmallVector<Instruction *, 64> InsertionList(ToMove.begin(), ToMove.end());
+ std::sort(InsertionList.begin(), InsertionList.end(),
+ [&Dom](Instruction *A, Instruction *B) -> bool {
+ // If a dominates b it should preceed (<) b.
+ return Dom.dominates(A, B);
+ });
+ Instruction *InsertPt = CoroBegin->getNextNode();
+ for (Instruction *Inst : InsertionList)
+ Inst->moveBefore(InsertPt);
+
+ return;
+}
+
+/// For each local variable whose uses are all confined to a single suspended
+/// region, sink its lifetime.start marker into the block that follows the
+/// suspend. Doing so minimizes the lifetime of each variable and hence the
+/// amount of data we end up putting on the frame.
+static void sinkLifetimeStartMarkers(Function &F, coro::Shape &Shape,
+ SuspendCrossingInfo &Checker) {
+ DominatorTree DT(F);
+
+ // Collect all possible basic blocks which may dominate all uses of allocas.
+ SmallPtrSet<BasicBlock *, 4> DomSet;
+ DomSet.insert(&F.getEntryBlock());
+ for (auto *CSI : Shape.CoroSuspends) {
+ BasicBlock *SuspendBlock = CSI->getParent();
+ assert(isSuspendBlock(SuspendBlock) && SuspendBlock->getSingleSuccessor() &&
+ "should have split coro.suspend into its own block");
+ DomSet.insert(SuspendBlock->getSingleSuccessor());
+ }
+
+ for (Instruction &I : instructions(F)) {
+ if (!isa<AllocaInst>(&I))
+ continue;
+
+ for (BasicBlock *DomBB : DomSet) {
+ bool Valid = true;
+ SmallVector<Instruction *, 1> BCInsts;
+
+ auto isUsedByLifetimeStart = [&](Instruction *I) {
+ if (isa<BitCastInst>(I) && I->hasOneUse())
+ if (auto *IT = dyn_cast<IntrinsicInst>(I->user_back()))
+ return IT->getIntrinsicID() == Intrinsic::lifetime_start;
+ return false;
+ };
+
+ for (User *U : I.users()) {
+ Instruction *UI = cast<Instruction>(U);
+ // Aside from lifetime.start markers, if all users are dominated by
+ // one of the candidate basic blocks and none of them crosses a
+ // suspend point, then there is no need to spill the instruction.
+ if (!DT.dominates(DomBB, UI->getParent()) ||
+ Checker.isDefinitionAcrossSuspend(DomBB, U)) {
+ // Skip (and collect) bitcasts that are only used by lifetime.start markers.
+ if (isUsedByLifetimeStart(UI)) {
+ BCInsts.push_back(UI);
+ continue;
+ }
+ Valid = false;
+ break;
+ }
+ }
+ // Sink the lifetime.start markers into the dominating block when they
+ // are the only users left outside that region.
+ if (Valid && BCInsts.size() != 0) {
+ auto *NewBitcast = BCInsts[0]->clone();
+ auto *NewLifetime = cast<Instruction>(BCInsts[0]->user_back())->clone();
+ NewLifetime->replaceUsesOfWith(BCInsts[0], NewBitcast);
+ NewBitcast->insertBefore(DomBB->getTerminator());
+ NewLifetime->insertBefore(DomBB->getTerminator());
+
+ // The original lifetime.start markers outside the region are no longer necessary.
+ for (Instruction *S : BCInsts) {
+ S->user_back()->eraseFromParent();
+ }
+ break;
+ }
+ }
+ }
+}
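To make the effect of this sinking concrete, here is a hedged source-level illustration (the `task`, `sleep_async`, and `consume` names are invented for the example and do not come from this patch):

    // Before sinking, Buf's lifetime.start sits in the entry block, so its
    // live range crosses the suspend and Buf would be forced into the frame.
    // With the marker sunk into the block after the suspend (and with the
    // lifetime-aware spill check later in buildCoroutineFrame), Buf no
    // longer needs a frame slot.
    task coroutineExample() {
      char Buf[512];             // trivially-constructed local (an alloca)
      co_await sleep_async();    // suspend point
      consume(Buf);              // the only real use, after the suspend
    }
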
+
+void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
eliminateSwiftError(F, Shape);
if (Shape.ABI == coro::ABI::Switch &&
@@ -1376,6 +1667,25 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
Spills.clear();
}
+ sinkLifetimeStartMarkers(F, Shape, Checker);
+ // Collect lifetime.start info for each alloca.
+ using LifetimeStart = SmallPtrSet<Instruction *, 2>;
+ llvm::DenseMap<Instruction *, std::unique_ptr<LifetimeStart>> LifetimeMap;
+ for (Instruction &I : instructions(F)) {
+ auto *II = dyn_cast<IntrinsicInst>(&I);
+ if (!II || II->getIntrinsicID() != Intrinsic::lifetime_start)
+ continue;
+
+ if (auto *OpInst = dyn_cast<BitCastInst>(I.getOperand(1)))
+ if (auto *AI = dyn_cast<AllocaInst>(OpInst->getOperand(0))) {
+
+ if (LifetimeMap.find(AI) == LifetimeMap.end())
+ LifetimeMap[AI] = std::make_unique<LifetimeStart>();
+
+ LifetimeMap[AI]->insert(OpInst);
+ }
+ }
+
// Collect the spills for arguments and other not-materializable values.
for (Argument &A : F.args())
for (User *U : A.users())
@@ -1421,16 +1731,31 @@ void coro::buildCoroutineFrame(Function &F, Shape &Shape) {
continue;
}
- for (User *U : I.users())
- if (Checker.isDefinitionAcrossSuspend(I, U)) {
+ auto Iter = LifetimeMap.find(&I);
+ for (User *U : I.users()) {
+ bool NeedSpill = false;
+
+ // Check against lifetime.start if the instruction has the info.
+ if (Iter != LifetimeMap.end())
+ for (auto *S : *Iter->second) {
+ if ((NeedSpill = Checker.isDefinitionAcrossSuspend(*S, U)))
+ break;
+ }
+ else
+ NeedSpill = Checker.isDefinitionAcrossSuspend(I, U);
+
+ if (NeedSpill) {
// We cannot spill a token.
if (I.getType()->isTokenTy())
report_fatal_error(
"token definition is separated from the use by a suspend point");
Spills.emplace_back(&I, U);
}
+ }
}
LLVM_DEBUG(dump("Spills", Spills));
+ if (Shape.ABI == coro::ABI::Retcon || Shape.ABI == coro::ABI::RetconOnce)
+ sinkSpillUsesAfterCoroBegin(F, Spills, Shape.CoroBegin);
Shape.FrameTy = buildFrameType(F, Shape, Spills);
Shape.FramePtr = insertSpills(Spills, Shape);
lowerLocalAllocas(LocalAllocas, DeadInstructions);
diff --git a/llvm/lib/Transforms/Coroutines/CoroInstr.h b/llvm/lib/Transforms/Coroutines/CoroInstr.h
index de2d2920cb15..320137526db8 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInstr.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInstr.h
@@ -211,8 +211,8 @@ public:
return cast<ConstantInt>(getArgOperand(SizeArg))->getZExtValue();
}
- uint64_t getStorageAlignment() const {
- return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue();
+ Align getStorageAlignment() const {
+ return cast<ConstantInt>(getArgOperand(AlignArg))->getAlignValue();
}
Value *getStorage() const {
@@ -338,11 +338,16 @@ class LLVM_LIBRARY_VISIBILITY CoroPromiseInst : public IntrinsicInst {
enum { FrameArg, AlignArg, FromArg };
public:
+ /// Are we translating from the frame to the promise (false) or from
+ /// the promise to the frame (true)?
bool isFromPromise() const {
return cast<Constant>(getArgOperand(FromArg))->isOneValue();
}
- unsigned getAlignment() const {
- return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue();
+
+ /// The required alignment of the promise. This must match the
+ /// alignment of the promise alloca in the coroutine.
+ Align getAlignment() const {
+ return cast<ConstantInt>(getArgOperand(AlignArg))->getAlignValue();
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
@@ -463,8 +468,8 @@ public:
Value *getSize() const {
return getArgOperand(SizeArg);
}
- unsigned getAlignment() const {
- return cast<ConstantInt>(getArgOperand(AlignArg))->getZExtValue();
+ Align getAlignment() const {
+ return cast<ConstantInt>(getArgOperand(AlignArg))->getAlignValue();
}
// Methods to support type inquiry through isa, cast, and dyn_cast:
diff --git a/llvm/lib/Transforms/Coroutines/CoroInternal.h b/llvm/lib/Transforms/Coroutines/CoroInternal.h
index 7eb35400c0d5..bd76e93c9124 100644
--- a/llvm/lib/Transforms/Coroutines/CoroInternal.h
+++ b/llvm/lib/Transforms/Coroutines/CoroInternal.h
@@ -96,17 +96,22 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
struct SwitchFieldIndex {
enum {
Resume,
- Destroy,
- Promise,
- Index,
- /// The index of the first spill field.
- FirstSpill
+ Destroy
+
+ // The promise field is always at a fixed offset from the start of
+ // the frame given its type, but its index isn't a constant for all
+ // possible frames.
+
+ // The switch-index field isn't at a fixed offset or index, either;
+ // we just work it in where it fits best.
};
};
coro::ABI ABI;
StructType *FrameTy;
+ Align FrameAlign;
+ uint64_t FrameSize;
Instruction *FramePtr;
BasicBlock *AllocaSpillBlock;
@@ -114,6 +119,8 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
SwitchInst *ResumeSwitch;
AllocaInst *PromiseAlloca;
BasicBlock *ResumeEntryBlock;
+ unsigned IndexField;
+ unsigned PromiseField;
bool HasFinalSuspend;
};
@@ -141,10 +148,15 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
return cast<AnyCoroIdRetconInst>(CoroBegin->getId());
}
+ unsigned getSwitchIndexField() const {
+ assert(ABI == coro::ABI::Switch);
+ assert(FrameTy && "frame type not assigned");
+ return SwitchLowering.IndexField;
+ }
IntegerType *getIndexType() const {
assert(ABI == coro::ABI::Switch);
assert(FrameTy && "frame type not assigned");
- return cast<IntegerType>(FrameTy->getElementType(SwitchFieldIndex::Index));
+ return cast<IntegerType>(FrameTy->getElementType(getSwitchIndexField()));
}
ConstantInt *getIndex(uint64_t Value) const {
return ConstantInt::get(getIndexType(), Value);
@@ -203,23 +215,17 @@ struct LLVM_LIBRARY_VISIBILITY Shape {
llvm_unreachable("Unknown coro::ABI enum");
}
- unsigned getFirstSpillFieldIndex() const {
- switch (ABI) {
- case coro::ABI::Switch:
- return SwitchFieldIndex::FirstSpill;
-
- case coro::ABI::Retcon:
- case coro::ABI::RetconOnce:
- return 0;
- }
- llvm_unreachable("Unknown coro::ABI enum");
- }
-
AllocaInst *getPromiseAlloca() const {
if (ABI == coro::ABI::Switch)
return SwitchLowering.PromiseAlloca;
return nullptr;
}
+ unsigned getPromiseField() const {
+ assert(ABI == coro::ABI::Switch);
+ assert(FrameTy && "frame type not assigned");
+ assert(SwitchLowering.PromiseAlloca && "no promise alloca");
+ return SwitchLowering.PromiseField;
+ }
/// Allocate memory according to the rules of the active lowering.
///
diff --git a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
index 66cb3e74e53e..9c4392e7999b 100644
--- a/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
+++ b/llvm/lib/Transforms/Coroutines/CoroSplit.cpp
@@ -18,6 +18,7 @@
// coroutine.
//===----------------------------------------------------------------------===//
+#include "llvm/Transforms/Coroutines/CoroSplit.h"
#include "CoroInstr.h"
#include "CoroInternal.h"
#include "llvm/ADT/DenseMap.h"
@@ -31,7 +32,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -59,6 +59,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
@@ -74,7 +75,7 @@ using namespace llvm;
namespace {
-/// A little helper class for building
+/// A little helper class for building
class CoroCloner {
public:
enum class Kind {
@@ -283,7 +284,7 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) {
auto *FramePtr = Shape.FramePtr;
auto *FrameTy = Shape.FrameTy;
auto *GepIndex = Builder.CreateStructGEP(
- FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr");
+ FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr");
auto *Index = Builder.CreateLoad(Shape.getIndexType(), GepIndex, "index");
auto *Switch =
Builder.CreateSwitch(Index, UnreachBB, Shape.CoroSuspends.size());
@@ -309,7 +310,7 @@ static void createResumeEntryBlock(Function &F, coro::Shape &Shape) {
Builder.CreateStore(NullPtr, GepIndex);
} else {
auto *GepIndex = Builder.CreateStructGEP(
- FrameTy, FramePtr, coro::Shape::SwitchFieldIndex::Index, "index.addr");
+ FrameTy, FramePtr, Shape.getSwitchIndexField(), "index.addr");
Builder.CreateStore(IndexVal, GepIndex);
}
Save->replaceAllUsesWith(ConstantTokenNone::get(C));
@@ -562,11 +563,12 @@ void CoroCloner::replaceEntryBlock() {
// In the original function, the AllocaSpillBlock is a block immediately
// following the allocation of the frame object which defines GEPs for
// all the allocas that have been moved into the frame, and it ends by
- // branching to the original beginning of the coroutine. Make this
+ // branching to the original beginning of the coroutine. Make this
// the entry block of the cloned function.
auto *Entry = cast<BasicBlock>(VMap[Shape.AllocaSpillBlock]);
+ auto *OldEntry = &NewF->getEntryBlock();
Entry->setName("entry" + Suffix);
- Entry->moveBefore(&NewF->getEntryBlock());
+ Entry->moveBefore(OldEntry);
Entry->getTerminator()->eraseFromParent();
// Clear all predecessors of the new entry block. There should be
@@ -579,8 +581,14 @@ void CoroCloner::replaceEntryBlock() {
Builder.CreateUnreachable();
BranchToEntry->eraseFromParent();
- // TODO: move any allocas into Entry that weren't moved into the frame.
- // (Currently we move all allocas into the frame.)
+ // Move any allocas into Entry that weren't moved into the frame.
+ for (auto IT = OldEntry->begin(), End = OldEntry->end(); IT != End;) {
+ Instruction &I = *IT++;
+ if (!isa<AllocaInst>(&I) || I.use_empty())
+ continue;
+
+ I.moveBefore(*Entry, Entry->getFirstInsertionPt());
+ }
// Branch from the entry to the appropriate place.
Builder.SetInsertPoint(Entry);
@@ -630,12 +638,23 @@ Value *CoroCloner::deriveNewFramePointer() {
// Otherwise, load the real frame from the opaque storage.
auto FramePtrPtr =
Builder.CreateBitCast(NewStorage, FramePtrTy->getPointerTo());
- return Builder.CreateLoad(FramePtrPtr);
+ return Builder.CreateLoad(FramePtrTy, FramePtrPtr);
}
}
llvm_unreachable("bad ABI");
}
+static void addFramePointerAttrs(AttributeList &Attrs, LLVMContext &Context,
+ unsigned ParamIndex,
+ uint64_t Size, Align Alignment) {
+ AttrBuilder ParamAttrs;
+ ParamAttrs.addAttribute(Attribute::NonNull);
+ ParamAttrs.addAttribute(Attribute::NoAlias);
+ ParamAttrs.addAlignmentAttr(Alignment);
+ ParamAttrs.addDereferenceableAttr(Size);
+ Attrs = Attrs.addParamAttributes(Context, ParamIndex, ParamAttrs);
+}
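A minimal usage sketch for the helper above (the `ResumeFn` function and the size/alignment values are invented for illustration):

    // Mark parameter 0 of a hypothetical resume function as a pointer to a
    // 64-byte, 8-byte-aligned frame: nonnull, noalias, align, dereferenceable.
    AttributeList Attrs = ResumeFn->getAttributes();
    addFramePointerAttrs(Attrs, ResumeFn->getContext(), /*ParamIndex=*/0,
                         /*Size=*/64, Align(8));
    ResumeFn->setAttributes(Attrs);
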
+
/// Clone the body of the original function into a resume function of
/// some sort.
void CoroCloner::create() {
@@ -684,6 +703,9 @@ void CoroCloner::create() {
// original function. This should include optimization settings and so on.
NewAttrs = NewAttrs.addAttributes(Context, AttributeList::FunctionIndex,
OrigAttrs.getFnAttributes());
+
+ addFramePointerAttrs(NewAttrs, Context, 0,
+ Shape.FrameSize, Shape.FrameAlign);
break;
case coro::ABI::Retcon:
@@ -691,13 +713,13 @@ void CoroCloner::create() {
// If we have a continuation prototype, just use its attributes,
// full-stop.
NewAttrs = Shape.RetconLowering.ResumePrototype->getAttributes();
+
+ addFramePointerAttrs(NewAttrs, Context, 0,
+ Shape.getRetconCoroId()->getStorageSize(),
+ Shape.getRetconCoroId()->getStorageAlignment());
break;
}
- // Make the frame parameter nonnull and noalias.
- NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NonNull);
- NewAttrs = NewAttrs.addParamAttribute(Context, 0, Attribute::NoAlias);
-
switch (Shape.ABI) {
// In these ABIs, the cloned functions always return 'void', and the
// existing return sites are meaningless. Note that for unique
@@ -872,7 +894,8 @@ static void postSplitCleanup(Function &F) {
// For now, we do a mandatory verification step because we don't
// entirely trust this pass. Note that we don't want to add a verifier
// pass to FPM below because it will also verify all the global data.
- verifyFunction(F);
+ if (verifyFunction(F, &errs()))
+ report_fatal_error("Broken function");
legacy::FunctionPassManager FPM(F.getParent());
@@ -911,17 +934,14 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
BasicBlock *UnconditionalSucc = nullptr;
Instruction *I = InitialInst;
- while (I->isTerminator()) {
+ while (I->isTerminator() ||
+ (isa<CmpInst>(I) && I->getNextNode()->isTerminator())) {
if (isa<ReturnInst>(I)) {
if (I != InitialInst) {
// If InitialInst is an unconditional branch,
// remove PHI values that come from basic block of InitialInst
if (UnconditionalSucc)
- for (PHINode &PN : UnconditionalSucc->phis()) {
- int idx = PN.getBasicBlockIndex(InitialInst->getParent());
- if (idx != -1)
- PN.removeIncomingValue(idx);
- }
+ UnconditionalSucc->removePredecessor(InitialInst->getParent(), true);
ReplaceInstWithInst(InitialInst, I->clone());
}
return true;
@@ -935,6 +955,29 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
I = BB->getFirstNonPHIOrDbgOrLifetime();
continue;
}
+ } else if (auto *CondCmp = dyn_cast<CmpInst>(I)) {
+ auto *BR = dyn_cast<BranchInst>(I->getNextNode());
+ if (BR && BR->isConditional() && CondCmp == BR->getCondition()) {
+ // If the suspend switch was reduced to a single case, then
+ // llvm::ConstantFoldTerminator has simplified it to a CmpInst feeding a
+ // conditional branch, and the comparison looks like:
+ //   %cond = icmp eq i8 %V, constant.
+ ConstantInt *CondConst = dyn_cast<ConstantInt>(CondCmp->getOperand(1));
+ if (CondConst && CondCmp->getPredicate() == CmpInst::ICMP_EQ) {
+ Value *V = CondCmp->getOperand(0);
+ auto it = ResolvedValues.find(V);
+ if (it != ResolvedValues.end())
+ V = it->second;
+
+ if (ConstantInt *Cond0 = dyn_cast<ConstantInt>(V)) {
+ BasicBlock *BB = Cond0->equalsInt(CondConst->getZExtValue())
+ ? BR->getSuccessor(0)
+ : BR->getSuccessor(1);
+ scanPHIsAndUpdateValueMap(I, BB, ResolvedValues);
+ I = BB->getFirstNonPHIOrDbgOrLifetime();
+ continue;
+ }
+ }
+ }
} else if (auto *SI = dyn_cast<SwitchInst>(I)) {
Value *V = SI->getCondition();
auto it = ResolvedValues.find(V);
@@ -952,6 +995,37 @@ static bool simplifyTerminatorLeadingToRet(Instruction *InitialInst) {
return false;
}
+// Check whether CI obeys the rules of musttail attribute.
+static bool shouldBeMustTail(const CallInst &CI, const Function &F) {
+ if (CI.isInlineAsm())
+ return false;
+
+ // Match prototypes and calling conventions of resume function.
+ FunctionType *CalleeTy = CI.getFunctionType();
+ if (!CalleeTy->getReturnType()->isVoidTy() || (CalleeTy->getNumParams() != 1))
+ return false;
+
+ Type *CalleeParmTy = CalleeTy->getParamType(0);
+ if (!CalleeParmTy->isPointerTy() ||
+ (CalleeParmTy->getPointerAddressSpace() != 0))
+ return false;
+
+ if (CI.getCallingConv() != F.getCallingConv())
+ return false;
+
+ // CI should not have any ABI-impacting parameter attributes.
+ static const Attribute::AttrKind ABIAttrs[] = {
+ Attribute::StructRet, Attribute::ByVal, Attribute::InAlloca,
+ Attribute::Preallocated, Attribute::InReg, Attribute::Returned,
+ Attribute::SwiftSelf, Attribute::SwiftError};
+ AttributeList Attrs = CI.getAttributes();
+ for (auto AK : ABIAttrs)
+ if (Attrs.hasParamAttribute(0, AK))
+ return false;
+
+ return true;
+}
+
// Add musttail to any resume instructions that is immediately followed by a
// suspend (i.e. ret). We do this even in -O0 to support guaranteed tail call
// for symmetrical coroutine control transfer (C++ Coroutines TS extension).
@@ -964,11 +1038,8 @@ static void addMustTailToCoroResumes(Function &F) {
SmallVector<CallInst *, 4> Resumes;
for (auto &I : instructions(F))
if (auto *Call = dyn_cast<CallInst>(&I))
- if (auto *CalledValue = Call->getCalledValue())
- // CoroEarly pass replaced coro resumes with indirect calls to an
- // address return by CoroSubFnInst intrinsic. See if it is one of those.
- if (isa<CoroSubFnInst>(CalledValue->stripPointerCasts()))
- Resumes.push_back(Call);
+ if (shouldBeMustTail(*Call, F))
+ Resumes.push_back(Call);
// Set musttail on those that are followed by a ret instruction.
for (CallInst *Call : Resumes)
@@ -993,8 +1064,8 @@ static void handleNoSuspendCoroutine(coro::Shape &Shape) {
coro::replaceCoroFree(SwitchId, /*Elide=*/AllocInst != nullptr);
if (AllocInst) {
IRBuilder<> Builder(AllocInst);
- // FIXME: Need to handle overaligned members.
auto *Frame = Builder.CreateAlloca(Shape.FrameTy);
+ Frame->setAlignment(Shape.FrameAlign);
auto *VFrame = Builder.CreateBitCast(Frame, Builder.getInt8PtrTy());
AllocInst->replaceAllUsesWith(Builder.getFalse());
AllocInst->eraseFromParent();
@@ -1023,7 +1094,7 @@ static bool hasCallsInBlockBetween(Instruction *From, Instruction *To) {
if (isa<IntrinsicInst>(I))
continue;
- if (CallSite(I))
+ if (isa<CallBase>(I))
return true;
}
return false;
@@ -1093,13 +1164,11 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
Prev = Pred->getTerminator();
}
- CallSite CS{Prev};
- if (!CS)
+ CallBase *CB = dyn_cast<CallBase>(Prev);
+ if (!CB)
return false;
- auto *CallInstr = CS.getInstruction();
-
- auto *Callee = CS.getCalledValue()->stripPointerCasts();
+ auto *Callee = CB->getCalledOperand()->stripPointerCasts();
// See if the callsite is for resumption or destruction of the coroutine.
auto *SubFn = dyn_cast<CoroSubFnInst>(Callee);
@@ -1114,7 +1183,7 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
// calls in between Save and CallInstr. They can potentially resume the
// coroutine rendering this optimization unsafe.
auto *Save = Suspend->getCoroSave();
- if (hasCallsBetween(Save, CallInstr))
+ if (hasCallsBetween(Save, CB))
return false;
// Replace llvm.coro.suspend with the value that results in resumption over
@@ -1124,13 +1193,13 @@ static bool simplifySuspendPoint(CoroSuspendInst *Suspend,
Save->eraseFromParent();
// No longer need a call to coro.resume or coro.destroy.
- if (auto *Invoke = dyn_cast<InvokeInst>(CallInstr)) {
+ if (auto *Invoke = dyn_cast<InvokeInst>(CB)) {
BranchInst::Create(Invoke->getNormalDest(), Invoke);
}
- // Grab the CalledValue from CS before erasing the CallInstr.
- auto *CalledValue = CS.getCalledValue();
- CallInstr->eraseFromParent();
+ // Grab the CalledValue from CB before erasing the CallInstr.
+ auto *CalledValue = CB->getCalledOperand();
+ CB->eraseFromParent();
// If no more users remove it. Usually it is a bitcast of SubFn.
if (CalledValue != SubFn && CalledValue->user_empty())
@@ -1155,7 +1224,10 @@ static void simplifySuspendPoints(coro::Shape &Shape) {
if (N == 0)
return;
while (true) {
- if (simplifySuspendPoint(cast<CoroSuspendInst>(S[I]), Shape.CoroBegin)) {
+ auto SI = cast<CoroSuspendInst>(S[I]);
+ // Leave final.suspend to handleFinalSuspend since it is undefined behavior
+ // to resume a coroutine suspended at the final suspend point.
+ if (!SI->isFinal() && simplifySuspendPoint(SI, Shape.CoroBegin)) {
if (--N == I)
break;
std::swap(S[I], S[N]);
@@ -1225,6 +1297,7 @@ static void splitRetconCoroutine(Function &F, coro::Shape &Shape,
// Allocate. We don't need to update the call graph node because we're
// going to recompute it from scratch after splitting.
+ // FIXME: pass the required alignment
RawFramePtr = Shape.emitAlloc(Builder, Builder.getInt64(Size), nullptr);
RawFramePtr =
Builder.CreateBitCast(RawFramePtr, Shape.CoroBegin->getType());
@@ -1342,19 +1415,8 @@ namespace {
};
}
-static void splitCoroutine(Function &F, coro::Shape &Shape,
- SmallVectorImpl<Function *> &Clones) {
- switch (Shape.ABI) {
- case coro::ABI::Switch:
- return splitSwitchCoroutine(F, Shape, Clones);
- case coro::ABI::Retcon:
- case coro::ABI::RetconOnce:
- return splitRetconCoroutine(F, Shape, Clones);
- }
- llvm_unreachable("bad ABI kind");
-}
-
-static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) {
+static coro::Shape splitCoroutine(Function &F,
+ SmallVectorImpl<Function *> &Clones) {
PrettyStackTraceFunction prettyStackTrace(F);
// The suspend-crossing algorithm in buildCoroutineFrame get tripped
@@ -1363,26 +1425,42 @@ static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) {
coro::Shape Shape(F);
if (!Shape.CoroBegin)
- return;
+ return Shape;
simplifySuspendPoints(Shape);
buildCoroutineFrame(F, Shape);
replaceFrameSize(Shape);
- SmallVector<Function*, 4> Clones;
-
// If there are no suspend points, no split required, just remove
// the allocation and deallocation blocks, they are not needed.
if (Shape.CoroSuspends.empty()) {
handleNoSuspendCoroutine(Shape);
} else {
- splitCoroutine(F, Shape, Clones);
+ switch (Shape.ABI) {
+ case coro::ABI::Switch:
+ splitSwitchCoroutine(F, Shape, Clones);
+ break;
+ case coro::ABI::Retcon:
+ case coro::ABI::RetconOnce:
+ splitRetconCoroutine(F, Shape, Clones);
+ break;
+ }
}
// Replace all the swifterror operations in the original function.
// This invalidates SwiftErrorOps in the Shape.
replaceSwiftErrorOps(F, Shape, nullptr);
+ return Shape;
+}
+
+static void
+updateCallGraphAfterCoroutineSplit(Function &F, const coro::Shape &Shape,
+ const SmallVectorImpl<Function *> &Clones,
+ CallGraph &CG, CallGraphSCC &SCC) {
+ if (!Shape.CoroBegin)
+ return;
+
removeCoroEnds(Shape, &CG);
postSplitCleanup(F);
@@ -1390,6 +1468,44 @@ static void splitCoroutine(Function &F, CallGraph &CG, CallGraphSCC &SCC) {
coro::updateCallGraph(F, Clones, CG, SCC);
}
+static void updateCallGraphAfterCoroutineSplit(
+ LazyCallGraph::Node &N, const coro::Shape &Shape,
+ const SmallVectorImpl<Function *> &Clones, LazyCallGraph::SCC &C,
+ LazyCallGraph &CG, CGSCCAnalysisManager &AM, CGSCCUpdateResult &UR,
+ FunctionAnalysisManager &FAM) {
+ if (!Shape.CoroBegin)
+ return;
+
+ for (llvm::CoroEndInst *End : Shape.CoroEnds) {
+ auto &Context = End->getContext();
+ End->replaceAllUsesWith(ConstantInt::getFalse(Context));
+ End->eraseFromParent();
+ }
+
+ postSplitCleanup(N.getFunction());
+
+ // To insert the newly created coroutine funclets 'f.resume', 'f.destroy', and
+ // 'f.cleanup' into the same SCC as the coroutine 'f' they were outlined from,
+ // we make use of the CallGraphUpdater class, which can modify the internal
+ // state of the LazyCallGraph.
+ for (Function *Clone : Clones)
+ CG.addNewFunctionIntoRefSCC(*Clone, C.getOuterRefSCC());
+
+ // We've inserted instructions into coroutine 'f' that reference the three new
+ // coroutine funclets. We must now update the call graph so that reference
+ // edges between 'f' and its funclets are added to it. LazyCallGraph only
+ // allows CGSCC passes to insert "trivial" reference edges. We've ensured
+// above, by inserting the funclets into the same SCC as the coroutine, that
+ // the edges are trivial.
+ //
+ // N.B.: If we didn't update the call graph here, a CGSCCToFunctionPassAdaptor
+ // later in this CGSCC pass pipeline may be run, triggering a call graph
+ // update of its own. Function passes run by the adaptor are not permitted to
+ // add new edges of any kind to the graph, and the new edges inserted by this
+ // pass would be misattributed to that unrelated function pass.
+ updateCGAndAnalysisManagerForCGSCCPass(CG, C, N, AM, UR, FAM);
+}
+
// When we see the coroutine the first time, we insert an indirect call to a
// devirt trigger function and mark the coroutine that it is now ready for
// split.
@@ -1521,12 +1637,89 @@ static bool replaceAllPrepares(Function *PrepareFn, CallGraph &CG) {
return Changed;
}
-//===----------------------------------------------------------------------===//
-// Top Level Driver
-//===----------------------------------------------------------------------===//
+static bool declaresCoroSplitIntrinsics(const Module &M) {
+ return coro::declaresIntrinsics(
+ M, {"llvm.coro.begin", "llvm.coro.prepare.retcon"});
+}
+
+PreservedAnalyses CoroSplitPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG, CGSCCUpdateResult &UR) {
+ // NB: One invariant of a valid LazyCallGraph::SCC is that it must contain a
+ // non-zero number of nodes, so we assume that here and grab the first
+ // node's function's module.
+ Module &M = *C.begin()->getFunction().getParent();
+ auto &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ if (!declaresCoroSplitIntrinsics(M))
+ return PreservedAnalyses::all();
+
+ // Check for uses of llvm.coro.prepare.retcon.
+ const auto *PrepareFn = M.getFunction("llvm.coro.prepare.retcon");
+ if (PrepareFn && PrepareFn->use_empty())
+ PrepareFn = nullptr;
+
+ // Find coroutines for processing.
+ SmallVector<LazyCallGraph::Node *, 4> Coroutines;
+ for (LazyCallGraph::Node &N : C)
+ if (N.getFunction().hasFnAttribute(CORO_PRESPLIT_ATTR))
+ Coroutines.push_back(&N);
+
+ if (Coroutines.empty() && !PrepareFn)
+ return PreservedAnalyses::all();
+
+ if (Coroutines.empty())
+ llvm_unreachable("new pass manager cannot yet handle "
+ "'llvm.coro.prepare.retcon'");
+
+ // Split all the coroutines.
+ for (LazyCallGraph::Node *N : Coroutines) {
+ Function &F = N->getFunction();
+ Attribute Attr = F.getFnAttribute(CORO_PRESPLIT_ATTR);
+ StringRef Value = Attr.getValueAsString();
+ LLVM_DEBUG(dbgs() << "CoroSplit: Processing coroutine '" << F.getName()
+ << "' state: " << Value << "\n");
+ if (Value == UNPREPARED_FOR_SPLIT) {
+ // Enqueue a second iteration of the CGSCC pipeline.
+ // N.B.:
+ // The CoroSplitLegacy pass "triggers" a restart of the CGSCC pass
+ // pipeline by inserting an indirect function call that the
+ // CoroElideLegacy pass then replaces with a direct function call. The
+ // legacy CGSCC pipeline's implicit behavior was as if wrapped in the new
+ // pass manager abstraction DevirtSCCRepeatedPass.
+ //
+ // This pass does not need to "trigger" another run of the pipeline.
+ // Instead, it simply enqueues the same RefSCC onto the pipeline's
+ // worklist.
+ UR.CWorklist.insert(&C);
+ F.addFnAttr(CORO_PRESPLIT_ATTR, PREPARED_FOR_SPLIT);
+ continue;
+ }
+ F.removeFnAttr(CORO_PRESPLIT_ATTR);
+
+ SmallVector<Function *, 4> Clones;
+ const coro::Shape Shape = splitCoroutine(F, Clones);
+ updateCallGraphAfterCoroutineSplit(*N, Shape, Clones, C, CG, AM, UR, FAM);
+ }
+
+ if (PrepareFn)
+ llvm_unreachable("new pass manager cannot yet handle "
+ "'llvm.coro.prepare.retcon'");
+
+ return PreservedAnalyses::none();
+}
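A minimal sketch (not part of this patch) of running the new-pass-manager CoroSplitPass over a module; the analysis registration follows the usual PassBuilder boilerplate and error handling is omitted:

    #include "llvm/Analysis/CGSCCPassManager.h"
    #include "llvm/Passes/PassBuilder.h"
    #include "llvm/Transforms/Coroutines/CoroSplit.h"

    static void runCoroSplitOnly(llvm::Module &M) {
      llvm::PassBuilder PB;
      llvm::LoopAnalysisManager LAM;
      llvm::FunctionAnalysisManager FAM;
      llvm::CGSCCAnalysisManager CGAM;
      llvm::ModuleAnalysisManager MAM;
      PB.registerModuleAnalyses(MAM);
      PB.registerCGSCCAnalyses(CGAM);
      PB.registerFunctionAnalyses(FAM);
      PB.registerLoopAnalyses(LAM);
      PB.crossRegisterProxies(LAM, FAM, CGAM, MAM);

      // CoroSplitPass is a CGSCC pass, so wrap it in the module-to-CGSCC adaptor.
      llvm::CGSCCPassManager CGPM;
      CGPM.addPass(llvm::CoroSplitPass());
      llvm::ModulePassManager MPM;
      MPM.addPass(llvm::createModuleToPostOrderCGSCCPassAdaptor(std::move(CGPM)));
      MPM.run(M, MAM);
    }
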
namespace {
+// We present a coroutine to LLVM as an ordinary function with suspension
+// points marked up with intrinsics. We let the optimizer party on the coroutine
+// as a single function for as long as possible. Shortly before the coroutine is
+// eligible to be inlined into its callers, we split up the coroutine into parts
+// corresponding to initial, resume and destroy invocations of the coroutine,
+// add them to the current SCC and restart the IPO pipeline to optimize the
+// coroutine subfunctions we extracted before proceeding to the caller of the
+// coroutine.
struct CoroSplitLegacy : public CallGraphSCCPass {
static char ID; // Pass identification, replacement for typeid
@@ -1539,9 +1732,7 @@ struct CoroSplitLegacy : public CallGraphSCCPass {
// A coroutine is identified by the presence of coro.begin intrinsic, if
// we don't have any, this pass has nothing to do.
bool doInitialization(CallGraph &CG) override {
- Run = coro::declaresIntrinsics(CG.getModule(),
- {"llvm.coro.begin",
- "llvm.coro.prepare.retcon"});
+ Run = declaresCoroSplitIntrinsics(CG.getModule());
return CallGraphSCCPass::doInitialization(CG);
}
@@ -1583,7 +1774,10 @@ struct CoroSplitLegacy : public CallGraphSCCPass {
continue;
}
F->removeFnAttr(CORO_PRESPLIT_ATTR);
- splitCoroutine(*F, CG, SCC);
+
+ SmallVector<Function *, 4> Clones;
+ const coro::Shape Shape = splitCoroutine(*F, Clones);
+ updateCallGraphAfterCoroutineSplit(*F, Shape, Clones, CG, SCC);
}
if (PrepareFn)
diff --git a/llvm/lib/Transforms/Coroutines/Coroutines.cpp b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
index 02d11af3303f..87c3a8b0d0cf 100644
--- a/llvm/lib/Transforms/Coroutines/Coroutines.cpp
+++ b/llvm/lib/Transforms/Coroutines/Coroutines.cpp
@@ -19,7 +19,6 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
@@ -649,3 +648,9 @@ void LLVMAddCoroElidePass(LLVMPassManagerRef PM) {
void LLVMAddCoroCleanupPass(LLVMPassManagerRef PM) {
unwrap(PM)->add(createCoroCleanupLegacyPass());
}
+
+void
+LLVMPassManagerBuilderAddCoroutinePassesToExtensionPoints(LLVMPassManagerBuilderRef PMB) {
+ PassManagerBuilder *Builder = unwrap(PMB);
+ addCoroutinePassesToExtensionPoints(*Builder);
+}
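For reference, a hedged sketch of how the new C API entry point might be exercised alongside the existing PassManagerBuilder bindings (assuming the declaration is exported from the matching llvm-c header):

    /* Create a builder, request the coroutine extension-point passes, then
       populate a module pass manager as usual. Error handling omitted. */
    LLVMPassManagerBuilderRef PMB = LLVMPassManagerBuilderCreate();
    LLVMPassManagerBuilderSetOptLevel(PMB, 2);
    LLVMPassManagerBuilderAddCoroutinePassesToExtensionPoints(PMB);
    LLVMPassManagerRef MPM = LLVMCreatePassManager();
    LLVMPassManagerBuilderPopulateModulePassManager(PMB, MPM);
    /* ... run MPM over a module, then clean up ... */
    LLVMDisposePassManager(MPM);
    LLVMPassManagerBuilderDispose(PMB);
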
diff --git a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
index 06d1763353f4..53f9512f86f3 100644
--- a/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
+++ b/llvm/lib/Transforms/IPO/AlwaysInliner.cpp
@@ -16,7 +16,6 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
@@ -37,30 +36,30 @@ PreservedAnalyses AlwaysInlinerPass::run(Module &M,
// Add inline assumptions during code generation.
FunctionAnalysisManager &FAM =
MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&](Function &F) -> AssumptionCache & {
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};
- InlineFunctionInfo IFI(/*cg=*/nullptr, &GetAssumptionCache);
+ InlineFunctionInfo IFI(/*cg=*/nullptr, GetAssumptionCache);
- SmallSetVector<CallSite, 16> Calls;
+ SmallSetVector<CallBase *, 16> Calls;
bool Changed = false;
SmallVector<Function *, 16> InlinedFunctions;
for (Function &F : M)
if (!F.isDeclaration() && F.hasFnAttribute(Attribute::AlwaysInline) &&
- isInlineViable(F)) {
+ isInlineViable(F).isSuccess()) {
Calls.clear();
for (User *U : F.users())
- if (auto CS = CallSite(U))
- if (CS.getCalledFunction() == &F)
- Calls.insert(CS);
+ if (auto *CB = dyn_cast<CallBase>(U))
+ if (CB->getCalledFunction() == &F)
+ Calls.insert(CB);
- for (CallSite CS : Calls)
+ for (CallBase *CB : Calls)
// FIXME: We really shouldn't be able to fail to inline at this point!
// We should do something to log or check the inline failures here.
Changed |=
- InlineFunction(CS, IFI, /*CalleeAAR=*/nullptr, InsertLifetime);
+ InlineFunction(*CB, IFI, /*CalleeAAR=*/nullptr, InsertLifetime)
+ .isSuccess();
// Remember to try and delete this function afterward. This both avoids
// re-walking the rest of the module and avoids dealing with any iterator
@@ -116,7 +115,7 @@ public:
static char ID; // Pass identification, replacement for typeid
- InlineCost getInlineCost(CallSite CS) override;
+ InlineCost getInlineCost(CallBase &CB) override;
using llvm::Pass::doFinalization;
bool doFinalization(CallGraph &CG) override {
@@ -151,8 +150,8 @@ Pass *llvm::createAlwaysInlinerLegacyPass(bool InsertLifetime) {
/// computed here, but as we only expect to do this for relatively few and
/// small functions which have the explicit attribute to force inlining, it is
/// likely not worth it in practice.
-InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallSite CS) {
- Function *Callee = CS.getCalledFunction();
+InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallBase &CB) {
+ Function *Callee = CB.getCalledFunction();
// Only inline direct calls to functions with always-inline attributes
// that are viable for inlining.
@@ -163,12 +162,12 @@ InlineCost AlwaysInlinerLegacyPass::getInlineCost(CallSite CS) {
if (Callee->isDeclaration())
return InlineCost::getNever("no definition");
- if (!CS.hasFnAttr(Attribute::AlwaysInline))
+ if (!CB.hasFnAttr(Attribute::AlwaysInline))
return InlineCost::getNever("no alwaysinline attribute");
auto IsViable = isInlineViable(*Callee);
- if (!IsViable)
- return InlineCost::getNever(IsViable.message);
+ if (!IsViable.isSuccess())
+ return InlineCost::getNever(IsViable.getFailureReason());
return InlineCost::getAlways("always inliner");
}
diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
index cdf8a2eb598e..ad0d7eb51507 100644
--- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
+++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp
@@ -36,7 +36,6 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/ADT/StringExtras.h"
#include "llvm/ADT/Twine.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -53,7 +52,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -74,6 +72,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/FormatVariadic.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/IPO.h"
#include <algorithm>
@@ -105,7 +104,7 @@ using IndicesVector = std::vector<uint64_t>;
static Function *
doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
SmallPtrSetImpl<Argument *> &ByValArgsToTransform,
- Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
ReplaceCallSite) {
// Start by computing a new prototype for the function, which is the same as
// the old function, but has modified arguments.
@@ -197,7 +196,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
for (const auto &ArgIndex : ArgIndices) {
// not allowed to dereference ->begin() if size() is 0
Params.push_back(GetElementPtrInst::getIndexedType(
- cast<PointerType>(I->getType()->getScalarType())->getElementType(),
+ cast<PointerType>(I->getType())->getElementType(),
ArgIndex.second));
ArgAttrVec.push_back(AttributeSet());
assert(Params.back());
@@ -241,15 +240,14 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
//
SmallVector<Value *, 16> Args;
while (!F->use_empty()) {
- CallSite CS(F->user_back());
- assert(CS.getCalledFunction() == F);
- Instruction *Call = CS.getInstruction();
- const AttributeList &CallPAL = CS.getAttributes();
- IRBuilder<NoFolder> IRB(Call);
+ CallBase &CB = cast<CallBase>(*F->user_back());
+ assert(CB.getCalledFunction() == F);
+ const AttributeList &CallPAL = CB.getAttributes();
+ IRBuilder<NoFolder> IRB(&CB);
// Loop over the operands, inserting GEP and loads in the caller as
// appropriate.
- CallSite::arg_iterator AI = CS.arg_begin();
+ auto AI = CB.arg_begin();
ArgNo = 0;
for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
++I, ++AI, ++ArgNo)
@@ -295,7 +293,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
if (auto *ElPTy = dyn_cast<PointerType>(ElTy))
ElTy = ElPTy->getElementType();
else
- ElTy = cast<CompositeType>(ElTy)->getTypeAtIndex(II);
+ ElTy = GetElementPtrInst::getTypeAtIndex(ElTy, II);
}
// And create a GEP to extract those indices.
V = IRB.CreateGEP(ArgIndex.first, V, Ops, V->getName() + ".idx");
@@ -305,7 +303,7 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
// of the previous load.
LoadInst *newLoad =
IRB.CreateLoad(OrigLoad->getType(), V, V->getName() + ".val");
- newLoad->setAlignment(MaybeAlign(OrigLoad->getAlignment()));
+ newLoad->setAlignment(OrigLoad->getAlign());
// Transfer the AA info too.
AAMDNodes AAInfo;
OrigLoad->getAAMetadata(AAInfo);
@@ -317,46 +315,43 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
}
// Push any varargs arguments on the list.
- for (; AI != CS.arg_end(); ++AI, ++ArgNo) {
+ for (; AI != CB.arg_end(); ++AI, ++ArgNo) {
Args.push_back(*AI);
ArgAttrVec.push_back(CallPAL.getParamAttributes(ArgNo));
}
SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ CB.getOperandBundlesAsDefs(OpBundles);
- CallSite NewCS;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
+ CallBase *NewCS = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call);
+ Args, OpBundles, "", &CB);
} else {
- auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", Call);
- NewCall->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ auto *NewCall = CallInst::Create(NF, Args, OpBundles, "", &CB);
+ NewCall->setTailCallKind(cast<CallInst>(&CB)->getTailCallKind());
NewCS = NewCall;
}
- NewCS.setCallingConv(CS.getCallingConv());
- NewCS.setAttributes(
+ NewCS->setCallingConv(CB.getCallingConv());
+ NewCS->setAttributes(
AttributeList::get(F->getContext(), CallPAL.getFnAttributes(),
CallPAL.getRetAttributes(), ArgAttrVec));
- NewCS->setDebugLoc(Call->getDebugLoc());
- uint64_t W;
- if (Call->extractProfTotalWeight(W))
- NewCS->setProfWeight(W);
+ NewCS->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
Args.clear();
ArgAttrVec.clear();
// Update the callgraph to know that the callsite has been transformed.
if (ReplaceCallSite)
- (*ReplaceCallSite)(CS, NewCS);
+ (*ReplaceCallSite)(CB, *NewCS);
- if (!Call->use_empty()) {
- Call->replaceAllUsesWith(NewCS.getInstruction());
- NewCS->takeName(Call);
+ if (!CB.use_empty()) {
+ CB.replaceAllUsesWith(NewCS);
+ NewCS->takeName(&CB);
}
// Finally, remove the old call from the program, reducing the use-count of
// F.
- Call->eraseFromParent();
+ CB.eraseFromParent();
}
const DataLayout &DL = F->getParent()->getDataLayout();
@@ -387,9 +382,10 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
// Just add all the struct element types.
Type *AgTy = cast<PointerType>(I->getType())->getElementType();
- Value *TheAlloca =
- new AllocaInst(AgTy, DL.getAllocaAddrSpace(), nullptr,
- MaybeAlign(I->getParamAlignment()), "", InsertPt);
+ Value *TheAlloca = new AllocaInst(
+ AgTy, DL.getAllocaAddrSpace(), nullptr,
+ I->getParamAlign().getValueOr(DL.getPrefTypeAlign(AgTy)), "",
+ InsertPt);
StructType *STy = cast<StructType>(AgTy);
Value *Idxs[2] = {ConstantInt::get(Type::getInt32Ty(F->getContext()), 0),
nullptr};
@@ -453,12 +449,8 @@ doPromotion(Function *F, SmallPtrSetImpl<Argument *> &ArgsToPromote,
assert(It != ArgIndices.end() && "GEP not handled??");
}
- std::string NewName = I->getName();
- for (unsigned i = 0, e = Operands.size(); i != e; ++i) {
- NewName += "." + utostr(Operands[i]);
- }
- NewName += ".val";
- TheArg->setName(NewName);
+ TheArg->setName(formatv("{0}.{1:$[.]}.val", I->getName(),
+ make_range(Operands.begin(), Operands.end())));
LLVM_DEBUG(dbgs() << "*** Promoted agg argument '" << TheArg->getName()
<< "' of function '" << NF->getName() << "'\n");
@@ -492,10 +484,9 @@ static bool allCallersPassValidPointerForArgument(Argument *Arg, Type *Ty) {
// Look at all call sites of the function. At this point we know we only have
// direct callees.
for (User *U : Callee->users()) {
- CallSite CS(U);
- assert(CS && "Should only have direct calls!");
+ CallBase &CB = cast<CallBase>(*U);
- if (!isDereferenceablePointer(CS.getArgument(ArgNo), Ty, DL))
+ if (!isDereferenceablePointer(CB.getArgOperand(ArgNo), Ty, DL))
return false;
}
return true;
@@ -774,8 +765,7 @@ static bool isSafeToPromoteArgument(Argument *Arg, Type *ByValTy, AAResults &AAR
return true;
}
-/// Checks if a type could have padding bytes.
-static bool isDenselyPacked(Type *type, const DataLayout &DL) {
+bool ArgumentPromotionPass::isDenselyPacked(Type *type, const DataLayout &DL) {
// There is no size information, so be conservative.
if (!type->isSized())
return false;
@@ -785,13 +775,18 @@ static bool isDenselyPacked(Type *type, const DataLayout &DL) {
if (DL.getTypeSizeInBits(type) != DL.getTypeAllocSizeInBits(type))
return false;
- if (!isa<CompositeType>(type))
- return true;
+ // FIXME: This isn't the right way to check for padding in vectors with
+ // non-byte-size elements.
+ if (VectorType *seqTy = dyn_cast<VectorType>(type))
+ return isDenselyPacked(seqTy->getElementType(), DL);
- // For homogenous sequential types, check for padding within members.
- if (SequentialType *seqTy = dyn_cast<SequentialType>(type))
+ // For array types, check for padding within members.
+ if (ArrayType *seqTy = dyn_cast<ArrayType>(type))
return isDenselyPacked(seqTy->getElementType(), DL);
+ if (!isa<StructType>(type))
+ return true;
+
// Check for padding within and between elements of a struct.
StructType *StructTy = cast<StructType>(type);
const StructLayout *Layout = DL.getStructLayout(StructTy);
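// Illustrative note (not part of this patch), assuming a typical 64-bit
// DataLayout:
//   i1        -> 1 bit of data in an 8-bit alloc: rejected by the size check
//                above, so the composite cases are never reached.
//   [4 x i16] -> new ArrayType case: recurses into i16, which is densely packed.
//   <4 x i32> -> new VectorType case: recurses into i32 (subject to the FIXME
//                about non-byte-size vector elements).
//   {i8, i32} -> falls through to the struct walk below, which finds the 24
//                bits of padding before the i32 member and reports the type as
//                not densely packed.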
@@ -844,14 +839,16 @@ static bool canPaddingBeAccessed(Argument *arg) {
return false;
}
-static bool areFunctionArgsABICompatible(
+bool ArgumentPromotionPass::areFunctionArgsABICompatible(
const Function &F, const TargetTransformInfo &TTI,
SmallPtrSetImpl<Argument *> &ArgsToPromote,
SmallPtrSetImpl<Argument *> &ByValArgsToTransform) {
for (const Use &U : F.uses()) {
- CallSite CS(U.getUser());
- const Function *Caller = CS.getCaller();
- const Function *Callee = CS.getCalledFunction();
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB)
+ return false;
+ const Function *Caller = CB->getCaller();
+ const Function *Callee = CB->getCalledFunction();
if (!TTI.areFunctionArgsABICompatible(Caller, Callee, ArgsToPromote) ||
!TTI.areFunctionArgsABICompatible(Caller, Callee, ByValArgsToTransform))
return false;
@@ -866,7 +863,7 @@ static bool areFunctionArgsABICompatible(
static Function *
promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
unsigned MaxElements,
- Optional<function_ref<void(CallSite OldCS, CallSite NewCS)>>
+ Optional<function_ref<void(CallBase &OldCS, CallBase &NewCS)>>
ReplaceCallSite,
const TargetTransformInfo &TTI) {
// Don't perform argument promotion for naked functions; otherwise we can end
@@ -905,16 +902,16 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
// is self-recursive and check that target features are compatible.
bool isSelfRecursive = false;
for (Use &U : F->uses()) {
- CallSite CS(U.getUser());
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
// Must be a direct call.
- if (CS.getInstruction() == nullptr || !CS.isCallee(&U))
+ if (CB == nullptr || !CB->isCallee(&U))
return nullptr;
// Can't change signature of musttail callee
- if (CS.isMustTailCall())
+ if (CB->isMustTailCall())
return nullptr;
- if (CS.getInstruction()->getParent()->getParent() == F)
+ if (CB->getParent()->getParent() == F)
isSelfRecursive = true;
}
@@ -942,18 +939,18 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
F->removeParamAttr(ArgNo, Attribute::StructRet);
F->addParamAttr(ArgNo, Attribute::NoAlias);
for (Use &U : F->uses()) {
- CallSite CS(U.getUser());
- CS.removeParamAttr(ArgNo, Attribute::StructRet);
- CS.addParamAttr(ArgNo, Attribute::NoAlias);
+ CallBase &CB = cast<CallBase>(*U.getUser());
+ CB.removeParamAttr(ArgNo, Attribute::StructRet);
+ CB.addParamAttr(ArgNo, Attribute::NoAlias);
}
}
// If this is a byval argument, and if the aggregate type is small, just
// pass the elements, which is always safe, if the passed value is densely
// packed or if we can prove the padding bytes are never accessed.
- bool isSafeToPromote =
- PtrArg->hasByValAttr() &&
- (isDenselyPacked(AgTy, DL) || !canPaddingBeAccessed(PtrArg));
+ bool isSafeToPromote = PtrArg->hasByValAttr() &&
+ (ArgumentPromotionPass::isDenselyPacked(AgTy, DL) ||
+ !canPaddingBeAccessed(PtrArg));
if (isSafeToPromote) {
if (StructType *STy = dyn_cast<StructType>(AgTy)) {
if (MaxElements > 0 && STy->getNumElements() > MaxElements) {
@@ -1011,8 +1008,8 @@ promoteArguments(Function *F, function_ref<AAResults &(Function &F)> AARGetter,
if (ArgsToPromote.empty() && ByValArgsToTransform.empty())
return nullptr;
- if (!areFunctionArgsABICompatible(*F, TTI, ArgsToPromote,
- ByValArgsToTransform))
+ if (!ArgumentPromotionPass::areFunctionArgsABICompatible(
+ *F, TTI, ArgsToPromote, ByValArgsToTransform))
return nullptr;
return doPromotion(F, ArgsToPromote, ByValArgsToTransform, ReplaceCallSite);
@@ -1135,14 +1132,13 @@ bool ArgPromotion::runOnSCC(CallGraphSCC &SCC) {
if (!OldF)
continue;
- auto ReplaceCallSite = [&](CallSite OldCS, CallSite NewCS) {
- Function *Caller = OldCS.getInstruction()->getParent()->getParent();
+ auto ReplaceCallSite = [&](CallBase &OldCS, CallBase &NewCS) {
+ Function *Caller = OldCS.getParent()->getParent();
CallGraphNode *NewCalleeNode =
CG.getOrInsertFunction(NewCS.getCalledFunction());
CallGraphNode *CallerNode = CG[Caller];
- CallerNode->replaceCallEdge(*cast<CallBase>(OldCS.getInstruction()),
- *cast<CallBase>(NewCS.getInstruction()),
- NewCalleeNode);
+ CallerNode->replaceCallEdge(cast<CallBase>(OldCS),
+ cast<CallBase>(NewCS), NewCalleeNode);
};
const TargetTransformInfo &TTI =
diff --git a/llvm/lib/Transforms/IPO/Attributor.cpp b/llvm/lib/Transforms/IPO/Attributor.cpp
index f2995817eaf8..f96dac5f3515 100644
--- a/llvm/lib/Transforms/IPO/Attributor.cpp
+++ b/llvm/lib/Transforms/IPO/Attributor.cpp
@@ -6,7 +6,7 @@
//
//===----------------------------------------------------------------------===//
//
-// This file implements an inter procedural pass that deduces and/or propagating
+// This file implements an interprocedural pass that deduces and/or propagates
// attributes. This is done in an abstract interpretation style fixpoint
// iteration. See the Attributor.h file comment and the class descriptions in
// that file for more information.
@@ -15,29 +15,16 @@
#include "llvm/Transforms/IPO/Attributor.h"
-#include "llvm/ADT/DepthFirstIterator.h"
-#include "llvm/ADT/STLExtras.h"
-#include "llvm/ADT/SmallPtrSet.h"
-#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/Analysis/CaptureTracking.h"
-#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LazyValueInfo.h"
-#include "llvm/Analysis/Loads.h"
-#include "llvm/Analysis/MemoryBuiltins.h"
-#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/Argument.h"
-#include "llvm/IR/Attributes.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/NoFolder.h"
#include "llvm/IR/Verifier.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -47,10 +34,12 @@ using namespace llvm;
#define DEBUG_TYPE "attributor"
+STATISTIC(NumFnDeleted, "Number of functions deleted");
STATISTIC(NumFnWithExactDefinition,
- "Number of function with exact definitions");
+ "Number of functions with exact definitions");
STATISTIC(NumFnWithoutExactDefinition,
- "Number of function without exact definitions");
+ "Number of functions without exact definitions");
+STATISTIC(NumFnShallowWrapperCreated, "Number of shallow wrappers created");
STATISTIC(NumAttributesTimedOut,
"Number of abstract attributes timed out before fixpoint");
STATISTIC(NumAttributesValidFixpoint,
@@ -60,80 +49,6 @@ STATISTIC(NumAttributesManifested,
STATISTIC(NumAttributesFixedDueToRequiredDependences,
"Number of abstract attributes fixed due to required dependences");
-// Some helper macros to deal with statistics tracking.
-//
-// Usage:
-// For simple IR attribute tracking overload trackStatistics in the abstract
-// attribute and choose the right STATS_DECLTRACK_********* macro,
-// e.g.,:
-// void trackStatistics() const override {
-// STATS_DECLTRACK_ARG_ATTR(returned)
-// }
-// If there is a single "increment" side one can use the macro
-// STATS_DECLTRACK with a custom message. If there are multiple increment
-// sides, STATS_DECL and STATS_TRACK can also be used separatly.
-//
-#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \
- ("Number of " #TYPE " marked '" #NAME "'")
-#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME
-#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG);
-#define STATS_DECL(NAME, TYPE, MSG) \
- STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG);
-#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE));
-#define STATS_DECLTRACK(NAME, TYPE, MSG) \
- { \
- STATS_DECL(NAME, TYPE, MSG) \
- STATS_TRACK(NAME, TYPE) \
- }
-#define STATS_DECLTRACK_ARG_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME))
-#define STATS_DECLTRACK_CSARG_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CSArguments, \
- BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME))
-#define STATS_DECLTRACK_FN_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME))
-#define STATS_DECLTRACK_CS_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME))
-#define STATS_DECLTRACK_FNRET_ATTR(NAME) \
- STATS_DECLTRACK(NAME, FunctionReturn, \
- BUILD_STAT_MSG_IR_ATTR(function returns, NAME))
-#define STATS_DECLTRACK_CSRET_ATTR(NAME) \
- STATS_DECLTRACK(NAME, CSReturn, \
- BUILD_STAT_MSG_IR_ATTR(call site returns, NAME))
-#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \
- STATS_DECLTRACK(NAME, Floating, \
- ("Number of floating values known to be '" #NAME "'"))
-
-// Specialization of the operator<< for abstract attributes subclasses. This
-// disambiguates situations where multiple operators are applicable.
-namespace llvm {
-#define PIPE_OPERATOR(CLASS) \
- raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \
- return OS << static_cast<const AbstractAttribute &>(AA); \
- }
-
-PIPE_OPERATOR(AAIsDead)
-PIPE_OPERATOR(AANoUnwind)
-PIPE_OPERATOR(AANoSync)
-PIPE_OPERATOR(AANoRecurse)
-PIPE_OPERATOR(AAWillReturn)
-PIPE_OPERATOR(AANoReturn)
-PIPE_OPERATOR(AAReturnedValues)
-PIPE_OPERATOR(AANonNull)
-PIPE_OPERATOR(AANoAlias)
-PIPE_OPERATOR(AADereferenceable)
-PIPE_OPERATOR(AAAlign)
-PIPE_OPERATOR(AANoCapture)
-PIPE_OPERATOR(AAValueSimplify)
-PIPE_OPERATOR(AANoFree)
-PIPE_OPERATOR(AAHeapToStack)
-PIPE_OPERATOR(AAReachability)
-PIPE_OPERATOR(AAMemoryBehavior)
-PIPE_OPERATOR(AAValueConstantRange)
-
-#undef PIPE_OPERATOR
-} // namespace llvm
-
// TODO: Determine a good default value.
//
// In the LLVM-TS and SPEC2006, 32 seems to not induce compile time overheads
@@ -151,30 +66,24 @@ static cl::opt<bool> VerifyMaxFixpointIterations(
cl::desc("Verify that max-iterations is a tight bound for a fixpoint"),
cl::init(false));
-static cl::opt<bool> DisableAttributor(
- "attributor-disable", cl::Hidden,
- cl::desc("Disable the attributor inter-procedural deduction pass."),
- cl::init(true));
-
static cl::opt<bool> AnnotateDeclarationCallSites(
"attributor-annotate-decl-cs", cl::Hidden,
cl::desc("Annotate call sites of function declarations."), cl::init(false));
-static cl::opt<bool> ManifestInternal(
- "attributor-manifest-internal", cl::Hidden,
- cl::desc("Manifest Attributor internal string attributes."),
- cl::init(false));
-
-static cl::opt<unsigned> DepRecInterval(
- "attributor-dependence-recompute-interval", cl::Hidden,
- cl::desc("Number of iterations until dependences are recomputed."),
- cl::init(4));
-
static cl::opt<bool> EnableHeapToStack("enable-heap-to-stack-conversion",
cl::init(true), cl::Hidden);
-static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
- cl::Hidden);
+static cl::opt<bool>
+ AllowShallowWrappers("attributor-allow-shallow-wrappers", cl::Hidden,
+ cl::desc("Allow the Attributor to create shallow "
+ "wrappers for non-exact definitions."),
+ cl::init(false));
+
+static cl::list<std::string>
+ SeedAllowList("attributor-seed-allow-list", cl::Hidden,
+ cl::desc("Comma seperated list of attrbute names that are "
+ "allowed to be seeded."),
+ cl::ZeroOrMore, cl::CommaSeparated);
/// Logic operators for the change status enum class.
///
@@ -187,6 +96,49 @@ ChangeStatus llvm::operator&(ChangeStatus l, ChangeStatus r) {
}
///}
+/// Return true if \p New is equal or worse than \p Old.
+static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
+ if (!Old.isIntAttribute())
+ return true;
+
+ return Old.getValueAsInt() >= New.getValueAsInt();
+}
+
+/// Return true if the information provided by \p Attr was added to the
+/// attribute list \p Attrs. This is only the case if it was not already present
+/// in \p Attrs at the position described by \p PK and \p AttrIdx.
+static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
+ AttributeList &Attrs, int AttrIdx) {
+
+ if (Attr.isEnumAttribute()) {
+ Attribute::AttrKind Kind = Attr.getKindAsEnum();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+ if (Attr.isStringAttribute()) {
+ StringRef Kind = Attr.getKindAsString();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+ if (Attr.isIntAttribute()) {
+ Attribute::AttrKind Kind = Attr.getKindAsEnum();
+ if (Attrs.hasAttribute(AttrIdx, Kind))
+ if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
+ return false;
+ Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
+ Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
+ return true;
+ }
+
+ llvm_unreachable("Expected enum or string attribute!");
+}
+
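// Illustrative sketch (not part of this patch): for integer attributes the
// helper above only replaces what is already there when the new attribute is
// strictly stronger. Assuming an AttributeList `Attrs` in context `Ctx` that
// already carries dereferenceable(16) at index `Idx`:
addIfNotExistent(Ctx, Attribute::getWithDereferenceableBytes(Ctx, 8), Attrs,
                 Idx);  // false: dereferenceable(8) is equal or worse than 16
addIfNotExistent(Ctx, Attribute::getWithDereferenceableBytes(Ctx, 32), Attrs,
                 Idx);  // true: dereferenceable(16) is dropped and 32 is added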
Argument *IRPosition::getAssociatedArgument() const {
if (getPositionKind() == IRP_ARGUMENT)
return cast<Argument>(&getAnchorValue());
@@ -202,10 +154,10 @@ Argument *IRPosition::getAssociatedArgument() const {
// of the underlying call site operand, we want the corresponding callback
// callee argument and not the direct callee argument.
Optional<Argument *> CBCandidateArg;
- SmallVector<const Use *, 4> CBUses;
- ImmutableCallSite ICS(&getAnchorValue());
- AbstractCallSite::getCallbackUses(ICS, CBUses);
- for (const Use *U : CBUses) {
+ SmallVector<const Use *, 4> CallbackUses;
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ AbstractCallSite::getCallbackUses(CB, CallbackUses);
+ for (const Use *U : CallbackUses) {
AbstractCallSite ACS(U);
assert(ACS && ACS.isCallbackCall());
if (!ACS.getCalledFunction())
@@ -234,176 +186,13 @@ Argument *IRPosition::getAssociatedArgument() const {
// If no callbacks were found, or none used the underlying call site operand
// exclusively, use the direct callee argument if available.
- const Function *Callee = ICS.getCalledFunction();
+ const Function *Callee = CB.getCalledFunction();
if (Callee && Callee->arg_size() > unsigned(ArgNo))
return Callee->getArg(ArgNo);
return nullptr;
}
-/// For calls (and invokes) we will only replace instruction uses to not disturb
-/// the old style call graph.
-/// TODO: Remove this once we get rid of the old PM.
-static void replaceAllInstructionUsesWith(Value &Old, Value &New) {
- if (!isa<CallBase>(Old))
- return Old.replaceAllUsesWith(&New);
- SmallVector<Use *, 8> Uses;
- for (Use &U : Old.uses())
- if (isa<Instruction>(U.getUser()))
- Uses.push_back(&U);
- for (Use *U : Uses)
- U->set(&New);
-}
-
-/// Recursively visit all values that might become \p IRP at some point. This
-/// will be done by looking through cast instructions, selects, phis, and calls
-/// with the "returned" attribute. Once we cannot look through the value any
-/// further, the callback \p VisitValueCB is invoked and passed the current
-/// value, the \p State, and a flag to indicate if we stripped anything. To
-/// limit how much effort is invested, we will never visit more values than
-/// specified by \p MaxValues.
-template <typename AAType, typename StateTy>
-static bool genericValueTraversal(
- Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State,
- const function_ref<bool(Value &, StateTy &, bool)> &VisitValueCB,
- int MaxValues = 8) {
-
- const AAIsDead *LivenessAA = nullptr;
- if (IRP.getAnchorScope())
- LivenessAA = &A.getAAFor<AAIsDead>(
- QueryingAA, IRPosition::function(*IRP.getAnchorScope()),
- /* TrackDependence */ false);
- bool AnyDead = false;
-
- // TODO: Use Positions here to allow context sensitivity in VisitValueCB
- SmallPtrSet<Value *, 16> Visited;
- SmallVector<Value *, 16> Worklist;
- Worklist.push_back(&IRP.getAssociatedValue());
-
- int Iteration = 0;
- do {
- Value *V = Worklist.pop_back_val();
-
- // Check if we should process the current value. To prevent endless
- // recursion keep a record of the values we followed!
- if (!Visited.insert(V).second)
- continue;
-
- // Make sure we limit the compile time for complex expressions.
- if (Iteration++ >= MaxValues)
- return false;
-
- // Explicitly look through calls with a "returned" attribute if we do
- // not have a pointer as stripPointerCasts only works on them.
- Value *NewV = nullptr;
- if (V->getType()->isPointerTy()) {
- NewV = V->stripPointerCasts();
- } else {
- CallSite CS(V);
- if (CS && CS.getCalledFunction()) {
- for (Argument &Arg : CS.getCalledFunction()->args())
- if (Arg.hasReturnedAttr()) {
- NewV = CS.getArgOperand(Arg.getArgNo());
- break;
- }
- }
- }
- if (NewV && NewV != V) {
- Worklist.push_back(NewV);
- continue;
- }
-
- // Look through select instructions, visit both potential values.
- if (auto *SI = dyn_cast<SelectInst>(V)) {
- Worklist.push_back(SI->getTrueValue());
- Worklist.push_back(SI->getFalseValue());
- continue;
- }
-
- // Look through phi nodes, visit all live operands.
- if (auto *PHI = dyn_cast<PHINode>(V)) {
- assert(LivenessAA &&
- "Expected liveness in the presence of instructions!");
- for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
- const BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
- if (LivenessAA->isAssumedDead(IncomingBB->getTerminator())) {
- AnyDead = true;
- continue;
- }
- Worklist.push_back(PHI->getIncomingValue(u));
- }
- continue;
- }
-
- // Once a leaf is reached we inform the user through the callback.
- if (!VisitValueCB(*V, State, Iteration > 1))
- return false;
- } while (!Worklist.empty());
-
- // If we actually used liveness information so we have to record a dependence.
- if (AnyDead)
- A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
-
- // All values have been visited.
- return true;
-}
-
-/// Return true if \p New is equal or worse than \p Old.
-static bool isEqualOrWorse(const Attribute &New, const Attribute &Old) {
- if (!Old.isIntAttribute())
- return true;
-
- return Old.getValueAsInt() >= New.getValueAsInt();
-}
-
-/// Return true if the information provided by \p Attr was added to the
-/// attribute list \p Attrs. This is only the case if it was not already present
-/// in \p Attrs at the position describe by \p PK and \p AttrIdx.
-static bool addIfNotExistent(LLVMContext &Ctx, const Attribute &Attr,
- AttributeList &Attrs, int AttrIdx) {
-
- if (Attr.isEnumAttribute()) {
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
- if (Attr.isStringAttribute()) {
- StringRef Kind = Attr.getKindAsString();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
- if (Attr.isIntAttribute()) {
- Attribute::AttrKind Kind = Attr.getKindAsEnum();
- if (Attrs.hasAttribute(AttrIdx, Kind))
- if (isEqualOrWorse(Attr, Attrs.getAttribute(AttrIdx, Kind)))
- return false;
- Attrs = Attrs.removeAttribute(Ctx, AttrIdx, Kind);
- Attrs = Attrs.addAttribute(Ctx, AttrIdx, Attr);
- return true;
- }
-
- llvm_unreachable("Expected enum or string attribute!");
-}
-
-static const Value *
-getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
- const DataLayout &DL,
- bool AllowNonInbounds = false) {
- const Value *Ptr =
- Attributor::getPointerOperand(I, /* AllowVolatile */ false);
- if (!Ptr)
- return nullptr;
-
- return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
- AllowNonInbounds);
-}
-
ChangeStatus AbstractAttribute::update(Attributor &A) {
ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
if (getState().isAtFixpoint())
@@ -422,7 +211,7 @@ ChangeStatus AbstractAttribute::update(Attributor &A) {
ChangeStatus
IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
const ArrayRef<Attribute> &DeducedAttrs) {
- Function *ScopeFn = IRP.getAssociatedFunction();
+ Function *ScopeFn = IRP.getAnchorScope();
IRPosition::Kind PK = IRP.getPositionKind();
// In the following some generic code that will manifest attributes in
@@ -442,7 +231,7 @@ IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
case IRPosition::IRP_CALL_SITE:
case IRPosition::IRP_CALL_SITE_RETURNED:
case IRPosition::IRP_CALL_SITE_ARGUMENT:
- Attrs = ImmutableCallSite(&IRP.getAnchorValue()).getAttributes();
+ Attrs = cast<CallBase>(IRP.getAnchorValue()).getAttributes();
break;
}
@@ -467,7 +256,7 @@ IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
case IRPosition::IRP_CALL_SITE:
case IRPosition::IRP_CALL_SITE_RETURNED:
case IRPosition::IRP_CALL_SITE_ARGUMENT:
- CallSite(&IRP.getAnchorValue()).setAttributes(Attrs);
+ cast<CallBase>(IRP.getAnchorValue()).setAttributes(Attrs);
break;
case IRPosition::IRP_INVALID:
case IRPosition::IRP_FLOAT:
@@ -477,13 +266,14 @@ IRAttributeManifest::manifestAttrs(Attributor &A, const IRPosition &IRP,
return HasChanged;
}
-const IRPosition IRPosition::EmptyKey(255);
-const IRPosition IRPosition::TombstoneKey(256);
+const IRPosition IRPosition::EmptyKey(DenseMapInfo<void *>::getEmptyKey());
+const IRPosition
+ IRPosition::TombstoneKey(DenseMapInfo<void *>::getTombstoneKey());
SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
IRPositions.emplace_back(IRP);
- ImmutableCallSite ICS(&IRP.getAnchorValue());
+ const auto *CB = dyn_cast<CallBase>(&IRP.getAnchorValue());
switch (IRP.getPositionKind()) {
case IRPosition::IRP_INVALID:
case IRPosition::IRP_FLOAT:
@@ -491,37 +281,43 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
return;
case IRPosition::IRP_ARGUMENT:
case IRPosition::IRP_RETURNED:
- IRPositions.emplace_back(
- IRPosition::function(*IRP.getAssociatedFunction()));
+ IRPositions.emplace_back(IRPosition::function(*IRP.getAnchorScope()));
return;
case IRPosition::IRP_CALL_SITE:
- assert(ICS && "Expected call site!");
+ assert(CB && "Expected call site!");
// TODO: We need to look at the operand bundles similar to the redirection
// in CallBase.
- if (!ICS.hasOperandBundles())
- if (const Function *Callee = ICS.getCalledFunction())
+ if (!CB->hasOperandBundles())
+ if (const Function *Callee = CB->getCalledFunction())
IRPositions.emplace_back(IRPosition::function(*Callee));
return;
case IRPosition::IRP_CALL_SITE_RETURNED:
- assert(ICS && "Expected call site!");
+ assert(CB && "Expected call site!");
// TODO: We need to look at the operand bundles similar to the redirection
// in CallBase.
- if (!ICS.hasOperandBundles()) {
- if (const Function *Callee = ICS.getCalledFunction()) {
+ if (!CB->hasOperandBundles()) {
+ if (const Function *Callee = CB->getCalledFunction()) {
IRPositions.emplace_back(IRPosition::returned(*Callee));
IRPositions.emplace_back(IRPosition::function(*Callee));
+ for (const Argument &Arg : Callee->args())
+ if (Arg.hasReturnedAttr()) {
+ IRPositions.emplace_back(
+ IRPosition::callsite_argument(*CB, Arg.getArgNo()));
+ IRPositions.emplace_back(
+ IRPosition::value(*CB->getArgOperand(Arg.getArgNo())));
+ IRPositions.emplace_back(IRPosition::argument(Arg));
+ }
}
}
- IRPositions.emplace_back(
- IRPosition::callsite_function(cast<CallBase>(*ICS.getInstruction())));
+ IRPositions.emplace_back(IRPosition::callsite_function(*CB));
return;
case IRPosition::IRP_CALL_SITE_ARGUMENT: {
int ArgNo = IRP.getArgNo();
- assert(ICS && ArgNo >= 0 && "Expected call site!");
+ assert(CB && ArgNo >= 0 && "Expected call site!");
// TODO: We need to look at the operand bundles similar to the redirection
// in CallBase.
- if (!ICS.hasOperandBundles()) {
- const Function *Callee = ICS.getCalledFunction();
+ if (!CB->hasOperandBundles()) {
+ const Function *Callee = CB->getCalledFunction();
if (Callee && Callee->arg_size() > unsigned(ArgNo))
IRPositions.emplace_back(IRPosition::argument(*Callee->getArg(ArgNo)));
if (Callee)
@@ -534,10 +330,11 @@ SubsumingPositionIterator::SubsumingPositionIterator(const IRPosition &IRP) {
}
bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs,
- bool IgnoreSubsumingPositions) const {
+ bool IgnoreSubsumingPositions, Attributor *A) const {
+ SmallVector<Attribute, 4> Attrs;
for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
for (Attribute::AttrKind AK : AKs)
- if (EquivIRP.getAttr(AK).getKindAsEnum() == AK)
+ if (EquivIRP.getAttrsFromIRAttr(AK, Attrs))
return true;
// The first position returned by the SubsumingPositionIterator is
// always the position itself. If we ignore subsuming positions we
@@ -545,5052 +342,300 @@ bool IRPosition::hasAttr(ArrayRef<Attribute::AttrKind> AKs,
if (IgnoreSubsumingPositions)
break;
}
+ if (A)
+ for (Attribute::AttrKind AK : AKs)
+ if (getAttrsFromAssumes(AK, Attrs, *A))
+ return true;
return false;
}
void IRPosition::getAttrs(ArrayRef<Attribute::AttrKind> AKs,
SmallVectorImpl<Attribute> &Attrs,
- bool IgnoreSubsumingPositions) const {
+ bool IgnoreSubsumingPositions, Attributor *A) const {
for (const IRPosition &EquivIRP : SubsumingPositionIterator(*this)) {
- for (Attribute::AttrKind AK : AKs) {
- const Attribute &Attr = EquivIRP.getAttr(AK);
- if (Attr.getKindAsEnum() == AK)
- Attrs.push_back(Attr);
- }
+ for (Attribute::AttrKind AK : AKs)
+ EquivIRP.getAttrsFromIRAttr(AK, Attrs);
// The first position returned by the SubsumingPositionIterator is
// always the position itself. If we ignore subsuming positions we
// are done after the first iteration.
if (IgnoreSubsumingPositions)
break;
}
+ if (A)
+ for (Attribute::AttrKind AK : AKs)
+ getAttrsFromAssumes(AK, Attrs, *A);
+}
+
+bool IRPosition::getAttrsFromIRAttr(Attribute::AttrKind AK,
+ SmallVectorImpl<Attribute> &Attrs) const {
+ if (getPositionKind() == IRP_INVALID || getPositionKind() == IRP_FLOAT)
+ return false;
+
+ AttributeList AttrList;
+ if (const auto *CB = dyn_cast<CallBase>(&getAnchorValue()))
+ AttrList = CB->getAttributes();
+ else
+ AttrList = getAssociatedFunction()->getAttributes();
+
+ bool HasAttr = AttrList.hasAttribute(getAttrIdx(), AK);
+ if (HasAttr)
+ Attrs.push_back(AttrList.getAttribute(getAttrIdx(), AK));
+ return HasAttr;
+}
+
+bool IRPosition::getAttrsFromAssumes(Attribute::AttrKind AK,
+ SmallVectorImpl<Attribute> &Attrs,
+ Attributor &A) const {
+ assert(getPositionKind() != IRP_INVALID && "Expected a valid position!");
+ Value &AssociatedValue = getAssociatedValue();
+
+ const Assume2KnowledgeMap &A2K =
+ A.getInfoCache().getKnowledgeMap().lookup({&AssociatedValue, AK});
+
+ // Check if we found any potential assume use, if not we don't need to create
+ // explorer iterators.
+ if (A2K.empty())
+ return false;
+
+ LLVMContext &Ctx = AssociatedValue.getContext();
+ unsigned AttrsSize = Attrs.size();
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+ auto EIt = Explorer.begin(getCtxI()), EEnd = Explorer.end(getCtxI());
+ for (auto &It : A2K)
+ if (Explorer.findInContextOf(It.first, EIt, EEnd))
+ Attrs.push_back(Attribute::get(Ctx, AK, It.second.Max));
+ return AttrsSize != Attrs.size();
}
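// Illustrative note (not part of this patch): the knowledge map queried above
// is fed by llvm.assume operand bundles, e.g.
//   call void @llvm.assume(i1 true) ["align"(i8* %p, i64 16)]
// For an entry keyed by (%p, Attribute::Alignment) the lookup only turns the
// recorded Max value into an attribute if the assume lies in the
// must-be-executed context of this position's context instruction, i.e. it is
// guaranteed to run whenever that instruction runs.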
void IRPosition::verify() {
- switch (KindOrArgNo) {
- default:
- assert(KindOrArgNo >= 0 && "Expected argument or call site argument!");
- assert((isa<CallBase>(AnchorVal) || isa<Argument>(AnchorVal)) &&
- "Expected call base or argument for positive attribute index!");
- if (isa<Argument>(AnchorVal)) {
- assert(cast<Argument>(AnchorVal)->getArgNo() == unsigned(getArgNo()) &&
- "Argument number mismatch!");
- assert(cast<Argument>(AnchorVal) == &getAssociatedValue() &&
- "Associated value mismatch!");
- } else {
- assert(cast<CallBase>(*AnchorVal).arg_size() > unsigned(getArgNo()) &&
- "Call site argument number mismatch!");
- assert(cast<CallBase>(*AnchorVal).getArgOperand(getArgNo()) ==
- &getAssociatedValue() &&
- "Associated value mismatch!");
- }
- break;
+#ifdef EXPENSIVE_CHECKS
+ switch (getPositionKind()) {
case IRP_INVALID:
- assert(!AnchorVal && "Expected no value for an invalid position!");
- break;
+ assert(!Enc.getOpaqueValue() &&
+ "Expected a nullptr for an invalid position!");
+ return;
case IRP_FLOAT:
assert((!isa<CallBase>(&getAssociatedValue()) &&
!isa<Argument>(&getAssociatedValue())) &&
"Expected specialized kind for call base and argument values!");
- break;
+ return;
case IRP_RETURNED:
- assert(isa<Function>(AnchorVal) &&
+ assert(isa<Function>(getAsValuePtr()) &&
"Expected function for a 'returned' position!");
- assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!");
- break;
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
case IRP_CALL_SITE_RETURNED:
- assert((isa<CallBase>(AnchorVal)) &&
+ assert((isa<CallBase>(getAsValuePtr())) &&
"Expected call base for 'call site returned' position!");
- assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!");
- break;
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
case IRP_CALL_SITE:
- assert((isa<CallBase>(AnchorVal)) &&
+ assert((isa<CallBase>(getAsValuePtr())) &&
"Expected call base for 'call site function' position!");
- assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!");
- break;
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
case IRP_FUNCTION:
- assert(isa<Function>(AnchorVal) &&
+ assert(isa<Function>(getAsValuePtr()) &&
"Expected function for a 'function' position!");
- assert(AnchorVal == &getAssociatedValue() && "Associated value mismatch!");
- break;
- }
-}
-
-namespace {
-/// Helper function to clamp a state \p S of type \p StateType with the
-/// information in \p R and indicate/return if \p S did change (as-in update is
-/// required to be run again).
-template <typename StateType>
-ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) {
- auto Assumed = S.getAssumed();
- S ^= R;
- return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
-}
-
-/// Clamp the information known for all returned values of a function
-/// (identified by \p QueryingAA) into \p S.
-template <typename AAType, typename StateType = typename AAType::StateType>
-static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
- StateType &S) {
- LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for "
- << QueryingAA << " into " << S << "\n");
-
- assert((QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_RETURNED ||
- QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_CALL_SITE_RETURNED) &&
- "Can only clamp returned value states for a function returned or call "
- "site returned position!");
-
- // Use an optional state as there might not be any return values and we want
- // to join (IntegerState::operator&) the state of all there are.
- Optional<StateType> T;
-
- // Callback for each possibly returned value.
- auto CheckReturnValue = [&](Value &RV) -> bool {
- const IRPosition &RVPos = IRPosition::value(RV);
- const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
- LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
- << " @ " << RVPos << "\n");
- const StateType &AAS = static_cast<const StateType &>(AA.getState());
- if (T.hasValue())
- *T &= AAS;
- else
- T = AAS;
- LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
- << "\n");
- return T->isValidState();
- };
-
- if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
- S.indicatePessimisticFixpoint();
- else if (T.hasValue())
- S ^= *T;
-}
-
-/// Helper class to compose two generic deduction
-template <typename AAType, typename Base, typename StateType,
- template <typename...> class F, template <typename...> class G>
-struct AAComposeTwoGenericDeduction
- : public F<AAType, G<AAType, Base, StateType>, StateType> {
- AAComposeTwoGenericDeduction(const IRPosition &IRP)
- : F<AAType, G<AAType, Base, StateType>, StateType>(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus ChangedF =
- F<AAType, G<AAType, Base, StateType>, StateType>::updateImpl(A);
- ChangeStatus ChangedG = G<AAType, Base, StateType>::updateImpl(A);
- return ChangedF | ChangedG;
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_ARGUMENT:
+ assert(isa<Argument>(getAsValuePtr()) &&
+ "Expected argument for a 'argument' position!");
+ assert(getAsValuePtr() == &getAssociatedValue() &&
+ "Associated value mismatch!");
+ return;
+ case IRP_CALL_SITE_ARGUMENT: {
+ Use *U = getAsUsePtr();
+ assert(U && "Expected use for a 'call site argument' position!");
+ assert(isa<CallBase>(U->getUser()) &&
+ "Expected call base user for a 'call site argument' position!");
+ assert(cast<CallBase>(U->getUser())->isArgOperand(U) &&
+ "Expected call base argument operand for a 'call site argument' "
+ "position");
+ assert(cast<CallBase>(U->getUser())->getArgOperandNo(U) ==
+ unsigned(getArgNo()) &&
+ "Argument number mismatch!");
+ assert(U->get() == &getAssociatedValue() && "Associated value mismatch!");
+ return;
}
-};
-
-/// Helper class for generic deduction: return value -> returned position.
-template <typename AAType, typename Base,
- typename StateType = typename AAType::StateType>
-struct AAReturnedFromReturnedValues : public Base {
- AAReturnedFromReturnedValues(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- StateType S;
- clampReturnedValueStates<AAType, StateType>(A, *this, S);
- // TODO: If we know we visited all returned values, thus no are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
}
-};
-
-/// Clamp the information known at all call sites for a given argument
-/// (identified by \p QueryingAA) into \p S.
-template <typename AAType, typename StateType = typename AAType::StateType>
-static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
- StateType &S) {
- LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
- << QueryingAA << " into " << S << "\n");
-
- assert(QueryingAA.getIRPosition().getPositionKind() ==
- IRPosition::IRP_ARGUMENT &&
- "Can only clamp call site argument states for an argument position!");
-
- // Use an optional state as there might not be any return values and we want
- // to join (IntegerState::operator&) the state of all there are.
- Optional<StateType> T;
-
- // The argument number which is also the call site argument number.
- unsigned ArgNo = QueryingAA.getIRPosition().getArgNo();
-
- auto CallSiteCheck = [&](AbstractCallSite ACS) {
- const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
- // Check if a coresponding argument was found or if it is on not associated
- // (which can happen for callback calls).
- if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
- return false;
-
- const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
- LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
- << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
- const StateType &AAS = static_cast<const StateType &>(AA.getState());
- if (T.hasValue())
- *T &= AAS;
- else
- T = AAS;
- LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T
- << "\n");
- return T->isValidState();
- };
-
- if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true))
- S.indicatePessimisticFixpoint();
- else if (T.hasValue())
- S ^= *T;
+#endif
}
-/// Helper class for generic deduction: call site argument -> argument position.
-template <typename AAType, typename Base,
- typename StateType = typename AAType::StateType>
-struct AAArgumentFromCallSiteArguments : public Base {
- AAArgumentFromCallSiteArguments(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- StateType S;
- clampCallSiteArgumentStates<AAType, StateType>(A, *this, S);
- // TODO: If we know we visited all incoming values, thus no are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
- }
-};
-
-/// Helper class for generic replication: function returned -> cs returned.
-template <typename AAType, typename Base,
- typename StateType = typename AAType::StateType>
-struct AACallSiteReturnedFromReturned : public Base {
- AACallSiteReturnedFromReturned(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- assert(this->getIRPosition().getPositionKind() ==
- IRPosition::IRP_CALL_SITE_RETURNED &&
- "Can only wrap function returned positions for call site returned "
- "positions!");
- auto &S = this->getState();
-
- const Function *AssociatedFunction =
- this->getIRPosition().getAssociatedFunction();
- if (!AssociatedFunction)
- return S.indicatePessimisticFixpoint();
-
- IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
- const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
- return clampStateAndIndicateChange(
- S, static_cast<const typename AAType::StateType &>(AA.getState()));
- }
-};
-
-/// Helper class for generic deduction using must-be-executed-context
-/// Base class is required to have `followUse` method.
-
-/// bool followUse(Attributor &A, const Use *U, const Instruction *I)
-/// U - Underlying use.
-/// I - The user of the \p U.
-/// `followUse` returns true if the value should be tracked transitively.
-
-template <typename AAType, typename Base,
- typename StateType = typename AAType::StateType>
-struct AAFromMustBeExecutedContext : public Base {
- AAFromMustBeExecutedContext(const IRPosition &IRP) : Base(IRP) {}
-
- void initialize(Attributor &A) override {
- Base::initialize(A);
- const IRPosition &IRP = this->getIRPosition();
- Instruction *CtxI = IRP.getCtxI();
-
- if (!CtxI)
- return;
-
- for (const Use &U : IRP.getAssociatedValue().uses())
- Uses.insert(&U);
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto BeforeState = this->getState();
- auto &S = this->getState();
- Instruction *CtxI = this->getIRPosition().getCtxI();
- if (!CtxI)
- return ChangeStatus::UNCHANGED;
-
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
-
- auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI);
- for (unsigned u = 0; u < Uses.size(); ++u) {
- const Use *U = Uses[u];
- if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) {
- bool Found = Explorer.findInContextOf(UserI, EIt, EEnd);
- if (Found && Base::followUse(A, U, UserI))
- for (const Use &Us : UserI->uses())
- Uses.insert(&Us);
- }
- }
-
- return BeforeState == S ? ChangeStatus::UNCHANGED : ChangeStatus::CHANGED;
- }
-
-private:
- /// Container for (transitive) uses of the associated value.
- SetVector<const Use *> Uses;
-};
-
-template <typename AAType, typename Base,
- typename StateType = typename AAType::StateType>
-using AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext =
- AAComposeTwoGenericDeduction<AAType, Base, StateType,
- AAFromMustBeExecutedContext,
- AAArgumentFromCallSiteArguments>;
-
-template <typename AAType, typename Base,
- typename StateType = typename AAType::StateType>
-using AACallSiteReturnedFromReturnedAndMustBeExecutedContext =
- AAComposeTwoGenericDeduction<AAType, Base, StateType,
- AAFromMustBeExecutedContext,
- AACallSiteReturnedFromReturned>;
-
-/// -----------------------NoUnwind Function Attribute--------------------------
-
-struct AANoUnwindImpl : AANoUnwind {
- AANoUnwindImpl(const IRPosition &IRP) : AANoUnwind(IRP) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "nounwind" : "may-unwind";
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto Opcodes = {
- (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
- (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet,
- (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume};
-
- auto CheckForNoUnwind = [&](Instruction &I) {
- if (!I.mayThrow())
- return true;
-
- if (ImmutableCallSite ICS = ImmutableCallSite(&I)) {
- const auto &NoUnwindAA =
- A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(ICS));
- return NoUnwindAA.isAssumedNoUnwind();
- }
- return false;
- };
-
- if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-};
-
-struct AANoUnwindFunction final : public AANoUnwindImpl {
- AANoUnwindFunction(const IRPosition &IRP) : AANoUnwindImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) }
-};
-
-/// NoUnwind attribute deduction for a call sites.
-struct AANoUnwindCallSite final : AANoUnwindImpl {
- AANoUnwindCallSite(const IRPosition &IRP) : AANoUnwindImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoUnwindImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AANoUnwind::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
-};
-
-/// --------------------- Function Return Values -------------------------------
-
-/// "Attribute" that collects all potential returned values and the return
-/// instructions that they arise from.
-///
-/// If there is a unique returned value R, the manifest method will:
-/// - mark R with the "returned" attribute, if R is an argument.
-class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState {
-
- /// Mapping of values potentially returned by the associated function to the
- /// return instructions that might return them.
- MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues;
-
- /// Mapping to remember the number of returned values for a call site such
- /// that we can avoid updates if nothing changed.
- DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA;
-
- /// Set of unresolved calls returned by the associated function.
- SmallSetVector<CallBase *, 4> UnresolvedCalls;
-
- /// State flags
- ///
- ///{
- bool IsFixed = false;
- bool IsValidState = true;
- ///}
-
-public:
- AAReturnedValuesImpl(const IRPosition &IRP) : AAReturnedValues(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // Reset the state.
- IsFixed = false;
- IsValidState = true;
- ReturnedValues.clear();
-
- Function *F = getAssociatedFunction();
- if (!F) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // The map from instruction opcodes to those instructions in the function.
- auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F);
-
- // Look through all arguments, if one is marked as returned we are done.
- for (Argument &Arg : F->args()) {
- if (Arg.hasReturnedAttr()) {
- auto &ReturnInstSet = ReturnedValues[&Arg];
- for (Instruction *RI : OpcodeInstMap[Instruction::Ret])
- ReturnInstSet.insert(cast<ReturnInst>(RI));
-
- indicateOptimisticFixpoint();
- return;
- }
- }
-
- if (!F->hasExactDefinition())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override;
-
- /// See AbstractAttribute::getState(...).
- AbstractState &getState() override { return *this; }
-
- /// See AbstractAttribute::getState(...).
- const AbstractState &getState() const override { return *this; }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- ChangeStatus updateImpl(Attributor &A) override;
-
- llvm::iterator_range<iterator> returned_values() override {
- return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
- }
-
- llvm::iterator_range<const_iterator> returned_values() const override {
- return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
- }
-
- const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override {
- return UnresolvedCalls;
- }
-
- /// Return the number of potential return values, -1 if unknown.
- size_t getNumReturnValues() const override {
- return isValidState() ? ReturnedValues.size() : -1;
- }
-
- /// Return an assumed unique return value if a single candidate is found. If
- /// there cannot be one, return a nullptr. If it is not clear yet, return the
- /// Optional::NoneType.
- Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const;
-
- /// See AbstractState::checkForAllReturnedValues(...).
- bool checkForAllReturnedValuesAndReturnInsts(
- const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)>
- &Pred) const override;
-
- /// Pretty print the attribute similar to the IR representation.
- const std::string getAsStr() const override;
-
- /// See AbstractState::isAtFixpoint().
- bool isAtFixpoint() const override { return IsFixed; }
-
- /// See AbstractState::isValidState().
- bool isValidState() const override { return IsValidState; }
-
- /// See AbstractState::indicateOptimisticFixpoint(...).
- ChangeStatus indicateOptimisticFixpoint() override {
- IsFixed = true;
- return ChangeStatus::UNCHANGED;
+Optional<Constant *>
+Attributor::getAssumedConstant(const Value &V, const AbstractAttribute &AA,
+ bool &UsedAssumedInformation) {
+ const auto &ValueSimplifyAA = getAAFor<AAValueSimplify>(
+ AA, IRPosition::value(V), /* TrackDependence */ false);
+ Optional<Value *> SimplifiedV =
+ ValueSimplifyAA.getAssumedSimplifiedValue(*this);
+ bool IsKnown = ValueSimplifyAA.isKnown();
+ UsedAssumedInformation |= !IsKnown;
+ if (!SimplifiedV.hasValue()) {
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return llvm::None;
}
-
- ChangeStatus indicatePessimisticFixpoint() override {
- IsFixed = true;
- IsValidState = false;
- return ChangeStatus::CHANGED;
+ if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue())) {
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return llvm::None;
}
-};
-
-ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- // Bookkeeping.
- assert(isValidState());
- STATS_DECLTRACK(KnownReturnValues, FunctionReturn,
- "Number of function with known return values");
-
- // Check if we have an assumed unique return value that we could manifest.
- Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A);
-
- if (!UniqueRV.hasValue() || !UniqueRV.getValue())
- return Changed;
-
- // Bookkeeping.
- STATS_DECLTRACK(UniqueReturnValue, FunctionReturn,
- "Number of function with unique return");
-
- // Callback to replace the uses of CB with the constant C.
- auto ReplaceCallSiteUsersWith = [](CallBase &CB, Constant &C) {
- if (CB.getNumUses() == 0 || CB.isMustTailCall())
- return ChangeStatus::UNCHANGED;
- replaceAllInstructionUsesWith(CB, C);
- return ChangeStatus::CHANGED;
- };
-
- // If the assumed unique return value is an argument, annotate it.
- if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) {
- // TODO: This should be handled differently!
- this->AnchorVal = UniqueRVArg;
- this->KindOrArgNo = UniqueRVArg->getArgNo();
- Changed = IRAttribute::manifest(A);
- } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) {
- // We can replace the returned value with the unique returned constant.
- Value &AnchorValue = getAnchorValue();
- if (Function *F = dyn_cast<Function>(&AnchorValue)) {
- for (const Use &U : F->uses())
- if (CallBase *CB = dyn_cast<CallBase>(U.getUser()))
- if (CB->isCallee(&U)) {
- Constant *RVCCast =
- CB->getType() == RVC->getType()
- ? RVC
- : ConstantExpr::getTruncOrBitCast(RVC, CB->getType());
- Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed;
- }
- } else {
- assert(isa<CallBase>(AnchorValue) &&
- "Expcected a function or call base anchor!");
- Constant *RVCCast =
- AnchorValue.getType() == RVC->getType()
- ? RVC
- : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType());
- Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast);
- }
- if (Changed == ChangeStatus::CHANGED)
- STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn,
- "Number of function returns replaced by constant return");
+ Constant *CI = dyn_cast_or_null<Constant>(SimplifiedV.getValue());
+ if (CI && CI->getType() != V.getType()) {
+ // TODO: Check for a safe conversion.
+ return nullptr;
}
-
- return Changed;
-}
-
-const std::string AAReturnedValuesImpl::getAsStr() const {
- return (isAtFixpoint() ? "returns(#" : "may-return(#") +
- (isValidState() ? std::to_string(getNumReturnValues()) : "?") +
- ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]";
+ if (CI)
+ recordDependence(ValueSimplifyAA, AA, DepClassTy::OPTIONAL);
+ return CI;
}
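// Illustrative sketch (not part of this patch): typical use of the helper above
// from an abstract attribute's updateImpl, with `A` the Attributor, `AA` the
// querying attribute, and `V` some value of interest.
bool UsedAssumedInformation = false;
Optional<Constant *> C = A.getAssumedConstant(V, AA, UsedAssumedInformation);
if (!C.hasValue()) {
  // llvm::None: simplification has not settled yet (or V folds to undef); a
  // dependence on the simplification AA was recorded, so we get re-run.
} else if (*C) {
  // V is known or assumed to equal *C; UsedAssumedInformation says whether the
  // result still rests on assumed, not yet fixed, information.
}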
-Optional<Value *>
-AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const {
- // If checkForAllReturnedValues provides a unique value, ignoring potential
- // undef values that can also be present, it is assumed to be the actual
- // return value and forwarded to the caller of this method. If there are
- // multiple, a nullptr is returned indicating there cannot be a unique
- // returned value.
- Optional<Value *> UniqueRV;
-
- auto Pred = [&](Value &RV) -> bool {
- // If we found a second returned value and neither the current nor the saved
- // one is an undef, there is no unique returned value. Undefs are special
- // since we can pretend they have any value.
- if (UniqueRV.hasValue() && UniqueRV != &RV &&
- !(isa<UndefValue>(RV) || isa<UndefValue>(UniqueRV.getValue()))) {
- UniqueRV = nullptr;
- return false;
- }
-
- // Do not overwrite a value with an undef.
- if (!UniqueRV.hasValue() || !isa<UndefValue>(RV))
- UniqueRV = &RV;
-
- return true;
- };
-
- if (!A.checkForAllReturnedValues(Pred, *this))
- UniqueRV = nullptr;
-
- return UniqueRV;
+Attributor::~Attributor() {
+ // The abstract attributes are allocated via the BumpPtrAllocator Allocator,
+ // thus we cannot delete them. We can, and want to, destruct them though.
+ for (AbstractAttribute *AA : AllAbstractAttributes)
+ AA->~AbstractAttribute();
}
-bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
- const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)>
- &Pred) const {
- if (!isValidState())
+bool Attributor::isAssumedDead(const AbstractAttribute &AA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ const IRPosition &IRP = AA.getIRPosition();
+ if (!Functions.count(IRP.getAnchorScope()))
return false;
-
- // Check all returned values but ignore call sites as long as we have not
- // encountered an overdefined one during an update.
- for (auto &It : ReturnedValues) {
- Value *RV = It.first;
-
- CallBase *CB = dyn_cast<CallBase>(RV);
- if (CB && !UnresolvedCalls.count(CB))
- continue;
-
- if (!Pred(*RV, It.second))
- return false;
- }
-
- return true;
-}
-
-ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
- size_t NumUnresolvedCalls = UnresolvedCalls.size();
- bool Changed = false;
-
- // State used in the value traversals starting in returned values.
- struct RVState {
- // The map in which we collect return values -> return instrs.
- decltype(ReturnedValues) &RetValsMap;
- // The flag to indicate a change.
- bool &Changed;
- // The return instrs we come from.
- SmallSetVector<ReturnInst *, 4> RetInsts;
- };
-
- // Callback for a leaf value returned by the associated function.
- auto VisitValueCB = [](Value &Val, RVState &RVS, bool) -> bool {
- auto Size = RVS.RetValsMap[&Val].size();
- RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end());
- bool Inserted = RVS.RetValsMap[&Val].size() != Size;
- RVS.Changed |= Inserted;
- LLVM_DEBUG({
- if (Inserted)
- dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val
- << " => " << RVS.RetInsts.size() << "\n";
- });
+ return isAssumedDead(IRP, &AA, FnLivenessAA, CheckBBLivenessOnly, DepClass);
+}
+
+bool Attributor::isAssumedDead(const Use &U,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ Instruction *UserI = dyn_cast<Instruction>(U.getUser());
+ if (!UserI)
+ return isAssumedDead(IRPosition::value(*U.get()), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ // For call site argument uses we can check if the argument is
+ // unused/dead.
+ if (CB->isArgOperand(&U)) {
+ const IRPosition &CSArgPos =
+ IRPosition::callsite_argument(*CB, CB->getArgOperandNo(&U));
+ return isAssumedDead(CSArgPos, QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+ }
+ } else if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) {
+ const IRPosition &RetPos = IRPosition::returned(*RI->getFunction());
+ return isAssumedDead(RetPos, QueryingAA, FnLivenessAA, CheckBBLivenessOnly,
+ DepClass);
+ } else if (PHINode *PHI = dyn_cast<PHINode>(UserI)) {
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(U);
+ return isAssumedDead(*IncomingBB->getTerminator(), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+ }
+
+ return isAssumedDead(IRPosition::value(*UserI), QueryingAA, FnLivenessAA,
+ CheckBBLivenessOnly, DepClass);
+}
+
+bool Attributor::isAssumedDead(const Instruction &I,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
+ if (!FnLivenessAA)
+ FnLivenessAA = lookupAAFor<AAIsDead>(IRPosition::function(*I.getFunction()),
+ QueryingAA,
+ /* TrackDependence */ false);
+
+ // If we have a context instruction and a liveness AA we use it.
+ if (FnLivenessAA &&
+ FnLivenessAA->getIRPosition().getAnchorScope() == I.getFunction() &&
+ FnLivenessAA->isAssumedDead(&I)) {
+ if (QueryingAA)
+ recordDependence(*FnLivenessAA, *QueryingAA, DepClass);
return true;
- };
-
- // Helper method to invoke the generic value traversal.
- auto VisitReturnedValue = [&](Value &RV, RVState &RVS) {
- IRPosition RetValPos = IRPosition::value(RV);
- return genericValueTraversal<AAReturnedValues, RVState>(A, RetValPos, *this,
- RVS, VisitValueCB);
- };
-
- // Callback for all "return intructions" live in the associated function.
- auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) {
- ReturnInst &Ret = cast<ReturnInst>(I);
- RVState RVS({ReturnedValues, Changed, {}});
- RVS.RetInsts.insert(&Ret);
- return VisitReturnedValue(*Ret.getReturnValue(), RVS);
- };
-
- // Start by discovering returned values from all live return instructions in
- // the associated function.
- if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}))
- return indicatePessimisticFixpoint();
-
- // Once returned values "directly" present in the code are handled we try to
- // resolve returned calls.
- decltype(ReturnedValues) NewRVsMap;
- for (auto &It : ReturnedValues) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *It.first
- << " by #" << It.second.size() << " RIs\n");
- CallBase *CB = dyn_cast<CallBase>(It.first);
- if (!CB || UnresolvedCalls.count(CB))
- continue;
-
- if (!CB->getCalledFunction()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
- << "\n");
- UnresolvedCalls.insert(CB);
- continue;
- }
-
- // TODO: use the function scope once we have call site AAReturnedValues.
- const auto &RetValAA = A.getAAFor<AAReturnedValues>(
- *this, IRPosition::function(*CB->getCalledFunction()));
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: "
- << RetValAA << "\n");
-
- // Skip dead ends; that is, if we do not know anything about the returned
- // call we mark it as unresolved and it will stay that way.
- if (!RetValAA.getState().isValidState()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
- << "\n");
- UnresolvedCalls.insert(CB);
- continue;
- }
-
- // Do not try to learn partial information. If the callee has unresolved
- // return values we will treat the call as unresolved/opaque.
- auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls();
- if (!RetValAAUnresolvedCalls.empty()) {
- UnresolvedCalls.insert(CB);
- continue;
- }
-
- // Now check if we can track transitively returned values. If possible, that
- // is, if all return values can be represented in the current scope, do so.
- bool Unresolved = false;
- for (auto &RetValAAIt : RetValAA.returned_values()) {
- Value *RetVal = RetValAAIt.first;
- if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) ||
- isa<Constant>(RetVal))
- continue;
- // Anything that did not fit in the above categories cannot be resolved,
- // mark the call as unresolved.
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value "
- "cannot be translated: "
- << *RetVal << "\n");
- UnresolvedCalls.insert(CB);
- Unresolved = true;
- break;
- }
-
- if (Unresolved)
- continue;
-
- // Now track transitively returned values.
- unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB];
- if (NumRetAA == RetValAA.getNumReturnValues()) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not "
- "changed since it was seen last\n");
- continue;
- }
- NumRetAA = RetValAA.getNumReturnValues();
-
- for (auto &RetValAAIt : RetValAA.returned_values()) {
- Value *RetVal = RetValAAIt.first;
- if (Argument *Arg = dyn_cast<Argument>(RetVal)) {
- // Arguments are mapped to call site operands and we begin the traversal
- // again.
- bool Unused = false;
- RVState RVS({NewRVsMap, Unused, RetValAAIt.second});
- VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS);
- continue;
- } else if (isa<CallBase>(RetVal)) {
- // Call sites are resolved by the callee attribute over time; no need to
- // do anything here.
- continue;
- } else if (isa<Constant>(RetVal)) {
- // Constants are valid everywhere, we can simply take them.
- NewRVsMap[RetVal].insert(It.second.begin(), It.second.end());
- continue;
- }
- }
- }
-
- // To avoid modifying the ReturnedValues map while we iterate over it, we
- // kept a record of potential new entries in a separate map, NewRVsMap.
- for (auto &It : NewRVsMap) {
- assert(!It.second.empty() && "Entry does not add anything.");
- auto &ReturnInsts = ReturnedValues[It.first];
- for (ReturnInst *RI : It.second)
- if (ReturnInsts.insert(RI)) {
- LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value "
- << *It.first << " => " << *RI << "\n");
- Changed = true;
- }
- }
-
- Changed |= (NumUnresolvedCalls != UnresolvedCalls.size());
- return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
-}
-
-struct AAReturnedValuesFunction final : public AAReturnedValuesImpl {
- AAReturnedValuesFunction(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) }
-};
-
-/// Returned values information for a call site.
-struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
- AAReturnedValuesCallSite(const IRPosition &IRP) : AAReturnedValuesImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites instead of
- // redirecting requests to the callee.
- llvm_unreachable("Abstract attributes for returned values are not "
- "supported for call sites yet!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// ------------------------ NoSync Function Attribute -------------------------
-
-struct AANoSyncImpl : AANoSync {
- AANoSyncImpl(const IRPosition &IRP) : AANoSync(IRP) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "nosync" : "may-sync";
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// Helper function used to determine whether an instruction is a non-relaxed
- /// atomic, i.e., an atomic instruction whose ordering is neither unordered
- /// nor monotonic.
- static bool isNonRelaxedAtomic(Instruction *I);
-
- /// Helper function used to determine whether an instruction is volatile.
- static bool isVolatile(Instruction *I);
-
- /// Helper function used to check if an intrinsic is nosync; currently only
- /// memcpy, memmove, and memset are handled.
- static bool isNoSyncIntrinsic(Instruction *I);
-};
-
-bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) {
- if (!I->isAtomic())
- return false;
-
- AtomicOrdering Ordering;
- switch (I->getOpcode()) {
- case Instruction::AtomicRMW:
- Ordering = cast<AtomicRMWInst>(I)->getOrdering();
- break;
- case Instruction::Store:
- Ordering = cast<StoreInst>(I)->getOrdering();
- break;
- case Instruction::Load:
- Ordering = cast<LoadInst>(I)->getOrdering();
- break;
- case Instruction::Fence: {
- auto *FI = cast<FenceInst>(I);
- if (FI->getSyncScopeID() == SyncScope::SingleThread)
- return false;
- Ordering = FI->getOrdering();
- break;
- }
- case Instruction::AtomicCmpXchg: {
- AtomicOrdering Success = cast<AtomicCmpXchgInst>(I)->getSuccessOrdering();
- AtomicOrdering Failure = cast<AtomicCmpXchgInst>(I)->getFailureOrdering();
- // Only if both orderings are relaxed can the operation be treated as
- // relaxed. Otherwise it is non-relaxed.
- if (Success != AtomicOrdering::Unordered &&
- Success != AtomicOrdering::Monotonic)
- return true;
- if (Failure != AtomicOrdering::Unordered &&
- Failure != AtomicOrdering::Monotonic)
- return true;
- return false;
- }
- default:
- llvm_unreachable(
- "New atomic operations need to be known in the attributor.");
- }
-
- // Relaxed.
- if (Ordering == AtomicOrdering::Unordered ||
- Ordering == AtomicOrdering::Monotonic)
- return false;
- return true;
-}
-
-/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics.
-/// FIXME: We should improve the handling of intrinsics.
-bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) {
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- switch (II->getIntrinsicID()) {
- /// Element-wise atomic memory intrinsics can only be unordered, and are
- /// therefore nosync.
- case Intrinsic::memset_element_unordered_atomic:
- case Intrinsic::memmove_element_unordered_atomic:
- case Intrinsic::memcpy_element_unordered_atomic:
- return true;
- case Intrinsic::memset:
- case Intrinsic::memmove:
- case Intrinsic::memcpy:
- if (!cast<MemIntrinsic>(II)->isVolatile())
- return true;
- return false;
- default:
- return false;
- }
}
- return false;
-}
-
-bool AANoSyncImpl::isVolatile(Instruction *I) {
- assert(!ImmutableCallSite(I) && !isa<CallBase>(I) &&
- "Calls should not be checked here");
-
- switch (I->getOpcode()) {
- case Instruction::AtomicRMW:
- return cast<AtomicRMWInst>(I)->isVolatile();
- case Instruction::Store:
- return cast<StoreInst>(I)->isVolatile();
- case Instruction::Load:
- return cast<LoadInst>(I)->isVolatile();
- case Instruction::AtomicCmpXchg:
- return cast<AtomicCmpXchgInst>(I)->isVolatile();
- default:
- return false;
- }
-}
-
-ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
-
- auto CheckRWInstForNoSync = [&](Instruction &I) {
- /// We are looking for volatile instructions or Non-Relaxed atomics.
- /// FIXME: We should improve the handling of intrinsics.
-
- if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I))
- return true;
-
- if (ImmutableCallSite ICS = ImmutableCallSite(&I)) {
- if (ICS.hasFnAttr(Attribute::NoSync))
- return true;
-
- const auto &NoSyncAA =
- A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(ICS));
- if (NoSyncAA.isAssumedNoSync())
- return true;
- return false;
- }
-
- if (!isVolatile(&I) && !isNonRelaxedAtomic(&I))
- return true;
+ if (CheckBBLivenessOnly)
return false;
- };
-
- auto CheckForNoSync = [&](Instruction &I) {
- // At this point we handled all read/write effects and they are all
- // nosync, so they can be skipped.
- if (I.mayReadOrWriteMemory())
- return true;
-
- // non-convergent and readnone imply nosync.
- return !ImmutableCallSite(&I).isConvergent();
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
- !A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
-}
-
-struct AANoSyncFunction final : public AANoSyncImpl {
- AANoSyncFunction(const IRPosition &IRP) : AANoSyncImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) }
-};
-
-/// NoSync attribute deduction for a call site.
-struct AANoSyncCallSite final : AANoSyncImpl {
- AANoSyncCallSite(const IRPosition &IRP) : AANoSyncImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoSyncImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(), static_cast<const AANoSync::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
-};
-
-/// ------------------------ No-Free Attributes ----------------------------
-
-struct AANoFreeImpl : public AANoFree {
- AANoFreeImpl(const IRPosition &IRP) : AANoFree(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForNoFree = [&](Instruction &I) {
- ImmutableCallSite ICS(&I);
- if (ICS.hasFnAttr(Attribute::NoFree))
- return true;
-
- const auto &NoFreeAA =
- A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(ICS));
- return NoFreeAA.isAssumedNoFree();
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "nofree" : "may-free";
- }
-};
-
-struct AANoFreeFunction final : public AANoFreeImpl {
- AANoFreeFunction(const IRPosition &IRP) : AANoFreeImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) }
-};
-
-/// NoFree attribute deduction for a call site.
-struct AANoFreeCallSite final : AANoFreeImpl {
- AANoFreeCallSite(const IRPosition &IRP) : AANoFreeImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoFreeImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(), static_cast<const AANoFree::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); }
-};
-
-/// NoFree attribute for floating values.
-struct AANoFreeFloating : AANoFreeImpl {
- AANoFreeFloating(const IRPosition &IRP) : AANoFreeImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)}
-
- /// See Abstract Attribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- const IRPosition &IRP = getIRPosition();
-
- const auto &NoFreeAA =
- A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP));
- if (NoFreeAA.isAssumedNoFree())
- return ChangeStatus::UNCHANGED;
-
- Value &AssociatedValue = getIRPosition().getAssociatedValue();
- auto Pred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
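- // Uses in operand bundles are not handled and make us give up; a use as
- // the callee operand cannot free the value.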
- if (CB->isBundleOperand(&U))
- return false;
- if (!CB->isArgOperand(&U))
- return true;
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoFreeArg = A.getAAFor<AANoFree>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
- return NoFreeArg.isAssumedNoFree();
- }
-
- if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
- isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
- Follow = true;
- return true;
- }
-
- // Unknown user.
- return false;
- };
- if (!A.checkForAllUses(Pred, *this, AssociatedValue))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-};
-
-/// NoFree attribute for an argument.
-struct AANoFreeArgument final : AANoFreeFloating {
- AANoFreeArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) }
-};
-
-/// NoFree attribute for call site arguments.
-struct AANoFreeCallSiteArgument final : AANoFreeFloating {
- AANoFreeCallSiteArgument(const IRPosition &IRP) : AANoFreeFloating(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
- return clampStateAndIndicateChange(
- getState(), static_cast<const AANoFree::StateType &>(ArgAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)};
-};
-
-/// NoFree attribute for function return value.
-struct AANoFreeReturned final : AANoFreeFloating {
- AANoFreeReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("NoFree is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// NoFree attribute deduction for a call site return value.
-struct AANoFreeCallSiteReturned final : AANoFreeFloating {
- AANoFreeCallSiteReturned(const IRPosition &IRP) : AANoFreeFloating(IRP) {}
-
- ChangeStatus manifest(Attributor &A) override {
- return ChangeStatus::UNCHANGED;
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
-};
-
-/// ------------------------ NonNull Argument Attribute ------------------------
-static int64_t getKnownNonNullAndDerefBytesForUse(
- Attributor &A, AbstractAttribute &QueryingAA, Value &AssociatedValue,
- const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
- TrackUse = false;
-
- const Value *UseV = U->get();
- if (!UseV->getType()->isPointerTy())
- return 0;
-
- Type *PtrTy = UseV->getType();
- const Function *F = I->getFunction();
- bool NullPointerIsDefined =
- F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true;
- const DataLayout &DL = A.getInfoCache().getDL();
- if (ImmutableCallSite ICS = ImmutableCallSite(I)) {
- if (ICS.isBundleOperand(U))
- return 0;
-
- if (ICS.isCallee(U)) {
- IsNonNull |= !NullPointerIsDefined;
- return 0;
- }
-
- unsigned ArgNo = ICS.getArgumentNo(U);
- IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo);
- // As long as we only use known information there is no need to track
- // dependences here.
- auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP,
- /* TrackDependence */ false);
- IsNonNull |= DerefAA.isKnownNonNull();
- return DerefAA.getKnownDereferenceableBytes();
- }
-
- // We need to follow common pointer manipulation uses to the accesses they
- // feed into. We can try to be smart to avoid looking through things we do not
- // like for now, e.g., non-inbounds GEPs.
- if (isa<CastInst>(I)) {
- TrackUse = true;
- return 0;
- }
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I))
- if (GEP->hasAllConstantIndices()) {
- TrackUse = true;
- return 0;
- }
-
- int64_t Offset;
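- // If the user is a memory access whose base pointer is the associated
- // value, derive known dereferenceable bytes from the accessed type size
- // and the constant offset.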
- if (const Value *Base = getBasePointerOfAccessPointerOperand(I, Offset, DL)) {
- if (Base == &AssociatedValue &&
- Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
-
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
- }
-
- /// Corner case when an offset is 0.
- if (const Value *Base = getBasePointerOfAccessPointerOperand(
- I, Offset, DL, /*AllowNonInbounds*/ true)) {
- if (Offset == 0 && Base == &AssociatedValue &&
- Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- int64_t DerefBytes =
- (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
- IsNonNull |= !NullPointerIsDefined;
- return std::max(int64_t(0), DerefBytes);
- }
- }
-
- return 0;
-}
-
-struct AANonNullImpl : AANonNull {
- AANonNullImpl(const IRPosition &IRP)
- : AANonNull(IRP),
- NullIsDefined(NullPointerIsDefined(
- getAnchorScope(),
- getAssociatedValue().getType()->getPointerAddressSpace())) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (!NullIsDefined &&
- hasAttr({Attribute::NonNull, Attribute::Dereferenceable}))
- indicateOptimisticFixpoint();
- else if (isa<ConstantPointerNull>(getAssociatedValue()))
- indicatePessimisticFixpoint();
- else
- AANonNull::initialize(A);
- }
-
- /// See AAFromMustBeExecutedContext
- bool followUse(Attributor &A, const Use *U, const Instruction *I) {
- bool IsNonNull = false;
- bool TrackUse = false;
- getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I,
- IsNonNull, TrackUse);
- setKnown(IsNonNull);
- return TrackUse;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "nonnull" : "may-null";
- }
-
- /// Flag to determine if the underlying value can be null and still allow
- /// valid accesses.
- const bool NullIsDefined;
-};
-
-/// NonNull attribute for a floating value.
-struct AANonNullFloating
- : AAFromMustBeExecutedContext<AANonNull, AANonNullImpl> {
- using Base = AAFromMustBeExecutedContext<AANonNull, AANonNullImpl>;
- AANonNullFloating(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Change = Base::updateImpl(A);
- if (isKnownNonNull())
- return Change;
-
- if (!NullIsDefined) {
- const auto &DerefAA =
- A.getAAFor<AADereferenceable>(*this, getIRPosition());
- if (DerefAA.getAssumedDereferenceableBytes())
- return Change;
- }
-
- const DataLayout &DL = A.getDataLayout();
-
- DominatorTree *DT = nullptr;
- InformationCache &InfoCache = A.getInfoCache();
- if (const Function *Fn = getAnchorScope())
- DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn);
-
- auto VisitValueCB = [&](Value &V, AANonNull::StateType &T,
- bool Stripped) -> bool {
- const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
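- // The value could not be stripped further and the queried AA is this one,
- // so fall back to querying the IR via isKnownNonZero.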
- if (!isKnownNonZero(&V, DL, 0, /* TODO: AC */ nullptr, getCtxI(), DT))
- T.indicatePessimisticFixpoint();
- } else {
- // Use abstract attribute information.
- const AANonNull::StateType &NS =
- static_cast<const AANonNull::StateType &>(AA.getState());
- T ^= NS;
- }
- return T.isValidState();
- };
-
- StateType T;
- if (!genericValueTraversal<AANonNull, StateType>(A, getIRPosition(), *this,
- T, VisitValueCB))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
-};
-
-/// NonNull attribute for function return value.
-struct AANonNullReturned final
- : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl> {
- AANonNullReturned(const IRPosition &IRP)
- : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl>(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
-};
-
-/// NonNull attribute for function argument.
-struct AANonNullArgument final
- : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AANonNull,
- AANonNullImpl> {
- AANonNullArgument(const IRPosition &IRP)
- : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AANonNull,
- AANonNullImpl>(
- IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) }
-};
-
-struct AANonNullCallSiteArgument final : AANonNullFloating {
- AANonNullCallSiteArgument(const IRPosition &IRP) : AANonNullFloating(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) }
-};
-
-/// NonNull attribute for a call site return position.
-struct AANonNullCallSiteReturned final
- : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AANonNull,
- AANonNullImpl> {
- AANonNullCallSiteReturned(const IRPosition &IRP)
- : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AANonNull,
- AANonNullImpl>(
- IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
-};
-
-/// ------------------------ No-Recurse Attributes ----------------------------
-
-struct AANoRecurseImpl : public AANoRecurse {
- AANoRecurseImpl(const IRPosition &IRP) : AANoRecurse(IRP) {}
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "norecurse" : "may-recurse";
- }
-};
-
-struct AANoRecurseFunction final : AANoRecurseImpl {
- AANoRecurseFunction(const IRPosition &IRP) : AANoRecurseImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoRecurseImpl::initialize(A);
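- // Functions in a non-trivial SCC take part in (mutual) recursion; give up
- // on those right away.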
- if (const Function *F = getAnchorScope())
- if (A.getInfoCache().getSccSize(*F) == 1)
- return;
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
-
- auto CheckForNoRecurse = [&](Instruction &I) {
- ImmutableCallSite ICS(&I);
- if (ICS.hasFnAttr(Attribute::NoRecurse))
- return true;
-
- const auto &NoRecurseAA =
- A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(ICS));
- if (!NoRecurseAA.isAssumedNoRecurse())
- return false;
-
- // Recursion to the same function
- if (ICS.getCalledFunction() == getAnchorScope())
- return false;
-
- return true;
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) }
-};
-
-/// NoRecurse attribute deduction for a call site.
-struct AANoRecurseCallSite final : AANoRecurseImpl {
- AANoRecurseCallSite(const IRPosition &IRP) : AANoRecurseImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoRecurseImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AANoRecurse::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
-};
-
-/// -------------------- Undefined-Behavior Attributes ------------------------
-
-struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
- AAUndefinedBehaviorImpl(const IRPosition &IRP) : AAUndefinedBehavior(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...). Checks for UB caused by memory
- /// accesses through a pointer as well as by other instructions (e.g.,
- /// conditional branches on undef conditions).
- ChangeStatus updateImpl(Attributor &A) override {
- const size_t UBPrevSize = KnownUBInsts.size();
- const size_t NoUBPrevSize = AssumedNoUBInsts.size();
-
- auto InspectMemAccessInstForUB = [&](Instruction &I) {
- // Skip instructions that are already saved.
- if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
- return true;
-
- // If we reach here, we know we have an instruction
- // that accesses memory through a pointer operand,
- // for which getPointerOperand() should give it to us.
- const Value *PtrOp =
- Attributor::getPointerOperand(&I, /* AllowVolatile */ true);
- assert(PtrOp &&
- "Expected pointer operand of memory accessing instruction");
-
- // A memory access through a pointer is considered UB
- // only if the pointer is a constant null value.
- // TODO: Expand it to not only check constant values.
- if (!isa<ConstantPointerNull>(PtrOp)) {
- AssumedNoUBInsts.insert(&I);
- return true;
- }
- const Type *PtrTy = PtrOp->getType();
-
- // Because we only consider instructions inside functions,
- // assume that a parent function exists.
- const Function *F = I.getFunction();
-
- // A memory access using a constant null pointer is only considered UB
- // if the null pointer is _not_ defined for the target platform.
- if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()))
- AssumedNoUBInsts.insert(&I);
- else
- KnownUBInsts.insert(&I);
- return true;
- };
-
- auto InspectBrInstForUB = [&](Instruction &I) {
- // A conditional branch instruction is considered UB if it has `undef`
- // condition.
-
- // Skip instructions that are already saved.
- if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
- return true;
-
- // We know we have a branch instruction.
- auto BrInst = cast<BranchInst>(&I);
-
- // Unconditional branches are never considered UB.
- if (BrInst->isUnconditional())
- return true;
-
- // Either we stopped and the appropriate action was taken,
- // or we got back a simplified value to continue.
- Optional<Value *> SimplifiedCond =
- stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst);
- if (!SimplifiedCond.hasValue())
- return true;
- AssumedNoUBInsts.insert(&I);
- return true;
- };
-
- A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
- {Instruction::Load, Instruction::Store,
- Instruction::AtomicCmpXchg,
- Instruction::AtomicRMW});
- A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br});
- if (NoUBPrevSize != AssumedNoUBInsts.size() ||
- UBPrevSize != KnownUBInsts.size())
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- bool isKnownToCauseUB(Instruction *I) const override {
- return KnownUBInsts.count(I);
- }
-
- bool isAssumedToCauseUB(Instruction *I) const override {
- // In simple words, if an instruction is not in the set of instructions
- // assumed to _not_ cause UB, then it is assumed to cause UB (that
- // includes those in the KnownUBInsts set). The rest is boilerplate to
- // ensure that it is one of the instructions we test for UB.
-
- switch (I->getOpcode()) {
- case Instruction::Load:
- case Instruction::Store:
- case Instruction::AtomicCmpXchg:
- case Instruction::AtomicRMW:
- return !AssumedNoUBInsts.count(I);
- case Instruction::Br: {
- auto BrInst = cast<BranchInst>(I);
- if (BrInst->isUnconditional())
- return false;
- return !AssumedNoUBInsts.count(I);
- } break;
- default:
- return false;
- }
+ const AAIsDead &IsDeadAA = getOrCreateAAFor<AAIsDead>(
+ IRPosition::value(I), QueryingAA, /* TrackDependence */ false);
+ // Don't check liveness for AAIsDead.
+ if (QueryingAA == &IsDeadAA)
return false;
- }
- ChangeStatus manifest(Attributor &A) override {
- if (KnownUBInsts.empty())
- return ChangeStatus::UNCHANGED;
- for (Instruction *I : KnownUBInsts)
- A.changeToUnreachableAfterManifest(I);
- return ChangeStatus::CHANGED;
- }
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "undefined-behavior" : "no-ub";
- }
-
- /// Note: The correctness of this analysis depends on the fact that the
- /// following 2 sets will stop changing after some point.
- /// "Change" here means that their size changes.
- /// The size of each set is monotonically increasing
- /// (we only add items to them) and it is upper bounded by the number of
- /// instructions in the processed function (we can never save more
- /// elements in either set than this number). Hence, at some point,
- /// they will stop increasing.
- /// Consequently, at some point, both sets will have stopped
- /// changing, effectively making the analysis reach a fixpoint.
-
- /// Note: These 2 sets are disjoint and an instruction can be considered
- /// one of 3 things:
- /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in
- /// the KnownUBInsts set.
- /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior
- /// has a reason to assume it).
- /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior
- /// could not find a reason to assume or prove that it can cause UB,
- /// hence it assumes it doesn't. We have a set for these instructions
- /// so that we don't reprocess them in every update.
- /// Note however that instructions in this set may cause UB.
-
-protected:
- /// A set of all live instructions _known_ to cause UB.
- SmallPtrSet<Instruction *, 8> KnownUBInsts;
-
-private:
- /// A set of all the (live) instructions that are assumed to _not_ cause UB.
- SmallPtrSet<Instruction *, 8> AssumedNoUBInsts;
-
- // Should be called during updates: if we're processing an instruction
- // \p I that depends on a value \p V, one of the following has to happen:
- // - If the value is assumed, then stop.
- // - If the value is known but undef, then consider it UB.
- // - Otherwise, do specific processing with the simplified value.
- // We return None in the first 2 cases to signify that an appropriate
- // action was taken and the caller should stop.
- // Otherwise, we return the simplified value that the caller should
- // use for specific processing.
- Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V,
- Instruction *I) {
- const auto &ValueSimplifyAA =
- A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V));
- Optional<Value *> SimplifiedV =
- ValueSimplifyAA.getAssumedSimplifiedValue(A);
- if (!ValueSimplifyAA.isKnown()) {
- // Don't depend on assumed values.
- return llvm::None;
- }
- if (!SimplifiedV.hasValue()) {
- // If it is known (which we tested above) but it doesn't have a value,
- // then we can assume `undef` and hence the instruction is UB.
- KnownUBInsts.insert(I);
- return llvm::None;
- }
- Value *Val = SimplifiedV.getValue();
- if (isa<UndefValue>(Val)) {
- KnownUBInsts.insert(I);
- return llvm::None;
- }
- return Val;
- }
-};
-
-struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl {
- AAUndefinedBehaviorFunction(const IRPosition &IRP)
- : AAUndefinedBehaviorImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECL(UndefinedBehaviorInstruction, Instruction,
- "Number of instructions known to have UB");
- BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) +=
- KnownUBInsts.size();
+ if (IsDeadAA.isAssumedDead()) {
+ if (QueryingAA)
+ recordDependence(IsDeadAA, *QueryingAA, DepClass);
+ return true;
}
-};
-/// ------------------------ Will-Return Attributes ----------------------------
-
-// Helper function that checks whether a function has any cycle.
-// TODO: Replace with more efficient code
-static bool containsCycle(Function &F) {
- SmallPtrSet<BasicBlock *, 32> Visited;
-
- // Traverse BB by dfs and check whether successor is already visited.
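- // Note that this over-approximates: an edge to any already visited block,
- // including forward edges in an acyclic CFG, is reported as a cycle.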
- for (BasicBlock *BB : depth_first(&F)) {
- Visited.insert(BB);
- for (auto *SuccBB : successors(BB)) {
- if (Visited.count(SuccBB))
- return true;
- }
- }
return false;
}
-// Helper function that checks whether the function has a loop which might
-// become an endless loop.
-// FIXME: Any cycle is regarded as endless loop for now.
-// We have to allow some patterns.
-static bool containsPossiblyEndlessLoop(Function *F) {
- return !F || !F->hasExactDefinition() || containsCycle(*F);
-}
-
-struct AAWillReturnImpl : public AAWillReturn {
- AAWillReturnImpl(const IRPosition &IRP) : AAWillReturn(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAWillReturn::initialize(A);
-
- Function *F = getAssociatedFunction();
- if (containsPossiblyEndlessLoop(F))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForWillReturn = [&](Instruction &I) {
- IRPosition IPos = IRPosition::callsite_function(ImmutableCallSite(&I));
- const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos);
- if (WillReturnAA.isKnownWillReturn())
- return true;
- if (!WillReturnAA.isAssumedWillReturn())
- return false;
- const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos);
- return NoRecurseAA.isAssumedNoRecurse();
- };
-
- if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::getAsStr()
- const std::string getAsStr() const override {
- return getAssumed() ? "willreturn" : "may-noreturn";
- }
-};
-
-struct AAWillReturnFunction final : AAWillReturnImpl {
- AAWillReturnFunction(const IRPosition &IRP) : AAWillReturnImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) }
-};
-
-/// WillReturn attribute deduction for a call site.
-struct AAWillReturnCallSite final : AAWillReturnImpl {
- AAWillReturnCallSite(const IRPosition &IRP) : AAWillReturnImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAWillReturnImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AAWillReturn::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
-};
-
-/// -------------------AAReachability Attribute--------------------------
-
-struct AAReachabilityImpl : AAReachability {
- AAReachabilityImpl(const IRPosition &IRP) : AAReachability(IRP) {}
-
- const std::string getAsStr() const override {
- // TODO: Return the number of reachable queries.
- return "reachable";
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override { indicatePessimisticFixpoint(); }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-};
-
-struct AAReachabilityFunction final : public AAReachabilityImpl {
- AAReachabilityFunction(const IRPosition &IRP) : AAReachabilityImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
-};
-
-/// ------------------------ NoAlias Argument Attribute ------------------------
-
-struct AANoAliasImpl : AANoAlias {
- AANoAliasImpl(const IRPosition &IRP) : AANoAlias(IRP) {}
-
- const std::string getAsStr() const override {
- return getAssumed() ? "noalias" : "may-alias";
- }
-};
-
-/// NoAlias attribute for a floating value.
-struct AANoAliasFloating final : AANoAliasImpl {
- AANoAliasFloating(const IRPosition &IRP) : AANoAliasImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoAliasImpl::initialize(A);
- Value &Val = getAssociatedValue();
- if (isa<AllocaInst>(Val))
- indicateOptimisticFixpoint();
- if (isa<ConstantPointerNull>(Val) &&
- Val.getType()->getPointerAddressSpace() == 0)
- indicateOptimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Implement this.
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(noalias)
- }
-};
-
-/// NoAlias attribute for an argument.
-struct AANoAliasArgument final
- : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> {
- using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>;
- AANoAliasArgument(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::update(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // We have to make sure no-alias on the argument does not break
- // synchronization when this is a callback argument, see also [1] below.
- // If synchronization cannot be affected, we delegate to the base updateImpl
- // function, otherwise we give up for now.
-
- // If the function is no-sync, no-alias cannot break synchronization.
- const auto &NoSyncAA = A.getAAFor<AANoSync>(
- *this, IRPosition::function_scope(getIRPosition()));
- if (NoSyncAA.isAssumedNoSync())
- return Base::updateImpl(A);
-
- // If the argument is read-only, no-alias cannot break synchronization.
- const auto &MemBehaviorAA =
- A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
- if (MemBehaviorAA.isAssumedReadOnly())
- return Base::updateImpl(A);
-
- // If the argument is never passed through callbacks, no-alias cannot break
- // synchronization.
- if (A.checkForAllCallSites(
- [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this,
- true))
- return Base::updateImpl(A);
-
- // TODO: add no-alias but make sure it doesn't break synchronization by
- // introducing fake uses. See:
- // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel,
- // International Workshop on OpenMP 2018,
- // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf
-
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) }
-};
-
-struct AANoAliasCallSiteArgument final : AANoAliasImpl {
- AANoAliasCallSiteArgument(const IRPosition &IRP) : AANoAliasImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // See callsite argument attribute and callee argument attribute.
- ImmutableCallSite ICS(&getAnchorValue());
- if (ICS.paramHasAttr(getArgNo(), Attribute::NoAlias))
- indicateOptimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // We can deduce "noalias" if the following conditions hold.
- // (i) Associated value is assumed to be noalias in the definition.
- // (ii) Associated value is assumed to be no-capture in all the uses
- // possibly executed before this callsite.
- // (iii) There is no other pointer argument which could alias with the
- // value.
-
- const Value &V = getAssociatedValue();
- const IRPosition IRP = IRPosition::value(V);
-
- // (i) Check whether noalias holds in the definition.
-
- auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP);
- LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] check definition: " << V
- << " :: " << NoAliasAA << "\n");
-
- if (!NoAliasAA.isAssumedNoAlias())
- return indicatePessimisticFixpoint();
-
- LLVM_DEBUG(dbgs() << "[Attributor][AANoAliasCSArg] " << V
- << " is assumed NoAlias in the definition\n");
-
- // (ii) Check whether the value is captured in the scope using AANoCapture.
- // FIXME: This is conservative; it would be better to look at the CFG and
- // check only uses possibly executed before this callsite.
-
- auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP);
- if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- LLVM_DEBUG(
- dbgs() << "[Attributor][AANoAliasCSArg] " << V
- << " cannot be noalias as it is potentially captured\n");
- return indicatePessimisticFixpoint();
- }
-
- // (iii) Check there is no other pointer argument which could alias with the
- // value.
- // TODO: AbstractCallSite
- ImmutableCallSite ICS(&getAnchorValue());
- for (unsigned i = 0; i < ICS.getNumArgOperands(); i++) {
- if (getArgNo() == (int)i)
- continue;
- const Value *ArgOp = ICS.getArgOperand(i);
- if (!ArgOp->getType()->isPointerTy())
- continue;
-
- if (const Function *F = getAnchorScope()) {
- if (AAResults *AAR = A.getInfoCache().getAAResultsForFunction(*F)) {
- bool IsAliasing = !AAR->isNoAlias(&getAssociatedValue(), ArgOp);
- LLVM_DEBUG(dbgs()
- << "[Attributor][NoAliasCSArg] Check alias between "
- "callsite arguments "
- << AAR->isNoAlias(&getAssociatedValue(), ArgOp) << " "
- << getAssociatedValue() << " " << *ArgOp << " => "
- << (IsAliasing ? "" : "no-") << "alias \n");
-
- if (!IsAliasing)
- continue;
- }
- }
- return indicatePessimisticFixpoint();
- }
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) }
-};
-
-/// NoAlias attribute for function return value.
-struct AANoAliasReturned final : AANoAliasImpl {
- AANoAliasReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- virtual ChangeStatus updateImpl(Attributor &A) override {
-
- auto CheckReturnValue = [&](Value &RV) -> bool {
- if (Constant *C = dyn_cast<Constant>(&RV))
- if (C->isNullValue() || isa<UndefValue>(C))
- return true;
-
- /// For now, we can only deduce noalias if the returned value is a call
- /// site. FIXME: add more support.
- ImmutableCallSite ICS(&RV);
- if (!ICS)
- return false;
-
- const IRPosition &RVPos = IRPosition::value(RV);
- const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos);
- if (!NoAliasAA.isAssumedNoAlias())
- return false;
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos);
- return NoCaptureAA.isAssumedNoCaptureMaybeReturned();
- };
-
- if (!A.checkForAllReturnedValues(CheckReturnValue, *this))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) }
-};
-
-/// NoAlias attribute deduction for a call site return value.
-struct AANoAliasCallSiteReturned final : AANoAliasImpl {
- AANoAliasCallSiteReturned(const IRPosition &IRP) : AANoAliasImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoAliasImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::returned(*F);
- auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(), static_cast<const AANoAlias::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
-};
-
-/// -------------------AAIsDead Function Attribute-----------------------
-
-struct AAIsDeadValueImpl : public AAIsDead {
- AAIsDeadValueImpl(const IRPosition &IRP) : AAIsDead(IRP) {}
-
- /// See AAIsDead::isAssumedDead().
- bool isAssumedDead() const override { return getAssumed(); }
-
- /// See AAIsDead::isAssumedDead(BasicBlock *).
- bool isAssumedDead(const BasicBlock *BB) const override { return false; }
-
- /// See AAIsDead::isKnownDead(BasicBlock *).
- bool isKnownDead(const BasicBlock *BB) const override { return false; }
-
- /// See AAIsDead::isAssumedDead(Instruction *I).
- bool isAssumedDead(const Instruction *I) const override {
- return I == getCtxI() && isAssumedDead();
- }
-
- /// See AAIsDead::isKnownDead(Instruction *I).
- bool isKnownDead(const Instruction *I) const override {
- return I == getCtxI() && getKnown();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return isAssumedDead() ? "assumed-dead" : "assumed-live";
- }
-};
-
-struct AAIsDeadFloating : public AAIsDeadValueImpl {
- AAIsDeadFloating(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (Instruction *I = dyn_cast<Instruction>(&getAssociatedValue()))
- if (!wouldInstructionBeTriviallyDead(I))
- indicatePessimisticFixpoint();
- if (isa<UndefValue>(getAssociatedValue()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- auto UsePred = [&](const Use &U, bool &Follow) {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (CallSite CS = CallSite(UserI)) {
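- // A use as a call site argument is dead if the corresponding call site
- // argument is assumed dead; any other call site use prevents us from
- // assuming the value is dead.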
- if (!CS.isArgOperand(&U))
- return false;
- const IRPosition &CSArgPos =
- IRPosition::callsite_argument(CS, CS.getArgumentNo(&U));
- const auto &CSArgIsDead = A.getAAFor<AAIsDead>(*this, CSArgPos);
- return CSArgIsDead.isAssumedDead();
- }
- if (ReturnInst *RI = dyn_cast<ReturnInst>(UserI)) {
- const IRPosition &RetPos = IRPosition::returned(*RI->getFunction());
- const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, RetPos);
- return RetIsDeadAA.isAssumedDead();
- }
- Follow = true;
- return wouldInstructionBeTriviallyDead(UserI);
- };
-
- if (!A.checkForAllUses(UsePred, *this, getAssociatedValue()))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- Value &V = getAssociatedValue();
- if (auto *I = dyn_cast<Instruction>(&V))
- if (wouldInstructionBeTriviallyDead(I)) {
- A.deleteAfterManifest(*I);
- return ChangeStatus::CHANGED;
- }
-
- if (V.use_empty())
- return ChangeStatus::UNCHANGED;
-
- UndefValue &UV = *UndefValue::get(V.getType());
- bool AnyChange = A.changeValueAfterManifest(V, UV);
- return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(IsDead)
- }
-};
-
-struct AAIsDeadArgument : public AAIsDeadFloating {
- AAIsDeadArgument(const IRPosition &IRP) : AAIsDeadFloating(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (!getAssociatedFunction()->hasExactDefinition())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = AAIsDeadFloating::manifest(A);
- Argument &Arg = *getAssociatedArgument();
- if (Arg.getParent()->hasLocalLinkage())
- if (A.registerFunctionSignatureRewrite(
- Arg, /* ReplacementTypes */ {},
- Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{},
- Attributor::ArgumentReplacementInfo::ACSRepairCBTy{}))
- return ChangeStatus::CHANGED;
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) }
-};
-
-struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
- AAIsDeadCallSiteArgument(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (isa<UndefValue>(getAssociatedValue()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
- return clampStateAndIndicateChange(
- getState(), static_cast<const AAIsDead::StateType &>(ArgAA.getState()));
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- CallBase &CB = cast<CallBase>(getAnchorValue());
- Use &U = CB.getArgOperandUse(getArgNo());
- assert(!isa<UndefValue>(U.get()) &&
- "Expected undef values to be filtered out!");
- UndefValue &UV = *UndefValue::get(U->getType());
- if (A.changeUseAfterManifest(U, UV))
- return ChangeStatus::CHANGED;
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) }
-};
-
-struct AAIsDeadReturned : public AAIsDeadValueImpl {
- AAIsDeadReturned(const IRPosition &IRP) : AAIsDeadValueImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
-
- auto PredForCallSite = [&](AbstractCallSite ACS) {
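- // The return value is only dead if it is dead at every known call site;
- // callback call sites are not handled and cause a bail out.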
- if (ACS.isCallbackCall())
- return false;
- const IRPosition &CSRetPos =
- IRPosition::callsite_returned(ACS.getCallSite());
- const auto &RetIsDeadAA = A.getAAFor<AAIsDead>(*this, CSRetPos);
- return RetIsDeadAA.isAssumedDead();
- };
-
- if (!A.checkForAllCallSites(PredForCallSite, *this, true))
- return indicatePessimisticFixpoint();
-
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // TODO: Rewrite the signature to return void?
- bool AnyChange = false;
- UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType());
- auto RetInstPred = [&](Instruction &I) {
- ReturnInst &RI = cast<ReturnInst>(I);
- if (!isa<UndefValue>(RI.getReturnValue()))
- AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
- return true;
- };
- A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
- return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) }
-};
-
-struct AAIsDeadCallSiteReturned : public AAIsDeadFloating {
- AAIsDeadCallSiteReturned(const IRPosition &IRP) : AAIsDeadFloating(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(IsDead) }
-};
-
-struct AAIsDeadFunction : public AAIsDead {
- AAIsDeadFunction(const IRPosition &IRP) : AAIsDead(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- const Function *F = getAssociatedFunction();
- if (F && !F->isDeclaration()) {
- ToBeExploredFrom.insert(&F->getEntryBlock().front());
- assumeLive(A, F->getEntryBlock());
- }
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" +
- std::to_string(getAssociatedFunction()->size()) + "][#TBEP " +
- std::to_string(ToBeExploredFrom.size()) + "][#KDE " +
- std::to_string(KnownDeadEnds.size()) + "]";
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- assert(getState().isValidState() &&
- "Attempted to manifest an invalid state!");
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function &F = *getAssociatedFunction();
-
- if (AssumedLiveBlocks.empty()) {
- A.deleteAfterManifest(F);
- return ChangeStatus::CHANGED;
- }
-
- // Flag to determine if we can change an invoke to a call assuming the
- // callee is nounwind. This is not possible if the personality of the
- // function allows catching asynchronous exceptions.
- bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F);
-
- KnownDeadEnds.set_union(ToBeExploredFrom);
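- // Treat the still-to-be-explored points as known dead ends for the
- // purpose of manifestation.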
- for (const Instruction *DeadEndI : KnownDeadEnds) {
- auto *CB = dyn_cast<CallBase>(DeadEndI);
- if (!CB)
- continue;
- const auto &NoReturnAA =
- A.getAAFor<AANoReturn>(*this, IRPosition::callsite_function(*CB));
- bool MayReturn = !NoReturnAA.isAssumedNoReturn();
- if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB)))
- continue;
-
- if (auto *II = dyn_cast<InvokeInst>(DeadEndI))
- A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II));
- else
- A.changeToUnreachableAfterManifest(
- const_cast<Instruction *>(DeadEndI->getNextNode()));
- HasChanged = ChangeStatus::CHANGED;
- }
-
- for (BasicBlock &BB : F)
- if (!AssumedLiveBlocks.count(&BB))
- A.deleteAfterManifest(BB);
-
- return HasChanged;
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-
- /// Returns true if the function is assumed dead.
- bool isAssumedDead() const override { return false; }
-
- /// See AAIsDead::isAssumedDead(BasicBlock *).
- bool isAssumedDead(const BasicBlock *BB) const override {
- assert(BB->getParent() == getAssociatedFunction() &&
- "BB must be in the same anchor scope function.");
-
- if (!getAssumed())
- return false;
- return !AssumedLiveBlocks.count(BB);
- }
-
- /// See AAIsDead::isKnownDead(BasicBlock *).
- bool isKnownDead(const BasicBlock *BB) const override {
- return getKnown() && isAssumedDead(BB);
- }
-
- /// See AAIsDead::isAssumed(Instruction *I).
- bool isAssumedDead(const Instruction *I) const override {
- assert(I->getParent()->getParent() == getAssociatedFunction() &&
- "Instruction must be in the same anchor scope function.");
-
- if (!getAssumed())
- return false;
-
- // If it is not in AssumedLiveBlocks then it is for sure dead.
- // Otherwise, it can still be dead if it comes after a noreturn call in a live block.
- if (!AssumedLiveBlocks.count(I->getParent()))
- return true;
-
- // If it is not after a liveness barrier it is live.
- const Instruction *PrevI = I->getPrevNode();
- while (PrevI) {
- if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI))
- return true;
- PrevI = PrevI->getPrevNode();
- }
- return false;
- }
-
- /// See AAIsDead::isKnownDead(Instruction *I).
- bool isKnownDead(const Instruction *I) const override {
- return getKnown() && isAssumedDead(I);
- }
-
- /// Determine if \p F might catch asynchronous exceptions.
- static bool mayCatchAsynchronousExceptions(const Function &F) {
- return F.hasPersonalityFn() && !canSimplifyInvokeNoUnwind(&F);
- }
-
- /// Assume \p BB is (partially) live now and indicate to the Attributor \p A
- /// that internal functions called from \p BB should now be looked at.
- bool assumeLive(Attributor &A, const BasicBlock &BB) {
- if (!AssumedLiveBlocks.insert(&BB).second)
- return false;
-
- // We assume that all of BB is (probably) live now and if there are calls to
- // internal functions we will assume that those are now live as well. This
- // is a performance optimization for blocks with calls to a lot of internal
- // functions. It can however cause dead functions to be treated as live.
- for (const Instruction &I : BB)
- if (ImmutableCallSite ICS = ImmutableCallSite(&I))
- if (const Function *F = ICS.getCalledFunction())
- if (F->hasLocalLinkage())
- A.markLiveInternalFunction(*F);
+bool Attributor::isAssumedDead(const IRPosition &IRP,
+ const AbstractAttribute *QueryingAA,
+ const AAIsDead *FnLivenessAA,
+ bool CheckBBLivenessOnly, DepClassTy DepClass) {
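+ // If the context instruction of the position is already assumed dead (only
+ // basic block liveness is needed for that), the position is dead as well.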
+ Instruction *CtxI = IRP.getCtxI();
+ if (CtxI &&
+ isAssumedDead(*CtxI, QueryingAA, FnLivenessAA,
+ /* CheckBBLivenessOnly */ true,
+ CheckBBLivenessOnly ? DepClass : DepClassTy::OPTIONAL))
return true;
- }
- /// Collection of instructions that need to be explored again, e.g., because
- /// we assumed they do not transfer control to (one of) their successors.
- SmallSetVector<const Instruction *, 8> ToBeExploredFrom;
-
- /// Collection of instructions that are known to not transfer control.
- SmallSetVector<const Instruction *, 8> KnownDeadEnds;
-
- /// Collection of all assumed live BasicBlocks.
- DenseSet<const BasicBlock *> AssumedLiveBlocks;
-};
+ if (CheckBBLivenessOnly)
+ return false;
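-/// Identify the successor instructions of the call \p CB that are assumed
-/// live. The return value indicates whether assumed (not yet known)
-/// information was used, i.e., whether the result might change later.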
-static bool
-identifyAliveSuccessors(Attributor &A, const CallBase &CB,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- const IRPosition &IPos = IRPosition::callsite_function(CB);
-
- const auto &NoReturnAA = A.getAAFor<AANoReturn>(AA, IPos);
- if (NoReturnAA.isAssumedNoReturn())
- return !NoReturnAA.isKnownNoReturn();
- if (CB.isTerminator())
- AliveSuccessors.push_back(&CB.getSuccessor(0)->front());
+ // If we haven't succeeded we query the specific liveness info for the IRP.
+ const AAIsDead *IsDeadAA;
+ if (IRP.getPositionKind() == IRPosition::IRP_CALL_SITE)
+ IsDeadAA = &getOrCreateAAFor<AAIsDead>(
+ IRPosition::callsite_returned(cast<CallBase>(IRP.getAssociatedValue())),
+ QueryingAA, /* TrackDependence */ false);
else
- AliveSuccessors.push_back(CB.getNextNode());
- return false;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation =
- identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors);
-
- // First, determine if we can change an invoke to a call assuming the
- // callee is nounwind. This is not possible if the personality of the
- // function allows catching asynchronous exceptions.
- if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) {
- AliveSuccessors.push_back(&II.getUnwindDest()->front());
- } else {
- const IRPosition &IPos = IRPosition::callsite_function(II);
- const auto &AANoUnw = A.getAAFor<AANoUnwind>(AA, IPos);
- if (AANoUnw.isAssumedNoUnwind()) {
- UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind();
- } else {
- AliveSuccessors.push_back(&II.getUnwindDest()->front());
- }
- }
- return UsedAssumedInformation;
-}
-
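-/// Try to simplify \p V with the help of the in-flight AAValueSimplify
-/// information. Returns llvm::None if there is no (non-undef) simplified
-/// value yet, and a null pointer if the value does not simplify to a
-/// ConstantInt. \p UsedAssumedInformation is set if assumed information
-/// was used.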
-static Optional<ConstantInt *>
-getAssumedConstant(Attributor &A, const Value &V, AbstractAttribute &AA,
- bool &UsedAssumedInformation) {
- const auto &ValueSimplifyAA =
- A.getAAFor<AAValueSimplify>(AA, IRPosition::value(V));
- Optional<Value *> SimplifiedV = ValueSimplifyAA.getAssumedSimplifiedValue(A);
- UsedAssumedInformation |= !ValueSimplifyAA.isKnown();
- if (!SimplifiedV.hasValue())
- return llvm::None;
- if (isa_and_nonnull<UndefValue>(SimplifiedV.getValue()))
- return llvm::None;
- return dyn_cast_or_null<ConstantInt>(SimplifiedV.getValue());
-}
-
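-/// Identify the live successors of the branch \p BI, using the (assumed)
-/// constant value of its condition if it is conditional.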
-static bool
-identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation = false;
- if (BI.getNumSuccessors() == 1) {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- } else {
- Optional<ConstantInt *> CI =
- getAssumedConstant(A, *BI.getCondition(), AA, UsedAssumedInformation);
- if (!CI.hasValue()) {
- // No value yet, assume both edges are dead.
- } else if (CI.getValue()) {
- const BasicBlock *SuccBB =
- BI.getSuccessor(1 - CI.getValue()->getZExtValue());
- AliveSuccessors.push_back(&SuccBB->front());
- } else {
- AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
- AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
- UsedAssumedInformation = false;
- }
- }
- return UsedAssumedInformation;
-}
-
-static bool
-identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
- AbstractAttribute &AA,
- SmallVectorImpl<const Instruction *> &AliveSuccessors) {
- bool UsedAssumedInformation = false;
- Optional<ConstantInt *> CI =
- getAssumedConstant(A, *SI.getCondition(), AA, UsedAssumedInformation);
- if (!CI.hasValue()) {
- // No value yet, assume all edges are dead.
- } else if (CI.getValue()) {
- for (auto &CaseIt : SI.cases()) {
- if (CaseIt.getCaseValue() == CI.getValue()) {
- AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front());
- return UsedAssumedInformation;
- }
- }
- AliveSuccessors.push_back(&SI.getDefaultDest()->front());
- return UsedAssumedInformation;
- } else {
- for (const BasicBlock *SuccBB : successors(SI.getParent()))
- AliveSuccessors.push_back(&SuccBB->front());
- }
- return UsedAssumedInformation;
-}
-
-ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
- ChangeStatus Change = ChangeStatus::UNCHANGED;
-
- LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/"
- << getAssociatedFunction()->size() << "] BBs and "
- << ToBeExploredFrom.size() << " exploration points and "
- << KnownDeadEnds.size() << " known dead ends\n");
-
- // Copy and clear the list of instructions we need to explore from. It is
- // refilled with instructions the next update has to look at.
- SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(),
- ToBeExploredFrom.end());
- decltype(ToBeExploredFrom) NewToBeExploredFrom;
-
- SmallVector<const Instruction *, 8> AliveSuccessors;
- while (!Worklist.empty()) {
- const Instruction *I = Worklist.pop_back_val();
- LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n");
-
- AliveSuccessors.clear();
-
- bool UsedAssumedInformation = false;
- switch (I->getOpcode()) {
- // TODO: look for (assumed) UB to backwards propagate "deadness".
- default:
- if (I->isTerminator()) {
- for (const BasicBlock *SuccBB : successors(I->getParent()))
- AliveSuccessors.push_back(&SuccBB->front());
- } else {
- AliveSuccessors.push_back(I->getNextNode());
- }
- break;
- case Instruction::Call:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Invoke:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Br:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
- *this, AliveSuccessors);
- break;
- case Instruction::Switch:
- UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I),
- *this, AliveSuccessors);
- break;
- }
-
- if (UsedAssumedInformation) {
- NewToBeExploredFrom.insert(I);
- } else {
- Change = ChangeStatus::CHANGED;
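- // Instructions for which not all (or no) successors are alive are
- // (potential) dead ends.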
- if (AliveSuccessors.empty() ||
- (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors()))
- KnownDeadEnds.insert(I);
- }
-
- LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: "
- << AliveSuccessors.size() << " UsedAssumedInformation: "
- << UsedAssumedInformation << "\n");
-
- for (const Instruction *AliveSuccessor : AliveSuccessors) {
- if (!I->isTerminator()) {
- assert(AliveSuccessors.size() == 1 &&
- "Non-terminator expected to have a single successor!");
- Worklist.push_back(AliveSuccessor);
- } else {
- if (assumeLive(A, *AliveSuccessor->getParent()))
- Worklist.push_back(AliveSuccessor);
- }
- }
- }
-
- ToBeExploredFrom = std::move(NewToBeExploredFrom);
-
- // If we know everything is live there is no need to query for liveness.
- // Instead, indicating a pessimistic fixpoint will cause the state to be
- // "invalid" and all queries to be answered conservatively without lookups.
- // To be in this state we have to (1) finish the exploration, (2) not
- // discover any non-trivial dead end, and (3) not rule any unreachable code
- // dead.
- if (ToBeExploredFrom.empty() &&
- getAssociatedFunction()->size() == AssumedLiveBlocks.size() &&
- llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
- return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
- }))
- return indicatePessimisticFixpoint();
- return Change;
-}
-
-/// Liveness information for a call site.
-struct AAIsDeadCallSite final : AAIsDeadFunction {
- AAIsDeadCallSite(const IRPosition &IRP) : AAIsDeadFunction(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites instead of
- // redirecting requests to the callee.
- llvm_unreachable("Abstract attributes for liveness are not "
- "supported for call sites yet!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- return indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// -------------------- Dereferenceable Argument Attribute --------------------
-
-template <>
-ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
- const DerefState &R) {
- ChangeStatus CS0 =
- clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
- ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
- return CS0 | CS1;
-}
-
-struct AADereferenceableImpl : AADereferenceable {
- AADereferenceableImpl(const IRPosition &IRP) : AADereferenceable(IRP) {}
- using StateType = DerefState;
-
- void initialize(Attributor &A) override {
- SmallVector<Attribute, 4> Attrs;
- getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull},
- Attrs);
- for (const Attribute &Attr : Attrs)
- takeKnownDerefBytesMaximum(Attr.getValueAsInt());
-
- NonNullAA = &A.getAAFor<AANonNull>(*this, getIRPosition());
-
- const IRPosition &IRP = this->getIRPosition();
- bool IsFnInterface = IRP.isFnInterfaceKind();
- const Function *FnScope = IRP.getAnchorScope();
- if (IsFnInterface && (!FnScope || !FnScope->hasExactDefinition()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::getState()
- /// {
- StateType &getState() override { return *this; }
- const StateType &getState() const override { return *this; }
- /// }
-
- /// Helper function for collecting accessed bytes in must-be-executed-context
- void addAccessedBytesForUse(Attributor &A, const Use *U,
- const Instruction *I) {
- const Value *UseV = U->get();
- if (!UseV->getType()->isPointerTy())
- return;
-
- Type *PtrTy = UseV->getType();
- const DataLayout &DL = A.getDataLayout();
- int64_t Offset;
- if (const Value *Base = getBasePointerOfAccessPointerOperand(
- I, Offset, DL, /*AllowNonInbounds*/ true)) {
- if (Base == &getAssociatedValue() &&
- Attributor::getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
- uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
- addAccessedBytes(Offset, Size);
- }
- }
- return;
- }
-
- /// See AAFromMustBeExecutedContext
- bool followUse(Attributor &A, const Use *U, const Instruction *I) {
- bool IsNonNull = false;
- bool TrackUse = false;
- int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
- A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
-
- addAccessedBytesForUse(A, U, I);
- takeKnownDerefBytesMaximum(DerefBytes);
- return TrackUse;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Change = AADereferenceable::manifest(A);
- if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
- removeAttrs({Attribute::DereferenceableOrNull});
- return ChangeStatus::CHANGED;
- }
- return Change;
- }
-
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- // TODO: Add *_globally support
- if (isAssumedNonNull())
- Attrs.emplace_back(Attribute::getWithDereferenceableBytes(
- Ctx, getAssumedDereferenceableBytes()));
- else
- Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes(
- Ctx, getAssumedDereferenceableBytes()));
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- if (!getAssumedDereferenceableBytes())
- return "unknown-dereferenceable";
- return std::string("dereferenceable") +
- (isAssumedNonNull() ? "" : "_or_null") +
- (isAssumedGlobal() ? "_globally" : "") + "<" +
- std::to_string(getKnownDereferenceableBytes()) + "-" +
- std::to_string(getAssumedDereferenceableBytes()) + ">";
- }
-};
-
-/// Dereferenceable attribute for a floating value.
-struct AADereferenceableFloating
- : AAFromMustBeExecutedContext<AADereferenceable, AADereferenceableImpl> {
- using Base =
- AAFromMustBeExecutedContext<AADereferenceable, AADereferenceableImpl>;
- AADereferenceableFloating(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Change = Base::updateImpl(A);
-
- const DataLayout &DL = A.getDataLayout();
-
- auto VisitValueCB = [&](Value &V, DerefState &T, bool Stripped) -> bool {
- unsigned IdxWidth =
- DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
- APInt Offset(IdxWidth, 0);
- const Value *Base =
- V.stripAndAccumulateInBoundsConstantOffsets(DL, Offset);
-
- const auto &AA =
- A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base));
- int64_t DerefBytes = 0;
- if (!Stripped && this == &AA) {
- // Use IR information if we did not strip anything.
- // TODO: track globally.
- bool CanBeNull;
- DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
- T.GlobalState.indicatePessimisticFixpoint();
- } else {
- const DerefState &DS = static_cast<const DerefState &>(AA.getState());
- DerefBytes = DS.DerefBytesState.getAssumed();
- T.GlobalState &= DS.GlobalState;
- }
-
- // TODO: Use `AAConstantRange` to infer dereferenceable bytes.
-
- // For now we do not try to "increase" dereferenceability due to negative
- // indices as we first have to come up with code to deal with loops and
- // with overflows of the dereferenceable bytes.
- int64_t OffsetSExt = Offset.getSExtValue();
- if (OffsetSExt < 0)
- OffsetSExt = 0;
-
- T.takeAssumedDerefBytesMinimum(
- std::max(int64_t(0), DerefBytes - OffsetSExt));
-
- if (this == &AA) {
- if (!Stripped) {
- // If nothing was stripped IR information is all we got.
- T.takeKnownDerefBytesMaximum(
- std::max(int64_t(0), DerefBytes - OffsetSExt));
- T.indicatePessimisticFixpoint();
- } else if (OffsetSExt > 0) {
- // If something was stripped but there is circular reasoning we look
- // at the offset. If it is positive we basically decrease the
- // dereferenceable bytes in a circular loop now, which will simply
- // drive them down to the known value in a very slow way which we
- // can accelerate.
- T.indicatePessimisticFixpoint();
- }
- }
-
- return T.isValidState();
- };
-
- DerefState T;
- if (!genericValueTraversal<AADereferenceable, DerefState>(
- A, getIRPosition(), *this, T, VisitValueCB))
- return indicatePessimisticFixpoint();
-
- return Change | clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for a return value.
-struct AADereferenceableReturned final
- : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl,
- DerefState> {
- AADereferenceableReturned(const IRPosition &IRP)
- : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl,
- DerefState>(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for an argument
-struct AADereferenceableArgument final
- : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<
- AADereferenceable, AADereferenceableImpl, DerefState> {
- using Base = AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<
- AADereferenceable, AADereferenceableImpl, DerefState>;
- AADereferenceableArgument(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute for a call site argument.
-struct AADereferenceableCallSiteArgument final : AADereferenceableFloating {
- AADereferenceableCallSiteArgument(const IRPosition &IRP)
- : AADereferenceableFloating(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(dereferenceable)
- }
-};
-
-/// Dereferenceable attribute deduction for a call site return value.
-struct AADereferenceableCallSiteReturned final
- : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<
- AADereferenceable, AADereferenceableImpl> {
- using Base = AACallSiteReturnedFromReturnedAndMustBeExecutedContext<
- AADereferenceable, AADereferenceableImpl>;
- AADereferenceableCallSiteReturned(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CS_ATTR(dereferenceable);
- }
-};
-
-// ------------------------ Align Argument Attribute ------------------------
-
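-/// Determine the alignment of \p AssociatedValue that is known from its use
-/// \p U in the instruction \p I. \p TrackUse is set if the use should be
-/// followed further, e.g., through casts and all-constant-index GEPs.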
-static unsigned int getKnownAlignForUse(Attributor &A,
- AbstractAttribute &QueryingAA,
- Value &AssociatedValue, const Use *U,
- const Instruction *I, bool &TrackUse) {
- // We need to follow common pointer manipulation uses to the accesses they
- // feed into.
- if (isa<CastInst>(I)) {
- // Follow all but ptr2int casts.
- TrackUse = !isa<PtrToIntInst>(I);
- return 0;
- }
- if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
- if (GEP->hasAllConstantIndices()) {
- TrackUse = true;
- return 0;
- }
- }
-
- unsigned Alignment = 0;
- if (ImmutableCallSite ICS = ImmutableCallSite(I)) {
- if (ICS.isBundleOperand(U) || ICS.isCallee(U))
- return 0;
-
- unsigned ArgNo = ICS.getArgumentNo(U);
- IRPosition IRP = IRPosition::callsite_argument(ICS, ArgNo);
- // As long as we only use known information there is no need to track
- // dependences here.
- auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP,
- /* TrackDependence */ false);
- Alignment = AlignAA.getKnownAlign();
- }
-
- const Value *UseV = U->get();
- if (auto *SI = dyn_cast<StoreInst>(I))
- Alignment = SI->getAlignment();
- else if (auto *LI = dyn_cast<LoadInst>(I))
- Alignment = LI->getAlignment();
-
- if (Alignment <= 1)
- return 0;
-
- auto &DL = A.getDataLayout();
- int64_t Offset;
-
- if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) {
- if (Base == &AssociatedValue) {
- // BasePointerAddr + Offset = Alignment * Q for some integer Q.
- // So we can say that the maximum power of two which is a divisor of
- // gcd(Offset, Alignment) is an alignment.
-
- uint32_t gcd =
- greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment);
- Alignment = llvm::PowerOf2Floor(gcd);
- }
- }
-
- return Alignment;
-}
-struct AAAlignImpl : AAAlign {
- AAAlignImpl(const IRPosition &IRP) : AAAlign(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SmallVector<Attribute, 4> Attrs;
- getAttrs({Attribute::Alignment}, Attrs);
- for (const Attribute &Attr : Attrs)
- takeKnownMaximum(Attr.getValueAsInt());
-
- if (getIRPosition().isFnInterfaceKind() &&
- (!getAssociatedFunction() ||
- !getAssociatedFunction()->hasExactDefinition()))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- // Check for users that allow alignment annotations.
- Value &AnchorVal = getIRPosition().getAnchorValue();
- for (const Use &U : AnchorVal.uses()) {
- if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
- if (SI->getPointerOperand() == &AnchorVal)
- if (SI->getAlignment() < getAssumedAlign()) {
- STATS_DECLTRACK(AAAlign, Store,
- "Number of times alignment added to a store");
- SI->setAlignment(Align(getAssumedAlign()));
- Changed = ChangeStatus::CHANGED;
- }
- } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
- if (LI->getPointerOperand() == &AnchorVal)
- if (LI->getAlignment() < getAssumedAlign()) {
- LI->setAlignment(Align(getAssumedAlign()));
- STATS_DECLTRACK(AAAlign, Load,
- "Number of times alignment added to a load");
- Changed = ChangeStatus::CHANGED;
- }
- }
- }
-
- return AAAlign::manifest(A) | Changed;
- }
-
- // TODO: Provide a helper to determine the implied ABI alignment and check it,
- // in the existing manifest method and a new one for AAAlignImpl, against the
- // deduced value to avoid making the alignment explicit if it did not improve.
-
- /// See AbstractAttribute::getDeducedAttributes
- virtual void
- getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- if (getAssumedAlign() > 1)
- Attrs.emplace_back(
- Attribute::getWithAlignment(Ctx, Align(getAssumedAlign())));
- }
- /// See AAFromMustBeExecutedContext
- bool followUse(Attributor &A, const Use *U, const Instruction *I) {
- bool TrackUse = false;
-
- unsigned int KnownAlign =
- getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse);
- takeKnownMaximum(KnownAlign);
-
- return TrackUse;
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) +
- "-" + std::to_string(getAssumedAlign()) + ">")
- : "unknown-align";
- }
-};
-
-/// Align attribute for a floating value.
-struct AAAlignFloating : AAFromMustBeExecutedContext<AAAlign, AAAlignImpl> {
- using Base = AAFromMustBeExecutedContext<AAAlign, AAAlignImpl>;
- AAAlignFloating(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- Base::updateImpl(A);
-
- const DataLayout &DL = A.getDataLayout();
-
- auto VisitValueCB = [&](Value &V, AAAlign::StateType &T,
- bool Stripped) -> bool {
- const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
- // Use only IR information if we did not strip anything.
- const MaybeAlign PA = V.getPointerAlignment(DL);
- T.takeKnownMaximum(PA ? PA->value() : 0);
- T.indicatePessimisticFixpoint();
- } else {
- // Use abstract attribute information.
- const AAAlign::StateType &DS =
- static_cast<const AAAlign::StateType &>(AA.getState());
- T ^= DS;
- }
- return T.isValidState();
- };
-
- StateType T;
- if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T,
- VisitValueCB))
- return indicatePessimisticFixpoint();
-
- // TODO: If we know we visited all incoming values, and thus none are assumed
- // dead, we can take the known information from the state T.
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) }
-};
-
-/// Align attribute for function return value.
-struct AAAlignReturned final
- : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> {
- AAAlignReturned(const IRPosition &IRP)
- : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl>(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) }
-};
-
-/// Align attribute for function argument.
-struct AAAlignArgument final
- : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign,
- AAAlignImpl> {
- AAAlignArgument(const IRPosition &IRP)
- : AAArgumentFromCallSiteArgumentsAndMustBeExecutedContext<AAAlign,
- AAAlignImpl>(
- IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) }
-};
-
-struct AAAlignCallSiteArgument final : AAAlignFloating {
- AAAlignCallSiteArgument(const IRPosition &IRP) : AAAlignFloating(IRP) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- return AAAlignImpl::manifest(A);
- }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- ChangeStatus updateImpl(Attributor &A) override {
- ChangeStatus Changed = AAAlignFloating::updateImpl(A);
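- // In addition to the floating value information, use the alignment that is
- // known for the underlying argument, if there is one.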
- if (Argument *Arg = getAssociatedArgument()) {
- const auto &ArgAlignAA = A.getAAFor<AAAlign>(
- *this, IRPosition::argument(*Arg), /* TrackDependence */ false,
- DepClassTy::OPTIONAL);
- takeKnownMaximum(ArgAlignAA.getKnownAlign());
- }
- return Changed;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) }
-};
-
-/// Align attribute deduction for a call site return value.
-struct AAAlignCallSiteReturned final
- : AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign,
- AAAlignImpl> {
- using Base =
- AACallSiteReturnedFromReturnedAndMustBeExecutedContext<AAAlign,
- AAAlignImpl>;
- AAAlignCallSiteReturned(const IRPosition &IRP) : Base(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Base::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
-};
-
-/// ------------------ Function No-Return Attribute ----------------------------
-struct AANoReturnImpl : public AANoReturn {
- AANoReturnImpl(const IRPosition &IRP) : AANoReturn(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AANoReturn::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F)
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? "noreturn" : "may-return";
- }
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override {
- auto CheckForNoReturn = [](Instruction &) { return false; };
- if (!A.checkForAllInstructions(CheckForNoReturn, *this,
- {(unsigned)Instruction::Ret}))
- return indicatePessimisticFixpoint();
- return ChangeStatus::UNCHANGED;
- }
-};
-
-struct AANoReturnFunction final : AANoReturnImpl {
- AANoReturnFunction(const IRPosition &IRP) : AANoReturnImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
-};
-
-/// NoReturn attribute deduction for a call site.
-struct AANoReturnCallSite final : AANoReturnImpl {
- AANoReturnCallSite(const IRPosition &IRP) : AANoReturnImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AANoReturn::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
-};
-
-/// ----------------------- Variable Capturing ---------------------------------
-
-/// A class to hold the state for no-capture attributes.
-struct AANoCaptureImpl : public AANoCapture {
- AANoCaptureImpl(const IRPosition &IRP) : AANoCapture(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) {
- indicateOptimisticFixpoint();
- return;
- }
- Function *AnchorScope = getAnchorScope();
- if (isFnInterfaceKind() &&
- (!AnchorScope || !AnchorScope->hasExactDefinition())) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // You cannot "capture" null in the default address space.
- if (isa<ConstantPointerNull>(getAssociatedValue()) &&
- getAssociatedValue().getType()->getPointerAddressSpace() == 0) {
- indicateOptimisticFixpoint();
- return;
- }
-
- const Function *F = getArgNo() >= 0 ? getAssociatedFunction() : AnchorScope;
-
- // Check what state the associated function can actually capture.
- if (F)
- determineFunctionCaptureCapabilities(getIRPosition(), *F, *this);
- else
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::getDeducedAttributes(...).
- virtual void
- getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- if (!isAssumedNoCaptureMaybeReturned())
- return;
-
- if (getArgNo() >= 0) {
- if (isAssumedNoCapture())
- Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture));
- else if (ManifestInternal)
- Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned"));
- }
- }
-
- /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p State
- /// depending on the ability of the function associated with \p IRP to capture
- /// state in memory and through "returning/throwing", respectively.
- static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
- const Function &F,
- BitIntegerState &State) {
- // TODO: Once we have memory behavior attributes we should use them here.
-
- // If we know we cannot communicate or write to memory, we do not care about
- // ptr2int anymore.
- if (F.onlyReadsMemory() && F.doesNotThrow() &&
- F.getReturnType()->isVoidTy()) {
- State.addKnownBits(NO_CAPTURE);
- return;
- }
-
- // A function cannot capture state in memory if it only reads memory; it can
- // however return/throw state and the state might be influenced by the
- // pointer value, e.g., loading from a returned pointer might reveal a bit.
- if (F.onlyReadsMemory())
- State.addKnownBits(NOT_CAPTURED_IN_MEM);
-
- // A function cannot communicate state back if it does not throw
- // exceptions and does not return values.
- if (F.doesNotThrow() && F.getReturnType()->isVoidTy())
- State.addKnownBits(NOT_CAPTURED_IN_RET);
-
- // Check existing "returned" attributes.
- int ArgNo = IRP.getArgNo();
- if (F.doesNotThrow() && ArgNo >= 0) {
- for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
- if (F.hasParamAttribute(u, Attribute::Returned)) {
- if (u == unsigned(ArgNo))
- State.removeAssumedBits(NOT_CAPTURED_IN_RET);
- else if (F.onlyReadsMemory())
- State.addKnownBits(NO_CAPTURE);
- else
- State.addKnownBits(NOT_CAPTURED_IN_RET);
- break;
- }
- }
- }
-
- /// See AbstractState::getAsStr().
- const std::string getAsStr() const override {
- if (isKnownNoCapture())
- return "known not-captured";
- if (isAssumedNoCapture())
- return "assumed not-captured";
- if (isKnownNoCaptureMaybeReturned())
- return "known not-captured-maybe-returned";
- if (isAssumedNoCaptureMaybeReturned())
- return "assumed not-captured-maybe-returned";
- return "assumed-captured";
- }
-};
-
-/// Attributor-aware capture tracker.
-struct AACaptureUseTracker final : public CaptureTracker {
-
- /// Create a capture tracker that can lookup in-flight abstract attributes
- /// through the Attributor \p A.
- ///
- /// If a use leads to a potential capture, \p CapturedInMemory is set and the
- /// search is stopped. If a use leads to a return instruction,
- /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed.
- /// If a use leads to a ptr2int which may capture the value,
- /// \p CapturedInInteger is set. If a use is found that is currently assumed
- /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies
- /// set. All values in \p PotentialCopies are later tracked as well. For every
- /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0,
- /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
- /// conservatively set to true.
- AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
- const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
- SmallVectorImpl<const Value *> &PotentialCopies,
- unsigned &RemainingUsesToExplore)
- : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
- PotentialCopies(PotentialCopies),
- RemainingUsesToExplore(RemainingUsesToExplore) {}
-
- /// Determine if \p V may be captured. *Also updates the state!*
- bool valueMayBeCaptured(const Value *V) {
- if (V->getType()->isPointerTy()) {
- PointerMayBeCaptured(V, this);
- } else {
- State.indicatePessimisticFixpoint();
- }
- return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
- }
-
- /// See CaptureTracker::tooManyUses().
- void tooManyUses() override {
- State.removeAssumedBits(AANoCapture::NO_CAPTURE);
- }
-
- bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override {
- if (CaptureTracker::isDereferenceableOrNull(O, DL))
- return true;
- const auto &DerefAA =
- A.getAAFor<AADereferenceable>(NoCaptureAA, IRPosition::value(*O));
- return DerefAA.getAssumedDereferenceableBytes();
- }
-
- /// See CaptureTracker::captured(...).
- bool captured(const Use *U) override {
- Instruction *UInst = cast<Instruction>(U->getUser());
- LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst
- << "\n");
-
- // Because we may reuse the tracker multiple times we keep track of the
- // number of explored uses ourselves as well.
- if (RemainingUsesToExplore-- == 0) {
- LLVM_DEBUG(dbgs() << " - too many uses to explore!\n");
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
- }
-
- // Deal with ptr2int by following uses.
- if (isa<PtrToIntInst>(UInst)) {
- LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n");
- return valueMayBeCaptured(UInst);
- }
-
- // Explicitly catch return instructions.
- if (isa<ReturnInst>(UInst))
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ true);
-
- // For now we only use special logic for call sites. However, the tracker
- // itself knows about a lot of other non-capturing cases already.
- CallSite CS(UInst);
- if (!CS || !CS.isArgOperand(U))
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
-
- unsigned ArgNo = CS.getArgumentNo(U);
- const IRPosition &CSArgPos = IRPosition::callsite_argument(CS, ArgNo);
- // If we have an abstract no-capture attribute for the argument we can use
- // it to justify a non-capture attribute here. This allows recursion!
- auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos);
- if (ArgNoCaptureAA.isAssumedNoCapture())
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ false);
- if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- addPotentialCopy(CS);
- return isCapturedIn(/* Memory */ false, /* Integer */ false,
- /* Return */ false);
- }
-
- // Lastly, we could not find a reason to assume no-capture, so we don't.
- return isCapturedIn(/* Memory */ true, /* Integer */ true,
- /* Return */ true);
- }
-
- /// Register \p CS as potential copy of the value we are checking.
- void addPotentialCopy(CallSite CS) {
- PotentialCopies.push_back(CS.getInstruction());
- }
-
- /// See CaptureTracker::shouldExplore(...).
- bool shouldExplore(const Use *U) override {
- // Check liveness.
- return !IsDeadAA.isAssumedDead(cast<Instruction>(U->getUser()));
- }
-
- /// Update the state according to \p CapturedInMem, \p CapturedInInt, and
- /// \p CapturedInRet, then return the appropriate value for use in the
- /// CaptureTracker::captured() interface.
- bool isCapturedIn(bool CapturedInMem, bool CapturedInInt,
- bool CapturedInRet) {
- LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int "
- << CapturedInInt << "|Ret " << CapturedInRet << "]\n");
- if (CapturedInMem)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM);
- if (CapturedInInt)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT);
- if (CapturedInRet)
- State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET);
- return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
- }
-
-private:
- /// The attributor providing in-flight abstract attributes.
- Attributor &A;
-
- /// The abstract attribute currently updated.
- AANoCapture &NoCaptureAA;
-
- /// The abstract liveness state.
- const AAIsDead &IsDeadAA;
-
- /// The state currently updated.
- AANoCapture::StateType &State;
-
- /// Set of potential copies of the tracked value.
- SmallVectorImpl<const Value *> &PotentialCopies;
-
- /// Global counter to limit the number of explored uses.
- unsigned &RemainingUsesToExplore;
-};
-
-ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
- const IRPosition &IRP = getIRPosition();
- const Value *V =
- getArgNo() >= 0 ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue();
- if (!V)
- return indicatePessimisticFixpoint();
-
- const Function *F =
- getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope();
- assert(F && "Expected a function!");
- const IRPosition &FnPos = IRPosition::function(*F);
- const auto &IsDeadAA = A.getAAFor<AAIsDead>(*this, FnPos);
-
- AANoCapture::StateType T;
-
- // Readonly means we cannot capture through memory.
- const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
- if (FnMemAA.isAssumedReadOnly()) {
- T.addKnownBits(NOT_CAPTURED_IN_MEM);
- if (FnMemAA.isKnownReadOnly())
- addKnownBits(NOT_CAPTURED_IN_MEM);
- }
+ IsDeadAA = &getOrCreateAAFor<AAIsDead>(IRP, QueryingAA,
+ /* TrackDependence */ false);
+ // Don't check liveness for AAIsDead.
+ if (QueryingAA == IsDeadAA)
+ return false;
- // Make sure all returned values are different from the underlying value.
- // TODO: we could do this in a more sophisticated way inside
- // AAReturnedValues, e.g., track all values that escape through returns
- // directly somehow.
- auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) {
- bool SeenConstant = false;
- for (auto &It : RVAA.returned_values()) {
- if (isa<Constant>(It.first)) {
- if (SeenConstant)
- return false;
- SeenConstant = true;
- } else if (!isa<Argument>(It.first) ||
- It.first == getAssociatedArgument())
- return false;
- }
+ if (IsDeadAA->isAssumedDead()) {
+ if (QueryingAA)
+ recordDependence(*IsDeadAA, *QueryingAA, DepClass);
return true;
- };
-
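- // If no value can escape through throwing (nounwind) and the returned values
- // are known to differ from the associated argument (or the function returns
- // void), the value is not captured via the return path.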
- const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(*this, FnPos);
- if (NoUnwindAA.isAssumedNoUnwind()) {
- bool IsVoidTy = F->getReturnType()->isVoidTy();
- const AAReturnedValues *RVAA =
- IsVoidTy ? nullptr : &A.getAAFor<AAReturnedValues>(*this, FnPos);
- if (IsVoidTy || CheckReturnedArgs(*RVAA)) {
- T.addKnownBits(NOT_CAPTURED_IN_RET);
- if (T.isKnown(NOT_CAPTURED_IN_MEM))
- return ChangeStatus::UNCHANGED;
- if (NoUnwindAA.isKnownNoUnwind() &&
- (IsVoidTy || RVAA->getState().isAtFixpoint())) {
- addKnownBits(NOT_CAPTURED_IN_RET);
- if (isKnown(NOT_CAPTURED_IN_MEM))
- return indicateOptimisticFixpoint();
- }
- }
}
- // Use the CaptureTracker interface and logic with the specialized tracker,
- // defined in AACaptureUseTracker, that can look at in-flight abstract
- // attributes and directly updates the assumed state.
- SmallVector<const Value *, 4> PotentialCopies;
- unsigned RemainingUsesToExplore = DefaultMaxUsesToExplore;
- AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies,
- RemainingUsesToExplore);
-
- // Check all potential copies of the associated value until we can assume
- // none will be captured or we have to assume at least one might be.
- unsigned Idx = 0;
- PotentialCopies.push_back(V);
- while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
- Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
-
- AANoCapture::StateType &S = getState();
- auto Assumed = S.getAssumed();
- S.intersectAssumedBits(T.getAssumed());
- if (!isAssumedNoCaptureMaybeReturned())
- return indicatePessimisticFixpoint();
- return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
- : ChangeStatus::CHANGED;
+ return false;
}
-/// NoCapture attribute for function arguments.
-struct AANoCaptureArgument final : AANoCaptureImpl {
- AANoCaptureArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) }
-};
-
-/// NoCapture attribute for call site arguments.
-struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
- AANoCaptureCallSiteArgument(const IRPosition &IRP) : AANoCaptureImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (Argument *Arg = getAssociatedArgument())
- if (Arg->hasByValAttr())
- indicateOptimisticFixpoint();
- AANoCaptureImpl::initialize(A);
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call sites arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- if (!Arg)
- return indicatePessimisticFixpoint();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AANoCapture::StateType &>(ArgAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nocapture)};
-};
-
-/// NoCapture attribute for floating values.
-struct AANoCaptureFloating final : AANoCaptureImpl {
- AANoCaptureFloating(const IRPosition &IRP) : AANoCaptureImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(nocapture)
- }
-};
-
-/// NoCapture attribute for function return value.
-struct AANoCaptureReturned final : AANoCaptureImpl {
- AANoCaptureReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("NoCapture is not applicable to function returns!");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// NoCapture attribute deduction for a call site return value.
-struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
- AANoCaptureCallSiteReturned(const IRPosition &IRP) : AANoCaptureImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(nocapture)
- }
-};
-
-/// ------------------ Value Simplify Attribute ----------------------------
-struct AAValueSimplifyImpl : AAValueSimplify {
- AAValueSimplifyImpl(const IRPosition &IRP) : AAValueSimplify(IRP) {}
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple")
- : "not-simple";
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-
- /// See AAValueSimplify::getAssumedSimplifiedValue()
- Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override {
- if (!getAssumed())
- return const_cast<Value *>(&getAssociatedValue());
- return SimplifiedAssociatedValue;
- }
- void initialize(Attributor &A) override {}
-
- /// Helper function for querying AAValueSimplify and updating the candidate.
- /// \param QueryingValue Value trying to unify with SimplifiedValue
- /// \param AccumulatedSimplifiedValue Current simplification result.
- static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA,
- Value &QueryingValue,
- Optional<Value *> &AccumulatedSimplifiedValue) {
- // FIXME: Add typecast support.
-
- auto &ValueSimpifyAA = A.getAAFor<AAValueSimplify>(
- QueryingAA, IRPosition::value(QueryingValue));
-
- Optional<Value *> QueryingValueSimplified =
- ValueSimpifyAA.getAssumedSimplifiedValue(A);
-
- if (!QueryingValueSimplified.hasValue())
- return true;
-
- if (!QueryingValueSimplified.getValue())
- return false;
-
- Value &QueryingValueSimplifiedUnwrapped =
- *QueryingValueSimplified.getValue();
+bool Attributor::checkForAllUses(function_ref<bool(const Use &, bool &)> Pred,
+ const AbstractAttribute &QueryingAA,
+ const Value &V, DepClassTy LivenessDepClass) {
- if (isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
- return true;
-
- if (AccumulatedSimplifiedValue.hasValue())
- return AccumulatedSimplifiedValue == QueryingValueSimplified;
-
- LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << QueryingValue
- << " is assumed to be "
- << QueryingValueSimplifiedUnwrapped << "\n");
-
- AccumulatedSimplifiedValue = QueryingValueSimplified;
- return true;
- }
-
- bool askSimplifiedValueForAAValueConstantRange(Attributor &A) {
- if (!getAssociatedValue().getType()->isIntegerTy())
- return false;
-
- const auto &ValueConstantRangeAA =
- A.getAAFor<AAValueConstantRange>(*this, getIRPosition());
-
- Optional<ConstantInt *> COpt =
- ValueConstantRangeAA.getAssumedConstantInt(A);
- if (COpt.hasValue()) {
- if (auto *C = COpt.getValue())
- SimplifiedAssociatedValue = C;
- else
- return false;
- } else {
- // FIXME: It should be llvm::None but if you set llvm::None,
- // values are mistakenly inferred as `undef` now.
- SimplifiedAssociatedValue = &getAssociatedValue();
- }
+ // Check the trivial case first as it catches void values.
+ if (V.use_empty())
return true;
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
-
- if (!SimplifiedAssociatedValue.hasValue() ||
- !SimplifiedAssociatedValue.getValue())
- return Changed;
-
- if (auto *C = dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())) {
- // We can replace the AssociatedValue with the constant.
- Value &V = getAssociatedValue();
- if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
- LLVM_DEBUG(dbgs() << "[Attributor][ValueSimplify] " << V << " -> " << *C
- << "\n");
- A.changeValueAfterManifest(V, *C);
- Changed = ChangeStatus::CHANGED;
- }
- }
-
- return Changed | AAValueSimplify::manifest(A);
- }
-
- /// See AbstractState::indicatePessimisticFixpoint(...).
- ChangeStatus indicatePessimisticFixpoint() override {
- // NOTE: The associated value will be returned in a pessimistic fixpoint and is
- // regarded as known. That's why `indicateOptimisticFixpoint` is called.
- SimplifiedAssociatedValue = &getAssociatedValue();
- indicateOptimisticFixpoint();
- return ChangeStatus::CHANGED;
- }
-
-protected:
- // An assumed simplified value. Initially, it is set to Optional::None, which
- // means that the value is not clear under the current assumption. If in the
- // pessimistic state, getAssumedSimplifiedValue doesn't return this value but
- // returns the original associated value.
- Optional<Value *> SimplifiedAssociatedValue;
-};
-
-struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
- AAValueSimplifyArgument(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {}
-
- void initialize(Attributor &A) override {
- AAValueSimplifyImpl::initialize(A);
- if (!getAssociatedFunction() || getAssociatedFunction()->isDeclaration())
- indicatePessimisticFixpoint();
- if (hasAttr({Attribute::InAlloca, Attribute::StructRet, Attribute::Nest},
- /* IgnoreSubsumingPositions */ true))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // Byval is only replaceable if it is readonly; otherwise we would write into
- // the replaced value and not the copy that byval creates implicitly.
- Argument *Arg = getAssociatedArgument();
- if (Arg->hasByValAttr()) {
- const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
- if (!MemAA.isAssumedReadOnly())
- return indicatePessimisticFixpoint();
- }
-
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto PredForCallSite = [&](AbstractCallSite ACS) {
- // Check if we have an associated argument or not (which can happen for
- // callback calls).
- Value *ArgOp = ACS.getCallArgOperand(getArgNo());
- if (!ArgOp)
- return false;
- // We can only propagate thread-independent values through callbacks.
- // This is different from direct/indirect call sites because for them we
- // know the thread executing the caller and callee is the same. For
- // callbacks this is not guaranteed, thus a thread-dependent value could
- // be different for the caller and callee, making it invalid to propagate.
- if (ACS.isCallbackCall())
- if (auto *C = dyn_cast<Constant>(ArgOp))
- if (C->isThreadDependent())
- return false;
- return checkAndUpdate(A, *this, *ArgOp, SimplifiedAssociatedValue);
- };
-
- if (!A.checkForAllCallSites(PredForCallSite, *this, true))
- if (!askSimplifiedValueForAAValueConstantRange(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus ::CHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyReturned : AAValueSimplifyImpl {
- AAValueSimplifyReturned(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto PredForReturned = [&](Value &V) {
- return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
- };
-
- if (!A.checkForAllReturnedValues(PredForReturned, *this))
- if (!askSimplifiedValueForAAValueConstantRange(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus ::CHANGED;
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyFloating : AAValueSimplifyImpl {
- AAValueSimplifyFloating(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- Value &V = getAnchorValue();
-
- // TODO: add other cases
- if (isa<Constant>(V))
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
-
- auto VisitValueCB = [&](Value &V, BooleanState, bool Stripped) -> bool {
- auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
- if (!Stripped && this == &AA) {
- // TODO: Look the instruction and check recursively.
-
- LLVM_DEBUG(
- dbgs() << "[Attributor][ValueSimplify] Can't be stripped more : "
- << V << "\n");
- return false;
- }
- return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
- };
-
- if (!genericValueTraversal<AAValueSimplify, BooleanState>(
- A, getIRPosition(), *this, static_cast<BooleanState &>(*this),
- VisitValueCB))
- if (!askSimplifiedValueForAAValueConstantRange(A))
- return indicatePessimisticFixpoint();
-
- // If a candidate was found in this update, return CHANGED.
-
- return HasValueBefore == SimplifiedAssociatedValue.hasValue()
- ? ChangeStatus::UNCHANGED
- : ChangeStatus ::CHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyFunction : AAValueSimplifyImpl {
- AAValueSimplifyFunction(const IRPosition &IRP) : AAValueSimplifyImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- SimplifiedAssociatedValue = &getAnchorValue();
- indicateOptimisticFixpoint();
- }
- /// See AbstractAttribute::initialize(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable(
- "AAValueSimplify(Function|CallSite)::updateImpl will not be called");
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FN_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyCallSite : AAValueSimplifyFunction {
- AAValueSimplifyCallSite(const IRPosition &IRP)
- : AAValueSimplifyFunction(IRP) {}
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CS_ATTR(value_simplify)
- }
-};
-
-struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned {
- AAValueSimplifyCallSiteReturned(const IRPosition &IRP)
- : AAValueSimplifyReturned(IRP) {}
-
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(value_simplify)
- }
-};
-struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating {
- AAValueSimplifyCallSiteArgument(const IRPosition &IRP)
- : AAValueSimplifyFloating(IRP) {}
-
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(value_simplify)
- }
-};
-
-/// ----------------------- Heap-To-Stack Conversion ---------------------------
-struct AAHeapToStackImpl : public AAHeapToStack {
- AAHeapToStackImpl(const IRPosition &IRP) : AAHeapToStack(IRP) {}
-
- const std::string getAsStr() const override {
- return "[H2S] Mallocs: " + std::to_string(MallocCalls.size());
- }
-
- ChangeStatus manifest(Attributor &A) override {
- assert(getState().isValidState() &&
- "Attempted to manifest an invalid state!");
-
- ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
- Function *F = getAssociatedFunction();
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
-
- for (Instruction *MallocCall : MallocCalls) {
- // This malloc cannot be replaced.
- if (BadMallocCalls.count(MallocCall))
- continue;
-
- for (Instruction *FreeCall : FreesForMalloc[MallocCall]) {
- LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n");
- A.deleteAfterManifest(*FreeCall);
- HasChanged = ChangeStatus::CHANGED;
- }
-
- LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
- << "\n");
-
- Constant *Size;
- if (isCallocLikeFn(MallocCall, TLI)) {
- auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
- auto *SizeT = dyn_cast<ConstantInt>(MallocCall->getOperand(1));
- APInt TotalSize = SizeT->getValue() * Num->getValue();
- Size =
- ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize);
- } else {
- Size = cast<ConstantInt>(MallocCall->getOperand(0));
- }
-
- unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
- Instruction *AI = new AllocaInst(Type::getInt8Ty(F->getContext()), AS,
- Size, "", MallocCall->getNextNode());
-
- if (AI->getType() != MallocCall->getType())
- AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
- AI->getNextNode());
-
- replaceAllInstructionUsesWith(*MallocCall, *AI);
-
- if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
- auto *NBB = II->getNormalDest();
- BranchInst::Create(NBB, MallocCall->getParent());
- A.deleteAfterManifest(*MallocCall);
- } else {
- A.deleteAfterManifest(*MallocCall);
- }
-
- if (isCallocLikeFn(MallocCall, TLI)) {
- auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc",
- AI->getNextNode());
- Value *Ops[] = {
- BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
- ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
-
- Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()};
- Module *M = F->getParent();
- Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
- CallInst::Create(Fn, Ops, "", BI->getNextNode());
- }
- HasChanged = ChangeStatus::CHANGED;
- }
-
- return HasChanged;
- }
-
- /// Collection of all malloc calls in a function.
- SmallSetVector<Instruction *, 4> MallocCalls;
-
- /// Collection of malloc calls that cannot be converted.
- DenseSet<const Instruction *> BadMallocCalls;
-
- /// A map for each malloc call to the set of associated free calls.
- DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc;
-
- ChangeStatus updateImpl(Attributor &A) override;
-};
-
-ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
- const Function *F = getAssociatedFunction();
- const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
-
- MustBeExecutedContextExplorer &Explorer =
- A.getInfoCache().getMustBeExecutedContextExplorer();
-
- auto FreeCheck = [&](Instruction &I) {
- const auto &Frees = FreesForMalloc.lookup(&I);
- if (Frees.size() != 1)
- return false;
- Instruction *UniqueFree = *Frees.begin();
- return Explorer.findInContextOf(UniqueFree, I.getNextNode());
- };
-
- auto UsesCheck = [&](Instruction &I) {
- bool ValidUsesOnly = true;
- bool MustUse = true;
- auto Pred = [&](const Use &U, bool &Follow) -> bool {
- Instruction *UserI = cast<Instruction>(U.getUser());
- if (isa<LoadInst>(UserI))
- return true;
- if (auto *SI = dyn_cast<StoreInst>(UserI)) {
- if (SI->getValueOperand() == U.get()) {
- LLVM_DEBUG(dbgs()
- << "[H2S] escaping store to memory: " << *UserI << "\n");
- ValidUsesOnly = false;
- } else {
- // A store into the malloc'ed memory is fine.
- }
- return true;
- }
- if (auto *CB = dyn_cast<CallBase>(UserI)) {
- if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
- return true;
- // Record the free call for this malloc.
- if (isFreeCall(UserI, TLI)) {
- if (MustUse) {
- FreesForMalloc[&I].insert(UserI);
- } else {
- LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
- << *UserI << "\n");
- ValidUsesOnly = false;
- }
- return true;
- }
-
- unsigned ArgNo = CB->getArgOperandNo(&U);
-
- const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- // If a callsite argument use is nofree, we are fine.
- const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
- *this, IRPosition::callsite_argument(*CB, ArgNo));
-
- if (!NoCaptureAA.isAssumedNoCapture() ||
- !ArgNoFreeAA.isAssumedNoFree()) {
- LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
- ValidUsesOnly = false;
- }
- return true;
- }
-
- if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
- isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
- MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
- Follow = true;
- return true;
- }
- // Unknown user for which we cannot track uses further (in a way that
- // makes sense).
- LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n");
- ValidUsesOnly = false;
- return true;
- };
- A.checkForAllUses(Pred, *this, I);
- return ValidUsesOnly;
- };
-
- auto MallocCallocCheck = [&](Instruction &I) {
- if (BadMallocCalls.count(&I))
- return true;
-
- bool IsMalloc = isMallocLikeFn(&I, TLI);
- bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI);
- if (!IsMalloc && !IsCalloc) {
- BadMallocCalls.insert(&I);
- return true;
- }
-
- if (IsMalloc) {
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0)))
- if (Size->getValue().ule(MaxHeapToStackSize))
- if (UsesCheck(I) || FreeCheck(I)) {
- MallocCalls.insert(&I);
- return true;
- }
- } else if (IsCalloc) {
- bool Overflow = false;
- if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0)))
- if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
- if ((Size->getValue().umul_ov(Num->getValue(), Overflow))
- .ule(MaxHeapToStackSize))
- if (!Overflow && (UsesCheck(I) || FreeCheck(I))) {
- MallocCalls.insert(&I);
- return true;
- }
- }
-
- BadMallocCalls.insert(&I);
- return true;
- };
-
- size_t NumBadMallocs = BadMallocCalls.size();
-
- A.checkForAllCallLikeInstructions(MallocCallocCheck, *this);
-
- if (NumBadMallocs != BadMallocCalls.size())
- return ChangeStatus::CHANGED;
-
- return ChangeStatus::UNCHANGED;
-}
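
The calloc path above multiplies the element count and element size with APInt::umul_ov so that a wrapping product can never slip under MaxHeapToStackSize. A minimal sketch of that check in isolation; the helper name is made up for illustration.

#include "llvm/ADT/APInt.h"
#include <cstdint>

// Illustrative only: mirrors the overflow-aware size check used for calloc.
static bool callocFitsOnStack(const llvm::APInt &Num, const llvm::APInt &Size,
                              uint64_t MaxHeapToStackSize) {
  bool Overflow = false;
  // Unsigned multiply that reports wrap-around via the Overflow flag.
  llvm::APInt Total = Size.umul_ov(Num, Overflow);
  return !Overflow && Total.ule(MaxHeapToStackSize);
}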
-
-struct AAHeapToStackFunction final : public AAHeapToStackImpl {
- AAHeapToStackFunction(const IRPosition &IRP) : AAHeapToStackImpl(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECL(MallocCalls, Function,
- "Number of malloc calls converted to allocas");
- for (auto *C : MallocCalls)
- if (!BadMallocCalls.count(C))
- ++BUILD_STAT_NAME(MallocCalls, Function);
- }
-};
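
For reference, the core of the manifest() rewrite above reduced to a standalone sketch: a malloc-like call with a known constant size becomes an alloca placed right after the call, plus a bitcast if the pointer types differ. The helper name is hypothetical; erasing the original call and its paired frees is left to the surrounding Attributor machinery.

#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Illustrative helper, not part of the pass: replace all uses of a malloc-like
// call with a stack allocation of the same (constant) size.
static Instruction *rewriteMallocToAlloca(Instruction *MallocCall,
                                          Constant *Size) {
  Function *F = MallocCall->getFunction();
  unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
  Instruction *AI = new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size,
                                   "h2s", MallocCall->getNextNode());
  // Keep the IR type-correct if the malloc returned a differently typed pointer.
  if (AI->getType() != MallocCall->getType())
    AI = new BitCastInst(AI, MallocCall->getType(), "h2s.cast",
                         AI->getNextNode());
  MallocCall->replaceAllUsesWith(AI);
  return AI; // The now-dead malloc call is erased elsewhere.
}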
-
-/// -------------------- Memory Behavior Attributes ----------------------------
-/// Includes read-none, read-only, and write-only.
-/// ----------------------------------------------------------------------------
-struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
- AAMemoryBehaviorImpl(const IRPosition &IRP) : AAMemoryBehavior(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- getKnownStateFromValue(getIRPosition(), getState());
- IRAttribute::initialize(A);
- }
-
- /// Return the memory behavior information encoded in the IR for \p IRP.
- static void getKnownStateFromValue(const IRPosition &IRP,
- BitIntegerState &State,
- bool IgnoreSubsumingPositions = false) {
- SmallVector<Attribute, 2> Attrs;
- IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
- for (const Attribute &Attr : Attrs) {
- switch (Attr.getKindAsEnum()) {
- case Attribute::ReadNone:
- State.addKnownBits(NO_ACCESSES);
- break;
- case Attribute::ReadOnly:
- State.addKnownBits(NO_WRITES);
- break;
- case Attribute::WriteOnly:
- State.addKnownBits(NO_READS);
- break;
- default:
- llvm_unreachable("Unexpcted attribute!");
- }
- }
-
- if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) {
- if (!I->mayReadFromMemory())
- State.addKnownBits(NO_READS);
- if (!I->mayWriteToMemory())
- State.addKnownBits(NO_WRITES);
- }
- }
-
- /// See AbstractAttribute::getDeducedAttributes(...).
- void getDeducedAttributes(LLVMContext &Ctx,
- SmallVectorImpl<Attribute> &Attrs) const override {
- assert(Attrs.size() == 0);
- if (isAssumedReadNone())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
- else if (isAssumedReadOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly));
- else if (isAssumedWriteOnly())
- Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly));
- assert(Attrs.size() <= 1);
- }
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- const IRPosition &IRP = getIRPosition();
-
- // Check if we would improve the existing attributes first.
- SmallVector<Attribute, 4> DeducedAttrs;
- getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
- if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
- return IRP.hasAttr(Attr.getKindAsEnum(),
- /* IgnoreSubsumingPositions */ true);
- }))
- return ChangeStatus::UNCHANGED;
-
- // Clear existing attributes.
- IRP.removeAttrs(AttrKinds);
-
- // Use the generic manifest method.
- return IRAttribute::manifest(A);
- }
-
- /// See AbstractState::getAsStr().
- const std::string getAsStr() const override {
- if (isAssumedReadNone())
- return "readnone";
- if (isAssumedReadOnly())
- return "readonly";
- if (isAssumedWriteOnly())
- return "writeonly";
- return "may-read/write";
- }
-
- /// The set of IR attributes AAMemoryBehavior deals with.
- static const Attribute::AttrKind AttrKinds[3];
-};
-
-const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = {
- Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly};
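
As a plain-C++ illustration of the bit encoding used above (the enum values mirror how NO_ACCESSES combines the two individual bits; the function name is made up): the deduced attribute is simply the strongest claim the surviving assumed bits still support.

#include <cstdint>

// Illustrative only: NO_READS/NO_WRITES are independent bits and NO_ACCESSES
// is their union, so the attribute is picked from the remaining assumed bits.
namespace {
enum : uint8_t {
  NO_READS = 1 << 0,
  NO_WRITES = 1 << 1,
  NO_ACCESSES = NO_READS | NO_WRITES,
};

const char *attrForAssumedBits(uint8_t Assumed) {
  if ((Assumed & NO_ACCESSES) == NO_ACCESSES)
    return "readnone"; // never reads, never writes
  if (Assumed & NO_WRITES)
    return "readonly"; // may read, never writes
  if (Assumed & NO_READS)
    return "writeonly"; // may write, never reads
  return nullptr;      // may read and write: nothing to annotate
}
} // namespace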
-
-/// Memory behavior attribute for a floating value.
-struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl {
- AAMemoryBehaviorFloating(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryBehaviorImpl::initialize(A);
- // Initialize the use vector with all direct uses of the associated value.
- for (const Use &U : getAssociatedValue().uses())
- Uses.insert(&U);
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FLOATING_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_FLOATING_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_FLOATING_ATTR(writeonly)
- }
-
-private:
- /// Return true if users of \p UserI might access the underlying
- /// variable/location described by \p U and should therefore be analyzed.
- bool followUsersOfUseIn(Attributor &A, const Use *U,
- const Instruction *UserI);
-
- /// Update the state according to the effect of use \p U in \p UserI.
- void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI);
-
-protected:
- /// Container for (transitive) uses of the associated argument.
- SetVector<const Use *> Uses;
-};
-
-/// Memory behavior attribute for function argument.
-struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating {
- AAMemoryBehaviorArgument(const IRPosition &IRP)
- : AAMemoryBehaviorFloating(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- intersectAssumedBits(BEST_STATE);
- const IRPosition &IRP = getIRPosition();
- // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we
- // can query it when we use has/getAttr. That would allow us to reuse the
- // initialize of the base class here.
- bool HasByVal =
- IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true);
- getKnownStateFromValue(IRP, getState(),
- /* IgnoreSubsumingPositions */ HasByVal);
-
- // Initialize the use vector with all direct uses of the associated value.
- Argument *Arg = getAssociatedArgument();
- if (!Arg || !Arg->getParent()->hasExactDefinition()) {
- indicatePessimisticFixpoint();
- } else {
- // Initialize the use vector with all direct uses of the associated value.
- for (const Use &U : Arg->uses())
- Uses.insert(&U);
- }
- }
-
- ChangeStatus manifest(Attributor &A) override {
- // TODO: From readattrs.ll: "inalloca parameters are always
- // considered written"
- if (hasAttr({Attribute::InAlloca})) {
- removeKnownBits(NO_WRITES);
- removeAssumedBits(NO_WRITES);
- }
- return AAMemoryBehaviorFloating::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_ARG_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_ARG_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_ARG_ATTR(writeonly)
- }
-};
-
-struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
- AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP)
- : AAMemoryBehaviorArgument(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- if (Argument *Arg = getAssociatedArgument()) {
- if (Arg->hasByValAttr()) {
- addKnownBits(NO_WRITES);
- removeKnownBits(NO_READS);
- removeAssumedBits(NO_READS);
- }
- }
- AAMemoryBehaviorArgument::initialize(A);
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Argument *Arg = getAssociatedArgument();
- const IRPosition &ArgPos = IRPosition::argument(*Arg);
- auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AAMemoryBehavior::StateType &>(ArgAA.getState()));
- }
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CSARG_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_CSARG_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_CSARG_ATTR(writeonly)
- }
-};
-
-/// Memory behavior attribute for a call site return position.
-struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
- AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP)
- : AAMemoryBehaviorFloating(IRP) {}
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- // We do not annotate returned values.
- return ChangeStatus::UNCHANGED;
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {}
-};
-
-/// An AA to represent the memory behavior function attributes.
-struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl {
- AAMemoryBehaviorFunction(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(Attributor &A).
- virtual ChangeStatus updateImpl(Attributor &A) override;
-
- /// See AbstractAttribute::manifest(...).
- ChangeStatus manifest(Attributor &A) override {
- Function &F = cast<Function>(getAnchorValue());
- if (isAssumedReadNone()) {
- F.removeFnAttr(Attribute::ArgMemOnly);
- F.removeFnAttr(Attribute::InaccessibleMemOnly);
- F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
- }
- return AAMemoryBehaviorImpl::manifest(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_FN_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_FN_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_FN_ATTR(writeonly)
- }
-};
-
-/// AAMemoryBehavior attribute for call sites.
-struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
- AAMemoryBehaviorCallSite(const IRPosition &IRP) : AAMemoryBehaviorImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAMemoryBehaviorImpl::initialize(A);
- Function *F = getAssociatedFunction();
- if (!F || !F->hasExactDefinition())
- indicatePessimisticFixpoint();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Once we have call site specific value information we can provide
- // call site specific liveness information and then it makes
- // sense to specialize attributes for call site arguments instead of
- // redirecting requests to the callee argument.
- Function *F = getAssociatedFunction();
- const IRPosition &FnPos = IRPosition::function(*F);
- auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
- return clampStateAndIndicateChange(
- getState(),
- static_cast<const AAMemoryBehavior::StateType &>(FnAA.getState()));
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- if (isAssumedReadNone())
- STATS_DECLTRACK_CS_ATTR(readnone)
- else if (isAssumedReadOnly())
- STATS_DECLTRACK_CS_ATTR(readonly)
- else if (isAssumedWriteOnly())
- STATS_DECLTRACK_CS_ATTR(writeonly)
- }
-};
-} // namespace
-
-ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
-
- // The current assumed state used to determine a change.
- auto AssumedState = getAssumed();
-
- auto CheckRWInst = [&](Instruction &I) {
- // If the instruction has its own memory behavior state, use it to restrict
- // the local state. No further analysis is required as the other memory
- // state is as optimistic as it gets.
- if (ImmutableCallSite ICS = ImmutableCallSite(&I)) {
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
- *this, IRPosition::callsite_function(ICS));
- intersectAssumedBits(MemBehaviorAA.getAssumed());
- return !isAtFixpoint();
- }
-
- // Remove access kind modifiers if necessary.
- if (I.mayReadFromMemory())
- removeAssumedBits(NO_READS);
- if (I.mayWriteToMemory())
- removeAssumedBits(NO_WRITES);
- return !isAtFixpoint();
- };
-
- if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
- return indicatePessimisticFixpoint();
-
- return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
- : ChangeStatus::UNCHANGED;
-}
-
-ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
-
- const IRPosition &IRP = getIRPosition();
- const IRPosition &FnPos = IRPosition::function_scope(IRP);
- AAMemoryBehavior::StateType &S = getState();
-
- // First, check the function scope. We take the known information and we avoid
- // work if the assumed information implies the current assumed information for
- // this attribute. This is valid for all but byval arguments.
- Argument *Arg = IRP.getAssociatedArgument();
- AAMemoryBehavior::base_t FnMemAssumedState =
- AAMemoryBehavior::StateType::getWorstState();
- if (!Arg || !Arg->hasByValAttr()) {
- const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
- FnMemAssumedState = FnMemAA.getAssumed();
- S.addKnownBits(FnMemAA.getKnown());
- if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed())
- return ChangeStatus::UNCHANGED;
- }
-
- // Make sure the value is not captured (except through "return"); if
- // it is, any information derived would be irrelevant anyway as we cannot
- // check the potential aliases introduced by the capture. However, there is
- // no need to fall back to anything less optimistic than the function state.
- const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
- *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
- if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
- S.intersectAssumedBits(FnMemAssumedState);
- return ChangeStatus::CHANGED;
- }
-
- // The current assumed state used to determine a change.
- auto AssumedState = S.getAssumed();
-
- // Liveness information to exclude dead users.
- // TODO: Take the FnPos once we have call site specific liveness information.
- const auto &LivenessAA = A.getAAFor<AAIsDead>(
- *this, IRPosition::function(*IRP.getAssociatedFunction()));
-
- // Visit and expand uses until all are analyzed or a fixpoint is reached.
- for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
- const Use *U = Uses[i];
- Instruction *UserI = cast<Instruction>(U->getUser());
- LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
- << " [Dead: " << (LivenessAA.isAssumedDead(UserI))
- << "]\n");
- if (LivenessAA.isAssumedDead(UserI))
- continue;
-
- // Check if the users of UserI should also be visited.
- if (followUsersOfUseIn(A, U, UserI))
- for (const Use &UserIUse : UserI->uses())
- Uses.insert(&UserIUse);
-
- // If UserI might touch memory we analyze the use in detail.
- if (UserI->mayReadOrWriteMemory())
- analyzeUseIn(A, U, UserI);
- }
-
- return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
- : ChangeStatus::UNCHANGED;
-}
-
-bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U,
- const Instruction *UserI) {
- // The loaded value is unrelated to the pointer argument, so there is no
- // need to follow the users of the load.
- if (isa<LoadInst>(UserI))
- return false;
-
- // By default we follow all uses assuming UserI might leak information on U,
- // we have special handling for call sites operands though.
- ImmutableCallSite ICS(UserI);
- if (!ICS || !ICS.isArgOperand(U))
+ // If the value is replaced by another one, for now a constant, we do not have
+ // uses. Note that this requires users of `checkForAllUses` to not recurse but
+ // instead use the `follow` callback argument to look at transitive users,
+ // however, that should be clear from the presence of the argument.
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ getAssumedConstant(V, QueryingAA, UsedAssumedInformation);
+ if (C.hasValue() && C.getValue()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Value is simplified, uses skipped: " << V
+ << " -> " << *C.getValue() << "\n");
return true;
-
- // If the use is a call argument known not to be captured, the users of
- // the call do not need to be visited because they have to be unrelated to
- // the input. Note that this check is not trivial even though we disallow
- // general capturing of the underlying argument. The reason is that the
- // call might capture the argument "through return", which we allow and for which we
- // need to check call users.
- unsigned ArgNo = ICS.getArgumentNo(U);
- const auto &ArgNoCaptureAA =
- A.getAAFor<AANoCapture>(*this, IRPosition::callsite_argument(ICS, ArgNo));
- return !ArgNoCaptureAA.isAssumedNoCapture();
-}
-
-void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U,
- const Instruction *UserI) {
- assert(UserI->mayReadOrWriteMemory());
-
- switch (UserI->getOpcode()) {
- default:
- // TODO: Handle all atomics and other side-effect operations we know of.
- break;
- case Instruction::Load:
- // Loads cause the NO_READS property to disappear.
- removeAssumedBits(NO_READS);
- return;
-
- case Instruction::Store:
- // Stores cause the NO_WRITES property to disappear if the use is the
- // pointer operand. Note that we do assume that capturing was taken care of
- // somewhere else.
- if (cast<StoreInst>(UserI)->getPointerOperand() == U->get())
- removeAssumedBits(NO_WRITES);
- return;
-
- case Instruction::Call:
- case Instruction::CallBr:
- case Instruction::Invoke: {
- // For call sites we look at the argument memory behavior attribute (this
- // could be recursive!) in order to restrict our own state.
- ImmutableCallSite ICS(UserI);
-
- // Give up on operand bundles.
- if (ICS.isBundleOperand(U)) {
- indicatePessimisticFixpoint();
- return;
- }
-
- // Calling a function does read the function pointer, and may even write it
- // if the function is self-modifying.
- if (ICS.isCallee(U)) {
- removeAssumedBits(NO_READS);
- break;
- }
-
- // Adjust the possible access behavior based on the information on the
- // argument.
- unsigned ArgNo = ICS.getArgumentNo(U);
- const IRPosition &ArgPos = IRPosition::callsite_argument(ICS, ArgNo);
- const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
- // "assumed" has at most the same bits as the MemBehaviorAA assumed
- // and at least "known".
- intersectAssumedBits(MemBehaviorAA.getAssumed());
- return;
- }
- };
-
- // Generally, look at the "may-properties" and adjust the assumed state if we
- // did not trigger special handling before.
- if (UserI->mayReadFromMemory())
- removeAssumedBits(NO_READS);
- if (UserI->mayWriteToMemory())
- removeAssumedBits(NO_WRITES);
-}
-/// ------------------ Value Constant Range Attribute -------------------------
-
-struct AAValueConstantRangeImpl : AAValueConstantRange {
- using StateType = IntegerRangeState;
- AAValueConstantRangeImpl(const IRPosition &IRP) : AAValueConstantRange(IRP) {}
-
- /// See AbstractAttribute::getAsStr().
- const std::string getAsStr() const override {
- std::string Str;
- llvm::raw_string_ostream OS(Str);
- OS << "range(" << getBitWidth() << ")<";
- getKnown().print(OS);
- OS << " / ";
- getAssumed().print(OS);
- OS << ">";
- return OS.str();
- }
-
- /// Helper function to get a SCEV expr for the associated value at program
- /// point \p I.
- const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const {
- if (!getAnchorScope())
- return nullptr;
-
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
- *getAnchorScope());
-
- LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(
- *getAnchorScope());
-
- if (!SE || !LI)
- return nullptr;
-
- const SCEV *S = SE->getSCEV(&getAssociatedValue());
- if (!I)
- return S;
-
- return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent()));
- }
-
- /// Helper function to get a range from SCEV for the associated value at
- /// program point \p I.
- ConstantRange getConstantRangeFromSCEV(Attributor &A,
- const Instruction *I = nullptr) const {
- if (!getAnchorScope())
- return getWorstState(getBitWidth());
-
- ScalarEvolution *SE =
- A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
- *getAnchorScope());
-
- const SCEV *S = getSCEV(A, I);
- if (!SE || !S)
- return getWorstState(getBitWidth());
-
- return SE->getUnsignedRange(S);
- }
-
- /// Helper function to get a range from LVI for the associated value at
- /// program point \p I.
- ConstantRange
- getConstantRangeFromLVI(Attributor &A,
- const Instruction *CtxI = nullptr) const {
- if (!getAnchorScope())
- return getWorstState(getBitWidth());
-
- LazyValueInfo *LVI =
- A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>(
- *getAnchorScope());
-
- if (!LVI || !CtxI)
- return getWorstState(getBitWidth());
- return LVI->getConstantRange(&getAssociatedValue(),
- const_cast<BasicBlock *>(CtxI->getParent()),
- const_cast<Instruction *>(CtxI));
- }
-
- /// See AAValueConstantRange::getKnownConstantRange(..).
- ConstantRange
- getKnownConstantRange(Attributor &A,
- const Instruction *CtxI = nullptr) const override {
- if (!CtxI || CtxI == getCtxI())
- return getKnown();
-
- ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
- ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
- return getKnown().intersectWith(SCEVR).intersectWith(LVIR);
}
- /// See AAValueConstantRange::getAssumedConstantRange(..).
- ConstantRange
- getAssumedConstantRange(Attributor &A,
- const Instruction *CtxI = nullptr) const override {
- // TODO: Make SCEV use Attributor assumptions.
- // We may be able to bound a variable range via assumptions in the
- // Attributor, e.g., if x is assumed to be in [1, 3] and y is known to
- // evolve to x^2 + x, then we can say that y is in [2, 12].
-
- if (!CtxI || CtxI == getCtxI())
- return getAssumed();
-
- ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
- ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
- return getAssumed().intersectWith(SCEVR).intersectWith(LVIR);
- }
-
- /// See AbstractAttribute::initialize(..).
- void initialize(Attributor &A) override {
- // Intersect a range given by SCEV.
- intersectKnown(getConstantRangeFromSCEV(A, getCtxI()));
-
- // Intersect a range given by LVI.
- intersectKnown(getConstantRangeFromLVI(A, getCtxI()));
- }
-
- /// Helper function to create MDNode for range metadata.
- static MDNode *
- getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx,
- const ConstantRange &AssumedConstantRange) {
- Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get(
- Ty, AssumedConstantRange.getLower())),
- ConstantAsMetadata::get(ConstantInt::get(
- Ty, AssumedConstantRange.getUpper()))};
- return MDNode::get(Ctx, LowAndHigh);
- }
-
- /// Return true if \p Assumed is a strictly better range than \p KnownRanges.
- static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) {
-
- if (Assumed.isFullSet())
- return false;
-
- if (!KnownRanges)
- return true;
-
- // If multiple ranges are annotated in the IR, we give up on annotating the
- // assumed range for now.
-
- // TODO: If there exists a known range which contains the assumed range, we
- // can say the assumed range is better.
- if (KnownRanges->getNumOperands() > 2)
- return false;
-
- ConstantInt *Lower =
- mdconst::extract<ConstantInt>(KnownRanges->getOperand(0));
- ConstantInt *Upper =
- mdconst::extract<ConstantInt>(KnownRanges->getOperand(1));
-
- ConstantRange Known(Lower->getValue(), Upper->getValue());
- return Known.contains(Assumed) && Known != Assumed;
- }
-
- /// Helper function to set range metadata.
- static bool
- setRangeMetadataIfisBetterRange(Instruction *I,
- const ConstantRange &AssumedConstantRange) {
- auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range);
- if (isBetterRange(AssumedConstantRange, OldRangeMD)) {
- if (!AssumedConstantRange.isEmptySet()) {
- I->setMetadata(LLVMContext::MD_range,
- getMDNodeForConstantRange(I->getType(), I->getContext(),
- AssumedConstantRange));
- return true;
- }
- }
- return false;
- }
-
- /// See AbstractAttribute::manifest()
- ChangeStatus manifest(Attributor &A) override {
- ChangeStatus Changed = ChangeStatus::UNCHANGED;
- ConstantRange AssumedConstantRange = getAssumedConstantRange(A);
- assert(!AssumedConstantRange.isFullSet() && "Invalid state");
-
- auto &V = getAssociatedValue();
- if (!AssumedConstantRange.isEmptySet() &&
- !AssumedConstantRange.isSingleElement()) {
- if (Instruction *I = dyn_cast<Instruction>(&V))
- if (isa<CallInst>(I) || isa<LoadInst>(I))
- if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange))
- Changed = ChangeStatus::CHANGED;
- }
-
- return Changed;
- }
-};
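
Condensed from getMDNodeForConstantRange() and the manifest() above, a sketch of how an assumed range ends up as !range metadata on a load or call. The helper name is illustrative, and the guards against full, empty, and single-element ranges are omitted here.

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"

using namespace llvm;

// Illustrative only: attach [Lower, Upper) of \p R as !range metadata to \p I.
static void annotateAssumedRange(Instruction &I, const ConstantRange &R) {
  Type *Ty = I.getType();
  Metadata *LowAndHigh[] = {
      ConstantAsMetadata::get(ConstantInt::get(Ty, R.getLower())),
      ConstantAsMetadata::get(ConstantInt::get(Ty, R.getUpper()))};
  I.setMetadata(LLVMContext::MD_range, MDNode::get(I.getContext(), LowAndHigh));
}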
-
-struct AAValueConstantRangeArgument final : public AAValueConstantRangeImpl {
-
- AAValueConstantRangeArgument(const IRPosition &IRP)
- : AAValueConstantRangeImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Use AAArgumentFromCallSiteArguments
-
- IntegerRangeState S(getBitWidth());
- clampCallSiteArgumentStates<AAValueConstantRange, IntegerRangeState>(
- A, *this, S);
-
- // TODO: If we know we visited all incoming values, and thus none are
- // assumed dead, we can take the known information from the state T.
- return clampStateAndIndicateChange<IntegerRangeState>(this->getState(), S);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_ARG_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeReturned : AAValueConstantRangeImpl {
- AAValueConstantRangeReturned(const IRPosition &IRP)
- : AAValueConstantRangeImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- // TODO: Use AAReturnedFromReturnedValues
-
- // TODO: If we know we visited all returned values, and thus none are
- // assumed dead, we can take the known information from the state T.
-
- IntegerRangeState S(getBitWidth());
-
- clampReturnedValueStates<AAValueConstantRange, IntegerRangeState>(A, *this,
- S);
- return clampStateAndIndicateChange<StateType>(this->getState(), S);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FNRET_ATTR(value_range)
- }
-};
-
-struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
- AAValueConstantRangeFloating(const IRPosition &IRP)
- : AAValueConstantRangeImpl(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- AAValueConstantRange::initialize(A);
- Value &V = getAssociatedValue();
-
- if (auto *C = dyn_cast<ConstantInt>(&V)) {
- unionAssumed(ConstantRange(C->getValue()));
- indicateOptimisticFixpoint();
- return;
- }
-
- if (isa<UndefValue>(&V)) {
- indicateOptimisticFixpoint();
- return;
- }
-
- if (auto *I = dyn_cast<Instruction>(&V))
- if (isa<BinaryOperator>(I) || isa<CmpInst>(I)) {
- Value *LHS = I->getOperand(0);
- Value *RHS = I->getOperand(1);
-
- if (LHS->getType()->isIntegerTy() && RHS->getType()->isIntegerTy())
- return;
- }
-
- // If it is a load instruction with range metadata, use it.
- if (LoadInst *LI = dyn_cast<LoadInst>(&V))
- if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) {
- intersectKnown(getConstantRangeFromMetadata(*RangeMD));
- return;
- }
-
- // Otherwise we give up.
- indicatePessimisticFixpoint();
-
- LLVM_DEBUG(dbgs() << "[Attributor][AAValueConstantRange] We give up: "
- << getAssociatedValue());
- }
-
- bool calculateBinaryOperator(Attributor &A, BinaryOperator *BinOp,
- IntegerRangeState &T, Instruction *CtxI) {
- Value *LHS = BinOp->getOperand(0);
- Value *RHS = BinOp->getOperand(1);
-
- auto &LHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
- auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
-
- auto &RHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
- auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
-
- auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange);
-
- T.unionAssumed(AssumedRange);
-
- // TODO: Track a known state too.
-
- return T.isValidState();
- }
-
- bool calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T,
- Instruction *CtxI) {
- Value *LHS = CmpI->getOperand(0);
- Value *RHS = CmpI->getOperand(1);
-
- auto &LHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
- auto &RHSAA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
-
- auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
- auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
-
- // If one of them is an empty set, we can't decide.
- if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet())
- return true;
-
- bool MustTrue = false, MustFalse = false;
-
- auto AllowedRegion =
- ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange);
-
- auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion(
- CmpI->getPredicate(), RHSAARange);
-
- if (AllowedRegion.intersectWith(LHSAARange).isEmptySet())
- MustFalse = true;
-
- if (SatisfyingRegion.contains(LHSAARange))
- MustTrue = true;
-
- assert((!MustTrue || !MustFalse) &&
- "Either MustTrue or MustFalse should be false!");
-
- if (MustTrue)
- T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1)));
- else if (MustFalse)
- T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0)));
- else
- T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true));
-
- LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA
- << " " << RHSAA << "\n");
-
- // TODO: Track a known state too.
- return T.isValidState();
- }
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- Instruction *CtxI = getCtxI();
- auto VisitValueCB = [&](Value &V, IntegerRangeState &T,
- bool Stripped) -> bool {
- Instruction *I = dyn_cast<Instruction>(&V);
- if (!I) {
-
- // If the value is not an instruction, we query the Attributor for its AA.
- const auto &AA =
- A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V));
-
- // The clamp operator is not used so that the program point CtxI is utilized.
- T.unionAssumed(AA.getAssumedConstantRange(A, CtxI));
-
- return T.isValidState();
- }
-
- if (auto *BinOp = dyn_cast<BinaryOperator>(I))
- return calculateBinaryOperator(A, BinOp, T, CtxI);
- else if (auto *CmpI = dyn_cast<CmpInst>(I))
- return calculateCmpInst(A, CmpI, T, CtxI);
- else {
- // Give up on other instructions.
- // TODO: Add other instructions
-
- T.indicatePessimisticFixpoint();
- return false;
- }
- };
-
- IntegerRangeState T(getBitWidth());
-
- if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>(
- A, getIRPosition(), *this, T, VisitValueCB))
- return indicatePessimisticFixpoint();
-
- return clampStateAndIndicateChange(getState(), T);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_FLOATING_ATTR(value_range)
- }
-};
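
The calculateCmpInst() logic above can be hard to picture, so here is a standalone example with made-up ranges: with LHS known to lie in [0, 10) and RHS in [10, 20), an unsigned less-than comparison is provably true.

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

// Illustrative only: decide whether "LHS ult RHS" must be true or false given
// the two value ranges, mirroring the allowed/satisfying region reasoning.
static void cmpRangeExample() {
  ConstantRange LHS(APInt(32, 0), APInt(32, 10));  // LHS in [0, 10)
  ConstantRange RHS(APInt(32, 10), APInt(32, 20)); // RHS in [10, 20)

  // LHS values that could satisfy the predicate for *some* RHS value.
  ConstantRange Allowed =
      ConstantRange::makeAllowedICmpRegion(CmpInst::ICMP_ULT, RHS);
  // LHS values that satisfy the predicate for *every* RHS value.
  ConstantRange Satisfying =
      ConstantRange::makeSatisfyingICmpRegion(CmpInst::ICMP_ULT, RHS);

  bool MustFalse = Allowed.intersectWith(LHS).isEmptySet(); // false here
  bool MustTrue = Satisfying.contains(LHS);                 // true: 9 < 10
  (void)MustFalse;
  (void)MustTrue;
}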
-
-struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
- AAValueConstantRangeFunction(const IRPosition &IRP)
- : AAValueConstantRangeImpl(IRP) {}
-
- /// See AbstractAttribute::updateImpl(...).
- ChangeStatus updateImpl(Attributor &A) override {
- llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will "
- "not be called");
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) }
-};
-
-struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction {
- AAValueConstantRangeCallSite(const IRPosition &IRP)
- : AAValueConstantRangeFunction(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) }
-};
-
-struct AAValueConstantRangeCallSiteReturned : AAValueConstantRangeReturned {
- AAValueConstantRangeCallSiteReturned(const IRPosition &IRP)
- : AAValueConstantRangeReturned(IRP) {}
-
- /// See AbstractAttribute::initialize(...).
- void initialize(Attributor &A) override {
- // If it is a call instruction with range metadata, use the metadata.
- if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue()))
- if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range))
- intersectKnown(getConstantRangeFromMetadata(*RangeMD));
-
- AAValueConstantRangeReturned::initialize(A);
- }
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSRET_ATTR(value_range)
- }
-};
-struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating {
- AAValueConstantRangeCallSiteArgument(const IRPosition &IRP)
- : AAValueConstantRangeFloating(IRP) {}
-
- /// See AbstractAttribute::trackStatistics()
- void trackStatistics() const override {
- STATS_DECLTRACK_CSARG_ATTR(value_range)
- }
-};
-/// ----------------------------------------------------------------------------
-/// Attributor
-/// ----------------------------------------------------------------------------
-
-bool Attributor::isAssumedDead(const AbstractAttribute &AA,
- const AAIsDead *LivenessAA) {
- const Instruction *CtxI = AA.getIRPosition().getCtxI();
- if (!CtxI)
- return false;
-
- // TODO: Find a good way to utilize fine and coarse grained liveness
- // information.
- if (!LivenessAA)
- LivenessAA =
- &getAAFor<AAIsDead>(AA, IRPosition::function(*CtxI->getFunction()),
- /* TrackDependence */ false);
-
- // Don't check liveness for AAIsDead.
- if (&AA == LivenessAA)
- return false;
-
- if (!LivenessAA->isAssumedDead(CtxI))
- return false;
-
- // We actually used liveness information so we have to record a dependence.
- recordDependence(*LivenessAA, AA, DepClassTy::OPTIONAL);
-
- return true;
-}
-
-bool Attributor::checkForAllUses(
- const function_ref<bool(const Use &, bool &)> &Pred,
- const AbstractAttribute &QueryingAA, const Value &V) {
const IRPosition &IRP = QueryingAA.getIRPosition();
SmallVector<const Use *, 16> Worklist;
SmallPtrSet<const Use *, 16> Visited;
@@ -5601,10 +646,6 @@ bool Attributor::checkForAllUses(
LLVM_DEBUG(dbgs() << "[Attributor] Got " << Worklist.size()
<< " initial uses to check\n");
- if (Worklist.empty())
- return true;
-
- bool AnyDead = false;
const Function *ScopeFn = IRP.getAnchorScope();
const auto *LivenessAA =
ScopeFn ? &getAAFor<AAIsDead>(QueryingAA, IRPosition::function(*ScopeFn),
@@ -5615,14 +656,17 @@ bool Attributor::checkForAllUses(
const Use *U = Worklist.pop_back_val();
if (!Visited.insert(U).second)
continue;
- LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << "\n");
- if (Instruction *UserI = dyn_cast<Instruction>(U->getUser()))
- if (LivenessAA && LivenessAA->isAssumedDead(UserI)) {
- LLVM_DEBUG(dbgs() << "[Attributor] Dead user: " << *UserI << ": "
- << *LivenessAA << "\n");
- AnyDead = true;
- continue;
- }
+ LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << **U << " in "
+ << *U->getUser() << "\n");
+ if (isAssumedDead(*U, &QueryingAA, LivenessAA,
+ /* CheckBBLivenessOnly */ false, LivenessDepClass)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
+ continue;
+ }
+ if (U->getUser()->isDroppable()) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Droppable user, skip!\n");
+ continue;
+ }
bool Follow = false;
if (!Pred(*U, Follow))
@@ -5633,15 +677,13 @@ bool Attributor::checkForAllUses(
Worklist.push_back(&UU);
}
- if (AnyDead)
- recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
-
return true;
}
-bool Attributor::checkForAllCallSites(
- const function_ref<bool(AbstractCallSite)> &Pred,
- const AbstractAttribute &QueryingAA, bool RequireAllCallSites) {
+bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
+ const AbstractAttribute &QueryingAA,
+ bool RequireAllCallSites,
+ bool &AllCallSitesKnown) {
// We can try to determine information from the call sites. However, this is
// only possible if all call sites are known, hence the function must have
// internal linkage.
@@ -5650,25 +692,49 @@ bool Attributor::checkForAllCallSites(
if (!AssociatedFunction) {
LLVM_DEBUG(dbgs() << "[Attributor] No function associated with " << IRP
<< "\n");
+ AllCallSitesKnown = false;
return false;
}
return checkForAllCallSites(Pred, *AssociatedFunction, RequireAllCallSites,
- &QueryingAA);
+ &QueryingAA, AllCallSitesKnown);
}
-bool Attributor::checkForAllCallSites(
- const function_ref<bool(AbstractCallSite)> &Pred, const Function &Fn,
- bool RequireAllCallSites, const AbstractAttribute *QueryingAA) {
+bool Attributor::checkForAllCallSites(function_ref<bool(AbstractCallSite)> Pred,
+ const Function &Fn,
+ bool RequireAllCallSites,
+ const AbstractAttribute *QueryingAA,
+ bool &AllCallSitesKnown) {
if (RequireAllCallSites && !Fn.hasLocalLinkage()) {
LLVM_DEBUG(
dbgs()
<< "[Attributor] Function " << Fn.getName()
<< " has no internal linkage, hence not all call sites are known\n");
+ AllCallSitesKnown = false;
return false;
}
- for (const Use &U : Fn.uses()) {
+ // If we do not require all call sites we might not see all.
+ AllCallSitesKnown = RequireAllCallSites;
+
+ SmallVector<const Use *, 8> Uses(make_pointer_range(Fn.uses()));
+ for (unsigned u = 0; u < Uses.size(); ++u) {
+ const Use &U = *Uses[u];
+ LLVM_DEBUG(dbgs() << "[Attributor] Check use: " << *U << " in "
+ << *U.getUser() << "\n");
+ if (isAssumedDead(U, QueryingAA, nullptr, /* CheckBBLivenessOnly */ true)) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Dead use, skip!\n");
+ continue;
+ }
+ if (ConstantExpr *CE = dyn_cast<ConstantExpr>(U.getUser())) {
+ if (CE->isCast() && CE->getType()->isPointerTy() &&
+ CE->getType()->getPointerElementType()->isFunctionTy()) {
+ for (const Use &CEU : CE->uses())
+ Uses.push_back(&CEU);
+ continue;
+ }
+ }
+
AbstractCallSite ACS(&U);
if (!ACS) {
LLVM_DEBUG(dbgs() << "[Attributor] Function " << Fn.getName()
@@ -5680,22 +746,6 @@ bool Attributor::checkForAllCallSites(
return false;
}
- Instruction *I = ACS.getInstruction();
- Function *Caller = I->getFunction();
-
- const auto *LivenessAA =
- lookupAAFor<AAIsDead>(IRPosition::function(*Caller), QueryingAA,
- /* TrackDependence */ false);
-
- // Skip dead calls.
- if (LivenessAA && LivenessAA->isAssumedDead(I)) {
- // We actually used liveness information so we have to record a
- // dependence.
- if (QueryingAA)
- recordDependence(*LivenessAA, *QueryingAA, DepClassTy::OPTIONAL);
- continue;
- }
-
const Use *EffectiveUse =
ACS.isCallbackCall() ? &ACS.getCalleeUseForCallback() : &U;
if (!ACS.isCallee(EffectiveUse)) {
@@ -5706,6 +756,24 @@ bool Attributor::checkForAllCallSites(
return false;
}
+ // Make sure the arguments that can be matched between the call site and the
+ // callee agree on their type. It is unlikely they do not, and it doesn't
+ // make sense for all attributes to know/care about this.
+ assert(&Fn == ACS.getCalledFunction() && "Expected known callee");
+ unsigned MinArgsParams =
+ std::min(size_t(ACS.getNumArgOperands()), Fn.arg_size());
+ for (unsigned u = 0; u < MinArgsParams; ++u) {
+ Value *CSArgOp = ACS.getCallArgOperand(u);
+ if (CSArgOp && Fn.getArg(u)->getType() != CSArgOp->getType()) {
+ LLVM_DEBUG(
+ dbgs() << "[Attributor] Call site / callee argument type mismatch ["
+ << u << "@" << Fn.getName() << ": "
+ << *Fn.getArg(u)->getType() << " vs. "
+ << *ACS.getCallArgOperand(u)->getType() << "\n");
+ return false;
+ }
+ }
+
if (Pred(ACS))
continue;
@@ -5718,8 +786,7 @@ bool Attributor::checkForAllCallSites(
}
bool Attributor::checkForAllReturnedValuesAndReturnInsts(
- const function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)>
- &Pred,
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred,
const AbstractAttribute &QueryingAA) {
const IRPosition &IRP = QueryingAA.getIRPosition();
@@ -5741,8 +808,7 @@ bool Attributor::checkForAllReturnedValuesAndReturnInsts(
}
bool Attributor::checkForAllReturnedValues(
- const function_ref<bool(Value &)> &Pred,
- const AbstractAttribute &QueryingAA) {
+ function_ref<bool(Value &)> Pred, const AbstractAttribute &QueryingAA) {
const IRPosition &IRP = QueryingAA.getIRPosition();
const Function *AssociatedFunction = IRP.getAssociatedFunction();
@@ -5761,18 +827,22 @@ bool Attributor::checkForAllReturnedValues(
});
}
-static bool
-checkForAllInstructionsImpl(InformationCache::OpcodeInstMapTy &OpcodeInstMap,
- const function_ref<bool(Instruction &)> &Pred,
- const AAIsDead *LivenessAA, bool &AnyDead,
- const ArrayRef<unsigned> &Opcodes) {
+static bool checkForAllInstructionsImpl(
+ Attributor *A, InformationCache::OpcodeInstMapTy &OpcodeInstMap,
+ function_ref<bool(Instruction &)> Pred, const AbstractAttribute *QueryingAA,
+ const AAIsDead *LivenessAA, const ArrayRef<unsigned> &Opcodes,
+ bool CheckBBLivenessOnly = false) {
for (unsigned Opcode : Opcodes) {
- for (Instruction *I : OpcodeInstMap[Opcode]) {
+ // Check if we have instructions with this opcode at all first.
+ auto *Insts = OpcodeInstMap.lookup(Opcode);
+ if (!Insts)
+ continue;
+
+ for (Instruction *I : *Insts) {
// Skip dead instructions.
- if (LivenessAA && LivenessAA->isAssumedDead(I)) {
- AnyDead = true;
+ if (A && A->isAssumedDead(IRPosition::value(*I), QueryingAA, LivenessAA,
+ CheckBBLivenessOnly))
continue;
- }
if (!Pred(*I))
return false;
@@ -5781,9 +851,10 @@ checkForAllInstructionsImpl(InformationCache::OpcodeInstMapTy &OpcodeInstMap,
return true;
}
-bool Attributor::checkForAllInstructions(
- const llvm::function_ref<bool(Instruction &)> &Pred,
- const AbstractAttribute &QueryingAA, const ArrayRef<unsigned> &Opcodes) {
+bool Attributor::checkForAllInstructions(function_ref<bool(Instruction &)> Pred,
+ const AbstractAttribute &QueryingAA,
+ const ArrayRef<unsigned> &Opcodes,
+ bool CheckBBLivenessOnly) {
const IRPosition &IRP = QueryingAA.getIRPosition();
// Since we need to provide instructions we have to have an exact definition.
@@ -5795,24 +866,18 @@ bool Attributor::checkForAllInstructions(
const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
const auto &LivenessAA =
getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false);
- bool AnyDead = false;
auto &OpcodeInstMap =
InfoCache.getOpcodeInstMapForFunction(*AssociatedFunction);
- if (!checkForAllInstructionsImpl(OpcodeInstMap, Pred, &LivenessAA, AnyDead,
- Opcodes))
+ if (!checkForAllInstructionsImpl(this, OpcodeInstMap, Pred, &QueryingAA,
+ &LivenessAA, Opcodes, CheckBBLivenessOnly))
return false;
- // We actually used liveness information, so we have to record a dependence.
- if (AnyDead)
- recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
-
return true;
}
bool Attributor::checkForAllReadWriteInstructions(
- const llvm::function_ref<bool(Instruction &)> &Pred,
- AbstractAttribute &QueryingAA) {
+ function_ref<bool(Instruction &)> Pred, AbstractAttribute &QueryingAA) {
const Function *AssociatedFunction =
QueryingAA.getIRPosition().getAssociatedFunction();
@@ -5823,28 +888,21 @@ bool Attributor::checkForAllReadWriteInstructions(
const IRPosition &QueryIRP = IRPosition::function(*AssociatedFunction);
const auto &LivenessAA =
getAAFor<AAIsDead>(QueryingAA, QueryIRP, /* TrackDependence */ false);
- bool AnyDead = false;
for (Instruction *I :
InfoCache.getReadOrWriteInstsForFunction(*AssociatedFunction)) {
// Skip dead instructions.
- if (LivenessAA.isAssumedDead(I)) {
- AnyDead = true;
+ if (isAssumedDead(IRPosition::value(*I), &QueryingAA, &LivenessAA))
continue;
- }
if (!Pred(*I))
return false;
}
- // We actually used liveness information, so we have to record a dependence.
- if (AnyDead)
- recordDependence(LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
-
return true;
}
-ChangeStatus Attributor::run(Module &M) {
+void Attributor::runTillFixpoint() {
LLVM_DEBUG(dbgs() << "[Attributor] Identified and initialized "
<< AllAbstractAttributes.size()
<< " abstract attributes.\n");
@@ -5854,12 +912,10 @@ ChangeStatus Attributor::run(Module &M) {
unsigned IterationCounter = 1;
- SmallVector<AbstractAttribute *, 64> ChangedAAs;
+ SmallVector<AbstractAttribute *, 32> ChangedAAs;
SetVector<AbstractAttribute *> Worklist, InvalidAAs;
Worklist.insert(AllAbstractAttributes.begin(), AllAbstractAttributes.end());
- bool RecomputeDependences = false;
-
do {
// Remember the size to determine new attributes.
size_t NumAAs = AllAbstractAttributes.size();
@@ -5871,44 +927,35 @@ ChangeStatus Attributor::run(Module &M) {
// to run updates.
for (unsigned u = 0; u < InvalidAAs.size(); ++u) {
AbstractAttribute *InvalidAA = InvalidAAs[u];
- auto &QuerriedAAs = QueryMap[InvalidAA];
+
+ // Check the dependences to fast track invalidation.
LLVM_DEBUG(dbgs() << "[Attributor] InvalidAA: " << *InvalidAA << " has "
- << QuerriedAAs.RequiredAAs.size() << "/"
- << QuerriedAAs.OptionalAAs.size()
- << " required/optional dependences\n");
- for (AbstractAttribute *DepOnInvalidAA : QuerriedAAs.RequiredAAs) {
- AbstractState &DOIAAState = DepOnInvalidAA->getState();
- DOIAAState.indicatePessimisticFixpoint();
- ++NumAttributesFixedDueToRequiredDependences;
- assert(DOIAAState.isAtFixpoint() && "Expected fixpoint state!");
- if (!DOIAAState.isValidState())
- InvalidAAs.insert(DepOnInvalidAA);
+ << InvalidAA->Deps.size()
+ << " required & optional dependences\n");
+ while (!InvalidAA->Deps.empty()) {
+ const auto &Dep = InvalidAA->Deps.back();
+ InvalidAA->Deps.pop_back();
+ AbstractAttribute *DepAA = Dep.getPointer();
+ if (Dep.getInt() == unsigned(DepClassTy::OPTIONAL)) {
+ Worklist.insert(DepAA);
+ continue;
+ }
+ DepAA->getState().indicatePessimisticFixpoint();
+ assert(DepAA->getState().isAtFixpoint() && "Expected fixpoint state!");
+ if (!DepAA->getState().isValidState())
+ InvalidAAs.insert(DepAA);
+ else
+ ChangedAAs.push_back(DepAA);
}
- if (!RecomputeDependences)
- Worklist.insert(QuerriedAAs.OptionalAAs.begin(),
- QuerriedAAs.OptionalAAs.end());
- }
-
- // If dependences (=QueryMap) are recomputed we have to look at all abstract
- // attributes again, regardless of what changed in the last iteration.
- if (RecomputeDependences) {
- LLVM_DEBUG(
- dbgs() << "[Attributor] Run all AAs to recompute dependences\n");
- QueryMap.clear();
- ChangedAAs.clear();
- Worklist.insert(AllAbstractAttributes.begin(),
- AllAbstractAttributes.end());
}
// Add all abstract attributes that are potentially dependent on one that
// changed to the work list.
- for (AbstractAttribute *ChangedAA : ChangedAAs) {
- auto &QuerriedAAs = QueryMap[ChangedAA];
- Worklist.insert(QuerriedAAs.OptionalAAs.begin(),
- QuerriedAAs.OptionalAAs.end());
- Worklist.insert(QuerriedAAs.RequiredAAs.begin(),
- QuerriedAAs.RequiredAAs.end());
- }
+ for (AbstractAttribute *ChangedAA : ChangedAAs)
+ while (!ChangedAA->Deps.empty()) {
+ Worklist.insert(ChangedAA->Deps.back().getPointer());
+ ChangedAA->Deps.pop_back();
+ }
LLVM_DEBUG(dbgs() << "[Attributor] #Iteration: " << IterationCounter
<< ", Worklist+Dependent size: " << Worklist.size()
@@ -5920,23 +967,17 @@ ChangeStatus Attributor::run(Module &M) {
// Update all abstract attributes in the work list and record the ones that
// changed.
- for (AbstractAttribute *AA : Worklist)
- if (!AA->getState().isAtFixpoint() && !isAssumedDead(*AA, nullptr)) {
- QueriedNonFixAA = false;
- if (AA->update(*this) == ChangeStatus::CHANGED) {
+ for (AbstractAttribute *AA : Worklist) {
+ const auto &AAState = AA->getState();
+ if (!AAState.isAtFixpoint())
+ if (updateAA(*AA) == ChangeStatus::CHANGED)
ChangedAAs.push_back(AA);
- if (!AA->getState().isValidState())
- InvalidAAs.insert(AA);
- } else if (!QueriedNonFixAA) {
- // If the attribute did not query any non-fix information, the state
- // will not change and we can indicate that right away.
- AA->getState().indicateOptimisticFixpoint();
- }
- }
- // Check if we recompute the dependences in the next iteration.
- RecomputeDependences = (DepRecomputeInterval > 0 &&
- IterationCounter % DepRecomputeInterval == 0);
+ // Use the InvalidAAs vector to quickly propagate invalid states transitively
+ // without requiring updates.
+ if (!AAState.isValidState())
+ InvalidAAs.insert(AA);
+ }
// Add attributes to the changed set if they have been created in the last
// iteration.
@@ -5955,8 +996,6 @@ ChangeStatus Attributor::run(Module &M) {
<< IterationCounter << "/" << MaxFixpointIterations
<< " iterations\n");
- size_t NumFinalAAs = AllAbstractAttributes.size();
-
// Reset abstract arguments not settled in a sound fixpoint by now. This
// happens when we stopped the fixpoint iteration early. Note that only the
// ones marked as "changed" *and* the ones transitively depending on them
@@ -5975,11 +1014,10 @@ ChangeStatus Attributor::run(Module &M) {
NumAttributesTimedOut++;
}
- auto &QuerriedAAs = QueryMap[ChangedAA];
- ChangedAAs.append(QuerriedAAs.OptionalAAs.begin(),
- QuerriedAAs.OptionalAAs.end());
- ChangedAAs.append(QuerriedAAs.RequiredAAs.begin(),
- QuerriedAAs.RequiredAAs.end());
+ while (!ChangedAA->Deps.empty()) {
+ ChangedAAs.push_back(ChangedAA->Deps.back().getPointer());
+ ChangedAA->Deps.pop_back();
+ }
}
LLVM_DEBUG({
@@ -5988,6 +1026,19 @@ ChangeStatus Attributor::run(Module &M) {
<< " abstract attributes.\n";
});
+ if (VerifyMaxFixpointIterations &&
+ IterationCounter != MaxFixpointIterations) {
+ errs() << "\n[Attributor] Fixpoint iteration done after: "
+ << IterationCounter << "/" << MaxFixpointIterations
+ << " iterations\n";
+ llvm_unreachable("The fixpoint was not reached with exactly the number of "
+ "specified iterations!");
+ }
+}
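
runTillFixpoint() is essentially a dependence-driven worklist algorithm; the schematic (non-LLVM) sketch below shows the shape of that loop with made-up Node and update() stand-ins, leaving out the iteration cap and the invalid-state fast path.

#include <functional>
#include <unordered_set>
#include <vector>

// Schematic sketch only: re-run update() on an element whenever something it
// depends on changes, until no element changes anymore.
struct Node {
  std::vector<Node *> Dependents; // elements that queried this one
  std::function<bool()> update;   // returns true if the state changed
};

static void runTillFixpointSketch(const std::vector<Node *> &All) {
  std::vector<Node *> Worklist(All);
  std::unordered_set<Node *> Queued(All.begin(), All.end());
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    Queued.erase(N);
    if (!N->update())
      continue;
    // N changed, so everything that depends on it may change as well.
    for (Node *Dep : N->Dependents)
      if (Queued.insert(Dep).second)
        Worklist.push_back(Dep);
  }
}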
+
+ChangeStatus Attributor::manifestAttributes() {
+ size_t NumFinalAAs = AllAbstractAttributes.size();
+
unsigned NumManifested = 0;
unsigned NumAtFixpoint = 0;
ChangeStatus ManifestChange = ChangeStatus::UNCHANGED;
@@ -6006,12 +1057,14 @@ ChangeStatus Attributor::run(Module &M) {
continue;
// Skip dead code.
- if (isAssumedDead(*AA, nullptr))
+ if (isAssumedDead(*AA, nullptr, /* CheckBBLivenessOnly */ true))
continue;
// Manifest the state and record if we changed the IR.
ChangeStatus LocalChange = AA->manifest(*this);
if (LocalChange == ChangeStatus::CHANGED && AreStatisticsEnabled())
AA->trackStatistics();
+ LLVM_DEBUG(dbgs() << "[Attributor] Manifest " << LocalChange << " : " << *AA
+ << "\n");
ManifestChange = ManifestChange | LocalChange;
@@ -6029,160 +1082,298 @@ ChangeStatus Attributor::run(Module &M) {
NumAttributesValidFixpoint += NumAtFixpoint;
(void)NumFinalAAs;
- assert(
- NumFinalAAs == AllAbstractAttributes.size() &&
- "Expected the final number of abstract attributes to remain unchanged!");
+ if (NumFinalAAs != AllAbstractAttributes.size()) {
+ for (unsigned u = NumFinalAAs; u < AllAbstractAttributes.size(); ++u)
+ errs() << "Unexpected abstract attribute: " << *AllAbstractAttributes[u]
+ << " :: "
+ << AllAbstractAttributes[u]->getIRPosition().getAssociatedValue()
+ << "\n";
+ llvm_unreachable("Expected the final number of abstract attributes to "
+ "remain unchanged!");
+ }
+ return ManifestChange;
+}
+ChangeStatus Attributor::cleanupIR() {
// Delete things at the end to avoid invalid references and to keep a nice order.
- {
- LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least "
- << ToBeDeletedFunctions.size() << " functions and "
- << ToBeDeletedBlocks.size() << " blocks and "
- << ToBeDeletedInsts.size() << " instructions and "
- << ToBeChangedUses.size() << " uses\n");
-
- SmallVector<Instruction *, 32> DeadInsts;
- SmallVector<Instruction *, 32> TerminatorsToFold;
-
- for (auto &It : ToBeChangedUses) {
- Use *U = It.first;
- Value *NewV = It.second;
- Value *OldV = U->get();
- LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser()
- << " instead of " << *OldV << "\n");
- U->set(NewV);
- if (Instruction *I = dyn_cast<Instruction>(OldV))
- if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) &&
- isInstructionTriviallyDead(I)) {
- DeadInsts.push_back(I);
- }
- if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
- Instruction *UserI = cast<Instruction>(U->getUser());
- if (isa<UndefValue>(NewV)) {
- ToBeChangedToUnreachableInsts.insert(UserI);
- } else {
- TerminatorsToFold.push_back(UserI);
- }
+ LLVM_DEBUG(dbgs() << "\n[Attributor] Delete at least "
+ << ToBeDeletedFunctions.size() << " functions and "
+ << ToBeDeletedBlocks.size() << " blocks and "
+ << ToBeDeletedInsts.size() << " instructions and "
+ << ToBeChangedUses.size() << " uses\n");
+
+ SmallVector<WeakTrackingVH, 32> DeadInsts;
+ SmallVector<Instruction *, 32> TerminatorsToFold;
+
+ for (auto &It : ToBeChangedUses) {
+ Use *U = It.first;
+ Value *NewV = It.second;
+ Value *OldV = U->get();
+
+    // Do not replace uses in returns if the value is a must-tail call that we
+    // will not delete.
+ if (isa<ReturnInst>(U->getUser()))
+ if (auto *CI = dyn_cast<CallInst>(OldV->stripPointerCasts()))
+ if (CI->isMustTailCall() && !ToBeDeletedInsts.count(CI))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "Use " << *NewV << " in " << *U->getUser()
+ << " instead of " << *OldV << "\n");
+ U->set(NewV);
+ // Do not modify call instructions outside the SCC.
+ if (auto *CB = dyn_cast<CallBase>(OldV))
+ if (!Functions.count(CB->getCaller()))
+ continue;
+ if (Instruction *I = dyn_cast<Instruction>(OldV)) {
+ CGModifiedFunctions.insert(I->getFunction());
+ if (!isa<PHINode>(I) && !ToBeDeletedInsts.count(I) &&
+ isInstructionTriviallyDead(I))
+ DeadInsts.push_back(I);
+ }
+ if (isa<Constant>(NewV) && isa<BranchInst>(U->getUser())) {
+ Instruction *UserI = cast<Instruction>(U->getUser());
+ if (isa<UndefValue>(NewV)) {
+ ToBeChangedToUnreachableInsts.insert(UserI);
+ } else {
+ TerminatorsToFold.push_back(UserI);
}
}
- for (auto &V : InvokeWithDeadSuccessor)
- if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) {
- bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind);
- bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn);
- bool Invoke2CallAllowed =
- !AAIsDeadFunction::mayCatchAsynchronousExceptions(
- *II->getFunction());
- assert((UnwindBBIsDead || NormalBBIsDead) &&
- "Invoke does not have dead successors!");
- BasicBlock *BB = II->getParent();
- BasicBlock *NormalDestBB = II->getNormalDest();
- if (UnwindBBIsDead) {
- Instruction *NormalNextIP = &NormalDestBB->front();
- if (Invoke2CallAllowed) {
- changeToCall(II);
- NormalNextIP = BB->getTerminator();
- }
- if (NormalBBIsDead)
- ToBeChangedToUnreachableInsts.insert(NormalNextIP);
- } else {
- assert(NormalBBIsDead && "Broken invariant!");
- if (!NormalDestBB->getUniquePredecessor())
- NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead");
- ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front());
+ }
+ for (auto &V : InvokeWithDeadSuccessor)
+ if (InvokeInst *II = dyn_cast_or_null<InvokeInst>(V)) {
+ bool UnwindBBIsDead = II->hasFnAttr(Attribute::NoUnwind);
+ bool NormalBBIsDead = II->hasFnAttr(Attribute::NoReturn);
+ bool Invoke2CallAllowed =
+ !AAIsDead::mayCatchAsynchronousExceptions(*II->getFunction());
+ assert((UnwindBBIsDead || NormalBBIsDead) &&
+ "Invoke does not have dead successors!");
+ BasicBlock *BB = II->getParent();
+ BasicBlock *NormalDestBB = II->getNormalDest();
+ if (UnwindBBIsDead) {
+ Instruction *NormalNextIP = &NormalDestBB->front();
+ if (Invoke2CallAllowed) {
+ changeToCall(II);
+ NormalNextIP = BB->getTerminator();
}
+ if (NormalBBIsDead)
+ ToBeChangedToUnreachableInsts.insert(NormalNextIP);
+ } else {
+ assert(NormalBBIsDead && "Broken invariant!");
+ if (!NormalDestBB->getUniquePredecessor())
+ NormalDestBB = SplitBlockPredecessors(NormalDestBB, {BB}, ".dead");
+ ToBeChangedToUnreachableInsts.insert(&NormalDestBB->front());
}
- for (auto &V : ToBeChangedToUnreachableInsts)
- if (Instruction *I = dyn_cast_or_null<Instruction>(V))
- changeToUnreachable(I, /* UseLLVMTrap */ false);
- for (Instruction *I : TerminatorsToFold)
- ConstantFoldTerminator(I->getParent());
-
- for (Instruction *I : ToBeDeletedInsts) {
- I->replaceAllUsesWith(UndefValue::get(I->getType()));
+ }
+ for (Instruction *I : TerminatorsToFold) {
+ CGModifiedFunctions.insert(I->getFunction());
+ ConstantFoldTerminator(I->getParent());
+ }
+ for (auto &V : ToBeChangedToUnreachableInsts)
+ if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ CGModifiedFunctions.insert(I->getFunction());
+ changeToUnreachable(I, /* UseLLVMTrap */ false);
+ }
+
+ for (auto &V : ToBeDeletedInsts) {
+ if (Instruction *I = dyn_cast_or_null<Instruction>(V)) {
+ I->dropDroppableUses();
+ CGModifiedFunctions.insert(I->getFunction());
+ if (!I->getType()->isVoidTy())
+ I->replaceAllUsesWith(UndefValue::get(I->getType()));
if (!isa<PHINode>(I) && isInstructionTriviallyDead(I))
DeadInsts.push_back(I);
else
I->eraseFromParent();
}
+ }
- RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
-
- if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) {
- SmallVector<BasicBlock *, 8> ToBeDeletedBBs;
- ToBeDeletedBBs.reserve(NumDeadBlocks);
- ToBeDeletedBBs.append(ToBeDeletedBlocks.begin(), ToBeDeletedBlocks.end());
- // Actually we do not delete the blocks but squash them into a single
- // unreachable but untangling branches that jump here is something we need
- // to do in a more generic way.
- DetatchDeadBlocks(ToBeDeletedBBs, nullptr);
- STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted.");
- BUILD_STAT_NAME(AAIsDead, BasicBlock) += ToBeDeletedBlocks.size();
- }
+ LLVM_DEBUG(dbgs() << "[Attributor] DeadInsts size: " << DeadInsts.size()
+ << "\n");
- // Identify dead internal functions and delete them. This happens outside
- // the other fixpoint analysis as we might treat potentially dead functions
- // as live to lower the number of iterations. If they happen to be dead, the
- // below fixpoint loop will identify and eliminate them.
- SmallVector<Function *, 8> InternalFns;
- for (Function &F : M)
- if (F.hasLocalLinkage())
- InternalFns.push_back(&F);
-
- bool FoundDeadFn = true;
- while (FoundDeadFn) {
- FoundDeadFn = false;
- for (unsigned u = 0, e = InternalFns.size(); u < e; ++u) {
- Function *F = InternalFns[u];
- if (!F)
- continue;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts);
+
+ if (unsigned NumDeadBlocks = ToBeDeletedBlocks.size()) {
+ SmallVector<BasicBlock *, 8> ToBeDeletedBBs;
+ ToBeDeletedBBs.reserve(NumDeadBlocks);
+ for (BasicBlock *BB : ToBeDeletedBlocks) {
+ CGModifiedFunctions.insert(BB->getParent());
+ ToBeDeletedBBs.push_back(BB);
+ }
+    // We do not actually delete the blocks but squash them into a single
+    // unreachable instruction; untangling branches that jump here is
+    // something we need to do in a more generic way.
+ DetatchDeadBlocks(ToBeDeletedBBs, nullptr);
+ }
+
+ // Identify dead internal functions and delete them. This happens outside
+ // the other fixpoint analysis as we might treat potentially dead functions
+ // as live to lower the number of iterations. If they happen to be dead, the
+ // below fixpoint loop will identify and eliminate them.
+ SmallVector<Function *, 8> InternalFns;
+ for (Function *F : Functions)
+ if (F->hasLocalLinkage())
+ InternalFns.push_back(F);
+
+ bool FoundDeadFn = true;
+ while (FoundDeadFn) {
+ FoundDeadFn = false;
+ for (unsigned u = 0, e = InternalFns.size(); u < e; ++u) {
+ Function *F = InternalFns[u];
+ if (!F)
+ continue;
- if (!checkForAllCallSites(
- [this](AbstractCallSite ACS) {
- return ToBeDeletedFunctions.count(
- ACS.getInstruction()->getFunction());
- },
- *F, true, nullptr))
- continue;
+ bool AllCallSitesKnown;
+ if (!checkForAllCallSites(
+ [this](AbstractCallSite ACS) {
+ return ToBeDeletedFunctions.count(
+ ACS.getInstruction()->getFunction());
+ },
+ *F, true, nullptr, AllCallSitesKnown))
+ continue;
- ToBeDeletedFunctions.insert(F);
- InternalFns[u] = nullptr;
- FoundDeadFn = true;
- }
+ ToBeDeletedFunctions.insert(F);
+ InternalFns[u] = nullptr;
+ FoundDeadFn = true;
}
}
- STATS_DECL(AAIsDead, Function, "Number of dead functions deleted.");
- BUILD_STAT_NAME(AAIsDead, Function) += ToBeDeletedFunctions.size();
-
// Rewrite the functions as requested during manifest.
- ManifestChange = ManifestChange | rewriteFunctionSignatures();
+ ChangeStatus ManifestChange = rewriteFunctionSignatures(CGModifiedFunctions);
- for (Function *Fn : ToBeDeletedFunctions) {
- Fn->deleteBody();
- Fn->replaceAllUsesWith(UndefValue::get(Fn->getType()));
- Fn->eraseFromParent();
- }
+ for (Function *Fn : CGModifiedFunctions)
+ CGUpdater.reanalyzeFunction(*Fn);
- if (VerifyMaxFixpointIterations &&
- IterationCounter != MaxFixpointIterations) {
- errs() << "\n[Attributor] Fixpoint iteration done after: "
- << IterationCounter << "/" << MaxFixpointIterations
- << " iterations\n";
- llvm_unreachable("The fixpoint was not reached with exactly the number of "
- "specified iterations!");
+ for (Function *Fn : ToBeDeletedFunctions)
+ CGUpdater.removeFunction(*Fn);
+
+ NumFnDeleted += ToBeDeletedFunctions.size();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Deleted " << NumFnDeleted
+ << " functions after manifest.\n");
+
+#ifdef EXPENSIVE_CHECKS
+ for (Function *F : Functions) {
+ if (ToBeDeletedFunctions.count(F))
+ continue;
+ assert(!verifyFunction(*F, &errs()) && "Module verification failed!");
}
+#endif
return ManifestChange;
}
-bool Attributor::registerFunctionSignatureRewrite(
- Argument &Arg, ArrayRef<Type *> ReplacementTypes,
- ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB,
- ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) {
+ChangeStatus Attributor::run() {
+ SeedingPeriod = false;
+ runTillFixpoint();
+ ChangeStatus ManifestChange = manifestAttributes();
+ ChangeStatus CleanupChange = cleanupIR();
+ return ManifestChange | CleanupChange;
+}
+
+ChangeStatus Attributor::updateAA(AbstractAttribute &AA) {
+ // Use a new dependence vector for this update.
+ DependenceVector DV;
+ DependenceStack.push_back(&DV);
+
+ auto &AAState = AA.getState();
+ ChangeStatus CS = ChangeStatus::UNCHANGED;
+ if (!isAssumedDead(AA, nullptr, /* CheckBBLivenessOnly */ true))
+ CS = AA.update(*this);
+
+ if (DV.empty()) {
+ // If the attribute did not query any non-fix information, the state
+ // will not change and we can indicate that right away.
+ AAState.indicateOptimisticFixpoint();
+ }
+
+ if (!AAState.isAtFixpoint())
+ rememberDependences();
+
+  // Verify the stack was used properly, that is, we pop the dependence vector
+  // we put there earlier.
+ DependenceVector *PoppedDV = DependenceStack.pop_back_val();
+ (void)PoppedDV;
+ assert(PoppedDV == &DV && "Inconsistent usage of the dependence stack!");
+
+ return CS;
+}
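Editorial note: updateAA above, together with recordDependence and rememberDependences further down in this patch, forms a small per-update dependence scope: a fresh vector is pushed onto DependenceStack for the duration of one update, dependences recorded during that update go into the top-most vector, and they are only committed into the Deps lists if the updated attribute has not reached a fixpoint. A minimal, self-contained sketch of that pattern, using made-up toy types (ToyAttribute, ToyDep, and DependenceScope are illustrative, not LLVM classes), could look like this:

#include <cassert>
#include <vector>

struct ToyAttribute {
  bool AtFixpoint = false;
  std::vector<ToyAttribute *> Deps; // attributes that queried this one
};

struct ToyDep {
  ToyAttribute *From;
  ToyAttribute *To;
};

class DependenceScope {
  std::vector<std::vector<ToyDep> *> Stack;

public:
  // Mirrors the push at the start of updateAA: one fresh vector per update.
  void begin(std::vector<ToyDep> &DV) { Stack.push_back(&DV); }

  // Mirrors recordDependence: outside an update, or for a source attribute
  // already at a fixpoint, nothing is tracked.
  void record(ToyAttribute &From, ToyAttribute &To) {
    if (Stack.empty() || From.AtFixpoint)
      return;
    Stack.back()->push_back({&From, &To});
  }

  // Mirrors rememberDependences plus the pop at the end of updateAA; the
  // recorded entries are committed only when requested.
  void end(std::vector<ToyDep> &DV, bool CommitDeps) {
    if (CommitDeps)
      for (ToyDep &D : DV)
        D.From->Deps.push_back(D.To);
    assert(Stack.back() == &DV && "Inconsistent usage of the scope stack!");
    Stack.pop_back();
  }
};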
+
+/// Create a shallow wrapper for \p F such that \p F has internal linkage
+/// afterwards. It also drops the name of the original \p F, making it
+/// anonymous.
+///
+/// A wrapper is a function with the same type (and attributes) as \p F
+/// that will only call \p F and return the result, if any.
+///
+/// Assuming the declaration of \p F looks like:
+/// rty F(aty0 arg0, ..., atyN argN);
+///
+/// The wrapper will then look as follows:
+/// rty wrapper(aty0 arg0, ..., atyN argN) {
+/// return F(arg0, ..., argN);
+/// }
+///
+static void createShallowWrapper(Function &F) {
+ assert(AllowShallowWrappers &&
+ "Cannot create a wrapper if it is not allowed!");
+ assert(!F.isDeclaration() && "Cannot create a wrapper around a declaration!");
+
+ Module &M = *F.getParent();
+ LLVMContext &Ctx = M.getContext();
+ FunctionType *FnTy = F.getFunctionType();
+
+ Function *Wrapper =
+ Function::Create(FnTy, F.getLinkage(), F.getAddressSpace(), F.getName());
+  F.setName(""); // Make the wrapped function anonymous.
+ M.getFunctionList().insert(F.getIterator(), Wrapper);
+
+ F.setLinkage(GlobalValue::InternalLinkage);
+
+ F.replaceAllUsesWith(Wrapper);
+ assert(F.use_empty() && "Uses remained after wrapper was created!");
+
+ // Move the COMDAT section to the wrapper.
+ // TODO: Check if we need to keep it for F as well.
+ Wrapper->setComdat(F.getComdat());
+ F.setComdat(nullptr);
+
+ // Copy all metadata and attributes but keep them on F as well.
+ SmallVector<std::pair<unsigned, MDNode *>, 1> MDs;
+ F.getAllMetadata(MDs);
+ for (auto MDIt : MDs)
+ Wrapper->addMetadata(MDIt.first, *MDIt.second);
+ Wrapper->setAttributes(F.getAttributes());
+
+ // Create the call in the wrapper.
+ BasicBlock *EntryBB = BasicBlock::Create(Ctx, "entry", Wrapper);
+
+ SmallVector<Value *, 8> Args;
+ auto FArgIt = F.arg_begin();
+ for (Argument &Arg : Wrapper->args()) {
+ Args.push_back(&Arg);
+ Arg.setName((FArgIt++)->getName());
+ }
+
+ CallInst *CI = CallInst::Create(&F, Args, "", EntryBB);
+ CI->setTailCall(true);
+ CI->addAttribute(AttributeList::FunctionIndex, Attribute::NoInline);
+ ReturnInst::Create(Ctx, CI->getType()->isVoidTy() ? nullptr : CI, EntryBB);
+
+ NumFnShallowWrapperCreated++;
+}
+
+bool Attributor::isValidFunctionSignatureRewrite(
+ Argument &Arg, ArrayRef<Type *> ReplacementTypes) {
auto CallSiteCanBeChanged = [](AbstractCallSite ACS) {
+    // Forbid call sites that cast the function's return type. If we were to
+    // rewrite these functions we would need to re-create a cast for the new
+    // call site (if the old one had uses).
+ if (!ACS.getCalledFunction() ||
+ ACS.getInstruction()->getType() !=
+ ACS.getCalledFunction()->getReturnType())
+ return false;
// Forbid must-tail calls for now.
- return !ACS.isCallbackCall() && !ACS.getCallSite().isMustTailCall();
+ return !ACS.isCallbackCall() && !ACS.getInstruction()->isMustTailCall();
};
Function *Fn = Arg.getParent();
@@ -6196,14 +1387,17 @@ bool Attributor::registerFunctionSignatureRewrite(
AttributeList FnAttributeList = Fn->getAttributes();
if (FnAttributeList.hasAttrSomewhere(Attribute::Nest) ||
FnAttributeList.hasAttrSomewhere(Attribute::StructRet) ||
- FnAttributeList.hasAttrSomewhere(Attribute::InAlloca)) {
+ FnAttributeList.hasAttrSomewhere(Attribute::InAlloca) ||
+ FnAttributeList.hasAttrSomewhere(Attribute::Preallocated)) {
LLVM_DEBUG(
dbgs() << "[Attributor] Cannot rewrite due to complex attribute\n");
return false;
}
// Avoid callbacks for now.
- if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr)) {
+ bool AllCallSitesKnown;
+ if (!checkForAllCallSites(CallSiteCanBeChanged, *Fn, true, nullptr,
+ AllCallSitesKnown)) {
LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite all call sites\n");
return false;
}
@@ -6216,21 +1410,35 @@ bool Attributor::registerFunctionSignatureRewrite(
// Forbid must-tail calls for now.
// TODO:
- bool AnyDead;
auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(*Fn);
- if (!checkForAllInstructionsImpl(OpcodeInstMap, InstPred, nullptr, AnyDead,
- {Instruction::Call})) {
+ if (!checkForAllInstructionsImpl(nullptr, OpcodeInstMap, InstPred, nullptr,
+ nullptr, {Instruction::Call})) {
LLVM_DEBUG(dbgs() << "[Attributor] Cannot rewrite due to instructions\n");
return false;
}
- SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = ArgumentReplacementMap[Fn];
- if (ARIs.size() == 0)
+ return true;
+}
+
+bool Attributor::registerFunctionSignatureRewrite(
+ Argument &Arg, ArrayRef<Type *> ReplacementTypes,
+ ArgumentReplacementInfo::CalleeRepairCBTy &&CalleeRepairCB,
+ ArgumentReplacementInfo::ACSRepairCBTy &&ACSRepairCB) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
+ << Arg.getParent()->getName() << " with "
+ << ReplacementTypes.size() << " replacements\n");
+ assert(isValidFunctionSignatureRewrite(Arg, ReplacementTypes) &&
+ "Cannot register an invalid rewrite");
+
+ Function *Fn = Arg.getParent();
+ SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
+ ArgumentReplacementMap[Fn];
+ if (ARIs.empty())
ARIs.resize(Fn->arg_size());
// If we have a replacement already with less than or equal new arguments,
// ignore this request.
- ArgumentReplacementInfo *&ARI = ARIs[Arg.getArgNo()];
+ std::unique_ptr<ArgumentReplacementInfo> &ARI = ARIs[Arg.getArgNo()];
if (ARI && ARI->getNumReplacementArgs() <= ReplacementTypes.size()) {
LLVM_DEBUG(dbgs() << "[Attributor] Existing rewrite is preferred\n");
return false;
@@ -6238,18 +1446,28 @@ bool Attributor::registerFunctionSignatureRewrite(
// If we have a replacement already but we like the new one better, delete
// the old.
- if (ARI)
- delete ARI;
+ ARI.reset();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Register new rewrite of " << Arg << " in "
+ << Arg.getParent()->getName() << " with "
+ << ReplacementTypes.size() << " replacements\n");
// Remember the replacement.
- ARI = new ArgumentReplacementInfo(*this, Arg, ReplacementTypes,
- std::move(CalleeRepairCB),
- std::move(ACSRepairCB));
+ ARI.reset(new ArgumentReplacementInfo(*this, Arg, ReplacementTypes,
+ std::move(CalleeRepairCB),
+ std::move(ACSRepairCB)));
return true;
}
-ChangeStatus Attributor::rewriteFunctionSignatures() {
+bool Attributor::shouldSeedAttribute(AbstractAttribute &AA) {
+ if (SeedAllowList.size() == 0)
+ return true;
+ return std::count(SeedAllowList.begin(), SeedAllowList.end(), AA.getName());
+}
+
+ChangeStatus Attributor::rewriteFunctionSignatures(
+ SmallPtrSetImpl<Function *> &ModifiedFns) {
ChangeStatus Changed = ChangeStatus::UNCHANGED;
for (auto &It : ArgumentReplacementMap) {
@@ -6259,7 +1477,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
if (ToBeDeletedFunctions.count(OldFn))
continue;
- const SmallVectorImpl<ArgumentReplacementInfo *> &ARIs = It.getSecond();
+ const SmallVectorImpl<std::unique_ptr<ArgumentReplacementInfo>> &ARIs =
+ It.getSecond();
assert(ARIs.size() == OldFn->arg_size() && "Inconsistent state!");
SmallVector<Type *, 16> NewArgumentTypes;
@@ -6268,7 +1487,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
// Collect replacement argument types and copy over existing attributes.
AttributeList OldFnAttributeList = OldFn->getAttributes();
for (Argument &Arg : OldFn->args()) {
- if (ArgumentReplacementInfo *ARI = ARIs[Arg.getArgNo()]) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[Arg.getArgNo()]) {
NewArgumentTypes.append(ARI->ReplacementTypes.begin(),
ARI->ReplacementTypes.end());
NewArgumentAttributes.append(ARI->getNumReplacementArgs(),
@@ -6315,6 +1535,14 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
NewFn->getBasicBlockList().splice(NewFn->begin(),
OldFn->getBasicBlockList());
+ // Fixup block addresses to reference new function.
+ SmallVector<BlockAddress *, 8u> BlockAddresses;
+ for (User *U : OldFn->users())
+ if (auto *BA = dyn_cast<BlockAddress>(U))
+ BlockAddresses.push_back(BA);
+ for (auto *BA : BlockAddresses)
+ BA->replaceAllUsesWith(BlockAddress::get(NewFn, BA->getBasicBlock()));
+
// Set of all "call-like" instructions that invoke the old function mapped
// to their new replacements.
SmallVector<std::pair<CallBase *, CallBase *>, 8> CallSitePairs;
@@ -6330,7 +1558,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
for (unsigned OldArgNum = 0; OldArgNum < ARIs.size(); ++OldArgNum) {
unsigned NewFirstArgNum = NewArgOperands.size();
(void)NewFirstArgNum; // only used inside assert.
- if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[OldArgNum]) {
if (ARI->ACSRepairCB)
ARI->ACSRepairCB(*ARI, ACS, NewArgOperands);
assert(ARI->getNumReplacementArgs() + NewFirstArgNum ==
@@ -6369,11 +1598,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
}
// Copy over various properties and the new attributes.
- uint64_t W;
- if (OldCB->extractProfTotalWeight(W))
- NewCB->setProfWeight(W);
+ NewCB->copyMetadata(*OldCB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
NewCB->setCallingConv(OldCB->getCallingConv());
- NewCB->setDebugLoc(OldCB->getDebugLoc());
NewCB->takeName(OldCB);
NewCB->setAttributes(AttributeList::get(
Ctx, OldCallAttributeList.getFnAttributes(),
@@ -6384,8 +1610,9 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
};
// Use the CallSiteReplacementCreator to create replacement call sites.
- bool Success =
- checkForAllCallSites(CallSiteReplacementCreator, *OldFn, true, nullptr);
+ bool AllCallSitesKnown;
+ bool Success = checkForAllCallSites(CallSiteReplacementCreator, *OldFn,
+ true, nullptr, AllCallSitesKnown);
(void)Success;
assert(Success && "Assumed call site replacement to succeed!");
@@ -6394,7 +1621,8 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
auto NewFnArgIt = NewFn->arg_begin();
for (unsigned OldArgNum = 0; OldArgNum < ARIs.size();
++OldArgNum, ++OldFnArgIt) {
- if (ArgumentReplacementInfo *ARI = ARIs[OldArgNum]) {
+ if (const std::unique_ptr<ArgumentReplacementInfo> &ARI =
+ ARIs[OldArgNum]) {
if (ARI->CalleeRepairCB)
ARI->CalleeRepairCB(*ARI, *NewFn, NewFnArgIt);
NewFnArgIt += ARI->ReplacementTypes.size();
@@ -6409,11 +1637,21 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
for (auto &CallSitePair : CallSitePairs) {
CallBase &OldCB = *CallSitePair.first;
CallBase &NewCB = *CallSitePair.second;
+ assert(OldCB.getType() == NewCB.getType() &&
+ "Cannot handle call sites with different types!");
+ ModifiedFns.insert(OldCB.getFunction());
+ CGUpdater.replaceCallSite(OldCB, NewCB);
OldCB.replaceAllUsesWith(&NewCB);
OldCB.eraseFromParent();
}
- ToBeDeletedFunctions.insert(OldFn);
+ // Replace the function in the call graph (if any).
+ CGUpdater.replaceFunctionWith(*OldFn, *NewFn);
+
+    // If the old function was modified and needed to be reanalyzed, the new
+    // one needs to be reanalyzed now.
+ if (ModifiedFns.erase(OldFn))
+ ModifiedFns.insert(NewFn);
Changed = ChangeStatus::CHANGED;
}
@@ -6421,13 +1659,16 @@ ChangeStatus Attributor::rewriteFunctionSignatures() {
return Changed;
}
-void Attributor::initializeInformationCache(Function &F) {
+void InformationCache::initializeInformationCache(const Function &CF,
+ FunctionInfo &FI) {
+  // As we do not modify the function here, we can remove the const without
+  // breaking implicit assumptions. At the end of the day, we could initialize
+  // the cache eagerly, which would look the same to the users.
+ Function &F = const_cast<Function &>(CF);
// Walk all instructions to find interesting instructions that might be
// queried by abstract attributes during their initialization or update.
// This has to happen before we create attributes.
- auto &ReadOrWriteInsts = InfoCache.FuncRWInstsMap[&F];
- auto &InstOpcodeMap = InfoCache.FuncInstOpcodeMap[&F];
for (Instruction &I : instructions(&F)) {
bool IsInterestingOpcode = false;
@@ -6439,15 +1680,23 @@ void Attributor::initializeInformationCache(Function &F) {
// Note: There are no concrete attributes now so this is initially empty.
switch (I.getOpcode()) {
default:
- assert((!ImmutableCallSite(&I)) && (!isa<CallBase>(&I)) &&
- "New call site/base instruction type needs to be known int the "
+ assert(!isa<CallBase>(&I) &&
+ "New call base instruction type needs to be known in the "
"Attributor.");
break;
- case Instruction::Load:
- // The alignment of a pointer is interesting for loads.
- case Instruction::Store:
- // The alignment of a pointer is interesting for stores.
case Instruction::Call:
+ // Calls are interesting on their own, additionally:
+ // For `llvm.assume` calls we also fill the KnowledgeMap as we find them.
+ // For `must-tail` calls we remember the caller and callee.
+ if (IntrinsicInst *Assume = dyn_cast<IntrinsicInst>(&I)) {
+ if (Assume->getIntrinsicID() == Intrinsic::assume)
+ fillMapFromAssume(*Assume, KnowledgeMap);
+ } else if (cast<CallInst>(I).isMustTailCall()) {
+ FI.ContainsMustTailCall = true;
+ if (const Function *Callee = cast<CallInst>(I).getCalledFunction())
+ getFunctionInfo(*Callee).CalledViaMustTail = true;
+ }
+ LLVM_FALLTHROUGH;
case Instruction::CallBr:
case Instruction::Invoke:
case Instruction::CleanupRet:
@@ -6457,28 +1706,55 @@ void Attributor::initializeInformationCache(Function &F) {
case Instruction::Br:
case Instruction::Resume:
case Instruction::Ret:
+ case Instruction::Load:
+ // The alignment of a pointer is interesting for loads.
+ case Instruction::Store:
+ // The alignment of a pointer is interesting for stores.
IsInterestingOpcode = true;
}
- if (IsInterestingOpcode)
- InstOpcodeMap[I.getOpcode()].push_back(&I);
+ if (IsInterestingOpcode) {
+ auto *&Insts = FI.OpcodeInstMap[I.getOpcode()];
+ if (!Insts)
+ Insts = new (Allocator) InstructionVectorTy();
+ Insts->push_back(&I);
+ }
if (I.mayReadOrWriteMemory())
- ReadOrWriteInsts.push_back(&I);
+ FI.RWInsts.push_back(&I);
}
+
+ if (F.hasFnAttribute(Attribute::AlwaysInline) &&
+ isInlineViable(F).isSuccess())
+ InlineableFunctions.insert(&F);
+}
+
+InformationCache::FunctionInfo::~FunctionInfo() {
+  // The instruction vectors are allocated using a BumpPtrAllocator, so we need
+  // to destroy them manually.
+ for (auto &It : OpcodeInstMap)
+ It.getSecond()->~InstructionVectorTy();
}
void Attributor::recordDependence(const AbstractAttribute &FromAA,
const AbstractAttribute &ToAA,
DepClassTy DepClass) {
+  // If we are outside of an update, that is, before the actual fixpoint
+  // iteration started (= when we create AAs), we do not track dependences
+  // because we will put all AAs into the initial worklist anyway.
+ if (DependenceStack.empty())
+ return;
if (FromAA.getState().isAtFixpoint())
return;
+ DependenceStack.back()->push_back({&FromAA, &ToAA, DepClass});
+}
- if (DepClass == DepClassTy::REQUIRED)
- QueryMap[&FromAA].RequiredAAs.insert(
- const_cast<AbstractAttribute *>(&ToAA));
- else
- QueryMap[&FromAA].OptionalAAs.insert(
- const_cast<AbstractAttribute *>(&ToAA));
- QueriedNonFixAA = true;
+void Attributor::rememberDependences() {
+ assert(!DependenceStack.empty() && "No dependences to remember!");
+
+ for (DepInfo &DI : *DependenceStack.back()) {
+ auto &DepAAs = const_cast<AbstractAttribute &>(*DI.FromAA).Deps;
+ DepAAs.push_back(AbstractAttribute::DepTy(
+ const_cast<AbstractAttribute *>(DI.ToAA), unsigned(DI.DepClass)));
+ }
}
void Attributor::identifyDefaultAbstractAttributes(Function &F) {
@@ -6487,6 +1763,17 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
if (F.isDeclaration())
return;
+ // In non-module runs we need to look at the call sites of a function to
+ // determine if it is part of a must-tail call edge. This will influence what
+ // attributes we can derive.
+ InformationCache::FunctionInfo &FI = InfoCache.getFunctionInfo(F);
+ if (!isModulePass() && !FI.CalledViaMustTail) {
+ for (const Use &U : F.uses())
+ if (const auto *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U) && CB->isMustTailCall())
+ FI.CalledViaMustTail = true;
+ }
+
IRPosition FPos = IRPosition::function(F);
// Check for dead BasicBlocks in every function.
@@ -6518,6 +1805,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// Every function might be "readnone/readonly/writeonly/...".
getOrCreateAAFor<AAMemoryBehavior>(FPos);
+ // Every function can be "readnone/argmemonly/inaccessiblememonly/...".
+ getOrCreateAAFor<AAMemoryLocation>(FPos);
+
// Every function might be applicable for Heap-To-Stack conversion.
if (EnableHeapToStack)
getOrCreateAAFor<AAHeapToStack>(FPos);
@@ -6560,6 +1850,9 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// Every argument might be simplified.
getOrCreateAAFor<AAValueSimplify>(ArgPos);
+ // Every argument might be dead.
+ getOrCreateAAFor<AAIsDead>(ArgPos);
+
if (Arg.getType()->isPointerTy()) {
// Every argument with pointer type might be marked nonnull.
getOrCreateAAFor<AANonNull>(ArgPos);
@@ -6582,75 +1875,87 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
// Every argument with pointer type might be marked nofree.
getOrCreateAAFor<AANoFree>(ArgPos);
+
+ // Every argument with pointer type might be privatizable (or promotable)
+ getOrCreateAAFor<AAPrivatizablePtr>(ArgPos);
}
}
auto CallSitePred = [&](Instruction &I) -> bool {
- CallSite CS(&I);
- if (Function *Callee = CS.getCalledFunction()) {
- // Skip declerations except if annotations on their call sites were
- // explicitly requested.
- if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
- !Callee->hasMetadata(LLVMContext::MD_callback))
- return true;
+ auto &CB = cast<CallBase>(I);
+ IRPosition CBRetPos = IRPosition::callsite_returned(CB);
- if (!Callee->getReturnType()->isVoidTy() && !CS->use_empty()) {
+ // Call sites might be dead if they do not have side effects and no live
+ // users. The return value might be dead if there are no live users.
+ getOrCreateAAFor<AAIsDead>(CBRetPos);
- IRPosition CSRetPos = IRPosition::callsite_returned(CS);
+ Function *Callee = CB.getCalledFunction();
+ // TODO: Even if the callee is not known now we might be able to simplify
+ // the call/callee.
+ if (!Callee)
+ return true;
- // Call site return values might be dead.
- getOrCreateAAFor<AAIsDead>(CSRetPos);
+ // Skip declarations except if annotations on their call sites were
+ // explicitly requested.
+ if (!AnnotateDeclarationCallSites && Callee->isDeclaration() &&
+ !Callee->hasMetadata(LLVMContext::MD_callback))
+ return true;
- // Call site return integer values might be limited by a constant range.
- if (Callee->getReturnType()->isIntegerTy()) {
- getOrCreateAAFor<AAValueConstantRange>(CSRetPos);
- }
- }
+ if (!Callee->getReturnType()->isVoidTy() && !CB.use_empty()) {
- for (int i = 0, e = CS.getNumArgOperands(); i < e; i++) {
+ IRPosition CBRetPos = IRPosition::callsite_returned(CB);
- IRPosition CSArgPos = IRPosition::callsite_argument(CS, i);
+ // Call site return integer values might be limited by a constant range.
+ if (Callee->getReturnType()->isIntegerTy())
+ getOrCreateAAFor<AAValueConstantRange>(CBRetPos);
+ }
- // Every call site argument might be dead.
- getOrCreateAAFor<AAIsDead>(CSArgPos);
+ for (int I = 0, E = CB.getNumArgOperands(); I < E; ++I) {
- // Call site argument might be simplified.
- getOrCreateAAFor<AAValueSimplify>(CSArgPos);
+ IRPosition CBArgPos = IRPosition::callsite_argument(CB, I);
- if (!CS.getArgument(i)->getType()->isPointerTy())
- continue;
+ // Every call site argument might be dead.
+ getOrCreateAAFor<AAIsDead>(CBArgPos);
- // Call site argument attribute "non-null".
- getOrCreateAAFor<AANonNull>(CSArgPos);
+ // Call site argument might be simplified.
+ getOrCreateAAFor<AAValueSimplify>(CBArgPos);
- // Call site argument attribute "no-alias".
- getOrCreateAAFor<AANoAlias>(CSArgPos);
+ if (!CB.getArgOperand(I)->getType()->isPointerTy())
+ continue;
- // Call site argument attribute "dereferenceable".
- getOrCreateAAFor<AADereferenceable>(CSArgPos);
+ // Call site argument attribute "non-null".
+ getOrCreateAAFor<AANonNull>(CBArgPos);
- // Call site argument attribute "align".
- getOrCreateAAFor<AAAlign>(CSArgPos);
+ // Call site argument attribute "nocapture".
+ getOrCreateAAFor<AANoCapture>(CBArgPos);
- // Call site argument attribute
- // "readnone/readonly/writeonly/..."
- getOrCreateAAFor<AAMemoryBehavior>(CSArgPos);
+ // Call site argument attribute "no-alias".
+ getOrCreateAAFor<AANoAlias>(CBArgPos);
- // Call site argument attribute "nofree".
- getOrCreateAAFor<AANoFree>(CSArgPos);
- }
+ // Call site argument attribute "dereferenceable".
+ getOrCreateAAFor<AADereferenceable>(CBArgPos);
+
+ // Call site argument attribute "align".
+ getOrCreateAAFor<AAAlign>(CBArgPos);
+
+ // Call site argument attribute
+ // "readnone/readonly/writeonly/..."
+ getOrCreateAAFor<AAMemoryBehavior>(CBArgPos);
+
+ // Call site argument attribute "nofree".
+ getOrCreateAAFor<AANoFree>(CBArgPos);
}
return true;
};
auto &OpcodeInstMap = InfoCache.getOpcodeInstMapForFunction(F);
- bool Success, AnyDead = false;
+ bool Success;
Success = checkForAllInstructionsImpl(
- OpcodeInstMap, CallSitePred, nullptr, AnyDead,
+ nullptr, OpcodeInstMap, CallSitePred, nullptr, nullptr,
{(unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
(unsigned)Instruction::Call});
(void)Success;
- assert(Success && !AnyDead && "Expected the check call to be successful!");
+ assert(Success && "Expected the check call to be successful!");
auto LoadStorePred = [&](Instruction &I) -> bool {
if (isa<LoadInst>(I))
@@ -6662,10 +1967,10 @@ void Attributor::identifyDefaultAbstractAttributes(Function &F) {
return true;
};
Success = checkForAllInstructionsImpl(
- OpcodeInstMap, LoadStorePred, nullptr, AnyDead,
+ nullptr, OpcodeInstMap, LoadStorePred, nullptr, nullptr,
{(unsigned)Instruction::Load, (unsigned)Instruction::Store});
(void)Success;
- assert(Success && !AnyDead && "Expected the check call to be successful!");
+ assert(Success && "Expected the check call to be successful!");
}
/// Helpers to ease debugging through output streams and print calls.
@@ -6703,14 +2008,6 @@ raw_ostream &llvm::operator<<(raw_ostream &OS, const IRPosition &Pos) {
<< Pos.getAnchorValue().getName() << "@" << Pos.getArgNo() << "]}";
}
-template <typename base_ty, base_ty BestState, base_ty WorstState>
-raw_ostream &
-llvm::operator<<(raw_ostream &OS,
- const IntegerStateBase<base_ty, BestState, WorstState> &S) {
- return OS << "(" << S.getKnown() << "-" << S.getAssumed() << ")"
- << static_cast<const AbstractState &>(S);
-}
-
raw_ostream &llvm::operator<<(raw_ostream &OS, const IntegerRangeState &S) {
OS << "range-state(" << S.getBitWidth() << ")<";
S.getKnown().print(OS);
@@ -6740,50 +2037,95 @@ void AbstractAttribute::print(raw_ostream &OS) const {
/// Pass (Manager) Boilerplate
/// ----------------------------------------------------------------------------
-static bool runAttributorOnModule(Module &M, AnalysisGetter &AG) {
- if (DisableAttributor)
+static bool runAttributorOnFunctions(InformationCache &InfoCache,
+ SetVector<Function *> &Functions,
+ AnalysisGetter &AG,
+ CallGraphUpdater &CGUpdater) {
+ if (Functions.empty())
return false;
- LLVM_DEBUG(dbgs() << "[Attributor] Run on module with " << M.size()
+ LLVM_DEBUG(dbgs() << "[Attributor] Run on module with " << Functions.size()
<< " functions.\n");
// Create an Attributor and initially empty information cache that is filled
// while we identify default attribute opportunities.
- InformationCache InfoCache(M, AG);
- Attributor A(InfoCache, DepRecInterval);
+ Attributor A(Functions, InfoCache, CGUpdater);
- for (Function &F : M)
- A.initializeInformationCache(F);
+ // Create shallow wrappers for all functions that are not IPO amendable
+ if (AllowShallowWrappers)
+ for (Function *F : Functions)
+ if (!A.isFunctionIPOAmendable(*F))
+ createShallowWrapper(*F);
- for (Function &F : M) {
- if (F.hasExactDefinition())
+ for (Function *F : Functions) {
+ if (F->hasExactDefinition())
NumFnWithExactDefinition++;
else
NumFnWithoutExactDefinition++;
// We look at internal functions only on-demand but if any use is not a
- // direct call, we have to do it eagerly.
- if (F.hasLocalLinkage()) {
- if (llvm::all_of(F.uses(), [](const Use &U) {
- return ImmutableCallSite(U.getUser()) &&
- ImmutableCallSite(U.getUser()).isCallee(&U);
+    // direct call or is outside the current set of analyzed functions, we have
+    // to do it eagerly.
+ if (F->hasLocalLinkage()) {
+ if (llvm::all_of(F->uses(), [&Functions](const Use &U) {
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ return CB && CB->isCallee(&U) &&
+ Functions.count(const_cast<Function *>(CB->getCaller()));
}))
continue;
}
// Populate the Attributor with abstract attribute opportunities in the
// function and the information cache with IR information.
- A.identifyDefaultAbstractAttributes(F);
+ A.identifyDefaultAbstractAttributes(*F);
}
- bool Changed = A.run(M) == ChangeStatus::CHANGED;
- assert(!verifyModule(M, &errs()) && "Module verification failed!");
- return Changed;
+ ChangeStatus Changed = A.run();
+ LLVM_DEBUG(dbgs() << "[Attributor] Done with " << Functions.size()
+ << " functions, result: " << Changed << ".\n");
+ return Changed == ChangeStatus::CHANGED;
}
PreservedAnalyses AttributorPass::run(Module &M, ModuleAnalysisManager &AM) {
- AnalysisGetter AG(AM);
- if (runAttributorOnModule(M, AG)) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ AnalysisGetter AG(FAM);
+
+ SetVector<Function *> Functions;
+ for (Function &F : M)
+ Functions.insert(&F);
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
+ if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
+ // FIXME: Think about passes we will preserve and add them here.
+ return PreservedAnalyses::none();
+ }
+ return PreservedAnalyses::all();
+}
+
+PreservedAnalyses AttributorCGSCCPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG,
+ CGSCCUpdateResult &UR) {
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+ AnalysisGetter AG(FAM);
+
+ SetVector<Function *> Functions;
+ for (LazyCallGraph::Node &N : C)
+ Functions.insert(&N.getFunction());
+
+ if (Functions.empty())
+ return PreservedAnalyses::all();
+
+ Module &M = *Functions.back()->getParent();
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, C, AM, UR);
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
+ if (runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater)) {
// FIXME: Think about passes we will preserve and add them here.
return PreservedAnalyses::none();
}
@@ -6804,7 +2146,14 @@ struct AttributorLegacyPass : public ModulePass {
return false;
AnalysisGetter AG;
- return runAttributorOnModule(M, AG);
+ SetVector<Function *> Functions;
+ for (Function &F : M)
+ Functions.insert(&F);
+
+ CallGraphUpdater CGUpdater;
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ nullptr);
+ return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -6813,158 +2162,65 @@ struct AttributorLegacyPass : public ModulePass {
}
};
-} // end anonymous namespace
+struct AttributorCGSCCLegacyPass : public CallGraphSCCPass {
+ CallGraphUpdater CGUpdater;
+ static char ID;
-Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); }
+ AttributorCGSCCLegacyPass() : CallGraphSCCPass(ID) {
+ initializeAttributorCGSCCLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
-char AttributorLegacyPass::ID = 0;
+ bool runOnSCC(CallGraphSCC &SCC) override {
+ if (skipSCC(SCC))
+ return false;
-const char AAReturnedValues::ID = 0;
-const char AANoUnwind::ID = 0;
-const char AANoSync::ID = 0;
-const char AANoFree::ID = 0;
-const char AANonNull::ID = 0;
-const char AANoRecurse::ID = 0;
-const char AAWillReturn::ID = 0;
-const char AAUndefinedBehavior::ID = 0;
-const char AANoAlias::ID = 0;
-const char AAReachability::ID = 0;
-const char AANoReturn::ID = 0;
-const char AAIsDead::ID = 0;
-const char AADereferenceable::ID = 0;
-const char AAAlign::ID = 0;
-const char AANoCapture::ID = 0;
-const char AAValueSimplify::ID = 0;
-const char AAHeapToStack::ID = 0;
-const char AAMemoryBehavior::ID = 0;
-const char AAValueConstantRange::ID = 0;
-
-// Macro magic to create the static generator function for attributes that
-// follow the naming scheme.
-
-#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \
- case IRPosition::PK: \
- llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!");
-
-#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \
- case IRPosition::PK: \
- AA = new CLASS##SUFFIX(IRP); \
- break;
+ SetVector<Function *> Functions;
+ for (CallGraphNode *CGN : SCC)
+ if (Function *Fn = CGN->getFunction())
+ if (!Fn->isDeclaration())
+ Functions.insert(Fn);
-#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
- SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- } \
- return *AA; \
- }
+ if (Functions.empty())
+ return false;
-#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
+ AnalysisGetter AG;
+ CallGraph &CG = const_cast<CallGraph &>(SCC.getCallGraph());
+ CGUpdater.initialize(CG, SCC);
+ Module &M = *Functions.back()->getParent();
+ BumpPtrAllocator Allocator;
+ InformationCache InfoCache(M, AG, Allocator, /* CGSCC */ &Functions);
+ return runAttributorOnFunctions(InfoCache, Functions, AG, CGUpdater);
}
-#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
+ bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
-#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
- SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
- SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- } \
- return *AA; \
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ // FIXME: Think about passes we will preserve and add them here.
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
+ CallGraphSCCPass::getAnalysisUsage(AU);
}
+};
-#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
- CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
- CLASS *AA = nullptr; \
- switch (IRP.getPositionKind()) { \
- SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
- SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
- SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
- } \
- return *AA; \
- }
+} // end anonymous namespace
+
+Pass *llvm::createAttributorLegacyPass() { return new AttributorLegacyPass(); }
+Pass *llvm::createAttributorCGSCCLegacyPass() {
+ return new AttributorCGSCCLegacyPass();
+}
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
-CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
-
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
-CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
-
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify)
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead)
-CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
-
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
-CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
-
-CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
-
-#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION
-#undef SWITCH_PK_CREATE
-#undef SWITCH_PK_INV
+char AttributorLegacyPass::ID = 0;
+char AttributorCGSCCLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(AttributorLegacyPass, "attributor",
"Deduce and propagate attributes", false, false)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(AttributorLegacyPass, "attributor",
"Deduce and propagate attributes", false, false)
+INITIALIZE_PASS_BEGIN(AttributorCGSCCLegacyPass, "attributor-cgscc",
+ "Deduce and propagate attributes (CGSCC pass)", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(AttributorCGSCCLegacyPass, "attributor-cgscc",
+ "Deduce and propagate attributes (CGSCC pass)", false,
+ false)
diff --git a/llvm/lib/Transforms/IPO/AttributorAttributes.cpp b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
new file mode 100644
index 000000000000..7e9fd61eeb41
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/AttributorAttributes.cpp
@@ -0,0 +1,7225 @@
+//===- AttributorAttributes.cpp - Attributes for Attributor deduction -----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// See the Attributor.h file comment and the class descriptions in that file for
+// more information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/Attributor.h"
+
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/CaptureTracking.h"
+#include "llvm/Analysis/LazyValueInfo.h"
+#include "llvm/Analysis/MemoryBuiltins.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/NoFolder.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO/ArgumentPromotion.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+#include <cassert>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "attributor"
+
+static cl::opt<bool> ManifestInternal(
+ "attributor-manifest-internal", cl::Hidden,
+ cl::desc("Manifest Attributor internal string attributes."),
+ cl::init(false));
+
+static cl::opt<int> MaxHeapToStackSize("max-heap-to-stack-size", cl::init(128),
+ cl::Hidden);
+
+STATISTIC(NumAAs, "Number of abstract attributes created");
+
+// Some helper macros to deal with statistics tracking.
+//
+// Usage:
+// For simple IR attribute tracking, overload trackStatistics in the abstract
+// attribute and choose the right STATS_DECLTRACK_********* macro,
+// e.g.,:
+// void trackStatistics() const override {
+// STATS_DECLTRACK_ARG_ATTR(returned)
+// }
+// If there is a single "increment" site one can use the macro
+// STATS_DECLTRACK with a custom message. If there are multiple increment
+// sites, STATS_DECL and STATS_TRACK can also be used separately.
+//
+#define BUILD_STAT_MSG_IR_ATTR(TYPE, NAME) \
+ ("Number of " #TYPE " marked '" #NAME "'")
+#define BUILD_STAT_NAME(NAME, TYPE) NumIR##TYPE##_##NAME
+#define STATS_DECL_(NAME, MSG) STATISTIC(NAME, MSG);
+#define STATS_DECL(NAME, TYPE, MSG) \
+ STATS_DECL_(BUILD_STAT_NAME(NAME, TYPE), MSG);
+#define STATS_TRACK(NAME, TYPE) ++(BUILD_STAT_NAME(NAME, TYPE));
+#define STATS_DECLTRACK(NAME, TYPE, MSG) \
+ { \
+ STATS_DECL(NAME, TYPE, MSG) \
+ STATS_TRACK(NAME, TYPE) \
+ }
+#define STATS_DECLTRACK_ARG_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Arguments, BUILD_STAT_MSG_IR_ATTR(arguments, NAME))
+#define STATS_DECLTRACK_CSARG_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CSArguments, \
+ BUILD_STAT_MSG_IR_ATTR(call site arguments, NAME))
+#define STATS_DECLTRACK_FN_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Function, BUILD_STAT_MSG_IR_ATTR(functions, NAME))
+#define STATS_DECLTRACK_CS_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CS, BUILD_STAT_MSG_IR_ATTR(call site, NAME))
+#define STATS_DECLTRACK_FNRET_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, FunctionReturn, \
+ BUILD_STAT_MSG_IR_ATTR(function returns, NAME))
+#define STATS_DECLTRACK_CSRET_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, CSReturn, \
+ BUILD_STAT_MSG_IR_ATTR(call site returns, NAME))
+#define STATS_DECLTRACK_FLOATING_ATTR(NAME) \
+ STATS_DECLTRACK(NAME, Floating, \
+ ("Number of floating values known to be '" #NAME "'"))
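As a concrete, mechanically derived illustration of how the macros above compose: STATS_DECLTRACK_ARG_ATTR(returned) goes through STATS_DECLTRACK, STATS_DECL, STATS_TRACK, and BUILD_STAT_NAME, and ends up declaring STATISTIC(NumIRArguments_returned, "Number of arguments marked 'returned'") and then incrementing NumIRArguments_returned.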
+
+// Specialization of the operator<< for abstract attribute subclasses. This
+// disambiguates situations where multiple operators are applicable.
+namespace llvm {
+#define PIPE_OPERATOR(CLASS) \
+ raw_ostream &operator<<(raw_ostream &OS, const CLASS &AA) { \
+ return OS << static_cast<const AbstractAttribute &>(AA); \
+ }
+
+PIPE_OPERATOR(AAIsDead)
+PIPE_OPERATOR(AANoUnwind)
+PIPE_OPERATOR(AANoSync)
+PIPE_OPERATOR(AANoRecurse)
+PIPE_OPERATOR(AAWillReturn)
+PIPE_OPERATOR(AANoReturn)
+PIPE_OPERATOR(AAReturnedValues)
+PIPE_OPERATOR(AANonNull)
+PIPE_OPERATOR(AANoAlias)
+PIPE_OPERATOR(AADereferenceable)
+PIPE_OPERATOR(AAAlign)
+PIPE_OPERATOR(AANoCapture)
+PIPE_OPERATOR(AAValueSimplify)
+PIPE_OPERATOR(AANoFree)
+PIPE_OPERATOR(AAHeapToStack)
+PIPE_OPERATOR(AAReachability)
+PIPE_OPERATOR(AAMemoryBehavior)
+PIPE_OPERATOR(AAMemoryLocation)
+PIPE_OPERATOR(AAValueConstantRange)
+PIPE_OPERATOR(AAPrivatizablePtr)
+PIPE_OPERATOR(AAUndefinedBehavior)
+
+#undef PIPE_OPERATOR
+} // namespace llvm
+
+namespace {
+
+static Optional<ConstantInt *>
+getAssumedConstantInt(Attributor &A, const Value &V,
+ const AbstractAttribute &AA,
+ bool &UsedAssumedInformation) {
+ Optional<Constant *> C = A.getAssumedConstant(V, AA, UsedAssumedInformation);
+ if (C.hasValue())
+ return dyn_cast_or_null<ConstantInt>(C.getValue());
+ return llvm::None;
+}
+
+/// Get the pointer operand of a memory accessing instruction. If \p I is not a
+/// memory accessing instruction, return nullptr. If \p AllowVolatile is set to
+/// false and the instruction is volatile, return nullptr.
+static const Value *getPointerOperand(const Instruction *I,
+ bool AllowVolatile) {
+ if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (!AllowVolatile && LI->isVolatile())
+ return nullptr;
+ return LI->getPointerOperand();
+ }
+
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (!AllowVolatile && SI->isVolatile())
+ return nullptr;
+ return SI->getPointerOperand();
+ }
+
+ if (auto *CXI = dyn_cast<AtomicCmpXchgInst>(I)) {
+ if (!AllowVolatile && CXI->isVolatile())
+ return nullptr;
+ return CXI->getPointerOperand();
+ }
+
+ if (auto *RMWI = dyn_cast<AtomicRMWInst>(I)) {
+ if (!AllowVolatile && RMWI->isVolatile())
+ return nullptr;
+ return RMWI->getPointerOperand();
+ }
+
+ return nullptr;
+}
+
+/// Helper function to create a pointer of type \p ResTy, based on \p Ptr, and
+/// advanced by \p Offset bytes. To aid later analysis, the method tries to
+/// build getelementptr instructions that traverse the natural type of \p Ptr
+/// if possible. If that fails, the remaining offset is adjusted byte-wise,
+/// hence through a cast to i8*.
+///
+/// TODO: This could probably live somewhere more prominently if it doesn't
+/// already exist.
+static Value *constructPointer(Type *ResTy, Value *Ptr, int64_t Offset,
+ IRBuilder<NoFolder> &IRB, const DataLayout &DL) {
+ assert(Offset >= 0 && "Negative offset not supported yet!");
+ LLVM_DEBUG(dbgs() << "Construct pointer: " << *Ptr << " + " << Offset
+ << "-bytes as " << *ResTy << "\n");
+
+ // The initial type we are trying to traverse to get nice GEPs.
+ Type *Ty = Ptr->getType();
+
+ SmallVector<Value *, 4> Indices;
+ std::string GEPName = Ptr->getName().str();
+ while (Offset) {
+ uint64_t Idx, Rem;
+
+ if (auto *STy = dyn_cast<StructType>(Ty)) {
+ const StructLayout *SL = DL.getStructLayout(STy);
+ if (int64_t(SL->getSizeInBytes()) < Offset)
+ break;
+ Idx = SL->getElementContainingOffset(Offset);
+ assert(Idx < STy->getNumElements() && "Offset calculation error!");
+ Rem = Offset - SL->getElementOffset(Idx);
+ Ty = STy->getElementType(Idx);
+ } else if (auto *PTy = dyn_cast<PointerType>(Ty)) {
+ Ty = PTy->getElementType();
+ if (!Ty->isSized())
+ break;
+ uint64_t ElementSize = DL.getTypeAllocSize(Ty);
+ assert(ElementSize && "Expected type with size!");
+ Idx = Offset / ElementSize;
+ Rem = Offset % ElementSize;
+ } else {
+ // Non-aggregate type, we cast and make byte-wise progress now.
+ break;
+ }
+
+ LLVM_DEBUG(errs() << "Ty: " << *Ty << " Offset: " << Offset
+ << " Idx: " << Idx << " Rem: " << Rem << "\n");
+
+ GEPName += "." + std::to_string(Idx);
+ Indices.push_back(ConstantInt::get(IRB.getInt32Ty(), Idx));
+ Offset = Rem;
+ }
+
+ // Create a GEP if we collected indices above.
+ if (Indices.size())
+ Ptr = IRB.CreateGEP(Ptr, Indices, GEPName);
+
+ // If an offset is left we use byte-wise adjustment.
+ if (Offset) {
+ Ptr = IRB.CreateBitCast(Ptr, IRB.getInt8PtrTy());
+ Ptr = IRB.CreateGEP(Ptr, IRB.getInt32(Offset),
+ GEPName + ".b" + Twine(Offset));
+ }
+
+ // Ensure the result has the requested type.
+ Ptr = IRB.CreateBitOrPointerCast(Ptr, ResTy, Ptr->getName() + ".cast");
+
+ LLVM_DEBUG(dbgs() << "Constructed pointer: " << *Ptr << "\n");
+ return Ptr;
+}
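To make the decomposition above concrete, a hedged example: assume a target DataLayout in which %s = type { i32, i64, i32 } gets element offsets 0, 8, and 16 and a total size of 24 bytes, and constructPointer is asked for Offset = 12 starting from an %s* value. The pointer step yields Idx = 12 / 24 = 0 with remainder 12; the struct step then selects element 1 (the i64 at offset 8), leaving Rem = 4; i64 is neither a struct nor a pointer, so the loop stops with GEP indices [0, 1] pointing at the i64 field. The remaining 4 bytes are applied through the i8* bitcast and a byte-wise GEP, and the result is finally cast to \p ResTy.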
+
+/// Recursively visit all values that might become \p IRP at some point. This
+/// will be done by looking through cast instructions, selects, phis, and calls
+/// with the "returned" attribute. Once we cannot look through the value any
+/// further, the callback \p VisitValueCB is invoked and passed the current
+/// value, the \p State, and a flag to indicate if we stripped anything.
+/// Stripped means that we unpacked the value associated with \p IRP at least
+/// once. Note that the value used for the callback may still be the value
+/// associated with \p IRP (due to PHIs). To limit how much effort is invested,
+/// we will never visit more values than specified by \p MaxValues.
+template <typename AAType, typename StateTy>
+static bool genericValueTraversal(
+ Attributor &A, IRPosition IRP, const AAType &QueryingAA, StateTy &State,
+ function_ref<bool(Value &, const Instruction *, StateTy &, bool)>
+ VisitValueCB,
+ const Instruction *CtxI, bool UseValueSimplify = true, int MaxValues = 16,
+ function_ref<Value *(Value *)> StripCB = nullptr) {
+
+ const AAIsDead *LivenessAA = nullptr;
+ if (IRP.getAnchorScope())
+ LivenessAA = &A.getAAFor<AAIsDead>(
+ QueryingAA, IRPosition::function(*IRP.getAnchorScope()),
+ /* TrackDependence */ false);
+ bool AnyDead = false;
+
+ using Item = std::pair<Value *, const Instruction *>;
+ SmallSet<Item, 16> Visited;
+ SmallVector<Item, 16> Worklist;
+ Worklist.push_back({&IRP.getAssociatedValue(), CtxI});
+
+ int Iteration = 0;
+ do {
+ Item I = Worklist.pop_back_val();
+ Value *V = I.first;
+ CtxI = I.second;
+ if (StripCB)
+ V = StripCB(V);
+
+    // Check if we should process the current value. To prevent endless
+    // recursion, keep a record of the values we followed!
+ if (!Visited.insert(I).second)
+ continue;
+
+ // Make sure we limit the compile time for complex expressions.
+ if (Iteration++ >= MaxValues)
+ return false;
+
+    // Explicitly look through calls with a "returned" attribute if we do not
+    // have a pointer, as stripPointerCasts only works on pointers.
+ Value *NewV = nullptr;
+ if (V->getType()->isPointerTy()) {
+ NewV = V->stripPointerCasts();
+ } else {
+ auto *CB = dyn_cast<CallBase>(V);
+ if (CB && CB->getCalledFunction()) {
+ for (Argument &Arg : CB->getCalledFunction()->args())
+ if (Arg.hasReturnedAttr()) {
+ NewV = CB->getArgOperand(Arg.getArgNo());
+ break;
+ }
+ }
+ }
+ if (NewV && NewV != V) {
+ Worklist.push_back({NewV, CtxI});
+ continue;
+ }
+
+ // Look through select instructions, visit both potential values.
+ if (auto *SI = dyn_cast<SelectInst>(V)) {
+ Worklist.push_back({SI->getTrueValue(), CtxI});
+ Worklist.push_back({SI->getFalseValue(), CtxI});
+ continue;
+ }
+
+ // Look through phi nodes, visit all live operands.
+ if (auto *PHI = dyn_cast<PHINode>(V)) {
+ assert(LivenessAA &&
+ "Expected liveness in the presence of instructions!");
+ for (unsigned u = 0, e = PHI->getNumIncomingValues(); u < e; u++) {
+ BasicBlock *IncomingBB = PHI->getIncomingBlock(u);
+ if (A.isAssumedDead(*IncomingBB->getTerminator(), &QueryingAA,
+ LivenessAA,
+ /* CheckBBLivenessOnly */ true)) {
+ AnyDead = true;
+ continue;
+ }
+ Worklist.push_back(
+ {PHI->getIncomingValue(u), IncomingBB->getTerminator()});
+ }
+ continue;
+ }
+
+ if (UseValueSimplify && !isa<Constant>(V)) {
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ A.getAssumedConstant(*V, QueryingAA, UsedAssumedInformation);
+ if (!C.hasValue())
+ continue;
+ if (Value *NewV = C.getValue()) {
+ Worklist.push_back({NewV, CtxI});
+ continue;
+ }
+ }
+
+ // Once a leaf is reached we inform the user through the callback.
+ if (!VisitValueCB(*V, CtxI, State, Iteration > 1))
+ return false;
+ } while (!Worklist.empty());
+
+ // If we actually used liveness information we have to record a dependence.
+ if (AnyDead)
+ A.recordDependence(*LivenessAA, QueryingAA, DepClassTy::OPTIONAL);
+
+ // All values have been visited.
+ return true;
+}
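+
+// Editorial example (illustrative, not part of the original patch): given
+//
+// %sel = select i1 %c, i32* %a, i32* %b
+// %phi = phi i32* [ %sel, %then ], [ %g, %else ]
+//
+// a traversal starting at %phi looks through the phi and the select and ends
+// up invoking VisitValueCB on the leaves %a, %b, and %g (with the "stripped"
+// flag set), unless value simplification or the MaxValues cut-off stops the
+// walk earlier.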
+
+const Value *stripAndAccumulateMinimalOffsets(
+ Attributor &A, const AbstractAttribute &QueryingAA, const Value *Val,
+ const DataLayout &DL, APInt &Offset, bool AllowNonInbounds,
+ bool UseAssumed = false) {
+
+ auto AttributorAnalysis = [&](Value &V, APInt &ROffset) -> bool {
+ const IRPosition &Pos = IRPosition::value(V);
+ // Only track dependence if we are going to use the assumed info.
+ const AAValueConstantRange &ValueConstantRangeAA =
+ A.getAAFor<AAValueConstantRange>(QueryingAA, Pos,
+ /* TrackDependence */ UseAssumed);
+ ConstantRange Range = UseAssumed ? ValueConstantRangeAA.getAssumed()
+ : ValueConstantRangeAA.getKnown();
+ // We can only use the lower bound of the range as a conservative offset
+ // because the upper bound may be larger than what the value can really be.
+ ROffset = Range.getSignedMin();
+ return true;
+ };
+
+ return Val->stripAndAccumulateConstantOffsets(DL, Offset, AllowNonInbounds,
+ AttributorAnalysis);
+}
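+
+// Editorial example (illustrative, not part of the original patch): for
+//
+// %q = getelementptr inbounds i8, i8* %p, i64 %i
+//
+// with a (known or assumed) constant range of [4, 16) for %i, the callback
+// above reports ROffset = 4, so the accumulated offset is the smallest offset
+// the access can have; callers must treat it as a lower bound only.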
+
+static const Value *getMinimalBaseOfAccsesPointerOperand(
+ Attributor &A, const AbstractAttribute &QueryingAA, const Instruction *I,
+ int64_t &BytesOffset, const DataLayout &DL, bool AllowNonInbounds = false) {
+ const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
+ if (!Ptr)
+ return nullptr;
+ APInt OffsetAPInt(DL.getIndexTypeSizeInBits(Ptr->getType()), 0);
+ const Value *Base = stripAndAccumulateMinimalOffsets(
+ A, QueryingAA, Ptr, DL, OffsetAPInt, AllowNonInbounds);
+
+ BytesOffset = OffsetAPInt.getSExtValue();
+ return Base;
+}
+
+static const Value *
+getBasePointerOfAccessPointerOperand(const Instruction *I, int64_t &BytesOffset,
+ const DataLayout &DL,
+ bool AllowNonInbounds = false) {
+ const Value *Ptr = getPointerOperand(I, /* AllowVolatile */ false);
+ if (!Ptr)
+ return nullptr;
+
+ return GetPointerBaseWithConstantOffset(Ptr, BytesOffset, DL,
+ AllowNonInbounds);
+}
+
+/// Helper function to clamp a state \p S of type \p StateType with the
+/// information in \p R and indicate/return if \p S did change (as-in update is
+/// required to be run again).
+template <typename StateType>
+ChangeStatus clampStateAndIndicateChange(StateType &S, const StateType &R) {
+ auto Assumed = S.getAssumed();
+ S ^= R;
+ return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+}
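+
+// Editorial note (illustrative, not part of the original patch): if S still
+// assumes "nonnull" but R does not, S loses that assumption and
+// ChangeStatus::CHANGED is returned; if R carries at least the assumptions S
+// has, S stays the same and ChangeStatus::UNCHANGED is returned.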
+
+/// Clamp the information known for all returned values of a function
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampReturnedValueStates(Attributor &A, const AAType &QueryingAA,
+ StateType &S) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Clamp return value states for "
+ << QueryingAA << " into " << S << "\n");
+
+ assert((QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_RETURNED ||
+ QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_CALL_SITE_RETURNED) &&
+ "Can only clamp returned value states for a function returned or call "
+ "site returned position!");
+
+ // Use an optional state as there might not be any return values and we want
+ // to join (IntegerState::operator&) the states of all that are.
+ Optional<StateType> T;
+
+ // Callback for each possibly returned value.
+ auto CheckReturnValue = [&](Value &RV) -> bool {
+ const IRPosition &RVPos = IRPosition::value(RV);
+ const AAType &AA = A.getAAFor<AAType>(QueryingAA, RVPos);
+ LLVM_DEBUG(dbgs() << "[Attributor] RV: " << RV << " AA: " << AA.getAsStr()
+ << " @ " << RVPos << "\n");
+ const StateType &AAS = static_cast<const StateType &>(AA.getState());
+ if (T.hasValue())
+ *T &= AAS;
+ else
+ T = AAS;
+ LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " RV State: " << T
+ << "\n");
+ return T->isValidState();
+ };
+
+ if (!A.checkForAllReturnedValues(CheckReturnValue, QueryingAA))
+ S.indicatePessimisticFixpoint();
+ else if (T.hasValue())
+ S ^= *T;
+}
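+
+// Editorial example (illustrative, not part of the original patch): for
+// roughly
+//
+// int *f(int *a, int *b, bool c) { return c ? a : b; }
+//
+// the returned position of f can only be nonnull if both potentially returned
+// values, a and b, are nonnull; the join (&=) above keeps exactly the
+// information shared by all returned values.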
+
+/// Helper class for generic deduction: return value -> returned position.
+template <typename AAType, typename BaseType,
+ typename StateType = typename BaseType::StateType>
+struct AAReturnedFromReturnedValues : public BaseType {
+ AAReturnedFromReturnedValues(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ StateType S(StateType::getBestState(this->getState()));
+ clampReturnedValueStates<AAType, StateType>(A, *this, S);
+ // TODO: If we know we visited all returned values, thus none are assumed
+ // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange<StateType>(this->getState(), S);
+ }
+};
+
+/// Clamp the information known at all call sites for a given argument
+/// (identified by \p QueryingAA) into \p S.
+template <typename AAType, typename StateType = typename AAType::StateType>
+static void clampCallSiteArgumentStates(Attributor &A, const AAType &QueryingAA,
+ StateType &S) {
+ LLVM_DEBUG(dbgs() << "[Attributor] Clamp call site argument states for "
+ << QueryingAA << " into " << S << "\n");
+
+ assert(QueryingAA.getIRPosition().getPositionKind() ==
+ IRPosition::IRP_ARGUMENT &&
+ "Can only clamp call site argument states for an argument position!");
+
+ // Use an optional state as there might not be any call site arguments and we
+ // want to join (IntegerState::operator&) the states of all that are.
+ Optional<StateType> T;
+
+ // The argument number which is also the call site argument number.
+ unsigned ArgNo = QueryingAA.getIRPosition().getArgNo();
+
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ const IRPosition &ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+ // Check if a corresponding argument was found or if it is not associated
+ // (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ const AAType &AA = A.getAAFor<AAType>(QueryingAA, ACSArgPos);
+ LLVM_DEBUG(dbgs() << "[Attributor] ACS: " << *ACS.getInstruction()
+ << " AA: " << AA.getAsStr() << " @" << ACSArgPos << "\n");
+ const StateType &AAS = static_cast<const StateType &>(AA.getState());
+ if (T.hasValue())
+ *T &= AAS;
+ else
+ T = AAS;
+ LLVM_DEBUG(dbgs() << "[Attributor] AA State: " << AAS << " CSA State: " << T
+ << "\n");
+ return T->isValidState();
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(CallSiteCheck, QueryingAA, true,
+ AllCallSitesKnown))
+ S.indicatePessimisticFixpoint();
+ else if (T.hasValue())
+ S ^= *T;
+}
+
+/// Helper class for generic deduction: call site argument -> argument position.
+template <typename AAType, typename BaseType,
+ typename StateType = typename AAType::StateType>
+struct AAArgumentFromCallSiteArguments : public BaseType {
+ AAArgumentFromCallSiteArguments(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ StateType S(StateType::getBestState(this->getState()));
+ clampCallSiteArgumentStates<AAType, StateType>(A, *this, S);
+ // TODO: If we know we visited all incoming values, thus none are assumed
+ // dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange<StateType>(this->getState(), S);
+ }
+};
+
+/// Helper class for generic replication: function returned -> cs returned.
+template <typename AAType, typename BaseType,
+ typename StateType = typename BaseType::StateType>
+struct AACallSiteReturnedFromReturned : public BaseType {
+ AACallSiteReturnedFromReturned(const IRPosition &IRP, Attributor &A)
+ : BaseType(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ assert(this->getIRPosition().getPositionKind() ==
+ IRPosition::IRP_CALL_SITE_RETURNED &&
+ "Can only wrap function returned positions for call site returned "
+ "positions!");
+ auto &S = this->getState();
+
+ const Function *AssociatedFunction =
+ this->getIRPosition().getAssociatedFunction();
+ if (!AssociatedFunction)
+ return S.indicatePessimisticFixpoint();
+
+ IRPosition FnPos = IRPosition::returned(*AssociatedFunction);
+ const AAType &AA = A.getAAFor<AAType>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ S, static_cast<const StateType &>(AA.getState()));
+ }
+};
+
+/// Helper function to accumulate uses.
+template <class AAType, typename StateType = typename AAType::StateType>
+static void followUsesInContext(AAType &AA, Attributor &A,
+ MustBeExecutedContextExplorer &Explorer,
+ const Instruction *CtxI,
+ SetVector<const Use *> &Uses,
+ StateType &State) {
+ auto EIt = Explorer.begin(CtxI), EEnd = Explorer.end(CtxI);
+ for (unsigned u = 0; u < Uses.size(); ++u) {
+ const Use *U = Uses[u];
+ if (const Instruction *UserI = dyn_cast<Instruction>(U->getUser())) {
+ bool Found = Explorer.findInContextOf(UserI, EIt, EEnd);
+ if (Found && AA.followUseInMBEC(A, U, UserI, State))
+ for (const Use &Us : UserI->uses())
+ Uses.insert(&Us);
+ }
+ }
+}
+
+/// Use the must-be-executed-context around \p I to add information into \p S.
+/// The AAType class is required to have `followUseInMBEC` method with the
+/// following signature and behaviour:
+///
+/// bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+///                      StateType &State)
+/// U - Underlying use.
+/// I - The user of \p U.
+/// Returns true if the value should be tracked transitively.
+///
+template <class AAType, typename StateType = typename AAType::StateType>
+static void followUsesInMBEC(AAType &AA, Attributor &A, StateType &S,
+ Instruction &CtxI) {
+
+ // Container for (transitive) uses of the associated value.
+ SetVector<const Use *> Uses;
+ for (const Use &U : AA.getIRPosition().getAssociatedValue().uses())
+ Uses.insert(&U);
+
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+
+ followUsesInContext<AAType>(AA, A, Explorer, &CtxI, Uses, S);
+
+ if (S.isAtFixpoint())
+ return;
+
+ SmallVector<const BranchInst *, 4> BrInsts;
+ auto Pred = [&](const Instruction *I) {
+ if (const BranchInst *Br = dyn_cast<BranchInst>(I))
+ if (Br->isConditional())
+ BrInsts.push_back(Br);
+ return true;
+ };
+
+ // Here, we accumulate conditional branch instructions in the context. We
+ // explore the successor paths of each branch and collect their known states.
+ // The conjunction of the child states of one branch forms its parent state,
+ // and the disjunction of all parent states is merged into the known state.
+ // Let ParentState_i be the state for the i-th branch instruction in the
+ // context; ChildStates are created for its successors respectively.
+ //
+ // ParentS_1 = ChildS_{1, 1} /\ ChildS_{1, 2} /\ ... /\ ChildS_{1, n_1}
+ // ParentS_2 = ChildS_{2, 1} /\ ChildS_{2, 2} /\ ... /\ ChildS_{2, n_2}
+ // ...
+ // ParentS_m = ChildS_{m, 1} /\ ChildS_{m, 2} /\ ... /\ ChildS_{m, n_m}
+ //
+ // Known State |= ParentS_1 \/ ParentS_2 \/... \/ ParentS_m
+ //
+ // FIXME: Currently, recursive branches are not handled. For example, we
+ // can't deduce that ptr must be dereferenced in the function below.
+ //
+ // void f(int a, int b, int *ptr) {
+ // if (a)
+ // if (b) {
+ // *ptr = 0;
+ // } else {
+ // *ptr = 1;
+ // }
+ // else {
+ // if (b) {
+ // *ptr = 0;
+ // } else {
+ // *ptr = 1;
+ // }
+ // }
+ // }
+
+ Explorer.checkForAllContext(&CtxI, Pred);
+ for (const BranchInst *Br : BrInsts) {
+ StateType ParentState;
+
+ // The known state of the parent state is a conjunction of children's
+ // known states so it is initialized with a best state.
+ ParentState.indicateOptimisticFixpoint();
+
+ for (const BasicBlock *BB : Br->successors()) {
+ StateType ChildState;
+
+ size_t BeforeSize = Uses.size();
+ followUsesInContext(AA, A, Explorer, &BB->front(), Uses, ChildState);
+
+ // Erase uses which only appear in the child.
+ for (auto It = Uses.begin() + BeforeSize; It != Uses.end();)
+ It = Uses.erase(It);
+
+ ParentState &= ChildState;
+ }
+
+ // Use only known state.
+ S += ParentState;
+ }
+}
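+
+// Editorial example (illustrative, not part of the original patch): in
+//
+// void f(int *ptr, bool c) {
+//   if (c)
+//     *ptr = 0;
+//   else
+//     *ptr = 1;
+// }
+//
+// both successors of the conditional branch dereference ptr, so the
+// conjunction of the child states keeps the dereference and the known state
+// for ptr can be strengthened; the nested version of this is what the FIXME
+// above refers to.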
+
+/// -----------------------NoUnwind Function Attribute--------------------------
+
+struct AANoUnwindImpl : AANoUnwind {
+ AANoUnwindImpl(const IRPosition &IRP, Attributor &A) : AANoUnwind(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nounwind" : "may-unwind";
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto Opcodes = {
+ (unsigned)Instruction::Invoke, (unsigned)Instruction::CallBr,
+ (unsigned)Instruction::Call, (unsigned)Instruction::CleanupRet,
+ (unsigned)Instruction::CatchSwitch, (unsigned)Instruction::Resume};
+
+ auto CheckForNoUnwind = [&](Instruction &I) {
+ if (!I.mayThrow())
+ return true;
+
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const auto &NoUnwindAA =
+ A.getAAFor<AANoUnwind>(*this, IRPosition::callsite_function(*CB));
+ return NoUnwindAA.isAssumedNoUnwind();
+ }
+ return false;
+ };
+
+ if (!A.checkForAllInstructions(CheckForNoUnwind, *this, Opcodes))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+struct AANoUnwindFunction final : public AANoUnwindImpl {
+ AANoUnwindFunction(const IRPosition &IRP, Attributor &A)
+ : AANoUnwindImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nounwind) }
+};
+
+/// NoUnwind attribute deduction for a call site.
+struct AANoUnwindCallSite final : AANoUnwindImpl {
+ AANoUnwindCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoUnwindImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoUnwindImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoUnwind>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AANoUnwind::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nounwind); }
+};
+
+/// --------------------- Function Return Values -------------------------------
+
+/// "Attribute" that collects all potential returned values and the return
+/// instructions that they arise from.
+///
+/// If there is a unique returned value R, the manifest method will:
+/// - mark R with the "returned" attribute, if R is an argument.
+class AAReturnedValuesImpl : public AAReturnedValues, public AbstractState {
+
+ /// Mapping of values potentially returned by the associated function to the
+ /// return instructions that might return them.
+ MapVector<Value *, SmallSetVector<ReturnInst *, 4>> ReturnedValues;
+
+ /// Mapping to remember the number of returned values for a call site such
+ /// that we can avoid updates if nothing changed.
+ DenseMap<const CallBase *, unsigned> NumReturnedValuesPerKnownAA;
+
+ /// Set of unresolved calls returned by the associated function.
+ SmallSetVector<CallBase *, 4> UnresolvedCalls;
+
+ /// State flags
+ ///
+ ///{
+ bool IsFixed = false;
+ bool IsValidState = true;
+ ///}
+
+public:
+ AAReturnedValuesImpl(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValues(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // Reset the state.
+ IsFixed = false;
+ IsValidState = true;
+ ReturnedValues.clear();
+
+ Function *F = getAssociatedFunction();
+ if (!F) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+ assert(!F->getReturnType()->isVoidTy() &&
+ "Did not expect a void return type!");
+
+ // The map from instruction opcodes to those instructions in the function.
+ auto &OpcodeInstMap = A.getInfoCache().getOpcodeInstMapForFunction(*F);
+
+ // Look through all arguments, if one is marked as returned we are done.
+ for (Argument &Arg : F->args()) {
+ if (Arg.hasReturnedAttr()) {
+ auto &ReturnInstSet = ReturnedValues[&Arg];
+ if (auto *Insts = OpcodeInstMap.lookup(Instruction::Ret))
+ for (Instruction *RI : *Insts)
+ ReturnInstSet.insert(cast<ReturnInst>(RI));
+
+ indicateOptimisticFixpoint();
+ return;
+ }
+ }
+
+ if (!A.isFunctionIPOAmendable(*F))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override;
+
+ /// See AbstractAttribute::getState(...).
+ AbstractState &getState() override { return *this; }
+
+ /// See AbstractAttribute::getState(...).
+ const AbstractState &getState() const override { return *this; }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ llvm::iterator_range<iterator> returned_values() override {
+ return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
+ }
+
+ llvm::iterator_range<const_iterator> returned_values() const override {
+ return llvm::make_range(ReturnedValues.begin(), ReturnedValues.end());
+ }
+
+ const SmallSetVector<CallBase *, 4> &getUnresolvedCalls() const override {
+ return UnresolvedCalls;
+ }
+
+ /// Return the number of potential return values, -1 if unknown.
+ size_t getNumReturnValues() const override {
+ return isValidState() ? ReturnedValues.size() : -1;
+ }
+
+ /// Return an assumed unique return value if a single candidate is found. If
+ /// there cannot be one, return a nullptr. If it is not clear yet, return the
+ /// Optional::NoneType.
+ Optional<Value *> getAssumedUniqueReturnValue(Attributor &A) const;
+
+ /// See AbstractState::checkForAllReturnedValues(...).
+ bool checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
+ const override;
+
+ /// Pretty print the attribute similar to the IR representation.
+ const std::string getAsStr() const override;
+
+ /// See AbstractState::isAtFixpoint().
+ bool isAtFixpoint() const override { return IsFixed; }
+
+ /// See AbstractState::isValidState().
+ bool isValidState() const override { return IsValidState; }
+
+ /// See AbstractState::indicateOptimisticFixpoint(...).
+ ChangeStatus indicateOptimisticFixpoint() override {
+ IsFixed = true;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ IsFixed = true;
+ IsValidState = false;
+ return ChangeStatus::CHANGED;
+ }
+};
+
+ChangeStatus AAReturnedValuesImpl::manifest(Attributor &A) {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ // Bookkeeping.
+ assert(isValidState());
+ STATS_DECLTRACK(KnownReturnValues, FunctionReturn,
+ "Number of function with known return values");
+
+ // Check if we have an assumed unique return value that we could manifest.
+ Optional<Value *> UniqueRV = getAssumedUniqueReturnValue(A);
+
+ if (!UniqueRV.hasValue() || !UniqueRV.getValue())
+ return Changed;
+
+ // Bookkeeping.
+ STATS_DECLTRACK(UniqueReturnValue, FunctionReturn,
+ "Number of function with unique return");
+
+ // Callback to replace the uses of CB with the constant C.
+ auto ReplaceCallSiteUsersWith = [&A](CallBase &CB, Constant &C) {
+ if (CB.use_empty())
+ return ChangeStatus::UNCHANGED;
+ if (A.changeValueAfterManifest(CB, C))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ };
+
+ // If the assumed unique return value is an argument, annotate it.
+ if (auto *UniqueRVArg = dyn_cast<Argument>(UniqueRV.getValue())) {
+ if (UniqueRVArg->getType()->canLosslesslyBitCastTo(
+ getAssociatedFunction()->getReturnType())) {
+ getIRPosition() = IRPosition::argument(*UniqueRVArg);
+ Changed = IRAttribute::manifest(A);
+ }
+ } else if (auto *RVC = dyn_cast<Constant>(UniqueRV.getValue())) {
+ // We can replace the returned value with the unique returned constant.
+ Value &AnchorValue = getAnchorValue();
+ if (Function *F = dyn_cast<Function>(&AnchorValue)) {
+ for (const Use &U : F->uses())
+ if (CallBase *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U)) {
+ Constant *RVCCast =
+ CB->getType() == RVC->getType()
+ ? RVC
+ : ConstantExpr::getTruncOrBitCast(RVC, CB->getType());
+ Changed = ReplaceCallSiteUsersWith(*CB, *RVCCast) | Changed;
+ }
+ } else {
+ assert(isa<CallBase>(AnchorValue) &&
+ "Expcected a function or call base anchor!");
+ Constant *RVCCast =
+ AnchorValue.getType() == RVC->getType()
+ ? RVC
+ : ConstantExpr::getTruncOrBitCast(RVC, AnchorValue.getType());
+ Changed = ReplaceCallSiteUsersWith(cast<CallBase>(AnchorValue), *RVCCast);
+ }
+ if (Changed == ChangeStatus::CHANGED)
+ STATS_DECLTRACK(UniqueConstantReturnValue, FunctionReturn,
+ "Number of function returns replaced by constant return");
+ }
+
+ return Changed;
+}
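+
+// Editorial example (illustrative, not part of the original patch): for
+//
+// static int *id(int *p) { return p; }
+//
+// the unique returned value is the argument p, so manifest adds the
+// `returned` attribute to p; for a function whose unique returned value is a
+// constant, e.g. `static int zero() { return 0; }`, the uses of its call
+// sites are replaced by that constant instead.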
+
+const std::string AAReturnedValuesImpl::getAsStr() const {
+ return (isAtFixpoint() ? "returns(#" : "may-return(#") +
+ (isValidState() ? std::to_string(getNumReturnValues()) : "?") +
+ ")[#UC: " + std::to_string(UnresolvedCalls.size()) + "]";
+}
+
+Optional<Value *>
+AAReturnedValuesImpl::getAssumedUniqueReturnValue(Attributor &A) const {
+ // If checkForAllReturnedValues provides a unique value, ignoring potential
+ // undef values that can also be present, it is assumed to be the actual
+ // return value and forwarded to the caller of this method. If there are
+ // multiple, a nullptr is returned indicating there cannot be a unique
+ // returned value.
+ Optional<Value *> UniqueRV;
+
+ auto Pred = [&](Value &RV) -> bool {
+ // If we found a second returned value and neither the current nor the saved
+ // one is an undef, there is no unique returned value. Undefs are special
+ // since we can pretend they have any value.
+ if (UniqueRV.hasValue() && UniqueRV != &RV &&
+ !(isa<UndefValue>(RV) || isa<UndefValue>(UniqueRV.getValue()))) {
+ UniqueRV = nullptr;
+ return false;
+ }
+
+ // Do not overwrite a value with an undef.
+ if (!UniqueRV.hasValue() || !isa<UndefValue>(RV))
+ UniqueRV = &RV;
+
+ return true;
+ };
+
+ if (!A.checkForAllReturnedValues(Pred, *this))
+ UniqueRV = nullptr;
+
+ return UniqueRV;
+}
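+
+// Editorial note (illustrative, not part of the original patch): a function
+// that returns %x on one path and undef on another still has %x as the
+// assumed unique return value, because undef may take any value; two distinct
+// non-undef candidates collapse to nullptr ("no unique value"), and None is
+// only returned while no candidate has been seen yet.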
+
+bool AAReturnedValuesImpl::checkForAllReturnedValuesAndReturnInsts(
+ function_ref<bool(Value &, const SmallSetVector<ReturnInst *, 4> &)> Pred)
+ const {
+ if (!isValidState())
+ return false;
+
+ // Check all returned values but ignore call sites as long as we have not
+ // encountered an overdefined one during an update.
+ for (auto &It : ReturnedValues) {
+ Value *RV = It.first;
+
+ CallBase *CB = dyn_cast<CallBase>(RV);
+ if (CB && !UnresolvedCalls.count(CB))
+ continue;
+
+ if (!Pred(*RV, It.second))
+ return false;
+ }
+
+ return true;
+}
+
+ChangeStatus AAReturnedValuesImpl::updateImpl(Attributor &A) {
+ size_t NumUnresolvedCalls = UnresolvedCalls.size();
+ bool Changed = false;
+
+ // State used in the value traversals starting in returned values.
+ struct RVState {
+ // The map in which we collect return values -> return instrs.
+ decltype(ReturnedValues) &RetValsMap;
+ // The flag to indicate a change.
+ bool &Changed;
+ // The return instrs we come from.
+ SmallSetVector<ReturnInst *, 4> RetInsts;
+ };
+
+ // Callback for a leaf value returned by the associated function.
+ auto VisitValueCB = [](Value &Val, const Instruction *, RVState &RVS,
+ bool) -> bool {
+ auto Size = RVS.RetValsMap[&Val].size();
+ RVS.RetValsMap[&Val].insert(RVS.RetInsts.begin(), RVS.RetInsts.end());
+ bool Inserted = RVS.RetValsMap[&Val].size() != Size;
+ RVS.Changed |= Inserted;
+ LLVM_DEBUG({
+ if (Inserted)
+ dbgs() << "[AAReturnedValues] 1 Add new returned value " << Val
+ << " => " << RVS.RetInsts.size() << "\n";
+ });
+ return true;
+ };
+
+ // Helper method to invoke the generic value traversal.
+ auto VisitReturnedValue = [&](Value &RV, RVState &RVS,
+ const Instruction *CtxI) {
+ IRPosition RetValPos = IRPosition::value(RV);
+ return genericValueTraversal<AAReturnedValues, RVState>(
+ A, RetValPos, *this, RVS, VisitValueCB, CtxI,
+ /* UseValueSimplify */ false);
+ };
+
+ // Callback for all "return instructions" live in the associated function.
+ auto CheckReturnInst = [this, &VisitReturnedValue, &Changed](Instruction &I) {
+ ReturnInst &Ret = cast<ReturnInst>(I);
+ RVState RVS({ReturnedValues, Changed, {}});
+ RVS.RetInsts.insert(&Ret);
+ return VisitReturnedValue(*Ret.getReturnValue(), RVS, &I);
+ };
+
+ // Start by discovering returned values from all live return instructions in
+ // the associated function.
+ if (!A.checkForAllInstructions(CheckReturnInst, *this, {Instruction::Ret}))
+ return indicatePessimisticFixpoint();
+
+ // Once returned values "directly" present in the code are handled we try to
+ // resolve returned calls. To avoid modifications to the ReturnedValues map
+ // while we iterate over it, we keep a record of potential new entries in a
+ // second map of the same type, NewRVsMap.
+ decltype(ReturnedValues) NewRVsMap;
+
+ auto HandleReturnValue = [&](Value *RV, SmallSetVector<ReturnInst *, 4> &RIs) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Returned value: " << *RV
+ << " by #" << RIs.size() << " RIs\n");
+ CallBase *CB = dyn_cast<CallBase>(RV);
+ if (!CB || UnresolvedCalls.count(CB))
+ return;
+
+ if (!CB->getCalledFunction()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
+ << "\n");
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // TODO: use the function scope once we have call site AAReturnedValues.
+ const auto &RetValAA = A.getAAFor<AAReturnedValues>(
+ *this, IRPosition::function(*CB->getCalledFunction()));
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Found another AAReturnedValues: "
+ << RetValAA << "\n");
+
+ // Skip dead ends: if we do not know anything about the returned call,
+ // we mark it as unresolved and it will stay that way.
+ if (!RetValAA.getState().isValidState()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Unresolved call: " << *CB
+ << "\n");
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // Do not try to learn partial information. If the callee has unresolved
+ // return values we will treat the call as unresolved/opaque.
+ auto &RetValAAUnresolvedCalls = RetValAA.getUnresolvedCalls();
+ if (!RetValAAUnresolvedCalls.empty()) {
+ UnresolvedCalls.insert(CB);
+ return;
+ }
+
+ // Now check if we can track transitively returned values. If possible, that
+ // is, if all returned values can be represented in the current scope, do so.
+ bool Unresolved = false;
+ for (auto &RetValAAIt : RetValAA.returned_values()) {
+ Value *RetVal = RetValAAIt.first;
+ if (isa<Argument>(RetVal) || isa<CallBase>(RetVal) ||
+ isa<Constant>(RetVal))
+ continue;
+ // Anything that did not fit in the above categories cannot be resolved, so
+ // mark the call as unresolved.
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] transitively returned value "
+ "cannot be translated: "
+ << *RetVal << "\n");
+ UnresolvedCalls.insert(CB);
+ Unresolved = true;
+ break;
+ }
+
+ if (Unresolved)
+ return;
+
+ // Now track transitively returned values.
+ unsigned &NumRetAA = NumReturnedValuesPerKnownAA[CB];
+ if (NumRetAA == RetValAA.getNumReturnValues()) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Skip call as it has not "
+ "changed since it was seen last\n");
+ return;
+ }
+ NumRetAA = RetValAA.getNumReturnValues();
+
+ for (auto &RetValAAIt : RetValAA.returned_values()) {
+ Value *RetVal = RetValAAIt.first;
+ if (Argument *Arg = dyn_cast<Argument>(RetVal)) {
+ // Arguments are mapped to call site operands and we begin the traversal
+ // again.
+ bool Unused = false;
+ RVState RVS({NewRVsMap, Unused, RetValAAIt.second});
+ VisitReturnedValue(*CB->getArgOperand(Arg->getArgNo()), RVS, CB);
+ continue;
+ } else if (isa<CallBase>(RetVal)) {
+ // Call sites are resolved by the callee attribute over time, no need to
+ // do anything for us.
+ continue;
+ } else if (isa<Constant>(RetVal)) {
+ // Constants are valid everywhere, we can simply take them.
+ NewRVsMap[RetVal].insert(RIs.begin(), RIs.end());
+ continue;
+ }
+ }
+ };
+
+ for (auto &It : ReturnedValues)
+ HandleReturnValue(It.first, It.second);
+
+ // Because processing the new information can again lead to new return values
+ // we have to be careful and iterate until this iteration is complete. The
+ // idea is that we are in a stable state at the end of an update. All return
+ // values have been handled and properly categorized. We might not update
+ // again if we have not requested a non-fix attribute so we cannot "wait" for
+ // the next update to analyze a new return value.
+ while (!NewRVsMap.empty()) {
+ auto It = std::move(NewRVsMap.back());
+ NewRVsMap.pop_back();
+
+ assert(!It.second.empty() && "Entry does not add anything.");
+ auto &ReturnInsts = ReturnedValues[It.first];
+ for (ReturnInst *RI : It.second)
+ if (ReturnInsts.insert(RI)) {
+ LLVM_DEBUG(dbgs() << "[AAReturnedValues] Add new returned value "
+ << *It.first << " => " << *RI << "\n");
+ HandleReturnValue(It.first, ReturnInsts);
+ Changed = true;
+ }
+ }
+
+ Changed |= (NumUnresolvedCalls != UnresolvedCalls.size());
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+}
+
+struct AAReturnedValuesFunction final : public AAReturnedValuesImpl {
+ AAReturnedValuesFunction(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(returned) }
+};
+
+/// Returned values information for a call site.
+struct AAReturnedValuesCallSite final : AAReturnedValuesImpl {
+ AAReturnedValuesCallSite(const IRPosition &IRP, Attributor &A)
+ : AAReturnedValuesImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites instead of
+ // redirecting requests to the callee.
+ llvm_unreachable("Abstract attributes for returned values are not "
+ "supported for call sites yet!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// ------------------------ NoSync Function Attribute -------------------------
+
+struct AANoSyncImpl : AANoSync {
+ AANoSyncImpl(const IRPosition &IRP, Attributor &A) : AANoSync(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nosync" : "may-sync";
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// Helper function used to determine whether an instruction is a non-relaxed
+ /// atomic, i.e., an atomic instruction whose ordering is neither unordered
+ /// nor monotonic.
+ static bool isNonRelaxedAtomic(Instruction *I);
+
+ /// Helper function used to determine whether an instruction is volatile.
+ static bool isVolatile(Instruction *I);
+
+ /// Helper function used to check if an intrinsic is nosync; currently only
+ /// the mem* intrinsics (memcpy, memmove, memset) are handled.
+ static bool isNoSyncIntrinsic(Instruction *I);
+};
+
+bool AANoSyncImpl::isNonRelaxedAtomic(Instruction *I) {
+ if (!I->isAtomic())
+ return false;
+
+ AtomicOrdering Ordering;
+ switch (I->getOpcode()) {
+ case Instruction::AtomicRMW:
+ Ordering = cast<AtomicRMWInst>(I)->getOrdering();
+ break;
+ case Instruction::Store:
+ Ordering = cast<StoreInst>(I)->getOrdering();
+ break;
+ case Instruction::Load:
+ Ordering = cast<LoadInst>(I)->getOrdering();
+ break;
+ case Instruction::Fence: {
+ auto *FI = cast<FenceInst>(I);
+ if (FI->getSyncScopeID() == SyncScope::SingleThread)
+ return false;
+ Ordering = FI->getOrdering();
+ break;
+ }
+ case Instruction::AtomicCmpXchg: {
+ AtomicOrdering Success = cast<AtomicCmpXchgInst>(I)->getSuccessOrdering();
+ AtomicOrdering Failure = cast<AtomicCmpXchgInst>(I)->getFailureOrdering();
+ // Only if both orderings are relaxed can the operation be treated as
+ // relaxed; otherwise it is non-relaxed.
+ if (Success != AtomicOrdering::Unordered &&
+ Success != AtomicOrdering::Monotonic)
+ return true;
+ if (Failure != AtomicOrdering::Unordered &&
+ Failure != AtomicOrdering::Monotonic)
+ return true;
+ return false;
+ }
+ default:
+ llvm_unreachable(
+ "New atomic operations need to be known in the attributor.");
+ }
+
+ // Relaxed.
+ if (Ordering == AtomicOrdering::Unordered ||
+ Ordering == AtomicOrdering::Monotonic)
+ return false;
+ return true;
+}
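+
+// Editorial example (illustrative, not part of the original patch):
+//
+// %a = load atomic i32, i32* %p monotonic, align 4  ; relaxed, not counted
+// %b = load atomic i32, i32* %p seq_cst, align 4    ; non-relaxed
+// fence syncscope("singlethread") acquire           ; ignored (single thread)
+//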
+
+/// Checks if an intrinsic is nosync. Currently only checks mem* intrinsics.
+/// FIXME: We should improve the handling of intrinsics.
+bool AANoSyncImpl::isNoSyncIntrinsic(Instruction *I) {
+ if (auto *II = dyn_cast<IntrinsicInst>(I)) {
+ switch (II->getIntrinsicID()) {
+ /// Element-wise atomic memory intrinsics can only be unordered,
+ /// therefore nosync.
+ case Intrinsic::memset_element_unordered_atomic:
+ case Intrinsic::memmove_element_unordered_atomic:
+ case Intrinsic::memcpy_element_unordered_atomic:
+ return true;
+ case Intrinsic::memset:
+ case Intrinsic::memmove:
+ case Intrinsic::memcpy:
+ if (!cast<MemIntrinsic>(II)->isVolatile())
+ return true;
+ return false;
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+bool AANoSyncImpl::isVolatile(Instruction *I) {
+ assert(!isa<CallBase>(I) && "Calls should not be checked here");
+
+ switch (I->getOpcode()) {
+ case Instruction::AtomicRMW:
+ return cast<AtomicRMWInst>(I)->isVolatile();
+ case Instruction::Store:
+ return cast<StoreInst>(I)->isVolatile();
+ case Instruction::Load:
+ return cast<LoadInst>(I)->isVolatile();
+ case Instruction::AtomicCmpXchg:
+ return cast<AtomicCmpXchgInst>(I)->isVolatile();
+ default:
+ return false;
+ }
+}
+
+ChangeStatus AANoSyncImpl::updateImpl(Attributor &A) {
+
+ auto CheckRWInstForNoSync = [&](Instruction &I) {
+ /// We are looking for volatile instructions or Non-Relaxed atomics.
+ /// FIXME: We should improve the handling of intrinsics.
+
+ if (isa<IntrinsicInst>(&I) && isNoSyncIntrinsic(&I))
+ return true;
+
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ if (CB->hasFnAttr(Attribute::NoSync))
+ return true;
+
+ const auto &NoSyncAA =
+ A.getAAFor<AANoSync>(*this, IRPosition::callsite_function(*CB));
+ if (NoSyncAA.isAssumedNoSync())
+ return true;
+ return false;
+ }
+
+ if (!isVolatile(&I) && !isNonRelaxedAtomic(&I))
+ return true;
+
+ return false;
+ };
+
+ auto CheckForNoSync = [&](Instruction &I) {
+ // At this point we handled all read/write effects and they are all
+ // nosync, so they can be skipped.
+ if (I.mayReadOrWriteMemory())
+ return true;
+
+ // non-convergent and readnone imply nosync.
+ return !cast<CallBase>(I).isConvergent();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInstForNoSync, *this) ||
+ !A.checkForAllCallLikeInstructions(CheckForNoSync, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+}
+
+struct AANoSyncFunction final : public AANoSyncImpl {
+ AANoSyncFunction(const IRPosition &IRP, Attributor &A)
+ : AANoSyncImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nosync) }
+};
+
+/// NoSync attribute deduction for a call site.
+struct AANoSyncCallSite final : AANoSyncImpl {
+ AANoSyncCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoSyncImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoSyncImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoSync>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(), static_cast<const AANoSync::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nosync); }
+};
+
+/// ------------------------ No-Free Attributes ----------------------------
+
+struct AANoFreeImpl : public AANoFree {
+ AANoFreeImpl(const IRPosition &IRP, Attributor &A) : AANoFree(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForNoFree = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ if (CB.hasFnAttr(Attribute::NoFree))
+ return true;
+
+ const auto &NoFreeAA =
+ A.getAAFor<AANoFree>(*this, IRPosition::callsite_function(CB));
+ return NoFreeAA.isAssumedNoFree();
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForNoFree, *this))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nofree" : "may-free";
+ }
+};
+
+struct AANoFreeFunction final : public AANoFreeImpl {
+ AANoFreeFunction(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(nofree) }
+};
+
+/// NoFree attribute deduction for a call site.
+struct AANoFreeCallSite final : AANoFreeImpl {
+ AANoFreeCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoFreeImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoFree>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(), static_cast<const AANoFree::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(nofree); }
+};
+
+/// NoFree attribute for floating values.
+struct AANoFreeFloating : AANoFreeImpl {
+ AANoFreeFloating(const IRPosition &IRP, Attributor &A)
+ : AANoFreeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_FLOATING_ATTR(nofree)}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const IRPosition &IRP = getIRPosition();
+
+ const auto &NoFreeAA =
+ A.getAAFor<AANoFree>(*this, IRPosition::function_scope(IRP));
+ if (NoFreeAA.isAssumedNoFree())
+ return ChangeStatus::UNCHANGED;
+
+ Value &AssociatedValue = getIRPosition().getAssociatedValue();
+ auto Pred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (CB->isBundleOperand(&U))
+ return false;
+ if (!CB->isArgOperand(&U))
+ return true;
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoFreeArg = A.getAAFor<AANoFree>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+ return NoFreeArg.isAssumedNoFree();
+ }
+
+ if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+ isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+ Follow = true;
+ return true;
+ }
+ if (isa<ReturnInst>(UserI))
+ return true;
+
+ // Unknown user.
+ return false;
+ };
+ if (!A.checkForAllUses(Pred, *this, AssociatedValue))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+/// NoFree attribute for a call site argument.
+struct AANoFreeArgument final : AANoFreeFloating {
+ AANoFreeArgument(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nofree) }
+};
+
+/// NoFree attribute for call site arguments.
+struct AANoFreeCallSiteArgument final : AANoFreeFloating {
+ AANoFreeCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AANoFree>(*this, ArgPos);
+ return clampStateAndIndicateChange(
+ getState(), static_cast<const AANoFree::StateType &>(ArgAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override{STATS_DECLTRACK_CSARG_ATTR(nofree)};
+};
+
+/// NoFree attribute for function return value.
+struct AANoFreeReturned final : AANoFreeFloating {
+ AANoFreeReturned(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("NoFree is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// NoFree attribute deduction for a call site return value.
+struct AANoFreeCallSiteReturned final : AANoFreeFloating {
+ AANoFreeCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoFreeFloating(IRP, A) {}
+
+ ChangeStatus manifest(Attributor &A) override {
+ return ChangeStatus::UNCHANGED;
+ }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nofree) }
+};
+
+/// ------------------------ NonNull Argument Attribute ------------------------
+static int64_t getKnownNonNullAndDerefBytesForUse(
+ Attributor &A, const AbstractAttribute &QueryingAA, Value &AssociatedValue,
+ const Use *U, const Instruction *I, bool &IsNonNull, bool &TrackUse) {
+ TrackUse = false;
+
+ const Value *UseV = U->get();
+ if (!UseV->getType()->isPointerTy())
+ return 0;
+
+ Type *PtrTy = UseV->getType();
+ const Function *F = I->getFunction();
+ bool NullPointerIsDefined =
+ F ? llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()) : true;
+ const DataLayout &DL = A.getInfoCache().getDL();
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->isBundleOperand(U)) {
+ if (RetainedKnowledge RK = getKnowledgeFromUse(
+ U, {Attribute::NonNull, Attribute::Dereferenceable})) {
+ IsNonNull |=
+ (RK.AttrKind == Attribute::NonNull || !NullPointerIsDefined);
+ return RK.ArgValue;
+ }
+ return 0;
+ }
+
+ if (CB->isCallee(U)) {
+ IsNonNull |= !NullPointerIsDefined;
+ return 0;
+ }
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
+ // As long as we only use known information there is no need to track
+ // dependences here.
+ auto &DerefAA = A.getAAFor<AADereferenceable>(QueryingAA, IRP,
+ /* TrackDependence */ false);
+ IsNonNull |= DerefAA.isKnownNonNull();
+ return DerefAA.getKnownDereferenceableBytes();
+ }
+
+ // We need to follow common pointer manipulation uses to the accesses they
+ // feed into. We could try to be smarter and avoid looking through constructs
+ // we do not want to handle for now, e.g., non-inbounds GEPs.
+ if (isa<CastInst>(I)) {
+ TrackUse = true;
+ return 0;
+ }
+
+ if (isa<GetElementPtrInst>(I)) {
+ TrackUse = true;
+ return 0;
+ }
+
+ int64_t Offset;
+ const Value *Base =
+ getMinimalBaseOfAccsesPointerOperand(A, QueryingAA, I, Offset, DL);
+ if (Base) {
+ if (Base == &AssociatedValue &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ int64_t DerefBytes =
+ (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType()) + Offset;
+
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
+ }
+ }
+
+ /// Corner case when an offset is 0.
+ Base = getBasePointerOfAccessPointerOperand(I, Offset, DL,
+ /*AllowNonInbounds*/ true);
+ if (Base) {
+ if (Offset == 0 && Base == &AssociatedValue &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ int64_t DerefBytes =
+ (int64_t)DL.getTypeStoreSize(PtrTy->getPointerElementType());
+ IsNonNull |= !NullPointerIsDefined;
+ return std::max(int64_t(0), DerefBytes);
+ }
+ }
+
+ return 0;
+}
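+
+// Editorial example (illustrative, not part of the original patch): for
+//
+// %q = getelementptr inbounds i32, i32* %p, i64 1
+// store i32 0, i32* %q
+//
+// the store accesses 4 bytes at offset 4 from %p, so the use of %p implies 8
+// dereferenceable bytes and, unless null is a defined address in this address
+// space, nonnull as well.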
+
+struct AANonNullImpl : AANonNull {
+ AANonNullImpl(const IRPosition &IRP, Attributor &A)
+ : AANonNull(IRP, A),
+ NullIsDefined(NullPointerIsDefined(
+ getAnchorScope(),
+ getAssociatedValue().getType()->getPointerAddressSpace())) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Value &V = getAssociatedValue();
+ if (!NullIsDefined &&
+ hasAttr({Attribute::NonNull, Attribute::Dereferenceable},
+ /* IgnoreSubsumingPositions */ false, &A))
+ indicateOptimisticFixpoint();
+ else if (isa<ConstantPointerNull>(V))
+ indicatePessimisticFixpoint();
+ else
+ AANonNull::initialize(A);
+
+ bool CanBeNull = true;
+ if (V.getPointerDereferenceableBytes(A.getDataLayout(), CanBeNull))
+ if (!CanBeNull)
+ indicateOptimisticFixpoint();
+
+ if (!getState().isAtFixpoint())
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AANonNull::StateType &State) {
+ bool IsNonNull = false;
+ bool TrackUse = false;
+ getKnownNonNullAndDerefBytesForUse(A, *this, getAssociatedValue(), U, I,
+ IsNonNull, TrackUse);
+ State.setKnown(IsNonNull);
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "nonnull" : "may-null";
+ }
+
+ /// Flag to determine if the underlying value can be null and still allow
+ /// valid accesses.
+ const bool NullIsDefined;
+};
+
+/// NonNull attribute for a floating value.
+struct AANonNullFloating : public AANonNullImpl {
+ AANonNullFloating(const IRPosition &IRP, Attributor &A)
+ : AANonNullImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ if (!NullIsDefined) {
+ const auto &DerefAA =
+ A.getAAFor<AADereferenceable>(*this, getIRPosition());
+ if (DerefAA.getAssumedDereferenceableBytes())
+ return ChangeStatus::UNCHANGED;
+ }
+
+ const DataLayout &DL = A.getDataLayout();
+
+ DominatorTree *DT = nullptr;
+ AssumptionCache *AC = nullptr;
+ InformationCache &InfoCache = A.getInfoCache();
+ if (const Function *Fn = getAnchorScope()) {
+ DT = InfoCache.getAnalysisResultForFunction<DominatorTreeAnalysis>(*Fn);
+ AC = InfoCache.getAnalysisResultForFunction<AssumptionAnalysis>(*Fn);
+ }
+
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
+ AANonNull::StateType &T, bool Stripped) -> bool {
+ const auto &AA = A.getAAFor<AANonNull>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+ if (!isKnownNonZero(&V, DL, 0, AC, CtxI, DT))
+ T.indicatePessimisticFixpoint();
+ } else {
+ // Use abstract attribute information.
+ const AANonNull::StateType &NS =
+ static_cast<const AANonNull::StateType &>(AA.getState());
+ T ^= NS;
+ }
+ return T.isValidState();
+ };
+
+ StateType T;
+ if (!genericValueTraversal<AANonNull, StateType>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
+};
+
+/// NonNull attribute for function return value.
+struct AANonNullReturned final
+ : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl> {
+ AANonNullReturned(const IRPosition &IRP, Attributor &A)
+ : AAReturnedFromReturnedValues<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(nonnull) }
+};
+
+/// NonNull attribute for function argument.
+struct AANonNullArgument final
+ : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl> {
+ AANonNullArgument(const IRPosition &IRP, Attributor &A)
+ : AAArgumentFromCallSiteArguments<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nonnull) }
+};
+
+struct AANonNullCallSiteArgument final : AANonNullFloating {
+ AANonNullCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANonNullFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nonnull) }
+};
+
+/// NonNull attribute for a call site return position.
+struct AANonNullCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl> {
+ AANonNullCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AACallSiteReturnedFromReturned<AANonNull, AANonNullImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(nonnull) }
+};
+
+/// ------------------------ No-Recurse Attributes ----------------------------
+
+struct AANoRecurseImpl : public AANoRecurse {
+ AANoRecurseImpl(const IRPosition &IRP, Attributor &A) : AANoRecurse(IRP, A) {}
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "norecurse" : "may-recurse";
+ }
+};
+
+struct AANoRecurseFunction final : AANoRecurseImpl {
+ AANoRecurseFunction(const IRPosition &IRP, Attributor &A)
+ : AANoRecurseImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoRecurseImpl::initialize(A);
+ if (const Function *F = getAnchorScope())
+ if (A.getInfoCache().getSccSize(*F) != 1)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+
+ // If all live call sites are known to be no-recurse, we are as well.
+ auto CallSitePred = [&](AbstractCallSite ACS) {
+ const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(
+ *this, IRPosition::function(*ACS.getInstruction()->getFunction()),
+ /* TrackDependence */ false, DepClassTy::OPTIONAL);
+ return NoRecurseAA.isKnownNoRecurse();
+ };
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(CallSitePred, *this, true, AllCallSitesKnown)) {
+ // If we know all call sites and all are known no-recurse, we are done.
+ // If all known call sites, which might not be all that exist, are known
+ // to be no-recurse, we are not done but we can continue to assume
+ // no-recurse. If one of the call sites we have not visited will become
+ // live, another update is triggered.
+ if (AllCallSitesKnown)
+ indicateOptimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // If the above check does not hold anymore we look at the calls.
+ auto CheckForNoRecurse = [&](Instruction &I) {
+ const auto &CB = cast<CallBase>(I);
+ if (CB.hasFnAttr(Attribute::NoRecurse))
+ return true;
+
+ const auto &NoRecurseAA =
+ A.getAAFor<AANoRecurse>(*this, IRPosition::callsite_function(CB));
+ if (!NoRecurseAA.isAssumedNoRecurse())
+ return false;
+
+ // Recursion to the same function
+ if (CB.getCalledFunction() == getAnchorScope())
+ return false;
+
+ return true;
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForNoRecurse, *this))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(norecurse) }
+};
+
+/// NoRecurse attribute deduction for a call site.
+struct AANoRecurseCallSite final : AANoRecurseImpl {
+ AANoRecurseCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoRecurseImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoRecurseImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoRecurse>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AANoRecurse::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(norecurse); }
+};
+
+/// -------------------- Undefined-Behavior Attributes ------------------------
+
+struct AAUndefinedBehaviorImpl : public AAUndefinedBehavior {
+ AAUndefinedBehaviorImpl(const IRPosition &IRP, Attributor &A)
+ : AAUndefinedBehavior(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const size_t UBPrevSize = KnownUBInsts.size();
+ const size_t NoUBPrevSize = AssumedNoUBInsts.size();
+
+ auto InspectMemAccessInstForUB = [&](Instruction &I) {
+ // Skip instructions that are already saved.
+ if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
+ return true;
+
+ // If we reach here, we know we have an instruction
+ // that accesses memory through a pointer operand,
+ // which getPointerOperand() should give us.
+ const Value *PtrOp = getPointerOperand(&I, /* AllowVolatile */ true);
+ assert(PtrOp &&
+ "Expected pointer operand of memory accessing instruction");
+
+ // Either we stopped and the appropriate action was taken,
+ // or we got back a simplified value to continue.
+ Optional<Value *> SimplifiedPtrOp = stopOnUndefOrAssumed(A, PtrOp, &I);
+ if (!SimplifiedPtrOp.hasValue())
+ return true;
+ const Value *PtrOpVal = SimplifiedPtrOp.getValue();
+
+ // A memory access through a pointer is considered UB
+ // only if the pointer is a constant null value.
+ // TODO: Expand it to not only check constant values.
+ if (!isa<ConstantPointerNull>(PtrOpVal)) {
+ AssumedNoUBInsts.insert(&I);
+ return true;
+ }
+ const Type *PtrTy = PtrOpVal->getType();
+
+ // Because we only consider instructions inside functions,
+ // assume that a parent function exists.
+ const Function *F = I.getFunction();
+
+ // A memory access using a constant null pointer is only considered UB
+ // if the null pointer is _not_ defined for the target platform.
+ if (llvm::NullPointerIsDefined(F, PtrTy->getPointerAddressSpace()))
+ AssumedNoUBInsts.insert(&I);
+ else
+ KnownUBInsts.insert(&I);
+ return true;
+ };
+
+ auto InspectBrInstForUB = [&](Instruction &I) {
+ // A conditional branch instruction is considered UB if it has `undef`
+ // condition.
+
+ // Skip instructions that are already saved.
+ if (AssumedNoUBInsts.count(&I) || KnownUBInsts.count(&I))
+ return true;
+
+ // We know we have a branch instruction.
+ auto BrInst = cast<BranchInst>(&I);
+
+ // Unconditional branches are never considered UB.
+ if (BrInst->isUnconditional())
+ return true;
+
+ // Either we stopped and the appropriate action was taken,
+ // or we got back a simplified value to continue.
+ Optional<Value *> SimplifiedCond =
+ stopOnUndefOrAssumed(A, BrInst->getCondition(), BrInst);
+ if (!SimplifiedCond.hasValue())
+ return true;
+ AssumedNoUBInsts.insert(&I);
+ return true;
+ };
+
+ A.checkForAllInstructions(InspectMemAccessInstForUB, *this,
+ {Instruction::Load, Instruction::Store,
+ Instruction::AtomicCmpXchg,
+ Instruction::AtomicRMW},
+ /* CheckBBLivenessOnly */ true);
+ A.checkForAllInstructions(InspectBrInstForUB, *this, {Instruction::Br},
+ /* CheckBBLivenessOnly */ true);
+ if (NoUBPrevSize != AssumedNoUBInsts.size() ||
+ UBPrevSize != KnownUBInsts.size())
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ bool isKnownToCauseUB(Instruction *I) const override {
+ return KnownUBInsts.count(I);
+ }
+
+ bool isAssumedToCauseUB(Instruction *I) const override {
+    // In simple words, if an instruction is not in the set of instructions
+    // assumed to _not_ cause UB, then it is assumed to cause UB (that
+    // includes those in the KnownUBInsts set). The rest of the boilerplate
+    // ensures that it is one of the instructions we test for UB.
+
+ switch (I->getOpcode()) {
+ case Instruction::Load:
+ case Instruction::Store:
+ case Instruction::AtomicCmpXchg:
+ case Instruction::AtomicRMW:
+ return !AssumedNoUBInsts.count(I);
+ case Instruction::Br: {
+ auto BrInst = cast<BranchInst>(I);
+ if (BrInst->isUnconditional())
+ return false;
+ return !AssumedNoUBInsts.count(I);
+ } break;
+ default:
+ return false;
+ }
+ return false;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ if (KnownUBInsts.empty())
+ return ChangeStatus::UNCHANGED;
+ for (Instruction *I : KnownUBInsts)
+ A.changeToUnreachableAfterManifest(I);
+ return ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "undefined-behavior" : "no-ub";
+ }
+
+ /// Note: The correctness of this analysis depends on the fact that the
+ /// following 2 sets will stop changing after some point.
+ /// "Change" here means that their size changes.
+ /// The size of each set is monotonically increasing
+ /// (we only add items to them) and it is upper bounded by the number of
+ /// instructions in the processed function (we can never save more
+ /// elements in either set than this number). Hence, at some point,
+ /// they will stop increasing.
+ /// Consequently, at some point, both sets will have stopped
+ /// changing, effectively making the analysis reach a fixpoint.
+
+ /// Note: These 2 sets are disjoint and an instruction can be considered
+ /// one of 3 things:
+ /// 1) Known to cause UB (AAUndefinedBehavior could prove it) and put it in
+ /// the KnownUBInsts set.
+ /// 2) Assumed to cause UB (in every updateImpl, AAUndefinedBehavior
+ /// has a reason to assume it).
+  /// 3) Assumed to not cause UB. Every other instruction - AAUndefinedBehavior
+ /// could not find a reason to assume or prove that it can cause UB,
+ /// hence it assumes it doesn't. We have a set for these instructions
+ /// so that we don't reprocess them in every update.
+ /// Note however that instructions in this set may cause UB.
+
+protected:
+ /// A set of all live instructions _known_ to cause UB.
+ SmallPtrSet<Instruction *, 8> KnownUBInsts;
+
+private:
+ /// A set of all the (live) instructions that are assumed to _not_ cause UB.
+ SmallPtrSet<Instruction *, 8> AssumedNoUBInsts;
+
+  // Should be called during updates: if we're processing an instruction
+  // \p I that depends on a value \p V, one of the following has to happen:
+ // - If the value is assumed, then stop.
+ // - If the value is known but undef, then consider it UB.
+ // - Otherwise, do specific processing with the simplified value.
+ // We return None in the first 2 cases to signify that an appropriate
+ // action was taken and the caller should stop.
+ // Otherwise, we return the simplified value that the caller should
+ // use for specific processing.
+ Optional<Value *> stopOnUndefOrAssumed(Attributor &A, const Value *V,
+ Instruction *I) {
+ const auto &ValueSimplifyAA =
+ A.getAAFor<AAValueSimplify>(*this, IRPosition::value(*V));
+ Optional<Value *> SimplifiedV =
+ ValueSimplifyAA.getAssumedSimplifiedValue(A);
+ if (!ValueSimplifyAA.isKnown()) {
+ // Don't depend on assumed values.
+ return llvm::None;
+ }
+ if (!SimplifiedV.hasValue()) {
+ // If it is known (which we tested above) but it doesn't have a value,
+ // then we can assume `undef` and hence the instruction is UB.
+ KnownUBInsts.insert(I);
+ return llvm::None;
+ }
+ Value *Val = SimplifiedV.getValue();
+ if (isa<UndefValue>(Val)) {
+ KnownUBInsts.insert(I);
+ return llvm::None;
+ }
+ return Val;
+ }
+};
+
+struct AAUndefinedBehaviorFunction final : AAUndefinedBehaviorImpl {
+ AAUndefinedBehaviorFunction(const IRPosition &IRP, Attributor &A)
+ : AAUndefinedBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECL(UndefinedBehaviorInstruction, Instruction,
+ "Number of instructions known to have UB");
+ BUILD_STAT_NAME(UndefinedBehaviorInstruction, Instruction) +=
+ KnownUBInsts.size();
+ }
+};
+
+/// ------------------------ Will-Return Attributes ----------------------------
+
+// Helper function that checks whether a function has any cycle that we do not
+// know to be bounded.
+// Loops with a known maximum trip count are considered bounded; any other
+// cycle is not.
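+// For illustration: a loop like `for (i = 0; i != 1024; ++i)` has a
+// SCEV-computable maximum trip count and is treated as bounded, whereas
+// `while (p) p = p->next;` or any irreducible cycle is treated as unbounded.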
+static bool mayContainUnboundedCycle(Function &F, Attributor &A) {
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(F);
+ LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(F);
+  // If either SCEV or LoopInfo is not available for the function, we assume
+  // any cycle to be an unbounded cycle.
+  // We use scc_iterator, which uses Tarjan's algorithm to find all maximal
+  // SCCs. To detect whether there is a cycle, finding the maximal SCCs is
+  // sufficient.
+ if (!SE || !LI) {
+ for (scc_iterator<Function *> SCCI = scc_begin(&F); !SCCI.isAtEnd(); ++SCCI)
+ if (SCCI.hasCycle())
+ return true;
+ return false;
+ }
+
+ // If there's irreducible control, the function may contain non-loop cycles.
+ if (mayContainIrreducibleControl(F, LI))
+ return true;
+
+  // Any loop without a known max trip count is considered an unbounded cycle.
+ for (auto *L : LI->getLoopsInPreorder()) {
+ if (!SE->getSmallConstantMaxTripCount(L))
+ return true;
+ }
+ return false;
+}
+
+struct AAWillReturnImpl : public AAWillReturn {
+ AAWillReturnImpl(const IRPosition &IRP, Attributor &A)
+ : AAWillReturn(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAWillReturn::initialize(A);
+
+ Function *F = getAnchorScope();
+ if (!F || !A.isFunctionIPOAmendable(*F) || mayContainUnboundedCycle(*F, A))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForWillReturn = [&](Instruction &I) {
+ IRPosition IPos = IRPosition::callsite_function(cast<CallBase>(I));
+ const auto &WillReturnAA = A.getAAFor<AAWillReturn>(*this, IPos);
+ if (WillReturnAA.isKnownWillReturn())
+ return true;
+ if (!WillReturnAA.isAssumedWillReturn())
+ return false;
+ const auto &NoRecurseAA = A.getAAFor<AANoRecurse>(*this, IPos);
+ return NoRecurseAA.isAssumedNoRecurse();
+ };
+
+ if (!A.checkForAllCallLikeInstructions(CheckForWillReturn, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::getAsStr()
+ const std::string getAsStr() const override {
+ return getAssumed() ? "willreturn" : "may-noreturn";
+ }
+};
+
+struct AAWillReturnFunction final : AAWillReturnImpl {
+ AAWillReturnFunction(const IRPosition &IRP, Attributor &A)
+ : AAWillReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(willreturn) }
+};
+
+/// WillReturn attribute deduction for a call site.
+struct AAWillReturnCallSite final : AAWillReturnImpl {
+ AAWillReturnCallSite(const IRPosition &IRP, Attributor &A)
+ : AAWillReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAWillReturnImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAWillReturn>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AAWillReturn::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(willreturn); }
+};
+
+/// -------------------AAReachability Attribute--------------------------
+
+struct AAReachabilityImpl : AAReachability {
+ AAReachabilityImpl(const IRPosition &IRP, Attributor &A)
+ : AAReachability(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ // TODO: Return the number of reachable queries.
+ return "reachable";
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override { indicatePessimisticFixpoint(); }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+};
+
+struct AAReachabilityFunction final : public AAReachabilityImpl {
+ AAReachabilityFunction(const IRPosition &IRP, Attributor &A)
+ : AAReachabilityImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(reachable); }
+};
+
+/// ------------------------ NoAlias Argument Attribute ------------------------
+
+struct AANoAliasImpl : AANoAlias {
+ AANoAliasImpl(const IRPosition &IRP, Attributor &A) : AANoAlias(IRP, A) {
+ assert(getAssociatedType()->isPointerTy() &&
+ "Noalias is a pointer attribute");
+ }
+
+ const std::string getAsStr() const override {
+ return getAssumed() ? "noalias" : "may-alias";
+ }
+};
+
+/// NoAlias attribute for a floating value.
+struct AANoAliasFloating final : AANoAliasImpl {
+ AANoAliasFloating(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoAliasImpl::initialize(A);
+ Value *Val = &getAssociatedValue();
+ do {
+ CastInst *CI = dyn_cast<CastInst>(Val);
+ if (!CI)
+ break;
+ Value *Base = CI->getOperand(0);
+ if (!Base->hasOneUse())
+ break;
+ Val = Base;
+ } while (true);
+
+ if (!Val->getType()->isPointerTy()) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (isa<AllocaInst>(Val))
+ indicateOptimisticFixpoint();
+ else if (isa<ConstantPointerNull>(Val) &&
+ !NullPointerIsDefined(getAnchorScope(),
+ Val->getType()->getPointerAddressSpace()))
+ indicateOptimisticFixpoint();
+ else if (Val != &getAssociatedValue()) {
+ const auto &ValNoAliasAA =
+ A.getAAFor<AANoAlias>(*this, IRPosition::value(*Val));
+ if (ValNoAliasAA.isKnownNoAlias())
+ indicateOptimisticFixpoint();
+ }
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Implement this.
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(noalias)
+ }
+};
+
+/// NoAlias attribute for an argument.
+struct AANoAliasArgument final
+ : AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl> {
+ using Base = AAArgumentFromCallSiteArguments<AANoAlias, AANoAliasImpl>;
+ AANoAliasArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Base::initialize(A);
+ // See callsite argument attribute and callee argument attribute.
+ if (hasAttr({Attribute::ByVal}))
+ indicateOptimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::update(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // We have to make sure no-alias on the argument does not break
+ // synchronization when this is a callback argument, see also [1] below.
+ // If synchronization cannot be affected, we delegate to the base updateImpl
+    // function; otherwise we give up for now.
+
+ // If the function is no-sync, no-alias cannot break synchronization.
+ const auto &NoSyncAA = A.getAAFor<AANoSync>(
+ *this, IRPosition::function_scope(getIRPosition()));
+ if (NoSyncAA.isAssumedNoSync())
+ return Base::updateImpl(A);
+
+ // If the argument is read-only, no-alias cannot break synchronization.
+ const auto &MemBehaviorAA =
+ A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+ if (MemBehaviorAA.isAssumedReadOnly())
+ return Base::updateImpl(A);
+
+ // If the argument is never passed through callbacks, no-alias cannot break
+ // synchronization.
+ bool AllCallSitesKnown;
+ if (A.checkForAllCallSites(
+ [](AbstractCallSite ACS) { return !ACS.isCallbackCall(); }, *this,
+ true, AllCallSitesKnown))
+ return Base::updateImpl(A);
+
+ // TODO: add no-alias but make sure it doesn't break synchronization by
+ // introducing fake uses. See:
+ // [1] Compiler Optimizations for OpenMP, J. Doerfert and H. Finkel,
+ // International Workshop on OpenMP 2018,
+ // http://compilers.cs.uni-saarland.de/people/doerfert/par_opt18.pdf
+
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(noalias) }
+};
+
+struct AANoAliasCallSiteArgument final : AANoAliasImpl {
+ AANoAliasCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // See callsite argument attribute and callee argument attribute.
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ if (CB.paramHasAttr(getArgNo(), Attribute::NoAlias))
+ indicateOptimisticFixpoint();
+ Value &Val = getAssociatedValue();
+ if (isa<ConstantPointerNull>(Val) &&
+ !NullPointerIsDefined(getAnchorScope(),
+ Val.getType()->getPointerAddressSpace()))
+ indicateOptimisticFixpoint();
+ }
+
+ /// Determine if the underlying value may alias with the call site argument
+  /// \p OtherArgNo of \p CB (= the underlying call site).
+ bool mayAliasWithArgument(Attributor &A, AAResults *&AAR,
+ const AAMemoryBehavior &MemBehaviorAA,
+ const CallBase &CB, unsigned OtherArgNo) {
+ // We do not need to worry about aliasing with the underlying IRP.
+ if (this->getArgNo() == (int)OtherArgNo)
+ return false;
+
+ // If it is not a pointer or pointer vector we do not alias.
+ const Value *ArgOp = CB.getArgOperand(OtherArgNo);
+ if (!ArgOp->getType()->isPtrOrPtrVectorTy())
+ return false;
+
+ auto &CBArgMemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, IRPosition::callsite_argument(CB, OtherArgNo),
+ /* TrackDependence */ false);
+
+ // If the argument is readnone, there is no read-write aliasing.
+ if (CBArgMemBehaviorAA.isAssumedReadNone()) {
+ A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return false;
+ }
+
+ // If the argument is readonly and the underlying value is readonly, there
+ // is no read-write aliasing.
+ bool IsReadOnly = MemBehaviorAA.isAssumedReadOnly();
+ if (CBArgMemBehaviorAA.isAssumedReadOnly() && IsReadOnly) {
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ A.recordDependence(CBArgMemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return false;
+ }
+
+ // We have to utilize actual alias analysis queries so we need the object.
+ if (!AAR)
+ AAR = A.getInfoCache().getAAResultsForFunction(*getAnchorScope());
+
+ // Try to rule it out at the call site.
+ bool IsAliasing = !AAR || !AAR->isNoAlias(&getAssociatedValue(), ArgOp);
+ LLVM_DEBUG(dbgs() << "[NoAliasCSArg] Check alias between "
+ "callsite arguments: "
+ << getAssociatedValue() << " " << *ArgOp << " => "
+ << (IsAliasing ? "" : "no-") << "alias \n");
+
+ return IsAliasing;
+ }
+
+ bool
+ isKnownNoAliasDueToNoAliasPreservation(Attributor &A, AAResults *&AAR,
+ const AAMemoryBehavior &MemBehaviorAA,
+ const AANoAlias &NoAliasAA) {
+ // We can deduce "noalias" if the following conditions hold.
+ // (i) Associated value is assumed to be noalias in the definition.
+ // (ii) Associated value is assumed to be no-capture in all the uses
+ // possibly executed before this callsite.
+ // (iii) There is no other pointer argument which could alias with the
+ // value.
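+    // For illustration (assuming a malloc-like allocation):
+    //   %p = call i8* @malloc(i64 4)   ; (i)  noalias at the definition
+    //   call void @use(i8* %p)         ; (ii) %p not captured before the call
+    //                                  ; (iii) no other pointer argument of
+    //                                  ;       @use may alias %p
+    // => the call site argument %p of @use can be deduced noalias.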
+
+ bool AssociatedValueIsNoAliasAtDef = NoAliasAA.isAssumedNoAlias();
+ if (!AssociatedValueIsNoAliasAtDef) {
+ LLVM_DEBUG(dbgs() << "[AANoAlias] " << getAssociatedValue()
+ << " is not no-alias at the definition\n");
+ return false;
+ }
+
+ A.recordDependence(NoAliasAA, *this, DepClassTy::OPTIONAL);
+
+ const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
+ auto &NoCaptureAA =
+ A.getAAFor<AANoCapture>(*this, VIRP, /* TrackDependence */ false);
+ // Check whether the value is captured in the scope using AANoCapture.
+ // Look at CFG and check only uses possibly executed before this
+ // callsite.
+ auto UsePred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+
+      // If the user is the current instruction and it has a single use.
+ if (UserI == getCtxI() && UserI->hasOneUse())
+ return true;
+
+ const Function *ScopeFn = VIRP.getAnchorScope();
+ if (ScopeFn) {
+ const auto &ReachabilityAA =
+ A.getAAFor<AAReachability>(*this, IRPosition::function(*ScopeFn));
+
+ if (!ReachabilityAA.isAssumedReachable(UserI, getCtxI()))
+ return true;
+
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (CB->isArgOperand(&U)) {
+
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ if (NoCaptureAA.isAssumedNoCapture())
+ return true;
+ }
+ }
+ }
+
+ // For cases which can potentially have more users
+ if (isa<GetElementPtrInst>(U) || isa<BitCastInst>(U) || isa<PHINode>(U) ||
+ isa<SelectInst>(U)) {
+ Follow = true;
+ return true;
+ }
+
+ LLVM_DEBUG(dbgs() << "[AANoAliasCSArg] Unknown user: " << *U << "\n");
+ return false;
+ };
+
+ if (!NoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ if (!A.checkForAllUses(UsePred, *this, getAssociatedValue())) {
+ LLVM_DEBUG(
+ dbgs() << "[AANoAliasCSArg] " << getAssociatedValue()
+ << " cannot be noalias as it is potentially captured\n");
+ return false;
+ }
+ }
+ A.recordDependence(NoCaptureAA, *this, DepClassTy::OPTIONAL);
+
+ // Check there is no other pointer argument which could alias with the
+ // value passed at this call site.
+ // TODO: AbstractCallSite
+ const auto &CB = cast<CallBase>(getAnchorValue());
+ for (unsigned OtherArgNo = 0; OtherArgNo < CB.getNumArgOperands();
+ OtherArgNo++)
+ if (mayAliasWithArgument(A, AAR, MemBehaviorAA, CB, OtherArgNo))
+ return false;
+
+ return true;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // If the argument is readnone we are done as there are no accesses via the
+ // argument.
+ auto &MemBehaviorAA =
+ A.getAAFor<AAMemoryBehavior>(*this, getIRPosition(),
+ /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadNone()) {
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return ChangeStatus::UNCHANGED;
+ }
+
+ const IRPosition &VIRP = IRPosition::value(getAssociatedValue());
+ const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, VIRP,
+ /* TrackDependence */ false);
+
+ AAResults *AAR = nullptr;
+ if (isKnownNoAliasDueToNoAliasPreservation(A, AAR, MemBehaviorAA,
+ NoAliasAA)) {
+ LLVM_DEBUG(
+ dbgs() << "[AANoAlias] No-Alias deduced via no-alias preservation\n");
+ return ChangeStatus::UNCHANGED;
+ }
+
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(noalias) }
+};
+
+/// NoAlias attribute for function return value.
+struct AANoAliasReturned final : AANoAliasImpl {
+ AANoAliasReturned(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+
+ auto CheckReturnValue = [&](Value &RV) -> bool {
+ if (Constant *C = dyn_cast<Constant>(&RV))
+ if (C->isNullValue() || isa<UndefValue>(C))
+ return true;
+
+ /// For now, we can only deduce noalias if we have call sites.
+ /// FIXME: add more support.
+ if (!isa<CallBase>(&RV))
+ return false;
+
+ const IRPosition &RVPos = IRPosition::value(RV);
+ const auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, RVPos);
+ if (!NoAliasAA.isAssumedNoAlias())
+ return false;
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, RVPos);
+ return NoCaptureAA.isAssumedNoCaptureMaybeReturned();
+ };
+
+ if (!A.checkForAllReturnedValues(CheckReturnValue, *this))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(noalias) }
+};
+
+/// NoAlias attribute deduction for a call site return value.
+struct AANoAliasCallSiteReturned final : AANoAliasImpl {
+ AANoAliasCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoAliasImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoAliasImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::returned(*F);
+ auto &FnAA = A.getAAFor<AANoAlias>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(), static_cast<const AANoAlias::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSRET_ATTR(noalias); }
+};
+
+/// -------------------AAIsDead Function Attribute-----------------------
+
+struct AAIsDeadValueImpl : public AAIsDead {
+ AAIsDeadValueImpl(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
+
+ /// See AAIsDead::isAssumedDead().
+ bool isAssumedDead() const override { return getAssumed(); }
+
+ /// See AAIsDead::isKnownDead().
+ bool isKnownDead() const override { return getKnown(); }
+
+ /// See AAIsDead::isAssumedDead(BasicBlock *).
+ bool isAssumedDead(const BasicBlock *BB) const override { return false; }
+
+ /// See AAIsDead::isKnownDead(BasicBlock *).
+ bool isKnownDead(const BasicBlock *BB) const override { return false; }
+
+ /// See AAIsDead::isAssumedDead(Instruction *I).
+ bool isAssumedDead(const Instruction *I) const override {
+ return I == getCtxI() && isAssumedDead();
+ }
+
+ /// See AAIsDead::isKnownDead(Instruction *I).
+ bool isKnownDead(const Instruction *I) const override {
+ return isAssumedDead(I) && getKnown();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return isAssumedDead() ? "assumed-dead" : "assumed-live";
+ }
+
+ /// Check if all uses are assumed dead.
+ bool areAllUsesAssumedDead(Attributor &A, Value &V) {
+ auto UsePred = [&](const Use &U, bool &Follow) { return false; };
+    // Explicitly set the dependence class to required because we want a long
+    // chain of N dependent instructions to be considered live as soon as one
+    // of them is, without going through N update cycles. This is not required
+    // for correctness.
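+    // For illustration: in a chain %v1 -> %v2 -> ... -> %vN where each value's
+    // only user is the next one, a REQUIRED dependence lets the invalidation
+    // of %vN ("it is live") propagate through the whole chain at once instead
+    // of taking N separate update rounds.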
+ return A.checkForAllUses(UsePred, *this, V, DepClassTy::REQUIRED);
+ }
+
+ /// Determine if \p I is assumed to be side-effect free.
+ bool isAssumedSideEffectFree(Attributor &A, Instruction *I) {
+ if (!I || wouldInstructionBeTriviallyDead(I))
+ return true;
+
+ auto *CB = dyn_cast<CallBase>(I);
+ if (!CB || isa<IntrinsicInst>(CB))
+ return false;
+
+ const IRPosition &CallIRP = IRPosition::callsite_function(*CB);
+ const auto &NoUnwindAA = A.getAndUpdateAAFor<AANoUnwind>(
+ *this, CallIRP, /* TrackDependence */ false);
+ if (!NoUnwindAA.isAssumedNoUnwind())
+ return false;
+ if (!NoUnwindAA.isKnownNoUnwind())
+ A.recordDependence(NoUnwindAA, *this, DepClassTy::OPTIONAL);
+
+ const auto &MemBehaviorAA = A.getAndUpdateAAFor<AAMemoryBehavior>(
+ *this, CallIRP, /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadOnly()) {
+ if (!MemBehaviorAA.isKnownReadOnly())
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return true;
+ }
+ return false;
+ }
+};
+
+struct AAIsDeadFloating : public AAIsDeadValueImpl {
+ AAIsDeadFloating(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ if (!isAssumedSideEffectFree(A, I))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ if (!isAssumedSideEffectFree(A, I))
+ return indicatePessimisticFixpoint();
+
+ if (!areAllUsesAssumedDead(A, getAssociatedValue()))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ Value &V = getAssociatedValue();
+ if (auto *I = dyn_cast<Instruction>(&V)) {
+      // If we get here we basically know the users are all dead. We check
+      // isAssumedSideEffectFree again because it might return false by now;
+      // in that case only the users are dead but the instruction (=call) is
+      // still needed.
+ if (isAssumedSideEffectFree(A, I) && !isa<InvokeInst>(I)) {
+ A.deleteAfterManifest(*I);
+ return ChangeStatus::CHANGED;
+ }
+ }
+ if (V.use_empty())
+ return ChangeStatus::UNCHANGED;
+
+ bool UsedAssumedInformation = false;
+ Optional<Constant *> C =
+ A.getAssumedConstant(V, *this, UsedAssumedInformation);
+ if (C.hasValue() && C.getValue())
+ return ChangeStatus::UNCHANGED;
+
+ // Replace the value with undef as it is dead but keep droppable uses around
+ // as they provide information we don't want to give up on just yet.
+ UndefValue &UV = *UndefValue::get(V.getType());
+ bool AnyChange =
+        A.changeValueAfterManifest(V, UV, /* ChangeDroppable */ false);
+ return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(IsDead)
+ }
+};
+
+struct AAIsDeadArgument : public AAIsDeadFloating {
+ AAIsDeadArgument(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (!A.isFunctionIPOAmendable(*getAnchorScope()))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = AAIsDeadFloating::manifest(A);
+ Argument &Arg = *getAssociatedArgument();
+ if (A.isValidFunctionSignatureRewrite(Arg, /* ReplacementTypes */ {}))
+ if (A.registerFunctionSignatureRewrite(
+ Arg, /* ReplacementTypes */ {},
+ Attributor::ArgumentReplacementInfo::CalleeRepairCBTy{},
+ Attributor::ArgumentReplacementInfo::ACSRepairCBTy{})) {
+ Arg.dropDroppableUses();
+ return ChangeStatus::CHANGED;
+ }
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(IsDead) }
+};
+
+struct AAIsDeadCallSiteArgument : public AAIsDeadValueImpl {
+ AAIsDeadCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue()))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AAIsDead>(*this, ArgPos);
+ return clampStateAndIndicateChange(
+ getState(), static_cast<const AAIsDead::StateType &>(ArgAA.getState()));
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ CallBase &CB = cast<CallBase>(getAnchorValue());
+ Use &U = CB.getArgOperandUse(getArgNo());
+ assert(!isa<UndefValue>(U.get()) &&
+ "Expected undef values to be filtered out!");
+ UndefValue &UV = *UndefValue::get(U->getType());
+ if (A.changeUseAfterManifest(U, UV))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(IsDead) }
+};
+
+struct AAIsDeadCallSiteReturned : public AAIsDeadFloating {
+ AAIsDeadCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFloating(IRP, A), IsAssumedSideEffectFree(true) {}
+
+ /// See AAIsDead::isAssumedDead().
+ bool isAssumedDead() const override {
+ return AAIsDeadFloating::isAssumedDead() && IsAssumedSideEffectFree;
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (isa<UndefValue>(getAssociatedValue())) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // We track this separately as a secondary state.
+ IsAssumedSideEffectFree = isAssumedSideEffectFree(A, getCtxI());
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ if (IsAssumedSideEffectFree && !isAssumedSideEffectFree(A, getCtxI())) {
+ IsAssumedSideEffectFree = false;
+ Changed = ChangeStatus::CHANGED;
+ }
+
+ if (!areAllUsesAssumedDead(A, getAssociatedValue()))
+ return indicatePessimisticFixpoint();
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (IsAssumedSideEffectFree)
+ STATS_DECLTRACK_CSRET_ATTR(IsDead)
+ else
+ STATS_DECLTRACK_CSRET_ATTR(UnusedResult)
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return isAssumedDead()
+ ? "assumed-dead"
+ : (getAssumed() ? "assumed-dead-users" : "assumed-live");
+ }
+
+private:
+ bool IsAssumedSideEffectFree;
+};
+
+struct AAIsDeadReturned : public AAIsDeadValueImpl {
+ AAIsDeadReturned(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadValueImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+
+ A.checkForAllInstructions([](Instruction &) { return true; }, *this,
+ {Instruction::Ret});
+
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ if (ACS.isCallbackCall() || !ACS.getInstruction())
+ return false;
+ return areAllUsesAssumedDead(A, *ACS.getInstruction());
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(PredForCallSite, *this, true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // TODO: Rewrite the signature to return void?
+ bool AnyChange = false;
+ UndefValue &UV = *UndefValue::get(getAssociatedFunction()->getReturnType());
+ auto RetInstPred = [&](Instruction &I) {
+ ReturnInst &RI = cast<ReturnInst>(I);
+ if (!isa<UndefValue>(RI.getReturnValue()))
+ AnyChange |= A.changeUseAfterManifest(RI.getOperandUse(0), UV);
+ return true;
+ };
+ A.checkForAllInstructions(RetInstPred, *this, {Instruction::Ret});
+ return AnyChange ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(IsDead) }
+};
+
+struct AAIsDeadFunction : public AAIsDead {
+ AAIsDeadFunction(const IRPosition &IRP, Attributor &A) : AAIsDead(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ const Function *F = getAnchorScope();
+ if (F && !F->isDeclaration()) {
+ ToBeExploredFrom.insert(&F->getEntryBlock().front());
+ assumeLive(A, F->getEntryBlock());
+ }
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return "Live[#BB " + std::to_string(AssumedLiveBlocks.size()) + "/" +
+ std::to_string(getAnchorScope()->size()) + "][#TBEP " +
+ std::to_string(ToBeExploredFrom.size()) + "][#KDE " +
+ std::to_string(KnownDeadEnds.size()) + "]";
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ assert(getState().isValidState() &&
+ "Attempted to manifest an invalid state!");
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ Function &F = *getAnchorScope();
+
+ if (AssumedLiveBlocks.empty()) {
+ A.deleteAfterManifest(F);
+ return ChangeStatus::CHANGED;
+ }
+
+ // Flag to determine if we can change an invoke to a call assuming the
+ // callee is nounwind. This is not possible if the personality of the
+    // function allows catching asynchronous exceptions.
+ bool Invoke2CallAllowed = !mayCatchAsynchronousExceptions(F);
+
+ KnownDeadEnds.set_union(ToBeExploredFrom);
+ for (const Instruction *DeadEndI : KnownDeadEnds) {
+ auto *CB = dyn_cast<CallBase>(DeadEndI);
+ if (!CB)
+ continue;
+ const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
+ *this, IRPosition::callsite_function(*CB), /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ bool MayReturn = !NoReturnAA.isAssumedNoReturn();
+ if (MayReturn && (!Invoke2CallAllowed || !isa<InvokeInst>(CB)))
+ continue;
+
+ if (auto *II = dyn_cast<InvokeInst>(DeadEndI))
+ A.registerInvokeWithDeadSuccessor(const_cast<InvokeInst &>(*II));
+ else
+ A.changeToUnreachableAfterManifest(
+ const_cast<Instruction *>(DeadEndI->getNextNode()));
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ STATS_DECL(AAIsDead, BasicBlock, "Number of dead basic blocks deleted.");
+ for (BasicBlock &BB : F)
+ if (!AssumedLiveBlocks.count(&BB)) {
+ A.deleteAfterManifest(BB);
+ ++BUILD_STAT_NAME(AAIsDead, BasicBlock);
+ }
+
+ return HasChanged;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// Returns true if the function is assumed dead.
+ bool isAssumedDead() const override { return false; }
+
+ /// See AAIsDead::isKnownDead().
+ bool isKnownDead() const override { return false; }
+
+ /// See AAIsDead::isAssumedDead(BasicBlock *).
+ bool isAssumedDead(const BasicBlock *BB) const override {
+ assert(BB->getParent() == getAnchorScope() &&
+ "BB must be in the same anchor scope function.");
+
+ if (!getAssumed())
+ return false;
+ return !AssumedLiveBlocks.count(BB);
+ }
+
+ /// See AAIsDead::isKnownDead(BasicBlock *).
+ bool isKnownDead(const BasicBlock *BB) const override {
+ return getKnown() && isAssumedDead(BB);
+ }
+
+  /// See AAIsDead::isAssumedDead(Instruction *I).
+ bool isAssumedDead(const Instruction *I) const override {
+ assert(I->getParent()->getParent() == getAnchorScope() &&
+ "Instruction must be in the same anchor scope function.");
+
+ if (!getAssumed())
+ return false;
+
+    // If it is not in AssumedLiveBlocks then it is for sure dead.
+    // Otherwise, it can still be after a noreturn call in a live block.
+ if (!AssumedLiveBlocks.count(I->getParent()))
+ return true;
+
+ // If it is not after a liveness barrier it is live.
+ const Instruction *PrevI = I->getPrevNode();
+ while (PrevI) {
+ if (KnownDeadEnds.count(PrevI) || ToBeExploredFrom.count(PrevI))
+ return true;
+ PrevI = PrevI->getPrevNode();
+ }
+ return false;
+ }
+
+ /// See AAIsDead::isKnownDead(Instruction *I).
+ bool isKnownDead(const Instruction *I) const override {
+ return getKnown() && isAssumedDead(I);
+ }
+
+ /// Assume \p BB is (partially) live now and indicate to the Attributor \p A
+  /// that internal functions called from \p BB should now be looked at.
+ bool assumeLive(Attributor &A, const BasicBlock &BB) {
+ if (!AssumedLiveBlocks.insert(&BB).second)
+ return false;
+
+ // We assume that all of BB is (probably) live now and if there are calls to
+ // internal functions we will assume that those are now live as well. This
+ // is a performance optimization for blocks with calls to a lot of internal
+ // functions. It can however cause dead functions to be treated as live.
+ for (const Instruction &I : BB)
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (const Function *F = CB->getCalledFunction())
+ if (F->hasLocalLinkage())
+ A.markLiveInternalFunction(*F);
+ return true;
+ }
+
+ /// Collection of instructions that need to be explored again, e.g., we
+ /// did assume they do not transfer control to (one of their) successors.
+ SmallSetVector<const Instruction *, 8> ToBeExploredFrom;
+
+ /// Collection of instructions that are known to not transfer control.
+ SmallSetVector<const Instruction *, 8> KnownDeadEnds;
+
+ /// Collection of all assumed live BasicBlocks.
+ DenseSet<const BasicBlock *> AssumedLiveBlocks;
+};
+
+static bool
+identifyAliveSuccessors(Attributor &A, const CallBase &CB,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ const IRPosition &IPos = IRPosition::callsite_function(CB);
+
+ const auto &NoReturnAA = A.getAndUpdateAAFor<AANoReturn>(
+ AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (NoReturnAA.isAssumedNoReturn())
+ return !NoReturnAA.isKnownNoReturn();
+ if (CB.isTerminator())
+ AliveSuccessors.push_back(&CB.getSuccessor(0)->front());
+ else
+ AliveSuccessors.push_back(CB.getNextNode());
+ return false;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const InvokeInst &II,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation =
+ identifyAliveSuccessors(A, cast<CallBase>(II), AA, AliveSuccessors);
+
+ // First, determine if we can change an invoke to a call assuming the
+ // callee is nounwind. This is not possible if the personality of the
+  // function allows catching asynchronous exceptions.
+ if (AAIsDeadFunction::mayCatchAsynchronousExceptions(*II.getFunction())) {
+ AliveSuccessors.push_back(&II.getUnwindDest()->front());
+ } else {
+ const IRPosition &IPos = IRPosition::callsite_function(II);
+ const auto &AANoUnw = A.getAndUpdateAAFor<AANoUnwind>(
+ AA, IPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (AANoUnw.isAssumedNoUnwind()) {
+ UsedAssumedInformation |= !AANoUnw.isKnownNoUnwind();
+ } else {
+ AliveSuccessors.push_back(&II.getUnwindDest()->front());
+ }
+ }
+ return UsedAssumedInformation;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const BranchInst &BI,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation = false;
+ if (BI.getNumSuccessors() == 1) {
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ } else {
+ Optional<ConstantInt *> CI = getAssumedConstantInt(
+ A, *BI.getCondition(), AA, UsedAssumedInformation);
+ if (!CI.hasValue()) {
+ // No value yet, assume both edges are dead.
+ } else if (CI.getValue()) {
+ const BasicBlock *SuccBB =
+ BI.getSuccessor(1 - CI.getValue()->getZExtValue());
+ AliveSuccessors.push_back(&SuccBB->front());
+ } else {
+ AliveSuccessors.push_back(&BI.getSuccessor(0)->front());
+ AliveSuccessors.push_back(&BI.getSuccessor(1)->front());
+ UsedAssumedInformation = false;
+ }
+ }
+ return UsedAssumedInformation;
+}
+
+static bool
+identifyAliveSuccessors(Attributor &A, const SwitchInst &SI,
+ AbstractAttribute &AA,
+ SmallVectorImpl<const Instruction *> &AliveSuccessors) {
+ bool UsedAssumedInformation = false;
+ Optional<ConstantInt *> CI =
+ getAssumedConstantInt(A, *SI.getCondition(), AA, UsedAssumedInformation);
+ if (!CI.hasValue()) {
+ // No value yet, assume all edges are dead.
+ } else if (CI.getValue()) {
+ for (auto &CaseIt : SI.cases()) {
+ if (CaseIt.getCaseValue() == CI.getValue()) {
+ AliveSuccessors.push_back(&CaseIt.getCaseSuccessor()->front());
+ return UsedAssumedInformation;
+ }
+ }
+ AliveSuccessors.push_back(&SI.getDefaultDest()->front());
+ return UsedAssumedInformation;
+ } else {
+ for (const BasicBlock *SuccBB : successors(SI.getParent()))
+ AliveSuccessors.push_back(&SuccBB->front());
+ }
+ return UsedAssumedInformation;
+}
+
+ChangeStatus AAIsDeadFunction::updateImpl(Attributor &A) {
+ ChangeStatus Change = ChangeStatus::UNCHANGED;
+
+ LLVM_DEBUG(dbgs() << "[AAIsDead] Live [" << AssumedLiveBlocks.size() << "/"
+ << getAnchorScope()->size() << "] BBs and "
+ << ToBeExploredFrom.size() << " exploration points and "
+ << KnownDeadEnds.size() << " known dead ends\n");
+
+ // Copy and clear the list of instructions we need to explore from. It is
+ // refilled with instructions the next update has to look at.
+ SmallVector<const Instruction *, 8> Worklist(ToBeExploredFrom.begin(),
+ ToBeExploredFrom.end());
+ decltype(ToBeExploredFrom) NewToBeExploredFrom;
+
+ SmallVector<const Instruction *, 8> AliveSuccessors;
+ while (!Worklist.empty()) {
+ const Instruction *I = Worklist.pop_back_val();
+ LLVM_DEBUG(dbgs() << "[AAIsDead] Exploration inst: " << *I << "\n");
+
+ AliveSuccessors.clear();
+
+ bool UsedAssumedInformation = false;
+ switch (I->getOpcode()) {
+ // TODO: look for (assumed) UB to backwards propagate "deadness".
+ default:
+ if (I->isTerminator()) {
+ for (const BasicBlock *SuccBB : successors(I->getParent()))
+ AliveSuccessors.push_back(&SuccBB->front());
+ } else {
+ AliveSuccessors.push_back(I->getNextNode());
+ }
+ break;
+ case Instruction::Call:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<CallInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Invoke:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<InvokeInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Br:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<BranchInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ case Instruction::Switch:
+ UsedAssumedInformation = identifyAliveSuccessors(A, cast<SwitchInst>(*I),
+ *this, AliveSuccessors);
+ break;
+ }
+
+ if (UsedAssumedInformation) {
+ NewToBeExploredFrom.insert(I);
+ } else {
+ Change = ChangeStatus::CHANGED;
+ if (AliveSuccessors.empty() ||
+ (I->isTerminator() && AliveSuccessors.size() < I->getNumSuccessors()))
+ KnownDeadEnds.insert(I);
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAIsDead] #AliveSuccessors: "
+ << AliveSuccessors.size() << " UsedAssumedInformation: "
+ << UsedAssumedInformation << "\n");
+
+ for (const Instruction *AliveSuccessor : AliveSuccessors) {
+ if (!I->isTerminator()) {
+ assert(AliveSuccessors.size() == 1 &&
+ "Non-terminator expected to have a single successor!");
+ Worklist.push_back(AliveSuccessor);
+ } else {
+ if (assumeLive(A, *AliveSuccessor->getParent()))
+ Worklist.push_back(AliveSuccessor);
+ }
+ }
+ }
+
+ ToBeExploredFrom = std::move(NewToBeExploredFrom);
+
+ // If we know everything is live there is no need to query for liveness.
+ // Instead, indicating a pessimistic fixpoint will cause the state to be
+ // "invalid" and all queries to be answered conservatively without lookups.
+  // To be in this state we have to (1) have finished the exploration, (2) not
+  // have discovered any non-trivial dead end, and (3) not have ruled any
+  // unreachable code dead.
+ if (ToBeExploredFrom.empty() &&
+ getAnchorScope()->size() == AssumedLiveBlocks.size() &&
+ llvm::all_of(KnownDeadEnds, [](const Instruction *DeadEndI) {
+ return DeadEndI->isTerminator() && DeadEndI->getNumSuccessors() == 0;
+ }))
+ return indicatePessimisticFixpoint();
+ return Change;
+}
+
+/// Liveness information for a call site.
+struct AAIsDeadCallSite final : AAIsDeadFunction {
+ AAIsDeadCallSite(const IRPosition &IRP, Attributor &A)
+ : AAIsDeadFunction(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+ // sense to specialize attributes for call sites instead of
+ // redirecting requests to the callee.
+ llvm_unreachable("Abstract attributes for liveness are not "
+ "supported for call sites yet!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ return indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// -------------------- Dereferenceable Argument Attribute --------------------
+
+template <>
+ChangeStatus clampStateAndIndicateChange<DerefState>(DerefState &S,
+ const DerefState &R) {
+ ChangeStatus CS0 =
+ clampStateAndIndicateChange(S.DerefBytesState, R.DerefBytesState);
+ ChangeStatus CS1 = clampStateAndIndicateChange(S.GlobalState, R.GlobalState);
+ return CS0 | CS1;
+}
+
+struct AADereferenceableImpl : AADereferenceable {
+ AADereferenceableImpl(const IRPosition &IRP, Attributor &A)
+ : AADereferenceable(IRP, A) {}
+ using StateType = DerefState;
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SmallVector<Attribute, 4> Attrs;
+ getAttrs({Attribute::Dereferenceable, Attribute::DereferenceableOrNull},
+ Attrs, /* IgnoreSubsumingPositions */ false, &A);
+ for (const Attribute &Attr : Attrs)
+ takeKnownDerefBytesMaximum(Attr.getValueAsInt());
+
+ const IRPosition &IRP = this->getIRPosition();
+ NonNullAA = &A.getAAFor<AANonNull>(*this, IRP,
+ /* TrackDependence */ false);
+
+ bool CanBeNull;
+ takeKnownDerefBytesMaximum(
+ IRP.getAssociatedValue().getPointerDereferenceableBytes(
+ A.getDataLayout(), CanBeNull));
+
+ bool IsFnInterface = IRP.isFnInterfaceKind();
+ Function *FnScope = IRP.getAnchorScope();
+ if (IsFnInterface && (!FnScope || !A.isFunctionIPOAmendable(*FnScope))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See AbstractAttribute::getState()
+ /// {
+ StateType &getState() override { return *this; }
+ const StateType &getState() const override { return *this; }
+ /// }
+
+ /// Helper function for collecting accessed bytes in must-be-executed-context
+ void addAccessedBytesForUse(Attributor &A, const Use *U, const Instruction *I,
+ DerefState &State) {
+ const Value *UseV = U->get();
+ if (!UseV->getType()->isPointerTy())
+ return;
+
+ Type *PtrTy = UseV->getType();
+ const DataLayout &DL = A.getDataLayout();
+ int64_t Offset;
+ if (const Value *Base = getBasePointerOfAccessPointerOperand(
+ I, Offset, DL, /*AllowNonInbounds*/ true)) {
+ if (Base == &getAssociatedValue() &&
+ getPointerOperand(I, /* AllowVolatile */ false) == UseV) {
+ uint64_t Size = DL.getTypeStoreSize(PtrTy->getPointerElementType());
+ State.addAccessedBytes(Offset, Size);
+ }
+ }
+ return;
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AADereferenceable::StateType &State) {
+ bool IsNonNull = false;
+ bool TrackUse = false;
+ int64_t DerefBytes = getKnownNonNullAndDerefBytesForUse(
+ A, *this, getAssociatedValue(), U, I, IsNonNull, TrackUse);
+ LLVM_DEBUG(dbgs() << "[AADereferenceable] Deref bytes: " << DerefBytes
+ << " for instruction " << *I << "\n");
+
+ addAccessedBytesForUse(A, U, I, State);
+ State.takeKnownDerefBytesMaximum(DerefBytes);
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Change = AADereferenceable::manifest(A);
+ if (isAssumedNonNull() && hasAttr(Attribute::DereferenceableOrNull)) {
+ removeAttrs({Attribute::DereferenceableOrNull});
+ return ChangeStatus::CHANGED;
+ }
+ return Change;
+ }
+
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ // TODO: Add *_globally support
+ if (isAssumedNonNull())
+ Attrs.emplace_back(Attribute::getWithDereferenceableBytes(
+ Ctx, getAssumedDereferenceableBytes()));
+ else
+ Attrs.emplace_back(Attribute::getWithDereferenceableOrNullBytes(
+ Ctx, getAssumedDereferenceableBytes()));
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ if (!getAssumedDereferenceableBytes())
+ return "unknown-dereferenceable";
+ return std::string("dereferenceable") +
+ (isAssumedNonNull() ? "" : "_or_null") +
+ (isAssumedGlobal() ? "_globally" : "") + "<" +
+ std::to_string(getKnownDereferenceableBytes()) + "-" +
+ std::to_string(getAssumedDereferenceableBytes()) + ">";
+ }
+};
+
+/// Dereferenceable attribute for a floating value.
+struct AADereferenceableFloating : AADereferenceableImpl {
+ AADereferenceableFloating(const IRPosition &IRP, Attributor &A)
+ : AADereferenceableImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ auto VisitValueCB = [&](const Value &V, const Instruction *, DerefState &T,
+ bool Stripped) -> bool {
+ unsigned IdxWidth =
+ DL.getIndexSizeInBits(V.getType()->getPointerAddressSpace());
+ APInt Offset(IdxWidth, 0);
+ const Value *Base =
+ stripAndAccumulateMinimalOffsets(A, *this, &V, DL, Offset, false);
+
+ const auto &AA =
+ A.getAAFor<AADereferenceable>(*this, IRPosition::value(*Base));
+ int64_t DerefBytes = 0;
+ if (!Stripped && this == &AA) {
+ // Use IR information if we did not strip anything.
+ // TODO: track globally.
+ bool CanBeNull;
+ DerefBytes = Base->getPointerDereferenceableBytes(DL, CanBeNull);
+ T.GlobalState.indicatePessimisticFixpoint();
+ } else {
+ const DerefState &DS = static_cast<const DerefState &>(AA.getState());
+ DerefBytes = DS.DerefBytesState.getAssumed();
+ T.GlobalState &= DS.GlobalState;
+ }
+
+      // For now we do not try to "increase" dereferenceability due to
+      // negative indices, as we first have to come up with code to deal with
+      // loops and with overflows of the dereferenceable bytes.
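+      // For illustration: if the base is dereferenceable(16) and the
+      // accumulated offset is 4, the derived pointer is dereferenceable(12);
+      // a negative offset is clamped to 0, so we never claim more bytes than
+      // the base provides.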
+ int64_t OffsetSExt = Offset.getSExtValue();
+ if (OffsetSExt < 0)
+ OffsetSExt = 0;
+
+ T.takeAssumedDerefBytesMinimum(
+ std::max(int64_t(0), DerefBytes - OffsetSExt));
+
+ if (this == &AA) {
+ if (!Stripped) {
+ // If nothing was stripped IR information is all we got.
+ T.takeKnownDerefBytesMaximum(
+ std::max(int64_t(0), DerefBytes - OffsetSExt));
+ T.indicatePessimisticFixpoint();
+ } else if (OffsetSExt > 0) {
+        // If something was stripped but there is circular reasoning, we look
+        // at the offset. If it is positive, we would basically decrease the
+        // dereferenceable bytes in a circular loop, which would only drive
+        // them down to the known value very slowly; we accelerate this by
+        // taking the pessimistic fixpoint here.
+ T.indicatePessimisticFixpoint();
+ }
+ }
+
+ return T.isValidState();
+ };
+
+ DerefState T;
+ if (!genericValueTraversal<AADereferenceable, DerefState>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for a return value.
+struct AADereferenceableReturned final
+ : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl> {
+ AADereferenceableReturned(const IRPosition &IRP, Attributor &A)
+ : AAReturnedFromReturnedValues<AADereferenceable, AADereferenceableImpl>(
+ IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for an argument
+struct AADereferenceableArgument final
+ : AAArgumentFromCallSiteArguments<AADereferenceable,
+ AADereferenceableImpl> {
+ using Base =
+ AAArgumentFromCallSiteArguments<AADereferenceable, AADereferenceableImpl>;
+ AADereferenceableArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute for a call site argument.
+struct AADereferenceableCallSiteArgument final : AADereferenceableFloating {
+ AADereferenceableCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AADereferenceableFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(dereferenceable)
+ }
+};
+
+/// Dereferenceable attribute deduction for a call site return value.
+struct AADereferenceableCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl> {
+ using Base =
+ AACallSiteReturnedFromReturned<AADereferenceable, AADereferenceableImpl>;
+ AADereferenceableCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(dereferenceable);
+ }
+};
+
+// ------------------------ Align Argument Attribute ------------------------
+
+static unsigned getKnownAlignForUse(Attributor &A,
+ AbstractAttribute &QueryingAA,
+ Value &AssociatedValue, const Use *U,
+ const Instruction *I, bool &TrackUse) {
+ // We need to follow common pointer manipulation uses to the accesses they
+ // feed into.
+ if (isa<CastInst>(I)) {
+ // Follow all but ptr2int casts.
+ TrackUse = !isa<PtrToIntInst>(I);
+ return 0;
+ }
+ if (auto *GEP = dyn_cast<GetElementPtrInst>(I)) {
+ if (GEP->hasAllConstantIndices()) {
+ TrackUse = true;
+ return 0;
+ }
+ }
+
+ MaybeAlign MA;
+ if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (CB->isBundleOperand(U) || CB->isCallee(U))
+ return 0;
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ IRPosition IRP = IRPosition::callsite_argument(*CB, ArgNo);
+ // As long as we only use known information there is no need to track
+ // dependences here.
+ auto &AlignAA = A.getAAFor<AAAlign>(QueryingAA, IRP,
+ /* TrackDependence */ false);
+ MA = MaybeAlign(AlignAA.getKnownAlign());
+ }
+
+ const DataLayout &DL = A.getDataLayout();
+ const Value *UseV = U->get();
+ if (auto *SI = dyn_cast<StoreInst>(I)) {
+ if (SI->getPointerOperand() == UseV)
+ MA = SI->getAlign();
+ } else if (auto *LI = dyn_cast<LoadInst>(I)) {
+ if (LI->getPointerOperand() == UseV)
+ MA = LI->getAlign();
+ }
+
+ if (!MA || *MA <= 1)
+ return 0;
+
+ unsigned Alignment = MA->value();
+ int64_t Offset;
+
+ if (const Value *Base = GetPointerBaseWithConstantOffset(UseV, Offset, DL)) {
+ if (Base == &AssociatedValue) {
+ // BasePointerAddr + Offset = Alignment * Q for some integer Q.
+ // So we can say that the maximum power of two which is a divisor of
+ // gcd(Offset, Alignment) is an alignment.
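+      // For illustration: if the use guarantees Alignment = 16 and it is at
+      // Offset = 20 from the base, then gcd(20, 16) = 4 and the largest power
+      // of two dividing 4 is 4, so the base is known to be at least 4-byte
+      // aligned.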
+
+ uint32_t gcd =
+ greatestCommonDivisor(uint32_t(abs((int32_t)Offset)), Alignment);
+ Alignment = llvm::PowerOf2Floor(gcd);
+ }
+ }
+
+ return Alignment;
+}
+
+struct AAAlignImpl : AAAlign {
+ AAAlignImpl(const IRPosition &IRP, Attributor &A) : AAAlign(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SmallVector<Attribute, 4> Attrs;
+ getAttrs({Attribute::Alignment}, Attrs);
+ for (const Attribute &Attr : Attrs)
+ takeKnownMaximum(Attr.getValueAsInt());
+
+ Value &V = getAssociatedValue();
+    // TODO: This is a HACK to keep getPointerAlignment from introducing a
+    //       ptr2int use of the function pointer. This was caused by D73131. We
+    //       want to avoid this for function pointers especially because we
+    //       iterate their uses and int2ptr is not handled. It is not a
+    //       correctness problem though!
+ if (!V.getType()->getPointerElementType()->isFunctionTy())
+ takeKnownMaximum(V.getPointerAlignment(A.getDataLayout()).value());
+
+ if (getIRPosition().isFnInterfaceKind() &&
+ (!getAnchorScope() ||
+ !A.isFunctionIPOAmendable(*getAssociatedFunction()))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ if (Instruction *CtxI = getCtxI())
+ followUsesInMBEC(*this, A, getState(), *CtxI);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus LoadStoreChanged = ChangeStatus::UNCHANGED;
+
+ // Check for users that allow alignment annotations.
+ Value &AssociatedValue = getAssociatedValue();
+ for (const Use &U : AssociatedValue.uses()) {
+ if (auto *SI = dyn_cast<StoreInst>(U.getUser())) {
+ if (SI->getPointerOperand() == &AssociatedValue)
+ if (SI->getAlignment() < getAssumedAlign()) {
+ STATS_DECLTRACK(AAAlign, Store,
+ "Number of times alignment added to a store");
+ SI->setAlignment(Align(getAssumedAlign()));
+ LoadStoreChanged = ChangeStatus::CHANGED;
+ }
+ } else if (auto *LI = dyn_cast<LoadInst>(U.getUser())) {
+ if (LI->getPointerOperand() == &AssociatedValue)
+ if (LI->getAlignment() < getAssumedAlign()) {
+ LI->setAlignment(Align(getAssumedAlign()));
+ STATS_DECLTRACK(AAAlign, Load,
+ "Number of times alignment added to a load");
+ LoadStoreChanged = ChangeStatus::CHANGED;
+ }
+ }
+ }
+
+ ChangeStatus Changed = AAAlign::manifest(A);
+
+ Align InheritAlign =
+ getAssociatedValue().getPointerAlignment(A.getDataLayout());
+ if (InheritAlign >= getAssumedAlign())
+ return LoadStoreChanged;
+ return Changed | LoadStoreChanged;
+ }
+
+  // TODO: Provide a helper to determine the implied ABI alignment and check
+  //       that value, both in the existing manifest method and in a new one
+  //       for AAAlignImpl, to avoid making the alignment explicit if it did
+  //       not improve.
+
+ /// See AbstractAttribute::getDeducedAttributes
+ virtual void
+ getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ if (getAssumedAlign() > 1)
+ Attrs.emplace_back(
+ Attribute::getWithAlignment(Ctx, Align(getAssumedAlign())));
+ }
+
+ /// See followUsesInMBEC
+ bool followUseInMBEC(Attributor &A, const Use *U, const Instruction *I,
+ AAAlign::StateType &State) {
+ bool TrackUse = false;
+
+ unsigned int KnownAlign =
+ getKnownAlignForUse(A, *this, getAssociatedValue(), U, I, TrackUse);
+ State.takeKnownMaximum(KnownAlign);
+
+ return TrackUse;
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumedAlign() ? ("align<" + std::to_string(getKnownAlign()) +
+ "-" + std::to_string(getAssumedAlign()) + ">")
+ : "unknown-align";
+ }
+};
+
+/// Align attribute for a floating value.
+struct AAAlignFloating : AAAlignImpl {
+ AAAlignFloating(const IRPosition &IRP, Attributor &A) : AAAlignImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ const DataLayout &DL = A.getDataLayout();
+
+ auto VisitValueCB = [&](Value &V, const Instruction *,
+ AAAlign::StateType &T, bool Stripped) -> bool {
+ const auto &AA = A.getAAFor<AAAlign>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+ // Use only IR information if we did not strip anything.
+ Align PA = V.getPointerAlignment(DL);
+ T.takeKnownMaximum(PA.value());
+ T.indicatePessimisticFixpoint();
+ } else {
+ // Use abstract attribute information.
+ const AAAlign::StateType &DS =
+ static_cast<const AAAlign::StateType &>(AA.getState());
+ T ^= DS;
+ }
+ return T.isValidState();
+ };
+
+ StateType T;
+ if (!genericValueTraversal<AAAlign, StateType>(A, getIRPosition(), *this, T,
+ VisitValueCB, getCtxI()))
+ return indicatePessimisticFixpoint();
+
+    // TODO: If we know we visited all incoming values, and thus none are
+    // assumed dead, we can take the known information from the state T.
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FLOATING_ATTR(align) }
+};
+
+/// Align attribute for function return value.
+struct AAAlignReturned final
+ : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl> {
+ AAAlignReturned(const IRPosition &IRP, Attributor &A)
+ : AAReturnedFromReturnedValues<AAAlign, AAAlignImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FNRET_ATTR(aligned) }
+};
+
+/// Align attribute for function argument.
+struct AAAlignArgument final
+ : AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl> {
+ using Base = AAArgumentFromCallSiteArguments<AAAlign, AAAlignImpl>;
+ AAAlignArgument(const IRPosition &IRP, Attributor &A) : Base(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+    // If the associated argument is involved in a must-tail call we give up
+    // because we would need to keep the argument alignments of caller and
+    // callee in sync. That just does not seem worth the trouble right now.
+ if (A.getInfoCache().isInvolvedInMustTailCall(*getAssociatedArgument()))
+ return ChangeStatus::UNCHANGED;
+ return Base::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(aligned) }
+};
+
+struct AAAlignCallSiteArgument final : AAAlignFloating {
+ AAAlignCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAAlignFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+    // If the associated argument is involved in a must-tail call we give up
+    // because we would need to keep the argument alignments of caller and
+    // callee in sync. That just does not seem worth the trouble right now.
+ if (Argument *Arg = getAssociatedArgument())
+ if (A.getInfoCache().isInvolvedInMustTailCall(*Arg))
+ return ChangeStatus::UNCHANGED;
+ ChangeStatus Changed = AAAlignImpl::manifest(A);
+ Align InheritAlign =
+ getAssociatedValue().getPointerAlignment(A.getDataLayout());
+ if (InheritAlign >= getAssumedAlign())
+ Changed = ChangeStatus::UNCHANGED;
+ return Changed;
+ }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus Changed = AAAlignFloating::updateImpl(A);
+ if (Argument *Arg = getAssociatedArgument()) {
+ // We only take known information from the argument
+ // so we do not need to track a dependence.
+ const auto &ArgAlignAA = A.getAAFor<AAAlign>(
+ *this, IRPosition::argument(*Arg), /* TrackDependence */ false);
+ takeKnownMaximum(ArgAlignAA.getKnownAlign());
+ }
+ return Changed;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(aligned) }
+};
+
+/// Align attribute deduction for a call site return value.
+struct AAAlignCallSiteReturned final
+ : AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl> {
+ using Base = AACallSiteReturnedFromReturned<AAAlign, AAAlignImpl>;
+ AAAlignCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ Base::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(align); }
+};
+
+/// ------------------ Function No-Return Attribute ----------------------------
+struct AANoReturnImpl : public AANoReturn {
+ AANoReturnImpl(const IRPosition &IRP, Attributor &A) : AANoReturn(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AANoReturn::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F)
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? "noreturn" : "may-return";
+ }
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+ auto CheckForNoReturn = [](Instruction &) { return false; };
+ if (!A.checkForAllInstructions(CheckForNoReturn, *this,
+ {(unsigned)Instruction::Ret}))
+ return indicatePessimisticFixpoint();
+ return ChangeStatus::UNCHANGED;
+ }
+};
+
+struct AANoReturnFunction final : AANoReturnImpl {
+ AANoReturnFunction(const IRPosition &IRP, Attributor &A)
+ : AANoReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(noreturn) }
+};
+
+/// NoReturn attribute deduction for call sites.
+struct AANoReturnCallSite final : AANoReturnImpl {
+ AANoReturnCallSite(const IRPosition &IRP, Attributor &A)
+ : AANoReturnImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AANoReturn>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AANoReturn::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(noreturn); }
+};
+
+/// ----------------------- Variable Capturing ---------------------------------
+
+/// A class to hold the state for no-capture attributes.
+struct AANoCaptureImpl : public AANoCapture {
+ AANoCaptureImpl(const IRPosition &IRP, Attributor &A) : AANoCapture(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (hasAttr(getAttrKind(), /* IgnoreSubsumingPositions */ true)) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+ Function *AnchorScope = getAnchorScope();
+ if (isFnInterfaceKind() &&
+ (!AnchorScope || !A.isFunctionIPOAmendable(*AnchorScope))) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // You cannot "capture" null in the default address space.
+ if (isa<ConstantPointerNull>(getAssociatedValue()) &&
+ getAssociatedValue().getType()->getPointerAddressSpace() == 0) {
+ indicateOptimisticFixpoint();
+ return;
+ }
+
+ const Function *F = getArgNo() >= 0 ? getAssociatedFunction() : AnchorScope;
+
+ // Check what state the associated function can actually capture.
+ if (F)
+ determineFunctionCaptureCapabilities(getIRPosition(), *F, *this);
+ else
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+  /// See AbstractAttribute::getDeducedAttributes(...).
+ virtual void
+ getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ if (!isAssumedNoCaptureMaybeReturned())
+ return;
+
+ if (getArgNo() >= 0) {
+ if (isAssumedNoCapture())
+ Attrs.emplace_back(Attribute::get(Ctx, Attribute::NoCapture));
+ else if (ManifestInternal)
+ Attrs.emplace_back(Attribute::get(Ctx, "no-capture-maybe-returned"));
+ }
+ }
+
+  /// Set the NOT_CAPTURED_IN_MEM and NOT_CAPTURED_IN_RET bits in \p State
+  /// depending on the ability of the function associated with \p IRP to
+  /// capture state in memory and through "returning/throwing", respectively.
+ static void determineFunctionCaptureCapabilities(const IRPosition &IRP,
+ const Function &F,
+ BitIntegerState &State) {
+ // TODO: Once we have memory behavior attributes we should use them here.
+
+ // If we know we cannot communicate or write to memory, we do not care about
+ // ptr2int anymore.
+ if (F.onlyReadsMemory() && F.doesNotThrow() &&
+ F.getReturnType()->isVoidTy()) {
+ State.addKnownBits(NO_CAPTURE);
+ return;
+ }
+
+    // A function cannot capture state in memory if it only reads memory; it
+    // can however return/throw state, and the state might be influenced by the
+    // pointer value, e.g., loading from a returned pointer might reveal a bit.
+ if (F.onlyReadsMemory())
+ State.addKnownBits(NOT_CAPTURED_IN_MEM);
+
+    // A function cannot communicate state back if it does not throw
+    // exceptions and does not return values.
+ if (F.doesNotThrow() && F.getReturnType()->isVoidTy())
+ State.addKnownBits(NOT_CAPTURED_IN_RET);
+
+ // Check existing "returned" attributes.
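+    // If another argument carries the `returned` attribute, the returned value
+    // is that argument rather than ours, so returning does not capture our
+    // pointer. If our argument is the one marked `returned`, it clearly
+    // escapes through the return and the corresponding assumption is dropped.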
+ int ArgNo = IRP.getArgNo();
+ if (F.doesNotThrow() && ArgNo >= 0) {
+ for (unsigned u = 0, e = F.arg_size(); u < e; ++u)
+ if (F.hasParamAttribute(u, Attribute::Returned)) {
+ if (u == unsigned(ArgNo))
+ State.removeAssumedBits(NOT_CAPTURED_IN_RET);
+ else if (F.onlyReadsMemory())
+ State.addKnownBits(NO_CAPTURE);
+ else
+ State.addKnownBits(NOT_CAPTURED_IN_RET);
+ break;
+ }
+ }
+ }
+
+  /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ if (isKnownNoCapture())
+ return "known not-captured";
+ if (isAssumedNoCapture())
+ return "assumed not-captured";
+ if (isKnownNoCaptureMaybeReturned())
+ return "known not-captured-maybe-returned";
+ if (isAssumedNoCaptureMaybeReturned())
+ return "assumed not-captured-maybe-returned";
+ return "assumed-captured";
+ }
+};
+
+/// Attributor-aware capture tracker.
+struct AACaptureUseTracker final : public CaptureTracker {
+
+  /// Create a capture tracker that can look up in-flight abstract attributes
+ /// through the Attributor \p A.
+ ///
+ /// If a use leads to a potential capture, \p CapturedInMemory is set and the
+ /// search is stopped. If a use leads to a return instruction,
+ /// \p CommunicatedBack is set to true and \p CapturedInMemory is not changed.
+ /// If a use leads to a ptr2int which may capture the value,
+ /// \p CapturedInInteger is set. If a use is found that is currently assumed
+ /// "no-capture-maybe-returned", the user is added to the \p PotentialCopies
+ /// set. All values in \p PotentialCopies are later tracked as well. For every
+ /// explored use we decrement \p RemainingUsesToExplore. Once it reaches 0,
+ /// the search is stopped with \p CapturedInMemory and \p CapturedInInteger
+ /// conservatively set to true.
+ AACaptureUseTracker(Attributor &A, AANoCapture &NoCaptureAA,
+ const AAIsDead &IsDeadAA, AANoCapture::StateType &State,
+ SmallVectorImpl<const Value *> &PotentialCopies,
+ unsigned &RemainingUsesToExplore)
+ : A(A), NoCaptureAA(NoCaptureAA), IsDeadAA(IsDeadAA), State(State),
+ PotentialCopies(PotentialCopies),
+ RemainingUsesToExplore(RemainingUsesToExplore) {}
+
+  /// Determine if \p V may be captured. *Also updates the state!*
+ bool valueMayBeCaptured(const Value *V) {
+ if (V->getType()->isPointerTy()) {
+ PointerMayBeCaptured(V, this);
+ } else {
+ State.indicatePessimisticFixpoint();
+ }
+ return State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
+ }
+
+ /// See CaptureTracker::tooManyUses().
+ void tooManyUses() override {
+ State.removeAssumedBits(AANoCapture::NO_CAPTURE);
+ }
+
+ bool isDereferenceableOrNull(Value *O, const DataLayout &DL) override {
+ if (CaptureTracker::isDereferenceableOrNull(O, DL))
+ return true;
+ const auto &DerefAA = A.getAAFor<AADereferenceable>(
+ NoCaptureAA, IRPosition::value(*O), /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ return DerefAA.getAssumedDereferenceableBytes();
+ }
+
+ /// See CaptureTracker::captured(...).
+ bool captured(const Use *U) override {
+ Instruction *UInst = cast<Instruction>(U->getUser());
+ LLVM_DEBUG(dbgs() << "Check use: " << *U->get() << " in " << *UInst
+ << "\n");
+
+ // Because we may reuse the tracker multiple times we keep track of the
+ // number of explored uses ourselves as well.
+ if (RemainingUsesToExplore-- == 0) {
+ LLVM_DEBUG(dbgs() << " - too many uses to explore!\n");
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+ }
+
+ // Deal with ptr2int by following uses.
+ if (isa<PtrToIntInst>(UInst)) {
+ LLVM_DEBUG(dbgs() << " - ptr2int assume the worst!\n");
+ return valueMayBeCaptured(UInst);
+ }
+
+ // Explicitly catch return instructions.
+ if (isa<ReturnInst>(UInst))
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ true);
+
+ // For now we only use special logic for call sites. However, the tracker
+ // itself knows about a lot of other non-capturing cases already.
+ auto *CB = dyn_cast<CallBase>(UInst);
+ if (!CB || !CB->isArgOperand(U))
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ const IRPosition &CSArgPos = IRPosition::callsite_argument(*CB, ArgNo);
+    // If we have an abstract no-capture attribute for the argument we can use
+    // it to justify a non-capture attribute here. This allows recursion!
+ auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(NoCaptureAA, CSArgPos);
+ if (ArgNoCaptureAA.isAssumedNoCapture())
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ false);
+ if (ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ addPotentialCopy(*CB);
+ return isCapturedIn(/* Memory */ false, /* Integer */ false,
+ /* Return */ false);
+ }
+
+    // Lastly, we could not find a reason why no-capture can be assumed, so we
+    // don't.
+ return isCapturedIn(/* Memory */ true, /* Integer */ true,
+ /* Return */ true);
+ }
+
+  /// Register \p CB as a potential copy of the value we are checking.
+ void addPotentialCopy(CallBase &CB) { PotentialCopies.push_back(&CB); }
+
+ /// See CaptureTracker::shouldExplore(...).
+ bool shouldExplore(const Use *U) override {
+ // Check liveness and ignore droppable users.
+ return !U->getUser()->isDroppable() &&
+ !A.isAssumedDead(*U, &NoCaptureAA, &IsDeadAA);
+ }
+
+ /// Update the state according to \p CapturedInMem, \p CapturedInInt, and
+ /// \p CapturedInRet, then return the appropriate value for use in the
+ /// CaptureTracker::captured() interface.
+ bool isCapturedIn(bool CapturedInMem, bool CapturedInInt,
+ bool CapturedInRet) {
+ LLVM_DEBUG(dbgs() << " - captures [Mem " << CapturedInMem << "|Int "
+ << CapturedInInt << "|Ret " << CapturedInRet << "]\n");
+ if (CapturedInMem)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_MEM);
+ if (CapturedInInt)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_INT);
+ if (CapturedInRet)
+ State.removeAssumedBits(AANoCapture::NOT_CAPTURED_IN_RET);
+ return !State.isAssumed(AANoCapture::NO_CAPTURE_MAYBE_RETURNED);
+ }
+
+private:
+ /// The attributor providing in-flight abstract attributes.
+ Attributor &A;
+
+ /// The abstract attribute currently updated.
+ AANoCapture &NoCaptureAA;
+
+ /// The abstract liveness state.
+ const AAIsDead &IsDeadAA;
+
+ /// The state currently updated.
+ AANoCapture::StateType &State;
+
+ /// Set of potential copies of the tracked value.
+ SmallVectorImpl<const Value *> &PotentialCopies;
+
+ /// Global counter to limit the number of explored uses.
+ unsigned &RemainingUsesToExplore;
+};
+
+ChangeStatus AANoCaptureImpl::updateImpl(Attributor &A) {
+ const IRPosition &IRP = getIRPosition();
+ const Value *V =
+ getArgNo() >= 0 ? IRP.getAssociatedArgument() : &IRP.getAssociatedValue();
+ if (!V)
+ return indicatePessimisticFixpoint();
+
+ const Function *F =
+ getArgNo() >= 0 ? IRP.getAssociatedFunction() : IRP.getAnchorScope();
+ assert(F && "Expected a function!");
+ const IRPosition &FnPos = IRPosition::function(*F);
+ const auto &IsDeadAA =
+ A.getAAFor<AAIsDead>(*this, FnPos, /* TrackDependence */ false);
+
+ AANoCapture::StateType T;
+
+ // Readonly means we cannot capture through memory.
+ const auto &FnMemAA =
+ A.getAAFor<AAMemoryBehavior>(*this, FnPos, /* TrackDependence */ false);
+ if (FnMemAA.isAssumedReadOnly()) {
+ T.addKnownBits(NOT_CAPTURED_IN_MEM);
+ if (FnMemAA.isKnownReadOnly())
+ addKnownBits(NOT_CAPTURED_IN_MEM);
+ else
+ A.recordDependence(FnMemAA, *this, DepClassTy::OPTIONAL);
+ }
+
+  // Make sure all returned values are different from the underlying value.
+ // TODO: we could do this in a more sophisticated way inside
+ // AAReturnedValues, e.g., track all values that escape through returns
+ // directly somehow.
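+  // The callback below returns true only if every returned value is an
+  // argument other than the one we reason about, with at most one constant
+  // allowed; in that case the tracked pointer cannot escape through the
+  // return.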
+ auto CheckReturnedArgs = [&](const AAReturnedValues &RVAA) {
+ bool SeenConstant = false;
+ for (auto &It : RVAA.returned_values()) {
+ if (isa<Constant>(It.first)) {
+ if (SeenConstant)
+ return false;
+ SeenConstant = true;
+ } else if (!isa<Argument>(It.first) ||
+ It.first == getAssociatedArgument())
+ return false;
+ }
+ return true;
+ };
+
+ const auto &NoUnwindAA = A.getAAFor<AANoUnwind>(
+ *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (NoUnwindAA.isAssumedNoUnwind()) {
+ bool IsVoidTy = F->getReturnType()->isVoidTy();
+ const AAReturnedValues *RVAA =
+ IsVoidTy ? nullptr
+ : &A.getAAFor<AAReturnedValues>(*this, FnPos,
+ /* TrackDependence */ true,
+ DepClassTy::OPTIONAL);
+ if (IsVoidTy || CheckReturnedArgs(*RVAA)) {
+ T.addKnownBits(NOT_CAPTURED_IN_RET);
+ if (T.isKnown(NOT_CAPTURED_IN_MEM))
+ return ChangeStatus::UNCHANGED;
+ if (NoUnwindAA.isKnownNoUnwind() &&
+ (IsVoidTy || RVAA->getState().isAtFixpoint())) {
+ addKnownBits(NOT_CAPTURED_IN_RET);
+ if (isKnown(NOT_CAPTURED_IN_MEM))
+ return indicateOptimisticFixpoint();
+ }
+ }
+ }
+
+ // Use the CaptureTracker interface and logic with the specialized tracker,
+ // defined in AACaptureUseTracker, that can look at in-flight abstract
+ // attributes and directly updates the assumed state.
+ SmallVector<const Value *, 4> PotentialCopies;
+ unsigned RemainingUsesToExplore =
+ getDefaultMaxUsesToExploreForCaptureTracking();
+ AACaptureUseTracker Tracker(A, *this, IsDeadAA, T, PotentialCopies,
+ RemainingUsesToExplore);
+
+ // Check all potential copies of the associated value until we can assume
+ // none will be captured or we have to assume at least one might be.
+ unsigned Idx = 0;
+ PotentialCopies.push_back(V);
+ while (T.isAssumed(NO_CAPTURE_MAYBE_RETURNED) && Idx < PotentialCopies.size())
+ Tracker.valueMayBeCaptured(PotentialCopies[Idx++]);
+
+ AANoCapture::StateType &S = getState();
+ auto Assumed = S.getAssumed();
+ S.intersectAssumedBits(T.getAssumed());
+ if (!isAssumedNoCaptureMaybeReturned())
+ return indicatePessimisticFixpoint();
+ return Assumed == S.getAssumed() ? ChangeStatus::UNCHANGED
+ : ChangeStatus::CHANGED;
+}
+
+/// NoCapture attribute for function arguments.
+struct AANoCaptureArgument final : AANoCaptureImpl {
+ AANoCaptureArgument(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_ARG_ATTR(nocapture) }
+};
+
+/// NoCapture attribute for call site arguments.
+struct AANoCaptureCallSiteArgument final : AANoCaptureImpl {
+ AANoCaptureCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (Argument *Arg = getAssociatedArgument())
+ if (Arg->hasByValAttr())
+ indicateOptimisticFixpoint();
+ AANoCaptureImpl::initialize(A);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ // TODO: Once we have call site specific value information we can provide
+ // call site specific liveness information and then it makes
+    //       sense to specialize attributes for call site arguments instead of
+ // redirecting requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg)
+ return indicatePessimisticFixpoint();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AANoCapture>(*this, ArgPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AANoCapture::StateType &>(ArgAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+  void trackStatistics() const override { STATS_DECLTRACK_CSARG_ATTR(nocapture) }
+};
+
+/// NoCapture attribute for floating values.
+struct AANoCaptureFloating final : AANoCaptureImpl {
+ AANoCaptureFloating(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(nocapture)
+ }
+};
+
+/// NoCapture attribute for function return value.
+struct AANoCaptureReturned final : AANoCaptureImpl {
+ AANoCaptureReturned(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("NoCapture is not applicable to function returns!");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// NoCapture attribute deduction for a call site return value.
+struct AANoCaptureCallSiteReturned final : AANoCaptureImpl {
+ AANoCaptureCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AANoCaptureImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(nocapture)
+ }
+};
+
+/// ------------------ Value Simplify Attribute ----------------------------
+struct AAValueSimplifyImpl : AAValueSimplify {
+ AAValueSimplifyImpl(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplify(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (getAssociatedValue().getType()->isVoidTy())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::getAsStr().
+ const std::string getAsStr() const override {
+ return getAssumed() ? (getKnown() ? "simplified" : "maybe-simple")
+ : "not-simple";
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+
+ /// See AAValueSimplify::getAssumedSimplifiedValue()
+ Optional<Value *> getAssumedSimplifiedValue(Attributor &A) const override {
+ if (!getAssumed())
+ return const_cast<Value *>(&getAssociatedValue());
+ return SimplifiedAssociatedValue;
+ }
+
+  /// Helper function for querying AAValueSimplify and updating the candidate.
+  /// \param QueryingValue Value trying to unify with SimplifiedValue.
+  /// \param AccumulatedSimplifiedValue Current simplification result.
+ static bool checkAndUpdate(Attributor &A, const AbstractAttribute &QueryingAA,
+ Value &QueryingValue,
+ Optional<Value *> &AccumulatedSimplifiedValue) {
+ // FIXME: Add a typecast support.
+
+ auto &ValueSimplifyAA = A.getAAFor<AAValueSimplify>(
+ QueryingAA, IRPosition::value(QueryingValue));
+
+ Optional<Value *> QueryingValueSimplified =
+ ValueSimplifyAA.getAssumedSimplifiedValue(A);
+
+ if (!QueryingValueSimplified.hasValue())
+ return true;
+
+ if (!QueryingValueSimplified.getValue())
+ return false;
+
+ Value &QueryingValueSimplifiedUnwrapped =
+ *QueryingValueSimplified.getValue();
+
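+    // Undef is compatible with any concrete value: a conflict is only reported
+    // if both the accumulated and the newly queried simplification are
+    // non-undef and disagree.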
+ if (AccumulatedSimplifiedValue.hasValue() &&
+ !isa<UndefValue>(AccumulatedSimplifiedValue.getValue()) &&
+ !isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
+ return AccumulatedSimplifiedValue == QueryingValueSimplified;
+ if (AccumulatedSimplifiedValue.hasValue() &&
+ isa<UndefValue>(QueryingValueSimplifiedUnwrapped))
+ return true;
+
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << QueryingValue
+ << " is assumed to be "
+ << QueryingValueSimplifiedUnwrapped << "\n");
+
+ AccumulatedSimplifiedValue = QueryingValueSimplified;
+ return true;
+ }
+
+ bool askSimplifiedValueForAAValueConstantRange(Attributor &A) {
+ if (!getAssociatedValue().getType()->isIntegerTy())
+ return false;
+
+ const auto &ValueConstantRangeAA =
+ A.getAAFor<AAValueConstantRange>(*this, getIRPosition());
+
+ Optional<ConstantInt *> COpt =
+ ValueConstantRangeAA.getAssumedConstantInt(A);
+ if (COpt.hasValue()) {
+ if (auto *C = COpt.getValue())
+ SimplifiedAssociatedValue = C;
+ else
+ return false;
+ } else {
+ SimplifiedAssociatedValue = llvm::None;
+ }
+ return true;
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ // We can replace the AssociatedValue with the constant.
+ if (!V.user_empty() && &V != C && V.getType() == C->getType()) {
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *C
+ << " :: " << *this << "\n");
+ if (A.changeValueAfterManifest(V, *C))
+ Changed = ChangeStatus::CHANGED;
+ }
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ /// See AbstractState::indicatePessimisticFixpoint(...).
+ ChangeStatus indicatePessimisticFixpoint() override {
+    // NOTE: The associated value will be returned in a pessimistic fixpoint
+    // and is regarded as known. That's why `indicateOptimisticFixpoint` is
+    // called.
+ SimplifiedAssociatedValue = &getAssociatedValue();
+ indicateOptimisticFixpoint();
+ return ChangeStatus::CHANGED;
+ }
+
+protected:
+  // An assumed simplified value. Initially, it is set to Optional::None, which
+  // means that the value is not clear under the current assumption. In the
+  // pessimistic state, getAssumedSimplifiedValue doesn't return this value but
+  // the original associated value.
+ Optional<Value *> SimplifiedAssociatedValue;
+};
+
+struct AAValueSimplifyArgument final : AAValueSimplifyImpl {
+ AAValueSimplifyArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ void initialize(Attributor &A) override {
+ AAValueSimplifyImpl::initialize(A);
+ if (!getAnchorScope() || getAnchorScope()->isDeclaration())
+ indicatePessimisticFixpoint();
+ if (hasAttr({Attribute::InAlloca, Attribute::Preallocated,
+ Attribute::StructRet, Attribute::Nest},
+ /* IgnoreSubsumingPositions */ true))
+ indicatePessimisticFixpoint();
+
+    // FIXME: This is a hack to prevent us from propagating function pointers
+    //        in the new pass manager CGSCC pass as it creates call edges the
+    //        CallGraphUpdater cannot handle yet.
+ Value &V = getAssociatedValue();
+ if (V.getType()->isPointerTy() &&
+ V.getType()->getPointerElementType()->isFunctionTy() &&
+ !A.isModulePass())
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+    // Byval is only replaceable if it is readonly; otherwise we would write
+    // into the replaced value and not the copy that byval creates implicitly.
+ Argument *Arg = getAssociatedArgument();
+ if (Arg->hasByValAttr()) {
+ // TODO: We probably need to verify synchronization is not an issue, e.g.,
+ // there is no race by not copying a constant byval.
+ const auto &MemAA = A.getAAFor<AAMemoryBehavior>(*this, getIRPosition());
+ if (!MemAA.isAssumedReadOnly())
+ return indicatePessimisticFixpoint();
+ }
+
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto PredForCallSite = [&](AbstractCallSite ACS) {
+ const IRPosition &ACSArgPos =
+ IRPosition::callsite_argument(ACS, getArgNo());
+      // Check if a corresponding argument was found or if it is not
+      // associated (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ // We can only propagate thread independent values through callbacks.
+      // This is different from direct/indirect call sites because for them we
+ // know the thread executing the caller and callee is the same. For
+ // callbacks this is not guaranteed, thus a thread dependent value could
+ // be different for the caller and callee, making it invalid to propagate.
+ Value &ArgOp = ACSArgPos.getAssociatedValue();
+ if (ACS.isCallbackCall())
+ if (auto *C = dyn_cast<Constant>(&ArgOp))
+ if (C->isThreadDependent())
+ return false;
+ return checkAndUpdate(A, *this, ArgOp, SimplifiedAssociatedValue);
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(PredForCallSite, *this, true,
+ AllCallSitesKnown))
+ if (!askSimplifiedValueForAAValueConstantRange(A))
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyReturned : AAValueSimplifyImpl {
+ AAValueSimplifyReturned(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto PredForReturned = [&](Value &V) {
+ return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
+ };
+
+ if (!A.checkForAllReturnedValues(PredForReturned, *this))
+ if (!askSimplifiedValueForAAValueConstantRange(A))
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ auto PredForReturned =
+ [&](Value &V, const SmallSetVector<ReturnInst *, 4> &RetInsts) {
+ // We can replace the AssociatedValue with the constant.
+ if (&V == C || V.getType() != C->getType() || isa<UndefValue>(V))
+ return true;
+
+ for (ReturnInst *RI : RetInsts) {
+ if (RI->getFunction() != getAnchorScope())
+ continue;
+ auto *RC = C;
+ if (RC->getType() != RI->getReturnValue()->getType())
+ RC = ConstantExpr::getBitCast(RC,
+ RI->getReturnValue()->getType());
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] " << V << " -> " << *RC
+ << " in " << *RI << " :: " << *this << "\n");
+ if (A.changeUseAfterManifest(RI->getOperandUse(0), *RC))
+ Changed = ChangeStatus::CHANGED;
+ }
+ return true;
+ };
+ A.checkForAllReturnedValuesAndReturnInsts(PredForReturned, *this);
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyFloating : AAValueSimplifyImpl {
+ AAValueSimplifyFloating(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+    // FIXME: This might have exposed an SCC iterator update bug in the old PM.
+ // Needs investigation.
+ // AAValueSimplifyImpl::initialize(A);
+ Value &V = getAnchorValue();
+
+    // TODO: Add other cases.
+ if (isa<Constant>(V))
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ bool HasValueBefore = SimplifiedAssociatedValue.hasValue();
+
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI, bool &,
+ bool Stripped) -> bool {
+ auto &AA = A.getAAFor<AAValueSimplify>(*this, IRPosition::value(V));
+ if (!Stripped && this == &AA) {
+ // TODO: Look the instruction and check recursively.
+
+ LLVM_DEBUG(dbgs() << "[ValueSimplify] Can't be stripped more : " << V
+ << "\n");
+ return false;
+ }
+ return checkAndUpdate(A, *this, V, SimplifiedAssociatedValue);
+ };
+
+ bool Dummy = false;
+ if (!genericValueTraversal<AAValueSimplify, bool>(
+ A, getIRPosition(), *this, Dummy, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ false))
+ if (!askSimplifiedValueForAAValueConstantRange(A))
+ return indicatePessimisticFixpoint();
+
+    // If a candidate was found in this update, return CHANGED.
+    return HasValueBefore == SimplifiedAssociatedValue.hasValue()
+               ? ChangeStatus::UNCHANGED
+               : ChangeStatus::CHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyFunction : AAValueSimplifyImpl {
+ AAValueSimplifyFunction(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ SimplifiedAssociatedValue = &getAnchorValue();
+ indicateOptimisticFixpoint();
+ }
+  /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable(
+ "AAValueSimplify(Function|CallSite)::updateImpl will not be called");
+ }
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FN_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyCallSite : AAValueSimplifyFunction {
+ AAValueSimplifyCallSite(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyFunction(IRP, A) {}
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CS_ATTR(value_simplify)
+ }
+};
+
+struct AAValueSimplifyCallSiteReturned : AAValueSimplifyReturned {
+ AAValueSimplifyCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyReturned(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ return AAValueSimplifyImpl::manifest(A);
+ }
+
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(value_simplify)
+ }
+};
+struct AAValueSimplifyCallSiteArgument : AAValueSimplifyFloating {
+ AAValueSimplifyCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueSimplifyFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ if (SimplifiedAssociatedValue.hasValue() &&
+ !SimplifiedAssociatedValue.getValue())
+ return Changed;
+
+ Value &V = getAssociatedValue();
+ auto *C = SimplifiedAssociatedValue.hasValue()
+ ? dyn_cast<Constant>(SimplifiedAssociatedValue.getValue())
+ : UndefValue::get(V.getType());
+ if (C) {
+ Use &U = cast<CallBase>(&getAnchorValue())->getArgOperandUse(getArgNo());
+ // We can replace the AssociatedValue with the constant.
+ if (&V != C && V.getType() == C->getType()) {
+ if (A.changeUseAfterManifest(U, *C))
+ Changed = ChangeStatus::CHANGED;
+ }
+ }
+
+ return Changed | AAValueSimplify::manifest(A);
+ }
+
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(value_simplify)
+ }
+};
+
+/// ----------------------- Heap-To-Stack Conversion ---------------------------
+struct AAHeapToStackImpl : public AAHeapToStack {
+ AAHeapToStackImpl(const IRPosition &IRP, Attributor &A)
+ : AAHeapToStack(IRP, A) {}
+
+ const std::string getAsStr() const override {
+ return "[H2S] Mallocs: " + std::to_string(MallocCalls.size());
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ assert(getState().isValidState() &&
+ "Attempted to manifest an invalid state!");
+
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+ Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+
+ for (Instruction *MallocCall : MallocCalls) {
+ // This malloc cannot be replaced.
+ if (BadMallocCalls.count(MallocCall))
+ continue;
+
+ for (Instruction *FreeCall : FreesForMalloc[MallocCall]) {
+ LLVM_DEBUG(dbgs() << "H2S: Removing free call: " << *FreeCall << "\n");
+ A.deleteAfterManifest(*FreeCall);
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ LLVM_DEBUG(dbgs() << "H2S: Removing malloc call: " << *MallocCall
+ << "\n");
+
+ Align Alignment;
+ Constant *Size;
+ if (isCallocLikeFn(MallocCall, TLI)) {
+ auto *Num = cast<ConstantInt>(MallocCall->getOperand(0));
+ auto *SizeT = cast<ConstantInt>(MallocCall->getOperand(1));
+ APInt TotalSize = SizeT->getValue() * Num->getValue();
+ Size =
+ ConstantInt::get(MallocCall->getOperand(0)->getType(), TotalSize);
+ } else if (isAlignedAllocLikeFn(MallocCall, TLI)) {
+ Size = cast<ConstantInt>(MallocCall->getOperand(1));
+ Alignment = MaybeAlign(cast<ConstantInt>(MallocCall->getOperand(0))
+ ->getValue()
+ .getZExtValue())
+ .valueOrOne();
+ } else {
+ Size = cast<ConstantInt>(MallocCall->getOperand(0));
+ }
+
+ unsigned AS = cast<PointerType>(MallocCall->getType())->getAddressSpace();
+ Instruction *AI =
+ new AllocaInst(Type::getInt8Ty(F->getContext()), AS, Size, Alignment,
+ "", MallocCall->getNextNode());
+
+ if (AI->getType() != MallocCall->getType())
+ AI = new BitCastInst(AI, MallocCall->getType(), "malloc_bc",
+ AI->getNextNode());
+
+ A.changeValueAfterManifest(*MallocCall, *AI);
+
+ if (auto *II = dyn_cast<InvokeInst>(MallocCall)) {
+ auto *NBB = II->getNormalDest();
+ BranchInst::Create(NBB, MallocCall->getParent());
+ A.deleteAfterManifest(*MallocCall);
+ } else {
+ A.deleteAfterManifest(*MallocCall);
+ }
+
+ // Zero out the allocated memory if it was a calloc.
+ if (isCallocLikeFn(MallocCall, TLI)) {
+ auto *BI = new BitCastInst(AI, MallocCall->getType(), "calloc_bc",
+ AI->getNextNode());
+ Value *Ops[] = {
+ BI, ConstantInt::get(F->getContext(), APInt(8, 0, false)), Size,
+ ConstantInt::get(Type::getInt1Ty(F->getContext()), false)};
+
+ Type *Tys[] = {BI->getType(), MallocCall->getOperand(0)->getType()};
+ Module *M = F->getParent();
+ Function *Fn = Intrinsic::getDeclaration(M, Intrinsic::memset, Tys);
+ CallInst::Create(Fn, Ops, "", BI->getNextNode());
+ }
+ HasChanged = ChangeStatus::CHANGED;
+ }
+
+ return HasChanged;
+ }
+
+ /// Collection of all malloc calls in a function.
+ SmallSetVector<Instruction *, 4> MallocCalls;
+
+ /// Collection of malloc calls that cannot be converted.
+ DenseSet<const Instruction *> BadMallocCalls;
+
+ /// A map for each malloc call to the set of associated free calls.
+ DenseMap<Instruction *, SmallPtrSet<Instruction *, 4>> FreesForMalloc;
+
+ ChangeStatus updateImpl(Attributor &A) override;
+};
+
+ChangeStatus AAHeapToStackImpl::updateImpl(Attributor &A) {
+ const Function *F = getAnchorScope();
+ const auto *TLI = A.getInfoCache().getTargetLibraryInfoForFunction(*F);
+
+ MustBeExecutedContextExplorer &Explorer =
+ A.getInfoCache().getMustBeExecutedContextExplorer();
+
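+  // An allocation also qualifies if it has exactly one associated free call
+  // and that free is guaranteed to execute whenever the allocation does, i.e.,
+  // the free lies in the must-be-executed context following the allocation.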
+ auto FreeCheck = [&](Instruction &I) {
+ const auto &Frees = FreesForMalloc.lookup(&I);
+ if (Frees.size() != 1)
+ return false;
+ Instruction *UniqueFree = *Frees.begin();
+ return Explorer.findInContextOf(UniqueFree, I.getNextNode());
+ };
+
+ auto UsesCheck = [&](Instruction &I) {
+ bool ValidUsesOnly = true;
+ bool MustUse = true;
+ auto Pred = [&](const Use &U, bool &Follow) -> bool {
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ if (isa<LoadInst>(UserI))
+ return true;
+ if (auto *SI = dyn_cast<StoreInst>(UserI)) {
+ if (SI->getValueOperand() == U.get()) {
+ LLVM_DEBUG(dbgs()
+ << "[H2S] escaping store to memory: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ } else {
+ // A store into the malloc'ed memory is fine.
+ }
+ return true;
+ }
+ if (auto *CB = dyn_cast<CallBase>(UserI)) {
+ if (!CB->isArgOperand(&U) || CB->isLifetimeStartOrEnd())
+ return true;
+        // Record the free call associated with this malloc.
+ if (isFreeCall(UserI, TLI)) {
+ if (MustUse) {
+ FreesForMalloc[&I].insert(UserI);
+ } else {
+ LLVM_DEBUG(dbgs() << "[H2S] free potentially on different mallocs: "
+ << *UserI << "\n");
+ ValidUsesOnly = false;
+ }
+ return true;
+ }
+
+ unsigned ArgNo = CB->getArgOperandNo(&U);
+
+ const auto &NoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ // If a callsite argument use is nofree, we are fine.
+ const auto &ArgNoFreeAA = A.getAAFor<AANoFree>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo));
+
+ if (!NoCaptureAA.isAssumedNoCapture() ||
+ !ArgNoFreeAA.isAssumedNoFree()) {
+ LLVM_DEBUG(dbgs() << "[H2S] Bad user: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ }
+ return true;
+ }
+
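+      // GEPs and bitcasts keep pointing at the same allocation; PHIs and
+      // selects may mix in other pointers, so after those a reaching free is
+      // no longer guaranteed to act on this allocation (tracked via MustUse).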
+ if (isa<GetElementPtrInst>(UserI) || isa<BitCastInst>(UserI) ||
+ isa<PHINode>(UserI) || isa<SelectInst>(UserI)) {
+ MustUse &= !(isa<PHINode>(UserI) || isa<SelectInst>(UserI));
+ Follow = true;
+ return true;
+ }
+      // Unknown user for which we cannot track uses further (in a way that
+      // makes sense).
+ LLVM_DEBUG(dbgs() << "[H2S] Unknown user: " << *UserI << "\n");
+ ValidUsesOnly = false;
+ return true;
+ };
+ A.checkForAllUses(Pred, *this, I);
+ return ValidUsesOnly;
+ };
+
+ auto MallocCallocCheck = [&](Instruction &I) {
+ if (BadMallocCalls.count(&I))
+ return true;
+
+ bool IsMalloc = isMallocLikeFn(&I, TLI);
+ bool IsAlignedAllocLike = isAlignedAllocLikeFn(&I, TLI);
+ bool IsCalloc = !IsMalloc && isCallocLikeFn(&I, TLI);
+ if (!IsMalloc && !IsAlignedAllocLike && !IsCalloc) {
+ BadMallocCalls.insert(&I);
+ return true;
+ }
+
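+    // Only allocations with a statically known size of at most
+    // MaxHeapToStackSize are candidates. Note that the size operand differs:
+    // it is operand 0 for malloc and operand 1 for aligned_alloc and calloc
+    // (for calloc it is additionally multiplied by the element count).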
+ if (IsMalloc) {
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(0)))
+ if (Size->getValue().ule(MaxHeapToStackSize))
+ if (UsesCheck(I) || FreeCheck(I)) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ } else if (IsAlignedAllocLike && isa<ConstantInt>(I.getOperand(0))) {
+ // Only if the alignment and sizes are constant.
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if (Size->getValue().ule(MaxHeapToStackSize))
+ if (UsesCheck(I) || FreeCheck(I)) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ } else if (IsCalloc) {
+ bool Overflow = false;
+ if (auto *Num = dyn_cast<ConstantInt>(I.getOperand(0)))
+ if (auto *Size = dyn_cast<ConstantInt>(I.getOperand(1)))
+ if ((Size->getValue().umul_ov(Num->getValue(), Overflow))
+ .ule(MaxHeapToStackSize))
+ if (!Overflow && (UsesCheck(I) || FreeCheck(I))) {
+ MallocCalls.insert(&I);
+ return true;
+ }
+ }
+
+ BadMallocCalls.insert(&I);
+ return true;
+ };
+
+ size_t NumBadMallocs = BadMallocCalls.size();
+
+ A.checkForAllCallLikeInstructions(MallocCallocCheck, *this);
+
+ if (NumBadMallocs != BadMallocCalls.size())
+ return ChangeStatus::CHANGED;
+
+ return ChangeStatus::UNCHANGED;
+}
+
+struct AAHeapToStackFunction final : public AAHeapToStackImpl {
+ AAHeapToStackFunction(const IRPosition &IRP, Attributor &A)
+ : AAHeapToStackImpl(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics().
+ void trackStatistics() const override {
+ STATS_DECL(
+ MallocCalls, Function,
+ "Number of malloc/calloc/aligned_alloc calls converted to allocas");
+ for (auto *C : MallocCalls)
+ if (!BadMallocCalls.count(C))
+ ++BUILD_STAT_NAME(MallocCalls, Function);
+ }
+};
+
+/// ----------------------- Privatizable Pointers ------------------------------
+struct AAPrivatizablePtrImpl : public AAPrivatizablePtr {
+ AAPrivatizablePtrImpl(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtr(IRP, A), PrivatizableType(llvm::None) {}
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ AAPrivatizablePtr::indicatePessimisticFixpoint();
+ PrivatizableType = nullptr;
+ return ChangeStatus::CHANGED;
+ }
+
+  /// Identify the type we can choose for a private copy of the underlying
+  /// argument. None means it is not clear yet, nullptr means there is none.
+ virtual Optional<Type *> identifyPrivatizableType(Attributor &A) = 0;
+
+ /// Return a privatizable type that encloses both T0 and T1.
+ /// TODO: This is merely a stub for now as we should manage a mapping as well.
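+  /// For example, combining None with T yields T, combining T with itself
+  /// yields T, and combining two distinct types yields nullptr (i.e., not
+  /// privatizable).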
+ Optional<Type *> combineTypes(Optional<Type *> T0, Optional<Type *> T1) {
+ if (!T0.hasValue())
+ return T1;
+ if (!T1.hasValue())
+ return T0;
+ if (T0 == T1)
+ return T0;
+ return nullptr;
+ }
+
+ Optional<Type *> getPrivatizableType() const override {
+ return PrivatizableType;
+ }
+
+ const std::string getAsStr() const override {
+ return isAssumedPrivatizablePtr() ? "[priv]" : "[no-priv]";
+ }
+
+protected:
+ Optional<Type *> PrivatizableType;
+};
+
+// TODO: Do this for call site arguments (probably also other values) as well.
+
+struct AAPrivatizablePtrArgument final : public AAPrivatizablePtrImpl {
+ AAPrivatizablePtrArgument(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrImpl(IRP, A) {}
+
+ /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
+ Optional<Type *> identifyPrivatizableType(Attributor &A) override {
+ // If this is a byval argument and we know all the call sites (so we can
+ // rewrite them), there is no need to check them explicitly.
+ bool AllCallSitesKnown;
+ if (getIRPosition().hasAttr(Attribute::ByVal) &&
+ A.checkForAllCallSites([](AbstractCallSite ACS) { return true; }, *this,
+ true, AllCallSitesKnown))
+ return getAssociatedValue().getType()->getPointerElementType();
+
+ Optional<Type *> Ty;
+ unsigned ArgNo = getIRPosition().getArgNo();
+
+    // Make sure the associated call site argument has the same type at all
+    // call sites and that it is an allocation we know is safe to privatize;
+    // for now that means we only allow alloca instructions.
+ // TODO: We can additionally analyze the accesses in the callee to create
+ // the type from that information instead. That is a little more
+ // involved and will be done in a follow up patch.
+ auto CallSiteCheck = [&](AbstractCallSite ACS) {
+ IRPosition ACSArgPos = IRPosition::callsite_argument(ACS, ArgNo);
+      // Check if a corresponding argument was found or if it is one not
+      // associated (which can happen for callback calls).
+ if (ACSArgPos.getPositionKind() == IRPosition::IRP_INVALID)
+ return false;
+
+ // Check that all call sites agree on a type.
+ auto &PrivCSArgAA = A.getAAFor<AAPrivatizablePtr>(*this, ACSArgPos);
+ Optional<Type *> CSTy = PrivCSArgAA.getPrivatizableType();
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] ACSPos: " << ACSArgPos << ", CSTy: ";
+ if (CSTy.hasValue() && CSTy.getValue())
+ CSTy.getValue()->print(dbgs());
+ else if (CSTy.hasValue())
+ dbgs() << "<nullptr>";
+ else
+ dbgs() << "<none>";
+ });
+
+ Ty = combineTypes(Ty, CSTy);
+
+ LLVM_DEBUG({
+ dbgs() << " : New Type: ";
+ if (Ty.hasValue() && Ty.getValue())
+ Ty.getValue()->print(dbgs());
+ else if (Ty.hasValue())
+ dbgs() << "<nullptr>";
+ else
+ dbgs() << "<none>";
+ dbgs() << "\n";
+ });
+
+ return !Ty.hasValue() || Ty.getValue();
+ };
+
+ if (!A.checkForAllCallSites(CallSiteCheck, *this, true, AllCallSitesKnown))
+ return nullptr;
+ return Ty;
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ PrivatizableType = identifyPrivatizableType(A);
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ if (!PrivatizableType.getValue())
+ return indicatePessimisticFixpoint();
+
+ // The dependence is optional so we don't give up once we give up on the
+ // alignment.
+ A.getAAFor<AAAlign>(*this, IRPosition::value(getAssociatedValue()),
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+
+ // Avoid arguments with padding for now.
+ if (!getIRPosition().hasAttr(Attribute::ByVal) &&
+ !ArgumentPromotionPass::isDenselyPacked(PrivatizableType.getValue(),
+ A.getInfoCache().getDL())) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Padding detected\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ // Verify callee and caller agree on how the promoted argument would be
+ // passed.
+ // TODO: The use of the ArgumentPromotion interface here is ugly, we need a
+ // specialized form of TargetTransformInfo::areFunctionArgsABICompatible
+ // which doesn't require the arguments ArgumentPromotion wanted to pass.
+ Function &Fn = *getIRPosition().getAnchorScope();
+ SmallPtrSet<Argument *, 1> ArgsToPromote, Dummy;
+ ArgsToPromote.insert(getAssociatedArgument());
+ const auto *TTI =
+ A.getInfoCache().getAnalysisResultForFunction<TargetIRAnalysis>(Fn);
+ if (!TTI ||
+ !ArgumentPromotionPass::areFunctionArgsABICompatible(
+ Fn, *TTI, ArgsToPromote, Dummy) ||
+ ArgsToPromote.empty()) {
+ LLVM_DEBUG(
+ dbgs() << "[AAPrivatizablePtr] ABI incompatibility detected for "
+ << Fn.getName() << "\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
+ // Register a rewrite of the argument.
+ Argument *Arg = getAssociatedArgument();
+ if (!A.isValidFunctionSignatureRewrite(*Arg, ReplacementTypes)) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Rewrite not valid\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ unsigned ArgNo = Arg->getArgNo();
+
+    // Helper to check whether, for the given call site, the associated
+    // argument is passed to a callback where the privatization would be
+    // different.
+ auto IsCompatiblePrivArgOfCallback = [&](CallBase &CB) {
+ SmallVector<const Use *, 4> CallbackUses;
+ AbstractCallSite::getCallbackUses(CB, CallbackUses);
+ for (const Use *U : CallbackUses) {
+ AbstractCallSite CBACS(U);
+ assert(CBACS && CBACS.isCallbackCall());
+ for (Argument &CBArg : CBACS.getCalledFunction()->args()) {
+ int CBArgNo = CBACS.getCallArgOperandNo(CBArg);
+
+ LLVM_DEBUG({
+ dbgs()
+ << "[AAPrivatizablePtr] Argument " << *Arg
+ << "check if can be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "callback ("
+ << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
+ << ")\n[AAPrivatizablePtr] " << CBArg << " : "
+ << CBACS.getCallArgOperand(CBArg) << " vs "
+ << CB.getArgOperand(ArgNo) << "\n"
+ << "[AAPrivatizablePtr] " << CBArg << " : "
+ << CBACS.getCallArgOperandNo(CBArg) << " vs " << ArgNo << "\n";
+ });
+
+ if (CBArgNo != int(ArgNo))
+ continue;
+ const auto &CBArgPrivAA =
+ A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(CBArg));
+ if (CBArgPrivAA.isValidState()) {
+ auto CBArgPrivTy = CBArgPrivAA.getPrivatizableType();
+ if (!CBArgPrivTy.hasValue())
+ continue;
+ if (CBArgPrivTy.getValue() == PrivatizableType)
+ continue;
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " cannot be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "callback ("
+ << CBArgNo << "@" << CBACS.getCalledFunction()->getName()
+ << ").\n[AAPrivatizablePtr] for which the argument "
+ "privatization is not compatible.\n";
+ });
+ return false;
+ }
+ }
+ return true;
+ };
+
+    // Helper to check whether, for the given call site, the associated
+    // argument is passed to a direct call where the privatization would be
+    // different.
+ auto IsCompatiblePrivArgOfDirectCS = [&](AbstractCallSite ACS) {
+ CallBase *DC = cast<CallBase>(ACS.getInstruction());
+ int DCArgNo = ACS.getCallArgOperandNo(ArgNo);
+ assert(DCArgNo >= 0 && unsigned(DCArgNo) < DC->getNumArgOperands() &&
+ "Expected a direct call operand for callback call operand");
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " check if be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "direct call of ("
+ << DCArgNo << "@" << DC->getCalledFunction()->getName()
+ << ").\n";
+ });
+
+ Function *DCCallee = DC->getCalledFunction();
+ if (unsigned(DCArgNo) < DCCallee->arg_size()) {
+ const auto &DCArgPrivAA = A.getAAFor<AAPrivatizablePtr>(
+ *this, IRPosition::argument(*DCCallee->getArg(DCArgNo)));
+ if (DCArgPrivAA.isValidState()) {
+ auto DCArgPrivTy = DCArgPrivAA.getPrivatizableType();
+ if (!DCArgPrivTy.hasValue())
+ return true;
+ if (DCArgPrivTy.getValue() == PrivatizableType)
+ return true;
+ }
+ }
+
+ LLVM_DEBUG({
+ dbgs() << "[AAPrivatizablePtr] Argument " << *Arg
+ << " cannot be privatized in the context of its parent ("
+ << Arg->getParent()->getName()
+ << ")\n[AAPrivatizablePtr] because it is an argument in a "
+ "direct call of ("
+ << ACS.getInstruction()->getCalledFunction()->getName()
+ << ").\n[AAPrivatizablePtr] for which the argument "
+ "privatization is not compatible.\n";
+ });
+ return false;
+ };
+
+ // Helper to check if the associated argument is used at the given abstract
+ // call site in a way that is incompatible with the privatization assumed
+ // here.
+ auto IsCompatiblePrivArgOfOtherCallSite = [&](AbstractCallSite ACS) {
+ if (ACS.isDirectCall())
+ return IsCompatiblePrivArgOfCallback(*ACS.getInstruction());
+ if (ACS.isCallbackCall())
+ return IsCompatiblePrivArgOfDirectCS(ACS);
+ return false;
+ };
+
+ bool AllCallSitesKnown;
+ if (!A.checkForAllCallSites(IsCompatiblePrivArgOfOtherCallSite, *this, true,
+ AllCallSitesKnown))
+ return indicatePessimisticFixpoint();
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+  /// Given a type to privatize \p PrivType, collect the constituents (which
+  /// are used) in \p ReplacementTypes.
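+  ///
+  /// For illustration (types chosen arbitrarily): a struct type `{ i32, i64 }`
+  /// is expanded into the replacement types `i32` and `i64`, an array type
+  /// `[4 x float]` into four `float` entries, and any other type is kept as a
+  /// single replacement type.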
+ static void
+ identifyReplacementTypes(Type *PrivType,
+ SmallVectorImpl<Type *> &ReplacementTypes) {
+ // TODO: For now we expand the privatization type to the fullest which can
+ // lead to dead arguments that need to be removed later.
+ assert(PrivType && "Expected privatizable type!");
+
+    // Traverse the type, extract constituent types on the outermost level.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++)
+ ReplacementTypes.push_back(PrivStructType->getElementType(u));
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ ReplacementTypes.append(PrivArrayType->getNumElements(),
+ PrivArrayType->getElementType());
+ } else {
+ ReplacementTypes.push_back(PrivType);
+ }
+ }
+
+ /// Initialize \p Base according to the type \p PrivType at position \p IP.
+ /// The values needed are taken from the arguments of \p F starting at
+ /// position \p ArgNo.
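+  ///
+  /// Rough sketch for a struct type `{ i32, i64 }` (names are illustrative
+  /// only): a pointer to each element of \p Base is constructed at its
+  /// struct-layout offset and the matching new argument is stored to it:
+  ///   %f0 = <pointer into %base at offset 0>
+  ///   store i32 %new.arg.0, i32* %f0
+  ///   %f1 = <pointer into %base at offset of the second element>
+  ///   store i64 %new.arg.1, i64* %f1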
+ static void createInitialization(Type *PrivType, Value &Base, Function &F,
+ unsigned ArgNo, Instruction &IP) {
+ assert(PrivType && "Expected privatizable type!");
+
+ IRBuilder<NoFolder> IRB(&IP);
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ // Traverse the type, build GEPs and stores.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
+ Type *PointeeTy = PrivStructType->getElementType(u)->getPointerTo();
+ Value *Ptr = constructPointer(
+ PointeeTy, &Base, PrivStructLayout->getElementOffset(u), IRB, DL);
+ new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
+ }
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ Type *PointeePtrTy = PrivArrayType->getElementType()->getPointerTo();
+ uint64_t PointeeTySize = DL.getTypeStoreSize(PointeePtrTy);
+ for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
+ Value *Ptr =
+ constructPointer(PointeePtrTy, &Base, u * PointeeTySize, IRB, DL);
+ new StoreInst(F.getArg(ArgNo + u), Ptr, &IP);
+ }
+ } else {
+ new StoreInst(F.getArg(ArgNo), &Base, &IP);
+ }
+ }
+
+ /// Extract values from \p Base according to the type \p PrivType at the
+ /// call position \p ACS. The values are appended to \p ReplacementValues.
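+  ///
+  /// This mirrors createInitialization: instead of stores in the callee, a
+  /// load is emitted right before the call site for each outermost constituent
+  /// of \p PrivType, and the loaded values become the new call arguments.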
+ void createReplacementValues(Align Alignment, Type *PrivType,
+ AbstractCallSite ACS, Value *Base,
+ SmallVectorImpl<Value *> &ReplacementValues) {
+ assert(Base && "Expected base value!");
+ assert(PrivType && "Expected privatizable type!");
+ Instruction *IP = ACS.getInstruction();
+
+ IRBuilder<NoFolder> IRB(IP);
+ const DataLayout &DL = IP->getModule()->getDataLayout();
+
+ if (Base->getType()->getPointerElementType() != PrivType)
+ Base = BitCastInst::CreateBitOrPointerCast(Base, PrivType->getPointerTo(),
+ "", ACS.getInstruction());
+
+ // Traverse the type, build GEPs and loads.
+ if (auto *PrivStructType = dyn_cast<StructType>(PrivType)) {
+ const StructLayout *PrivStructLayout = DL.getStructLayout(PrivStructType);
+ for (unsigned u = 0, e = PrivStructType->getNumElements(); u < e; u++) {
+ Type *PointeeTy = PrivStructType->getElementType(u);
+ Value *Ptr =
+ constructPointer(PointeeTy->getPointerTo(), Base,
+ PrivStructLayout->getElementOffset(u), IRB, DL);
+ LoadInst *L = new LoadInst(PointeeTy, Ptr, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ } else if (auto *PrivArrayType = dyn_cast<ArrayType>(PrivType)) {
+ Type *PointeeTy = PrivArrayType->getElementType();
+ uint64_t PointeeTySize = DL.getTypeStoreSize(PointeeTy);
+ Type *PointeePtrTy = PointeeTy->getPointerTo();
+ for (unsigned u = 0, e = PrivArrayType->getNumElements(); u < e; u++) {
+ Value *Ptr =
+ constructPointer(PointeePtrTy, Base, u * PointeeTySize, IRB, DL);
+ LoadInst *L = new LoadInst(PointeePtrTy, Ptr, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ } else {
+ LoadInst *L = new LoadInst(PrivType, Base, "", IP);
+ L->setAlignment(Alignment);
+ ReplacementValues.push_back(L);
+ }
+ }
+
+ /// See AbstractAttribute::manifest(...)
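+  ///
+  /// Overall effect (sketch with hypothetical names): a privatizable argument
+  /// `%struct.S* %p` with `%struct.S = type { i32, i64 }` is replaced by two
+  /// scalar arguments `i32, i64`; every call site loads the two fields and
+  /// passes them explicitly, while the callee rebuilds the struct in a fresh
+  /// alloca `%p.priv` and uses that instead of the old pointer argument.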
+ ChangeStatus manifest(Attributor &A) override {
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ assert(PrivatizableType.getValue() && "Expected privatizable type!");
+
+ // Collect all tail calls in the function as we cannot allow new allocas to
+ // escape into tail recursion.
+ // TODO: Be smarter about new allocas escaping into tail calls.
+ SmallVector<CallInst *, 16> TailCalls;
+ if (!A.checkForAllInstructions(
+ [&](Instruction &I) {
+ CallInst &CI = cast<CallInst>(I);
+ if (CI.isTailCall())
+ TailCalls.push_back(&CI);
+ return true;
+ },
+ *this, {Instruction::Call}))
+ return ChangeStatus::UNCHANGED;
+
+ Argument *Arg = getAssociatedArgument();
+ // Query AAAlign attribute for alignment of associated argument to
+ // determine the best alignment of loads.
+ const auto &AlignAA = A.getAAFor<AAAlign>(*this, IRPosition::value(*Arg));
+
+    // Callback to repair the associated function. A new alloca is placed at
+    // the beginning of the entry block and initialized with the values passed
+    // through arguments. The new alloca replaces the uses of the old pointer
+    // argument.
+ Attributor::ArgumentReplacementInfo::CalleeRepairCBTy FnRepairCB =
+ [=](const Attributor::ArgumentReplacementInfo &ARI,
+ Function &ReplacementFn, Function::arg_iterator ArgIt) {
+ BasicBlock &EntryBB = ReplacementFn.getEntryBlock();
+ Instruction *IP = &*EntryBB.getFirstInsertionPt();
+ auto *AI = new AllocaInst(PrivatizableType.getValue(), 0,
+ Arg->getName() + ".priv", IP);
+ createInitialization(PrivatizableType.getValue(), *AI, ReplacementFn,
+ ArgIt->getArgNo(), *IP);
+ Arg->replaceAllUsesWith(AI);
+
+ for (CallInst *CI : TailCalls)
+ CI->setTailCall(false);
+ };
+
+ // Callback to repair a call site of the associated function. The elements
+ // of the privatizable type are loaded prior to the call and passed to the
+ // new function version.
+ Attributor::ArgumentReplacementInfo::ACSRepairCBTy ACSRepairCB =
+ [=, &AlignAA](const Attributor::ArgumentReplacementInfo &ARI,
+ AbstractCallSite ACS,
+ SmallVectorImpl<Value *> &NewArgOperands) {
+ // When no alignment is specified for the load instruction,
+ // natural alignment is assumed.
+ createReplacementValues(
+ assumeAligned(AlignAA.getAssumedAlign()),
+ PrivatizableType.getValue(), ACS,
+ ACS.getCallArgOperand(ARI.getReplacedArg().getArgNo()),
+ NewArgOperands);
+ };
+
+ // Collect the types that will replace the privatizable type in the function
+ // signature.
+ SmallVector<Type *, 16> ReplacementTypes;
+ identifyReplacementTypes(PrivatizableType.getValue(), ReplacementTypes);
+
+ // Register a rewrite of the argument.
+ if (A.registerFunctionSignatureRewrite(*Arg, ReplacementTypes,
+ std::move(FnRepairCB),
+ std::move(ACSRepairCB)))
+ return ChangeStatus::CHANGED;
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrFloating : public AAPrivatizablePtrImpl {
+ AAPrivatizablePtrFloating(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ virtual void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAPrivatizablePtr(Floating|Returned|CallSiteReturned)::"
+ "updateImpl will not be called");
+ }
+
+ /// See AAPrivatizablePtrImpl::identifyPrivatizableType(...)
+ Optional<Type *> identifyPrivatizableType(Attributor &A) override {
+ Value *Obj =
+ GetUnderlyingObject(&getAssociatedValue(), A.getInfoCache().getDL());
+ if (!Obj) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] No underlying object found!\n");
+ return nullptr;
+ }
+
+ if (auto *AI = dyn_cast<AllocaInst>(Obj))
+ if (auto *CI = dyn_cast<ConstantInt>(AI->getArraySize()))
+ if (CI->isOne())
+ return Obj->getType()->getPointerElementType();
+ if (auto *Arg = dyn_cast<Argument>(Obj)) {
+ auto &PrivArgAA =
+ A.getAAFor<AAPrivatizablePtr>(*this, IRPosition::argument(*Arg));
+ if (PrivArgAA.isAssumedPrivatizablePtr())
+ return Obj->getType()->getPointerElementType();
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] Underlying object neither valid "
+ "alloca nor privatizable argument: "
+ << *Obj << "!\n");
+ return nullptr;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrCallSiteArgument final
+ : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (getIRPosition().hasAttr(Attribute::ByVal))
+ indicateOptimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ PrivatizableType = identifyPrivatizableType(A);
+ if (!PrivatizableType.hasValue())
+ return ChangeStatus::UNCHANGED;
+ if (!PrivatizableType.getValue())
+ return indicatePessimisticFixpoint();
+
+ const IRPosition &IRP = getIRPosition();
+ auto &NoCaptureAA = A.getAAFor<AANoCapture>(*this, IRP);
+ if (!NoCaptureAA.isAssumedNoCapture()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might be captured!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ auto &NoAliasAA = A.getAAFor<AANoAlias>(*this, IRP);
+ if (!NoAliasAA.isAssumedNoAlias()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer might alias!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(*this, IRP);
+ if (!MemBehaviorAA.isAssumedReadOnly()) {
+ LLVM_DEBUG(dbgs() << "[AAPrivatizablePtr] pointer is written!\n");
+ return indicatePessimisticFixpoint();
+ }
+
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrCallSiteReturned final
+ : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(privatizable_ptr);
+ }
+};
+
+struct AAPrivatizablePtrReturned final : public AAPrivatizablePtrFloating {
+ AAPrivatizablePtrReturned(const IRPosition &IRP, Attributor &A)
+ : AAPrivatizablePtrFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // TODO: We can privatize more than arguments.
+ indicatePessimisticFixpoint();
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(privatizable_ptr);
+ }
+};
+
+/// -------------------- Memory Behavior Attributes ----------------------------
+/// Includes read-none, read-only, and write-only.
+/// ----------------------------------------------------------------------------
+struct AAMemoryBehaviorImpl : public AAMemoryBehavior {
+ AAMemoryBehaviorImpl(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehavior(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ getKnownStateFromValue(getIRPosition(), getState());
+ IRAttribute::initialize(A);
+ }
+
+ /// Return the memory behavior information encoded in the IR for \p IRP.
+ static void getKnownStateFromValue(const IRPosition &IRP,
+ BitIntegerState &State,
+ bool IgnoreSubsumingPositions = false) {
+ SmallVector<Attribute, 2> Attrs;
+ IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
+ for (const Attribute &Attr : Attrs) {
+ switch (Attr.getKindAsEnum()) {
+ case Attribute::ReadNone:
+ State.addKnownBits(NO_ACCESSES);
+ break;
+ case Attribute::ReadOnly:
+ State.addKnownBits(NO_WRITES);
+ break;
+ case Attribute::WriteOnly:
+ State.addKnownBits(NO_READS);
+ break;
+ default:
+ llvm_unreachable("Unexpected attribute!");
+ }
+ }
+
+ if (auto *I = dyn_cast<Instruction>(&IRP.getAnchorValue())) {
+ if (!I->mayReadFromMemory())
+ State.addKnownBits(NO_READS);
+ if (!I->mayWriteToMemory())
+ State.addKnownBits(NO_WRITES);
+ }
+ }
+
+ /// See AbstractAttribute::getDeducedAttributes(...).
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ assert(Attrs.size() == 0);
+ if (isAssumedReadNone())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
+ else if (isAssumedReadOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadOnly));
+ else if (isAssumedWriteOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::WriteOnly));
+ assert(Attrs.size() <= 1);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ if (hasAttr(Attribute::ReadNone, /* IgnoreSubsumingPositions */ true))
+ return ChangeStatus::UNCHANGED;
+
+ const IRPosition &IRP = getIRPosition();
+
+ // Check if we would improve the existing attributes first.
+ SmallVector<Attribute, 4> DeducedAttrs;
+ getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
+ if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
+ return IRP.hasAttr(Attr.getKindAsEnum(),
+ /* IgnoreSubsumingPositions */ true);
+ }))
+ return ChangeStatus::UNCHANGED;
+
+ // Clear existing attributes.
+ IRP.removeAttrs(AttrKinds);
+
+ // Use the generic manifest method.
+ return IRAttribute::manifest(A);
+ }
+
+ /// See AbstractState::getAsStr().
+ const std::string getAsStr() const override {
+ if (isAssumedReadNone())
+ return "readnone";
+ if (isAssumedReadOnly())
+ return "readonly";
+ if (isAssumedWriteOnly())
+ return "writeonly";
+ return "may-read/write";
+ }
+
+ /// The set of IR attributes AAMemoryBehavior deals with.
+ static const Attribute::AttrKind AttrKinds[3];
+};
+
+const Attribute::AttrKind AAMemoryBehaviorImpl::AttrKinds[] = {
+ Attribute::ReadNone, Attribute::ReadOnly, Attribute::WriteOnly};
+
+/// Memory behavior attribute for a floating value.
+struct AAMemoryBehaviorFloating : AAMemoryBehaviorImpl {
+ AAMemoryBehaviorFloating(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryBehaviorImpl::initialize(A);
+ // Initialize the use vector with all direct uses of the associated value.
+ for (const Use &U : getAssociatedValue().uses())
+ Uses.insert(&U);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FLOATING_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_FLOATING_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_FLOATING_ATTR(writeonly)
+ }
+
+private:
+ /// Return true if users of \p UserI might access the underlying
+ /// variable/location described by \p U and should therefore be analyzed.
+ bool followUsersOfUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI);
+
+ /// Update the state according to the effect of use \p U in \p UserI.
+ void analyzeUseIn(Attributor &A, const Use *U, const Instruction *UserI);
+
+protected:
+ /// Container for (transitive) uses of the associated argument.
+ SetVector<const Use *> Uses;
+};
+
+/// Memory behavior attribute for function argument.
+struct AAMemoryBehaviorArgument : AAMemoryBehaviorFloating {
+ AAMemoryBehaviorArgument(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorFloating(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ const IRPosition &IRP = getIRPosition();
+ // TODO: Make IgnoreSubsumingPositions a property of an IRAttribute so we
+ // can query it when we use has/getAttr. That would allow us to reuse the
+ // initialize of the base class here.
+ bool HasByVal =
+ IRP.hasAttr({Attribute::ByVal}, /* IgnoreSubsumingPositions */ true);
+ getKnownStateFromValue(IRP, getState(),
+ /* IgnoreSubsumingPositions */ HasByVal);
+
+ // Initialize the use vector with all direct uses of the associated value.
+ Argument *Arg = getAssociatedArgument();
+ if (!Arg || !A.isFunctionIPOAmendable(*(Arg->getParent()))) {
+ indicatePessimisticFixpoint();
+ } else {
+ // Initialize the use vector with all direct uses of the associated value.
+ for (const Use &U : Arg->uses())
+ Uses.insert(&U);
+ }
+ }
+
+ ChangeStatus manifest(Attributor &A) override {
+ // TODO: Pointer arguments are not supported on vectors of pointers yet.
+ if (!getAssociatedValue().getType()->isPointerTy())
+ return ChangeStatus::UNCHANGED;
+
+ // TODO: From readattrs.ll: "inalloca parameters are always
+ // considered written"
+ if (hasAttr({Attribute::InAlloca, Attribute::Preallocated})) {
+ removeKnownBits(NO_WRITES);
+ removeAssumedBits(NO_WRITES);
+ }
+ return AAMemoryBehaviorFloating::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_ARG_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_ARG_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_ARG_ATTR(writeonly)
+ }
+};
+
+struct AAMemoryBehaviorCallSiteArgument final : AAMemoryBehaviorArgument {
+ AAMemoryBehaviorCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorArgument(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ if (Argument *Arg = getAssociatedArgument()) {
+ if (Arg->hasByValAttr()) {
+ addKnownBits(NO_WRITES);
+ removeKnownBits(NO_READS);
+ removeAssumedBits(NO_READS);
+ }
+ }
+ AAMemoryBehaviorArgument::initialize(A);
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+    // TODO: Once we have call site specific value information we can provide
+    // call site specific liveness information and then it makes sense to
+    // specialize attributes for call site arguments instead of redirecting
+    // requests to the callee argument.
+ Argument *Arg = getAssociatedArgument();
+ const IRPosition &ArgPos = IRPosition::argument(*Arg);
+ auto &ArgAA = A.getAAFor<AAMemoryBehavior>(*this, ArgPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AAMemoryBehavior::StateType &>(ArgAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CSARG_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_CSARG_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_CSARG_ATTR(writeonly)
+ }
+};
+
+/// Memory behavior attribute for a call site return position.
+struct AAMemoryBehaviorCallSiteReturned final : AAMemoryBehaviorFloating {
+ AAMemoryBehaviorCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorFloating(IRP, A) {}
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ // We do not annotate returned values.
+ return ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {}
+};
+
+/// An AA to represent the memory behavior function attributes.
+struct AAMemoryBehaviorFunction final : public AAMemoryBehaviorImpl {
+ AAMemoryBehaviorFunction(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override;
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ Function &F = cast<Function>(getAnchorValue());
+ if (isAssumedReadNone()) {
+ F.removeFnAttr(Attribute::ArgMemOnly);
+ F.removeFnAttr(Attribute::InaccessibleMemOnly);
+ F.removeFnAttr(Attribute::InaccessibleMemOrArgMemOnly);
+ }
+ return AAMemoryBehaviorImpl::manifest(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FN_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_FN_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_FN_ATTR(writeonly)
+ }
+};
+
+/// AAMemoryBehavior attribute for call sites.
+struct AAMemoryBehaviorCallSite final : AAMemoryBehaviorImpl {
+ AAMemoryBehaviorCallSite(const IRPosition &IRP, Attributor &A)
+ : AAMemoryBehaviorImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryBehaviorImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F || !A.isFunctionIPOAmendable(*F)) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+    // TODO: Once we have call site specific value information we can provide
+    // call site specific liveness information and then it makes sense to
+    // specialize attributes for call site arguments instead of redirecting
+    // requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAMemoryBehavior>(*this, FnPos);
+ return clampStateAndIndicateChange(
+ getState(),
+ static_cast<const AAMemoryBehavior::StateType &>(FnAA.getState()));
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CS_ATTR(readnone)
+ else if (isAssumedReadOnly())
+ STATS_DECLTRACK_CS_ATTR(readonly)
+ else if (isAssumedWriteOnly())
+ STATS_DECLTRACK_CS_ATTR(writeonly)
+ }
+};
+
+ChangeStatus AAMemoryBehaviorFunction::updateImpl(Attributor &A) {
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = getAssumed();
+
+ auto CheckRWInst = [&](Instruction &I) {
+    // If the instruction has its own memory behavior state, use it to restrict
+    // the local state. No further analysis is required as the other memory
+    // state is as optimistic as it gets.
+ if (const auto *CB = dyn_cast<CallBase>(&I)) {
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, IRPosition::callsite_function(*CB));
+ intersectAssumedBits(MemBehaviorAA.getAssumed());
+ return !isAtFixpoint();
+ }
+
+ // Remove access kind modifiers if necessary.
+ if (I.mayReadFromMemory())
+ removeAssumedBits(NO_READS);
+ if (I.mayWriteToMemory())
+ removeAssumedBits(NO_WRITES);
+ return !isAtFixpoint();
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ return indicatePessimisticFixpoint();
+
+ return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+}
+
+ChangeStatus AAMemoryBehaviorFloating::updateImpl(Attributor &A) {
+
+ const IRPosition &IRP = getIRPosition();
+ const IRPosition &FnPos = IRPosition::function_scope(IRP);
+ AAMemoryBehavior::StateType &S = getState();
+
+  // First, check the function scope. We take the known information and we
+  // avoid work if the assumed information implies the current assumed
+  // information for this attribute. This is valid for all but byval arguments.
+ Argument *Arg = IRP.getAssociatedArgument();
+ AAMemoryBehavior::base_t FnMemAssumedState =
+ AAMemoryBehavior::StateType::getWorstState();
+ if (!Arg || !Arg->hasByValAttr()) {
+ const auto &FnMemAA = A.getAAFor<AAMemoryBehavior>(
+ *this, FnPos, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ FnMemAssumedState = FnMemAA.getAssumed();
+ S.addKnownBits(FnMemAA.getKnown());
+ if ((S.getAssumed() & FnMemAA.getAssumed()) == S.getAssumed())
+ return ChangeStatus::UNCHANGED;
+ }
+
+  // Make sure the value is not captured (except through "return"); if it is,
+  // any information derived would be irrelevant anyway as we cannot check the
+  // potential aliases introduced by the capture. However, there is no need to
+  // fall back to anything less optimistic than the function state.
+ const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ if (!ArgNoCaptureAA.isAssumedNoCaptureMaybeReturned()) {
+ S.intersectAssumedBits(FnMemAssumedState);
+ return ChangeStatus::CHANGED;
+ }
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = S.getAssumed();
+
+ // Liveness information to exclude dead users.
+ // TODO: Take the FnPos once we have call site specific liveness information.
+ const auto &LivenessAA = A.getAAFor<AAIsDead>(
+ *this, IRPosition::function(*IRP.getAssociatedFunction()),
+ /* TrackDependence */ false);
+
+ // Visit and expand uses until all are analyzed or a fixpoint is reached.
+ for (unsigned i = 0; i < Uses.size() && !isAtFixpoint(); i++) {
+ const Use *U = Uses[i];
+ Instruction *UserI = cast<Instruction>(U->getUser());
+ LLVM_DEBUG(dbgs() << "[AAMemoryBehavior] Use: " << **U << " in " << *UserI
+ << " [Dead: " << (A.isAssumedDead(*U, this, &LivenessAA))
+ << "]\n");
+ if (A.isAssumedDead(*U, this, &LivenessAA))
+ continue;
+
+    // Droppable users, e.g., llvm::assume, do not actually perform any action.
+ if (UserI->isDroppable())
+ continue;
+
+ // Check if the users of UserI should also be visited.
+ if (followUsersOfUseIn(A, U, UserI))
+ for (const Use &UserIUse : UserI->uses())
+ Uses.insert(&UserIUse);
+
+ // If UserI might touch memory we analyze the use in detail.
+ if (UserI->mayReadOrWriteMemory())
+ analyzeUseIn(A, U, UserI);
+ }
+
+ return (AssumedState != getAssumed()) ? ChangeStatus::CHANGED
+ : ChangeStatus::UNCHANGED;
+}
+
+bool AAMemoryBehaviorFloating::followUsersOfUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI) {
+ // The loaded value is unrelated to the pointer argument, no need to
+ // follow the users of the load.
+ if (isa<LoadInst>(UserI))
+ return false;
+
+ // By default we follow all uses assuming UserI might leak information on U,
+ // we have special handling for call sites operands though.
+ const auto *CB = dyn_cast<CallBase>(UserI);
+ if (!CB || !CB->isArgOperand(U))
+ return true;
+
+ // If the use is a call argument known not to be captured, the users of
+ // the call do not need to be visited because they have to be unrelated to
+ // the input. Note that this check is not trivial even though we disallow
+ // general capturing of the underlying argument. The reason is that the
+  // call might capture the argument "through return", which we allow and for
+  // which we need to check call users.
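+  // For example, given `%r = call i8* @passthrough(i8* %p)` (hypothetical),
+  // %p may be captured "through return"; uses of %r can then reach the memory
+  // behind %p, so the users of the call still have to be visited.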
+ if (U->get()->getType()->isPointerTy()) {
+ unsigned ArgNo = CB->getArgOperandNo(U);
+ const auto &ArgNoCaptureAA = A.getAAFor<AANoCapture>(
+ *this, IRPosition::callsite_argument(*CB, ArgNo),
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ return !ArgNoCaptureAA.isAssumedNoCapture();
+ }
+
+ return true;
+}
+
+void AAMemoryBehaviorFloating::analyzeUseIn(Attributor &A, const Use *U,
+ const Instruction *UserI) {
+ assert(UserI->mayReadOrWriteMemory());
+
+ switch (UserI->getOpcode()) {
+ default:
+ // TODO: Handle all atomics and other side-effect operations we know of.
+ break;
+ case Instruction::Load:
+ // Loads cause the NO_READS property to disappear.
+ removeAssumedBits(NO_READS);
+ return;
+
+ case Instruction::Store:
+ // Stores cause the NO_WRITES property to disappear if the use is the
+ // pointer operand. Note that we do assume that capturing was taken care of
+ // somewhere else.
+ if (cast<StoreInst>(UserI)->getPointerOperand() == U->get())
+ removeAssumedBits(NO_WRITES);
+ return;
+
+ case Instruction::Call:
+ case Instruction::CallBr:
+ case Instruction::Invoke: {
+ // For call sites we look at the argument memory behavior attribute (this
+ // could be recursive!) in order to restrict our own state.
+ const auto *CB = cast<CallBase>(UserI);
+
+ // Give up on operand bundles.
+ if (CB->isBundleOperand(U)) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+
+ // Calling a function does read the function pointer, maybe write it if the
+ // function is self-modifying.
+ if (CB->isCallee(U)) {
+ removeAssumedBits(NO_READS);
+ break;
+ }
+
+ // Adjust the possible access behavior based on the information on the
+ // argument.
+ IRPosition Pos;
+ if (U->get()->getType()->isPointerTy())
+ Pos = IRPosition::callsite_argument(*CB, CB->getArgOperandNo(U));
+ else
+ Pos = IRPosition::callsite_function(*CB);
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, Pos,
+ /* TrackDependence */ true, DepClassTy::OPTIONAL);
+ // "assumed" has at most the same bits as the MemBehaviorAA assumed
+ // and at least "known".
+ intersectAssumedBits(MemBehaviorAA.getAssumed());
+ return;
+ }
+ };
+
+ // Generally, look at the "may-properties" and adjust the assumed state if we
+ // did not trigger special handling before.
+ if (UserI->mayReadFromMemory())
+ removeAssumedBits(NO_READS);
+ if (UserI->mayWriteToMemory())
+ removeAssumedBits(NO_WRITES);
+}
+
+} // namespace
+
+/// -------------------- Memory Locations Attributes ---------------------------
+/// Includes read-none, argmemonly, inaccessiblememonly,
+/// inaccessiblememorargmemonly
+/// ----------------------------------------------------------------------------
+
+std::string AAMemoryLocation::getMemoryLocationsAsStr(
+ AAMemoryLocation::MemoryLocationsKind MLK) {
+ if (0 == (MLK & AAMemoryLocation::NO_LOCATIONS))
+ return "all memory";
+ if (MLK == AAMemoryLocation::NO_LOCATIONS)
+ return "no memory";
+ std::string S = "memory:";
+ if (0 == (MLK & AAMemoryLocation::NO_LOCAL_MEM))
+ S += "stack,";
+ if (0 == (MLK & AAMemoryLocation::NO_CONST_MEM))
+ S += "constant,";
+ if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_INTERNAL_MEM))
+ S += "internal global,";
+ if (0 == (MLK & AAMemoryLocation::NO_GLOBAL_EXTERNAL_MEM))
+ S += "external global,";
+ if (0 == (MLK & AAMemoryLocation::NO_ARGUMENT_MEM))
+ S += "argument,";
+ if (0 == (MLK & AAMemoryLocation::NO_INACCESSIBLE_MEM))
+ S += "inaccessible,";
+ if (0 == (MLK & AAMemoryLocation::NO_MALLOCED_MEM))
+ S += "malloced,";
+ if (0 == (MLK & AAMemoryLocation::NO_UNKOWN_MEM))
+ S += "unknown,";
+ S.pop_back();
+ return S;
+}
+
+namespace {
+struct AAMemoryLocationImpl : public AAMemoryLocation {
+
+ AAMemoryLocationImpl(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocation(IRP, A), Allocator(A.Allocator) {
+ for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
+ AccessKind2Accesses[u] = nullptr;
+ }
+
+ ~AAMemoryLocationImpl() {
+ // The AccessSets are allocated via a BumpPtrAllocator, we call
+ // the destructor manually.
+ for (unsigned u = 0; u < llvm::CTLog2<VALID_STATE>(); ++u)
+ if (AccessKind2Accesses[u])
+ AccessKind2Accesses[u]->~AccessSet();
+ }
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ intersectAssumedBits(BEST_STATE);
+ getKnownStateFromValue(A, getIRPosition(), getState());
+ IRAttribute::initialize(A);
+ }
+
+ /// Return the memory behavior information encoded in the IR for \p IRP.
+ static void getKnownStateFromValue(Attributor &A, const IRPosition &IRP,
+ BitIntegerState &State,
+ bool IgnoreSubsumingPositions = false) {
+ // For internal functions we ignore `argmemonly` and
+ // `inaccessiblememorargmemonly` as we might break it via interprocedural
+ // constant propagation. It is unclear if this is the best way but it is
+ // unlikely this will cause real performance problems. If we are deriving
+ // attributes for the anchor function we even remove the attribute in
+ // addition to ignoring it.
+ bool UseArgMemOnly = true;
+ Function *AnchorFn = IRP.getAnchorScope();
+ if (AnchorFn && A.isRunOn(*AnchorFn))
+ UseArgMemOnly = !AnchorFn->hasLocalLinkage();
+
+ SmallVector<Attribute, 2> Attrs;
+ IRP.getAttrs(AttrKinds, Attrs, IgnoreSubsumingPositions);
+ for (const Attribute &Attr : Attrs) {
+ switch (Attr.getKindAsEnum()) {
+ case Attribute::ReadNone:
+ State.addKnownBits(NO_LOCAL_MEM | NO_CONST_MEM);
+ break;
+ case Attribute::InaccessibleMemOnly:
+ State.addKnownBits(inverseLocation(NO_INACCESSIBLE_MEM, true, true));
+ break;
+ case Attribute::ArgMemOnly:
+ if (UseArgMemOnly)
+ State.addKnownBits(inverseLocation(NO_ARGUMENT_MEM, true, true));
+ else
+ IRP.removeAttrs({Attribute::ArgMemOnly});
+ break;
+ case Attribute::InaccessibleMemOrArgMemOnly:
+ if (UseArgMemOnly)
+ State.addKnownBits(inverseLocation(
+ NO_INACCESSIBLE_MEM | NO_ARGUMENT_MEM, true, true));
+ else
+ IRP.removeAttrs({Attribute::InaccessibleMemOrArgMemOnly});
+ break;
+ default:
+ llvm_unreachable("Unexpected attribute!");
+ }
+ }
+ }
+
+ /// See AbstractAttribute::getDeducedAttributes(...).
+ void getDeducedAttributes(LLVMContext &Ctx,
+ SmallVectorImpl<Attribute> &Attrs) const override {
+ assert(Attrs.size() == 0);
+ if (isAssumedReadNone()) {
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ReadNone));
+ } else if (getIRPosition().getPositionKind() == IRPosition::IRP_FUNCTION) {
+ if (isAssumedInaccessibleMemOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::InaccessibleMemOnly));
+ else if (isAssumedArgMemOnly())
+ Attrs.push_back(Attribute::get(Ctx, Attribute::ArgMemOnly));
+ else if (isAssumedInaccessibleOrArgMemOnly())
+ Attrs.push_back(
+ Attribute::get(Ctx, Attribute::InaccessibleMemOrArgMemOnly));
+ }
+ assert(Attrs.size() <= 1);
+ }
+
+ /// See AbstractAttribute::manifest(...).
+ ChangeStatus manifest(Attributor &A) override {
+ const IRPosition &IRP = getIRPosition();
+
+ // Check if we would improve the existing attributes first.
+ SmallVector<Attribute, 4> DeducedAttrs;
+ getDeducedAttributes(IRP.getAnchorValue().getContext(), DeducedAttrs);
+ if (llvm::all_of(DeducedAttrs, [&](const Attribute &Attr) {
+ return IRP.hasAttr(Attr.getKindAsEnum(),
+ /* IgnoreSubsumingPositions */ true);
+ }))
+ return ChangeStatus::UNCHANGED;
+
+ // Clear existing attributes.
+ IRP.removeAttrs(AttrKinds);
+ if (isAssumedReadNone())
+ IRP.removeAttrs(AAMemoryBehaviorImpl::AttrKinds);
+
+ // Use the generic manifest method.
+ return IRAttribute::manifest(A);
+ }
+
+ /// See AAMemoryLocation::checkForAllAccessesToMemoryKind(...).
+ bool checkForAllAccessesToMemoryKind(
+ function_ref<bool(const Instruction *, const Value *, AccessKind,
+ MemoryLocationsKind)>
+ Pred,
+ MemoryLocationsKind RequestedMLK) const override {
+ if (!isValidState())
+ return false;
+
+ MemoryLocationsKind AssumedMLK = getAssumedNotAccessedLocation();
+ if (AssumedMLK == NO_LOCATIONS)
+ return true;
+
+ unsigned Idx = 0;
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS;
+ CurMLK *= 2, ++Idx) {
+ if (CurMLK & RequestedMLK)
+ continue;
+
+ if (const AccessSet *Accesses = AccessKind2Accesses[Idx])
+ for (const AccessInfo &AI : *Accesses)
+ if (!Pred(AI.I, AI.Ptr, AI.Kind, CurMLK))
+ return false;
+ }
+
+ return true;
+ }
+
+ ChangeStatus indicatePessimisticFixpoint() override {
+ // If we give up and indicate a pessimistic fixpoint this instruction will
+ // become an access for all potential access kinds:
+ // TODO: Add pointers for argmemonly and globals to improve the results of
+ // checkForAllAccessesToMemoryKind.
+ bool Changed = false;
+ MemoryLocationsKind KnownMLK = getKnown();
+ Instruction *I = dyn_cast<Instruction>(&getAssociatedValue());
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2)
+ if (!(CurMLK & KnownMLK))
+ updateStateAndAccessesMap(getState(), CurMLK, I, nullptr, Changed,
+ getAccessKindFromInst(I));
+ return AAMemoryLocation::indicatePessimisticFixpoint();
+ }
+
+protected:
+ /// Helper struct to tie together an instruction that has a read or write
+ /// effect with the pointer it accesses (if any).
+ struct AccessInfo {
+
+ /// The instruction that caused the access.
+ const Instruction *I;
+
+ /// The base pointer that is accessed, or null if unknown.
+ const Value *Ptr;
+
+ /// The kind of access (read/write/read+write).
+ AccessKind Kind;
+
+ bool operator==(const AccessInfo &RHS) const {
+ return I == RHS.I && Ptr == RHS.Ptr && Kind == RHS.Kind;
+ }
+ bool operator()(const AccessInfo &LHS, const AccessInfo &RHS) const {
+ if (LHS.I != RHS.I)
+ return LHS.I < RHS.I;
+ if (LHS.Ptr != RHS.Ptr)
+ return LHS.Ptr < RHS.Ptr;
+ if (LHS.Kind != RHS.Kind)
+ return LHS.Kind < RHS.Kind;
+ return false;
+ }
+ };
+
+ /// Mapping from *single* memory location kinds, e.g., LOCAL_MEM with the
+ /// value of NO_LOCAL_MEM, to the accesses encountered for this memory kind.
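+  ///
+  /// The array is indexed by the log2 of the (power-of-two) location bit, so
+  /// e.g. accesses to stack memory end up in the slot at Log2(NO_LOCAL_MEM);
+  /// see updateStateAndAccessesMap below.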
+ using AccessSet = SmallSet<AccessInfo, 2, AccessInfo>;
+ AccessSet *AccessKind2Accesses[llvm::CTLog2<VALID_STATE>()];
+
+  /// Return the kind(s) of location that may be accessed by \p I.
+ AAMemoryLocation::MemoryLocationsKind
+ categorizeAccessedLocations(Attributor &A, Instruction &I, bool &Changed);
+
+ /// Return the access kind as determined by \p I.
+ AccessKind getAccessKindFromInst(const Instruction *I) {
+ AccessKind AK = READ_WRITE;
+ if (I) {
+ AK = I->mayReadFromMemory() ? READ : NONE;
+ AK = AccessKind(AK | (I->mayWriteToMemory() ? WRITE : NONE));
+ }
+ return AK;
+ }
+
+ /// Update the state \p State and the AccessKind2Accesses given that \p I is
+ /// an access of kind \p AK to a \p MLK memory location with the access
+ /// pointer \p Ptr.
+ void updateStateAndAccessesMap(AAMemoryLocation::StateType &State,
+ MemoryLocationsKind MLK, const Instruction *I,
+ const Value *Ptr, bool &Changed,
+ AccessKind AK = READ_WRITE) {
+
+ assert(isPowerOf2_32(MLK) && "Expected a single location set!");
+ auto *&Accesses = AccessKind2Accesses[llvm::Log2_32(MLK)];
+ if (!Accesses)
+ Accesses = new (Allocator) AccessSet();
+ Changed |= Accesses->insert(AccessInfo{I, Ptr, AK}).second;
+ State.removeAssumedBits(MLK);
+ }
+
+  /// Determine the underlying location kinds for \p Ptr, e.g., globals or
+  /// arguments, and update the state and access map accordingly.
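+  ///
+  /// For example, a pointer traced back to an alloca or a byval argument is
+  /// recorded as an access to local (stack) memory, a pointer into an internal
+  /// global as internal-global memory, and anything that cannot be classified
+  /// as unknown memory.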
+ void categorizePtrValue(Attributor &A, const Instruction &I, const Value &Ptr,
+ AAMemoryLocation::StateType &State, bool &Changed);
+
+ /// Used to allocate access sets.
+ BumpPtrAllocator &Allocator;
+
+ /// The set of IR attributes AAMemoryLocation deals with.
+ static const Attribute::AttrKind AttrKinds[4];
+};
+
+const Attribute::AttrKind AAMemoryLocationImpl::AttrKinds[] = {
+ Attribute::ReadNone, Attribute::InaccessibleMemOnly, Attribute::ArgMemOnly,
+ Attribute::InaccessibleMemOrArgMemOnly};
+
+void AAMemoryLocationImpl::categorizePtrValue(
+ Attributor &A, const Instruction &I, const Value &Ptr,
+ AAMemoryLocation::StateType &State, bool &Changed) {
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize pointer locations for "
+ << Ptr << " ["
+ << getMemoryLocationsAsStr(State.getAssumed()) << "]\n");
+
+ auto StripGEPCB = [](Value *V) -> Value * {
+ auto *GEP = dyn_cast<GEPOperator>(V);
+ while (GEP) {
+ V = GEP->getPointerOperand();
+ GEP = dyn_cast<GEPOperator>(V);
+ }
+ return V;
+ };
+
+ auto VisitValueCB = [&](Value &V, const Instruction *,
+ AAMemoryLocation::StateType &T,
+ bool Stripped) -> bool {
+ MemoryLocationsKind MLK = NO_LOCATIONS;
+ assert(!isa<GEPOperator>(V) && "GEPs should have been stripped.");
+ if (isa<UndefValue>(V))
+ return true;
+ if (auto *Arg = dyn_cast<Argument>(&V)) {
+ if (Arg->hasByValAttr())
+ MLK = NO_LOCAL_MEM;
+ else
+ MLK = NO_ARGUMENT_MEM;
+ } else if (auto *GV = dyn_cast<GlobalValue>(&V)) {
+ if (GV->hasLocalLinkage())
+ MLK = NO_GLOBAL_INTERNAL_MEM;
+ else
+ MLK = NO_GLOBAL_EXTERNAL_MEM;
+ } else if (isa<ConstantPointerNull>(V) &&
+ !NullPointerIsDefined(getAssociatedFunction(),
+ V.getType()->getPointerAddressSpace())) {
+ return true;
+ } else if (isa<AllocaInst>(V)) {
+ MLK = NO_LOCAL_MEM;
+ } else if (const auto *CB = dyn_cast<CallBase>(&V)) {
+ const auto &NoAliasAA =
+ A.getAAFor<AANoAlias>(*this, IRPosition::callsite_returned(*CB));
+ if (NoAliasAA.isAssumedNoAlias())
+ MLK = NO_MALLOCED_MEM;
+ else
+ MLK = NO_UNKOWN_MEM;
+ } else {
+ MLK = NO_UNKOWN_MEM;
+ }
+
+ assert(MLK != NO_LOCATIONS && "No location specified!");
+ updateStateAndAccessesMap(T, MLK, &I, &V, Changed,
+ getAccessKindFromInst(&I));
+    LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Ptr value categorized: " << V
+                      << " -> " << getMemoryLocationsAsStr(T.getAssumed())
+                      << "\n");
+ return true;
+ };
+
+ if (!genericValueTraversal<AAMemoryLocation, AAMemoryLocation::StateType>(
+ A, IRPosition::value(Ptr), *this, State, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ true,
+ /* MaxValues */ 32, StripGEPCB)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Pointer locations not categorized\n");
+ updateStateAndAccessesMap(State, NO_UNKOWN_MEM, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ } else {
+ LLVM_DEBUG(
+ dbgs()
+ << "[AAMemoryLocation] Accessed locations with pointer locations: "
+ << getMemoryLocationsAsStr(State.getAssumed()) << "\n");
+ }
+}
+
+AAMemoryLocation::MemoryLocationsKind
+AAMemoryLocationImpl::categorizeAccessedLocations(Attributor &A, Instruction &I,
+ bool &Changed) {
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize accessed locations for "
+ << I << "\n");
+
+ AAMemoryLocation::StateType AccessedLocs;
+ AccessedLocs.intersectAssumedBits(NO_LOCATIONS);
+
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+
+    // First check if we assume any accessed memory is visible.
+ const auto &CBMemLocationAA =
+ A.getAAFor<AAMemoryLocation>(*this, IRPosition::callsite_function(*CB));
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Categorize call site: " << I
+ << " [" << CBMemLocationAA << "]\n");
+
+ if (CBMemLocationAA.isAssumedReadNone())
+ return NO_LOCATIONS;
+
+ if (CBMemLocationAA.isAssumedInaccessibleMemOnly()) {
+ updateStateAndAccessesMap(AccessedLocs, NO_INACCESSIBLE_MEM, &I, nullptr,
+ Changed, getAccessKindFromInst(&I));
+ return AccessedLocs.getAssumed();
+ }
+
+ uint32_t CBAssumedNotAccessedLocs =
+ CBMemLocationAA.getAssumedNotAccessedLocation();
+
+ // Set the argmemonly and global bit as we handle them separately below.
+ uint32_t CBAssumedNotAccessedLocsNoArgMem =
+ CBAssumedNotAccessedLocs | NO_ARGUMENT_MEM | NO_GLOBAL_MEM;
+
+ for (MemoryLocationsKind CurMLK = 1; CurMLK < NO_LOCATIONS; CurMLK *= 2) {
+ if (CBAssumedNotAccessedLocsNoArgMem & CurMLK)
+ continue;
+ updateStateAndAccessesMap(AccessedLocs, CurMLK, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ }
+
+ // Now handle global memory if it might be accessed. This is slightly tricky
+ // as NO_GLOBAL_MEM has multiple bits set.
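+    // (NO_GLOBAL_MEM covers both NO_GLOBAL_INTERNAL_MEM and
+    // NO_GLOBAL_EXTERNAL_MEM, so it cannot be handled by the single-bit loop
+    // above.)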
+ bool HasGlobalAccesses = ((~CBAssumedNotAccessedLocs) & NO_GLOBAL_MEM);
+ if (HasGlobalAccesses) {
+ auto AccessPred = [&](const Instruction *, const Value *Ptr,
+ AccessKind Kind, MemoryLocationsKind MLK) {
+ updateStateAndAccessesMap(AccessedLocs, MLK, &I, Ptr, Changed,
+ getAccessKindFromInst(&I));
+ return true;
+ };
+ if (!CBMemLocationAA.checkForAllAccessesToMemoryKind(
+ AccessPred, inverseLocation(NO_GLOBAL_MEM, false, false)))
+ return AccessedLocs.getWorstState();
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Accessed state before argument handling: "
+ << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
+
+ // Now handle argument memory if it might be accessed.
+ bool HasArgAccesses = ((~CBAssumedNotAccessedLocs) & NO_ARGUMENT_MEM);
+ if (HasArgAccesses) {
+ for (unsigned ArgNo = 0, E = CB->getNumArgOperands(); ArgNo < E;
+ ++ArgNo) {
+
+ // Skip non-pointer arguments.
+ const Value *ArgOp = CB->getArgOperand(ArgNo);
+ if (!ArgOp->getType()->isPtrOrPtrVectorTy())
+ continue;
+
+ // Skip readnone arguments.
+ const IRPosition &ArgOpIRP = IRPosition::callsite_argument(*CB, ArgNo);
+ const auto &ArgOpMemLocationAA = A.getAAFor<AAMemoryBehavior>(
+ *this, ArgOpIRP, /* TrackDependence */ true, DepClassTy::OPTIONAL);
+
+ if (ArgOpMemLocationAA.isAssumedReadNone())
+ continue;
+
+ // Categorize potentially accessed pointer arguments as if there was an
+ // access instruction with them as pointer.
+ categorizePtrValue(A, I, *ArgOp, AccessedLocs, Changed);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Accessed state after argument handling: "
+ << getMemoryLocationsAsStr(AccessedLocs.getAssumed()) << "\n");
+
+ return AccessedLocs.getAssumed();
+ }
+
+ if (const Value *Ptr = getPointerOperand(&I, /* AllowVolatile */ true)) {
+ LLVM_DEBUG(
+ dbgs() << "[AAMemoryLocation] Categorize memory access with pointer: "
+ << I << " [" << *Ptr << "]\n");
+ categorizePtrValue(A, I, *Ptr, AccessedLocs, Changed);
+ return AccessedLocs.getAssumed();
+ }
+
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Failed to categorize instruction: "
+ << I << "\n");
+ updateStateAndAccessesMap(AccessedLocs, NO_UNKOWN_MEM, &I, nullptr, Changed,
+ getAccessKindFromInst(&I));
+ return AccessedLocs.getAssumed();
+}
+
+/// An AA to represent the memory behavior function attributes.
+struct AAMemoryLocationFunction final : public AAMemoryLocationImpl {
+ AAMemoryLocationFunction(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocationImpl(IRP, A) {}
+
+ /// See AbstractAttribute::updateImpl(Attributor &A).
+ virtual ChangeStatus updateImpl(Attributor &A) override {
+
+ const auto &MemBehaviorAA = A.getAAFor<AAMemoryBehavior>(
+ *this, getIRPosition(), /* TrackDependence */ false);
+ if (MemBehaviorAA.isAssumedReadNone()) {
+ if (MemBehaviorAA.isKnownReadNone())
+ return indicateOptimisticFixpoint();
+ assert(isAssumedReadNone() &&
+ "AAMemoryLocation was not read-none but AAMemoryBehavior was!");
+ A.recordDependence(MemBehaviorAA, *this, DepClassTy::OPTIONAL);
+ return ChangeStatus::UNCHANGED;
+ }
+
+ // The current assumed state used to determine a change.
+ auto AssumedState = getAssumed();
+ bool Changed = false;
+
+ auto CheckRWInst = [&](Instruction &I) {
+ MemoryLocationsKind MLK = categorizeAccessedLocations(A, I, Changed);
+ LLVM_DEBUG(dbgs() << "[AAMemoryLocation] Accessed locations for " << I
+ << ": " << getMemoryLocationsAsStr(MLK) << "\n");
+ removeAssumedBits(inverseLocation(MLK, false, false));
+ return true;
+ };
+
+ if (!A.checkForAllReadWriteInstructions(CheckRWInst, *this))
+ return indicatePessimisticFixpoint();
+
+ Changed |= AssumedState != getAssumed();
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_FN_ATTR(readnone)
+ else if (isAssumedArgMemOnly())
+ STATS_DECLTRACK_FN_ATTR(argmemonly)
+ else if (isAssumedInaccessibleMemOnly())
+ STATS_DECLTRACK_FN_ATTR(inaccessiblememonly)
+ else if (isAssumedInaccessibleOrArgMemOnly())
+ STATS_DECLTRACK_FN_ATTR(inaccessiblememorargmemonly)
+ }
+};
+
+/// AAMemoryLocation attribute for call sites.
+struct AAMemoryLocationCallSite final : AAMemoryLocationImpl {
+ AAMemoryLocationCallSite(const IRPosition &IRP, Attributor &A)
+ : AAMemoryLocationImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAMemoryLocationImpl::initialize(A);
+ Function *F = getAssociatedFunction();
+ if (!F || !A.isFunctionIPOAmendable(*F)) {
+ indicatePessimisticFixpoint();
+ return;
+ }
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+    // TODO: Once we have call site specific value information we can provide
+    // call site specific liveness information and then it makes sense to
+    // specialize attributes for call site arguments instead of redirecting
+    // requests to the callee argument.
+ Function *F = getAssociatedFunction();
+ const IRPosition &FnPos = IRPosition::function(*F);
+ auto &FnAA = A.getAAFor<AAMemoryLocation>(*this, FnPos);
+ bool Changed = false;
+ auto AccessPred = [&](const Instruction *I, const Value *Ptr,
+ AccessKind Kind, MemoryLocationsKind MLK) {
+ updateStateAndAccessesMap(getState(), MLK, I, Ptr, Changed,
+ getAccessKindFromInst(I));
+ return true;
+ };
+ if (!FnAA.checkForAllAccessesToMemoryKind(AccessPred, ALL_LOCATIONS))
+ return indicatePessimisticFixpoint();
+ return Changed ? ChangeStatus::CHANGED : ChangeStatus::UNCHANGED;
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ if (isAssumedReadNone())
+ STATS_DECLTRACK_CS_ATTR(readnone)
+ }
+};
+
+/// ------------------ Value Constant Range Attribute -------------------------
+
+struct AAValueConstantRangeImpl : AAValueConstantRange {
+ using StateType = IntegerRangeState;
+ AAValueConstantRangeImpl(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRange(IRP, A) {}
+
+ /// See AbstractAttribute::getAsStr().
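+  ///
+  /// Produces strings such as `range(32)<full-set / [0,10)>`, i.e., the bit
+  /// width followed by the known and the assumed constant range.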
+ const std::string getAsStr() const override {
+ std::string Str;
+ llvm::raw_string_ostream OS(Str);
+ OS << "range(" << getBitWidth() << ")<";
+ getKnown().print(OS);
+ OS << " / ";
+ getAssumed().print(OS);
+ OS << ">";
+ return OS.str();
+ }
+
+ /// Helper function to get a SCEV expr for the associated value at program
+ /// point \p I.
+ const SCEV *getSCEV(Attributor &A, const Instruction *I = nullptr) const {
+ if (!getAnchorScope())
+ return nullptr;
+
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
+ *getAnchorScope());
+
+ LoopInfo *LI = A.getInfoCache().getAnalysisResultForFunction<LoopAnalysis>(
+ *getAnchorScope());
+
+ if (!SE || !LI)
+ return nullptr;
+
+ const SCEV *S = SE->getSCEV(&getAssociatedValue());
+ if (!I)
+ return S;
+
+ return SE->getSCEVAtScope(S, LI->getLoopFor(I->getParent()));
+ }
+
+ /// Helper function to get a range from SCEV for the associated value at
+ /// program point \p I.
+ ConstantRange getConstantRangeFromSCEV(Attributor &A,
+ const Instruction *I = nullptr) const {
+ if (!getAnchorScope())
+ return getWorstState(getBitWidth());
+
+ ScalarEvolution *SE =
+ A.getInfoCache().getAnalysisResultForFunction<ScalarEvolutionAnalysis>(
+ *getAnchorScope());
+
+ const SCEV *S = getSCEV(A, I);
+ if (!SE || !S)
+ return getWorstState(getBitWidth());
+
+ return SE->getUnsignedRange(S);
+ }
+
+ /// Helper function to get a range from LVI for the associated value at
+ /// program point \p I.
+ ConstantRange
+ getConstantRangeFromLVI(Attributor &A,
+ const Instruction *CtxI = nullptr) const {
+ if (!getAnchorScope())
+ return getWorstState(getBitWidth());
+
+ LazyValueInfo *LVI =
+ A.getInfoCache().getAnalysisResultForFunction<LazyValueAnalysis>(
+ *getAnchorScope());
+
+ if (!LVI || !CtxI)
+ return getWorstState(getBitWidth());
+ return LVI->getConstantRange(&getAssociatedValue(),
+ const_cast<BasicBlock *>(CtxI->getParent()),
+ const_cast<Instruction *>(CtxI));
+ }
+
+ /// See AAValueConstantRange::getKnownConstantRange(..).
+ ConstantRange
+ getKnownConstantRange(Attributor &A,
+ const Instruction *CtxI = nullptr) const override {
+ if (!CtxI || CtxI == getCtxI())
+ return getKnown();
+
+ ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
+ ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
+ return getKnown().intersectWith(SCEVR).intersectWith(LVIR);
+ }
+
+ /// See AAValueConstantRange::getAssumedConstantRange(..).
+ ConstantRange
+ getAssumedConstantRange(Attributor &A,
+ const Instruction *CtxI = nullptr) const override {
+    // TODO: Make SCEV use Attributor assumptions.
+    //       We may be able to bound a variable range via assumptions in
+    //       Attributor, e.g., if x is assumed to be in [1, 3] and y is known
+    //       to evolve to x^2 + x, then we can say that y is in [2, 12].
+
+ if (!CtxI || CtxI == getCtxI())
+ return getAssumed();
+
+ ConstantRange LVIR = getConstantRangeFromLVI(A, CtxI);
+ ConstantRange SCEVR = getConstantRangeFromSCEV(A, CtxI);
+ return getAssumed().intersectWith(SCEVR).intersectWith(LVIR);
+ }
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ // Intersect a range given by SCEV.
+ intersectKnown(getConstantRangeFromSCEV(A, getCtxI()));
+
+ // Intersect a range given by LVI.
+ intersectKnown(getConstantRangeFromLVI(A, getCtxI()));
+ }
+
+ /// Helper function to create MDNode for range metadata.
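+  /// The metadata encodes a half-open [Lower, Upper) pair, e.g., an assumed
+  /// range of [0, 10) on an i32 value becomes `!range !{i32 0, i32 10}`.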
+ static MDNode *
+ getMDNodeForConstantRange(Type *Ty, LLVMContext &Ctx,
+ const ConstantRange &AssumedConstantRange) {
+ Metadata *LowAndHigh[] = {ConstantAsMetadata::get(ConstantInt::get(
+ Ty, AssumedConstantRange.getLower())),
+ ConstantAsMetadata::get(ConstantInt::get(
+ Ty, AssumedConstantRange.getUpper()))};
+ return MDNode::get(Ctx, LowAndHigh);
+ }
+
+  /// Return true if \p Assumed is a strictly better range than the one encoded
+  /// in \p KnownRanges (or if no range is known yet).
+ static bool isBetterRange(const ConstantRange &Assumed, MDNode *KnownRanges) {
+
+ if (Assumed.isFullSet())
+ return false;
+
+ if (!KnownRanges)
+ return true;
+
+    // If multiple ranges are annotated in the IR, we give up on annotating the
+    // assumed range for now.
+
+    // TODO: If there exists a known range which contains the assumed range, we
+    // can say the assumed range is better.
+ if (KnownRanges->getNumOperands() > 2)
+ return false;
+
+ ConstantInt *Lower =
+ mdconst::extract<ConstantInt>(KnownRanges->getOperand(0));
+ ConstantInt *Upper =
+ mdconst::extract<ConstantInt>(KnownRanges->getOperand(1));
+
+ ConstantRange Known(Lower->getValue(), Upper->getValue());
+ return Known.contains(Assumed) && Known != Assumed;
+ }
+
+ /// Helper function to set range metadata.
+ static bool
+ setRangeMetadataIfisBetterRange(Instruction *I,
+ const ConstantRange &AssumedConstantRange) {
+ auto *OldRangeMD = I->getMetadata(LLVMContext::MD_range);
+ if (isBetterRange(AssumedConstantRange, OldRangeMD)) {
+ if (!AssumedConstantRange.isEmptySet()) {
+ I->setMetadata(LLVMContext::MD_range,
+ getMDNodeForConstantRange(I->getType(), I->getContext(),
+ AssumedConstantRange));
+ return true;
+ }
+ }
+ return false;
+ }
+
+ /// See AbstractAttribute::manifest()
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+ ConstantRange AssumedConstantRange = getAssumedConstantRange(A);
+ assert(!AssumedConstantRange.isFullSet() && "Invalid state");
+
+ auto &V = getAssociatedValue();
+ if (!AssumedConstantRange.isEmptySet() &&
+ !AssumedConstantRange.isSingleElement()) {
+ if (Instruction *I = dyn_cast<Instruction>(&V))
+ if (isa<CallInst>(I) || isa<LoadInst>(I))
+ if (setRangeMetadataIfisBetterRange(I, AssumedConstantRange))
+ Changed = ChangeStatus::CHANGED;
+ }
+
+ return Changed;
+ }
+};
+
+struct AAValueConstantRangeArgument final
+ : AAArgumentFromCallSiteArguments<
+ AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState> {
+ using Base = AAArgumentFromCallSiteArguments<
+ AAValueConstantRange, AAValueConstantRangeImpl, IntegerRangeState>;
+ AAValueConstantRangeArgument(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(..).
+ void initialize(Attributor &A) override {
+ if (!getAnchorScope() || getAnchorScope()->isDeclaration()) {
+ indicatePessimisticFixpoint();
+ } else {
+ Base::initialize(A);
+ }
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_ARG_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeReturned
+ : AAReturnedFromReturnedValues<AAValueConstantRange,
+ AAValueConstantRangeImpl> {
+ using Base = AAReturnedFromReturnedValues<AAValueConstantRange,
+ AAValueConstantRangeImpl>;
+ AAValueConstantRangeReturned(const IRPosition &IRP, Attributor &A)
+ : Base(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FNRET_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeFloating : AAValueConstantRangeImpl {
+ AAValueConstantRangeFloating(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ AAValueConstantRangeImpl::initialize(A);
+ Value &V = getAssociatedValue();
+
+ if (auto *C = dyn_cast<ConstantInt>(&V)) {
+ unionAssumed(ConstantRange(C->getValue()));
+ indicateOptimisticFixpoint();
+ return;
+ }
+
+ if (isa<UndefValue>(&V)) {
+ // Collapse the undef state to 0.
+ unionAssumed(ConstantRange(APInt(getBitWidth(), 0)));
+ indicateOptimisticFixpoint();
+ return;
+ }
+
+ if (isa<BinaryOperator>(&V) || isa<CmpInst>(&V) || isa<CastInst>(&V))
+ return;
+ // If it is a load instruction with range metadata, use it.
+ if (LoadInst *LI = dyn_cast<LoadInst>(&V))
+ if (auto *RangeMD = LI->getMetadata(LLVMContext::MD_range)) {
+ intersectKnown(getConstantRangeFromMetadata(*RangeMD));
+ return;
+ }
+
+ // We can work with PHI and select instructions as we traverse their operands
+ // during the update.
+ if (isa<SelectInst>(V) || isa<PHINode>(V))
+ return;
+
+ // Otherwise we give up.
+ indicatePessimisticFixpoint();
+
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] We give up: "
+ << getAssociatedValue() << "\n");
+ }
+
+ bool calculateBinaryOperator(
+ Attributor &A, BinaryOperator *BinOp, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ Value *LHS = BinOp->getOperand(0);
+ Value *RHS = BinOp->getOperand(1);
+ // TODO: Allow non-integers as well.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return false;
+
+ auto &LHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
+ QuerriedAAs.push_back(&LHSAA);
+ auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
+
+ auto &RHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
+ QuerriedAAs.push_back(&RHSAA);
+ auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
+
+ auto AssumedRange = LHSAARange.binaryOp(BinOp->getOpcode(), RHSAARange);
+
+ T.unionAssumed(AssumedRange);
+
+ // TODO: Track a known state too.
+
+ return T.isValidState();
+ }
+
+ bool calculateCastInst(
+ Attributor &A, CastInst *CastI, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ assert(CastI->getNumOperands() == 1 && "Expected cast to be unary!");
+ // TODO: Allow non-integers as well.
+ Value &OpV = *CastI->getOperand(0);
+ if (!OpV.getType()->isIntegerTy())
+ return false;
+
+ auto &OpAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(OpV));
+ QuerriedAAs.push_back(&OpAA);
+ T.unionAssumed(
+ OpAA.getAssumed().castOp(CastI->getOpcode(), getState().getBitWidth()));
+ return T.isValidState();
+ }
+
+ bool
+ calculateCmpInst(Attributor &A, CmpInst *CmpI, IntegerRangeState &T,
+ const Instruction *CtxI,
+ SmallVectorImpl<const AAValueConstantRange *> &QuerriedAAs) {
+ Value *LHS = CmpI->getOperand(0);
+ Value *RHS = CmpI->getOperand(1);
+ // TODO: Allow non-integers as well.
+ if (!LHS->getType()->isIntegerTy() || !RHS->getType()->isIntegerTy())
+ return false;
+
+ auto &LHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*LHS));
+ QuerriedAAs.push_back(&LHSAA);
+ auto &RHSAA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(*RHS));
+ QuerriedAAs.push_back(&RHSAA);
+
+ auto LHSAARange = LHSAA.getAssumedConstantRange(A, CtxI);
+ auto RHSAARange = RHSAA.getAssumedConstantRange(A, CtxI);
+
+ // If one of them is an empty set, we can't decide.
+ if (LHSAARange.isEmptySet() || RHSAARange.isEmptySet())
+ return true;
+
+ bool MustTrue = false, MustFalse = false;
+
+ auto AllowedRegion =
+ ConstantRange::makeAllowedICmpRegion(CmpI->getPredicate(), RHSAARange);
+
+ auto SatisfyingRegion = ConstantRange::makeSatisfyingICmpRegion(
+ CmpI->getPredicate(), RHSAARange);
+
+ if (AllowedRegion.intersectWith(LHSAARange).isEmptySet())
+ MustFalse = true;
+
+ if (SatisfyingRegion.contains(LHSAARange))
+ MustTrue = true;
+
+ assert((!MustTrue || !MustFalse) &&
+ "Either MustTrue or MustFalse should be false!");
+
+ if (MustTrue)
+ T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 1)));
+ else if (MustFalse)
+ T.unionAssumed(ConstantRange(APInt(/* numBits */ 1, /* val */ 0)));
+ else
+ T.unionAssumed(ConstantRange(/* BitWidth */ 1, /* isFullSet */ true));
+
+ LLVM_DEBUG(dbgs() << "[AAValueConstantRange] " << *CmpI << " " << LHSAA
+ << " " << RHSAA << "\n");
+
+ // TODO: Track a known state too.
+ return T.isValidState();
+ }
+
+ /// See AbstractAttribute::updateImpl(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ auto VisitValueCB = [&](Value &V, const Instruction *CtxI,
+ IntegerRangeState &T, bool Stripped) -> bool {
+ Instruction *I = dyn_cast<Instruction>(&V);
+ if (!I || isa<CallBase>(I)) {
+
+ // If the value is not an instruction, we query the Attributor for the AA.
+ const auto &AA =
+ A.getAAFor<AAValueConstantRange>(*this, IRPosition::value(V));
+
+ // The clamp operator is not used so that the program point CtxI is utilized.
+ T.unionAssumed(AA.getAssumedConstantRange(A, CtxI));
+
+ return T.isValidState();
+ }
+
+ SmallVector<const AAValueConstantRange *, 4> QuerriedAAs;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(I)) {
+ if (!calculateBinaryOperator(A, BinOp, T, CtxI, QuerriedAAs))
+ return false;
+ } else if (auto *CmpI = dyn_cast<CmpInst>(I)) {
+ if (!calculateCmpInst(A, CmpI, T, CtxI, QuerriedAAs))
+ return false;
+ } else if (auto *CastI = dyn_cast<CastInst>(I)) {
+ if (!calculateCastInst(A, CastI, T, CtxI, QuerriedAAs))
+ return false;
+ } else {
+ // Give up on other instructions.
+ // TODO: Add support for other instructions.
+
+ T.indicatePessimisticFixpoint();
+ return false;
+ }
+
+ // Catch circular reasoning in a pessimistic way for now.
+ // TODO: Check how the range evolves and if we stripped anything, see also
+ // AADereferenceable or AAAlign for similar situations.
+ for (const AAValueConstantRange *QueriedAA : QuerriedAAs) {
+ if (QueriedAA != this)
+ continue;
+ // If we are in a steady state we do not need to worry.
+ if (T.getAssumed() == getState().getAssumed())
+ continue;
+ T.indicatePessimisticFixpoint();
+ }
+
+ return T.isValidState();
+ };
+
+ IntegerRangeState T(getBitWidth());
+
+ if (!genericValueTraversal<AAValueConstantRange, IntegerRangeState>(
+ A, getIRPosition(), *this, T, VisitValueCB, getCtxI(),
+ /* UseValueSimplify */ false))
+ return indicatePessimisticFixpoint();
+
+ return clampStateAndIndicateChange(getState(), T);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_FLOATING_ATTR(value_range)
+ }
+};
+
+struct AAValueConstantRangeFunction : AAValueConstantRangeImpl {
+ AAValueConstantRangeFunction(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeImpl(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ ChangeStatus updateImpl(Attributor &A) override {
+ llvm_unreachable("AAValueConstantRange(Function|CallSite)::updateImpl will "
+ "not be called");
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_FN_ATTR(value_range) }
+};
+
+struct AAValueConstantRangeCallSite : AAValueConstantRangeFunction {
+ AAValueConstantRangeCallSite(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeFunction(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override { STATS_DECLTRACK_CS_ATTR(value_range) }
+};
+
+struct AAValueConstantRangeCallSiteReturned
+ : AACallSiteReturnedFromReturned<AAValueConstantRange,
+ AAValueConstantRangeImpl> {
+ AAValueConstantRangeCallSiteReturned(const IRPosition &IRP, Attributor &A)
+ : AACallSiteReturnedFromReturned<AAValueConstantRange,
+ AAValueConstantRangeImpl>(IRP, A) {}
+
+ /// See AbstractAttribute::initialize(...).
+ void initialize(Attributor &A) override {
+ // If it is a call instruction with range metadata, use the metadata.
+ if (CallInst *CI = dyn_cast<CallInst>(&getAssociatedValue()))
+ if (auto *RangeMD = CI->getMetadata(LLVMContext::MD_range))
+ intersectKnown(getConstantRangeFromMetadata(*RangeMD));
+
+ AAValueConstantRangeImpl::initialize(A);
+ }
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSRET_ATTR(value_range)
+ }
+};
+struct AAValueConstantRangeCallSiteArgument : AAValueConstantRangeFloating {
+ AAValueConstantRangeCallSiteArgument(const IRPosition &IRP, Attributor &A)
+ : AAValueConstantRangeFloating(IRP, A) {}
+
+ /// See AbstractAttribute::trackStatistics()
+ void trackStatistics() const override {
+ STATS_DECLTRACK_CSARG_ATTR(value_range)
+ }
+};
+} // namespace
+
+const char AAReturnedValues::ID = 0;
+const char AANoUnwind::ID = 0;
+const char AANoSync::ID = 0;
+const char AANoFree::ID = 0;
+const char AANonNull::ID = 0;
+const char AANoRecurse::ID = 0;
+const char AAWillReturn::ID = 0;
+const char AAUndefinedBehavior::ID = 0;
+const char AANoAlias::ID = 0;
+const char AAReachability::ID = 0;
+const char AANoReturn::ID = 0;
+const char AAIsDead::ID = 0;
+const char AADereferenceable::ID = 0;
+const char AAAlign::ID = 0;
+const char AANoCapture::ID = 0;
+const char AAValueSimplify::ID = 0;
+const char AAHeapToStack::ID = 0;
+const char AAPrivatizablePtr::ID = 0;
+const char AAMemoryBehavior::ID = 0;
+const char AAMemoryLocation::ID = 0;
+const char AAValueConstantRange::ID = 0;
+
+// Macro magic to create the static generator function for attributes that
+// follow the naming scheme.
+
+#define SWITCH_PK_INV(CLASS, PK, POS_NAME) \
+ case IRPosition::PK: \
+ llvm_unreachable("Cannot create " #CLASS " for a " POS_NAME " position!");
+
+#define SWITCH_PK_CREATE(CLASS, IRP, PK, SUFFIX) \
+ case IRPosition::PK: \
+ AA = new (A.Allocator) CLASS##SUFFIX(IRP, A); \
+ ++NumAAs; \
+ break;
+
+#define CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
+ SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_FUNCTION, "function") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_RETURNED, Returned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_ARGUMENT, "argument") \
+ SWITCH_PK_INV(CLASS, IRP_FLOAT, "floating") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_RETURNED, "call site returned") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE_ARGUMENT, "call site argument") \
+ SWITCH_PK_INV(CLASS, IRP_CALL_SITE, "call site") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ } \
+ return *AA; \
+ }
+
+#define CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(CLASS) \
+ CLASS &CLASS::createForPosition(const IRPosition &IRP, Attributor &A) { \
+ CLASS *AA = nullptr; \
+ switch (IRP.getPositionKind()) { \
+ SWITCH_PK_INV(CLASS, IRP_INVALID, "invalid") \
+ SWITCH_PK_INV(CLASS, IRP_RETURNED, "returned") \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FUNCTION, Function) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE, CallSite) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_FLOAT, Floating) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_ARGUMENT, Argument) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_RETURNED, CallSiteReturned) \
+ SWITCH_PK_CREATE(CLASS, IRP, IRP_CALL_SITE_ARGUMENT, CallSiteArgument) \
+ } \
+ return *AA; \
+ }
+
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoUnwind)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoSync)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoRecurse)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAWillReturn)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoReturn)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReturnedValues)
+CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryLocation)
+
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANonNull)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoAlias)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAPrivatizablePtr)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AADereferenceable)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAAlign)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoCapture)
+CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueConstantRange)
+
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAValueSimplify)
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAIsDead)
+CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION(AANoFree)
+
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAHeapToStack)
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAReachability)
+CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAUndefinedBehavior)
+
+CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION(AAMemoryBehavior)
+
+#undef CREATE_FUNCTION_ONLY_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_FUNCTION_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_NON_RET_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_VALUE_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef CREATE_ALL_ABSTRACT_ATTRIBUTE_FOR_POSITION
+#undef SWITCH_PK_CREATE
+#undef SWITCH_PK_INV
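The AAValueConstantRange code above leans entirely on llvm::ConstantRange for its arithmetic: binaryOp() combines operand ranges in calculateBinaryOperator, and makeAllowedICmpRegion()/makeSatisfyingICmpRegion() decide comparisons in calculateCmpInst. A minimal standalone sketch of those primitives follows; it is not part of the patch and assumes an LLVM development setup to compile and link against.

// Standalone sketch of the ConstantRange primitives used by
// AAValueConstantRange; compile against LLVM, e.g.
//   clang++ range_sketch.cpp $(llvm-config --cxxflags --ldflags --libs core)
#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  // Assumed ranges for two 32-bit values: LHS in [0, 10), RHS in [1, 5).
  ConstantRange LHS(APInt(32, 0), APInt(32, 10));
  ConstantRange RHS(APInt(32, 1), APInt(32, 5));

  // calculateBinaryOperator: the assumed range of "add LHS, RHS" is the
  // range-arithmetic combination of the operand ranges.
  ConstantRange Sum = LHS.binaryOp(Instruction::Add, RHS);
  outs() << "add range: " << Sum << "\n"; // [1, 14)

  // calculateCmpInst: "icmp ult LHS, 16" must be true when the satisfying
  // region of the predicate contains every possible LHS value.
  ConstantRange Sixteen(APInt(32, 16));
  ConstantRange Satisfying =
      ConstantRange::makeSatisfyingICmpRegion(CmpInst::ICMP_ULT, Sixteen);
  outs() << "icmp ult 16 is must-true: "
         << (Satisfying.contains(LHS) ? "yes" : "no") << "\n";
  return 0;
}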
diff --git a/llvm/lib/Transforms/IPO/BlockExtractor.cpp b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
index aec470ffadc4..1d1300c6cd1d 100644
--- a/llvm/lib/Transforms/IPO/BlockExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/BlockExtractor.cpp
@@ -127,7 +127,8 @@ void BlockExtractor::loadFile() {
/*KeepEmpty=*/false);
if (BBNames.empty())
report_fatal_error("Missing bbs name");
- BlocksByName.push_back({LineSplit[0], {BBNames.begin(), BBNames.end()}});
+ BlocksByName.push_back(
+ {std::string(LineSplit[0]), {BBNames.begin(), BBNames.end()}});
}
}
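The BlockExtractor hunk only wraps the StringRef in an explicit std::string(...); this is the conversion idiom needed once StringRef no longer converts to std::string implicitly. A tiny sketch of the same idiom, illustrative only and with a made-up input line:

// Illustrative only: the explicit std::string(...) wrapper shown above is the
// idiom required once StringRef no longer converts to std::string implicitly.
#include "llvm/ADT/StringRef.h"
#include <string>
#include <utility>
#include <vector>

using namespace llvm;

int main() {
  StringRef Line = "foo bb1;bb2";                 // made-up input line
  std::pair<StringRef, StringRef> Split = Line.split(' ');
  std::vector<std::pair<std::string, std::string>> BlocksByName;
  BlocksByName.push_back(
      {std::string(Split.first), std::string(Split.second)});
  return 0;
}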
diff --git a/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp b/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
index f28a399b1779..74f11fa30959 100644
--- a/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
+++ b/llvm/lib/Transforms/IPO/CalledValuePropagation.cpp
@@ -19,7 +19,6 @@
#include "llvm/Transforms/IPO/CalledValuePropagation.h"
#include "llvm/Analysis/SparsePropagation.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
-#include "llvm/IR/InstVisitor.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
@@ -72,8 +71,7 @@ public:
CVPLatticeVal(CVPLatticeStateTy LatticeState) : LatticeState(LatticeState) {}
CVPLatticeVal(std::vector<Function *> &&Functions)
: LatticeState(FunctionSet), Functions(std::move(Functions)) {
- assert(std::is_sorted(this->Functions.begin(), this->Functions.end(),
- Compare()));
+ assert(llvm::is_sorted(this->Functions, Compare()));
}
/// Get a reference to the functions held by this lattice value. The number
@@ -173,9 +171,8 @@ public:
SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) override {
switch (I.getOpcode()) {
case Instruction::Call:
- return visitCallSite(cast<CallInst>(&I), ChangedValues, SS);
case Instruction::Invoke:
- return visitCallSite(cast<InvokeInst>(&I), ChangedValues, SS);
+ return visitCallBase(cast<CallBase>(I), ChangedValues, SS);
case Instruction::Load:
return visitLoad(*cast<LoadInst>(&I), ChangedValues, SS);
case Instruction::Ret:
@@ -217,13 +214,13 @@ public:
/// We collect a set of indirect calls when visiting call sites. This method
/// returns a reference to that set.
- SmallPtrSetImpl<Instruction *> &getIndirectCalls() { return IndirectCalls; }
+ SmallPtrSetImpl<CallBase *> &getIndirectCalls() { return IndirectCalls; }
private:
/// Holds the indirect calls we encounter during the analysis. We will attach
/// metadata to these calls after the analysis indicating the functions the
/// calls can possibly target.
- SmallPtrSet<Instruction *, 32> IndirectCalls;
+ SmallPtrSet<CallBase *, 32> IndirectCalls;
/// Compute a new lattice value for the given constant. The constant, after
/// stripping any pointer casts, should be a Function. We ignore null
@@ -255,23 +252,22 @@ private:
/// the merge of the argument state with the call sites corresponding actual
/// argument state. The call site state is the merge of the call site state
/// with the returned value state of the called function.
- void visitCallSite(CallSite CS,
+ void visitCallBase(CallBase &CB,
DenseMap<CVPLatticeKey, CVPLatticeVal> &ChangedValues,
SparseSolver<CVPLatticeKey, CVPLatticeVal> &SS) {
- Function *F = CS.getCalledFunction();
- Instruction *I = CS.getInstruction();
- auto RegI = CVPLatticeKey(I, IPOGrouping::Register);
+ Function *F = CB.getCalledFunction();
+ auto RegI = CVPLatticeKey(&CB, IPOGrouping::Register);
// If this is an indirect call, save it so we can quickly revisit it when
// attaching metadata.
if (!F)
- IndirectCalls.insert(I);
+ IndirectCalls.insert(&CB);
// If we can't track the function's return values, there's nothing to do.
if (!F || !canTrackReturnsInterprocedurally(F)) {
// Void return, No need to create and update CVPLattice state as no one
// can use it.
- if (I->getType()->isVoidTy())
+ if (CB.getType()->isVoidTy())
return;
ChangedValues[RegI] = getOverdefinedVal();
return;
@@ -284,14 +280,14 @@ private:
for (Argument &A : F->args()) {
auto RegFormal = CVPLatticeKey(&A, IPOGrouping::Register);
auto RegActual =
- CVPLatticeKey(CS.getArgument(A.getArgNo()), IPOGrouping::Register);
+ CVPLatticeKey(CB.getArgOperand(A.getArgNo()), IPOGrouping::Register);
ChangedValues[RegFormal] =
MergeValues(SS.getValueState(RegFormal), SS.getValueState(RegActual));
}
// Void return, No need to create and update CVPLattice state as no one can
// use it.
- if (I->getType()->isVoidTy())
+ if (CB.getType()->isVoidTy())
return;
ChangedValues[RegI] =
@@ -388,9 +384,8 @@ static bool runCVP(Module &M) {
// the set of functions they can possibly target.
bool Changed = false;
MDBuilder MDB(M.getContext());
- for (Instruction *C : Lattice.getIndirectCalls()) {
- CallSite CS(C);
- auto RegI = CVPLatticeKey(CS.getCalledValue(), IPOGrouping::Register);
+ for (CallBase *C : Lattice.getIndirectCalls()) {
+ auto RegI = CVPLatticeKey(C->getCalledOperand(), IPOGrouping::Register);
CVPLatticeVal LV = Solver.getExistingValueState(RegI);
if (!LV.isFunctionSet() || LV.getFunctions().empty())
continue;
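The CalledValuePropagation changes follow the general CallSite-to-CallBase migration used throughout this patch: dyn_cast<CallBase> replaces the CallSite wrapper, and getCalledOperand()/getArgOperand() replace the old accessors. A small sketch of the pattern; the helper name countIndirectCalls is made up, only the LLVM API calls are real.

// The helper name countIndirectCalls is hypothetical; the CallBase API is real.
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"

using namespace llvm;

static unsigned countIndirectCalls(Function &F) {
  unsigned NumIndirect = 0;
  for (Instruction &I : instructions(F)) {
    // dyn_cast<CallBase> covers CallInst and InvokeInst alike, replacing the
    // old "CallSite CS(&I); if (!CS) continue;" pattern.
    auto *CB = dyn_cast<CallBase>(&I);
    if (!CB)
      continue;
    // A null called function means the call is indirect; the callee value is
    // available as CB->getCalledOperand() (formerly getCalledValue()).
    if (!CB->getCalledFunction())
      ++NumIndirect;
  }
  return NumIndirect;
}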
diff --git a/llvm/lib/Transforms/IPO/ConstantMerge.cpp b/llvm/lib/Transforms/IPO/ConstantMerge.cpp
index ea1278aa108f..67f1438b9b6a 100644
--- a/llvm/lib/Transforms/IPO/ConstantMerge.cpp
+++ b/llvm/lib/Transforms/IPO/ConstantMerge.cpp
@@ -84,11 +84,9 @@ static void copyDebugLocMetadata(const GlobalVariable *From,
To->addDebugInfo(MD);
}
-static unsigned getAlignment(GlobalVariable *GV) {
- unsigned Align = GV->getAlignment();
- if (Align)
- return Align;
- return GV->getParent()->getDataLayout().getPreferredAlignment(GV);
+static Align getAlign(GlobalVariable *GV) {
+ return GV->getAlign().getValueOr(
+ GV->getParent()->getDataLayout().getPreferredAlign(GV));
}
static bool
@@ -120,8 +118,8 @@ static void replace(Module &M, GlobalVariable *Old, GlobalVariable *New) {
<< New->getName() << "\n");
// Bump the alignment if necessary.
- if (Old->getAlignment() || New->getAlignment())
- New->setAlignment(Align(std::max(getAlignment(Old), getAlignment(New))));
+ if (Old->getAlign() || New->getAlign())
+ New->setAlignment(std::max(getAlign(Old), getAlign(New)));
copyDebugLocMetadata(Old, New);
Old->replaceAllUsesWith(NewConstant);
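The new getAlign() helper in ConstantMerge expresses the alignment fallback with llvm::Align and MaybeAlign rather than raw unsigned values. A standalone sketch of the same fallback on a trivial one-global module; illustrative only, not part of the patch.

// Illustrative only: one int32 global in a fresh module, no explicit align.
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/GlobalVariable.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Alignment.h"
#include "llvm/Support/raw_ostream.h"

using namespace llvm;

int main() {
  LLVMContext Ctx;
  Module M("align_sketch", Ctx);
  Type *Int32Ty = Type::getInt32Ty(Ctx);
  auto *GV = new GlobalVariable(M, Int32Ty, /*isConstant=*/true,
                                GlobalValue::InternalLinkage,
                                ConstantInt::get(Int32Ty, 0), "g");
  // getAlign() returns a MaybeAlign (Optional<Align>); fall back to the data
  // layout's preferred alignment when no explicit alignment is attached.
  Align A = GV->getAlign().getValueOr(
      M.getDataLayout().getPreferredAlign(GV));
  outs() << "alignment: " << A.value() << "\n";
  return 0;
}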
diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
index 61d519d8ae88..54c51b6e7161 100644
--- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
+++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp
@@ -22,16 +22,17 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/NoFolder.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
@@ -175,16 +176,15 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
//
std::vector<Value *> Args;
for (Value::user_iterator I = Fn.user_begin(), E = Fn.user_end(); I != E; ) {
- CallSite CS(*I++);
- if (!CS)
+ CallBase *CB = dyn_cast<CallBase>(*I++);
+ if (!CB)
continue;
- Instruction *Call = CS.getInstruction();
// Pass all the same arguments.
- Args.assign(CS.arg_begin(), CS.arg_begin() + NumArgs);
+ Args.assign(CB->arg_begin(), CB->arg_begin() + NumArgs);
// Drop any attributes that were on the vararg arguments.
- AttributeList PAL = CS.getAttributes();
+ AttributeList PAL = CB->getAttributes();
if (!PAL.isEmpty()) {
SmallVector<AttributeSet, 8> ArgAttrs;
for (unsigned ArgNo = 0; ArgNo < NumArgs; ++ArgNo)
@@ -194,34 +194,31 @@ bool DeadArgumentEliminationPass::DeleteDeadVarargs(Function &Fn) {
}
SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ CB->getOperandBundlesAsDefs(OpBundles);
- CallSite NewCS;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call);
+ CallBase *NewCB = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(CB)) {
+ NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", CB);
} else {
- NewCS = CallInst::Create(NF, Args, OpBundles, "", Call);
- cast<CallInst>(NewCS.getInstruction())
- ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ NewCB = CallInst::Create(NF, Args, OpBundles, "", CB);
+ cast<CallInst>(NewCB)->setTailCallKind(
+ cast<CallInst>(CB)->getTailCallKind());
}
- NewCS.setCallingConv(CS.getCallingConv());
- NewCS.setAttributes(PAL);
- NewCS->setDebugLoc(Call->getDebugLoc());
- uint64_t W;
- if (Call->extractProfTotalWeight(W))
- NewCS->setProfWeight(W);
+ NewCB->setCallingConv(CB->getCallingConv());
+ NewCB->setAttributes(PAL);
+ NewCB->copyMetadata(*CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
Args.clear();
- if (!Call->use_empty())
- Call->replaceAllUsesWith(NewCS.getInstruction());
+ if (!CB->use_empty())
+ CB->replaceAllUsesWith(NewCB);
- NewCS->takeName(Call);
+ NewCB->takeName(CB);
// Finally, remove the old call from the program, reducing the use-count of
// F.
- Call->eraseFromParent();
+ CB->eraseFromParent();
}
// Since we have now created the new function, splice the body of the old
@@ -291,7 +288,8 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
bool Changed = false;
for (Argument &Arg : Fn.args()) {
- if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() && !Arg.hasByValOrInAllocaAttr()) {
+ if (!Arg.hasSwiftErrorAttr() && Arg.use_empty() &&
+ !Arg.hasPassPointeeByValueAttr()) {
if (Arg.isUsedByMetadata()) {
Arg.replaceAllUsesWith(UndefValue::get(Arg.getType()));
Changed = true;
@@ -304,16 +302,16 @@ bool DeadArgumentEliminationPass::RemoveDeadArgumentsFromCallers(Function &Fn) {
return false;
for (Use &U : Fn.uses()) {
- CallSite CS(U.getUser());
- if (!CS || !CS.isCallee(&U))
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB || !CB->isCallee(&U))
continue;
// Now go through all unused args and replace them with "undef".
for (unsigned I = 0, E = UnusedArgs.size(); I != E; ++I) {
unsigned ArgNo = UnusedArgs[I];
- Value *Arg = CS.getArgument(ArgNo);
- CS.setArgument(ArgNo, UndefValue::get(Arg->getType()));
+ Value *Arg = CB->getArgOperand(ArgNo);
+ CB->setArgOperand(ArgNo, UndefValue::get(Arg->getType()));
++NumArgumentsReplacedWithUndef;
Changed = true;
}
@@ -391,8 +389,8 @@ DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
return MarkIfNotLive(Use, MaybeLiveUses);
} else {
DeadArgumentEliminationPass::Liveness Result = MaybeLive;
- for (unsigned i = 0; i < NumRetVals(F); ++i) {
- RetOrArg Use = CreateRet(F, i);
+ for (unsigned Ri = 0; Ri < NumRetVals(F); ++Ri) {
+ RetOrArg Use = CreateRet(F, Ri);
// We might be live, depending on the liveness of Use. If any
// sub-value is live, then the entire value is considered live. This
// is a conservative choice, and better tracking is possible.
@@ -424,28 +422,27 @@ DeadArgumentEliminationPass::SurveyUse(const Use *U, UseVector &MaybeLiveUses,
return Result;
}
- if (auto CS = ImmutableCallSite(V)) {
- const Function *F = CS.getCalledFunction();
+ if (const auto *CB = dyn_cast<CallBase>(V)) {
+ const Function *F = CB->getCalledFunction();
if (F) {
// Used in a direct call.
// The function argument is live if it is used as a bundle operand.
- if (CS.isBundleOperand(U))
+ if (CB->isBundleOperand(U))
return Live;
// Find the argument number. We know for sure that this use is an
// argument, since if it was the function argument this would be an
// indirect call and we know we can't be looking at a value of the
// label type (for the invoke instruction).
- unsigned ArgNo = CS.getArgumentNo(U);
+ unsigned ArgNo = CB->getArgOperandNo(U);
if (ArgNo >= F->getFunctionType()->getNumParams())
// The value is passed in through a vararg! Must be live.
return Live;
- assert(CS.getArgument(ArgNo)
- == CS->getOperand(U->getOperandNo())
- && "Argument is not where we expected it");
+ assert(CB->getArgOperand(ArgNo) == CB->getOperand(U->getOperandNo()) &&
+ "Argument is not where we expected it");
// Value passed to a normal call. It's only live when the corresponding
// argument to the called function turns out live.
@@ -485,9 +482,10 @@ DeadArgumentEliminationPass::SurveyUses(const Value *V,
// We consider arguments of non-internal functions to be intrinsically alive as
// well as arguments to functions which have their "address taken".
void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
- // Functions with inalloca parameters are expecting args in a particular
- // register and memory layout.
- if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca)) {
+ // Functions with inalloca/preallocated parameters are expecting args in a
+ // particular register and memory layout.
+ if (F.getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ F.getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
MarkLive(F);
return;
}
@@ -555,24 +553,17 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
for (const Use &U : F.uses()) {
// If the function is PASSED IN as an argument, its address has been
// taken.
- ImmutableCallSite CS(U.getUser());
- if (!CS || !CS.isCallee(&U)) {
+ const auto *CB = dyn_cast<CallBase>(U.getUser());
+ if (!CB || !CB->isCallee(&U)) {
MarkLive(F);
return;
}
// The number of arguments for `musttail` call must match the number of
// arguments of the caller
- if (CS.isMustTailCall())
+ if (CB->isMustTailCall())
HasMustTailCallers = true;
- // If this use is anything other than a call site, the function is alive.
- const Instruction *TheCall = CS.getInstruction();
- if (!TheCall) { // Not a direct call site?
- MarkLive(F);
- return;
- }
-
// If we end up here, we are looking at a direct call to our function.
// Now, check how our return value(s) is/are used in this caller. Don't
@@ -581,7 +572,7 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
continue;
// Check all uses of the return value.
- for (const Use &U : TheCall->uses()) {
+ for (const Use &U : CB->uses()) {
if (ExtractValueInst *Ext = dyn_cast<ExtractValueInst>(U.getUser())) {
// This use uses a part of our return value, survey the uses of
// that part and store the results for this index only.
@@ -600,10 +591,10 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
RetValLiveness.assign(RetCount, Live);
break;
} else {
- for (unsigned i = 0; i != RetCount; ++i) {
- if (RetValLiveness[i] != Live)
- MaybeLiveRetUses[i].append(MaybeLiveAggregateUses.begin(),
- MaybeLiveAggregateUses.end());
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
+ if (RetValLiveness[Ri] != Live)
+ MaybeLiveRetUses[Ri].append(MaybeLiveAggregateUses.begin(),
+ MaybeLiveAggregateUses.end());
}
}
}
@@ -616,17 +607,17 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
}
// Now we've inspected all callers, record the liveness of our return values.
- for (unsigned i = 0; i != RetCount; ++i)
- MarkValue(CreateRet(&F, i), RetValLiveness[i], MaybeLiveRetUses[i]);
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri)
+ MarkValue(CreateRet(&F, Ri), RetValLiveness[Ri], MaybeLiveRetUses[Ri]);
LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Inspecting args for fn: "
<< F.getName() << "\n");
// Now, check all of our arguments.
- unsigned i = 0;
+ unsigned ArgI = 0;
UseVector MaybeLiveArgUses;
- for (Function::const_arg_iterator AI = F.arg_begin(),
- E = F.arg_end(); AI != E; ++AI, ++i) {
+ for (Function::const_arg_iterator AI = F.arg_begin(), E = F.arg_end();
+ AI != E; ++AI, ++ArgI) {
Liveness Result;
if (F.getFunctionType()->isVarArg() || HasMustTailCallers ||
HasMustTailCalls) {
@@ -649,7 +640,7 @@ void DeadArgumentEliminationPass::SurveyFunction(const Function &F) {
}
// Mark the result.
- MarkValue(CreateArg(&F, i), Result, MaybeLiveArgUses);
+ MarkValue(CreateArg(&F, ArgI), Result, MaybeLiveArgUses);
// Clear the vector again for the next iteration.
MaybeLiveArgUses.clear();
}
@@ -684,11 +675,11 @@ void DeadArgumentEliminationPass::MarkLive(const Function &F) {
// Mark the function as live.
LiveFunctions.insert(&F);
// Mark all arguments as live.
- for (unsigned i = 0, e = F.arg_size(); i != e; ++i)
- PropagateLiveness(CreateArg(&F, i));
+ for (unsigned ArgI = 0, E = F.arg_size(); ArgI != E; ++ArgI)
+ PropagateLiveness(CreateArg(&F, ArgI));
// Mark all return values as live.
- for (unsigned i = 0, e = NumRetVals(&F); i != e; ++i)
- PropagateLiveness(CreateRet(&F, i));
+ for (unsigned Ri = 0, E = NumRetVals(&F); Ri != E; ++Ri)
+ PropagateLiveness(CreateRet(&F, Ri));
}
/// MarkLive - Mark the given return value or argument as live. Additionally,
@@ -749,19 +740,19 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Construct the new parameter list from non-dead arguments. Also construct
// a new set of parameter attributes to correspond. Skip the first parameter
// attribute, since that belongs to the return value.
- unsigned i = 0;
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I, ++i) {
- RetOrArg Arg = CreateArg(F, i);
+ unsigned ArgI = 0;
+ for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(); I != E;
+ ++I, ++ArgI) {
+ RetOrArg Arg = CreateArg(F, ArgI);
if (LiveValues.erase(Arg)) {
Params.push_back(I->getType());
- ArgAlive[i] = true;
- ArgAttrVec.push_back(PAL.getParamAttributes(i));
- HasLiveReturnedArg |= PAL.hasParamAttribute(i, Attribute::Returned);
+ ArgAlive[ArgI] = true;
+ ArgAttrVec.push_back(PAL.getParamAttributes(ArgI));
+ HasLiveReturnedArg |= PAL.hasParamAttribute(ArgI, Attribute::Returned);
} else {
++NumArgumentsEliminated;
LLVM_DEBUG(dbgs() << "DeadArgumentEliminationPass - Removing argument "
- << i << " (" << I->getName() << ") from "
+ << ArgI << " (" << I->getName() << ") from "
<< F->getName() << "\n");
}
}
@@ -798,16 +789,16 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
NRetTy = RetTy;
} else {
// Look at each of the original return values individually.
- for (unsigned i = 0; i != RetCount; ++i) {
- RetOrArg Ret = CreateRet(F, i);
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri) {
+ RetOrArg Ret = CreateRet(F, Ri);
if (LiveValues.erase(Ret)) {
- RetTypes.push_back(getRetComponentType(F, i));
- NewRetIdxs[i] = RetTypes.size() - 1;
+ RetTypes.push_back(getRetComponentType(F, Ri));
+ NewRetIdxs[Ri] = RetTypes.size() - 1;
} else {
++NumRetValsEliminated;
LLVM_DEBUG(
dbgs() << "DeadArgumentEliminationPass - Removing return value "
- << i << " from " << F->getName() << "\n");
+ << Ri << " from " << F->getName() << "\n");
}
}
if (RetTypes.size() > 1) {
@@ -876,11 +867,10 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// to pass in a smaller number of arguments into the new function.
std::vector<Value*> Args;
while (!F->use_empty()) {
- CallSite CS(F->user_back());
- Instruction *Call = CS.getInstruction();
+ CallBase &CB = cast<CallBase>(*F->user_back());
ArgAttrVec.clear();
- const AttributeList &CallPAL = CS.getAttributes();
+ const AttributeList &CallPAL = CB.getAttributes();
// Adjust the call return attributes in case the function was changed to
// return void.
@@ -890,15 +880,15 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Declare these outside of the loops, so we can reuse them for the second
// loop, which loops the varargs.
- CallSite::arg_iterator I = CS.arg_begin();
- unsigned i = 0;
+ auto I = CB.arg_begin();
+ unsigned Pi = 0;
// Loop over those operands, corresponding to the normal arguments to the
// original function, and add those that are still alive.
- for (unsigned e = FTy->getNumParams(); i != e; ++I, ++i)
- if (ArgAlive[i]) {
+ for (unsigned E = FTy->getNumParams(); Pi != E; ++I, ++Pi)
+ if (ArgAlive[Pi]) {
Args.push_back(*I);
// Get original parameter attributes, but skip return attributes.
- AttributeSet Attrs = CallPAL.getParamAttributes(i);
+ AttributeSet Attrs = CallPAL.getParamAttributes(Pi);
if (NRetTy != RetTy && Attrs.hasAttribute(Attribute::Returned)) {
// If the return type has changed, then get rid of 'returned' on the
// call site. The alternative is to make all 'returned' attributes on
@@ -915,9 +905,9 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
}
// Push any varargs arguments on the list. Don't forget their attributes.
- for (CallSite::arg_iterator E = CS.arg_end(); I != E; ++I, ++i) {
+ for (auto E = CB.arg_end(); I != E; ++I, ++Pi) {
Args.push_back(*I);
- ArgAttrVec.push_back(CallPAL.getParamAttributes(i));
+ ArgAttrVec.push_back(CallPAL.getParamAttributes(Pi));
}
// Reconstruct the AttributesList based on the vector we constructed.
@@ -932,44 +922,41 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
F->getContext(), FnAttrs, RetAttrs, ArgAttrVec);
SmallVector<OperandBundleDef, 1> OpBundles;
- CS.getOperandBundlesAsDefs(OpBundles);
+ CB.getOperandBundlesAsDefs(OpBundles);
- CallSite NewCS;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- NewCS = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
- Args, OpBundles, "", Call->getParent());
+ CallBase *NewCB = nullptr;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCB = InvokeInst::Create(NF, II->getNormalDest(), II->getUnwindDest(),
+ Args, OpBundles, "", CB.getParent());
} else {
- NewCS = CallInst::Create(NFTy, NF, Args, OpBundles, "", Call);
- cast<CallInst>(NewCS.getInstruction())
- ->setTailCallKind(cast<CallInst>(Call)->getTailCallKind());
+ NewCB = CallInst::Create(NFTy, NF, Args, OpBundles, "", &CB);
+ cast<CallInst>(NewCB)->setTailCallKind(
+ cast<CallInst>(&CB)->getTailCallKind());
}
- NewCS.setCallingConv(CS.getCallingConv());
- NewCS.setAttributes(NewCallPAL);
- NewCS->setDebugLoc(Call->getDebugLoc());
- uint64_t W;
- if (Call->extractProfTotalWeight(W))
- NewCS->setProfWeight(W);
+ NewCB->setCallingConv(CB.getCallingConv());
+ NewCB->setAttributes(NewCallPAL);
+ NewCB->copyMetadata(CB, {LLVMContext::MD_prof, LLVMContext::MD_dbg});
Args.clear();
ArgAttrVec.clear();
- Instruction *New = NewCS.getInstruction();
- if (!Call->use_empty() || Call->isUsedByMetadata()) {
- if (New->getType() == Call->getType()) {
+ if (!CB.use_empty() || CB.isUsedByMetadata()) {
+ if (NewCB->getType() == CB.getType()) {
// Return type not changed? Just replace users then.
- Call->replaceAllUsesWith(New);
- New->takeName(Call);
- } else if (New->getType()->isVoidTy()) {
+ CB.replaceAllUsesWith(NewCB);
+ NewCB->takeName(&CB);
+ } else if (NewCB->getType()->isVoidTy()) {
// If the return value is dead, replace any uses of it with undef
// (any non-debug value uses will get removed later on).
- if (!Call->getType()->isX86_MMXTy())
- Call->replaceAllUsesWith(UndefValue::get(Call->getType()));
+ if (!CB.getType()->isX86_MMXTy())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
} else {
assert((RetTy->isStructTy() || RetTy->isArrayTy()) &&
"Return type changed, but not into a void. The old return type"
" must have been a struct or an array!");
- Instruction *InsertPt = Call;
- if (InvokeInst *II = dyn_cast<InvokeInst>(Call)) {
- BasicBlock *NewEdge = SplitEdge(New->getParent(), II->getNormalDest());
+ Instruction *InsertPt = &CB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ BasicBlock *NewEdge =
+ SplitEdge(NewCB->getParent(), II->getNormalDest());
InsertPt = &*NewEdge->getFirstInsertionPt();
}
@@ -979,30 +966,30 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
//
// Start out building up our return value from undef
Value *RetVal = UndefValue::get(RetTy);
- for (unsigned i = 0; i != RetCount; ++i)
- if (NewRetIdxs[i] != -1) {
+ for (unsigned Ri = 0; Ri != RetCount; ++Ri)
+ if (NewRetIdxs[Ri] != -1) {
Value *V;
+ IRBuilder<NoFolder> IRB(InsertPt);
if (RetTypes.size() > 1)
// We are still returning a struct, so extract the value from our
// return value
- V = ExtractValueInst::Create(New, NewRetIdxs[i], "newret",
- InsertPt);
+ V = IRB.CreateExtractValue(NewCB, NewRetIdxs[Ri], "newret");
else
// We are now returning a single element, so just insert that
- V = New;
+ V = NewCB;
// Insert the value at the old position
- RetVal = InsertValueInst::Create(RetVal, V, i, "oldret", InsertPt);
+ RetVal = IRB.CreateInsertValue(RetVal, V, Ri, "oldret");
}
// Now, replace all uses of the old call instruction with the return
// struct we built
- Call->replaceAllUsesWith(RetVal);
- New->takeName(Call);
+ CB.replaceAllUsesWith(RetVal);
+ NewCB->takeName(&CB);
}
}
// Finally, remove the old call from the program, reducing the use-count of
// F.
- Call->eraseFromParent();
+ CB.eraseFromParent();
}
// Since we have now created the new function, splice the body of the old
@@ -1012,10 +999,11 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
// Loop over the argument list, transferring uses of the old arguments over to
// the new arguments, also transferring over the names as well.
- i = 0;
+ ArgI = 0;
for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end(),
- I2 = NF->arg_begin(); I != E; ++I, ++i)
- if (ArgAlive[i]) {
+ I2 = NF->arg_begin();
+ I != E; ++I, ++ArgI)
+ if (ArgAlive[ArgI]) {
// If this is a live argument, move the name and users over to the new
// version.
I->replaceAllUsesWith(&*I2);
@@ -1033,11 +1021,10 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
if (F->getReturnType() != NF->getReturnType())
for (BasicBlock &BB : *NF)
if (ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator())) {
- Value *RetVal;
+ IRBuilder<NoFolder> IRB(RI);
+ Value *RetVal = nullptr;
- if (NFTy->getReturnType()->isVoidTy()) {
- RetVal = nullptr;
- } else {
+ if (!NFTy->getReturnType()->isVoidTy()) {
assert(RetTy->isStructTy() || RetTy->isArrayTy());
// The original return value was a struct or array, insert
// extractvalue/insertvalue chains to extract only the values we need
@@ -1047,16 +1034,16 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
Value *OldRet = RI->getOperand(0);
// Start out building up our return value from undef
RetVal = UndefValue::get(NRetTy);
- for (unsigned i = 0; i != RetCount; ++i)
- if (NewRetIdxs[i] != -1) {
- ExtractValueInst *EV = ExtractValueInst::Create(OldRet, i,
- "oldret", RI);
+ for (unsigned RetI = 0; RetI != RetCount; ++RetI)
+ if (NewRetIdxs[RetI] != -1) {
+ Value *EV = IRB.CreateExtractValue(OldRet, RetI, "oldret");
+
if (RetTypes.size() > 1) {
// We're still returning a struct, so reinsert the value into
// our new return value at the new index
- RetVal = InsertValueInst::Create(RetVal, EV, NewRetIdxs[i],
- "newret", RI);
+ RetVal = IRB.CreateInsertValue(RetVal, EV, NewRetIdxs[RetI],
+ "newret");
} else {
// We are now only returning a simple value, so just return the
// extracted value.
@@ -1066,7 +1053,8 @@ bool DeadArgumentEliminationPass::RemoveDeadStuffFromFunction(Function *F) {
}
// Replace the return instruction with one returning the new return
// value (possibly 0 if we became void).
- ReturnInst::Create(F->getContext(), RetVal, RI);
+ auto *NewRet = ReturnInst::Create(F->getContext(), RetVal, RI);
+ NewRet->setDebugLoc(RI->getDebugLoc());
BB.getInstList().erase(RI);
}
diff --git a/llvm/lib/Transforms/IPO/ExtractGV.cpp b/llvm/lib/Transforms/IPO/ExtractGV.cpp
index f77b528fc42d..b45766a8e783 100644
--- a/llvm/lib/Transforms/IPO/ExtractGV.cpp
+++ b/llvm/lib/Transforms/IPO/ExtractGV.cpp
@@ -54,6 +54,7 @@ namespace {
class GVExtractorPass : public ModulePass {
SetVector<GlobalValue *> Named;
bool deleteStuff;
+ bool keepConstInit;
public:
static char ID; // Pass identification, replacement for typeid
@@ -61,8 +62,9 @@ namespace {
/// Otherwise, it deletes as much of the module as possible, except for the
/// global values specified.
explicit GVExtractorPass(std::vector<GlobalValue*> &GVs,
- bool deleteS = true)
- : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS) {}
+ bool deleteS = true, bool keepConstInit = false)
+ : ModulePass(ID), Named(GVs.begin(), GVs.end()), deleteStuff(deleteS),
+ keepConstInit(keepConstInit) {}
bool runOnModule(Module &M) override {
if (skipModule(M))
@@ -83,7 +85,8 @@ namespace {
for (Module::global_iterator I = M.global_begin(), E = M.global_end();
I != E; ++I) {
bool Delete =
- deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration();
+ deleteStuff == (bool)Named.count(&*I) && !I->isDeclaration() &&
+ (!I->isConstant() || !keepConstInit);
if (!Delete) {
if (I->hasAvailableExternallyLinkage())
continue;
@@ -156,6 +159,6 @@ namespace {
}
ModulePass *llvm::createGVExtractionPass(std::vector<GlobalValue *> &GVs,
- bool deleteFn) {
- return new GVExtractorPass(GVs, deleteFn);
+ bool deleteFn, bool keepConstInit) {
+ return new GVExtractorPass(GVs, deleteFn, keepConstInit);
}
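The ExtractGV change threads a new keepConstInit flag through createGVExtractionPass(); with it set, constant globals that are not extracted keep their initializers instead of being reduced to external declarations. A hypothetical driver showing the new parameter; the surrounding setup is assumed, only the factory signature comes from the patch above.

// GVs holds the globals selected for extraction; M is the module to process.
#include "llvm/IR/LegacyPassManager.h"
#include "llvm/IR/Module.h"
#include "llvm/Transforms/IPO.h"
#include <vector>

using namespace llvm;

static void extractKeepingConstants(Module &M, std::vector<GlobalValue *> &GVs) {
  legacy::PassManager PM;
  // keepConstInit=true: constant globals that stay behind keep their
  // initializers instead of being turned into external declarations.
  PM.add(createGVExtractionPass(GVs, /*deleteFn=*/false, /*keepConstInit=*/true));
  PM.run(M);
}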
diff --git a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
index b6d0b2e35694..4baeaa6e1630 100644
--- a/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionAttrs.cpp
@@ -33,7 +33,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
@@ -160,8 +159,7 @@ static MemoryAccessKind checkFunctionMemoryAccess(Function &F, bool ThisBody,
// Check whether all pointer arguments point to local memory, and
// ignore calls that only access local memory.
- for (CallSite::arg_iterator CI = Call->arg_begin(), CE = Call->arg_end();
- CI != CE; ++CI) {
+ for (auto CI = Call->arg_begin(), CE = Call->arg_end(); CI != CE; ++CI) {
Value *Arg = *CI;
if (!Arg->getType()->isPtrOrPtrVectorTy())
continue;
@@ -362,13 +360,13 @@ struct ArgumentUsesTracker : public CaptureTracker {
void tooManyUses() override { Captured = true; }
bool captured(const Use *U) override {
- CallSite CS(U->getUser());
- if (!CS.getInstruction()) {
+ CallBase *CB = dyn_cast<CallBase>(U->getUser());
+ if (!CB) {
Captured = true;
return true;
}
- Function *F = CS.getCalledFunction();
+ Function *F = CB->getCalledFunction();
if (!F || !F->hasExactDefinition() || !SCCNodes.count(F)) {
Captured = true;
return true;
@@ -379,14 +377,14 @@ struct ArgumentUsesTracker : public CaptureTracker {
// these.
unsigned UseIndex =
- std::distance(const_cast<const Use *>(CS.arg_begin()), U);
+ std::distance(const_cast<const Use *>(CB->arg_begin()), U);
- assert(UseIndex < CS.data_operands_size() &&
+ assert(UseIndex < CB->data_operands_size() &&
"Indirect function calls should have been filtered above!");
- if (UseIndex >= CS.getNumArgOperands()) {
+ if (UseIndex >= CB->getNumArgOperands()) {
// Data operand, but not an argument operand -- must be a bundle operand
- assert(CS.hasOperandBundles() && "Must be!");
+ assert(CB->hasOperandBundles() && "Must be!");
// CaptureTracking told us that we're being captured by an operand bundle
// use. In this case it does not matter if the callee is within our SCC
@@ -449,7 +447,7 @@ determinePointerReadAttrs(Argument *A,
SmallPtrSet<Use *, 32> Visited;
// inalloca arguments are always clobbered by the call.
- if (A->hasInAllocaAttr())
+ if (A->hasInAllocaAttr() || A->hasPreallocatedAttr())
return Attribute::None;
bool IsRead = false;
@@ -490,15 +488,15 @@ determinePointerReadAttrs(Argument *A,
Worklist.push_back(&UU);
};
- CallSite CS(I);
- if (CS.doesNotAccessMemory()) {
+ CallBase &CB = cast<CallBase>(*I);
+ if (CB.doesNotAccessMemory()) {
AddUsersToWorklistIfCapturing();
continue;
}
- Function *F = CS.getCalledFunction();
+ Function *F = CB.getCalledFunction();
if (!F) {
- if (CS.onlyReadsMemory()) {
+ if (CB.onlyReadsMemory()) {
IsRead = true;
AddUsersToWorklistIfCapturing();
continue;
@@ -510,23 +508,23 @@ determinePointerReadAttrs(Argument *A,
// operands. This means there is no need to adjust UseIndex to account
// for these.
- unsigned UseIndex = std::distance(CS.arg_begin(), U);
+ unsigned UseIndex = std::distance(CB.arg_begin(), U);
// U cannot be the callee operand use: since we're exploring the
// transitive uses of an Argument, having such a use be a callee would
- // imply the CallSite is an indirect call or invoke; and we'd take the
+ // imply the call site is an indirect call or invoke; and we'd take the
// early exit above.
- assert(UseIndex < CS.data_operands_size() &&
+ assert(UseIndex < CB.data_operands_size() &&
"Data operand use expected!");
- bool IsOperandBundleUse = UseIndex >= CS.getNumArgOperands();
+ bool IsOperandBundleUse = UseIndex >= CB.getNumArgOperands();
if (UseIndex >= F->arg_size() && !IsOperandBundleUse) {
assert(F->isVarArg() && "More params than args in non-varargs call");
return Attribute::None;
}
- Captures &= !CS.doesNotCapture(UseIndex);
+ Captures &= !CB.doesNotCapture(UseIndex);
// Since the optimizer (by design) cannot see the data flow corresponding
// to a operand bundle use, these cannot participate in the optimistic SCC
@@ -535,12 +533,12 @@ determinePointerReadAttrs(Argument *A,
if (IsOperandBundleUse ||
!SCCNodes.count(&*std::next(F->arg_begin(), UseIndex))) {
- // The accessors used on CallSite here do the right thing for calls and
+ // The accessors used on the call site here do the right thing for calls and
// invokes with operand bundles.
- if (!CS.onlyReadsMemory() && !CS.onlyReadsMemory(UseIndex))
+ if (!CB.onlyReadsMemory() && !CB.onlyReadsMemory(UseIndex))
return Attribute::None;
- if (!CS.doesNotAccessMemory(UseIndex))
+ if (!CB.doesNotAccessMemory(UseIndex))
IsRead = true;
}
@@ -638,8 +636,8 @@ static bool addArgumentAttrsFromCallsites(Function &F) {
// callsite.
BasicBlock &Entry = F.getEntryBlock();
for (Instruction &I : Entry) {
- if (auto CS = CallSite(&I)) {
- if (auto *CalledFunc = CS.getCalledFunction()) {
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (auto *CalledFunc = CB->getCalledFunction()) {
for (auto &CSArg : CalledFunc->args()) {
if (!CSArg.hasNonNullAttr())
continue;
@@ -647,7 +645,7 @@ static bool addArgumentAttrsFromCallsites(Function &F) {
// If the non-null callsite argument operand is an argument to 'F'
// (the caller) and the call is guaranteed to execute, then the value
// must be non-null throughout 'F'.
- auto *FArg = dyn_cast<Argument>(CS.getArgOperand(CSArg.getArgNo()));
+ auto *FArg = dyn_cast<Argument>(CB->getArgOperand(CSArg.getArgNo()));
if (FArg && !FArg->hasNonNullAttr()) {
FArg->addAttr(Attribute::NonNull);
Changed = true;
@@ -904,10 +902,10 @@ static bool isFunctionMallocLike(Function *F, const SCCNodeSet &SCCNodes) {
break;
case Instruction::Call:
case Instruction::Invoke: {
- CallSite CS(RVI);
- if (CS.hasRetAttr(Attribute::NoAlias))
+ CallBase &CB = cast<CallBase>(*RVI);
+ if (CB.hasRetAttr(Attribute::NoAlias))
break;
- if (CS.getCalledFunction() && SCCNodes.count(CS.getCalledFunction()))
+ if (CB.getCalledFunction() && SCCNodes.count(CB.getCalledFunction()))
break;
LLVM_FALLTHROUGH;
}
@@ -1013,8 +1011,8 @@ static bool isReturnNonNull(Function *F, const SCCNodeSet &SCCNodes,
}
case Instruction::Call:
case Instruction::Invoke: {
- CallSite CS(RVI);
- Function *Callee = CS.getCalledFunction();
+ CallBase &CB = cast<CallBase>(*RVI);
+ Function *Callee = CB.getCalledFunction();
// A call to a node within the SCC is assumed to return null until
// proven otherwise
if (Callee && SCCNodes.count(Callee)) {
@@ -1223,10 +1221,11 @@ bool AttributeInferer::run(const SCCNodeSet &SCCNodes) {
/// Helper for non-Convergent inference predicate InstrBreaksAttribute.
static bool InstrBreaksNonConvergent(Instruction &I,
const SCCNodeSet &SCCNodes) {
- const CallSite CS(&I);
+ const CallBase *CB = dyn_cast<CallBase>(&I);
// Breaks non-convergent assumption if CS is a convergent call to a function
// not in the SCC.
- return CS && CS.isConvergent() && SCCNodes.count(CS.getCalledFunction()) == 0;
+ return CB && CB->isConvergent() &&
+ SCCNodes.count(CB->getCalledFunction()) == 0;
}
/// Helper for NoUnwind inference predicate InstrBreaksAttribute.
@@ -1247,11 +1246,11 @@ static bool InstrBreaksNonThrowing(Instruction &I, const SCCNodeSet &SCCNodes) {
/// Helper for NoFree inference predicate InstrBreaksAttribute.
static bool InstrBreaksNoFree(Instruction &I, const SCCNodeSet &SCCNodes) {
- CallSite CS(&I);
- if (!CS)
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
return false;
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CB->getCalledFunction();
if (!Callee)
return true;
@@ -1306,7 +1305,7 @@ static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
// Skip non-throwing functions.
[](const Function &F) { return F.doesNotThrow(); },
// Instructions that break non-throwing assumption.
- [SCCNodes](Instruction &I) {
+ [&SCCNodes](Instruction &I) {
return InstrBreaksNonThrowing(I, SCCNodes);
},
[](Function &F) {
@@ -1329,7 +1328,7 @@ static bool inferAttrsFromFunctionBodies(const SCCNodeSet &SCCNodes) {
// Skip functions known not to free memory.
[](const Function &F) { return F.doesNotFreeMemory(); },
// Instructions that break non-deallocating assumption.
- [SCCNodes](Instruction &I) {
+ [&SCCNodes](Instruction &I) {
return InstrBreaksNoFree(I, SCCNodes);
},
[](Function &F) {
@@ -1368,8 +1367,8 @@ static bool addNoRecurseAttrs(const SCCNodeSet &SCCNodes) {
// marked norecurse, so any called from F to F will not be marked norecurse.
for (auto &BB : *F)
for (auto &I : BB.instructionsWithoutDebug())
- if (auto CS = CallSite(&I)) {
- Function *Callee = CS.getCalledFunction();
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ Function *Callee = CB->getCalledFunction();
if (!Callee || Callee == F || !Callee->doesNotRecurse())
// Function calls a potentially recursive function.
return false;
@@ -1439,8 +1438,8 @@ PreservedAnalyses PostOrderFunctionAttrsPass::run(LazyCallGraph::SCC &C,
// function.
if (!HasUnknownCall)
for (Instruction &I : instructions(F))
- if (auto CS = CallSite(&I))
- if (!CS.getCalledFunction()) {
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (!CB->getCalledFunction()) {
HasUnknownCall = true;
break;
}
@@ -1575,8 +1574,8 @@ static bool addNoRecurseAttrsTopDown(Function &F) {
auto *I = dyn_cast<Instruction>(U);
if (!I)
return false;
- CallSite CS(I);
- if (!CS || !CS.getParent()->getParent()->doesNotRecurse())
+ CallBase *CB = dyn_cast<CallBase>(I);
+ if (!CB || !CB->getParent()->getParent()->doesNotRecurse())
return false;
}
return setDoesNotRecurse(F);
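Besides the CallSite-to-CallBase updates, the FunctionAttrs hunks switch the inference predicates from capturing SCCNodes by value to capturing it by reference, avoiding a copy of the set per predicate. A generic illustration of that capture choice; the helper and element type are made up.

// Made-up helper mirroring the InstrBreaksAttribute lambdas above.
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/Function.h"
#include <functional>

using namespace llvm;

using SCCSet = SmallPtrSet<Function *, 8>;

static std::function<bool(Function *)> makeSCCMember(const SCCSet &SCCNodes) {
  // [&SCCNodes] captures the set by reference: no per-predicate copy, which is
  // safe here because the predicate is only used while the set is alive.
  return [&SCCNodes](Function *F) { return SCCNodes.count(F) > 0; };
}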
diff --git a/llvm/lib/Transforms/IPO/FunctionImport.cpp b/llvm/lib/Transforms/IPO/FunctionImport.cpp
index be0446a946ec..468bf19f2e48 100644
--- a/llvm/lib/Transforms/IPO/FunctionImport.cpp
+++ b/llvm/lib/Transforms/IPO/FunctionImport.cpp
@@ -306,28 +306,21 @@ static void computeImportForReferencedGlobals(
RefSummary->modulePath() != Summary.modulePath();
};
- auto MarkExported = [&](const ValueInfo &VI, const GlobalValueSummary *S) {
- if (ExportLists)
- (*ExportLists)[S->modulePath()].insert(VI);
- };
-
for (auto &RefSummary : VI.getSummaryList())
if (isa<GlobalVarSummary>(RefSummary.get()) &&
Index.canImportGlobalVar(RefSummary.get(), /* AnalyzeRefs */ true) &&
!LocalNotInModule(RefSummary.get())) {
auto ILI = ImportList[RefSummary->modulePath()].insert(VI.getGUID());
- // Only update stat if we haven't already imported this variable.
- if (ILI.second)
- NumImportedGlobalVarsThinLink++;
- MarkExported(VI, RefSummary.get());
- // Promote referenced functions and variables. We don't promote
- // objects referenced by writeonly variable initializer, because
- // we convert such variables initializers to "zeroinitializer".
- // See processGlobalForThinLTO.
- if (!Index.isWriteOnly(cast<GlobalVarSummary>(RefSummary.get())))
- for (const auto &VI : RefSummary->refs())
- for (const auto &RefFn : VI.getSummaryList())
- MarkExported(VI, RefFn.get());
+ // Only update stat and exports if we haven't already imported this
+ // variable.
+ if (!ILI.second)
+ break;
+ NumImportedGlobalVarsThinLink++;
+ // Any references made by this variable will be marked exported later,
+ // in ComputeCrossModuleImport, after import decisions are complete,
+ // which is more efficient than adding them here.
+ if (ExportLists)
+ (*ExportLists)[RefSummary->modulePath()].insert(VI);
break;
}
}
@@ -494,24 +487,11 @@ static void computeImportForFunction(
NumImportedCriticalFunctionsThinLink++;
}
- // Make exports in the source module.
- if (ExportLists) {
- auto &ExportList = (*ExportLists)[ExportModulePath];
- ExportList.insert(VI);
- if (!PreviouslyImported) {
- // This is the first time this function was exported from its source
- // module, so mark all functions and globals it references as exported
- // to the outside if they are defined in the same source module.
- // For efficiency, we unconditionally add all the referenced GUIDs
- // to the ExportList for this module, and will prune out any not
- // defined in the module later in a single pass.
- for (auto &Edge : ResolvedCalleeSummary->calls())
- ExportList.insert(Edge.first);
-
- for (auto &Ref : ResolvedCalleeSummary->refs())
- ExportList.insert(Ref);
- }
- }
+ // Any calls/references made by this function will be marked exported
+ // later, in ComputeCrossModuleImport, after import decisions are
+ // complete, which is more efficient than adding them here.
+ if (ExportLists)
+ (*ExportLists)[ExportModulePath].insert(VI);
}
auto GetAdjustedThreshold = [](unsigned Threshold, bool IsHotCallsite) {
@@ -678,20 +658,55 @@ void llvm::ComputeCrossModuleImport(
&ExportLists);
}
- // When computing imports we added all GUIDs referenced by anything
- // imported from the module to its ExportList. Now we prune each ExportList
- // of any not defined in that module. This is more efficient than checking
- // while computing imports because some of the summary lists may be long
- // due to linkonce (comdat) copies.
+ // When computing imports we only added the variables and functions being
+ // imported to the export list. We also need to mark any references and calls
+ // they make as exported. We do this here, as it is more efficient since we
+ // may import the same values multiple times into different modules during
+ // the import computation.
for (auto &ELI : ExportLists) {
+ FunctionImporter::ExportSetTy NewExports;
const auto &DefinedGVSummaries =
ModuleToDefinedGVSummaries.lookup(ELI.first());
- for (auto EI = ELI.second.begin(); EI != ELI.second.end();) {
+ for (auto &EI : ELI.second) {
+ // Find the copy defined in the exporting module so that we can mark the
+ // values it references in that specific definition as exported.
+ // Below we will add all references and called values, without regard to
+ // whether they are also defined in this module. We subsequently prune the
+ // list to only include those defined in the exporting module; see the
+ // comment there as to why.
+ auto DS = DefinedGVSummaries.find(EI.getGUID());
+ // Anything marked exported during the import computation must have been
+ // defined in the exporting module.
+ assert(DS != DefinedGVSummaries.end());
+ auto *S = DS->getSecond();
+ S = S->getBaseObject();
+ if (auto *GVS = dyn_cast<GlobalVarSummary>(S)) {
+ // Export referenced functions and variables. We don't export/promote
+ // objects referenced by writeonly variable initializer, because
+ // we convert such variables initializers to "zeroinitializer".
+ // See processGlobalForThinLTO.
+ if (!Index.isWriteOnly(GVS))
+ for (const auto &VI : GVS->refs())
+ NewExports.insert(VI);
+ } else {
+ auto *FS = cast<FunctionSummary>(S);
+ for (auto &Edge : FS->calls())
+ NewExports.insert(Edge.first);
+ for (auto &Ref : FS->refs())
+ NewExports.insert(Ref);
+ }
+ }
+ // Prune the list computed above to only include values defined in the
+ // exporting module. We do this after the insertions above since we may hit
+ // the same ref/call target multiple times in that loop, and it is more
+ // efficient to avoid a set lookup each time.
+ for (auto EI = NewExports.begin(); EI != NewExports.end();) {
if (!DefinedGVSummaries.count(EI->getGUID()))
- ELI.second.erase(EI++);
+ NewExports.erase(EI++);
else
++EI;
}
+ ELI.second.insert(NewExports.begin(), NewExports.end());
}
assert(checkVariableImport(Index, ImportLists, ExportLists));
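
To illustrate the two-phase scheme described in the comments above, here is a minimal standalone sketch using plain standard containers and made-up GUID values; it is not the ThinLTO code itself, only the gather-then-prune-then-merge shape of it:

  #include <cstdio>
  #include <iterator>
  #include <map>
  #include <set>
  #include <vector>

  using GUID = unsigned long long;

  int main() {
    // Hypothetical module state: F (GUID 1) is defined here and references
    // G (GUID 2, also defined here) and H (GUID 3, defined elsewhere).
    std::map<GUID, std::vector<GUID>> Refs = {{1, {2, 3}}};
    std::set<GUID> DefinedInModule = {1, 2};      // stands in for DefinedGVSummaries
    std::set<GUID> ExportList = {1};              // marked exported during import

    // Phase 1: gather everything the exported definitions reference.
    std::set<GUID> NewExports;
    for (GUID Exported : ExportList)
      for (GUID Ref : Refs[Exported])
        NewExports.insert(Ref);                   // may hit the same target many times

    // Phase 2: prune to values defined in the exporting module, then merge.
    for (auto It = NewExports.begin(); It != NewExports.end();)
      It = DefinedInModule.count(*It) ? std::next(It) : NewExports.erase(It);
    ExportList.insert(NewExports.begin(), NewExports.end());

    for (GUID G : ExportList)
      std::printf("export %llu\n", G);            // prints 1 and 2, but not 3
  }
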
@@ -913,11 +928,12 @@ void llvm::gatherImportedSummariesForModule(
const FunctionImporter::ImportMapTy &ImportList,
std::map<std::string, GVSummaryMapTy> &ModuleToSummariesForIndex) {
// Include all summaries from the importing module.
- ModuleToSummariesForIndex[ModulePath] =
+ ModuleToSummariesForIndex[std::string(ModulePath)] =
ModuleToDefinedGVSummaries.lookup(ModulePath);
// Include summaries for imports.
for (auto &ILI : ImportList) {
- auto &SummariesForIndex = ModuleToSummariesForIndex[ILI.first()];
+ auto &SummariesForIndex =
+ ModuleToSummariesForIndex[std::string(ILI.first())];
const auto &DefinedGVSummaries =
ModuleToDefinedGVSummaries.lookup(ILI.first());
for (auto &GI : ILI.second) {
@@ -976,6 +992,8 @@ bool llvm::convertToDeclaration(GlobalValue &GV) {
GV.replaceAllUsesWith(NewGV);
return false;
}
+ if (!GV.isImplicitDSOLocal())
+ GV.setDSOLocal(false);
return true;
}
@@ -1214,8 +1232,15 @@ Expected<bool> FunctionImporter::importFunctions(
// have loaded all the required metadata!
UpgradeDebugInfo(*SrcModule);
+ // Set the partial sample profile ratio in the profile summary module flag
+ // of the imported source module, if applicable, so that the profile summary
+ // module flag will match with that of the destination module when it's
+ // imported.
+ SrcModule->setPartialSampleProfileRatio(Index);
+
// Link in the specified functions.
- if (renameModuleForThinLTO(*SrcModule, Index, &GlobalsToImport))
+ if (renameModuleForThinLTO(*SrcModule, Index, ClearDSOLocalOnDeclarations,
+ &GlobalsToImport))
return true;
if (PrintImports) {
@@ -1224,10 +1249,12 @@ Expected<bool> FunctionImporter::importFunctions(
<< " from " << SrcModule->getSourceFileName() << "\n";
}
- if (Mover.move(std::move(SrcModule), GlobalsToImport.getArrayRef(),
- [](GlobalValue &, IRMover::ValueAdder) {},
- /*IsPerformingImport=*/true))
- report_fatal_error("Function Import: link error");
+ if (Error Err = Mover.move(
+ std::move(SrcModule), GlobalsToImport.getArrayRef(),
+ [](GlobalValue &, IRMover::ValueAdder) {},
+ /*IsPerformingImport=*/true))
+ report_fatal_error("Function Import: link error: " +
+ toString(std::move(Err)));
ImportedCount += GlobalsToImport.size();
NumImportedModules++;
@@ -1284,16 +1311,18 @@ static bool doImportingForModule(Module &M) {
// Next we need to promote to global scope and rename any local values that
// are potentially exported to other modules.
- if (renameModuleForThinLTO(M, *Index, nullptr)) {
+ if (renameModuleForThinLTO(M, *Index, /*clearDSOLocalOnDeclarations=*/false,
+ /*GlobalsToImport=*/nullptr)) {
errs() << "Error renaming module\n";
return false;
}
// Perform the import now.
auto ModuleLoader = [&M](StringRef Identifier) {
- return loadFile(Identifier, M.getContext());
+ return loadFile(std::string(Identifier), M.getContext());
};
- FunctionImporter Importer(*Index, ModuleLoader);
+ FunctionImporter Importer(*Index, ModuleLoader,
+ /*ClearDSOLocalOnDeclarations=*/false);
Expected<bool> Result = Importer.importFunctions(M, ImportList);
// FIXME: Probably need to propagate Errors through the pass manager.
diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
index 72b8d7522f04..fb4cb23b837e 100644
--- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp
@@ -263,6 +263,15 @@ void GlobalDCEPass::AddVirtualFunctionDependencies(Module &M) {
if (!ClEnableVFE)
return;
+ // If the Virtual Function Elim module flag is present and set to zero, then
+ // the vcall_visibility metadata was inserted for another optimization (WPD)
+ // and we may not have type checked loads on all accesses to the vtable.
+ // Don't attempt VFE in that case.
+ auto *Val = mdconst::dyn_extract_or_null<ConstantInt>(
+ M.getModuleFlag("Virtual Function Elim"));
+ if (!Val || Val->getZExtValue() == 0)
+ return;
+
ScanVTables(M);
if (VFESafeVTables.empty())
diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
index 0fd966457ece..d9fb820f7cb5 100644
--- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp
@@ -28,7 +28,6 @@
#include "llvm/BinaryFormat/Dwarf.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -41,6 +40,7 @@
#include "llvm/IR/GlobalAlias.h"
#include "llvm/IR/GlobalValue.h"
#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -128,13 +128,16 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) {
Type *Ty = Types.pop_back_val();
switch (Ty->getTypeID()) {
default: break;
- case Type::PointerTyID: return true;
+ case Type::PointerTyID:
+ return true;
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID:
+ if (cast<VectorType>(Ty)->getElementType()->isPointerTy())
+ return true;
+ break;
case Type::ArrayTyID:
- case Type::VectorTyID: {
- SequentialType *STy = cast<SequentialType>(Ty);
- Types.push_back(STy->getElementType());
+ Types.push_back(cast<ArrayType>(Ty)->getElementType());
break;
- }
case Type::StructTyID: {
StructType *STy = cast<StructType>(Ty);
if (STy->isOpaque()) return true;
@@ -142,7 +145,8 @@ static bool isLeakCheckerRoot(GlobalVariable *GV) {
E = STy->element_end(); I != E; ++I) {
Type *InnerTy = *I;
if (isa<PointerType>(InnerTy)) return true;
- if (isa<CompositeType>(InnerTy))
+ if (isa<StructType>(InnerTy) || isa<ArrayType>(InnerTy) ||
+ isa<VectorType>(InnerTy))
Types.push_back(InnerTy);
}
break;
@@ -191,10 +195,10 @@ CleanupPointerRootUsers(GlobalVariable *GV,
function_ref<TargetLibraryInfo &(Function &)> GetTLI) {
// A brief explanation of leak checkers. The goal is to find bugs where
// pointers are forgotten, causing an accumulating growth in memory
- // usage over time. The common strategy for leak checkers is to whitelist the
- // memory pointed to by globals at exit. This is popular because it also
- // solves another problem where the main thread of a C++ program may shut down
- // before other threads that are still expecting to use those globals. To
+ // usage over time. The common strategy for leak checkers is to explicitly
+ // allow the memory pointed to by globals at exit. This is popular because it
+ // also solves another problem where the main thread of a C++ program may shut
+ // down before other threads that are still expecting to use those globals. To
// handle that case, we expect the program may create a singleton and never
// destroy it.
@@ -433,13 +437,27 @@ static bool GlobalUsersSafeToSRA(GlobalValue *GV) {
return true;
}
+static bool IsSRASequential(Type *T) {
+ return isa<ArrayType>(T) || isa<VectorType>(T);
+}
+static uint64_t GetSRASequentialNumElements(Type *T) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return AT->getNumElements();
+ return cast<FixedVectorType>(T)->getNumElements();
+}
+static Type *GetSRASequentialElementType(Type *T) {
+ if (ArrayType *AT = dyn_cast<ArrayType>(T))
+ return AT->getElementType();
+ return cast<VectorType>(T)->getElementType();
+}
static bool CanDoGlobalSRA(GlobalVariable *GV) {
Constant *Init = GV->getInitializer();
if (isa<StructType>(Init->getType())) {
// nothing to check
- } else if (SequentialType *STy = dyn_cast<SequentialType>(Init->getType())) {
- if (STy->getNumElements() > 16 && GV->hasNUsesOrMore(16))
+ } else if (IsSRASequential(Init->getType())) {
+ if (GetSRASequentialNumElements(Init->getType()) > 16 &&
+ GV->hasNUsesOrMore(16))
return false; // It's not worth it.
} else
return false;
@@ -450,14 +468,19 @@ static bool CanDoGlobalSRA(GlobalVariable *GV) {
/// Copy over the debug info for a variable to its SRA replacements.
static void transferSRADebugInfo(GlobalVariable *GV, GlobalVariable *NGV,
uint64_t FragmentOffsetInBits,
- uint64_t FragmentSizeInBits,
- unsigned NumElements) {
+ uint64_t FragmentSizeInBits) {
SmallVector<DIGlobalVariableExpression *, 1> GVs;
GV->getDebugInfo(GVs);
for (auto *GVE : GVs) {
DIVariable *Var = GVE->getVariable();
+ Optional<uint64_t> VarSize = Var->getSizeInBits();
+
DIExpression *Expr = GVE->getExpression();
- if (NumElements > 1) {
+ // If the FragmentSize is smaller than the variable, emit a fragment
+ // expression. If the variable size is unknown, a fragment must be
+ // emitted to be safe.
+ if (!VarSize || FragmentSizeInBits < *VarSize) {
if (auto E = DIExpression::createFragmentExpression(
Expr, FragmentOffsetInBits, FragmentSizeInBits))
Expr = *E;
@@ -486,9 +509,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
std::map<unsigned, GlobalVariable *> NewGlobals;
// Get the alignment of the global, either explicit or target-specific.
- unsigned StartAlignment = GV->getAlignment();
- if (StartAlignment == 0)
- StartAlignment = DL.getABITypeAlignment(GV->getType());
+ Align StartAlignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getType());
// Loop over all users and create replacement variables for used aggregate
// elements.
@@ -509,8 +531,8 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
Type *ElTy = nullptr;
if (StructType *STy = dyn_cast<StructType>(Ty))
ElTy = STy->getElementType(ElementIdx);
- else if (SequentialType *STy = dyn_cast<SequentialType>(Ty))
- ElTy = STy->getElementType();
+ else
+ ElTy = GetSRASequentialElementType(Ty);
assert(ElTy);
Constant *In = Init->getAggregateElement(ElementIdx);
@@ -531,29 +553,27 @@ static GlobalVariable *SRAGlobal(GlobalVariable *GV, const DataLayout &DL) {
// had 256 byte alignment for example, something might depend on that:
// propagate info to each field.
uint64_t FieldOffset = Layout.getElementOffset(ElementIdx);
- Align NewAlign(MinAlign(StartAlignment, FieldOffset));
- if (NewAlign >
- Align(DL.getABITypeAlignment(STy->getElementType(ElementIdx))))
+ Align NewAlign = commonAlignment(StartAlignment, FieldOffset);
+ if (NewAlign > DL.getABITypeAlign(STy->getElementType(ElementIdx)))
NGV->setAlignment(NewAlign);
// Copy over the debug info for the variable.
uint64_t Size = DL.getTypeAllocSizeInBits(NGV->getValueType());
uint64_t FragmentOffsetInBits = Layout.getElementOffsetInBits(ElementIdx);
- transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size,
- STy->getNumElements());
- } else if (SequentialType *STy = dyn_cast<SequentialType>(Ty)) {
+ transferSRADebugInfo(GV, NGV, FragmentOffsetInBits, Size);
+ } else {
uint64_t EltSize = DL.getTypeAllocSize(ElTy);
- Align EltAlign(DL.getABITypeAlignment(ElTy));
+ Align EltAlign = DL.getABITypeAlign(ElTy);
uint64_t FragmentSizeInBits = DL.getTypeAllocSizeInBits(ElTy);
// Calculate the known alignment of the field. If the original aggregate
// had 256 byte alignment for example, something might depend on that:
// propagate info to each field.
- Align NewAlign(MinAlign(StartAlignment, EltSize * ElementIdx));
+ Align NewAlign = commonAlignment(StartAlignment, EltSize * ElementIdx);
if (NewAlign > EltAlign)
NGV->setAlignment(NewAlign);
transferSRADebugInfo(GV, NGV, FragmentSizeInBits * ElementIdx,
- FragmentSizeInBits, STy->getNumElements());
+ FragmentSizeInBits);
}
}
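
The switch to commonAlignment() computes the alignment that is still guaranteed at a given byte offset from the original global. A small worked example, assuming only llvm/Support/Alignment.h:

  #include "llvm/Support/Alignment.h"
  #include <cassert>

  int main() {
    llvm::Align Start(256);                                  // original global alignment
    // commonAlignment(A, Off) is MinAlign(A.value(), Off): the largest power of
    // two known to divide every address of the form Base + Off.
    assert(llvm::commonAlignment(Start, 0).value() == 256);  // offset 0 keeps it
    assert(llvm::commonAlignment(Start, 8).value() == 8);    // field at +8
    assert(llvm::commonAlignment(Start, 24).value() == 8);   // 24 = 8 * 3
  }
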
@@ -641,12 +661,12 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
return false; // Storing the value.
}
} else if (const CallInst *CI = dyn_cast<CallInst>(U)) {
- if (CI->getCalledValue() != V) {
+ if (CI->getCalledOperand() != V) {
//cerr << "NONTRAPPING USE: " << *U;
return false; // Not calling the ptr
}
} else if (const InvokeInst *II = dyn_cast<InvokeInst>(U)) {
- if (II->getCalledValue() != V) {
+ if (II->getCalledOperand() != V) {
//cerr << "NONTRAPPING USE: " << *U;
return false; // Not calling the ptr
}
@@ -659,9 +679,6 @@ static bool AllUsesOfValueWillTrapIfNull(const Value *V,
// checked.
if (PHIs.insert(PN).second && !AllUsesOfValueWillTrapIfNull(PN, PHIs))
return false;
- } else if (isa<ICmpInst>(U) &&
- isa<ConstantPointerNull>(U->getOperand(1))) {
- // Ignore icmp X, null
} else {
//cerr << "NONTRAPPING USE: " << *U;
return false;
@@ -706,17 +723,17 @@ static bool OptimizeAwayTrappingUsesOfValue(Value *V, Constant *NewV) {
Changed = true;
}
} else if (isa<CallInst>(I) || isa<InvokeInst>(I)) {
- CallSite CS(I);
- if (CS.getCalledValue() == V) {
+ CallBase *CB = cast<CallBase>(I);
+ if (CB->getCalledOperand() == V) {
// Calling through the pointer! Turn into a direct call, but be careful
// that the pointer is not also being passed as an argument.
- CS.setCalledFunction(NewV);
+ CB->setCalledOperand(NewV);
Changed = true;
bool PassedAsArg = false;
- for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
- if (CS.getArgument(i) == V) {
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ if (CB->getArgOperand(i) == V) {
PassedAsArg = true;
- CS.setArgument(i, NewV);
+ CB->setArgOperand(i, NewV);
}
if (PassedAsArg) {
@@ -905,7 +922,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
if (StoreInst *SI = dyn_cast<StoreInst>(GV->user_back())) {
// The global is initialized when the store to it occurs.
new StoreInst(ConstantInt::getTrue(GV->getContext()), InitBool, false,
- None, SI->getOrdering(), SI->getSyncScopeID(), SI);
+ Align(1), SI->getOrdering(), SI->getSyncScopeID(), SI);
SI->eraseFromParent();
continue;
}
@@ -922,7 +939,7 @@ OptimizeGlobalAddressOfMalloc(GlobalVariable *GV, CallInst *CI, Type *AllocTy,
// Replace the cmp X, 0 with a use of the bool value.
// Sink the load to where the compare was, if atomic rules allow us to.
Value *LV = new LoadInst(InitBool->getValueType(), InitBool,
- InitBool->getName() + ".val", false, None,
+ InitBool->getName() + ".val", false, Align(1),
LI->getOrdering(), LI->getSyncScopeID(),
LI->isUnordered() ? (Instruction *)ICI : LI);
InitBoolUsed = true;
@@ -1729,7 +1746,7 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
assert(LI->getOperand(0) == GV && "Not a copy!");
// Insert a new load, to preserve the saved value.
StoreVal = new LoadInst(NewGV->getValueType(), NewGV,
- LI->getName() + ".b", false, None,
+ LI->getName() + ".b", false, Align(1),
LI->getOrdering(), LI->getSyncScopeID(), LI);
} else {
assert((isa<CastInst>(StoredVal) || isa<SelectInst>(StoredVal)) &&
@@ -1739,14 +1756,14 @@ static bool TryToShrinkGlobalToBoolean(GlobalVariable *GV, Constant *OtherVal) {
}
}
StoreInst *NSI =
- new StoreInst(StoreVal, NewGV, false, None, SI->getOrdering(),
+ new StoreInst(StoreVal, NewGV, false, Align(1), SI->getOrdering(),
SI->getSyncScopeID(), SI);
NSI->setDebugLoc(SI->getDebugLoc());
} else {
// Change the load into a load of bool then a select.
LoadInst *LI = cast<LoadInst>(UI);
LoadInst *NLI = new LoadInst(NewGV->getValueType(), NewGV,
- LI->getName() + ".b", false, None,
+ LI->getName() + ".b", false, Align(1),
LI->getOrdering(), LI->getSyncScopeID(), LI);
Instruction *NSI;
if (IsOneZero)
@@ -2117,8 +2134,7 @@ static void ChangeCalleesToFastCall(Function *F) {
for (User *U : F->users()) {
if (isa<BlockAddress>(U))
continue;
- CallSite CS(cast<Instruction>(U));
- CS.setCallingConv(CallingConv::Fast);
+ cast<CallBase>(U)->setCallingConv(CallingConv::Fast);
}
}
@@ -2135,8 +2151,8 @@ static void RemoveAttribute(Function *F, Attribute::AttrKind A) {
for (User *U : F->users()) {
if (isa<BlockAddress>(U))
continue;
- CallSite CS(cast<Instruction>(U));
- CS.setAttributes(StripAttr(F->getContext(), CS.getAttributes(), A));
+ CallBase *CB = cast<CallBase>(U);
+ CB->setAttributes(StripAttr(F->getContext(), CB->getAttributes(), A));
}
}
@@ -2175,12 +2191,12 @@ static bool hasChangeableCC(Function *F) {
/// Return true if the block containing the call site has a BlockFrequency of
/// less than ColdCCRelFreq% of the entry block.
-static bool isColdCallSite(CallSite CS, BlockFrequencyInfo &CallerBFI) {
+static bool isColdCallSite(CallBase &CB, BlockFrequencyInfo &CallerBFI) {
const BranchProbability ColdProb(ColdCCRelFreq, 100);
- auto CallSiteBB = CS.getInstruction()->getParent();
+ auto *CallSiteBB = CB.getParent();
auto CallSiteFreq = CallerBFI.getBlockFreq(CallSiteBB);
auto CallerEntryFreq =
- CallerBFI.getBlockFreq(&(CS.getCaller()->getEntryBlock()));
+ CallerBFI.getBlockFreq(&(CB.getCaller()->getEntryBlock()));
return CallSiteFreq < CallerEntryFreq * ColdProb;
}
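
The comparison above is a percentage test on block frequencies. A standalone arithmetic sketch follows, where the 2% threshold is only an assumed example value for ColdCCRelFreq (its actual default comes from a cl::opt not shown in this hunk):

  #include <cstdint>
  #include <cstdio>

  static bool isColdCallSiteSketch(uint64_t CallSiteFreq, uint64_t EntryFreq,
                                   unsigned ColdCCRelFreqPercent) {
    return CallSiteFreq * 100 < EntryFreq * ColdCCRelFreqPercent;
  }

  int main() {
    // With an entry-block frequency of 1000 and a 2% threshold, call sites
    // with frequency below 20 count as cold.
    std::printf("%d\n", isColdCallSiteSketch(10, 1000, 2));   // 1 (cold)
    std::printf("%d\n", isColdCallSiteSketch(50, 1000, 2));   // 0 (not cold)
  }
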
@@ -2200,10 +2216,10 @@ isValidCandidateForColdCC(Function &F,
if (isa<BlockAddress>(U))
continue;
- CallSite CS(cast<Instruction>(U));
- Function *CallerFunc = CS.getInstruction()->getParent()->getParent();
+ CallBase &CB = cast<CallBase>(*U);
+ Function *CallerFunc = CB.getParent()->getParent();
BlockFrequencyInfo &CallerBFI = GetBFI(*CallerFunc);
- if (!isColdCallSite(CS, CallerBFI))
+ if (!isColdCallSite(CB, CallerBFI))
return false;
auto It = std::find(AllCallsCold.begin(), AllCallsCold.end(), CallerFunc);
if (It == AllCallsCold.end())
@@ -2216,8 +2232,7 @@ static void changeCallSitesToColdCC(Function *F) {
for (User *U : F->users()) {
if (isa<BlockAddress>(U))
continue;
- CallSite CS(cast<Instruction>(U));
- CS.setCallingConv(CallingConv::Cold);
+ cast<CallBase>(U)->setCallingConv(CallingConv::Cold);
}
}
@@ -2230,7 +2245,6 @@ hasOnlyColdCalls(Function &F,
for (BasicBlock &BB : F) {
for (Instruction &I : BB) {
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
- CallSite CS(cast<Instruction>(CI));
        // Skip over inline asm instructions since they aren't function calls.
if (CI->isInlineAsm())
continue;
@@ -2247,7 +2261,7 @@ hasOnlyColdCalls(Function &F,
CalledFn->hasAddressTaken())
return false;
BlockFrequencyInfo &CallerBFI = GetBFI(F);
- if (!isColdCallSite(CS, CallerBFI))
+ if (!isColdCallSite(*CI, CallerBFI))
return false;
}
}
@@ -2255,6 +2269,115 @@ hasOnlyColdCalls(Function &F,
return true;
}
+static bool hasMustTailCallers(Function *F) {
+ for (User *U : F->users()) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB) {
+ assert(isa<BlockAddress>(U) &&
+ "Expected either CallBase or BlockAddress");
+ continue;
+ }
+ if (CB->isMustTailCall())
+ return true;
+ }
+ return false;
+}
+
+static bool hasInvokeCallers(Function *F) {
+ for (User *U : F->users())
+ if (isa<InvokeInst>(U))
+ return true;
+ return false;
+}
+
+static void RemovePreallocated(Function *F) {
+ RemoveAttribute(F, Attribute::Preallocated);
+
+ auto *M = F->getParent();
+
+ IRBuilder<> Builder(M->getContext());
+
+ // Cannot modify users() while iterating over it, so make a copy.
+ SmallVector<User *, 4> PreallocatedCalls(F->users());
+ for (User *U : PreallocatedCalls) {
+ CallBase *CB = dyn_cast<CallBase>(U);
+ if (!CB)
+ continue;
+
+ assert(
+ !CB->isMustTailCall() &&
+ "Shouldn't call RemotePreallocated() on a musttail preallocated call");
+ // Create copy of call without "preallocated" operand bundle.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ CB->getOperandBundlesAsDefs(OpBundles);
+ CallBase *PreallocatedSetup = nullptr;
+ for (auto *It = OpBundles.begin(); It != OpBundles.end(); ++It) {
+ if (It->getTag() == "preallocated") {
+ PreallocatedSetup = cast<CallBase>(*It->input_begin());
+ OpBundles.erase(It);
+ break;
+ }
+ }
+ assert(PreallocatedSetup && "Did not find preallocated bundle");
+ uint64_t ArgCount =
+ cast<ConstantInt>(PreallocatedSetup->getArgOperand(0))->getZExtValue();
+
+ assert((isa<CallInst>(CB) || isa<InvokeInst>(CB)) &&
+ "Unknown indirect call type");
+ CallBase *NewCB = CallBase::Create(CB, OpBundles, CB);
+ CB->replaceAllUsesWith(NewCB);
+ NewCB->takeName(CB);
+ CB->eraseFromParent();
+
+ Builder.SetInsertPoint(PreallocatedSetup);
+ auto *StackSave =
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stacksave));
+
+ Builder.SetInsertPoint(NewCB->getNextNonDebugInstruction());
+ Builder.CreateCall(Intrinsic::getDeclaration(M, Intrinsic::stackrestore),
+ StackSave);
+
+ // Replace @llvm.call.preallocated.arg() with alloca.
+ // Cannot modify users() while iterating over it, so make a copy.
+ // @llvm.call.preallocated.arg() can be called with the same index multiple
+ // times. So for each @llvm.call.preallocated.arg(), we see if we have
+ // already created a Value* for the index, and if not, create an alloca and
+ // bitcast right after the @llvm.call.preallocated.setup() so that it
+ // dominates all uses.
+ SmallVector<Value *, 2> ArgAllocas(ArgCount);
+ SmallVector<User *, 2> PreallocatedArgs(PreallocatedSetup->users());
+ for (auto *User : PreallocatedArgs) {
+ auto *UseCall = cast<CallBase>(User);
+ assert(UseCall->getCalledFunction()->getIntrinsicID() ==
+ Intrinsic::call_preallocated_arg &&
+ "preallocated token use was not a llvm.call.preallocated.arg");
+ uint64_t AllocArgIndex =
+ cast<ConstantInt>(UseCall->getArgOperand(1))->getZExtValue();
+ Value *AllocaReplacement = ArgAllocas[AllocArgIndex];
+ if (!AllocaReplacement) {
+ auto AddressSpace = UseCall->getType()->getPointerAddressSpace();
+ auto *ArgType = UseCall
+ ->getAttribute(AttributeList::FunctionIndex,
+ Attribute::Preallocated)
+ .getValueAsType();
+ auto *InsertBefore = PreallocatedSetup->getNextNonDebugInstruction();
+ Builder.SetInsertPoint(InsertBefore);
+ auto *Alloca =
+ Builder.CreateAlloca(ArgType, AddressSpace, nullptr, "paarg");
+ auto *BitCast = Builder.CreateBitCast(
+ Alloca, Type::getInt8PtrTy(M->getContext()), UseCall->getName());
+ ArgAllocas[AllocArgIndex] = BitCast;
+ AllocaReplacement = BitCast;
+ }
+
+ UseCall->replaceAllUsesWith(AllocaReplacement);
+ UseCall->eraseFromParent();
+ }
+ // Remove @llvm.call.preallocated.setup().
+ cast<Instruction>(PreallocatedSetup)->eraseFromParent();
+ }
+}
+
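
The loop over PreallocatedArgs above creates at most one alloca per argument index and reuses it for repeated @llvm.call.preallocated.arg() calls on the same index. A simplified standalone sketch of that lazy, index-keyed caching (strings stand in for the alloca/bitcast values; the indices are made up):

  #include <cstdio>
  #include <string>
  #include <vector>

  int main() {
    unsigned ArgCount = 3;
    std::vector<std::string> ArgReplacements(ArgCount);   // like ArgAllocas
    unsigned UseIndices[] = {1, 0, 1, 2, 1};              // index 1 used repeatedly

    for (unsigned Idx : UseIndices) {
      if (ArgReplacements[Idx].empty()) {
        // First use of this index: create the replacement once (an alloca +
        // bitcast placed after the setup call in the real code, so that it
        // dominates every later use).
        ArgReplacements[Idx] = "paarg." + std::to_string(Idx);
        std::printf("created %s\n", ArgReplacements[Idx].c_str());
      }
      std::printf("use %u -> %s\n", Idx, ArgReplacements[Idx].c_str());
    }
  }
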
static bool
OptimizeFunctions(Module &M,
function_ref<TargetLibraryInfo &(Function &)> GetTLI,
@@ -2319,11 +2442,22 @@ OptimizeFunctions(Module &M,
// FIXME: We should also hoist alloca affected by this to the entry
// block if possible.
if (F->getAttributes().hasAttrSomewhere(Attribute::InAlloca) &&
- !F->hasAddressTaken()) {
+ !F->hasAddressTaken() && !hasMustTailCallers(F)) {
RemoveAttribute(F, Attribute::InAlloca);
Changed = true;
}
+ // FIXME: handle invokes
+ // FIXME: handle musttail
+ if (F->getAttributes().hasAttrSomewhere(Attribute::Preallocated)) {
+ if (!F->hasAddressTaken() && !hasMustTailCallers(F) &&
+ !hasInvokeCallers(F)) {
+ RemovePreallocated(F);
+ Changed = true;
+ }
+ continue;
+ }
+
if (hasChangeableCC(F) && !F->isVarArg() && !F->hasAddressTaken()) {
NumInternalFunc++;
TargetTransformInfo &TTI = GetTTI(*F);
@@ -2385,7 +2519,7 @@ OptimizeGlobalVars(Module &M,
// for that optional parameter, since we don't have a Function to
// provide GetTLI anyway.
Constant *New = ConstantFoldConstant(C, DL, /*TLI*/ nullptr);
- if (New && New != C)
+ if (New != C)
GV->setInitializer(New);
}
@@ -2427,8 +2561,11 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
}
ConstantInt *CI = cast<ConstantInt>(Addr->getOperand(OpNo));
- SequentialType *InitTy = cast<SequentialType>(Init->getType());
- uint64_t NumElts = InitTy->getNumElements();
+ uint64_t NumElts;
+ if (ArrayType *ATy = dyn_cast<ArrayType>(Init->getType()))
+ NumElts = ATy->getNumElements();
+ else
+ NumElts = cast<FixedVectorType>(Init->getType())->getNumElements();
// Break up the array into elements.
for (uint64_t i = 0, e = NumElts; i != e; ++i)
@@ -2439,7 +2576,7 @@ static Constant *EvaluateStoreInto(Constant *Init, Constant *Val,
EvaluateStoreInto(Elts[CI->getZExtValue()], Val, Addr, OpNo+1);
if (Init->getType()->isArrayTy())
- return ConstantArray::get(cast<ArrayType>(InitTy), Elts);
+ return ConstantArray::get(cast<ArrayType>(Init->getType()), Elts);
return ConstantVector::get(Elts);
}
@@ -2561,8 +2698,10 @@ static void BatchCommitValueTo(const DenseMap<Constant*, Constant*> &Mem) {
unsigned NumElts;
if (auto *STy = dyn_cast<StructType>(Ty))
NumElts = STy->getNumElements();
+ else if (auto *ATy = dyn_cast<ArrayType>(Ty))
+ NumElts = ATy->getNumElements();
else
- NumElts = cast<SequentialType>(Ty)->getNumElements();
+ NumElts = cast<FixedVectorType>(Ty)->getNumElements();
for (unsigned i = 0, e = NumElts; i != e; ++i)
Elts.push_back(Init->getAggregateElement(i));
}
diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
index 4a319ead23c0..365b269dc3bf 100644
--- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp
+++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp
@@ -111,6 +111,9 @@ static bool splitGlobal(GlobalVariable &GV) {
ConstantInt::get(Int32Ty, ByteOffset - SplitBegin)),
Type->getOperand(1)}));
}
+
+ if (GV.hasMetadata(LLVMContext::MD_vcall_visibility))
+ SplitGV->setVCallVisibilityMetadata(GV.getVCallVisibility());
}
for (User *U : GV.users()) {
diff --git a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
index 5e690714bfdf..d0bd0166534a 100644
--- a/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
+++ b/llvm/lib/Transforms/IPO/HotColdSplitting.cpp
@@ -39,7 +39,6 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
@@ -110,8 +109,8 @@ bool unlikelyExecuted(BasicBlock &BB) {
// The block is cold if it calls/invokes a cold function. However, do not
// mark sanitizer traps as cold.
for (Instruction &I : BB)
- if (auto CS = CallSite(&I))
- if (CS.hasFnAttr(Attribute::Cold) && !CS->getMetadata("nosanitize"))
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->hasFnAttr(Attribute::Cold) && !CB->getMetadata("nosanitize"))
return true;
// The block is cold if it has an unreachable terminator, unless it's
@@ -325,11 +324,10 @@ Function *HotColdSplitting::extractColdRegion(
if (Function *OutF = CE.extractCodeRegion(CEAC)) {
User *U = *OutF->user_begin();
CallInst *CI = cast<CallInst>(U);
- CallSite CS(CI);
NumColdRegionsOutlined++;
if (TTI.useColdCCForColdCall(*OutF)) {
OutF->setCallingConv(CallingConv::Cold);
- CS.setCallingConv(CallingConv::Cold);
+ CI->setCallingConv(CallingConv::Cold);
}
CI->setIsNoInline();
@@ -458,6 +456,10 @@ public:
// first have predecessors within the extraction region.
if (mayExtractBlock(SinkBB)) {
addBlockToRegion(&SinkBB, SinkScore);
+ if (pred_empty(&SinkBB)) {
+ ColdRegion->EntireFunctionCold = true;
+ return Regions;
+ }
} else {
Regions.emplace_back();
ColdRegion = &Regions.back();
diff --git a/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp b/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
index 1bda13a9bdd8..8d05a72d68da 100644
--- a/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
+++ b/llvm/lib/Transforms/IPO/IPConstantPropagation.cpp
@@ -17,7 +17,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/IR/AbstractCallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -53,7 +53,7 @@ static bool PropagateConstantsIntoArguments(Function &F) {
// For each argument, keep track of its constant value and whether it is a
// constant or not. The bool is driven to true when found to be non-constant.
- SmallVector<std::pair<Constant*, bool>, 16> ArgumentConstants;
+ SmallVector<PointerIntPair<Constant *, 1, bool>, 16> ArgumentConstants;
ArgumentConstants.resize(F.arg_size());
unsigned NumNonconstant = 0;
@@ -80,7 +80,7 @@ static bool PropagateConstantsIntoArguments(Function &F) {
for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++Arg) {
// If this argument is known non-constant, ignore it.
- if (ArgumentConstants[i].second)
+ if (ArgumentConstants[i].getInt())
continue;
Value *V = ACS.getCallArgOperand(i);
@@ -102,13 +102,13 @@ static bool PropagateConstantsIntoArguments(Function &F) {
if (++NumNonconstant == ArgumentConstants.size())
return false;
- ArgumentConstants[i].second = true;
+ ArgumentConstants[i].setInt(true);
continue;
}
- if (C && ArgumentConstants[i].first == nullptr) {
- ArgumentConstants[i].first = C; // First constant seen.
- } else if (C && ArgumentConstants[i].first == C) {
+ if (C && ArgumentConstants[i].getPointer() == nullptr) {
+ ArgumentConstants[i].setPointer(C); // First constant seen.
+ } else if (C && ArgumentConstants[i].getPointer() == C) {
// Still the constant value we think it is.
} else if (V == &*Arg) {
// Ignore recursive calls passing argument down.
@@ -117,7 +117,7 @@ static bool PropagateConstantsIntoArguments(Function &F) {
// give up on this function.
if (++NumNonconstant == ArgumentConstants.size())
return false;
- ArgumentConstants[i].second = true;
+ ArgumentConstants[i].setInt(true);
}
}
}
@@ -128,11 +128,11 @@ static bool PropagateConstantsIntoArguments(Function &F) {
Function::arg_iterator AI = F.arg_begin();
for (unsigned i = 0, e = ArgumentConstants.size(); i != e; ++i, ++AI) {
// Do we have a constant argument?
- if (ArgumentConstants[i].second || AI->use_empty() ||
- AI->hasInAllocaAttr() || (AI->hasByValAttr() && !F.onlyReadsMemory()))
+ if (ArgumentConstants[i].getInt() || AI->use_empty() ||
+ (AI->hasByValAttr() && !F.onlyReadsMemory()))
continue;
- Value *V = ArgumentConstants[i].first;
+ Value *V = ArgumentConstants[i].getPointer();
if (!V) V = UndefValue::get(AI->getType());
AI->replaceAllUsesWith(V);
++NumArgumentsProped;
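
The switch from std::pair<Constant *, bool> to PointerIntPair packs the bool into the spare low bits of the pointer. A minimal sketch, assuming only llvm/ADT/PointerIntPair.h and a typical target where sizeof(intptr_t) == sizeof(void *):

  #include "llvm/ADT/PointerIntPair.h"
  #include <cassert>

  int main() {
    int Dummy = 0;
    llvm::PointerIntPair<int *, 1, bool> P(&Dummy, false);
    assert(P.getPointer() == &Dummy && !P.getInt());
    P.setInt(true);                        // like ArgumentConstants[i].setInt(true)
    assert(P.getPointer() == &Dummy && P.getInt());
    // The whole pair fits in one pointer-sized word on typical targets.
    static_assert(sizeof(P) == sizeof(int *), "bool packed into low bits");
  }
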
@@ -222,16 +222,15 @@ static bool PropagateConstantReturn(Function &F) {
// constant.
bool MadeChange = false;
for (Use &U : F.uses()) {
- CallSite CS(U.getUser());
- Instruction* Call = CS.getInstruction();
+ CallBase *CB = dyn_cast<CallBase>(U.getUser());
// Not a call instruction or a call instruction that's not calling F
// directly?
- if (!Call || !CS.isCallee(&U))
+ if (!CB || !CB->isCallee(&U))
continue;
// Call result not used?
- if (Call->use_empty())
+ if (CB->use_empty())
continue;
MadeChange = true;
@@ -241,12 +240,12 @@ static bool PropagateConstantReturn(Function &F) {
if (Argument *A = dyn_cast<Argument>(New))
// Was an argument returned? Then find the corresponding argument in
// the call instruction and use that.
- New = CS.getArgument(A->getArgNo());
- Call->replaceAllUsesWith(New);
+ New = CB->getArgOperand(A->getArgNo());
+ CB->replaceAllUsesWith(New);
continue;
}
- for (auto I = Call->user_begin(), E = Call->user_end(); I != E;) {
+ for (auto I = CB->user_begin(), E = CB->user_end(); I != E;) {
Instruction *Ins = cast<Instruction>(*I);
// Increment now, so we can remove the use
@@ -266,7 +265,7 @@ static bool PropagateConstantReturn(Function &F) {
if (Argument *A = dyn_cast<Argument>(New))
// Was an argument returned? Then find the corresponding argument in
// the call instruction and use that.
- New = CS.getArgument(A->getArgNo());
+ New = CB->getArgOperand(A->getArgNo());
Ins->replaceAllUsesWith(New);
Ins->eraseFromParent();
}
diff --git a/llvm/lib/Transforms/IPO/IPO.cpp b/llvm/lib/Transforms/IPO/IPO.cpp
index 8a15800cbdb5..d37b9236380d 100644
--- a/llvm/lib/Transforms/IPO/IPO.cpp
+++ b/llvm/lib/Transforms/IPO/IPO.cpp
@@ -23,6 +23,7 @@
using namespace llvm;
void llvm::initializeIPO(PassRegistry &Registry) {
+ initializeOpenMPOptLegacyPassPass(Registry);
initializeArgPromotionPass(Registry);
initializeCalledValuePropagationLegacyPassPass(Registry);
initializeConstantMergeLegacyPassPass(Registry);
@@ -46,6 +47,7 @@ void llvm::initializeIPO(PassRegistry &Registry) {
initializeMergeFunctionsLegacyPassPass(Registry);
initializePartialInlinerLegacyPassPass(Registry);
initializeAttributorLegacyPassPass(Registry);
+ initializeAttributorCGSCCLegacyPassPass(Registry);
initializePostOrderFunctionAttrsLegacyPassPass(Registry);
initializeReversePostOrderFunctionAttrsLegacyPassPass(Registry);
initializePruneEHPass(Registry);
diff --git a/llvm/lib/Transforms/IPO/InlineSimple.cpp b/llvm/lib/Transforms/IPO/InlineSimple.cpp
index e818743544e6..76f1d0c54d08 100644
--- a/llvm/lib/Transforms/IPO/InlineSimple.cpp
+++ b/llvm/lib/Transforms/IPO/InlineSimple.cpp
@@ -15,7 +15,6 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Instructions.h"
@@ -52,26 +51,26 @@ public:
static char ID; // Pass identification, replacement for typeid
- InlineCost getInlineCost(CallSite CS) override {
- Function *Callee = CS.getCalledFunction();
+ InlineCost getInlineCost(CallBase &CB) override {
+ Function *Callee = CB.getCalledFunction();
TargetTransformInfo &TTI = TTIWP->getTTI(*Callee);
bool RemarksEnabled = false;
- const auto &BBs = CS.getCaller()->getBasicBlockList();
+ const auto &BBs = CB.getCaller()->getBasicBlockList();
if (!BBs.empty()) {
auto DI = OptimizationRemark(DEBUG_TYPE, "", DebugLoc(), &BBs.front());
if (DI.isEnabled())
RemarksEnabled = true;
}
- OptimizationRemarkEmitter ORE(CS.getCaller());
+ OptimizationRemarkEmitter ORE(CB.getCaller());
std::function<AssumptionCache &(Function &)> GetAssumptionCache =
[&](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
- return llvm::getInlineCost(
- cast<CallBase>(*CS.getInstruction()), Params, TTI, GetAssumptionCache,
- /*GetBFI=*/None, PSI, RemarksEnabled ? &ORE : nullptr);
+ return llvm::getInlineCost(CB, Params, TTI, GetAssumptionCache, GetTLI,
+ /*GetBFI=*/nullptr, PSI,
+ RemarksEnabled ? &ORE : nullptr);
}
bool runOnSCC(CallGraphSCC &SCC) override;
diff --git a/llvm/lib/Transforms/IPO/Inliner.cpp b/llvm/lib/Transforms/IPO/Inliner.cpp
index 4b72261131c1..7d2260f4c169 100644
--- a/llvm/lib/Transforms/IPO/Inliner.cpp
+++ b/llvm/lib/Transforms/IPO/Inliner.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/ScopeExit.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -28,16 +29,16 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/CGSCCPassManager.h"
#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LazyCallGraph.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DerivedTypes.h"
@@ -57,8 +58,10 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/CallPromotionUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ImportedFunctionsInliningStatistics.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include <algorithm>
#include <cassert>
@@ -77,11 +80,6 @@ STATISTIC(NumCallsDeleted, "Number of call sites deleted, not inlined");
STATISTIC(NumDeleted, "Number of functions deleted because all callers found");
STATISTIC(NumMergedAllocas, "Number of allocas merged together");
-// This weirdly named statistic tracks the number of times that, when attempting
-// to inline a function A into B, we analyze the callers of B in order to see
-// if those would be more profitable and blocked inline steps.
-STATISTIC(NumCallerCallersAnalyzed, "Number of caller-callers analyzed");
-
/// Flag to disable manual alloca merging.
///
/// Merging of allocas was originally done as a stack-size saving technique
@@ -112,14 +110,6 @@ static cl::opt<InlinerFunctionImportStatsOpts> InlinerFunctionImportStats(
"printing of statistics for each inlined function")),
cl::Hidden, cl::desc("Enable inliner stats for imported functions"));
-/// Flag to add inline messages as callsite attributes 'inline-remark'.
-static cl::opt<bool>
- InlineRemarkAttribute("inline-remark-attribute", cl::init(false),
- cl::Hidden,
- cl::desc("Enable adding inline-remark attribute to"
- " callsites processed by inliner but decided"
- " to be not inlined"));
-
LegacyInlinerBase::LegacyInlinerBase(char &ID) : CallGraphSCCPass(ID) {}
LegacyInlinerBase::LegacyInlinerBase(char &ID, bool InsertLifetime)
@@ -158,13 +148,13 @@ using InlinedArrayAllocasTy = DenseMap<ArrayType *, std::vector<AllocaInst *>>;
/// *actually make it to the backend*, which is really what we want.
///
/// Because we don't have this information, we do this simple and useful hack.
-static void mergeInlinedArrayAllocas(
- Function *Caller, InlineFunctionInfo &IFI,
- InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory) {
+static void mergeInlinedArrayAllocas(Function *Caller, InlineFunctionInfo &IFI,
+ InlinedArrayAllocasTy &InlinedArrayAllocas,
+ int InlineHistory) {
SmallPtrSet<AllocaInst *, 16> UsedAllocas;
- // When processing our SCC, check to see if CS was inlined from some other
- // call site. For example, if we're processing "A" in this code:
+ // When processing our SCC, check to see if the call site was inlined from
+ // some other call site. For example, if we're processing "A" in this code:
// A() { B() }
// B() { x = alloca ... C() }
// C() { y = alloca ... }
@@ -180,7 +170,7 @@ static void mergeInlinedArrayAllocas(
// Loop over all the allocas we have so far and see if they can be merged with
// a previously inlined alloca. If not, remember that we had it.
- for (unsigned AllocaNo = 0, e = IFI.StaticAllocas.size(); AllocaNo != e;
+ for (unsigned AllocaNo = 0, E = IFI.StaticAllocas.size(); AllocaNo != E;
++AllocaNo) {
AllocaInst *AI = IFI.StaticAllocas[AllocaNo];
@@ -201,8 +191,8 @@ static void mergeInlinedArrayAllocas(
// function. Also, AllocasForType can be empty of course!
bool MergedAwayAlloca = false;
for (AllocaInst *AvailableAlloca : AllocasForType) {
- unsigned Align1 = AI->getAlignment(),
- Align2 = AvailableAlloca->getAlignment();
+ Align Align1 = AI->getAlign();
+ Align Align2 = AvailableAlloca->getAlign();
// The available alloca has to be in the right function, not in some other
// function in this SCC.
@@ -229,18 +219,8 @@ static void mergeInlinedArrayAllocas(
AI->replaceAllUsesWith(AvailableAlloca);
- if (Align1 != Align2) {
- if (!Align1 || !Align2) {
- const DataLayout &DL = Caller->getParent()->getDataLayout();
- unsigned TypeAlign = DL.getABITypeAlignment(AI->getAllocatedType());
-
- Align1 = Align1 ? Align1 : TypeAlign;
- Align2 = Align2 ? Align2 : TypeAlign;
- }
-
- if (Align1 > Align2)
- AvailableAlloca->setAlignment(MaybeAlign(AI->getAlignment()));
- }
+ if (Align1 > Align2)
+ AvailableAlloca->setAlignment(AI->getAlign());
AI->eraseFromParent();
MergedAwayAlloca = true;
@@ -271,20 +251,20 @@ static void mergeInlinedArrayAllocas(
/// available from other functions inlined into the caller. If we are able to
/// inline this call site we attempt to reuse already available allocas or add
/// any new allocas to the set if not possible.
-static InlineResult InlineCallIfPossible(
- CallSite CS, InlineFunctionInfo &IFI,
+static InlineResult inlineCallIfPossible(
+ CallBase &CB, InlineFunctionInfo &IFI,
InlinedArrayAllocasTy &InlinedArrayAllocas, int InlineHistory,
bool InsertLifetime, function_ref<AAResults &(Function &)> &AARGetter,
ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
- Function *Callee = CS.getCalledFunction();
- Function *Caller = CS.getCaller();
+ Function *Callee = CB.getCalledFunction();
+ Function *Caller = CB.getCaller();
AAResults &AAR = AARGetter(*Callee);
// Try to inline the function. Get the list of static allocas that were
// inlined.
- InlineResult IR = InlineFunction(CS, IFI, &AAR, InsertLifetime);
- if (!IR)
+ InlineResult IR = InlineFunction(CB, IFI, &AAR, InsertLifetime);
+ if (!IR.isSuccess())
return IR;
if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
@@ -298,188 +278,9 @@ static InlineResult InlineCallIfPossible(
return IR; // success
}
-/// Return true if inlining of CS can block the caller from being
-/// inlined which is proved to be more beneficial. \p IC is the
-/// estimated inline cost associated with callsite \p CS.
-/// \p TotalSecondaryCost will be set to the estimated cost of inlining the
-/// caller if \p CS is suppressed for inlining.
-static bool
-shouldBeDeferred(Function *Caller, CallSite CS, InlineCost IC,
- int &TotalSecondaryCost,
- function_ref<InlineCost(CallSite CS)> GetInlineCost) {
- // For now we only handle local or inline functions.
- if (!Caller->hasLocalLinkage() && !Caller->hasLinkOnceODRLinkage())
- return false;
- // If the cost of inlining CS is non-positive, it is not going to prevent the
- // caller from being inlined into its callers and hence we don't need to
- // defer.
- if (IC.getCost() <= 0)
- return false;
- // Try to detect the case where the current inlining candidate caller (call
- // it B) is a static or linkonce-ODR function and is an inlining candidate
- // elsewhere, and the current candidate callee (call it C) is large enough
- // that inlining it into B would make B too big to inline later. In these
- // circumstances it may be best not to inline C into B, but to inline B into
- // its callers.
- //
- // This only applies to static and linkonce-ODR functions because those are
- // expected to be available for inlining in the translation units where they
- // are used. Thus we will always have the opportunity to make local inlining
- // decisions. Importantly the linkonce-ODR linkage covers inline functions
- // and templates in C++.
- //
- // FIXME: All of this logic should be sunk into getInlineCost. It relies on
- // the internal implementation of the inline cost metrics rather than
- // treating them as truly abstract units etc.
- TotalSecondaryCost = 0;
- // The candidate cost to be imposed upon the current function.
- int CandidateCost = IC.getCost() - 1;
- // If the caller has local linkage and can be inlined to all its callers, we
- // can apply a huge negative bonus to TotalSecondaryCost.
- bool ApplyLastCallBonus = Caller->hasLocalLinkage() && !Caller->hasOneUse();
- // This bool tracks what happens if we DO inline C into B.
- bool inliningPreventsSomeOuterInline = false;
- for (User *U : Caller->users()) {
- // If the caller will not be removed (either because it does not have a
- // local linkage or because the LastCallToStaticBonus has been already
- // applied), then we can exit the loop early.
- if (!ApplyLastCallBonus && TotalSecondaryCost >= IC.getCost())
- return false;
- CallSite CS2(U);
-
- // If this isn't a call to Caller (it could be some other sort
- // of reference) skip it. Such references will prevent the caller
- // from being removed.
- if (!CS2 || CS2.getCalledFunction() != Caller) {
- ApplyLastCallBonus = false;
- continue;
- }
-
- InlineCost IC2 = GetInlineCost(CS2);
- ++NumCallerCallersAnalyzed;
- if (!IC2) {
- ApplyLastCallBonus = false;
- continue;
- }
- if (IC2.isAlways())
- continue;
-
- // See if inlining of the original callsite would erase the cost delta of
- // this callsite. We subtract off the penalty for the call instruction,
- // which we would be deleting.
- if (IC2.getCostDelta() <= CandidateCost) {
- inliningPreventsSomeOuterInline = true;
- TotalSecondaryCost += IC2.getCost();
- }
- }
- // If all outer calls to Caller would get inlined, the cost for the last
- // one is set very low by getInlineCost, in anticipation that Caller will
- // be removed entirely. We did not account for this above unless there
- // is only one caller of Caller.
- if (ApplyLastCallBonus)
- TotalSecondaryCost -= InlineConstants::LastCallToStaticBonus;
-
- if (inliningPreventsSomeOuterInline && TotalSecondaryCost < IC.getCost())
- return true;
-
- return false;
-}
-
-static std::basic_ostream<char> &operator<<(std::basic_ostream<char> &R,
- const ore::NV &Arg) {
- return R << Arg.Val;
-}
-
-template <class RemarkT>
-RemarkT &operator<<(RemarkT &&R, const InlineCost &IC) {
- using namespace ore;
- if (IC.isAlways()) {
- R << "(cost=always)";
- } else if (IC.isNever()) {
- R << "(cost=never)";
- } else {
- R << "(cost=" << ore::NV("Cost", IC.getCost())
- << ", threshold=" << ore::NV("Threshold", IC.getThreshold()) << ")";
- }
- if (const char *Reason = IC.getReason())
- R << ": " << ore::NV("Reason", Reason);
- return R;
-}
-
-static std::string inlineCostStr(const InlineCost &IC) {
- std::stringstream Remark;
- Remark << IC;
- return Remark.str();
-}
-
-/// Return the cost only if the inliner should attempt to inline at the given
-/// CallSite. If we return the cost, we will emit an optimisation remark later
-/// using that cost, so we won't do so from this function.
-static Optional<InlineCost>
-shouldInline(CallSite CS, function_ref<InlineCost(CallSite CS)> GetInlineCost,
- OptimizationRemarkEmitter &ORE) {
- using namespace ore;
-
- InlineCost IC = GetInlineCost(CS);
- Instruction *Call = CS.getInstruction();
- Function *Callee = CS.getCalledFunction();
- Function *Caller = CS.getCaller();
-
- if (IC.isAlways()) {
- LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC)
- << ", Call: " << *CS.getInstruction() << "\n");
- return IC;
- }
-
- if (IC.isNever()) {
- LLVM_DEBUG(dbgs() << " NOT Inlining " << inlineCostStr(IC)
- << ", Call: " << *CS.getInstruction() << "\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
- << NV("Callee", Callee) << " not inlined into "
- << NV("Caller", Caller) << " because it should never be inlined "
- << IC;
- });
- return IC;
- }
-
- if (!IC) {
- LLVM_DEBUG(dbgs() << " NOT Inlining " << inlineCostStr(IC)
- << ", Call: " << *CS.getInstruction() << "\n");
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "TooCostly", Call)
- << NV("Callee", Callee) << " not inlined into "
- << NV("Caller", Caller) << " because too costly to inline " << IC;
- });
- return IC;
- }
-
- int TotalSecondaryCost = 0;
- if (shouldBeDeferred(Caller, CS, IC, TotalSecondaryCost, GetInlineCost)) {
- LLVM_DEBUG(dbgs() << " NOT Inlining: " << *CS.getInstruction()
- << " Cost = " << IC.getCost()
- << ", outer Cost = " << TotalSecondaryCost << '\n');
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "IncreaseCostInOtherContexts",
- Call)
- << "Not inlining. Cost of inlining " << NV("Callee", Callee)
- << " increases the cost of inlining " << NV("Caller", Caller)
- << " in other contexts";
- });
-
- // IC does not bool() to false, so get an InlineCost that will.
- // This will not be inspected to make an error message.
- return None;
- }
-
- LLVM_DEBUG(dbgs() << " Inlining " << inlineCostStr(IC)
- << ", Call: " << *CS.getInstruction() << '\n');
- return IC;
-}
-
/// Return true if the specified inline history ID
/// indicates an inline history that includes the specified function.
-static bool InlineHistoryIncludes(
+static bool inlineHistoryIncludes(
Function *F, int InlineHistoryID,
const SmallVectorImpl<std::pair<Function *, int>> &InlineHistory) {
while (InlineHistoryID != -1) {
@@ -504,33 +305,13 @@ bool LegacyInlinerBase::runOnSCC(CallGraphSCC &SCC) {
return inlineCalls(SCC);
}
-static void emit_inlined_into(OptimizationRemarkEmitter &ORE, DebugLoc &DLoc,
- const BasicBlock *Block, const Function &Callee,
- const Function &Caller, const InlineCost &IC) {
- ORE.emit([&]() {
- bool AlwaysInline = IC.isAlways();
- StringRef RemarkName = AlwaysInline ? "AlwaysInline" : "Inlined";
- return OptimizationRemark(DEBUG_TYPE, RemarkName, DLoc, Block)
- << ore::NV("Callee", &Callee) << " inlined into "
- << ore::NV("Caller", &Caller) << " with " << IC;
- });
-}
-
-static void setInlineRemark(CallSite &CS, StringRef message) {
- if (!InlineRemarkAttribute)
- return;
-
- Attribute attr = Attribute::get(CS->getContext(), "inline-remark", message);
- CS.addAttribute(AttributeList::FunctionIndex, attr);
-}
-
static bool
inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
std::function<AssumptionCache &(Function &)> GetAssumptionCache,
ProfileSummaryInfo *PSI,
- std::function<TargetLibraryInfo &(Function &)> GetTLI,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
bool InsertLifetime,
- function_ref<InlineCost(CallSite CS)> GetInlineCost,
+ function_ref<InlineCost(CallBase &CB)> GetInlineCost,
function_ref<AAResults &(Function &)> AARGetter,
ImportedFunctionsInliningStatistics &ImportedFunctionsStats) {
SmallPtrSet<Function *, 8> SCCFunctions;
@@ -545,7 +326,7 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// Scan through and identify all call sites ahead of time so that we only
// inline call sites in the original functions, not call sites that result
// from inlining other functions.
- SmallVector<std::pair<CallSite, int>, 16> CallSites;
+ SmallVector<std::pair<CallBase *, int>, 16> CallSites;
// When inlining a callee produces new call sites, we want to keep track of
// the fact that they were inlined from the callee. This allows us to avoid
@@ -561,31 +342,31 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
OptimizationRemarkEmitter ORE(F);
for (BasicBlock &BB : *F)
for (Instruction &I : BB) {
- CallSite CS(cast<Value>(&I));
+ auto *CB = dyn_cast<CallBase>(&I);
// If this isn't a call, or it is a call to an intrinsic, it can
// never be inlined.
- if (!CS || isa<IntrinsicInst>(I))
+ if (!CB || isa<IntrinsicInst>(I))
continue;
// If this is a direct call to an external function, we can never inline
// it. If it is an indirect call, inlining may resolve it to be a
// direct call, so we keep it.
- if (Function *Callee = CS.getCalledFunction())
+ if (Function *Callee = CB->getCalledFunction())
if (Callee->isDeclaration()) {
using namespace ore;
- setInlineRemark(CS, "unavailable definition");
+ setInlineRemark(*CB, "unavailable definition");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
<< NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", CS.getCaller())
+ << NV("Caller", CB->getCaller())
<< " because its definition is unavailable"
<< setIsVerbose();
});
continue;
}
- CallSites.push_back(std::make_pair(CS, -1));
+ CallSites.push_back(std::make_pair(CB, -1));
}
}
@@ -598,13 +379,13 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// Now that we have all of the call sites, move the ones to functions in the
// current SCC to the end of the list.
unsigned FirstCallInSCC = CallSites.size();
- for (unsigned i = 0; i < FirstCallInSCC; ++i)
- if (Function *F = CallSites[i].first.getCalledFunction())
+ for (unsigned I = 0; I < FirstCallInSCC; ++I)
+ if (Function *F = CallSites[I].first->getCalledFunction())
if (SCCFunctions.count(F))
- std::swap(CallSites[i--], CallSites[--FirstCallInSCC]);
+ std::swap(CallSites[I--], CallSites[--FirstCallInSCC]);
InlinedArrayAllocasTy InlinedArrayAllocas;
- InlineFunctionInfo InlineInfo(&CG, &GetAssumptionCache, PSI);
+ InlineFunctionInfo InlineInfo(&CG, GetAssumptionCache, PSI);
// Now that we have all of the call sites, loop over them and inline them if
// it looks profitable to do so.
@@ -616,31 +397,28 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// calls to become direct calls.
// CallSites may be modified inside so ranged for loop can not be used.
for (unsigned CSi = 0; CSi != CallSites.size(); ++CSi) {
- CallSite CS = CallSites[CSi].first;
+ auto &P = CallSites[CSi];
+ CallBase &CB = *P.first;
+ const int InlineHistoryID = P.second;
- Function *Caller = CS.getCaller();
- Function *Callee = CS.getCalledFunction();
+ Function *Caller = CB.getCaller();
+ Function *Callee = CB.getCalledFunction();
// We can only inline direct calls to non-declarations.
if (!Callee || Callee->isDeclaration())
continue;
- Instruction *Instr = CS.getInstruction();
-
- bool IsTriviallyDead =
- isInstructionTriviallyDead(Instr, &GetTLI(*Caller));
+ bool IsTriviallyDead = isInstructionTriviallyDead(&CB, &GetTLI(*Caller));
- int InlineHistoryID;
if (!IsTriviallyDead) {
// If this call site was obtained by inlining another function, verify
// that the include path for the function did not include the callee
// itself. If so, we'd be recursively inlining the same function,
// which would provide the same callsites, which would cause us to
// infinitely inline.
- InlineHistoryID = CallSites[CSi].second;
if (InlineHistoryID != -1 &&
- InlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) {
- setInlineRemark(CS, "recursive");
+ inlineHistoryIncludes(Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(CB, "recursive");
continue;
}
}
@@ -650,56 +428,49 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
// just become a regular analysis dependency.
OptimizationRemarkEmitter ORE(Caller);
- Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE);
+ auto OIC = shouldInline(CB, GetInlineCost, ORE);
// If the policy determines that we should inline this function,
// delete the call instead.
- if (!OIC.hasValue()) {
- setInlineRemark(CS, "deferred");
- continue;
- }
-
- if (!OIC.getValue()) {
- // shouldInline() call returned a negative inline cost that explains
- // why this callsite should not be inlined.
- setInlineRemark(CS, inlineCostStr(*OIC));
+ if (!OIC)
continue;
- }
// If this call site is dead and it is to a readonly function, we should
// just delete the call instead of trying to inline it, regardless of
// size. This happens because IPSCCP propagates the result out of the
// call and then we're left with the dead call.
if (IsTriviallyDead) {
- LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << *Instr << "\n");
+ LLVM_DEBUG(dbgs() << " -> Deleting dead call: " << CB << "\n");
// Update the call graph by deleting the edge from Callee to Caller.
- setInlineRemark(CS, "trivially dead");
- CG[Caller]->removeCallEdgeFor(*cast<CallBase>(CS.getInstruction()));
- Instr->eraseFromParent();
+ setInlineRemark(CB, "trivially dead");
+ CG[Caller]->removeCallEdgeFor(CB);
+ CB.eraseFromParent();
++NumCallsDeleted;
} else {
- // Get DebugLoc to report. CS will be invalid after Inliner.
- DebugLoc DLoc = CS->getDebugLoc();
- BasicBlock *Block = CS.getParent();
+ // Get DebugLoc to report. CB will be invalid after Inliner.
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *Block = CB.getParent();
// Attempt to inline the function.
using namespace ore;
- InlineResult IR = InlineCallIfPossible(
- CS, InlineInfo, InlinedArrayAllocas, InlineHistoryID,
+ InlineResult IR = inlineCallIfPossible(
+ CB, InlineInfo, InlinedArrayAllocas, InlineHistoryID,
InsertLifetime, AARGetter, ImportedFunctionsStats);
- if (!IR) {
- setInlineRemark(CS, std::string(IR) + "; " + inlineCostStr(*OIC));
+ if (!IR.isSuccess()) {
+ setInlineRemark(CB, std::string(IR.getFailureReason()) + "; " +
+ inlineCostStr(*OIC));
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc,
Block)
<< NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", Caller) << ": " << NV("Reason", IR.message);
+ << NV("Caller", Caller) << ": "
+ << NV("Reason", IR.getFailureReason());
});
continue;
}
++NumInlined;
- emit_inlined_into(ORE, DLoc, Block, *Callee, *Caller, *OIC);
+ emitInlinedInto(ORE, DLoc, Block, *Callee, *Caller, *OIC);
// If inlining this function gave us any new call sites, throw them
// onto our worklist to process. They are useful inline candidates.
@@ -709,8 +480,23 @@ inlineCallsImpl(CallGraphSCC &SCC, CallGraph &CG,
int NewHistoryID = InlineHistory.size();
InlineHistory.push_back(std::make_pair(Callee, InlineHistoryID));
- for (Value *Ptr : InlineInfo.InlinedCalls)
- CallSites.push_back(std::make_pair(CallSite(Ptr), NewHistoryID));
+#ifndef NDEBUG
+ // Make sure there are no duplicates in the inline candidates. This
+ // could happen when a callsite is simplified to reuse the return
+ // value of another callsite during function cloning; the other
+ // callsite would then be reconsidered here.
+ DenseSet<CallBase *> DbgCallSites;
+ for (auto &II : CallSites)
+ DbgCallSites.insert(II.first);
+#endif
+
+ for (Value *Ptr : InlineInfo.InlinedCalls) {
+#ifndef NDEBUG
+ assert(DbgCallSites.count(dyn_cast<CallBase>(Ptr)) == 0);
+#endif
+ CallSites.push_back(
+ std::make_pair(dyn_cast<CallBase>(Ptr), NewHistoryID));
+ }
}
}
@@ -759,7 +545,7 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
ACT = &getAnalysis<AssumptionCacheTracker>();
PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- auto GetTLI = [&](Function &F) -> TargetLibraryInfo & {
+ GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
return getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
};
auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
@@ -767,7 +553,7 @@ bool LegacyInlinerBase::inlineCalls(CallGraphSCC &SCC) {
};
return inlineCallsImpl(
SCC, CG, GetAssumptionCache, PSI, GetTLI, InsertLifetime,
- [this](CallSite CS) { return getInlineCost(CS); }, LegacyAARGetter(*this),
+ [&](CallBase &CB) { return getInlineCost(CB); }, LegacyAARGetter(*this),
ImportedFunctionsStats);
}
@@ -870,16 +656,47 @@ InlinerPass::~InlinerPass() {
}
}
+InlineAdvisor &
+InlinerPass::getAdvisor(const ModuleAnalysisManagerCGSCCProxy::Result &MAM,
+ FunctionAnalysisManager &FAM, Module &M) {
+ auto *IAA = MAM.getCachedResult<InlineAdvisorAnalysis>(M);
+ if (!IAA) {
+ // It should still be possible to run the inliner as a stand-alone SCC pass,
+ // e.g. in test scenarios. In that case, we fall back to the
+ // DefaultInlineAdvisor, which does not need to keep state between SCC pass
+ // runs and uses just the default InlineParams. We must also use the provided
+ // FAM, which stays valid for the duration of the inliner pass and thus for
+ // the lifetime of the owned advisor; the FAM we would obtain through the MAM
+ // can be invalidated as a result of the inliner's activity.
+ OwnedDefaultAdvisor.emplace(FAM, getInlineParams());
+ return *OwnedDefaultAdvisor;
+ }
+ assert(IAA->getAdvisor() &&
+ "Expected a present InlineAdvisorAnalysis also have an "
+ "InlineAdvisor initialized");
+ return *IAA->getAdvisor();
+}
+
PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
CGSCCAnalysisManager &AM, LazyCallGraph &CG,
CGSCCUpdateResult &UR) {
- const ModuleAnalysisManager &MAM =
- AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG).getManager();
+ const auto &MAMProxy =
+ AM.getResult<ModuleAnalysisManagerCGSCCProxy>(InitialC, CG);
bool Changed = false;
assert(InitialC.size() > 0 && "Cannot handle an empty SCC!");
Module &M = *InitialC.begin()->getFunction().getParent();
- ProfileSummaryInfo *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(M);
+ ProfileSummaryInfo *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(M);
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
+ .getManager();
+
+ InlineAdvisor &Advisor = getAdvisor(MAMProxy, FAM, M);
+ Advisor.onPassEntry();
+
+ auto AdvisorOnExit = make_scope_exit([&] { Advisor.onPassExit(); });
if (!ImportedFunctionsStats &&
InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No) {
@@ -912,11 +729,7 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// this model, but it is uniformly spread across all the functions in the SCC
// and eventually they all become too large to inline, rather than
// incrementally making a single function grow in a super linear fashion.
- SmallVector<std::pair<CallSite, int>, 16> Calls;
-
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(InitialC, CG)
- .getManager();
+ SmallVector<std::pair<CallBase *, int>, 16> Calls;
// Populate the initial list of calls in this SCC.
for (auto &N : InitialC) {
@@ -928,17 +741,17 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// FIXME: Using instructions sequence is a really bad way to do this.
// Instead we should do an actual RPO walk of the function body.
for (Instruction &I : instructions(N.getFunction()))
- if (auto CS = CallSite(&I))
- if (Function *Callee = CS.getCalledFunction()) {
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction()) {
if (!Callee->isDeclaration())
- Calls.push_back({CS, -1});
+ Calls.push_back({CB, -1});
else if (!isa<IntrinsicInst>(I)) {
using namespace ore;
- setInlineRemark(CS, "unavailable definition");
+ setInlineRemark(*CB, "unavailable definition");
ORE.emit([&]() {
return OptimizationRemarkMissed(DEBUG_TYPE, "NoDefinition", &I)
<< NV("Callee", Callee) << " will not be inlined into "
- << NV("Caller", CS.getCaller())
+ << NV("Caller", CB->getCaller())
<< " because its definition is unavailable"
<< setIsVerbose();
});
@@ -969,68 +782,41 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// Loop forward over all of the calls. Note that we cannot cache the size as
// inlining can introduce new calls that need to be processed.
- for (int i = 0; i < (int)Calls.size(); ++i) {
+ for (int I = 0; I < (int)Calls.size(); ++I) {
// We expect the calls to typically be batched with sequences of calls that
// have the same caller, so we first set up some shared infrastructure for
// this caller. We also do any pruning we can at this layer on the caller
// alone.
- Function &F = *Calls[i].first.getCaller();
+ Function &F = *Calls[I].first->getCaller();
LazyCallGraph::Node &N = *CG.lookup(F);
if (CG.lookupSCC(N) != C)
continue;
- if (F.hasOptNone()) {
- setInlineRemark(Calls[i].first, "optnone attribute");
+ if (!Calls[I].first->getCalledFunction()->hasFnAttribute(
+ Attribute::AlwaysInline) &&
+ F.hasOptNone()) {
+ setInlineRemark(*Calls[I].first, "optnone attribute");
continue;
}
LLVM_DEBUG(dbgs() << "Inlining calls in: " << F.getName() << "\n");
- // Get a FunctionAnalysisManager via a proxy for this particular node. We
- // do this each time we visit a node as the SCC may have changed and as
- // we're going to mutate this particular function we want to make sure the
- // proxy is in place to forward any invalidation events. We can use the
- // manager we get here for looking up results for functions other than this
- // node however because those functions aren't going to be mutated by this
- // pass.
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(*C, CG)
- .getManager();
-
- // Get the remarks emission analysis for the caller.
- auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
-
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&](Function &F) -> AssumptionCache & {
+ auto GetAssumptionCache = [&](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};
- auto GetBFI = [&](Function &F) -> BlockFrequencyInfo & {
- return FAM.getResult<BlockFrequencyAnalysis>(F);
- };
-
- auto GetInlineCost = [&](CallSite CS) {
- Function &Callee = *CS.getCalledFunction();
- auto &CalleeTTI = FAM.getResult<TargetIRAnalysis>(Callee);
- bool RemarksEnabled =
- Callee.getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
- DEBUG_TYPE);
- return getInlineCost(cast<CallBase>(*CS.getInstruction()), Params,
- CalleeTTI, GetAssumptionCache, {GetBFI}, PSI,
- RemarksEnabled ? &ORE : nullptr);
- };
- // Now process as many calls as we have within this caller in the sequnece.
+ // Now process as many calls as we have within this caller in the sequence.
// We bail out as soon as the caller has to change so we can update the
// call graph and prepare the context of that new caller.
bool DidInline = false;
- for (; i < (int)Calls.size() && Calls[i].first.getCaller() == &F; ++i) {
- int InlineHistoryID;
- CallSite CS;
- std::tie(CS, InlineHistoryID) = Calls[i];
- Function &Callee = *CS.getCalledFunction();
+ for (; I < (int)Calls.size() && Calls[I].first->getCaller() == &F; ++I) {
+ auto &P = Calls[I];
+ CallBase *CB = P.first;
+ const int InlineHistoryID = P.second;
+ Function &Callee = *CB->getCalledFunction();
if (InlineHistoryID != -1 &&
- InlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
- setInlineRemark(CS, "recursive");
+ inlineHistoryIncludes(&Callee, InlineHistoryID, InlineHistory)) {
+ setInlineRemark(*CB, "recursive");
continue;
}
@@ -1044,62 +830,53 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
LLVM_DEBUG(dbgs() << "Skipping inlining internal SCC edge from a node "
"previously split out of this SCC by inlining: "
<< F.getName() << " -> " << Callee.getName() << "\n");
- setInlineRemark(CS, "recursive SCC split");
+ setInlineRemark(*CB, "recursive SCC split");
continue;
}
- Optional<InlineCost> OIC = shouldInline(CS, GetInlineCost, ORE);
+ auto Advice = Advisor.getAdvice(*CB);
// Check whether we want to inline this callsite.
- if (!OIC.hasValue()) {
- setInlineRemark(CS, "deferred");
- continue;
- }
-
- if (!OIC.getValue()) {
- // shouldInline() call returned a negative inline cost that explains
- // why this callsite should not be inlined.
- setInlineRemark(CS, inlineCostStr(*OIC));
+ if (!Advice->isInliningRecommended()) {
+ Advice->recordUnattemptedInlining();
continue;
}
// Setup the data structure used to plumb customization into the
// `InlineFunction` routine.
InlineFunctionInfo IFI(
- /*cg=*/nullptr, &GetAssumptionCache, PSI,
- &FAM.getResult<BlockFrequencyAnalysis>(*(CS.getCaller())),
+ /*cg=*/nullptr, GetAssumptionCache, PSI,
+ &FAM.getResult<BlockFrequencyAnalysis>(*(CB->getCaller())),
&FAM.getResult<BlockFrequencyAnalysis>(Callee));
- // Get DebugLoc to report. CS will be invalid after Inliner.
- DebugLoc DLoc = CS->getDebugLoc();
- BasicBlock *Block = CS.getParent();
-
- using namespace ore;
-
- InlineResult IR = InlineFunction(CS, IFI);
- if (!IR) {
- setInlineRemark(CS, std::string(IR) + "; " + inlineCostStr(*OIC));
- ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NotInlined", DLoc, Block)
- << NV("Callee", &Callee) << " will not be inlined into "
- << NV("Caller", &F) << ": " << NV("Reason", IR.message);
- });
+ InlineResult IR = InlineFunction(*CB, IFI);
+ if (!IR.isSuccess()) {
+ Advice->recordUnsuccessfulInlining(IR);
continue;
}
+
DidInline = true;
InlinedCallees.insert(&Callee);
-
++NumInlined;
- emit_inlined_into(ORE, DLoc, Block, Callee, F, *OIC);
-
// Add any new callsites to defined functions to the worklist.
if (!IFI.InlinedCallSites.empty()) {
int NewHistoryID = InlineHistory.size();
InlineHistory.push_back({&Callee, InlineHistoryID});
- for (CallSite &CS : reverse(IFI.InlinedCallSites))
- if (Function *NewCallee = CS.getCalledFunction())
+
+ for (CallBase *ICB : reverse(IFI.InlinedCallSites)) {
+ Function *NewCallee = ICB->getCalledFunction();
+ if (!NewCallee) {
+ // Try to promote an indirect (virtual) call without waiting for
+ // the post-inline cleanup and the next DevirtSCCRepeatedPass
+ // iteration because the next iteration may not happen and we may
+ // miss inlining it.
+ if (tryPromoteCall(*ICB))
+ NewCallee = ICB->getCalledFunction();
+ }
+ if (NewCallee)
if (!NewCallee->isDeclaration())
- Calls.push_back({CS, NewHistoryID});
+ Calls.push_back({ICB, NewHistoryID});
+ }
}
if (InlinerFunctionImportStats != InlinerFunctionImportStatsOpts::No)
@@ -1112,15 +889,16 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// dead. In that case, we can drop the body of the function eagerly
// which may reduce the number of callers of other functions to one,
// changing inline cost thresholds.
+ bool CalleeWasDeleted = false;
if (Callee.hasLocalLinkage()) {
// To check this we also need to nuke any dead constant uses (perhaps
// made dead by this operation on other functions).
Callee.removeDeadConstantUsers();
if (Callee.use_empty() && !CG.isLibFunction(Callee)) {
Calls.erase(
- std::remove_if(Calls.begin() + i + 1, Calls.end(),
- [&Callee](const std::pair<CallSite, int> &Call) {
- return Call.first.getCaller() == &Callee;
+ std::remove_if(Calls.begin() + I + 1, Calls.end(),
+ [&](const std::pair<CallBase *, int> &Call) {
+ return Call.first->getCaller() == &Callee;
}),
Calls.end());
// Clear the body and queue the function itself for deletion when we
@@ -1131,13 +909,18 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
assert(find(DeadFunctions, &Callee) == DeadFunctions.end() &&
"Cannot put cause a function to become dead twice!");
DeadFunctions.push_back(&Callee);
+ CalleeWasDeleted = true;
}
}
+ if (CalleeWasDeleted)
+ Advice->recordInliningWithCalleeDeleted();
+ else
+ Advice->recordInlining();
}
// Back the call index up by one to put us in a good position to go around
// the outer loop.
- --i;
+ --I;
if (!DidInline)
continue;
@@ -1163,8 +946,13 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// essentially do all of the same things as a function pass and we can
// re-use the exact same logic for updating the call graph to reflect the
// change.
+
+ // Inside the update, we also update the FunctionAnalysisManager in the
+ // proxy for this particular SCC. We do this as the SCC may have changed and
+ // as we're going to mutate this particular function we want to make sure
+ // the proxy is in place to forward any invalidation events.
LazyCallGraph::SCC *OldC = C;
- C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR);
+ C = &updateCGAndAnalysisManagerForFunctionPass(CG, *C, N, AM, UR, FAM);
LLVM_DEBUG(dbgs() << "Updated inlining SCC: " << *C << "\n");
RC = &C->getOuterRefSCC();
@@ -1208,11 +996,8 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
// sets.
for (Function *DeadF : DeadFunctions) {
// Get the necessary information out of the call graph and nuke the
- // function there. Also, cclear out any cached analyses.
+ // function there. Also, clear out any cached analyses.
auto &DeadC = *CG.lookupSCC(*CG.lookup(*DeadF));
- FunctionAnalysisManager &FAM =
- AM.getResult<FunctionAnalysisManagerCGSCCProxy>(DeadC, CG)
- .getManager();
FAM.clear(*DeadF, DeadF->getName());
AM.clear(DeadC, DeadC.getName());
auto &DeadRC = DeadC.getOuterRefSCC();
@@ -1224,7 +1009,15 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
UR.InvalidatedRefSCCs.insert(&DeadRC);
// And delete the actual function from the module.
- M.getFunctionList().erase(DeadF);
+ // The Advisor may use Function pointers to efficiently index various
+ // internal maps, e.g. for memoization. Function cleanup passes like
+ // argument promotion create new functions, and a new function could end
+ // up allocated at the address of a deleted one. We could index by name
+ // instead, but that is inefficient, so we let the Advisor free the
+ // functions when it sees fit.
+ DeadF->getBasicBlockList().clear();
+ M.getFunctionList().remove(DeadF);
+
++NumDeleted;
}
@@ -1237,3 +1030,45 @@ PreservedAnalyses InlinerPass::run(LazyCallGraph::SCC &InitialC,
PA.preserve<FunctionAnalysisManagerCGSCCProxy>();
return PA;
}
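
For readers following the advisor-driven control flow spread across the hunks above, here is a condensed sketch of the per-callsite protocol as used by this patch. It is illustrative pseudocode rather than part of the change; CB, IFI, Advisor, and CalleeWasDeleted are the variables from InlinerPass::run() above.

    // Sketch only: mirrors the calls InlinerPass::run() makes per candidate.
    auto Advice = Advisor.getAdvice(*CB);           // ask the InlineAdvisor
    if (!Advice->isInliningRecommended()) {
      Advice->recordUnattemptedInlining();          // decided not to inline
    } else {
      InlineResult IR = InlineFunction(*CB, IFI);
      if (!IR.isSuccess())
        Advice->recordUnsuccessfulInlining(IR);     // tried, but it failed
      else if (CalleeWasDeleted)
        Advice->recordInliningWithCalleeDeleted();  // inlined, callee dropped
      else
        Advice->recordInlining();                   // plain successful inline
    }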
+
+ModuleInlinerWrapperPass::ModuleInlinerWrapperPass(InlineParams Params,
+ bool Debugging,
+ InliningAdvisorMode Mode,
+ unsigned MaxDevirtIterations)
+ : Params(Params), Mode(Mode), MaxDevirtIterations(MaxDevirtIterations),
+ PM(Debugging), MPM(Debugging) {
+ // Run the inliner first. The theory is that we are walking bottom-up and so
+ // the callees have already been fully optimized, and we want to inline them
+ // into the callers so that our optimizations can reflect that.
+ // For PreLinkThinLTO pass, we disable hot-caller heuristic for sample PGO
+ // because it makes profile annotation in the backend inaccurate.
+ PM.addPass(InlinerPass());
+}
+
+PreservedAnalyses ModuleInlinerWrapperPass::run(Module &M,
+ ModuleAnalysisManager &MAM) {
+ auto &IAA = MAM.getResult<InlineAdvisorAnalysis>(M);
+ if (!IAA.tryCreate(Params, Mode)) {
+ M.getContext().emitError(
+ "Could not setup Inlining Advisor for the requested "
+ "mode and/or options");
+ return PreservedAnalyses::all();
+ }
+
+ // We wrap the CGSCC pipeline in a devirtualization repeater. This will try
+ // to detect when we devirtualize indirect calls and iterate the SCC passes
+ // in that case to try and catch knock-on inlining or function attrs
+ // opportunities. Then we add it to the module pipeline by walking the SCCs
+ // in postorder (or bottom-up).
+ // If MaxDevirtIterations is 0, we just don't use the devirtualization
+ // wrapper.
+ if (MaxDevirtIterations == 0)
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(std::move(PM)));
+ else
+ MPM.addPass(createModuleToPostOrderCGSCCPassAdaptor(
+ createDevirtSCCRepeatedPass(std::move(PM), MaxDevirtIterations)));
+ auto Ret = MPM.run(M, MAM);
+
+ IAA.clear();
+ return Ret;
+}
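
A hedged usage sketch of the new wrapper pass (not part of the patch): the surrounding pipeline setup, the Module M, and the ModuleAnalysisManager MAM are assumed to exist, InliningAdvisorMode::Default is assumed to be the non-ML advisor mode, and the parameter values are illustrative only.

    // Build a module pipeline that runs the inliner bottom-up over SCCs,
    // allowing up to four devirtualization repeat iterations.
    ModulePassManager MPM;
    MPM.addPass(ModuleInlinerWrapperPass(getInlineParams(),
                                         /*Debugging=*/false,
                                         InliningAdvisorMode::Default,
                                         /*MaxDevirtIterations=*/4));
    MPM.run(M, MAM);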
diff --git a/llvm/lib/Transforms/IPO/LoopExtractor.cpp b/llvm/lib/Transforms/IPO/LoopExtractor.cpp
index f7108e8002ac..f7f5b4cf6704 100644
--- a/llvm/lib/Transforms/IPO/LoopExtractor.cpp
+++ b/llvm/lib/Transforms/IPO/LoopExtractor.cpp
@@ -15,7 +15,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/AssumptionCache.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -36,22 +36,30 @@ using namespace llvm;
STATISTIC(NumExtracted, "Number of loops extracted");
namespace {
- struct LoopExtractor : public LoopPass {
+ struct LoopExtractor : public ModulePass {
static char ID; // Pass identification, replacement for typeid
+
+ // The number of natural loops to extract from the program into functions.
unsigned NumLoops;
explicit LoopExtractor(unsigned numLoops = ~0)
- : LoopPass(ID), NumLoops(numLoops) {
- initializeLoopExtractorPass(*PassRegistry::getPassRegistry());
- }
+ : ModulePass(ID), NumLoops(numLoops) {
+ initializeLoopExtractorPass(*PassRegistry::getPassRegistry());
+ }
- bool runOnLoop(Loop *L, LPPassManager &) override;
+ bool runOnModule(Module &M) override;
+ bool runOnFunction(Function &F);
+
+ bool extractLoops(Loop::iterator From, Loop::iterator To, LoopInfo &LI,
+ DominatorTree &DT);
+ bool extractLoop(Loop *L, LoopInfo &LI, DominatorTree &DT);
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequiredID(BreakCriticalEdgesID);
- AU.addRequiredID(LoopSimplifyID);
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
AU.addUsedIfAvailable<AssumptionCacheTracker>();
}
};
@@ -61,8 +69,9 @@ char LoopExtractor::ID = 0;
INITIALIZE_PASS_BEGIN(LoopExtractor, "loop-extract",
"Extract loops into new functions", false, false)
INITIALIZE_PASS_DEPENDENCY(BreakCriticalEdges)
-INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_END(LoopExtractor, "loop-extract",
"Extract loops into new functions", false, false)
@@ -83,81 +92,130 @@ INITIALIZE_PASS(SingleLoopExtractor, "loop-extract-single",
//
Pass *llvm::createLoopExtractorPass() { return new LoopExtractor(); }
-bool LoopExtractor::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
+bool LoopExtractor::runOnModule(Module &M) {
+ if (skipModule(M))
+ return false;
+
+ if (M.empty())
+ return false;
+
+ if (!NumLoops)
return false;
- // Only visit top-level loops.
- if (L->getParentLoop())
+ bool Changed = false;
+
+ // The end of the function list may change (new functions will be added at the
+ // end), so we run from the first to the current last.
+ auto I = M.begin(), E = --M.end();
+ while (true) {
+ Function &F = *I;
+
+ Changed |= runOnFunction(F);
+ if (!NumLoops)
+ break;
+
+ // If this is the last function.
+ if (I == E)
+ break;
+
+ ++I;
+ }
+ return Changed;
+}
+
+bool LoopExtractor::runOnFunction(Function &F) {
+ // Do not modify `optnone` functions.
+ if (F.hasOptNone())
return false;
- // If LoopSimplify form is not available, stay out of trouble.
- if (!L->isLoopSimplifyForm())
+ if (F.empty())
return false;
- DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
bool Changed = false;
+ LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>(F, &Changed).getLoopInfo();
+
+ // If there are no loops in the function.
+ if (LI.empty())
+ return Changed;
+
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
// If there is more than one top-level loop in this function, extract all of
- // the loops. Otherwise there is exactly one top-level loop; in this case if
- // this function is more than a minimal wrapper around the loop, extract
- // the loop.
- bool ShouldExtractLoop = false;
-
- // Extract the loop if the entry block doesn't branch to the loop header.
- Instruction *EntryTI =
- L->getHeader()->getParent()->getEntryBlock().getTerminator();
- if (!isa<BranchInst>(EntryTI) ||
- !cast<BranchInst>(EntryTI)->isUnconditional() ||
- EntryTI->getSuccessor(0) != L->getHeader()) {
- ShouldExtractLoop = true;
- } else {
- // Check to see if any exits from the loop are more than just return
- // blocks.
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getExitBlocks(ExitBlocks);
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- if (!isa<ReturnInst>(ExitBlocks[i]->getTerminator())) {
- ShouldExtractLoop = true;
- break;
- }
+ // the loops.
+ if (std::next(LI.begin()) != LI.end())
+ return Changed | extractLoops(LI.begin(), LI.end(), LI, DT);
+
+ // Otherwise there is exactly one top-level loop.
+ Loop *TLL = *LI.begin();
+
+ // If the loop is in LoopSimplify form, then extract it only if this function
+ // is more than a minimal wrapper around the loop.
+ if (TLL->isLoopSimplifyForm()) {
+ bool ShouldExtractLoop = false;
+
+ // Extract the loop if the entry block doesn't branch to the loop header.
+ Instruction *EntryTI = F.getEntryBlock().getTerminator();
+ if (!isa<BranchInst>(EntryTI) ||
+ !cast<BranchInst>(EntryTI)->isUnconditional() ||
+ EntryTI->getSuccessor(0) != TLL->getHeader()) {
+ ShouldExtractLoop = true;
+ } else {
+ // Check to see if any exits from the loop are more than just return
+ // blocks.
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ TLL->getExitBlocks(ExitBlocks);
+ for (auto *ExitBlock : ExitBlocks)
+ if (!isa<ReturnInst>(ExitBlock->getTerminator())) {
+ ShouldExtractLoop = true;
+ break;
+ }
+ }
+
+ if (ShouldExtractLoop)
+ return Changed | extractLoop(TLL, LI, DT);
}
- if (ShouldExtractLoop) {
- // We must omit EH pads. EH pads must accompany the invoke
- // instruction. But this would result in a loop in the extracted
- // function. An infinite cycle occurs when it tries to extract that loop as
- // well.
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getExitBlocks(ExitBlocks);
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i)
- if (ExitBlocks[i]->isEHPad()) {
- ShouldExtractLoop = false;
- break;
- }
+ // Okay, this function is a minimal container around the specified loop.
+ // If we extract the loop, we will continue to just keep extracting it
+ // infinitely... so don't extract it. However, if the loop contains any
+ // sub-loops, extract them.
+ return Changed | extractLoops(TLL->begin(), TLL->end(), LI, DT);
+}
+
+bool LoopExtractor::extractLoops(Loop::iterator From, Loop::iterator To,
+ LoopInfo &LI, DominatorTree &DT) {
+ bool Changed = false;
+ SmallVector<Loop *, 8> Loops;
+
+ // Save the list of loops, as it may change.
+ Loops.assign(From, To);
+ for (Loop *L : Loops) {
+ // If LoopSimplify form is not available, stay out of trouble.
+ if (!L->isLoopSimplifyForm())
+ continue;
+
+ Changed |= extractLoop(L, LI, DT);
+ if (!NumLoops)
+ break;
}
+ return Changed;
+}
- if (ShouldExtractLoop) {
- if (NumLoops == 0) return Changed;
+bool LoopExtractor::extractLoop(Loop *L, LoopInfo &LI, DominatorTree &DT) {
+ assert(NumLoops != 0);
+ AssumptionCache *AC = nullptr;
+ Function &Func = *L->getHeader()->getParent();
+ if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
+ AC = ACT->lookupAssumptionCache(Func);
+ CodeExtractorAnalysisCache CEAC(Func);
+ CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC);
+ if (Extractor.extractCodeRegion(CEAC)) {
+ LI.erase(L);
--NumLoops;
- AssumptionCache *AC = nullptr;
- Function &Func = *L->getHeader()->getParent();
- if (auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>())
- AC = ACT->lookupAssumptionCache(Func);
- CodeExtractorAnalysisCache CEAC(Func);
- CodeExtractor Extractor(DT, *L, false, nullptr, nullptr, AC);
- if (Extractor.extractCodeRegion(CEAC) != nullptr) {
- Changed = true;
- // After extraction, the loop is replaced by a function call, so
- // we shouldn't try to run any more loop passes on it.
- LPM.markLoopAsDeleted(*L);
- LI.erase(L);
- }
++NumExtracted;
+ return true;
}
-
- return Changed;
+ return false;
}
// createSingleLoopExtractorPass - This pass extracts one natural loop from the
diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
index fa664966faf7..8eef7e3e7e99 100644
--- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
+++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp
@@ -382,6 +382,9 @@ class LowerTypeTestsModule {
ModuleSummaryIndex *ExportSummary;
const ModuleSummaryIndex *ImportSummary;
+ // Set when the client has invoked this to simply drop all type test assume
+ // sequences.
+ bool DropTypeTests;
Triple::ArchType Arch;
Triple::OSType OS;
@@ -500,7 +503,8 @@ class LowerTypeTestsModule {
public:
LowerTypeTestsModule(Module &M, ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary);
+ const ModuleSummaryIndex *ImportSummary,
+ bool DropTypeTests);
bool lower();
@@ -516,22 +520,24 @@ struct LowerTypeTests : public ModulePass {
ModuleSummaryIndex *ExportSummary;
const ModuleSummaryIndex *ImportSummary;
+ bool DropTypeTests;
LowerTypeTests() : ModulePass(ID), UseCommandLine(true) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
LowerTypeTests(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
+ const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
: ModulePass(ID), ExportSummary(ExportSummary),
- ImportSummary(ImportSummary) {
+ ImportSummary(ImportSummary), DropTypeTests(DropTypeTests) {
initializeLowerTypeTestsPass(*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
if (UseCommandLine)
return LowerTypeTestsModule::runForTesting(M);
- return LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
+ return LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
+ .lower();
}
};
@@ -544,8 +550,9 @@ INITIALIZE_PASS(LowerTypeTests, "lowertypetests", "Lower type metadata", false,
ModulePass *
llvm::createLowerTypeTestsPass(ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary) {
- return new LowerTypeTests(ExportSummary, ImportSummary);
+ const ModuleSummaryIndex *ImportSummary,
+ bool DropTypeTests) {
+ return new LowerTypeTests(ExportSummary, ImportSummary, DropTypeTests);
}
/// Build a bit set for TypeId using the object layouts in
@@ -728,6 +735,9 @@ static bool isKnownTypeIdMember(Metadata *TypeId, const DataLayout &DL,
/// replace the call with.
Value *LowerTypeTestsModule::lowerTypeTestCall(Metadata *TypeId, CallInst *CI,
const TypeIdLowering &TIL) {
+ // Delay lowering if the resolution is currently unknown.
+ if (TIL.TheKind == TypeTestResolution::Unknown)
+ return nullptr;
if (TIL.TheKind == TypeTestResolution::Unsat)
return ConstantInt::getFalse(M.getContext());
@@ -828,11 +838,10 @@ void LowerTypeTestsModule::buildBitSetsFromGlobalVariables(
uint64_t DesiredPadding = 0;
for (GlobalTypeMember *G : Globals) {
auto *GV = cast<GlobalVariable>(G->getGlobal());
- MaybeAlign Alignment(GV->getAlignment());
- if (!Alignment)
- Alignment = Align(DL.getABITypeAlignment(GV->getValueType()));
- MaxAlign = std::max(MaxAlign, *Alignment);
- uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, *Alignment);
+ Align Alignment =
+ DL.getValueOrABITypeAlignment(GV->getAlign(), GV->getValueType());
+ MaxAlign = std::max(MaxAlign, Alignment);
+ uint64_t GVOffset = alignTo(CurOffset + DesiredPadding, Alignment);
GlobalLayout[G] = GVOffset;
if (GVOffset != 0) {
uint64_t Padding = GVOffset - CurOffset;
@@ -1030,14 +1039,18 @@ void LowerTypeTestsModule::importTypeTest(CallInst *CI) {
report_fatal_error("Second argument of llvm.type.test must be metadata");
auto TypeIdStr = dyn_cast<MDString>(TypeIdMDVal->getMetadata());
+ // If this is a local unpromoted type, which doesn't have a metadata string,
+ // treat it as Unknown and delay lowering, so that we can still utilize it for
+ // later optimizations.
if (!TypeIdStr)
- report_fatal_error(
- "Second argument of llvm.type.test must be a metadata string");
+ return;
TypeIdLowering TIL = importTypeId(TypeIdStr->getString());
Value *Lowered = lowerTypeTestCall(TypeIdStr, CI, TIL);
- CI->replaceAllUsesWith(Lowered);
- CI->eraseFromParent();
+ if (Lowered) {
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
}
// ThinLTO backend: the function F has a jump table entry; update this module
@@ -1048,7 +1061,7 @@ void LowerTypeTestsModule::importFunction(
assert(F->getType()->getAddressSpace() == 0);
GlobalValue::VisibilityTypes Visibility = F->getVisibility();
- std::string Name = F->getName();
+ std::string Name = std::string(F->getName());
if (F->isDeclarationForLinker() && isJumpTableCanonical) {
// Non-dso_local functions may be overridden at run time,
@@ -1160,8 +1173,10 @@ void LowerTypeTestsModule::lowerTypeTestCalls(
for (CallInst *CI : TIUI.CallSites) {
++NumTypeTestCallsLowered;
Value *Lowered = lowerTypeTestCall(TypeId, CI, TIL);
- CI->replaceAllUsesWith(Lowered);
- CI->eraseFromParent();
+ if (Lowered) {
+ CI->replaceAllUsesWith(Lowered);
+ CI->eraseFromParent();
+ }
}
}
}
@@ -1269,7 +1284,7 @@ void LowerTypeTestsModule::moveInitializerToModuleConstructor(
IRBuilder<> IRB(WeakInitializerFn->getEntryBlock().getTerminator());
GV->setConstant(false);
- IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlignment());
+ IRB.CreateAlignedStore(GV->getInitializer(), GV, GV->getAlign());
GV->setInitializer(Constant::getNullValue(GV->getValueType()));
}
@@ -1516,13 +1531,13 @@ void LowerTypeTestsModule::buildBitSetsFromFunctionsNative(
F->getType());
if (Functions[I]->isExported()) {
if (IsJumpTableCanonical) {
- ExportSummary->cfiFunctionDefs().insert(F->getName());
+ ExportSummary->cfiFunctionDefs().insert(std::string(F->getName()));
} else {
GlobalAlias *JtAlias = GlobalAlias::create(
F->getValueType(), 0, GlobalValue::ExternalLinkage,
F->getName() + ".cfi_jt", CombinedGlobalElemPtr, &M);
JtAlias->setVisibility(GlobalValue::HiddenVisibility);
- ExportSummary->cfiFunctionDecls().insert(F->getName());
+ ExportSummary->cfiFunctionDecls().insert(std::string(F->getName()));
}
}
if (!IsJumpTableCanonical) {
@@ -1655,8 +1670,9 @@ void LowerTypeTestsModule::buildBitSetsFromDisjointSet(
/// Lower all type tests in this module.
LowerTypeTestsModule::LowerTypeTestsModule(
Module &M, ModuleSummaryIndex *ExportSummary,
- const ModuleSummaryIndex *ImportSummary)
- : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary) {
+ const ModuleSummaryIndex *ImportSummary, bool DropTypeTests)
+ : M(M), ExportSummary(ExportSummary), ImportSummary(ImportSummary),
+ DropTypeTests(DropTypeTests) {
assert(!(ExportSummary && ImportSummary));
Triple TargetTriple(M.getTargetTriple());
Arch = TargetTriple.getArch();
@@ -1683,7 +1699,8 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
bool Changed =
LowerTypeTestsModule(
M, ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
- ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr,
+ /*DropTypeTests*/ false)
.lower();
if (!ClWriteSummary.empty()) {
@@ -1703,8 +1720,8 @@ bool LowerTypeTestsModule::runForTesting(Module &M) {
static bool isDirectCall(Use& U) {
auto *Usr = dyn_cast<CallInst>(U.getUser());
if (Usr) {
- CallSite CS(Usr);
- if (CS.isCallee(&U))
+ auto *CB = dyn_cast<CallBase>(Usr);
+ if (CB && CB->isCallee(&U))
return true;
}
return false;
@@ -1750,6 +1767,33 @@ void LowerTypeTestsModule::replaceDirectCalls(Value *Old, Value *New) {
}
bool LowerTypeTestsModule::lower() {
+ Function *TypeTestFunc =
+ M.getFunction(Intrinsic::getName(Intrinsic::type_test));
+
+ if (DropTypeTests && TypeTestFunc) {
+ for (auto UI = TypeTestFunc->use_begin(), UE = TypeTestFunc->use_end();
+ UI != UE;) {
+ auto *CI = cast<CallInst>((*UI++).getUser());
+ // Find and erase llvm.assume intrinsics for this llvm.type.test call.
+ for (auto CIU = CI->use_begin(), CIUE = CI->use_end(); CIU != CIUE;) {
+ if (auto *AssumeCI = dyn_cast<CallInst>((*CIU++).getUser())) {
+ Function *F = AssumeCI->getCalledFunction();
+ if (F && F->getIntrinsicID() == Intrinsic::assume)
+ AssumeCI->eraseFromParent();
+ }
+ }
+ CI->eraseFromParent();
+ }
+
+ // We have deleted the type intrinsics, so we no longer have enough
+ // information to reason about the liveness of virtual function pointers
+ // in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
+ return true;
+ }
+
// If only some of the modules were split, we cannot correctly perform
// this transformation. We already checked for the presence of type tests
// with partially split modules during the thin link, and would have emitted
@@ -1758,8 +1802,6 @@ bool LowerTypeTestsModule::lower() {
(ImportSummary && ImportSummary->partiallySplitLTOUnits()))
return false;
- Function *TypeTestFunc =
- M.getFunction(Intrinsic::getName(Intrinsic::type_test));
Function *ICallBranchFunnelFunc =
M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel));
if ((!TypeTestFunc || TypeTestFunc->use_empty()) &&
@@ -1787,9 +1829,10 @@ bool LowerTypeTestsModule::lower() {
// have the same name, but it's not the one we are looking for.
if (F.hasLocalLinkage())
continue;
- if (ImportSummary->cfiFunctionDefs().count(F.getName()))
+ if (ImportSummary->cfiFunctionDefs().count(std::string(F.getName())))
Defs.push_back(&F);
- else if (ImportSummary->cfiFunctionDecls().count(F.getName()))
+ else if (ImportSummary->cfiFunctionDecls().count(
+ std::string(F.getName())))
Decls.push_back(&F);
}
@@ -2196,7 +2239,9 @@ bool LowerTypeTestsModule::lower() {
PreservedAnalyses LowerTypeTestsPass::run(Module &M,
ModuleAnalysisManager &AM) {
- bool Changed = LowerTypeTestsModule(M, ExportSummary, ImportSummary).lower();
+ bool Changed =
+ LowerTypeTestsModule(M, ExportSummary, ImportSummary, DropTypeTests)
+ .lower();
if (!Changed)
return PreservedAnalyses::all();
return PreservedAnalyses::none();
diff --git a/llvm/lib/Transforms/IPO/MergeFunctions.cpp b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
index 06d2a2f31941..8cc19515f3db 100644
--- a/llvm/lib/Transforms/IPO/MergeFunctions.cpp
+++ b/llvm/lib/Transforms/IPO/MergeFunctions.cpp
@@ -95,7 +95,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -467,13 +466,13 @@ void MergeFunctions::replaceDirectCallers(Function *Old, Function *New) {
for (auto UI = Old->use_begin(), UE = Old->use_end(); UI != UE;) {
Use *U = &*UI;
++UI;
- CallSite CS(U->getUser());
- if (CS && CS.isCallee(U)) {
+ CallBase *CB = dyn_cast<CallBase>(U->getUser());
+ if (CB && CB->isCallee(U)) {
// Do not copy attributes from the called function to the call-site.
// Function comparison ensures that the attributes are the same up to
// type congruences in byval(), in which case we need to keep the byval
// type of the call-site, not the callee function.
- remove(CS.getInstruction()->getFunction());
+ remove(CB->getFunction());
U->set(BitcastNew);
}
}
diff --git a/llvm/lib/Transforms/IPO/OpenMPOpt.cpp b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
new file mode 100644
index 000000000000..f664a2417374
--- /dev/null
+++ b/llvm/lib/Transforms/IPO/OpenMPOpt.cpp
@@ -0,0 +1,1501 @@
+//===-- IPO/OpenMPOpt.cpp - Collection of OpenMP specific optimizations ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// OpenMP specific optimizations:
+//
+// - Deduplication of runtime calls, e.g., omp_get_thread_num.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/IPO/OpenMPOpt.h"
+
+#include "llvm/ADT/EnumeratedArray.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/CallGraph.h"
+#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Frontend/OpenMP/OMPConstants.h"
+#include "llvm/Frontend/OpenMP/OMPIRBuilder.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/IPO.h"
+#include "llvm/Transforms/IPO/Attributor.h"
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+
+using namespace llvm;
+using namespace omp;
+
+#define DEBUG_TYPE "openmp-opt"
+
+static cl::opt<bool> DisableOpenMPOptimizations(
+ "openmp-opt-disable", cl::ZeroOrMore,
+ cl::desc("Disable OpenMP specific optimizations."), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool> PrintICVValues("openmp-print-icv-values", cl::init(false),
+ cl::Hidden);
+static cl::opt<bool> PrintOpenMPKernels("openmp-print-gpu-kernels",
+ cl::init(false), cl::Hidden);
+
+STATISTIC(NumOpenMPRuntimeCallsDeduplicated,
+ "Number of OpenMP runtime calls deduplicated");
+STATISTIC(NumOpenMPParallelRegionsDeleted,
+ "Number of OpenMP parallel regions deleted");
+STATISTIC(NumOpenMPRuntimeFunctionsIdentified,
+ "Number of OpenMP runtime functions identified");
+STATISTIC(NumOpenMPRuntimeFunctionUsesIdentified,
+ "Number of OpenMP runtime function uses identified");
+STATISTIC(NumOpenMPTargetRegionKernels,
+ "Number of OpenMP target region entry points (=kernels) identified");
+STATISTIC(
+ NumOpenMPParallelRegionsReplacedInGPUStateMachine,
+ "Number of OpenMP parallel regions replaced with ID in GPU state machines");
+
+#if !defined(NDEBUG)
+static constexpr auto TAG = "[" DEBUG_TYPE "]";
+#endif
+
+/// Apply \p CB to all uses of \p F. If \p LookThroughConstantExprUses is
+/// true, constant expression users are not given to \p CB but their uses are
+/// traversed transitively.
+template <typename CBTy>
+static void foreachUse(Function &F, CBTy CB,
+ bool LookThroughConstantExprUses = true) {
+ SmallVector<Use *, 8> Worklist(make_pointer_range(F.uses()));
+
+ for (unsigned idx = 0; idx < Worklist.size(); ++idx) {
+ Use &U = *Worklist[idx];
+
+ // Allow use in constant bitcasts and simply look through them.
+ if (LookThroughConstantExprUses && isa<ConstantExpr>(U.getUser())) {
+ for (Use &CEU : cast<ConstantExpr>(U.getUser())->uses())
+ Worklist.push_back(&CEU);
+ continue;
+ }
+
+ CB(U);
+ }
+}
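+
+A small, hypothetical usage sketch of the helper above; F is any llvm::Function and the counter name is made up for illustration.
+
+    // Count all uses of F, looking through constant-expression users as
+    // foreachUse does by default.
+    unsigned NumUses = 0;
+    foreachUse(F, [&](Use &U) { (void)U; ++NumUses; });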
+
+/// Helper struct to store tracked ICV values at specific instructions.
+struct ICVValue {
+ Instruction *Inst;
+ Value *TrackedValue;
+
+ ICVValue(Instruction *I, Value *Val) : Inst(I), TrackedValue(Val) {}
+};
+
+namespace llvm {
+
+// Provide DenseMapInfo for ICVValue
+template <> struct DenseMapInfo<ICVValue> {
+ using InstInfo = DenseMapInfo<Instruction *>;
+ using ValueInfo = DenseMapInfo<Value *>;
+
+ static inline ICVValue getEmptyKey() {
+ return ICVValue(InstInfo::getEmptyKey(), ValueInfo::getEmptyKey());
+ };
+
+ static inline ICVValue getTombstoneKey() {
+ return ICVValue(InstInfo::getTombstoneKey(), ValueInfo::getTombstoneKey());
+ };
+
+ static unsigned getHashValue(const ICVValue &ICVVal) {
+ return detail::combineHashValue(
+ InstInfo::getHashValue(ICVVal.Inst),
+ ValueInfo::getHashValue(ICVVal.TrackedValue));
+ }
+
+ static bool isEqual(const ICVValue &LHS, const ICVValue &RHS) {
+ return InstInfo::isEqual(LHS.Inst, RHS.Inst) &&
+ ValueInfo::isEqual(LHS.TrackedValue, RHS.TrackedValue);
+ }
+};
+
+} // end namespace llvm
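+
+With the specialization above in place, ICVValue can be used directly as a DenseMap/DenseSet key. A hypothetical sketch, where I is an Instruction* and V the Value* tracked at it:
+
+    // Deduplicate tracked (instruction, value) pairs.
+    DenseSet<ICVValue> SeenICVValues;
+    bool Inserted = SeenICVValues.insert(ICVValue(I, V)).second;
+    (void)Inserted;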
+
+namespace {
+
+struct AAICVTracker;
+
+/// OpenMP specific information. For now, stores RFIs and ICVs also needed for
+/// Attributor runs.
+struct OMPInformationCache : public InformationCache {
+ OMPInformationCache(Module &M, AnalysisGetter &AG,
+ BumpPtrAllocator &Allocator, SetVector<Function *> &CGSCC,
+ SmallPtrSetImpl<Kernel> &Kernels)
+ : InformationCache(M, AG, Allocator, &CGSCC), OMPBuilder(M),
+ Kernels(Kernels) {
+ initializeModuleSlice(CGSCC);
+
+ OMPBuilder.initialize();
+ initializeRuntimeFunctions();
+ initializeInternalControlVars();
+ }
+
+ /// Generic information that describes an internal control variable.
+ struct InternalControlVarInfo {
+ /// The kind, as described by InternalControlVar enum.
+ InternalControlVar Kind;
+
+ /// The name of the ICV.
+ StringRef Name;
+
+ /// Environment variable associated with this ICV.
+ StringRef EnvVarName;
+
+ /// Initial value kind.
+ ICVInitValue InitKind;
+
+ /// Initial value.
+ ConstantInt *InitValue;
+
+ /// Setter RTL function associated with this ICV.
+ RuntimeFunction Setter;
+
+ /// Getter RTL function associated with this ICV.
+ RuntimeFunction Getter;
+
+ /// RTL Function corresponding to the override clause of this ICV
+ RuntimeFunction Clause;
+ };
+
+ /// Generic information that describes a runtime function.
+ struct RuntimeFunctionInfo {
+
+ /// The kind, as described by the RuntimeFunction enum.
+ RuntimeFunction Kind;
+
+ /// The name of the function.
+ StringRef Name;
+
+ /// Flag to indicate a variadic function.
+ bool IsVarArg;
+
+ /// The return type of the function.
+ Type *ReturnType;
+
+ /// The argument types of the function.
+ SmallVector<Type *, 8> ArgumentTypes;
+
+ /// The declaration if available.
+ Function *Declaration = nullptr;
+
+ /// Uses of this runtime function per function containing the use.
+ using UseVector = SmallVector<Use *, 16>;
+
+ /// Clear UsesMap for runtime function.
+ void clearUsesMap() { UsesMap.clear(); }
+
+ /// Boolean conversion that is true if the runtime function was found.
+ operator bool() const { return Declaration; }
+
+ /// Return the vector of uses in function \p F.
+ UseVector &getOrCreateUseVector(Function *F) {
+ std::shared_ptr<UseVector> &UV = UsesMap[F];
+ if (!UV)
+ UV = std::make_shared<UseVector>();
+ return *UV;
+ }
+
+ /// Return the vector of uses in function \p F or `nullptr` if there are
+ /// none.
+ const UseVector *getUseVector(Function &F) const {
+ auto I = UsesMap.find(&F);
+ if (I != UsesMap.end())
+ return I->second.get();
+ return nullptr;
+ }
+
+ /// Return how many functions contain uses of this runtime function.
+ size_t getNumFunctionsWithUses() const { return UsesMap.size(); }
+
+ /// Return the number of arguments (or the minimal number for variadic
+ /// functions).
+ size_t getNumArgs() const { return ArgumentTypes.size(); }
+
+ /// Run the callback \p CB on each use and forget the use if the result is
+ /// true. The callback is passed, as its second argument, the function in
+ /// which the use was encountered.
+ void foreachUse(SmallVectorImpl<Function *> &SCC,
+ function_ref<bool(Use &, Function &)> CB) {
+ for (Function *F : SCC)
+ foreachUse(CB, F);
+ }
+
+ /// Run the callback \p CB on each use within the function \p F and forget
+ /// the use if the result is true.
+ void foreachUse(function_ref<bool(Use &, Function &)> CB, Function *F) {
+ SmallVector<unsigned, 8> ToBeDeleted;
+ ToBeDeleted.clear();
+
+ unsigned Idx = 0;
+ UseVector &UV = getOrCreateUseVector(F);
+
+ for (Use *U : UV) {
+ if (CB(*U, *F))
+ ToBeDeleted.push_back(Idx);
+ ++Idx;
+ }
+
+ // Remove the to-be-deleted indices in reverse order as prior
+ // modifications will not modify the smaller indices.
+ while (!ToBeDeleted.empty()) {
+ unsigned Idx = ToBeDeleted.pop_back_val();
+ UV[Idx] = UV.back();
+ UV.pop_back();
+ }
+ }
+
+ private:
+ /// Map from functions to all uses of this runtime function contained in
+ /// them.
+ DenseMap<Function *, std::shared_ptr<UseVector>> UsesMap;
+ };
+
+ /// Initialize the ModuleSlice member based on \p SCC. ModuleSlice contains
+ /// (a subset of) all functions that we can look at during this SCC traversal.
+ /// This includes functions (transitively) called from the SCC and the
+ /// (transitive) callers of SCC functions. We also can look at a function if
+ /// there is a "reference edge", i.a., if the function somehow uses (!=calls)
+ /// a function in the SCC or a caller of a function in the SCC.
+ void initializeModuleSlice(SetVector<Function *> &SCC) {
+ ModuleSlice.insert(SCC.begin(), SCC.end());
+
+ SmallPtrSet<Function *, 16> Seen;
+ SmallVector<Function *, 16> Worklist(SCC.begin(), SCC.end());
+ while (!Worklist.empty()) {
+ Function *F = Worklist.pop_back_val();
+ ModuleSlice.insert(F);
+
+ for (Instruction &I : instructions(*F))
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *Callee = CB->getCalledFunction())
+ if (Seen.insert(Callee).second)
+ Worklist.push_back(Callee);
+ }
+
+ Seen.clear();
+ Worklist.append(SCC.begin(), SCC.end());
+ while (!Worklist.empty()) {
+ Function *F = Worklist.pop_back_val();
+ ModuleSlice.insert(F);
+
+ // Traverse all transitive uses.
+ foreachUse(*F, [&](Use &U) {
+ if (auto *UsrI = dyn_cast<Instruction>(U.getUser()))
+ if (Seen.insert(UsrI->getFunction()).second)
+ Worklist.push_back(UsrI->getFunction());
+ });
+ }
+ }
+
+ /// The slice of the module we are allowed to look at.
+ SmallPtrSet<Function *, 8> ModuleSlice;
+
+ /// An OpenMP-IR-Builder instance
+ OpenMPIRBuilder OMPBuilder;
+
+ /// Map from runtime function kind to the runtime function description.
+ EnumeratedArray<RuntimeFunctionInfo, RuntimeFunction,
+ RuntimeFunction::OMPRTL___last>
+ RFIs;
+
+ /// Map from ICV kind to the ICV description.
+ EnumeratedArray<InternalControlVarInfo, InternalControlVar,
+ InternalControlVar::ICV___last>
+ ICVs;
+
+ /// Helper to initialize all internal control variable information for those
+ /// defined in OMPKinds.def.
+ void initializeInternalControlVars() {
+#define ICV_RT_SET(_Name, RTL) \
+ { \
+ auto &ICV = ICVs[_Name]; \
+ ICV.Setter = RTL; \
+ }
+#define ICV_RT_GET(Name, RTL) \
+ { \
+ auto &ICV = ICVs[Name]; \
+ ICV.Getter = RTL; \
+ }
+#define ICV_DATA_ENV(Enum, _Name, _EnvVarName, Init) \
+ { \
+ auto &ICV = ICVs[Enum]; \
+ ICV.Name = _Name; \
+ ICV.Kind = Enum; \
+ ICV.InitKind = Init; \
+ ICV.EnvVarName = _EnvVarName; \
+ switch (ICV.InitKind) { \
+ case ICV_IMPLEMENTATION_DEFINED: \
+ ICV.InitValue = nullptr; \
+ break; \
+ case ICV_ZERO: \
+ ICV.InitValue = ConstantInt::get( \
+ Type::getInt32Ty(OMPBuilder.Int32->getContext()), 0); \
+ break; \
+ case ICV_FALSE: \
+ ICV.InitValue = ConstantInt::getFalse(OMPBuilder.Int1->getContext()); \
+ break; \
+ case ICV_LAST: \
+ break; \
+ } \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ }
+
+ /// Returns true if the function declaration \p F matches the runtime
+ /// function types, that is, return type \p RTFRetType, and argument types
+ /// \p RTFArgTypes.
+ static bool declMatchesRTFTypes(Function *F, Type *RTFRetType,
+ SmallVector<Type *, 8> &RTFArgTypes) {
+ // TODO: We should output information to the user (under debug output
+ // and via remarks).
+
+ if (!F)
+ return false;
+ if (F->getReturnType() != RTFRetType)
+ return false;
+ if (F->arg_size() != RTFArgTypes.size())
+ return false;
+
+ auto RTFTyIt = RTFArgTypes.begin();
+ for (Argument &Arg : F->args()) {
+ if (Arg.getType() != *RTFTyIt)
+ return false;
+
+ ++RTFTyIt;
+ }
+
+ return true;
+ }
+
+ // Helper to collect all uses of the declaration in the UsesMap.
+ unsigned collectUses(RuntimeFunctionInfo &RFI, bool CollectStats = true) {
+ unsigned NumUses = 0;
+ if (!RFI.Declaration)
+ return NumUses;
+ OMPBuilder.addAttributes(RFI.Kind, *RFI.Declaration);
+
+ if (CollectStats) {
+ NumOpenMPRuntimeFunctionsIdentified += 1;
+ NumOpenMPRuntimeFunctionUsesIdentified += RFI.Declaration->getNumUses();
+ }
+
+ // TODO: We directly convert uses into proper calls and unknown uses.
+ for (Use &U : RFI.Declaration->uses()) {
+ if (Instruction *UserI = dyn_cast<Instruction>(U.getUser())) {
+ if (ModuleSlice.count(UserI->getFunction())) {
+ RFI.getOrCreateUseVector(UserI->getFunction()).push_back(&U);
+ ++NumUses;
+ }
+ } else {
+ RFI.getOrCreateUseVector(nullptr).push_back(&U);
+ ++NumUses;
+ }
+ }
+ return NumUses;
+ }
+
+ // Helper function to recollect uses of all runtime functions.
+ void recollectUses() {
+ for (int Idx = 0; Idx < RFIs.size(); ++Idx) {
+ auto &RFI = RFIs[static_cast<RuntimeFunction>(Idx)];
+ RFI.clearUsesMap();
+ collectUses(RFI, /*CollectStats*/ false);
+ }
+ }
+
+ /// Helper to initialize all runtime function information for those defined
+ /// in OpenMPKinds.def.
+ void initializeRuntimeFunctions() {
+ Module &M = *((*ModuleSlice.begin())->getParent());
+
+ // Helper macros for handling __VA_ARGS__ in OMP_RTL
+#define OMP_TYPE(VarName, ...) \
+ Type *VarName = OMPBuilder.VarName; \
+ (void)VarName;
+
+#define OMP_ARRAY_TYPE(VarName, ...) \
+ ArrayType *VarName##Ty = OMPBuilder.VarName##Ty; \
+ (void)VarName##Ty; \
+ PointerType *VarName##PtrTy = OMPBuilder.VarName##PtrTy; \
+ (void)VarName##PtrTy;
+
+#define OMP_FUNCTION_TYPE(VarName, ...) \
+ FunctionType *VarName = OMPBuilder.VarName; \
+ (void)VarName; \
+ PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
+ (void)VarName##Ptr;
+
+#define OMP_STRUCT_TYPE(VarName, ...) \
+ StructType *VarName = OMPBuilder.VarName; \
+ (void)VarName; \
+ PointerType *VarName##Ptr = OMPBuilder.VarName##Ptr; \
+ (void)VarName##Ptr;
+
+#define OMP_RTL(_Enum, _Name, _IsVarArg, _ReturnType, ...) \
+ { \
+ SmallVector<Type *, 8> ArgsTypes({__VA_ARGS__}); \
+ Function *F = M.getFunction(_Name); \
+ if (declMatchesRTFTypes(F, OMPBuilder._ReturnType, ArgsTypes)) { \
+ auto &RFI = RFIs[_Enum]; \
+ RFI.Kind = _Enum; \
+ RFI.Name = _Name; \
+ RFI.IsVarArg = _IsVarArg; \
+ RFI.ReturnType = OMPBuilder._ReturnType; \
+ RFI.ArgumentTypes = std::move(ArgsTypes); \
+ RFI.Declaration = F; \
+ unsigned NumUses = collectUses(RFI); \
+ (void)NumUses; \
+ LLVM_DEBUG({ \
+ dbgs() << TAG << RFI.Name << (RFI.Declaration ? "" : " not") \
+ << " found\n"; \
+ if (RFI.Declaration) \
+ dbgs() << TAG << "-> got " << NumUses << " uses in " \
+ << RFI.getNumFunctionsWithUses() \
+ << " different functions.\n"; \
+ }); \
+ } \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+
+ // TODO: We should attach the attributes defined in OMPKinds.def.
+ }
+
+ /// Collection of known kernels (\see Kernel) in the module.
+ SmallPtrSetImpl<Kernel> &Kernels;
+};
+
+struct OpenMPOpt {
+
+ using OptimizationRemarkGetter =
+ function_ref<OptimizationRemarkEmitter &(Function *)>;
+
+ OpenMPOpt(SmallVectorImpl<Function *> &SCC, CallGraphUpdater &CGUpdater,
+ OptimizationRemarkGetter OREGetter,
+ OMPInformationCache &OMPInfoCache, Attributor &A)
+ : M(*(*SCC.begin())->getParent()), SCC(SCC), CGUpdater(CGUpdater),
+ OREGetter(OREGetter), OMPInfoCache(OMPInfoCache), A(A) {}
+
+ /// Run all OpenMP optimizations on the underlying SCC/ModuleSlice.
+ bool run() {
+ if (SCC.empty())
+ return false;
+
+ bool Changed = false;
+
+ LLVM_DEBUG(dbgs() << TAG << "Run on SCC with " << SCC.size()
+ << " functions in a slice with "
+ << OMPInfoCache.ModuleSlice.size() << " functions\n");
+
+ if (PrintICVValues)
+ printICVs();
+ if (PrintOpenMPKernels)
+ printKernels();
+
+ Changed |= rewriteDeviceCodeStateMachine();
+
+ Changed |= runAttributor();
+
+ // Recollect uses, in case Attributor deleted any.
+ OMPInfoCache.recollectUses();
+
+ Changed |= deduplicateRuntimeCalls();
+ Changed |= deleteParallelRegions();
+
+ return Changed;
+ }
+
+ /// Print initial ICV values for testing.
+ /// FIXME: This should be done from the Attributor once it is added.
+ void printICVs() const {
+ InternalControlVar ICVs[] = {ICV_nthreads, ICV_active_levels, ICV_cancel};
+
+ for (Function *F : OMPInfoCache.ModuleSlice) {
+ for (auto ICV : ICVs) {
+ auto ICVInfo = OMPInfoCache.ICVs[ICV];
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP ICV " << ore::NV("OpenMPICV", ICVInfo.Name)
+ << " Value: "
+ << (ICVInfo.InitValue
+ ? ICVInfo.InitValue->getValue().toString(10, true)
+ : "IMPLEMENTATION_DEFINED");
+ };
+
+ emitRemarkOnFunction(F, "OpenMPICVTracker", Remark);
+ }
+ }
+ }
+
+ /// Print OpenMP GPU kernels for testing.
+ void printKernels() const {
+ for (Function *F : SCC) {
+ if (!OMPInfoCache.Kernels.count(F))
+ continue;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP GPU kernel "
+ << ore::NV("OpenMPGPUKernel", F->getName()) << "\n";
+ };
+
+ emitRemarkOnFunction(F, "OpenMPGPU", Remark);
+ }
+ }
+
+ /// Return the call if \p U is a callee use in a regular call. If \p RFI is
+ /// given, the callee has to be \p RFI's declaration; otherwise nullptr is
+ /// returned.
+ static CallInst *getCallIfRegularCall(
+ Use &U, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
+ CallInst *CI = dyn_cast<CallInst>(U.getUser());
+ if (CI && CI->isCallee(&U) && !CI->hasOperandBundles() &&
+ (!RFI || CI->getCalledFunction() == RFI->Declaration))
+ return CI;
+ return nullptr;
+ }
+
+ /// Return the call if \p V is a regular call. If \p RFI is given, the callee
+ /// has to be \p RFI's declaration; otherwise nullptr is returned.
+ static CallInst *getCallIfRegularCall(
+ Value &V, OMPInformationCache::RuntimeFunctionInfo *RFI = nullptr) {
+ CallInst *CI = dyn_cast<CallInst>(&V);
+ if (CI && !CI->hasOperandBundles() &&
+ (!RFI || CI->getCalledFunction() == RFI->Declaration))
+ return CI;
+ return nullptr;
+ }
+
+private:
+ /// Try to delete parallel regions if possible.
+ bool deleteParallelRegions() {
+ const unsigned CallbackCalleeOperand = 2;
+
+ OMPInformationCache::RuntimeFunctionInfo &RFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_fork_call];
+
+ if (!RFI.Declaration)
+ return false;
+
+ bool Changed = false;
+ auto DeleteCallCB = [&](Use &U, Function &) {
+ CallInst *CI = getCallIfRegularCall(U);
+ if (!CI)
+ return false;
+ auto *Fn = dyn_cast<Function>(
+ CI->getArgOperand(CallbackCalleeOperand)->stripPointerCasts());
+ if (!Fn)
+ return false;
+ if (!Fn->onlyReadsMemory())
+ return false;
+ if (!Fn->hasFnAttribute(Attribute::WillReturn))
+ return false;
+
+ LLVM_DEBUG(dbgs() << TAG << "Delete read-only parallel region in "
+ << CI->getCaller()->getName() << "\n");
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region in "
+ << ore::NV("OpenMPParallelDelete", CI->getCaller()->getName())
+ << " deleted";
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPParallelRegionDeletion",
+ Remark);
+
+ CGUpdater.removeCallSite(*CI);
+ CI->eraseFromParent();
+ Changed = true;
+ ++NumOpenMPParallelRegionsDeleted;
+ return true;
+ };
+
+ RFI.foreachUse(SCC, DeleteCallCB);
+
+ return Changed;
+ }
+
+ /// Try to eliminate runtime calls by reusing existing ones.
+ bool deduplicateRuntimeCalls() {
+ bool Changed = false;
+
+ RuntimeFunction DeduplicableRuntimeCallIDs[] = {
+ OMPRTL_omp_get_num_threads,
+ OMPRTL_omp_in_parallel,
+ OMPRTL_omp_get_cancellation,
+ OMPRTL_omp_get_thread_limit,
+ OMPRTL_omp_get_supported_active_levels,
+ OMPRTL_omp_get_level,
+ OMPRTL_omp_get_ancestor_thread_num,
+ OMPRTL_omp_get_team_size,
+ OMPRTL_omp_get_active_level,
+ OMPRTL_omp_in_final,
+ OMPRTL_omp_get_proc_bind,
+ OMPRTL_omp_get_num_places,
+ OMPRTL_omp_get_num_procs,
+ OMPRTL_omp_get_place_num,
+ OMPRTL_omp_get_partition_num_places,
+ OMPRTL_omp_get_partition_place_nums};
+
+ // Global-tid is handled separately.
+ SmallSetVector<Value *, 16> GTIdArgs;
+ collectGlobalThreadIdArguments(GTIdArgs);
+ LLVM_DEBUG(dbgs() << TAG << "Found " << GTIdArgs.size()
+ << " global thread ID arguments\n");
+
+ for (Function *F : SCC) {
+ for (auto DeduplicableRuntimeCallID : DeduplicableRuntimeCallIDs)
+ deduplicateRuntimeCalls(*F,
+ OMPInfoCache.RFIs[DeduplicableRuntimeCallID]);
+
+ // __kmpc_global_thread_num is special as we can replace it with an
+ // argument in enough cases to make it worth trying.
+ Value *GTIdArg = nullptr;
+ for (Argument &Arg : F->args())
+ if (GTIdArgs.count(&Arg)) {
+ GTIdArg = &Arg;
+ break;
+ }
+ Changed |= deduplicateRuntimeCalls(
+ *F, OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num], GTIdArg);
+ }
+
+ return Changed;
+ }
+
+ static Value *combinedIdentStruct(Value *CurrentIdent, Value *NextIdent,
+ bool GlobalOnly, bool &SingleChoice) {
+ if (CurrentIdent == NextIdent)
+ return CurrentIdent;
+
+ // TODO: Figure out how to actually combine multiple debug locations. For
+ // now we just keep an existing one if there is a single choice.
+ if (!GlobalOnly || isa<GlobalValue>(NextIdent)) {
+ SingleChoice = !CurrentIdent;
+ return NextIdent;
+ }
+ return nullptr;
+ }
+
+  /// Return a `struct ident_t*` value that represents the ones used in the
+ /// calls of \p RFI inside of \p F. If \p GlobalOnly is true, we will not
+ /// return a local `struct ident_t*`. For now, if we cannot find a suitable
+ /// return value we create one from scratch. We also do not yet combine
+ /// information, e.g., the source locations, see combinedIdentStruct.
+ Value *
+ getCombinedIdentFromCallUsesIn(OMPInformationCache::RuntimeFunctionInfo &RFI,
+ Function &F, bool GlobalOnly) {
+ bool SingleChoice = true;
+ Value *Ident = nullptr;
+ auto CombineIdentStruct = [&](Use &U, Function &Caller) {
+ CallInst *CI = getCallIfRegularCall(U, &RFI);
+ if (!CI || &F != &Caller)
+ return false;
+ Ident = combinedIdentStruct(Ident, CI->getArgOperand(0),
+ /* GlobalOnly */ true, SingleChoice);
+ return false;
+ };
+ RFI.foreachUse(SCC, CombineIdentStruct);
+
+ if (!Ident || !SingleChoice) {
+      // The IRBuilder uses the insertion block to get to the module; this is
+ // unfortunate but we work around it for now.
+ if (!OMPInfoCache.OMPBuilder.getInsertionPoint().getBlock())
+ OMPInfoCache.OMPBuilder.updateToLocation(OpenMPIRBuilder::InsertPointTy(
+ &F.getEntryBlock(), F.getEntryBlock().begin()));
+      // Create a fallback location if none was found.
+ // TODO: Use the debug locations of the calls instead.
+ Constant *Loc = OMPInfoCache.OMPBuilder.getOrCreateDefaultSrcLocStr();
+ Ident = OMPInfoCache.OMPBuilder.getOrCreateIdent(Loc);
+ }
+ return Ident;
+ }
+
+ /// Try to eliminate calls of \p RFI in \p F by reusing an existing one or
+ /// \p ReplVal if given.
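+  ///
+  /// E.g. (illustrative IR sketch): of two identical calls
+  ///   %a = call i32 @omp_get_thread_limit()
+  ///   %b = call i32 @omp_get_thread_limit()
+  /// the first one (possibly moved to the entry block) is kept and all users
+  /// of the second are rewired to it before that call is erased.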
+ bool deduplicateRuntimeCalls(Function &F,
+ OMPInformationCache::RuntimeFunctionInfo &RFI,
+ Value *ReplVal = nullptr) {
+ auto *UV = RFI.getUseVector(F);
+ if (!UV || UV->size() + (ReplVal != nullptr) < 2)
+ return false;
+
+ LLVM_DEBUG(
+ dbgs() << TAG << "Deduplicate " << UV->size() << " uses of " << RFI.Name
+               << (ReplVal ? " with an existing value" : "") << "\n");
+
+ assert((!ReplVal || (isa<Argument>(ReplVal) &&
+ cast<Argument>(ReplVal)->getParent() == &F)) &&
+ "Unexpected replacement value!");
+
+ // TODO: Use dominance to find a good position instead.
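+    // A call can only be moved to the entry block if it has no arguments, or
+    // if its first argument is the `ident_t*` and no other argument is
+    // computed by an instruction, e.g. (illustrative)
+    //   call i32 @__kmpc_global_thread_num(%struct.ident_t* @loc)
+    // can be hoisted, while a call taking an instruction-defined value cannot.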
+ auto CanBeMoved = [this](CallBase &CB) {
+ unsigned NumArgs = CB.getNumArgOperands();
+ if (NumArgs == 0)
+ return true;
+ if (CB.getArgOperand(0)->getType() != OMPInfoCache.OMPBuilder.IdentPtr)
+ return false;
+ for (unsigned u = 1; u < NumArgs; ++u)
+ if (isa<Instruction>(CB.getArgOperand(u)))
+ return false;
+ return true;
+ };
+
+ if (!ReplVal) {
+ for (Use *U : *UV)
+ if (CallInst *CI = getCallIfRegularCall(*U, &RFI)) {
+ if (!CanBeMoved(*CI))
+ continue;
+
+ auto Remark = [&](OptimizationRemark OR) {
+ auto newLoc = &*F.getEntryBlock().getFirstInsertionPt();
+ return OR << "OpenMP runtime call "
+ << ore::NV("OpenMPOptRuntime", RFI.Name) << " moved to "
+ << ore::NV("OpenMPRuntimeMoves", newLoc->getDebugLoc());
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeCodeMotion", Remark);
+
+ CI->moveBefore(&*F.getEntryBlock().getFirstInsertionPt());
+ ReplVal = CI;
+ break;
+ }
+ if (!ReplVal)
+ return false;
+ }
+
+ // If we use a call as a replacement value we need to make sure the ident is
+ // valid at the new location. For now we just pick a global one, either
+ // existing and used by one of the calls, or created from scratch.
+ if (CallBase *CI = dyn_cast<CallBase>(ReplVal)) {
+ if (CI->getNumArgOperands() > 0 &&
+ CI->getArgOperand(0)->getType() == OMPInfoCache.OMPBuilder.IdentPtr) {
+ Value *Ident = getCombinedIdentFromCallUsesIn(RFI, F,
+ /* GlobalOnly */ true);
+ CI->setArgOperand(0, Ident);
+ }
+ }
+
+ bool Changed = false;
+ auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
+ CallInst *CI = getCallIfRegularCall(U, &RFI);
+ if (!CI || CI == ReplVal || &F != &Caller)
+ return false;
+ assert(CI->getCaller() == &F && "Unexpected call!");
+
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "OpenMP runtime call "
+ << ore::NV("OpenMPOptRuntime", RFI.Name) << " deduplicated";
+ };
+ emitRemark<OptimizationRemark>(CI, "OpenMPRuntimeDeduplicated", Remark);
+
+ CGUpdater.removeCallSite(*CI);
+ CI->replaceAllUsesWith(ReplVal);
+ CI->eraseFromParent();
+ ++NumOpenMPRuntimeCallsDeduplicated;
+ Changed = true;
+ return true;
+ };
+ RFI.foreachUse(SCC, ReplaceAndDeleteCB);
+
+ return Changed;
+ }
+
+ /// Collect arguments that represent the global thread id in \p GTIdArgs.
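+  ///
+  /// E.g. (illustrative): for an internal function
+  ///   define internal void @helper(i32 %gtid) { ... }
+  /// whose every call site passes the result of a @__kmpc_global_thread_num
+  /// call, or an argument already known to be a GTId, %gtid is added to
+  /// \p GTIdArgs.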
+ void collectGlobalThreadIdArguments(SmallSetVector<Value *, 16> &GTIdArgs) {
+ // TODO: Below we basically perform a fixpoint iteration with a pessimistic
+ // initialization. We could define an AbstractAttribute instead and
+ // run the Attributor here once it can be run as an SCC pass.
+
+ // Helper to check the argument \p ArgNo at all call sites of \p F for
+ // a GTId.
+ auto CallArgOpIsGTId = [&](Function &F, unsigned ArgNo, CallInst &RefCI) {
+ if (!F.hasLocalLinkage())
+ return false;
+ for (Use &U : F.uses()) {
+ if (CallInst *CI = getCallIfRegularCall(U)) {
+ Value *ArgOp = CI->getArgOperand(ArgNo);
+ if (CI == &RefCI || GTIdArgs.count(ArgOp) ||
+ getCallIfRegularCall(
+ *ArgOp, &OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num]))
+ continue;
+ }
+ return false;
+ }
+ return true;
+ };
+
+ // Helper to identify uses of a GTId as GTId arguments.
+ auto AddUserArgs = [&](Value &GTId) {
+ for (Use &U : GTId.uses())
+ if (CallInst *CI = dyn_cast<CallInst>(U.getUser()))
+ if (CI->isArgOperand(&U))
+ if (Function *Callee = CI->getCalledFunction())
+ if (CallArgOpIsGTId(*Callee, U.getOperandNo(), *CI))
+ GTIdArgs.insert(Callee->getArg(U.getOperandNo()));
+ };
+
+ // The argument users of __kmpc_global_thread_num calls are GTIds.
+ OMPInformationCache::RuntimeFunctionInfo &GlobThreadNumRFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_global_thread_num];
+
+ GlobThreadNumRFI.foreachUse(SCC, [&](Use &U, Function &F) {
+ if (CallInst *CI = getCallIfRegularCall(U, &GlobThreadNumRFI))
+ AddUserArgs(*CI);
+ return false;
+ });
+
+ // Transitively search for more arguments by looking at the users of the
+ // ones we know already. During the search the GTIdArgs vector is extended
+    // so we cannot cache the size nor use a range-based for loop.
+ for (unsigned u = 0; u < GTIdArgs.size(); ++u)
+ AddUserArgs(*GTIdArgs[u]);
+ }
+
+ /// Kernel (=GPU) optimizations and utility functions
+ ///
+ ///{{
+
+ /// Check if \p F is a kernel, hence entry point for target offloading.
+ bool isKernel(Function &F) { return OMPInfoCache.Kernels.count(&F); }
+
+ /// Cache to remember the unique kernel for a function.
+ DenseMap<Function *, Optional<Kernel>> UniqueKernelMap;
+
+ /// Find the unique kernel that will execute \p F, if any.
+ Kernel getUniqueKernelFor(Function &F);
+
+ /// Find the unique kernel that will execute \p I, if any.
+ Kernel getUniqueKernelFor(Instruction &I) {
+ return getUniqueKernelFor(*I.getFunction());
+ }
+
+  /// Rewrite the device (=GPU) code state machine created in non-SPMD mode in
+  /// the cases where we can avoid taking the address of a function.
+ bool rewriteDeviceCodeStateMachine();
+
+ ///
+ ///}}
+
+ /// Emit a remark generically
+ ///
+ /// This template function can be used to generically emit a remark. The
+ /// RemarkKind should be one of the following:
+ /// - OptimizationRemark to indicate a successful optimization attempt
+ /// - OptimizationRemarkMissed to report a failed optimization attempt
+ /// - OptimizationRemarkAnalysis to provide additional information about an
+ /// optimization attempt
+ ///
+ /// The remark is built using a callback function provided by the caller that
+ /// takes a RemarkKind as input and returns a RemarkKind.
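+  ///
+  /// E.g. (illustrative, with a hypothetical remark name):
+  ///   emitRemark<OptimizationRemarkMissed>(
+  ///       CI, "OpenMPExampleRemark",
+  ///       [](OptimizationRemarkMissed ORM) { return ORM << "not optimized"; });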
+ template <typename RemarkKind,
+ typename RemarkCallBack = function_ref<RemarkKind(RemarkKind &&)>>
+ void emitRemark(Instruction *Inst, StringRef RemarkName,
+ RemarkCallBack &&RemarkCB) const {
+ Function *F = Inst->getParent()->getParent();
+ auto &ORE = OREGetter(F);
+
+ ORE.emit(
+ [&]() { return RemarkCB(RemarkKind(DEBUG_TYPE, RemarkName, Inst)); });
+ }
+
+  /// Emit a remark on a function. Since only OptimizationRemark supports this,
+  /// it can't be made generic.
+ void
+ emitRemarkOnFunction(Function *F, StringRef RemarkName,
+ function_ref<OptimizationRemark(OptimizationRemark &&)>
+ &&RemarkCB) const {
+ auto &ORE = OREGetter(F);
+
+ ORE.emit([&]() {
+ return RemarkCB(OptimizationRemark(DEBUG_TYPE, RemarkName, F));
+ });
+ }
+
+ /// The underlying module.
+ Module &M;
+
+ /// The SCC we are operating on.
+ SmallVectorImpl<Function *> &SCC;
+
+  /// Callback to update the call graph; the first argument is a removed call,
+ /// the second an optional replacement call.
+ CallGraphUpdater &CGUpdater;
+
+ /// Callback to get an OptimizationRemarkEmitter from a Function *
+ OptimizationRemarkGetter OREGetter;
+
+  /// OpenMP-specific information cache. Also used for Attributor runs.
+ OMPInformationCache &OMPInfoCache;
+
+ /// Attributor instance.
+ Attributor &A;
+
+ /// Helper function to run Attributor on SCC.
+ bool runAttributor() {
+ if (SCC.empty())
+ return false;
+
+ registerAAs();
+
+ ChangeStatus Changed = A.run();
+
+ LLVM_DEBUG(dbgs() << "[Attributor] Done with " << SCC.size()
+ << " functions, result: " << Changed << ".\n");
+
+ return Changed == ChangeStatus::CHANGED;
+ }
+
+ /// Populate the Attributor with abstract attribute opportunities in the
+ /// function.
+ void registerAAs() {
+ for (Function *F : SCC) {
+ if (F->isDeclaration())
+ continue;
+
+ A.getOrCreateAAFor<AAICVTracker>(IRPosition::function(*F));
+ }
+ }
+};
+
+Kernel OpenMPOpt::getUniqueKernelFor(Function &F) {
+ if (!OMPInfoCache.ModuleSlice.count(&F))
+ return nullptr;
+
+ // Use a scope to keep the lifetime of the CachedKernel short.
+ {
+ Optional<Kernel> &CachedKernel = UniqueKernelMap[&F];
+ if (CachedKernel)
+ return *CachedKernel;
+
+ // TODO: We should use an AA to create an (optimistic and callback
+ // call-aware) call graph. For now we stick to simple patterns that
+ // are less powerful, basically the worst fixpoint.
+ if (isKernel(F)) {
+ CachedKernel = Kernel(&F);
+ return *CachedKernel;
+ }
+
+ CachedKernel = nullptr;
+ if (!F.hasLocalLinkage())
+ return nullptr;
+ }
+
+ auto GetUniqueKernelForUse = [&](const Use &U) -> Kernel {
+ if (auto *Cmp = dyn_cast<ICmpInst>(U.getUser())) {
+ // Allow use in equality comparisons.
+ if (Cmp->isEquality())
+ return getUniqueKernelFor(*Cmp);
+ return nullptr;
+ }
+ if (auto *CB = dyn_cast<CallBase>(U.getUser())) {
+ // Allow direct calls.
+ if (CB->isCallee(&U))
+ return getUniqueKernelFor(*CB);
+ // Allow the use in __kmpc_kernel_prepare_parallel calls.
+ if (Function *Callee = CB->getCalledFunction())
+ if (Callee->getName() == "__kmpc_kernel_prepare_parallel")
+ return getUniqueKernelFor(*CB);
+ return nullptr;
+ }
+ // Disallow every other use.
+ return nullptr;
+ };
+
+ // TODO: In the future we want to track more than just a unique kernel.
+ SmallPtrSet<Kernel, 2> PotentialKernels;
+ foreachUse(F, [&](const Use &U) {
+ PotentialKernels.insert(GetUniqueKernelForUse(U));
+ });
+
+ Kernel K = nullptr;
+ if (PotentialKernels.size() == 1)
+ K = *PotentialKernels.begin();
+
+ // Cache the result.
+ UniqueKernelMap[&F] = K;
+
+ return K;
+}
+
+bool OpenMPOpt::rewriteDeviceCodeStateMachine() {
+ OMPInformationCache::RuntimeFunctionInfo &KernelPrepareParallelRFI =
+ OMPInfoCache.RFIs[OMPRTL___kmpc_kernel_prepare_parallel];
+
+ bool Changed = false;
+ if (!KernelPrepareParallelRFI)
+ return Changed;
+
+ for (Function *F : SCC) {
+
+    // Check if the function is used in a __kmpc_kernel_prepare_parallel call at
+ // all.
+ bool UnknownUse = false;
+ bool KernelPrepareUse = false;
+ unsigned NumDirectCalls = 0;
+
+ SmallVector<Use *, 2> ToBeReplacedStateMachineUses;
+ foreachUse(*F, [&](Use &U) {
+ if (auto *CB = dyn_cast<CallBase>(U.getUser()))
+ if (CB->isCallee(&U)) {
+ ++NumDirectCalls;
+ return;
+ }
+
+ if (isa<ICmpInst>(U.getUser())) {
+ ToBeReplacedStateMachineUses.push_back(&U);
+ return;
+ }
+ if (!KernelPrepareUse && OpenMPOpt::getCallIfRegularCall(
+ *U.getUser(), &KernelPrepareParallelRFI)) {
+ KernelPrepareUse = true;
+ ToBeReplacedStateMachineUses.push_back(&U);
+ return;
+ }
+ UnknownUse = true;
+ });
+
+ // Do not emit a remark if we haven't seen a __kmpc_kernel_prepare_parallel
+ // use.
+ if (!KernelPrepareUse)
+ continue;
+
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Found a parallel region that is called in a target "
+ "region but not part of a combined target construct nor "
+                       "nested inside a target construct without intermediate "
+ "code. This can lead to excessive register usage for "
+ "unrelated target regions in the same translation unit "
+ "due to spurious call edges assumed by ptxas.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+ }
+
+ // If this ever hits, we should investigate.
+ // TODO: Checking the number of uses is not a necessary restriction and
+ // should be lifted.
+ if (UnknownUse || NumDirectCalls != 1 ||
+ ToBeReplacedStateMachineUses.size() != 2) {
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region is used in "
+ << (UnknownUse ? "unknown" : "unexpected")
+ << " ways; will not attempt to rewrite the state machine.";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD", Remark);
+ }
+ continue;
+ }
+
+ // Even if we have __kmpc_kernel_prepare_parallel calls, we (for now) give
+ // up if the function is not called from a unique kernel.
+ Kernel K = getUniqueKernelFor(*F);
+ if (!K) {
+ {
+ auto Remark = [&](OptimizationRemark OR) {
+ return OR << "Parallel region is not known to be called from a "
+                       "unique single target region (maybe the surrounding "
+                       "function has external linkage); will not attempt to "
+ "rewrite the state machine use.";
+ };
+      emitRemarkOnFunction(F, "OpenMPParallelRegionInMultipleKernels",
+ Remark);
+ }
+ continue;
+ }
+
+ // We now know F is a parallel body function called only from the kernel K.
+ // We also identified the state machine uses in which we replace the
+ // function pointer by a new global symbol for identification purposes. This
+ // ensures only direct calls to the function are left.
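+    // E.g. (illustrative IR sketch, with @parallel_body standing in for the
+    // actual outlined function): a state machine check such as
+    //   icmp eq i8* %work_fn, bitcast (void ()* @parallel_body to i8*)
+    // and the argument passed to __kmpc_kernel_prepare_parallel are rewritten
+    // to use the new private global @parallel_body.ID instead.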
+
+ {
+      auto RemarkParallelRegion = [&](OptimizationRemark OR) {
+ return OR << "Specialize parallel region that is only reached from a "
+ "single target region to avoid spurious call edges and "
+ "excessive register usage in other target regions. "
+ "(parallel region ID: "
+ << ore::NV("OpenMPParallelRegion", F->getName())
+ << ", kernel ID: "
+ << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+ };
+ emitRemarkOnFunction(F, "OpenMPParallelRegionInNonSPMD",
+                           RemarkParallelRegion);
+ auto RemarkKernel = [&](OptimizationRemark OR) {
+ return OR << "Target region containing the parallel region that is "
+ "specialized. (parallel region ID: "
+ << ore::NV("OpenMPParallelRegion", F->getName())
+ << ", kernel ID: "
+ << ore::NV("OpenMPTargetRegion", K->getName()) << ")";
+ };
+ emitRemarkOnFunction(K, "OpenMPParallelRegionInNonSPMD", RemarkKernel);
+ }
+
+ Module &M = *F->getParent();
+ Type *Int8Ty = Type::getInt8Ty(M.getContext());
+
+ auto *ID = new GlobalVariable(
+ M, Int8Ty, /* isConstant */ true, GlobalValue::PrivateLinkage,
+ UndefValue::get(Int8Ty), F->getName() + ".ID");
+
+ for (Use *U : ToBeReplacedStateMachineUses)
+ U->set(ConstantExpr::getBitCast(ID, U->get()->getType()));
+
+ ++NumOpenMPParallelRegionsReplacedInGPUStateMachine;
+
+ Changed = true;
+ }
+
+ return Changed;
+}
+
+/// Abstract Attribute for tracking ICV values.
+struct AAICVTracker : public StateWrapper<BooleanState, AbstractAttribute> {
+ using Base = StateWrapper<BooleanState, AbstractAttribute>;
+ AAICVTracker(const IRPosition &IRP, Attributor &A) : Base(IRP) {}
+
+ /// Returns true if value is assumed to be tracked.
+ bool isAssumedTracked() const { return getAssumed(); }
+
+ /// Returns true if value is known to be tracked.
+ bool isKnownTracked() const { return getAssumed(); }
+
+  /// Create an abstract attribute view for the position \p IRP.
+ static AAICVTracker &createForPosition(const IRPosition &IRP, Attributor &A);
+
+ /// Return the value with which \p I can be replaced for specific \p ICV.
+ virtual Value *getReplacementValue(InternalControlVar ICV,
+ const Instruction *I, Attributor &A) = 0;
+
+ /// See AbstractAttribute::getName()
+ const std::string getName() const override { return "AAICVTracker"; }
+
+ /// See AbstractAttribute::getIdAddr()
+ const char *getIdAddr() const override { return &ID; }
+
+ /// This function should return true if the type of the \p AA is AAICVTracker
+ static bool classof(const AbstractAttribute *AA) {
+ return (AA->getIdAddr() == &ID);
+ }
+
+ static const char ID;
+};
+
+struct AAICVTrackerFunction : public AAICVTracker {
+ AAICVTrackerFunction(const IRPosition &IRP, Attributor &A)
+ : AAICVTracker(IRP, A) {}
+
+ // FIXME: come up with better string.
+ const std::string getAsStr() const override { return "ICVTracker"; }
+
+ // FIXME: come up with some stats.
+ void trackStatistics() const override {}
+
+ /// TODO: decide whether to deduplicate here, or use current
+ /// deduplicateRuntimeCalls function.
+ ChangeStatus manifest(Attributor &A) override {
+ ChangeStatus Changed = ChangeStatus::UNCHANGED;
+
+ for (InternalControlVar &ICV : TrackableICVs)
+ if (deduplicateICVGetters(ICV, A))
+ Changed = ChangeStatus::CHANGED;
+
+ return Changed;
+ }
+
+ bool deduplicateICVGetters(InternalControlVar &ICV, Attributor &A) {
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ auto &ICVInfo = OMPInfoCache.ICVs[ICV];
+ auto &GetterRFI = OMPInfoCache.RFIs[ICVInfo.Getter];
+
+ bool Changed = false;
+
+ auto ReplaceAndDeleteCB = [&](Use &U, Function &Caller) {
+ CallInst *CI = OpenMPOpt::getCallIfRegularCall(U, &GetterRFI);
+ Instruction *UserI = cast<Instruction>(U.getUser());
+ Value *ReplVal = getReplacementValue(ICV, UserI, A);
+
+ if (!ReplVal || !CI)
+ return false;
+
+ A.removeCallSite(CI);
+ CI->replaceAllUsesWith(ReplVal);
+ CI->eraseFromParent();
+ Changed = true;
+ return true;
+ };
+
+ GetterRFI.foreachUse(ReplaceAndDeleteCB, getAnchorScope());
+ return Changed;
+ }
+
+  // Map of ICVs to their values at specific program points.
+ EnumeratedArray<SmallSetVector<ICVValue, 4>, InternalControlVar,
+ InternalControlVar::ICV___last>
+ ICVValuesMap;
+
+ // Currently only nthreads is being tracked.
+  // This array will only grow with time.
+ InternalControlVar TrackableICVs[1] = {ICV_nthreads};
+
+ ChangeStatus updateImpl(Attributor &A) override {
+ ChangeStatus HasChanged = ChangeStatus::UNCHANGED;
+
+ Function *F = getAnchorScope();
+
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+
+ for (InternalControlVar ICV : TrackableICVs) {
+ auto &SetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Setter];
+
+ auto TrackValues = [&](Use &U, Function &) {
+ CallInst *CI = OpenMPOpt::getCallIfRegularCall(U);
+ if (!CI)
+ return false;
+
+        // FIXME: handle setters with more than one argument.
+ /// Track new value.
+ if (ICVValuesMap[ICV].insert(ICVValue(CI, CI->getArgOperand(0))))
+ HasChanged = ChangeStatus::CHANGED;
+
+ return false;
+ };
+
+ SetterRFI.foreachUse(TrackValues, F);
+ }
+
+ return HasChanged;
+ }
+
+ /// Return the value with which \p I can be replaced for specific \p ICV.
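+  ///
+  /// E.g. (illustrative): after
+  ///   call void @omp_set_num_threads(i32 4)
+  ///   %n = call i32 @omp_get_max_threads()
+  /// the getter can be replaced by the constant 4, provided no other call
+  /// that might change the ICV occurs in between.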
+ Value *getReplacementValue(InternalControlVar ICV, const Instruction *I,
+ Attributor &A) override {
+ const BasicBlock *CurrBB = I->getParent();
+
+ auto &ValuesSet = ICVValuesMap[ICV];
+ auto &OMPInfoCache = static_cast<OMPInformationCache &>(A.getInfoCache());
+ auto &GetterRFI = OMPInfoCache.RFIs[OMPInfoCache.ICVs[ICV].Getter];
+
+ for (const auto &ICVVal : ValuesSet) {
+ if (CurrBB == ICVVal.Inst->getParent()) {
+ if (!ICVVal.Inst->comesBefore(I))
+ continue;
+
+        // Both instructions are in the same BB and at \p I we know the ICV
+ // value.
+ while (I != ICVVal.Inst) {
+          // We don't yet know if a call might update an ICV.
+ // TODO: check callsite AA for value.
+ if (const auto *CB = dyn_cast<CallBase>(I))
+ if (CB->getCalledFunction() != GetterRFI.Declaration)
+ return nullptr;
+
+ I = I->getPrevNode();
+ }
+
+ // No call in between, return the value.
+ return ICVVal.TrackedValue;
+ }
+ }
+
+ // No value was tracked.
+ return nullptr;
+ }
+};
+} // namespace
+
+const char AAICVTracker::ID = 0;
+
+AAICVTracker &AAICVTracker::createForPosition(const IRPosition &IRP,
+ Attributor &A) {
+ AAICVTracker *AA = nullptr;
+ switch (IRP.getPositionKind()) {
+ case IRPosition::IRP_INVALID:
+ case IRPosition::IRP_FLOAT:
+ case IRPosition::IRP_ARGUMENT:
+ case IRPosition::IRP_RETURNED:
+ case IRPosition::IRP_CALL_SITE_RETURNED:
+ case IRPosition::IRP_CALL_SITE_ARGUMENT:
+ case IRPosition::IRP_CALL_SITE:
+ llvm_unreachable("ICVTracker can only be created for function position!");
+ case IRPosition::IRP_FUNCTION:
+ AA = new (A.Allocator) AAICVTrackerFunction(IRP, A);
+ break;
+ }
+
+ return *AA;
+}
+
+PreservedAnalyses OpenMPOptPass::run(LazyCallGraph::SCC &C,
+ CGSCCAnalysisManager &AM,
+ LazyCallGraph &CG, CGSCCUpdateResult &UR) {
+ if (!containsOpenMP(*C.begin()->getFunction().getParent(), OMPInModule))
+ return PreservedAnalyses::all();
+
+ if (DisableOpenMPOptimizations)
+ return PreservedAnalyses::all();
+
+ SmallVector<Function *, 16> SCC;
+ for (LazyCallGraph::Node &N : C)
+ SCC.push_back(&N.getFunction());
+
+ if (SCC.empty())
+ return PreservedAnalyses::all();
+
+ FunctionAnalysisManager &FAM =
+ AM.getResult<FunctionAnalysisManagerCGSCCProxy>(C, CG).getManager();
+
+ AnalysisGetter AG(FAM);
+
+ auto OREGetter = [&FAM](Function *F) -> OptimizationRemarkEmitter & {
+ return FAM.getResult<OptimizationRemarkEmitterAnalysis>(*F);
+ };
+
+ CallGraphUpdater CGUpdater;
+ CGUpdater.initialize(CG, C, AM, UR);
+
+ SetVector<Function *> Functions(SCC.begin(), SCC.end());
+ BumpPtrAllocator Allocator;
+ OMPInformationCache InfoCache(*(Functions.back()->getParent()), AG, Allocator,
+ /*CGSCC*/ Functions, OMPInModule.getKernels());
+
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ // TODO: Compute the module slice we are allowed to look at.
+ OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
+ bool Changed = OMPOpt.run();
+ if (Changed)
+ return PreservedAnalyses::none();
+
+ return PreservedAnalyses::all();
+}
+
+namespace {
+
+struct OpenMPOptLegacyPass : public CallGraphSCCPass {
+ CallGraphUpdater CGUpdater;
+ OpenMPInModule OMPInModule;
+ static char ID;
+
+ OpenMPOptLegacyPass() : CallGraphSCCPass(ID) {
+ initializeOpenMPOptLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ CallGraphSCCPass::getAnalysisUsage(AU);
+ }
+
+ bool doInitialization(CallGraph &CG) override {
+ // Disable the pass if there is no OpenMP (runtime call) in the module.
+ containsOpenMP(CG.getModule(), OMPInModule);
+ return false;
+ }
+
+ bool runOnSCC(CallGraphSCC &CGSCC) override {
+ if (!containsOpenMP(CGSCC.getCallGraph().getModule(), OMPInModule))
+ return false;
+ if (DisableOpenMPOptimizations || skipSCC(CGSCC))
+ return false;
+
+ SmallVector<Function *, 16> SCC;
+ for (CallGraphNode *CGN : CGSCC)
+ if (Function *Fn = CGN->getFunction())
+ if (!Fn->isDeclaration())
+ SCC.push_back(Fn);
+
+ if (SCC.empty())
+ return false;
+
+ CallGraph &CG = getAnalysis<CallGraphWrapperPass>().getCallGraph();
+ CGUpdater.initialize(CG, CGSCC);
+
+ // Maintain a map of functions to avoid rebuilding the ORE
+ DenseMap<Function *, std::unique_ptr<OptimizationRemarkEmitter>> OREMap;
+ auto OREGetter = [&OREMap](Function *F) -> OptimizationRemarkEmitter & {
+ std::unique_ptr<OptimizationRemarkEmitter> &ORE = OREMap[F];
+ if (!ORE)
+ ORE = std::make_unique<OptimizationRemarkEmitter>(F);
+ return *ORE;
+ };
+
+ AnalysisGetter AG;
+ SetVector<Function *> Functions(SCC.begin(), SCC.end());
+ BumpPtrAllocator Allocator;
+ OMPInformationCache InfoCache(
+ *(Functions.back()->getParent()), AG, Allocator,
+ /*CGSCC*/ Functions, OMPInModule.getKernels());
+
+ Attributor A(Functions, InfoCache, CGUpdater);
+
+ // TODO: Compute the module slice we are allowed to look at.
+ OpenMPOpt OMPOpt(SCC, CGUpdater, OREGetter, InfoCache, A);
+ return OMPOpt.run();
+ }
+
+ bool doFinalization(CallGraph &CG) override { return CGUpdater.finalize(); }
+};
+
+} // end anonymous namespace
+
+void OpenMPInModule::identifyKernels(Module &M) {
+
+ NamedMDNode *MD = M.getOrInsertNamedMetadata("nvvm.annotations");
+ if (!MD)
+ return;
+
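+  // Kernel entries in "nvvm.annotations" look like (illustrative)
+  //   !{void ()* @kernel_fn, !"kernel", i32 1}
+  // i.e. operand 0 is the kernel function and operand 1 the "kernel" tag.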
+ for (auto *Op : MD->operands()) {
+ if (Op->getNumOperands() < 2)
+ continue;
+ MDString *KindID = dyn_cast<MDString>(Op->getOperand(1));
+ if (!KindID || KindID->getString() != "kernel")
+ continue;
+
+ Function *KernelFn =
+ mdconst::dyn_extract_or_null<Function>(Op->getOperand(0));
+ if (!KernelFn)
+ continue;
+
+ ++NumOpenMPTargetRegionKernels;
+
+ Kernels.insert(KernelFn);
+ }
+}
+
+bool llvm::omp::containsOpenMP(Module &M, OpenMPInModule &OMPInModule) {
+ if (OMPInModule.isKnown())
+ return OMPInModule;
+
+ // MSVC doesn't like long if-else chains for some reason and instead just
+  // issues an error. Work around it.
+ do {
+#define OMP_RTL(_Enum, _Name, ...) \
+ if (M.getFunction(_Name)) { \
+ OMPInModule = true; \
+ break; \
+ }
+#include "llvm/Frontend/OpenMP/OMPKinds.def"
+ } while (false);
+
+ // Identify kernels once. TODO: We should split the OMPInformationCache into a
+ // module and an SCC part. The kernel information, among other things, could
+ // go into the module part.
+ if (OMPInModule.isKnown() && OMPInModule) {
+ OMPInModule.identifyKernels(M);
+ return true;
+ }
+
+ return OMPInModule = false;
+}
+
+char OpenMPOptLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(OpenMPOptLegacyPass, "openmpopt",
+ "OpenMP specific optimizations", false, false)
+INITIALIZE_PASS_DEPENDENCY(CallGraphWrapperPass)
+INITIALIZE_PASS_END(OpenMPOptLegacyPass, "openmpopt",
+ "OpenMP specific optimizations", false, false)
+
+Pass *llvm::createOpenMPOptLegacyPass() { return new OpenMPOptLegacyPass(); }
diff --git a/llvm/lib/Transforms/IPO/PartialInlining.cpp b/llvm/lib/Transforms/IPO/PartialInlining.cpp
index cd3701e90308..5d863f1330a4 100644
--- a/llvm/lib/Transforms/IPO/PartialInlining.cpp
+++ b/llvm/lib/Transforms/IPO/PartialInlining.cpp
@@ -30,7 +30,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
@@ -199,13 +198,14 @@ struct FunctionOutliningMultiRegionInfo {
struct PartialInlinerImpl {
PartialInlinerImpl(
- std::function<AssumptionCache &(Function &)> *GetAC,
+ function_ref<AssumptionCache &(Function &)> GetAC,
function_ref<AssumptionCache *(Function &)> LookupAC,
- std::function<TargetTransformInfo &(Function &)> *GTTI,
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> GBFI,
- ProfileSummaryInfo *ProfSI)
+ function_ref<TargetTransformInfo &(Function &)> GTTI,
+ function_ref<const TargetLibraryInfo &(Function &)> GTLI,
+ ProfileSummaryInfo &ProfSI,
+ function_ref<BlockFrequencyInfo &(Function &)> GBFI = nullptr)
: GetAssumptionCache(GetAC), LookupAssumptionCache(LookupAC),
- GetTTI(GTTI), GetBFI(GBFI), PSI(ProfSI) {}
+ GetTTI(GTTI), GetBFI(GBFI), GetTLI(GTLI), PSI(ProfSI) {}
bool run(Module &M);
// Main part of the transformation that calls helper functions to find
@@ -270,11 +270,12 @@ struct PartialInlinerImpl {
private:
int NumPartialInlining = 0;
- std::function<AssumptionCache &(Function &)> *GetAssumptionCache;
+ function_ref<AssumptionCache &(Function &)> GetAssumptionCache;
function_ref<AssumptionCache *(Function &)> LookupAssumptionCache;
- std::function<TargetTransformInfo &(Function &)> *GetTTI;
- Optional<function_ref<BlockFrequencyInfo &(Function &)>> GetBFI;
- ProfileSummaryInfo *PSI;
+ function_ref<TargetTransformInfo &(Function &)> GetTTI;
+ function_ref<BlockFrequencyInfo &(Function &)> GetBFI;
+ function_ref<const TargetLibraryInfo &(Function &)> GetTLI;
+ ProfileSummaryInfo &PSI;
 // Return the frequency of the OutliningBB relative to F's entry point.
// The result is no larger than 1 and is represented using BP.
@@ -282,9 +283,9 @@ private:
// edges from the guarding entry blocks).
BranchProbability getOutliningCallBBRelativeFreq(FunctionCloner &Cloner);
- // Return true if the callee of CS should be partially inlined with
+ // Return true if the callee of CB should be partially inlined with
// profit.
- bool shouldPartialInline(CallSite CS, FunctionCloner &Cloner,
+ bool shouldPartialInline(CallBase &CB, FunctionCloner &Cloner,
BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE);
@@ -303,26 +304,22 @@ private:
NumPartialInlining >= MaxNumPartialInlining);
}
- static CallSite getCallSite(User *U) {
- CallSite CS;
- if (CallInst *CI = dyn_cast<CallInst>(U))
- CS = CallSite(CI);
- else if (InvokeInst *II = dyn_cast<InvokeInst>(U))
- CS = CallSite(II);
- else
- llvm_unreachable("All uses must be calls");
- return CS;
+ static CallBase *getSupportedCallBase(User *U) {
+ if (isa<CallInst>(U) || isa<InvokeInst>(U))
+ return cast<CallBase>(U);
+ llvm_unreachable("All uses must be calls");
+ return nullptr;
}
- static CallSite getOneCallSiteTo(Function *F) {
+ static CallBase *getOneCallSiteTo(Function *F) {
User *User = *F->user_begin();
- return getCallSite(User);
+ return getSupportedCallBase(User);
}
std::tuple<DebugLoc, BasicBlock *> getOneDebugLoc(Function *F) {
- CallSite CS = getOneCallSiteTo(F);
- DebugLoc DLoc = CS.getInstruction()->getDebugLoc();
- BasicBlock *Block = CS.getParent();
+ CallBase *CB = getOneCallSiteTo(F);
+ DebugLoc DLoc = CB->getDebugLoc();
+ BasicBlock *Block = CB->getParent();
return std::make_tuple(DLoc, Block);
}
@@ -355,6 +352,7 @@ struct PartialInlinerLegacyPass : public ModulePass {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
bool runOnModule(Module &M) override {
@@ -364,11 +362,10 @@ struct PartialInlinerLegacyPass : public ModulePass {
AssumptionCacheTracker *ACT = &getAnalysis<AssumptionCacheTracker>();
TargetTransformInfoWrapperPass *TTIWP =
&getAnalysis<TargetTransformInfoWrapperPass>();
- ProfileSummaryInfo *PSI =
- &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
+ ProfileSummaryInfo &PSI =
+ getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&ACT](Function &F) -> AssumptionCache & {
+ auto GetAssumptionCache = [&ACT](Function &F) -> AssumptionCache & {
return ACT->getAssumptionCache(F);
};
@@ -376,13 +373,16 @@ struct PartialInlinerLegacyPass : public ModulePass {
return ACT->lookupAssumptionCache(F);
};
- std::function<TargetTransformInfo &(Function &)> GetTTI =
- [&TTIWP](Function &F) -> TargetTransformInfo & {
+ auto GetTTI = [&TTIWP](Function &F) -> TargetTransformInfo & {
return TTIWP->getTTI(F);
};
- return PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache,
- &GetTTI, NoneType::None, PSI)
+ auto GetTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
+
+ return PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
+ GetTLI, PSI)
.run(M);
}
};
@@ -403,10 +403,10 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
ScopedBFI.reset(new BlockFrequencyInfo(*F, BPI, LI));
BFI = ScopedBFI.get();
} else
- BFI = &(*GetBFI)(*F);
+ BFI = &(GetBFI(*F));
// Return if we don't have profiling information.
- if (!PSI->hasInstrumentationProfile())
+ if (!PSI.hasInstrumentationProfile())
return std::unique_ptr<FunctionOutliningMultiRegionInfo>();
std::unique_ptr<FunctionOutliningMultiRegionInfo> OutliningInfo =
@@ -479,7 +479,7 @@ PartialInlinerImpl::computeOutliningColdRegionsInfo(Function *F,
// Only consider regions with predecessor blocks that are considered
// not-cold (default: part of the top 99.99% of all block counters)
// AND greater than our minimum block execution count (default: 100).
- if (PSI->isColdBlock(thisBB, BFI) ||
+ if (PSI.isColdBlock(thisBB, BFI) ||
BBProfileCount(thisBB) < MinBlockCounterExecution)
continue;
for (auto SI = succ_begin(thisBB); SI != succ_end(thisBB); ++SI) {
@@ -759,31 +759,28 @@ PartialInlinerImpl::getOutliningCallBBRelativeFreq(FunctionCloner &Cloner) {
}
bool PartialInlinerImpl::shouldPartialInline(
- CallSite CS, FunctionCloner &Cloner,
- BlockFrequency WeightedOutliningRcost,
+ CallBase &CB, FunctionCloner &Cloner, BlockFrequency WeightedOutliningRcost,
OptimizationRemarkEmitter &ORE) {
using namespace ore;
- Instruction *Call = CS.getInstruction();
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CB.getCalledFunction();
assert(Callee == Cloner.ClonedFunc);
if (SkipCostAnalysis)
- return isInlineViable(*Callee);
+ return isInlineViable(*Callee).isSuccess();
- Function *Caller = CS.getCaller();
- auto &CalleeTTI = (*GetTTI)(*Callee);
+ Function *Caller = CB.getCaller();
+ auto &CalleeTTI = GetTTI(*Callee);
bool RemarksEnabled =
Callee->getContext().getDiagHandlerPtr()->isMissedOptRemarkEnabled(
DEBUG_TYPE);
- assert(Call && "invalid callsite for partial inline");
- InlineCost IC = getInlineCost(cast<CallBase>(*Call), getInlineParams(),
- CalleeTTI, *GetAssumptionCache, GetBFI, PSI,
- RemarksEnabled ? &ORE : nullptr);
+ InlineCost IC =
+ getInlineCost(CB, getInlineParams(), CalleeTTI, GetAssumptionCache,
+ GetTLI, GetBFI, &PSI, RemarksEnabled ? &ORE : nullptr);
if (IC.isAlways()) {
ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", Call)
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "AlwaysInline", &CB)
<< NV("Callee", Cloner.OrigFunc)
<< " should always be fully inlined, not partially";
});
@@ -792,7 +789,7 @@ bool PartialInlinerImpl::shouldPartialInline(
if (IC.isNever()) {
ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", Call)
+ return OptimizationRemarkMissed(DEBUG_TYPE, "NeverInline", &CB)
<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
<< NV("Caller", Caller)
<< " because it should never be inlined (cost=never)";
@@ -802,7 +799,7 @@ bool PartialInlinerImpl::shouldPartialInline(
if (!IC) {
ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", Call)
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "TooCostly", &CB)
<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
<< NV("Caller", Caller) << " because too costly to inline (cost="
<< NV("Cost", IC.getCost()) << ", threshold="
@@ -813,14 +810,14 @@ bool PartialInlinerImpl::shouldPartialInline(
const DataLayout &DL = Caller->getParent()->getDataLayout();
// The savings of eliminating the call:
- int NonWeightedSavings = getCallsiteCost(cast<CallBase>(*Call), DL);
+ int NonWeightedSavings = getCallsiteCost(CB, DL);
BlockFrequency NormWeightedSavings(NonWeightedSavings);
// Weighted saving is smaller than weighted cost, return false
if (NormWeightedSavings < WeightedOutliningRcost) {
ORE.emit([&]() {
return OptimizationRemarkAnalysis(DEBUG_TYPE, "OutliningCallcostTooHigh",
- Call)
+ &CB)
<< NV("Callee", Cloner.OrigFunc) << " not partially inlined into "
<< NV("Caller", Caller) << " runtime overhead (overhead="
<< NV("Overhead", (unsigned)WeightedOutliningRcost.getFrequency())
@@ -834,7 +831,7 @@ bool PartialInlinerImpl::shouldPartialInline(
}
ORE.emit([&]() {
- return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", Call)
+ return OptimizationRemarkAnalysis(DEBUG_TYPE, "CanBePartiallyInlined", &CB)
<< NV("Callee", Cloner.OrigFunc) << " can be partially inlined into "
<< NV("Caller", Caller) << " with cost=" << NV("Cost", IC.getCost())
<< " (threshold="
@@ -941,20 +938,20 @@ void PartialInlinerImpl::computeCallsiteToProfCountMap(
CurrentCallerBFI = TempBFI.get();
} else {
// New pass manager:
- CurrentCallerBFI = &(*GetBFI)(*Caller);
+ CurrentCallerBFI = &(GetBFI(*Caller));
}
};
for (User *User : Users) {
- CallSite CS = getCallSite(User);
- Function *Caller = CS.getCaller();
+ CallBase *CB = getSupportedCallBase(User);
+ Function *Caller = CB->getCaller();
if (CurrentCaller != Caller) {
CurrentCaller = Caller;
ComputeCurrBFI(Caller);
} else {
assert(CurrentCallerBFI && "CallerBFI is not set");
}
- BasicBlock *CallBB = CS.getInstruction()->getParent();
+ BasicBlock *CallBB = CB->getParent();
auto Count = CurrentCallerBFI->getBlockProfileCount(CallBB);
if (Count)
CallSiteToProfCountMap[User] = *Count;
@@ -1155,8 +1152,8 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
Function *OutlinedFunc = CE.extractCodeRegion(CEAC);
if (OutlinedFunc) {
- CallSite OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
- BasicBlock *OutliningCallBB = OCS.getInstruction()->getParent();
+ CallBase *OCS = PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc);
+ BasicBlock *OutliningCallBB = OCS->getParent();
assert(OutliningCallBB->getParent() == ClonedFunc);
OutlinedFunctions.push_back(std::make_pair(OutlinedFunc,OutliningCallBB));
NumColdRegionsOutlined++;
@@ -1164,7 +1161,7 @@ bool PartialInlinerImpl::FunctionCloner::doMultiRegionFunctionOutlining() {
if (MarkOutlinedColdCC) {
OutlinedFunc->setCallingConv(CallingConv::Cold);
- OCS.setCallingConv(CallingConv::Cold);
+ OCS->setCallingConv(CallingConv::Cold);
}
} else
ORE.emit([&]() {
@@ -1224,7 +1221,6 @@ PartialInlinerImpl::FunctionCloner::doSingleRegionFunctionOutlining() {
if (OutlinedFunc) {
BasicBlock *OutliningCallBB =
PartialInlinerImpl::getOneCallSiteTo(OutlinedFunc)
- .getInstruction()
->getParent();
assert(OutliningCallBB->getParent() == ClonedFunc);
OutlinedFunctions.push_back(std::make_pair(OutlinedFunc, OutliningCallBB));
@@ -1266,7 +1262,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
if (F->hasFnAttribute(Attribute::NoInline))
return {false, nullptr};
- if (PSI->isFunctionEntryCold(F))
+ if (PSI.isFunctionEntryCold(F))
return {false, nullptr};
if (F->users().empty())
@@ -1276,7 +1272,7 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
// Only try to outline cold regions if we have a profile summary, which
// implies we have profiling information.
- if (PSI->hasProfileSummary() && F->hasProfileData() &&
+ if (PSI.hasProfileSummary() && F->hasProfileData() &&
!DisableMultiRegionPartialInline) {
std::unique_ptr<FunctionOutliningMultiRegionInfo> OMRI =
computeOutliningColdRegionsInfo(F, ORE);
@@ -1285,8 +1281,8 @@ std::pair<bool, Function *> PartialInlinerImpl::unswitchFunction(Function *F) {
#ifndef NDEBUG
if (TracePartialInlining) {
- dbgs() << "HotCountThreshold = " << PSI->getHotCountThreshold() << "\n";
- dbgs() << "ColdCountThreshold = " << PSI->getColdCountThreshold()
+ dbgs() << "HotCountThreshold = " << PSI.getHotCountThreshold() << "\n";
+ dbgs() << "ColdCountThreshold = " << PSI.getColdCountThreshold()
<< "\n";
}
#endif
@@ -1391,27 +1387,28 @@ bool PartialInlinerImpl::tryPartialInline(FunctionCloner &Cloner) {
bool AnyInline = false;
for (User *User : Users) {
- CallSite CS = getCallSite(User);
+ CallBase *CB = getSupportedCallBase(User);
if (IsLimitReached())
continue;
- OptimizationRemarkEmitter CallerORE(CS.getCaller());
- if (!shouldPartialInline(CS, Cloner, WeightedRcost, CallerORE))
+ OptimizationRemarkEmitter CallerORE(CB->getCaller());
+ if (!shouldPartialInline(*CB, Cloner, WeightedRcost, CallerORE))
continue;
// Construct remark before doing the inlining, as after successful inlining
// the callsite is removed.
- OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CS.getInstruction());
+ OptimizationRemark OR(DEBUG_TYPE, "PartiallyInlined", CB);
OR << ore::NV("Callee", Cloner.OrigFunc) << " partially inlined into "
- << ore::NV("Caller", CS.getCaller());
+ << ore::NV("Caller", CB->getCaller());
- InlineFunctionInfo IFI(nullptr, GetAssumptionCache, PSI);
+ InlineFunctionInfo IFI(nullptr, GetAssumptionCache, &PSI);
// We can only forward varargs when we outlined a single region, else we
// bail on vararg functions.
- if (!InlineFunction(CS, IFI, nullptr, true,
+ if (!InlineFunction(*CB, IFI, nullptr, true,
(Cloner.ClonedOI ? Cloner.OutlinedFunctions.back().first
- : nullptr)))
+ : nullptr))
+ .isSuccess())
continue;
CallerORE.emit(OR);
@@ -1492,6 +1489,7 @@ INITIALIZE_PASS_BEGIN(PartialInlinerLegacyPass, "partial-inliner",
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PartialInlinerLegacyPass, "partial-inliner",
"Partial Inliner", false, false)
@@ -1503,8 +1501,7 @@ PreservedAnalyses PartialInlinerPass::run(Module &M,
ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
- std::function<AssumptionCache &(Function &)> GetAssumptionCache =
- [&FAM](Function &F) -> AssumptionCache & {
+ auto GetAssumptionCache = [&FAM](Function &F) -> AssumptionCache & {
return FAM.getResult<AssumptionAnalysis>(F);
};
@@ -1512,20 +1509,22 @@ PreservedAnalyses PartialInlinerPass::run(Module &M,
return FAM.getCachedResult<AssumptionAnalysis>(F);
};
- std::function<BlockFrequencyInfo &(Function &)> GetBFI =
- [&FAM](Function &F) -> BlockFrequencyInfo & {
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
return FAM.getResult<BlockFrequencyAnalysis>(F);
};
- std::function<TargetTransformInfo &(Function &)> GetTTI =
- [&FAM](Function &F) -> TargetTransformInfo & {
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
return FAM.getResult<TargetIRAnalysis>(F);
};
- ProfileSummaryInfo *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
+ auto GetTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
+
+ ProfileSummaryInfo &PSI = AM.getResult<ProfileSummaryAnalysis>(M);
- if (PartialInlinerImpl(&GetAssumptionCache, LookupAssumptionCache, &GetTTI,
- {GetBFI}, PSI)
+ if (PartialInlinerImpl(GetAssumptionCache, LookupAssumptionCache, GetTTI,
+ GetTLI, PSI, GetBFI)
.run(M))
return PreservedAnalyses::none();
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
index 9c992830879a..d73d42c52074 100644
--- a/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
+++ b/llvm/lib/Transforms/IPO/PassManagerBuilder.cpp
@@ -13,6 +13,7 @@
#include "llvm/Transforms/IPO/PassManagerBuilder.h"
#include "llvm-c/Transforms/PassManagerBuilder.h"
+#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/CFLAndersAliasAnalysis.h"
@@ -46,6 +47,7 @@
#include "llvm/Transforms/Vectorize.h"
#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/SLPVectorizer.h"
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
using namespace llvm;
@@ -98,8 +100,8 @@ static cl::opt<bool>
EnablePerformThinLTO("perform-thinlto", cl::init(false), cl::Hidden,
cl::desc("Enable performing ThinLTO."));
-cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false), cl::Hidden,
- cl::desc("Enable hot-cold splitting pass"));
+cl::opt<bool> EnableHotColdSplit("hot-cold-split", cl::init(false),
+ cl::ZeroOrMore, cl::desc("Enable hot-cold splitting pass"));
static cl::opt<bool> UseLoopVersioningLICM(
"enable-loop-versioning-licm", cl::init(false), cl::Hidden,
@@ -115,7 +117,7 @@ static cl::opt<int> PreInlineThreshold(
"(default = 75)"));
static cl::opt<bool> EnableGVNHoist(
- "enable-gvn-hoist", cl::init(false), cl::Hidden,
+ "enable-gvn-hoist", cl::init(false), cl::ZeroOrMore,
cl::desc("Enable the GVN hoisting pass (default = off)"));
static cl::opt<bool>
@@ -129,7 +131,7 @@ static cl::opt<bool> EnableSimpleLoopUnswitch(
"cleanup passes integrated into the loop pass manager pipeline."));
static cl::opt<bool> EnableGVNSink(
- "enable-gvn-sink", cl::init(false), cl::Hidden,
+ "enable-gvn-sink", cl::init(false), cl::ZeroOrMore,
cl::desc("Enable the GVN sinking pass (default = off)"));
// This option is used in simplifying testing SampleFDO optimizations for
@@ -151,15 +153,29 @@ static cl::opt<bool>
EnableMatrix("enable-matrix", cl::init(false), cl::Hidden,
cl::desc("Enable lowering of the matrix intrinsics"));
+cl::opt<AttributorRunOption> AttributorRun(
+ "attributor-enable", cl::Hidden, cl::init(AttributorRunOption::NONE),
+ cl::desc("Enable the attributor inter-procedural deduction pass."),
+ cl::values(clEnumValN(AttributorRunOption::ALL, "all",
+ "enable all attributor runs"),
+ clEnumValN(AttributorRunOption::MODULE, "module",
+ "enable module-wide attributor runs"),
+ clEnumValN(AttributorRunOption::CGSCC, "cgscc",
+ "enable call graph SCC attributor runs"),
+ clEnumValN(AttributorRunOption::NONE, "none",
+ "disable attributor runs")));
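+
+// E.g. (illustrative): passing -attributor-enable=cgscc to opt enables only
+// the call graph SCC Attributor runs registered by this builder.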
+
+extern cl::opt<bool> EnableKnowledgeRetention;
+
PassManagerBuilder::PassManagerBuilder() {
OptLevel = 2;
SizeLevel = 0;
LibraryInfo = nullptr;
Inliner = nullptr;
DisableUnrollLoops = false;
- SLPVectorize = RunSLPVectorization;
- LoopVectorize = EnableLoopVectorization;
- LoopsInterleaved = EnableLoopInterleaving;
+ SLPVectorize = false;
+ LoopVectorize = true;
+ LoopsInterleaved = true;
RerollLoops = RunLoopRerolling;
NewGVN = RunNewGVN;
LicmMssaOptCap = SetLicmMssaOptCap;
@@ -179,6 +195,7 @@ PassManagerBuilder::PassManagerBuilder() {
PrepareForThinLTO = EnablePrepareForThinLTO;
PerformThinLTO = EnablePerformThinLTO;
DivergentTarget = false;
+ CallGraphProfile = true;
}
PassManagerBuilder::~PassManagerBuilder() {
@@ -187,8 +204,13 @@ PassManagerBuilder::~PassManagerBuilder() {
}
/// Set of global extensions, automatically added as part of the standard set.
-static ManagedStatic<SmallVector<std::pair<PassManagerBuilder::ExtensionPointTy,
- PassManagerBuilder::ExtensionFn>, 8> > GlobalExtensions;
+static ManagedStatic<
+ SmallVector<std::tuple<PassManagerBuilder::ExtensionPointTy,
+ PassManagerBuilder::ExtensionFn,
+ PassManagerBuilder::GlobalExtensionID>,
+ 8>>
+ GlobalExtensions;
+static PassManagerBuilder::GlobalExtensionID GlobalExtensionsCounter;
/// Check if GlobalExtensions is constructed and not empty.
/// Since GlobalExtensions is a managed static, calling 'empty()' will trigger
@@ -197,10 +219,29 @@ static bool GlobalExtensionsNotEmpty() {
return GlobalExtensions.isConstructed() && !GlobalExtensions->empty();
}
-void PassManagerBuilder::addGlobalExtension(
- PassManagerBuilder::ExtensionPointTy Ty,
- PassManagerBuilder::ExtensionFn Fn) {
- GlobalExtensions->push_back(std::make_pair(Ty, std::move(Fn)));
+PassManagerBuilder::GlobalExtensionID
+PassManagerBuilder::addGlobalExtension(PassManagerBuilder::ExtensionPointTy Ty,
+ PassManagerBuilder::ExtensionFn Fn) {
+ auto ExtensionID = GlobalExtensionsCounter++;
+ GlobalExtensions->push_back(std::make_tuple(Ty, std::move(Fn), ExtensionID));
+ return ExtensionID;
+}
+
+void PassManagerBuilder::removeGlobalExtension(
+ PassManagerBuilder::GlobalExtensionID ExtensionID) {
+ // RegisterStandardPasses may try to call this function after GlobalExtensions
+ // has already been destroyed; doing so should not generate an error.
+ if (!GlobalExtensions.isConstructed())
+ return;
+
+ auto GlobalExtension =
+ llvm::find_if(*GlobalExtensions, [ExtensionID](const auto &elem) {
+ return std::get<2>(elem) == ExtensionID;
+ });
+ assert(GlobalExtension != GlobalExtensions->end() &&
+ "The extension ID to be removed should always be valid.");
+
+ GlobalExtensions->erase(GlobalExtension);
}
void PassManagerBuilder::addExtension(ExtensionPointTy Ty, ExtensionFn Fn) {
@@ -211,8 +252,8 @@ void PassManagerBuilder::addExtensionsToPM(ExtensionPointTy ETy,
legacy::PassManagerBase &PM) const {
if (GlobalExtensionsNotEmpty()) {
for (auto &Ext : *GlobalExtensions) {
- if (Ext.first == ETy)
- Ext.second(*this, PM);
+ if (std::get<0>(Ext) == ETy)
+ std::get<1>(Ext)(*this, PM);
}
}
for (unsigned i = 0, e = Extensions.size(); i != e; ++i)
@@ -244,12 +285,6 @@ void PassManagerBuilder::addInitialAliasAnalysisPasses(
PM.add(createScopedNoAliasAAWrapperPass());
}
-void PassManagerBuilder::addInstructionCombiningPass(
- legacy::PassManagerBase &PM) const {
- bool ExpensiveCombines = OptLevel > 2;
- PM.add(createInstructionCombiningPass(ExpensiveCombines));
-}
-
void PassManagerBuilder::populateFunctionPassManager(
legacy::FunctionPassManager &FPM) {
addExtensionsToPM(EP_EarlyAsPossible, FPM);
@@ -327,6 +362,8 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
assert(OptLevel >= 1 && "Calling function optimizer with no optimization level!");
MPM.add(createSROAPass());
MPM.add(createEarlyCSEPass(true /* Enable mem-ssa. */)); // Catch trivial redundancies
+ if (EnableKnowledgeRetention)
+ MPM.add(createAssumeSimplifyPass());
if (OptLevel > 1) {
if (EnableGVNHoist)
@@ -348,7 +385,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// Combine silly seq's
if (OptLevel > 2)
MPM.add(createAggressiveInstCombinerPass());
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
if (SizeLevel == 0 && !DisableLibCallsShrinkWrap)
MPM.add(createLibCallsShrinkWrapPass());
addExtensionsToPM(EP_Peephole, MPM);
@@ -383,7 +420,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// simplify-cfg. Eventually loop-simplifycfg should be enhanced to replace the
// need for this.
MPM.add(createCFGSimplificationPass());
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
// We resume loop passes creating a second loop pipeline here.
MPM.add(createIndVarSimplifyPass()); // Canonicalize indvars
MPM.add(createLoopIdiomPass()); // Recognize idioms like memset.
@@ -414,7 +451,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
// Run instcombine after redundancy elimination to exploit opportunities
// opened up by them.
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, MPM);
if (OptLevel > 1) {
MPM.add(createJumpThreadingPass()); // Thread jumps
@@ -432,7 +469,7 @@ void PassManagerBuilder::addFunctionSimplificationPasses(
MPM.add(createAggressiveDCEPass()); // Delete dead instructions
MPM.add(createCFGSimplificationPass()); // Merge & remove BBs
// Clean up after everything.
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, MPM);
if (EnableCHR && OptLevel >= 3 &&
@@ -478,6 +515,7 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createBarrierNoopPass());
if (PerformThinLTO) {
+ MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
// Drop available_externally and unreferenced globals. This is necessary
// with ThinLTO in order to avoid leaving undefined references to dead
// globals in the object file.
@@ -511,9 +549,11 @@ void PassManagerBuilder::populateModulePassManager(
// inter-module indirect calls. For that we perform indirect call promotion
// earlier in the pass pipeline, here before globalopt. Otherwise imported
// available_externally functions look unreferenced and are removed.
- if (PerformThinLTO)
+ if (PerformThinLTO) {
MPM.add(createPGOIndirectCallPromotionLegacyPass(/*InLTO = */ true,
!PGOSampleUse.empty()));
+ MPM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
+ }
// For SamplePGO in ThinLTO compile phase, we do not want to unroll loops
// as it will change the CFG too much to make the 2nd profile annotation
@@ -526,6 +566,10 @@ void PassManagerBuilder::populateModulePassManager(
// Infer attributes about declarations if possible.
MPM.add(createInferFunctionAttrsLegacyPass());
+ // Infer attributes on declarations, call sites, arguments, etc.
+ if (AttributorRun & AttributorRunOption::MODULE)
+ MPM.add(createAttributorLegacyPass());
+
addExtensionsToPM(EP_ModuleOptimizerEarly, MPM);
if (OptLevel > 2)
@@ -534,16 +578,13 @@ void PassManagerBuilder::populateModulePassManager(
MPM.add(createIPSCCPPass()); // IP SCCP
MPM.add(createCalledValuePropagationPass());
- // Infer attributes on declarations, call sites, arguments, etc.
- MPM.add(createAttributorLegacyPass());
-
MPM.add(createGlobalOptimizerPass()); // Optimize out global vars
// Promote any localized global vars.
MPM.add(createPromoteMemoryToRegisterPass());
MPM.add(createDeadArgEliminationPass()); // Dead argument elimination
- addInstructionCombiningPass(MPM); // Clean up after IPCP & DAE
+ MPM.add(createInstructionCombiningPass()); // Clean up after IPCP & DAE
addExtensionsToPM(EP_Peephole, MPM);
MPM.add(createCFGSimplificationPass()); // Clean up after IPCP & DAE
@@ -574,6 +615,15 @@ void PassManagerBuilder::populateModulePassManager(
RunInliner = true;
}
+ // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ MPM.add(createAttributorCGSCCLegacyPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (OptLevel > 1)
+ MPM.add(createOpenMPOptLegacyPass());
+
MPM.add(createPostOrderFunctionAttrsLegacyPass());
if (OptLevel > 2)
MPM.add(createArgumentPromotionPass()); // Scalarize uninlined fn args
@@ -705,7 +755,7 @@ void PassManagerBuilder::populateModulePassManager(
// on -O1 and no #pragma is found). Would be good to have these two passes
// as function calls, so that we can only pass them when the vectorizer
// changed the code.
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
if (OptLevel > 1 && ExtraVectorizerPasses) {
// At higher optimization levels, try to clean up any runtime overlap and
 // alignment checks inserted by the vectorizer. We want to track correlated
@@ -715,11 +765,11 @@ void PassManagerBuilder::populateModulePassManager(
// dead (or speculatable) control flows or more combining opportunities.
MPM.add(createEarlyCSEPass());
MPM.add(createCorrelatedValuePropagationPass());
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
MPM.add(createLICMPass(LicmMssaOptCap, LicmMssaNoAccForPromotionCap));
MPM.add(createLoopUnswitchPass(SizeLevel || OptLevel < 3, DivergentTarget));
MPM.add(createCFGSimplificationPass());
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
}
// Cleanup after loop vectorization, etc. Simplification passes like CVP and
@@ -736,8 +786,11 @@ void PassManagerBuilder::populateModulePassManager(
}
}
+ // Enhance/cleanup vector code.
+ MPM.add(createVectorCombinePass());
+
addExtensionsToPM(EP_Peephole, MPM);
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
if (EnableUnrollAndJam && !DisableUnrollLoops) {
// Unroll and Jam. We do this before unroll but need to be in a separate
@@ -752,7 +805,7 @@ void PassManagerBuilder::populateModulePassManager(
if (!DisableUnrollLoops) {
 // LoopUnroll may generate some redundancy to clean up.
- addInstructionCombiningPass(MPM);
+ MPM.add(createInstructionCombiningPass());
// Runtime unrolling will introduce runtime check in loop prologue. If the
// unrolled loop is a inner loop, then the prologue will be inside the
@@ -785,6 +838,10 @@ void PassManagerBuilder::populateModulePassManager(
if (MergeFunctions)
MPM.add(createMergeFunctionsPass());
+ // Add Module flag "CG Profile" based on Branch Frequency Information.
+ if (CallGraphProfile)
+ MPM.add(createCGProfileLegacyPass());
+
// LoopSink pass sinks instructions hoisted by LICM, which serves as a
// canonicalization pass that enables other optimizations. As a result,
// LoopSink pass needs to be a very late IR pass to avoid undoing LICM
@@ -852,7 +909,8 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createCalledValuePropagationPass());
// Infer attributes on declarations, call sites, arguments, etc.
- PM.add(createAttributorLegacyPass());
+ if (AttributorRun & AttributorRunOption::MODULE)
+ PM.add(createAttributorLegacyPass());
}
// Infer attributes about definitions. The readnone attribute in particular is
@@ -890,7 +948,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// calls, etc, so let instcombine do this.
if (OptLevel > 2)
PM.add(createAggressiveInstCombinerPass());
- addInstructionCombiningPass(PM);
+ PM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, PM);
// Inline small functions
@@ -905,6 +963,15 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// CSFDO instrumentation and use pass.
addPGOInstrPasses(PM, /* IsCS */ true);
+ // Infer attributes on declarations, call sites, arguments, etc. for an SCC.
+ if (AttributorRun & AttributorRunOption::CGSCC)
+ PM.add(createAttributorCGSCCLegacyPass());
+
+ // Try to perform OpenMP specific optimizations. This is a (quick!) no-op if
+ // there are no OpenMP runtime calls present in the module.
+ if (OptLevel > 1)
+ PM.add(createOpenMPOptLegacyPass());
+
// Optimize globals again if we ran the inliner.
if (RunInliner)
PM.add(createGlobalOptimizerPass());
@@ -915,7 +982,7 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
PM.add(createArgumentPromotionPass());
// The IPO passes may leave cruft around. Clean up after them.
- addInstructionCombiningPass(PM);
+ PM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass());
@@ -960,22 +1027,24 @@ void PassManagerBuilder::addLTOOptimizationPasses(legacy::PassManagerBase &PM) {
// Now that we've optimized loops (in particular loop induction variables),
// we may have exposed more scalar opportunities. Run parts of the scalar
// optimizer again at this point.
- addInstructionCombiningPass(PM); // Initial cleanup
+ PM.add(createInstructionCombiningPass()); // Initial cleanup
PM.add(createCFGSimplificationPass()); // if-convert
PM.add(createSCCPPass()); // Propagate exposed constants
- addInstructionCombiningPass(PM); // Clean up again
+ PM.add(createInstructionCombiningPass()); // Clean up again
PM.add(createBitTrackingDCEPass());
// More scalar chains could be vectorized due to more alias information
if (SLPVectorize)
PM.add(createSLPVectorizerPass()); // Vectorize parallel scalar chains.
+ PM.add(createVectorCombinePass()); // Clean up partial vectorization.
+
// After vectorization, assume intrinsics may tell us more about pointer
// alignments.
PM.add(createAlignmentFromAssumptionsPass());
// Cleanup and simplify the code after the scalar optimizations.
- addInstructionCombiningPass(PM);
+ PM.add(createInstructionCombiningPass());
addExtensionsToPM(EP_Peephole, PM);
PM.add(createJumpThreadingPass());
@@ -1013,8 +1082,8 @@ void PassManagerBuilder::populateThinLTOPassManager(
PM.add(createVerifierPass());
if (ImportSummary) {
- // These passes import type identifier resolutions for whole-program
- // devirtualization and CFI. They must run early because other passes may
+ // This pass imports type identifier resolutions for whole-program
+ // devirtualization and CFI. It must run early because other passes may
// disturb the specific instruction patterns that these passes look for,
// creating dependencies on resolutions that may not appear in the summary.
//
@@ -1062,6 +1131,9 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
// control flow integrity mechanisms (-fsanitize=cfi*) and needs to run at
// link time if CFI is enabled. The pass does nothing if CFI is disabled.
PM.add(createLowerTypeTestsPass(ExportSummary, nullptr));
+ // Run a second time to clean up any type tests left behind by WPD for use
+ // in ICP (which is performed earlier than this in the regular LTO pipeline).
+ PM.add(createLowerTypeTestsPass(nullptr, nullptr, true));
if (OptLevel != 0)
addLateLTOOptimizationPasses(PM);
@@ -1072,14 +1144,6 @@ void PassManagerBuilder::populateLTOPassManager(legacy::PassManagerBase &PM) {
PM.add(createVerifierPass());
}
-inline PassManagerBuilder *unwrap(LLVMPassManagerBuilderRef P) {
- return reinterpret_cast<PassManagerBuilder*>(P);
-}
-
-inline LLVMPassManagerBuilderRef wrap(PassManagerBuilder *P) {
- return reinterpret_cast<LLVMPassManagerBuilderRef>(P);
-}
-
LLVMPassManagerBuilderRef LLVMPassManagerBuilderCreate() {
PassManagerBuilder *PMB = new PassManagerBuilder();
return wrap(PMB);
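
(Aside, for context only: a minimal sketch of how the legacy PassManagerBuilder pipeline touched above is typically driven. The function name and OptLevel/SizeLevel values are illustrative assumptions; PassManagerBuilder, populateModulePassManager and the headers are the in-tree API. Once OptLevel and the builder flags are set, the newly added passes here, OpenMPOpt at OptLevel > 1, VectorCombine, and CGProfile under CallGraphProfile, are scheduled internally.)

    #include "llvm/IR/LegacyPassManager.h"
    #include "llvm/Transforms/IPO/PassManagerBuilder.h"

    using namespace llvm;

    // Build an -O2-style module pipeline; populateModulePassManager() decides
    // which of the passes added in this patch actually get scheduled.
    static void buildExamplePipeline(legacy::PassManager &MPM) {
      PassManagerBuilder PMB;
      PMB.OptLevel = 2;  // example value; the real one comes from the driver
      PMB.SizeLevel = 0;
      PMB.populateModulePassManager(MPM);
    }
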
diff --git a/llvm/lib/Transforms/IPO/PruneEH.cpp b/llvm/lib/Transforms/IPO/PruneEH.cpp
index 45a0ce20eb17..a16dc664db64 100644
--- a/llvm/lib/Transforms/IPO/PruneEH.cpp
+++ b/llvm/lib/Transforms/IPO/PruneEH.cpp
@@ -135,8 +135,8 @@ static bool runImpl(CallGraphSCC &SCC, CallGraph &CG) {
SCCMightUnwind |= InstMightUnwind;
}
if (CheckReturnViaAsm && !SCCMightReturn)
- if (auto ICS = ImmutableCallSite(&I))
- if (const auto *IA = dyn_cast<InlineAsm>(ICS.getCalledValue()))
+ if (const auto *CB = dyn_cast<CallBase>(&I))
+ if (const auto *IA = dyn_cast<InlineAsm>(CB->getCalledOperand()))
if (IA->hasSideEffects())
SCCMightReturn = true;
}
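
(Aside: the PruneEH change above is one instance of the CallSite-to-CallBase migration that recurs throughout this diff. A minimal sketch of the pattern under the in-tree CallBase API; the helper name is made up for illustration.)

    #include "llvm/IR/Function.h"
    #include "llvm/IR/InstrTypes.h"    // CallBase
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // CallBase is the common base of CallInst/InvokeInst/CallBrInst, so a single
    // dyn_cast replaces the old isa<CallInst> || isa<InvokeInst> checks, and
    // getCalledOperand() replaces CallSite::getCalledValue().
    static const Function *getDirectCallee(const Instruction &I) {
      if (const auto *CB = dyn_cast<CallBase>(&I))
        if (!CB->isIndirectCall())
          return CB->getCalledFunction();  // may still be null, e.g. inline asm
      return nullptr;
    }
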
diff --git a/llvm/lib/Transforms/IPO/SampleProfile.cpp b/llvm/lib/Transforms/IPO/SampleProfile.cpp
index a1fbb1adc412..b6871e260532 100644
--- a/llvm/lib/Transforms/IPO/SampleProfile.cpp
+++ b/llvm/lib/Transforms/IPO/SampleProfile.cpp
@@ -37,15 +37,16 @@
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/CallGraphSCCPass.h"
+#include "llvm/Analysis/InlineAdvisor.h"
#include "llvm/Analysis/InlineCost.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DebugLoc.h"
#include "llvm/IR/DiagnosticInfo.h"
@@ -148,14 +149,17 @@ static cl::opt<bool> ProfileAccurateForSymsInList(
"be accurate. It may be overriden by profile-sample-accurate. "));
static cl::opt<bool> ProfileMergeInlinee(
- "sample-profile-merge-inlinee", cl::Hidden, cl::init(false),
+ "sample-profile-merge-inlinee", cl::Hidden, cl::init(true),
cl::desc("Merge past inlinee's profile to outline version if sample "
- "profile loader decided not to inline a call site."));
+ "profile loader decided not to inline a call site. It will "
+ "only be enabled when top-down order of profile loading is "
+ "enabled. "));
static cl::opt<bool> ProfileTopDownLoad(
- "sample-profile-top-down-load", cl::Hidden, cl::init(false),
+ "sample-profile-top-down-load", cl::Hidden, cl::init(true),
cl::desc("Do profile annotation and inlining for functions in top-down "
- "order of call graph during sample profile loading."));
+ "order of call graph during sample profile loading. It only "
+ "works for new pass manager. "));
static cl::opt<bool> ProfileSizeInline(
"sample-profile-inline-size", cl::Hidden, cl::init(false),
@@ -235,7 +239,7 @@ public:
DenseMap<uint64_t, StringRef> &GUIDToFuncNameMap)
: CurrentReader(Reader), CurrentModule(M),
CurrentGUIDToFuncNameMap(GUIDToFuncNameMap) {
- if (CurrentReader.getFormat() != SPF_Compact_Binary)
+ if (!CurrentReader.useMD5())
return;
for (const auto &F : CurrentModule) {
@@ -261,7 +265,7 @@ public:
}
~GUIDToFuncNameMapper() {
- if (CurrentReader.getFormat() != SPF_Compact_Binary)
+ if (!CurrentReader.useMD5())
return;
CurrentGUIDToFuncNameMap.clear();
@@ -307,10 +311,12 @@ public:
SampleProfileLoader(
StringRef Name, StringRef RemapName, bool IsThinLTOPreLink,
std::function<AssumptionCache &(Function &)> GetAssumptionCache,
- std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo)
+ std::function<TargetTransformInfo &(Function &)> GetTargetTransformInfo,
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI)
: GetAC(std::move(GetAssumptionCache)),
- GetTTI(std::move(GetTargetTransformInfo)), CoverageTracker(*this),
- Filename(Name), RemappingFilename(RemapName),
+ GetTTI(std::move(GetTargetTransformInfo)), GetTLI(std::move(GetTLI)),
+ CoverageTracker(*this), Filename(std::string(Name)),
+ RemappingFilename(std::string(RemapName)),
IsThinLTOPreLink(IsThinLTOPreLink) {}
bool doInitialization(Module &M);
@@ -327,18 +333,19 @@ protected:
bool emitAnnotations(Function &F);
ErrorOr<uint64_t> getInstWeight(const Instruction &I);
ErrorOr<uint64_t> getBlockWeight(const BasicBlock *BB);
- const FunctionSamples *findCalleeFunctionSamples(const Instruction &I) const;
+ const FunctionSamples *findCalleeFunctionSamples(const CallBase &I) const;
std::vector<const FunctionSamples *>
findIndirectCallFunctionSamples(const Instruction &I, uint64_t &Sum) const;
mutable DenseMap<const DILocation *, const FunctionSamples *> DILocation2SampleMap;
const FunctionSamples *findFunctionSamples(const Instruction &I) const;
- bool inlineCallInstruction(Instruction *I);
+ bool inlineCallInstruction(CallBase &CB);
bool inlineHotFunctions(Function &F,
DenseSet<GlobalValue::GUID> &InlinedGUIDs);
// Inline cold/small functions in addition to hot ones
- bool shouldInlineColdCallee(Instruction &CallInst);
+ bool shouldInlineColdCallee(CallBase &CallInst);
void emitOptimizationRemarksForInlineCandidates(
- const SmallVector<Instruction *, 10> &Candidates, const Function &F, bool Hot);
+ const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
+ bool Hot);
void printEdgeWeight(raw_ostream &OS, Edge E);
void printBlockWeight(raw_ostream &OS, const BasicBlock *BB) const;
void printBlockEquivalence(raw_ostream &OS, const BasicBlock *BB);
@@ -397,6 +404,7 @@ protected:
std::function<AssumptionCache &(Function &)> GetAC;
std::function<TargetTransformInfo &(Function &)> GetTTI;
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI;
/// Predecessors for each basic block in the CFG.
BlockEdgeMap Predecessors;
@@ -474,14 +482,17 @@ public:
SampleProfileLoaderLegacyPass(StringRef Name = SampleProfileFile,
bool IsThinLTOPreLink = false)
- : ModulePass(ID),
- SampleLoader(Name, SampleProfileRemappingFile, IsThinLTOPreLink,
- [&](Function &F) -> AssumptionCache & {
- return ACT->getAssumptionCache(F);
- },
- [&](Function &F) -> TargetTransformInfo & {
- return TTIWP->getTTI(F);
- }) {
+ : ModulePass(ID), SampleLoader(
+ Name, SampleProfileRemappingFile, IsThinLTOPreLink,
+ [&](Function &F) -> AssumptionCache & {
+ return ACT->getAssumptionCache(F);
+ },
+ [&](Function &F) -> TargetTransformInfo & {
+ return TTIWP->getTTI(F);
+ },
+ [&](Function &F) -> TargetLibraryInfo & {
+ return TLIWP->getTLI(F);
+ }) {
initializeSampleProfileLoaderLegacyPassPass(
*PassRegistry::getPassRegistry());
}
@@ -498,6 +509,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<ProfileSummaryInfoWrapperPass>();
}
@@ -505,6 +517,7 @@ private:
SampleProfileLoader SampleLoader;
AssumptionCacheTracker *ACT = nullptr;
TargetTransformInfoWrapperPass *TTIWP = nullptr;
+ TargetLibraryInfoWrapperPass *TLIWP = nullptr;
};
} // end anonymous namespace
@@ -710,10 +723,9 @@ ErrorOr<uint64_t> SampleProfileLoader::getInstWeight(const Instruction &Inst) {
// (findCalleeFunctionSamples returns non-empty result), but not inlined here,
// it means that the inlined callsite has no sample, thus the call
// instruction should have 0 count.
- if ((isa<CallInst>(Inst) || isa<InvokeInst>(Inst)) &&
- !ImmutableCallSite(&Inst).isIndirectCall() &&
- findCalleeFunctionSamples(Inst))
- return 0;
+ if (auto *CB = dyn_cast<CallBase>(&Inst))
+ if (!CB->isIndirectCall() && findCalleeFunctionSamples(*CB))
+ return 0;
const DILocation *DIL = DLoc;
uint32_t LineOffset = FunctionSamples::getOffset(DIL);
@@ -801,7 +813,7 @@ bool SampleProfileLoader::computeBlockWeights(Function &F) {
///
/// \returns The FunctionSamples pointer to the inlined instance.
const FunctionSamples *
-SampleProfileLoader::findCalleeFunctionSamples(const Instruction &Inst) const {
+SampleProfileLoader::findCalleeFunctionSamples(const CallBase &Inst) const {
const DILocation *DIL = Inst.getDebugLoc();
if (!DIL) {
return nullptr;
@@ -885,13 +897,11 @@ SampleProfileLoader::findFunctionSamples(const Instruction &Inst) const {
return it.first->second;
}
-bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
- assert(isa<CallInst>(I) || isa<InvokeInst>(I));
- CallSite CS(I);
- Function *CalledFunction = CS.getCalledFunction();
+bool SampleProfileLoader::inlineCallInstruction(CallBase &CB) {
+ Function *CalledFunction = CB.getCalledFunction();
assert(CalledFunction);
- DebugLoc DLoc = I->getDebugLoc();
- BasicBlock *BB = I->getParent();
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *BB = CB.getParent();
InlineParams Params = getInlineParams();
Params.ComputeFullInlineCost = true;
// Checks if there is anything in the reachable portion of the callee at
@@ -901,46 +911,43 @@ bool SampleProfileLoader::inlineCallInstruction(Instruction *I) {
// The actual cost does not matter because we only check isNever() to
// see if it is legal to inline the callsite.
InlineCost Cost =
- getInlineCost(cast<CallBase>(*I), Params, GetTTI(*CalledFunction), GetAC,
- None, nullptr, nullptr);
+ getInlineCost(CB, Params, GetTTI(*CalledFunction), GetAC, GetTLI);
if (Cost.isNever()) {
ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineFail", DLoc, BB)
<< "incompatible inlining");
return false;
}
- InlineFunctionInfo IFI(nullptr, &GetAC);
- if (InlineFunction(CS, IFI)) {
+ InlineFunctionInfo IFI(nullptr, GetAC);
+ if (InlineFunction(CB, IFI).isSuccess()) {
// The call to InlineFunction erases I, so we can't pass it here.
- ORE->emit(OptimizationRemark(CSINLINE_DEBUG, "InlineSuccess", DLoc, BB)
- << "inlined callee '" << ore::NV("Callee", CalledFunction)
- << "' into '" << ore::NV("Caller", BB->getParent()) << "'");
+ emitInlinedInto(*ORE, DLoc, BB, *CalledFunction, *BB->getParent(), Cost,
+ true, CSINLINE_DEBUG);
return true;
}
return false;
}
-bool SampleProfileLoader::shouldInlineColdCallee(Instruction &CallInst) {
+bool SampleProfileLoader::shouldInlineColdCallee(CallBase &CallInst) {
if (!ProfileSizeInline)
return false;
- Function *Callee = CallSite(&CallInst).getCalledFunction();
+ Function *Callee = CallInst.getCalledFunction();
if (Callee == nullptr)
return false;
- InlineCost Cost =
- getInlineCost(cast<CallBase>(CallInst), getInlineParams(),
- GetTTI(*Callee), GetAC, None, nullptr, nullptr);
+ InlineCost Cost = getInlineCost(CallInst, getInlineParams(), GetTTI(*Callee),
+ GetAC, GetTLI);
return Cost.getCost() <= SampleColdCallSiteThreshold;
}
void SampleProfileLoader::emitOptimizationRemarksForInlineCandidates(
- const SmallVector<Instruction *, 10> &Candidates, const Function &F,
+ const SmallVectorImpl<CallBase *> &Candidates, const Function &F,
bool Hot) {
for (auto I : Candidates) {
- Function *CalledFunction = CallSite(I).getCalledFunction();
+ Function *CalledFunction = I->getCalledFunction();
if (CalledFunction) {
- ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
+ ORE->emit(OptimizationRemarkAnalysis(CSINLINE_DEBUG, "InlineAttempt",
I->getDebugLoc(), I->getParent())
<< "previous inlining reattempted for "
<< (Hot ? "hotness: '" : "size: '")
@@ -975,43 +982,43 @@ bool SampleProfileLoader::inlineHotFunctions(
"ProfAccForSymsInList should be false when profile-sample-accurate "
"is enabled");
- DenseMap<Instruction *, const FunctionSamples *> localNotInlinedCallSites;
+ DenseMap<CallBase *, const FunctionSamples *> localNotInlinedCallSites;
bool Changed = false;
while (true) {
bool LocalChanged = false;
- SmallVector<Instruction *, 10> CIS;
+ SmallVector<CallBase *, 10> CIS;
for (auto &BB : F) {
bool Hot = false;
- SmallVector<Instruction *, 10> AllCandidates;
- SmallVector<Instruction *, 10> ColdCandidates;
+ SmallVector<CallBase *, 10> AllCandidates;
+ SmallVector<CallBase *, 10> ColdCandidates;
for (auto &I : BB.getInstList()) {
const FunctionSamples *FS = nullptr;
- if ((isa<CallInst>(I) || isa<InvokeInst>(I)) &&
- !isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(I))) {
- AllCandidates.push_back(&I);
- if (FS->getEntrySamples() > 0)
- localNotInlinedCallSites.try_emplace(&I, FS);
- if (callsiteIsHot(FS, PSI))
- Hot = true;
- else if (shouldInlineColdCallee(I))
- ColdCandidates.push_back(&I);
+ if (auto *CB = dyn_cast<CallBase>(&I)) {
+ if (!isa<IntrinsicInst>(I) && (FS = findCalleeFunctionSamples(*CB))) {
+ AllCandidates.push_back(CB);
+ if (FS->getEntrySamples() > 0)
+ localNotInlinedCallSites.try_emplace(CB, FS);
+ if (callsiteIsHot(FS, PSI))
+ Hot = true;
+ else if (shouldInlineColdCallee(*CB))
+ ColdCandidates.push_back(CB);
+ }
}
}
if (Hot) {
CIS.insert(CIS.begin(), AllCandidates.begin(), AllCandidates.end());
emitOptimizationRemarksForInlineCandidates(AllCandidates, F, true);
- }
- else {
+ } else {
CIS.insert(CIS.begin(), ColdCandidates.begin(), ColdCandidates.end());
emitOptimizationRemarksForInlineCandidates(ColdCandidates, F, false);
}
}
- for (auto I : CIS) {
- Function *CalledFunction = CallSite(I).getCalledFunction();
+ for (CallBase *I : CIS) {
+ Function *CalledFunction = I->getCalledFunction();
// Do not inline recursive calls.
if (CalledFunction == &F)
continue;
- if (CallSite(I).isIndirectCall()) {
+ if (I->isIndirectCall()) {
if (PromotedInsns.count(I))
continue;
uint64_t Sum;
@@ -1021,7 +1028,7 @@ bool SampleProfileLoader::inlineHotFunctions(
PSI->getOrCompHotCountThreshold());
continue;
}
- auto CalleeFunctionName = FS->getFuncNameInModule(F.getParent());
+ auto CalleeFunctionName = FS->getFuncName();
// If it is a recursive call, we do not inline it as it could bloat
// the code exponentially. There is a way to better handle this, e.g.
// clone the caller first, and inline the cloned caller if it is
@@ -1038,15 +1045,16 @@ bool SampleProfileLoader::inlineHotFunctions(
if (R != SymbolMap.end() && R->getValue() &&
!R->getValue()->isDeclaration() &&
R->getValue()->getSubprogram() &&
- isLegalToPromote(CallSite(I), R->getValue(), &Reason)) {
+ R->getValue()->hasFnAttribute("use-sample-profile") &&
+ isLegalToPromote(*I, R->getValue(), &Reason)) {
uint64_t C = FS->getEntrySamples();
- Instruction *DI =
- pgo::promoteIndirectCall(I, R->getValue(), C, Sum, false, ORE);
+ auto &DI =
+ pgo::promoteIndirectCall(*I, R->getValue(), C, Sum, false, ORE);
Sum -= C;
PromotedInsns.insert(I);
// If profile mismatches, we should not attempt to inline DI.
if ((isa<CallInst>(DI) || isa<InvokeInst>(DI)) &&
- inlineCallInstruction(DI)) {
+ inlineCallInstruction(cast<CallBase>(DI))) {
localNotInlinedCallSites.erase(I);
LocalChanged = true;
++NumCSInlined;
@@ -1059,7 +1067,7 @@ bool SampleProfileLoader::inlineHotFunctions(
}
} else if (CalledFunction && CalledFunction->getSubprogram() &&
!CalledFunction->isDeclaration()) {
- if (inlineCallInstruction(I)) {
+ if (inlineCallInstruction(*I)) {
localNotInlinedCallSites.erase(I);
LocalChanged = true;
++NumCSInlined;
@@ -1078,8 +1086,8 @@ bool SampleProfileLoader::inlineHotFunctions(
// Accumulate not inlined callsite information into notInlinedSamples
for (const auto &Pair : localNotInlinedCallSites) {
- Instruction *I = Pair.getFirst();
- Function *Callee = CallSite(I).getCalledFunction();
+ CallBase *I = Pair.getFirst();
+ Function *Callee = I->getCalledFunction();
if (!Callee || Callee->isDeclaration())
continue;
@@ -1525,8 +1533,7 @@ void SampleProfileLoader::propagateWeights(Function &F) {
for (auto &I : BB->getInstList()) {
if (!isa<CallInst>(I) && !isa<InvokeInst>(I))
continue;
- CallSite CS(&I);
- if (!CS.getCalledFunction()) {
+ if (!cast<CallBase>(I).getCalledFunction()) {
const DebugLoc &DLoc = I.getDebugLoc();
if (!DLoc)
continue;
@@ -1770,6 +1777,7 @@ INITIALIZE_PASS_BEGIN(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
INITIALIZE_PASS_END(SampleProfileLoaderLegacyPass, "sample-profile",
"Sample Profile loader", false, false)
@@ -1780,8 +1788,17 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
FunctionOrderList.reserve(M.size());
if (!ProfileTopDownLoad || CG == nullptr) {
+ if (ProfileMergeInlinee) {
+ // Disable ProfileMergeInlinee if the profile is not loaded in top-down
+ // order, because the profile for a function may be used for the profile
+ // annotation of its outline copy before the profiles of its non-inlined
+ // inline instances are merged into it, and that is not how
+ // ProfileMergeInlinee is supposed to work.
+ ProfileMergeInlinee = false;
+ }
+
for (Function &F : M)
- if (!F.isDeclaration())
+ if (!F.isDeclaration() && F.hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(&F);
return FunctionOrderList;
}
@@ -1791,7 +1808,7 @@ SampleProfileLoader::buildFunctionOrder(Module &M, CallGraph *CG) {
while (!CGI.isAtEnd()) {
for (CallGraphNode *node : *CGI) {
auto F = node->getFunction();
- if (F && !F->isDeclaration())
+ if (F && !F->isDeclaration() && F->hasFnAttribute("use-sample-profile"))
FunctionOrderList.push_back(F);
}
++CGI;
@@ -1839,15 +1856,16 @@ ModulePass *llvm::createSampleProfileLoaderPass(StringRef Name) {
bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
ProfileSummaryInfo *_PSI, CallGraph *CG) {
- GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
if (!ProfileIsValid)
return false;
+ GUIDToFuncNameMapper Mapper(M, *Reader, GUIDToFuncNameMap);
PSI = _PSI;
- if (M.getProfileSummary(/* IsCS */ false) == nullptr)
+ if (M.getProfileSummary(/* IsCS */ false) == nullptr) {
M.setProfileSummary(Reader->getSummary().getMD(M.getContext()),
ProfileSummary::PSK_Sample);
-
+ PSI->refresh();
+ }
// Compute the total number of samples collected in this profile.
for (const auto &I : Reader->getProfiles())
TotalCollectedSamples += I.second.getTotalSamples();
@@ -1890,6 +1908,7 @@ bool SampleProfileLoader::runOnModule(Module &M, ModuleAnalysisManager *AM,
bool SampleProfileLoaderLegacyPass::runOnModule(Module &M) {
ACT = &getAnalysis<AssumptionCacheTracker>();
TTIWP = &getAnalysis<TargetTransformInfoWrapperPass>();
+ TLIWP = &getAnalysis<TargetLibraryInfoWrapperPass>();
ProfileSummaryInfo *PSI =
&getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
return SampleLoader.runOnModule(M, nullptr, PSI, nullptr);
@@ -1966,12 +1985,15 @@ PreservedAnalyses SampleProfileLoaderPass::run(Module &M,
auto GetTTI = [&](Function &F) -> TargetTransformInfo & {
return FAM.getResult<TargetIRAnalysis>(F);
};
+ auto GetTLI = [&](Function &F) -> const TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
SampleProfileLoader SampleLoader(
ProfileFileName.empty() ? SampleProfileFile : ProfileFileName,
ProfileRemappingFileName.empty() ? SampleProfileRemappingFile
: ProfileRemappingFileName,
- IsThinLTOPreLink, GetAssumptionCache, GetTTI);
+ IsThinLTOPreLink, GetAssumptionCache, GetTTI, GetTLI);
if (!SampleLoader.doInitialization(M))
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/IPO/StripSymbols.cpp b/llvm/lib/Transforms/IPO/StripSymbols.cpp
index 6ce00714523b..088091df770f 100644
--- a/llvm/lib/Transforms/IPO/StripSymbols.cpp
+++ b/llvm/lib/Transforms/IPO/StripSymbols.cpp
@@ -147,10 +147,12 @@ static void RemoveDeadConstant(Constant *C) {
if (GlobalVariable *GV = dyn_cast<GlobalVariable>(C)) {
if (!GV->hasLocalLinkage()) return; // Don't delete non-static globals.
GV->eraseFromParent();
- }
- else if (!isa<Function>(C))
- if (isa<CompositeType>(C->getType()))
+ } else if (!isa<Function>(C)) {
+ // FIXME: Why does the type of the constant matter here?
+ if (isa<StructType>(C->getType()) || isa<ArrayType>(C->getType()) ||
+ isa<VectorType>(C->getType()))
C->destroyConstant();
+ }
// If the constant referenced anything, see if we can delete it as well.
for (Constant *O : Operands)
diff --git a/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp b/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
index 45fd432fd721..1b1e91cafa65 100644
--- a/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
+++ b/llvm/lib/Transforms/IPO/SyntheticCountsPropagation.cpp
@@ -31,7 +31,6 @@
#include "llvm/Analysis/CallGraph.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/SyntheticCountsUtils.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/Module.h"
@@ -110,14 +109,13 @@ PreservedAnalyses SyntheticCountsPropagation::run(Module &M,
Optional<Scaled64> Res = None;
if (!Edge.first)
return Res;
- assert(isa<Instruction>(Edge.first));
- CallSite CS(cast<Instruction>(Edge.first));
- Function *Caller = CS.getCaller();
+ CallBase &CB = *cast<CallBase>(*Edge.first);
+ Function *Caller = CB.getCaller();
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(*Caller);
// Now compute the callsite count from relative frequency and
// entry count:
- BasicBlock *CSBB = CS.getInstruction()->getParent();
+ BasicBlock *CSBB = CB.getParent();
Scaled64 EntryFreq(BFI.getEntryFreq(), 0);
Scaled64 BBCount(BFI.getBlockFreq(CSBB).getFrequency(), 0);
BBCount /= EntryFreq;
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 5ccfb29b01a1..5a25f9857665 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -57,12 +57,14 @@
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TypeMetadataUtils.h"
-#include "llvm/IR/CallSite.h"
+#include "llvm/Bitcode/BitcodeReader.h"
+#include "llvm/Bitcode/BitcodeWriter.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugLoc.h"
@@ -83,11 +85,12 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Errc.h"
#include "llvm/Support/Error.h"
#include "llvm/Support/FileSystem.h"
+#include "llvm/Support/GlobPattern.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/IPO.h"
#include "llvm/Transforms/IPO/FunctionAttrs.h"
@@ -115,12 +118,15 @@ static cl::opt<PassSummaryAction> ClSummaryAction(
static cl::opt<std::string> ClReadSummary(
"wholeprogramdevirt-read-summary",
- cl::desc("Read summary from given YAML file before running pass"),
+ cl::desc(
+ "Read summary from given bitcode or YAML file before running pass"),
cl::Hidden);
static cl::opt<std::string> ClWriteSummary(
"wholeprogramdevirt-write-summary",
- cl::desc("Write summary to given YAML file after running pass"),
+ cl::desc("Write summary to given bitcode or YAML file after running pass. "
+ "Output file format is deduced from extension: *.bc means writing "
+ "bitcode, otherwise YAML"),
cl::Hidden);
static cl::opt<unsigned>
@@ -134,6 +140,45 @@ static cl::opt<bool>
cl::init(false), cl::ZeroOrMore,
cl::desc("Print index-based devirtualization messages"));
+/// Provide a way to force enable whole program visibility in tests.
+/// This is needed to support legacy tests that don't contain
+/// !vcall_visibility metadata (the mere presence of type tests
+/// previously implied hidden visibility).
+cl::opt<bool>
+ WholeProgramVisibility("whole-program-visibility", cl::init(false),
+ cl::Hidden, cl::ZeroOrMore,
+ cl::desc("Enable whole program visibility"));
+
+/// Provide a way to force disable whole program visibility for debugging or
+/// workarounds, when it is enabled via the linker.
+cl::opt<bool> DisableWholeProgramVisibility(
+ "disable-whole-program-visibility", cl::init(false), cl::Hidden,
+ cl::ZeroOrMore,
+ cl::desc("Disable whole program visibility (overrides enabling options)"));
+
+/// Provide a way to prevent certain functions from being devirtualized.
+cl::list<std::string>
+ SkipFunctionNames("wholeprogramdevirt-skip",
+ cl::desc("Prevent function(s) from being devirtualized"),
+ cl::Hidden, cl::ZeroOrMore, cl::CommaSeparated);
+
+namespace {
+struct PatternList {
+ std::vector<GlobPattern> Patterns;
+ template <class T> void init(const T &StringList) {
+ for (const auto &S : StringList)
+ if (Expected<GlobPattern> Pat = GlobPattern::create(S))
+ Patterns.push_back(std::move(*Pat));
+ }
+ bool match(StringRef S) {
+ for (const GlobPattern &P : Patterns)
+ if (P.match(S))
+ return true;
+ return false;
+ }
+};
+} // namespace
+
// Find the minimum offset that we may store a value of size Size bits at. If
// IsAfter is set, look for an offset before the object, otherwise look for an
// offset after the object.
@@ -308,20 +353,20 @@ namespace {
// A virtual call site. VTable is the loaded virtual table pointer, and CS is
// the indirect virtual call.
struct VirtualCallSite {
- Value *VTable;
- CallSite CS;
+ Value *VTable = nullptr;
+ CallBase &CB;
// If non-null, this field points to the associated unsafe use count stored in
// the DevirtModule::NumUnsafeUsesForTypeTest map below. See the description
// of that field for details.
- unsigned *NumUnsafeUses;
+ unsigned *NumUnsafeUses = nullptr;
void
emitRemark(const StringRef OptName, const StringRef TargetName,
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter) {
- Function *F = CS.getCaller();
- DebugLoc DLoc = CS->getDebugLoc();
- BasicBlock *Block = CS.getParent();
+ Function *F = CB.getCaller();
+ DebugLoc DLoc = CB.getDebugLoc();
+ BasicBlock *Block = CB.getParent();
using namespace ore;
OREGetter(F).emit(OptimizationRemark(DEBUG_TYPE, OptName, DLoc, Block)
@@ -336,12 +381,12 @@ struct VirtualCallSite {
Value *New) {
if (RemarksEnabled)
emitRemark(OptName, TargetName, OREGetter);
- CS->replaceAllUsesWith(New);
- if (auto II = dyn_cast<InvokeInst>(CS.getInstruction())) {
- BranchInst::Create(II->getNormalDest(), CS.getInstruction());
+ CB.replaceAllUsesWith(New);
+ if (auto *II = dyn_cast<InvokeInst>(&CB)) {
+ BranchInst::Create(II->getNormalDest(), &CB);
II->getUnwindDest()->removePredecessor(II->getParent());
}
- CS->eraseFromParent();
+ CB.eraseFromParent();
// This use is no longer unsafe.
if (NumUnsafeUses)
--*NumUnsafeUses;
@@ -414,18 +459,18 @@ struct VTableSlotInfo {
// "this"), grouped by argument list.
std::map<std::vector<uint64_t>, CallSiteInfo> ConstCSInfo;
- void addCallSite(Value *VTable, CallSite CS, unsigned *NumUnsafeUses);
+ void addCallSite(Value *VTable, CallBase &CB, unsigned *NumUnsafeUses);
private:
- CallSiteInfo &findCallSiteInfo(CallSite CS);
+ CallSiteInfo &findCallSiteInfo(CallBase &CB);
};
-CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
+CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallBase &CB) {
std::vector<uint64_t> Args;
- auto *CI = dyn_cast<IntegerType>(CS.getType());
- if (!CI || CI->getBitWidth() > 64 || CS.arg_empty())
+ auto *CBType = dyn_cast<IntegerType>(CB.getType());
+ if (!CBType || CBType->getBitWidth() > 64 || CB.arg_empty())
return CSInfo;
- for (auto &&Arg : make_range(CS.arg_begin() + 1, CS.arg_end())) {
+ for (auto &&Arg : make_range(CB.arg_begin() + 1, CB.arg_end())) {
auto *CI = dyn_cast<ConstantInt>(Arg);
if (!CI || CI->getBitWidth() > 64)
return CSInfo;
@@ -434,11 +479,11 @@ CallSiteInfo &VTableSlotInfo::findCallSiteInfo(CallSite CS) {
return ConstCSInfo[Args];
}
-void VTableSlotInfo::addCallSite(Value *VTable, CallSite CS,
+void VTableSlotInfo::addCallSite(Value *VTable, CallBase &CB,
unsigned *NumUnsafeUses) {
- auto &CSI = findCallSiteInfo(CS);
+ auto &CSI = findCallSiteInfo(CB);
CSI.AllCallSitesDevirted = false;
- CSI.CallSites.push_back({VTable, CS, NumUnsafeUses});
+ CSI.CallSites.push_back({VTable, CB, NumUnsafeUses});
}
struct DevirtModule {
@@ -454,6 +499,10 @@ struct DevirtModule {
IntegerType *Int32Ty;
IntegerType *Int64Ty;
IntegerType *IntPtrTy;
+ /// Sizeless array type, used for imported vtables. This provides a signal
+ /// to analyzers that these imports may alias, as they do for example
+ /// when multiple unique return values occur in the same vtable.
+ ArrayType *Int8Arr0Ty;
bool RemarksEnabled;
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter;
@@ -469,6 +518,7 @@ struct DevirtModule {
// eliminate the type check by RAUWing the associated llvm.type.test call with
// true.
std::map<CallInst *, unsigned> NumUnsafeUsesForTypeTest;
+ PatternList FunctionsToSkip;
DevirtModule(Module &M, function_ref<AAResults &(Function &)> AARGetter,
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
@@ -482,13 +532,17 @@ struct DevirtModule {
Int32Ty(Type::getInt32Ty(M.getContext())),
Int64Ty(Type::getInt64Ty(M.getContext())),
IntPtrTy(M.getDataLayout().getIntPtrType(M.getContext(), 0)),
+ Int8Arr0Ty(ArrayType::get(Type::getInt8Ty(M.getContext()), 0)),
RemarksEnabled(areRemarksEnabled()), OREGetter(OREGetter) {
assert(!(ExportSummary && ImportSummary));
+ FunctionsToSkip.init(SkipFunctionNames);
}
bool areRemarksEnabled();
- void scanTypeTestUsers(Function *TypeTestFunc);
+ void
+ scanTypeTestUsers(Function *TypeTestFunc,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap);
void scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc);
void buildTypeIdentifierMap(
@@ -592,12 +646,16 @@ struct DevirtIndex {
MapVector<VTableSlotSummary, VTableSlotInfo> CallSlots;
+ PatternList FunctionsToSkip;
+
DevirtIndex(
ModuleSummaryIndex &ExportSummary,
std::set<GlobalValue::GUID> &ExportedGUIDs,
std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap)
: ExportSummary(ExportSummary), ExportedGUIDs(ExportedGUIDs),
- LocalWPDTargetsMap(LocalWPDTargetsMap) {}
+ LocalWPDTargetsMap(LocalWPDTargetsMap) {
+ FunctionsToSkip.init(SkipFunctionNames);
+ }
bool tryFindVirtualCallTargets(std::vector<ValueInfo> &TargetsForSlot,
const TypeIdCompatibleVtableInfo TIdInfo,
@@ -702,7 +760,49 @@ PreservedAnalyses WholeProgramDevirtPass::run(Module &M,
return PreservedAnalyses::none();
}
+// Enable whole program visibility if enabled by client (e.g. linker) or
+// internal option, and not force disabled.
+static bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
+ return (WholeProgramVisibilityEnabledInLTO || WholeProgramVisibility) &&
+ !DisableWholeProgramVisibility;
+}
+
namespace llvm {
+
+/// If whole program visibility asserted, then upgrade all public vcall
+/// visibility metadata on vtable definitions to linkage unit visibility in
+/// Module IR (for regular or hybrid LTO).
+void updateVCallVisibilityInModule(Module &M,
+ bool WholeProgramVisibilityEnabledInLTO) {
+ if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
+ return;
+ for (GlobalVariable &GV : M.globals())
+ // Add linkage unit visibility to any variables with type metadata, which
+ // are the vtable definitions. We won't have existing vcall_visibility
+ // metadata on vtable definitions with public visibility.
+ if (GV.hasMetadata(LLVMContext::MD_type) &&
+ GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
+ GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
+}
+
+/// If whole program visibility asserted, then upgrade all public vcall
+/// visibility metadata on vtable definition summaries to linkage unit
+/// visibility in Module summary index (for ThinLTO).
+void updateVCallVisibilityInIndex(ModuleSummaryIndex &Index,
+ bool WholeProgramVisibilityEnabledInLTO) {
+ if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
+ return;
+ for (auto &P : Index) {
+ for (auto &S : P.second.SummaryList) {
+ auto *GVar = dyn_cast<GlobalVarSummary>(S.get());
+ if (!GVar || GVar->vTableFuncs().empty() ||
+ GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
+ continue;
+ GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
+ }
+ }
+}
+
void runWholeProgramDevirtOnIndex(
ModuleSummaryIndex &Summary, std::set<GlobalValue::GUID> &ExportedGUIDs,
std::map<ValueInfo, std::vector<VTableSlotSummary>> &LocalWPDTargetsMap) {
@@ -737,11 +837,27 @@ void updateIndexWPDForExports(
} // end namespace llvm
+static Error checkCombinedSummaryForTesting(ModuleSummaryIndex *Summary) {
+ // Check that the summary index contains a regular LTO module when performing
+ // an export, to prevent accidental use of an index from a pure ThinLTO
+ // compilation (-fno-split-lto-module). This kind of summary index is passed
+ // to DevirtIndex::run, not to the DevirtModule::run used by opt/runForTesting.
+ const auto &ModPaths = Summary->modulePaths();
+ if (ClSummaryAction != PassSummaryAction::Import &&
+ ModPaths.find(ModuleSummaryIndex::getRegularLTOModuleName()) ==
+ ModPaths.end())
+ return createStringError(
+ errc::invalid_argument,
+ "combined summary should contain Regular LTO module");
+ return ErrorSuccess();
+}
+
bool DevirtModule::runForTesting(
Module &M, function_ref<AAResults &(Function &)> AARGetter,
function_ref<OptimizationRemarkEmitter &(Function *)> OREGetter,
function_ref<DominatorTree &(Function &)> LookupDomTree) {
- ModuleSummaryIndex Summary(/*HaveGVs=*/false);
+ std::unique_ptr<ModuleSummaryIndex> Summary =
+ std::make_unique<ModuleSummaryIndex>(/*HaveGVs=*/false);
// Handle the command-line summary arguments. This code is for testing
// purposes only, so we handle errors directly.
@@ -750,28 +866,41 @@ bool DevirtModule::runForTesting(
": ");
auto ReadSummaryFile =
ExitOnErr(errorOrToExpected(MemoryBuffer::getFile(ClReadSummary)));
-
- yaml::Input In(ReadSummaryFile->getBuffer());
- In >> Summary;
- ExitOnErr(errorCodeToError(In.error()));
+ if (Expected<std::unique_ptr<ModuleSummaryIndex>> SummaryOrErr =
+ getModuleSummaryIndex(*ReadSummaryFile)) {
+ Summary = std::move(*SummaryOrErr);
+ ExitOnErr(checkCombinedSummaryForTesting(Summary.get()));
+ } else {
+ // Try YAML if we've failed with bitcode.
+ consumeError(SummaryOrErr.takeError());
+ yaml::Input In(ReadSummaryFile->getBuffer());
+ In >> *Summary;
+ ExitOnErr(errorCodeToError(In.error()));
+ }
}
bool Changed =
- DevirtModule(
- M, AARGetter, OREGetter, LookupDomTree,
- ClSummaryAction == PassSummaryAction::Export ? &Summary : nullptr,
- ClSummaryAction == PassSummaryAction::Import ? &Summary : nullptr)
+ DevirtModule(M, AARGetter, OREGetter, LookupDomTree,
+ ClSummaryAction == PassSummaryAction::Export ? Summary.get()
+ : nullptr,
+ ClSummaryAction == PassSummaryAction::Import ? Summary.get()
+ : nullptr)
.run();
if (!ClWriteSummary.empty()) {
ExitOnError ExitOnErr(
"-wholeprogramdevirt-write-summary: " + ClWriteSummary + ": ");
std::error_code EC;
- raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
- ExitOnErr(errorCodeToError(EC));
-
- yaml::Output Out(OS);
- Out << Summary;
+ if (StringRef(ClWriteSummary).endswith(".bc")) {
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_None);
+ ExitOnErr(errorCodeToError(EC));
+ WriteIndexToFile(*Summary, OS);
+ } else {
+ raw_fd_ostream OS(ClWriteSummary, EC, sys::fs::OF_Text);
+ ExitOnErr(errorCodeToError(EC));
+ yaml::Output Out(OS);
+ Out << *Summary;
+ }
}
return Changed;
@@ -818,6 +947,12 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (!TM.Bits->GV->isConstant())
return false;
+ // We cannot perform whole program devirtualization analysis on a vtable
+ // with public LTO visibility.
+ if (TM.Bits->GV->getVCallVisibility() ==
+ GlobalObject::VCallVisibilityPublic)
+ return false;
+
Constant *Ptr = getPointerAtOffset(TM.Bits->GV->getInitializer(),
TM.Offset + ByteOffset, M);
if (!Ptr)
@@ -827,6 +962,9 @@ bool DevirtModule::tryFindVirtualCallTargets(
if (!Fn)
return false;
+ if (FunctionsToSkip.match(Fn->getName()))
+ return false;
+
// We can disregard __cxa_pure_virtual as a possible call target, as
// calls to pure virtuals are UB.
if (Fn->getName() == "__cxa_pure_virtual")
@@ -863,8 +1001,13 @@ bool DevirtIndex::tryFindVirtualCallTargets(
return false;
LocalFound = true;
}
- if (!GlobalValue::isAvailableExternallyLinkage(S->linkage()))
+ if (!GlobalValue::isAvailableExternallyLinkage(S->linkage())) {
VS = cast<GlobalVarSummary>(S->getBaseObject());
+ // We cannot perform whole program devirtualization analysis on a vtable
+ // with public LTO visibility.
+ if (VS->getVCallVisibility() == GlobalObject::VCallVisibilityPublic)
+ return false;
+ }
}
if (!VS->isLive())
continue;
@@ -887,8 +1030,8 @@ void DevirtModule::applySingleImplDevirt(VTableSlotInfo &SlotInfo,
if (RemarksEnabled)
VCallSite.emitRemark("single-impl",
TheFn->stripPointerCasts()->getName(), OREGetter);
- VCallSite.CS.setCalledFunction(ConstantExpr::getBitCast(
- TheFn, VCallSite.CS.getCalledValue()->getType()));
+ VCallSite.CB.setCalledOperand(ConstantExpr::getBitCast(
+ TheFn, VCallSite.CB.getCalledOperand()->getType()));
// This use is no longer unsafe.
if (VCallSite.NumUnsafeUses)
--*VCallSite.NumUnsafeUses;
@@ -979,7 +1122,7 @@ bool DevirtModule::trySingleImplDevirt(
AddCalls(SlotInfo, TheFnVI);
Res->TheKind = WholeProgramDevirtResolution::SingleImpl;
- Res->SingleImplName = TheFn->getName();
+ Res->SingleImplName = std::string(TheFn->getName());
return true;
}
@@ -1001,6 +1144,11 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
if (!Size)
return false;
+ // Don't devirtualize the function if we're told to skip it
+ // via -wholeprogramdevirt-skip.
+ if (FunctionsToSkip.match(TheFn.name()))
+ return false;
+
// If the summary list contains multiple summaries where at least one is
// a local, give up, as we won't know which (possibly promoted) name to use.
for (auto &S : TheFn.getSummaryList())
@@ -1028,10 +1176,10 @@ bool DevirtIndex::trySingleImplDevirt(MutableArrayRef<ValueInfo> TargetsForSlot,
TheFn.name(), ExportSummary.getModuleHash(S->modulePath()));
else {
LocalWPDTargetsMap[TheFn].push_back(SlotSummary);
- Res->SingleImplName = TheFn.name();
+ Res->SingleImplName = std::string(TheFn.name());
}
} else
- Res->SingleImplName = TheFn.name();
+ Res->SingleImplName = std::string(TheFn.name());
// Name will be empty if this thin link is driven off of a serialized combined
// index (e.g. llvm-lto). However, WPD is not supported/invoked for the
@@ -1106,10 +1254,10 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
if (CSInfo.AllCallSitesDevirted)
return;
for (auto &&VCallSite : CSInfo.CallSites) {
- CallSite CS = VCallSite.CS;
+ CallBase &CB = VCallSite.CB;
// Jump tables are only profitable if the retpoline mitigation is enabled.
- Attribute FSAttr = CS.getCaller()->getFnAttribute("target-features");
+ Attribute FSAttr = CB.getCaller()->getFnAttribute("target-features");
if (FSAttr.hasAttribute(Attribute::None) ||
!FSAttr.getValueAsString().contains("+retpoline"))
continue;
@@ -1122,42 +1270,40 @@ void DevirtModule::applyICallBranchFunnel(VTableSlotInfo &SlotInfo,
// x86_64.
std::vector<Type *> NewArgs;
NewArgs.push_back(Int8PtrTy);
- for (Type *T : CS.getFunctionType()->params())
+ for (Type *T : CB.getFunctionType()->params())
NewArgs.push_back(T);
FunctionType *NewFT =
- FunctionType::get(CS.getFunctionType()->getReturnType(), NewArgs,
- CS.getFunctionType()->isVarArg());
+ FunctionType::get(CB.getFunctionType()->getReturnType(), NewArgs,
+ CB.getFunctionType()->isVarArg());
PointerType *NewFTPtr = PointerType::getUnqual(NewFT);
- IRBuilder<> IRB(CS.getInstruction());
+ IRBuilder<> IRB(&CB);
std::vector<Value *> Args;
Args.push_back(IRB.CreateBitCast(VCallSite.VTable, Int8PtrTy));
- for (unsigned I = 0; I != CS.getNumArgOperands(); ++I)
- Args.push_back(CS.getArgOperand(I));
+ Args.insert(Args.end(), CB.arg_begin(), CB.arg_end());
- CallSite NewCS;
- if (CS.isCall())
+ CallBase *NewCS = nullptr;
+ if (isa<CallInst>(CB))
NewCS = IRB.CreateCall(NewFT, IRB.CreateBitCast(JT, NewFTPtr), Args);
else
- NewCS = IRB.CreateInvoke(
- NewFT, IRB.CreateBitCast(JT, NewFTPtr),
- cast<InvokeInst>(CS.getInstruction())->getNormalDest(),
- cast<InvokeInst>(CS.getInstruction())->getUnwindDest(), Args);
- NewCS.setCallingConv(CS.getCallingConv());
+ NewCS = IRB.CreateInvoke(NewFT, IRB.CreateBitCast(JT, NewFTPtr),
+ cast<InvokeInst>(CB).getNormalDest(),
+ cast<InvokeInst>(CB).getUnwindDest(), Args);
+ NewCS->setCallingConv(CB.getCallingConv());
- AttributeList Attrs = CS.getAttributes();
+ AttributeList Attrs = CB.getAttributes();
std::vector<AttributeSet> NewArgAttrs;
NewArgAttrs.push_back(AttributeSet::get(
M.getContext(), ArrayRef<Attribute>{Attribute::get(
M.getContext(), Attribute::Nest)}));
for (unsigned I = 0; I + 2 < Attrs.getNumAttrSets(); ++I)
NewArgAttrs.push_back(Attrs.getParamAttributes(I));
- NewCS.setAttributes(
+ NewCS->setAttributes(
AttributeList::get(M.getContext(), Attrs.getFnAttributes(),
Attrs.getRetAttributes(), NewArgAttrs));
- CS->replaceAllUsesWith(NewCS.getInstruction());
- CS->eraseFromParent();
+ CB.replaceAllUsesWith(NewCS);
+ CB.eraseFromParent();
// This use is no longer unsafe.
if (VCallSite.NumUnsafeUses)
@@ -1208,7 +1354,7 @@ void DevirtModule::applyUniformRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
for (auto Call : CSInfo.CallSites)
Call.replaceAndErase(
"uniform-ret-val", FnName, RemarksEnabled, OREGetter,
- ConstantInt::get(cast<IntegerType>(Call.CS.getType()), TheRetVal));
+ ConstantInt::get(cast<IntegerType>(Call.CB.getType()), TheRetVal));
CSInfo.markDevirt();
}
@@ -1273,7 +1419,8 @@ void DevirtModule::exportConstant(VTableSlot Slot, ArrayRef<uint64_t> Args,
Constant *DevirtModule::importGlobal(VTableSlot Slot, ArrayRef<uint64_t> Args,
StringRef Name) {
- Constant *C = M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Ty);
+ Constant *C =
+ M.getOrInsertGlobal(getGlobalName(Slot, Args, Name), Int8Arr0Ty);
auto *GV = dyn_cast<GlobalVariable>(C);
if (GV)
GV->setVisibility(GlobalValue::HiddenVisibility);
@@ -1313,11 +1460,11 @@ void DevirtModule::applyUniqueRetValOpt(CallSiteInfo &CSInfo, StringRef FnName,
bool IsOne,
Constant *UniqueMemberAddr) {
for (auto &&Call : CSInfo.CallSites) {
- IRBuilder<> B(Call.CS.getInstruction());
+ IRBuilder<> B(&Call.CB);
Value *Cmp =
- B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE,
- B.CreateBitCast(Call.VTable, Int8PtrTy), UniqueMemberAddr);
- Cmp = B.CreateZExt(Cmp, Call.CS->getType());
+ B.CreateICmp(IsOne ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE, Call.VTable,
+ B.CreateBitCast(UniqueMemberAddr, Call.VTable->getType()));
+ Cmp = B.CreateZExt(Cmp, Call.CB.getType());
Call.replaceAndErase("unique-ret-val", FnName, RemarksEnabled, OREGetter,
Cmp);
}
@@ -1381,8 +1528,8 @@ bool DevirtModule::tryUniqueRetValOpt(
void DevirtModule::applyVirtualConstProp(CallSiteInfo &CSInfo, StringRef FnName,
Constant *Byte, Constant *Bit) {
for (auto Call : CSInfo.CallSites) {
- auto *RetType = cast<IntegerType>(Call.CS.getType());
- IRBuilder<> B(Call.CS.getInstruction());
+ auto *RetType = cast<IntegerType>(Call.CB.getType());
+ IRBuilder<> B(&Call.CB);
Value *Addr =
B.CreateGEP(Int8Ty, B.CreateBitCast(Call.VTable, Int8PtrTy), Byte);
if (RetType->getBitWidth() == 1) {
@@ -1507,10 +1654,8 @@ void DevirtModule::rebuildGlobal(VTableBits &B) {
// Align the before byte array to the global's minimum alignment so that we
// don't break any alignment requirements on the global.
- MaybeAlign Alignment(B.GV->getAlignment());
- if (!Alignment)
- Alignment =
- Align(M.getDataLayout().getABITypeAlignment(B.GV->getValueType()));
+ Align Alignment = M.getDataLayout().getValueOrABITypeAlignment(
+ B.GV->getAlign(), B.GV->getValueType());
B.Before.Bytes.resize(alignTo(B.Before.Bytes.size(), Alignment));
// Before was stored in reverse order; flip it now.
@@ -1562,13 +1707,14 @@ bool DevirtModule::areRemarksEnabled() {
return false;
}
-void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc) {
+void DevirtModule::scanTypeTestUsers(
+ Function *TypeTestFunc,
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> &TypeIdMap) {
// Find all virtual calls via a virtual table pointer %p under an assumption
// of the form llvm.assume(llvm.type.test(%p, %md)). This indicates that %p
// points to a member of the type identifier %md. Group calls by (type ID,
// offset) pair (effectively the identity of the virtual function) and store
// to CallSlots.
- DenseSet<CallSite> SeenCallSites;
for (auto I = TypeTestFunc->use_begin(), E = TypeTestFunc->use_end();
I != E;) {
auto CI = dyn_cast<CallInst>(I->getUser());
@@ -1582,29 +1728,59 @@ void DevirtModule::scanTypeTestUsers(Function *TypeTestFunc) {
auto &DT = LookupDomTree(*CI->getFunction());
findDevirtualizableCallsForTypeTest(DevirtCalls, Assumes, CI, DT);
+ Metadata *TypeId =
+ cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
// If we found any, add them to CallSlots.
if (!Assumes.empty()) {
- Metadata *TypeId =
- cast<MetadataAsValue>(CI->getArgOperand(1))->getMetadata();
Value *Ptr = CI->getArgOperand(0)->stripPointerCasts();
- for (DevirtCallSite Call : DevirtCalls) {
- // Only add this CallSite if we haven't seen it before. The vtable
- // pointer may have been CSE'd with pointers from other call sites,
- // and we don't want to process call sites multiple times. We can't
- // just skip the vtable Ptr if it has been seen before, however, since
- // it may be shared by type tests that dominate different calls.
- if (SeenCallSites.insert(Call.CS).second)
- CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS, nullptr);
- }
+ for (DevirtCallSite Call : DevirtCalls)
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB, nullptr);
}
- // We no longer need the assumes or the type test.
- for (auto Assume : Assumes)
- Assume->eraseFromParent();
- // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
- // may use the vtable argument later.
- if (CI->use_empty())
- CI->eraseFromParent();
+ auto RemoveTypeTestAssumes = [&]() {
+ // We no longer need the assumes or the type test.
+ for (auto Assume : Assumes)
+ Assume->eraseFromParent();
+ // We can't use RecursivelyDeleteTriviallyDeadInstructions here because we
+ // may use the vtable argument later.
+ if (CI->use_empty())
+ CI->eraseFromParent();
+ };
+
+ // At this point we could remove all type test assume sequences, as they
+ // were originally inserted for WPD. However, we can keep these in the
+ // code stream for later analysis (e.g. to help drive more efficient ICP
+ // sequences). They will eventually be removed by a second LowerTypeTests
+ // invocation that cleans them up. In order to do this correctly, the first
+ // LowerTypeTests invocation needs to know that they have "Unknown" type
+ // test resolution, so that they aren't treated as Unsat and lowered to
+ // False, which will break any uses on assumes. Below we remove any type
+ // test assumes that will not be treated as Unknown by LTT.
+
+ // The type test assumes will be treated by LTT as Unsat if the type id is
+ // not used on a global (in which case it has no entry in the TypeIdMap).
+ if (!TypeIdMap.count(TypeId))
+ RemoveTypeTestAssumes();
+
+ // For ThinLTO importing, we need to remove the type test assumes if this is
+ // an MDString type id without a corresponding TypeIdSummary. Any
+ // non-MDString type ids are ignored and treated as Unknown by LTT, so their
+ // type test assumes can be kept. If the MDString type id is missing a
+ // TypeIdSummary (e.g. because there was no use on a vcall, preventing the
+ // exporting phase of WPD from analyzing it), then it would be treated as
+ // Unsat by LTT and we need to remove its type test assumes here. If not
+ // used on a vcall we don't need them for later optimization use in any
+ // case.
+ else if (ImportSummary && isa<MDString>(TypeId)) {
+ const TypeIdSummary *TidSummary =
+ ImportSummary->getTypeIdSummary(cast<MDString>(TypeId)->getString());
+ if (!TidSummary)
+ RemoveTypeTestAssumes();
+ else
+ // If one was created it should not be Unsat, because if we reached here
+ // the type id was used on a global.
+ assert(TidSummary->TTRes.TheKind != TypeTestResolution::Unsat);
+ }
}
}
@@ -1680,7 +1856,7 @@ void DevirtModule::scanTypeCheckedLoadUsers(Function *TypeCheckedLoadFunc) {
if (HasNonCallUses)
++NumUnsafeUses;
for (DevirtCallSite Call : DevirtCalls) {
- CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CS,
+ CallSlots[{TypeId, Call.Offset}].addCallSite(Ptr, Call.CB,
&NumUnsafeUses);
}
@@ -1796,8 +1972,13 @@ bool DevirtModule::run() {
(!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()))
return false;
+ // Rebuild type metadata into a map for easy lookup.
+ std::vector<VTableBits> Bits;
+ DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
+ buildTypeIdentifierMap(Bits, TypeIdMap);
+
if (TypeTestFunc && AssumeFunc)
- scanTypeTestUsers(TypeTestFunc);
+ scanTypeTestUsers(TypeTestFunc, TypeIdMap);
if (TypeCheckedLoadFunc)
scanTypeCheckedLoadUsers(TypeCheckedLoadFunc);
@@ -1808,15 +1989,17 @@ bool DevirtModule::run() {
removeRedundantTypeTests();
+ // We have lowered or deleted the type intrinsics, so we will no
+ // longer have enough information to reason about the liveness of virtual
+ // function pointers in GlobalDCE.
+ for (GlobalVariable &GV : M.globals())
+ GV.eraseMetadata(LLVMContext::MD_vcall_visibility);
+
// The rest of the code is only necessary when exporting or during regular
// LTO, so we are done.
return true;
}
- // Rebuild type metadata into a map for easy lookup.
- std::vector<VTableBits> Bits;
- DenseMap<Metadata *, std::set<TypeMemberInfo>> TypeIdMap;
- buildTypeIdentifierMap(Bits, TypeIdMap);
if (TypeIdMap.empty())
return true;
@@ -1873,14 +2056,22 @@ bool DevirtModule::run() {
// function implementation at offset S.first.ByteOffset, and add to
// TargetsForSlot.
std::vector<VirtualCallTarget> TargetsForSlot;
- if (tryFindVirtualCallTargets(TargetsForSlot, TypeIdMap[S.first.TypeID],
+ WholeProgramDevirtResolution *Res = nullptr;
+ const std::set<TypeMemberInfo> &TypeMemberInfos = TypeIdMap[S.first.TypeID];
+ if (ExportSummary && isa<MDString>(S.first.TypeID) &&
+ TypeMemberInfos.size())
+ // For any type id used on a global's type metadata, create the type id
+ // summary resolution regardless of whether we can devirtualize, so that
+ // lower type tests knows the type id is not Unsat. If it was not used on
+ // a global's type metadata, the TypeIdMap entry set will be empty, and
+ // we don't want to create an entry (with the default Unknown type
+ // resolution), which can prevent detection of the Unsat.
+ Res = &ExportSummary
+ ->getOrInsertTypeIdSummary(
+ cast<MDString>(S.first.TypeID)->getString())
+ .WPDRes[S.first.ByteOffset];
+ if (tryFindVirtualCallTargets(TargetsForSlot, TypeMemberInfos,
S.first.ByteOffset)) {
- WholeProgramDevirtResolution *Res = nullptr;
- if (ExportSummary && isa<MDString>(S.first.TypeID))
- Res = &ExportSummary
- ->getOrInsertTypeIdSummary(
- cast<MDString>(S.first.TypeID)->getString())
- .WPDRes[S.first.ByteOffset];
if (!trySingleImplDevirt(ExportSummary, TargetsForSlot, S.second, Res)) {
DidVirtualConstProp |=
@@ -1893,7 +2084,7 @@ bool DevirtModule::run() {
if (RemarksEnabled)
for (const auto &T : TargetsForSlot)
if (T.WasDevirt)
- DevirtTargets[T.Fn->getName()] = T.Fn;
+ DevirtTargets[std::string(T.Fn->getName())] = T.Fn;
}
// CFI-specific: if we are exporting and any llvm.type.checked.load
@@ -1931,7 +2122,7 @@ bool DevirtModule::run() {
for (VTableBits &B : Bits)
rebuildGlobal(B);
- // We have lowered or deleted the type checked load intrinsics, so we no
+ // We have lowered or deleted the type intrinsics, so we will no
// longer have enough information to reason about the liveness of virtual
// function pointers in GlobalDCE.
for (GlobalVariable &GV : M.globals())
@@ -1994,11 +2185,14 @@ void DevirtIndex::run() {
std::vector<ValueInfo> TargetsForSlot;
auto TidSummary = ExportSummary.getTypeIdCompatibleVtableSummary(S.first.TypeID);
assert(TidSummary);
+ // Create the type id summary resolution regardless of whether we can
+ // devirtualize, so that lower type tests knows the type id is used on
+ // a global and not Unsat.
+ WholeProgramDevirtResolution *Res =
+ &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID)
+ .WPDRes[S.first.ByteOffset];
if (tryFindVirtualCallTargets(TargetsForSlot, *TidSummary,
S.first.ByteOffset)) {
- WholeProgramDevirtResolution *Res =
- &ExportSummary.getOrInsertTypeIdSummary(S.first.TypeID)
- .WPDRes[S.first.ByteOffset];
if (!trySingleImplDevirt(TargetsForSlot, S.first, S.second, Res,
DevirtTargets))
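
(Aside: a minimal sketch of the glob-based skip list that -wholeprogramdevirt-skip feeds into above, assuming the llvm/Support/GlobPattern.h API; the free function is a hypothetical stand-in for PatternList::init/match.)

    #include "llvm/ADT/ArrayRef.h"
    #include "llvm/ADT/StringRef.h"
    #include "llvm/Support/Error.h"
    #include "llvm/Support/GlobPattern.h"
    #include <string>
    #include <vector>

    using namespace llvm;

    // Compile each glob (e.g. "_ZN4Base*") once, then test a function name
    // against the compiled patterns; invalid globs are dropped after the
    // error is consumed.
    static bool shouldSkipFunction(ArrayRef<std::string> Globs, StringRef FnName) {
      std::vector<GlobPattern> Patterns;
      for (const std::string &S : Globs) {
        if (Expected<GlobPattern> Pat = GlobPattern::create(S))
          Patterns.push_back(std::move(*Pat));
        else
          consumeError(Pat.takeError());
      }
      for (const GlobPattern &P : Patterns)
        if (P.match(FnName))
          return true;
      return false;
    }
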
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
index ec976a971e3c..a7f5e0a7774d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAddSub.cpp
@@ -270,7 +270,7 @@ void FAddendCoef::operator=(const FAddendCoef &That) {
}
void FAddendCoef::operator+=(const FAddendCoef &That) {
- enum APFloat::roundingMode RndMode = APFloat::rmNearestTiesToEven;
+ RoundingMode RndMode = RoundingMode::NearestTiesToEven;
if (isInt() == That.isInt()) {
if (isInt())
IntVal += That.IntVal;
@@ -663,8 +663,7 @@ Value *FAddCombine::createFSub(Value *Opnd0, Value *Opnd1) {
}
Value *FAddCombine::createFNeg(Value *V) {
- Value *Zero = cast<Value>(ConstantFP::getZeroValueForNegation(V->getType()));
- Value *NewV = createFSub(Zero, V);
+ Value *NewV = Builder.CreateFNeg(V);
if (Instruction *I = dyn_cast<Instruction>(NewV))
createInstPostProc(I, true); // fneg's don't receive instruction numbers.
return NewV;
@@ -724,8 +723,6 @@ unsigned FAddCombine::calcInstrNumber(const AddendVect &Opnds) {
if (!CE.isMinusOne() && !CE.isOne())
InstrNeeded++;
}
- if (NegOpndNum == OpndNum)
- InstrNeeded++;
return InstrNeeded;
}
@@ -1044,8 +1041,7 @@ Value *InstCombiner::SimplifyAddWithRemainder(BinaryOperator &I) {
// Match RemOpV = X / C0
if (MatchDiv(RemOpV, DivOpV, DivOpC, IsSigned) && X == DivOpV &&
C0 == DivOpC && !MulWillOverflow(C0, C1, IsSigned)) {
- Value *NewDivisor =
- ConstantInt::get(X->getType()->getContext(), C0 * C1);
+ Value *NewDivisor = ConstantInt::get(X->getType(), C0 * C1);
return IsSigned ? Builder.CreateSRem(X, NewDivisor, "srem")
: Builder.CreateURem(X, NewDivisor, "urem");
}
@@ -1307,9 +1303,28 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
match(&I, m_BinOp(m_c_Add(m_Not(m_Value(B)), m_Value(A)), m_One())))
return BinaryOperator::CreateSub(A, B);
+ // (A + RHS) + RHS --> A + (RHS << 1)
+ if (match(LHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(RHS)))))
+ return BinaryOperator::CreateAdd(A, Builder.CreateShl(RHS, 1, "reass.add"));
+
+ // LHS + (A + LHS) --> A + (LHS << 1)
+ if (match(RHS, m_OneUse(m_c_Add(m_Value(A), m_Specific(LHS)))))
+ return BinaryOperator::CreateAdd(A, Builder.CreateShl(LHS, 1, "reass.add"));
+
// X % C0 + (( X / C0 ) % C1) * C0 => X % (C0 * C1)
if (Value *V = SimplifyAddWithRemainder(I)) return replaceInstUsesWith(I, V);
+ // ((X s/ C1) << C2) + X => X s% -C1 where -C1 is 1 << C2
+ const APInt *C1, *C2;
+ if (match(LHS, m_Shl(m_SDiv(m_Specific(RHS), m_APInt(C1)), m_APInt(C2)))) {
+ APInt one(C2->getBitWidth(), 1);
+ APInt minusC1 = -(*C1);
+ if (minusC1 == (one << *C2)) {
+ Constant *NewRHS = ConstantInt::get(RHS->getType(), minusC1);
+ return BinaryOperator::CreateSRem(RHS, NewRHS);
+ }
+ }
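As a quick sanity check of the srem fold above, here is a scalar sketch (illustrative only, not part of the patch; the function name and the choice C2 = 3, C1 = -8 are hypothetical). Like LLVM's sdiv/srem, C++ integer division and remainder truncate toward zero:

#include <cassert>

// ((X s/ C1) << C2) + X  ==  X s% -C1   with C1 = -8, C2 = 3, so -C1 == 1 << C2.
// The multiply by 8 stands in for the left shift so no negative value is
// shifted; the identity holds for every int except INT_MIN, where the C++
// intermediate would overflow.
int sremFoldExample(int X) {
  int Folded = (X / -8) * 8 + X; // ((X s/ C1) << C2) + X
  assert(Folded == X % 8);       // X s% -C1
  return Folded;
}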
+
// A+B --> A|B iff A and B have no bits set in common.
if (haveNoCommonBitsSet(LHS, RHS, DL, &AC, &I, &DT))
return BinaryOperator::CreateOr(LHS, RHS);
@@ -1380,8 +1395,9 @@ Instruction *InstCombiner::visitAdd(BinaryOperator &I) {
// (add (and A, B) (or A, B)) --> (add A, B)
if (match(&I, m_c_BinOp(m_Or(m_Value(A), m_Value(B)),
m_c_And(m_Deferred(A), m_Deferred(B))))) {
- I.setOperand(0, A);
- I.setOperand(1, B);
+ // Replacing operands in-place to preserve nuw/nsw flags.
+ replaceOperand(I, 0, A);
+ replaceOperand(I, 1, B);
return &I;
}
@@ -1685,12 +1701,10 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (Instruction *X = foldVectorBinop(I))
return X;
- // (A*B)-(A*C) -> A*(B-C) etc
- if (Value *V = SimplifyUsingDistributiveLaws(I))
- return replaceInstUsesWith(I, V);
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// If this is a 'B = x-(-A)', change to B = x+A.
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ // We deal with this without involving Negator to preserve NSW flag.
if (Value *V = dyn_castNegVal(Op1)) {
BinaryOperator *Res = BinaryOperator::CreateAdd(Op0, V);
@@ -1707,6 +1721,45 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return Res;
}
+ auto TryToNarrowDeduceFlags = [this, &I, &Op0, &Op1]() -> Instruction * {
+ if (Instruction *Ext = narrowMathIfNoOverflow(I))
+ return Ext;
+
+ bool Changed = false;
+ if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoSignedWrap(true);
+ }
+ if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
+ Changed = true;
+ I.setHasNoUnsignedWrap(true);
+ }
+
+ return Changed ? &I : nullptr;
+ };
+
+ // First, let's try to interpret `sub a, b` as `add a, (sub 0, b)`,
+ // and let's try to sink `(sub 0, b)` into `b` itself. But only if this isn't
+ // a pure negation used by a select that looks like abs/nabs.
+ bool IsNegation = match(Op0, m_ZeroInt());
+ if (!IsNegation || none_of(I.users(), [&I, Op1](const User *U) {
+ const Instruction *UI = dyn_cast<Instruction>(U);
+ if (!UI)
+ return false;
+ return match(UI,
+ m_Select(m_Value(), m_Specific(Op1), m_Specific(&I))) ||
+ match(UI, m_Select(m_Value(), m_Specific(&I), m_Specific(Op1)));
+ })) {
+ if (Value *NegOp1 = Negator::Negate(IsNegation, Op1, *this))
+ return BinaryOperator::CreateAdd(NegOp1, Op0);
+ }
+ if (IsNegation)
+ return TryToNarrowDeduceFlags(); // Should have been handled in Negator!
+
+ // (A*B)-(A*C) -> A*(B-C) etc
+ if (Value *V = SimplifyUsingDistributiveLaws(I))
+ return replaceInstUsesWith(I, V);
+
if (I.getType()->isIntOrIntVectorTy(1))
return BinaryOperator::CreateXor(Op0, Op1);
@@ -1723,33 +1776,40 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
if (match(Op0, m_OneUse(m_Add(m_Value(X), m_AllOnes()))))
return BinaryOperator::CreateAdd(Builder.CreateNot(Op1), X);
- // Y - (X + 1) --> ~X + Y
- if (match(Op1, m_OneUse(m_Add(m_Value(X), m_One()))))
- return BinaryOperator::CreateAdd(Builder.CreateNot(X), Op0);
+ // Reassociate sub/add sequences to create more add instructions and
+ // reduce dependency chains:
+ // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+ Value *Z;
+ if (match(Op0, m_OneUse(m_c_Add(m_OneUse(m_Sub(m_Value(X), m_Value(Y))),
+ m_Value(Z))))) {
+ Value *XZ = Builder.CreateAdd(X, Z);
+ Value *YW = Builder.CreateAdd(Y, Op1);
+ return BinaryOperator::CreateSub(XZ, YW);
+ }
- // Y - ~X --> (X + 1) + Y
- if (match(Op1, m_OneUse(m_Not(m_Value(X))))) {
- return BinaryOperator::CreateAdd(
- Builder.CreateAdd(Op0, ConstantInt::get(I.getType(), 1)), X);
+ auto m_AddRdx = [](Value *&Vec) {
+ return m_OneUse(
+ m_Intrinsic<Intrinsic::experimental_vector_reduce_add>(m_Value(Vec)));
+ };
+ Value *V0, *V1;
+ if (match(Op0, m_AddRdx(V0)) && match(Op1, m_AddRdx(V1)) &&
+ V0->getType() == V1->getType()) {
+ // Difference of sums is sum of differences:
+ // add_rdx(V0) - add_rdx(V1) --> add_rdx(V0 - V1)
+ Value *Sub = Builder.CreateSub(V0, V1);
+ Value *Rdx = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_reduce_add, {Sub->getType()}, {Sub});
+ return replaceInstUsesWith(I, Rdx);
}
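The vector-reduce fold relies on a difference of sums being equal to the sum of element-wise differences, so only one horizontal reduction remains. A scalar model (hypothetical helper, not from the patch):

#include <cassert>
#include <numeric>
#include <vector>

// reduce_add(V0) - reduce_add(V1) == reduce_add(V0 - V1)
int reduceAddDifference(const std::vector<int> &V0, const std::vector<int> &V1) {
  assert(V0.size() == V1.size());
  std::vector<int> Diff(V0.size());
  for (size_t I = 0; I != V0.size(); ++I)
    Diff[I] = V0[I] - V1[I];
  int TwoReductions = std::accumulate(V0.begin(), V0.end(), 0) -
                      std::accumulate(V1.begin(), V1.end(), 0);
  int OneReduction = std::accumulate(Diff.begin(), Diff.end(), 0);
  assert(TwoReductions == OneReduction);
  return OneReduction;
}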
if (Constant *C = dyn_cast<Constant>(Op0)) {
- bool IsNegate = match(C, m_ZeroInt());
Value *X;
- if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- // 0 - (zext bool) --> sext bool
+ if (match(Op1, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
// C - (zext bool) --> bool ? C - 1 : C
- if (IsNegate)
- return CastInst::CreateSExtOrBitCast(X, I.getType());
return SelectInst::Create(X, SubOne(C), C);
- }
- if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1)) {
- // 0 - (sext bool) --> zext bool
+ if (match(Op1, m_SExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
// C - (sext bool) --> bool ? C + 1 : C
- if (IsNegate)
- return CastInst::CreateZExtOrBitCast(X, I.getType());
return SelectInst::Create(X, AddOne(C), C);
- }
// C - ~X == X + (1+C)
if (match(Op1, m_Not(m_Value(X))))
@@ -1768,7 +1828,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
Constant *C2;
// C-(C2-X) --> X+(C-C2)
- if (match(Op1, m_Sub(m_Constant(C2), m_Value(X))))
+ if (match(Op1, m_Sub(m_Constant(C2), m_Value(X))) && !isa<ConstantExpr>(C2))
return BinaryOperator::CreateAdd(X, ConstantExpr::getSub(C, C2));
// C-(X+C2) --> (C-C2)-X
@@ -1777,62 +1837,12 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
}
const APInt *Op0C;
- if (match(Op0, m_APInt(Op0C))) {
-
- if (Op0C->isNullValue()) {
- Value *Op1Wide;
- match(Op1, m_TruncOrSelf(m_Value(Op1Wide)));
- bool HadTrunc = Op1Wide != Op1;
- bool NoTruncOrTruncIsOneUse = !HadTrunc || Op1->hasOneUse();
- unsigned BitWidth = Op1Wide->getType()->getScalarSizeInBits();
-
- Value *X;
- const APInt *ShAmt;
- // -(X >>u 31) -> (X >>s 31)
- if (NoTruncOrTruncIsOneUse &&
- match(Op1Wide, m_LShr(m_Value(X), m_APInt(ShAmt))) &&
- *ShAmt == BitWidth - 1) {
- Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1);
- Instruction *NewShift = BinaryOperator::CreateAShr(X, ShAmtOp);
- NewShift->copyIRFlags(Op1Wide);
- if (!HadTrunc)
- return NewShift;
- Builder.Insert(NewShift);
- return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType());
- }
- // -(X >>s 31) -> (X >>u 31)
- if (NoTruncOrTruncIsOneUse &&
- match(Op1Wide, m_AShr(m_Value(X), m_APInt(ShAmt))) &&
- *ShAmt == BitWidth - 1) {
- Value *ShAmtOp = cast<Instruction>(Op1Wide)->getOperand(1);
- Instruction *NewShift = BinaryOperator::CreateLShr(X, ShAmtOp);
- NewShift->copyIRFlags(Op1Wide);
- if (!HadTrunc)
- return NewShift;
- Builder.Insert(NewShift);
- return TruncInst::CreateTruncOrBitCast(NewShift, Op1->getType());
- }
-
- if (!HadTrunc && Op1->hasOneUse()) {
- Value *LHS, *RHS;
- SelectPatternFlavor SPF = matchSelectPattern(Op1, LHS, RHS).Flavor;
- if (SPF == SPF_ABS || SPF == SPF_NABS) {
- // This is a negate of an ABS/NABS pattern. Just swap the operands
- // of the select.
- cast<SelectInst>(Op1)->swapValues();
- // Don't swap prof metadata, we didn't change the branch behavior.
- return replaceInstUsesWith(I, Op1);
- }
- }
- }
-
+ if (match(Op0, m_APInt(Op0C)) && Op0C->isMask()) {
// Turn this into a xor if LHS is 2^n-1 and the remaining bits are known
// zero.
- if (Op0C->isMask()) {
- KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
- if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
- return BinaryOperator::CreateXor(Op1, Op0);
- }
+ KnownBits RHSKnown = computeKnownBits(Op1, 0, &I);
+ if ((*Op0C | RHSKnown.Zero).isAllOnesValue())
+ return BinaryOperator::CreateXor(Op1, Op0);
}
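To see why the mask case is a plain xor: when the LHS is 2^n - 1 and the RHS has no bits outside that mask, the subtraction never borrows, so each masked bit is simply inverted. A small illustrative check (the 0xFF constant is an arbitrary example mask, not from the patch):

#include <cassert>
#include <cstdint>

uint32_t maskSubAsXor(uint32_t X) {
  const uint32_t Mask = 0xFF; // 2^8 - 1
  X &= Mask;                  // models "the remaining bits are known zero"
  assert(Mask - X == (Mask ^ X));
  return Mask ^ X;
}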
{
@@ -1956,71 +1966,11 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
return NewSel;
}
- if (Op1->hasOneUse()) {
- Value *X = nullptr, *Y = nullptr, *Z = nullptr;
- Constant *C = nullptr;
-
- // (X - (Y - Z)) --> (X + (Z - Y)).
- if (match(Op1, m_Sub(m_Value(Y), m_Value(Z))))
- return BinaryOperator::CreateAdd(Op0,
- Builder.CreateSub(Z, Y, Op1->getName()));
-
- // (X - (X & Y)) --> (X & ~Y)
- if (match(Op1, m_c_And(m_Value(Y), m_Specific(Op0))))
- return BinaryOperator::CreateAnd(Op0,
- Builder.CreateNot(Y, Y->getName() + ".not"));
-
- // 0 - (X sdiv C) -> (X sdiv -C) provided the negation doesn't overflow.
- if (match(Op0, m_Zero())) {
- Constant *Op11C;
- if (match(Op1, m_SDiv(m_Value(X), m_Constant(Op11C))) &&
- !Op11C->containsUndefElement() && Op11C->isNotMinSignedValue() &&
- Op11C->isNotOneValue()) {
- Instruction *BO =
- BinaryOperator::CreateSDiv(X, ConstantExpr::getNeg(Op11C));
- BO->setIsExact(cast<BinaryOperator>(Op1)->isExact());
- return BO;
- }
- }
-
- // 0 - (X << Y) -> (-X << Y) when X is freely negatable.
- if (match(Op1, m_Shl(m_Value(X), m_Value(Y))) && match(Op0, m_Zero()))
- if (Value *XNeg = dyn_castNegVal(X))
- return BinaryOperator::CreateShl(XNeg, Y);
-
- // Subtracting -1/0 is the same as adding 1/0:
- // sub [nsw] Op0, sext(bool Y) -> add [nsw] Op0, zext(bool Y)
- // 'nuw' is dropped in favor of the canonical form.
- if (match(Op1, m_SExt(m_Value(Y))) &&
- Y->getType()->getScalarSizeInBits() == 1) {
- Value *Zext = Builder.CreateZExt(Y, I.getType());
- BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Zext);
- Add->setHasNoSignedWrap(I.hasNoSignedWrap());
- return Add;
- }
- // sub [nsw] X, zext(bool Y) -> add [nsw] X, sext(bool Y)
- // 'nuw' is dropped in favor of the canonical form.
- if (match(Op1, m_ZExt(m_Value(Y))) && Y->getType()->isIntOrIntVectorTy(1)) {
- Value *Sext = Builder.CreateSExt(Y, I.getType());
- BinaryOperator *Add = BinaryOperator::CreateAdd(Op0, Sext);
- Add->setHasNoSignedWrap(I.hasNoSignedWrap());
- return Add;
- }
-
- // X - A*-B -> X + A*B
- // X - -A*B -> X + A*B
- Value *A, *B;
- if (match(Op1, m_c_Mul(m_Value(A), m_Neg(m_Value(B)))))
- return BinaryOperator::CreateAdd(Op0, Builder.CreateMul(A, B));
-
- // X - A*C -> X + A*-C
- // No need to handle commuted multiply because multiply handling will
- // ensure constant will be move to the right hand side.
- if (match(Op1, m_Mul(m_Value(A), m_Constant(C))) && !isa<ConstantExpr>(C)) {
- Value *NewMul = Builder.CreateMul(A, ConstantExpr::getNeg(C));
- return BinaryOperator::CreateAdd(Op0, NewMul);
- }
- }
+ // (X - (X & Y)) --> (X & ~Y)
+ if (match(Op1, m_c_And(m_Specific(Op0), m_Value(Y))) &&
+ (Op1->hasOneUse() || isa<Constant>(Y)))
+ return BinaryOperator::CreateAnd(
+ Op0, Builder.CreateNot(Y, Y->getName() + ".not"));
{
// ~A - Min/Max(~A, O) -> Max/Min(A, ~O) - A
@@ -2096,20 +2046,7 @@ Instruction *InstCombiner::visitSub(BinaryOperator &I) {
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
return V;
- if (Instruction *Ext = narrowMathIfNoOverflow(I))
- return Ext;
-
- bool Changed = false;
- if (!I.hasNoSignedWrap() && willNotOverflowSignedSub(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoSignedWrap(true);
- }
- if (!I.hasNoUnsignedWrap() && willNotOverflowUnsignedSub(Op0, Op1, I)) {
- Changed = true;
- I.setHasNoUnsignedWrap(true);
- }
-
- return Changed ? &I : nullptr;
+ return TryToNarrowDeduceFlags();
}
/// This eliminates floating-point negation in either 'fneg(X)' or
@@ -2132,6 +2069,12 @@ static Instruction *foldFNegIntoConstant(Instruction &I) {
if (match(&I, m_FNeg(m_OneUse(m_FDiv(m_Constant(C), m_Value(X))))))
return BinaryOperator::CreateFDivFMF(ConstantExpr::getFNeg(C), X, &I);
+ // With NSZ [ counter-example with -0.0: -(-0.0 + 0.0) != 0.0 + -0.0 ]:
+ // -(X + C) --> -X + -C --> -C - X
+ if (I.hasNoSignedZeros() &&
+ match(&I, m_FNeg(m_OneUse(m_FAdd(m_Value(X), m_Constant(C))))))
+ return BinaryOperator::CreateFSubFMF(ConstantExpr::getFNeg(C), X, &I);
+
return nullptr;
}
@@ -2184,10 +2127,15 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
return X;
// Subtraction from -0.0 is the canonical form of fneg.
- // fsub nsz 0, X ==> fsub nsz -0.0, X
- Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
- if (I.hasNoSignedZeros() && match(Op0, m_PosZeroFP()))
- return BinaryOperator::CreateFNegFMF(Op1, &I);
+ // fsub -0.0, X ==> fneg X
+ // fsub nsz 0.0, X ==> fneg nsz X
+ //
+ // FIXME This matcher does not respect FTZ or DAZ yet:
+ // fsub -0.0, Denorm ==> +-0
+ // fneg Denorm ==> -Denorm
+ Value *Op;
+ if (match(&I, m_FNeg(m_Value(Op))))
+ return UnaryOperator::CreateFNegFMF(Op, &I);
if (Instruction *X = foldFNegIntoConstant(I))
return X;
@@ -2198,6 +2146,7 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
Value *X, *Y;
Constant *C;
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// If Op0 is not -0.0 or we can ignore -0.0: Z - (X - Y) --> Z + (Y - X)
// Canonicalize to fadd to make analysis easier.
// This can also help codegen because fadd is commutative.
@@ -2211,6 +2160,13 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
}
}
+ // (-X) - Op1 --> -(X + Op1)
+ if (I.hasNoSignedZeros() && !isa<ConstantExpr>(Op0) &&
+ match(Op0, m_OneUse(m_FNeg(m_Value(X))))) {
+ Value *FAdd = Builder.CreateFAddFMF(X, Op1, &I);
+ return UnaryOperator::CreateFNegFMF(FAdd, &I);
+ }
+
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
if (Instruction *NV = FoldOpIntoSelect(I, SI))
@@ -2258,12 +2214,12 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
if (I.hasAllowReassoc() && I.hasNoSignedZeros()) {
// (Y - X) - Y --> -X
if (match(Op0, m_FSub(m_Specific(Op1), m_Value(X))))
- return BinaryOperator::CreateFNegFMF(X, &I);
+ return UnaryOperator::CreateFNegFMF(X, &I);
// Y - (X + Y) --> -X
// Y - (Y + X) --> -X
if (match(Op1, m_c_FAdd(m_Specific(Op0), m_Value(X))))
- return BinaryOperator::CreateFNegFMF(X, &I);
+ return UnaryOperator::CreateFNegFMF(X, &I);
// (X * C) - X --> X * (C - 1.0)
if (match(Op0, m_FMul(m_Specific(Op1), m_Constant(C)))) {
@@ -2276,6 +2232,34 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
return BinaryOperator::CreateFMulFMF(Op0, OneSubC, &I);
}
+ // Reassociate fsub/fadd sequences to create more fadd instructions and
+ // reduce dependency chains:
+ // ((X - Y) + Z) - Op1 --> (X + Z) - (Y + Op1)
+ Value *Z;
+ if (match(Op0, m_OneUse(m_c_FAdd(m_OneUse(m_FSub(m_Value(X), m_Value(Y))),
+ m_Value(Z))))) {
+ Value *XZ = Builder.CreateFAddFMF(X, Z, &I);
+ Value *YW = Builder.CreateFAddFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFSubFMF(XZ, YW, &I);
+ }
+
+ auto m_FaddRdx = [](Value *&Sum, Value *&Vec) {
+ return m_OneUse(
+ m_Intrinsic<Intrinsic::experimental_vector_reduce_v2_fadd>(
+ m_Value(Sum), m_Value(Vec)));
+ };
+ Value *A0, *A1, *V0, *V1;
+ if (match(Op0, m_FaddRdx(A0, V0)) && match(Op1, m_FaddRdx(A1, V1)) &&
+ V0->getType() == V1->getType()) {
+ // Difference of sums is sum of differences:
+ // add_rdx(A0, V0) - add_rdx(A1, V1) --> add_rdx(A0, V0 - V1) - A1
+ Value *Sub = Builder.CreateFSubFMF(V0, V1, &I);
+ Value *Rdx = Builder.CreateIntrinsic(
+ Intrinsic::experimental_vector_reduce_v2_fadd,
+ {A0->getType(), Sub->getType()}, {A0, Sub}, &I);
+ return BinaryOperator::CreateFSubFMF(Rdx, A1, &I);
+ }
+
if (Instruction *F = factorizeFAddFSub(I, Builder))
return F;
@@ -2285,6 +2269,12 @@ Instruction *InstCombiner::visitFSub(BinaryOperator &I) {
// complex pattern matching and remove this from InstCombine.
if (Value *V = FAddCombine(Builder).simplify(&I))
return replaceInstUsesWith(I, V);
+
+ // (X - Y) - Op1 --> X - (Y + Op1)
+ if (match(Op0, m_OneUse(m_FSub(m_Value(X), m_Value(Y))))) {
+ Value *FAdd = Builder.CreateFAddFMF(Y, Op1, &I);
+ return BinaryOperator::CreateFSubFMF(X, FAdd, &I);
+ }
}
return nullptr;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index cc0a9127f8b1..d3c718a919c0 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -143,8 +143,7 @@ Instruction *InstCombiner::OptAndOp(BinaryOperator *Op,
// the XOR is to toggle the bit. If it is clear, then the ADD has
// no effect.
if ((AddRHS & AndRHSV).isNullValue()) { // Bit is not set, noop
- TheAnd.setOperand(0, X);
- return &TheAnd;
+ return replaceOperand(TheAnd, 0, X);
} else {
// Pull the XOR out of the AND.
Value *NewAnd = Builder.CreateAnd(X, AndRHS);
@@ -858,8 +857,10 @@ foldAndOrOfEqualityCmpsWithConstants(ICmpInst *LHS, ICmpInst *RHS,
// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
- bool JoinedByAnd,
- Instruction &CxtI) {
+ BinaryOperator &Logic) {
+ bool JoinedByAnd = Logic.getOpcode() == Instruction::And;
+ assert((JoinedByAnd || Logic.getOpcode() == Instruction::Or) &&
+ "Wrong opcode");
ICmpInst::Predicate Pred = LHS->getPredicate();
if (Pred != RHS->getPredicate())
return nullptr;
@@ -883,8 +884,8 @@ Value *InstCombiner::foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
std::swap(A, B);
if (A == C &&
- isKnownToBeAPowerOfTwo(B, false, 0, &CxtI) &&
- isKnownToBeAPowerOfTwo(D, false, 0, &CxtI)) {
+ isKnownToBeAPowerOfTwo(B, false, 0, &Logic) &&
+ isKnownToBeAPowerOfTwo(D, false, 0, &Logic)) {
Value *Mask = Builder.CreateOr(B, D);
Value *Masked = Builder.CreateAnd(A, Mask);
auto NewPred = JoinedByAnd ? ICmpInst::ICMP_EQ : ICmpInst::ICMP_NE;
@@ -1072,9 +1073,6 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
m_c_ICmp(UnsignedPred, m_Specific(ZeroCmpOp), m_Value(A))) &&
match(ZeroCmpOp, m_c_Add(m_Specific(A), m_Value(B))) &&
(ZeroICmp->hasOneUse() || UnsignedICmp->hasOneUse())) {
- if (UnsignedICmp->getOperand(0) != ZeroCmpOp)
- UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
-
auto GetKnownNonZeroAndOther = [&](Value *&NonZero, Value *&Other) {
if (!IsKnownNonZero(NonZero))
std::swap(NonZero, Other);
@@ -1111,8 +1109,6 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
m_c_ICmp(UnsignedPred, m_Specific(Base), m_Specific(Offset))) ||
!ICmpInst::isUnsigned(UnsignedPred))
return nullptr;
- if (UnsignedICmp->getOperand(0) != Base)
- UnsignedPred = ICmpInst::getSwappedPredicate(UnsignedPred);
// Base >=/> Offset && (Base - Offset) != 0 <--> Base > Offset
// (no overflow and not null)
@@ -1141,14 +1137,59 @@ static Value *foldUnsignedUnderflowCheck(ICmpInst *ZeroICmp,
return nullptr;
}
+/// Reduce logic-of-compares with equality to a constant by substituting a
+/// common operand with the constant. Callers are expected to call this with
+/// Cmp0/Cmp1 switched to handle logic op commutativity.
+static Value *foldAndOrOfICmpsWithConstEq(ICmpInst *Cmp0, ICmpInst *Cmp1,
+ BinaryOperator &Logic,
+ InstCombiner::BuilderTy &Builder,
+ const SimplifyQuery &Q) {
+ bool IsAnd = Logic.getOpcode() == Instruction::And;
+ assert((IsAnd || Logic.getOpcode() == Instruction::Or) && "Wrong logic op");
+
+ // Match an equality compare with a non-poison constant as Cmp0.
+ ICmpInst::Predicate Pred0;
+ Value *X;
+ Constant *C;
+ if (!match(Cmp0, m_ICmp(Pred0, m_Value(X), m_Constant(C))) ||
+ !isGuaranteedNotToBeUndefOrPoison(C))
+ return nullptr;
+ if ((IsAnd && Pred0 != ICmpInst::ICMP_EQ) ||
+ (!IsAnd && Pred0 != ICmpInst::ICMP_NE))
+ return nullptr;
+
+ // The other compare must include a common operand (X). Canonicalize the
+ // common operand as operand 1 (Pred1 is swapped if the common operand was
+ // operand 0).
+ Value *Y;
+ ICmpInst::Predicate Pred1;
+ if (!match(Cmp1, m_c_ICmp(Pred1, m_Value(Y), m_Deferred(X))))
+ return nullptr;
+
+ // Replace variable with constant value equivalence to remove a variable use:
+ // (X == C) && (Y Pred1 X) --> (X == C) && (Y Pred1 C)
+ // (X != C) || (Y Pred1 X) --> (X != C) || (Y Pred1 C)
+ // One can think of the 'or' substitution via the 'and' bool equivalent:
+ // A || B --> A || (!A && B)
+ Value *SubstituteCmp = SimplifyICmpInst(Pred1, Y, C, Q);
+ if (!SubstituteCmp) {
+ // If we need to create a new instruction, require that the old compare can
+ // be removed.
+ if (!Cmp1->hasOneUse())
+ return nullptr;
+ SubstituteCmp = Builder.CreateICmp(Pred1, Y, C);
+ }
+ return Builder.CreateBinOp(Logic.getOpcode(), Cmp0, SubstituteCmp);
+}
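A scalar sketch of the substitution this helper performs (purely illustrative; the constant 42 and the '<' predicate are arbitrary choices):

#include <cassert>

// (X == C) && (Y < X)  -->  (X == C) && (Y < C)
// The rewritten compare no longer uses X, which can enable further folds.
bool andOfICmpsWithConstEq(int X, int Y) {
  const int C = 42;
  bool Original = (X == C) && (Y < X);
  bool Substituted = (X == C) && (Y < C);
  assert(Original == Substituted); // whenever X == C holds, X can be replaced by C
  return Substituted;
}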
+
/// Fold (icmp)&(icmp) if possible.
Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
- Instruction &CxtI) {
- const SimplifyQuery Q = SQ.getWithInstruction(&CxtI);
+ BinaryOperator &And) {
+ const SimplifyQuery Q = SQ.getWithInstruction(&And);
// Fold (!iszero(A & K1) & !iszero(A & K2)) -> (A & (K1 | K2)) == (K1 | K2)
// if K1 and K2 are a one-bit mask.
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, true, CxtI))
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, And))
return V;
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
@@ -1171,6 +1212,11 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (Value *V = foldLogOpOfMaskedICmps(LHS, RHS, true, Builder))
return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, And, Builder, Q))
+ return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, And, Builder, Q))
+ return V;
+
// E.g. (icmp sge x, 0) & (icmp slt x, n) --> icmp ult x, n
if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/false))
return V;
@@ -1182,7 +1228,7 @@ Value *InstCombiner::foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS,
if (Value *V = foldAndOrOfEqualityCmpsWithConstants(LHS, RHS, true, Builder))
return V;
- if (Value *V = foldSignedTruncationCheck(LHS, RHS, CxtI, Builder))
+ if (Value *V = foldSignedTruncationCheck(LHS, RHS, And, Builder))
return V;
if (Value *V = foldIsPowerOf2(LHS, RHS, true /* JoinedByAnd */, Builder))
@@ -1658,7 +1704,7 @@ static bool canNarrowShiftAmt(Constant *C, unsigned BitWidth) {
if (C->getType()->isVectorTy()) {
// Check each element of a constant vector.
- unsigned NumElts = C->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(C->getType())->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = C->getAggregateElement(i);
if (!Elt)
@@ -1802,7 +1848,17 @@ Instruction *InstCombiner::visitAnd(BinaryOperator &I) {
return BinaryOperator::Create(BinOp, NewLHS, Y);
}
}
-
+ const APInt *ShiftC;
+ if (match(Op0, m_OneUse(m_SExt(m_AShr(m_Value(X), m_APInt(ShiftC)))))) {
+ unsigned Width = I.getType()->getScalarSizeInBits();
+ if (*C == APInt::getLowBitsSet(Width, Width - ShiftC->getZExtValue())) {
+ // We are clearing high bits that were potentially set by sext+ashr:
+ // and (sext (ashr X, ShiftC)), C --> lshr (sext X), ShiftC
+ Value *Sext = Builder.CreateSExt(X, I.getType());
+ Constant *ShAmtC = ConstantInt::get(I.getType(), ShiftC->zext(Width));
+ return BinaryOperator::CreateLShr(Sext, ShAmtC);
+ }
+ }
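A scalar model of the sext+ashr masking fold above, assuming a hypothetical i8 source sign-extended to i32 with ShiftC = 3, so C is the low 29-bit mask (the sketch also assumes the usual arithmetic right shift for negative signed values):

#include <cassert>
#include <cstdint>

uint32_t sextAshrMask(int8_t X) {
  const unsigned ShiftC = 3;
  const uint32_t C = (1u << (32 - ShiftC)) - 1;    // low 29 bits set
  int8_t Ashr = (int8_t)(X >> ShiftC);             // ashr i8 X, 3
  uint32_t Before = (uint32_t)(int32_t)Ashr & C;   // and (sext (ashr X, 3)), C
  uint32_t After = (uint32_t)(int32_t)X >> ShiftC; // lshr (sext X), 3
  assert(Before == After);
  return After;
}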
}
if (ConstantInt *AndRHS = dyn_cast<ConstantInt>(Op1)) {
@@ -2020,7 +2076,7 @@ Instruction *InstCombiner::matchBSwap(BinaryOperator &Or) {
LastInst->removeFromParent();
for (auto *Inst : Insts)
- Worklist.Add(Inst);
+ Worklist.push(Inst);
return LastInst;
}
@@ -2086,9 +2142,62 @@ static Instruction *matchRotate(Instruction &Or) {
return IntrinsicInst::Create(F, { ShVal, ShVal, ShAmt });
}
+/// Attempt to combine or(zext(x),shl(zext(y),bw/2)) concat packing patterns.
+static Instruction *matchOrConcat(Instruction &Or,
+ InstCombiner::BuilderTy &Builder) {
+ assert(Or.getOpcode() == Instruction::Or && "bswap requires an 'or'");
+ Value *Op0 = Or.getOperand(0), *Op1 = Or.getOperand(1);
+ Type *Ty = Or.getType();
+
+ unsigned Width = Ty->getScalarSizeInBits();
+ if ((Width & 1) != 0)
+ return nullptr;
+ unsigned HalfWidth = Width / 2;
+
+ // Canonicalize zext (lower half) to LHS.
+ if (!isa<ZExtInst>(Op0))
+ std::swap(Op0, Op1);
+
+ // Find lower/upper half.
+ Value *LowerSrc, *ShlVal, *UpperSrc;
+ const APInt *C;
+ if (!match(Op0, m_OneUse(m_ZExt(m_Value(LowerSrc)))) ||
+ !match(Op1, m_OneUse(m_Shl(m_Value(ShlVal), m_APInt(C)))) ||
+ !match(ShlVal, m_OneUse(m_ZExt(m_Value(UpperSrc)))))
+ return nullptr;
+ if (*C != HalfWidth || LowerSrc->getType() != UpperSrc->getType() ||
+ LowerSrc->getType()->getScalarSizeInBits() != HalfWidth)
+ return nullptr;
+
+ auto ConcatIntrinsicCalls = [&](Intrinsic::ID id, Value *Lo, Value *Hi) {
+ Value *NewLower = Builder.CreateZExt(Lo, Ty);
+ Value *NewUpper = Builder.CreateZExt(Hi, Ty);
+ NewUpper = Builder.CreateShl(NewUpper, HalfWidth);
+ Value *BinOp = Builder.CreateOr(NewLower, NewUpper);
+ Function *F = Intrinsic::getDeclaration(Or.getModule(), id, Ty);
+ return Builder.CreateCall(F, BinOp);
+ };
+
+ // BSWAP: Push the concat down, swapping the lower/upper sources.
+ // concat(bswap(x),bswap(y)) -> bswap(concat(x,y))
+ Value *LowerBSwap, *UpperBSwap;
+ if (match(LowerSrc, m_BSwap(m_Value(LowerBSwap))) &&
+ match(UpperSrc, m_BSwap(m_Value(UpperBSwap))))
+ return ConcatIntrinsicCalls(Intrinsic::bswap, UpperBSwap, LowerBSwap);
+
+ // BITREVERSE: Push the concat down, swapping the lower/upper sources.
+ // concat(bitreverse(x),bitreverse(y)) -> bitreverse(concat(x,y))
+ Value *LowerBRev, *UpperBRev;
+ if (match(LowerSrc, m_BitReverse(m_Value(LowerBRev))) &&
+ match(UpperSrc, m_BitReverse(m_Value(UpperBRev))))
+ return ConcatIntrinsicCalls(Intrinsic::bitreverse, UpperBRev, LowerBRev);
+
+ return nullptr;
+}
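The bswap/bitreverse cases rely on the identity that swapping two halves and then concatenating them is the same as concatenating the halves in the opposite order and swapping the whole. A hand-rolled 16/32-bit sketch (helper names are hypothetical, not from the patch):

#include <cassert>
#include <cstdint>

static uint16_t bswap16(uint16_t V) { return (uint16_t)((V << 8) | (V >> 8)); }
static uint32_t bswap32(uint32_t V) {
  return (V << 24) | ((V & 0xFF00u) << 8) | ((V >> 8) & 0xFF00u) | (V >> 24);
}

// or(zext(bswap(X)), zext(bswap(Y)) << 16) == bswap(or(zext(Y), zext(X) << 16))
uint32_t concatOfBswaps(uint16_t X, uint16_t Y) {
  uint32_t Concat = (uint32_t)bswap16(X) | ((uint32_t)bswap16(Y) << 16);
  uint32_t Pushed = bswap32((uint32_t)Y | ((uint32_t)X << 16)); // halves swapped
  assert(Concat == Pushed);
  return Pushed;
}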
+
/// If all elements of two constant vectors are 0/-1 and inverses, return true.
static bool areInverseVectorBitmasks(Constant *C1, Constant *C2) {
- unsigned NumElts = C1->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(C1->getType())->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *EltC1 = C1->getAggregateElement(i);
Constant *EltC2 = C2->getAggregateElement(i);
@@ -2185,12 +2294,12 @@ Value *InstCombiner::matchSelectFromAndOr(Value *A, Value *C, Value *B,
/// Fold (icmp)|(icmp) if possible.
Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
- Instruction &CxtI) {
- const SimplifyQuery Q = SQ.getWithInstruction(&CxtI);
+ BinaryOperator &Or) {
+ const SimplifyQuery Q = SQ.getWithInstruction(&Or);
// Fold (iszero(A & K1) | iszero(A & K2)) -> (A & (K1 | K2)) != (K1 | K2)
// if K1 and K2 are a one-bit mask.
- if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, false, CxtI))
+ if (Value *V = foldAndOrOfICmpsOfAndWithPow2(LHS, RHS, Or))
return V;
ICmpInst::Predicate PredL = LHS->getPredicate(), PredR = RHS->getPredicate();
@@ -2299,6 +2408,11 @@ Value *InstCombiner::foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Builder.CreateAdd(B, ConstantInt::getSigned(B->getType(), -1)), A);
}
+ if (Value *V = foldAndOrOfICmpsWithConstEq(LHS, RHS, Or, Builder, Q))
+ return V;
+ if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, Or, Builder, Q))
+ return V;
+
// E.g. (icmp slt x, 0) | (icmp sgt x, n) --> icmp ugt x, n
if (Value *V = simplifyRangeCheck(LHS, RHS, /*Inverted=*/true))
return V;
@@ -2481,6 +2595,9 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
if (Instruction *Rotate = matchRotate(I))
return Rotate;
+ if (Instruction *Concat = matchOrConcat(I, Builder))
+ return replaceInstUsesWith(I, Concat);
+
Value *X, *Y;
const APInt *CV;
if (match(&I, m_c_Or(m_OneUse(m_Xor(m_Value(X), m_APInt(CV))), m_Value(Y))) &&
@@ -2729,6 +2846,32 @@ Instruction *InstCombiner::visitOr(BinaryOperator &I) {
canonicalizeCondSignextOfHighBitExtractToSignextHighBitExtract(I))
return V;
+ CmpInst::Predicate Pred;
+ Value *Mul, *Ov, *MulIsNotZero, *UMulWithOv;
+ // Check if the OR weakens the overflow condition for umul.with.overflow by
+ // treating any non-zero result as overflow. In that case, the combined
+ // condition holds iff both umul.with.overflow operands are != 0: when both
+ // are non-zero, the result can only be 0 if the multiplication overflowed,
+ // and in that case the overflow bit is set anyway.
+ if (match(&I,
+ m_c_Or(m_CombineAnd(m_ExtractValue<1>(m_Value(UMulWithOv)),
+ m_Value(Ov)),
+ m_CombineAnd(m_ICmp(Pred,
+ m_CombineAnd(m_ExtractValue<0>(
+ m_Deferred(UMulWithOv)),
+ m_Value(Mul)),
+ m_ZeroInt()),
+ m_Value(MulIsNotZero)))) &&
+ (Ov->hasOneUse() || (MulIsNotZero->hasOneUse() && Mul->hasOneUse())) &&
+ Pred == CmpInst::ICMP_NE) {
+ Value *A, *B;
+ if (match(UMulWithOv, m_Intrinsic<Intrinsic::umul_with_overflow>(
+ m_Value(A), m_Value(B)))) {
+ Value *NotNullA = Builder.CreateIsNotNull(A);
+ Value *NotNullB = Builder.CreateIsNotNull(B);
+ return BinaryOperator::CreateAnd(NotNullA, NotNullB);
+ }
+ }
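A scalar model of the or(overflow-bit, result != 0) fold, using 64-bit arithmetic to stand in for i32 umul.with.overflow (illustrative only; names are hypothetical):

#include <cassert>
#include <cstdint>

bool orOfOverflowAndNonZero(uint32_t A, uint32_t B) {
  uint64_t Wide = (uint64_t)A * B;
  bool Ov = Wide > 0xFFFFFFFFull; // the i1 overflow bit
  uint32_t Mul = (uint32_t)Wide;  // the i32 multiplication result
  bool Original = Ov | (Mul != 0);
  bool Folded = (A != 0) & (B != 0); // the replacement built above
  assert(Original == Folded);
  return Folded;
}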
+
return nullptr;
}
@@ -2748,33 +2891,24 @@ static Instruction *foldXorToXor(BinaryOperator &I,
// (A | B) ^ (A & B) -> A ^ B
// (A | B) ^ (B & A) -> A ^ B
if (match(&I, m_c_Xor(m_And(m_Value(A), m_Value(B)),
- m_c_Or(m_Deferred(A), m_Deferred(B))))) {
- I.setOperand(0, A);
- I.setOperand(1, B);
- return &I;
- }
+ m_c_Or(m_Deferred(A), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
// (A | ~B) ^ (~A | B) -> A ^ B
// (~B | A) ^ (~A | B) -> A ^ B
// (~A | B) ^ (A | ~B) -> A ^ B
// (B | ~A) ^ (A | ~B) -> A ^ B
if (match(&I, m_Xor(m_c_Or(m_Value(A), m_Not(m_Value(B))),
- m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B))))) {
- I.setOperand(0, A);
- I.setOperand(1, B);
- return &I;
- }
+ m_c_Or(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
// (A & ~B) ^ (~A & B) -> A ^ B
// (~B & A) ^ (~A & B) -> A ^ B
// (~A & B) ^ (A & ~B) -> A ^ B
// (B & ~A) ^ (A & ~B) -> A ^ B
if (match(&I, m_Xor(m_c_And(m_Value(A), m_Not(m_Value(B))),
- m_c_And(m_Not(m_Deferred(A)), m_Deferred(B))))) {
- I.setOperand(0, A);
- I.setOperand(1, B);
- return &I;
- }
+ m_c_And(m_Not(m_Deferred(A)), m_Deferred(B)))))
+ return BinaryOperator::CreateXor(A, B);
// For the remaining cases we need to get rid of one of the operands.
if (!Op0->hasOneUse() && !Op1->hasOneUse())
@@ -2878,6 +3012,7 @@ Value *InstCombiner::foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS,
Builder.SetInsertPoint(Y->getParent(), ++(Y->getIterator()));
Value *NotY = Builder.CreateNot(Y, Y->getName() + ".not");
// Replace all uses of Y (excluding the one in NotY!) with NotY.
+ Worklist.pushUsersToWorkList(*Y);
Y->replaceUsesWithIf(NotY,
[NotY](Use &U) { return U.getUser() != NotY; });
}
@@ -2924,6 +3059,9 @@ static Instruction *visitMaskedMerge(BinaryOperator &I,
Constant *C;
if (D->hasOneUse() && match(M, m_Constant(C))) {
+ // Propagating undef is unsafe. Clamp undef elements to -1.
+ Type *EltTy = C->getType()->getScalarType();
+ C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
// Unfold.
Value *LHS = Builder.CreateAnd(X, C);
Value *NotC = Builder.CreateNot(C);
@@ -3058,13 +3196,23 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
// ~(C >>s Y) --> ~C >>u Y (when inverting the replicated sign bits)
Constant *C;
if (match(NotVal, m_AShr(m_Constant(C), m_Value(Y))) &&
- match(C, m_Negative()))
+ match(C, m_Negative())) {
+ // We matched a negative constant, so propagating undef is unsafe.
+ // Clamp undef elements to -1.
+ Type *EltTy = C->getType()->getScalarType();
+ C = Constant::replaceUndefsWith(C, ConstantInt::getAllOnesValue(EltTy));
return BinaryOperator::CreateLShr(ConstantExpr::getNot(C), Y);
+ }
// ~(C >>u Y) --> ~C >>s Y (when inverting the replicated sign bits)
if (match(NotVal, m_LShr(m_Constant(C), m_Value(Y))) &&
- match(C, m_NonNegative()))
+ match(C, m_NonNegative())) {
+ // We matched a non-negative constant, so propagating undef is unsafe.
+ // Clamp undef elements to 0.
+ Type *EltTy = C->getType()->getScalarType();
+ C = Constant::replaceUndefsWith(C, ConstantInt::getNullValue(EltTy));
return BinaryOperator::CreateAShr(ConstantExpr::getNot(C), Y);
+ }
// ~(X + C) --> -(C + 1) - X
if (match(Op0, m_Add(m_Value(X), m_Constant(C))))
@@ -3114,10 +3262,7 @@ Instruction *InstCombiner::visitXor(BinaryOperator &I) {
if (match(Op0, m_Or(m_Value(X), m_APInt(C))) &&
MaskedValueIsZero(X, *C, 0, &I)) {
Constant *NewC = ConstantInt::get(I.getType(), *C ^ *RHSC);
- Worklist.Add(cast<Instruction>(Op0));
- I.setOperand(0, X);
- I.setOperand(1, NewC);
- return &I;
+ return BinaryOperator::CreateXor(X, NewC);
}
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
index 825f4b468b0a..ba1cf982229d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAtomicRMW.cpp
@@ -124,7 +124,7 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
auto *SI = new StoreInst(RMWI.getValOperand(),
RMWI.getPointerOperand(), &RMWI);
SI->setAtomic(Ordering, RMWI.getSyncScopeID());
- SI->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType())));
+ SI->setAlignment(DL.getABITypeAlign(RMWI.getType()));
return eraseInstFromFunction(RMWI);
}
@@ -138,13 +138,11 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
if (RMWI.getType()->isIntegerTy() &&
RMWI.getOperation() != AtomicRMWInst::Or) {
RMWI.setOperation(AtomicRMWInst::Or);
- RMWI.setOperand(1, ConstantInt::get(RMWI.getType(), 0));
- return &RMWI;
+ return replaceOperand(RMWI, 1, ConstantInt::get(RMWI.getType(), 0));
} else if (RMWI.getType()->isFloatingPointTy() &&
RMWI.getOperation() != AtomicRMWInst::FAdd) {
RMWI.setOperation(AtomicRMWInst::FAdd);
- RMWI.setOperand(1, ConstantFP::getNegativeZero(RMWI.getType()));
- return &RMWI;
+ return replaceOperand(RMWI, 1, ConstantFP::getNegativeZero(RMWI.getType()));
}
// Check if the required ordering is compatible with an atomic load.
@@ -152,8 +150,8 @@ Instruction *InstCombiner::visitAtomicRMWInst(AtomicRMWInst &RMWI) {
Ordering != AtomicOrdering::Monotonic)
return nullptr;
- LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand());
- Load->setAtomic(Ordering, RMWI.getSyncScopeID());
- Load->setAlignment(MaybeAlign(DL.getABITypeAlignment(RMWI.getType())));
+ LoadInst *Load = new LoadInst(RMWI.getType(), RMWI.getPointerOperand(), "",
+ false, DL.getABITypeAlign(RMWI.getType()),
+ Ordering, RMWI.getSyncScopeID());
return Load;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index f463c5fa1138..c734c9a68fb2 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -15,12 +15,15 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/APSInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/FloatingPointMode.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/Twine.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/Loads.h"
@@ -40,12 +43,13 @@
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsX86.h"
-#include "llvm/IR/IntrinsicsARM.h"
#include "llvm/IR/IntrinsicsAArch64.h"
-#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsAMDGPU.h"
+#include "llvm/IR/IntrinsicsARM.h"
+#include "llvm/IR/IntrinsicsHexagon.h"
+#include "llvm/IR/IntrinsicsNVPTX.h"
#include "llvm/IR/IntrinsicsPowerPC.h"
+#include "llvm/IR/IntrinsicsX86.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PatternMatch.h"
@@ -114,16 +118,16 @@ static Constant *getNegativeIsTrueBoolVec(ConstantDataVector *V) {
}
Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
- unsigned DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
- unsigned CopyDstAlign = MI->getDestAlignment();
- if (CopyDstAlign < DstAlign){
+ Align DstAlign = getKnownAlignment(MI->getRawDest(), DL, MI, &AC, &DT);
+ MaybeAlign CopyDstAlign = MI->getDestAlign();
+ if (!CopyDstAlign || *CopyDstAlign < DstAlign) {
MI->setDestAlignment(DstAlign);
return MI;
}
- unsigned SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
- unsigned CopySrcAlign = MI->getSourceAlignment();
- if (CopySrcAlign < SrcAlign) {
+ Align SrcAlign = getKnownAlignment(MI->getRawSource(), DL, MI, &AC, &DT);
+ MaybeAlign CopySrcAlign = MI->getSourceAlign();
+ if (!CopySrcAlign || *CopySrcAlign < SrcAlign) {
MI->setSourceAlignment(SrcAlign);
return MI;
}
@@ -157,7 +161,7 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
// into libcall in CodeGen. This is not evident performance gain so disable
// it now.
if (isa<AtomicMemTransferInst>(MI))
- if (CopyDstAlign < Size || CopySrcAlign < Size)
+ if (*CopyDstAlign < Size || *CopySrcAlign < Size)
return nullptr;
// Use an integer load+store unless we can find something better.
@@ -191,8 +195,7 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
Value *Dest = Builder.CreateBitCast(MI->getArgOperand(0), NewDstPtrTy);
LoadInst *L = Builder.CreateLoad(IntType, Src);
// Alignment from the mem intrinsic will be better, so use it.
- L->setAlignment(
- MaybeAlign(CopySrcAlign)); // FIXME: Check if we can use Align instead.
+ L->setAlignment(*CopySrcAlign);
if (CopyMD)
L->setMetadata(LLVMContext::MD_tbaa, CopyMD);
MDNode *LoopMemParallelMD =
@@ -205,8 +208,7 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
StoreInst *S = Builder.CreateStore(L, Dest);
// Alignment from the mem intrinsic will be better, so use it.
- S->setAlignment(
- MaybeAlign(CopyDstAlign)); // FIXME: Check if we can use Align instead.
+ S->setAlignment(*CopyDstAlign);
if (CopyMD)
S->setMetadata(LLVMContext::MD_tbaa, CopyMD);
if (LoopMemParallelMD)
@@ -231,9 +233,10 @@ Instruction *InstCombiner::SimplifyAnyMemTransfer(AnyMemTransferInst *MI) {
}
Instruction *InstCombiner::SimplifyAnyMemSet(AnyMemSetInst *MI) {
- const unsigned KnownAlignment =
+ const Align KnownAlignment =
getKnownAlignment(MI->getDest(), DL, MI, &AC, &DT);
- if (MI->getDestAlignment() < KnownAlignment) {
+ MaybeAlign MemSetAlign = MI->getDestAlign();
+ if (!MemSetAlign || *MemSetAlign < KnownAlignment) {
MI->setDestAlignment(KnownAlignment);
return MI;
}
@@ -293,106 +296,154 @@ static Value *simplifyX86immShift(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
bool LogicalShift = false;
bool ShiftLeft = false;
+ bool IsImm = false;
switch (II.getIntrinsicID()) {
default: llvm_unreachable("Unexpected intrinsic!");
- case Intrinsic::x86_sse2_psra_d:
- case Intrinsic::x86_sse2_psra_w:
case Intrinsic::x86_sse2_psrai_d:
case Intrinsic::x86_sse2_psrai_w:
- case Intrinsic::x86_avx2_psra_d:
- case Intrinsic::x86_avx2_psra_w:
case Intrinsic::x86_avx2_psrai_d:
case Intrinsic::x86_avx2_psrai_w:
- case Intrinsic::x86_avx512_psra_q_128:
case Intrinsic::x86_avx512_psrai_q_128:
- case Intrinsic::x86_avx512_psra_q_256:
case Intrinsic::x86_avx512_psrai_q_256:
- case Intrinsic::x86_avx512_psra_d_512:
- case Intrinsic::x86_avx512_psra_q_512:
- case Intrinsic::x86_avx512_psra_w_512:
case Intrinsic::x86_avx512_psrai_d_512:
case Intrinsic::x86_avx512_psrai_q_512:
case Intrinsic::x86_avx512_psrai_w_512:
- LogicalShift = false; ShiftLeft = false;
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psra_d:
+ case Intrinsic::x86_sse2_psra_w:
+ case Intrinsic::x86_avx2_psra_d:
+ case Intrinsic::x86_avx2_psra_w:
+ case Intrinsic::x86_avx512_psra_q_128:
+ case Intrinsic::x86_avx512_psra_q_256:
+ case Intrinsic::x86_avx512_psra_d_512:
+ case Intrinsic::x86_avx512_psra_q_512:
+ case Intrinsic::x86_avx512_psra_w_512:
+ LogicalShift = false;
+ ShiftLeft = false;
break;
- case Intrinsic::x86_sse2_psrl_d:
- case Intrinsic::x86_sse2_psrl_q:
- case Intrinsic::x86_sse2_psrl_w:
case Intrinsic::x86_sse2_psrli_d:
case Intrinsic::x86_sse2_psrli_q:
case Intrinsic::x86_sse2_psrli_w:
- case Intrinsic::x86_avx2_psrl_d:
- case Intrinsic::x86_avx2_psrl_q:
- case Intrinsic::x86_avx2_psrl_w:
case Intrinsic::x86_avx2_psrli_d:
case Intrinsic::x86_avx2_psrli_q:
case Intrinsic::x86_avx2_psrli_w:
- case Intrinsic::x86_avx512_psrl_d_512:
- case Intrinsic::x86_avx512_psrl_q_512:
- case Intrinsic::x86_avx512_psrl_w_512:
case Intrinsic::x86_avx512_psrli_d_512:
case Intrinsic::x86_avx512_psrli_q_512:
case Intrinsic::x86_avx512_psrli_w_512:
- LogicalShift = true; ShiftLeft = false;
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psrl_d:
+ case Intrinsic::x86_sse2_psrl_q:
+ case Intrinsic::x86_sse2_psrl_w:
+ case Intrinsic::x86_avx2_psrl_d:
+ case Intrinsic::x86_avx2_psrl_q:
+ case Intrinsic::x86_avx2_psrl_w:
+ case Intrinsic::x86_avx512_psrl_d_512:
+ case Intrinsic::x86_avx512_psrl_q_512:
+ case Intrinsic::x86_avx512_psrl_w_512:
+ LogicalShift = true;
+ ShiftLeft = false;
break;
- case Intrinsic::x86_sse2_psll_d:
- case Intrinsic::x86_sse2_psll_q:
- case Intrinsic::x86_sse2_psll_w:
case Intrinsic::x86_sse2_pslli_d:
case Intrinsic::x86_sse2_pslli_q:
case Intrinsic::x86_sse2_pslli_w:
- case Intrinsic::x86_avx2_psll_d:
- case Intrinsic::x86_avx2_psll_q:
- case Intrinsic::x86_avx2_psll_w:
case Intrinsic::x86_avx2_pslli_d:
case Intrinsic::x86_avx2_pslli_q:
case Intrinsic::x86_avx2_pslli_w:
- case Intrinsic::x86_avx512_psll_d_512:
- case Intrinsic::x86_avx512_psll_q_512:
- case Intrinsic::x86_avx512_psll_w_512:
case Intrinsic::x86_avx512_pslli_d_512:
case Intrinsic::x86_avx512_pslli_q_512:
case Intrinsic::x86_avx512_pslli_w_512:
- LogicalShift = true; ShiftLeft = true;
+ IsImm = true;
+ LLVM_FALLTHROUGH;
+ case Intrinsic::x86_sse2_psll_d:
+ case Intrinsic::x86_sse2_psll_q:
+ case Intrinsic::x86_sse2_psll_w:
+ case Intrinsic::x86_avx2_psll_d:
+ case Intrinsic::x86_avx2_psll_q:
+ case Intrinsic::x86_avx2_psll_w:
+ case Intrinsic::x86_avx512_psll_d_512:
+ case Intrinsic::x86_avx512_psll_q_512:
+ case Intrinsic::x86_avx512_psll_w_512:
+ LogicalShift = true;
+ ShiftLeft = true;
break;
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
- // Simplify if count is constant.
- auto Arg1 = II.getArgOperand(1);
- auto CAZ = dyn_cast<ConstantAggregateZero>(Arg1);
- auto CDV = dyn_cast<ConstantDataVector>(Arg1);
- auto CInt = dyn_cast<ConstantInt>(Arg1);
- if (!CAZ && !CDV && !CInt)
- return nullptr;
-
- APInt Count(64, 0);
- if (CDV) {
- // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
- // operand to compute the shift amount.
- auto VT = cast<VectorType>(CDV->getType());
- unsigned BitWidth = VT->getElementType()->getPrimitiveSizeInBits();
- assert((64 % BitWidth) == 0 && "Unexpected packed shift size");
- unsigned NumSubElts = 64 / BitWidth;
-
- // Concatenate the sub-elements to create the 64-bit value.
- for (unsigned i = 0; i != NumSubElts; ++i) {
- unsigned SubEltIdx = (NumSubElts - 1) - i;
- auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
- Count <<= BitWidth;
- Count |= SubElt->getValue().zextOrTrunc(64);
- }
- }
- else if (CInt)
- Count = CInt->getValue();
-
auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
auto VT = cast<VectorType>(Vec->getType());
auto SVT = VT->getElementType();
+ auto AmtVT = Amt->getType();
unsigned VWidth = VT->getNumElements();
unsigned BitWidth = SVT->getPrimitiveSizeInBits();
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift. If it's guaranteed to be out of range, logical shifts combine to
+ // zero and arithmetic shifts are clamped to (BitWidth - 1).
+ if (IsImm) {
+ assert(AmtVT->isIntegerTy(32) &&
+ "Unexpected shift-by-immediate type");
+ KnownBits KnownAmtBits =
+ llvm::computeKnownBits(Amt, II.getModule()->getDataLayout());
+ if (KnownAmtBits.getMaxValue().ult(BitWidth)) {
+ Amt = Builder.CreateZExtOrTrunc(Amt, SVT);
+ Amt = Builder.CreateVectorSplat(VWidth, Amt);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ if (KnownAmtBits.getMinValue().uge(BitWidth)) {
+ if (LogicalShift)
+ return ConstantAggregateZero::get(VT);
+ Amt = ConstantInt::get(SVT, BitWidth - 1);
+ return Builder.CreateAShr(Vec, Builder.CreateVectorSplat(VWidth, Amt));
+ }
+ } else {
+ // Ensure the first element has an in-range value and the rest of the
+ // elements in the bottom 64 bits are zero.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+ unsigned NumAmtElts = cast<VectorType>(AmtVT)->getNumElements();
+ APInt DemandedLower = APInt::getOneBitSet(NumAmtElts, 0);
+ APInt DemandedUpper = APInt::getBitsSet(NumAmtElts, 1, NumAmtElts / 2);
+ KnownBits KnownLowerBits = llvm::computeKnownBits(
+ Amt, DemandedLower, II.getModule()->getDataLayout());
+ KnownBits KnownUpperBits = llvm::computeKnownBits(
+ Amt, DemandedUpper, II.getModule()->getDataLayout());
+ if (KnownLowerBits.getMaxValue().ult(BitWidth) &&
+ (DemandedUpper.isNullValue() || KnownUpperBits.isZero())) {
+ SmallVector<int, 16> ZeroSplat(VWidth, 0);
+ Amt = Builder.CreateShuffleVector(Amt, Amt, ZeroSplat);
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+ }
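For a concrete picture of what the new in-range path produces (values here are hypothetical, not taken from the patch or its tests):

// A shift-by-immediate whose amount is provably less than the element width,
// for example
//   %r = call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %x, i32 3)
// is now rewritten to the generic IR
//   %r = ashr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
// which the rest of InstCombine and the backends can reason about directly.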
+
+ // Simplify if count is constant vector.
+ auto CDV = dyn_cast<ConstantDataVector>(Amt);
+ if (!CDV)
+ return nullptr;
+
+ // SSE2/AVX2 uses all the first 64-bits of the 128-bit vector
+ // operand to compute the shift amount.
+ assert(AmtVT->isVectorTy() && AmtVT->getPrimitiveSizeInBits() == 128 &&
+ cast<VectorType>(AmtVT)->getElementType() == SVT &&
+ "Unexpected shift-by-scalar type");
+
+ // Concatenate the sub-elements to create the 64-bit value.
+ APInt Count(64, 0);
+ for (unsigned i = 0, NumSubElts = 64 / BitWidth; i != NumSubElts; ++i) {
+ unsigned SubEltIdx = (NumSubElts - 1) - i;
+ auto SubElt = cast<ConstantInt>(CDV->getElementAsConstant(SubEltIdx));
+ Count <<= BitWidth;
+ Count |= SubElt->getValue().zextOrTrunc(64);
+ }
+
// If shift-by-zero then just return the original value.
if (Count.isNullValue())
return Vec;
@@ -469,17 +520,29 @@ static Value *simplifyX86varShift(const IntrinsicInst &II,
}
assert((LogicalShift || !ShiftLeft) && "Only logical shifts can shift left");
- // Simplify if all shift amounts are constant/undef.
- auto *CShift = dyn_cast<Constant>(II.getArgOperand(1));
- if (!CShift)
- return nullptr;
-
auto Vec = II.getArgOperand(0);
+ auto Amt = II.getArgOperand(1);
auto VT = cast<VectorType>(II.getType());
- auto SVT = VT->getVectorElementType();
+ auto SVT = VT->getElementType();
int NumElts = VT->getNumElements();
int BitWidth = SVT->getIntegerBitWidth();
+ // If the shift amount is guaranteed to be in-range we can replace it with a
+ // generic shift.
+ APInt UpperBits =
+ APInt::getHighBitsSet(BitWidth, BitWidth - Log2_32(BitWidth));
+ if (llvm::MaskedValueIsZero(Amt, UpperBits,
+ II.getModule()->getDataLayout())) {
+ return (LogicalShift ? (ShiftLeft ? Builder.CreateShl(Vec, Amt)
+ : Builder.CreateLShr(Vec, Amt))
+ : Builder.CreateAShr(Vec, Amt));
+ }
+
+ // Simplify if all shift amounts are constant/undef.
+ auto *CShift = dyn_cast<Constant>(Amt);
+ if (!CShift)
+ return nullptr;
+
// Collect each element's shift amount.
// We also collect special cases: UNDEF = -1, OUT-OF-RANGE = BitWidth.
bool AnyOutOfRange = false;
@@ -557,10 +620,10 @@ static Value *simplifyX86pack(IntrinsicInst &II,
if (isa<UndefValue>(Arg0) && isa<UndefValue>(Arg1))
return UndefValue::get(ResTy);
- Type *ArgTy = Arg0->getType();
+ auto *ArgTy = cast<VectorType>(Arg0->getType());
unsigned NumLanes = ResTy->getPrimitiveSizeInBits() / 128;
- unsigned NumSrcElts = ArgTy->getVectorNumElements();
- assert(ResTy->getVectorNumElements() == (2 * NumSrcElts) &&
+ unsigned NumSrcElts = ArgTy->getNumElements();
+ assert(cast<VectorType>(ResTy)->getNumElements() == (2 * NumSrcElts) &&
"Unexpected packing types");
unsigned NumSrcEltsPerLane = NumSrcElts / NumLanes;
@@ -600,7 +663,7 @@ static Value *simplifyX86pack(IntrinsicInst &II,
Arg1 = Builder.CreateSelect(Builder.CreateICmpSGT(Arg1, MaxC), MaxC, Arg1);
// Shuffle clamped args together at the lane level.
- SmallVector<unsigned, 32> PackMask;
+ SmallVector<int, 32> PackMask;
for (unsigned Lane = 0; Lane != NumLanes; ++Lane) {
for (unsigned Elt = 0; Elt != NumSrcEltsPerLane; ++Elt)
PackMask.push_back(Elt + (Lane * NumSrcEltsPerLane));
@@ -617,14 +680,14 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II,
InstCombiner::BuilderTy &Builder) {
Value *Arg = II.getArgOperand(0);
Type *ResTy = II.getType();
- Type *ArgTy = Arg->getType();
// movmsk(undef) -> zero as we must ensure the upper bits are zero.
if (isa<UndefValue>(Arg))
return Constant::getNullValue(ResTy);
+ auto *ArgTy = dyn_cast<VectorType>(Arg->getType());
// We can't easily peek through x86_mmx types.
- if (!ArgTy->isVectorTy())
+ if (!ArgTy)
return nullptr;
// Expand MOVMSK to compare/bitcast/zext:
@@ -632,8 +695,8 @@ static Value *simplifyX86movmsk(const IntrinsicInst &II,
// %cmp = icmp slt <16 x i8> %x, zeroinitializer
// %int = bitcast <16 x i1> %cmp to i16
// %res = zext i16 %int to i32
- unsigned NumElts = ArgTy->getVectorNumElements();
- Type *IntegerVecTy = VectorType::getInteger(cast<VectorType>(ArgTy));
+ unsigned NumElts = ArgTy->getNumElements();
+ Type *IntegerVecTy = VectorType::getInteger(ArgTy);
Type *IntegerTy = Builder.getIntNTy(NumElts);
Value *Res = Builder.CreateBitCast(Arg, IntegerVecTy);
@@ -697,7 +760,7 @@ static Value *simplifyX86insertps(const IntrinsicInst &II,
return ZeroVector;
// Initialize by passing all of the first source bits through.
- uint32_t ShuffleMask[4] = { 0, 1, 2, 3 };
+ int ShuffleMask[4] = {0, 1, 2, 3};
// We may replace the second operand with the zero vector.
Value *V1 = II.getArgOperand(1);
@@ -777,22 +840,19 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
Index /= 8;
Type *IntTy8 = Type::getInt8Ty(II.getContext());
- Type *IntTy32 = Type::getInt32Ty(II.getContext());
- VectorType *ShufTy = VectorType::get(IntTy8, 16);
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
- SmallVector<Constant *, 16> ShuffleMask;
+ SmallVector<int, 16> ShuffleMask;
for (int i = 0; i != (int)Length; ++i)
- ShuffleMask.push_back(
- Constant::getIntegerValue(IntTy32, APInt(32, i + Index)));
+ ShuffleMask.push_back(i + Index);
for (int i = Length; i != 8; ++i)
- ShuffleMask.push_back(
- Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+ ShuffleMask.push_back(i + 16);
for (int i = 8; i != 16; ++i)
- ShuffleMask.push_back(UndefValue::get(IntTy32));
+ ShuffleMask.push_back(-1);
Value *SV = Builder.CreateShuffleVector(
Builder.CreateBitCast(Op0, ShufTy),
- ConstantAggregateZero::get(ShufTy), ConstantVector::get(ShuffleMask));
+ ConstantAggregateZero::get(ShufTy), ShuffleMask);
return Builder.CreateBitCast(SV, II.getType());
}
@@ -857,23 +917,21 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
Index /= 8;
Type *IntTy8 = Type::getInt8Ty(II.getContext());
- Type *IntTy32 = Type::getInt32Ty(II.getContext());
- VectorType *ShufTy = VectorType::get(IntTy8, 16);
+ auto *ShufTy = FixedVectorType::get(IntTy8, 16);
- SmallVector<Constant *, 16> ShuffleMask;
+ SmallVector<int, 16> ShuffleMask;
for (int i = 0; i != (int)Index; ++i)
- ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
+ ShuffleMask.push_back(i);
for (int i = 0; i != (int)Length; ++i)
- ShuffleMask.push_back(
- Constant::getIntegerValue(IntTy32, APInt(32, i + 16)));
+ ShuffleMask.push_back(i + 16);
for (int i = Index + Length; i != 8; ++i)
- ShuffleMask.push_back(Constant::getIntegerValue(IntTy32, APInt(32, i)));
+ ShuffleMask.push_back(i);
for (int i = 8; i != 16; ++i)
- ShuffleMask.push_back(UndefValue::get(IntTy32));
+ ShuffleMask.push_back(-1);
Value *SV = Builder.CreateShuffleVector(Builder.CreateBitCast(Op0, ShufTy),
Builder.CreateBitCast(Op1, ShufTy),
- ConstantVector::get(ShuffleMask));
+ ShuffleMask);
return Builder.CreateBitCast(SV, II.getType());
}
@@ -925,13 +983,12 @@ static Value *simplifyX86pshufb(const IntrinsicInst &II,
return nullptr;
auto *VecTy = cast<VectorType>(II.getType());
- auto *MaskEltTy = Type::getInt32Ty(II.getContext());
unsigned NumElts = VecTy->getNumElements();
assert((NumElts == 16 || NumElts == 32 || NumElts == 64) &&
"Unexpected number of elements in shuffle mask!");
// Construct a shuffle mask from constant integers or UNDEFs.
- Constant *Indexes[64] = {nullptr};
+ int Indexes[64];
// Each byte in the shuffle control mask forms an index to permute the
// corresponding byte in the destination operand.
@@ -941,7 +998,7 @@ static Value *simplifyX86pshufb(const IntrinsicInst &II,
return nullptr;
if (isa<UndefValue>(COp)) {
- Indexes[I] = UndefValue::get(MaskEltTy);
+ Indexes[I] = -1;
continue;
}
@@ -955,13 +1012,12 @@ static Value *simplifyX86pshufb(const IntrinsicInst &II,
// The value of each index for the high 128-bit lane is the least
// significant 4 bits of the respective shuffle control byte.
Index = ((Index < 0) ? NumElts : Index & 0x0F) + (I & 0xF0);
- Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ Indexes[I] = Index;
}
- auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
auto V1 = II.getArgOperand(0);
auto V2 = Constant::getNullValue(VecTy);
- return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}
/// Attempt to convert vpermilvar* to shufflevector if the mask is constant.
@@ -972,14 +1028,13 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
return nullptr;
auto *VecTy = cast<VectorType>(II.getType());
- auto *MaskEltTy = Type::getInt32Ty(II.getContext());
- unsigned NumElts = VecTy->getVectorNumElements();
+ unsigned NumElts = VecTy->getNumElements();
bool IsPD = VecTy->getScalarType()->isDoubleTy();
unsigned NumLaneElts = IsPD ? 2 : 4;
assert(NumElts == 16 || NumElts == 8 || NumElts == 4 || NumElts == 2);
// Construct a shuffle mask from constant integers or UNDEFs.
- Constant *Indexes[16] = {nullptr};
+ int Indexes[16];
// The intrinsics only read one or two bits, clear the rest.
for (unsigned I = 0; I < NumElts; ++I) {
@@ -988,7 +1043,7 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
return nullptr;
if (isa<UndefValue>(COp)) {
- Indexes[I] = UndefValue::get(MaskEltTy);
+ Indexes[I] = -1;
continue;
}
@@ -1005,13 +1060,12 @@ static Value *simplifyX86vpermilvar(const IntrinsicInst &II,
// shuffle, we have to make that explicit.
Index += APInt(32, (I / NumLaneElts) * NumLaneElts);
- Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ Indexes[I] = Index.getZExtValue();
}
- auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, NumElts));
auto V1 = II.getArgOperand(0);
auto V2 = UndefValue::get(V1->getType());
- return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, NumElts));
}
/// Attempt to convert vpermd/vpermps to shufflevector if the mask is constant.
@@ -1022,13 +1076,12 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II,
return nullptr;
auto *VecTy = cast<VectorType>(II.getType());
- auto *MaskEltTy = Type::getInt32Ty(II.getContext());
unsigned Size = VecTy->getNumElements();
assert((Size == 4 || Size == 8 || Size == 16 || Size == 32 || Size == 64) &&
"Unexpected shuffle mask size");
// Construct a shuffle mask from constant integers or UNDEFs.
- Constant *Indexes[64] = {nullptr};
+ int Indexes[64];
for (unsigned I = 0; I < Size; ++I) {
Constant *COp = V->getAggregateElement(I);
@@ -1036,26 +1089,26 @@ static Value *simplifyX86vpermv(const IntrinsicInst &II,
return nullptr;
if (isa<UndefValue>(COp)) {
- Indexes[I] = UndefValue::get(MaskEltTy);
+ Indexes[I] = -1;
continue;
}
uint32_t Index = cast<ConstantInt>(COp)->getZExtValue();
Index &= Size - 1;
- Indexes[I] = ConstantInt::get(MaskEltTy, Index);
+ Indexes[I] = Index;
}
- auto ShuffleMask = ConstantVector::get(makeArrayRef(Indexes, Size));
auto V1 = II.getArgOperand(0);
auto V2 = UndefValue::get(VecTy);
- return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes, Size));
}
// TODO, Obvious Missing Transforms:
// * Narrow width by halfs excluding zero/undef lanes
Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
Value *LoadPtr = II.getArgOperand(0);
- unsigned Alignment = cast<ConstantInt>(II.getArgOperand(1))->getZExtValue();
+ const Align Alignment =
+ cast<ConstantInt>(II.getArgOperand(1))->getAlignValue();
// If the mask is all ones or undefs, this is a plain vector load of the 1st
// argument.
@@ -1065,9 +1118,9 @@ Value *InstCombiner::simplifyMaskedLoad(IntrinsicInst &II) {
// If we can unconditionally load from this address, replace with a
// load/select idiom. TODO: use DT for context sensitive query
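// For illustration: the unconditional load below feeds a
// 'select <mask>, <loaded value>, <passthru>', which is only legal because
// the whole vector is known dereferenceable and sufficiently aligned.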
- if (isDereferenceableAndAlignedPointer(
- LoadPtr, II.getType(), MaybeAlign(Alignment),
- II.getModule()->getDataLayout(), &II, nullptr)) {
+ if (isDereferenceableAndAlignedPointer(LoadPtr, II.getType(), Alignment,
+ II.getModule()->getDataLayout(), &II,
+ nullptr)) {
Value *LI = Builder.CreateAlignedLoad(II.getType(), LoadPtr, Alignment,
"unmaskedload");
return Builder.CreateSelect(II.getArgOperand(2), LI, II.getArgOperand(3));
@@ -1091,8 +1144,7 @@ Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
// If the mask is all ones, this is a plain vector store of the 1st argument.
if (ConstMask->isAllOnesValue()) {
Value *StorePtr = II.getArgOperand(1);
- MaybeAlign Alignment(
- cast<ConstantInt>(II.getArgOperand(2))->getZExtValue());
+ Align Alignment = cast<ConstantInt>(II.getArgOperand(2))->getAlignValue();
return new StoreInst(II.getArgOperand(0), StorePtr, false, Alignment);
}
@@ -1100,10 +1152,8 @@ Instruction *InstCombiner::simplifyMaskedStore(IntrinsicInst &II) {
APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
APInt UndefElts(DemandedElts.getBitWidth(), 0);
if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
- DemandedElts, UndefElts)) {
- II.setOperand(0, V);
- return &II;
- }
+ DemandedElts, UndefElts))
+ return replaceOperand(II, 0, V);
return nullptr;
}
@@ -1138,15 +1188,11 @@ Instruction *InstCombiner::simplifyMaskedScatter(IntrinsicInst &II) {
APInt DemandedElts = possiblyDemandedEltsInMask(ConstMask);
APInt UndefElts(DemandedElts.getBitWidth(), 0);
if (Value *V = SimplifyDemandedVectorElts(II.getOperand(0),
- DemandedElts, UndefElts)) {
- II.setOperand(0, V);
- return &II;
- }
+ DemandedElts, UndefElts))
+ return replaceOperand(II, 0, V);
if (Value *V = SimplifyDemandedVectorElts(II.getOperand(1),
- DemandedElts, UndefElts)) {
- II.setOperand(1, V);
- return &II;
- }
+ DemandedElts, UndefElts))
+ return replaceOperand(II, 1, V);
return nullptr;
}
@@ -1202,19 +1248,15 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
if (IsTZ) {
// cttz(-x) -> cttz(x)
- if (match(Op0, m_Neg(m_Value(X)))) {
- II.setOperand(0, X);
- return &II;
- }
+ if (match(Op0, m_Neg(m_Value(X))))
+ return IC.replaceOperand(II, 0, X);
// cttz(abs(x)) -> cttz(x)
// cttz(nabs(x)) -> cttz(x)
Value *Y;
SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
- if (SPF == SPF_ABS || SPF == SPF_NABS) {
- II.setOperand(0, X);
- return &II;
- }
+ if (SPF == SPF_ABS || SPF == SPF_NABS)
+ return IC.replaceOperand(II, 0, X);
}
KnownBits Known = IC.computeKnownBits(Op0, 0, &II);
@@ -1240,10 +1282,8 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
if (!Known.One.isNullValue() ||
isKnownNonZero(Op0, IC.getDataLayout(), 0, &IC.getAssumptionCache(), &II,
&IC.getDominatorTree())) {
- if (!match(II.getArgOperand(1), m_One())) {
- II.setOperand(1, IC.Builder.getTrue());
- return &II;
- }
+ if (!match(II.getArgOperand(1), m_One()))
+ return IC.replaceOperand(II, 1, IC.Builder.getTrue());
}
// Add range metadata since known bits can't completely reflect what we know.
@@ -1264,21 +1304,39 @@ static Instruction *foldCttzCtlz(IntrinsicInst &II, InstCombiner &IC) {
static Instruction *foldCtpop(IntrinsicInst &II, InstCombiner &IC) {
assert(II.getIntrinsicID() == Intrinsic::ctpop &&
"Expected ctpop intrinsic");
+ Type *Ty = II.getType();
+ unsigned BitWidth = Ty->getScalarSizeInBits();
Value *Op0 = II.getArgOperand(0);
Value *X;
+
// ctpop(bitreverse(x)) -> ctpop(x)
// ctpop(bswap(x)) -> ctpop(x)
- if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X)))) {
- II.setOperand(0, X);
- return &II;
+ if (match(Op0, m_BitReverse(m_Value(X))) || match(Op0, m_BSwap(m_Value(X))))
+ return IC.replaceOperand(II, 0, X);
+
+ // ctpop(x | -x) -> bitwidth - cttz(x, false)
+ if (Op0->hasOneUse() &&
+ match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
+ Function *F =
+ Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
+ auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
+ auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
+ return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
+ }
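+ // For illustration (picking i8 %x == 0b00010100): %x | -%x == 0b11111100,
+ // so ctpop yields 6 == 8 - cttz(%x, false) == 8 - 2.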
+
+ // ctpop(~x & (x - 1)) -> cttz(x, false)
+ if (match(Op0,
+ m_c_And(m_Not(m_Value(X)), m_Add(m_Deferred(X), m_AllOnes())))) {
+ Function *F =
+ Intrinsic::getDeclaration(II.getModule(), Intrinsic::cttz, Ty);
+ return CallInst::Create(F, {X, IC.Builder.getFalse()});
}
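+ // For illustration (picking i8 %x == 0b00010100): ~%x & (%x - 1) ==
+ // 0b00000011, whose population count is 2 == cttz(%x, false).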
// FIXME: Try to simplify vectors of integers.
- auto *IT = dyn_cast<IntegerType>(Op0->getType());
+ auto *IT = dyn_cast<IntegerType>(Ty);
if (!IT)
return nullptr;
- unsigned BitWidth = IT->getBitWidth();
KnownBits Known(BitWidth);
IC.computeKnownBits(Op0, Known, 0, &II);
@@ -1330,7 +1388,7 @@ static Instruction *simplifyX86MaskedLoad(IntrinsicInst &II, InstCombiner &IC) {
// The pass-through vector for an x86 masked load is a zero vector.
CallInst *NewMaskedLoad =
- IC.Builder.CreateMaskedLoad(PtrCast, 1, BoolMask, ZeroVec);
+ IC.Builder.CreateMaskedLoad(PtrCast, Align(1), BoolMask, ZeroVec);
return IC.replaceInstUsesWith(II, NewMaskedLoad);
}
@@ -1371,7 +1429,7 @@ static bool simplifyX86MaskedStore(IntrinsicInst &II, InstCombiner &IC) {
// on each element's most significant bit (the sign bit).
Constant *BoolMask = getNegativeIsTrueBoolVec(ConstMask);
- IC.Builder.CreateMaskedStore(Vec, PtrCast, 1, BoolMask);
+ IC.Builder.CreateMaskedStore(Vec, PtrCast, Align(1), BoolMask);
// 'Replace uses' doesn't work for stores. Erase the original masked store.
IC.eraseInstFromFunction(II);
@@ -1417,7 +1475,7 @@ static Value *simplifyNeonTbl1(const IntrinsicInst &II,
if (!VecTy->getElementType()->isIntegerTy(8) || NumElts != 8)
return nullptr;
- uint32_t Indexes[8];
+ int Indexes[8];
for (unsigned I = 0; I < NumElts; ++I) {
Constant *COp = C->getAggregateElement(I);
@@ -1428,15 +1486,13 @@ static Value *simplifyNeonTbl1(const IntrinsicInst &II,
Indexes[I] = cast<ConstantInt>(COp)->getLimitedValue();
// Make sure the mask indices are in range.
- if (Indexes[I] >= NumElts)
+ if ((unsigned)Indexes[I] >= NumElts)
return nullptr;
}
- auto *ShuffleMask = ConstantDataVector::get(II.getContext(),
- makeArrayRef(Indexes));
auto *V1 = II.getArgOperand(0);
auto *V2 = Constant::getNullValue(V1->getType());
- return Builder.CreateShuffleVector(V1, V2, ShuffleMask);
+ return Builder.CreateShuffleVector(V1, V2, makeArrayRef(Indexes));
}
/// Convert a vector load intrinsic into a simple llvm load instruction.
@@ -1458,7 +1514,7 @@ static Value *simplifyNeonVld1(const IntrinsicInst &II,
auto *BCastInst = Builder.CreateBitCast(II.getArgOperand(0),
PointerType::get(II.getType(), 0));
- return Builder.CreateAlignedLoad(II.getType(), BCastInst, Alignment);
+ return Builder.CreateAlignedLoad(II.getType(), BCastInst, Align(Alignment));
}
// Returns true iff the 2 intrinsics have the same operands, limiting the
@@ -1478,24 +1534,30 @@ static bool haveSameOperands(const IntrinsicInst &I, const IntrinsicInst &E,
// start/end intrinsics in between). As this handles only the most trivial
// cases, tracking the nesting level is not needed:
//
-// call @llvm.foo.start(i1 0) ; &I
// call @llvm.foo.start(i1 0)
-// call @llvm.foo.end(i1 0) ; This one will not be skipped: it will be removed
+// call @llvm.foo.start(i1 0) ; This one won't be skipped: it will be removed
// call @llvm.foo.end(i1 0)
-static bool removeTriviallyEmptyRange(IntrinsicInst &I, unsigned StartID,
- unsigned EndID, InstCombiner &IC) {
- assert(I.getIntrinsicID() == StartID &&
- "Start intrinsic does not have expected ID");
- BasicBlock::iterator BI(I), BE(I.getParent()->end());
- for (++BI; BI != BE; ++BI) {
- if (auto *E = dyn_cast<IntrinsicInst>(BI)) {
- if (isa<DbgInfoIntrinsic>(E) || E->getIntrinsicID() == StartID)
+// call @llvm.foo.end(i1 0) ; &I
+static bool removeTriviallyEmptyRange(
+ IntrinsicInst &EndI, InstCombiner &IC,
+ std::function<bool(const IntrinsicInst &)> IsStart) {
+ // We start from the end intrinsic and scan backwards, so that InstCombine
+ // has already processed (and potentially removed) all the instructions
+ // before the end intrinsic.
+ BasicBlock::reverse_iterator BI(EndI), BE(EndI.getParent()->rend());
+ for (; BI != BE; ++BI) {
+ if (auto *I = dyn_cast<IntrinsicInst>(&*BI)) {
+ if (isa<DbgInfoIntrinsic>(I) ||
+ I->getIntrinsicID() == EndI.getIntrinsicID())
+ continue;
+ if (IsStart(*I)) {
+ if (haveSameOperands(EndI, *I, EndI.getNumArgOperands())) {
+ IC.eraseInstFromFunction(*I);
+ IC.eraseInstFromFunction(EndI);
+ return true;
+ }
+ // Skip start intrinsics that don't pair with this end intrinsic.
continue;
- if (E->getIntrinsicID() == EndID &&
- haveSameOperands(I, *E, E->getNumArgOperands())) {
- IC.eraseInstFromFunction(*E);
- IC.eraseInstFromFunction(I);
- return true;
}
}
break;
@@ -1709,9 +1771,11 @@ static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
// intrinsic, we don't have to look up any module metadata, as
// FtzRequirementTy will be FTZ_Any.)
if (Action.FtzRequirement != FTZ_Any) {
- bool FtzEnabled =
- II->getFunction()->getFnAttribute("nvptx-f32ftz").getValueAsString() ==
- "true";
+ StringRef Attr = II->getFunction()
+ ->getFnAttribute("denormal-fp-math-f32")
+ .getValueAsString();
+ DenormalMode Mode = parseDenormalFPAttribute(Attr);
+ bool FtzEnabled = Mode.Output != DenormalMode::IEEE;
if (FtzEnabled != (Action.FtzRequirement == FTZ_MustBeOn))
return nullptr;
@@ -1751,13 +1815,11 @@ static Instruction *SimplifyNVVMIntrinsic(IntrinsicInst *II, InstCombiner &IC) {
llvm_unreachable("All SpecialCase enumerators should be handled in switch.");
}
-Instruction *InstCombiner::visitVAStartInst(VAStartInst &I) {
- removeTriviallyEmptyRange(I, Intrinsic::vastart, Intrinsic::vaend, *this);
- return nullptr;
-}
-
-Instruction *InstCombiner::visitVACopyInst(VACopyInst &I) {
- removeTriviallyEmptyRange(I, Intrinsic::vacopy, Intrinsic::vaend, *this);
+Instruction *InstCombiner::visitVAEndInst(VAEndInst &I) {
+ removeTriviallyEmptyRange(I, *this, [](const IntrinsicInst &I) {
+ return I.getIntrinsicID() == Intrinsic::vastart ||
+ I.getIntrinsicID() == Intrinsic::vacopy;
+ });
return nullptr;
}
@@ -1786,8 +1848,11 @@ Instruction *InstCombiner::foldIntrinsicWithOverflowCommon(IntrinsicInst *II) {
/// instructions. For normal calls, it allows visitCallBase to do the heavy
/// lifting.
Instruction *InstCombiner::visitCallInst(CallInst &CI) {
- if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
- return replaceInstUsesWith(CI, V);
+ // Don't try to simplify calls without uses. It will not do anything useful,
+ // but will result in the following folds being skipped.
+ if (!CI.use_empty())
+ if (Value *V = SimplifyCall(&CI, SQ.getWithInstruction(&CI)))
+ return replaceInstUsesWith(CI, V);
if (isFreeCall(&CI, &TLI))
return visitFree(CI);
@@ -1802,6 +1867,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CI);
if (!II) return visitCallBase(CI);
+ // For atomic unordered mem intrinsics, if the length is not positive or is
+ // not a multiple of the element size, the behavior is undefined.
+ if (auto *AMI = dyn_cast<AtomicMemIntrinsic>(II))
+ if (ConstantInt *NumBytes = dyn_cast<ConstantInt>(AMI->getLength()))
+ if (NumBytes->getSExtValue() < 0 ||
+ (NumBytes->getZExtValue() % AMI->getElementSizeInBytes() != 0)) {
+ CreateNonTerminatorUnreachable(AMI);
+ assert(AMI->getType()->isVoidTy() &&
+ "non void atomic unordered mem intrinsic");
+ return eraseInstFromFunction(*AMI);
+ }
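+ // For illustration: an element-atomic memcpy with an element size of 4 and
+ // a constant length of 10 is undefined, so it is replaced by the unreachable
+ // marker created above and then erased.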
+
// Intrinsics cannot occur in an invoke or a callbr, so handle them here
// instead of in visitCallBase.
if (auto *MI = dyn_cast<AnyMemIntrinsic>(II)) {
@@ -1863,9 +1940,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (Changed) return II;
}
- // For vector result intrinsics, use the generic demanded vector support.
- if (II->getType()->isVectorTy()) {
- auto VWidth = II->getType()->getVectorNumElements();
+ // For fixed width vector result intrinsics, use the generic demanded vector
+ // support.
+ if (auto *IIFVTy = dyn_cast<FixedVectorType>(II->getType())) {
+ auto VWidth = IIFVTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(II, AllOnesEltMask, UndefElts)) {
@@ -1958,10 +2036,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Canonicalize a shift amount constant operand to modulo the bit-width.
Constant *WidthC = ConstantInt::get(Ty, BitWidth);
Constant *ModuloC = ConstantExpr::getURem(ShAmtC, WidthC);
- if (ModuloC != ShAmtC) {
- II->setArgOperand(2, ModuloC);
- return II;
- }
+ if (ModuloC != ShAmtC)
+ return replaceOperand(*II, 2, ModuloC);
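+ // For illustration: a funnel shift on i8 with a constant shift amount of 11
+ // is canonicalized to a shift amount of 11 % 8 == 3.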
+
assert(ConstantExpr::getICmp(ICmpInst::ICMP_UGT, WidthC, ShAmtC) ==
ConstantInt::getTrue(CmpInst::makeCmpResultType(Ty)) &&
"Shift amount expected to be modulo bitwidth");
@@ -2189,7 +2266,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
llvm_unreachable("unexpected intrinsic ID");
}
Value *NewCall = Builder.CreateBinaryIntrinsic(NewIID, X, Y, II);
- Instruction *FNeg = BinaryOperator::CreateFNeg(NewCall);
+ Instruction *FNeg = UnaryOperator::CreateFNeg(NewCall);
FNeg->copyIRFlags(II);
return FNeg;
}
@@ -2220,12 +2297,31 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
llvm_unreachable("unexpected intrinsic ID");
}
Instruction *NewCall = Builder.CreateBinaryIntrinsic(
- IID, X, ConstantFP::get(Arg0->getType(), Res));
- NewCall->copyIRFlags(II);
+ IID, X, ConstantFP::get(Arg0->getType(), Res), II);
+ // TODO: Conservatively intersecting FMF. If Res == C2, the transform
+ // was a simplification (so Arg0 and its original flags could
+ // propagate?)
+ NewCall->andIRFlags(M);
return replaceInstUsesWith(*II, NewCall);
}
}
+ Value *ExtSrc0;
+ Value *ExtSrc1;
+
+ // minnum (fpext x), (fpext y) -> minnum x, y
+ // maxnum (fpext x), (fpext y) -> maxnum x, y
+ if (match(II->getArgOperand(0), m_OneUse(m_FPExt(m_Value(ExtSrc0)))) &&
+ match(II->getArgOperand(1), m_OneUse(m_FPExt(m_Value(ExtSrc1)))) &&
+ ExtSrc0->getType() == ExtSrc1->getType()) {
+ Function *F = Intrinsic::getDeclaration(
+ II->getModule(), II->getIntrinsicID(), {ExtSrc0->getType()});
+ CallInst *NewCall = Builder.CreateCall(F, { ExtSrc0, ExtSrc1 });
+ NewCall->copyFastMathFlags(II);
+ NewCall->takeName(II);
+ return new FPExtInst(NewCall, II->getType());
+ }
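+ // Note: this relies on fpext being exact and order-preserving, so the
+ // min/max can be taken in the narrower type and extended afterwards.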
+
break;
}
case Intrinsic::fmuladd: {
@@ -2260,16 +2356,16 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src1 = II->getArgOperand(1);
Value *X, *Y;
if (match(Src0, m_FNeg(m_Value(X))) && match(Src1, m_FNeg(m_Value(Y)))) {
- II->setArgOperand(0, X);
- II->setArgOperand(1, Y);
+ replaceOperand(*II, 0, X);
+ replaceOperand(*II, 1, Y);
return II;
}
// fma fabs(x), fabs(x), z -> fma x, x, z
if (match(Src0, m_FAbs(m_Value(X))) &&
match(Src1, m_FAbs(m_Specific(X)))) {
- II->setArgOperand(0, X);
- II->setArgOperand(1, X);
+ replaceOperand(*II, 0, X);
+ replaceOperand(*II, 1, X);
return II;
}
@@ -2283,6 +2379,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return FAdd;
}
+ // fma x, y, 0 -> fmul x, y
+ // This is always valid for -0.0, but requires nsz for +0.0 as
+ // -0.0 + 0.0 = 0.0, which would not be the same as the fmul on its own.
+ if (match(II->getArgOperand(2), m_NegZeroFP()) ||
+ (match(II->getArgOperand(2), m_PosZeroFP()) &&
+ II->getFastMathFlags().noSignedZeros()))
+ return BinaryOperator::CreateFMulFMF(Src0, Src1, II);
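+ // For illustration of the nsz requirement: fma(-1.0, 0.0, +0.0) is
+ // -0.0 + 0.0 == +0.0, whereas fmul(-1.0, 0.0) == -0.0.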
+
break;
}
case Intrinsic::copysign: {
@@ -2307,10 +2411,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// copysign X, (copysign ?, SignArg) --> copysign X, SignArg
Value *SignArg;
if (match(II->getArgOperand(1),
- m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg)))) {
- II->setArgOperand(1, SignArg);
- return II;
- }
+ m_Intrinsic<Intrinsic::copysign>(m_Value(), m_Value(SignArg))))
+ return replaceOperand(*II, 1, SignArg);
break;
}
@@ -2329,6 +2431,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ceil:
case Intrinsic::floor:
case Intrinsic::round:
+ case Intrinsic::roundeven:
case Intrinsic::nearbyint:
case Intrinsic::rint:
case Intrinsic::trunc: {
@@ -2347,8 +2450,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (match(Src, m_FNeg(m_Value(X))) || match(Src, m_FAbs(m_Value(X)))) {
// cos(-x) -> cos(x)
// cos(fabs(x)) -> cos(x)
- II->setArgOperand(0, X);
- return II;
+ return replaceOperand(*II, 0, X);
}
break;
}
@@ -2357,7 +2459,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (match(II->getArgOperand(0), m_OneUse(m_FNeg(m_Value(X))))) {
// sin(-x) --> -sin(x)
Value *NewSin = Builder.CreateUnaryIntrinsic(Intrinsic::sin, X, II);
- Instruction *FNeg = BinaryOperator::CreateFNeg(NewSin);
+ Instruction *FNeg = UnaryOperator::CreateFNeg(NewSin);
FNeg->copyFastMathFlags(II);
return FNeg;
}
@@ -2366,11 +2468,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::ppc_altivec_lvx:
case Intrinsic::ppc_altivec_lvxl:
// Turn PPC lvx -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC,
&DT) >= 16) {
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
- return new LoadInst(II->getType(), Ptr);
+ return new LoadInst(II->getType(), Ptr, "", false, Align(16));
}
break;
case Intrinsic::ppc_vsx_lxvw4x:
@@ -2378,17 +2480,17 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Turn PPC VSX loads into normal loads.
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
- return new LoadInst(II->getType(), Ptr, Twine(""), false, Align::None());
+ return new LoadInst(II->getType(), Ptr, Twine(""), false, Align(1));
}
case Intrinsic::ppc_altivec_stvx:
case Intrinsic::ppc_altivec_stvxl:
// Turn stvx -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC,
&DT) >= 16) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
- return new StoreInst(II->getArgOperand(0), Ptr);
+ return new StoreInst(II->getArgOperand(0), Ptr, false, Align(16));
}
break;
case Intrinsic::ppc_vsx_stxvw4x:
@@ -2396,14 +2498,15 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// Turn PPC VSX stores into normal stores.
Type *OpPtrTy = PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
- return new StoreInst(II->getArgOperand(0), Ptr, false, Align::None());
+ return new StoreInst(II->getArgOperand(0), Ptr, false, Align(1));
}
case Intrinsic::ppc_qpx_qvlfs:
// Turn PPC QPX qvlfs -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 16, DL, II, &AC,
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(16), DL, II, &AC,
&DT) >= 16) {
- Type *VTy = VectorType::get(Builder.getFloatTy(),
- II->getType()->getVectorNumElements());
+ Type *VTy =
+ VectorType::get(Builder.getFloatTy(),
+ cast<VectorType>(II->getType())->getElementCount());
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(VTy));
Value *Load = Builder.CreateLoad(VTy, Ptr);
@@ -2412,33 +2515,34 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::ppc_qpx_qvlfd:
// Turn PPC QPX qvlfd -> load if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(0), 32, DL, II, &AC,
+ if (getOrEnforceKnownAlignment(II->getArgOperand(0), Align(32), DL, II, &AC,
&DT) >= 32) {
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(0),
PointerType::getUnqual(II->getType()));
- return new LoadInst(II->getType(), Ptr);
+ return new LoadInst(II->getType(), Ptr, "", false, Align(32));
}
break;
case Intrinsic::ppc_qpx_qvstfs:
// Turn PPC QPX qvstfs -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 16, DL, II, &AC,
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(16), DL, II, &AC,
&DT) >= 16) {
- Type *VTy = VectorType::get(Builder.getFloatTy(),
- II->getArgOperand(0)->getType()->getVectorNumElements());
+ Type *VTy = VectorType::get(
+ Builder.getFloatTy(),
+ cast<VectorType>(II->getArgOperand(0)->getType())->getElementCount());
Value *TOp = Builder.CreateFPTrunc(II->getArgOperand(0), VTy);
Type *OpPtrTy = PointerType::getUnqual(VTy);
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
- return new StoreInst(TOp, Ptr);
+ return new StoreInst(TOp, Ptr, false, Align(16));
}
break;
case Intrinsic::ppc_qpx_qvstfd:
// Turn PPC QPX qvstfd -> store if the pointer is known aligned.
- if (getOrEnforceKnownAlignment(II->getArgOperand(1), 32, DL, II, &AC,
+ if (getOrEnforceKnownAlignment(II->getArgOperand(1), Align(32), DL, II, &AC,
&DT) >= 32) {
Type *OpPtrTy =
PointerType::getUnqual(II->getArgOperand(0)->getType());
Value *Ptr = Builder.CreateBitCast(II->getArgOperand(1), OpPtrTy);
- return new StoreInst(II->getArgOperand(0), Ptr);
+ return new StoreInst(II->getArgOperand(0), Ptr, false, Align(32));
}
break;
@@ -2546,50 +2650,6 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
break;
- case Intrinsic::x86_vcvtph2ps_128:
- case Intrinsic::x86_vcvtph2ps_256: {
- auto Arg = II->getArgOperand(0);
- auto ArgType = cast<VectorType>(Arg->getType());
- auto RetType = cast<VectorType>(II->getType());
- unsigned ArgWidth = ArgType->getNumElements();
- unsigned RetWidth = RetType->getNumElements();
- assert(RetWidth <= ArgWidth && "Unexpected input/return vector widths");
- assert(ArgType->isIntOrIntVectorTy() &&
- ArgType->getScalarSizeInBits() == 16 &&
- "CVTPH2PS input type should be 16-bit integer vector");
- assert(RetType->getScalarType()->isFloatTy() &&
- "CVTPH2PS output type should be 32-bit float vector");
-
- // Constant folding: Convert to generic half to single conversion.
- if (isa<ConstantAggregateZero>(Arg))
- return replaceInstUsesWith(*II, ConstantAggregateZero::get(RetType));
-
- if (isa<ConstantDataVector>(Arg)) {
- auto VectorHalfAsShorts = Arg;
- if (RetWidth < ArgWidth) {
- SmallVector<uint32_t, 8> SubVecMask;
- for (unsigned i = 0; i != RetWidth; ++i)
- SubVecMask.push_back((int)i);
- VectorHalfAsShorts = Builder.CreateShuffleVector(
- Arg, UndefValue::get(ArgType), SubVecMask);
- }
-
- auto VectorHalfType =
- VectorType::get(Type::getHalfTy(II->getContext()), RetWidth);
- auto VectorHalfs =
- Builder.CreateBitCast(VectorHalfAsShorts, VectorHalfType);
- auto VectorFloats = Builder.CreateFPExt(VectorHalfs, RetType);
- return replaceInstUsesWith(*II, VectorFloats);
- }
-
- // We only use the lowest lanes of the argument.
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg, ArgWidth, RetWidth)) {
- II->setArgOperand(0, V);
- return II;
- }
- break;
- }
-
case Intrinsic::x86_sse_cvtss2si:
case Intrinsic::x86_sse_cvtss2si64:
case Intrinsic::x86_sse_cvttss2si:
@@ -2617,11 +2677,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// These intrinsics only demand the 0th element of their input vectors. If
// we can simplify the input based on that, do so now.
Value *Arg = II->getArgOperand(0);
- unsigned VWidth = Arg->getType()->getVectorNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1)) {
- II->setArgOperand(0, V);
- return II;
- }
+ unsigned VWidth = cast<VectorType>(Arg->getType())->getNumElements();
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg, VWidth, 1))
+ return replaceOperand(*II, 0, V);
break;
}
@@ -2669,13 +2727,13 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
bool MadeChange = false;
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
- unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(Arg0->getType())->getNumElements();
if (Value *V = SimplifyDemandedVectorEltsLow(Arg0, VWidth, 1)) {
- II->setArgOperand(0, V);
+ replaceOperand(*II, 0, V);
MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, 1)) {
- II->setArgOperand(1, V);
+ replaceOperand(*II, 1, V);
MadeChange = true;
}
if (MadeChange)
@@ -2707,8 +2765,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
cast<Instruction>(Arg0)->getFastMathFlags().noInfs())) {
if (Arg0IsZero)
std::swap(A, B);
- II->setArgOperand(0, A);
- II->setArgOperand(1, B);
+ replaceOperand(*II, 0, A);
+ replaceOperand(*II, 1, B);
return II;
}
break;
@@ -2800,8 +2858,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// We don't need a select if we know the mask bit is a 1.
if (!C || !C->getValue()[0]) {
// Cast the mask to an i1 vector and then extract the lowest element.
- auto *MaskTy = VectorType::get(Builder.getInt1Ty(),
- cast<IntegerType>(Mask->getType())->getBitWidth());
+ auto *MaskTy = FixedVectorType::get(
+ Builder.getInt1Ty(),
+ cast<IntegerType>(Mask->getType())->getBitWidth());
Mask = Builder.CreateBitCast(Mask, MaskTy);
Mask = Builder.CreateExtractElement(Mask, (uint64_t)0);
// Extract the lowest element from the passthru operand.
@@ -2887,12 +2946,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Arg1 = II->getArgOperand(1);
assert(Arg1->getType()->getPrimitiveSizeInBits() == 128 &&
"Unexpected packed shift size");
- unsigned VWidth = Arg1->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(Arg1->getType())->getNumElements();
- if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2)) {
- II->setArgOperand(1, V);
- return II;
- }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Arg1, VWidth, VWidth / 2))
+ return replaceOperand(*II, 1, V);
break;
}
@@ -2956,14 +3013,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
bool MadeChange = false;
Value *Arg0 = II->getArgOperand(0);
Value *Arg1 = II->getArgOperand(1);
- unsigned VWidth = Arg0->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(Arg0->getType())->getNumElements();
APInt UndefElts1(VWidth, 0);
APInt DemandedElts1 = APInt::getSplat(VWidth,
APInt(2, (Imm & 0x01) ? 2 : 1));
if (Value *V = SimplifyDemandedVectorElts(Arg0, DemandedElts1,
UndefElts1)) {
- II->setArgOperand(0, V);
+ replaceOperand(*II, 0, V);
MadeChange = true;
}
@@ -2972,7 +3029,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
APInt(2, (Imm & 0x10) ? 2 : 1));
if (Value *V = SimplifyDemandedVectorElts(Arg1, DemandedElts2,
UndefElts2)) {
- II->setArgOperand(1, V);
+ replaceOperand(*II, 1, V);
MadeChange = true;
}
@@ -2996,8 +3053,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::x86_sse4a_extrq: {
Value *Op0 = II->getArgOperand(0);
Value *Op1 = II->getArgOperand(1);
- unsigned VWidth0 = Op0->getType()->getVectorNumElements();
- unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+ unsigned VWidth0 = cast<VectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<VectorType>(Op1->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
VWidth1 == 16 && "Unexpected operand sizes");
@@ -3019,11 +3076,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// operands and the lowest 16-bits of the second.
bool MadeChange = false;
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
- II->setArgOperand(0, V);
+ replaceOperand(*II, 0, V);
MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 2)) {
- II->setArgOperand(1, V);
+ replaceOperand(*II, 1, V);
MadeChange = true;
}
if (MadeChange)
@@ -3035,7 +3092,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// EXTRQI: Extract Length bits starting from Index. Zero pad the remaining
// bits of the lower 64-bits. The upper 64-bits are undefined.
Value *Op0 = II->getArgOperand(0);
- unsigned VWidth = Op0->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(Op0->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
"Unexpected operand size");
@@ -3049,20 +3106,18 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// EXTRQI only uses the lowest 64-bits of the first 128-bit vector
// operand.
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
- II->setArgOperand(0, V);
- return II;
- }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1))
+ return replaceOperand(*II, 0, V);
break;
}
case Intrinsic::x86_sse4a_insertq: {
Value *Op0 = II->getArgOperand(0);
Value *Op1 = II->getArgOperand(1);
- unsigned VWidth = Op0->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(Op0->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth == 2 &&
- Op1->getType()->getVectorNumElements() == 2 &&
+ cast<VectorType>(Op1->getType())->getNumElements() == 2 &&
"Unexpected operand size");
// See if we're dealing with constant values.
@@ -3082,10 +3137,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// INSERTQ only uses the lowest 64-bits of the first 128-bit vector
// operand.
- if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1)) {
- II->setArgOperand(0, V);
- return II;
- }
+ if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth, 1))
+ return replaceOperand(*II, 0, V);
break;
}
@@ -3095,8 +3148,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// undefined.
Value *Op0 = II->getArgOperand(0);
Value *Op1 = II->getArgOperand(1);
- unsigned VWidth0 = Op0->getType()->getVectorNumElements();
- unsigned VWidth1 = Op1->getType()->getVectorNumElements();
+ unsigned VWidth0 = cast<VectorType>(Op0->getType())->getNumElements();
+ unsigned VWidth1 = cast<VectorType>(Op1->getType())->getNumElements();
assert(Op0->getType()->getPrimitiveSizeInBits() == 128 &&
Op1->getType()->getPrimitiveSizeInBits() == 128 && VWidth0 == 2 &&
VWidth1 == 2 && "Unexpected operand sizes");
@@ -3117,11 +3170,11 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// operands.
bool MadeChange = false;
if (Value *V = SimplifyDemandedVectorEltsLow(Op0, VWidth0, 1)) {
- II->setArgOperand(0, V);
+ replaceOperand(*II, 0, V);
MadeChange = true;
}
if (Value *V = SimplifyDemandedVectorEltsLow(Op1, VWidth1, 1)) {
- II->setArgOperand(1, V);
+ replaceOperand(*II, 1, V);
MadeChange = true;
}
if (MadeChange)
@@ -3163,8 +3216,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->getType()->getPrimitiveSizeInBits() &&
"Not expecting mask and operands with different sizes");
- unsigned NumMaskElts = Mask->getType()->getVectorNumElements();
- unsigned NumOperandElts = II->getType()->getVectorNumElements();
+ unsigned NumMaskElts =
+ cast<VectorType>(Mask->getType())->getNumElements();
+ unsigned NumOperandElts =
+ cast<VectorType>(II->getType())->getNumElements();
if (NumMaskElts == NumOperandElts)
return SelectInst::Create(BoolVec, Op1, Op0);
@@ -3255,7 +3310,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// the permutation mask with respect to 31 and reverse the order of
// V1 and V2.
if (Constant *Mask = dyn_cast<Constant>(II->getArgOperand(2))) {
- assert(Mask->getType()->getVectorNumElements() == 16 &&
+ assert(cast<VectorType>(Mask->getType())->getNumElements() == 16 &&
"Bad type for intrinsic!");
// Check that all of the elements are integer constants or undefs.
@@ -3307,9 +3362,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
case Intrinsic::arm_neon_vld1: {
- unsigned MemAlign = getKnownAlignment(II->getArgOperand(0),
- DL, II, &AC, &DT);
- if (Value *V = simplifyNeonVld1(*II, MemAlign, Builder))
+ Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
+ if (Value *V = simplifyNeonVld1(*II, MemAlign.value(), Builder))
return replaceInstUsesWith(*II, V);
break;
}
@@ -3327,16 +3381,14 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
case Intrinsic::arm_neon_vst2lane:
case Intrinsic::arm_neon_vst3lane:
case Intrinsic::arm_neon_vst4lane: {
- unsigned MemAlign =
- getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
+ Align MemAlign = getKnownAlignment(II->getArgOperand(0), DL, II, &AC, &DT);
unsigned AlignArg = II->getNumArgOperands() - 1;
- ConstantInt *IntrAlign = dyn_cast<ConstantInt>(II->getArgOperand(AlignArg));
- if (IntrAlign && IntrAlign->getZExtValue() < MemAlign) {
- II->setArgOperand(AlignArg,
- ConstantInt::get(Type::getInt32Ty(II->getContext()),
- MemAlign, false));
- return II;
- }
+ Value *AlignArgOp = II->getArgOperand(AlignArg);
+ MaybeAlign Align = cast<ConstantInt>(AlignArgOp)->getMaybeAlignValue();
+ if (Align && *Align < MemAlign)
+ return replaceOperand(*II, AlignArg,
+ ConstantInt::get(Type::getInt32Ty(II->getContext()),
+ MemAlign.value(), false));
break;
}
@@ -3395,8 +3447,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Data, *Key;
if (match(KeyArg, m_ZeroInt()) &&
match(DataArg, m_Xor(m_Value(Data), m_Value(Key)))) {
- II->setArgOperand(0, Data);
- II->setArgOperand(1, Key);
+ replaceOperand(*II, 0, Data);
+ replaceOperand(*II, 1, Key);
return II;
}
break;
@@ -3415,7 +3467,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (auto *CI = dyn_cast<ConstantInt>(XorMask)) {
if (CI->getValue().trunc(16).isAllOnesValue()) {
auto TrueVector = Builder.CreateVectorSplat(
- II->getType()->getVectorNumElements(), Builder.getTrue());
+ cast<VectorType>(II->getType())->getNumElements(),
+ Builder.getTrue());
return BinaryOperator::Create(Instruction::Xor, ArgArg, TrueVector);
}
}
@@ -3459,18 +3512,25 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src = II->getArgOperand(0);
// TODO: Move to ConstantFolding/InstSimplify?
- if (isa<UndefValue>(Src))
- return replaceInstUsesWith(CI, Src);
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II->getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return replaceInstUsesWith(CI, QNaN);
+ }
+
+ if (II->isStrictFP())
+ break;
if (const ConstantFP *C = dyn_cast<ConstantFP>(Src)) {
const APFloat &ArgVal = C->getValueAPF();
APFloat Val(ArgVal.getSemantics(), 1);
- APFloat::opStatus Status = Val.divide(ArgVal,
- APFloat::rmNearestTiesToEven);
- // Only do this if it was exact and therefore not dependent on the
- // rounding mode.
- if (Status == APFloat::opOK)
- return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
+ Val.divide(ArgVal, APFloat::rmNearestTiesToEven);
+
+ // This result may be more precise than what the instruction gives.
+ //
+ // TODO: The instruction always flushes denormal results (except for f16),
+ // should this also?
+ return replaceInstUsesWith(CI, ConstantFP::get(II->getContext(), Val));
}
break;
@@ -3479,8 +3539,12 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
Value *Src = II->getArgOperand(0);
// TODO: Move to ConstantFolding/InstSimplify?
- if (isa<UndefValue>(Src))
- return replaceInstUsesWith(CI, Src);
+ if (isa<UndefValue>(Src)) {
+ Type *Ty = II->getType();
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return replaceInstUsesWith(CI, QNaN);
+ }
+
break;
}
case Intrinsic::amdgcn_frexp_mant:
@@ -3563,11 +3627,9 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
}
// fp_class (nnan x), qnan|snan|other -> fp_class (nnan x), other
- if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI)) {
- II->setArgOperand(1, ConstantInt::get(Src1->getType(),
- Mask & ~(S_NAN | Q_NAN)));
- return II;
- }
+ if (((Mask & S_NAN) || (Mask & Q_NAN)) && isKnownNeverNaN(Src0, &TLI))
+ return replaceOperand(*II, 1, ConstantInt::get(Src1->getType(),
+ Mask & ~(S_NAN | Q_NAN)));
const ConstantFP *CVal = dyn_cast<ConstantFP>(Src0);
if (!CVal) {
@@ -3657,23 +3719,19 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if ((Width & (IntSize - 1)) == 0)
return replaceInstUsesWith(*II, ConstantInt::getNullValue(Ty));
- if (Width >= IntSize) {
- // Hardware ignores high bits, so remove those.
- II->setArgOperand(2, ConstantInt::get(CWidth->getType(),
- Width & (IntSize - 1)));
- return II;
- }
+ // Hardware ignores high bits, so remove those.
+ if (Width >= IntSize)
+ return replaceOperand(*II, 2, ConstantInt::get(CWidth->getType(),
+ Width & (IntSize - 1)));
}
unsigned Offset;
ConstantInt *COffset = dyn_cast<ConstantInt>(II->getArgOperand(1));
if (COffset) {
Offset = COffset->getZExtValue();
- if (Offset >= IntSize) {
- II->setArgOperand(1, ConstantInt::get(COffset->getType(),
- Offset & (IntSize - 1)));
- return II;
- }
+ if (Offset >= IntSize)
+ return replaceOperand(*II, 1, ConstantInt::get(COffset->getType(),
+ Offset & (IntSize - 1)));
}
bool Signed = IID == Intrinsic::amdgcn_sbfe;
@@ -3716,7 +3774,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
(IsCompr && ((EnBits & (0x3 << (2 * I))) == 0))) {
Value *Src = II->getArgOperand(I + 2);
if (!isa<UndefValue>(Src)) {
- II->setArgOperand(I + 2, UndefValue::get(Src->getType()));
+ replaceOperand(*II, I + 2, UndefValue::get(Src->getType()));
Changed = true;
}
}
@@ -3855,8 +3913,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
((match(Src1, m_One()) && match(Src0, m_ZExt(m_Value(ExtSrc)))) ||
(match(Src1, m_AllOnes()) && match(Src0, m_SExt(m_Value(ExtSrc))))) &&
ExtSrc->getType()->isIntegerTy(1)) {
- II->setArgOperand(1, ConstantInt::getNullValue(Src1->getType()));
- II->setArgOperand(2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
+ replaceOperand(*II, 1, ConstantInt::getNullValue(Src1->getType()));
+ replaceOperand(*II, 2, ConstantInt::get(CC->getType(), CmpInst::ICMP_NE));
return II;
}
@@ -3928,6 +3986,35 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_ballot: {
+ if (auto *Src = dyn_cast<ConstantInt>(II->getArgOperand(0))) {
+ if (Src->isZero()) {
+ // amdgcn.ballot(i1 0) is zero.
+ return replaceInstUsesWith(*II, Constant::getNullValue(II->getType()));
+ }
+
+ if (Src->isOne()) {
+ // amdgcn.ballot(i1 1) is exec.
+ const char *RegName = "exec";
+ if (II->getType()->isIntegerTy(32))
+ RegName = "exec_lo";
+ else if (!II->getType()->isIntegerTy(64))
+ break;
+
+ Function *NewF = Intrinsic::getDeclaration(
+ II->getModule(), Intrinsic::read_register, II->getType());
+ Metadata *MDArgs[] = {MDString::get(II->getContext(), RegName)};
+ MDNode *MD = MDNode::get(II->getContext(), MDArgs);
+ Value *Args[] = {MetadataAsValue::get(II->getContext(), MD)};
+ CallInst *NewCall = Builder.CreateCall(NewF, Args);
+ NewCall->addAttribute(AttributeList::FunctionIndex,
+ Attribute::Convergent);
+ NewCall->takeName(II);
+ return replaceInstUsesWith(*II, NewCall);
+ }
+ }
+ break;
+ }
case Intrinsic::amdgcn_wqm_vote: {
// wqm_vote is identity when the argument is constant.
if (!isa<Constant>(II->getArgOperand(0)))
@@ -3956,8 +4043,21 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
// If bound_ctrl = 1, row mask = bank mask = 0xf we can omit old value.
- II->setOperand(0, UndefValue::get(Old->getType()));
- return II;
+ return replaceOperand(*II, 0, UndefValue::get(Old->getType()));
+ }
+ case Intrinsic::amdgcn_permlane16:
+ case Intrinsic::amdgcn_permlanex16: {
+ // Discard vdst_in if it's not going to be read.
+ Value *VDstIn = II->getArgOperand(0);
+ if (isa<UndefValue>(VDstIn))
+ break;
+
+ ConstantInt *FetchInvalid = cast<ConstantInt>(II->getArgOperand(4));
+ ConstantInt *BoundCtrl = cast<ConstantInt>(II->getArgOperand(5));
+ if (!FetchInvalid->getZExtValue() && !BoundCtrl->getZExtValue())
+ break;
+
+ return replaceOperand(*II, 0, UndefValue::get(VDstIn->getType()));
}
case Intrinsic::amdgcn_readfirstlane:
case Intrinsic::amdgcn_readlane: {
@@ -3990,6 +4090,71 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
break;
}
+ case Intrinsic::amdgcn_ldexp: {
+ // FIXME: This doesn't introduce new instructions and belongs in
+ // InstructionSimplify.
+ Type *Ty = II->getType();
+ Value *Op0 = II->getArgOperand(0);
+ Value *Op1 = II->getArgOperand(1);
+
+ // Folding undef to qnan is safe regardless of the FP mode.
+ if (isa<UndefValue>(Op0)) {
+ auto *QNaN = ConstantFP::get(Ty, APFloat::getQNaN(Ty->getFltSemantics()));
+ return replaceInstUsesWith(*II, QNaN);
+ }
+
+ const APFloat *C = nullptr;
+ match(Op0, m_APFloat(C));
+
+ // FIXME: Should flush denorms depending on FP mode, but that's ignored
+ // everywhere else.
+ //
+ // These cases should be safe, even with strictfp.
+ // ldexp(0.0, x) -> 0.0
+ // ldexp(-0.0, x) -> -0.0
+ // ldexp(inf, x) -> inf
+ // ldexp(-inf, x) -> -inf
+ if (C && (C->isZero() || C->isInfinity()))
+ return replaceInstUsesWith(*II, Op0);
+
+ // With strictfp, be more careful about possibly needing to flush denormals
+ // or not, and snan behavior depends on ieee_mode.
+ if (II->isStrictFP())
+ break;
+
+ if (C && C->isNaN()) {
+ // FIXME: We just need to make the NaN quiet here, but that operation is
+ // only available on IEEEFloat, not on APFloat.
+ auto *Quieted = ConstantFP::get(
+ Ty, scalbn(*C, 0, APFloat::rmNearestTiesToEven));
+ return replaceInstUsesWith(*II, Quieted);
+ }
+
+ // ldexp(x, 0) -> x
+ // ldexp(x, undef) -> x
+ if (isa<UndefValue>(Op1) || match(Op1, m_ZeroInt()))
+ return replaceInstUsesWith(*II, Op0);
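+ // For illustration: ldexp(1.5, 3) == 12.0, and scaling by 2^0 is the
+ // identity, so a zero (or undef) exponent leaves the first operand unchanged.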
+
+ break;
+ }
+ case Intrinsic::hexagon_V6_vandvrt:
+ case Intrinsic::hexagon_V6_vandvrt_128B: {
+ // Simplify Q -> V -> Q conversion.
+ if (auto Op0 = dyn_cast<IntrinsicInst>(II->getArgOperand(0))) {
+ Intrinsic::ID ID0 = Op0->getIntrinsicID();
+ if (ID0 != Intrinsic::hexagon_V6_vandqrt &&
+ ID0 != Intrinsic::hexagon_V6_vandqrt_128B)
+ break;
+ Value *Bytes = Op0->getArgOperand(1), *Mask = II->getArgOperand(1);
+ uint64_t Bytes1 = computeKnownBits(Bytes, 0, Op0).One.getZExtValue();
+ uint64_t Mask1 = computeKnownBits(Mask, 0, II).One.getZExtValue();
+ // Check if every byte has common bits in Bytes and Mask.
+ uint64_t C = Bytes1 & Mask1;
+ if ((C & 0xFF) && (C & 0xFF00) && (C & 0xFF0000) && (C & 0xFF000000))
+ return replaceInstUsesWith(*II, Op0->getArgOperand(0));
+ }
+ break;
+ }
case Intrinsic::stackrestore: {
// If the save is right next to the restore, remove the restore. This can
// happen when variable allocas are DCE'd.
@@ -4040,7 +4205,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
return eraseInstFromFunction(CI);
break;
}
- case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
// Asan needs to poison memory to detect invalid accesses, which are possible
// even for an empty lifetime range.
if (II->getFunction()->hasFnAttribute(Attribute::SanitizeAddress) ||
@@ -4048,34 +4213,41 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
II->getFunction()->hasFnAttribute(Attribute::SanitizeHWAddress))
break;
- if (removeTriviallyEmptyRange(*II, Intrinsic::lifetime_start,
- Intrinsic::lifetime_end, *this))
+ if (removeTriviallyEmptyRange(*II, *this, [](const IntrinsicInst &I) {
+ return I.getIntrinsicID() == Intrinsic::lifetime_start;
+ }))
return nullptr;
break;
case Intrinsic::assume: {
Value *IIOperand = II->getArgOperand(0);
+ SmallVector<OperandBundleDef, 4> OpBundles;
+ II->getOperandBundlesAsDefs(OpBundles);
+ bool HasOpBundles = !OpBundles.empty();
// Remove an assume if it is followed by an identical assume.
// TODO: Do we need this? Unless there are conflicting assumptions, the
// computeKnownBits(IIOperand) below here eliminates redundant assumes.
Instruction *Next = II->getNextNonDebugInstruction();
- if (match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))))
+ if (HasOpBundles &&
+ match(Next, m_Intrinsic<Intrinsic::assume>(m_Specific(IIOperand))) &&
+ !cast<IntrinsicInst>(Next)->hasOperandBundles())
return eraseInstFromFunction(CI);
// Canonicalize assume(a && b) -> assume(a); assume(b);
// Note: New assumption intrinsics created here are registered by
// the InstCombineIRInserter object.
FunctionType *AssumeIntrinsicTy = II->getFunctionType();
- Value *AssumeIntrinsic = II->getCalledValue();
+ Value *AssumeIntrinsic = II->getCalledOperand();
Value *A, *B;
if (match(IIOperand, m_And(m_Value(A), m_Value(B)))) {
- Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, II->getName());
+ Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, A, OpBundles,
+ II->getName());
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic, B, II->getName());
return eraseInstFromFunction(*II);
}
// assume(!(a || b)) -> assume(!a); assume(!b);
if (match(IIOperand, m_Not(m_Or(m_Value(A), m_Value(B))))) {
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
- Builder.CreateNot(A), II->getName());
+ Builder.CreateNot(A), OpBundles, II->getName());
Builder.CreateCall(AssumeIntrinsicTy, AssumeIntrinsic,
Builder.CreateNot(B), II->getName());
return eraseInstFromFunction(*II);
@@ -4091,7 +4263,8 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
isValidAssumeForContext(II, LHS, &DT)) {
MDNode *MD = MDNode::get(II->getContext(), None);
LHS->setMetadata(LLVMContext::MD_nonnull, MD);
- return eraseInstFromFunction(*II);
+ if (!HasOpBundles)
+ return eraseInstFromFunction(*II);
// TODO: apply nonnull return attributes to calls and invokes
// TODO: apply range metadata for range check patterns?
@@ -4101,7 +4274,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
// then this one is redundant, and should be removed.
KnownBits Known(1);
computeKnownBits(IIOperand, Known, 0, II);
- if (Known.isAllOnes())
+ if (Known.isAllOnes() && isAssumeWithEmptyBundle(*II))
return eraseInstFromFunction(*II);
// Update the cache of affected values for this assumption (we might be
@@ -4117,10 +4290,10 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
if (GCR.getBasePtr() == GCR.getDerivedPtr() &&
GCR.getBasePtrIndex() != GCR.getDerivedPtrIndex()) {
auto *OpIntTy = GCR.getOperand(2)->getType();
- II->setOperand(2, ConstantInt::get(OpIntTy, GCR.getBasePtrIndex()));
- return II;
+ return replaceOperand(*II, 2,
+ ConstantInt::get(OpIntTy, GCR.getBasePtrIndex()));
}
-
+
// Translate facts known about a pointer before relocating into
// facts about the relocate value, while being careful to
// preserve relocation semantics.
@@ -4187,7 +4360,7 @@ Instruction *InstCombiner::visitCallInst(CallInst &CI) {
MoveI = MoveI->getNextNonDebugInstruction();
Temp->moveBefore(II);
}
- II->setArgOperand(0, Builder.CreateAnd(CurrCond, NextCond));
+ replaceOperand(*II, 0, Builder.CreateAnd(CurrCond, NextCond));
}
eraseInstFromFunction(*NextInst);
return II;
@@ -4232,13 +4405,14 @@ static bool isSafeToEliminateVarargsCast(const CallBase &Call,
// TODO: This is probably something which should be expanded to all
// intrinsics since the entire point of intrinsics is that
// they are understandable by the optimizer.
- if (isStatepoint(&Call) || isGCRelocate(&Call) || isGCResult(&Call))
+ if (isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+ isa<GCResultInst>(Call))
return false;
// The size of ByVal or InAlloca arguments is derived from the type, so we
// can't change to a type with a different size. If the size were
// passed explicitly we could avoid this check.
- if (!Call.isByValOrInAllocaArgument(ix))
+ if (!Call.isPassPointeeByValueArgument(ix))
return true;
Type* SrcTy =
@@ -4264,7 +4438,7 @@ Instruction *InstCombiner::tryOptimizeCall(CallInst *CI) {
};
LibCallSimplifier Simplifier(DL, &TLI, ORE, BFI, PSI, InstCombineRAUW,
InstCombineErase);
- if (Value *With = Simplifier.optimizeCall(CI)) {
+ if (Value *With = Simplifier.optimizeCall(CI, Builder)) {
++NumSimplified;
return CI->use_empty() ? CI : replaceInstUsesWith(*CI, With);
}
@@ -4353,7 +4527,8 @@ static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
ConstantInt *Op0C = dyn_cast<ConstantInt>(Call.getOperand(0));
ConstantInt *Op1C =
(NumArgs == 1) ? nullptr : dyn_cast<ConstantInt>(Call.getOperand(1));
- // Bail out if the allocation size is zero.
+ // Bail out if the allocation size is zero (or an invalid alignment of zero
+ // with aligned_alloc).
if ((Op0C && Op0C->isNullValue()) || (Op1C && Op1C->isNullValue()))
return;
@@ -4366,6 +4541,18 @@ static void annotateAnyAllocSite(CallBase &Call, const TargetLibraryInfo *TLI) {
Call.addAttribute(AttributeList::ReturnIndex,
Attribute::getWithDereferenceableOrNullBytes(
Call.getContext(), Op0C->getZExtValue()));
+ } else if (isAlignedAllocLikeFn(&Call, TLI) && Op1C) {
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithDereferenceableOrNullBytes(
+ Call.getContext(), Op1C->getZExtValue()));
+ // Add alignment attribute if alignment is a power of two constant.
+ if (Op0C && Op0C->getValue().ult(llvm::Value::MaximumAlignment)) {
+ uint64_t AlignmentVal = Op0C->getZExtValue();
+ if (llvm::isPowerOf2_64(AlignmentVal))
+ Call.addAttribute(AttributeList::ReturnIndex,
+ Attribute::getWithAlignment(Call.getContext(),
+ Align(AlignmentVal)));
+ }
} else if (isReallocLikeFn(&Call, TLI) && Op1C) {
Call.addAttribute(AttributeList::ReturnIndex,
Attribute::getWithDereferenceableOrNullBytes(
@@ -4430,7 +4617,7 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) {
// If the callee is a pointer to a function, attempt to move any casts to the
// arguments of the call/callbr/invoke.
- Value *Callee = Call.getCalledValue();
+ Value *Callee = Call.getCalledOperand();
if (!isa<Function>(Callee) && transformConstExprCastCall(Call))
return nullptr;
@@ -4500,7 +4687,7 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) {
I != E; ++I, ++ix) {
CastInst *CI = dyn_cast<CastInst>(*I);
if (CI && isSafeToEliminateVarargsCast(Call, DL, CI, ix)) {
- *I = CI->getOperand(0);
+ replaceUse(*I, CI->getOperand(0));
// Update the byval type to match the argument type.
if (Call.isByValArgument(ix)) {
@@ -4531,6 +4718,15 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) {
if (I) return eraseInstFromFunction(*I);
}
+ if (!Call.use_empty() && !Call.isMustTailCall())
+ if (Value *ReturnedArg = Call.getReturnedArgOperand()) {
+ Type *CallTy = Call.getType();
+ Type *RetArgTy = ReturnedArg->getType();
+ if (RetArgTy->canLosslesslyBitCastTo(CallTy))
+ return replaceInstUsesWith(
+ Call, Builder.CreateBitOrPointerCast(ReturnedArg, CallTy));
+ }
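+ // For illustration (with a hypothetical callee @f): given
+ // %r = call i8* @f(i8* returned %p), uses of %r can be rewritten to use %p,
+ // since the 'returned' attribute guarantees the call returns that argument.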
+
if (isAllocLikeFn(&Call, &TLI))
return visitAllocSite(Call);
@@ -4540,7 +4736,8 @@ Instruction *InstCombiner::visitCallBase(CallBase &Call) {
/// If the callee is a constexpr cast of a function, attempt to move the cast to
/// the arguments of the call/callbr/invoke.
bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
- auto *Callee = dyn_cast<Function>(Call.getCalledValue()->stripPointerCasts());
+ auto *Callee =
+ dyn_cast<Function>(Call.getCalledOperand()->stripPointerCasts());
if (!Callee)
return false;
@@ -4618,6 +4815,7 @@ bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
//
// Similarly, avoid folding away bitcasts of byval calls.
if (Callee->getAttributes().hasAttrSomewhere(Attribute::InAlloca) ||
+ Callee->getAttributes().hasAttrSomewhere(Attribute::Preallocated) ||
Callee->getAttributes().hasAttrSomewhere(Attribute::ByVal))
return false;
@@ -4658,7 +4856,7 @@ bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
// If the callee is just a declaration, don't change the varargsness of the
// call. We don't want to introduce a varargs call where one doesn't
// already exist.
- PointerType *APTy = cast<PointerType>(Call.getCalledValue()->getType());
+ PointerType *APTy = cast<PointerType>(Call.getCalledOperand()->getType());
if (FT->isVarArg()!=cast<FunctionType>(APTy->getElementType())->isVarArg())
return false;
@@ -4774,11 +4972,8 @@ bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
NewCall->setCallingConv(Call.getCallingConv());
NewCall->setAttributes(NewCallerPAL);
- // Preserve the weight metadata for the new call instruction. The metadata
- // is used by SamplePGO to check callsite's hotness.
- uint64_t W;
- if (Caller->extractProfTotalWeight(W))
- NewCall->setProfWeight(W);
+ // Preserve prof metadata if any.
+ NewCall->copyMetadata(*Caller, {LLVMContext::MD_prof});
// Insert a cast of the return type as necessary.
Instruction *NC = NewCall;
@@ -4800,7 +4995,7 @@ bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
// Otherwise, it's a call, just insert cast right after the call.
InsertNewInstBefore(NC, *Caller);
}
- Worklist.AddUsersToWorkList(*Caller);
+ Worklist.pushUsersToWorkList(*Caller);
} else {
NV = UndefValue::get(Caller->getType());
}
@@ -4826,7 +5021,7 @@ bool InstCombiner::transformConstExprCastCall(CallBase &Call) {
Instruction *
InstCombiner::transformCallThroughTrampoline(CallBase &Call,
IntrinsicInst &Tramp) {
- Value *Callee = Call.getCalledValue();
+ Value *Callee = Call.getCalledOperand();
Type *CalleeTy = Callee->getType();
FunctionType *FTy = Call.getFunctionType();
AttributeList Attrs = Call.getAttributes();
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
index 71b7f279e5fa..3639edb5df4d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCasts.cpp
@@ -85,16 +85,16 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
AllocaInst &AI) {
PointerType *PTy = cast<PointerType>(CI.getType());
- BuilderTy AllocaBuilder(Builder);
- AllocaBuilder.SetInsertPoint(&AI);
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(&AI);
// Get the type really allocated and the type casted to.
Type *AllocElTy = AI.getAllocatedType();
Type *CastElTy = PTy->getElementType();
if (!AllocElTy->isSized() || !CastElTy->isSized()) return nullptr;
- unsigned AllocElTyAlign = DL.getABITypeAlignment(AllocElTy);
- unsigned CastElTyAlign = DL.getABITypeAlignment(CastElTy);
+ Align AllocElTyAlign = DL.getABITypeAlign(AllocElTy);
+ Align CastElTyAlign = DL.getABITypeAlign(CastElTy);
if (CastElTyAlign < AllocElTyAlign) return nullptr;
// If the allocation has multiple uses, only promote it if we are strictly
@@ -131,17 +131,17 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
} else {
Amt = ConstantInt::get(AI.getArraySize()->getType(), Scale);
// Insert before the alloca, not before the cast.
- Amt = AllocaBuilder.CreateMul(Amt, NumElements);
+ Amt = Builder.CreateMul(Amt, NumElements);
}
if (uint64_t Offset = (AllocElTySize*ArrayOffset)/CastElTySize) {
Value *Off = ConstantInt::get(AI.getArraySize()->getType(),
Offset, true);
- Amt = AllocaBuilder.CreateAdd(Amt, Off);
+ Amt = Builder.CreateAdd(Amt, Off);
}
- AllocaInst *New = AllocaBuilder.CreateAlloca(CastElTy, Amt);
- New->setAlignment(MaybeAlign(AI.getAlignment()));
+ AllocaInst *New = Builder.CreateAlloca(CastElTy, Amt);
+ New->setAlignment(AI.getAlign());
New->takeName(&AI);
New->setUsedWithInAlloca(AI.isUsedWithInAlloca());
@@ -151,8 +151,9 @@ Instruction *InstCombiner::PromoteCastOfAllocation(BitCastInst &CI,
if (!AI.hasOneUse()) {
// New is the allocation instruction, pointer typed. AI is the original
// allocation instruction, also pointer typed. Thus, cast to use is BitCast.
- Value *NewCast = AllocaBuilder.CreateBitCast(New, AI.getType(), "tmpcast");
+ Value *NewCast = Builder.CreateBitCast(New, AI.getType(), "tmpcast");
replaceInstUsesWith(AI, NewCast);
+ eraseInstFromFunction(AI);
}
return replaceInstUsesWith(CI, New);
}
@@ -164,9 +165,7 @@ Value *InstCombiner::EvaluateInDifferentType(Value *V, Type *Ty,
if (Constant *C = dyn_cast<Constant>(V)) {
C = ConstantExpr::getIntegerCast(C, Ty, isSigned /*Sext or ZExt*/);
// If we got a constantexpr back, try to simplify it with DL info.
- if (Constant *FoldedC = ConstantFoldConstant(C, DL, &TLI))
- C = FoldedC;
- return C;
+ return ConstantFoldConstant(C, DL, &TLI);
}
// Otherwise, it must be an instruction.
@@ -276,16 +275,20 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
}
if (auto *Sel = dyn_cast<SelectInst>(Src)) {
- // We are casting a select. Try to fold the cast into the select, but only
- // if the select does not have a compare instruction with matching operand
- // types. Creating a select with operands that are different sizes than its
+ // We are casting a select. Try to fold the cast into the select if the
+ // select does not have a compare instruction with matching operand types
+ // or the select is likely better done in a narrow type.
+ // Creating a select with operands that are different sizes than its
// condition may inhibit other folds and lead to worse codegen.
auto *Cmp = dyn_cast<CmpInst>(Sel->getCondition());
- if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType())
+ if (!Cmp || Cmp->getOperand(0)->getType() != Sel->getType() ||
+ (CI.getOpcode() == Instruction::Trunc &&
+ shouldChangeType(CI.getSrcTy(), CI.getType()))) {
if (Instruction *NV = FoldOpIntoSelect(CI, Sel)) {
replaceAllDbgUsesWith(*Sel, *NV, CI, DT);
return NV;
}
+ }
}
// If we are casting a PHI, then fold the cast into the PHI.
@@ -293,7 +296,7 @@ Instruction *InstCombiner::commonCastTransforms(CastInst &CI) {
// Don't do this if it would create a PHI node with an illegal type from a
// legal type.
if (!Src->getType()->isIntegerTy() || !CI.getType()->isIntegerTy() ||
- shouldChangeType(CI.getType(), Src->getType()))
+ shouldChangeType(CI.getSrcTy(), CI.getType()))
if (Instruction *NV = foldOpIntoPhi(CI, PN))
return NV;
}
@@ -374,29 +377,31 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
break;
}
case Instruction::Shl: {
- // If we are truncating the result of this SHL, and if it's a shift of a
- // constant amount, we can always perform a SHL in a smaller type.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (Amt->getLimitedValue(BitWidth) < BitWidth)
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
- }
+ // If we are truncating the result of this SHL, and if it's a shift of an
+ // in-range amount, we can always perform a SHL in a smaller type.
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ if (AmtKnownBits.getMaxValue().ult(BitWidth))
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
break;
}
case Instruction::LShr: {
// If this is a truncate of a logical shr, we can truncate it to a smaller
// lshr iff we know that the bits we would otherwise be shifting in are
// already zeros.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (Amt->getLimitedValue(BitWidth) < BitWidth &&
- IC.MaskedValueIsZero(I->getOperand(0),
- APInt::getBitsSetFrom(OrigBitWidth, BitWidth), 0, CxtI)) {
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
- }
+ // TODO: It is enough to check that the bits we would be shifting in are
+ // zero - use AmtKnownBits.getMaxValue().
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ APInt ShiftedBits = APInt::getBitsSetFrom(OrigBitWidth, BitWidth);
+ if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ IC.MaskedValueIsZero(I->getOperand(0), ShiftedBits, 0, CxtI)) {
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
}
break;
}
@@ -406,15 +411,15 @@ static bool canEvaluateTruncated(Value *V, Type *Ty, InstCombiner &IC,
// original type and the sign bit of the truncate type are similar.
// TODO: It is enough to check that the bits we would be shifting in are
// similar to sign bit of the truncate type.
- const APInt *Amt;
- if (match(I->getOperand(1), m_APInt(Amt))) {
- uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
- uint32_t BitWidth = Ty->getScalarSizeInBits();
- if (Amt->getLimitedValue(BitWidth) < BitWidth &&
- OrigBitWidth - BitWidth <
- IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
- return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI);
- }
+ uint32_t OrigBitWidth = OrigTy->getScalarSizeInBits();
+ uint32_t BitWidth = Ty->getScalarSizeInBits();
+ KnownBits AmtKnownBits =
+ llvm::computeKnownBits(I->getOperand(1), IC.getDataLayout());
+ unsigned ShiftedBits = OrigBitWidth - BitWidth;
+ if (AmtKnownBits.getMaxValue().ult(BitWidth) &&
+ ShiftedBits < IC.ComputeNumSignBits(I->getOperand(0), 0, CxtI))
+ return canEvaluateTruncated(I->getOperand(0), Ty, IC, CxtI) &&
+ canEvaluateTruncated(I->getOperand(1), Ty, IC, CxtI);
break;
}
case Instruction::Trunc:
@@ -480,7 +485,7 @@ static Instruction *foldVecTruncToExtElt(TruncInst &Trunc, InstCombiner &IC) {
// bitcast it to a vector type that we can extract from.
unsigned NumVecElts = VecWidth / DestWidth;
if (VecType->getElementType() != DestType) {
- VecType = VectorType::get(DestType, NumVecElts);
+ VecType = FixedVectorType::get(DestType, NumVecElts);
VecInput = IC.Builder.CreateBitCast(VecInput, VecType, "bc");
}
@@ -639,12 +644,12 @@ static Instruction *shrinkSplatShuffle(TruncInst &Trunc,
InstCombiner::BuilderTy &Builder) {
auto *Shuf = dyn_cast<ShuffleVectorInst>(Trunc.getOperand(0));
if (Shuf && Shuf->hasOneUse() && isa<UndefValue>(Shuf->getOperand(1)) &&
- Shuf->getMask()->getSplatValue() &&
+ is_splat(Shuf->getShuffleMask()) &&
Shuf->getType() == Shuf->getOperand(0)->getType()) {
// trunc (shuf X, Undef, SplatMask) --> shuf (trunc X), Undef, SplatMask
Constant *NarrowUndef = UndefValue::get(Trunc.getType());
Value *NarrowOp = Builder.CreateTrunc(Shuf->getOperand(0), Trunc.getType());
- return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getMask());
+ return new ShuffleVectorInst(NarrowOp, NarrowUndef, Shuf->getShuffleMask());
}
return nullptr;
@@ -682,29 +687,51 @@ static Instruction *shrinkInsertElt(CastInst &Trunc,
return nullptr;
}
-Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
- if (Instruction *Result = commonCastTransforms(CI))
+Instruction *InstCombiner::visitTrunc(TruncInst &Trunc) {
+ if (Instruction *Result = commonCastTransforms(Trunc))
return Result;
- Value *Src = CI.getOperand(0);
- Type *DestTy = CI.getType(), *SrcTy = Src->getType();
+ Value *Src = Trunc.getOperand(0);
+ Type *DestTy = Trunc.getType(), *SrcTy = Src->getType();
+ unsigned DestWidth = DestTy->getScalarSizeInBits();
+ unsigned SrcWidth = SrcTy->getScalarSizeInBits();
+ ConstantInt *Cst;
// Attempt to truncate the entire input expression tree to the destination
// type. Only do this if the dest type is a simple type, don't convert the
// expression tree to something weird like i93 unless the source is also
// strange.
if ((DestTy->isVectorTy() || shouldChangeType(SrcTy, DestTy)) &&
- canEvaluateTruncated(Src, DestTy, *this, &CI)) {
+ canEvaluateTruncated(Src, DestTy, *this, &Trunc)) {
// If this cast is a truncate, evaluating in a different type always
// eliminates the cast, so it is always a win.
LLVM_DEBUG(
dbgs() << "ICE: EvaluateInDifferentType converting expression type"
" to avoid cast: "
- << CI << '\n');
+ << Trunc << '\n');
Value *Res = EvaluateInDifferentType(Src, DestTy, false);
assert(Res->getType() == DestTy);
- return replaceInstUsesWith(CI, Res);
+ return replaceInstUsesWith(Trunc, Res);
+ }
+
+ // For integer types, check if we can shorten the entire input expression to
+ // DestWidth * 2, which won't allow removing the truncate, but reducing the
+ // width may enable further optimizations, e.g. allowing for larger
+ // vectorization factors.
+ if (auto *DestITy = dyn_cast<IntegerType>(DestTy)) {
+ if (DestWidth * 2 < SrcWidth) {
+ auto *NewDestTy = DestITy->getExtendedType();
+ if (shouldChangeType(SrcTy, NewDestTy) &&
+ canEvaluateTruncated(Src, NewDestTy, *this, &Trunc)) {
+ LLVM_DEBUG(
+ dbgs() << "ICE: EvaluateInDifferentType converting expression type"
+ " to reduce the width of the operand of"
+ << Trunc << '\n');
+ Value *Res = EvaluateInDifferentType(Src, NewDestTy, false);
+ return new TruncInst(Res, DestTy);
+ }
+ }
}
// Test if the trunc is the user of a select which is part of a
@@ -712,17 +739,17 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// Even simplifying demanded bits can break the canonical form of a
// min/max.
Value *LHS, *RHS;
- if (SelectInst *SI = dyn_cast<SelectInst>(CI.getOperand(0)))
- if (matchSelectPattern(SI, LHS, RHS).Flavor != SPF_UNKNOWN)
+ if (SelectInst *Sel = dyn_cast<SelectInst>(Src))
+ if (matchSelectPattern(Sel, LHS, RHS).Flavor != SPF_UNKNOWN)
return nullptr;
// See if we can simplify any instructions used by the input whose sole
// purpose is to compute bits we don't care about.
- if (SimplifyDemandedInstructionBits(CI))
- return &CI;
+ if (SimplifyDemandedInstructionBits(Trunc))
+ return &Trunc;
- if (DestTy->getScalarSizeInBits() == 1) {
- Value *Zero = Constant::getNullValue(Src->getType());
+ if (DestWidth == 1) {
+ Value *Zero = Constant::getNullValue(SrcTy);
if (DestTy->isIntegerTy()) {
// Canonicalize trunc x to i1 -> icmp ne (and x, 1), 0 (scalar only).
// TODO: We canonicalize to more instructions here because we are probably
@@ -736,18 +763,21 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// For vectors, we do not canonicalize all truncs to icmp, so optimize
// patterns that would be covered within visitICmpInst.
Value *X;
- const APInt *C;
- if (match(Src, m_OneUse(m_LShr(m_Value(X), m_APInt(C))))) {
+ Constant *C;
+ if (match(Src, m_OneUse(m_LShr(m_Value(X), m_Constant(C))))) {
// trunc (lshr X, C) to i1 --> icmp ne (and X, C'), 0
- APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C);
- Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+ Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+ Constant *MaskC = ConstantExpr::getShl(One, C);
+ Value *And = Builder.CreateAnd(X, MaskC);
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
- if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_APInt(C)),
+ if (match(Src, m_OneUse(m_c_Or(m_LShr(m_Value(X), m_Constant(C)),
m_Deferred(X))))) {
// trunc (or (lshr X, C), X) to i1 --> icmp ne (and X, C'), 0
- APInt MaskC = APInt(SrcTy->getScalarSizeInBits(), 1).shl(*C) | 1;
- Value *And = Builder.CreateAnd(X, ConstantInt::get(SrcTy, MaskC));
+ Constant *One = ConstantInt::get(SrcTy, APInt(SrcWidth, 1));
+ Constant *MaskC = ConstantExpr::getShl(One, C);
+ MaskC = ConstantExpr::getOr(MaskC, One);
+ Value *And = Builder.CreateAnd(X, MaskC);
return new ICmpInst(ICmpInst::ICMP_NE, And, Zero);
}
}
@@ -756,7 +786,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// more efficiently. Support vector types. Cleanup code by using m_OneUse.
// Transform trunc(lshr (zext A), Cst) to eliminate one type conversion.
- Value *A = nullptr; ConstantInt *Cst = nullptr;
+ Value *A = nullptr;
if (Src->hasOneUse() &&
match(Src, m_LShr(m_ZExt(m_Value(A)), m_ConstantInt(Cst)))) {
// We have three types to worry about here, the type of A, the source of
@@ -768,7 +798,7 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
// If the shift amount is larger than the size of A, then the result is
// known to be zero because all the input bits got shifted out.
if (Cst->getZExtValue() >= ASize)
- return replaceInstUsesWith(CI, Constant::getNullValue(DestTy));
+ return replaceInstUsesWith(Trunc, Constant::getNullValue(DestTy));
// Since we're doing an lshr and a zero extend, and know that the shift
// amount is smaller than ASize, it is always safe to do the shift in A's
@@ -778,45 +808,37 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
return CastInst::CreateIntegerCast(Shift, DestTy, false);
}
- // FIXME: We should canonicalize to zext/trunc and remove this transform.
- // Transform trunc(lshr (sext A), Cst) to ashr A, Cst to eliminate type
- // conversion.
- // It works because bits coming from sign extension have the same value as
- // the sign bit of the original value; performing ashr instead of lshr
- // generates bits of the same value as the sign bit.
- if (Src->hasOneUse() &&
- match(Src, m_LShr(m_SExt(m_Value(A)), m_ConstantInt(Cst)))) {
- Value *SExt = cast<Instruction>(Src)->getOperand(0);
- const unsigned SExtSize = SExt->getType()->getPrimitiveSizeInBits();
- const unsigned ASize = A->getType()->getPrimitiveSizeInBits();
- const unsigned CISize = CI.getType()->getPrimitiveSizeInBits();
- const unsigned MaxAmt = SExtSize - std::max(CISize, ASize);
- unsigned ShiftAmt = Cst->getZExtValue();
-
- // This optimization can be only performed when zero bits generated by
- // the original lshr aren't pulled into the value after truncation, so we
- // can only shift by values no larger than the number of extension bits.
- // FIXME: Instead of bailing when the shift is too large, use and to clear
- // the extra bits.
- if (ShiftAmt <= MaxAmt) {
- if (CISize == ASize)
- return BinaryOperator::CreateAShr(A, ConstantInt::get(CI.getType(),
- std::min(ShiftAmt, ASize - 1)));
- if (SExt->hasOneUse()) {
- Value *Shift = Builder.CreateAShr(A, std::min(ShiftAmt, ASize - 1));
- Shift->takeName(Src);
- return CastInst::CreateIntegerCast(Shift, CI.getType(), true);
+ const APInt *C;
+ if (match(Src, m_LShr(m_SExt(m_Value(A)), m_APInt(C)))) {
+ unsigned AWidth = A->getType()->getScalarSizeInBits();
+ unsigned MaxShiftAmt = SrcWidth - std::max(DestWidth, AWidth);
+
+ // If the shift is small enough, all zero bits created by the shift are
+ // removed by the trunc.
+ if (C->getZExtValue() <= MaxShiftAmt) {
+ // trunc (lshr (sext A), C) --> ashr A, C
+ if (A->getType() == DestTy) {
+ unsigned ShAmt = std::min((unsigned)C->getZExtValue(), DestWidth - 1);
+ return BinaryOperator::CreateAShr(A, ConstantInt::get(DestTy, ShAmt));
+ }
+ // The types are mismatched, so create a cast after shifting:
+ // trunc (lshr (sext A), C) --> sext/trunc (ashr A, C)
+ if (Src->hasOneUse()) {
+ unsigned ShAmt = std::min((unsigned)C->getZExtValue(), AWidth - 1);
+ Value *Shift = Builder.CreateAShr(A, ShAmt);
+ return CastInst::CreateIntegerCast(Shift, DestTy, true);
}
}
+ // TODO: Mask high bits with 'and'.
}
- if (Instruction *I = narrowBinOp(CI))
+ if (Instruction *I = narrowBinOp(Trunc))
return I;
- if (Instruction *I = shrinkSplatShuffle(CI, Builder))
+ if (Instruction *I = shrinkSplatShuffle(Trunc, Builder))
return I;
- if (Instruction *I = shrinkInsertElt(CI, Builder))
+ if (Instruction *I = shrinkInsertElt(Trunc, Builder))
return I;
if (Src->hasOneUse() && isa<IntegerType>(SrcTy) &&
@@ -827,20 +849,48 @@ Instruction *InstCombiner::visitTrunc(TruncInst &CI) {
!match(A, m_Shr(m_Value(), m_Constant()))) {
// Skip shifts of shift by constants. It undoes a combine in
// FoldShiftByConstant and is the extend in reg pattern.
- const unsigned DestSize = DestTy->getScalarSizeInBits();
- if (Cst->getValue().ult(DestSize)) {
+ if (Cst->getValue().ult(DestWidth)) {
Value *NewTrunc = Builder.CreateTrunc(A, DestTy, A->getName() + ".tr");
return BinaryOperator::Create(
Instruction::Shl, NewTrunc,
- ConstantInt::get(DestTy, Cst->getValue().trunc(DestSize)));
+ ConstantInt::get(DestTy, Cst->getValue().trunc(DestWidth)));
}
}
}
- if (Instruction *I = foldVecTruncToExtElt(CI, *this))
+ if (Instruction *I = foldVecTruncToExtElt(Trunc, *this))
return I;
+ // Whenever an element is extracted from a vector, and then truncated,
+ // canonicalize by converting it to a bitcast followed by an
+ // extractelement.
+ //
+ // Example (little endian):
+ // trunc (extractelement <4 x i64> %X, 0) to i32
+ // --->
+ // extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 0
+ Value *VecOp;
+ if (match(Src, m_OneUse(m_ExtractElt(m_Value(VecOp), m_ConstantInt(Cst))))) {
+ auto *VecOpTy = cast<VectorType>(VecOp->getType());
+ unsigned VecNumElts = VecOpTy->getNumElements();
+
+ // A badly fitting destination size would result in an invalid cast.
+ if (SrcWidth % DestWidth == 0) {
+ uint64_t TruncRatio = SrcWidth / DestWidth;
+ uint64_t BitCastNumElts = VecNumElts * TruncRatio;
+ uint64_t VecOpIdx = Cst->getZExtValue();
+ uint64_t NewIdx = DL.isBigEndian() ? (VecOpIdx + 1) * TruncRatio - 1
+ : VecOpIdx * TruncRatio;
+ assert(BitCastNumElts <= std::numeric_limits<uint32_t>::max() &&
+ "overflow 32-bits");
+
+ auto *BitCastTo = FixedVectorType::get(DestTy, BitCastNumElts);
+ Value *BitCast = Builder.CreateBitCast(VecOp, BitCastTo);
+ return ExtractElementInst::Create(BitCast, Builder.getInt32(NewIdx));
+ }
+ }
+
return nullptr;
}
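For concreteness (vector type and index chosen only for illustration), the trunc-of-extractelement canonicalization added above maps the element index like so for a <4 x i64> source truncated to i32 (TruncRatio = 2):
  trunc (extractelement <4 x i64> %X, 1) to i32
    --> extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 2   (little endian)
    --> extractelement <8 x i32> (bitcast <4 x i64> %X to <8 x i32>), i32 3   (big endian)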
@@ -1431,16 +1481,17 @@ Instruction *InstCombiner::visitSExt(SExtInst &CI) {
// %d = ashr i32 %a, 30
Value *A = nullptr;
// TODO: Eventually this could be subsumed by EvaluateInDifferentType.
- ConstantInt *BA = nullptr, *CA = nullptr;
- if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_ConstantInt(BA)),
- m_ConstantInt(CA))) &&
+ Constant *BA = nullptr, *CA = nullptr;
+ if (match(Src, m_AShr(m_Shl(m_Trunc(m_Value(A)), m_Constant(BA)),
+ m_Constant(CA))) &&
BA == CA && A->getType() == CI.getType()) {
unsigned MidSize = Src->getType()->getScalarSizeInBits();
unsigned SrcDstSize = CI.getType()->getScalarSizeInBits();
- unsigned ShAmt = CA->getZExtValue()+SrcDstSize-MidSize;
- Constant *ShAmtV = ConstantInt::get(CI.getType(), ShAmt);
- A = Builder.CreateShl(A, ShAmtV, CI.getName());
- return BinaryOperator::CreateAShr(A, ShAmtV);
+ Constant *SizeDiff = ConstantInt::get(CA->getType(), SrcDstSize - MidSize);
+ Constant *ShAmt = ConstantExpr::getAdd(CA, SizeDiff);
+ Constant *ShAmtExt = ConstantExpr::getSExt(ShAmt, CI.getType());
+ A = Builder.CreateShl(A, ShAmtExt, CI.getName());
+ return BinaryOperator::CreateAShr(A, ShAmtExt);
}
return nullptr;
@@ -1478,12 +1529,13 @@ static Type *shrinkFPConstant(ConstantFP *CFP) {
// TODO: Make these support undef elements.
static Type *shrinkFPConstantVector(Value *V) {
auto *CV = dyn_cast<Constant>(V);
- if (!CV || !CV->getType()->isVectorTy())
+ auto *CVVTy = dyn_cast<VectorType>(V->getType());
+ if (!CV || !CVVTy)
return nullptr;
Type *MinType = nullptr;
- unsigned NumElts = CV->getType()->getVectorNumElements();
+ unsigned NumElts = CVVTy->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
auto *CFP = dyn_cast_or_null<ConstantFP>(CV->getAggregateElement(i));
if (!CFP)
@@ -1500,7 +1552,7 @@ static Type *shrinkFPConstantVector(Value *V) {
}
// Make a vector type from the minimal type.
- return VectorType::get(MinType, NumElts);
+ return FixedVectorType::get(MinType, NumElts);
}
/// Find the minimum FP type we can safely truncate to.
@@ -1522,6 +1574,48 @@ static Type *getMinimumFPType(Value *V) {
return V->getType();
}
+/// Return true if the cast from integer to FP can be proven to be exact for all
+/// possible inputs (the conversion does not lose any precision).
+static bool isKnownExactCastIntToFP(CastInst &I) {
+ CastInst::CastOps Opcode = I.getOpcode();
+ assert((Opcode == CastInst::SIToFP || Opcode == CastInst::UIToFP) &&
+ "Unexpected cast");
+ Value *Src = I.getOperand(0);
+ Type *SrcTy = Src->getType();
+ Type *FPTy = I.getType();
+ bool IsSigned = Opcode == Instruction::SIToFP;
+ int SrcSize = (int)SrcTy->getScalarSizeInBits() - IsSigned;
+
+ // Easy case - if the source integer type has less bits than the FP mantissa,
+ // then the cast must be exact.
+ int DestNumSigBits = FPTy->getFPMantissaWidth();
+ if (SrcSize <= DestNumSigBits)
+ return true;
+
+ // Cast from FP to integer and back to FP is independent of the intermediate
+ // integer width because of poison on overflow.
+ Value *F;
+ if (match(Src, m_FPToSI(m_Value(F))) || match(Src, m_FPToUI(m_Value(F)))) {
+ // If this is uitofp (fptosi F), the source needs an extra bit to avoid
+ // potential rounding of negative FP input values.
+ int SrcNumSigBits = F->getType()->getFPMantissaWidth();
+ if (!IsSigned && match(Src, m_FPToSI(m_Value())))
+ SrcNumSigBits++;
+
+ // [su]itofp (fpto[su]i F) --> exact if the source type has less or equal
+ // significant bits than the destination (and make sure neither type is
+ // weird -- ppc_fp128).
+ if (SrcNumSigBits > 0 && DestNumSigBits > 0 &&
+ SrcNumSigBits <= DestNumSigBits)
+ return true;
+ }
+
+ // TODO:
+ // Try harder to find if the source integer type has less significant bits.
+ // For example, compute number of sign bits or compute low bit mask.
+ return false;
+}
+
Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
if (Instruction *I = commonCastTransforms(FPT))
return I;
@@ -1632,10 +1726,6 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
if (match(Op, m_FNeg(m_Value(X)))) {
Value *InnerTrunc = Builder.CreateFPTrunc(X, Ty);
- // FIXME: Once we're sure that unary FNeg optimizations are on par with
- // binary FNeg, this should always return a unary operator.
- if (isa<BinaryOperator>(Op))
- return BinaryOperator::CreateFNegFMF(InnerTrunc, Op);
return UnaryOperator::CreateFNegFMF(InnerTrunc, Op);
}
@@ -1667,6 +1757,7 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
case Intrinsic::nearbyint:
case Intrinsic::rint:
case Intrinsic::round:
+ case Intrinsic::roundeven:
case Intrinsic::trunc: {
Value *Src = II->getArgOperand(0);
if (!Src->hasOneUse())
@@ -1699,74 +1790,83 @@ Instruction *InstCombiner::visitFPTrunc(FPTruncInst &FPT) {
if (Instruction *I = shrinkInsertElt(FPT, Builder))
return I;
+ Value *Src = FPT.getOperand(0);
+ if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
+ auto *FPCast = cast<CastInst>(Src);
+ if (isKnownExactCastIntToFP(*FPCast))
+ return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
+ }
+
return nullptr;
}
-Instruction *InstCombiner::visitFPExt(CastInst &CI) {
- return commonCastTransforms(CI);
+Instruction *InstCombiner::visitFPExt(CastInst &FPExt) {
+ // If the source operand is a cast from integer to FP and known exact, then
+ // cast the integer operand directly to the destination type.
+ Type *Ty = FPExt.getType();
+ Value *Src = FPExt.getOperand(0);
+ if (isa<SIToFPInst>(Src) || isa<UIToFPInst>(Src)) {
+ auto *FPCast = cast<CastInst>(Src);
+ if (isKnownExactCastIntToFP(*FPCast))
+ return CastInst::Create(FPCast->getOpcode(), FPCast->getOperand(0), Ty);
+ }
+
+ return commonCastTransforms(FPExt);
}
-// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
-// This is safe if the intermediate type has enough bits in its mantissa to
-// accurately represent all values of X. For example, this won't work with
-// i64 -> float -> i64.
-Instruction *InstCombiner::FoldItoFPtoI(Instruction &FI) {
+/// fpto{s/u}i({u/s}itofp(X)) --> X or zext(X) or sext(X) or trunc(X)
+/// This is safe if the intermediate type has enough bits in its mantissa to
+/// accurately represent all values of X. For example, this won't work with
+/// i64 -> float -> i64.
+Instruction *InstCombiner::foldItoFPtoI(CastInst &FI) {
if (!isa<UIToFPInst>(FI.getOperand(0)) && !isa<SIToFPInst>(FI.getOperand(0)))
return nullptr;
- Instruction *OpI = cast<Instruction>(FI.getOperand(0));
- Value *SrcI = OpI->getOperand(0);
- Type *FITy = FI.getType();
- Type *OpITy = OpI->getType();
- Type *SrcTy = SrcI->getType();
- bool IsInputSigned = isa<SIToFPInst>(OpI);
+ auto *OpI = cast<CastInst>(FI.getOperand(0));
+ Value *X = OpI->getOperand(0);
+ Type *XType = X->getType();
+ Type *DestType = FI.getType();
bool IsOutputSigned = isa<FPToSIInst>(FI);
- // We can safely assume the conversion won't overflow the output range,
- // because (for example) (uint8_t)18293.f is undefined behavior.
-
// Since we can assume the conversion won't overflow, our decision as to
// whether the input will fit in the float should depend on the minimum
// of the input range and output range.
// This means this is also safe for a signed input and unsigned output, since
// a negative input would lead to undefined behavior.
- int InputSize = (int)SrcTy->getScalarSizeInBits() - IsInputSigned;
- int OutputSize = (int)FITy->getScalarSizeInBits() - IsOutputSigned;
- int ActualSize = std::min(InputSize, OutputSize);
-
- if (ActualSize <= OpITy->getFPMantissaWidth()) {
- if (FITy->getScalarSizeInBits() > SrcTy->getScalarSizeInBits()) {
- if (IsInputSigned && IsOutputSigned)
- return new SExtInst(SrcI, FITy);
- return new ZExtInst(SrcI, FITy);
- }
- if (FITy->getScalarSizeInBits() < SrcTy->getScalarSizeInBits())
- return new TruncInst(SrcI, FITy);
- if (SrcTy == FITy)
- return replaceInstUsesWith(FI, SrcI);
- return new BitCastInst(SrcI, FITy);
+ if (!isKnownExactCastIntToFP(*OpI)) {
+ // The first cast may not be exact, depending on the source integer width
+ // and FP width, but the overflow UB rules can still allow this to fold.
+ // If the destination type is narrow, that means the intermediate FP value
+ // must be large enough to hold the source value exactly.
+ // For example, (uint8_t)(float)(uint32_t)16777217 is undefined behavior.
+ int OutputSize = (int)DestType->getScalarSizeInBits() - IsOutputSigned;
+ if (OutputSize > OpI->getType()->getFPMantissaWidth())
+ return nullptr;
}
- return nullptr;
+
+ if (DestType->getScalarSizeInBits() > XType->getScalarSizeInBits()) {
+ bool IsInputSigned = isa<SIToFPInst>(OpI);
+ if (IsInputSigned && IsOutputSigned)
+ return new SExtInst(X, DestType);
+ return new ZExtInst(X, DestType);
+ }
+ if (DestType->getScalarSizeInBits() < XType->getScalarSizeInBits())
+ return new TruncInst(X, DestType);
+
+ assert(XType == DestType && "Unexpected types for int to FP to int casts");
+ return replaceInstUsesWith(FI, X);
}
Instruction *InstCombiner::visitFPToUI(FPToUIInst &FI) {
- Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
- if (!OpI)
- return commonCastTransforms(FI);
-
- if (Instruction *I = FoldItoFPtoI(FI))
+ if (Instruction *I = foldItoFPtoI(FI))
return I;
return commonCastTransforms(FI);
}
Instruction *InstCombiner::visitFPToSI(FPToSIInst &FI) {
- Instruction *OpI = dyn_cast<Instruction>(FI.getOperand(0));
- if (!OpI)
- return commonCastTransforms(FI);
-
- if (Instruction *I = FoldItoFPtoI(FI))
+ if (Instruction *I = foldItoFPtoI(FI))
return I;
return commonCastTransforms(FI);
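Assuming IEEE single precision for float (24-bit significand) — types chosen only for illustration — the exactness helper above lets these casts fold through the integer operand, e.g.:
  fpext (sitofp i16 %x to float) to double  -->  sitofp i16 %x to double
  fptoui (uitofp i16 %x to float) to i32    -->  zext i16 %x to i32
whereas an i32 source is not provably exact through float, so those folds do not fire.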
@@ -1788,8 +1888,9 @@ Instruction *InstCombiner::visitIntToPtr(IntToPtrInst &CI) {
if (CI.getOperand(0)->getType()->getScalarSizeInBits() !=
DL.getPointerSizeInBits(AS)) {
Type *Ty = DL.getIntPtrType(CI.getContext(), AS);
- if (CI.getType()->isVectorTy()) // Handle vectors of pointers.
- Ty = VectorType::get(Ty, CI.getType()->getVectorNumElements());
+ // Handle vectors of pointers.
+ if (auto *CIVTy = dyn_cast<VectorType>(CI.getType()))
+ Ty = VectorType::get(Ty, CIVTy->getElementCount());
Value *P = Builder.CreateZExtOrTrunc(CI.getOperand(0), Ty);
return new IntToPtrInst(P, CI.getType());
@@ -1817,9 +1918,7 @@ Instruction *InstCombiner::commonPointerCastTransforms(CastInst &CI) {
// Changing the cast operand is usually not a good idea but it is safe
// here because the pointer operand is being replaced with another
// pointer operand so the opcode doesn't need to change.
- Worklist.Add(GEP);
- CI.setOperand(0, GEP->getOperand(0));
- return &CI;
+ return replaceOperand(CI, 0, GEP->getOperand(0));
}
}
@@ -1838,8 +1937,11 @@ Instruction *InstCombiner::visitPtrToInt(PtrToIntInst &CI) {
return commonPointerCastTransforms(CI);
Type *PtrTy = DL.getIntPtrType(CI.getContext(), AS);
- if (Ty->isVectorTy()) // Handle vectors of pointers.
- PtrTy = VectorType::get(PtrTy, Ty->getVectorNumElements());
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ // Handle vectors of pointers.
+ // FIXME: what should happen for scalable vectors?
+ PtrTy = FixedVectorType::get(PtrTy, VTy->getNumElements());
+ }
Value *P = Builder.CreatePtrToInt(CI.getOperand(0), PtrTy);
return CastInst::CreateIntegerCast(P, Ty, /*isSigned=*/false);
@@ -1878,7 +1980,8 @@ static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
DestTy->getElementType()->getPrimitiveSizeInBits())
return nullptr;
- SrcTy = VectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
+ SrcTy =
+ FixedVectorType::get(DestTy->getElementType(), SrcTy->getNumElements());
InVal = IC.Builder.CreateBitCast(InVal, SrcTy);
}
@@ -1891,8 +1994,8 @@ static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
// Now that the element types match, get the shuffle mask and RHS of the
// shuffle to use, which depends on whether we're increasing or decreasing the
// size of the input.
- SmallVector<uint32_t, 16> ShuffleMaskStorage;
- ArrayRef<uint32_t> ShuffleMask;
+ SmallVector<int, 16> ShuffleMaskStorage;
+ ArrayRef<int> ShuffleMask;
Value *V2;
// Produce an identity shuffle mask for the src vector.
@@ -1931,9 +2034,7 @@ static Instruction *optimizeVectorResizeWithIntegerBitCasts(Value *InVal,
ShuffleMask = ShuffleMaskStorage;
}
- return new ShuffleVectorInst(InVal, V2,
- ConstantDataVector::get(V2->getContext(),
- ShuffleMask));
+ return new ShuffleVectorInst(InVal, V2, ShuffleMask);
}
static bool isMultipleOfTypeSize(unsigned Value, Type *Ty) {
@@ -2106,7 +2207,7 @@ static Instruction *canonicalizeBitCastExtElt(BitCastInst &BitCast,
return nullptr;
unsigned NumElts = ExtElt->getVectorOperandType()->getNumElements();
- auto *NewVecType = VectorType::get(DestType, NumElts);
+ auto *NewVecType = FixedVectorType::get(DestType, NumElts);
auto *NewBC = IC.Builder.CreateBitCast(ExtElt->getVectorOperand(),
NewVecType, "bc");
return ExtractElementInst::Create(NewBC, ExtElt->getIndexOperand());
@@ -2151,7 +2252,7 @@ static Instruction *foldBitCastBitwiseLogic(BitCastInst &BitCast,
if (match(BO->getOperand(1), m_Constant(C))) {
// bitcast (logic X, C) --> logic (bitcast X, C')
Value *CastedOp0 = Builder.CreateBitCast(BO->getOperand(0), DestTy);
- Value *CastedC = ConstantExpr::getBitCast(C, DestTy);
+ Value *CastedC = Builder.CreateBitCast(C, DestTy);
return BinaryOperator::Create(BO->getOpcode(), CastedOp0, CastedC);
}
@@ -2169,10 +2270,10 @@ static Instruction *foldBitCastSelect(BitCastInst &BitCast,
// A vector select must maintain the same number of elements in its operands.
Type *CondTy = Cond->getType();
Type *DestTy = BitCast.getType();
- if (CondTy->isVectorTy()) {
+ if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
if (!DestTy->isVectorTy())
return nullptr;
- if (DestTy->getVectorNumElements() != CondTy->getVectorNumElements())
+ if (cast<VectorType>(DestTy)->getNumElements() != CondVTy->getNumElements())
return nullptr;
}
@@ -2359,7 +2460,7 @@ Instruction *InstCombiner::optimizeBitCastFromPhi(CastInst &CI, PHINode *PN) {
auto *NewBC =
cast<BitCastInst>(Builder.CreateBitCast(NewPN, SrcTy));
SI->setOperand(0, NewBC);
- Worklist.Add(SI);
+ Worklist.push(SI);
assert(hasStoreUsersOnly(*NewBC));
}
else if (auto *BCI = dyn_cast<BitCastInst>(V)) {
@@ -2395,8 +2496,9 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
if (DestTy == Src->getType())
return replaceInstUsesWith(CI, Src);
- if (PointerType *DstPTy = dyn_cast<PointerType>(DestTy)) {
+ if (isa<PointerType>(SrcTy) && isa<PointerType>(DestTy)) {
PointerType *SrcPTy = cast<PointerType>(SrcTy);
+ PointerType *DstPTy = cast<PointerType>(DestTy);
Type *DstElTy = DstPTy->getElementType();
Type *SrcElTy = SrcPTy->getElementType();
@@ -2425,10 +2527,8 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// to a getelementptr X, 0, 0, 0... turn it into the appropriate gep.
// This can enhance SROA and other transforms that want type-safe pointers.
unsigned NumZeros = 0;
- while (SrcElTy != DstElTy &&
- isa<CompositeType>(SrcElTy) && !SrcElTy->isPointerTy() &&
- SrcElTy->getNumContainedTypes() /* not "{}" */) {
- SrcElTy = cast<CompositeType>(SrcElTy)->getTypeAtIndex(0U);
+ while (SrcElTy && SrcElTy != DstElTy) {
+ SrcElTy = GetElementPtrInst::getTypeAtIndex(SrcElTy, (uint64_t)0);
++NumZeros;
}
@@ -2455,12 +2555,12 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
}
- if (VectorType *DestVTy = dyn_cast<VectorType>(DestTy)) {
- if (DestVTy->getNumElements() == 1 && !SrcTy->isVectorTy()) {
+ if (FixedVectorType *DestVTy = dyn_cast<FixedVectorType>(DestTy)) {
+ // Beware: messing with this target-specific oddity may cause trouble.
+ if (DestVTy->getNumElements() == 1 && SrcTy->isX86_MMXTy()) {
Value *Elem = Builder.CreateBitCast(Src, DestVTy->getElementType());
return InsertElementInst::Create(UndefValue::get(DestTy), Elem,
Constant::getNullValue(Type::getInt32Ty(CI.getContext())));
- // FIXME: Canonicalize bitcast(insertelement) -> insertelement(bitcast)
}
if (isa<IntegerType>(SrcTy)) {
@@ -2484,7 +2584,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
}
}
- if (VectorType *SrcVTy = dyn_cast<VectorType>(SrcTy)) {
+ if (FixedVectorType *SrcVTy = dyn_cast<FixedVectorType>(SrcTy)) {
if (SrcVTy->getNumElements() == 1) {
// If our destination is not a vector, then make this a straight
// scalar-scalar cast.
@@ -2508,10 +2608,11 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
// a bitcast to a vector with the same # elts.
Value *ShufOp0 = Shuf->getOperand(0);
Value *ShufOp1 = Shuf->getOperand(1);
- unsigned NumShufElts = Shuf->getType()->getVectorNumElements();
- unsigned NumSrcVecElts = ShufOp0->getType()->getVectorNumElements();
+ unsigned NumShufElts = Shuf->getType()->getNumElements();
+ unsigned NumSrcVecElts =
+ cast<VectorType>(ShufOp0->getType())->getNumElements();
if (Shuf->hasOneUse() && DestTy->isVectorTy() &&
- DestTy->getVectorNumElements() == NumShufElts &&
+ cast<VectorType>(DestTy)->getNumElements() == NumShufElts &&
NumShufElts == NumSrcVecElts) {
BitCastInst *Tmp;
// If either of the operands is a cast from CI.getType(), then
@@ -2525,7 +2626,7 @@ Instruction *InstCombiner::visitBitCast(BitCastInst &CI) {
Value *RHS = Builder.CreateBitCast(ShufOp1, DestTy);
// Return a new shuffle vector. Use the same element ID's, as we
// know the vector types match #elts.
- return new ShuffleVectorInst(LHS, RHS, Shuf->getOperand(2));
+ return new ShuffleVectorInst(LHS, RHS, Shuf->getShuffleMask());
}
}
@@ -2578,7 +2679,8 @@ Instruction *InstCombiner::visitAddrSpaceCast(AddrSpaceCastInst &CI) {
Type *MidTy = PointerType::get(DestElemTy, SrcTy->getAddressSpace());
if (VectorType *VT = dyn_cast<VectorType>(CI.getType())) {
// Handle vectors of pointers.
- MidTy = VectorType::get(MidTy, VT->getNumElements());
+ // FIXME: what should happen for scalable vectors?
+ MidTy = FixedVectorType::get(MidTy, VT->getNumElements());
}
Value *NewBitCast = Builder.CreateBitCast(Src, MidTy);
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index f38dc436722d..f1233b62445d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -897,7 +897,7 @@ Instruction *InstCombiner::foldGEPICmp(GEPOperator *GEPLHS, Value *RHS,
// For vectors, we apply the same reasoning on a per-lane basis.
auto *Base = GEPLHS->getPointerOperand();
if (GEPLHS->getType()->isVectorTy() && Base->getType()->isPointerTy()) {
- int NumElts = GEPLHS->getType()->getVectorNumElements();
+ int NumElts = cast<VectorType>(GEPLHS->getType())->getNumElements();
Base = Builder.CreateVectorSplat(NumElts, Base);
}
return new ICmpInst(Cond, Base,
@@ -1330,6 +1330,7 @@ static Instruction *processUGT_ADDCST_ADD(ICmpInst &I, Value *A, Value *B,
// The inner add was the result of the narrow add, zero extended to the
// wider type. Replace it with the result computed by the intrinsic.
IC.replaceInstUsesWith(*OrigAdd, ZExt);
+ IC.eraseInstFromFunction(*OrigAdd);
// The original icmp gets replaced with the overflow value.
return ExtractValueInst::Create(Call, 1, "sadd.overflow");
@@ -1451,6 +1452,27 @@ Instruction *InstCombiner::foldICmpWithConstant(ICmpInst &Cmp) {
if (Instruction *Res = processUGT_ADDCST_ADD(Cmp, A, B, CI2, CI, *this))
return Res;
+ // icmp(phi(C1, C2, ...), C) -> phi(icmp(C1, C), icmp(C2, C), ...).
+ Constant *C = dyn_cast<Constant>(Op1);
+ if (!C)
+ return nullptr;
+
+ if (auto *Phi = dyn_cast<PHINode>(Op0))
+ if (all_of(Phi->operands(), [](Value *V) { return isa<Constant>(V); })) {
+ Type *Ty = Cmp.getType();
+ Builder.SetInsertPoint(Phi);
+ PHINode *NewPhi =
+ Builder.CreatePHI(Ty, Phi->getNumOperands());
+ for (BasicBlock *Predecessor : predecessors(Phi->getParent())) {
+ auto *Input =
+ cast<Constant>(Phi->getIncomingValueForBlock(Predecessor));
+ auto *BoolInput = ConstantExpr::getCompare(Pred, Input, C);
+ NewPhi->addIncoming(BoolInput, Predecessor);
+ }
+ NewPhi->takeName(&Cmp);
+ return replaceInstUsesWith(Cmp, NewPhi);
+ }
+
return nullptr;
}
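Schematically (block names and constants chosen only for illustration), the new phi-of-constants fold above rewrites, for example:
  %p = phi i32 [ 1, %bb1 ], [ 42, %bb2 ]
  %c = icmp ult i32 %p, 10
    -->
  %c = phi i1 [ true, %bb1 ], [ false, %bb2 ]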
@@ -1575,11 +1597,8 @@ Instruction *InstCombiner::foldICmpXorConstant(ICmpInst &Cmp,
// If the sign bit of the XorCst is not set, there is no change to
// the operation, just stop using the Xor.
- if (!XorC->isNegative()) {
- Cmp.setOperand(0, X);
- Worklist.Add(Xor);
- return &Cmp;
- }
+ if (!XorC->isNegative())
+ return replaceOperand(Cmp, 0, X);
// Emit the opposite comparison.
if (TrueIfSigned)
@@ -1645,51 +1664,53 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
bool IsShl = ShiftOpcode == Instruction::Shl;
const APInt *C3;
if (match(Shift->getOperand(1), m_APInt(C3))) {
- bool CanFold = false;
+ APInt NewAndCst, NewCmpCst;
+ bool AnyCmpCstBitsShiftedOut;
if (ShiftOpcode == Instruction::Shl) {
// For a left shift, we can fold if the comparison is not signed. We can
// also fold a signed comparison if the mask value and comparison value
// are not negative. These constraints may not be obvious, but we can
// prove that they are correct using an SMT solver.
- if (!Cmp.isSigned() || (!C2.isNegative() && !C1.isNegative()))
- CanFold = true;
- } else {
- bool IsAshr = ShiftOpcode == Instruction::AShr;
+ if (Cmp.isSigned() && (C2.isNegative() || C1.isNegative()))
+ return nullptr;
+
+ NewCmpCst = C1.lshr(*C3);
+ NewAndCst = C2.lshr(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.shl(*C3) != C1;
+ } else if (ShiftOpcode == Instruction::LShr) {
// For a logical right shift, we can fold if the comparison is not signed.
// We can also fold a signed comparison if the shifted mask value and the
// shifted comparison value are not negative. These constraints may not be
// obvious, but we can prove that they are correct using an SMT solver.
- // For an arithmetic shift right we can do the same, if we ensure
- // the And doesn't use any bits being shifted in. Normally these would
- // be turned into lshr by SimplifyDemandedBits, but not if there is an
- // additional user.
- if (!IsAshr || (C2.shl(*C3).lshr(*C3) == C2)) {
- if (!Cmp.isSigned() ||
- (!C2.shl(*C3).isNegative() && !C1.shl(*C3).isNegative()))
- CanFold = true;
- }
+ NewCmpCst = C1.shl(*C3);
+ NewAndCst = C2.shl(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.lshr(*C3) != C1;
+ if (Cmp.isSigned() && (NewAndCst.isNegative() || NewCmpCst.isNegative()))
+ return nullptr;
+ } else {
+ // For an arithmetic shift, check that both constants don't use (in a
+ // signed sense) the top bits being shifted out.
+ assert(ShiftOpcode == Instruction::AShr && "Unknown shift opcode");
+ NewCmpCst = C1.shl(*C3);
+ NewAndCst = C2.shl(*C3);
+ AnyCmpCstBitsShiftedOut = NewCmpCst.ashr(*C3) != C1;
+ if (NewAndCst.ashr(*C3) != C2)
+ return nullptr;
}
- if (CanFold) {
- APInt NewCst = IsShl ? C1.lshr(*C3) : C1.shl(*C3);
- APInt SameAsC1 = IsShl ? NewCst.shl(*C3) : NewCst.lshr(*C3);
- // Check to see if we are shifting out any of the bits being compared.
- if (SameAsC1 != C1) {
- // If we shifted bits out, the fold is not going to work out. As a
- // special case, check to see if this means that the result is always
- // true or false now.
- if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
- return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
- if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
- return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
- } else {
- Cmp.setOperand(1, ConstantInt::get(And->getType(), NewCst));
- APInt NewAndCst = IsShl ? C2.lshr(*C3) : C2.shl(*C3);
- And->setOperand(1, ConstantInt::get(And->getType(), NewAndCst));
- And->setOperand(0, Shift->getOperand(0));
- Worklist.Add(Shift); // Shift is dead.
- return &Cmp;
- }
+ if (AnyCmpCstBitsShiftedOut) {
+ // If we shifted bits out, the fold is not going to work out. As a
+ // special case, check to see if this means that the result is always
+ // true or false now.
+ if (Cmp.getPredicate() == ICmpInst::ICMP_EQ)
+ return replaceInstUsesWith(Cmp, ConstantInt::getFalse(Cmp.getType()));
+ if (Cmp.getPredicate() == ICmpInst::ICMP_NE)
+ return replaceInstUsesWith(Cmp, ConstantInt::getTrue(Cmp.getType()));
+ } else {
+ Value *NewAnd = Builder.CreateAnd(
+ Shift->getOperand(0), ConstantInt::get(And->getType(), NewAndCst));
+ return new ICmpInst(Cmp.getPredicate(),
+ NewAnd, ConstantInt::get(And->getType(), NewCmpCst));
}
}
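With constants chosen only for illustration, the reworked shift handling above folds, for the shl case:
  icmp eq (and (shl i32 %x, 3), 248), 72  -->  icmp eq (and i32 %x, 31), 9
and when compare-constant bits would be shifted out (e.g. 73 instead of 72), the equality is known false outright.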
@@ -1705,8 +1726,7 @@ Instruction *InstCombiner::foldICmpAndShift(ICmpInst &Cmp, BinaryOperator *And,
// Compute X & (C2 << Y).
Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift);
- Cmp.setOperand(0, NewAnd);
- return &Cmp;
+ return replaceOperand(Cmp, 0, NewAnd);
}
return nullptr;
@@ -1812,8 +1832,7 @@ Instruction *InstCombiner::foldICmpAndConstConst(ICmpInst &Cmp,
}
if (NewOr) {
Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName());
- Cmp.setOperand(0, NewAnd);
- return &Cmp;
+ return replaceOperand(Cmp, 0, NewAnd);
}
}
}
@@ -1863,8 +1882,8 @@ Instruction *InstCombiner::foldICmpAndConstant(ICmpInst &Cmp,
int32_t ExactLogBase2 = C2->exactLogBase2();
if (ExactLogBase2 != -1 && DL.isLegalInteger(ExactLogBase2 + 1)) {
Type *NTy = IntegerType::get(Cmp.getContext(), ExactLogBase2 + 1);
- if (And->getType()->isVectorTy())
- NTy = VectorType::get(NTy, And->getType()->getVectorNumElements());
+ if (auto *AndVTy = dyn_cast<VectorType>(And->getType()))
+ NTy = FixedVectorType::get(NTy, AndVTy->getNumElements());
Value *Trunc = Builder.CreateTrunc(X, NTy);
auto NewPred = Cmp.getPredicate() == CmpInst::ICMP_EQ ? CmpInst::ICMP_SGE
: CmpInst::ICMP_SLT;
@@ -1888,20 +1907,24 @@ Instruction *InstCombiner::foldICmpOrConstant(ICmpInst &Cmp, BinaryOperator *Or,
}
Value *OrOp0 = Or->getOperand(0), *OrOp1 = Or->getOperand(1);
- if (Cmp.isEquality() && Cmp.getOperand(1) == OrOp1) {
- // X | C == C --> X <=u C
- // X | C != C --> X >u C
- // iff C+1 is a power of 2 (C is a bitmask of the low bits)
- if ((C + 1).isPowerOf2()) {
+ const APInt *MaskC;
+ if (match(OrOp1, m_APInt(MaskC)) && Cmp.isEquality()) {
+ if (*MaskC == C && (C + 1).isPowerOf2()) {
+ // X | C == C --> X <=u C
+ // X | C != C --> X >u C
+ // iff C+1 is a power of 2 (C is a bitmask of the low bits)
Pred = (Pred == CmpInst::ICMP_EQ) ? CmpInst::ICMP_ULE : CmpInst::ICMP_UGT;
return new ICmpInst(Pred, OrOp0, OrOp1);
}
- // More general: are all bits outside of a mask constant set or not set?
- // X | C == C --> (X & ~C) == 0
- // X | C != C --> (X & ~C) != 0
+
+ // More general: canonicalize 'equality with set bits mask' to
+ // 'equality with clear bits mask'.
+ // (X | MaskC) == C --> (X & ~MaskC) == C ^ MaskC
+ // (X | MaskC) != C --> (X & ~MaskC) != C ^ MaskC
if (Or->hasOneUse()) {
- Value *A = Builder.CreateAnd(OrOp0, ~C);
- return new ICmpInst(Pred, A, ConstantInt::getNullValue(OrOp0->getType()));
+ Value *And = Builder.CreateAnd(OrOp0, ~(*MaskC));
+ Constant *NewC = ConstantInt::get(Or->getType(), C ^ (*MaskC));
+ return new ICmpInst(Pred, And, NewC);
}
}
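With constants chosen only for illustration, the generalized or-mask canonicalization above gives:
  icmp eq (or i32 %x, 3), 11  -->  icmp eq (and i32 %x, -4), 8
  icmp eq (or i32 %x, 3), 3   -->  icmp ule i32 %x, 3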
@@ -2149,8 +2172,8 @@ Instruction *InstCombiner::foldICmpShlConstant(ICmpInst &Cmp,
if (Shl->hasOneUse() && Amt != 0 && C.countTrailingZeros() >= Amt &&
DL.isLegalInteger(TypeBits - Amt)) {
Type *TruncTy = IntegerType::get(Cmp.getContext(), TypeBits - Amt);
- if (ShType->isVectorTy())
- TruncTy = VectorType::get(TruncTy, ShType->getVectorNumElements());
+ if (auto *ShVTy = dyn_cast<VectorType>(ShType))
+ TruncTy = FixedVectorType::get(TruncTy, ShVTy->getNumElements());
Constant *NewC =
ConstantInt::get(TruncTy, C.ashr(*ShiftAmt).trunc(TypeBits - Amt));
return new ICmpInst(Pred, Builder.CreateTrunc(X, TruncTy), NewC);
@@ -2763,6 +2786,37 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
if (match(BCSrcOp, m_UIToFP(m_Value(X))))
if (Cmp.isEquality() && match(Op1, m_Zero()))
return new ICmpInst(Pred, X, ConstantInt::getNullValue(X->getType()));
+
+ // If this is a sign-bit test of a bitcast of a casted FP value, eliminate
+ // the FP extend/truncate because that cast does not change the sign-bit.
+ // This is true for all standard IEEE-754 types and the X86 80-bit type.
+ // The sign-bit is always the most significant bit in those types.
+ const APInt *C;
+ bool TrueIfSigned;
+ if (match(Op1, m_APInt(C)) && Bitcast->hasOneUse() &&
+ isSignBitCheck(Pred, *C, TrueIfSigned)) {
+ if (match(BCSrcOp, m_FPExt(m_Value(X))) ||
+ match(BCSrcOp, m_FPTrunc(m_Value(X)))) {
+ // (bitcast (fpext/fptrunc X)) to iX) < 0 --> (bitcast X to iY) < 0
+ // (bitcast (fpext/fptrunc X)) to iX) > -1 --> (bitcast X to iY) > -1
+ Type *XType = X->getType();
+
+ // We can't currently handle Power style floating point operations here.
+ if (!(XType->isPPC_FP128Ty() || BCSrcOp->getType()->isPPC_FP128Ty())) {
+
+ Type *NewType = Builder.getIntNTy(XType->getScalarSizeInBits());
+ if (auto *XVTy = dyn_cast<VectorType>(XType))
+ NewType = FixedVectorType::get(NewType, XVTy->getNumElements());
+ Value *NewBitcast = Builder.CreateBitCast(X, NewType);
+ if (TrueIfSigned)
+ return new ICmpInst(ICmpInst::ICMP_SLT, NewBitcast,
+ ConstantInt::getNullValue(NewType));
+ else
+ return new ICmpInst(ICmpInst::ICMP_SGT, NewBitcast,
+ ConstantInt::getAllOnesValue(NewType));
+ }
+ }
+ }
}
// Test to see if the operands of the icmp are casted versions of other
@@ -2792,11 +2846,10 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
return nullptr;
Value *Vec;
- Constant *Mask;
- if (match(BCSrcOp,
- m_ShuffleVector(m_Value(Vec), m_Undef(), m_Constant(Mask)))) {
+ ArrayRef<int> Mask;
+ if (match(BCSrcOp, m_Shuffle(m_Value(Vec), m_Undef(), m_Mask(Mask)))) {
// Check whether every element of Mask is the same constant
- if (auto *Elem = dyn_cast_or_null<ConstantInt>(Mask->getSplatValue())) {
+ if (is_splat(Mask)) {
auto *VecTy = cast<VectorType>(BCSrcOp->getType());
auto *EltTy = cast<IntegerType>(VecTy->getElementType());
if (C->isSplat(EltTy->getBitWidth())) {
@@ -2805,6 +2858,7 @@ static Instruction *foldICmpBitCast(ICmpInst &Cmp,
// then:
// => %E = extractelement <N x iK> %vec, i32 Elem
// icmp <pred> iK %SplatVal, <pattern>
+ Value *Elem = Builder.getInt32(Mask[0]);
Value *Extract = Builder.CreateExtractElement(Vec, Elem);
Value *NewC = ConstantInt::get(EltTy, C->trunc(EltTy->getBitWidth()));
return new ICmpInst(Pred, Extract, NewC);
@@ -2928,12 +2982,9 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
break;
case Instruction::Add: {
// Replace ((add A, B) != C) with (A != C-B) if B & C are constants.
- const APInt *BOC;
- if (match(BOp1, m_APInt(BOC))) {
- if (BO->hasOneUse()) {
- Constant *SubC = ConstantExpr::getSub(RHS, cast<Constant>(BOp1));
- return new ICmpInst(Pred, BOp0, SubC);
- }
+ if (Constant *BOC = dyn_cast<Constant>(BOp1)) {
+ if (BO->hasOneUse())
+ return new ICmpInst(Pred, BOp0, ConstantExpr::getSub(RHS, BOC));
} else if (C.isNullValue()) {
// Replace ((add A, B) != 0) with (A != -B) if A or B is
// efficiently invertible, or if the add has just this one use.
@@ -2963,11 +3014,11 @@ Instruction *InstCombiner::foldICmpBinOpEqualityWithConstant(ICmpInst &Cmp,
break;
case Instruction::Sub:
if (BO->hasOneUse()) {
- const APInt *BOC;
- if (match(BOp0, m_APInt(BOC))) {
+ // Only check for constant LHS here, as constant RHS will be canonicalized
+ // to add and use the fold above.
+ if (Constant *BOC = dyn_cast<Constant>(BOp0)) {
// Replace ((sub BOC, B) != C) with (B != BOC-C).
- Constant *SubC = ConstantExpr::getSub(cast<Constant>(BOp0), RHS);
- return new ICmpInst(Pred, BOp1, SubC);
+ return new ICmpInst(Pred, BOp1, ConstantExpr::getSub(BOC, RHS));
} else if (C.isNullValue()) {
// Replace ((sub A, B) != 0) with (A != B).
return new ICmpInst(Pred, BOp0, BOp1);
@@ -3028,20 +3079,16 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
unsigned BitWidth = C.getBitWidth();
switch (II->getIntrinsicID()) {
case Intrinsic::bswap:
- Worklist.Add(II);
- Cmp.setOperand(0, II->getArgOperand(0));
- Cmp.setOperand(1, ConstantInt::get(Ty, C.byteSwap()));
- return &Cmp;
+ // bswap(A) == C -> A == bswap(C)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ ConstantInt::get(Ty, C.byteSwap()));
case Intrinsic::ctlz:
case Intrinsic::cttz: {
// ctz(A) == bitwidth(A) -> A == 0 and likewise for !=
- if (C == BitWidth) {
- Worklist.Add(II);
- Cmp.setOperand(0, II->getArgOperand(0));
- Cmp.setOperand(1, ConstantInt::getNullValue(Ty));
- return &Cmp;
- }
+ if (C == BitWidth)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ ConstantInt::getNullValue(Ty));
// ctz(A) == C -> A & Mask1 == Mask2, where Mask2 only has bit C set
// and Mask1 has bits 0..C+1 set. Similar for ctl, but for high bits.
@@ -3054,10 +3101,9 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
APInt Mask2 = IsTrailing
? APInt::getOneBitSet(BitWidth, Num)
: APInt::getOneBitSet(BitWidth, BitWidth - Num - 1);
- Cmp.setOperand(0, Builder.CreateAnd(II->getArgOperand(0), Mask1));
- Cmp.setOperand(1, ConstantInt::get(Ty, Mask2));
- Worklist.Add(II);
- return &Cmp;
+ return new ICmpInst(Cmp.getPredicate(),
+ Builder.CreateAnd(II->getArgOperand(0), Mask1),
+ ConstantInt::get(Ty, Mask2));
}
break;
}
@@ -3066,14 +3112,10 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
// popcount(A) == 0 -> A == 0 and likewise for !=
// popcount(A) == bitwidth(A) -> A == -1 and likewise for !=
bool IsZero = C.isNullValue();
- if (IsZero || C == BitWidth) {
- Worklist.Add(II);
- Cmp.setOperand(0, II->getArgOperand(0));
- auto *NewOp =
- IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty);
- Cmp.setOperand(1, NewOp);
- return &Cmp;
- }
+ if (IsZero || C == BitWidth)
+ return new ICmpInst(Cmp.getPredicate(), II->getArgOperand(0),
+ IsZero ? Constant::getNullValue(Ty) : Constant::getAllOnesValue(Ty));
+
break;
}
@@ -3081,9 +3123,7 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
// uadd.sat(a, b) == 0 -> (a | b) == 0
if (C.isNullValue()) {
Value *Or = Builder.CreateOr(II->getArgOperand(0), II->getArgOperand(1));
- return replaceInstUsesWith(Cmp, Builder.CreateICmp(
- Cmp.getPredicate(), Or, Constant::getNullValue(Ty)));
-
+ return new ICmpInst(Cmp.getPredicate(), Or, Constant::getNullValue(Ty));
}
break;
}
@@ -3093,8 +3133,7 @@ Instruction *InstCombiner::foldICmpEqIntrinsicWithConstant(ICmpInst &Cmp,
if (C.isNullValue()) {
ICmpInst::Predicate NewPred = Cmp.getPredicate() == ICmpInst::ICMP_EQ
? ICmpInst::ICMP_ULE : ICmpInst::ICMP_UGT;
- return ICmpInst::Create(Instruction::ICmp, NewPred,
- II->getArgOperand(0), II->getArgOperand(1));
+ return new ICmpInst(NewPred, II->getArgOperand(0), II->getArgOperand(1));
}
break;
}
@@ -3300,30 +3339,19 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
// x & (-1 >> y) != x -> x u> (-1 >> y)
DstPred = ICmpInst::Predicate::ICMP_UGT;
break;
- case ICmpInst::Predicate::ICMP_UGT:
+ case ICmpInst::Predicate::ICMP_ULT:
+ // x & (-1 >> y) u< x -> x u> (-1 >> y)
// x u> x & (-1 >> y) -> x u> (-1 >> y)
- assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
DstPred = ICmpInst::Predicate::ICMP_UGT;
break;
case ICmpInst::Predicate::ICMP_UGE:
// x & (-1 >> y) u>= x -> x u<= (-1 >> y)
- assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
- DstPred = ICmpInst::Predicate::ICMP_ULE;
- break;
- case ICmpInst::Predicate::ICMP_ULT:
- // x & (-1 >> y) u< x -> x u> (-1 >> y)
- assert(X == I.getOperand(1) && "instsimplify took care of commut. variant");
- DstPred = ICmpInst::Predicate::ICMP_UGT;
- break;
- case ICmpInst::Predicate::ICMP_ULE:
// x u<= x & (-1 >> y) -> x u<= (-1 >> y)
- assert(X == I.getOperand(0) && "instsimplify took care of commut. variant");
DstPred = ICmpInst::Predicate::ICMP_ULE;
break;
- case ICmpInst::Predicate::ICMP_SGT:
+ case ICmpInst::Predicate::ICMP_SLT:
+ // x & (-1 >> y) s< x -> x s> (-1 >> y)
// x s> x & (-1 >> y) -> x s> (-1 >> y)
- if (X != I.getOperand(0)) // X must be on LHS of comparison!
- return nullptr; // Ignore the other case.
if (!match(M, m_Constant())) // Can not do this fold with non-constant.
return nullptr;
if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
@@ -3332,33 +3360,19 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
break;
case ICmpInst::Predicate::ICMP_SGE:
// x & (-1 >> y) s>= x -> x s<= (-1 >> y)
- if (X != I.getOperand(1)) // X must be on RHS of comparison!
- return nullptr; // Ignore the other case.
+ // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
if (!match(M, m_Constant())) // Can not do this fold with non-constant.
return nullptr;
if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
return nullptr;
DstPred = ICmpInst::Predicate::ICMP_SLE;
break;
- case ICmpInst::Predicate::ICMP_SLT:
- // x & (-1 >> y) s< x -> x s> (-1 >> y)
- if (X != I.getOperand(1)) // X must be on RHS of comparison!
- return nullptr; // Ignore the other case.
- if (!match(M, m_Constant())) // Can not do this fold with non-constant.
- return nullptr;
- if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
- return nullptr;
- DstPred = ICmpInst::Predicate::ICMP_SGT;
- break;
+ case ICmpInst::Predicate::ICMP_SGT:
case ICmpInst::Predicate::ICMP_SLE:
- // x s<= x & (-1 >> y) -> x s<= (-1 >> y)
- if (X != I.getOperand(0)) // X must be on LHS of comparison!
- return nullptr; // Ignore the other case.
- if (!match(M, m_Constant())) // Can not do this fold with non-constant.
- return nullptr;
- if (!match(M, m_NonNegative())) // Must not have any -1 vector elements.
- return nullptr;
- DstPred = ICmpInst::Predicate::ICMP_SLE;
+ return nullptr;
+ case ICmpInst::Predicate::ICMP_UGT:
+ case ICmpInst::Predicate::ICMP_ULE:
+ llvm_unreachable("Instsimplify took care of commut. variant");
break;
default:
llvm_unreachable("All possible folds are handled.");
@@ -3370,8 +3384,9 @@ static Value *foldICmpWithLowBitMaskedVal(ICmpInst &I,
Type *OpTy = M->getType();
auto *VecC = dyn_cast<Constant>(M);
if (OpTy->isVectorTy() && VecC && VecC->containsUndefElement()) {
+ auto *OpVTy = cast<VectorType>(OpTy);
Constant *SafeReplacementConstant = nullptr;
- for (unsigned i = 0, e = OpTy->getVectorNumElements(); i != e; ++i) {
+ for (unsigned i = 0, e = OpVTy->getNumElements(); i != e; ++i) {
if (!isa<UndefValue>(VecC->getAggregateElement(i))) {
SafeReplacementConstant = VecC->getAggregateElement(i);
break;
@@ -3494,7 +3509,8 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
Instruction *NarrowestShift = XShift;
Type *WidestTy = WidestShift->getType();
- assert(NarrowestShift->getType() == I.getOperand(0)->getType() &&
+ Type *NarrowestTy = NarrowestShift->getType();
+ assert(NarrowestTy == I.getOperand(0)->getType() &&
"We did not look past any shifts while matching XShift though.");
bool HadTrunc = WidestTy != I.getOperand(0)->getType();
@@ -3533,6 +3549,23 @@ foldShiftIntoShiftInAnotherHandOfAndInICmp(ICmpInst &I, const SimplifyQuery SQ,
if (XShAmt->getType() != YShAmt->getType())
return nullptr;
+ // As input, we have the following pattern:
+ // icmp eq/ne (and ((x shift Q), (y oppositeshift K))), 0
+ // We want to rewrite that as:
+ // icmp eq/ne (and (x shift (Q+K)), y), 0 iff (Q+K) u< bitwidth(x)
+ // While we know that originally (Q+K) would not overflow
+ // (because 2 * (N-1) u<= iN -1), we have looked past extensions of
+ // shift amounts, so it may now overflow in a smaller bitwidth.
+ // To ensure that does not happen, we need to ensure that the total maximal
+ // shift amount is still representable in that smaller bit width.
+ unsigned MaximalPossibleTotalShiftAmount =
+ (WidestTy->getScalarSizeInBits() - 1) +
+ (NarrowestTy->getScalarSizeInBits() - 1);
+ APInt MaximalRepresentableShiftAmount =
+ APInt::getAllOnesValue(XShAmt->getType()->getScalarSizeInBits());
+ if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
+ return nullptr;
+
// Can we fold (XShAmt+YShAmt) ?
auto *NewShAmt = dyn_cast_or_null<Constant>(
SimplifyAddInst(XShAmt, YShAmt, /*isNSW=*/false,
@@ -3627,9 +3660,6 @@ Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
match(&I, m_c_ICmp(Pred, m_OneUse(m_UDiv(m_AllOnes(), m_Value(X))),
m_Value(Y)))) {
Mul = nullptr;
- // Canonicalize as-if y was on RHS.
- if (I.getOperand(1) != Y)
- Pred = I.getSwappedPredicate();
// Are we checking that overflow does not happen, or does happen?
switch (Pred) {
@@ -3674,6 +3704,11 @@ Value *InstCombiner::foldUnsignedMultiplicationOverflowCheck(ICmpInst &I) {
if (NeedNegation) // This technically increases instruction count.
Res = Builder.CreateNot(Res, "umul.not.ov");
+ // If we replaced the mul, erase it. Do this after all uses of Builder,
+ // as the mul is used as insertion point.
+ if (MulHadOtherUses)
+ eraseInstFromFunction(*Mul);
+
return Res;
}
@@ -4202,9 +4237,7 @@ Instruction *InstCombiner::foldICmpEquality(ICmpInst &I) {
if (X) { // Build (X^Y) & Z
Op1 = Builder.CreateXor(X, Y);
Op1 = Builder.CreateAnd(Op1, Z);
- I.setOperand(0, Op1);
- I.setOperand(1, Constant::getNullValue(Op1->getType()));
- return &I;
+ return new ICmpInst(Pred, Op1, Constant::getNullValue(Op1->getType()));
}
}
@@ -4613,17 +4646,6 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
case ICmpInst::ICMP_NE:
// Recognize pattern:
// mulval = mul(zext A, zext B)
- // cmp eq/neq mulval, zext trunc mulval
- if (ZExtInst *Zext = dyn_cast<ZExtInst>(OtherVal))
- if (Zext->hasOneUse()) {
- Value *ZextArg = Zext->getOperand(0);
- if (TruncInst *Trunc = dyn_cast<TruncInst>(ZextArg))
- if (Trunc->getType()->getPrimitiveSizeInBits() == MulWidth)
- break; //Recognized
- }
-
- // Recognize pattern:
- // mulval = mul(zext A, zext B)
// cmp eq/neq mulval, and(mulval, mask), mask selects low MulWidth bits.
ConstantInt *CI;
Value *ValToMask;
@@ -4701,7 +4723,7 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
Function *F = Intrinsic::getDeclaration(
I.getModule(), Intrinsic::umul_with_overflow, MulType);
CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
- IC.Worklist.Add(MulInstr);
+ IC.Worklist.push(MulInstr);
// If there are uses of mul result other than the comparison, we know that
// they are truncation or binary AND. Change them to use result of
@@ -4723,18 +4745,16 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
ConstantInt *CI = cast<ConstantInt>(BO->getOperand(1));
APInt ShortMask = CI->getValue().trunc(MulWidth);
Value *ShortAnd = Builder.CreateAnd(Mul, ShortMask);
- Instruction *Zext =
- cast<Instruction>(Builder.CreateZExt(ShortAnd, BO->getType()));
- IC.Worklist.Add(Zext);
+ Value *Zext = Builder.CreateZExt(ShortAnd, BO->getType());
IC.replaceInstUsesWith(*BO, Zext);
} else {
llvm_unreachable("Unexpected Binary operation");
}
- IC.Worklist.Add(cast<Instruction>(U));
+ IC.Worklist.push(cast<Instruction>(U));
}
}
if (isa<Instruction>(OtherVal))
- IC.Worklist.Add(cast<Instruction>(OtherVal));
+ IC.Worklist.push(cast<Instruction>(OtherVal));
// The original icmp gets replaced with the overflow value, maybe inverted
// depending on predicate.
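For context, the source-level idiom that typically produces this IR pattern is a widened multiply compared against its own low bits. A small self-contained C++ sketch of that idiom (illustrative only, not derived from this patch):

#include <cassert>
#include <cstdint>

// Widen, multiply, and compare against the truncated low bits to detect
// unsigned overflow; this is the shape that becomes umul.with.overflow.
static bool mulOverflows(uint32_t A, uint32_t B, uint32_t &Product) {
  uint64_t Wide = (uint64_t)A * (uint64_t)B;  // mul(zext A, zext B)
  Product = (uint32_t)Wide;                   // low MulWidth bits
  return Wide != (Wide & 0xFFFFFFFFull);      // cmp ne mulval, and(mulval, mask)
}

int main() {
  uint32_t P;
  assert(!mulOverflows(1000u, 1000u, P) && P == 1000000u);
  assert(mulOverflows(0x10000u, 0x10000u, P) && P == 0u);
  return 0;
}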
@@ -5189,8 +5209,8 @@ llvm::getFlippedStrictnessPredicateAndConstant(CmpInst::Predicate Pred,
// Bail out if the constant can't be safely incremented/decremented.
if (!ConstantIsOk(CI))
return llvm::None;
- } else if (Type->isVectorTy()) {
- unsigned NumElts = Type->getVectorNumElements();
+ } else if (auto *VTy = dyn_cast<VectorType>(Type)) {
+ unsigned NumElts = VTy->getNumElements();
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = C->getAggregateElement(i);
if (!Elt)
@@ -5252,6 +5272,47 @@ static ICmpInst *canonicalizeCmpWithConstant(ICmpInst &I) {
return new ICmpInst(FlippedStrictness->first, Op0, FlippedStrictness->second);
}
+/// If we have a comparison with a non-canonical predicate and we can update
+/// all of its users, invert the predicate and adjust all of the users.
+static CmpInst *canonicalizeICmpPredicate(CmpInst &I) {
+ // Is the predicate already canonical?
+ CmpInst::Predicate Pred = I.getPredicate();
+ if (isCanonicalPredicate(Pred))
+ return nullptr;
+
+ // Can all users be adjusted to predicate inversion?
+ if (!canFreelyInvertAllUsersOf(&I, /*IgnoredUser=*/nullptr))
+ return nullptr;
+
+ // OK, we can canonicalize the comparison!
+ // Let's first invert the comparison's predicate.
+ I.setPredicate(CmpInst::getInversePredicate(Pred));
+ I.setName(I.getName() + ".not");
+
+ // And now let's adjust every user.
+ for (User *U : I.users()) {
+ switch (cast<Instruction>(U)->getOpcode()) {
+ case Instruction::Select: {
+ auto *SI = cast<SelectInst>(U);
+ SI->swapValues();
+ SI->swapProfMetadata();
+ break;
+ }
+ case Instruction::Br:
+ cast<BranchInst>(U)->swapSuccessors(); // swaps prof metadata too
+ break;
+ case Instruction::Xor:
+ U->replaceAllUsesWith(&I);
+ break;
+ default:
+ llvm_unreachable("Got unexpected user - out of sync with "
+ "canFreelyInvertAllUsersOf() ?");
+ }
+ }
+
+ return &I;
+}
+
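The reason the per-user adjustments above preserve semantics can be spot-checked with ordinary values: inverting the compare while swapping a select's arms (or a branch's successors) yields the same result, and an inverted compare directly replaces a 'not' user. A minimal sketch with made-up values:

#include <cassert>

// selectOf stands in for a select instruction; branches behave analogously
// (swapping successors corresponds to swapping the arms here).
static int selectOf(bool Cond, int TrueVal, int FalseVal) {
  return Cond ? TrueVal : FalseVal;
}

int main() {
  for (int X = -2; X <= 2; ++X) {
    bool Cmp = X < 0;    // original predicate
    bool Inv = !(X < 0); // inverted predicate, i.e. X >= 0
    // Select user: invert the condition and swap the true/false values.
    assert(selectOf(Cmp, 10, 20) == selectOf(Inv, 20, 10));
    // Xor-with-true ('not') user: the inverted compare replaces the 'not'.
    assert(!Cmp == Inv);
  }
  return 0;
}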
/// Integer compare with boolean values can always be turned into bitwise ops.
static Instruction *canonicalizeICmpBool(ICmpInst &I,
InstCombiner::BuilderTy &Builder) {
@@ -5338,10 +5399,6 @@ static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
Value *X, *Y;
if (match(&Cmp,
m_c_ICmp(Pred, m_OneUse(m_Shl(m_One(), m_Value(Y))), m_Value(X)))) {
- // We want X to be the icmp's second operand, so swap predicate if it isn't.
- if (Cmp.getOperand(0) == X)
- Pred = Cmp.getSwappedPredicate();
-
switch (Pred) {
case ICmpInst::ICMP_ULE:
NewPred = ICmpInst::ICMP_NE;
@@ -5361,10 +5418,6 @@ static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
// The variant with 'add' is not canonical, (the variant with 'not' is)
// we only get it because it has extra uses, and can't be canonicalized,
- // We want X to be the icmp's second operand, so swap predicate if it isn't.
- if (Cmp.getOperand(0) == X)
- Pred = Cmp.getSwappedPredicate();
-
switch (Pred) {
case ICmpInst::ICMP_ULT:
NewPred = ICmpInst::ICMP_NE;
@@ -5385,21 +5438,45 @@ static Instruction *foldICmpWithHighBitMask(ICmpInst &Cmp,
static Instruction *foldVectorCmp(CmpInst &Cmp,
InstCombiner::BuilderTy &Builder) {
- // If both arguments of the cmp are shuffles that use the same mask and
- // shuffle within a single vector, move the shuffle after the cmp.
+ const CmpInst::Predicate Pred = Cmp.getPredicate();
Value *LHS = Cmp.getOperand(0), *RHS = Cmp.getOperand(1);
Value *V1, *V2;
- Constant *M;
- if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(M))) &&
- match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(M))) &&
- V1->getType() == V2->getType() &&
- (LHS->hasOneUse() || RHS->hasOneUse())) {
- // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
- CmpInst::Predicate P = Cmp.getPredicate();
- Value *NewCmp = isa<ICmpInst>(Cmp) ? Builder.CreateICmp(P, V1, V2)
- : Builder.CreateFCmp(P, V1, V2);
+ ArrayRef<int> M;
+ if (!match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(M))))
+ return nullptr;
+
+ // If both arguments of the cmp are shuffles that use the same mask and
+ // shuffle within a single vector, move the shuffle after the cmp:
+ // cmp (shuffle V1, M), (shuffle V2, M) --> shuffle (cmp V1, V2), M
+ Type *V1Ty = V1->getType();
+ if (match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(M))) &&
+ V1Ty == V2->getType() && (LHS->hasOneUse() || RHS->hasOneUse())) {
+ Value *NewCmp = Builder.CreateCmp(Pred, V1, V2);
return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()), M);
}
+
+ // Try to canonicalize compare with splatted operand and splat constant.
+ // TODO: We could generalize this for more than splats. See/use the code in
+ // InstCombiner::foldVectorBinop().
+ Constant *C;
+ if (!LHS->hasOneUse() || !match(RHS, m_Constant(C)))
+ return nullptr;
+
+ // Length-changing splats are ok, so adjust the constants as needed:
+ // cmp (shuffle V1, M), C --> shuffle (cmp V1, C'), M
+ Constant *ScalarC = C->getSplatValue(/* AllowUndefs */ true);
+ int MaskSplatIndex;
+ if (ScalarC && match(M, m_SplatOrUndefMask(MaskSplatIndex))) {
+ // We allow undefs in matching, but this transform removes those for safety.
+ // Demanded elements analysis should be able to recover some/all of that.
+ C = ConstantVector::getSplat(cast<VectorType>(V1Ty)->getElementCount(),
+ ScalarC);
+ SmallVector<int, 8> NewM(M.size(), MaskSplatIndex);
+ Value *NewCmp = Builder.CreateCmp(Pred, V1, C);
+ return new ShuffleVectorInst(NewCmp, UndefValue::get(NewCmp->getType()),
+ NewM);
+ }
+
return nullptr;
}
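A scalar model of the splat canonicalization added above: comparing a splatted lane against a splat constant lane-by-lane gives the same vector as comparing once and then splatting the boolean result. Sketch with arbitrary example values:

#include <array>
#include <cassert>

int main() {
  std::array<int, 4> V1 = {7, -3, 12, 5};
  const int MaskSplatIndex = 2;  // shuffle mask <2,2,2,2>
  const int ScalarC = 10;        // RHS splat constant <10,10,10,10>

  // Original form: cmp (shuffle V1, M), C
  std::array<bool, 4> Before{};
  for (int Lane = 0; Lane != 4; ++Lane)
    Before[Lane] = V1[MaskSplatIndex] < ScalarC;

  // Canonical form: shuffle (cmp V1, C'), M
  std::array<bool, 4> NewCmp{}, After{};
  for (int Lane = 0; Lane != 4; ++Lane)
    NewCmp[Lane] = V1[Lane] < ScalarC;
  for (int Lane = 0; Lane != 4; ++Lane)
    After[Lane] = NewCmp[MaskSplatIndex];

  assert(Before == After);
  return 0;
}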
@@ -5474,8 +5551,11 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = canonicalizeICmpBool(I, Builder))
return Res;
- if (ICmpInst *NewICmp = canonicalizeCmpWithConstant(I))
- return NewICmp;
+ if (Instruction *Res = canonicalizeCmpWithConstant(I))
+ return Res;
+
+ if (Instruction *Res = canonicalizeICmpPredicate(I))
+ return Res;
if (Instruction *Res = foldICmpWithConstant(I))
return Res;
@@ -5565,6 +5645,7 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
if (Instruction *Res = foldICmpBitCast(I, Builder))
return Res;
+ // TODO: Hoist this above the min/max bailout.
if (Instruction *R = foldICmpWithCastOp(I))
return R;
@@ -5600,9 +5681,13 @@ Instruction *InstCombiner::visitICmpInst(ICmpInst &I) {
isa<IntegerType>(A->getType())) {
Value *Result;
Constant *Overflow;
- if (OptimizeOverflowCheck(Instruction::Add, /*Signed*/false, A, B,
- *AddI, Result, Overflow)) {
+ // m_UAddWithOverflow can match patterns that do not include an explicit
+ // "add" instruction, so check the opcode of the matched op.
+ if (AddI->getOpcode() == Instruction::Add &&
+ OptimizeOverflowCheck(Instruction::Add, /*Signed*/ false, A, B, *AddI,
+ Result, Overflow)) {
replaceInstUsesWith(*AddI, Result);
+ eraseInstFromFunction(*AddI);
return replaceInstUsesWith(I, Overflow);
}
}
@@ -5689,7 +5774,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
// TODO: Can never be -0.0 and other non-representable values
APFloat RHSRoundInt(RHS);
RHSRoundInt.roundToIntegral(APFloat::rmNearestTiesToEven);
- if (RHS.compare(RHSRoundInt) != APFloat::cmpEqual) {
+ if (RHS != RHSRoundInt) {
if (P == FCmpInst::FCMP_OEQ || P == FCmpInst::FCMP_UEQ)
return replaceInstUsesWith(I, Builder.getFalse());
@@ -5777,7 +5862,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
APFloat SMax(RHS.getSemantics());
SMax.convertFromAPInt(APInt::getSignedMaxValue(IntWidth), true,
APFloat::rmNearestTiesToEven);
- if (SMax.compare(RHS) == APFloat::cmpLessThan) { // smax < 13123.0
+ if (SMax < RHS) { // smax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SLT ||
Pred == ICmpInst::ICMP_SLE)
return replaceInstUsesWith(I, Builder.getTrue());
@@ -5789,7 +5874,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
APFloat UMax(RHS.getSemantics());
UMax.convertFromAPInt(APInt::getMaxValue(IntWidth), false,
APFloat::rmNearestTiesToEven);
- if (UMax.compare(RHS) == APFloat::cmpLessThan) { // umax < 13123.0
+ if (UMax < RHS) { // umax < 13123.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_ULT ||
Pred == ICmpInst::ICMP_ULE)
return replaceInstUsesWith(I, Builder.getTrue());
@@ -5802,7 +5887,7 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
APFloat SMin(RHS.getSemantics());
SMin.convertFromAPInt(APInt::getSignedMinValue(IntWidth), true,
APFloat::rmNearestTiesToEven);
- if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // smin > 12312.0
+ if (SMin > RHS) { // smin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_SGT ||
Pred == ICmpInst::ICMP_SGE)
return replaceInstUsesWith(I, Builder.getTrue());
@@ -5810,10 +5895,10 @@ Instruction *InstCombiner::foldFCmpIntToFPConst(FCmpInst &I, Instruction *LHSI,
}
} else {
// See if the RHS value is < UnsignedMin.
- APFloat SMin(RHS.getSemantics());
- SMin.convertFromAPInt(APInt::getMinValue(IntWidth), true,
+ APFloat UMin(RHS.getSemantics());
+ UMin.convertFromAPInt(APInt::getMinValue(IntWidth), false,
APFloat::rmNearestTiesToEven);
- if (SMin.compare(RHS) == APFloat::cmpGreaterThan) { // umin > 12312.0
+ if (UMin > RHS) { // umin > 12312.0
if (Pred == ICmpInst::ICMP_NE || Pred == ICmpInst::ICMP_UGT ||
Pred == ICmpInst::ICMP_UGE)
return replaceInstUsesWith(I, Builder.getTrue());
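The range argument these folds rely on can be checked directly: once the FP constant exceeds the largest value the integer type can hold, the ordered comparisons have a fixed outcome for every input. A standalone sketch using uint32_t and an example constant of 5.0e9:

#include <cassert>
#include <cstdint>
#include <limits>

int main() {
  const double RHS = 5.0e9;  // larger than UINT32_MAX (~4.29e9)
  const double UMax = static_cast<double>(std::numeric_limits<uint32_t>::max());
  assert(UMax < RHS);  // the "umax < constant" condition from the code above
  // Therefore, for every uint32_t value X, (double)X < RHS, so
  // icmp ult/ule/ne folds to true and ugt/uge/eq folds to false.
  const uint32_t Samples[] = {0u, 1u, 4294967295u};
  for (uint32_t X : Samples)
    assert(static_cast<double>(X) < RHS);
  return 0;
}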
@@ -5949,16 +6034,15 @@ static Instruction *foldFCmpReciprocalAndZero(FCmpInst &I, Instruction *LHSI,
}
/// Optimize fabs(X) compared with zero.
-static Instruction *foldFabsWithFcmpZero(FCmpInst &I) {
+static Instruction *foldFabsWithFcmpZero(FCmpInst &I, InstCombiner &IC) {
Value *X;
if (!match(I.getOperand(0), m_Intrinsic<Intrinsic::fabs>(m_Value(X))) ||
!match(I.getOperand(1), m_PosZeroFP()))
return nullptr;
- auto replacePredAndOp0 = [](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
+ auto replacePredAndOp0 = [&IC](FCmpInst *I, FCmpInst::Predicate P, Value *X) {
I->setPredicate(P);
- I->setOperand(0, X);
- return I;
+ return IC.replaceOperand(*I, 0, X);
};
switch (I.getPredicate()) {
@@ -6058,14 +6142,11 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
// If we're just checking for a NaN (ORD/UNO) and have a non-NaN operand,
// then canonicalize the operand to 0.0.
if (Pred == CmpInst::FCMP_ORD || Pred == CmpInst::FCMP_UNO) {
- if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI)) {
- I.setOperand(0, ConstantFP::getNullValue(OpType));
- return &I;
- }
- if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI)) {
- I.setOperand(1, ConstantFP::getNullValue(OpType));
- return &I;
- }
+ if (!match(Op0, m_PosZeroFP()) && isKnownNeverNaN(Op0, &TLI))
+ return replaceOperand(I, 0, ConstantFP::getNullValue(OpType));
+
+ if (!match(Op1, m_PosZeroFP()) && isKnownNeverNaN(Op1, &TLI))
+ return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
}
// fcmp pred (fneg X), (fneg Y) -> fcmp swap(pred) X, Y
@@ -6090,10 +6171,8 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
// The sign of 0.0 is ignored by fcmp, so canonicalize to +0.0:
// fcmp Pred X, -0.0 --> fcmp Pred X, 0.0
- if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP())) {
- I.setOperand(1, ConstantFP::getNullValue(OpType));
- return &I;
- }
+ if (match(Op1, m_AnyZeroFP()) && !match(Op1, m_PosZeroFP()))
+ return replaceOperand(I, 1, ConstantFP::getNullValue(OpType));
// Handle fcmp with instruction LHS and constant RHS.
Instruction *LHSI;
@@ -6128,7 +6207,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
}
}
- if (Instruction *R = foldFabsWithFcmpZero(I))
+ if (Instruction *R = foldFabsWithFcmpZero(I, *this))
return R;
if (match(Op0, m_FNeg(m_Value(X)))) {
@@ -6159,8 +6238,7 @@ Instruction *InstCombiner::visitFCmpInst(FCmpInst &I) {
APFloat Fabs = TruncC;
Fabs.clearSign();
if (!Lossy &&
- ((Fabs.compare(APFloat::getSmallestNormalized(FPSem)) !=
- APFloat::cmpLessThan) || Fabs.isZero())) {
+ (!(Fabs < APFloat::getSmallestNormalized(FPSem)) || Fabs.isZero())) {
Constant *NewC = ConstantFP::get(X->getType(), TruncC);
return new FCmpInst(Pred, X, NewC, "", &I);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
index 1a746cb87abb..f918dc7198ca 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
+++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h
@@ -16,7 +16,8 @@
#define LLVM_LIB_TRANSFORMS_INSTCOMBINE_INSTCOMBINEINTERNAL_H
#include "llvm/ADT/ArrayRef.h"
-#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -50,6 +51,7 @@ using namespace llvm::PatternMatch;
namespace llvm {
+class AAResults;
class APInt;
class AssumptionCache;
class BlockFrequencyInfo;
@@ -213,18 +215,23 @@ static inline bool isFreeToInvert(Value *V, bool WillInvertAllUses) {
}
/// Given i1 V, can every user of V be freely adapted if V is changed to !V ?
+/// InstCombine's canonicalizeICmpPredicate() must be kept in sync with this fn.
///
/// See also: isFreeToInvert()
static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) {
// Look at every user of V.
- for (User *U : V->users()) {
- if (U == IgnoredUser)
+ for (Use &U : V->uses()) {
+ if (U.getUser() == IgnoredUser)
continue; // Don't consider this user.
- auto *I = cast<Instruction>(U);
+ auto *I = cast<Instruction>(U.getUser());
switch (I->getOpcode()) {
case Instruction::Select:
+ if (U.getOperandNo() != 0) // Only if the value is used as select cond.
+ return false;
+ break;
case Instruction::Br:
+ assert(U.getOperandNo() == 0 && "Must be branching on that value.");
break; // Free to invert by swapping true/false values/destinations.
case Instruction::Xor: // Can invert 'xor' if it's a 'not', by ignoring it.
if (!match(I, m_Not(m_Value())))
@@ -244,9 +251,10 @@ static inline bool canFreelyInvertAllUsersOf(Value *V, Value *IgnoredUser) {
/// If no identity constant exists, replace undef with some other safe constant.
static inline Constant *getSafeVectorConstantForBinop(
BinaryOperator::BinaryOps Opcode, Constant *In, bool IsRHSConstant) {
- assert(In->getType()->isVectorTy() && "Not expecting scalars here");
+ auto *InVTy = dyn_cast<VectorType>(In->getType());
+ assert(InVTy && "Not expecting scalars here");
- Type *EltTy = In->getType()->getVectorElementType();
+ Type *EltTy = InVTy->getElementType();
auto *SafeC = ConstantExpr::getBinOpIdentity(Opcode, EltTy, IsRHSConstant);
if (!SafeC) {
// TODO: Should this be available as a constant utility function? It is
@@ -284,7 +292,7 @@ static inline Constant *getSafeVectorConstantForBinop(
}
}
assert(SafeC && "Must have safe constant for binop");
- unsigned NumElts = In->getType()->getVectorNumElements();
+ unsigned NumElts = InVTy->getNumElements();
SmallVector<Constant *, 16> Out(NumElts);
for (unsigned i = 0; i != NumElts; ++i) {
Constant *C = In->getAggregateElement(i);
@@ -313,10 +321,7 @@ private:
// Mode in which we are running the combiner.
const bool MinimizeSize;
- /// Enable combines that trigger rarely but are costly in compiletime.
- const bool ExpensiveCombines;
-
- AliasAnalysis *AA;
+ AAResults *AA;
// Required analyses.
AssumptionCache &AC;
@@ -336,12 +341,12 @@ private:
public:
InstCombiner(InstCombineWorklist &Worklist, BuilderTy &Builder,
- bool MinimizeSize, bool ExpensiveCombines, AliasAnalysis *AA,
+ bool MinimizeSize, AAResults *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, const DataLayout &DL, LoopInfo *LI)
: Worklist(Worklist), Builder(Builder), MinimizeSize(MinimizeSize),
- ExpensiveCombines(ExpensiveCombines), AA(AA), AC(AC), TLI(TLI), DT(DT),
+ AA(AA), AC(AC), TLI(TLI), DT(DT),
DL(DL), SQ(DL, &TLI, &DT, &AC), ORE(ORE), BFI(BFI), PSI(PSI), LI(LI) {}
/// Run the combiner over the entire worklist until it is empty.
@@ -420,7 +425,7 @@ public:
Instruction *visitIntToPtr(IntToPtrInst &CI);
Instruction *visitBitCast(BitCastInst &CI);
Instruction *visitAddrSpaceCast(AddrSpaceCastInst &CI);
- Instruction *FoldItoFPtoI(Instruction &FI);
+ Instruction *foldItoFPtoI(CastInst &FI);
Instruction *visitSelectInst(SelectInst &SI);
Instruction *visitCallInst(CallInst &CI);
Instruction *visitInvokeInst(InvokeInst &II);
@@ -435,6 +440,7 @@ public:
Instruction *visitLoadInst(LoadInst &LI);
Instruction *visitStoreInst(StoreInst &SI);
Instruction *visitAtomicRMWInst(AtomicRMWInst &SI);
+ Instruction *visitUnconditionalBranchInst(BranchInst &BI);
Instruction *visitBranchInst(BranchInst &BI);
Instruction *visitFenceInst(FenceInst &FI);
Instruction *visitSwitchInst(SwitchInst &SI);
@@ -445,8 +451,7 @@ public:
Instruction *visitShuffleVectorInst(ShuffleVectorInst &SVI);
Instruction *visitExtractValueInst(ExtractValueInst &EV);
Instruction *visitLandingPadInst(LandingPadInst &LI);
- Instruction *visitVAStartInst(VAStartInst &I);
- Instruction *visitVACopyInst(VACopyInst &I);
+ Instruction *visitVAEndInst(VAEndInst &I);
Instruction *visitFreeze(FreezeInst &I);
/// Specify what to return for unhandled instructions.
@@ -515,7 +520,7 @@ private:
Instruction *simplifyMaskedStore(IntrinsicInst &II);
Instruction *simplifyMaskedGather(IntrinsicInst &II);
Instruction *simplifyMaskedScatter(IntrinsicInst &II);
-
+
/// Transform (zext icmp) to bitwise / integer operations in order to
/// eliminate it.
///
@@ -621,9 +626,9 @@ private:
Instruction::CastOps isEliminableCastPair(const CastInst *CI1,
const CastInst *CI2);
- Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
- Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, Instruction &CxtI);
- Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &I);
+ Value *foldAndOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &And);
+ Value *foldOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Or);
+ Value *foldXorOfICmps(ICmpInst *LHS, ICmpInst *RHS, BinaryOperator &Xor);
/// Optimize (fcmp)&(fcmp) or (fcmp)|(fcmp).
/// NOTE: Unlike most of instcombine, this returns a Value which should
@@ -631,11 +636,12 @@ private:
Value *foldLogicOfFCmps(FCmpInst *LHS, FCmpInst *RHS, bool IsAnd);
Value *foldAndOrOfICmpsOfAndWithPow2(ICmpInst *LHS, ICmpInst *RHS,
- bool JoinedByAnd, Instruction &CxtI);
+ BinaryOperator &Logic);
Value *matchSelectFromAndOr(Value *A, Value *B, Value *C, Value *D);
Value *getSelectCondition(Value *A, Value *B);
Instruction *foldIntrinsicWithOverflowCommon(IntrinsicInst *II);
+ Instruction *foldFPSignBitOps(BinaryOperator &I);
public:
/// Inserts an instruction \p New before instruction \p Old
@@ -647,7 +653,7 @@ public:
"New instruction already inserted into a basic block!");
BasicBlock *BB = Old.getParent();
BB->getInstList().insert(Old.getIterator(), New); // Insert inst
- Worklist.Add(New);
+ Worklist.push(New);
return New;
}
@@ -668,7 +674,7 @@ public:
// no changes were made to the program.
if (I.use_empty()) return nullptr;
- Worklist.AddUsersToWorkList(I); // Add all modified instrs to worklist.
+ Worklist.pushUsersToWorkList(I); // Add all modified instrs to worklist.
// If we are replacing the instruction with itself, this must be in a
// segment of unreachable code, so just clobber the instruction.
@@ -682,6 +688,19 @@ public:
return &I;
}
+ /// Replace operand of instruction and add old operand to the worklist.
+ Instruction *replaceOperand(Instruction &I, unsigned OpNum, Value *V) {
+ Worklist.addValue(I.getOperand(OpNum));
+ I.setOperand(OpNum, V);
+ return &I;
+ }
+
+ /// Replace use and add the previously used value to the worklist.
+ void replaceUse(Use &U, Value *NewValue) {
+ Worklist.addValue(U);
+ U = NewValue;
+ }
+
/// Creates a result tuple for an overflow intrinsic \p II with a given
/// \p Result and a constant \p Overflow value.
Instruction *CreateOverflowTuple(IntrinsicInst *II, Value *Result,
@@ -710,16 +729,15 @@ public:
Instruction *eraseInstFromFunction(Instruction &I) {
LLVM_DEBUG(dbgs() << "IC: ERASE " << I << '\n');
assert(I.use_empty() && "Cannot erase instruction that is used!");
- salvageDebugInfoOrMarkUndef(I);
+ salvageDebugInfo(I);
// Make sure that we reprocess all operands now that we reduced their
// use counts.
- if (I.getNumOperands() < 8) {
- for (Use &Operand : I.operands())
- if (auto *Inst = dyn_cast<Instruction>(Operand))
- Worklist.Add(Inst);
- }
- Worklist.Remove(&I);
+ for (Use &Operand : I.operands())
+ if (auto *Inst = dyn_cast<Instruction>(Operand))
+ Worklist.add(Inst);
+
+ Worklist.remove(&I);
I.eraseFromParent();
MadeIRChange = true;
return nullptr; // Don't do anything with FI
@@ -869,6 +887,7 @@ private:
/// Canonicalize the position of binops relative to shufflevector.
Instruction *foldVectorBinop(BinaryOperator &Inst);
+ Instruction *foldVectorSelect(SelectInst &Sel);
/// Given a binary operator, cast instruction, or select which has a PHI node
/// as operand #0, see if we can fold the instruction into the PHI (which is
@@ -1004,6 +1023,64 @@ private:
Value *Descale(Value *Val, APInt Scale, bool &NoSignedWrap);
};
+namespace {
+
+// As a default, let's assume that we want to be aggressive,
+// and attempt to traverse with no limits when sinking the negation.
+static constexpr unsigned NegatorDefaultMaxDepth = ~0U;
+
+// Let's guesstimate that most often we will end up visiting/producing
+// a fairly small number of new instructions.
+static constexpr unsigned NegatorMaxNodesSSO = 16;
+
+} // namespace
+
+class Negator final {
+ /// Top-to-bottom, def-to-use negated instruction tree we produced.
+ SmallVector<Instruction *, NegatorMaxNodesSSO> NewInstructions;
+
+ using BuilderTy = IRBuilder<TargetFolder, IRBuilderCallbackInserter>;
+ BuilderTy Builder;
+
+ const DataLayout &DL;
+ AssumptionCache &AC;
+ const DominatorTree &DT;
+
+ const bool IsTrulyNegation;
+
+ SmallDenseMap<Value *, Value *> NegationsCache;
+
+ Negator(LLVMContext &C, const DataLayout &DL, AssumptionCache &AC,
+ const DominatorTree &DT, bool IsTrulyNegation);
+
+#if LLVM_ENABLE_STATS
+ unsigned NumValuesVisitedInThisNegator = 0;
+ ~Negator();
+#endif
+
+ using Result = std::pair<ArrayRef<Instruction *> /*NewInstructions*/,
+ Value * /*NegatedRoot*/>;
+
+ LLVM_NODISCARD Value *visitImpl(Value *V, unsigned Depth);
+
+ LLVM_NODISCARD Value *negate(Value *V, unsigned Depth);
+
+ /// Recurse depth-first and attempt to sink the negation.
+ /// FIXME: use worklist?
+ LLVM_NODISCARD Optional<Result> run(Value *Root);
+
+ Negator(const Negator &) = delete;
+ Negator(Negator &&) = delete;
+ Negator &operator=(const Negator &) = delete;
+ Negator &operator=(Negator &&) = delete;
+
+public:
+ /// Attempt to negate \p Root. Returns nullptr if negation can't be performed,
+ /// otherwise returns the negated value.
+ LLVM_NODISCARD static Value *Negate(bool LHSIsZero, Value *Root,
+ InstCombiner &IC);
+};
+
} // end namespace llvm
#undef DEBUG_TYPE
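The new Negator relies on algebraic identities that sink a negation without creating extra instructions, e.g. turning -(a - b) into b - a and folding the negation of a multiply into its constant operand. A quick standalone check of that algebra (small values chosen to avoid overflow):

#include <cassert>
#include <cstdint>

int main() {
  const int32_t C = 7;  // example constant operand
  for (int32_t A = -3; A <= 3; ++A)
    for (int32_t B = -3; B <= 3; ++B) {
      // -(a - b) == b - a: negating a sub just swaps its operands.
      assert(-(A - B) == B - A);
      // -(a * C) == a * (-C): the negation is absorbed by the constant.
      assert(-(A * C) == A * (-C));
    }
  return 0;
}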
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
index ebf9d24eecc4..dad2f23120bd 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineLoadStoreAlloca.cpp
@@ -14,8 +14,8 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/SmallString.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/Loads.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -24,6 +24,7 @@
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
using namespace PatternMatch;
@@ -32,22 +33,6 @@ using namespace PatternMatch;
STATISTIC(NumDeadStore, "Number of dead stores eliminated");
STATISTIC(NumGlobalCopies, "Number of allocas copied from constant global");
-/// pointsToConstantGlobal - Return true if V (possibly indirectly) points to
-/// some part of a constant global variable. This intentionally only accepts
-/// constant expressions because we can't rewrite arbitrary instructions.
-static bool pointsToConstantGlobal(Value *V) {
- if (GlobalVariable *GV = dyn_cast<GlobalVariable>(V))
- return GV->isConstant();
-
- if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
- if (CE->getOpcode() == Instruction::BitCast ||
- CE->getOpcode() == Instruction::AddrSpaceCast ||
- CE->getOpcode() == Instruction::GetElementPtr)
- return pointsToConstantGlobal(CE->getOperand(0));
- }
- return false;
-}
-
/// isOnlyCopiedFromConstantGlobal - Recursively walk the uses of a (derived)
/// pointer to an alloca. Ignore any reads of the pointer, return false if we
/// see any stores or other unknown uses. If we see pointer arithmetic, keep
@@ -56,7 +41,8 @@ static bool pointsToConstantGlobal(Value *V) {
/// the alloca, and if the source pointer is a pointer to a constant global, we
/// can optimize this.
static bool
-isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
+isOnlyCopiedFromConstantMemory(AAResults *AA,
+ Value *V, MemTransferInst *&TheCopy,
SmallVectorImpl<Instruction *> &ToDelete) {
// We track lifetime intrinsics as we encounter them. If we decide to go
// ahead and replace the value with the global, this lets the caller quickly
@@ -145,7 +131,7 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
if (U.getOperandNo() != 0) return false;
// If the source of the memcpy/move is not a constant global, reject it.
- if (!pointsToConstantGlobal(MI->getSource()))
+ if (!AA->pointsToConstantMemory(MI->getSource()))
return false;
// Otherwise, the transform is safe. Remember the copy instruction.
@@ -159,10 +145,11 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy,
/// modified by a copy from a constant global. If we can prove this, we can
/// replace any uses of the alloca with uses of the global directly.
static MemTransferInst *
-isOnlyCopiedFromConstantGlobal(AllocaInst *AI,
+isOnlyCopiedFromConstantMemory(AAResults *AA,
+ AllocaInst *AI,
SmallVectorImpl<Instruction *> &ToDelete) {
MemTransferInst *TheCopy = nullptr;
- if (isOnlyCopiedFromConstantGlobal(AI, TheCopy, ToDelete))
+ if (isOnlyCopiedFromConstantMemory(AA, AI, TheCopy, ToDelete))
return TheCopy;
return nullptr;
}
@@ -187,9 +174,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
return nullptr;
// Canonicalize it.
- Value *V = IC.Builder.getInt32(1);
- AI.setOperand(0, V);
- return &AI;
+ return IC.replaceOperand(AI, 0, IC.Builder.getInt32(1));
}
// Convert: alloca Ty, C - where C is a constant != 1 into: alloca [C x Ty], 1
@@ -197,7 +182,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
if (C->getValue().getActiveBits() <= 64) {
Type *NewTy = ArrayType::get(AI.getAllocatedType(), C->getZExtValue());
AllocaInst *New = IC.Builder.CreateAlloca(NewTy, nullptr, AI.getName());
- New->setAlignment(MaybeAlign(AI.getAlignment()));
+ New->setAlignment(AI.getAlign());
// Scan to the end of the allocation instructions, to skip over a block of
// allocas if possible...also skip interleaved debug info
@@ -230,8 +215,7 @@ static Instruction *simplifyAllocaArraySize(InstCombiner &IC, AllocaInst &AI) {
Type *IntPtrTy = IC.getDataLayout().getIntPtrType(AI.getType());
if (AI.getArraySize()->getType() != IntPtrTy) {
Value *V = IC.Builder.CreateIntCast(AI.getArraySize(), IntPtrTy, false);
- AI.setOperand(0, V);
- return &AI;
+ return IC.replaceOperand(AI, 0, V);
}
return nullptr;
@@ -298,7 +282,8 @@ void PointerReplacer::replace(Instruction *I) {
if (auto *LT = dyn_cast<LoadInst>(I)) {
auto *V = getReplacement(LT->getPointerOperand());
assert(V && "Operand not replaced");
- auto *NewI = new LoadInst(I->getType(), V);
+ auto *NewI = new LoadInst(I->getType(), V, "", false,
+ IC.getDataLayout().getABITypeAlign(I->getType()));
NewI->takeName(LT);
IC.InsertNewInstWith(NewI, *LT);
IC.replaceInstUsesWith(*LT, NewI);
@@ -343,22 +328,16 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
return I;
if (AI.getAllocatedType()->isSized()) {
- // If the alignment is 0 (unspecified), assign it the preferred alignment.
- if (AI.getAlignment() == 0)
- AI.setAlignment(
- MaybeAlign(DL.getPrefTypeAlignment(AI.getAllocatedType())));
-
// Move all alloca's of zero byte objects to the entry block and merge them
// together. Note that we only do this for alloca's, because malloc should
// allocate and return a unique pointer, even for a zero byte allocation.
- if (DL.getTypeAllocSize(AI.getAllocatedType()) == 0) {
+ if (DL.getTypeAllocSize(AI.getAllocatedType()).getKnownMinSize() == 0) {
// For a zero sized alloca there is no point in doing an array allocation.
// This is helpful if the array size is a complicated expression not used
// elsewhere.
- if (AI.isArrayAllocation()) {
- AI.setOperand(0, ConstantInt::get(AI.getArraySize()->getType(), 1));
- return &AI;
- }
+ if (AI.isArrayAllocation())
+ return replaceOperand(AI, 0,
+ ConstantInt::get(AI.getArraySize()->getType(), 1));
// Get the first instruction in the entry block.
BasicBlock &EntryBlock = AI.getParent()->getParent()->getEntryBlock();
@@ -369,21 +348,16 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
// dominance as the array size was forced to a constant earlier already.
AllocaInst *EntryAI = dyn_cast<AllocaInst>(FirstInst);
if (!EntryAI || !EntryAI->getAllocatedType()->isSized() ||
- DL.getTypeAllocSize(EntryAI->getAllocatedType()) != 0) {
+ DL.getTypeAllocSize(EntryAI->getAllocatedType())
+ .getKnownMinSize() != 0) {
AI.moveBefore(FirstInst);
return &AI;
}
- // If the alignment of the entry block alloca is 0 (unspecified),
- // assign it the preferred alignment.
- if (EntryAI->getAlignment() == 0)
- EntryAI->setAlignment(
- MaybeAlign(DL.getPrefTypeAlignment(EntryAI->getAllocatedType())));
// Replace this zero-sized alloca with the one at the start of the entry
// block after ensuring that the address will be aligned enough for both
// types.
- const MaybeAlign MaxAlign(
- std::max(EntryAI->getAlignment(), AI.getAlignment()));
+ const Align MaxAlign = std::max(EntryAI->getAlign(), AI.getAlign());
EntryAI->setAlignment(MaxAlign);
if (AI.getType() != EntryAI->getType())
return new BitCastInst(EntryAI, AI.getType());
@@ -392,41 +366,40 @@ Instruction *InstCombiner::visitAllocaInst(AllocaInst &AI) {
}
}
- if (AI.getAlignment()) {
- // Check to see if this allocation is only modified by a memcpy/memmove from
- // a constant global whose alignment is equal to or exceeds that of the
- // allocation. If this is the case, we can change all users to use
- // the constant global instead. This is commonly produced by the CFE by
- // constructs like "void foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A'
- // is only subsequently read.
- SmallVector<Instruction *, 4> ToDelete;
- if (MemTransferInst *Copy = isOnlyCopiedFromConstantGlobal(&AI, ToDelete)) {
- unsigned SourceAlign = getOrEnforceKnownAlignment(
- Copy->getSource(), AI.getAlignment(), DL, &AI, &AC, &DT);
- if (AI.getAlignment() <= SourceAlign &&
- isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
- LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
- LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
- for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
- eraseInstFromFunction(*ToDelete[i]);
- Constant *TheSrc = cast<Constant>(Copy->getSource());
- auto *SrcTy = TheSrc->getType();
- auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
- SrcTy->getPointerAddressSpace());
- Constant *Cast =
- ConstantExpr::getPointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
- if (AI.getType()->getPointerAddressSpace() ==
- SrcTy->getPointerAddressSpace()) {
- Instruction *NewI = replaceInstUsesWith(AI, Cast);
- eraseInstFromFunction(*Copy);
- ++NumGlobalCopies;
- return NewI;
- } else {
- PointerReplacer PtrReplacer(*this);
- PtrReplacer.replacePointer(AI, Cast);
- ++NumGlobalCopies;
- }
+ // Check to see if this allocation is only modified by a memcpy/memmove from
+ // a constant whose alignment is equal to or exceeds that of the allocation.
+ // If this is the case, we can change all users to use the constant global
+ // instead. This is commonly produced by the CFE by constructs like "void
+ // foo() { int A[] = {1,2,3,4,5,6,7,8,9...}; }" if 'A' is only subsequently
+ // read.
+ SmallVector<Instruction *, 4> ToDelete;
+ if (MemTransferInst *Copy = isOnlyCopiedFromConstantMemory(AA, &AI, ToDelete)) {
+ Align AllocaAlign = AI.getAlign();
+ Align SourceAlign = getOrEnforceKnownAlignment(
+ Copy->getSource(), AllocaAlign, DL, &AI, &AC, &DT);
+ if (AllocaAlign <= SourceAlign &&
+ isDereferenceableForAllocaSize(Copy->getSource(), &AI, DL)) {
+ LLVM_DEBUG(dbgs() << "Found alloca equal to global: " << AI << '\n');
+ LLVM_DEBUG(dbgs() << " memcpy = " << *Copy << '\n');
+ for (unsigned i = 0, e = ToDelete.size(); i != e; ++i)
+ eraseInstFromFunction(*ToDelete[i]);
+ Value *TheSrc = Copy->getSource();
+ auto *SrcTy = TheSrc->getType();
+ auto *DestTy = PointerType::get(AI.getType()->getPointerElementType(),
+ SrcTy->getPointerAddressSpace());
+ Value *Cast =
+ Builder.CreatePointerBitCastOrAddrSpaceCast(TheSrc, DestTy);
+ if (AI.getType()->getPointerAddressSpace() ==
+ SrcTy->getPointerAddressSpace()) {
+ Instruction *NewI = replaceInstUsesWith(AI, Cast);
+ eraseInstFromFunction(*Copy);
+ ++NumGlobalCopies;
+ return NewI;
}
+
+ PointerReplacer PtrReplacer(*this);
+ PtrReplacer.replacePointer(AI, Cast);
+ ++NumGlobalCopies;
}
}
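The comment above mentions the frontend pattern this targets: roughly, a local array initialized from a constant and then only read, which the frontend lowers to an alloca plus a memcpy from a private constant global. A hypothetical source-level example of that shape:

// Sketch of the source construct: 'Table' becomes an alloca that is only
// written by the initializing copy, so loads from it can be redirected to
// the constant global emitted for the initializer.
int lookup(unsigned Idx) {
  const int Table[] = {1, 2, 3, 4, 5, 6, 7, 8, 9};  // alloca + memcpy from global
  return Table[Idx % 9u] + Table[0];                // reads only, never written
}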
@@ -462,15 +435,8 @@ LoadInst *InstCombiner::combineLoadToNewType(LoadInst &LI, Type *NewTy,
NewPtr->getType()->getPointerAddressSpace() == AS))
NewPtr = Builder.CreateBitCast(Ptr, NewTy->getPointerTo(AS));
- unsigned Align = LI.getAlignment();
- if (!Align)
- // If old load did not have an explicit alignment specified,
- // manually preserve the implied (ABI) alignment of the load.
- // Else we may inadvertently incorrectly over-promise alignment.
- Align = getDataLayout().getABITypeAlignment(LI.getType());
-
LoadInst *NewLoad = Builder.CreateAlignedLoad(
- NewTy, NewPtr, Align, LI.isVolatile(), LI.getName() + Suffix);
+ NewTy, NewPtr, LI.getAlign(), LI.isVolatile(), LI.getName() + Suffix);
NewLoad->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
copyMetadataForLoad(*NewLoad, LI);
return NewLoad;
@@ -490,7 +456,7 @@ static StoreInst *combineStoreToNewValue(InstCombiner &IC, StoreInst &SI, Value
StoreInst *NewStore = IC.Builder.CreateAlignedStore(
V, IC.Builder.CreateBitCast(Ptr, V->getType()->getPointerTo(AS)),
- SI.getAlignment(), SI.isVolatile());
+ SI.getAlign(), SI.isVolatile());
NewStore->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
for (const auto &MDPair : MD) {
unsigned ID = MDPair.first;
@@ -594,11 +560,9 @@ static Instruction *combineLoadToOperationType(InstCombiner &IC, LoadInst &LI) {
// Do not perform canonicalization if minmax pattern is found (to avoid
// infinite loop).
Type *Dummy;
- if (!Ty->isIntegerTy() && Ty->isSized() &&
- !(Ty->isVectorTy() && Ty->getVectorIsScalable()) &&
+ if (!Ty->isIntegerTy() && Ty->isSized() && !isa<ScalableVectorType>(Ty) &&
DL.isLegalInteger(DL.getTypeStoreSizeInBits(Ty)) &&
- DL.typeSizeEqualsStoreSize(Ty) &&
- !DL.isNonIntegralPointerType(Ty) &&
+ DL.typeSizeEqualsStoreSize(Ty) && !DL.isNonIntegralPointerType(Ty) &&
!isMinMaxWithLoads(
peekThroughBitcast(LI.getPointerOperand(), /*OneUseOnly=*/true),
Dummy)) {
@@ -674,10 +638,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
if (SL->hasPadding())
return nullptr;
- auto Align = LI.getAlignment();
- if (!Align)
- Align = DL.getABITypeAlignment(ST);
-
+ const auto Align = LI.getAlign();
auto *Addr = LI.getPointerOperand();
auto *IdxType = Type::getInt32Ty(T->getContext());
auto *Zero = ConstantInt::get(IdxType, 0);
@@ -690,9 +651,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
};
auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
Name + ".elt");
- auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
- auto *L = IC.Builder.CreateAlignedLoad(ST->getElementType(i), Ptr,
- EltAlign, Name + ".unpack");
+ auto *L = IC.Builder.CreateAlignedLoad(
+ ST->getElementType(i), Ptr,
+ commonAlignment(Align, SL->getElementOffset(i)), Name + ".unpack");
// Propagate AA metadata. It'll still be valid on the narrowed load.
AAMDNodes AAMD;
LI.getAAMetadata(AAMD);
@@ -725,9 +686,7 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(ET);
- auto Align = LI.getAlignment();
- if (!Align)
- Align = DL.getABITypeAlignment(T);
+ const auto Align = LI.getAlign();
auto *Addr = LI.getPointerOperand();
auto *IdxType = Type::getInt64Ty(T->getContext());
@@ -742,8 +701,9 @@ static Instruction *unpackLoadToAggregate(InstCombiner &IC, LoadInst &LI) {
};
auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
Name + ".elt");
- auto *L = IC.Builder.CreateAlignedLoad(
- AT->getElementType(), Ptr, MinAlign(Align, Offset), Name + ".unpack");
+ auto *L = IC.Builder.CreateAlignedLoad(AT->getElementType(), Ptr,
+ commonAlignment(Align, Offset),
+ Name + ".unpack");
AAMDNodes AAMD;
LI.getAAMetadata(AAMD);
L->setAAMetadata(AAMD);
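commonAlignment(Align, Offset) here computes the alignment that holds for an element Offset bytes into a pointer with the given base alignment. A small model of that computation (a sketch of the semantics, not the LLVM implementation itself):

#include <cassert>
#include <cstdint>

// For a power-of-two base alignment, the element alignment is the minimum of
// the base alignment and the largest power of two dividing the offset.
static uint64_t commonAlignmentModel(uint64_t BaseAlign, uint64_t Offset) {
  if (Offset == 0)
    return BaseAlign;
  uint64_t OffsetAlign = Offset & (~Offset + 1);  // lowest set bit of Offset
  return BaseAlign < OffsetAlign ? BaseAlign : OffsetAlign;
}

int main() {
  assert(commonAlignmentModel(16, 4) == 4);   // element at offset 4 is 4-aligned
  assert(commonAlignmentModel(16, 32) == 16); // offset 32 keeps the base alignment
  assert(commonAlignmentModel(8, 6) == 2);    // offset 6 only guarantees 2
  return 0;
}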
@@ -964,20 +924,14 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
return Res;
// Attempt to improve the alignment.
- unsigned KnownAlign = getOrEnforceKnownAlignment(
- Op, DL.getPrefTypeAlignment(LI.getType()), DL, &LI, &AC, &DT);
- unsigned LoadAlign = LI.getAlignment();
- unsigned EffectiveLoadAlign =
- LoadAlign != 0 ? LoadAlign : DL.getABITypeAlignment(LI.getType());
-
- if (KnownAlign > EffectiveLoadAlign)
- LI.setAlignment(MaybeAlign(KnownAlign));
- else if (LoadAlign == 0)
- LI.setAlignment(MaybeAlign(EffectiveLoadAlign));
+ Align KnownAlign = getOrEnforceKnownAlignment(
+ Op, DL.getPrefTypeAlign(LI.getType()), DL, &LI, &AC, &DT);
+ if (KnownAlign > LI.getAlign())
+ LI.setAlignment(KnownAlign);
// Replace GEP indices if possible.
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Op, LI)) {
- Worklist.Add(NewGEPI);
+ Worklist.push(NewGEPI);
return &LI;
}
@@ -1030,7 +984,7 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
//
if (SelectInst *SI = dyn_cast<SelectInst>(Op)) {
// load (select (Cond, &V1, &V2)) --> select(Cond, load &V1, load &V2).
- const MaybeAlign Alignment(LI.getAlignment());
+ Align Alignment = LI.getAlign();
if (isSafeToLoadUnconditionally(SI->getOperand(1), LI.getType(),
Alignment, DL, SI) &&
isSafeToLoadUnconditionally(SI->getOperand(2), LI.getType(),
@@ -1052,18 +1006,14 @@ Instruction *InstCombiner::visitLoadInst(LoadInst &LI) {
// load (select (cond, null, P)) -> load P
if (isa<ConstantPointerNull>(SI->getOperand(1)) &&
!NullPointerIsDefined(SI->getFunction(),
- LI.getPointerAddressSpace())) {
- LI.setOperand(0, SI->getOperand(2));
- return &LI;
- }
+ LI.getPointerAddressSpace()))
+ return replaceOperand(LI, 0, SI->getOperand(2));
// load (select (cond, P, null)) -> load P
if (isa<ConstantPointerNull>(SI->getOperand(2)) &&
!NullPointerIsDefined(SI->getFunction(),
- LI.getPointerAddressSpace())) {
- LI.setOperand(0, SI->getOperand(1));
- return &LI;
- }
+ LI.getPointerAddressSpace()))
+ return replaceOperand(LI, 0, SI->getOperand(1));
}
}
return nullptr;
@@ -1204,9 +1154,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
if (SL->hasPadding())
return false;
- auto Align = SI.getAlignment();
- if (!Align)
- Align = DL.getABITypeAlignment(ST);
+ const auto Align = SI.getAlign();
SmallString<16> EltName = V->getName();
EltName += ".elt";
@@ -1224,7 +1172,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
auto *Ptr = IC.Builder.CreateInBoundsGEP(ST, Addr, makeArrayRef(Indices),
AddrName);
auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = MinAlign(Align, SL->getElementOffset(i));
+ auto EltAlign = commonAlignment(Align, SL->getElementOffset(i));
llvm::Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
AAMDNodes AAMD;
SI.getAAMetadata(AAMD);
@@ -1252,9 +1200,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
const DataLayout &DL = IC.getDataLayout();
auto EltSize = DL.getTypeAllocSize(AT->getElementType());
- auto Align = SI.getAlignment();
- if (!Align)
- Align = DL.getABITypeAlignment(T);
+ const auto Align = SI.getAlign();
SmallString<16> EltName = V->getName();
EltName += ".elt";
@@ -1274,7 +1220,7 @@ static bool unpackStoreToAggregate(InstCombiner &IC, StoreInst &SI) {
auto *Ptr = IC.Builder.CreateInBoundsGEP(AT, Addr, makeArrayRef(Indices),
AddrName);
auto *Val = IC.Builder.CreateExtractValue(V, i, EltName);
- auto EltAlign = MinAlign(Align, Offset);
+ auto EltAlign = commonAlignment(Align, Offset);
Instruction *NS = IC.Builder.CreateAlignedStore(Val, Ptr, EltAlign);
AAMDNodes AAMD;
SI.getAAMetadata(AAMD);
@@ -1336,6 +1282,11 @@ static bool removeBitcastsFromLoadStoreOnMinMax(InstCombiner &IC,
if (!isMinMaxWithLoads(LoadAddr, CmpLoadTy))
return false;
+ // Make sure the type would actually change.
+ // This condition can be hit with chains of bitcasts.
+ if (LI->getType() == CmpLoadTy)
+ return false;
+
// Make sure we're not changing the size of the load/store.
const auto &DL = IC.getDataLayout();
if (DL.getTypeStoreSizeInBits(LI->getType()) !=
@@ -1372,16 +1323,10 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
return eraseInstFromFunction(SI);
// Attempt to improve the alignment.
- const Align KnownAlign = Align(getOrEnforceKnownAlignment(
- Ptr, DL.getPrefTypeAlignment(Val->getType()), DL, &SI, &AC, &DT));
- const MaybeAlign StoreAlign = MaybeAlign(SI.getAlignment());
- const Align EffectiveStoreAlign =
- StoreAlign ? *StoreAlign : Align(DL.getABITypeAlignment(Val->getType()));
-
- if (KnownAlign > EffectiveStoreAlign)
+ const Align KnownAlign = getOrEnforceKnownAlignment(
+ Ptr, DL.getPrefTypeAlign(Val->getType()), DL, &SI, &AC, &DT);
+ if (KnownAlign > SI.getAlign())
SI.setAlignment(KnownAlign);
- else if (!StoreAlign)
- SI.setAlignment(EffectiveStoreAlign);
// Try to canonicalize the stored type.
if (unpackStoreToAggregate(*this, SI))
@@ -1392,7 +1337,7 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// Replace GEP indices if possible.
if (Instruction *NewGEPI = replaceGEPIdxWithZero(*this, Ptr, SI)) {
- Worklist.Add(NewGEPI);
+ Worklist.push(NewGEPI);
return &SI;
}
@@ -1439,9 +1384,12 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
if (PrevSI->isUnordered() && equivalentAddressValues(PrevSI->getOperand(1),
SI.getOperand(1))) {
++NumDeadStore;
- ++BBI;
+ // Manually add back the original store to the worklist now, so it will
+ // be processed after the operands of the removed store, as this may
+ // expose additional DSE opportunities.
+ Worklist.push(&SI);
eraseInstFromFunction(*PrevSI);
- continue;
+ return nullptr;
}
break;
}
@@ -1468,11 +1416,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
// store X, null -> turns into 'unreachable' in SimplifyCFG
// store X, GEP(null, Y) -> turns into 'unreachable' in SimplifyCFG
if (canSimplifyNullStoreOrGEP(SI)) {
- if (!isa<UndefValue>(Val)) {
- SI.setOperand(0, UndefValue::get(Val->getType()));
- if (Instruction *U = dyn_cast<Instruction>(Val))
- Worklist.Add(U); // Dropped a use.
- }
+ if (!isa<UndefValue>(Val))
+ return replaceOperand(SI, 0, UndefValue::get(Val->getType()));
return nullptr; // Do not modify these!
}
@@ -1480,19 +1425,6 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
if (isa<UndefValue>(Val))
return eraseInstFromFunction(SI);
- // If this store is the second-to-last instruction in the basic block
- // (excluding debug info and bitcasts of pointers) and if the block ends with
- // an unconditional branch, try to move the store to the successor block.
- BBI = SI.getIterator();
- do {
- ++BBI;
- } while (isa<DbgInfoIntrinsic>(BBI) ||
- (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy()));
-
- if (BranchInst *BI = dyn_cast<BranchInst>(BBI))
- if (BI->isUnconditional())
- mergeStoreIntoSuccessor(SI);
-
return nullptr;
}
@@ -1502,8 +1434,8 @@ Instruction *InstCombiner::visitStoreInst(StoreInst &SI) {
/// *P = v1; if () { *P = v2; }
/// into a phi node with a store in the successor.
bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) {
- assert(SI.isUnordered() &&
- "This code has not been audited for volatile or ordered store case.");
+ if (!SI.isUnordered())
+ return false; // This code has not been audited for volatile/ordered case.
// Check if the successor block has exactly 2 incoming edges.
BasicBlock *StoreBB = SI.getParent();
@@ -1595,9 +1527,9 @@ bool InstCombiner::mergeStoreIntoSuccessor(StoreInst &SI) {
// Advance to a place where it is safe to insert the new store and insert it.
BBI = DestBB->getFirstInsertionPt();
- StoreInst *NewSI = new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(),
- MaybeAlign(SI.getAlignment()),
- SI.getOrdering(), SI.getSyncScopeID());
+ StoreInst *NewSI =
+ new StoreInst(MergedVal, SI.getOperand(1), SI.isVolatile(), SI.getAlign(),
+ SI.getOrdering(), SI.getSyncScopeID());
InsertNewInstBefore(NewSI, *BBI);
NewSI->setDebugLoc(MergedLoc);
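A source-level picture of what mergeStoreIntoSuccessor does, as described in its comment: both predecessors store to the same location, so the store is sunk into the join point with the value chosen by a phi. Minimal sketch with placeholder values:

#include <cassert>

static int Before(bool C, int *P, int V1, int V2) {
  if (C)
    *P = V1;  // store in the "then" block
  else
    *P = V2;  // store in the "else" block
  return *P;  // join block
}

static int After(bool C, int *P, int V1, int V2) {
  int Merged = C ? V1 : V2;  // phi of the two stored values
  *P = Merged;               // single store in the successor block
  return *P;
}

int main() {
  int A = 0, B = 0;
  assert(Before(true, &A, 10, 20) == After(true, &B, 10, 20) && A == B);
  assert(Before(false, &A, 10, 20) == After(false, &B, 10, 20) && A == B);
  return 0;
}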
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
index 2774e46151fa..c6233a68847d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineMulDivRem.cpp
@@ -72,7 +72,7 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
// We know that this is an exact/nuw shift and that the input is a
// non-zero context as well.
if (Value *V2 = simplifyValueKnownNonZero(I->getOperand(0), IC, CxtI)) {
- I->setOperand(0, V2);
+ IC.replaceOperand(*I, 0, V2);
MadeChange = true;
}
@@ -96,19 +96,22 @@ static Value *simplifyValueKnownNonZero(Value *V, InstCombiner &IC,
/// A helper routine of InstCombiner::visitMul().
///
-/// If C is a scalar/vector of known powers of 2, then this function returns
-/// a new scalar/vector obtained from logBase2 of C.
+/// If C is a scalar/fixed width vector of known powers of 2, then this
+/// function returns a new scalar/fixed width vector obtained from logBase2
+/// of C.
/// Return a null pointer otherwise.
static Constant *getLogBase2(Type *Ty, Constant *C) {
const APInt *IVal;
if (match(C, m_APInt(IVal)) && IVal->isPowerOf2())
return ConstantInt::get(Ty, IVal->logBase2());
- if (!Ty->isVectorTy())
+ // FIXME: We can extract pow of 2 of splat constant for scalable vectors.
+ if (!isa<FixedVectorType>(Ty))
return nullptr;
SmallVector<Constant *, 4> Elts;
- for (unsigned I = 0, E = Ty->getVectorNumElements(); I != E; ++I) {
+ for (unsigned I = 0, E = cast<FixedVectorType>(Ty)->getNumElements(); I != E;
+ ++I) {
Constant *Elt = C->getAggregateElement(I);
if (!Elt)
return nullptr;
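The scalar fact behind getLogBase2() is that multiplying by a power-of-two constant C is the same as shifting left by log2(C); for fixed-width vectors the same holds per element. A small standalone check:

#include <cassert>
#include <cstdint>

// C must be a (nonzero) power of two; returns log2(C).
static unsigned logBase2(uint32_t C) {
  unsigned Log = 0;
  while ((1u << Log) != C)
    ++Log;
  return Log;
}

int main() {
  for (uint32_t X : {0u, 1u, 5u, 123u})
    for (uint32_t C : {1u, 2u, 8u, 64u})
      assert(X * C == (X << logBase2(C)));  // x * C == x << log2(C)
  return 0;
}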
@@ -274,6 +277,15 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ // abs(X) * abs(X) -> X * X
+ // nabs(X) * nabs(X) -> X * X
+ if (Op0 == Op1) {
+ Value *X, *Y;
+ SelectPatternFlavor SPF = matchSelectPattern(Op0, X, Y).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS)
+ return BinaryOperator::CreateMul(X, X);
+ }
+
// -X * C --> X * -C
Value *X, *Y;
Constant *Op1C;
@@ -354,6 +366,27 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
}
}
+ // (zext bool X) * (zext bool Y) --> zext (and X, Y)
+ // (sext bool X) * (sext bool Y) --> zext (and X, Y)
+ // Note: -1 * -1 == 1 * 1 == 1 (if the extends match, the result is the same)
+ if (((match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
+ (match(Op0, m_SExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *And = Builder.CreateAnd(X, Y, "mulbool");
+ return CastInst::Create(Instruction::ZExt, And, I.getType());
+ }
+ // (sext bool X) * (zext bool Y) --> sext (and X, Y)
+ // (zext bool X) * (sext bool Y) --> sext (and X, Y)
+ // Note: -1 * 1 == 1 * -1 == -1
+ if (((match(Op0, m_SExt(m_Value(X))) && match(Op1, m_ZExt(m_Value(Y)))) ||
+ (match(Op0, m_ZExt(m_Value(X))) && match(Op1, m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType() &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ Value *And = Builder.CreateAnd(X, Y, "mulbool");
+ return CastInst::Create(Instruction::SExt, And, I.getType());
+ }
+
// (bool X) * Y --> X ? Y : 0
// Y * (bool X) --> X ? Y : 0
if (match(Op0, m_ZExt(m_Value(X))) && X->getType()->isIntOrIntVectorTy(1))
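The note about -1 * -1 and the mixed-extension case added above can be verified exhaustively for i1 inputs: zext maps true to 1, sext maps true to -1, and every pairing reduces to an 'and' followed by a single extension. A quick standalone check:

#include <cassert>

int main() {
  for (int X = 0; X <= 1; ++X)
    for (int Y = 0; Y <= 1; ++Y) {
      int ZX = X, ZY = Y;    // zext i1 -> 0 or 1
      int SX = -X, SY = -Y;  // sext i1 -> 0 or -1
      int AndXY = X & Y;
      assert(ZX * ZY == AndXY);   // zext * zext --> zext (and X, Y)
      assert(SX * SY == AndXY);   // sext * sext --> zext (and X, Y)
      assert(SX * ZY == -AndXY);  // sext * zext --> sext (and X, Y)
      assert(ZX * SY == -AndXY);  // zext * sext --> sext (and X, Y)
    }
  return 0;
}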
@@ -390,6 +423,40 @@ Instruction *InstCombiner::visitMul(BinaryOperator &I) {
return Changed ? &I : nullptr;
}
+Instruction *InstCombiner::foldFPSignBitOps(BinaryOperator &I) {
+ BinaryOperator::BinaryOps Opcode = I.getOpcode();
+ assert((Opcode == Instruction::FMul || Opcode == Instruction::FDiv) &&
+ "Expected fmul or fdiv");
+
+ Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
+ Value *X, *Y;
+
+ // -X * -Y --> X * Y
+ // -X / -Y --> X / Y
+ if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, X, Y, &I);
+
+ // fabs(X) * fabs(X) -> X * X
+ // fabs(X) / fabs(X) -> X / X
+ if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))))
+ return BinaryOperator::CreateWithCopiedFlags(Opcode, X, X, &I);
+
+ // fabs(X) * fabs(Y) --> fabs(X * Y)
+ // fabs(X) / fabs(Y) --> fabs(X / Y)
+ if (match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))) &&
+ match(Op1, m_Intrinsic<Intrinsic::fabs>(m_Value(Y))) &&
+ (Op0->hasOneUse() || Op1->hasOneUse())) {
+ IRBuilder<>::FastMathFlagGuard FMFGuard(Builder);
+ Builder.setFastMathFlags(I.getFastMathFlags());
+ Value *XY = Builder.CreateBinOp(Opcode, X, Y);
+ Value *Fabs = Builder.CreateUnaryIntrinsic(Intrinsic::fabs, XY);
+ Fabs->takeName(&I);
+ return replaceInstUsesWith(I, Fabs);
+ }
+
+ return nullptr;
+}
+
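The identities collected in foldFPSignBitOps only touch the sign bit, so they hold exactly for ordinary values; a few spot checks in plain C++ (NaN sign subtleties are deliberately out of scope here):

#include <cassert>
#include <cmath>

int main() {
  const double Xs[] = {-3.5, 0.25, 7.0};
  const double Ys[] = {-2.0, 4.0};
  for (double X : Xs)
    for (double Y : Ys) {
      assert((-X) * (-Y) == X * Y);  // -X * -Y --> X * Y
      assert((-X) / (-Y) == X / Y);  // -X / -Y --> X / Y
      assert(std::fabs(X) * std::fabs(Y) == std::fabs(X * Y));  // fabs * fabs
      assert(std::fabs(X) / std::fabs(Y) == std::fabs(X / Y));  // fabs / fabs
    }
  return 0;
}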
Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (Value *V = SimplifyFMulInst(I.getOperand(0), I.getOperand(1),
I.getFastMathFlags(),
@@ -408,25 +475,20 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
if (Value *FoldedMul = foldMulSelectToNegate(I, Builder))
return replaceInstUsesWith(I, FoldedMul);
+ if (Instruction *R = foldFPSignBitOps(I))
+ return R;
+
// X * -1.0 --> -X
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (match(Op1, m_SpecificFP(-1.0)))
- return BinaryOperator::CreateFNegFMF(Op0, &I);
-
- // -X * -Y --> X * Y
- Value *X, *Y;
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y))))
- return BinaryOperator::CreateFMulFMF(X, Y, &I);
+ return UnaryOperator::CreateFNegFMF(Op0, &I);
// -X * C --> X * -C
+ Value *X, *Y;
Constant *C;
if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_Constant(C)))
return BinaryOperator::CreateFMulFMF(X, ConstantExpr::getFNeg(C), &I);
- // fabs(X) * fabs(X) -> X * X
- if (Op0 == Op1 && match(Op0, m_Intrinsic<Intrinsic::fabs>(m_Value(X))))
- return BinaryOperator::CreateFMulFMF(X, X, &I);
-
// (select A, B, C) * (select A, D, E) --> select A, (B*D), (C*E)
if (Value *V = SimplifySelectsFeedingBinaryOp(I, Op0, Op1))
return replaceInstUsesWith(I, V);
@@ -563,8 +625,7 @@ Instruction *InstCombiner::visitFMul(BinaryOperator &I) {
Y = Op0;
}
if (Log2) {
- Log2->setArgOperand(0, X);
- Log2->copyFastMathFlags(&I);
+ Value *Log2 = Builder.CreateUnaryIntrinsic(Intrinsic::log2, X, &I);
Value *LogXTimesY = Builder.CreateFMulFMF(Log2, Y, &I);
return BinaryOperator::CreateFSubFMF(LogXTimesY, Y, &I);
}
@@ -592,7 +653,7 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
return false;
// Change the div/rem to use 'Y' instead of the select.
- I.setOperand(1, SI->getOperand(NonNullOperand));
+ replaceOperand(I, 1, SI->getOperand(NonNullOperand));
// Okay, we know we replace the operand of the div/rem with 'Y' with no
// problem. However, the select, or the condition of the select may have
@@ -620,12 +681,12 @@ bool InstCombiner::simplifyDivRemOfSelectWithZeroOp(BinaryOperator &I) {
for (Instruction::op_iterator I = BBI->op_begin(), E = BBI->op_end();
I != E; ++I) {
if (*I == SI) {
- *I = SI->getOperand(NonNullOperand);
- Worklist.Add(&*BBI);
+ replaceUse(*I, SI->getOperand(NonNullOperand));
+ Worklist.push(&*BBI);
} else if (*I == SelectCond) {
- *I = NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
- : ConstantInt::getFalse(CondTy);
- Worklist.Add(&*BBI);
+ replaceUse(*I, NonNullOperand == 1 ? ConstantInt::getTrue(CondTy)
+ : ConstantInt::getFalse(CondTy));
+ Worklist.push(&*BBI);
}
}
@@ -683,10 +744,8 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
Type *Ty = I.getType();
// The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
- I.setOperand(1, V);
- return &I;
- }
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
+ return replaceOperand(I, 1, V);
// Handle cases involving: [su]div X, (select Cond, Y, Z)
// This does not apply for fdiv.
@@ -800,8 +859,8 @@ Instruction *InstCombiner::commonIDivTransforms(BinaryOperator &I) {
bool HasNSW = cast<OverflowingBinaryOperator>(Op1)->hasNoSignedWrap();
bool HasNUW = cast<OverflowingBinaryOperator>(Op1)->hasNoUnsignedWrap();
if ((IsSigned && HasNSW) || (!IsSigned && HasNUW)) {
- I.setOperand(0, ConstantInt::get(Ty, 1));
- I.setOperand(1, Y);
+ replaceOperand(I, 0, ConstantInt::get(Ty, 1));
+ replaceOperand(I, 1, Y);
return &I;
}
}
@@ -1214,6 +1273,9 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
if (Instruction *R = foldFDivConstantDividend(I))
return R;
+ if (Instruction *R = foldFPSignBitOps(I))
+ return R;
+
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
if (isa<Constant>(Op0))
if (SelectInst *SI = dyn_cast<SelectInst>(Op1))
@@ -1274,21 +1336,14 @@ Instruction *InstCombiner::visitFDiv(BinaryOperator &I) {
}
}
- // -X / -Y -> X / Y
- Value *X, *Y;
- if (match(Op0, m_FNeg(m_Value(X))) && match(Op1, m_FNeg(m_Value(Y)))) {
- I.setOperand(0, X);
- I.setOperand(1, Y);
- return &I;
- }
-
// X / (X * Y) --> 1.0 / Y
// Reassociate to (X / X -> 1.0) is legal when NaNs are not allowed.
// We can ignore the possibility that X is infinity because INF/INF is NaN.
+ Value *X, *Y;
if (I.hasNoNaNs() && I.hasAllowReassoc() &&
match(Op1, m_c_FMul(m_Specific(Op0), m_Value(Y)))) {
- I.setOperand(0, ConstantFP::get(I.getType(), 1.0));
- I.setOperand(1, Y);
+ replaceOperand(I, 0, ConstantFP::get(I.getType(), 1.0));
+ replaceOperand(I, 1, Y);
return &I;
}
@@ -1314,10 +1369,8 @@ Instruction *InstCombiner::commonIRemTransforms(BinaryOperator &I) {
Value *Op0 = I.getOperand(0), *Op1 = I.getOperand(1);
// The RHS is known non-zero.
- if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I)) {
- I.setOperand(1, V);
- return &I;
- }
+ if (Value *V = simplifyValueKnownNonZero(I.getOperand(1), *this, I))
+ return replaceOperand(I, 1, V);
// Handle cases involving: rem X, (select Cond, Y, Z)
if (simplifyDivRemOfSelectWithZeroOp(I))
@@ -1417,11 +1470,8 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
{
const APInt *Y;
// X % -Y -> X % Y
- if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue()) {
- Worklist.AddValue(I.getOperand(1));
- I.setOperand(1, ConstantInt::get(I.getType(), -*Y));
- return &I;
- }
+ if (match(Op1, m_Negative(Y)) && !Y->isMinSignedValue())
+ return replaceOperand(I, 1, ConstantInt::get(I.getType(), -*Y));
}
// -X srem Y --> -(X srem Y)
@@ -1441,7 +1491,7 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
// If it's a constant vector, flip any negative values positive.
if (isa<ConstantVector>(Op1) || isa<ConstantDataVector>(Op1)) {
Constant *C = cast<Constant>(Op1);
- unsigned VWidth = C->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(C->getType())->getNumElements();
bool hasNegative = false;
bool hasMissing = false;
@@ -1468,11 +1518,8 @@ Instruction *InstCombiner::visitSRem(BinaryOperator &I) {
}
Constant *NewRHSV = ConstantVector::get(Elts);
- if (NewRHSV != C) { // Don't loop on -MININT
- Worklist.AddValue(I.getOperand(1));
- I.setOperand(1, NewRHSV);
- return &I;
- }
+ if (NewRHSV != C) // Don't loop on -MININT
+ return replaceOperand(I, 1, NewRHSV);
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
new file mode 100644
index 000000000000..3fe615ac5439
--- /dev/null
+++ b/llvm/lib/Transforms/InstCombine/InstCombineNegator.cpp
@@ -0,0 +1,474 @@
+//===- InstCombineNegator.cpp -----------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements sinking of negation into expression trees,
+// as long as that can be done without increasing instruction count.
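+//
+// For example (an illustrative IR sketch; value names are made up), given
+//   %t = sub i8 %a, %b
+//   %r = sub i8 0, %t
+// the negation can be sunk into %t, so the pair becomes
+//   %r = sub i8 %b, %a
+// without adding any instructions.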
+//
+//===----------------------------------------------------------------------===//
+
+#include "InstCombineInternal.h"
+#include "llvm/ADT/APInt.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/None.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/TargetFolder.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Constant.h"
+#include "llvm/IR/Constants.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Type.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/raw_ostream.h"
+#include <functional>
+#include <tuple>
+#include <type_traits>
+#include <utility>
+
+namespace llvm {
+class AssumptionCache;
+class DataLayout;
+class DominatorTree;
+class LLVMContext;
+} // namespace llvm
+
+using namespace llvm;
+
+#define DEBUG_TYPE "instcombine"
+
+STATISTIC(NegatorTotalNegationsAttempted,
+ "Negator: Number of negations attempted to be sunk");
+STATISTIC(NegatorNumTreesNegated,
+ "Negator: Number of negations successfully sunk");
+STATISTIC(NegatorMaxDepthVisited, "Negator: Maximal traversal depth ever "
+ "reached while attempting to sink negation");
+STATISTIC(NegatorTimesDepthLimitReached,
+ "Negator: How many times the traversal depth limit was reached "
+ "during sinking");
+STATISTIC(
+ NegatorNumValuesVisited,
+ "Negator: Total number of values visited during attempts to sink negation");
+STATISTIC(NegatorNumNegationsFoundInCache,
+ "Negator: How many negations did we retrieve/reuse from cache");
+STATISTIC(NegatorMaxTotalValuesVisited,
+ "Negator: Maximal number of values ever visited while attempting to "
+ "sink negation");
+STATISTIC(NegatorNumInstructionsCreatedTotal,
+ "Negator: Number of new negated instructions created, total");
+STATISTIC(NegatorMaxInstructionsCreated,
+ "Negator: Maximal number of new instructions created during negation "
+ "attempt");
+STATISTIC(NegatorNumInstructionsNegatedSuccess,
+ "Negator: Number of new negated instructions created in successful "
+ "negation sinking attempts");
+
+DEBUG_COUNTER(NegatorCounter, "instcombine-negator",
+ "Controls Negator transformations in InstCombine pass");
+
+static cl::opt<bool>
+ NegatorEnabled("instcombine-negator-enabled", cl::init(true),
+ cl::desc("Should we attempt to sink negations?"));
+
+static cl::opt<unsigned>
+ NegatorMaxDepth("instcombine-negator-max-depth",
+ cl::init(NegatorDefaultMaxDepth),
+ cl::desc("Maximal lookup depth when checking the "
+ "viability of negation sinking."));
+
+Negator::Negator(LLVMContext &C, const DataLayout &DL_, AssumptionCache &AC_,
+ const DominatorTree &DT_, bool IsTrulyNegation_)
+ : Builder(C, TargetFolder(DL_),
+ IRBuilderCallbackInserter([&](Instruction *I) {
+ ++NegatorNumInstructionsCreatedTotal;
+ NewInstructions.push_back(I);
+ })),
+ DL(DL_), AC(AC_), DT(DT_), IsTrulyNegation(IsTrulyNegation_) {}
+
+#if LLVM_ENABLE_STATS
+Negator::~Negator() {
+ NegatorMaxTotalValuesVisited.updateMax(NumValuesVisitedInThisNegator);
+}
+#endif
+
+// FIXME: can this be reworked into a worklist-based algorithm while preserving
+// the depth-first, early bailout traversal?
+LLVM_NODISCARD Value *Negator::visitImpl(Value *V, unsigned Depth) {
+ // -(undef) -> undef.
+ if (match(V, m_Undef()))
+ return V;
+
+ // In i1, negation can simply be ignored.
+ if (V->getType()->isIntOrIntVectorTy(1))
+ return V;
+
+ Value *X;
+
+ // -(-(X)) -> X.
+ if (match(V, m_Neg(m_Value(X))))
+ return X;
+
+ // Integral constants can be freely negated.
+ if (match(V, m_AnyIntegralConstant()))
+ return ConstantExpr::getNeg(cast<Constant>(V), /*HasNUW=*/false,
+ /*HasNSW=*/false);
+
+ // If we have a non-instruction, then give up.
+ if (!isa<Instruction>(V))
+ return nullptr;
+
+ // If we have started with a true negation (i.e. `sub 0, %y`), then if we've
+ // got an instruction that does not require recursive reasoning, we can still
+ // negate it even if it has other uses, without increasing instruction count.
+ if (!V->hasOneUse() && !IsTrulyNegation)
+ return nullptr;
+
+ auto *I = cast<Instruction>(V);
+ unsigned BitWidth = I->getType()->getScalarSizeInBits();
+
+ // We must preserve the insertion point and debug info that is set in the
+ // builder at the time this function is called.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(Builder);
+ // And since we are trying to negate instruction I, that tells us about the
+ // insertion point and the debug info that we need to keep.
+ Builder.SetInsertPoint(I);
+
+ // In some cases we can give the answer without further recursion.
+ switch (I->getOpcode()) {
+ case Instruction::Add:
+ // `inc` is always negatible.
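+ // (In two's complement, -(X + 1) == -X - 1 == ~X, so one `not` suffices.)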
+ if (match(I->getOperand(1), m_One()))
+ return Builder.CreateNot(I->getOperand(0), I->getName() + ".neg");
+ break;
+ case Instruction::Xor:
+ // `not` is always negatible.
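+ // (-(~X) == -(-X - 1) == X + 1, hence the `add X, 1` created below.)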
+ if (match(I, m_Not(m_Value(X))))
+ return Builder.CreateAdd(X, ConstantInt::get(X->getType(), 1),
+ I->getName() + ".neg");
+ break;
+ case Instruction::AShr:
+ case Instruction::LShr: {
+ // Right-shift sign bit smear is negatible.
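+ // (ashr X, BW-1 yields 0 or -1, lshr X, BW-1 yields 0 or 1; each is the
+ // negation of the other, so flipping the shift kind negates the value.)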
+ const APInt *Op1Val;
+ if (match(I->getOperand(1), m_APInt(Op1Val)) && *Op1Val == BitWidth - 1) {
+ Value *BO = I->getOpcode() == Instruction::AShr
+ ? Builder.CreateLShr(I->getOperand(0), I->getOperand(1))
+ : Builder.CreateAShr(I->getOperand(0), I->getOperand(1));
+ if (auto *NewInstr = dyn_cast<Instruction>(BO)) {
+ NewInstr->copyIRFlags(I);
+ NewInstr->setName(I->getName() + ".neg");
+ }
+ return BO;
+ }
+ break;
+ }
+ case Instruction::SExt:
+ case Instruction::ZExt:
+ // `*ext` of i1 is always negatible
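+ // (zext i1 X is 0 or 1, sext i1 X is 0 or -1; swapping the extension kind
+ // negates the result.)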
+ if (I->getOperand(0)->getType()->isIntOrIntVectorTy(1))
+ return I->getOpcode() == Instruction::SExt
+ ? Builder.CreateZExt(I->getOperand(0), I->getType(),
+ I->getName() + ".neg")
+ : Builder.CreateSExt(I->getOperand(0), I->getType(),
+ I->getName() + ".neg");
+ break;
+ default:
+ break; // Other instructions require recursive reasoning.
+ }
+
+ // Some other cases, while they still don't require recursion,
+ // are restricted to the one-use case.
+ if (!V->hasOneUse())
+ return nullptr;
+
+ switch (I->getOpcode()) {
+ case Instruction::Sub:
+ // `sub` is always negatible.
+ // But if the old `sub` sticks around, even though we don't increase
+ // instruction count, this is likely a regression since we increase the
+ // live-range of *both* of the operands, which might lead to more spilling.
+ return Builder.CreateSub(I->getOperand(1), I->getOperand(0),
+ I->getName() + ".neg");
+ case Instruction::SDiv:
+ // `sdiv` is negatible if divisor is not undef/INT_MIN/1.
+ // While this is normally not behind a use-check,
+ // let's consider division to be special since it's costly.
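+ // (For such a constant C, X sdiv -C == -(X sdiv C); excluding 1 also means
+ // we never create a division by -1, which could overflow.)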
+ if (auto *Op1C = dyn_cast<Constant>(I->getOperand(1))) {
+ if (!Op1C->containsUndefElement() && Op1C->isNotMinSignedValue() &&
+ Op1C->isNotOneValue()) {
+ Value *BO =
+ Builder.CreateSDiv(I->getOperand(0), ConstantExpr::getNeg(Op1C),
+ I->getName() + ".neg");
+ if (auto *NewInstr = dyn_cast<Instruction>(BO))
+ NewInstr->setIsExact(I->isExact());
+ return BO;
+ }
+ }
+ break;
+ }
+
+ // Rest of the logic is recursive, so if it's time to give up then it's time.
+ if (Depth > NegatorMaxDepth) {
+ LLVM_DEBUG(dbgs() << "Negator: reached maximal allowed traversal depth in "
+ << *V << ". Giving up.\n");
+ ++NegatorTimesDepthLimitReached;
+ return nullptr;
+ }
+
+ switch (I->getOpcode()) {
+ case Instruction::PHI: {
+ // `phi` is negatible if all the incoming values are negatible.
+ auto *PHI = cast<PHINode>(I);
+ SmallVector<Value *, 4> NegatedIncomingValues(PHI->getNumOperands());
+ for (auto I : zip(PHI->incoming_values(), NegatedIncomingValues)) {
+ if (!(std::get<1>(I) =
+ negate(std::get<0>(I), Depth + 1))) // Early return.
+ return nullptr;
+ }
+ // All incoming values are indeed negatible. Create negated PHI node.
+ PHINode *NegatedPHI = Builder.CreatePHI(
+ PHI->getType(), PHI->getNumOperands(), PHI->getName() + ".neg");
+ for (auto I : zip(NegatedIncomingValues, PHI->blocks()))
+ NegatedPHI->addIncoming(std::get<0>(I), std::get<1>(I));
+ return NegatedPHI;
+ }
+ case Instruction::Select: {
+ {
+ // `abs`/`nabs` is always negatible.
+ Value *LHS, *RHS;
+ SelectPatternFlavor SPF =
+ matchSelectPattern(I, LHS, RHS, /*CastOp=*/nullptr, Depth).Flavor;
+ if (SPF == SPF_ABS || SPF == SPF_NABS) {
+ auto *NewSelect = cast<SelectInst>(I->clone());
+ // Just swap the operands of the select.
+ NewSelect->swapValues();
+ // Don't swap prof metadata, we didn't change the branch behavior.
+ NewSelect->setName(I->getName() + ".neg");
+ Builder.Insert(NewSelect);
+ return NewSelect;
+ }
+ }
+ // `select` is negatible if both hands of `select` are negatible.
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1) // Early return.
+ return nullptr;
+ Value *NegOp2 = negate(I->getOperand(2), Depth + 1);
+ if (!NegOp2)
+ return nullptr;
+ // Do preserve the metadata!
+ return Builder.CreateSelect(I->getOperand(0), NegOp1, NegOp2,
+ I->getName() + ".neg", /*MDFrom=*/I);
+ }
+ case Instruction::ShuffleVector: {
+ // `shufflevector` is negatible if both operands are negatible.
+ auto *Shuf = cast<ShuffleVectorInst>(I);
+ Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp0) // Early return.
+ return nullptr;
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1)
+ return nullptr;
+ return Builder.CreateShuffleVector(NegOp0, NegOp1, Shuf->getShuffleMask(),
+ I->getName() + ".neg");
+ }
+ case Instruction::ExtractElement: {
+ // `extractelement` is negatible if source operand is negatible.
+ auto *EEI = cast<ExtractElementInst>(I);
+ Value *NegVector = negate(EEI->getVectorOperand(), Depth + 1);
+ if (!NegVector) // Early return.
+ return nullptr;
+ return Builder.CreateExtractElement(NegVector, EEI->getIndexOperand(),
+ I->getName() + ".neg");
+ }
+ case Instruction::InsertElement: {
+ // `insertelement` is negatible if both the source vector and
+ // element-to-be-inserted are negatible.
+ auto *IEI = cast<InsertElementInst>(I);
+ Value *NegVector = negate(IEI->getOperand(0), Depth + 1);
+ if (!NegVector) // Early return.
+ return nullptr;
+ Value *NegNewElt = negate(IEI->getOperand(1), Depth + 1);
+ if (!NegNewElt) // Early return.
+ return nullptr;
+ return Builder.CreateInsertElement(NegVector, NegNewElt, IEI->getOperand(2),
+ I->getName() + ".neg");
+ }
+ case Instruction::Trunc: {
+ // `trunc` is negatible if its operand is negatible.
+ Value *NegOp = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp) // Early return.
+ return nullptr;
+ return Builder.CreateTrunc(NegOp, I->getType(), I->getName() + ".neg");
+ }
+ case Instruction::Shl: {
+ // `shl` is negatible if the first operand is negatible.
+ Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp0) // Early return.
+ return nullptr;
+ return Builder.CreateShl(NegOp0, I->getOperand(1), I->getName() + ".neg");
+ }
+ case Instruction::Or:
+ if (!haveNoCommonBitsSet(I->getOperand(0), I->getOperand(1), DL, &AC, I,
+ &DT))
+ return nullptr; // Don't know how to handle `or` in general.
+ // `or`/`add` are interchangeable when operands have no common bits set.
+ // `inc` is always negatible.
+ if (match(I->getOperand(1), m_One()))
+ return Builder.CreateNot(I->getOperand(0), I->getName() + ".neg");
+ // Else, just defer to Instruction::Add handling.
+ LLVM_FALLTHROUGH;
+ case Instruction::Add: {
+ // `add` is negatible if both of its operands are negatible.
+ Value *NegOp0 = negate(I->getOperand(0), Depth + 1);
+ if (!NegOp0) // Early return.
+ return nullptr;
+ Value *NegOp1 = negate(I->getOperand(1), Depth + 1);
+ if (!NegOp1)
+ return nullptr;
+ return Builder.CreateAdd(NegOp0, NegOp1, I->getName() + ".neg");
+ }
+ case Instruction::Xor:
+ // `xor` is negatible if one of its operands is invertible.
+ // FIXME: InstCombineInverter? But how to connect Inverter and Negator?
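+ // (-(X ^ C) == ~(X ^ C) + 1 == (X ^ ~C) + 1, which is what we build below.)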
+ if (auto *C = dyn_cast<Constant>(I->getOperand(1))) {
+ Value *Xor = Builder.CreateXor(I->getOperand(0), ConstantExpr::getNot(C));
+ return Builder.CreateAdd(Xor, ConstantInt::get(Xor->getType(), 1),
+ I->getName() + ".neg");
+ }
+ return nullptr;
+ case Instruction::Mul: {
+ // `mul` is negatible if one of its operands is negatible.
+ Value *NegatedOp, *OtherOp;
+ // Try the second operand first: if it is a constant, it is best to simply
+ // negate the constant instead of sinking the `neg` deeper.
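+ // (E.g. -(X * C) is better rewritten as X * -C than as (-X) * C.)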
+ if (Value *NegOp1 = negate(I->getOperand(1), Depth + 1)) {
+ NegatedOp = NegOp1;
+ OtherOp = I->getOperand(0);
+ } else if (Value *NegOp0 = negate(I->getOperand(0), Depth + 1)) {
+ NegatedOp = NegOp0;
+ OtherOp = I->getOperand(1);
+ } else
+ // Can't negate either of them.
+ return nullptr;
+ return Builder.CreateMul(NegatedOp, OtherOp, I->getName() + ".neg");
+ }
+ default:
+ return nullptr; // Don't know, likely not negatible for free.
+ }
+
+ llvm_unreachable("Can't get here. We always return from switch.");
+}
+
+LLVM_NODISCARD Value *Negator::negate(Value *V, unsigned Depth) {
+ NegatorMaxDepthVisited.updateMax(Depth);
+ ++NegatorNumValuesVisited;
+
+#if LLVM_ENABLE_STATS
+ ++NumValuesVisitedInThisNegator;
+#endif
+
+#ifndef NDEBUG
+ // We can't ever have a Value with such an address.
+ Value *Placeholder = reinterpret_cast<Value *>(static_cast<uintptr_t>(-1));
+#endif
+
+ // Did we already try to negate this value?
+ auto NegationsCacheIterator = NegationsCache.find(V);
+ if (NegationsCacheIterator != NegationsCache.end()) {
+ ++NegatorNumNegationsFoundInCache;
+ Value *NegatedV = NegationsCacheIterator->second;
+ assert(NegatedV != Placeholder && "Encountered a cycle during negation.");
+ return NegatedV;
+ }
+
+#ifndef NDEBUG
+ // We did not find a cached result for negation of V. While we are at it,
+ // let's temporarily cache a placeholder value, with the idea that if later
+ // during negation we fetch it from cache, we'll know we're in a cycle.
+ NegationsCache[V] = Placeholder;
+#endif
+
+ // No luck. Try negating it for real.
+ Value *NegatedV = visitImpl(V, Depth);
+ // And cache the (real) result for the future.
+ NegationsCache[V] = NegatedV;
+
+ return NegatedV;
+}
+
+LLVM_NODISCARD Optional<Negator::Result> Negator::run(Value *Root) {
+ Value *Negated = negate(Root, /*Depth=*/0);
+ if (!Negated) {
+ // We must cleanup newly-inserted instructions, to avoid any potential
+ // endless combine looping.
+ llvm::for_each(llvm::reverse(NewInstructions),
+ [&](Instruction *I) { I->eraseFromParent(); });
+ return llvm::None;
+ }
+ return std::make_pair(ArrayRef<Instruction *>(NewInstructions), Negated);
+}
+
+LLVM_NODISCARD Value *Negator::Negate(bool LHSIsZero, Value *Root,
+ InstCombiner &IC) {
+ ++NegatorTotalNegationsAttempted;
+ LLVM_DEBUG(dbgs() << "Negator: attempting to sink negation into " << *Root
+ << "\n");
+
+ if (!NegatorEnabled || !DebugCounter::shouldExecute(NegatorCounter))
+ return nullptr;
+
+ Negator N(Root->getContext(), IC.getDataLayout(), IC.getAssumptionCache(),
+ IC.getDominatorTree(), LHSIsZero);
+ Optional<Result> Res = N.run(Root);
+ if (!Res) { // Negation failed.
+ LLVM_DEBUG(dbgs() << "Negator: failed to sink negation into " << *Root
+ << "\n");
+ return nullptr;
+ }
+
+ LLVM_DEBUG(dbgs() << "Negator: successfully sunk negation into " << *Root
+ << "\n NEW: " << *Res->second << "\n");
+ ++NegatorNumTreesNegated;
+
+ // We must temporarily unset the 'current' insertion point and DebugLoc of the
+ // InstCombine's IRBuilder so that it won't interfere with the ones we have
+ // already specified when producing negated instructions.
+ InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.ClearInsertionPoint();
+ IC.Builder.SetCurrentDebugLocation(DebugLoc());
+
+ // And finally, we must add newly-created instructions into the InstCombine's
+ // worklist (in a proper order!) so it can attempt to combine them.
+ LLVM_DEBUG(dbgs() << "Negator: Propagating " << Res->first.size()
+ << " instrs to InstCombine\n");
+ NegatorMaxInstructionsCreated.updateMax(Res->first.size());
+ NegatorNumInstructionsNegatedSuccess += Res->first.size();
+
+ // They are in def-use order, so nothing fancy, just insert them in order.
+ llvm::for_each(Res->first,
+ [&](Instruction *I) { IC.Builder.Insert(I, I->getName()); });
+
+ // And return the new root.
+ return Res->second;
+}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
index 74e015a4f1d4..2b2f2e1b9470 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombinePHI.cpp
@@ -218,13 +218,21 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
return nullptr;
// If any of the operands that require casting is a terminator
- // instruction, do not do it.
+ // instruction, do not do it. Similarly, do not do the transform if the value
+ // is a PHI in a block with no insertion point, for example, a catchswitch
+ // block, since we will not be able to insert a cast after the PHI.
if (any_of(AvailablePtrVals, [&](Value *V) {
if (V->getType() == IntToPtr->getType())
return false;
-
auto *Inst = dyn_cast<Instruction>(V);
- return Inst && Inst->isTerminator();
+ if (!Inst)
+ return false;
+ if (Inst->isTerminator())
+ return true;
+ auto *BB = Inst->getParent();
+ if (isa<PHINode>(Inst) && BB->getFirstInsertionPt() == BB->end())
+ return true;
+ return false;
}))
return nullptr;
@@ -264,8 +272,10 @@ Instruction *InstCombiner::FoldIntegerTypedPHI(PHINode &PN) {
if (auto *IncomingI = dyn_cast<Instruction>(IncomingVal)) {
BasicBlock::iterator InsertPos(IncomingI);
InsertPos++;
+ BasicBlock *BB = IncomingI->getParent();
if (isa<PHINode>(IncomingI))
- InsertPos = IncomingI->getParent()->getFirstInsertionPt();
+ InsertPos = BB->getFirstInsertionPt();
+ assert(InsertPos != BB->end() && "should have checked above");
InsertNewInstBefore(CI, *InsertPos);
} else {
auto *InsertBB = &IncomingBB->getParent()->getEntryBlock();
@@ -544,7 +554,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
// visitLoadInst will propagate an alignment onto the load when TD is around,
// and if TD isn't around, we can't handle the mixed case.
bool isVolatile = FirstLI->isVolatile();
- MaybeAlign LoadAlignment(FirstLI->getAlignment());
+ Align LoadAlignment = FirstLI->getAlign();
unsigned LoadAddrSpace = FirstLI->getPointerAddressSpace();
// We can't sink the load if the loaded value could be modified between the
@@ -574,12 +584,7 @@ Instruction *InstCombiner::FoldPHIArgLoadIntoPHI(PHINode &PN) {
!isSafeAndProfitableToSinkLoad(LI))
return nullptr;
- // If some of the loads have an alignment specified but not all of them,
- // we can't do the transformation.
- if ((LoadAlignment.hasValue()) != (LI->getAlignment() != 0))
- return nullptr;
-
- LoadAlignment = std::min(LoadAlignment, MaybeAlign(LI->getAlignment()));
+ LoadAlignment = std::min(LoadAlignment, Align(LI->getAlign()));
// If the PHI is of volatile loads and the load block has multiple
// successors, sinking it would remove a load of the volatile value from
@@ -1184,15 +1189,22 @@ Instruction *InstCombiner::visitPHINode(PHINode &PN) {
if (CmpInst && isa<IntegerType>(PN.getType()) && CmpInst->isEquality() &&
match(CmpInst->getOperand(1), m_Zero())) {
ConstantInt *NonZeroConst = nullptr;
+ bool MadeChange = false;
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
Instruction *CtxI = PN.getIncomingBlock(i)->getTerminator();
Value *VA = PN.getIncomingValue(i);
if (isKnownNonZero(VA, DL, 0, &AC, CtxI, &DT)) {
if (!NonZeroConst)
NonZeroConst = GetAnyNonZeroConstInt(PN);
- PN.setIncomingValue(i, NonZeroConst);
+
+ if (NonZeroConst != VA) {
+ replaceOperand(PN, i, NonZeroConst);
+ MadeChange = true;
+ }
}
}
+ if (MadeChange)
+ return &PN;
}
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 05a624fde86b..17124f717af7 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -56,7 +56,8 @@ static Value *createMinMax(InstCombiner::BuilderTy &Builder,
/// Replace a select operand based on an equality comparison with the identity
/// constant of a binop.
static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
- const TargetLibraryInfo &TLI) {
+ const TargetLibraryInfo &TLI,
+ InstCombiner &IC) {
// The select condition must be an equality compare with a constant operand.
Value *X;
Constant *C;
@@ -107,8 +108,7 @@ static Instruction *foldSelectBinOpIdentity(SelectInst &Sel,
// S = { select (cmp eq X, C), BO, ? } or { select (cmp ne X, C), ?, BO }
// =>
// S = { select (cmp eq X, C), Y, ? } or { select (cmp ne X, C), ?, Y }
- Sel.setOperand(IsEq ? 1 : 2, Y);
- return &Sel;
+ return IC.replaceOperand(Sel, IsEq ? 1 : 2, Y);
}
/// This folds:
@@ -301,10 +301,11 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
// The select condition may be a vector. We may only change the operand
// type if the vector width remains the same (and matches the condition).
- if (CondTy->isVectorTy()) {
+ if (auto *CondVTy = dyn_cast<VectorType>(CondTy)) {
if (!FIOpndTy->isVectorTy())
return nullptr;
- if (CondTy->getVectorNumElements() != FIOpndTy->getVectorNumElements())
+ if (CondVTy->getNumElements() !=
+ cast<VectorType>(FIOpndTy)->getNumElements())
return nullptr;
// TODO: If the backend knew how to deal with casts better, we could
@@ -338,11 +339,7 @@ Instruction *InstCombiner::foldSelectOpOp(SelectInst &SI, Instruction *TI,
if (match(TI, m_FNeg(m_Value(X))) && match(FI, m_FNeg(m_Value(Y))) &&
(TI->hasOneUse() || FI->hasOneUse())) {
Value *NewSel = Builder.CreateSelect(Cond, X, Y, SI.getName() + ".v", &SI);
- // TODO: Remove the hack for the binop form when the unary op is optimized
- // properly with all IR passes.
- if (TI->getOpcode() != Instruction::FNeg)
- return BinaryOperator::CreateFNegFMF(NewSel, cast<BinaryOperator>(TI));
- return UnaryOperator::CreateFNeg(NewSel);
+ return UnaryOperator::CreateFNegFMF(NewSel, TI);
}
// Only handle binary operators (including two-operand getelementptr) with
@@ -674,6 +671,38 @@ static Value *foldSelectICmpAndOr(const ICmpInst *IC, Value *TrueVal,
return Builder.CreateOr(V, Y);
}
+/// Canonicalize a set or clear of a masked set of constant bits to
+/// select-of-constants form.
+static Instruction *foldSetClearBits(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *T = Sel.getTrueValue();
+ Value *F = Sel.getFalseValue();
+ Type *Ty = Sel.getType();
+ Value *X;
+ const APInt *NotC, *C;
+
+ // Cond ? (X & ~C) : (X | C) --> (X & ~C) | (Cond ? 0 : C)
+ if (match(T, m_And(m_Value(X), m_APInt(NotC))) &&
+ match(F, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Constant *OrC = ConstantInt::get(Ty, *C);
+ Value *NewSel = Builder.CreateSelect(Cond, Zero, OrC, "masksel", &Sel);
+ return BinaryOperator::CreateOr(T, NewSel);
+ }
+
+ // Cond ? (X | C) : (X & ~C) --> (X & ~C) | (Cond ? C : 0)
+ if (match(F, m_And(m_Value(X), m_APInt(NotC))) &&
+ match(T, m_OneUse(m_Or(m_Specific(X), m_APInt(C)))) && *NotC == ~(*C)) {
+ Constant *Zero = ConstantInt::getNullValue(Ty);
+ Constant *OrC = ConstantInt::get(Ty, *C);
+ Value *NewSel = Builder.CreateSelect(Cond, OrC, Zero, "masksel", &Sel);
+ return BinaryOperator::CreateOr(F, NewSel);
+ }
+
+ return nullptr;
+}
+
/// Transform patterns such as (a > b) ? a - b : 0 into usub.sat(a, b).
/// There are 8 commuted/swapped variants of this pattern.
/// TODO: Also support a - UMIN(a,b) patterns.
@@ -857,16 +886,16 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
if (!ICI->isEquality() || !match(CmpRHS, m_Zero()))
return nullptr;
- Value *Count = FalseVal;
+ Value *SelectArg = FalseVal;
Value *ValueOnZero = TrueVal;
if (Pred == ICmpInst::ICMP_NE)
- std::swap(Count, ValueOnZero);
+ std::swap(SelectArg, ValueOnZero);
// Skip zero extend/truncate.
- Value *V = nullptr;
- if (match(Count, m_ZExt(m_Value(V))) ||
- match(Count, m_Trunc(m_Value(V))))
- Count = V;
+ Value *Count = nullptr;
+ if (!match(SelectArg, m_ZExt(m_Value(Count))) &&
+ !match(SelectArg, m_Trunc(m_Value(Count))))
+ Count = SelectArg;
// Check that 'Count' is a call to intrinsic cttz/ctlz. Also check that the
// input to the cttz/ctlz is used as LHS for the compare instruction.
@@ -880,17 +909,17 @@ static Value *foldSelectCttzCtlz(ICmpInst *ICI, Value *TrueVal, Value *FalseVal,
// sizeof in bits of 'Count'.
unsigned SizeOfInBits = Count->getType()->getScalarSizeInBits();
if (match(ValueOnZero, m_SpecificInt(SizeOfInBits))) {
- // Explicitly clear the 'undef_on_zero' flag.
- IntrinsicInst *NewI = cast<IntrinsicInst>(II->clone());
- NewI->setArgOperand(1, ConstantInt::getFalse(NewI->getContext()));
- Builder.Insert(NewI);
- return Builder.CreateZExtOrTrunc(NewI, ValueOnZero->getType());
+ // Explicitly clear the 'undef_on_zero' flag. It's always valid to go from
+ // true to false on this flag, so we can replace it for all users.
+ II->setArgOperand(1, ConstantInt::getFalse(II->getContext()));
+ return SelectArg;
}
- // If the ValueOnZero is not the bitwidth, we can at least make use of the
- // fact that the cttz/ctlz result will not be used if the input is zero, so
- // it's okay to relax it to undef for that case.
- if (II->hasOneUse() && !match(II->getArgOperand(1), m_One()))
+ // The ValueOnZero is not the bitwidth. But if the cttz/ctlz (and optional
+ // zext/trunc) have one use (ending at the select), the cttz/ctlz result will
+ // not be used if the input is zero. Relax to 'undef_on_zero' for that case.
+ if (II->hasOneUse() && SelectArg->hasOneUse() &&
+ !match(II->getArgOperand(1), m_One()))
II->setArgOperand(1, ConstantInt::getTrue(II->getContext()));
return nullptr;
@@ -997,7 +1026,7 @@ static bool adjustMinMax(SelectInst &Sel, ICmpInst &Cmp) {
/// constant operand of the select.
static Instruction *
canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
+ InstCombiner &IC) {
if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
return nullptr;
@@ -1013,8 +1042,14 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
Cmp.getPredicate() == CanonicalPred)
return nullptr;
+ // Bail out on unsimplified X-0 operand (due to some worklist management bug),
+ // as this may cause an infinite combine loop. Let the sub be folded first.
+ if (match(LHS, m_Sub(m_Value(), m_Zero())) ||
+ match(RHS, m_Sub(m_Value(), m_Zero())))
+ return nullptr;
+
// Create the canonical compare and plug it into the select.
- Sel.setCondition(Builder.CreateICmp(CanonicalPred, LHS, RHS));
+ IC.replaceOperand(Sel, 0, IC.Builder.CreateICmp(CanonicalPred, LHS, RHS));
// If the select operands did not change, we're done.
if (Sel.getTrueValue() == LHS && Sel.getFalseValue() == RHS)
@@ -1035,7 +1070,7 @@ canonicalizeMinMaxWithConstant(SelectInst &Sel, ICmpInst &Cmp,
/// Canonicalize all these variants to 1 pattern.
/// This makes CSE more likely.
static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
+ InstCombiner &IC) {
if (!Cmp.hasOneUse() || !isa<Constant>(Cmp.getOperand(1)))
return nullptr;
@@ -1067,10 +1102,11 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
if (CmpCanonicalized && RHSCanonicalized)
return nullptr;
- // If RHS is used by other instructions except compare and select, don't
- // canonicalize it to not increase the instruction count.
- if (!(RHS->hasOneUse() || (RHS->hasNUses(2) && CmpUsesNegatedOp)))
- return nullptr;
+ // If RHS is not canonical but is used by other instructions, don't
+ // canonicalize it and potentially increase the instruction count.
+ if (!RHSCanonicalized)
+ if (!(RHS->hasOneUse() || (RHS->hasNUses(2) && CmpUsesNegatedOp)))
+ return nullptr;
// Create the canonical compare: icmp slt LHS 0.
if (!CmpCanonicalized) {
@@ -1083,12 +1119,14 @@ static Instruction *canonicalizeAbsNabs(SelectInst &Sel, ICmpInst &Cmp,
// Create the canonical RHS: RHS = sub (0, LHS).
if (!RHSCanonicalized) {
assert(RHS->hasOneUse() && "RHS use number is not right");
- RHS = Builder.CreateNeg(LHS);
+ RHS = IC.Builder.CreateNeg(LHS);
if (TVal == LHS) {
- Sel.setFalseValue(RHS);
+ // Replace false value.
+ IC.replaceOperand(Sel, 2, RHS);
FVal = RHS;
} else {
- Sel.setTrueValue(RHS);
+ // Replace true value.
+ IC.replaceOperand(Sel, 1, RHS);
TVal = RHS;
}
}
@@ -1322,7 +1360,7 @@ static Instruction *canonicalizeClampLike(SelectInst &Sel0, ICmpInst &Cmp0,
// and swap the hands of select.
static Instruction *
tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
- InstCombiner::BuilderTy &Builder) {
+ InstCombiner &IC) {
ICmpInst::Predicate Pred;
Value *X;
Constant *C0;
@@ -1374,13 +1412,13 @@ tryToReuseConstantFromSelectInComparison(SelectInst &Sel, ICmpInst &Cmp,
return nullptr;
// It matched! Lets insert the new comparison just before select.
- InstCombiner::BuilderTy::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(&Sel);
+ InstCombiner::BuilderTy::InsertPointGuard Guard(IC.Builder);
+ IC.Builder.SetInsertPoint(&Sel);
Pred = ICmpInst::getSwappedPredicate(Pred); // Yes, swapped.
- Value *NewCmp = Builder.CreateICmp(Pred, X, FlippedStrictness->second,
- Cmp.getName() + ".inv");
- Sel.setCondition(NewCmp);
+ Value *NewCmp = IC.Builder.CreateICmp(Pred, X, FlippedStrictness->second,
+ Cmp.getName() + ".inv");
+ IC.replaceOperand(Sel, 0, NewCmp);
Sel.swapValues();
Sel.swapProfMetadata();
@@ -1393,17 +1431,17 @@ Instruction *InstCombiner::foldSelectInstWithICmp(SelectInst &SI,
if (Value *V = foldSelectValueEquivalence(SI, *ICI, SQ))
return replaceInstUsesWith(SI, V);
- if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, Builder))
+ if (Instruction *NewSel = canonicalizeMinMaxWithConstant(SI, *ICI, *this))
return NewSel;
- if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, Builder))
+ if (Instruction *NewAbs = canonicalizeAbsNabs(SI, *ICI, *this))
return NewAbs;
if (Instruction *NewAbs = canonicalizeClampLike(SI, *ICI, Builder))
return NewAbs;
if (Instruction *NewSel =
- tryToReuseConstantFromSelectInComparison(SI, *ICI, Builder))
+ tryToReuseConstantFromSelectInComparison(SI, *ICI, *this))
return NewSel;
bool Changed = adjustMinMax(SI, *ICI);
@@ -1892,7 +1930,7 @@ Instruction *InstCombiner::foldSelectExtConst(SelectInst &Sel) {
Type *SelType = Sel.getType();
Constant *TruncC = ConstantExpr::getTrunc(C, SmallType);
Constant *ExtC = ConstantExpr::getCast(ExtOpcode, TruncC, SelType);
- if (ExtC == C) {
+ if (ExtC == C && ExtInst->hasOneUse()) {
Value *TruncCVal = cast<Value>(TruncC);
if (ExtInst == Sel.getFalseValue())
std::swap(X, TruncCVal);
@@ -1931,10 +1969,9 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
if (!CondVal->getType()->isVectorTy() || !match(CondVal, m_Constant(CondC)))
return nullptr;
- unsigned NumElts = CondVal->getType()->getVectorNumElements();
- SmallVector<Constant *, 16> Mask;
+ unsigned NumElts = cast<VectorType>(CondVal->getType())->getNumElements();
+ SmallVector<int, 16> Mask;
Mask.reserve(NumElts);
- Type *Int32Ty = Type::getInt32Ty(CondVal->getContext());
for (unsigned i = 0; i != NumElts; ++i) {
Constant *Elt = CondC->getAggregateElement(i);
if (!Elt)
@@ -1942,10 +1979,10 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
if (Elt->isOneValue()) {
// If the select condition element is true, choose from the 1st vector.
- Mask.push_back(ConstantInt::get(Int32Ty, i));
+ Mask.push_back(i);
} else if (Elt->isNullValue()) {
// If the select condition element is false, choose from the 2nd vector.
- Mask.push_back(ConstantInt::get(Int32Ty, i + NumElts));
+ Mask.push_back(i + NumElts);
} else if (isa<UndefValue>(Elt)) {
// Undef in a select condition (choose one of the operands) does not mean
// the same thing as undef in a shuffle mask (any value is acceptable), so
@@ -1957,8 +1994,7 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
}
}
- return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(),
- ConstantVector::get(Mask));
+ return new ShuffleVectorInst(SI.getTrueValue(), SI.getFalseValue(), Mask);
}
/// If we have a select of vectors with a scalar condition, try to convert that
@@ -1966,23 +2002,21 @@ static Instruction *canonicalizeSelectToShuffle(SelectInst &SI) {
/// other operations in IR and having all operands of a select be vector types
/// is likely better for vector codegen.
static Instruction *canonicalizeScalarSelectOfVecs(
- SelectInst &Sel, InstCombiner::BuilderTy &Builder) {
- Type *Ty = Sel.getType();
- if (!Ty->isVectorTy())
+ SelectInst &Sel, InstCombiner &IC) {
+ auto *Ty = dyn_cast<VectorType>(Sel.getType());
+ if (!Ty)
return nullptr;
// We can replace a single-use extract with constant index.
Value *Cond = Sel.getCondition();
- if (!match(Cond, m_OneUse(m_ExtractElement(m_Value(), m_ConstantInt()))))
+ if (!match(Cond, m_OneUse(m_ExtractElt(m_Value(), m_ConstantInt()))))
return nullptr;
// select (extelt V, Index), T, F --> select (splat V, Index), T, F
// Splatting the extracted condition reduces code (we could directly create a
// splat shuffle of the source vector to eliminate the intermediate step).
- unsigned NumElts = Ty->getVectorNumElements();
- Value *SplatCond = Builder.CreateVectorSplat(NumElts, Cond);
- Sel.setCondition(SplatCond);
- return &Sel;
+ unsigned NumElts = Ty->getNumElements();
+ return IC.replaceOperand(Sel, 0, IC.Builder.CreateVectorSplat(NumElts, Cond));
}
/// Reuse bitcasted operands between a compare and select:
@@ -2055,7 +2089,7 @@ static Instruction *foldSelectCmpBitcasts(SelectInst &Sel,
/// %1 = extractvalue { i64, i1 } %0, 0
/// ret i64 %1
///
-static Instruction *foldSelectCmpXchg(SelectInst &SI) {
+static Value *foldSelectCmpXchg(SelectInst &SI) {
// A helper that determines if V is an extractvalue instruction whose
// aggregate operand is a cmpxchg instruction and whose single index is equal
// to I. If such conditions are true, the helper returns the cmpxchg
@@ -2087,19 +2121,15 @@ static Instruction *foldSelectCmpXchg(SelectInst &SI) {
// value of the same cmpxchg used by the condition, and the false value is the
// cmpxchg instruction's compare operand.
if (auto *X = isExtractFromCmpXchg(SI.getTrueValue(), 0))
- if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue()) {
- SI.setTrueValue(SI.getFalseValue());
- return &SI;
- }
+ if (X == CmpXchg && X->getCompareOperand() == SI.getFalseValue())
+ return SI.getFalseValue();
// Check the false value case: The false value of the select is the returned
// value of the same cmpxchg used by the condition, and the true value is the
// cmpxchg instruction's compare operand.
if (auto *X = isExtractFromCmpXchg(SI.getFalseValue(), 0))
- if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue()) {
- SI.setTrueValue(SI.getFalseValue());
- return &SI;
- }
+ if (X == CmpXchg && X->getCompareOperand() == SI.getTrueValue())
+ return SI.getFalseValue();
return nullptr;
}
@@ -2317,6 +2347,174 @@ static Instruction *foldSelectRotate(SelectInst &Sel) {
return IntrinsicInst::Create(F, { TVal, TVal, ShAmt });
}
+static Instruction *foldSelectToCopysign(SelectInst &Sel,
+ InstCombiner::BuilderTy &Builder) {
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ Type *SelType = Sel.getType();
+
+ // Match select ?, TC, FC where the constants are equal but negated.
+ // TODO: Generalize to handle a negated variable operand?
+ const APFloat *TC, *FC;
+ if (!match(TVal, m_APFloat(TC)) || !match(FVal, m_APFloat(FC)) ||
+ !abs(*TC).bitwiseIsEqual(abs(*FC)))
+ return nullptr;
+
+ assert(TC != FC && "Expected equal select arms to simplify");
+
+ Value *X;
+ const APInt *C;
+ bool IsTrueIfSignSet;
+ ICmpInst::Predicate Pred;
+ if (!match(Cond, m_OneUse(m_ICmp(Pred, m_BitCast(m_Value(X)), m_APInt(C)))) ||
+ !isSignBitCheck(Pred, *C, IsTrueIfSignSet) || X->getType() != SelType)
+ return nullptr;
+
+ // If needed, negate the value that will be the sign argument of the copysign:
+ // (bitcast X) < 0 ? -TC : TC --> copysign(TC, X)
+ // (bitcast X) < 0 ? TC : -TC --> copysign(TC, -X)
+ // (bitcast X) >= 0 ? -TC : TC --> copysign(TC, -X)
+ // (bitcast X) >= 0 ? TC : -TC --> copysign(TC, X)
+ if (IsTrueIfSignSet ^ TC->isNegative())
+ X = Builder.CreateFNegFMF(X, &Sel);
+
+ // Canonicalize the magnitude argument as the positive constant since we do
+ // not care about its sign.
+ Value *MagArg = TC->isNegative() ? FVal : TVal;
+ Function *F = Intrinsic::getDeclaration(Sel.getModule(), Intrinsic::copysign,
+ Sel.getType());
+ Instruction *CopySign = IntrinsicInst::Create(F, { MagArg, X });
+ CopySign->setFastMathFlags(Sel.getFastMathFlags());
+ return CopySign;
+}
+
+Instruction *InstCombiner::foldVectorSelect(SelectInst &Sel) {
+ auto *VecTy = dyn_cast<FixedVectorType>(Sel.getType());
+ if (!VecTy)
+ return nullptr;
+
+ unsigned NumElts = VecTy->getNumElements();
+ APInt UndefElts(NumElts, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(NumElts));
+ if (Value *V = SimplifyDemandedVectorElts(&Sel, AllOnesEltMask, UndefElts)) {
+ if (V != &Sel)
+ return replaceInstUsesWith(Sel, V);
+ return &Sel;
+ }
+
+ // A select of a "select shuffle" with a common operand can be rearranged
+ // to select followed by "select shuffle". Because of poison, this only works
+ // in the case of a shuffle with no undefined mask elements.
+ Value *Cond = Sel.getCondition();
+ Value *TVal = Sel.getTrueValue();
+ Value *FVal = Sel.getFalseValue();
+ Value *X, *Y;
+ ArrayRef<int> Mask;
+ if (match(TVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
+ !is_contained(Mask, UndefMaskElem) &&
+ cast<ShuffleVectorInst>(TVal)->isSelect()) {
+ if (X == FVal) {
+ // select Cond, (shuf_sel X, Y), X --> shuf_sel X, (select Cond, Y, X)
+ Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
+ return new ShuffleVectorInst(X, NewSel, Mask);
+ }
+ if (Y == FVal) {
+ // select Cond, (shuf_sel X, Y), Y --> shuf_sel (select Cond, X, Y), Y
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
+ return new ShuffleVectorInst(NewSel, Y, Mask);
+ }
+ }
+ if (match(FVal, m_OneUse(m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask)))) &&
+ !is_contained(Mask, UndefMaskElem) &&
+ cast<ShuffleVectorInst>(FVal)->isSelect()) {
+ if (X == TVal) {
+ // select Cond, X, (shuf_sel X, Y) --> shuf_sel X, (select Cond, X, Y)
+ Value *NewSel = Builder.CreateSelect(Cond, X, Y, "sel", &Sel);
+ return new ShuffleVectorInst(X, NewSel, Mask);
+ }
+ if (Y == TVal) {
+ // select Cond, Y, (shuf_sel X, Y) --> shuf_sel (select Cond, Y, X), Y
+ Value *NewSel = Builder.CreateSelect(Cond, Y, X, "sel", &Sel);
+ return new ShuffleVectorInst(NewSel, Y, Mask);
+ }
+ }
+
+ return nullptr;
+}
+
+static Instruction *foldSelectToPhiImpl(SelectInst &Sel, BasicBlock *BB,
+ const DominatorTree &DT,
+ InstCombiner::BuilderTy &Builder) {
+ // Find the block's immediate dominator that ends with a conditional branch
+ // that matches select's condition (maybe inverted).
+ auto *IDomNode = DT[BB]->getIDom();
+ if (!IDomNode)
+ return nullptr;
+ BasicBlock *IDom = IDomNode->getBlock();
+
+ Value *Cond = Sel.getCondition();
+ Value *IfTrue, *IfFalse;
+ BasicBlock *TrueSucc, *FalseSucc;
+ if (match(IDom->getTerminator(),
+ m_Br(m_Specific(Cond), m_BasicBlock(TrueSucc),
+ m_BasicBlock(FalseSucc)))) {
+ IfTrue = Sel.getTrueValue();
+ IfFalse = Sel.getFalseValue();
+ } else if (match(IDom->getTerminator(),
+ m_Br(m_Not(m_Specific(Cond)), m_BasicBlock(TrueSucc),
+ m_BasicBlock(FalseSucc)))) {
+ IfTrue = Sel.getFalseValue();
+ IfFalse = Sel.getTrueValue();
+ } else
+ return nullptr;
+
+ // We want to replace select %cond, %a, %b with a phi that takes value %a
+ // for all incoming edges that are dominated by condition `%cond == true`,
+ // and value %b for edges dominated by condition `%cond == false`. If %a
+ // or %b are also phis from the same basic block, we can go further and take
+ // their incoming values from the corresponding blocks.
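+ // For example (illustrative IR, names made up), given
+ //   idom:  br i1 %cond, label %t, label %f
+ //   t:     br label %bb
+ //   f:     br label %bb
+ //   bb:    %r = select i1 %cond, i32 %a, i32 %b
+ // the select can become
+ //   %r = phi i32 [ %a, %t ], [ %b, %f ]
+ // because each incoming edge of %bb is dominated by the matching branch edge.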
+ BasicBlockEdge TrueEdge(IDom, TrueSucc);
+ BasicBlockEdge FalseEdge(IDom, FalseSucc);
+ DenseMap<BasicBlock *, Value *> Inputs;
+ for (auto *Pred : predecessors(BB)) {
+ // Check implication.
+ BasicBlockEdge Incoming(Pred, BB);
+ if (DT.dominates(TrueEdge, Incoming))
+ Inputs[Pred] = IfTrue->DoPHITranslation(BB, Pred);
+ else if (DT.dominates(FalseEdge, Incoming))
+ Inputs[Pred] = IfFalse->DoPHITranslation(BB, Pred);
+ else
+ return nullptr;
+ // Check availability.
+ if (auto *Insn = dyn_cast<Instruction>(Inputs[Pred]))
+ if (!DT.dominates(Insn, Pred->getTerminator()))
+ return nullptr;
+ }
+
+ Builder.SetInsertPoint(&*BB->begin());
+ auto *PN = Builder.CreatePHI(Sel.getType(), Inputs.size());
+ for (auto *Pred : predecessors(BB))
+ PN->addIncoming(Inputs[Pred], Pred);
+ PN->takeName(&Sel);
+ return PN;
+}
+
+static Instruction *foldSelectToPhi(SelectInst &Sel, const DominatorTree &DT,
+ InstCombiner::BuilderTy &Builder) {
+ // Try to replace this select with Phi in one of these blocks.
+ SmallSetVector<BasicBlock *, 4> CandidateBlocks;
+ CandidateBlocks.insert(Sel.getParent());
+ for (Value *V : Sel.operands())
+ if (auto *I = dyn_cast<Instruction>(V))
+ CandidateBlocks.insert(I->getParent());
+
+ for (BasicBlock *BB : CandidateBlocks)
+ if (auto *PN = foldSelectToPhiImpl(Sel, BB, DT, Builder))
+ return PN;
+ return nullptr;
+}
+
Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *CondVal = SI.getCondition();
Value *TrueVal = SI.getTrueValue();
@@ -2346,25 +2544,10 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (Instruction *I = canonicalizeSelectToShuffle(SI))
return I;
- if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, Builder))
+ if (Instruction *I = canonicalizeScalarSelectOfVecs(SI, *this))
return I;
- // Canonicalize a one-use integer compare with a non-canonical predicate by
- // inverting the predicate and swapping the select operands. This matches a
- // compare canonicalization for conditional branches.
- // TODO: Should we do the same for FP compares?
CmpInst::Predicate Pred;
- if (match(CondVal, m_OneUse(m_ICmp(Pred, m_Value(), m_Value()))) &&
- !isCanonicalPredicate(Pred)) {
- // Swap true/false values and condition.
- CmpInst *Cond = cast<CmpInst>(CondVal);
- Cond->setPredicate(CmpInst::getInversePredicate(Pred));
- SI.setOperand(1, FalseVal);
- SI.setOperand(2, TrueVal);
- SI.swapProfMetadata();
- Worklist.Add(Cond);
- return &SI;
- }
if (SelType->isIntOrIntVectorTy(1) &&
TrueVal->getType() == CondVal->getType()) {
@@ -2514,6 +2697,8 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return Add;
if (Instruction *Add = foldOverflowingAddSubSelect(SI, Builder))
return Add;
+ if (Instruction *Or = foldSetClearBits(SI, Builder))
+ return Or;
// Turn (select C, (op X, Y), (op X, Z)) -> (op X, (select C, Y, Z))
auto *TI = dyn_cast<Instruction>(TrueVal);
@@ -2650,16 +2835,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (TrueSI->getCondition() == CondVal) {
if (SI.getTrueValue() == TrueSI->getTrueValue())
return nullptr;
- SI.setOperand(1, TrueSI->getTrueValue());
- return &SI;
+ return replaceOperand(SI, 1, TrueSI->getTrueValue());
}
// select(C0, select(C1, a, b), b) -> select(C0&C1, a, b)
// We choose this as normal form to enable folding on the And and shortening
// paths for the values (this helps GetUnderlyingObjects() for example).
if (TrueSI->getFalseValue() == FalseVal && TrueSI->hasOneUse()) {
Value *And = Builder.CreateAnd(CondVal, TrueSI->getCondition());
- SI.setOperand(0, And);
- SI.setOperand(1, TrueSI->getTrueValue());
+ replaceOperand(SI, 0, And);
+ replaceOperand(SI, 1, TrueSI->getTrueValue());
return &SI;
}
}
@@ -2670,14 +2854,13 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
if (FalseSI->getCondition() == CondVal) {
if (SI.getFalseValue() == FalseSI->getFalseValue())
return nullptr;
- SI.setOperand(2, FalseSI->getFalseValue());
- return &SI;
+ return replaceOperand(SI, 2, FalseSI->getFalseValue());
}
// select(C0, a, select(C1, a, b)) -> select(C0|C1, a, b)
if (FalseSI->getTrueValue() == TrueVal && FalseSI->hasOneUse()) {
Value *Or = Builder.CreateOr(CondVal, FalseSI->getCondition());
- SI.setOperand(0, Or);
- SI.setOperand(2, FalseSI->getFalseValue());
+ replaceOperand(SI, 0, Or);
+ replaceOperand(SI, 2, FalseSI->getFalseValue());
return &SI;
}
}
@@ -2704,15 +2887,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
canMergeSelectThroughBinop(TrueBO)) {
if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(0))) {
if (TrueBOSI->getCondition() == CondVal) {
- TrueBO->setOperand(0, TrueBOSI->getTrueValue());
- Worklist.Add(TrueBO);
+ replaceOperand(*TrueBO, 0, TrueBOSI->getTrueValue());
+ Worklist.push(TrueBO);
return &SI;
}
}
if (auto *TrueBOSI = dyn_cast<SelectInst>(TrueBO->getOperand(1))) {
if (TrueBOSI->getCondition() == CondVal) {
- TrueBO->setOperand(1, TrueBOSI->getTrueValue());
- Worklist.Add(TrueBO);
+ replaceOperand(*TrueBO, 1, TrueBOSI->getTrueValue());
+ Worklist.push(TrueBO);
return &SI;
}
}
@@ -2724,15 +2907,15 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
canMergeSelectThroughBinop(FalseBO)) {
if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(0))) {
if (FalseBOSI->getCondition() == CondVal) {
- FalseBO->setOperand(0, FalseBOSI->getFalseValue());
- Worklist.Add(FalseBO);
+ replaceOperand(*FalseBO, 0, FalseBOSI->getFalseValue());
+ Worklist.push(FalseBO);
return &SI;
}
}
if (auto *FalseBOSI = dyn_cast<SelectInst>(FalseBO->getOperand(1))) {
if (FalseBOSI->getCondition() == CondVal) {
- FalseBO->setOperand(1, FalseBOSI->getFalseValue());
- Worklist.Add(FalseBO);
+ replaceOperand(*FalseBO, 1, FalseBOSI->getFalseValue());
+ Worklist.push(FalseBO);
return &SI;
}
}
@@ -2740,23 +2923,14 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
Value *NotCond;
if (match(CondVal, m_Not(m_Value(NotCond)))) {
- SI.setOperand(0, NotCond);
- SI.setOperand(1, FalseVal);
- SI.setOperand(2, TrueVal);
+ replaceOperand(SI, 0, NotCond);
+ SI.swapValues();
SI.swapProfMetadata();
return &SI;
}
- if (VectorType *VecTy = dyn_cast<VectorType>(SelType)) {
- unsigned VWidth = VecTy->getNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&SI, AllOnesEltMask, UndefElts)) {
- if (V != &SI)
- return replaceInstUsesWith(SI, V);
- return &SI;
- }
- }
+ if (Instruction *I = foldVectorSelect(SI))
+ return I;
// If we can compute the condition, there's no need for a select.
// Like the above fold, we are attempting to reduce compile-time cost by
@@ -2776,14 +2950,20 @@ Instruction *InstCombiner::visitSelectInst(SelectInst &SI) {
return BitCastSel;
// Simplify selects that test the returned flag of cmpxchg instructions.
- if (Instruction *Select = foldSelectCmpXchg(SI))
- return Select;
+ if (Value *V = foldSelectCmpXchg(SI))
+ return replaceInstUsesWith(SI, V);
- if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI))
+ if (Instruction *Select = foldSelectBinOpIdentity(SI, TLI, *this))
return Select;
if (Instruction *Rot = foldSelectRotate(SI))
return Rot;
+ if (Instruction *Copysign = foldSelectToCopysign(SI, Builder))
+ return Copysign;
+
+ if (Instruction *PN = foldSelectToPhi(SI, DT, Builder))
+ return replaceInstUsesWith(SI, PN);
+
return nullptr;
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
index fbff5dd4a8cd..0a842b4e1047 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineShifts.cpp
@@ -23,8 +23,11 @@ using namespace PatternMatch;
// Given pattern:
// (x shiftopcode Q) shiftopcode K
// we should rewrite it as
-// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x)
-// This is valid for any shift, but they must be identical.
+// x shiftopcode (Q+K) iff (Q+K) u< bitwidth(x)
+//
+// This is valid for any shift, but they must be identical, and we must be
+// careful in case we have (zext(Q)+zext(K)) and have looked past extensions:
+// (Q+K) must not overflow, or else (Q+K) u< bitwidth(x) is bogus.
//
// AnalyzeForSignBitExtraction indicates that we will only analyze whether this
// pattern has any 2 right-shifts that sum to 1 less than original bit width.
@@ -58,6 +61,23 @@ Value *InstCombiner::reassociateShiftAmtsOfTwoSameDirectionShifts(
if (ShAmt0->getType() != ShAmt1->getType())
return nullptr;
+ // As input, we have the following pattern:
+ // Sh0 (Sh1 X, Q), K
+ // We want to rewrite that as:
+ // Sh x, (Q+K) iff (Q+K) u< bitwidth(x)
+ // While we know that originally (Q+K) would not overflow
+ // (because 2 * (N-1) u<= iN -1), we have looked past extensions of
+ // shift amounts, so it may now overflow in the smaller bit width.
+ // To ensure that does not happen, we need to ensure that the total maximal
+ // shift amount is still representable in that smaller bit width.
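+ // (E.g. for i32 shifts whose amounts were zext'ed from i5: the amounts can
+ // sum to 31 + 31 = 62, which is not representable in i5, so we must bail.)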
+ unsigned MaximalPossibleTotalShiftAmount =
+ (Sh0->getType()->getScalarSizeInBits() - 1) +
+ (Sh1->getType()->getScalarSizeInBits() - 1);
+ APInt MaximalRepresentableShiftAmount =
+ APInt::getAllOnesValue(ShAmt0->getType()->getScalarSizeInBits());
+ if (MaximalRepresentableShiftAmount.ult(MaximalPossibleTotalShiftAmount))
+ return nullptr;
+
// We are only looking for signbit extraction if we have two right shifts.
bool HadTwoRightShifts = match(Sh0, m_Shr(m_Value(), m_Value())) &&
match(Sh1, m_Shr(m_Value(), m_Value()));
@@ -388,8 +408,7 @@ Instruction *InstCombiner::commonShiftTransforms(BinaryOperator &I) {
// demand the sign bit (and many others) here??
Value *Rem = Builder.CreateAnd(A, ConstantInt::get(I.getType(), *B - 1),
Op1->getName());
- I.setOperand(1, Rem);
- return &I;
+ return replaceOperand(I, 1, Rem);
}
if (Instruction *Logic = foldShiftOfShiftedLogic(I, Builder))
@@ -593,19 +612,13 @@ static Value *getShiftedValue(Value *V, unsigned NumBits, bool isLeftShift,
// We can always evaluate constants shifted.
if (Constant *C = dyn_cast<Constant>(V)) {
if (isLeftShift)
- V = IC.Builder.CreateShl(C, NumBits);
+ return IC.Builder.CreateShl(C, NumBits);
else
- V = IC.Builder.CreateLShr(C, NumBits);
- // If we got a constantexpr back, try to simplify it with TD info.
- if (auto *C = dyn_cast<Constant>(V))
- if (auto *FoldedC =
- ConstantFoldConstant(C, DL, &IC.getTargetLibraryInfo()))
- V = FoldedC;
- return V;
+ return IC.Builder.CreateLShr(C, NumBits);
}
Instruction *I = cast<Instruction>(V);
- IC.Worklist.Add(I);
+ IC.Worklist.push(I);
switch (I->getOpcode()) {
default: llvm_unreachable("Inconsistency with CanEvaluateShifted");
@@ -761,7 +774,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(I.getContext(), Bits);
if (VectorType *VT = dyn_cast<VectorType>(X->getType()))
- Mask = ConstantVector::getSplat(VT->getNumElements(), Mask);
+ Mask = ConstantVector::getSplat(VT->getElementCount(), Mask);
return BinaryOperator::CreateAnd(X, Mask);
}
@@ -796,7 +809,7 @@ Instruction *InstCombiner::FoldShiftByConstant(Value *Op0, Constant *Op1,
APInt Bits = APInt::getHighBitsSet(TypeBits, TypeBits - Op1Val);
Constant *Mask = ConstantInt::get(I.getContext(), Bits);
if (VectorType *VT = dyn_cast<VectorType>(X->getType()))
- Mask = ConstantVector::getSplat(VT->getNumElements(), Mask);
+ Mask = ConstantVector::getSplat(VT->getElementCount(), Mask);
return BinaryOperator::CreateAnd(X, Mask);
}
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
index 47ce83974c8d..7cfe4c8b5892 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSimplifyDemanded.cpp
@@ -87,7 +87,10 @@ bool InstCombiner::SimplifyDemandedBits(Instruction *I, unsigned OpNo,
Value *NewVal = SimplifyDemandedUseBits(U.get(), DemandedMask, Known,
Depth, I);
if (!NewVal) return false;
- U = NewVal;
+ if (Instruction* OpInst = dyn_cast<Instruction>(U))
+ salvageDebugInfo(*OpInst);
+
+ replaceUse(U, NewVal);
return true;
}
@@ -173,15 +176,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
- // Output known-1 bits are only known if set in both the LHS & RHS.
- APInt IKnownOne = RHSKnown.One & LHSKnown.One;
+ Known = LHSKnown & RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
- if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
- return Constant::getIntegerValue(VTy, IKnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and'.
@@ -194,8 +194,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (ShrinkDemandedConstant(I, 1, DemandedMask & ~LHSKnown.Zero))
return I;
- Known.Zero = std::move(IKnownZero);
- Known.One = std::move(IKnownOne);
break;
}
case Instruction::Or: {
@@ -207,15 +205,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
- // Output known-1 bits are known to be set if set in either the LHS | RHS.
- APInt IKnownOne = RHSKnown.One | LHSKnown.One;
+ Known = LHSKnown | RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
- if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
- return Constant::getIntegerValue(VTy, IKnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'or'.
@@ -228,8 +223,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (ShrinkDemandedConstant(I, 1, DemandedMask))
return I;
- Known.Zero = std::move(IKnownZero);
- Known.One = std::move(IKnownOne);
break;
}
case Instruction::Xor: {
@@ -239,17 +232,12 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
assert(!RHSKnown.hasConflict() && "Bits known to be one AND zero?");
assert(!LHSKnown.hasConflict() && "Bits known to be one AND zero?");
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
- (RHSKnown.One & LHSKnown.One);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
- (RHSKnown.One & LHSKnown.Zero);
+ Known = LHSKnown ^ RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
- if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
- return Constant::getIntegerValue(VTy, IKnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(VTy, Known.One);
// If all of the demanded bits are known zero on one side, return the other.
// These bits cannot contribute to the result of the 'xor'.
@@ -309,10 +297,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
return InsertNewInstWith(NewXor, *I);
}
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- Known.Zero = std::move(IKnownZero);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- Known.One = std::move(IKnownOne);
break;
}
case Instruction::Select: {
@@ -396,8 +380,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (SimplifyDemandedBits(I, 0, InputDemandedMask, InputKnown, Depth + 1))
return I;
assert(InputKnown.getBitWidth() == SrcBitWidth && "Src width changed?");
- Known = InputKnown.zextOrTrunc(BitWidth,
- true /* ExtendedBitsAreKnownZero */);
+ Known = InputKnown.zextOrTrunc(BitWidth);
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
break;
}
@@ -453,6 +436,43 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
break;
}
case Instruction::Add:
+ if ((DemandedMask & 1) == 0) {
+ // If we do not need the low bit, try to convert bool math to logic:
+ // add iN (zext i1 X), (sext i1 Y) --> sext (~X & Y) to iN
+ Value *X, *Y;
+ if (match(I, m_c_Add(m_OneUse(m_ZExt(m_Value(X))),
+ m_OneUse(m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
+ // Truth table for inputs and output signbits:
+ // X:0 | X:1
+ // ----------
+ // Y:0 | 0 | 0 |
+ // Y:1 | -1 | 0 |
+ // ----------
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *AndNot = Builder.CreateAnd(Builder.CreateNot(X), Y);
+ return Builder.CreateSExt(AndNot, VTy);
+ }
+
+ // add iN (sext i1 X), (sext i1 Y) --> sext (X | Y) to iN
+ // TODO: Relax the one-use checks because we are removing an instruction?
+ if (match(I, m_Add(m_OneUse(m_SExt(m_Value(X))),
+ m_OneUse(m_SExt(m_Value(Y))))) &&
+ X->getType()->isIntOrIntVectorTy(1) && X->getType() == Y->getType()) {
+ // Truth table for inputs and output signbits:
+ // X:0 | X:1
+ // -----------
+ // Y:0 | -1 | -1 |
+ // Y:1 | -1 | 0 |
+ // -----------
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(I);
+ Value *Or = Builder.CreateOr(X, Y);
+ return Builder.CreateSExt(Or, VTy);
+ }
+ }
+ LLVM_FALLTHROUGH;
case Instruction::Sub: {
/// If the high-bits of an ADD/SUB are not demanded, then we do not care
/// about the high bits of the operands.
@@ -515,11 +535,27 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
if (SimplifyDemandedBits(I, 0, DemandedMaskIn, Known, Depth + 1))
return I;
assert(!Known.hasConflict() && "Bits known to be one AND zero?");
+
+ bool SignBitZero = Known.Zero.isSignBitSet();
+ bool SignBitOne = Known.One.isSignBitSet();
Known.Zero <<= ShiftAmt;
Known.One <<= ShiftAmt;
// low bits known zero.
if (ShiftAmt)
Known.Zero.setLowBits(ShiftAmt);
+
+ // If this shift has "nsw" keyword, then the result is either a poison
+ // value or has the same sign bit as the first operand.
+ if (IOp->hasNoSignedWrap()) {
+ if (SignBitZero)
+ Known.Zero.setSignBit();
+ else if (SignBitOne)
+ Known.One.setSignBit();
+ if (Known.hasConflict())
+ return UndefValue::get(I->getType());
+ }
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
@@ -543,6 +579,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Known.One.lshrInPlace(ShiftAmt);
if (ShiftAmt)
Known.Zero.setHighBits(ShiftAmt); // high bits known zero.
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
@@ -603,6 +641,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
} else if (Known.One[BitWidth-ShiftAmt-1]) { // New bits are known one.
Known.One |= HighBits;
}
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
@@ -624,6 +664,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// Propagate zero bits from the input.
Known.Zero.setHighBits(std::min(
BitWidth, LHSKnown.Zero.countLeadingOnes() + RHSTrailingZeros));
+ } else {
+ computeKnownBits(I, Known, Depth, CxtI);
}
break;
}
@@ -682,7 +724,8 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
Known.Zero = APInt::getHighBitsSet(BitWidth, Leaders) & DemandedMask;
break;
}
- case Instruction::Call:
+ case Instruction::Call: {
+ bool KnownBitsComputed = false;
if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
switch (II->getIntrinsicID()) {
default: break;
@@ -714,8 +757,6 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
NewVal->takeName(I);
return InsertNewInstWith(NewVal, *I);
}
-
- // TODO: Could compute known zero/one bits based on the input.
break;
}
case Intrinsic::fshr:
@@ -740,6 +781,7 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
RHSKnown.Zero.lshr(BitWidth - ShiftAmt);
Known.One = LHSKnown.One.shl(ShiftAmt) |
RHSKnown.One.lshr(BitWidth - ShiftAmt);
+ KnownBitsComputed = true;
break;
}
case Intrinsic::x86_mmx_pmovmskb:
@@ -768,16 +810,21 @@ Value *InstCombiner::SimplifyDemandedUseBits(Value *V, APInt DemandedMask,
// We know that the upper bits are set to zero.
Known.Zero.setBitsFrom(ArgWidth);
- return nullptr;
+ KnownBitsComputed = true;
+ break;
}
case Intrinsic::x86_sse42_crc32_64_64:
Known.Zero.setBitsFrom(32);
- return nullptr;
+ KnownBitsComputed = true;
+ break;
}
}
- computeKnownBits(V, Known, Depth, CxtI);
+
+ if (!KnownBitsComputed)
+ computeKnownBits(V, Known, Depth, CxtI);
break;
}
+ }
// If the client is only demanding bits that we know, return the known
// constant.
@@ -811,15 +858,12 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
CxtI);
- // Output known-0 are known to be clear if zero in either the LHS | RHS.
- APInt IKnownZero = RHSKnown.Zero | LHSKnown.Zero;
- // Output known-1 bits are only known if set in both the LHS & RHS.
- APInt IKnownOne = RHSKnown.One & LHSKnown.One;
+ Known = LHSKnown & RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
- if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
- return Constant::getIntegerValue(ITy, IKnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known 1 on one side, return the other.
// These bits cannot contribute to the result of the 'and' in this
@@ -829,8 +873,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
if (DemandedMask.isSubsetOf(RHSKnown.Zero | LHSKnown.One))
return I->getOperand(1);
- Known.Zero = std::move(IKnownZero);
- Known.One = std::move(IKnownOne);
break;
}
case Instruction::Or: {
@@ -842,15 +884,12 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
CxtI);
- // Output known-0 bits are only known if clear in both the LHS & RHS.
- APInt IKnownZero = RHSKnown.Zero & LHSKnown.Zero;
- // Output known-1 are known to be set if set in either the LHS | RHS.
- APInt IKnownOne = RHSKnown.One | LHSKnown.One;
+ Known = LHSKnown | RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
- if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
- return Constant::getIntegerValue(ITy, IKnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known zero on one side, return the
// other. These bits cannot contribute to the result of the 'or' in this
@@ -860,8 +899,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
if (DemandedMask.isSubsetOf(RHSKnown.One | LHSKnown.Zero))
return I->getOperand(1);
- Known.Zero = std::move(IKnownZero);
- Known.One = std::move(IKnownOne);
break;
}
case Instruction::Xor: {
@@ -872,17 +909,12 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
computeKnownBits(I->getOperand(0), LHSKnown, Depth + 1,
CxtI);
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- APInt IKnownZero = (RHSKnown.Zero & LHSKnown.Zero) |
- (RHSKnown.One & LHSKnown.One);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- APInt IKnownOne = (RHSKnown.Zero & LHSKnown.One) |
- (RHSKnown.One & LHSKnown.Zero);
+ Known = LHSKnown ^ RHSKnown;
// If the client is only demanding bits that we know, return the known
// constant.
- if (DemandedMask.isSubsetOf(IKnownZero|IKnownOne))
- return Constant::getIntegerValue(ITy, IKnownOne);
+ if (DemandedMask.isSubsetOf(Known.Zero | Known.One))
+ return Constant::getIntegerValue(ITy, Known.One);
// If all of the demanded bits are known zero on one side, return the
// other.
@@ -891,10 +923,6 @@ Value *InstCombiner::SimplifyMultipleUseDemandedBits(Instruction *I,
if (DemandedMask.isSubsetOf(LHSKnown.Zero))
return I->getOperand(1);
- // Output known-0 bits are known if clear or set in both the LHS & RHS.
- Known.Zero = std::move(IKnownZero);
- // Output known-1 are known to be set if set in only one of the LHS, RHS.
- Known.One = std::move(IKnownOne);
break;
}
default:
@@ -1008,17 +1036,69 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
DemandedElts.getActiveBits() == 3)
return nullptr;
- unsigned VWidth = II->getType()->getVectorNumElements();
+ auto *IIVTy = cast<VectorType>(II->getType());
+ unsigned VWidth = IIVTy->getNumElements();
if (VWidth == 1)
return nullptr;
- ConstantInt *NewDMask = nullptr;
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(II);
+
+ // Assume the arguments are unchanged and later override them, if needed.
+ SmallVector<Value *, 16> Args(II->arg_begin(), II->arg_end());
if (DMaskIdx < 0) {
- // Pretend that a prefix of elements is demanded to simplify the code
- // below.
- DemandedElts = (1 << DemandedElts.getActiveBits()) - 1;
+ // Buffer case.
+
+ const unsigned ActiveBits = DemandedElts.getActiveBits();
+ const unsigned UnusedComponentsAtFront = DemandedElts.countTrailingZeros();
+
+ // Start assuming the prefix of elements is demanded, but possibly clear
+ // some other bits if there are trailing zeros (unused components at front)
+ // and update offset.
+ DemandedElts = (1 << ActiveBits) - 1;
+
+ if (UnusedComponentsAtFront > 0) {
+ static const unsigned InvalidOffsetIdx = 0xf;
+
+ unsigned OffsetIdx;
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::amdgcn_raw_buffer_load:
+ OffsetIdx = 1;
+ break;
+ case Intrinsic::amdgcn_s_buffer_load:
+ // If resulting type is vec3, there is no point in trimming the
+ // load with updated offset, as the vec3 would most likely be widened to
+ // vec4 anyway during lowering.
+ if (ActiveBits == 4 && UnusedComponentsAtFront == 1)
+ OffsetIdx = InvalidOffsetIdx;
+ else
+ OffsetIdx = 1;
+ break;
+ case Intrinsic::amdgcn_struct_buffer_load:
+ OffsetIdx = 2;
+ break;
+ default:
+ // TODO: handle tbuffer* intrinsics.
+ OffsetIdx = InvalidOffsetIdx;
+ break;
+ }
+
+ if (OffsetIdx != InvalidOffsetIdx) {
+ // Clear demanded bits and update the offset.
+ DemandedElts &= ~((1 << UnusedComponentsAtFront) - 1);
+ auto *Offset = II->getArgOperand(OffsetIdx);
+ unsigned SingleComponentSizeInBits =
+ getDataLayout().getTypeSizeInBits(II->getType()->getScalarType());
+ unsigned OffsetAdd =
+ UnusedComponentsAtFront * SingleComponentSizeInBits / 8;
+ auto *OffsetAddVal = ConstantInt::get(Offset->getType(), OffsetAdd);
+ Args[OffsetIdx] = Builder.CreateAdd(Offset, OffsetAddVal);
+ }
+ }
} else {
+ // Image case.
+
ConstantInt *DMask = cast<ConstantInt>(II->getArgOperand(DMaskIdx));
unsigned DMaskVal = DMask->getZExtValue() & 0xf;
@@ -1037,7 +1117,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
}
if (DMaskVal != NewDMaskVal)
- NewDMask = ConstantInt::get(DMask->getType(), NewDMaskVal);
+ Args[DMaskIdx] = ConstantInt::get(DMask->getType(), NewDMaskVal);
}
unsigned NewNumElts = DemandedElts.countPopulation();
@@ -1045,39 +1125,25 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
return UndefValue::get(II->getType());
if (NewNumElts >= VWidth && DemandedElts.isMask()) {
- if (NewDMask)
- II->setArgOperand(DMaskIdx, NewDMask);
+ if (DMaskIdx >= 0)
+ II->setArgOperand(DMaskIdx, Args[DMaskIdx]);
return nullptr;
}
- // Determine the overload types of the original intrinsic.
- auto IID = II->getIntrinsicID();
- SmallVector<Intrinsic::IITDescriptor, 16> Table;
- getIntrinsicInfoTableEntries(IID, Table);
- ArrayRef<Intrinsic::IITDescriptor> TableRef = Table;
-
// Validate function argument and return types, extracting overloaded types
// along the way.
- FunctionType *FTy = II->getCalledFunction()->getFunctionType();
SmallVector<Type *, 6> OverloadTys;
- Intrinsic::matchIntrinsicSignature(FTy, TableRef, OverloadTys);
+ if (!Intrinsic::getIntrinsicSignature(II->getCalledFunction(), OverloadTys))
+ return nullptr;
Module *M = II->getParent()->getParent()->getParent();
- Type *EltTy = II->getType()->getVectorElementType();
- Type *NewTy = (NewNumElts == 1) ? EltTy : VectorType::get(EltTy, NewNumElts);
+ Type *EltTy = IIVTy->getElementType();
+ Type *NewTy =
+ (NewNumElts == 1) ? EltTy : FixedVectorType::get(EltTy, NewNumElts);
OverloadTys[0] = NewTy;
- Function *NewIntrin = Intrinsic::getDeclaration(M, IID, OverloadTys);
-
- SmallVector<Value *, 16> Args;
- for (unsigned I = 0, E = II->getNumArgOperands(); I != E; ++I)
- Args.push_back(II->getArgOperand(I));
-
- if (NewDMask)
- Args[DMaskIdx] = NewDMask;
-
- IRBuilderBase::InsertPointGuard Guard(Builder);
- Builder.SetInsertPoint(II);
+ Function *NewIntrin =
+ Intrinsic::getDeclaration(M, II->getIntrinsicID(), OverloadTys);
CallInst *NewCall = Builder.CreateCall(NewIntrin, Args);
NewCall->takeName(II);
@@ -1088,7 +1154,7 @@ Value *InstCombiner::simplifyAMDGCNMemoryIntrinsicDemanded(IntrinsicInst *II,
DemandedElts.countTrailingZeros());
}
- SmallVector<uint32_t, 8> EltMask;
+ SmallVector<int, 8> EltMask;
unsigned NewLoadIdx = 0;
for (unsigned OrigLoadIdx = 0; OrigLoadIdx < VWidth; ++OrigLoadIdx) {
if (!!DemandedElts[OrigLoadIdx])
@@ -1120,7 +1186,12 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
APInt &UndefElts,
unsigned Depth,
bool AllowMultipleUsers) {
- unsigned VWidth = V->getType()->getVectorNumElements();
+ // Cannot analyze scalable type. The number of vector elements is not a
+ // compile-time constant.
+ if (isa<ScalableVectorType>(V->getType()))
+ return nullptr;
+
+ unsigned VWidth = cast<FixedVectorType>(V->getType())->getNumElements();
APInt EltMask(APInt::getAllOnesValue(VWidth));
assert((DemandedElts & ~EltMask) == 0 && "Invalid DemandedElts!");
@@ -1199,10 +1270,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
auto *II = dyn_cast<IntrinsicInst>(Inst);
Value *Op = II ? II->getArgOperand(OpNum) : Inst->getOperand(OpNum);
if (Value *V = SimplifyDemandedVectorElts(Op, Demanded, Undef, Depth + 1)) {
- if (II)
- II->setArgOperand(OpNum, V);
- else
- Inst->setOperand(OpNum, V);
+ replaceOperand(*Inst, OpNum, V);
MadeChange = true;
}
};
@@ -1268,7 +1336,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// If this is inserting an element that isn't demanded, remove this
// insertelement.
if (IdxNo >= VWidth || !DemandedElts[IdxNo]) {
- Worklist.Add(I);
+ Worklist.push(I);
return I->getOperand(0);
}
@@ -1282,7 +1350,25 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
Shuffle->getOperand(1)->getType() &&
"Expected shuffle operands to have same type");
unsigned OpWidth =
- Shuffle->getOperand(0)->getType()->getVectorNumElements();
+ cast<VectorType>(Shuffle->getOperand(0)->getType())->getNumElements();
+ // Handle trivial case of a splat. Only check the first element of LHS
+ // operand.
+ if (all_of(Shuffle->getShuffleMask(), [](int Elt) { return Elt == 0; }) &&
+ DemandedElts.isAllOnesValue()) {
+ if (!isa<UndefValue>(I->getOperand(1))) {
+ I->setOperand(1, UndefValue::get(I->getOperand(1)->getType()));
+ MadeChange = true;
+ }
+ APInt LeftDemanded(OpWidth, 1);
+ APInt LHSUndefElts(OpWidth, 0);
+ simplifyAndSetOp(I, 0, LeftDemanded, LHSUndefElts);
+ if (LHSUndefElts[0])
+ UndefElts = EltMask;
+ else
+ UndefElts.clearAllBits();
+ break;
+ }
+
APInt LeftDemanded(OpWidth, 0), RightDemanded(OpWidth, 0);
for (unsigned i = 0; i < VWidth; i++) {
if (DemandedElts[i]) {
@@ -1396,15 +1482,14 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
}
if (NewUndefElts) {
// Add additional discovered undefs.
- SmallVector<Constant*, 16> Elts;
+ SmallVector<int, 16> Elts;
for (unsigned i = 0; i < VWidth; ++i) {
if (UndefElts[i])
- Elts.push_back(UndefValue::get(Type::getInt32Ty(I->getContext())));
+ Elts.push_back(UndefMaskElem);
else
- Elts.push_back(ConstantInt::get(Type::getInt32Ty(I->getContext()),
- Shuffle->getMaskValue(i)));
+ Elts.push_back(Shuffle->getMaskValue(i));
}
- I->setOperand(2, ConstantVector::get(Elts));
+ Shuffle->setShuffleMask(Elts);
MadeChange = true;
}
break;
@@ -1549,7 +1634,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// use Arg0 if DemandedElts[0] is clear like we do for other intrinsics.
// Instead we should return a zero vector.
if (!DemandedElts[0]) {
- Worklist.Add(II);
+ Worklist.push(II);
return ConstantAggregateZero::get(II->getType());
}
@@ -1568,7 +1653,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
- Worklist.Add(II);
+ Worklist.push(II);
return II->getArgOperand(0);
}
// TODO: If only low elt lower SQRT to FSQRT (with rounding/exceptions
@@ -1588,7 +1673,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
- Worklist.Add(II);
+ Worklist.push(II);
return II->getArgOperand(0);
}
@@ -1615,7 +1700,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
- Worklist.Add(II);
+ Worklist.push(II);
return II->getArgOperand(0);
}
@@ -1649,7 +1734,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
// If lowest element of a scalar op isn't used then use Arg0.
if (!DemandedElts[0]) {
- Worklist.Add(II);
+ Worklist.push(II);
return II->getArgOperand(0);
}
@@ -1678,7 +1763,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::x86_avx512_packusdw_512:
case Intrinsic::x86_avx512_packuswb_512: {
auto *Ty0 = II->getArgOperand(0)->getType();
- unsigned InnerVWidth = Ty0->getVectorNumElements();
+ unsigned InnerVWidth = cast<VectorType>(Ty0)->getNumElements();
assert(VWidth == (InnerVWidth * 2) && "Unexpected input size");
unsigned NumLanes = Ty0->getPrimitiveSizeInBits() / 128;
@@ -1747,6 +1832,7 @@ Value *InstCombiner::SimplifyDemandedVectorElts(Value *V, APInt DemandedElts,
case Intrinsic::amdgcn_raw_buffer_load:
case Intrinsic::amdgcn_raw_buffer_load_format:
case Intrinsic::amdgcn_raw_tbuffer_load:
+ case Intrinsic::amdgcn_s_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load:
case Intrinsic::amdgcn_struct_buffer_load_format:
case Intrinsic::amdgcn_struct_tbuffer_load:
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index f604c9dc32ca..ff70347569ab 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -16,6 +16,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallBitVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/VectorUtils.h"
@@ -57,12 +58,15 @@ static bool cheapToScalarize(Value *V, bool IsConstantExtractIndex) {
// An insertelement to the same constant index as our extract will simplify
// to the scalar inserted element. An insertelement to a different constant
// index is irrelevant to our extract.
- if (match(V, m_InsertElement(m_Value(), m_Value(), m_ConstantInt())))
+ if (match(V, m_InsertElt(m_Value(), m_Value(), m_ConstantInt())))
return IsConstantExtractIndex;
if (match(V, m_OneUse(m_Load(m_Value()))))
return true;
+ if (match(V, m_OneUse(m_UnOp())))
+ return true;
+
Value *V0, *V1;
if (match(V, m_OneUse(m_BinOp(m_Value(V0), m_Value(V1)))))
if (cheapToScalarize(V0, IsConstantExtractIndex) ||
@@ -172,9 +176,9 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
// If this extractelement is using a bitcast from a vector of the same number
// of elements, see if we can find the source element from the source vector:
// extelt (bitcast VecX), IndexC --> bitcast X[IndexC]
- Type *SrcTy = X->getType();
+ auto *SrcTy = cast<VectorType>(X->getType());
Type *DestTy = Ext.getType();
- unsigned NumSrcElts = SrcTy->getVectorNumElements();
+ unsigned NumSrcElts = SrcTy->getNumElements();
unsigned NumElts = Ext.getVectorOperandType()->getNumElements();
if (NumSrcElts == NumElts)
if (Value *Elt = findScalarElement(X, ExtIndexC))
@@ -185,8 +189,8 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
if (NumSrcElts < NumElts) {
Value *Scalar;
uint64_t InsIndexC;
- if (!match(X, m_InsertElement(m_Value(), m_Value(Scalar),
- m_ConstantInt(InsIndexC))))
+ if (!match(X, m_InsertElt(m_Value(), m_Value(Scalar),
+ m_ConstantInt(InsIndexC))))
return nullptr;
// The extract must be from the subset of vector elements that we inserted
@@ -255,7 +259,7 @@ static Instruction *foldBitcastExtElt(ExtractElementInst &Ext,
/// Find elements of V demanded by UserInstr.
static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
- unsigned VWidth = V->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
// Conservatively assume that all elements are needed.
APInt UsedElts(APInt::getAllOnesValue(VWidth));
@@ -272,7 +276,8 @@ static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
}
case Instruction::ShuffleVector: {
ShuffleVectorInst *Shuffle = cast<ShuffleVectorInst>(UserInstr);
- unsigned MaskNumElts = UserInstr->getType()->getVectorNumElements();
+ unsigned MaskNumElts =
+ cast<VectorType>(UserInstr->getType())->getNumElements();
UsedElts = APInt(VWidth, 0);
for (unsigned i = 0; i < MaskNumElts; i++) {
@@ -298,7 +303,7 @@ static APInt findDemandedEltsBySingleUser(Value *V, Instruction *UserInstr) {
/// no user demands an element of V, then the corresponding bit
/// remains unset in the returned value.
static APInt findDemandedEltsByAllUsers(Value *V) {
- unsigned VWidth = V->getType()->getVectorNumElements();
+ unsigned VWidth = cast<VectorType>(V->getType())->getNumElements();
APInt UnionUsedElts(VWidth, 0);
for (const Use &U : V->uses()) {
@@ -327,14 +332,18 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
// find a previously computed scalar that was inserted into the vector.
auto *IndexC = dyn_cast<ConstantInt>(Index);
if (IndexC) {
- unsigned NumElts = EI.getVectorOperandType()->getNumElements();
+ ElementCount EC = EI.getVectorOperandType()->getElementCount();
+ unsigned NumElts = EC.Min;
// InstSimplify should handle cases where the index is invalid.
- if (!IndexC->getValue().ule(NumElts))
+ // For fixed-length vector, it's invalid to extract out-of-range element.
+ if (!EC.Scalable && IndexC->getValue().uge(NumElts))
return nullptr;
// This instruction only demands the single element from the input vector.
- if (NumElts != 1) {
+ // Skip for scalable type, the number of elements is unknown at
+ // compile-time.
+ if (!EC.Scalable && NumElts != 1) {
// If the input vector has a single use, simplify it based on this use
// property.
if (SrcVec->hasOneUse()) {
@@ -342,10 +351,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
APInt DemandedElts(NumElts, 0);
DemandedElts.setBit(IndexC->getZExtValue());
if (Value *V =
- SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts)) {
- EI.setOperand(0, V);
- return &EI;
- }
+ SimplifyDemandedVectorElts(SrcVec, DemandedElts, UndefElts))
+ return replaceOperand(EI, 0, V);
} else {
// If the input vector has multiple uses, simplify it based on a union
// of all elements used.
@@ -373,6 +380,16 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
return ScalarPHI;
}
+ // TODO come up with an n-ary matcher that subsumes both unary and
+ // binary matchers.
+ UnaryOperator *UO;
+ if (match(SrcVec, m_UnOp(UO)) && cheapToScalarize(SrcVec, IndexC)) {
+ // extelt (unop X), Index --> unop (extelt X, Index)
+ Value *X = UO->getOperand(0);
+ Value *E = Builder.CreateExtractElement(X, Index);
+ return UnaryOperator::CreateWithCopiedFlags(UO->getOpcode(), E, UO);
+ }
+
BinaryOperator *BO;
if (match(SrcVec, m_BinOp(BO)) && cheapToScalarize(SrcVec, IndexC)) {
// extelt (binop X, Y), Index --> binop (extelt X, Index), (extelt Y, Index)
@@ -399,19 +416,18 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
return replaceInstUsesWith(EI, IE->getOperand(1));
// If the inserted and extracted elements are constants, they must not
// be the same value, extract from the pre-inserted value instead.
- if (isa<Constant>(IE->getOperand(2)) && IndexC) {
- Worklist.AddValue(SrcVec);
- EI.setOperand(0, IE->getOperand(0));
- return &EI;
- }
+ if (isa<Constant>(IE->getOperand(2)) && IndexC)
+ return replaceOperand(EI, 0, IE->getOperand(0));
} else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
// If this is extracting an element from a shufflevector, figure out where
// it came from and extract from the appropriate input element instead.
- if (auto *Elt = dyn_cast<ConstantInt>(Index)) {
- int SrcIdx = SVI->getMaskValue(Elt->getZExtValue());
+ // Restrict the following transformation to fixed-length vector.
+ if (isa<FixedVectorType>(SVI->getType()) && isa<ConstantInt>(Index)) {
+ int SrcIdx =
+ SVI->getMaskValue(cast<ConstantInt>(Index)->getZExtValue());
Value *Src;
- unsigned LHSWidth =
- SVI->getOperand(0)->getType()->getVectorNumElements();
+ unsigned LHSWidth = cast<FixedVectorType>(SVI->getOperand(0)->getType())
+ ->getNumElements();
if (SrcIdx < 0)
return replaceInstUsesWith(EI, UndefValue::get(EI.getType()));
@@ -422,9 +438,8 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
Src = SVI->getOperand(1);
}
Type *Int32Ty = Type::getInt32Ty(EI.getContext());
- return ExtractElementInst::Create(Src,
- ConstantInt::get(Int32Ty,
- SrcIdx, false));
+ return ExtractElementInst::Create(
+ Src, ConstantInt::get(Int32Ty, SrcIdx, false));
}
} else if (auto *CI = dyn_cast<CastInst>(I)) {
// Canonicalize extractelement(cast) -> cast(extractelement).
@@ -432,7 +447,6 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
// nothing.
if (CI->hasOneUse() && (CI->getOpcode() != Instruction::BitCast)) {
Value *EE = Builder.CreateExtractElement(CI->getOperand(0), Index);
- Worklist.AddValue(EE);
return CastInst::Create(CI->getOpcode(), EE, EI.getType());
}
}
@@ -443,26 +457,25 @@ Instruction *InstCombiner::visitExtractElementInst(ExtractElementInst &EI) {
/// If V is a shuffle of values that ONLY returns elements from either LHS or
/// RHS, return the shuffle mask and true. Otherwise, return false.
static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
- SmallVectorImpl<Constant*> &Mask) {
+ SmallVectorImpl<int> &Mask) {
assert(LHS->getType() == RHS->getType() &&
"Invalid CollectSingleShuffleElements");
- unsigned NumElts = V->getType()->getVectorNumElements();
+ unsigned NumElts = cast<VectorType>(V->getType())->getNumElements();
if (isa<UndefValue>(V)) {
- Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
+ Mask.assign(NumElts, -1);
return true;
}
if (V == LHS) {
for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
+ Mask.push_back(i);
return true;
}
if (V == RHS) {
for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()),
- i+NumElts));
+ Mask.push_back(i + NumElts);
return true;
}
@@ -481,14 +494,15 @@ static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
// transitively ok.
if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
// If so, update the mask to reflect the inserted undef.
- Mask[InsertedIdx] = UndefValue::get(Type::getInt32Ty(V->getContext()));
+ Mask[InsertedIdx] = -1;
return true;
}
} else if (ExtractElementInst *EI = dyn_cast<ExtractElementInst>(ScalarOp)){
if (isa<ConstantInt>(EI->getOperand(1))) {
unsigned ExtractedIdx =
cast<ConstantInt>(EI->getOperand(1))->getZExtValue();
- unsigned NumLHSElts = LHS->getType()->getVectorNumElements();
+ unsigned NumLHSElts =
+ cast<VectorType>(LHS->getType())->getNumElements();
// This must be extracting from either LHS or RHS.
if (EI->getOperand(0) == LHS || EI->getOperand(0) == RHS) {
@@ -497,14 +511,10 @@ static bool collectSingleShuffleElements(Value *V, Value *LHS, Value *RHS,
if (collectSingleShuffleElements(VecOp, LHS, RHS, Mask)) {
// If so, update the mask to reflect the inserted value.
if (EI->getOperand(0) == LHS) {
- Mask[InsertedIdx % NumElts] =
- ConstantInt::get(Type::getInt32Ty(V->getContext()),
- ExtractedIdx);
+ Mask[InsertedIdx % NumElts] = ExtractedIdx;
} else {
assert(EI->getOperand(0) == RHS);
- Mask[InsertedIdx % NumElts] =
- ConstantInt::get(Type::getInt32Ty(V->getContext()),
- ExtractedIdx + NumLHSElts);
+ Mask[InsertedIdx % NumElts] = ExtractedIdx + NumLHSElts;
}
return true;
}
@@ -524,8 +534,8 @@ static void replaceExtractElements(InsertElementInst *InsElt,
InstCombiner &IC) {
VectorType *InsVecType = InsElt->getType();
VectorType *ExtVecType = ExtElt->getVectorOperandType();
- unsigned NumInsElts = InsVecType->getVectorNumElements();
- unsigned NumExtElts = ExtVecType->getVectorNumElements();
+ unsigned NumInsElts = InsVecType->getNumElements();
+ unsigned NumExtElts = ExtVecType->getNumElements();
// The inserted-to vector must be wider than the extracted-from vector.
if (InsVecType->getElementType() != ExtVecType->getElementType() ||
@@ -536,12 +546,11 @@ static void replaceExtractElements(InsertElementInst *InsElt,
// values. The mask selects all of the values of the original vector followed
// by as many undefined values as needed to create a vector of the same length
// as the inserted-to vector.
- SmallVector<Constant *, 16> ExtendMask;
- IntegerType *IntType = Type::getInt32Ty(InsElt->getContext());
+ SmallVector<int, 16> ExtendMask;
for (unsigned i = 0; i < NumExtElts; ++i)
- ExtendMask.push_back(ConstantInt::get(IntType, i));
+ ExtendMask.push_back(i);
for (unsigned i = NumExtElts; i < NumInsElts; ++i)
- ExtendMask.push_back(UndefValue::get(IntType));
+ ExtendMask.push_back(-1);
Value *ExtVecOp = ExtElt->getVectorOperand();
auto *ExtVecOpInst = dyn_cast<Instruction>(ExtVecOp);
@@ -569,8 +578,8 @@ static void replaceExtractElements(InsertElementInst *InsElt,
if (InsElt->hasOneUse() && isa<InsertElementInst>(InsElt->user_back()))
return;
- auto *WideVec = new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType),
- ConstantVector::get(ExtendMask));
+ auto *WideVec =
+ new ShuffleVectorInst(ExtVecOp, UndefValue::get(ExtVecType), ExtendMask);
// Insert the new shuffle after the vector operand of the extract is defined
// (as long as it's not a PHI) or at the start of the basic block of the
@@ -603,21 +612,20 @@ static void replaceExtractElements(InsertElementInst *InsElt,
/// often been chosen carefully to be efficiently implementable on the target.
using ShuffleOps = std::pair<Value *, Value *>;
-static ShuffleOps collectShuffleElements(Value *V,
- SmallVectorImpl<Constant *> &Mask,
+static ShuffleOps collectShuffleElements(Value *V, SmallVectorImpl<int> &Mask,
Value *PermittedRHS,
InstCombiner &IC) {
assert(V->getType()->isVectorTy() && "Invalid shuffle!");
- unsigned NumElts = V->getType()->getVectorNumElements();
+ unsigned NumElts = cast<FixedVectorType>(V->getType())->getNumElements();
if (isa<UndefValue>(V)) {
- Mask.assign(NumElts, UndefValue::get(Type::getInt32Ty(V->getContext())));
+ Mask.assign(NumElts, -1);
return std::make_pair(
PermittedRHS ? UndefValue::get(PermittedRHS->getType()) : V, nullptr);
}
if (isa<ConstantAggregateZero>(V)) {
- Mask.assign(NumElts, ConstantInt::get(Type::getInt32Ty(V->getContext()),0));
+ Mask.assign(NumElts, 0);
return std::make_pair(V, nullptr);
}
@@ -648,14 +656,13 @@ static ShuffleOps collectShuffleElements(Value *V,
// We tried our best, but we can't find anything compatible with RHS
// further up the chain. Return a trivial shuffle.
for (unsigned i = 0; i < NumElts; ++i)
- Mask[i] = ConstantInt::get(Type::getInt32Ty(V->getContext()), i);
+ Mask[i] = i;
return std::make_pair(V, nullptr);
}
- unsigned NumLHSElts = RHS->getType()->getVectorNumElements();
- Mask[InsertedIdx % NumElts] =
- ConstantInt::get(Type::getInt32Ty(V->getContext()),
- NumLHSElts+ExtractedIdx);
+ unsigned NumLHSElts =
+ cast<VectorType>(RHS->getType())->getNumElements();
+ Mask[InsertedIdx % NumElts] = NumLHSElts + ExtractedIdx;
return std::make_pair(LR.first, RHS);
}
@@ -663,11 +670,9 @@ static ShuffleOps collectShuffleElements(Value *V,
// We've gone as far as we can: anything on the other side of the
// extractelement will already have been converted into a shuffle.
unsigned NumLHSElts =
- EI->getOperand(0)->getType()->getVectorNumElements();
+ cast<VectorType>(EI->getOperand(0)->getType())->getNumElements();
for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(ConstantInt::get(
- Type::getInt32Ty(V->getContext()),
- i == InsertedIdx ? ExtractedIdx : NumLHSElts + i));
+ Mask.push_back(i == InsertedIdx ? ExtractedIdx : NumLHSElts + i);
return std::make_pair(EI->getOperand(0), PermittedRHS);
}
@@ -683,7 +688,7 @@ static ShuffleOps collectShuffleElements(Value *V,
// Otherwise, we can't do anything fancy. Return an identity vector.
for (unsigned i = 0; i != NumElts; ++i)
- Mask.push_back(ConstantInt::get(Type::getInt32Ty(V->getContext()), i));
+ Mask.push_back(i);
return std::make_pair(V, nullptr);
}
@@ -723,8 +728,14 @@ Instruction *InstCombiner::visitInsertValueInst(InsertValueInst &I) {
}
static bool isShuffleEquivalentToSelect(ShuffleVectorInst &Shuf) {
- int MaskSize = Shuf.getMask()->getType()->getVectorNumElements();
- int VecSize = Shuf.getOperand(0)->getType()->getVectorNumElements();
+ // Can not analyze scalable type, the number of elements is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(Shuf.getOperand(0)->getType()))
+ return false;
+
+ int MaskSize = Shuf.getShuffleMask().size();
+ int VecSize =
+ cast<FixedVectorType>(Shuf.getOperand(0)->getType())->getNumElements();
// A vector select does not change the size of the operands.
if (MaskSize != VecSize)
@@ -750,8 +761,12 @@ static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
if (InsElt.hasOneUse() && isa<InsertElementInst>(InsElt.user_back()))
return nullptr;
- auto *VecTy = cast<VectorType>(InsElt.getType());
- unsigned NumElements = VecTy->getNumElements();
+ VectorType *VecTy = InsElt.getType();
+ // Can not handle scalable type, the number of elements is not a compile-time
+ // constant.
+ if (isa<ScalableVectorType>(VecTy))
+ return nullptr;
+ unsigned NumElements = cast<FixedVectorType>(VecTy)->getNumElements();
// Do not try to do this for a one-element vector, since that's a nop,
// and will cause an inf-loop.
@@ -760,7 +775,7 @@ static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
Value *SplatVal = InsElt.getOperand(1);
InsertElementInst *CurrIE = &InsElt;
- SmallVector<bool, 16> ElementPresent(NumElements, false);
+ SmallBitVector ElementPresent(NumElements, false);
InsertElementInst *FirstIE = nullptr;
// Walk the chain backwards, keeping track of which indices we inserted into,
@@ -792,7 +807,7 @@ static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
// TODO: If the base vector is not undef, it might be better to create a splat
// and then a select-shuffle (blend) with the base vector.
if (!isa<UndefValue>(FirstIE->getOperand(0)))
- if (any_of(ElementPresent, [](bool Present) { return !Present; }))
+ if (!ElementPresent.all())
return nullptr;
// Create the insert + shuffle.
@@ -803,12 +818,12 @@ static Instruction *foldInsSequenceIntoSplat(InsertElementInst &InsElt) {
FirstIE = InsertElementInst::Create(UndefVec, SplatVal, Zero, "", &InsElt);
// Splat from element 0, but replace absent elements with undef in the mask.
- SmallVector<Constant *, 16> Mask(NumElements, Zero);
+ SmallVector<int, 16> Mask(NumElements, 0);
for (unsigned i = 0; i != NumElements; ++i)
if (!ElementPresent[i])
- Mask[i] = UndefValue::get(Int32Ty);
+ Mask[i] = -1;
- return new ShuffleVectorInst(FirstIE, UndefVec, ConstantVector::get(Mask));
+ return new ShuffleVectorInst(FirstIE, UndefVec, Mask);
}
/// Try to fold an insert element into an existing splat shuffle by changing
@@ -819,6 +834,11 @@ static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
if (!Shuf || !Shuf->isZeroEltSplat())
return nullptr;
+ // Bail out early if shuffle is scalable type. The number of elements in
+ // shuffle mask is unknown at compile-time.
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return nullptr;
+
// Check for a constant insertion index.
uint64_t IdxC;
if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
@@ -827,21 +847,18 @@ static Instruction *foldInsEltIntoSplat(InsertElementInst &InsElt) {
// Check if the splat shuffle's input is the same as this insert's scalar op.
Value *X = InsElt.getOperand(1);
Value *Op0 = Shuf->getOperand(0);
- if (!match(Op0, m_InsertElement(m_Undef(), m_Specific(X), m_ZeroInt())))
+ if (!match(Op0, m_InsertElt(m_Undef(), m_Specific(X), m_ZeroInt())))
return nullptr;
// Replace the shuffle mask element at the index of this insert with a zero.
// For example:
// inselt (shuf (inselt undef, X, 0), undef, <0,undef,0,undef>), X, 1
// --> shuf (inselt undef, X, 0), undef, <0,0,0,undef>
- unsigned NumMaskElts = Shuf->getType()->getVectorNumElements();
- SmallVector<Constant *, 16> NewMaskVec(NumMaskElts);
- Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext());
- Constant *Zero = ConstantInt::getNullValue(I32Ty);
+ unsigned NumMaskElts = Shuf->getType()->getNumElements();
+ SmallVector<int, 16> NewMask(NumMaskElts);
for (unsigned i = 0; i != NumMaskElts; ++i)
- NewMaskVec[i] = i == IdxC ? Zero : Shuf->getMask()->getAggregateElement(i);
+ NewMask[i] = i == IdxC ? 0 : Shuf->getMaskValue(i);
- Constant *NewMask = ConstantVector::get(NewMaskVec);
return new ShuffleVectorInst(Op0, UndefValue::get(Op0->getType()), NewMask);
}
@@ -854,6 +871,11 @@ static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
!(Shuf->isIdentityWithExtract() || Shuf->isIdentityWithPadding()))
return nullptr;
+ // Bail out early if shuffle is scalable type. The number of elements in
+ // shuffle mask is unknown at compile-time.
+ if (isa<ScalableVectorType>(Shuf->getType()))
+ return nullptr;
+
// Check for a constant insertion index.
uint64_t IdxC;
if (!match(InsElt.getOperand(2), m_ConstantInt(IdxC)))
@@ -863,34 +885,31 @@ static Instruction *foldInsEltIntoIdentityShuffle(InsertElementInst &InsElt) {
// input vector.
Value *Scalar = InsElt.getOperand(1);
Value *X = Shuf->getOperand(0);
- if (!match(Scalar, m_ExtractElement(m_Specific(X), m_SpecificInt(IdxC))))
+ if (!match(Scalar, m_ExtractElt(m_Specific(X), m_SpecificInt(IdxC))))
return nullptr;
// Replace the shuffle mask element at the index of this extract+insert with
// that same index value.
// For example:
// inselt (shuf X, IdMask), (extelt X, IdxC), IdxC --> shuf X, IdMask'
- unsigned NumMaskElts = Shuf->getType()->getVectorNumElements();
- SmallVector<Constant *, 16> NewMaskVec(NumMaskElts);
- Type *I32Ty = IntegerType::getInt32Ty(Shuf->getContext());
- Constant *NewMaskEltC = ConstantInt::get(I32Ty, IdxC);
- Constant *OldMask = Shuf->getMask();
+ unsigned NumMaskElts = Shuf->getType()->getNumElements();
+ SmallVector<int, 16> NewMask(NumMaskElts);
+ ArrayRef<int> OldMask = Shuf->getShuffleMask();
for (unsigned i = 0; i != NumMaskElts; ++i) {
if (i != IdxC) {
// All mask elements besides the inserted element remain the same.
- NewMaskVec[i] = OldMask->getAggregateElement(i);
- } else if (OldMask->getAggregateElement(i) == NewMaskEltC) {
+ NewMask[i] = OldMask[i];
+ } else if (OldMask[i] == (int)IdxC) {
// If the mask element was already set, there's nothing to do
// (demanded elements analysis may unset it later).
return nullptr;
} else {
- assert(isa<UndefValue>(OldMask->getAggregateElement(i)) &&
+ assert(OldMask[i] == UndefMaskElem &&
"Unexpected shuffle mask element for identity shuffle");
- NewMaskVec[i] = NewMaskEltC;
+ NewMask[i] = IdxC;
}
}
- Constant *NewMask = ConstantVector::get(NewMaskVec);
return new ShuffleVectorInst(X, Shuf->getOperand(1), NewMask);
}
@@ -958,31 +977,34 @@ static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
// mask vector with the insertelt index plus the length of the vector
// (because the constant vector operand of a shuffle is always the 2nd
// operand).
- Constant *Mask = Shuf->getMask();
- unsigned NumElts = Mask->getType()->getVectorNumElements();
+ ArrayRef<int> Mask = Shuf->getShuffleMask();
+ unsigned NumElts = Mask.size();
SmallVector<Constant *, 16> NewShufElts(NumElts);
- SmallVector<Constant *, 16> NewMaskElts(NumElts);
+ SmallVector<int, 16> NewMaskElts(NumElts);
for (unsigned I = 0; I != NumElts; ++I) {
if (I == InsEltIndex) {
NewShufElts[I] = InsEltScalar;
- Type *Int32Ty = Type::getInt32Ty(Shuf->getContext());
- NewMaskElts[I] = ConstantInt::get(Int32Ty, InsEltIndex + NumElts);
+ NewMaskElts[I] = InsEltIndex + NumElts;
} else {
// Copy over the existing values.
NewShufElts[I] = ShufConstVec->getAggregateElement(I);
- NewMaskElts[I] = Mask->getAggregateElement(I);
+ NewMaskElts[I] = Mask[I];
}
}
// Create new operands for a shuffle that includes the constant of the
// original insertelt. The old shuffle will be dead now.
return new ShuffleVectorInst(Shuf->getOperand(0),
- ConstantVector::get(NewShufElts),
- ConstantVector::get(NewMaskElts));
+ ConstantVector::get(NewShufElts), NewMaskElts);
} else if (auto *IEI = dyn_cast<InsertElementInst>(Inst)) {
// Transform sequences of insertelements ops with constant data/indexes into
// a single shuffle op.
- unsigned NumElts = InsElt.getType()->getNumElements();
+ // Can not handle scalable type, the number of elements needed to create
+ // shuffle mask is not a compile-time constant.
+ if (isa<ScalableVectorType>(InsElt.getType()))
+ return nullptr;
+ unsigned NumElts =
+ cast<FixedVectorType>(InsElt.getType())->getNumElements();
uint64_t InsertIdx[2];
Constant *Val[2];
@@ -992,33 +1014,29 @@ static Instruction *foldConstantInsEltIntoShuffle(InsertElementInst &InsElt) {
!match(IEI->getOperand(1), m_Constant(Val[1])))
return nullptr;
SmallVector<Constant *, 16> Values(NumElts);
- SmallVector<Constant *, 16> Mask(NumElts);
+ SmallVector<int, 16> Mask(NumElts);
auto ValI = std::begin(Val);
// Generate new constant vector and mask.
// We have 2 values/masks from the insertelements instructions. Insert them
// into new value/mask vectors.
for (uint64_t I : InsertIdx) {
if (!Values[I]) {
- assert(!Mask[I]);
Values[I] = *ValI;
- Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()),
- NumElts + I);
+ Mask[I] = NumElts + I;
}
++ValI;
}
// Remaining values are filled with 'undef' values.
for (unsigned I = 0; I < NumElts; ++I) {
if (!Values[I]) {
- assert(!Mask[I]);
Values[I] = UndefValue::get(InsElt.getType()->getElementType());
- Mask[I] = ConstantInt::get(Type::getInt32Ty(InsElt.getContext()), I);
+ Mask[I] = I;
}
}
// Create new operands for a shuffle that includes the constant of the
// original insertelt.
return new ShuffleVectorInst(IEI->getOperand(0),
- ConstantVector::get(Values),
- ConstantVector::get(Mask));
+ ConstantVector::get(Values), Mask);
}
return nullptr;
}
@@ -1032,28 +1050,51 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
VecOp, ScalarOp, IdxOp, SQ.getWithInstruction(&IE)))
return replaceInstUsesWith(IE, V);
+ // If the scalar is bitcast and inserted into undef, do the insert in the
+ // source type followed by bitcast.
+ // TODO: Generalize for insert into any constant, not just undef?
+ Value *ScalarSrc;
+ if (match(VecOp, m_Undef()) &&
+ match(ScalarOp, m_OneUse(m_BitCast(m_Value(ScalarSrc)))) &&
+ (ScalarSrc->getType()->isIntegerTy() ||
+ ScalarSrc->getType()->isFloatingPointTy())) {
+ // inselt undef, (bitcast ScalarSrc), IdxOp -->
+ // bitcast (inselt undef, ScalarSrc, IdxOp)
+ Type *ScalarTy = ScalarSrc->getType();
+ Type *VecTy = VectorType::get(ScalarTy, IE.getType()->getElementCount());
+ UndefValue *NewUndef = UndefValue::get(VecTy);
+ Value *NewInsElt = Builder.CreateInsertElement(NewUndef, ScalarSrc, IdxOp);
+ return new BitCastInst(NewInsElt, IE.getType());
+ }
+
// If the vector and scalar are both bitcast from the same element type, do
// the insert in that source type followed by bitcast.
- Value *VecSrc, *ScalarSrc;
+ Value *VecSrc;
if (match(VecOp, m_BitCast(m_Value(VecSrc))) &&
match(ScalarOp, m_BitCast(m_Value(ScalarSrc))) &&
(VecOp->hasOneUse() || ScalarOp->hasOneUse()) &&
VecSrc->getType()->isVectorTy() && !ScalarSrc->getType()->isVectorTy() &&
- VecSrc->getType()->getVectorElementType() == ScalarSrc->getType()) {
+ cast<VectorType>(VecSrc->getType())->getElementType() ==
+ ScalarSrc->getType()) {
// inselt (bitcast VecSrc), (bitcast ScalarSrc), IdxOp -->
// bitcast (inselt VecSrc, ScalarSrc, IdxOp)
Value *NewInsElt = Builder.CreateInsertElement(VecSrc, ScalarSrc, IdxOp);
return new BitCastInst(NewInsElt, IE.getType());
}
- // If the inserted element was extracted from some other vector and both
- // indexes are valid constants, try to turn this into a shuffle.
+ // If the inserted element was extracted from some other fixed-length vector
+ // and both indexes are valid constants, try to turn this into a shuffle.
+ // Can not handle scalable vector type, the number of elements needed to
+ // create shuffle mask is not a compile-time constant.
uint64_t InsertedIdx, ExtractedIdx;
Value *ExtVecOp;
- if (match(IdxOp, m_ConstantInt(InsertedIdx)) &&
- match(ScalarOp, m_ExtractElement(m_Value(ExtVecOp),
- m_ConstantInt(ExtractedIdx))) &&
- ExtractedIdx < ExtVecOp->getType()->getVectorNumElements()) {
+ if (isa<FixedVectorType>(IE.getType()) &&
+ match(IdxOp, m_ConstantInt(InsertedIdx)) &&
+ match(ScalarOp,
+ m_ExtractElt(m_Value(ExtVecOp), m_ConstantInt(ExtractedIdx))) &&
+ isa<FixedVectorType>(ExtVecOp->getType()) &&
+ ExtractedIdx <
+ cast<FixedVectorType>(ExtVecOp->getType())->getNumElements()) {
// TODO: Looking at the user(s) to determine if this insert is a
// fold-to-shuffle opportunity does not match the usual instcombine
// constraints. We should decide if the transform is worthy based only
@@ -1079,7 +1120,7 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
// Try to form a shuffle from a chain of extract-insert ops.
if (isShuffleRootCandidate(IE)) {
- SmallVector<Constant*, 16> Mask;
+ SmallVector<int, 16> Mask;
ShuffleOps LR = collectShuffleElements(&IE, Mask, nullptr, *this);
// The proposed shuffle may be trivial, in which case we shouldn't
@@ -1088,19 +1129,20 @@ Instruction *InstCombiner::visitInsertElementInst(InsertElementInst &IE) {
// We now have a shuffle of LHS, RHS, Mask.
if (LR.second == nullptr)
LR.second = UndefValue::get(LR.first->getType());
- return new ShuffleVectorInst(LR.first, LR.second,
- ConstantVector::get(Mask));
+ return new ShuffleVectorInst(LR.first, LR.second, Mask);
}
}
}
- unsigned VWidth = VecOp->getType()->getVectorNumElements();
- APInt UndefElts(VWidth, 0);
- APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
- if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
- if (V != &IE)
- return replaceInstUsesWith(IE, V);
- return &IE;
+ if (auto VecTy = dyn_cast<FixedVectorType>(VecOp->getType())) {
+ unsigned VWidth = VecTy->getNumElements();
+ APInt UndefElts(VWidth, 0);
+ APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
+ if (Value *V = SimplifyDemandedVectorElts(&IE, AllOnesEltMask, UndefElts)) {
+ if (V != &IE)
+ return replaceInstUsesWith(IE, V);
+ return &IE;
+ }
}
if (Instruction *Shuf = foldConstantInsEltIntoShuffle(IE))
@@ -1179,7 +1221,8 @@ static bool canEvaluateShuffled(Value *V, ArrayRef<int> Mask,
// Bail out if we would create longer vector ops. We could allow creating
// longer vector ops, but that may result in more expensive codegen.
Type *ITy = I->getType();
- if (ITy->isVectorTy() && Mask.size() > ITy->getVectorNumElements())
+ if (ITy->isVectorTy() &&
+ Mask.size() > cast<VectorType>(ITy)->getNumElements())
return false;
for (Value *Operand : I->operands()) {
if (!canEvaluateShuffled(Operand, Mask, Depth - 1))
@@ -1267,9 +1310,9 @@ static Value *buildNew(Instruction *I, ArrayRef<Value*> NewOps) {
case Instruction::FPExt: {
// It's possible that the mask has a different number of elements from
// the original cast. We recompute the destination type to match the mask.
- Type *DestTy =
- VectorType::get(I->getType()->getScalarType(),
- NewOps[0]->getType()->getVectorNumElements());
+ Type *DestTy = VectorType::get(
+ I->getType()->getScalarType(),
+ cast<VectorType>(NewOps[0]->getType())->getElementCount());
assert(NewOps.size() == 1 && "cast with #ops != 1");
return CastInst::Create(cast<CastInst>(I)->getOpcode(), NewOps[0], DestTy,
"", I);
@@ -1293,22 +1336,14 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
Type *EltTy = V->getType()->getScalarType();
Type *I32Ty = IntegerType::getInt32Ty(V->getContext());
if (isa<UndefValue>(V))
- return UndefValue::get(VectorType::get(EltTy, Mask.size()));
+ return UndefValue::get(FixedVectorType::get(EltTy, Mask.size()));
if (isa<ConstantAggregateZero>(V))
- return ConstantAggregateZero::get(VectorType::get(EltTy, Mask.size()));
+ return ConstantAggregateZero::get(FixedVectorType::get(EltTy, Mask.size()));
- if (Constant *C = dyn_cast<Constant>(V)) {
- SmallVector<Constant *, 16> MaskValues;
- for (int i = 0, e = Mask.size(); i != e; ++i) {
- if (Mask[i] == -1)
- MaskValues.push_back(UndefValue::get(I32Ty));
- else
- MaskValues.push_back(ConstantInt::get(I32Ty, Mask[i]));
- }
+ if (Constant *C = dyn_cast<Constant>(V))
return ConstantExpr::getShuffleVector(C, UndefValue::get(C->getType()),
- ConstantVector::get(MaskValues));
- }
+ Mask);
Instruction *I = cast<Instruction>(V);
switch (I->getOpcode()) {
@@ -1344,7 +1379,8 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
case Instruction::Select:
case Instruction::GetElementPtr: {
SmallVector<Value*, 8> NewOps;
- bool NeedsRebuild = (Mask.size() != I->getType()->getVectorNumElements());
+ bool NeedsRebuild =
+ (Mask.size() != cast<VectorType>(I->getType())->getNumElements());
for (int i = 0, e = I->getNumOperands(); i != e; ++i) {
Value *V;
// Recursively call evaluateInDifferentElementOrder on vector arguments
@@ -1397,8 +1433,9 @@ static Value *evaluateInDifferentElementOrder(Value *V, ArrayRef<int> Mask) {
// Shuffles to: |EE|FF|GG|HH|
// +--+--+--+--+
static bool isShuffleExtractingFromLHS(ShuffleVectorInst &SVI,
- SmallVector<int, 16> &Mask) {
- unsigned LHSElems = SVI.getOperand(0)->getType()->getVectorNumElements();
+ ArrayRef<int> Mask) {
+ unsigned LHSElems =
+ cast<VectorType>(SVI.getOperand(0)->getType())->getNumElements();
unsigned MaskElems = Mask.size();
unsigned BegIdx = Mask.front();
unsigned EndIdx = Mask.back();
@@ -1480,12 +1517,12 @@ static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
// Example: shuf (mul X, {-1,-2,-3,-4}), X, {0,5,6,3} --> mul X, {-1,1,1,-4}
// Example: shuf X, (add X, {-1,-2,-3,-4}), {0,1,6,7} --> add X, {0,0,-3,-4}
// The existing binop constant vector remains in the same operand position.
- Constant *Mask = Shuf.getMask();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
Constant *NewC = Op0IsBinop ? ConstantExpr::getShuffleVector(C, IdC, Mask) :
ConstantExpr::getShuffleVector(IdC, C, Mask);
bool MightCreatePoisonOrUB =
- Mask->containsUndefElement() &&
+ is_contained(Mask, UndefMaskElem) &&
(Instruction::isIntDivRem(BOpcode) || Instruction::isShift(BOpcode));
if (MightCreatePoisonOrUB)
NewC = getSafeVectorConstantForBinop(BOpcode, NewC, true);
@@ -1499,7 +1536,7 @@ static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
// An undef shuffle mask element may propagate as an undef constant element in
// the new binop. That would produce poison where the original code might not.
// If we already made a safe constant, then there's no danger.
- if (Mask->containsUndefElement() && !MightCreatePoisonOrUB)
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
NewBO->dropPoisonGeneratingFlags();
return NewBO;
}
@@ -1511,14 +1548,14 @@ static Instruction *foldSelectShuffleWith1Binop(ShuffleVectorInst &Shuf) {
static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
InstCombiner::BuilderTy &Builder) {
Value *Op0 = Shuf.getOperand(0), *Op1 = Shuf.getOperand(1);
- Constant *Mask = Shuf.getMask();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
Value *X;
uint64_t IndexC;
// Match a shuffle that is a splat to a non-zero element.
- if (!match(Op0, m_OneUse(m_InsertElement(m_Undef(), m_Value(X),
- m_ConstantInt(IndexC)))) ||
- !match(Op1, m_Undef()) || match(Mask, m_ZeroInt()) || IndexC == 0)
+ if (!match(Op0, m_OneUse(m_InsertElt(m_Undef(), m_Value(X),
+ m_ConstantInt(IndexC)))) ||
+ !match(Op1, m_Undef()) || match(Mask, m_ZeroMask()) || IndexC == 0)
return nullptr;
// Insert into element 0 of an undef vector.
@@ -1530,13 +1567,13 @@ static Instruction *canonicalizeInsertSplat(ShuffleVectorInst &Shuf,
// For example:
// shuf (inselt undef, X, 2), undef, <2,2,undef>
// --> shuf (inselt undef, X, 0), undef, <0,0,undef>
- unsigned NumMaskElts = Shuf.getType()->getVectorNumElements();
- SmallVector<Constant *, 16> NewMask(NumMaskElts, Zero);
+ unsigned NumMaskElts = Shuf.getType()->getNumElements();
+ SmallVector<int, 16> NewMask(NumMaskElts, 0);
for (unsigned i = 0; i != NumMaskElts; ++i)
- if (isa<UndefValue>(Mask->getAggregateElement(i)))
- NewMask[i] = Mask->getAggregateElement(i);
+ if (Mask[i] == UndefMaskElem)
+ NewMask[i] = Mask[i];
- return new ShuffleVectorInst(NewIns, UndefVec, ConstantVector::get(NewMask));
+ return new ShuffleVectorInst(NewIns, UndefVec, NewMask);
}
/// Try to fold shuffles that are the equivalent of a vector select.
@@ -1548,7 +1585,7 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
// Canonicalize to choose from operand 0 first unless operand 1 is undefined.
// Commuting undef to operand 0 conflicts with another canonicalization.
- unsigned NumElts = Shuf.getType()->getVectorNumElements();
+ unsigned NumElts = Shuf.getType()->getNumElements();
if (!isa<UndefValue>(Shuf.getOperand(1)) &&
Shuf.getMaskValue(0) >= (int)NumElts) {
// TODO: Can we assert that both operands of a shuffle-select are not undef
@@ -1605,14 +1642,14 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
BinaryOperator::BinaryOps BOpc = Opc0;
// Select the constant elements needed for the single binop.
- Constant *Mask = Shuf.getMask();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
Constant *NewC = ConstantExpr::getShuffleVector(C0, C1, Mask);
// We are moving a binop after a shuffle. When a shuffle has an undefined
// mask element, the result is undefined, but it is not poison or undefined
// behavior. That is not necessarily true for div/rem/shift.
bool MightCreatePoisonOrUB =
- Mask->containsUndefElement() &&
+ is_contained(Mask, UndefMaskElem) &&
(Instruction::isIntDivRem(BOpc) || Instruction::isShift(BOpc));
if (MightCreatePoisonOrUB)
NewC = getSafeVectorConstantForBinop(BOpc, NewC, ConstantsAreOp1);
@@ -1661,11 +1698,53 @@ static Instruction *foldSelectShuffle(ShuffleVectorInst &Shuf,
NewBO->andIRFlags(B1);
if (DropNSW)
NewBO->setHasNoSignedWrap(false);
- if (Mask->containsUndefElement() && !MightCreatePoisonOrUB)
+ if (is_contained(Mask, UndefMaskElem) && !MightCreatePoisonOrUB)
NewBO->dropPoisonGeneratingFlags();
return NewBO;
}
+/// Convert a narrowing shuffle of a bitcasted vector into a vector truncate.
+/// Example (little endian):
+/// shuf (bitcast <4 x i16> X to <8 x i8>), <0, 2, 4, 6> --> trunc X to <4 x i8>
+static Instruction *foldTruncShuffle(ShuffleVectorInst &Shuf,
+ bool IsBigEndian) {
+ // This must be a bitcasted shuffle of 1 vector integer operand.
+ Type *DestType = Shuf.getType();
+ Value *X;
+ if (!match(Shuf.getOperand(0), m_BitCast(m_Value(X))) ||
+ !match(Shuf.getOperand(1), m_Undef()) || !DestType->isIntOrIntVectorTy())
+ return nullptr;
+
+ // The source type must have the same number of elements as the shuffle,
+ // and the source element type must be larger than the shuffle element type.
+ Type *SrcType = X->getType();
+ if (!SrcType->isVectorTy() || !SrcType->isIntOrIntVectorTy() ||
+ cast<VectorType>(SrcType)->getNumElements() !=
+ cast<VectorType>(DestType)->getNumElements() ||
+ SrcType->getScalarSizeInBits() % DestType->getScalarSizeInBits() != 0)
+ return nullptr;
+
+ assert(Shuf.changesLength() && !Shuf.increasesLength() &&
+ "Expected a shuffle that decreases length");
+
+ // Last, check that the mask chooses the correct low bits for each narrow
+ // element in the result.
+ uint64_t TruncRatio =
+ SrcType->getScalarSizeInBits() / DestType->getScalarSizeInBits();
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ for (unsigned i = 0, e = Mask.size(); i != e; ++i) {
+ if (Mask[i] == UndefMaskElem)
+ continue;
+ uint64_t LSBIndex = IsBigEndian ? (i + 1) * TruncRatio - 1 : i * TruncRatio;
+ assert(LSBIndex <= std::numeric_limits<int32_t>::max() &&
+ "Overflowed 32-bits");
+ if (Mask[i] != (int)LSBIndex)
+ return nullptr;
+ }
+
+ return new TruncInst(X, DestType);
+}
+
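As an illustration of the new foldTruncShuffle fold (illustrative only; the function and value names below are invented, little-endian layout assumed):

  define <4 x i8> @shuf_bitcast_to_trunc(<4 x i16> %x) {
    %bc = bitcast <4 x i16> %x to <8 x i8>
    ; the mask selects the low byte of each i16 element
    %r = shufflevector <8 x i8> %bc, <8 x i8> undef, <4 x i32> <i32 0, i32 2, i32 4, i32 6>
    ret <4 x i8> %r
  }

With this patch the shuffle is expected to be rewritten as '%r = trunc <4 x i16> %x to <4 x i8>'.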
/// Match a shuffle-select-shuffle pattern where the shuffles are widening and
/// narrowing (concatenating with undef and extracting back to the original
/// length). This allows replacing the wide select with a narrow select.
@@ -1685,19 +1764,19 @@ static Instruction *narrowVectorSelect(ShuffleVectorInst &Shuf,
// We need a narrow condition value. It must be extended with undef elements
// and have the same number of elements as this shuffle.
- unsigned NarrowNumElts = Shuf.getType()->getVectorNumElements();
+ unsigned NarrowNumElts = Shuf.getType()->getNumElements();
Value *NarrowCond;
- if (!match(Cond, m_OneUse(m_ShuffleVector(m_Value(NarrowCond), m_Undef(),
- m_Constant()))) ||
- NarrowCond->getType()->getVectorNumElements() != NarrowNumElts ||
+ if (!match(Cond, m_OneUse(m_Shuffle(m_Value(NarrowCond), m_Undef()))) ||
+ cast<VectorType>(NarrowCond->getType())->getNumElements() !=
+ NarrowNumElts ||
!cast<ShuffleVectorInst>(Cond)->isIdentityWithPadding())
return nullptr;
// shuf (sel (shuf NarrowCond, undef, WideMask), X, Y), undef, NarrowMask) -->
// sel NarrowCond, (shuf X, undef, NarrowMask), (shuf Y, undef, NarrowMask)
Value *Undef = UndefValue::get(X->getType());
- Value *NarrowX = Builder.CreateShuffleVector(X, Undef, Shuf.getMask());
- Value *NarrowY = Builder.CreateShuffleVector(Y, Undef, Shuf.getMask());
+ Value *NarrowX = Builder.CreateShuffleVector(X, Undef, Shuf.getShuffleMask());
+ Value *NarrowY = Builder.CreateShuffleVector(Y, Undef, Shuf.getShuffleMask());
return SelectInst::Create(NarrowCond, NarrowX, NarrowY);
}
@@ -1708,8 +1787,8 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
return nullptr;
Value *X, *Y;
- Constant *Mask;
- if (!match(Op0, m_ShuffleVector(m_Value(X), m_Value(Y), m_Constant(Mask))))
+ ArrayRef<int> Mask;
+ if (!match(Op0, m_Shuffle(m_Value(X), m_Value(Y), m_Mask(Mask))))
return nullptr;
// Be conservative with shuffle transforms. If we can't kill the 1st shuffle,
@@ -1728,30 +1807,32 @@ static Instruction *foldIdentityExtractShuffle(ShuffleVectorInst &Shuf) {
// new shuffle mask. Otherwise, copy the original mask element. Example:
// shuf (shuf X, Y, <C0, C1, C2, undef, C4>), undef, <0, undef, 2, 3> -->
// shuf X, Y, <C0, undef, C2, undef>
- unsigned NumElts = Shuf.getType()->getVectorNumElements();
- SmallVector<Constant *, 16> NewMask(NumElts);
- assert(NumElts < Mask->getType()->getVectorNumElements() &&
+ unsigned NumElts = Shuf.getType()->getNumElements();
+ SmallVector<int, 16> NewMask(NumElts);
+ assert(NumElts < Mask.size() &&
"Identity with extract must have less elements than its inputs");
for (unsigned i = 0; i != NumElts; ++i) {
- Constant *ExtractMaskElt = Shuf.getMask()->getAggregateElement(i);
- Constant *MaskElt = Mask->getAggregateElement(i);
- NewMask[i] = isa<UndefValue>(ExtractMaskElt) ? ExtractMaskElt : MaskElt;
+ int ExtractMaskElt = Shuf.getMaskValue(i);
+ int MaskElt = Mask[i];
+ NewMask[i] = ExtractMaskElt == UndefMaskElem ? ExtractMaskElt : MaskElt;
}
- return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
+ return new ShuffleVectorInst(X, Y, NewMask);
}
/// Try to replace a shuffle with an insertelement or try to replace a shuffle
/// operand with the operand of an insertelement.
-static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) {
+static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf,
+ InstCombiner &IC) {
Value *V0 = Shuf.getOperand(0), *V1 = Shuf.getOperand(1);
- SmallVector<int, 16> Mask = Shuf.getShuffleMask();
+ SmallVector<int, 16> Mask;
+ Shuf.getShuffleMask(Mask);
// The shuffle must not change vector sizes.
// TODO: This restriction could be removed if the insert has only one use
// (because the transform would require a new length-changing shuffle).
int NumElts = Mask.size();
- if (NumElts != (int)(V0->getType()->getVectorNumElements()))
+ if (NumElts != (int)(cast<VectorType>(V0->getType())->getNumElements()))
return nullptr;
// This is a specialization of a fold in SimplifyDemandedVectorElts. We may
@@ -1761,29 +1842,25 @@ static Instruction *foldShuffleWithInsert(ShuffleVectorInst &Shuf) {
// operand with the source vector of the insertelement.
Value *X;
uint64_t IdxC;
- if (match(V0, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+ if (match(V0, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
// shuf (inselt X, ?, IdxC), ?, Mask --> shuf X, ?, Mask
- if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) {
- Shuf.setOperand(0, X);
- return &Shuf;
- }
+ if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; }))
+ return IC.replaceOperand(Shuf, 0, X);
}
- if (match(V1, m_InsertElement(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
+ if (match(V1, m_InsertElt(m_Value(X), m_Value(), m_ConstantInt(IdxC)))) {
// Offset the index constant by the vector width because we are checking for
// accesses to the 2nd vector input of the shuffle.
IdxC += NumElts;
// shuf ?, (inselt X, ?, IdxC), Mask --> shuf ?, X, Mask
- if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; })) {
- Shuf.setOperand(1, X);
- return &Shuf;
- }
+ if (none_of(Mask, [IdxC](int MaskElt) { return MaskElt == (int)IdxC; }))
+ return IC.replaceOperand(Shuf, 1, X);
}
// shuffle (insert ?, Scalar, IndexC), V1, Mask --> insert V1, Scalar, IndexC'
auto isShufflingScalarIntoOp1 = [&](Value *&Scalar, ConstantInt *&IndexC) {
// We need an insertelement with a constant index.
- if (!match(V0, m_InsertElement(m_Value(), m_Value(Scalar),
- m_ConstantInt(IndexC))))
+ if (!match(V0, m_InsertElt(m_Value(), m_Value(Scalar),
+ m_ConstantInt(IndexC))))
return false;
// Test the shuffle mask to see if it splices the inserted scalar into the
@@ -1850,9 +1927,9 @@ static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
Value *X = Shuffle0->getOperand(0);
Value *Y = Shuffle1->getOperand(0);
if (X->getType() != Y->getType() ||
- !isPowerOf2_32(Shuf.getType()->getVectorNumElements()) ||
- !isPowerOf2_32(Shuffle0->getType()->getVectorNumElements()) ||
- !isPowerOf2_32(X->getType()->getVectorNumElements()) ||
+ !isPowerOf2_32(Shuf.getType()->getNumElements()) ||
+ !isPowerOf2_32(Shuffle0->getType()->getNumElements()) ||
+ !isPowerOf2_32(cast<VectorType>(X->getType())->getNumElements()) ||
isa<UndefValue>(X) || isa<UndefValue>(Y))
return nullptr;
assert(isa<UndefValue>(Shuffle0->getOperand(1)) &&
@@ -1863,13 +1940,12 @@ static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
// operands directly by adjusting the shuffle mask to account for the narrower
// types:
// shuf (widen X), (widen Y), Mask --> shuf X, Y, Mask'
- int NarrowElts = X->getType()->getVectorNumElements();
- int WideElts = Shuffle0->getType()->getVectorNumElements();
+ int NarrowElts = cast<VectorType>(X->getType())->getNumElements();
+ int WideElts = Shuffle0->getType()->getNumElements();
assert(WideElts > NarrowElts && "Unexpected types for identity with padding");
- Type *I32Ty = IntegerType::getInt32Ty(Shuf.getContext());
- SmallVector<int, 16> Mask = Shuf.getShuffleMask();
- SmallVector<Constant *, 16> NewMask(Mask.size(), UndefValue::get(I32Ty));
+ ArrayRef<int> Mask = Shuf.getShuffleMask();
+ SmallVector<int, 16> NewMask(Mask.size(), -1);
for (int i = 0, e = Mask.size(); i != e; ++i) {
if (Mask[i] == -1)
continue;
@@ -1889,42 +1965,71 @@ static Instruction *foldIdentityPaddedShuffles(ShuffleVectorInst &Shuf) {
// element is offset down to adjust for the narrow vector widths.
if (Mask[i] < WideElts) {
assert(Mask[i] < NarrowElts && "Unexpected shuffle mask");
- NewMask[i] = ConstantInt::get(I32Ty, Mask[i]);
+ NewMask[i] = Mask[i];
} else {
assert(Mask[i] < (WideElts + NarrowElts) && "Unexpected shuffle mask");
- NewMask[i] = ConstantInt::get(I32Ty, Mask[i] - (WideElts - NarrowElts));
+ NewMask[i] = Mask[i] - (WideElts - NarrowElts);
}
}
- return new ShuffleVectorInst(X, Y, ConstantVector::get(NewMask));
+ return new ShuffleVectorInst(X, Y, NewMask);
}
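A sketch of the pattern foldIdentityPaddedShuffles rewrites, for reference (names invented; the exact output may depend on which fold fires first):

  define <4 x i16> @merge_padded_shuffles(<2 x i16> %x, <2 x i16> %y) {
    %wx = shufflevector <2 x i16> %x, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    %wy = shufflevector <2 x i16> %y, <2 x i16> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
    %r = shufflevector <4 x i16> %wx, <4 x i16> %wy, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
    ret <4 x i16> %r
  }

The widening shuffles are expected to disappear, leaving a single 'shufflevector <2 x i16> %x, <2 x i16> %y, <4 x i32> <i32 0, i32 1, i32 2, i32 3>' with the mask adjusted for the narrow operands.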
Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *LHS = SVI.getOperand(0);
Value *RHS = SVI.getOperand(1);
- if (auto *V = SimplifyShuffleVectorInst(
- LHS, RHS, SVI.getMask(), SVI.getType(), SQ.getWithInstruction(&SVI)))
+ SimplifyQuery ShufQuery = SQ.getWithInstruction(&SVI);
+ if (auto *V = SimplifyShuffleVectorInst(LHS, RHS, SVI.getShuffleMask(),
+ SVI.getType(), ShufQuery))
return replaceInstUsesWith(SVI, V);
// shuffle x, x, mask --> shuffle x, undef, mask'
- unsigned VWidth = SVI.getType()->getVectorNumElements();
- unsigned LHSWidth = LHS->getType()->getVectorNumElements();
- SmallVector<int, 16> Mask = SVI.getShuffleMask();
+ unsigned VWidth = SVI.getType()->getNumElements();
+ unsigned LHSWidth = cast<VectorType>(LHS->getType())->getNumElements();
+ ArrayRef<int> Mask = SVI.getShuffleMask();
Type *Int32Ty = Type::getInt32Ty(SVI.getContext());
+
+ // Peek through a bitcasted shuffle operand by scaling the mask. If the
+ // simulated shuffle can simplify, then this shuffle is unnecessary:
+ // shuf (bitcast X), undef, Mask --> bitcast X'
+ // TODO: This could be extended to allow length-changing shuffles.
+ // The transform might also be obsoleted if we allowed canonicalization
+ // of bitcasted shuffles.
+ Value *X;
+ if (match(LHS, m_BitCast(m_Value(X))) && match(RHS, m_Undef()) &&
+ X->getType()->isVectorTy() && VWidth == LHSWidth) {
+ // Try to create a scaled mask constant.
+ auto *XType = cast<VectorType>(X->getType());
+ unsigned XNumElts = XType->getNumElements();
+ SmallVector<int, 16> ScaledMask;
+ if (XNumElts >= VWidth) {
+ assert(XNumElts % VWidth == 0 && "Unexpected vector bitcast");
+ narrowShuffleMaskElts(XNumElts / VWidth, Mask, ScaledMask);
+ } else {
+ assert(VWidth % XNumElts == 0 && "Unexpected vector bitcast");
+ if (!widenShuffleMaskElts(VWidth / XNumElts, Mask, ScaledMask))
+ ScaledMask.clear();
+ }
+ if (!ScaledMask.empty()) {
+ // If the shuffled source vector simplifies, cast that value to this
+ // shuffle's type.
+ if (auto *V = SimplifyShuffleVectorInst(X, UndefValue::get(XType),
+ ScaledMask, XType, ShufQuery))
+ return BitCastInst::Create(Instruction::BitCast, V, SVI.getType());
+ }
+ }
+
if (LHS == RHS) {
assert(!isa<UndefValue>(RHS) && "Shuffle with 2 undef ops not simplified?");
// Remap any references to RHS to use LHS.
- SmallVector<Constant*, 16> Elts;
+ SmallVector<int, 16> Elts;
for (unsigned i = 0; i != VWidth; ++i) {
// Propagate undef elements or force mask to LHS.
if (Mask[i] < 0)
- Elts.push_back(UndefValue::get(Int32Ty));
+ Elts.push_back(UndefMaskElem);
else
- Elts.push_back(ConstantInt::get(Int32Ty, Mask[i] % LHSWidth));
+ Elts.push_back(Mask[i] % LHSWidth);
}
- SVI.setOperand(0, SVI.getOperand(1));
- SVI.setOperand(1, UndefValue::get(RHS->getType()));
- SVI.setOperand(2, ConstantVector::get(Elts));
- return &SVI;
+ return new ShuffleVectorInst(LHS, UndefValue::get(RHS->getType()), Elts);
}
// shuffle undef, x, mask --> shuffle x, undef, mask'
@@ -1939,6 +2044,9 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (Instruction *I = foldSelectShuffle(SVI, Builder, DL))
return I;
+ if (Instruction *I = foldTruncShuffle(SVI, DL.isBigEndian()))
+ return I;
+
if (Instruction *I = narrowVectorSelect(SVI, Builder))
return I;
@@ -1955,7 +2063,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
// These transforms have the potential to lose undef knowledge, so they are
// intentionally placed after SimplifyDemandedVectorElts().
- if (Instruction *I = foldShuffleWithInsert(SVI))
+ if (Instruction *I = foldShuffleWithInsert(SVI, *this))
return I;
if (Instruction *I = foldIdentityPaddedShuffles(SVI))
return I;
@@ -1999,7 +2107,7 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
Value *V = LHS;
unsigned MaskElems = Mask.size();
VectorType *SrcTy = cast<VectorType>(V->getType());
- unsigned VecBitWidth = SrcTy->getBitWidth();
+ unsigned VecBitWidth = SrcTy->getPrimitiveSizeInBits().getFixedSize();
unsigned SrcElemBitWidth = DL.getTypeSizeInBits(SrcTy->getElementType());
assert(SrcElemBitWidth && "vector elements must have a bitwidth");
unsigned SrcNumElems = SrcTy->getNumElements();
@@ -2023,16 +2131,15 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
continue;
if (!VectorType::isValidElementType(TgtTy))
continue;
- VectorType *CastSrcTy = VectorType::get(TgtTy, TgtNumElems);
+ auto *CastSrcTy = FixedVectorType::get(TgtTy, TgtNumElems);
if (!BegIsAligned) {
// Shuffle the input so [0,NumElements) contains the output, and
// [NumElems,SrcNumElems) is undef.
- SmallVector<Constant *, 16> ShuffleMask(SrcNumElems,
- UndefValue::get(Int32Ty));
+ SmallVector<int, 16> ShuffleMask(SrcNumElems, -1);
for (unsigned I = 0, E = MaskElems, Idx = BegIdx; I != E; ++Idx, ++I)
- ShuffleMask[I] = ConstantInt::get(Int32Ty, Idx);
+ ShuffleMask[I] = Idx;
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(ShuffleMask),
+ ShuffleMask,
SVI.getName() + ".extract");
BegIdx = 0;
}
@@ -2117,11 +2224,11 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (LHSShuffle) {
LHSOp0 = LHSShuffle->getOperand(0);
LHSOp1 = LHSShuffle->getOperand(1);
- LHSOp0Width = LHSOp0->getType()->getVectorNumElements();
+ LHSOp0Width = cast<VectorType>(LHSOp0->getType())->getNumElements();
}
if (RHSShuffle) {
RHSOp0 = RHSShuffle->getOperand(0);
- RHSOp0Width = RHSOp0->getType()->getVectorNumElements();
+ RHSOp0Width = cast<VectorType>(RHSOp0->getType())->getNumElements();
}
Value* newLHS = LHS;
Value* newRHS = RHS;
@@ -2149,8 +2256,8 @@ Instruction *InstCombiner::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
if (newLHS == LHS && newRHS == RHS)
return MadeChange ? &SVI : nullptr;
- SmallVector<int, 16> LHSMask;
- SmallVector<int, 16> RHSMask;
+ ArrayRef<int> LHSMask;
+ ArrayRef<int> RHSMask;
if (newLHS != LHS)
LHSMask = LHSShuffle->getShuffleMask();
if (RHSShuffle && newRHS != RHS)
diff --git a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
index 801c09a317a7..b3254c10a0b2 100644
--- a/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstructionCombining.cpp
@@ -60,6 +60,7 @@
#include "llvm/Analysis/TargetFolder.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
@@ -129,10 +130,6 @@ static cl::opt<bool>
EnableCodeSinking("instcombine-code-sinking", cl::desc("Enable code sinking"),
cl::init(true));
-static cl::opt<bool>
-EnableExpensiveCombines("expensive-combines",
- cl::desc("Enable expensive instruction combines"));
-
static cl::opt<unsigned> LimitMaxIterations(
"instcombine-max-iterations",
cl::desc("Limit the maximum number of instruction combining iterations"),
@@ -267,7 +264,7 @@ static void ClearSubclassDataAfterReassociation(BinaryOperator &I) {
/// cast to eliminate one of the associative operations:
/// (op (cast (op X, C2)), C1) --> (cast (op X, op (C1, C2)))
/// (op (cast (op X, C2)), C1) --> (op (cast X), op (C1, C2))
-static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1) {
+static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1, InstCombiner &IC) {
auto *Cast = dyn_cast<CastInst>(BinOp1->getOperand(0));
if (!Cast || !Cast->hasOneUse())
return false;
@@ -300,8 +297,8 @@ static bool simplifyAssocCastAssoc(BinaryOperator *BinOp1) {
Type *DestTy = C1->getType();
Constant *CastC2 = ConstantExpr::getCast(CastOpcode, C2, DestTy);
Constant *FoldedC = ConstantExpr::get(AssocOpcode, C1, CastC2);
- Cast->setOperand(0, BinOp2->getOperand(0));
- BinOp1->setOperand(1, FoldedC);
+ IC.replaceOperand(*Cast, 0, BinOp2->getOperand(0));
+ IC.replaceOperand(*BinOp1, 1, FoldedC);
return true;
}
@@ -350,8 +347,8 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Does "B op C" simplify?
if (Value *V = SimplifyBinOp(Opcode, B, C, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "A op V".
- I.setOperand(0, A);
- I.setOperand(1, V);
+ replaceOperand(I, 0, A);
+ replaceOperand(I, 1, V);
bool IsNUW = hasNoUnsignedWrap(I) && hasNoUnsignedWrap(*Op0);
bool IsNSW = maintainNoSignedWrap(I, B, C) && hasNoSignedWrap(*Op0);
@@ -383,8 +380,8 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Does "A op B" simplify?
if (Value *V = SimplifyBinOp(Opcode, A, B, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "V op C".
- I.setOperand(0, V);
- I.setOperand(1, C);
+ replaceOperand(I, 0, V);
+ replaceOperand(I, 1, C);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
@@ -396,7 +393,7 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
}
if (I.isAssociative() && I.isCommutative()) {
- if (simplifyAssocCastAssoc(&I)) {
+ if (simplifyAssocCastAssoc(&I, *this)) {
Changed = true;
++NumReassoc;
continue;
@@ -411,8 +408,8 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Does "C op A" simplify?
if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "V op B".
- I.setOperand(0, V);
- I.setOperand(1, B);
+ replaceOperand(I, 0, V);
+ replaceOperand(I, 1, B);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
@@ -431,8 +428,8 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
// Does "C op A" simplify?
if (Value *V = SimplifyBinOp(Opcode, C, A, SQ.getWithInstruction(&I))) {
// It simplifies to V. Form "B op V".
- I.setOperand(0, B);
- I.setOperand(1, V);
+ replaceOperand(I, 0, B);
+ replaceOperand(I, 1, V);
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
@@ -465,8 +462,8 @@ bool InstCombiner::SimplifyAssociativeOrCommutative(BinaryOperator &I) {
}
InsertNewInstWith(NewBO, I);
NewBO->takeName(Op1);
- I.setOperand(0, NewBO);
- I.setOperand(1, ConstantExpr::get(Opcode, C1, C2));
+ replaceOperand(I, 0, NewBO);
+ replaceOperand(I, 1, ConstantExpr::get(Opcode, C1, C2));
// Conservatively clear the optional flags, since they may not be
// preserved by the reassociation.
ClearSubclassDataAfterReassociation(I);
@@ -925,8 +922,31 @@ Instruction *InstCombiner::FoldOpIntoSelect(Instruction &Op, SelectInst *SI) {
if (auto *CI = dyn_cast<CmpInst>(SI->getCondition())) {
if (CI->hasOneUse()) {
Value *Op0 = CI->getOperand(0), *Op1 = CI->getOperand(1);
- if ((SI->getOperand(1) == Op0 && SI->getOperand(2) == Op1) ||
- (SI->getOperand(2) == Op0 && SI->getOperand(1) == Op1))
+
+ // FIXME: This is a hack to avoid infinite looping with min/max patterns.
+ // We have to ensure that vector constants that only differ with
+ // undef elements are treated as equivalent.
+ auto areLooselyEqual = [](Value *A, Value *B) {
+ if (A == B)
+ return true;
+
+ // Test for vector constants.
+ Constant *ConstA, *ConstB;
+ if (!match(A, m_Constant(ConstA)) || !match(B, m_Constant(ConstB)))
+ return false;
+
+ // TODO: Deal with FP constants?
+ if (!A->getType()->isIntOrIntVectorTy() || A->getType() != B->getType())
+ return false;
+
+ // Compare for equality including undefs as equal.
+ auto *Cmp = ConstantExpr::getCompare(ICmpInst::ICMP_EQ, ConstA, ConstB);
+ const APInt *C;
+ return match(Cmp, m_APIntAllowUndef(C)) && C->isOneValue();
+ };
+
+ if ((areLooselyEqual(TV, Op0) && areLooselyEqual(FV, Op1)) ||
+ (areLooselyEqual(FV, Op0) && areLooselyEqual(TV, Op1)))
return nullptr;
}
}
@@ -951,7 +971,7 @@ static Value *foldOperationIntoPhiValue(BinaryOperator *I, Value *InV,
if (!ConstIsRHS)
std::swap(Op0, Op1);
- Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phitmp");
+ Value *RI = Builder.CreateBinOp(I->getOpcode(), Op0, Op1, "phi.bo");
auto *FPInst = dyn_cast<Instruction>(RI);
if (FPInst && isa<FPMathOperator>(FPInst))
FPInst->copyFastMathFlags(I);
@@ -1056,7 +1076,7 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
// the select would be generated exactly once in the NonConstBB.
Builder.SetInsertPoint(ThisBB->getTerminator());
InV = Builder.CreateSelect(PN->getIncomingValue(i), TrueVInPred,
- FalseVInPred, "phitmp");
+ FalseVInPred, "phi.sel");
}
NewPN->addIncoming(InV, ThisBB);
}
@@ -1064,14 +1084,11 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
Constant *C = cast<Constant>(I.getOperand(1));
for (unsigned i = 0; i != NumPHIValues; ++i) {
Value *InV = nullptr;
- if (Constant *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
+ if (auto *InC = dyn_cast<Constant>(PN->getIncomingValue(i)))
InV = ConstantExpr::getCompare(CI->getPredicate(), InC, C);
- else if (isa<ICmpInst>(CI))
- InV = Builder.CreateICmp(CI->getPredicate(), PN->getIncomingValue(i),
- C, "phitmp");
else
- InV = Builder.CreateFCmp(CI->getPredicate(), PN->getIncomingValue(i),
- C, "phitmp");
+ InV = Builder.CreateCmp(CI->getPredicate(), PN->getIncomingValue(i),
+ C, "phi.cmp");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
} else if (auto *BO = dyn_cast<BinaryOperator>(&I)) {
@@ -1089,7 +1106,7 @@ Instruction *InstCombiner::foldOpIntoPhi(Instruction &I, PHINode *PN) {
InV = ConstantExpr::getCast(CI->getOpcode(), InC, RetTy);
else
InV = Builder.CreateCast(CI->getOpcode(), PN->getIncomingValue(i),
- I.getType(), "phitmp");
+ I.getType(), "phi.cast");
NewPN->addIncoming(InV, PN->getIncomingBlock(i));
}
}
@@ -1391,8 +1408,8 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
assert(Parent.first->hasOneUse() && "Drilled down when more than one use!");
assert(Op != Parent.first->getOperand(Parent.second) &&
"Descaling was a no-op?");
- Parent.first->setOperand(Parent.second, Op);
- Worklist.Add(Parent.first);
+ replaceOperand(*Parent.first, Parent.second, Op);
+ Worklist.push(Parent.first);
// Now work back up the expression correcting nsw flags. The logic is based
// on the following observation: if X * Y is known not to overflow as a signed
@@ -1410,7 +1427,7 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
NoSignedWrap &= OpNoSignedWrap;
if (NoSignedWrap != OpNoSignedWrap) {
BO->setHasNoSignedWrap(NoSignedWrap);
- Worklist.Add(Ancestor);
+ Worklist.push(Ancestor);
}
} else if (Ancestor->getOpcode() == Instruction::Trunc) {
// The fact that the descaled input to the trunc has smaller absolute
@@ -1432,21 +1449,24 @@ Value *InstCombiner::Descale(Value *Val, APInt Scale, bool &NoSignedWrap) {
}
Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
- if (!Inst.getType()->isVectorTy()) return nullptr;
+ // FIXME: some of this is likely fine for scalable vectors
+ if (!isa<FixedVectorType>(Inst.getType()))
+ return nullptr;
BinaryOperator::BinaryOps Opcode = Inst.getOpcode();
- unsigned NumElts = cast<VectorType>(Inst.getType())->getNumElements();
Value *LHS = Inst.getOperand(0), *RHS = Inst.getOperand(1);
- assert(cast<VectorType>(LHS->getType())->getNumElements() == NumElts);
- assert(cast<VectorType>(RHS->getType())->getNumElements() == NumElts);
+ assert(cast<VectorType>(LHS->getType())->getElementCount() ==
+ cast<VectorType>(Inst.getType())->getElementCount());
+ assert(cast<VectorType>(RHS->getType())->getElementCount() ==
+ cast<VectorType>(Inst.getType())->getElementCount());
// If both operands of the binop are vector concatenations, then perform the
// narrow binop on each pair of the source operands followed by concatenation
// of the results.
Value *L0, *L1, *R0, *R1;
- Constant *Mask;
- if (match(LHS, m_ShuffleVector(m_Value(L0), m_Value(L1), m_Constant(Mask))) &&
- match(RHS, m_ShuffleVector(m_Value(R0), m_Value(R1), m_Specific(Mask))) &&
+ ArrayRef<int> Mask;
+ if (match(LHS, m_Shuffle(m_Value(L0), m_Value(L1), m_Mask(Mask))) &&
+ match(RHS, m_Shuffle(m_Value(R0), m_Value(R1), m_SpecificMask(Mask))) &&
LHS->hasOneUse() && RHS->hasOneUse() &&
cast<ShuffleVectorInst>(LHS)->isConcat() &&
cast<ShuffleVectorInst>(RHS)->isConcat()) {
@@ -1470,7 +1490,7 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
if (!isSafeToSpeculativelyExecute(&Inst))
return nullptr;
- auto createBinOpShuffle = [&](Value *X, Value *Y, Constant *M) {
+ auto createBinOpShuffle = [&](Value *X, Value *Y, ArrayRef<int> M) {
Value *XY = Builder.CreateBinOp(Opcode, X, Y);
if (auto *BO = dyn_cast<BinaryOperator>(XY))
BO->copyIRFlags(&Inst);
@@ -1480,8 +1500,8 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
// If both arguments of the binary operation are shuffles that use the same
// mask and shuffle within a single vector, move the shuffle after the binop.
Value *V1, *V2;
- if (match(LHS, m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))) &&
- match(RHS, m_ShuffleVector(m_Value(V2), m_Undef(), m_Specific(Mask))) &&
+ if (match(LHS, m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))) &&
+ match(RHS, m_Shuffle(m_Value(V2), m_Undef(), m_SpecificMask(Mask))) &&
V1->getType() == V2->getType() &&
(LHS->hasOneUse() || RHS->hasOneUse() || LHS == RHS)) {
// Op(shuffle(V1, Mask), shuffle(V2, Mask)) -> shuffle(Op(V1, V2), Mask)
@@ -1491,17 +1511,19 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
// If both arguments of a commutative binop are select-shuffles that use the
// same mask with commuted operands, the shuffles are unnecessary.
if (Inst.isCommutative() &&
- match(LHS, m_ShuffleVector(m_Value(V1), m_Value(V2), m_Constant(Mask))) &&
- match(RHS, m_ShuffleVector(m_Specific(V2), m_Specific(V1),
- m_Specific(Mask)))) {
+ match(LHS, m_Shuffle(m_Value(V1), m_Value(V2), m_Mask(Mask))) &&
+ match(RHS,
+ m_Shuffle(m_Specific(V2), m_Specific(V1), m_SpecificMask(Mask)))) {
auto *LShuf = cast<ShuffleVectorInst>(LHS);
auto *RShuf = cast<ShuffleVectorInst>(RHS);
// TODO: Allow shuffles that contain undefs in the mask?
// That is legal, but it reduces undef knowledge.
// TODO: Allow arbitrary shuffles by shuffling after binop?
// That might be legal, but we have to deal with poison.
- if (LShuf->isSelect() && !LShuf->getMask()->containsUndefElement() &&
- RShuf->isSelect() && !RShuf->getMask()->containsUndefElement()) {
+ if (LShuf->isSelect() &&
+ !is_contained(LShuf->getShuffleMask(), UndefMaskElem) &&
+ RShuf->isSelect() &&
+ !is_contained(RShuf->getShuffleMask(), UndefMaskElem)) {
// Example:
// LHS = shuffle V1, V2, <0, 5, 6, 3>
// RHS = shuffle V2, V1, <0, 5, 6, 3>
@@ -1517,11 +1539,12 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
// intends to move shuffles closer to other shuffles and binops closer to
// other binops, so they can be folded. It may also enable demanded elements
// transforms.
+ unsigned NumElts = cast<FixedVectorType>(Inst.getType())->getNumElements();
Constant *C;
- if (match(&Inst, m_c_BinOp(
- m_OneUse(m_ShuffleVector(m_Value(V1), m_Undef(), m_Constant(Mask))),
- m_Constant(C))) &&
- V1->getType()->getVectorNumElements() <= NumElts) {
+ if (match(&Inst,
+ m_c_BinOp(m_OneUse(m_Shuffle(m_Value(V1), m_Undef(), m_Mask(Mask))),
+ m_Constant(C))) &&
+ cast<FixedVectorType>(V1->getType())->getNumElements() <= NumElts) {
assert(Inst.getType()->getScalarType() == V1->getType()->getScalarType() &&
"Shuffle should not change scalar type");
@@ -1531,9 +1554,9 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
// reorder is not possible. A 1-to-1 mapping is not required. Example:
// ShMask = <1,1,2,2> and C = <5,5,6,6> --> NewC = <undef,5,6,undef>
bool ConstOp1 = isa<Constant>(RHS);
- SmallVector<int, 16> ShMask;
- ShuffleVectorInst::getShuffleMask(Mask, ShMask);
- unsigned SrcVecNumElts = V1->getType()->getVectorNumElements();
+ ArrayRef<int> ShMask = Mask;
+ unsigned SrcVecNumElts =
+ cast<FixedVectorType>(V1->getType())->getNumElements();
UndefValue *UndefScalar = UndefValue::get(C->getType()->getScalarType());
SmallVector<Constant *, 16> NewVecC(SrcVecNumElts, UndefScalar);
bool MayChange = true;
@@ -1590,6 +1613,57 @@ Instruction *InstCombiner::foldVectorBinop(BinaryOperator &Inst) {
}
}
+ // Try to reassociate to sink a splat shuffle after a binary operation.
+ if (Inst.isAssociative() && Inst.isCommutative()) {
+ // Canonicalize shuffle operand as LHS.
+ if (isa<ShuffleVectorInst>(RHS))
+ std::swap(LHS, RHS);
+
+ Value *X;
+ ArrayRef<int> MaskC;
+ int SplatIndex;
+ BinaryOperator *BO;
+ if (!match(LHS,
+ m_OneUse(m_Shuffle(m_Value(X), m_Undef(), m_Mask(MaskC)))) ||
+ !match(MaskC, m_SplatOrUndefMask(SplatIndex)) ||
+ X->getType() != Inst.getType() || !match(RHS, m_OneUse(m_BinOp(BO))) ||
+ BO->getOpcode() != Opcode)
+ return nullptr;
+
+ // FIXME: This may not be safe if the analysis allows undef elements. By
+ // moving 'Y' before the splat shuffle, we are implicitly assuming
+ // that it is not undef/poison at the splat index.
+ Value *Y, *OtherOp;
+ if (isSplatValue(BO->getOperand(0), SplatIndex)) {
+ Y = BO->getOperand(0);
+ OtherOp = BO->getOperand(1);
+ } else if (isSplatValue(BO->getOperand(1), SplatIndex)) {
+ Y = BO->getOperand(1);
+ OtherOp = BO->getOperand(0);
+ } else {
+ return nullptr;
+ }
+
+ // X and Y are splatted values, so perform the binary operation on those
+ // values followed by a splat followed by the 2nd binary operation:
+ // bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp
+ Value *NewBO = Builder.CreateBinOp(Opcode, X, Y);
+ UndefValue *Undef = UndefValue::get(Inst.getType());
+ SmallVector<int, 8> NewMask(MaskC.size(), SplatIndex);
+ Value *NewSplat = Builder.CreateShuffleVector(NewBO, Undef, NewMask);
+ Instruction *R = BinaryOperator::Create(Opcode, NewSplat, OtherOp);
+
+ // Intersect FMF on both new binops. Other (poison-generating) flags are
+ // dropped to be safe.
+ if (isa<FPMathOperator>(R)) {
+ R->copyFastMathFlags(&Inst);
+ R->andIRFlags(BO);
+ }
+ if (auto *NewInstBO = dyn_cast<BinaryOperator>(NewBO))
+ NewInstBO->copyIRFlags(R);
+ return R;
+ }
+
return nullptr;
}
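To make the new splat-reassociation concrete, here is the kind of input it targets (a rough sketch; names are invented and later folds may clean up the result further):

  define <4 x i32> @sink_splat(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
    %xsplat = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> zeroinitializer
    %ysplat = shufflevector <4 x i32> %y, <4 x i32> undef, <4 x i32> zeroinitializer
    %inner = add <4 x i32> %ysplat, %z
    %r = add <4 x i32> %xsplat, %inner
    ret <4 x i32> %r
  }

Here X is %x, Y is %ysplat and OtherOp is %z, so following 'bo (splat X), (bo Y, OtherOp) --> bo (splat (bo X, Y)), OtherOp' the adds are regrouped into an add of %x and %ysplat, a single splat of that sum, and one trailing add of %z.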
@@ -1658,16 +1732,46 @@ static bool isMergedGEPInBounds(GEPOperator &GEP1, GEPOperator &GEP2) {
(GEP2.isInBounds() || GEP2.hasAllZeroIndices());
}
+/// Thread a GEP operation with constant indices through the constant true/false
+/// arms of a select.
+static Instruction *foldSelectGEP(GetElementPtrInst &GEP,
+ InstCombiner::BuilderTy &Builder) {
+ if (!GEP.hasAllConstantIndices())
+ return nullptr;
+
+ Instruction *Sel;
+ Value *Cond;
+ Constant *TrueC, *FalseC;
+ if (!match(GEP.getPointerOperand(), m_Instruction(Sel)) ||
+ !match(Sel,
+ m_Select(m_Value(Cond), m_Constant(TrueC), m_Constant(FalseC))))
+ return nullptr;
+
+ // gep (select Cond, TrueC, FalseC), IndexC --> select Cond, TrueC', FalseC'
+ // Propagate 'inbounds' and metadata from existing instructions.
+ // Note: using IRBuilder to create the constants for efficiency.
+ SmallVector<Value *, 4> IndexC(GEP.idx_begin(), GEP.idx_end());
+ bool IsInBounds = GEP.isInBounds();
+ Value *NewTrueC = IsInBounds ? Builder.CreateInBoundsGEP(TrueC, IndexC)
+ : Builder.CreateGEP(TrueC, IndexC);
+ Value *NewFalseC = IsInBounds ? Builder.CreateInBoundsGEP(FalseC, IndexC)
+ : Builder.CreateGEP(FalseC, IndexC);
+ return SelectInst::Create(Cond, NewTrueC, NewFalseC, "", nullptr, Sel);
+}
+
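A minimal example of the pattern the new foldSelectGEP helper handles (global and value names are hypothetical):

  @a = global [4 x i32] zeroinitializer
  @b = global [4 x i32] zeroinitializer

  define i32* @gep_of_select(i1 %cond) {
    %sel = select i1 %cond, [4 x i32]* @a, [4 x i32]* @b
    %gep = getelementptr inbounds [4 x i32], [4 x i32]* %sel, i64 0, i64 2
    ret i32* %gep
  }

Because every GEP index is constant and both select arms are constants, the GEP can be threaded into the select, with each arm folding to a constant inbounds GEP expression.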
Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
SmallVector<Value*, 8> Ops(GEP.op_begin(), GEP.op_end());
Type *GEPType = GEP.getType();
Type *GEPEltType = GEP.getSourceElementType();
+ bool IsGEPSrcEleScalable = isa<ScalableVectorType>(GEPEltType);
if (Value *V = SimplifyGEPInst(GEPEltType, Ops, SQ.getWithInstruction(&GEP)))
return replaceInstUsesWith(GEP, V);
// For vector geps, use the generic demanded vector support.
- if (GEP.getType()->isVectorTy()) {
- auto VWidth = GEP.getType()->getVectorNumElements();
+ // Skip if GEP return type is scalable. The number of elements is unknown at
+ // compile-time.
+ if (auto *GEPFVTy = dyn_cast<FixedVectorType>(GEPType)) {
+ auto VWidth = GEPFVTy->getNumElements();
APInt UndefElts(VWidth, 0);
APInt AllOnesEltMask(APInt::getAllOnesValue(VWidth));
if (Value *V = SimplifyDemandedVectorElts(&GEP, AllOnesEltMask,
@@ -1679,7 +1783,7 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// TODO: 1) Scalarize splat operands, 2) scalarize entire instruction if
// possible (decide on canonical form for pointer broadcast), 3) exploit
- // undef elements to decrease demanded bits
+ // undef elements to decrease demanded bits
}
Value *PtrOp = GEP.getOperand(0);
@@ -1703,13 +1807,14 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
Type *IndexTy = (*I)->getType();
Type *NewIndexType =
IndexTy->isVectorTy()
- ? VectorType::get(NewScalarIndexTy, IndexTy->getVectorNumElements())
+ ? VectorType::get(NewScalarIndexTy,
+ cast<VectorType>(IndexTy)->getElementCount())
: NewScalarIndexTy;
// If the element type has zero size then any index over it is equivalent
// to an index of zero, so replace it with zero if it is not zero already.
Type *EltTy = GTI.getIndexedType();
- if (EltTy->isSized() && DL.getTypeAllocSize(EltTy) == 0)
+ if (EltTy->isSized() && DL.getTypeAllocSize(EltTy).isZero())
if (!isa<Constant>(*I) || !match(I->get(), m_Zero())) {
*I = Constant::getNullValue(NewIndexType);
MadeChange = true;
@@ -1789,10 +1894,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (J > 0) {
if (J == 1) {
CurTy = Op1->getSourceElementType();
- } else if (auto *CT = dyn_cast<CompositeType>(CurTy)) {
- CurTy = CT->getTypeAtIndex(Op1->getOperand(J));
} else {
- CurTy = nullptr;
+ CurTy =
+ GetElementPtrInst::getTypeAtIndex(CurTy, Op1->getOperand(J));
}
}
}
@@ -1808,8 +1912,6 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (DI == -1) {
// All the GEPs feeding the PHI are identical. Clone one down into our
// BB so that it can be merged with the current GEP.
- GEP.getParent()->getInstList().insert(
- GEP.getParent()->getFirstInsertionPt(), NewGEP);
} else {
// All the GEPs feeding the PHI differ at a single offset. Clone a GEP
// into the current block so it can be merged, and create a new PHI to
@@ -1827,12 +1929,11 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
PN->getIncomingBlock(I));
NewGEP->setOperand(DI, NewPN);
- GEP.getParent()->getInstList().insert(
- GEP.getParent()->getFirstInsertionPt(), NewGEP);
- NewGEP->setOperand(DI, NewPN);
}
- GEP.setOperand(0, NewGEP);
+ GEP.getParent()->getInstList().insert(
+ GEP.getParent()->getFirstInsertionPt(), NewGEP);
+ replaceOperand(GEP, 0, NewGEP);
PtrOp = NewGEP;
}
@@ -1932,8 +2033,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// Update the GEP in place if possible.
if (Src->getNumOperands() == 2) {
GEP.setIsInBounds(isMergedGEPInBounds(*Src, *cast<GEPOperator>(&GEP)));
- GEP.setOperand(0, Src->getOperand(0));
- GEP.setOperand(1, Sum);
+ replaceOperand(GEP, 0, Src->getOperand(0));
+ replaceOperand(GEP, 1, Sum);
return &GEP;
}
Indices.append(Src->op_begin()+1, Src->op_end()-1);
@@ -1957,11 +2058,13 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
GEP.getName());
}
- if (GEP.getNumIndices() == 1) {
+ // Skip if GEP source element type is scalable. The type alloc size is unknown
+ // at compile-time.
+ if (GEP.getNumIndices() == 1 && !IsGEPSrcEleScalable) {
unsigned AS = GEP.getPointerAddressSpace();
if (GEP.getOperand(1)->getType()->getScalarSizeInBits() ==
DL.getIndexSizeInBits(AS)) {
- uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType);
+ uint64_t TyAllocSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
bool Matched = false;
uint64_t C;
@@ -2051,9 +2154,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// array. Because the array type is never stepped over (there
// is a leading zero) we can fold the cast into this GEP.
if (StrippedPtrTy->getAddressSpace() == GEP.getAddressSpace()) {
- GEP.setOperand(0, StrippedPtr);
GEP.setSourceElementType(XATy);
- return &GEP;
+ return replaceOperand(GEP, 0, StrippedPtr);
}
// Cannot replace the base pointer directly because StrippedPtr's
// address space is different. Instead, create a new GEP followed by
@@ -2075,10 +2177,12 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
}
- } else if (GEP.getNumOperands() == 2) {
- // Transform things like:
- // %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
- // into: %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
+ } else if (GEP.getNumOperands() == 2 && !IsGEPSrcEleScalable) {
+ // Skip if GEP source element type is scalable. The type alloc size is
+ // unknown at compile-time.
+ // Transform things like:
+ //   %t = getelementptr i32* bitcast ([2 x i32]* %str to i32*), i32 %V
+ // into:
+ //   %t1 = getelementptr [2 x i32]* %str, i32 0, i32 %V; bitcast
if (StrippedPtrEltTy->isArrayTy() &&
DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType()) ==
DL.getTypeAllocSize(GEPEltType)) {
@@ -2102,8 +2206,8 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (GEPEltType->isSized() && StrippedPtrEltTy->isSized()) {
// Check that changing the type amounts to dividing the index by a scale
// factor.
- uint64_t ResSize = DL.getTypeAllocSize(GEPEltType);
- uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy);
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
+ uint64_t SrcSize = DL.getTypeAllocSize(StrippedPtrEltTy).getFixedSize();
if (ResSize && SrcSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -2142,9 +2246,10 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
StrippedPtrEltTy->isArrayTy()) {
// Check that changing to the array element type amounts to dividing the
// index by a scale factor.
- uint64_t ResSize = DL.getTypeAllocSize(GEPEltType);
+ uint64_t ResSize = DL.getTypeAllocSize(GEPEltType).getFixedSize();
uint64_t ArrayEltSize =
- DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType());
+ DL.getTypeAllocSize(StrippedPtrEltTy->getArrayElementType())
+ .getFixedSize();
if (ResSize && ArrayEltSize % ResSize == 0) {
Value *Idx = GEP.getOperand(1);
unsigned BitWidth = Idx->getType()->getPrimitiveSizeInBits();
@@ -2203,8 +2308,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
// gep (bitcast [c x ty]* X to <c x ty>*), Y, Z --> gep X, Y, Z
auto areMatchingArrayAndVecTypes = [](Type *ArrTy, Type *VecTy,
const DataLayout &DL) {
- return ArrTy->getArrayElementType() == VecTy->getVectorElementType() &&
- ArrTy->getArrayNumElements() == VecTy->getVectorNumElements() &&
+ auto *VecVTy = cast<VectorType>(VecTy);
+ return ArrTy->getArrayElementType() == VecVTy->getElementType() &&
+ ArrTy->getArrayNumElements() == VecVTy->getNumElements() &&
DL.getTypeAllocSize(ArrTy) == DL.getTypeAllocSize(VecTy);
};
if (GEP.getNumOperands() == 3 &&
@@ -2291,7 +2397,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
if (auto *AI = dyn_cast<AllocaInst>(UnderlyingPtrOp)) {
if (GEP.accumulateConstantOffset(DL, BasePtrOffset) &&
BasePtrOffset.isNonNegative()) {
- APInt AllocSize(IdxWidth, DL.getTypeAllocSize(AI->getAllocatedType()));
+ APInt AllocSize(
+ IdxWidth,
+ DL.getTypeAllocSize(AI->getAllocatedType()).getKnownMinSize());
if (BasePtrOffset.ule(AllocSize)) {
return GetElementPtrInst::CreateInBounds(
GEP.getSourceElementType(), PtrOp, makeArrayRef(Ops).slice(1),
@@ -2301,6 +2409,9 @@ Instruction *InstCombiner::visitGetElementPtrInst(GetElementPtrInst &GEP) {
}
}
+ if (Instruction *R = foldSelectGEP(GEP, Builder))
+ return R;
+
return nullptr;
}
@@ -2369,6 +2480,7 @@ static bool isAllocSiteRemovable(Instruction *AI,
return false;
LLVM_FALLTHROUGH;
}
+ case Intrinsic::assume:
case Intrinsic::invariant_start:
case Intrinsic::invariant_end:
case Intrinsic::lifetime_start:
@@ -2517,7 +2629,7 @@ static Instruction *tryToMoveFreeBeforeNullTest(CallInst &FI,
// If there are more than 2 instructions, check that they are noops
// i.e., they won't hurt the performance of the generated code.
if (FreeInstrBB->size() != 2) {
- for (const Instruction &Inst : *FreeInstrBB) {
+ for (const Instruction &Inst : FreeInstrBB->instructionsWithoutDebug()) {
if (&Inst == &FI || &Inst == FreeInstrBBTerminator)
continue;
auto *Cast = dyn_cast<CastInst>(&Inst);
@@ -2579,60 +2691,108 @@ Instruction *InstCombiner::visitFree(CallInst &FI) {
// if (foo) free(foo);
// into
// free(foo);
- if (MinimizeSize)
- if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
- return I;
+ //
+ // Note that we can only do this for 'free' and not for any flavor of
+ // 'operator delete'; there is no 'operator delete' symbol for which we are
+ // permitted to invent a call, even if we're passing in a null pointer.
+ if (MinimizeSize) {
+ LibFunc Func;
+ if (TLI.getLibFunc(FI, Func) && TLI.has(Func) && Func == LibFunc_free)
+ if (Instruction *I = tryToMoveFreeBeforeNullTest(FI, DL))
+ return I;
+ }
return nullptr;
}
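For reference, this is the shape of code tryToMoveFreeBeforeNullTest is aimed at when optimizing for size (an illustrative sketch; block and value names are invented):

  define void @maybe_free(i8* %p) minsize {
  entry:
    %isnull = icmp eq i8* %p, null
    br i1 %isnull, label %exit, label %dofree
  dofree:
    call void @free(i8* %p)
    br label %exit
  exit:
    ret void
  }
  declare void @free(i8*)

Since free of a null pointer is a no-op, the null check can be dropped and free called unconditionally; with this patch the transform only fires when TLI confirms the callee really is the C library's free, never an 'operator delete' flavor.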
+static bool isMustTailCall(Value *V) {
+ if (auto *CI = dyn_cast<CallInst>(V))
+ return CI->isMustTailCall();
+ return false;
+}
+
Instruction *InstCombiner::visitReturnInst(ReturnInst &RI) {
if (RI.getNumOperands() == 0) // ret void
return nullptr;
Value *ResultOp = RI.getOperand(0);
Type *VTy = ResultOp->getType();
- if (!VTy->isIntegerTy())
+ if (!VTy->isIntegerTy() || isa<Constant>(ResultOp))
+ return nullptr;
+
+ // Don't replace result of musttail calls.
+ if (isMustTailCall(ResultOp))
return nullptr;
// There might be assume intrinsics dominating this return that completely
// determine the value. If so, constant fold it.
KnownBits Known = computeKnownBits(ResultOp, 0, &RI);
if (Known.isConstant())
- RI.setOperand(0, Constant::getIntegerValue(VTy, Known.getConstant()));
+ return replaceOperand(RI, 0,
+ Constant::getIntegerValue(VTy, Known.getConstant()));
+
+ return nullptr;
+}
+
+Instruction *InstCombiner::visitUnconditionalBranchInst(BranchInst &BI) {
+ assert(BI.isUnconditional() && "Only for unconditional branches.");
+
+ // If this store is the second-to-last instruction in the basic block
+ // (excluding debug info and bitcasts of pointers) and if the block ends with
+ // an unconditional branch, try to move the store to the successor block.
+
+ auto GetLastSinkableStore = [](BasicBlock::iterator BBI) {
+ auto IsNoopInstrForStoreMerging = [](BasicBlock::iterator BBI) {
+ return isa<DbgInfoIntrinsic>(BBI) ||
+ (isa<BitCastInst>(BBI) && BBI->getType()->isPointerTy());
+ };
+
+ BasicBlock::iterator FirstInstr = BBI->getParent()->begin();
+ do {
+ if (BBI != FirstInstr)
+ --BBI;
+ } while (BBI != FirstInstr && IsNoopInstrForStoreMerging(BBI));
+
+ return dyn_cast<StoreInst>(BBI);
+ };
+
+ if (StoreInst *SI = GetLastSinkableStore(BasicBlock::iterator(BI)))
+ if (mergeStoreIntoSuccessor(*SI))
+ return &BI;
return nullptr;
}
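A sketch of the situation the new unconditional-branch visitor looks for (block and value names invented; whether the stores actually merge is decided by mergeStoreIntoSuccessor, which is not part of this hunk):

  define void @merge_stores(i1 %cond, i32* %p) {
  entry:
    br i1 %cond, label %then, label %else
  then:
    store i32 1, i32* %p
    br label %end
  else:
    store i32 2, i32* %p
    br label %end
  end:
    ret void
  }

Each predecessor of %end ends in a store followed by an unconditional branch, so the two stores can be replaced by a phi of 1 and 2 in %end and a single store to %p.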
Instruction *InstCombiner::visitBranchInst(BranchInst &BI) {
+ if (BI.isUnconditional())
+ return visitUnconditionalBranchInst(BI);
+
// Change br (not X), label True, label False to: br X, label False, True
Value *X = nullptr;
if (match(&BI, m_Br(m_Not(m_Value(X)), m_BasicBlock(), m_BasicBlock())) &&
!isa<Constant>(X)) {
// Swap Destinations and condition...
- BI.setCondition(X);
BI.swapSuccessors();
- return &BI;
+ return replaceOperand(BI, 0, X);
}
// If the condition is irrelevant, remove the use so that other
// transforms on the condition become more effective.
- if (BI.isConditional() && !isa<ConstantInt>(BI.getCondition()) &&
- BI.getSuccessor(0) == BI.getSuccessor(1)) {
- BI.setCondition(ConstantInt::getFalse(BI.getCondition()->getType()));
- return &BI;
- }
+ if (!isa<ConstantInt>(BI.getCondition()) &&
+ BI.getSuccessor(0) == BI.getSuccessor(1))
+ return replaceOperand(
+ BI, 0, ConstantInt::getFalse(BI.getCondition()->getType()));
- // Canonicalize, for example, icmp_ne -> icmp_eq or fcmp_one -> fcmp_oeq.
+ // Canonicalize, for example, fcmp_one -> fcmp_oeq.
CmpInst::Predicate Pred;
- if (match(&BI, m_Br(m_OneUse(m_Cmp(Pred, m_Value(), m_Value())),
+ if (match(&BI, m_Br(m_OneUse(m_FCmp(Pred, m_Value(), m_Value())),
m_BasicBlock(), m_BasicBlock())) &&
!isCanonicalPredicate(Pred)) {
// Swap destinations and condition.
CmpInst *Cond = cast<CmpInst>(BI.getCondition());
Cond->setPredicate(CmpInst::getInversePredicate(Pred));
BI.swapSuccessors();
- Worklist.Add(Cond);
+ Worklist.push(Cond);
return &BI;
}
@@ -2651,8 +2811,7 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
"Result of expression should be constant");
Case.setValue(cast<ConstantInt>(NewCase));
}
- SI.setCondition(Op0);
- return &SI;
+ return replaceOperand(SI, 0, Op0);
}
KnownBits Known = computeKnownBits(Cond, 0, &SI);
@@ -2679,13 +2838,12 @@ Instruction *InstCombiner::visitSwitchInst(SwitchInst &SI) {
IntegerType *Ty = IntegerType::get(SI.getContext(), NewWidth);
Builder.SetInsertPoint(&SI);
Value *NewCond = Builder.CreateTrunc(Cond, Ty, "trunc");
- SI.setCondition(NewCond);
for (auto Case : SI.cases()) {
APInt TruncatedCase = Case.getCaseValue()->getValue().trunc(NewWidth);
Case.setValue(ConstantInt::get(SI.getContext(), TruncatedCase));
}
- return &SI;
+ return replaceOperand(SI, 0, NewCond);
}
return nullptr;
@@ -3175,7 +3333,7 @@ Instruction *InstCombiner::visitFreeze(FreezeInst &I) {
/// instruction past all of the instructions between it and the end of its
/// block.
static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
- assert(I->hasOneUse() && "Invariants didn't hold!");
+ assert(I->getSingleUndroppableUse() && "Invariants didn't hold!");
BasicBlock *SrcBlock = I->getParent();
// Cannot move control-flow-involving, volatile loads, vaarg, etc.
@@ -3202,12 +3360,26 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
// We can only sink load instructions if there is nothing between the load and
// the end of block that could change the value.
if (I->mayReadFromMemory()) {
+ // We don't want to do any sophisticated alias analysis, so we only check
+ // the instructions after I in I's parent block if we try to sink to its
+ // successor block.
+ if (DestBlock->getUniquePredecessor() != I->getParent())
+ return false;
for (BasicBlock::iterator Scan = I->getIterator(),
E = I->getParent()->end();
Scan != E; ++Scan)
if (Scan->mayWriteToMemory())
return false;
}
+
+ I->dropDroppableUses([DestBlock](const Use *U) {
+ if (auto *I = dyn_cast<Instruction>(U->getUser()))
+ return I->getParent() != DestBlock;
+ return true;
+ });
+ /// FIXME: We could remove droppable uses that are not dominated by
+ /// the new position.
+
BasicBlock::iterator InsertPos = DestBlock->getFirstInsertionPt();
I->moveBefore(&*InsertPos);
++NumSunkInst;
@@ -3219,60 +3391,70 @@ static bool TryToSinkInstruction(Instruction *I, BasicBlock *DestBlock) {
// here, but that computation has been sunk.
SmallVector<DbgVariableIntrinsic *, 2> DbgUsers;
findDbgUsers(DbgUsers, I);
- for (auto *DII : reverse(DbgUsers)) {
- if (DII->getParent() == SrcBlock) {
- if (isa<DbgDeclareInst>(DII)) {
- // A dbg.declare instruction should not be cloned, since there can only be
- // one per variable fragment. It should be left in the original place since
- // sunk instruction is not an alloca(otherwise we could not be here).
- // But we need to update arguments of dbg.declare instruction, so that it
- // would not point into sunk instruction.
- if (!isa<CastInst>(I))
- continue; // dbg.declare points at something it shouldn't
-
- DII->setOperand(
- 0, MetadataAsValue::get(I->getContext(),
- ValueAsMetadata::get(I->getOperand(0))));
- continue;
- }
- // dbg.value is in the same basic block as the sunk inst, see if we can
- // salvage it. Clone a new copy of the instruction: on success we need
- // both salvaged and unsalvaged copies.
- SmallVector<DbgVariableIntrinsic *, 1> TmpUser{
- cast<DbgVariableIntrinsic>(DII->clone())};
-
- if (!salvageDebugInfoForDbgValues(*I, TmpUser)) {
- // We are unable to salvage: sink the cloned dbg.value, and mark the
- // original as undef, terminating any earlier variable location.
- LLVM_DEBUG(dbgs() << "SINK: " << *DII << '\n');
- TmpUser[0]->insertBefore(&*InsertPos);
- Value *Undef = UndefValue::get(I->getType());
- DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
- ValueAsMetadata::get(Undef)));
- } else {
- // We successfully salvaged: place the salvaged dbg.value in the
- // original location, and move the unmodified dbg.value to sink with
- // the sunk inst.
- TmpUser[0]->insertBefore(DII);
- DII->moveBefore(&*InsertPos);
- }
+ // Update the arguments of a dbg.declare instruction, so that it
+ // does not point into a sunk instruction.
+ auto updateDbgDeclare = [&I](DbgVariableIntrinsic *DII) {
+ if (!isa<DbgDeclareInst>(DII))
+ return false;
+
+ if (isa<CastInst>(I))
+ DII->setOperand(
+ 0, MetadataAsValue::get(I->getContext(),
+ ValueAsMetadata::get(I->getOperand(0))));
+ return true;
+ };
+
+ SmallVector<DbgVariableIntrinsic *, 2> DIIClones;
+ for (auto User : DbgUsers) {
+ // A dbg.declare instruction should not be cloned, since there can only be
+ // one per variable fragment. It should be left in the original place
+ // because the sunk instruction is not an alloca (otherwise we could not be
+ // here).
+ if (User->getParent() != SrcBlock || updateDbgDeclare(User))
+ continue;
+
+ DIIClones.emplace_back(cast<DbgVariableIntrinsic>(User->clone()));
+ LLVM_DEBUG(dbgs() << "CLONE: " << *DIIClones.back() << '\n');
+ }
+
+ // Perform salvaging without the clones, then sink the clones.
+ if (!DIIClones.empty()) {
+ salvageDebugInfoForDbgValues(*I, DbgUsers);
+ for (auto &DIIClone : DIIClones) {
+ DIIClone->insertBefore(&*InsertPos);
+ LLVM_DEBUG(dbgs() << "SINK: " << *DIIClone << '\n');
}
}
+
return true;
}
bool InstCombiner::run() {
while (!Worklist.isEmpty()) {
- Instruction *I = Worklist.RemoveOne();
+ // Walk deferred instructions in reverse order, and push them to the
+ // worklist, which means they'll end up popped from the worklist in-order.
+ while (Instruction *I = Worklist.popDeferred()) {
+ // Check to see if we can DCE the instruction. We do this already here to
+ // reduce the number of uses and thus allow other folds to trigger.
+ // Note that eraseInstFromFunction() may push additional instructions on
+ // the deferred worklist, so this will DCE whole instruction chains.
+ if (isInstructionTriviallyDead(I, &TLI)) {
+ eraseInstFromFunction(*I);
+ ++NumDeadInst;
+ continue;
+ }
+
+ Worklist.push(I);
+ }
+
+ Instruction *I = Worklist.removeOne();
if (I == nullptr) continue; // skip null values.
// Check to see if we can DCE the instruction.
if (isInstructionTriviallyDead(I, &TLI)) {
- LLVM_DEBUG(dbgs() << "IC: DCE: " << *I << '\n');
eraseInstFromFunction(*I);
++NumDeadInst;
- MadeIRChange = true;
continue;
}
@@ -3296,65 +3478,51 @@ bool InstCombiner::run() {
}
}
- // In general, it is possible for computeKnownBits to determine all bits in
- // a value even when the operands are not all constants.
- Type *Ty = I->getType();
- if (ExpensiveCombines && !I->use_empty() && Ty->isIntOrIntVectorTy()) {
- KnownBits Known = computeKnownBits(I, /*Depth*/0, I);
- if (Known.isConstant()) {
- Constant *C = ConstantInt::get(Ty, Known.getConstant());
- LLVM_DEBUG(dbgs() << "IC: ConstFold (all bits known) to: " << *C
- << " from: " << *I << '\n');
-
- // Add operands to the worklist.
- replaceInstUsesWith(*I, C);
- ++NumConstProp;
- if (isInstructionTriviallyDead(I, &TLI))
- eraseInstFromFunction(*I);
- MadeIRChange = true;
- continue;
- }
- }
-
- // See if we can trivially sink this instruction to a successor basic block.
- if (EnableCodeSinking && I->hasOneUse()) {
- BasicBlock *BB = I->getParent();
- Instruction *UserInst = cast<Instruction>(*I->user_begin());
- BasicBlock *UserParent;
-
- // Get the block the use occurs in.
- if (PHINode *PN = dyn_cast<PHINode>(UserInst))
- UserParent = PN->getIncomingBlock(*I->use_begin());
- else
- UserParent = UserInst->getParent();
-
- if (UserParent != BB) {
- bool UserIsSuccessor = false;
- // See if the user is one of our successors.
- for (succ_iterator SI = succ_begin(BB), E = succ_end(BB); SI != E; ++SI)
- if (*SI == UserParent) {
- UserIsSuccessor = true;
- break;
+ // See if we can trivially sink this instruction to its user if we can
+ // prove that the successor is not executed more frequently than our block.
+ if (EnableCodeSinking)
+ if (Use *SingleUse = I->getSingleUndroppableUse()) {
+ BasicBlock *BB = I->getParent();
+ Instruction *UserInst = cast<Instruction>(SingleUse->getUser());
+ BasicBlock *UserParent;
+
+ // Get the block the use occurs in.
+ if (PHINode *PN = dyn_cast<PHINode>(UserInst))
+ UserParent = PN->getIncomingBlock(*SingleUse);
+ else
+ UserParent = UserInst->getParent();
+
+ if (UserParent != BB) {
+ // See if the user is one of our successors that has only one
+ // predecessor, so that we don't have to split the critical edge.
+ bool ShouldSink = UserParent->getUniquePredecessor() == BB;
+ // Another option where we can sink is a block that ends with a
+ // terminator that does not pass control to another block (such as
+ // return or unreachable). In this case:
+ // - I dominates the User (by SSA form);
+ // - the User will be executed at most once.
+ // So sinking I down to User is always profitable or neutral.
+ if (!ShouldSink) {
+ auto *Term = UserParent->getTerminator();
+ ShouldSink = isa<ReturnInst>(Term) || isa<UnreachableInst>(Term);
}
-
- // If the user is one of our immediate successors, and if that successor
- // only has us as a predecessors (we'd have to split the critical edge
- // otherwise), we can keep going.
- if (UserIsSuccessor && UserParent->getUniquePredecessor()) {
- // Okay, the CFG is simple enough, try to sink this instruction.
- if (TryToSinkInstruction(I, UserParent)) {
- LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
- MadeIRChange = true;
- // We'll add uses of the sunk instruction below, but since sinking
- // can expose opportunities for it's *operands* add them to the
- // worklist
- for (Use &U : I->operands())
- if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
- Worklist.Add(OpI);
+ if (ShouldSink) {
+ assert(DT.dominates(BB, UserParent) &&
+ "Dominance relation broken?");
+ // Okay, the CFG is simple enough, try to sink this instruction.
+ if (TryToSinkInstruction(I, UserParent)) {
+ LLVM_DEBUG(dbgs() << "IC: Sink: " << *I << '\n');
+ MadeIRChange = true;
+ // We'll add uses of the sunk instruction below, but since
+ // sinking can expose opportunities for its *operands*, add them
+ // to the worklist as well.
+ for (Use &U : I->operands())
+ if (Instruction *OpI = dyn_cast<Instruction>(U.get()))
+ Worklist.push(OpI);
+ }
}
}
}
- }
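As an editorial illustration (not part of the diff itself), the pattern the relaxed sinking rule above targets looks roughly like this at the source level; the function and constants below are invented for the sketch:

// Hypothetical sketch: `t` has a single use, and that use lives in a block
// that ends in a return. Sinking the multiply/add into the early-return
// block means it is never evaluated on the common path where `rare` is false.
int f(int x, bool rare) {
  int t = x * 37 + 11; // single (undroppable) use is in the early-return block
  if (rare)
    return t;          // user block ends in a ReturnInst, so sinking is safe
  return x;
}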
// Now that we have an instruction, try combining it to simplify it.
Builder.SetInsertPoint(I);
@@ -3393,8 +3561,8 @@ bool InstCombiner::run() {
InstParent->getInstList().insert(InsertPos, Result);
// Push the new instruction and any users onto the worklist.
- Worklist.AddUsersToWorkList(*Result);
- Worklist.Add(Result);
+ Worklist.pushUsersToWorkList(*Result);
+ Worklist.push(Result);
eraseInstFromFunction(*I);
} else {
@@ -3406,39 +3574,39 @@ bool InstCombiner::run() {
if (isInstructionTriviallyDead(I, &TLI)) {
eraseInstFromFunction(*I);
} else {
- Worklist.AddUsersToWorkList(*I);
- Worklist.Add(I);
+ Worklist.pushUsersToWorkList(*I);
+ Worklist.push(I);
}
}
MadeIRChange = true;
}
}
- Worklist.Zap();
+ Worklist.zap();
return MadeIRChange;
}
-/// Walk the function in depth-first order, adding all reachable code to the
-/// worklist.
+/// Populate the IC worklist from a function, by walking it in depth-first
+/// order and adding all reachable code to the worklist.
///
/// This has a couple of tricks to make the code faster and more powerful. In
/// particular, we constant fold and DCE instructions as we go, to avoid adding
/// them to the worklist (this significantly speeds up instcombine on code where
/// many instructions are dead or constant). Additionally, if we find a branch
/// whose condition is a known constant, we only visit the reachable successors.
-static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
- SmallPtrSetImpl<BasicBlock *> &Visited,
- InstCombineWorklist &ICWorklist,
- const TargetLibraryInfo *TLI) {
+static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
+ const TargetLibraryInfo *TLI,
+ InstCombineWorklist &ICWorklist) {
bool MadeIRChange = false;
+ SmallPtrSet<BasicBlock *, 32> Visited;
SmallVector<BasicBlock*, 256> Worklist;
- Worklist.push_back(BB);
+ Worklist.push_back(&F.front());
SmallVector<Instruction*, 128> InstrsForInstCombineWorklist;
DenseMap<Constant *, Constant *> FoldedConstants;
do {
- BB = Worklist.pop_back_val();
+ BasicBlock *BB = Worklist.pop_back_val();
// We have now visited this block! If we've already been here, ignore it.
if (!Visited.insert(BB).second)
@@ -3447,16 +3615,6 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E; ) {
Instruction *Inst = &*BBI++;
- // DCE instruction if trivially dead.
- if (isInstructionTriviallyDead(Inst, TLI)) {
- ++NumDeadInst;
- LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
- salvageDebugInfoOrMarkUndef(*Inst);
- Inst->eraseFromParent();
- MadeIRChange = true;
- continue;
- }
-
// ConstantProp instruction if trivially constant.
if (!Inst->use_empty() &&
(Inst->getNumOperands() == 0 || isa<Constant>(Inst->getOperand(0))))
@@ -3480,8 +3638,6 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
Constant *&FoldRes = FoldedConstants[C];
if (!FoldRes)
FoldRes = ConstantFoldConstant(C, DL, TLI);
- if (!FoldRes)
- FoldRes = C;
if (FoldRes != C) {
LLVM_DEBUG(dbgs() << "IC: ConstFold operand of: " << *Inst
@@ -3519,36 +3675,9 @@ static bool AddReachableCodeToWorklist(BasicBlock *BB, const DataLayout &DL,
Worklist.push_back(SuccBB);
} while (!Worklist.empty());
- // Once we've found all of the instructions to add to instcombine's worklist,
- // add them in reverse order. This way instcombine will visit from the top
- // of the function down. This jives well with the way that it adds all uses
- // of instructions to the worklist after doing a transformation, thus avoiding
- // some N^2 behavior in pathological cases.
- ICWorklist.AddInitialGroup(InstrsForInstCombineWorklist);
-
- return MadeIRChange;
-}
-
-/// Populate the IC worklist from a function, and prune any dead basic
-/// blocks discovered in the process.
-///
-/// This also does basic constant propagation and other forward fixing to make
-/// the combiner itself run much faster.
-static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
- TargetLibraryInfo *TLI,
- InstCombineWorklist &ICWorklist) {
- bool MadeIRChange = false;
-
- // Do a depth-first traversal of the function, populate the worklist with
- // the reachable instructions. Ignore blocks that are not reachable. Keep
- // track of which blocks we visit.
- SmallPtrSet<BasicBlock *, 32> Visited;
- MadeIRChange |=
- AddReachableCodeToWorklist(&F.front(), DL, Visited, ICWorklist, TLI);
-
- // Do a quick scan over the function. If we find any blocks that are
- // unreachable, remove any instructions inside of them. This prevents
- // the instcombine code from having to deal with some bad special cases.
+ // Remove instructions inside unreachable blocks. This prevents the
+ // instcombine code from having to deal with some bad special cases, and
+ // reduces use counts of instructions.
for (BasicBlock &BB : F) {
if (Visited.count(&BB))
continue;
@@ -3558,6 +3687,27 @@ static bool prepareICWorklistFromFunction(Function &F, const DataLayout &DL,
NumDeadInst += NumDeadInstInBB;
}
+ // Once we've found all of the instructions to add to instcombine's worklist,
+ // add them in reverse order. This way instcombine will visit from the top
+ // of the function down. This jives well with the way that it adds all uses
+ // of instructions to the worklist after doing a transformation, thus avoiding
+ // some N^2 behavior in pathological cases.
+ ICWorklist.reserve(InstrsForInstCombineWorklist.size());
+ for (Instruction *Inst : reverse(InstrsForInstCombineWorklist)) {
+ // DCE instruction if trivially dead. As we iterate in reverse program
+ // order here, we will clean up whole chains of dead instructions.
+ if (isInstructionTriviallyDead(Inst, TLI)) {
+ ++NumDeadInst;
+ LLVM_DEBUG(dbgs() << "IC: DCE: " << *Inst << '\n');
+ salvageDebugInfo(*Inst);
+ Inst->eraseFromParent();
+ MadeIRChange = true;
+ continue;
+ }
+
+ ICWorklist.push(Inst);
+ }
+
return MadeIRChange;
}
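To see why pushing the collected instructions in reverse program order lets whole dead chains collapse in one pass, here is a small standalone sketch (an editorial illustration, not part of the diff); plain use counts stand in for LLVM's real use lists, and instruction i is assumed to be used only by instruction i+1:

#include <cstdio>
#include <vector>

int main() {
  // Four chained "instructions"; the final value is unused, so the whole
  // chain is dead. useCount[i] = remaining users of instruction i.
  std::vector<int> useCount = {1, 1, 1, 0};
  int numDead = 0;
  // Walk in reverse program order, mirroring the loop above: erasing a dead
  // user drops the use count of its operand, which then dies in turn.
  for (int i = (int)useCount.size() - 1; i >= 0; --i) {
    if (useCount[i] == 0) {   // stand-in for isInstructionTriviallyDead
      ++numDead;
      if (i > 0)
        --useCount[i - 1];
    }
  }
  std::printf("erased %d of %zu\n", numDead, useCount.size()); // erased 4 of 4
  return 0;
}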
@@ -3565,10 +3715,8 @@ static bool combineInstructionsOverFunction(
Function &F, InstCombineWorklist &Worklist, AliasAnalysis *AA,
AssumptionCache &AC, TargetLibraryInfo &TLI, DominatorTree &DT,
OptimizationRemarkEmitter &ORE, BlockFrequencyInfo *BFI,
- ProfileSummaryInfo *PSI, bool ExpensiveCombines, unsigned MaxIterations,
- LoopInfo *LI) {
+ ProfileSummaryInfo *PSI, unsigned MaxIterations, LoopInfo *LI) {
auto &DL = F.getParent()->getDataLayout();
- ExpensiveCombines |= EnableExpensiveCombines;
MaxIterations = std::min(MaxIterations, LimitMaxIterations.getValue());
/// Builder - This is an IRBuilder that automatically inserts new
@@ -3576,7 +3724,7 @@ static bool combineInstructionsOverFunction(
IRBuilder<TargetFolder, IRBuilderCallbackInserter> Builder(
F.getContext(), TargetFolder(DL),
IRBuilderCallbackInserter([&Worklist, &AC](Instruction *I) {
- Worklist.Add(I);
+ Worklist.add(I);
if (match(I, m_Intrinsic<Intrinsic::assume>()))
AC.registerAssumption(cast<CallInst>(I));
}));
@@ -3610,7 +3758,7 @@ static bool combineInstructionsOverFunction(
MadeIRChange |= prepareICWorklistFromFunction(F, DL, &TLI, Worklist);
- InstCombiner IC(Worklist, Builder, F.hasMinSize(), ExpensiveCombines, AA,
+ InstCombiner IC(Worklist, Builder, F.hasMinSize(), AA,
AC, TLI, DT, ORE, BFI, PSI, DL, LI);
IC.MaxArraySizeForCombine = MaxArraySize;
@@ -3623,11 +3771,10 @@ static bool combineInstructionsOverFunction(
return MadeIRChange;
}
-InstCombinePass::InstCombinePass(bool ExpensiveCombines)
- : ExpensiveCombines(ExpensiveCombines), MaxIterations(LimitMaxIterations) {}
+InstCombinePass::InstCombinePass() : MaxIterations(LimitMaxIterations) {}
-InstCombinePass::InstCombinePass(bool ExpensiveCombines, unsigned MaxIterations)
- : ExpensiveCombines(ExpensiveCombines), MaxIterations(MaxIterations) {}
+InstCombinePass::InstCombinePass(unsigned MaxIterations)
+ : MaxIterations(MaxIterations) {}
PreservedAnalyses InstCombinePass::run(Function &F,
FunctionAnalysisManager &AM) {
@@ -3639,16 +3786,14 @@ PreservedAnalyses InstCombinePass::run(Function &F,
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto *AA = &AM.getResult<AAManager>(F);
- const ModuleAnalysisManager &MAM =
- AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
- MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
if (!combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
- PSI, ExpensiveCombines, MaxIterations,
- LI))
+ PSI, MaxIterations, LI))
// No changes, all analyses are preserved.
return PreservedAnalyses::all();
@@ -3698,22 +3843,18 @@ bool InstructionCombiningPass::runOnFunction(Function &F) {
nullptr;
return combineInstructionsOverFunction(F, Worklist, AA, AC, TLI, DT, ORE, BFI,
- PSI, ExpensiveCombines, MaxIterations,
- LI);
+ PSI, MaxIterations, LI);
}
char InstructionCombiningPass::ID = 0;
-InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines)
- : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines),
- MaxIterations(InstCombineDefaultMaxIterations) {
+InstructionCombiningPass::InstructionCombiningPass()
+ : FunctionPass(ID), MaxIterations(InstCombineDefaultMaxIterations) {
initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
}
-InstructionCombiningPass::InstructionCombiningPass(bool ExpensiveCombines,
- unsigned MaxIterations)
- : FunctionPass(ID), ExpensiveCombines(ExpensiveCombines),
- MaxIterations(MaxIterations) {
+InstructionCombiningPass::InstructionCombiningPass(unsigned MaxIterations)
+ : FunctionPass(ID), MaxIterations(MaxIterations) {
initializeInstructionCombiningPassPass(*PassRegistry::getPassRegistry());
}
@@ -3739,13 +3880,12 @@ void LLVMInitializeInstCombine(LLVMPassRegistryRef R) {
initializeInstructionCombiningPassPass(*unwrap(R));
}
-FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines) {
- return new InstructionCombiningPass(ExpensiveCombines);
+FunctionPass *llvm::createInstructionCombiningPass() {
+ return new InstructionCombiningPass();
}
-FunctionPass *llvm::createInstructionCombiningPass(bool ExpensiveCombines,
- unsigned MaxIterations) {
- return new InstructionCombiningPass(ExpensiveCombines, MaxIterations);
+FunctionPass *llvm::createInstructionCombiningPass(unsigned MaxIterations) {
+ return new InstructionCombiningPass(MaxIterations);
}
void LLVMAddInstructionCombiningPass(LLVMPassManagerRef PM) {
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 79c119489a65..ee09a4d9db7e 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -10,6 +10,8 @@
// Details of the algorithm:
// https://github.com/google/sanitizers/wiki/AddressSanitizerAlgorithm
//
+// FIXME: This sanitizer does not yet handle scalable vectors
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Instrumentation/AddressSanitizer.h"
@@ -30,7 +32,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -70,6 +71,7 @@
#include "llvm/Support/ScopedPrinter.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
#include "llvm/Transforms/Utils/ASanStackFrameLayout.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
@@ -213,6 +215,11 @@ static cl::opt<bool> ClInstrumentAtomics(
cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
cl::init(true));
+static cl::opt<bool>
+ ClInstrumentByval("asan-instrument-byval",
+ cl::desc("instrument byval call arguments"), cl::Hidden,
+ cl::init(true));
+
static cl::opt<bool> ClAlwaysSlowPath(
"asan-always-slow-path",
cl::desc("use instrumentation with slow path for all accesses"), cl::Hidden,
@@ -532,7 +539,7 @@ static ShadowMapping getShadowMapping(Triple &TargetTriple, int LongSize,
return Mapping;
}
-static size_t RedzoneSizeForScale(int MappingScale) {
+static uint64_t getRedzoneSizeForScale(int MappingScale) {
// Redzone used for stack and globals is at least 32 bytes.
// For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively.
return std::max(32U, 1U << MappingScale);
@@ -584,11 +591,10 @@ struct AddressSanitizer {
AddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
bool CompileKernel = false, bool Recover = false,
bool UseAfterScope = false)
- : UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) {
- this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
- this->CompileKernel =
- ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel;
-
+ : CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
+ : CompileKernel),
+ Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
+ UseAfterScope(UseAfterScope || ClUseAfterScope), GlobalsMD(*GlobalsMD) {
C = &(M.getContext());
LongSize = M.getDataLayout().getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
@@ -613,16 +619,13 @@ struct AddressSanitizer {
/// Check if we want (and can) handle this alloca.
bool isInterestingAlloca(const AllocaInst &AI);
- /// If it is an interesting memory access, return the PointerOperand
- /// and set IsWrite/Alignment. Otherwise return nullptr.
- /// MaybeMask is an output parameter for the mask Value, if we're looking at a
- /// masked load/store.
- Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
- uint64_t *TypeSize, unsigned *Alignment,
- Value **MaybeMask = nullptr);
+ bool ignoreAccess(Value *Ptr);
+ void getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
- void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis, Instruction *I,
- bool UseCalls, const DataLayout &DL);
+ void instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
+ InterestingMemoryOperand &O, bool UseCalls,
+ const DataLayout &DL);
void instrumentPointerComparisonOrSubtraction(Instruction *I);
void instrumentAddress(Instruction *OrigIns, Instruction *InsertBefore,
Value *Addr, uint32_t TypeSize, bool IsWrite,
@@ -639,9 +642,10 @@ struct AddressSanitizer {
Value *SizeArgument, uint32_t Exp);
void instrumentMemIntrinsic(MemIntrinsic *MI);
Value *memToShadow(Value *Shadow, IRBuilder<> &IRB);
+ bool suppressInstrumentationSiteForDebug(int &Instrumented);
bool instrumentFunction(Function &F, const TargetLibraryInfo *TLI);
bool maybeInsertAsanInitAtFunctionEntry(Function &F);
- void maybeInsertDynamicShadowAtFunctionEntry(Function &F);
+ bool maybeInsertDynamicShadowAtFunctionEntry(Function &F);
void markEscapedLocalAllocas(Function &F);
private:
@@ -691,7 +695,6 @@ private:
FunctionCallee AsanMemoryAccessCallbackSized[2][2];
FunctionCallee AsanMemmove, AsanMemcpy, AsanMemset;
- InlineAsm *EmptyAsm;
Value *LocalDynamicShadow = nullptr;
const GlobalsMetadata &GlobalsMD;
DenseMap<const AllocaInst *, bool> ProcessedAllocas;
@@ -739,7 +742,11 @@ public:
ModuleAddressSanitizer(Module &M, const GlobalsMetadata *GlobalsMD,
bool CompileKernel = false, bool Recover = false,
bool UseGlobalsGC = true, bool UseOdrIndicator = false)
- : GlobalsMD(*GlobalsMD), UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC),
+ : GlobalsMD(*GlobalsMD),
+ CompileKernel(ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan
+ : CompileKernel),
+ Recover(ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover),
+ UseGlobalsGC(UseGlobalsGC && ClUseGlobalsGC && !this->CompileKernel),
// Enable aliases as they should have no downside with ODR indicators.
UsePrivateAlias(UseOdrIndicator || ClUsePrivateAlias),
UseOdrIndicator(UseOdrIndicator || ClUseOdrIndicator),
@@ -750,11 +757,7 @@ public:
// argument is designed as workaround. Therefore, disable both
// ClWithComdat and ClUseGlobalsGC unless the frontend says it's ok to
// do globals-gc.
- UseCtorComdat(UseGlobalsGC && ClWithComdat) {
- this->Recover = ClRecover.getNumOccurrences() > 0 ? ClRecover : Recover;
- this->CompileKernel =
- ClEnableKasan.getNumOccurrences() > 0 ? ClEnableKasan : CompileKernel;
-
+ UseCtorComdat(UseGlobalsGC && ClWithComdat && !this->CompileKernel) {
C = &(M.getContext());
int LongSize = M.getDataLayout().getPointerSizeInBits();
IntptrTy = Type::getIntNTy(*C, LongSize);
@@ -787,16 +790,18 @@ private:
StringRef OriginalName);
void SetComdatForGlobalMetadata(GlobalVariable *G, GlobalVariable *Metadata,
StringRef InternalSuffix);
- IRBuilder<> CreateAsanModuleDtor(Module &M);
+ Instruction *CreateAsanModuleDtor(Module &M);
- bool ShouldInstrumentGlobal(GlobalVariable *G);
+ bool canInstrumentAliasedGlobal(const GlobalAlias &GA) const;
+ bool shouldInstrumentGlobal(GlobalVariable *G) const;
bool ShouldUseMachOGlobalsSection() const;
StringRef getGlobalMetadataSection() const;
void poisonOneInitializer(Function &GlobalInit, GlobalValue *ModuleName);
void createInitializerPoisonCalls(Module &M, GlobalValue *ModuleName);
- size_t MinRedzoneSizeForGlobal() const {
- return RedzoneSizeForScale(Mapping.Scale);
+ uint64_t getMinRedzoneSizeForGlobal() const {
+ return getRedzoneSizeForScale(Mapping.Scale);
}
+ uint64_t getRedzoneSizeForGlobal(uint64_t SizeInBytes) const;
int GetAsanVersion(const Module &M) const;
const GlobalsMetadata &GlobalsMD;
@@ -907,16 +912,14 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
using AllocaForValueMapTy = DenseMap<Value *, AllocaInst *>;
AllocaForValueMapTy AllocaForValue;
- bool HasNonEmptyInlineAsm = false;
+ bool HasInlineAsm = false;
bool HasReturnsTwiceCall = false;
- std::unique_ptr<CallInst> EmptyInlineAsm;
FunctionStackPoisoner(Function &F, AddressSanitizer &ASan)
: F(F), ASan(ASan), DIB(*F.getParent(), /*AllowUnresolved*/ false),
C(ASan.C), IntptrTy(ASan.IntptrTy),
IntptrPtrTy(PointerType::get(IntptrTy, 0)), Mapping(ASan.Mapping),
- StackAlignment(1 << Mapping.Scale),
- EmptyInlineAsm(CallInst::Create(ASan.EmptyAsm)) {}
+ StackAlignment(1 << Mapping.Scale) {}
bool runOnFunction() {
if (!ClStack) return false;
@@ -1076,12 +1079,9 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
DynamicAllocaPoisonCallVec.push_back(APC);
}
- void visitCallSite(CallSite CS) {
- Instruction *I = CS.getInstruction();
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- HasNonEmptyInlineAsm |= CI->isInlineAsm() &&
- !CI->isIdenticalTo(EmptyInlineAsm.get()) &&
- I != ASan.LocalDynamicShadow;
+ void visitCallBase(CallBase &CB) {
+ if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
+ HasInlineAsm |= CI->isInlineAsm() && &CB != ASan.LocalDynamicShadow;
HasReturnsTwiceCall |= CI->canReturnTwice();
}
}
@@ -1147,9 +1147,9 @@ GlobalsMetadata::GlobalsMetadata(Module &M) {
E.Name = Name->getString();
ConstantInt *IsDynInit = mdconst::extract<ConstantInt>(MDN->getOperand(3));
E.IsDynInit |= IsDynInit->isOne();
- ConstantInt *IsBlacklisted =
+ ConstantInt *IsExcluded =
mdconst::extract<ConstantInt>(MDN->getOperand(4));
- E.IsBlacklisted |= IsBlacklisted->isOne();
+ E.IsExcluded |= IsExcluded->isOne();
}
}
@@ -1168,9 +1168,8 @@ AddressSanitizerPass::AddressSanitizerPass(bool CompileKernel, bool Recover,
PreservedAnalyses AddressSanitizerPass::run(Function &F,
AnalysisManager<Function> &AM) {
auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto &MAM = MAMProxy.getManager();
Module &M = *F.getParent();
- if (auto *R = MAM.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
+ if (auto *R = MAMProxy.getCachedResult<ASanGlobalsMetadataAnalysis>(M)) {
const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
AddressSanitizer Sanitizer(M, R, CompileKernel, Recover, UseAfterScope);
if (Sanitizer.instrumentFunction(F, TLI))
@@ -1341,98 +1340,90 @@ bool AddressSanitizer::isInterestingAlloca(const AllocaInst &AI) {
return IsInteresting;
}
-Value *AddressSanitizer::isInterestingMemoryAccess(Instruction *I,
- bool *IsWrite,
- uint64_t *TypeSize,
- unsigned *Alignment,
- Value **MaybeMask) {
+bool AddressSanitizer::ignoreAccess(Value *Ptr) {
+ // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return true;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (Ptr->isSwiftError())
+ return true;
+
+ // Treat memory accesses to promotable allocas as non-interesting since they
+ // will not cause memory violations. This greatly speeds up the instrumented
+ // executable at -O0.
+ if (auto AI = dyn_cast_or_null<AllocaInst>(Ptr))
+ if (ClSkipPromotableAllocas && !isInterestingAlloca(*AI))
+ return true;
+
+ return false;
+}
+
+void AddressSanitizer::getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
// Skip memory accesses inserted by another instrumentation.
- if (I->hasMetadata("nosanitize")) return nullptr;
+ if (I->hasMetadata("nosanitize"))
+ return;
// Do not instrument the load fetching the dynamic shadow address.
if (LocalDynamicShadow == I)
- return nullptr;
+ return;
- Value *PtrOperand = nullptr;
- const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads) return nullptr;
- *IsWrite = false;
- *TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
- *Alignment = LI->getAlignment();
- PtrOperand = LI->getPointerOperand();
+ if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+ LI->getType(), LI->getAlign());
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites) return nullptr;
- *IsWrite = true;
- *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
- *Alignment = SI->getAlignment();
- PtrOperand = SI->getPointerOperand();
+ if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+ SI->getValueOperand()->getType(), SI->getAlign());
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics) return nullptr;
- *IsWrite = true;
- *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
- *Alignment = 0;
- PtrOperand = RMW->getPointerOperand();
+ if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+ RMW->getValOperand()->getType(), None);
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics) return nullptr;
- *IsWrite = true;
- *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
- *Alignment = 0;
- PtrOperand = XCHG->getPointerOperand();
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+ XCHG->getCompareOperand()->getType(), None);
} else if (auto CI = dyn_cast<CallInst>(I)) {
- auto *F = dyn_cast<Function>(CI->getCalledValue());
+ auto *F = CI->getCalledFunction();
if (F && (F->getName().startswith("llvm.masked.load.") ||
F->getName().startswith("llvm.masked.store."))) {
- unsigned OpOffset = 0;
- if (F->getName().startswith("llvm.masked.store.")) {
- if (!ClInstrumentWrites)
- return nullptr;
- // Masked store has an initial operand for the value.
- OpOffset = 1;
- *IsWrite = true;
- } else {
- if (!ClInstrumentReads)
- return nullptr;
- *IsWrite = false;
- }
-
- auto BasePtr = CI->getOperand(0 + OpOffset);
+ bool IsWrite = F->getName().startswith("llvm.masked.store.");
+ // Masked store has an initial operand for the value.
+ unsigned OpOffset = IsWrite ? 1 : 0;
+ if (IsWrite ? !ClInstrumentWrites : !ClInstrumentReads)
+ return;
+
+ auto BasePtr = CI->getOperand(OpOffset);
+ if (ignoreAccess(BasePtr))
+ return;
auto Ty = cast<PointerType>(BasePtr->getType())->getElementType();
- *TypeSize = DL.getTypeStoreSizeInBits(Ty);
- if (auto AlignmentConstant =
- dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
- *Alignment = (unsigned)AlignmentConstant->getZExtValue();
- else
- *Alignment = 1; // No alignment guarantees. We probably got Undef
- if (MaybeMask)
- *MaybeMask = CI->getOperand(2 + OpOffset);
- PtrOperand = BasePtr;
+ MaybeAlign Alignment = Align(1);
+ // Otherwise no alignment guarantees. We probably got Undef.
+ if (auto *Op = dyn_cast<ConstantInt>(CI->getOperand(1 + OpOffset)))
+ Alignment = Op->getMaybeAlignValue();
+ Value *Mask = CI->getOperand(2 + OpOffset);
+ Interesting.emplace_back(I, OpOffset, IsWrite, Ty, Alignment, Mask);
+ } else {
+ for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
+ ignoreAccess(CI->getArgOperand(ArgNo)))
+ continue;
+ Type *Ty = CI->getParamByValType(ArgNo);
+ Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+ }
}
}
-
- if (PtrOperand) {
- // Do not instrument acesses from different address spaces; we cannot deal
- // with them.
- Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return nullptr;
-
- // Ignore swifterror addresses.
- // swifterror memory addresses are mem2reg promoted by instruction
- // selection. As such they cannot have regular uses like an instrumentation
- // function and it makes no sense to track them as memory.
- if (PtrOperand->isSwiftError())
- return nullptr;
- }
-
- // Treat memory accesses to promotable allocas as non-interesting since they
- // will not cause memory violations. This greatly speeds up the instrumented
- // executable at -O0.
- if (ClSkipPromotableAllocas)
- if (auto AI = dyn_cast_or_null<AllocaInst>(PtrOperand))
- return isInterestingAlloca(*AI) ? AI : nullptr;
-
- return PtrOperand;
}
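For context on the new ClInstrumentByval handling above: passing an aggregate by value makes the call site itself read the argument's memory, which previously went uninstrumented. A hedged source-level sketch (the struct and function names are invented, and whether the argument is actually lowered as byval depends on the target ABI):

struct Big { char bytes[1024]; };
static long Sum;
void callee(Big B) { Sum += B.bytes[0]; } // takes Big by value
void caller(Big *P) {
  callee(*P); // the call copies 1024 bytes out of *P -- a read ASan can check
}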
static bool isPointerOperand(Value *V) {
@@ -1491,7 +1482,7 @@ void AddressSanitizer::instrumentPointerComparisonOrSubtraction(
static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
Instruction *InsertBefore, Value *Addr,
- unsigned Alignment, unsigned Granularity,
+ MaybeAlign Alignment, unsigned Granularity,
uint32_t TypeSize, bool IsWrite,
Value *SizeArgument, bool UseCalls,
uint32_t Exp) {
@@ -1499,7 +1490,7 @@ static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
// if the data is properly aligned.
if ((TypeSize == 8 || TypeSize == 16 || TypeSize == 32 || TypeSize == 64 ||
TypeSize == 128) &&
- (Alignment >= Granularity || Alignment == 0 || Alignment >= TypeSize / 8))
+ (!Alignment || *Alignment >= Granularity || *Alignment >= TypeSize / 8))
return Pass->instrumentAddress(I, InsertBefore, Addr, TypeSize, IsWrite,
nullptr, UseCalls, Exp);
Pass->instrumentUnusualSizeOrAlignment(I, InsertBefore, Addr, TypeSize,
@@ -1509,13 +1500,14 @@ static void doInstrumentAddress(AddressSanitizer *Pass, Instruction *I,
static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
const DataLayout &DL, Type *IntptrTy,
Value *Mask, Instruction *I,
- Value *Addr, unsigned Alignment,
+ Value *Addr, MaybeAlign Alignment,
unsigned Granularity, uint32_t TypeSize,
bool IsWrite, Value *SizeArgument,
bool UseCalls, uint32_t Exp) {
- auto *VTy = cast<PointerType>(Addr->getType())->getElementType();
+ auto *VTy = cast<FixedVectorType>(
+ cast<PointerType>(Addr->getType())->getElementType());
uint64_t ElemTypeSize = DL.getTypeStoreSizeInBits(VTy->getScalarType());
- unsigned Num = VTy->getVectorNumElements();
+ unsigned Num = VTy->getNumElements();
auto Zero = ConstantInt::get(IntptrTy, 0);
for (unsigned Idx = 0; Idx < Num; ++Idx) {
Value *InstrumentedAddress = nullptr;
@@ -1546,15 +1538,9 @@ static void instrumentMaskedLoadOrStore(AddressSanitizer *Pass,
}
void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
- Instruction *I, bool UseCalls,
+ InterestingMemoryOperand &O, bool UseCalls,
const DataLayout &DL) {
- bool IsWrite = false;
- unsigned Alignment = 0;
- uint64_t TypeSize = 0;
- Value *MaybeMask = nullptr;
- Value *Addr =
- isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask);
- assert(Addr);
+ Value *Addr = O.getPtr();
// Optimization experiments.
// The experiments can be used to evaluate potential optimizations that remove
@@ -1574,7 +1560,7 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
// dynamically initialized global is always valid.
GlobalVariable *G = dyn_cast<GlobalVariable>(GetUnderlyingObject(Addr, DL));
if (G && (!ClInitializers || GlobalIsLinkerInitialized(G)) &&
- isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+ isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
NumOptimizedAccessesToGlobalVar++;
return;
}
@@ -1583,25 +1569,26 @@ void AddressSanitizer::instrumentMop(ObjectSizeOffsetVisitor &ObjSizeVis,
if (ClOpt && ClOptStack) {
// A direct inbounds access to a stack variable is always valid.
if (isa<AllocaInst>(GetUnderlyingObject(Addr, DL)) &&
- isSafeAccess(ObjSizeVis, Addr, TypeSize)) {
+ isSafeAccess(ObjSizeVis, Addr, O.TypeSize)) {
NumOptimizedAccessesToStackVar++;
return;
}
}
- if (IsWrite)
+ if (O.IsWrite)
NumInstrumentedWrites++;
else
NumInstrumentedReads++;
unsigned Granularity = 1 << Mapping.Scale;
- if (MaybeMask) {
- instrumentMaskedLoadOrStore(this, DL, IntptrTy, MaybeMask, I, Addr,
- Alignment, Granularity, TypeSize, IsWrite,
- nullptr, UseCalls, Exp);
+ if (O.MaybeMask) {
+ instrumentMaskedLoadOrStore(this, DL, IntptrTy, O.MaybeMask, O.getInsn(),
+ Addr, O.Alignment, Granularity, O.TypeSize,
+ O.IsWrite, nullptr, UseCalls, Exp);
} else {
- doInstrumentAddress(this, I, I, Addr, Alignment, Granularity, TypeSize,
- IsWrite, nullptr, UseCalls, Exp);
+ doInstrumentAddress(this, O.getInsn(), O.getInsn(), Addr, O.Alignment,
+ Granularity, O.TypeSize, O.IsWrite, nullptr, UseCalls,
+ Exp);
}
}
@@ -1629,10 +1616,7 @@ Instruction *AddressSanitizer::generateCrashCode(Instruction *InsertBefore,
{Addr, ExpVal});
}
- // We don't do Call->setDoesNotReturn() because the BB already has
- // UnreachableInst at the end.
- // This EmptyAsm is required to avoid callback merge.
- IRB.CreateCall(EmptyAsm, {});
+ Call->setCannotMerge();
return Call;
}
@@ -1800,13 +1784,29 @@ void ModuleAddressSanitizer::createInitializerPoisonCalls(
}
}
-bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
+bool ModuleAddressSanitizer::canInstrumentAliasedGlobal(
+ const GlobalAlias &GA) const {
+ // If this function is ever expanded to include rules that do not just
+ // apply when CompileKernel is true, either guard all existing rules with
+ // an 'if (CompileKernel) { ... }' or be absolutely sure that all these
+ // rules should also apply to user space.
+ assert(CompileKernel && "Only expecting to be called when compiling kernel");
+
+ // When compiling the kernel, globals that are aliased by symbols prefixed
+ // by "__" are special and cannot be padded with a redzone.
+ if (GA.getName().startswith("__"))
+ return false;
+
+ return true;
+}
+
+bool ModuleAddressSanitizer::shouldInstrumentGlobal(GlobalVariable *G) const {
Type *Ty = G->getValueType();
LLVM_DEBUG(dbgs() << "GLOBAL: " << *G << "\n");
// FIXME: Metadata should be attached to the global directly instead
// of being added to llvm.asan.globals.
- if (GlobalsMD.get(G).IsBlacklisted) return false;
+ if (GlobalsMD.get(G).IsExcluded) return false;
if (!Ty->isSized()) return false;
if (!G->hasInitializer()) return false;
// Only instrument globals of default address spaces
@@ -1817,7 +1817,7 @@ bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
// - Need to poison all copies, not just the main thread's one.
if (G->isThreadLocal()) return false;
// For now, just ignore this Global if the alignment is large.
- if (G->getAlignment() > MinRedzoneSizeForGlobal()) return false;
+ if (G->getAlignment() > getMinRedzoneSizeForGlobal()) return false;
// For non-COFF targets, only instrument globals known to be defined by this
// TU.
@@ -1847,6 +1847,12 @@ bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
}
if (G->hasSection()) {
+ // The kernel uses explicit sections for mostly special global variables
+ // that we should not instrument. E.g. the kernel may rely on their layout
+ // without redzones, or remove them at link time ("discard.*"), etc.
+ if (CompileKernel)
+ return false;
+
StringRef Section = G->getSection();
// Globals from llvm.metadata aren't emitted, do not instrument them.
@@ -1913,6 +1919,13 @@ bool ModuleAddressSanitizer::ShouldInstrumentGlobal(GlobalVariable *G) {
}
}
+ if (CompileKernel) {
+ // Globals prefixed by "__" are special and cannot be padded with a
+ // redzone.
+ if (G->getName().startswith("__"))
+ return false;
+ }
+
return true;
}
@@ -1993,7 +2006,7 @@ void ModuleAddressSanitizer::SetComdatForGlobalMetadata(
}
if (!InternalSuffix.empty() && G->hasLocalLinkage()) {
- std::string Name = G->getName();
+ std::string Name = std::string(G->getName());
Name += InternalSuffix;
C = M.getOrInsertComdat(Name);
} else {
@@ -2030,13 +2043,13 @@ ModuleAddressSanitizer::CreateMetadataGlobal(Module &M, Constant *Initializer,
return Metadata;
}
-IRBuilder<> ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
+Instruction *ModuleAddressSanitizer::CreateAsanModuleDtor(Module &M) {
AsanDtorFunction =
Function::Create(FunctionType::get(Type::getVoidTy(*C), false),
GlobalValue::InternalLinkage, kAsanModuleDtorName, &M);
BasicBlock *AsanDtorBB = BasicBlock::Create(*C, "", AsanDtorFunction);
- return IRBuilder<>(ReturnInst::Create(*C, AsanDtorBB));
+ return ReturnInst::Create(*C, AsanDtorBB);
}
void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
@@ -2045,11 +2058,15 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
assert(ExtendedGlobals.size() == MetadataInitializers.size());
auto &DL = M.getDataLayout();
+ SmallVector<GlobalValue *, 16> MetadataGlobals(ExtendedGlobals.size());
for (size_t i = 0; i < ExtendedGlobals.size(); i++) {
Constant *Initializer = MetadataInitializers[i];
GlobalVariable *G = ExtendedGlobals[i];
GlobalVariable *Metadata =
CreateMetadataGlobal(M, Initializer, G->getName());
+ MDNode *MD = MDNode::get(M.getContext(), ValueAsMetadata::get(G));
+ Metadata->setMetadata(LLVMContext::MD_associated, MD);
+ MetadataGlobals[i] = Metadata;
// The MSVC linker always inserts padding when linking incrementally. We
// cope with that by aligning each struct to its size, which must be a power
@@ -2061,6 +2078,11 @@ void ModuleAddressSanitizer::InstrumentGlobalsCOFF(
SetComdatForGlobalMetadata(G, Metadata, "");
}
+
+ // Update llvm.compiler.used, adding the new metadata globals. This is
+ // needed so that during LTO these variables stay alive.
+ if (!MetadataGlobals.empty())
+ appendToCompilerUsed(M, MetadataGlobals);
}
void ModuleAddressSanitizer::InstrumentGlobalsELF(
@@ -2081,10 +2103,23 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
SetComdatForGlobalMetadata(G, Metadata, UniqueModuleId);
}
+ // This should never be called when there are no globals, by the logic that
+ // computes the UniqueModuleId string, which is "" when there are no globals.
+ // It's important that this path is only used when there are actually some
+ // globals, because that means that there will certainly be a live
+ // `asan_globals` input section at link time and thus `__start_asan_globals`
+ // and `__stop_asan_globals` symbols will definitely be defined at link time.
+ // This means there's no need for the references to them to be weak, which
+ // enables better code generation because ExternalWeakLinkage implies
+ // isInterposable() and thus requires GOT indirection for PIC. Since these
+ // are known-defined hidden/dso_local symbols, direct PIC accesses without
+ // dynamic relocation are always sufficient.
+ assert(!MetadataGlobals.empty());
+ assert(!UniqueModuleId.empty());
+
// Update llvm.compiler.used, adding the new metadata globals. This is
// needed so that during LTO these variables stay alive.
- if (!MetadataGlobals.empty())
- appendToCompilerUsed(M, MetadataGlobals);
+ appendToCompilerUsed(M, MetadataGlobals);
// RegisteredFlag serves two purposes. First, we can pass it to dladdr()
// to look up the loaded image that contains it. Second, we can store in it
@@ -2097,15 +2132,18 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
ConstantInt::get(IntptrTy, 0), kAsanGlobalsRegisteredFlagName);
RegisteredFlag->setVisibility(GlobalVariable::HiddenVisibility);
- // Create start and stop symbols.
- GlobalVariable *StartELFMetadata = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
- "__start_" + getGlobalMetadataSection());
- StartELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
- GlobalVariable *StopELFMetadata = new GlobalVariable(
- M, IntptrTy, false, GlobalVariable::ExternalWeakLinkage, nullptr,
- "__stop_" + getGlobalMetadataSection());
- StopELFMetadata->setVisibility(GlobalVariable::HiddenVisibility);
+ // Create start and stop symbols. These are known to be defined by
+ // the linker, see comment above.
+ auto MakeStartStopGV = [&](const char *Prefix) {
+ GlobalVariable *StartStop =
+ new GlobalVariable(M, IntptrTy, false, GlobalVariable::ExternalLinkage,
+ nullptr, Prefix + getGlobalMetadataSection());
+ StartStop->setVisibility(GlobalVariable::HiddenVisibility);
+ assert(StartStop->isImplicitDSOLocal());
+ return StartStop;
+ };
+ GlobalVariable *StartELFMetadata = MakeStartStopGV("__start_");
+ GlobalVariable *StopELFMetadata = MakeStartStopGV("__stop_");
// Create a call to register the globals with the runtime.
IRB.CreateCall(AsanRegisterElfGlobals,
@@ -2115,7 +2153,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsELF(
// We also need to unregister globals at the end, e.g., when a shared library
// gets closed.
- IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
IRB_Dtor.CreateCall(AsanUnregisterElfGlobals,
{IRB.CreatePointerCast(RegisteredFlag, IntptrTy),
IRB.CreatePointerCast(StartELFMetadata, IntptrTy),
@@ -2174,7 +2212,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsMachO(
// We also need to unregister globals at the end, e.g., when a shared library
// gets closed.
- IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
IRB_Dtor.CreateCall(AsanUnregisterImageGlobals,
{IRB.CreatePointerCast(RegisteredFlag, IntptrTy)});
}
@@ -2202,7 +2240,7 @@ void ModuleAddressSanitizer::InstrumentGlobalsWithMetadataArray(
// We also need to unregister globals at the end, e.g., when a shared library
// gets closed.
- IRBuilder<> IRB_Dtor = CreateAsanModuleDtor(M);
+ IRBuilder<> IRB_Dtor(CreateAsanModuleDtor(M));
IRB_Dtor.CreateCall(AsanUnregisterGlobals,
{IRB.CreatePointerCast(AllGlobals, IntptrTy),
ConstantInt::get(IntptrTy, N)});
@@ -2217,10 +2255,22 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
bool *CtorComdat) {
*CtorComdat = false;
- SmallVector<GlobalVariable *, 16> GlobalsToChange;
+ // Build set of globals that are aliased by some GA, where
+ // canInstrumentAliasedGlobal(GA) returns false.
+ SmallPtrSet<const GlobalVariable *, 16> AliasedGlobalExclusions;
+ if (CompileKernel) {
+ for (auto &GA : M.aliases()) {
+ if (const auto *GV = dyn_cast<GlobalVariable>(GA.getAliasee())) {
+ if (!canInstrumentAliasedGlobal(GA))
+ AliasedGlobalExclusions.insert(GV);
+ }
+ }
+ }
+ SmallVector<GlobalVariable *, 16> GlobalsToChange;
for (auto &G : M.globals()) {
- if (ShouldInstrumentGlobal(&G)) GlobalsToChange.push_back(&G);
+ if (!AliasedGlobalExclusions.count(&G) && shouldInstrumentGlobal(&G))
+ GlobalsToChange.push_back(&G);
}
size_t n = GlobalsToChange.size();
@@ -2255,7 +2305,6 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
M, M.getModuleIdentifier(), /*AllowMerging*/ false, kAsanGenPrefix);
for (size_t i = 0; i < n; i++) {
- static const uint64_t kMaxGlobalRedzone = 1 << 18;
GlobalVariable *G = GlobalsToChange[i];
// FIXME: Metadata should be attached to the global directly instead
@@ -2269,16 +2318,8 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
/*AllowMerging*/ true, kAsanGenPrefix);
Type *Ty = G->getValueType();
- uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
- uint64_t MinRZ = MinRedzoneSizeForGlobal();
- // MinRZ <= RZ <= kMaxGlobalRedzone
- // and trying to make RZ to be ~ 1/4 of SizeInBytes.
- uint64_t RZ = std::max(
- MinRZ, std::min(kMaxGlobalRedzone, (SizeInBytes / MinRZ / 4) * MinRZ));
- uint64_t RightRedzoneSize = RZ;
- // Round up to MinRZ
- if (SizeInBytes % MinRZ) RightRedzoneSize += MinRZ - (SizeInBytes % MinRZ);
- assert(((RightRedzoneSize + SizeInBytes) % MinRZ) == 0);
+ const uint64_t SizeInBytes = DL.getTypeAllocSize(Ty);
+ const uint64_t RightRedzoneSize = getRedzoneSizeForGlobal(SizeInBytes);
Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize);
StructType *NewTy = StructType::get(Ty, RightRedZoneTy);
@@ -2294,7 +2335,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
"", G, G->getThreadLocalMode());
NewGlobal->copyAttributesFrom(G);
NewGlobal->setComdat(G->getComdat());
- NewGlobal->setAlignment(MaybeAlign(MinRZ));
+ NewGlobal->setAlignment(MaybeAlign(getMinRedzoneSizeForGlobal()));
// Don't fold globals with redzones. ODR violation detector and redzone
// poisoning implicitly creates a dependence on the global's address, so it
// is no longer valid for it to be marked unnamed_addr.
@@ -2362,7 +2403,7 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
// Set meaningful attributes for indicator symbol.
ODRIndicatorSym->setVisibility(NewGlobal->getVisibility());
ODRIndicatorSym->setDLLStorageClass(NewGlobal->getDLLStorageClass());
- ODRIndicatorSym->setAlignment(Align::None());
+ ODRIndicatorSym->setAlignment(Align(1));
ODRIndicator = ODRIndicatorSym;
}
@@ -2416,6 +2457,23 @@ bool ModuleAddressSanitizer::InstrumentGlobals(IRBuilder<> &IRB, Module &M,
return true;
}
+uint64_t
+ModuleAddressSanitizer::getRedzoneSizeForGlobal(uint64_t SizeInBytes) const {
+ constexpr uint64_t kMaxRZ = 1 << 18;
+ const uint64_t MinRZ = getMinRedzoneSizeForGlobal();
+
+ // Calculate RZ, where MinRZ <= RZ <= MaxRZ, and RZ ~ 1/4 * SizeInBytes.
+ uint64_t RZ =
+ std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
+
+ // Round up to multiple of MinRZ.
+ if (SizeInBytes % MinRZ)
+ RZ += MinRZ - (SizeInBytes % MinRZ);
+ assert((RZ + SizeInBytes) % MinRZ == 0);
+
+ return RZ;
+}
+
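A quick way to sanity-check the arithmetic in getRedzoneSizeForGlobal is to restate it as a standalone function and run a couple of sizes through it (editorial sketch only, not part of the diff; MinRZ stands in for getMinRedzoneSizeForGlobal()):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <cstdio>

static uint64_t redzoneForGlobal(uint64_t SizeInBytes, uint64_t MinRZ) {
  constexpr uint64_t kMaxRZ = 1 << 18;
  // Aim for roughly SizeInBytes / 4, clamped to [MinRZ, kMaxRZ].
  uint64_t RZ =
      std::max(MinRZ, std::min(kMaxRZ, (SizeInBytes / MinRZ / 4) * MinRZ));
  // Round up so that global + redzone is a multiple of MinRZ.
  if (SizeInBytes % MinRZ)
    RZ += MinRZ - (SizeInBytes % MinRZ);
  assert((RZ + SizeInBytes) % MinRZ == 0);
  return RZ;
}

int main() {
  // A 100-byte global with MinRZ = 32: the clamp gives 32, rounding adds 28,
  // so the right redzone is 60 bytes and 100 + 60 is a multiple of 32.
  std::printf("%llu\n", (unsigned long long)redzoneForGlobal(100, 32));  // 60
  // A 4096-byte global gets roughly a quarter of its size: 1024 bytes.
  std::printf("%llu\n", (unsigned long long)redzoneForGlobal(4096, 32)); // 1024
  return 0;
}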
int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
int LongSize = M.getDataLayout().getPointerSizeInBits();
bool isAndroid = Triple(M.getTargetTriple()).isAndroid();
@@ -2429,20 +2487,23 @@ int ModuleAddressSanitizer::GetAsanVersion(const Module &M) const {
bool ModuleAddressSanitizer::instrumentModule(Module &M) {
initializeCallbacks(M);
- if (CompileKernel)
- return false;
-
// Create a module constructor. A destructor is created lazily because not all
// platforms, and not all modules need it.
- std::string AsanVersion = std::to_string(GetAsanVersion(M));
- std::string VersionCheckName =
- ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
- std::tie(AsanCtorFunction, std::ignore) = createSanitizerCtorAndInitFunctions(
- M, kAsanModuleCtorName, kAsanInitName, /*InitArgTypes=*/{},
- /*InitArgs=*/{}, VersionCheckName);
+ if (CompileKernel) {
+ // The kernel always builds with its own runtime, and therefore does not
+ // need the init and version check calls.
+ AsanCtorFunction = createSanitizerCtor(M, kAsanModuleCtorName);
+ } else {
+ std::string AsanVersion = std::to_string(GetAsanVersion(M));
+ std::string VersionCheckName =
+ ClInsertVersionCheck ? (kAsanVersionCheckNamePrefix + AsanVersion) : "";
+ std::tie(AsanCtorFunction, std::ignore) =
+ createSanitizerCtorAndInitFunctions(M, kAsanModuleCtorName,
+ kAsanInitName, /*InitArgTypes=*/{},
+ /*InitArgs=*/{}, VersionCheckName);
+ }
bool CtorComdat = true;
- // TODO(glider): temporarily disabled globals instrumentation for KASan.
if (ClGlobals) {
IRBuilder<> IRB(AsanCtorFunction->getEntryBlock().getTerminator());
InstrumentGlobals(IRB, M, &CtorComdat);
@@ -2529,10 +2590,6 @@ void AddressSanitizer::initializeCallbacks(Module &M) {
M.getOrInsertFunction(kAsanPtrCmp, IRB.getVoidTy(), IntptrTy, IntptrTy);
AsanPtrSubFunction =
M.getOrInsertFunction(kAsanPtrSub, IRB.getVoidTy(), IntptrTy, IntptrTy);
- // We insert an empty inline asm after __asan_report* to avoid callback merge.
- EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
- StringRef(""), StringRef(""),
- /*hasSideEffects=*/true);
if (Mapping.InGlobal)
AsanShadowGlobal = M.getOrInsertGlobal("__asan_shadow",
ArrayType::get(IRB.getInt8Ty(), 0));
@@ -2556,10 +2613,10 @@ bool AddressSanitizer::maybeInsertAsanInitAtFunctionEntry(Function &F) {
return false;
}
-void AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
+bool AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
// Generate code only when dynamic addressing is needed.
if (Mapping.Offset != kDynamicShadowSentinel)
- return;
+ return false;
IRBuilder<> IRB(&F.front().front());
if (Mapping.InGlobal) {
@@ -2581,6 +2638,7 @@ void AddressSanitizer::maybeInsertDynamicShadowAtFunctionEntry(Function &F) {
kAsanShadowMemoryDynamicAddress, IntptrTy);
LocalDynamicShadow = IRB.CreateLoad(IntptrTy, GlobalDynamicAddress);
}
+ return true;
}
void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
@@ -2611,6 +2669,14 @@ void AddressSanitizer::markEscapedLocalAllocas(Function &F) {
}
}
+bool AddressSanitizer::suppressInstrumentationSiteForDebug(int &Instrumented) {
+ bool ShouldInstrument =
+ ClDebugMin < 0 || ClDebugMax < 0 ||
+ (Instrumented >= ClDebugMin && Instrumented <= ClDebugMax);
+ Instrumented++;
+ return !ShouldInstrument;
+}
+
bool AddressSanitizer::instrumentFunction(Function &F,
const TargetLibraryInfo *TLI) {
if (F.getLinkage() == GlobalValue::AvailableExternallyLinkage) return false;
@@ -2634,7 +2700,7 @@ bool AddressSanitizer::instrumentFunction(Function &F,
FunctionStateRAII CleanupObj(this);
- maybeInsertDynamicShadowAtFunctionEntry(F);
+ FunctionModified |= maybeInsertDynamicShadowAtFunctionEntry(F);
// We can't instrument allocas used with llvm.localescape. Only static allocas
// can be passed to that intrinsic.
@@ -2643,14 +2709,12 @@ bool AddressSanitizer::instrumentFunction(Function &F,
// We want to instrument every address only once per basic block (unless there
// are calls between uses).
SmallPtrSet<Value *, 16> TempsToInstrument;
- SmallVector<Instruction *, 16> ToInstrument;
+ SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+ SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
SmallVector<Instruction *, 8> NoReturnCalls;
SmallVector<BasicBlock *, 16> AllBlocks;
SmallVector<Instruction *, 16> PointerComparisonsOrSubtracts;
int NumAllocas = 0;
- bool IsWrite;
- unsigned Alignment;
- uint64_t TypeSize;
// Fill the set of memory operations to instrument.
for (auto &BB : F) {
@@ -2659,51 +2723,54 @@ bool AddressSanitizer::instrumentFunction(Function &F,
int NumInsnsPerBB = 0;
for (auto &Inst : BB) {
if (LooksLikeCodeInBug11395(&Inst)) return false;
- Value *MaybeMask = nullptr;
- if (Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
- &Alignment, &MaybeMask)) {
- if (ClOpt && ClOptSameTemp) {
- // If we have a mask, skip instrumentation if we've already
- // instrumented the full object. But don't add to TempsToInstrument
- // because we might get another load/store with a different mask.
- if (MaybeMask) {
- if (TempsToInstrument.count(Addr))
- continue; // We've seen this (whole) temp in the current BB.
- } else {
- if (!TempsToInstrument.insert(Addr).second)
- continue; // We've seen this temp in the current BB.
+ SmallVector<InterestingMemoryOperand, 1> InterestingOperands;
+ getInterestingMemoryOperands(&Inst, InterestingOperands);
+
+ if (!InterestingOperands.empty()) {
+ for (auto &Operand : InterestingOperands) {
+ if (ClOpt && ClOptSameTemp) {
+ Value *Ptr = Operand.getPtr();
+ // If we have a mask, skip instrumentation if we've already
+ // instrumented the full object. But don't add to TempsToInstrument
+ // because we might get another load/store with a different mask.
+ if (Operand.MaybeMask) {
+ if (TempsToInstrument.count(Ptr))
+ continue; // We've seen this (whole) temp in the current BB.
+ } else {
+ if (!TempsToInstrument.insert(Ptr).second)
+ continue; // We've seen this temp in the current BB.
+ }
}
+ OperandsToInstrument.push_back(Operand);
+ NumInsnsPerBB++;
}
} else if (((ClInvalidPointerPairs || ClInvalidPointerCmp) &&
isInterestingPointerComparison(&Inst)) ||
((ClInvalidPointerPairs || ClInvalidPointerSub) &&
isInterestingPointerSubtraction(&Inst))) {
PointerComparisonsOrSubtracts.push_back(&Inst);
- continue;
- } else if (isa<MemIntrinsic>(Inst)) {
+ } else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst)) {
// ok, take it.
+ IntrinToInstrument.push_back(MI);
+ NumInsnsPerBB++;
} else {
if (isa<AllocaInst>(Inst)) NumAllocas++;
- CallSite CS(&Inst);
- if (CS) {
+ if (auto *CB = dyn_cast<CallBase>(&Inst)) {
// A call inside BB.
TempsToInstrument.clear();
- if (CS.doesNotReturn() && !CS->hasMetadata("nosanitize"))
- NoReturnCalls.push_back(CS.getInstruction());
+ if (CB->doesNotReturn() && !CB->hasMetadata("nosanitize"))
+ NoReturnCalls.push_back(CB);
}
if (CallInst *CI = dyn_cast<CallInst>(&Inst))
maybeMarkSanitizerLibraryCallNoBuiltin(CI, TLI);
- continue;
}
- ToInstrument.push_back(&Inst);
- NumInsnsPerBB++;
if (NumInsnsPerBB >= ClMaxInsnsToInstrumentPerBB) break;
}
}
- bool UseCalls =
- (ClInstrumentationWithCallsThreshold >= 0 &&
- ToInstrument.size() > (unsigned)ClInstrumentationWithCallsThreshold);
+ bool UseCalls = (ClInstrumentationWithCallsThreshold >= 0 &&
+ OperandsToInstrument.size() + IntrinToInstrument.size() >
+ (unsigned)ClInstrumentationWithCallsThreshold);
const DataLayout &DL = F.getParent()->getDataLayout();
ObjectSizeOpts ObjSizeOpts;
ObjSizeOpts.RoundToAlign = true;
@@ -2711,16 +2778,16 @@ bool AddressSanitizer::instrumentFunction(Function &F,
// Instrument.
int NumInstrumented = 0;
- for (auto Inst : ToInstrument) {
- if (ClDebugMin < 0 || ClDebugMax < 0 ||
- (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) {
- if (isInterestingMemoryAccess(Inst, &IsWrite, &TypeSize, &Alignment))
- instrumentMop(ObjSizeVis, Inst, UseCalls,
- F.getParent()->getDataLayout());
- else
- instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
- }
- NumInstrumented++;
+ for (auto &Operand : OperandsToInstrument) {
+ if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+ instrumentMop(ObjSizeVis, Operand, UseCalls,
+ F.getParent()->getDataLayout());
+ FunctionModified = true;
+ }
+ for (auto Inst : IntrinToInstrument) {
+ if (!suppressInstrumentationSiteForDebug(NumInstrumented))
+ instrumentMemIntrinsic(Inst);
+ FunctionModified = true;
}
FunctionStackPoisoner FSP(F, *this);
@@ -2735,10 +2802,10 @@ bool AddressSanitizer::instrumentFunction(Function &F,
for (auto Inst : PointerComparisonsOrSubtracts) {
instrumentPointerComparisonOrSubtraction(Inst);
- NumInstrumented++;
+ FunctionModified = true;
}
- if (NumInstrumented > 0 || ChangedStack || !NoReturnCalls.empty())
+ if (ChangedStack || !NoReturnCalls.empty())
FunctionModified = true;
LLVM_DEBUG(dbgs() << "ASAN done instrumenting: " << FunctionModified << " "
@@ -2836,7 +2903,8 @@ void FunctionStackPoisoner::copyToShadowInline(ArrayRef<uint8_t> ShadowMask,
Value *Ptr = IRB.CreateAdd(ShadowBase, ConstantInt::get(IntptrTy, i));
Value *Poison = IRB.getIntN(StoreSizeInBytes * 8, Val);
IRB.CreateAlignedStore(
- Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()), 1);
+ Poison, IRB.CreateIntToPtr(Ptr, Poison->getType()->getPointerTo()),
+ Align(1));
i += StoreSizeInBytes;
}
@@ -2900,7 +2968,7 @@ void FunctionStackPoisoner::copyArgsPassedByValToAllocas() {
const DataLayout &DL = F.getParent()->getDataLayout();
for (Argument &Arg : F.args()) {
if (Arg.hasByValAttr()) {
- Type *Ty = Arg.getType()->getPointerElementType();
+ Type *Ty = Arg.getParamByValType();
const Align Alignment =
DL.getValueOrABITypeAlignment(Arg.getParamAlign(), Ty);
@@ -2943,7 +3011,7 @@ Value *FunctionStackPoisoner::createAllocaForLayout(
}
assert((ClRealignStack & (ClRealignStack - 1)) == 0);
size_t FrameAlignment = std::max(L.FrameAlignment, (size_t)ClRealignStack);
- Alloca->setAlignment(MaybeAlign(FrameAlignment));
+ Alloca->setAlignment(Align(FrameAlignment));
return IRB.CreatePointerCast(Alloca, IntptrTy);
}
@@ -2982,6 +3050,59 @@ void FunctionStackPoisoner::processDynamicAllocas() {
unpoisonDynamicAllocas();
}
+/// Collect instructions in the entry block after \p InsBefore which initialize
+/// permanent storage for a function argument. These instructions must remain in
+/// the entry block so that uninitialized values do not appear in backtraces. An
+/// added benefit is that this conserves spill slots. This does not move stores
+/// before instrumented / "interesting" allocas.
+static void findStoresToUninstrumentedArgAllocas(
+ AddressSanitizer &ASan, Instruction &InsBefore,
+ SmallVectorImpl<Instruction *> &InitInsts) {
+ Instruction *Start = InsBefore.getNextNonDebugInstruction();
+ for (Instruction *It = Start; It; It = It->getNextNonDebugInstruction()) {
+ // Argument initialization looks like:
+ // 1) store <Argument>, <Alloca> OR
+ // 2) <CastArgument> = cast <Argument> to ...
+ // store <CastArgument> to <Alloca>
+ // Do not consider any other kind of instruction.
+ //
+ // Note: This covers all known cases, but may not be exhaustive. An
+ // alternative to pattern-matching stores is to DFS over all Argument uses:
+ // this might be more general, but is probably much more complicated.
+ if (isa<AllocaInst>(It) || isa<CastInst>(It))
+ continue;
+ if (auto *Store = dyn_cast<StoreInst>(It)) {
+ // The store destination must be an alloca that isn't interesting for
+ // ASan to instrument. These are moved up before InsBefore, and they're
+ // not interesting because allocas for arguments can be mem2reg'd.
+ auto *Alloca = dyn_cast<AllocaInst>(Store->getPointerOperand());
+ if (!Alloca || ASan.isInterestingAlloca(*Alloca))
+ continue;
+
+ Value *Val = Store->getValueOperand();
+ bool IsDirectArgInit = isa<Argument>(Val);
+ bool IsArgInitViaCast =
+ isa<CastInst>(Val) &&
+ isa<Argument>(cast<CastInst>(Val)->getOperand(0)) &&
+ // Check that the cast appears directly before the store. Otherwise
+ // moving the cast before InsBefore may break the IR.
+ Val == It->getPrevNonDebugInstruction();
+ bool IsArgInit = IsDirectArgInit || IsArgInitViaCast;
+ if (!IsArgInit)
+ continue;
+
+ if (IsArgInitViaCast)
+ InitInsts.push_back(cast<Instruction>(Val));
+ InitInsts.push_back(Store);
+ continue;
+ }
+
+ // Do not reorder past unknown instructions: argument initialization should
+ // only involve casts and stores.
+ return;
+ }
+}
+
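The pattern findStoresToUninstrumentedArgAllocas looks for is the usual unoptimized argument spill; a hedged sketch (the spill-slot naming in the comment follows clang's typical -O0 output, which is an assumption, not a guarantee):

// At -O0, a parameter such as `N` is normally spilled to a stack slot right
// at function entry, e.g. (IR paraphrased in this comment):
//   %N.addr = alloca i64
//   store i64 %N, i64* %N.addr
// That store is what the helper keeps in the entry block, ahead of ASan's
// stack-frame setup, so backtraces never show an uninitialized slot.
long square(long N) { return N * N; }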
void FunctionStackPoisoner::processStaticAllocas() {
if (AllocaVec.empty()) {
assert(StaticAllocaPoisonCallVec.empty());
@@ -3005,6 +3126,15 @@ void FunctionStackPoisoner::processStaticAllocas() {
if (AI->getParent() == InsBeforeB)
AI->moveBefore(InsBefore);
+ // Move stores of arguments into entry-block allocas as well. This prevents
+ // extra stack slots from being generated (to house the argument values until
+ // they can be stored into the allocas). This also prevents uninitialized
+ // values from being shown in backtraces.
+ SmallVector<Instruction *, 8> ArgInitInsts;
+ findStoresToUninstrumentedArgAllocas(ASan, *InsBefore, ArgInitInsts);
+ for (Instruction *ArgInitInst : ArgInitInsts)
+ ArgInitInst->moveBefore(InsBefore);
+
// If we have a call to llvm.localescape, keep it in the entry block.
if (LocalEscapeCall) LocalEscapeCall->moveBefore(InsBefore);
@@ -3063,8 +3193,8 @@ void FunctionStackPoisoner::processStaticAllocas() {
// 2) There is a returns_twice call (typically setjmp), which is
// optimization-hostile, and doesn't play well with introduced indirect
// register-relative calculation of local variable addresses.
- DoDynamicAlloca &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
- DoStackMalloc &= !HasNonEmptyInlineAsm && !HasReturnsTwiceCall;
+ DoDynamicAlloca &= !HasInlineAsm && !HasReturnsTwiceCall;
+ DoStackMalloc &= !HasInlineAsm && !HasReturnsTwiceCall;
Value *StaticAlloca =
DoDynamicAlloca ? nullptr : createAllocaForLayout(IRB, L, false);
@@ -3118,11 +3248,21 @@ void FunctionStackPoisoner::processStaticAllocas() {
LocalStackBaseAlloca = LocalStackBase;
}
+ // It shouldn't matter whether we pass an `alloca` or a `ptrtoint` as the
+ // dbg.declare address operand, but passing a `ptrtoint` seems to confuse
+ // later passes and can result in dropped variable coverage in debug info.
+ Value *LocalStackBaseAllocaPtr =
+ isa<PtrToIntInst>(LocalStackBaseAlloca)
+ ? cast<PtrToIntInst>(LocalStackBaseAlloca)->getPointerOperand()
+ : LocalStackBaseAlloca;
+ assert(isa<AllocaInst>(LocalStackBaseAllocaPtr) &&
+ "Variable descriptions relative to ASan stack base will be dropped");
+
// Replace Alloca instructions with base+offset.
for (const auto &Desc : SVD) {
AllocaInst *AI = Desc.AI;
- replaceDbgDeclareForAlloca(AI, LocalStackBaseAlloca, DIB, DIExprFlags,
- Desc.Offset);
+ replaceDbgDeclare(AI, LocalStackBaseAllocaPtr, DIB, DIExprFlags,
+ Desc.Offset);
Value *NewAllocaPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Desc.Offset)),
AI->getType());
@@ -3256,7 +3396,7 @@ void FunctionStackPoisoner::poisonAlloca(Value *V, uint64_t Size,
void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
IRBuilder<> IRB(AI);
- const unsigned Align = std::max(kAllocaRzSize, AI->getAlignment());
+ const unsigned Alignment = std::max(kAllocaRzSize, AI->getAlignment());
const uint64_t AllocaRedzoneMask = kAllocaRzSize - 1;
Value *Zero = Constant::getNullValue(IntptrTy);
@@ -3283,21 +3423,21 @@ void FunctionStackPoisoner::handleDynamicAllocaCall(AllocaInst *AI) {
Value *Cond = IRB.CreateICmpNE(Misalign, AllocaRzSize);
Value *PartialPadding = IRB.CreateSelect(Cond, Misalign, Zero);
- // AdditionalChunkSize = Align + PartialPadding + kAllocaRzSize
- // Align is added to locate left redzone, PartialPadding for possible
+ // AdditionalChunkSize = Alignment + PartialPadding + kAllocaRzSize
+ // Alignment is added to locate left redzone, PartialPadding for possible
// partial redzone and kAllocaRzSize for right redzone respectively.
Value *AdditionalChunkSize = IRB.CreateAdd(
- ConstantInt::get(IntptrTy, Align + kAllocaRzSize), PartialPadding);
+ ConstantInt::get(IntptrTy, Alignment + kAllocaRzSize), PartialPadding);
Value *NewSize = IRB.CreateAdd(OldSize, AdditionalChunkSize);
- // Insert new alloca with new NewSize and Align params.
+ // Insert new alloca with new NewSize and Alignment params.
AllocaInst *NewAlloca = IRB.CreateAlloca(IRB.getInt8Ty(), NewSize);
- NewAlloca->setAlignment(MaybeAlign(Align));
+ NewAlloca->setAlignment(Align(Alignment));
- // NewAddress = Address + Align
+ // NewAddress = Address + Alignment
Value *NewAddress = IRB.CreateAdd(IRB.CreatePtrToInt(NewAlloca, IntptrTy),
- ConstantInt::get(IntptrTy, Align));
+ ConstantInt::get(IntptrTy, Alignment));
// Insert __asan_alloca_poison call for new created alloca.
IRB.CreateCall(AsanAllocaPoisonFunc, {NewAddress, OldSize});
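
As a side note on the arithmetic in handleDynamicAllocaCall above: the rewritten alloca reserves Alignment bytes for the left redzone, PartialPadding bytes to round the payload up to a multiple of kAllocaRzSize, and a full kAllocaRzSize right redzone, and the user-visible pointer is advanced past the left redzone. A minimal standalone sketch of that sizing with plain integers in place of IR values (kAllocaRzSize is assumed to be 32; OldSize, the base address, and the exact derivation of Misalign are illustrative assumptions):

#include <cstdint>
#include <cstdio>

// Sketch of the redzone sizing done in handleDynamicAllocaCall, using plain
// integers instead of IR values.
int main() {
  const uint64_t kAllocaRzSize = 32; // assumed value of the pass constant
  const uint64_t Alignment = 32;     // max(kAllocaRzSize, original alignment)
  const uint64_t OldSize = 100;      // requested alloca size (example value)

  // Bytes needed to round the payload up to a multiple of kAllocaRzSize.
  uint64_t Misalign = kAllocaRzSize - (OldSize % kAllocaRzSize);
  uint64_t PartialPadding = (Misalign != kAllocaRzSize) ? Misalign : 0;

  // Left redzone (Alignment) + full right redzone + partial right redzone.
  uint64_t AdditionalChunkSize = Alignment + kAllocaRzSize + PartialPadding;
  uint64_t NewSize = OldSize + AdditionalChunkSize;

  // The pointer handed back to the program skips the left redzone.
  uint64_t Address = 0x1000;         // pretend raw address of the new alloca
  uint64_t NewAddress = Address + Alignment;

  printf("NewSize=%llu NewAddress=%#llx\n",
         (unsigned long long)NewSize, (unsigned long long)NewAddress);
  return 0;
}
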
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 9abb62ac788c..efb11b68a1e3 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -112,7 +112,7 @@ static Value *getBoundsCheckCond(Value *Ptr, Value *InstVal,
///
/// \p GetTrapBB is a callable that returns the trap BB to use on failure.
template <typename GetTrapBBT>
-static void insertBoundsCheck(Value *Or, BuilderTy IRB, GetTrapBBT GetTrapBB) {
+static void insertBoundsCheck(Value *Or, BuilderTy &IRB, GetTrapBBT GetTrapBB) {
// check if the comparison is always false
ConstantInt *C = dyn_cast_or_null<ConstantInt>(Or);
if (C) {
@@ -154,17 +154,22 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
Value *Or = nullptr;
BuilderTy IRB(I.getParent(), BasicBlock::iterator(&I), TargetFolder(DL));
if (LoadInst *LI = dyn_cast<LoadInst>(&I)) {
- Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
- ObjSizeEval, IRB, SE);
+ if (!LI->isVolatile())
+ Or = getBoundsCheckCond(LI->getPointerOperand(), LI, DL, TLI,
+ ObjSizeEval, IRB, SE);
} else if (StoreInst *SI = dyn_cast<StoreInst>(&I)) {
- Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
+ if (!SI->isVolatile())
+ Or = getBoundsCheckCond(SI->getPointerOperand(), SI->getValueOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
} else if (AtomicCmpXchgInst *AI = dyn_cast<AtomicCmpXchgInst>(&I)) {
- Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
- DL, TLI, ObjSizeEval, IRB, SE);
+ if (!AI->isVolatile())
+ Or =
+ getBoundsCheckCond(AI->getPointerOperand(), AI->getCompareOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
} else if (AtomicRMWInst *AI = dyn_cast<AtomicRMWInst>(&I)) {
- Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(), DL,
- TLI, ObjSizeEval, IRB, SE);
+ if (!AI->isVolatile())
+ Or = getBoundsCheckCond(AI->getPointerOperand(), AI->getValOperand(),
+ DL, TLI, ObjSizeEval, IRB, SE);
}
if (Or)
TrapInfo.push_back(std::make_pair(&I, Or));
diff --git a/llvm/lib/Transforms/Instrumentation/CFGMST.h b/llvm/lib/Transforms/Instrumentation/CFGMST.h
index 8bb6f47c4846..9addb5d1ba93 100644
--- a/llvm/lib/Transforms/Instrumentation/CFGMST.h
+++ b/llvm/lib/Transforms/Instrumentation/CFGMST.h
@@ -20,6 +20,7 @@
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
#include "llvm/Support/BranchProbability.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -28,6 +29,11 @@
#define DEBUG_TYPE "cfgmst"
+using namespace llvm;
+static cl::opt<bool> PGOInstrumentEntry(
+ "pgo-instrument-entry", cl::init(false), cl::Hidden,
+ cl::desc("Force to instrument function entry basicblock."));
+
namespace llvm {
/// An union-find based Minimum Spanning Tree for CFG
@@ -100,8 +106,11 @@ public:
const BasicBlock *Entry = &(F.getEntryBlock());
uint64_t EntryWeight = (BFI != nullptr ? BFI->getEntryFreq() : 2);
+ // If we want to instrument the entry count, lower the weight to 0.
+ if (PGOInstrumentEntry)
+ EntryWeight = 0;
Edge *EntryIncoming = nullptr, *EntryOutgoing = nullptr,
- *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
+ *ExitOutgoing = nullptr, *ExitIncoming = nullptr;
uint64_t MaxEntryOutWeight = 0, MaxExitOutWeight = 0, MaxExitInWeight = 0;
// Add a fake edge to the entry.
@@ -135,6 +144,8 @@ public:
}
if (BPI != nullptr)
Weight = BPI->getEdgeProbability(&*BB, TargetBB).scale(scaleFactor);
+ if (Weight == 0)
+ Weight++;
auto *E = &addEdge(&*BB, TargetBB, Weight);
E->IsCritical = Critical;
LLVM_DEBUG(dbgs() << " Edge: from " << BB->getName() << " to "
@@ -278,6 +289,9 @@ public:
buildEdges();
sortEdgesByWeight();
computeMinimumSpanningTree();
+ if (PGOInstrumentEntry && (AllEdges.size() > 1))
+ std::iter_swap(std::move(AllEdges.begin()),
+ std::move(AllEdges.begin() + AllEdges.size() - 1));
}
};
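
For context on why -pgo-instrument-entry drops the entry weight to 0: in the MST-based counter placement, edges that end up inside the spanning tree are left uninstrumented, and the tree is grown greedily from the heaviest edges, so a weight-0 fake entry edge is considered last and tends to stay out of the tree, i.e. it gets a counter. The toy Kruskal-style sketch below is not the CFGMST class itself; the four-edge graph and the fake node are invented for illustration:

#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

// Toy maximum-spanning-tree pass over a 3-block CFG plus a fake node (id 3),
// roughly mirroring CFGMST: sort edges by descending weight, keep an edge if
// it joins two components, and instrument every edge left outside the tree.
struct Edge { int Src, Dst; unsigned Weight; };

static int find(std::vector<int> &Parent, int X) {
  while (Parent[X] != X) {
    Parent[X] = Parent[Parent[X]]; // path halving
    X = Parent[X];
  }
  return X;
}

int main() {
  // {3,0,0} is the fake edge into the entry block; weight 0 models
  // -pgo-instrument-entry. {2,3,80} is the fake edge out of the exit block.
  std::vector<Edge> Edges = {{3, 0, 0}, {0, 1, 80}, {1, 2, 80}, {2, 3, 80}};
  std::stable_sort(Edges.begin(), Edges.end(),
                   [](const Edge &A, const Edge &B) { return A.Weight > B.Weight; });

  std::vector<int> Parent(4);
  std::iota(Parent.begin(), Parent.end(), 0);
  for (const Edge &E : Edges) {
    int A = find(Parent, E.Src), B = find(Parent, E.Dst);
    bool InMST = A != B;
    if (InMST)
      Parent[A] = B;
    printf("%d->%d weight=%u %s\n", E.Src, E.Dst, E.Weight,
           InMST ? "in MST (not instrumented)" : "instrumented");
  }
  return 0;
}
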
diff --git a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
index 358abab3cceb..0cc0d9b07387 100644
--- a/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/CGProfile.cpp
@@ -10,12 +10,13 @@
#include "llvm/ADT/MapVector.h"
#include "llvm/Analysis/BlockFrequencyInfo.h"
+#include "llvm/Analysis/LazyBlockFrequencyInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/InitializePasses.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Transforms/Instrumentation.h"
@@ -23,10 +24,32 @@
using namespace llvm;
-PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
+static bool
+addModuleFlags(Module &M,
+ MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) {
+ if (Counts.empty())
+ return false;
+
+ LLVMContext &Context = M.getContext();
+ MDBuilder MDB(Context);
+ std::vector<Metadata *> Nodes;
+
+ for (auto E : Counts) {
+ Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
+ ValueAsMetadata::get(E.first.second),
+ MDB.createConstant(ConstantInt::get(
+ Type::getInt64Ty(Context), E.second))};
+ Nodes.push_back(MDNode::get(Context, Vals));
+ }
+
+ M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
+ return true;
+}
+
+static bool runCGProfilePass(
+ Module &M, function_ref<BlockFrequencyInfo &(Function &)> GetBFI,
+ function_ref<TargetTransformInfo &(Function &)> GetTTI, bool LazyBFI) {
MapVector<std::pair<Function *, Function *>, uint64_t> Counts;
- FunctionAnalysisManager &FAM =
- MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
InstrProfSymtab Symtab;
auto UpdateCounts = [&](TargetTransformInfo &TTI, Function *F,
Function *CalledF, uint64_t NewCount) {
@@ -36,29 +59,32 @@ PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
Count = SaturatingAdd(Count, NewCount);
};
// Ignore error here. Indirect calls are ignored if this fails.
- (void)(bool)Symtab.create(M);
+ (void)(bool) Symtab.create(M);
for (auto &F : M) {
- if (F.isDeclaration())
+ // Avoid the extra cost of running passes for BFI when the function doesn't
+ // have an entry count. Since LazyBlockFrequencyInfoPass only exists in LPM, check
+ // if using LazyBlockFrequencyInfoPass.
+ // TODO: Remove LazyBFI when LazyBlockFrequencyInfoPass is available in NPM.
+ if (F.isDeclaration() || (LazyBFI && !F.getEntryCount()))
continue;
- auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
+ auto &BFI = GetBFI(F);
if (BFI.getEntryFreq() == 0)
continue;
- TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ TargetTransformInfo &TTI = GetTTI(F);
for (auto &BB : F) {
Optional<uint64_t> BBCount = BFI.getBlockProfileCount(&BB);
if (!BBCount)
continue;
for (auto &I : BB) {
- CallSite CS(&I);
- if (!CS)
+ CallBase *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
continue;
- if (CS.isIndirectCall()) {
+ if (CB->isIndirectCall()) {
InstrProfValueData ValueData[8];
uint32_t ActualNumValueData;
uint64_t TotalC;
- if (!getValueProfDataFromInst(*CS.getInstruction(),
- IPVK_IndirectCallTarget, 8, ValueData,
- ActualNumValueData, TotalC))
+ if (!getValueProfDataFromInst(*CB, IPVK_IndirectCallTarget, 8,
+ ValueData, ActualNumValueData, TotalC))
continue;
for (const auto &VD :
ArrayRef<InstrProfValueData>(ValueData, ActualNumValueData)) {
@@ -66,33 +92,61 @@ PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
}
continue;
}
- UpdateCounts(TTI, &F, CS.getCalledFunction(), *BBCount);
+ UpdateCounts(TTI, &F, CB->getCalledFunction(), *BBCount);
}
}
}
- addModuleFlags(M, Counts);
-
- return PreservedAnalyses::all();
+ return addModuleFlags(M, Counts);
}
-void CGProfilePass::addModuleFlags(
- Module &M,
- MapVector<std::pair<Function *, Function *>, uint64_t> &Counts) const {
- if (Counts.empty())
- return;
+namespace {
+struct CGProfileLegacyPass final : public ModulePass {
+ static char ID;
+ CGProfileLegacyPass() : ModulePass(ID) {
+ initializeCGProfileLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
- LLVMContext &Context = M.getContext();
- MDBuilder MDB(Context);
- std::vector<Metadata *> Nodes;
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.setPreservesCFG();
+ AU.addRequired<LazyBlockFrequencyInfoPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ }
- for (auto E : Counts) {
- Metadata *Vals[] = {ValueAsMetadata::get(E.first.first),
- ValueAsMetadata::get(E.first.second),
- MDB.createConstant(ConstantInt::get(
- Type::getInt64Ty(Context), E.second))};
- Nodes.push_back(MDNode::get(Context, Vals));
+ bool runOnModule(Module &M) override {
+ auto GetBFI = [this](Function &F) -> BlockFrequencyInfo & {
+ return this->getAnalysis<LazyBlockFrequencyInfoPass>(F).getBFI();
+ };
+ auto GetTTI = [this](Function &F) -> TargetTransformInfo & {
+ return this->getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ };
+
+ return runCGProfilePass(M, GetBFI, GetTTI, true);
}
+};
- M.addModuleFlag(Module::Append, "CG Profile", MDNode::get(Context, Nodes));
+} // namespace
+
+char CGProfileLegacyPass::ID = 0;
+
+INITIALIZE_PASS(CGProfileLegacyPass, "cg-profile", "Call Graph Profile", false,
+ false)
+
+ModulePass *llvm::createCGProfileLegacyPass() {
+ return new CGProfileLegacyPass();
+}
+
+PreservedAnalyses CGProfilePass::run(Module &M, ModuleAnalysisManager &MAM) {
+ FunctionAnalysisManager &FAM =
+ MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto GetBFI = [&FAM](Function &F) -> BlockFrequencyInfo & {
+ return FAM.getResult<BlockFrequencyAnalysis>(F);
+ };
+ auto GetTTI = [&FAM](Function &F) -> TargetTransformInfo & {
+ return FAM.getResult<TargetIRAnalysis>(F);
+ };
+
+ runCGProfilePass(M, GetBFI, GetTTI, false);
+
+ return PreservedAnalyses::all();
}
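
The refactored runCGProfilePass above accumulates a (caller, callee) -> count map from block profile counts and indirect-call value profiles, and addModuleFlags then turns each entry into a !{caller, callee, count} node appended to the "CG Profile" module flag. A small standalone sketch of the accumulation step (std::map and strings stand in for MapVector and Function pointers; the call sites and counts are invented):

#include <cstdint>
#include <cstdio>
#include <limits>
#include <map>
#include <string>
#include <utility>

// Mirrors the saturating accumulation done by UpdateCounts before the
// "CG Profile" module flag is emitted.
static uint64_t SaturatingAdd(uint64_t A, uint64_t B) {
  uint64_t R = A + B;
  return R < A ? std::numeric_limits<uint64_t>::max() : R;
}

int main() {
  std::map<std::pair<std::string, std::string>, uint64_t> Counts;
  auto UpdateCounts = [&](const std::string &F, const std::string &Callee,
                          uint64_t NewCount) {
    uint64_t &Count = Counts[{F, Callee}];
    Count = SaturatingAdd(Count, NewCount);
  };

  // Two blocks of "main" call "foo" with profile counts 10 and 5; one block
  // calls "bar" 7 times.
  UpdateCounts("main", "foo", 10);
  UpdateCounts("main", "foo", 5);
  UpdateCounts("main", "bar", 7);

  // Each entry would become one !{caller, callee, count} metadata node.
  for (const auto &E : Counts)
    printf("%s -> %s : %llu\n", E.first.first.c_str(), E.first.second.c_str(),
           (unsigned long long)E.second);
  return 0;
}
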
diff --git a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
index d35abb92dd08..a99c58b74fb1 100644
--- a/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ControlHeightReduction.cpp
@@ -221,10 +221,8 @@ class CHRScope {
"Must be siblings");
assert(getExitBlock() == Next->getEntryBlock() &&
"Must be adjacent");
- for (RegInfo &RI : Next->RegInfos)
- RegInfos.push_back(RI);
- for (CHRScope *Sub : Next->Subs)
- Subs.push_back(Sub);
+ RegInfos.append(Next->RegInfos.begin(), Next->RegInfos.end());
+ Subs.append(Next->Subs.begin(), Next->Subs.end());
}
void addSub(CHRScope *SubIn) {
@@ -246,37 +244,36 @@ class CHRScope {
assert(Boundary && "Boundary null");
assert(RegInfos.begin()->R != Boundary &&
"Can't be split at beginning");
- auto BoundaryIt = std::find_if(RegInfos.begin(), RegInfos.end(),
- [&Boundary](const RegInfo& RI) {
- return Boundary == RI.R;
- });
+ auto BoundaryIt = llvm::find_if(
+ RegInfos, [&Boundary](const RegInfo &RI) { return Boundary == RI.R; });
if (BoundaryIt == RegInfos.end())
return nullptr;
- SmallVector<RegInfo, 8> TailRegInfos;
- SmallVector<CHRScope *, 8> TailSubs;
- TailRegInfos.insert(TailRegInfos.begin(), BoundaryIt, RegInfos.end());
- RegInfos.resize(BoundaryIt - RegInfos.begin());
+ ArrayRef<RegInfo> TailRegInfos(BoundaryIt, RegInfos.end());
DenseSet<Region *> TailRegionSet;
- for (RegInfo &RI : TailRegInfos)
+ for (const RegInfo &RI : TailRegInfos)
TailRegionSet.insert(RI.R);
- for (auto It = Subs.begin(); It != Subs.end(); ) {
- CHRScope *Sub = *It;
- assert(Sub && "null Sub");
- Region *Parent = Sub->getParentRegion();
- if (TailRegionSet.count(Parent)) {
- TailSubs.push_back(Sub);
- It = Subs.erase(It);
- } else {
- assert(std::find_if(RegInfos.begin(), RegInfos.end(),
- [&Parent](const RegInfo& RI) {
- return Parent == RI.R;
- }) != RegInfos.end() &&
- "Must be in head");
- ++It;
- }
- }
+
+ auto TailIt =
+ std::stable_partition(Subs.begin(), Subs.end(), [&](CHRScope *Sub) {
+ assert(Sub && "null Sub");
+ Region *Parent = Sub->getParentRegion();
+ if (TailRegionSet.count(Parent))
+ return false;
+
+ assert(llvm::find_if(RegInfos,
+ [&Parent](const RegInfo &RI) {
+ return Parent == RI.R;
+ }) != RegInfos.end() &&
+ "Must be in head");
+ return true;
+ });
+ ArrayRef<CHRScope *> TailSubs(TailIt, Subs.end());
+
assert(HoistStopMap.empty() && "MapHoistStops must be empty");
- return new CHRScope(TailRegInfos, TailSubs);
+ auto *Scope = new CHRScope(TailRegInfos, TailSubs);
+ RegInfos.erase(BoundaryIt, RegInfos.end());
+ Subs.erase(TailIt, Subs.end());
+ return Scope;
}
bool contains(Instruction *I) const {
@@ -314,9 +311,9 @@ class CHRScope {
HoistStopMapTy HoistStopMap;
private:
- CHRScope(SmallVector<RegInfo, 8> &RegInfosIn,
- SmallVector<CHRScope *, 8> &SubsIn)
- : RegInfos(RegInfosIn), Subs(SubsIn), BranchInsertPoint(nullptr) {}
+ CHRScope(ArrayRef<RegInfo> RegInfosIn, ArrayRef<CHRScope *> SubsIn)
+ : RegInfos(RegInfosIn.begin(), RegInfosIn.end()),
+ Subs(SubsIn.begin(), SubsIn.end()), BranchInsertPoint(nullptr) {}
};
class CHR {
@@ -340,8 +337,7 @@ class CHR {
void findScopes(SmallVectorImpl<CHRScope *> &Output) {
Region *R = RI.getTopLevelRegion();
- CHRScope *Scope = findScopes(R, nullptr, nullptr, Output);
- if (Scope) {
+ if (CHRScope *Scope = findScopes(R, nullptr, nullptr, Output)) {
Output.push_back(Scope);
}
}
@@ -514,39 +510,36 @@ static bool isHoistable(Instruction *I, DominatorTree &DT) {
// first-region entry block) or the (hoistable or unhoistable) base values that
// are defined outside (including the first-region entry block) of the
// scope. The returned set doesn't include constants.
-static std::set<Value *> getBaseValues(
- Value *V, DominatorTree &DT,
- DenseMap<Value *, std::set<Value *>> &Visited) {
- if (Visited.count(V)) {
- return Visited[V];
+static const std::set<Value *> &
+getBaseValues(Value *V, DominatorTree &DT,
+ DenseMap<Value *, std::set<Value *>> &Visited) {
+ auto It = Visited.find(V);
+ if (It != Visited.end()) {
+ return It->second;
}
std::set<Value *> Result;
if (auto *I = dyn_cast<Instruction>(V)) {
- // We don't stop at a block that's not in the Scope because we would miss some
- // instructions that are based on the same base values if we stop there.
+ // We don't stop at a block that's not in the Scope because we would miss
+ // some instructions that are based on the same base values if we stop
+ // there.
if (!isHoistable(I, DT)) {
Result.insert(I);
- Visited.insert(std::make_pair(V, Result));
- return Result;
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
}
// I is hoistable above the Scope.
for (Value *Op : I->operands()) {
- std::set<Value *> OpResult = getBaseValues(Op, DT, Visited);
+ const std::set<Value *> &OpResult = getBaseValues(Op, DT, Visited);
Result.insert(OpResult.begin(), OpResult.end());
}
- Visited.insert(std::make_pair(V, Result));
- return Result;
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
}
if (isa<Argument>(V)) {
Result.insert(V);
- Visited.insert(std::make_pair(V, Result));
- return Result;
}
// We don't include others like constants because those won't lead to any
// chance of folding of conditions (eg two bit checks merged into one check)
// after CHR.
- Visited.insert(std::make_pair(V, Result));
- return Result; // empty
+ return Visited.insert(std::make_pair(V, std::move(Result))).first->second;
}
// Return true if V is already hoisted or can be hoisted (along with its
@@ -560,8 +553,9 @@ checkHoistValue(Value *V, Instruction *InsertPoint, DominatorTree &DT,
DenseMap<Instruction *, bool> &Visited) {
assert(InsertPoint && "Null InsertPoint");
if (auto *I = dyn_cast<Instruction>(V)) {
- if (Visited.count(I)) {
- return Visited[I];
+ auto It = Visited.find(I);
+ if (It != Visited.end()) {
+ return It->second;
}
assert(DT.getNode(I->getParent()) && "DT must contain I's parent block");
assert(DT.getNode(InsertPoint->getParent()) && "DT must contain Destination");
@@ -1094,11 +1088,11 @@ static bool shouldSplit(Instruction *InsertPoint,
std::set<Value *> PrevBases, Bases;
DenseMap<Value *, std::set<Value *>> Visited;
for (Value *V : PrevConditionValues) {
- std::set<Value *> BaseValues = getBaseValues(V, DT, Visited);
+ const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
PrevBases.insert(BaseValues.begin(), BaseValues.end());
}
for (Value *V : ConditionValues) {
- std::set<Value *> BaseValues = getBaseValues(V, DT, Visited);
+ const std::set<Value *> &BaseValues = getBaseValues(V, DT, Visited);
Bases.insert(BaseValues.begin(), BaseValues.end());
}
CHR_DEBUG(
@@ -1111,10 +1105,9 @@ static bool shouldSplit(Instruction *InsertPoint,
dbgs() << *V << ", ";
}
dbgs() << "\n");
- std::set<Value *> Intersection;
- std::set_intersection(PrevBases.begin(), PrevBases.end(),
- Bases.begin(), Bases.end(),
- std::inserter(Intersection, Intersection.begin()));
+ std::vector<Value *> Intersection;
+ std::set_intersection(PrevBases.begin(), PrevBases.end(), Bases.begin(),
+ Bases.end(), std::back_inserter(Intersection));
if (Intersection.empty()) {
// Empty intersection, split.
CHR_DEBUG(dbgs() << "Split. Intersection empty\n");
@@ -1439,7 +1432,7 @@ void CHR::setCHRRegions(CHRScope *Scope, CHRScope *OutermostScope) {
setCHRRegions(Sub, OutermostScope);
}
-bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
+static bool CHRScopeSorter(CHRScope *Scope1, CHRScope *Scope2) {
return Scope1->RegInfos[0].R->getDepth() < Scope2->RegInfos[0].R->getDepth();
}
@@ -1578,26 +1571,24 @@ static bool negateICmpIfUsedByBranchOrSelectOnly(ICmpInst *ICmp,
static void insertTrivialPHIs(CHRScope *Scope,
BasicBlock *EntryBlock, BasicBlock *ExitBlock,
DenseSet<PHINode *> &TrivialPHIs) {
- DenseSet<BasicBlock *> BlocksInScopeSet;
- SmallVector<BasicBlock *, 8> BlocksInScopeVec;
+ SmallSetVector<BasicBlock *, 8> BlocksInScope;
for (RegInfo &RI : Scope->RegInfos) {
for (BasicBlock *BB : RI.R->blocks()) { // This includes the blocks in the
// sub-Scopes.
- BlocksInScopeSet.insert(BB);
- BlocksInScopeVec.push_back(BB);
+ BlocksInScope.insert(BB);
}
}
- CHR_DEBUG(
- dbgs() << "Inserting redudant phis\n";
- for (BasicBlock *BB : BlocksInScopeVec) {
- dbgs() << "BlockInScope " << BB->getName() << "\n";
- });
- for (BasicBlock *BB : BlocksInScopeVec) {
+ CHR_DEBUG({
+ dbgs() << "Inserting redundant phis\n";
+ for (BasicBlock *BB : BlocksInScope)
+ dbgs() << "BlockInScope " << BB->getName() << "\n";
+ });
+ for (BasicBlock *BB : BlocksInScope) {
for (Instruction &I : *BB) {
SmallVector<Instruction *, 8> Users;
for (User *U : I.users()) {
if (auto *UI = dyn_cast<Instruction>(U)) {
- if (BlocksInScopeSet.count(UI->getParent()) == 0 &&
+ if (BlocksInScope.count(UI->getParent()) == 0 &&
// Unless there's already a phi for I at the exit block.
!(isa<PHINode>(UI) && UI->getParent() == ExitBlock)) {
CHR_DEBUG(dbgs() << "V " << I << "\n");
@@ -1874,9 +1865,10 @@ void CHR::fixupBranchesAndSelects(CHRScope *Scope,
<< " branches or selects";
});
MergedBR->setCondition(MergedCondition);
- SmallVector<uint32_t, 2> Weights;
- Weights.push_back(static_cast<uint32_t>(CHRBranchBias.scale(1000)));
- Weights.push_back(static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)));
+ uint32_t Weights[] = {
+ static_cast<uint32_t>(CHRBranchBias.scale(1000)),
+ static_cast<uint32_t>(CHRBranchBias.getCompl().scale(1000)),
+ };
MDBuilder MDB(F.getContext());
MergedBR->setMetadata(LLVMContext::MD_prof, MDB.createBranchWeights(Weights));
CHR_DEBUG(dbgs() << "CHR branch bias " << Weights[0] << ":" << Weights[1]
@@ -2101,8 +2093,7 @@ PreservedAnalyses ControlHeightReductionPass::run(
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
auto &DT = FAM.getResult<DominatorTreeAnalysis>(F);
auto &MAMProxy = FAM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
- auto &MAM = MAMProxy.getManager();
- auto &PSI = *MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto &PSI = *MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto &RI = FAM.getResult<RegionInfoAnalysis>(F);
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
bool Changed = CHR(F, BFI, DT, PSI, RI, ORE).run();
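
The getBaseValues change above is purely a memoization tweak: instead of copying the std::set at every level of recursion, the computed set is moved into the Visited cache and a const reference to the cached copy is returned. A minimal sketch of that pattern outside of LLVM (std::unordered_map and strings stand in for DenseMap and Value pointers, and the "every value is its own base" rule is a placeholder for the real operand walk):

#include <cstdio>
#include <set>
#include <string>
#include <unordered_map>
#include <utility>

// Compute a set once, move it into the cache, and hand back a reference to
// the cached copy so callers never copy the whole set.
static const std::set<std::string> &
baseValues(const std::string &V,
           std::unordered_map<std::string, std::set<std::string>> &Visited) {
  auto It = Visited.find(V);
  if (It != Visited.end())
    return It->second;

  std::set<std::string> Result;
  Result.insert(V); // placeholder: the real code recurses over operands here
  return Visited.insert({V, std::move(Result)}).first->second;
}

int main() {
  std::unordered_map<std::string, std::set<std::string>> Visited;
  const std::set<std::string> &A = baseValues("x", Visited);
  const std::set<std::string> &B = baseValues("x", Visited); // cache hit
  printf("same cached set: %s\n", (&A == &B) ? "yes" : "no");
  return 0;
}
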
diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
index cf9a6a321c7a..284631900731 100644
--- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp
@@ -59,7 +59,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -162,10 +161,25 @@ static cl::opt<bool> ClDebugNonzeroLabels(
"load or return with a nonzero label"),
cl::Hidden);
+// Experimental feature that inserts callbacks for certain data events.
+// Currently callbacks are only inserted for loads, stores, memory transfers
+// (i.e. memcpy and memmove), and comparisons.
+//
+// If this flag is set to true, the user must provide definitions for the
+// following callback functions:
+// void __dfsan_load_callback(dfsan_label Label);
+// void __dfsan_store_callback(dfsan_label Label);
+// void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len);
+// void __dfsan_cmp_callback(dfsan_label CombinedLabel);
+static cl::opt<bool> ClEventCallbacks(
+ "dfsan-event-callbacks",
+ cl::desc("Insert calls to __dfsan_*_callback functions on data events."),
+ cl::Hidden, cl::init(false));
+
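
If -dfsan-event-callbacks is enabled, the four functions named in the comment above must be supplied by the user and linked into the instrumented program. A hypothetical set of definitions is sketched below; the dfsan_label typedef is assumed to be the 16-bit label type (matching ShadowWidthBits), real code would normally get it from <sanitizer/dfsan_interface.h>, and the Len argument of the mem-transfer callback is taken to be the byte length passed to the original memcpy/memmove:

#include <cstddef>
#include <cstdint>
#include <cstdio>

// Assumed label type; DFSan's own header provides the real definition.
typedef uint16_t dfsan_label;

extern "C" void __dfsan_load_callback(dfsan_label Label) {
  if (Label)
    fprintf(stderr, "tainted load, label=%u\n", Label);
}

extern "C" void __dfsan_store_callback(dfsan_label Label) {
  if (Label)
    fprintf(stderr, "tainted store, label=%u\n", Label);
}

extern "C" void __dfsan_mem_transfer_callback(dfsan_label *Start, size_t Len) {
  // Start points at the destination's shadow; Len is the transfer length.
  fprintf(stderr, "memcpy/memmove of %zu bytes, shadow at %p\n", Len,
          (void *)Start);
}

extern "C" void __dfsan_cmp_callback(dfsan_label CombinedLabel) {
  if (CombinedLabel)
    fprintf(stderr, "comparison involving label %u\n", CombinedLabel);
}
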
static StringRef GetGlobalTypeString(const GlobalValue &G) {
// Types of GlobalVariables are always pointer types.
Type *GType = G.getValueType();
- // For now we support blacklisting struct types only.
+ // For now we support excluding struct types only.
if (StructType *SGType = dyn_cast<StructType>(GType)) {
if (!SGType->isLiteral())
return SGType->getName();
@@ -282,9 +296,7 @@ class DataFlowSanitizer : public ModulePass {
friend struct DFSanFunction;
friend class DFSanVisitor;
- enum {
- ShadowWidth = 16
- };
+ enum { ShadowWidthBits = 16, ShadowWidthBytes = ShadowWidthBits / 8 };
/// Which ABI should be used for instrumented functions?
enum InstrumentedABI {
@@ -345,6 +357,8 @@ class DataFlowSanitizer : public ModulePass {
FunctionType *DFSanSetLabelFnTy;
FunctionType *DFSanNonzeroLabelFnTy;
FunctionType *DFSanVarargWrapperFnTy;
+ FunctionType *DFSanLoadStoreCmpCallbackFnTy;
+ FunctionType *DFSanMemTransferCallbackFnTy;
FunctionCallee DFSanUnionFn;
FunctionCallee DFSanCheckedUnionFn;
FunctionCallee DFSanUnionLoadFn;
@@ -352,6 +366,10 @@ class DataFlowSanitizer : public ModulePass {
FunctionCallee DFSanSetLabelFn;
FunctionCallee DFSanNonzeroLabelFn;
FunctionCallee DFSanVarargWrapperFn;
+ FunctionCallee DFSanLoadCallbackFn;
+ FunctionCallee DFSanStoreCallbackFn;
+ FunctionCallee DFSanMemTransferCallbackFn;
+ FunctionCallee DFSanCmpCallbackFn;
MDNode *ColdCallWeights;
DFSanABIList ABIList;
DenseMap<Value *, Function *> UnwrappedFnMap;
@@ -371,6 +389,8 @@ class DataFlowSanitizer : public ModulePass {
GlobalValue::LinkageTypes NewFLink,
FunctionType *NewFT);
Constant *getOrBuildTrampolineFunction(FunctionType *FT, StringRef FName);
+ void initializeCallbackFunctions(Module &M);
+ void initializeRuntimeFunctions(Module &M);
public:
static char ID;
@@ -424,7 +444,7 @@ struct DFSanFunction {
Value *combineOperandShadows(Instruction *Inst);
Value *loadShadow(Value *ShadowAddr, uint64_t Size, uint64_t Align,
Instruction *Pos);
- void storeShadow(Value *Addr, uint64_t Size, uint64_t Align, Value *Shadow,
+ void storeShadow(Value *Addr, uint64_t Size, Align Alignment, Value *Shadow,
Instruction *Pos);
};
@@ -438,7 +458,10 @@ public:
return DFSF.F->getParent()->getDataLayout();
}
- void visitOperandShadowInst(Instruction &I);
+ // Combines shadow values for all of I's operands. Returns the combined shadow
+ // value.
+ Value *visitOperandShadowInst(Instruction &I);
+
void visitUnaryOperator(UnaryOperator &UO);
void visitBinaryOperator(BinaryOperator &BO);
void visitCastInst(CastInst &CI);
@@ -447,7 +470,7 @@ public:
void visitLoadInst(LoadInst &LI);
void visitStoreInst(StoreInst &SI);
void visitReturnInst(ReturnInst &RI);
- void visitCallSite(CallSite CS);
+ void visitCallBase(CallBase &CB);
void visitPHINode(PHINode &PN);
void visitExtractElementInst(ExtractElementInst &I);
void visitInsertElementInst(InsertElementInst &I);
@@ -553,11 +576,11 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
Mod = &M;
Ctx = &M.getContext();
- ShadowTy = IntegerType::get(*Ctx, ShadowWidth);
+ ShadowTy = IntegerType::get(*Ctx, ShadowWidthBits);
ShadowPtrTy = PointerType::getUnqual(ShadowTy);
IntptrTy = DL.getIntPtrType(*Ctx);
ZeroShadow = ConstantInt::getSigned(ShadowTy, 0);
- ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidth / 8);
+ ShadowPtrMul = ConstantInt::getSigned(IntptrTy, ShadowWidthBytes);
if (IsX86_64)
ShadowPtrMask = ConstantInt::getSigned(IntptrTy, ~0x700000000000LL);
else if (IsMIPS64)
@@ -583,6 +606,12 @@ bool DataFlowSanitizer::doInitialization(Module &M) {
Type::getVoidTy(*Ctx), None, /*isVarArg=*/false);
DFSanVarargWrapperFnTy = FunctionType::get(
Type::getVoidTy(*Ctx), Type::getInt8PtrTy(*Ctx), /*isVarArg=*/false);
+ DFSanLoadStoreCmpCallbackFnTy =
+ FunctionType::get(Type::getVoidTy(*Ctx), ShadowTy, /*isVarArg=*/false);
+ Type *DFSanMemTransferCallbackArgs[2] = {ShadowPtrTy, IntptrTy};
+ DFSanMemTransferCallbackFnTy =
+ FunctionType::get(Type::getVoidTy(*Ctx), DFSanMemTransferCallbackArgs,
+ /*isVarArg=*/false);
if (GetArgTLSPtr) {
Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
@@ -628,7 +657,7 @@ DataFlowSanitizer::WrapperKind DataFlowSanitizer::getWrapperKind(Function *F) {
}
void DataFlowSanitizer::addGlobalNamePrefix(GlobalValue *GV) {
- std::string GVName = GV->getName(), Prefix = "dfs$";
+ std::string GVName = std::string(GV->getName()), Prefix = "dfs$";
GV->setName(Prefix + GVName);
// Try to change the name of the function in module inline asm. We only do
@@ -713,25 +742,8 @@ Constant *DataFlowSanitizer::getOrBuildTrampolineFunction(FunctionType *FT,
return cast<Constant>(C.getCallee());
}
-bool DataFlowSanitizer::runOnModule(Module &M) {
- if (ABIList.isIn(M, "skip"))
- return false;
-
- if (!GetArgTLSPtr) {
- Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
- ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
- if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS))
- G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
- }
- if (!GetRetvalTLSPtr) {
- RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
- if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS))
- G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
- }
-
- ExternalShadowMask =
- Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
-
+// Initialize DataFlowSanitizer runtime functions and declare them in the module
+void DataFlowSanitizer::initializeRuntimeFunctions(Module &M) {
{
AttributeList AL;
AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
@@ -745,7 +757,6 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
DFSanUnionFn =
Mod->getOrInsertFunction("__dfsan_union", DFSanUnionFnTy, AL);
}
-
{
AttributeList AL;
AL = AL.addAttribute(M.getContext(), AttributeList::FunctionIndex,
@@ -782,6 +793,50 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
Mod->getOrInsertFunction("__dfsan_nonzero_label", DFSanNonzeroLabelFnTy);
DFSanVarargWrapperFn = Mod->getOrInsertFunction("__dfsan_vararg_wrapper",
DFSanVarargWrapperFnTy);
+}
+
+// Initializes the event callback functions and declares them in the module.
+void DataFlowSanitizer::initializeCallbackFunctions(Module &M) {
+ DFSanLoadCallbackFn = Mod->getOrInsertFunction("__dfsan_load_callback",
+ DFSanLoadStoreCmpCallbackFnTy);
+ DFSanStoreCallbackFn = Mod->getOrInsertFunction(
+ "__dfsan_store_callback", DFSanLoadStoreCmpCallbackFnTy);
+ DFSanMemTransferCallbackFn = Mod->getOrInsertFunction(
+ "__dfsan_mem_transfer_callback", DFSanMemTransferCallbackFnTy);
+ DFSanCmpCallbackFn = Mod->getOrInsertFunction("__dfsan_cmp_callback",
+ DFSanLoadStoreCmpCallbackFnTy);
+}
+
+bool DataFlowSanitizer::runOnModule(Module &M) {
+ if (ABIList.isIn(M, "skip"))
+ return false;
+
+ const unsigned InitialGlobalSize = M.global_size();
+ const unsigned InitialModuleSize = M.size();
+
+ bool Changed = false;
+
+ if (!GetArgTLSPtr) {
+ Type *ArgTLSTy = ArrayType::get(ShadowTy, 64);
+ ArgTLS = Mod->getOrInsertGlobal("__dfsan_arg_tls", ArgTLSTy);
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(ArgTLS)) {
+ Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
+ G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+ }
+ }
+ if (!GetRetvalTLSPtr) {
+ RetvalTLS = Mod->getOrInsertGlobal("__dfsan_retval_tls", ShadowTy);
+ if (GlobalVariable *G = dyn_cast<GlobalVariable>(RetvalTLS)) {
+ Changed |= G->getThreadLocalMode() != GlobalVariable::InitialExecTLSModel;
+ G->setThreadLocalMode(GlobalVariable::InitialExecTLSModel);
+ }
+ }
+
+ ExternalShadowMask =
+ Mod->getOrInsertGlobal(kDFSanExternShadowPtrMask, IntptrTy);
+
+ initializeCallbackFunctions(M);
+ initializeRuntimeFunctions(M);
std::vector<Function *> FnsToInstrument;
SmallPtrSet<Function *, 2> FnsWithNativeABI;
@@ -793,7 +848,11 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
&i != DFSanUnimplementedFn.getCallee()->stripPointerCasts() &&
&i != DFSanSetLabelFn.getCallee()->stripPointerCasts() &&
&i != DFSanNonzeroLabelFn.getCallee()->stripPointerCasts() &&
- &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts())
+ &i != DFSanVarargWrapperFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanLoadCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanStoreCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanMemTransferCallbackFn.getCallee()->stripPointerCasts() &&
+ &i != DFSanCmpCallbackFn.getCallee()->stripPointerCasts())
FnsToInstrument.push_back(&i);
}
@@ -994,7 +1053,8 @@ bool DataFlowSanitizer::runOnModule(Module &M) {
}
}
- return false;
+ return Changed || !FnsToInstrument.empty() ||
+ M.global_size() != InitialGlobalSize || M.size() != InitialModuleSize;
}
Value *DFSanFunction::getArgTLSPtr() {
@@ -1177,9 +1237,10 @@ Value *DFSanFunction::combineOperandShadows(Instruction *Inst) {
return Shadow;
}
-void DFSanVisitor::visitOperandShadowInst(Instruction &I) {
+Value *DFSanVisitor::visitOperandShadowInst(Instruction &I) {
Value *CombinedShadow = DFSF.combineOperandShadows(&I);
DFSF.setShadow(&I, CombinedShadow);
+ return CombinedShadow;
}
// Generates IR to load shadow corresponding to bytes [Addr, Addr+Size), where
@@ -1194,7 +1255,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
}
}
- uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+ const llvm::Align ShadowAlign(Align * DFS.ShadowWidthBytes);
SmallVector<const Value *, 2> Objs;
GetUnderlyingObjects(Addr, Objs, Pos->getModule()->getDataLayout());
bool AllConstants = true;
@@ -1216,7 +1277,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
return DFS.ZeroShadow;
case 1: {
LoadInst *LI = new LoadInst(DFS.ShadowTy, ShadowAddr, "", Pos);
- LI->setAlignment(MaybeAlign(ShadowAlign));
+ LI->setAlignment(ShadowAlign);
return LI;
}
case 2: {
@@ -1228,7 +1289,7 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
IRB.CreateAlignedLoad(DFS.ShadowTy, ShadowAddr1, ShadowAlign), Pos);
}
}
- if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidth) == 0) {
+ if (!AvoidNewBlocks && Size % (64 / DFS.ShadowWidthBits) == 0) {
// Fast path for the common case where each byte has identical shadow: load
// shadow 64 bits at a time, fall out to a __dfsan_union_load call if any
// shadow is non-equal.
@@ -1240,15 +1301,15 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
FallbackCall->addAttribute(AttributeList::ReturnIndex, Attribute::ZExt);
// Compare each of the shadows stored in the loaded 64 bits to each other,
- // by computing (WideShadow rotl ShadowWidth) == WideShadow.
+ // by computing (WideShadow rotl ShadowWidthBits) == WideShadow.
IRBuilder<> IRB(Pos);
Value *WideAddr =
IRB.CreateBitCast(ShadowAddr, Type::getInt64PtrTy(*DFS.Ctx));
Value *WideShadow =
IRB.CreateAlignedLoad(IRB.getInt64Ty(), WideAddr, ShadowAlign);
Value *TruncShadow = IRB.CreateTrunc(WideShadow, DFS.ShadowTy);
- Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidth);
- Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidth);
+ Value *ShlShadow = IRB.CreateShl(WideShadow, DFS.ShadowWidthBits);
+ Value *ShrShadow = IRB.CreateLShr(WideShadow, 64 - DFS.ShadowWidthBits);
Value *RotShadow = IRB.CreateOr(ShlShadow, ShrShadow);
Value *ShadowsEq = IRB.CreateICmpEQ(WideShadow, RotShadow);
@@ -1271,8 +1332,8 @@ Value *DFSanFunction::loadShadow(Value *Addr, uint64_t Size, uint64_t Align,
ReplaceInstWithInst(Head->getTerminator(), LastBr);
DT.addNewBlock(FallbackBB, Head);
- for (uint64_t Ofs = 64 / DFS.ShadowWidth; Ofs != Size;
- Ofs += 64 / DFS.ShadowWidth) {
+ for (uint64_t Ofs = 64 / DFS.ShadowWidthBits; Ofs != Size;
+ Ofs += 64 / DFS.ShadowWidthBits) {
BasicBlock *NextBB = BasicBlock::Create(*DFS.Ctx, "", F);
DT.addNewBlock(NextBB, LastBr->getParent());
IRBuilder<> NextIRB(NextBB);
@@ -1308,16 +1369,9 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) {
return;
}
- uint64_t Align;
- if (ClPreserveAlignment) {
- Align = LI.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(LI.getType());
- } else {
- Align = 1;
- }
- IRBuilder<> IRB(&LI);
- Value *Shadow = DFSF.loadShadow(LI.getPointerOperand(), Size, Align, &LI);
+ Align Alignment = ClPreserveAlignment ? LI.getAlign() : Align(1);
+ Value *Shadow =
+ DFSF.loadShadow(LI.getPointerOperand(), Size, Alignment.value(), &LI);
if (ClCombinePointerLabelsOnLoad) {
Value *PtrShadow = DFSF.getShadow(LI.getPointerOperand());
Shadow = DFSF.combineShadows(Shadow, PtrShadow, &LI);
@@ -1326,9 +1380,13 @@ void DFSanVisitor::visitLoadInst(LoadInst &LI) {
DFSF.NonZeroChecks.push_back(Shadow);
DFSF.setShadow(&LI, Shadow);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&LI);
+ IRB.CreateCall(DFSF.DFS.DFSanLoadCallbackFn, Shadow);
+ }
}
-void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
+void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, Align Alignment,
Value *Shadow, Instruction *Pos) {
if (AllocaInst *AI = dyn_cast<AllocaInst>(Addr)) {
const auto i = AllocaShadowMap.find(AI);
@@ -1339,11 +1397,12 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
}
}
- uint64_t ShadowAlign = Align * DFS.ShadowWidth / 8;
+ const Align ShadowAlign(Alignment.value() * DFS.ShadowWidthBytes);
IRBuilder<> IRB(Pos);
Value *ShadowAddr = DFS.getShadowAddress(Addr, Pos);
if (Shadow == DFS.ZeroShadow) {
- IntegerType *ShadowTy = IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidth);
+ IntegerType *ShadowTy =
+ IntegerType::get(*DFS.Ctx, Size * DFS.ShadowWidthBits);
Value *ExtZeroShadow = ConstantInt::get(ShadowTy, 0);
Value *ExtShadowAddr =
IRB.CreateBitCast(ShadowAddr, PointerType::getUnqual(ShadowTy));
@@ -1351,10 +1410,10 @@ void DFSanFunction::storeShadow(Value *Addr, uint64_t Size, uint64_t Align,
return;
}
- const unsigned ShadowVecSize = 128 / DFS.ShadowWidth;
+ const unsigned ShadowVecSize = 128 / DFS.ShadowWidthBits;
uint64_t Offset = 0;
if (Size >= ShadowVecSize) {
- VectorType *ShadowVecTy = VectorType::get(DFS.ShadowTy, ShadowVecSize);
+ auto *ShadowVecTy = FixedVectorType::get(DFS.ShadowTy, ShadowVecSize);
Value *ShadowVec = UndefValue::get(ShadowVecTy);
for (unsigned i = 0; i != ShadowVecSize; ++i) {
ShadowVec = IRB.CreateInsertElement(
@@ -1386,21 +1445,18 @@ void DFSanVisitor::visitStoreInst(StoreInst &SI) {
if (Size == 0)
return;
- uint64_t Align;
- if (ClPreserveAlignment) {
- Align = SI.getAlignment();
- if (Align == 0)
- Align = DL.getABITypeAlignment(SI.getValueOperand()->getType());
- } else {
- Align = 1;
- }
+ const Align Alignment = ClPreserveAlignment ? SI.getAlign() : Align(1);
Value* Shadow = DFSF.getShadow(SI.getValueOperand());
if (ClCombinePointerLabelsOnStore) {
Value *PtrShadow = DFSF.getShadow(SI.getPointerOperand());
Shadow = DFSF.combineShadows(Shadow, PtrShadow, &SI);
}
- DFSF.storeShadow(SI.getPointerOperand(), Size, Align, Shadow, &SI);
+ DFSF.storeShadow(SI.getPointerOperand(), Size, Alignment, Shadow, &SI);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&SI);
+ IRB.CreateCall(DFSF.DFS.DFSanStoreCallbackFn, Shadow);
+ }
}
void DFSanVisitor::visitUnaryOperator(UnaryOperator &UO) {
@@ -1413,7 +1469,13 @@ void DFSanVisitor::visitBinaryOperator(BinaryOperator &BO) {
void DFSanVisitor::visitCastInst(CastInst &CI) { visitOperandShadowInst(CI); }
-void DFSanVisitor::visitCmpInst(CmpInst &CI) { visitOperandShadowInst(CI); }
+void DFSanVisitor::visitCmpInst(CmpInst &CI) {
+ Value *CombinedShadow = visitOperandShadowInst(CI);
+ if (ClEventCallbacks) {
+ IRBuilder<> IRB(&CI);
+ IRB.CreateCall(DFSF.DFS.DFSanCmpCallbackFn, CombinedShadow);
+ }
+}
void DFSanVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
visitOperandShadowInst(GEPI);
@@ -1493,23 +1555,27 @@ void DFSanVisitor::visitMemSetInst(MemSetInst &I) {
void DFSanVisitor::visitMemTransferInst(MemTransferInst &I) {
IRBuilder<> IRB(&I);
- Value *DestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
+ Value *RawDestShadow = DFSF.DFS.getShadowAddress(I.getDest(), &I);
Value *SrcShadow = DFSF.DFS.getShadowAddress(I.getSource(), &I);
- Value *LenShadow = IRB.CreateMul(
- I.getLength(),
- ConstantInt::get(I.getLength()->getType(), DFSF.DFS.ShadowWidth / 8));
+ Value *LenShadow =
+ IRB.CreateMul(I.getLength(), ConstantInt::get(I.getLength()->getType(),
+ DFSF.DFS.ShadowWidthBytes));
Type *Int8Ptr = Type::getInt8PtrTy(*DFSF.DFS.Ctx);
- DestShadow = IRB.CreateBitCast(DestShadow, Int8Ptr);
+ Value *DestShadow = IRB.CreateBitCast(RawDestShadow, Int8Ptr);
SrcShadow = IRB.CreateBitCast(SrcShadow, Int8Ptr);
auto *MTI = cast<MemTransferInst>(
- IRB.CreateCall(I.getFunctionType(), I.getCalledValue(),
+ IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
{DestShadow, SrcShadow, LenShadow, I.getVolatileCst()}));
if (ClPreserveAlignment) {
- MTI->setDestAlignment(I.getDestAlignment() * (DFSF.DFS.ShadowWidth / 8));
- MTI->setSourceAlignment(I.getSourceAlignment() * (DFSF.DFS.ShadowWidth / 8));
+ MTI->setDestAlignment(I.getDestAlign() * DFSF.DFS.ShadowWidthBytes);
+ MTI->setSourceAlignment(I.getSourceAlign() * DFSF.DFS.ShadowWidthBytes);
} else {
- MTI->setDestAlignment(DFSF.DFS.ShadowWidth / 8);
- MTI->setSourceAlignment(DFSF.DFS.ShadowWidth / 8);
+ MTI->setDestAlignment(Align(DFSF.DFS.ShadowWidthBytes));
+ MTI->setSourceAlignment(Align(DFSF.DFS.ShadowWidthBytes));
+ }
+ if (ClEventCallbacks) {
+ IRB.CreateCall(DFSF.DFS.DFSanMemTransferCallbackFn,
+ {RawDestShadow, I.getLength()});
}
}
@@ -1536,10 +1602,10 @@ void DFSanVisitor::visitReturnInst(ReturnInst &RI) {
}
}
-void DFSanVisitor::visitCallSite(CallSite CS) {
- Function *F = CS.getCalledFunction();
- if ((F && F->isIntrinsic()) || isa<InlineAsm>(CS.getCalledValue())) {
- visitOperandShadowInst(*CS.getInstruction());
+void DFSanVisitor::visitCallBase(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+ if ((F && F->isIntrinsic()) || CB.isInlineAsm()) {
+ visitOperandShadowInst(CB);
return;
}
@@ -1548,32 +1614,32 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
if (F == DFSF.DFS.DFSanVarargWrapperFn.getCallee()->stripPointerCasts())
return;
- IRBuilder<> IRB(CS.getInstruction());
+ IRBuilder<> IRB(&CB);
DenseMap<Value *, Function *>::iterator i =
- DFSF.DFS.UnwrappedFnMap.find(CS.getCalledValue());
+ DFSF.DFS.UnwrappedFnMap.find(CB.getCalledOperand());
if (i != DFSF.DFS.UnwrappedFnMap.end()) {
Function *F = i->second;
switch (DFSF.DFS.getWrapperKind(F)) {
case DataFlowSanitizer::WK_Warning:
- CS.setCalledFunction(F);
+ CB.setCalledFunction(F);
IRB.CreateCall(DFSF.DFS.DFSanUnimplementedFn,
IRB.CreateGlobalStringPtr(F->getName()));
- DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+ DFSF.setShadow(&CB, DFSF.DFS.ZeroShadow);
return;
case DataFlowSanitizer::WK_Discard:
- CS.setCalledFunction(F);
- DFSF.setShadow(CS.getInstruction(), DFSF.DFS.ZeroShadow);
+ CB.setCalledFunction(F);
+ DFSF.setShadow(&CB, DFSF.DFS.ZeroShadow);
return;
case DataFlowSanitizer::WK_Functional:
- CS.setCalledFunction(F);
- visitOperandShadowInst(*CS.getInstruction());
+ CB.setCalledFunction(F);
+ visitOperandShadowInst(CB);
return;
case DataFlowSanitizer::WK_Custom:
// Don't try to handle invokes of custom functions, it's too complicated.
// Instead, invoke the dfsw$ wrapper, which will in turn call the __dfsw_
// wrapper.
- if (CallInst *CI = dyn_cast<CallInst>(CS.getInstruction())) {
+ if (CallInst *CI = dyn_cast<CallInst>(&CB)) {
FunctionType *FT = F->getFunctionType();
TransformedFunction CustomFn = DFSF.DFS.getCustomFunctionType(FT);
std::string CustomFName = "__dfsw_";
@@ -1592,7 +1658,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
std::vector<Value *> Args;
- CallSite::arg_iterator i = CS.arg_begin();
+ auto i = CB.arg_begin();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n) {
Type *T = (*i)->getType();
FunctionType *ParamFT;
@@ -1612,19 +1678,19 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
}
}
- i = CS.arg_begin();
+ i = CB.arg_begin();
const unsigned ShadowArgStart = Args.size();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(DFSF.getShadow(*i));
if (FT->isVarArg()) {
auto *LabelVATy = ArrayType::get(DFSF.DFS.ShadowTy,
- CS.arg_size() - FT->getNumParams());
+ CB.arg_size() - FT->getNumParams());
auto *LabelVAAlloca = new AllocaInst(
LabelVATy, getDataLayout().getAllocaAddrSpace(),
"labelva", &DFSF.F->getEntryBlock().front());
- for (unsigned n = 0; i != CS.arg_end(); ++i, ++n) {
+ for (unsigned n = 0; i != CB.arg_end(); ++i, ++n) {
auto LabelVAPtr = IRB.CreateStructGEP(LabelVATy, LabelVAAlloca, n);
IRB.CreateStore(DFSF.getShadow(*i), LabelVAPtr);
}
@@ -1642,7 +1708,7 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
Args.push_back(DFSF.LabelReturnAlloca);
}
- for (i = CS.arg_begin() + FT->getNumParams(); i != CS.arg_end(); ++i)
+ for (i = CB.arg_begin() + FT->getNumParams(); i != CB.arg_end(); ++i)
Args.push_back(*i);
CallInst *CustomCI = IRB.CreateCall(CustomF, Args);
@@ -1673,18 +1739,17 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
}
}
- FunctionType *FT = cast<FunctionType>(
- CS.getCalledValue()->getType()->getPointerElementType());
+ FunctionType *FT = CB.getFunctionType();
if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
for (unsigned i = 0, n = FT->getNumParams(); i != n; ++i) {
- IRB.CreateStore(DFSF.getShadow(CS.getArgument(i)),
- DFSF.getArgTLS(i, CS.getInstruction()));
+ IRB.CreateStore(DFSF.getShadow(CB.getArgOperand(i)),
+ DFSF.getArgTLS(i, &CB));
}
}
Instruction *Next = nullptr;
- if (!CS.getType()->isVoidTy()) {
- if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
+ if (!CB.getType()->isVoidTy()) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
if (II->getNormalDest()->getSinglePredecessor()) {
Next = &II->getNormalDest()->front();
} else {
@@ -1693,15 +1758,15 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
Next = &NewBB->front();
}
} else {
- assert(CS->getIterator() != CS->getParent()->end());
- Next = CS->getNextNode();
+ assert(CB.getIterator() != CB.getParent()->end());
+ Next = CB.getNextNode();
}
if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_TLS) {
IRBuilder<> NextIRB(Next);
LoadInst *LI = NextIRB.CreateLoad(DFSF.DFS.ShadowTy, DFSF.getRetvalTLS());
DFSF.SkipInsts.insert(LI);
- DFSF.setShadow(CS.getInstruction(), LI);
+ DFSF.setShadow(&CB, LI);
DFSF.NonZeroChecks.push_back(LI);
}
}
@@ -1711,25 +1776,25 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
if (DFSF.DFS.getInstrumentedABI() == DataFlowSanitizer::IA_Args) {
FunctionType *NewFT = DFSF.DFS.getArgsFunctionType(FT);
Value *Func =
- IRB.CreateBitCast(CS.getCalledValue(), PointerType::getUnqual(NewFT));
+ IRB.CreateBitCast(CB.getCalledOperand(), PointerType::getUnqual(NewFT));
std::vector<Value *> Args;
- CallSite::arg_iterator i = CS.arg_begin(), e = CS.arg_end();
+ auto i = CB.arg_begin(), E = CB.arg_end();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(*i);
- i = CS.arg_begin();
+ i = CB.arg_begin();
for (unsigned n = FT->getNumParams(); n != 0; ++i, --n)
Args.push_back(DFSF.getShadow(*i));
if (FT->isVarArg()) {
- unsigned VarArgSize = CS.arg_size() - FT->getNumParams();
+ unsigned VarArgSize = CB.arg_size() - FT->getNumParams();
ArrayType *VarArgArrayTy = ArrayType::get(DFSF.DFS.ShadowTy, VarArgSize);
AllocaInst *VarArgShadow =
new AllocaInst(VarArgArrayTy, getDataLayout().getAllocaAddrSpace(),
"", &DFSF.F->getEntryBlock().front());
Args.push_back(IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, 0));
- for (unsigned n = 0; i != e; ++i, ++n) {
+ for (unsigned n = 0; i != E; ++i, ++n) {
IRB.CreateStore(
DFSF.getShadow(*i),
IRB.CreateConstGEP2_32(VarArgArrayTy, VarArgShadow, 0, n));
@@ -1737,32 +1802,30 @@ void DFSanVisitor::visitCallSite(CallSite CS) {
}
}
- CallSite NewCS;
- if (InvokeInst *II = dyn_cast<InvokeInst>(CS.getInstruction())) {
- NewCS = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
+ CallBase *NewCB;
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ NewCB = IRB.CreateInvoke(NewFT, Func, II->getNormalDest(),
II->getUnwindDest(), Args);
} else {
- NewCS = IRB.CreateCall(NewFT, Func, Args);
+ NewCB = IRB.CreateCall(NewFT, Func, Args);
}
- NewCS.setCallingConv(CS.getCallingConv());
- NewCS.setAttributes(CS.getAttributes().removeAttributes(
+ NewCB->setCallingConv(CB.getCallingConv());
+ NewCB->setAttributes(CB.getAttributes().removeAttributes(
*DFSF.DFS.Ctx, AttributeList::ReturnIndex,
- AttributeFuncs::typeIncompatible(NewCS.getInstruction()->getType())));
+ AttributeFuncs::typeIncompatible(NewCB->getType())));
if (Next) {
- ExtractValueInst *ExVal =
- ExtractValueInst::Create(NewCS.getInstruction(), 0, "", Next);
+ ExtractValueInst *ExVal = ExtractValueInst::Create(NewCB, 0, "", Next);
DFSF.SkipInsts.insert(ExVal);
- ExtractValueInst *ExShadow =
- ExtractValueInst::Create(NewCS.getInstruction(), 1, "", Next);
+ ExtractValueInst *ExShadow = ExtractValueInst::Create(NewCB, 1, "", Next);
DFSF.SkipInsts.insert(ExShadow);
DFSF.setShadow(ExVal, ExShadow);
DFSF.NonZeroChecks.push_back(ExShadow);
- CS.getInstruction()->replaceAllUsesWith(ExVal);
+ CB.replaceAllUsesWith(ExVal);
}
- CS.getInstruction()->eraseFromParent();
+ CB.eraseFromParent();
}
}
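
One detail worth spelling out for the renamed ShadowWidthBits/ShadowWidthBytes constants: the loadShadow fast path loads 64 bits of shadow (four 16-bit labels) and tests whether all four labels are equal by comparing the word against itself rotated left by ShadowWidthBits. The standalone sketch below reproduces that check on plain integers; the sample label values are arbitrary:

#include <cstdint>
#include <cstdio>

// Four 16-bit labels packed in a 64-bit shadow word are identical exactly
// when the word equals itself rotated left by 16 bits.
static bool allLabelsEqual(uint64_t WideShadow) {
  const unsigned ShadowWidthBits = 16;
  uint64_t Rot = (WideShadow << ShadowWidthBits) |
                 (WideShadow >> (64 - ShadowWidthBits));
  return Rot == WideShadow;
}

int main() {
  uint64_t Same = 0x0005000500050005ULL;  // four copies of label 5
  uint64_t Mixed = 0x0005000500070005ULL; // one label differs
  printf("same:  %s\n", allLabelsEqual(Same) ? "fast path" : "fallback union");
  printf("mixed: %s\n", allLabelsEqual(Mixed) ? "fast path" : "fallback union");
  return 0;
}
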
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index bf3e4ed3e31f..d8a965a90127 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -45,24 +45,34 @@
#include <memory>
#include <string>
#include <utility>
+
using namespace llvm;
+namespace endian = llvm::support::endian;
#define DEBUG_TYPE "insert-gcov-profiling"
-static cl::opt<std::string>
-DefaultGCOVVersion("default-gcov-version", cl::init("402*"), cl::Hidden,
- cl::ValueRequired);
-static cl::opt<bool> DefaultExitBlockBeforeBody("gcov-exit-block-before-body",
- cl::init(false), cl::Hidden);
+enum : uint32_t {
+ GCOV_TAG_FUNCTION = 0x01000000,
+ GCOV_TAG_BLOCKS = 0x01410000,
+ GCOV_TAG_ARCS = 0x01430000,
+ GCOV_TAG_LINES = 0x01450000,
+};
+
+static cl::opt<std::string> DefaultGCOVVersion("default-gcov-version",
+ cl::init("408*"), cl::Hidden,
+ cl::ValueRequired);
+
+// Returns the number of words which will be used to represent this string.
+static unsigned wordsOfString(StringRef s) {
+ // Length + NUL-terminated string + 0~3 padding NULs.
+ return (s.size() / 4) + 2;
+}
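
wordsOfString counts one word for the length field plus the NUL-terminated, NUL-padded string payload, which is why it returns s.size()/4 + 2: a size that is already a multiple of four still gets a full word of padding NULs, per writeString below. A quick standalone check (the sample names are arbitrary):

#include <cstdio>
#include <string>

// Mirrors wordsOfString: one word for the length, then the string plus a
// terminating NUL, padded with NULs to the next 4-byte boundary.
static unsigned wordsOfString(const std::string &S) {
  return (S.size() / 4) + 2;
}

int main() {
  // "main" (4 bytes) -> length word + "main" + 4 NULs  = 3 words.
  // "foo"  (3 bytes) -> length word + "foo" + 1 NUL    = 2 words.
  printf("main: %u words\n", wordsOfString("main"));
  printf("foo:  %u words\n", wordsOfString("foo"));
  return 0;
}
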
GCOVOptions GCOVOptions::getDefault() {
GCOVOptions Options;
Options.EmitNotes = true;
Options.EmitData = true;
- Options.UseCfgChecksum = false;
Options.NoRedZone = false;
- Options.FunctionNamesInData = true;
- Options.ExitBlockBeforeBody = DefaultExitBlockBeforeBody;
if (DefaultGCOVVersion.size() != 4) {
llvm::report_fatal_error(std::string("Invalid -default-gcov-version: ") +
@@ -78,19 +88,23 @@ class GCOVFunction;
class GCOVProfiler {
public:
GCOVProfiler() : GCOVProfiler(GCOVOptions::getDefault()) {}
- GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {
- assert((Options.EmitNotes || Options.EmitData) &&
- "GCOVProfiler asked to do nothing?");
- ReversedVersion[0] = Options.Version[3];
- ReversedVersion[1] = Options.Version[2];
- ReversedVersion[2] = Options.Version[1];
- ReversedVersion[3] = Options.Version[0];
- ReversedVersion[4] = '\0';
- }
+ GCOVProfiler(const GCOVOptions &Opts) : Options(Opts) {}
bool
runOnModule(Module &M,
std::function<const TargetLibraryInfo &(Function &F)> GetTLI);
+ void write(uint32_t i) {
+ char Bytes[4];
+ endian::write32(Bytes, i, Endian);
+ os->write(Bytes, 4);
+ }
+ void writeString(StringRef s) {
+ write(wordsOfString(s) - 1);
+ os->write(s.data(), s.size());
+ os->write_zeros(4 - s.size() % 4);
+ }
+ void writeBytes(const char *Bytes, int Size) { os->write(Bytes, Size); }
+
private:
// Create the .gcno files for the Module based on DebugInfo.
void emitProfileNotes();
@@ -115,17 +129,18 @@ private:
// list.
Function *
insertCounterWriteout(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
- Function *insertFlush(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ Function *insertReset(ArrayRef<std::pair<GlobalVariable *, MDNode *>>);
+ Function *insertFlush(Function *ResetF);
- void AddFlushBeforeForkAndExec();
+ bool AddFlushBeforeForkAndExec();
enum class GCovFileType { GCNO, GCDA };
std::string mangleName(const DICompileUnit *CU, GCovFileType FileType);
GCOVOptions Options;
+ support::endianness Endian;
+ raw_ostream *os;
- // Reversed, NUL-terminated copy of Options.Version.
- char ReversedVersion[5];
// Checksum, produced by hash of EdgeDestinations
SmallVector<uint32_t, 4> FileChecksums;
@@ -200,48 +215,15 @@ static SmallString<128> getFilename(const DISubprogram *SP) {
namespace {
class GCOVRecord {
- protected:
- static const char *const LinesTag;
- static const char *const FunctionTag;
- static const char *const BlockTag;
- static const char *const EdgeTag;
-
- GCOVRecord() = default;
-
- void writeBytes(const char *Bytes, int Size) {
- os->write(Bytes, Size);
- }
-
- void write(uint32_t i) {
- writeBytes(reinterpret_cast<char*>(&i), 4);
- }
-
- // Returns the length measured in 4-byte blocks that will be used to
- // represent this string in a GCOV file
- static unsigned lengthOfGCOVString(StringRef s) {
- // A GCOV string is a length, followed by a NUL, then between 0 and 3 NULs
- // padding out to the next 4-byte word. The length is measured in 4-byte
- // words including padding, not bytes of actual string.
- return (s.size() / 4) + 1;
- }
-
- void writeGCOVString(StringRef s) {
- uint32_t Len = lengthOfGCOVString(s);
- write(Len);
- writeBytes(s.data(), s.size());
+ protected:
+ GCOVProfiler *P;
- // Write 1 to 4 bytes of NUL padding.
- assert((unsigned)(4 - (s.size() % 4)) > 0);
- assert((unsigned)(4 - (s.size() % 4)) <= 4);
- writeBytes("\0\0\0\0", 4 - (s.size() % 4));
- }
+ GCOVRecord(GCOVProfiler *P) : P(P) {}
- raw_ostream *os;
+ void write(uint32_t i) { P->write(i); }
+ void writeString(StringRef s) { P->writeString(s); }
+ void writeBytes(const char *Bytes, int Size) { P->writeBytes(Bytes, Size); }
};
- const char *const GCOVRecord::LinesTag = "\0\0\x45\x01";
- const char *const GCOVRecord::FunctionTag = "\0\0\0\1";
- const char *const GCOVRecord::BlockTag = "\0\0\x41\x01";
- const char *const GCOVRecord::EdgeTag = "\0\0\x43\x01";
class GCOVFunction;
class GCOVBlock;
@@ -257,23 +239,20 @@ namespace {
}
uint32_t length() const {
- // Here 2 = 1 for string length + 1 for '0' id#.
- return lengthOfGCOVString(Filename) + 2 + Lines.size();
+ return 1 + wordsOfString(Filename) + Lines.size();
}
void writeOut() {
write(0);
- writeGCOVString(Filename);
+ writeString(Filename);
for (int i = 0, e = Lines.size(); i != e; ++i)
write(Lines[i]);
}
- GCOVLines(StringRef F, raw_ostream *os)
- : Filename(F) {
- this->os = os;
- }
+ GCOVLines(GCOVProfiler *P, StringRef F)
+ : GCOVRecord(P), Filename(std::string(F)) {}
- private:
+ private:
std::string Filename;
SmallVector<uint32_t, 32> Lines;
};
@@ -285,7 +264,7 @@ namespace {
class GCOVBlock : public GCOVRecord {
public:
GCOVLines &getFile(StringRef Filename) {
- return LinesByFile.try_emplace(Filename, Filename, os).first->second;
+ return LinesByFile.try_emplace(Filename, P, Filename).first->second;
}
void addEdge(GCOVBlock &Successor) {
@@ -300,7 +279,7 @@ namespace {
SortedLinesByFile.push_back(&I);
}
- writeBytes(LinesTag, 4);
+ write(GCOV_TAG_LINES);
write(Len);
write(Number);
@@ -325,10 +304,8 @@ namespace {
private:
friend class GCOVFunction;
- GCOVBlock(uint32_t Number, raw_ostream *os)
- : Number(Number) {
- this->os = os;
- }
+ GCOVBlock(GCOVProfiler *P, uint32_t Number)
+ : GCOVRecord(P), Number(Number) {}
uint32_t Number;
StringMap<GCOVLines> LinesByFile;
@@ -339,21 +316,19 @@ namespace {
// set of blocks and a map of edges between blocks. This is the only GCOV
// object users can construct, the blocks and lines will be rooted here.
class GCOVFunction : public GCOVRecord {
- public:
- GCOVFunction(const DISubprogram *SP, Function *F, raw_ostream *os,
- uint32_t Ident, bool UseCfgChecksum, bool ExitBlockBeforeBody)
- : SP(SP), Ident(Ident), UseCfgChecksum(UseCfgChecksum), CfgChecksum(0),
- ReturnBlock(1, os) {
- this->os = os;
-
+ public:
+ GCOVFunction(GCOVProfiler *P, Function *F, const DISubprogram *SP,
+ unsigned EndLine, uint32_t Ident, int Version)
+ : GCOVRecord(P), SP(SP), EndLine(EndLine), Ident(Ident),
+ Version(Version), ReturnBlock(P, 1) {
LLVM_DEBUG(dbgs() << "Function: " << getFunctionName(SP) << "\n");
-
+ bool ExitBlockBeforeBody = Version >= 48;
uint32_t i = 0;
for (auto &BB : *F) {
// Skip index 1 if it's assigned to the ReturnBlock.
if (i == 1 && ExitBlockBeforeBody)
++i;
- Blocks.insert(std::make_pair(&BB, GCOVBlock(i++, os)));
+ Blocks.insert(std::make_pair(&BB, GCOVBlock(P, i++)));
}
if (!ExitBlockBeforeBody)
ReturnBlock.Number = i;
@@ -389,42 +364,56 @@ namespace {
return FuncChecksum;
}
- void setCfgChecksum(uint32_t Checksum) {
- CfgChecksum = Checksum;
- }
-
- void writeOut() {
- writeBytes(FunctionTag, 4);
+ void writeOut(uint32_t CfgChecksum) {
+ write(GCOV_TAG_FUNCTION);
SmallString<128> Filename = getFilename(SP);
- uint32_t BlockLen = 1 + 1 + 1 + lengthOfGCOVString(getFunctionName(SP)) +
- 1 + lengthOfGCOVString(Filename) + 1;
- if (UseCfgChecksum)
- ++BlockLen;
+ uint32_t BlockLen =
+ 2 + (Version >= 47) + wordsOfString(getFunctionName(SP));
+ if (Version < 80)
+ BlockLen += wordsOfString(Filename) + 1;
+ else
+ BlockLen += 1 + wordsOfString(Filename) + 3 + (Version >= 90);
+
write(BlockLen);
write(Ident);
write(FuncChecksum);
- if (UseCfgChecksum)
+ if (Version >= 47)
write(CfgChecksum);
- writeGCOVString(getFunctionName(SP));
- writeGCOVString(Filename);
- write(SP->getLine());
+ writeString(getFunctionName(SP));
+ if (Version < 80) {
+ writeString(Filename);
+ write(SP->getLine());
+ } else {
+ write(SP->isArtificial()); // artificial
+ writeString(Filename);
+ write(SP->getLine()); // start_line
+ write(0); // start_column
+ // EndLine is the last line with !dbg. It is not the } line as in GCC,
+ // but good enough.
+ write(EndLine);
+ if (Version >= 90)
+ write(0); // end_column
+ }
// Emit count of blocks.
- writeBytes(BlockTag, 4);
- write(Blocks.size() + 1);
- for (int i = 0, e = Blocks.size() + 1; i != e; ++i) {
- write(0); // No flags on our blocks.
+ write(GCOV_TAG_BLOCKS);
+ if (Version < 80) {
+ write(Blocks.size() + 1);
+ for (int i = Blocks.size() + 1; i; --i)
+ write(0);
+ } else {
+ write(1);
+ write(Blocks.size() + 1);
}
- LLVM_DEBUG(dbgs() << Blocks.size() << " blocks.\n");
+ LLVM_DEBUG(dbgs() << (Blocks.size() + 1) << " blocks\n");
// Emit edges between blocks.
- if (Blocks.empty()) return;
Function *F = Blocks.begin()->first->getParent();
for (BasicBlock &I : *F) {
GCOVBlock &Block = getBlock(&I);
if (Block.OutEdges.empty()) continue;
- writeBytes(EdgeTag, 4);
+ write(GCOV_TAG_ARCS);
write(Block.OutEdges.size() * 2 + 1);
write(Block.Number);
for (int i = 0, e = Block.OutEdges.size(); i != e; ++i) {
@@ -440,12 +429,12 @@ namespace {
getBlock(&I).writeOut();
}
- private:
- const DISubprogram *SP;
+ private:
+ const DISubprogram *SP;
+ unsigned EndLine;
uint32_t Ident;
uint32_t FuncChecksum;
- bool UseCfgChecksum;
- uint32_t CfgChecksum;
+ int Version;
DenseMap<BasicBlock *, GCOVBlock> Blocks;
GCOVBlock ReturnBlock;
};
@@ -473,11 +462,9 @@ std::vector<Regex> GCOVProfiler::createRegexesFromString(StringRef RegexesStr) {
bool GCOVProfiler::doesFilenameMatchARegex(StringRef Filename,
std::vector<Regex> &Regexes) {
- for (Regex &Re : Regexes) {
- if (Re.match(Filename)) {
+ for (Regex &Re : Regexes)
+ if (Re.match(Filename))
return true;
- }
- }
return false;
}
@@ -537,7 +524,8 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
MDString *DataFile = dyn_cast<MDString>(N->getOperand(1));
if (!NotesFile || !DataFile)
continue;
- return Notes ? NotesFile->getString() : DataFile->getString();
+ return std::string(Notes ? NotesFile->getString()
+ : DataFile->getString());
}
MDString *GCovFile = dyn_cast<MDString>(N->getOperand(0));
@@ -546,7 +534,7 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
SmallString<128> Filename = GCovFile->getString();
sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
- return Filename.str();
+ return std::string(Filename.str());
}
}
@@ -554,9 +542,10 @@ std::string GCOVProfiler::mangleName(const DICompileUnit *CU,
sys::path::replace_extension(Filename, Notes ? "gcno" : "gcda");
StringRef FName = sys::path::filename(Filename);
SmallString<128> CurPath;
- if (sys::fs::current_path(CurPath)) return FName;
+ if (sys::fs::current_path(CurPath))
+ return std::string(FName);
sys::path::append(CurPath, FName);
- return CurPath.str();
+ return std::string(CurPath.str());
}
bool GCOVProfiler::runOnModule(
@@ -565,14 +554,15 @@ bool GCOVProfiler::runOnModule(
this->GetTLI = std::move(GetTLI);
Ctx = &M.getContext();
- AddFlushBeforeForkAndExec();
+ bool Modified = AddFlushBeforeForkAndExec();
FilterRe = createRegexesFromString(Options.Filter);
ExcludeRe = createRegexesFromString(Options.Exclude);
if (Options.EmitNotes) emitProfileNotes();
- if (Options.EmitData) return emitProfileArcs();
- return false;
+ if (Options.EmitData)
+ Modified |= emitProfileArcs();
+ return Modified;
}
PreservedAnalyses GCOVProfilerPass::run(Module &M,
@@ -590,9 +580,10 @@ PreservedAnalyses GCOVProfilerPass::run(Module &M,
return PreservedAnalyses::none();
}
-static bool functionHasLines(Function &F) {
+static bool functionHasLines(const Function &F, unsigned &EndLine) {
// Check whether this function actually has any source lines. Not only
// do these waste space, they also can crash gcov.
+ EndLine = 0;
for (auto &BB : F) {
for (auto &I : BB) {
// Debug intrinsic locations correspond to the location of the
@@ -605,6 +596,7 @@ static bool functionHasLines(Function &F) {
// Artificial lines such as calls to the global constructors.
if (Loc.getLine() == 0) continue;
+ EndLine = std::max(EndLine, Loc.getLine());
return true;
}
@@ -629,43 +621,95 @@ static bool shouldKeepInEntry(BasicBlock::iterator It) {
return false;
}
-void GCOVProfiler::AddFlushBeforeForkAndExec() {
- SmallVector<Instruction *, 2> ForkAndExecs;
+bool GCOVProfiler::AddFlushBeforeForkAndExec() {
+ SmallVector<CallInst *, 2> Forks;
+ SmallVector<CallInst *, 2> Execs;
for (auto &F : M->functions()) {
auto *TLI = &GetTLI(F);
for (auto &I : instructions(F)) {
if (CallInst *CI = dyn_cast<CallInst>(&I)) {
if (Function *Callee = CI->getCalledFunction()) {
LibFunc LF;
- if (TLI->getLibFunc(*Callee, LF) &&
- (LF == LibFunc_fork || LF == LibFunc_execl ||
- LF == LibFunc_execle || LF == LibFunc_execlp ||
- LF == LibFunc_execv || LF == LibFunc_execvp ||
- LF == LibFunc_execve || LF == LibFunc_execvpe ||
- LF == LibFunc_execvP)) {
- ForkAndExecs.push_back(&I);
+ if (TLI->getLibFunc(*Callee, LF)) {
+ if (LF == LibFunc_fork) {
+#if !defined(_WIN32)
+ Forks.push_back(CI);
+#endif
+ } else if (LF == LibFunc_execl || LF == LibFunc_execle ||
+ LF == LibFunc_execlp || LF == LibFunc_execv ||
+ LF == LibFunc_execvp || LF == LibFunc_execve ||
+ LF == LibFunc_execvpe || LF == LibFunc_execvP) {
+ Execs.push_back(CI);
+ }
}
}
}
}
}
- // We need to split the block after the fork/exec call
- // because else the counters for the lines after will be
- // the same as before the call.
- for (auto I : ForkAndExecs) {
- IRBuilder<> Builder(I);
+ for (auto F : Forks) {
+ IRBuilder<> Builder(F);
+ BasicBlock *Parent = F->getParent();
+ auto NextInst = ++F->getIterator();
+
+    // We have a fork, so just reset the counters in the child process.
+ FunctionType *FTy = FunctionType::get(Builder.getInt32Ty(), {}, false);
+ FunctionCallee GCOVFork = M->getOrInsertFunction("__gcov_fork", FTy);
+ F->setCalledFunction(GCOVFork);
+
+    // We split just after the fork so that the lines after it get their own
+    // counter. A known issue remains:
+    //   void foo() { fork(); }
+    //   void bar() { foo(); blah(); }
+    // "blah();" runs twice but is reported once, because it belongs to the
+    // same block as "foo();".
+ Parent->splitBasicBlock(NextInst);
+
+    // back() is a br instruction whose debug location equals the one on the
+    // instruction after the fork, so to avoid carrying the same debug loc on
+    // two blocks, just change it.
+ DebugLoc Loc = F->getDebugLoc();
+ Parent->back().setDebugLoc(Loc);
+ }
+
+ for (auto E : Execs) {
+ IRBuilder<> Builder(E);
+ BasicBlock *Parent = E->getParent();
+ auto NextInst = ++E->getIterator();
+
+    // Since the process is replaced by a new one, we need to write out the
+    // gcda files. No need to reset the counters since they'll be lost after
+    // the exec**.
FunctionType *FTy = FunctionType::get(Builder.getVoidTy(), {}, false);
- FunctionCallee GCOVFlush = M->getOrInsertFunction("__gcov_flush", FTy);
- Builder.CreateCall(GCOVFlush);
- I->getParent()->splitBasicBlock(I);
+ FunctionCallee WriteoutF =
+ M->getOrInsertFunction("llvm_writeout_files", FTy);
+ Builder.CreateCall(WriteoutF);
+
+ DebugLoc Loc = E->getDebugLoc();
+ Builder.SetInsertPoint(&*NextInst);
+    // If the exec** fails, we must reset the counters since they've already
+    // been dumped.
+ FunctionCallee ResetF = M->getOrInsertFunction("llvm_reset_counters", FTy);
+ Builder.CreateCall(ResetF)->setDebugLoc(Loc);
+ Parent->splitBasicBlock(NextInst);
+ Parent->back().setDebugLoc(Loc);
}
+
+ return !Forks.empty() || !Execs.empty();
}
void GCOVProfiler::emitProfileNotes() {
NamedMDNode *CU_Nodes = M->getNamedMetadata("llvm.dbg.cu");
if (!CU_Nodes) return;
+ int Version;
+ {
+ uint8_t c3 = Options.Version[0];
+ uint8_t c2 = Options.Version[1];
+ uint8_t c1 = Options.Version[2];
+ Version = c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
+ : (c3 - '0') * 10 + c1 - '0';
+ }
+
for (unsigned i = 0, e = CU_Nodes->getNumOperands(); i != e; ++i) {
// Each compile unit gets its own .gcno file. This means that whether we run
// this pass over the original .o's as they're produced, or run it after
@@ -688,11 +732,14 @@ void GCOVProfiler::emitProfileNotes() {
std::string EdgeDestinations;
+ Endian = M->getDataLayout().isLittleEndian() ? support::endianness::little
+ : support::endianness::big;
unsigned FunctionIdent = 0;
for (auto &F : M->functions()) {
DISubprogram *SP = F.getSubprogram();
+ unsigned EndLine;
if (!SP) continue;
- if (!functionHasLines(F) || !isFunctionInstrumented(F))
+ if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F))
continue;
// TODO: Functions using scope-based EH are currently not supported.
if (isUsingScopeBasedEH(F)) continue;
@@ -705,9 +752,8 @@ void GCOVProfiler::emitProfileNotes() {
++It;
EntryBlock.splitBasicBlock(It);
- Funcs.push_back(std::make_unique<GCOVFunction>(SP, &F, &out, FunctionIdent++,
- Options.UseCfgChecksum,
- Options.ExitBlockBeforeBody));
+ Funcs.push_back(std::make_unique<GCOVFunction>(this, &F, SP, EndLine,
+ FunctionIdent++, Version));
GCOVFunction &Func = *Funcs.back();
// Add the function line number to the lines of the entry block
@@ -756,17 +802,29 @@ void GCOVProfiler::emitProfileNotes() {
EdgeDestinations += Func.getEdgeDestinations();
}
- FileChecksums.push_back(hash_value(EdgeDestinations));
- out.write("oncg", 4);
- out.write(ReversedVersion, 4);
- out.write(reinterpret_cast<char*>(&FileChecksums.back()), 4);
-
- for (auto &Func : Funcs) {
- Func->setCfgChecksum(FileChecksums.back());
- Func->writeOut();
+ char Tmp[4];
+ os = &out;
+ auto Stamp = static_cast<uint32_t>(hash_value(EdgeDestinations));
+ FileChecksums.push_back(Stamp);
+ if (Endian == support::endianness::big) {
+ out.write("gcno", 4);
+ out.write(Options.Version, 4);
+ } else {
+ out.write("oncg", 4);
+ std::reverse_copy(Options.Version, Options.Version + 4, Tmp);
+ out.write(Tmp, 4);
}
+ write(Stamp);
+ if (Version >= 90)
+      writeString(""); // unused current_working_directory
+ if (Version >= 80)
+      write(0); // unused has_unexecuted_blocks
- out.write("\0\0\0\0\0\0\0\0", 8); // EOF
+ for (auto &Func : Funcs)
+ Func->writeOut(Stamp);
+
+ write(0);
+ write(0);
out.close();
}
}
@@ -780,12 +838,12 @@ bool GCOVProfiler::emitProfileArcs() {
SmallVector<std::pair<GlobalVariable *, MDNode *>, 8> CountersBySP;
for (auto &F : M->functions()) {
DISubprogram *SP = F.getSubprogram();
+ unsigned EndLine;
if (!SP) continue;
- if (!functionHasLines(F) || !isFunctionInstrumented(F))
+ if (!functionHasLines(F, EndLine) || !isFunctionInstrumented(F))
continue;
// TODO: Functions using scope-based EH are currently not supported.
if (isUsingScopeBasedEH(F)) continue;
- if (!Result) Result = true;
DenseMap<std::pair<BasicBlock *, BasicBlock *>, unsigned> EdgeToCounter;
unsigned Edges = 0;
@@ -850,7 +908,8 @@ bool GCOVProfiler::emitProfileArcs() {
}
Function *WriteoutF = insertCounterWriteout(CountersBySP);
- Function *FlushF = insertFlush(CountersBySP);
+ Function *ResetF = insertReset(CountersBySP);
+ Function *FlushF = insertFlush(ResetF);
// Create a small bit of code that registers the "__llvm_gcov_writeout" to
// be executed at exit and the "__llvm_gcov_flush" function to be executed
@@ -868,19 +927,18 @@ bool GCOVProfiler::emitProfileArcs() {
IRBuilder<> Builder(BB);
FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
- Type *Params[] = {
- PointerType::get(FTy, 0),
- PointerType::get(FTy, 0)
- };
+ Type *Params[] = {PointerType::get(FTy, 0), PointerType::get(FTy, 0),
+ PointerType::get(FTy, 0)};
FTy = FunctionType::get(Builder.getVoidTy(), Params, false);
- // Initialize the environment and register the local writeout and flush
- // functions.
+ // Initialize the environment and register the local writeout, flush and
+ // reset functions.
FunctionCallee GCOVInit = M->getOrInsertFunction("llvm_gcov_init", FTy);
- Builder.CreateCall(GCOVInit, {WriteoutF, FlushF});
+ Builder.CreateCall(GCOVInit, {WriteoutF, FlushF, ResetF});
Builder.CreateRetVoid();
appendToGlobalCtors(*M, F, 0);
+ Result = true;
}
return Result;
@@ -888,9 +946,9 @@ bool GCOVProfiler::emitProfileArcs() {
FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) {
Type *Args[] = {
- Type::getInt8PtrTy(*Ctx), // const char *orig_filename
- Type::getInt8PtrTy(*Ctx), // const char version[4]
- Type::getInt32Ty(*Ctx), // uint32_t checksum
+ Type::getInt8PtrTy(*Ctx), // const char *orig_filename
+ Type::getInt32Ty(*Ctx), // uint32_t version
+ Type::getInt32Ty(*Ctx), // uint32_t checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
AttributeList AL;
@@ -903,18 +961,15 @@ FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) {
FunctionCallee GCOVProfiler::getEmitFunctionFunc(const TargetLibraryInfo *TLI) {
Type *Args[] = {
Type::getInt32Ty(*Ctx), // uint32_t ident
- Type::getInt8PtrTy(*Ctx), // const char *function_name
Type::getInt32Ty(*Ctx), // uint32_t func_checksum
- Type::getInt8Ty(*Ctx), // uint8_t use_extra_checksum
Type::getInt32Ty(*Ctx), // uint32_t cfg_checksum
};
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), Args, false);
AttributeList AL;
if (auto AK = TLI->getExtAttrForI32Param(false)) {
AL = AL.addParamAttribute(*Ctx, 0, AK);
+ AL = AL.addParamAttribute(*Ctx, 1, AK);
AL = AL.addParamAttribute(*Ctx, 2, AK);
- AL = AL.addParamAttribute(*Ctx, 3, AK);
- AL = AL.addParamAttribute(*Ctx, 4, AK);
}
return M->getOrInsertFunction("llvm_gcda_emit_function", FTy);
}
@@ -973,10 +1028,9 @@ Function *GCOVProfiler::insertCounterWriteout(
// Collect the relevant data into a large constant data structure that we can
// walk to write out everything.
StructType *StartFileCallArgsTy = StructType::create(
- {Builder.getInt8PtrTy(), Builder.getInt8PtrTy(), Builder.getInt32Ty()});
+ {Builder.getInt8PtrTy(), Builder.getInt32Ty(), Builder.getInt32Ty()});
StructType *EmitFunctionCallArgsTy = StructType::create(
- {Builder.getInt32Ty(), Builder.getInt8PtrTy(), Builder.getInt32Ty(),
- Builder.getInt8Ty(), Builder.getInt32Ty()});
+ {Builder.getInt32Ty(), Builder.getInt32Ty(), Builder.getInt32Ty()});
StructType *EmitArcsCallArgsTy = StructType::create(
{Builder.getInt32Ty(), Builder.getInt64Ty()->getPointerTo()});
StructType *FileInfoTy =
@@ -999,23 +1053,19 @@ Function *GCOVProfiler::insertCounterWriteout(
std::string FilenameGcda = mangleName(CU, GCovFileType::GCDA);
uint32_t CfgChecksum = FileChecksums.empty() ? 0 : FileChecksums[i];
auto *StartFileCallArgs = ConstantStruct::get(
- StartFileCallArgsTy, {Builder.CreateGlobalStringPtr(FilenameGcda),
- Builder.CreateGlobalStringPtr(ReversedVersion),
- Builder.getInt32(CfgChecksum)});
+ StartFileCallArgsTy,
+ {Builder.CreateGlobalStringPtr(FilenameGcda),
+ Builder.getInt32(endian::read32be(Options.Version)),
+ Builder.getInt32(CfgChecksum)});
SmallVector<Constant *, 8> EmitFunctionCallArgsArray;
SmallVector<Constant *, 8> EmitArcsCallArgsArray;
for (int j : llvm::seq<int>(0, CountersBySP.size())) {
- auto *SP = cast_or_null<DISubprogram>(CountersBySP[j].second);
uint32_t FuncChecksum = Funcs.empty() ? 0 : Funcs[j]->getFuncChecksum();
EmitFunctionCallArgsArray.push_back(ConstantStruct::get(
EmitFunctionCallArgsTy,
{Builder.getInt32(j),
- Options.FunctionNamesInData
- ? Builder.CreateGlobalStringPtr(getFunctionName(SP))
- : Constant::getNullValue(Builder.getInt8PtrTy()),
Builder.getInt32(FuncChecksum),
- Builder.getInt8(Options.UseCfgChecksum),
Builder.getInt32(CfgChecksum)}));
GlobalVariable *GV = CountersBySP[j].first;
@@ -1144,19 +1194,12 @@ Function *GCOVProfiler::insertCounterWriteout(
EmitFunctionCallArgsPtr, 1)),
Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(2),
Builder.CreateStructGEP(EmitFunctionCallArgsTy,
- EmitFunctionCallArgsPtr, 2)),
- Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(3),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
- EmitFunctionCallArgsPtr, 3)),
- Builder.CreateLoad(EmitFunctionCallArgsTy->getElementType(4),
- Builder.CreateStructGEP(EmitFunctionCallArgsTy,
EmitFunctionCallArgsPtr,
- 4))});
+ 2))});
if (auto AK = TLI->getExtAttrForI32Param(false)) {
EmitFunctionCall->addParamAttr(0, AK);
+ EmitFunctionCall->addParamAttr(1, AK);
EmitFunctionCall->addParamAttr(2, AK);
- EmitFunctionCall->addParamAttr(3, AK);
- EmitFunctionCall->addParamAttr(4, AK);
}
auto *EmitArcsCallArgsPtr =
Builder.CreateInBoundsGEP(EmitArcsCallArgsTy, EmitArcsCallArgsArray, JV);
@@ -1190,15 +1233,46 @@ Function *GCOVProfiler::insertCounterWriteout(
return WriteoutF;
}
-Function *GCOVProfiler::
-insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
+Function *GCOVProfiler::insertReset(
+ ArrayRef<std::pair<GlobalVariable *, MDNode *>> CountersBySP) {
+ FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
+ Function *ResetF = M->getFunction("__llvm_gcov_reset");
+ if (!ResetF)
+ ResetF = Function::Create(FTy, GlobalValue::InternalLinkage,
+ "__llvm_gcov_reset", M);
+ ResetF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
+ ResetF->addFnAttr(Attribute::NoInline);
+ if (Options.NoRedZone)
+ ResetF->addFnAttr(Attribute::NoRedZone);
+
+ BasicBlock *Entry = BasicBlock::Create(*Ctx, "entry", ResetF);
+ IRBuilder<> Builder(Entry);
+
+ // Zero out the counters.
+ for (const auto &I : CountersBySP) {
+ GlobalVariable *GV = I.first;
+ Constant *Null = Constant::getNullValue(GV->getValueType());
+ Builder.CreateStore(Null, GV);
+ }
+
+ Type *RetTy = ResetF->getReturnType();
+ if (RetTy->isVoidTy())
+ Builder.CreateRetVoid();
+ else if (RetTy->isIntegerTy())
+ // Used if __llvm_gcov_reset was implicitly declared.
+ Builder.CreateRet(ConstantInt::get(RetTy, 0));
+ else
+ report_fatal_error("invalid return type for __llvm_gcov_reset");
+
+ return ResetF;
+}
+
+Function *GCOVProfiler::insertFlush(Function *ResetF) {
FunctionType *FTy = FunctionType::get(Type::getVoidTy(*Ctx), false);
Function *FlushF = M->getFunction("__llvm_gcov_flush");
if (!FlushF)
FlushF = Function::Create(FTy, GlobalValue::InternalLinkage,
"__llvm_gcov_flush", M);
- else
- FlushF->setLinkage(GlobalValue::InternalLinkage);
FlushF->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
FlushF->addFnAttr(Attribute::NoInline);
if (Options.NoRedZone)
@@ -1212,16 +1286,10 @@ insertFlush(ArrayRef<std::pair<GlobalVariable*, MDNode*> > CountersBySP) {
IRBuilder<> Builder(Entry);
Builder.CreateCall(WriteoutF, {});
-
- // Zero out the counters.
- for (const auto &I : CountersBySP) {
- GlobalVariable *GV = I.first;
- Constant *Null = Constant::getNullValue(GV->getValueType());
- Builder.CreateStore(Null, GV);
- }
+ Builder.CreateCall(ResetF, {});
Type *RetTy = FlushF->getReturnType();
- if (RetTy == Type::getVoidTy(*Ctx))
+ if (RetTy->isVoidTy())
Builder.CreateRetVoid();
else if (RetTy->isIntegerTy())
// Used if __llvm_gcov_flush was implicitly declared.
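
A note on the version handling introduced above: the four-character gcov version string is folded into a small integer so the writer can guard format differences with checks such as Version >= 47, >= 80 and >= 90. The sketch below mirrors only that arithmetic; parseGCovVersion is an invented name for illustration, not part of the patch.

    #include <cassert>
    #include <cstdint>

    // Mirrors the block at the top of emitProfileNotes(): a leading letter
    // contributes ('X' - 'A') * 100 plus the middle digit times 10 plus the
    // third digit; otherwise the first and third characters form a two-digit
    // number.
    static int parseGCovVersion(const char V[4]) {
      uint8_t c3 = V[0], c2 = V[1], c1 = V[2];
      return c3 >= 'A' ? (c3 - 'A') * 100 + (c2 - '0') * 10 + c1 - '0'
                       : (c3 - '0') * 10 + c1 - '0';
    }

    int main() {
      assert(parseGCovVersion("407*") == 47);  // passes the "Version >= 47" checks
      assert(parseGCovVersion("A93*") == 93);  // passes the ">= 80" and ">= 90" checks
      assert(parseGCovVersion("B21*") == 121);
      return 0;
    }
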
diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
index 7e8f8e27a97b..2e71d613714a 100644
--- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp
@@ -45,6 +45,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
+#include "llvm/Transforms/Instrumentation/AddressSanitizerCommon.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
#include "llvm/Transforms/Utils/PromoteMemToReg.h"
@@ -96,6 +97,10 @@ static cl::opt<bool> ClInstrumentAtomics(
cl::desc("instrument atomic instructions (rmw, cmpxchg)"), cl::Hidden,
cl::init(true));
+static cl::opt<bool> ClInstrumentByval("hwasan-instrument-byval",
+ cl::desc("instrument byval arguments"),
+ cl::Hidden, cl::init(true));
+
static cl::opt<bool> ClRecover(
"hwasan-recover",
cl::desc("Enable recovery mode (continue-after-error)."),
@@ -119,7 +124,7 @@ static cl::opt<bool> ClGenerateTagsWithCalls(
cl::init(false));
static cl::opt<bool> ClGlobals("hwasan-globals", cl::desc("Instrument globals"),
- cl::Hidden, cl::init(false));
+ cl::Hidden, cl::init(false), cl::ZeroOrMore);
static cl::opt<int> ClMatchAllTag(
"hwasan-match-all-tag",
@@ -211,10 +216,10 @@ public:
unsigned AccessSizeIndex,
Instruction *InsertBefore);
void instrumentMemIntrinsic(MemIntrinsic *MI);
- bool instrumentMemAccess(Instruction *I);
- Value *isInterestingMemoryAccess(Instruction *I, bool *IsWrite,
- uint64_t *TypeSize, unsigned *Alignment,
- Value **MaybeMask);
+ bool instrumentMemAccess(InterestingMemoryOperand &O);
+ bool ignoreAccess(Value *Ptr);
+ void getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting);
bool isInterestingAlloca(const AllocaInst &AI);
bool tagAlloca(IRBuilder<> &IRB, AllocaInst *AI, Value *Tag, size_t Size);
@@ -300,7 +305,10 @@ public:
explicit HWAddressSanitizerLegacyPass(bool CompileKernel = false,
bool Recover = false)
- : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {}
+ : FunctionPass(ID), CompileKernel(CompileKernel), Recover(Recover) {
+ initializeHWAddressSanitizerLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override { return "HWAddressSanitizer"; }
@@ -500,62 +508,62 @@ Value *HWAddressSanitizer::getDynamicShadowNonTls(IRBuilder<> &IRB) {
}
}
-Value *HWAddressSanitizer::isInterestingMemoryAccess(Instruction *I,
- bool *IsWrite,
- uint64_t *TypeSize,
- unsigned *Alignment,
- Value **MaybeMask) {
+bool HWAddressSanitizer::ignoreAccess(Value *Ptr) {
+  // Do not instrument accesses from different address spaces; we cannot deal
+ // with them.
+ Type *PtrTy = cast<PointerType>(Ptr->getType()->getScalarType());
+ if (PtrTy->getPointerAddressSpace() != 0)
+ return true;
+
+ // Ignore swifterror addresses.
+ // swifterror memory addresses are mem2reg promoted by instruction
+ // selection. As such they cannot have regular uses like an instrumentation
+ // function and it makes no sense to track them as memory.
+ if (Ptr->isSwiftError())
+ return true;
+
+ return false;
+}
+
+void HWAddressSanitizer::getInterestingMemoryOperands(
+ Instruction *I, SmallVectorImpl<InterestingMemoryOperand> &Interesting) {
// Skip memory accesses inserted by another instrumentation.
- if (I->hasMetadata("nosanitize")) return nullptr;
+ if (I->hasMetadata("nosanitize"))
+ return;
// Do not instrument the load fetching the dynamic shadow address.
if (LocalDynamicShadow == I)
- return nullptr;
+ return;
- Value *PtrOperand = nullptr;
- const DataLayout &DL = I->getModule()->getDataLayout();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- if (!ClInstrumentReads) return nullptr;
- *IsWrite = false;
- *TypeSize = DL.getTypeStoreSizeInBits(LI->getType());
- *Alignment = LI->getAlignment();
- PtrOperand = LI->getPointerOperand();
+ if (!ClInstrumentReads || ignoreAccess(LI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, LI->getPointerOperandIndex(), false,
+ LI->getType(), LI->getAlign());
} else if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- if (!ClInstrumentWrites) return nullptr;
- *IsWrite = true;
- *TypeSize = DL.getTypeStoreSizeInBits(SI->getValueOperand()->getType());
- *Alignment = SI->getAlignment();
- PtrOperand = SI->getPointerOperand();
+ if (!ClInstrumentWrites || ignoreAccess(SI->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, SI->getPointerOperandIndex(), true,
+ SI->getValueOperand()->getType(), SI->getAlign());
} else if (AtomicRMWInst *RMW = dyn_cast<AtomicRMWInst>(I)) {
- if (!ClInstrumentAtomics) return nullptr;
- *IsWrite = true;
- *TypeSize = DL.getTypeStoreSizeInBits(RMW->getValOperand()->getType());
- *Alignment = 0;
- PtrOperand = RMW->getPointerOperand();
+ if (!ClInstrumentAtomics || ignoreAccess(RMW->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, RMW->getPointerOperandIndex(), true,
+ RMW->getValOperand()->getType(), None);
} else if (AtomicCmpXchgInst *XCHG = dyn_cast<AtomicCmpXchgInst>(I)) {
- if (!ClInstrumentAtomics) return nullptr;
- *IsWrite = true;
- *TypeSize = DL.getTypeStoreSizeInBits(XCHG->getCompareOperand()->getType());
- *Alignment = 0;
- PtrOperand = XCHG->getPointerOperand();
- }
-
- if (PtrOperand) {
- // Do not instrument accesses from different address spaces; we cannot deal
- // with them.
- Type *PtrTy = cast<PointerType>(PtrOperand->getType()->getScalarType());
- if (PtrTy->getPointerAddressSpace() != 0)
- return nullptr;
-
- // Ignore swifterror addresses.
- // swifterror memory addresses are mem2reg promoted by instruction
- // selection. As such they cannot have regular uses like an instrumentation
- // function and it makes no sense to track them as memory.
- if (PtrOperand->isSwiftError())
- return nullptr;
+ if (!ClInstrumentAtomics || ignoreAccess(XCHG->getPointerOperand()))
+ return;
+ Interesting.emplace_back(I, XCHG->getPointerOperandIndex(), true,
+ XCHG->getCompareOperand()->getType(), None);
+ } else if (auto CI = dyn_cast<CallInst>(I)) {
+ for (unsigned ArgNo = 0; ArgNo < CI->getNumArgOperands(); ArgNo++) {
+ if (!ClInstrumentByval || !CI->isByValArgument(ArgNo) ||
+ ignoreAccess(CI->getArgOperand(ArgNo)))
+ continue;
+ Type *Ty = CI->getParamByValType(ArgNo);
+ Interesting.emplace_back(I, ArgNo, false, Ty, Align(1));
+ }
}
-
- return PtrOperand;
}
static unsigned getPointerOperandIndex(Instruction *I) {
@@ -713,45 +721,32 @@ void HWAddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) {
MI->eraseFromParent();
}
-bool HWAddressSanitizer::instrumentMemAccess(Instruction *I) {
- LLVM_DEBUG(dbgs() << "Instrumenting: " << *I << "\n");
- bool IsWrite = false;
- unsigned Alignment = 0;
- uint64_t TypeSize = 0;
- Value *MaybeMask = nullptr;
+bool HWAddressSanitizer::instrumentMemAccess(InterestingMemoryOperand &O) {
+ Value *Addr = O.getPtr();
- if (ClInstrumentMemIntrinsics && isa<MemIntrinsic>(I)) {
- instrumentMemIntrinsic(cast<MemIntrinsic>(I));
- return true;
- }
-
- Value *Addr =
- isInterestingMemoryAccess(I, &IsWrite, &TypeSize, &Alignment, &MaybeMask);
-
- if (!Addr)
- return false;
+ LLVM_DEBUG(dbgs() << "Instrumenting: " << O.getInsn() << "\n");
- if (MaybeMask)
+ if (O.MaybeMask)
return false; //FIXME
- IRBuilder<> IRB(I);
- if (isPowerOf2_64(TypeSize) &&
- (TypeSize / 8 <= (1UL << (kNumberOfAccessSizes - 1))) &&
- (Alignment >= (1UL << Mapping.Scale) || Alignment == 0 ||
- Alignment >= TypeSize / 8)) {
- size_t AccessSizeIndex = TypeSizeToSizeIndex(TypeSize);
+ IRBuilder<> IRB(O.getInsn());
+ if (isPowerOf2_64(O.TypeSize) &&
+ (O.TypeSize / 8 <= (1ULL << (kNumberOfAccessSizes - 1))) &&
+ (!O.Alignment || *O.Alignment >= (1ULL << Mapping.Scale) ||
+ *O.Alignment >= O.TypeSize / 8)) {
+ size_t AccessSizeIndex = TypeSizeToSizeIndex(O.TypeSize);
if (ClInstrumentWithCalls) {
- IRB.CreateCall(HwasanMemoryAccessCallback[IsWrite][AccessSizeIndex],
+ IRB.CreateCall(HwasanMemoryAccessCallback[O.IsWrite][AccessSizeIndex],
IRB.CreatePointerCast(Addr, IntptrTy));
} else {
- instrumentMemAccessInline(Addr, IsWrite, AccessSizeIndex, I);
+ instrumentMemAccessInline(Addr, O.IsWrite, AccessSizeIndex, O.getInsn());
}
} else {
- IRB.CreateCall(HwasanMemoryAccessCallbackSized[IsWrite],
+ IRB.CreateCall(HwasanMemoryAccessCallbackSized[O.IsWrite],
{IRB.CreatePointerCast(Addr, IntptrTy),
- ConstantInt::get(IntptrTy, TypeSize / 8)});
+ ConstantInt::get(IntptrTy, O.TypeSize / 8)});
}
- untagPointerOperand(I, Addr);
+ untagPointerOperand(O.getInsn(), Addr);
return true;
}
@@ -789,7 +784,7 @@ bool HWAddressSanitizer::tagAlloca(IRBuilder<> &IRB, AllocaInst *AI,
// llvm.memset right here into either a sequence of stores, or a call to
// hwasan_tag_memory.
if (ShadowSize)
- IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align::None());
+ IRB.CreateMemSet(ShadowPtr, JustTag, ShadowSize, Align(1));
if (Size != AlignedSize) {
IRB.CreateStore(
ConstantInt::get(Int8Ty, Size % Mapping.getObjectAlignment()),
@@ -1089,7 +1084,8 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
LLVM_DEBUG(dbgs() << "Function: " << F.getName() << "\n");
- SmallVector<Instruction*, 16> ToInstrument;
+ SmallVector<InterestingMemoryOperand, 16> OperandsToInstrument;
+ SmallVector<MemIntrinsic *, 16> IntrinToInstrument;
SmallVector<AllocaInst*, 8> AllocasToInstrument;
SmallVector<Instruction*, 8> RetVec;
SmallVector<Instruction*, 8> LandingPadVec;
@@ -1115,31 +1111,31 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
if (InstrumentLandingPads && isa<LandingPadInst>(Inst))
LandingPadVec.push_back(&Inst);
- Value *MaybeMask = nullptr;
- bool IsWrite;
- unsigned Alignment;
- uint64_t TypeSize;
- Value *Addr = isInterestingMemoryAccess(&Inst, &IsWrite, &TypeSize,
- &Alignment, &MaybeMask);
- if (Addr || isa<MemIntrinsic>(Inst))
- ToInstrument.push_back(&Inst);
+ getInterestingMemoryOperands(&Inst, OperandsToInstrument);
+
+ if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(&Inst))
+ IntrinToInstrument.push_back(MI);
}
}
initializeCallbacks(*F.getParent());
+ bool Changed = false;
+
if (!LandingPadVec.empty())
- instrumentLandingPads(LandingPadVec);
+ Changed |= instrumentLandingPads(LandingPadVec);
if (AllocasToInstrument.empty() && F.hasPersonalityFn() &&
F.getPersonalityFn()->getName() == kHwasanPersonalityThunkName) {
// __hwasan_personality_thunk is a no-op for functions without an
// instrumented stack, so we can drop it.
F.setPersonalityFn(nullptr);
+ Changed = true;
}
- if (AllocasToInstrument.empty() && ToInstrument.empty())
- return false;
+ if (AllocasToInstrument.empty() && OperandsToInstrument.empty() &&
+ IntrinToInstrument.empty())
+ return Changed;
assert(!LocalDynamicShadow);
@@ -1149,14 +1145,11 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
/*WithFrameRecord*/ ClRecordStackHistory &&
!AllocasToInstrument.empty());
- bool Changed = false;
if (!AllocasToInstrument.empty()) {
Value *StackTag =
ClGenerateTagsWithCalls ? nullptr : getStackBaseTag(EntryIRB);
- Changed |= instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec,
- StackTag);
+ instrumentStack(AllocasToInstrument, AllocaDbgMap, RetVec, StackTag);
}
-
// Pad and align each of the allocas that we instrumented to stop small
// uninteresting allocas from hiding in instrumented alloca's padding and so
// that we have enough space to store real tags for short granules.
@@ -1165,7 +1158,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
uint64_t Size = getAllocaSizeInBytes(*AI);
uint64_t AlignedSize = alignTo(Size, Mapping.getObjectAlignment());
AI->setAlignment(
- MaybeAlign(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
+ Align(std::max(AI->getAlignment(), Mapping.getObjectAlignment())));
if (Size != AlignedSize) {
Type *AllocatedType = AI->getAllocatedType();
if (AI->isArrayAllocation()) {
@@ -1178,7 +1171,7 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
auto *NewAI = new AllocaInst(
TypeWithPadding, AI->getType()->getAddressSpace(), nullptr, "", AI);
NewAI->takeName(AI);
- NewAI->setAlignment(MaybeAlign(AI->getAlignment()));
+ NewAI->setAlignment(AI->getAlign());
NewAI->setUsedWithInAlloca(AI->isUsedWithInAlloca());
NewAI->setSwiftError(AI->isSwiftError());
NewAI->copyMetadata(*AI);
@@ -1216,13 +1209,18 @@ bool HWAddressSanitizer::sanitizeFunction(Function &F) {
}
}
- for (auto Inst : ToInstrument)
- Changed |= instrumentMemAccess(Inst);
+ for (auto &Operand : OperandsToInstrument)
+ instrumentMemAccess(Operand);
+
+ if (ClInstrumentMemIntrinsics && !IntrinToInstrument.empty()) {
+ for (auto Inst : IntrinToInstrument)
+ instrumentMemIntrinsic(cast<MemIntrinsic>(Inst));
+ }
LocalDynamicShadow = nullptr;
StackBaseTag = nullptr;
- return Changed;
+ return true;
}
void HWAddressSanitizer::instrumentGlobal(GlobalVariable *GV, uint8_t Tag) {
@@ -1325,8 +1323,9 @@ void HWAddressSanitizer::instrumentGlobals() {
// cases where two libraries mutually depend on each other.
//
// We only need one note per binary, so put everything for the note in a
- // comdat.
- Comdat *NoteComdat = M.getOrInsertComdat(kHwasanNoteName);
+  // comdat. This needs to be a comdat with an .init_array section to prevent
+ // newer versions of lld from discarding the note.
+ Comdat *NoteComdat = M.getOrInsertComdat(kHwasanModuleCtorName);
Type *Int8Arr0Ty = ArrayType::get(Int8Ty, 0);
auto Start =
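
One detail of the HWAddressSanitizer refactor above worth spelling out: alignment is now an optional value (atomics pass None, byval call arguments pass Align(1)), and instrumentMemAccess() picks the fixed-size inline check over the sized callback based on the rewritten size/alignment test. A hedged restatement of that predicate in plain C++, with invented helper names:

    #include <cstdint>
    #include <optional>

    static bool isPowerOf2(uint64_t V) { return V && !(V & (V - 1)); }

    // Same shape as the test in instrumentMemAccess(): fixed-size checks are
    // used only for power-of-two sizes of at most 2^(NumAccessSizes - 1)
    // bytes whose alignment is unspecified, at least one tagging granule
    // (1 << Scale bytes), or at least the access size itself.
    static bool canUseFixedSizeCheck(uint64_t TypeSizeInBits,
                                     std::optional<uint64_t> Alignment,
                                     unsigned Scale, unsigned NumAccessSizes) {
      const uint64_t Bytes = TypeSizeInBits / 8;
      return isPowerOf2(TypeSizeInBits) &&
             Bytes <= (1ULL << (NumAccessSizes - 1)) &&
             (!Alignment || *Alignment >= (1ULL << Scale) ||
              *Alignment >= Bytes);
    }

    int main() {
      // A 16-byte access with unspecified alignment qualifies; a 3-byte one does not.
      return canUseFixedSizeCheck(128, std::nullopt, 4, 5) &&
                     !canUseFixedSizeCheck(24, 1, 4, 5)
                 ? 0
                 : 1;
    }
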
diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
index d5787c8f62a1..bcd4e2e8e33c 100644
--- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
+++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp
@@ -23,7 +23,6 @@
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Function.h"
@@ -194,12 +193,12 @@ private:
// TotalCount is the total profiled count of call executions, and
// NumCandidates is the number of candidate entries in ValueDataRef.
std::vector<PromotionCandidate> getPromotionCandidatesForCallSite(
- Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
uint64_t TotalCount, uint32_t NumCandidates);
// Promote a list of targets for one indirect-call callsite. Return
// the number of promotions.
- uint32_t tryToPromote(Instruction *Inst,
+ uint32_t tryToPromote(CallBase &CB,
const std::vector<PromotionCandidate> &Candidates,
uint64_t &TotalCount);
@@ -219,11 +218,11 @@ public:
// the count. Stop at the first target that is not promoted.
std::vector<ICallPromotionFunc::PromotionCandidate>
ICallPromotionFunc::getPromotionCandidatesForCallSite(
- Instruction *Inst, const ArrayRef<InstrProfValueData> &ValueDataRef,
+ const CallBase &CB, const ArrayRef<InstrProfValueData> &ValueDataRef,
uint64_t TotalCount, uint32_t NumCandidates) {
std::vector<PromotionCandidate> Ret;
- LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << *Inst
+ LLVM_DEBUG(dbgs() << " \nWork on callsite #" << NumOfPGOICallsites << CB
<< " Num_targets: " << ValueDataRef.size()
<< " Num_candidates: " << NumCandidates << "\n");
NumOfPGOICallsites++;
@@ -239,18 +238,18 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
LLVM_DEBUG(dbgs() << " Candidate " << I << " Count=" << Count
<< " Target_func: " << Target << "\n");
- if (ICPInvokeOnly && isa<CallInst>(Inst)) {
+ if (ICPInvokeOnly && isa<CallInst>(CB)) {
LLVM_DEBUG(dbgs() << " Not promote: User options.\n");
ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst)
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
<< " Not promote: User options";
});
break;
}
- if (ICPCallOnly && isa<InvokeInst>(Inst)) {
+ if (ICPCallOnly && isa<InvokeInst>(CB)) {
LLVM_DEBUG(dbgs() << " Not promote: User option.\n");
ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", Inst)
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UserOptions", &CB)
<< " Not promote: User options";
});
break;
@@ -258,7 +257,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
if (ICPCutOff != 0 && NumOfPGOICallPromotion >= ICPCutOff) {
LLVM_DEBUG(dbgs() << " Not promote: Cutoff reached.\n");
ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", Inst)
+ return OptimizationRemarkMissed(DEBUG_TYPE, "CutOffReached", &CB)
<< " Not promote: Cutoff reached";
});
break;
@@ -268,7 +267,7 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
if (TargetFunction == nullptr) {
LLVM_DEBUG(dbgs() << " Not promote: Cannot find the target\n");
ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", Inst)
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToFindTarget", &CB)
<< "Cannot promote indirect call: target with md5sum "
<< ore::NV("target md5sum", Target) << " not found";
});
@@ -276,11 +275,11 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
}
const char *Reason = nullptr;
- if (!isLegalToPromote(CallSite(Inst), TargetFunction, &Reason)) {
+ if (!isLegalToPromote(CB, TargetFunction, &Reason)) {
using namespace ore;
ORE.emit([&]() {
- return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", Inst)
+ return OptimizationRemarkMissed(DEBUG_TYPE, "UnableToPromote", &CB)
<< "Cannot promote indirect call to "
<< NV("TargetFunction", TargetFunction) << " with count of "
<< NV("Count", Count) << ": " << Reason;
@@ -294,25 +293,24 @@ ICallPromotionFunc::getPromotionCandidatesForCallSite(
return Ret;
}
-Instruction *llvm::pgo::promoteIndirectCall(Instruction *Inst,
- Function *DirectCallee,
- uint64_t Count, uint64_t TotalCount,
- bool AttachProfToDirectCall,
- OptimizationRemarkEmitter *ORE) {
+CallBase &llvm::pgo::promoteIndirectCall(CallBase &CB, Function *DirectCallee,
+ uint64_t Count, uint64_t TotalCount,
+ bool AttachProfToDirectCall,
+ OptimizationRemarkEmitter *ORE) {
uint64_t ElseCount = TotalCount - Count;
uint64_t MaxCount = (Count >= ElseCount ? Count : ElseCount);
uint64_t Scale = calculateCountScale(MaxCount);
- MDBuilder MDB(Inst->getContext());
+ MDBuilder MDB(CB.getContext());
MDNode *BranchWeights = MDB.createBranchWeights(
scaleBranchCount(Count, Scale), scaleBranchCount(ElseCount, Scale));
- Instruction *NewInst =
- promoteCallWithIfThenElse(CallSite(Inst), DirectCallee, BranchWeights);
+ CallBase &NewInst =
+ promoteCallWithIfThenElse(CB, DirectCallee, BranchWeights);
if (AttachProfToDirectCall) {
- MDBuilder MDB(NewInst->getContext());
- NewInst->setMetadata(
+ MDBuilder MDB(NewInst.getContext());
+ NewInst.setMetadata(
LLVMContext::MD_prof,
MDB.createBranchWeights({static_cast<uint32_t>(Count)}));
}
@@ -321,7 +319,7 @@ Instruction *llvm::pgo::promoteIndirectCall(Instruction *Inst,
if (ORE)
ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Promoted", Inst)
+ return OptimizationRemark(DEBUG_TYPE, "Promoted", &CB)
<< "Promote indirect call to " << NV("DirectCallee", DirectCallee)
<< " with count " << NV("Count", Count) << " out of "
<< NV("TotalCount", TotalCount);
@@ -331,14 +329,14 @@ Instruction *llvm::pgo::promoteIndirectCall(Instruction *Inst,
// Promote indirect-call to conditional direct-call for one callsite.
uint32_t ICallPromotionFunc::tryToPromote(
- Instruction *Inst, const std::vector<PromotionCandidate> &Candidates,
+ CallBase &CB, const std::vector<PromotionCandidate> &Candidates,
uint64_t &TotalCount) {
uint32_t NumPromoted = 0;
for (auto &C : Candidates) {
uint64_t Count = C.Count;
- pgo::promoteIndirectCall(Inst, C.TargetFunction, Count, TotalCount,
- SamplePGO, &ORE);
+ pgo::promoteIndirectCall(CB, C.TargetFunction, Count, TotalCount, SamplePGO,
+ &ORE);
assert(TotalCount >= Count);
TotalCount -= Count;
NumOfPGOICallPromotion++;
@@ -352,28 +350,28 @@ uint32_t ICallPromotionFunc::tryToPromote(
bool ICallPromotionFunc::processFunction(ProfileSummaryInfo *PSI) {
bool Changed = false;
ICallPromotionAnalysis ICallAnalysis;
- for (auto &I : findIndirectCalls(F)) {
+ for (auto *CB : findIndirectCalls(F)) {
uint32_t NumVals, NumCandidates;
uint64_t TotalCount;
auto ICallProfDataRef = ICallAnalysis.getPromotionCandidatesForInstruction(
- I, NumVals, TotalCount, NumCandidates);
+ CB, NumVals, TotalCount, NumCandidates);
if (!NumCandidates ||
(PSI && PSI->hasProfileSummary() && !PSI->isHotCount(TotalCount)))
continue;
auto PromotionCandidates = getPromotionCandidatesForCallSite(
- I, ICallProfDataRef, TotalCount, NumCandidates);
- uint32_t NumPromoted = tryToPromote(I, PromotionCandidates, TotalCount);
+ *CB, ICallProfDataRef, TotalCount, NumCandidates);
+ uint32_t NumPromoted = tryToPromote(*CB, PromotionCandidates, TotalCount);
if (NumPromoted == 0)
continue;
Changed = true;
// Adjust the MD.prof metadata. First delete the old one.
- I->setMetadata(LLVMContext::MD_prof, nullptr);
+ CB->setMetadata(LLVMContext::MD_prof, nullptr);
// If all promoted, we don't need the MD.prof metadata.
if (TotalCount == 0 || NumPromoted == NumVals)
continue;
// Otherwise we need update with the un-promoted records back.
- annotateValueSite(*M, *I, ICallProfDataRef.slice(NumPromoted), TotalCount,
+ annotateValueSite(*M, *CB, ICallProfDataRef.slice(NumPromoted), TotalCount,
IPVK_IndirectCallTarget, NumCandidates);
}
return Changed;
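
For context on the branch-weight math that the CallBase-based promoteIndirectCall() above keeps intact: profile counts are 64-bit, while MD_prof branch weights are 32-bit, so both the promoted count and the remaining count are divided by a common scale derived from the larger of the two. A rough sketch under that assumption; countScale and scaleCount are stand-ins for calculateCountScale and scaleBranchCount, whose exact rounding may differ.

    #include <algorithm>
    #include <cassert>
    #include <cstdint>
    #include <limits>

    static uint64_t countScale(uint64_t MaxCount) {
      const uint64_t Max32 = std::numeric_limits<uint32_t>::max();
      return MaxCount <= Max32 ? 1 : MaxCount / Max32 + 1;
    }

    static uint32_t scaleCount(uint64_t Count, uint64_t Scale) {
      return static_cast<uint32_t>(Count / Scale);
    }

    int main() {
      const uint64_t Total = 10'000'000'000ULL;  // profiled indirect-call count
      const uint64_t Direct = 9'000'000'000ULL;  // count promoted to the direct call
      const uint64_t Else = Total - Direct;
      const uint64_t Scale = countScale(std::max(Direct, Else));
      // Both weights now fit in uint32_t while keeping roughly the 9:1 ratio.
      assert(scaleCount(Direct, Scale) / scaleCount(Else, Scale) == 9);
      return 0;
    }
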
diff --git a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
index 518b8895e836..853385fbf863 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrOrderFile.cpp
@@ -11,7 +11,6 @@
#include "llvm/Transforms/Instrumentation/InstrOrderFile.h"
#include "llvm/ADT/Statistic.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
index 04c7e856b5d4..7b03bbfcdfe4 100644
--- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp
@@ -74,15 +74,16 @@ cl::opt<unsigned> MemOPSizeLarge(
namespace {
-cl::opt<bool> DoNameCompression("enable-name-compression",
- cl::desc("Enable name string compression"),
- cl::init(true));
-
cl::opt<bool> DoHashBasedCounterSplit(
"hash-based-counter-split",
cl::desc("Rename counter variable of a comdat function based on cfg hash"),
cl::init(true));
+cl::opt<bool> RuntimeCounterRelocation(
+ "runtime-counter-relocation",
+ cl::desc("Enable relocating counters at runtime."),
+ cl::init(false));
+
cl::opt<bool> ValueProfileStaticAlloc(
"vp-static-alloc",
cl::desc("Do static counter allocation for value profiler"),
@@ -109,6 +110,12 @@ cl::opt<bool> AtomicCounterUpdatePromoted(
" for promoted counters only"),
cl::init(false));
+cl::opt<bool> AtomicFirstCounter(
+ "atomic-first-counter", cl::ZeroOrMore,
+ cl::desc("Use atomic fetch add for first counter in a function (usually "
+ "the entry counter)"),
+ cl::init(false));
+
// If the option is not specified, the default behavior about whether
// counter promotion is done depends on how instrumentaiton lowering
// pipeline is setup, i.e., the default value of true of this option
@@ -151,7 +158,9 @@ public:
InstrProfilingLegacyPass() : ModulePass(ID) {}
InstrProfilingLegacyPass(const InstrProfOptions &Options, bool IsCS = false)
- : ModulePass(ID), InstrProf(Options, IsCS) {}
+ : ModulePass(ID), InstrProf(Options, IsCS) {
+ initializeInstrProfilingLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override {
return "Frontend instrumentation-based coverage lowering";
@@ -242,9 +251,14 @@ public:
: LoopToCandidates(LoopToCands), ExitBlocks(), InsertPts(), L(CurLoop),
LI(LI), BFI(BFI) {
+ // Skip collection of ExitBlocks and InsertPts for loops that will not be
+ // able to have counters promoted.
SmallVector<BasicBlock *, 8> LoopExitBlocks;
SmallPtrSet<BasicBlock *, 8> BlockSet;
+
L.getExitBlocks(LoopExitBlocks);
+ if (!isPromotionPossible(&L, LoopExitBlocks))
+ return;
for (BasicBlock *ExitBlock : LoopExitBlocks) {
if (BlockSet.insert(ExitBlock).second) {
@@ -313,21 +327,31 @@ private:
return true;
}
- // Returns the max number of Counter Promotions for LP.
- unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+ // Check whether the loop satisfies the basic conditions needed to perform
+ // Counter Promotions.
+ bool isPromotionPossible(Loop *LP,
+ const SmallVectorImpl<BasicBlock *> &LoopExitBlocks) {
// We can't insert into a catchswitch.
- SmallVector<BasicBlock *, 8> LoopExitBlocks;
- LP->getExitBlocks(LoopExitBlocks);
if (llvm::any_of(LoopExitBlocks, [](BasicBlock *Exit) {
return isa<CatchSwitchInst>(Exit->getTerminator());
}))
- return 0;
+ return false;
if (!LP->hasDedicatedExits())
- return 0;
+ return false;
BasicBlock *PH = LP->getLoopPreheader();
if (!PH)
+ return false;
+
+ return true;
+ }
+
+ // Returns the max number of Counter Promotions for LP.
+ unsigned getMaxNumOfPromotionsInLoop(Loop *LP) {
+ SmallVector<BasicBlock *, 8> LoopExitBlocks;
+ LP->getExitBlocks(LoopExitBlocks);
+ if (!isPromotionPossible(LP, LoopExitBlocks))
return 0;
SmallVector<BasicBlock *, 8> ExitingBlocks;
@@ -431,6 +455,13 @@ bool InstrProfiling::lowerIntrinsics(Function *F) {
return true;
}
+bool InstrProfiling::isRuntimeCounterRelocationEnabled() const {
+ if (RuntimeCounterRelocation.getNumOccurrences() > 0)
+ return RuntimeCounterRelocation;
+
+ return TT.isOSFuchsia();
+}
+
bool InstrProfiling::isCounterPromotionEnabled() const {
if (DoCounterPromotion.getNumOccurrences() > 0)
return DoCounterPromotion;
@@ -611,11 +642,19 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
llvm::InstrProfValueKind::IPVK_MemOPSize);
CallInst *Call = nullptr;
auto *TLI = &GetTLI(*Ind->getFunction());
+
+ // To support value profiling calls within Windows exception handlers, funclet
+ // information contained within operand bundles needs to be copied over to
+ // the library call. This is required for the IR to be processed by the
+ // WinEHPrepare pass.
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ Ind->getOperandBundlesAsDefs(OpBundles);
if (!IsRange) {
Value *Args[3] = {Ind->getTargetValue(),
Builder.CreateBitCast(DataVar, Builder.getInt8PtrTy()),
Builder.getInt32(Index)};
- Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args);
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI), Args,
+ OpBundles);
} else {
Value *Args[6] = {
Ind->getTargetValue(),
@@ -624,8 +663,8 @@ void InstrProfiling::lowerValueProfileInst(InstrProfValueProfileInst *Ind) {
Builder.getInt64(MemOPSizeRangeStart),
Builder.getInt64(MemOPSizeRangeLast),
Builder.getInt64(MemOPSizeLarge == 0 ? INT64_MIN : MemOPSizeLarge)};
- Call =
- Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true), Args);
+ Call = Builder.CreateCall(getOrInsertValueProfilingCall(*M, *TLI, true),
+ Args, OpBundles);
}
if (auto AK = TLI->getExtAttrForI32Param(false))
Call->addParamAttr(2, AK);
@@ -641,7 +680,30 @@ void InstrProfiling::lowerIncrement(InstrProfIncrementInst *Inc) {
Value *Addr = Builder.CreateConstInBoundsGEP2_64(Counters->getValueType(),
Counters, 0, Index);
- if (Options.Atomic || AtomicCounterUpdateAll) {
+ if (isRuntimeCounterRelocationEnabled()) {
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ Type *Int64PtrTy = Type::getInt64PtrTy(M->getContext());
+ Function *Fn = Inc->getParent()->getParent();
+ Instruction &I = Fn->getEntryBlock().front();
+ LoadInst *LI = dyn_cast<LoadInst>(&I);
+ if (!LI) {
+ IRBuilder<> Builder(&I);
+ Type *Int64Ty = Type::getInt64Ty(M->getContext());
+ GlobalVariable *Bias = M->getGlobalVariable(getInstrProfCounterBiasVarName());
+ if (!Bias) {
+ Bias = new GlobalVariable(*M, Int64Ty, false, GlobalValue::LinkOnceODRLinkage,
+ Constant::getNullValue(Int64Ty),
+ getInstrProfCounterBiasVarName());
+ Bias->setVisibility(GlobalVariable::HiddenVisibility);
+ }
+ LI = Builder.CreateLoad(Int64Ty, Bias);
+ }
+ auto *Add = Builder.CreateAdd(Builder.CreatePtrToInt(Addr, Int64Ty), LI);
+ Addr = Builder.CreateIntToPtr(Add, Int64PtrTy);
+ }
+
+ if (Options.Atomic || AtomicCounterUpdateAll ||
+ (Index == 0 && AtomicFirstCounter)) {
Builder.CreateAtomicRMW(AtomicRMWInst::Add, Addr, Inc->getStep(),
AtomicOrdering::Monotonic);
} else {
@@ -916,7 +978,7 @@ void InstrProfiling::emitNameData() {
std::string CompressedNameStr;
if (Error E = collectPGOFuncNameStrings(ReferencedNames, CompressedNameStr,
- DoNameCompression)) {
+ DoInstrProfNameCompression)) {
report_fatal_error(toString(std::move(E)), false);
}
@@ -932,7 +994,7 @@ void InstrProfiling::emitNameData() {
// On COFF, it's important to reduce the alignment down to 1 to prevent the
// linker from inserting padding before the start of the names section or
// between names entries.
- NamesVar->setAlignment(Align::None());
+ NamesVar->setAlignment(Align(1));
UsedVars.push_back(NamesVar);
for (auto *NamePtr : ReferencedNames)
@@ -979,9 +1041,9 @@ void InstrProfiling::emitRegistration() {
}
bool InstrProfiling::emitRuntimeHook() {
- // We expect the linker to be invoked with -u<hook_var> flag for linux,
- // for which case there is no need to emit the user function.
- if (TT.isOSLinux())
+ // We expect the linker to be invoked with -u<hook_var> flag for Linux or
+ // Fuchsia, in which case there is no need to emit the user function.
+ if (TT.isOSLinux() || TT.isOSFuchsia())
return false;
// If the module's provided its own runtime, we don't need to do anything.
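
The runtime-counter-relocation path added above amounts to one extra indirection per increment: a hidden bias global (named via getInstrProfCounterBiasVarName()) is loaded once in the function's entry block and added to the counter address before the store or atomic add. Conceptually, with invented names and none of the real IR plumbing:

    #include <cstdint>

    static uint64_t CounterBias = 0;  // stand-in for the hidden linkonce_odr bias global
    static uint64_t Counters[8];      // stand-in for a function's counters

    // Not the generated IR, just its effect: the runtime can publish a byte
    // offset in the bias global, and every increment applies it, so counters
    // can be serviced from a runtime-mapped region (e.g. on Fuchsia, where
    // the pass enables this by default).
    static void incrementCounter(uint64_t Index, uint64_t Step) {
      auto *Addr = reinterpret_cast<uint64_t *>(
          reinterpret_cast<uintptr_t>(&Counters[Index]) + CounterBias);
      *Addr += Step;  // becomes an atomic RMW when atomic updates are requested
    }

    int main() { incrementCounter(0, 1); return 0; }
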
diff --git a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
index a6c2c9b464b6..ad238f1357c6 100644
--- a/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/Instrumentation.cpp
@@ -68,8 +68,8 @@ GlobalVariable *llvm::createPrivateGlobalForString(Module &M, StringRef Str,
GlobalValue::PrivateLinkage, StrConst, NamePrefix);
if (AllowMerging)
GV->setUnnamedAddr(GlobalValue::UnnamedAddr::Global);
- GV->setAlignment(Align::None()); // Strings may not be merged w/o setting
- // alignment explicitly.
+ GV->setAlignment(Align(1)); // Strings may not be merged w/o setting
+ // alignment explicitly.
return GV;
}
@@ -78,7 +78,7 @@ Comdat *llvm::GetOrCreateFunctionComdat(Function &F, Triple &T,
if (auto Comdat = F.getComdat()) return Comdat;
assert(F.hasName());
Module *M = F.getParent();
- std::string Name = F.getName();
+ std::string Name = std::string(F.getName());
// Make a unique comdat name for internal linkage things on ELF. On COFF, the
// name of the comdat group identifies the leader symbol of the comdat group.
@@ -112,6 +112,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) {
initializePGOInstrumentationUseLegacyPassPass(Registry);
initializePGOIndirectCallPromotionLegacyPassPass(Registry);
initializePGOMemOPSizeOptLegacyPassPass(Registry);
+ initializeCGProfileLegacyPassPass(Registry);
initializeInstrOrderFileLegacyPassPass(Registry);
initializeInstrProfilingLegacyPassPass(Registry);
initializeMemorySanitizerLegacyPassPass(Registry);
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 80acab307578..fcf7f470b3e1 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -62,7 +62,7 @@
///
/// Origins are meaningless for fully initialized values, so MemorySanitizer
/// avoids storing origin to memory when a fully initialized value is stored.
-/// This way it avoids needless overwritting origin of the 4-byte region on
+/// This way it avoids needlessly overwriting the origin of the 4-byte region on
/// a short (i.e. 1 byte) clean store, and it is also good for performance.
///
/// Atomic handling.
@@ -137,6 +137,9 @@
///
/// KernelMemorySanitizer only supports X86_64 at the moment.
///
+//
+// FIXME: This sanitizer does not yet handle scalable vectors
+//
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Instrumentation/MemorySanitizer.h"
@@ -153,7 +156,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/CallingConv.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -282,6 +284,11 @@ static cl::opt<bool> ClCheckAccessAddress("msan-check-access-address",
cl::desc("report accesses through a pointer which has poisoned shadow"),
cl::Hidden, cl::init(true));
+static cl::opt<bool> ClEagerChecks(
+ "msan-eager-checks",
+ cl::desc("check arguments and return values at function call boundaries"),
+ cl::Hidden, cl::init(false));
+
static cl::opt<bool> ClDumpStrictInstructions("msan-dump-strict-instructions",
cl::desc("print out instructions with default strict semantics"),
cl::Hidden, cl::init(false));
@@ -392,6 +399,14 @@ static const MemoryMapParams Linux_PowerPC64_MemoryMapParams = {
0x1C0000000000, // OriginBase
};
+// s390x Linux
+static const MemoryMapParams Linux_S390X_MemoryMapParams = {
+ 0xC00000000000, // AndMask
+ 0, // XorMask (not used)
+ 0x080000000000, // ShadowBase
+ 0x1C0000000000, // OriginBase
+};
+
// aarch64 Linux
static const MemoryMapParams Linux_AArch64_MemoryMapParams = {
0, // AndMask (not used)
@@ -439,6 +454,11 @@ static const PlatformMemoryMapParams Linux_PowerPC_MemoryMapParams = {
&Linux_PowerPC64_MemoryMapParams,
};
+static const PlatformMemoryMapParams Linux_S390_MemoryMapParams = {
+ nullptr,
+ &Linux_S390X_MemoryMapParams,
+};
+
static const PlatformMemoryMapParams Linux_ARM_MemoryMapParams = {
nullptr,
&Linux_AArch64_MemoryMapParams,
@@ -484,6 +504,7 @@ private:
friend struct VarArgMIPS64Helper;
friend struct VarArgAArch64Helper;
friend struct VarArgPowerPC64Helper;
+ friend struct VarArgSystemZHelper;
void initializeModule(Module &M);
void initializeCallbacks(Module &M);
@@ -530,10 +551,6 @@ private:
/// (x86_64-specific).
Value *VAArgOverflowSizeTLS;
- /// Thread-local space used to pass origin value to the UMR reporting
- /// function.
- Value *OriginTLS;
-
/// Are the instrumentation callbacks set up?
bool CallbacksInitialized = false;
@@ -586,9 +603,6 @@ private:
/// Branch weights for origin store.
MDNode *OriginStoreWeights;
-
- /// An empty volatile inline asm that prevents callback merge.
- InlineAsm *EmptyAsm;
};
void insertModuleCtor(Module &M) {
@@ -611,13 +625,15 @@ void insertModuleCtor(Module &M) {
/// A legacy function pass for msan instrumentation.
///
-/// Instruments functions to detect unitialized reads.
+/// Instruments functions to detect uninitialized reads.
struct MemorySanitizerLegacyPass : public FunctionPass {
// Pass identification, replacement for typeid.
static char ID;
MemorySanitizerLegacyPass(MemorySanitizerOptions Options = {})
- : FunctionPass(ID), Options(Options) {}
+ : FunctionPass(ID), Options(Options) {
+ initializeMemorySanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override { return "MemorySanitizerLegacyPass"; }
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -700,10 +716,7 @@ void MemorySanitizer::createKernelApi(Module &M) {
VAArgTLS = nullptr;
VAArgOriginTLS = nullptr;
VAArgOverflowSizeTLS = nullptr;
- // OriginTLS is unused in the kernel.
- OriginTLS = nullptr;
- // __msan_warning() in the kernel takes an origin.
WarningFn = M.getOrInsertFunction("__msan_warning", IRB.getVoidTy(),
IRB.getInt32Ty());
// Requests the per-task context state (kmsan_context_state*) from the
@@ -758,12 +771,14 @@ static Constant *getOrInsertGlobal(Module &M, StringRef Name, Type *Ty) {
/// Insert declarations for userspace-specific functions and globals.
void MemorySanitizer::createUserspaceApi(Module &M) {
IRBuilder<> IRB(*C);
+
// Create the callback.
// FIXME: this function should have "Cold" calling conv,
// which is not yet implemented.
- StringRef WarningFnName = Recover ? "__msan_warning"
- : "__msan_warning_noreturn";
- WarningFn = M.getOrInsertFunction(WarningFnName, IRB.getVoidTy());
+ StringRef WarningFnName = Recover ? "__msan_warning_with_origin"
+ : "__msan_warning_with_origin_noreturn";
+ WarningFn =
+ M.getOrInsertFunction(WarningFnName, IRB.getVoidTy(), IRB.getInt32Ty());
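// Note: the origin id is now passed directly as the i32 argument of the
// warning entry point, replacing the __msan_origin_tls side channel that the
// rest of this patch removes.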
// Create the global TLS variables.
RetvalTLS =
@@ -790,20 +805,30 @@ void MemorySanitizer::createUserspaceApi(Module &M) {
VAArgOverflowSizeTLS =
getOrInsertGlobal(M, "__msan_va_arg_overflow_size_tls", IRB.getInt64Ty());
- OriginTLS = getOrInsertGlobal(M, "__msan_origin_tls", IRB.getInt32Ty());
for (size_t AccessSizeIndex = 0; AccessSizeIndex < kNumberOfAccessSizes;
AccessSizeIndex++) {
unsigned AccessSize = 1 << AccessSizeIndex;
std::string FunctionName = "__msan_maybe_warning_" + itostr(AccessSize);
+ SmallVector<std::pair<unsigned, Attribute>, 2> MaybeWarningFnAttrs;
+ MaybeWarningFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
+ MaybeWarningFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex + 1, Attribute::get(*C, Attribute::ZExt)));
MaybeWarningFn[AccessSizeIndex] = M.getOrInsertFunction(
- FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
- IRB.getInt32Ty());
+ FunctionName, AttributeList::get(*C, MaybeWarningFnAttrs),
+ IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt32Ty());
FunctionName = "__msan_maybe_store_origin_" + itostr(AccessSize);
+ SmallVector<std::pair<unsigned, Attribute>, 2> MaybeStoreOriginFnAttrs;
+ MaybeStoreOriginFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex, Attribute::get(*C, Attribute::ZExt)));
+ MaybeStoreOriginFnAttrs.push_back(std::make_pair(
+ AttributeList::FirstArgIndex + 2, Attribute::get(*C, Attribute::ZExt)));
MaybeStoreOriginFn[AccessSizeIndex] = M.getOrInsertFunction(
- FunctionName, IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8),
- IRB.getInt8PtrTy(), IRB.getInt32Ty());
+ FunctionName, AttributeList::get(*C, MaybeStoreOriginFnAttrs),
+ IRB.getVoidTy(), IRB.getIntNTy(AccessSize * 8), IRB.getInt8PtrTy(),
+ IRB.getInt32Ty());
}
MsanSetAllocaOrigin4Fn = M.getOrInsertFunction(
@@ -834,10 +859,6 @@ void MemorySanitizer::initializeCallbacks(Module &M) {
MemsetFn = M.getOrInsertFunction(
"__msan_memset", IRB.getInt8PtrTy(), IRB.getInt8PtrTy(), IRB.getInt32Ty(),
IntptrTy);
- // We insert an empty inline asm after __msan_report* to avoid callback merge.
- EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
- StringRef(""), StringRef(""),
- /*hasSideEffects=*/true);
MsanInstrumentAsmStoreFn =
M.getOrInsertFunction("__msan_instrument_asm_store", IRB.getVoidTy(),
@@ -924,6 +945,9 @@ void MemorySanitizer::initializeModule(Module &M) {
case Triple::ppc64le:
MapParams = Linux_PowerPC_MemoryMapParams.bits64;
break;
+ case Triple::systemz:
+ MapParams = Linux_S390_MemoryMapParams.bits64;
+ break;
case Triple::aarch64:
case Triple::aarch64_be:
MapParams = Linux_ARM_MemoryMapParams.bits64;
@@ -982,8 +1006,8 @@ namespace {
struct VarArgHelper {
virtual ~VarArgHelper() = default;
- /// Visit a CallSite.
- virtual void visitCallSite(CallSite &CS, IRBuilder<> &IRB) = 0;
+ /// Visit a CallBase.
+ virtual void visitCallBase(CallBase &CB, IRBuilder<> &IRB) = 0;
/// Visit a va_start call.
virtual void visitVAStartInst(VAStartInst &I) = 0;
@@ -1028,12 +1052,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
BasicBlock *ActualFnStart;
// The following flags disable parts of MSan instrumentation based on
- // blacklist contents and command-line options.
+ // exclusion list contents and command-line options.
bool InsertChecks;
bool PropagateShadow;
bool PoisonStack;
bool PoisonUndef;
- bool CheckReturnValue;
struct ShadowOriginAndInsertPoint {
Value *Shadow;
@@ -1057,9 +1080,6 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
PropagateShadow = SanitizeFunction;
PoisonStack = SanitizeFunction && ClPoisonStack;
PoisonUndef = SanitizeFunction && ClPoisonUndef;
- // FIXME: Consider using SpecialCaseList to specify a list of functions that
- // must always return fully initialized values. For now, we hardcode "main".
- CheckReturnValue = SanitizeFunction && (F.getName() == "main");
MS.initializeCallbacks(*F.getParent());
if (MS.CompileKernel)
@@ -1090,7 +1110,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void paintOrigin(IRBuilder<> &IRB, Value *Origin, Value *OriginPtr,
unsigned Size, Align Alignment) {
const DataLayout &DL = F.getParent()->getDataLayout();
- const Align IntptrAlignment = Align(DL.getABITypeAlignment(MS.IntptrTy));
+ const Align IntptrAlignment = DL.getABITypeAlign(MS.IntptrTy);
unsigned IntptrSize = DL.getTypeStoreSize(MS.IntptrTy);
assert(IntptrAlignment >= kMinOriginAlignment);
assert(IntptrSize >= kOriginSize);
@@ -1104,7 +1124,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
for (unsigned i = 0; i < Size / IntptrSize; ++i) {
Value *Ptr = i ? IRB.CreateConstGEP1_32(MS.IntptrTy, IntptrOriginPtr, i)
: IntptrOriginPtr;
- IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment.value());
+ IRB.CreateAlignedStore(IntptrOrigin, Ptr, CurrentAlignment);
Ofs += IntptrSize / kOriginSize;
CurrentAlignment = IntptrAlignment;
}
@@ -1113,7 +1133,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
for (unsigned i = Ofs; i < (Size + kOriginSize - 1) / kOriginSize; ++i) {
Value *GEP =
i ? IRB.CreateConstGEP1_32(MS.OriginTy, OriginPtr, i) : OriginPtr;
- IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment.value());
+ IRB.CreateAlignedStore(Origin, GEP, CurrentAlignment);
CurrentAlignment = kMinOriginAlignment;
}
}
@@ -1170,8 +1190,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
std::tie(ShadowPtr, OriginPtr) =
getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ true);
- StoreInst *NewSI =
- IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment.value());
+ StoreInst *NewSI = IRB.CreateAlignedStore(Shadow, ShadowPtr, Alignment);
LLVM_DEBUG(dbgs() << " STORE: " << *NewSI << "\n");
(void)NewSI;
@@ -1188,15 +1207,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
void insertWarningFn(IRBuilder<> &IRB, Value *Origin) {
if (!Origin)
Origin = (Value *)IRB.getInt32(0);
- if (MS.CompileKernel) {
- IRB.CreateCall(MS.WarningFn, Origin);
- } else {
- if (MS.TrackOrigins) {
- IRB.CreateStore(Origin, MS.OriginTLS);
- }
- IRB.CreateCall(MS.WarningFn, {});
- }
- IRB.CreateCall(MS.EmptyAsm, {});
+ assert(Origin->getType()->isIntegerTy());
+ IRB.CreateCall(MS.WarningFn, Origin)->setCannotMerge();
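// Note: setCannotMerge() takes over the job of the deleted EmptyAsm trick,
// keeping the backend from merging distinct warning call sites.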
// FIXME: Insert UnreachableInst if !MS.Recover?
// This may invalidate some of the following checks and needs to be done
// at the very end.
@@ -1346,8 +1358,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
const DataLayout &DL = F.getParent()->getDataLayout();
if (VectorType *VT = dyn_cast<VectorType>(OrigTy)) {
uint32_t EltSize = DL.getTypeSizeInBits(VT->getElementType());
- return VectorType::get(IntegerType::get(*MS.C, EltSize),
- VT->getNumElements());
+ return FixedVectorType::get(IntegerType::get(*MS.C, EltSize),
+ cast<FixedVectorType>(VT)->getNumElements());
}
if (ArrayType *AT = dyn_cast<ArrayType>(OrigTy)) {
return ArrayType::get(getShadowTy(AT->getElementType()),
@@ -1368,7 +1380,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// Flatten a vector type.
Type *getShadowTyNoVec(Type *ty) {
if (VectorType *vt = dyn_cast<VectorType>(ty))
- return IntegerType::get(*MS.C, vt->getBitWidth());
+ return IntegerType::get(*MS.C,
+ vt->getPrimitiveSizeInBits().getFixedSize());
return ty;
}
@@ -1606,20 +1619,28 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
LLVM_DEBUG(dbgs() << "Arg is not sized\n");
continue;
}
+
+ bool FArgByVal = FArg.hasByValAttr();
+ bool FArgNoUndef = FArg.hasAttribute(Attribute::NoUndef);
+ bool FArgEagerCheck = ClEagerChecks && !FArgByVal && FArgNoUndef;
unsigned Size =
FArg.hasByValAttr()
- ? DL.getTypeAllocSize(FArg.getType()->getPointerElementType())
+ ? DL.getTypeAllocSize(FArg.getParamByValType())
: DL.getTypeAllocSize(FArg.getType());
+
if (A == &FArg) {
bool Overflow = ArgOffset + Size > kParamTLSSize;
- Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
- if (FArg.hasByValAttr()) {
+ if (FArgEagerCheck) {
+ *ShadowPtr = getCleanShadow(V);
+ setOrigin(A, getCleanOrigin());
+ continue;
+ } else if (FArgByVal) {
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
// ByVal pointer itself has clean shadow. We copy the actual
// argument shadow to the underlying memory.
// Figure out maximal valid memcpy alignment.
const Align ArgAlign = DL.getValueOrABITypeAlignment(
- MaybeAlign(FArg.getParamAlignment()),
- A->getType()->getPointerElementType());
+ MaybeAlign(FArg.getParamAlignment()), FArg.getParamByValType());
Value *CpShadowPtr =
getShadowOriginPtr(V, EntryIRB, EntryIRB.getInt8Ty(), ArgAlign,
/*isStore*/ true)
@@ -1639,12 +1660,14 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
*ShadowPtr = getCleanShadow(V);
} else {
+ // Shadow is passed over TLS.
+ Value *Base = getShadowPtrForArgument(&FArg, EntryIRB, ArgOffset);
if (Overflow) {
// ParamTLS overflow.
*ShadowPtr = getCleanShadow(V);
} else {
- *ShadowPtr = EntryIRB.CreateAlignedLoad(
- getShadowTy(&FArg), Base, kShadowTLSAlignment.value());
+ *ShadowPtr = EntryIRB.CreateAlignedLoad(getShadowTy(&FArg), Base,
+ kShadowTLSAlignment);
}
}
LLVM_DEBUG(dbgs()
@@ -1657,7 +1680,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(A, getCleanOrigin());
}
}
- ArgOffset += alignTo(Size, kShadowTLSAlignment);
+
+ if (!FArgEagerCheck)
+ ArgOffset += alignTo(Size, kShadowTLSAlignment);
}
assert(*ShadowPtr && "Could not find shadow for an argument");
return *ShadowPtr;
@@ -1783,8 +1808,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (PropagateShadow) {
std::tie(ShadowPtr, OriginPtr) =
getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr,
- Alignment.value(), "_msld"));
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
} else {
setShadow(&I, getCleanShadow(&I));
}
@@ -1798,8 +1823,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (MS.TrackOrigins) {
if (PropagateShadow) {
const Align OriginAlignment = std::max(kMinOriginAlignment, Alignment);
- setOrigin(&I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr,
- OriginAlignment.value()));
+ setOrigin(
+ &I, IRB.CreateAlignedLoad(MS.OriginTy, OriginPtr, OriginAlignment));
} else {
setOrigin(&I, getCleanOrigin());
}
@@ -1821,7 +1846,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Addr = I.getOperand(0);
- Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align::None(),
+ Value *ShadowPtr = getShadowOriginPtr(Addr, IRB, I.getType(), Align(1),
/*isStore*/ true)
.first;
@@ -1868,10 +1893,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
void visitShuffleVectorInst(ShuffleVectorInst &I) {
- insertShadowCheck(I.getOperand(2), &I);
IRBuilder<> IRB(&I);
setShadow(&I, IRB.CreateShuffleVector(getShadow(&I, 0), getShadow(&I, 1),
- I.getOperand(2), "_msprop"));
+ I.getShuffleMask(), "_msprop"));
setOriginForNaryOp(I);
}
@@ -2070,9 +2094,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
size_t VectorOrPrimitiveTypeSizeInBits(Type *Ty) {
assert(!(Ty->isVectorTy() && Ty->getScalarType()->isPointerTy()) &&
"Vector of pointers is not a valid shadow type");
- return Ty->isVectorTy() ?
- Ty->getVectorNumElements() * Ty->getScalarSizeInBits() :
- Ty->getPrimitiveSizeInBits();
+ return Ty->isVectorTy() ? cast<FixedVectorType>(Ty)->getNumElements() *
+ Ty->getScalarSizeInBits()
+ : Ty->getPrimitiveSizeInBits();
}
/// Cast between two shadow types, extending or truncating as
@@ -2088,7 +2112,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (dstTy->isIntegerTy() && srcTy->isIntegerTy())
return IRB.CreateIntCast(V, dstTy, Signed);
if (dstTy->isVectorTy() && srcTy->isVectorTy() &&
- dstTy->getVectorNumElements() == srcTy->getVectorNumElements())
+ cast<FixedVectorType>(dstTy)->getNumElements() ==
+ cast<FixedVectorType>(srcTy)->getNumElements())
return IRB.CreateIntCast(V, dstTy, Signed);
Value *V1 = IRB.CreateBitCast(V, Type::getIntNTy(*MS.C, srcSizeInBits));
Value *V2 =
@@ -2132,9 +2157,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *OtherArg) {
Constant *ShadowMul;
Type *Ty = ConstArg->getType();
- if (Ty->isVectorTy()) {
- unsigned NumElements = Ty->getVectorNumElements();
- Type *EltTy = Ty->getSequentialElementType();
+ if (auto *VTy = dyn_cast<VectorType>(Ty)) {
+ unsigned NumElements = cast<FixedVectorType>(VTy)->getNumElements();
+ Type *EltTy = VTy->getElementType();
SmallVector<Constant *, 16> Elements;
for (unsigned Idx = 0; Idx < NumElements; ++Idx) {
if (ConstantInt *Elt =
@@ -2454,8 +2479,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// We don't know the pointer alignment (could be unaligned SSE store!).
// Have to assume the worst case.
std::tie(ShadowPtr, OriginPtr) = getShadowOriginPtr(
- Addr, IRB, Shadow->getType(), Align::None(), /*isStore*/ true);
- IRB.CreateAlignedStore(Shadow, ShadowPtr, 1);
+ Addr, IRB, Shadow->getType(), Align(1), /*isStore*/ true);
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, Align(1));
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
@@ -2478,11 +2503,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (PropagateShadow) {
// We don't know the pointer alignment (could be unaligned SSE load!).
// Have to assume the worst case.
- const Align Alignment = Align::None();
+ const Align Alignment = Align(1);
std::tie(ShadowPtr, OriginPtr) =
getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I, IRB.CreateAlignedLoad(ShadowTy, ShadowPtr,
- Alignment.value(), "_msld"));
+ setShadow(&I,
+ IRB.CreateAlignedLoad(ShadowTy, ShadowPtr, Alignment, "_msld"));
} else {
setShadow(&I, getCleanShadow(&I));
}
@@ -2534,7 +2559,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// The main purpose of this code is to do something reasonable with all
/// random intrinsics we might encounter, most importantly - SIMD intrinsics.
/// We recognize several classes of intrinsics by their argument types and
- /// ModRefBehaviour and apply special intrumentation when we are reasonably
+ /// ModRefBehaviour and apply special instrumentation when we are reasonably
/// sure that we know what the intrinsic does.
///
/// We special-case intrinsics where this approach fails. See llvm.bswap
@@ -2595,7 +2620,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOrigin(&I, getOrigin(Op));
}
- // Instrument vector convert instrinsic.
+ // Instrument vector convert intrinsic.
//
// This function instruments intrinsics like cvtsi2ss:
// %Out = int_xxx_cvtyyy(%ConvertOp)
@@ -2659,7 +2684,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
assert(CopyOp->getType() == I.getType());
assert(CopyOp->getType()->isVectorTy());
Value *ResultShadow = getShadow(CopyOp);
- Type *EltTy = ResultShadow->getType()->getVectorElementType();
+ Type *EltTy = cast<VectorType>(ResultShadow->getType())->getElementType();
for (int i = 0; i < NumUsedElements; ++i) {
ResultShadow = IRB.CreateInsertElement(
ResultShadow, ConstantInt::getNullValue(EltTy),
@@ -2698,7 +2723,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
return IRB.CreateSExt(S2, T);
}
- // Instrument vector shift instrinsic.
+ // Instrument vector shift intrinsic.
//
// This function instruments intrinsics like int_x86_avx2_psll_w.
// Intrinsic shifts %In by %ShiftSize bits.
@@ -2716,7 +2741,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
: Lower64ShadowExtend(IRB, S2, getShadowTy(&I));
Value *V1 = I.getOperand(0);
Value *V2 = I.getOperand(1);
- Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledValue(),
+ Value *Shift = IRB.CreateCall(I.getFunctionType(), I.getCalledOperand(),
{IRB.CreateBitCast(S1, V1->getType()), V2});
Shift = IRB.CreateBitCast(Shift, getShadowTy(&I));
setShadow(&I, IRB.CreateOr(Shift, S2Conv));
@@ -2728,8 +2753,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
const unsigned X86_MMXSizeInBits = 64;
assert(EltSizeInBits != 0 && (X86_MMXSizeInBits % EltSizeInBits) == 0 &&
"Illegal MMX vector element size");
- return VectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
- X86_MMXSizeInBits / EltSizeInBits);
+ return FixedVectorType::get(IntegerType::get(*MS.C, EltSizeInBits),
+ X86_MMXSizeInBits / EltSizeInBits);
}
// Returns a signed counterpart for an (un)signed-saturate-and-pack
@@ -2763,7 +2788,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- // Instrument vector pack instrinsic.
+ // Instrument vector pack intrinsic.
//
// This function instruments intrinsics like x86_mmx_packsswb, that
// packs elements of 2 input vectors into half as many bits with saturation.
@@ -2806,7 +2831,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
- // Instrument sum-of-absolute-differencies intrinsic.
+ // Instrument sum-of-absolute-differences intrinsic.
void handleVectorSadIntrinsic(IntrinsicInst &I) {
const unsigned SignificantBitsPerResultElement = 16;
bool isX86_MMX = I.getOperand(0)->getType()->isX86_MMXTy();
@@ -2864,13 +2889,56 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ // Instrument generic vector reduction intrinsics
+ // by ORing together all their fields.
+ void handleVectorReduceIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *S = IRB.CreateOrReduce(getShadow(&I, 0));
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ // Instrument experimental.vector.reduce.or intrinsic.
+ // Valid (non-poisoned) set bits in the operand pull low the
+ // corresponding shadow bits.
+ void handleVectorReduceOrIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *OperandShadow = getShadow(&I, 0);
+ Value *OperandUnsetBits = IRB.CreateNot(I.getOperand(0));
+ Value *OperandUnsetOrPoison = IRB.CreateOr(OperandUnsetBits, OperandShadow);
+ // Bit N is clean if any field's bit N is 1 and unpoisoned
+ Value *OutShadowMask = IRB.CreateAndReduce(OperandUnsetOrPoison);
+ // Otherwise, it is clean if every field's bit N is unpoisoned
+ Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
+ Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
+
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
+
+ // Instrument experimental.vector.reduce.and intrinsic.
+ // Valid (non-poisoned) unset bits in the operand pull down the
+ // corresponding shadow bits.
+ void handleVectorReduceAndIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *OperandShadow = getShadow(&I, 0);
+ Value *OperandSetOrPoison = IRB.CreateOr(I.getOperand(0), OperandShadow);
+ // Bit N is clean if any field's bit N is 0 and unpoisoned
+ Value *OutShadowMask = IRB.CreateAndReduce(OperandSetOrPoison);
+ // Otherwise, it is clean if every field's bit N is unpoisoned
+ Value *OrShadow = IRB.CreateOrReduce(OperandShadow);
+ Value *S = IRB.CreateAnd(OutShadowMask, OrShadow);
+
+ setShadow(&I, S);
+ setOrigin(&I, getOrigin(&I, 0));
+ }
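A minimal scalar model of the reduce.or/reduce.and shadow rule above (an illustrative sketch, not part of the patch; all names below are invented for the example):

#include <cstdint>
#include <cstdio>

// Shadow bit 1 == poisoned. For reduce.or, an output bit is clean when some
// lane has that bit both set and clean, or when every lane has that bit clean.
// The reduce.and rule is the same with (V | S) in place of (~V | S).
static uint8_t reduceOrShadow(uint8_t V0, uint8_t S0, uint8_t V1, uint8_t S1) {
  uint8_t OutShadowMask = (~V0 | S0) & (~V1 | S1); // and-reduce of (~V | S)
  uint8_t OrShadow = S0 | S1;                      // or-reduce of the shadows
  return OutShadowMask & OrShadow;
}

int main() {
  // Lane 0: value 0x01, fully initialized. Lane 1: bit 0 poisoned.
  // Bit 0 of the reduction is known to be 1, so its shadow is clean (0).
  printf("0x%02x\n", reduceOrShadow(0x01, 0x00, 0x00, 0x01)); // prints 0x00
  return 0;
}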
+
void handleStmxcsr(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value* Addr = I.getArgOperand(0);
Type *Ty = IRB.getInt32Ty();
Value *ShadowPtr =
- getShadowOriginPtr(Addr, IRB, Ty, Align::None(), /*isStore*/ true)
- .first;
+ getShadowOriginPtr(Addr, IRB, Ty, Align(1), /*isStore*/ true).first;
IRB.CreateStore(getCleanShadow(Ty),
IRB.CreatePointerCast(ShadowPtr, Ty->getPointerTo()));
@@ -2885,7 +2953,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *Addr = I.getArgOperand(0);
Type *Ty = IRB.getInt32Ty();
- const Align Alignment = Align::None();
+ const Align Alignment = Align(1);
Value *ShadowPtr, *OriginPtr;
std::tie(ShadowPtr, OriginPtr) =
getShadowOriginPtr(Addr, IRB, Ty, Alignment, /*isStore*/ false);
@@ -2893,8 +2961,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (ClCheckAccessAddress)
insertShadowCheck(Addr, &I);
- Value *Shadow =
- IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment.value(), "_ldmxcsr");
+ Value *Shadow = IRB.CreateAlignedLoad(Ty, ShadowPtr, Alignment, "_ldmxcsr");
Value *Origin = MS.TrackOrigins ? IRB.CreateLoad(MS.OriginTy, OriginPtr)
: getCleanOrigin();
insertShadowCheck(Shadow, Origin, &I);
@@ -2904,7 +2971,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
IRBuilder<> IRB(&I);
Value *V = I.getArgOperand(0);
Value *Addr = I.getArgOperand(1);
- const MaybeAlign Alignment(
+ const Align Alignment(
cast<ConstantInt>(I.getArgOperand(2))->getZExtValue());
Value *Mask = I.getArgOperand(3);
Value *Shadow = getShadow(V);
@@ -2921,21 +2988,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
insertShadowCheck(Mask, &I);
}
- IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment ? Alignment->value() : 0,
- Mask);
+ IRB.CreateMaskedStore(Shadow, ShadowPtr, Alignment, Mask);
if (MS.TrackOrigins) {
auto &DL = F.getParent()->getDataLayout();
paintOrigin(IRB, getOrigin(V), OriginPtr,
DL.getTypeStoreSize(Shadow->getType()),
- llvm::max(Alignment, kMinOriginAlignment));
+ std::max(Alignment, kMinOriginAlignment));
}
}
bool handleMaskedLoad(IntrinsicInst &I) {
IRBuilder<> IRB(&I);
Value *Addr = I.getArgOperand(0);
- const MaybeAlign Alignment(
+ const Align Alignment(
cast<ConstantInt>(I.getArgOperand(1))->getZExtValue());
Value *Mask = I.getArgOperand(2);
Value *PassThru = I.getArgOperand(3);
@@ -2945,9 +3011,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
if (PropagateShadow) {
std::tie(ShadowPtr, OriginPtr) =
getShadowOriginPtr(Addr, IRB, ShadowTy, Alignment, /*isStore*/ false);
- setShadow(&I, IRB.CreateMaskedLoad(
- ShadowPtr, Alignment ? Alignment->value() : 0, Mask,
- getShadow(PassThru), "_msmaskedld"));
+ setShadow(&I, IRB.CreateMaskedLoad(ShadowPtr, Alignment, Mask,
+ getShadow(PassThru), "_msmaskedld"));
} else {
setShadow(&I, getCleanShadow(&I));
}
@@ -2965,8 +3030,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Value *Acc = IRB.CreateExtractElement(
MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), 0));
- for (int i = 1, N = PassThru->getType()->getVectorNumElements(); i < N;
- ++i) {
+ for (int i = 1, N = cast<FixedVectorType>(PassThru->getType())
+ ->getNumElements();
+ i < N; ++i) {
Value *More = IRB.CreateExtractElement(
MaskedPassThruShadow, ConstantInt::get(IRB.getInt32Ty(), i));
Acc = IRB.CreateOr(Acc, More);
@@ -3005,6 +3071,68 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
setOriginForNaryOp(I);
}
+ SmallVector<int, 8> getPclmulMask(unsigned Width, bool OddElements) {
+ SmallVector<int, 8> Mask;
+ for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) {
+ Mask.append(2, X);
+ }
+ return Mask;
+ }
+
+ // Instrument pclmul intrinsics.
+ // These intrinsics operate either on odd or on even elements of the input
+ // vectors, depending on the constant in the 3rd argument, ignoring the rest.
+ // Replace the unused elements with copies of the used ones, e.g.:
+ // (0, 1, 2, 3) -> (0, 0, 2, 2) (even case)
+ // or
+ // (0, 1, 2, 3) -> (1, 1, 3, 3) (odd case)
+ // and then apply the usual shadow combining logic.
+ void handlePclmulIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Type *ShadowTy = getShadowTy(&I);
+ unsigned Width =
+ cast<FixedVectorType>(I.getArgOperand(0)->getType())->getNumElements();
+ assert(isa<ConstantInt>(I.getArgOperand(2)) &&
+ "pclmul 3rd operand must be a constant");
+ unsigned Imm = cast<ConstantInt>(I.getArgOperand(2))->getZExtValue();
+ Value *Shuf0 =
+ IRB.CreateShuffleVector(getShadow(&I, 0), UndefValue::get(ShadowTy),
+ getPclmulMask(Width, Imm & 0x01));
+ Value *Shuf1 =
+ IRB.CreateShuffleVector(getShadow(&I, 1), UndefValue::get(ShadowTy),
+ getPclmulMask(Width, Imm & 0x10));
+ ShadowAndOriginCombiner SOC(this, IRB);
+ SOC.Add(Shuf0, getOrigin(&I, 0));
+ SOC.Add(Shuf1, getOrigin(&I, 1));
+ SOC.Done(&I);
+ }
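A standalone sketch of the replication mask getPclmulMask() builds (illustrative only, not part of the patch):

#include <cstdio>
#include <vector>

// Mirrors getPclmulMask above: keep only the even (or odd) lanes and
// duplicate each of them, so the unused lanes never contribute shadow.
static std::vector<int> pclmulMask(unsigned Width, bool OddElements) {
  std::vector<int> Mask;
  for (unsigned X = OddElements ? 1 : 0; X < Width; X += 2) {
    Mask.push_back(X);
    Mask.push_back(X);
  }
  return Mask;
}

int main() {
  for (int M : pclmulMask(4, /*OddElements=*/false))
    printf("%d ", M); // prints: 0 0 2 2
  printf("\n");
  for (int M : pclmulMask(4, /*OddElements=*/true))
    printf("%d ", M); // prints: 1 1 3 3
  printf("\n");
  return 0;
}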
+
+ // Instrument _mm_*_sd intrinsics
+ void handleUnarySdIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *First = getShadow(&I, 0);
+ Value *Second = getShadow(&I, 1);
+ // High word of first operand, low word of second
+ Value *Shadow =
+ IRB.CreateShuffleVector(First, Second, llvm::makeArrayRef<int>({2, 1}));
+
+ setShadow(&I, Shadow);
+ setOriginForNaryOp(I);
+ }
+
+ void handleBinarySdIntrinsic(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *First = getShadow(&I, 0);
+ Value *Second = getShadow(&I, 1);
+ Value *OrShadow = IRB.CreateOr(First, Second);
+ // High word of first operand, low word of both OR'd together
+ Value *Shadow = IRB.CreateShuffleVector(First, OrShadow,
+ llvm::makeArrayRef<int>({2, 1}));
+
+ setShadow(&I, Shadow);
+ setOriginForNaryOp(I);
+ }
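// Note on the {2, 1} mask used by both *_sd handlers above: the shuffle
// indexes the concatenation <First[0], First[1], Second[0], Second[1]>, so
// index 2 selects the low lane of the second operand and index 1 the high
// lane of the first, matching the scalar-double (_sd) semantics where only
// the low lane is computed and the high lane passes through.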
+
void visitIntrinsicInst(IntrinsicInst &I) {
switch (I.getIntrinsicID()) {
case Intrinsic::lifetime_start:
@@ -3023,6 +3151,17 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
case Intrinsic::masked_load:
handleMaskedLoad(I);
break;
+ case Intrinsic::experimental_vector_reduce_and:
+ handleVectorReduceAndIntrinsic(I);
+ break;
+ case Intrinsic::experimental_vector_reduce_or:
+ handleVectorReduceOrIntrinsic(I);
+ break;
+ case Intrinsic::experimental_vector_reduce_add:
+ case Intrinsic::experimental_vector_reduce_xor:
+ case Intrinsic::experimental_vector_reduce_mul:
+ handleVectorReduceIntrinsic(I);
+ break;
case Intrinsic::x86_sse_stmxcsr:
handleStmxcsr(I);
break;
@@ -3238,6 +3377,20 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
handleBmiIntrinsic(I);
break;
+ case Intrinsic::x86_pclmulqdq:
+ case Intrinsic::x86_pclmulqdq_256:
+ case Intrinsic::x86_pclmulqdq_512:
+ handlePclmulIntrinsic(I);
+ break;
+
+ case Intrinsic::x86_sse41_round_sd:
+ handleUnarySdIntrinsic(I);
+ break;
+ case Intrinsic::x86_sse2_max_sd:
+ case Intrinsic::x86_sse2_min_sd:
+ handleBinarySdIntrinsic(I);
+ break;
+
case Intrinsic::is_constant:
// The result of llvm.is.constant() is always defined.
setShadow(&I, getCleanShadow(&I));
@@ -3251,25 +3404,21 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
}
- void visitCallSite(CallSite CS) {
- Instruction &I = *CS.getInstruction();
- assert(!I.getMetadata("nosanitize"));
- assert((CS.isCall() || CS.isInvoke() || CS.isCallBr()) &&
- "Unknown type of CallSite");
- if (CS.isCallBr() || (CS.isCall() && cast<CallInst>(&I)->isInlineAsm())) {
+ void visitCallBase(CallBase &CB) {
+ assert(!CB.getMetadata("nosanitize"));
+ if (CB.isInlineAsm()) {
// For inline asm (either a call to asm function, or callbr instruction),
// do the usual thing: check argument shadow and mark all outputs as
// clean. Note that any side effects of the inline asm that are not
// immediately visible in its constraints are not handled.
if (ClHandleAsmConservative && MS.CompileKernel)
- visitAsmInstruction(I);
+ visitAsmInstruction(CB);
else
- visitInstruction(I);
+ visitInstruction(CB);
return;
}
- if (CS.isCall()) {
- CallInst *Call = cast<CallInst>(&I);
- assert(!isa<IntrinsicInst>(&I) && "intrinsics are handled elsewhere");
+ if (auto *Call = dyn_cast<CallInst>(&CB)) {
+ assert(!isa<IntrinsicInst>(Call) && "intrinsics are handled elsewhere");
// We are going to insert code that relies on the fact that the callee
// will become a non-readonly function after it is instrumented by us. To
@@ -3288,16 +3437,16 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
maybeMarkSanitizerLibraryCallNoBuiltin(Call, TLI);
}
- IRBuilder<> IRB(&I);
+ IRBuilder<> IRB(&CB);
unsigned ArgOffset = 0;
- LLVM_DEBUG(dbgs() << " CallSite: " << I << "\n");
- for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
- ArgIt != End; ++ArgIt) {
+ LLVM_DEBUG(dbgs() << " CallSite: " << CB << "\n");
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
Value *A = *ArgIt;
- unsigned i = ArgIt - CS.arg_begin();
+ unsigned i = ArgIt - CB.arg_begin();
if (!A->getType()->isSized()) {
- LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << I << "\n");
+ LLVM_DEBUG(dbgs() << "Arg " << i << " is not sized: " << CB << "\n");
continue;
}
unsigned Size = 0;
@@ -3311,12 +3460,23 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
<< " Shadow: " << *ArgShadow << "\n");
bool ArgIsInitialized = false;
const DataLayout &DL = F.getParent()->getDataLayout();
- if (CS.paramHasAttr(i, Attribute::ByVal)) {
+
+ bool ByVal = CB.paramHasAttr(i, Attribute::ByVal);
+ bool NoUndef = CB.paramHasAttr(i, Attribute::NoUndef);
+ bool EagerCheck = ClEagerChecks && !ByVal && NoUndef;
+
+ if (EagerCheck) {
+ insertShadowCheck(A, &CB);
+ continue;
+ }
+ if (ByVal) {
+ // ByVal arguments need special handling: the pointee may be too large for
+ // a single load, so its shadow is copied into the parameter TLS via memcpy.
assert(A->getType()->isPointerTy() &&
"ByVal argument is not a pointer!");
- Size = DL.getTypeAllocSize(A->getType()->getPointerElementType());
+ Size = DL.getTypeAllocSize(CB.getParamByValType(i));
if (ArgOffset + Size > kParamTLSSize) break;
- const MaybeAlign ParamAlignment(CS.getParamAlignment(i));
+ const MaybeAlign ParamAlignment(CB.getParamAlign(i));
MaybeAlign Alignment = llvm::None;
if (ParamAlignment)
Alignment = std::min(*ParamAlignment, kShadowTLSAlignment);
@@ -3329,10 +3489,11 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
Alignment, Size);
// TODO(glider): need to copy origins.
} else {
+ // For any other parameter, store its full shadow so uninitialized bits are tracked precisely.
Size = DL.getTypeAllocSize(A->getType());
if (ArgOffset + Size > kParamTLSSize) break;
Store = IRB.CreateAlignedStore(ArgShadow, ArgShadowBase,
- kShadowTLSAlignment.value());
+ kShadowTLSAlignment);
Constant *Cst = dyn_cast<Constant>(ArgShadow);
if (Cst && Cst->isNullValue()) ArgIsInitialized = true;
}
@@ -3346,32 +3507,41 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
LLVM_DEBUG(dbgs() << " done with call args\n");
- FunctionType *FT = CS.getFunctionType();
+ FunctionType *FT = CB.getFunctionType();
if (FT->isVarArg()) {
- VAHelper->visitCallSite(CS, IRB);
+ VAHelper->visitCallBase(CB, IRB);
}
// Now, get the shadow for the RetVal.
- if (!I.getType()->isSized()) return;
+ if (!CB.getType()->isSized())
+ return;
// Don't emit the epilogue for musttail call returns.
- if (CS.isCall() && cast<CallInst>(&I)->isMustTailCall()) return;
- IRBuilder<> IRBBefore(&I);
+ if (isa<CallInst>(CB) && cast<CallInst>(CB).isMustTailCall())
+ return;
+
+ if (ClEagerChecks && CB.hasRetAttr(Attribute::NoUndef)) {
+ setShadow(&CB, getCleanShadow(&CB));
+ setOrigin(&CB, getCleanOrigin());
+ return;
+ }
+
+ IRBuilder<> IRBBefore(&CB);
// Until we have full dynamic coverage, make sure the retval shadow is 0.
- Value *Base = getShadowPtrForRetval(&I, IRBBefore);
- IRBBefore.CreateAlignedStore(getCleanShadow(&I), Base,
- kShadowTLSAlignment.value());
+ Value *Base = getShadowPtrForRetval(&CB, IRBBefore);
+ IRBBefore.CreateAlignedStore(getCleanShadow(&CB), Base,
+ kShadowTLSAlignment);
BasicBlock::iterator NextInsn;
- if (CS.isCall()) {
- NextInsn = ++I.getIterator();
- assert(NextInsn != I.getParent()->end());
+ if (isa<CallInst>(CB)) {
+ NextInsn = ++CB.getIterator();
+ assert(NextInsn != CB.getParent()->end());
} else {
- BasicBlock *NormalDest = cast<InvokeInst>(&I)->getNormalDest();
+ BasicBlock *NormalDest = cast<InvokeInst>(CB).getNormalDest();
if (!NormalDest->getSinglePredecessor()) {
// FIXME: this case is tricky, so we are just conservative here.
// Perhaps we need to split the edge between this BB and NormalDest,
// but a naive attempt to use SplitEdge leads to a crash.
- setShadow(&I, getCleanShadow(&I));
- setOrigin(&I, getCleanOrigin());
+ setShadow(&CB, getCleanShadow(&CB));
+ setOrigin(&CB, getCleanOrigin());
return;
}
// FIXME: NextInsn is likely in a basic block that has not been visited yet.
@@ -3382,12 +3552,12 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
}
IRBuilder<> IRBAfter(&*NextInsn);
Value *RetvalShadow = IRBAfter.CreateAlignedLoad(
- getShadowTy(&I), getShadowPtrForRetval(&I, IRBAfter),
- kShadowTLSAlignment.value(), "_msret");
- setShadow(&I, RetvalShadow);
+ getShadowTy(&CB), getShadowPtrForRetval(&CB, IRBAfter),
+ kShadowTLSAlignment, "_msret");
+ setShadow(&CB, RetvalShadow);
if (MS.TrackOrigins)
- setOrigin(&I, IRBAfter.CreateLoad(MS.OriginTy,
- getOriginPtrForRetval(IRBAfter)));
+ setOrigin(&CB, IRBAfter.CreateLoad(MS.OriginTy,
+ getOriginPtrForRetval(IRBAfter)));
}
bool isAMustTailRetVal(Value *RetVal) {
@@ -3407,14 +3577,26 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
// Don't emit the epilogue for musttail call returns.
if (isAMustTailRetVal(RetVal)) return;
Value *ShadowPtr = getShadowPtrForRetval(RetVal, IRB);
- if (CheckReturnValue) {
+ bool HasNoUndef =
+ F.hasAttribute(AttributeList::ReturnIndex, Attribute::NoUndef);
+ bool StoreShadow = !(ClEagerChecks && HasNoUndef);
+ // FIXME: Consider using SpecialCaseList to specify a list of functions that
+ // must always return fully initialized values. For now, we hardcode "main".
+ bool EagerCheck = (ClEagerChecks && HasNoUndef) || (F.getName() == "main");
+
+ Value *Shadow = getShadow(RetVal);
+ bool StoreOrigin = true;
+ if (EagerCheck) {
insertShadowCheck(RetVal, &I);
- Value *Shadow = getCleanShadow(RetVal);
- IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value());
- } else {
- Value *Shadow = getShadow(RetVal);
- IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment.value());
- if (MS.TrackOrigins)
+ Shadow = getCleanShadow(RetVal);
+ StoreOrigin = false;
+ }
+
+ // Even when the return value is checked eagerly, the caller may still
+ // expect its shadow to be passed over TLS.
+ if (StoreShadow) {
+ IRB.CreateAlignedStore(Shadow, ShadowPtr, kShadowTLSAlignment);
+ if (MS.TrackOrigins && StoreOrigin)
IRB.CreateStore(getOrigin(RetVal), getOriginPtrForRetval(IRB));
}
}
@@ -3455,7 +3637,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
} else {
Value *ShadowBase, *OriginBase;
std::tie(ShadowBase, OriginBase) = getShadowOriginPtr(
- &I, IRB, IRB.getInt8Ty(), Align::None(), /*isStore*/ true);
+ &I, IRB, IRB.getInt8Ty(), Align(1), /*isStore*/ true);
Value *PoisonValue = IRB.getInt8(PoisonStack ? ClPoisonStackPattern : 0);
IRB.CreateMemSet(ShadowBase, PoisonValue, Len,
@@ -3697,7 +3879,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
const DataLayout &DL = F.getParent()->getDataLayout();
CallBase *CB = cast<CallBase>(&I);
IRBuilder<> IRB(&I);
- InlineAsm *IA = cast<InlineAsm>(CB->getCalledValue());
+ InlineAsm *IA = cast<InlineAsm>(CB->getCalledOperand());
int OutputArgs = getNumOutputArgs(IA, CB);
// The last operand of a CallInst is the function itself.
int NumOperands = CB->getNumOperands() - 1;
@@ -3738,7 +3920,7 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
/// AMD64-specific implementation of VarArgHelper.
struct VarArgAMD64Helper : public VarArgHelper {
// An unfortunate workaround for asymmetric lowering of va_arg stuff.
- // See a comment in visitCallSite for more details.
+ // See a comment in visitCallBase for more details.
static const unsigned AMD64GpEndOffset = 48; // AMD64 ABI Draft 0.99.6 p3.5.7
static const unsigned AMD64FpEndOffsetSSE = 176;
// If SSE is disabled, fp_offset in va_list is zero.
@@ -3790,17 +3972,17 @@ struct VarArgAMD64Helper : public VarArgHelper {
// would have been to associate each live instance of va_list with a copy of
// MSanParamTLS, and extract shadow on va_arg() call in the argument list
// order.
- void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
unsigned GpOffset = 0;
unsigned FpOffset = AMD64GpEndOffset;
unsigned OverflowOffset = AMD64FpEndOffset;
const DataLayout &DL = F.getParent()->getDataLayout();
- for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
- ArgIt != End; ++ArgIt) {
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
Value *A = *ArgIt;
- unsigned ArgNo = CS.getArgumentNo(ArgIt);
- bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
- bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
if (IsByVal) {
// ByVal arguments always go to the overflow area.
// Fixed arguments passed through the overflow area will be stepped
@@ -3808,7 +3990,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
if (IsFixed)
continue;
assert(A->getType()->isPointerTy());
- Type *RealTy = A->getType()->getPointerElementType();
+ Type *RealTy = CB.getParamByValType(ArgNo);
uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
Value *ShadowBase = getShadowPtrForVAArgument(
RealTy, IRB, OverflowOffset, alignTo(ArgSize, 8));
@@ -3871,7 +4053,7 @@ struct VarArgAMD64Helper : public VarArgHelper {
if (!ShadowBase)
continue;
Value *Shadow = MSV.getShadow(A);
- IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment.value());
+ IRB.CreateAlignedStore(Shadow, ShadowBase, kShadowTLSAlignment);
if (MS.TrackOrigins) {
Value *Origin = MSV.getOrigin(A);
unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
@@ -4020,11 +4202,11 @@ struct VarArgMIPS64Helper : public VarArgHelper {
VarArgMIPS64Helper(Function &F, MemorySanitizer &MS,
MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
- void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
unsigned VAArgOffset = 0;
const DataLayout &DL = F.getParent()->getDataLayout();
- for (CallSite::arg_iterator ArgIt = CS.arg_begin() +
- CS.getFunctionType()->getNumParams(), End = CS.arg_end();
+ for (auto ArgIt = CB.arg_begin() + CB.getFunctionType()->getNumParams(),
+ End = CB.arg_end();
ArgIt != End; ++ArgIt) {
Triple TargetTriple(F.getParent()->getTargetTriple());
Value *A = *ArgIt;
@@ -4041,8 +4223,7 @@ struct VarArgMIPS64Helper : public VarArgHelper {
VAArgOffset = alignTo(VAArgOffset, 8);
if (!Base)
continue;
- IRB.CreateAlignedStore(MSV.getShadow(A), Base,
- kShadowTLSAlignment.value());
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
}
Constant *TotalVAArgSize = ConstantInt::get(IRB.getInt64Ty(), VAArgOffset);
@@ -4170,17 +4351,17 @@ struct VarArgAArch64Helper : public VarArgHelper {
// the remaining arguments.
// Using constant offset within the va_arg TLS array allows fast copy
// in the finalize instrumentation.
- void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
unsigned GrOffset = AArch64GrBegOffset;
unsigned VrOffset = AArch64VrBegOffset;
unsigned OverflowOffset = AArch64VAEndOffset;
const DataLayout &DL = F.getParent()->getDataLayout();
- for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
- ArgIt != End; ++ArgIt) {
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
Value *A = *ArgIt;
- unsigned ArgNo = CS.getArgumentNo(ArgIt);
- bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
ArgKind AK = classifyArgument(A);
if (AK == AK_GeneralPurpose && GrOffset >= AArch64GrEndOffset)
AK = AK_Memory;
@@ -4213,8 +4394,7 @@ struct VarArgAArch64Helper : public VarArgHelper {
continue;
if (!Base)
continue;
- IRB.CreateAlignedStore(MSV.getShadow(A), Base,
- kShadowTLSAlignment.value());
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
}
Constant *OverflowSize =
ConstantInt::get(IRB.getInt64Ty(), OverflowOffset - AArch64VAEndOffset);
@@ -4310,7 +4490,7 @@ struct VarArgAArch64Helper : public VarArgHelper {
// for 128-bit FP/SIMD vn-v7).
// We need then to propagate the shadow arguments on both regions
// 'va::__gr_top + va::__gr_offs' and 'va::__vr_top + va::__vr_offs'.
- // The remaning arguments are saved on shadow for 'va::stack'.
+ // The remaining arguments are saved on shadow for 'va::stack'.
// One caveat is it requires only to propagate the non-named arguments,
// however on the call site instrumentation 'all' the arguments are
// saved. So to copy the shadow values from the va_arg TLS array
@@ -4400,7 +4580,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
VarArgPowerPC64Helper(Function &F, MemorySanitizer &MS,
MemorySanitizerVisitor &MSV) : F(F), MS(MS), MSV(MSV) {}
- void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
// For PowerPC, we need to deal with alignment of stack arguments -
// they are mostly aligned to 8 bytes, but vectors and i128 arrays
// are aligned to 16 bytes, byvals can be aligned to 8 or 16 bytes,
@@ -4411,7 +4591,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Triple TargetTriple(F.getParent()->getTargetTriple());
// Parameter save area starts at 48 bytes from frame pointer for ABIv1,
// and 32 bytes for ABIv2. This is usually determined by target
- // endianness, but in theory could be overriden by function attribute.
+ // endianness, but in theory could be overridden by function attribute.
// For simplicity, we ignore it here (it'd only matter for QPX vectors).
if (TargetTriple.getArch() == Triple::ppc64)
VAArgBase = 48;
@@ -4419,19 +4599,19 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
VAArgBase = 32;
unsigned VAArgOffset = VAArgBase;
const DataLayout &DL = F.getParent()->getDataLayout();
- for (CallSite::arg_iterator ArgIt = CS.arg_begin(), End = CS.arg_end();
- ArgIt != End; ++ArgIt) {
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
Value *A = *ArgIt;
- unsigned ArgNo = CS.getArgumentNo(ArgIt);
- bool IsFixed = ArgNo < CS.getFunctionType()->getNumParams();
- bool IsByVal = CS.paramHasAttr(ArgNo, Attribute::ByVal);
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ bool IsByVal = CB.paramHasAttr(ArgNo, Attribute::ByVal);
if (IsByVal) {
assert(A->getType()->isPointerTy());
- Type *RealTy = A->getType()->getPointerElementType();
+ Type *RealTy = CB.getParamByValType(ArgNo);
uint64_t ArgSize = DL.getTypeAllocSize(RealTy);
- uint64_t ArgAlign = CS.getParamAlignment(ArgNo);
- if (ArgAlign < 8)
- ArgAlign = 8;
+ MaybeAlign ArgAlign = CB.getParamAlign(ArgNo);
+ if (!ArgAlign || *ArgAlign < Align(8))
+ ArgAlign = Align(8);
VAArgOffset = alignTo(VAArgOffset, ArgAlign);
if (!IsFixed) {
Value *Base = getShadowPtrForVAArgument(
@@ -4474,8 +4654,7 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
Base = getShadowPtrForVAArgument(A->getType(), IRB,
VAArgOffset - VAArgBase, ArgSize);
if (Base)
- IRB.CreateAlignedStore(MSV.getShadow(A), Base,
- kShadowTLSAlignment.value());
+ IRB.CreateAlignedStore(MSV.getShadow(A), Base, kShadowTLSAlignment);
}
VAArgOffset += ArgSize;
VAArgOffset = alignTo(VAArgOffset, 8);
@@ -4566,12 +4745,324 @@ struct VarArgPowerPC64Helper : public VarArgHelper {
}
};
+/// SystemZ-specific implementation of VarArgHelper.
+struct VarArgSystemZHelper : public VarArgHelper {
+ static const unsigned SystemZGpOffset = 16;
+ static const unsigned SystemZGpEndOffset = 56;
+ static const unsigned SystemZFpOffset = 128;
+ static const unsigned SystemZFpEndOffset = 160;
+ static const unsigned SystemZMaxVrArgs = 8;
+ static const unsigned SystemZRegSaveAreaSize = 160;
+ static const unsigned SystemZOverflowOffset = 160;
+ static const unsigned SystemZVAListTagSize = 32;
+ static const unsigned SystemZOverflowArgAreaPtrOffset = 16;
+ static const unsigned SystemZRegSaveAreaPtrOffset = 24;
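// These constants correspond to the s390x ELF ABI va_list layout: a 32-byte
// tag { __gpr, __fpr, __overflow_arg_area, __reg_save_area }, with the
// overflow-area pointer at byte 16 and the register-save-area pointer at
// byte 24, which copyOverflowArea() and copyRegSaveArea() below dereference.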
+
+ Function &F;
+ MemorySanitizer &MS;
+ MemorySanitizerVisitor &MSV;
+ Value *VAArgTLSCopy = nullptr;
+ Value *VAArgTLSOriginCopy = nullptr;
+ Value *VAArgOverflowSize = nullptr;
+
+ SmallVector<CallInst *, 16> VAStartInstrumentationList;
+
+ enum class ArgKind {
+ GeneralPurpose,
+ FloatingPoint,
+ Vector,
+ Memory,
+ Indirect,
+ };
+
+ enum class ShadowExtension { None, Zero, Sign };
+
+ VarArgSystemZHelper(Function &F, MemorySanitizer &MS,
+ MemorySanitizerVisitor &MSV)
+ : F(F), MS(MS), MSV(MSV) {}
+
+ ArgKind classifyArgument(Type *T, bool IsSoftFloatABI) {
+ // T is a SystemZABIInfo::classifyArgumentType() output, and there are
+ // only a few possibilities of what it can be. In particular, enums, single
+ // element structs and large types have already been taken care of.
+
+ // Some i128 and fp128 arguments are converted to pointers only in the
+ // back end.
+ if (T->isIntegerTy(128) || T->isFP128Ty())
+ return ArgKind::Indirect;
+ if (T->isFloatingPointTy())
+ return IsSoftFloatABI ? ArgKind::GeneralPurpose : ArgKind::FloatingPoint;
+ if (T->isIntegerTy() || T->isPointerTy())
+ return ArgKind::GeneralPurpose;
+ if (T->isVectorTy())
+ return ArgKind::Vector;
+ return ArgKind::Memory;
+ }
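// For example, under this classification an i64 or a pointer lands in
// ArgKind::GeneralPurpose, a double in ArgKind::FloatingPoint (or
// GeneralPurpose under soft-float), a <2 x i64> in ArgKind::Vector, and an
// i128 or fp128 is ArgKind::Indirect and handled as a pointer in a GPR by
// visitCallBase() below.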
+
+ ShadowExtension getShadowExtension(const CallBase &CB, unsigned ArgNo) {
+ // ABI says: "One of the simple integer types no more than 64 bits wide.
+ // ... If such an argument is shorter than 64 bits, replace it by a full
+ // 64-bit integer representing the same number, using sign or zero
+ // extension". Shadow for an integer argument has the same type as the
+ // argument itself, so it can be sign or zero extended as well.
+ bool ZExt = CB.paramHasAttr(ArgNo, Attribute::ZExt);
+ bool SExt = CB.paramHasAttr(ArgNo, Attribute::SExt);
+ if (ZExt) {
+ assert(!SExt);
+ return ShadowExtension::Zero;
+ }
+ if (SExt) {
+ assert(!ZExt);
+ return ShadowExtension::Sign;
+ }
+ return ShadowExtension::None;
+ }
+
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {
+ bool IsSoftFloatABI = CB.getCalledFunction()
+ ->getFnAttribute("use-soft-float")
+ .getValueAsString() == "true";
+ unsigned GpOffset = SystemZGpOffset;
+ unsigned FpOffset = SystemZFpOffset;
+ unsigned VrIndex = 0;
+ unsigned OverflowOffset = SystemZOverflowOffset;
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ for (auto ArgIt = CB.arg_begin(), End = CB.arg_end(); ArgIt != End;
+ ++ArgIt) {
+ Value *A = *ArgIt;
+ unsigned ArgNo = CB.getArgOperandNo(ArgIt);
+ bool IsFixed = ArgNo < CB.getFunctionType()->getNumParams();
+ // SystemZABIInfo does not produce ByVal parameters.
+ assert(!CB.paramHasAttr(ArgNo, Attribute::ByVal));
+ Type *T = A->getType();
+ ArgKind AK = classifyArgument(T, IsSoftFloatABI);
+ if (AK == ArgKind::Indirect) {
+ T = PointerType::get(T, 0);
+ AK = ArgKind::GeneralPurpose;
+ }
+ if (AK == ArgKind::GeneralPurpose && GpOffset >= SystemZGpEndOffset)
+ AK = ArgKind::Memory;
+ if (AK == ArgKind::FloatingPoint && FpOffset >= SystemZFpEndOffset)
+ AK = ArgKind::Memory;
+ if (AK == ArgKind::Vector && (VrIndex >= SystemZMaxVrArgs || !IsFixed))
+ AK = ArgKind::Memory;
+ Value *ShadowBase = nullptr;
+ Value *OriginBase = nullptr;
+ ShadowExtension SE = ShadowExtension::None;
+ switch (AK) {
+ case ArgKind::GeneralPurpose: {
+ // Always keep track of GpOffset, but store shadow only for varargs.
+ uint64_t ArgSize = 8;
+ if (GpOffset + ArgSize <= kParamTLSSize) {
+ if (!IsFixed) {
+ SE = getShadowExtension(CB, ArgNo);
+ uint64_t GapSize = 0;
+ if (SE == ShadowExtension::None) {
+ uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
+ assert(ArgAllocSize <= ArgSize);
+ GapSize = ArgSize - ArgAllocSize;
+ }
+ ShadowBase = getShadowAddrForVAArgument(IRB, GpOffset + GapSize);
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(IRB, GpOffset + GapSize);
+ }
+ GpOffset += ArgSize;
+ } else {
+ GpOffset = kParamTLSSize;
+ }
+ break;
+ }
+ case ArgKind::FloatingPoint: {
+ // Always keep track of FpOffset, but store shadow only for varargs.
+ uint64_t ArgSize = 8;
+ if (FpOffset + ArgSize <= kParamTLSSize) {
+ if (!IsFixed) {
+ // PoP says: "A short floating-point datum requires only the
+ // left-most 32 bit positions of a floating-point register".
+ // Therefore, in contrast to ArgKind::GeneralPurpose and ArgKind::Memory,
+ // don't extend shadow and don't mind the gap.
+ ShadowBase = getShadowAddrForVAArgument(IRB, FpOffset);
+ if (MS.TrackOrigins)
+ OriginBase = getOriginPtrForVAArgument(IRB, FpOffset);
+ }
+ FpOffset += ArgSize;
+ } else {
+ FpOffset = kParamTLSSize;
+ }
+ break;
+ }
+ case ArgKind::Vector: {
+ // Keep track of VrIndex. No need to store shadow, since vector varargs
+ // go through ArgKind::Memory.
+ assert(IsFixed);
+ VrIndex++;
+ break;
+ }
+ case ArgKind::Memory: {
+ // Keep track of OverflowOffset and store shadow only for varargs.
+ // Ignore fixed args, since we need to copy only the vararg portion of
+ // the overflow area shadow.
+ if (!IsFixed) {
+ uint64_t ArgAllocSize = DL.getTypeAllocSize(T);
+ uint64_t ArgSize = alignTo(ArgAllocSize, 8);
+ if (OverflowOffset + ArgSize <= kParamTLSSize) {
+ SE = getShadowExtension(CB, ArgNo);
+ uint64_t GapSize =
+ SE == ShadowExtension::None ? ArgSize - ArgAllocSize : 0;
+ ShadowBase =
+ getShadowAddrForVAArgument(IRB, OverflowOffset + GapSize);
+ if (MS.TrackOrigins)
+ OriginBase =
+ getOriginPtrForVAArgument(IRB, OverflowOffset + GapSize);
+ OverflowOffset += ArgSize;
+ } else {
+ OverflowOffset = kParamTLSSize;
+ }
+ }
+ break;
+ }
+ case ArgKind::Indirect:
+ llvm_unreachable("Indirect must be converted to GeneralPurpose");
+ }
+ if (ShadowBase == nullptr)
+ continue;
+ Value *Shadow = MSV.getShadow(A);
+ if (SE != ShadowExtension::None)
+ Shadow = MSV.CreateShadowCast(IRB, Shadow, IRB.getInt64Ty(),
+ /*Signed*/ SE == ShadowExtension::Sign);
+ ShadowBase = IRB.CreateIntToPtr(
+ ShadowBase, PointerType::get(Shadow->getType(), 0), "_msarg_va_s");
+ IRB.CreateStore(Shadow, ShadowBase);
+ if (MS.TrackOrigins) {
+ Value *Origin = MSV.getOrigin(A);
+ unsigned StoreSize = DL.getTypeStoreSize(Shadow->getType());
+ MSV.paintOrigin(IRB, Origin, OriginBase, StoreSize,
+ kMinOriginAlignment);
+ }
+ }
+ Constant *OverflowSize = ConstantInt::get(
+ IRB.getInt64Ty(), OverflowOffset - SystemZOverflowOffset);
+ IRB.CreateStore(OverflowSize, MS.VAArgOverflowSizeTLS);
+ }
+
+ Value *getShadowAddrForVAArgument(IRBuilder<> &IRB, unsigned ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgTLS, MS.IntptrTy);
+ return IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ }
+
+ Value *getOriginPtrForVAArgument(IRBuilder<> &IRB, int ArgOffset) {
+ Value *Base = IRB.CreatePointerCast(MS.VAArgOriginTLS, MS.IntptrTy);
+ Base = IRB.CreateAdd(Base, ConstantInt::get(MS.IntptrTy, ArgOffset));
+ return IRB.CreateIntToPtr(Base, PointerType::get(MS.OriginTy, 0),
+ "_msarg_va_o");
+ }
+
+ void unpoisonVAListTagForInst(IntrinsicInst &I) {
+ IRBuilder<> IRB(&I);
+ Value *VAListTag = I.getArgOperand(0);
+ Value *ShadowPtr, *OriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(ShadowPtr, OriginPtr) =
+ MSV.getShadowOriginPtr(VAListTag, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+ IRB.CreateMemSet(ShadowPtr, Constant::getNullValue(IRB.getInt8Ty()),
+ SystemZVAListTagSize, Alignment, false);
+ }
+
+ void visitVAStartInst(VAStartInst &I) override {
+ VAStartInstrumentationList.push_back(&I);
+ unpoisonVAListTagForInst(I);
+ }
+
+ void visitVACopyInst(VACopyInst &I) override { unpoisonVAListTagForInst(I); }
+
+ void copyRegSaveArea(IRBuilder<> &IRB, Value *VAListTag) {
+ Type *RegSaveAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *RegSaveAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(
+ IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, SystemZRegSaveAreaPtrOffset)),
+ PointerType::get(RegSaveAreaPtrTy, 0));
+ Value *RegSaveAreaPtr = IRB.CreateLoad(RegSaveAreaPtrTy, RegSaveAreaPtrPtr);
+ Value *RegSaveAreaShadowPtr, *RegSaveAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(RegSaveAreaShadowPtr, RegSaveAreaOriginPtr) =
+ MSV.getShadowOriginPtr(RegSaveAreaPtr, IRB, IRB.getInt8Ty(), Alignment,
+ /*isStore*/ true);
+ // TODO(iii): copy only fragments filled by visitCallBase()
+ IRB.CreateMemCpy(RegSaveAreaShadowPtr, Alignment, VAArgTLSCopy, Alignment,
+ SystemZRegSaveAreaSize);
+ if (MS.TrackOrigins)
+ IRB.CreateMemCpy(RegSaveAreaOriginPtr, Alignment, VAArgTLSOriginCopy,
+ Alignment, SystemZRegSaveAreaSize);
+ }
+
+ void copyOverflowArea(IRBuilder<> &IRB, Value *VAListTag) {
+ Type *OverflowArgAreaPtrTy = Type::getInt64PtrTy(*MS.C);
+ Value *OverflowArgAreaPtrPtr = IRB.CreateIntToPtr(
+ IRB.CreateAdd(
+ IRB.CreatePtrToInt(VAListTag, MS.IntptrTy),
+ ConstantInt::get(MS.IntptrTy, SystemZOverflowArgAreaPtrOffset)),
+ PointerType::get(OverflowArgAreaPtrTy, 0));
+ Value *OverflowArgAreaPtr =
+ IRB.CreateLoad(OverflowArgAreaPtrTy, OverflowArgAreaPtrPtr);
+ Value *OverflowArgAreaShadowPtr, *OverflowArgAreaOriginPtr;
+ const Align Alignment = Align(8);
+ std::tie(OverflowArgAreaShadowPtr, OverflowArgAreaOriginPtr) =
+ MSV.getShadowOriginPtr(OverflowArgAreaPtr, IRB, IRB.getInt8Ty(),
+ Alignment, /*isStore*/ true);
+ Value *SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSCopy,
+ SystemZOverflowOffset);
+ IRB.CreateMemCpy(OverflowArgAreaShadowPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ if (MS.TrackOrigins) {
+ SrcPtr = IRB.CreateConstGEP1_32(IRB.getInt8Ty(), VAArgTLSOriginCopy,
+ SystemZOverflowOffset);
+ IRB.CreateMemCpy(OverflowArgAreaOriginPtr, Alignment, SrcPtr, Alignment,
+ VAArgOverflowSize);
+ }
+ }
+
+ void finalizeInstrumentation() override {
+ assert(!VAArgOverflowSize && !VAArgTLSCopy &&
+ "finalizeInstrumentation called twice");
+ if (!VAStartInstrumentationList.empty()) {
+ // If there is a va_start in this function, make a backup copy of
+ // va_arg_tls somewhere in the function entry block.
+ IRBuilder<> IRB(MSV.ActualFnStart->getFirstNonPHI());
+ VAArgOverflowSize =
+ IRB.CreateLoad(IRB.getInt64Ty(), MS.VAArgOverflowSizeTLS);
+ Value *CopySize =
+ IRB.CreateAdd(ConstantInt::get(MS.IntptrTy, SystemZOverflowOffset),
+ VAArgOverflowSize);
+ VAArgTLSCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSCopy, Align(8), MS.VAArgTLS, Align(8), CopySize);
+ if (MS.TrackOrigins) {
+ VAArgTLSOriginCopy = IRB.CreateAlloca(Type::getInt8Ty(*MS.C), CopySize);
+ IRB.CreateMemCpy(VAArgTLSOriginCopy, Align(8), MS.VAArgOriginTLS,
+ Align(8), CopySize);
+ }
+ }
+
+ // Instrument va_start.
+ // Copy va_list shadow from the backup copy of the TLS contents.
+ for (size_t VaStartNo = 0, VaStartNum = VAStartInstrumentationList.size();
+ VaStartNo < VaStartNum; VaStartNo++) {
+ CallInst *OrigInst = VAStartInstrumentationList[VaStartNo];
+ IRBuilder<> IRB(OrigInst->getNextNode());
+ Value *VAListTag = OrigInst->getArgOperand(0);
+ copyRegSaveArea(IRB, VAListTag);
+ copyOverflowArea(IRB, VAListTag);
+ }
+ }
+};
+
/// A no-op implementation of VarArgHelper.
struct VarArgNoOpHelper : public VarArgHelper {
VarArgNoOpHelper(Function &F, MemorySanitizer &MS,
MemorySanitizerVisitor &MSV) {}
- void visitCallSite(CallSite &CS, IRBuilder<> &IRB) override {}
+ void visitCallBase(CallBase &CB, IRBuilder<> &IRB) override {}
void visitVAStartInst(VAStartInst &I) override {}
@@ -4596,6 +5087,8 @@ static VarArgHelper *CreateVarArgHelper(Function &Func, MemorySanitizer &Msan,
else if (TargetTriple.getArch() == Triple::ppc64 ||
TargetTriple.getArch() == Triple::ppc64le)
return new VarArgPowerPC64Helper(Func, Msan, Visitor);
+ else if (TargetTriple.getArch() == Triple::systemz)
+ return new VarArgSystemZHelper(Func, Msan, Visitor);
else
return new VarArgNoOpHelper(Func, Msan, Visitor);
}
diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
index cc96bdd1d516..dcfc28887a48 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp
@@ -52,6 +52,7 @@
#include "ValueProfileCollector.h"
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
@@ -63,13 +64,13 @@
#include "llvm/Analysis/BlockFrequencyInfo.h"
#include "llvm/Analysis/BranchProbabilityInfo.h"
#include "llvm/Analysis/CFG.h"
+#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Comdat.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -376,6 +377,7 @@ private:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
@@ -404,6 +406,7 @@ private:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<ProfileSummaryInfoWrapperPass>();
AU.addRequired<BlockFrequencyInfoWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
@@ -436,6 +439,7 @@ INITIALIZE_PASS_BEGIN(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
"PGO instrumentation.", false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PGOInstrumentationGenLegacyPass, "pgo-instr-gen",
"PGO instrumentation.", false, false)
@@ -467,7 +471,7 @@ INITIALIZE_PASS(PGOInstrumentationGenCreateVarLegacyPass,
ModulePass *
llvm::createPGOInstrumentationGenCreateVarLegacyPass(StringRef CSInstrName) {
- return new PGOInstrumentationGenCreateVarLegacyPass(CSInstrName);
+ return new PGOInstrumentationGenCreateVarLegacyPass(std::string(CSInstrName));
}
namespace {
@@ -565,11 +569,11 @@ public:
}
FuncPGOInstrumentation(
- Function &Func,
+ Function &Func, TargetLibraryInfo &TLI,
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
bool CreateGlobalVar = false, BranchProbabilityInfo *BPI = nullptr,
BlockFrequencyInfo *BFI = nullptr, bool IsCS = false)
- : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func),
+ : F(Func), IsCS(IsCS), ComdatMembers(ComdatMembers), VPC(Func, TLI),
ValueSites(IPVK_Last + 1), SIVisitor(Func), MST(F, BPI, BFI) {
// This should be done before CFG hash computation.
SIVisitor.countSelects(Func);
@@ -799,18 +803,50 @@ BasicBlock *FuncPGOInstrumentation<Edge, BBInfo>::getInstrBB(Edge *E) {
return canInstrument(InstrBB);
}
+// When generating value profiling calls on Windows for routines that use
+// handler funclets for exception processing, an operand bundle needs to be
+// attached to the called function. This routine sets \p OpBundles to contain
+// the funclet information, if any is needed, that should be placed on the
+// generated value profiling call for the value profile candidate call.
+static void
+populateEHOperandBundle(VPCandidateInfo &Cand,
+ DenseMap<BasicBlock *, ColorVector> &BlockColors,
+ SmallVectorImpl<OperandBundleDef> &OpBundles) {
+ auto *OrigCall = dyn_cast<CallBase>(Cand.AnnotatedInst);
+ if (OrigCall && !isa<IntrinsicInst>(OrigCall)) {
+ // The instrumentation call should belong to the same funclet as a
+ // non-intrinsic call, so just copy the operand bundle, if any exists.
+ Optional<OperandBundleUse> ParentFunclet =
+ OrigCall->getOperandBundle(LLVMContext::OB_funclet);
+ if (ParentFunclet)
+ OpBundles.emplace_back(OperandBundleDef(*ParentFunclet));
+ } else {
+ // Intrinsics or other instructions do not get funclet information from the
+ // front-end. Use the BlockColors computed by colorEHFunclets to determine
+ // whether a funclet is needed.
+ if (!BlockColors.empty()) {
+ const ColorVector &CV = BlockColors.find(OrigCall->getParent())->second;
+ assert(CV.size() == 1 && "non-unique color for block!");
+ Instruction *EHPad = CV.front()->getFirstNonPHI();
+ if (EHPad->isEHPad())
+ OpBundles.emplace_back("funclet", EHPad);
+ }
+ }
+}
+
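// Editor's sketch (not part of this patch): with BlockColors in hand, attaching
// the funclet bundle to a synthesized call reduces to the pattern below;
// ProfileFn, Args, and EHPad are placeholder names.
//   SmallVector<OperandBundleDef, 1> Bundles;
//   if (EHPad->isEHPad())
//     Bundles.emplace_back("funclet", EHPad);
//   IRB.CreateCall(ProfileFn, Args, Bundles);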
// Visit all edges, instrument the edges not in the MST, and do value profiling.
// Critical edges will be split.
static void instrumentOneFunc(
- Function &F, Module *M, BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFI,
+ Function &F, Module *M, TargetLibraryInfo &TLI, BranchProbabilityInfo *BPI,
+ BlockFrequencyInfo *BFI,
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
bool IsCS) {
// Split indirectbr critical edges here before computing the MST rather than
// later in getInstrBB() to avoid invalidating it.
SplitIndirectBrCriticalEdges(F, BPI, BFI);
- FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, ComdatMembers, true, BPI,
- BFI, IsCS);
+ FuncPGOInstrumentation<PGOEdge, BBInfo> FuncInfo(F, TLI, ComdatMembers, true,
+ BPI, BFI, IsCS);
std::vector<BasicBlock *> InstrumentBBs;
FuncInfo.getInstrumentBBs(InstrumentBBs);
unsigned NumCounters =
@@ -839,6 +875,15 @@ static void instrumentOneFunc(
NumOfPGOICall += FuncInfo.ValueSites[IPVK_IndirectCallTarget].size();
+ // Intrinsic function calls do not have funclet operand bundles needed for
+ // Windows exception handling attached to them. However, if value profiling is
+ // inserted for one of these calls, then a funclet value will need to be set
+ // on the instrumentation call based on the funclet coloring.
+ DenseMap<BasicBlock *, ColorVector> BlockColors;
+ if (F.hasPersonalityFn() &&
+ isFuncletEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
+ BlockColors = colorEHFunclets(F);
+
// For each VP Kind, walk the VP candidates and instrument each one.
for (uint32_t Kind = IPVK_First; Kind <= IPVK_Last; ++Kind) {
unsigned SiteIndex = 0;
@@ -860,11 +905,14 @@ static void instrumentOneFunc(
ToProfile = Builder.CreatePtrToInt(Cand.V, Builder.getInt64Ty());
assert(ToProfile && "value profiling Value is of unexpected type");
+ SmallVector<OperandBundleDef, 1> OpBundles;
+ populateEHOperandBundle(Cand, BlockColors, OpBundles);
Builder.CreateCall(
Intrinsic::getDeclaration(M, Intrinsic::instrprof_value_profile),
{ConstantExpr::getBitCast(FuncInfo.FuncNameVar, I8PtrTy),
Builder.getInt64(FuncInfo.FunctionHash), ToProfile,
- Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)});
+ Builder.getInt32(Kind), Builder.getInt32(SiteIndex++)},
+ OpBundles);
}
} // IPVK_First <= Kind <= IPVK_Last
}
@@ -953,12 +1001,12 @@ namespace {
class PGOUseFunc {
public:
- PGOUseFunc(Function &Func, Module *Modu,
+ PGOUseFunc(Function &Func, Module *Modu, TargetLibraryInfo &TLI,
std::unordered_multimap<Comdat *, GlobalValue *> &ComdatMembers,
BranchProbabilityInfo *BPI, BlockFrequencyInfo *BFIin,
ProfileSummaryInfo *PSI, bool IsCS)
: F(Func), M(Modu), BFI(BFIin), PSI(PSI),
- FuncInfo(Func, ComdatMembers, false, BPI, BFIin, IsCS),
+ FuncInfo(Func, TLI, ComdatMembers, false, BPI, BFIin, IsCS),
FreqAttr(FFA_Normal), IsCS(IsCS) {}
// Read counts for the instrumented BB from profile.
@@ -1295,7 +1343,7 @@ void PGOUseFunc::setBranchWeights() {
if (TI->getNumSuccessors() < 2)
continue;
if (!(isa<BranchInst>(TI) || isa<SwitchInst>(TI) ||
- isa<IndirectBrInst>(TI)))
+ isa<IndirectBrInst>(TI) || isa<InvokeInst>(TI)))
continue;
if (getBBInfo(&BB).CountValue == 0)
@@ -1460,7 +1508,8 @@ static void collectComdatMembers(
}
static bool InstrumentAllFunctions(
- Module &M, function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
+ Module &M, function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
+ function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
function_ref<BlockFrequencyInfo *(Function &)> LookupBFI, bool IsCS) {
// For the context-sensitive instrumentation, we should have a separate pass
// (before LTO/ThinLTO linking) to create these variables.
@@ -1472,9 +1521,10 @@ static bool InstrumentAllFunctions(
for (auto &F : M) {
if (F.isDeclaration())
continue;
+ auto &TLI = LookupTLI(F);
auto *BPI = LookupBPI(F);
auto *BFI = LookupBFI(F);
- instrumentOneFunc(F, &M, BPI, BFI, ComdatMembers, IsCS);
+ instrumentOneFunc(F, &M, TLI, BPI, BFI, ComdatMembers, IsCS);
}
return true;
}
@@ -1490,27 +1540,32 @@ bool PGOInstrumentationGenLegacyPass::runOnModule(Module &M) {
if (skipModule(M))
return false;
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
auto LookupBPI = [this](Function &F) {
return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
};
auto LookupBFI = [this](Function &F) {
return &this->getAnalysis<BlockFrequencyInfoWrapperPass>(F).getBFI();
};
- return InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS);
+ return InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS);
}
PreservedAnalyses PGOInstrumentationGen::run(Module &M,
ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
auto LookupBPI = [&FAM](Function &F) {
return &FAM.getResult<BranchProbabilityAnalysis>(F);
};
-
auto LookupBFI = [&FAM](Function &F) {
return &FAM.getResult<BlockFrequencyAnalysis>(F);
};
- if (!InstrumentAllFunctions(M, LookupBPI, LookupBFI, IsCS))
+ if (!InstrumentAllFunctions(M, LookupTLI, LookupBPI, LookupBFI, IsCS))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -1518,6 +1573,7 @@ PreservedAnalyses PGOInstrumentationGen::run(Module &M,
static bool annotateAllFunctions(
Module &M, StringRef ProfileFileName, StringRef ProfileRemappingFileName,
+ function_ref<TargetLibraryInfo &(Function &)> LookupTLI,
function_ref<BranchProbabilityInfo *(Function &)> LookupBPI,
function_ref<BlockFrequencyInfo *(Function &)> LookupBFI,
ProfileSummaryInfo *PSI, bool IsCS) {
@@ -1557,6 +1613,7 @@ static bool annotateAllFunctions(
M.setProfileSummary(PGOReader->getSummary(IsCS).getMD(M.getContext()),
IsCS ? ProfileSummary::PSK_CSInstr
: ProfileSummary::PSK_Instr);
+ PSI->refresh();
std::unordered_multimap<Comdat *, GlobalValue *> ComdatMembers;
collectComdatMembers(M, ComdatMembers);
@@ -1565,12 +1622,13 @@ static bool annotateAllFunctions(
for (auto &F : M) {
if (F.isDeclaration())
continue;
+ auto &TLI = LookupTLI(F);
auto *BPI = LookupBPI(F);
auto *BFI = LookupBFI(F);
// Split indirectbr critical edges here before computing the MST rather than
// later in getInstrBB() to avoid invalidating it.
SplitIndirectBrCriticalEdges(F, BPI, BFI);
- PGOUseFunc Func(F, &M, ComdatMembers, BPI, BFI, PSI, IsCS);
+ PGOUseFunc Func(F, &M, TLI, ComdatMembers, BPI, BFI, PSI, IsCS);
bool AllZeros = false;
if (!Func.readCounters(PGOReader.get(), AllZeros))
continue;
@@ -1651,10 +1709,12 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
ModuleAnalysisManager &AM) {
auto &FAM = AM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
+ auto LookupTLI = [&FAM](Function &F) -> TargetLibraryInfo & {
+ return FAM.getResult<TargetLibraryAnalysis>(F);
+ };
auto LookupBPI = [&FAM](Function &F) {
return &FAM.getResult<BranchProbabilityAnalysis>(F);
};
-
auto LookupBFI = [&FAM](Function &F) {
return &FAM.getResult<BlockFrequencyAnalysis>(F);
};
@@ -1662,7 +1722,7 @@ PreservedAnalyses PGOInstrumentationUse::run(Module &M,
auto *PSI = &AM.getResult<ProfileSummaryAnalysis>(M);
if (!annotateAllFunctions(M, ProfileFileName, ProfileRemappingFileName,
- LookupBPI, LookupBFI, PSI, IsCS))
+ LookupTLI, LookupBPI, LookupBFI, PSI, IsCS))
return PreservedAnalyses::all();
return PreservedAnalyses::none();
@@ -1672,6 +1732,9 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
if (skipModule(M))
return false;
+ auto LookupTLI = [this](Function &F) -> TargetLibraryInfo & {
+ return this->getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ };
auto LookupBPI = [this](Function &F) {
return &this->getAnalysis<BranchProbabilityInfoWrapperPass>(F).getBPI();
};
@@ -1680,13 +1743,13 @@ bool PGOInstrumentationUseLegacyPass::runOnModule(Module &M) {
};
auto *PSI = &getAnalysis<ProfileSummaryInfoWrapperPass>().getPSI();
- return annotateAllFunctions(M, ProfileFileName, "", LookupBPI, LookupBFI, PSI,
- IsCS);
+ return annotateAllFunctions(M, ProfileFileName, "", LookupTLI, LookupBPI,
+ LookupBFI, PSI, IsCS);
}
static std::string getSimpleNodeName(const BasicBlock *Node) {
if (!Node->getName().empty())
- return Node->getName();
+ return std::string(Node->getName());
std::string SimpleNodeName;
raw_string_ostream OS(SimpleNodeName);
@@ -1750,7 +1813,7 @@ void setIrrLoopHeaderMetadata(Module *M, Instruction *TI, uint64_t Count) {
template <> struct GraphTraits<PGOUseFunc *> {
using NodeRef = const BasicBlock *;
- using ChildIteratorType = succ_const_iterator;
+ using ChildIteratorType = const_succ_iterator;
using nodes_iterator = pointer_iterator<Function::const_iterator>;
static NodeRef getEntryNode(const PGOUseFunc *G) {
@@ -1777,7 +1840,7 @@ template <> struct DOTGraphTraits<PGOUseFunc *> : DefaultDOTGraphTraits {
: DefaultDOTGraphTraits(isSimple) {}
static std::string getGraphName(const PGOUseFunc *G) {
- return G->getFunc().getName();
+ return std::string(G->getFunc().getName());
}
std::string getNodeLabel(const BasicBlock *Node, const PGOUseFunc *Graph) {
diff --git a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
index d0afe2959b39..2b7b859891dc 100644
--- a/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PGOMemOPSizeOpt.cpp
@@ -23,7 +23,6 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
@@ -38,7 +37,6 @@
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/ProfileData/InstrProf.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
@@ -97,6 +95,11 @@ extern cl::opt<std::string> MemOPSizeRange;
// This option sets the value that groups large memop sizes
extern cl::opt<unsigned> MemOPSizeLarge;
+cl::opt<bool>
+ MemOPOptMemcmpBcmp("pgo-memop-optimize-memcmp-bcmp", cl::init(true),
+ cl::Hidden,
+ cl::desc("Size-specialize memcmp and bcmp calls"));
+
namespace {
class PGOMemOPSizeOptLegacyPass : public FunctionPass {
public:
@@ -115,6 +118,7 @@ private:
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetLibraryInfoWrapperPass>();
}
};
} // end anonymous namespace
@@ -124,6 +128,7 @@ INITIALIZE_PASS_BEGIN(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
"Optimize memory intrinsic using its size value profile",
false, false)
INITIALIZE_PASS_DEPENDENCY(BlockFrequencyInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(PGOMemOPSizeOptLegacyPass, "pgo-memop-opt",
"Optimize memory intrinsic using its size value profile",
false, false)
@@ -133,11 +138,90 @@ FunctionPass *llvm::createPGOMemOPSizeOptLegacyPass() {
}
namespace {
+
+static const char *getMIName(const MemIntrinsic *MI) {
+ switch (MI->getIntrinsicID()) {
+ case Intrinsic::memcpy:
+ return "memcpy";
+ case Intrinsic::memmove:
+ return "memmove";
+ case Intrinsic::memset:
+ return "memset";
+ default:
+ return "unknown";
+ }
+}
+
+// A class that abstracts a memop (memcpy, memmove, memset, memcmp and bcmp).
+struct MemOp {
+ Instruction *I;
+ MemOp(MemIntrinsic *MI) : I(MI) {}
+ MemOp(CallInst *CI) : I(CI) {}
+ MemIntrinsic *asMI() { return dyn_cast<MemIntrinsic>(I); }
+ CallInst *asCI() { return cast<CallInst>(I); }
+ MemOp clone() {
+ if (auto MI = asMI())
+ return MemOp(cast<MemIntrinsic>(MI->clone()));
+ return MemOp(cast<CallInst>(asCI()->clone()));
+ }
+ Value *getLength() {
+ if (auto MI = asMI())
+ return MI->getLength();
+ return asCI()->getArgOperand(2);
+ }
+ void setLength(Value *Length) {
+ if (auto MI = asMI())
+ return MI->setLength(Length);
+ asCI()->setArgOperand(2, Length);
+ }
+ StringRef getFuncName() {
+ if (auto MI = asMI())
+ return MI->getCalledFunction()->getName();
+ return asCI()->getCalledFunction()->getName();
+ }
+ bool isMemmove() {
+ if (auto MI = asMI())
+ if (MI->getIntrinsicID() == Intrinsic::memmove)
+ return true;
+ return false;
+ }
+ bool isMemcmp(TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+ Func == LibFunc_memcmp) {
+ return true;
+ }
+ return false;
+ }
+ bool isBcmp(TargetLibraryInfo &TLI) {
+ LibFunc Func;
+ if (asMI() == nullptr && TLI.getLibFunc(*asCI(), Func) &&
+ Func == LibFunc_bcmp) {
+ return true;
+ }
+ return false;
+ }
+ const char *getName(TargetLibraryInfo &TLI) {
+ if (auto MI = asMI())
+ return getMIName(MI);
+ LibFunc Func;
+ if (TLI.getLibFunc(*asCI(), Func)) {
+ if (Func == LibFunc_memcmp)
+ return "memcmp";
+ if (Func == LibFunc_bcmp)
+ return "bcmp";
+ }
+ llvm_unreachable("Must be MemIntrinsic or memcmp/bcmp CallInst");
+ return nullptr;
+ }
+};
+
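// Editor's usage sketch for MemOp (names assumed, not part of this patch): the
// wrapper lets the pass query the size operand the same way for mem*
// intrinsics and for memcmp/bcmp library calls.
//   MemOp MO(cast<CallInst>(I));   // or MemOp(cast<MemIntrinsic>(I))
//   Value *Len = MO.getLength();   // argument #2 for memcmp/bcmp
//   if (!isa<ConstantInt>(Len))
//     WorkList.push_back(MO);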
class MemOPSizeOpt : public InstVisitor<MemOPSizeOpt> {
public:
MemOPSizeOpt(Function &Func, BlockFrequencyInfo &BFI,
- OptimizationRemarkEmitter &ORE, DominatorTree *DT)
- : Func(Func), BFI(BFI), ORE(ORE), DT(DT), Changed(false) {
+ OptimizationRemarkEmitter &ORE, DominatorTree *DT,
+ TargetLibraryInfo &TLI)
+ : Func(Func), BFI(BFI), ORE(ORE), DT(DT), TLI(TLI), Changed(false) {
ValueDataArray =
std::make_unique<InstrProfValueData[]>(MemOPMaxVersion + 2);
// Get the MemOPSize range information from option MemOPSizeRange,
@@ -149,13 +233,12 @@ public:
WorkList.clear();
visit(Func);
- for (auto &MI : WorkList) {
+ for (auto &MO : WorkList) {
++NumOfPGOMemOPAnnotate;
- if (perform(MI)) {
+ if (perform(MO)) {
Changed = true;
++NumOfPGOMemOPOpt;
- LLVM_DEBUG(dbgs() << "MemOP call: "
- << MI->getCalledFunction()->getName()
+ LLVM_DEBUG(dbgs() << "MemOP call: " << MO.getFuncName()
<< "is Transformed.\n");
}
}
@@ -166,7 +249,16 @@ public:
// Do not perform on constant-length calls.
if (dyn_cast<ConstantInt>(Length))
return;
- WorkList.push_back(&MI);
+ WorkList.push_back(MemOp(&MI));
+ }
+
+ void visitCallInst(CallInst &CI) {
+ LibFunc Func;
+ if (TLI.getLibFunc(CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp) &&
+ !dyn_cast<ConstantInt>(CI.getArgOperand(2))) {
+ WorkList.push_back(MemOp(&CI));
+ }
}
private:
@@ -174,15 +266,16 @@ private:
BlockFrequencyInfo &BFI;
OptimizationRemarkEmitter &ORE;
DominatorTree *DT;
+ TargetLibraryInfo &TLI;
bool Changed;
- std::vector<MemIntrinsic *> WorkList;
+ std::vector<MemOp> WorkList;
// Start of the precise range.
int64_t PreciseRangeStart;
// Last value of the precise range.
int64_t PreciseRangeLast;
// The space to read the profile annotation.
std::unique_ptr<InstrProfValueData[]> ValueDataArray;
- bool perform(MemIntrinsic *MI);
+ bool perform(MemOp MO);
// This kind shows which group the value falls in. For PreciseValue, we have
// the profile count for that value. LargeGroup groups the values that are in
@@ -198,19 +291,6 @@ private:
}
};
-static const char *getMIName(const MemIntrinsic *MI) {
- switch (MI->getIntrinsicID()) {
- case Intrinsic::memcpy:
- return "memcpy";
- case Intrinsic::memmove:
- return "memmove";
- case Intrinsic::memset:
- return "memset";
- default:
- return "unknown";
- }
-}
-
static bool isProfitable(uint64_t Count, uint64_t TotalCount) {
assert(Count <= TotalCount);
if (Count < MemOPCountThreshold)
@@ -229,21 +309,23 @@ static inline uint64_t getScaledCount(uint64_t Count, uint64_t Num,
return ScaleCount / Denom;
}
-bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
- assert(MI);
- if (MI->getIntrinsicID() == Intrinsic::memmove)
+bool MemOPSizeOpt::perform(MemOp MO) {
+ assert(MO.I);
+ if (MO.isMemmove())
+ return false;
+ if (!MemOPOptMemcmpBcmp && (MO.isMemcmp(TLI) || MO.isBcmp(TLI)))
return false;
uint32_t NumVals, MaxNumPromotions = MemOPMaxVersion + 2;
uint64_t TotalCount;
- if (!getValueProfDataFromInst(*MI, IPVK_MemOPSize, MaxNumPromotions,
+ if (!getValueProfDataFromInst(*MO.I, IPVK_MemOPSize, MaxNumPromotions,
ValueDataArray.get(), NumVals, TotalCount))
return false;
uint64_t ActualCount = TotalCount;
uint64_t SavedTotalCount = TotalCount;
if (MemOPScaleCount) {
- auto BBEdgeCount = BFI.getBlockProfileCount(MI->getParent());
+ auto BBEdgeCount = BFI.getBlockProfileCount(MO.I->getParent());
if (!BBEdgeCount)
return false;
ActualCount = *BBEdgeCount;
@@ -335,13 +417,13 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
// }
// merge_bb:
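// For memcmp/bcmp, which (unlike the mem* intrinsics) return a value, this
// patch also inserts a MemOP.RVMerge phi in merge_bb so the specialized and
// default calls feed a single result. Editor's source-level sketch
// (illustrative names only):
//   int r;
//   switch (n) {
//   case 4:  r = memcmp(p, q, 4); break;  // size-specialized copy
//   default: r = memcmp(p, q, n); break;  // original, unspecialized call
//   }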
- BasicBlock *BB = MI->getParent();
+ BasicBlock *BB = MO.I->getParent();
LLVM_DEBUG(dbgs() << "\n\n== Basic Block Before ==\n");
LLVM_DEBUG(dbgs() << *BB << "\n");
auto OrigBBFreq = BFI.getBlockFreq(BB);
- BasicBlock *DefaultBB = SplitBlock(BB, MI, DT);
- BasicBlock::iterator It(*MI);
+ BasicBlock *DefaultBB = SplitBlock(BB, MO.I, DT);
+ BasicBlock::iterator It(*MO.I);
++It;
assert(It != DefaultBB->end());
BasicBlock *MergeBB = SplitBlock(DefaultBB, &(*It), DT);
@@ -353,15 +435,24 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
auto &Ctx = Func.getContext();
IRBuilder<> IRB(BB);
BB->getTerminator()->eraseFromParent();
- Value *SizeVar = MI->getLength();
+ Value *SizeVar = MO.getLength();
SwitchInst *SI = IRB.CreateSwitch(SizeVar, DefaultBB, SizeIds.size());
+ Type *MemOpTy = MO.I->getType();
+ PHINode *PHI = nullptr;
+ if (!MemOpTy->isVoidTy()) {
+ // Insert a phi for the return values at the merge block.
+ IRBuilder<> IRBM(MergeBB->getFirstNonPHI());
+ PHI = IRBM.CreatePHI(MemOpTy, SizeIds.size() + 1, "MemOP.RVMerge");
+ MO.I->replaceAllUsesWith(PHI);
+ PHI->addIncoming(MO.I, DefaultBB);
+ }
// Clear the value profile data.
- MI->setMetadata(LLVMContext::MD_prof, nullptr);
+ MO.I->setMetadata(LLVMContext::MD_prof, nullptr);
// If all promoted, we don't need the MD.prof metadata.
if (SavedRemainCount > 0 || Version != NumVals)
// Otherwise we need to update it with the un-promoted records.
- annotateValueSite(*Func.getParent(), *MI, VDs.slice(Version),
+ annotateValueSite(*Func.getParent(), *MO.I, VDs.slice(Version),
SavedRemainCount, IPVK_MemOPSize, NumVals);
LLVM_DEBUG(dbgs() << "\n\n== Basic Block After ==\n");
@@ -373,17 +464,18 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
for (uint64_t SizeId : SizeIds) {
BasicBlock *CaseBB = BasicBlock::Create(
Ctx, Twine("MemOP.Case.") + Twine(SizeId), &Func, DefaultBB);
- Instruction *NewInst = MI->clone();
+ MemOp NewMO = MO.clone();
// Fix the argument.
- auto *MemI = cast<MemIntrinsic>(NewInst);
- auto *SizeType = dyn_cast<IntegerType>(MemI->getLength()->getType());
+ auto *SizeType = dyn_cast<IntegerType>(NewMO.getLength()->getType());
assert(SizeType && "Expected integer type size argument.");
ConstantInt *CaseSizeId = ConstantInt::get(SizeType, SizeId);
- MemI->setLength(CaseSizeId);
- CaseBB->getInstList().push_back(NewInst);
+ NewMO.setLength(CaseSizeId);
+ CaseBB->getInstList().push_back(NewMO.I);
IRBuilder<> IRBCase(CaseBB);
IRBCase.CreateBr(MergeBB);
SI->addCase(CaseSizeId, CaseBB);
+ if (!MemOpTy->isVoidTy())
+ PHI->addIncoming(NewMO.I, CaseBB);
if (DT) {
Updates.push_back({DominatorTree::Insert, CaseBB, MergeBB});
Updates.push_back({DominatorTree::Insert, BB, CaseBB});
@@ -401,11 +493,10 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
ORE.emit([&]() {
using namespace ore;
- return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MI)
- << "optimized " << NV("Intrinsic", StringRef(getMIName(MI)))
- << " with count " << NV("Count", SumForOpt) << " out of "
- << NV("Total", TotalCount) << " for " << NV("Versions", Version)
- << " versions";
+ return OptimizationRemark(DEBUG_TYPE, "memopt-opt", MO.I)
+ << "optimized " << NV("Memop", MO.getName(TLI)) << " with count "
+ << NV("Count", SumForOpt) << " out of " << NV("Total", TotalCount)
+ << " for " << NV("Versions", Version) << " versions";
});
return true;
@@ -414,13 +505,13 @@ bool MemOPSizeOpt::perform(MemIntrinsic *MI) {
static bool PGOMemOPSizeOptImpl(Function &F, BlockFrequencyInfo &BFI,
OptimizationRemarkEmitter &ORE,
- DominatorTree *DT) {
+ DominatorTree *DT, TargetLibraryInfo &TLI) {
if (DisableMemOPOPT)
return false;
if (F.hasFnAttribute(Attribute::OptimizeForSize))
return false;
- MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT);
+ MemOPSizeOpt MemOPSizeOpt(F, BFI, ORE, DT, TLI);
MemOPSizeOpt.perform();
return MemOPSizeOpt.isChanged();
}
@@ -431,7 +522,9 @@ bool PGOMemOPSizeOptLegacyPass::runOnFunction(Function &F) {
auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
auto *DTWP = getAnalysisIfAvailable<DominatorTreeWrapperPass>();
DominatorTree *DT = DTWP ? &DTWP->getDomTree() : nullptr;
- return PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
+ TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ return PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
}
namespace llvm {
@@ -442,7 +535,8 @@ PreservedAnalyses PGOMemOPSizeOpt::run(Function &F,
auto &BFI = FAM.getResult<BlockFrequencyAnalysis>(F);
auto &ORE = FAM.getResult<OptimizationRemarkEmitterAnalysis>(F);
auto *DT = FAM.getCachedResult<DominatorTreeAnalysis>(F);
- bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT);
+ auto &TLI = FAM.getResult<TargetLibraryAnalysis>(F);
+ bool Changed = PGOMemOPSizeOptImpl(F, BFI, ORE, DT, TLI);
if (!Changed)
return PreservedAnalyses::all();
auto PA = PreservedAnalyses();
diff --git a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
index 71ecfd9a2642..85e096112fca 100644
--- a/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/PoisonChecking.cpp
@@ -12,26 +12,24 @@
// LangRef. There are obvious parallels to the sanitizer tools, but this pass
// is focused purely on the semantics of LLVM IR, not any particular source
// language. If you're looking for something to see if your C/C++ contains
-// UB, this is not it.
-//
+// UB, this is not it.
+//
// The rewritten semantics of each instruction will include the following
-// components:
+// components:
//
// 1) The original instruction, unmodified.
// 2) A propagation rule which translates dynamic information about the poison
// state of each input to whether the dynamic output of the instruction
// produces poison.
-// 3) A flag validation rule which validates any poison producing flags on the
+// 3) A creation rule which validates any poison producing flags on the
// instruction itself (e.g. checks for overflow on nsw).
// 4) A check rule which traps (to a handler function) if this instruction must
// execute undefined behavior given the poison state of its inputs.
//
-// At the moment, the UB detection is done in a best effort manner; that is,
-// the resulting code may produce a false negative result (not report UB when
-// it actually exists according to the LangRef spec), but should never produce
-// a false positive (report UB where it doesn't exist). The intention is to
-// eventually support a "strict" mode which never dynamically reports a false
-// negative at the cost of rejecting some valid inputs to translation.
+// This is a must-analysis-based transform; that is, the resulting code may
+// produce a false negative result (not report UB when it actually exists
+// according to the LangRef spec), but should never produce a false positive
+// (report UB where it doesn't exist).
//
// Use cases for this pass include:
// - Understanding (and testing!) the implications of the definition of poison
@@ -40,7 +38,7 @@
// are well defined on the specific input used.
// - Finding/confirming poison specific miscompiles by checking the poison
// status of an input/IR pair is the same before and after an optimization
-// transform.
+// transform.
// - Checking that a bugpoint reduction does not introduce UB which didn't
// exist in the original program being reduced.
//
@@ -56,7 +54,7 @@
// moment, all arguments and return values are assumed not to be poison.
// - Undef is not modeled. In particular, the optimizer's freedom to pick
// concrete values for undef bits so as to maximize potential for producing
-// poison is not modeled.
+// poison is not modeled.
//
//===----------------------------------------------------------------------===//
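// Editor's illustrative sketch (an assumption, not taken from this file): for
// `r = a + b` carrying the nsw flag, the rewrite conceptually materializes, in
// C terms,
//   bool r_poison = a_poison || b_poison            /* propagation rule */
//                   || signed_add_overflows(a, b);  /* creation rule    */
// where signed_add_overflows is a placeholder for the dynamic flag check, and
// any UB-triggering use of r asserts !r_poison via the check handler.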
@@ -103,10 +101,10 @@ static Value *buildOrChain(IRBuilder<> &B, ArrayRef<Value*> Ops) {
return Accum;
}
-static void generatePoisonChecksForBinOp(Instruction &I,
- SmallVector<Value*, 2> &Checks) {
+static void generateCreationChecksForBinOp(Instruction &I,
+ SmallVectorImpl<Value*> &Checks) {
assert(isa<BinaryOperator>(I));
-
+
IRBuilder<> B(&I);
Value *LHS = I.getOperand(0);
Value *RHS = I.getOperand(1);
@@ -183,22 +181,28 @@ static void generatePoisonChecksForBinOp(Instruction &I,
};
}
-static Value* generatePoisonChecks(Instruction &I) {
+/// Given an instruction which can produce poison on non-poison inputs
+/// (i.e. canCreatePoison returns true), generate runtime checks to produce
+/// boolean indicators of when poison would result.
+static void generateCreationChecks(Instruction &I,
+ SmallVectorImpl<Value*> &Checks) {
IRBuilder<> B(&I);
- SmallVector<Value*, 2> Checks;
if (isa<BinaryOperator>(I) && !I.getType()->isVectorTy())
- generatePoisonChecksForBinOp(I, Checks);
+ generateCreationChecksForBinOp(I, Checks);
- // Handle non-binops seperately
+ // Handle non-binops separately
switch (I.getOpcode()) {
default:
+ // Note there are a couple of missing cases here; once implemented, this
+ // should become an llvm_unreachable.
break;
case Instruction::ExtractElement: {
Value *Vec = I.getOperand(0);
- if (Vec->getType()->getVectorIsScalable())
+ auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVTy)
break;
Value *Idx = I.getOperand(1);
- unsigned NumElts = Vec->getType()->getVectorNumElements();
+ unsigned NumElts = VecVTy->getNumElements();
Value *Check =
B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
ConstantInt::get(Idx->getType(), NumElts));
@@ -207,10 +211,11 @@ static Value* generatePoisonChecks(Instruction &I) {
}
case Instruction::InsertElement: {
Value *Vec = I.getOperand(0);
- if (Vec->getType()->getVectorIsScalable())
+ auto *VecVTy = dyn_cast<FixedVectorType>(Vec->getType());
+ if (!VecVTy)
break;
Value *Idx = I.getOperand(2);
- unsigned NumElts = Vec->getType()->getVectorNumElements();
+ unsigned NumElts = VecVTy->getNumElements();
Value *Check =
B.CreateICmp(ICmpInst::ICMP_UGE, Idx,
ConstantInt::get(Idx->getType(), NumElts));
@@ -218,7 +223,6 @@ static Value* generatePoisonChecks(Instruction &I) {
break;
}
};
- return buildOrChain(B, Checks);
}
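// Editor's note (illustrative): for an extractelement from a fixed 4-element
// vector at index i, the creation check built above amounts to the C-level test
//   bool creates_poison = (i >= 4);
// Scalable vectors are skipped because their element count is not a
// compile-time constant.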
static Value *getPoisonFor(DenseMap<Value *, Value *> &ValToPoison, Value *V) {
@@ -262,24 +266,23 @@ static bool rewrite(Function &F) {
for (BasicBlock &BB : F)
for (auto I = BB.begin(); isa<PHINode>(&*I); I++) {
auto *OldPHI = cast<PHINode>(&*I);
- auto *NewPHI = PHINode::Create(Int1Ty,
- OldPHI->getNumIncomingValues());
+ auto *NewPHI = PHINode::Create(Int1Ty, OldPHI->getNumIncomingValues());
for (unsigned i = 0; i < OldPHI->getNumIncomingValues(); i++)
NewPHI->addIncoming(UndefValue::get(Int1Ty),
OldPHI->getIncomingBlock(i));
NewPHI->insertBefore(OldPHI);
ValToPoison[OldPHI] = NewPHI;
}
-
+
for (BasicBlock &BB : F)
for (Instruction &I : BB) {
if (isa<PHINode>(I)) continue;
IRBuilder<> B(cast<Instruction>(&I));
-
+
// Note: There are many more sources of documented UB, but this pass only
// attempts to find UB triggered by propagation of poison.
- if (Value *Op = const_cast<Value*>(getGuaranteedNonFullPoisonOp(&I)))
+ if (Value *Op = const_cast<Value*>(getGuaranteedNonPoisonOp(&I)))
CreateAssertNot(B, getPoisonFor(ValToPoison, Op));
if (LocalCheck)
@@ -290,12 +293,12 @@ static bool rewrite(Function &F) {
}
SmallVector<Value*, 4> Checks;
- if (propagatesFullPoison(&I))
+ if (propagatesPoison(&I))
for (Value *V : I.operands())
Checks.push_back(getPoisonFor(ValToPoison, V));
- if (auto *Check = generatePoisonChecks(I))
- Checks.push_back(Check);
+ if (canCreatePoison(&I))
+ generateCreationChecks(I, Checks);
ValToPoison[&I] = buildOrChain(B, Checks);
}
@@ -328,7 +331,6 @@ PreservedAnalyses PoisonCheckingPass::run(Function &F,
return rewrite(F) ? PreservedAnalyses::none() : PreservedAnalyses::all();
}
-
/* Major TODO Items:
- Control dependent poison UB
- Strict mode - (i.e. must analyze every operand)
@@ -338,10 +340,7 @@ PreservedAnalyses PoisonCheckingPass::run(Function &F,
Instructions w/Unclear Semantics:
- shufflevector - It would seem reasonable for an out of bounds mask element
- to produce poison, but the LangRef does not state.
- - and/or - It would seem reasonable for poison to propagate from both
- arguments, but LangRef doesn't state and propagatesFullPoison doesn't
- include these two.
+ to produce poison, but the LangRef does not state.
- all binary ops w/vector operands - The likely interpretation would be that
any element overflowing should produce poison for the entire result, but
the LangRef does not state.
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index e6dc684c2e77..b6a9df57e431 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -16,7 +16,6 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfo.h"
@@ -35,6 +34,8 @@
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/SpecialCaseList.h"
+#include "llvm/Support/VirtualFileSystem.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Instrumentation.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
@@ -67,6 +68,8 @@ static const char *const SanCovModuleCtorTracePcGuardName =
"sancov.module_ctor_trace_pc_guard";
static const char *const SanCovModuleCtor8bitCountersName =
"sancov.module_ctor_8bit_counters";
+static const char *const SanCovModuleCtorBoolFlagName =
+ "sancov.module_ctor_bool_flag";
static const uint64_t SanCtorAndDtorPriority = 2;
static const char *const SanCovTracePCGuardName =
@@ -75,10 +78,13 @@ static const char *const SanCovTracePCGuardInitName =
"__sanitizer_cov_trace_pc_guard_init";
static const char *const SanCov8bitCountersInitName =
"__sanitizer_cov_8bit_counters_init";
+static const char *const SanCovBoolFlagInitName =
+ "__sanitizer_cov_bool_flag_init";
static const char *const SanCovPCsInitName = "__sanitizer_cov_pcs_init";
static const char *const SanCovGuardsSectionName = "sancov_guards";
static const char *const SanCovCountersSectionName = "sancov_cntrs";
+static const char *const SanCovBoolFlagSectionName = "sancov_bools";
static const char *const SanCovPCsSectionName = "sancov_pcs";
static const char *const SanCovLowestStackName = "__sancov_lowest_stack";
@@ -101,7 +107,8 @@ static cl::opt<bool> ClTracePCGuard("sanitizer-coverage-trace-pc-guard",
// BBs, put this global into a named section, and pass this section's bounds
// to __sanitizer_cov_pcs_init.
// This way the coverage instrumentation does not need to acquire the PCs
-// at run-time. Works with trace-pc-guard and inline-8bit-counters.
+// at run-time. Works with trace-pc-guard, inline-8bit-counters, and
+// inline-bool-flag.
static cl::opt<bool> ClCreatePCTable("sanitizer-coverage-pc-table",
cl::desc("create a static PC table"),
cl::Hidden, cl::init(false));
@@ -112,6 +119,11 @@ static cl::opt<bool>
cl::Hidden, cl::init(false));
static cl::opt<bool>
+ ClInlineBoolFlag("sanitizer-coverage-inline-bool-flag",
+ cl::desc("sets a boolean flag for every edge"), cl::Hidden,
+ cl::init(false));
+
+static cl::opt<bool>
ClCMPTracing("sanitizer-coverage-trace-compares",
cl::desc("Tracing of CMP and similar instructions"),
cl::Hidden, cl::init(false));
@@ -169,11 +181,13 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) {
Options.TracePC |= ClTracePC;
Options.TracePCGuard |= ClTracePCGuard;
Options.Inline8bitCounters |= ClInline8bitCounters;
+ Options.InlineBoolFlag |= ClInlineBoolFlag;
Options.PCTable |= ClCreatePCTable;
Options.NoPrune |= !ClPruneBlocks;
Options.StackDepth |= ClStackDepth;
if (!Options.TracePCGuard && !Options.TracePC &&
- !Options.Inline8bitCounters && !Options.StackDepth)
+ !Options.Inline8bitCounters && !Options.StackDepth &&
+ !Options.InlineBoolFlag)
Options.TracePCGuard = true; // TracePCGuard is default.
return Options;
}
@@ -185,8 +199,11 @@ using PostDomTreeCallback =
class ModuleSanitizerCoverage {
public:
ModuleSanitizerCoverage(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions())
- : Options(OverrideFromCL(Options)) {}
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
+ const SpecialCaseList *Allowlist = nullptr,
+ const SpecialCaseList *Blocklist = nullptr)
+ : Options(OverrideFromCL(Options)), Allowlist(Allowlist),
+ Blocklist(Blocklist) {}
bool instrumentModule(Module &M, DomTreeCallback DTCallback,
PostDomTreeCallback PDTCallback);
@@ -233,9 +250,8 @@ private:
FunctionCallee SanCovTraceGepFunction;
FunctionCallee SanCovTraceSwitchFunction;
GlobalVariable *SanCovLowestStack;
- InlineAsm *EmptyAsm;
Type *IntptrTy, *IntptrPtrTy, *Int64Ty, *Int64PtrTy, *Int32Ty, *Int32PtrTy,
- *Int16Ty, *Int8Ty, *Int8PtrTy;
+ *Int16Ty, *Int8Ty, *Int8PtrTy, *Int1Ty, *Int1PtrTy;
Module *CurModule;
std::string CurModuleUniqueId;
Triple TargetTriple;
@@ -244,23 +260,38 @@ private:
GlobalVariable *FunctionGuardArray; // for trace-pc-guard.
GlobalVariable *Function8bitCounterArray; // for inline-8bit-counters.
+ GlobalVariable *FunctionBoolArray; // for inline-bool-flag.
GlobalVariable *FunctionPCsArray; // for pc-table.
SmallVector<GlobalValue *, 20> GlobalsToAppendToUsed;
SmallVector<GlobalValue *, 20> GlobalsToAppendToCompilerUsed;
SanitizerCoverageOptions Options;
+
+ const SpecialCaseList *Allowlist;
+ const SpecialCaseList *Blocklist;
};
class ModuleSanitizerCoverageLegacyPass : public ModulePass {
public:
ModuleSanitizerCoverageLegacyPass(
- const SanitizerCoverageOptions &Options = SanitizerCoverageOptions())
+ const SanitizerCoverageOptions &Options = SanitizerCoverageOptions(),
+ const std::vector<std::string> &AllowlistFiles =
+ std::vector<std::string>(),
+ const std::vector<std::string> &BlocklistFiles =
+ std::vector<std::string>())
: ModulePass(ID), Options(Options) {
+ if (AllowlistFiles.size() > 0)
+ Allowlist = SpecialCaseList::createOrDie(AllowlistFiles,
+ *vfs::getRealFileSystem());
+ if (BlocklistFiles.size() > 0)
+ Blocklist = SpecialCaseList::createOrDie(BlocklistFiles,
+ *vfs::getRealFileSystem());
initializeModuleSanitizerCoverageLegacyPassPass(
*PassRegistry::getPassRegistry());
}
bool runOnModule(Module &M) override {
- ModuleSanitizerCoverage ModuleSancov(Options);
+ ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
+ Blocklist.get());
auto DTCallback = [this](Function &F) -> const DominatorTree * {
return &this->getAnalysis<DominatorTreeWrapperPass>(F).getDomTree();
};
@@ -281,13 +312,17 @@ public:
private:
SanitizerCoverageOptions Options;
+
+ std::unique_ptr<SpecialCaseList> Allowlist;
+ std::unique_ptr<SpecialCaseList> Blocklist;
};
} // namespace
PreservedAnalyses ModuleSanitizerCoveragePass::run(Module &M,
ModuleAnalysisManager &MAM) {
- ModuleSanitizerCoverage ModuleSancov(Options);
+ ModuleSanitizerCoverage ModuleSancov(Options, Allowlist.get(),
+ Blocklist.get());
auto &FAM = MAM.getResult<FunctionAnalysisManagerModuleProxy>(M).getManager();
auto DTCallback = [&FAM](Function &F) -> const DominatorTree * {
return &FAM.getResult<DominatorTreeAnalysis>(F);
@@ -360,6 +395,12 @@ bool ModuleSanitizerCoverage::instrumentModule(
Module &M, DomTreeCallback DTCallback, PostDomTreeCallback PDTCallback) {
if (Options.CoverageType == SanitizerCoverageOptions::SCK_None)
return false;
+ if (Allowlist &&
+ !Allowlist->inSection("coverage", "src", M.getSourceFileName()))
+ return false;
+ if (Blocklist &&
+ Blocklist->inSection("coverage", "src", M.getSourceFileName()))
+ return false;
C = &(M.getContext());
DL = &M.getDataLayout();
CurModule = &M;
@@ -367,6 +408,7 @@ bool ModuleSanitizerCoverage::instrumentModule(
TargetTriple = Triple(M.getTargetTriple());
FunctionGuardArray = nullptr;
Function8bitCounterArray = nullptr;
+ FunctionBoolArray = nullptr;
FunctionPCsArray = nullptr;
IntptrTy = Type::getIntNTy(*C, DL->getPointerSizeInBits());
IntptrPtrTy = PointerType::getUnqual(IntptrTy);
@@ -375,10 +417,12 @@ bool ModuleSanitizerCoverage::instrumentModule(
Int64PtrTy = PointerType::getUnqual(IRB.getInt64Ty());
Int32PtrTy = PointerType::getUnqual(IRB.getInt32Ty());
Int8PtrTy = PointerType::getUnqual(IRB.getInt8Ty());
+ Int1PtrTy = PointerType::getUnqual(IRB.getInt1Ty());
Int64Ty = IRB.getInt64Ty();
Int32Ty = IRB.getInt32Ty();
Int16Ty = IRB.getInt16Ty();
Int8Ty = IRB.getInt8Ty();
+ Int1Ty = IRB.getInt1Ty();
SanCovTracePCIndir =
M.getOrInsertFunction(SanCovTracePCIndirName, VoidTy, IntptrTy);
@@ -440,11 +484,6 @@ bool ModuleSanitizerCoverage::instrumentModule(
if (Options.StackDepth && !SanCovLowestStack->isDeclaration())
SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy));
- // We insert an empty inline asm after cov callbacks to avoid callback merge.
- EmptyAsm = InlineAsm::get(FunctionType::get(IRB.getVoidTy(), false),
- StringRef(""), StringRef(""),
- /*hasSideEffects=*/true);
-
SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy);
SanCovTracePCGuard =
M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, Int32PtrTy);
@@ -462,6 +501,11 @@ bool ModuleSanitizerCoverage::instrumentModule(
Ctor = CreateInitCallsForSections(M, SanCovModuleCtor8bitCountersName,
SanCov8bitCountersInitName, Int8PtrTy,
SanCovCountersSectionName);
+ if (FunctionBoolArray) {
+ Ctor = CreateInitCallsForSections(M, SanCovModuleCtorBoolFlagName,
+ SanCovBoolFlagInitName, Int1PtrTy,
+ SanCovBoolFlagSectionName);
+ }
if (Ctor && Options.PCTable) {
auto SecStartEnd = CreateSecStartEnd(M, SanCovPCsSectionName, IntptrPtrTy);
FunctionCallee InitFunction = declareSanitizerInitFunction(
@@ -589,6 +633,10 @@ void ModuleSanitizerCoverage::instrumentFunction(
if (F.hasPersonalityFn() &&
isAsynchronousEHPersonality(classifyEHPersonality(F.getPersonalityFn())))
return;
+ if (Allowlist && !Allowlist->inSection("coverage", "fun", F.getName()))
+ return;
+ if (Blocklist && Blocklist->inSection("coverage", "fun", F.getName()))
+ return;
if (Options.CoverageType >= SanitizerCoverageOptions::SCK_Edge)
SplitAllCriticalEdges(F, CriticalEdgeSplittingOptions().setIgnoreUnreachableDests());
SmallVector<Instruction *, 8> IndirCalls;
@@ -607,8 +655,8 @@ void ModuleSanitizerCoverage::instrumentFunction(
BlocksToInstrument.push_back(&BB);
for (auto &Inst : BB) {
if (Options.IndirectCalls) {
- CallSite CS(&Inst);
- if (CS && !CS.getCalledFunction())
+ CallBase *CB = dyn_cast<CallBase>(&Inst);
+ if (CB && !CB->getCalledFunction())
IndirCalls.push_back(&Inst);
}
if (Options.TraceCmp) {
@@ -653,9 +701,7 @@ GlobalVariable *ModuleSanitizerCoverage::CreateFunctionLocalArrayInSection(
GetOrCreateFunctionComdat(F, TargetTriple, CurModuleUniqueId))
Array->setComdat(Comdat);
Array->setSection(getSectionName(Section));
- Array->setAlignment(Align(Ty->isPointerTy()
- ? DL->getPointerSize()
- : Ty->getPrimitiveSizeInBits() / 8));
+ Array->setAlignment(Align(DL->getTypeStoreSize(Ty).getFixedSize()));
GlobalsToAppendToUsed.push_back(Array);
GlobalsToAppendToCompilerUsed.push_back(Array);
MDNode *MD = MDNode::get(F.getContext(), ValueAsMetadata::get(&F));
@@ -701,6 +747,9 @@ void ModuleSanitizerCoverage::CreateFunctionLocalArrays(
if (Options.Inline8bitCounters)
Function8bitCounterArray = CreateFunctionLocalArrayInSection(
AllBlocks.size(), F, Int8Ty, SanCovCountersSectionName);
+ if (Options.InlineBoolFlag)
+ FunctionBoolArray = CreateFunctionLocalArrayInSection(
+ AllBlocks.size(), F, Int1Ty, SanCovBoolFlagSectionName);
if (Options.PCTable)
FunctionPCsArray = CreatePCArray(F, AllBlocks);
@@ -727,11 +776,12 @@ void ModuleSanitizerCoverage::InjectCoverageForIndirectCalls(
Function &F, ArrayRef<Instruction *> IndirCalls) {
if (IndirCalls.empty())
return;
- assert(Options.TracePC || Options.TracePCGuard || Options.Inline8bitCounters);
+ assert(Options.TracePC || Options.TracePCGuard ||
+ Options.Inline8bitCounters || Options.InlineBoolFlag);
for (auto I : IndirCalls) {
IRBuilder<> IRB(I);
- CallSite CS(I);
- Value *Callee = CS.getCalledValue();
+ CallBase &CB = cast<CallBase>(*I);
+ Value *Callee = CB.getCalledOperand();
if (isa<InlineAsm>(Callee))
continue;
IRB.CreateCall(SanCovTracePCIndir, IRB.CreatePointerCast(Callee, IntptrTy));
@@ -865,16 +915,15 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
IRBuilder<> IRB(&*IP);
IRB.SetCurrentDebugLocation(EntryLoc);
if (Options.TracePC) {
- IRB.CreateCall(SanCovTracePC); // gets the PC using GET_CALLER_PC.
- IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ IRB.CreateCall(SanCovTracePC)
+ ->setCannotMerge(); // gets the PC using GET_CALLER_PC.
}
if (Options.TracePCGuard) {
auto GuardPtr = IRB.CreateIntToPtr(
IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
ConstantInt::get(IntptrTy, Idx * 4)),
Int32PtrTy);
- IRB.CreateCall(SanCovTracePCGuard, GuardPtr);
- IRB.CreateCall(EmptyAsm, {}); // Avoids callback merge.
+ IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
}
if (Options.Inline8bitCounters) {
auto CounterPtr = IRB.CreateGEP(
@@ -886,6 +935,18 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
SetNoSanitizeMetadata(Load);
SetNoSanitizeMetadata(Store);
}
+ if (Options.InlineBoolFlag) {
+ auto FlagPtr = IRB.CreateGEP(
+ FunctionBoolArray->getValueType(), FunctionBoolArray,
+ {ConstantInt::get(IntptrTy, 0), ConstantInt::get(IntptrTy, Idx)});
+ auto Load = IRB.CreateLoad(Int1Ty, FlagPtr);
+ auto ThenTerm =
+ SplitBlockAndInsertIfThen(IRB.CreateIsNull(Load), &*IP, false);
+ IRBuilder<> ThenIRB(ThenTerm);
+ auto Store = ThenIRB.CreateStore(ConstantInt::getTrue(Int1Ty), FlagPtr);
+ SetNoSanitizeMetadata(Load);
+ SetNoSanitizeMetadata(Store);
+ }
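// Editor's source-level sketch of the inline-bool-flag instrumentation above
// (the flag name is illustrative): each covered block effectively executes
//   if (!CovFlagForThisBlock)
//     CovFlagForThisBlock = true;  // load and store carry nosanitize metadata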
if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
// Check stack depth. If it's the deepest so far, record it.
Module *M = F.getParent();
@@ -910,6 +971,8 @@ ModuleSanitizerCoverage::getSectionName(const std::string &Section) const {
if (TargetTriple.isOSBinFormatCOFF()) {
if (Section == SanCovCountersSectionName)
return ".SCOV$CM";
+ if (Section == SanCovBoolFlagSectionName)
+ return ".SCOV$BM";
if (Section == SanCovPCsSectionName)
return ".SCOVP$M";
return ".SCOV$GM"; // For SanCovGuardsSectionName.
@@ -943,6 +1006,9 @@ INITIALIZE_PASS_END(ModuleSanitizerCoverageLegacyPass, "sancov",
"Pass for instrumenting coverage on functions", false,
false)
ModulePass *llvm::createModuleSanitizerCoverageLegacyPassPass(
- const SanitizerCoverageOptions &Options) {
- return new ModuleSanitizerCoverageLegacyPass(Options);
+ const SanitizerCoverageOptions &Options,
+ const std::vector<std::string> &AllowlistFiles,
+ const std::vector<std::string> &BlocklistFiles) {
+ return new ModuleSanitizerCoverageLegacyPass(Options, AllowlistFiles,
+ BlocklistFiles);
}
diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
index 9b7edad3444b..c911b37afac7 100644
--- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp
@@ -68,6 +68,14 @@ static cl::opt<bool> ClInstrumentAtomics(
static cl::opt<bool> ClInstrumentMemIntrinsics(
"tsan-instrument-memintrinsics", cl::init(true),
cl::desc("Instrument memintrinsics (memset/memcpy/memmove)"), cl::Hidden);
+static cl::opt<bool> ClDistinguishVolatile(
+ "tsan-distinguish-volatile", cl::init(false),
+ cl::desc("Emit special instrumentation for accesses to volatiles"),
+ cl::Hidden);
+static cl::opt<bool> ClInstrumentReadBeforeWrite(
+ "tsan-instrument-read-before-write", cl::init(false),
+ cl::desc("Do not eliminate read instrumentation for read-before-writes"),
+ cl::Hidden);
STATISTIC(NumInstrumentedReads, "Number of instrumented reads");
STATISTIC(NumInstrumentedWrites, "Number of instrumented writes");
@@ -118,6 +126,10 @@ private:
FunctionCallee TsanWrite[kNumberOfAccessSizes];
FunctionCallee TsanUnalignedRead[kNumberOfAccessSizes];
FunctionCallee TsanUnalignedWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanVolatileRead[kNumberOfAccessSizes];
+ FunctionCallee TsanVolatileWrite[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedVolatileRead[kNumberOfAccessSizes];
+ FunctionCallee TsanUnalignedVolatileWrite[kNumberOfAccessSizes];
FunctionCallee TsanAtomicLoad[kNumberOfAccessSizes];
FunctionCallee TsanAtomicStore[kNumberOfAccessSizes];
FunctionCallee TsanAtomicRMW[AtomicRMWInst::LAST_BINOP + 1]
@@ -131,7 +143,9 @@ private:
};
struct ThreadSanitizerLegacyPass : FunctionPass {
- ThreadSanitizerLegacyPass() : FunctionPass(ID) {}
+ ThreadSanitizerLegacyPass() : FunctionPass(ID) {
+ initializeThreadSanitizerLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
StringRef getPassName() const override;
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnFunction(Function &F) override;
@@ -236,6 +250,24 @@ void ThreadSanitizer::initialize(Module &M) {
TsanUnalignedWrite[i] = M.getOrInsertFunction(
UnalignedWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+ SmallString<64> VolatileReadName("__tsan_volatile_read" + ByteSizeStr);
+ TsanVolatileRead[i] = M.getOrInsertFunction(
+ VolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> VolatileWriteName("__tsan_volatile_write" + ByteSizeStr);
+ TsanVolatileWrite[i] = M.getOrInsertFunction(
+ VolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedVolatileReadName("__tsan_unaligned_volatile_read" +
+ ByteSizeStr);
+ TsanUnalignedVolatileRead[i] = M.getOrInsertFunction(
+ UnalignedVolatileReadName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
+ SmallString<64> UnalignedVolatileWriteName(
+ "__tsan_unaligned_volatile_write" + ByteSizeStr);
+ TsanUnalignedVolatileWrite[i] = M.getOrInsertFunction(
+ UnalignedVolatileWriteName, Attr, IRB.getVoidTy(), IRB.getInt8PtrTy());
+
Type *Ty = Type::getIntNTy(M.getContext(), BitSize);
Type *PtrTy = Ty->getPointerTo();
SmallString<32> AtomicLoadName("__tsan_atomic" + BitSizeStr + "_load");
@@ -246,28 +278,28 @@ void ThreadSanitizer::initialize(Module &M) {
TsanAtomicStore[i] = M.getOrInsertFunction(
AtomicStoreName, Attr, IRB.getVoidTy(), PtrTy, Ty, OrdTy);
- for (int op = AtomicRMWInst::FIRST_BINOP;
- op <= AtomicRMWInst::LAST_BINOP; ++op) {
- TsanAtomicRMW[op][i] = nullptr;
+ for (unsigned Op = AtomicRMWInst::FIRST_BINOP;
+ Op <= AtomicRMWInst::LAST_BINOP; ++Op) {
+ TsanAtomicRMW[Op][i] = nullptr;
const char *NamePart = nullptr;
- if (op == AtomicRMWInst::Xchg)
+ if (Op == AtomicRMWInst::Xchg)
NamePart = "_exchange";
- else if (op == AtomicRMWInst::Add)
+ else if (Op == AtomicRMWInst::Add)
NamePart = "_fetch_add";
- else if (op == AtomicRMWInst::Sub)
+ else if (Op == AtomicRMWInst::Sub)
NamePart = "_fetch_sub";
- else if (op == AtomicRMWInst::And)
+ else if (Op == AtomicRMWInst::And)
NamePart = "_fetch_and";
- else if (op == AtomicRMWInst::Or)
+ else if (Op == AtomicRMWInst::Or)
NamePart = "_fetch_or";
- else if (op == AtomicRMWInst::Xor)
+ else if (Op == AtomicRMWInst::Xor)
NamePart = "_fetch_xor";
- else if (op == AtomicRMWInst::Nand)
+ else if (Op == AtomicRMWInst::Nand)
NamePart = "_fetch_nand";
else
continue;
SmallString<32> RMWName("__tsan_atomic" + itostr(BitSize) + NamePart);
- TsanAtomicRMW[op][i] =
+ TsanAtomicRMW[Op][i] =
M.getOrInsertFunction(RMWName, Attr, Ty, PtrTy, Ty, OrdTy);
}
@@ -385,7 +417,7 @@ void ThreadSanitizer::chooseInstructionsToInstrument(
Value *Addr = Load->getPointerOperand();
if (!shouldInstrumentReadWriteFromAddress(I->getModule(), Addr))
continue;
- if (WriteTargets.count(Addr)) {
+ if (!ClInstrumentReadBeforeWrite && WriteTargets.count(Addr)) {
// We will write to this temp, so no reason to analyze the read.
NumOmittedReadsBeforeWrite++;
continue;
@@ -441,6 +473,11 @@ bool ThreadSanitizer::sanitizeFunction(Function &F,
// the module constructor.
if (F.getName() == kTsanModuleCtorName)
return false;
+ // Naked functions cannot have a prologue/epilogue
+ // (__tsan_func_entry/__tsan_func_exit) generated, so don't instrument them at
+ // all.
+ if (F.hasFnAttribute(Attribute::Naked))
+ return false;
initialize(*F.getParent());
SmallVector<Instruction*, 8> AllLoadsAndStores;
SmallVector<Instruction*, 8> LocalLoadsAndStores;
@@ -560,13 +597,24 @@ bool ThreadSanitizer::instrumentLoadOrStore(Instruction *I,
const unsigned Alignment = IsWrite
? cast<StoreInst>(I)->getAlignment()
: cast<LoadInst>(I)->getAlignment();
+ const bool IsVolatile =
+ ClDistinguishVolatile && (IsWrite ? cast<StoreInst>(I)->isVolatile()
+ : cast<LoadInst>(I)->isVolatile());
Type *OrigTy = cast<PointerType>(Addr->getType())->getElementType();
const uint32_t TypeSize = DL.getTypeStoreSizeInBits(OrigTy);
FunctionCallee OnAccessFunc = nullptr;
- if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0)
- OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
- else
- OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+ if (Alignment == 0 || Alignment >= 8 || (Alignment % (TypeSize / 8)) == 0) {
+ if (IsVolatile)
+ OnAccessFunc = IsWrite ? TsanVolatileWrite[Idx] : TsanVolatileRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanWrite[Idx] : TsanRead[Idx];
+ } else {
+ if (IsVolatile)
+ OnAccessFunc = IsWrite ? TsanUnalignedVolatileWrite[Idx]
+ : TsanUnalignedVolatileRead[Idx];
+ else
+ OnAccessFunc = IsWrite ? TsanUnalignedWrite[Idx] : TsanUnalignedRead[Idx];
+ }
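// Editor's illustrative summary of the dispatch above (callback names follow
// the "__tsan_volatile_read"/"__tsan_volatile_write" + byte-size scheme
// declared in initialize()):
//   volatile int v; int x = v;  // -> __tsan_volatile_read4(&v) under
//                               //    -tsan-distinguish-volatile
//   int y = *p;                 // -> __tsan_read4(p), as before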
IRB.CreateCall(OnAccessFunc, IRB.CreatePointerCast(Addr, IRB.getInt8PtrTy()));
if (IsWrite) NumInstrumentedWrites++;
else NumInstrumentedReads++;
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
index 604726d4f40f..cd4f636ff132 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.cpp
@@ -38,7 +38,7 @@ using PluginChainFinal = PluginChain<VP_PLUGIN_LIST>;
template <> class PluginChain<> {
public:
- PluginChain(Function &F) {}
+ PluginChain(Function &F, TargetLibraryInfo &TLI) {}
void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {}
};
@@ -48,7 +48,8 @@ class PluginChain<PluginT, Ts...> : public PluginChain<Ts...> {
using Base = PluginChain<Ts...>;
public:
- PluginChain(Function &F) : PluginChain<Ts...>(F), Plugin(F) {}
+ PluginChain(Function &F, TargetLibraryInfo &TLI)
+ : PluginChain<Ts...>(F, TLI), Plugin(F, TLI) {}
void get(InstrProfValueKind K, std::vector<CandidateInfo> &Candidates) {
if (K == PluginT::Kind)
@@ -65,8 +66,9 @@ public:
using PluginChainFinal::PluginChainFinal;
};
-ValueProfileCollector::ValueProfileCollector(Function &F)
- : PImpl(new ValueProfileCollectorImpl(F)) {}
+ValueProfileCollector::ValueProfileCollector(Function &F,
+ TargetLibraryInfo &TLI)
+ : PImpl(new ValueProfileCollectorImpl(F, TLI)) {}
ValueProfileCollector::~ValueProfileCollector() = default;
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
index ff883c8d0c77..c3f549c2e7cc 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfileCollector.h
@@ -16,6 +16,7 @@
#ifndef LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
#define LLVM_ANALYSIS_PROFILE_GEN_ANALYSIS_H
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PassManager.h"
#include "llvm/Pass.h"
@@ -58,7 +59,7 @@ public:
Instruction *AnnotatedInst; // Where metadata is attached.
};
- ValueProfileCollector(Function &Fn);
+ ValueProfileCollector(Function &Fn, TargetLibraryInfo &TLI);
ValueProfileCollector(ValueProfileCollector &&) = delete;
ValueProfileCollector &operator=(ValueProfileCollector &&) = delete;
diff --git a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
index 4cc4c6c848c3..8d0cf5843ebc 100644
--- a/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
+++ b/llvm/lib/Transforms/Instrumentation/ValueProfilePlugins.inc
@@ -20,15 +20,19 @@
using namespace llvm;
using CandidateInfo = ValueProfileCollector::CandidateInfo;
+extern cl::opt<bool> MemOPOptMemcmpBcmp;
+
///--------------------------- MemIntrinsicPlugin ------------------------------
class MemIntrinsicPlugin : public InstVisitor<MemIntrinsicPlugin> {
Function &F;
+ TargetLibraryInfo &TLI;
std::vector<CandidateInfo> *Candidates;
public:
static constexpr InstrProfValueKind Kind = IPVK_MemOPSize;
- MemIntrinsicPlugin(Function &Fn) : F(Fn), Candidates(nullptr) {}
+ MemIntrinsicPlugin(Function &Fn, TargetLibraryInfo &TLI)
+ : F(Fn), TLI(TLI), Candidates(nullptr) {}
void run(std::vector<CandidateInfo> &Cs) {
Candidates = &Cs;
@@ -45,6 +49,24 @@ public:
Instruction *AnnotatedInst = &MI;
Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
}
+ void visitCallInst(CallInst &CI) {
+ if (!MemOPOptMemcmpBcmp)
+ return;
+ auto *F = CI.getCalledFunction();
+ if (!F)
+ return;
+ LibFunc Func;
+ if (TLI.getLibFunc(CI, Func) &&
+ (Func == LibFunc_memcmp || Func == LibFunc_bcmp)) {
+ Value *Length = CI.getArgOperand(2);
+ // Do not instrument calls with a constant length.
+ if (isa<ConstantInt>(Length))
+ return;
+ Instruction *InsertPt = &CI;
+ Instruction *AnnotatedInst = &CI;
+ Candidates->emplace_back(CandidateInfo{Length, InsertPt, AnnotatedInst});
+ }
+ }
};
///------------------------ IndirectCallPromotionPlugin ------------------------
@@ -54,12 +76,12 @@ class IndirectCallPromotionPlugin {
public:
static constexpr InstrProfValueKind Kind = IPVK_IndirectCallTarget;
- IndirectCallPromotionPlugin(Function &Fn) : F(Fn) {}
+ IndirectCallPromotionPlugin(Function &Fn, TargetLibraryInfo &TLI) : F(Fn) {}
void run(std::vector<CandidateInfo> &Candidates) {
- std::vector<Instruction *> Result = findIndirectCalls(F);
+ std::vector<CallBase *> Result = findIndirectCalls(F);
for (Instruction *I : Result) {
- Value *Callee = CallSite(I).getCalledValue();
+ Value *Callee = cast<CallBase>(I)->getCalledOperand();
Instruction *InsertPt = I;
Instruction *AnnotatedInst = I;
Candidates.emplace_back(CandidateInfo{Callee, InsertPt, AnnotatedInst});
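
The new visitCallInst hook above gates on the MemOPOptMemcmpBcmp flag and uses TargetLibraryInfo to recognize memcmp/bcmp calls before recording their length operand for value profiling. A minimal standalone sketch of that recognition step (the helper name getProfilableCmpLength is illustrative, not part of the patch):

#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Returns the non-constant length operand of a memcmp/bcmp call, or nullptr if
// the call is not one of those library functions or its length is a constant
// (constant lengths are already optimizable without profile data).
static Value *getProfilableCmpLength(CallInst &CI, const TargetLibraryInfo &TLI) {
  LibFunc Func;
  if (!CI.getCalledFunction() || !TLI.getLibFunc(CI, Func))
    return nullptr;
  if (Func != LibFunc_memcmp && Func != LibFunc_bcmp)
    return nullptr;
  Value *Length = CI.getArgOperand(2);
  return isa<ConstantInt>(Length) ? nullptr : Length;
}
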
diff --git a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
index e1e95cd6a407..258dc9240815 100644
--- a/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
+++ b/llvm/lib/Transforms/ObjCARC/ARCRuntimeEntryPoints.h
@@ -22,19 +22,15 @@
#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
#define LLVM_LIB_TRANSFORMS_OBJCARC_ARCRUNTIMEENTRYPOINTS_H
-#include "llvm/ADT/StringRef.h"
#include "llvm/IR/Attributes.h"
-#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/Module.h"
-#include "llvm/IR/Type.h"
#include "llvm/Support/ErrorHandling.h"
#include <cassert>
namespace llvm {
class Function;
-class LLVMContext;
+class Module;
namespace objcarc {
diff --git a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
index e8f8fb6f3a7c..46bc586fe688 100644
--- a/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
+++ b/llvm/lib/Transforms/ObjCARC/DependencyAnalysis.cpp
@@ -95,10 +95,9 @@ bool llvm::objcarc::CanUse(const Instruction *Inst, const Value *Ptr,
// of any other dynamic reference-counted pointers.
if (!IsPotentialRetainableObjPtr(ICI->getOperand(1), *PA.getAA()))
return false;
- } else if (auto CS = ImmutableCallSite(Inst)) {
+ } else if (const auto *CS = dyn_cast<CallBase>(Inst)) {
// For calls, just check the arguments (and not the callee operand).
- for (ImmutableCallSite::arg_iterator OI = CS.arg_begin(),
- OE = CS.arg_end(); OI != OE; ++OI) {
+ for (auto OI = CS->arg_begin(), OE = CS->arg_end(); OI != OE; ++OI) {
const Value *Op = *OI;
if (IsPotentialRetainableObjPtr(Op, *PA.getAA()) &&
PA.related(Ptr, Op, DL))
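
The change above follows the tree-wide retirement of ImmutableCallSite/CallSite: a call or invoke is matched directly with dyn_cast<CallBase>, arguments are iterated via arg_begin()/arg_end(), and the callee is read with getCalledOperand(). A minimal sketch of the replacement pattern (the helper name is illustrative):

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// ImmutableCallSite(Inst) used to yield a null call site for non-call
// instructions; dyn_cast<CallBase> now plays the same role.
static const Value *getCalledOperandIfCall(const Instruction *Inst) {
  const auto *CB = dyn_cast<CallBase>(Inst);
  return CB ? CB->getCalledOperand() : nullptr;
}
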
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARC.h b/llvm/lib/Transforms/ObjCARC/ObjCARC.h
index d465630800b9..b496842fcfc5 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARC.h
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARC.h
@@ -22,24 +22,12 @@
#ifndef LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
#define LLVM_LIB_TRANSFORMS_OBJCARC_OBJCARC_H
-#include "llvm/ADT/StringSwitch.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ObjCARCAnalysisUtils.h"
#include "llvm/Analysis/ObjCARCInstKind.h"
-#include "llvm/Analysis/Passes.h"
#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/Analysis/ValueTracking.h"
-#include "llvm/IR/CallSite.h"
-#include "llvm/IR/InstIterator.h"
-#include "llvm/IR/Module.h"
-#include "llvm/Pass.h"
#include "llvm/Transforms/ObjCARC.h"
namespace llvm {
-class raw_ostream;
-}
-
-namespace llvm {
namespace objcarc {
/// Erase the given instruction.
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
index 7a01ec967fb5..ac1db27f5e64 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCAPElim.cpp
@@ -41,7 +41,7 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override;
bool runOnModule(Module &M) override;
- static bool MayAutorelease(ImmutableCallSite CS, unsigned Depth = 0);
+ static bool MayAutorelease(const CallBase &CB, unsigned Depth = 0);
static bool OptimizeBB(BasicBlock *BB);
public:
@@ -68,18 +68,17 @@ void ObjCARCAPElim::getAnalysisUsage(AnalysisUsage &AU) const {
/// Interprocedurally determine if calls made by the given call site can
/// possibly produce autoreleases.
-bool ObjCARCAPElim::MayAutorelease(ImmutableCallSite CS, unsigned Depth) {
- if (const Function *Callee = CS.getCalledFunction()) {
+bool ObjCARCAPElim::MayAutorelease(const CallBase &CB, unsigned Depth) {
+ if (const Function *Callee = CB.getCalledFunction()) {
if (!Callee->hasExactDefinition())
return true;
for (const BasicBlock &BB : *Callee) {
for (const Instruction &I : BB)
- if (ImmutableCallSite JCS = ImmutableCallSite(&I))
+ if (const CallBase *JCB = dyn_cast<CallBase>(&I))
// This recursion depth limit is arbitrary. It's just great
// enough to cover known interesting testcases.
- if (Depth < 3 &&
- !JCS.onlyReadsMemory() &&
- MayAutorelease(JCS, Depth + 1))
+ if (Depth < 3 && !JCB->onlyReadsMemory() &&
+ MayAutorelease(*JCB, Depth + 1))
return true;
}
return false;
@@ -115,7 +114,7 @@ bool ObjCARCAPElim::OptimizeBB(BasicBlock *BB) {
Push = nullptr;
break;
case ARCInstKind::CallOrUser:
- if (MayAutorelease(ImmutableCallSite(Inst)))
+ if (MayAutorelease(cast<CallBase>(*Inst)))
Push = nullptr;
break;
default:
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
index ecf8220ae95d..7fd4857c4490 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCContract.cpp
@@ -33,6 +33,7 @@
#include "llvm/Analysis/EHPersonalities.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/InlineAsm.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Operator.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
@@ -47,10 +48,6 @@ using namespace llvm::objcarc;
STATISTIC(NumPeeps, "Number of calls peephole-optimized");
STATISTIC(NumStoreStrongs, "Number objc_storeStrong calls formed");
-static cl::opt<unsigned> MaxBBSize("arc-contract-max-bb-size", cl::Hidden,
- cl::desc("Maximum basic block size to discover the dominance relation of "
- "two instructions in the same basic block"), cl::init(65535));
-
//===----------------------------------------------------------------------===//
// Declarations
//===----------------------------------------------------------------------===//
@@ -119,8 +116,7 @@ namespace {
/// return value. We do this late so we do not disrupt the dataflow analysis in
/// ObjCARCOpt.
bool ObjCARCContract::optimizeRetainCall(Function &F, Instruction *Retain) {
- ImmutableCallSite CS(GetArgRCIdentityRoot(Retain));
- const Instruction *Call = CS.getInstruction();
+ const auto *Call = dyn_cast<CallBase>(GetArgRCIdentityRoot(Retain));
if (!Call)
return false;
if (Call->getParent() != Retain->getParent())
@@ -534,6 +530,7 @@ bool ObjCARCContract::tryToPeepholeInstruction(
return true;
case ARCInstKind::IntrinsicUser:
// Remove calls to @llvm.objc.clang.arc.use(...).
+ Changed = true;
Inst->eraseFromParent();
return true;
default:
@@ -580,23 +577,6 @@ bool ObjCARCContract::runOnFunction(Function &F) {
SmallPtrSet<Instruction *, 4> DependingInstructions;
SmallPtrSet<const BasicBlock *, 4> Visited;
- // Cache the basic block size.
- DenseMap<const BasicBlock *, unsigned> BBSizeMap;
-
- // A lambda that lazily computes the size of a basic block and determines
- // whether the size exceeds MaxBBSize.
- auto IsLargeBB = [&](const BasicBlock *BB) {
- unsigned BBSize;
- auto I = BBSizeMap.find(BB);
-
- if (I != BBSizeMap.end())
- BBSize = I->second;
- else
- BBSize = BBSizeMap[BB] = BB->size();
-
- return BBSize > MaxBBSize;
- };
-
for (inst_iterator I = inst_begin(&F), E = inst_end(&F); I != E;) {
Instruction *Inst = &*I++;
@@ -614,7 +594,7 @@ bool ObjCARCContract::runOnFunction(Function &F) {
// and such; to do the replacement, the argument must have type i8*.
// Function for replacing uses of Arg dominated by Inst.
- auto ReplaceArgUses = [Inst, IsLargeBB, this](Value *Arg) {
+ auto ReplaceArgUses = [Inst, this](Value *Arg) {
// If we're compiling bugpointed code, don't get in trouble.
if (!isa<Instruction>(Arg) && !isa<Argument>(Arg))
return;
@@ -626,17 +606,6 @@ bool ObjCARCContract::runOnFunction(Function &F) {
Use &U = *UI++;
unsigned OperandNo = U.getOperandNo();
- // Don't replace the uses if Inst and the user belong to the same basic
- // block and the size of the basic block is large. We don't want to call
- // DominatorTree::dominate in that case. We can remove this check if we
- // can use OrderedBasicBlock to compute the dominance relation between
- // two instructions, but that's not currently possible since it doesn't
- // recompute the instruction ordering when new instructions are inserted
- // to the basic block.
- if (Inst->getParent() == cast<Instruction>(U.getUser())->getParent() &&
- IsLargeBB(Inst->getParent()))
- continue;
-
// If the call's return value dominates a use of the call's argument
// value, rewrite the use to use the return value. We check for
// reachability here because an unreachable call is considered to
@@ -689,7 +658,6 @@ bool ObjCARCContract::runOnFunction(Function &F) {
}
};
-
Value *Arg = cast<CallInst>(Inst)->getArgOperand(0);
Value *OrigArg = Arg;
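
Removing the arc-contract-max-bb-size guard above is viable because same-block dominance queries no longer require a per-query linear scan: BasicBlock keeps a lazily built instruction numbering (Instruction::comesBefore) that DominatorTree can consult. A minimal sketch of the query the simplified ReplaceArgUses path relies on, assuming a DominatorTree DT as in the pass:

#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// Returns true if Def dominates the use U; for a use in the same basic block
// this consults the block's cached instruction ordering instead of scanning.
static bool defDominatesUse(const DominatorTree &DT, const Instruction *Def,
                            const Use &U) {
  return DT.dominates(Def, U);
}
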
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
index 205d8ddf151d..f8d872a7c995 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCExpand.cpp
@@ -30,9 +30,7 @@
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
-#include "llvm/PassAnalysisSupport.h"
#include "llvm/PassRegistry.h"
-#include "llvm/PassSupport.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
diff --git a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
index b80c1675050b..cb1fa804fa11 100644
--- a/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
+++ b/llvm/lib/Transforms/ObjCARC/ObjCARCOpts.cpp
@@ -43,7 +43,6 @@
#include "llvm/Analysis/ObjCARCInstKind.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -610,8 +609,7 @@ bool
ObjCARCOpt::OptimizeRetainRVCall(Function &F, Instruction *RetainRV) {
// Check for the argument being from an immediately preceding call or invoke.
const Value *Arg = GetArgRCIdentityRoot(RetainRV);
- ImmutableCallSite CS(Arg);
- if (const Instruction *Call = CS.getInstruction()) {
+ if (const Instruction *Call = dyn_cast<CallBase>(Arg)) {
if (Call->getParent() == RetainRV->getParent()) {
BasicBlock::const_iterator I(Call);
++I;
@@ -678,6 +676,7 @@ bool ObjCARCOpt::OptimizeInlinedAutoreleaseRVCall(
// Delete the RV pair, starting with the AutoreleaseRV.
AutoreleaseRV->replaceAllUsesWith(
cast<CallInst>(AutoreleaseRV)->getArgOperand(0));
+ Changed = true;
EraseInstruction(AutoreleaseRV);
if (Class == ARCInstKind::RetainRV) {
// AutoreleaseRV and RetainRV cancel out. Delete the RetainRV.
@@ -877,23 +876,49 @@ void ObjCARCOpt::OptimizeIndividualCalls(Function &F) {
optimizeDelayedAutoreleaseRV();
}
+/// This function returns true if the value is inert. An ObjC ARC runtime call
+/// taking an inert operand can be safely deleted.
+static bool isInertARCValue(Value *V, SmallPtrSet<Value *, 1> &VisitedPhis) {
+ V = V->stripPointerCasts();
+
+ if (IsNullOrUndef(V))
+ return true;
+
+ // See if this is a global variable annotated with the 'objc_arc_inert' attribute.
+ if (auto *GV = dyn_cast<GlobalVariable>(V))
+ if (GV->hasAttribute("objc_arc_inert"))
+ return true;
+
+ if (auto *PN = dyn_cast<PHINode>(V)) {
+ // Ignore this phi if it has already been discovered.
+ if (!VisitedPhis.insert(PN).second)
+ return true;
+ // Look through the phi's incoming values.
+ for (Value *Opnd : PN->incoming_values())
+ if (!isInertARCValue(Opnd, VisitedPhis))
+ return false;
+ return true;
+ }
+
+ return false;
+}
+
void ObjCARCOpt::OptimizeIndividualCallImpl(
Function &F, DenseMap<BasicBlock *, ColorVector> &BlockColors,
Instruction *Inst, ARCInstKind Class, const Value *Arg) {
LLVM_DEBUG(dbgs() << "Visiting: Class: " << Class << "; " << *Inst << "\n");
- // Some of the ARC calls can be deleted if their arguments are global
- // variables that are inert in ARC.
- if (IsNoopOnGlobal(Class)) {
- Value *Opnd = Inst->getOperand(0);
- if (auto *GV = dyn_cast<GlobalVariable>(Opnd->stripPointerCasts()))
- if (GV->hasAttribute("objc_arc_inert")) {
- if (!Inst->getType()->isVoidTy())
- Inst->replaceAllUsesWith(Opnd);
- Inst->eraseFromParent();
- return;
- }
- }
+ // We can delete this call if it takes an inert value.
+ SmallPtrSet<Value *, 1> VisitedPhis;
+
+ if (IsNoopOnGlobal(Class))
+ if (isInertARCValue(Inst->getOperand(0), VisitedPhis)) {
+ if (!Inst->getType()->isVoidTy())
+ Inst->replaceAllUsesWith(Inst->getOperand(0));
+ Inst->eraseFromParent();
+ Changed = true;
+ return;
+ }
switch (Class) {
default:
@@ -1544,6 +1569,15 @@ ObjCARCOpt::VisitTopDown(BasicBlock *BB,
}
}
+ // Check that BB and MyStates have the same number of predecessors. This
+ // prevents retain calls that live outside a loop from being moved into the
+ // loop.
+ if (!BB->hasNPredecessors(MyStates.pred_end() - MyStates.pred_begin()))
+ for (auto I = MyStates.top_down_ptr_begin(),
+ E = MyStates.top_down_ptr_end();
+ I != E; ++I)
+ I->second.SetCFGHazardAfflicted(true);
+
LLVM_DEBUG(dbgs() << "Before:\n"
<< BBStates[BB] << "\n"
<< "Performing Dataflow:\n");
@@ -2020,6 +2054,7 @@ void ObjCARCOpt::OptimizeWeakCalls(Function &F) {
// Delete objc_loadWeak calls with no users.
if (Class == ARCInstKind::LoadWeak && Inst->use_empty()) {
Inst->eraseFromParent();
+ Changed = true;
continue;
}
@@ -2310,6 +2345,14 @@ void ObjCARCOpt::OptimizeReturns(Function &F) {
bool HasSafePathToCall = HasSafePathToPredecessorCall(Arg, Retain,
DependingInstructions,
Visited, PA);
+
+ // Don't remove retainRV/autoreleaseRV pairs if the call isn't a tail call.
+ if (HasSafePathToCall &&
+ GetBasicARCInstKind(Retain) == ARCInstKind::RetainRV &&
+ GetBasicARCInstKind(Autorelease) == ARCInstKind::AutoreleaseRV &&
+ !cast<CallInst>(*DependingInstructions.begin())->isTailCall())
+ continue;
+
DependingInstructions.clear();
Visited.clear();
diff --git a/llvm/lib/Transforms/Scalar/ADCE.cpp b/llvm/lib/Transforms/Scalar/ADCE.cpp
index cc3d3bf7cdbf..c3709b9afffb 100644
--- a/llvm/lib/Transforms/Scalar/ADCE.cpp
+++ b/llvm/lib/Transforms/Scalar/ADCE.cpp
@@ -182,7 +182,7 @@ class AggressiveDeadCodeElimination {
/// Identify connected sections of the control flow graph which have
/// dead terminators and rewrite the control flow graph to remove them.
- void updateDeadRegions();
+ bool updateDeadRegions();
/// Set the BlockInfo::PostOrder field based on a post-order
/// numbering of the reverse control flow graph.
@@ -505,7 +505,7 @@ void AggressiveDeadCodeElimination::markLiveBranchesFromControlDependences() {
//===----------------------------------------------------------------------===//
bool AggressiveDeadCodeElimination::removeDeadInstructions() {
// Updates control and dataflow around dead blocks
- updateDeadRegions();
+ bool RegionsUpdated = updateDeadRegions();
LLVM_DEBUG({
for (Instruction &I : instructions(F)) {
@@ -556,11 +556,11 @@ bool AggressiveDeadCodeElimination::removeDeadInstructions() {
I->eraseFromParent();
}
- return !Worklist.empty();
+ return !Worklist.empty() || RegionsUpdated;
}
// A dead region is the set of dead blocks with a common live post-dominator.
-void AggressiveDeadCodeElimination::updateDeadRegions() {
+bool AggressiveDeadCodeElimination::updateDeadRegions() {
LLVM_DEBUG({
dbgs() << "final dead terminator blocks: " << '\n';
for (auto *BB : BlocksWithDeadTerminators)
@@ -570,6 +570,7 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
// Don't compute the post ordering unless we needed it.
bool HavePostOrder = false;
+ bool Changed = false;
for (auto *BB : BlocksWithDeadTerminators) {
auto &Info = BlockInfo[BB];
@@ -624,7 +625,10 @@ void AggressiveDeadCodeElimination::updateDeadRegions() {
.applyUpdates(DeletedEdges);
NumBranchesRemoved += 1;
+ Changed = true;
}
+
+ return Changed;
}
// reverse top-sort order
@@ -685,10 +689,14 @@ PreservedAnalyses ADCEPass::run(Function &F, FunctionAnalysisManager &FAM) {
return PreservedAnalyses::all();
PreservedAnalyses PA;
- PA.preserveSet<CFGAnalyses>();
+ // TODO: We could track if we have actually done CFG changes.
+ if (!RemoveControlFlowFlag)
+ PA.preserveSet<CFGAnalyses>();
+ else {
+ PA.preserve<DominatorTreeAnalysis>();
+ PA.preserve<PostDominatorTreeAnalysis>();
+ }
PA.preserve<GlobalsAA>();
- PA.preserve<DominatorTreeAnalysis>();
- PA.preserve<PostDominatorTreeAnalysis>();
return PA;
}
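
The run() change above makes ADCE's preserved-analysis set conditional: only when control-flow removal is disabled can CFGAnalyses be preserved wholesale, otherwise just the dominator and post-dominator trees the pass keeps updated through DomTreeUpdater. A minimal sketch of that pattern for a hypothetical pass (CFGMayChange stands in for RemoveControlFlowFlag):

#include "llvm/Analysis/PostDominators.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PassManager.h"
using namespace llvm;

// Report different preserved sets depending on whether the pass may have
// changed the CFG during this run.
static PreservedAnalyses makePreserved(bool CFGMayChange) {
  PreservedAnalyses PA;
  if (!CFGMayChange)
    PA.preserveSet<CFGAnalyses>(); // no CFG-dependent analysis was invalidated
  else {
    PA.preserve<DominatorTreeAnalysis>();     // maintained via DomTreeUpdater
    PA.preserve<PostDominatorTreeAnalysis>();
  }
  return PA;
}
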
diff --git a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
index 06deaf3c4f9a..bccf94fc217f 100644
--- a/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
+++ b/llvm/lib/Transforms/Scalar/AlignmentFromAssumptions.cpp
@@ -15,6 +15,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
#define AA_NAME "alignment-from-assumptions"
#define DEBUG_TYPE AA_NAME
@@ -30,6 +31,7 @@
#include "llvm/IR/Constant.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/Debug.h"
@@ -90,9 +92,9 @@ FunctionPass *llvm::createAlignmentFromAssumptionsPass() {
// to a constant. Using SCEV to compute alignment handles the case where
// DiffSCEV is a recurrence with constant start such that the aligned offset
// is constant. e.g. {16,+,32} % 32 -> 16.
-static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
- const SCEV *AlignSCEV,
- ScalarEvolution *SE) {
+static MaybeAlign getNewAlignmentDiff(const SCEV *DiffSCEV,
+ const SCEV *AlignSCEV,
+ ScalarEvolution *SE) {
// DiffUnits = Diff % int64_t(Alignment)
const SCEV *DiffUnitsSCEV = SE->getURemExpr(DiffSCEV, AlignSCEV);
@@ -107,26 +109,30 @@ static unsigned getNewAlignmentDiff(const SCEV *DiffSCEV,
// displaced pointer has the same alignment as the aligned pointer, so
// return the alignment value.
if (!DiffUnits)
- return (unsigned)
- cast<SCEVConstant>(AlignSCEV)->getValue()->getSExtValue();
+ return cast<SCEVConstant>(AlignSCEV)->getValue()->getAlignValue();
// If the displacement is not an exact multiple, but the remainder is a
// constant, then return this remainder (but only if it is a power of 2).
uint64_t DiffUnitsAbs = std::abs(DiffUnits);
if (isPowerOf2_64(DiffUnitsAbs))
- return (unsigned) DiffUnitsAbs;
+ return Align(DiffUnitsAbs);
}
- return 0;
+ return None;
}
// There is an address given by an offset OffSCEV from AASCEV which has an
// alignment AlignSCEV. Use that information, if possible, to compute a new
// alignment for Ptr.
-static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
- const SCEV *OffSCEV, Value *Ptr,
- ScalarEvolution *SE) {
+static Align getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
+ const SCEV *OffSCEV, Value *Ptr,
+ ScalarEvolution *SE) {
const SCEV *PtrSCEV = SE->getSCEV(Ptr);
+ // On a platform with 32-bit allocas but 64-bit flat/global pointer sizes
+ // (e.g. AMDGPU), the effective SCEV types of AASCEV and PtrSCEV
+ // may disagree. Truncate or extend so that they agree.
+ PtrSCEV = SE->getTruncateOrZeroExtend(
+ PtrSCEV, SE->getEffectiveSCEVType(AASCEV->getType()));
const SCEV *DiffSCEV = SE->getMinusSCEV(PtrSCEV, AASCEV);
// On 32-bit platforms, DiffSCEV might now have type i32 -- we've always
@@ -141,13 +147,12 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
<< *AlignSCEV << " and offset " << *OffSCEV
<< " using diff " << *DiffSCEV << "\n");
- unsigned NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE);
- LLVM_DEBUG(dbgs() << "\tnew alignment: " << NewAlignment << "\n");
+ if (MaybeAlign NewAlignment = getNewAlignmentDiff(DiffSCEV, AlignSCEV, SE)) {
+ LLVM_DEBUG(dbgs() << "\tnew alignment: " << DebugStr(NewAlignment) << "\n");
+ return *NewAlignment;
+ }
- if (NewAlignment) {
- return NewAlignment;
- } else if (const SCEVAddRecExpr *DiffARSCEV =
- dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
+ if (const SCEVAddRecExpr *DiffARSCEV = dyn_cast<SCEVAddRecExpr>(DiffSCEV)) {
// The relative offset to the alignment assumption did not yield a constant,
// but we should try harder: if we assume that a is 32-byte aligned, then in
// for (i = 0; i < 1024; i += 4) r += a[i]; not all of the loads from a are
@@ -165,134 +170,67 @@ static unsigned getNewAlignment(const SCEV *AASCEV, const SCEV *AlignSCEV,
// first iteration, and also the alignment using the per-iteration delta.
// If these are the same, then use that answer. Otherwise, use the smaller
// one, but only if it divides the larger one.
- NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
- unsigned NewIncAlignment = getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
-
- LLVM_DEBUG(dbgs() << "\tnew start alignment: " << NewAlignment << "\n");
- LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << NewIncAlignment << "\n");
-
- if (!NewAlignment || !NewIncAlignment) {
- return 0;
- } else if (NewAlignment > NewIncAlignment) {
- if (NewAlignment % NewIncAlignment == 0) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewIncAlignment
- << "\n");
- return NewIncAlignment;
- }
- } else if (NewIncAlignment > NewAlignment) {
- if (NewIncAlignment % NewAlignment == 0) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
- << "\n");
- return NewAlignment;
- }
- } else if (NewIncAlignment == NewAlignment) {
- LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << NewAlignment
+ MaybeAlign NewAlignment = getNewAlignmentDiff(DiffStartSCEV, AlignSCEV, SE);
+ MaybeAlign NewIncAlignment =
+ getNewAlignmentDiff(DiffIncSCEV, AlignSCEV, SE);
+
+ LLVM_DEBUG(dbgs() << "\tnew start alignment: " << DebugStr(NewAlignment)
+ << "\n");
+ LLVM_DEBUG(dbgs() << "\tnew inc alignment: " << DebugStr(NewIncAlignment)
+ << "\n");
+
+ if (!NewAlignment || !NewIncAlignment)
+ return Align(1);
+
+ const Align NewAlign = *NewAlignment;
+ const Align NewIncAlign = *NewIncAlignment;
+ if (NewAlign > NewIncAlign) {
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: "
+ << DebugStr(NewIncAlign) << "\n");
+ return NewIncAlign;
+ }
+ if (NewIncAlign > NewAlign) {
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
<< "\n");
- return NewAlignment;
+ return NewAlign;
}
+ assert(NewIncAlign == NewAlign);
+ LLVM_DEBUG(dbgs() << "\tnew start/inc alignment: " << DebugStr(NewAlign)
+ << "\n");
+ return NewAlign;
}
- return 0;
+ return Align(1);
}
bool AlignmentFromAssumptionsPass::extractAlignmentInfo(CallInst *I,
+ unsigned Idx,
Value *&AAPtr,
const SCEV *&AlignSCEV,
const SCEV *&OffSCEV) {
- // An alignment assume must be a statement about the least-significant
- // bits of the pointer being zero, possibly with some offset.
- ICmpInst *ICI = dyn_cast<ICmpInst>(I->getArgOperand(0));
- if (!ICI)
- return false;
-
- // This must be an expression of the form: x & m == 0.
- if (ICI->getPredicate() != ICmpInst::ICMP_EQ)
- return false;
-
- // Swap things around so that the RHS is 0.
- Value *CmpLHS = ICI->getOperand(0);
- Value *CmpRHS = ICI->getOperand(1);
- const SCEV *CmpLHSSCEV = SE->getSCEV(CmpLHS);
- const SCEV *CmpRHSSCEV = SE->getSCEV(CmpRHS);
- if (CmpLHSSCEV->isZero())
- std::swap(CmpLHS, CmpRHS);
- else if (!CmpRHSSCEV->isZero())
+ Type *Int64Ty = Type::getInt64Ty(I->getContext());
+ OperandBundleUse AlignOB = I->getOperandBundleAt(Idx);
+ if (AlignOB.getTagName() != "align")
return false;
-
- BinaryOperator *CmpBO = dyn_cast<BinaryOperator>(CmpLHS);
- if (!CmpBO || CmpBO->getOpcode() != Instruction::And)
- return false;
-
- // Swap things around so that the right operand of the and is a constant
- // (the mask); we cannot deal with variable masks.
- Value *AndLHS = CmpBO->getOperand(0);
- Value *AndRHS = CmpBO->getOperand(1);
- const SCEV *AndLHSSCEV = SE->getSCEV(AndLHS);
- const SCEV *AndRHSSCEV = SE->getSCEV(AndRHS);
- if (isa<SCEVConstant>(AndLHSSCEV)) {
- std::swap(AndLHS, AndRHS);
- std::swap(AndLHSSCEV, AndRHSSCEV);
- }
-
- const SCEVConstant *MaskSCEV = dyn_cast<SCEVConstant>(AndRHSSCEV);
- if (!MaskSCEV)
- return false;
-
- // The mask must have some trailing ones (otherwise the condition is
- // trivial and tells us nothing about the alignment of the left operand).
- unsigned TrailingOnes = MaskSCEV->getAPInt().countTrailingOnes();
- if (!TrailingOnes)
- return false;
-
- // Cap the alignment at the maximum with which LLVM can deal (and make sure
- // we don't overflow the shift).
- uint64_t Alignment;
- TrailingOnes = std::min(TrailingOnes,
- unsigned(sizeof(unsigned) * CHAR_BIT - 1));
- Alignment = std::min(1u << TrailingOnes, +Value::MaximumAlignment);
-
- Type *Int64Ty = Type::getInt64Ty(I->getParent()->getParent()->getContext());
- AlignSCEV = SE->getConstant(Int64Ty, Alignment);
-
- // The LHS might be a ptrtoint instruction, or it might be the pointer
- // with an offset.
- AAPtr = nullptr;
- OffSCEV = nullptr;
- if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(AndLHS)) {
- AAPtr = PToI->getPointerOperand();
+ assert(AlignOB.Inputs.size() >= 2);
+ AAPtr = AlignOB.Inputs[0].get();
+ // TODO: Consider accumulating the offset to the base.
+ AAPtr = AAPtr->stripPointerCastsSameRepresentation();
+ AlignSCEV = SE->getSCEV(AlignOB.Inputs[1].get());
+ AlignSCEV = SE->getTruncateOrZeroExtend(AlignSCEV, Int64Ty);
+ if (AlignOB.Inputs.size() == 3)
+ OffSCEV = SE->getSCEV(AlignOB.Inputs[2].get());
+ else
OffSCEV = SE->getZero(Int64Ty);
- } else if (const SCEVAddExpr* AndLHSAddSCEV =
- dyn_cast<SCEVAddExpr>(AndLHSSCEV)) {
- // Try to find the ptrtoint; subtract it and the rest is the offset.
- for (SCEVAddExpr::op_iterator J = AndLHSAddSCEV->op_begin(),
- JE = AndLHSAddSCEV->op_end(); J != JE; ++J)
- if (const SCEVUnknown *OpUnk = dyn_cast<SCEVUnknown>(*J))
- if (PtrToIntInst *PToI = dyn_cast<PtrToIntInst>(OpUnk->getValue())) {
- AAPtr = PToI->getPointerOperand();
- OffSCEV = SE->getMinusSCEV(AndLHSAddSCEV, *J);
- break;
- }
- }
-
- if (!AAPtr)
- return false;
-
- // Sign extend the offset to 64 bits (so that it is like all of the other
- // expressions).
- unsigned OffSCEVBits = OffSCEV->getType()->getPrimitiveSizeInBits();
- if (OffSCEVBits < 64)
- OffSCEV = SE->getSignExtendExpr(OffSCEV, Int64Ty);
- else if (OffSCEVBits > 64)
- return false;
-
- AAPtr = AAPtr->stripPointerCasts();
+ OffSCEV = SE->getTruncateOrZeroExtend(OffSCEV, Int64Ty);
return true;
}
-bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
+bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall,
+ unsigned Idx) {
Value *AAPtr;
const SCEV *AlignSCEV, *OffSCEV;
- if (!extractAlignmentInfo(ACall, AAPtr, AlignSCEV, OffSCEV))
+ if (!extractAlignmentInfo(ACall, Idx, AAPtr, AlignSCEV, OffSCEV))
return false;
// Skip ConstantPointerNull and UndefValue. Assumptions on these shouldn't
@@ -310,35 +248,38 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
continue;
if (Instruction *K = dyn_cast<Instruction>(J))
- if (isValidAssumeForContext(ACall, K, DT))
WorkList.push_back(K);
}
while (!WorkList.empty()) {
Instruction *J = WorkList.pop_back_val();
-
if (LoadInst *LI = dyn_cast<LoadInst>(J)) {
- unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- LI->getPointerOperand(), SE);
-
- if (NewAlignment > LI->getAlignment()) {
- LI->setAlignment(MaybeAlign(NewAlignment));
+ if (!isValidAssumeForContext(ACall, J, DT))
+ continue;
+ Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ LI->getPointerOperand(), SE);
+ if (NewAlignment > LI->getAlign()) {
+ LI->setAlignment(NewAlignment);
++NumLoadAlignChanged;
}
} else if (StoreInst *SI = dyn_cast<StoreInst>(J)) {
- unsigned NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- SI->getPointerOperand(), SE);
-
- if (NewAlignment > SI->getAlignment()) {
- SI->setAlignment(MaybeAlign(NewAlignment));
+ if (!isValidAssumeForContext(ACall, J, DT))
+ continue;
+ Align NewAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
+ SI->getPointerOperand(), SE);
+ if (NewAlignment > SI->getAlign()) {
+ SI->setAlignment(NewAlignment);
++NumStoreAlignChanged;
}
} else if (MemIntrinsic *MI = dyn_cast<MemIntrinsic>(J)) {
- unsigned NewDestAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- MI->getDest(), SE);
-
- LLVM_DEBUG(dbgs() << "\tmem inst: " << NewDestAlignment << "\n";);
- if (NewDestAlignment > MI->getDestAlignment()) {
+ if (!isValidAssumeForContext(ACall, J, DT))
+ continue;
+ Align NewDestAlignment =
+ getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MI->getDest(), SE);
+
+ LLVM_DEBUG(dbgs() << "\tmem inst: " << DebugStr(NewDestAlignment)
+ << "\n";);
+ if (NewDestAlignment > *MI->getDestAlign()) {
MI->setDestAlignment(NewDestAlignment);
++NumMemIntAlignChanged;
}
@@ -346,12 +287,13 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
// For memory transfers, there is also a source alignment that
// can be set.
if (MemTransferInst *MTI = dyn_cast<MemTransferInst>(MI)) {
- unsigned NewSrcAlignment = getNewAlignment(AASCEV, AlignSCEV, OffSCEV,
- MTI->getSource(), SE);
+ Align NewSrcAlignment =
+ getNewAlignment(AASCEV, AlignSCEV, OffSCEV, MTI->getSource(), SE);
- LLVM_DEBUG(dbgs() << "\tmem trans: " << NewSrcAlignment << "\n";);
+ LLVM_DEBUG(dbgs() << "\tmem trans: " << DebugStr(NewSrcAlignment)
+ << "\n";);
- if (NewSrcAlignment > MTI->getSourceAlignment()) {
+ if (NewSrcAlignment > *MTI->getSourceAlign()) {
MTI->setSourceAlignment(NewSrcAlignment);
++NumMemIntAlignChanged;
}
@@ -363,7 +305,7 @@ bool AlignmentFromAssumptionsPass::processAssumption(CallInst *ACall) {
Visited.insert(J);
for (User *UJ : J->users()) {
Instruction *K = cast<Instruction>(UJ);
- if (!Visited.count(K) && isValidAssumeForContext(ACall, K, DT))
+ if (!Visited.count(K))
WorkList.push_back(K);
}
}
@@ -390,8 +332,11 @@ bool AlignmentFromAssumptionsPass::runImpl(Function &F, AssumptionCache &AC,
bool Changed = false;
for (auto &AssumeVH : AC.assumptions())
- if (AssumeVH)
- Changed |= processAssumption(cast<CallInst>(AssumeVH));
+ if (AssumeVH) {
+ CallInst *Call = cast<CallInst>(AssumeVH);
+ for (unsigned Idx = 0; Idx < Call->getNumOperandBundles(); Idx++)
+ Changed |= processAssumption(Call, Idx);
+ }
return Changed;
}
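
With this rewrite the pass no longer pattern-matches ptrtoint/and/icmp chains; alignment facts arrive as "align" operand bundles on llvm.assume, carrying the pointer, the alignment, and an optional offset. A minimal sketch of reading such a bundle from an assume call, mirroring extractAlignmentInfo but without the SCEV plumbing (the helper name is illustrative):

#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// Reads operand bundle Idx of an llvm.assume call of the form
//   call void @llvm.assume(i1 true) ["align"(i8* %p, i64 32, i64 %off)]
// and returns the pointer, filling Alignment/Offset from the other inputs.
static Value *readAlignBundle(CallInst *Assume, unsigned Idx,
                              Value *&Alignment, Value *&Offset) {
  OperandBundleUse OB = Assume->getOperandBundleAt(Idx);
  if (OB.getTagName() != "align" || OB.Inputs.size() < 2)
    return nullptr;
  Alignment = OB.Inputs[1].get();
  Offset = OB.Inputs.size() > 2 ? OB.Inputs[2].get() : nullptr;
  return OB.Inputs[0].get();
}
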
diff --git a/llvm/lib/Transforms/Scalar/BDCE.cpp b/llvm/lib/Transforms/Scalar/BDCE.cpp
index 0fa38fa80b17..767c7656dcfa 100644
--- a/llvm/lib/Transforms/Scalar/BDCE.cpp
+++ b/llvm/lib/Transforms/Scalar/BDCE.cpp
@@ -9,7 +9,8 @@
// This file implements the Bit-Tracking Dead Code Elimination pass. Some
// instructions (shifts, some ands, ors, etc.) kill some of their input bits.
// We track these dead bits and remove instructions that compute only these
-// dead bits.
+// dead bits. We also simplify sign extensions whose extension bits are never
+// used, converting them into zero extensions.
//
//===----------------------------------------------------------------------===//
@@ -19,6 +20,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DemandedBits.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/Instructions.h"
#include "llvm/InitializePasses.h"
@@ -33,6 +35,8 @@ using namespace llvm;
STATISTIC(NumRemoved, "Number of instructions removed (unused)");
STATISTIC(NumSimplified, "Number of instructions trivialized (dead bits)");
+STATISTIC(NumSExt2ZExt,
+ "Number of sign extension instructions converted to zero extension");
/// If an instruction is trivialized (dead), then the chain of users of that
/// instruction may need to be cleared of assumptions that can no longer be
@@ -102,13 +106,31 @@ static bool bitTrackingDCE(Function &F, DemandedBits &DB) {
(I.getType()->isIntOrIntVectorTy() &&
DB.getDemandedBits(&I).isNullValue() &&
wouldInstructionBeTriviallyDead(&I))) {
- salvageDebugInfoOrMarkUndef(I);
+ salvageDebugInfo(I);
Worklist.push_back(&I);
I.dropAllReferences();
Changed = true;
continue;
}
+ // Convert SExt into ZExt if none of the extension bits is required
+ if (SExtInst *SE = dyn_cast<SExtInst>(&I)) {
+ APInt Demanded = DB.getDemandedBits(SE);
+ const uint32_t SrcBitSize = SE->getSrcTy()->getScalarSizeInBits();
+ auto *const DstTy = SE->getDestTy();
+ const uint32_t DestBitSize = DstTy->getScalarSizeInBits();
+ if (Demanded.countLeadingZeros() >= (DestBitSize - SrcBitSize)) {
+ clearAssumptionsOfUsers(SE, DB);
+ IRBuilder<> Builder(SE);
+ I.replaceAllUsesWith(
+ Builder.CreateZExt(SE->getOperand(0), DstTy, SE->getName()));
+ Worklist.push_back(SE);
+ Changed = true;
+ NumSExt2ZExt++;
+ continue;
+ }
+ }
+
for (Use &U : I.operands()) {
// DemandedBits only detects dead integer uses.
if (!U->getType()->isIntOrIntVectorTy())
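
The new BDCE rule above rewrites a sign extension as a zero extension whenever none of the high (extension) bits is demanded, since the two differ only in those bits. A minimal sketch of just the legality check, assuming a DemandedBits result DB as in the pass:

#include "llvm/Analysis/DemandedBits.h"
#include "llvm/IR/Instructions.h"
using namespace llvm;

// True if SE can be rewritten as a zext. Example: for sext i8 -> i32 whose
// demanded bits are 0x000000ff, all 24 extension bits are dead, so sign and
// zero extension are indistinguishable to every user.
static bool sextExtensionBitsDead(SExtInst *SE, DemandedBits &DB) {
  APInt Demanded = DB.getDemandedBits(SE);
  unsigned SrcBits = SE->getSrcTy()->getScalarSizeInBits();
  unsigned DstBits = SE->getDestTy()->getScalarSizeInBits();
  return Demanded.countLeadingZeros() >= DstBits - SrcBits;
}
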
diff --git a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
index e34c011b1c87..b26bd1114bd4 100644
--- a/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
+++ b/llvm/lib/Transforms/Scalar/CallSiteSplitting.cpp
@@ -85,37 +85,36 @@ static cl::opt<unsigned>
"their cost is below DuplicationThreshold"),
cl::init(5));
-static void addNonNullAttribute(CallSite CS, Value *Op) {
+static void addNonNullAttribute(CallBase &CB, Value *Op) {
unsigned ArgNo = 0;
- for (auto &I : CS.args()) {
+ for (auto &I : CB.args()) {
if (&*I == Op)
- CS.addParamAttr(ArgNo, Attribute::NonNull);
+ CB.addParamAttr(ArgNo, Attribute::NonNull);
++ArgNo;
}
}
-static void setConstantInArgument(CallSite CS, Value *Op,
+static void setConstantInArgument(CallBase &CB, Value *Op,
Constant *ConstValue) {
unsigned ArgNo = 0;
- for (auto &I : CS.args()) {
+ for (auto &I : CB.args()) {
if (&*I == Op) {
// It is possible we have already added the non-null attribute to the
// parameter by using an earlier constraining condition.
- CS.removeParamAttr(ArgNo, Attribute::NonNull);
- CS.setArgument(ArgNo, ConstValue);
+ CB.removeParamAttr(ArgNo, Attribute::NonNull);
+ CB.setArgOperand(ArgNo, ConstValue);
}
++ArgNo;
}
}
-static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallSite CS) {
+static bool isCondRelevantToAnyCallArgument(ICmpInst *Cmp, CallBase &CB) {
assert(isa<Constant>(Cmp->getOperand(1)) && "Expected a constant operand.");
Value *Op0 = Cmp->getOperand(0);
unsigned ArgNo = 0;
- for (CallSite::arg_iterator I = CS.arg_begin(), E = CS.arg_end(); I != E;
- ++I, ++ArgNo) {
+ for (auto I = CB.arg_begin(), E = CB.arg_end(); I != E; ++I, ++ArgNo) {
// Don't consider constant or arguments that are already known non-null.
- if (isa<Constant>(*I) || CS.paramHasAttr(ArgNo, Attribute::NonNull))
+ if (isa<Constant>(*I) || CB.paramHasAttr(ArgNo, Attribute::NonNull))
continue;
if (*I == Op0)
@@ -128,8 +127,8 @@ typedef std::pair<ICmpInst *, unsigned> ConditionTy;
typedef SmallVector<ConditionTy, 2> ConditionsTy;
/// If From has a conditional jump to To, add the condition to Conditions,
-/// if it is relevant to any argument at CS.
-static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
+/// if it is relevant to any argument at CB.
+static void recordCondition(CallBase &CB, BasicBlock *From, BasicBlock *To,
ConditionsTy &Conditions) {
auto *BI = dyn_cast<BranchInst>(From->getTerminator());
if (!BI || !BI->isConditional())
@@ -142,38 +141,38 @@ static void recordCondition(CallSite CS, BasicBlock *From, BasicBlock *To,
ICmpInst *Cmp = cast<ICmpInst>(Cond);
if (Pred == ICmpInst::ICMP_EQ || Pred == ICmpInst::ICMP_NE)
- if (isCondRelevantToAnyCallArgument(Cmp, CS))
+ if (isCondRelevantToAnyCallArgument(Cmp, CB))
Conditions.push_back({Cmp, From->getTerminator()->getSuccessor(0) == To
? Pred
: Cmp->getInversePredicate()});
}
-/// Record ICmp conditions relevant to any argument in CS following Pred's
+/// Record ICmp conditions relevant to any argument in CB following Pred's
/// single predecessors. If there are conflicting conditions along a path, like
/// x == 1 and x == 0, the first condition will be used. We stop once we reach
/// an edge to StopAt.
-static void recordConditions(CallSite CS, BasicBlock *Pred,
+static void recordConditions(CallBase &CB, BasicBlock *Pred,
ConditionsTy &Conditions, BasicBlock *StopAt) {
BasicBlock *From = Pred;
BasicBlock *To = Pred;
SmallPtrSet<BasicBlock *, 4> Visited;
while (To != StopAt && !Visited.count(From->getSinglePredecessor()) &&
(From = From->getSinglePredecessor())) {
- recordCondition(CS, From, To, Conditions);
+ recordCondition(CB, From, To, Conditions);
Visited.insert(From);
To = From;
}
}
-static void addConditions(CallSite CS, const ConditionsTy &Conditions) {
+static void addConditions(CallBase &CB, const ConditionsTy &Conditions) {
for (auto &Cond : Conditions) {
Value *Arg = Cond.first->getOperand(0);
Constant *ConstVal = cast<Constant>(Cond.first->getOperand(1));
if (Cond.second == ICmpInst::ICMP_EQ)
- setConstantInArgument(CS, Arg, ConstVal);
+ setConstantInArgument(CB, Arg, ConstVal);
else if (ConstVal->getType()->isPointerTy() && ConstVal->isNullValue()) {
assert(Cond.second == ICmpInst::ICMP_NE);
- addNonNullAttribute(CS, Arg);
+ addNonNullAttribute(CB, Arg);
}
}
}
@@ -184,17 +183,16 @@ static SmallVector<BasicBlock *, 2> getTwoPredecessors(BasicBlock *BB) {
return Preds;
}
-static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
- if (CS.isConvergent() || CS.cannotDuplicate())
+static bool canSplitCallSite(CallBase &CB, TargetTransformInfo &TTI) {
+ if (CB.isConvergent() || CB.cannotDuplicate())
return false;
// FIXME: As of now we handle only CallInst. InvokeInst could be handled
// without too much effort.
- Instruction *Instr = CS.getInstruction();
- if (!isa<CallInst>(Instr))
+ if (!isa<CallInst>(CB))
return false;
- BasicBlock *CallSiteBB = Instr->getParent();
+ BasicBlock *CallSiteBB = CB.getParent();
// Need 2 predecessors and cannot split an edge from an IndirectBrInst.
SmallVector<BasicBlock *, 2> Preds(predecessors(CallSiteBB));
if (Preds.size() != 2 || isa<IndirectBrInst>(Preds[0]->getTerminator()) ||
@@ -212,7 +210,7 @@ static bool canSplitCallSite(CallSite CS, TargetTransformInfo &TTI) {
// corresponding uses will be updated.
unsigned Cost = 0;
for (auto &InstBeforeCall :
- llvm::make_range(CallSiteBB->begin(), Instr->getIterator())) {
+ llvm::make_range(CallSiteBB->begin(), CB.getIterator())) {
Cost += TTI.getInstructionCost(&InstBeforeCall,
TargetTransformInfo::TCK_CodeSize);
if (Cost >= DuplicationThreshold)
@@ -304,24 +302,23 @@ static void copyMustTailReturn(BasicBlock *SplitBB, Instruction *CI,
/// predecessors, new call-sites with more constrained arguments will be
/// created in createCallSitesOnPredicatedArgument().
static void splitCallSite(
- CallSite CS,
+ CallBase &CB,
const SmallVectorImpl<std::pair<BasicBlock *, ConditionsTy>> &Preds,
DomTreeUpdater &DTU) {
- Instruction *Instr = CS.getInstruction();
- BasicBlock *TailBB = Instr->getParent();
- bool IsMustTailCall = CS.isMustTailCall();
+ BasicBlock *TailBB = CB.getParent();
+ bool IsMustTailCall = CB.isMustTailCall();
PHINode *CallPN = nullptr;
// `musttail` calls must be followed by optional `bitcast`, and `ret`. The
// split blocks will be terminated right after that so there're no users for
// this phi in a `TailBB`.
- if (!IsMustTailCall && !Instr->use_empty()) {
- CallPN = PHINode::Create(Instr->getType(), Preds.size(), "phi.call");
- CallPN->setDebugLoc(Instr->getDebugLoc());
+ if (!IsMustTailCall && !CB.use_empty()) {
+ CallPN = PHINode::Create(CB.getType(), Preds.size(), "phi.call");
+ CallPN->setDebugLoc(CB.getDebugLoc());
}
- LLVM_DEBUG(dbgs() << "split call-site : " << *Instr << " into \n");
+ LLVM_DEBUG(dbgs() << "split call-site : " << CB << " into \n");
assert(Preds.size() == 2 && "The ValueToValueMaps array has size 2.");
// ValueToValueMapTy is neither copy nor moveable, so we use a simple array
@@ -330,21 +327,20 @@ static void splitCallSite(
for (unsigned i = 0; i < Preds.size(); i++) {
BasicBlock *PredBB = Preds[i].first;
BasicBlock *SplitBlock = DuplicateInstructionsInSplitBetween(
- TailBB, PredBB, &*std::next(Instr->getIterator()), ValueToValueMaps[i],
+ TailBB, PredBB, &*std::next(CB.getIterator()), ValueToValueMaps[i],
DTU);
assert(SplitBlock && "Unexpected new basic block split.");
- Instruction *NewCI =
- &*std::prev(SplitBlock->getTerminator()->getIterator());
- CallSite NewCS(NewCI);
- addConditions(NewCS, Preds[i].second);
+ auto *NewCI =
+ cast<CallBase>(&*std::prev(SplitBlock->getTerminator()->getIterator()));
+ addConditions(*NewCI, Preds[i].second);
// Handle PHIs used as arguments in the call-site.
for (PHINode &PN : TailBB->phis()) {
unsigned ArgNo = 0;
- for (auto &CI : CS.args()) {
+ for (auto &CI : CB.args()) {
if (&*CI == &PN) {
- NewCS.setArgument(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
+ NewCI->setArgOperand(ArgNo, PN.getIncomingValueForBlock(SplitBlock));
}
++ArgNo;
}
@@ -356,7 +352,7 @@ static void splitCallSite(
// Clone and place bitcast and return instructions before `TI`
if (IsMustTailCall)
- copyMustTailReturn(SplitBlock, Instr, NewCI);
+ copyMustTailReturn(SplitBlock, &CB, NewCI);
}
NumCallSiteSplit++;
@@ -383,7 +379,7 @@ static void splitCallSite(
// Replace users of the original call with a PHI merging the split call sites.
if (CallPN) {
CallPN->insertBefore(OriginalBegin);
- Instr->replaceAllUsesWith(CallPN);
+ CB.replaceAllUsesWith(CallPN);
}
// Remove instructions moved to split blocks from TailBB, from the duplicated
@@ -393,7 +389,7 @@ static void splitCallSite(
// instruction, so we do not end up deleting them. By using reverse-order, we
// do not introduce unnecessary PHI nodes for def-use chains from the call
// instruction to the beginning of the block.
- auto I = Instr->getReverseIterator();
+ auto I = CB.getReverseIterator();
while (I != TailBB->rend()) {
Instruction *CurrentI = &*I++;
if (!CurrentI->use_empty()) {
@@ -418,28 +414,25 @@ static void splitCallSite(
// Return true if the call-site has an argument which is a PHI with only
// constant incoming values.
-static bool isPredicatedOnPHI(CallSite CS) {
- Instruction *Instr = CS.getInstruction();
- BasicBlock *Parent = Instr->getParent();
- if (Instr != Parent->getFirstNonPHIOrDbg())
+static bool isPredicatedOnPHI(CallBase &CB) {
+ BasicBlock *Parent = CB.getParent();
+ if (&CB != Parent->getFirstNonPHIOrDbg())
return false;
- for (auto &BI : *Parent) {
- if (PHINode *PN = dyn_cast<PHINode>(&BI)) {
- for (auto &I : CS.args())
- if (&*I == PN) {
- assert(PN->getNumIncomingValues() == 2 &&
- "Unexpected number of incoming values");
- if (PN->getIncomingBlock(0) == PN->getIncomingBlock(1))
- return false;
- if (PN->getIncomingValue(0) == PN->getIncomingValue(1))
- continue;
- if (isa<Constant>(PN->getIncomingValue(0)) &&
- isa<Constant>(PN->getIncomingValue(1)))
- return true;
- }
+ for (auto &PN : Parent->phis()) {
+ for (auto &Arg : CB.args()) {
+ if (&*Arg != &PN)
+ continue;
+ assert(PN.getNumIncomingValues() == 2 &&
+ "Unexpected number of incoming values");
+ if (PN.getIncomingBlock(0) == PN.getIncomingBlock(1))
+ return false;
+ if (PN.getIncomingValue(0) == PN.getIncomingValue(1))
+ continue;
+ if (isa<Constant>(PN.getIncomingValue(0)) &&
+ isa<Constant>(PN.getIncomingValue(1)))
+ return true;
}
- break;
}
return false;
}
@@ -448,20 +441,20 @@ using PredsWithCondsTy = SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2>;
// Check if any of the arguments in CB are predicated on a PHI node and return
// the set of predecessors we should use for splitting.
-static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallSite CS) {
- if (!isPredicatedOnPHI(CS))
+static PredsWithCondsTy shouldSplitOnPHIPredicatedArgument(CallBase &CB) {
+ if (!isPredicatedOnPHI(CB))
return {};
- auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
+ auto Preds = getTwoPredecessors(CB.getParent());
return {{Preds[0], {}}, {Preds[1], {}}};
}
// Checks if any of the arguments in CB are predicated in a predecessor and
// returns a list of predecessors with the conditions that hold on their edges
// to CB.
-static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallSite CS,
+static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallBase &CB,
DomTreeUpdater &DTU) {
- auto Preds = getTwoPredecessors(CS.getInstruction()->getParent());
+ auto Preds = getTwoPredecessors(CB.getParent());
if (Preds[0] == Preds[1])
return {};
@@ -470,16 +463,16 @@ static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallSite CS,
// that node will be the same for all paths to the call site and splitting
// is not beneficial.
assert(DTU.hasDomTree() && "We need a DTU with a valid DT!");
- auto *CSDTNode = DTU.getDomTree().getNode(CS.getInstruction()->getParent());
+ auto *CSDTNode = DTU.getDomTree().getNode(CB.getParent());
BasicBlock *StopAt = CSDTNode ? CSDTNode->getIDom()->getBlock() : nullptr;
SmallVector<std::pair<BasicBlock *, ConditionsTy>, 2> PredsCS;
for (auto *Pred : make_range(Preds.rbegin(), Preds.rend())) {
ConditionsTy Conditions;
// Record condition on edge BB(CS) <- Pred
- recordCondition(CS, Pred, CS.getInstruction()->getParent(), Conditions);
+ recordCondition(CB, Pred, CB.getParent(), Conditions);
// Record conditions following Pred's single predecessors.
- recordConditions(CS, Pred, Conditions, StopAt);
+ recordConditions(CB, Pred, Conditions, StopAt);
PredsCS.push_back({Pred, Conditions});
}
@@ -491,19 +484,19 @@ static PredsWithCondsTy shouldSplitOnPredicatedArgument(CallSite CS,
return PredsCS;
}
-static bool tryToSplitCallSite(CallSite CS, TargetTransformInfo &TTI,
+static bool tryToSplitCallSite(CallBase &CB, TargetTransformInfo &TTI,
DomTreeUpdater &DTU) {
// Check if we can split the call site.
- if (!CS.arg_size() || !canSplitCallSite(CS, TTI))
+ if (!CB.arg_size() || !canSplitCallSite(CB, TTI))
return false;
- auto PredsWithConds = shouldSplitOnPredicatedArgument(CS, DTU);
+ auto PredsWithConds = shouldSplitOnPredicatedArgument(CB, DTU);
if (PredsWithConds.empty())
- PredsWithConds = shouldSplitOnPHIPredicatedArgument(CS);
+ PredsWithConds = shouldSplitOnPHIPredicatedArgument(CB);
if (PredsWithConds.empty())
return false;
- splitCallSite(CS, PredsWithConds, DTU);
+ splitCallSite(CB, PredsWithConds, DTU);
return true;
}
@@ -521,20 +514,19 @@ static bool doCallSiteSplitting(Function &F, TargetLibraryInfo &TLI,
// case, IE will be invalidated and we also have to check the current
// terminator.
while (II != IE && &*II != BB.getTerminator()) {
- Instruction *I = &*II++;
- CallSite CS(cast<Value>(I));
- if (!CS || isa<IntrinsicInst>(I) || isInstructionTriviallyDead(I, &TLI))
+ CallBase *CB = dyn_cast<CallBase>(&*II++);
+ if (!CB || isa<IntrinsicInst>(CB) || isInstructionTriviallyDead(CB, &TLI))
continue;
- Function *Callee = CS.getCalledFunction();
+ Function *Callee = CB->getCalledFunction();
if (!Callee || Callee->isDeclaration())
continue;
// Successful musttail call-site splits result in erased CI and erased BB.
// Check if such path is possible before attempting the splitting.
- bool IsMustTail = CS.isMustTailCall();
+ bool IsMustTail = CB->isMustTailCall();
- Changed |= tryToSplitCallSite(CS, TTI, DTU);
+ Changed |= tryToSplitCallSite(*CB, TTI, DTU);
// There're no interesting instructions after this. The call site
// itself might have been erased on splitting.
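
After the switch to CallBase, the helpers above constrain the duplicated call sites directly through CallBase's argument interface: a pointer argument known non-null on one edge gets the nonnull attribute, and an equality-predicated argument is replaced by its constant. A minimal sketch of the two operations (helper names are illustrative):

#include "llvm/IR/Constant.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// Mark every occurrence of Op among CB's arguments as nonnull.
static void markArgNonNull(CallBase &CB, Value *Op) {
  for (unsigned ArgNo = 0; ArgNo < CB.arg_size(); ++ArgNo)
    if (CB.getArgOperand(ArgNo) == Op)
      CB.addParamAttr(ArgNo, Attribute::NonNull);
}

// Replace every occurrence of Op among CB's arguments with C, dropping any
// previously added nonnull attribute so it cannot contradict a null constant.
static void replaceArgWithConstant(CallBase &CB, Value *Op, Constant *C) {
  for (unsigned ArgNo = 0; ArgNo < CB.arg_size(); ++ArgNo)
    if (CB.getArgOperand(ArgNo) == Op) {
      CB.removeParamAttr(ArgNo, Attribute::NonNull);
      CB.setArgOperand(ArgNo, C);
    }
}
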
diff --git a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
index 5bfece010bec..7c14b69d658d 100644
--- a/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
+++ b/llvm/lib/Transforms/Scalar/ConstantHoisting.cpp
@@ -250,7 +250,7 @@ static void findBestInsertionSet(DominatorTree &DT, BlockFrequencyInfo &BFI,
Orders.push_back(Entry);
while (Idx != Orders.size()) {
BasicBlock *Node = Orders[Idx++];
- for (auto ChildDomNode : DT.getNode(Node)->getChildren()) {
+ for (auto ChildDomNode : DT.getNode(Node)->children()) {
if (Candidates.count(ChildDomNode->getBlock()))
Orders.push_back(ChildDomNode->getBlock());
}
@@ -363,10 +363,12 @@ void ConstantHoistingPass::collectConstantCandidates(
// instruction and operand index.
if (auto IntrInst = dyn_cast<IntrinsicInst>(Inst))
Cost = TTI->getIntImmCostIntrin(IntrInst->getIntrinsicID(), Idx,
- ConstInt->getValue(), ConstInt->getType());
+ ConstInt->getValue(), ConstInt->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
else
Cost = TTI->getIntImmCostInst(Inst->getOpcode(), Idx, ConstInt->getValue(),
- ConstInt->getType());
+ ConstInt->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
// Ignore cheap integer constants.
if (Cost > TargetTransformInfo::TCC_Basic) {
@@ -416,7 +418,8 @@ void ConstantHoistingPass::collectConstantCandidates(
// usually lowered to a load from constant pool. Such operation is unlikely
// to be cheaper than compute it by <Base + Offset>, which can be lowered to
// an ADD instruction or folded into Load/Store instruction.
- int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy);
+ int Cost = TTI->getIntImmCostInst(Instruction::Add, 1, Offset, PtrIntTy,
+ TargetTransformInfo::TCK_SizeAndLatency);
ConstCandVecType &ExprCandVec = ConstGEPCandMap[BaseGV];
ConstCandMapType::iterator Itr;
bool Inserted;
@@ -491,7 +494,7 @@ void ConstantHoistingPass::collectConstantCandidates(
// take constant variables is lower than `TargetTransformInfo::TCC_Basic`.
// So it's safe for us to collect constant candidates from all
// IntrinsicInsts.
- if (canReplaceOperandWithVariable(Inst, Idx) || isa<IntrinsicInst>(Inst)) {
+ if (canReplaceOperandWithVariable(Inst, Idx)) {
collectConstantCandidates(ConstCandMap, Inst, Idx);
}
} // end of for all operands
@@ -582,7 +585,8 @@ ConstantHoistingPass::maximizeConstantsInRange(ConstCandVecType::iterator S,
for (auto User : ConstCand->Uses) {
unsigned Opcode = User.Inst->getOpcode();
unsigned OpndIdx = User.OpndIdx;
- Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty);
+ Cost += TTI->getIntImmCostInst(Opcode, OpndIdx, Value, Ty,
+ TargetTransformInfo::TCK_SizeAndLatency);
LLVM_DEBUG(dbgs() << "Cost: " << Cost << "\n");
for (auto C2 = S; C2 != E; ++C2) {
@@ -975,8 +979,8 @@ PreservedAnalyses ConstantHoistingPass::run(Function &F,
auto BFI = ConstHoistWithBlockFrequency
? &AM.getResult<BlockFrequencyAnalysis>(F)
: nullptr;
- auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
- auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
if (!runImpl(F, TTI, DT, BFI, F.getEntryBlock(), PSI))
return PreservedAnalyses::all();
diff --git a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
index 3435bc7f5eaa..cd2f4ca36f3b 100644
--- a/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
+++ b/llvm/lib/Transforms/Scalar/CorrelatedValuePropagation.cpp
@@ -22,7 +22,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -125,7 +124,7 @@ Pass *llvm::createCorrelatedValuePropagationPass() {
static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
if (S->getType()->isVectorTy()) return false;
- if (isa<Constant>(S->getOperand(0))) return false;
+ if (isa<Constant>(S->getCondition())) return false;
Constant *C = LVI->getConstant(S->getCondition(), S->getParent(), S);
if (!C) return false;
@@ -133,11 +132,7 @@ static bool processSelect(SelectInst *S, LazyValueInfo *LVI) {
ConstantInt *CI = dyn_cast<ConstantInt>(C);
if (!CI) return false;
- Value *ReplaceWith = S->getTrueValue();
- Value *Other = S->getFalseValue();
- if (!CI->isOne()) std::swap(ReplaceWith, Other);
- if (ReplaceWith == S) ReplaceWith = UndefValue::get(S->getType());
-
+ Value *ReplaceWith = CI->isOne() ? S->getTrueValue() : S->getFalseValue();
S->replaceAllUsesWith(ReplaceWith);
S->eraseFromParent();
@@ -310,9 +305,10 @@ static bool processCmp(CmpInst *Cmp, LazyValueInfo *LVI) {
// the comparison is testing local values. While LVI can sometimes reason
// about such cases, it's not its primary purpose. We do make sure to do
// the block local query for uses from terminator instructions, but that's
- // handled in the code for each terminator.
+ // handled in the code for each terminator. As an exception, we allow phi
+ // nodes, for which LVI can thread the condition into predecessors.
auto *I = dyn_cast<Instruction>(Op0);
- if (I && I->getParent() == Cmp->getParent())
+ if (I && I->getParent() == Cmp->getParent() && !isa<PHINode>(I))
return false;
LazyValueInfo::Tristate Result =
@@ -535,18 +531,18 @@ static void processSaturatingInst(SaturatingInst *SI, LazyValueInfo *LVI) {
}
/// Infer nonnull attributes for the arguments at the specified callsite.
-static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
+static bool processCallSite(CallBase &CB, LazyValueInfo *LVI) {
SmallVector<unsigned, 4> ArgNos;
unsigned ArgNo = 0;
- if (auto *WO = dyn_cast<WithOverflowInst>(CS.getInstruction())) {
+ if (auto *WO = dyn_cast<WithOverflowInst>(&CB)) {
if (WO->getLHS()->getType()->isIntegerTy() && willNotOverflow(WO, LVI)) {
processOverflowIntrinsic(WO, LVI);
return true;
}
}
- if (auto *SI = dyn_cast<SaturatingInst>(CS.getInstruction())) {
+ if (auto *SI = dyn_cast<SaturatingInst>(&CB)) {
if (SI->getType()->isIntegerTy() && willNotOverflow(SI, LVI)) {
processSaturatingInst(SI, LVI);
return true;
@@ -559,8 +555,8 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
// desirable since it may allow further optimization of that value (e.g. via
// single use rules in instcombine). Since deopt uses tend to,
// idiomatically, appear along rare conditional paths, it's reasonably likely
- // we may have a conditional fact with which LVI can fold.
- if (auto DeoptBundle = CS.getOperandBundle(LLVMContext::OB_deopt)) {
+ // we may have a conditional fact with which LVI can fold.
+ if (auto DeoptBundle = CB.getOperandBundle(LLVMContext::OB_deopt)) {
bool Progress = false;
for (const Use &ConstU : DeoptBundle->Inputs) {
Use &U = const_cast<Use&>(ConstU);
@@ -568,7 +564,7 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
if (V->getType()->isVectorTy()) continue;
if (isa<Constant>(V)) continue;
- Constant *C = LVI->getConstant(V, CS.getParent(), CS.getInstruction());
+ Constant *C = LVI->getConstant(V, CB.getParent(), &CB);
if (!C) continue;
U.set(C);
Progress = true;
@@ -577,30 +573,30 @@ static bool processCallSite(CallSite CS, LazyValueInfo *LVI) {
return true;
}
- for (Value *V : CS.args()) {
+ for (Value *V : CB.args()) {
PointerType *Type = dyn_cast<PointerType>(V->getType());
// Try to mark pointer typed parameters as non-null. We skip the
// relatively expensive analysis for constants which are obviously either
// null or non-null to start with.
- if (Type && !CS.paramHasAttr(ArgNo, Attribute::NonNull) &&
+ if (Type && !CB.paramHasAttr(ArgNo, Attribute::NonNull) &&
!isa<Constant>(V) &&
LVI->getPredicateAt(ICmpInst::ICMP_EQ, V,
ConstantPointerNull::get(Type),
- CS.getInstruction()) == LazyValueInfo::False)
+ &CB) == LazyValueInfo::False)
ArgNos.push_back(ArgNo);
ArgNo++;
}
- assert(ArgNo == CS.arg_size() && "sanity check");
+ assert(ArgNo == CB.arg_size() && "sanity check");
if (ArgNos.empty())
return false;
- AttributeList AS = CS.getAttributes();
- LLVMContext &Ctx = CS.getInstruction()->getContext();
+ AttributeList AS = CB.getAttributes();
+ LLVMContext &Ctx = CB.getContext();
AS = AS.addParamAttribute(Ctx, ArgNos,
Attribute::get(Ctx, Attribute::NonNull));
- CS.setAttributes(AS);
+ CB.setAttributes(AS);
return true;
}
@@ -793,7 +789,10 @@ static bool processAnd(BinaryOperator *BinOp, LazyValueInfo *LVI) {
if (!RHS || !RHS->getValue().isMask())
return false;
- ConstantRange LRange = LVI->getConstantRange(LHS, BB, BinOp);
+ // We can only replace the AND with LHS based on range info if the range does
+ // not include undef.
+ ConstantRange LRange =
+ LVI->getConstantRange(LHS, BB, BinOp, /*UndefAllowed=*/false);
if (!LRange.getUnsignedMax().ule(RHS->getValue()))
return false;
@@ -856,7 +855,7 @@ static bool runImpl(Function &F, LazyValueInfo *LVI, DominatorTree *DT,
break;
case Instruction::Call:
case Instruction::Invoke:
- BBChanged |= processCallSite(CallSite(II), LVI);
+ BBChanged |= processCallSite(cast<CallBase>(*II), LVI);
break;
case Instruction::SRem:
BBChanged |= processSRem(cast<BinaryOperator>(II), LVI);
diff --git a/llvm/lib/Transforms/Scalar/DCE.cpp b/llvm/lib/Transforms/Scalar/DCE.cpp
index a4b0c8df98f6..28947482e303 100644
--- a/llvm/lib/Transforms/Scalar/DCE.cpp
+++ b/llvm/lib/Transforms/Scalar/DCE.cpp
@@ -25,6 +25,7 @@
#include "llvm/Pass.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
@@ -127,6 +128,7 @@ static bool DCEInstruction(Instruction *I,
return false;
salvageDebugInfo(*I);
+ salvageKnowledge(I);
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
diff --git a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
index 1ba4aab999e1..e58db03225ee 100644
--- a/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/DeadStoreElimination.cpp
@@ -18,6 +18,7 @@
#include "llvm/ADT/APInt.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
@@ -29,17 +30,19 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -48,16 +51,19 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PassManager.h"
+#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
#include <algorithm>
#include <cassert>
@@ -68,14 +74,23 @@
#include <utility>
using namespace llvm;
+using namespace PatternMatch;
#define DEBUG_TYPE "dse"
+STATISTIC(NumRemainingStores, "Number of stores remaining after DSE");
STATISTIC(NumRedundantStores, "Number of redundant stores deleted");
STATISTIC(NumFastStores, "Number of stores deleted");
STATISTIC(NumFastOther, "Number of other instrs removed");
STATISTIC(NumCompletePartials, "Number of stores dead by later partials");
STATISTIC(NumModifiedStores, "Number of stores modified");
+STATISTIC(NumNoopStores, "Number of noop stores deleted");
+STATISTIC(NumCFGChecks, "Number of blocks examined during CFG path checks");
+STATISTIC(NumCFGTries, "Number of CFG path checks attempted");
+STATISTIC(NumCFGSuccess, "Number of CFG path checks that succeeded");
+
+DEBUG_COUNTER(MemorySSACounter, "dse-memoryssa",
+ "Controls which MemoryDefs are eliminated.");
static cl::opt<bool>
EnablePartialOverwriteTracking("enable-dse-partial-overwrite-tracking",
@@ -87,6 +102,25 @@ EnablePartialStoreMerging("enable-dse-partial-store-merging",
cl::init(true), cl::Hidden,
cl::desc("Enable partial store merging in DSE"));
+static cl::opt<bool>
+ EnableMemorySSA("enable-dse-memoryssa", cl::init(false), cl::Hidden,
+ cl::desc("Use the new MemorySSA-backed DSE."));
+
+static cl::opt<unsigned>
+ MemorySSAScanLimit("dse-memoryssa-scanlimit", cl::init(100), cl::Hidden,
+ cl::desc("The number of memory instructions to scan for "
+ "dead store elimination (default = 100)"));
+
+static cl::opt<unsigned> MemorySSADefsPerBlockLimit(
+ "dse-memoryssa-defs-per-block-limit", cl::init(5000), cl::Hidden,
+ cl::desc("The number of MemoryDefs we consider as candidates to eliminate "
+ "other stores per basic block (default = 5000)"));
+
+static cl::opt<unsigned> MemorySSAPathCheckLimit(
+ "dse-memoryssa-path-check-limit", cl::init(50), cl::Hidden,
+ cl::desc("The maximum number of blocks to check when trying to prove that "
+ "all paths to an exit go through a killing block (default = 50)"));
+
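As a usage note (assuming an opt binary built with this patch), the MemorySSA-backed implementation is opt-in and can be exercised with the flags declared above, e.g. opt -dse -enable-dse-memoryssa -S in.ll; the scan, defs-per-block and path-check limits keep their defaults of 100, 5000 and 50 unless overridden via -dse-memoryssa-scanlimit, -dse-memoryssa-defs-per-block-limit and -dse-memoryssa-path-check-limit.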
//===----------------------------------------------------------------------===//
// Helper functions
//===----------------------------------------------------------------------===//
@@ -100,7 +134,7 @@ using InstOverlapIntervalsTy = DenseMap<Instruction *, OverlapIntervalsTy>;
static void
deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
MemoryDependenceResults &MD, const TargetLibraryInfo &TLI,
- InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB,
+ InstOverlapIntervalsTy &IOL,
MapVector<Instruction *, bool> &ThrowableInst,
SmallSetVector<const Value *, 16> *ValueSet = nullptr) {
SmallVector<Instruction*, 32> NowDeadInsts;
@@ -123,6 +157,7 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
// Try to preserve debug information attached to the dead instruction.
salvageDebugInfo(*DeadInst);
+ salvageKnowledge(DeadInst);
// This instruction is dead, zap it, in stages. Start by removing it from
// MemDep, which needs to know the operands and needs it to be in the
@@ -143,7 +178,6 @@ deleteDeadInstruction(Instruction *I, BasicBlock::iterator *BBI,
if (ValueSet) ValueSet->remove(DeadInst);
IOL.erase(DeadInst);
- OBB.eraseInstruction(DeadInst);
if (NewIter == DeadInst->getIterator())
NewIter = DeadInst->eraseFromParent();
@@ -177,19 +211,17 @@ static bool hasAnalyzableMemoryWrite(Instruction *I,
return true;
}
}
- if (auto CS = CallSite(I)) {
- if (Function *F = CS.getCalledFunction()) {
- LibFunc LF;
- if (TLI.getLibFunc(*F, LF) && TLI.has(LF)) {
- switch (LF) {
- case LibFunc_strcpy:
- case LibFunc_strncpy:
- case LibFunc_strcat:
- case LibFunc_strncat:
- return true;
- default:
- return false;
- }
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ LibFunc LF;
+ if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ return true;
+ default:
+ return false;
}
}
}
@@ -222,10 +254,10 @@ static MemoryLocation getLocForWrite(Instruction *Inst) {
}
}
}
- if (auto CS = CallSite(Inst))
+ if (auto *CB = dyn_cast<CallBase>(Inst))
// All the supported TLI functions so far happen to have dest as their
// first argument.
- return MemoryLocation(CS.getArgument(0));
+ return MemoryLocation(CB->getArgOperand(0));
return MemoryLocation();
}
@@ -272,8 +304,8 @@ static bool isRemovable(Instruction *I) {
}
// note: only get here for calls with analyzable writes - i.e. libcalls
- if (auto CS = CallSite(I))
- return CS.getInstruction()->use_empty();
+ if (auto *CB = dyn_cast<CallBase>(I))
+ return CB->use_empty();
return false;
}
@@ -597,51 +629,82 @@ static bool isPossibleSelfRead(Instruction *Inst,
/// instruction.
static bool memoryIsNotModifiedBetween(Instruction *FirstI,
Instruction *SecondI,
- AliasAnalysis *AA) {
- SmallVector<BasicBlock *, 16> WorkList;
- SmallPtrSet<BasicBlock *, 8> Visited;
+ AliasAnalysis *AA,
+ const DataLayout &DL,
+ DominatorTree *DT) {
+ // Do a backwards scan through the CFG from SecondI to FirstI. Look for
+ // instructions which can modify the memory location accessed by SecondI.
+ //
+ // While doing the walk keep track of the address to check. It might be
+ // different in different basic blocks due to PHI translation.
+ using BlockAddressPair = std::pair<BasicBlock *, PHITransAddr>;
+ SmallVector<BlockAddressPair, 16> WorkList;
+ // Keep track of the address we visited each block with. Bail out if we
+ // visit a block with different addresses.
+ DenseMap<BasicBlock *, Value *> Visited;
+
BasicBlock::iterator FirstBBI(FirstI);
++FirstBBI;
BasicBlock::iterator SecondBBI(SecondI);
BasicBlock *FirstBB = FirstI->getParent();
BasicBlock *SecondBB = SecondI->getParent();
MemoryLocation MemLoc = MemoryLocation::get(SecondI);
+ auto *MemLocPtr = const_cast<Value *>(MemLoc.Ptr);
- // Start checking the store-block.
- WorkList.push_back(SecondBB);
+ // Start checking the SecondBB.
+ WorkList.push_back(
+ std::make_pair(SecondBB, PHITransAddr(MemLocPtr, DL, nullptr)));
bool isFirstBlock = true;
- // Check all blocks going backward until we reach the load-block.
+ // Check all blocks going backward until we reach the FirstBB.
while (!WorkList.empty()) {
- BasicBlock *B = WorkList.pop_back_val();
+ BlockAddressPair Current = WorkList.pop_back_val();
+ BasicBlock *B = Current.first;
+ PHITransAddr &Addr = Current.second;
+ Value *Ptr = Addr.getAddr();
- // Ignore instructions before LI if this is the FirstBB.
+ // Ignore instructions before FirstI if this is the FirstBB.
BasicBlock::iterator BI = (B == FirstBB ? FirstBBI : B->begin());
BasicBlock::iterator EI;
if (isFirstBlock) {
- // Ignore instructions after SI if this is the first visit of SecondBB.
+ // Ignore instructions after SecondI if this is the first visit of SecondBB.
assert(B == SecondBB && "first block is not the store block");
EI = SecondBBI;
isFirstBlock = false;
} else {
// It's not SecondBB or (in case of a loop) the second visit of SecondBB.
- // In this case we also have to look at instructions after SI.
+ // In this case we also have to look at instructions after SecondI.
EI = B->end();
}
for (; BI != EI; ++BI) {
Instruction *I = &*BI;
if (I->mayWriteToMemory() && I != SecondI)
- if (isModSet(AA->getModRefInfo(I, MemLoc)))
+ if (isModSet(AA->getModRefInfo(I, MemLoc.getWithNewPtr(Ptr))))
return false;
}
if (B != FirstBB) {
assert(B != &FirstBB->getParent()->getEntryBlock() &&
"Should not hit the entry block because SI must be dominated by LI");
for (auto PredI = pred_begin(B), PE = pred_end(B); PredI != PE; ++PredI) {
- if (!Visited.insert(*PredI).second)
+ PHITransAddr PredAddr = Addr;
+ if (PredAddr.NeedsPHITranslationFromBlock(B)) {
+ if (!PredAddr.IsPotentiallyPHITranslatable())
+ return false;
+ if (PredAddr.PHITranslateValue(B, *PredI, DT, false))
+ return false;
+ }
+ Value *TranslatedPtr = PredAddr.getAddr();
+ auto Inserted = Visited.insert(std::make_pair(*PredI, TranslatedPtr));
+ if (!Inserted.second) {
+ // We already visited this block before. If it was with a different
+ // address - bail out!
+ if (TranslatedPtr != Inserted.first->second)
+ return false;
+ // ... otherwise just skip it.
continue;
- WorkList.push_back(*PredI);
+ }
+ WorkList.push_back(std::make_pair(*PredI, PredAddr));
}
}
}
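For example (hypothetical IR): when the backward walk leaves %merge through its predecessors, the queried address %p must be PHI-translated, to %a on the edge from %left and to %b on the edge from %right; revisiting a block with a different translated address aborts the walk, as handled above.

  merge:
    %p = phi i8* [ %a, %left ], [ %b, %right ]
    store i8 1, i8* %p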
@@ -669,7 +732,7 @@ static void findUnconditionalPreds(SmallVectorImpl<BasicBlock *> &Blocks,
static bool handleFree(CallInst *F, AliasAnalysis *AA,
MemoryDependenceResults *MD, DominatorTree *DT,
const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB,
+ InstOverlapIntervalsTy &IOL,
MapVector<Instruction *, bool> &ThrowableInst) {
bool MadeChange = false;
@@ -704,7 +767,7 @@ static bool handleFree(CallInst *F, AliasAnalysis *AA,
// DCE instructions only used to calculate that store.
BasicBlock::iterator BBI(Dependency);
- deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL, OBB,
+ deleteDeadInstruction(Dependency, &BBI, *MD, *TLI, IOL,
ThrowableInst);
++NumFastStores;
MadeChange = true;
@@ -762,7 +825,7 @@ static void removeAccessedObjects(const MemoryLocation &LoadedLoc,
static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
MemoryDependenceResults *MD,
const TargetLibraryInfo *TLI,
- InstOverlapIntervalsTy &IOL, OrderedBasicBlock &OBB,
+ InstOverlapIntervalsTy &IOL,
MapVector<Instruction *, bool> &ThrowableInst) {
bool MadeChange = false;
@@ -785,7 +848,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
// Treat byval or inalloca arguments the same, stores to them are dead at the
// end of the function.
for (Argument &AI : BB.getParent()->args())
- if (AI.hasByValOrInAllocaAttr())
+ if (AI.hasPassPointeeByValueAttr())
DeadStackObjects.insert(&AI);
const DataLayout &DL = BB.getModule()->getDataLayout();
@@ -824,7 +887,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
<< '\n');
// DCE instructions only used to calculate that store.
- deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst,
+ deleteDeadInstruction(Dead, &BBI, *MD, *TLI, IOL, ThrowableInst,
&DeadStackObjects);
++NumFastStores;
MadeChange = true;
@@ -836,7 +899,7 @@ static bool handleEndBlock(BasicBlock &BB, AliasAnalysis *AA,
if (isInstructionTriviallyDead(&*BBI, TLI)) {
LLVM_DEBUG(dbgs() << "DSE: Removing trivially dead instruction:\n DEAD: "
<< *&*BBI << '\n');
- deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst,
+ deleteDeadInstruction(&*BBI, &BBI, *MD, *TLI, IOL, ThrowableInst,
&DeadStackObjects);
++NumFastOther;
MadeChange = true;
@@ -1043,8 +1106,8 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
const DataLayout &DL,
const TargetLibraryInfo *TLI,
InstOverlapIntervalsTy &IOL,
- OrderedBasicBlock &OBB,
- MapVector<Instruction *, bool> &ThrowableInst) {
+ MapVector<Instruction *, bool> &ThrowableInst,
+ DominatorTree *DT) {
// Must be a store instruction.
StoreInst *SI = dyn_cast<StoreInst>(Inst);
if (!SI)
@@ -1054,13 +1117,14 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
// then the store can be removed.
if (LoadInst *DepLoad = dyn_cast<LoadInst>(SI->getValueOperand())) {
if (SI->getPointerOperand() == DepLoad->getPointerOperand() &&
- isRemovable(SI) && memoryIsNotModifiedBetween(DepLoad, SI, AA)) {
+ isRemovable(SI) &&
+ memoryIsNotModifiedBetween(DepLoad, SI, AA, DL, DT)) {
LLVM_DEBUG(
dbgs() << "DSE: Remove Store Of Load from same pointer:\n LOAD: "
<< *DepLoad << "\n STORE: " << *SI << '\n');
- deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst);
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
++NumRedundantStores;
return true;
}
@@ -1073,12 +1137,12 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
dyn_cast<Instruction>(GetUnderlyingObject(SI->getPointerOperand(), DL));
if (UnderlyingPointer && isCallocLikeFn(UnderlyingPointer, TLI) &&
- memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA)) {
+ memoryIsNotModifiedBetween(UnderlyingPointer, SI, AA, DL, DT)) {
LLVM_DEBUG(
dbgs() << "DSE: Remove null store to the calloc'ed object:\n DEAD: "
<< *Inst << "\n OBJECT: " << *UnderlyingPointer << '\n');
- deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, OBB, ThrowableInst);
+ deleteDeadInstruction(SI, &BBI, *MD, *TLI, IOL, ThrowableInst);
++NumRedundantStores;
return true;
}
@@ -1086,13 +1150,58 @@ static bool eliminateNoopStore(Instruction *Inst, BasicBlock::iterator &BBI,
return false;
}
+static Constant *
+tryToMergePartialOverlappingStores(StoreInst *Earlier, StoreInst *Later,
+ int64_t InstWriteOffset,
+ int64_t DepWriteOffset, const DataLayout &DL,
+ AliasAnalysis *AA, DominatorTree *DT) {
+
+ if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(Earlier->getValueOperand()->getType()) &&
+ Later && isa<ConstantInt>(Later->getValueOperand()) &&
+ DL.typeSizeEqualsStoreSize(Later->getValueOperand()->getType()) &&
+ memoryIsNotModifiedBetween(Earlier, Later, AA, DL, DT)) {
+ // If the store we find is:
+ // a) partially overwritten by the store to 'Loc'
+ // b) the later store is fully contained in the earlier one and
+ // c) they both have a constant value
+ // d) none of the two stores need padding
+ // Merge the two stores, replacing the earlier store's value with a
+ // merge of both values.
+ // TODO: Deal with other constant types (vectors, etc), and probably
+ // some mem intrinsics (if needed)
+
+ APInt EarlierValue =
+ cast<ConstantInt>(Earlier->getValueOperand())->getValue();
+ APInt LaterValue = cast<ConstantInt>(Later->getValueOperand())->getValue();
+ unsigned LaterBits = LaterValue.getBitWidth();
+ assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
+ LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
+
+ // Offset of the smaller store inside the larger store
+ unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
+ unsigned LShiftAmount = DL.isBigEndian() ? EarlierValue.getBitWidth() -
+ BitOffsetDiff - LaterBits
+ : BitOffsetDiff;
+ APInt Mask = APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
+ LShiftAmount + LaterBits);
+ // Clear the bits we'll be replacing, then OR with the smaller
+ // store, shifted appropriately.
+ APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
+ LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *Earlier
+ << "\n Later: " << *Later
+ << "\n Merged Value: " << Merged << '\n');
+ return ConstantInt::get(Earlier->getValueOperand()->getType(), Merged);
+ }
+ return nullptr;
+}
+
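As a worked example of the merge arithmetic above, a standalone sketch (values are assumed for illustration, not taken from the patch): an earlier i32 store of 0, partially overwritten one byte in by a later i8 store of 0xAB on a little-endian target, merges to 0xAB00.

  // Compile against LLVM's ADT headers; mirrors the shift/mask logic above.
  #include "llvm/ADT/APInt.h"
  #include <cassert>
  using llvm::APInt;

  int main() {
    APInt EarlierValue(32, 0);           // earlier store: i32 0
    APInt LaterValue(8, 0xAB);           // later store: i8 0xAB, one byte in
    unsigned LaterBits = LaterValue.getBitWidth();
    LaterValue = LaterValue.zext(32);

    unsigned BitOffsetDiff = 1 * 8;      // (InstWriteOffset - DepWriteOffset) * 8
    unsigned LShiftAmount = BitOffsetDiff;           // little-endian case
    APInt Mask = APInt::getBitsSet(32, LShiftAmount, LShiftAmount + LaterBits);
    APInt Merged = (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
    assert(Merged == 0xAB00 && "later byte lands in bits [8,16)");
    return 0;
  }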
static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
MemoryDependenceResults *MD, DominatorTree *DT,
const TargetLibraryInfo *TLI) {
const DataLayout &DL = BB.getModule()->getDataLayout();
bool MadeChange = false;
- OrderedBasicBlock OBB(&BB);
MapVector<Instruction *, bool> ThrowableInst;
// A map of interval maps representing partially-overwritten value parts.
@@ -1102,7 +1211,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
for (BasicBlock::iterator BBI = BB.begin(), BBE = BB.end(); BBI != BBE; ) {
// Handle 'free' calls specially.
if (CallInst *F = isFreeCall(&*BBI, TLI)) {
- MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, OBB, ThrowableInst);
+ MadeChange |= handleFree(F, AA, MD, DT, TLI, IOL, ThrowableInst);
// Increment BBI after handleFree has potentially deleted instructions.
// This ensures we maintain a valid iterator.
++BBI;
@@ -1121,14 +1230,14 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
continue;
// eliminateNoopStore will update in iterator, if necessary.
- if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL, OBB,
- ThrowableInst)) {
+ if (eliminateNoopStore(Inst, BBI, AA, MD, DL, TLI, IOL,
+ ThrowableInst, DT)) {
MadeChange = true;
continue;
}
// If we find something that writes memory, get its memory dependence.
- MemDepResult InstDep = MD->getDependency(Inst, &OBB);
+ MemDepResult InstDep = MD->getDependency(Inst);
// Ignore any store where we can't find a local dependence.
// FIXME: cross-block DSE would be fun. :)
@@ -1179,7 +1288,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// If the underlying object is a non-escaping memory allocation, any store
// to it is dead along the unwind edge. Otherwise, we need to preserve
// the store.
- if (LastThrowing && OBB.dominates(DepWrite, LastThrowing)) {
+ if (LastThrowing && DepWrite->comesBefore(LastThrowing)) {
const Value* Underlying = GetUnderlyingObject(DepLoc.Ptr, DL);
bool IsStoreDeadOnUnwind = isa<AllocaInst>(Underlying);
if (!IsStoreDeadOnUnwind) {
@@ -1210,13 +1319,13 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
<< "\n KILLER: " << *Inst << '\n');
// Delete the store and now-dead instructions that feed it.
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB,
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
ThrowableInst);
++NumFastStores;
MadeChange = true;
// We erased DepWrite; start over.
- InstDep = MD->getDependency(Inst, &OBB);
+ InstDep = MD->getDependency(Inst);
continue;
} else if ((OR == OW_End && isShortenableAtTheEnd(DepWrite)) ||
((OR == OW_Begin &&
@@ -1234,53 +1343,12 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
OR == OW_PartialEarlierWithFullLater) {
auto *Earlier = dyn_cast<StoreInst>(DepWrite);
auto *Later = dyn_cast<StoreInst>(Inst);
- if (Earlier && isa<ConstantInt>(Earlier->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(
- Earlier->getValueOperand()->getType()) &&
- Later && isa<ConstantInt>(Later->getValueOperand()) &&
- DL.typeSizeEqualsStoreSize(
- Later->getValueOperand()->getType()) &&
- memoryIsNotModifiedBetween(Earlier, Later, AA)) {
- // If the store we find is:
- // a) partially overwritten by the store to 'Loc'
- // b) the later store is fully contained in the earlier one and
- // c) they both have a constant value
- // d) none of the two stores need padding
- // Merge the two stores, replacing the earlier store's value with a
- // merge of both values.
- // TODO: Deal with other constant types (vectors, etc), and probably
- // some mem intrinsics (if needed)
-
- APInt EarlierValue =
- cast<ConstantInt>(Earlier->getValueOperand())->getValue();
- APInt LaterValue =
- cast<ConstantInt>(Later->getValueOperand())->getValue();
- unsigned LaterBits = LaterValue.getBitWidth();
- assert(EarlierValue.getBitWidth() > LaterValue.getBitWidth());
- LaterValue = LaterValue.zext(EarlierValue.getBitWidth());
-
- // Offset of the smaller store inside the larger store
- unsigned BitOffsetDiff = (InstWriteOffset - DepWriteOffset) * 8;
- unsigned LShiftAmount =
- DL.isBigEndian()
- ? EarlierValue.getBitWidth() - BitOffsetDiff - LaterBits
- : BitOffsetDiff;
- APInt Mask =
- APInt::getBitsSet(EarlierValue.getBitWidth(), LShiftAmount,
- LShiftAmount + LaterBits);
- // Clear the bits we'll be replacing, then OR with the smaller
- // store, shifted appropriately.
- APInt Merged =
- (EarlierValue & ~Mask) | (LaterValue << LShiftAmount);
- LLVM_DEBUG(dbgs() << "DSE: Merge Stores:\n Earlier: " << *DepWrite
- << "\n Later: " << *Inst
- << "\n Merged Value: " << Merged << '\n');
-
+ if (Constant *C = tryToMergePartialOverlappingStores(
+ Earlier, Later, InstWriteOffset, DepWriteOffset, DL, AA,
+ DT)) {
auto *SI = new StoreInst(
- ConstantInt::get(Earlier->getValueOperand()->getType(), Merged),
- Earlier->getPointerOperand(), false,
- MaybeAlign(Earlier->getAlignment()), Earlier->getOrdering(),
- Earlier->getSyncScopeID(), DepWrite);
+ C, Earlier->getPointerOperand(), false, Earlier->getAlign(),
+ Earlier->getOrdering(), Earlier->getSyncScopeID(), DepWrite);
unsigned MDToKeep[] = {LLVMContext::MD_dbg, LLVMContext::MD_tbaa,
LLVMContext::MD_alias_scope,
@@ -1289,13 +1357,10 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
SI->copyMetadata(*DepWrite, MDToKeep);
++NumModifiedStores;
- // Remove earlier, wider, store
- OBB.replaceInstruction(DepWrite, SI);
-
// Delete the old stores and now-dead instructions that feed them.
- deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL, OBB,
+ deleteDeadInstruction(Inst, &BBI, *MD, *TLI, IOL,
ThrowableInst);
- deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL, OBB,
+ deleteDeadInstruction(DepWrite, &BBI, *MD, *TLI, IOL,
ThrowableInst);
MadeChange = true;
@@ -1331,7 +1396,7 @@ static bool eliminateDeadStores(BasicBlock &BB, AliasAnalysis *AA,
// If this block ends in a return, unwind, or unreachable, all allocas are
// dead at its end, which means stores to them are also dead.
if (BB.getTerminator()->getNumSuccessors() == 0)
- MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, OBB, ThrowableInst);
+ MadeChange |= handleEndBlock(BB, AA, MD, TLI, IOL, ThrowableInst);
return MadeChange;
}
@@ -1349,22 +1414,913 @@ static bool eliminateDeadStores(Function &F, AliasAnalysis *AA,
return MadeChange;
}
+namespace {
+//=============================================================================
+// MemorySSA backed dead store elimination.
+//
+// The code below implements dead store elimination using MemorySSA. It uses
+// the following general approach: given a MemoryDef, walk upwards to find
+// clobbering MemoryDefs that may be killed by the starting def. Then check
+// that there are no uses that may read the location of the original MemoryDef
+// in between both MemoryDefs. A bit more concretely:
+//
+// For all MemoryDefs StartDef:
+// 1. Get the next dominating clobbering MemoryDef (DomAccess) by walking
+// upwards.
+// 2. Check that there are no reads between DomAccess and the StartDef by
+// checking all uses starting at DomAccess and walking until we see StartDef.
+// 3. For each found DomDef, check that:
+// 1. There are no barrier instructions between DomDef and StartDef (like
+// throws or stores with ordering constraints).
+// 2. StartDef is executed whenever DomDef is executed.
+// 3. StartDef completely overwrites DomDef.
+// 4. Erase DomDef from the function and MemorySSA.
+
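A minimal illustration of this walk (hypothetical IR, not from the patch): the first store is the DomAccess found for the second, no MemoryUse reads %p in between, and the second store completely overwrites it, so the first store is erased.

  define void @f(i32* %p) {
    store i32 0, i32* %p   ; dead: killed by the store below
    store i32 1, i32* %p
    ret void
  }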
+// Returns true if \p M is an intrinsic that does not read or write memory.
+bool isNoopIntrinsic(MemoryUseOrDef *M) {
+ if (const IntrinsicInst *II = dyn_cast<IntrinsicInst>(M->getMemoryInst())) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::lifetime_start:
+ case Intrinsic::lifetime_end:
+ case Intrinsic::invariant_end:
+ case Intrinsic::launder_invariant_group:
+ case Intrinsic::assume:
+ return true;
+ case Intrinsic::dbg_addr:
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_label:
+ case Intrinsic::dbg_value:
+ llvm_unreachable("Intrinsic should not be modeled in MemorySSA");
+ default:
+ return false;
+ }
+ }
+ return false;
+}
+
+// Check if we can ignore \p D for DSE.
+bool canSkipDef(MemoryDef *D, bool DefVisibleToCaller) {
+ Instruction *DI = D->getMemoryInst();
+ // Calls that only access inaccessible memory cannot read or write any memory
+ // locations we consider for elimination.
+ if (auto *CB = dyn_cast<CallBase>(DI))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return true;
+
+ // We can eliminate stores to locations not visible to the caller across
+ // throwing instructions.
+ if (DI->mayThrow() && !DefVisibleToCaller)
+ return true;
+
+ // We can remove the dead stores, irrespective of the fence and its ordering
+ // (release/acquire/seq_cst). Fences only constrain the ordering of
+ // already-visible stores; they do not make a store visible to other
+ // threads. So skipping over a fence does not change whether a store is
+ // dead.
+ if (isa<FenceInst>(DI))
+ return true;
+
+ // Skip intrinsics that do not really read or modify memory.
+ if (isNoopIntrinsic(D))
+ return true;
+
+ return false;
+}
+
+struct DSEState {
+ Function &F;
+ AliasAnalysis &AA;
+ MemorySSA &MSSA;
+ DominatorTree &DT;
+ PostDominatorTree &PDT;
+ const TargetLibraryInfo &TLI;
+
+ // All MemoryDefs that potentially could kill other MemDefs.
+ SmallVector<MemoryDef *, 64> MemDefs;
+ // Any that should be skipped as they are already deleted
+ SmallPtrSet<MemoryAccess *, 4> SkipStores;
+ // Keep track of all of the objects that are invisible to the caller before
+ // the function returns.
+ SmallPtrSet<const Value *, 16> InvisibleToCallerBeforeRet;
+ // Keep track of all of the objects that are invisible to the caller after
+ // the function returns.
+ SmallPtrSet<const Value *, 16> InvisibleToCallerAfterRet;
+ // Keep track of blocks with throwing instructions not modeled in MemorySSA.
+ SmallPtrSet<BasicBlock *, 16> ThrowingBlocks;
+ // Post-order numbers for each basic block. Used to figure out if memory
+ // accesses are executed before another access.
+ DenseMap<BasicBlock *, unsigned> PostOrderNumbers;
+
+ /// Keep track of instructions (partly) overlapping with killing MemoryDefs per
+ /// basic block.
+ DenseMap<BasicBlock *, InstOverlapIntervalsTy> IOLs;
+
+ DSEState(Function &F, AliasAnalysis &AA, MemorySSA &MSSA, DominatorTree &DT,
+ PostDominatorTree &PDT, const TargetLibraryInfo &TLI)
+ : F(F), AA(AA), MSSA(MSSA), DT(DT), PDT(PDT), TLI(TLI) {}
+
+ static DSEState get(Function &F, AliasAnalysis &AA, MemorySSA &MSSA,
+ DominatorTree &DT, PostDominatorTree &PDT,
+ const TargetLibraryInfo &TLI) {
+ DSEState State(F, AA, MSSA, DT, PDT, TLI);
+ // Collect blocks with throwing instructions not modeled in MemorySSA and
+ // alloc-like objects.
+ unsigned PO = 0;
+ for (BasicBlock *BB : post_order(&F)) {
+ State.PostOrderNumbers[BB] = PO++;
+ for (Instruction &I : *BB) {
+ MemoryAccess *MA = MSSA.getMemoryAccess(&I);
+ if (I.mayThrow() && !MA)
+ State.ThrowingBlocks.insert(I.getParent());
+
+ auto *MD = dyn_cast_or_null<MemoryDef>(MA);
+ if (MD && State.MemDefs.size() < MemorySSADefsPerBlockLimit &&
+ (State.getLocForWriteEx(&I) || State.isMemTerminatorInst(&I)))
+ State.MemDefs.push_back(MD);
+
+ // Track whether alloca and alloca-like objects are visible in the
+ // caller before and after the function returns. Alloca objects are
+ // invalid in the caller, so they are neither visible before nor after
+ // the function returns.
+ if (isa<AllocaInst>(&I)) {
+ State.InvisibleToCallerBeforeRet.insert(&I);
+ State.InvisibleToCallerAfterRet.insert(&I);
+ }
+
+ // For alloca-like objects we need to check if they are captured before
+ // the function returns and if the return might capture the object.
+ if (isAllocLikeFn(&I, &TLI)) {
+ bool CapturesBeforeRet = PointerMayBeCaptured(&I, false, true);
+ if (!CapturesBeforeRet) {
+ State.InvisibleToCallerBeforeRet.insert(&I);
+ if (!PointerMayBeCaptured(&I, true, false))
+ State.InvisibleToCallerAfterRet.insert(&I);
+ }
+ }
+ }
+ }
+
+ // Treat byval or inalloca arguments the same as Allocas, stores to them are
+ // dead at the end of the function.
+ for (Argument &AI : F.args())
+ if (AI.hasPassPointeeByValueAttr()) {
+ // For byval, the caller doesn't know the address of the allocation.
+ if (AI.hasByValAttr())
+ State.InvisibleToCallerBeforeRet.insert(&AI);
+ State.InvisibleToCallerAfterRet.insert(&AI);
+ }
+
+ return State;
+ }
+
+ Optional<MemoryLocation> getLocForWriteEx(Instruction *I) const {
+ if (!I->mayWriteToMemory())
+ return None;
+
+ if (auto *MTI = dyn_cast<AnyMemIntrinsic>(I))
+ return {MemoryLocation::getForDest(MTI)};
+
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ LibFunc LF;
+ if (TLI.getLibFunc(*CB, LF) && TLI.has(LF)) {
+ switch (LF) {
+ case LibFunc_strcpy:
+ case LibFunc_strncpy:
+ case LibFunc_strcat:
+ case LibFunc_strncat:
+ return {MemoryLocation(CB->getArgOperand(0))};
+ default:
+ break;
+ }
+ }
+ return None;
+ }
+
+ return MemoryLocation::getOrNone(I);
+ }
+
+ /// Returns true if \p Use completely overwrites \p DefLoc.
+ bool isCompleteOverwrite(MemoryLocation DefLoc, Instruction *UseInst) const {
+ // UseInst has a MemoryDef associated in MemorySSA. It's possible for a
+ // MemoryDef to not write to memory, e.g. a volatile load is modeled as a
+ // MemoryDef.
+ if (!UseInst->mayWriteToMemory())
+ return false;
+
+ if (auto *CB = dyn_cast<CallBase>(UseInst))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return false;
+
+ int64_t InstWriteOffset, DepWriteOffset;
+ auto CC = getLocForWriteEx(UseInst);
+ InstOverlapIntervalsTy IOL;
+
+ const DataLayout &DL = F.getParent()->getDataLayout();
+
+ return CC &&
+ isOverwrite(*CC, DefLoc, DL, TLI, DepWriteOffset, InstWriteOffset,
+ UseInst, IOL, AA, &F) == OW_Complete;
+ }
+
+ /// Returns true if \p Def is not read before returning from the function.
+ bool isWriteAtEndOfFunction(MemoryDef *Def) {
+ LLVM_DEBUG(dbgs() << " Check if def " << *Def << " ("
+ << *Def->getMemoryInst()
+ << ") is at the end of the function\n");
+
+ auto MaybeLoc = getLocForWriteEx(Def->getMemoryInst());
+ if (!MaybeLoc) {
+ LLVM_DEBUG(dbgs() << " ... could not get location for write.\n");
+ return false;
+ }
+
+ SmallVector<MemoryAccess *, 4> WorkList;
+ SmallPtrSet<MemoryAccess *, 8> Visited;
+ auto PushMemUses = [&WorkList, &Visited](MemoryAccess *Acc) {
+ if (!Visited.insert(Acc).second)
+ return;
+ for (Use &U : Acc->uses())
+ WorkList.push_back(cast<MemoryAccess>(U.getUser()));
+ };
+ PushMemUses(Def);
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ if (WorkList.size() >= MemorySSAScanLimit) {
+ LLVM_DEBUG(dbgs() << " ... hit exploration limit.\n");
+ return false;
+ }
+
+ MemoryAccess *UseAccess = WorkList[I];
+ if (isa<MemoryPhi>(UseAccess)) {
+ PushMemUses(UseAccess);
+ continue;
+ }
+
+ // TODO: Checking for aliasing is expensive. Consider reducing the amount
+ // of times this is called and/or caching it.
+ Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+ if (isReadClobber(*MaybeLoc, UseInst)) {
+ LLVM_DEBUG(dbgs() << " ... hit read clobber " << *UseInst << ".\n");
+ return false;
+ }
+
+ if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess))
+ PushMemUses(UseDef);
+ }
+ return true;
+ }
+
+ /// If \p I is a memory terminator like llvm.lifetime.end or free, return a
+ /// pair with the MemoryLocation terminated by \p I and a boolean flag
+ /// indicating whether \p I is a free-like call.
+ Optional<std::pair<MemoryLocation, bool>>
+ getLocForTerminator(Instruction *I) const {
+ uint64_t Len;
+ Value *Ptr;
+ if (match(I, m_Intrinsic<Intrinsic::lifetime_end>(m_ConstantInt(Len),
+ m_Value(Ptr))))
+ return {std::make_pair(MemoryLocation(Ptr, Len), false)};
+
+ if (auto *CB = dyn_cast<CallBase>(I)) {
+ if (isFreeCall(I, &TLI))
+ return {std::make_pair(MemoryLocation(CB->getArgOperand(0)), true)};
+ }
+
+ return None;
+ }
+
+ /// Returns true if \p I is a memory terminator instruction like
+ /// llvm.lifetime.end or free.
+ bool isMemTerminatorInst(Instruction *I) const {
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(I);
+ return (II && II->getIntrinsicID() == Intrinsic::lifetime_end) ||
+ isFreeCall(I, &TLI);
+ }
+
+ /// Returns true if \p MaybeTerm is a memory terminator for the same
+ /// underlying object as \p DefLoc.
+ bool isMemTerminator(MemoryLocation DefLoc, Instruction *MaybeTerm) const {
+ Optional<std::pair<MemoryLocation, bool>> MaybeTermLoc =
+ getLocForTerminator(MaybeTerm);
+
+ if (!MaybeTermLoc)
+ return false;
+
+ // If the terminator is a free-like call, all accesses to the underlying
+ // object can be considered terminated.
+ if (MaybeTermLoc->second) {
+ DataLayout DL = MaybeTerm->getParent()->getModule()->getDataLayout();
+ DefLoc = MemoryLocation(GetUnderlyingObject(DefLoc.Ptr, DL));
+ }
+ return AA.isMustAlias(MaybeTermLoc->first, DefLoc);
+ }
+
+ // Returns true if \p Use may read from \p DefLoc.
+ bool isReadClobber(MemoryLocation DefLoc, Instruction *UseInst) const {
+ if (!UseInst->mayReadFromMemory())
+ return false;
+
+ if (auto *CB = dyn_cast<CallBase>(UseInst))
+ if (CB->onlyAccessesInaccessibleMemory())
+ return false;
+
+ ModRefInfo MR = AA.getModRefInfo(UseInst, DefLoc);
+ // If necessary, perform additional analysis.
+ if (isRefSet(MR))
+ MR = AA.callCapturesBefore(UseInst, DefLoc, &DT);
+ return isRefSet(MR);
+ }
+
+ // Find a MemoryDef writing to \p DefLoc and dominating \p Current, with no
+ // read access between them or on any other path to a function exit block if
+ // \p DefLoc is not accessible after the function returns. If there is no such
+ // MemoryDef, return None. The returned value may not (completely) overwrite
+ // \p DefLoc. Currently we bail out when we encounter an aliasing MemoryUse
+ // (read).
+ Optional<MemoryAccess *>
+ getDomMemoryDef(MemoryDef *KillingDef, MemoryAccess *Current,
+ MemoryLocation DefLoc, bool DefVisibleToCallerBeforeRet,
+ bool DefVisibleToCallerAfterRet, int &ScanLimit) const {
+ MemoryAccess *DomAccess;
+ bool StepAgain;
+ LLVM_DEBUG(dbgs() << " trying to get dominating access for " << *Current
+ << "\n");
+ // Find the next clobbering Mod access for DefLoc, starting at Current.
+ do {
+ StepAgain = false;
+ // Reached TOP.
+ if (MSSA.isLiveOnEntryDef(Current))
+ return None;
+
+ if (isa<MemoryPhi>(Current)) {
+ DomAccess = Current;
+ break;
+ }
+ MemoryUseOrDef *CurrentUD = cast<MemoryUseOrDef>(Current);
+ // Look for accesses that clobber DefLoc.
+ DomAccess = MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(CurrentUD,
+ DefLoc);
+ if (MSSA.isLiveOnEntryDef(DomAccess))
+ return None;
+
+ if (isa<MemoryPhi>(DomAccess))
+ break;
+
+ // Check if we can skip DomDef for DSE.
+ MemoryDef *DomDef = dyn_cast<MemoryDef>(DomAccess);
+ if (DomDef && canSkipDef(DomDef, DefVisibleToCallerBeforeRet)) {
+ StepAgain = true;
+ Current = DomDef->getDefiningAccess();
+ }
+
+ } while (StepAgain);
+
+ // Accesses to objects accessible after the function returns can only be
+ // eliminated if the access is killed along all paths to the exit. Collect
+ // the blocks with killing (=completely overwriting MemoryDefs) and check if
+ // they cover all paths from DomAccess to any function exit.
+ SmallPtrSet<BasicBlock *, 16> KillingBlocks = {KillingDef->getBlock()};
+ LLVM_DEBUG({
+ dbgs() << " Checking for reads of " << *DomAccess;
+ if (isa<MemoryDef>(DomAccess))
+ dbgs() << " (" << *cast<MemoryDef>(DomAccess)->getMemoryInst() << ")\n";
+ else
+ dbgs() << ")\n";
+ });
+
+ SmallSetVector<MemoryAccess *, 32> WorkList;
+ auto PushMemUses = [&WorkList](MemoryAccess *Acc) {
+ for (Use &U : Acc->uses())
+ WorkList.insert(cast<MemoryAccess>(U.getUser()));
+ };
+ PushMemUses(DomAccess);
+
+ // Check if DomDef may be read.
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ MemoryAccess *UseAccess = WorkList[I];
+
+ LLVM_DEBUG(dbgs() << " " << *UseAccess);
+ if (--ScanLimit == 0) {
+ LLVM_DEBUG(dbgs() << "\n ... hit scan limit\n");
+ return None;
+ }
+
+ if (isa<MemoryPhi>(UseAccess)) {
+ LLVM_DEBUG(dbgs() << "\n ... adding PHI uses\n");
+ PushMemUses(UseAccess);
+ continue;
+ }
+
+ Instruction *UseInst = cast<MemoryUseOrDef>(UseAccess)->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *UseInst << ")\n");
+
+ if (isNoopIntrinsic(cast<MemoryUseOrDef>(UseAccess))) {
+ LLVM_DEBUG(dbgs() << " ... adding uses of intrinsic\n");
+ PushMemUses(UseAccess);
+ continue;
+ }
+
+ // A memory terminator kills all preceding MemoryDefs and all succeeding
+ // MemoryAccesses. We do not have to check its users.
+ if (isMemTerminator(DefLoc, UseInst))
+ continue;
+
+ // Uses which may read the original MemoryDef mean we cannot eliminate the
+ // original MD. Stop walk.
+ if (isReadClobber(DefLoc, UseInst)) {
+ LLVM_DEBUG(dbgs() << " ... found read clobber\n");
+ return None;
+ }
+
+ // For the KillingDef and DomAccess we only have to check if it reads the
+ // memory location.
+ // TODO: It would probably be better to check for self-reads before
+ // calling the function.
+ if (KillingDef == UseAccess || DomAccess == UseAccess) {
+ LLVM_DEBUG(dbgs() << " ... skipping killing def/dom access\n");
+ continue;
+ }
+
+ // Check all uses for MemoryDefs, except for defs completely overwriting
+ // the original location. Otherwise we have to check uses of *all*
+ // MemoryDefs we discover, including non-aliasing ones. Otherwise we might
+ // miss cases like the following
+ // 1 = Def(LoE) ; <----- DomDef stores [0,1]
+ // 2 = Def(1) ; (2, 1) = NoAlias, stores [2,3]
+ // Use(2) ; MayAlias 2 *and* 1, loads [0, 3].
+ // (The Use points to the *first* Def it may alias)
+ // 3 = Def(1) ; <---- Current (3, 2) = NoAlias, (3,1) = MayAlias,
+ // stores [0,1]
+ if (MemoryDef *UseDef = dyn_cast<MemoryDef>(UseAccess)) {
+ if (isCompleteOverwrite(DefLoc, UseInst)) {
+ if (DefVisibleToCallerAfterRet && UseAccess != DomAccess) {
+ BasicBlock *MaybeKillingBlock = UseInst->getParent();
+ if (PostOrderNumbers.find(MaybeKillingBlock)->second <
+ PostOrderNumbers.find(DomAccess->getBlock())->second) {
+
+ LLVM_DEBUG(dbgs() << " ... found killing block "
+ << MaybeKillingBlock->getName() << "\n");
+ KillingBlocks.insert(MaybeKillingBlock);
+ }
+ }
+ } else
+ PushMemUses(UseDef);
+ }
+ }
+
+ // For accesses to locations visible after the function returns, make sure
+ // that the location is killed (=overwritten) along all paths from DomAccess
+ // to the exit.
+ if (DefVisibleToCallerAfterRet) {
+ assert(!KillingBlocks.empty() &&
+ "Expected at least a single killing block");
+ // Find the common post-dominator of all killing blocks.
+ BasicBlock *CommonPred = *KillingBlocks.begin();
+ for (auto I = std::next(KillingBlocks.begin()), E = KillingBlocks.end();
+ I != E; I++) {
+ if (!CommonPred)
+ break;
+ CommonPred = PDT.findNearestCommonDominator(CommonPred, *I);
+ }
+
+ // If CommonPred is in the set of killing blocks, just check if it
+ // post-dominates DomAccess.
+ if (KillingBlocks.count(CommonPred)) {
+ if (PDT.dominates(CommonPred, DomAccess->getBlock()))
+ return {DomAccess};
+ return None;
+ }
+
+ // If the common post-dominator does not post-dominate DomAccess, there
+ // is a path from DomAccess to an exit not going through a killing block.
+ if (PDT.dominates(CommonPred, DomAccess->getBlock())) {
+ SetVector<BasicBlock *> WorkList;
+
+ // DomAccess's post-order number provides an upper bound of the blocks
+ // on a path starting at DomAccess.
+ unsigned UpperBound =
+ PostOrderNumbers.find(DomAccess->getBlock())->second;
+
+ // If CommonPred is null, there are multiple exits from the function.
+ // They all have to be added to the worklist.
+ if (CommonPred)
+ WorkList.insert(CommonPred);
+ else
+ for (BasicBlock *R : PDT.roots())
+ WorkList.insert(R);
+
+ NumCFGTries++;
+ // Check if all paths starting from an exit node go through one of the
+ // killing blocks before reaching DomAccess.
+ for (unsigned I = 0; I < WorkList.size(); I++) {
+ NumCFGChecks++;
+ BasicBlock *Current = WorkList[I];
+ if (KillingBlocks.count(Current))
+ continue;
+ if (Current == DomAccess->getBlock())
+ return None;
+
+ // DomAccess is reachable from the entry, so we don't have to explore
+ // unreachable blocks further.
+ if (!DT.isReachableFromEntry(Current))
+ continue;
+
+ unsigned CPO = PostOrderNumbers.find(Current)->second;
+ // Current block is not on a path starting at DomAccess.
+ if (CPO > UpperBound)
+ continue;
+ for (BasicBlock *Pred : predecessors(Current))
+ WorkList.insert(Pred);
+
+ if (WorkList.size() >= MemorySSAPathCheckLimit)
+ return None;
+ }
+ NumCFGSuccess++;
+ return {DomAccess};
+ }
+ return None;
+ }
+
+ // No aliasing MemoryUses of DomAccess found, DomAccess is potentially dead.
+ return {DomAccess};
+ }
+
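A hypothetical example of the path check above: @g remains visible after the function returns, so the first store can only be removed if every path to an exit passes through a killing block; the %else path below does not, so the store must stay.

  @g = global i32 0
  define void @f(i1 %c) {
    store i32 0, i32* @g
    br i1 %c, label %then, label %else
  then:
    store i32 1, i32* @g      ; killing block
    br label %exit
  else:
    br label %exit            ; no overwrite on this path
  exit:
    ret void
  }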
+ // Delete dead memory defs
+ void deleteDeadInstruction(Instruction *SI) {
+ MemorySSAUpdater Updater(&MSSA);
+ SmallVector<Instruction *, 32> NowDeadInsts;
+ NowDeadInsts.push_back(SI);
+ --NumFastOther;
+
+ while (!NowDeadInsts.empty()) {
+ Instruction *DeadInst = NowDeadInsts.pop_back_val();
+ ++NumFastOther;
+
+ // Try to preserve debug information attached to the dead instruction.
+ salvageDebugInfo(*DeadInst);
+ salvageKnowledge(DeadInst);
+
+ // Remove the Instruction from MSSA.
+ if (MemoryAccess *MA = MSSA.getMemoryAccess(DeadInst)) {
+ if (MemoryDef *MD = dyn_cast<MemoryDef>(MA)) {
+ SkipStores.insert(MD);
+ }
+ Updater.removeMemoryAccess(MA);
+ }
+
+ auto I = IOLs.find(DeadInst->getParent());
+ if (I != IOLs.end())
+ I->second.erase(DeadInst);
+ // Remove its operands
+ for (Use &O : DeadInst->operands())
+ if (Instruction *OpI = dyn_cast<Instruction>(O)) {
+ O = nullptr;
+ if (isInstructionTriviallyDead(OpI, &TLI))
+ NowDeadInsts.push_back(OpI);
+ }
+
+ DeadInst->eraseFromParent();
+ }
+ }
+
+ // Check for any extra throws between SI and NI that block DSE. This only
+ // checks extra maythrows (those that aren't MemoryDef's). MemoryDef that may
+ // throw are handled during the walk from one def to the next.
+ bool mayThrowBetween(Instruction *SI, Instruction *NI,
+ const Value *SILocUnd) const {
+ // First see if we can ignore it by using the fact that SI is an
+ // alloca/alloca like object that is not visible to the caller during
+ // execution of the function.
+ if (SILocUnd && InvisibleToCallerBeforeRet.count(SILocUnd))
+ return false;
+
+ if (SI->getParent() == NI->getParent())
+ return ThrowingBlocks.count(SI->getParent());
+ return !ThrowingBlocks.empty();
+ }
+
+ // Check if \p NI acts as a DSE barrier for \p SI. The following instructions
+ // act as barriers:
+ // * A memory instruction that may throw and \p SI accesses a non-stack
+ // object.
+ // * Atomic stores stronger than monotonic.
+ bool isDSEBarrier(const Value *SILocUnd, Instruction *NI) const {
+ // If NI may throw it acts as a barrier, unless SI accesses an alloca or
+ // alloca-like object that does not escape.
+ if (NI->mayThrow() && !InvisibleToCallerBeforeRet.count(SILocUnd))
+ return true;
+
+ // If NI is an atomic load/store stronger than monotonic, do not try to
+ // eliminate/reorder it.
+ if (NI->isAtomic()) {
+ if (auto *LI = dyn_cast<LoadInst>(NI))
+ return isStrongerThanMonotonic(LI->getOrdering());
+ if (auto *SI = dyn_cast<StoreInst>(NI))
+ return isStrongerThanMonotonic(SI->getOrdering());
+ llvm_unreachable("other instructions should be skipped in MemorySSA");
+ }
+ return false;
+ }
+
+ /// Eliminate writes to objects that are not visible in the caller and are not
+ /// accessed before returning from the function.
+ bool eliminateDeadWritesAtEndOfFunction() {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ bool MadeChange = false;
+ LLVM_DEBUG(
+ dbgs()
+ << "Trying to eliminate MemoryDefs at the end of the function\n");
+ for (int I = MemDefs.size() - 1; I >= 0; I--) {
+ MemoryDef *Def = MemDefs[I];
+ if (SkipStores.find(Def) != SkipStores.end() ||
+ !isRemovable(Def->getMemoryInst()))
+ continue;
+
+ // TODO: Consider doing the underlying object check first, if it is
+ // beneficial compile-time wise.
+ if (isWriteAtEndOfFunction(Def)) {
+ Instruction *DefI = Def->getMemoryInst();
+ // See through pointer-to-pointer bitcasts
+ SmallVector<const Value *, 4> Pointers;
+ GetUnderlyingObjects(getLocForWriteEx(DefI)->Ptr, Pointers, DL);
+
+ LLVM_DEBUG(dbgs() << " ... MemoryDef is not accessed until the end "
+ "of the function\n");
+ bool CanKill = true;
+ for (const Value *Pointer : Pointers) {
+ if (!InvisibleToCallerAfterRet.count(Pointer)) {
+ CanKill = false;
+ break;
+ }
+ }
+
+ if (CanKill) {
+ deleteDeadInstruction(DefI);
+ ++NumFastStores;
+ MadeChange = true;
+ }
+ }
+ }
+ return MadeChange;
+ }
+
+ /// \returns true if \p Def is a no-op store, either because it
+ /// directly stores back a loaded value or stores zero to a calloc'ed object.
+ bool storeIsNoop(MemoryDef *Def, MemoryLocation DefLoc, const Value *DefUO) {
+ StoreInst *Store = dyn_cast<StoreInst>(Def->getMemoryInst());
+ if (!Store)
+ return false;
+
+ if (auto *LoadI = dyn_cast<LoadInst>(Store->getOperand(0))) {
+ if (LoadI->getPointerOperand() == Store->getOperand(1)) {
+ auto *LoadAccess = MSSA.getMemoryAccess(LoadI)->getDefiningAccess();
+ // If both accesses share the same defining access, no instructions
+ // between them can modify the memory location.
+ return LoadAccess == Def->getDefiningAccess();
+ }
+ }
+
+ Constant *StoredConstant = dyn_cast<Constant>(Store->getOperand(0));
+ if (StoredConstant && StoredConstant->isNullValue()) {
+ auto *DefUOInst = dyn_cast<Instruction>(DefUO);
+ if (DefUOInst && isCallocLikeFn(DefUOInst, &TLI)) {
+ auto *UnderlyingDef = cast<MemoryDef>(MSSA.getMemoryAccess(DefUOInst));
+ // If UnderlyingDef is the clobbering access of Def, no instructions
+ // between them can modify the memory location.
+ auto *ClobberDef =
+ MSSA.getSkipSelfWalker()->getClobberingMemoryAccess(Def);
+ return UnderlyingDef == ClobberDef;
+ }
+ }
+ return false;
+ }
+};
+
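Two hypothetical instances of the no-op stores recognized above, assuming nothing writes the location in between (the code checks this via shared defining/clobbering accesses):

  %v = load i32, i32* %p
  store i32 %v, i32* %p            ; stores back the value just loaded

  %m = call i8* @calloc(i64 1, i64 4)
  store i8 0, i8* %m               ; stores zero into already-zeroed memory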
+bool eliminateDeadStoresMemorySSA(Function &F, AliasAnalysis &AA,
+ MemorySSA &MSSA, DominatorTree &DT,
+ PostDominatorTree &PDT,
+ const TargetLibraryInfo &TLI) {
+ const DataLayout &DL = F.getParent()->getDataLayout();
+ bool MadeChange = false;
+
+ DSEState State = DSEState::get(F, AA, MSSA, DT, PDT, TLI);
+ // For each store:
+ for (unsigned I = 0; I < State.MemDefs.size(); I++) {
+ MemoryDef *KillingDef = State.MemDefs[I];
+ if (State.SkipStores.count(KillingDef))
+ continue;
+ Instruction *SI = KillingDef->getMemoryInst();
+
+ auto MaybeSILoc = State.getLocForWriteEx(SI);
+ if (State.isMemTerminatorInst(SI))
+ MaybeSILoc = State.getLocForTerminator(SI).map(
+ [](const std::pair<MemoryLocation, bool> &P) { return P.first; });
+ else
+ MaybeSILoc = State.getLocForWriteEx(SI);
+
+ if (!MaybeSILoc) {
+ LLVM_DEBUG(dbgs() << "Failed to find analyzable write location for "
+ << *SI << "\n");
+ continue;
+ }
+ MemoryLocation SILoc = *MaybeSILoc;
+ assert(SILoc.Ptr && "SILoc should not be null");
+ const Value *SILocUnd = GetUnderlyingObject(SILoc.Ptr, DL);
+
+ // Check if the store is a no-op.
+ if (isRemovable(SI) && State.storeIsNoop(KillingDef, SILoc, SILocUnd)) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove No-Op Store:\n DEAD: " << *SI << '\n');
+ State.deleteDeadInstruction(SI);
+ NumNoopStores++;
+ MadeChange = true;
+ continue;
+ }
+
+ Instruction *DefObj =
+ const_cast<Instruction *>(dyn_cast<Instruction>(SILocUnd));
+ bool DefVisibleToCallerBeforeRet =
+ !State.InvisibleToCallerBeforeRet.count(SILocUnd);
+ bool DefVisibleToCallerAfterRet =
+ !State.InvisibleToCallerAfterRet.count(SILocUnd);
+ if (DefObj && isAllocLikeFn(DefObj, &TLI)) {
+ if (DefVisibleToCallerBeforeRet)
+ DefVisibleToCallerBeforeRet =
+ PointerMayBeCapturedBefore(DefObj, false, true, SI, &DT);
+ }
+
+ MemoryAccess *Current = KillingDef;
+ LLVM_DEBUG(dbgs() << "Trying to eliminate MemoryDefs killed by "
+ << *KillingDef << " (" << *SI << ")\n");
+
+ int ScanLimit = MemorySSAScanLimit;
+ // Worklist of MemoryAccesses that may be killed by KillingDef.
+ SetVector<MemoryAccess *> ToCheck;
+ ToCheck.insert(KillingDef->getDefiningAccess());
+
+ // Check if MemoryAccesses in the worklist are killed by KillingDef.
+ for (unsigned I = 0; I < ToCheck.size(); I++) {
+ Current = ToCheck[I];
+ if (State.SkipStores.count(Current))
+ continue;
+
+ Optional<MemoryAccess *> Next = State.getDomMemoryDef(
+ KillingDef, Current, SILoc, DefVisibleToCallerBeforeRet,
+ DefVisibleToCallerAfterRet, ScanLimit);
+
+ if (!Next) {
+ LLVM_DEBUG(dbgs() << " finished walk\n");
+ continue;
+ }
+
+ MemoryAccess *DomAccess = *Next;
+ LLVM_DEBUG(dbgs() << " Checking if we can kill " << *DomAccess);
+ if (isa<MemoryPhi>(DomAccess)) {
+ LLVM_DEBUG(dbgs() << "\n ... adding incoming values to worklist\n");
+ for (Value *V : cast<MemoryPhi>(DomAccess)->incoming_values()) {
+ MemoryAccess *IncomingAccess = cast<MemoryAccess>(V);
+ BasicBlock *IncomingBlock = IncomingAccess->getBlock();
+ BasicBlock *PhiBlock = DomAccess->getBlock();
+
+ // We only consider incoming MemoryAccesses that come before the
+ // MemoryPhi. Otherwise we could discover candidates that do not
+ // strictly dominate our starting def.
+ if (State.PostOrderNumbers[IncomingBlock] >
+ State.PostOrderNumbers[PhiBlock])
+ ToCheck.insert(IncomingAccess);
+ }
+ continue;
+ }
+ MemoryDef *NextDef = dyn_cast<MemoryDef>(DomAccess);
+ Instruction *NI = NextDef->getMemoryInst();
+ LLVM_DEBUG(dbgs() << " (" << *NI << ")\n");
+
+ // Before we try to remove anything, check for any extra throwing
+ // instructions that block us from DSEing
+ if (State.mayThrowBetween(SI, NI, SILocUnd)) {
+ LLVM_DEBUG(dbgs() << " ... skip, may throw!\n");
+ break;
+ }
+
+ // Check for anything that looks like it will be a barrier to further
+ // removal
+ if (State.isDSEBarrier(SILocUnd, NI)) {
+ LLVM_DEBUG(dbgs() << " ... skip, barrier\n");
+ continue;
+ }
+
+ ToCheck.insert(NextDef->getDefiningAccess());
+
+ if (!hasAnalyzableMemoryWrite(NI, TLI)) {
+ LLVM_DEBUG(dbgs() << " ... skip, cannot analyze def\n");
+ continue;
+ }
+
+ if (!isRemovable(NI)) {
+ LLVM_DEBUG(dbgs() << " ... skip, cannot remove def\n");
+ continue;
+ }
+
+ if (!DebugCounter::shouldExecute(MemorySSACounter))
+ continue;
+
+ MemoryLocation NILoc = *State.getLocForWriteEx(NI);
+
+ if (State.isMemTerminatorInst(SI)) {
+ const Value *NIUnd = GetUnderlyingObject(NILoc.Ptr, DL);
+ if (!SILocUnd || SILocUnd != NIUnd)
+ continue;
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
+ << "\n KILLER: " << *SI << '\n');
+ State.deleteDeadInstruction(NI);
+ ++NumFastStores;
+ MadeChange = true;
+ } else {
+ // Check if NI overwrites SI.
+ int64_t InstWriteOffset, DepWriteOffset;
+ auto Iter = State.IOLs.insert(
+ std::make_pair<BasicBlock *, InstOverlapIntervalsTy>(
+ NI->getParent(), InstOverlapIntervalsTy()));
+ auto &IOL = Iter.first->second;
+ OverwriteResult OR = isOverwrite(SILoc, NILoc, DL, TLI, DepWriteOffset,
+ InstWriteOffset, NI, IOL, AA, &F);
+
+ if (EnablePartialStoreMerging && OR == OW_PartialEarlierWithFullLater) {
+ auto *Earlier = dyn_cast<StoreInst>(NI);
+ auto *Later = dyn_cast<StoreInst>(SI);
+ if (Constant *Merged = tryToMergePartialOverlappingStores(
+ Earlier, Later, InstWriteOffset, DepWriteOffset, DL, &AA,
+ &DT)) {
+
+ // Update stored value of earlier store to merged constant.
+ Earlier->setOperand(0, Merged);
+ ++NumModifiedStores;
+ MadeChange = true;
+
+ // Remove later store and remove any outstanding overlap intervals
+ // for the updated store.
+ State.deleteDeadInstruction(Later);
+ auto I = State.IOLs.find(Earlier->getParent());
+ if (I != State.IOLs.end())
+ I->second.erase(Earlier);
+ break;
+ }
+ }
+
+ if (OR == OW_Complete) {
+ LLVM_DEBUG(dbgs() << "DSE: Remove Dead Store:\n DEAD: " << *NI
+ << "\n KILLER: " << *SI << '\n');
+ State.deleteDeadInstruction(NI);
+ ++NumFastStores;
+ MadeChange = true;
+ }
+ }
+ }
+ }
+
+ if (EnablePartialOverwriteTracking)
+ for (auto &KV : State.IOLs)
+ MadeChange |= removePartiallyOverlappedStores(&AA, DL, KV.second);
+
+ MadeChange |= State.eliminateDeadWritesAtEndOfFunction();
+ return MadeChange;
+}
+} // end anonymous namespace
+
//===----------------------------------------------------------------------===//
// DSE Pass
//===----------------------------------------------------------------------===//
PreservedAnalyses DSEPass::run(Function &F, FunctionAnalysisManager &AM) {
- AliasAnalysis *AA = &AM.getResult<AAManager>(F);
- DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
- MemoryDependenceResults *MD = &AM.getResult<MemoryDependenceAnalysis>(F);
- const TargetLibraryInfo *TLI = &AM.getResult<TargetLibraryAnalysis>(F);
+ AliasAnalysis &AA = AM.getResult<AAManager>(F);
+ const TargetLibraryInfo &TLI = AM.getResult<TargetLibraryAnalysis>(F);
+ DominatorTree &DT = AM.getResult<DominatorTreeAnalysis>(F);
+
+ bool Changed = false;
+ if (EnableMemorySSA) {
+ MemorySSA &MSSA = AM.getResult<MemorySSAAnalysis>(F).getMSSA();
+ PostDominatorTree &PDT = AM.getResult<PostDominatorTreeAnalysis>(F);
- if (!eliminateDeadStores(F, AA, MD, DT, TLI))
+ Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+ } else {
+ MemoryDependenceResults &MD = AM.getResult<MemoryDependenceAnalysis>(F);
+
+ Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+ }
+
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
PA.preserve<GlobalsAA>();
- PA.preserve<MemoryDependenceAnalysis>();
+ if (EnableMemorySSA)
+ PA.preserve<MemorySSAAnalysis>();
+ else
+ PA.preserve<MemoryDependenceAnalysis>();
return PA;
}
@@ -1383,25 +2339,51 @@ public:
if (skipFunction(F))
return false;
- DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- AliasAnalysis *AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
- MemoryDependenceResults *MD =
- &getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
- const TargetLibraryInfo *TLI =
- &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+ AliasAnalysis &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ const TargetLibraryInfo &TLI =
+ getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
+
+ bool Changed = false;
+ if (EnableMemorySSA) {
+ MemorySSA &MSSA = getAnalysis<MemorySSAWrapperPass>().getMSSA();
+ PostDominatorTree &PDT =
+ getAnalysis<PostDominatorTreeWrapperPass>().getPostDomTree();
+
+ Changed = eliminateDeadStoresMemorySSA(F, AA, MSSA, DT, PDT, TLI);
+ } else {
+ MemoryDependenceResults &MD =
+ getAnalysis<MemoryDependenceWrapperPass>().getMemDep();
+
+ Changed = eliminateDeadStores(F, &AA, &MD, &DT, &TLI);
+ }
- return eliminateDeadStores(F, AA, MD, DT, TLI);
+#ifdef LLVM_ENABLE_STATS
+ if (AreStatisticsEnabled())
+ for (auto &I : instructions(F))
+ NumRemainingStores += isa<StoreInst>(&I);
+#endif
+
+ return Changed;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
- AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
- AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
- AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
- AU.addPreserved<MemoryDependenceWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+
+ if (EnableMemorySSA) {
+ AU.addRequired<PostDominatorTreeWrapperPass>();
+ AU.addRequired<MemorySSAWrapperPass>();
+ AU.addPreserved<PostDominatorTreeWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
+ } else {
+ AU.addRequired<MemoryDependenceWrapperPass>();
+ AU.addPreserved<MemoryDependenceWrapperPass>();
+ }
}
};
@@ -1412,8 +2394,10 @@ char DSELegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(DSELegacyPass, "dse", "Dead Store Elimination", false,
false)
INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(PostDominatorTreeWrapperPass)
INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(GlobalsAAWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_DEPENDENCY(MemoryDependenceWrapperPass)
INITIALIZE_PASS_DEPENDENCY(TargetLibraryInfoWrapperPass)
INITIALIZE_PASS_END(DSELegacyPass, "dse", "Dead Store Elimination", false,
diff --git a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
index 132dfc8f6da1..d44a5979a8b2 100644
--- a/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
+++ b/llvm/lib/Transforms/Scalar/DivRemPairs.cpp
@@ -17,6 +17,7 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/PatternMatch.h"
@@ -71,6 +72,7 @@ static llvm::Optional<ExpandedMatch> matchExpandedRem(Instruction &I) {
return M;
}
+namespace {
/// A thin wrapper to store two values that we matched as div-rem pair.
/// We want this extra indirection to avoid dealing with RAUW'ing the map keys.
struct DivRemPairWorklistEntry {
@@ -111,6 +113,7 @@ struct DivRemPairWorklistEntry {
}
}
};
+} // namespace
using DivRemWorklistTy = SmallVector<DivRemPairWorklistEntry, 4>;
/// Find matching pairs of integer div/rem ops (they have the same numerator,
@@ -218,6 +221,7 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
NumRecomposed++;
// Note that we have left ((X / Y) * Y) around.
// If it had other uses we could rewrite it as X - X % Y
+ Changed = true;
}
assert((!E.isRemExpanded() || !HasDivRemOp) &&
@@ -301,6 +305,29 @@ static bool optimizeDivRem(Function &F, const TargetTransformInfo &TTI,
Mul->insertAfter(RemInst);
Sub->insertAfter(Mul);
+ // If X can be undef, X should be frozen first.
+    // For example, let's assume that Y = 1 and X = undef:
+ // %div = sdiv undef, 1 // %div = undef
+ // %rem = srem undef, 1 // %rem = 0
+ // =>
+ // %div = sdiv undef, 1 // %div = undef
+ // %mul = mul %div, 1 // %mul = undef
+ // %rem = sub %x, %mul // %rem = undef - undef = undef
+ // If X is not frozen, %rem becomes undef after transformation.
+    // TODO: We need an undef-specific checking function in ValueTracking
+ if (!isGuaranteedNotToBeUndefOrPoison(X, DivInst, &DT)) {
+ auto *FrX = new FreezeInst(X, X->getName() + ".frozen", DivInst);
+ DivInst->setOperand(0, FrX);
+ Sub->setOperand(0, FrX);
+ }
+ // Same for Y. If X = 1 and Y = (undef | 1), %rem in src is either 1 or 0,
+ // but %rem in tgt can be one of many integer values.
+ if (!isGuaranteedNotToBeUndefOrPoison(Y, DivInst, &DT)) {
+ auto *FrY = new FreezeInst(Y, Y->getName() + ".frozen", DivInst);
+ DivInst->setOperand(1, FrY);
+ Mul->setOperand(1, FrY);
+ }
+
// Now kill the explicit remainder. We have replaced it with:
// (sub X, (mul (div X, Y), Y)
Sub->setName(RemInst->getName() + ".decomposed");
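
The freeze logic is the correctness-sensitive part of this hunk: the recomposed remainder X - (X / Y) * Y uses each operand twice, and an undef operand may take a different value at each use, so both X and Y are frozen unless they are provably neither undef nor poison at the div. A standalone sketch of that guard, using the same ValueTracking query and FreezeInst constructor the patch uses (the helper name is illustrative):

#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Return a value that is safe to use twice in place of Op: either Op itself,
// when it cannot be undef/poison at DivInst, or a freeze of Op inserted
// immediately before DivInst.
static Value *freezeIfMaybeUndef(Value *Op, Instruction *DivInst,
                                 const DominatorTree &DT) {
  if (isGuaranteedNotToBeUndefOrPoison(Op, DivInst, &DT))
    return Op;
  return new FreezeInst(Op, Op->getName() + ".frozen", DivInst);
}

In the patch the frozen value then replaces the operand both on the div and on the mul or sub that reuses it, so every duplicated use observes the same concrete value.
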
diff --git a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
index 40c1ba88354f..ddfc8555b0a0 100644
--- a/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
+++ b/llvm/lib/Transforms/Scalar/EarlyCSE.cpp
@@ -41,6 +41,7 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/PassManager.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/IR/Statepoint.h"
#include "llvm/IR/Type.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
@@ -54,6 +55,7 @@
#include "llvm/Support/RecyclingAllocator.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/GuardUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
@@ -114,7 +116,7 @@ struct SimpleValue {
isa<CmpInst>(Inst) || isa<SelectInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
isa<ShuffleVectorInst>(Inst) || isa<ExtractValueInst>(Inst) ||
- isa<InsertValueInst>(Inst);
+ isa<InsertValueInst>(Inst) || isa<FreezeInst>(Inst);
}
};
@@ -152,13 +154,50 @@ static bool matchSelectWithOptionalNotCond(Value *V, Value *&Cond, Value *&A,
std::swap(A, B);
}
- // Set flavor if we find a match, or set it to unknown otherwise; in
- // either case, return true to indicate that this is a select we can
- // process.
- if (auto *CmpI = dyn_cast<ICmpInst>(Cond))
- Flavor = matchDecomposedSelectPattern(CmpI, A, B, A, B).Flavor;
- else
- Flavor = SPF_UNKNOWN;
+ // Match canonical forms of abs/nabs/min/max. We are not using ValueTracking's
+ // more powerful matchSelectPattern() because it may rely on instruction flags
+ // such as "nsw". That would be incompatible with the current hashing
+  // mechanism, which may remove flags to increase the likelihood of CSE.
+
+ // These are the canonical forms of abs(X) and nabs(X) created by instcombine:
+ // %N = sub i32 0, %X
+ // %C = icmp slt i32 %X, 0
+ // %ABS = select i1 %C, i32 %N, i32 %X
+ //
+ // %N = sub i32 0, %X
+ // %C = icmp slt i32 %X, 0
+ // %NABS = select i1 %C, i32 %X, i32 %N
+ Flavor = SPF_UNKNOWN;
+ CmpInst::Predicate Pred;
+ if (match(Cond, m_ICmp(Pred, m_Specific(B), m_ZeroInt())) &&
+ Pred == ICmpInst::ICMP_SLT && match(A, m_Neg(m_Specific(B)))) {
+ // ABS: B < 0 ? -B : B
+ Flavor = SPF_ABS;
+ return true;
+ }
+ if (match(Cond, m_ICmp(Pred, m_Specific(A), m_ZeroInt())) &&
+ Pred == ICmpInst::ICMP_SLT && match(B, m_Neg(m_Specific(A)))) {
+ // NABS: A < 0 ? A : -A
+ Flavor = SPF_NABS;
+ return true;
+ }
+
+ if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
+ // Check for commuted variants of min/max by swapping predicate.
+ // If we do not match the standard or commuted patterns, this is not a
+ // recognized form of min/max, but it is still a select, so return true.
+ if (!match(Cond, m_ICmp(Pred, m_Specific(B), m_Specific(A))))
+ return true;
+ Pred = ICmpInst::getSwappedPredicate(Pred);
+ }
+
+ switch (Pred) {
+ case CmpInst::ICMP_UGT: Flavor = SPF_UMAX; break;
+ case CmpInst::ICMP_ULT: Flavor = SPF_UMIN; break;
+ case CmpInst::ICMP_SGT: Flavor = SPF_SMAX; break;
+ case CmpInst::ICMP_SLT: Flavor = SPF_SMIN; break;
+ default: break;
+ }
return true;
}
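
The rewritten matcher recognizes only hash-stable canonical shapes: abs/nabs via an icmp slt of the arm against zero plus a matching negation, and min/max via a compare of the two select arms, folding a commuted compare in by swapping the predicate. A compact sketch of just the min/max half (the helper name is illustrative; SelectPatternFlavor and the SPF_* values come from ValueTracking):

#include "llvm/Analysis/ValueTracking.h" // SelectPatternFlavor / SPF_*
#include "llvm/IR/PatternMatch.h"

using namespace llvm;
using namespace llvm::PatternMatch;

// Classify "select (icmp Pred ?, ?), A, B" as a min/max flavor without looking
// at instruction flags, tolerating a commuted compare of the two arms.
static SelectPatternFlavor classifyMinMax(Value *Cond, Value *A, Value *B) {
  CmpInst::Predicate Pred;
  if (!match(Cond, m_ICmp(Pred, m_Specific(A), m_Specific(B)))) {
    if (!match(Cond, m_ICmp(Pred, m_Specific(B), m_Specific(A))))
      return SPF_UNKNOWN; // not a compare of the two select arms
    Pred = ICmpInst::getSwappedPredicate(Pred);
  }
  switch (Pred) {
  case CmpInst::ICMP_UGT: return SPF_UMAX;
  case CmpInst::ICMP_ULT: return SPF_UMIN;
  case CmpInst::ICMP_SGT: return SPF_SMAX;
  case CmpInst::ICMP_SLT: return SPF_SMIN;
  default:                return SPF_UNKNOWN;
  }
}

Keeping the matcher flag-free is deliberate: as the comment above notes, the hashing in this file may drop instruction flags to make more values CSE-able, so the classification must not depend on them.
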
@@ -231,6 +270,9 @@ static unsigned getHashValueImpl(SimpleValue Val) {
if (CastInst *CI = dyn_cast<CastInst>(Inst))
return hash_combine(CI->getOpcode(), CI->getType(), CI->getOperand(0));
+ if (FreezeInst *FI = dyn_cast<FreezeInst>(Inst))
+ return hash_combine(FI->getOpcode(), FI->getOperand(0));
+
if (const ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(Inst))
return hash_combine(EVI->getOpcode(), EVI->getOperand(0),
hash_combine_range(EVI->idx_begin(), EVI->idx_end()));
@@ -242,7 +284,8 @@ static unsigned getHashValueImpl(SimpleValue Val) {
assert((isa<CallInst>(Inst) || isa<GetElementPtrInst>(Inst) ||
isa<ExtractElementInst>(Inst) || isa<InsertElementInst>(Inst) ||
- isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst)) &&
+ isa<ShuffleVectorInst>(Inst) || isa<UnaryOperator>(Inst) ||
+ isa<FreezeInst>(Inst)) &&
"Invalid/unknown instruction");
// Mix in the opcode.
@@ -414,6 +457,14 @@ template <> struct DenseMapInfo<CallValue> {
unsigned DenseMapInfo<CallValue>::getHashValue(CallValue Val) {
Instruction *Inst = Val.Inst;
+
+  // gc.relocate is a 'special' call: its second and third operands are
+  // not real values, but indices into the statepoint's argument list.
+ // Get values they point to.
+ if (const GCRelocateInst *GCR = dyn_cast<GCRelocateInst>(Inst))
+ return hash_combine(GCR->getOpcode(), GCR->getOperand(0),
+ GCR->getBasePtr(), GCR->getDerivedPtr());
+
// Hash all of the operands as pointers and mix in the opcode.
return hash_combine(
Inst->getOpcode(),
@@ -424,6 +475,14 @@ bool DenseMapInfo<CallValue>::isEqual(CallValue LHS, CallValue RHS) {
Instruction *LHSI = LHS.Inst, *RHSI = RHS.Inst;
if (LHS.isSentinel() || RHS.isSentinel())
return LHSI == RHSI;
+
+ // See comment above in `getHashValue()`.
+ if (const GCRelocateInst *GCR1 = dyn_cast<GCRelocateInst>(LHSI))
+ if (const GCRelocateInst *GCR2 = dyn_cast<GCRelocateInst>(RHSI))
+ return GCR1->getOperand(0) == GCR2->getOperand(0) &&
+ GCR1->getBasePtr() == GCR2->getBasePtr() &&
+ GCR1->getDerivedPtr() == GCR2->getDerivedPtr();
+
return LHSI->isIdenticalTo(RHSI);
}
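
Taken together, the two hunks above let EarlyCSE treat gc.relocate calls as equal when they relocate the same thing: both the hash and the equality test look through the literal index operands to the statepoint token and the base/derived pointers. A minimal sketch of that pairing, using the GCRelocateInst accessors the patch calls (at this revision GCRelocateInst is declared in the Statepoint.h header added above):

#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Statepoint.h" // GCRelocateInst at this revision

using namespace llvm;

// Hash a gc.relocate by what it relocates, not by its literal index operands.
static hash_code hashRelocate(const GCRelocateInst &GCR) {
  return hash_combine(GCR.getOpcode(), GCR.getOperand(0), // statepoint token
                      GCR.getBasePtr(), GCR.getDerivedPtr());
}

// Equality mirrors the hash: same statepoint, same base, same derived pointer.
static bool relocatesSameValue(const GCRelocateInst &A,
                               const GCRelocateInst &B) {
  return A.getOperand(0) == B.getOperand(0) &&
         A.getBasePtr() == B.getBasePtr() &&
         A.getDerivedPtr() == B.getDerivedPtr();
}

Hash and equality must stay consistent: any two relocates that compare equal here also hash identically, since both are derived from the same three values.
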
@@ -561,8 +620,8 @@ private:
public:
StackNode(ScopedHTType &AvailableValues, LoadHTType &AvailableLoads,
InvariantHTType &AvailableInvariants, CallHTType &AvailableCalls,
- unsigned cg, DomTreeNode *n, DomTreeNode::iterator child,
- DomTreeNode::iterator end)
+ unsigned cg, DomTreeNode *n, DomTreeNode::const_iterator child,
+ DomTreeNode::const_iterator end)
: CurrentGeneration(cg), ChildGeneration(cg), Node(n), ChildIter(child),
EndIter(end),
Scopes(AvailableValues, AvailableLoads, AvailableInvariants,
@@ -576,7 +635,7 @@ private:
unsigned childGeneration() { return ChildGeneration; }
void childGeneration(unsigned generation) { ChildGeneration = generation; }
DomTreeNode *node() { return Node; }
- DomTreeNode::iterator childIter() { return ChildIter; }
+ DomTreeNode::const_iterator childIter() { return ChildIter; }
DomTreeNode *nextChild() {
DomTreeNode *child = *ChildIter;
@@ -584,7 +643,7 @@ private:
return child;
}
- DomTreeNode::iterator end() { return EndIter; }
+ DomTreeNode::const_iterator end() { return EndIter; }
bool isProcessed() { return Processed; }
void process() { Processed = true; }
@@ -592,8 +651,8 @@ private:
unsigned CurrentGeneration;
unsigned ChildGeneration;
DomTreeNode *Node;
- DomTreeNode::iterator ChildIter;
- DomTreeNode::iterator EndIter;
+ DomTreeNode::const_iterator ChildIter;
+ DomTreeNode::const_iterator EndIter;
NodeScope Scopes;
bool Processed = false;
};
@@ -716,7 +775,7 @@ private:
bool isSameMemGeneration(unsigned EarlierGeneration, unsigned LaterGeneration,
Instruction *EarlierInst, Instruction *LaterInst);
- void removeMSSA(Instruction *Inst) {
+ void removeMSSA(Instruction &Inst) {
if (!MSSA)
return;
if (VerifyMemorySSA)
@@ -727,7 +786,7 @@ private:
// is handled by MemorySSA when passing OptimizePhis = true to
// removeMemoryAccess. The non-optimized MemoryUse case is lazily updated
// by MemorySSA's getClobberingMemoryAccess.
- MSSAUpdater->removeMemoryAccess(Inst, true);
+ MSSAUpdater->removeMemoryAccess(&Inst, true);
}
};
@@ -897,20 +956,19 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// See if any instructions in the block can be eliminated. If so, do it. If
// not, add them to AvailableValues.
- for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E;) {
- Instruction *Inst = &*I++;
-
+ for (Instruction &Inst : make_early_inc_range(BB->getInstList())) {
// Dead instructions should just be removed.
- if (isInstructionTriviallyDead(Inst, &TLI)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << *Inst << '\n');
+ if (isInstructionTriviallyDead(&Inst, &TLI)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE DCE: " << Inst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
- salvageDebugInfoOrMarkUndef(*Inst);
+ salvageKnowledge(&Inst, &AC);
+ salvageDebugInfo(Inst);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
++NumSimplify;
continue;
@@ -920,21 +978,21 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// they're marked as such to ensure preservation of control dependencies),
// and this pass will not bother with its removal. However, we should mark
// its condition as true for all dominated blocks.
- if (match(Inst, m_Intrinsic<Intrinsic::assume>())) {
+ if (match(&Inst, m_Intrinsic<Intrinsic::assume>())) {
auto *CondI =
- dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0));
+ dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0));
if (CondI && SimpleValue::canHandle(CondI)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << *Inst
+ LLVM_DEBUG(dbgs() << "EarlyCSE considering assumption: " << Inst
<< '\n');
AvailableValues.insert(CondI, ConstantInt::getTrue(BB->getContext()));
} else
- LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping assumption: " << Inst << '\n');
continue;
}
// Skip sideeffect intrinsics, for the same reason as assume intrinsics.
- if (match(Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
- LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << *Inst << '\n');
+ if (match(&Inst, m_Intrinsic<Intrinsic::sideeffect>())) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE skipping sideeffect: " << Inst << '\n');
continue;
}
@@ -951,21 +1009,21 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// store 40, i8* p
// We can DSE the store to 30, since the store 40 to invariant location p
// causes undefined behaviour.
- if (match(Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
+ if (match(&Inst, m_Intrinsic<Intrinsic::invariant_start>())) {
// If there are any uses, the scope might end.
- if (!Inst->use_empty())
+ if (!Inst.use_empty())
continue;
- auto *CI = cast<CallInst>(Inst);
- MemoryLocation MemLoc = MemoryLocation::getForArgument(CI, 1, TLI);
+ MemoryLocation MemLoc =
+ MemoryLocation::getForArgument(&cast<CallInst>(Inst), 1, TLI);
// Don't start a scope if we already have a better one pushed
if (!AvailableInvariants.count(MemLoc))
AvailableInvariants.insert(MemLoc, CurrentGeneration);
continue;
}
- if (isGuard(Inst)) {
+ if (isGuard(&Inst)) {
if (auto *CondI =
- dyn_cast<Instruction>(cast<CallInst>(Inst)->getArgOperand(0))) {
+ dyn_cast<Instruction>(cast<CallInst>(Inst).getArgOperand(0))) {
if (SimpleValue::canHandle(CondI)) {
// Do we already know the actual value of this condition?
if (auto *KnownCond = AvailableValues.lookup(CondI)) {
@@ -973,14 +1031,15 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (isa<ConstantInt>(KnownCond) &&
cast<ConstantInt>(KnownCond)->isOne()) {
LLVM_DEBUG(dbgs()
- << "EarlyCSE removing guard: " << *Inst << '\n');
+ << "EarlyCSE removing guard: " << Inst << '\n');
+ salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
continue;
} else
// Use the known value if it wasn't true.
- cast<CallInst>(Inst)->setArgOperand(0, KnownCond);
+ cast<CallInst>(Inst).setArgOperand(0, KnownCond);
}
// The condition we're on guarding here is true for all dominated
// locations.
@@ -997,20 +1056,21 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// If the instruction can be simplified (e.g. X+0 = X) then replace it with
// its simpler value.
- if (Value *V = SimplifyInstruction(Inst, SQ)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << *Inst << " to: " << *V
+ if (Value *V = SimplifyInstruction(&Inst, SQ)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE Simplify: " << Inst << " to: " << *V
<< '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
} else {
bool Killed = false;
- if (!Inst->use_empty()) {
- Inst->replaceAllUsesWith(V);
+ if (!Inst.use_empty()) {
+ Inst.replaceAllUsesWith(V);
Changed = true;
}
- if (isInstructionTriviallyDead(Inst, &TLI)) {
+ if (isInstructionTriviallyDead(&Inst, &TLI)) {
+ salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
Killed = true;
}
@@ -1022,31 +1082,32 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
}
// If this is a simple instruction that we can value number, process it.
- if (SimpleValue::canHandle(Inst)) {
+ if (SimpleValue::canHandle(&Inst)) {
// See if the instruction has an available value. If so, use it.
- if (Value *V = AvailableValues.lookup(Inst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << *Inst << " to: " << *V
+ if (Value *V = AvailableValues.lookup(&Inst)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE: " << Inst << " to: " << *V
<< '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
if (auto *I = dyn_cast<Instruction>(V))
- I->andIRFlags(Inst);
- Inst->replaceAllUsesWith(V);
+ I->andIRFlags(&Inst);
+ Inst.replaceAllUsesWith(V);
+ salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
++NumCSE;
continue;
}
// Otherwise, just remember that this value is available.
- AvailableValues.insert(Inst, Inst);
+ AvailableValues.insert(&Inst, &Inst);
continue;
}
- ParseMemoryInst MemInst(Inst, TTI);
+ ParseMemoryInst MemInst(&Inst, TTI);
// If this is a non-volatile load, process it.
if (MemInst.isValid() && MemInst.isLoad()) {
// (conservatively) we can't peak past the ordering implied by this
@@ -1062,7 +1123,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// We conservatively treat the invariant_load as that moment. If we
// pass a invariant load after already establishing a scope, don't
// restart it since we want to preserve the earliest point seen.
- auto MemLoc = MemoryLocation::get(Inst);
+ auto MemLoc = MemoryLocation::get(&Inst);
if (!AvailableInvariants.count(MemLoc))
AvailableInvariants.insert(MemLoc, CurrentGeneration);
}
@@ -1081,21 +1142,22 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
!MemInst.isVolatile() && MemInst.isUnordered() &&
// We can't replace an atomic load with one which isn't also atomic.
InVal.IsAtomic >= MemInst.isAtomic() &&
- (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ (isOperatingOnInvariantMemAt(&Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst))) {
- Value *Op = getOrCreateResult(InVal.DefInst, Inst->getType());
+ InVal.DefInst, &Inst))) {
+ Value *Op = getOrCreateResult(InVal.DefInst, Inst.getType());
if (Op != nullptr) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << *Inst
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE LOAD: " << Inst
<< " to: " << *InVal.DefInst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
- if (!Inst->use_empty())
- Inst->replaceAllUsesWith(Op);
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(Op);
+ salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
++NumCSELoad;
continue;
@@ -1103,10 +1165,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
}
// Otherwise, remember that we have this instruction.
- AvailableLoads.insert(
- MemInst.getPointerOperand(),
- LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic()));
+ AvailableLoads.insert(MemInst.getPointerOperand(),
+ LoadValue(&Inst, CurrentGeneration,
+ MemInst.getMatchingId(),
+ MemInst.isAtomic()));
LastStore = nullptr;
continue;
}
@@ -1117,36 +1179,36 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// may override this (e.g. so that a store intrinsic does not read from
// memory, and thus will be treated the same as a regular store for
// commoning purposes).
- if ((Inst->mayReadFromMemory() || Inst->mayThrow()) &&
+ if ((Inst.mayReadFromMemory() || Inst.mayThrow()) &&
!(MemInst.isValid() && !MemInst.mayReadFromMemory()))
LastStore = nullptr;
// If this is a read-only call, process it.
- if (CallValue::canHandle(Inst)) {
+ if (CallValue::canHandle(&Inst)) {
// If we have an available version of this call, and if it is the right
// generation, replace this instruction.
- std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(Inst);
+ std::pair<Instruction *, unsigned> InVal = AvailableCalls.lookup(&Inst);
if (InVal.first != nullptr &&
isSameMemGeneration(InVal.second, CurrentGeneration, InVal.first,
- Inst)) {
- LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << *Inst
+ &Inst)) {
+ LLVM_DEBUG(dbgs() << "EarlyCSE CSE CALL: " << Inst
<< " to: " << *InVal.first << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
- if (!Inst->use_empty())
- Inst->replaceAllUsesWith(InVal.first);
+ if (!Inst.use_empty())
+ Inst.replaceAllUsesWith(InVal.first);
+ salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
++NumCSECall;
continue;
}
// Otherwise, remember that we have this instruction.
- AvailableCalls.insert(
- Inst, std::pair<Instruction *, unsigned>(Inst, CurrentGeneration));
+ AvailableCalls.insert(&Inst, std::make_pair(&Inst, CurrentGeneration));
continue;
}
@@ -1155,9 +1217,9 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// result, we don't need to consider it as writing to memory and don't need
// to advance the generation. We do need to prevent DSE across the fence,
// but that's handled above.
- if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ if (auto *FI = dyn_cast<FenceInst>(&Inst))
if (FI->getOrdering() == AtomicOrdering::Release) {
- assert(Inst->mayReadFromMemory() && "relied on to prevent DSE above");
+ assert(Inst.mayReadFromMemory() && "relied on to prevent DSE above");
continue;
}
@@ -1169,13 +1231,13 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
if (MemInst.isValid() && MemInst.isStore()) {
LoadValue InVal = AvailableLoads.lookup(MemInst.getPointerOperand());
if (InVal.DefInst &&
- InVal.DefInst == getOrCreateResult(Inst, InVal.DefInst->getType()) &&
+ InVal.DefInst == getOrCreateResult(&Inst, InVal.DefInst->getType()) &&
InVal.MatchingId == MemInst.getMatchingId() &&
// We don't yet handle removing stores with ordering of any kind.
!MemInst.isVolatile() && MemInst.isUnordered() &&
- (isOperatingOnInvariantMemAt(Inst, InVal.Generation) ||
+ (isOperatingOnInvariantMemAt(&Inst, InVal.Generation) ||
isSameMemGeneration(InVal.Generation, CurrentGeneration,
- InVal.DefInst, Inst))) {
+ InVal.DefInst, &Inst))) {
// It is okay to have a LastStore to a different pointer here if MemorySSA
// tells us that the load and store are from the same memory generation.
// In that case, LastStore should keep its present value since we're
@@ -1185,13 +1247,14 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
MemInst.getPointerOperand() ||
MSSA) &&
"can't have an intervening store if not using MemorySSA!");
- LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << *Inst << '\n');
+ LLVM_DEBUG(dbgs() << "EarlyCSE DSE (writeback): " << Inst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
continue;
}
+ salvageKnowledge(&Inst, &AC);
removeMSSA(Inst);
- Inst->eraseFromParent();
+ Inst.eraseFromParent();
Changed = true;
++NumDSE;
// We can avoid incrementing the generation count since we were able
@@ -1203,7 +1266,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// Okay, this isn't something we can CSE at all. Check to see if it is
// something that could modify memory. If so, our available memory values
// cannot be used so bump the generation count.
- if (Inst->mayWriteToMemory()) {
+ if (Inst.mayWriteToMemory()) {
++CurrentGeneration;
if (MemInst.isValid() && MemInst.isStore()) {
@@ -1221,11 +1284,12 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
"Violated invariant");
if (LastStoreMemInst.isMatchingMemLoc(MemInst)) {
LLVM_DEBUG(dbgs() << "EarlyCSE DEAD STORE: " << *LastStore
- << " due to: " << *Inst << '\n');
+ << " due to: " << Inst << '\n');
if (!DebugCounter::shouldExecute(CSECounter)) {
LLVM_DEBUG(dbgs() << "Skipping due to debug counter\n");
} else {
- removeMSSA(LastStore);
+ salvageKnowledge(&Inst, &AC);
+ removeMSSA(*LastStore);
LastStore->eraseFromParent();
Changed = true;
++NumDSE;
@@ -1240,10 +1304,10 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// version of the pointer. It is safe to forward from volatile stores
// to non-volatile loads, so we don't have to check for volatility of
// the store.
- AvailableLoads.insert(
- MemInst.getPointerOperand(),
- LoadValue(Inst, CurrentGeneration, MemInst.getMatchingId(),
- MemInst.isAtomic()));
+ AvailableLoads.insert(MemInst.getPointerOperand(),
+ LoadValue(&Inst, CurrentGeneration,
+ MemInst.getMatchingId(),
+ MemInst.isAtomic()));
// Remember that this was the last unordered store we saw for DSE. We
// don't yet handle DSE on ordered or volatile stores since we don't
@@ -1253,7 +1317,7 @@ bool EarlyCSE::processNode(DomTreeNode *Node) {
// it's not clear this is a profitable transform. Another option would
// be to merge the ordering with that of the post dominating store.
if (MemInst.isUnordered() && !MemInst.isVolatile())
- LastStore = Inst;
+ LastStore = &Inst;
else
LastStore = nullptr;
}
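
Most of the remaining EarlyCSE churn is mechanical: the block walk switches to make_early_inc_range so the current instruction can be erased safely, pointer syntax becomes reference syntax, and every deletion is preceded by salvageKnowledge/salvageDebugInfo so assume-bundle and debug-value information survives the erase. The iteration idiom on its own, as a hedged sketch (header locations are as of this revision):

#include "llvm/ADT/STLExtras.h" // make_early_inc_range
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/Local.h" // isInstructionTriviallyDead, salvageDebugInfo

using namespace llvm;

// Erase trivially dead instructions while walking the block. The early-inc
// range advances its iterator before the loop body runs, so erasing Inst does
// not invalidate the traversal.
static bool dropDeadInstructions(BasicBlock &BB, const TargetLibraryInfo &TLI) {
  bool Changed = false;
  for (Instruction &Inst : make_early_inc_range(BB.getInstList())) {
    if (!isInstructionTriviallyDead(&Inst, &TLI))
      continue;
    salvageDebugInfo(Inst); // keep debug values meaningful past the erase
    Inst.eraseFromParent();
    Changed = true;
  }
  return Changed;
}
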
diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp
index af223cc837f2..83f4c402ed4d 100644
--- a/llvm/lib/Transforms/Scalar/Float2Int.cpp
+++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp
@@ -120,8 +120,7 @@ static Instruction::BinaryOps mapBinOpcode(unsigned Opcode) {
// Find the roots - instructions that convert from the FP domain to
// integer domain.
-void Float2IntPass::findRoots(Function &F, const DominatorTree &DT,
- SmallPtrSet<Instruction*,8> &Roots) {
+void Float2IntPass::findRoots(Function &F, const DominatorTree &DT) {
for (BasicBlock &BB : F) {
// Unreachable code can take on strange forms that we are not prepared to
// handle. For example, an instruction may have itself as an operand.
@@ -184,7 +183,7 @@ ConstantRange Float2IntPass::validateRange(ConstantRange R) {
// Breadth-first walk of the use-def graph; determine the set of nodes
// we care about and eagerly determine if some of them are poisonous.
-void Float2IntPass::walkBackwards(const SmallPtrSetImpl<Instruction*> &Roots) {
+void Float2IntPass::walkBackwards() {
std::deque<Instruction*> Worklist(Roots.begin(), Roots.end());
while (!Worklist.empty()) {
Instruction *I = Worklist.back();
@@ -327,7 +326,7 @@ void Float2IntPass::walkForwards() {
APFloat NewF = F;
auto Res = NewF.roundToIntegral(APFloat::rmNearestTiesToEven);
- if (Res != APFloat::opOK || NewF.compare(F) != APFloat::cmpEqual) {
+ if (Res != APFloat::opOK || NewF != F) {
seen(I, badRange());
Abort = true;
break;
@@ -525,9 +524,9 @@ bool Float2IntPass::runImpl(Function &F, const DominatorTree &DT) {
Ctx = &F.getParent()->getContext();
- findRoots(F, DT, Roots);
+ findRoots(F, DT);
- walkBackwards(Roots);
+ walkBackwards();
walkForwards();
bool Modified = validateAndTransform();
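
Besides folding Roots into the pass object, the functional tweak in Float2Int is the exactness check: rounding a constant to integral and comparing it with the original now uses APFloat's ordinary inequality operator instead of compare(). The check in isolation, as a small sketch:

#include "llvm/ADT/APFloat.h"

using namespace llvm;

// True if F already holds an exact integer value, so moving it to the integer
// domain loses nothing.
static bool isExactInteger(const APFloat &F) {
  APFloat Rounded = F;
  APFloat::opStatus Res =
      Rounded.roundToIntegral(APFloat::rmNearestTiesToEven);
  return Res == APFloat::opOK && Rounded == F;
}

An inexact constant fails both ways: roundToIntegral reports a non-opOK status, and the rounded value no longer compares equal; the pass then assigns it the bad range and aborts the walk, as above.
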
diff --git a/llvm/lib/Transforms/Scalar/GVN.cpp b/llvm/lib/Transforms/Scalar/GVN.cpp
index 1e6aab14e7b4..b16f8591b5a4 100644
--- a/llvm/lib/Transforms/Scalar/GVN.cpp
+++ b/llvm/lib/Transforms/Scalar/GVN.cpp
@@ -26,6 +26,7 @@
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/CFG.h"
@@ -42,7 +43,6 @@
#include "llvm/Config/llvm-config.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -72,6 +72,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
@@ -97,10 +98,11 @@ STATISTIC(NumGVNSimpl, "Number of instructions simplified");
STATISTIC(NumGVNEqProp, "Number of equalities propagated");
STATISTIC(NumPRELoad, "Number of loads PRE'd");
-static cl::opt<bool> EnablePRE("enable-pre",
- cl::init(true), cl::Hidden);
-static cl::opt<bool> EnableLoadPRE("enable-load-pre", cl::init(true));
-static cl::opt<bool> EnableMemDep("enable-gvn-memdep", cl::init(true));
+static cl::opt<bool> GVNEnablePRE("enable-pre", cl::init(true), cl::Hidden);
+static cl::opt<bool> GVNEnableLoadPRE("enable-load-pre", cl::init(true));
+static cl::opt<bool> GVNEnableLoadInLoopPRE("enable-load-in-loop-pre",
+ cl::init(true));
+static cl::opt<bool> GVNEnableMemDep("enable-gvn-memdep", cl::init(true));
// Maximum allowed recursion depth.
static cl::opt<uint32_t>
@@ -113,8 +115,8 @@ static cl::opt<uint32_t> MaxNumDeps(
struct llvm::GVN::Expression {
uint32_t opcode;
- Type *type = nullptr;
bool commutative = false;
+ Type *type = nullptr;
SmallVector<uint32_t, 4> varargs;
Expression(uint32_t o = ~2U) : opcode(o) {}
@@ -288,7 +290,7 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
e.commutative = true;
}
- if (CmpInst *C = dyn_cast<CmpInst>(I)) {
+ if (auto *C = dyn_cast<CmpInst>(I)) {
// Sort the operand value numbers so x<y and y>x get the same value number.
CmpInst::Predicate Predicate = C->getPredicate();
if (e.varargs[0] > e.varargs[1]) {
@@ -297,10 +299,11 @@ GVN::Expression GVN::ValueTable::createExpr(Instruction *I) {
}
e.opcode = (C->getOpcode() << 8) | Predicate;
e.commutative = true;
- } else if (InsertValueInst *E = dyn_cast<InsertValueInst>(I)) {
- for (InsertValueInst::idx_iterator II = E->idx_begin(), IE = E->idx_end();
- II != IE; ++II)
- e.varargs.push_back(*II);
+ } else if (auto *E = dyn_cast<InsertValueInst>(I)) {
+ e.varargs.append(E->idx_begin(), E->idx_end());
+ } else if (auto *SVI = dyn_cast<ShuffleVectorInst>(I)) {
+ ArrayRef<int> ShuffleMask = SVI->getShuffleMask();
+ e.varargs.append(ShuffleMask.begin(), ShuffleMask.end());
}
return e;
@@ -530,6 +533,7 @@ uint32_t GVN::ValueTable::lookupOrAdd(Value *V) {
case Instruction::AddrSpaceCast:
case Instruction::BitCast:
case Instruction::Select:
+ case Instruction::Freeze:
case Instruction::ExtractElement:
case Instruction::InsertElement:
case Instruction::ShuffleVector:
@@ -610,6 +614,22 @@ void GVN::ValueTable::verifyRemoved(const Value *V) const {
// GVN Pass
//===----------------------------------------------------------------------===//
+bool GVN::isPREEnabled() const {
+ return Options.AllowPRE.getValueOr(GVNEnablePRE);
+}
+
+bool GVN::isLoadPREEnabled() const {
+ return Options.AllowLoadPRE.getValueOr(GVNEnableLoadPRE);
+}
+
+bool GVN::isLoadInLoopPREEnabled() const {
+ return Options.AllowLoadInLoopPRE.getValueOr(GVNEnableLoadInLoopPRE);
+}
+
+bool GVN::isMemDepEnabled() const {
+ return Options.AllowMemDep.getValueOr(GVNEnableMemDep);
+}
+
PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
// FIXME: The order of evaluation of these 'getResult' calls is very
// significant! Re-ordering these variables will cause GVN when run alone to
@@ -619,10 +639,11 @@ PreservedAnalyses GVN::run(Function &F, FunctionAnalysisManager &AM) {
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
- auto &MemDep = AM.getResult<MemoryDependenceAnalysis>(F);
+ auto *MemDep =
+ isMemDepEnabled() ? &AM.getResult<MemoryDependenceAnalysis>(F) : nullptr;
auto *LI = AM.getCachedResult<LoopAnalysis>(F);
auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
- bool Changed = runImpl(F, AC, DT, TLI, AA, &MemDep, LI, &ORE);
+ bool Changed = runImpl(F, AC, DT, TLI, AA, MemDep, LI, &ORE);
if (!Changed)
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -927,6 +948,7 @@ bool GVN::AnalyzeLoadAvailability(LoadInst *LI, MemDepResult DepInfo,
// Loading the allocation -> undef.
if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ isAlignedAllocLikeFn(DepInst, TLI) ||
// Loading immediately after lifetime begin -> undef.
isLifetimeStart(DepInst)) {
Res = AvailableValue::get(UndefValue::get(LI->getType()));
@@ -1245,7 +1267,7 @@ bool GVN::PerformLoadPRE(LoadInst *LI, AvailValInBlkVect &ValuesPerBlock,
auto *NewLoad = new LoadInst(
LI->getType(), LoadPtr, LI->getName() + ".pre", LI->isVolatile(),
- MaybeAlign(LI->getAlignment()), LI->getOrdering(), LI->getSyncScopeID(),
+ LI->getAlign(), LI->getOrdering(), LI->getSyncScopeID(),
UnavailablePred->getTerminator());
NewLoad->setDebugLoc(LI->getDebugLoc());
@@ -1383,7 +1405,10 @@ bool GVN::processNonLocalLoad(LoadInst *LI) {
}
// Step 4: Eliminate partial redundancy.
- if (!EnablePRE || !EnableLoadPRE)
+ if (!isPREEnabled() || !isLoadPREEnabled())
+ return false;
+ if (!isLoadInLoopPREEnabled() && this->LI &&
+ this->LI->getLoopFor(LI->getParent()))
return false;
return PerformLoadPRE(LI, ValuesPerBlock, UnavailableBlocks);
@@ -1428,7 +1453,7 @@ static bool impliesEquivalanceIfFalse(CmpInst* Cmp) {
Value *LHS = Cmp->getOperand(0);
Value *RHS = Cmp->getOperand(1);
// If we can prove either side non-zero, then equality must imply
- // equivalence.
+ // equivalence.
// FIXME: We should do this optimization if 'no signed zeros' is
// applicable via an instruction-level fast-math-flag or some other
// indicator that relaxed FP semantics are being used.
@@ -1465,7 +1490,8 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
Constant::getNullValue(Int8Ty->getPointerTo()),
IntrinsicI);
}
- markInstructionForDeletion(IntrinsicI);
+ if (isAssumeWithEmptyBundle(*IntrinsicI))
+ markInstructionForDeletion(IntrinsicI);
return false;
} else if (isa<Constant>(V)) {
// If it's not false, and constant, it must evaluate to true. This means our
@@ -1493,10 +1519,10 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
// If we find an equality fact, canonicalize all dominated uses in this block
// to one of the two values. We heuristically choice the "oldest" of the
// two where age is determined by value number. (Note that propagateEquality
- // above handles the cross block case.)
- //
+ // above handles the cross block case.)
+ //
// Key case to cover are:
- // 1)
+ // 1)
// %cmp = fcmp oeq float 3.000000e+00, %0 ; const on lhs could happen
// call void @llvm.assume(i1 %cmp)
// ret float %0 ; will change it to ret float 3.000000e+00
@@ -1537,7 +1563,7 @@ bool GVN::processAssumeIntrinsic(IntrinsicInst *IntrinsicI) {
<< *CmpLHS << " with "
<< *CmpRHS << " in block "
<< IntrinsicI->getParent()->getName() << "\n");
-
+
// Setup the replacement map - this handles uses within the same block
if (hasUsersIn(CmpLHS, IntrinsicI->getParent()))
@@ -1710,7 +1736,8 @@ uint32_t GVN::ValueTable::phiTranslateImpl(const BasicBlock *Pred,
// instead of value numbers. Those index numbers should not be
// translated.
if ((i > 1 && Exp.opcode == Instruction::InsertValue) ||
- (i > 0 && Exp.opcode == Instruction::ExtractValue))
+ (i > 0 && Exp.opcode == Instruction::ExtractValue) ||
+ (i > 1 && Exp.opcode == Instruction::ShuffleVector))
continue;
Exp.varargs[i] = phiTranslate(Pred, PhiBlock, Exp.varargs[i], Gvn);
}
@@ -1802,7 +1829,7 @@ void GVN::assignBlockRPONumber(Function &F) {
bool GVN::replaceOperandsForInBlockEquality(Instruction *Instr) const {
bool Changed = false;
for (unsigned OpNum = 0; OpNum < Instr->getNumOperands(); ++OpNum) {
- Value *Operand = Instr->getOperand(OpNum);
+ Value *Operand = Instr->getOperand(OpNum);
auto it = ReplaceOperandsWithMap.find(Operand);
if (it != ReplaceOperandsWithMap.end()) {
LLVM_DEBUG(dbgs() << "GVN replacing: " << *Operand << " with "
@@ -1922,7 +1949,7 @@ bool GVN::propagateEquality(Value *LHS, Value *RHS, const BasicBlockEdge &Root,
// If "A == B" is known true, or "A != B" is known false, then replace
// A with B everywhere in the scope. For floating point operations, we
- // have to be careful since equality does not always imply equivalance.
+ // have to be careful since equality does not always imply equivalance.
if ((isKnownTrue && impliesEquivalanceIfTrue(Cmp)) ||
(isKnownFalse && impliesEquivalanceIfFalse(Cmp)))
Worklist.push_back(std::make_pair(Op0, Op1));
@@ -2117,7 +2144,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
TLI = &RunTLI;
VN.setAliasAnalysis(&RunAA);
MD = RunMD;
- ImplicitControlFlowTracking ImplicitCFT(DT);
+ ImplicitControlFlowTracking ImplicitCFT;
ICF = &ImplicitCFT;
this->LI = LI;
VN.setMemDep(MD);
@@ -2148,7 +2175,7 @@ bool GVN::runImpl(Function &F, AssumptionCache &RunAC, DominatorTree &RunDT,
++Iteration;
}
- if (EnablePRE) {
+ if (isPREEnabled()) {
// Fabricate val-num for dead-code in order to suppress assertion in
// performPRE().
assignValNumForDeadCode();
@@ -2206,6 +2233,7 @@ bool GVN::processBlock(BasicBlock *BB) {
for (auto *I : InstrsToErase) {
assert(I->getParent() == BB && "Removing instruction from wrong block?");
LLVM_DEBUG(dbgs() << "GVN removed: " << *I << '\n');
+ salvageKnowledge(I, AC);
salvageDebugInfo(*I);
if (MD) MD->removeInstruction(I);
LLVM_DEBUG(verifyRemoved(I));
@@ -2478,8 +2506,11 @@ bool GVN::performPRE(Function &F) {
/// Split the critical edge connecting the given two blocks, and return
/// the block inserted to the critical edge.
BasicBlock *GVN::splitCriticalEdges(BasicBlock *Pred, BasicBlock *Succ) {
- BasicBlock *BB =
- SplitCriticalEdge(Pred, Succ, CriticalEdgeSplittingOptions(DT, LI));
+  // GVN does not require loop-simplify, so do not try to preserve it if
+  // that is not possible.
+ BasicBlock *BB = SplitCriticalEdge(
+ Pred, Succ,
+ CriticalEdgeSplittingOptions(DT, LI).unsetPreserveLoopSimplify());
if (MD)
MD->invalidateCachedPredecessors();
InvalidBlockRPONumbers = true;
@@ -2682,8 +2713,8 @@ class llvm::gvn::GVNLegacyPass : public FunctionPass {
public:
static char ID; // Pass identification, replacement for typeid
- explicit GVNLegacyPass(bool NoMemDepAnalysis = !EnableMemDep)
- : FunctionPass(ID), NoMemDepAnalysis(NoMemDepAnalysis) {
+ explicit GVNLegacyPass(bool NoMemDepAnalysis = !GVNEnableMemDep)
+ : FunctionPass(ID), Impl(GVNOptions().setMemDep(!NoMemDepAnalysis)) {
initializeGVNLegacyPassPass(*PassRegistry::getPassRegistry());
}
@@ -2698,9 +2729,9 @@ public:
getAnalysis<DominatorTreeWrapperPass>().getDomTree(),
getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F),
getAnalysis<AAResultsWrapperPass>().getAAResults(),
- NoMemDepAnalysis
- ? nullptr
- : &getAnalysis<MemoryDependenceWrapperPass>().getMemDep(),
+ Impl.isMemDepEnabled()
+ ? &getAnalysis<MemoryDependenceWrapperPass>().getMemDep()
+ : nullptr,
LIWP ? &LIWP->getLoopInfo() : nullptr,
&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE());
}
@@ -2710,7 +2741,7 @@ public:
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
- if (!NoMemDepAnalysis)
+ if (Impl.isMemDepEnabled())
AU.addRequired<MemoryDependenceWrapperPass>();
AU.addRequired<AAResultsWrapperPass>();
@@ -2718,12 +2749,10 @@ public:
AU.addPreserved<GlobalsAAWrapperPass>();
AU.addPreserved<TargetLibraryInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
- AU.addPreservedID(LoopSimplifyID);
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
}
private:
- bool NoMemDepAnalysis;
GVN Impl;
};
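
The GVN refactor threads per-instance options through the pass: each existing cl::opt keeps its role as the global default, while a new Optional field on GVNOptions can override it for one pass instance (the legacy wrapper now builds its Impl from GVNOptions().setMemDep(...)). A sketch of that override-with-fallback pattern, with illustrative names rather than the real GVN ones:

#include "llvm/ADT/Optional.h"
#include "llvm/Support/CommandLine.h"

using namespace llvm;

// Global default, settable on the command line.
static cl::opt<bool> ExampleEnablePRE("example-enable-pre", cl::init(true));

struct ExampleOptions {
  Optional<bool> AllowPRE; // unset means "defer to the cl::opt"
  ExampleOptions &setPRE(bool V) {
    AllowPRE = V;
    return *this;
  }
};

struct ExamplePass {
  ExampleOptions Options;
  explicit ExamplePass(ExampleOptions O = {}) : Options(O) {}
  // A per-instance override wins; otherwise the command-line default applies.
  bool isPREEnabled() const {
    return Options.AllowPRE.getValueOr(ExampleEnablePRE);
  }
};

A pipeline that wants, say, GVN without memory dependence analysis can then construct the pass from an options object instead of flipping the global -enable-gvn-memdep flag for the whole process.
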
diff --git a/llvm/lib/Transforms/Scalar/GVNHoist.cpp b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
index e1796f6bf05a..9c4cdf2feb56 100644
--- a/llvm/lib/Transforms/Scalar/GVNHoist.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNHoist.cpp
@@ -890,18 +890,16 @@ private:
void updateAlignment(Instruction *I, Instruction *Repl) {
if (auto *ReplacementLoad = dyn_cast<LoadInst>(Repl)) {
- ReplacementLoad->setAlignment(MaybeAlign(std::min(
- ReplacementLoad->getAlignment(), cast<LoadInst>(I)->getAlignment())));
+ ReplacementLoad->setAlignment(
+ std::min(ReplacementLoad->getAlign(), cast<LoadInst>(I)->getAlign()));
++NumLoadsRemoved;
} else if (auto *ReplacementStore = dyn_cast<StoreInst>(Repl)) {
- ReplacementStore->setAlignment(
- MaybeAlign(std::min(ReplacementStore->getAlignment(),
- cast<StoreInst>(I)->getAlignment())));
+ ReplacementStore->setAlignment(std::min(ReplacementStore->getAlign(),
+ cast<StoreInst>(I)->getAlign()));
++NumStoresRemoved;
} else if (auto *ReplacementAlloca = dyn_cast<AllocaInst>(Repl)) {
- ReplacementAlloca->setAlignment(
- MaybeAlign(std::max(ReplacementAlloca->getAlignment(),
- cast<AllocaInst>(I)->getAlignment())));
+ ReplacementAlloca->setAlignment(std::max(
+ ReplacementAlloca->getAlign(), cast<AllocaInst>(I)->getAlign()));
} else if (isa<CallInst>(Repl)) {
++NumCallsRemoved;
}
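
The GVNHoist hunk is part of the llvm::Align migration: the getAlignment()/MaybeAlign pairs become getAlign(), which returns a totally ordered Align, so merging alignments when one access replaces another reduces to std::min for loads and stores (the replacement must not overclaim) and std::max for allocas (the surviving object may honour the stricter request). A short sketch:

#include <algorithm>
#include "llvm/IR/Instructions.h"

using namespace llvm;

// The replacement load may only claim the weaker of the two guarantees.
static void mergeLoadAlignment(LoadInst *Repl, LoadInst *Old) {
  Repl->setAlignment(std::min(Repl->getAlign(), Old->getAlign()));
}

// The surviving alloca can instead honour the stricter of the two requests.
static void mergeAllocaAlignment(AllocaInst *Repl, AllocaInst *Old) {
  Repl->setAlignment(std::max(Repl->getAlign(), Old->getAlign()));
}
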
diff --git a/llvm/lib/Transforms/Scalar/GVNSink.cpp b/llvm/lib/Transforms/Scalar/GVNSink.cpp
index 6d0a4975e266..dfb4b7e038ba 100644
--- a/llvm/lib/Transforms/Scalar/GVNSink.cpp
+++ b/llvm/lib/Transforms/Scalar/GVNSink.cpp
@@ -350,6 +350,7 @@ using ModelledPHISet = DenseSet<ModelledPHI, DenseMapInfo<ModelledPHI>>;
class InstructionUseExpr : public GVNExpression::BasicExpression {
unsigned MemoryUseOrder = -1;
bool Volatile = false;
+ ArrayRef<int> ShuffleMask;
public:
InstructionUseExpr(Instruction *I, ArrayRecycler<Value *> &R,
@@ -359,6 +360,9 @@ public:
setOpcode(I->getOpcode());
setType(I->getType());
+ if (ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(I))
+ ShuffleMask = SVI->getShuffleMask().copy(A);
+
for (auto &U : I->uses())
op_push_back(U.getUser());
llvm::sort(op_begin(), op_end());
@@ -369,12 +373,12 @@ public:
hash_code getHashValue() const override {
return hash_combine(GVNExpression::BasicExpression::getHashValue(),
- MemoryUseOrder, Volatile);
+ MemoryUseOrder, Volatile, ShuffleMask);
}
template <typename Function> hash_code getHashValue(Function MapFn) {
- hash_code H =
- hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile);
+ hash_code H = hash_combine(getOpcode(), getType(), MemoryUseOrder, Volatile,
+ ShuffleMask);
for (auto *V : operands())
H = hash_combine(H, MapFn(V));
return H;
@@ -475,6 +479,7 @@ public:
case Instruction::PtrToInt:
case Instruction::IntToPtr:
case Instruction::BitCast:
+ case Instruction::AddrSpaceCast:
case Instruction::Select:
case Instruction::ExtractElement:
case Instruction::InsertElement:
@@ -576,7 +581,7 @@ public:
private:
ValueTable VN;
- bool isInstructionBlacklisted(Instruction *I) {
+ bool shouldAvoidSinkingInstruction(Instruction *I) {
// These instructions may change or break semantics if moved.
if (isa<PHINode>(I) || I->isEHPad() || isa<AllocaInst>(I) ||
I->getType()->isTokenTy())
@@ -668,7 +673,7 @@ Optional<SinkingInstructionCandidate> GVNSink::analyzeInstructionForSinking(
NewInsts.push_back(I);
}
for (auto *I : NewInsts)
- if (isInstructionBlacklisted(I))
+ if (shouldAvoidSinkingInstruction(I))
return None;
// If we've restricted the incoming blocks, restrict all needed PHIs also
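
GVNSink's expression model now records the shufflevector mask explicitly and mixes it into both hashing paths, so two shuffles only hash (and therefore sink) together when their masks agree. A minimal illustration of folding an ArrayRef<int> mask into a hash_code, relying on the hash_value overload that ArrayRef provides:

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/Hashing.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Shuffles with different masks must land in different buckets, so the mask
// participates in the hash alongside the opcode and type.
static hash_code hashShuffle(const ShuffleVectorInst &SVI) {
  ArrayRef<int> Mask = SVI.getShuffleMask();
  return hash_combine(SVI.getOpcode(), SVI.getType(), Mask);
}
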
diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
index d8d7acae5c9f..0f36c3f772e6 100644
--- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp
@@ -38,8 +38,9 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -81,6 +82,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include <cassert>
#include <cstdint>
@@ -100,10 +102,10 @@ STATISTIC(NumElimIV , "Number of congruent IVs eliminated");
// implement a strong expression equivalence checker in SCEV. Until then, we
// use the verify-indvars flag, which may assert in some cases.
static cl::opt<bool> VerifyIndvars(
- "verify-indvars", cl::Hidden,
- cl::desc("Verify the ScalarEvolution result after running indvars"));
-
-enum ReplaceExitVal { NeverRepl, OnlyCheapRepl, NoHardUse, AlwaysRepl };
+ "verify-indvars", cl::Hidden,
+ cl::desc("Verify the ScalarEvolution result after running indvars. Has no "
+ "effect in release builds. (Note: this adds additional SCEV "
+ "queries potentially changing the analysis result)"));
static cl::opt<ReplaceExitVal> ReplaceExitValue(
"replexitval", cl::Hidden, cl::init(OnlyCheapRepl),
@@ -140,11 +142,10 @@ class IndVarSimplify {
const DataLayout &DL;
TargetLibraryInfo *TLI;
const TargetTransformInfo *TTI;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
SmallVector<WeakTrackingVH, 16> DeadInsts;
- bool isValidRewrite(Value *FromVal, Value *ToVal);
-
bool handleFloatingPointIV(Loop *L, PHINode *PH);
bool rewriteNonIntegerIVs(Loop *L);
@@ -155,10 +156,7 @@ class IndVarSimplify {
/// iterations of the loop run when that is unobservable.
bool predicateLoopExits(Loop *L, SCEVExpander &Rewriter);
- bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet);
- bool rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter);
bool rewriteFirstIterationLoopExitValues(Loop *L);
- bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) const;
bool linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
const SCEV *ExitCount,
@@ -169,66 +167,17 @@ class IndVarSimplify {
public:
IndVarSimplify(LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
const DataLayout &DL, TargetLibraryInfo *TLI,
- TargetTransformInfo *TTI)
- : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {}
+ TargetTransformInfo *TTI, MemorySSA *MSSA)
+ : LI(LI), SE(SE), DT(DT), DL(DL), TLI(TLI), TTI(TTI) {
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
bool run(Loop *L);
};
} // end anonymous namespace
-/// Return true if the SCEV expansion generated by the rewriter can replace the
-/// original value. SCEV guarantees that it produces the same value, but the way
-/// it is produced may be illegal IR. Ideally, this function will only be
-/// called for verification.
-bool IndVarSimplify::isValidRewrite(Value *FromVal, Value *ToVal) {
- // If an SCEV expression subsumed multiple pointers, its expansion could
- // reassociate the GEP changing the base pointer. This is illegal because the
- // final address produced by a GEP chain must be inbounds relative to its
- // underlying object. Otherwise basic alias analysis, among other things,
- // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
- // producing an expression involving multiple pointers. Until then, we must
- // bail out here.
- //
- // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
- // because it understands lcssa phis while SCEV does not.
- Value *FromPtr = FromVal;
- Value *ToPtr = ToVal;
- if (auto *GEP = dyn_cast<GEPOperator>(FromVal)) {
- FromPtr = GEP->getPointerOperand();
- }
- if (auto *GEP = dyn_cast<GEPOperator>(ToVal)) {
- ToPtr = GEP->getPointerOperand();
- }
- if (FromPtr != FromVal || ToPtr != ToVal) {
- // Quickly check the common case
- if (FromPtr == ToPtr)
- return true;
-
- // SCEV may have rewritten an expression that produces the GEP's pointer
- // operand. That's ok as long as the pointer operand has the same base
- // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
- // base of a recurrence. This handles the case in which SCEV expansion
- // converts a pointer type recurrence into a nonrecurrent pointer base
- // indexed by an integer recurrence.
-
- // If the GEP base pointer is a vector of pointers, abort.
- if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
- return false;
-
- const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
- const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
- if (FromBase == ToBase)
- return true;
-
- LLVM_DEBUG(dbgs() << "INDVARS: GEP rewrite bail out " << *FromBase
- << " != " << *ToBase << "\n");
-
- return false;
- }
- return true;
-}
-
/// Determine the insertion point for this user. By default, insert immediately
/// before the user. SCEVExpander or LICM will hoist loop invariants out of the
/// loop. For PHI nodes, there may be multiple uses, so compute the nearest
@@ -477,11 +426,11 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
// new comparison.
NewCompare->takeName(Compare);
Compare->replaceAllUsesWith(NewCompare);
- RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(Compare, TLI, MSSAU.get());
// Delete the old floating point increment.
Incr->replaceAllUsesWith(UndefValue::get(Incr->getType()));
- RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(Incr, TLI, MSSAU.get());
// If the FP induction variable still has uses, this is because something else
// in the loop uses its value. In order to canonicalize the induction
@@ -494,7 +443,7 @@ bool IndVarSimplify::handleFloatingPointIV(Loop *L, PHINode *PN) {
Value *Conv = new SIToFPInst(NewPHI, PN->getType(), "indvar.conv",
&*PN->getParent()->getFirstInsertionPt());
PN->replaceAllUsesWith(Conv);
- RecursivelyDeleteTriviallyDeadInstructions(PN, TLI);
+ RecursivelyDeleteTriviallyDeadInstructions(PN, TLI, MSSAU.get());
}
return true;
}
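
IndVarSimplify now optionally carries a MemorySSAUpdater: when the caller hands in MemorySSA, every RecursivelyDeleteTriviallyDeadInstructions call forwards the updater so the MemorySSA form stays consistent with the deletions. The wiring in isolation, with illustrative names:

#include <memory>
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Transforms/Utils/Local.h" // RecursivelyDeleteTriviallyDeadInstructions

using namespace llvm;

struct ExampleLoopTransform {
  std::unique_ptr<MemorySSAUpdater> MSSAU;

  // MemorySSA may be absent; only build an updater when it is provided.
  explicit ExampleLoopTransform(MemorySSA *MSSA) {
    if (MSSA)
      MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
  }

  void dropDeadValue(Value *V, const TargetLibraryInfo *TLI) {
    // Passing MSSAU.get() (possibly null) lets the helper also delete the
    // corresponding MemoryAccesses when MemorySSA is being preserved.
    RecursivelyDeleteTriviallyDeadInstructions(V, TLI, MSSAU.get());
  }
};
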
@@ -522,222 +471,6 @@ bool IndVarSimplify::rewriteNonIntegerIVs(Loop *L) {
return Changed;
}
-namespace {
-
-// Collect information about PHI nodes which can be transformed in
-// rewriteLoopExitValues.
-struct RewritePhi {
- PHINode *PN;
-
- // Ith incoming value.
- unsigned Ith;
-
- // Exit value after expansion.
- Value *Val;
-
- // High Cost when expansion.
- bool HighCost;
-
- RewritePhi(PHINode *P, unsigned I, Value *V, bool H)
- : PN(P), Ith(I), Val(V), HighCost(H) {}
-};
-
-} // end anonymous namespace
-
-//===----------------------------------------------------------------------===//
-// rewriteLoopExitValues - Optimize IV users outside the loop.
-// As a side effect, reduces the amount of IV processing within the loop.
-//===----------------------------------------------------------------------===//
-
-bool IndVarSimplify::hasHardUserWithinLoop(const Loop *L, const Instruction *I) const {
- SmallPtrSet<const Instruction *, 8> Visited;
- SmallVector<const Instruction *, 8> WorkList;
- Visited.insert(I);
- WorkList.push_back(I);
- while (!WorkList.empty()) {
- const Instruction *Curr = WorkList.pop_back_val();
- // This use is outside the loop, nothing to do.
- if (!L->contains(Curr))
- continue;
- // Do we assume it is a "hard" use which will not be eliminated easily?
- if (Curr->mayHaveSideEffects())
- return true;
- // Otherwise, add all its users to worklist.
- for (auto U : Curr->users()) {
- auto *UI = cast<Instruction>(U);
- if (Visited.insert(UI).second)
- WorkList.push_back(UI);
- }
- }
- return false;
-}
-
-/// Check to see if this loop has a computable loop-invariant execution count.
-/// If so, this means that we can compute the final value of any expressions
-/// that are recurrent in the loop, and substitute the exit values from the loop
-/// into any instructions outside of the loop that use the final values of the
-/// current expressions.
-///
-/// This is mostly redundant with the regular IndVarSimplify activities that
-/// happen later, except that it's more powerful in some cases, because it's
-/// able to brute-force evaluate arbitrary instructions as long as they have
-/// constant operands at the beginning of the loop.
-bool IndVarSimplify::rewriteLoopExitValues(Loop *L, SCEVExpander &Rewriter) {
- // Check a pre-condition.
- assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
- "Indvars did not preserve LCSSA!");
-
- SmallVector<BasicBlock*, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
-
- SmallVector<RewritePhi, 8> RewritePhiSet;
- // Find all values that are computed inside the loop, but used outside of it.
- // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
- // the exit blocks of the loop to find them.
- for (BasicBlock *ExitBB : ExitBlocks) {
- // If there are no PHI nodes in this exit block, then no values defined
- // inside the loop are used on this path, skip it.
- PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
- if (!PN) continue;
-
- unsigned NumPreds = PN->getNumIncomingValues();
-
- // Iterate over all of the PHI nodes.
- BasicBlock::iterator BBI = ExitBB->begin();
- while ((PN = dyn_cast<PHINode>(BBI++))) {
- if (PN->use_empty())
- continue; // dead use, don't replace it
-
- if (!SE->isSCEVable(PN->getType()))
- continue;
-
- // It's necessary to tell ScalarEvolution about this explicitly so that
- // it can walk the def-use list and forget all SCEVs, as it may not be
- // watching the PHI itself. Once the new exit value is in place, there
- // may not be a def-use connection between the loop and every instruction
- // which got a SCEVAddRecExpr for that loop.
- SE->forgetValue(PN);
-
- // Iterate over all of the values in all the PHI nodes.
- for (unsigned i = 0; i != NumPreds; ++i) {
- // If the value being merged in is not integer or is not defined
- // in the loop, skip it.
- Value *InVal = PN->getIncomingValue(i);
- if (!isa<Instruction>(InVal))
- continue;
-
- // If this pred is for a subloop, not L itself, skip it.
- if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
- continue; // The Block is in a subloop, skip it.
-
- // Check that InVal is defined in the loop.
- Instruction *Inst = cast<Instruction>(InVal);
- if (!L->contains(Inst))
- continue;
-
- // Okay, this instruction has a user outside of the current loop
- // and varies predictably *inside* the loop. Evaluate the value it
- // contains when the loop exits, if possible. We prefer to start with
- // expressions which are true for all exits (so as to maximize
- // expression reuse by the SCEVExpander), but resort to per-exit
- // evaluation if that fails.
- const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
- if (isa<SCEVCouldNotCompute>(ExitValue) ||
- !SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE)) {
- // TODO: This should probably be sunk into SCEV in some way; maybe a
- // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
- // most SCEV expressions and other recurrence types (e.g. shift
- // recurrences). Is there existing code we can reuse?
- const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
- if (isa<SCEVCouldNotCompute>(ExitCount))
- continue;
- if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
- if (AddRec->getLoop() == L)
- ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
- if (isa<SCEVCouldNotCompute>(ExitValue) ||
- !SE->isLoopInvariant(ExitValue, L) ||
- !isSafeToExpand(ExitValue, *SE))
- continue;
- }
-
- // Computing the value outside of the loop brings no benefit if it is
- // definitely used inside the loop in a way which can not be optimized
- // away. Avoid doing so unless we know we have a value which computes
- // the ExitValue already. TODO: This should be merged into SCEV
- // expander to leverage its knowledge of existing expressions.
- if (ReplaceExitValue != AlwaysRepl &&
- !isa<SCEVConstant>(ExitValue) && !isa<SCEVUnknown>(ExitValue) &&
- hasHardUserWithinLoop(L, Inst))
- continue;
-
- bool HighCost = Rewriter.isHighCostExpansion(ExitValue, L, Inst);
- Value *ExitVal = Rewriter.expandCodeFor(ExitValue, PN->getType(), Inst);
-
- LLVM_DEBUG(dbgs() << "INDVARS: RLEV: AfterLoopVal = " << *ExitVal
- << '\n'
- << " LoopVal = " << *Inst << "\n");
-
- if (!isValidRewrite(Inst, ExitVal)) {
- DeadInsts.push_back(ExitVal);
- continue;
- }
-
-#ifndef NDEBUG
- // If we reuse an instruction from a loop which is neither L nor one of
- // its containing loops, we end up breaking LCSSA form for this loop by
- // creating a new use of its instruction.
- if (auto *ExitInsn = dyn_cast<Instruction>(ExitVal))
- if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
- if (EVL != L)
- assert(EVL->contains(L) && "LCSSA breach detected!");
-#endif
-
- // Collect all the candidate PHINodes to be rewritten.
- RewritePhiSet.emplace_back(PN, i, ExitVal, HighCost);
- }
- }
- }
-
- bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
-
- bool Changed = false;
- // Transformation.
- for (const RewritePhi &Phi : RewritePhiSet) {
- PHINode *PN = Phi.PN;
- Value *ExitVal = Phi.Val;
-
- // Only do the rewrite when the ExitValue can be expanded cheaply.
- // If LoopCanBeDel is true, rewrite exit value aggressively.
- if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
- DeadInsts.push_back(ExitVal);
- continue;
- }
-
- Changed = true;
- ++NumReplaced;
- Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
- PN->setIncomingValue(Phi.Ith, ExitVal);
-
- // If this instruction is dead now, delete it. Don't do it now to avoid
- // invalidating iterators.
- if (isInstructionTriviallyDead(Inst, TLI))
- DeadInsts.push_back(Inst);
-
- // Replace PN with ExitVal if that is legal and does not break LCSSA.
- if (PN->getNumIncomingValues() == 1 &&
- LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
- PN->replaceAllUsesWith(ExitVal);
- PN->eraseFromParent();
- }
- }
-
- // The insertion point instruction may have been deleted; clear it out
- // so that the rewriter doesn't trip over it later.
- Rewriter.clearInsertPoint();
- return Changed;
-}
-
//===---------------------------------------------------------------------===//
// rewriteFirstIterationLoopExitValues: Rewrite loop exit values when we know
// the loop will exit at the first iteration.
@@ -813,61 +546,6 @@ bool IndVarSimplify::rewriteFirstIterationLoopExitValues(Loop *L) {
return MadeAnyChanges;
}
-/// Check whether it is possible to delete the loop after rewriting exit
-/// value. If it is possible, ignore ReplaceExitValue and do rewriting
-/// aggressively.
-bool IndVarSimplify::canLoopBeDeleted(
- Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
- BasicBlock *Preheader = L->getLoopPreheader();
- // If there is no preheader, the loop will not be deleted.
- if (!Preheader)
- return false;
-
- // In LoopDeletion pass Loop can be deleted when ExitingBlocks.size() > 1.
- // We obviate multiple ExitingBlocks case for simplicity.
- // TODO: If we see testcase with multiple ExitingBlocks can be deleted
- // after exit value rewriting, we can enhance the logic here.
- SmallVector<BasicBlock *, 4> ExitingBlocks;
- L->getExitingBlocks(ExitingBlocks);
- SmallVector<BasicBlock *, 8> ExitBlocks;
- L->getUniqueExitBlocks(ExitBlocks);
- if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
- return false;
-
- BasicBlock *ExitBlock = ExitBlocks[0];
- BasicBlock::iterator BI = ExitBlock->begin();
- while (PHINode *P = dyn_cast<PHINode>(BI)) {
- Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
-
- // If the Incoming value of P is found in RewritePhiSet, we know it
- // could be rewritten to use a loop invariant value in transformation
- // phase later. Skip it in the loop invariant check below.
- bool found = false;
- for (const RewritePhi &Phi : RewritePhiSet) {
- unsigned i = Phi.Ith;
- if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
- found = true;
- break;
- }
- }
-
- Instruction *I;
- if (!found && (I = dyn_cast<Instruction>(Incoming)))
- if (!L->hasLoopInvariantOperands(I))
- return false;
-
- ++BI;
- }
-
- for (auto *BB : L->blocks())
- if (llvm::any_of(*BB, [](Instruction &I) {
- return I.mayHaveSideEffects();
- }))
- return false;
-
- return true;
-}
-
//===----------------------------------------------------------------------===//
// IV Widening - Extend the width of an IV to cover its widest uses.
//===----------------------------------------------------------------------===//
@@ -1060,8 +738,8 @@ protected:
Instruction *widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter);
bool widenLoopCompare(NarrowIVDefUse DU);
- bool widenWithVariantLoadUse(NarrowIVDefUse DU);
- void widenWithVariantLoadUseCodegen(NarrowIVDefUse DU);
+ bool widenWithVariantUse(NarrowIVDefUse DU);
+ void widenWithVariantUseCodegen(NarrowIVDefUse DU);
void pushNarrowIVUsers(Instruction *NarrowDef, Instruction *WideDef);
};
@@ -1399,20 +1077,27 @@ bool WidenIV::widenLoopCompare(NarrowIVDefUse DU) {
return true;
}
-/// If the narrow use is an instruction whose two operands are the defining
-/// instruction of DU and a load instruction, then we have the following:
-/// if the load is hoisted outside the loop, then we do not reach this function
-/// as scalar evolution analysis works fine in widenIVUse with variables
-/// hoisted outside the loop and efficient code is subsequently generated by
-/// not emitting truncate instructions. But when the load is not hoisted
-/// (whether due to limitation in alias analysis or due to a true legality),
-/// then scalar evolution can not proceed with loop variant values and
-/// inefficient code is generated. This function handles the non-hoisted load
-/// special case by making the optimization generate the same type of code for
-/// hoisted and non-hoisted load (widen use and eliminate sign extend
-/// instruction). This special case is important especially when the induction
-/// variables are affecting addressing mode in code generation.
-bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
+// widenIVUse avoids generating a trunc by evaluating the use as an AddRec;
+// this will not work when:
+// 1) SCEV traces back to an instruction inside the loop that SCEV can not
+// expand, e.g. add %indvar, (load %addr)
+// 2) SCEV finds a loop variant, e.g. add %indvar, %loopvariant
+// When SCEV fails to avoid the trunc, we can still try an instruction
+// combining approach to prove that the trunc is not required. This can be
+// further extended with other instruction combining checks, but for now we
+// handle the following case ("sub" can also be "add" or "mul"; "nsw + sext"
+// can be "nuw + zext"):
+//
+// Src:
+// %c = sub nsw %b, %indvar
+// %d = sext %c to i64
+// Dst:
+// %indvar.ext1 = sext %indvar to i64
+// %m = sext %b to i64
+// %d = sub nsw i64 %m, %indvar.ext1
+// Therefore, as long as the result of the add/sub/mul is extended to the wide
+// type, no trunc is required regardless of how %b is generated. This pattern
+// is common when calculating addresses on 64-bit architectures.
+bool WidenIV::widenWithVariantUse(NarrowIVDefUse DU) {
Instruction *NarrowUse = DU.NarrowUse;
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
@@ -1443,12 +1128,6 @@ bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
else
return false;
- // We are interested in the other operand being a load instruction.
- // But, we should look into relaxing this restriction later on.
- auto *I = dyn_cast<Instruction>(NarrowUse->getOperand(ExtendOperIdx));
- if (I && I->getOpcode() != Instruction::Load)
- return false;
-
// Verifying that Defining operand is an AddRec
const SCEV *Op1 = SE->getSCEV(WideDef);
const SCEVAddRecExpr *AddRecOp1 = dyn_cast<SCEVAddRecExpr>(Op1);
@@ -1480,9 +1159,9 @@ bool WidenIV::widenWithVariantLoadUse(NarrowIVDefUse DU) {
return true;
}
-/// Special Case for widening with variant Loads (see
-/// WidenIV::widenWithVariantLoadUse). This is the code generation part.
-void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) {
+/// Special case for widening with a loop variant use (see
+/// WidenIV::widenWithVariantUse). This is the code generation part.
+void WidenIV::widenWithVariantUseCodegen(NarrowIVDefUse DU) {
Instruction *NarrowUse = DU.NarrowUse;
Instruction *NarrowDef = DU.NarrowDef;
Instruction *WideDef = DU.WideDef;
@@ -1508,33 +1187,22 @@ void WidenIV::widenWithVariantLoadUseCodegen(NarrowIVDefUse DU) {
Builder.Insert(WideBO);
WideBO->copyIRFlags(NarrowBO);
- if (ExtKind == SignExtended)
- ExtendKindMap[NarrowUse] = SignExtended;
- else
- ExtendKindMap[NarrowUse] = ZeroExtended;
+ assert(ExtKind != Unknown && "Unknown ExtKind not handled");
- // Update the Use.
- if (ExtKind == SignExtended) {
- for (Use &U : NarrowUse->uses()) {
- SExtInst *User = dyn_cast<SExtInst>(U.getUser());
- if (User && User->getType() == WideType) {
- LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by "
- << *WideBO << "\n");
- ++NumElimExt;
- User->replaceAllUsesWith(WideBO);
- DeadInsts.emplace_back(User);
- }
- }
- } else { // ExtKind == ZeroExtended
- for (Use &U : NarrowUse->uses()) {
- ZExtInst *User = dyn_cast<ZExtInst>(U.getUser());
- if (User && User->getType() == WideType) {
- LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by "
- << *WideBO << "\n");
- ++NumElimExt;
- User->replaceAllUsesWith(WideBO);
- DeadInsts.emplace_back(User);
- }
+ ExtendKindMap[NarrowUse] = ExtKind;
+
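+ // Eliminate sign/zero extend users of NarrowUse that already produce the
+ // wide type; they become redundant once WideBO computes the value directly.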
+ for (Use &U : NarrowUse->uses()) {
+ Instruction *User = nullptr;
+ if (ExtKind == SignExtended)
+ User = dyn_cast<SExtInst>(U.getUser());
+ else
+ User = dyn_cast<ZExtInst>(U.getUser());
+ if (User && User->getType() == WideType) {
+ LLVM_DEBUG(dbgs() << "INDVARS: eliminating " << *User << " replaced by "
+ << *WideBO << "\n");
+ ++NumElimExt;
+ User->replaceAllUsesWith(WideBO);
+ DeadInsts.emplace_back(User);
}
}
}
@@ -1641,8 +1309,8 @@ Instruction *WidenIV::widenIVUse(NarrowIVDefUse DU, SCEVExpander &Rewriter) {
// in WideAddRec.first does not indicate a polynomial induction expression.
// In that case, look at the operands of the use instruction to determine
// if we can still widen the use instead of truncating its operand.
- if (widenWithVariantLoadUse(DU)) {
- widenWithVariantLoadUseCodegen(DU);
+ if (widenWithVariantUse(DU)) {
+ widenWithVariantUseCodegen(DU);
return nullptr;
}
@@ -1992,8 +1660,8 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
// Information about sign/zero extensions of CurrIV.
IndVarSimplifyVisitor Visitor(CurrIV, SE, TTI, DT);
- Changed |=
- simplifyUsersOfIV(CurrIV, SE, DT, LI, DeadInsts, Rewriter, &Visitor);
+ Changed |= simplifyUsersOfIV(CurrIV, SE, DT, LI, TTI, DeadInsts, Rewriter,
+ &Visitor);
if (Visitor.WI.WidestNativeType) {
WideIVs.push_back(Visitor.WI);
@@ -2017,7 +1685,7 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L,
/// Given a Value which is hoped to be part of an add recurrence in the given
/// loop, return the associated Phi node if so. Otherwise, return null. Note
-/// that this is less general than SCEVs AddRec checking.
+/// that this is less general than SCEV's AddRec checking.
static PHINode *getLoopPhiForCounter(Value *IncV, Loop *L) {
Instruction *IncI = dyn_cast<Instruction>(IncV);
if (!IncI)
@@ -2079,7 +1747,7 @@ static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
if (L->isLoopInvariant(BI->getCondition()))
return false;
-
+
// Do LFTR to simplify the exit condition to an ICMP.
ICmpInst *Cond = dyn_cast<ICmpInst>(BI->getCondition());
if (!Cond)
@@ -2122,9 +1790,9 @@ static bool needsLFTR(Loop *L, BasicBlock *ExitingBB) {
/// actually poison. This can be used to assess whether a new use of Root can
/// be added at a location which is control equivalent with OnPathTo (such as
/// immediately before it) without introducing UB which didn't previously
-/// exist. Note that a false result conveys no information.
+/// exist. Note that a false result conveys no information.
static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
- Instruction *OnPathTo,
+ Instruction *OnPathTo,
DominatorTree *DT) {
// Basic approach is to assume Root is poison, propagate poison forward
// through all users we can easily track, and then check whether any of those
@@ -2142,10 +1810,10 @@ static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
// If we know this must trigger UB on a path leading our target.
if (mustTriggerUB(I, KnownPoison) && DT->dominates(I, OnPathTo))
return true;
-
+
// If we can't analyze propagation through this instruction, just skip it
// and transitive users. Safe as false is a conservative result.
- if (!propagatesFullPoison(I) && I != Root)
+ if (!propagatesPoison(I) && I != Root)
continue;
if (KnownPoison.insert(I).second)
@@ -2154,7 +1822,7 @@ static bool mustExecuteUBIfPoisonOnPathTo(Instruction *Root,
}
// Might be non-UB, or might have a path we couldn't prove must execute on
- // way to exiting bb.
+ // way to exiting bb.
return false;
}
@@ -2221,7 +1889,7 @@ static bool isLoopCounter(PHINode* Phi, Loop *L,
ScalarEvolution *SE) {
assert(Phi->getParent() == L->getHeader());
assert(L->getLoopLatch());
-
+
if (!SE->isSCEVable(Phi->getType()))
return false;
@@ -2282,7 +1950,7 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
if (!hasConcreteDef(Phi)) {
// We explicitly allow unknown phis as long as they are already used by
// the loop exit test. This is legal since performing LFTR could not
- // increase the number of undef users.
+ // increase the number of undef users.
Value *IncPhi = Phi->getIncomingValueForBlock(LatchBlock);
if (!isLoopExitTestBasedOn(Phi, ExitingBB) &&
!isLoopExitTestBasedOn(IncPhi, ExitingBB))
@@ -2300,7 +1968,7 @@ static PHINode *FindLoopCounter(Loop *L, BasicBlock *ExitingBB,
if (!Phi->getType()->isIntegerTy() &&
!mustExecuteUBIfPoisonOnPathTo(Phi, ExitingBB->getTerminator(), DT))
continue;
-
+
const SCEV *Init = AR->getStart();
if (BestPhi && !AlmostDeadIV(BestPhi, LatchBlock, Cond)) {
@@ -2506,14 +2174,14 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
// reasoning as from SimplifyIndvar::eliminateTrunc to see if we can extend
// the other side of the comparison instead. We still evaluate the limit
// in the narrower bitwidth, we just prefer a zext/sext outside the loop to
- // a truncate within in.
+ // a truncate within in.
bool Extended = false;
const SCEV *IV = SE->getSCEV(CmpIndVar);
const SCEV *TruncatedIV = SE->getTruncateExpr(SE->getSCEV(CmpIndVar),
ExitCnt->getType());
const SCEV *ZExtTrunc =
SE->getZeroExtendExpr(TruncatedIV, CmpIndVar->getType());
-
+
if (ZExtTrunc == IV) {
Extended = true;
ExitCnt = Builder.CreateZExt(ExitCnt, IndVar->getType(),
@@ -2531,7 +2199,7 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
if (Extended) {
bool Discard;
L->makeLoopInvariant(ExitCnt, Discard);
- } else
+ } else
CmpIndVar = Builder.CreateTrunc(CmpIndVar, ExitCnt->getType(),
"lftr.wideiv");
}
@@ -2551,7 +2219,7 @@ linearFunctionTestReplace(Loop *L, BasicBlock *ExitingBB,
// update the branch to use the new comparison; in the common case this
// will make old comparison dead.
BI->setCondition(Cond);
- DeadInsts.push_back(OrigCond);
+ DeadInsts.emplace_back(OrigCond);
++NumLFTR;
return true;
@@ -2685,11 +2353,10 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
L->getExitingBlocks(ExitingBlocks);
// Remove all exits which aren't both rewriteable and analyzeable.
- auto NewEnd = llvm::remove_if(ExitingBlocks,
- [&](BasicBlock *ExitingBB) {
+ auto NewEnd = llvm::remove_if(ExitingBlocks, [&](BasicBlock *ExitingBB) {
// If our exiting block exits multiple loops, we can only rewrite the
// innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
+ // loop runs before it exits.
if (LI->getLoopFor(ExitingBB) != L)
return true;
@@ -2701,18 +2368,18 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// If already constant, nothing to do.
if (isa<Constant>(BI->getCondition()))
return true;
-
+
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
if (isa<SCEVCouldNotCompute>(ExitCount))
return true;
return false;
- });
+ });
ExitingBlocks.erase(NewEnd, ExitingBlocks.end());
if (ExitingBlocks.empty())
return false;
-
- // Get a symbolic upper bound on the loop backedge taken count.
+
+ // Get a symbolic upper bound on the loop backedge taken count.
const SCEV *MaxExitCount = getMaxBackedgeTakenCount(*SE, *DT, L);
if (isa<SCEVCouldNotCompute>(MaxExitCount))
return false;
@@ -2720,11 +2387,12 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// Visit our exit blocks in order of dominance. We know from the fact that
// all exits (left) are analyzable that there must be a total dominance order
// between them as each must dominate the latch. The visit order only
- // matters for the provably equal case.
+ // matters for the provably equal case.
llvm::sort(ExitingBlocks,
[&](BasicBlock *A, BasicBlock *B) {
// std::sort sorts in ascending order, so we want the inverse of
// the normal dominance relation.
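+             // A strict weak ordering must return false when comparing an
+             // element with itself; without this check, A == B would fall
+             // through to the llvm_unreachable below.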
+ if (A == B) return false;
if (DT->properlyDominates(A, B)) return true;
if (DT->properlyDominates(B, A)) return false;
llvm_unreachable("expected total dominance order!");
@@ -2734,7 +2402,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
assert(DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]));
}
#endif
-
+
auto FoldExit = [&](BasicBlock *ExitingBB, bool IsTaken) {
BranchInst *BI = cast<BranchInst>(ExitingBB->getTerminator());
bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
@@ -2743,7 +2411,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
IsTaken ? ExitIfTrue : !ExitIfTrue);
BI->setCondition(NewCond);
if (OldCond->use_empty())
- DeadInsts.push_back(OldCond);
+ DeadInsts.emplace_back(OldCond);
};
bool Changed = false;
@@ -2751,7 +2419,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
for (BasicBlock *ExitingBB : ExitingBlocks) {
const SCEV *ExitCount = SE->getExitCount(L, ExitingBB);
assert(!isa<SCEVCouldNotCompute>(ExitCount) && "checked above");
-
+
// If we know we'd exit on the first iteration, rewrite the exit to
// reflect this. This does not imply the loop must exit through this
// exit; there may be an earlier one taken on the first iteration.
@@ -2769,13 +2437,13 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (!ExitCount->getType()->isIntegerTy() ||
!MaxExitCount->getType()->isIntegerTy())
continue;
-
+
Type *WiderType =
SE->getWiderType(MaxExitCount->getType(), ExitCount->getType());
ExitCount = SE->getNoopOrZeroExtend(ExitCount, WiderType);
MaxExitCount = SE->getNoopOrZeroExtend(MaxExitCount, WiderType);
assert(MaxExitCount->getType() == ExitCount->getType());
-
+
// Can we prove that some other exit must be taken strictly before this
// one?
if (SE->isLoopEntryGuardedByCond(L, CmpInst::ICMP_ULT,
@@ -2788,7 +2456,7 @@ bool IndVarSimplify::optimizeLoopExits(Loop *L, SCEVExpander &Rewriter) {
// As we run, keep track of which exit counts we've encountered. If we
// find a duplicate, we've found an exit which would have exited on the
// exiting iteration, but (from the visit order) strictly follows another
- // which does the same and is thus dead.
+ // which does the same and is thus dead.
if (!DominatingExitCounts.insert(ExitCount).second) {
FoldExit(ExitingBB, false);
Changed = true;
@@ -2809,22 +2477,20 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
- bool Changed = false;
-
// Finally, see if we can rewrite our exit conditions into a loop invariant
- // form. If we have a read-only loop, and we can tell that we must exit down
+ // form. If we have a read-only loop, and we can tell that we must exit down
// a path which does not need any of the values computed within the loop, we
// can rewrite the loop to exit on the first iteration. Note that this
// doesn't either a) tell us the loop exits on the first iteration (unless
// *all* exits are predicateable) or b) tell us *which* exit might be taken.
// This transformation looks a lot like a restricted form of dead loop
// elimination, but restricted to read-only loops and without necessarily
- // needing to kill the loop entirely.
+ // needing to kill the loop entirely.
if (!LoopPredication)
- return Changed;
+ return false;
if (!SE->hasLoopInvariantBackedgeTakenCount(L))
- return Changed;
+ return false;
// Note: ExactBTC is the exact backedge taken count *iff* the loop exits
// through *explicit* control flow. We have to eliminate the possibility of
@@ -2833,16 +2499,16 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
if (isa<SCEVCouldNotCompute>(ExactBTC) ||
!SE->isLoopInvariant(ExactBTC, L) ||
!isSafeToExpand(ExactBTC, *SE))
- return Changed;
+ return false;
// If we end up with a pointer exit count, bail. It may be unsized.
if (!ExactBTC->getType()->isIntegerTy())
- return Changed;
+ return false;
auto BadExit = [&](BasicBlock *ExitingBB) {
// If our exiting block exits multiple loops, we can only rewrite the
// innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
+ // loop runs before it exits.
if (LI->getLoopFor(ExitingBB) != L)
return true;
@@ -2897,18 +2563,18 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// is complicated and we choose not to for now.
for (unsigned i = 1; i < ExitingBlocks.size(); i++)
if (!DT->dominates(ExitingBlocks[i-1], ExitingBlocks[i]))
- return Changed;
+ return false;
// Given our sorted total order, we know that exit[j] must be evaluated
// after all exit[i] such that j > i.
for (unsigned i = 0, e = ExitingBlocks.size(); i < e; i++)
if (BadExit(ExitingBlocks[i])) {
- ExitingBlocks.resize(i);
+ ExitingBlocks.resize(i);
break;
}
if (ExitingBlocks.empty())
- return Changed;
+ return false;
// We rely on not being able to reach an exiting block on a later iteration
// than its statically computed exit count. The implementation of
@@ -2930,8 +2596,9 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
for (auto &I : *BB)
// TODO:isGuaranteedToTransfer
if (I.mayHaveSideEffects() || I.mayThrow())
- return Changed;
+ return false;
+ bool Changed = false;
// Finally, do the actual predication for all predicatable blocks. A couple
// of notes here:
// 1) We don't bother to constant fold dominated exits with identical exit
@@ -2970,7 +2637,7 @@ bool IndVarSimplify::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
Value *OldCond = BI->getCondition();
BI->setCondition(NewCond);
if (OldCond->use_empty())
- DeadInsts.push_back(OldCond);
+ DeadInsts.emplace_back(OldCond);
Changed = true;
}
@@ -2985,7 +2652,6 @@ bool IndVarSimplify::run(Loop *L) {
// We need (and expect!) the incoming loop to be in LCSSA.
assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
"LCSSA required to run indvars!");
- bool Changed = false;
// If LoopSimplify form is not available, stay out of trouble. Some notes:
// - LSR currently only supports LoopSimplify-form loops. Indvars'
@@ -3001,9 +2667,15 @@ bool IndVarSimplify::run(Loop *L) {
#ifndef NDEBUG
// Used below for a consistency check only
- const SCEV *BackedgeTakenCount = SE->getBackedgeTakenCount(L);
+ // Note: Since the result returned by ScalarEvolution may depend on the order
+ // in which previous results are added to its cache, the call to
+ // getBackedgeTakenCount() may change the results of subsequent SCEV queries.
+ const SCEV *BackedgeTakenCount;
+ if (VerifyIndvars)
+ BackedgeTakenCount = SE->getBackedgeTakenCount(L);
#endif
+ bool Changed = false;
// If there are any floating-point recurrences, attempt to
// transform them to use integer recurrences.
Changed |= rewriteNonIntegerIVs(L);
@@ -3027,8 +2699,13 @@ bool IndVarSimplify::run(Loop *L) {
// that are recurrent in the loop, and substitute the exit values from the
// loop into any instructions outside of the loop that use the final values
// of the current expressions.
- if (ReplaceExitValue != NeverRepl)
- Changed |= rewriteLoopExitValues(L, Rewriter);
+ if (ReplaceExitValue != NeverRepl) {
+ if (int Rewrites = rewriteLoopExitValues(L, LI, TLI, SE, TTI, Rewriter, DT,
+ ReplaceExitValue, DeadInsts)) {
+ NumReplaced += Rewrites;
+ Changed = true;
+ }
+ }
// Eliminate redundant IV cycles.
NumElimIV += Rewriter.replaceCongruentIVs(L, DT, DeadInsts);
@@ -3039,7 +2716,7 @@ bool IndVarSimplify::run(Loop *L) {
// Given we've changed exit counts, notify SCEV
SE->forgetLoop(L);
}
-
+
// Try to form loop invariant tests for loop exits by changing how many
// iterations of the loop run when that is unobservable.
if (predicateLoopExits(L, Rewriter)) {
@@ -3049,8 +2726,11 @@ bool IndVarSimplify::run(Loop *L) {
}
// If we have a trip count expression, rewrite the loop's exit condition
- // using it.
+ // using it.
if (!DisableLFTR) {
+ BasicBlock *PreHeader = L->getLoopPreheader();
+ BranchInst *PreHeaderBR = cast<BranchInst>(PreHeader->getTerminator());
+
SmallVector<BasicBlock*, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
for (BasicBlock *ExitingBB : ExitingBlocks) {
@@ -3060,10 +2740,10 @@ bool IndVarSimplify::run(Loop *L) {
// If our exiting block exits multiple loops, we can only rewrite the
// innermost one. Otherwise, we're changing how many times the innermost
- // loop runs before it exits.
+ // loop runs before it exits.
if (LI->getLoopFor(ExitingBB) != L)
continue;
-
+
if (!needsLFTR(L, ExitingBB))
continue;
@@ -3077,14 +2757,15 @@ bool IndVarSimplify::run(Loop *L) {
// until stable to handle cases like this better.
if (ExitCount->isZero())
continue;
-
+
PHINode *IndVar = FindLoopCounter(L, ExitingBB, ExitCount, SE, DT);
if (!IndVar)
continue;
-
+
// Avoid high cost expansions. Note: This heuristic is questionable in
- // that our definition of "high cost" is not exactly principled.
- if (Rewriter.isHighCostExpansion(ExitCount, L))
+ // that our definition of "high cost" is not exactly principled.
+ if (Rewriter.isHighCostExpansion(ExitCount, L, SCEVCheapExpansionBudget,
+ TTI, PreHeaderBR))
continue;
// Check preconditions for proper SCEVExpander operation. SCEV does not
@@ -3092,7 +2773,7 @@ bool IndVarSimplify::run(Loop *L) {
// any pass that uses the SCEVExpander must do it. This does not work
// well for loop passes because SCEVExpander makes assumptions about
// all loops, while LoopPassManager only forces the current loop to be
- // simplified.
+ // simplified.
//
// FIXME: SCEV expansion has no way to bail out, so the caller must
// explicitly check any assumptions made by SCEV. Brittle.
@@ -3113,7 +2794,8 @@ bool IndVarSimplify::run(Loop *L) {
while (!DeadInsts.empty())
if (Instruction *Inst =
dyn_cast_or_null<Instruction>(DeadInsts.pop_back_val()))
- Changed |= RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI);
+ Changed |=
+ RecursivelyDeleteTriviallyDeadInstructions(Inst, TLI, MSSAU.get());
// The Rewriter may not be used from this point on.
@@ -3127,7 +2809,7 @@ bool IndVarSimplify::run(Loop *L) {
Changed |= rewriteFirstIterationLoopExitValues(L);
// Clean up dead instructions.
- Changed |= DeleteDeadPHIs(L->getHeader(), TLI);
+ Changed |= DeleteDeadPHIs(L->getHeader(), TLI, MSSAU.get());
// Check a post-condition.
assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
@@ -3150,6 +2832,8 @@ bool IndVarSimplify::run(Loop *L) {
assert(!SE->isKnownPredicate(ICmpInst::ICMP_ULT, BackedgeTakenCount,
NewBECount) && "indvars must preserve SCEV");
}
+ if (VerifyMemorySSA && MSSAU)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
#endif
return Changed;
@@ -3161,12 +2845,14 @@ PreservedAnalyses IndVarSimplifyPass::run(Loop &L, LoopAnalysisManager &AM,
Function *F = L.getHeader()->getParent();
const DataLayout &DL = F->getParent()->getDataLayout();
- IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI);
+ IndVarSimplify IVS(&AR.LI, &AR.SE, &AR.DT, DL, &AR.TLI, &AR.TTI, AR.MSSA);
if (!IVS.run(&L))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
PA.preserveSet<CFGAnalyses>();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
return PA;
}
@@ -3191,13 +2877,18 @@ struct IndVarSimplifyLegacyPass : public LoopPass {
auto *TTIP = getAnalysisIfAvailable<TargetTransformInfoWrapperPass>();
auto *TTI = TTIP ? &TTIP->getTTI(*L->getHeader()->getParent()) : nullptr;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
- IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI);
+ IndVarSimplify IVS(LI, SE, DT, DL, TLI, TTI, MSSA);
return IVS.run(L);
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
diff --git a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
index 58469749600e..30e4822b6769 100644
--- a/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/InductiveRangeCheckElimination.cpp
@@ -47,6 +47,7 @@
#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"
@@ -55,8 +56,8 @@
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
@@ -87,6 +88,7 @@
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
@@ -242,20 +244,25 @@ public:
bool run(Loop *L, function_ref<void(Loop *, bool)> LPMAddNewLoop);
};
-class IRCELegacyPass : public LoopPass {
+class IRCELegacyPass : public FunctionPass {
public:
static char ID;
- IRCELegacyPass() : LoopPass(ID) {
+ IRCELegacyPass() : FunctionPass(ID) {
initializeIRCELegacyPassPass(*PassRegistry::getPassRegistry());
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<BranchProbabilityInfoWrapperPass>();
- getLoopAnalysisUsage(AU);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
}
- bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ bool runOnFunction(Function &F) override;
};
} // end anonymous namespace
@@ -265,7 +272,9 @@ char IRCELegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(IRCELegacyPass, "irce",
"Inductive range check elimination", false, false)
INITIALIZE_PASS_DEPENDENCY(BranchProbabilityInfoWrapperPass)
-INITIALIZE_PASS_DEPENDENCY(LoopPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_END(IRCELegacyPass, "irce", "Inductive range check elimination",
false, false)
@@ -866,7 +875,14 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
const SCEV *IndVarStart = SE.getAddExpr(StartNext, Addend);
const SCEV *Step = SE.getSCEV(StepCI);
- ConstantInt *One = ConstantInt::get(IndVarTy, 1);
+ const SCEV *FixedRightSCEV = nullptr;
+
+ // If RightValue resides within the loop (but is still loop invariant),
+ // regenerate it in the preheader.
+ if (auto *I = dyn_cast<Instruction>(RightValue))
+ if (L.contains(I->getParent()))
+ FixedRightSCEV = RightSCEV;
+
if (IsIncreasing) {
bool DecreasedRightValueByOne = false;
if (StepCI->isOne()) {
@@ -928,10 +944,9 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
if (LatchBrExitIdx == 0) {
// We need to increase the right value unless we have already decreased
// it virtually when we replaced EQ with SGT.
- if (!DecreasedRightValueByOne) {
- IRBuilder<> B(Preheader->getTerminator());
- RightValue = B.CreateAdd(RightValue, One);
- }
+ if (!DecreasedRightValueByOne)
+ FixedRightSCEV =
+ SE.getAddExpr(RightSCEV, SE.getOne(RightSCEV->getType()));
} else {
assert(!DecreasedRightValueByOne &&
"Right value can be decreased only for LatchBrExitIdx == 0!");
@@ -995,10 +1010,9 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
if (LatchBrExitIdx == 0) {
// We need to decrease the right value unless we have already increased
// it virtually when we replaced EQ with SLT.
- if (!IncreasedRightValueByOne) {
- IRBuilder<> B(Preheader->getTerminator());
- RightValue = B.CreateSub(RightValue, One);
- }
+ if (!IncreasedRightValueByOne)
+ FixedRightSCEV =
+ SE.getMinusSCEV(RightSCEV, SE.getOne(RightSCEV->getType()));
} else {
assert(!IncreasedRightValueByOne &&
"Right value can be increased only for LatchBrExitIdx == 0!");
@@ -1012,9 +1026,14 @@ LoopStructure::parseLoopStructure(ScalarEvolution &SE,
assert(!L.contains(LatchExit) && "expected an exit block!");
const DataLayout &DL = Preheader->getModule()->getDataLayout();
- Value *IndVarStartV =
- SCEVExpander(SE, DL, "irce")
- .expandCodeFor(IndVarStart, IndVarTy, Preheader->getTerminator());
+ SCEVExpander Expander(SE, DL, "irce");
+ Instruction *Ins = Preheader->getTerminator();
+
+ if (FixedRightSCEV)
+ RightValue =
+ Expander.expandCodeFor(FixedRightSCEV, FixedRightSCEV->getType(), Ins);
+
+ Value *IndVarStartV = Expander.expandCodeFor(IndVarStart, IndVarTy, Ins);
IndVarStartV->setName("indvar.start");
LoopStructure Result;
@@ -1747,27 +1766,41 @@ IntersectUnsignedRange(ScalarEvolution &SE,
return Ret;
}
-PreservedAnalyses IRCEPass::run(Loop &L, LoopAnalysisManager &AM,
- LoopStandardAnalysisResults &AR,
- LPMUpdater &U) {
- Function *F = L.getHeader()->getParent();
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
- auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
- InductiveRangeCheckElimination IRCE(AR.SE, BPI, AR.DT, AR.LI);
- auto LPMAddNewLoop = [&U](Loop *NL, bool IsSubloop) {
+PreservedAnalyses IRCEPass::run(Function &F, FunctionAnalysisManager &AM) {
+ auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &BPI = AM.getResult<BranchProbabilityAnalysis>(F);
+ LoopInfo &LI = AM.getResult<LoopAnalysis>(F);
+
+ InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
+
+ bool Changed = false;
+
+ for (const auto &L : LI) {
+ Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
+ /*PreserveLCSSA=*/false);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ auto LPMAddNewLoop = [&Worklist](Loop *NL, bool IsSubloop) {
if (!IsSubloop)
- U.addSiblingLoops(NL);
+ appendLoopsToWorklist(*NL, Worklist);
};
- bool Changed = IRCE.run(&L, LPMAddNewLoop);
+
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ Changed |= IRCE.run(L, LPMAddNewLoop);
+ }
+
if (!Changed)
return PreservedAnalyses::all();
-
return getLoopPassPreservedAnalyses();
}
-bool IRCELegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
- if (skipLoop(L))
+bool IRCELegacyPass::runOnFunction(Function &F) {
+ if (skipFunction(F))
return false;
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
@@ -1776,10 +1809,27 @@ bool IRCELegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
InductiveRangeCheckElimination IRCE(SE, &BPI, DT, LI);
- auto LPMAddNewLoop = [&LPM](Loop *NL, bool /* IsSubLoop */) {
- LPM.addLoop(*NL);
+
+ bool Changed = false;
+
+ for (const auto &L : LI) {
+ Changed |= simplifyLoop(L, &DT, &LI, &SE, nullptr, nullptr,
+ /*PreserveLCSSA=*/false);
+ Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
+ }
+
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
+ auto LPMAddNewLoop = [&](Loop *NL, bool IsSubloop) {
+ if (!IsSubloop)
+ appendLoopsToWorklist(*NL, Worklist);
};
- return IRCE.run(L, LPMAddNewLoop);
+
+ while (!Worklist.empty()) {
+ Loop *L = Worklist.pop_back_val();
+ Changed |= IRCE.run(L, LPMAddNewLoop);
+ }
+ return Changed;
}
bool InductiveRangeCheckElimination::run(
diff --git a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
index dfb1b6bfb739..db9cc58bbfc4 100644
--- a/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAddressSpaces.cpp
@@ -96,7 +96,6 @@
#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/TargetTransformInfo.h"
-#include "llvm/Transforms/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -116,11 +115,13 @@
#include "llvm/IR/ValueHandle.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <cassert>
#include <iterator>
@@ -132,16 +133,23 @@
using namespace llvm;
+static cl::opt<bool> AssumeDefaultIsFlatAddressSpace(
+ "assume-default-is-flat-addrspace", cl::init(false), cl::ReallyHidden,
+ cl::desc("The default address space is assumed to be the flat address "
+ "space. This is mainly for testing purposes."));
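+// For illustration only (assuming the legacy pass manager's opt tool), a test
+// might run:
+//   opt -S -infer-address-spaces -assume-default-is-flat-addrspace in.ll
+// which forces FlatAddrSpace to 0 instead of querying the target.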
+
static const unsigned UninitializedAddressSpace =
std::numeric_limits<unsigned>::max();
namespace {
using ValueToAddrSpaceMapTy = DenseMap<const Value *, unsigned>;
+using PostorderStackTy = llvm::SmallVector<PointerIntPair<Value *, 1, bool>, 4>;
/// InferAddressSpaces
class InferAddressSpaces : public FunctionPass {
const TargetTransformInfo *TTI = nullptr;
+ const DataLayout *DL = nullptr;
/// Target specific address space which uses of should be replaced if
/// possible.
@@ -174,6 +182,11 @@ private:
bool isSafeToCastConstAddrSpace(Constant *C, unsigned NewAS) const;
+ Value *cloneInstructionWithNewAddressSpace(
+ Instruction *I, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace,
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const;
+
// Changes the flat address expressions in function F to point to specific
// address spaces if InferredAddrSpace says so. Postorder is the postorder of
// all flat expressions in the use-def graph of function F.
@@ -182,15 +195,14 @@ private:
const ValueToAddrSpaceMapTy &InferredAddrSpace, Function *F) const;
void appendsFlatAddressExpressionToPostorderStack(
- Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
- DenseSet<Value *> &Visited) const;
+ Value *V, PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const;
bool rewriteIntrinsicOperands(IntrinsicInst *II,
Value *OldV, Value *NewV) const;
- void collectRewritableIntrinsicOperands(
- IntrinsicInst *II,
- std::vector<std::pair<Value *, bool>> &PostorderStack,
- DenseSet<Value *> &Visited) const;
+ void collectRewritableIntrinsicOperands(IntrinsicInst *II,
+ PostorderStackTy &PostorderStack,
+ DenseSet<Value *> &Visited) const;
std::vector<WeakTrackingVH> collectFlatAddressExpressions(Function &F) const;
@@ -214,24 +226,65 @@ void initializeInferAddressSpacesPass(PassRegistry &);
INITIALIZE_PASS(InferAddressSpaces, DEBUG_TYPE, "Infer address spaces",
false, false)
+// Check whether this is a no-op pointer bitcast expressed as a pair of
+// `ptrtoint`/`inttoptr`, used because there is no direct no-op pointer bitcast
+// across different address spaces.
+static bool isNoopPtrIntCastPair(const Operator *I2P, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ assert(I2P->getOpcode() == Instruction::IntToPtr);
+ auto *P2I = dyn_cast<Operator>(I2P->getOperand(0));
+ if (!P2I || P2I->getOpcode() != Instruction::PtrToInt)
+ return false;
+ // Check it's really safe to treat that pair of `ptrtoint`/`inttoptr` as a
+ // no-op cast. Besides checking both of them are no-op casts, as the
+ // reinterpreted pointer may be used in other pointer arithmetic, we also
+ // need to double-check that through the target-specific hook. That ensures
+ // the underlying target also agrees that's a no-op address space cast and
+ // pointer bits are preserved.
+// The current IR spec doesn't have clear rules on address space casts, in
+// particular no clear definition of the pointer bits in non-default address
+// spaces. Dereferencing a pointer after an invalid reinterpret cast would be
+// undefined, and because the meaning of those bits is unspecified, pointer
+// arithmetic may likewise be undefined after an invalid reinterpret cast.
+// However, since the target hooks confirm this is a no-op addrspacecast, the
+// bits are preserved and none of this applies.
+ return CastInst::isNoopCast(Instruction::CastOps(I2P->getOpcode()),
+ I2P->getOperand(0)->getType(), I2P->getType(),
+ DL) &&
+ CastInst::isNoopCast(Instruction::CastOps(P2I->getOpcode()),
+ P2I->getOperand(0)->getType(), P2I->getType(),
+ DL) &&
+ TTI->isNoopAddrSpaceCast(
+ P2I->getOperand(0)->getType()->getPointerAddressSpace(),
+ I2P->getType()->getPointerAddressSpace());
+}
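+
+// For illustration (hypothetical IR): given
+//   %i = ptrtoint i8 addrspace(1)* %p to i64
+//   %q = inttoptr i64 %i to i8*
+// the pair is treated as a single no-op addrspacecast of %p only when both
+// casts preserve the pointer bits under the DataLayout and the target's
+// isNoopAddrSpaceCast(1, 0) hook agrees.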
+
// Returns true if V is an address expression.
// TODO: Currently, we consider only phi, bitcast, addrspacecast, and
// getelementptr operators.
-static bool isAddressExpression(const Value &V) {
- if (!isa<Operator>(V))
+static bool isAddressExpression(const Value &V, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
+ const Operator *Op = dyn_cast<Operator>(&V);
+ if (!Op)
return false;
- const Operator &Op = cast<Operator>(V);
- switch (Op.getOpcode()) {
+ switch (Op->getOpcode()) {
case Instruction::PHI:
- assert(Op.getType()->isPointerTy());
+ assert(Op->getType()->isPointerTy());
return true;
case Instruction::BitCast:
case Instruction::AddrSpaceCast:
case Instruction::GetElementPtr:
return true;
case Instruction::Select:
- return Op.getType()->isPointerTy();
+ return Op->getType()->isPointerTy();
+ case Instruction::Call: {
+ const IntrinsicInst *II = dyn_cast<IntrinsicInst>(&V);
+ return II && II->getIntrinsicID() == Intrinsic::ptrmask;
+ }
+ case Instruction::IntToPtr:
+ return isNoopPtrIntCastPair(Op, DL, TTI);
default:
return false;
}
@@ -240,7 +293,9 @@ static bool isAddressExpression(const Value &V) {
// Returns the pointer operands of V.
//
// Precondition: V is an address expression.
-static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
+static SmallVector<Value *, 2>
+getPointerOperands(const Value &V, const DataLayout &DL,
+ const TargetTransformInfo *TTI) {
const Operator &Op = cast<Operator>(V);
switch (Op.getOpcode()) {
case Instruction::PHI: {
@@ -254,12 +309,22 @@ static SmallVector<Value *, 2> getPointerOperands(const Value &V) {
return {Op.getOperand(0)};
case Instruction::Select:
return {Op.getOperand(1), Op.getOperand(2)};
+ case Instruction::Call: {
+ const IntrinsicInst &II = cast<IntrinsicInst>(Op);
+ assert(II.getIntrinsicID() == Intrinsic::ptrmask &&
+ "unexpected intrinsic call");
+ return {II.getArgOperand(0)};
+ }
+ case Instruction::IntToPtr: {
+ assert(isNoopPtrIntCastPair(&Op, DL, TTI));
+ auto *P2I = cast<Operator>(Op.getOperand(0));
+ return {P2I->getOperand(0)};
+ }
default:
llvm_unreachable("Unexpected instruction type.");
}
}
-// TODO: Move logic to TTI?
bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
Value *OldV,
Value *NewV) const {
@@ -275,16 +340,26 @@ bool InferAddressSpaces::rewriteIntrinsicOperands(IntrinsicInst *II,
II->setCalledFunction(NewDecl);
return true;
}
- default:
- return TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
+ case Intrinsic::ptrmask:
+ // This is handled as an address expression, not as a use memory operation.
+ return false;
+ default: {
+ Value *Rewrite = TTI->rewriteIntrinsicWithAddressSpace(II, OldV, NewV);
+ if (!Rewrite)
+ return false;
+ if (Rewrite != II)
+ II->replaceAllUsesWith(Rewrite);
+ return true;
+ }
}
}
void InferAddressSpaces::collectRewritableIntrinsicOperands(
- IntrinsicInst *II, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ IntrinsicInst *II, PostorderStackTy &PostorderStack,
DenseSet<Value *> &Visited) const {
auto IID = II->getIntrinsicID();
switch (IID) {
+ case Intrinsic::ptrmask:
case Intrinsic::objectsize:
appendsFlatAddressExpressionToPostorderStack(II->getArgOperand(0),
PostorderStack, Visited);
@@ -305,7 +380,7 @@ void InferAddressSpaces::collectRewritableIntrinsicOperands(
// If V is an unvisited flat address expression, appends V to PostorderStack
// and marks it as visited.
void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
- Value *V, std::vector<std::pair<Value *, bool>> &PostorderStack,
+ Value *V, PostorderStackTy &PostorderStack,
DenseSet<Value *> &Visited) const {
assert(V->getType()->isPointerTy());
@@ -313,21 +388,21 @@ void InferAddressSpaces::appendsFlatAddressExpressionToPostorderStack(
// expressions.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(V)) {
// TODO: Look in non-address parts, like icmp operands.
- if (isAddressExpression(*CE) && Visited.insert(CE).second)
- PostorderStack.push_back(std::make_pair(CE, false));
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
+ PostorderStack.emplace_back(CE, false);
return;
}
- if (isAddressExpression(*V) &&
+ if (isAddressExpression(*V, *DL, TTI) &&
V->getType()->getPointerAddressSpace() == FlatAddrSpace) {
if (Visited.insert(V).second) {
- PostorderStack.push_back(std::make_pair(V, false));
+ PostorderStack.emplace_back(V, false);
Operator *Op = cast<Operator>(V);
for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I) {
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Op->getOperand(I))) {
- if (isAddressExpression(*CE) && Visited.insert(CE).second)
+ if (isAddressExpression(*CE, *DL, TTI) && Visited.insert(CE).second)
PostorderStack.emplace_back(CE, false);
}
}
@@ -341,7 +416,7 @@ std::vector<WeakTrackingVH>
InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
// This function implements a non-recursive postorder traversal of a partial
// use-def graph of function F.
- std::vector<std::pair<Value *, bool>> PostorderStack;
+ PostorderStackTy PostorderStack;
// The set of visited expressions.
DenseSet<Value *> Visited;
@@ -383,23 +458,27 @@ InferAddressSpaces::collectFlatAddressExpressions(Function &F) const {
} else if (auto *ASC = dyn_cast<AddrSpaceCastInst>(&I)) {
if (!ASC->getType()->isVectorTy())
PushPtrOperand(ASC->getPointerOperand());
+ } else if (auto *I2P = dyn_cast<IntToPtrInst>(&I)) {
+ if (isNoopPtrIntCastPair(cast<Operator>(I2P), *DL, TTI))
+ PushPtrOperand(
+ cast<PtrToIntInst>(I2P->getOperand(0))->getPointerOperand());
}
}
std::vector<WeakTrackingVH> Postorder; // The resultant postorder.
while (!PostorderStack.empty()) {
- Value *TopVal = PostorderStack.back().first;
+ Value *TopVal = PostorderStack.back().getPointer();
// If the operands of the expression on the top are already explored,
// adds that expression to the resultant postorder.
- if (PostorderStack.back().second) {
+ if (PostorderStack.back().getInt()) {
if (TopVal->getType()->getPointerAddressSpace() == FlatAddrSpace)
Postorder.push_back(TopVal);
PostorderStack.pop_back();
continue;
}
// Otherwise, adds its operands to the stack and explores them.
- PostorderStack.back().second = true;
- for (Value *PtrOperand : getPointerOperands(*TopVal)) {
+ PostorderStack.back().setInt(true);
+ for (Value *PtrOperand : getPointerOperands(*TopVal, *DL, TTI)) {
appendsFlatAddressExpressionToPostorderStack(PtrOperand, PostorderStack,
Visited);
}
@@ -438,10 +517,13 @@ static Value *operandWithNewAddressSpaceOrCreateUndef(
// Note that we do not necessarily clone `I`, e.g., if it is an addrspacecast
// from a pointer whose type already matches. Therefore, this function returns a
// Value* instead of an Instruction*.
-static Value *cloneInstructionWithNewAddressSpace(
+//
+// This may also return nullptr if the instruction could not be rewritten.
+Value *InferAddressSpaces::cloneInstructionWithNewAddressSpace(
Instruction *I, unsigned NewAddrSpace,
const ValueToValueMapTy &ValueWithNewAddrSpace,
- SmallVectorImpl<const Use *> *UndefUsesToFix) {
+ SmallVectorImpl<const Use *> *UndefUsesToFix) const {
Type *NewPtrType =
I->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
@@ -456,6 +538,23 @@ static Value *cloneInstructionWithNewAddressSpace(
return Src;
}
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(I)) {
+ // For ptrmask the rewritten pointer is a regular call argument, so handle
+ // calls specially and early.
+ assert(II->getIntrinsicID() == Intrinsic::ptrmask);
+ Value *NewPtr = operandWithNewAddressSpaceOrCreateUndef(
+ II->getArgOperandUse(0), NewAddrSpace, ValueWithNewAddrSpace,
+ UndefUsesToFix);
+ Value *Rewrite =
+ TTI->rewriteIntrinsicWithAddressSpace(II, II->getArgOperand(0), NewPtr);
+ if (Rewrite) {
+ assert(Rewrite != II && "cannot modify this pointer operation in place");
+ return Rewrite;
+ }
+
+ return nullptr;
+ }
+
// Computes the converted pointer operands.
SmallVector<Value *, 4> NewPointerOperands;
for (const Use &OperandUse : I->operands()) {
@@ -492,6 +591,14 @@ static Value *cloneInstructionWithNewAddressSpace(
assert(I->getType()->isPointerTy());
return SelectInst::Create(I->getOperand(0), NewPointerOperands[1],
NewPointerOperands[2], "", nullptr, I);
+ case Instruction::IntToPtr: {
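+   // The matched inttoptr(ptrtoint X) pair collapses to X itself; at most a
+   // bitcast is needed when the pointee type differs.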
+ assert(isNoopPtrIntCastPair(cast<Operator>(I), *DL, TTI));
+ Value *Src = cast<Operator>(I->getOperand(0))->getOperand(0);
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ if (Src->getType() != NewPtrType)
+ return new BitCastInst(Src, NewPtrType);
+ return Src;
+ }
default:
llvm_unreachable("Unexpected opcode");
}
@@ -501,8 +608,9 @@ static Value *cloneInstructionWithNewAddressSpace(
// constant expression `CE` with its operands replaced as specified in
// ValueWithNewAddrSpace.
static Value *cloneConstantExprWithNewAddressSpace(
- ConstantExpr *CE, unsigned NewAddrSpace,
- const ValueToValueMapTy &ValueWithNewAddrSpace) {
+ ConstantExpr *CE, unsigned NewAddrSpace,
+ const ValueToValueMapTy &ValueWithNewAddrSpace, const DataLayout *DL,
+ const TargetTransformInfo *TTI) {
Type *TargetType =
CE->getType()->getPointerElementType()->getPointerTo(NewAddrSpace);
@@ -533,6 +641,13 @@ static Value *cloneConstantExprWithNewAddressSpace(
}
}
+ if (CE->getOpcode() == Instruction::IntToPtr) {
+ assert(isNoopPtrIntCastPair(cast<Operator>(CE), *DL, TTI));
+ Constant *Src = cast<ConstantExpr>(CE->getOperand(0))->getOperand(0);
+ assert(Src->getType()->getPointerAddressSpace() == NewAddrSpace);
+ return ConstantExpr::getBitCast(Src, TargetType);
+ }
+
// Computes the operands of the new constant expression.
bool IsNew = false;
SmallVector<Constant *, 4> NewOperands;
@@ -550,7 +665,7 @@ static Value *cloneConstantExprWithNewAddressSpace(
}
if (auto CExpr = dyn_cast<ConstantExpr>(Operand))
if (Value *NewOperand = cloneConstantExprWithNewAddressSpace(
- CExpr, NewAddrSpace, ValueWithNewAddrSpace)) {
+ CExpr, NewAddrSpace, ValueWithNewAddrSpace, DL, TTI)) {
IsNew = true;
NewOperands.push_back(cast<Constant>(NewOperand));
continue;
@@ -585,13 +700,13 @@ Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
const ValueToValueMapTy &ValueWithNewAddrSpace,
SmallVectorImpl<const Use *> *UndefUsesToFix) const {
// All values in Postorder are flat address expressions.
- assert(isAddressExpression(*V) &&
+ assert(isAddressExpression(*V, *DL, TTI) &&
V->getType()->getPointerAddressSpace() == FlatAddrSpace);
if (Instruction *I = dyn_cast<Instruction>(V)) {
Value *NewV = cloneInstructionWithNewAddressSpace(
I, NewAddrSpace, ValueWithNewAddrSpace, UndefUsesToFix);
- if (Instruction *NewI = dyn_cast<Instruction>(NewV)) {
+ if (Instruction *NewI = dyn_cast_or_null<Instruction>(NewV)) {
if (NewI->getParent() == nullptr) {
NewI->insertBefore(I);
NewI->takeName(I);
@@ -601,7 +716,7 @@ Value *InferAddressSpaces::cloneValueWithNewAddressSpace(
}
return cloneConstantExprWithNewAddressSpace(
- cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace);
+ cast<ConstantExpr>(V), NewAddrSpace, ValueWithNewAddrSpace, DL, TTI);
}
// Defines the join operation on the address space lattice (see the file header
@@ -625,6 +740,10 @@ bool InferAddressSpaces::runOnFunction(Function &F) {
return false;
TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ DL = &F.getParent()->getDataLayout();
+
+ if (AssumeDefaultIsFlatAddressSpace)
+ FlatAddrSpace = 0;
if (FlatAddrSpace == UninitializedAddressSpace) {
FlatAddrSpace = TTI->getFlatAddressSpace();
@@ -729,7 +848,7 @@ Optional<unsigned> InferAddressSpaces::updateAddressSpace(
else
NewAS = joinAddressSpaces(Src0AS, Src1AS);
} else {
- for (Value *PtrOperand : getPointerOperands(V)) {
+ for (Value *PtrOperand : getPointerOperands(V, *DL, TTI)) {
auto I = InferredAddrSpace.find(PtrOperand);
unsigned OperandAS = I != InferredAddrSpace.end() ?
I->second : PtrOperand->getType()->getPointerAddressSpace();
@@ -879,8 +998,10 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
for (Value* V : Postorder) {
unsigned NewAddrSpace = InferredAddrSpace.lookup(V);
if (V->getType()->getPointerAddressSpace() != NewAddrSpace) {
- ValueWithNewAddrSpace[V] = cloneValueWithNewAddressSpace(
- V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ Value *New = cloneValueWithNewAddressSpace(
+ V, NewAddrSpace, ValueWithNewAddrSpace, &UndefUsesToFix);
+ if (New)
+ ValueWithNewAddrSpace[V] = New;
}
}
@@ -890,7 +1011,10 @@ bool InferAddressSpaces::rewriteWithNewAddressSpaces(
// Fixes all the undef uses generated by cloneInstructionWithNewAddressSpace.
for (const Use *UndefUse : UndefUsesToFix) {
User *V = UndefUse->getUser();
- User *NewV = cast<User>(ValueWithNewAddrSpace.lookup(V));
+ User *NewV = cast_or_null<User>(ValueWithNewAddrSpace.lookup(V));
+ if (!NewV)
+ continue;
+
unsigned OperandNo = UndefUse->getOperandNo();
assert(isa<UndefValue>(NewV->getOperand(OperandNo)));
NewV->setOperand(OperandNo, ValueWithNewAddrSpace.lookup(UndefUse->get()));
diff --git a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
index e8bbf2936da6..e87b622ab19f 100644
--- a/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
+++ b/llvm/lib/Transforms/Scalar/InstSimplifyPass.cpp
@@ -40,7 +40,7 @@ static bool runImpl(Function &F, const SimplifyQuery &SQ,
if (!SQ.DT->isReachableFromEntry(&BB))
continue;
- SmallVector<Instruction *, 8> DeadInstsInBB;
+ SmallVector<WeakTrackingVH, 8> DeadInstsInBB;
for (Instruction &I : BB) {
// The first time through the loop, ToSimplify is empty and we try to
// simplify all instructions. On later iterations, ToSimplify is not
diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
index 98c2fcb3dae0..9d0500419a7f 100644
--- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp
+++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp
@@ -13,6 +13,7 @@
#include "llvm/Transforms/Scalar/JumpThreading.h"
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/MapVector.h"
#include "llvm/ADT/Optional.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
@@ -170,7 +171,7 @@ FunctionPass *llvm::createJumpThreadingPass(int Threshold) {
}
JumpThreadingPass::JumpThreadingPass(int T) {
- BBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
+ DefaultBBDupThreshold = (T == -1) ? BBDuplicateThreshold : unsigned(T);
}
// Update branch probability information according to conditional
@@ -213,11 +214,16 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
if (!CondBr)
return;
- BranchProbability BP;
uint64_t TrueWeight, FalseWeight;
if (!CondBr->extractProfMetadata(TrueWeight, FalseWeight))
return;
+ if (TrueWeight + FalseWeight == 0)
+ // Zero branch_weights do not give a hint for computing branch probabilities.
+ // Technically they would lead to a division by zero, since the denominator
+ // is TrueWeight + FalseWeight.
+ return;
+
// Returns the outgoing edge of the dominating predecessor block
// that leads to the PhiNode's incoming block:
auto GetPredOutEdge =
@@ -252,10 +258,11 @@ static void updatePredecessorProfileMetadata(PHINode *PN, BasicBlock *BB) {
if (!CI || !CI->getType()->isIntegerTy(1))
continue;
- BP = (CI->isOne() ? BranchProbability::getBranchProbability(
- TrueWeight, TrueWeight + FalseWeight)
- : BranchProbability::getBranchProbability(
- FalseWeight, TrueWeight + FalseWeight));
+ BranchProbability BP =
+ (CI->isOne() ? BranchProbability::getBranchProbability(
+ TrueWeight, TrueWeight + FalseWeight)
+ : BranchProbability::getBranchProbability(
+ FalseWeight, TrueWeight + FalseWeight));
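+ // For example, with hypothetical weights !{i32 3, i32 1} this yields a
+ // probability of 3/4 when CI is one and 1/4 otherwise.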
auto PredOutEdge = GetPredOutEdge(PN->getIncomingBlock(i), BB);
if (!PredOutEdge.first)
@@ -298,8 +305,6 @@ bool JumpThreading::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
auto TLI = &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(F);
- // Get DT analysis before LVI. When LVI is initialized it conditionally adds
- // DT if it's available.
auto DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
auto LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto AA = &getAnalysis<AAResultsWrapperPass>().getAAResults();
@@ -316,7 +321,7 @@ bool JumpThreading::runOnFunction(Function &F) {
std::move(BFI), std::move(BPI));
if (PrintLVIAfterJumpThreading) {
dbgs() << "LVI for function '" << F.getName() << "':\n";
- LVI->printLVI(F, *DT, dbgs());
+ LVI->printLVI(F, DTU.getDomTree(), dbgs());
}
return Changed;
}
@@ -324,8 +329,6 @@ bool JumpThreading::runOnFunction(Function &F) {
PreservedAnalyses JumpThreadingPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
- // Get DT analysis before LVI. When LVI is initialized it conditionally adds
- // DT if it's available.
auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
auto &LVI = AM.getResult<LazyValueAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
@@ -374,6 +377,15 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
BFI = std::move(BFI_);
}
+ // Reduce the number of instructions duplicated when optimizing strictly for
+ // size.
+ if (BBDuplicateThreshold.getNumOccurrences())
+ BBDupThreshold = BBDuplicateThreshold;
+ else if (F.hasFnAttribute(Attribute::MinSize))
+ BBDupThreshold = 3;
+ else
+ BBDupThreshold = DefaultBBDupThreshold;
+
// JumpThreading must not process blocks unreachable from entry. It's a
// waste of compute time and can potentially lead to hangs.
SmallPtrSet<BasicBlock *, 16> Unreachable;
@@ -396,6 +408,12 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
continue;
while (ProcessBlock(&BB)) // Thread all of the branches we can over BB.
Changed = true;
+
+ // Jump threading may have introduced redundant debug values into BB
+ // which should be removed.
+ if (Changed)
+ RemoveRedundantDbgInstrs(&BB);
+
// Stop processing BB if it's the entry or is now deleted. The following
// routines attempt to eliminate BB, and locating a suitable replacement
// for the entry is non-trivial.
@@ -418,26 +436,27 @@ bool JumpThreadingPass::runImpl(Function &F, TargetLibraryInfo *TLI_,
// ProcessBlock doesn't thread BBs with unconditional TIs. However, if BB
// is "almost empty", we attempt to merge BB with its sole successor.
auto *BI = dyn_cast<BranchInst>(BB.getTerminator());
- if (BI && BI->isUnconditional() &&
- // The terminator must be the only non-phi instruction in BB.
- BB.getFirstNonPHIOrDbg()->isTerminator() &&
- // Don't alter Loop headers and latches to ensure another pass can
- // detect and transform nested loops later.
- !LoopHeaders.count(&BB) && !LoopHeaders.count(BI->getSuccessor(0)) &&
- TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
- // BB is valid for cleanup here because we passed in DTU. F remains
- // BB's parent until a DTU->getDomTree() event.
- LVI->eraseBlock(&BB);
- Changed = true;
+ if (BI && BI->isUnconditional()) {
+ BasicBlock *Succ = BI->getSuccessor(0);
+ if (
+ // The terminator must be the only non-phi instruction in BB.
+ BB.getFirstNonPHIOrDbg()->isTerminator() &&
+ // Don't alter Loop headers and latches to ensure another pass can
+ // detect and transform nested loops later.
+ !LoopHeaders.count(&BB) && !LoopHeaders.count(Succ) &&
+ TryToSimplifyUncondBranchFromEmptyBlock(&BB, DTU)) {
+ RemoveRedundantDbgInstrs(Succ);
+ // BB is valid for cleanup here because we passed in DTU. F remains
+ // BB's parent until a DTU->getDomTree() event.
+ LVI->eraseBlock(&BB);
+ Changed = true;
+ }
}
}
EverChanged |= Changed;
} while (Changed);
LoopHeaders.clear();
- // Flush only the Dominator Tree.
- DTU->getDomTree();
- LVI->enableDT();
return EverChanged;
}
@@ -592,20 +611,19 @@ static Constant *getKnownConstant(Value *Val, ConstantPreference Preference) {
/// This returns true if there were any known values.
bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
Value *V, BasicBlock *BB, PredValueInfo &Result,
- ConstantPreference Preference,
- DenseSet<std::pair<Value *, BasicBlock *>> &RecursionSet,
+ ConstantPreference Preference, DenseSet<Value *> &RecursionSet,
Instruction *CxtI) {
// This method walks up use-def chains recursively. Because of this, we could
// get into an infinite loop going around loops in the use-def chain. To
// prevent this, keep track of what values we've already visited and terminate
// the search if we loop back to them.
- if (!RecursionSet.insert(std::make_pair(V, BB)).second)
+ if (!RecursionSet.insert(V).second)
return false;
// If V is a constant, then it is known in all predecessors.
if (Constant *KC = getKnownConstant(V, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
- Result.push_back(std::make_pair(KC, Pred));
+ Result.emplace_back(KC, Pred);
return !Result.empty();
}
@@ -627,17 +645,12 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
// able to handle value inequalities better, for example if the compare is
// "X < 4" and "X < 3" is known true but "X < 4" itself is not available.
// Perhaps getConstantOnEdge should be smart enough to do this?
-
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
Constant *PredCst = LVI->getConstantOnEdge(V, P, BB, CxtI);
if (Constant *KC = getKnownConstant(PredCst, Preference))
- Result.push_back(std::make_pair(KC, P));
+ Result.emplace_back(KC, P);
}
return !Result.empty();
@@ -645,20 +658,16 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
/// If I is a PHI node, then we know the incoming values for any constants.
if (PHINode *PN = dyn_cast<PHINode>(I)) {
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
Value *InVal = PN->getIncomingValue(i);
if (Constant *KC = getKnownConstant(InVal, Preference)) {
- Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ Result.emplace_back(KC, PN->getIncomingBlock(i));
} else {
Constant *CI = LVI->getConstantOnEdge(InVal,
PN->getIncomingBlock(i),
BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference))
- Result.push_back(std::make_pair(KC, PN->getIncomingBlock(i)));
+ Result.emplace_back(KC, PN->getIncomingBlock(i));
}
}
@@ -757,7 +766,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
Constant *Folded = ConstantExpr::get(BO->getOpcode(), V, CI);
if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.push_back(std::make_pair(KC, LHSVal.second));
+ Result.emplace_back(KC, LHSVal.second);
}
}
@@ -779,10 +788,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
const DataLayout &DL = PN->getModule()->getDataLayout();
// We can do this simplification if any comparisons fold to true or false.
// See if any do.
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
BasicBlock *PredBB = PN->getIncomingBlock(i);
Value *LHS, *RHS;
@@ -813,7 +818,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
}
if (Constant *KC = getKnownConstant(Res, WantInteger))
- Result.push_back(std::make_pair(KC, PredBB));
+ Result.emplace_back(KC, PredBB);
}
return !Result.empty();
@@ -826,10 +831,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
if (!isa<Instruction>(CmpLHS) ||
cast<Instruction>(CmpLHS)->getParent() != BB) {
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a constant in a
// predecessor, use that information to try to thread this block.
@@ -840,7 +841,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
continue;
Constant *ResC = ConstantInt::get(CmpType, Res);
- Result.push_back(std::make_pair(ResC, P));
+ Result.emplace_back(ResC, P);
}
return !Result.empty();
@@ -858,10 +859,6 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
match(CmpLHS, m_Add(m_Value(AddLHS), m_ConstantInt(AddConst)))) {
if (!isa<Instruction>(AddLHS) ||
cast<Instruction>(AddLHS)->getParent() != BB) {
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
for (BasicBlock *P : predecessors(BB)) {
// If the value is known by LazyValueInfo to be a ConstantRange in
// a predecessor, use that information to try to thread this
@@ -883,7 +880,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
else
continue;
- Result.push_back(std::make_pair(ResC, P));
+ Result.emplace_back(ResC, P);
}
return !Result.empty();
@@ -901,7 +898,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
Constant *V = LHSVal.first;
Constant *Folded = ConstantExpr::getCompare(Pred, V, CmpConst);
if (Constant *KC = getKnownConstant(Folded, WantInteger))
- Result.push_back(std::make_pair(KC, LHSVal.second));
+ Result.emplace_back(KC, LHSVal.second);
}
return !Result.empty();
@@ -935,7 +932,7 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
// See if the select has a known constant value for this predecessor.
if (Constant *Val = KnownCond ? TrueVal : FalseVal)
- Result.push_back(std::make_pair(Val, C.second));
+ Result.emplace_back(Val, C.second);
}
return !Result.empty();
@@ -943,14 +940,10 @@ bool JumpThreadingPass::ComputeValueKnownInPredecessorsImpl(
}
// If all else fails, see if LVI can figure out a constant value for us.
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
Constant *CI = LVI->getConstant(V, BB, CxtI);
if (Constant *KC = getKnownConstant(CI, Preference)) {
for (BasicBlock *Pred : predecessors(BB))
- Result.push_back(std::make_pair(KC, Pred));
+ Result.emplace_back(KC, Pred);
}
return !Result.empty();
@@ -1106,10 +1099,6 @@ bool JumpThreadingPass::ProcessBlock(BasicBlock *BB) {
// threading is concerned.
assert(CondBr->isConditional() && "Threading on unconditional terminator");
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
LazyValueInfo::Tristate Ret =
LVI->getPredicateAt(CondCmp->getPredicate(), CondCmp->getOperand(0),
CondConst, CondBr);
@@ -1363,7 +1352,7 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
// If so, this load is partially redundant. Remember this info so that we
// can create a PHI node.
- AvailablePreds.push_back(std::make_pair(PredBB, PredAvailable));
+ AvailablePreds.emplace_back(PredBB, PredAvailable);
}
// If the loaded value isn't available in any predecessor, it isn't partially
@@ -1430,14 +1419,14 @@ bool JumpThreadingPass::SimplifyPartiallyRedundantLoad(LoadInst *LoadI) {
"Can't handle critical edge here!");
LoadInst *NewVal = new LoadInst(
LoadI->getType(), LoadedPtr->DoPHITranslation(LoadBB, UnavailablePred),
- LoadI->getName() + ".pr", false, MaybeAlign(LoadI->getAlignment()),
+ LoadI->getName() + ".pr", false, LoadI->getAlign(),
LoadI->getOrdering(), LoadI->getSyncScopeID(),
UnavailablePred->getTerminator());
NewVal->setDebugLoc(LoadI->getDebugLoc());
if (AATags)
NewVal->setAAMetadata(AATags);
- AvailablePreds.push_back(std::make_pair(UnavailablePred, NewVal));
+ AvailablePreds.emplace_back(UnavailablePred, NewVal);
}
// Now we know that each predecessor of this block has a value in
@@ -1496,56 +1485,70 @@ FindMostPopularDest(BasicBlock *BB,
// explicitly choose to ignore 'undef' destinations. We prefer to thread
// blocks with known and real destinations to threading undef. We'll handle
// them later if interesting.
- DenseMap<BasicBlock*, unsigned> DestPopularity;
+ MapVector<BasicBlock *, unsigned> DestPopularity;
+
+ // Populate DestPopularity with the successors in the order they appear in the
+ // successor list. This way, we ensure determinism by iterating it in the
+ // same order in std::max_element below. We map nullptr to 0 so that we can
+ // return nullptr when PredToDestList contains only nullptr.
+ DestPopularity[nullptr] = 0;
+ for (auto *SuccBB : successors(BB))
+ DestPopularity[SuccBB] = 0;
+
for (const auto &PredToDest : PredToDestList)
if (PredToDest.second)
DestPopularity[PredToDest.second]++;
- if (DestPopularity.empty())
- return nullptr;
-
// Find the most popular dest.
- DenseMap<BasicBlock*, unsigned>::iterator DPI = DestPopularity.begin();
- BasicBlock *MostPopularDest = DPI->first;
- unsigned Popularity = DPI->second;
- SmallVector<BasicBlock*, 4> SamePopularity;
-
- for (++DPI; DPI != DestPopularity.end(); ++DPI) {
- // If the popularity of this entry isn't higher than the popularity we've
- // seen so far, ignore it.
- if (DPI->second < Popularity)
- ; // ignore.
- else if (DPI->second == Popularity) {
- // If it is the same as what we've seen so far, keep track of it.
- SamePopularity.push_back(DPI->first);
- } else {
- // If it is more popular, remember it.
- SamePopularity.clear();
- MostPopularDest = DPI->first;
- Popularity = DPI->second;
- }
+ using VT = decltype(DestPopularity)::value_type;
+ auto MostPopular = std::max_element(
+ DestPopularity.begin(), DestPopularity.end(),
+ [](const VT &L, const VT &R) { return L.second < R.second; });
+
+ // Okay, we have finally picked the most popular destination.
+ return MostPopular->first;
+}
+
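The rewritten FindMostPopularDest above relies on two properties: MapVector iterates in insertion order, and std::max_element returns the first of several equally-large elements. Seeding the map with nullptr and then the successors in successor order therefore makes the tie-break deterministic. A small standalone sketch of the same idiom, with a vector of pairs standing in for MapVector:

#include <algorithm>
#include <string>
#include <utility>
#include <vector>

// Picks the most frequent destination. Counts is assumed non-empty (the pass
// always seeds it). On a tie, the entry inserted first wins because
// std::max_element keeps the first maximal element.
std::string mostPopular(
    const std::vector<std::pair<std::string, unsigned>> &Counts) {
  auto It = std::max_element(
      Counts.begin(), Counts.end(),
      [](const auto &L, const auto &R) { return L.second < R.second; });
  return It->first;
}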
+// Try to evaluate the value of V when the control flows from PredPredBB to
+// BB->getSinglePredecessor() and then on to BB.
+Constant *JumpThreadingPass::EvaluateOnPredecessorEdge(BasicBlock *BB,
+ BasicBlock *PredPredBB,
+ Value *V) {
+ BasicBlock *PredBB = BB->getSinglePredecessor();
+ assert(PredBB && "Expected a single predecessor");
+
+ if (Constant *Cst = dyn_cast<Constant>(V)) {
+ return Cst;
}
- // Okay, now we know the most popular destination. If there is more than one
- // destination, we need to determine one. This is arbitrary, but we need
- // to make a deterministic decision. Pick the first one that appears in the
- // successor list.
- if (!SamePopularity.empty()) {
- SamePopularity.push_back(MostPopularDest);
- Instruction *TI = BB->getTerminator();
- for (unsigned i = 0; ; ++i) {
- assert(i != TI->getNumSuccessors() && "Didn't find any successor!");
+ // Consult LVI if V is not an instruction in BB or PredBB.
+ Instruction *I = dyn_cast<Instruction>(V);
+ if (!I || (I->getParent() != BB && I->getParent() != PredBB)) {
+ return LVI->getConstantOnEdge(V, PredPredBB, PredBB, nullptr);
+ }
- if (!is_contained(SamePopularity, TI->getSuccessor(i)))
- continue;
+ // Look into a PHI argument.
+ if (PHINode *PHI = dyn_cast<PHINode>(V)) {
+ if (PHI->getParent() == PredBB)
+ return dyn_cast<Constant>(PHI->getIncomingValueForBlock(PredPredBB));
+ return nullptr;
+ }
- MostPopularDest = TI->getSuccessor(i);
- break;
+ // If we have a CmpInst, try to fold it for each incoming edge into PredBB.
+ if (CmpInst *CondCmp = dyn_cast<CmpInst>(V)) {
+ if (CondCmp->getParent() == BB) {
+ Constant *Op0 =
+ EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(0));
+ Constant *Op1 =
+ EvaluateOnPredecessorEdge(BB, PredPredBB, CondCmp->getOperand(1));
+ if (Op0 && Op1) {
+ return ConstantExpr::getCompare(CondCmp->getPredicate(), Op0, Op1);
+ }
}
+ return nullptr;
}
- // Okay, we have finally picked the most popular destination.
- return MostPopularDest;
+ return nullptr;
}
bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
@@ -1557,8 +1560,12 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
return false;
PredValueInfoTy PredValues;
- if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference, CxtI))
- return false;
+ if (!ComputeValueKnownInPredecessors(Cond, BB, PredValues, Preference,
+ CxtI)) {
+ // We don't have known values in predecessors. See if we can thread through
+ // BB and its sole predecessor.
+ return MaybeThreadThroughTwoBasicBlocks(BB, Cond);
+ }
assert(!PredValues.empty() &&
"ComputeValueKnownInPredecessors returned true with no values");
@@ -1624,7 +1631,7 @@ bool JumpThreadingPass::ProcessThreadableEdges(Value *Cond, BasicBlock *BB,
isa<CallBrInst>(Pred->getTerminator()))
continue;
- PredToDestList.push_back(std::make_pair(Pred, DestBB));
+ PredToDestList.emplace_back(Pred, DestBB);
}
// If all edges were unthreadable, we fail.
@@ -2015,6 +2022,205 @@ JumpThreadingPass::CloneInstructions(BasicBlock::iterator BI,
return ValueMapping;
}
+/// Attempt to thread through two successive basic blocks.
+bool JumpThreadingPass::MaybeThreadThroughTwoBasicBlocks(BasicBlock *BB,
+ Value *Cond) {
+ // Consider:
+ //
+ // PredBB:
+ // %var = phi i32* [ null, %bb1 ], [ @a, %bb2 ]
+ // %tobool = icmp eq i32 %cond, 0
+ // br i1 %tobool, label %BB, label ...
+ //
+ // BB:
+ // %cmp = icmp eq i32* %var, null
+ // br i1 %cmp, label ..., label ...
+ //
+ // We don't know the value of %var at BB even if we know which incoming edge
+ // we take to BB. However, once we duplicate PredBB for each of its incoming
+ // edges (say, PredBB1 and PredBB2), we know the value of %var in each copy of
+ // PredBB. Then we can thread edges PredBB1->BB and PredBB2->BB through BB.
+
+ // Require that BB end with a Branch for simplicity.
+ BranchInst *CondBr = dyn_cast<BranchInst>(BB->getTerminator());
+ if (!CondBr)
+ return false;
+
+ // BB must have exactly one predecessor.
+ BasicBlock *PredBB = BB->getSinglePredecessor();
+ if (!PredBB)
+ return false;
+
+ // Require that PredBB end with a conditional Branch. If PredBB ends with an
+ // unconditional branch, we should be merging PredBB and BB instead. For
+ // simplicity, we don't deal with a switch.
+ BranchInst *PredBBBranch = dyn_cast<BranchInst>(PredBB->getTerminator());
+ if (!PredBBBranch || PredBBBranch->isUnconditional())
+ return false;
+
+ // If PredBB has exactly one incoming edge, we don't gain anything by copying
+ // PredBB.
+ if (PredBB->getSinglePredecessor())
+ return false;
+
+ // Don't thread through PredBB if it contains a successor edge to itself, in
+ // which case we would infinite loop. Suppose we are threading an edge from
+ // PredPredBB through PredBB and BB to SuccBB with PredBB containing a
+ // successor edge to itself. If we allowed jump threading in this case, we
+ // could duplicate PredBB and BB as, say, PredBB.thread and BB.thread. Since
+ // PredBB.thread has a successor edge to PredBB, we would immediately come up
+ // with another jump threading opportunity from PredBB.thread through PredBB
+ // and BB to SuccBB. This jump threading would repeatedly occur. That is, we
+ // would keep peeling one iteration from PredBB.
+ if (llvm::is_contained(successors(PredBB), PredBB))
+ return false;
+
+ // Don't thread across a loop header.
+ if (LoopHeaders.count(PredBB))
+ return false;
+
+ // Avoid complication with duplicating EH pads.
+ if (PredBB->isEHPad())
+ return false;
+
+ // Find a predecessor that we can thread. For simplicity, we only consider a
+ // successor edge out of BB to which we thread exactly one incoming edge into
+ // PredBB.
+ unsigned ZeroCount = 0;
+ unsigned OneCount = 0;
+ BasicBlock *ZeroPred = nullptr;
+ BasicBlock *OnePred = nullptr;
+ for (BasicBlock *P : predecessors(PredBB)) {
+ if (ConstantInt *CI = dyn_cast_or_null<ConstantInt>(
+ EvaluateOnPredecessorEdge(BB, P, Cond))) {
+ if (CI->isZero()) {
+ ZeroCount++;
+ ZeroPred = P;
+ } else if (CI->isOne()) {
+ OneCount++;
+ OnePred = P;
+ }
+ }
+ }
+
+ // Disregard complicated cases where we have to thread multiple edges.
+ BasicBlock *PredPredBB;
+ if (ZeroCount == 1) {
+ PredPredBB = ZeroPred;
+ } else if (OneCount == 1) {
+ PredPredBB = OnePred;
+ } else {
+ return false;
+ }
+
+ BasicBlock *SuccBB = CondBr->getSuccessor(PredPredBB == ZeroPred);
+
+ // If threading to the same block as we come from, we would infinite loop.
+ if (SuccBB == BB) {
+ LLVM_DEBUG(dbgs() << " Not threading across BB '" << BB->getName()
+ << "' - would thread to self!\n");
+ return false;
+ }
+
+ // If threading this would thread across a loop header, don't thread the edge.
+ // See the comments above FindLoopHeaders for justifications and caveats.
+ if (LoopHeaders.count(BB) || LoopHeaders.count(SuccBB)) {
+ LLVM_DEBUG({
+ bool BBIsHeader = LoopHeaders.count(BB);
+ bool SuccIsHeader = LoopHeaders.count(SuccBB);
+ dbgs() << " Not threading across "
+ << (BBIsHeader ? "loop header BB '" : "block BB '")
+ << BB->getName() << "' to dest "
+ << (SuccIsHeader ? "loop header BB '" : "block BB '")
+ << SuccBB->getName()
+ << "' - it might create an irreducible loop!\n";
+ });
+ return false;
+ }
+
+ // Compute the cost of duplicating BB and PredBB.
+ unsigned BBCost =
+ getJumpThreadDuplicationCost(BB, BB->getTerminator(), BBDupThreshold);
+ unsigned PredBBCost = getJumpThreadDuplicationCost(
+ PredBB, PredBB->getTerminator(), BBDupThreshold);
+
+ // Give up if costs are too high. We need to check BBCost and PredBBCost
+ // individually before checking their sum because getJumpThreadDuplicationCost
+ // returns (unsigned)~0 for those basic blocks that cannot be duplicated.
+ if (BBCost > BBDupThreshold || PredBBCost > BBDupThreshold ||
+ BBCost + PredBBCost > BBDupThreshold) {
+ LLVM_DEBUG(dbgs() << " Not threading BB '" << BB->getName()
+ << "' - Cost is too high: " << PredBBCost
+ << " for PredBB, " << BBCost << "for BB\n");
+ return false;
+ }
+
+ // Now we are ready to duplicate PredBB.
+ ThreadThroughTwoBasicBlocks(PredPredBB, PredBB, BB, SuccBB);
+ return true;
+}
+
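The cost check in MaybeThreadThroughTwoBasicBlocks above tests each cost before their sum because the cost function uses ~0u as a "cannot duplicate" sentinel, and an unsigned sum involving that sentinel wraps around. A runnable sketch of the pitfall, with made-up numbers:

#include <cstdio>

int main() {
  unsigned Threshold = 6;
  unsigned BBCost = ~0u; // sentinel meaning "block cannot be duplicated"
  unsigned PredBBCost = 5;

  // The wrapped sum (here 4) would slip under the threshold on its own.
  std::printf("sum = %u\n", BBCost + PredBBCost);

  // Checking the individual costs first catches the sentinel before the sum
  // can wrap, which is what the pass does.
  bool Reject = BBCost > Threshold || PredBBCost > Threshold ||
                BBCost + PredBBCost > Threshold;
  std::printf("rejected = %s\n", Reject ? "true" : "false");
  return 0;
}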
+void JumpThreadingPass::ThreadThroughTwoBasicBlocks(BasicBlock *PredPredBB,
+ BasicBlock *PredBB,
+ BasicBlock *BB,
+ BasicBlock *SuccBB) {
+ LLVM_DEBUG(dbgs() << " Threading through '" << PredBB->getName() << "' and '"
+ << BB->getName() << "'\n");
+
+ BranchInst *CondBr = cast<BranchInst>(BB->getTerminator());
+ BranchInst *PredBBBranch = cast<BranchInst>(PredBB->getTerminator());
+
+ BasicBlock *NewBB =
+ BasicBlock::Create(PredBB->getContext(), PredBB->getName() + ".thread",
+ PredBB->getParent(), PredBB);
+ NewBB->moveAfter(PredBB);
+
+ // Set the block frequency of NewBB.
+ if (HasProfileData) {
+ auto NewBBFreq = BFI->getBlockFreq(PredPredBB) *
+ BPI->getEdgeProbability(PredPredBB, PredBB);
+ BFI->setBlockFreq(NewBB, NewBBFreq.getFrequency());
+ }
+
+ // We are going to have to map operands from the original PredBB block to the
+ // new copy of the block 'NewBB'. If there are PHI nodes in PredBB, evaluate
+ // them to account for entry from PredPredBB.
+ DenseMap<Instruction *, Value *> ValueMapping =
+ CloneInstructions(PredBB->begin(), PredBB->end(), NewBB, PredPredBB);
+
+ // Update the terminator of PredPredBB to jump to NewBB instead of PredBB.
+ // This removes PredPredBB from PredBB's predecessors, which requires us to
+ // simplify any PHI nodes in PredBB.
+ Instruction *PredPredTerm = PredPredBB->getTerminator();
+ for (unsigned i = 0, e = PredPredTerm->getNumSuccessors(); i != e; ++i)
+ if (PredPredTerm->getSuccessor(i) == PredBB) {
+ PredBB->removePredecessor(PredPredBB, true);
+ PredPredTerm->setSuccessor(i, NewBB);
+ }
+
+ AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(0), PredBB, NewBB,
+ ValueMapping);
+ AddPHINodeEntriesForMappedBlock(PredBBBranch->getSuccessor(1), PredBB, NewBB,
+ ValueMapping);
+
+ DTU->applyUpdatesPermissive(
+ {{DominatorTree::Insert, NewBB, CondBr->getSuccessor(0)},
+ {DominatorTree::Insert, NewBB, CondBr->getSuccessor(1)},
+ {DominatorTree::Insert, PredPredBB, NewBB},
+ {DominatorTree::Delete, PredPredBB, PredBB}});
+
+ UpdateSSA(PredBB, NewBB, ValueMapping);
+
+ // Clean up things like PHI nodes with single operands, dead instructions,
+ // etc.
+ SimplifyInstructionsInBlock(NewBB, TLI);
+ SimplifyInstructionsInBlock(PredBB, TLI);
+
+ SmallVector<BasicBlock *, 1> PredsToFactor;
+ PredsToFactor.push_back(NewBB);
+ ThreadEdge(BB, PredsToFactor, SuccBB);
+}
+
/// TryThreadEdge - Thread an edge if it's safe and profitable to do so.
bool JumpThreadingPass::TryThreadEdge(
BasicBlock *BB, const SmallVectorImpl<BasicBlock *> &PredBBs,
@@ -2078,10 +2284,6 @@ void JumpThreadingPass::ThreadEdge(BasicBlock *BB,
<< "' to '" << SuccBB->getName()
<< ", across block:\n " << *BB << "\n");
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
LVI->threadEdge(PredBB, BB, SuccBB);
BasicBlock *NewBB = BasicBlock::Create(BB->getContext(),
@@ -2246,8 +2448,7 @@ void JumpThreadingPass::UpdateBlockFreqAndEdgeWeight(BasicBlock *PredBB,
}
// Update edge probabilities in BPI.
- for (int I = 0, E = BBSuccProbs.size(); I < E; I++)
- BPI->setEdgeProbability(BB, I, BBSuccProbs[I]);
+ BPI->setEdgeProbability(BB, BBSuccProbs);
// Update the profile metadata as well.
//
@@ -2524,10 +2725,6 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
// Now check if one of the select values would allow us to constant fold the
// terminator in BB. We don't do the transform if both sides fold, those
// cases will be threaded in any case.
- if (DTU->hasPendingDomTreeUpdates())
- LVI->disableDT();
- else
- LVI->enableDT();
LazyValueInfo::Tristate LHSFolds =
LVI->getPredicateOnEdge(CondCmp->getPredicate(), SI->getOperand(1),
CondRHS, Pred, BB, CondCmp);
@@ -2565,6 +2762,16 @@ bool JumpThreadingPass::TryToUnfoldSelect(CmpInst *CondCmp, BasicBlock *BB) {
/// select is not jump-threaded, it will be folded again in the later
/// optimizations.
bool JumpThreadingPass::TryToUnfoldSelectInCurrBB(BasicBlock *BB) {
+ // This transform can introduce UB (a conditional branch that depends on a
+ // poison value) that was not present in the original program. See
+ // @TryToUnfoldSelectInCurrBB test in test/Transforms/JumpThreading/select.ll.
+ // Disable this transform under MemorySanitizer.
+ // FIXME: either delete it or replace with a valid transform. This issue is
+ // not limited to MemorySanitizer (but has only been observed as an MSan false
+ // positive in practice so far).
+ if (BB->getParent()->hasFnAttribute(Attribute::SanitizeMemory))
+ return false;
+
// If threading this would thread across a loop header, don't thread the edge.
// See the comments above FindLoopHeaders for justifications and caveats.
if (LoopHeaders.count(BB))
diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 8c33045c2380..1a22edaf8726 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -46,6 +46,7 @@
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
@@ -69,6 +70,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
@@ -151,11 +153,11 @@ static bool isSafeToExecuteUnconditionally(Instruction &Inst,
const Instruction *CtxI = nullptr);
static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
AliasSetTracker *CurAST, Loop *CurLoop,
- AliasAnalysis *AA);
+ AAResults *AA);
static bool pointerInvalidatedByLoopWithMSSA(MemorySSA *MSSA, MemoryUse *MU,
Loop *CurLoop,
SinkAndHoistLICMFlags &Flags);
-static Instruction *CloneInstructionInExitBlock(
+static Instruction *cloneInstructionInExitBlock(
Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU);
@@ -168,27 +170,24 @@ static void moveInstructionBefore(Instruction &I, Instruction &Dest,
namespace {
struct LoopInvariantCodeMotion {
- using ASTrackerMapTy = DenseMap<Loop *, std::unique_ptr<AliasSetTracker>>;
- bool runOnLoop(Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
ScalarEvolution *SE, MemorySSA *MSSA,
- OptimizationRemarkEmitter *ORE, bool DeleteAST);
+ OptimizationRemarkEmitter *ORE);
- ASTrackerMapTy &getLoopToAliasSetMap() { return LoopToAliasSetMap; }
LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
unsigned LicmMssaNoAccForPromotionCap)
: LicmMssaOptCap(LicmMssaOptCap),
LicmMssaNoAccForPromotionCap(LicmMssaNoAccForPromotionCap) {}
private:
- ASTrackerMapTy LoopToAliasSetMap;
unsigned LicmMssaOptCap;
unsigned LicmMssaNoAccForPromotionCap;
std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AliasAnalysis *AA);
+ collectAliasInfoForLoop(Loop *L, LoopInfo *LI, AAResults *AA);
std::unique_ptr<AliasSetTracker>
- collectAliasInfoForLoopWithMSSA(Loop *L, AliasAnalysis *AA,
+ collectAliasInfoForLoopWithMSSA(Loop *L, AAResults *AA,
MemorySSAUpdater *MSSAU);
};
@@ -202,13 +201,8 @@ struct LegacyLICMPass : public LoopPass {
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override {
- if (skipLoop(L)) {
- // If we have run LICM on a previous loop but now we are skipping
- // (because we've hit the opt-bisect limit), we need to clear the
- // loop alias information.
- LICM.getLoopToAliasSetMap().clear();
+ if (skipLoop(L))
return false;
- }
auto *SE = getAnalysisIfAvailable<ScalarEvolutionWrapperPass>();
MemorySSA *MSSA = EnableMSSALoopDependency
@@ -226,7 +220,7 @@ struct LegacyLICMPass : public LoopPass {
*L->getHeader()->getParent()),
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent()),
- SE ? &SE->getSE() : nullptr, MSSA, &ORE, false);
+ SE ? &SE->getSE() : nullptr, MSSA, &ORE);
}
/// This transformation requires natural loop information & requires that
@@ -244,53 +238,21 @@ struct LegacyLICMPass : public LoopPass {
getLoopAnalysisUsage(AU);
}
- using llvm::Pass::doFinalization;
-
- bool doFinalization() override {
- auto &AliasSetMap = LICM.getLoopToAliasSetMap();
- // All loops in the AliasSetMap should be cleaned up already. The only case
- // where we fail to do so is if an outer loop gets deleted before LICM
- // visits it.
- assert(all_of(AliasSetMap,
- [](LoopInvariantCodeMotion::ASTrackerMapTy::value_type &KV) {
- return !KV.first->getParentLoop();
- }) &&
- "Didn't free loop alias sets");
- AliasSetMap.clear();
- return false;
- }
-
private:
LoopInvariantCodeMotion LICM;
-
- /// cloneBasicBlockAnalysis - Simple Analysis hook. Clone alias set info.
- void cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
- Loop *L) override;
-
- /// deleteAnalysisValue - Simple Analysis hook. Delete value V from alias
- /// set.
- void deleteAnalysisValue(Value *V, Loop *L) override;
-
- /// Simple Analysis hook. Delete loop L from alias set map.
- void deleteAnalysisLoop(Loop *L) override;
};
} // namespace
PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR, LPMUpdater &) {
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
- Function *F = L.getHeader()->getParent();
-
- auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
- // FIXME: This should probably be optional rather than required.
- if (!ORE)
- report_fatal_error("LICM: OptimizationRemarkEmitterAnalysis not "
- "cached at a higher level");
+ // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
LoopInvariantCodeMotion LICM(LicmMssaOptCap, LicmMssaNoAccForPromotionCap);
if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.TLI, &AR.TTI, &AR.SE,
- AR.MSSA, ORE, true))
+ AR.MSSA, &ORE))
return PreservedAnalyses::all();
auto PA = getLoopPassPreservedAnalyses();
@@ -322,13 +284,10 @@ Pass *llvm::createLICMPass(unsigned LicmMssaOptCap,
/// Hoist expressions out of the specified loop. Note, alias info for inner
/// loops is not preserved, so it is not a good idea to run LICM multiple
/// times on one loop.
-/// We should delete AST for inner loops in the new pass manager to avoid
-/// memory leak.
-///
bool LoopInvariantCodeMotion::runOnLoop(
- Loop *L, AliasAnalysis *AA, LoopInfo *LI, DominatorTree *DT,
+ Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
TargetLibraryInfo *TLI, TargetTransformInfo *TTI, ScalarEvolution *SE,
- MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool DeleteAST) {
+ MemorySSA *MSSA, OptimizationRemarkEmitter *ORE) {
bool Changed = false;
assert(L->isLCSSAForm(*DT) && "Loop is not in LCSSA form.");
@@ -372,7 +331,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
BasicBlock *Preheader = L->getLoopPreheader();
// Compute loop safety information.
- ICFLoopSafetyInfo SafetyInfo(DT);
+ ICFLoopSafetyInfo SafetyInfo;
SafetyInfo.computeLoopSafetyInfo(L);
// We want to visit all of the instructions in this loop... that are not parts
@@ -476,11 +435,6 @@ bool LoopInvariantCodeMotion::runOnLoop(
assert((!L->getParentLoop() || L->getParentLoop()->isLCSSAForm(*DT)) &&
"Parent loop not left in LCSSA form after LICM!");
- // If this loop is nested inside of another one, save the alias information
- // for when we process the outer loop.
- if (!MSSAU.get() && CurAST.get() && L->getParentLoop() && !DeleteAST)
- LoopToAliasSetMap[L] = std::move(CurAST);
-
if (MSSAU.get() && VerifyMemorySSA)
MSSAU->getMemorySSA()->verifyMemorySSA();
@@ -494,7 +448,7 @@ bool LoopInvariantCodeMotion::runOnLoop(
/// first order w.r.t the DominatorTree. This allows us to visit uses before
/// definitions, allowing us to sink a loop body in one pass without iteration.
///
-bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+bool llvm::sinkRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, TargetLibraryInfo *TLI,
TargetTransformInfo *TTI, Loop *CurLoop,
AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
@@ -529,6 +483,7 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// used in the loop, instead, just delete it.
if (isInstructionTriviallyDead(&I, TLI)) {
LLVM_DEBUG(dbgs() << "LICM deleting dead inst: " << I << '\n');
+ salvageKnowledge(&I);
salvageDebugInfo(I);
++II;
eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
@@ -542,13 +497,14 @@ bool llvm::sinkRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// operands of the instruction are loop invariant.
//
bool FreeInLoop = false;
- if (isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
+ if (!I.mayHaveSideEffects() &&
+ isNotUsedOrFreeInLoop(I, CurLoop, SafetyInfo, TTI, FreeInLoop) &&
canSinkOrHoistInst(I, AA, DT, CurLoop, CurAST, MSSAU, true, &Flags,
- ORE) &&
- !I.mayHaveSideEffects()) {
+ ORE)) {
if (sink(I, LI, DT, CurLoop, SafetyInfo, MSSAU, ORE)) {
if (!FreeInLoop) {
++II;
+ salvageDebugInfo(I);
eraseInstruction(I, *SafetyInfo, CurAST, MSSAU);
}
Changed = true;
@@ -790,47 +746,12 @@ public:
};
} // namespace
-
-/// Return true if we know how to rewrite all uses of the given alloca after
-/// hoisting it out of the loop. The main concerns are a) potential captures
-/// and b) invariant.start markers which don't capture, but are no longer
-/// valid w/o a corresponding invariant.end.
-static bool canRewriteUsesOfAlloca(AllocaInst &AI) {
- // TODO: This looks a lot like capture tracking, but we need to remove any
- // invariant starts if we extend the lifetime of the alloca by hoisting it.
- // We should probably refactor capture tracking into a form which allows us
- // to reuse the relevant bits and remove the duplicated logic here.
-
- SmallVector<Use *, 16> Worklist;
- for (Use &U : AI.uses())
- Worklist.push_back(&U);
-
- unsigned NumUsesExplored = 0;
- while (!Worklist.empty()) {
- Use *U = Worklist.pop_back_val();
- Instruction *I = cast<Instruction>(U->getUser());
- NumUsesExplored++;
- if (NumUsesExplored > DefaultMaxUsesToExplore)
- return false;
- // Non capturing, terminating uses
- if (isa<LoadInst>(I) ||
- (isa<StoreInst>(I) && U->getOperandNo() == 1))
- continue;
- // Non capturing, non-terminating
- if (!isa<BitCastInst>(I) && !isa<GetElementPtrInst>(I))
- return false;
- for (Use &U : I->uses())
- Worklist.push_back(&U);
- }
- return true;
-}
-
/// Walk the specified region of the CFG (defined by all blocks dominated by
/// the specified block, and that are in the current loop) in depth first
/// order w.r.t the DominatorTree. This allows us to visit definitions before
/// uses, allowing us to hoist a loop body in one pass without iteration.
///
-bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
+bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
DominatorTree *DT, TargetLibraryInfo *TLI, Loop *CurLoop,
AliasSetTracker *CurAST, MemorySSAUpdater *MSSAU,
ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo,
@@ -901,9 +822,8 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
// Attempt to remove floating point division out of the loop by
// converting it to a reciprocal multiplication.
- if (I.getOpcode() == Instruction::FDiv &&
- CurLoop->isLoopInvariant(I.getOperand(1)) &&
- I.hasAllowReciprocal()) {
+ if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
+ CurLoop->isLoopInvariant(I.getOperand(1))) {
auto Divisor = I.getOperand(1);
auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
@@ -945,16 +865,6 @@ bool llvm::hoistRegion(DomTreeNode *N, AliasAnalysis *AA, LoopInfo *LI,
continue;
}
- if (isa<AllocaInst>(&I) &&
- SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
- canRewriteUsesOfAlloca(cast<AllocaInst>(I))) {
- hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
- MSSAU, SE, ORE);
- HoistedInstructions.push_back(&I);
- Changed = true;
- continue;
- }
-
if (PHINode *PN = dyn_cast<PHINode>(&I)) {
if (CFH.canHoistPHI(PN)) {
// Redirect incoming blocks first to ensure that we create hoisted
@@ -1081,12 +991,12 @@ namespace {
bool isHoistableAndSinkableInst(Instruction &I) {
// Only these instructions are hoistable/sinkable.
return (isa<LoadInst>(I) || isa<StoreInst>(I) || isa<CallInst>(I) ||
- isa<FenceInst>(I) || isa<CastInst>(I) ||
- isa<UnaryOperator>(I) || isa<BinaryOperator>(I) ||
- isa<SelectInst>(I) || isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
+ isa<FenceInst>(I) || isa<CastInst>(I) || isa<UnaryOperator>(I) ||
+ isa<BinaryOperator>(I) || isa<SelectInst>(I) ||
+ isa<GetElementPtrInst>(I) || isa<CmpInst>(I) ||
isa<InsertElementInst>(I) || isa<ExtractElementInst>(I) ||
isa<ShuffleVectorInst>(I) || isa<ExtractValueInst>(I) ||
- isa<InsertValueInst>(I));
+ isa<InsertValueInst>(I) || isa<FreezeInst>(I));
}
/// Return true if all of the alias sets within this AST are known not to
/// contain a Mod, or if MSSA knows there are no MemoryDefs in the loop.
@@ -1198,11 +1108,11 @@ bool llvm::canSinkOrHoistInst(Instruction &I, AAResults *AA, DominatorTree *DT,
FunctionModRefBehavior Behavior = AA->getModRefBehavior(CI);
if (Behavior == FMRB_DoesNotAccessMemory)
return true;
- if (AliasAnalysis::onlyReadsMemory(Behavior)) {
+ if (AAResults::onlyReadsMemory(Behavior)) {
// A readonly argmemonly function only reads from memory pointed to by
// its arguments with arbitrary offsets. If we can prove there are no
// writes to this memory in the loop, we can hoist or sink.
- if (AliasAnalysis::onlyAccessesArgPointees(Behavior)) {
+ if (AAResults::onlyAccessesArgPointees(Behavior)) {
// TODO: expand to writeable arguments
for (Value *Op : CI->arg_operands())
if (Op->getType()->isPointerTy()) {
@@ -1351,7 +1261,8 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
const TargetTransformInfo *TTI) {
if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(&I)) {
- if (TTI->getUserCost(GEP) != TargetTransformInfo::TCC_Free)
+ if (TTI->getUserCost(GEP, TargetTransformInfo::TCK_SizeAndLatency) !=
+ TargetTransformInfo::TCC_Free)
return false;
// For a GEP, we cannot simply use getUserCost because currently it
// optimistically assumes that a GEP will fold into addressing mode
@@ -1366,7 +1277,8 @@ static bool isFreeInLoop(const Instruction &I, const Loop *CurLoop,
}
return true;
} else
- return TTI->getUserCost(&I) == TargetTransformInfo::TCC_Free;
+ return TTI->getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency) ==
+ TargetTransformInfo::TCC_Free;
}
/// Return true if the only users of this instruction are outside of
@@ -1407,7 +1319,7 @@ static bool isNotUsedOrFreeInLoop(const Instruction &I, const Loop *CurLoop,
return true;
}
-static Instruction *CloneInstructionInExitBlock(
+static Instruction *cloneInstructionInExitBlock(
Instruction &I, BasicBlock &ExitBlock, PHINode &PN, const LoopInfo *LI,
const LoopSafetyInfo *SafetyInfo, MemorySSAUpdater *MSSAU) {
Instruction *New;
@@ -1520,7 +1432,7 @@ static Instruction *sinkThroughTriviallyReplaceablePHI(
if (It != SunkCopies.end())
New = It->second;
else
- New = SunkCopies[ExitBlock] = CloneInstructionInExitBlock(
+ New = SunkCopies[ExitBlock] = cloneInstructionInExitBlock(
*I, *ExitBlock, *TPN, LI, SafetyInfo, MSSAU);
return New;
}
@@ -1537,7 +1449,8 @@ static bool canSplitPredecessors(PHINode *PN, LoopSafetyInfo *SafetyInfo) {
return false;
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
BasicBlock *BBPred = *PI;
- if (isa<IndirectBrInst>(BBPred->getTerminator()))
+ if (isa<IndirectBrInst>(BBPred->getTerminator()) ||
+ isa<CallBrInst>(BBPred->getTerminator()))
return false;
}
return true;
@@ -1857,7 +1770,7 @@ public:
StoreInst *NewSI = new StoreInst(LiveInValue, Ptr, InsertPos);
if (UnorderedAtomic)
NewSI->setOrdering(AtomicOrdering::Unordered);
- NewSI->setAlignment(MaybeAlign(Alignment));
+ NewSI->setAlignment(Align(Alignment));
NewSI->setDebugLoc(DL);
if (AATags)
NewSI->setAAMetadata(AATags);
@@ -1981,7 +1894,7 @@ bool llvm::promoteLoopAccessesToScalars(
// We start with an alignment of one and try to find instructions that allow
// us to prove better alignment.
- unsigned Alignment = 1;
+ Align Alignment;
// Keep track of which types of access we see
bool SawUnorderedAtomic = false;
bool SawNotAtomic = false;
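The promotion code above starts from an alignment of one and only raises it when it can also prove the access is safe to execute unconditionally. A rough standalone analog using plain integers (the pass itself uses llvm::Align; the function name and ProvedSafe flag are illustrative):

#include <cstdint>

// Raise the running alignment to the access's alignment, but only when the
// access was also proven safe, roughly mirroring the guaranteed-to-execute
// check that guards the update in the pass.
uint64_t raiseAlignment(uint64_t Current, uint64_t InstAlignment,
                        bool ProvedSafe) {
  if (ProvedSafe && InstAlignment > Current)
    return InstAlignment;
  return Current;
}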
@@ -2029,10 +1942,7 @@ bool llvm::promoteLoopAccessesToScalars(
SawUnorderedAtomic |= Load->isAtomic();
SawNotAtomic |= !Load->isAtomic();
- unsigned InstAlignment = Load->getAlignment();
- if (!InstAlignment)
- InstAlignment =
- MDL.getABITypeAlignment(Load->getType());
+ Align InstAlignment = Load->getAlign();
// Note that proving a load safe to speculate requires proving
// sufficient alignment at the target location. Proving it guaranteed
@@ -2060,10 +1970,7 @@ bool llvm::promoteLoopAccessesToScalars(
// already know that promotion is safe, since it may have higher
// alignment than any other guaranteed stores, in which case we can
// raise the alignment on the promoted store.
- unsigned InstAlignment = Store->getAlignment();
- if (!InstAlignment)
- InstAlignment =
- MDL.getABITypeAlignment(Store->getValueOperand()->getType());
+ Align InstAlignment = Store->getAlign();
if (!DereferenceableInPH || !SafeToInsertStore ||
(InstAlignment > Alignment)) {
@@ -2090,8 +1997,7 @@ bool llvm::promoteLoopAccessesToScalars(
if (!DereferenceableInPH) {
DereferenceableInPH = isDereferenceableAndAlignedPointer(
Store->getPointerOperand(), Store->getValueOperand()->getType(),
- MaybeAlign(Store->getAlignment()), MDL,
- Preheader->getTerminator(), DT);
+ Store->getAlign(), MDL, Preheader->getTerminator(), DT);
}
} else
return false; // Not a load or store.
@@ -2156,18 +2062,19 @@ bool llvm::promoteLoopAccessesToScalars(
});
++NumPromoted;
- // Grab a debug location for the inserted loads/stores; given that the
- // inserted loads/stores have little relation to the original loads/stores,
- // this code just arbitrarily picks a location from one, since any debug
- // location is better than none.
- DebugLoc DL = LoopUses[0]->getDebugLoc();
+ // Look at all the loop uses, and try to merge their locations.
+ std::vector<const DILocation *> LoopUsesLocs;
+ for (auto U : LoopUses)
+ LoopUsesLocs.push_back(U->getDebugLoc().get());
+ auto DL = DebugLoc(DILocation::getMergedLocations(LoopUsesLocs));
// We use the SSAUpdater interface to insert phi nodes as required.
SmallVector<PHINode *, 16> NewPHIs;
SSAUpdater SSA(&NewPHIs);
LoopPromoter Promoter(SomePtr, LoopUses, SSA, PointerMustAliases, ExitBlocks,
InsertPts, MSSAInsertPts, PIC, *CurAST, MSSAU, *LI, DL,
- Alignment, SawUnorderedAtomic, AATags, *SafetyInfo);
+ Alignment.value(), SawUnorderedAtomic, AATags,
+ *SafetyInfo);
// Set up the preheader to have a definition of the value. It is the live-out
// value from the preheader that uses in the loop will use.
@@ -2176,8 +2083,8 @@ bool llvm::promoteLoopAccessesToScalars(
SomePtr->getName() + ".promoted", Preheader->getTerminator());
if (SawUnorderedAtomic)
PreheaderLoad->setOrdering(AtomicOrdering::Unordered);
- PreheaderLoad->setAlignment(MaybeAlign(Alignment));
- PreheaderLoad->setDebugLoc(DL);
+ PreheaderLoad->setAlignment(Alignment);
+ PreheaderLoad->setDebugLoc(DebugLoc());
if (AATags)
PreheaderLoad->setAAMetadata(AATags);
SSA.AddAvailableValue(Preheader, PreheaderLoad);
@@ -2206,41 +2113,13 @@ bool llvm::promoteLoopAccessesToScalars(
/// Returns an owning pointer to an alias set which incorporates aliasing info
/// from L and all subloops of L.
-/// FIXME: In new pass manager, there is no helper function to handle loop
-/// analysis such as cloneBasicBlockAnalysis, so the AST needs to be recomputed
-/// from scratch for every loop. Hook up with the helper functions when
-/// available in the new pass manager to avoid redundant computation.
std::unique_ptr<AliasSetTracker>
LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
- AliasAnalysis *AA) {
- std::unique_ptr<AliasSetTracker> CurAST;
- SmallVector<Loop *, 4> RecomputeLoops;
- for (Loop *InnerL : L->getSubLoops()) {
- auto MapI = LoopToAliasSetMap.find(InnerL);
- // If the AST for this inner loop is missing it may have been merged into
- // some other loop's AST and then that loop unrolled, and so we need to
- // recompute it.
- if (MapI == LoopToAliasSetMap.end()) {
- RecomputeLoops.push_back(InnerL);
- continue;
- }
- std::unique_ptr<AliasSetTracker> InnerAST = std::move(MapI->second);
+ AAResults *AA) {
+ auto CurAST = std::make_unique<AliasSetTracker>(*AA);
- if (CurAST) {
- // What if InnerLoop was modified by other passes ?
- // Once we've incorporated the inner loop's AST into ours, we don't need
- // the subloop's anymore.
- CurAST->add(*InnerAST);
- } else {
- CurAST = std::move(InnerAST);
- }
- LoopToAliasSetMap.erase(MapI);
- }
- if (!CurAST)
- CurAST = std::make_unique<AliasSetTracker>(*AA);
-
- // Add everything from the sub loops that are no longer directly available.
- for (Loop *InnerL : RecomputeLoops)
+ // Add everything from all the sub loops.
+ for (Loop *InnerL : L->getSubLoops())
for (BasicBlock *BB : InnerL->blocks())
CurAST->add(*BB);
@@ -2254,46 +2133,16 @@ LoopInvariantCodeMotion::collectAliasInfoForLoop(Loop *L, LoopInfo *LI,
std::unique_ptr<AliasSetTracker>
LoopInvariantCodeMotion::collectAliasInfoForLoopWithMSSA(
- Loop *L, AliasAnalysis *AA, MemorySSAUpdater *MSSAU) {
+ Loop *L, AAResults *AA, MemorySSAUpdater *MSSAU) {
auto *MSSA = MSSAU->getMemorySSA();
auto CurAST = std::make_unique<AliasSetTracker>(*AA, MSSA, L);
CurAST->addAllInstructionsInLoopUsingMSSA();
return CurAST;
}
-/// Simple analysis hook. Clone alias set info.
-///
-void LegacyLICMPass::cloneBasicBlockAnalysis(BasicBlock *From, BasicBlock *To,
- Loop *L) {
- auto ASTIt = LICM.getLoopToAliasSetMap().find(L);
- if (ASTIt == LICM.getLoopToAliasSetMap().end())
- return;
-
- ASTIt->second->copyValue(From, To);
-}
-
-/// Simple Analysis hook. Delete value V from alias set
-///
-void LegacyLICMPass::deleteAnalysisValue(Value *V, Loop *L) {
- auto ASTIt = LICM.getLoopToAliasSetMap().find(L);
- if (ASTIt == LICM.getLoopToAliasSetMap().end())
- return;
-
- ASTIt->second->deleteValue(V);
-}
-
-/// Simple Analysis hook. Delete value L from alias set map.
-///
-void LegacyLICMPass::deleteAnalysisLoop(Loop *L) {
- if (!LICM.getLoopToAliasSetMap().count(L))
- return;
-
- LICM.getLoopToAliasSetMap().erase(L);
-}
-
static bool pointerInvalidatedByLoop(MemoryLocation MemLoc,
AliasSetTracker *CurAST, Loop *CurLoop,
- AliasAnalysis *AA) {
+ AAResults *AA) {
// First check to see if any of the basic blocks in CurLoop invalidate *V.
bool isInvalidatedAccordingToAST = CurAST->getAliasSetFor(MemLoc).isMod();
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index ab65f56d088f..687e14d6d7d2 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -21,7 +21,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
@@ -32,6 +31,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
using namespace llvm;
@@ -61,10 +61,10 @@ namespace {
/// Loop prefetch implementation class.
class LoopDataPrefetch {
public:
- LoopDataPrefetch(AssumptionCache *AC, LoopInfo *LI, ScalarEvolution *SE,
- const TargetTransformInfo *TTI,
+ LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI,
+ ScalarEvolution *SE, const TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE)
- : AC(AC), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
+ : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {}
bool run();
@@ -73,12 +73,16 @@ private:
/// Check if the stride of the accesses is large enough to
/// warrant a prefetch.
- bool isStrideLargeEnough(const SCEVAddRecExpr *AR);
+ bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride);
- unsigned getMinPrefetchStride() {
+ unsigned getMinPrefetchStride(unsigned NumMemAccesses,
+ unsigned NumStridedMemAccesses,
+ unsigned NumPrefetches,
+ bool HasCall) {
if (MinPrefetchStride.getNumOccurrences() > 0)
return MinPrefetchStride;
- return TTI->getMinPrefetchStride();
+ return TTI->getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ NumPrefetches, HasCall);
}
unsigned getPrefetchDistance() {
@@ -93,7 +97,14 @@ private:
return TTI->getMaxPrefetchIterationsAhead();
}
+ bool doPrefetchWrites() {
+ if (PrefetchWrites.getNumOccurrences() > 0)
+ return PrefetchWrites;
+ return TTI->enableWritePrefetching();
+ }
+
AssumptionCache *AC;
+ DominatorTree *DT;
LoopInfo *LI;
ScalarEvolution *SE;
const TargetTransformInfo *TTI;
@@ -110,6 +121,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
+ AU.addRequired<DominatorTreeWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<LoopInfoWrapperPass>();
@@ -138,8 +150,8 @@ FunctionPass *llvm::createLoopDataPrefetchPass() {
return new LoopDataPrefetchLegacyPass();
}
-bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
- unsigned TargetMinStride = getMinPrefetchStride();
+bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR,
+ unsigned TargetMinStride) {
// No need to check if any stride goes.
if (TargetMinStride <= 1)
return true;
@@ -156,6 +168,7 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR) {
PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
FunctionAnalysisManager &AM) {
+ DominatorTree *DT = &AM.getResult<DominatorTreeAnalysis>(F);
LoopInfo *LI = &AM.getResult<LoopAnalysis>(F);
ScalarEvolution *SE = &AM.getResult<ScalarEvolutionAnalysis>(F);
AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
@@ -163,7 +176,7 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F,
&AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
const TargetTransformInfo *TTI = &AM.getResult<TargetIRAnalysis>(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
bool Changed = LDP.run();
if (Changed) {
@@ -180,6 +193,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
if (skipFunction(F))
return false;
+ DominatorTree *DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
LoopInfo *LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
ScalarEvolution *SE = &getAnalysis<ScalarEvolutionWrapperPass>().getSE();
AssumptionCache *AC =
@@ -189,7 +203,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) {
const TargetTransformInfo *TTI =
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- LoopDataPrefetch LDP(AC, LI, SE, TTI, ORE);
+ LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE);
return LDP.run();
}
@@ -210,6 +224,49 @@ bool LoopDataPrefetch::run() {
return MadeChange;
}
+/// A record for a potential prefetch made during the initial scan of the
+/// loop. This is used to let a single prefetch target multiple memory accesses.
+struct Prefetch {
+ /// The address formula for this prefetch as returned by ScalarEvolution.
+ const SCEVAddRecExpr *LSCEVAddRec;
+ /// The point of insertion for the prefetch instruction.
+ Instruction *InsertPt;
+ /// True if targeting a write memory access.
+ bool Writes;
+ /// The (first seen) prefetched instruction.
+ Instruction *MemI;
+
+ /// Constructor to create a new Prefetch for \p I.
+ Prefetch(const SCEVAddRecExpr *L, Instruction *I)
+ : LSCEVAddRec(L), InsertPt(nullptr), Writes(false), MemI(nullptr) {
+ addInstruction(I);
+ };
+
+ /// Add the instruction \param I to this prefetch. If it's not the first
+ /// one, 'InsertPt' and 'Writes' will be updated as required.
+ /// \param PtrDiff the known constant address difference to the first added
+ /// instruction.
+ void addInstruction(Instruction *I, DominatorTree *DT = nullptr,
+ int64_t PtrDiff = 0) {
+ if (!InsertPt) {
+ MemI = I;
+ InsertPt = I;
+ Writes = isa<StoreInst>(I);
+ } else {
+ BasicBlock *PrefBB = InsertPt->getParent();
+ BasicBlock *InsBB = I->getParent();
+ if (PrefBB != InsBB) {
+ BasicBlock *DomBB = DT->findNearestCommonDominator(PrefBB, InsBB);
+ if (DomBB != PrefBB)
+ InsertPt = DomBB->getTerminator();
+ }
+
+ if (isa<StoreInst>(I) && PtrDiff == 0)
+ Writes = true;
+ }
+ }
+};
+
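The Prefetch record above exists so that several accesses landing in the same cache line share one prefetch, with the insertion point hoisted to a common dominator when the accesses sit in different blocks. A simplified, standalone sketch of just the grouping step, using constant byte offsets in place of SCEV differences:

#include <cstdint>
#include <cstdlib>
#include <vector>

// Simplified stand-in for the pass's Prefetch record: one prefetch covers all
// accesses whose constant offset difference stays within a cache line.
struct PrefetchGroup {
  int64_t LeaderOffset = 0; // offset of the first access in the group
  unsigned NumAccesses = 1;
};

// Group accesses (given as byte offsets off a common strided base) so that at
// most one prefetch is emitted per cache line.
std::vector<PrefetchGroup> groupByCacheLine(const std::vector<int64_t> &Offsets,
                                            int64_t CacheLineSize) {
  std::vector<PrefetchGroup> Groups;
  for (int64_t Off : Offsets) {
    bool FoldedIn = false;
    for (PrefetchGroup &G : Groups)
      if (std::llabs(Off - G.LeaderOffset) < CacheLineSize) {
        ++G.NumAccesses; // reuse the existing prefetch, as addInstruction does
        FoldedIn = true;
        break;
      }
    if (!FoldedIn)
      Groups.push_back({Off});
  }
  return Groups;
}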
bool LoopDataPrefetch::runOnLoop(Loop *L) {
bool MadeChange = false;
@@ -222,15 +279,22 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
// Calculate the number of iterations ahead to prefetch
CodeMetrics Metrics;
+ bool HasCall = false;
for (const auto BB : L->blocks()) {
// If the loop already has prefetches, then assume that the user knows
// what they are doing and don't add any more.
- for (auto &I : *BB)
- if (CallInst *CI = dyn_cast<CallInst>(&I))
- if (Function *F = CI->getCalledFunction())
+ for (auto &I : *BB) {
+ if (isa<CallInst>(&I) || isa<InvokeInst>(&I)) {
+ if (const Function *F = cast<CallBase>(I).getCalledFunction()) {
if (F->getIntrinsicID() == Intrinsic::prefetch)
return MadeChange;
-
+ if (TTI->isLoweredToCall(F))
+ HasCall = true;
+ } else { // indirect call.
+ HasCall = true;
+ }
+ }
+ }
Metrics.analyzeBasicBlock(BB, *TTI, EphValues);
}
unsigned LoopSize = Metrics.NumInsts;
@@ -244,12 +308,14 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
if (ItersAhead > getMaxPrefetchIterationsAhead())
return MadeChange;
- LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
- << " iterations ahead (loop size: " << LoopSize << ") in "
- << L->getHeader()->getParent()->getName() << ": " << *L);
+ unsigned ConstantMaxTripCount = SE->getSmallConstantMaxTripCount(L);
+ if (ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1)
+ return MadeChange;
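The new early exit above skips loops whose known maximum trip count is smaller than the lookahead: prefetching N iterations ahead only pays off if the loop actually runs that far. A worked example with hypothetical figures (the exact lookahead computation lives outside the hunks shown here):

#include <cstdio>

int main() {
  // Hypothetical figures: a 256-instruction prefetch distance and a
  // 32-instruction loop body give a lookahead of 8 iterations.
  unsigned PrefetchDistance = 256;
  unsigned LoopSize = 32;
  unsigned ItersAhead = PrefetchDistance / LoopSize; // 8

  // If the loop is known to run at most 4 iterations, every prefetch would
  // target an iteration that never executes, so the pass bails out.
  unsigned ConstantMaxTripCount = 4;
  bool Skip = ConstantMaxTripCount && ConstantMaxTripCount < ItersAhead + 1;
  std::printf("skip prefetching: %s\n", Skip ? "yes" : "no");
  return 0;
}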
- SmallVector<std::pair<Instruction *, const SCEVAddRecExpr *>, 16> PrefLoads;
- for (const auto BB : L->blocks()) {
+ unsigned NumMemAccesses = 0;
+ unsigned NumStridedMemAccesses = 0;
+ SmallVector<Prefetch, 16> Prefetches;
+ for (const auto BB : L->blocks())
for (auto &I : *BB) {
Value *PtrValue;
Instruction *MemI;
@@ -258,7 +324,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
MemI = LMemI;
PtrValue = LMemI->getPointerOperand();
} else if (StoreInst *SMemI = dyn_cast<StoreInst>(&I)) {
- if (!PrefetchWrites) continue;
+ if (!doPrefetchWrites()) continue;
MemI = SMemI;
PtrValue = SMemI->getPointerOperand();
} else continue;
@@ -266,7 +332,7 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace();
if (PtrAddrSpace)
continue;
-
+ NumMemAccesses++;
if (L->isLoopInvariant(PtrValue))
continue;
@@ -274,62 +340,79 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
const SCEVAddRecExpr *LSCEVAddRec = dyn_cast<SCEVAddRecExpr>(LSCEV);
if (!LSCEVAddRec)
continue;
+ NumStridedMemAccesses++;
- // Check if the stride of the accesses is large enough to warrant a
- // prefetch.
- if (!isStrideLargeEnough(LSCEVAddRec))
- continue;
-
- // We don't want to double prefetch individual cache lines. If this load
- // is known to be within one cache line of some other load that has
- // already been prefetched, then don't prefetch this one as well.
+ // We don't want to double prefetch individual cache lines. If this
+ // access is known to be within one cache line of some other one that
+ // has already been prefetched, then don't prefetch this one as well.
bool DupPref = false;
- for (const auto &PrefLoad : PrefLoads) {
- const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, PrefLoad.second);
+ for (auto &Pref : Prefetches) {
+ const SCEV *PtrDiff = SE->getMinusSCEV(LSCEVAddRec, Pref.LSCEVAddRec);
if (const SCEVConstant *ConstPtrDiff =
dyn_cast<SCEVConstant>(PtrDiff)) {
int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue());
if (PD < (int64_t) TTI->getCacheLineSize()) {
+ Pref.addInstruction(MemI, DT, PD);
DupPref = true;
break;
}
}
}
- if (DupPref)
- continue;
+ if (!DupPref)
+ Prefetches.push_back(Prefetch(LSCEVAddRec, MemI));
+ }
- const SCEV *NextLSCEV = SE->getAddExpr(LSCEVAddRec, SE->getMulExpr(
- SE->getConstant(LSCEVAddRec->getType(), ItersAhead),
- LSCEVAddRec->getStepRecurrence(*SE)));
- if (!isSafeToExpand(NextLSCEV, *SE))
- continue;
+ unsigned TargetMinStride =
+ getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses,
+ Prefetches.size(), HasCall);
- PrefLoads.push_back(std::make_pair(MemI, LSCEVAddRec));
-
- Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), PtrAddrSpace);
- SCEVExpander SCEVE(*SE, I.getModule()->getDataLayout(), "prefaddr");
- Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, MemI);
-
- IRBuilder<> Builder(MemI);
- Module *M = BB->getParent()->getParent();
- Type *I32 = Type::getInt32Ty(BB->getContext());
- Function *PrefetchFunc = Intrinsic::getDeclaration(
- M, Intrinsic::prefetch, PrefPtrValue->getType());
- Builder.CreateCall(
- PrefetchFunc,
- {PrefPtrValue,
- ConstantInt::get(I32, MemI->mayReadFromMemory() ? 0 : 1),
- ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
- ++NumPrefetches;
- LLVM_DEBUG(dbgs() << " Access: " << *PtrValue << ", SCEV: " << *LSCEV
- << "\n");
- ORE->emit([&]() {
- return OptimizationRemark(DEBUG_TYPE, "Prefetched", MemI)
- << "prefetched memory access";
+ LLVM_DEBUG(dbgs() << "Prefetching " << ItersAhead
+ << " iterations ahead (loop size: " << LoopSize << ") in "
+ << L->getHeader()->getParent()->getName() << ": " << *L);
+ LLVM_DEBUG(dbgs() << "Loop has: "
+ << NumMemAccesses << " memory accesses, "
+ << NumStridedMemAccesses << " strided memory accesses, "
+ << Prefetches.size() << " potential prefetch(es), "
+ << "a minimum stride of " << TargetMinStride << ", "
+ << (HasCall ? "calls" : "no calls") << ".\n");
+
+ for (auto &P : Prefetches) {
+ // Check if the stride of the accesses is large enough to warrant a
+ // prefetch.
+ if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride))
+ continue;
+
+ const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr(
+ SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead),
+ P.LSCEVAddRec->getStepRecurrence(*SE)));
+ if (!isSafeToExpand(NextLSCEV, *SE))
+ continue;
+
+ BasicBlock *BB = P.InsertPt->getParent();
+ Type *I8Ptr = Type::getInt8PtrTy(BB->getContext(), 0/*PtrAddrSpace*/);
+ SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr");
+ Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
+
+ IRBuilder<> Builder(P.InsertPt);
+ Module *M = BB->getParent()->getParent();
+ Type *I32 = Type::getInt32Ty(BB->getContext());
+ Function *PrefetchFunc = Intrinsic::getDeclaration(
+ M, Intrinsic::prefetch, PrefPtrValue->getType());
+ Builder.CreateCall(
+ PrefetchFunc,
+ {PrefPtrValue,
+ ConstantInt::get(I32, P.Writes),
+ ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+ ++NumPrefetches;
+ LLVM_DEBUG(dbgs() << " Access: "
+ << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
+ << ", SCEV: " << *P.LSCEVAddRec << "\n");
+ ORE->emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Prefetched", P.MemI)
+ << "prefetched memory access";
});
- MadeChange = true;
- }
+ MadeChange = true;
}
return MadeChange;
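
The cache-line de-duplication above only works when ScalarEvolution can fold the difference of two affine address recurrences down to a constant. A minimal sketch of that test, assuming the standard ScalarEvolution/SCEVConstant API and a cache line size obtained from TTI (the helper name is invented for illustration):

  #include "llvm/Analysis/ScalarEvolution.h"
  #include "llvm/Analysis/ScalarEvolutionExpressions.h"
  #include <cstdlib>
  using namespace llvm;

  // Two strided accesses whose address difference is a known constant smaller
  // than one cache line should share a single prefetch.
  static bool withinOneCacheLine(ScalarEvolution &SE, const SCEVAddRecExpr *A,
                                 const SCEVAddRecExpr *B, unsigned LineSize) {
    const SCEV *Diff = SE.getMinusSCEV(A, B);
    if (const auto *C = dyn_cast<SCEVConstant>(Diff))
      return std::abs(C->getValue()->getSExtValue()) < (int64_t)LineSize;
    return false; // Unknown distance: keep both as prefetch candidates.
  }

When the difference is not a compile-time constant, the accesses are conservatively treated as independent, which matches how the loop over Prefetches above only folds candidates with a known small constant distance.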
diff --git a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
index 2451572d6171..be209d34be42 100644
--- a/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDeletion.cpp
@@ -18,6 +18,8 @@
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
@@ -134,7 +136,9 @@ static bool isLoopNeverExecuted(Loop *L) {
/// is unable to delete it due to hoisting trivially loop invariant
/// instructions out of the loop.
static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
- ScalarEvolution &SE, LoopInfo &LI) {
+ ScalarEvolution &SE, LoopInfo &LI,
+ MemorySSA *MSSA,
+ OptimizationRemarkEmitter &ORE) {
assert(L->isLCSSAForm(DT) && "Expected LCSSA!");
// We can only remove the loop if there is a preheader that we can branch from
@@ -164,7 +168,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
std::fill(P.incoming_values().begin(), P.incoming_values().end(),
UndefValue::get(P.getType()));
}
- deleteDeadLoop(L, &DT, &SE, &LI);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "NeverExecutes", L->getStartLoc(),
+ L->getHeader())
+ << "Loop deleted because it never executes";
+ });
+ deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
++NumDeleted;
return LoopDeletionResult::Deleted;
}
@@ -200,7 +209,12 @@ static LoopDeletionResult deleteLoopIfDead(Loop *L, DominatorTree &DT,
}
LLVM_DEBUG(dbgs() << "Loop is invariant, delete it!");
- deleteDeadLoop(L, &DT, &SE, &LI);
+ ORE.emit([&]() {
+ return OptimizationRemark(DEBUG_TYPE, "Invariant", L->getStartLoc(),
+ L->getHeader())
+ << "Loop deleted because it is invariant";
+ });
+ deleteDeadLoop(L, &DT, &SE, &LI, MSSA);
++NumDeleted;
return LoopDeletionResult::Deleted;
@@ -212,15 +226,22 @@ PreservedAnalyses LoopDeletionPass::run(Loop &L, LoopAnalysisManager &AM,
LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
LLVM_DEBUG(L.dump());
- std::string LoopName = L.getName();
- auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI);
+ std::string LoopName = std::string(L.getName());
+ // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
+ auto Result = deleteLoopIfDead(&L, AR.DT, AR.SE, AR.LI, AR.MSSA, ORE);
if (Result == LoopDeletionResult::Unmodified)
return PreservedAnalyses::all();
if (Result == LoopDeletionResult::Deleted)
Updater.markLoopAsDeleted(L, LoopName);
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
namespace {
@@ -235,6 +256,7 @@ public:
bool runOnLoop(Loop *L, LPPassManager &) override;
void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
@@ -255,11 +277,19 @@ bool LoopDeletionLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
DominatorTree &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
ScalarEvolution &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
LoopInfo &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+ // For the old PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
LLVM_DEBUG(dbgs() << "Analyzing Loop for deletion: ");
LLVM_DEBUG(L->dump());
- LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI);
+ LoopDeletionResult Result = deleteLoopIfDead(L, DT, SE, LI, MSSA, ORE);
if (Result == LoopDeletionResult::Deleted)
LPM.markLoopAsDeleted(*L);
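
Both remarks above are handed to ORE.emit() as a lambda rather than as a ready-built object. That is the usual OptimizationRemarkEmitter pattern: the callable is only invoked when remarks are actually requested for the function, so the common case pays no string-formatting cost. A sketch of the idiom in the same scope as the calls above (the remark name "SomeRemark" is purely illustrative):

  // Constructed and streamed only when -Rpass or a remarks file asks for it.
  ORE.emit([&]() {
    return OptimizationRemark(DEBUG_TYPE, "SomeRemark", L->getStartLoc(),
                              L->getHeader())
           << "describe what the transform did";
  });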
diff --git a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
index 8e04e6e0ffe8..7867a5468891 100644
--- a/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDistribute.cpp
@@ -789,12 +789,6 @@ public:
// instructions to partitions.
Partitions.setupPartitionIdOnInstructions();
- // To keep things simple have an empty preheader before we version or clone
- // the loop. (Also split if this has no predecessor, i.e. entry, because we
- // rely on PH having a predecessor.)
- if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
- SplitBlock(PH, PH->getTerminator(), DT, LI);
-
// If we need run-time checks, version the loop now.
auto PtrToPartition = Partitions.computePartitionSetForPointers(*LAI);
const auto *RtPtrChecking = LAI->getRuntimePointerChecking();
@@ -807,6 +801,12 @@ public:
"may not insert runtime check with convergent operation");
}
+ // To keep things simple have an empty preheader before we version or clone
+ // the loop. (Also split if this has no predecessor, i.e. entry, because we
+ // rely on PH having a predecessor.)
+ if (!PH->getSinglePredecessor() || &*PH->begin() != PH->getTerminator())
+ SplitBlock(PH, PH->getTerminator(), DT, LI);
+
if (!Pred.isAlwaysTrue() || !Checks.empty()) {
assert(!LAI->hasConvergentOp() && "inserting illegal loop versioning");
@@ -903,15 +903,14 @@ private:
/// \p PtrToPartition contains the partition number for pointers. Partition
/// number -1 means that the pointer is used in multiple partitions. In this
/// case we can't safely omit the check.
- SmallVector<RuntimePointerChecking::PointerCheck, 4>
- includeOnlyCrossPartitionChecks(
- const SmallVectorImpl<RuntimePointerChecking::PointerCheck> &AllChecks,
+ SmallVector<RuntimePointerCheck, 4> includeOnlyCrossPartitionChecks(
+ const SmallVectorImpl<RuntimePointerCheck> &AllChecks,
const SmallVectorImpl<int> &PtrToPartition,
const RuntimePointerChecking *RtPtrChecking) {
- SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+ SmallVector<RuntimePointerCheck, 4> Checks;
copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
+ [&](const RuntimePointerCheck &Check) {
for (unsigned PtrIdx1 : Check.first->Members)
for (unsigned PtrIdx2 : Check.second->Members)
// Only include this check if there is a pair of pointers
diff --git a/llvm/lib/Transforms/Scalar/LoopFuse.cpp b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
index e1738f08eb23..20edc8699d79 100644
--- a/llvm/lib/Transforms/Scalar/LoopFuse.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFuse.cpp
@@ -86,11 +86,15 @@ STATISTIC(UnknownTripCount, "Loop has unknown trip count");
STATISTIC(UncomputableTripCount, "SCEV cannot compute trip count of loop");
STATISTIC(NonEqualTripCount, "Loop trip counts are not the same");
STATISTIC(NonAdjacent, "Loops are not adjacent");
-STATISTIC(NonEmptyPreheader, "Loop has a non-empty preheader");
+STATISTIC(
+ NonEmptyPreheader,
+ "Loop has a non-empty preheader with instructions that cannot be moved");
STATISTIC(FusionNotBeneficial, "Fusion is not beneficial");
STATISTIC(NonIdenticalGuards, "Candidates have different guards");
-STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block");
-STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block");
+STATISTIC(NonEmptyExitBlock, "Candidate has a non-empty exit block with "
+ "instructions that cannot be moved");
+STATISTIC(NonEmptyGuardBlock, "Candidate has a non-empty guard block with "
+ "instructions that cannot be moved");
STATISTIC(NotRotated, "Candidate is not rotated");
enum FusionDependenceAnalysisChoice {
@@ -738,33 +742,40 @@ private:
continue;
}
- // The following three checks look for empty blocks in FC0 and FC1. If
- // any of these blocks are non-empty, we do not fuse. This is done
- // because we currently do not have the safety checks to determine if
- // it is safe to move the blocks past other blocks in the loop. Once
- // these checks are added, these conditions can be relaxed.
- if (!isEmptyPreheader(*FC1)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty "
- "preheader. Not fusing.\n");
+ if (!isSafeToMoveBefore(*FC1->Preheader,
+ *FC0->Preheader->getTerminator(), DT, &PDT,
+ &DI)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+ "instructions in preheader. Not fusing.\n");
reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
NonEmptyPreheader);
continue;
}
- if (FC0->GuardBranch && !isEmptyExitBlock(*FC0)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty exit "
- "block. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyExitBlock);
- continue;
- }
+ if (FC0->GuardBranch) {
+ assert(FC1->GuardBranch && "Expecting valid FC1 guard branch");
+
+ if (!isSafeToMoveBefore(*FC0->ExitBlock,
+ *FC1->ExitBlock->getFirstNonPHIOrDbg(), DT,
+ &PDT, &DI)) {
+ LLVM_DEBUG(dbgs() << "Fusion candidate contains unsafe "
+ "instructions in exit block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyExitBlock);
+ continue;
+ }
- if (FC1->GuardBranch && !isEmptyGuardBlock(*FC1)) {
- LLVM_DEBUG(dbgs() << "Fusion candidate does not have empty guard "
- "block. Not fusing.\n");
- reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
- NonEmptyGuardBlock);
- continue;
+ if (!isSafeToMoveBefore(
+ *FC1->GuardBranch->getParent(),
+ *FC0->GuardBranch->getParent()->getTerminator(), DT, &PDT,
+ &DI)) {
+ LLVM_DEBUG(dbgs()
+ << "Fusion candidate contains unsafe "
+ "instructions in guard block. Not fusing.\n");
+ reportLoopFusion<OptimizationRemarkMissed>(*FC0, *FC1,
+ NonEmptyGuardBlock);
+ continue;
+ }
}
// Check the dependencies across the loops and do not fuse if it would
@@ -1075,38 +1086,6 @@ private:
return (FC1.GuardBranch->getSuccessor(1) == FC1.Preheader);
}
- /// Check that the guard for \p FC *only* contains the cmp/branch for the
- /// guard.
- /// Once we are able to handle intervening code, any code in the guard block
- /// for FC1 will need to be treated as intervening code and checked whether
- /// it can safely move around the loops.
- bool isEmptyGuardBlock(const FusionCandidate &FC) const {
- assert(FC.GuardBranch && "Expecting a fusion candidate with guard branch.");
- if (auto *CmpInst = dyn_cast<Instruction>(FC.GuardBranch->getCondition())) {
- auto *GuardBlock = FC.GuardBranch->getParent();
- // If the generation of the cmp value is in GuardBlock, then the size of
- // the guard block should be 2 (cmp + branch). If the generation of the
- // cmp value is in a different block, then the size of the guard block
- // should only be 1.
- if (CmpInst->getParent() == GuardBlock)
- return GuardBlock->size() == 2;
- else
- return GuardBlock->size() == 1;
- }
-
- return false;
- }
-
- bool isEmptyPreheader(const FusionCandidate &FC) const {
- assert(FC.Preheader && "Expecting a valid preheader");
- return FC.Preheader->size() == 1;
- }
-
- bool isEmptyExitBlock(const FusionCandidate &FC) const {
- assert(FC.ExitBlock && "Expecting a valid exit block");
- return FC.ExitBlock->size() == 1;
- }
-
/// Simplify the condition of the latch branch of \p FC to true, when both of
/// its successors are the same.
void simplifyLatchBranch(const FusionCandidate &FC) const {
@@ -1123,7 +1102,7 @@ private:
/// Move instructions from FC0.Latch to FC1.Latch. If FC0.Latch has a unique
/// successor, then merge FC0.Latch with its unique successor.
void mergeLatch(const FusionCandidate &FC0, const FusionCandidate &FC1) {
- moveInstsBottomUp(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
+ moveInstructionsToTheBeginning(*FC0.Latch, *FC1.Latch, DT, PDT, DI);
if (BasicBlock *Succ = FC0.Latch->getUniqueSuccessor()) {
MergeBlockIntoPredecessor(Succ, &DTU, &LI);
DTU.flush();
@@ -1166,6 +1145,10 @@ private:
LLVM_DEBUG(dbgs() << "Fusion Candidate 0: \n"; FC0.dump();
dbgs() << "Fusion Candidate 1: \n"; FC1.dump(););
+ // Move instructions from the preheader of FC1 to the end of the preheader
+ // of FC0.
+ moveInstructionsToTheEnd(*FC1.Preheader, *FC0.Preheader, DT, PDT, DI);
+
// Fusing guarded loops is handled slightly differently than non-guarded
// loops and has been broken out into a separate method instead of trying to
// intersperse the logic within a single method.
@@ -1382,6 +1365,14 @@ private:
BasicBlock *FC0NonLoopBlock = FC0.getNonLoopBlock();
BasicBlock *FC1NonLoopBlock = FC1.getNonLoopBlock();
+ // Move instructions from the exit block of FC0 to the beginning of the exit
+ // block of FC1.
+ moveInstructionsToTheBeginning(*FC0.ExitBlock, *FC1.ExitBlock, DT, PDT, DI);
+
+ // Move instructions from the guard block of FC1 to the end of the guard
+ // block of FC0.
+ moveInstructionsToTheEnd(*FC1GuardBlock, *FC0GuardBlock, DT, PDT, DI);
+
assert(FC0NonLoopBlock == FC1GuardBlock && "Loops are not adjacent");
SmallVector<DominatorTree::UpdateType, 8> TreeUpdates;
@@ -1394,6 +1385,7 @@ private:
// Thus, one path from the guard goes to the preheader for FC0 (and thus
// executes the new fused loop) and the other path goes to the NonLoopBlock
// for FC1 (where FC1 guard would have gone if FC1 was not executed).
+ FC1NonLoopBlock->replacePhiUsesWith(FC1GuardBlock, FC0GuardBlock);
FC0.GuardBranch->replaceUsesOfWith(FC0NonLoopBlock, FC1NonLoopBlock);
FC0.ExitBlock->getTerminator()->replaceUsesOfWith(FC1GuardBlock,
FC1.Header);
@@ -1545,7 +1537,10 @@ private:
// Update DT/PDT
DTU.applyUpdates(TreeUpdates);
+ LI.removeBlock(FC1GuardBlock);
LI.removeBlock(FC1.Preheader);
+ LI.removeBlock(FC0.ExitBlock);
+ DTU.deleteBB(FC1GuardBlock);
DTU.deleteBB(FC1.Preheader);
DTU.deleteBB(FC0.ExitBlock);
DTU.flush();
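
The three relaxed checks above all reduce to the same question: can every instruction of a candidate block be moved to a given insertion point without breaking dependences, and if so, move it there before fusing. A condensed sketch of that shape (the wrapper below is illustrative, not code from the patch), assuming isSafeToMoveBefore and moveInstructionsToTheEnd from Transforms/Utils/CodeMoverUtils.h as used in the hunks above:

  #include "llvm/Analysis/DependenceAnalysis.h"
  #include "llvm/Analysis/PostDominators.h"
  #include "llvm/IR/Dominators.h"
  #include "llvm/Transforms/Utils/CodeMoverUtils.h"
  using namespace llvm;

  // Relaxed emptiness check: the block only has to be safely movable; its
  // contents are then relocated to the end of ToBB (before its terminator).
  static bool hoistBlockContentsIfSafe(BasicBlock &FromBB, BasicBlock &ToBB,
                                       DominatorTree &DT,
                                       PostDominatorTree &PDT,
                                       DependenceInfo &DI) {
    if (!isSafeToMoveBefore(FromBB, *ToBB.getTerminator(), DT, &PDT, &DI))
      return false;
    moveInstructionsToTheEnd(FromBB, ToBB, DT, PDT, DI);
    return true;
  }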
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index b77843d7cd71..3cb4df12e9b0 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -51,9 +51,11 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemoryLocation.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -91,6 +93,7 @@
#include "llvm/Transforms/Utils/BuildLibCalls.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstdint>
@@ -123,15 +126,19 @@ class LoopIdiomRecognize {
const DataLayout *DL;
OptimizationRemarkEmitter &ORE;
bool ApplyCodeSizeHeuristics;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
public:
explicit LoopIdiomRecognize(AliasAnalysis *AA, DominatorTree *DT,
LoopInfo *LI, ScalarEvolution *SE,
TargetLibraryInfo *TLI,
- const TargetTransformInfo *TTI,
+ const TargetTransformInfo *TTI, MemorySSA *MSSA,
const DataLayout *DL,
OptimizationRemarkEmitter &ORE)
- : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {}
+ : AA(AA), DT(DT), LI(LI), SE(SE), TLI(TLI), TTI(TTI), DL(DL), ORE(ORE) {
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+ }
bool runOnLoop(Loop *L);
@@ -224,13 +231,17 @@ public:
&getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
*L->getHeader()->getParent());
const DataLayout *DL = &L->getHeader()->getModule()->getDataLayout();
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
// For the old PM, we can't use OptimizationRemarkEmitter as an analysis
// pass. Function analyses need to be preserved across loop transformations
// but ORE cannot be preserved (see comment before the pass definition).
OptimizationRemarkEmitter ORE(L->getHeader()->getParent());
- LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, DL, ORE);
+ LoopIdiomRecognize LIR(AA, DT, LI, SE, TLI, TTI, MSSA, DL, ORE);
return LIR.runOnLoop(L);
}
@@ -239,6 +250,7 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
getLoopAnalysisUsage(AU);
}
};
@@ -252,23 +264,20 @@ PreservedAnalyses LoopIdiomRecognizePass::run(Loop &L, LoopAnalysisManager &AM,
LPMUpdater &) {
const auto *DL = &L.getHeader()->getModule()->getDataLayout();
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
- Function *F = L.getHeader()->getParent();
-
- auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
- // FIXME: This should probably be optional rather than required.
- if (!ORE)
- report_fatal_error(
- "LoopIdiomRecognizePass: OptimizationRemarkEmitterAnalysis not cached "
- "at a higher level");
+ // For the new PM, we also can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
- LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI, DL,
- *ORE);
+ LoopIdiomRecognize LIR(&AR.AA, &AR.DT, &AR.LI, &AR.SE, &AR.TLI, &AR.TTI,
+ AR.MSSA, DL, ORE);
if (!LIR.runOnLoop(&L))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
INITIALIZE_PASS_BEGIN(LoopIdiomRecognizeLegacyPass, "loop-idiom",
@@ -339,14 +348,14 @@ bool LoopIdiomRecognize::runOnCountableLoop() {
<< "] Countable Loop %" << CurLoop->getHeader()->getName()
<< "\n");
- bool MadeChange = false;
-
// The following transforms hoist stores/memsets into the loop pre-header.
- // Give up if the loop has instructions may throw.
+ // Give up if the loop has instructions that may throw.
SimpleLoopSafetyInfo SafetyInfo;
SafetyInfo.computeLoopSafetyInfo(CurLoop);
if (SafetyInfo.anyBlockMayThrow())
- return MadeChange;
+ return false;
+
+ bool MadeChange = false;
// Scan all the blocks in the loop that are not in subloops.
for (auto *BB : CurLoop->getBlocks()) {
@@ -968,11 +977,17 @@ bool LoopIdiomRecognize::processLoopStridedStore(
Value *PatternPtr = ConstantExpr::getBitCast(GV, Int8PtrTy);
NewCall = Builder.CreateCall(MSP, {BasePtr, PatternPtr, NumBytes});
}
+ NewCall->setDebugLoc(TheStore->getDebugLoc());
+
+ if (MSSAU) {
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ }
LLVM_DEBUG(dbgs() << " Formed memset: " << *NewCall << "\n"
<< " from store to: " << *Ev << " at: " << *TheStore
<< "\n");
- NewCall->setDebugLoc(TheStore->getDebugLoc());
ORE.emit([&]() {
return OptimizationRemark(DEBUG_TYPE, "ProcessLoopStridedStore",
@@ -984,12 +999,40 @@ bool LoopIdiomRecognize::processLoopStridedStore(
// Okay, the memset has been formed. Zap the original store and anything that
// feeds into it.
- for (auto *I : Stores)
+ for (auto *I : Stores) {
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(I, true);
deleteDeadInstruction(I);
+ }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemSet;
return true;
}
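
With MemorySSA available, every IR change made here has to be mirrored in the memory SSA form: the newly created memset/memcpy call needs a MemoryDef, and each store it replaces must have its access removed before the instruction is deleted. A condensed sketch of that bookkeeping, using the MemorySSAUpdater calls that appear in this file's hunks (the helper name is invented here):

  #include "llvm/Analysis/MemorySSA.h"
  #include "llvm/Analysis/MemorySSAUpdater.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Register a freshly created memory-writing call and retire the instruction
  // it replaces, keeping MemorySSA consistent with the rewritten IR.
  static void swapMemoryAccess(MemorySSAUpdater &MSSAU, Instruction *NewI,
                               Instruction *OldI) {
    MemoryAccess *NewMA = MSSAU.createMemoryAccessInBB(
        NewI, /*Definition=*/nullptr, NewI->getParent(),
        MemorySSA::BeforeTerminator);
    MSSAU.insertDef(cast<MemoryDef>(NewMA), /*RenameUses=*/true);
    MSSAU.removeMemoryAccess(OldI, /*OptimizePhis=*/true);
  }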
+class ExpandedValuesCleaner {
+ SCEVExpander &Expander;
+ TargetLibraryInfo *TLI;
+ SmallVector<Value *, 4> ExpandedValues;
+ bool Commit = false;
+
+public:
+ ExpandedValuesCleaner(SCEVExpander &Expander, TargetLibraryInfo *TLI)
+ : Expander(Expander), TLI(TLI) {}
+
+ void add(Value *V) { ExpandedValues.push_back(V); }
+
+ void commit() { Commit = true; }
+
+ ~ExpandedValuesCleaner() {
+ if (!Commit) {
+ Expander.clear();
+ for (auto *V : ExpandedValues)
+ RecursivelyDeleteTriviallyDeadInstructions(V, TLI);
+ }
+ }
+};
+
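
ExpandedValuesCleaner is a commit/rollback guard: unless commit() is called before it goes out of scope, the destructor clears the SCEVExpander and deletes whatever the expansion created, which is why the early return-false paths below no longer need hand-written cleanup. The same idea reduced to a library-free sketch (class name invented here):

  #include <functional>
  #include <utility>

  // Runs the undo action on scope exit unless the caller commits first.
  class ScopedRollback {
    std::function<void()> Undo;
    bool Committed = false;

  public:
    explicit ScopedRollback(std::function<void()> U) : Undo(std::move(U)) {}
    void commit() { Committed = true; }
    ~ScopedRollback() {
      if (!Committed)
        Undo();
    }
  };

Declaring such a guard right after the speculative work starts makes every early exit clean up automatically, while the single success path just calls commit().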
/// If the stored value is a strided load in the same loop with the same stride
/// this may be transformable into a memcpy. This kicks in for stuff like
/// for (i) A[i] = B[i];
@@ -1020,6 +1063,8 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
IRBuilder<> Builder(Preheader->getTerminator());
SCEVExpander Expander(*SE, *DL, "loop-idiom");
+ ExpandedValuesCleaner EVC(Expander, TLI);
+
const SCEV *StrStart = StoreEv->getStart();
unsigned StrAS = SI->getPointerAddressSpace();
Type *IntIdxTy = Builder.getIntNTy(DL->getIndexSizeInBits(StrAS));
@@ -1036,16 +1081,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// checking everything.
Value *StoreBasePtr = Expander.expandCodeFor(
StrStart, Builder.getInt8PtrTy(StrAS), Preheader->getTerminator());
+ EVC.add(StoreBasePtr);
SmallPtrSet<Instruction *, 1> Stores;
Stores.insert(SI);
if (mayLoopAccessLocation(StoreBasePtr, ModRefInfo::ModRef, CurLoop, BECount,
- StoreSize, *AA, Stores)) {
- Expander.clear();
- // If we generated new code for the base pointer, clean up.
- RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ StoreSize, *AA, Stores))
return false;
- }
const SCEV *LdStart = LoadEv->getStart();
unsigned LdAS = LI->getPointerAddressSpace();
@@ -1058,15 +1100,11 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// mutated by the loop.
Value *LoadBasePtr = Expander.expandCodeFor(
LdStart, Builder.getInt8PtrTy(LdAS), Preheader->getTerminator());
+ EVC.add(LoadBasePtr);
if (mayLoopAccessLocation(LoadBasePtr, ModRefInfo::Mod, CurLoop, BECount,
- StoreSize, *AA, Stores)) {
- Expander.clear();
- // If we generated new code for the base pointer, clean up.
- RecursivelyDeleteTriviallyDeadInstructions(LoadBasePtr, TLI);
- RecursivelyDeleteTriviallyDeadInstructions(StoreBasePtr, TLI);
+ StoreSize, *AA, Stores))
return false;
- }
if (avoidLIRForMultiBlockLoop())
return false;
@@ -1078,6 +1116,7 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
Value *NumBytes =
Expander.expandCodeFor(NumBytesS, IntIdxTy, Preheader->getTerminator());
+ EVC.add(NumBytes);
CallInst *NewCall = nullptr;
// Check whether to generate an unordered atomic memcpy:
@@ -1089,8 +1128,9 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
else {
// We cannot allow unaligned ops for unordered load/store, so reject
// anything where the alignment isn't at least the element size.
- unsigned Align = std::min(SI->getAlignment(), LI->getAlignment());
- if (Align < StoreSize)
+ const Align StoreAlign = SI->getAlign();
+ const Align LoadAlign = LI->getAlign();
+ if (StoreAlign < StoreSize || LoadAlign < StoreSize)
return false;
// If the element.atomic memcpy is not lowered into explicit
@@ -1104,11 +1144,17 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// Note that unordered atomic loads/stores are *required* by the spec to
// have an alignment but non-atomic loads/stores may not.
NewCall = Builder.CreateElementUnorderedAtomicMemCpy(
- StoreBasePtr, SI->getAlignment(), LoadBasePtr, LI->getAlignment(),
- NumBytes, StoreSize);
+ StoreBasePtr, StoreAlign, LoadBasePtr, LoadAlign, NumBytes,
+ StoreSize);
}
NewCall->setDebugLoc(SI->getDebugLoc());
+ if (MSSAU) {
+ MemoryAccess *NewMemAcc = MSSAU->createMemoryAccessInBB(
+ NewCall, nullptr, NewCall->getParent(), MemorySSA::BeforeTerminator);
+ MSSAU->insertDef(cast<MemoryDef>(NewMemAcc), true);
+ }
+
LLVM_DEBUG(dbgs() << " Formed memcpy: " << *NewCall << "\n"
<< " from load ptr=" << *LoadEv << " at: " << *LI << "\n"
<< " from store ptr=" << *StoreEv << " at: " << *SI
@@ -1124,8 +1170,13 @@ bool LoopIdiomRecognize::processLoopStoreOfLoopLoad(StoreInst *SI,
// Okay, the memcpy has been formed. Zap the original store and anything that
// feeds into it.
+ if (MSSAU)
+ MSSAU->removeMemoryAccess(SI, true);
deleteDeadInstruction(SI);
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
++NumMemCpy;
+ EVC.commit();
return true;
}
@@ -1502,18 +1553,20 @@ bool LoopIdiomRecognize::recognizeAndInsertFFS() {
// %inc = add nsw %i.0, 1
// br i1 %tobool
- const Value *Args[] =
- {InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
- : ConstantInt::getFalse(InitX->getContext())};
+ const Value *Args[] = {
+ InitX, ZeroCheck ? ConstantInt::getTrue(InitX->getContext())
+ : ConstantInt::getFalse(InitX->getContext())};
// @llvm.dbg doesn't count as they have no semantic effect.
auto InstWithoutDebugIt = CurLoop->getHeader()->instructionsWithoutDebug();
uint32_t HeaderSize =
std::distance(InstWithoutDebugIt.begin(), InstWithoutDebugIt.end());
+ IntrinsicCostAttributes Attrs(IntrinID, InitX->getType(), Args);
+ int Cost =
+ TTI->getIntrinsicInstrCost(Attrs, TargetTransformInfo::TCK_SizeAndLatency);
if (HeaderSize != IdiomCanonicalSize &&
- TTI->getIntrinsicCost(IntrinID, InitX->getType(), Args) >
- TargetTransformInfo::TCC_Basic)
+ Cost > TargetTransformInfo::TCC_Basic)
return false;
transformLoopToCountable(IntrinID, PH, CntInst, CntPhi, InitX, DefX,
diff --git a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
index 901204181a7c..3153a8721193 100644
--- a/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInstSimplify.cpp
@@ -68,7 +68,7 @@ static bool simplifyLoopInst(Loop &L, DominatorTree &DT, LoopInfo &LI,
// While simplifying we may discover dead code or cause code to become dead.
// Keep track of all such instructions and we will delete them at the end.
- SmallVector<Instruction *, 8> DeadInsts;
+ SmallVector<WeakTrackingVH, 8> DeadInsts;
// First we want to create an RPO traversal of the loop body. By processing in
// RPO we can ensure that definitions are processed prior to uses (for non PHI
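
The element type change matters because simplification can erase or replace an instruction that is already queued for deletion; a WeakTrackingVH then reads back as null (or follows the replacement) instead of dangling. Draining such a worklist typically looks like the sketch below (compare the helper deleted from LoopStrengthReduce later in this patch; the function name is illustrative):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/Instruction.h"
  #include "llvm/IR/ValueHandle.h"
  #include "llvm/Transforms/Utils/Local.h"
  using namespace llvm;

  static void drainDeadInsts(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
    while (!DeadInsts.empty()) {
      Value *V = DeadInsts.pop_back_val();
      // Erased instructions come back null; RAUW'd ones track their
      // replacement, which may no longer be an Instruction we should touch.
      auto *I = dyn_cast_or_null<Instruction>(V);
      if (I && isInstructionTriviallyDead(I))
        I->eraseFromParent();
    }
  }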
diff --git a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
index 6ce2d06058cf..7787c0bccd4c 100644
--- a/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopInterchange.cpp
@@ -412,7 +412,6 @@ public:
private:
bool adjustLoopLinks();
- void adjustLoopPreheaders();
bool adjustLoopBranches();
Loop *OuterLoop;
@@ -580,6 +579,12 @@ struct LoopInterchange : public LoopPass {
LIT.transform();
LLVM_DEBUG(dbgs() << "Loops interchanged.\n");
LoopsInterchanged++;
+
+ assert(InnerLoop->isLCSSAForm(*DT) &&
+ "Inner loop not left in LCSSA form after loop interchange!");
+ assert(OuterLoop->isLCSSAForm(*DT) &&
+ "Outer loop not left in LCSSA form after loop interchange!");
+
return true;
}
};
@@ -689,7 +694,7 @@ bool LoopInterchangeLegality::findInductionAndReductions(
// PHIs in inner loops need to be part of a reduction in the outer loop,
// discovered when checking the PHIs of the outer loop earlier.
if (!InnerLoop) {
- if (OuterInnerReductions.find(&PHI) == OuterInnerReductions.end()) {
+ if (!OuterInnerReductions.count(&PHI)) {
LLVM_DEBUG(dbgs() << "Inner loop PHI is not part of reductions "
"across the outer loop.\n");
return false;
@@ -903,8 +908,8 @@ areInnerLoopExitPHIsSupported(Loop *InnerL, Loop *OuterL,
return false;
if (any_of(PHI.users(), [&Reductions, OuterL](User *U) {
PHINode *PN = dyn_cast<PHINode>(U);
- return !PN || (Reductions.find(PN) == Reductions.end() &&
- OuterL->contains(PN->getParent()));
+ return !PN ||
+ (!Reductions.count(PN) && OuterL->contains(PN->getParent()));
})) {
return false;
}
@@ -1319,6 +1324,23 @@ static void moveBBContents(BasicBlock *FromBB, Instruction *InsertBefore) {
FromBB->getTerminator()->getIterator());
}
+/// Swap instructions between \p BB1 and \p BB2 but keep terminators intact.
+static void swapBBContents(BasicBlock *BB1, BasicBlock *BB2) {
+ // Save all non-terminator instructions of BB1 into TempInstrs and unlink them
+ // from BB1 afterwards.
+ auto Iter = map_range(*BB1, [](Instruction &I) { return &I; });
+ SmallVector<Instruction *, 4> TempInstrs(Iter.begin(), std::prev(Iter.end()));
+ for (Instruction *I : TempInstrs)
+ I->removeFromParent();
+
+ // Move instructions from BB2 to BB1.
+ moveBBContents(BB2, BB1->getTerminator());
+
+ // Move instructions from TempInstrs to BB2.
+ for (Instruction *I : TempInstrs)
+ I->insertBefore(BB2->getTerminator());
+}
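
The map_range expression in swapBBContents is just a terse way to collect pointers to every instruction of BB1 except its terminator before unlinking them. An equivalent, more explicit form of that first step (helper name invented here):

  #include "llvm/ADT/SmallVector.h"
  #include "llvm/IR/BasicBlock.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // Collect every instruction of BB except its terminator, in block order.
  static SmallVector<Instruction *, 4> nonTerminatorInsts(BasicBlock &BB) {
    SmallVector<Instruction *, 4> Insts;
    for (Instruction &I : BB)
      if (!I.isTerminator())
        Insts.push_back(&I);
    return Insts;
  }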
+
// Update BI to jump to NewBB instead of OldBB. Records updates to the
// dominator tree in DTUpdates. If \p MustUpdateOnce is true, assert that
// \p OldBB is exactly once in BI's successor list.
@@ -1560,13 +1582,11 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
// outer loop and all that remains to do is updating the incoming blocks.
for (PHINode *PHI : OuterLoopPHIs) {
PHI->moveBefore(InnerLoopHeader->getFirstNonPHI());
- assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() &&
- "Expected a reduction PHI node");
+ assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
}
for (PHINode *PHI : InnerLoopPHIs) {
PHI->moveBefore(OuterLoopHeader->getFirstNonPHI());
- assert(OuterInnerReductions.find(PHI) != OuterInnerReductions.end() &&
- "Expected a reduction PHI node");
+ assert(OuterInnerReductions.count(PHI) && "Expected a reduction PHI node");
}
// Update the incoming blocks for moved PHI nodes.
@@ -1578,30 +1598,17 @@ bool LoopInterchangeTransform::adjustLoopBranches() {
return true;
}
-void LoopInterchangeTransform::adjustLoopPreheaders() {
- // We have interchanged the preheaders so we need to interchange the data in
- // the preheader as well.
- // This is because the content of inner preheader was previously executed
- // inside the outer loop.
- BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
- BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
- BasicBlock *OuterLoopHeader = OuterLoop->getHeader();
- BranchInst *InnerTermBI =
- cast<BranchInst>(InnerLoopPreHeader->getTerminator());
-
- // These instructions should now be executed inside the loop.
- // Move instruction into a new block after outer header.
- moveBBContents(InnerLoopPreHeader, OuterLoopHeader->getTerminator());
- // These instructions were not executed previously in the loop so move them to
- // the older inner loop preheader.
- moveBBContents(OuterLoopPreHeader, InnerTermBI);
-}
-
bool LoopInterchangeTransform::adjustLoopLinks() {
// Adjust all branches in the inner and outer loop.
bool Changed = adjustLoopBranches();
- if (Changed)
- adjustLoopPreheaders();
+ if (Changed) {
+ // We have interchanged the preheaders so we need to interchange the data in
+ // the preheaders as well. This is because the content of the inner
+ // preheader was previously executed inside the outer loop.
+ BasicBlock *OuterLoopPreHeader = OuterLoop->getLoopPreheader();
+ BasicBlock *InnerLoopPreHeader = InnerLoop->getLoopPreheader();
+ swapBBContents(OuterLoopPreHeader, InnerLoopPreHeader);
+ }
return Changed;
}
diff --git a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
index 4e1b4e87ebc9..4412b3079461 100644
--- a/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopLoadElimination.cpp
@@ -38,7 +38,6 @@
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -58,6 +57,7 @@
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include <algorithm>
#include <cassert>
@@ -377,7 +377,7 @@ public:
/// Determine the pointer alias checks to prove that there are no
/// intervening stores.
- SmallVector<RuntimePointerChecking::PointerCheck, 4> collectMemchecks(
+ SmallVector<RuntimePointerCheck, 4> collectMemchecks(
const SmallVectorImpl<StoreToLoadForwardingCandidate> &Candidates) {
SmallPtrSet<Value *, 4> PtrsWrittenOnFwdingPath =
@@ -391,10 +391,10 @@ public:
std::mem_fn(&StoreToLoadForwardingCandidate::getLoadPtr));
const auto &AllChecks = LAI.getRuntimePointerChecking()->getChecks();
- SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks;
+ SmallVector<RuntimePointerCheck, 4> Checks;
copy_if(AllChecks, std::back_inserter(Checks),
- [&](const RuntimePointerChecking::PointerCheck &Check) {
+ [&](const RuntimePointerCheck &Check) {
for (auto PtrIdx1 : Check.first->Members)
for (auto PtrIdx2 : Check.second->Members)
if (needsChecking(PtrIdx1, PtrIdx2, PtrsWrittenOnFwdingPath,
@@ -432,12 +432,12 @@ public:
Value *Ptr = Cand.Load->getPointerOperand();
auto *PtrSCEV = cast<SCEVAddRecExpr>(PSE.getSCEV(Ptr));
auto *PH = L->getLoopPreheader();
+ assert(PH && "Preheader should exist!");
Value *InitialPtr = SEE.expandCodeFor(PtrSCEV->getStart(), Ptr->getType(),
PH->getTerminator());
Value *Initial = new LoadInst(
Cand.Load->getType(), InitialPtr, "load_initial",
- /* isVolatile */ false, MaybeAlign(Cand.Load->getAlignment()),
- PH->getTerminator());
+ /* isVolatile */ false, Cand.Load->getAlign(), PH->getTerminator());
PHINode *PHI = PHINode::Create(Initial->getType(), 2, "store_forwarded",
&L->getHeader()->front());
@@ -520,8 +520,7 @@ public:
// Check intervening may-alias stores. These need runtime checks for alias
// disambiguation.
- SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks =
- collectMemchecks(Candidates);
+ SmallVector<RuntimePointerCheck, 4> Checks = collectMemchecks(Candidates);
// Too many checks are likely to outweigh the benefits of forwarding.
if (Checks.size() > Candidates.size() * CheckPerElim) {
@@ -535,6 +534,11 @@ public:
return false;
}
+ if (!L->isLoopSimplifyForm()) {
+ LLVM_DEBUG(dbgs() << "Loop is not in loop-simplify form");
+ return false;
+ }
+
if (!Checks.empty() || !LAI.getPSE().getUnionPredicate().isAlwaysTrue()) {
if (LAI.hasConvergentOp()) {
LLVM_DEBUG(dbgs() << "Versioning is needed but not allowed with "
@@ -554,11 +558,6 @@ public:
return false;
}
- if (!L->isLoopSimplifyForm()) {
- LLVM_DEBUG(dbgs() << "Loop is not is loop-simplify form");
- return false;
- }
-
// Point of no-return, start the transformation. First, version the loop
// if necessary.
@@ -697,8 +696,8 @@ PreservedAnalyses LoopLoadEliminationPass::run(Function &F,
auto &TLI = AM.getResult<TargetLibraryAnalysis>(F);
auto &AA = AM.getResult<AAManager>(F);
auto &AC = AM.getResult<AssumptionAnalysis>(F);
- auto &MAM = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
- auto *PSI = MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
+ auto *PSI = MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
MemorySSA *MSSA = EnableMSSALoopDependency
diff --git a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
index f3bfbd3564ab..98889a9df116 100644
--- a/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPassManager.cpp
@@ -6,6 +6,7 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/Support/TimeProfiler.h"
#include "llvm/Transforms/Scalar/LoopPassManager.h"
#include "llvm/Analysis/LoopInfo.h"
@@ -33,15 +34,19 @@ PassManager<Loop, LoopAnalysisManager, LoopStandardAnalysisResults &,
// instrumenting callbacks for the passes later.
PassInstrumentation PI = AM.getResult<PassInstrumentationAnalysis>(L, AR);
for (auto &Pass : Passes) {
- if (DebugLogging)
- dbgs() << "Running pass: " << Pass->name() << " on " << L;
-
// Check the PassInstrumentation's BeforePass callbacks before running the
// pass, skip its execution completely if asked to (callback returns false).
if (!PI.runBeforePass<Loop>(*Pass, L))
continue;
- PreservedAnalyses PassPA = Pass->run(L, AM, AR, U);
+ if (DebugLogging)
+ dbgs() << "Running pass: " << Pass->name() << " on " << L;
+
+ PreservedAnalyses PassPA;
+ {
+ TimeTraceScope TimeScope(Pass->name(), L.getName());
+ PassPA = Pass->run(L, AM, AR, U);
+ }
// do not pass deleted Loop into the instrumentation
if (U.skipCurrentLoop())
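
TimeTraceScope is the RAII helper behind -ftime-trace: it opens a named event when constructed and closes it when destroyed, so wrapping only the run() call attributes the time to that pass on that particular loop. A generic usage sketch (the function and event names are illustrative):

  #include "llvm/ADT/StringRef.h"
  #include "llvm/Support/TimeProfiler.h"
  using namespace llvm;

  void doExpensiveStep(StringRef Detail) {
    // Shows up as one event in the JSON trace written under -ftime-trace.
    TimeTraceScope Scope("doExpensiveStep", Detail);
    // ... the work being measured ...
  }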
diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
index 1a42f6b23443..edde22d6708f 100644
--- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp
@@ -184,7 +184,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
@@ -199,6 +198,7 @@
#include "llvm/Transforms/Utils/GuardUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#define DEBUG_TYPE "loop-predication"
@@ -268,7 +268,7 @@ class LoopPredication {
/// Return an insertion point suitable for inserting a safe to speculate
/// instruction whose only user will be 'User' which has operands 'Ops'. A
/// trivial result would be at the User itself, but we try to return a
- /// loop invariant location if possible.
+ /// loop invariant location if possible.
Instruction *findInsertPt(Instruction *User, ArrayRef<Value*> Ops);
/// Same as above, *except* that this uses the SCEV definition of invariant
/// which is that an expression *can be made* invariant via SCEVExpander.
@@ -278,7 +278,7 @@ class LoopPredication {
/// Return true if the value is known to produce a single fixed value across
/// all iterations on which it executes. Note that this does not imply
- /// speculation safety. That must be established seperately.
+ /// speculation safety. That must be established separately.
bool isLoopInvariantValue(const SCEV* S);
Value *expandCheck(SCEVExpander &Expander, Instruction *Guard,
@@ -342,7 +342,7 @@ public:
};
char LoopPredicationLegacyPass::ID = 0;
-} // end namespace llvm
+} // end namespace
INITIALIZE_PASS_BEGIN(LoopPredicationLegacyPass, "loop-predication",
"Loop predication", false, false)
@@ -358,11 +358,12 @@ Pass *llvm::createLoopPredicationPass() {
PreservedAnalyses LoopPredicationPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &U) {
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
Function *F = L.getHeader()->getParent();
- auto *BPI = FAM.getCachedResult<BranchProbabilityAnalysis>(*F);
- LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, BPI);
+ // For the new PM, we also can't use BranchProbabilityInfo as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but BPI is not preserved, hence a newly built one is needed.
+ BranchProbabilityInfo BPI(*F, AR.LI, &AR.TLI);
+ LoopPredication LP(&AR.AA, &AR.DT, &AR.SE, &AR.LI, &BPI);
if (!LP.runOnLoop(&L))
return PreservedAnalyses::all();
@@ -397,7 +398,7 @@ LoopPredication::parseLoopICmp(ICmpInst *ICI) {
}
Value *LoopPredication::expandCheck(SCEVExpander &Expander,
- Instruction *Guard,
+ Instruction *Guard,
ICmpInst::Predicate Pred, const SCEV *LHS,
const SCEV *RHS) {
Type *Ty = LHS->getType();
@@ -521,7 +522,7 @@ Instruction *LoopPredication::findInsertPt(Instruction *Use,
return Preheader->getTerminator();
}
-bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
+bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
// Handling expressions which produce invariant results, but *haven't* yet
// been removed from the loop serves two important purposes.
// 1) Most importantly, it resolves a pass ordering cycle which would
@@ -534,12 +535,12 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
// much more obviously in the IR. Otherwise, the cost modeling for other
// transforms would end up needing to duplicate all of this logic to model a
// check which becomes predictable based on a modeled peel or unswitch.
- //
+ //
// The cost of doing so in the worst case is an extra fill from the stack in
// the loop to materialize the loop invariant test value instead of checking
// against the original IV which is presumable in a register inside the loop.
// Such cases are presumably rare, and hint at missing opportunities for
- // other passes.
+ // other passes.
if (SE->isLoopInvariant(S, L))
// Note: This the SCEV variant, so the original Value* may be within the
@@ -547,7 +548,7 @@ bool LoopPredication::isLoopInvariantValue(const SCEV* S) {
return true;
// Handle a particular important case which SCEV doesn't yet know about which
- // shows up in range checks on arrays with immutable lengths.
+ // shows up in range checks on arrays with immutable lengths.
// TODO: This should be sunk inside SCEV.
if (const SCEVUnknown *U = dyn_cast<SCEVUnknown>(S))
if (const auto *LI = dyn_cast<LoadInst>(U->getValue()))
@@ -574,7 +575,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
const SCEV *LatchLimit = LatchCheck.Limit;
// Subtlety: We need all the values to be *invariant* across all iterations,
// but we only need to check expansion safety for those which *aren't*
- // already guaranteed to dominate the guard.
+ // already guaranteed to dominate the guard.
if (!isLoopInvariantValue(GuardStart) ||
!isLoopInvariantValue(GuardLimit) ||
!isLoopInvariantValue(LatchStart) ||
@@ -598,7 +599,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckIncrementingLoop(
LLVM_DEBUG(dbgs() << "LHS: " << *LatchLimit << "\n");
LLVM_DEBUG(dbgs() << "RHS: " << *RHS << "\n");
LLVM_DEBUG(dbgs() << "Pred: " << LimitCheckPred << "\n");
-
+
auto *LimitCheck =
expandCheck(Expander, Guard, LimitCheckPred, LatchLimit, RHS);
auto *FirstIterationCheck = expandCheck(Expander, Guard, RangeCheck.Pred,
@@ -617,7 +618,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
const SCEV *LatchLimit = LatchCheck.Limit;
// Subtlety: We need all the values to be *invariant* across all iterations,
// but we only need to check expansion safety for those which *aren't*
- // already guaranteed to dominate the guard.
+ // already guaranteed to dominate the guard.
if (!isLoopInvariantValue(GuardStart) ||
!isLoopInvariantValue(GuardLimit) ||
!isLoopInvariantValue(LatchStart) ||
@@ -658,7 +659,7 @@ Optional<Value *> LoopPredication::widenICmpRangeCheckDecrementingLoop(
static void normalizePredicate(ScalarEvolution *SE, Loop *L,
LoopICmp& RC) {
// LFTR canonicalizes checks to the ICMP_NE/EQ form; normalize back to the
- // ULT/UGE form for ease of handling by our caller.
+ // ULT/UGE form for ease of handling by our caller.
if (ICmpInst::isEquality(RC.Pred) &&
RC.IV->getStepRecurrence(*SE)->isOne() &&
SE->isKnownPredicate(ICmpInst::ICMP_ULE, RC.IV->getStart(), RC.Limit))
@@ -1020,17 +1021,6 @@ static const SCEV *getMinAnalyzeableBackedgeTakenCount(ScalarEvolution &SE,
return SE.getUMinFromMismatchedTypes(ExitCounts);
}
-/// Return true if we can be fairly sure that executing block BB will probably
-/// lead to executing an __llvm_deoptimize. This is a profitability heuristic,
-/// not a legality constraint.
-static bool isVeryLikelyToDeopt(BasicBlock *BB) {
- while (BB->getUniqueSuccessor())
- // Will skip side effects, that's okay
- BB = BB->getUniqueSuccessor();
-
- return BB->getTerminatingDeoptimizeCall();
-}
-
/// This implements an analogous, but entirely distinct transform from the main
/// loop predication transform. This one is phrased in terms of using a
/// widenable branch *outside* the loop to allow us to simplify loop exits in a
@@ -1054,7 +1044,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// inserting a branch on the value which can be either poison or undef. In
// this case, the branch can legally go either way; we just need to avoid
// introducing UB. This is achieved through the use of the freeze
- // instruction.
+ // instruction.
SmallVector<BasicBlock *, 16> ExitingBlocks;
L->getExitingBlocks(ExitingBlocks);
@@ -1082,7 +1072,7 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
// analyzeable after dropping widenability.
{
bool Invalidate = false;
-
+
for (auto *ExitingBB : ExitingBlocks) {
if (LI->getLoopFor(ExitingBB) != L)
continue;
@@ -1150,10 +1140,13 @@ bool LoopPredication::predicateLoopExits(Loop *L, SCEVExpander &Rewriter) {
const bool ExitIfTrue = !L->contains(*succ_begin(ExitingBB));
BasicBlock *ExitBB = BI->getSuccessor(ExitIfTrue ? 0 : 1);
- if (!isVeryLikelyToDeopt(ExitBB))
- // Profitability: indicator of rarely/never taken exit
+ if (!ExitBB->getPostdominatingDeoptimizeCall())
continue;
+ /// Here we can be fairly sure that executing this exit will most likely
+ /// lead to executing llvm.experimental.deoptimize.
+ /// This is a profitability heuristic, not a legality constraint.
+
// If we found a widenable exit condition, do two things:
// 1) fold the widened exit test into the widenable condition
// 2) fold the branch to untaken - avoids infinite looping
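
BasicBlock::getPostdominatingDeoptimizeCall() packages up the walk the deleted isVeryLikelyToDeopt() did by hand: it chases unique successors from the block and returns the terminating @llvm.experimental.deoptimize call of the block it ends up in, or null if there is none. As a profitability filter that boils down to a one-line predicate (name invented here):

  #include "llvm/IR/BasicBlock.h"
  using namespace llvm;

  // Profitability, not legality: does leaving the loop through BB (almost)
  // certainly end in a deoptimization?
  static bool exitLikelyDeopts(const BasicBlock *BB) {
    return BB->getPostdominatingDeoptimizeCall() != nullptr;
  }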
diff --git a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
index da13a342ae12..3542d0a4ee73 100644
--- a/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRerollPass.cpp
@@ -24,7 +24,6 @@
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -55,6 +54,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <cassert>
#include <cstddef>
#include <cstdint>
@@ -880,6 +880,12 @@ bool LoopReroll::DAGRootTracker::validateRootSet(DAGRootSet &DRS) {
if (DRS.Roots.empty())
return false;
+ // If the value of the base instruction is used outside the loop, we cannot
+ // reroll the loop. Checking other root instructions is unnecessary because
+ // they don't match any base instructions if their values are used outside.
+ if (hasUsesOutsideLoop(DRS.BaseInst, L))
+ return false;
+
// Consider a DAGRootSet with N-1 roots (so N different values including
// BaseInst).
// Define d = Roots[0] - BaseInst, which should be the same as
@@ -1126,7 +1132,7 @@ static bool isIgnorableInst(const Instruction *I) {
case Intrinsic::annotation:
case Intrinsic::ptr_annotation:
case Intrinsic::var_annotation:
- // TODO: the following intrinsics may also be whitelisted:
+ // TODO: the following intrinsics may also be allowed:
// lifetime_start, lifetime_end, invariant_start, invariant_end
return true;
}
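
The new bail-out only needs to look at the base instruction: a root whose value escaped the loop could never have matched a base instruction in the first place, as the comment notes. The underlying test is the usual "used outside the loop" query; hasUsesOutsideLoop above amounts to something like this sketch (helper name invented here):

  #include "llvm/ADT/STLExtras.h"
  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/Instruction.h"
  using namespace llvm;

  // True if any user of I lives outside loop L.
  static bool usedOutsideLoop(const Instruction *I, const Loop *L) {
    return any_of(I->users(), [&](const User *U) {
      return !L->contains(cast<Instruction>(U));
    });
  }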
diff --git a/llvm/lib/Transforms/Scalar/LoopRotation.cpp b/llvm/lib/Transforms/Scalar/LoopRotation.cpp
index 0868e742f4ee..f92566ba77ce 100644
--- a/llvm/lib/Transforms/Scalar/LoopRotation.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopRotation.cpp
@@ -81,10 +81,8 @@ public:
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<AssumptionCacheTracker>();
AU.addRequired<TargetTransformInfoWrapperPass>();
- if (EnableMSSALoopDependency) {
- AU.addRequired<MemorySSAWrapperPass>();
+ if (EnableMSSALoopDependency)
AU.addPreserved<MemorySSAWrapperPass>();
- }
getLoopAnalysisUsage(AU);
}
@@ -101,15 +99,18 @@ public:
const SimplifyQuery SQ = getBestSimplifyQuery(*this, F);
Optional<MemorySSAUpdater> MSSAU;
if (EnableMSSALoopDependency) {
- MemorySSA *MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
- MSSAU = MemorySSAUpdater(MSSA);
+ // Not requiring MemorySSA and getting it only if available will split
+ // the loop pass pipeline when LoopRotate is being run first.
+ auto *MSSAA = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ if (MSSAA)
+ MSSAU = MemorySSAUpdater(&MSSAA->getMSSA());
}
return LoopRotation(L, LI, TTI, AC, &DT, &SE,
MSSAU.hasValue() ? MSSAU.getPointer() : nullptr, SQ,
false, MaxHeaderSize, false);
}
};
-}
+} // end namespace
char LoopRotateLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopRotateLegacyPass, "loop-rotate", "Rotate Loops",
diff --git a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
index b27e65e0adb7..031e5b9c1d2c 100644
--- a/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopSimplifyCFG.cpp
@@ -23,6 +23,7 @@
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
@@ -30,6 +31,7 @@
#include "llvm/Analysis/ScalarEvolutionAliasAnalysis.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Scalar.h"
@@ -673,13 +675,13 @@ static bool mergeBlocksIntoPredecessors(Loop &L, DominatorTree &DT,
static bool simplifyLoopCFG(Loop &L, DominatorTree &DT, LoopInfo &LI,
ScalarEvolution &SE, MemorySSAUpdater *MSSAU,
- bool &isLoopDeleted) {
+ bool &IsLoopDeleted) {
bool Changed = false;
// Constant-fold terminators with known constant conditions.
- Changed |= constantFoldTerminators(L, DT, LI, SE, MSSAU, isLoopDeleted);
+ Changed |= constantFoldTerminators(L, DT, LI, SE, MSSAU, IsLoopDeleted);
- if (isLoopDeleted)
+ if (IsLoopDeleted)
return true;
// Eliminate unconditional branches by merging blocks into their predecessors.
@@ -752,7 +754,7 @@ public:
getLoopAnalysisUsage(AU);
}
};
-}
+} // end namespace
char LoopSimplifyCFGLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LoopSimplifyCFGLegacyPass, "loop-simplifycfg",
diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index e9f368628a08..cf02ef1e83f3 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -65,12 +65,14 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/iterator_range.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/IVUsers.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MemorySSA.h"
+#include "llvm/Analysis/MemorySSAUpdater.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/ScalarEvolutionNormalization.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -109,6 +111,7 @@
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include <algorithm>
#include <cassert>
#include <cstddef>
@@ -807,9 +810,14 @@ static bool isAddressUse(const TargetTransformInfo &TTI,
switch (II->getIntrinsicID()) {
case Intrinsic::memset:
case Intrinsic::prefetch:
+ case Intrinsic::masked_load:
if (II->getArgOperand(0) == OperandVal)
isAddress = true;
break;
+ case Intrinsic::masked_store:
+ if (II->getArgOperand(1) == OperandVal)
+ isAddress = true;
+ break;
case Intrinsic::memmove:
case Intrinsic::memcpy:
if (II->getArgOperand(0) == OperandVal ||
@@ -859,6 +867,15 @@ static MemAccessTy getAccessType(const TargetTransformInfo &TTI,
AccessTy.AddrSpace = OperandVal->getType()->getPointerAddressSpace();
AccessTy.MemTy = OperandVal->getType();
break;
+ case Intrinsic::masked_load:
+ AccessTy.AddrSpace =
+ II->getArgOperand(0)->getType()->getPointerAddressSpace();
+ break;
+ case Intrinsic::masked_store:
+ AccessTy.MemTy = II->getOperand(0)->getType();
+ AccessTy.AddrSpace =
+ II->getArgOperand(1)->getType()->getPointerAddressSpace();
+ break;
default: {
MemIntrinsicInfo IntrInfo;
if (TTI.getTgtMemIntrinsic(II, IntrInfo) && IntrInfo.PtrVal) {
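
The two new intrinsic cases encode where the pointer sits in the masked intrinsics' operand lists: llvm.masked.load is (ptr, align, mask, passthru), so the address is operand 0, while llvm.masked.store is (value, ptr, align, mask), so the address is operand 1 and operand 0 supplies the stored type. A small helper capturing just that mapping, for illustration (not part of the patch):

  #include "llvm/IR/IntrinsicInst.h"
  using namespace llvm;

  // Return the address operand of a masked load/store, or null otherwise.
  static Value *getMaskedPointerOperand(IntrinsicInst *II) {
    switch (II->getIntrinsicID()) {
    case Intrinsic::masked_load:
      return II->getArgOperand(0);
    case Intrinsic::masked_store:
      return II->getArgOperand(1);
    default:
      return nullptr;
    }
  }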
@@ -962,33 +979,6 @@ static bool isHighCostExpansion(const SCEV *S,
return true;
}
-/// If any of the instructions in the specified set are trivially dead, delete
-/// them and see if this makes any of their operands subsequently dead.
-static bool
-DeleteTriviallyDeadInstructions(SmallVectorImpl<WeakTrackingVH> &DeadInsts) {
- bool Changed = false;
-
- while (!DeadInsts.empty()) {
- Value *V = DeadInsts.pop_back_val();
- Instruction *I = dyn_cast_or_null<Instruction>(V);
-
- if (!I || !isInstructionTriviallyDead(I))
- continue;
-
- for (Use &O : I->operands())
- if (Instruction *U = dyn_cast<Instruction>(O)) {
- O = nullptr;
- if (U->use_empty())
- DeadInsts.emplace_back(U);
- }
-
- I->eraseFromParent();
- Changed = true;
- }
-
- return Changed;
-}
-
namespace {
class LSRUse;
@@ -1242,7 +1232,7 @@ void Cost::RateRegister(const Formula &F, const SCEV *Reg,
// for now LSR only handles innermost loops).
if (AR->getLoop() != L) {
// If the AddRec exists, consider it's register free and leave it alone.
- if (isExistingPhi(AR, *SE))
+ if (isExistingPhi(AR, *SE) && !TTI->shouldFavorPostInc())
return;
// It is bad to allow LSR for current loop to add induction variables
@@ -1913,9 +1903,10 @@ class LSRInstance {
DominatorTree &DT;
LoopInfo &LI;
AssumptionCache &AC;
- TargetLibraryInfo &LibInfo;
+ TargetLibraryInfo &TLI;
const TargetTransformInfo &TTI;
Loop *const L;
+ MemorySSAUpdater *MSSAU;
bool FavorBackedgeIndex = false;
bool Changed = false;
@@ -2018,6 +2009,7 @@ class LSRInstance {
void NarrowSearchSpaceByCollapsingUnrolledCode();
void NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
void NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ void NarrowSearchSpaceByFilterPostInc();
void NarrowSearchSpaceByDeletingCostlyFormulas();
void NarrowSearchSpaceByPickingWinnerRegs();
void NarrowSearchSpaceUsingHeuristics();
@@ -2053,7 +2045,7 @@ class LSRInstance {
public:
LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE, DominatorTree &DT,
LoopInfo &LI, const TargetTransformInfo &TTI, AssumptionCache &AC,
- TargetLibraryInfo &LibInfo);
+ TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU);
bool getChanged() const { return Changed; }
@@ -2830,9 +2822,10 @@ bool IVChain::isProfitableIncrement(const SCEV *OperExpr,
/// increments can be computed in fewer registers when chained.
///
/// TODO: Consider IVInc free if it's already used in other chains.
-static bool
-isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
- ScalarEvolution &SE) {
+static bool isProfitableChain(IVChain &Chain,
+ SmallPtrSetImpl<Instruction *> &Users,
+ ScalarEvolution &SE,
+ const TargetTransformInfo &TTI) {
if (StressIVChain)
return true;
@@ -2861,7 +2854,14 @@ isProfitableChain(IVChain &Chain, SmallPtrSetImpl<Instruction*> &Users,
unsigned NumConstIncrements = 0;
unsigned NumVarIncrements = 0;
unsigned NumReusedIncrements = 0;
+
+ if (TTI.isProfitableLSRChainElement(Chain.Incs[0].UserInst))
+ return true;
+
for (const IVInc &Inc : Chain) {
+ if (TTI.isProfitableLSRChainElement(Inc.UserInst))
+ return true;
+
if (Inc.IncExpr->isZero())
continue;
@@ -3092,7 +3092,7 @@ void LSRInstance::CollectChains() {
for (unsigned UsersIdx = 0, NChains = IVChainVec.size();
UsersIdx < NChains; ++UsersIdx) {
if (!isProfitableChain(IVChainVec[UsersIdx],
- ChainUsersVec[UsersIdx].FarUsers, SE))
+ ChainUsersVec[UsersIdx].FarUsers, SE, TTI))
continue;
// Preserve the chain at UsersIdx.
if (ChainIdx != UsersIdx)
@@ -3212,7 +3212,8 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
IVOper = Builder.CreateTruncOrBitCast(IVOper, OperTy, "lsr.chain");
}
Inc.UserInst->replaceUsesOfWith(Inc.IVOperand, IVOper);
- DeadInsts.emplace_back(Inc.IVOperand);
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(Inc.IVOperand))
+ DeadInsts.emplace_back(OperandIsInstr);
}
// If LSR created a new, wider phi, we may also replace its postinc. We only
// do this if we also found a wide value for the head of the chain.
@@ -3240,7 +3241,7 @@ void LSRInstance::GenerateIVChain(const IVChain &Chain, SCEVExpander &Rewriter,
void LSRInstance::CollectFixupsAndInitialFormulae() {
BranchInst *ExitBranch = nullptr;
- bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &LibInfo);
+ bool SaveCmp = TTI.canSaveCmp(L, &ExitBranch, &SE, &LI, &DT, &AC, &TLI);
for (const IVStrideUse &U : IU) {
Instruction *UserInst = U.getUser();
@@ -3553,9 +3554,6 @@ static bool mayUsePostIncMode(const TargetTransformInfo &TTI,
const SCEV *LoopStep = AR->getStepRecurrence(SE);
if (!isa<SCEVConstant>(LoopStep))
return false;
- if (LU.AccessTy.getType()->getScalarSizeInBits() !=
- LoopStep->getType()->getScalarSizeInBits())
- return false;
// Check if a post-indexed load/store can be used.
if (TTI.isIndexedLoadLegal(TTI.MIM_PostInc, AR->getType()) ||
TTI.isIndexedStoreLegal(TTI.MIM_PostInc, AR->getType())) {
@@ -4673,6 +4671,54 @@ void LSRInstance::NarrowSearchSpaceByFilterFormulaWithSameScaledReg() {
});
}
+/// If we are over the complexity limit, filter out any post-inc preferring
+/// uses, keeping only their lowest-register formulae.
+void LSRInstance::NarrowSearchSpaceByFilterPostInc() {
+ if (!TTI.shouldFavorPostInc())
+ return;
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ return;
+
+ LLVM_DEBUG(dbgs() << "The search space is too complex.\n"
+ "Narrowing the search space by choosing the lowest "
+ "register Formula for PostInc Uses.\n");
+
+ for (size_t LUIdx = 0, NumUses = Uses.size(); LUIdx != NumUses; ++LUIdx) {
+ LSRUse &LU = Uses[LUIdx];
+
+ if (LU.Kind != LSRUse::Address)
+ continue;
+ if (!TTI.isIndexedLoadLegal(TTI.MIM_PostInc, LU.AccessTy.getType()) &&
+ !TTI.isIndexedStoreLegal(TTI.MIM_PostInc, LU.AccessTy.getType()))
+ continue;
+
+ size_t MinRegs = std::numeric_limits<size_t>::max();
+ for (const Formula &F : LU.Formulae)
+ MinRegs = std::min(F.getNumRegs(), MinRegs);
+
+ bool Any = false;
+ for (size_t FIdx = 0, NumForms = LU.Formulae.size(); FIdx != NumForms;
+ ++FIdx) {
+ Formula &F = LU.Formulae[FIdx];
+ if (F.getNumRegs() > MinRegs) {
+ LLVM_DEBUG(dbgs() << " Filtering out formula "; F.print(dbgs());
+ dbgs() << "\n");
+ LU.DeleteFormula(F);
+ --FIdx;
+ --NumForms;
+ Any = true;
+ }
+ }
+ if (Any)
+ LU.RecomputeRegs(LUIdx, RegUses);
+
+ if (EstimateSearchSpaceComplexity() < ComplexityLimit)
+ break;
+ }
+
+ LLVM_DEBUG(dbgs() << "After pre-selection:\n"; print_uses(dbgs()));
+}
+
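// Editorial sketch, not part of the patch: stripped of the LSR data structures,
// the filtering loop above is a plain "keep only the minimum-register formulae"
// step. A standalone equivalent (FormulaLike is a made-up stand-in type):
#include <algorithm>
#include <cstddef>
#include <limits>
#include <vector>

struct FormulaLike {
  std::size_t NumRegs; // stand-in for Formula::getNumRegs()
};

static void keepOnlyMinRegFormulae(std::vector<FormulaLike> &Formulae) {
  std::size_t MinRegs = std::numeric_limits<std::size_t>::max();
  for (const FormulaLike &F : Formulae)
    MinRegs = std::min(MinRegs, F.NumRegs);
  // Drop every formula that needs more than the minimum number of registers.
  Formulae.erase(std::remove_if(Formulae.begin(), Formulae.end(),
                                [MinRegs](const FormulaLike &F) {
                                  return F.NumRegs > MinRegs;
                                }),
                 Formulae.end());
}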
/// The function deletes formulas with a high expected register count.
/// Assuming we don't know the value of each formula (the inefficient ones
/// having already been deleted), generate the probability of not selecting for each
@@ -4883,6 +4929,7 @@ void LSRInstance::NarrowSearchSpaceUsingHeuristics() {
NarrowSearchSpaceByRefilteringUndesirableDedicatedRegisters();
if (FilterSameScaledReg)
NarrowSearchSpaceByFilterFormulaWithSameScaledReg();
+ NarrowSearchSpaceByFilterPostInc();
if (LSRExpNarrow)
NarrowSearchSpaceByDeletingCostlyFormulas();
else
@@ -4923,19 +4970,24 @@ void LSRInstance::SolveRecurse(SmallVectorImpl<const Formula *> &Solution,
// Ignore formulae which may not be ideal in terms of register reuse of
// ReqRegs. The formula should use all required registers before
// introducing new ones.
- int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
- for (const SCEV *Reg : ReqRegs) {
- if ((F.ScaledReg && F.ScaledReg == Reg) ||
- is_contained(F.BaseRegs, Reg)) {
- --NumReqRegsToFind;
- if (NumReqRegsToFind == 0)
- break;
+ // This can sometimes (notably when trying to favour postinc) lead to
+ // sub-optimal decisions. In that case it is best left to the cost
+ // modelling to get right.
+ if (!TTI.shouldFavorPostInc() || LU.Kind != LSRUse::Address) {
+ int NumReqRegsToFind = std::min(F.getNumRegs(), ReqRegs.size());
+ for (const SCEV *Reg : ReqRegs) {
+ if ((F.ScaledReg && F.ScaledReg == Reg) ||
+ is_contained(F.BaseRegs, Reg)) {
+ --NumReqRegsToFind;
+ if (NumReqRegsToFind == 0)
+ break;
+ }
+ }
+ if (NumReqRegsToFind != 0) {
+ // If none of the formulae satisfied the required registers, then we could
+ // clear ReqRegs and try again. Currently, we simply give up in this case.
+ continue;
}
- }
- if (NumReqRegsToFind != 0) {
- // If none of the formulae satisfied the required registers, then we could
- // clear ReqRegs and try again. Currently, we simply give up in this case.
- continue;
}
// Evaluate the cost of the current formula. If it's already worse than
@@ -5268,7 +5320,8 @@ Value *LSRInstance::Expand(const LSRUse &LU, const LSRFixup &LF,
// form, update the ICmp's other operand.
if (LU.Kind == LSRUse::ICmpZero) {
ICmpInst *CI = cast<ICmpInst>(LF.UserInst);
- DeadInsts.emplace_back(CI->getOperand(1));
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(CI->getOperand(1)))
+ DeadInsts.emplace_back(OperandIsInstr);
assert(!F.BaseGV && "ICmp does not support folding a global value and "
"a scale at the same time!");
if (F.Scale == -1) {
@@ -5449,7 +5502,8 @@ void LSRInstance::Rewrite(const LSRUse &LU, const LSRFixup &LF,
LF.UserInst->replaceUsesOfWith(LF.OperandValToReplace, FullV);
}
- DeadInsts.emplace_back(LF.OperandValToReplace);
+ if (auto *OperandIsInstr = dyn_cast<Instruction>(LF.OperandValToReplace))
+ DeadInsts.emplace_back(OperandIsInstr);
}
/// Rewrite all the fixup locations with new values, following the chosen
@@ -5490,16 +5544,17 @@ void LSRInstance::ImplementSolution(
// instructions.
Rewriter.clear();
- Changed |= DeleteTriviallyDeadInstructions(DeadInsts);
+ Changed |= RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts,
+ &TLI, MSSAU);
}
LSRInstance::LSRInstance(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
const TargetTransformInfo &TTI, AssumptionCache &AC,
- TargetLibraryInfo &LibInfo)
- : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), LibInfo(LibInfo), TTI(TTI), L(L),
- FavorBackedgeIndex(EnableBackedgeIndexing &&
- TTI.shouldFavorBackedgeIndex(L)) {
+ TargetLibraryInfo &TLI, MemorySSAUpdater *MSSAU)
+ : IU(IU), SE(SE), DT(DT), LI(LI), AC(AC), TLI(TLI), TTI(TTI), L(L),
+ MSSAU(MSSAU), FavorBackedgeIndex(EnableBackedgeIndexing &&
+ TTI.shouldFavorBackedgeIndex(L)) {
// If LoopSimplify form is not available, stay out of trouble.
if (!L->isLoopSimplifyForm())
return;
@@ -5702,21 +5757,26 @@ void LoopStrengthReduce::getAnalysisUsage(AnalysisUsage &AU) const {
AU.addRequired<IVUsersWrapperPass>();
AU.addPreserved<IVUsersWrapperPass>();
AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.addPreserved<MemorySSAWrapperPass>();
}
static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
DominatorTree &DT, LoopInfo &LI,
const TargetTransformInfo &TTI,
- AssumptionCache &AC,
- TargetLibraryInfo &LibInfo) {
+ AssumptionCache &AC, TargetLibraryInfo &TLI,
+ MemorySSA *MSSA) {
bool Changed = false;
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
// Run the main LSR transformation.
- Changed |= LSRInstance(L, IU, SE, DT, LI, TTI, AC, LibInfo).getChanged();
+ Changed |=
+ LSRInstance(L, IU, SE, DT, LI, TTI, AC, TLI, MSSAU.get()).getChanged();
// Remove any extra phis created by processing inner loops.
- Changed |= DeleteDeadPHIs(L->getHeader());
+ Changed |= DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
if (EnablePhiElim && L->isLoopSimplifyForm()) {
SmallVector<WeakTrackingVH, 16> DeadInsts;
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
@@ -5727,8 +5787,9 @@ static bool ReduceLoopStrength(Loop *L, IVUsers &IU, ScalarEvolution &SE,
unsigned numFolded = Rewriter.replaceCongruentIVs(L, &DT, DeadInsts, &TTI);
if (numFolded) {
Changed = true;
- DeleteTriviallyDeadInstructions(DeadInsts);
- DeleteDeadPHIs(L->getHeader());
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(DeadInsts, &TLI,
+ MSSAU.get());
+ DeleteDeadPHIs(L->getHeader(), &TLI, MSSAU.get());
}
}
return Changed;
@@ -5746,19 +5807,26 @@ bool LoopStrengthReduce::runOnLoop(Loop *L, LPPassManager & /*LPM*/) {
*L->getHeader()->getParent());
auto &AC = getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
- auto &LibInfo = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
+ auto &TLI = getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(
*L->getHeader()->getParent());
- return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, LibInfo);
+ auto *MSSAAnalysis = getAnalysisIfAvailable<MemorySSAWrapperPass>();
+ MemorySSA *MSSA = nullptr;
+ if (MSSAAnalysis)
+ MSSA = &MSSAAnalysis->getMSSA();
+ return ReduceLoopStrength(L, IU, SE, DT, LI, TTI, AC, TLI, MSSA);
}
PreservedAnalyses LoopStrengthReducePass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &) {
if (!ReduceLoopStrength(&L, AM.getResult<IVUsersAnalysis>(L, AR), AR.SE,
- AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI))
+ AR.DT, AR.LI, AR.TTI, AR.AC, AR.TLI, AR.MSSA))
return PreservedAnalyses::all();
- return getLoopPassPreservedAnalyses();
+ auto PA = getLoopPassPreservedAnalyses();
+ if (AR.MSSA)
+ PA.preserve<MemorySSAAnalysis>();
+ return PA;
}
char LoopStrengthReduce::ID = 0;
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
index 92ad8dafa5ab..285cba6ee205 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollAndJamPass.cpp
@@ -11,8 +11,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/LoopUnrollAndJamPass.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/None.h"
-#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/StringRef.h"
#include "llvm/Analysis/AssumptionCache.h"
@@ -20,37 +22,36 @@
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/LoopAnalysisManager.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CFG.h"
-#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
-#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/PassManager.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/PassRegistry.h"
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
-#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
-#include "llvm/Transforms/Scalar/LoopPassManager.h"
-#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
-#include <algorithm>
#include <cassert>
#include <cstdint>
-#include <string>
+#include <vector>
+
+namespace llvm {
+class Instruction;
+class Value;
+} // namespace llvm
using namespace llvm;
@@ -91,7 +92,7 @@ static cl::opt<unsigned> PragmaUnrollAndJamThreshold(
// Returns the loop hint metadata node with the given name (for example,
// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
// returned.
-static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
if (MDNode *LoopID = L->getLoopID())
return GetUnrollMetadata(LoopID, Name);
return nullptr;
@@ -99,14 +100,14 @@ static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
// Returns true if the loop has any metadata starting with Prefix. For example a
// Prefix of "llvm.loop.unroll." returns true if we have any unroll metadata.
-static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
+static bool hasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
if (MDNode *LoopID = L->getLoopID()) {
// First operand should refer to the loop id itself.
assert(LoopID->getNumOperands() > 0 && "requires at least one operand");
assert(LoopID->getOperand(0) == LoopID && "invalid loop id");
- for (unsigned i = 1, e = LoopID->getNumOperands(); i < e; ++i) {
- MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(i));
+ for (unsigned I = 1, E = LoopID->getNumOperands(); I < E; ++I) {
+ MDNode *MD = dyn_cast<MDNode>(LoopID->getOperand(I));
if (!MD)
continue;
@@ -122,14 +123,14 @@ static bool HasAnyUnrollPragma(const Loop *L, StringRef Prefix) {
}
// Returns true if the loop has an unroll_and_jam(enable) pragma.
-static bool HasUnrollAndJamEnablePragma(const Loop *L) {
- return GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
+static bool hasUnrollAndJamEnablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.enable");
}
// If loop has an unroll_and_jam_count pragma return the (necessarily
// positive) value from the pragma. Otherwise return 0.
-static unsigned UnrollAndJamCountPragmaValue(const Loop *L) {
- MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
+static unsigned unrollAndJamCountPragmaValue(const Loop *L) {
+ MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll_and_jam.count");
if (MD) {
assert(MD->getNumOperands() == 2 &&
"Unroll count hint metadata should have two operands.");
@@ -157,7 +158,8 @@ static bool computeUnrollAndJamCount(
const SmallPtrSetImpl<const Value *> &EphValues,
OptimizationRemarkEmitter *ORE, unsigned OuterTripCount,
unsigned OuterTripMultiple, unsigned OuterLoopSize, unsigned InnerTripCount,
- unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP) {
+ unsigned InnerLoopSize, TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP) {
// First up use computeUnrollCount from the loop unroller to get a count
// for unrolling the outer loop, plus any loops requiring explicit
// unrolling we leave to the unroller. This uses UP.Threshold /
@@ -167,7 +169,8 @@ static bool computeUnrollAndJamCount(
bool UseUpperBound = false;
bool ExplicitUnroll = computeUnrollCount(
L, TTI, DT, LI, SE, EphValues, ORE, OuterTripCount, MaxTripCount,
- /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, UseUpperBound);
+ /*MaxOrZero*/ false, OuterTripMultiple, OuterLoopSize, UP, PP,
+ UseUpperBound);
if (ExplicitUnroll || UseUpperBound) {
// If the user explicitly set the loop as unrolled, don't UnJ it. Leave it
// for the unroller instead.
@@ -190,7 +193,7 @@ static bool computeUnrollAndJamCount(
}
// Check for unroll_and_jam pragmas
- unsigned PragmaCount = UnrollAndJamCountPragmaValue(L);
+ unsigned PragmaCount = unrollAndJamCountPragmaValue(L);
if (PragmaCount > 0) {
UP.Count = PragmaCount;
UP.Runtime = true;
@@ -202,7 +205,7 @@ static bool computeUnrollAndJamCount(
return true;
}
- bool PragmaEnableUnroll = HasUnrollAndJamEnablePragma(L);
+ bool PragmaEnableUnroll = hasUnrollAndJamEnablePragma(L);
bool ExplicitUnrollAndJamCount = PragmaCount > 0 || UserUnrollCount;
bool ExplicitUnrollAndJam = PragmaEnableUnroll || ExplicitUnrollAndJamCount;
@@ -279,24 +282,11 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
ScalarEvolution &SE, const TargetTransformInfo &TTI,
AssumptionCache &AC, DependenceInfo &DI,
OptimizationRemarkEmitter &ORE, int OptLevel) {
- // Quick checks of the correct loop form
- if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
- return LoopUnrollResult::Unmodified;
- Loop *SubLoop = L->getSubLoops()[0];
- if (!SubLoop->isLoopSimplifyForm())
- return LoopUnrollResult::Unmodified;
-
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *Exit = L->getExitingBlock();
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
- if (Latch != Exit || SubLoopLatch != SubLoopExit)
- return LoopUnrollResult::Unmodified;
-
TargetTransformInfo::UnrollingPreferences UP =
gatherUnrollingPreferences(L, SE, TTI, nullptr, nullptr, OptLevel, None,
- None, None, None, None, None, None, None);
+ None, None, None, None, None);
+ TargetTransformInfo::PeelingPreferences PP =
+ gatherPeelingPreferences(L, SE, TTI, None, None);
if (AllowUnrollAndJam.getNumOccurrences() > 0)
UP.UnrollAndJam = AllowUnrollAndJam;
if (UnrollAndJamThreshold.getNumOccurrences() > 0)
@@ -317,13 +307,13 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// the unroller, so long as it does not explicitly have unroll_and_jam
// metadata. This means #pragma nounroll will disable unroll and jam as well
// as unrolling
- if (HasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
- !HasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
+ if (hasAnyUnrollPragma(L, "llvm.loop.unroll.") &&
+ !hasAnyUnrollPragma(L, "llvm.loop.unroll_and_jam.")) {
LLVM_DEBUG(dbgs() << " Disabled due to pragma.\n");
return LoopUnrollResult::Unmodified;
}
- if (!isSafeToUnrollAndJam(L, SE, DT, DI)) {
+ if (!isSafeToUnrollAndJam(L, SE, DT, DI, *LI)) {
LLVM_DEBUG(dbgs() << " Disabled due to not being safe.\n");
return LoopUnrollResult::Unmodified;
}
@@ -334,6 +324,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
bool Convergent;
SmallPtrSet<const Value *, 32> EphValues;
CodeMetrics::collectEphemeralValues(L, &AC, EphValues);
+ Loop *SubLoop = L->getSubLoops()[0];
unsigned InnerLoopSize =
ApproximateLoopSize(SubLoop, NumInlineCandidates, NotDuplicatable,
Convergent, TTI, EphValues, UP.BEInsns);
@@ -371,6 +362,8 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
SubLoop->setLoopID(NewInnerEpilogueLoopID.getValue());
// Find trip count and trip multiple
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
unsigned OuterTripCount = SE.getSmallConstantTripCount(L, Latch);
unsigned OuterTripMultiple = SE.getSmallConstantTripMultiple(L, Latch);
unsigned InnerTripCount = SE.getSmallConstantTripCount(SubLoop, SubLoopLatch);
@@ -378,7 +371,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
// Decide if, and by how much, to unroll
bool IsCountSetExplicitly = computeUnrollAndJamCount(
L, SubLoop, TTI, DT, LI, SE, EphValues, &ORE, OuterTripCount,
- OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP);
+ OuterTripMultiple, OuterLoopSize, InnerTripCount, InnerLoopSize, UP, PP);
if (UP.Count <= 1)
return LoopUnrollResult::Unmodified;
// Unroll factor (Count) must be less or equal to TripCount.
@@ -388,7 +381,7 @@ tryToUnrollAndJamLoop(Loop *L, DominatorTree &DT, LoopInfo *LI,
Loop *EpilogueOuterLoop = nullptr;
LoopUnrollResult UnrollResult = UnrollAndJamLoop(
L, UP.Count, OuterTripCount, OuterTripMultiple, UP.UnrollRemainder, LI,
- &SE, &DT, &AC, &ORE, &EpilogueOuterLoop);
+ &SE, &DT, &AC, &TTI, &ORE, &EpilogueOuterLoop);
// Assign new loop attributes.
if (EpilogueOuterLoop) {
@@ -435,22 +428,23 @@ static bool tryToUnrollAndJamLoop(Function &F, DominatorTree &DT, LoopInfo &LI,
int OptLevel) {
bool DidSomething = false;
- // The loop unroll and jam pass requires loops to be in simplified form, and also needs LCSSA.
- // Since simplification may add new inner loops, it has to run before the
- // legality and profitability checks. This means running the loop unroll and jam pass
- // will simplify all loops, regardless of whether anything end up being
- // unroll and jammed.
+ // The loop unroll and jam pass requires loops to be in simplified form, and
+ // also needs LCSSA. Since simplification may add new inner loops, it has to
+ // run before the legality and profitability checks. This means running the
+ // loop unroll and jam pass will simplify all loops, regardless of whether
+ // anything ends up being unroll and jammed.
for (auto &L : LI) {
DidSomething |=
simplifyLoop(L, &DT, &LI, &SE, &AC, nullptr, false /* PreserveLCSSA */);
DidSomething |= formLCSSARecursively(*L, DT, &LI, &SE);
}
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
SmallPriorityWorklist<Loop *, 4> Worklist;
- internal::appendLoopsToWorklist(reverse(LI), Worklist);
+ appendLoopsToWorklist(LI, Worklist);
while (!Worklist.empty()) {
Loop *L = Worklist.pop_back_val();
- formLCSSA(*L, DT, &LI, &SE);
LoopUnrollResult Result =
tryToUnrollAndJamLoop(L, DT, &LI, SE, TTI, AC, DI, ORE, OptLevel);
if (Result != LoopUnrollResult::Unmodified)
diff --git a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
index 4c2b079c6bb5..87f40bb7ba85 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnrollPass.cpp
@@ -154,6 +154,10 @@ static cl::opt<bool>
cl::desc("Allows loops to be peeled when the dynamic "
"trip count is known to be low."));
+static cl::opt<bool> UnrollAllowLoopNestsPeeling(
+ "unroll-allow-loop-nests-peeling", cl::init(false), cl::Hidden,
+ cl::desc("Allows loop nests to be peeled."));
+
static cl::opt<bool> UnrollUnrollRemainder(
"unroll-remainder", cl::Hidden,
cl::desc("Allow the loop remainder to be unrolled."));
@@ -167,6 +171,16 @@ static cl::opt<bool> UnrollRevisitChildLoops(
"This shouldn't typically be needed as child loops (or their "
"clones) were already visited."));
+static cl::opt<unsigned> UnrollThresholdAggressive(
+ "unroll-threshold-aggressive", cl::init(300), cl::Hidden,
+ cl::desc("Threshold (max size of unrolled loop) to use in aggressive (O3) "
+ "optimizations"));
+static cl::opt<unsigned>
+ UnrollThresholdDefault("unroll-threshold-default", cl::init(150),
+ cl::Hidden,
+ cl::desc("Default threshold (max size of unrolled "
+ "loop), used in all but O3 optimizations"));
+
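// Usage note (editorial; the opt invocation is an assumption, not taken from
// the patch): these two flags expose the previously hard-coded 300/150 size
// thresholds, e.g. with the legacy pass manager:
//   opt -loop-unroll -unroll-threshold-aggressive=400 \
//       -unroll-threshold-default=200 -S in.ll -o out.ll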
/// A magic value for use with the Threshold parameter to indicate
/// that the loop unroll should be performed regardless of how much
/// code expansion would result.
@@ -179,19 +193,17 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
BlockFrequencyInfo *BFI, ProfileSummaryInfo *PSI, int OptLevel,
Optional<unsigned> UserThreshold, Optional<unsigned> UserCount,
Optional<bool> UserAllowPartial, Optional<bool> UserRuntime,
- Optional<bool> UserUpperBound, Optional<bool> UserAllowPeeling,
- Optional<bool> UserAllowProfileBasedPeeling,
- Optional<unsigned> UserFullUnrollMaxCount) {
+ Optional<bool> UserUpperBound, Optional<unsigned> UserFullUnrollMaxCount) {
TargetTransformInfo::UnrollingPreferences UP;
// Set up the defaults
- UP.Threshold = OptLevel > 2 ? 300 : 150;
+ UP.Threshold =
+ OptLevel > 2 ? UnrollThresholdAggressive : UnrollThresholdDefault;
UP.MaxPercentThresholdBoost = 400;
UP.OptSizeThreshold = 0;
UP.PartialThreshold = 150;
UP.PartialOptSizeThreshold = 0;
UP.Count = 0;
- UP.PeelCount = 0;
UP.DefaultUnrollRuntimeCount = 8;
UP.MaxCount = std::numeric_limits<unsigned>::max();
UP.FullUnrollMaxCount = std::numeric_limits<unsigned>::max();
@@ -203,10 +215,9 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.AllowExpensiveTripCount = false;
UP.Force = false;
UP.UpperBound = false;
- UP.AllowPeeling = true;
UP.UnrollAndJam = false;
- UP.PeelProfiledIterations = true;
UP.UnrollAndJamInnerLoopThreshold = 60;
+ UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
// Override with any target specific settings
TTI.getUnrollingPreferences(L, SE, UP);
@@ -232,8 +243,6 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.MaxCount = UnrollMaxCount;
if (UnrollFullMaxCount.getNumOccurrences() > 0)
UP.FullUnrollMaxCount = UnrollFullMaxCount;
- if (UnrollPeelCount.getNumOccurrences() > 0)
- UP.PeelCount = UnrollPeelCount;
if (UnrollAllowPartial.getNumOccurrences() > 0)
UP.Partial = UnrollAllowPartial;
if (UnrollAllowRemainder.getNumOccurrences() > 0)
@@ -242,10 +251,10 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.Runtime = UnrollRuntime;
if (UnrollMaxUpperBound == 0)
UP.UpperBound = false;
- if (UnrollAllowPeeling.getNumOccurrences() > 0)
- UP.AllowPeeling = UnrollAllowPeeling;
if (UnrollUnrollRemainder.getNumOccurrences() > 0)
UP.UnrollRemainder = UnrollUnrollRemainder;
+ if (UnrollMaxIterationsCountToAnalyze.getNumOccurrences() > 0)
+ UP.MaxIterationsCountToAnalyze = UnrollMaxIterationsCountToAnalyze;
// Apply user values provided by argument
if (UserThreshold.hasValue()) {
@@ -260,16 +269,45 @@ TargetTransformInfo::UnrollingPreferences llvm::gatherUnrollingPreferences(
UP.Runtime = *UserRuntime;
if (UserUpperBound.hasValue())
UP.UpperBound = *UserUpperBound;
- if (UserAllowPeeling.hasValue())
- UP.AllowPeeling = *UserAllowPeeling;
- if (UserAllowProfileBasedPeeling.hasValue())
- UP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
if (UserFullUnrollMaxCount.hasValue())
UP.FullUnrollMaxCount = *UserFullUnrollMaxCount;
return UP;
}
+TargetTransformInfo::PeelingPreferences
+llvm::gatherPeelingPreferences(Loop *L, ScalarEvolution &SE,
+ const TargetTransformInfo &TTI,
+ Optional<bool> UserAllowPeeling,
+ Optional<bool> UserAllowProfileBasedPeeling) {
+ TargetTransformInfo::PeelingPreferences PP;
+
+ // Default values
+ PP.PeelCount = 0;
+ PP.AllowPeeling = true;
+ PP.AllowLoopNestsPeeling = false;
+ PP.PeelProfiledIterations = true;
+
+ // Get Target Specific Values
+ TTI.getPeelingPreferences(L, SE, PP);
+
+ // User Specified Values using cl::opt
+ if (UnrollPeelCount.getNumOccurrences() > 0)
+ PP.PeelCount = UnrollPeelCount;
+ if (UnrollAllowPeeling.getNumOccurrences() > 0)
+ PP.AllowPeeling = UnrollAllowPeeling;
+ if (UnrollAllowLoopNestsPeeling.getNumOccurrences() > 0)
+ PP.AllowLoopNestsPeeling = UnrollAllowLoopNestsPeeling;
+
+ // User Specified values provided by argument
+ if (UserAllowPeeling.hasValue())
+ PP.AllowPeeling = *UserAllowPeeling;
+ if (UserAllowProfileBasedPeeling.hasValue())
+ PP.PeelProfiledIterations = *UserAllowProfileBasedPeeling;
+
+ return PP;
+}
+
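// Editorial sketch (hypothetical target, not part of the patch): with peeling
// split out of UnrollingPreferences, a backend can now tune peeling on its own
// by overriding the TTI hook that gatherPeelingPreferences() consults above.
// "MyTargetTTIImpl" is a made-up class name used purely for illustration.
void MyTargetTTIImpl::getPeelingPreferences(
    Loop *L, ScalarEvolution &SE,
    TargetTransformInfo::PeelingPreferences &PP) {
  PP.AllowLoopNestsPeeling = true;           // permit peeling whole loop nests
  PP.PeelCount = std::min(PP.PeelCount, 4u); // but never peel more than 4 times
}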
namespace {
/// A struct to densely store the state of an instruction after unrolling at
@@ -335,11 +373,12 @@ struct EstimatedUnrollCost {
static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
const Loop *L, unsigned TripCount, DominatorTree &DT, ScalarEvolution &SE,
const SmallPtrSetImpl<const Value *> &EphValues,
- const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize) {
+ const TargetTransformInfo &TTI, unsigned MaxUnrolledLoopSize,
+ unsigned MaxIterationsCountToAnalyze) {
// We want to be able to scale offsets by the trip count and add more offsets
// to them without checking for overflows, and we already don't want to
// analyze *massive* trip counts, so we force the max to be reasonably small.
- assert(UnrollMaxIterationsCountToAnalyze <
+ assert(MaxIterationsCountToAnalyze <
(unsigned)(std::numeric_limits<int>::max() / 2) &&
"The unroll iterations max is too large!");
@@ -349,8 +388,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
return None;
// Don't simulate loops with a big or unknown tripcount
- if (!UnrollMaxIterationsCountToAnalyze || !TripCount ||
- TripCount > UnrollMaxIterationsCountToAnalyze)
+ if (!TripCount || TripCount > MaxIterationsCountToAnalyze)
return None;
SmallSetVector<BasicBlock *, 16> BBWorklist;
@@ -428,7 +466,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
// First accumulate the cost of this instruction.
if (!Cost.IsFree) {
- UnrolledCost += TTI.getUserCost(I);
+ UnrolledCost += TTI.getUserCost(I, TargetTransformInfo::TCK_CodeSize);
LLVM_DEBUG(dbgs() << "Adding cost of instruction (iteration "
<< Iteration << "): ");
LLVM_DEBUG(I->dump());
@@ -521,7 +559,7 @@ static Optional<EstimatedUnrollCost> analyzeLoopUnrollCost(
// Track this instruction's expected baseline cost when executing the
// rolled loop form.
- RolledDynamicCost += TTI.getUserCost(&I);
+ RolledDynamicCost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
// Visit the instruction to analyze its loop cost after unrolling,
// and if the visitor returns true, mark the instruction as free after
@@ -665,32 +703,32 @@ unsigned llvm::ApproximateLoopSize(
// Returns the loop hint metadata node with the given name (for example,
// "llvm.loop.unroll.count"). If no such metadata node exists, then nullptr is
// returned.
-static MDNode *GetUnrollMetadataForLoop(const Loop *L, StringRef Name) {
+static MDNode *getUnrollMetadataForLoop(const Loop *L, StringRef Name) {
if (MDNode *LoopID = L->getLoopID())
return GetUnrollMetadata(LoopID, Name);
return nullptr;
}
// Returns true if the loop has an unroll(full) pragma.
-static bool HasUnrollFullPragma(const Loop *L) {
- return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
+static bool hasUnrollFullPragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.full");
}
// Returns true if the loop has an unroll(enable) pragma. This metadata is used
// for both "#pragma unroll" and "#pragma clang loop unroll(enable)" directives.
-static bool HasUnrollEnablePragma(const Loop *L) {
- return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
+static bool hasUnrollEnablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.enable");
}
// Returns true if the loop has a runtime unroll(disable) pragma.
-static bool HasRuntimeUnrollDisablePragma(const Loop *L) {
- return GetUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
+static bool hasRuntimeUnrollDisablePragma(const Loop *L) {
+ return getUnrollMetadataForLoop(L, "llvm.loop.unroll.runtime.disable");
}
// If loop has an unroll_count pragma return the (necessarily
// positive) value from the pragma. Otherwise return 0.
-static unsigned UnrollCountPragmaValue(const Loop *L) {
- MDNode *MD = GetUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
+static unsigned unrollCountPragmaValue(const Loop *L) {
+ MDNode *MD = getUnrollMetadataForLoop(L, "llvm.loop.unroll.count");
if (MD) {
assert(MD->getNumOperands() == 2 &&
"Unroll count hint metadata should have two operands.");
@@ -740,7 +778,8 @@ bool llvm::computeUnrollCount(
ScalarEvolution &SE, const SmallPtrSetImpl<const Value *> &EphValues,
OptimizationRemarkEmitter *ORE, unsigned &TripCount, unsigned MaxTripCount,
bool MaxOrZero, unsigned &TripMultiple, unsigned LoopSize,
- TargetTransformInfo::UnrollingPreferences &UP, bool &UseUpperBound) {
+ TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP, bool &UseUpperBound) {
// Check for explicit Count.
// 1st priority is unroll count set by "unroll-count" option.
@@ -754,7 +793,7 @@ bool llvm::computeUnrollCount(
}
// 2nd priority is unroll count set by pragma.
- unsigned PragmaCount = UnrollCountPragmaValue(L);
+ unsigned PragmaCount = unrollCountPragmaValue(L);
if (PragmaCount > 0) {
UP.Count = PragmaCount;
UP.Runtime = true;
@@ -764,14 +803,14 @@ bool llvm::computeUnrollCount(
getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return true;
}
- bool PragmaFullUnroll = HasUnrollFullPragma(L);
+ bool PragmaFullUnroll = hasUnrollFullPragma(L);
if (PragmaFullUnroll && TripCount != 0) {
UP.Count = TripCount;
if (getUnrolledLoopSize(LoopSize, UP) < PragmaUnrollThreshold)
return false;
}
- bool PragmaEnableUnroll = HasUnrollEnablePragma(L);
+ bool PragmaEnableUnroll = hasUnrollEnablePragma(L);
bool ExplicitUnroll = PragmaCount > 0 || PragmaFullUnroll ||
PragmaEnableUnroll || UserUnrollCount;
@@ -827,7 +866,8 @@ bool llvm::computeUnrollCount(
// To check that, run additional analysis on the loop.
if (Optional<EstimatedUnrollCost> Cost = analyzeLoopUnrollCost(
L, FullUnrollTripCount, DT, SE, EphValues, TTI,
- UP.Threshold * UP.MaxPercentThresholdBoost / 100)) {
+ UP.Threshold * UP.MaxPercentThresholdBoost / 100,
+ UP.MaxIterationsCountToAnalyze)) {
unsigned Boost =
getFullUnrollBoostingFactor(*Cost, UP.MaxPercentThresholdBoost);
if (Cost->UnrolledCost < UP.Threshold * Boost / 100) {
@@ -841,8 +881,8 @@ bool llvm::computeUnrollCount(
}
// 4th priority is loop peeling.
- computePeelCount(L, LoopSize, UP, TripCount, SE);
- if (UP.PeelCount) {
+ computePeelCount(L, LoopSize, UP, PP, TripCount, SE);
+ if (PP.PeelCount) {
UP.Runtime = false;
UP.Count = 1;
return ExplicitUnroll;
@@ -925,7 +965,7 @@ bool llvm::computeUnrollCount(
// 6th priority is runtime unrolling.
// Don't unroll a runtime trip count loop when it is disabled.
- if (HasRuntimeUnrollDisablePragma(L)) {
+ if (hasRuntimeUnrollDisablePragma(L)) {
UP.Count = 0;
return false;
}
@@ -1045,8 +1085,9 @@ static LoopUnrollResult tryToUnrollLoop(
TargetTransformInfo::UnrollingPreferences UP = gatherUnrollingPreferences(
L, SE, TTI, BFI, PSI, OptLevel, ProvidedThreshold, ProvidedCount,
ProvidedAllowPartial, ProvidedRuntime, ProvidedUpperBound,
- ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling,
ProvidedFullUnrollMaxCount);
+ TargetTransformInfo::PeelingPreferences PP = gatherPeelingPreferences(
+ L, SE, TTI, ProvidedAllowPeeling, ProvidedAllowProfileBasedPeeling);
// Exit early if unrolling is disabled. For OptForSize, we pick the loop size
// as threshold later on.
@@ -1120,7 +1161,7 @@ static LoopUnrollResult tryToUnrollLoop(
bool UseUpperBound = false;
bool IsCountSetExplicitly = computeUnrollCount(
L, TTI, DT, LI, SE, EphValues, &ORE, TripCount, MaxTripCount, MaxOrZero,
- TripMultiple, LoopSize, UP, UseUpperBound);
+ TripMultiple, LoopSize, UP, PP, UseUpperBound);
if (!UP.Count)
return LoopUnrollResult::Unmodified;
// Unroll factor (Count) must be less or equal to TripCount.
@@ -1135,9 +1176,9 @@ static LoopUnrollResult tryToUnrollLoop(
LoopUnrollResult UnrollResult = UnrollLoop(
L,
{UP.Count, TripCount, UP.Force, UP.Runtime, UP.AllowExpensiveTripCount,
- UseUpperBound, MaxOrZero, TripMultiple, UP.PeelCount, UP.UnrollRemainder,
+ UseUpperBound, MaxOrZero, TripMultiple, PP.PeelCount, UP.UnrollRemainder,
ForgetAllSCEV},
- LI, &SE, &DT, &AC, &ORE, PreserveLCSSA, &RemainderLoop);
+ LI, &SE, &DT, &AC, &TTI, &ORE, PreserveLCSSA, &RemainderLoop);
if (UnrollResult == LoopUnrollResult::Unmodified)
return LoopUnrollResult::Unmodified;
@@ -1167,7 +1208,7 @@ static LoopUnrollResult tryToUnrollLoop(
// If the loop was peeled, we already "used up" the profile information
// we had, so we don't want to unroll or peel again.
if (UnrollResult != LoopUnrollResult::FullyUnrolled &&
- (IsCountSetExplicitly || (UP.PeelProfiledIterations && UP.PeelCount)))
+ (IsCountSetExplicitly || (PP.PeelProfiledIterations && PP.PeelCount)))
L->setLoopAlreadyUnrolled();
return UnrollResult;
@@ -1296,16 +1337,10 @@ Pass *llvm::createSimpleLoopUnrollPass(int OptLevel, bool OnlyWhenForced,
PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
LoopStandardAnalysisResults &AR,
LPMUpdater &Updater) {
- const auto &FAM =
- AM.getResult<FunctionAnalysisManagerLoopProxy>(L, AR).getManager();
- Function *F = L.getHeader()->getParent();
-
- auto *ORE = FAM.getCachedResult<OptimizationRemarkEmitterAnalysis>(*F);
- // FIXME: This should probably be optional rather than required.
- if (!ORE)
- report_fatal_error(
- "LoopFullUnrollPass: OptimizationRemarkEmitterAnalysis not "
- "cached at a higher level");
+ // For the new PM, we can't use OptimizationRemarkEmitter as an analysis
+ // pass. Function analyses need to be preserved across loop transformations
+ // but ORE cannot be preserved (see comment before the pass definition).
+ OptimizationRemarkEmitter ORE(L.getHeader()->getParent());
// Keep track of the previous loop structure so we can identify new loops
// created by unrolling.
@@ -1316,9 +1351,9 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
else
OldLoops.insert(AR.LI.begin(), AR.LI.end());
- std::string LoopName = L.getName();
+ std::string LoopName = std::string(L.getName());
- bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, *ORE,
+ bool Changed = tryToUnrollLoop(&L, AR.DT, &AR.LI, AR.SE, AR.TTI, AR.AC, ORE,
/*BFI*/ nullptr, /*PSI*/ nullptr,
/*PreserveLCSSA*/ true, OptLevel,
OnlyWhenForced, ForgetSCEV, /*Count*/ None,
@@ -1384,30 +1419,6 @@ PreservedAnalyses LoopFullUnrollPass::run(Loop &L, LoopAnalysisManager &AM,
return getLoopPassPreservedAnalyses();
}
-template <typename RangeT>
-static SmallVector<Loop *, 8> appendLoopsToWorklist(RangeT &&Loops) {
- SmallVector<Loop *, 8> Worklist;
- // We use an internal worklist to build up the preorder traversal without
- // recursion.
- SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
-
- for (Loop *RootL : Loops) {
- assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
- assert(PreOrderWorklist.empty() &&
- "Must start with an empty preorder walk worklist.");
- PreOrderWorklist.push_back(RootL);
- do {
- Loop *L = PreOrderWorklist.pop_back_val();
- PreOrderWorklist.append(L->begin(), L->end());
- PreOrderLoops.push_back(L);
- } while (!PreOrderWorklist.empty());
-
- Worklist.append(PreOrderLoops.begin(), PreOrderLoops.end());
- PreOrderLoops.clear();
- }
- return Worklist;
-}
-
PreservedAnalyses LoopUnrollPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &SE = AM.getResult<ScalarEvolutionAnalysis>(F);
@@ -1421,10 +1432,9 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
if (auto *LAMProxy = AM.getCachedResult<LoopAnalysisManagerFunctionProxy>(F))
LAM = &LAMProxy->getManager();
- const ModuleAnalysisManager &MAM =
- AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
- MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
auto *BFI = (PSI && PSI->hasProfileSummary()) ?
&AM.getResult<BlockFrequencyAnalysis>(F) : nullptr;
@@ -1441,7 +1451,10 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
Changed |= formLCSSARecursively(*L, DT, &LI, &SE);
}
- SmallVector<Loop *, 8> Worklist = appendLoopsToWorklist(LI);
+ // Add the loop nests in the reverse order of LoopInfo. See method
+ // declaration.
+ SmallPriorityWorklist<Loop *, 4> Worklist;
+ appendLoopsToWorklist(LI, Worklist);
while (!Worklist.empty()) {
// Because the LoopInfo stores the loops in RPO, we walk the worklist
@@ -1459,7 +1472,7 @@ PreservedAnalyses LoopUnrollPass::run(Function &F,
Optional<bool> LocalAllowPeeling = UnrollOpts.AllowPeeling;
if (PSI && PSI->hasHugeWorkingSetSize())
LocalAllowPeeling = false;
- std::string LoopName = L.getName();
+ std::string LoopName = std::string(L.getName());
// The API here is quite complex to call and we allow to select some
// flavors of unrolling during construction time (by setting UnrollOpts).
LoopUnrollResult Result = tryToUnrollLoop(
diff --git a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
index 915e053704b2..645a89bbd0ff 100644
--- a/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopUnswitch.cpp
@@ -38,11 +38,11 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DerivedTypes.h"
@@ -158,7 +158,7 @@ namespace {
// Returns true if another unswitching could be done within the cost
// threshold.
- bool CostAllowsUnswitching();
+ bool costAllowsUnswitching();
// Clone all loop-unswitch related loop properties.
// Redistribute unswitching quotas.
@@ -173,20 +173,20 @@ namespace {
AssumptionCache *AC;
// Used to check if second loop needs processing after
- // RewriteLoopBodyWithConditionConstant rewrites first loop.
+ // rewriteLoopBodyWithConditionConstant rewrites first loop.
std::vector<Loop*> LoopProcessWorklist;
LUAnalysisCache BranchesInfo;
bool OptimizeForSize;
- bool redoLoop = false;
+ bool RedoLoop = false;
- Loop *currentLoop = nullptr;
+ Loop *CurrentLoop = nullptr;
DominatorTree *DT = nullptr;
MemorySSA *MSSA = nullptr;
std::unique_ptr<MemorySSAUpdater> MSSAU;
- BasicBlock *loopHeader = nullptr;
- BasicBlock *loopPreheader = nullptr;
+ BasicBlock *LoopHeader = nullptr;
+ BasicBlock *LoopPreheader = nullptr;
bool SanitizeMemory;
SimpleLoopSafetyInfo SafetyInfo;
@@ -198,15 +198,15 @@ namespace {
// NewBlocks contained cloned copy of basic blocks from LoopBlocks.
std::vector<BasicBlock*> NewBlocks;
- bool hasBranchDivergence;
+ bool HasBranchDivergence;
public:
static char ID; // Pass ID, replacement for typeid
- explicit LoopUnswitch(bool Os = false, bool hasBranchDivergence = false)
+ explicit LoopUnswitch(bool Os = false, bool HasBranchDivergence = false)
: LoopPass(ID), OptimizeForSize(Os),
- hasBranchDivergence(hasBranchDivergence) {
- initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
+ HasBranchDivergence(HasBranchDivergence) {
+ initializeLoopUnswitchPass(*PassRegistry::getPassRegistry());
}
bool runOnLoop(Loop *L, LPPassManager &LPM) override;
@@ -223,48 +223,46 @@ namespace {
AU.addRequired<MemorySSAWrapperPass>();
AU.addPreserved<MemorySSAWrapperPass>();
}
- if (hasBranchDivergence)
+ if (HasBranchDivergence)
AU.addRequired<LegacyDivergenceAnalysis>();
getLoopAnalysisUsage(AU);
}
private:
- void releaseMemory() override {
- BranchesInfo.forgetLoop(currentLoop);
- }
+ void releaseMemory() override { BranchesInfo.forgetLoop(CurrentLoop); }
void initLoopData() {
- loopHeader = currentLoop->getHeader();
- loopPreheader = currentLoop->getLoopPreheader();
+ LoopHeader = CurrentLoop->getHeader();
+ LoopPreheader = CurrentLoop->getLoopPreheader();
}
/// Split all of the edges from inside the loop to their exit blocks.
/// Update the appropriate Phi nodes as we do so.
- void SplitExitEdges(Loop *L,
+ void splitExitEdges(Loop *L,
const SmallVectorImpl<BasicBlock *> &ExitBlocks);
- bool TryTrivialLoopUnswitch(bool &Changed);
+ bool tryTrivialLoopUnswitch(bool &Changed);
- bool UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+ bool unswitchIfProfitable(Value *LoopCond, Constant *Val,
Instruction *TI = nullptr);
- void UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+ void unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
BasicBlock *ExitBlock, Instruction *TI);
- void UnswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
+ void unswitchNontrivialCondition(Value *LIC, Constant *OnVal, Loop *L,
Instruction *TI);
- void RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
- Constant *Val, bool isEqual);
+ void rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+ Constant *Val, bool IsEqual);
- void EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+ void emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BasicBlock *TrueDest,
BasicBlock *FalseDest,
BranchInst *OldBranch, Instruction *TI);
- void SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L);
+ void simplifyCode(std::vector<Instruction *> &Worklist, Loop *L);
/// Given that the Invariant is not equal to Val, simplify instructions
/// in the loop.
- Value *SimplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
+ Value *simplifyInstructionWithNotEqual(Instruction *Inst, Value *Invariant,
Constant *Val);
};
@@ -347,7 +345,7 @@ bool LUAnalysisCache::isUnswitched(const SwitchInst *SI, const Value *V) {
return (*CurLoopInstructions)[SI].count(V);
}
-bool LUAnalysisCache::CostAllowsUnswitching() {
+bool LUAnalysisCache::costAllowsUnswitching() {
return CurrentLoopProperties->CanBeUnswitchedCount > 0;
}
@@ -396,8 +394,8 @@ INITIALIZE_PASS_DEPENDENCY(MemorySSAWrapperPass)
INITIALIZE_PASS_END(LoopUnswitch, "loop-unswitch", "Unswitch loops",
false, false)
-Pass *llvm::createLoopUnswitchPass(bool Os, bool hasBranchDivergence) {
- return new LoopUnswitch(Os, hasBranchDivergence);
+Pass *llvm::createLoopUnswitchPass(bool Os, bool HasBranchDivergence) {
+ return new LoopUnswitch(Os, HasBranchDivergence);
}
/// Operator chain lattice.
@@ -411,15 +409,15 @@ enum OperatorChain {
/// Cond is a condition that occurs in L. If it is invariant in the loop, or has
/// an invariant piece, return the invariant. Otherwise, return null.
//
-/// NOTE: FindLIVLoopCondition will not return a partial LIV by walking up a
-/// mixed operator chain, as we can not reliably find a value which will simplify
-/// the operator chain. If the chain is AND-only or OR-only, we can use 0 or ~0
-/// to simplify the chain.
+/// NOTE: findLIVLoopCondition will not return a partial LIV by walking up a
+/// mixed operator chain, as we cannot reliably find a value which will
+/// simplify the operator chain. If the chain is AND-only or OR-only, we can use
+/// 0 or ~0 to simplify the chain.
///
/// NOTE: In the case of a partial LIV and a mixed operator chain, we may be able to
/// simplify the condition itself to a loop variant condition, but at the
/// cost of creating an entirely new loop.
-static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+static Value *findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
OperatorChain &ParentChain,
DenseMap<Value *, Value *> &Cache,
MemorySSAUpdater *MSSAU) {
@@ -479,7 +477,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
// If either the left or right side is invariant, we can unswitch on this,
// which will cause the branch to go away in one loop and the condition to
// simplify in the other one.
- if (Value *LHS = FindLIVLoopCondition(BO->getOperand(0), L, Changed,
+ if (Value *LHS = findLIVLoopCondition(BO->getOperand(0), L, Changed,
ParentChain, Cache, MSSAU)) {
Cache[Cond] = LHS;
return LHS;
@@ -487,7 +485,7 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
// We did not manage to find a partial LIV in operand(0). Backtrack and try
// operand(1).
ParentChain = NewChain;
- if (Value *RHS = FindLIVLoopCondition(BO->getOperand(1), L, Changed,
+ if (Value *RHS = findLIVLoopCondition(BO->getOperand(1), L, Changed,
ParentChain, Cache, MSSAU)) {
Cache[Cond] = RHS;
return RHS;
@@ -503,11 +501,11 @@ static Value *FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
/// an invariant piece, return the invariant along with the operator chain type.
/// Otherwise, return null.
static std::pair<Value *, OperatorChain>
-FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
+findLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
MemorySSAUpdater *MSSAU) {
DenseMap<Value *, Value *> Cache;
OperatorChain OpChain = OC_OpChainNone;
- Value *FCond = FindLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
+ Value *FCond = findLIVLoopCondition(Cond, L, Changed, OpChain, Cache, MSSAU);
// In case we do find a LIV, it can not be obtained by walking up a mixed
// operator chain.
@@ -516,22 +514,22 @@ FindLIVLoopCondition(Value *Cond, Loop *L, bool &Changed,
return {FCond, OpChain};
}
-bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
+bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPMRef) {
if (skipLoop(L))
return false;
AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(
*L->getHeader()->getParent());
LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
- LPM = &LPM_Ref;
+ LPM = &LPMRef;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
if (EnableMSSALoopDependency) {
MSSA = &getAnalysis<MemorySSAWrapperPass>().getMSSA();
MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
assert(DT && "Cannot update MemorySSA without a valid DomTree.");
}
- currentLoop = L;
- Function *F = currentLoop->getHeader()->getParent();
+ CurrentLoop = L;
+ Function *F = CurrentLoop->getHeader()->getParent();
SanitizeMemory = F->hasFnAttribute(Attribute::SanitizeMemory);
if (SanitizeMemory)
@@ -542,12 +540,12 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
bool Changed = false;
do {
- assert(currentLoop->isLCSSAForm(*DT));
+ assert(CurrentLoop->isLCSSAForm(*DT));
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
- redoLoop = false;
+ RedoLoop = false;
Changed |= processCurrentLoop();
- } while(redoLoop);
+ } while (RedoLoop);
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
@@ -560,7 +558,7 @@ bool LoopUnswitch::runOnLoop(Loop *L, LPPassManager &LPM_Ref) {
bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
auto *Node = DT->getNode(BB)->getIDom();
BasicBlock *DomBB = Node->getBlock();
- while (currentLoop->contains(DomBB)) {
+ while (CurrentLoop->contains(DomBB)) {
BranchInst *BInst = dyn_cast<BranchInst>(DomBB->getTerminator());
Node = DT->getNode(DomBB)->getIDom();
@@ -591,7 +589,7 @@ bool LoopUnswitch::isUnreachableDueToPreviousUnswitching(BasicBlock *BB) {
/// causing problems. Details can be found in PR31652. Note that if the
/// function returns true, it is unsafe. But if it is false, it doesn't mean
/// it is necessarily safe.
-static bool EqualityPropUnSafe(Value &LoopCond) {
+static bool equalityPropUnSafe(Value &LoopCond) {
ICmpInst *CI = dyn_cast<ICmpInst>(&LoopCond);
if (!CI || !CI->isEquality())
return false;
@@ -601,7 +599,7 @@ static bool EqualityPropUnSafe(Value &LoopCond) {
if (isa<UndefValue>(LHS) || isa<UndefValue>(RHS))
return true;
- auto hasUndefInPHI = [](PHINode &PN) {
+ auto HasUndefInPHI = [](PHINode &PN) {
for (Value *Opd : PN.incoming_values()) {
if (isa<UndefValue>(Opd))
return true;
@@ -610,10 +608,10 @@ static bool EqualityPropUnSafe(Value &LoopCond) {
};
PHINode *LPHI = dyn_cast<PHINode>(LHS);
PHINode *RPHI = dyn_cast<PHINode>(RHS);
- if ((LPHI && hasUndefInPHI(*LPHI)) || (RPHI && hasUndefInPHI(*RPHI)))
+ if ((LPHI && HasUndefInPHI(*LPHI)) || (RPHI && HasUndefInPHI(*RPHI)))
return true;
- auto hasUndefInSelect = [](SelectInst &SI) {
+ auto HasUndefInSelect = [](SelectInst &SI) {
if (isa<UndefValue>(SI.getTrueValue()) ||
isa<UndefValue>(SI.getFalseValue()))
return true;
@@ -621,7 +619,7 @@ static bool EqualityPropUnSafe(Value &LoopCond) {
};
SelectInst *LSI = dyn_cast<SelectInst>(LHS);
SelectInst *RSI = dyn_cast<SelectInst>(RHS);
- if ((LSI && hasUndefInSelect(*LSI)) || (RSI && hasUndefInSelect(*RSI)))
+ if ((LSI && HasUndefInSelect(*LSI)) || (RSI && HasUndefInSelect(*RSI)))
return true;
return false;
}
@@ -633,35 +631,36 @@ bool LoopUnswitch::processCurrentLoop() {
initLoopData();
// If LoopSimplify was unable to form a preheader, don't do any unswitching.
- if (!loopPreheader)
+ if (!LoopPreheader)
return false;
// Loops with indirectbr cannot be cloned.
- if (!currentLoop->isSafeToClone())
+ if (!CurrentLoop->isSafeToClone())
return false;
// Without dedicated exits, splitting the exit edge may fail.
- if (!currentLoop->hasDedicatedExits())
+ if (!CurrentLoop->hasDedicatedExits())
return false;
- LLVMContext &Context = loopHeader->getContext();
+ LLVMContext &Context = LoopHeader->getContext();
// Analyze loop cost, and stop unswitching if loop content can not be duplicated.
if (!BranchesInfo.countLoop(
- currentLoop, getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
- *currentLoop->getHeader()->getParent()),
+ CurrentLoop,
+ getAnalysis<TargetTransformInfoWrapperPass>().getTTI(
+ *CurrentLoop->getHeader()->getParent()),
AC))
return false;
// Try trivial unswitch first before looping over other basic blocks in the loop.
- if (TryTrivialLoopUnswitch(Changed)) {
+ if (tryTrivialLoopUnswitch(Changed)) {
return true;
}
// Do not do non-trivial unswitch while optimizing for size.
// FIXME: Use Function::hasOptSize().
if (OptimizeForSize ||
- loopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
+ LoopHeader->getParent()->hasFnAttribute(Attribute::OptimizeForSize))
return false;
// Run through the instructions in the loop, keeping track of three things:
@@ -680,11 +679,12 @@ bool LoopUnswitch::processCurrentLoop() {
SmallVector<IntrinsicInst *, 4> Guards;
- for (const auto BB : currentLoop->blocks()) {
+ for (const auto BB : CurrentLoop->blocks()) {
for (auto &I : *BB) {
- auto CS = CallSite(&I);
- if (!CS) continue;
- if (CS.isConvergent())
+ auto *CB = dyn_cast<CallBase>(&I);
+ if (!CB)
+ continue;
+ if (CB->isConvergent())
return false;
if (auto *II = dyn_cast<InvokeInst>(&I))
if (!II->getUnwindDest()->canSplitPredecessors())
@@ -696,11 +696,11 @@ bool LoopUnswitch::processCurrentLoop() {
}
for (IntrinsicInst *Guard : Guards) {
- Value *LoopCond = FindLIVLoopCondition(Guard->getOperand(0), currentLoop,
+ Value *LoopCond = findLIVLoopCondition(Guard->getOperand(0), CurrentLoop,
Changed, MSSAU.get())
.first;
if (LoopCond &&
- UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
// NB! Unswitching (if successful) could have erased some of the
// instructions in Guards leaving dangling pointers there. This is fine
// because we're returning now, and won't look at Guards again.
@@ -712,8 +712,9 @@ bool LoopUnswitch::processCurrentLoop() {
// Loop over all of the basic blocks in the loop. If we find an interior
// block that is branching on a loop-invariant condition, we can unswitch this
// loop.
- for (Loop::block_iterator I = currentLoop->block_begin(),
- E = currentLoop->block_end(); I != E; ++I) {
+ for (Loop::block_iterator I = CurrentLoop->block_begin(),
+ E = CurrentLoop->block_end();
+ I != E; ++I) {
Instruction *TI = (*I)->getTerminator();
// Unswitching on a potentially uninitialized predicate is not
@@ -723,7 +724,7 @@ bool LoopUnswitch::processCurrentLoop() {
// This is a workaround for the discrepancy between LLVM IR and MSan
// semantics. See PR28054 for more details.
if (SanitizeMemory &&
- !SafetyInfo.isGuaranteedToExecute(*TI, DT, currentLoop))
+ !SafetyInfo.isGuaranteedToExecute(*TI, DT, CurrentLoop))
continue;
if (BranchInst *BI = dyn_cast<BranchInst>(TI)) {
@@ -738,11 +739,11 @@ bool LoopUnswitch::processCurrentLoop() {
if (BI->isConditional()) {
// See if this, or some part of it, is loop invariant. If so, we can
// unswitch on it if we desire.
- Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop,
+ Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
Changed, MSSAU.get())
.first;
- if (LoopCond && !EqualityPropUnSafe(*LoopCond) &&
- UnswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
+ if (LoopCond && !equalityPropUnSafe(*LoopCond) &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context), TI)) {
++NumBranches;
return true;
}
@@ -752,7 +753,7 @@ bool LoopUnswitch::processCurrentLoop() {
Value *LoopCond;
OperatorChain OpChain;
std::tie(LoopCond, OpChain) =
- FindLIVLoopCondition(SC, currentLoop, Changed, MSSAU.get());
+ findLIVLoopCondition(SC, CurrentLoop, Changed, MSSAU.get());
unsigned NumCases = SI->getNumCases();
if (LoopCond && NumCases) {
@@ -796,7 +797,7 @@ bool LoopUnswitch::processCurrentLoop() {
if (!UnswitchVal)
continue;
- if (UnswitchIfProfitable(LoopCond, UnswitchVal)) {
+ if (unswitchIfProfitable(LoopCond, UnswitchVal)) {
++NumSwitches;
// In case of a full LIV, UnswitchVal is the value we unswitched out.
// In case of a partial LIV, we only unswitch when its an AND-chain
@@ -812,11 +813,11 @@ bool LoopUnswitch::processCurrentLoop() {
for (BasicBlock::iterator BBI = (*I)->begin(), E = (*I)->end();
BBI != E; ++BBI)
if (SelectInst *SI = dyn_cast<SelectInst>(BBI)) {
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop,
+ Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
Changed, MSSAU.get())
.first;
- if (LoopCond && UnswitchIfProfitable(LoopCond,
- ConstantInt::getTrue(Context))) {
+ if (LoopCond &&
+ unswitchIfProfitable(LoopCond, ConstantInt::getTrue(Context))) {
++NumSelects;
return true;
}
@@ -875,62 +876,38 @@ static BasicBlock *isTrivialLoopExitBlock(Loop *L, BasicBlock *BB) {
return nullptr;
}
-/// We have found that we can unswitch currentLoop when LoopCond == Val to
+/// We have found that we can unswitch CurrentLoop when LoopCond == Val to
/// simplify the loop. If we decide that this is profitable,
/// unswitch the loop, reprocess the pieces, then return true.
-bool LoopUnswitch::UnswitchIfProfitable(Value *LoopCond, Constant *Val,
+bool LoopUnswitch::unswitchIfProfitable(Value *LoopCond, Constant *Val,
Instruction *TI) {
// Check to see if it would be profitable to unswitch current loop.
- if (!BranchesInfo.CostAllowsUnswitching()) {
+ if (!BranchesInfo.costAllowsUnswitching()) {
LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
+ << CurrentLoop->getHeader()->getName()
<< " at non-trivial condition '" << *Val
<< "' == " << *LoopCond << "\n"
<< ". Cost too high.\n");
return false;
}
- if (hasBranchDivergence &&
+ if (HasBranchDivergence &&
getAnalysis<LegacyDivergenceAnalysis>().isDivergent(LoopCond)) {
LLVM_DEBUG(dbgs() << "NOT unswitching loop %"
- << currentLoop->getHeader()->getName()
+ << CurrentLoop->getHeader()->getName()
<< " at non-trivial condition '" << *Val
<< "' == " << *LoopCond << "\n"
<< ". Condition is divergent.\n");
return false;
}
- UnswitchNontrivialCondition(LoopCond, Val, currentLoop, TI);
+ unswitchNontrivialCondition(LoopCond, Val, CurrentLoop, TI);
return true;
}
-/// Recursively clone the specified loop and all of its children,
-/// mapping the blocks with the specified map.
-static Loop *CloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
- LoopInfo *LI, LPPassManager *LPM) {
- Loop &New = *LI->AllocateLoop();
- if (PL)
- PL->addChildLoop(&New);
- else
- LI->addTopLevelLoop(&New);
- LPM->addLoop(New);
-
- // Add all of the blocks in L to the new loop.
- for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
- I != E; ++I)
- if (LI->getLoopFor(*I) == L)
- New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
-
- // Add all of the subloops to the new loop.
- for (Loop *I : *L)
- CloneLoop(I, &New, VM, LI, LPM);
-
- return &New;
-}
-
/// Emit a conditional branch on two values if LIC == Val, branch to TrueDst,
/// otherwise branch to FalseDest. Insert the code immediately before OldBranch
/// and remove (but not erase!) it from the function.
-void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
+void LoopUnswitch::emitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
BasicBlock *TrueDest,
BasicBlock *FalseDest,
BranchInst *OldBranch,
@@ -997,11 +974,11 @@ void LoopUnswitch::EmitPreheaderBranchOnCondition(Value *LIC, Constant *Val,
/// that doesn't execute its body has no side-effects), unswitch it. This
/// doesn't involve any code duplication, just moving the conditional branch
/// outside of the loop and updating loop info.
-void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
+void LoopUnswitch::unswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
BasicBlock *ExitBlock,
Instruction *TI) {
LLVM_DEBUG(dbgs() << "loop-unswitch: Trivial-Unswitch loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
+ << LoopHeader->getName() << " [" << L->getBlocks().size()
<< " blocks] in Function "
<< L->getHeader()->getParent()->getName()
<< " on cond: " << *Val << " == " << *Cond << "\n");
@@ -1011,9 +988,9 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
SEWP->getSE().forgetTopmostLoop(L);
// First step, split the preheader, so that we know that there is a safe place
- // to insert the conditional branch. We will change loopPreheader to have a
+ // to insert the conditional branch. We will change LoopPreheader to have a
// conditional branch on Cond.
- BasicBlock *NewPH = SplitEdge(loopPreheader, loopHeader, DT, LI, MSSAU.get());
+ BasicBlock *NewPH = SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
// Now that we have a place to insert the conditional branch, create a place
// to branch to: this is the exit block out of the loop that we should
@@ -1029,22 +1006,21 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
// Okay, now we have a position to branch from and a position to branch to,
// insert the new conditional branch.
- auto *OldBranch = dyn_cast<BranchInst>(loopPreheader->getTerminator());
+ auto *OldBranch = dyn_cast<BranchInst>(LoopPreheader->getTerminator());
assert(OldBranch && "Failed to split the preheader");
- EmitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
- LPM->deleteSimpleAnalysisValue(OldBranch, L);
+ emitPreheaderBranchOnCondition(Cond, Val, NewExit, NewPH, OldBranch, TI);
- // EmitPreheaderBranchOnCondition removed the OldBranch from the function.
+ // emitPreheaderBranchOnCondition removed the OldBranch from the function.
// Delete it, as it is no longer needed.
delete OldBranch;
// We need to reprocess this loop, it could be unswitched again.
- redoLoop = true;
+ RedoLoop = true;
// Now that we know that the loop is never entered when this condition is a
// particular value, rewrite the loop with this info. We know that this will
// at least eliminate the old branch.
- RewriteLoopBodyWithConditionConstant(L, Cond, Val, false);
+ rewriteLoopBodyWithConditionConstant(L, Cond, Val, /*IsEqual=*/false);
++NumTrivial;
}
@@ -1055,8 +1031,8 @@ void LoopUnswitch::UnswitchTrivialCondition(Loop *L, Value *Cond, Constant *Val,
/// produces no code duplications (equivalently, it produces a simpler loop and
/// a new empty loop, which gets deleted). Therefore always unswitch trivial
/// condition.
-bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
- BasicBlock *CurrentBB = currentLoop->getHeader();
+bool LoopUnswitch::tryTrivialLoopUnswitch(bool &Changed) {
+ BasicBlock *CurrentBB = CurrentLoop->getHeader();
Instruction *CurrentTerm = CurrentBB->getTerminator();
LLVMContext &Context = CurrentBB->getContext();
@@ -1081,7 +1057,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// we can not reach any trivial condition candidates (unfoldable
// branch instructions or switch instructions) and no unswitch
// can happen. Exit and return false.
- if (!currentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
+ if (!CurrentLoop->contains(CurrentBB) || !Visited.insert(CurrentBB).second)
return false;
// Check if this loop will execute any side-effecting instructions (e.g.
@@ -1128,7 +1104,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (!BI->isConditional())
return false;
- Value *LoopCond = FindLIVLoopCondition(BI->getCondition(), currentLoop,
+ Value *LoopCond = findLIVLoopCondition(BI->getCondition(), CurrentLoop,
Changed, MSSAU.get())
.first;
@@ -1141,11 +1117,11 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
// exit through a unique exit block without having any
// side-effects. If so, determine the value of Cond that causes
// it to do this.
- if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
- BI->getSuccessor(0)))) {
+ if ((LoopExitBB =
+ isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(0)))) {
CondVal = ConstantInt::getTrue(Context);
- } else if ((LoopExitBB = isTrivialLoopExitBlock(currentLoop,
- BI->getSuccessor(1)))) {
+ } else if ((LoopExitBB =
+ isTrivialLoopExitBlock(CurrentLoop, BI->getSuccessor(1)))) {
CondVal = ConstantInt::getFalse(Context);
}
@@ -1154,16 +1130,16 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
return false; // Can't handle this.
- if (EqualityPropUnSafe(*LoopCond))
+ if (equalityPropUnSafe(*LoopCond))
return false;
- UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
CurrentTerm);
++NumBranches;
return true;
} else if (SwitchInst *SI = dyn_cast<SwitchInst>(CurrentTerm)) {
// If this isn't switching on an invariant condition, we can't unswitch it.
- Value *LoopCond = FindLIVLoopCondition(SI->getCondition(), currentLoop,
+ Value *LoopCond = findLIVLoopCondition(SI->getCondition(), CurrentLoop,
Changed, MSSAU.get())
.first;
@@ -1181,7 +1157,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
for (auto Case : SI->cases()) {
BasicBlock *LoopExitCandidate;
if ((LoopExitCandidate =
- isTrivialLoopExitBlock(currentLoop, Case.getCaseSuccessor()))) {
+ isTrivialLoopExitBlock(CurrentLoop, Case.getCaseSuccessor()))) {
// Okay, we found a trivial case, remember the value that is trivial.
ConstantInt *CaseVal = Case.getCaseValue();
@@ -1200,7 +1176,7 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
if (!LoopExitBB || isa<PHINode>(LoopExitBB->begin()))
return false; // Can't handle this.
- UnswitchTrivialCondition(currentLoop, LoopCond, CondVal, LoopExitBB,
+ unswitchTrivialCondition(CurrentLoop, LoopCond, CondVal, LoopExitBB,
nullptr);
// We are only unswitching full LIV.
@@ -1213,11 +1189,11 @@ bool LoopUnswitch::TryTrivialLoopUnswitch(bool &Changed) {
/// Split all of the edges from inside the loop to their exit blocks.
/// Update the appropriate Phi nodes as we do so.
-void LoopUnswitch::SplitExitEdges(Loop *L,
- const SmallVectorImpl<BasicBlock *> &ExitBlocks){
+void LoopUnswitch::splitExitEdges(
+ Loop *L, const SmallVectorImpl<BasicBlock *> &ExitBlocks) {
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
- BasicBlock *ExitBlock = ExitBlocks[i];
+ for (unsigned I = 0, E = ExitBlocks.size(); I != E; ++I) {
+ BasicBlock *ExitBlock = ExitBlocks[I];
SmallVector<BasicBlock *, 4> Preds(pred_begin(ExitBlock),
pred_end(ExitBlock));
@@ -1231,11 +1207,11 @@ void LoopUnswitch::SplitExitEdges(Loop *L,
/// We determined that the loop is profitable to unswitch when LIC equal Val.
/// Split it into loop versions and test the condition outside of either loop.
/// Return the loops created as Out1/Out2.
-void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
+void LoopUnswitch::unswitchNontrivialCondition(Value *LIC, Constant *Val,
Loop *L, Instruction *TI) {
- Function *F = loopHeader->getParent();
+ Function *F = LoopHeader->getParent();
LLVM_DEBUG(dbgs() << "loop-unswitch: Unswitching loop %"
- << loopHeader->getName() << " [" << L->getBlocks().size()
+ << LoopHeader->getName() << " [" << L->getBlocks().size()
<< " blocks] in Function " << F->getName() << " when '"
<< *Val << "' == " << *LIC << "\n");
@@ -1253,7 +1229,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// First step, split the preheader and exit blocks, and add these blocks to
// the LoopBlocks list.
BasicBlock *NewPreheader =
- SplitEdge(loopPreheader, loopHeader, DT, LI, MSSAU.get());
+ SplitEdge(LoopPreheader, LoopHeader, DT, LI, MSSAU.get());
LoopBlocks.push_back(NewPreheader);
// We want the loop to come after the preheader, but before the exit blocks.
@@ -1264,7 +1240,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// Split all of the edges from inside the loop to their exit blocks. Update
// the appropriate Phi nodes as we do so.
- SplitExitEdges(L, ExitBlocks);
+ splitExitEdges(L, ExitBlocks);
// The exit blocks may have been changed due to edge splitting, recompute.
ExitBlocks.clear();
@@ -1278,12 +1254,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// the instructions and blocks.
NewBlocks.reserve(LoopBlocks.size());
ValueToValueMapTy VMap;
- for (unsigned i = 0, e = LoopBlocks.size(); i != e; ++i) {
- BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[i], VMap, ".us", F);
+ for (unsigned I = 0, E = LoopBlocks.size(); I != E; ++I) {
+ BasicBlock *NewBB = CloneBasicBlock(LoopBlocks[I], VMap, ".us", F);
NewBlocks.push_back(NewBB);
- VMap[LoopBlocks[i]] = NewBB; // Keep the BB mapping.
- LPM->cloneBasicBlockSimpleAnalysis(LoopBlocks[i], NewBB, L);
+ VMap[LoopBlocks[I]] = NewBB; // Keep the BB mapping.
}
// Splice the newly inserted blocks into the function right before the
@@ -1293,7 +1268,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
NewBlocks[0]->getIterator(), F->end());
// Now we create the new Loop object for the versioned loop.
- Loop *NewLoop = CloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
+ Loop *NewLoop = cloneLoop(L, L->getParentLoop(), VMap, LI, LPM);
// Recalculate unswitching quota, inherit simplified switches info for NewBB,
// Probably clone more loop-unswitch related loop properties.
@@ -1306,10 +1281,10 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
ParentLoop->addBasicBlockToLoop(NewBlocks[0], *LI);
}
- for (unsigned i = 0, e = ExitBlocks.size(); i != e; ++i) {
- BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[i]]);
+ for (unsigned EBI = 0, EBE = ExitBlocks.size(); EBI != EBE; ++EBI) {
+ BasicBlock *NewExit = cast<BasicBlock>(VMap[ExitBlocks[EBI]]);
// The new exit block should be in the same loop as the old one.
- if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[i]))
+ if (Loop *ExitBBLoop = LI->getLoopFor(ExitBlocks[EBI]))
ExitBBLoop->addBasicBlockToLoop(NewExit, *LI);
assert(NewExit->getTerminator()->getNumSuccessors() == 1 &&
@@ -1319,7 +1294,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// If the successor of the exit block had PHI nodes, add an entry for
// NewExit.
for (PHINode &PN : ExitSucc->phis()) {
- Value *V = PN.getIncomingValueForBlock(ExitBlocks[i]);
+ Value *V = PN.getIncomingValueForBlock(ExitBlocks[EBI]);
ValueToValueMapTy::iterator It = VMap.find(V);
if (It != VMap.end()) V = It->second;
PN.addIncoming(V, NewExit);
@@ -1340,8 +1315,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
}
// Rewrite the code to refer to itself.
- for (unsigned i = 0, e = NewBlocks.size(); i != e; ++i) {
- for (Instruction &I : *NewBlocks[i]) {
+ for (unsigned NBI = 0, NBE = NewBlocks.size(); NBI != NBE; ++NBI) {
+ for (Instruction &I : *NewBlocks[NBI]) {
RemapInstruction(&I, VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
if (auto *II = dyn_cast<IntrinsicInst>(&I))
@@ -1351,7 +1326,7 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
}
// Rewrite the original preheader to select between versions of the loop.
- BranchInst *OldBR = cast<BranchInst>(loopPreheader->getTerminator());
+ BranchInst *OldBR = cast<BranchInst>(LoopPreheader->getTerminator());
assert(OldBR->isUnconditional() && OldBR->getSuccessor(0) == LoopBlocks[0] &&
"Preheader splitting did not work correctly!");
@@ -1364,9 +1339,8 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
}
// Emit the new branch that selects between the two versions of this loop.
- EmitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
+ emitPreheaderBranchOnCondition(LIC, Val, NewBlocks[0], LoopBlocks[0], OldBR,
TI);
- LPM->deleteSimpleAnalysisValue(OldBR, L);
if (MSSAU) {
// Update MemoryPhis in Exit blocks.
MSSAU->updateExitBlocksForClonedLoop(ExitBlocks, VMap, *DT);
@@ -1375,11 +1349,11 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
}
// The OldBr was replaced by a new one and removed (but not erased) by
- // EmitPreheaderBranchOnCondition. It is no longer needed, so delete it.
+ // emitPreheaderBranchOnCondition. It is no longer needed, so delete it.
delete OldBR;
LoopProcessWorklist.push_back(NewLoop);
- redoLoop = true;
+ RedoLoop = true;
// Keep a WeakTrackingVH holding onto LIC. If the first call to
// RewriteLoopBody
@@ -1390,22 +1364,23 @@ void LoopUnswitch::UnswitchNontrivialCondition(Value *LIC, Constant *Val,
// Now we rewrite the original code to know that the condition is true and the
// new code to know that the condition is false.
- RewriteLoopBodyWithConditionConstant(L, LIC, Val, false);
+ rewriteLoopBodyWithConditionConstant(L, LIC, Val, /*IsEqual=*/false);
// It's possible that simplifying one loop could cause the other to be
// changed to another value or a constant. If its a constant, don't simplify
// it.
if (!LoopProcessWorklist.empty() && LoopProcessWorklist.back() == NewLoop &&
LICHandle && !isa<Constant>(LICHandle))
- RewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val, true);
+ rewriteLoopBodyWithConditionConstant(NewLoop, LICHandle, Val,
+ /*IsEqual=*/true);
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
}
/// Remove all instances of I from the worklist vector specified.
-static void RemoveFromWorklist(Instruction *I,
- std::vector<Instruction*> &Worklist) {
+static void removeFromWorklist(Instruction *I,
+ std::vector<Instruction *> &Worklist) {
Worklist.erase(std::remove(Worklist.begin(), Worklist.end(), I),
Worklist.end());
@@ -1413,7 +1388,7 @@ static void RemoveFromWorklist(Instruction *I,
/// When we find that I really equals V, remove I from the
/// program, replacing all uses with V and update the worklist.
-static void ReplaceUsesOfWith(Instruction *I, Value *V,
+static void replaceUsesOfWith(Instruction *I, Value *V,
std::vector<Instruction *> &Worklist, Loop *L,
LPPassManager *LPM, MemorySSAUpdater *MSSAU) {
LLVM_DEBUG(dbgs() << "Replace with '" << *V << "': " << *I << "\n");
@@ -1426,8 +1401,7 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
// Add users to the worklist which may be simplified now.
for (User *U : I->users())
Worklist.push_back(cast<Instruction>(U));
- LPM->deleteSimpleAnalysisValue(I, L);
- RemoveFromWorklist(I, Worklist);
+ removeFromWorklist(I, Worklist);
I->replaceAllUsesWith(V);
if (!I->mayHaveSideEffects()) {
if (MSSAU)
@@ -1440,7 +1414,7 @@ static void ReplaceUsesOfWith(Instruction *I, Value *V,
/// We know either that the value LIC has the value specified by Val in the
/// specified loop, or we know it does NOT have that value.
/// Rewrite any uses of LIC or of properties correlated to it.
-void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
+void LoopUnswitch::rewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
Constant *Val,
bool IsEqual) {
assert(!isa<Constant>(LIC) && "Why are we unswitching on a constant?");
@@ -1478,7 +1452,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
for (Instruction *UI : Worklist)
UI->replaceUsesOfWith(LIC, Replacement);
- SimplifyCode(Worklist, L);
+ simplifyCode(Worklist, L);
return;
}
@@ -1492,7 +1466,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
// At this point, we know LIC is definitely not Val. Try to use some simple
// logic to simplify the user w.r.t. to the context.
- if (Value *Replacement = SimplifyInstructionWithNotEqual(UI, LIC, Val)) {
+ if (Value *Replacement = simplifyInstructionWithNotEqual(UI, LIC, Val)) {
if (LI->replacementPreservesLCSSAForm(UI, Replacement)) {
// This in-loop instruction has been simplified w.r.t. its context,
// i.e. LIC != Val, make sure we propagate its replacement value to
@@ -1506,7 +1480,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
}
}
- // This is a LIC user, push it into the worklist so that SimplifyCode can
+ // This is a LIC user, push it into the worklist so that simplifyCode can
// attempt to simplify it.
Worklist.push_back(UI);
@@ -1568,7 +1542,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
DT->addNewBlock(Abort, NewSISucc);
}
- SimplifyCode(Worklist, L);
+ simplifyCode(Worklist, L);
}
/// Now that we have simplified some instructions in the loop, walk over it and
@@ -1579,7 +1553,7 @@ void LoopUnswitch::RewriteLoopBodyWithConditionConstant(Loop *L, Value *LIC,
/// FIXME: When the loop optimizer is more mature, separate this out to a new
/// pass.
///
-void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
+void LoopUnswitch::simplifyCode(std::vector<Instruction *> &Worklist, Loop *L) {
const DataLayout &DL = L->getHeader()->getModule()->getDataLayout();
while (!Worklist.empty()) {
Instruction *I = Worklist.back();
@@ -1593,8 +1567,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
for (unsigned i = 0, e = I->getNumOperands(); i != e; ++i)
if (Instruction *Use = dyn_cast<Instruction>(I->getOperand(i)))
Worklist.push_back(Use);
- LPM->deleteSimpleAnalysisValue(I, L);
- RemoveFromWorklist(I, Worklist);
+ removeFromWorklist(I, Worklist);
if (MSSAU)
MSSAU->removeMemoryAccess(I);
I->eraseFromParent();
@@ -1607,7 +1580,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
// 'false'. TODO: update the domtree properly so we can pass it here.
if (Value *V = SimplifyInstruction(I, DL))
if (LI->replacementPreservesLCSSAForm(I, V)) {
- ReplaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
+ replaceUsesOfWith(I, V, Worklist, L, LPM, MSSAU.get());
continue;
}
@@ -1624,9 +1597,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
assert(SinglePred == Pred && "CFG broken");
// Make the LPM and Worklist updates specific to LoopUnswitch.
- LPM->deleteSimpleAnalysisValue(BI, L);
- RemoveFromWorklist(BI, Worklist);
- LPM->deleteSimpleAnalysisValue(Succ, L);
+ removeFromWorklist(BI, Worklist);
auto SuccIt = Succ->begin();
while (PHINode *PN = dyn_cast<PHINode>(SuccIt++)) {
for (unsigned It = 0, E = PN->getNumOperands(); It != E; ++It)
@@ -1634,8 +1605,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
Worklist.push_back(Use);
for (User *U : PN->users())
Worklist.push_back(cast<Instruction>(U));
- LPM->deleteSimpleAnalysisValue(PN, L);
- RemoveFromWorklist(PN, Worklist);
+ removeFromWorklist(PN, Worklist);
++NumSimplify;
}
// Merge the block and make the remaining analyses updates.
@@ -1652,7 +1622,7 @@ void LoopUnswitch::SimplifyCode(std::vector<Instruction*> &Worklist, Loop *L) {
/// Simple simplifications we can do given the information that Cond is
/// definitely not equal to Val.
-Value *LoopUnswitch::SimplifyInstructionWithNotEqual(Instruction *Inst,
+Value *LoopUnswitch::simplifyInstructionWithNotEqual(Instruction *Inst,
Value *Invariant,
Constant *Val) {
// icmp eq cond, val -> false
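
The LoopUnswitch hunks above replace the deprecated CallSite wrapper with a direct dyn_cast to CallBase when scanning the loop for convergent calls. Below is a minimal standalone sketch of that pattern, not part of the patch; the helper name hasUnswitchBlockingCalls is invented for illustration and only standard LLVM headers are assumed.

    // Sketch only: mirrors the CallBase-based scan in the hunks above; the
    // helper name is invented for illustration.
    #include "llvm/Analysis/LoopInfo.h"
    #include "llvm/IR/InstrTypes.h"
    #include "llvm/IR/Instructions.h"

    using namespace llvm;

    // Returns true if the loop contains a convergent call or an invoke whose
    // unwind destination cannot have its predecessors split; these are the
    // same two bail-outs checked in processCurrentLoop() above.
    static bool hasUnswitchBlockingCalls(const Loop &L) {
      for (const BasicBlock *BB : L.blocks()) {
        for (const Instruction &I : *BB) {
          const auto *CB = dyn_cast<CallBase>(&I);
          if (!CB)
            continue;
          if (CB->isConvergent())
            return true;
          if (const auto *II = dyn_cast<InvokeInst>(&I))
            if (!II->getUnwindDest()->canSplitPredecessors())
              return true;
        }
      }
      return false;
    }

CallBase is the common base class of call, invoke, and callbr instructions, so the single dyn_cast replaces the old CallSite construction.
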
diff --git a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
index 7b9af527d444..06b684ef1e70 100644
--- a/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopVersioningLICM.cpp
@@ -69,7 +69,6 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instruction.h"
diff --git a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp b/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
index ab7b85e89e7b..d1f67b355b19 100644
--- a/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerAtomic.cpp
@@ -117,18 +117,17 @@ static bool LowerStoreInst(StoreInst *SI) {
static bool runOnBasicBlock(BasicBlock &BB) {
bool Changed = false;
- for (BasicBlock::iterator DI = BB.begin(), DE = BB.end(); DI != DE;) {
- Instruction *Inst = &*DI++;
- if (FenceInst *FI = dyn_cast<FenceInst>(Inst))
+ for (Instruction &Inst : make_early_inc_range(BB)) {
+ if (FenceInst *FI = dyn_cast<FenceInst>(&Inst))
Changed |= LowerFenceInst(FI);
- else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(Inst))
+ else if (AtomicCmpXchgInst *CXI = dyn_cast<AtomicCmpXchgInst>(&Inst))
Changed |= LowerAtomicCmpXchgInst(CXI);
- else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(Inst))
+ else if (AtomicRMWInst *RMWI = dyn_cast<AtomicRMWInst>(&Inst))
Changed |= LowerAtomicRMWInst(RMWI);
- else if (LoadInst *LI = dyn_cast<LoadInst>(Inst)) {
+ else if (LoadInst *LI = dyn_cast<LoadInst>(&Inst)) {
if (LI->isAtomic())
LowerLoadInst(LI);
- } else if (StoreInst *SI = dyn_cast<StoreInst>(Inst)) {
+ } else if (StoreInst *SI = dyn_cast<StoreInst>(&Inst)) {
if (SI->isAtomic())
LowerStoreInst(SI);
}
diff --git a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
index 21c6c32e8e02..fddf28c281fc 100644
--- a/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerConstantIntrinsics.cpp
@@ -13,7 +13,9 @@
#include "llvm/Transforms/Scalar/LowerConstantIntrinsics.h"
#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SetVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/MemoryBuiltins.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
@@ -135,8 +137,12 @@ static bool lowerConstantIntrinsics(Function &F, const TargetLibraryInfo *TLI) {
PreservedAnalyses
LowerConstantIntrinsicsPass::run(Function &F, FunctionAnalysisManager &AM) {
- if (lowerConstantIntrinsics(F, AM.getCachedResult<TargetLibraryAnalysis>(F)))
- return PreservedAnalyses::none();
+ if (lowerConstantIntrinsics(F,
+ AM.getCachedResult<TargetLibraryAnalysis>(F))) {
+ PreservedAnalyses PA;
+ PA.preserve<GlobalsAA>();
+ return PA;
+ }
return PreservedAnalyses::all();
}
@@ -145,7 +151,7 @@ namespace {
/// Legacy pass for lowering is.constant intrinsics out of the IR.
///
/// When this pass is run over a function it converts is.constant intrinsics
-/// into 'true' or 'false'. This is completements the normal constand folding
+/// into 'true' or 'false'. This complements the normal constant folding
/// to 'true' as part of Instruction Simplify passes.
class LowerConstantIntrinsics : public FunctionPass {
public:
@@ -159,6 +165,10 @@ public:
const TargetLibraryInfo *TLI = TLIP ? &TLIP->getTLI(F) : nullptr;
return lowerConstantIntrinsics(F, TLI);
}
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ }
};
} // namespace
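
The new-pass-manager change above stops returning PreservedAnalyses::none() when the lowering modifies the function and instead reports GlobalsAA as preserved, mirrored by addPreserved<GlobalsAAWrapperPass>() for the legacy pass. A minimal sketch of that pattern on a hypothetical pass; ExamplePass and the placeholder transform are illustrative only.

    // Sketch only: the "preserve GlobalsAA when the IR changed" pattern from
    // the hunk above, shown on a hypothetical pass; ExamplePass is not real.
    #include "llvm/Analysis/GlobalsModRef.h"
    #include "llvm/IR/Function.h"
    #include "llvm/IR/PassManager.h"

    using namespace llvm;

    namespace {
    struct ExamplePass : PassInfoMixin<ExamplePass> {
      PreservedAnalyses run(Function &F, FunctionAnalysisManager &AM) {
        bool Changed = false; // ... run the actual transformation here ...
        if (!Changed)
          return PreservedAnalyses::all();
        // Purely local rewrites do not invalidate GlobalsAA, so report it as
        // preserved instead of dropping every cached analysis.
        PreservedAnalyses PA;
        PA.preserve<GlobalsAA>();
        return PA;
      }
    };
    } // namespace
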
diff --git a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
index 53671c7bc3d1..0fe7dd9cfb39 100644
--- a/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerExpectIntrinsic.cpp
@@ -55,13 +55,35 @@ static cl::opt<uint32_t> UnlikelyBranchWeight(
"unlikely-branch-weight", cl::Hidden, cl::init(1),
cl::desc("Weight of the branch unlikely to be taken (default = 1)"));
+static std::tuple<uint32_t, uint32_t>
+getBranchWeight(Intrinsic::ID IntrinsicID, CallInst *CI, int BranchCount) {
+ if (IntrinsicID == Intrinsic::expect) {
+ // __builtin_expect
+ return std::make_tuple(LikelyBranchWeight.getValue(),
+ UnlikelyBranchWeight.getValue());
+ } else {
+ // __builtin_expect_with_probability
+ assert(CI->getNumOperands() >= 3 &&
+ "expect with probability must have 3 arguments");
+ ConstantFP *Confidence = dyn_cast<ConstantFP>(CI->getArgOperand(2));
+ double TrueProb = Confidence->getValueAPF().convertToDouble();
+ assert((TrueProb >= 0.0 && TrueProb <= 1.0) &&
+ "probability value must be in the range [0.0, 1.0]");
+ double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
+ uint32_t LikelyBW = ceil((TrueProb * (double)(INT32_MAX - 1)) + 1.0);
+ uint32_t UnlikelyBW = ceil((FalseProb * (double)(INT32_MAX - 1)) + 1.0);
+ return std::make_tuple(LikelyBW, UnlikelyBW);
+ }
+}
+
static bool handleSwitchExpect(SwitchInst &SI) {
CallInst *CI = dyn_cast<CallInst>(SI.getCondition());
if (!CI)
return false;
Function *Fn = CI->getCalledFunction();
- if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
+ Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
return false;
Value *ArgValue = CI->getArgOperand(0);
@@ -71,15 +93,19 @@ static bool handleSwitchExpect(SwitchInst &SI) {
SwitchInst::CaseHandle Case = *SI.findCaseValue(ExpectedValue);
unsigned n = SI.getNumCases(); // +1 for default case.
- SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeight);
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
+ getBranchWeight(Fn->getIntrinsicID(), CI, n + 1);
+
+ SmallVector<uint32_t, 16> Weights(n + 1, UnlikelyBranchWeightVal);
uint64_t Index = (Case == *SI.case_default()) ? 0 : Case.getCaseIndex() + 1;
- Weights[Index] = LikelyBranchWeight;
+ Weights[Index] = LikelyBranchWeightVal;
- SI.setMetadata(
- LLVMContext::MD_misexpect,
- MDBuilder(CI->getContext())
- .createMisExpect(Index, LikelyBranchWeight, UnlikelyBranchWeight));
+ SI.setMetadata(LLVMContext::MD_misexpect,
+ MDBuilder(CI->getContext())
+ .createMisExpect(Index, LikelyBranchWeightVal,
+ UnlikelyBranchWeightVal));
SI.setCondition(ArgValue);
misexpect::checkFrontendInstrumentation(SI);
@@ -223,15 +249,18 @@ static void handlePhiDef(CallInst *Expect) {
return true;
return false;
};
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) = getBranchWeight(
+ Expect->getCalledFunction()->getIntrinsicID(), Expect, 2);
if (IsOpndComingFromSuccessor(BI->getSuccessor(1)))
- BI->setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight));
+ BI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(LikelyBranchWeightVal,
+ UnlikelyBranchWeightVal));
else if (IsOpndComingFromSuccessor(BI->getSuccessor(0)))
- BI->setMetadata(
- LLVMContext::MD_prof,
- MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight));
+ BI->setMetadata(LLVMContext::MD_prof,
+ MDB.createBranchWeights(UnlikelyBranchWeightVal,
+ LikelyBranchWeightVal));
}
}
@@ -277,7 +306,8 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
}
Function *Fn = CI->getCalledFunction();
- if (!Fn || Fn->getIntrinsicID() != Intrinsic::expect)
+ if (!Fn || (Fn->getIntrinsicID() != Intrinsic::expect &&
+ Fn->getIntrinsicID() != Intrinsic::expect_with_probability))
return false;
Value *ArgValue = CI->getArgOperand(0);
@@ -289,13 +319,21 @@ template <class BrSelInst> static bool handleBrSelExpect(BrSelInst &BSI) {
MDNode *Node;
MDNode *ExpNode;
+ uint32_t LikelyBranchWeightVal, UnlikelyBranchWeightVal;
+ std::tie(LikelyBranchWeightVal, UnlikelyBranchWeightVal) =
+ getBranchWeight(Fn->getIntrinsicID(), CI, 2);
+
if ((ExpectedValue->getZExtValue() == ValueComparedTo) ==
(Predicate == CmpInst::ICMP_EQ)) {
- Node = MDB.createBranchWeights(LikelyBranchWeight, UnlikelyBranchWeight);
- ExpNode = MDB.createMisExpect(0, LikelyBranchWeight, UnlikelyBranchWeight);
+ Node =
+ MDB.createBranchWeights(LikelyBranchWeightVal, UnlikelyBranchWeightVal);
+ ExpNode =
+ MDB.createMisExpect(0, LikelyBranchWeightVal, UnlikelyBranchWeightVal);
} else {
- Node = MDB.createBranchWeights(UnlikelyBranchWeight, LikelyBranchWeight);
- ExpNode = MDB.createMisExpect(1, LikelyBranchWeight, UnlikelyBranchWeight);
+ Node =
+ MDB.createBranchWeights(UnlikelyBranchWeightVal, LikelyBranchWeightVal);
+ ExpNode =
+ MDB.createMisExpect(1, LikelyBranchWeightVal, UnlikelyBranchWeightVal);
}
BSI.setMetadata(LLVMContext::MD_misexpect, ExpNode);
@@ -347,7 +385,8 @@ static bool lowerExpectIntrinsic(Function &F) {
}
Function *Fn = CI->getCalledFunction();
- if (Fn && Fn->getIntrinsicID() == Intrinsic::expect) {
+ if (Fn && (Fn->getIntrinsicID() == Intrinsic::expect ||
+ Fn->getIntrinsicID() == Intrinsic::expect_with_probability)) {
// Before erasing the llvm.expect, walk backward to find
// phi that define llvm.expect's first arg, and
// infer branch probability:
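
getBranchWeight above maps __builtin_expect_with_probability onto the same branch-weight machinery as plain __builtin_expect: the stated probability is kept for the expected successor, the remainder is split evenly across the other BranchCount - 1 successors, and both values are scaled into 32-bit weights. A standalone sketch of just that arithmetic, with a hypothetical helper name; it is an illustration, not the patch's code.

    // Sketch only: the weight arithmetic added above for
    // __builtin_expect_with_probability, extracted into a hypothetical
    // standalone helper.
    #include <cassert>
    #include <cmath>
    #include <cstdint>
    #include <utility>

    // The expected successor keeps probability TrueProb; the remaining mass
    // is split evenly over the other BranchCount - 1 successors, and both
    // probabilities are scaled into 32-bit weights, mirroring getBranchWeight.
    static std::pair<uint32_t, uint32_t>
    weightsFromProbability(double TrueProb, int BranchCount) {
      assert(TrueProb >= 0.0 && TrueProb <= 1.0 && BranchCount >= 2);
      double FalseProb = (1.0 - TrueProb) / (BranchCount - 1);
      auto Scale = [](double P) {
        return static_cast<uint32_t>(
            std::ceil(P * (double)(INT32_MAX - 1) + 1.0));
      };
      return {Scale(TrueProb), Scale(FalseProb)};
    }
    // Example: weightsFromProbability(0.9, 2) yields roughly a 9:1 ratio.
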
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index 0ff6ee8bcfcc..90314b17b5e2 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -9,8 +9,11 @@
// Lower matrix intrinsics to vector operations.
//
// TODO:
-// * Implement multiply & add fusion
-// * Add remark, summarizing the available matrix optimization opportunities.
+// * Improve fusion:
+// * Support more cases, e.g. multiply-add, multiply-sub, operands/results
+// transposed.
+// * Improve cost-modeling, e.g. choose different number of rows/columns
+// for tiles, consider cost of copies on alias.
//
//===----------------------------------------------------------------------===//
@@ -18,10 +21,15 @@
#include "llvm/ADT/GraphTraits.h"
#include "llvm/ADT/PostOrderIterator.h"
#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/AliasAnalysis.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/CFG.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Instructions.h"
@@ -29,30 +37,69 @@
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
+#include "llvm/Support/Alignment.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
using namespace PatternMatch;
#define DEBUG_TYPE "lower-matrix-intrinsics"
-static cl::opt<bool> EnableShapePropagation("matrix-propagate-shape",
- cl::init(true));
-
+static cl::opt<bool> EnableShapePropagation(
+ "matrix-propagate-shape", cl::init(true), cl::Hidden,
+ cl::desc("Enable/disable shape propagation from matrix intrinsics to other "
+ "instructions."));
+
+static cl::opt<bool>
+ FuseMatrix("fuse-matrix", cl::init(true), cl::Hidden,
+ cl::desc("Enable/disable fusing matrix instructions."));
+// TODO: Allow and use non-square tiles.
+static cl::opt<unsigned> TileSize(
+ "fuse-matrix-tile-size", cl::init(4), cl::Hidden,
+ cl::desc(
+ "Tile size for matrix instruction fusion using square-shaped tiles."));
+static cl::opt<bool> ForceFusion(
+ "force-fuse-matrix", cl::init(false), cl::Hidden,
+ cl::desc("Force matrix instruction fusion even if not profitable."));
static cl::opt<bool> AllowContractEnabled(
"matrix-allow-contract", cl::init(false), cl::Hidden,
cl::desc("Allow the use of FMAs if available and profitable. This may "
"result in different results, due to less rounding error."));
+enum class MatrixLayoutTy { ColumnMajor, RowMajor };
+
+static cl::opt<MatrixLayoutTy> MatrixLayout(
+ "matrix-default-layout", cl::init(MatrixLayoutTy::ColumnMajor),
+ cl::desc("Sets the default matrix layout"),
+ cl::values(clEnumValN(MatrixLayoutTy::ColumnMajor, "column-major",
+ "Use column-major layout"),
+ clEnumValN(MatrixLayoutTy::RowMajor, "row-major",
+ "Use row-major layout")));
+
+/// Helper function to either return Scope, if it is a subprogram or the
+/// attached subprogram for a local scope.
+static DISubprogram *getSubprogram(DIScope *Scope) {
+ if (auto *Subprogram = dyn_cast<DISubprogram>(Scope))
+ return Subprogram;
+ return cast<DILocalScope>(Scope)->getSubprogram();
+}
+
namespace {
-// Given an element poitner \p BasePtr to the start of a (sub) matrix, compute
-// the start address of column \p Col with type (\p EltType x \p NumRows)
-// assuming \p Stride elements between start two consecutive columns.
-// \p Stride must be >= \p NumRows.
+// Given an element pointer \p BasePtr to the start of a (sub) matrix, compute
+// the start address of vector \p VecIdx with type (\p EltType x \p NumElements)
+// assuming \p Stride elements between the starts of two consecutive vectors.
+// \p Stride must be >= \p NumElements.
+// For column-major matrixes, the function computes the address of a column
+// vector and \p NumElements must be set to the number of elements in a column
+// (= number of rows of the matrix). For row-major matrixes, the function
+// computes the address of a row vector and \p NumElements must be set to the
+// number of elements in a row (= number of columns of the matrix).
//
-// Consider a 4x4 matrix like below
+// Consider a 4x4 matrix in column-major layout like below
//
// 0 1 2 3
// 0 v_0_0 v_0_1 v_0_2 v_0_3
@@ -62,14 +109,14 @@ namespace {
// To compute the column addresses for a 2x3 sub-matrix at row 1 and column 1,
// we need a pointer to the first element of the submatrix as base pointer.
-// Then we can use computeColumnAddr to compute the addresses for the columns
+// Then we can use computeVectorAddr to compute the addresses for the columns
// of the sub-matrix.
//
-// Column 0: computeColumnAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
+// Column 0: computeVectorAddr(Base, 0 (column), 4 (stride), 2 (num rows), ..)
// -> just returns Base
-// Column 1: computeColumnAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
+// Column 1: computeVectorAddr(Base, 1 (column), 4 (stride), 2 (num rows), ..)
// -> returns Base + (1 * 4)
-// Column 2: computeColumnAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
+// Column 2: computeVectorAddr(Base, 2 (column), 4 (stride), 2 (num rows), ..)
// -> returns Base + (2 * 4)
//
// The graphic below illustrates the number of elements in a column (marked
@@ -82,30 +129,30 @@ namespace {
// v_2_0 |v_2_1 |v_2_2 |v_2_3
// v_3_0 {v_3_1 {v_3_2 v_3_3
//
-Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride,
- unsigned NumRows, Type *EltType,
+Value *computeVectorAddr(Value *BasePtr, Value *VecIdx, Value *Stride,
+ unsigned NumElements, Type *EltType,
IRBuilder<> &Builder) {
assert((!isa<ConstantInt>(Stride) ||
- cast<ConstantInt>(Stride)->getZExtValue() >= NumRows) &&
- "Stride must be >= the number of rows.");
+ cast<ConstantInt>(Stride)->getZExtValue() >= NumElements) &&
+ "Stride must be >= the number of elements in the result vector.");
unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
- // Compute the start of the column with index Col as Col * Stride.
- Value *ColumnStart = Builder.CreateMul(Col, Stride, "col.start");
+ // Compute the start of the vector with index VecIdx as VecIdx * Stride.
+ Value *VecStart = Builder.CreateMul(VecIdx, Stride, "vec.start");
- // Get pointer to the start of the selected column. Skip GEP creation,
- // if we select column 0.
- if (isa<ConstantInt>(ColumnStart) && cast<ConstantInt>(ColumnStart)->isZero())
- ColumnStart = BasePtr;
+ // Get pointer to the start of the selected vector. Skip GEP creation,
+ // if we select vector 0.
+ if (isa<ConstantInt>(VecStart) && cast<ConstantInt>(VecStart)->isZero())
+ VecStart = BasePtr;
else
- ColumnStart = Builder.CreateGEP(EltType, BasePtr, ColumnStart, "col.gep");
+ VecStart = Builder.CreateGEP(EltType, BasePtr, VecStart, "vec.gep");
- // Cast elementwise column start pointer to a pointer to a column
- // (EltType x NumRows)*.
- Type *ColumnType = VectorType::get(EltType, NumRows);
- Type *ColumnPtrType = PointerType::get(ColumnType, AS);
- return Builder.CreatePointerCast(ColumnStart, ColumnPtrType, "col.cast");
+ // Cast elementwise vector start pointer to a pointer to a vector
+ // (EltType x NumElements)*.
+ auto *VecType = FixedVectorType::get(EltType, NumElements);
+ Type *VecPtrType = PointerType::get(VecType, AS);
+ return Builder.CreatePointerCast(VecStart, VecPtrType, "vec.cast");
}
/// LowerMatrixIntrinsics contains the methods used to lower matrix intrinsics.
@@ -113,15 +160,16 @@ Value *computeColumnAddr(Value *BasePtr, Value *Col, Value *Stride,
/// Currently, the lowering for each matrix intrinsic is done as follows:
/// 1. Propagate the shape information from intrinsics to connected
/// instructions.
-/// 2. Lower instructions with shape information.
+/// 2. Lower instructions with shape information (assuming column-major layout).
+/// The lowering works similarly using row-major layout.
/// 2.1. Get column vectors for each argument. If we already lowered the
/// definition of an argument, use the produced column vectors directly.
/// If not, split the operand vector containing an embedded matrix into
/// a set of column vectors,
-/// 2.2. Lower the instruction in terms of columnwise operations, which yields
-/// a set of column vectors containing result matrix. Note that we lower
-/// all instructions that have shape information. Besides the intrinsics,
-/// this includes stores for example.
+/// 2.2. Lower the instruction in terms of column major operations, which
+/// yields a set of column vectors containing result matrix. Note that we
+/// lower all instructions that have shape information. Besides the
+/// intrinsics, this includes stores for example.
/// 2.3. Update uses of the lowered instruction. If we have shape information
/// for a user, there is nothing to do, as we will look up the result
/// column matrix when lowering the user. For other uses, we embed the
@@ -134,42 +182,157 @@ class LowerMatrixIntrinsics {
Function &Func;
const DataLayout &DL;
const TargetTransformInfo &TTI;
+ AliasAnalysis &AA;
+ DominatorTree &DT;
+ LoopInfo &LI;
+ OptimizationRemarkEmitter &ORE;
+
+ /// Contains estimates of the number of operations (loads, stores, compute)
+ /// required to lower a matrix operation.
+ struct OpInfoTy {
+ /// Number of stores emitted to generate this matrix.
+ unsigned NumStores = 0;
+ /// Number of loads emitted to generate this matrix.
+ unsigned NumLoads = 0;
+ /// Number of compute operations emitted to generate this matrix.
+ unsigned NumComputeOps = 0;
+
+ OpInfoTy &operator+=(const OpInfoTy &RHS) {
+ NumStores += RHS.NumStores;
+ NumLoads += RHS.NumLoads;
+ NumComputeOps += RHS.NumComputeOps;
+ return *this;
+ }
+ };
+
+ /// Wrapper class representing a matrix as a set of vectors, either in row or
+ /// column major layout. All vectors must have the same vector type.
+ class MatrixTy {
+ SmallVector<Value *, 16> Vectors;
+
+ OpInfoTy OpInfo;
- /// Wrapper class representing a matrix as a set of column vectors.
- /// All column vectors must have the same vector type.
- class ColumnMatrixTy {
- SmallVector<Value *, 16> Columns;
+ bool IsColumnMajor = true;
public:
- ColumnMatrixTy() : Columns() {}
- ColumnMatrixTy(ArrayRef<Value *> Cols)
- : Columns(Cols.begin(), Cols.end()) {}
+ MatrixTy()
+ : Vectors(),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy(ArrayRef<Value *> Vectors)
+ : Vectors(Vectors.begin(), Vectors.end()),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
+ MatrixTy(unsigned NumRows, unsigned NumColumns, Type *EltTy)
+ : IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {
+
+ unsigned D = isColumnMajor() ? NumColumns : NumRows;
+ for (unsigned J = 0; J < D; ++J)
+ addVector(UndefValue::get(FixedVectorType::get(
+ EltTy, isColumnMajor() ? NumRows : NumColumns)));
+ }
+
+ Value *getVector(unsigned i) const { return Vectors[i]; }
+ Value *getColumn(unsigned i) const {
+ assert(isColumnMajor() && "only supported for column-major matrixes");
+ return Vectors[i];
+ }
+ Value *getRow(unsigned i) const {
+ assert(!isColumnMajor() && "only supported for row-major matrixes");
+ return Vectors[i];
+ }
- Value *getColumn(unsigned i) const { return Columns[i]; }
+ void setVector(unsigned i, Value *V) { Vectors[i] = V; }
- void setColumn(unsigned i, Value *V) { Columns[i] = V; }
+ Type *getElementType() { return getVectorTy()->getElementType(); }
- size_t getNumColumns() const { return Columns.size(); }
- size_t getNumRows() const {
- assert(Columns.size() > 0 && "Cannot call getNumRows without columns");
- return cast<VectorType>(Columns[0]->getType())->getNumElements();
+ unsigned getNumVectors() const {
+ if (isColumnMajor())
+ return getNumColumns();
+ return getNumRows();
}
- const SmallVectorImpl<Value *> &getColumnVectors() const { return Columns; }
+ unsigned getNumColumns() const {
+ if (isColumnMajor())
+ return Vectors.size();
+ else {
+ assert(Vectors.size() > 0 && "Cannot call getNumColumns without rows");
+ return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
+ }
+ }
+ unsigned getNumRows() const {
+ if (isColumnMajor()) {
+ assert(Vectors.size() > 0 && "Cannot call getNumRows without columns");
+ return cast<FixedVectorType>(Vectors[0]->getType())->getNumElements();
+ } else
+ return Vectors.size();
+ }
- SmallVectorImpl<Value *> &getColumnVectors() { return Columns; }
+ void addVector(Value *V) { Vectors.push_back(V); }
+ VectorType *getColumnTy() {
+ assert(isColumnMajor() && "only supported for column-major matrixes");
+ return getVectorTy();
+ }
- void addColumn(Value *V) { Columns.push_back(V); }
+ VectorType *getVectorTy() {
+ return cast<VectorType>(Vectors[0]->getType());
+ }
iterator_range<SmallVector<Value *, 8>::iterator> columns() {
- return make_range(Columns.begin(), Columns.end());
+ assert(isColumnMajor() &&
+ "columns() only supported for column-major matrixes");
+ return make_range(Vectors.begin(), Vectors.end());
}
- /// Embed the columns of the matrix into a flat vector by concatenating
+ iterator_range<SmallVector<Value *, 8>::iterator> vectors() {
+ return make_range(Vectors.begin(), Vectors.end());
+ }
+
+ /// Embed the vectors of the matrix into a flat vector by concatenating
/// them.
Value *embedInVector(IRBuilder<> &Builder) const {
- return Columns.size() == 1 ? Columns[0]
- : concatenateVectors(Builder, Columns);
+ return Vectors.size() == 1 ? Vectors[0]
+ : concatenateVectors(Builder, Vectors);
+ }
+
+ MatrixTy &addNumLoads(unsigned N) {
+ OpInfo.NumLoads += N;
+ return *this;
+ }
+
+ void setNumLoads(unsigned N) { OpInfo.NumLoads = N; }
+
+ MatrixTy &addNumStores(unsigned N) {
+ OpInfo.NumStores += N;
+ return *this;
+ }
+
+ MatrixTy &addNumComputeOps(unsigned N) {
+ OpInfo.NumComputeOps += N;
+ return *this;
+ }
+
+ unsigned getNumStores() const { return OpInfo.NumStores; }
+ unsigned getNumLoads() const { return OpInfo.NumLoads; }
+ unsigned getNumComputeOps() const { return OpInfo.NumComputeOps; }
+
+ const OpInfoTy &getOpInfo() const { return OpInfo; }
+
+ bool isColumnMajor() const { return IsColumnMajor; }
+
+ unsigned getStride() const {
+ if (isColumnMajor())
+ return getNumRows();
+ return getNumColumns();
+ }
+
+ /// Extract a vector of \p NumElts starting at index (\p I, \p J). If the
+ /// matrix is column-major, the result vector is extracted from a column
+ /// vector, otherwise from a row vector.
+ Value *extractVector(unsigned I, unsigned J, unsigned NumElts,
+ IRBuilder<> &Builder) const {
+ Value *Vec = isColumnMajor() ? getColumn(J) : getRow(I);
+ Value *Undef = UndefValue::get(Vec->getType());
+ return Builder.CreateShuffleVector(
+ Vec, Undef, createSequentialMask(isColumnMajor() ? I : J, NumElts, 0),
+ "block");
}
};
@@ -177,12 +340,15 @@ class LowerMatrixIntrinsics {
unsigned NumRows;
unsigned NumColumns;
+ bool IsColumnMajor;
+
ShapeInfo(unsigned NumRows = 0, unsigned NumColumns = 0)
- : NumRows(NumRows), NumColumns(NumColumns) {}
+ : NumRows(NumRows), NumColumns(NumColumns),
+ IsColumnMajor(MatrixLayout == MatrixLayoutTy::ColumnMajor) {}
ShapeInfo(Value *NumRows, Value *NumColumns)
- : NumRows(cast<ConstantInt>(NumRows)->getZExtValue()),
- NumColumns(cast<ConstantInt>(NumColumns)->getZExtValue()) {}
+ : ShapeInfo(cast<ConstantInt>(NumRows)->getZExtValue(),
+ cast<ConstantInt>(NumColumns)->getZExtValue()) {}
bool operator==(const ShapeInfo &other) {
return NumRows == other.NumRows && NumColumns == other.NumColumns;
@@ -195,12 +361,24 @@ class LowerMatrixIntrinsics {
assert(NumRows == 0 || NumColumns != 0);
return NumRows != 0;
}
+
+ unsigned getStride() const {
+ if (IsColumnMajor)
+ return NumRows;
+ return NumColumns;
+ }
+
+ unsigned getNumVectors() const {
+ if (IsColumnMajor)
+ return NumColumns;
+ return NumRows;
+ }
};
/// Maps instructions to their shape information. The shape information
/// describes the shape to be used while lowering. This matches the shape of
/// the result value of the instruction, with the only exceptions being store
- /// instructions and the matrix_columnwise_store intrinsics. For those, the
+ /// instructions and the matrix_column_major_store intrinsics. For those, the
/// shape information indicates that those instructions should be lowered
/// using shape information as well.
DenseMap<Value *, ShapeInfo> ShapeMap;
@@ -211,31 +389,49 @@ class LowerMatrixIntrinsics {
SmallVector<Instruction *, 16> ToRemove;
/// Map from instructions to their produced column matrix.
- DenseMap<Value *, ColumnMatrixTy> Inst2ColumnMatrix;
+ MapVector<Value *, MatrixTy> Inst2ColumnMatrix;
public:
- LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI)
- : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI) {}
+ LowerMatrixIntrinsics(Function &F, TargetTransformInfo &TTI,
+ AliasAnalysis &AA, DominatorTree &DT, LoopInfo &LI,
+ OptimizationRemarkEmitter &ORE)
+ : Func(F), DL(F.getParent()->getDataLayout()), TTI(TTI), AA(AA), DT(DT),
+ LI(LI), ORE(ORE) {}
+
+ unsigned getNumOps(Type *VT) {
+ assert(isa<VectorType>(VT) && "Expected vector type");
+ return getNumOps(VT->getScalarType(),
+ cast<FixedVectorType>(VT)->getNumElements());
+ }
- /// Return the set of column vectors that a matrix value is lowered to.
+ //
+ /// Return the estimated number of vector ops required for an operation on
+ /// \p VT * N.
+ unsigned getNumOps(Type *ST, unsigned N) {
+ return std::ceil((ST->getPrimitiveSizeInBits() * N).getFixedSize() /
+ double(TTI.getRegisterBitWidth(true)));
+ }
+
+ /// Return the set of vectors that a matrix value is lowered to.
///
- /// If we lowered \p MatrixVal, just return the cache result column matrix.
- /// Otherwie split the flat vector \p MatrixVal containing a matrix with
- /// shape \p SI into column vectors.
- ColumnMatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
- IRBuilder<> Builder) {
+ /// If we lowered \p MatrixVal, just return the cached result matrix. Otherwise
+ /// split the flat vector \p MatrixVal containing a matrix with shape \p SI
+ /// into vectors.
+ MatrixTy getMatrix(Value *MatrixVal, const ShapeInfo &SI,
+ IRBuilder<> &Builder) {
VectorType *VType = dyn_cast<VectorType>(MatrixVal->getType());
assert(VType && "MatrixVal must be a vector type");
- assert(VType->getNumElements() == SI.NumRows * SI.NumColumns &&
+ assert(cast<FixedVectorType>(VType)->getNumElements() ==
+ SI.NumRows * SI.NumColumns &&
"The vector size must match the number of matrix elements");
// Check if we lowered MatrixVal using shape information. In that case,
- // return the existing column matrix, if it matches the requested shape
+ // return the existing matrix, if it matches the requested shape
// information. If there is a mis-match, embed the result in a flat
// vector and split it later.
auto Found = Inst2ColumnMatrix.find(MatrixVal);
if (Found != Inst2ColumnMatrix.end()) {
- ColumnMatrixTy &M = Found->second;
+ MatrixTy &M = Found->second;
// Return the found matrix, if its shape matches the requested shape
// information
if (SI.NumRows == M.getNumRows() && SI.NumColumns == M.getNumColumns())
@@ -247,10 +443,12 @@ public:
// Otherwise split MatrixVal.
SmallVector<Value *, 16> SplitVecs;
Value *Undef = UndefValue::get(VType);
- for (unsigned MaskStart = 0; MaskStart < VType->getNumElements();
- MaskStart += SI.NumRows) {
- Constant *Mask = createSequentialMask(Builder, MaskStart, SI.NumRows, 0);
- Value *V = Builder.CreateShuffleVector(MatrixVal, Undef, Mask, "split");
+ for (unsigned MaskStart = 0;
+ MaskStart < cast<FixedVectorType>(VType)->getNumElements();
+ MaskStart += SI.getStride()) {
+ Value *V = Builder.CreateShuffleVector(
+ MatrixVal, Undef, createSequentialMask(MaskStart, SI.getStride(), 0),
+ "split");
SplitVecs.push_back(V);
}
@@ -308,8 +506,8 @@ public:
switch (II->getIntrinsicID()) {
case Intrinsic::matrix_multiply:
case Intrinsic::matrix_transpose:
- case Intrinsic::matrix_columnwise_load:
- case Intrinsic::matrix_columnwise_store:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
return true;
default:
return false;
@@ -348,13 +546,13 @@ public:
m_Value(MatrixA), m_Value(M), m_Value(N)))) {
// Flip dimensions.
Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_store>(
m_Value(MatrixA), m_Value(), m_Value(),
- m_Value(M), m_Value(N)))) {
+ m_Value(), m_Value(M), m_Value(N)))) {
Propagate = setShapeInfo(Inst, {N, M});
- } else if (match(Inst,
- m_Intrinsic<Intrinsic::matrix_columnwise_load>(
- m_Value(), m_Value(), m_Value(M), m_Value(N)))) {
+ } else if (match(Inst, m_Intrinsic<Intrinsic::matrix_column_major_load>(
+ m_Value(), m_Value(), m_Value(), m_Value(M),
+ m_Value(N)))) {
Propagate = setShapeInfo(Inst, {M, N});
} else if (match(Inst, m_Store(m_Value(MatrixA), m_Value()))) {
auto OpShape = ShapeMap.find(MatrixA);
@@ -426,14 +624,14 @@ public:
// Flip dimensions.
if (setShapeInfo(MatrixA, {M, N}))
pushInstruction(MatrixA, WorkList);
- } else if (match(V, m_Intrinsic<Intrinsic::matrix_columnwise_store>(
- m_Value(MatrixA), m_Value(), m_Value(),
+ } else if (match(V, m_Intrinsic<Intrinsic::matrix_column_major_store>(
+ m_Value(MatrixA), m_Value(), m_Value(), m_Value(),
m_Value(M), m_Value(N)))) {
if (setShapeInfo(MatrixA, {M, N})) {
pushInstruction(MatrixA, WorkList);
}
} else if (isa<LoadInst>(V) ||
- match(V, m_Intrinsic<Intrinsic::matrix_columnwise_load>())) {
+ match(V, m_Intrinsic<Intrinsic::matrix_column_major_load>())) {
// Nothing to do, no matrix input.
} else if (isa<StoreInst>(V)) {
// Nothing to do. We forward-propagated to this so we would just
@@ -472,8 +670,8 @@ public:
switch (II->getIntrinsicID()) {
case Intrinsic::matrix_multiply:
case Intrinsic::matrix_transpose:
- case Intrinsic::matrix_columnwise_load:
- case Intrinsic::matrix_columnwise_store:
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
WorkList.push_back(&Inst);
break;
default:
@@ -487,45 +685,57 @@ public:
}
}
- ReversePostOrderTraversal<Function *> RPOT(&Func);
bool Changed = false;
- for (auto *BB : RPOT) {
- for (Instruction &Inst : make_early_inc_range(*BB)) {
- IRBuilder<> Builder(&Inst);
-
- if (CallInst *CInst = dyn_cast<CallInst>(&Inst))
- Changed |= VisitCallInst(CInst);
-
- Value *Op1;
- Value *Op2;
- if (auto *BinOp = dyn_cast<BinaryOperator>(&Inst))
- Changed |= VisitBinaryOperator(BinOp);
- if (match(&Inst, m_Load(m_Value(Op1))))
- Changed |= VisitLoad(&Inst, Op1, Builder);
- else if (match(&Inst, m_Store(m_Value(Op1), m_Value(Op2))))
- Changed |= VisitStore(&Inst, Op1, Op2, Builder);
+ SmallVector<CallInst *, 16> MaybeFusableInsts;
+ SmallVector<Instruction *, 16> MatrixInsts;
+
+ // First, collect all instructions with shape information and candidates for
+ // fusion (currently only matrix multiplies).
+ ReversePostOrderTraversal<Function *> RPOT(&Func);
+ for (auto *BB : RPOT)
+ for (Instruction &I : *BB) {
+ if (ShapeMap.find(&I) == ShapeMap.end())
+ continue;
+ if (match(&I, m_Intrinsic<Intrinsic::matrix_multiply>()))
+ MaybeFusableInsts.push_back(cast<CallInst>(&I));
+ MatrixInsts.push_back(&I);
}
+
+ // Second, try to fuse candidates.
+ SmallPtrSet<Instruction *, 16> FusedInsts;
+ for (CallInst *CI : MaybeFusableInsts)
+ LowerMatrixMultiplyFused(CI, FusedInsts);
+ Changed = !FusedInsts.empty();
+
+ // Third, lower remaining instructions with shape information.
+ for (Instruction *Inst : MatrixInsts) {
+ if (FusedInsts.count(Inst))
+ continue;
+
+ IRBuilder<> Builder(Inst);
+
+ if (CallInst *CInst = dyn_cast<CallInst>(Inst))
+ Changed |= VisitCallInst(CInst);
+
+ Value *Op1;
+ Value *Op2;
+ if (auto *BinOp = dyn_cast<BinaryOperator>(Inst))
+ Changed |= VisitBinaryOperator(BinOp);
+ if (match(Inst, m_Load(m_Value(Op1))))
+ Changed |= VisitLoad(cast<LoadInst>(Inst), Op1, Builder);
+ else if (match(Inst, m_Store(m_Value(Op1), m_Value(Op2))))
+ Changed |= VisitStore(cast<StoreInst>(Inst), Op1, Op2, Builder);
}
+ RemarkGenerator RemarkGen(Inst2ColumnMatrix, ORE, Func);
+ RemarkGen.emitRemarks();
+
for (Instruction *Inst : reverse(ToRemove))
Inst->eraseFromParent();
return Changed;
}
- LoadInst *createColumnLoad(Value *ColumnPtr, Type *EltType,
- IRBuilder<> Builder) {
- unsigned Align = DL.getABITypeAlignment(EltType);
- return Builder.CreateAlignedLoad(ColumnPtr, Align, "col.load");
- }
-
- StoreInst *createColumnStore(Value *ColumnValue, Value *ColumnPtr,
- Type *EltType, IRBuilder<> Builder) {
- unsigned Align = DL.getABITypeAlignment(EltType);
- return Builder.CreateAlignedStore(ColumnValue, ColumnPtr, Align);
- }
-
-
/// Turns \p BasePtr into an elementwise pointer to \p EltType.
Value *createElementPtr(Value *BasePtr, Type *EltType, IRBuilder<> &Builder) {
unsigned AS = cast<PointerType>(BasePtr->getType())->getAddressSpace();
@@ -545,11 +755,11 @@ public:
case Intrinsic::matrix_transpose:
LowerTranspose(Inst);
break;
- case Intrinsic::matrix_columnwise_load:
- LowerColumnwiseLoad(Inst);
+ case Intrinsic::matrix_column_major_load:
+ LowerColumnMajorLoad(Inst);
break;
- case Intrinsic::matrix_columnwise_store:
- LowerColumnwiseStore(Inst);
+ case Intrinsic::matrix_column_major_store:
+ LowerColumnMajorStore(Inst);
break;
default:
return false;
@@ -557,108 +767,200 @@ public:
return true;
}
- void LowerLoad(Instruction *Inst, Value *Ptr, Value *Stride,
- ShapeInfo Shape) {
- IRBuilder<> Builder(Inst);
- auto VType = cast<VectorType>(Inst->getType());
+ /// Compute the alignment for a column/row \p Idx with \p Stride between them.
+ /// The address at \p Idx == 0 has alignment \p A. If \p Stride is a
+ /// ConstantInt, reduce the initial alignment based on the byte offset. For
+ /// non-ConstantInt strides, return the common alignment of the initial
+ /// alignment and the element size in bytes.
+ Align getAlignForIndex(unsigned Idx, Value *Stride, Type *ElementTy,
+ MaybeAlign A) const {
+ Align InitialAlign = DL.getValueOrABITypeAlignment(A, ElementTy);
+ if (Idx == 0)
+ return InitialAlign;
+
+ TypeSize ElementSizeInBits = DL.getTypeSizeInBits(ElementTy);
+ if (auto *ConstStride = dyn_cast<ConstantInt>(Stride)) {
+ uint64_t StrideInBytes =
+ ConstStride->getZExtValue() * ElementSizeInBits / 8;
+ return commonAlignment(InitialAlign, Idx * StrideInBytes);
+ }
+ return commonAlignment(InitialAlign, ElementSizeInBits / 8);
+ }
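
The arithmetic in getAlignForIndex boils down to combining the base alignment with the byte offset of the requested column. The following is a minimal standalone sketch of that rule, using plain integers rather than the LLVM Align/MaybeAlign API; commonAlign, alignForIndex, and the example numbers are illustrative, not part of the patch.

// Standalone sketch of the per-column alignment rule above (plain integers,
// not the LLVM Align API). commonAlign mirrors llvm::commonAlignment: the
// largest power of two dividing both the base alignment and the offset.
#include <cstdint>
#include <cstdio>

static uint64_t commonAlign(uint64_t A, uint64_t Offset) {
  if (Offset == 0)
    return A;
  uint64_t OffsetAlign = Offset & -Offset; // largest power of 2 dividing Offset
  return A < OffsetAlign ? A : OffsetAlign;
}

// InitialAlign: alignment of the column at Idx == 0.
// StrideInElts: elements between column starts; < 0 means "not a constant".
static uint64_t alignForIndex(unsigned Idx, int64_t StrideInElts,
                              uint64_t EltSizeInBytes, uint64_t InitialAlign) {
  if (Idx == 0)
    return InitialAlign;
  if (StrideInElts >= 0) // constant stride: byte offset of column Idx is known
    return commonAlign(InitialAlign, Idx * StrideInElts * EltSizeInBytes);
  return commonAlign(InitialAlign, EltSizeInBytes); // unknown stride
}

int main() {
  // 16-byte aligned base, 4-byte floats, stride of 6 elements: column 1 starts
  // at byte 24, so only 8-byte alignment survives.
  printf("%llu\n", (unsigned long long)alignForIndex(1, 6, 4, 16));  // 8
  printf("%llu\n", (unsigned long long)alignForIndex(1, -1, 4, 16)); // 4
  return 0;
}
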
+
+ /// Load a matrix with \p Shape starting at \p Ptr and using \p Stride between
+ /// vectors.
+ MatrixTy loadMatrix(Type *Ty, Value *Ptr, MaybeAlign MAlign, Value *Stride,
+ bool IsVolatile, ShapeInfo Shape, IRBuilder<> &Builder) {
+ auto VType = cast<VectorType>(Ty);
Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
- ColumnMatrixTy Result;
- // Distance between start of one column and the start of the next
- for (unsigned C = 0, E = Shape.NumColumns; C < E; ++C) {
- Value *GEP =
- computeColumnAddr(EltPtr, Builder.getInt32(C), Stride, Shape.NumRows,
- VType->getElementType(), Builder);
- Value *Column = createColumnLoad(GEP, VType->getElementType(), Builder);
- Result.addColumn(Column);
+ MatrixTy Result;
+ for (unsigned I = 0, E = Shape.getNumVectors(); I < E; ++I) {
+ Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(I), Stride,
+ Shape.getStride(), VType->getElementType(),
+ Builder);
+ Value *Vector = Builder.CreateAlignedLoad(
+ GEP, getAlignForIndex(I, Stride, VType->getElementType(), MAlign),
+ IsVolatile, "col.load");
+
+ Result.addVector(Vector);
}
+ return Result.addNumLoads(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors());
+ }
- finalizeLowering(Inst, Result, Builder);
+ /// Loads a sub-matrix with shape \p ResultShape from a \p R x \p C matrix,
+ /// starting at \p MatrixPtr[I][J].
+ MatrixTy loadMatrix(Value *MatrixPtr, MaybeAlign Align, bool IsVolatile,
+ ShapeInfo MatrixShape, Value *I, Value *J,
+ ShapeInfo ResultShape, Type *EltTy,
+ IRBuilder<> &Builder) {
+
+ Value *Offset = Builder.CreateAdd(
+ Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+
+ unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
+ Value *EltPtr =
+ Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
+ Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+ auto *TileTy = FixedVectorType::get(EltTy, ResultShape.NumRows *
+ ResultShape.NumColumns);
+ Type *TilePtrTy = PointerType::get(TileTy, AS);
+ Value *TilePtr =
+ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
+
+ return loadMatrix(TileTy, TilePtr, Align,
+ Builder.getInt64(MatrixShape.getStride()), IsVolatile,
+ ResultShape, Builder);
+ }
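
The tile addressing above linearizes a column-major index: element (I, J) of a matrix with leading dimension Stride sits at J * Stride + I, and that is where the loaded tile starts. A tiny sketch, with tileStart as an illustrative stand-in for the GEP offset computation:

// Column-major tile addressing: element (I, J) with leading dimension Stride.
#include <cstdio>

static unsigned tileStart(unsigned I, unsigned J, unsigned Stride) {
  return J * Stride + I;
}

int main() {
  // In a 4x4 column-major matrix, the tile anchored at row 2, column 1 starts
  // at linear element index 1 * 4 + 2 == 6.
  printf("%u\n", tileStart(2, 1, 4)); // 6
  return 0;
}
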
+
+ /// Lower a load instruction with shape information.
+ void LowerLoad(Instruction *Inst, Value *Ptr, MaybeAlign Align, Value *Stride,
+ bool IsVolatile, ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ finalizeLowering(Inst,
+ loadMatrix(Inst->getType(), Ptr, Align, Stride, IsVolatile,
+ Shape, Builder),
+ Builder);
}
- /// Lowers llvm.matrix.columnwise.load.
+ /// Lowers llvm.matrix.column.major.load.
///
/// The intrinsic loads a matrix from memory using a stride between columns.
- void LowerColumnwiseLoad(CallInst *Inst) {
+ void LowerColumnMajorLoad(CallInst *Inst) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Intrinsic only supports column-major layout!");
Value *Ptr = Inst->getArgOperand(0);
Value *Stride = Inst->getArgOperand(1);
- LowerLoad(Inst, Ptr, Stride,
- {Inst->getArgOperand(2), Inst->getArgOperand(3)});
+ LowerLoad(Inst, Ptr, Inst->getParamAlign(0), Stride,
+ cast<ConstantInt>(Inst->getArgOperand(2))->isOne(),
+ {Inst->getArgOperand(3), Inst->getArgOperand(4)});
}
- void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, Value *Stride,
- ShapeInfo Shape) {
- IRBuilder<> Builder(Inst);
- auto VType = cast<VectorType>(Matrix->getType());
+ /// Stores a sub-matrix \p StoreVal into the \p R x \p C matrix starting at \p
+ /// MatrixPtr[I][J].
+ void storeMatrix(const MatrixTy &StoreVal, Value *MatrixPtr,
+ MaybeAlign MAlign, bool IsVolatile, ShapeInfo MatrixShape,
+ Value *I, Value *J, Type *EltTy, IRBuilder<> &Builder) {
+ Value *Offset = Builder.CreateAdd(
+ Builder.CreateMul(J, Builder.getInt64(MatrixShape.getStride())), I);
+
+ unsigned AS = cast<PointerType>(MatrixPtr->getType())->getAddressSpace();
+ Value *EltPtr =
+ Builder.CreatePointerCast(MatrixPtr, PointerType::get(EltTy, AS));
+ Value *TileStart = Builder.CreateGEP(EltTy, EltPtr, Offset);
+ auto *TileTy = FixedVectorType::get(EltTy, StoreVal.getNumRows() *
+ StoreVal.getNumColumns());
+ Type *TilePtrTy = PointerType::get(TileTy, AS);
+ Value *TilePtr =
+ Builder.CreatePointerCast(TileStart, TilePtrTy, "col.cast");
+
+ storeMatrix(TileTy, StoreVal, TilePtr, MAlign,
+ Builder.getInt64(MatrixShape.getStride()), IsVolatile, Builder);
+ }
+
+ /// Store matrix \p StoreVal starting at \p Ptr and using \p Stride between
+ /// vectors.
+ MatrixTy storeMatrix(Type *Ty, MatrixTy StoreVal, Value *Ptr,
+ MaybeAlign MAlign, Value *Stride, bool IsVolatile,
+ IRBuilder<> &Builder) {
+ auto VType = cast<VectorType>(Ty);
Value *EltPtr = createElementPtr(Ptr, VType->getElementType(), Builder);
- auto LM = getMatrix(Matrix, Shape, Builder);
- for (auto C : enumerate(LM.columns())) {
- Value *GEP =
- computeColumnAddr(EltPtr, Builder.getInt32(C.index()), Stride,
- Shape.NumRows, VType->getElementType(), Builder);
- createColumnStore(C.value(), GEP, VType->getElementType(), Builder);
+ for (auto Vec : enumerate(StoreVal.vectors())) {
+ Value *GEP = computeVectorAddr(EltPtr, Builder.getInt64(Vec.index()),
+ Stride, StoreVal.getStride(),
+ VType->getElementType(), Builder);
+ Builder.CreateAlignedStore(Vec.value(), GEP,
+ getAlignForIndex(Vec.index(), Stride,
+ VType->getElementType(),
+ MAlign),
+ IsVolatile);
}
+ return MatrixTy().addNumStores(getNumOps(StoreVal.getVectorTy()) *
+ StoreVal.getNumVectors());
+ }
- ToRemove.push_back(Inst);
+ /// Lower a store instruction with shape information.
+ void LowerStore(Instruction *Inst, Value *Matrix, Value *Ptr, MaybeAlign A,
+ Value *Stride, bool IsVolatile, ShapeInfo Shape) {
+ IRBuilder<> Builder(Inst);
+ auto StoreVal = getMatrix(Matrix, Shape, Builder);
+ finalizeLowering(Inst,
+ storeMatrix(Matrix->getType(), StoreVal, Ptr, A, Stride,
+ IsVolatile, Builder),
+ Builder);
}
- /// Lowers llvm.matrix.columnwise.store.
+ /// Lowers llvm.matrix.column.major.store.
///
/// The intrinsic stores a matrix back to memory using a stride between columns.
- void LowerColumnwiseStore(CallInst *Inst) {
+ void LowerColumnMajorStore(CallInst *Inst) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Intrinsic only supports column-major layout!");
Value *Matrix = Inst->getArgOperand(0);
Value *Ptr = Inst->getArgOperand(1);
Value *Stride = Inst->getArgOperand(2);
- LowerStore(Inst, Matrix, Ptr, Stride,
- {Inst->getArgOperand(3), Inst->getArgOperand(4)});
- }
-
- /// Extract a column vector of \p NumElts starting at index (\p I, \p J) from
- /// the matrix \p LM represented as a vector of column vectors.
- Value *extractVector(const ColumnMatrixTy &LM, unsigned I, unsigned J,
- unsigned NumElts, IRBuilder<> Builder) {
- Value *Col = LM.getColumn(J);
- Value *Undef = UndefValue::get(Col->getType());
- Constant *Mask = createSequentialMask(Builder, I, NumElts, 0);
- return Builder.CreateShuffleVector(Col, Undef, Mask, "block");
+ LowerStore(Inst, Matrix, Ptr, Inst->getParamAlign(1), Stride,
+ cast<ConstantInt>(Inst->getArgOperand(3))->isOne(),
+ {Inst->getArgOperand(4), Inst->getArgOperand(5)});
}
// Set elements I..I+NumElts-1 to Block
Value *insertVector(Value *Col, unsigned I, Value *Block,
- IRBuilder<> Builder) {
+ IRBuilder<> &Builder) {
// First, bring Block to the same size as Col
unsigned BlockNumElts =
- cast<VectorType>(Block->getType())->getNumElements();
- unsigned NumElts = cast<VectorType>(Col->getType())->getNumElements();
+ cast<FixedVectorType>(Block->getType())->getNumElements();
+ unsigned NumElts = cast<FixedVectorType>(Col->getType())->getNumElements();
assert(NumElts >= BlockNumElts && "Too few elements for current block");
- Value *ExtendMask =
- createSequentialMask(Builder, 0, BlockNumElts, NumElts - BlockNumElts);
Value *Undef = UndefValue::get(Block->getType());
- Block = Builder.CreateShuffleVector(Block, Undef, ExtendMask);
+ Block = Builder.CreateShuffleVector(
+ Block, Undef,
+ createSequentialMask(0, BlockNumElts, NumElts - BlockNumElts));
// If Col is 7 long and I is 2 and BlockNumElts is 2 the mask is: 0, 1, 7,
// 8, 4, 5, 6
- SmallVector<Constant *, 16> Mask;
+ SmallVector<int, 16> Mask;
unsigned i;
for (i = 0; i < I; i++)
- Mask.push_back(Builder.getInt32(i));
+ Mask.push_back(i);
- unsigned VecNumElts = cast<VectorType>(Col->getType())->getNumElements();
+ unsigned VecNumElts =
+ cast<FixedVectorType>(Col->getType())->getNumElements();
for (; i < I + BlockNumElts; i++)
- Mask.push_back(Builder.getInt32(i - I + VecNumElts));
+ Mask.push_back(i - I + VecNumElts);
for (; i < VecNumElts; i++)
- Mask.push_back(Builder.getInt32(i));
-
- Value *MaskVal = ConstantVector::get(Mask);
+ Mask.push_back(i);
- return Builder.CreateShuffleVector(Col, Block, MaskVal);
+ return Builder.CreateShuffleVector(Col, Block, Mask);
}
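
The mask built in insertVector can be checked in isolation. The sketch below reproduces the construction with plain integers and prints the 0, 1, 7, 8, 4, 5, 6 example from the comment above; insertMask is an illustrative helper, not part of the pass:

// Shuffle mask for inserting a BlockNumElts-wide block at offset I into a
// VecNumElts-wide vector. Indices >= VecNumElts select from the second
// (extended Block) operand, as in the shuffle emitted above.
#include <cstdio>
#include <vector>

static std::vector<int> insertMask(unsigned VecNumElts, unsigned I,
                                   unsigned BlockNumElts) {
  std::vector<int> Mask;
  unsigned i = 0;
  for (; i < I; ++i)                // keep leading elements of Col
    Mask.push_back(i);
  for (; i < I + BlockNumElts; ++i) // take elements from Block
    Mask.push_back(i - I + VecNumElts);
  for (; i < VecNumElts; ++i)       // keep trailing elements of Col
    Mask.push_back(i);
  return Mask;
}

int main() {
  for (int M : insertMask(7, 2, 2))
    printf("%d ", M); // prints: 0 1 7 8 4 5 6
  printf("\n");
  return 0;
}
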
Value *createMulAdd(Value *Sum, Value *A, Value *B, bool UseFPOp,
- IRBuilder<> &Builder, bool AllowContraction) {
-
+ IRBuilder<> &Builder, bool AllowContraction,
+ unsigned &NumComputeOps) {
+ NumComputeOps += getNumOps(A->getType());
if (!Sum)
return UseFPOp ? Builder.CreateFMul(A, B) : Builder.CreateMul(A, B);
@@ -666,14 +968,16 @@ public:
if (AllowContraction) {
// Use fmuladd for floating point operations and let the backend decide
// if that's profitable.
- Value *FMulAdd = Intrinsic::getDeclaration(
+ Function *FMulAdd = Intrinsic::getDeclaration(
Func.getParent(), Intrinsic::fmuladd, A->getType());
return Builder.CreateCall(FMulAdd, {A, B, Sum});
}
+ NumComputeOps += getNumOps(A->getType());
Value *Mul = Builder.CreateFMul(A, B);
return Builder.CreateFAdd(Sum, Mul);
}
+ NumComputeOps += getNumOps(A->getType());
Value *Mul = Builder.CreateMul(A, B);
return Builder.CreateAdd(Sum, Mul);
}
@@ -683,7 +987,7 @@ public:
/// cached value when they are lowered. For other users, \p Matrix is
/// flattened and the uses are updated to use it. Also marks \p Inst for
/// deletion.
- void finalizeLowering(Instruction *Inst, ColumnMatrixTy Matrix,
+ void finalizeLowering(Instruction *Inst, MatrixTy Matrix,
IRBuilder<> &Builder) {
Inst2ColumnMatrix.insert(std::make_pair(Inst, Matrix));
@@ -699,6 +1003,294 @@ public:
}
}
+ /// Compute \p Result += \p A * \p B for input matrices with left-associating
+ /// addition.
+ void emitMatrixMultiply(MatrixTy &Result, const MatrixTy &A,
+ const MatrixTy &B, bool AllowContraction,
+ IRBuilder<> &Builder, bool isTiled) {
+ const unsigned VF = std::max<unsigned>(
+ TTI.getRegisterBitWidth(true) /
+ Result.getElementType()->getPrimitiveSizeInBits().getFixedSize(),
+ 1U);
+ unsigned R = Result.getNumRows();
+ unsigned C = Result.getNumColumns();
+ unsigned M = A.getNumColumns();
+
+ bool IsFP = Result.getElementType()->isFloatingPointTy();
+ assert(A.isColumnMajor() == B.isColumnMajor() &&
+ Result.isColumnMajor() == A.isColumnMajor() &&
+ "operands must agree on matrix layout");
+ unsigned NumComputeOps = 0;
+ if (A.isColumnMajor()) {
+ // Multiply columns from the first operand with scalars from the second
+ // operand. Then move along the K axis and accumulate the columns. With
+ // this the adds can be vectorized without reassociation.
+ for (unsigned J = 0; J < C; ++J) {
+ unsigned BlockSize = VF;
+ // If Result is zero, we don't need to accumulate in the K==0 iteration.
+ bool isSumZero = isa<ConstantAggregateZero>(Result.getColumn(J));
+
+ for (unsigned I = 0; I < R; I += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (I + BlockSize > R)
+ BlockSize /= 2;
+
+ Value *Sum = isTiled ? Result.extractVector(I, J, BlockSize, Builder)
+ : nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *L = A.extractVector(I, K, BlockSize, Builder);
+ Value *RH = Builder.CreateExtractElement(B.getColumn(J), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
+ Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, L, Splat,
+ Result.getElementType()->isFloatingPointTy(),
+ Builder, AllowContraction, NumComputeOps);
+ }
+ Result.setVector(J,
+ insertVector(Result.getVector(J), I, Sum, Builder));
+ }
+ }
+ } else {
+ // Multiply rows from the second operand with scalars from the first
+ // operand. Then move along the K axis and accumulate the rows. With this
+ // the adds can be vectorized without reassociation.
+ for (unsigned I = 0; I < R; ++I) {
+ unsigned BlockSize = VF;
+ bool isSumZero = isa<ConstantAggregateZero>(Result.getRow(I));
+ for (unsigned J = 0; J < C; J += BlockSize) {
+ // Gradually lower the vectorization factor to cover the remainder.
+ while (J + BlockSize > C)
+ BlockSize /= 2;
+
+ Value *Sum = nullptr;
+ for (unsigned K = 0; K < M; ++K) {
+ Value *R = B.extractVector(K, J, BlockSize, Builder);
+ Value *LH = Builder.CreateExtractElement(A.getVector(I), K);
+ Value *Splat = Builder.CreateVectorSplat(BlockSize, LH, "splat");
+ Sum = createMulAdd(isSumZero && K == 0 ? nullptr : Sum, Splat, R,
+ IsFP, Builder, AllowContraction, NumComputeOps);
+ }
+ Result.setVector(I,
+ insertVector(Result.getVector(I), J, Sum, Builder));
+ }
+ }
+ }
+ Result.addNumComputeOps(NumComputeOps);
+ }
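
For reference, the column-major branch of emitMatrixMultiply is, in scalar terms, the classic "accumulate scaled columns along K" loop nest. The sketch below is a plain scalar model of that loop order (no vectorization, no tiling), assuming dense column-major storage; multiply and the 2x2 example are illustrative only:

// Scalar reference for the column-major loop nest above: for each result
// column J, accumulate A's columns scaled by B(K, J) along the shared K axis.
// Matrices are flattened column-major: element (Row, Col) is at Col * Rows + Row.
#include <cstdio>
#include <vector>

using Matrix = std::vector<double>;

static Matrix multiply(const Matrix &A, const Matrix &B, unsigned R, unsigned M,
                       unsigned C) {
  Matrix Res(R * C, 0.0);
  for (unsigned J = 0; J < C; ++J)
    for (unsigned K = 0; K < M; ++K) {
      double Scalar = B[J * M + K];    // B(K, J), the "splat" operand
      for (unsigned I = 0; I < R; ++I) // vectorized over I in the pass
        Res[J * R + I] += A[K * R + I] * Scalar;
    }
  return Res;
}

int main() {
  // 2x2 example: A = [1 2; 3 4], B = [5 6; 7 8], column-major storage.
  Matrix A = {1, 3, 2, 4}, B = {5, 7, 6, 8};
  Matrix Res = multiply(A, B, 2, 2, 2);
  printf("%g %g\n%g %g\n", Res[0], Res[2], Res[1], Res[3]); // 19 22 / 43 50
  return 0;
}
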
+
+ /// Ensure that the memory in \p Load does not alias \p Store by potentially
+ /// copying it to a new location. Returns either the new location, or the
+ /// original one if no copy is necessary.
+ Value *getNonAliasingPointer(LoadInst *Load, StoreInst *Store,
+ CallInst *MatMul) {
+ MemoryLocation StoreLoc = MemoryLocation::get(Store);
+ MemoryLocation LoadLoc = MemoryLocation::get(Load);
+
+ AliasResult LdAliased = AA.alias(LoadLoc, StoreLoc);
+
+ // If we can statically determine noalias we're good.
+ if (!LdAliased)
+ return Load->getPointerOperand();
+
+ // Create code to check if the memory locations of the Load and Store
+ // overlap and if they do, copy Load's operand to a new buffer.
+
+ // First, create new blocks for the second part of the check and the copy.
+ BasicBlock *Check0 = MatMul->getParent();
+ // FIXME: Use lazy DTU and update SplitBlock to accept a DTU instead of a
+ // DT. Manually collect dominator tree updates, to avoid unnecessary work,
+ // as we adjust Check0 and Check1's branches.
+ SmallVector<DominatorTree::UpdateType, 4> DTUpdates;
+ for (BasicBlock *Succ : successors(Check0))
+ DTUpdates.push_back({DT.Delete, Check0, Succ});
+
+ BasicBlock *Check1 = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
+ nullptr, "alias_cont");
+ BasicBlock *Copy =
+ SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI, nullptr, "copy");
+ BasicBlock *Fusion = SplitBlock(MatMul->getParent(), MatMul, nullptr, &LI,
+ nullptr, "no_alias");
+
+ // Check if the loaded memory location begins before the end of the store
+ // location. If the condition holds, they might overlap, otherwise they are
+ // guaranteed to not overlap.
+ IRBuilder<> Builder(MatMul);
+ Check0->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(Check0);
+ Type *IntPtrTy = Builder.getIntPtrTy(Load->getModule()->getDataLayout());
+ Value *StoreBegin = Builder.CreatePtrToInt(
+ const_cast<Value *>(StoreLoc.Ptr), IntPtrTy, "store.begin");
+ Value *StoreEnd = Builder.CreateAdd(
+ StoreBegin, ConstantInt::get(IntPtrTy, StoreLoc.Size.getValue()),
+ "store.end", true, true);
+ Value *LoadBegin = Builder.CreatePtrToInt(const_cast<Value *>(LoadLoc.Ptr),
+ IntPtrTy, "load.begin");
+ Builder.CreateCondBr(Builder.CreateICmpULT(LoadBegin, StoreEnd), Check1,
+ Fusion);
+
+ // Check if the store begins before the end of the load location. If the
+ // condition holds, they alias, otherwise they are guaranteed to not
+ // overlap.
+ Check1->getTerminator()->eraseFromParent();
+ Builder.SetInsertPoint(Check1, Check1->begin());
+ Value *LoadEnd = Builder.CreateAdd(
+ LoadBegin, ConstantInt::get(IntPtrTy, LoadLoc.Size.getValue()),
+ "load.end", true, true);
+ Builder.CreateCondBr(Builder.CreateICmpULT(StoreBegin, LoadEnd), Copy,
+ Fusion);
+
+ // Copy load operand to new alloca.
+ Builder.SetInsertPoint(Copy, Copy->begin());
+ AllocaInst *NewLd =
+ Builder.CreateAlloca(Load->getType(), Load->getPointerAddressSpace());
+ Builder.CreateMemCpy(NewLd, NewLd->getAlign(),
+ Load->getPointerOperand(), Load->getAlign(),
+ LoadLoc.Size.getValue());
+ Builder.SetInsertPoint(Fusion, Fusion->begin());
+ PHINode *PHI = Builder.CreatePHI(Load->getPointerOperandType(), 3);
+ PHI->addIncoming(Load->getPointerOperand(), Check0);
+ PHI->addIncoming(Load->getPointerOperand(), Check1);
+ PHI->addIncoming(NewLd, Copy);
+
+ // Adjust DT.
+ DTUpdates.push_back({DT.Insert, Check0, Check1});
+ DTUpdates.push_back({DT.Insert, Check0, Fusion});
+ DTUpdates.push_back({DT.Insert, Check1, Copy});
+ DTUpdates.push_back({DT.Insert, Check1, Fusion});
+ DT.applyUpdates(DTUpdates);
+ return PHI;
+ }
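
The branch structure emitted by getNonAliasingPointer implements a standard half-open interval overlap test between the load and store ranges. A minimal sketch of the predicate, assuming both MemoryLocation sizes are precise; mayOverlap and the sample ranges are illustrative:

// The runtime aliasing check built above reduces to a half-open interval
// overlap test on the two address ranges.
#include <cstdint>
#include <cstdio>

static bool mayOverlap(uint64_t LoadBegin, uint64_t LoadSize,
                       uint64_t StoreBegin, uint64_t StoreSize) {
  uint64_t LoadEnd = LoadBegin + LoadSize;
  uint64_t StoreEnd = StoreBegin + StoreSize;
  // Mirrors the two conditional branches: branch to "no_alias" as soon as one
  // comparison proves the ranges are disjoint.
  return LoadBegin < StoreEnd && StoreBegin < LoadEnd;
}

int main() {
  printf("%d\n", mayOverlap(0, 64, 64, 64)); // 0: ranges touch but don't overlap
  printf("%d\n", mayOverlap(0, 64, 32, 64)); // 1: copy to a fresh buffer
  return 0;
}
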
+
+ bool isFusionProfitable(CallInst *MatMul) {
+ if (ForceFusion)
+ return true;
+
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ const unsigned M = LShape.NumColumns;
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ const unsigned VF =
+ std::max<unsigned>(TTI.getRegisterBitWidth(true) /
+ EltType->getPrimitiveSizeInBits().getFixedSize(),
+ 1U);
+
+ // Cost model for tiling
+ //
+ // For tiling to be beneficial, we need reuse either along the R or
+ // the C axis. We vectorize along the R axis so that means at least
+ // 3 elements.
+ // TODO: Also consider cost of copying if operands alias.
+ if (R <= VF && C == 1)
+ return false;
+ // Then we need enough elements to exceed the number of vector
+ // registers we have. Note that this is an oversimplification since
+ // fusing also takes some extra loads which may exceed the number of
+ // reloads necessary.
+ unsigned Op0Regs = (R + VF - 1) / VF * M;
+ unsigned Op1Regs = (M + VF - 1) / VF * C;
+ return Op0Regs + Op1Regs > TTI.getNumberOfRegisters(true);
+ }
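
The register estimate in isFusionProfitable is simple enough to model standalone. In the sketch below, VF and NumVecRegs stand in for the TTI queries (the getRegisterBitWidth-derived factor and getNumberOfRegisters); fusionProfitable and the example figures are illustrative, not taken from any particular target:

// Standalone sketch of the tiling cost model above.
#include <cstdio>

static bool fusionProfitable(unsigned R, unsigned M, unsigned C, unsigned VF,
                             unsigned NumVecRegs) {
  // No reuse along R or C: a single narrow column is not worth tiling.
  if (R <= VF && C == 1)
    return false;
  // Estimate how many vector registers each operand occupies if kept live.
  unsigned Op0Regs = (R + VF - 1) / VF * M; // LHS: ceil(R / VF) per column
  unsigned Op1Regs = (M + VF - 1) / VF * C; // RHS: ceil(M / VF) per column
  return Op0Regs + Op1Regs > NumVecRegs;
}

int main() {
  // 4x4 * 4x4 with 2-wide vectors and 16 registers: 8 + 8 > 16 is false, so
  // the plain lowering is kept; 8x8 * 8x8 tips the balance.
  printf("%d\n", fusionProfitable(4, 4, 4, 2, 16)); // 0
  printf("%d\n", fusionProfitable(8, 8, 8, 2, 16)); // 1
  return 0;
}
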
+
+ MatrixTy getZeroMatrix(Type *EltType, unsigned R, unsigned C) {
+ MatrixTy Res;
+ auto *ColumType = FixedVectorType::get(EltType, R);
+ for (unsigned I = 0; I < C; ++I)
+ Res.addVector(ConstantAggregateZero::get(ColumType));
+ return Res;
+ }
+
+ void emitSIMDTiling(CallInst *MatMul, LoadInst *LoadOp0, LoadInst *LoadOp1,
+ StoreInst *Store,
+ SmallPtrSetImpl<Instruction *> &FusedInsts) {
+ assert(MatrixLayout == MatrixLayoutTy::ColumnMajor &&
+ "Tiling only supported for column-major matrixes at the moment!");
+ if (!isFusionProfitable(MatMul))
+ return;
+
+ ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
+ ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
+
+ const unsigned R = LShape.NumRows;
+ const unsigned C = RShape.NumColumns;
+ const unsigned M = LShape.NumColumns;
+ auto *EltType = cast<VectorType>(MatMul->getType())->getElementType();
+
+ Value *APtr = getNonAliasingPointer(LoadOp0, Store, MatMul);
+ Value *BPtr = getNonAliasingPointer(LoadOp1, Store, MatMul);
+ Value *CPtr = Store->getPointerOperand();
+
+ bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
+ MatMul->hasAllowContract());
+ IRBuilder<> Builder(Store);
+ for (unsigned J = 0; J < C; J += TileSize)
+ for (unsigned I = 0; I < R; I += TileSize) {
+ const unsigned TileR = std::min(R - I, unsigned(TileSize));
+ const unsigned TileC = std::min(C - J, unsigned(TileSize));
+ MatrixTy Res = getZeroMatrix(EltType, TileR, TileC);
+
+ for (unsigned K = 0; K < M; K += TileSize) {
+ const unsigned TileM = std::min(M - K, unsigned(TileSize));
+ MatrixTy A =
+ loadMatrix(APtr, LoadOp0->getAlign(), LoadOp0->isVolatile(),
+ LShape, Builder.getInt64(I), Builder.getInt64(K),
+ {TileR, TileM}, EltType, Builder);
+ MatrixTy B =
+ loadMatrix(BPtr, LoadOp1->getAlign(), LoadOp1->isVolatile(),
+ RShape, Builder.getInt64(K), Builder.getInt64(J),
+ {TileM, TileC}, EltType, Builder);
+ emitMatrixMultiply(Res, A, B, AllowContract, Builder, true);
+ }
+ storeMatrix(Res, CPtr, Store->getAlign(), Store->isVolatile(), {R, M},
+ Builder.getInt64(I), Builder.getInt64(J), EltType, Builder);
+ }
+
+ // Mark eliminated instructions as fused and remove them.
+ FusedInsts.insert(Store);
+ FusedInsts.insert(MatMul);
+ Store->eraseFromParent();
+ MatMul->eraseFromParent();
+ if (LoadOp0->hasNUses(0)) {
+ FusedInsts.insert(LoadOp0);
+ LoadOp0->eraseFromParent();
+ }
+ if (LoadOp1->hasNUses(0)) {
+ FusedInsts.insert(LoadOp1);
+ LoadOp1->eraseFromParent();
+ }
+ }
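
emitSIMDTiling walks the result matrix in TileSize x TileSize blocks and accumulates partial products over K tiles; each inner step corresponds to the loadMatrix/emitMatrixMultiply/storeMatrix calls above. The scalar sketch below models the same loop structure under a column-major layout; tiledMultiply and the 3x3 example are illustrative only:

// Scalar sketch of the tiled loop nest: the result is produced in
// TileSize x TileSize blocks, accumulating partial products over K tiles.
#include <algorithm>
#include <cstdio>
#include <vector>

using Matrix = std::vector<double>;

static Matrix tiledMultiply(const Matrix &A, const Matrix &B, unsigned R,
                            unsigned M, unsigned C, unsigned TileSize) {
  Matrix Res(R * C, 0.0);
  for (unsigned J = 0; J < C; J += TileSize)
    for (unsigned I = 0; I < R; I += TileSize) {
      unsigned TileR = std::min(R - I, TileSize);
      unsigned TileC = std::min(C - J, TileSize);
      for (unsigned K = 0; K < M; K += TileSize) {
        unsigned TileM = std::min(M - K, TileSize);
        // Res[I..I+TileR, J..J+TileC] += A[I.., K..] * B[K.., J..]
        for (unsigned j = 0; j < TileC; ++j)
          for (unsigned k = 0; k < TileM; ++k)
            for (unsigned i = 0; i < TileR; ++i)
              Res[(J + j) * R + (I + i)] +=
                  A[(K + k) * R + (I + i)] * B[(J + j) * M + (K + k)];
      }
    }
  return Res;
}

int main() {
  // 3x3 identity times a 3x3 matrix, with 2x2 tiles, returns the matrix itself.
  Matrix Id = {1, 0, 0, 0, 1, 0, 0, 0, 1};
  Matrix B = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  Matrix Res = tiledMultiply(Id, B, 3, 3, 3, 2);
  for (unsigned i = 0; i < 9; ++i)
    printf("%g ", Res[i]); // prints B back: 1 2 3 4 5 6 7 8 9
  printf("\n");
  return 0;
}
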
+
+ /// Try to lower matrix multiply chains by fusing operations.
+ ///
+ /// Currently we only lower {ld, ld} -> matmul -> st chains.
+ ///
+ /// No need to return a MatrixTy object for the result of the operation, since
+ /// the single store user will be lowered as part of this. Instructions that
+ /// are completely eliminated by fusion are added to \p FusedInsts.
+ void LowerMatrixMultiplyFused(CallInst *MatMul,
+ SmallPtrSetImpl<Instruction *> &FusedInsts) {
+ if (!FuseMatrix || !MatMul->hasOneUse() ||
+ MatrixLayout != MatrixLayoutTy::ColumnMajor)
+ return;
+
+ auto *LoadOp0 = dyn_cast<LoadInst>(MatMul->getOperand(0));
+ auto *LoadOp1 = dyn_cast<LoadInst>(MatMul->getOperand(1));
+ auto *Store = dyn_cast<StoreInst>(*MatMul->user_begin());
+ if (LoadOp0 && LoadOp1 && Store) {
+ // The store address must dominate the MatMul instruction, otherwise
+ // we create invalid IR.
+ // FIXME: See if we can hoist the store address computation.
+ auto *AddrI = dyn_cast<Instruction>(Store->getOperand(1));
+ if (AddrI && (!DT.dominates(AddrI, MatMul)))
+ return;
+
+ emitSIMDTiling(MatMul, LoadOp0, LoadOp1, Store, FusedInsts);
+ return;
+ }
+ }
+
/// Lowers llvm.matrix.multiply.
void LowerMultiply(CallInst *MatMul) {
IRBuilder<> Builder(MatMul);
@@ -706,97 +1298,80 @@ public:
ShapeInfo LShape(MatMul->getArgOperand(2), MatMul->getArgOperand(3));
ShapeInfo RShape(MatMul->getArgOperand(3), MatMul->getArgOperand(4));
- const ColumnMatrixTy &Lhs =
- getMatrix(MatMul->getArgOperand(0), LShape, Builder);
- const ColumnMatrixTy &Rhs =
- getMatrix(MatMul->getArgOperand(1), RShape, Builder);
+ const MatrixTy &Lhs = getMatrix(MatMul->getArgOperand(0), LShape, Builder);
+ const MatrixTy &Rhs = getMatrix(MatMul->getArgOperand(1), RShape, Builder);
const unsigned R = LShape.NumRows;
- const unsigned M = LShape.NumColumns;
const unsigned C = RShape.NumColumns;
- assert(M == RShape.NumRows);
+ assert(LShape.NumColumns == RShape.NumRows);
// Initialize the output
- ColumnMatrixTy Result;
- for (unsigned J = 0; J < C; ++J)
- Result.addColumn(UndefValue::get(VectorType::get(EltType, R)));
-
- const unsigned VF = std::max(TTI.getRegisterBitWidth(true) /
- EltType->getPrimitiveSizeInBits(),
- uint64_t(1));
+ MatrixTy Result(R, C, EltType);
bool AllowContract = AllowContractEnabled || (isa<FPMathOperator>(MatMul) &&
MatMul->hasAllowContract());
- // Multiply columns from the first operand with scalars from the second
- // operand. Then move along the K axes and accumulate the columns. With
- // this the adds can be vectorized without reassociation.
- for (unsigned J = 0; J < C; ++J) {
- unsigned BlockSize = VF;
- for (unsigned I = 0; I < R; I += BlockSize) {
- // Gradually lower the vectorization factor to cover the remainder.
- while (I + BlockSize > R)
- BlockSize /= 2;
-
- Value *Sum = nullptr;
- for (unsigned K = 0; K < M; ++K) {
- Value *L = extractVector(Lhs, I, K, BlockSize, Builder);
- Value *RH = Builder.CreateExtractElement(Rhs.getColumn(J), K);
- Value *Splat = Builder.CreateVectorSplat(BlockSize, RH, "splat");
- Sum = createMulAdd(Sum, L, Splat, EltType->isFloatingPointTy(),
- Builder, AllowContract);
- }
- Result.setColumn(J, insertVector(Result.getColumn(J), I, Sum, Builder));
- }
- }
+ emitMatrixMultiply(Result, Lhs, Rhs, AllowContract, Builder, false);
finalizeLowering(MatMul, Result, Builder);
}
/// Lowers llvm.matrix.transpose.
void LowerTranspose(CallInst *Inst) {
- ColumnMatrixTy Result;
+ MatrixTy Result;
IRBuilder<> Builder(Inst);
Value *InputVal = Inst->getArgOperand(0);
VectorType *VectorTy = cast<VectorType>(InputVal->getType());
ShapeInfo ArgShape(Inst->getArgOperand(1), Inst->getArgOperand(2));
- ColumnMatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
-
- for (unsigned Row = 0; Row < ArgShape.NumRows; ++Row) {
- // Build a single column vector for this row. First initialize it.
- Value *ResultColumn = UndefValue::get(
- VectorType::get(VectorTy->getElementType(), ArgShape.NumColumns));
-
- // Go through the elements of this row and insert it into the resulting
- // column vector.
- for (auto C : enumerate(InputMatrix.columns())) {
- Value *Elt = Builder.CreateExtractElement(C.value(), Row);
- // We insert at index Column since that is the row index after the
- // transpose.
- ResultColumn =
- Builder.CreateInsertElement(ResultColumn, Elt, C.index());
+ MatrixTy InputMatrix = getMatrix(InputVal, ArgShape, Builder);
+
+ const unsigned NewNumVecs =
+ InputMatrix.isColumnMajor() ? ArgShape.NumRows : ArgShape.NumColumns;
+ const unsigned NewNumElts =
+ InputMatrix.isColumnMajor() ? ArgShape.NumColumns : ArgShape.NumRows;
+
+ for (unsigned I = 0; I < NewNumVecs; ++I) {
+ // Build a single result vector. First initialize it.
+ Value *ResultVector = UndefValue::get(
+ FixedVectorType::get(VectorTy->getElementType(), NewNumElts));
+ // Go through the old elements and insert it into the resulting vector.
+ for (auto J : enumerate(InputMatrix.vectors())) {
+ Value *Elt = Builder.CreateExtractElement(J.value(), I);
+ // Row and column indices are transposed.
+ ResultVector =
+ Builder.CreateInsertElement(ResultVector, Elt, J.index());
}
- Result.addColumn(ResultColumn);
+ Result.addVector(ResultVector);
}
- finalizeLowering(Inst, Result, Builder);
+ // TODO: Improve estimate of operations needed for transposes. Currently we
+ // just count the insertelement/extractelement instructions, but do not
+ // account for later simplifications/combines.
+ finalizeLowering(
+ Inst,
+ Result.addNumComputeOps(2 * ArgShape.NumRows * ArgShape.NumColumns),
+ Builder);
}
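
The transpose lowering is element-by-element: element I of input vector J is inserted at position J of result vector I. The sketch below mirrors that double loop on plain vectors; transpose and the 2x3 example are illustrative:

// Element-wise model of the transpose lowering above (column-major on both
// sides): input vector J, element I becomes output vector I, element J.
#include <cstdio>
#include <vector>

using Vec = std::vector<double>;

static std::vector<Vec> transpose(const std::vector<Vec> &InputVectors) {
  unsigned NumIn = InputVectors.size();      // input columns
  unsigned NumElts = InputVectors[0].size(); // rows per input column
  std::vector<Vec> Result(NumElts, Vec(NumIn));
  for (unsigned I = 0; I < NumElts; ++I)     // one result vector per old row
    for (unsigned J = 0; J < NumIn; ++J)     // extract element I of column J...
      Result[I][J] = InputVectors[J][I];     // ...insert it at position J
  return Result;
}

int main() {
  // 2x3 matrix as three column vectors; the transpose has two columns of 3.
  std::vector<Vec> M = {{1, 4}, {2, 5}, {3, 6}};
  for (const Vec &Col : transpose(M)) {
    for (double E : Col)
      printf("%g ", E);
    printf("\n"); // prints: 1 2 3 / 4 5 6
  }
  return 0;
}
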
/// Lower load instructions, if shape information is available.
- bool VisitLoad(Instruction *Inst, Value *Ptr, IRBuilder<> &Builder) {
+ bool VisitLoad(LoadInst *Inst, Value *Ptr, IRBuilder<> &Builder) {
auto I = ShapeMap.find(Inst);
if (I == ShapeMap.end())
return false;
- LowerLoad(Inst, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+ LowerLoad(Inst, Ptr, Inst->getAlign(),
+ Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
+ I->second);
return true;
}
- bool VisitStore(Instruction *Inst, Value *StoredVal, Value *Ptr,
+ bool VisitStore(StoreInst *Inst, Value *StoredVal, Value *Ptr,
IRBuilder<> &Builder) {
auto I = ShapeMap.find(StoredVal);
if (I == ShapeMap.end())
return false;
- LowerStore(Inst, StoredVal, Ptr, Builder.getInt32(I->second.NumRows), I->second);
+ LowerStore(Inst, StoredVal, Ptr, Inst->getAlign(),
+ Builder.getInt64(I->second.getStride()), Inst->isVolatile(),
+ I->second);
return true;
}
@@ -812,12 +1387,15 @@ public:
IRBuilder<> Builder(Inst);
ShapeInfo &Shape = I->second;
- ColumnMatrixTy LoweredLhs = getMatrix(Lhs, Shape, Builder);
- ColumnMatrixTy LoweredRhs = getMatrix(Rhs, Shape, Builder);
+ MatrixTy Result;
+ MatrixTy A = getMatrix(Lhs, Shape, Builder);
+ MatrixTy B = getMatrix(Rhs, Shape, Builder);
+ assert(A.isColumnMajor() == B.isColumnMajor() &&
+ Result.isColumnMajor() == A.isColumnMajor() &&
+ "operands must agree on matrix layout");
- // Add each column and store the result back into the opmapping
- ColumnMatrixTy Result;
- auto BuildColumnOp = [&Builder, Inst](Value *LHS, Value *RHS) {
+ // Helper to perform binary op on vectors.
+ auto BuildVectorOp = [&Builder, Inst](Value *LHS, Value *RHS) {
switch (Inst->getOpcode()) {
case Instruction::Add:
return Builder.CreateAdd(LHS, RHS);
@@ -835,20 +1413,462 @@ public:
llvm_unreachable("Unsupported binary operator for matrix");
}
};
- for (unsigned C = 0; C < Shape.NumColumns; ++C)
- Result.addColumn(
- BuildColumnOp(LoweredLhs.getColumn(C), LoweredRhs.getColumn(C)));
- finalizeLowering(Inst, Result, Builder);
+ for (unsigned I = 0; I < Shape.getNumVectors(); ++I)
+ Result.addVector(BuildVectorOp(A.getVector(I), B.getVector(I)));
+
+ finalizeLowering(Inst,
+ Result.addNumComputeOps(getNumOps(Result.getVectorTy()) *
+ Result.getNumVectors()),
+ Builder);
return true;
}
+
+ /// Helper to linearize a matrix expression tree into a string. Currently
+ /// matrix expressions are linearized by starting at an expression leaf and
+ /// linearizing bottom up.
+ struct ExprLinearizer {
+ unsigned LengthToBreak = 100;
+ std::string Str;
+ raw_string_ostream Stream;
+ unsigned LineLength = 0;
+ const DataLayout &DL;
+
+ /// Mapping from instructions to matrixes. It is used to identify
+ /// matrix instructions.
+ const MapVector<Value *, MatrixTy> &Inst2Matrix;
+
+ /// Mapping from values to the leaves of all expressions that the value is
+ /// part of.
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared;
+
+ /// Set of matrix expressions in the scope of a given DISubprogram.
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram;
+
+ /// Leaf node of the expression to linearize.
+ Value *Leaf;
+
+ /// Used to keep track of sub-expressions that get reused while linearizing
+ /// the expression. Re-used sub-expressions are marked as (reused).
+ SmallPtrSet<Value *, 8> ReusedExprs;
+
+ ExprLinearizer(const DataLayout &DL,
+ const MapVector<Value *, MatrixTy> &Inst2Matrix,
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ Value *Leaf)
+ : Str(), Stream(Str), DL(DL), Inst2Matrix(Inst2Matrix), Shared(Shared),
+ ExprsInSubprogram(ExprsInSubprogram), Leaf(Leaf) {}
+
+ void indent(unsigned N) {
+ LineLength += N;
+ for (unsigned i = 0; i < N; i++)
+ Stream << " ";
+ }
+
+ void lineBreak() {
+ Stream << "\n";
+ LineLength = 0;
+ }
+
+ void maybeIndent(unsigned Indent) {
+ if (LineLength >= LengthToBreak)
+ lineBreak();
+
+ if (LineLength == 0)
+ indent(Indent);
+ }
+
+ void write(StringRef S) {
+ LineLength += S.size();
+ Stream << S;
+ }
+
+ Value *getUnderlyingObjectThroughLoads(Value *V) {
+ if (Value *Ptr = getPointerOperand(V))
+ return getUnderlyingObjectThroughLoads(Ptr);
+ else if (V->getType()->isPointerTy())
+ return GetUnderlyingObject(V, DL);
+ return V;
+ }
+
+ /// Returns true if \p V is a matrix value in the given subprogram.
+ bool isMatrix(Value *V) const { return ExprsInSubprogram.count(V); }
+
+ /// If \p V is a matrix value, print its shape as NumRows x NumColumns to
+ /// \p SS.
+ void prettyPrintMatrixType(Value *V, raw_string_ostream &SS) {
+ auto M = Inst2Matrix.find(V);
+ if (M == Inst2Matrix.end())
+ SS << "unknown";
+ else {
+ SS << M->second.getNumRows();
+ SS << "x";
+ SS << M->second.getNumColumns();
+ }
+ }
+
+ /// Write the called function name. Handles calls to llvm.matrix.*
+ /// specially: we write the name, followed by the dimensions of the input
+ /// matrixes, followed by the scalar type name.
+ void writeFnName(CallInst *CI) {
+ if (!CI->getCalledFunction())
+ write("<no called fn>");
+ else {
+ StringRef Name = CI->getCalledFunction()->getName();
+ if (!Name.startswith("llvm.matrix")) {
+ write(Name);
+ return;
+ }
+ IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI);
+ write(StringRef(Intrinsic::getName(II->getIntrinsicID(), {}))
+ .drop_front(StringRef("llvm.matrix.").size()));
+ write(".");
+ std::string Tmp = "";
+ raw_string_ostream SS(Tmp);
+
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << ".";
+ prettyPrintMatrixType(II->getOperand(1), SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_transpose:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_column_major_load:
+ prettyPrintMatrixType(II, SS);
+ SS << "." << *II->getType()->getScalarType();
+ break;
+ case Intrinsic::matrix_column_major_store:
+ prettyPrintMatrixType(II->getOperand(0), SS);
+ SS << "." << *II->getOperand(0)->getType()->getScalarType();
+ break;
+ default:
+ llvm_unreachable("Unhandled case");
+ }
+ SS.flush();
+ write(Tmp);
+ }
+ }
+
+ unsigned getNumShapeArgs(CallInst *CI) const {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CI)) {
+ switch (II->getIntrinsicID()) {
+ case Intrinsic::matrix_multiply:
+ return 3;
+ case Intrinsic::matrix_transpose:
+ return 2;
+ case Intrinsic::matrix_column_major_load:
+ case Intrinsic::matrix_column_major_store:
+ return 3;
+ default:
+ return 0;
+ }
+ }
+ return 0;
+ }
+
+ /// Special printing for values: for pointers, we print whether they refer to
+ /// an external (function) address or a stack address; for other values, we
+ /// print either the constant or "scalar"/"matrix".
+ void write(Value *V) {
+ V = getUnderlyingObjectThroughLoads(V);
+ if (V->getType()->isPointerTy()) {
+ if (isa<AllocaInst>(V)) {
+ Stream << "stack addr";
+ LineLength += StringRef("stack addr").size();
+ } else {
+ Stream << "addr";
+ LineLength += StringRef("addr").size();
+ }
+ if (!V->getName().empty()) {
+ Stream << " %" << V->getName() << "";
+ LineLength += V->getName().size() + 2;
+ }
+ return;
+ }
+
+ std::string Tmp;
+ raw_string_ostream TmpStream(Tmp);
+
+ if (auto *CI = dyn_cast<ConstantInt>(V))
+ TmpStream << CI->getValue();
+ else if (isa<Constant>(V))
+ TmpStream << "constant";
+ else {
+ if (isMatrix(V))
+ TmpStream << "matrix";
+ else
+ TmpStream << "scalar";
+ }
+ TmpStream.flush();
+ Tmp = std::string(StringRef(Tmp).trim());
+ LineLength += Tmp.size();
+ Stream << Tmp;
+ }
+
+ /// Linearize expression \p Expr starting at an indentation of \p Indent.
+ /// Expressions that are re-used multiple times are prefixed with (reused)
+ /// at the re-used root instruction.
+ void linearizeExpr(Value *Expr, unsigned Indent, bool ParentReused,
+ bool ParentShared) {
+ auto *I = cast<Instruction>(Expr);
+ maybeIndent(Indent);
+ SmallVector<Value *, 8> Ops;
+
+ // Is Expr shared with other expression leaves?
+ bool ExprShared = false;
+
+ // Deal with shared subtrees. Mark them as shared, if required.
+ if (!ParentShared) {
+ auto SI = Shared.find(Expr);
+ assert(SI != Shared.end() && SI->second.count(Leaf));
+
+ for (Value *S : SI->second) {
+ if (S == Leaf)
+ continue;
+ DebugLoc DL = cast<Instruction>(S)->getDebugLoc();
+ write("shared with remark at line " + std::to_string(DL.getLine()) +
+ " column " + std::to_string(DL.getCol()) + " (");
+ }
+ ExprShared = SI->second.size() > 1;
+ }
+
+ bool Reused = !ReusedExprs.insert(Expr).second;
+ if (Reused && !ParentReused)
+ write("(reused) ");
+
+ if (auto *CI = dyn_cast<CallInst>(I)) {
+ writeFnName(CI);
+
+ Ops.append(CI->arg_begin(), CI->arg_end() - getNumShapeArgs(CI));
+ } else if (isa<BitCastInst>(Expr)) {
+ // Special case bitcasts, which are used to materialize matrixes from
+ // non-matrix ops.
+ write("matrix");
+ return;
+ } else {
+ Ops.append(I->value_op_begin(), I->value_op_end());
+ write(std::string(I->getOpcodeName()));
+ }
+
+ write(std::string("("));
+
+ unsigned NumOpsToBreak = 1;
+ if (match(Expr, m_Intrinsic<Intrinsic::matrix_column_major_load>()))
+ NumOpsToBreak = 2;
+
+ for (Value *Op : Ops) {
+ if (Ops.size() > NumOpsToBreak)
+ lineBreak();
+
+ maybeIndent(Indent + 1);
+ if (isMatrix(Op))
+ linearizeExpr(Op, Indent + 1, Reused, ExprShared);
+ else
+ write(Op);
+ if (Op != Ops.back())
+ write(", ");
+ }
+
+ write(")");
+ }
+
+ const std::string &getResult() {
+ Stream.flush();
+ return Str;
+ }
+ };
+
+ /// Generate remarks for matrix operations in a function. To generate remarks
+ /// for matrix expressions, the following approach is used:
+ /// 1. Use the inlined-at debug information to group matrix operations to the
+ /// DISubprograms they are contained in.
+ /// 2. Collect leaves of matrix expressions (done in
+ /// RemarkGenerator::getExpressionLeaves) for each subprogram-expression
+ /// mapping. Leaves are lowered matrix instructions without other matrix
+ /// users (like stores) in the current subprogram.
+ /// 3. For each leaf, create a remark containing a linearized version of the
+ /// matrix expression. The expression is linearized by a recursive
+ /// bottom-up traversal of the matrix operands, starting at a leaf. Note
+ /// that multiple leaves can share sub-expressions. Shared subexpressions
+ /// are explicitly marked as shared().
+ struct RemarkGenerator {
+ const MapVector<Value *, MatrixTy> &Inst2Matrix;
+ OptimizationRemarkEmitter &ORE;
+ Function &Func;
+ const DataLayout &DL;
+
+ RemarkGenerator(const MapVector<Value *, MatrixTy> &Inst2Matrix,
+ OptimizationRemarkEmitter &ORE, Function &Func)
+ : Inst2Matrix(Inst2Matrix), ORE(ORE), Func(Func),
+ DL(Func.getParent()->getDataLayout()) {}
+
+ /// Return all leaves of the expressions in \p ExprsInSubprogram. Those are
+ /// instructions in Inst2Matrix returning void or without any users in
+ /// \p ExprsInSubprogram. Currently that should only include stores.
+ SmallVector<Value *, 4>
+ getExpressionLeaves(const SmallSetVector<Value *, 32> &ExprsInSubprogram) {
+ SmallVector<Value *, 4> Leaves;
+ for (auto *Expr : ExprsInSubprogram)
+ if (Expr->getType()->isVoidTy() ||
+ !any_of(Expr->users(), [&ExprsInSubprogram](User *U) {
+ return ExprsInSubprogram.count(U);
+ }))
+ Leaves.push_back(Expr);
+ return Leaves;
+ }
+
+ /// Recursively traverse expression \p V starting at \p Leaf and add \p Leaf
+ /// to all visited expressions in \p Shared. Limit the matrix operations to
+ /// the ones in \p ExprsInSubprogram.
+ void collectSharedInfo(Value *Leaf, Value *V,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) {
+
+ if (!ExprsInSubprogram.count(V))
+ return;
+
+ auto I = Shared.insert({V, {}});
+ I.first->second.insert(Leaf);
+
+ for (Value *Op : cast<Instruction>(V)->operand_values())
+ collectSharedInfo(Leaf, Op, ExprsInSubprogram, Shared);
+ return;
+ }
+
+ /// Calculate the number of exclusive and shared op counts for expression
+ /// starting at \p V. Expressions used multiple times are counted once.
+ /// Limit the matrix operations to the ones in \p ExprsInSubprogram.
+ std::pair<OpInfoTy, OpInfoTy>
+ sumOpInfos(Value *Root, SmallPtrSetImpl<Value *> &ReusedExprs,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared) const {
+ if (!ExprsInSubprogram.count(Root))
+ return {};
+
+ // Already counted this expression. Stop.
+ if (!ReusedExprs.insert(Root).second)
+ return {};
+
+ OpInfoTy SharedCount;
+ OpInfoTy Count;
+
+ auto I = Shared.find(Root);
+ auto CM = Inst2Matrix.find(Root);
+ if (I->second.size() == 1)
+ Count = CM->second.getOpInfo();
+ else
+ SharedCount = CM->second.getOpInfo();
+
+ for (Value *Op : cast<Instruction>(Root)->operand_values()) {
+ auto C = sumOpInfos(Op, ReusedExprs, ExprsInSubprogram, Shared);
+ Count += C.first;
+ SharedCount += C.second;
+ }
+ return {Count, SharedCount};
+ }
+
+ void emitRemarks() {
+ if (!ORE.allowExtraAnalysis(DEBUG_TYPE))
+ return;
+
+ // Map matrix operations to their containing subprograms by traversing
+ // the inlinedAt chain. If the function does not have a DISubprogram, we
+ // only map them to the containing function.
+ MapVector<DISubprogram *, SmallVector<Value *, 8>> Subprog2Exprs;
+ for (auto &KV : Inst2Matrix) {
+ if (Func.getSubprogram()) {
+ auto *I = cast<Instruction>(KV.first);
+ DILocation *Context = I->getDebugLoc();
+ while (Context) {
+ auto I =
+ Subprog2Exprs.insert({getSubprogram(Context->getScope()), {}});
+ I.first->second.push_back(KV.first);
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+ } else {
+ auto I = Subprog2Exprs.insert({nullptr, {}});
+ I.first->second.push_back(KV.first);
+ }
+ }
+ for (auto &KV : Subprog2Exprs) {
+ SmallSetVector<Value *, 32> ExprsInSubprogram(KV.second.begin(),
+ KV.second.end());
+ auto Leaves = getExpressionLeaves(ExprsInSubprogram);
+
+ DenseMap<Value *, SmallPtrSet<Value *, 2>> Shared;
+ for (Value *Leaf : Leaves)
+ collectSharedInfo(Leaf, Leaf, ExprsInSubprogram, Shared);
+
+ // Generate remarks for each leaf.
+ for (auto *L : Leaves) {
+
+ DebugLoc Loc = cast<Instruction>(L)->getDebugLoc();
+ DILocation *Context = cast<Instruction>(L)->getDebugLoc();
+ while (Context) {
+ if (getSubprogram(Context->getScope()) == KV.first) {
+ Loc = Context;
+ break;
+ }
+ Context = DebugLoc(Context).getInlinedAt();
+ }
+
+ SmallPtrSet<Value *, 8> ReusedExprs;
+ OpInfoTy Counts, SharedCounts;
+ std::tie(Counts, SharedCounts) =
+ sumOpInfos(L, ReusedExprs, ExprsInSubprogram, Shared);
+
+ OptimizationRemark Rem(DEBUG_TYPE, "matrix-lowered", Loc,
+ cast<Instruction>(L)->getParent());
+
+ Rem << "Lowered with ";
+ Rem << ore::NV("NumStores", Counts.NumStores) << " stores, "
+ << ore::NV("NumLoads", Counts.NumLoads) << " loads, "
+ << ore::NV("NumComputeOps", Counts.NumComputeOps)
+ << " compute ops";
+
+ if (SharedCounts.NumStores > 0 || SharedCounts.NumLoads > 0 ||
+ SharedCounts.NumComputeOps > 0) {
+ Rem << ",\nadditionally "
+ << ore::NV("NumStores", SharedCounts.NumStores) << " stores, "
+ << ore::NV("NumLoads", SharedCounts.NumLoads) << " loads, "
+ << ore::NV("NumFPOps", SharedCounts.NumComputeOps)
+ << " compute ops"
+ << " are shared with other expressions";
+ }
+
+ Rem << ("\n" + linearize(L, Shared, ExprsInSubprogram, DL));
+ ORE.emit(Rem);
+ }
+ }
+ }
+
+ std::string
+ linearize(Value *L,
+ const DenseMap<Value *, SmallPtrSet<Value *, 2>> &Shared,
+ const SmallSetVector<Value *, 32> &ExprsInSubprogram,
+ const DataLayout &DL) {
+ ExprLinearizer Lin(DL, Inst2Matrix, Shared, ExprsInSubprogram, L);
+ Lin.linearizeExpr(L, 0, false, false);
+ return Lin.getResult();
+ }
+ };
};
} // namespace
PreservedAnalyses LowerMatrixIntrinsicsPass::run(Function &F,
FunctionAnalysisManager &AM) {
auto &TTI = AM.getResult<TargetIRAnalysis>(F);
- LowerMatrixIntrinsics LMT(F, TTI);
+ auto &ORE = AM.getResult<OptimizationRemarkEmitterAnalysis>(F);
+ auto &AA = AM.getResult<AAManager>(F);
+ auto &DT = AM.getResult<DominatorTreeAnalysis>(F);
+ auto &LI = AM.getResult<LoopAnalysis>(F);
+
+ LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
if (LMT.Visit()) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
@@ -869,15 +1889,24 @@ public:
}
bool runOnFunction(Function &F) override {
- auto *TTI = &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
- LowerMatrixIntrinsics LMT(F, *TTI);
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &ORE = getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE();
+ auto &AA = getAnalysis<AAResultsWrapperPass>().getAAResults();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ LowerMatrixIntrinsics LMT(F, TTI, AA, DT, LI, ORE);
bool C = LMT.Visit();
return C;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.addRequired<TargetTransformInfoWrapperPass>();
- AU.setPreservesCFG();
+ AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<AAResultsWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
}
};
} // namespace
@@ -886,6 +1915,10 @@ static const char pass_name[] = "Lower the matrix intrinsics";
char LowerMatrixIntrinsicsLegacyPass::ID = 0;
INITIALIZE_PASS_BEGIN(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
false, false)
+INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
INITIALIZE_PASS_END(LowerMatrixIntrinsicsLegacyPass, DEBUG_TYPE, pass_name,
false, false)
diff --git a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
index c24fa40860eb..4b4196edc12b 100644
--- a/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
+++ b/llvm/lib/Transforms/Scalar/MemCpyOptimizer.cpp
@@ -27,7 +27,6 @@
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -173,8 +172,8 @@ public:
void addStore(int64_t OffsetFromFirst, StoreInst *SI) {
int64_t StoreSize = DL.getTypeStoreSize(SI->getOperand(0)->getType());
- addRange(OffsetFromFirst, StoreSize,
- SI->getPointerOperand(), SI->getAlignment(), SI);
+ addRange(OffsetFromFirst, StoreSize, SI->getPointerOperand(),
+ SI->getAlign().value(), SI);
}
void addMemSet(int64_t OffsetFromFirst, MemSetInst *MSI) {
@@ -387,13 +386,8 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
// Get the starting pointer of the block.
StartPtr = Range.StartPtr;
- // Determine alignment
- const Align Alignment = DL.getValueOrABITypeAlignment(
- MaybeAlign(Range.Alignment),
- cast<PointerType>(StartPtr->getType())->getElementType());
-
AMemSet = Builder.CreateMemSet(StartPtr, ByteVal, Range.End - Range.Start,
- Alignment);
+ MaybeAlign(Range.Alignment));
LLVM_DEBUG(dbgs() << "Replace stores:\n"; for (Instruction *SI
: Range.TheStores) dbgs()
<< *SI << '\n';
@@ -413,23 +407,6 @@ Instruction *MemCpyOptPass::tryMergingIntoMemset(Instruction *StartInst,
return AMemSet;
}
-static Align findStoreAlignment(const DataLayout &DL, const StoreInst *SI) {
- return DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()),
- SI->getOperand(0)->getType());
-}
-
-static Align findLoadAlignment(const DataLayout &DL, const LoadInst *LI) {
- return DL.getValueOrABITypeAlignment(MaybeAlign(LI->getAlignment()),
- LI->getType());
-}
-
-static Align findCommonAlignment(const DataLayout &DL, const StoreInst *SI,
- const LoadInst *LI) {
- Align StoreAlign = findStoreAlignment(DL, SI);
- Align LoadAlign = findLoadAlignment(DL, LI);
- return commonAlignment(StoreAlign, LoadAlign);
-}
-
// This method tries to lift a store instruction before position P.
// It will lift the store and its argument, plus anything
// that may alias with these.
@@ -585,12 +562,12 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
Instruction *M;
if (UseMemMove)
M = Builder.CreateMemMove(
- SI->getPointerOperand(), findStoreAlignment(DL, SI),
- LI->getPointerOperand(), findLoadAlignment(DL, LI), Size);
+ SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
else
M = Builder.CreateMemCpy(
- SI->getPointerOperand(), findStoreAlignment(DL, SI),
- LI->getPointerOperand(), findLoadAlignment(DL, LI), Size);
+ SI->getPointerOperand(), SI->getAlign(),
+ LI->getPointerOperand(), LI->getAlign(), Size);
LLVM_DEBUG(dbgs() << "Promoting " << *LI << " to " << *SI << " => "
<< *M << "\n");
@@ -642,7 +619,7 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
LI, SI->getPointerOperand()->stripPointerCasts(),
LI->getPointerOperand()->stripPointerCasts(),
DL.getTypeStoreSize(SI->getOperand(0)->getType()),
- findCommonAlignment(DL, SI, LI).value(), C);
+ commonAlignment(SI->getAlign(), LI->getAlign()), C);
if (changed) {
MD->removeInstruction(SI);
SI->eraseFromParent();
@@ -675,11 +652,9 @@ bool MemCpyOptPass::processStore(StoreInst *SI, BasicBlock::iterator &BBI) {
auto *T = V->getType();
if (T->isAggregateType()) {
uint64_t Size = DL.getTypeStoreSize(T);
- const Align MA =
- DL.getValueOrABITypeAlignment(MaybeAlign(SI->getAlignment()), T);
IRBuilder<> Builder(SI);
- auto *M =
- Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size, MA);
+ auto *M = Builder.CreateMemSet(SI->getPointerOperand(), ByteVal, Size,
+ SI->getAlign());
LLVM_DEBUG(dbgs() << "Promoting " << *SI << " to " << *M << "\n");
@@ -713,7 +688,7 @@ bool MemCpyOptPass::processMemSet(MemSetInst *MSI, BasicBlock::iterator &BBI) {
/// the call write its result directly into the destination of the memcpy.
bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
Value *cpySrc, uint64_t cpyLen,
- unsigned cpyAlign, CallInst *C) {
+ Align cpyAlign, CallInst *C) {
// The general transformation to keep in mind is
//
// call @func(..., src, ...)
@@ -733,10 +708,6 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (F->isIntrinsic() && F->getIntrinsicID() == Intrinsic::lifetime_start)
return false;
- // Deliberately get the source and destination with bitcasts stripped away,
- // because we'll need to do type comparisons based on the underlying type.
- CallSite CS(C);
-
// Require that src be an alloca. This simplifies the reasoning considerably.
AllocaInst *srcAlloca = dyn_cast<AllocaInst>(cpySrc);
if (!srcAlloca)
@@ -795,9 +766,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
}
// Check that dest points to memory that is at least as aligned as src.
- unsigned srcAlign = srcAlloca->getAlignment();
- if (!srcAlign)
- srcAlign = DL.getABITypeAlignment(srcAlloca->getAllocatedType());
+ Align srcAlign = srcAlloca->getAlign();
bool isDestSufficientlyAligned = srcAlign <= cpyAlign;
// If dest is not aligned enough and we can't increase its alignment then
// bail out.
@@ -836,8 +805,8 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// Check that src isn't captured by the called function since the
// transformation can cause aliasing issues in that case.
- for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
- if (CS.getArgument(i) == cpySrc && !CS.doesNotCapture(i))
+ for (unsigned ArgI = 0, E = C->arg_size(); ArgI != E; ++ArgI)
+ if (C->getArgOperand(ArgI) == cpySrc && !C->doesNotCapture(ArgI))
return false;
// Since we're changing the parameter to the callsite, we need to make sure
@@ -864,25 +833,26 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
if (cpySrc->getType()->getPointerAddressSpace() !=
cpyDest->getType()->getPointerAddressSpace())
return false;
- for (unsigned i = 0; i < CS.arg_size(); ++i)
- if (CS.getArgument(i)->stripPointerCasts() == cpySrc &&
+ for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
+ if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc &&
cpySrc->getType()->getPointerAddressSpace() !=
- CS.getArgument(i)->getType()->getPointerAddressSpace())
+ C->getArgOperand(ArgI)->getType()->getPointerAddressSpace())
return false;
// All the checks have passed, so do the transformation.
bool changedArgument = false;
- for (unsigned i = 0; i < CS.arg_size(); ++i)
- if (CS.getArgument(i)->stripPointerCasts() == cpySrc) {
+ for (unsigned ArgI = 0; ArgI < C->arg_size(); ++ArgI)
+ if (C->getArgOperand(ArgI)->stripPointerCasts() == cpySrc) {
Value *Dest = cpySrc->getType() == cpyDest->getType() ? cpyDest
: CastInst::CreatePointerCast(cpyDest, cpySrc->getType(),
cpyDest->getName(), C);
changedArgument = true;
- if (CS.getArgument(i)->getType() == Dest->getType())
- CS.setArgument(i, Dest);
+ if (C->getArgOperand(ArgI)->getType() == Dest->getType())
+ C->setArgOperand(ArgI, Dest);
else
- CS.setArgument(i, CastInst::CreatePointerCast(Dest,
- CS.getArgument(i)->getType(), Dest->getName(), C));
+ C->setArgOperand(ArgI, CastInst::CreatePointerCast(
+ Dest, C->getArgOperand(ArgI)->getType(),
+ Dest->getName(), C));
}
if (!changedArgument)
@@ -891,7 +861,7 @@ bool MemCpyOptPass::performCallSlotOptzn(Instruction *cpy, Value *cpyDest,
// If the destination wasn't sufficiently aligned then increase its alignment.
if (!isDestSufficientlyAligned) {
assert(isa<AllocaInst>(cpyDest) && "Can only increase alloca alignment!");
- cast<AllocaInst>(cpyDest)->setAlignment(MaybeAlign(srcAlign));
+ cast<AllocaInst>(cpyDest)->setAlignment(srcAlign);
}
// Drop any cached information about the call, because we may have changed
@@ -1127,15 +1097,16 @@ bool MemCpyOptPass::performMemCpyToMemSetOptzn(MemCpyInst *MemCpy,
/// B to be a memcpy from X to Z (or potentially a memmove, depending on
/// circumstances). This allows later passes to remove the first memcpy
/// altogether.
-bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
+bool MemCpyOptPass::processMemCpy(MemCpyInst *M, BasicBlock::iterator &BBI) {
// We can only optimize non-volatile memcpy's.
if (M->isVolatile()) return false;
// If the source and destination of the memcpy are the same, then zap it.
if (M->getSource() == M->getDest()) {
+ ++BBI;
MD->removeInstruction(M);
M->eraseFromParent();
- return false;
+ return true;
}
// If copying from a constant, try to turn the memcpy into a memset.
@@ -1176,10 +1147,10 @@ bool MemCpyOptPass::processMemCpy(MemCpyInst *M) {
if (CallInst *C = dyn_cast<CallInst>(DepInfo.getInst())) {
// FIXME: Can we pass in either of dest/src alignment here instead
// of conservatively taking the minimum?
- unsigned Align = MinAlign(M->getDestAlignment(), M->getSourceAlignment());
+ Align Alignment = std::min(M->getDestAlign().valueOrOne(),
+ M->getSourceAlign().valueOrOne());
if (performCallSlotOptzn(M, M->getDest(), M->getSource(),
- CopySize->getZExtValue(), Align,
- C)) {
+ CopySize->getZExtValue(), Alignment, C)) {
MD->removeInstruction(M);
M->eraseFromParent();
return true;
@@ -1247,15 +1218,15 @@ bool MemCpyOptPass::processMemMove(MemMoveInst *M) {
}
/// This is called on every byval argument in call sites.
-bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
- const DataLayout &DL = CS.getCaller()->getParent()->getDataLayout();
+bool MemCpyOptPass::processByValArgument(CallBase &CB, unsigned ArgNo) {
+ const DataLayout &DL = CB.getCaller()->getParent()->getDataLayout();
// Find out what feeds this byval argument.
- Value *ByValArg = CS.getArgument(ArgNo);
+ Value *ByValArg = CB.getArgOperand(ArgNo);
Type *ByValTy = cast<PointerType>(ByValArg->getType())->getElementType();
uint64_t ByValSize = DL.getTypeAllocSize(ByValTy);
MemDepResult DepInfo = MD->getPointerDependencyFrom(
MemoryLocation(ByValArg, LocationSize::precise(ByValSize)), true,
- CS.getInstruction()->getIterator(), CS.getInstruction()->getParent());
+ CB.getIterator(), CB.getParent());
if (!DepInfo.isClobber())
return false;
@@ -1274,16 +1245,17 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// Get the alignment of the byval. If the call doesn't specify the alignment,
// then it is some target specific value that we can't know.
- unsigned ByValAlign = CS.getParamAlignment(ArgNo);
- if (ByValAlign == 0) return false;
+ MaybeAlign ByValAlign = CB.getParamAlign(ArgNo);
+ if (!ByValAlign) return false;
// If it is greater than the memcpy, then we check to see if we can force the
// source of the memcpy to the alignment we need. If we fail, we bail out.
AssumptionCache &AC = LookupAssumptionCache();
DominatorTree &DT = LookupDomTree();
- if (MDep->getSourceAlignment() < ByValAlign &&
- getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL,
- CS.getInstruction(), &AC, &DT) < ByValAlign)
+ MaybeAlign MemDepAlign = MDep->getSourceAlign();
+ if ((!MemDepAlign || *MemDepAlign < *ByValAlign) &&
+ getOrEnforceKnownAlignment(MDep->getSource(), ByValAlign, DL, &CB, &AC,
+ &DT) < *ByValAlign)
return false;
// The address space of the memcpy source must match the byval argument
@@ -1302,21 +1274,25 @@ bool MemCpyOptPass::processByValArgument(CallSite CS, unsigned ArgNo) {
// not just the defining memcpy.
MemDepResult SourceDep = MD->getPointerDependencyFrom(
MemoryLocation::getForSource(MDep), false,
- CS.getInstruction()->getIterator(), MDep->getParent());
+ CB.getIterator(), MDep->getParent());
if (!SourceDep.isClobber() || SourceDep.getInst() != MDep)
return false;
Value *TmpCast = MDep->getSource();
- if (MDep->getSource()->getType() != ByValArg->getType())
- TmpCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
- "tmpcast", CS.getInstruction());
+ if (MDep->getSource()->getType() != ByValArg->getType()) {
+ BitCastInst *TmpBitCast = new BitCastInst(MDep->getSource(), ByValArg->getType(),
+ "tmpcast", &CB);
+ // Set the tmpcast's DebugLoc to MDep's
+ TmpBitCast->setDebugLoc(MDep->getDebugLoc());
+ TmpCast = TmpBitCast;
+ }
LLVM_DEBUG(dbgs() << "MemCpyOptPass: Forwarding memcpy to byval:\n"
<< " " << *MDep << "\n"
- << " " << *CS.getInstruction() << "\n");
+ << " " << CB << "\n");
// Otherwise we're good! Update the byval argument.
- CS.setArgument(ArgNo, TmpCast);
+ CB.setArgOperand(ArgNo, TmpCast);
++NumMemCpyInstr;
return true;
}
@@ -1347,13 +1323,13 @@ bool MemCpyOptPass::iterateOnFunction(Function &F) {
else if (MemSetInst *M = dyn_cast<MemSetInst>(I))
RepeatInstruction = processMemSet(M, BI);
else if (MemCpyInst *M = dyn_cast<MemCpyInst>(I))
- RepeatInstruction = processMemCpy(M);
+ RepeatInstruction = processMemCpy(M, BI);
else if (MemMoveInst *M = dyn_cast<MemMoveInst>(I))
RepeatInstruction = processMemMove(M);
- else if (auto CS = CallSite(I)) {
- for (unsigned i = 0, e = CS.arg_size(); i != e; ++i)
- if (CS.isByValArgument(i))
- MadeChange |= processByValArgument(CS, i);
+ else if (auto *CB = dyn_cast<CallBase>(I)) {
+ for (unsigned i = 0, e = CB->arg_size(); i != e; ++i)
+ if (CB->isByValArgument(i))
+ MadeChange |= processByValArgument(*CB, i);
}
// Reprocess the instruction if desired.
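// --- Illustrative sketch, not part of the diff ---
// A minimal example of the CallBase-based byval scan the MemCpyOpt hunks above
// move to, replacing the removed llvm::CallSite wrapper. Assumes LLVM ~11-era
// headers; forEachByValArg and Visit are hypothetical names, not upstream API.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
static void forEachByValArg(Function &F,
                            function_ref<void(CallBase &, unsigned)> Visit) {
  for (Instruction &I : instructions(F))
    if (auto *CB = dyn_cast<CallBase>(&I))   // replaces the CallSite(I) test
      for (unsigned ArgNo = 0, E = CB->arg_size(); ArgNo != E; ++ArgNo)
        if (CB->isByValArgument(ArgNo))      // same predicate as before
          Visit(*CB, ArgNo);                 // e.g. processByValArgument
}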
diff --git a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
index 6b0d0202d9bb..69aa0cebe170 100644
--- a/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
+++ b/llvm/lib/Transforms/Scalar/MergedLoadStoreMotion.cpp
@@ -354,15 +354,11 @@ bool MergedLoadStoreMotion::run(Function &F, AliasAnalysis &AA) {
// optimization opportunities.
// This loop doesn't care about newly inserted/split blocks
// since they never will be diamond heads.
- for (Function::iterator FI = F.begin(), FE = F.end(); FI != FE;) {
- BasicBlock *BB = &*FI++;
-
+ for (BasicBlock &BB : make_early_inc_range(F))
// Hoist equivalent loads and sink stores
// outside diamonds when possible
- if (isDiamondHead(BB)) {
- Changed |= mergeStores(BB);
- }
- }
+ if (isDiamondHead(&BB))
+ Changed |= mergeStores(&BB);
return Changed;
}
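// --- Illustrative sketch, not part of the diff ---
// The make_early_inc_range idiom adopted above: the range advances its iterator
// before the body runs, so the current element may be erased safely. Assumes
// LLVM ~11-era headers; IsDead is a hypothetical predicate, not upstream API.
#include "llvm/ADT/STLExtras.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;
static void eraseMatching(BasicBlock &BB,
                          function_ref<bool(Instruction &)> IsDead) {
  for (Instruction &I : make_early_inc_range(BB))
    if (IsDead(I))
      I.eraseFromParent(); // safe: the range has already stepped past I
}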
diff --git a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
index bba9082e31b2..4e010f8704d0 100644
--- a/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/NaryReassociate.cpp
@@ -213,7 +213,7 @@ bool NaryReassociatePass::runImpl(Function &F, AssumptionCache *AC_,
return Changed;
}
-// Whitelist the instruction types NaryReassociate handles for now.
+// Explicitly list the instruction types NaryReassociate handles for now.
static bool isPotentiallyNaryReassociable(Instruction *I) {
switch (I->getOpcode()) {
case Instruction::Add:
diff --git a/llvm/lib/Transforms/Scalar/NewGVN.cpp b/llvm/lib/Transforms/Scalar/NewGVN.cpp
index 6a643480f312..0ed1773373a7 100644
--- a/llvm/lib/Transforms/Scalar/NewGVN.cpp
+++ b/llvm/lib/Transforms/Scalar/NewGVN.cpp
@@ -106,6 +106,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Scalar/GVNExpression.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/PredicateInfo.h"
#include "llvm/Transforms/Utils/VNCoercion.h"
@@ -495,6 +496,7 @@ class NewGVN {
AliasAnalysis *AA = nullptr;
MemorySSA *MSSA = nullptr;
MemorySSAWalker *MSSAWalker = nullptr;
+ AssumptionCache *AC = nullptr;
const DataLayout &DL;
std::unique_ptr<PredicateInfo> PredInfo;
@@ -658,7 +660,7 @@ public:
NewGVN(Function &F, DominatorTree *DT, AssumptionCache *AC,
TargetLibraryInfo *TLI, AliasAnalysis *AA, MemorySSA *MSSA,
const DataLayout &DL)
- : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), DL(DL),
+ : F(F), DT(DT), TLI(TLI), AA(AA), MSSA(MSSA), AC(AC), DL(DL),
PredInfo(std::make_unique<PredicateInfo>(F, *DT, *AC)),
SQ(DL, TLI, DT, AC, /*CtxI=*/nullptr, /*UseInstrInfo=*/false) {}
@@ -898,7 +900,7 @@ bool NewGVN::isBackedge(BasicBlock *From, BasicBlock *To) const {
#ifndef NDEBUG
static std::string getBlockName(const BasicBlock *B) {
- return DOTGraphTraits<const Function *>::getSimpleNodeLabel(B, nullptr);
+ return DOTGraphTraits<DOTFuncInfo *>::getSimpleNodeLabel(B, nullptr);
}
#endif
@@ -1334,8 +1336,6 @@ LoadExpression *NewGVN::createLoadExpression(Type *LoadType, Value *PointerOp,
// Give store and loads same opcode so they value number together.
E->setOpcode(0);
E->op_push_back(PointerOp);
- if (LI)
- E->setAlignment(MaybeAlign(LI->getAlignment()));
// TODO: Value number heap versions. We may be able to discover
// things alias analysis can't on its own (i.e. that a store and a
@@ -1470,7 +1470,8 @@ NewGVN::performSymbolicLoadCoercion(Type *LoadType, Value *LoadPtr,
// undef value. This can happen when loading for a fresh allocation with no
// intervening stores, for example. Note that this is only true in the case
// that the result of the allocation is pointer equal to the load ptr.
- if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI)) {
+ if (isa<AllocaInst>(DepInst) || isMallocLikeFn(DepInst, TLI) ||
+ isAlignedAllocLikeFn(DepInst, TLI)) {
return createConstantExpression(UndefValue::get(LoadType));
}
// If this load occurs either right after a lifetime begin,
@@ -2030,10 +2031,12 @@ NewGVN::performSymbolicEvaluation(Value *V,
case Instruction::Select:
case Instruction::ExtractElement:
case Instruction::InsertElement:
- case Instruction::ShuffleVector:
case Instruction::GetElementPtr:
E = createExpression(I);
break;
+ case Instruction::ShuffleVector:
+ // FIXME: Add support for shufflevector to createExpression.
+ return nullptr;
default:
return nullptr;
}
@@ -3433,7 +3436,7 @@ bool NewGVN::runGVN() {
// Sort dominator tree children arrays into RPO.
for (auto &B : RPOT) {
auto *Node = DT->getNode(B);
- if (Node->getChildren().size() > 1)
+ if (Node->getNumChildren() > 1)
llvm::sort(Node->begin(), Node->end(),
[&](const DomTreeNode *A, const DomTreeNode *B) {
return RPOOrdering[A] < RPOOrdering[B];
@@ -3693,6 +3696,7 @@ void NewGVN::deleteInstructionsInBlock(BasicBlock *BB) {
Inst.replaceAllUsesWith(UndefValue::get(Inst.getType()));
if (isa<LandingPadInst>(Inst))
continue;
+ salvageKnowledge(&Inst, AC);
Inst.eraseFromParent();
++NumGVNInstrDeleted;
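// --- Illustrative sketch, not part of the diff ---
// The salvageKnowledge-before-erase pattern the NewGVN hunk above adds: before
// an instruction is deleted, knowledge carried by its attributes can be kept
// alive as an llvm.assume operand bundle. Assumes LLVM ~11-era headers;
// deleteTriviallyDead is a hypothetical helper, not upstream API.
#include "llvm/ADT/STLExtras.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Local.h"
using namespace llvm;
static unsigned deleteTriviallyDead(BasicBlock &BB, AssumptionCache *AC) {
  unsigned NumDeleted = 0;
  for (Instruction &I : make_early_inc_range(BB)) {
    if (!isInstructionTriviallyDead(&I))
      continue;
    salvageKnowledge(&I, AC); // may emit an llvm.assume capturing I's facts
    I.eraseFromParent();
    ++NumDeleted;
  }
  return NumDeleted;
}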
diff --git a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
index 5c4a89977c38..4553b23532f2 100644
--- a/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
+++ b/llvm/lib/Transforms/Scalar/PlaceSafepoints.cpp
@@ -189,7 +189,8 @@ static bool needsStatepoint(CallBase *Call, const TargetLibraryInfo &TLI) {
return false;
}
- return !(isStatepoint(Call) || isGCRelocate(Call) || isGCResult(Call));
+ return !(isa<GCStatepointInst>(Call) || isa<GCRelocateInst>(Call) ||
+ isa<GCResultInst>(Call));
}
/// Returns true if this loop is known to contain a call safepoint which
@@ -650,7 +651,7 @@ InsertSafepointPoll(Instruction *InsertBefore,
// Do the actual inlining
InlineFunctionInfo IFI;
- bool InlineStatus = InlineFunction(PollCall, IFI);
+ bool InlineStatus = InlineFunction(*PollCall, IFI).isSuccess();
assert(InlineStatus && "inline must succeed");
(void)InlineStatus; // suppress warning in release-asserts
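// --- Illustrative sketch, not part of the diff ---
// The InlineFunction API shape the PlaceSafepoints hunk above adapts to: the
// call is passed by reference and the result is an InlineResult rather than a
// bool. Assumes LLVM ~11-era headers; tryInlineCall is a hypothetical helper.
#include "llvm/Analysis/InlineCost.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Cloning.h"
using namespace llvm;
static bool tryInlineCall(CallBase &CB) {
  InlineFunctionInfo IFI;                     // no extra analyses wired in
  InlineResult IR = InlineFunction(CB, IFI);  // old form took a CallSite
  if (!IR.isSuccess()) {
    // getFailureReason() returns a static string saying why it was refused.
    errs() << "not inlined: " << IR.getFailureReason() << "\n";
    return false;
  }
  return true;
}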
diff --git a/llvm/lib/Transforms/Scalar/Reassociate.cpp b/llvm/lib/Transforms/Scalar/Reassociate.cpp
index 41940e980faa..ba7f367267fe 100644
--- a/llvm/lib/Transforms/Scalar/Reassociate.cpp
+++ b/llvm/lib/Transforms/Scalar/Reassociate.cpp
@@ -29,6 +29,7 @@
#include "llvm/ADT/SmallSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Argument.h"
@@ -254,15 +255,15 @@ static BinaryOperator *CreateMul(Value *S1, Value *S2, const Twine &Name,
}
}
-static BinaryOperator *CreateNeg(Value *S1, const Twine &Name,
- Instruction *InsertBefore, Value *FlagsOp) {
+static Instruction *CreateNeg(Value *S1, const Twine &Name,
+ Instruction *InsertBefore, Value *FlagsOp) {
if (S1->getType()->isIntOrIntVectorTy())
return BinaryOperator::CreateNeg(S1, Name, InsertBefore);
- else {
- BinaryOperator *Res = BinaryOperator::CreateFNeg(S1, Name, InsertBefore);
- Res->setFastMathFlags(cast<FPMathOperator>(FlagsOp)->getFastMathFlags());
- return Res;
- }
+
+ if (auto *FMFSource = dyn_cast<Instruction>(FlagsOp))
+ return UnaryOperator::CreateFNegFMF(S1, FMFSource, Name, InsertBefore);
+
+ return UnaryOperator::CreateFNeg(S1, Name, InsertBefore);
}
/// Replace 0-X with X*-1.
@@ -914,7 +915,7 @@ static Value *NegateValue(Value *V, Instruction *BI,
// Insert a 'neg' instruction that subtracts the value from zero to get the
// negation.
- BinaryOperator *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
+ Instruction *NewNeg = CreateNeg(V, V->getName() + ".neg", BI, BI);
ToRedo.insert(NewNeg);
return NewNeg;
}
@@ -975,7 +976,8 @@ static BinaryOperator *BreakUpSubtract(Instruction *Sub,
/// this into a multiply by a constant to assist with further reassociation.
static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
Constant *MulCst = ConstantInt::get(Shl->getType(), 1);
- MulCst = ConstantExpr::getShl(MulCst, cast<Constant>(Shl->getOperand(1)));
+ auto *SA = cast<ConstantInt>(Shl->getOperand(1));
+ MulCst = ConstantExpr::getShl(MulCst, SA);
BinaryOperator *Mul =
BinaryOperator::CreateMul(Shl->getOperand(0), MulCst, "", Shl);
@@ -988,10 +990,12 @@ static BinaryOperator *ConvertShiftToMul(Instruction *Shl) {
// We can safely preserve the nuw flag in all cases. It's also safe to turn a
// nuw nsw shl into a nuw nsw mul. However, nsw in isolation requires special
- // handling.
+ // handling. It can be preserved as long as we're not left shifting by
+ // bitwidth - 1.
bool NSW = cast<BinaryOperator>(Shl)->hasNoSignedWrap();
bool NUW = cast<BinaryOperator>(Shl)->hasNoUnsignedWrap();
- if (NSW && NUW)
+ unsigned BitWidth = Shl->getType()->getIntegerBitWidth();
+ if (NSW && (NUW || SA->getValue().ult(BitWidth - 1)))
Mul->setHasNoSignedWrap(true);
Mul->setHasNoUnsignedWrap(NUW);
return Mul;
@@ -1076,7 +1080,7 @@ Value *ReassociatePass::RemoveFactorFromExpression(Value *V, Value *Factor) {
const APFloat &F1 = FC1->getValueAPF();
APFloat F2(FC2->getValueAPF());
F2.changeSign();
- if (F1.compare(F2) == APFloat::cmpEqual) {
+ if (F1 == F2) {
FoundFactor = NeedsNegate = true;
Factors.erase(Factors.begin() + i);
break;
@@ -1721,7 +1725,7 @@ static bool collectMultiplyFactors(SmallVectorImpl<ValueEntry> &Ops,
}
/// Build a tree of multiplies, computing the product of Ops.
-static Value *buildMultiplyTree(IRBuilder<> &Builder,
+static Value *buildMultiplyTree(IRBuilderBase &Builder,
SmallVectorImpl<Value*> &Ops) {
if (Ops.size() == 1)
return Ops.back();
@@ -1744,7 +1748,7 @@ static Value *buildMultiplyTree(IRBuilder<> &Builder,
/// DAG of multiplies to compute the final product, and return that product
/// value.
Value *
-ReassociatePass::buildMinimalMultiplyDAG(IRBuilder<> &Builder,
+ReassociatePass::buildMinimalMultiplyDAG(IRBuilderBase &Builder,
SmallVectorImpl<Factor> &Factors) {
assert(Factors[0].Power);
SmallVector<Value *, 4> OuterProduct;
@@ -1899,7 +1903,7 @@ void ReassociatePass::RecursivelyEraseDeadInsts(Instruction *I,
ValueRankMap.erase(I);
Insts.remove(I);
RedoInsts.remove(I);
- llvm::salvageDebugInfoOrMarkUndef(*I);
+ llvm::salvageDebugInfo(*I);
I->eraseFromParent();
for (auto Op : Ops)
if (Instruction *OpInst = dyn_cast<Instruction>(Op))
@@ -1916,7 +1920,7 @@ void ReassociatePass::EraseInst(Instruction *I) {
// Erase the dead instruction.
ValueRankMap.erase(I);
RedoInsts.remove(I);
- llvm::salvageDebugInfoOrMarkUndef(*I);
+ llvm::salvageDebugInfo(*I);
I->eraseFromParent();
// Optimize its operands.
SmallPtrSet<Instruction *, 8> Visited; // Detect self-referential nodes.
@@ -2457,6 +2461,8 @@ PreservedAnalyses ReassociatePass::run(Function &F, FunctionAnalysisManager &) {
if (MadeChange) {
PreservedAnalyses PA;
PA.preserveSet<CFGAnalyses>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
return PA;
}
@@ -2487,6 +2493,8 @@ namespace {
void getAnalysisUsage(AnalysisUsage &AU) const override {
AU.setPreservesCFG();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
AU.addPreserved<GlobalsAAWrapperPass>();
}
};
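// --- Illustrative sketch, not part of the diff ---
// Worked example of the nsw rule ConvertShiftToMul adopts above: X << C equals
// X * (1 << C), but nsw only carries over when C < BitWidth - 1. For i8,
// `shl nsw i8 -1, 7` is defined (-1 * 2^7 = -128 is representable), while the
// equivalent `mul i8 -1, -128` mathematically yields +128, which wraps, so the
// mul must not be tagged nsw. Assumes LLVM ~11; mulMayKeepNSW is hypothetical.
#include "llvm/IR/Constants.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;
static bool mulMayKeepNSW(BinaryOperator *Shl) {
  auto *SA = cast<ConstantInt>(Shl->getOperand(1)); // constant shift amount C
  unsigned BitWidth = Shl->getType()->getIntegerBitWidth();
  bool NSW = Shl->hasNoSignedWrap();
  bool NUW = Shl->hasNoUnsignedWrap();
  // nuw+nsw is always safe; nsw alone needs C strictly below BitWidth - 1.
  return NSW && (NUW || SA->getValue().ult(BitWidth - 1));
}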
diff --git a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
index b242f100faff..dc2ad14ae61e 100644
--- a/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
+++ b/llvm/lib/Transforms/Scalar/RewriteStatepointsForGC.cpp
@@ -271,7 +271,7 @@ struct PartiallyConstructedSafepointRecord {
/// The *new* gc.statepoint instruction itself. This produces the token
/// that normal path gc.relocates and the gc.result are tied to.
- Instruction *StatepointToken;
+ GCStatepointInst *StatepointToken;
/// Instruction to which exceptional gc relocates are attached
/// Makes it easier to iterate through them during relocationViaAlloca.
@@ -381,14 +381,19 @@ static void analyzeParsePointLiveness(
dbgs() << " " << V->getName() << " " << *V << "\n";
}
if (PrintLiveSetSize) {
- dbgs() << "Safepoint For: " << Call->getCalledValue()->getName() << "\n";
+ dbgs() << "Safepoint For: " << Call->getCalledOperand()->getName() << "\n";
dbgs() << "Number live values: " << LiveSet.size() << "\n";
}
Result.LiveSet = LiveSet;
}
+// Returns true if V is a knownBaseResult.
static bool isKnownBaseResult(Value *V);
+// Returns true if V is a BaseResult that already exists in the IR, i.e. it is
+// not created by the findBasePointers algorithm.
+static bool isOriginalBaseResult(Value *V);
+
namespace {
/// A single base defining value - An immediate base defining value for an
@@ -633,15 +638,20 @@ static Value *findBaseOrBDV(Value *I, DefiningValueMapTy &Cache) {
return Def;
}
+/// This value is a base pointer that is not generated by RS4GC, i.e. it already
+/// exists in the code.
+static bool isOriginalBaseResult(Value *V) {
+ // no recursion possible
+ return !isa<PHINode>(V) && !isa<SelectInst>(V) &&
+ !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
+ !isa<ShuffleVectorInst>(V);
+}
+
/// Given the result of a call to findBaseDefiningValue, or findBaseOrBDV,
/// is it known to be a base pointer? Or do we need to continue searching.
static bool isKnownBaseResult(Value *V) {
- if (!isa<PHINode>(V) && !isa<SelectInst>(V) &&
- !isa<ExtractElementInst>(V) && !isa<InsertElementInst>(V) &&
- !isa<ShuffleVectorInst>(V)) {
- // no recursion possible
+ if (isOriginalBaseResult(V))
return true;
- }
if (isa<Instruction>(V) &&
cast<Instruction>(V)->getMetadata("is_base_value")) {
// This is a previously inserted base phi or select. We know
@@ -653,6 +663,12 @@ static bool isKnownBaseResult(Value *V) {
return false;
}
+// Returns true if First and Second values are both scalar or both vector.
+static bool areBothVectorOrScalar(Value *First, Value *Second) {
+ return isa<VectorType>(First->getType()) ==
+ isa<VectorType>(Second->getType());
+}
+
namespace {
/// Models the state of a single base defining value in the findBasePointer
@@ -762,7 +778,7 @@ static BDVState meetBDVState(const BDVState &LHS, const BDVState &RHS) {
static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
Value *Def = findBaseOrBDV(I, Cache);
- if (isKnownBaseResult(Def))
+ if (isKnownBaseResult(Def) && areBothVectorOrScalar(Def, I))
return Def;
// Here's the rough algorithm:
@@ -810,13 +826,16 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
States.insert({Def, BDVState()});
while (!Worklist.empty()) {
Value *Current = Worklist.pop_back_val();
- assert(!isKnownBaseResult(Current) && "why did it get added?");
+ assert(!isOriginalBaseResult(Current) && "why did it get added?");
auto visitIncomingValue = [&](Value *InVal) {
Value *Base = findBaseOrBDV(InVal, Cache);
- if (isKnownBaseResult(Base))
+ if (isKnownBaseResult(Base) && areBothVectorOrScalar(Base, InVal))
// Known bases won't need new instructions introduced and can be
- // ignored safely
+ // ignored safely. However, this can only be done when InVal and Base
+ // are both scalar or both vector. Otherwise, we need to find a
+ // correct BDV for InVal, by creating an entry in the lattice
+ // (States).
return;
assert(isExpectedBDVType(Base) && "the only non-base values "
"we see should be base defining values");
@@ -853,10 +872,10 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
// Return a phi state for a base defining value. We'll generate a new
// base state for known bases and expect to find a cached state otherwise.
- auto getStateForBDV = [&](Value *baseValue) {
- if (isKnownBaseResult(baseValue))
- return BDVState(baseValue);
- auto I = States.find(baseValue);
+ auto GetStateForBDV = [&](Value *BaseValue, Value *Input) {
+ if (isKnownBaseResult(BaseValue) && areBothVectorOrScalar(BaseValue, Input))
+ return BDVState(BaseValue);
+ auto I = States.find(BaseValue);
assert(I != States.end() && "lookup failed!");
return I->second;
};
@@ -873,13 +892,18 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
// much faster.
for (auto Pair : States) {
Value *BDV = Pair.first;
- assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) ||
+ !areBothVectorOrScalar(BDV, Pair.second.getBaseValue())) &&
+ "why did it get added?");
// Given an input value for the current instruction, return a BDVState
// instance which represents the BDV of that value.
auto getStateForInput = [&](Value *V) mutable {
Value *BDV = findBaseOrBDV(V, Cache);
- return getStateForBDV(BDV);
+ return GetStateForBDV(BDV, V);
};
BDVState NewState;
@@ -926,20 +950,26 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
}
#endif
- // Insert Phis for all conflicts
- // TODO: adjust naming patterns to avoid this order of iteration dependency
+ // Handle all instructions that have a vector BDV, but the instruction itself
+ // is of scalar type.
for (auto Pair : States) {
Instruction *I = cast<Instruction>(Pair.first);
BDVState State = Pair.second;
- assert(!isKnownBaseResult(I) && "why did it get added?");
+ auto *BaseValue = State.getBaseValue();
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, BaseValue)) &&
+ "why did it get added?");
assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
+ if (!State.isBase() || !isa<VectorType>(BaseValue->getType()))
+ continue;
// extractelement instructions are a bit special in that we may need to
// insert an extract even when we know an exact base for the instruction.
// The problem is that we need to convert from a vector base to a scalar
// base for the particular index we're interested in.
- if (State.isBase() && isa<ExtractElementInst>(I) &&
- isa<VectorType>(State.getBaseValue()->getType())) {
+ if (isa<ExtractElementInst>(I)) {
auto *EE = cast<ExtractElementInst>(I);
// TODO: In many cases, the new instruction is just EE itself. We should
// exploit this, but can't do it here since it would break the invariant
@@ -948,7 +978,27 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
State.getBaseValue(), EE->getIndexOperand(), "base_ee", EE);
BaseInst->setMetadata("is_base_value", MDNode::get(I->getContext(), {}));
States[I] = BDVState(BDVState::Base, BaseInst);
+ } else if (!isa<VectorType>(I->getType())) {
+ // We need to handle cases that have a vector base but the instruction is
+ // a scalar type (these could be phis or selects or any instructions that
+ // are of scalar type, but the base can be a vector type). We
+ // conservatively set this as conflict. Setting the base value for these
+ // conflicts is handled in the next loop which traverses States.
+ States[I] = BDVState(BDVState::Conflict);
}
+ }
+
+ // Insert Phis for all conflicts
+ // TODO: adjust naming patterns to avoid this order of iteration dependency
+ for (auto Pair : States) {
+ Instruction *I = cast<Instruction>(Pair.first);
+ BDVState State = Pair.second;
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(I) || !areBothVectorOrScalar(I, State.getBaseValue())) &&
+ "why did it get added?");
+ assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
// Since we're joining a vector and scalar base, they can never be the
// same. As a result, we should always see insert element having reached
@@ -987,7 +1037,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
auto *SV = cast<ShuffleVectorInst>(I);
UndefValue *VecUndef = UndefValue::get(SV->getOperand(0)->getType());
std::string Name = suffixed_name_or(I, ".base", "base_sv");
- return new ShuffleVectorInst(VecUndef, VecUndef, SV->getOperand(2),
+ return new ShuffleVectorInst(VecUndef, VecUndef, SV->getShuffleMask(),
Name, SV);
}
};
@@ -1008,7 +1058,7 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
auto getBaseForInput = [&](Value *Input, Instruction *InsertPt) {
Value *BDV = findBaseOrBDV(Input, Cache);
Value *Base = nullptr;
- if (isKnownBaseResult(BDV)) {
+ if (isKnownBaseResult(BDV) && areBothVectorOrScalar(BDV, Input)) {
Base = BDV;
} else {
// Either conflict or base.
@@ -1029,7 +1079,12 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
Instruction *BDV = cast<Instruction>(Pair.first);
BDVState State = Pair.second;
- assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) ||
+ !areBothVectorOrScalar(BDV, State.getBaseValue())) &&
+ "why did it get added?");
assert(!State.isUnknown() && "Optimistic algorithm didn't complete!");
if (!State.isConflict())
continue;
@@ -1119,7 +1174,11 @@ static Value *findBasePointer(Value *I, DefiningValueMapTy &Cache) {
auto *BDV = Pair.first;
Value *Base = Pair.second.getBaseValue();
assert(BDV && Base);
- assert(!isKnownBaseResult(BDV) && "why did it get added?");
+ // Only values that do not have known bases or those that have differing
+ // type (scalar versus vector) from a possible known base should be in the
+ // lattice.
+ assert((!isKnownBaseResult(BDV) || !areBothVectorOrScalar(BDV, Base)) &&
+ "why did it get added?");
LLVM_DEBUG(
dbgs() << "Updating base value cache"
@@ -1238,7 +1297,8 @@ normalizeForInvokeSafepoint(BasicBlock *BB, BasicBlock *InvokeParent,
// Create new attribute set containing only attributes which can be transferred
// from original call to the safepoint.
-static AttributeList legalizeCallAttributes(AttributeList AL) {
+static AttributeList legalizeCallAttributes(LLVMContext &Ctx,
+ AttributeList AL) {
if (AL.isEmpty())
return AL;
@@ -1252,7 +1312,6 @@ static AttributeList legalizeCallAttributes(AttributeList AL) {
}
// Just skip parameter and return attributes for now
- LLVMContext &Ctx = AL.getContext();
return AttributeList::get(Ctx, AttributeList::FunctionIndex,
AttributeSet::get(Ctx, FnAttrs));
}
@@ -1261,16 +1320,14 @@ static AttributeList legalizeCallAttributes(AttributeList AL) {
/// statepoint.
/// Inputs:
/// liveVariables - list of variables to be relocated.
-/// liveStart - index of the first live variable.
/// basePtrs - base pointers.
/// statepointToken - statepoint instruction to which relocates should be
/// bound.
/// Builder - LLVM IR builder to be used to construct new calls.
static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
- const int LiveStart,
ArrayRef<Value *> BasePtrs,
Instruction *StatepointToken,
- IRBuilder<> Builder) {
+ IRBuilder<> &Builder) {
if (LiveVariables.empty())
return;
@@ -1295,7 +1352,8 @@ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
auto AS = Ty->getScalarType()->getPointerAddressSpace();
Type *NewTy = Type::getInt8PtrTy(M->getContext(), AS);
if (auto *VT = dyn_cast<VectorType>(Ty))
- NewTy = VectorType::get(NewTy, VT->getNumElements());
+ NewTy = FixedVectorType::get(NewTy,
+ cast<FixedVectorType>(VT)->getNumElements());
return Intrinsic::getDeclaration(M, Intrinsic::experimental_gc_relocate,
{NewTy});
};
@@ -1307,9 +1365,8 @@ static void CreateGCRelocates(ArrayRef<Value *> LiveVariables,
for (unsigned i = 0; i < LiveVariables.size(); i++) {
// Generate the gc.relocate call and save the result
- Value *BaseIdx =
- Builder.getInt32(LiveStart + FindIndex(LiveVariables, BasePtrs[i]));
- Value *LiveIdx = Builder.getInt32(LiveStart + i);
+ Value *BaseIdx = Builder.getInt32(FindIndex(LiveVariables, BasePtrs[i]));
+ Value *LiveIdx = Builder.getInt32(i);
Type *Ty = LiveVariables[i]->getType();
if (!TypeToDeclMap.count(Ty))
@@ -1431,12 +1488,14 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
uint32_t Flags = uint32_t(StatepointFlags::None);
ArrayRef<Use> CallArgs(Call->arg_begin(), Call->arg_end());
- ArrayRef<Use> DeoptArgs = GetDeoptBundleOperands(Call);
- ArrayRef<Use> TransitionArgs;
- if (auto TransitionBundle =
- Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
+ Optional<ArrayRef<Use>> DeoptArgs;
+ if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_deopt))
+ DeoptArgs = Bundle->Inputs;
+ Optional<ArrayRef<Use>> TransitionArgs;
+ if (auto Bundle = Call->getOperandBundle(LLVMContext::OB_gc_transition)) {
+ TransitionArgs = Bundle->Inputs;
+ // TODO: This flag no longer serves a purpose and can be removed later
Flags |= uint32_t(StatepointFlags::GCTransition);
- TransitionArgs = TransitionBundle->Inputs;
}
// Instead of lowering calls to @llvm.experimental.deoptimize as normal calls
@@ -1459,7 +1518,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
assert(DeoptLowering.equals("live-through") && "Unsupported value!");
}
- Value *CallTarget = Call->getCalledValue();
+ Value *CallTarget = Call->getCalledOperand();
if (Function *F = dyn_cast<Function>(CallTarget)) {
if (F->getIntrinsicID() == Intrinsic::experimental_deoptimize) {
// Calls to llvm.experimental.deoptimize are lowered to calls to the
@@ -1485,7 +1544,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
}
// Create the statepoint given all the arguments
- Instruction *Token = nullptr;
+ GCStatepointInst *Token = nullptr;
if (auto *CI = dyn_cast<CallInst>(Call)) {
CallInst *SPCall = Builder.CreateGCStatepointCall(
StatepointID, NumPatchBytes, CallTarget, Flags, CallArgs,
@@ -1498,9 +1557,10 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
// function attributes. If we can handle this set of attributes, we
// set up function attrs directly on statepoint and return attrs later for
// gc_result intrinsic.
- SPCall->setAttributes(legalizeCallAttributes(CI->getAttributes()));
+ SPCall->setAttributes(
+ legalizeCallAttributes(CI->getContext(), CI->getAttributes()));
- Token = SPCall;
+ Token = cast<GCStatepointInst>(SPCall);
// Put the following gc_result and gc_relocate calls immediately after the
// old call (which we're about to delete)
@@ -1524,9 +1584,10 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
// function attributes. If we can handle this set of attributes, we
// set up function attrs directly on statepoint and return attrs later for
// gc_result intrinsic.
- SPInvoke->setAttributes(legalizeCallAttributes(II->getAttributes()));
+ SPInvoke->setAttributes(
+ legalizeCallAttributes(II->getContext(), II->getAttributes()));
- Token = SPInvoke;
+ Token = cast<GCStatepointInst>(SPInvoke);
// Generate gc relocates in exceptional path
BasicBlock *UnwindBlock = II->getUnwindDest();
@@ -1541,9 +1602,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
Instruction *ExceptionalToken = UnwindBlock->getLandingPadInst();
Result.UnwindToken = ExceptionalToken;
- const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
- CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, ExceptionalToken,
- Builder);
+ CreateGCRelocates(LiveVariables, BasePtrs, ExceptionalToken, Builder);
// Generate gc relocates and returns for normal block
BasicBlock *NormalDest = II->getNormalDest();
@@ -1589,8 +1648,7 @@ makeStatepointExplicitImpl(CallBase *Call, /* to replace */
Result.StatepointToken = Token;
// Second, create a gc.relocate for every live variable
- const unsigned LiveStartIdx = Statepoint(Token).gcArgsStartIdx();
- CreateGCRelocates(LiveVariables, LiveStartIdx, BasePtrs, Token, Builder);
+ CreateGCRelocates(LiveVariables, BasePtrs, Token, Builder);
}
// Replace an existing gc.statepoint with a new one and a set of gc.relocates
@@ -1651,8 +1709,8 @@ insertRelocationStores(iterator_range<Value::user_iterator> GCRelocs,
cast<AllocaInst>(Alloca)->getAllocatedType(),
suffixed_name_or(Relocate, ".casted", ""));
- StoreInst *Store = new StoreInst(CastedRelocatedValue, Alloca);
- Store->insertAfter(cast<Instruction>(CastedRelocatedValue));
+ new StoreInst(CastedRelocatedValue, Alloca,
+ cast<Instruction>(CastedRelocatedValue)->getNextNode());
#ifndef NDEBUG
VisitedLiveValues.insert(OriginalValue);
@@ -1674,8 +1732,8 @@ static void insertRematerializationStores(
"Can not find alloca for rematerialized value");
Value *Alloca = AllocaMap[OriginalValue];
- StoreInst *Store = new StoreInst(RematerializedValue, Alloca);
- Store->insertAfter(RematerializedValue);
+ new StoreInst(RematerializedValue, Alloca,
+ RematerializedValue->getNextNode());
#ifndef NDEBUG
VisitedLiveValues.insert(OriginalValue);
@@ -1780,8 +1838,7 @@ static void relocationViaAlloca(
for (auto *AI : ToClobber) {
auto PT = cast<PointerType>(AI->getAllocatedType());
Constant *CPN = ConstantPointerNull::get(PT);
- StoreInst *Store = new StoreInst(CPN, AI);
- Store->insertBefore(IP);
+ new StoreInst(CPN, AI, IP);
}
};
@@ -1843,7 +1900,8 @@ static void relocationViaAlloca(
// Emit store for the initial gc value. Store must be inserted after load,
// otherwise store will be in alloca's use list and an extra load will be
// inserted before it.
- StoreInst *Store = new StoreInst(Def, Alloca);
+ StoreInst *Store = new StoreInst(Def, Alloca, /*volatile*/ false,
+ DL.getABITypeAlign(Def->getType()));
if (Instruction *Inst = dyn_cast<Instruction>(Def)) {
if (InvokeInst *Invoke = dyn_cast<InvokeInst>(Inst)) {
// InvokeInst is a terminator so the store need to be inserted into its
@@ -1966,7 +2024,9 @@ chainToBasePointerCost(SmallVectorImpl<Instruction*> &Chain,
"non noop cast is found during rematerialization");
Type *SrcTy = CI->getOperand(0)->getType();
- Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy, CI);
+ Cost += TTI.getCastInstrCost(CI->getOpcode(), CI->getType(), SrcTy,
+ TargetTransformInfo::TCK_SizeAndLatency,
+ CI);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
// Cost of the address calculation
@@ -2344,9 +2404,8 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// That Value* no longer exists and we need to use the new gc_result.
// Thankfully, the live set is embedded in the statepoint (and updated), so
// we just grab that.
- Statepoint Statepoint(Info.StatepointToken);
- Live.insert(Live.end(), Statepoint.gc_args_begin(),
- Statepoint.gc_args_end());
+ Live.insert(Live.end(), Info.StatepointToken->gc_args_begin(),
+ Info.StatepointToken->gc_args_end());
#ifndef NDEBUG
// Do some basic sanity checks on our liveness results before performing
// relocation. Relocation can and will turn mistakes in liveness results
@@ -2354,7 +2413,7 @@ static bool insertParsePoints(Function &F, DominatorTree &DT,
// TODO: It would be nice to test consistency as well
assert(DT.isReachableFromEntry(Info.StatepointToken->getParent()) &&
"statepoint must be reachable or liveness is meaningless");
- for (Value *V : Statepoint.gc_args()) {
+ for (Value *V : Info.StatepointToken->gc_args()) {
if (!isa<Instruction>(V))
// Non-instruction values trivially dominate all possible uses
continue;
@@ -2523,7 +2582,7 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
auto NeedsRewrite = [&TLI](Instruction &I) {
if (const auto *Call = dyn_cast<CallBase>(&I))
- return !callsGCLeafFunction(Call, TLI) && !isStatepoint(Call);
+ return !callsGCLeafFunction(Call, TLI) && !isa<GCStatepointInst>(Call);
return false;
};
@@ -2608,10 +2667,10 @@ bool RewriteStatepointsForGC::runOnFunction(Function &F, DominatorTree &DT,
unsigned VF = 0;
for (unsigned i = 0; i < I.getNumOperands(); i++)
- if (I.getOperand(i)->getType()->isVectorTy()) {
+ if (auto *OpndVTy = dyn_cast<VectorType>(I.getOperand(i)->getType())) {
assert(VF == 0 ||
- VF == I.getOperand(i)->getType()->getVectorNumElements());
- VF = I.getOperand(i)->getType()->getVectorNumElements();
+ VF == cast<FixedVectorType>(OpndVTy)->getNumElements());
+ VF = cast<FixedVectorType>(OpndVTy)->getNumElements();
}
// It's the vector to scalar traversal through the pointer operand which
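// --- Illustrative sketch, not part of the diff ---
// The fixed-width vector query pattern the RewriteStatepointsForGC hunks above
// migrate to, replacing Type::getVectorNumElements()/VectorType::get with
// explicit FixedVectorType handling. Assumes LLVM ~11-era headers;
// widenToVectorOf is a hypothetical helper, not upstream API.
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Type.h"
using namespace llvm;
static Type *widenToVectorOf(Type *Scalar, Type *Shape) {
  // Only copy an element count from fixed-width vectors; scalable vectors
  // would need ElementCount-aware handling instead.
  if (auto *VT = dyn_cast<FixedVectorType>(Shape))
    return FixedVectorType::get(Scalar, VT->getNumElements());
  return Scalar;
}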
diff --git a/llvm/lib/Transforms/Scalar/SCCP.cpp b/llvm/lib/Transforms/Scalar/SCCP.cpp
index e696ea83a300..5ebd3b71fe78 100644
--- a/llvm/lib/Transforms/Scalar/SCCP.cpp
+++ b/llvm/lib/Transforms/Scalar/SCCP.cpp
@@ -27,12 +27,13 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/ConstantFolding.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/ValueLattice.h"
#include "llvm/Analysis/ValueLatticeUtils.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -67,123 +68,44 @@ using namespace llvm;
STATISTIC(NumInstRemoved, "Number of instructions removed");
STATISTIC(NumDeadBlocks , "Number of basic blocks unreachable");
+STATISTIC(NumInstReplaced,
+ "Number of instructions replaced with (simpler) instruction");
STATISTIC(IPNumInstRemoved, "Number of instructions removed by IPSCCP");
STATISTIC(IPNumArgsElimed ,"Number of arguments constant propagated by IPSCCP");
STATISTIC(IPNumGlobalConst, "Number of globals found to be constant by IPSCCP");
-
+STATISTIC(
+ IPNumInstReplaced,
+ "Number of instructions replaced with (simpler) instruction by IPSCCP");
+
+// The maximum number of range extensions allowed for operations requiring
+// widening.
+static const unsigned MaxNumRangeExtensions = 10;
+
+/// Returns MergeOptions with MaxWidenSteps set to MaxNumRangeExtensions.
+static ValueLatticeElement::MergeOptions getMaxWidenStepsOpts() {
+ return ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ MaxNumRangeExtensions);
+}
namespace {
-/// LatticeVal class - This class represents the different lattice values that
-/// an LLVM value may occupy. It is a simple class with value semantics.
-///
-class LatticeVal {
- enum LatticeValueTy {
- /// unknown - This LLVM Value has no known value yet.
- unknown,
-
- /// constant - This LLVM Value has a specific constant value.
- constant,
-
- /// forcedconstant - This LLVM Value was thought to be undef until
- /// ResolvedUndefsIn. This is treated just like 'constant', but if merged
- /// with another (different) constant, it goes to overdefined, instead of
- /// asserting.
- forcedconstant,
-
- /// overdefined - This instruction is not known to be constant, and we know
- /// it has a value.
- overdefined
- };
-
- /// Val: This stores the current lattice value along with the Constant* for
- /// the constant if this is a 'constant' or 'forcedconstant' value.
- PointerIntPair<Constant *, 2, LatticeValueTy> Val;
-
- LatticeValueTy getLatticeValue() const {
- return Val.getInt();
- }
-
-public:
- LatticeVal() : Val(nullptr, unknown) {}
-
- bool isUnknown() const { return getLatticeValue() == unknown; }
-
- bool isConstant() const {
- return getLatticeValue() == constant || getLatticeValue() == forcedconstant;
- }
-
- bool isOverdefined() const { return getLatticeValue() == overdefined; }
-
- Constant *getConstant() const {
- assert(isConstant() && "Cannot get the constant of a non-constant!");
- return Val.getPointer();
- }
-
- /// markOverdefined - Return true if this is a change in status.
- bool markOverdefined() {
- if (isOverdefined())
- return false;
-
- Val.setInt(overdefined);
- return true;
- }
-
- /// markConstant - Return true if this is a change in status.
- bool markConstant(Constant *V) {
- if (getLatticeValue() == constant) { // Constant but not forcedconstant.
- assert(getConstant() == V && "Marking constant with different value");
- return false;
- }
-
- if (isUnknown()) {
- Val.setInt(constant);
- assert(V && "Marking constant with NULL");
- Val.setPointer(V);
- } else {
- assert(getLatticeValue() == forcedconstant &&
- "Cannot move from overdefined to constant!");
- // Stay at forcedconstant if the constant is the same.
- if (V == getConstant()) return false;
-
- // Otherwise, we go to overdefined. Assumptions made based on the
- // forced value are possibly wrong. Assuming this is another constant
- // could expose a contradiction.
- Val.setInt(overdefined);
- }
- return true;
- }
-
- /// getConstantInt - If this is a constant with a ConstantInt value, return it
- /// otherwise return null.
- ConstantInt *getConstantInt() const {
- if (isConstant())
- return dyn_cast<ConstantInt>(getConstant());
- return nullptr;
- }
-
- /// getBlockAddress - If this is a constant with a BlockAddress value, return
- /// it, otherwise return null.
- BlockAddress *getBlockAddress() const {
- if (isConstant())
- return dyn_cast<BlockAddress>(getConstant());
- return nullptr;
- }
-
- void markForcedConstant(Constant *V) {
- assert(isUnknown() && "Can't force a defined value!");
- Val.setInt(forcedconstant);
- Val.setPointer(V);
- }
+// Helper to check if \p LV is either a constant or a constant
+// range with a single element. This should cover exactly the same cases as the
+// old ValueLatticeElement::isConstant() and is intended to be used in the
+// transition to ValueLatticeElement.
+bool isConstant(const ValueLatticeElement &LV) {
+ return LV.isConstant() ||
+ (LV.isConstantRange() && LV.getConstantRange().isSingleElement());
+}
- ValueLatticeElement toValueLattice() const {
- if (isOverdefined())
- return ValueLatticeElement::getOverdefined();
- if (isConstant())
- return ValueLatticeElement::get(getConstant());
- return ValueLatticeElement();
- }
-};
+// Helper to check if \p LV is either overdefined or a constant range with more
+// than a single element. This should cover exactly the same cases as the old
+// ValueLatticeElement::isOverdefined() and is intended to be used in the
+// transition to ValueLatticeElement.
+bool isOverdefined(const ValueLatticeElement &LV) {
+ return LV.isOverdefined() ||
+ (LV.isConstantRange() && !LV.getConstantRange().isSingleElement());
+}
//===----------------------------------------------------------------------===//
//
@@ -194,28 +116,28 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
const DataLayout &DL;
std::function<const TargetLibraryInfo &(Function &)> GetTLI;
SmallPtrSet<BasicBlock *, 8> BBExecutable; // The BBs that are executable.
- DenseMap<Value *, LatticeVal> ValueState; // The state each value is in.
- // The state each parameter is in.
- DenseMap<Value *, ValueLatticeElement> ParamState;
+ DenseMap<Value *, ValueLatticeElement>
+ ValueState; // The state each value is in.
/// StructValueState - This maintains ValueState for values that have
/// StructType, for example for formal arguments, calls, insertelement, etc.
- DenseMap<std::pair<Value *, unsigned>, LatticeVal> StructValueState;
+ DenseMap<std::pair<Value *, unsigned>, ValueLatticeElement> StructValueState;
/// GlobalValue - If we are tracking any values for the contents of a global
/// variable, we keep a mapping from the constant accessor to the element of
/// the global, to the currently known value. If the value becomes
/// overdefined, its entry is simply removed from this map.
- DenseMap<GlobalVariable *, LatticeVal> TrackedGlobals;
+ DenseMap<GlobalVariable *, ValueLatticeElement> TrackedGlobals;
/// TrackedRetVals - If we are tracking arguments into and the return
/// value out of a function, it will have an entry in this map, indicating
/// what the known return value for the function is.
- MapVector<Function *, LatticeVal> TrackedRetVals;
+ MapVector<Function *, ValueLatticeElement> TrackedRetVals;
/// TrackedMultipleRetVals - Same as TrackedRetVals, but used for functions
/// that return multiple values.
- MapVector<std::pair<Function *, unsigned>, LatticeVal> TrackedMultipleRetVals;
+ MapVector<std::pair<Function *, unsigned>, ValueLatticeElement>
+ TrackedMultipleRetVals;
/// MRVFunctionsTracked - Each function in TrackedMultipleRetVals is
/// represented here for efficient lookup.
@@ -251,6 +173,8 @@ class SCCPSolver : public InstVisitor<SCCPSolver> {
DenseMap<Function *, AnalysisResultsForFn> AnalysisResults;
DenseMap<Value *, SmallPtrSet<User *, 2>> AdditionalUsers;
+ LLVMContext &Ctx;
+
public:
void addAnalysis(Function &F, AnalysisResultsForFn A) {
AnalysisResults.insert({&F, std::move(A)});
@@ -270,8 +194,9 @@ public:
}
SCCPSolver(const DataLayout &DL,
- std::function<const TargetLibraryInfo &(Function &)> GetTLI)
- : DL(DL), GetTLI(std::move(GetTLI)) {}
+ std::function<const TargetLibraryInfo &(Function &)> GetTLI,
+ LLVMContext &Ctx)
+ : DL(DL), GetTLI(std::move(GetTLI)), Ctx(Ctx) {}
/// MarkBlockExecutable - This method can be used by clients to mark all of
/// the blocks that are known to be intrinsically live in the processed unit.
@@ -292,7 +217,7 @@ public:
void TrackValueOfGlobalVariable(GlobalVariable *GV) {
// We only track the contents of scalar globals.
if (GV->getValueType()->isSingleValueType()) {
- LatticeVal &IV = TrackedGlobals[GV];
+ ValueLatticeElement &IV = TrackedGlobals[GV];
if (!isa<UndefValue>(GV->getInitializer()))
IV.markConstant(GV->getInitializer());
}
@@ -306,10 +231,10 @@ public:
if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
MRVFunctionsTracked.insert(F);
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- TrackedMultipleRetVals.insert(std::make_pair(std::make_pair(F, i),
- LatticeVal()));
+ TrackedMultipleRetVals.insert(
+ std::make_pair(std::make_pair(F, i), ValueLatticeElement()));
} else
- TrackedRetVals.insert(std::make_pair(F, LatticeVal()));
+ TrackedRetVals.insert(std::make_pair(F, ValueLatticeElement()));
}
/// AddMustTailCallee - If the SCCP solver finds that this function is called
@@ -352,8 +277,8 @@ public:
// block to the 'To' basic block is currently feasible.
bool isEdgeFeasible(BasicBlock *From, BasicBlock *To);
- std::vector<LatticeVal> getStructLatticeValueFor(Value *V) const {
- std::vector<LatticeVal> StructValues;
+ std::vector<ValueLatticeElement> getStructLatticeValueFor(Value *V) const {
+ std::vector<ValueLatticeElement> StructValues;
auto *STy = dyn_cast<StructType>(V->getType());
assert(STy && "getStructLatticeValueFor() can be called only on structs");
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
@@ -364,23 +289,26 @@ public:
return StructValues;
}
- const LatticeVal &getLatticeValueFor(Value *V) const {
+ void removeLatticeValueFor(Value *V) { ValueState.erase(V); }
+
+ const ValueLatticeElement &getLatticeValueFor(Value *V) const {
assert(!V->getType()->isStructTy() &&
"Should use getStructLatticeValueFor");
- DenseMap<Value *, LatticeVal>::const_iterator I = ValueState.find(V);
+ DenseMap<Value *, ValueLatticeElement>::const_iterator I =
+ ValueState.find(V);
assert(I != ValueState.end() &&
"V not found in ValueState nor Paramstate map!");
return I->second;
}
/// getTrackedRetVals - Get the inferred return value map.
- const MapVector<Function*, LatticeVal> &getTrackedRetVals() {
+ const MapVector<Function *, ValueLatticeElement> &getTrackedRetVals() {
return TrackedRetVals;
}
/// getTrackedGlobals - Get and return the set of inferred initializers for
/// global variables.
- const DenseMap<GlobalVariable*, LatticeVal> &getTrackedGlobals() {
+ const DenseMap<GlobalVariable *, ValueLatticeElement> &getTrackedGlobals() {
return TrackedGlobals;
}
@@ -407,32 +335,59 @@ public:
}
// isStructLatticeConstant - Return true if all the lattice values
- // corresponding to elements of the structure are not overdefined,
+ // corresponding to elements of the structure are constants,
// false otherwise.
bool isStructLatticeConstant(Function *F, StructType *STy) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
const auto &It = TrackedMultipleRetVals.find(std::make_pair(F, i));
assert(It != TrackedMultipleRetVals.end());
- LatticeVal LV = It->second;
- if (LV.isOverdefined())
+ ValueLatticeElement LV = It->second;
+ if (!isConstant(LV))
return false;
}
return true;
}
+ /// Helper to return a Constant if \p LV is either a constant or a constant
+ /// range with a single element.
+ Constant *getConstant(const ValueLatticeElement &LV) const {
+ if (LV.isConstant())
+ return LV.getConstant();
+
+ if (LV.isConstantRange()) {
+ auto &CR = LV.getConstantRange();
+ if (CR.getSingleElement())
+ return ConstantInt::get(Ctx, *CR.getSingleElement());
+ }
+ return nullptr;
+ }
+
private:
- // pushToWorkList - Helper for markConstant/markForcedConstant/markOverdefined
- void pushToWorkList(LatticeVal &IV, Value *V) {
+ ConstantInt *getConstantInt(const ValueLatticeElement &IV) const {
+ return dyn_cast_or_null<ConstantInt>(getConstant(IV));
+ }
+
+ // pushToWorkList - Helper for markConstant/markOverdefined
+ void pushToWorkList(ValueLatticeElement &IV, Value *V) {
if (IV.isOverdefined())
return OverdefinedInstWorkList.push_back(V);
InstWorkList.push_back(V);
}
+ // Helper to push \p V to the worklist, after updating it to \p IV. Also
+ // prints a debug message with the updated value.
+ void pushToWorkListMsg(ValueLatticeElement &IV, Value *V) {
+ LLVM_DEBUG(dbgs() << "updated " << IV << ": " << *V << '\n');
+ pushToWorkList(IV, V);
+ }
+
// markConstant - Make a value be marked as "constant". If the value
// is not already a constant, add it to the instruction work list so that
// the users of the instruction are updated later.
- bool markConstant(LatticeVal &IV, Value *V, Constant *C) {
- if (!IV.markConstant(C)) return false;
+ bool markConstant(ValueLatticeElement &IV, Value *V, Constant *C,
+ bool MayIncludeUndef = false) {
+ if (!IV.markConstant(C, MayIncludeUndef))
+ return false;
LLVM_DEBUG(dbgs() << "markConstant: " << *C << ": " << *V << '\n');
pushToWorkList(IV, V);
return true;
@@ -443,18 +398,10 @@ private:
return markConstant(ValueState[V], V, C);
}
- void markForcedConstant(Value *V, Constant *C) {
- assert(!V->getType()->isStructTy() && "structs should use mergeInValue");
- LatticeVal &IV = ValueState[V];
- IV.markForcedConstant(C);
- LLVM_DEBUG(dbgs() << "markForcedConstant: " << *C << ": " << *V << '\n');
- pushToWorkList(IV, V);
- }
-
// markOverdefined - Make a value be marked as "overdefined". If the
// value is not already overdefined, add it to the overdefined instruction
// work list so that the users of the instruction are updated later.
- bool markOverdefined(LatticeVal &IV, Value *V) {
+ bool markOverdefined(ValueLatticeElement &IV, Value *V) {
if (!IV.markOverdefined()) return false;
LLVM_DEBUG(dbgs() << "markOverdefined: ";
@@ -466,71 +413,59 @@ private:
return true;
}
- bool mergeInValue(LatticeVal &IV, Value *V, LatticeVal MergeWithV) {
- if (IV.isOverdefined() || MergeWithV.isUnknown())
- return false; // Noop.
- if (MergeWithV.isOverdefined())
- return markOverdefined(IV, V);
- if (IV.isUnknown())
- return markConstant(IV, V, MergeWithV.getConstant());
- if (IV.getConstant() != MergeWithV.getConstant())
- return markOverdefined(IV, V);
+ /// Merge \p MergeWithV into \p IV and push \p V to the worklist, if \p IV
+ /// changes.
+ bool mergeInValue(ValueLatticeElement &IV, Value *V,
+ ValueLatticeElement MergeWithV,
+ ValueLatticeElement::MergeOptions Opts = {
+ /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
+ if (IV.mergeIn(MergeWithV, Opts)) {
+ pushToWorkList(IV, V);
+ LLVM_DEBUG(dbgs() << "Merged " << MergeWithV << " into " << *V << " : "
+ << IV << "\n");
+ return true;
+ }
return false;
}
- bool mergeInValue(Value *V, LatticeVal MergeWithV) {
+ bool mergeInValue(Value *V, ValueLatticeElement MergeWithV,
+ ValueLatticeElement::MergeOptions Opts = {
+ /*MayIncludeUndef=*/false, /*CheckWiden=*/false}) {
assert(!V->getType()->isStructTy() &&
"non-structs should use markConstant");
- return mergeInValue(ValueState[V], V, MergeWithV);
+ return mergeInValue(ValueState[V], V, MergeWithV, Opts);
}
- /// getValueState - Return the LatticeVal object that corresponds to the
- /// value. This function handles the case when the value hasn't been seen yet
- /// by properly seeding constants etc.
- LatticeVal &getValueState(Value *V) {
+ /// getValueState - Return the ValueLatticeElement object that corresponds to
+ /// the value. This function handles the case when the value hasn't been seen
+ /// yet by properly seeding constants etc.
+ ValueLatticeElement &getValueState(Value *V) {
assert(!V->getType()->isStructTy() && "Should use getStructValueState");
- std::pair<DenseMap<Value*, LatticeVal>::iterator, bool> I =
- ValueState.insert(std::make_pair(V, LatticeVal()));
- LatticeVal &LV = I.first->second;
+ auto I = ValueState.insert(std::make_pair(V, ValueLatticeElement()));
+ ValueLatticeElement &LV = I.first->second;
if (!I.second)
return LV; // Common case, already in the map.
- if (auto *C = dyn_cast<Constant>(V)) {
- // Undef values remain unknown.
- if (!isa<UndefValue>(V))
- LV.markConstant(C); // Constants are constant
- }
-
- // All others are underdefined by default.
- return LV;
- }
-
- ValueLatticeElement &getParamState(Value *V) {
- assert(!V->getType()->isStructTy() && "Should use getStructValueState");
-
- std::pair<DenseMap<Value*, ValueLatticeElement>::iterator, bool>
- PI = ParamState.insert(std::make_pair(V, ValueLatticeElement()));
- ValueLatticeElement &LV = PI.first->second;
- if (PI.second)
- LV = getValueState(V).toValueLattice();
+ if (auto *C = dyn_cast<Constant>(V))
+ LV.markConstant(C); // Constants are constant
+ // All others are unknown by default.
return LV;
}
- /// getStructValueState - Return the LatticeVal object that corresponds to the
- /// value/field pair. This function handles the case when the value hasn't
- /// been seen yet by properly seeding constants etc.
- LatticeVal &getStructValueState(Value *V, unsigned i) {
+ /// getStructValueState - Return the ValueLatticeElement object that
+ /// corresponds to the value/field pair. This function handles the case when
+ /// the value hasn't been seen yet by properly seeding constants etc.
+ ValueLatticeElement &getStructValueState(Value *V, unsigned i) {
assert(V->getType()->isStructTy() && "Should use getValueState");
assert(i < cast<StructType>(V->getType())->getNumElements() &&
"Invalid element #");
- std::pair<DenseMap<std::pair<Value*, unsigned>, LatticeVal>::iterator,
- bool> I = StructValueState.insert(
- std::make_pair(std::make_pair(V, i), LatticeVal()));
- LatticeVal &LV = I.first->second;
+ auto I = StructValueState.insert(
+ std::make_pair(std::make_pair(V, i), ValueLatticeElement()));
+ ValueLatticeElement &LV = I.first->second;
if (!I.second)
return LV; // Common case, already in the map.
@@ -589,9 +524,20 @@ private:
// Mark I's users as changed, including AdditionalUsers.
void markUsersAsChanged(Value *I) {
- for (User *U : I->users())
- if (auto *UI = dyn_cast<Instruction>(U))
- OperandChangedState(UI);
+ // Functions include their arguments in the use-list. Changed function
+ // values mean that the result of the function changed. We only need to
+ // update the call sites with the new function result and do not have to
+ // propagate the call arguments.
+ if (isa<Function>(I)) {
+ for (User *U : I->users()) {
+ if (auto *CB = dyn_cast<CallBase>(U))
+ handleCallResult(*CB);
+ }
+ } else {
+ for (User *U : I->users())
+ if (auto *UI = dyn_cast<Instruction>(U))
+ OperandChangedState(UI);
+ }
auto Iter = AdditionalUsers.find(I);
if (Iter != AdditionalUsers.end()) {
@@ -600,6 +546,9 @@ private:
OperandChangedState(UI);
}
}
+ void handleCallOverdefined(CallBase &CB);
+ void handleCallResult(CallBase &CB);
+ void handleCallArguments(CallBase &CB);
private:
friend class InstVisitor<SCCPSolver>;
@@ -634,20 +583,20 @@ private:
void visitGetElementPtrInst(GetElementPtrInst &I);
void visitCallInst (CallInst &I) {
- visitCallSite(&I);
+ visitCallBase(I);
}
void visitInvokeInst (InvokeInst &II) {
- visitCallSite(&II);
+ visitCallBase(II);
visitTerminator(II);
}
void visitCallBrInst (CallBrInst &CBI) {
- visitCallSite(&CBI);
+ visitCallBase(CBI);
visitTerminator(CBI);
}
- void visitCallSite (CallSite CS);
+ void visitCallBase (CallBase &CB);
void visitResumeInst (ResumeInst &I) { /*returns void*/ }
void visitUnreachableInst(UnreachableInst &I) { /*returns void*/ }
void visitFenceInst (FenceInst &I) { /*returns void*/ }
@@ -673,12 +622,12 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
return;
}
- LatticeVal BCValue = getValueState(BI->getCondition());
- ConstantInt *CI = BCValue.getConstantInt();
+ ValueLatticeElement BCValue = getValueState(BI->getCondition());
+ ConstantInt *CI = getConstantInt(BCValue);
if (!CI) {
// Overdefined condition variables, and branches on unfoldable constant
// conditions, mean the branch could go either way.
- if (!BCValue.isUnknown())
+ if (!BCValue.isUnknownOrUndef())
Succs[0] = Succs[1] = true;
return;
}
@@ -699,12 +648,12 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
Succs[0] = true;
return;
}
- LatticeVal SCValue = getValueState(SI->getCondition());
- ConstantInt *CI = SCValue.getConstantInt();
+ ValueLatticeElement SCValue = getValueState(SI->getCondition());
+ ConstantInt *CI = getConstantInt(SCValue);
if (!CI) { // Overdefined or unknown condition?
// All destinations are executable!
- if (!SCValue.isUnknown())
+ if (!SCValue.isUnknownOrUndef())
Succs.assign(TI.getNumSuccessors(), true);
return;
}
@@ -717,11 +666,11 @@ void SCCPSolver::getFeasibleSuccessors(Instruction &TI,
// the target as executable.
if (auto *IBR = dyn_cast<IndirectBrInst>(&TI)) {
// Casts are folded by visitCastInst.
- LatticeVal IBRValue = getValueState(IBR->getAddress());
- BlockAddress *Addr = IBRValue.getBlockAddress();
+ ValueLatticeElement IBRValue = getValueState(IBR->getAddress());
+ BlockAddress *Addr = dyn_cast_or_null<BlockAddress>(getConstant(IBRValue));
if (!Addr) { // Overdefined or unknown condition?
// All destinations are executable!
- if (!IBRValue.isUnknown())
+ if (!IBRValue.isUnknownOrUndef())
Succs.assign(TI.getNumSuccessors(), true);
return;
}
@@ -786,50 +735,43 @@ void SCCPSolver::visitPHINode(PHINode &PN) {
return (void)markOverdefined(&PN);
if (getValueState(&PN).isOverdefined())
- return; // Quick exit
+ return; // Quick exit
// Super-extra-high-degree PHI nodes are unlikely to ever be marked constant,
// and slow us down a lot. Just mark them overdefined.
if (PN.getNumIncomingValues() > 64)
return (void)markOverdefined(&PN);
+ unsigned NumActiveIncoming = 0;
+
// Look at all of the executable operands of the PHI node. If any of them
// are overdefined, the PHI becomes overdefined as well. If they are all
// constant, and they agree with each other, the PHI becomes the identical
- // constant. If they are constant and don't agree, the PHI is overdefined.
- // If there are no executable operands, the PHI remains unknown.
- Constant *OperandVal = nullptr;
+ // constant. If they are constant and don't agree, the PHI is a constant
+ // range. If there are no executable operands, the PHI remains unknown.
+ ValueLatticeElement PhiState = getValueState(&PN);
for (unsigned i = 0, e = PN.getNumIncomingValues(); i != e; ++i) {
- LatticeVal IV = getValueState(PN.getIncomingValue(i));
- if (IV.isUnknown()) continue; // Doesn't influence PHI node.
-
if (!isEdgeFeasible(PN.getIncomingBlock(i), PN.getParent()))
continue;
- if (IV.isOverdefined()) // PHI node becomes overdefined!
- return (void)markOverdefined(&PN);
-
- if (!OperandVal) { // Grab the first value.
- OperandVal = IV.getConstant();
- continue;
- }
-
- // There is already a reachable operand. If we conflict with it,
- // then the PHI node becomes overdefined. If we agree with it, we
- // can continue on.
-
- // Check to see if there are two different constants merging, if so, the PHI
- // node is overdefined.
- if (IV.getConstant() != OperandVal)
- return (void)markOverdefined(&PN);
- }
-
- // If we exited the loop, this means that the PHI node only has constant
- // arguments that agree with each other(and OperandVal is the constant) or
- // OperandVal is null because there are no defined incoming arguments. If
- // this is the case, the PHI remains unknown.
- if (OperandVal)
- markConstant(&PN, OperandVal); // Acquire operand value
+ ValueLatticeElement IV = getValueState(PN.getIncomingValue(i));
+ PhiState.mergeIn(IV);
+ NumActiveIncoming++;
+ if (PhiState.isOverdefined())
+ break;
+ }
+
+ // We allow up to 1 range extension per active incoming value and one
+ // additional extension. Note that we manually adjust the number of range
+ // extensions to match the number of active incoming values. This helps to
+ // limit multiple extensions caused by the same incoming value, if other
+ // incoming values are equal.
+ mergeInValue(&PN, PhiState,
+ ValueLatticeElement::MergeOptions().setMaxWidenSteps(
+ NumActiveIncoming + 1));
+ ValueLatticeElement &PhiStateRef = getValueState(&PN);
+ PhiStateRef.setNumRangeExtensions(
+ std::max(NumActiveIncoming, PhiStateRef.getNumRangeExtensions()));
}
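
The widen-step cap above keeps a PHI's constant range from growing without bound as incoming values trickle in. Below is a minimal standalone sketch of the idea, not part of the patch: the helper name mergeWithWidenCap is made up, and only ConstantRange (whose unionWith/getFull are assumed from LLVM's public API) is used, not the solver's MergeOptions machinery.

#include "llvm/IR/ConstantRange.h"
using namespace llvm;

// Merge a new incoming range into the accumulated PHI range. Each time the
// accumulated range actually grows, one "extension" is consumed; once the
// cap is exceeded, give up and widen straight to the full range, which plays
// the role of overdefined here.
static ConstantRange mergeWithWidenCap(const ConstantRange &Acc,
                                       const ConstantRange &In,
                                       unsigned &Extensions, unsigned Cap) {
  ConstantRange Merged = Acc.unionWith(In);
  if (Merged == Acc)
    return Acc;             // No growth, no extension consumed.
  if (++Extensions > Cap)   // Grew too often: jump to the top of the lattice.
    return ConstantRange::getFull(Acc.getBitWidth());
  return Merged;
}

Starting from [0, 1) and merging [1, 2), [2, 3), [3, 4) with Cap = 2, the third merge hits the cap and returns the full range, mirroring how the solver bounds the number of range extensions by the number of active incoming values.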
void SCCPSolver::visitReturnInst(ReturnInst &I) {
@@ -840,8 +782,7 @@ void SCCPSolver::visitReturnInst(ReturnInst &I) {
// If we are tracking the return value of this function, merge it in.
if (!TrackedRetVals.empty() && !ResultOp->getType()->isStructTy()) {
- MapVector<Function*, LatticeVal>::iterator TFRVI =
- TrackedRetVals.find(F);
+ auto TFRVI = TrackedRetVals.find(F);
if (TFRVI != TrackedRetVals.end()) {
mergeInValue(TFRVI->second, F, getValueState(ResultOp));
return;
@@ -871,18 +812,28 @@ void SCCPSolver::visitTerminator(Instruction &TI) {
}
void SCCPSolver::visitCastInst(CastInst &I) {
- LatticeVal OpSt = getValueState(I.getOperand(0));
- if (OpSt.isOverdefined()) // Inherit overdefinedness of operand
- markOverdefined(&I);
- else if (OpSt.isConstant()) {
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return;
+
+ ValueLatticeElement OpSt = getValueState(I.getOperand(0));
+ if (Constant *OpC = getConstant(OpSt)) {
// Fold the constant as we build.
- Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpSt.getConstant(),
- I.getType(), DL);
+ Constant *C = ConstantFoldCastOperand(I.getOpcode(), OpC, I.getType(), DL);
if (isa<UndefValue>(C))
return;
// Propagate constant value
markConstant(&I, C);
- }
+ } else if (OpSt.isConstantRange() && I.getDestTy()->isIntegerTy()) {
+ auto &LV = getValueState(&I);
+ ConstantRange OpRange = OpSt.getConstantRange();
+ Type *DestTy = I.getDestTy();
+ ConstantRange Res =
+ OpRange.castOp(I.getOpcode(), DL.getTypeSizeInBits(DestTy));
+ mergeInValue(LV, &I, ValueLatticeElement::getRange(Res));
+ } else if (!OpSt.isUnknownOrUndef())
+ markOverdefined(&I);
}
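
For the new integer-range path, ConstantRange::castOp applies the cast opcode to the whole operand range at the destination bit width. A hedged sketch with illustrative values (assumes LLVM's ConstantRange and Instruction headers):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// zext i8 [100, 200) to i32 keeps the same bounds, just at 32 bits.
ConstantRange ZExtExample() {
  ConstantRange Op(APInt(8, 100), APInt(8, 200));
  return Op.castOp(Instruction::ZExt, /*BitWidth=*/32); // [100, 200)
}

// trunc i32 [0, 512) to i8 loses the high bits, so the result is the full
// 8-bit range.
ConstantRange TruncExample() {
  ConstantRange Op(APInt(32, 0), APInt(32, 512));
  return Op.castOp(Instruction::Trunc, /*BitWidth=*/8);
}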
void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
@@ -891,6 +842,11 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
if (EVI.getType()->isStructTy())
return (void)markOverdefined(&EVI);
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&EVI].isOverdefined())
+ return (void)markOverdefined(&EVI);
+
// If this is extracting from more than one level of struct, we don't know.
if (EVI.getNumIndices() != 1)
return (void)markOverdefined(&EVI);
@@ -898,7 +854,7 @@ void SCCPSolver::visitExtractValueInst(ExtractValueInst &EVI) {
Value *AggVal = EVI.getAggregateOperand();
if (AggVal->getType()->isStructTy()) {
unsigned i = *EVI.idx_begin();
- LatticeVal EltVal = getStructValueState(AggVal, i);
+ ValueLatticeElement EltVal = getStructValueState(AggVal, i);
mergeInValue(getValueState(&EVI), &EVI, EltVal);
} else {
// Otherwise, must be extracting from an array.
@@ -911,6 +867,11 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
if (!STy)
return (void)markOverdefined(&IVI);
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (isOverdefined(ValueState[&IVI]))
+ return (void)markOverdefined(&IVI);
+
// If this has more than one index, we can't handle it, drive all results to
// undef.
if (IVI.getNumIndices() != 1)
@@ -923,7 +884,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
// This passes through all values that aren't the inserted element.
if (i != Idx) {
- LatticeVal EltVal = getStructValueState(Aggr, i);
+ ValueLatticeElement EltVal = getStructValueState(Aggr, i);
mergeInValue(getStructValueState(&IVI, i), &IVI, EltVal);
continue;
}
@@ -933,7 +894,7 @@ void SCCPSolver::visitInsertValueInst(InsertValueInst &IVI) {
// We don't track structs in structs.
markOverdefined(getStructValueState(&IVI, i), &IVI);
else {
- LatticeVal InVal = getValueState(Val);
+ ValueLatticeElement InVal = getValueState(Val);
mergeInValue(getStructValueState(&IVI, i), &IVI, InVal);
}
}
@@ -945,11 +906,16 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
if (I.getType()->isStructTy())
return (void)markOverdefined(&I);
- LatticeVal CondValue = getValueState(I.getCondition());
- if (CondValue.isUnknown())
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return (void)markOverdefined(&I);
+
+ ValueLatticeElement CondValue = getValueState(I.getCondition());
+ if (CondValue.isUnknownOrUndef())
return;
- if (ConstantInt *CondCB = CondValue.getConstantInt()) {
+ if (ConstantInt *CondCB = getConstantInt(CondValue)) {
Value *OpVal = CondCB->isZero() ? I.getFalseValue() : I.getTrueValue();
mergeInValue(&I, getValueState(OpVal));
return;
@@ -958,30 +924,27 @@ void SCCPSolver::visitSelectInst(SelectInst &I) {
// Otherwise, the condition is overdefined or a constant we can't evaluate.
// See if we can produce something better than overdefined based on the T/F
// value.
- LatticeVal TVal = getValueState(I.getTrueValue());
- LatticeVal FVal = getValueState(I.getFalseValue());
-
- // select ?, C, C -> C.
- if (TVal.isConstant() && FVal.isConstant() &&
- TVal.getConstant() == FVal.getConstant())
- return (void)markConstant(&I, FVal.getConstant());
-
- if (TVal.isUnknown()) // select ?, undef, X -> X.
- return (void)mergeInValue(&I, FVal);
- if (FVal.isUnknown()) // select ?, X, undef -> X.
- return (void)mergeInValue(&I, TVal);
- markOverdefined(&I);
+ ValueLatticeElement TVal = getValueState(I.getTrueValue());
+ ValueLatticeElement FVal = getValueState(I.getFalseValue());
+
+ bool Changed = ValueState[&I].mergeIn(TVal);
+ Changed |= ValueState[&I].mergeIn(FVal);
+ if (Changed)
+ pushToWorkListMsg(ValueState[&I], &I);
}
// Handle Unary Operators.
void SCCPSolver::visitUnaryOperator(Instruction &I) {
- LatticeVal V0State = getValueState(I.getOperand(0));
+ ValueLatticeElement V0State = getValueState(I.getOperand(0));
- LatticeVal &IV = ValueState[&I];
- if (IV.isOverdefined()) return;
+ ValueLatticeElement &IV = ValueState[&I];
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (isOverdefined(IV))
+ return (void)markOverdefined(&I);
- if (V0State.isConstant()) {
- Constant *C = ConstantExpr::get(I.getOpcode(), V0State.getConstant());
+ if (isConstant(V0State)) {
+ Constant *C = ConstantExpr::get(I.getOpcode(), getConstant(V0State));
// op Y -> undef.
if (isa<UndefValue>(C))
@@ -990,7 +953,7 @@ void SCCPSolver::visitUnaryOperator(Instruction &I) {
}
// If something is undef, wait for it to resolve.
- if (!V0State.isOverdefined())
+ if (!isOverdefined(V0State))
return;
markOverdefined(&I);
@@ -998,101 +961,90 @@ void SCCPSolver::visitUnaryOperator(Instruction &I) {
// Handle Binary Operators.
void SCCPSolver::visitBinaryOperator(Instruction &I) {
- LatticeVal V1State = getValueState(I.getOperand(0));
- LatticeVal V2State = getValueState(I.getOperand(1));
+ ValueLatticeElement V1State = getValueState(I.getOperand(0));
+ ValueLatticeElement V2State = getValueState(I.getOperand(1));
- LatticeVal &IV = ValueState[&I];
- if (IV.isOverdefined()) return;
-
- if (V1State.isConstant() && V2State.isConstant()) {
- Constant *C = ConstantExpr::get(I.getOpcode(), V1State.getConstant(),
- V2State.getConstant());
- // X op Y -> undef.
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(IV, &I, C);
- }
+ ValueLatticeElement &IV = ValueState[&I];
+ if (IV.isOverdefined())
+ return;
// If something is undef, wait for it to resolve.
- if (!V1State.isOverdefined() && !V2State.isOverdefined())
+ if (V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef())
return;
- // Otherwise, one of our operands is overdefined. Try to produce something
- // better than overdefined with some tricks.
- // If this is 0 / Y, it doesn't matter that the second operand is
- // overdefined, and we can replace it with zero.
- if (I.getOpcode() == Instruction::UDiv || I.getOpcode() == Instruction::SDiv)
- if (V1State.isConstant() && V1State.getConstant()->isNullValue())
- return (void)markConstant(IV, &I, V1State.getConstant());
-
- // If this is:
- // -> AND/MUL with 0
- // -> OR with -1
- // it doesn't matter that the other operand is overdefined.
- if (I.getOpcode() == Instruction::And || I.getOpcode() == Instruction::Mul ||
- I.getOpcode() == Instruction::Or) {
- LatticeVal *NonOverdefVal = nullptr;
- if (!V1State.isOverdefined())
- NonOverdefVal = &V1State;
- else if (!V2State.isOverdefined())
- NonOverdefVal = &V2State;
-
- if (NonOverdefVal) {
- if (NonOverdefVal->isUnknown())
- return;
+ if (V1State.isOverdefined() && V2State.isOverdefined())
+ return (void)markOverdefined(&I);
- if (I.getOpcode() == Instruction::And ||
- I.getOpcode() == Instruction::Mul) {
- // X and 0 = 0
- // X * 0 = 0
- if (NonOverdefVal->getConstant()->isNullValue())
- return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
- } else {
- // X or -1 = -1
- if (ConstantInt *CI = NonOverdefVal->getConstantInt())
- if (CI->isMinusOne())
- return (void)markConstant(IV, &I, NonOverdefVal->getConstant());
- }
+ // If either of the operands is a constant, try to fold it to a constant.
+ // TODO: Use information from notconstant better.
+ if ((V1State.isConstant() || V2State.isConstant())) {
+ Value *V1 = isConstant(V1State) ? getConstant(V1State) : I.getOperand(0);
+ Value *V2 = isConstant(V2State) ? getConstant(V2State) : I.getOperand(1);
+ Value *R = SimplifyBinOp(I.getOpcode(), V1, V2, SimplifyQuery(DL));
+ auto *C = dyn_cast_or_null<Constant>(R);
+ if (C) {
+ // X op Y -> undef.
+ if (isa<UndefValue>(C))
+ return;
+ // Conservatively assume that the result may be based on operands that may
+ // be undef. Note that we use mergeInValue to combine the constant with
+ // the existing lattice value for I, as different constants might be found
+      // after one of the operands goes to overdefined, e.g. due to one operand
+      // being a special floating-point value.
+ ValueLatticeElement NewV;
+ NewV.markConstant(C, /*MayIncludeUndef=*/true);
+ return (void)mergeInValue(&I, NewV);
}
}
- markOverdefined(&I);
+ // Only use ranges for binary operators on integers.
+ if (!I.getType()->isIntegerTy())
+ return markOverdefined(&I);
+
+ // Try to simplify to a constant range.
+ ConstantRange A = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
+ ConstantRange B = ConstantRange::getFull(I.getType()->getScalarSizeInBits());
+ if (V1State.isConstantRange())
+ A = V1State.getConstantRange();
+ if (V2State.isConstantRange())
+ B = V2State.getConstantRange();
+
+ ConstantRange R = A.binaryOp(cast<BinaryOperator>(&I)->getOpcode(), B);
+ mergeInValue(&I, ValueLatticeElement::getRange(R));
+
+  // TODO: Currently we do not exploit special values that produce something
+  // better than overdefined with an overdefined operand for vector or
+  // floating-point types, like 'and <4 x i32> overdefined, zeroinitializer'.
}
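
When only one operand has a useful range, the other side defaults to the full range, and ConstantRange::binaryOp can still return something tighter than overdefined. A small illustrative sketch (values are made up; assumes LLVM's ConstantRange API):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Instruction.h"
using namespace llvm;

// [0, 10) + [100, 110) can only produce values in [100, 119).
ConstantRange AddExample() {
  ConstantRange A(APInt(32, 0), APInt(32, 10));
  ConstantRange B(APInt(32, 100), APInt(32, 110));
  return A.binaryOp(Instruction::Add, B); // [100, 119)
}

// Masking a completely unknown value with [0, 16) still bounds the result:
// the 'and' cannot exceed the smaller operand.
ConstantRange AndExample() {
  ConstantRange Unknown = ConstantRange::getFull(32);
  ConstantRange Mask(APInt(32, 0), APInt(32, 16));
  return Unknown.binaryOp(Instruction::And, Mask); // contained in [0, 16)
}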
// Handle ICmpInst instruction.
void SCCPSolver::visitCmpInst(CmpInst &I) {
// Do not cache this lookup, getValueState calls later in the function might
// invalidate the reference.
- if (ValueState[&I].isOverdefined()) return;
+ if (isOverdefined(ValueState[&I]))
+ return (void)markOverdefined(&I);
Value *Op1 = I.getOperand(0);
Value *Op2 = I.getOperand(1);
// For parameters, use ParamState which includes constant range info if
// available.
- auto V1Param = ParamState.find(Op1);
- ValueLatticeElement V1State = (V1Param != ParamState.end())
- ? V1Param->second
- : getValueState(Op1).toValueLattice();
-
- auto V2Param = ParamState.find(Op2);
- ValueLatticeElement V2State = V2Param != ParamState.end()
- ? V2Param->second
- : getValueState(Op2).toValueLattice();
+ auto V1State = getValueState(Op1);
+ auto V2State = getValueState(Op2);
Constant *C = V1State.getCompare(I.getPredicate(), I.getType(), V2State);
if (C) {
if (isa<UndefValue>(C))
return;
- LatticeVal CV;
+ ValueLatticeElement CV;
CV.markConstant(C);
mergeInValue(&I, CV);
return;
}
// If operands are still unknown, wait for it to resolve.
- if (!V1State.isOverdefined() && !V2State.isOverdefined() &&
- !ValueState[&I].isConstant())
+ if ((V1State.isUnknownOrUndef() || V2State.isUnknownOrUndef()) &&
+ !isConstant(ValueState[&I]))
return;
markOverdefined(&I);
@@ -1101,21 +1053,26 @@ void SCCPSolver::visitCmpInst(CmpInst &I) {
// Handle getelementptr instructions. If all operands are constants then we
// can turn this into a getelementptr ConstantExpr.
void SCCPSolver::visitGetElementPtrInst(GetElementPtrInst &I) {
- if (ValueState[&I].isOverdefined()) return;
+ if (isOverdefined(ValueState[&I]))
+ return (void)markOverdefined(&I);
SmallVector<Constant*, 8> Operands;
Operands.reserve(I.getNumOperands());
for (unsigned i = 0, e = I.getNumOperands(); i != e; ++i) {
- LatticeVal State = getValueState(I.getOperand(i));
- if (State.isUnknown())
+ ValueLatticeElement State = getValueState(I.getOperand(i));
+ if (State.isUnknownOrUndef())
return; // Operands are not resolved yet.
- if (State.isOverdefined())
+ if (isOverdefined(State))
return (void)markOverdefined(&I);
- assert(State.isConstant() && "Unknown state!");
- Operands.push_back(State.getConstant());
+ if (Constant *C = getConstant(State)) {
+ Operands.push_back(C);
+ continue;
+ }
+
+ return (void)markOverdefined(&I);
}
Constant *Ptr = Operands[0];
@@ -1136,230 +1093,297 @@ void SCCPSolver::visitStoreInst(StoreInst &SI) {
return;
GlobalVariable *GV = cast<GlobalVariable>(SI.getOperand(1));
- DenseMap<GlobalVariable*, LatticeVal>::iterator I = TrackedGlobals.find(GV);
- if (I == TrackedGlobals.end() || I->second.isOverdefined()) return;
+ auto I = TrackedGlobals.find(GV);
+ if (I == TrackedGlobals.end())
+ return;
// Get the value we are storing into the global, then merge it.
- mergeInValue(I->second, GV, getValueState(SI.getOperand(0)));
+ mergeInValue(I->second, GV, getValueState(SI.getOperand(0)),
+ ValueLatticeElement::MergeOptions().setCheckWiden(false));
if (I->second.isOverdefined())
TrackedGlobals.erase(I); // No need to keep tracking this!
}
+static ValueLatticeElement getValueFromMetadata(const Instruction *I) {
+ if (MDNode *Ranges = I->getMetadata(LLVMContext::MD_range))
+ if (I->getType()->isIntegerTy())
+ return ValueLatticeElement::getRange(
+ getConstantRangeFromMetadata(*Ranges));
+ // TODO: Also handle MD_nonnull.
+ return ValueLatticeElement::getOverdefined();
+}
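
getValueFromMetadata gives loads (and, via handleCallOverdefined, calls) a starting range even when their operands are opaque, by reading !range metadata. A sketch of how a single !range pair maps to a ConstantRange; the MDNode is built by hand here, standing in for IR like "%v = load i32, i32* %p, !range !{i32 0, i32 10}":

#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Metadata.h"
#include "llvm/IR/Type.h"
using namespace llvm;

ConstantRange RangeFromMetadataExample() {
  LLVMContext Ctx;
  Type *I32 = Type::getInt32Ty(Ctx);
  // !range pairs are half-open [lo, hi) bounds encoded as constant operands.
  Metadata *Ops[] = {ConstantAsMetadata::get(ConstantInt::get(I32, 0)),
                     ConstantAsMetadata::get(ConstantInt::get(I32, 10))};
  MDNode *RangeMD = MDNode::get(Ctx, Ops);
  return getConstantRangeFromMetadata(*RangeMD); // [0, 10)
}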
+
// Handle load instructions. If the operand is a constant pointer to a constant
// global, we can replace the load with the loaded constant value!
void SCCPSolver::visitLoadInst(LoadInst &I) {
- // If this load is of a struct, just mark the result overdefined.
- if (I.getType()->isStructTy())
+ // If this load is of a struct or the load is volatile, just mark the result
+ // as overdefined.
+ if (I.getType()->isStructTy() || I.isVolatile())
return (void)markOverdefined(&I);
- LatticeVal PtrVal = getValueState(I.getOperand(0));
- if (PtrVal.isUnknown()) return; // The pointer is not resolved yet!
-
- LatticeVal &IV = ValueState[&I];
- if (IV.isOverdefined()) return;
+ // ResolvedUndefsIn might mark I as overdefined. Bail out, even if we would
+ // discover a concrete value later.
+ if (ValueState[&I].isOverdefined())
+ return (void)markOverdefined(&I);
- if (!PtrVal.isConstant() || I.isVolatile())
- return (void)markOverdefined(IV, &I);
+ ValueLatticeElement PtrVal = getValueState(I.getOperand(0));
+ if (PtrVal.isUnknownOrUndef())
+ return; // The pointer is not resolved yet!
- Constant *Ptr = PtrVal.getConstant();
+ ValueLatticeElement &IV = ValueState[&I];
- // load null is undefined.
- if (isa<ConstantPointerNull>(Ptr)) {
- if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
- return (void)markOverdefined(IV, &I);
- else
- return;
- }
+ if (isConstant(PtrVal)) {
+ Constant *Ptr = getConstant(PtrVal);
- // Transform load (constant global) into the value loaded.
- if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
- if (!TrackedGlobals.empty()) {
- // If we are tracking this global, merge in the known value for it.
- DenseMap<GlobalVariable*, LatticeVal>::iterator It =
- TrackedGlobals.find(GV);
- if (It != TrackedGlobals.end()) {
- mergeInValue(IV, &I, It->second);
+ // load null is undefined.
+ if (isa<ConstantPointerNull>(Ptr)) {
+ if (NullPointerIsDefined(I.getFunction(), I.getPointerAddressSpace()))
+ return (void)markOverdefined(IV, &I);
+ else
return;
+ }
+
+ // Transform load (constant global) into the value loaded.
+ if (auto *GV = dyn_cast<GlobalVariable>(Ptr)) {
+ if (!TrackedGlobals.empty()) {
+ // If we are tracking this global, merge in the known value for it.
+ auto It = TrackedGlobals.find(GV);
+ if (It != TrackedGlobals.end()) {
+ mergeInValue(IV, &I, It->second, getMaxWidenStepsOpts());
+ return;
+ }
}
}
- }
- // Transform load from a constant into a constant if possible.
- if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(IV, &I, C);
+ // Transform load from a constant into a constant if possible.
+ if (Constant *C = ConstantFoldLoadFromConstPtr(Ptr, I.getType(), DL)) {
+ if (isa<UndefValue>(C))
+ return;
+ return (void)markConstant(IV, &I, C);
+ }
}
- // Otherwise we cannot say for certain what value this load will produce.
- // Bail out.
- markOverdefined(IV, &I);
+ // Fall back to metadata.
+ mergeInValue(&I, getValueFromMetadata(&I));
}
-void SCCPSolver::visitCallSite(CallSite CS) {
- Function *F = CS.getCalledFunction();
- Instruction *I = CS.getInstruction();
+void SCCPSolver::visitCallBase(CallBase &CB) {
+ handleCallResult(CB);
+ handleCallArguments(CB);
+}
- if (auto *II = dyn_cast<IntrinsicInst>(I)) {
- if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
- if (ValueState[I].isOverdefined())
+void SCCPSolver::handleCallOverdefined(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+
+ // Void return and not tracking callee, just bail.
+ if (CB.getType()->isVoidTy())
+ return;
+
+ // Always mark struct return as overdefined.
+ if (CB.getType()->isStructTy())
+ return (void)markOverdefined(&CB);
+
+ // Otherwise, if we have a single return value case, and if the function is
+ // a declaration, maybe we can constant fold it.
+ if (F && F->isDeclaration() && canConstantFoldCallTo(&CB, F)) {
+ SmallVector<Constant *, 8> Operands;
+ for (auto AI = CB.arg_begin(), E = CB.arg_end(); AI != E; ++AI) {
+ if (AI->get()->getType()->isStructTy())
+ return markOverdefined(&CB); // Can't handle struct args.
+ ValueLatticeElement State = getValueState(*AI);
+
+ if (State.isUnknownOrUndef())
+ return; // Operands are not resolved yet.
+ if (isOverdefined(State))
+ return (void)markOverdefined(&CB);
+ assert(isConstant(State) && "Unknown state!");
+ Operands.push_back(getConstant(State));
+ }
+
+ if (isOverdefined(getValueState(&CB)))
+ return (void)markOverdefined(&CB);
+
+ // If we can constant fold this, mark the result of the call as a
+ // constant.
+ if (Constant *C = ConstantFoldCall(&CB, F, Operands, &GetTLI(*F))) {
+ // call -> undef.
+ if (isa<UndefValue>(C))
return;
+ return (void)markConstant(&CB, C);
+ }
+ }
+
+ // Fall back to metadata.
+ mergeInValue(&CB, getValueFromMetadata(&CB));
+}
+
+void SCCPSolver::handleCallArguments(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
+ // If this is a local function that doesn't have its address taken, mark its
+ // entry block executable and merge in the actual arguments to the call into
+ // the formal arguments of the function.
+ if (!TrackingIncomingArguments.empty() &&
+ TrackingIncomingArguments.count(F)) {
+ MarkBlockExecutable(&F->front());
+
+ // Propagate information from this call site into the callee.
+ auto CAI = CB.arg_begin();
+ for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end(); AI != E;
+ ++AI, ++CAI) {
+ // If this argument is byval, and if the function is not readonly, there
+ // will be an implicit copy formed of the input aggregate.
+ if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
+ markOverdefined(&*AI);
+ continue;
+ }
+
+ if (auto *STy = dyn_cast<StructType>(AI->getType())) {
+ for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
+ ValueLatticeElement CallArg = getStructValueState(*CAI, i);
+ mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg,
+ getMaxWidenStepsOpts());
+ }
+ } else
+ mergeInValue(&*AI, getValueState(*CAI), getMaxWidenStepsOpts());
+ }
+ }
+}
+
+void SCCPSolver::handleCallResult(CallBase &CB) {
+ Function *F = CB.getCalledFunction();
- auto *PI = getPredicateInfoFor(I);
- if (!PI)
+ if (auto *II = dyn_cast<IntrinsicInst>(&CB)) {
+ if (II->getIntrinsicID() == Intrinsic::ssa_copy) {
+ if (ValueState[&CB].isOverdefined())
return;
- Value *CopyOf = I->getOperand(0);
- auto *PBranch = dyn_cast<PredicateBranch>(PI);
- if (!PBranch) {
- mergeInValue(ValueState[I], I, getValueState(CopyOf));
+ Value *CopyOf = CB.getOperand(0);
+ ValueLatticeElement CopyOfVal = getValueState(CopyOf);
+ auto *PI = getPredicateInfoFor(&CB);
+ assert(PI && "Missing predicate info for ssa.copy");
+
+ CmpInst *Cmp;
+ bool TrueEdge;
+ if (auto *PBranch = dyn_cast<PredicateBranch>(PI)) {
+ Cmp = dyn_cast<CmpInst>(PBranch->Condition);
+ TrueEdge = PBranch->TrueEdge;
+ } else if (auto *PAssume = dyn_cast<PredicateAssume>(PI)) {
+ Cmp = dyn_cast<CmpInst>(PAssume->Condition);
+ TrueEdge = true;
+ } else {
+ mergeInValue(ValueState[&CB], &CB, CopyOfVal);
return;
}
- Value *Cond = PBranch->Condition;
-
// Everything below relies on the condition being a comparison.
- auto *Cmp = dyn_cast<CmpInst>(Cond);
if (!Cmp) {
- mergeInValue(ValueState[I], I, getValueState(CopyOf));
+ mergeInValue(ValueState[&CB], &CB, CopyOfVal);
return;
}
+ Value *RenamedOp = PI->RenamedOp;
Value *CmpOp0 = Cmp->getOperand(0);
Value *CmpOp1 = Cmp->getOperand(1);
- if (CopyOf != CmpOp0 && CopyOf != CmpOp1) {
- mergeInValue(ValueState[I], I, getValueState(CopyOf));
+ // Bail out if neither of the operands matches RenamedOp.
+ if (CmpOp0 != RenamedOp && CmpOp1 != RenamedOp) {
+ mergeInValue(ValueState[&CB], &CB, getValueState(CopyOf));
return;
}
- if (CmpOp0 != CopyOf)
+ auto Pred = Cmp->getPredicate();
+ if (CmpOp1 == RenamedOp) {
std::swap(CmpOp0, CmpOp1);
+ Pred = Cmp->getSwappedPredicate();
+ }
- LatticeVal OriginalVal = getValueState(CopyOf);
- LatticeVal EqVal = getValueState(CmpOp1);
- LatticeVal &IV = ValueState[I];
- if (PBranch->TrueEdge && Cmp->getPredicate() == CmpInst::ICMP_EQ) {
- addAdditionalUser(CmpOp1, I);
- if (OriginalVal.isConstant())
- mergeInValue(IV, I, OriginalVal);
- else
- mergeInValue(IV, I, EqVal);
+ // Wait until CmpOp1 is resolved.
+ if (getValueState(CmpOp1).isUnknown()) {
+ addAdditionalUser(CmpOp1, &CB);
return;
}
- if (!PBranch->TrueEdge && Cmp->getPredicate() == CmpInst::ICMP_NE) {
- addAdditionalUser(CmpOp1, I);
- if (OriginalVal.isConstant())
- mergeInValue(IV, I, OriginalVal);
- else
- mergeInValue(IV, I, EqVal);
+
+ // The code below relies on PredicateInfo only inserting copies for the
+ // true branch when the branch condition is an AND and only inserting
+ // copies for the false branch when the branch condition is an OR. This
+ // ensures we can intersect the range from the condition with the range of
+ // CopyOf.
+ if (!TrueEdge)
+ Pred = CmpInst::getInversePredicate(Pred);
+
+ ValueLatticeElement CondVal = getValueState(CmpOp1);
+ ValueLatticeElement &IV = ValueState[&CB];
+ if (CondVal.isConstantRange() || CopyOfVal.isConstantRange()) {
+ auto ImposedCR =
+ ConstantRange::getFull(DL.getTypeSizeInBits(CopyOf->getType()));
+
+ // Get the range imposed by the condition.
+ if (CondVal.isConstantRange())
+ ImposedCR = ConstantRange::makeAllowedICmpRegion(
+ Pred, CondVal.getConstantRange());
+
+ // Combine range info for the original value with the new range from the
+ // condition.
+ auto CopyOfCR = CopyOfVal.isConstantRange()
+ ? CopyOfVal.getConstantRange()
+ : ConstantRange::getFull(
+ DL.getTypeSizeInBits(CopyOf->getType()));
+ auto NewCR = ImposedCR.intersectWith(CopyOfCR);
+ // If the existing information is != x, do not use the information from
+ // a chained predicate, as the != x information is more likely to be
+ // helpful in practice.
+ if (!CopyOfCR.contains(NewCR) && CopyOfCR.getSingleMissingElement())
+ NewCR = CopyOfCR;
+
+ addAdditionalUser(CmpOp1, &CB);
+      // TODO: Actually flip MayIncludeUndef for the created range to false,
+      // once most places in the optimizer respect the rule that branches on
+      // undef/poison are UB. The reason why the new range cannot be undef is
+      // as follows:
+      // The new range is based on a branch condition. That guarantees that
+      // neither of the compare operands can be undef in the branch targets,
+      // unless we have conditions that are always true/false (e.g. icmp ule
+      // i32 %a, i32_max). For the latter, an overdefined/empty range will be
+      // inferred, but the branch will get folded accordingly anyway.
+ mergeInValue(
+ IV, &CB,
+ ValueLatticeElement::getRange(NewCR, /*MayIncludeUndef=*/true));
+ return;
+ } else if (Pred == CmpInst::ICMP_EQ && CondVal.isConstant()) {
+ // For non-integer values or integer constant expressions, only
+ // propagate equal constants.
+ addAdditionalUser(CmpOp1, &CB);
+ mergeInValue(IV, &CB, CondVal);
return;
}
- return (void)mergeInValue(IV, I, getValueState(CopyOf));
+ return (void)mergeInValue(IV, &CB, CopyOfVal);
}
}
// The common case is that we aren't tracking the callee, either because we
// are not doing interprocedural analysis or the callee is indirect, or is
// external. Handle these cases first.
- if (!F || F->isDeclaration()) {
-CallOverdefined:
- // Void return and not tracking callee, just bail.
- if (I->getType()->isVoidTy()) return;
-
- // Otherwise, if we have a single return value case, and if the function is
- // a declaration, maybe we can constant fold it.
- if (F && F->isDeclaration() && !I->getType()->isStructTy() &&
- canConstantFoldCallTo(cast<CallBase>(CS.getInstruction()), F)) {
- SmallVector<Constant*, 8> Operands;
- for (CallSite::arg_iterator AI = CS.arg_begin(), E = CS.arg_end();
- AI != E; ++AI) {
- if (AI->get()->getType()->isStructTy())
- return markOverdefined(I); // Can't handle struct args.
- LatticeVal State = getValueState(*AI);
-
- if (State.isUnknown())
- return; // Operands are not resolved yet.
- if (State.isOverdefined())
- return (void)markOverdefined(I);
- assert(State.isConstant() && "Unknown state!");
- Operands.push_back(State.getConstant());
- }
-
- if (getValueState(I).isOverdefined())
- return;
-
- // If we can constant fold this, mark the result of the call as a
- // constant.
- if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()), F,
- Operands, &GetTLI(*F))) {
- // call -> undef.
- if (isa<UndefValue>(C))
- return;
- return (void)markConstant(I, C);
- }
- }
-
- // Otherwise, we don't know anything about this call, mark it overdefined.
- return (void)markOverdefined(I);
- }
-
- // If this is a local function that doesn't have its address taken, mark its
- // entry block executable and merge in the actual arguments to the call into
- // the formal arguments of the function.
- if (!TrackingIncomingArguments.empty() && TrackingIncomingArguments.count(F)){
- MarkBlockExecutable(&F->front());
-
- // Propagate information from this call site into the callee.
- CallSite::arg_iterator CAI = CS.arg_begin();
- for (Function::arg_iterator AI = F->arg_begin(), E = F->arg_end();
- AI != E; ++AI, ++CAI) {
- // If this argument is byval, and if the function is not readonly, there
- // will be an implicit copy formed of the input aggregate.
- if (AI->hasByValAttr() && !F->onlyReadsMemory()) {
- markOverdefined(&*AI);
- continue;
- }
-
- if (auto *STy = dyn_cast<StructType>(AI->getType())) {
- for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- LatticeVal CallArg = getStructValueState(*CAI, i);
- mergeInValue(getStructValueState(&*AI, i), &*AI, CallArg);
- }
- } else {
- // Most other parts of the Solver still only use the simpler value
- // lattice, so we propagate changes for parameters to both lattices.
- LatticeVal ConcreteArgument = getValueState(*CAI);
- bool ParamChanged =
- getParamState(&*AI).mergeIn(ConcreteArgument.toValueLattice(), DL);
- bool ValueChanged = mergeInValue(&*AI, ConcreteArgument);
- // Add argument to work list, if the state of a parameter changes but
- // ValueState does not change (because it is already overdefined there),
- // We have to take changes in ParamState into account, as it is used
- // when evaluating Cmp instructions.
- if (!ValueChanged && ParamChanged)
- pushToWorkList(ValueState[&*AI], &*AI);
- }
- }
- }
+ if (!F || F->isDeclaration())
+ return handleCallOverdefined(CB);
// If this is a single/zero retval case, see if we're tracking the function.
if (auto *STy = dyn_cast<StructType>(F->getReturnType())) {
if (!MRVFunctionsTracked.count(F))
- goto CallOverdefined; // Not tracking this callee.
+ return handleCallOverdefined(CB); // Not tracking this callee.
// If we are tracking this callee, propagate the result of the function
// into this call site.
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i)
- mergeInValue(getStructValueState(I, i), I,
- TrackedMultipleRetVals[std::make_pair(F, i)]);
+ mergeInValue(getStructValueState(&CB, i), &CB,
+ TrackedMultipleRetVals[std::make_pair(F, i)],
+ getMaxWidenStepsOpts());
} else {
- MapVector<Function*, LatticeVal>::iterator TFRVI = TrackedRetVals.find(F);
+ auto TFRVI = TrackedRetVals.find(F);
if (TFRVI == TrackedRetVals.end())
- goto CallOverdefined; // Not tracking this callee.
+ return handleCallOverdefined(CB); // Not tracking this callee.
// If so, propagate the return value of the callee into this call result.
- mergeInValue(I, TFRVI->second);
+ mergeInValue(&CB, TFRVI->second, getMaxWidenStepsOpts());
}
}
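
The ssa.copy handling above reduces to one range computation: take the region that the (possibly swapped and, on false edges, inverted) predicate allows for the copied value, then intersect it with whatever range was already known. A minimal sketch, detached from PredicateInfo, with made-up values (assumes LLVM's ConstantRange API):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/InstrTypes.h"
using namespace llvm;

// On the true edge of "icmp ult %x, 100", %x must lie in [0, 100).
// Intersecting with a previously known range [50, 200) leaves [50, 100).
ConstantRange RefineOnTrueEdge() {
  ConstantRange Known(APInt(32, 50), APInt(32, 200)); // prior info for %x
  ConstantRange RHS(APInt(32, 100), APInt(32, 101));  // the constant 100
  ConstantRange Imposed =
      ConstantRange::makeAllowedICmpRegion(CmpInst::ICMP_ULT, RHS);
  return Imposed.intersectWith(Known); // [50, 100)
}

On a false edge the solver first inverts the predicate, which in this example would turn ICMP_ULT into ICMP_UGE before computing the allowed region.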
@@ -1429,10 +1453,8 @@ void SCCPSolver::Solve() {
/// constraints on the condition of the branch, as that would impact other users
/// of the value.
///
-/// This scan also checks for values that use undefs, whose results are actually
-/// defined. For example, 'zext i8 undef to i32' should produce all zeros
-/// conservatively, as "(zext i8 X -> i32) & 0xFF00" must always return zero,
-/// even if X isn't defined.
+/// This scan also checks for values that use undefs. It conservatively marks
+/// them as overdefined.
bool SCCPSolver::ResolvedUndefsIn(Function &F) {
for (BasicBlock &BB : F) {
if (!BBExecutable.count(&BB))
@@ -1446,8 +1468,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// Only a few things that can be structs matter for undef.
// Tracked calls must never be marked overdefined in ResolvedUndefsIn.
- if (CallSite CS = CallSite(&I))
- if (Function *F = CS.getCalledFunction())
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *F = CB->getCalledFunction())
if (MRVFunctionsTracked.count(F))
continue;
@@ -1455,19 +1477,18 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// tracked as precisely as their operands.
if (isa<ExtractValueInst>(I) || isa<InsertValueInst>(I))
continue;
-
// Send the results of everything else to overdefined. We could be
// more precise than this but it isn't worth bothering.
for (unsigned i = 0, e = STy->getNumElements(); i != e; ++i) {
- LatticeVal &LV = getStructValueState(&I, i);
- if (LV.isUnknown())
+ ValueLatticeElement &LV = getStructValueState(&I, i);
+ if (LV.isUnknownOrUndef())
markOverdefined(LV, &I);
}
continue;
}
- LatticeVal &LV = getValueState(&I);
- if (!LV.isUnknown())
+ ValueLatticeElement &LV = getValueState(&I);
+ if (!LV.isUnknownOrUndef())
continue;
// There are two reasons a call can have an undef result
@@ -1475,195 +1496,20 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
// 2. It could be constant-foldable.
// Because of the way we solve return values, tracked calls must
// never be marked overdefined in ResolvedUndefsIn.
- if (CallSite CS = CallSite(&I)) {
- if (Function *F = CS.getCalledFunction())
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (Function *F = CB->getCalledFunction())
if (TrackedRetVals.count(F))
continue;
- // If the call is constant-foldable, we mark it overdefined because
- // we do not know what return values are valid.
- markOverdefined(&I);
- return true;
- }
-
- // extractvalue is safe; check here because the argument is a struct.
- if (isa<ExtractValueInst>(I))
- continue;
-
- // Compute the operand LatticeVals, for convenience below.
- // Anything taking a struct is conservatively assumed to require
- // overdefined markings.
- if (I.getOperand(0)->getType()->isStructTy()) {
- markOverdefined(&I);
- return true;
- }
- LatticeVal Op0LV = getValueState(I.getOperand(0));
- LatticeVal Op1LV;
- if (I.getNumOperands() == 2) {
- if (I.getOperand(1)->getType()->isStructTy()) {
- markOverdefined(&I);
- return true;
- }
-
- Op1LV = getValueState(I.getOperand(1));
- }
- // If this is an instructions whose result is defined even if the input is
- // not fully defined, propagate the information.
- Type *ITy = I.getType();
- switch (I.getOpcode()) {
- case Instruction::Add:
- case Instruction::Sub:
- case Instruction::Trunc:
- case Instruction::FPTrunc:
- case Instruction::BitCast:
- break; // Any undef -> undef
- case Instruction::FSub:
- case Instruction::FAdd:
- case Instruction::FMul:
- case Instruction::FDiv:
- case Instruction::FRem:
- // Floating-point binary operation: be conservative.
- if (Op0LV.isUnknown() && Op1LV.isUnknown())
- markForcedConstant(&I, Constant::getNullValue(ITy));
- else
- markOverdefined(&I);
- return true;
- case Instruction::FNeg:
- break; // fneg undef -> undef
- case Instruction::ZExt:
- case Instruction::SExt:
- case Instruction::FPToUI:
- case Instruction::FPToSI:
- case Instruction::FPExt:
- case Instruction::PtrToInt:
- case Instruction::IntToPtr:
- case Instruction::SIToFP:
- case Instruction::UIToFP:
- // undef -> 0; some outputs are impossible
- markForcedConstant(&I, Constant::getNullValue(ITy));
- return true;
- case Instruction::Mul:
- case Instruction::And:
- // Both operands undef -> undef
- if (Op0LV.isUnknown() && Op1LV.isUnknown())
- break;
- // undef * X -> 0. X could be zero.
- // undef & X -> 0. X could be zero.
- markForcedConstant(&I, Constant::getNullValue(ITy));
- return true;
- case Instruction::Or:
- // Both operands undef -> undef
- if (Op0LV.isUnknown() && Op1LV.isUnknown())
- break;
- // undef | X -> -1. X could be -1.
- markForcedConstant(&I, Constant::getAllOnesValue(ITy));
- return true;
- case Instruction::Xor:
- // undef ^ undef -> 0; strictly speaking, this is not strictly
- // necessary, but we try to be nice to people who expect this
- // behavior in simple cases
- if (Op0LV.isUnknown() && Op1LV.isUnknown()) {
- markForcedConstant(&I, Constant::getNullValue(ITy));
- return true;
- }
- // undef ^ X -> undef
- break;
- case Instruction::SDiv:
- case Instruction::UDiv:
- case Instruction::SRem:
- case Instruction::URem:
- // X / undef -> undef. No change.
- // X % undef -> undef. No change.
- if (Op1LV.isUnknown()) break;
-
- // X / 0 -> undef. No change.
- // X % 0 -> undef. No change.
- if (Op1LV.isConstant() && Op1LV.getConstant()->isZeroValue())
- break;
-
- // undef / X -> 0. X could be maxint.
- // undef % X -> 0. X could be 1.
- markForcedConstant(&I, Constant::getNullValue(ITy));
- return true;
- case Instruction::AShr:
- // X >>a undef -> undef.
- if (Op1LV.isUnknown()) break;
-
- // Shifting by the bitwidth or more is undefined.
- if (Op1LV.isConstant()) {
- if (auto *ShiftAmt = Op1LV.getConstantInt())
- if (ShiftAmt->getLimitedValue() >=
- ShiftAmt->getType()->getScalarSizeInBits())
- break;
- }
-
- // undef >>a X -> 0
- markForcedConstant(&I, Constant::getNullValue(ITy));
- return true;
- case Instruction::LShr:
- case Instruction::Shl:
- // X << undef -> undef.
- // X >> undef -> undef.
- if (Op1LV.isUnknown()) break;
-
- // Shifting by the bitwidth or more is undefined.
- if (Op1LV.isConstant()) {
- if (auto *ShiftAmt = Op1LV.getConstantInt())
- if (ShiftAmt->getLimitedValue() >=
- ShiftAmt->getType()->getScalarSizeInBits())
- break;
- }
-
- // undef << X -> 0
- // undef >> X -> 0
- markForcedConstant(&I, Constant::getNullValue(ITy));
- return true;
- case Instruction::Select:
- Op1LV = getValueState(I.getOperand(1));
- // undef ? X : Y -> X or Y. There could be commonality between X/Y.
- if (Op0LV.isUnknown()) {
- if (!Op1LV.isConstant()) // Pick the constant one if there is any.
- Op1LV = getValueState(I.getOperand(2));
- } else if (Op1LV.isUnknown()) {
- // c ? undef : undef -> undef. No change.
- Op1LV = getValueState(I.getOperand(2));
- if (Op1LV.isUnknown())
- break;
- // Otherwise, c ? undef : x -> x.
- } else {
- // Leave Op1LV as Operand(1)'s LatticeValue.
- }
-
- if (Op1LV.isConstant())
- markForcedConstant(&I, Op1LV.getConstant());
- else
- markOverdefined(&I);
- return true;
- case Instruction::Load:
+ if (isa<LoadInst>(I)) {
// A load here means one of two things: a load of undef from a global,
// a load from an unknown pointer. Either way, having it return undef
// is okay.
- break;
- case Instruction::ICmp:
- // X == undef -> undef. Other comparisons get more complicated.
- Op0LV = getValueState(I.getOperand(0));
- Op1LV = getValueState(I.getOperand(1));
-
- if ((Op0LV.isUnknown() || Op1LV.isUnknown()) &&
- cast<ICmpInst>(&I)->isEquality())
- break;
- markOverdefined(&I);
- return true;
- case Instruction::Call:
- case Instruction::Invoke:
- case Instruction::CallBr:
- llvm_unreachable("Call-like instructions should have be handled early");
- default:
- // If we don't know what should happen here, conservatively mark it
- // overdefined.
- markOverdefined(&I);
- return true;
+ continue;
}
+
+ markOverdefined(&I);
+ return true;
}
// Check to see if we have a branch or switch on an undefined value. If so
@@ -1672,7 +1518,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
Instruction *TI = BB.getTerminator();
if (auto *BI = dyn_cast<BranchInst>(TI)) {
if (!BI->isConditional()) continue;
- if (!getValueState(BI->getCondition()).isUnknown())
+ if (!getValueState(BI->getCondition()).isUnknownOrUndef())
continue;
// If the input to SCCP is actually branch on undef, fix the undef to
@@ -1700,7 +1546,7 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
if (IBR->getNumSuccessors() < 1)
continue;
- if (!getValueState(IBR->getAddress()).isUnknown())
+ if (!getValueState(IBR->getAddress()).isUnknownOrUndef())
continue;
// If the input to SCCP is actually branch on undef, fix the undef to
@@ -1724,7 +1570,8 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
}
if (auto *SI = dyn_cast<SwitchInst>(TI)) {
- if (!SI->getNumCases() || !getValueState(SI->getCondition()).isUnknown())
+ if (!SI->getNumCases() ||
+ !getValueState(SI->getCondition()).isUnknownOrUndef())
continue;
// If the input to SCCP is actually switch on undef, fix the undef to
@@ -1753,25 +1600,26 @@ bool SCCPSolver::ResolvedUndefsIn(Function &F) {
static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
Constant *Const = nullptr;
if (V->getType()->isStructTy()) {
- std::vector<LatticeVal> IVs = Solver.getStructLatticeValueFor(V);
- if (llvm::any_of(IVs,
- [](const LatticeVal &LV) { return LV.isOverdefined(); }))
+ std::vector<ValueLatticeElement> IVs = Solver.getStructLatticeValueFor(V);
+ if (any_of(IVs,
+ [](const ValueLatticeElement &LV) { return isOverdefined(LV); }))
return false;
std::vector<Constant *> ConstVals;
auto *ST = cast<StructType>(V->getType());
for (unsigned i = 0, e = ST->getNumElements(); i != e; ++i) {
- LatticeVal V = IVs[i];
- ConstVals.push_back(V.isConstant()
- ? V.getConstant()
+ ValueLatticeElement V = IVs[i];
+ ConstVals.push_back(isConstant(V)
+ ? Solver.getConstant(V)
: UndefValue::get(ST->getElementType(i)));
}
Const = ConstantStruct::get(ST, ConstVals);
} else {
- const LatticeVal &IV = Solver.getLatticeValueFor(V);
- if (IV.isOverdefined())
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(V);
+ if (isOverdefined(IV))
return false;
- Const = IV.isConstant() ? IV.getConstant() : UndefValue::get(V->getType());
+ Const =
+ isConstant(IV) ? Solver.getConstant(IV) : UndefValue::get(V->getType());
}
assert(Const && "Constant is nullptr here!");
@@ -1779,8 +1627,7 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
// unless the call itself can be removed
CallInst *CI = dyn_cast<CallInst>(V);
if (CI && CI->isMustTailCall() && !CI->isSafeToRemove()) {
- CallSite CS(CI);
- Function *F = CS.getCalledFunction();
+ Function *F = CI->getCalledFunction();
// Don't zap returns of the callee
if (F)
@@ -1798,13 +1645,49 @@ static bool tryToReplaceWithConstant(SCCPSolver &Solver, Value *V) {
return true;
}
+static bool simplifyInstsInBlock(SCCPSolver &Solver, BasicBlock &BB,
+ SmallPtrSetImpl<Value *> &InsertedValues,
+ Statistic &InstRemovedStat,
+ Statistic &InstReplacedStat) {
+ bool MadeChanges = false;
+ for (Instruction &Inst : make_early_inc_range(BB)) {
+ if (Inst.getType()->isVoidTy())
+ continue;
+ if (tryToReplaceWithConstant(Solver, &Inst)) {
+ if (Inst.isSafeToRemove())
+ Inst.eraseFromParent();
+ // Hey, we just changed something!
+ MadeChanges = true;
+ ++InstRemovedStat;
+ } else if (isa<SExtInst>(&Inst)) {
+ Value *ExtOp = Inst.getOperand(0);
+ if (isa<Constant>(ExtOp) || InsertedValues.count(ExtOp))
+ continue;
+ const ValueLatticeElement &IV = Solver.getLatticeValueFor(ExtOp);
+ if (!IV.isConstantRange(/*UndefAllowed=*/false))
+ continue;
+ if (IV.getConstantRange().isAllNonNegative()) {
+ auto *ZExt = new ZExtInst(ExtOp, Inst.getType(), "", &Inst);
+ InsertedValues.insert(ZExt);
+ Inst.replaceAllUsesWith(ZExt);
+ Solver.removeLatticeValueFor(&Inst);
+ Inst.eraseFromParent();
+ InstReplacedStat++;
+ MadeChanges = true;
+ }
+ }
+ }
+ return MadeChanges;
+}
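
The new sext-to-zext rewrite is justified purely by the operand's lattice range: when every value in the range is non-negative, the sign bit is never set, so sign- and zero-extension agree. A tiny sketch of the check (the helper name is illustrative, not the pass's API):

#include "llvm/ADT/APInt.h"
#include "llvm/IR/ConstantRange.h"
using namespace llvm;

// If %x : i8 is known to be in [0, 100), then
//   sext i8 %x to i32  ==  zext i8 %x to i32
// because the sign bit can never be set.
bool SExtActsLikeZExt(const ConstantRange &OpRange) {
  return OpRange.isAllNonNegative();
}
// SExtActsLikeZExt(ConstantRange(APInt(8, 0), APInt(8, 100))) -> true
// SExtActsLikeZExt(ConstantRange(APInt(8, 0), APInt(8, 200))) -> false, since
// the range includes values >= 128, whose sign bit is set.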
+
// runSCCP() - Run the Sparse Conditional Constant Propagation algorithm,
// and return true if the function was modified.
static bool runSCCP(Function &F, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
LLVM_DEBUG(dbgs() << "SCCP on function '" << F.getName() << "'\n");
SCCPSolver Solver(
- DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; });
+ DL, [TLI](Function &F) -> const TargetLibraryInfo & { return *TLI; },
+ F.getContext());
// Mark the first block of the function as being executable.
Solver.MarkBlockExecutable(&F.front());
@@ -1827,6 +1710,7 @@ static bool runSCCP(Function &F, const DataLayout &DL,
// delete their contents now. Note that we cannot actually delete the blocks,
// as we cannot modify the CFG of the function.
+ SmallPtrSet<Value *, 32> InsertedValues;
for (BasicBlock &BB : F) {
if (!Solver.isBlockExecutable(&BB)) {
LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
@@ -1838,21 +1722,8 @@ static bool runSCCP(Function &F, const DataLayout &DL,
continue;
}
- // Iterate over all of the instructions in a function, replacing them with
- // constants if we have found them to be of constant values.
- for (BasicBlock::iterator BI = BB.begin(), E = BB.end(); BI != E;) {
- Instruction *Inst = &*BI++;
- if (Inst->getType()->isVoidTy() || Inst->isTerminator())
- continue;
-
- if (tryToReplaceWithConstant(Solver, Inst)) {
- if (isInstructionTriviallyDead(Inst))
- Inst->eraseFromParent();
- // Hey, we just changed something!
- MadeChanges = true;
- ++NumInstRemoved;
- }
- }
+ MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
+ NumInstRemoved, NumInstReplaced);
}
return MadeChanges;
@@ -1942,14 +1813,15 @@ static void findReturnsToZap(Function &F,
   // uses (like blockaddresses) could stick around, without being
// used in the underlying IR, meaning we do not have lattice
// values for them.
- if (!CallSite(U))
+ if (!isa<CallBase>(U))
return true;
if (U->getType()->isStructTy()) {
- return all_of(
- Solver.getStructLatticeValueFor(U),
- [](const LatticeVal &LV) { return !LV.isOverdefined(); });
+ return all_of(Solver.getStructLatticeValueFor(U),
+ [](const ValueLatticeElement &LV) {
+ return !isOverdefined(LV);
+ });
}
- return !Solver.getLatticeValueFor(U).isOverdefined();
+ return !isOverdefined(Solver.getLatticeValueFor(U));
}) &&
"We can only zap functions where all live users have a concrete value");
@@ -2006,7 +1878,7 @@ bool llvm::runIPSCCP(
Module &M, const DataLayout &DL,
std::function<const TargetLibraryInfo &(Function &)> GetTLI,
function_ref<AnalysisResultsForFn(Function &)> getAnalysis) {
- SCCPSolver Solver(DL, GetTLI);
+ SCCPSolver Solver(DL, GetTLI, M.getContext());
// Loop over all functions, marking arguments to those with their addresses
// taken or that are external as overdefined.
@@ -2080,30 +1952,21 @@ bool llvm::runIPSCCP(
}
}
- for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) {
- if (!Solver.isBlockExecutable(&*BB)) {
- LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << *BB);
+ SmallPtrSet<Value *, 32> InsertedValues;
+ for (BasicBlock &BB : F) {
+ if (!Solver.isBlockExecutable(&BB)) {
+ LLVM_DEBUG(dbgs() << " BasicBlock Dead:" << BB);
++NumDeadBlocks;
MadeChanges = true;
- if (&*BB != &F.front())
- BlocksToErase.push_back(&*BB);
+ if (&BB != &F.front())
+ BlocksToErase.push_back(&BB);
continue;
}
- for (BasicBlock::iterator BI = BB->begin(), E = BB->end(); BI != E; ) {
- Instruction *Inst = &*BI++;
- if (Inst->getType()->isVoidTy())
- continue;
- if (tryToReplaceWithConstant(Solver, Inst)) {
- if (Inst->isSafeToRemove())
- Inst->eraseFromParent();
- // Hey, we just changed something!
- MadeChanges = true;
- ++IPNumInstRemoved;
- }
- }
+ MadeChanges |= simplifyInstsInBlock(Solver, BB, InsertedValues,
+ IPNumInstRemoved, IPNumInstReplaced);
}
DomTreeUpdater DTU = Solver.getDTU(F);
@@ -2189,10 +2052,9 @@ bool llvm::runIPSCCP(
// whether other functions are optimizable.
SmallVector<ReturnInst*, 8> ReturnsToZap;
- const MapVector<Function*, LatticeVal> &RV = Solver.getTrackedRetVals();
- for (const auto &I : RV) {
+ for (const auto &I : Solver.getTrackedRetVals()) {
Function *F = I.first;
- if (I.second.isOverdefined() || F->getReturnType()->isVoidTy())
+ if (isOverdefined(I.second) || F->getReturnType()->isVoidTy())
continue;
findReturnsToZap(*F, ReturnsToZap, Solver);
}
@@ -2213,17 +2075,16 @@ bool llvm::runIPSCCP(
// If we inferred constant or undef values for globals variables, we can
// delete the global and any stores that remain to it.
- const DenseMap<GlobalVariable*, LatticeVal> &TG = Solver.getTrackedGlobals();
- for (DenseMap<GlobalVariable*, LatticeVal>::const_iterator I = TG.begin(),
- E = TG.end(); I != E; ++I) {
- GlobalVariable *GV = I->first;
- assert(!I->second.isOverdefined() &&
- "Overdefined values should have been taken out of the map!");
+ for (auto &I : make_early_inc_range(Solver.getTrackedGlobals())) {
+ GlobalVariable *GV = I.first;
+ if (isOverdefined(I.second))
+ continue;
LLVM_DEBUG(dbgs() << "Found that GV '" << GV->getName()
<< "' is constant!\n");
while (!GV->use_empty()) {
StoreInst *SI = cast<StoreInst>(GV->user_back());
SI->eraseFromParent();
+ MadeChanges = true;
}
M.getGlobalList().erase(GV);
++IPNumGlobalConst;
diff --git a/llvm/lib/Transforms/Scalar/SROA.cpp b/llvm/lib/Transforms/Scalar/SROA.cpp
index 89916e43fce2..89f324deef9f 100644
--- a/llvm/lib/Transforms/Scalar/SROA.cpp
+++ b/llvm/lib/Transforms/Scalar/SROA.cpp
@@ -94,11 +94,6 @@
#include <utility>
#include <vector>
-#ifndef NDEBUG
-// We only use this for a debug check.
-#include <random>
-#endif
-
using namespace llvm;
using namespace llvm::sroa;
@@ -115,11 +110,6 @@ STATISTIC(NumLoadsSpeculated, "Number of loads speculated to allow promotion");
STATISTIC(NumDeleted, "Number of instructions deleted");
STATISTIC(NumVectorized, "Number of vectorized aggregates");
-/// Hidden option to enable randomly shuffling the slices to help uncover
-/// instability in their order.
-static cl::opt<bool> SROARandomShuffleSlices("sroa-random-shuffle-slices",
- cl::init(false), cl::Hidden);
-
/// Hidden option to experiment with completely strict handling of inbounds
/// GEPs.
static cl::opt<bool> SROAStrictInbounds("sroa-strict-inbounds", cl::init(false),
@@ -129,7 +119,7 @@ namespace {
/// A custom IRBuilder inserter which prefixes all names, but only in
/// Assert builds.
-class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
+class IRBuilderPrefixedInserter final : public IRBuilderDefaultInserter {
std::string Prefix;
const Twine getNameWithPrefix(const Twine &Name) const {
@@ -139,9 +129,8 @@ class IRBuilderPrefixedInserter : public IRBuilderDefaultInserter {
public:
void SetNamePrefix(const Twine &P) { Prefix = P.str(); }
-protected:
void InsertHelper(Instruction *I, const Twine &Name, BasicBlock *BB,
- BasicBlock::iterator InsertPt) const {
+ BasicBlock::iterator InsertPt) const override {
IRBuilderDefaultInserter::InsertHelper(I, getNameWithPrefix(Name), BB,
InsertPt);
}
@@ -663,7 +652,8 @@ class AllocaSlices::SliceBuilder : public PtrUseVisitor<SliceBuilder> {
public:
SliceBuilder(const DataLayout &DL, AllocaInst &AI, AllocaSlices &AS)
: PtrUseVisitor<SliceBuilder>(DL),
- AllocSize(DL.getTypeAllocSize(AI.getAllocatedType())), AS(AS) {}
+ AllocSize(DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize()),
+ AS(AS) {}
private:
void markAsDead(Instruction &I) {
@@ -752,8 +742,10 @@ private:
// For array or vector indices, scale the index by the size of the
// type.
APInt Index = OpC->getValue().sextOrTrunc(Offset.getBitWidth());
- GEPOffset += Index * APInt(Offset.getBitWidth(),
- DL.getTypeAllocSize(GTI.getIndexedType()));
+ GEPOffset +=
+ Index *
+ APInt(Offset.getBitWidth(),
+ DL.getTypeAllocSize(GTI.getIndexedType()).getFixedSize());
}
// If this index has computed an intermediate pointer which is not
@@ -788,7 +780,7 @@ private:
LI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
return PI.setAborted(&LI);
- uint64_t Size = DL.getTypeStoreSize(LI.getType());
+ uint64_t Size = DL.getTypeStoreSize(LI.getType()).getFixedSize();
return handleLoadOrStore(LI.getType(), LI, Offset, Size, LI.isVolatile());
}
@@ -803,7 +795,7 @@ private:
SI.getPointerAddressSpace() != DL.getAllocaAddrSpace())
return PI.setAborted(&SI);
- uint64_t Size = DL.getTypeStoreSize(ValOp->getType());
+ uint64_t Size = DL.getTypeStoreSize(ValOp->getType()).getFixedSize();
// If this memory access can be shown to *statically* extend outside the
    // bounds of the allocation, its behavior is undefined, so simply
@@ -1069,17 +1061,9 @@ AllocaSlices::AllocaSlices(const DataLayout &DL, AllocaInst &AI)
llvm::remove_if(Slices, [](const Slice &S) { return S.isDead(); }),
Slices.end());
-#ifndef NDEBUG
- if (SROARandomShuffleSlices) {
- std::mt19937 MT(static_cast<unsigned>(
- std::chrono::system_clock::now().time_since_epoch().count()));
- std::shuffle(Slices.begin(), Slices.end(), MT);
- }
-#endif
-
// Sort the uses. This arranges for the offsets to be in ascending order,
// and the sizes to be in descending order.
- llvm::sort(Slices);
+ std::stable_sort(Slices.begin(), Slices.end());
}
#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
@@ -1200,7 +1184,7 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
// TODO: Allow recursive phi users.
// TODO: Allow stores.
BasicBlock *BB = PN.getParent();
- MaybeAlign MaxAlign;
+ Align MaxAlign;
uint64_t APWidth = DL.getIndexTypeSizeInBits(PN.getType());
APInt MaxSize(APWidth, 0);
bool HaveLoad = false;
@@ -1221,8 +1205,8 @@ static bool isSafePHIToSpeculate(PHINode &PN) {
if (BBI->mayWriteToMemory())
return false;
- uint64_t Size = DL.getTypeStoreSize(LI->getType());
- MaxAlign = std::max(MaxAlign, MaybeAlign(LI->getAlignment()));
+ uint64_t Size = DL.getTypeStoreSize(LI->getType()).getFixedSize();
+ MaxAlign = std::max(MaxAlign, LI->getAlign());
MaxSize = MaxSize.ult(Size) ? APInt(APWidth, Size) : MaxSize;
HaveLoad = true;
}
@@ -1273,7 +1257,7 @@ static void speculatePHINodeLoads(PHINode &PN) {
// matter which one we get and if any differ.
AAMDNodes AATags;
SomeLoad->getAAMetadata(AATags);
- const MaybeAlign Align = MaybeAlign(SomeLoad->getAlignment());
+ Align Alignment = SomeLoad->getAlign();
// Rewrite all loads of the PN to use the new PHI.
while (!PN.use_empty()) {
@@ -1300,11 +1284,10 @@ static void speculatePHINodeLoads(PHINode &PN) {
Instruction *TI = Pred->getTerminator();
IRBuilderTy PredBuilder(TI);
- LoadInst *Load = PredBuilder.CreateLoad(
- LoadTy, InVal,
+ LoadInst *Load = PredBuilder.CreateAlignedLoad(
+ LoadTy, InVal, Alignment,
(PN.getName() + ".sroa.speculate.load." + Pred->getName()));
++NumLoadsSpeculated;
- Load->setAlignment(Align);
if (AATags)
Load->setAAMetadata(AATags);
NewPN->addIncoming(Load, Pred);
@@ -1342,10 +1325,10 @@ static bool isSafeSelectToSpeculate(SelectInst &SI) {
// absolutely (e.g. allocas) or at this point because we can see other
// accesses to it.
if (!isSafeToLoadUnconditionally(TValue, LI->getType(),
- MaybeAlign(LI->getAlignment()), DL, LI))
+ LI->getAlign(), DL, LI))
return false;
if (!isSafeToLoadUnconditionally(FValue, LI->getType(),
- MaybeAlign(LI->getAlignment()), DL, LI))
+ LI->getAlign(), DL, LI))
return false;
}
@@ -1371,8 +1354,8 @@ static void speculateSelectInstLoads(SelectInst &SI) {
NumLoadsSpeculated += 2;
// Transfer alignment and AA info if present.
- TL->setAlignment(MaybeAlign(LI->getAlignment()));
- FL->setAlignment(MaybeAlign(LI->getAlignment()));
+ TL->setAlignment(LI->getAlign());
+ FL->setAlignment(LI->getAlign());
AAMDNodes Tags;
LI->getAAMetadata(Tags);
@@ -1479,14 +1462,15 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
// extremely poorly defined currently. The long-term goal is to remove GEPing
// over a vector from the IR completely.
if (VectorType *VecTy = dyn_cast<VectorType>(Ty)) {
- unsigned ElementSizeInBits = DL.getTypeSizeInBits(VecTy->getScalarType());
+ unsigned ElementSizeInBits =
+ DL.getTypeSizeInBits(VecTy->getScalarType()).getFixedSize();
if (ElementSizeInBits % 8 != 0) {
// GEPs over non-multiple of 8 size vector elements are invalid.
return nullptr;
}
APInt ElementSize(Offset.getBitWidth(), ElementSizeInBits / 8);
APInt NumSkippedElements = Offset.sdiv(ElementSize);
- if (NumSkippedElements.ugt(VecTy->getNumElements()))
+ if (NumSkippedElements.ugt(cast<FixedVectorType>(VecTy)->getNumElements()))
return nullptr;
Offset -= NumSkippedElements * ElementSize;
Indices.push_back(IRB.getInt(NumSkippedElements));
@@ -1496,7 +1480,8 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
Type *ElementTy = ArrTy->getElementType();
- APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ APInt ElementSize(Offset.getBitWidth(),
+ DL.getTypeAllocSize(ElementTy).getFixedSize());
APInt NumSkippedElements = Offset.sdiv(ElementSize);
if (NumSkippedElements.ugt(ArrTy->getNumElements()))
return nullptr;
@@ -1518,7 +1503,7 @@ static Value *getNaturalGEPRecursively(IRBuilderTy &IRB, const DataLayout &DL,
unsigned Index = SL->getElementContainingOffset(StructOffset);
Offset -= APInt(Offset.getBitWidth(), SL->getElementOffset(Index));
Type *ElementTy = STy->getElementType(Index);
- if (Offset.uge(DL.getTypeAllocSize(ElementTy)))
+ if (Offset.uge(DL.getTypeAllocSize(ElementTy).getFixedSize()))
return nullptr; // The offset points into alignment padding.
Indices.push_back(IRB.getInt32(Index));
@@ -1550,7 +1535,8 @@ static Value *getNaturalGEPWithOffset(IRBuilderTy &IRB, const DataLayout &DL,
Type *ElementTy = Ty->getElementType();
if (!ElementTy->isSized())
return nullptr; // We can't GEP through an unsized element.
- APInt ElementSize(Offset.getBitWidth(), DL.getTypeAllocSize(ElementTy));
+ APInt ElementSize(Offset.getBitWidth(),
+ DL.getTypeAllocSize(ElementTy).getFixedSize());
if (ElementSize == 0)
return nullptr; // Zero-length arrays can't help us build a natural GEP.
APInt NumSkippedElements = Offset.sdiv(ElementSize);
@@ -1681,20 +1667,8 @@ static Value *getAdjustedPtr(IRBuilderTy &IRB, const DataLayout &DL, Value *Ptr,
}
/// Compute the adjusted alignment for a load or store from an offset.
-static Align getAdjustedAlignment(Instruction *I, uint64_t Offset,
- const DataLayout &DL) {
- MaybeAlign Alignment;
- Type *Ty;
- if (auto *LI = dyn_cast<LoadInst>(I)) {
- Alignment = MaybeAlign(LI->getAlignment());
- Ty = LI->getType();
- } else if (auto *SI = dyn_cast<StoreInst>(I)) {
- Alignment = MaybeAlign(SI->getAlignment());
- Ty = SI->getValueOperand()->getType();
- } else {
- llvm_unreachable("Only loads and stores are allowed!");
- }
- return commonAlignment(DL.getValueOrABITypeAlignment(Alignment, Ty), Offset);
+static Align getAdjustedAlignment(Instruction *I, uint64_t Offset) {
+ return commonAlignment(getLoadStoreAlignment(I), Offset);
}
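The simplified helper leans entirely on commonAlignment(), which returns the largest alignment still guaranteed after adding the byte offset. A minimal sketch of its behaviour (values chosen purely for illustration):

    #include "llvm/Support/Alignment.h"
    using llvm::Align;
    using llvm::commonAlignment;

    // A 16-byte-aligned access keeps only the alignment implied by the low
    // bits of the offset: offset 4 drops to Align(4), offset 32 keeps Align(16).
    void adjustedAlignmentExample() {
      Align Base(16);
      Align AtOffset4 = commonAlignment(Base, 4);   // Align(4)
      Align AtOffset32 = commonAlignment(Base, 32); // Align(16)
      (void)AtOffset4;
      (void)AtOffset32;
    }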
/// Test whether we can convert a value from the old to the new type.
@@ -1717,7 +1691,8 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
return false;
}
- if (DL.getTypeSizeInBits(NewTy) != DL.getTypeSizeInBits(OldTy))
+ if (DL.getTypeSizeInBits(NewTy).getFixedSize() !=
+ DL.getTypeSizeInBits(OldTy).getFixedSize())
return false;
if (!NewTy->isSingleValueType() || !OldTy->isSingleValueType())
return false;
@@ -1728,8 +1703,15 @@ static bool canConvertValue(const DataLayout &DL, Type *OldTy, Type *NewTy) {
NewTy = NewTy->getScalarType();
if (NewTy->isPointerTy() || OldTy->isPointerTy()) {
if (NewTy->isPointerTy() && OldTy->isPointerTy()) {
- return cast<PointerType>(NewTy)->getPointerAddressSpace() ==
- cast<PointerType>(OldTy)->getPointerAddressSpace();
+ unsigned OldAS = OldTy->getPointerAddressSpace();
+ unsigned NewAS = NewTy->getPointerAddressSpace();
+ // Pointers are convertible if they are in the same address space, or if
+ // they are in different integral (i.e. not non-integral) address spaces
+ // with the same pointer size.
+ return OldAS == NewAS ||
+ (!DL.isNonIntegralAddressSpace(OldAS) &&
+ !DL.isNonIntegralAddressSpace(NewAS) &&
+ DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
}
// We can convert integers to integral pointers, but not to non-integral
@@ -1765,36 +1747,40 @@ static Value *convertValue(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
assert(!(isa<IntegerType>(OldTy) && isa<IntegerType>(NewTy)) &&
"Integer types must be the exact same to convert.");
- // See if we need inttoptr for this type pair. A cast involving both scalars
- // and vectors requires and additional bitcast.
+ // See if we need inttoptr for this type pair. May require additional bitcast.
if (OldTy->isIntOrIntVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
// Expand <2 x i32> to i8* --> <2 x i32> to i64 to i8*
- if (OldTy->isVectorTy() && !NewTy->isVectorTy())
- return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
- NewTy);
-
// Expand i128 to <2 x i8*> --> i128 to <2 x i64> to <2 x i8*>
- if (!OldTy->isVectorTy() && NewTy->isVectorTy())
- return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
- NewTy);
-
- return IRB.CreateIntToPtr(V, NewTy);
+ // Expand <4 x i32> to <2 x i8*> --> <4 x i32> to <2 x i64> to <2 x i8*>
+ // Directly handle i64 to i8*
+ return IRB.CreateIntToPtr(IRB.CreateBitCast(V, DL.getIntPtrType(NewTy)),
+ NewTy);
}
- // See if we need ptrtoint for this type pair. A cast involving both scalars
- // and vectors requires and additional bitcast.
+ // See if we need ptrtoint for this type pair. May require additional bitcast.
if (OldTy->isPtrOrPtrVectorTy() && NewTy->isIntOrIntVectorTy()) {
// Expand <2 x i8*> to i128 --> <2 x i8*> to <2 x i64> to i128
- if (OldTy->isVectorTy() && !NewTy->isVectorTy())
- return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
- NewTy);
-
// Expand i8* to <2 x i32> --> i8* to i64 to <2 x i32>
- if (!OldTy->isVectorTy() && NewTy->isVectorTy())
- return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
- NewTy);
+ // Expand <2 x i8*> to <4 x i32> --> <2 x i8*> to <2 x i64> to <4 x i32>
+ // Expand i8* to i64 --> i8* to i64 to i64
+ return IRB.CreateBitCast(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+ }
- return IRB.CreatePtrToInt(V, NewTy);
+ if (OldTy->isPtrOrPtrVectorTy() && NewTy->isPtrOrPtrVectorTy()) {
+ unsigned OldAS = OldTy->getPointerAddressSpace();
+ unsigned NewAS = NewTy->getPointerAddressSpace();
+ // To convert pointers between different address spaces (which have already
+ // been checked to be convertible, i.e. they have the same pointer size), we
+ // cannot use `bitcast` (which requires both operands to be in the same
+ // address space) or `addrspacecast` (which is not guaranteed to be a no-op
+ // cast). Instead, use a pair of no-op `ptrtoint`/`inttoptr` casts through an
+ // integer with the same bit width.
+ if (OldAS != NewAS) {
+ assert(DL.getPointerSize(OldAS) == DL.getPointerSize(NewAS));
+ return IRB.CreateIntToPtr(IRB.CreatePtrToInt(V, DL.getIntPtrType(OldTy)),
+ NewTy);
+ }
}
return IRB.CreateBitCast(V, NewTy);
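As a sketch of the new path above (the helper name and surrounding setup are invented for illustration), a pointer moving between two integral address spaces of equal size is routed through the address-sized integer, e.g. i8 addrspace(1)* -> i64 -> i8 addrspace(2)*:

    #include <cassert>
    #include "llvm/IR/DataLayout.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    // Hypothetical helper mirroring the ptrtoint/inttoptr pair emitted above.
    Value *castBetweenIntegralAddrSpaces(IRBuilder<> &IRB, const DataLayout &DL,
                                         Value *V, Type *NewPtrTy) {
      Type *OldPtrTy = V->getType();
      assert(DL.getPointerSize(OldPtrTy->getPointerAddressSpace()) ==
                 DL.getPointerSize(NewPtrTy->getPointerAddressSpace()) &&
             "only legal when both pointer sizes match");
      Value *AsInt = IRB.CreatePtrToInt(V, DL.getIntPtrType(OldPtrTy));
      return IRB.CreateIntToPtr(AsInt, NewPtrTy); // no-op pair, unlike addrspacecast
    }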
@@ -1813,19 +1799,20 @@ static bool isVectorPromotionViableForSlice(Partition &P, const Slice &S,
std::max(S.beginOffset(), P.beginOffset()) - P.beginOffset();
uint64_t BeginIndex = BeginOffset / ElementSize;
if (BeginIndex * ElementSize != BeginOffset ||
- BeginIndex >= Ty->getNumElements())
+ BeginIndex >= cast<FixedVectorType>(Ty)->getNumElements())
return false;
uint64_t EndOffset =
std::min(S.endOffset(), P.endOffset()) - P.beginOffset();
uint64_t EndIndex = EndOffset / ElementSize;
- if (EndIndex * ElementSize != EndOffset || EndIndex > Ty->getNumElements())
+ if (EndIndex * ElementSize != EndOffset ||
+ EndIndex > cast<FixedVectorType>(Ty)->getNumElements())
return false;
assert(EndIndex > BeginIndex && "Empty vector!");
uint64_t NumElements = EndIndex - BeginIndex;
Type *SliceTy = (NumElements == 1)
? Ty->getElementType()
- : VectorType::get(Ty->getElementType(), NumElements);
+ : FixedVectorType::get(Ty->getElementType(), NumElements);
Type *SplitIntTy =
Type::getIntNTy(Ty->getContext(), NumElements * ElementSize * 8);
@@ -1890,7 +1877,8 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Return if bitcast to vectors is different for total size in bits.
if (!CandidateTys.empty()) {
VectorType *V = CandidateTys[0];
- if (DL.getTypeSizeInBits(VTy) != DL.getTypeSizeInBits(V)) {
+ if (DL.getTypeSizeInBits(VTy).getFixedSize() !=
+ DL.getTypeSizeInBits(V).getFixedSize()) {
CandidateTys.clear();
return;
}
@@ -1936,13 +1924,15 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// they're all integer vectors. We sort by ascending number of elements.
auto RankVectorTypes = [&DL](VectorType *RHSTy, VectorType *LHSTy) {
(void)DL;
- assert(DL.getTypeSizeInBits(RHSTy) == DL.getTypeSizeInBits(LHSTy) &&
+ assert(DL.getTypeSizeInBits(RHSTy).getFixedSize() ==
+ DL.getTypeSizeInBits(LHSTy).getFixedSize() &&
"Cannot have vector types of different sizes!");
assert(RHSTy->getElementType()->isIntegerTy() &&
"All non-integer types eliminated!");
assert(LHSTy->getElementType()->isIntegerTy() &&
"All non-integer types eliminated!");
- return RHSTy->getNumElements() < LHSTy->getNumElements();
+ return cast<FixedVectorType>(RHSTy)->getNumElements() <
+ cast<FixedVectorType>(LHSTy)->getNumElements();
};
llvm::sort(CandidateTys, RankVectorTypes);
CandidateTys.erase(
@@ -1964,13 +1954,14 @@ static VectorType *isVectorPromotionViable(Partition &P, const DataLayout &DL) {
// Try each vector type, and return the one which works.
auto CheckVectorTypeForPromotion = [&](VectorType *VTy) {
- uint64_t ElementSize = DL.getTypeSizeInBits(VTy->getElementType());
+ uint64_t ElementSize =
+ DL.getTypeSizeInBits(VTy->getElementType()).getFixedSize();
// While the definition of LLVM vectors is bitpacked, we don't support sizes
// that aren't byte sized.
if (ElementSize % 8)
return false;
- assert((DL.getTypeSizeInBits(VTy) % 8) == 0 &&
+ assert((DL.getTypeSizeInBits(VTy).getFixedSize() % 8) == 0 &&
"vector size not a multiple of element size?");
ElementSize /= 8;
@@ -2000,7 +1991,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
Type *AllocaTy,
const DataLayout &DL,
bool &WholeAllocaOp) {
- uint64_t Size = DL.getTypeStoreSize(AllocaTy);
+ uint64_t Size = DL.getTypeStoreSize(AllocaTy).getFixedSize();
uint64_t RelBegin = S.beginOffset() - AllocBeginOffset;
uint64_t RelEnd = S.endOffset() - AllocBeginOffset;
@@ -2016,7 +2007,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
if (LI->isVolatile())
return false;
// We can't handle loads that extend past the allocated memory.
- if (DL.getTypeStoreSize(LI->getType()) > Size)
+ if (DL.getTypeStoreSize(LI->getType()).getFixedSize() > Size)
return false;
// So far, AllocaSliceRewriter does not support widening split slice tails
// in rewriteIntegerLoad.
@@ -2028,7 +2019,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
if (!isa<VectorType>(LI->getType()) && RelBegin == 0 && RelEnd == Size)
WholeAllocaOp = true;
if (IntegerType *ITy = dyn_cast<IntegerType>(LI->getType())) {
- if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
return false;
} else if (RelBegin != 0 || RelEnd != Size ||
!canConvertValue(DL, AllocaTy, LI->getType())) {
@@ -2041,7 +2032,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
if (SI->isVolatile())
return false;
// We can't handle stores that extend past the allocated memory.
- if (DL.getTypeStoreSize(ValueTy) > Size)
+ if (DL.getTypeStoreSize(ValueTy).getFixedSize() > Size)
return false;
// So far, AllocaSliceRewriter does not support widening split slice tails
// in rewriteIntegerStore.
@@ -2053,7 +2044,7 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
if (!isa<VectorType>(ValueTy) && RelBegin == 0 && RelEnd == Size)
WholeAllocaOp = true;
if (IntegerType *ITy = dyn_cast<IntegerType>(ValueTy)) {
- if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy))
+ if (ITy->getBitWidth() < DL.getTypeStoreSizeInBits(ITy).getFixedSize())
return false;
} else if (RelBegin != 0 || RelEnd != Size ||
!canConvertValue(DL, ValueTy, AllocaTy)) {
@@ -2084,13 +2075,13 @@ static bool isIntegerWideningViableForSlice(const Slice &S,
/// promote the resulting alloca.
static bool isIntegerWideningViable(Partition &P, Type *AllocaTy,
const DataLayout &DL) {
- uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy);
+ uint64_t SizeInBits = DL.getTypeSizeInBits(AllocaTy).getFixedSize();
// Don't create integer types larger than the maximum bitwidth.
if (SizeInBits > IntegerType::MAX_INT_BITS)
return false;
// Don't try to handle allocas with bit-padding.
- if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy))
+ if (SizeInBits != DL.getTypeStoreSizeInBits(AllocaTy).getFixedSize())
return false;
// We need to ensure that an integer type with the appropriate bitwidth can
@@ -2129,11 +2120,13 @@ static Value *extractInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *V,
const Twine &Name) {
LLVM_DEBUG(dbgs() << " start: " << *V << "\n");
IntegerType *IntTy = cast<IntegerType>(V->getType());
- assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
+ DL.getTypeStoreSize(IntTy).getFixedSize() &&
"Element extends past full value");
uint64_t ShAmt = 8 * Offset;
if (DL.isBigEndian())
- ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
+ DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
if (ShAmt) {
V = IRB.CreateLShr(V, ShAmt, Name + ".shift");
LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
@@ -2158,11 +2151,13 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
V = IRB.CreateZExt(V, IntTy, Name + ".ext");
LLVM_DEBUG(dbgs() << " extended: " << *V << "\n");
}
- assert(DL.getTypeStoreSize(Ty) + Offset <= DL.getTypeStoreSize(IntTy) &&
+ assert(DL.getTypeStoreSize(Ty).getFixedSize() + Offset <=
+ DL.getTypeStoreSize(IntTy).getFixedSize() &&
"Element store outside of alloca store");
uint64_t ShAmt = 8 * Offset;
if (DL.isBigEndian())
- ShAmt = 8 * (DL.getTypeStoreSize(IntTy) - DL.getTypeStoreSize(Ty) - Offset);
+ ShAmt = 8 * (DL.getTypeStoreSize(IntTy).getFixedSize() -
+ DL.getTypeStoreSize(Ty).getFixedSize() - Offset);
if (ShAmt) {
V = IRB.CreateShl(V, ShAmt, Name + ".shift");
LLVM_DEBUG(dbgs() << " shifted: " << *V << "\n");
@@ -2180,7 +2175,7 @@ static Value *insertInteger(const DataLayout &DL, IRBuilderTy &IRB, Value *Old,
static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
unsigned EndIndex, const Twine &Name) {
- VectorType *VecTy = cast<VectorType>(V->getType());
+ auto *VecTy = cast<FixedVectorType>(V->getType());
unsigned NumElements = EndIndex - BeginIndex;
assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
@@ -2194,12 +2189,12 @@ static Value *extractVector(IRBuilderTy &IRB, Value *V, unsigned BeginIndex,
return V;
}
- SmallVector<Constant *, 8> Mask;
+ SmallVector<int, 8> Mask;
Mask.reserve(NumElements);
for (unsigned i = BeginIndex; i != EndIndex; ++i)
- Mask.push_back(IRB.getInt32(i));
- V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()),
- ConstantVector::get(Mask), Name + ".extract");
+ Mask.push_back(i);
+ V = IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask,
+ Name + ".extract");
LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
return V;
}
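For example, pulling elements [1, 3) out of a <4 x float> value with the integer-mask overload of CreateShuffleVector used above (a sketch; the builder and value are assumed to exist):

    #include "llvm/ADT/SmallVector.h"
    #include "llvm/IR/Constants.h"
    #include "llvm/IR/IRBuilder.h"
    using namespace llvm;

    Value *extractMiddleTwo(IRBuilder<> &IRB, Value *V) {
      // Emits: shufflevector <4 x float> %v, <4 x float> undef,
      //                      <2 x i32> <i32 1, i32 2>
      SmallVector<int, 8> Mask = {1, 2};
      return IRB.CreateShuffleVector(V, UndefValue::get(V->getType()), Mask,
                                     V->getName() + ".extract");
    }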
@@ -2218,21 +2213,23 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
return V;
}
- assert(Ty->getNumElements() <= VecTy->getNumElements() &&
+ assert(cast<FixedVectorType>(Ty)->getNumElements() <=
+ cast<FixedVectorType>(VecTy)->getNumElements() &&
"Too many elements!");
- if (Ty->getNumElements() == VecTy->getNumElements()) {
+ if (cast<FixedVectorType>(Ty)->getNumElements() ==
+ cast<FixedVectorType>(VecTy)->getNumElements()) {
assert(V->getType() == VecTy && "Vector type mismatch");
return V;
}
- unsigned EndIndex = BeginIndex + Ty->getNumElements();
+ unsigned EndIndex = BeginIndex + cast<FixedVectorType>(Ty)->getNumElements();
// When inserting a smaller vector into the larger to store, we first
// use a shuffle vector to widen it with undef elements, and then
// a second shuffle vector to select between the loaded vector and the
// incoming vector.
SmallVector<Constant *, 8> Mask;
- Mask.reserve(VecTy->getNumElements());
- for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ Mask.reserve(cast<FixedVectorType>(VecTy)->getNumElements());
+ for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
if (i >= BeginIndex && i < EndIndex)
Mask.push_back(IRB.getInt32(i - BeginIndex));
else
@@ -2242,7 +2239,7 @@ static Value *insertVector(IRBuilderTy &IRB, Value *Old, Value *V,
LLVM_DEBUG(dbgs() << " shuffle: " << *V << "\n");
Mask.clear();
- for (unsigned i = 0; i != VecTy->getNumElements(); ++i)
+ for (unsigned i = 0; i != cast<FixedVectorType>(VecTy)->getNumElements(); ++i)
Mask.push_back(IRB.getInt1(i >= BeginIndex && i < EndIndex));
V = IRB.CreateSelect(ConstantVector::get(Mask), V, Old, Name + "blend");
@@ -2325,18 +2322,20 @@ public:
NewAllocaBeginOffset(NewAllocaBeginOffset),
NewAllocaEndOffset(NewAllocaEndOffset),
NewAllocaTy(NewAI.getAllocatedType()),
- IntTy(IsIntegerPromotable
- ? Type::getIntNTy(
- NewAI.getContext(),
- DL.getTypeSizeInBits(NewAI.getAllocatedType()))
- : nullptr),
+ IntTy(
+ IsIntegerPromotable
+ ? Type::getIntNTy(NewAI.getContext(),
+ DL.getTypeSizeInBits(NewAI.getAllocatedType())
+ .getFixedSize())
+ : nullptr),
VecTy(PromotableVecTy),
ElementTy(VecTy ? VecTy->getElementType() : nullptr),
- ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy) / 8 : 0),
+ ElementSize(VecTy ? DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8
+ : 0),
PHIUsers(PHIUsers), SelectUsers(SelectUsers),
IRB(NewAI.getContext(), ConstantFolder()) {
if (VecTy) {
- assert((DL.getTypeSizeInBits(ElementTy) % 8) == 0 &&
+ assert((DL.getTypeSizeInBits(ElementTy).getFixedSize() % 8) == 0 &&
"Only multiple-of-8 sized vector elements are viable");
++NumVectorized;
}
@@ -2368,7 +2367,8 @@ public:
Instruction *OldUserI = cast<Instruction>(OldUse->getUser());
IRB.SetInsertPoint(OldUserI);
IRB.SetCurrentDebugLocation(OldUserI->getDebugLoc());
- IRB.SetNamePrefix(Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
+ IRB.getInserter().SetNamePrefix(
+ Twine(NewAI.getName()) + "." + Twine(BeginOffset) + ".");
CanSROA &= visit(cast<Instruction>(OldUse->getUser()));
if (VecTy || IntTy)
@@ -2429,14 +2429,9 @@ private:
///
/// You can optionally pass a type to this routine and if that type's ABI
/// alignment is itself suitable, this will return zero.
- MaybeAlign getSliceAlign(Type *Ty = nullptr) {
- const MaybeAlign NewAIAlign = DL.getValueOrABITypeAlignment(
- MaybeAlign(NewAI.getAlignment()), NewAI.getAllocatedType());
- const MaybeAlign Align =
- commonAlignment(NewAIAlign, NewBeginOffset - NewAllocaBeginOffset);
- return (Ty && Align && Align->value() == DL.getABITypeAlignment(Ty))
- ? None
- : Align;
+ Align getSliceAlign() {
+ return commonAlignment(NewAI.getAlign(),
+ NewBeginOffset - NewAllocaBeginOffset);
}
unsigned getIndex(uint64_t Offset) {
@@ -2460,7 +2455,7 @@ private:
assert(EndIndex > BeginIndex && "Empty vector!");
Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "load");
+ NewAI.getAlign(), "load");
return extractVector(IRB, V, BeginIndex, EndIndex, "vec");
}
@@ -2468,7 +2463,7 @@ private:
assert(IntTy && "We cannot insert an integer to the alloca");
assert(!LI.isVolatile());
Value *V = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "load");
+ NewAI.getAlign(), "load");
V = convertValue(DL, IRB, V, IntTy);
assert(NewBeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
@@ -2500,7 +2495,8 @@ private:
Type *TargetTy = IsSplit ? Type::getIntNTy(LI.getContext(), SliceSize * 8)
: LI.getType();
- const bool IsLoadPastEnd = DL.getTypeStoreSize(TargetTy) > SliceSize;
+ const bool IsLoadPastEnd =
+ DL.getTypeStoreSize(TargetTy).getFixedSize() > SliceSize;
bool IsPtrAdjusted = false;
Value *V;
if (VecTy) {
@@ -2513,12 +2509,14 @@ private:
(IsLoadPastEnd && NewAllocaTy->isIntegerTy() &&
TargetTy->isIntegerTy()))) {
LoadInst *NewLI = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(),
- LI.isVolatile(), LI.getName());
+ NewAI.getAlign(), LI.isVolatile(),
+ LI.getName());
if (AATags)
NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
NewLI->setAtomic(LI.getOrdering(), LI.getSyncScopeID());
+ if (NewLI->isAtomic())
+ NewLI->setAlignment(LI.getAlign());
// Any !nonnull metadata or !range metadata on the old load is also valid
// on the new load. This is even true in some cases even when the loads
@@ -2549,9 +2547,9 @@ private:
}
} else {
Type *LTy = TargetTy->getPointerTo(AS);
- LoadInst *NewLI = IRB.CreateAlignedLoad(
- TargetTy, getNewAllocaSlicePtr(IRB, LTy), getSliceAlign(TargetTy),
- LI.isVolatile(), LI.getName());
+ LoadInst *NewLI =
+ IRB.CreateAlignedLoad(TargetTy, getNewAllocaSlicePtr(IRB, LTy),
+ getSliceAlign(), LI.isVolatile(), LI.getName());
if (AATags)
NewLI->setAAMetadata(AATags);
if (LI.isVolatile())
@@ -2566,7 +2564,7 @@ private:
assert(!LI.isVolatile());
assert(LI.getType()->isIntegerTy() &&
"Only integer type loads and stores are split");
- assert(SliceSize < DL.getTypeStoreSize(LI.getType()) &&
+ assert(SliceSize < DL.getTypeStoreSize(LI.getType()).getFixedSize() &&
"Split load isn't smaller than original load");
assert(DL.typeSizeEqualsStoreSize(LI.getType()) &&
"Non-byte-multiple bit width");
@@ -2577,7 +2575,8 @@ private:
// the computed value, and then replace the placeholder with LI, leaving
// LI only used for this computation.
Value *Placeholder = new LoadInst(
- LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)));
+ LI.getType(), UndefValue::get(LI.getType()->getPointerTo(AS)), "",
+ false, Align(1));
V = insertInteger(DL, IRB, Placeholder, V, NewBeginOffset - BeginOffset,
"insert");
LI.replaceAllUsesWith(V);
@@ -2600,19 +2599,20 @@ private:
unsigned EndIndex = getIndex(NewEndOffset);
assert(EndIndex > BeginIndex && "Empty vector!");
unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+ assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
Type *SliceTy = (NumElements == 1)
? ElementTy
- : VectorType::get(ElementTy, NumElements);
+ : FixedVectorType::get(ElementTy, NumElements);
if (V->getType() != SliceTy)
V = convertValue(DL, IRB, V, SliceTy);
// Mix in the existing elements.
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "load");
+ NewAI.getAlign(), "load");
V = insertVector(IRB, Old, V, BeginIndex, "vec");
}
- StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
if (AATags)
Store->setAAMetadata(AATags);
Pass.DeadInsts.insert(&SI);
@@ -2624,16 +2624,17 @@ private:
bool rewriteIntegerStore(Value *V, StoreInst &SI, AAMDNodes AATags) {
assert(IntTy && "We cannot extract an integer from the alloca");
assert(!SI.isVolatile());
- if (DL.getTypeSizeInBits(V->getType()) != IntTy->getBitWidth()) {
+ if (DL.getTypeSizeInBits(V->getType()).getFixedSize() !=
+ IntTy->getBitWidth()) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "oldload");
+ NewAI.getAlign(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
assert(BeginOffset >= NewAllocaBeginOffset && "Out of bounds offset");
uint64_t Offset = BeginOffset - NewAllocaBeginOffset;
V = insertInteger(DL, IRB, Old, SI.getValueOperand(), Offset, "insert");
}
V = convertValue(DL, IRB, V, NewAllocaTy);
- StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment());
+ StoreInst *Store = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign());
Store->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
if (AATags)
@@ -2659,7 +2660,7 @@ private:
if (AllocaInst *AI = dyn_cast<AllocaInst>(V->stripInBoundsOffsets()))
Pass.PostPromotionWorklist.insert(AI);
- if (SliceSize < DL.getTypeStoreSize(V->getType())) {
+ if (SliceSize < DL.getTypeStoreSize(V->getType()).getFixedSize()) {
assert(!SI.isVolatile());
assert(V->getType()->isIntegerTy() &&
"Only integer type loads and stores are split");
@@ -2675,7 +2676,8 @@ private:
if (IntTy && V->getType()->isIntegerTy())
return rewriteIntegerStore(V, SI, AATags);
- const bool IsStorePastEnd = DL.getTypeStoreSize(V->getType()) > SliceSize;
+ const bool IsStorePastEnd =
+ DL.getTypeStoreSize(V->getType()).getFixedSize() > SliceSize;
StoreInst *NewSI;
if (NewBeginOffset == NewAllocaBeginOffset &&
NewEndOffset == NewAllocaEndOffset &&
@@ -2695,13 +2697,13 @@ private:
}
V = convertValue(DL, IRB, V, NewAllocaTy);
- NewSI = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
- SI.isVolatile());
+ NewSI =
+ IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), SI.isVolatile());
} else {
unsigned AS = SI.getPointerAddressSpace();
Value *NewPtr = getNewAllocaSlicePtr(IRB, V->getType()->getPointerTo(AS));
- NewSI = IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(V->getType()),
- SI.isVolatile());
+ NewSI =
+ IRB.CreateAlignedStore(V, NewPtr, getSliceAlign(), SI.isVolatile());
}
NewSI->copyMetadata(SI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
@@ -2709,6 +2711,8 @@ private:
NewSI->setAAMetadata(AATags);
if (SI.isVolatile())
NewSI->setAtomic(SI.getOrdering(), SI.getSyncScopeID());
+ if (NewSI->isAtomic())
+ NewSI->setAlignment(SI.getAlign());
Pass.DeadInsts.insert(&SI);
deleteIfTriviallyDead(OldOp);
@@ -2786,9 +2790,9 @@ private:
return false;
const auto Len = C->getZExtValue();
auto *Int8Ty = IntegerType::getInt8Ty(NewAI.getContext());
- auto *SrcTy = VectorType::get(Int8Ty, Len);
+ auto *SrcTy = FixedVectorType::get(Int8Ty, Len);
return canConvertValue(DL, SrcTy, AllocaTy) &&
- DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy));
+ DL.isLegalInteger(DL.getTypeSizeInBits(ScalarTy).getFixedSize());
}();
// If this doesn't map cleanly onto the alloca type, and that type isn't
@@ -2820,16 +2824,17 @@ private:
unsigned EndIndex = getIndex(NewEndOffset);
assert(EndIndex > BeginIndex && "Empty vector!");
unsigned NumElements = EndIndex - BeginIndex;
- assert(NumElements <= VecTy->getNumElements() && "Too many elements!");
+ assert(NumElements <= cast<FixedVectorType>(VecTy)->getNumElements() &&
+ "Too many elements!");
- Value *Splat =
- getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ElementTy) / 8);
+ Value *Splat = getIntegerSplat(
+ II.getValue(), DL.getTypeSizeInBits(ElementTy).getFixedSize() / 8);
Splat = convertValue(DL, IRB, Splat, ElementTy);
if (NumElements > 1)
Splat = getVectorSplat(Splat, NumElements);
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "oldload");
+ NewAI.getAlign(), "oldload");
V = insertVector(IRB, Old, Splat, BeginIndex, "vec");
} else if (IntTy) {
// If this is a memset on an alloca where we can widen stores, insert the
@@ -2842,7 +2847,7 @@ private:
if (IntTy && (BeginOffset != NewAllocaBeginOffset ||
EndOffset != NewAllocaBeginOffset)) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "oldload");
+ NewAI.getAlign(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
V = insertInteger(DL, IRB, Old, V, Offset, "insert");
@@ -2856,15 +2861,17 @@ private:
assert(NewBeginOffset == NewAllocaBeginOffset);
assert(NewEndOffset == NewAllocaEndOffset);
- V = getIntegerSplat(II.getValue(), DL.getTypeSizeInBits(ScalarTy) / 8);
+ V = getIntegerSplat(II.getValue(),
+ DL.getTypeSizeInBits(ScalarTy).getFixedSize() / 8);
if (VectorType *AllocaVecTy = dyn_cast<VectorType>(AllocaTy))
- V = getVectorSplat(V, AllocaVecTy->getNumElements());
+ V = getVectorSplat(
+ V, cast<FixedVectorType>(AllocaVecTy)->getNumElements());
V = convertValue(DL, IRB, V, AllocaTy);
}
- StoreInst *New = IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlignment(),
- II.isVolatile());
+ StoreInst *New =
+ IRB.CreateAlignedStore(V, &NewAI, NewAI.getAlign(), II.isVolatile());
if (AATags)
New->setAAMetadata(AATags);
LLVM_DEBUG(dbgs() << " to: " << *New << "\n");
@@ -2919,7 +2926,8 @@ private:
bool EmitMemCpy =
!VecTy && !IntTy &&
(BeginOffset > NewAllocaBeginOffset || EndOffset < NewAllocaEndOffset ||
- SliceSize != DL.getTypeStoreSize(NewAI.getAllocatedType()) ||
+ SliceSize !=
+ DL.getTypeStoreSize(NewAI.getAllocatedType()).getFixedSize() ||
!NewAI.getAllocatedType()->isSingleValueType());
// If we're just going to emit a memcpy, the alloca hasn't changed, and the
@@ -2955,7 +2963,7 @@ private:
unsigned OffsetWidth = DL.getIndexSizeInBits(OtherAS);
APInt OtherOffset(OffsetWidth, NewBeginOffset - BeginOffset);
Align OtherAlign =
- assumeAligned(IsDest ? II.getSourceAlignment() : II.getDestAlignment());
+ (IsDest ? II.getSourceAlign() : II.getDestAlign()).valueOrOne();
OtherAlign =
commonAlignment(OtherAlign, OtherOffset.zextOrTrunc(64).getZExtValue());
@@ -3007,7 +3015,7 @@ private:
if (NumElements == 1)
OtherTy = VecTy->getElementType();
else
- OtherTy = VectorType::get(VecTy->getElementType(), NumElements);
+ OtherTy = FixedVectorType::get(VecTy->getElementType(), NumElements);
} else if (IntTy && !IsWholeAlloca) {
OtherTy = SubIntTy;
} else {
@@ -3028,11 +3036,11 @@ private:
Value *Src;
if (VecTy && !IsWholeAlloca && !IsDest) {
Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "load");
+ NewAI.getAlign(), "load");
Src = extractVector(IRB, Src, BeginIndex, EndIndex, "vec");
} else if (IntTy && !IsWholeAlloca && !IsDest) {
Src = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "load");
+ NewAI.getAlign(), "load");
Src = convertValue(DL, IRB, Src, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = extractInteger(DL, IRB, Src, SubIntTy, Offset, "extract");
@@ -3046,11 +3054,11 @@ private:
if (VecTy && !IsWholeAlloca && IsDest) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "oldload");
+ NewAI.getAlign(), "oldload");
Src = insertVector(IRB, Old, Src, BeginIndex, "vec");
} else if (IntTy && !IsWholeAlloca && IsDest) {
Value *Old = IRB.CreateAlignedLoad(NewAI.getAllocatedType(), &NewAI,
- NewAI.getAlignment(), "oldload");
+ NewAI.getAlign(), "oldload");
Old = convertValue(DL, IRB, Old, IntTy);
uint64_t Offset = NewBeginOffset - NewAllocaBeginOffset;
Src = insertInteger(DL, IRB, Old, Src, Offset, "insert");
@@ -3115,17 +3123,12 @@ private:
Instruction *I = Uses.pop_back_val();
if (LoadInst *LI = dyn_cast<LoadInst>(I)) {
- MaybeAlign LoadAlign = DL.getValueOrABITypeAlignment(
- MaybeAlign(LI->getAlignment()), LI->getType());
- LI->setAlignment(std::min(LoadAlign, getSliceAlign()));
+ LI->setAlignment(std::min(LI->getAlign(), getSliceAlign()));
continue;
}
if (StoreInst *SI = dyn_cast<StoreInst>(I)) {
- Value *Op = SI->getOperand(0);
- MaybeAlign StoreAlign = DL.getValueOrABITypeAlignment(
- MaybeAlign(SI->getAlignment()), Op->getType());
- SI->setAlignment(std::min(StoreAlign, getSliceAlign()));
- continue;
+ SI->setAlignment(std::min(SI->getAlign(), getSliceAlign()));
+ continue;
}
assert(isa<BitCastInst>(I) || isa<AddrSpaceCastInst>(I) ||
@@ -3146,14 +3149,14 @@ private:
// as local as possible to the PHI. To do that, we re-use the location of
// the old pointer, which necessarily must be in the right position to
// dominate the PHI.
- IRBuilderTy PtrBuilder(IRB);
+ IRBuilderBase::InsertPointGuard Guard(IRB);
if (isa<PHINode>(OldPtr))
- PtrBuilder.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
+ IRB.SetInsertPoint(&*OldPtr->getParent()->getFirstInsertionPt());
else
- PtrBuilder.SetInsertPoint(OldPtr);
- PtrBuilder.SetCurrentDebugLocation(OldPtr->getDebugLoc());
+ IRB.SetInsertPoint(OldPtr);
+ IRB.SetCurrentDebugLocation(OldPtr->getDebugLoc());
- Value *NewPtr = getNewAllocaSlicePtr(PtrBuilder, OldPtr->getType());
+ Value *NewPtr = getNewAllocaSlicePtr(IRB, OldPtr->getType());
// Replace the operands which were using the old pointer.
std::replace(PN.op_begin(), PN.op_end(), cast<Value>(OldPtr), NewPtr);
@@ -3357,7 +3360,7 @@ private:
Value *GEP =
IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
LoadInst *Load =
- IRB.CreateAlignedLoad(Ty, GEP, Alignment.value(), Name + ".load");
+ IRB.CreateAlignedLoad(Ty, GEP, Alignment, Name + ".load");
if (AATags)
Load->setAAMetadata(AATags);
Agg = IRB.CreateInsertValue(Agg, Load, Indices, Name + ".insert");
@@ -3375,9 +3378,10 @@ private:
AAMDNodes AATags;
LI.getAAMetadata(AATags);
LoadOpSplitter Splitter(&LI, *U, LI.getType(), AATags,
- getAdjustedAlignment(&LI, 0, DL), DL);
+ getAdjustedAlignment(&LI, 0), DL);
Value *V = UndefValue::get(LI.getType());
Splitter.emitSplitOps(LI.getType(), V, LI.getName() + ".fca");
+ Visited.erase(&LI);
LI.replaceAllUsesWith(V);
LI.eraseFromParent();
return true;
@@ -3403,7 +3407,7 @@ private:
Value *InBoundsGEP =
IRB.CreateInBoundsGEP(BaseTy, Ptr, GEPIndices, Name + ".gep");
StoreInst *Store =
- IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment.value());
+ IRB.CreateAlignedStore(ExtractValue, InBoundsGEP, Alignment);
if (AATags)
Store->setAAMetadata(AATags);
LLVM_DEBUG(dbgs() << " to: " << *Store << "\n");
@@ -3422,8 +3426,9 @@ private:
AAMDNodes AATags;
SI.getAAMetadata(AATags);
StoreOpSplitter Splitter(&SI, *U, V->getType(), AATags,
- getAdjustedAlignment(&SI, 0, DL), DL);
+ getAdjustedAlignment(&SI, 0), DL);
Splitter.emitSplitOps(V->getType(), V, V->getName() + ".fca");
+ Visited.erase(&SI);
SI.eraseFromParent();
return true;
}
@@ -3438,7 +3443,110 @@ private:
return false;
}
+ // Fold gep (select cond, ptr1, ptr2) => select cond, gep(ptr1), gep(ptr2)
+ bool foldGEPSelect(GetElementPtrInst &GEPI) {
+ if (!GEPI.hasAllConstantIndices())
+ return false;
+
+ SelectInst *Sel = cast<SelectInst>(GEPI.getPointerOperand());
+
+ LLVM_DEBUG(dbgs() << " Rewriting gep(select) -> select(gep):"
+ << "\n original: " << *Sel
+ << "\n " << GEPI);
+
+ IRBuilderTy Builder(&GEPI);
+ SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+ bool IsInBounds = GEPI.isInBounds();
+
+ Value *True = Sel->getTrueValue();
+ Value *NTrue =
+ IsInBounds
+ ? Builder.CreateInBoundsGEP(True, Index,
+ True->getName() + ".sroa.gep")
+ : Builder.CreateGEP(True, Index, True->getName() + ".sroa.gep");
+
+ Value *False = Sel->getFalseValue();
+
+ Value *NFalse =
+ IsInBounds
+ ? Builder.CreateInBoundsGEP(False, Index,
+ False->getName() + ".sroa.gep")
+ : Builder.CreateGEP(False, Index, False->getName() + ".sroa.gep");
+
+ Value *NSel = Builder.CreateSelect(Sel->getCondition(), NTrue, NFalse,
+ Sel->getName() + ".sroa.sel");
+ Visited.erase(&GEPI);
+ GEPI.replaceAllUsesWith(NSel);
+ GEPI.eraseFromParent();
+ Instruction *NSelI = cast<Instruction>(NSel);
+ Visited.insert(NSelI);
+ enqueueUsers(*NSelI);
+
+ LLVM_DEBUG(dbgs() << "\n to: " << *NTrue
+ << "\n " << *NFalse
+ << "\n " << *NSel << '\n');
+
+ return true;
+ }
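Concretely, for a constant-index GEP whose pointer operand is a select, the rewrite looks roughly as follows (IR value names are invented, but the .sroa.gep/.sroa.sel suffixes match the code above). Both GEPs are emitted at the original GEP's position; with all-constant indices they are safe to speculate unconditionally.

    // Before:
    //   %sel = select i1 %c, i32* %a, i32* %b
    //   %p   = getelementptr inbounds i32, i32* %sel, i64 1
    // After (all uses of %p now use %sel.sroa.sel):
    //   %a.sroa.gep   = getelementptr inbounds i32, i32* %a, i64 1
    //   %b.sroa.gep   = getelementptr inbounds i32, i32* %b, i64 1
    //   %sel.sroa.sel = select i1 %c, i32* %a.sroa.gep, i32* %b.sroa.gep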
+
+ // Fold gep (phi ptr1, ptr2) => phi gep(ptr1), gep(ptr2)
+ bool foldGEPPhi(GetElementPtrInst &GEPI) {
+ if (!GEPI.hasAllConstantIndices())
+ return false;
+
+ PHINode *PHI = cast<PHINode>(GEPI.getPointerOperand());
+ if (GEPI.getParent() != PHI->getParent() ||
+ llvm::any_of(PHI->incoming_values(), [](Value *In) {
+ Instruction *I = dyn_cast<Instruction>(In);
+ return !I || isa<GetElementPtrInst>(I) || isa<PHINode>(I) ||
+ succ_empty(I->getParent()) ||
+ !I->getParent()->isLegalToHoistInto();
+ }))
+ return false;
+
+ LLVM_DEBUG(dbgs() << " Rewriting gep(phi) -> phi(gep):"
+ << "\n original: " << *PHI
+ << "\n " << GEPI
+ << "\n to: ");
+
+ SmallVector<Value *, 4> Index(GEPI.idx_begin(), GEPI.idx_end());
+ bool IsInBounds = GEPI.isInBounds();
+ IRBuilderTy PHIBuilder(GEPI.getParent()->getFirstNonPHI());
+ PHINode *NewPN = PHIBuilder.CreatePHI(GEPI.getType(),
+ PHI->getNumIncomingValues(),
+ PHI->getName() + ".sroa.phi");
+ for (unsigned I = 0, E = PHI->getNumIncomingValues(); I != E; ++I) {
+ Instruction *In = cast<Instruction>(PHI->getIncomingValue(I));
+
+ IRBuilderTy B(In->getParent(), std::next(In->getIterator()));
+ Value *NewVal = IsInBounds
+ ? B.CreateInBoundsGEP(In, Index, In->getName() + ".sroa.gep")
+ : B.CreateGEP(In, Index, In->getName() + ".sroa.gep");
+ NewPN->addIncoming(NewVal, PHI->getIncomingBlock(I));
+ }
+
+ Visited.erase(&GEPI);
+ GEPI.replaceAllUsesWith(NewPN);
+ GEPI.eraseFromParent();
+ Visited.insert(NewPN);
+ enqueueUsers(*NewPN);
+
+ LLVM_DEBUG(for (Value *In : NewPN->incoming_values())
+ dbgs() << "\n " << *In;
+ dbgs() << "\n " << *NewPN << '\n');
+
+ return true;
+ }
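The PHI case has the same shape, with one cloned GEP per incoming value; each incoming value must be an instruction so the cloned GEP can be placed immediately after it (again, invented names):

    // Before (%a defined in %bb0, %b in %bb1, both predecessors of this block):
    //   %phi = phi i32* [ %a, %bb0 ], [ %b, %bb1 ]
    //   %p   = getelementptr inbounds i32, i32* %phi, i64 1
    // After:
    //   ; in %bb0, after %a:  %a.sroa.gep = getelementptr inbounds i32, i32* %a, i64 1
    //   ; in %bb1, after %b:  %b.sroa.gep = getelementptr inbounds i32, i32* %b, i64 1
    //   %phi.sroa.phi = phi i32* [ %a.sroa.gep, %bb0 ], [ %b.sroa.gep, %bb1 ]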
+
bool visitGetElementPtrInst(GetElementPtrInst &GEPI) {
+ if (isa<SelectInst>(GEPI.getPointerOperand()) &&
+ foldGEPSelect(GEPI))
+ return true;
+
+ if (isa<PHINode>(GEPI.getPointerOperand()) &&
+ foldGEPPhi(GEPI))
+ return true;
+
enqueueUsers(GEPI);
return false;
}
@@ -3465,8 +3573,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
if (Ty->isSingleValueType())
return Ty;
- uint64_t AllocSize = DL.getTypeAllocSize(Ty);
- uint64_t TypeSize = DL.getTypeSizeInBits(Ty);
+ uint64_t AllocSize = DL.getTypeAllocSize(Ty).getFixedSize();
+ uint64_t TypeSize = DL.getTypeSizeInBits(Ty).getFixedSize();
Type *InnerTy;
if (ArrayType *ArrTy = dyn_cast<ArrayType>(Ty)) {
@@ -3479,8 +3587,8 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
return Ty;
}
- if (AllocSize > DL.getTypeAllocSize(InnerTy) ||
- TypeSize > DL.getTypeSizeInBits(InnerTy))
+ if (AllocSize > DL.getTypeAllocSize(InnerTy).getFixedSize() ||
+ TypeSize > DL.getTypeSizeInBits(InnerTy).getFixedSize())
return Ty;
return stripAggregateTypeWrapping(DL, InnerTy);
@@ -3501,17 +3609,28 @@ static Type *stripAggregateTypeWrapping(const DataLayout &DL, Type *Ty) {
/// return a type if necessary.
static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
uint64_t Size) {
- if (Offset == 0 && DL.getTypeAllocSize(Ty) == Size)
+ if (Offset == 0 && DL.getTypeAllocSize(Ty).getFixedSize() == Size)
return stripAggregateTypeWrapping(DL, Ty);
- if (Offset > DL.getTypeAllocSize(Ty) ||
- (DL.getTypeAllocSize(Ty) - Offset) < Size)
+ if (Offset > DL.getTypeAllocSize(Ty).getFixedSize() ||
+ (DL.getTypeAllocSize(Ty).getFixedSize() - Offset) < Size)
return nullptr;
- if (SequentialType *SeqTy = dyn_cast<SequentialType>(Ty)) {
- Type *ElementTy = SeqTy->getElementType();
- uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ if (isa<ArrayType>(Ty) || isa<VectorType>(Ty)) {
+ Type *ElementTy;
+ uint64_t TyNumElements;
+ if (auto *AT = dyn_cast<ArrayType>(Ty)) {
+ ElementTy = AT->getElementType();
+ TyNumElements = AT->getNumElements();
+ } else {
+ // FIXME: This isn't right for vectors with non-byte-sized or
+ // non-power-of-two sized elements.
+ auto *VT = cast<FixedVectorType>(Ty);
+ ElementTy = VT->getElementType();
+ TyNumElements = VT->getNumElements();
+ }
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
uint64_t NumSkippedElements = Offset / ElementSize;
- if (NumSkippedElements >= SeqTy->getNumElements())
+ if (NumSkippedElements >= TyNumElements)
return nullptr;
Offset -= NumSkippedElements * ElementSize;
@@ -3549,7 +3668,7 @@ static Type *getTypePartition(const DataLayout &DL, Type *Ty, uint64_t Offset,
Offset -= SL->getElementOffset(Index);
Type *ElementTy = STy->getElementType(Index);
- uint64_t ElementSize = DL.getTypeAllocSize(ElementTy);
+ uint64_t ElementSize = DL.getTypeAllocSize(ElementTy).getFixedSize();
if (Offset >= ElementSize)
return nullptr; // The offset points into alignment padding.
@@ -3860,7 +3979,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
getAdjustedPtr(IRB, DL, BasePtr,
APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, BasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset, DL).value(),
+ getAdjustedAlignment(LI, PartOffset),
/*IsVolatile*/ false, LI->getName());
PLoad->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
@@ -3918,7 +4037,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
getAdjustedPtr(IRB, DL, StoreBasePtr,
APInt(DL.getIndexSizeInBits(AS), PartOffset),
PartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset, DL).value(),
+ getAdjustedAlignment(SI, PartOffset),
/*IsVolatile*/ false);
PStore->copyMetadata(*LI, {LLVMContext::MD_mem_parallel_loop_access,
LLVMContext::MD_access_group});
@@ -4003,7 +4122,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
getAdjustedPtr(IRB, DL, LoadBasePtr,
APInt(DL.getIndexSizeInBits(AS), PartOffset),
LoadPartPtrTy, LoadBasePtr->getName() + "."),
- getAdjustedAlignment(LI, PartOffset, DL).value(),
+ getAdjustedAlignment(LI, PartOffset),
/*IsVolatile*/ false, LI->getName());
}
@@ -4015,7 +4134,7 @@ bool SROA::presplitLoadsAndStores(AllocaInst &AI, AllocaSlices &AS) {
getAdjustedPtr(IRB, DL, StoreBasePtr,
APInt(DL.getIndexSizeInBits(AS), PartOffset),
StorePartPtrTy, StoreBasePtr->getName() + "."),
- getAdjustedAlignment(SI, PartOffset, DL).value(),
+ getAdjustedAlignment(SI, PartOffset),
/*IsVolatile*/ false);
// Now build a new slice for the alloca.
@@ -4117,7 +4236,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
Type *SliceTy = nullptr;
const DataLayout &DL = AI.getModule()->getDataLayout();
if (Type *CommonUseTy = findCommonType(P.begin(), P.end(), P.endOffset()))
- if (DL.getTypeAllocSize(CommonUseTy) >= P.size())
+ if (DL.getTypeAllocSize(CommonUseTy).getFixedSize() >= P.size())
SliceTy = CommonUseTy;
if (!SliceTy)
if (Type *TypePartitionTy = getTypePartition(DL, AI.getAllocatedType(),
@@ -4129,7 +4248,7 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
SliceTy = Type::getIntNTy(*C, P.size() * 8);
if (!SliceTy)
SliceTy = ArrayType::get(Type::getInt8Ty(*C), P.size());
- assert(DL.getTypeAllocSize(SliceTy) >= P.size());
+ assert(DL.getTypeAllocSize(SliceTy).getFixedSize() >= P.size());
bool IsIntegerPromotable = isIntegerWideningViable(P, SliceTy, DL);
@@ -4151,19 +4270,14 @@ AllocaInst *SROA::rewritePartition(AllocaInst &AI, AllocaSlices &AS,
// FIXME: We might want to defer PHI speculation until after here.
// FIXME: return nullptr;
} else {
- // If alignment is unspecified we fallback on the one required by the ABI
- // for this type. We also make sure the alignment is compatible with
- // P.beginOffset().
- const Align Alignment = commonAlignment(
- DL.getValueOrABITypeAlignment(MaybeAlign(AI.getAlignment()),
- AI.getAllocatedType()),
- P.beginOffset());
+ // Make sure the alignment is compatible with P.beginOffset().
+ const Align Alignment = commonAlignment(AI.getAlign(), P.beginOffset());
// If we will get at least this much alignment from the type alone, leave
// the alloca's alignment unconstrained.
- const bool IsUnconstrained = Alignment <= DL.getABITypeAlignment(SliceTy);
+ const bool IsUnconstrained = Alignment <= DL.getABITypeAlign(SliceTy);
NewAI = new AllocaInst(
SliceTy, AI.getType()->getAddressSpace(), nullptr,
- IsUnconstrained ? MaybeAlign() : Alignment,
+ IsUnconstrained ? DL.getPrefTypeAlign(SliceTy) : Alignment,
AI.getName() + ".sroa." + Twine(P.begin() - AS.begin()), &AI);
// Copy the old AI debug location over to the new one.
NewAI->setDebugLoc(AI.getDebugLoc());
@@ -4270,7 +4384,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
// to be rewritten into a partition.
bool IsSorted = true;
- uint64_t AllocaSize = DL.getTypeAllocSize(AI.getAllocatedType());
+ uint64_t AllocaSize =
+ DL.getTypeAllocSize(AI.getAllocatedType()).getFixedSize();
const uint64_t MaxBitVectorSize = 1024;
if (AllocaSize <= MaxBitVectorSize) {
// If a byte boundary is included in any load or store, a slice starting or
@@ -4334,7 +4449,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
Changed = true;
if (NewAI != &AI) {
uint64_t SizeOfByte = 8;
- uint64_t AllocaSize = DL.getTypeSizeInBits(NewAI->getAllocatedType());
+ uint64_t AllocaSize =
+ DL.getTypeSizeInBits(NewAI->getAllocatedType()).getFixedSize();
// Don't include any padding.
uint64_t Size = std::min(AllocaSize, P.size() * SizeOfByte);
Fragments.push_back(Fragment(NewAI, P.beginOffset() * SizeOfByte, Size));
@@ -4354,7 +4470,8 @@ bool SROA::splitAlloca(AllocaInst &AI, AllocaSlices &AS) {
auto *Expr = DbgDeclares.front()->getExpression();
auto VarSize = Var->getSizeInBits();
DIBuilder DIB(*AI.getModule(), /*AllowUnresolved*/ false);
- uint64_t AllocaSize = DL.getTypeSizeInBits(AI.getAllocatedType());
+ uint64_t AllocaSize =
+ DL.getTypeSizeInBits(AI.getAllocatedType()).getFixedSize();
for (auto Fragment : Fragments) {
// Create a fragment expression describing the new partition or reuse AI's
// expression if there is only one partition.
@@ -4442,8 +4559,9 @@ bool SROA::runOnAlloca(AllocaInst &AI) {
const DataLayout &DL = AI.getModule()->getDataLayout();
// Skip alloca forms that this analysis can't handle.
- if (AI.isArrayAllocation() || !AI.getAllocatedType()->isSized() ||
- DL.getTypeAllocSize(AI.getAllocatedType()) == 0)
+ auto *AT = AI.getAllocatedType();
+ if (AI.isArrayAllocation() || !AT->isSized() || isa<ScalableVectorType>(AT) ||
+ DL.getTypeAllocSize(AT).getFixedSize() == 0)
return false;
bool Changed = false;
@@ -4563,8 +4681,14 @@ PreservedAnalyses SROA::runImpl(Function &F, DominatorTree &RunDT,
BasicBlock &EntryBB = F.getEntryBlock();
for (BasicBlock::iterator I = EntryBB.begin(), E = std::prev(EntryBB.end());
I != E; ++I) {
- if (AllocaInst *AI = dyn_cast<AllocaInst>(I))
- Worklist.insert(AI);
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(I)) {
+ if (isa<ScalableVectorType>(AI->getAllocatedType())) {
+ if (isAllocaPromotable(AI))
+ PromotableAllocas.push_back(AI);
+ } else {
+ Worklist.insert(AI);
+ }
+ }
}
bool Changed = false;
diff --git a/llvm/lib/Transforms/Scalar/Scalarizer.cpp b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
index c25c6c632b8f..851bd79cd6d8 100644
--- a/llvm/lib/Transforms/Scalar/Scalarizer.cpp
+++ b/llvm/lib/Transforms/Scalar/Scalarizer.cpp
@@ -22,8 +22,8 @@
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
-#include "llvm/IR/Dominators.h"
#include "llvm/IR/DerivedTypes.h"
+#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/InstVisitor.h"
@@ -41,6 +41,7 @@
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Transforms/Scalar.h"
+#include "llvm/Transforms/Utils/Local.h"
#include <cassert>
#include <cstdint>
#include <iterator>
@@ -51,6 +52,11 @@ using namespace llvm;
#define DEBUG_TYPE "scalarizer"
+static cl::opt<bool> ScalarizeVariableInsertExtract(
+ "scalarize-variable-insert-extract", cl::init(true), cl::Hidden,
+ cl::desc("Allow the scalarizer pass to scalarize "
+ "insertelement/extractelement with variable index"));
+
// This is disabled by default because having separate loads and stores
// makes it more likely that the -combiner-alias-analysis limits will be
// reached.
@@ -156,8 +162,8 @@ struct VectorLayout {
VectorLayout() = default;
// Return the alignment of element I.
- uint64_t getElemAlign(unsigned I) {
- return MinAlign(VecAlign, I * ElemSize);
+ Align getElemAlign(unsigned I) {
+ return commonAlignment(VecAlign, I * ElemSize);
}
// The type of the vector.
@@ -167,7 +173,7 @@ struct VectorLayout {
Type *ElemTy = nullptr;
// The alignment of the vector.
- uint64_t VecAlign = 0;
+ Align VecAlign;
// The size of each element.
uint64_t ElemSize = 0;
@@ -192,6 +198,8 @@ public:
bool visitGetElementPtrInst(GetElementPtrInst &GEPI);
bool visitCastInst(CastInst &CI);
bool visitBitCastInst(BitCastInst &BCI);
+ bool visitInsertElementInst(InsertElementInst &IEI);
+ bool visitExtractElementInst(ExtractElementInst &EEI);
bool visitShuffleVectorInst(ShuffleVectorInst &SVI);
bool visitPHINode(PHINode &PHI);
bool visitLoadInst(LoadInst &LI);
@@ -203,8 +211,8 @@ private:
void gather(Instruction *Op, const ValueVector &CV);
bool canTransferMetadata(unsigned Kind);
void transferMetadataAndIRFlags(Instruction *Op, const ValueVector &CV);
- bool getVectorLayout(Type *Ty, unsigned Alignment, VectorLayout &Layout,
- const DataLayout &DL);
+ Optional<VectorLayout> getVectorLayout(Type *Ty, Align Alignment,
+ const DataLayout &DL);
bool finish();
template<typename T> bool splitUnary(Instruction &, const T &);
@@ -215,6 +223,8 @@ private:
ScatterMap Scattered;
GatherList Gathered;
+ SmallVector<WeakTrackingVH, 32> PotentiallyDeadInstrs;
+
unsigned ParallelLoopAccessMDKind;
DominatorTree *DT;
@@ -252,7 +262,7 @@ Scatterer::Scatterer(BasicBlock *bb, BasicBlock::iterator bbi, Value *v,
PtrTy = dyn_cast<PointerType>(Ty);
if (PtrTy)
Ty = PtrTy->getElementType();
- Size = Ty->getVectorNumElements();
+ Size = cast<FixedVectorType>(Ty)->getNumElements();
if (!CachePtr)
Tmp.resize(Size, nullptr);
else if (CachePtr->empty())
@@ -269,7 +279,7 @@ Value *Scatterer::operator[](unsigned I) {
return CV[I];
IRBuilder<> Builder(BB, BBI);
if (PtrTy) {
- Type *ElTy = PtrTy->getElementType()->getVectorElementType();
+ Type *ElTy = cast<VectorType>(PtrTy->getElementType())->getElementType();
if (!CV[0]) {
Type *NewPtrTy = PointerType::get(ElTy, PtrTy->getAddressSpace());
CV[0] = Builder.CreateBitCast(V, NewPtrTy, V->getName() + ".i0");
@@ -376,11 +386,6 @@ Scatterer ScalarizerVisitor::scatter(Instruction *Point, Value *V) {
// so that we can avoid creating the gathered form if all uses of Op are
// replaced with uses of CV.
void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
- // Since we're not deleting Op yet, stub out its operands, so that it
- // doesn't make anything live unnecessarily.
- for (unsigned I = 0, E = Op->getNumOperands(); I != E; ++I)
- Op->setOperand(I, UndefValue::get(Op->getOperand(I)->getType()));
-
transferMetadataAndIRFlags(Op, CV);
// If we already have a scattered form of Op (created from ExtractElements
@@ -389,13 +394,13 @@ void ScalarizerVisitor::gather(Instruction *Op, const ValueVector &CV) {
if (!SV.empty()) {
for (unsigned I = 0, E = SV.size(); I != E; ++I) {
Value *V = SV[I];
- if (V == nullptr)
+ if (V == nullptr || SV[I] == CV[I])
continue;
Instruction *Old = cast<Instruction>(V);
CV[I]->takeName(Old);
Old->replaceAllUsesWith(CV[I]);
- Old->eraseFromParent();
+ PotentiallyDeadInstrs.emplace_back(Old);
}
}
SV = CV;
@@ -434,25 +439,22 @@ void ScalarizerVisitor::transferMetadataAndIRFlags(Instruction *Op,
}
// Try to fill in Layout from Ty, returning true on success. Alignment is
-// the alignment of the vector, or 0 if the ABI default should be used.
-bool ScalarizerVisitor::getVectorLayout(Type *Ty, unsigned Alignment,
- VectorLayout &Layout, const DataLayout &DL) {
+// the alignment of the vector, or None if the ABI default should be used.
+Optional<VectorLayout>
+ScalarizerVisitor::getVectorLayout(Type *Ty, Align Alignment,
+ const DataLayout &DL) {
+ VectorLayout Layout;
// Make sure we're dealing with a vector.
Layout.VecTy = dyn_cast<VectorType>(Ty);
if (!Layout.VecTy)
- return false;
-
+ return None;
// Check that we're dealing with full-byte elements.
Layout.ElemTy = Layout.VecTy->getElementType();
if (!DL.typeSizeEqualsStoreSize(Layout.ElemTy))
- return false;
-
- if (Alignment)
- Layout.VecAlign = Alignment;
- else
- Layout.VecAlign = DL.getABITypeAlignment(Layout.VecTy);
+ return None;
+ Layout.VecAlign = Alignment;
Layout.ElemSize = DL.getTypeStoreSize(Layout.ElemTy);
- return true;
+ return Layout;
}
// Scalarize one-operand instruction I, using Split(Builder, X, Name)
@@ -463,7 +465,7 @@ bool ScalarizerVisitor::splitUnary(Instruction &I, const Splitter &Split) {
if (!VT)
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
IRBuilder<> Builder(&I);
Scatterer Op = scatter(&I, I.getOperand(0));
assert(Op.size() == NumElems && "Mismatched unary operation");
@@ -483,17 +485,19 @@ bool ScalarizerVisitor::splitBinary(Instruction &I, const Splitter &Split) {
if (!VT)
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
IRBuilder<> Builder(&I);
- Scatterer Op0 = scatter(&I, I.getOperand(0));
- Scatterer Op1 = scatter(&I, I.getOperand(1));
- assert(Op0.size() == NumElems && "Mismatched binary operation");
- assert(Op1.size() == NumElems && "Mismatched binary operation");
+ Scatterer VOp0 = scatter(&I, I.getOperand(0));
+ Scatterer VOp1 = scatter(&I, I.getOperand(1));
+ assert(VOp0.size() == NumElems && "Mismatched binary operation");
+ assert(VOp1.size() == NumElems && "Mismatched binary operation");
ValueVector Res;
Res.resize(NumElems);
- for (unsigned Elem = 0; Elem < NumElems; ++Elem)
- Res[Elem] = Split(Builder, Op0[Elem], Op1[Elem],
- I.getName() + ".i" + Twine(Elem));
+ for (unsigned Elem = 0; Elem < NumElems; ++Elem) {
+ Value *Op0 = VOp0[Elem];
+ Value *Op1 = VOp1[Elem];
+ Res[Elem] = Split(Builder, Op0, Op1, I.getName() + ".i" + Twine(Elem));
+ }
gather(&I, Res);
return true;
}
@@ -524,7 +528,7 @@ bool ScalarizerVisitor::splitCall(CallInst &CI) {
if (ID == Intrinsic::not_intrinsic || !isTriviallyScalariable(ID))
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
unsigned NumArgs = CI.getNumArgOperands();
ValueVector ScalarOperands(NumArgs);
@@ -574,26 +578,33 @@ bool ScalarizerVisitor::visitSelectInst(SelectInst &SI) {
if (!VT)
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
IRBuilder<> Builder(&SI);
- Scatterer Op1 = scatter(&SI, SI.getOperand(1));
- Scatterer Op2 = scatter(&SI, SI.getOperand(2));
- assert(Op1.size() == NumElems && "Mismatched select");
- assert(Op2.size() == NumElems && "Mismatched select");
+ Scatterer VOp1 = scatter(&SI, SI.getOperand(1));
+ Scatterer VOp2 = scatter(&SI, SI.getOperand(2));
+ assert(VOp1.size() == NumElems && "Mismatched select");
+ assert(VOp2.size() == NumElems && "Mismatched select");
ValueVector Res;
Res.resize(NumElems);
if (SI.getOperand(0)->getType()->isVectorTy()) {
- Scatterer Op0 = scatter(&SI, SI.getOperand(0));
- assert(Op0.size() == NumElems && "Mismatched select");
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateSelect(Op0[I], Op1[I], Op2[I],
+ Scatterer VOp0 = scatter(&SI, SI.getOperand(0));
+ assert(VOp0.size() == NumElems && "Mismatched select");
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Op0 = VOp0[I];
+ Value *Op1 = VOp1[I];
+ Value *Op2 = VOp2[I];
+ Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
SI.getName() + ".i" + Twine(I));
+ }
} else {
Value *Op0 = SI.getOperand(0);
- for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateSelect(Op0, Op1[I], Op2[I],
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *Op1 = VOp1[I];
+ Value *Op2 = VOp2[I];
+ Res[I] = Builder.CreateSelect(Op0, Op1, Op2,
SI.getName() + ".i" + Twine(I));
+ }
}
gather(&SI, Res);
return true;
@@ -621,7 +632,7 @@ bool ScalarizerVisitor::visitGetElementPtrInst(GetElementPtrInst &GEPI) {
return false;
IRBuilder<> Builder(&GEPI);
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
unsigned NumIndices = GEPI.getNumIndices();
// The base pointer might be scalar even if it's a vector GEP. In those cases,
@@ -666,7 +677,7 @@ bool ScalarizerVisitor::visitCastInst(CastInst &CI) {
if (!VT)
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
IRBuilder<> Builder(&CI);
Scatterer Op0 = scatter(&CI, CI.getOperand(0));
assert(Op0.size() == NumElems && "Mismatched cast");
@@ -685,8 +696,8 @@ bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
if (!DstVT || !SrcVT)
return false;
- unsigned DstNumElems = DstVT->getNumElements();
- unsigned SrcNumElems = SrcVT->getNumElements();
+ unsigned DstNumElems = cast<FixedVectorType>(DstVT)->getNumElements();
+ unsigned SrcNumElems = cast<FixedVectorType>(SrcVT)->getNumElements();
IRBuilder<> Builder(&BCI);
Scatterer Op0 = scatter(&BCI, BCI.getOperand(0));
ValueVector Res;
@@ -700,7 +711,7 @@ bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
// <M x t1> -> <N*M x t2>. Convert each t1 to <N x t2> and copy the
// individual elements to the destination.
unsigned FanOut = DstNumElems / SrcNumElems;
- Type *MidTy = VectorType::get(DstVT->getElementType(), FanOut);
+ auto *MidTy = FixedVectorType::get(DstVT->getElementType(), FanOut);
unsigned ResI = 0;
for (unsigned Op0I = 0; Op0I < SrcNumElems; ++Op0I) {
Value *V = Op0[Op0I];
@@ -718,7 +729,7 @@ bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
} else {
// <N*M x t1> -> <M x t2>. Convert each group of <N x t1> into a t2.
unsigned FanIn = SrcNumElems / DstNumElems;
- Type *MidTy = VectorType::get(SrcVT->getElementType(), FanIn);
+ auto *MidTy = FixedVectorType::get(SrcVT->getElementType(), FanIn);
unsigned Op0I = 0;
for (unsigned ResI = 0; ResI < DstNumElems; ++ResI) {
Value *V = UndefValue::get(MidTy);
@@ -734,12 +745,79 @@ bool ScalarizerVisitor::visitBitCastInst(BitCastInst &BCI) {
return true;
}
+bool ScalarizerVisitor::visitInsertElementInst(InsertElementInst &IEI) {
+ VectorType *VT = dyn_cast<VectorType>(IEI.getType());
+ if (!VT)
+ return false;
+
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&IEI);
+ Scatterer Op0 = scatter(&IEI, IEI.getOperand(0));
+ Value *NewElt = IEI.getOperand(1);
+ Value *InsIdx = IEI.getOperand(2);
+
+ ValueVector Res;
+ Res.resize(NumElems);
+
+ if (auto *CI = dyn_cast<ConstantInt>(InsIdx)) {
+ for (unsigned I = 0; I < NumElems; ++I)
+ Res[I] = CI->getValue().getZExtValue() == I ? NewElt : Op0[I];
+ } else {
+ if (!ScalarizeVariableInsertExtract)
+ return false;
+
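+ // Variable index: for each lane, compare the index against the lane number
+ // and select between the new element and the old lane value.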
+ for (unsigned I = 0; I < NumElems; ++I) {
+ Value *ShouldReplace =
+ Builder.CreateICmpEQ(InsIdx, ConstantInt::get(InsIdx->getType(), I),
+ InsIdx->getName() + ".is." + Twine(I));
+ Value *OldElt = Op0[I];
+ Res[I] = Builder.CreateSelect(ShouldReplace, NewElt, OldElt,
+ IEI.getName() + ".i" + Twine(I));
+ }
+ }
+
+ gather(&IEI, Res);
+ return true;
+}
+
+bool ScalarizerVisitor::visitExtractElementInst(ExtractElementInst &EEI) {
+ VectorType *VT = dyn_cast<VectorType>(EEI.getOperand(0)->getType());
+ if (!VT)
+ return false;
+
+ unsigned NumSrcElems = cast<FixedVectorType>(VT)->getNumElements();
+ IRBuilder<> Builder(&EEI);
+ Scatterer Op0 = scatter(&EEI, EEI.getOperand(0));
+ Value *ExtIdx = EEI.getOperand(1);
+
+ if (auto *CI = dyn_cast<ConstantInt>(ExtIdx)) {
+ Value *Res = Op0[CI->getValue().getZExtValue()];
+ gather(&EEI, {Res});
+ return true;
+ }
+
+ if (!ScalarizeVariableInsertExtract)
+ return false;
+
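+ // Variable index: fold the lanes into a chain of compares and selects,
+ // starting from undef, so the lane matching the index becomes the result.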
+ Value *Res = UndefValue::get(VT->getElementType());
+ for (unsigned I = 0; I < NumSrcElems; ++I) {
+ Value *ShouldExtract =
+ Builder.CreateICmpEQ(ExtIdx, ConstantInt::get(ExtIdx->getType(), I),
+ ExtIdx->getName() + ".is." + Twine(I));
+ Value *Elt = Op0[I];
+ Res = Builder.CreateSelect(ShouldExtract, Elt, Res,
+ EEI.getName() + ".upto" + Twine(I));
+ }
+ gather(&EEI, {Res});
+ return true;
+}
+
bool ScalarizerVisitor::visitShuffleVectorInst(ShuffleVectorInst &SVI) {
VectorType *VT = dyn_cast<VectorType>(SVI.getType());
if (!VT)
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
Scatterer Op0 = scatter(&SVI, SVI.getOperand(0));
Scatterer Op1 = scatter(&SVI, SVI.getOperand(1));
ValueVector Res;
@@ -763,7 +841,7 @@ bool ScalarizerVisitor::visitPHINode(PHINode &PHI) {
if (!VT)
return false;
- unsigned NumElems = VT->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(VT)->getNumElements();
IRBuilder<> Builder(&PHI);
ValueVector Res;
Res.resize(NumElems);
@@ -789,20 +867,20 @@ bool ScalarizerVisitor::visitLoadInst(LoadInst &LI) {
if (!LI.isSimple())
return false;
- VectorLayout Layout;
- if (!getVectorLayout(LI.getType(), LI.getAlignment(), Layout,
- LI.getModule()->getDataLayout()))
+ Optional<VectorLayout> Layout = getVectorLayout(
+ LI.getType(), LI.getAlign(), LI.getModule()->getDataLayout());
+ if (!Layout)
return false;
- unsigned NumElems = Layout.VecTy->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
IRBuilder<> Builder(&LI);
Scatterer Ptr = scatter(&LI, LI.getPointerOperand());
ValueVector Res;
Res.resize(NumElems);
for (unsigned I = 0; I < NumElems; ++I)
- Res[I] = Builder.CreateAlignedLoad(Layout.VecTy->getElementType(), Ptr[I],
- Layout.getElemAlign(I),
+ Res[I] = Builder.CreateAlignedLoad(Layout->VecTy->getElementType(), Ptr[I],
+ Align(Layout->getElemAlign(I)),
LI.getName() + ".i" + Twine(I));
gather(&LI, Res);
return true;
@@ -814,22 +892,23 @@ bool ScalarizerVisitor::visitStoreInst(StoreInst &SI) {
if (!SI.isSimple())
return false;
- VectorLayout Layout;
Value *FullValue = SI.getValueOperand();
- if (!getVectorLayout(FullValue->getType(), SI.getAlignment(), Layout,
- SI.getModule()->getDataLayout()))
+ Optional<VectorLayout> Layout = getVectorLayout(
+ FullValue->getType(), SI.getAlign(), SI.getModule()->getDataLayout());
+ if (!Layout)
return false;
- unsigned NumElems = Layout.VecTy->getNumElements();
+ unsigned NumElems = cast<FixedVectorType>(Layout->VecTy)->getNumElements();
IRBuilder<> Builder(&SI);
- Scatterer Ptr = scatter(&SI, SI.getPointerOperand());
- Scatterer Val = scatter(&SI, FullValue);
+ Scatterer VPtr = scatter(&SI, SI.getPointerOperand());
+ Scatterer VVal = scatter(&SI, FullValue);
ValueVector Stores;
Stores.resize(NumElems);
for (unsigned I = 0; I < NumElems; ++I) {
- unsigned Align = Layout.getElemAlign(I);
- Stores[I] = Builder.CreateAlignedStore(Val[I], Ptr[I], Align);
+ Value *Val = VVal[I];
+ Value *Ptr = VPtr[I];
+ Stores[I] = Builder.CreateAlignedStore(Val, Ptr, Layout->getElemAlign(I));
}
transferMetadataAndIRFlags(&SI, Stores);
return true;
@@ -852,23 +931,32 @@ bool ScalarizerVisitor::finish() {
if (!Op->use_empty()) {
// The value is still needed, so recreate it using a series of
// InsertElements.
- Type *Ty = Op->getType();
- Value *Res = UndefValue::get(Ty);
- BasicBlock *BB = Op->getParent();
- unsigned Count = Ty->getVectorNumElements();
- IRBuilder<> Builder(Op);
- if (isa<PHINode>(Op))
- Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
- for (unsigned I = 0; I < Count; ++I)
- Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
- Op->getName() + ".upto" + Twine(I));
+ Value *Res = UndefValue::get(Op->getType());
+ if (auto *Ty = dyn_cast<VectorType>(Op->getType())) {
+ BasicBlock *BB = Op->getParent();
+ unsigned Count = cast<FixedVectorType>(Ty)->getNumElements();
+ IRBuilder<> Builder(Op);
+ if (isa<PHINode>(Op))
+ Builder.SetInsertPoint(BB, BB->getFirstInsertionPt());
+ for (unsigned I = 0; I < Count; ++I)
+ Res = Builder.CreateInsertElement(Res, CV[I], Builder.getInt32(I),
+ Op->getName() + ".upto" + Twine(I));
+ } else {
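+ // The replacement is scalar: there is exactly one scattered value, and it
+ // is used directly (unless it is the original instruction itself).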
+ assert(CV.size() == 1 && Op->getType() == CV[0]->getType());
+ Res = CV[0];
+ if (Op == Res)
+ continue;
+ }
Res->takeName(Op);
Op->replaceAllUsesWith(Res);
}
- Op->eraseFromParent();
+ PotentiallyDeadInstrs.emplace_back(Op);
}
Gathered.clear();
Scattered.clear();
+
+ RecursivelyDeleteTriviallyDeadInstructionsPermissive(PotentiallyDeadInstrs);
+
return true;
}
diff --git a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
index 2a1a040bf83e..f1d2e3c1ecfa 100644
--- a/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
+++ b/llvm/lib/Transforms/Scalar/SeparateConstOffsetFromGEP.cpp
@@ -431,8 +431,10 @@ private:
bool reuniteExts(Instruction *I);
/// Find the closest dominator of <Dominatee> that is equivalent to <Key>.
- Instruction *findClosestMatchingDominator(const SCEV *Key,
- Instruction *Dominatee);
+ Instruction *findClosestMatchingDominator(
+ const SCEV *Key, Instruction *Dominatee,
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs);
+
/// Verify F is free of dead code.
void verifyNoDeadCode(Function &F);
@@ -456,7 +458,8 @@ private:
/// multiple GEPs with a single index.
bool LowerGEP;
- DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingExprs;
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingAdds;
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> DominatingSubs;
};
} // end anonymous namespace
@@ -519,7 +522,7 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
// sext(a + b) = sext(a) + sext(b)
// even if the addition is not marked nsw.
//
- // Leveraging this invarient, we can trace into an sext'ed inbound GEP
+ // Leveraging this invariant, we can trace into an sext'ed inbound GEP
// index if the constant offset is non-negative.
//
// Verified in @sext_add in split-gep.ll.
@@ -549,6 +552,9 @@ bool ConstantOffsetExtractor::CanTraceInto(bool SignExtended,
APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
bool SignExtended,
bool ZeroExtended) {
+ // Save off the current height of the chain, in case we need to restore it.
+ size_t ChainLength = UserChain.size();
+
// BO being non-negative does not shed light on whether its operands are
// non-negative. Clear the NonNegative flag here.
APInt ConstantOffset = find(BO->getOperand(0), SignExtended, ZeroExtended,
@@ -559,12 +565,22 @@ APInt ConstantOffsetExtractor::findInEitherOperand(BinaryOperator *BO,
// However, such cases are probably already handled by -instcombine,
// given this pass runs after the standard optimizations.
if (ConstantOffset != 0) return ConstantOffset;
+
+ // Reset the chain back to where it was when we started exploring this node,
+ // since visiting the LHS didn't pan out.
+ UserChain.resize(ChainLength);
+
ConstantOffset = find(BO->getOperand(1), SignExtended, ZeroExtended,
/* NonNegative */ false);
// If U is a sub operator, negate the constant offset found in the right
// operand.
if (BO->getOpcode() == Instruction::Sub)
ConstantOffset = -ConstantOffset;
+
+ // If RHS wasn't a suitable candidate either, reset the chain again.
+ if (ConstantOffset == 0)
+ UserChain.resize(ChainLength);
+
return ConstantOffset;
}
@@ -688,7 +704,7 @@ Value *ConstantOffsetExtractor::removeConstOffset(unsigned ChainIndex) {
}
BinaryOperator *BO = cast<BinaryOperator>(UserChain[ChainIndex]);
- assert(BO->getNumUses() <= 1 &&
+ assert((BO->use_empty() || BO->hasOneUse()) &&
"distributeExtsAndCloneChain clones each BinaryOperator in "
"UserChain, so no one should be used more than "
"once");
@@ -1141,7 +1157,8 @@ bool SeparateConstOffsetFromGEP::runOnFunction(Function &F) {
}
Instruction *SeparateConstOffsetFromGEP::findClosestMatchingDominator(
- const SCEV *Key, Instruction *Dominatee) {
+ const SCEV *Key, Instruction *Dominatee,
+ DenseMap<const SCEV *, SmallVector<Instruction *, 2>> &DominatingExprs) {
auto Pos = DominatingExprs.find(Key);
if (Pos == DominatingExprs.end())
return nullptr;
@@ -1169,12 +1186,23 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
// If Dom can't sign overflow and Dom dominates I, optimize I to sext(Dom).
// TODO: handle zext
Value *LHS = nullptr, *RHS = nullptr;
- if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS)))) ||
- match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
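+ // Adds and subs are matched against separate maps, so a sext'ed add is only
+ // reunited with a dominating add and a sub only with a dominating sub.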
+ if (match(I, m_Add(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
if (LHS->getType() == RHS->getType()) {
const SCEV *Key =
SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- if (auto *Dom = findClosestMatchingDominator(Key, I)) {
+ if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingAdds)) {
+ Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
+ NewSExt->takeName(I);
+ I->replaceAllUsesWith(NewSExt);
+ RecursivelyDeleteTriviallyDeadInstructions(I);
+ return true;
+ }
+ }
+ } else if (match(I, m_Sub(m_SExt(m_Value(LHS)), m_SExt(m_Value(RHS))))) {
+ if (LHS->getType() == RHS->getType()) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ if (auto *Dom = findClosestMatchingDominator(Key, I, DominatingSubs)) {
Instruction *NewSExt = new SExtInst(Dom, I->getType(), "", I);
NewSExt->takeName(I);
I->replaceAllUsesWith(NewSExt);
@@ -1185,12 +1213,17 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
}
// Add I to DominatingExprs if it's an add/sub that can't sign overflow.
- if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS))) ||
- match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
- if (programUndefinedIfFullPoison(I)) {
+ if (match(I, m_NSWAdd(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfPoison(I)) {
+ const SCEV *Key =
+ SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
+ DominatingAdds[Key].push_back(I);
+ }
+ } else if (match(I, m_NSWSub(m_Value(LHS), m_Value(RHS)))) {
+ if (programUndefinedIfPoison(I)) {
const SCEV *Key =
SE->getAddExpr(SE->getUnknown(LHS), SE->getUnknown(RHS));
- DominatingExprs[Key].push_back(I);
+ DominatingSubs[Key].push_back(I);
}
}
return false;
@@ -1198,7 +1231,8 @@ bool SeparateConstOffsetFromGEP::reuniteExts(Instruction *I) {
bool SeparateConstOffsetFromGEP::reuniteExts(Function &F) {
bool Changed = false;
- DominatingExprs.clear();
+ DominatingAdds.clear();
+ DominatingSubs.clear();
for (const auto Node : depth_first(DT)) {
BasicBlock *BB = Node->getBlock();
for (auto I = BB->begin(); I != BB->end(); ) {
diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
index d7a34acb4318..6c6d6ca9cf65 100644
--- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
+++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp
@@ -26,7 +26,6 @@
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
#include "llvm/Analysis/MemorySSAUpdater.h"
-#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
@@ -36,6 +35,7 @@
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Use.h"
#include "llvm/IR/Value.h"
#include "llvm/InitializePasses.h"
@@ -182,7 +182,7 @@ static void buildPartialUnswitchConditionalBranch(BasicBlock &BB,
BasicBlock &UnswitchedSucc,
BasicBlock &NormalSucc) {
IRBuilder<> IRB(&BB);
-
+
Value *Cond = Direction ? IRB.CreateOr(Invariants) :
IRB.CreateAnd(Invariants);
IRB.CreateCondBr(Cond, Direction ? &UnswitchedSucc : &NormalSucc,
@@ -598,19 +598,36 @@ static bool unswitchTrivialSwitch(Loop &L, SwitchInst &SI, DominatorTree &DT,
auto *ParentBB = SI.getParent();
+ // The same check must be used both for the default and the exit cases. We
+ // should never leave edges from the switch instruction to a basic block that
+ // we are unswitching, hence the condition used to determine the default case
+ // needs to also be used to populate ExitCaseIndices, which is then used to
+ // remove cases from the switch.
+ auto IsTriviallyUnswitchableExitBlock = [&](BasicBlock &BBToCheck) {
+ // BBToCheck is not an exit block if it is inside loop L.
+ if (L.contains(&BBToCheck))
+ return false;
+ // BBToCheck is not trivial to unswitch if its phis aren't loop invariant.
+ if (!areLoopExitPHIsLoopInvariant(L, *ParentBB, BBToCheck))
+ return false;
+ // We do not unswitch a block that only has an unreachable statement, as
+ // it's possible this is a previously unswitched block. Only unswitch if
+ // either the terminator is not unreachable, or, if it is, it's not the only
+ // instruction in the block.
+ auto *TI = BBToCheck.getTerminator();
+ bool isUnreachable = isa<UnreachableInst>(TI);
+ return !isUnreachable ||
+ (isUnreachable && (BBToCheck.getFirstNonPHIOrDbg() != TI));
+ };
+
SmallVector<int, 4> ExitCaseIndices;
- for (auto Case : SI.cases()) {
- auto *SuccBB = Case.getCaseSuccessor();
- if (!L.contains(SuccBB) &&
- areLoopExitPHIsLoopInvariant(L, *ParentBB, *SuccBB))
+ for (auto Case : SI.cases())
+ if (IsTriviallyUnswitchableExitBlock(*Case.getCaseSuccessor()))
ExitCaseIndices.push_back(Case.getCaseIndex());
- }
BasicBlock *DefaultExitBB = nullptr;
SwitchInstProfUpdateWrapper::CaseWeightOpt DefaultCaseWeight =
SwitchInstProfUpdateWrapper::getSuccessorWeight(SI, 0);
- if (!L.contains(SI.getDefaultDest()) &&
- areLoopExitPHIsLoopInvariant(L, *ParentBB, *SI.getDefaultDest()) &&
- !isa<UnreachableInst>(SI.getDefaultDest()->getTerminator())) {
+ if (IsTriviallyUnswitchableExitBlock(*SI.getDefaultDest())) {
DefaultExitBB = SI.getDefaultDest();
} else if (ExitCaseIndices.empty())
return false;
@@ -1557,6 +1574,11 @@ static void deleteDeadBlocksFromLoop(Loop &L,
// Check that the dominator tree has already been updated.
assert(!DT.getNode(BB) && "Should already have cleared domtree!");
LI.changeLoopFor(BB, nullptr);
+ // Drop all uses of the instructions to make sure we won't have dangling
+ // uses in other blocks.
+ for (auto &I : *BB)
+ if (!I.use_empty())
+ I.replaceAllUsesWith(UndefValue::get(I.getType()));
BB->dropAllReferences();
}
@@ -2465,7 +2487,7 @@ turnGuardIntoBranch(IntrinsicInst *GI, Loop &L,
/// unswitch candidates, making adequate predictions instead of wild guesses.
/// That requires knowing not just the number of "remaining" candidates but
/// also costs of unswitching for each of these candidates.
-static int calculateUnswitchCostMultiplier(
+static int CalculateUnswitchCostMultiplier(
Instruction &TI, Loop &L, LoopInfo &LI, DominatorTree &DT,
ArrayRef<std::pair<Instruction *, TinyPtrVector<Value *>>>
UnswitchCandidates) {
@@ -2656,11 +2678,11 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
if (I.getType()->isTokenTy() && I.isUsedOutsideOfBlock(BB))
return false;
- if (auto CS = CallSite(&I))
- if (CS.isConvergent() || CS.cannotDuplicate())
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ if (CB->isConvergent() || CB->cannotDuplicate())
return false;
- Cost += TTI.getUserCost(&I);
+ Cost += TTI.getUserCost(&I, TargetTransformInfo::TCK_CodeSize);
}
assert(Cost >= 0 && "Must not have negative costs!");
LoopCost += Cost;
@@ -2754,7 +2776,7 @@ unswitchBestCondition(Loop &L, DominatorTree &DT, LoopInfo &LI,
// exponential behavior of loop-unswitch.
if (EnableUnswitchCostMultiplier) {
int CostMultiplier =
- calculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
+ CalculateUnswitchCostMultiplier(TI, L, LI, DT, UnswitchCandidates);
assert(
(CostMultiplier > 0 && CostMultiplier <= UnswitchThreshold) &&
"cost multiplier needs to be in the range of 1..UnswitchThreshold");
@@ -2868,7 +2890,7 @@ PreservedAnalyses SimpleLoopUnswitchPass::run(Loop &L, LoopAnalysisManager &AM,
// Save the current loop name in a variable so that we can report it even
// after it has been deleted.
- std::string LoopName = L.getName();
+ std::string LoopName = std::string(L.getName());
auto UnswitchCB = [&L, &U, &LoopName](bool CurrentLoopValid,
ArrayRef<Loop *> NewLoops) {
@@ -2983,10 +3005,6 @@ bool SimpleLoopUnswitchLegacyPass::runOnLoop(Loop *L, LPPassManager &LPM) {
if (MSSA && VerifyMemorySSA)
MSSA->verifyMemorySSA();
- // If anything was unswitched, also clear any cached information about this
- // loop.
- LPM.deleteSimpleAnalysisLoop(L);
-
// Historically this pass has had issues with the dominator tree so verify it
// in asserts builds.
assert(DT.verify(DominatorTree::VerificationLevel::Fast));
diff --git a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
index 623a8b711ed8..2e459c9a64d4 100644
--- a/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
+++ b/llvm/lib/Transforms/Scalar/SimplifyCFGPass.cpp
@@ -104,6 +104,21 @@ static bool mergeEmptyReturnBlocks(Function &F) {
continue;
}
+ // Skip merging if this would result in a CallBr instruction with a
+ // duplicate destination. FIXME: See note in CodeGenPrepare.cpp.
+ bool SkipCallBr = false;
+ for (pred_iterator PI = pred_begin(&BB), E = pred_end(&BB);
+ PI != E && !SkipCallBr; ++PI) {
+ if (auto *CBI = dyn_cast<CallBrInst>((*PI)->getTerminator()))
+ for (unsigned i = 0, e = CBI->getNumSuccessors(); i != e; ++i)
+ if (RetBlock == CBI->getSuccessor(i)) {
+ SkipCallBr = true;
+ break;
+ }
+ }
+ if (SkipCallBr)
+ continue;
+
// Otherwise, we found a duplicate return block. Merge the two.
Changed = true;
@@ -266,6 +281,14 @@ struct CFGSimplifyPass : public FunctionPass {
return false;
Options.AC = &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ if (F.hasFnAttribute(Attribute::OptForFuzzing)) {
+ Options.setSimplifyCondBranch(false)
+ .setFoldTwoEntryPHINode(false);
+ } else {
+ Options.setSimplifyCondBranch(true)
+ .setFoldTwoEntryPHINode(true);
+ }
+
auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
return simplifyFunctionCFG(F, TTI, Options);
}
diff --git a/llvm/lib/Transforms/Scalar/Sink.cpp b/llvm/lib/Transforms/Scalar/Sink.cpp
index 677d86f8c7b4..48f289c8f17d 100644
--- a/llvm/lib/Transforms/Scalar/Sink.cpp
+++ b/llvm/lib/Transforms/Scalar/Sink.cpp
@@ -166,8 +166,8 @@ static bool SinkInstruction(Instruction *Inst,
// dominated by one of the successors.
// Look at all the dominated blocks and see if we can sink it in one.
DomTreeNode *DTN = DT.getNode(Inst->getParent());
- for (DomTreeNode::iterator I = DTN->begin(), E = DTN->end();
- I != E && SuccToSinkTo == nullptr; ++I) {
+ for (auto I = DTN->begin(), E = DTN->end(); I != E && SuccToSinkTo == nullptr;
+ ++I) {
BasicBlock *Candidate = (*I)->getBlock();
// A node always immediate-dominates its children on the dominator
// tree.
diff --git a/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp b/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
index cd7bfb2f20dc..8258b92a716d 100644
--- a/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculateAroundPHIs.cpp
@@ -67,8 +67,8 @@ isSafeToSpeculatePHIUsers(PHINode &PN, DominatorTree &DT,
return false;
}
- if (auto CS = ImmutableCallSite(UI)) {
- if (CS.isConvergent() || CS.cannotDuplicate()) {
+ if (const auto *CS = dyn_cast<CallBase>(UI)) {
+ if (CS->isConvergent() || CS->cannotDuplicate()) {
LLVM_DEBUG(dbgs() << " Unsafe: convergent "
"callsite cannot de duplicated: " << *UI << '\n');
return false;
@@ -232,7 +232,8 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
continue;
int &MatCost = InsertResult.first->second.MatCost;
- MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType());
+ MatCost = TTI.getIntImmCost(IncomingC->getValue(), IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
NonFreeMat |= MatCost != TTI.TCC_Free;
}
if (!NonFreeMat) {
@@ -283,12 +284,15 @@ static bool isSafeAndProfitableToSpeculateAroundPHI(
int MatCost = IncomingConstantAndCostsAndCount.second.MatCost;
int &FoldedCost = IncomingConstantAndCostsAndCount.second.FoldedCost;
if (IID)
- FoldedCost += TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
- IncomingC->getType());
+ FoldedCost +=
+ TTI.getIntImmCostIntrin(IID, Idx, IncomingC->getValue(),
+ IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
else
FoldedCost +=
TTI.getIntImmCostInst(UserI->getOpcode(), Idx,
- IncomingC->getValue(), IncomingC->getType());
+ IncomingC->getValue(), IncomingC->getType(),
+ TargetTransformInfo::TCK_SizeAndLatency);
// If we accumulate more folded cost for this incoming constant than
// materialized cost, then we'll regress any edge with this constant so
@@ -465,7 +469,7 @@ findProfitablePHIs(ArrayRef<PHINode *> PNs,
if (CostMapIt != SpecCostMap.end())
Cost += CostMapIt->second;
}
- Cost += TTI.getUserCost(I);
+ Cost += TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
bool Inserted = SpecCostMap.insert({I, Cost}).second;
(void)Inserted;
assert(Inserted && "Must not re-insert a cost during the DFS!");
diff --git a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
index c8d899bb4871..f82a2936c762 100644
--- a/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
+++ b/llvm/lib/Transforms/Scalar/SpeculativeExecution.cpp
@@ -65,6 +65,7 @@
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Operator.h"
#include "llvm/InitializePasses.h"
@@ -244,19 +245,35 @@ static unsigned ComputeSpeculationCost(const Instruction *I,
case Instruction::FNeg:
case Instruction::ICmp:
case Instruction::FCmp:
- return TTI.getUserCost(I);
+ return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
default:
- return UINT_MAX; // Disallow anything not whitelisted.
+ return UINT_MAX; // Disallow anything not explicitly listed.
}
}
bool SpeculativeExecutionPass::considerHoistingFromTo(
BasicBlock &FromBlock, BasicBlock &ToBlock) {
SmallPtrSet<const Instruction *, 8> NotHoisted;
- const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](User *U) {
- for (Value* V : U->operand_values()) {
- if (Instruction *I = dyn_cast<Instruction>(V)) {
+ const auto AllPrecedingUsesFromBlockHoisted = [&NotHoisted](const User *U) {
+ // A debug variable intrinsic refers to its value through a special location
+ // operand, so check that operand rather than the normal operand list.
+ if (const auto *DVI = dyn_cast<DbgVariableIntrinsic>(U)) {
+ if (const auto *I =
+ dyn_cast_or_null<Instruction>(DVI->getVariableLocation()))
+ if (NotHoisted.count(I) == 0)
+ return true;
+ return false;
+ }
+
+ // Usually a debug label intrinsic corresponds to a label in LLVM IR. In these
+ // cases we should not move it here.
+ // TODO: Possible special processing needed to detect it is related to a
+ // hoisted instruction.
+ if (isa<DbgLabelInst>(U))
+ return false;
+
+ for (const Value *V : U->operand_values()) {
+ if (const Instruction *I = dyn_cast<Instruction>(V)) {
if (NotHoisted.count(I) > 0)
return false;
}
@@ -265,7 +282,8 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
};
unsigned TotalSpeculationCost = 0;
- for (auto& I : FromBlock) {
+ unsigned NotHoistedInstCount = 0;
+ for (const auto &I : FromBlock) {
const unsigned Cost = ComputeSpeculationCost(&I, *TTI);
if (Cost != UINT_MAX && isSafeToSpeculativelyExecute(&I) &&
AllPrecedingUsesFromBlockHoisted(&I)) {
@@ -273,15 +291,15 @@ bool SpeculativeExecutionPass::considerHoistingFromTo(
if (TotalSpeculationCost > SpecExecMaxSpeculationCost)
return false; // too much to hoist
} else {
- NotHoisted.insert(&I);
- if (NotHoisted.size() > SpecExecMaxNotHoisted)
+ // Debug info intrinsics should not be counted towards the threshold.
+ if (!isa<DbgInfoIntrinsic>(I))
+ NotHoistedInstCount++;
+ if (NotHoistedInstCount > SpecExecMaxNotHoisted)
return false; // too much left behind
+ NotHoisted.insert(&I);
}
}
- if (TotalSpeculationCost == 0)
- return false; // nothing to hoist
-
for (auto I = FromBlock.begin(); I != FromBlock.end();) {
// We have to increment I before moving Current as moving Current
// changes the list that I is iterating through.
diff --git a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
index 4ce4ce46f67a..c20e57b02c1a 100644
--- a/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
+++ b/llvm/lib/Transforms/Scalar/StructurizeCFG.cpp
@@ -8,13 +8,12 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/MapVector.h"
-#include "llvm/ADT/PostOrderIterator.h"
+#include "llvm/ADT/SCCIterator.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/InstructionSimplify.h"
#include "llvm/Analysis/LegacyDivergenceAnalysis.h"
-#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/RegionInfo.h"
#include "llvm/Analysis/RegionIterator.h"
#include "llvm/Analysis/RegionPass.h"
@@ -34,6 +33,7 @@
#include "llvm/IR/Use.h"
#include "llvm/IR/User.h"
#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
#include "llvm/Pass.h"
#include "llvm/Support/Casting.h"
@@ -43,6 +43,7 @@
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Scalar.h"
#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/Local.h"
#include "llvm/Transforms/Utils/SSAUpdater.h"
#include <algorithm>
#include <cassert>
@@ -88,6 +89,59 @@ using BBPredicates = DenseMap<BasicBlock *, Value *>;
using PredMap = DenseMap<BasicBlock *, BBPredicates>;
using BB2BBMap = DenseMap<BasicBlock *, BasicBlock *>;
+// A traits type that is intended to be used in graph algorithms. The graph
+// traits starts at an entry node, and traverses the RegionNodes that are in
+// the Nodes set.
+struct SubGraphTraits {
+ using NodeRef = std::pair<RegionNode *, SmallDenseSet<RegionNode *> *>;
+ using BaseSuccIterator = GraphTraits<RegionNode *>::ChildIteratorType;
+
+ // This wraps a set of Nodes into the iterator, so we know which edges to
+ // filter out.
+ class WrappedSuccIterator
+ : public iterator_adaptor_base<
+ WrappedSuccIterator, BaseSuccIterator,
+ typename std::iterator_traits<BaseSuccIterator>::iterator_category,
+ NodeRef, std::ptrdiff_t, NodeRef *, NodeRef> {
+ SmallDenseSet<RegionNode *> *Nodes;
+
+ public:
+ WrappedSuccIterator(BaseSuccIterator It, SmallDenseSet<RegionNode *> *Nodes)
+ : iterator_adaptor_base(It), Nodes(Nodes) {}
+
+ NodeRef operator*() const { return {*I, Nodes}; }
+ };
+
+ static bool filterAll(const NodeRef &N) { return true; }
+ static bool filterSet(const NodeRef &N) { return N.second->count(N.first); }
+
+ using ChildIteratorType =
+ filter_iterator<WrappedSuccIterator, bool (*)(const NodeRef &)>;
+
+ static NodeRef getEntryNode(Region *R) {
+ return {GraphTraits<Region *>::getEntryNode(R), nullptr};
+ }
+
+ static NodeRef getEntryNode(NodeRef N) { return N; }
+
+ static iterator_range<ChildIteratorType> children(const NodeRef &N) {
+ auto *filter = N.second ? &filterSet : &filterAll;
+ return make_filter_range(
+ make_range<WrappedSuccIterator>(
+ {GraphTraits<RegionNode *>::child_begin(N.first), N.second},
+ {GraphTraits<RegionNode *>::child_end(N.first), N.second}),
+ filter);
+ }
+
+ static ChildIteratorType child_begin(const NodeRef &N) {
+ return children(N).begin();
+ }
+
+ static ChildIteratorType child_end(const NodeRef &N) {
+ return children(N).end();
+ }
+};
+
/// Finds the nearest common dominator of a set of BasicBlocks.
///
/// For every BB you add to the set, you can specify whether we "remember" the
@@ -192,11 +246,11 @@ class StructurizeCFG : public RegionPass {
LegacyDivergenceAnalysis *DA;
DominatorTree *DT;
- LoopInfo *LI;
SmallVector<RegionNode *, 8> Order;
BBSet Visited;
+ SmallVector<WeakVH, 8> AffectedPhis;
BBPhiMap DeletedPhis;
BB2BBVecMap AddedPhis;
@@ -211,13 +265,8 @@ class StructurizeCFG : public RegionPass {
void orderNodes();
- Loop *getAdjustedLoop(RegionNode *RN);
- unsigned getAdjustedLoopDepth(RegionNode *RN);
-
void analyzeLoops(RegionNode *N);
- Value *invert(Value *Condition);
-
Value *buildCondition(BranchInst *Term, unsigned Idx, bool Invert);
void gatherPredicates(RegionNode *N);
@@ -232,6 +281,8 @@ class StructurizeCFG : public RegionPass {
void setPhiValues();
+ void simplifyAffectedPhis();
+
void killTerminator(BasicBlock *BB);
void changeExit(RegionNode *Node, BasicBlock *NewExit,
@@ -279,7 +330,6 @@ public:
AU.addRequired<LegacyDivergenceAnalysis>();
AU.addRequiredID(LowerSwitchID);
AU.addRequired<DominatorTreeWrapperPass>();
- AU.addRequired<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
RegionPass::getAnalysisUsage(AU);
@@ -311,75 +361,60 @@ bool StructurizeCFG::doInitialization(Region *R, RGPassManager &RGM) {
return false;
}
-/// Use the exit block to determine the loop if RN is a SubRegion.
-Loop *StructurizeCFG::getAdjustedLoop(RegionNode *RN) {
- if (RN->isSubRegion()) {
- Region *SubRegion = RN->getNodeAs<Region>();
- return LI->getLoopFor(SubRegion->getExit());
- }
-
- return LI->getLoopFor(RN->getEntry());
-}
-
-/// Use the exit block to determine the loop depth if RN is a SubRegion.
-unsigned StructurizeCFG::getAdjustedLoopDepth(RegionNode *RN) {
- if (RN->isSubRegion()) {
- Region *SubR = RN->getNodeAs<Region>();
- return LI->getLoopDepth(SubR->getExit());
- }
-
- return LI->getLoopDepth(RN->getEntry());
-}
-
-/// Build up the general order of nodes
+/// Build up the general order of nodes by performing a topological sort of the
+/// parent region's nodes, while ensuring that there is no outer cycle node
+/// between any two inner cycle nodes.
void StructurizeCFG::orderNodes() {
- ReversePostOrderTraversal<Region*> RPOT(ParentRegion);
- SmallDenseMap<Loop*, unsigned, 8> LoopBlocks;
-
- // The reverse post-order traversal of the list gives us an ordering close
- // to what we want. The only problem with it is that sometimes backedges
- // for outer loops will be visited before backedges for inner loops.
- for (RegionNode *RN : RPOT) {
- Loop *Loop = getAdjustedLoop(RN);
- ++LoopBlocks[Loop];
- }
-
- unsigned CurrentLoopDepth = 0;
- Loop *CurrentLoop = nullptr;
- for (auto I = RPOT.begin(), E = RPOT.end(); I != E; ++I) {
- RegionNode *RN = cast<RegionNode>(*I);
- unsigned LoopDepth = getAdjustedLoopDepth(RN);
-
- if (is_contained(Order, *I))
- continue;
-
- if (LoopDepth < CurrentLoopDepth) {
- // Make sure we have visited all blocks in this loop before moving back to
- // the outer loop.
+ Order.resize(std::distance(GraphTraits<Region *>::nodes_begin(ParentRegion),
+ GraphTraits<Region *>::nodes_end(ParentRegion)));
+ if (Order.empty())
+ return;
- auto LoopI = I;
- while (unsigned &BlockCount = LoopBlocks[CurrentLoop]) {
- LoopI++;
- if (getAdjustedLoop(cast<RegionNode>(*LoopI)) == CurrentLoop) {
- --BlockCount;
- Order.push_back(*LoopI);
- }
+ SmallDenseSet<RegionNode *> Nodes;
+ auto EntryNode = SubGraphTraits::getEntryNode(ParentRegion);
+
+ // A list of range indices of SCCs in Order, to be processed.
+ SmallVector<std::pair<unsigned, unsigned>, 8> WorkList;
+ unsigned I = 0, E = Order.size();
+ while (true) {
+ // Run through all the SCCs in the subgraph starting with Entry.
+ for (auto SCCI =
+ scc_iterator<SubGraphTraits::NodeRef, SubGraphTraits>::begin(
+ EntryNode);
+ !SCCI.isAtEnd(); ++SCCI) {
+ auto &SCC = *SCCI;
+
+ // An SCC of size up to 2 can be reduced to an entry (the last node),
+ // and a possible additional node. Therefore, it is already in order, and
+ // there is no need to add it to the work-list.
+ unsigned Size = SCC.size();
+ if (Size > 2)
+ WorkList.emplace_back(I, I + Size);
+
+ // Add the SCC nodes to the Order array.
+ for (auto &N : SCC) {
+ assert(I < E && "SCC size mismatch!");
+ Order[I++] = N.first;
}
}
+ assert(I == E && "SCC size mismatch!");
- CurrentLoop = getAdjustedLoop(RN);
- if (CurrentLoop)
- LoopBlocks[CurrentLoop]--;
+ // If there are no more SCCs to order, then we are done.
+ if (WorkList.empty())
+ break;
- CurrentLoopDepth = LoopDepth;
- Order.push_back(*I);
- }
+ std::tie(I, E) = WorkList.pop_back_val();
+
+ // Collect the set of nodes in the SCC's subgraph. These are only the
+ // possible child nodes; we do not add the entry (the last node), otherwise we
+ // will have the same exact SCC all over again.
+ Nodes.clear();
+ Nodes.insert(Order.begin() + I, Order.begin() + E - 1);
- // This pass originally used a post-order traversal and then operated on
- // the list in reverse. Now that we are using a reverse post-order traversal
- // rather than re-working the whole pass to operate on the list in order,
- // we just reverse the list and continue to operate on it in reverse.
- std::reverse(Order.begin(), Order.end());
+ // Update the entry node.
+ EntryNode.first = Order[E - 1];
+ EntryNode.second = &Nodes;
+ }
}
/// Determine the end of the loops
@@ -401,39 +436,6 @@ void StructurizeCFG::analyzeLoops(RegionNode *N) {
}
}
-/// Invert the given condition
-Value *StructurizeCFG::invert(Value *Condition) {
- // First: Check if it's a constant
- if (Constant *C = dyn_cast<Constant>(Condition))
- return ConstantExpr::getNot(C);
-
- // Second: If the condition is already inverted, return the original value
- Value *NotCondition;
- if (match(Condition, m_Not(m_Value(NotCondition))))
- return NotCondition;
-
- if (Instruction *Inst = dyn_cast<Instruction>(Condition)) {
- // Third: Check all the users for an invert
- BasicBlock *Parent = Inst->getParent();
- for (User *U : Condition->users())
- if (Instruction *I = dyn_cast<Instruction>(U))
- if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
- return I;
-
- // Last option: Create a new instruction
- return BinaryOperator::CreateNot(Condition, "", Parent->getTerminator());
- }
-
- if (Argument *Arg = dyn_cast<Argument>(Condition)) {
- BasicBlock &EntryBlock = Arg->getParent()->getEntryBlock();
- return BinaryOperator::CreateNot(Condition,
- Arg->getName() + ".inv",
- EntryBlock.getTerminator());
- }
-
- llvm_unreachable("Unhandled condition to invert");
-}
-
/// Build the condition for one edge
Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
bool Invert) {
@@ -442,7 +444,7 @@ Value *StructurizeCFG::buildCondition(BranchInst *Term, unsigned Idx,
Cond = Term->getCondition();
if (Idx != (unsigned)Invert)
- Cond = invert(Cond);
+ Cond = invertCondition(Cond);
}
return Cond;
}
@@ -520,8 +522,7 @@ void StructurizeCFG::collectInfos() {
for (RegionNode *RN : reverse(Order)) {
LLVM_DEBUG(dbgs() << "Visiting: "
<< (RN->isSubRegion() ? "SubRegion with entry: " : "")
- << RN->getEntry()->getName() << " Loop Depth: "
- << LI->getLoopDepth(RN->getEntry()) << "\n");
+ << RN->getEntry()->getName() << "\n");
// Analyze all the conditions leading to a node
gatherPredicates(RN);
@@ -585,9 +586,14 @@ void StructurizeCFG::insertConditions(bool Loops) {
void StructurizeCFG::delPhiValues(BasicBlock *From, BasicBlock *To) {
PhiMap &Map = DeletedPhis[To];
for (PHINode &Phi : To->phis()) {
+ bool Recorded = false;
while (Phi.getBasicBlockIndex(From) != -1) {
Value *Deleted = Phi.removeIncomingValue(From, false);
Map[&Phi].push_back(std::make_pair(From, Deleted));
+ if (!Recorded) {
+ AffectedPhis.push_back(&Phi);
+ Recorded = true;
+ }
}
}
}
@@ -632,28 +638,29 @@ void StructurizeCFG::setPhiValues() {
for (BasicBlock *FI : From)
Phi->setIncomingValueForBlock(FI, Updater.GetValueAtEndOfBlock(FI));
+ AffectedPhis.push_back(Phi);
}
DeletedPhis.erase(To);
}
assert(DeletedPhis.empty());
- // Simplify any phis inserted by the SSAUpdater if possible
+ AffectedPhis.append(InsertedPhis.begin(), InsertedPhis.end());
+}
+
+void StructurizeCFG::simplifyAffectedPhis() {
bool Changed;
do {
Changed = false;
-
SimplifyQuery Q(Func->getParent()->getDataLayout());
Q.DT = DT;
- for (size_t i = 0; i < InsertedPhis.size(); ++i) {
- PHINode *Phi = InsertedPhis[i];
- if (Value *V = SimplifyInstruction(Phi, Q)) {
- Phi->replaceAllUsesWith(V);
- Phi->eraseFromParent();
- InsertedPhis[i] = InsertedPhis.back();
- InsertedPhis.pop_back();
- i--;
- Changed = true;
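+ // AffectedPhis holds weak value handles, so entries may have been
+ // invalidated by earlier simplifications; only live PHI nodes are visited.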
+ for (WeakVH VH : AffectedPhis) {
+ if (auto Phi = dyn_cast_or_null<PHINode>(VH)) {
+ if (auto NewValue = SimplifyInstruction(Phi, Q)) {
+ Phi->replaceAllUsesWith(NewValue);
+ Phi->eraseFromParent();
+ Changed = true;
+ }
}
}
} while (Changed);
@@ -886,6 +893,7 @@ void StructurizeCFG::createFlow() {
BasicBlock *Exit = ParentRegion->getExit();
bool EntryDominatesExit = DT->dominates(ParentRegion->getEntry(), Exit);
+ AffectedPhis.clear();
DeletedPhis.clear();
AddedPhis.clear();
Conditions.clear();
@@ -1036,7 +1044,6 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
ParentRegion = R;
DT = &getAnalysis<DominatorTreeWrapperPass>().getDomTree();
- LI = &getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
orderNodes();
collectInfos();
@@ -1044,6 +1051,7 @@ bool StructurizeCFG::runOnRegion(Region *R, RGPassManager &RGM) {
insertConditions(false);
insertConditions(true);
setPhiValues();
+ simplifyAffectedPhis();
rebuildSSA();
// Cleanup
diff --git a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
index 9f0ab9103d42..5bb1d54d7d12 100644
--- a/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
+++ b/llvm/lib/Transforms/Scalar/TailRecursionElimination.cpp
@@ -64,7 +64,6 @@
#include "llvm/Analysis/PostDominators.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
@@ -126,16 +125,16 @@ struct AllocaDerivedValueTracker {
switch (I->getOpcode()) {
case Instruction::Call:
case Instruction::Invoke: {
- CallSite CS(I);
+ auto &CB = cast<CallBase>(*I);
// If the alloca-derived argument is passed byval it is not an escape
// point, or a use of an alloca. Calling with byval copies the contents
// of the alloca into argument registers or stack slots, which exist
// beyond the lifetime of the current frame.
- if (CS.isArgOperand(U) && CS.isByValArgument(CS.getArgumentNo(U)))
+ if (CB.isArgOperand(U) && CB.isByValArgument(CB.getArgOperandNo(U)))
continue;
bool IsNocapture =
- CS.isDataOperand(U) && CS.doesNotCapture(CS.getDataOperandNo(U));
- callUsesLocalStack(CS, IsNocapture);
+ CB.isDataOperand(U) && CB.doesNotCapture(CB.getDataOperandNo(U));
+ callUsesLocalStack(CB, IsNocapture);
if (IsNocapture) {
// If the alloca-derived argument is passed in as nocapture, then it
// can't propagate to the call's return. That would be capturing.
@@ -168,17 +167,17 @@ struct AllocaDerivedValueTracker {
}
}
- void callUsesLocalStack(CallSite CS, bool IsNocapture) {
+ void callUsesLocalStack(CallBase &CB, bool IsNocapture) {
// Add it to the list of alloca users.
- AllocaUsers.insert(CS.getInstruction());
+ AllocaUsers.insert(&CB);
// If it's nocapture then it can't capture this alloca.
if (IsNocapture)
return;
// If it can write to memory, it can leak the alloca value.
- if (!CS.onlyReadsMemory())
- EscapePoints.insert(CS.getInstruction());
+ if (!CB.onlyReadsMemory())
+ EscapePoints.insert(&CB);
}
SmallPtrSet<Instruction *, 32> AllocaUsers;
@@ -342,7 +341,7 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
const DataLayout &DL = L->getModule()->getDataLayout();
if (isModSet(AA->getModRefInfo(CI, MemoryLocation::get(L))) ||
!isSafeToLoadUnconditionally(L->getPointerOperand(), L->getType(),
- MaybeAlign(L->getAlignment()), DL, L))
+ L->getAlign(), DL, L))
return false;
}
}
@@ -355,89 +354,23 @@ static bool canMoveAboveCall(Instruction *I, CallInst *CI, AliasAnalysis *AA) {
return !is_contained(I->operands(), CI);
}
-/// Return true if the specified value is the same when the return would exit
-/// as it was when the initial iteration of the recursive function was executed.
-///
-/// We currently handle static constants and arguments that are not modified as
-/// part of the recursion.
-static bool isDynamicConstant(Value *V, CallInst *CI, ReturnInst *RI) {
- if (isa<Constant>(V)) return true; // Static constants are always dyn consts
-
- // Check to see if this is an immutable argument, if so, the value
- // will be available to initialize the accumulator.
- if (Argument *Arg = dyn_cast<Argument>(V)) {
- // Figure out which argument number this is...
- unsigned ArgNo = 0;
- Function *F = CI->getParent()->getParent();
- for (Function::arg_iterator AI = F->arg_begin(); &*AI != Arg; ++AI)
- ++ArgNo;
-
- // If we are passing this argument into call as the corresponding
- // argument operand, then the argument is dynamically constant.
- // Otherwise, we cannot transform this function safely.
- if (CI->getArgOperand(ArgNo) == Arg)
- return true;
- }
-
- // Switch cases are always constant integers. If the value is being switched
- // on and the return is only reachable from one of its cases, it's
- // effectively constant.
- if (BasicBlock *UniquePred = RI->getParent()->getUniquePredecessor())
- if (SwitchInst *SI = dyn_cast<SwitchInst>(UniquePred->getTerminator()))
- if (SI->getCondition() == V)
- return SI->getDefaultDest() != RI->getParent();
-
- // Not a constant or immutable argument, we can't safely transform.
- return false;
-}
-
-/// Check to see if the function containing the specified tail call consistently
-/// returns the same runtime-constant value at all exit points except for
-/// IgnoreRI. If so, return the returned value.
-static Value *getCommonReturnValue(ReturnInst *IgnoreRI, CallInst *CI) {
- Function *F = CI->getParent()->getParent();
- Value *ReturnedValue = nullptr;
-
- for (BasicBlock &BBI : *F) {
- ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator());
- if (RI == nullptr || RI == IgnoreRI) continue;
-
- // We can only perform this transformation if the value returned is
- // evaluatable at the start of the initial invocation of the function,
- // instead of at the end of the evaluation.
- //
- Value *RetOp = RI->getOperand(0);
- if (!isDynamicConstant(RetOp, CI, RI))
- return nullptr;
-
- if (ReturnedValue && RetOp != ReturnedValue)
- return nullptr; // Cannot transform if differing values are returned.
- ReturnedValue = RetOp;
- }
- return ReturnedValue;
-}
+static bool canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
+ if (!I->isAssociative() || !I->isCommutative())
+ return false;
-/// If the specified instruction can be transformed using accumulator recursion
-/// elimination, return the constant which is the start of the accumulator
-/// value. Otherwise return null.
-static Value *canTransformAccumulatorRecursion(Instruction *I, CallInst *CI) {
- if (!I->isAssociative() || !I->isCommutative()) return nullptr;
assert(I->getNumOperands() == 2 &&
"Associative/commutative operations should have 2 args!");
// Exactly one operand should be the result of the call instruction.
if ((I->getOperand(0) == CI && I->getOperand(1) == CI) ||
(I->getOperand(0) != CI && I->getOperand(1) != CI))
- return nullptr;
+ return false;
// The only user of this instruction we allow is a single return instruction.
if (!I->hasOneUse() || !isa<ReturnInst>(I->user_back()))
- return nullptr;
+ return false;
- // Ok, now we have to check all of the other return instructions in this
- // function. If they return non-constants or differing values, then we cannot
- // transform the function safely.
- return getCommonReturnValue(cast<ReturnInst>(I->user_back()), CI);
+ return true;
}
static Instruction *firstNonDbg(BasicBlock::iterator I) {
@@ -446,11 +379,73 @@ static Instruction *firstNonDbg(BasicBlock::iterator I) {
return &*I;
}
-static CallInst *findTRECandidate(Instruction *TI,
- bool CannotTailCallElimCallsMarkedTail,
- const TargetTransformInfo *TTI) {
+namespace {
+class TailRecursionEliminator {
+ Function &F;
+ const TargetTransformInfo *TTI;
+ AliasAnalysis *AA;
+ OptimizationRemarkEmitter *ORE;
+ DomTreeUpdater &DTU;
+
+ // The below is shared state we want to have available when eliminating any
+ // calls in the function. These values should be populated by
+ // createTailRecurseLoopHeader the first time we find a call we can eliminate.
+ BasicBlock *HeaderBB = nullptr;
+ SmallVector<PHINode *, 8> ArgumentPHIs;
+ bool RemovableCallsMustBeMarkedTail = false;
+
+ // PHI node to store our return value.
+ PHINode *RetPN = nullptr;
+
+ // i1 PHI node to track if we have a valid return value stored in RetPN.
+ PHINode *RetKnownPN = nullptr;
+
+ // Vector of select instructions we inserted. These selects use RetKnownPN
+ // to either propagate RetPN or select a new return value.
+ SmallVector<SelectInst *, 8> RetSelects;
+
+ // The below is shared state needed when performing accumulator recursion.
+ // These values should be populated by insertAccumulator the first time we
+ // find an elimination that requires an accumulator.
+
+ // PHI node to store our current accumulated value.
+ PHINode *AccPN = nullptr;
+
+ // The instruction doing the accumulating.
+ Instruction *AccumulatorRecursionInstr = nullptr;
+
+ TailRecursionEliminator(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU)
+ : F(F), TTI(TTI), AA(AA), ORE(ORE), DTU(DTU) {}
+
+ CallInst *findTRECandidate(Instruction *TI,
+ bool CannotTailCallElimCallsMarkedTail);
+
+ void createTailRecurseLoopHeader(CallInst *CI);
+
+ void insertAccumulator(Instruction *AccRecInstr);
+
+ bool eliminateCall(CallInst *CI);
+
+ bool foldReturnAndProcessPred(ReturnInst *Ret,
+ bool CannotTailCallElimCallsMarkedTail);
+
+ bool processReturningBlock(ReturnInst *Ret,
+ bool CannotTailCallElimCallsMarkedTail);
+
+ void cleanupAndFinalize();
+
+public:
+ static bool eliminate(Function &F, const TargetTransformInfo *TTI,
+ AliasAnalysis *AA, OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU);
+};
+} // namespace
+
+CallInst *TailRecursionEliminator::findTRECandidate(
+ Instruction *TI, bool CannotTailCallElimCallsMarkedTail) {
BasicBlock *BB = TI->getParent();
- Function *F = BB->getParent();
if (&BB->front() == TI) // Make sure there is something before the terminator.
return nullptr;
@@ -461,7 +456,7 @@ static CallInst *findTRECandidate(Instruction *TI,
BasicBlock::iterator BBI(TI);
while (true) {
CI = dyn_cast<CallInst>(BBI);
- if (CI && CI->getCalledFunction() == F)
+ if (CI && CI->getCalledFunction() == &F)
break;
if (BBI == BB->begin())
@@ -478,16 +473,14 @@ static CallInst *findTRECandidate(Instruction *TI,
// double fabs(double f) { return __builtin_fabs(f); } // a 'fabs' call
// and disable this xform in this case, because the code generator will
// lower the call to fabs into inline code.
- if (BB == &F->getEntryBlock() &&
+ if (BB == &F.getEntryBlock() &&
firstNonDbg(BB->front().getIterator()) == CI &&
firstNonDbg(std::next(BB->begin())) == TI && CI->getCalledFunction() &&
!TTI->isLoweredToCall(CI->getCalledFunction())) {
// A single-block function with just a call and a return. Check that
// the arguments match.
- CallSite::arg_iterator I = CallSite(CI).arg_begin(),
- E = CallSite(CI).arg_end();
- Function::arg_iterator FI = F->arg_begin(),
- FE = F->arg_end();
+ auto I = CI->arg_begin(), E = CI->arg_end();
+ Function::arg_iterator FI = F.arg_begin(), FE = F.arg_end();
for (; I != E && FI != FE; ++I, ++FI)
if (*I != &*FI) break;
if (I == E && FI == FE)
@@ -497,27 +490,106 @@ static CallInst *findTRECandidate(Instruction *TI,
return CI;
}
-static bool eliminateRecursiveTailCall(
- CallInst *CI, ReturnInst *Ret, BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
- // If we are introducing accumulator recursion to eliminate operations after
- // the call instruction that are both associative and commutative, the initial
- // value for the accumulator is placed in this variable. If this value is set
- // then we actually perform accumulator recursion elimination instead of
- // simple tail recursion elimination. If the operation is an LLVM instruction
- // (eg: "add") then it is recorded in AccumulatorRecursionInstr. If not, then
- // we are handling the case when the return instruction returns a constant C
- // which is different to the constant returned by other return instructions
- // (which is recorded in AccumulatorRecursionEliminationInitVal). This is a
- // special case of accumulator recursion, the operation being "return C".
- Value *AccumulatorRecursionEliminationInitVal = nullptr;
- Instruction *AccumulatorRecursionInstr = nullptr;
+void TailRecursionEliminator::createTailRecurseLoopHeader(CallInst *CI) {
+ HeaderBB = &F.getEntryBlock();
+ BasicBlock *NewEntry = BasicBlock::Create(F.getContext(), "", &F, HeaderBB);
+ NewEntry->takeName(HeaderBB);
+ HeaderBB->setName("tailrecurse");
+ BranchInst *BI = BranchInst::Create(HeaderBB, NewEntry);
+ BI->setDebugLoc(CI->getDebugLoc());
+
+ // If this function has self recursive calls in the tail position where some
+ // are marked tail and some are not, only transform one flavor or another.
+ // We have to choose whether we move allocas in the entry block to the new
+ // entry block or not, so we can't make a good choice for both. We make this
+ // decision here based on whether the first call we found to remove is
+ // marked tail.
+ // NOTE: We could do slightly better here in the case that the function has
+ // no entry block allocas.
+ RemovableCallsMustBeMarkedTail = CI->isTailCall();
+
+ // If this tail call is marked 'tail' and if there are any allocas in the
+ // entry block, move them up to the new entry block.
+ if (RemovableCallsMustBeMarkedTail)
+ // Move all fixed sized allocas from HeaderBB to NewEntry.
+ for (BasicBlock::iterator OEBI = HeaderBB->begin(), E = HeaderBB->end(),
+ NEBI = NewEntry->begin();
+ OEBI != E;)
+ if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
+ if (isa<ConstantInt>(AI->getArraySize()))
+ AI->moveBefore(&*NEBI);
+
+ // Now that we have created a new block, which jumps to the entry
+ // block, insert a PHI node for each argument of the function.
+ // For now, we initialize each PHI to only have the real arguments
+ // which are passed in.
+ Instruction *InsertPos = &HeaderBB->front();
+ for (Function::arg_iterator I = F.arg_begin(), E = F.arg_end(); I != E; ++I) {
+ PHINode *PN =
+ PHINode::Create(I->getType(), 2, I->getName() + ".tr", InsertPos);
+ I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
+ PN->addIncoming(&*I, NewEntry);
+ ArgumentPHIs.push_back(PN);
+ }
+
+ // If the function doesn't return void, create the RetPN and RetKnownPN PHI
+ // nodes to track our return value. We initialize RetPN with undef and
+ // RetKnownPN with false since we can't know our return value at function
+ // entry.
+ Type *RetType = F.getReturnType();
+ if (!RetType->isVoidTy()) {
+ Type *BoolType = Type::getInt1Ty(F.getContext());
+ RetPN = PHINode::Create(RetType, 2, "ret.tr", InsertPos);
+ RetKnownPN = PHINode::Create(BoolType, 2, "ret.known.tr", InsertPos);
+
+ RetPN->addIncoming(UndefValue::get(RetType), NewEntry);
+ RetKnownPN->addIncoming(ConstantInt::getFalse(BoolType), NewEntry);
+ }
+
+ // The entry block was changed from HeaderBB to NewEntry.
+ // The forward DominatorTree needs to be recalculated when the EntryBB is
+ // changed. In this corner-case we recalculate the entire tree.
+ DTU.recalculate(*NewEntry->getParent());
+}
+
+void TailRecursionEliminator::insertAccumulator(Instruction *AccRecInstr) {
+ assert(!AccPN && "Trying to insert multiple accumulators");
+
+ AccumulatorRecursionInstr = AccRecInstr;
+
+ // Start by inserting a new PHI node for the accumulator.
+ pred_iterator PB = pred_begin(HeaderBB), PE = pred_end(HeaderBB);
+ AccPN = PHINode::Create(F.getReturnType(), std::distance(PB, PE) + 1,
+ "accumulator.tr", &HeaderBB->front());
+
+ // Loop over all of the predecessors of the tail recursion block. For the
+ // real entry into the function we seed the PHI with the identity constant for
+ // the accumulation operation. For any other existing branches to this block
+ // (due to other tail recursions eliminated) the accumulator is not modified.
+ // Because we haven't added the branch in the current block to HeaderBB yet,
+ // it will not show up as a predecessor.
+ for (pred_iterator PI = PB; PI != PE; ++PI) {
+ BasicBlock *P = *PI;
+ if (P == &F.getEntryBlock()) {
+ Constant *Identity = ConstantExpr::getBinOpIdentity(
+ AccRecInstr->getOpcode(), AccRecInstr->getType());
+ AccPN->addIncoming(Identity, P);
+ } else {
+ AccPN->addIncoming(AccPN, P);
+ }
+ }
+
+ ++NumAccumAdded;
+}
+
+bool TailRecursionEliminator::eliminateCall(CallInst *CI) {
+ ReturnInst *Ret = cast<ReturnInst>(CI->getParent()->getTerminator());
// Ok, we found a potential tail call. We can currently only transform the
// tail call if all of the instructions between the call and the return are
// movable to above the call itself, leaving the call next to the return.
// Check that this is the case now.
+ Instruction *AccRecInstr = nullptr;
BasicBlock::iterator BBI(CI);
for (++BBI; &*BBI != Ret; ++BBI) {
if (canMoveAboveCall(&*BBI, CI, AA))
@@ -526,39 +598,16 @@ static bool eliminateRecursiveTailCall(
// If we can't move the instruction above the call, it might be because it
// is an associative and commutative operation that could be transformed
// using accumulator recursion elimination. Check to see if this is the
- // case, and if so, remember the initial accumulator value for later.
- if ((AccumulatorRecursionEliminationInitVal =
- canTransformAccumulatorRecursion(&*BBI, CI))) {
- // Yes, this is accumulator recursion. Remember which instruction
- // accumulates.
- AccumulatorRecursionInstr = &*BBI;
- } else {
- return false; // Otherwise, we cannot eliminate the tail recursion!
- }
- }
+ // case, and if so, remember which instruction accumulates for later.
+ if (AccPN || !canTransformAccumulatorRecursion(&*BBI, CI))
+ return false; // We cannot eliminate the tail recursion!
- // We can only transform call/return pairs that either ignore the return value
- // of the call and return void, ignore the value of the call and return a
- // constant, return the value returned by the tail call, or that are being
- // accumulator recursion variable eliminated.
- if (Ret->getNumOperands() == 1 && Ret->getReturnValue() != CI &&
- !isa<UndefValue>(Ret->getReturnValue()) &&
- AccumulatorRecursionEliminationInitVal == nullptr &&
- !getCommonReturnValue(nullptr, CI)) {
- // One case remains that we are able to handle: the current return
- // instruction returns a constant, and all other return instructions
- // return a different constant.
- if (!isDynamicConstant(Ret->getReturnValue(), CI, Ret))
- return false; // Current return instruction does not return a constant.
- // Check that all other return instructions return a common constant. If
- // so, record it in AccumulatorRecursionEliminationInitVal.
- AccumulatorRecursionEliminationInitVal = getCommonReturnValue(Ret, CI);
- if (!AccumulatorRecursionEliminationInitVal)
- return false;
+ // Yes, this is accumulator recursion. Remember which instruction
+ // accumulates.
+ AccRecInstr = &*BBI;
}
BasicBlock *BB = Ret->getParent();
- Function *F = BB->getParent();
using namespace ore;
ORE->emit([&]() {
@@ -568,51 +617,10 @@ static bool eliminateRecursiveTailCall(
// OK! We can transform this tail call. If this is the first one found,
// create the new entry block, allowing us to branch back to the old entry.
- if (!OldEntry) {
- OldEntry = &F->getEntryBlock();
- BasicBlock *NewEntry = BasicBlock::Create(F->getContext(), "", F, OldEntry);
- NewEntry->takeName(OldEntry);
- OldEntry->setName("tailrecurse");
- BranchInst *BI = BranchInst::Create(OldEntry, NewEntry);
- BI->setDebugLoc(CI->getDebugLoc());
-
- // If this tail call is marked 'tail' and if there are any allocas in the
- // entry block, move them up to the new entry block.
- TailCallsAreMarkedTail = CI->isTailCall();
- if (TailCallsAreMarkedTail)
- // Move all fixed sized allocas from OldEntry to NewEntry.
- for (BasicBlock::iterator OEBI = OldEntry->begin(), E = OldEntry->end(),
- NEBI = NewEntry->begin(); OEBI != E; )
- if (AllocaInst *AI = dyn_cast<AllocaInst>(OEBI++))
- if (isa<ConstantInt>(AI->getArraySize()))
- AI->moveBefore(&*NEBI);
-
- // Now that we have created a new block, which jumps to the entry
- // block, insert a PHI node for each argument of the function.
- // For now, we initialize each PHI to only have the real arguments
- // which are passed in.
- Instruction *InsertPos = &OldEntry->front();
- for (Function::arg_iterator I = F->arg_begin(), E = F->arg_end();
- I != E; ++I) {
- PHINode *PN = PHINode::Create(I->getType(), 2,
- I->getName() + ".tr", InsertPos);
- I->replaceAllUsesWith(PN); // Everyone use the PHI node now!
- PN->addIncoming(&*I, NewEntry);
- ArgumentPHIs.push_back(PN);
- }
- // The entry block was changed from OldEntry to NewEntry.
- // The forward DominatorTree needs to be recalculated when the EntryBB is
- // changed. In this corner-case we recalculate the entire tree.
- DTU.recalculate(*NewEntry->getParent());
- }
+ if (!HeaderBB)
+ createTailRecurseLoopHeader(CI);
- // If this function has self recursive calls in the tail position where some
- // are marked tail and some are not, only transform one flavor or another. We
- // have to choose whether we move allocas in the entry block to the new entry
- // block or not, so we can't make a good choice for both. NOTE: We could do
- // slightly better here in the case that the function has no entry block
- // allocas.
- if (TailCallsAreMarkedTail && !CI->isTailCall())
+ if (RemovableCallsMustBeMarkedTail && !CI->isTailCall())
return false;
// Ok, now that we know we have a pseudo-entry block WITH all of the
@@ -621,74 +629,53 @@ static bool eliminateRecursiveTailCall(
for (unsigned i = 0, e = CI->getNumArgOperands(); i != e; ++i)
ArgumentPHIs[i]->addIncoming(CI->getArgOperand(i), BB);
- // If we are introducing an accumulator variable to eliminate the recursion,
- // do so now. Note that we _know_ that no subsequent tail recursion
- // eliminations will happen on this function because of the way the
- // accumulator recursion predicate is set up.
- //
- if (AccumulatorRecursionEliminationInitVal) {
- Instruction *AccRecInstr = AccumulatorRecursionInstr;
- // Start by inserting a new PHI node for the accumulator.
- pred_iterator PB = pred_begin(OldEntry), PE = pred_end(OldEntry);
- PHINode *AccPN = PHINode::Create(
- AccumulatorRecursionEliminationInitVal->getType(),
- std::distance(PB, PE) + 1, "accumulator.tr", &OldEntry->front());
-
- // Loop over all of the predecessors of the tail recursion block. For the
- // real entry into the function we seed the PHI with the initial value,
- // computed earlier. For any other existing branches to this block (due to
- // other tail recursions eliminated) the accumulator is not modified.
- // Because we haven't added the branch in the current block to OldEntry yet,
- // it will not show up as a predecessor.
- for (pred_iterator PI = PB; PI != PE; ++PI) {
- BasicBlock *P = *PI;
- if (P == &F->getEntryBlock())
- AccPN->addIncoming(AccumulatorRecursionEliminationInitVal, P);
- else
- AccPN->addIncoming(AccPN, P);
- }
+ if (AccRecInstr) {
+ insertAccumulator(AccRecInstr);
- if (AccRecInstr) {
- // Add an incoming argument for the current block, which is computed by
- // our associative and commutative accumulator instruction.
- AccPN->addIncoming(AccRecInstr, BB);
+ // Rewrite the accumulator recursion instruction so that it no longer uses
+ // the result of the call; instead, it uses the PHI node we just inserted.
+ AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ }
- // Next, rewrite the accumulator recursion instruction so that it does not
- // use the result of the call anymore, instead, use the PHI node we just
- // inserted.
- AccRecInstr->setOperand(AccRecInstr->getOperand(0) != CI, AccPN);
+ // Update our return value tracking
+ if (RetPN) {
+ if (Ret->getReturnValue() == CI || AccRecInstr) {
+ // Defer selecting a return value
+ RetPN->addIncoming(RetPN, BB);
+ RetKnownPN->addIncoming(RetKnownPN, BB);
} else {
- // Add an incoming argument for the current block, which is just the
- // constant returned by the current return instruction.
- AccPN->addIncoming(Ret->getReturnValue(), BB);
+ // We found a return value we want to use. Insert a select instruction to
+ // select it if we don't already know what our return value will be, and
+ // store the result in our return value PHI node.
+ SelectInst *SI = SelectInst::Create(
+ RetKnownPN, RetPN, Ret->getReturnValue(), "current.ret.tr", Ret);
+ RetSelects.push_back(SI);
+
+ RetPN->addIncoming(SI, BB);
+ RetKnownPN->addIncoming(ConstantInt::getTrue(RetKnownPN->getType()), BB);
}
- // Finally, rewrite any return instructions in the program to return the PHI
- // node instead of the "initval" that they do currently. This loop will
- // actually rewrite the return value we are destroying, but that's ok.
- for (BasicBlock &BBI : *F)
- if (ReturnInst *RI = dyn_cast<ReturnInst>(BBI.getTerminator()))
- RI->setOperand(0, AccPN);
- ++NumAccumAdded;
+ if (AccPN)
+ AccPN->addIncoming(AccRecInstr ? AccRecInstr : AccPN, BB);
}
// Now that all of the PHI nodes are in place, remove the call and
// ret instructions, replacing them with an unconditional branch.
- BranchInst *NewBI = BranchInst::Create(OldEntry, Ret);
+ BranchInst *NewBI = BranchInst::Create(HeaderBB, Ret);
NewBI->setDebugLoc(CI->getDebugLoc());
BB->getInstList().erase(Ret); // Remove return.
BB->getInstList().erase(CI); // Remove call.
- DTU.applyUpdates({{DominatorTree::Insert, BB, OldEntry}});
+ DTU.applyUpdates({{DominatorTree::Insert, BB, HeaderBB}});
++NumEliminated;
return true;
}
-static bool foldReturnAndProcessPred(
- BasicBlock *BB, ReturnInst *Ret, BasicBlock *&OldEntry,
- bool &TailCallsAreMarkedTail, SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
+bool TailRecursionEliminator::foldReturnAndProcessPred(
+ ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) {
+ BasicBlock *BB = Ret->getParent();
+
bool Change = false;
// Make sure this block is a trivial return block.
@@ -711,10 +698,11 @@ static bool foldReturnAndProcessPred(
while (!UncondBranchPreds.empty()) {
BranchInst *BI = UncondBranchPreds.pop_back_val();
BasicBlock *Pred = BI->getParent();
- if (CallInst *CI = findTRECandidate(BI, CannotTailCallElimCallsMarkedTail, TTI)){
+ if (CallInst *CI =
+ findTRECandidate(BI, CannotTailCallElimCallsMarkedTail)) {
LLVM_DEBUG(dbgs() << "FOLDING: " << *BB
<< "INTO UNCOND BRANCH PRED: " << *Pred);
- ReturnInst *RI = FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU);
+ FoldReturnIntoUncondBranch(Ret, BB, Pred, &DTU);
// Cleanup: if all predecessors of BB have been eliminated by
// FoldReturnIntoUncondBranch, delete it. It is important to empty it,
@@ -723,8 +711,7 @@ static bool foldReturnAndProcessPred(
if (!BB->hasAddressTaken() && pred_begin(BB) == pred_end(BB))
DTU.deleteBB(BB);
- eliminateRecursiveTailCall(CI, RI, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, AA, ORE, DTU);
+ eliminateCall(CI);
++NumRetDuped;
Change = true;
}
@@ -733,23 +720,92 @@ static bool foldReturnAndProcessPred(
return Change;
}
-static bool processReturningBlock(
- ReturnInst *Ret, BasicBlock *&OldEntry, bool &TailCallsAreMarkedTail,
- SmallVectorImpl<PHINode *> &ArgumentPHIs,
- bool CannotTailCallElimCallsMarkedTail, const TargetTransformInfo *TTI,
- AliasAnalysis *AA, OptimizationRemarkEmitter *ORE, DomTreeUpdater &DTU) {
- CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail, TTI);
+bool TailRecursionEliminator::processReturningBlock(
+ ReturnInst *Ret, bool CannotTailCallElimCallsMarkedTail) {
+ CallInst *CI = findTRECandidate(Ret, CannotTailCallElimCallsMarkedTail);
if (!CI)
return false;
- return eliminateRecursiveTailCall(CI, Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, AA, ORE, DTU);
+ return eliminateCall(CI);
+}
+
+void TailRecursionEliminator::cleanupAndFinalize() {
+ // If we eliminated any tail recursions, it's possible that we inserted some
+ // silly PHI nodes which just merge an initial value (the incoming operand)
+ // with themselves. Check to see if we did and clean up our mess if so. This
+ // occurs when a function passes an argument straight through to its tail
+ // call.
+ for (PHINode *PN : ArgumentPHIs) {
+ // If the PHI Node is a dynamic constant, replace it with the value it is.
+ if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
+ PN->replaceAllUsesWith(PNV);
+ PN->eraseFromParent();
+ }
+ }
+
+ if (RetPN) {
+ if (RetSelects.empty()) {
+ // If we didn't insert any select instructions, then we know we didn't
+ // store a return value and we can remove the PHI nodes we inserted.
+ RetPN->dropAllReferences();
+ RetPN->eraseFromParent();
+
+ RetKnownPN->dropAllReferences();
+ RetKnownPN->eraseFromParent();
+
+ if (AccPN) {
+ // We need to insert a copy of our accumulator instruction before any
+ // return in the function, and return its result instead.
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ for (BasicBlock &BB : F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI)
+ continue;
+
+ Instruction *AccRecInstrNew = AccRecInstr->clone();
+ AccRecInstrNew->setName("accumulator.ret.tr");
+ AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+ RI->getOperand(0));
+ AccRecInstrNew->insertBefore(RI);
+ RI->setOperand(0, AccRecInstrNew);
+ }
+ }
+ } else {
+ // We need to insert a select instruction before any return left in the
+ // function to select our stored return value if we have one.
+ for (BasicBlock &BB : F) {
+ ReturnInst *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI)
+ continue;
+
+ SelectInst *SI = SelectInst::Create(
+ RetKnownPN, RetPN, RI->getOperand(0), "current.ret.tr", RI);
+ RetSelects.push_back(SI);
+ RI->setOperand(0, SI);
+ }
+
+ if (AccPN) {
+ // We need to insert a copy of our accumulator instruction before any
+ // of the selects we inserted, and select its result instead.
+ Instruction *AccRecInstr = AccumulatorRecursionInstr;
+ for (SelectInst *SI : RetSelects) {
+ Instruction *AccRecInstrNew = AccRecInstr->clone();
+ AccRecInstrNew->setName("accumulator.ret.tr");
+ AccRecInstrNew->setOperand(AccRecInstr->getOperand(0) == AccPN,
+ SI->getFalseValue());
+ AccRecInstrNew->insertBefore(SI);
+ SI->setFalseValue(AccRecInstrNew);
+ }
+ }
+ }
+ }
}
-static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
- AliasAnalysis *AA,
- OptimizationRemarkEmitter *ORE,
- DomTreeUpdater &DTU) {
+bool TailRecursionEliminator::eliminate(Function &F,
+ const TargetTransformInfo *TTI,
+ AliasAnalysis *AA,
+ OptimizationRemarkEmitter *ORE,
+ DomTreeUpdater &DTU) {
if (F.getFnAttribute("disable-tail-calls").getValueAsString() == "true")
return false;
@@ -762,17 +818,15 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
// If this function is a varargs function, we won't be able to PHI the args
// right, so don't even try to convert it...
if (F.getFunctionType()->isVarArg())
- return false;
-
- BasicBlock *OldEntry = nullptr;
- bool TailCallsAreMarkedTail = false;
- SmallVector<PHINode*, 8> ArgumentPHIs;
+ return MadeChange;
// If false, we cannot perform TRE on tail calls marked with the 'tail'
// attribute, because doing so would cause the stack size to increase (real
// TRE would deallocate variable sized allocas, TRE doesn't).
bool CanTRETailMarkedCall = canTRE(F);
+ TailRecursionEliminator TRE(F, TTI, AA, ORE, DTU);
+
// Change any tail recursive calls to loops.
//
// FIXME: The code generator produces really bad code when an 'escaping
@@ -782,29 +836,14 @@ static bool eliminateTailRecursion(Function &F, const TargetTransformInfo *TTI,
for (Function::iterator BBI = F.begin(), E = F.end(); BBI != E; /*in loop*/) {
BasicBlock *BB = &*BBI++; // foldReturnAndProcessPred may delete BB.
if (ReturnInst *Ret = dyn_cast<ReturnInst>(BB->getTerminator())) {
- bool Change = processReturningBlock(Ret, OldEntry, TailCallsAreMarkedTail,
- ArgumentPHIs, !CanTRETailMarkedCall,
- TTI, AA, ORE, DTU);
+ bool Change = TRE.processReturningBlock(Ret, !CanTRETailMarkedCall);
if (!Change && BB->getFirstNonPHIOrDbg() == Ret)
- Change = foldReturnAndProcessPred(
- BB, Ret, OldEntry, TailCallsAreMarkedTail, ArgumentPHIs,
- !CanTRETailMarkedCall, TTI, AA, ORE, DTU);
+ Change = TRE.foldReturnAndProcessPred(Ret, !CanTRETailMarkedCall);
MadeChange |= Change;
}
}
- // If we eliminated any tail recursions, it's possible that we inserted some
- // silly PHI nodes which just merge an initial value (the incoming operand)
- // with themselves. Check to see if we did and clean up our mess if so. This
- // occurs when a function passes an argument straight through to its tail
- // call.
- for (PHINode *PN : ArgumentPHIs) {
- // If the PHI Node is a dynamic constant, replace it with the value it is.
- if (Value *PNV = SimplifyInstruction(PN, F.getParent()->getDataLayout())) {
- PN->replaceAllUsesWith(PNV);
- PN->eraseFromParent();
- }
- }
+ TRE.cleanupAndFinalize();
return MadeChange;
}
@@ -838,7 +877,7 @@ struct TailCallElim : public FunctionPass {
// UpdateStrategy to Lazy if we find it profitable later.
DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
- return eliminateTailRecursion(
+ return TailRecursionEliminator::eliminate(
F, &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F),
&getAnalysis<AAResultsWrapperPass>().getAAResults(),
&getAnalysis<OptimizationRemarkEmitterWrapperPass>().getORE(), DTU);
@@ -871,7 +910,7 @@ PreservedAnalyses TailCallElimPass::run(Function &F,
// UpdateStrategy based on some test results. It is feasible to switch the
// UpdateStrategy to Lazy if we find it profitable later.
DomTreeUpdater DTU(DT, PDT, DomTreeUpdater::UpdateStrategy::Eager);
- bool Changed = eliminateTailRecursion(F, &TTI, &AA, &ORE, DTU);
+ bool Changed = TailRecursionEliminator::eliminate(F, &TTI, &AA, &ORE, DTU);
if (!Changed)
return PreservedAnalyses::all();
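To make the accumulator rewrite concrete, the following source-level sketch (an illustration only, not code from this patch) shows the effect the pass aims for when an associative, commutative instruction consumes the result of the recursive call:

    // Before: the multiply uses the result of the recursive call, so the call
    // is not a tail call as written.
    unsigned fact(unsigned N) {
      if (N <= 1)
        return 1;
      return N * fact(N - 1); // associative, commutative accumulation
    }

    // After (conceptually): the recursion becomes a branch back to the header
    // and the multiply feeds an accumulator carried by a PHI node.
    unsigned factLoop(unsigned N) {
      unsigned Acc = 1;  // plays the role of the accumulator PHI
      while (N > 1) {
        Acc = N * Acc;   // the AccRecInstr, now reading the PHI
        N = N - 1;       // argument PHI updated with the call operand
      }
      return Acc;        // returns rewritten to produce the accumulated value
    }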
diff --git a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
index c8461fdc1608..7c81e6352dec 100644
--- a/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
+++ b/llvm/lib/Transforms/Scalar/WarnMissedTransforms.cpp
@@ -11,6 +11,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Scalar/WarnMissedTransforms.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/InitializePasses.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
diff --git a/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
new file mode 100644
index 000000000000..84a66e1e96d2
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/AMDGPUEmitPrintf.cpp
@@ -0,0 +1,246 @@
+//===- AMDGPUEmitPrintf.cpp -----------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility function to lower a printf call into a series of device
+// library calls on the AMDGPU target.
+//
+// WARNING: This file knows about certain library functions. It recognizes them
+// by name, and hardwires knowledge of their semantics.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/AMDGPUEmitPrintf.h"
+#include "llvm/ADT/SparseBitVector.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/IRBuilder.h"
+
+#include <iostream>
+
+using namespace llvm;
+
+#define DEBUG_TYPE "amdgpu-emit-printf"
+
+static bool isCString(const Value *Arg) {
+ auto Ty = Arg->getType();
+ auto PtrTy = dyn_cast<PointerType>(Ty);
+ if (!PtrTy)
+ return false;
+
+ auto IntTy = dyn_cast<IntegerType>(PtrTy->getElementType());
+ if (!IntTy)
+ return false;
+
+ return IntTy->getBitWidth() == 8;
+}
+
+static Value *fitArgInto64Bits(IRBuilder<> &Builder, Value *Arg) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Ty = Arg->getType();
+
+ if (auto IntTy = dyn_cast<IntegerType>(Ty)) {
+ switch (IntTy->getBitWidth()) {
+ case 32:
+ return Builder.CreateZExt(Arg, Int64Ty);
+ case 64:
+ return Arg;
+ }
+ }
+
+ if (Ty->getTypeID() == Type::DoubleTyID) {
+ return Builder.CreateBitCast(Arg, Int64Ty);
+ }
+
+ if (isa<PointerType>(Ty)) {
+ return Builder.CreatePtrToInt(Arg, Int64Ty);
+ }
+
+ llvm_unreachable("unexpected type");
+}
+
+static Value *callPrintfBegin(IRBuilder<> &Builder, Value *Version) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_begin", Int64Ty, Int64Ty);
+ return Builder.CreateCall(Fn, Version);
+}
+
+static Value *callAppendArgs(IRBuilder<> &Builder, Value *Desc, int NumArgs,
+ Value *Arg0, Value *Arg1, Value *Arg2, Value *Arg3,
+ Value *Arg4, Value *Arg5, Value *Arg6,
+ bool IsLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_args", Int64Ty,
+ Int64Ty, Int32Ty, Int64Ty, Int64Ty, Int64Ty,
+ Int64Ty, Int64Ty, Int64Ty, Int64Ty, Int32Ty);
+ auto IsLastValue = Builder.getInt32(IsLast);
+ auto NumArgsValue = Builder.getInt32(NumArgs);
+ return Builder.CreateCall(Fn, {Desc, NumArgsValue, Arg0, Arg1, Arg2, Arg3,
+ Arg4, Arg5, Arg6, IsLastValue});
+}
+
+static Value *appendArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Arg0 = fitArgInto64Bits(Builder, Arg);
+ auto Zero = Builder.getInt64(0);
+ return callAppendArgs(Builder, Desc, 1, Arg0, Zero, Zero, Zero, Zero, Zero,
+ Zero, IsLast);
+}
+
+// The device library does not provide strlen, so we build our own loop
+// here. While we are at it, we also include the terminating null in the length.
+static Value *getStrlenWithNull(IRBuilder<> &Builder, Value *Str) {
+ auto *Prev = Builder.GetInsertBlock();
+ Module *M = Prev->getModule();
+
+ auto CharZero = Builder.getInt8(0);
+ auto One = Builder.getInt64(1);
+ auto Zero = Builder.getInt64(0);
+ auto Int64Ty = Builder.getInt64Ty();
+
+ // The length is either zero for a null pointer, or the computed value for an
+ // actual string. We need a join block for a phi that represents the final
+ // value.
+ //
+ // Strictly speaking, the zero does not matter since
+ // __ockl_printf_append_string_n ignores the length if the pointer is null.
+ BasicBlock *Join = nullptr;
+ if (Prev->getTerminator()) {
+ Join = Prev->splitBasicBlock(Builder.GetInsertPoint(),
+ "strlen.join");
+ Prev->getTerminator()->eraseFromParent();
+ } else {
+ Join = BasicBlock::Create(M->getContext(), "strlen.join",
+ Prev->getParent());
+ }
+ BasicBlock *While =
+ BasicBlock::Create(M->getContext(), "strlen.while",
+ Prev->getParent(), Join);
+ BasicBlock *WhileDone = BasicBlock::Create(
+ M->getContext(), "strlen.while.done",
+ Prev->getParent(), Join);
+
+ // Emit an early return for when the pointer is null.
+ Builder.SetInsertPoint(Prev);
+ auto CmpNull =
+ Builder.CreateICmpEQ(Str, Constant::getNullValue(Str->getType()));
+ BranchInst::Create(Join, While, CmpNull, Prev);
+
+ // Entry to the while loop.
+ Builder.SetInsertPoint(While);
+
+ auto PtrPhi = Builder.CreatePHI(Str->getType(), 2);
+ PtrPhi->addIncoming(Str, Prev);
+ auto PtrNext = Builder.CreateGEP(PtrPhi, One);
+ PtrPhi->addIncoming(PtrNext, While);
+
+ // Condition for the while loop.
+ auto Data = Builder.CreateLoad(PtrPhi);
+ auto Cmp = Builder.CreateICmpEQ(Data, CharZero);
+ Builder.CreateCondBr(Cmp, WhileDone, While);
+
+ // Add one to the computed length.
+ Builder.SetInsertPoint(WhileDone, WhileDone->begin());
+ auto Begin = Builder.CreatePtrToInt(Str, Int64Ty);
+ auto End = Builder.CreatePtrToInt(PtrPhi, Int64Ty);
+ auto Len = Builder.CreateSub(End, Begin);
+ Len = Builder.CreateAdd(Len, One);
+
+ // Final join.
+ BranchInst::Create(Join, WhileDone);
+ Builder.SetInsertPoint(Join, Join->begin());
+ auto LenPhi = Builder.CreatePHI(Len->getType(), 2);
+ LenPhi->addIncoming(Len, WhileDone);
+ LenPhi->addIncoming(Zero, Prev);
+
+ return LenPhi;
+}
+
+static Value *callAppendStringN(IRBuilder<> &Builder, Value *Desc, Value *Str,
+ Value *Length, bool isLast) {
+ auto Int64Ty = Builder.getInt64Ty();
+ auto CharPtrTy = Builder.getInt8PtrTy();
+ auto Int32Ty = Builder.getInt32Ty();
+ auto M = Builder.GetInsertBlock()->getModule();
+ auto Fn = M->getOrInsertFunction("__ockl_printf_append_string_n", Int64Ty,
+ Int64Ty, CharPtrTy, Int64Ty, Int32Ty);
+ auto IsLastInt32 = Builder.getInt32(isLast);
+ return Builder.CreateCall(Fn, {Desc, Str, Length, IsLastInt32});
+}
+
+static Value *appendString(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool IsLast) {
+ auto Length = getStrlenWithNull(Builder, Arg);
+ return callAppendStringN(Builder, Desc, Arg, Length, IsLast);
+}
+
+static Value *processArg(IRBuilder<> &Builder, Value *Desc, Value *Arg,
+ bool SpecIsCString, bool IsLast) {
+ if (SpecIsCString && isCString(Arg)) {
+ return appendString(Builder, Desc, Arg, IsLast);
+ }
+ // If the format specifies a string but the argument is not, the frontend will
+ // have printed a warning. We just rely on undefined behaviour and send the
+ // argument anyway.
+ return appendArg(Builder, Desc, Arg, IsLast);
+}
+
+// Scan the format string to locate all specifiers, and mark the ones that
+// specify a string, i.e., the "%s" specifier with optional '*' characters.
+static void locateCStrings(SparseBitVector<8> &BV, Value *Fmt) {
+ StringRef Str;
+ if (!getConstantStringInfo(Fmt, Str) || Str.empty())
+ return;
+
+ static const char ConvSpecifiers[] = "diouxXfFeEgGaAcspn";
+ size_t SpecPos = 0;
+ // Skip the first argument, the format string.
+ unsigned ArgIdx = 1;
+
+ while ((SpecPos = Str.find_first_of('%', SpecPos)) != StringRef::npos) {
+ if (Str[SpecPos + 1] == '%') {
+ SpecPos += 2;
+ continue;
+ }
+ auto SpecEnd = Str.find_first_of(ConvSpecifiers, SpecPos);
+ if (SpecEnd == StringRef::npos)
+ return;
+ auto Spec = Str.slice(SpecPos, SpecEnd + 1);
+ ArgIdx += Spec.count('*');
+ if (Str[SpecEnd] == 's') {
+ BV.set(ArgIdx);
+ }
+ SpecPos = SpecEnd + 1;
+ ++ArgIdx;
+ }
+}
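A short worked example of the indexing above may help; the call is hypothetical and used only for illustration:

    // For a call lowered as printf("%d %s %*s", I, Name, W, Pad):
    //   Args[0] = format, Args[1] = I, Args[2] = Name, Args[3] = W, Args[4] = Pad
    // locateCStrings sets bits 2 and 4: the "%s" argument is vararg index 2,
    // and the '*' in "%*s" consumes one extra slot (the width W), so its string
    // sits at index 4. "%%" escapes and the format string itself are never
    // marked.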
+
+Value *llvm::emitAMDGPUPrintfCall(IRBuilder<> &Builder,
+ ArrayRef<Value *> Args) {
+ auto NumOps = Args.size();
+ assert(NumOps >= 1);
+
+ auto Fmt = Args[0];
+ SparseBitVector<8> SpecIsCString;
+ locateCStrings(SpecIsCString, Fmt);
+
+ auto Desc = callPrintfBegin(Builder, Builder.getIntN(64, 0));
+ Desc = appendString(Builder, Desc, Fmt, NumOps == 1);
+
+ // FIXME: This invokes hostcall once for each argument. We can pack up to
+ // seven scalar printf arguments in a single hostcall. See the signature of
+ // callAppendArgs().
+ for (unsigned int i = 1; i != NumOps; ++i) {
+ bool IsLast = i == NumOps - 1;
+ bool IsCString = SpecIsCString.test(i);
+ Desc = processArg(Builder, Desc, Args[i], IsCString, IsLast);
+ }
+
+ return Builder.CreateTrunc(Desc, Builder.getInt32Ty());
+}
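Putting the helpers together, emitAMDGPUPrintfCall expands a printf into a chain of device-library calls threaded through a 64-bit descriptor. The C-like sketch below is illustrative only: the real output is IR built with IRBuilder, strlenWithNull stands for the inline loop produced by getStrlenWithNull, and, as the FIXME above notes, each scalar argument currently gets its own append call.

    // Conceptual lowering of printf("%d %s\n", X, Str):
    uint64_t Desc = __ockl_printf_begin(/*Version=*/0);
    // The format string is appended first, with a length that includes the null.
    Desc = __ockl_printf_append_string_n(Desc, Fmt, strlenWithNull(Fmt),
                                         /*IsLast=*/0);
    // %d: the scalar is widened to 64 bits; unused slots are zero.
    Desc = __ockl_printf_append_args(Desc, /*NumArgs=*/1, (uint64_t)X,
                                     0, 0, 0, 0, 0, 0, /*IsLast=*/0);
    // %s: appended as a string because locateCStrings marked vararg index 2.
    Desc = __ockl_printf_append_string_n(Desc, Str, strlenWithNull(Str),
                                         /*IsLast=*/1);
    // The caller receives the descriptor truncated to 32 bits.
    int Ret = (int)Desc;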
diff --git a/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
new file mode 100644
index 000000000000..7ff73fcdada7
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/AssumeBundleBuilder.cpp
@@ -0,0 +1,618 @@
+//===- AssumeBundleBuilder.cpp - tools to preserve informations -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#define DEBUG_TYPE "assume-builder"
+
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
+#include "llvm/ADT/DepthFirstIterator.h"
+#include "llvm/ADT/MapVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
+#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Support/DebugCounter.h"
+#include "llvm/Transforms/Utils/Local.h"
+
+using namespace llvm;
+
+cl::opt<bool> ShouldPreserveAllAttributes(
+ "assume-preserve-all", cl::init(false), cl::Hidden,
+ cl::desc("enable preservation of all attrbitues. even those that are "
+ "unlikely to be usefull"));
+
+cl::opt<bool> EnableKnowledgeRetention(
+ "enable-knowledge-retention", cl::init(false), cl::Hidden,
+ cl::desc(
+ "enable preservation of attributes throughout code transformation"));
+
+STATISTIC(NumAssumeBuilt, "Number of assume built by the assume builder");
+STATISTIC(NumBundlesInAssumes, "Total number of Bundles in the assume built");
+STATISTIC(NumAssumesMerged,
+ "Number of assume merged by the assume simplify pass");
+STATISTIC(NumAssumesRemoved,
+ "Number of assume removed by the assume simplify pass");
+
+DEBUG_COUNTER(BuildAssumeCounter, "assume-builder-counter",
+ "Controls which assumes gets created");
+
+namespace {
+
+bool isUsefullToPreserve(Attribute::AttrKind Kind) {
+ switch (Kind) {
+ case Attribute::NonNull:
+ case Attribute::Alignment:
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull:
+ case Attribute::Cold:
+ return true;
+ default:
+ return false;
+ }
+}
+
+/// This function will try to transform the given knowledge into a more
+/// canonical one. The canonical knowledge may be the given one.
+RetainedKnowledge canonicalizedKnowledge(RetainedKnowledge RK, Module *M) {
+ switch (RK.AttrKind) {
+ default:
+ return RK;
+ case Attribute::NonNull:
+ RK.WasOn = GetUnderlyingObject(RK.WasOn, M->getDataLayout());
+ return RK;
+ case Attribute::Alignment: {
+ Value *V = RK.WasOn->stripInBoundsOffsets([&](const Value *Strip) {
+ if (auto *GEP = dyn_cast<GEPOperator>(Strip))
+ RK.ArgValue =
+ MinAlign(RK.ArgValue,
+ GEP->getMaxPreservedAlignment(M->getDataLayout()).value());
+ });
+ RK.WasOn = V;
+ return RK;
+ }
+ case Attribute::Dereferenceable:
+ case Attribute::DereferenceableOrNull: {
+ int64_t Offset = 0;
+ Value *V = GetPointerBaseWithConstantOffset(
+ RK.WasOn, Offset, M->getDataLayout(), /*AllowNonInBounds*/ false);
+ if (Offset < 0)
+ return RK;
+ RK.ArgValue = RK.ArgValue + Offset;
+ RK.WasOn = V;
+ }
+ }
+ return RK;
+}
+
+/// This class contains all the knowledge that has been gathered while building
+/// an llvm.assume, and the functions to manipulate it.
+struct AssumeBuilderState {
+ Module *M;
+
+ using MapKey = std::pair<Value *, Attribute::AttrKind>;
+ SmallMapVector<MapKey, unsigned, 8> AssumedKnowledgeMap;
+ Instruction *InstBeingRemoved = nullptr;
+ AssumptionCache* AC = nullptr;
+ DominatorTree* DT = nullptr;
+
+ AssumeBuilderState(Module *M, Instruction *I = nullptr,
+ AssumptionCache *AC = nullptr, DominatorTree *DT = nullptr)
+ : M(M), InstBeingRemoved(I), AC(AC), DT(DT) {}
+
+ bool tryToPreserveWithoutAddingAssume(RetainedKnowledge RK) {
+ if (!InstBeingRemoved || !RK.WasOn)
+ return false;
+ bool HasBeenPreserved = false;
+ Use* ToUpdate = nullptr;
+ getKnowledgeForValue(
+ RK.WasOn, {RK.AttrKind}, AC,
+ [&](RetainedKnowledge RKOther, Instruction *Assume,
+ const CallInst::BundleOpInfo *Bundle) {
+ if (!isValidAssumeForContext(Assume, InstBeingRemoved, DT))
+ return false;
+ if (RKOther.ArgValue >= RK.ArgValue) {
+ HasBeenPreserved = true;
+ return true;
+ } else if (isValidAssumeForContext(InstBeingRemoved, Assume,
+ DT)) {
+ HasBeenPreserved = true;
+ IntrinsicInst *Intr = cast<IntrinsicInst>(Assume);
+ ToUpdate = &Intr->op_begin()[Bundle->Begin + ABA_Argument];
+ return true;
+ }
+ return false;
+ });
+ if (ToUpdate)
+ ToUpdate->set(
+ ConstantInt::get(Type::getInt64Ty(M->getContext()), RK.ArgValue));
+ return HasBeenPreserved;
+ }
+
+ bool isKnowledgeWorthPreserving(RetainedKnowledge RK) {
+ if (!RK)
+ return false;
+ if (!RK.WasOn)
+ return true;
+ if (RK.WasOn->getType()->isPointerTy()) {
+ Value *UnderlyingPtr = GetUnderlyingObject(RK.WasOn, M->getDataLayout());
+ if (isa<AllocaInst>(UnderlyingPtr) || isa<GlobalValue>(UnderlyingPtr))
+ return false;
+ }
+ if (auto *Arg = dyn_cast<Argument>(RK.WasOn)) {
+ if (Arg->hasAttribute(RK.AttrKind) &&
+ (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
+ Arg->getAttribute(RK.AttrKind).getValueAsInt() >= RK.ArgValue))
+ return false;
+ return true;
+ }
+ if (auto *Inst = dyn_cast<Instruction>(RK.WasOn))
+ if (wouldInstructionBeTriviallyDead(Inst)) {
+ if (RK.WasOn->use_empty())
+ return false;
+ Use *SingleUse = RK.WasOn->getSingleUndroppableUse();
+ if (SingleUse && SingleUse->getUser() == InstBeingRemoved)
+ return false;
+ }
+ return true;
+ }
+
+ void addKnowledge(RetainedKnowledge RK) {
+ RK = canonicalizedKnowledge(RK, M);
+
+ if (!isKnowledgeWorthPreserving(RK))
+ return;
+
+ if (tryToPreserveWithoutAddingAssume(RK))
+ return;
+ MapKey Key{RK.WasOn, RK.AttrKind};
+ auto Lookup = AssumedKnowledgeMap.find(Key);
+ if (Lookup == AssumedKnowledgeMap.end()) {
+ AssumedKnowledgeMap[Key] = RK.ArgValue;
+ return;
+ }
+ assert(((Lookup->second == 0 && RK.ArgValue == 0) ||
+ (Lookup->second != 0 && RK.ArgValue != 0)) &&
+ "inconsistent argument value");
+
+ /// This is only desirable because, for all attributes taking an argument,
+ /// higher is better.
+ Lookup->second = std::max(Lookup->second, RK.ArgValue);
+ }
+
+ void addAttribute(Attribute Attr, Value *WasOn) {
+ if (Attr.isTypeAttribute() || Attr.isStringAttribute() ||
+ (!ShouldPreserveAllAttributes &&
+ !isUsefullToPreserve(Attr.getKindAsEnum())))
+ return;
+ unsigned AttrArg = 0;
+ if (Attr.isIntAttribute())
+ AttrArg = Attr.getValueAsInt();
+ addKnowledge({Attr.getKindAsEnum(), AttrArg, WasOn});
+ }
+
+ void addCall(const CallBase *Call) {
+ auto addAttrList = [&](AttributeList AttrList) {
+ for (unsigned Idx = AttributeList::FirstArgIndex;
+ Idx < AttrList.getNumAttrSets(); Idx++)
+ for (Attribute Attr : AttrList.getAttributes(Idx))
+ addAttribute(Attr, Call->getArgOperand(Idx - 1));
+ for (Attribute Attr : AttrList.getFnAttributes())
+ addAttribute(Attr, nullptr);
+ };
+ addAttrList(Call->getAttributes());
+ if (Function *Fn = Call->getCalledFunction())
+ addAttrList(Fn->getAttributes());
+ }
+
+ IntrinsicInst *build() {
+ if (AssumedKnowledgeMap.empty())
+ return nullptr;
+ if (!DebugCounter::shouldExecute(BuildAssumeCounter))
+ return nullptr;
+ Function *FnAssume = Intrinsic::getDeclaration(M, Intrinsic::assume);
+ LLVMContext &C = M->getContext();
+ SmallVector<OperandBundleDef, 8> OpBundle;
+ for (auto &MapElem : AssumedKnowledgeMap) {
+ SmallVector<Value *, 2> Args;
+ if (MapElem.first.first)
+ Args.push_back(MapElem.first.first);
+
+ /// This is only valid because, for all attributes that currently exist, a
+ /// value of 0 is useless and should not be preserved.
+ if (MapElem.second)
+ Args.push_back(ConstantInt::get(Type::getInt64Ty(M->getContext()),
+ MapElem.second));
+ OpBundle.push_back(OperandBundleDefT<Value *>(
+ std::string(Attribute::getNameFromAttrKind(MapElem.first.second)),
+ Args));
+ NumBundlesInAssumes++;
+ }
+ NumAssumeBuilt++;
+ return cast<IntrinsicInst>(CallInst::Create(
+ FnAssume, ArrayRef<Value *>({ConstantInt::getTrue(C)}), OpBundle));
+ }
+
+ void addAccessedPtr(Instruction *MemInst, Value *Pointer, Type *AccType,
+ MaybeAlign MA) {
+ unsigned DerefSize = MemInst->getModule()
+ ->getDataLayout()
+ .getTypeStoreSize(AccType)
+ .getKnownMinSize();
+ if (DerefSize != 0) {
+ addKnowledge({Attribute::Dereferenceable, DerefSize, Pointer});
+ if (!NullPointerIsDefined(MemInst->getFunction(),
+ Pointer->getType()->getPointerAddressSpace()))
+ addKnowledge({Attribute::NonNull, 0u, Pointer});
+ }
+ if (MA.valueOrOne() > 1)
+ addKnowledge(
+ {Attribute::Alignment, unsigned(MA.valueOrOne().value()), Pointer});
+ }
+
+ void addInstruction(Instruction *I) {
+ if (auto *Call = dyn_cast<CallBase>(I))
+ return addCall(Call);
+ if (auto *Load = dyn_cast<LoadInst>(I))
+ return addAccessedPtr(I, Load->getPointerOperand(), Load->getType(),
+ Load->getAlign());
+ if (auto *Store = dyn_cast<StoreInst>(I))
+ return addAccessedPtr(I, Store->getPointerOperand(),
+ Store->getValueOperand()->getType(),
+ Store->getAlign());
+ // TODO: Add support for the other Instructions.
+ // TODO: Maybe we should look around and merge with other llvm.assume.
+ }
+};
+
+} // namespace
+
+IntrinsicInst *llvm::buildAssumeFromInst(Instruction *I) {
+ if (!EnableKnowledgeRetention)
+ return nullptr;
+ AssumeBuilderState Builder(I->getModule());
+ Builder.addInstruction(I);
+ return Builder.build();
+}
+
+void llvm::salvageKnowledge(Instruction *I, AssumptionCache *AC,
+ DominatorTree *DT) {
+ if (!EnableKnowledgeRetention || I->isTerminator())
+ return;
+ AssumeBuilderState Builder(I->getModule(), I, AC, DT);
+ Builder.addInstruction(I);
+ if (IntrinsicInst *Intr = Builder.build()) {
+ Intr->insertBefore(I);
+ if (AC)
+ AC->registerAssumption(Intr);
+ }
+}
+
+namespace {
+
+struct AssumeSimplify {
+ Function &F;
+ AssumptionCache &AC;
+ DominatorTree *DT;
+ LLVMContext &C;
+ SmallDenseSet<IntrinsicInst *> CleanupToDo;
+ StringMapEntry<uint32_t> *IgnoreTag;
+ SmallDenseMap<BasicBlock *, SmallVector<IntrinsicInst *, 4>, 8> BBToAssume;
+ bool MadeChange = false;
+
+ AssumeSimplify(Function &F, AssumptionCache &AC, DominatorTree *DT,
+ LLVMContext &C)
+ : F(F), AC(AC), DT(DT), C(C),
+ IgnoreTag(C.getOrInsertBundleTag(IgnoreBundleTag)) {}
+
+ void buildMapping(bool FilterBooleanArgument) {
+ BBToAssume.clear();
+ for (Value *V : AC.assumptions()) {
+ if (!V)
+ continue;
+ IntrinsicInst *Assume = cast<IntrinsicInst>(V);
+ if (FilterBooleanArgument) {
+ auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
+ if (!Arg || Arg->isZero())
+ continue;
+ }
+ BBToAssume[Assume->getParent()].push_back(Assume);
+ }
+
+ for (auto &Elem : BBToAssume) {
+ llvm::sort(Elem.second,
+ [](const IntrinsicInst *LHS, const IntrinsicInst *RHS) {
+ return LHS->comesBefore(RHS);
+ });
+ }
+ }
+
+ /// Remove all assumes in CleanupToDo if their boolean argument is true and
+ /// either ForceCleanup is set or the assume doesn't hold valuable knowledge.
+ void RunCleanup(bool ForceCleanup) {
+ for (IntrinsicInst *Assume : CleanupToDo) {
+ auto *Arg = dyn_cast<ConstantInt>(Assume->getOperand(0));
+ if (!Arg || Arg->isZero() ||
+ (!ForceCleanup && !isAssumeWithEmptyBundle(*Assume)))
+ continue;
+ MadeChange = true;
+ if (ForceCleanup)
+ NumAssumesMerged++;
+ else
+ NumAssumesRemoved++;
+ Assume->eraseFromParent();
+ }
+ CleanupToDo.clear();
+ }
+
+ /// Remove knowledge stored in an assume when it is already known by an
+ /// attribute or another assume. When valid, this can also update existing
+ /// knowledge in an attribute or another assume.
+ void dropRedundantKnowledge() {
+ struct MapValue {
+ IntrinsicInst *Assume;
+ unsigned ArgValue;
+ CallInst::BundleOpInfo *BOI;
+ };
+ buildMapping(false);
+ SmallDenseMap<std::pair<Value *, Attribute::AttrKind>,
+ SmallVector<MapValue, 2>, 16>
+ Knowledge;
+ for (BasicBlock *BB : depth_first(&F))
+ for (Value *V : BBToAssume[BB]) {
+ if (!V)
+ continue;
+ IntrinsicInst *Assume = cast<IntrinsicInst>(V);
+ for (CallInst::BundleOpInfo &BOI : Assume->bundle_op_infos()) {
+ auto RemoveFromAssume = [&]() {
+ CleanupToDo.insert(Assume);
+ if (BOI.Begin != BOI.End) {
+ Use *U = &Assume->op_begin()[BOI.Begin + ABA_WasOn];
+ U->set(UndefValue::get(U->get()->getType()));
+ }
+ BOI.Tag = IgnoreTag;
+ };
+ if (BOI.Tag == IgnoreTag) {
+ CleanupToDo.insert(Assume);
+ continue;
+ }
+ RetainedKnowledge RK = getKnowledgeFromBundle(*Assume, BOI);
+ if (auto *Arg = dyn_cast_or_null<Argument>(RK.WasOn)) {
+ bool HasSameKindAttr = Arg->hasAttribute(RK.AttrKind);
+ if (HasSameKindAttr)
+ if (!Attribute::doesAttrKindHaveArgument(RK.AttrKind) ||
+ Arg->getAttribute(RK.AttrKind).getValueAsInt() >=
+ RK.ArgValue) {
+ RemoveFromAssume();
+ continue;
+ }
+ if (isValidAssumeForContext(
+ Assume, &*F.getEntryBlock().getFirstInsertionPt()) ||
+ Assume == &*F.getEntryBlock().getFirstInsertionPt()) {
+ if (HasSameKindAttr)
+ Arg->removeAttr(RK.AttrKind);
+ Arg->addAttr(Attribute::get(C, RK.AttrKind, RK.ArgValue));
+ MadeChange = true;
+ RemoveFromAssume();
+ continue;
+ }
+ }
+ auto &Lookup = Knowledge[{RK.WasOn, RK.AttrKind}];
+ for (MapValue &Elem : Lookup) {
+ if (!isValidAssumeForContext(Elem.Assume, Assume, DT))
+ continue;
+ if (Elem.ArgValue >= RK.ArgValue) {
+ RemoveFromAssume();
+ continue;
+ } else if (isValidAssumeForContext(Assume, Elem.Assume, DT)) {
+ Elem.Assume->op_begin()[Elem.BOI->Begin + ABA_Argument].set(
+ ConstantInt::get(Type::getInt64Ty(C), RK.ArgValue));
+ MadeChange = true;
+ RemoveFromAssume();
+ continue;
+ }
+ }
+ Lookup.push_back({Assume, RK.ArgValue, &BOI});
+ }
+ }
+ }
+
+ using MergeIterator = SmallVectorImpl<IntrinsicInst *>::iterator;
+
+ /// Merge all assumes from Begin to End and insert the resulting assume as
+ /// high as possible in the basic block.
+ void mergeRange(BasicBlock *BB, MergeIterator Begin, MergeIterator End) {
+ if (Begin == End || std::next(Begin) == End)
+ return;
+ /// Provide no additional information, so that AssumeBuilderState doesn't try
+ /// to do any pruning, since that has already been done better above.
+ AssumeBuilderState Builder(F.getParent());
+
+ /// For now it is initialized to the best value it could have
+ Instruction *InsertPt = BB->getFirstNonPHI();
+ if (isa<LandingPadInst>(InsertPt))
+ InsertPt = InsertPt->getNextNode();
+ for (IntrinsicInst *I : make_range(Begin, End)) {
+ CleanupToDo.insert(I);
+ for (CallInst::BundleOpInfo &BOI : I->bundle_op_infos()) {
+ RetainedKnowledge RK = getKnowledgeFromBundle(*I, BOI);
+ if (!RK)
+ continue;
+ Builder.addKnowledge(RK);
+ if (auto *I = dyn_cast_or_null<Instruction>(RK.WasOn))
+ if (I->getParent() == InsertPt->getParent() &&
+ (InsertPt->comesBefore(I) || InsertPt == I))
+ InsertPt = I->getNextNode();
+ }
+ }
+
+ /// Adjust InsertPt if it is before Begin, since mergeAssumes only
+ /// guarantees we can place the resulting assume between Begin and End.
+ if (InsertPt->comesBefore(*Begin))
+ for (auto It = (*Begin)->getIterator(), E = InsertPt->getIterator();
+ It != E; --It)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
+ InsertPt = It->getNextNode();
+ break;
+ }
+ IntrinsicInst *MergedAssume = Builder.build();
+ if (!MergedAssume)
+ return;
+ MadeChange = true;
+ MergedAssume->insertBefore(InsertPt);
+ AC.registerAssumption(MergedAssume);
+ }
+
+ /// Merge assumes when they are in the same BasicBlock and
+ /// isGuaranteedToTransferExecutionToSuccessor returns true for all
+ /// instructions between them.
+ void mergeAssumes() {
+ buildMapping(true);
+
+ SmallVector<MergeIterator, 4> SplitPoints;
+ for (auto &Elem : BBToAssume) {
+ SmallVectorImpl<IntrinsicInst *> &AssumesInBB = Elem.second;
+ if (AssumesInBB.size() < 2)
+ continue;
+ /// AssumesInBB is already sorted by order in the block.
+
+ BasicBlock::iterator It = AssumesInBB.front()->getIterator();
+ BasicBlock::iterator E = AssumesInBB.back()->getIterator();
+ SplitPoints.push_back(AssumesInBB.begin());
+ MergeIterator LastSplit = AssumesInBB.begin();
+ for (; It != E; ++It)
+ if (!isGuaranteedToTransferExecutionToSuccessor(&*It)) {
+ for (; (*LastSplit)->comesBefore(&*It); ++LastSplit)
+ ;
+ if (SplitPoints.back() != LastSplit)
+ SplitPoints.push_back(LastSplit);
+ }
+ SplitPoints.push_back(AssumesInBB.end());
+ for (auto SplitIt = SplitPoints.begin();
+ SplitIt != std::prev(SplitPoints.end()); SplitIt++) {
+ mergeRange(Elem.first, *SplitIt, *(SplitIt + 1));
+ }
+ SplitPoints.clear();
+ }
+ }
+};
+
+bool simplifyAssumes(Function &F, AssumptionCache *AC, DominatorTree *DT) {
+ AssumeSimplify AS(F, *AC, DT, F.getContext());
+
+ /// Remove knowledge that is already known because of a dominating assume or
+ /// an attribute.
+ AS.dropRedundantKnowledge();
+
+ /// Remove assumes that are empty.
+ AS.RunCleanup(false);
+
+ /// Merge assumes in the same basic block when possible.
+ AS.mergeAssumes();
+
+ /// Remove assumes that were merged.
+ AS.RunCleanup(true);
+ return AS.MadeChange;
+}
+
+} // namespace
+
+PreservedAnalyses AssumeSimplifyPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ if (!EnableKnowledgeRetention)
+ return PreservedAnalyses::all();
+ simplifyAssumes(F, &AM.getResult<AssumptionAnalysis>(F),
+ AM.getCachedResult<DominatorTreeAnalysis>(F));
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class AssumeSimplifyPassLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ AssumeSimplifyPassLegacyPass() : FunctionPass(ID) {
+ initializeAssumeSimplifyPassLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F) || !EnableKnowledgeRetention)
+ return false;
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ return simplifyAssumes(F, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ AU.setPreservesAll();
+ }
+};
+} // namespace
+
+char AssumeSimplifyPassLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AssumeSimplifyPassLegacyPass, "assume-simplify",
+ "Assume Simplify", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(AssumeSimplifyPassLegacyPass, "assume-simplify",
+ "Assume Simplify", false, false)
+
+FunctionPass *llvm::createAssumeSimplifyPass() {
+ return new AssumeSimplifyPassLegacyPass();
+}
+
+PreservedAnalyses AssumeBuilderPass::run(Function &F,
+ FunctionAnalysisManager &AM) {
+ AssumptionCache *AC = &AM.getResult<AssumptionAnalysis>(F);
+ DominatorTree* DT = AM.getCachedResult<DominatorTreeAnalysis>(F);
+ for (Instruction &I : instructions(F))
+ salvageKnowledge(&I, AC, DT);
+ return PreservedAnalyses::all();
+}
+
+namespace {
+class AssumeBuilderPassLegacyPass : public FunctionPass {
+public:
+ static char ID;
+
+ AssumeBuilderPassLegacyPass() : FunctionPass(ID) {
+ initializeAssumeBuilderPassLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+ bool runOnFunction(Function &F) override {
+ AssumptionCache &AC =
+ getAnalysis<AssumptionCacheTracker>().getAssumptionCache(F);
+ DominatorTreeWrapperPass *DTWP =
+ getAnalysisIfAvailable<DominatorTreeWrapperPass>();
+ for (Instruction &I : instructions(F))
+ salvageKnowledge(&I, &AC, DTWP ? &DTWP->getDomTree() : nullptr);
+ return true;
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<AssumptionCacheTracker>();
+
+ AU.setPreservesAll();
+ }
+};
+} // namespace
+
+char AssumeBuilderPassLegacyPass::ID = 0;
+
+INITIALIZE_PASS_BEGIN(AssumeBuilderPassLegacyPass, "assume-builder",
+ "Assume Builder", false, false)
+INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker)
+INITIALIZE_PASS_END(AssumeBuilderPassLegacyPass, "assume-builder",
+ "Assume Builder", false, false)
diff --git a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
index c9eb4abfa21a..085d91031cf9 100644
--- a/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
+++ b/llvm/lib/Transforms/Utils/BasicBlockUtils.cpp
@@ -153,7 +153,8 @@ void llvm::FoldSingleEntryPHINodes(BasicBlock *BB,
}
}
-bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
+bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI,
+ MemorySSAUpdater *MSSAU) {
// Recursively deleting a PHI may cause multiple PHIs to be deleted
// or RAUW'd undef, so use an array of WeakTrackingVH for the PHIs to delete.
SmallVector<WeakTrackingVH, 8> PHIs;
@@ -163,7 +164,7 @@ bool llvm::DeleteDeadPHIs(BasicBlock *BB, const TargetLibraryInfo *TLI) {
bool Changed = false;
for (unsigned i = 0, e = PHIs.size(); i != e; ++i)
if (PHINode *PN = dyn_cast_or_null<PHINode>(PHIs[i].operator Value*()))
- Changed |= RecursivelyDeleteDeadPHINode(PN, TLI);
+ Changed |= RecursivelyDeleteDeadPHINode(PN, TLI, MSSAU);
return Changed;
}
@@ -314,6 +315,31 @@ bool llvm::MergeBlockIntoPredecessor(BasicBlock *BB, DomTreeUpdater *DTU,
return true;
}
+bool llvm::MergeBlockSuccessorsIntoGivenBlocks(
+ SmallPtrSetImpl<BasicBlock *> &MergeBlocks, Loop *L, DomTreeUpdater *DTU,
+ LoopInfo *LI) {
+ assert(!MergeBlocks.empty() && "MergeBlocks should not be empty");
+
+ bool BlocksHaveBeenMerged = false;
+ while (!MergeBlocks.empty()) {
+ BasicBlock *BB = *MergeBlocks.begin();
+ BasicBlock *Dest = BB->getSingleSuccessor();
+ if (Dest && (!L || L->contains(Dest))) {
+ BasicBlock *Fold = Dest->getUniquePredecessor();
+ (void)Fold;
+ if (MergeBlockIntoPredecessor(Dest, DTU, LI)) {
+ assert(Fold == BB &&
+ "Expecting BB to be unique predecessor of the Dest block");
+ MergeBlocks.erase(Dest);
+ BlocksHaveBeenMerged = true;
+ } else
+ MergeBlocks.erase(BB);
+ } else
+ MergeBlocks.erase(BB);
+ }
+ return BlocksHaveBeenMerged;
+}
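A minimal usage sketch for the new helper follows; the loop L, the DomTreeUpdater DTU, and the LoopInfo LI are assumed to be in scope, and seeding the set with every block of the loop is just one possible policy, not something this patch prescribes.

    // Try to fold every trivial single-successor / unique-predecessor edge
    // that stays inside the loop.
    llvm::SmallPtrSet<llvm::BasicBlock *, 8> MergeBlocks;
    for (llvm::BasicBlock *BB : L->blocks())
      MergeBlocks.insert(BB);
    bool Simplified =
        llvm::MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);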
+
/// Remove redundant instructions within sequences of consecutive dbg.value
/// instructions. This is done using a backward scan to keep the last dbg.value
/// describing a specific variable/fragment.
@@ -505,7 +531,8 @@ llvm::SplitAllCriticalEdges(Function &F,
unsigned NumBroken = 0;
for (BasicBlock &BB : F) {
Instruction *TI = BB.getTerminator();
- if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI))
+ if (TI->getNumSuccessors() > 1 && !isa<IndirectBrInst>(TI) &&
+ !isa<CallBrInst>(TI))
for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
if (SplitCriticalEdge(TI, i, Options))
++NumBroken;
@@ -900,9 +927,25 @@ ReturnInst *llvm::FoldReturnIntoUncondBranch(ReturnInst *RI, BasicBlock *BB,
Pred->getInstList().insert(NewRet->getIterator(), NewBC);
*i = NewBC;
}
+
+ Instruction *NewEV = nullptr;
+ if (ExtractValueInst *EVI = dyn_cast<ExtractValueInst>(V)) {
+ V = EVI->getOperand(0);
+ NewEV = EVI->clone();
+ if (NewBC) {
+ NewBC->setOperand(0, NewEV);
+ Pred->getInstList().insert(NewBC->getIterator(), NewEV);
+ } else {
+ Pred->getInstList().insert(NewRet->getIterator(), NewEV);
+ *i = NewEV;
+ }
+ }
+
if (PHINode *PN = dyn_cast<PHINode>(V)) {
if (PN->getParent() == BB) {
- if (NewBC)
+ if (NewEV) {
+ NewEV->setOperand(0, PN->getIncomingValueForBlock(Pred));
+ } else if (NewBC)
NewBC->setOperand(0, PN->getIncomingValueForBlock(Pred));
else
*i = PN->getIncomingValueForBlock(Pred);
@@ -1084,3 +1127,247 @@ Value *llvm::GetIfCondition(BasicBlock *BB, BasicBlock *&IfTrue,
}
return BI->getCondition();
}
+
+// After creating a control flow hub, the operands of PHINodes in an outgoing
+// block Out no longer match the predecessors of that block. Predecessors of Out
+// that are incoming blocks to the hub are now replaced by just one edge from
+// the hub. To match this new control flow, the corresponding values from each
+// PHINode must now be moved to a new PHINode in the first guard block of the
+// hub.
+//
+// This operation cannot be performed with SSAUpdater, because it involves one
+// new use: If the block Out is in the list of Incoming blocks, then the newly
+// created PHI in the Hub will use itself along that edge from Out to Hub.
+static void reconnectPhis(BasicBlock *Out, BasicBlock *GuardBlock,
+ const SetVector<BasicBlock *> &Incoming,
+ BasicBlock *FirstGuardBlock) {
+ auto I = Out->begin();
+ while (I != Out->end() && isa<PHINode>(I)) {
+ auto Phi = cast<PHINode>(I);
+ auto NewPhi =
+ PHINode::Create(Phi->getType(), Incoming.size(),
+ Phi->getName() + ".moved", &FirstGuardBlock->back());
+ for (auto In : Incoming) {
+ Value *V = UndefValue::get(Phi->getType());
+ if (In == Out) {
+ V = NewPhi;
+ } else if (Phi->getBasicBlockIndex(In) != -1) {
+ V = Phi->removeIncomingValue(In, false);
+ }
+ NewPhi->addIncoming(V, In);
+ }
+ assert(NewPhi->getNumIncomingValues() == Incoming.size());
+ if (Phi->getNumOperands() == 0) {
+ Phi->replaceAllUsesWith(NewPhi);
+ I = Phi->eraseFromParent();
+ continue;
+ }
+ Phi->addIncoming(NewPhi, GuardBlock);
+ ++I;
+ }
+}
+
+using BBPredicates = DenseMap<BasicBlock *, PHINode *>;
+using BBSetVector = SetVector<BasicBlock *>;
+
+// Redirects the terminator of the incoming block to the first guard
+// block in the hub. The condition of the original terminator (if it
+// was conditional) and its original successors are returned as a
+// tuple <condition, succ0, succ1>. The function additionally filters
+// out successors that are not in the set of outgoing blocks.
+//
+// - condition is non-null iff the branch is conditional.
+// - Succ0 is non-null iff the sole/taken target is an outgoing block.
+// - Succ1 is non-null iff condition is non-null and the fallthrough
+//   target is an outgoing block.
+static std::tuple<Value *, BasicBlock *, BasicBlock *>
+redirectToHub(BasicBlock *BB, BasicBlock *FirstGuardBlock,
+ const BBSetVector &Outgoing) {
+ auto Branch = cast<BranchInst>(BB->getTerminator());
+ auto Condition = Branch->isConditional() ? Branch->getCondition() : nullptr;
+
+ BasicBlock *Succ0 = Branch->getSuccessor(0);
+ BasicBlock *Succ1 = nullptr;
+ Succ0 = Outgoing.count(Succ0) ? Succ0 : nullptr;
+
+ if (Branch->isUnconditional()) {
+ Branch->setSuccessor(0, FirstGuardBlock);
+ assert(Succ0);
+ } else {
+ Succ1 = Branch->getSuccessor(1);
+ Succ1 = Outgoing.count(Succ1) ? Succ1 : nullptr;
+ assert(Succ0 || Succ1);
+ if (Succ0 && !Succ1) {
+ Branch->setSuccessor(0, FirstGuardBlock);
+ } else if (Succ1 && !Succ0) {
+ Branch->setSuccessor(1, FirstGuardBlock);
+ } else {
+ Branch->eraseFromParent();
+ BranchInst::Create(FirstGuardBlock, BB);
+ }
+ }
+
+ assert(Succ0 || Succ1);
+ return std::make_tuple(Condition, Succ0, Succ1);
+}
+
+// Capture the existing control flow as guard predicates, and redirect
+// control flow from every incoming block to the first guard block in
+// the hub.
+//
+// There is one guard predicate for each outgoing block OutBB. The
+// predicate is a PHINode with one input for each InBB which
+// represents whether the hub should transfer control flow to OutBB if
+// it arrived from InBB. These predicates are NOT ORTHOGONAL. The Hub
+// evaluates them in the same order as the Outgoing set-vector, and
+// control branches to the first outgoing block whose predicate
+// evaluates to true.
+static void convertToGuardPredicates(
+ BasicBlock *FirstGuardBlock, BBPredicates &GuardPredicates,
+ SmallVectorImpl<WeakVH> &DeletionCandidates, const BBSetVector &Incoming,
+ const BBSetVector &Outgoing) {
+ auto &Context = Incoming.front()->getContext();
+ auto BoolTrue = ConstantInt::getTrue(Context);
+ auto BoolFalse = ConstantInt::getFalse(Context);
+
+ // The predicate for the last outgoing is trivially true, and so we
+ // process only the first N-1 successors.
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ LLVM_DEBUG(dbgs() << "Creating guard for " << Out->getName() << "\n");
+ auto Phi =
+ PHINode::Create(Type::getInt1Ty(Context), Incoming.size(),
+ StringRef("Guard.") + Out->getName(), FirstGuardBlock);
+ GuardPredicates[Out] = Phi;
+ }
+
+ for (auto In : Incoming) {
+ Value *Condition;
+ BasicBlock *Succ0;
+ BasicBlock *Succ1;
+ std::tie(Condition, Succ0, Succ1) =
+ redirectToHub(In, FirstGuardBlock, Outgoing);
+
+ // Optimization: Consider an incoming block A with both successors
+ // Succ0 and Succ1 in the set of outgoing blocks. The predicates
+ // for Succ0 and Succ1 complement each other. If Succ0 is visited
+ // first in the loop below, control will branch to Succ0 using the
+ // corresponding predicate. But if that branch is not taken, then
+ // control must reach Succ1, which means that the predicate for
+ // Succ1 is always true.
+ bool OneSuccessorDone = false;
+ for (int i = 0, e = Outgoing.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ auto Phi = GuardPredicates[Out];
+ if (Out != Succ0 && Out != Succ1) {
+ Phi->addIncoming(BoolFalse, In);
+ continue;
+ }
+ // Optimization: When only one successor is an outgoing block,
+ // the predicate is always true.
+ if (!Succ0 || !Succ1 || OneSuccessorDone) {
+ Phi->addIncoming(BoolTrue, In);
+ continue;
+ }
+ assert(Succ0 && Succ1);
+ OneSuccessorDone = true;
+ if (Out == Succ0) {
+ Phi->addIncoming(Condition, In);
+ continue;
+ }
+ auto Inverted = invertCondition(Condition);
+ DeletionCandidates.push_back(Condition);
+ Phi->addIncoming(Inverted, In);
+ }
+ }
+}
+
+// For each outgoing block OutBB, create a guard block in the Hub. The
+// first guard block was already created outside, and available as the
+// first element in the vector of guard blocks.
+//
+// Each guard block terminates in a conditional branch that transfers
+// control to the corresponding outgoing block or the next guard
+// block. The last guard block has two outgoing blocks as successors
+// since the condition for the final outgoing block is trivially
+// true. So we create one less block (including the first guard block)
+// than the number of outgoing blocks.
+static void createGuardBlocks(SmallVectorImpl<BasicBlock *> &GuardBlocks,
+ Function *F, const BBSetVector &Outgoing,
+ BBPredicates &GuardPredicates, StringRef Prefix) {
+ for (int i = 0, e = Outgoing.size() - 2; i != e; ++i) {
+ GuardBlocks.push_back(
+ BasicBlock::Create(F->getContext(), Prefix + ".guard", F));
+ }
+ assert(GuardBlocks.size() == GuardPredicates.size());
+
+ // To help keep the loop simple, temporarily append the last
+ // outgoing block to the list of guard blocks.
+ GuardBlocks.push_back(Outgoing.back());
+
+ for (int i = 0, e = GuardBlocks.size() - 1; i != e; ++i) {
+ auto Out = Outgoing[i];
+ assert(GuardPredicates.count(Out));
+ BranchInst::Create(Out, GuardBlocks[i + 1], GuardPredicates[Out],
+ GuardBlocks[i]);
+ }
+
+ // Remove the last block from the guard list.
+ GuardBlocks.pop_back();
+}
+
+BasicBlock *llvm::CreateControlFlowHub(
+ DomTreeUpdater *DTU, SmallVectorImpl<BasicBlock *> &GuardBlocks,
+ const BBSetVector &Incoming, const BBSetVector &Outgoing,
+ const StringRef Prefix) {
+ auto F = Incoming.front()->getParent();
+ auto FirstGuardBlock =
+ BasicBlock::Create(F->getContext(), Prefix + ".guard", F);
+
+ SmallVector<DominatorTree::UpdateType, 16> Updates;
+ if (DTU) {
+ for (auto In : Incoming) {
+ for (auto Succ : successors(In)) {
+ if (Outgoing.count(Succ))
+ Updates.push_back({DominatorTree::Delete, In, Succ});
+ }
+ Updates.push_back({DominatorTree::Insert, In, FirstGuardBlock});
+ }
+ }
+
+ BBPredicates GuardPredicates;
+ SmallVector<WeakVH, 8> DeletionCandidates;
+ convertToGuardPredicates(FirstGuardBlock, GuardPredicates, DeletionCandidates,
+ Incoming, Outgoing);
+
+ GuardBlocks.push_back(FirstGuardBlock);
+ createGuardBlocks(GuardBlocks, F, Outgoing, GuardPredicates, Prefix);
+
+ // Update the PHINodes in each outgoing block to match the new control flow.
+ for (int i = 0, e = GuardBlocks.size(); i != e; ++i) {
+ reconnectPhis(Outgoing[i], GuardBlocks[i], Incoming, FirstGuardBlock);
+ }
+ reconnectPhis(Outgoing.back(), GuardBlocks.back(), Incoming, FirstGuardBlock);
+
+ if (DTU) {
+ int NumGuards = GuardBlocks.size();
+ assert((int)Outgoing.size() == NumGuards + 1);
+ for (int i = 0; i != NumGuards - 1; ++i) {
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[i], Outgoing[i]});
+ Updates.push_back(
+ {DominatorTree::Insert, GuardBlocks[i], GuardBlocks[i + 1]});
+ }
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
+ Outgoing[NumGuards - 1]});
+ Updates.push_back({DominatorTree::Insert, GuardBlocks[NumGuards - 1],
+ Outgoing[NumGuards]});
+ DTU->applyUpdates(Updates);
+ }
+
+ for (auto I : DeletionCandidates) {
+ if (I->use_empty())
+ if (auto Inst = dyn_cast_or_null<Instruction>(I))
+ Inst->eraseFromParent();
+ }
+
+ return FirstGuardBlock;
+}
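A hedged sketch of a CreateControlFlowHub call site follows; the incoming and outgoing sets are placeholders to be chosen by the caller (typically a structurization-style transform), and a DomTreeUpdater DTU is assumed to be in scope.

    llvm::SetVector<llvm::BasicBlock *> Incoming, Outgoing;
    // ... fill Incoming with the blocks whose terminators currently branch to
    // the Outgoing blocks, and Outgoing with those (two or more) targets ...
    llvm::SmallVector<llvm::BasicBlock *, 8> GuardBlocks;
    llvm::BasicBlock *FirstGuard = llvm::CreateControlFlowHub(
        &DTU, GuardBlocks, Incoming, Outgoing, "structurize");
    // On return, every incoming block branches to FirstGuard, GuardBlocks holds
    // Outgoing.size() - 1 guard blocks, and the PHIs of each outgoing block have
    // been rewired to a single incoming edge from its guard block.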
diff --git a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
index 008cea333e6b..39fb504cf7b7 100644
--- a/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
+++ b/llvm/lib/Transforms/Utils/BreakCriticalEdges.cpp
@@ -150,14 +150,51 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
// it in this generic function.
if (DestBB->isEHPad()) return nullptr;
- // Don't split the non-fallthrough edge from a callbr.
- if (isa<CallBrInst>(TI) && SuccNum > 0)
- return nullptr;
-
if (Options.IgnoreUnreachableDests &&
isa<UnreachableInst>(DestBB->getFirstNonPHIOrDbgOrLifetime()))
return nullptr;
+ auto *LI = Options.LI;
+ SmallVector<BasicBlock *, 4> LoopPreds;
+ // Check if extra modifications will be required to preserve loop-simplify
+ // form after splitting. If it would require splitting blocks with IndirectBr
+ // terminators, bail out if preserving loop-simplify form is requested.
+ if (LI) {
+ if (Loop *TIL = LI->getLoopFor(TIBB)) {
+
+      // The only way that we can break LoopSimplify form by splitting a critical
+ // edge is if after the split there exists some edge from TIL to DestBB
+ // *and* the only edge into DestBB from outside of TIL is that of
+ // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
+ // is the new exit block and it has no non-loop predecessors. If the
+ // second isn't true, then DestBB was not in LoopSimplify form prior to
+ // the split as it had a non-loop predecessor. In both of these cases,
+ // the predecessor must be directly in TIL, not in a subloop, or again
+ // LoopSimplify doesn't hold.
+ for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E;
+ ++I) {
+ BasicBlock *P = *I;
+ if (P == TIBB)
+ continue; // The new block is known.
+ if (LI->getLoopFor(P) != TIL) {
+ // No need to re-simplify, it wasn't to start with.
+ LoopPreds.clear();
+ break;
+ }
+ LoopPreds.push_back(P);
+ }
+    // Loop-simplify form can be preserved if we can split all in-loop
+ // predecessors.
+ if (any_of(LoopPreds, [](BasicBlock *Pred) {
+ return isa<IndirectBrInst>(Pred->getTerminator());
+ })) {
+ if (Options.PreserveLoopSimplify)
+ return nullptr;
+ LoopPreds.clear();
+ }
+ }
+ }
+
// Create a new basic block, linking it into the CFG.
BasicBlock *NewBB = BasicBlock::Create(TI->getContext(),
TIBB->getName() + "." + DestBB->getName() + "_crit_edge");
@@ -165,14 +202,14 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
BranchInst *NewBI = BranchInst::Create(DestBB, NewBB);
NewBI->setDebugLoc(TI->getDebugLoc());
- // Branch to the new block, breaking the edge.
- TI->setSuccessor(SuccNum, NewBB);
-
// Insert the block into the function... right after the block TI lives in.
Function &F = *TIBB->getParent();
Function::iterator FBBI = TIBB->getIterator();
F.getBasicBlockList().insert(++FBBI, NewBB);
+ // Branch to the new block, breaking the edge.
+ TI->setSuccessor(SuccNum, NewBB);
+
// If there are any PHI nodes in DestBB, we need to update them so that they
// merge incoming values from NewBB instead of from TIBB.
{
@@ -212,7 +249,6 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
// If we have nothing to update, just return.
auto *DT = Options.DT;
auto *PDT = Options.PDT;
- auto *LI = Options.LI;
auto *MSSAU = Options.MSSAU;
if (MSSAU)
MSSAU->wireOldPredecessorsToNewImmediatePredecessor(
@@ -281,28 +317,6 @@ llvm::SplitCriticalEdge(Instruction *TI, unsigned SuccNum,
createPHIsForSplitLoopExit(TIBB, NewBB, DestBB);
}
- // The only that we can break LoopSimplify form by splitting a critical
- // edge is if after the split there exists some edge from TIL to DestBB
- // *and* the only edge into DestBB from outside of TIL is that of
- // NewBB. If the first isn't true, then LoopSimplify still holds, NewBB
- // is the new exit block and it has no non-loop predecessors. If the
- // second isn't true, then DestBB was not in LoopSimplify form prior to
- // the split as it had a non-loop predecessor. In both of these cases,
- // the predecessor must be directly in TIL, not in a subloop, or again
- // LoopSimplify doesn't hold.
- SmallVector<BasicBlock *, 4> LoopPreds;
- for (pred_iterator I = pred_begin(DestBB), E = pred_end(DestBB); I != E;
- ++I) {
- BasicBlock *P = *I;
- if (P == NewBB)
- continue; // The new block is known.
- if (LI->getLoopFor(P) != TIL) {
- // No need to re-simplify, it wasn't to start with.
- LoopPreds.clear();
- break;
- }
- LoopPreds.push_back(P);
- }
if (!LoopPreds.empty()) {
assert(!DestBB->isEHPad() && "We don't split edges to EH pads!");
BasicBlock *NewExitBB = SplitBlockPredecessors(
@@ -388,13 +402,20 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
if (FirstNonPHI->isEHPad() || Target->isLandingPad())
continue;
+ // Remember edge probabilities if needed.
+ SmallVector<BranchProbability, 4> EdgeProbabilities;
+ if (ShouldUpdateAnalysis) {
+ EdgeProbabilities.reserve(Target->getTerminator()->getNumSuccessors());
+ for (unsigned I = 0, E = Target->getTerminator()->getNumSuccessors();
+ I < E; ++I)
+ EdgeProbabilities.emplace_back(BPI->getEdgeProbability(Target, I));
+ BPI->eraseBlock(Target);
+ }
+
BasicBlock *BodyBlock = Target->splitBasicBlock(FirstNonPHI, ".split");
if (ShouldUpdateAnalysis) {
// Copy the BFI/BPI from Target to BodyBlock.
- for (unsigned I = 0, E = BodyBlock->getTerminator()->getNumSuccessors();
- I < E; ++I)
- BPI->setEdgeProbability(BodyBlock, I,
- BPI->getEdgeProbability(Target, I));
+ BPI->setEdgeProbability(BodyBlock, EdgeProbabilities);
BFI->setBlockFreq(BodyBlock, BFI->getBlockFreq(Target).getFrequency());
}
// It's possible Target was its own successor through an indirectbr.
@@ -423,7 +444,6 @@ bool llvm::SplitIndirectBrCriticalEdges(Function &F,
BlockFrequency NewBlockFreqForTarget =
BFI->getBlockFreq(Target) - BlockFreqForDirectSucc;
BFI->setBlockFreq(Target, NewBlockFreqForTarget.getFrequency());
- BPI->eraseBlock(Target);
}
// Ok, now fix up the PHIs. We know the two blocks only have PHIs, and that
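A hedged usage sketch of the new bail-out behaviour in SplitCriticalEdge (the helper name is illustrative, and the setter name is assumed from the PreserveLoopSimplify option introduced above):

  #include "llvm/Analysis/LoopInfo.h"
  #include "llvm/IR/Dominators.h"
  #include "llvm/Transforms/Utils/BasicBlockUtils.h"
  using namespace llvm;

  static BasicBlock *splitKeepingLoopSimplify(Instruction *TI, unsigned SuccNum,
                                              DominatorTree &DT, LoopInfo &LI) {
    CriticalEdgeSplittingOptions Options(&DT, &LI);
    Options.setPreserveLoopSimplify(); // assumed setter for the new flag
    // May return nullptr when preserving loop-simplify form would require
    // splitting a block terminated by an indirectbr.
    return SplitCriticalEdge(TI, SuccNum, Options);
  }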
diff --git a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
index 71316ce8f758..c64ad147fdfe 100644
--- a/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/BuildLibCalls.cpp
@@ -378,6 +378,10 @@ bool llvm::inferLibFuncAttributes(Function &F, const TargetLibraryInfo &TLI) {
Changed |= setDoesNotCapture(F, 1);
Changed |= setOnlyReadsMemory(F, 1);
return Changed;
+ case LibFunc_aligned_alloc:
+ Changed |= setDoesNotThrow(F);
+ Changed |= setRetDoesNotAlias(F);
+ return Changed;
case LibFunc_bcopy:
Changed |= setDoesNotThrow(F);
Changed |= setDoesNotCapture(F, 0);
@@ -819,14 +823,14 @@ StringRef llvm::getFloatFnName(const TargetLibraryInfo *TLI, Type *Ty,
//- Emit LibCalls ------------------------------------------------------------//
-Value *llvm::castToCStr(Value *V, IRBuilder<> &B) {
+Value *llvm::castToCStr(Value *V, IRBuilderBase &B) {
unsigned AS = V->getType()->getPointerAddressSpace();
return B.CreateBitCast(V, B.getInt8PtrTy(AS), "cstr");
}
static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
ArrayRef<Type *> ParamTypes,
- ArrayRef<Value *> Operands, IRBuilder<> &B,
+ ArrayRef<Value *> Operands, IRBuilderBase &B,
const TargetLibraryInfo *TLI,
bool IsVaArgs = false) {
if (!TLI->has(TheLibFunc))
@@ -844,20 +848,20 @@ static Value *emitLibCall(LibFunc TheLibFunc, Type *ReturnType,
return CI;
}
-Value *llvm::emitStrLen(Value *Ptr, IRBuilder<> &B, const DataLayout &DL,
+Value *llvm::emitStrLen(Value *Ptr, IRBuilderBase &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
return emitLibCall(LibFunc_strlen, DL.getIntPtrType(Context),
B.getInt8PtrTy(), castToCStr(Ptr, B), B, TLI);
}
-Value *llvm::emitStrDup(Value *Ptr, IRBuilder<> &B,
+Value *llvm::emitStrDup(Value *Ptr, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
return emitLibCall(LibFunc_strdup, B.getInt8PtrTy(), B.getInt8PtrTy(),
castToCStr(Ptr, B), B, TLI);
}
-Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
+Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
Type *I8Ptr = B.getInt8PtrTy();
Type *I32Ty = B.getInt32Ty();
@@ -865,7 +869,7 @@ Value *llvm::emitStrChr(Value *Ptr, char C, IRBuilder<> &B,
{castToCStr(Ptr, B), ConstantInt::get(I32Ty, C)}, B, TLI);
}
-Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
return emitLibCall(
@@ -874,28 +878,28 @@ Value *llvm::emitStrNCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
{castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
}
-Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilder<> &B,
+Value *llvm::emitStrCpy(Value *Dst, Value *Src, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
Type *I8Ptr = B.getInt8PtrTy();
return emitLibCall(LibFunc_strcpy, I8Ptr, {I8Ptr, I8Ptr},
{castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
}
-Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilder<> &B,
+Value *llvm::emitStpCpy(Value *Dst, Value *Src, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
Type *I8Ptr = B.getInt8PtrTy();
return emitLibCall(LibFunc_stpcpy, I8Ptr, {I8Ptr, I8Ptr},
{castToCStr(Dst, B), castToCStr(Src, B)}, B, TLI);
}
-Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStrNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
Type *I8Ptr = B.getInt8PtrTy();
return emitLibCall(LibFunc_strncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
{castToCStr(Dst, B), castToCStr(Src, B), Len}, B, TLI);
}
-Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
+Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
Type *I8Ptr = B.getInt8PtrTy();
return emitLibCall(LibFunc_stpncpy, I8Ptr, {I8Ptr, I8Ptr, Len->getType()},
@@ -903,7 +907,7 @@ Value *llvm::emitStpNCpy(Value *Dst, Value *Src, Value *Len, IRBuilder<> &B,
}
Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
- IRBuilder<> &B, const DataLayout &DL,
+ IRBuilderBase &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_memcpy_chk))
return nullptr;
@@ -926,7 +930,7 @@ Value *llvm::emitMemCpyChk(Value *Dst, Value *Src, Value *Len, Value *ObjSize,
return CI;
}
-Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
+Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
return emitLibCall(
@@ -935,7 +939,7 @@ Value *llvm::emitMemChr(Value *Ptr, Value *Val, Value *Len, IRBuilder<> &B,
{castToCStr(Ptr, B), Val, Len}, B, TLI);
}
-Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
return emitLibCall(
@@ -944,7 +948,7 @@ Value *llvm::emitMemCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
{castToCStr(Ptr1, B), castToCStr(Ptr2, B), Len}, B, TLI);
}
-Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
+Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
LLVMContext &Context = B.GetInsertBlock()->getContext();
return emitLibCall(
@@ -954,7 +958,7 @@ Value *llvm::emitBCmp(Value *Ptr1, Value *Ptr2, Value *Len, IRBuilder<> &B,
}
Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
- IRBuilder<> &B, const TargetLibraryInfo *TLI) {
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
return emitLibCall(
LibFunc_memccpy, B.getInt8PtrTy(),
{B.getInt8PtrTy(), B.getInt8PtrTy(), B.getInt32Ty(), Len->getType()},
@@ -962,7 +966,7 @@ Value *llvm::emitMemCCpy(Value *Ptr1, Value *Ptr2, Value *Val, Value *Len,
}
Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt,
- ArrayRef<Value *> VariadicArgs, IRBuilder<> &B,
+ ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
SmallVector<Value *, 8> Args{castToCStr(Dest, B), Size, castToCStr(Fmt, B)};
Args.insert(Args.end(), VariadicArgs.begin(), VariadicArgs.end());
@@ -972,7 +976,7 @@ Value *llvm::emitSNPrintf(Value *Dest, Value *Size, Value *Fmt,
}
Value *llvm::emitSPrintf(Value *Dest, Value *Fmt,
- ArrayRef<Value *> VariadicArgs, IRBuilder<> &B,
+ ArrayRef<Value *> VariadicArgs, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
SmallVector<Value *, 8> Args{castToCStr(Dest, B), castToCStr(Fmt, B)};
Args.insert(Args.end(), VariadicArgs.begin(), VariadicArgs.end());
@@ -981,28 +985,28 @@ Value *llvm::emitSPrintf(Value *Dest, Value *Fmt,
/*IsVaArgs=*/true);
}
-Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilder<> &B,
+Value *llvm::emitStrCat(Value *Dest, Value *Src, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
return emitLibCall(LibFunc_strcat, B.getInt8PtrTy(),
{B.getInt8PtrTy(), B.getInt8PtrTy()},
{castToCStr(Dest, B), castToCStr(Src, B)}, B, TLI);
}
-Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B,
+Value *llvm::emitStrLCpy(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
return emitLibCall(LibFunc_strlcpy, Size->getType(),
{B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
{castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
}
-Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B,
+Value *llvm::emitStrLCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
return emitLibCall(LibFunc_strlcat, Size->getType(),
{B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
{castToCStr(Dest, B), castToCStr(Src, B), Size}, B, TLI);
}
-Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B,
+Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
return emitLibCall(LibFunc_strncat, B.getInt8PtrTy(),
{B.getInt8PtrTy(), B.getInt8PtrTy(), Size->getType()},
@@ -1010,7 +1014,7 @@ Value *llvm::emitStrNCat(Value *Dest, Value *Src, Value *Size, IRBuilder<> &B,
}
Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList,
- IRBuilder<> &B, const TargetLibraryInfo *TLI) {
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
return emitLibCall(
LibFunc_vsnprintf, B.getInt32Ty(),
{B.getInt8PtrTy(), Size->getType(), B.getInt8PtrTy(), VAList->getType()},
@@ -1018,7 +1022,7 @@ Value *llvm::emitVSNPrintf(Value *Dest, Value *Size, Value *Fmt, Value *VAList,
}
Value *llvm::emitVSPrintf(Value *Dest, Value *Fmt, Value *VAList,
- IRBuilder<> &B, const TargetLibraryInfo *TLI) {
+ IRBuilderBase &B, const TargetLibraryInfo *TLI) {
return emitLibCall(LibFunc_vsprintf, B.getInt32Ty(),
{B.getInt8PtrTy(), B.getInt8PtrTy(), VAList->getType()},
{castToCStr(Dest, B), castToCStr(Fmt, B), VAList}, B, TLI);
@@ -1040,7 +1044,7 @@ static void appendTypeSuffix(Value *Op, StringRef &Name,
}
static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
- IRBuilder<> &B,
+ IRBuilderBase &B,
const AttributeList &Attrs) {
assert((Name != "") && "Must specify Name to emitUnaryFloatFnCall");
@@ -1062,7 +1066,7 @@ static Value *emitUnaryFloatFnCallHelper(Value *Op, StringRef Name,
return CI;
}
-Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
+Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs) {
SmallString<20> NameBuffer;
appendTypeSuffix(Op, Name, NameBuffer);
@@ -1072,7 +1076,7 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B,
Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn, IRBuilder<> &B,
+ LibFunc LongDoubleFn, IRBuilderBase &B,
const AttributeList &Attrs) {
// Get the name of the function according to TLI.
StringRef Name = getFloatFnName(TLI, Op->getType(),
@@ -1082,7 +1086,7 @@ Value *llvm::emitUnaryFloatFnCall(Value *Op, const TargetLibraryInfo *TLI,
}
static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
- StringRef Name, IRBuilder<> &B,
+ StringRef Name, IRBuilderBase &B,
const AttributeList &Attrs) {
assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
@@ -1105,7 +1109,8 @@ static Value *emitBinaryFloatFnCallHelper(Value *Op1, Value *Op2,
}
Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
- IRBuilder<> &B, const AttributeList &Attrs) {
+ IRBuilderBase &B,
+ const AttributeList &Attrs) {
assert((Name != "") && "Must specify Name to emitBinaryFloatFnCall");
SmallString<20> NameBuffer;
@@ -1117,7 +1122,7 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2, StringRef Name,
Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
const TargetLibraryInfo *TLI,
LibFunc DoubleFn, LibFunc FloatFn,
- LibFunc LongDoubleFn, IRBuilder<> &B,
+ LibFunc LongDoubleFn, IRBuilderBase &B,
const AttributeList &Attrs) {
// Get the name of the function according to TLI.
StringRef Name = getFloatFnName(TLI, Op1->getType(),
@@ -1126,7 +1131,7 @@ Value *llvm::emitBinaryFloatFnCall(Value *Op1, Value *Op2,
return emitBinaryFloatFnCallHelper(Op1, Op2, Name, B, Attrs);
}
-Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
+Value *llvm::emitPutChar(Value *Char, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_putchar))
return nullptr;
@@ -1149,7 +1154,7 @@ Value *llvm::emitPutChar(Value *Char, IRBuilder<> &B,
return CI;
}
-Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
+Value *llvm::emitPutS(Value *Str, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_puts))
return nullptr;
@@ -1166,7 +1171,7 @@ Value *llvm::emitPutS(Value *Str, IRBuilder<> &B,
return CI;
}
-Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
+Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fputc))
return nullptr;
@@ -1187,27 +1192,7 @@ Value *llvm::emitFPutC(Value *Char, Value *File, IRBuilder<> &B,
return CI;
}
-Value *llvm::emitFPutCUnlocked(Value *Char, Value *File, IRBuilder<> &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputc_unlocked))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutcUnlockedName = TLI->getName(LibFunc_fputc_unlocked);
- FunctionCallee F = M->getOrInsertFunction(FPutcUnlockedName, B.getInt32Ty(),
- B.getInt32Ty(), File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutcUnlockedName, *TLI);
- Char = B.CreateIntCast(Char, B.getInt32Ty(), /*isSigned*/ true, "chari");
- CallInst *CI = B.CreateCall(F, {Char, File}, FPutcUnlockedName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
+Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fputs))
return nullptr;
@@ -1226,26 +1211,7 @@ Value *llvm::emitFPutS(Value *Str, Value *File, IRBuilder<> &B,
return CI;
}
-Value *llvm::emitFPutSUnlocked(Value *Str, Value *File, IRBuilder<> &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fputs_unlocked))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FPutsUnlockedName = TLI->getName(LibFunc_fputs_unlocked);
- FunctionCallee F = M->getOrInsertFunction(FPutsUnlockedName, B.getInt32Ty(),
- B.getInt8PtrTy(), File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FPutsUnlockedName, *TLI);
- CallInst *CI = B.CreateCall(F, {castToCStr(Str, B), File}, FPutsUnlockedName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
+Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilderBase &B,
const DataLayout &DL, const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_fwrite))
return nullptr;
@@ -1269,7 +1235,7 @@ Value *llvm::emitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B,
return CI;
}
-Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
+Value *llvm::emitMalloc(Value *Num, IRBuilderBase &B, const DataLayout &DL,
const TargetLibraryInfo *TLI) {
if (!TLI->has(LibFunc_malloc))
return nullptr;
@@ -1290,7 +1256,7 @@ Value *llvm::emitMalloc(Value *Num, IRBuilder<> &B, const DataLayout &DL,
}
Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
- IRBuilder<> &B, const TargetLibraryInfo &TLI) {
+ IRBuilderBase &B, const TargetLibraryInfo &TLI) {
if (!TLI.has(LibFunc_calloc))
return nullptr;
@@ -1309,88 +1275,3 @@ Value *llvm::emitCalloc(Value *Num, Value *Size, const AttributeList &Attrs,
return CI;
}
-
-Value *llvm::emitFWriteUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
- IRBuilder<> &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fwrite_unlocked))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- StringRef FWriteUnlockedName = TLI->getName(LibFunc_fwrite_unlocked);
- FunctionCallee F = M->getOrInsertFunction(
- FWriteUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
-
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FWriteUnlockedName, *TLI);
- CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFGetCUnlocked(Value *File, IRBuilder<> &B,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fgetc_unlocked))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FGetCUnlockedName = TLI->getName(LibFunc_fgetc_unlocked);
- FunctionCallee F = M->getOrInsertFunction(FGetCUnlockedName, B.getInt32Ty(),
- File->getType());
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FGetCUnlockedName, *TLI);
- CallInst *CI = B.CreateCall(F, File, FGetCUnlockedName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFGetSUnlocked(Value *Str, Value *Size, Value *File,
- IRBuilder<> &B, const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fgets_unlocked))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- StringRef FGetSUnlockedName = TLI->getName(LibFunc_fgets_unlocked);
- FunctionCallee F =
- M->getOrInsertFunction(FGetSUnlockedName, B.getInt8PtrTy(),
- B.getInt8PtrTy(), B.getInt32Ty(), File->getType());
- inferLibFuncAttributes(M, FGetSUnlockedName, *TLI);
- CallInst *CI =
- B.CreateCall(F, {castToCStr(Str, B), Size, File}, FGetSUnlockedName);
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
-
-Value *llvm::emitFReadUnlocked(Value *Ptr, Value *Size, Value *N, Value *File,
- IRBuilder<> &B, const DataLayout &DL,
- const TargetLibraryInfo *TLI) {
- if (!TLI->has(LibFunc_fread_unlocked))
- return nullptr;
-
- Module *M = B.GetInsertBlock()->getModule();
- LLVMContext &Context = B.GetInsertBlock()->getContext();
- StringRef FReadUnlockedName = TLI->getName(LibFunc_fread_unlocked);
- FunctionCallee F = M->getOrInsertFunction(
- FReadUnlockedName, DL.getIntPtrType(Context), B.getInt8PtrTy(),
- DL.getIntPtrType(Context), DL.getIntPtrType(Context), File->getType());
-
- if (File->getType()->isPointerTy())
- inferLibFuncAttributes(M, FReadUnlockedName, *TLI);
- CallInst *CI = B.CreateCall(F, {castToCStr(Ptr, B), Size, N, File});
-
- if (const Function *Fn =
- dyn_cast<Function>(F.getCallee()->stripPointerCasts()))
- CI->setCallingConv(Fn->getCallingConv());
- return CI;
-}
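The switch from IRBuilder<> to IRBuilderBase in the signatures above means these emitters now accept any builder specialization, not only the default one. A hedged sketch (assuming a caller that prefers NoFolder while debugging emitted calls):

  #include "llvm/Analysis/TargetLibraryInfo.h"
  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/NoFolder.h"
  #include "llvm/Transforms/Utils/BuildLibCalls.h"
  using namespace llvm;

  static Value *emitLenWithCustomBuilder(Value *Ptr, Instruction *InsertPt,
                                         const TargetLibraryInfo *TLI) {
    const DataLayout &DL = InsertPt->getModule()->getDataLayout();
    // A builder with a non-default folder is not an IRBuilder<>, but it still
    // derives from IRBuilderBase, so it can be handed to the emitters now.
    IRBuilder<NoFolder> B(InsertPt);
    return emitStrLen(Ptr, B, DL, TLI);
  }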
diff --git a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
index 9a6761040bd8..833d04210629 100644
--- a/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
+++ b/llvm/lib/Transforms/Utils/BypassSlowDivision.cpp
@@ -213,9 +213,8 @@ bool FastDivInsertionTask::isHashLikeValue(Value *V, VisitedSetTy &Visited) {
return false;
// Do not visit nodes that have been visited already. We return true because
// it means that we couldn't find any value that doesn't look hash-like.
- if (Visited.find(I) != Visited.end())
+ if (!Visited.insert(I).second)
return true;
- Visited.insert(I);
return llvm::all_of(cast<PHINode>(I)->incoming_values(), [&](Value *V) {
// Ignore undef values as they probably don't affect the division
// operands.
@@ -264,6 +263,7 @@ QuotRemWithBB FastDivInsertionTask::createSlowBB(BasicBlock *SuccessorBB) {
DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
MainBB->getParent(), SuccessorBB);
IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
@@ -287,6 +287,7 @@ QuotRemWithBB FastDivInsertionTask::createFastBB(BasicBlock *SuccessorBB) {
DivRemPair.BB = BasicBlock::Create(MainBB->getParent()->getContext(), "",
MainBB->getParent(), SuccessorBB);
IRBuilder<> Builder(DivRemPair.BB, DivRemPair.BB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
Value *Dividend = SlowDivOrRem->getOperand(0);
Value *Divisor = SlowDivOrRem->getOperand(1);
@@ -312,6 +313,7 @@ QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
QuotRemWithBB &RHS,
BasicBlock *PhiBB) {
IRBuilder<> Builder(PhiBB, PhiBB->begin());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
PHINode *QuoPhi = Builder.CreatePHI(getSlowType(), 2);
QuoPhi->addIncoming(LHS.Quotient, LHS.BB);
QuoPhi->addIncoming(RHS.Quotient, RHS.BB);
@@ -328,6 +330,7 @@ QuotRemPair FastDivInsertionTask::createDivRemPhiNodes(QuotRemWithBB &LHS,
Value *FastDivInsertionTask::insertOperandRuntimeCheck(Value *Op1, Value *Op2) {
assert((Op1 || Op2) && "Nothing to check");
IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
Value *OrV;
if (Op1 && Op2)
@@ -396,6 +399,9 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
isa<ConstantInt>(BCI->getOperand(0)))
return None;
+ IRBuilder<> Builder(MainBB, MainBB->end());
+ Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
+
if (DividendShort && !isSignedOp()) {
// If the division is unsigned and Dividend is known to be short, then
// either
@@ -418,7 +424,6 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
Long.Remainder = Dividend;
QuotRemWithBB Fast = createFastBB(SuccessorBB);
QuotRemPair Result = createDivRemPhiNodes(Fast, Long, SuccessorBB);
- IRBuilder<> Builder(MainBB, MainBB->end());
Value *CmpV = Builder.CreateICmpUGE(Dividend, Divisor);
Builder.CreateCondBr(CmpV, Fast.BB, SuccessorBB);
return Result;
@@ -435,7 +440,6 @@ Optional<QuotRemPair> FastDivInsertionTask::insertFastDivAndRem() {
QuotRemPair Result = createDivRemPhiNodes(Fast, Slow, SuccessorBB);
Value *CmpV = insertOperandRuntimeCheck(DividendShort ? nullptr : Dividend,
DivisorShort ? nullptr : Divisor);
- IRBuilder<> Builder(MainBB, MainBB->end());
Builder.CreateCondBr(CmpV, Fast.BB, Slow.BB);
return Result;
}
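For context, a hedged sketch of the pattern these hunks apply: a freshly constructed IRBuilder carries an empty debug location, so the compare and branch instructions built for the bypass would lose line info unless the location is seeded from the original division.

  #include "llvm/IR/IRBuilder.h"
  using namespace llvm;

  // Illustrative only: build the guard comparison with the division's !dbg.
  static Value *buildGuard(Instruction *SlowDivOrRem, BasicBlock *MainBB,
                           Value *Dividend, Value *Divisor) {
    IRBuilder<> Builder(MainBB, MainBB->end());
    Builder.SetCurrentDebugLocation(SlowDivOrRem->getDebugLoc());
    return Builder.CreateICmpUGE(Dividend, Divisor);
  }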
diff --git a/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
new file mode 100644
index 000000000000..52e859361c59
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/CallGraphUpdater.cpp
@@ -0,0 +1,167 @@
+//===- CallGraphUpdater.cpp - A (lazy) call graph update helper -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+/// \file
+///
+/// This file provides interfaces used to manipulate a call graph, regardless
+/// of whether it is an "old style" CallGraph or a "new style" LazyCallGraph.
+///
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CallGraphUpdater.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+bool CallGraphUpdater::finalize() {
+ if (!DeadFunctionsInComdats.empty()) {
+ filterDeadComdatFunctions(*DeadFunctionsInComdats.front()->getParent(),
+ DeadFunctionsInComdats);
+ DeadFunctions.append(DeadFunctionsInComdats.begin(),
+ DeadFunctionsInComdats.end());
+ }
+
+ if (CG) {
+ // First remove all references, e.g., outgoing via called functions. This is
+ // necessary as we can delete functions that have circular references.
+ for (Function *DeadFn : DeadFunctions) {
+ DeadFn->removeDeadConstantUsers();
+ CallGraphNode *DeadCGN = (*CG)[DeadFn];
+ DeadCGN->removeAllCalledFunctions();
+ CG->getExternalCallingNode()->removeAnyCallEdgeTo(DeadCGN);
+ DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
+ }
+
+ // Then remove the node and function from the module.
+ for (Function *DeadFn : DeadFunctions) {
+ CallGraphNode *DeadCGN = CG->getOrInsertFunction(DeadFn);
+ assert(DeadCGN->getNumReferences() == 0 &&
+ "References should have been handled by now");
+ delete CG->removeFunctionFromModule(DeadCGN);
+ }
+ } else {
+    // This is the code path for the new lazy call graph and for the case where
+ // no call graph was provided.
+ for (Function *DeadFn : DeadFunctions) {
+ DeadFn->removeDeadConstantUsers();
+ DeadFn->replaceAllUsesWith(UndefValue::get(DeadFn->getType()));
+
+ if (LCG && !ReplacedFunctions.count(DeadFn)) {
+ // Taken mostly from the inliner:
+ LazyCallGraph::Node &N = LCG->get(*DeadFn);
+ auto *DeadSCC = LCG->lookupSCC(N);
+ assert(DeadSCC && DeadSCC->size() == 1 &&
+ &DeadSCC->begin()->getFunction() == DeadFn);
+ auto &DeadRC = DeadSCC->getOuterRefSCC();
+
+ FunctionAnalysisManager &FAM =
+ AM->getResult<FunctionAnalysisManagerCGSCCProxy>(*DeadSCC, *LCG)
+ .getManager();
+
+ FAM.clear(*DeadFn, DeadFn->getName());
+ AM->clear(*DeadSCC, DeadSCC->getName());
+ LCG->removeDeadFunction(*DeadFn);
+
+ // Mark the relevant parts of the call graph as invalid so we don't
+ // visit them.
+ UR->InvalidatedSCCs.insert(DeadSCC);
+ UR->InvalidatedRefSCCs.insert(&DeadRC);
+ }
+
+      // The function is now really dead and detached from everything.
+ DeadFn->eraseFromParent();
+ }
+ }
+
+ bool Changed = !DeadFunctions.empty();
+ DeadFunctionsInComdats.clear();
+ DeadFunctions.clear();
+ return Changed;
+}
+
+void CallGraphUpdater::reanalyzeFunction(Function &Fn) {
+ if (CG) {
+ CallGraphNode *OldCGN = CG->getOrInsertFunction(&Fn);
+ OldCGN->removeAllCalledFunctions();
+ CG->populateCallGraphNode(OldCGN);
+ } else if (LCG) {
+ LazyCallGraph::Node &N = LCG->get(Fn);
+ LazyCallGraph::SCC *C = LCG->lookupSCC(N);
+ updateCGAndAnalysisManagerForCGSCCPass(*LCG, *C, N, *AM, *UR, *FAM);
+ }
+}
+
+void CallGraphUpdater::registerOutlinedFunction(Function &NewFn) {
+ if (CG)
+ CG->addToCallGraph(&NewFn);
+ else if (LCG)
+ LCG->addNewFunctionIntoSCC(NewFn, *SCC);
+}
+
+void CallGraphUpdater::removeFunction(Function &DeadFn) {
+ DeadFn.deleteBody();
+ DeadFn.setLinkage(GlobalValue::ExternalLinkage);
+ if (DeadFn.hasComdat())
+ DeadFunctionsInComdats.push_back(&DeadFn);
+ else
+ DeadFunctions.push_back(&DeadFn);
+
+ // For the old call graph we remove the function from the SCC right away.
+ if (CG && !ReplacedFunctions.count(&DeadFn)) {
+ CallGraphNode *DeadCGN = (*CG)[&DeadFn];
+ DeadCGN->removeAllCalledFunctions();
+ CGSCC->DeleteNode(DeadCGN);
+ }
+}
+
+void CallGraphUpdater::replaceFunctionWith(Function &OldFn, Function &NewFn) {
+ OldFn.removeDeadConstantUsers();
+ ReplacedFunctions.insert(&OldFn);
+ if (CG) {
+ // Update the call graph for the newly promoted function.
+ CallGraphNode *OldCGN = (*CG)[&OldFn];
+ CallGraphNode *NewCGN = CG->getOrInsertFunction(&NewFn);
+ NewCGN->stealCalledFunctionsFrom(OldCGN);
+ CG->ReplaceExternalCallEdge(OldCGN, NewCGN);
+
+ // And update the SCC we're iterating as well.
+ CGSCC->ReplaceNode(OldCGN, NewCGN);
+ } else if (LCG) {
+ // Directly substitute the functions in the call graph.
+ LazyCallGraph::Node &OldLCGN = LCG->get(OldFn);
+ SCC->getOuterRefSCC().replaceNodeFunction(OldLCGN, NewFn);
+ }
+ removeFunction(OldFn);
+}
+
+bool CallGraphUpdater::replaceCallSite(CallBase &OldCS, CallBase &NewCS) {
+ // This is only necessary in the (old) CG.
+ if (!CG)
+ return true;
+
+ Function *Caller = OldCS.getCaller();
+ CallGraphNode *NewCalleeNode =
+ CG->getOrInsertFunction(NewCS.getCalledFunction());
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ if (llvm::none_of(*CallerNode, [&OldCS](const CallGraphNode::CallRecord &CR) {
+ return CR.first && *CR.first == &OldCS;
+ }))
+ return false;
+ CallerNode->replaceCallEdge(OldCS, NewCS, NewCalleeNode);
+ return true;
+}
+
+void CallGraphUpdater::removeCallSite(CallBase &CS) {
+ // This is only necessary in the (old) CG.
+ if (!CG)
+ return;
+
+ Function *Caller = CS.getCaller();
+ CallGraphNode *CallerNode = (*CG)[Caller];
+ CallerNode->removeCallEdgeFor(CS);
+}
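A hedged usage sketch of the new utility (the isProvablyDead predicate is a stand-in invented for illustration; real passes supply their own logic and call one of the initialize() overloads with their call-graph flavour first):

  #include "llvm/ADT/ArrayRef.h"
  #include "llvm/IR/Function.h"
  #include "llvm/Transforms/Utils/CallGraphUpdater.h"
  using namespace llvm;

  static bool isProvablyDead(const Function &F) {
    // Stand-in heuristic: local functions with no remaining uses.
    return F.hasLocalLinkage() && F.use_empty();
  }

  static bool eraseDeadFunctions(CallGraphUpdater &CGUpdater,
                                 ArrayRef<Function *> SCCFunctions) {
    for (Function *F : SCCFunctions)
      if (isProvablyDead(*F))
        CGUpdater.removeFunction(*F); // queued; the body is dropped right away
    return CGUpdater.finalize();      // erases the queued functions, updates CG
  }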
diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
index f04d76e70c0d..5a47c1fd0b6c 100644
--- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp
@@ -12,7 +12,10 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CallPromotionUtils.h"
+#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/TypeMetadataUtils.h"
#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
using namespace llvm;
@@ -158,32 +161,31 @@ static void createRetPHINode(Instruction *OrigInst, Instruction *NewInst,
/// %t1 = bitcast i32 %t0 to ...
/// br label %normal_dst
///
-static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {
+static void createRetBitCast(CallBase &CB, Type *RetTy, CastInst **RetBitCast) {
// Save the users of the calling instruction. These uses will be changed to
// use the bitcast after we create it.
SmallVector<User *, 16> UsersToUpdate;
- for (User *U : CS.getInstruction()->users())
+ for (User *U : CB.users())
UsersToUpdate.push_back(U);
// Determine an appropriate location to create the bitcast for the return
// value. The location depends on if we have a call or invoke instruction.
Instruction *InsertBefore = nullptr;
- if (auto *Invoke = dyn_cast<InvokeInst>(CS.getInstruction()))
+ if (auto *Invoke = dyn_cast<InvokeInst>(&CB))
InsertBefore =
&SplitEdge(Invoke->getParent(), Invoke->getNormalDest())->front();
else
- InsertBefore = &*std::next(CS.getInstruction()->getIterator());
+ InsertBefore = &*std::next(CB.getIterator());
// Bitcast the return value to the correct type.
- auto *Cast = CastInst::CreateBitOrPointerCast(CS.getInstruction(), RetTy, "",
- InsertBefore);
+ auto *Cast = CastInst::CreateBitOrPointerCast(&CB, RetTy, "", InsertBefore);
if (RetBitCast)
*RetBitCast = Cast;
// Replace all the original uses of the calling instruction with the bitcast.
for (User *U : UsersToUpdate)
- U->replaceUsesOfWith(CS.getInstruction(), Cast);
+ U->replaceUsesOfWith(&CB, Cast);
}
/// Predicate and clone the given call site.
@@ -253,26 +255,91 @@ static void createRetBitCast(CallSite CS, Type *RetTy, CastInst **RetBitCast) {
/// %t2 = phi i32 [ %t0, %else_bb ], [ %t1, %then_bb ]
/// br %normal_dst
///
-static Instruction *versionCallSite(CallSite CS, Value *Callee,
- MDNode *BranchWeights) {
-
- IRBuilder<> Builder(CS.getInstruction());
- Instruction *OrigInst = CS.getInstruction();
+/// An indirect musttail call is processed slightly differently in that:
+/// 1. No merge block is needed for the original and the cloned callsite, since
+/// either one ends the flow. No phi node is needed either.
+/// 2. The return statement following the original call site is duplicated too
+/// and placed immediately after the cloned call site per the IR convention.
+///
+/// For example, the musttail call instruction below:
+///
+/// orig_bb:
+/// %t0 = musttail call i32 %ptr()
+/// ...
+///
+/// Is replaced by the following:
+///
+/// cond_bb:
+/// %cond = icmp eq i32 ()* %ptr, @func
+/// br i1 %cond, %then_bb, %orig_bb
+///
+/// then_bb:
+/// ; The clone of the original call instruction is placed in the "then"
+/// ; block. It is not yet promoted.
+/// %t1 = musttail call i32 %ptr()
+/// ret %t1
+///
+/// orig_bb:
+/// ; The original call instruction stays in its original block.
+/// %t0 = musttail call i32 %ptr()
+/// ret %t0
+static CallBase &versionCallSite(CallBase &CB, Value *Callee,
+ MDNode *BranchWeights) {
+
+ IRBuilder<> Builder(&CB);
+ CallBase *OrigInst = &CB;
BasicBlock *OrigBlock = OrigInst->getParent();
// Create the compare. The called value and callee must have the same type to
// be compared.
- if (CS.getCalledValue()->getType() != Callee->getType())
- Callee = Builder.CreateBitCast(Callee, CS.getCalledValue()->getType());
- auto *Cond = Builder.CreateICmpEQ(CS.getCalledValue(), Callee);
+ if (CB.getCalledOperand()->getType() != Callee->getType())
+ Callee = Builder.CreateBitCast(Callee, CB.getCalledOperand()->getType());
+ auto *Cond = Builder.CreateICmpEQ(CB.getCalledOperand(), Callee);
+
+ if (OrigInst->isMustTailCall()) {
+ // Create an if-then structure. The original instruction stays in its block,
+ // and a clone of the original instruction is placed in the "then" block.
+ Instruction *ThenTerm =
+ SplitBlockAndInsertIfThen(Cond, &CB, false, BranchWeights);
+ BasicBlock *ThenBlock = ThenTerm->getParent();
+ ThenBlock->setName("if.true.direct_targ");
+ CallBase *NewInst = cast<CallBase>(OrigInst->clone());
+ NewInst->insertBefore(ThenTerm);
+
+ // Place a clone of the optional bitcast after the new call site.
+ Value *NewRetVal = NewInst;
+ auto Next = OrigInst->getNextNode();
+ if (auto *BitCast = dyn_cast_or_null<BitCastInst>(Next)) {
+ assert(BitCast->getOperand(0) == OrigInst &&
+ "bitcast following musttail call must use the call");
+ auto NewBitCast = BitCast->clone();
+ NewBitCast->replaceUsesOfWith(OrigInst, NewInst);
+ NewBitCast->insertBefore(ThenTerm);
+ NewRetVal = NewBitCast;
+ Next = BitCast->getNextNode();
+ }
+
+ // Place a clone of the return instruction after the new call site.
+ ReturnInst *Ret = dyn_cast_or_null<ReturnInst>(Next);
+ assert(Ret && "musttail call must precede a ret with an optional bitcast");
+ auto NewRet = Ret->clone();
+ if (Ret->getReturnValue())
+ NewRet->replaceUsesOfWith(Ret->getReturnValue(), NewRetVal);
+ NewRet->insertBefore(ThenTerm);
+
+    // A return instruction is a terminator, so we don't need the terminator
+ // instruction just created.
+ ThenTerm->eraseFromParent();
+
+ return *NewInst;
+ }
// Create an if-then-else structure. The original instruction is moved into
// the "else" block, and a clone of the original instruction is placed in the
// "then" block.
Instruction *ThenTerm = nullptr;
Instruction *ElseTerm = nullptr;
- SplitBlockAndInsertIfThenElse(Cond, CS.getInstruction(), &ThenTerm, &ElseTerm,
- BranchWeights);
+ SplitBlockAndInsertIfThenElse(Cond, &CB, &ThenTerm, &ElseTerm, BranchWeights);
BasicBlock *ThenBlock = ThenTerm->getParent();
BasicBlock *ElseBlock = ElseTerm->getParent();
BasicBlock *MergeBlock = OrigInst->getParent();
@@ -281,7 +348,7 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
ElseBlock->setName("if.false.orig_indirect");
MergeBlock->setName("if.end.icp");
- Instruction *NewInst = OrigInst->clone();
+ CallBase *NewInst = cast<CallBase>(OrigInst->clone());
OrigInst->moveBefore(ElseTerm);
NewInst->insertBefore(ThenTerm);
@@ -313,18 +380,18 @@ static Instruction *versionCallSite(CallSite CS, Value *Callee,
// Create a phi node for the returned value of the call site.
createRetPHINode(OrigInst, NewInst, MergeBlock, Builder);
- return NewInst;
+ return *NewInst;
}
-bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
+bool llvm::isLegalToPromote(const CallBase &CB, Function *Callee,
const char **FailureReason) {
- assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");
+ assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
auto &DL = Callee->getParent()->getDataLayout();
// Check the return type. The callee's return value type must be bitcast
// compatible with the call site's type.
- Type *CallRetTy = CS.getInstruction()->getType();
+ Type *CallRetTy = CB.getType();
Type *FuncRetTy = Callee->getReturnType();
if (CallRetTy != FuncRetTy)
if (!CastInst::isBitOrNoopPointerCastable(FuncRetTy, CallRetTy, DL)) {
@@ -336,9 +403,12 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
// The number of formal arguments of the callee.
unsigned NumParams = Callee->getFunctionType()->getNumParams();
+ // The number of actual arguments in the call.
+ unsigned NumArgs = CB.arg_size();
+
// Check the number of arguments. The callee and call site must agree on the
// number of arguments.
- if (CS.arg_size() != NumParams && !Callee->isVarArg()) {
+ if (NumArgs != NumParams && !Callee->isVarArg()) {
if (FailureReason)
*FailureReason = "The number of arguments mismatch";
return false;
@@ -347,9 +417,10 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
// Check the argument types. The callee's formal argument types must be
// bitcast compatible with the corresponding actual argument types of the call
// site.
- for (unsigned I = 0; I < NumParams; ++I) {
+ unsigned I = 0;
+ for (; I < NumParams; ++I) {
Type *FormalTy = Callee->getFunctionType()->getFunctionParamType(I);
- Type *ActualTy = CS.getArgument(I)->getType();
+ Type *ActualTy = CB.getArgOperand(I)->getType();
if (FormalTy == ActualTy)
continue;
if (!CastInst::isBitOrNoopPointerCastable(ActualTy, FormalTy, DL)) {
@@ -358,35 +429,43 @@ bool llvm::isLegalToPromote(CallSite CS, Function *Callee,
return false;
}
}
+ for (; I < NumArgs; I++) {
+    // Vararg functions can have more arguments than parameters.
+ assert(Callee->isVarArg());
+    if (CB.paramHasAttr(I, Attribute::StructRet)) {
+      if (FailureReason)
+        *FailureReason = "SRet arg to vararg function";
+      return false;
+    }
+ }
return true;
}
-Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
- CastInst **RetBitCast) {
- assert(!CS.getCalledFunction() && "Only indirect call sites can be promoted");
+CallBase &llvm::promoteCall(CallBase &CB, Function *Callee,
+ CastInst **RetBitCast) {
+ assert(!CB.getCalledFunction() && "Only indirect call sites can be promoted");
// Set the called function of the call site to be the given callee (but don't
// change the type).
- cast<CallBase>(CS.getInstruction())->setCalledOperand(Callee);
+ CB.setCalledOperand(Callee);
// Since the call site will no longer be direct, we must clear metadata that
// is only appropriate for indirect calls. This includes !prof and !callees
// metadata.
- CS.getInstruction()->setMetadata(LLVMContext::MD_prof, nullptr);
- CS.getInstruction()->setMetadata(LLVMContext::MD_callees, nullptr);
+ CB.setMetadata(LLVMContext::MD_prof, nullptr);
+ CB.setMetadata(LLVMContext::MD_callees, nullptr);
// If the function type of the call site matches that of the callee, no
// additional work is required.
- if (CS.getFunctionType() == Callee->getFunctionType())
- return CS.getInstruction();
+ if (CB.getFunctionType() == Callee->getFunctionType())
+ return CB;
// Save the return types of the call site and callee.
- Type *CallSiteRetTy = CS.getInstruction()->getType();
+ Type *CallSiteRetTy = CB.getType();
Type *CalleeRetTy = Callee->getReturnType();
// Change the function type of the call site the match that of the callee.
- CS.mutateFunctionType(Callee->getFunctionType());
+ CB.mutateFunctionType(Callee->getFunctionType());
// Inspect the arguments of the call site. If an argument's type doesn't
// match the corresponding formal argument's type in the callee, bitcast it
@@ -395,19 +474,18 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
auto CalleeParamNum = CalleeType->getNumParams();
LLVMContext &Ctx = Callee->getContext();
- const AttributeList &CallerPAL = CS.getAttributes();
+ const AttributeList &CallerPAL = CB.getAttributes();
// The new list of argument attributes.
SmallVector<AttributeSet, 4> NewArgAttrs;
bool AttributeChanged = false;
for (unsigned ArgNo = 0; ArgNo < CalleeParamNum; ++ArgNo) {
- auto *Arg = CS.getArgument(ArgNo);
+ auto *Arg = CB.getArgOperand(ArgNo);
Type *FormalTy = CalleeType->getParamType(ArgNo);
Type *ActualTy = Arg->getType();
if (FormalTy != ActualTy) {
- auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "",
- CS.getInstruction());
- CS.setArgument(ArgNo, Cast);
+ auto *Cast = CastInst::CreateBitOrPointerCast(Arg, FormalTy, "", &CB);
+ CB.setArgOperand(ArgNo, Cast);
// Remove any incompatible attributes for the argument.
AttrBuilder ArgAttrs(CallerPAL.getParamAttributes(ArgNo));
@@ -432,30 +510,89 @@ Instruction *llvm::promoteCall(CallSite CS, Function *Callee,
// Remove any incompatible return value attribute.
AttrBuilder RAttrs(CallerPAL, AttributeList::ReturnIndex);
if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) {
- createRetBitCast(CS, CallSiteRetTy, RetBitCast);
+ createRetBitCast(CB, CallSiteRetTy, RetBitCast);
RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy));
AttributeChanged = true;
}
// Set the new callsite attribute.
if (AttributeChanged)
- CS.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
+ CB.setAttributes(AttributeList::get(Ctx, CallerPAL.getFnAttributes(),
AttributeSet::get(Ctx, RAttrs),
NewArgAttrs));
- return CS.getInstruction();
+ return CB;
}
-Instruction *llvm::promoteCallWithIfThenElse(CallSite CS, Function *Callee,
- MDNode *BranchWeights) {
+CallBase &llvm::promoteCallWithIfThenElse(CallBase &CB, Function *Callee,
+ MDNode *BranchWeights) {
// Version the indirect call site. If the called value is equal to the given
// callee, 'NewInst' will be executed, otherwise the original call site will
// be executed.
- Instruction *NewInst = versionCallSite(CS, Callee, BranchWeights);
+ CallBase &NewInst = versionCallSite(CB, Callee, BranchWeights);
// Promote 'NewInst' so that it directly calls the desired function.
- return promoteCall(CallSite(NewInst), Callee);
+ return promoteCall(NewInst, Callee);
+}
+
+bool llvm::tryPromoteCall(CallBase &CB) {
+ assert(!CB.getCalledFunction());
+ Module *M = CB.getCaller()->getParent();
+ const DataLayout &DL = M->getDataLayout();
+ Value *Callee = CB.getCalledOperand();
+
+ LoadInst *VTableEntryLoad = dyn_cast<LoadInst>(Callee);
+ if (!VTableEntryLoad)
+ return false; // Not a vtable entry load.
+ Value *VTableEntryPtr = VTableEntryLoad->getPointerOperand();
+ APInt VTableOffset(DL.getTypeSizeInBits(VTableEntryPtr->getType()), 0);
+ Value *VTableBasePtr = VTableEntryPtr->stripAndAccumulateConstantOffsets(
+ DL, VTableOffset, /* AllowNonInbounds */ true);
+ LoadInst *VTablePtrLoad = dyn_cast<LoadInst>(VTableBasePtr);
+ if (!VTablePtrLoad)
+ return false; // Not a vtable load.
+ Value *Object = VTablePtrLoad->getPointerOperand();
+ APInt ObjectOffset(DL.getTypeSizeInBits(Object->getType()), 0);
+ Value *ObjectBase = Object->stripAndAccumulateConstantOffsets(
+ DL, ObjectOffset, /* AllowNonInbounds */ true);
+ if (!(isa<AllocaInst>(ObjectBase) && ObjectOffset == 0))
+ // Not an Alloca or the offset isn't zero.
+ return false;
+
+ // Look for the vtable pointer store into the object by the ctor.
+ BasicBlock::iterator BBI(VTablePtrLoad);
+ Value *VTablePtr = FindAvailableLoadedValue(
+ VTablePtrLoad, VTablePtrLoad->getParent(), BBI, 0, nullptr, nullptr);
+ if (!VTablePtr)
+ return false; // No vtable found.
+ APInt VTableOffsetGVBase(DL.getTypeSizeInBits(VTablePtr->getType()), 0);
+ Value *VTableGVBase = VTablePtr->stripAndAccumulateConstantOffsets(
+ DL, VTableOffsetGVBase, /* AllowNonInbounds */ true);
+ GlobalVariable *GV = dyn_cast<GlobalVariable>(VTableGVBase);
+ if (!(GV && GV->isConstant() && GV->hasDefinitiveInitializer()))
+ // Not in the form of a global constant variable with an initializer.
+ return false;
+
+ Constant *VTableGVInitializer = GV->getInitializer();
+ APInt VTableGVOffset = VTableOffsetGVBase + VTableOffset;
+ if (!(VTableGVOffset.getActiveBits() <= 64))
+ return false; // Out of range.
+ Constant *Ptr = getPointerAtOffset(VTableGVInitializer,
+ VTableGVOffset.getZExtValue(),
+ *M);
+ if (!Ptr)
+ return false; // No constant (function) pointer found.
+ Function *DirectCallee = dyn_cast<Function>(Ptr->stripPointerCasts());
+ if (!DirectCallee)
+ return false; // No function pointer found.
+
+ if (!isLegalToPromote(CB, DirectCallee))
+ return false;
+
+ // Success.
+ promoteCall(CB, DirectCallee);
+ return true;
}
#undef DEBUG_TYPE
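A hedged usage sketch of the CallBase-based entry points introduced above, roughly how a profile-guided promoter might drive them (the 90/10 weights are made up for illustration):

  #include "llvm/IR/MDBuilder.h"
  #include "llvm/Transforms/Utils/CallPromotionUtils.h"
  using namespace llvm;

  static bool promoteIfLegal(CallBase &CB, Function *Target) {
    const char *Reason = nullptr;
    if (CB.getCalledFunction() || !isLegalToPromote(CB, Target, &Reason))
      return false; // already direct, or a type/argument mismatch in Reason
    MDBuilder MDB(CB.getContext());
    MDNode *Weights = MDB.createBranchWeights(/*TrueWeight=*/90,
                                              /*FalseWeight=*/10);
    promoteCallWithIfThenElse(CB, Target, Weights);
    return true;
  }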
diff --git a/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
new file mode 100644
index 000000000000..1ae17c64b8f6
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/CanonicalizeFreezeInLoops.cpp
@@ -0,0 +1,250 @@
+//==- CanonicalizeFreezeInLoops - Canonicalize freezes in a loop-*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass canonicalizes freeze instructions in a loop by pushing them out to
+// the preheader.
+//
+// loop:
+// i = phi init, i.next
+// i.next = add nsw i, 1
+// i.next.fr = freeze i.next // push this out of this loop
+// use(i.next.fr)
+// br i1 (i.next <= N), loop, exit
+// =>
+// init.fr = freeze init
+// loop:
+// i = phi init.fr, i.next
+// i.next = add i, 1 // nsw is dropped here
+// use(i.next)
+// br i1 (i.next <= N), loop, exit
+//
+// Removing freezes from these chains helps scalar evolution successfully analyze
+// expressions.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/Analysis/IVDescriptors.h"
+#include "llvm/Analysis/IVUsers.h"
+#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/ScalarEvolution.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/Debug.h"
+#include "llvm/Transforms/Utils.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "canon-freeze"
+
+namespace {
+
+class CanonicalizeFreezeInLoops : public LoopPass {
+public:
+ static char ID;
+
+ CanonicalizeFreezeInLoops();
+
+private:
+ bool runOnLoop(Loop *L, LPPassManager &LPM) override;
+ void getAnalysisUsage(AnalysisUsage &AU) const override;
+};
+
+class CanonicalizeFreezeInLoopsImpl {
+ Loop *L;
+ ScalarEvolution &SE;
+ DominatorTree &DT;
+
+ struct FrozenIndPHIInfo {
+ // A freeze instruction that uses an induction phi
+ FreezeInst *FI = nullptr;
+    // The induction phi, the step instruction, and the operand index of
+    // StepInst that holds the step value.
+ PHINode *PHI;
+ BinaryOperator *StepInst;
+ unsigned StepValIdx = 0;
+
+ FrozenIndPHIInfo(PHINode *PHI, BinaryOperator *StepInst)
+ : PHI(PHI), StepInst(StepInst) {}
+ };
+
+  // Can a freeze instruction be pushed into the operands of I?
+  // For that to be legal, I must not create poison after I's flags are
+  // stripped.
+ bool canHandleInst(const Instruction *I) {
+ auto Opc = I->getOpcode();
+ // If add/sub/mul, drop nsw/nuw flags.
+ return Opc == Instruction::Add || Opc == Instruction::Sub ||
+ Opc == Instruction::Mul;
+ }
+
+ void InsertFreezeAndForgetFromSCEV(Use &U);
+
+public:
+ CanonicalizeFreezeInLoopsImpl(Loop *L, ScalarEvolution &SE, DominatorTree &DT)
+ : L(L), SE(SE), DT(DT) {}
+ bool run();
+};
+
+} // anonymous namespace
+
+// Given U = (value, user), replace value with freeze(value), and let
+// SCEV forget user. The inserted freeze is placed in the preheader.
+void CanonicalizeFreezeInLoopsImpl::InsertFreezeAndForgetFromSCEV(Use &U) {
+ auto *PH = L->getLoopPreheader();
+
+ auto *UserI = cast<Instruction>(U.getUser());
+ auto *ValueToFr = U.get();
+ assert(L->contains(UserI->getParent()) &&
+ "Should not process an instruction that isn't inside the loop");
+ if (isGuaranteedNotToBeUndefOrPoison(ValueToFr, UserI, &DT))
+ return;
+
+ LLVM_DEBUG(dbgs() << "canonfr: inserting freeze:\n");
+ LLVM_DEBUG(dbgs() << "\tUser: " << *U.getUser() << "\n");
+ LLVM_DEBUG(dbgs() << "\tOperand: " << *U.get() << "\n");
+
+ U.set(new FreezeInst(ValueToFr, ValueToFr->getName() + ".frozen",
+ PH->getTerminator()));
+
+ SE.forgetValue(UserI);
+}
+
+bool CanonicalizeFreezeInLoopsImpl::run() {
+ // The loop should be in LoopSimplify form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ SmallVector<FrozenIndPHIInfo, 4> Candidates;
+
+ for (auto &PHI : L->getHeader()->phis()) {
+ InductionDescriptor ID;
+ if (!InductionDescriptor::isInductionPHI(&PHI, L, &SE, ID))
+ continue;
+
+ LLVM_DEBUG(dbgs() << "canonfr: PHI: " << PHI << "\n");
+ FrozenIndPHIInfo Info(&PHI, ID.getInductionBinOp());
+ if (!Info.StepInst || !canHandleInst(Info.StepInst)) {
+ // The stepping instruction has unknown form.
+ // Ignore this PHI.
+ continue;
+ }
+
+ Info.StepValIdx = Info.StepInst->getOperand(0) == &PHI;
+ Value *StepV = Info.StepInst->getOperand(Info.StepValIdx);
+ if (auto *StepI = dyn_cast<Instruction>(StepV)) {
+ if (L->contains(StepI->getParent())) {
+        // The step value is inside the loop. Freezing the step value would
+        // introduce another freeze into the loop, so skip this PHI.
+ continue;
+ }
+ }
+
+ auto Visit = [&](User *U) {
+ if (auto *FI = dyn_cast<FreezeInst>(U)) {
+ LLVM_DEBUG(dbgs() << "canonfr: found: " << *FI << "\n");
+ Info.FI = FI;
+ Candidates.push_back(Info);
+ }
+ };
+ for_each(PHI.users(), Visit);
+ for_each(Info.StepInst->users(), Visit);
+ }
+
+ if (Candidates.empty())
+ return false;
+
+ SmallSet<PHINode *, 8> ProcessedPHIs;
+ for (const auto &Info : Candidates) {
+ PHINode *PHI = Info.PHI;
+ if (!ProcessedPHIs.insert(Info.PHI).second)
+ continue;
+
+ BinaryOperator *StepI = Info.StepInst;
+ assert(StepI && "Step instruction should have been found");
+
+ // Drop flags from the step instruction.
+ if (!isGuaranteedNotToBeUndefOrPoison(StepI, StepI, &DT)) {
+ LLVM_DEBUG(dbgs() << "canonfr: drop flags: " << *StepI << "\n");
+ StepI->dropPoisonGeneratingFlags();
+ SE.forgetValue(StepI);
+ }
+
+ InsertFreezeAndForgetFromSCEV(StepI->getOperandUse(Info.StepValIdx));
+
+ unsigned OperandIdx =
+ PHI->getOperandNumForIncomingValue(PHI->getIncomingValue(0) == StepI);
+ InsertFreezeAndForgetFromSCEV(PHI->getOperandUse(OperandIdx));
+ }
+
+ // Finally, remove the old freeze instructions.
+ for (const auto &Item : Candidates) {
+ auto *FI = Item.FI;
+ LLVM_DEBUG(dbgs() << "canonfr: removing " << *FI << "\n");
+ SE.forgetValue(FI);
+ FI->replaceAllUsesWith(FI->getOperand(0));
+ FI->eraseFromParent();
+ }
+
+ return true;
+}
+
+CanonicalizeFreezeInLoops::CanonicalizeFreezeInLoops() : LoopPass(ID) {
+ initializeCanonicalizeFreezeInLoopsPass(*PassRegistry::getPassRegistry());
+}
+
+void CanonicalizeFreezeInLoops::getAnalysisUsage(AnalysisUsage &AU) const {
+ AU.addPreservedID(LoopSimplifyID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addRequiredID(LoopSimplifyID);
+ AU.addRequired<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+}
+
+bool CanonicalizeFreezeInLoops::runOnLoop(Loop *L, LPPassManager &) {
+ if (skipLoop(L))
+ return false;
+
+ auto &SE = getAnalysis<ScalarEvolutionWrapperPass>().getSE();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ return CanonicalizeFreezeInLoopsImpl(L, SE, DT).run();
+}
+
+PreservedAnalyses
+CanonicalizeFreezeInLoopsPass::run(Loop &L, LoopAnalysisManager &AM,
+ LoopStandardAnalysisResults &AR,
+ LPMUpdater &U) {
+ if (!CanonicalizeFreezeInLoopsImpl(&L, AR.SE, AR.DT).run())
+ return PreservedAnalyses::all();
+
+ return getLoopPassPreservedAnalyses();
+}
+
+INITIALIZE_PASS_BEGIN(CanonicalizeFreezeInLoops, "canon-freeze",
+ "Canonicalize Freeze Instructions in Loops", false, false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
+INITIALIZE_PASS_END(CanonicalizeFreezeInLoops, "canon-freeze",
+ "Canonicalize Freeze Instructions in Loops", false, false)
+
+Pass *llvm::createCanonicalizeFreezeInLoopsPass() {
+ return new CanonicalizeFreezeInLoops();
+}
+
+char CanonicalizeFreezeInLoops::ID = 0;
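A hedged sketch of how the new pass might be scheduled (the surrounding pipeline set-up is assumed): the legacy pass is created through createCanonicalizeFreezeInLoopsPass(), while the new-PM class runs as a loop pass wrapped by the usual adaptor.

  #include "llvm/IR/PassManager.h"
  #include "llvm/Transforms/Scalar/LoopPassManager.h"
  #include "llvm/Transforms/Utils/CanonicalizeFreezeInLoops.h"
  using namespace llvm;

  static void addCanonFreeze(FunctionPassManager &FPM) {
    // Wrap the loop pass so it can run inside a function pipeline.
    FPM.addPass(
        createFunctionToLoopPassAdaptor(CanonicalizeFreezeInLoopsPass()));
  }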
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index 75e8963303c2..788983c15690 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -46,7 +46,7 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
if (BB->hasName())
NewBB->setName(BB->getName() + NameSuffix);
- bool hasCalls = false, hasDynamicAllocas = false, hasStaticAllocas = false;
+ bool hasCalls = false, hasDynamicAllocas = false;
Module *TheModule = F ? F->getParent() : nullptr;
// Loop over all instructions, and copy them over.
@@ -62,18 +62,15 @@ BasicBlock *llvm::CloneBasicBlock(const BasicBlock *BB, ValueToValueMapTy &VMap,
hasCalls |= (isa<CallInst>(I) && !isa<DbgInfoIntrinsic>(I));
if (const AllocaInst *AI = dyn_cast<AllocaInst>(&I)) {
- if (isa<ConstantInt>(AI->getArraySize()))
- hasStaticAllocas = true;
- else
+ if (!AI->isStaticAlloca()) {
hasDynamicAllocas = true;
+ }
}
}
if (CodeInfo) {
CodeInfo->ContainsCalls |= hasCalls;
CodeInfo->ContainsDynamicAllocas |= hasDynamicAllocas;
- CodeInfo->ContainsDynamicAllocas |= hasStaticAllocas &&
- BB != &BB->getParent()->getEntryBlock();
}
return NewBB;
}
@@ -367,8 +364,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
hasCalls |= (isa<CallInst>(II) && !isa<DbgInfoIntrinsic>(II));
if (CodeInfo)
- if (auto CS = ImmutableCallSite(&*II))
- if (CS.hasOperandBundles())
+ if (auto *CB = dyn_cast<CallBase>(&*II))
+ if (CB->hasOperandBundles())
CodeInfo->OperandBundleCallSites.push_back(NewInst);
if (const AllocaInst *AI = dyn_cast<AllocaInst>(II)) {
@@ -424,8 +421,8 @@ void PruningFunctionCloner::CloneBlock(const BasicBlock *BB,
VMap[OldTI] = NewInst; // Add instruction map to value.
if (CodeInfo)
- if (auto CS = ImmutableCallSite(OldTI))
- if (CS.hasOperandBundles())
+ if (auto *CB = dyn_cast<CallBase>(OldTI))
+ if (CB->hasOperandBundles())
CodeInfo->OperandBundleCallSites.push_back(NewInst);
// Recursively clone any reachable successor blocks.
@@ -619,8 +616,9 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc,
// Skip over non-intrinsic callsites, we don't want to remove any nodes from
// the CGSCC.
- CallSite CS = CallSite(I);
- if (CS && CS.getCalledFunction() && !CS.getCalledFunction()->isIntrinsic())
+ CallBase *CB = dyn_cast<CallBase>(I);
+ if (CB && CB->getCalledFunction() &&
+ !CB->getCalledFunction()->isIntrinsic())
continue;
// See if this instruction simplifies.
@@ -804,8 +802,6 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
// Update LoopInfo.
NewLoop->addBasicBlockToLoop(NewBB, *LI);
- if (BB == CurLoop->getHeader())
- NewLoop->moveToHeader(NewBB);
// Add DominatorTree node. After seeing all blocks, update to correct
// IDom.
@@ -815,6 +811,11 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
}
for (BasicBlock *BB : OrigLoop->getBlocks()) {
+ // Update loop headers.
+ Loop *CurLoop = LI->getLoopFor(BB);
+ if (BB == CurLoop->getHeader())
+ LMap[CurLoop]->moveToHeader(cast<BasicBlock>(VMap[BB]));
+
// Update DominatorTree.
BasicBlock *IDomBB = DT->getNode(BB)->getIDom()->getBlock();
DT->changeImmediateDominator(cast<BasicBlock>(VMap[BB]),
diff --git a/llvm/lib/Transforms/Utils/CodeExtractor.cpp b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
index 682af4a88d3e..8cdbb9d35652 100644
--- a/llvm/lib/Transforms/Utils/CodeExtractor.cpp
+++ b/llvm/lib/Transforms/Utils/CodeExtractor.cpp
@@ -31,11 +31,14 @@
#include "llvm/IR/CFG.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
+#include "llvm/IR/DIBuilder.h"
#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/InstIterator.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/IR/Instruction.h"
#include "llvm/IR/Instructions.h"
@@ -448,18 +451,24 @@ CodeExtractor::getLifetimeMarkers(const CodeExtractorAnalysisCache &CEAC,
for (User *U : Addr->users()) {
IntrinsicInst *IntrInst = dyn_cast<IntrinsicInst>(U);
if (IntrInst) {
+ // We don't model addresses with multiple start/end markers, but the
+ // markers do not need to be in the region.
if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_start) {
- // Do not handle the case where Addr has multiple start markers.
if (Info.LifeStart)
return {};
Info.LifeStart = IntrInst;
+ continue;
}
if (IntrInst->getIntrinsicID() == Intrinsic::lifetime_end) {
if (Info.LifeEnd)
return {};
Info.LifeEnd = IntrInst;
+ continue;
}
- continue;
+ // At this point, permit debug uses outside of the region.
+ // This is fixed in a later call to fixupDebugInfoPostExtraction().
+ if (isa<DbgInfoIntrinsic>(IntrInst))
+ continue;
}
// Find untracked uses of the address, bail.
if (!definedInRegion(Blocks, U))
@@ -865,10 +874,13 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::NoAlias:
case Attribute::NoBuiltin:
case Attribute::NoCapture:
+ case Attribute::NoMerge:
case Attribute::NoReturn:
case Attribute::NoSync:
+ case Attribute::NoUndef:
case Attribute::None:
case Attribute::NonNull:
+ case Attribute::Preallocated:
case Attribute::ReadNone:
case Attribute::ReadOnly:
case Attribute::Returned:
@@ -884,6 +896,8 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::ZExt:
case Attribute::ImmArg:
case Attribute::EndAttrKinds:
+ case Attribute::EmptyKey:
+ case Attribute::TombstoneKey:
continue;
// Those attributes should be safe to propagate to the extracted function.
case Attribute::AlwaysInline:
@@ -898,6 +912,7 @@ Function *CodeExtractor::constructFunction(const ValueSet &inputs,
case Attribute::NonLazyBind:
case Attribute::NoRedZone:
case Attribute::NoUnwind:
+ case Attribute::NullPointerIsValid:
case Attribute::OptForFuzzing:
case Attribute::OptimizeNone:
case Attribute::OptimizeForSize:
@@ -1120,8 +1135,7 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
GetElementPtrInst *GEP = GetElementPtrInst::Create(
StructArgTy, Struct, Idx, "gep_" + StructValues[i]->getName());
codeReplacer->getInstList().push_back(GEP);
- StoreInst *SI = new StoreInst(StructValues[i], GEP);
- codeReplacer->getInstList().push_back(SI);
+ new StoreInst(StructValues[i], GEP, codeReplacer);
}
}
@@ -1164,9 +1178,9 @@ CallInst *CodeExtractor::emitCallAndSwitchStatement(Function *newFunction,
Output = ReloadOutputs[i];
}
LoadInst *load = new LoadInst(outputs[i]->getType(), Output,
- outputs[i]->getName() + ".reload");
+ outputs[i]->getName() + ".reload",
+ codeReplacer);
Reloads.push_back(load);
- codeReplacer->getInstList().push_back(load);
std::vector<User *> Users(outputs[i]->user_begin(), outputs[i]->user_end());
for (unsigned u = 0, e = Users.size(); u != e; ++u) {
Instruction *inst = cast<Instruction>(Users[u]);
@@ -1351,6 +1365,9 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
// Block Frequency distribution with dummy node.
Distribution BranchDist;
+ SmallVector<BranchProbability, 4> EdgeProbabilities(
+ TI->getNumSuccessors(), BranchProbability::getUnknown());
+
// Add each of the frequencies of the successors.
for (unsigned i = 0, e = TI->getNumSuccessors(); i < e; ++i) {
BlockNode ExitNode(i);
@@ -1358,12 +1375,14 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
if (ExitFreq != 0)
BranchDist.addExit(ExitNode, ExitFreq);
else
- BPI->setEdgeProbability(CodeReplacer, i, BranchProbability::getZero());
+ EdgeProbabilities[i] = BranchProbability::getZero();
}
// Check for no total weight.
- if (BranchDist.Total == 0)
+ if (BranchDist.Total == 0) {
+ BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
return;
+ }
// Normalize the distribution so that they can fit in unsigned.
BranchDist.normalize();
@@ -1375,13 +1394,133 @@ void CodeExtractor::calculateNewCallTerminatorWeights(
// Get the weight and update the current BFI.
BranchWeights[Weight.TargetNode.Index] = Weight.Amount;
BranchProbability BP(Weight.Amount, BranchDist.Total);
- BPI->setEdgeProbability(CodeReplacer, Weight.TargetNode.Index, BP);
+ EdgeProbabilities[Weight.TargetNode.Index] = BP;
}
+ BPI->setEdgeProbability(CodeReplacer, EdgeProbabilities);
TI->setMetadata(
LLVMContext::MD_prof,
MDBuilder(TI->getContext()).createBranchWeights(BranchWeights));
}
+/// Erase debug info intrinsics which refer to values in \p F but aren't in
+/// \p F.
+static void eraseDebugIntrinsicsWithNonLocalRefs(Function &F) {
+ for (Instruction &I : instructions(F)) {
+ SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
+ findDbgUsers(DbgUsers, &I);
+ for (DbgVariableIntrinsic *DVI : DbgUsers)
+ if (DVI->getFunction() != &F)
+ DVI->eraseFromParent();
+ }
+}
+
+/// Fix up the debug info in the old and new functions by pointing line
+/// locations and debug intrinsics to the new subprogram scope, and by deleting
+/// intrinsics which point to values outside of the new function.
+static void fixupDebugInfoPostExtraction(Function &OldFunc, Function &NewFunc,
+ CallInst &TheCall) {
+ DISubprogram *OldSP = OldFunc.getSubprogram();
+ LLVMContext &Ctx = OldFunc.getContext();
+
+ if (!OldSP) {
+ // Erase any debug info the new function contains.
+ stripDebugInfo(NewFunc);
+ // Make sure the old function doesn't contain any non-local metadata refs.
+ eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
+ return;
+ }
+
+ // Create a subprogram for the new function. Leave out a description of the
+ // function arguments, as the parameters don't correspond to anything at the
+ // source level.
+ assert(OldSP->getUnit() && "Missing compile unit for subprogram");
+ DIBuilder DIB(*OldFunc.getParent(), /*AllowUnresolvedNodes=*/false,
+ OldSP->getUnit());
+ auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
+ DISubprogram::DISPFlags SPFlags = DISubprogram::SPFlagDefinition |
+ DISubprogram::SPFlagOptimized |
+ DISubprogram::SPFlagLocalToUnit;
+ auto NewSP = DIB.createFunction(
+ OldSP->getUnit(), NewFunc.getName(), NewFunc.getName(), OldSP->getFile(),
+ /*LineNo=*/0, SPType, /*ScopeLine=*/0, DINode::FlagZero, SPFlags);
+ NewFunc.setSubprogram(NewSP);
+
+ // Debug intrinsics in the new function need to be updated in one of two
+ // ways:
+ // 1) They need to be deleted, because they describe a value in the old
+ // function.
+ // 2) They need to point to fresh metadata, e.g. because they currently
+ // point to a variable in the wrong scope.
+ SmallDenseMap<DINode *, DINode *> RemappedMetadata;
+ SmallVector<Instruction *, 4> DebugIntrinsicsToDelete;
+ for (Instruction &I : instructions(NewFunc)) {
+ auto *DII = dyn_cast<DbgInfoIntrinsic>(&I);
+ if (!DII)
+ continue;
+
+ // Point the intrinsic to a fresh label within the new function.
+ if (auto *DLI = dyn_cast<DbgLabelInst>(&I)) {
+ DILabel *OldLabel = DLI->getLabel();
+ DINode *&NewLabel = RemappedMetadata[OldLabel];
+ if (!NewLabel)
+ NewLabel = DILabel::get(Ctx, NewSP, OldLabel->getName(),
+ OldLabel->getFile(), OldLabel->getLine());
+ DLI->setArgOperand(0, MetadataAsValue::get(Ctx, NewLabel));
+ continue;
+ }
+
+ // If the location isn't a constant or an instruction, delete the
+ // intrinsic.
+ auto *DVI = cast<DbgVariableIntrinsic>(DII);
+ Value *Location = DVI->getVariableLocation();
+ if (!Location ||
+ (!isa<Constant>(Location) && !isa<Instruction>(Location))) {
+ DebugIntrinsicsToDelete.push_back(DVI);
+ continue;
+ }
+
+ // If the variable location is an instruction but isn't in the new
+ // function, delete the intrinsic.
+ Instruction *LocationInst = dyn_cast<Instruction>(Location);
+ if (LocationInst && LocationInst->getFunction() != &NewFunc) {
+ DebugIntrinsicsToDelete.push_back(DVI);
+ continue;
+ }
+
+ // Point the intrinsic to a fresh variable within the new function.
+ DILocalVariable *OldVar = DVI->getVariable();
+ DINode *&NewVar = RemappedMetadata[OldVar];
+ if (!NewVar)
+ NewVar = DIB.createAutoVariable(
+ NewSP, OldVar->getName(), OldVar->getFile(), OldVar->getLine(),
+ OldVar->getType(), /*AlwaysPreserve=*/false, DINode::FlagZero,
+ OldVar->getAlignInBits());
+ DVI->setArgOperand(1, MetadataAsValue::get(Ctx, NewVar));
+ }
+ for (auto *DII : DebugIntrinsicsToDelete)
+ DII->eraseFromParent();
+ DIB.finalizeSubprogram(NewSP);
+
+ // Fix up the scope information attached to the line locations in the new
+ // function.
+ for (Instruction &I : instructions(NewFunc)) {
+ if (const DebugLoc &DL = I.getDebugLoc())
+ I.setDebugLoc(DebugLoc::get(DL.getLine(), DL.getCol(), NewSP));
+
+ // Loop info metadata may contain line locations. Fix them up.
+ auto updateLoopInfoLoc = [&Ctx,
+ NewSP](const DILocation &Loc) -> DILocation * {
+ return DILocation::get(Ctx, Loc.getLine(), Loc.getColumn(), NewSP,
+ nullptr);
+ };
+ updateLoopMetadataDebugLocations(I, updateLoopInfoLoc);
+ }
+ if (!TheCall.getDebugLoc())
+ TheCall.setDebugLoc(DebugLoc::get(0, 0, OldSP));
+
+ eraseDebugIntrinsicsWithNonLocalRefs(NewFunc);
+}
+
Function *
CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
if (!isEligible())
@@ -1405,13 +1544,19 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
}
}
- if (AC) {
- // Remove @llvm.assume calls that were moved to the new function from the
- // old function's assumption cache.
- for (BasicBlock *Block : Blocks)
- for (auto &I : *Block)
- if (match(&I, m_Intrinsic<Intrinsic::assume>()))
- AC->unregisterAssumption(cast<CallInst>(&I));
+ // Remove @llvm.assume calls that will be moved to the new function from the
+ // old function's assumption cache.
+ for (BasicBlock *Block : Blocks) {
+ for (auto It = Block->begin(), End = Block->end(); It != End;) {
+ Instruction *I = &*It;
+ ++It;
+
+ if (match(I, m_Intrinsic<Intrinsic::assume>())) {
+ if (AC)
+ AC->unregisterAssumption(cast<CallInst>(I));
+ I->eraseFromParent();
+ }
+ }
}
// If we have any return instructions in the region, split those blocks so
@@ -1567,26 +1712,7 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
}
}
- // Erase debug info intrinsics. Variable updates within the new function are
- // invisible to debuggers. This could be improved by defining a DISubprogram
- // for the new function.
- for (BasicBlock &BB : *newFunction) {
- auto BlockIt = BB.begin();
- // Remove debug info intrinsics from the new function.
- while (BlockIt != BB.end()) {
- Instruction *Inst = &*BlockIt;
- ++BlockIt;
- if (isa<DbgInfoIntrinsic>(Inst))
- Inst->eraseFromParent();
- }
- // Remove debug info intrinsics which refer to values in the new function
- // from the old function.
- SmallVector<DbgVariableIntrinsic *, 4> DbgUsers;
- for (Instruction &I : BB)
- findDbgUsers(DbgUsers, &I);
- for (DbgVariableIntrinsic *DVI : DbgUsers)
- DVI->eraseFromParent();
- }
+ fixupDebugInfoPostExtraction(*oldFunction, *newFunction, *TheCall);
// Mark the new function `noreturn` if applicable. Terminators which resume
// exception propagation are treated as returning instructions. This is to
@@ -1604,17 +1730,36 @@ CodeExtractor::extractCodeRegion(const CodeExtractorAnalysisCache &CEAC) {
});
LLVM_DEBUG(if (verifyFunction(*oldFunction))
report_fatal_error("verification of oldFunction failed!"));
- LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, AC))
- report_fatal_error("Stale Asumption cache for old Function!"));
+ LLVM_DEBUG(if (AC && verifyAssumptionCache(*oldFunction, *newFunction, AC))
+ report_fatal_error("Stale Asumption cache for old Function!"));
return newFunction;
}
-bool CodeExtractor::verifyAssumptionCache(const Function& F,
+bool CodeExtractor::verifyAssumptionCache(const Function &OldFunc,
+ const Function &NewFunc,
AssumptionCache *AC) {
for (auto AssumeVH : AC->assumptions()) {
- CallInst *I = cast<CallInst>(AssumeVH);
- if (I->getFunction() != &F)
+ CallInst *I = dyn_cast_or_null<CallInst>(AssumeVH);
+ if (!I)
+ continue;
+
+ // There shouldn't be any llvm.assume intrinsics in the new function.
+ if (I->getFunction() != &OldFunc)
return true;
+
+ // There shouldn't be any stale affected values in the assumption cache
+ // that were previously in the old function, but that have now been moved
+ // to the new function.
+ for (auto AffectedValVH : AC->assumptionsFor(I->getOperand(0))) {
+ CallInst *AffectedCI = dyn_cast_or_null<CallInst>(AffectedValVH);
+ if (!AffectedCI)
+ continue;
+ if (AffectedCI->getFunction() != &OldFunc)
+ return true;
+ auto *AssumedInst = dyn_cast<Instruction>(AffectedCI->getOperand(0));
+ if (AssumedInst->getFunction() != &OldFunc)
+ return true;
+ }
}
return false;
}
diff --git a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
index 93395ac761ab..08047dc0f96e 100644
--- a/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
+++ b/llvm/lib/Transforms/Utils/CodeMoverUtils.cpp
@@ -12,6 +12,7 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/CodeMoverUtils.h"
+#include "llvm/ADT/Optional.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/DependenceAnalysis.h"
#include "llvm/Analysis/PostDominators.h"
@@ -30,6 +31,201 @@ STATISTIC(NotControlFlowEquivalent,
STATISTIC(NotMovedPHINode, "Movement of PHINodes are not supported");
STATISTIC(NotMovedTerminator, "Movement of Terminator are not supported");
+namespace {
+/// Represent a control condition. A control condition is a condition of a
+/// terminator to decide which successors to execute. The pointer field
+/// represents the address of the condition of the terminator. The integer field
+/// is a bool: it is true if the basic block is executed when the condition is
+/// true. For example, in `br %cond, bb0, bb1`, %cond is a control condition of
+/// bb0 with the integer field equal to true, and a control condition of bb1
+/// with the integer field equal to false.
+using ControlCondition = PointerIntPair<Value *, 1, bool>;
+#ifndef NDEBUG
+raw_ostream &operator<<(raw_ostream &OS, const ControlCondition &C) {
+ OS << "[" << *C.getPointer() << ", " << (C.getInt() ? "true" : "false")
+ << "]";
+ return OS;
+}
+#endif
+
+/// Represent a set of control conditions required to execute ToBB from FromBB.
+class ControlConditions {
+ using ConditionVectorTy = SmallVector<ControlCondition, 6>;
+
+ /// A SmallVector of control conditions.
+ ConditionVectorTy Conditions;
+
+public:
+ /// Return a ControlConditions which stores all conditions required to execute
+ /// \p BB from \p Dominator. If \p MaxLookup is non-zero, it limits the
+ /// number of conditions to collect. Return None if not all conditions are
+ /// collected successfully, or we hit the limit.
+ static const Optional<ControlConditions>
+ collectControlConditions(const BasicBlock &BB, const BasicBlock &Dominator,
+ const DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ unsigned MaxLookup = 6);
+
+ /// Return true if no control conditions are required to execute ToBB from
+ /// FromBB.
+ bool isUnconditional() const { return Conditions.empty(); }
+
+ /// Return a constant reference of Conditions.
+ const ConditionVectorTy &getControlConditions() const { return Conditions; }
+
+ /// Add \p C to Conditions if an equivalent condition is not already present.
+ /// Return true if \p C was inserted.
+ bool addControlCondition(ControlCondition C);
+
+ /// Return true if for all control conditions in Conditions, there exists an
+ /// equivalent control condition in \p Other.Conditions.
+ bool isEquivalent(const ControlConditions &Other) const;
+
+ /// Return true if \p C1 and \p C2 are equivalent.
+ static bool isEquivalent(const ControlCondition &C1,
+ const ControlCondition &C2);
+
+private:
+ ControlConditions() = default;
+
+ static bool isEquivalent(const Value &V1, const Value &V2);
+ static bool isInverse(const Value &V1, const Value &V2);
+};
+} // namespace
+
+static bool domTreeLevelBefore(DominatorTree *DT, const Instruction *InstA,
+ const Instruction *InstB) {
+ // Use ordered basic block in case the 2 instructions are in the same
+ // block.
+ if (InstA->getParent() == InstB->getParent())
+ return InstA->comesBefore(InstB);
+
+ DomTreeNode *DA = DT->getNode(InstA->getParent());
+ DomTreeNode *DB = DT->getNode(InstB->getParent());
+ return DA->getLevel() < DB->getLevel();
+}
+
+const Optional<ControlConditions> ControlConditions::collectControlConditions(
+ const BasicBlock &BB, const BasicBlock &Dominator, const DominatorTree &DT,
+ const PostDominatorTree &PDT, unsigned MaxLookup) {
+ assert(DT.dominates(&Dominator, &BB) && "Expecting Dominator to dominate BB");
+
+ ControlConditions Conditions;
+ unsigned NumConditions = 0;
+
+ // BB is executed unconditionally from itself.
+ if (&Dominator == &BB)
+ return Conditions;
+
+ const BasicBlock *CurBlock = &BB;
+ // Walk up the dominator tree from the associated DT node for BB to the
+ // associated DT node for Dominator.
+ do {
+ assert(DT.getNode(CurBlock) && "Expecting a valid DT node for CurBlock");
+ BasicBlock *IDom = DT.getNode(CurBlock)->getIDom()->getBlock();
+ assert(DT.dominates(&Dominator, IDom) &&
+ "Expecting Dominator to dominate IDom");
+
+ // Limitation: can only handle branch instructions currently.
+ const BranchInst *BI = dyn_cast<BranchInst>(IDom->getTerminator());
+ if (!BI)
+ return None;
+
+ bool Inserted = false;
+ if (PDT.dominates(CurBlock, IDom)) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName()
+ << " is executed unconditionally from "
+ << IDom->getName() << "\n");
+ } else if (PDT.dominates(CurBlock, BI->getSuccessor(0))) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
+ << *BI->getCondition() << "\" is true from "
+ << IDom->getName() << "\n");
+ Inserted = Conditions.addControlCondition(
+ ControlCondition(BI->getCondition(), true));
+ } else if (PDT.dominates(CurBlock, BI->getSuccessor(1))) {
+ LLVM_DEBUG(dbgs() << CurBlock->getName() << " is executed when \""
+ << *BI->getCondition() << "\" is false from "
+ << IDom->getName() << "\n");
+ Inserted = Conditions.addControlCondition(
+ ControlCondition(BI->getCondition(), false));
+ } else
+ return None;
+
+ if (Inserted)
+ ++NumConditions;
+
+ if (MaxLookup != 0 && NumConditions > MaxLookup)
+ return None;
+
+ CurBlock = IDom;
+ } while (CurBlock != &Dominator);
+
+ return Conditions;
+}
+
+bool ControlConditions::addControlCondition(ControlCondition C) {
+ bool Inserted = false;
+ if (none_of(Conditions, [&](ControlCondition &Exists) {
+ return ControlConditions::isEquivalent(C, Exists);
+ })) {
+ Conditions.push_back(C);
+ Inserted = true;
+ }
+
+ LLVM_DEBUG(dbgs() << (Inserted ? "Inserted " : "Not inserted ") << C << "\n");
+ return Inserted;
+}
+
+bool ControlConditions::isEquivalent(const ControlConditions &Other) const {
+ if (Conditions.empty() && Other.Conditions.empty())
+ return true;
+
+ if (Conditions.size() != Other.Conditions.size())
+ return false;
+
+ return all_of(Conditions, [&](const ControlCondition &C) {
+ return any_of(Other.Conditions, [&](const ControlCondition &OtherC) {
+ return ControlConditions::isEquivalent(C, OtherC);
+ });
+ });
+}
+
+bool ControlConditions::isEquivalent(const ControlCondition &C1,
+ const ControlCondition &C2) {
+ if (C1.getInt() == C2.getInt()) {
+ if (isEquivalent(*C1.getPointer(), *C2.getPointer()))
+ return true;
+ } else if (isInverse(*C1.getPointer(), *C2.getPointer()))
+ return true;
+
+ return false;
+}
+
+// FIXME: Use SCEV and reuse GVN/CSE logic to check for equivalence between
+// Values.
+// Currently, isEquivalent relies on other passes to ensure equivalent conditions
+// have the same value, e.g. GVN.
+bool ControlConditions::isEquivalent(const Value &V1, const Value &V2) {
+ return &V1 == &V2;
+}
+
+bool ControlConditions::isInverse(const Value &V1, const Value &V2) {
+ if (const CmpInst *Cmp1 = dyn_cast<CmpInst>(&V1))
+ if (const CmpInst *Cmp2 = dyn_cast<CmpInst>(&V2)) {
+ if (Cmp1->getPredicate() == Cmp2->getInversePredicate() &&
+ Cmp1->getOperand(0) == Cmp2->getOperand(0) &&
+ Cmp1->getOperand(1) == Cmp2->getOperand(1))
+ return true;
+
+ if (Cmp1->getPredicate() ==
+ CmpInst::getSwappedPredicate(Cmp2->getInversePredicate()) &&
+ Cmp1->getOperand(0) == Cmp2->getOperand(1) &&
+ Cmp1->getOperand(1) == Cmp2->getOperand(0))
+ return true;
+ }
+ return false;
+}
+
bool llvm::isControlFlowEquivalent(const Instruction &I0, const Instruction &I1,
const DominatorTree &DT,
const PostDominatorTree &PDT) {
@@ -42,8 +238,30 @@ bool llvm::isControlFlowEquivalent(const BasicBlock &BB0, const BasicBlock &BB1,
if (&BB0 == &BB1)
return true;
- return ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) ||
- (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0)));
+ if ((DT.dominates(&BB0, &BB1) && PDT.dominates(&BB1, &BB0)) ||
+ (PDT.dominates(&BB0, &BB1) && DT.dominates(&BB1, &BB0)))
+ return true;
+
+ // If the set of conditions required to execute BB0 and BB1 from their common
+ // dominator are the same, then BB0 and BB1 are control flow equivalent.
+ const BasicBlock *CommonDominator = DT.findNearestCommonDominator(&BB0, &BB1);
+ LLVM_DEBUG(dbgs() << "The nearest common dominator of " << BB0.getName()
+ << " and " << BB1.getName() << " is "
+ << CommonDominator->getName() << "\n");
+
+ const Optional<ControlConditions> BB0Conditions =
+ ControlConditions::collectControlConditions(BB0, *CommonDominator, DT,
+ PDT);
+ if (BB0Conditions == None)
+ return false;
+
+ const Optional<ControlConditions> BB1Conditions =
+ ControlConditions::collectControlConditions(BB1, *CommonDominator, DT,
+ PDT);
+ if (BB1Conditions == None)
+ return false;
+
+ return BB0Conditions->isEquivalent(*BB1Conditions);
}
static bool reportInvalidCandidate(const Instruction &I,
@@ -90,9 +308,12 @@ collectInstructionsInBetween(Instruction &StartInst, const Instruction &EndInst,
}
bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
- const DominatorTree &DT,
- const PostDominatorTree &PDT,
- DependenceInfo &DI) {
+ DominatorTree &DT, const PostDominatorTree *PDT,
+ DependenceInfo *DI) {
+ // Skip tests when we don't have PDT or DI
+ if (!PDT || !DI)
+ return false;
+
// Cannot move itself before itself.
if (&I == &InsertPoint)
return false;
@@ -108,28 +329,22 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
return reportInvalidCandidate(I, NotMovedTerminator);
// TODO remove this limitation.
- if (!isControlFlowEquivalent(I, InsertPoint, DT, PDT))
+ if (!isControlFlowEquivalent(I, InsertPoint, DT, *PDT))
return reportInvalidCandidate(I, NotControlFlowEquivalent);
- // As I and InsertPoint are control flow equivalent, if I dominates
- // InsertPoint, then I comes before InsertPoint.
- const bool MoveForward = DT.dominates(&I, &InsertPoint);
- if (MoveForward) {
- // When I is being moved forward, we need to make sure the InsertPoint
- // dominates every users. Or else, a user may be using an undefined I.
+ if (!DT.dominates(&InsertPoint, &I))
for (const Use &U : I.uses())
if (auto *UserInst = dyn_cast<Instruction>(U.getUser()))
if (UserInst != &InsertPoint && !DT.dominates(&InsertPoint, U))
return false;
- } else {
- // When I is being moved backward, we need to make sure all its opernads
- // dominates the InsertPoint. Or else, an operand may be undefined for I.
+ if (!DT.dominates(&I, &InsertPoint))
for (const Value *Op : I.operands())
if (auto *OpInst = dyn_cast<Instruction>(Op))
if (&InsertPoint == OpInst || !DT.dominates(OpInst, &InsertPoint))
return false;
- }
+ DT.updateDFSNumbers();
+ const bool MoveForward = domTreeLevelBefore(&DT, &I, &InsertPoint);
Instruction &StartInst = (MoveForward ? I : InsertPoint);
Instruction &EndInst = (MoveForward ? InsertPoint : I);
SmallPtrSet<Instruction *, 10> InstsToCheck;
@@ -162,7 +377,7 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
// StartInst to \p EndInst.
if (std::any_of(InstsToCheck.begin(), InstsToCheck.end(),
[&DI, &I](Instruction *CurInst) {
- auto DepResult = DI.depends(&I, CurInst, true);
+ auto DepResult = DI->depends(&I, CurInst, true);
if (DepResult &&
(DepResult->isOutput() || DepResult->isFlow() ||
DepResult->isAnti()))
@@ -174,16 +389,40 @@ bool llvm::isSafeToMoveBefore(Instruction &I, Instruction &InsertPoint,
return true;
}
-void llvm::moveInstsBottomUp(BasicBlock &FromBB, BasicBlock &ToBB,
- const DominatorTree &DT,
- const PostDominatorTree &PDT, DependenceInfo &DI) {
+bool llvm::isSafeToMoveBefore(BasicBlock &BB, Instruction &InsertPoint,
+ DominatorTree &DT, const PostDominatorTree *PDT,
+ DependenceInfo *DI) {
+ return llvm::all_of(BB, [&](Instruction &I) {
+ if (BB.getTerminator() == &I)
+ return true;
+
+ return isSafeToMoveBefore(I, InsertPoint, DT, PDT, DI);
+ });
+}
+
+void llvm::moveInstructionsToTheBeginning(BasicBlock &FromBB, BasicBlock &ToBB,
+ DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ DependenceInfo &DI) {
for (auto It = ++FromBB.rbegin(); It != FromBB.rend();) {
Instruction *MovePos = ToBB.getFirstNonPHIOrDbg();
Instruction &I = *It;
// Increment the iterator before modifying FromBB.
++It;
- if (isSafeToMoveBefore(I, *MovePos, DT, PDT, DI))
+ if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
+ I.moveBefore(MovePos);
+ }
+}
+
+void llvm::moveInstructionsToTheEnd(BasicBlock &FromBB, BasicBlock &ToBB,
+ DominatorTree &DT,
+ const PostDominatorTree &PDT,
+ DependenceInfo &DI) {
+ Instruction *MovePos = ToBB.getTerminator();
+ while (FromBB.size() > 1) {
+ Instruction &I = FromBB.front();
+ if (isSafeToMoveBefore(I, *MovePos, DT, &PDT, &DI))
I.moveBefore(MovePos);
}
}
diff --git a/llvm/lib/Transforms/Utils/Debugify.cpp b/llvm/lib/Transforms/Utils/Debugify.cpp
index b7b4bfa3734d..8f98d81a3d79 100644
--- a/llvm/lib/Transforms/Utils/Debugify.cpp
+++ b/llvm/lib/Transforms/Utils/Debugify.cpp
@@ -30,6 +30,17 @@ namespace {
cl::opt<bool> Quiet("debugify-quiet",
cl::desc("Suppress verbose debugify output"));
+enum class Level {
+ Locations,
+ LocationsAndVariables
+};
+cl::opt<Level> DebugifyLevel(
+ "debugify-level", cl::desc("Kind of debug info to add"),
+ cl::values(clEnumValN(Level::Locations, "locations", "Locations only"),
+ clEnumValN(Level::LocationsAndVariables, "location+variables",
+ "Locations and Variables")),
+ cl::init(Level::LocationsAndVariables));
+
raw_ostream &dbg() { return Quiet ? nulls() : errs(); }
uint64_t getAllocSizeInBits(Module &M, Type *Ty) {
@@ -51,10 +62,11 @@ Instruction *findTerminatingInstruction(BasicBlock &BB) {
return I;
return BB.getTerminator();
}
+} // end anonymous namespace
-bool applyDebugifyMetadata(Module &M,
- iterator_range<Module::iterator> Functions,
- StringRef Banner) {
+bool llvm::applyDebugifyMetadata(
+ Module &M, iterator_range<Module::iterator> Functions, StringRef Banner,
+ std::function<bool(DIBuilder &DIB, Function &F)> ApplyToMF) {
// Skip modules with debug info.
if (M.getNamedMetadata("llvm.dbg.cu")) {
dbg() << Banner << "Skipping module with debug info\n";
@@ -63,6 +75,7 @@ bool applyDebugifyMetadata(Module &M,
DIBuilder DIB(M);
LLVMContext &Ctx = M.getContext();
+ auto *Int32Ty = Type::getInt32Ty(Ctx);
// Get a DIType which corresponds to Ty.
DenseMap<uint64_t, DIType *> TypeCache;
@@ -87,6 +100,7 @@ bool applyDebugifyMetadata(Module &M,
if (isFunctionSkipped(F))
continue;
+ bool InsertedDbgVal = false;
auto SPType = DIB.createSubroutineType(DIB.getOrCreateTypeArray(None));
DISubprogram::DISPFlags SPFlags =
DISubprogram::SPFlagDefinition | DISubprogram::SPFlagOptimized;
@@ -95,11 +109,31 @@ bool applyDebugifyMetadata(Module &M,
auto SP = DIB.createFunction(CU, F.getName(), F.getName(), File, NextLine,
SPType, NextLine, DINode::FlagZero, SPFlags);
F.setSubprogram(SP);
+
+ // Helper that inserts a dbg.value before \p InsertBefore, copying the
+ // location (and possibly the type, if it's non-void) from \p TemplateInst.
+ auto insertDbgVal = [&](Instruction &TemplateInst,
+ Instruction *InsertBefore) {
+ std::string Name = utostr(NextVar++);
+ Value *V = &TemplateInst;
+ if (TemplateInst.getType()->isVoidTy())
+ V = ConstantInt::get(Int32Ty, 0);
+ const DILocation *Loc = TemplateInst.getDebugLoc().get();
+ auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
+ getCachedDIType(V->getType()),
+ /*AlwaysPreserve=*/true);
+ DIB.insertDbgValueIntrinsic(V, LocalVar, DIB.createExpression(), Loc,
+ InsertBefore);
+ };
+
for (BasicBlock &BB : F) {
// Attach debug locations.
for (Instruction &I : BB)
I.setDebugLoc(DILocation::get(Ctx, NextLine++, 1, SP));
+ if (DebugifyLevel < Level::LocationsAndVariables)
+ continue;
+
// Inserting debug values into EH pads can break IR invariants.
if (BB.isEHPad())
continue;
@@ -126,25 +160,30 @@ bool applyDebugifyMetadata(Module &M,
if (!isa<PHINode>(I) && !I->isEHPad())
InsertBefore = I->getNextNode();
- std::string Name = utostr(NextVar++);
- const DILocation *Loc = I->getDebugLoc().get();
- auto LocalVar = DIB.createAutoVariable(SP, Name, File, Loc->getLine(),
- getCachedDIType(I->getType()),
- /*AlwaysPreserve=*/true);
- DIB.insertDbgValueIntrinsic(I, LocalVar, DIB.createExpression(), Loc,
- InsertBefore);
+ insertDbgVal(*I, InsertBefore);
+ InsertedDbgVal = true;
}
}
+ // Make sure we emit at least one dbg.value, otherwise MachineDebugify may
+ // not have anything to work with as it goes about inserting DBG_VALUEs.
+ // (It's common for MIR tests to be written containing skeletal IR with
+ // empty functions -- we're still interested in debugifying the MIR within
+ // those tests, and this helps with that.)
+ if (DebugifyLevel == Level::LocationsAndVariables && !InsertedDbgVal) {
+ auto *Term = findTerminatingInstruction(F.getEntryBlock());
+ insertDbgVal(*Term, Term);
+ }
+ if (ApplyToMF)
+ ApplyToMF(DIB, F);
DIB.finalizeSubprogram(SP);
}
DIB.finalize();
// Track the number of distinct lines and variables.
NamedMDNode *NMD = M.getOrInsertNamedMetadata("llvm.debugify");
- auto *IntTy = Type::getInt32Ty(Ctx);
auto addDebugifyOperand = [&](unsigned N) {
NMD->addOperand(MDNode::get(
- Ctx, ValueAsMetadata::getConstant(ConstantInt::get(IntTy, N))));
+ Ctx, ValueAsMetadata::getConstant(ConstantInt::get(Int32Ty, N))));
};
addDebugifyOperand(NextLine - 1); // Original number of lines.
addDebugifyOperand(NextVar - 1); // Original number of variables.
@@ -159,6 +198,54 @@ bool applyDebugifyMetadata(Module &M,
return true;
}
+bool llvm::stripDebugifyMetadata(Module &M) {
+ bool Changed = false;
+
+ // Remove the llvm.debugify module-level named metadata.
+ NamedMDNode *DebugifyMD = M.getNamedMetadata("llvm.debugify");
+ if (DebugifyMD) {
+ M.eraseNamedMetadata(DebugifyMD);
+ Changed = true;
+ }
+
+ // Strip out all debug intrinsics and supporting metadata (subprograms, types,
+ // variables, etc).
+ Changed |= StripDebugInfo(M);
+
+ // Strip out the dead dbg.value prototype.
+ Function *DbgValF = M.getFunction("llvm.dbg.value");
+ if (DbgValF) {
+ assert(DbgValF->isDeclaration() && DbgValF->use_empty() &&
+ "Not all debug info stripped?");
+ DbgValF->eraseFromParent();
+ Changed = true;
+ }
+
+ // Strip out the module-level Debug Info Version metadata.
+ // FIXME: There must be an easier way to remove an operand from a NamedMDNode.
+ NamedMDNode *NMD = M.getModuleFlagsMetadata();
+ if (!NMD)
+ return Changed;
+ SmallVector<MDNode *, 4> Flags;
+ for (MDNode *Flag : NMD->operands())
+ Flags.push_back(Flag);
+ NMD->clearOperands();
+ for (MDNode *Flag : Flags) {
+ MDString *Key = dyn_cast_or_null<MDString>(Flag->getOperand(1));
+ if (Key->getString() == "Debug Info Version") {
+ Changed = true;
+ continue;
+ }
+ NMD->addOperand(Flag);
+ }
+ // If we left it empty we might as well remove it.
+ if (NMD->getNumOperands() == 0)
+ NMD->eraseFromParent();
+
+ return Changed;
+}
+
+namespace {
/// Return true if a mis-sized diagnostic is issued for \p DVI.
bool diagnoseMisSizedDbgValue(Module &M, DbgValueInst *DVI) {
// The size of a dbg.value's value operand should match the size of the
@@ -206,7 +293,7 @@ bool checkDebugifyMetadata(Module &M,
// Skip modules without debugify metadata.
NamedMDNode *NMD = M.getNamedMetadata("llvm.debugify");
if (!NMD) {
- dbg() << Banner << "Skipping module without debugify metadata\n";
+ dbg() << Banner << ": Skipping module without debugify metadata\n";
return false;
}
@@ -233,7 +320,7 @@ bool checkDebugifyMetadata(Module &M,
// Find missing lines.
for (Instruction &I : instructions(F)) {
- if (isa<DbgValueInst>(&I))
+ if (isa<DbgValueInst>(&I) || isa<PHINode>(&I))
continue;
auto DL = I.getDebugLoc();
@@ -243,11 +330,10 @@ bool checkDebugifyMetadata(Module &M,
}
if (!DL) {
- dbg() << "ERROR: Instruction with empty DebugLoc in function ";
+ dbg() << "WARNING: Instruction with empty DebugLoc in function ";
dbg() << F.getName() << " --";
I.print(dbg());
dbg() << "\n";
- HasErrors = true;
}
}
@@ -287,12 +373,9 @@ bool checkDebugifyMetadata(Module &M,
dbg() << " [" << NameOfWrappedPass << "]";
dbg() << ": " << (HasErrors ? "FAIL" : "PASS") << '\n';
- // Strip the Debugify Metadata if required.
- if (Strip) {
- StripDebugInfo(M);
- M.eraseNamedMetadata(NMD);
- return true;
- }
+ // Strip debugify metadata if required.
+ if (Strip)
+ return stripDebugifyMetadata(M);
return false;
}
@@ -301,7 +384,8 @@ bool checkDebugifyMetadata(Module &M,
/// legacy module pass manager.
struct DebugifyModulePass : public ModulePass {
bool runOnModule(Module &M) override {
- return applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: ");
+ return applyDebugifyMetadata(M, M.functions(),
+ "ModuleDebugify: ", /*ApplyToMF*/ nullptr);
}
DebugifyModulePass() : ModulePass(ID) {}
@@ -320,7 +404,7 @@ struct DebugifyFunctionPass : public FunctionPass {
Module &M = *F.getParent();
auto FuncIt = F.getIterator();
return applyDebugifyMetadata(M, make_range(FuncIt, std::next(FuncIt)),
- "FunctionDebugify: ");
+ "FunctionDebugify: ", /*ApplyToMF*/ nullptr);
}
DebugifyFunctionPass() : FunctionPass(ID) {}
@@ -395,7 +479,8 @@ FunctionPass *createDebugifyFunctionPass() {
}
PreservedAnalyses NewPMDebugifyPass::run(Module &M, ModuleAnalysisManager &) {
- applyDebugifyMetadata(M, M.functions(), "ModuleDebugify: ");
+ applyDebugifyMetadata(M, M.functions(),
+ "ModuleDebugify: ", /*ApplyToMF*/ nullptr);
return PreservedAnalyses::all();
}
diff --git a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
index 651f776a4915..f84ff9e5aad1 100644
--- a/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
+++ b/llvm/lib/Transforms/Utils/EntryExitInstrumenter.cpp
@@ -11,6 +11,7 @@
#include "llvm/IR/DebugInfoMetadata.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/Instructions.h"
+#include "llvm/IR/Intrinsics.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/Type.h"
#include "llvm/InitializePasses.h"
diff --git a/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp b/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
index 914babeb6829..cae9d9ee6d70 100644
--- a/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
+++ b/llvm/lib/Transforms/Utils/EscapeEnumerator.cpp
@@ -12,10 +12,11 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/EscapeEnumerator.h"
+#include "llvm/ADT/Triple.h"
#include "llvm/Analysis/EHPersonalities.h"
-#include "llvm/Transforms/Utils/Local.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Module.h"
+#include "llvm/Transforms/Utils/Local.h"
+
using namespace llvm;
static FunctionCallee getDefaultPersonalityFn(Module *M) {
diff --git a/llvm/lib/Transforms/Utils/Evaluator.cpp b/llvm/lib/Transforms/Utils/Evaluator.cpp
index ad36790b8c6a..c5dfbf9d92d1 100644
--- a/llvm/lib/Transforms/Utils/Evaluator.cpp
+++ b/llvm/lib/Transforms/Utils/Evaluator.cpp
@@ -17,7 +17,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -196,8 +195,7 @@ evaluateBitcastFromPtr(Constant *Ptr, const DataLayout &DL,
Constant *const IdxList[] = {IdxZero, IdxZero};
Ptr = ConstantExpr::getGetElementPtr(Ty, Ptr, IdxList);
- if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI))
- Ptr = FoldedPtr;
+ Ptr = ConstantFoldConstant(Ptr, DL, TLI);
}
return Val;
}
@@ -266,33 +264,33 @@ static Function *getFunction(Constant *C) {
}
Function *
-Evaluator::getCalleeWithFormalArgs(CallSite &CS,
- SmallVector<Constant *, 8> &Formals) {
- auto *V = CS.getCalledValue();
+Evaluator::getCalleeWithFormalArgs(CallBase &CB,
+ SmallVectorImpl<Constant *> &Formals) {
+ auto *V = CB.getCalledOperand();
if (auto *Fn = getFunction(getVal(V)))
- return getFormalParams(CS, Fn, Formals) ? Fn : nullptr;
+ return getFormalParams(CB, Fn, Formals) ? Fn : nullptr;
auto *CE = dyn_cast<ConstantExpr>(V);
if (!CE || CE->getOpcode() != Instruction::BitCast ||
- !getFormalParams(CS, getFunction(CE->getOperand(0)), Formals))
+ !getFormalParams(CB, getFunction(CE->getOperand(0)), Formals))
return nullptr;
return dyn_cast<Function>(
ConstantFoldLoadThroughBitcast(CE, CE->getOperand(0)->getType(), DL));
}
-bool Evaluator::getFormalParams(CallSite &CS, Function *F,
- SmallVector<Constant *, 8> &Formals) {
+bool Evaluator::getFormalParams(CallBase &CB, Function *F,
+ SmallVectorImpl<Constant *> &Formals) {
if (!F)
return false;
auto *FTy = F->getFunctionType();
- if (FTy->getNumParams() > CS.getNumArgOperands()) {
+ if (FTy->getNumParams() > CB.getNumArgOperands()) {
LLVM_DEBUG(dbgs() << "Too few arguments for function.\n");
return false;
}
- auto ArgI = CS.arg_begin();
+ auto ArgI = CB.arg_begin();
for (auto ParI = FTy->param_begin(), ParE = FTy->param_end(); ParI != ParE;
++ParI) {
auto *ArgC = ConstantFoldLoadThroughBitcast(getVal(*ArgI), *ParI, DL);
@@ -339,7 +337,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
return false; // no volatile/atomic accesses.
}
Constant *Ptr = getVal(SI->getOperand(1));
- if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
+ Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
+ if (Ptr != FoldedPtr) {
LLVM_DEBUG(dbgs() << "Folding constant ptr expression: " << *Ptr);
Ptr = FoldedPtr;
LLVM_DEBUG(dbgs() << "; To: " << *Ptr << "\n");
@@ -448,7 +447,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
}
Constant *Ptr = getVal(LI->getOperand(0));
- if (auto *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI)) {
+ Constant *FoldedPtr = ConstantFoldConstant(Ptr, DL, TLI);
+ if (Ptr != FoldedPtr) {
Ptr = FoldedPtr;
LLVM_DEBUG(dbgs() << "Found a constant pointer expression, constant "
"folding: "
@@ -476,22 +476,22 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
InstResult = AllocaTmps.back().get();
LLVM_DEBUG(dbgs() << "Found an alloca. Result: " << *InstResult << "\n");
} else if (isa<CallInst>(CurInst) || isa<InvokeInst>(CurInst)) {
- CallSite CS(&*CurInst);
+ CallBase &CB = *cast<CallBase>(&*CurInst);
// Debug info can safely be ignored here.
- if (isa<DbgInfoIntrinsic>(CS.getInstruction())) {
+ if (isa<DbgInfoIntrinsic>(CB)) {
LLVM_DEBUG(dbgs() << "Ignoring debug info.\n");
++CurInst;
continue;
}
// Cannot handle inline asm.
- if (isa<InlineAsm>(CS.getCalledValue())) {
+ if (CB.isInlineAsm()) {
LLVM_DEBUG(dbgs() << "Found inline asm, can not evaluate.\n");
return false;
}
- if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(CS.getInstruction())) {
+ if (IntrinsicInst *II = dyn_cast<IntrinsicInst>(&CB)) {
if (MemSetInst *MSI = dyn_cast<MemSetInst>(II)) {
if (MSI->isVolatile()) {
LLVM_DEBUG(dbgs() << "Can not optimize a volatile memset "
@@ -559,7 +559,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
// Resolve function pointers.
SmallVector<Constant *, 8> Formals;
- Function *Callee = getCalleeWithFormalArgs(CS, Formals);
+ Function *Callee = getCalleeWithFormalArgs(CB, Formals);
if (!Callee || Callee->isInterposable()) {
LLVM_DEBUG(dbgs() << "Can not resolve function pointer.\n");
return false; // Cannot resolve.
@@ -567,9 +567,8 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
if (Callee->isDeclaration()) {
// If this is a function we can constant fold, do it.
- if (Constant *C = ConstantFoldCall(cast<CallBase>(CS.getInstruction()),
- Callee, Formals, TLI)) {
- InstResult = castCallResultIfNeeded(CS.getCalledValue(), C);
+ if (Constant *C = ConstantFoldCall(&CB, Callee, Formals, TLI)) {
+ InstResult = castCallResultIfNeeded(CB.getCalledOperand(), C);
if (!InstResult)
return false;
LLVM_DEBUG(dbgs() << "Constant folded function call. Result: "
@@ -592,7 +591,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
return false;
}
ValueStack.pop_back();
- InstResult = castCallResultIfNeeded(CS.getCalledValue(), RetVal);
+ InstResult = castCallResultIfNeeded(CB.getCalledOperand(), RetVal);
if (RetVal && !InstResult)
return false;
@@ -648,9 +647,7 @@ bool Evaluator::EvaluateBlock(BasicBlock::iterator CurInst,
}
if (!CurInst->use_empty()) {
- if (auto *FoldedInstResult = ConstantFoldConstant(InstResult, DL, TLI))
- InstResult = FoldedInstResult;
-
+ InstResult = ConstantFoldConstant(InstResult, DL, TLI);
setVal(&*CurInst, InstResult);
}
diff --git a/llvm/lib/Transforms/Utils/FixIrreducible.cpp b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
new file mode 100644
index 000000000000..460ba9e97fc6
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/FixIrreducible.cpp
@@ -0,0 +1,337 @@
+//===- FixIrreducible.cpp - Convert irreducible control-flow into loops ---===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// An irreducible SCC is one which has multiple "header" blocks, i.e., blocks
+// with control-flow edges incident from outside the SCC. This pass converts an
+// irreducible SCC into a natural loop by applying the following transformation:
+//
+// 1. Collect the set of headers H of the SCC.
+// 2. Collect the set of predecessors P of these headers. These may be inside as
+// well as outside the SCC.
+// 3. Create block N and redirect every edge from set P to set H through N.
+//
+// This converts the SCC into a natural loop with N as the header: N is the only
+// block with edges incident from outside the SCC, and all backedges in the SCC
+// are incident on N, i.e., for every backedge, the head now dominates the tail.
+//
+// INPUT CFG: The blocks A and B form an irreducible loop with two headers.
+//
+// Entry
+// / \
+// v v
+// A ----> B
+// ^ /|
+// `----' |
+// v
+// Exit
+//
+// OUTPUT CFG: Edges incident on A and B are now redirected through a
+// new block N, forming a natural loop consisting of N, A and B.
+//
+// Entry
+// |
+// v
+// .---> N <---.
+// / / \ \
+// | / \ |
+// \ v v /
+// `-- A B --'
+// |
+// v
+// Exit
+//
+// The transformation is applied to every maximal SCC that is not already
+// recognized as a loop. The pass operates on all maximal SCCs found in the
+// function body outside of any loop, as well as those found inside each loop,
+// including inside any newly created loops. This ensures that any SCC hidden
+// inside a maximal SCC is also transformed.
+//
+// The actual transformation is handled by function CreateControlFlowHub, which
+// takes a set of incoming blocks (the predecessors) and outgoing blocks (the
+// headers). The function also moves every PHINode in an outgoing block to the
+// hub. Since the hub dominates all the outgoing blocks, each such PHINode
+// continues to dominate its uses. Since every header in an SCC has at least two
+// predecessors, every value used in the header (or later) but defined in a
+// predecessor (or earlier) is represented by a PHINode in a header. Hence the
+// above handling of PHINodes is sufficient and no further processing is
+// required to restore SSA.
+//
+// Limitation: The pass cannot handle switch statements and indirect
+// branches. Both must be lowered to plain branches first.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/ADT/SCCIterator.h"
+#include "llvm/Analysis/LoopIterator.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "fix-irreducible"
+
+using namespace llvm;
+
+namespace {
+struct FixIrreducible : public FunctionPass {
+ static char ID;
+ FixIrreducible() : FunctionPass(ID) {
+ initializeFixIrreduciblePass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<LoopInfoWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char FixIrreducible::ID = 0;
+
+FunctionPass *llvm::createFixIrreduciblePass() { return new FixIrreducible(); }
+
+INITIALIZE_PASS_BEGIN(FixIrreducible, "fix-irreducible",
+ "Convert irreducible control-flow into natural loops",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(FixIrreducible, "fix-irreducible",
+ "Convert irreducible control-flow into natural loops",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+
+// When a new loop is created, existing children of the parent loop may now be
+// fully inside the new loop. Reconnect these as children of the new loop.
+static void reconnectChildLoops(LoopInfo &LI, Loop *ParentLoop, Loop *NewLoop,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ auto &CandidateLoops = ParentLoop ? ParentLoop->getSubLoopsVector()
+ : LI.getTopLevelLoopsVector();
+ // The new loop cannot be its own child, and any candidate is a
+ // child iff its header is owned by the new loop. Move all the
+ // children to a new vector.
+ auto FirstChild = std::partition(
+ CandidateLoops.begin(), CandidateLoops.end(), [&](Loop *L) {
+ return L == NewLoop || Blocks.count(L->getHeader()) == 0;
+ });
+ SmallVector<Loop *, 8> ChildLoops(FirstChild, CandidateLoops.end());
+ CandidateLoops.erase(FirstChild, CandidateLoops.end());
+
+ for (auto II = ChildLoops.begin(), IE = ChildLoops.end(); II != IE; ++II) {
+ auto Child = *II;
+ LLVM_DEBUG(dbgs() << "child loop: " << Child->getHeader()->getName()
+ << "\n");
+ // TODO: A child loop whose header is also a header in the current
+ // SCC gets destroyed since its backedges are removed. That may
+ // not be necessary if we can retain such backedges.
+ if (Headers.count(Child->getHeader())) {
+ for (auto BB : Child->blocks()) {
+ LI.changeLoopFor(BB, NewLoop);
+ LLVM_DEBUG(dbgs() << "moved block from child: " << BB->getName()
+ << "\n");
+ }
+ LI.destroy(Child);
+ LLVM_DEBUG(dbgs() << "subsumed child loop (common header)\n");
+ continue;
+ }
+
+ Child->setParentLoop(nullptr);
+ NewLoop->addChildLoop(Child);
+ LLVM_DEBUG(dbgs() << "added child loop to new loop\n");
+ }
+}
+
+// Given a set of blocks and headers in an irreducible SCC, convert it into a
+// natural loop. Also insert this new loop at its appropriate place in the
+// hierarchy of loops.
+static void createNaturalLoopInternal(LoopInfo &LI, DominatorTree &DT,
+ Loop *ParentLoop,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+#ifndef NDEBUG
+ // All headers are part of the SCC
+ for (auto H : Headers) {
+ assert(Blocks.count(H));
+ }
+#endif
+
+ SetVector<BasicBlock *> Predecessors;
+ for (auto H : Headers) {
+ for (auto P : predecessors(H)) {
+ Predecessors.insert(P);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found predecessors:";
+ for (auto P : Predecessors) {
+ dbgs() << " " << P->getName();
+ }
+ dbgs() << "\n");
+
+ // Redirect all the backedges through a "hub" consisting of a series
+ // of guard blocks that manage the flow of control from the
+ // predecessors to the headers.
+ SmallVector<BasicBlock *, 8> GuardBlocks;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ CreateControlFlowHub(&DTU, GuardBlocks, Predecessors, Headers, "irr");
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+#endif
+
+ // Create a new loop from the now-transformed cycle
+ auto NewLoop = LI.AllocateLoop();
+ if (ParentLoop) {
+ ParentLoop->addChildLoop(NewLoop);
+ } else {
+ LI.addTopLevelLoop(NewLoop);
+ }
+
+ // Add the guard blocks to the new loop. The first guard block is
+ // the head of all the backedges, and it is the first to be inserted
+ // in the loop. This ensures that it is recognized as the
+ // header. Since the new loop is already in LoopInfo, the new blocks
+ // are also propagated up the chain of parent loops.
+ for (auto G : GuardBlocks) {
+ LLVM_DEBUG(dbgs() << "added guard block: " << G->getName() << "\n");
+ NewLoop->addBasicBlockToLoop(G, LI);
+ }
+
+ // Add the SCC blocks to the new loop.
+ for (auto BB : Blocks) {
+ NewLoop->addBlockEntry(BB);
+ if (LI.getLoopFor(BB) == ParentLoop) {
+ LLVM_DEBUG(dbgs() << "moved block from parent: " << BB->getName()
+ << "\n");
+ LI.changeLoopFor(BB, NewLoop);
+ } else {
+ LLVM_DEBUG(dbgs() << "added block from child: " << BB->getName() << "\n");
+ }
+ }
+ LLVM_DEBUG(dbgs() << "header for new loop: "
+ << NewLoop->getHeader()->getName() << "\n");
+
+ reconnectChildLoops(LI, ParentLoop, NewLoop, Blocks, Headers);
+
+ NewLoop->verifyLoop();
+ if (ParentLoop) {
+ ParentLoop->verifyLoop();
+ }
+#if defined(EXPENSIVE_CHECKS)
+ LI.verify(DT);
+#endif // EXPENSIVE_CHECKS
+}
+
+namespace llvm {
+// Enable the graph traits required for traversing a Loop body.
+template <> struct GraphTraits<Loop> : LoopBodyTraits {};
+} // namespace llvm
+
+// Overloaded wrappers to go with the function template below.
+static BasicBlock *unwrapBlock(BasicBlock *B) { return B; }
+static BasicBlock *unwrapBlock(LoopBodyTraits::NodeRef &N) { return N.second; }
+
+static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Function *F,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ createNaturalLoopInternal(LI, DT, nullptr, Blocks, Headers);
+}
+
+static void createNaturalLoop(LoopInfo &LI, DominatorTree &DT, Loop &L,
+ SetVector<BasicBlock *> &Blocks,
+ SetVector<BasicBlock *> &Headers) {
+ createNaturalLoopInternal(LI, DT, &L, Blocks, Headers);
+}
+
+// Convert irreducible SCCs; Graph G may be a Function* or a Loop&.
+template <class Graph>
+static bool makeReducible(LoopInfo &LI, DominatorTree &DT, Graph &&G) {
+ bool Changed = false;
+ for (auto Scc = scc_begin(G); !Scc.isAtEnd(); ++Scc) {
+ if (Scc->size() < 2)
+ continue;
+ SetVector<BasicBlock *> Blocks;
+ LLVM_DEBUG(dbgs() << "Found SCC:");
+ for (auto N : *Scc) {
+ auto BB = unwrapBlock(N);
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
+ Blocks.insert(BB);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ // Minor optimization: The SCC blocks are usually discovered in an order
+ // that is the opposite of the order in which these blocks appear as branch
+ // targets. This results in a lot of condition inversions in the control
+ // flow out of the new ControlFlowHub, which can be mitigated if the orders
+ // match. So we discover the headers using the reverse of the block order.
+ SetVector<BasicBlock *> Headers;
+ LLVM_DEBUG(dbgs() << "Found headers:");
+ for (auto BB : reverse(Blocks)) {
+ for (const auto P : predecessors(BB)) {
+ // Skip unreachable predecessors.
+ if (!DT.isReachableFromEntry(P))
+ continue;
+ if (!Blocks.count(P)) {
+ LLVM_DEBUG(dbgs() << " " << BB->getName());
+ Headers.insert(BB);
+ break;
+ }
+ }
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+
+ if (Headers.size() == 1) {
+ assert(LI.isLoopHeader(Headers.front()));
+ LLVM_DEBUG(dbgs() << "Natural loop with a single header: skipped\n");
+ continue;
+ }
+ createNaturalLoop(LI, DT, G, Blocks, Headers);
+ Changed = true;
+ }
+ return Changed;
+}
+
+bool FixIrreducible::runOnFunction(Function &F) {
+ LLVM_DEBUG(dbgs() << "===== Fix irreducible control-flow in function: "
+ << F.getName() << "\n");
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ bool Changed = false;
+ SmallVector<Loop *, 8> WorkList;
+
+ LLVM_DEBUG(dbgs() << "visiting top-level\n");
+ Changed |= makeReducible(LI, DT, &F);
+
+ // Any SCCs reduced are now already in the list of top-level loops, so simply
+ // add them all to the worklist.
+ for (auto L : LI) {
+ WorkList.push_back(L);
+ }
+
+ while (!WorkList.empty()) {
+ auto L = WorkList.back();
+ WorkList.pop_back();
+ LLVM_DEBUG(dbgs() << "visiting loop with header "
+ << L->getHeader()->getName() << "\n");
+ Changed |= makeReducible(LI, DT, *L);
+ // Any SCCs reduced are now already in the list of child loops, so simply
+ // add them all to the worklist.
+ WorkList.append(L->begin(), L->end());
+ }
+
+ return Changed;
+}
diff --git a/llvm/lib/Transforms/Utils/FlattenCFG.cpp b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
index 893f23eb6048..0098dcaeb07a 100644
--- a/llvm/lib/Transforms/Utils/FlattenCFG.cpp
+++ b/llvm/lib/Transforms/Utils/FlattenCFG.cpp
@@ -45,12 +45,12 @@ class FlattenCFGOpt {
bool MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder);
/// Compare a pair of blocks: \p Block1 and \p Block2, which
- /// are from two if-regions whose entry blocks are \p Head1 and \p
- /// Head2. \returns true if \p Block1 and \p Block2 contain identical
+ /// are from two if-regions, where \p Head2 is the entry block of the 2nd
+ /// if-region. \returns true if \p Block1 and \p Block2 contain identical
/// instructions, and have no memory reference alias with \p Head2.
/// This is used as a legality check for merging if-regions.
- bool CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
- BasicBlock *Block1, BasicBlock *Block2);
+ bool CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
+ BasicBlock *Head2);
public:
FlattenCFGOpt(AliasAnalysis *AA) : AA(AA) {}
@@ -97,7 +97,7 @@ public:
/// br label %if.end;
///
/// Current implementation handles two cases.
-/// Case 1: \param BB is on the else-path.
+/// Case 1: BB is on the else-path.
///
/// BB1
/// / |
@@ -105,7 +105,7 @@ public:
/// / \ |
/// BB3 \ | where BB1, BB2 contain conditional branches.
/// \ | / BB3 contains unconditional branch.
-/// \ | / BB4 corresponds to \param BB which is also the merge.
+/// \ | / BB4 corresponds to BB which is also the merge.
/// BB => BB4
///
///
@@ -114,14 +114,14 @@ public:
/// if (a == b && c == d)
/// statement; // BB3
///
-/// Case 2: \param BB BB is on the then-path.
+/// Case 2: BB is on the then-path.
///
/// BB1
/// / |
/// | BB2
/// \ / | where BB1, BB2 contain conditional branches.
/// BB => BB3 | BB3 contains unconditional branch and corresponds
-/// \ / to \param BB. BB4 is the merge.
+/// \ / to BB. BB4 is the merge.
/// BB4
///
/// Corresponding source code:
@@ -129,9 +129,9 @@ public:
/// if (a == b || c == d)
/// statement; // BB3
///
-/// In both cases, \param BB is the common successor of conditional branches.
-/// In Case 1, \param BB (BB4) has an unconditional branch (BB3) as
-/// its predecessor. In Case 2, \param BB (BB3) only has conditional branches
+/// In both cases, BB is the common successor of conditional branches.
+/// In Case 1, BB (BB4) has an unconditional branch (BB3) as
+/// its predecessor. In Case 2, BB (BB3) only has conditional branches
/// as its predecessors.
bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
PHINode *PHI = dyn_cast<PHINode>(BB->begin());
@@ -315,25 +315,16 @@ bool FlattenCFGOpt::FlattenParallelAndOr(BasicBlock *BB, IRBuilder<> &Builder) {
return true;
}
-/// Compare blocks from two if-regions, where \param Head1 is the entry of the
-/// 1st if-region. \param Head2 is the entry of the 2nd if-region. \param
-/// Block1 is a block in the 1st if-region to compare. \param Block2 is a block
-// in the 2nd if-region to compare. \returns true if \param Block1 and \param
-/// Block2 have identical instructions and do not have memory reference alias
-/// with \param Head2.
-bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
- BasicBlock *Block1,
- BasicBlock *Block2) {
+/// Compare blocks from two if-regions, where \param Head2 is the entry of the
+/// 2nd if-region. \param Block1 is a block in the 1st if-region to compare.
+/// \param Block2 is a block in the 2nd if-region to compare. \returns true if
+/// Block1 and Block2 have identical instructions and do not have
+/// memory reference alias with Head2.
+bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Block1, BasicBlock *Block2,
+ BasicBlock *Head2) {
Instruction *PTI2 = Head2->getTerminator();
Instruction *PBI2 = &Head2->front();
- bool eq1 = (Block1 == Head1);
- bool eq2 = (Block2 == Head2);
- if (eq1 || eq2) {
- // An empty then-path or else-path.
- return (eq1 == eq2);
- }
-
// Check whether instructions in Block1 and Block2 are identical
// and do not alias with instructions in Head2.
BasicBlock::iterator iter1 = Block1->begin();
@@ -395,6 +386,29 @@ bool FlattenCFGOpt::CompareIfRegionBlock(BasicBlock *Head1, BasicBlock *Head2,
/// To:
/// if (a || b)
/// statement;
+///
+///
+/// And from:
+/// if (a)
+/// ;
+/// else
+/// statement;
+/// if (b)
+/// ;
+/// else
+/// statement;
+///
+/// To:
+/// if (a && b)
+/// ;
+/// else
+/// statement;
+///
+/// We always take the form of the first if-region. This means that if the
+/// statement in the first if-region is in the "then-path" while in the second
+/// if-region it is in the "else-path", then we convert the second if-region to
+/// the first form by inverting the condition and the branch successors. The
+/// same approach applies to the opposite case.
bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
BasicBlock *IfTrue2, *IfFalse2;
Value *IfCond2 = GetIfCondition(BB, IfTrue2, IfFalse2);
@@ -415,22 +429,42 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
BasicBlock *FirstEntryBlock = CInst1->getParent();
// Either then-path or else-path should be empty.
- if ((IfTrue1 != FirstEntryBlock) && (IfFalse1 != FirstEntryBlock))
- return false;
- if ((IfTrue2 != SecondEntryBlock) && (IfFalse2 != SecondEntryBlock))
- return false;
+ bool InvertCond2 = false;
+ BinaryOperator::BinaryOps CombineOp;
+ if (IfFalse1 == FirstEntryBlock) {
+ // The else-path is empty, so we must use "or" operation to combine the
+ // conditions.
+ CombineOp = BinaryOperator::Or;
+ if (IfFalse2 != SecondEntryBlock) {
+ if (IfTrue2 != SecondEntryBlock)
+ return false;
- Instruction *PTI2 = SecondEntryBlock->getTerminator();
- Instruction *PBI2 = &SecondEntryBlock->front();
+ InvertCond2 = true;
+ std::swap(IfTrue2, IfFalse2);
+ }
- if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfTrue1,
- IfTrue2))
- return false;
+ if (!CompareIfRegionBlock(IfTrue1, IfTrue2, SecondEntryBlock))
+ return false;
+ } else if (IfTrue1 == FirstEntryBlock) {
+ // The then-path is empty, so we must use "and" operation to combine the
+ // conditions.
+ CombineOp = BinaryOperator::And;
+ if (IfTrue2 != SecondEntryBlock) {
+ if (IfFalse2 != SecondEntryBlock)
+ return false;
+
+ InvertCond2 = true;
+ std::swap(IfTrue2, IfFalse2);
+ }
- if (!CompareIfRegionBlock(FirstEntryBlock, SecondEntryBlock, IfFalse1,
- IfFalse2))
+ if (!CompareIfRegionBlock(IfFalse1, IfFalse2, SecondEntryBlock))
+ return false;
+ } else
return false;
+ Instruction *PTI2 = SecondEntryBlock->getTerminator();
+ Instruction *PBI2 = &SecondEntryBlock->front();
+
// Check whether \param SecondEntryBlock has side-effect and is safe to
// speculate.
for (BasicBlock::iterator BI(PBI2), BE(PTI2); BI != BE; ++BI) {
@@ -445,12 +479,22 @@ bool FlattenCFGOpt::MergeIfRegion(BasicBlock *BB, IRBuilder<> &Builder) {
FirstEntryBlock->getInstList()
.splice(FirstEntryBlock->end(), SecondEntryBlock->getInstList());
BranchInst *PBI = cast<BranchInst>(FirstEntryBlock->getTerminator());
- Value *CC = PBI->getCondition();
+ assert(PBI->getCondition() == IfCond2);
BasicBlock *SaveInsertBB = Builder.GetInsertBlock();
BasicBlock::iterator SaveInsertPt = Builder.GetInsertPoint();
Builder.SetInsertPoint(PBI);
- Value *NC = Builder.CreateOr(CInst1, CC);
- PBI->replaceUsesOfWith(CC, NC);
+ if (InvertCond2) {
+ // If this is a "cmp" instruction, only used for branching (and nowhere
+ // else), then we can simply invert the predicate.
+ auto Cmp2 = dyn_cast<CmpInst>(CInst2);
+ if (Cmp2 && Cmp2->hasOneUse())
+ Cmp2->setPredicate(Cmp2->getInversePredicate());
+ else
+ CInst2 = cast<Instruction>(Builder.CreateNot(CInst2));
+ PBI->swapSuccessors();
+ }
+ Value *NC = Builder.CreateBinOp(CombineOp, CInst1, CInst2);
+ PBI->replaceUsesOfWith(IfCond2, NC);
Builder.SetInsertPoint(SaveInsertBB, SaveInsertPt);
// Handle PHI node to replace its predecessors to FirstEntryBlock.
@@ -496,6 +540,6 @@ bool FlattenCFGOpt::run(BasicBlock *BB) {
/// FlattenCFG - This function is used to flatten a CFG. For
/// example, it uses parallel-and and parallel-or mode to collapse
/// if-conditions and merge if-regions with identical statements.
-bool llvm::FlattenCFG(BasicBlock *BB, AliasAnalysis *AA) {
+bool llvm::FlattenCFG(BasicBlock *BB, AAResults *AA) {
return FlattenCFGOpt(AA).run(BB);
}
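
As a side note on the inversion step that MergeIfRegion gained above, here is a hedged, simplified sketch (not the patch itself) of logically negating a branch condition; a caller that keeps the original branch semantics must also swap its successors, as the patch does with PBI->swapSuccessors(). The helper name negateCondition is invented for illustration.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Instructions.h"
    using namespace llvm;

    static Value *negateCondition(Value *Cond, Instruction *InsertPt) {
      // A compare used only by the branch can be inverted in place by
      // flipping its predicate, e.g. 'icmp eq' becomes 'icmp ne'.
      auto *Cmp = dyn_cast<CmpInst>(Cond);
      if (Cmp && Cmp->hasOneUse()) {
        Cmp->setPredicate(Cmp->getInversePredicate());
        return Cmp;
      }
      // Otherwise materialize an explicit negation ('xor %cond, true').
      return IRBuilder<>(InsertPt).CreateNot(Cond);
    }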
diff --git a/llvm/lib/Transforms/Utils/FunctionComparator.cpp b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
index a9b28754c8e9..101cb232d8ae 100644
--- a/llvm/lib/Transforms/Utils/FunctionComparator.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionComparator.cpp
@@ -20,7 +20,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DataLayout.h"
@@ -52,22 +51,28 @@ using namespace llvm;
#define DEBUG_TYPE "functioncomparator"
int FunctionComparator::cmpNumbers(uint64_t L, uint64_t R) const {
- if (L < R) return -1;
- if (L > R) return 1;
+ if (L < R)
+ return -1;
+ if (L > R)
+ return 1;
return 0;
}
int FunctionComparator::cmpOrderings(AtomicOrdering L, AtomicOrdering R) const {
- if ((int)L < (int)R) return -1;
- if ((int)L > (int)R) return 1;
+ if ((int)L < (int)R)
+ return -1;
+ if ((int)L > (int)R)
+ return 1;
return 0;
}
int FunctionComparator::cmpAPInts(const APInt &L, const APInt &R) const {
if (int Res = cmpNumbers(L.getBitWidth(), R.getBitWidth()))
return Res;
- if (L.ugt(R)) return 1;
- if (R.ugt(L)) return -1;
+ if (L.ugt(R))
+ return 1;
+ if (R.ugt(L))
+ return -1;
return 0;
}
@@ -166,21 +171,17 @@ int FunctionComparator::cmpRangeMetadata(const MDNode *L,
return 0;
}
-int FunctionComparator::cmpOperandBundlesSchema(const Instruction *L,
- const Instruction *R) const {
- ImmutableCallSite LCS(L);
- ImmutableCallSite RCS(R);
-
- assert(LCS && RCS && "Must be calls or invokes!");
- assert(LCS.isCall() == RCS.isCall() && "Can't compare otherwise!");
+int FunctionComparator::cmpOperandBundlesSchema(const CallBase &LCS,
+ const CallBase &RCS) const {
+ assert(LCS.getOpcode() == RCS.getOpcode() && "Can't compare otherwise!");
if (int Res =
cmpNumbers(LCS.getNumOperandBundles(), RCS.getNumOperandBundles()))
return Res;
- for (unsigned i = 0, e = LCS.getNumOperandBundles(); i != e; ++i) {
- auto OBL = LCS.getOperandBundleAt(i);
- auto OBR = RCS.getOperandBundleAt(i);
+ for (unsigned I = 0, E = LCS.getNumOperandBundles(); I != E; ++I) {
+ auto OBL = LCS.getOperandBundleAt(I);
+ auto OBR = RCS.getOperandBundleAt(I);
if (int Res = OBL.getTagName().compare(OBR.getTagName()))
return Res;
@@ -227,9 +228,9 @@ int FunctionComparator::cmpConstants(const Constant *L,
unsigned TyRWidth = 0;
if (auto *VecTyL = dyn_cast<VectorType>(TyL))
- TyLWidth = VecTyL->getBitWidth();
+ TyLWidth = VecTyL->getPrimitiveSizeInBits().getFixedSize();
if (auto *VecTyR = dyn_cast<VectorType>(TyR))
- TyRWidth = VecTyR->getBitWidth();
+ TyRWidth = VecTyR->getPrimitiveSizeInBits().getFixedSize();
if (TyLWidth != TyRWidth)
return cmpNumbers(TyLWidth, TyRWidth);
@@ -328,8 +329,8 @@ int FunctionComparator::cmpConstants(const Constant *L,
case Value::ConstantVectorVal: {
const ConstantVector *LV = cast<ConstantVector>(L);
const ConstantVector *RV = cast<ConstantVector>(R);
- unsigned NumElementsL = cast<VectorType>(TyL)->getNumElements();
- unsigned NumElementsR = cast<VectorType>(TyR)->getNumElements();
+ unsigned NumElementsL = cast<FixedVectorType>(TyL)->getNumElements();
+ unsigned NumElementsR = cast<FixedVectorType>(TyR)->getNumElements();
if (int Res = cmpNumbers(NumElementsL, NumElementsR))
return Res;
for (uint64_t i = 0; i < NumElementsL; ++i) {
@@ -361,12 +362,12 @@ int FunctionComparator::cmpConstants(const Constant *L,
if (LBA->getFunction() == RBA->getFunction()) {
// They are BBs in the same function. Order by which comes first in the
// BB order of the function. This order is deterministic.
- Function* F = LBA->getFunction();
+ Function *F = LBA->getFunction();
BasicBlock *LBB = LBA->getBasicBlock();
BasicBlock *RBB = RBA->getBasicBlock();
if (LBB == RBB)
return 0;
- for(BasicBlock &BB : F->getBasicBlockList()) {
+ for (BasicBlock &BB : F->getBasicBlockList()) {
if (&BB == LBB) {
assert(&BB != RBB);
return -1;
@@ -476,14 +477,25 @@ int FunctionComparator::cmpTypes(Type *TyL, Type *TyR) const {
return 0;
}
- case Type::ArrayTyID:
- case Type::VectorTyID: {
- auto *STyL = cast<SequentialType>(TyL);
- auto *STyR = cast<SequentialType>(TyR);
+ case Type::ArrayTyID: {
+ auto *STyL = cast<ArrayType>(TyL);
+ auto *STyR = cast<ArrayType>(TyR);
if (STyL->getNumElements() != STyR->getNumElements())
return cmpNumbers(STyL->getNumElements(), STyR->getNumElements());
return cmpTypes(STyL->getElementType(), STyR->getElementType());
}
+ case Type::FixedVectorTyID:
+ case Type::ScalableVectorTyID: {
+ auto *STyL = cast<VectorType>(TyL);
+ auto *STyR = cast<VectorType>(TyR);
+ if (STyL->getElementCount().Scalable != STyR->getElementCount().Scalable)
+ return cmpNumbers(STyL->getElementCount().Scalable,
+ STyR->getElementCount().Scalable);
+ if (STyL->getElementCount().Min != STyR->getElementCount().Min)
+ return cmpNumbers(STyL->getElementCount().Min,
+ STyR->getElementCount().Min);
+ return cmpTypes(STyL->getElementType(), STyR->getElementType());
+ }
}
}
@@ -551,7 +563,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpNumbers(LI->getSyncScopeID(),
cast<LoadInst>(R)->getSyncScopeID()))
return Res;
- return cmpRangeMetadata(LI->getMetadata(LLVMContext::MD_range),
+ return cmpRangeMetadata(
+ LI->getMetadata(LLVMContext::MD_range),
cast<LoadInst>(R)->getMetadata(LLVMContext::MD_range));
}
if (const StoreInst *SI = dyn_cast<StoreInst>(L)) {
@@ -569,13 +582,13 @@ int FunctionComparator::cmpOperations(const Instruction *L,
}
if (const CmpInst *CI = dyn_cast<CmpInst>(L))
return cmpNumbers(CI->getPredicate(), cast<CmpInst>(R)->getPredicate());
- if (auto CSL = CallSite(const_cast<Instruction *>(L))) {
- auto CSR = CallSite(const_cast<Instruction *>(R));
- if (int Res = cmpNumbers(CSL.getCallingConv(), CSR.getCallingConv()))
+ if (auto *CBL = dyn_cast<CallBase>(L)) {
+ auto *CBR = cast<CallBase>(R);
+ if (int Res = cmpNumbers(CBL->getCallingConv(), CBR->getCallingConv()))
return Res;
- if (int Res = cmpAttrs(CSL.getAttributes(), CSR.getAttributes()))
+ if (int Res = cmpAttrs(CBL->getAttributes(), CBR->getAttributes()))
return Res;
- if (int Res = cmpOperandBundlesSchema(L, R))
+ if (int Res = cmpOperandBundlesSchema(*CBL, *CBR))
return Res;
if (const CallInst *CI = dyn_cast<CallInst>(L))
if (int Res = cmpNumbers(CI->getTailCallKind(),
@@ -616,8 +629,8 @@ int FunctionComparator::cmpOperations(const Instruction *L,
if (int Res = cmpNumbers(CXI->isVolatile(),
cast<AtomicCmpXchgInst>(R)->isVolatile()))
return Res;
- if (int Res = cmpNumbers(CXI->isWeak(),
- cast<AtomicCmpXchgInst>(R)->isWeak()))
+ if (int Res =
+ cmpNumbers(CXI->isWeak(), cast<AtomicCmpXchgInst>(R)->isWeak()))
return Res;
if (int Res =
cmpOrderings(CXI->getSuccessOrdering(),
@@ -638,11 +651,21 @@ int FunctionComparator::cmpOperations(const Instruction *L,
cast<AtomicRMWInst>(R)->isVolatile()))
return Res;
if (int Res = cmpOrderings(RMWI->getOrdering(),
- cast<AtomicRMWInst>(R)->getOrdering()))
+ cast<AtomicRMWInst>(R)->getOrdering()))
return Res;
return cmpNumbers(RMWI->getSyncScopeID(),
cast<AtomicRMWInst>(R)->getSyncScopeID());
}
+ if (const ShuffleVectorInst *SVI = dyn_cast<ShuffleVectorInst>(L)) {
+ ArrayRef<int> LMask = SVI->getShuffleMask();
+ ArrayRef<int> RMask = cast<ShuffleVectorInst>(R)->getShuffleMask();
+ if (int Res = cmpNumbers(LMask.size(), RMask.size()))
+ return Res;
+ for (size_t i = 0, e = LMask.size(); i != e; ++i) {
+ if (int Res = cmpNumbers(LMask[i], RMask[i]))
+ return Res;
+ }
+ }
if (const PHINode *PNL = dyn_cast<PHINode>(L)) {
const PHINode *PNR = cast<PHINode>(R);
// Ensure that in addition to the incoming values being identical
@@ -675,8 +698,8 @@ int FunctionComparator::cmpGEPs(const GEPOperator *GEPL,
if (GEPL->accumulateConstantOffset(DL, OffsetL) &&
GEPR->accumulateConstantOffset(DL, OffsetR))
return cmpAPInts(OffsetL, OffsetR);
- if (int Res = cmpTypes(GEPL->getSourceElementType(),
- GEPR->getSourceElementType()))
+ if (int Res =
+ cmpTypes(GEPL->getSourceElementType(), GEPR->getSourceElementType()))
return Res;
if (int Res = cmpNumbers(GEPL->getNumOperands(), GEPR->getNumOperands()))
@@ -829,8 +852,8 @@ int FunctionComparator::compareSignature() const {
// Visit the arguments so that they get enumerated in the order they're
// passed in.
for (Function::const_arg_iterator ArgLI = FnL->arg_begin(),
- ArgRI = FnR->arg_begin(),
- ArgLE = FnL->arg_end();
+ ArgRI = FnR->arg_begin(),
+ ArgLE = FnL->arg_end();
ArgLI != ArgLE; ++ArgLI, ++ArgRI) {
if (cmpValues(&*ArgLI, &*ArgRI) != 0)
llvm_unreachable("Arguments repeat!");
@@ -897,9 +920,7 @@ public:
// Initialize to random constant, so the state isn't zero.
HashAccumulator64() { Hash = 0x6acaa36bef8325c5ULL; }
- void add(uint64_t V) {
- Hash = hashing::detail::hash_16_bytes(Hash, V);
- }
+ void add(uint64_t V) { Hash = hashing::detail::hash_16_bytes(Hash, V); }
// No finishing is required, because the entire hash value is used.
uint64_t getHash() { return Hash; }
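
The new shuffle-mask handling follows the same "return the first non-zero comparison" pattern used throughout FunctionComparator. A small self-contained illustration of that pattern in plain C++ (not LLVM code; cmpMasks is an invented name) is:

    #include <cstdint>
    #include <vector>

    static int cmpNumbers(uint64_t L, uint64_t R) {
      if (L < R)
        return -1;
      if (L > R)
        return 1;
      return 0;
    }

    // Lexicographic three-way comparison of two masks: shorter sorts first,
    // then the first differing element decides. Elements are compared as
    // unsigned values, mirroring cmpNumbers above.
    static int cmpMasks(const std::vector<int> &L, const std::vector<int> &R) {
      if (int Res = cmpNumbers(L.size(), R.size()))
        return Res;
      for (size_t I = 0, E = L.size(); I != E; ++I)
        if (int Res = cmpNumbers(L[I], R[I]))
          return Res;
      return 0;
    }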
diff --git a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
index 26d48ee0d23f..8df7ae9563d8 100644
--- a/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
+++ b/llvm/lib/Transforms/Utils/FunctionImportUtils.cpp
@@ -212,13 +212,6 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
}
}
}
- // Check the summaries to see if the symbol gets resolved to a known local
- // definition.
- if (VI && VI.isDSOLocal()) {
- GV.setDSOLocal(true);
- if (GV.hasDLLImportStorageClass())
- GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
- }
}
// We should always have a ValueInfo (i.e. GV in index) for definitions when
@@ -280,6 +273,20 @@ void FunctionImportGlobalProcessing::processGlobalForThinLTO(GlobalValue &GV) {
} else
GV.setLinkage(getLinkage(&GV, /* DoPromote */ false));
+ // When ClearDSOLocalOnDeclarations is true, clear dso_local if GV is
+ // converted to a declaration, to disable direct access. Don't do this if GV
+ // is implicitly dso_local due to a non-default visibility.
+ if (ClearDSOLocalOnDeclarations && GV.isDeclarationForLinker() &&
+ !GV.isImplicitDSOLocal()) {
+ GV.setDSOLocal(false);
+ } else if (VI && VI.isDSOLocal()) {
+ // If all summaries are dso_local, symbol gets resolved to a known local
+ // definition.
+ GV.setDSOLocal(true);
+ if (GV.hasDLLImportStorageClass())
+ GV.setDLLStorageClass(GlobalValue::DefaultStorageClass);
+ }
+
// Remove functions imported as available externally defs from comdats,
// as this is a declaration for the linker, and will be dropped eventually.
// It is illegal for comdats to contain declarations.
@@ -319,7 +326,9 @@ bool FunctionImportGlobalProcessing::run() {
}
bool llvm::renameModuleForThinLTO(Module &M, const ModuleSummaryIndex &Index,
+ bool ClearDSOLocalOnDeclarations,
SetVector<GlobalValue *> *GlobalsToImport) {
- FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport);
+ FunctionImportGlobalProcessing ThinLTOProcessing(M, Index, GlobalsToImport,
+ ClearDSOLocalOnDeclarations);
return ThinLTOProcessing.run();
}
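
A hedged usage sketch for the updated renameModuleForThinLTO entry point shown above; where M and Index come from, and the wrapper name renameForImport, are assumptions for illustration, not taken from this patch.

    #include "llvm/IR/Module.h"
    #include "llvm/IR/ModuleSummaryIndex.h"
    #include "llvm/Transforms/Utils/FunctionImportUtils.h"

    static bool renameForImport(llvm::Module &M,
                                const llvm::ModuleSummaryIndex &Index) {
      // Request that dso_local be cleared on symbols that become declarations
      // for the linker; no explicit set of globals to import is supplied.
      return llvm::renameModuleForThinLTO(M, Index,
                                          /*ClearDSOLocalOnDeclarations=*/true,
                                          /*GlobalsToImport=*/nullptr);
    }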
diff --git a/llvm/lib/Transforms/Utils/GlobalStatus.cpp b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
index a2942869130d..fe58f0e0fe40 100644
--- a/llvm/lib/Transforms/Utils/GlobalStatus.cpp
+++ b/llvm/lib/Transforms/Utils/GlobalStatus.cpp
@@ -9,7 +9,6 @@
#include "llvm/Transforms/Utils/GlobalStatus.h"
#include "llvm/ADT/SmallPtrSet.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/GlobalValue.h"
@@ -164,8 +163,8 @@ static bool analyzeGlobalAux(const Value *V, GlobalStatus &GS,
if (MSI->isVolatile())
return true;
GS.StoredType = GlobalStatus::Stored;
- } else if (auto C = ImmutableCallSite(I)) {
- if (!C.isCallee(&U))
+ } else if (const auto *CB = dyn_cast<CallBase>(I)) {
+ if (!CB->isCallee(&U))
return true;
GS.IsLoaded = true;
} else {
diff --git a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
index 9192e74b9ace..9d8f59d62d6d 100644
--- a/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
+++ b/llvm/lib/Transforms/Utils/InjectTLIMappings.cpp
@@ -13,8 +13,12 @@
#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/DemandedBits.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/InstIterator.h"
+#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Transforms/Utils.h"
#include "llvm/Transforms/Utils/ModuleUtils.h"
@@ -30,40 +34,6 @@ STATISTIC(NumVFDeclAdded,
STATISTIC(NumCompUsedAdded,
"Number of `@llvm.compiler.used` operands that have been added.");
-/// Helper function to map the TLI name to a strings that holds
-/// scalar-to-vector mapping.
-///
-/// _ZGV<isa><mask><vlen><vparams>_<scalarname>(<vectorname>)
-///
-/// where:
-///
-/// <isa> = "_LLVM_"
-/// <mask> = "N". Note: TLI does not support masked interfaces.
-/// <vlen> = Number of concurrent lanes, stored in the `VectorizationFactor`
-/// field of the `VecDesc` struct.
-/// <vparams> = "v", as many as are the number of parameters of CI.
-/// <scalarname> = the name of the scalar function called by CI.
-/// <vectorname> = the name of the vector function mapped by the TLI.
-static std::string mangleTLIName(StringRef VectorName, const CallInst &CI,
- unsigned VF) {
- SmallString<256> Buffer;
- llvm::raw_svector_ostream Out(Buffer);
- Out << "_ZGV" << VFABI::_LLVM_ << "N" << VF;
- for (unsigned I = 0; I < CI.getNumArgOperands(); ++I)
- Out << "v";
- Out << "_" << CI.getCalledFunction()->getName() << "(" << VectorName << ")";
- return Out.str();
-}
-
-/// A helper function for converting Scalar types to vector types.
-/// If the incoming type is void, we return void. If the VF is 1, we return
-/// the scalar type.
-static Type *ToVectorTy(Type *Scalar, unsigned VF, bool isScalable = false) {
- if (Scalar->isVoidTy() || VF == 1)
- return Scalar;
- return VectorType::get(Scalar, {VF, isScalable});
-}
-
/// A helper function that adds the vector function declaration that
/// vectorizes the CallInst CI with a vectorization factor of VF
/// lanes. The TLI assumes that all parameters and the return type of
@@ -107,7 +77,7 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
if (CI.isNoBuiltin() || !CI.getCalledFunction())
return;
- const std::string ScalarName = CI.getCalledFunction()->getName();
+ const std::string ScalarName = std::string(CI.getCalledFunction()->getName());
// Nothing to be done if the TLI thinks the function is not
// vectorizable.
if (!TLI.isFunctionVectorizable(ScalarName))
@@ -120,9 +90,11 @@ static void addMappingsFromTLI(const TargetLibraryInfo &TLI, CallInst &CI) {
// All VFs in the TLI are powers of 2.
for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName); VF <= WidestVF;
VF *= 2) {
- const std::string TLIName = TLI.getVectorizedFunction(ScalarName, VF);
+ const std::string TLIName =
+ std::string(TLI.getVectorizedFunction(ScalarName, VF));
if (!TLIName.empty()) {
- std::string MangledName = mangleTLIName(TLIName, CI, VF);
+ std::string MangledName = VFABI::mangleTLIVectorName(
+ TLIName, ScalarName, CI.getNumArgOperands(), VF);
if (!OriginalSetOfMappings.count(MangledName)) {
Mappings.push_back(MangledName);
++NumCallInjected;
@@ -168,6 +140,12 @@ void InjectTLIMappingsLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
AU.setPreservesCFG();
AU.addRequired<TargetLibraryInfoWrapperPass>();
AU.addPreserved<TargetLibraryInfoWrapperPass>();
+ AU.addPreserved<ScalarEvolutionWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<LoopAccessLegacyAnalysis>();
+ AU.addPreserved<DemandedBitsWrapperPass>();
+ AU.addPreserved<OptimizationRemarkEmitterWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
}
////////////////////////////////////////////////////////////////////////////////
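
For reference, the mangling previously done by the removed helper (and now provided by VFABI::mangleTLIVectorName) can be sketched as below. This only illustrates the string format; the sketch name and the __svml_sinf4 mapping in the usage comment are examples, not part of the patch.

    #include <string>

    // Build "_ZGV<isa><mask><vlen><params>_<scalar>(<vector>)" for an
    // unmasked mapping with the "_LLVM_" ISA token, one 'v' per parameter.
    static std::string mangleTLIVectorNameSketch(const std::string &VectorName,
                                                 const std::string &ScalarName,
                                                 unsigned NumArgs, unsigned VF) {
      std::string Out = "_ZGV_LLVM_N" + std::to_string(VF);
      for (unsigned I = 0; I < NumArgs; ++I)
        Out += "v";
      Out += "_" + ScalarName + "(" + VectorName + ")";
      return Out;
    }
    // e.g. mangleTLIVectorNameSketch("__svml_sinf4", "sinf", 1, 4)
    //      yields "_ZGV_LLVM_N4v_sinf(__svml_sinf4)".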
diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp
index 6da612eb4e65..b0b7ca484798 100644
--- a/llvm/lib/Transforms/Utils/InlineFunction.cpp
+++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp
@@ -34,7 +34,6 @@
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DIBuilder.h"
@@ -60,6 +59,7 @@
#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
@@ -79,16 +79,23 @@ EnableNoAliasConversion("enable-noalias-to-md-conversion", cl::init(true),
cl::Hidden,
cl::desc("Convert noalias attributes to metadata during inlining."));
+// Disabled by default, because the added alignment assumptions may increase
+// compile-time and block optimizations. This option is not suitable for use
+// with frontends that emit comprehensive parameter alignment annotations.
static cl::opt<bool>
PreserveAlignmentAssumptions("preserve-alignment-assumptions-during-inlining",
- cl::init(true), cl::Hidden,
+ cl::init(false), cl::Hidden,
cl::desc("Convert align attributes to assumptions during inlining."));
-llvm::InlineResult llvm::InlineFunction(CallBase *CB, InlineFunctionInfo &IFI,
- AAResults *CalleeAAR,
- bool InsertLifetime) {
- return InlineFunction(CallSite(CB), IFI, CalleeAAR, InsertLifetime);
-}
+static cl::opt<bool> UpdateReturnAttributes(
+ "update-return-attrs", cl::init(true), cl::Hidden,
+ cl::desc("Update return attributes on calls within inlined body"));
+
+static cl::opt<unsigned> InlinerAttributeWindow(
+ "max-inst-checked-for-throw-during-inlining", cl::Hidden,
+ cl::desc("the maximum number of instructions analyzed for may throw during "
+ "attribute inference in inlined body"),
+ cl::init(4));
namespace {
@@ -530,7 +537,7 @@ static BasicBlock *HandleCallsInBlockInlinedThroughInvoke(
// instructions require no special handling.
CallInst *CI = dyn_cast<CallInst>(I);
- if (!CI || CI->doesNotThrow() || isa<InlineAsm>(CI->getCalledValue()))
+ if (!CI || CI->doesNotThrow() || CI->isInlineAsm())
continue;
// We do not need to (and in fact, cannot) convert possibly throwing calls
@@ -767,12 +774,10 @@ static void HandleInlinedEHPad(InvokeInst *II, BasicBlock *FirstNewBlock,
/// When inlining a call site that has !llvm.mem.parallel_loop_access or
/// llvm.access.group metadata, that metadata should be propagated to all
/// memory-accessing cloned instructions.
-static void PropagateParallelLoopAccessMetadata(CallSite CS,
+static void PropagateParallelLoopAccessMetadata(CallBase &CB,
ValueToValueMapTy &VMap) {
- MDNode *M =
- CS.getInstruction()->getMetadata(LLVMContext::MD_mem_parallel_loop_access);
- MDNode *CallAccessGroup =
- CS.getInstruction()->getMetadata(LLVMContext::MD_access_group);
+ MDNode *M = CB.getMetadata(LLVMContext::MD_mem_parallel_loop_access);
+ MDNode *CallAccessGroup = CB.getMetadata(LLVMContext::MD_access_group);
if (!M && !CallAccessGroup)
return;
@@ -810,8 +815,8 @@ static void PropagateParallelLoopAccessMetadata(CallSite CS,
/// not be differentiated (and this would lead to miscompiles because the
/// non-aliasing property communicated by the metadata could have
/// call-site-specific control dependencies).
-static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
- const Function *CalledFunc = CS.getCalledFunction();
+static void CloneAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap) {
+ const Function *CalledFunc = CB.getCalledFunction();
SetVector<const MDNode *> MD;
// Note: We could only clone the metadata if it is already used in the
@@ -886,13 +891,11 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
// If the call site also had alias scope metadata (a list of scopes to
// which instructions inside it might belong), propagate those scopes to
// the inlined instructions.
- if (MDNode *CSM =
- CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope))
+ if (MDNode *CSM = CB.getMetadata(LLVMContext::MD_alias_scope))
NewMD = MDNode::concatenate(NewMD, CSM);
NI->setMetadata(LLVMContext::MD_alias_scope, NewMD);
} else if (NI->mayReadOrWriteMemory()) {
- if (MDNode *M =
- CS.getInstruction()->getMetadata(LLVMContext::MD_alias_scope))
+ if (MDNode *M = CB.getMetadata(LLVMContext::MD_alias_scope))
NI->setMetadata(LLVMContext::MD_alias_scope, M);
}
@@ -901,12 +904,11 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
// If the call site also had noalias metadata (a list of scopes with
// which instructions inside it don't alias), propagate those scopes to
// the inlined instructions.
- if (MDNode *CSM =
- CS.getInstruction()->getMetadata(LLVMContext::MD_noalias))
+ if (MDNode *CSM = CB.getMetadata(LLVMContext::MD_noalias))
NewMD = MDNode::concatenate(NewMD, CSM);
NI->setMetadata(LLVMContext::MD_noalias, NewMD);
} else if (NI->mayReadOrWriteMemory()) {
- if (MDNode *M = CS.getInstruction()->getMetadata(LLVMContext::MD_noalias))
+ if (MDNode *M = CB.getMetadata(LLVMContext::MD_noalias))
NI->setMetadata(LLVMContext::MD_noalias, M);
}
}
@@ -916,16 +918,16 @@ static void CloneAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap) {
/// then add new alias scopes for each noalias argument, tag the mapped noalias
/// parameters with noalias metadata specifying the new scope, and tag all
/// non-derived loads, stores and memory intrinsics with the new alias scopes.
-static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
+static void AddAliasScopeMetadata(CallBase &CB, ValueToValueMapTy &VMap,
const DataLayout &DL, AAResults *CalleeAAR) {
if (!EnableNoAliasConversion)
return;
- const Function *CalledFunc = CS.getCalledFunction();
+ const Function *CalledFunc = CB.getCalledFunction();
SmallVector<const Argument *, 4> NoAliasArgs;
for (const Argument &Arg : CalledFunc->args())
- if (Arg.hasNoAliasAttr() && !Arg.use_empty())
+ if (CB.paramHasAttr(Arg.getArgNo(), Attribute::NoAlias) && !Arg.use_empty())
NoAliasArgs.push_back(&Arg);
if (NoAliasArgs.empty())
@@ -951,7 +953,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
for (unsigned i = 0, e = NoAliasArgs.size(); i != e; ++i) {
const Argument *A = NoAliasArgs[i];
- std::string Name = CalledFunc->getName();
+ std::string Name = std::string(CalledFunc->getName());
if (A->hasName()) {
Name += ": %";
Name += A->getName();
@@ -1002,8 +1004,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
IsFuncCall = true;
if (CalleeAAR) {
FunctionModRefBehavior MRB = CalleeAAR->getModRefBehavior(Call);
- if (MRB == FMRB_OnlyAccessesArgumentPointees ||
- MRB == FMRB_OnlyReadsArgumentPointees)
+ if (AAResults::onlyAccessesArgPointees(MRB))
IsArgMemOnlyCall = true;
}
@@ -1059,7 +1060,7 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
// completely describe the aliasing properties using alias.scope
// metadata (and, thus, won't add any).
if (const Argument *A = dyn_cast<Argument>(V)) {
- if (!A->hasNoAliasAttr())
+ if (!CB.paramHasAttr(A->getArgNo(), Attribute::NoAlias))
UsesAliasingPtr = true;
} else {
UsesAliasingPtr = true;
@@ -1136,37 +1137,128 @@ static void AddAliasScopeMetadata(CallSite CS, ValueToValueMapTy &VMap,
}
}
+static bool MayContainThrowingOrExitingCall(Instruction *Begin,
+ Instruction *End) {
+
+ assert(Begin->getParent() == End->getParent() &&
+ "Expected to be in same basic block!");
+ unsigned NumInstChecked = 0;
+ // Check that all instructions in the range [Begin, End) are guaranteed to
+ // transfer execution to successor.
+ for (auto &I : make_range(Begin->getIterator(), End->getIterator()))
+ if (NumInstChecked++ > InlinerAttributeWindow ||
+ !isGuaranteedToTransferExecutionToSuccessor(&I))
+ return true;
+ return false;
+}
+
+static AttrBuilder IdentifyValidAttributes(CallBase &CB) {
+
+ AttrBuilder AB(CB.getAttributes(), AttributeList::ReturnIndex);
+ if (AB.empty())
+ return AB;
+ AttrBuilder Valid;
+ // Only allow these whitelisted attributes to be propagated back to the
+ // callee. This is because other attributes may only be valid on the call
+ // itself, i.e. attributes such as signext and zeroext.
+ if (auto DerefBytes = AB.getDereferenceableBytes())
+ Valid.addDereferenceableAttr(DerefBytes);
+ if (auto DerefOrNullBytes = AB.getDereferenceableOrNullBytes())
+ Valid.addDereferenceableOrNullAttr(DerefOrNullBytes);
+ if (AB.contains(Attribute::NoAlias))
+ Valid.addAttribute(Attribute::NoAlias);
+ if (AB.contains(Attribute::NonNull))
+ Valid.addAttribute(Attribute::NonNull);
+ return Valid;
+}
+
+static void AddReturnAttributes(CallBase &CB, ValueToValueMapTy &VMap) {
+ if (!UpdateReturnAttributes)
+ return;
+
+ AttrBuilder Valid = IdentifyValidAttributes(CB);
+ if (Valid.empty())
+ return;
+ auto *CalledFunction = CB.getCalledFunction();
+ auto &Context = CalledFunction->getContext();
+
+ for (auto &BB : *CalledFunction) {
+ auto *RI = dyn_cast<ReturnInst>(BB.getTerminator());
+ if (!RI || !isa<CallBase>(RI->getOperand(0)))
+ continue;
+ auto *RetVal = cast<CallBase>(RI->getOperand(0));
+ // Sanity check that the cloned RetVal exists and is a call, otherwise we
+ // cannot add the attributes on the cloned RetVal.
+ // Simplification during inlining could have transformed the cloned
+ // instruction.
+ auto *NewRetVal = dyn_cast_or_null<CallBase>(VMap.lookup(RetVal));
+ if (!NewRetVal)
+ continue;
+ // Backward propagation of attributes to the returned value may be incorrect
+ // if it is control flow dependent.
+ // Consider:
+ // @callee {
+ // %rv = call @foo()
+ // %rv2 = call @bar()
+ // if (%rv2 != null)
+ // return %rv2
+ // if (%rv == null)
+ // exit()
+ // return %rv
+ // }
+ // caller() {
+ // %val = call nonnull @callee()
+ // }
+ // Here we cannot add the nonnull attribute on either foo or bar. So, we
+ // require that both RetVal and RI are in the same basic block and that
+ // there are no throwing/exiting instructions between them.
+ if (RI->getParent() != RetVal->getParent() ||
+ MayContainThrowingOrExitingCall(RetVal, RI))
+ continue;
+ // Add to the existing attributes of NewRetVal, i.e. the cloned call
+ // instruction.
+ // NB! When we have the same attribute already existing on NewRetVal, but
+ // with a differing value, the AttributeList's merge API honours the already
+ // existing attribute value (i.e. attributes such as dereferenceable,
+ // dereferenceable_or_null etc). See AttrBuilder::merge for more details.
+ AttributeList AL = NewRetVal->getAttributes();
+ AttributeList NewAL =
+ AL.addAttributes(Context, AttributeList::ReturnIndex, Valid);
+ NewRetVal->setAttributes(NewAL);
+ }
+}
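
To make the attribute-merging step above concrete, here is a hedged sketch of copying one whitelisted return attribute (only the nonnull case) from one call onto another; the names copyNonNullReturn, Site, and Inner are invented for illustration.

    #include "llvm/IR/Attributes.h"
    #include "llvm/IR/InstrTypes.h"
    using namespace llvm;

    static void copyNonNullReturn(const CallBase &Site, CallBase &Inner) {
      // Only proceed if the outer call site promises a nonnull return value.
      if (!Site.getAttributes().hasAttribute(AttributeList::ReturnIndex,
                                             Attribute::NonNull))
        return;
      AttrBuilder B;
      B.addAttribute(Attribute::NonNull);
      // Merge into the existing return attributes of the inner call.
      Inner.setAttributes(Inner.getAttributes().addAttributes(
          Inner.getContext(), AttributeList::ReturnIndex, B));
    }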
+
/// If the inlined function has non-byval align arguments, then
/// add @llvm.assume-based alignment assumptions to preserve this information.
-static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
+static void AddAlignmentAssumptions(CallBase &CB, InlineFunctionInfo &IFI) {
if (!PreserveAlignmentAssumptions || !IFI.GetAssumptionCache)
return;
- AssumptionCache *AC = &(*IFI.GetAssumptionCache)(*CS.getCaller());
- auto &DL = CS.getCaller()->getParent()->getDataLayout();
+ AssumptionCache *AC = &IFI.GetAssumptionCache(*CB.getCaller());
+ auto &DL = CB.getCaller()->getParent()->getDataLayout();
// To avoid inserting redundant assumptions, we should check for assumptions
// already in the caller. To do this, we might need a DT of the caller.
DominatorTree DT;
bool DTCalculated = false;
- Function *CalledFunc = CS.getCalledFunction();
+ Function *CalledFunc = CB.getCalledFunction();
for (Argument &Arg : CalledFunc->args()) {
unsigned Align = Arg.getType()->isPointerTy() ? Arg.getParamAlignment() : 0;
- if (Align && !Arg.hasByValOrInAllocaAttr() && !Arg.hasNUses(0)) {
+ if (Align && !Arg.hasPassPointeeByValueAttr() && !Arg.hasNUses(0)) {
if (!DTCalculated) {
- DT.recalculate(*CS.getCaller());
+ DT.recalculate(*CB.getCaller());
DTCalculated = true;
}
// If we can already prove the asserted alignment in the context of the
// caller, then don't bother inserting the assumption.
- Value *ArgVal = CS.getArgument(Arg.getArgNo());
- if (getKnownAlignment(ArgVal, DL, CS.getInstruction(), AC, &DT) >= Align)
+ Value *ArgVal = CB.getArgOperand(Arg.getArgNo());
+ if (getKnownAlignment(ArgVal, DL, &CB, AC, &DT) >= Align)
continue;
- CallInst *NewAsmp = IRBuilder<>(CS.getInstruction())
- .CreateAlignmentAssumption(DL, ArgVal, Align);
+ CallInst *NewAsmp =
+ IRBuilder<>(&CB).CreateAlignmentAssumption(DL, ArgVal, Align);
AC->registerAssumption(NewAsmp);
}
}
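
A hedged sketch of the single IRBuilder call that AddAlignmentAssumptions relies on, matching the overload used in the patch; the wrapper name emitAlignAssume is invented for illustration.

    #include "llvm/IR/IRBuilder.h"
    #include "llvm/IR/Module.h"
    using namespace llvm;

    static CallInst *emitAlignAssume(Instruction *InsertPt, Value *Ptr,
                                     unsigned Alignment) {
      const DataLayout &DL = InsertPt->getModule()->getDataLayout();
      // Emits a call to @llvm.assume encoding "Ptr is Alignment-byte aligned".
      return IRBuilder<>(InsertPt).CreateAlignmentAssumption(DL, Ptr, Alignment);
    }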
@@ -1176,13 +1268,13 @@ static void AddAlignmentAssumptions(CallSite CS, InlineFunctionInfo &IFI) {
/// update the specified callgraph to reflect the changes we made.
/// Note that it's possible that not all code was copied over, so only
/// some edges of the callgraph may remain.
-static void UpdateCallGraphAfterInlining(CallSite CS,
+static void UpdateCallGraphAfterInlining(CallBase &CB,
Function::iterator FirstNewBlock,
ValueToValueMapTy &VMap,
InlineFunctionInfo &IFI) {
CallGraph &CG = *IFI.CG;
- const Function *Caller = CS.getCaller();
- const Function *Callee = CS.getCalledFunction();
+ const Function *Caller = CB.getCaller();
+ const Function *Callee = CB.getCalledFunction();
CallGraphNode *CalleeNode = CG[Callee];
CallGraphNode *CallerNode = CG[Caller];
@@ -1199,7 +1291,11 @@ static void UpdateCallGraphAfterInlining(CallSite CS,
}
for (; I != E; ++I) {
- const Value *OrigCall = I->first;
+ // Skip 'reference' call records.
+ if (!I->first)
+ continue;
+
+ const Value *OrigCall = *I->first;
ValueToValueMapTy::iterator VMI = VMap.find(OrigCall);
// Only copy the edge if the call was inlined!
@@ -1240,7 +1336,7 @@ static void UpdateCallGraphAfterInlining(CallSite CS,
// Update the call graph by deleting the edge from Callee to Caller. We must
// do this after the loop above in case Caller and Callee are the same.
- CallerNode->removeCallEdgeFor(*cast<CallBase>(CS.getInstruction()));
+ CallerNode->removeCallEdgeFor(*cast<CallBase>(&CB));
}
static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
@@ -1254,8 +1350,8 @@ static void HandleByValArgumentInit(Value *Dst, Value *Src, Module *M,
// Always generate a memcpy of alignment 1 here because we don't know
// the alignment of the src pointer. Other optimizations can infer
// better alignment.
- Builder.CreateMemCpy(Dst, /*DstAlign*/ Align::None(), Src,
- /*SrcAlign*/ Align::None(), Size);
+ Builder.CreateMemCpy(Dst, /*DstAlign*/ Align(1), Src,
+ /*SrcAlign*/ Align(1), Size);
}
/// When inlining a call site that has a byval argument,
@@ -1281,12 +1377,12 @@ static Value *HandleByValArgument(Value *Arg, Instruction *TheCall,
return Arg;
AssumptionCache *AC =
- IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
// If the pointer is already known to be sufficiently aligned, or if we can
// round it up to a larger alignment, then we don't need a temporary.
- if (getOrEnforceKnownAlignment(Arg, ByValAlignment, DL, TheCall, AC) >=
- ByValAlignment)
+ if (getOrEnforceKnownAlignment(Arg, Align(ByValAlignment), DL, TheCall,
+ AC) >= ByValAlignment)
return Arg;
// Otherwise, we have to make a memcpy to get a safe alignment. This is bad
@@ -1356,34 +1452,6 @@ static DebugLoc inlineDebugLoc(DebugLoc OrigDL, DILocation *InlinedAt,
IA);
}
-/// Returns the LoopID for a loop which has has been cloned from another
-/// function for inlining with the new inlined-at start and end locs.
-static MDNode *inlineLoopID(const MDNode *OrigLoopId, DILocation *InlinedAt,
- LLVMContext &Ctx,
- DenseMap<const MDNode *, MDNode *> &IANodes) {
- assert(OrigLoopId && OrigLoopId->getNumOperands() > 0 &&
- "Loop ID needs at least one operand");
- assert(OrigLoopId && OrigLoopId->getOperand(0).get() == OrigLoopId &&
- "Loop ID should refer to itself");
-
- // Save space for the self-referential LoopID.
- SmallVector<Metadata *, 4> MDs = {nullptr};
-
- for (unsigned i = 1; i < OrigLoopId->getNumOperands(); ++i) {
- Metadata *MD = OrigLoopId->getOperand(i);
- // Update the DILocations to encode the inlined-at metadata.
- if (DILocation *DL = dyn_cast<DILocation>(MD))
- MDs.push_back(inlineDebugLoc(DL, InlinedAt, Ctx, IANodes));
- else
- MDs.push_back(MD);
- }
-
- MDNode *NewLoopID = MDNode::getDistinct(Ctx, MDs);
- // Insert the self-referential LoopID.
- NewLoopID->replaceOperandWith(0, NewLoopID);
- return NewLoopID;
-}
-
/// Update inlined instructions' line numbers to
/// to encode location where these instructions are inlined.
static void fixupLineNumbers(Function *Fn, Function::iterator FI,
@@ -1415,11 +1483,11 @@ static void fixupLineNumbers(Function *Fn, Function::iterator FI,
BI != BE; ++BI) {
// Loop metadata needs to be updated so that the start and end locs
// reference inlined-at locations.
- if (MDNode *LoopID = BI->getMetadata(LLVMContext::MD_loop)) {
- MDNode *NewLoopID =
- inlineLoopID(LoopID, InlinedAtNode, BI->getContext(), IANodes);
- BI->setMetadata(LLVMContext::MD_loop, NewLoopID);
- }
+ auto updateLoopInfoLoc = [&Ctx, &InlinedAtNode, &IANodes](
+ const DILocation &Loc) -> DILocation * {
+ return inlineDebugLoc(&Loc, InlinedAtNode, Ctx, IANodes).get();
+ };
+ updateLoopMetadataDebugLocations(*BI, updateLoopInfoLoc);
if (!NoInlineLineTables)
if (DebugLoc DL = BI->getDebugLoc()) {
@@ -1498,8 +1566,7 @@ static void updateCallerBFI(BasicBlock *CallSiteBlock,
/// Update the branch metadata for cloned call instructions.
static void updateCallProfile(Function *Callee, const ValueToValueMapTy &VMap,
const ProfileCount &CalleeEntryCount,
- const Instruction *TheCall,
- ProfileSummaryInfo *PSI,
+ const CallBase &TheCall, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *CallerBFI) {
if (!CalleeEntryCount.hasValue() || CalleeEntryCount.isSynthetic() ||
CalleeEntryCount.getCount() < 1)
@@ -1557,31 +1624,29 @@ void llvm::updateProfileCallee(
/// instruction 'call B' is inlined, and 'B' calls 'C', then the call to 'C' now
/// exists in the instruction stream. Similarly this will inline a recursive
/// function by one level.
-llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
+llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI,
AAResults *CalleeAAR,
bool InsertLifetime,
Function *ForwardVarArgsTo) {
- Instruction *TheCall = CS.getInstruction();
- assert(TheCall->getParent() && TheCall->getFunction()
- && "Instruction not in function!");
+ assert(CB.getParent() && CB.getFunction() && "Instruction not in function!");
// FIXME: we don't inline callbr yet.
- if (isa<CallBrInst>(TheCall))
- return false;
+ if (isa<CallBrInst>(CB))
+ return InlineResult::failure("We don't inline callbr yet.");
// If IFI has any state in it, zap it before we fill it in.
IFI.reset();
- Function *CalledFunc = CS.getCalledFunction();
+ Function *CalledFunc = CB.getCalledFunction();
if (!CalledFunc || // Can't inline external function or indirect
CalledFunc->isDeclaration()) // call!
- return "external or indirect";
+ return InlineResult::failure("external or indirect");
// The inliner does not know how to inline through calls with operand bundles
// in general ...
- if (CS.hasOperandBundles()) {
- for (int i = 0, e = CS.getNumOperandBundles(); i != e; ++i) {
- uint32_t Tag = CS.getOperandBundleAt(i).getTagID();
+ if (CB.hasOperandBundles()) {
+ for (int i = 0, e = CB.getNumOperandBundles(); i != e; ++i) {
+ uint32_t Tag = CB.getOperandBundleAt(i).getTagID();
// ... but it knows how to inline through "deopt" operand bundles ...
if (Tag == LLVMContext::OB_deopt)
continue;
@@ -1589,15 +1654,15 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (Tag == LLVMContext::OB_funclet)
continue;
- return "unsupported operand bundle";
+ return InlineResult::failure("unsupported operand bundle");
}
}
// If the call to the callee cannot throw, set the 'nounwind' flag on any
// calls that we inline.
- bool MarkNoUnwind = CS.doesNotThrow();
+ bool MarkNoUnwind = CB.doesNotThrow();
- BasicBlock *OrigBB = TheCall->getParent();
+ BasicBlock *OrigBB = CB.getParent();
Function *Caller = OrigBB->getParent();
// GC poses two hazards to inlining, which only occur when the callee has GC:
@@ -1608,7 +1673,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (!Caller->hasGC())
Caller->setGC(CalledFunc->getGC());
else if (CalledFunc->getGC() != Caller->getGC())
- return "incompatible GC";
+ return InlineResult::failure("incompatible GC");
}
// Get the personality function from the callee if it contains a landing pad.
@@ -1632,7 +1697,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// TODO: This isn't 100% true. Some personality functions are proper
// supersets of others and can be used in place of the other.
else if (CalledPersonality != CallerPersonality)
- return "incompatible personality";
+ return InlineResult::failure("incompatible personality");
}
// We need to figure out which funclet the callsite was in so that we may
@@ -1642,7 +1707,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
EHPersonality Personality = classifyEHPersonality(CallerPersonality);
if (isScopedEHPersonality(Personality)) {
Optional<OperandBundleUse> ParentFunclet =
- CS.getOperandBundle(LLVMContext::OB_funclet);
+ CB.getOperandBundle(LLVMContext::OB_funclet);
if (ParentFunclet)
CallSiteEHPad = cast<FuncletPadInst>(ParentFunclet->Inputs.front());
@@ -1657,7 +1722,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// for catchpads.
for (const BasicBlock &CalledBB : *CalledFunc) {
if (isa<CatchSwitchInst>(CalledBB.getFirstNonPHI()))
- return "catch in cleanup funclet";
+ return InlineResult::failure("catch in cleanup funclet");
}
}
} else if (isAsynchronousEHPersonality(Personality)) {
@@ -1665,7 +1730,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// funclet in the callee.
for (const BasicBlock &CalledBB : *CalledFunc) {
if (CalledBB.isEHPad())
- return "SEH in cleanup funclet";
+ return InlineResult::failure("SEH in cleanup funclet");
}
}
}
@@ -1675,7 +1740,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Determine if we are dealing with a call in an EHPad which does not unwind
// to caller.
bool EHPadForCallUnwindsLocally = false;
- if (CallSiteEHPad && CS.isCall()) {
+ if (CallSiteEHPad && isa<CallInst>(CB)) {
UnwindDestMemoTy FuncletUnwindMap;
Value *CallSiteUnwindDestToken =
getUnwindDestToken(CallSiteEHPad, FuncletUnwindMap);
@@ -1704,7 +1769,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Calculate the vector of arguments to pass into the function cloner, which
// matches up the formal to the actual argument values.
- CallSite::arg_iterator AI = CS.arg_begin();
+ auto AI = CB.arg_begin();
unsigned ArgNo = 0;
for (Function::arg_iterator I = CalledFunc->arg_begin(),
E = CalledFunc->arg_end(); I != E; ++I, ++AI, ++ArgNo) {
@@ -1714,8 +1779,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// by them explicit. However, we don't do this if the callee is readonly
// or readnone, because the copy would be unneeded: the callee doesn't
// modify the struct.
- if (CS.isByValArgument(ArgNo)) {
- ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI,
+ if (CB.isByValArgument(ArgNo)) {
+ ActualArg = HandleByValArgument(ActualArg, &CB, CalledFunc, IFI,
CalledFunc->getParamAlignment(ArgNo));
if (ActualArg != *AI)
ByValInit.push_back(std::make_pair(ActualArg, (Value*) *AI));
@@ -1724,10 +1789,17 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
VMap[&*I] = ActualArg;
}
+ // TODO: Remove this when users have been updated to the assume bundles.
// Add alignment assumptions if necessary. We do this before the inlined
// instructions are actually cloned into the caller so that we can easily
// check what will be known at the start of the inlined code.
- AddAlignmentAssumptions(CS, IFI);
+ AddAlignmentAssumptions(CB, IFI);
+
+ AssumptionCache *AC =
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
+
+ /// Preserve all attributes of the call and its parameters.
+ salvageKnowledge(&CB, AC);
// We want the inliner to prune the code as it copies. We would LOVE to
// have no dead or constant instructions leftover after inlining occurs
@@ -1735,7 +1807,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// happy with whatever the cloner can do.
CloneAndPruneFunctionInto(Caller, CalledFunc, VMap,
/*ModuleLevelChanges=*/false, Returns, ".i",
- &InlinedFunctionInfo, TheCall);
+ &InlinedFunctionInfo, &CB);
// Remember the first block that is newly cloned over.
FirstNewBlock = LastBlock; ++FirstNewBlock;
@@ -1744,7 +1816,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
updateCallerBFI(OrigBB, VMap, IFI.CallerBFI, IFI.CalleeBFI,
CalledFunc->front());
- updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), TheCall,
+ updateCallProfile(CalledFunc, VMap, CalledFunc->getEntryCount(), CB,
IFI.PSI, IFI.CallerBFI);
// Inject byval arguments initialization.
@@ -1753,21 +1825,22 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
&*FirstNewBlock, IFI);
Optional<OperandBundleUse> ParentDeopt =
- CS.getOperandBundle(LLVMContext::OB_deopt);
+ CB.getOperandBundle(LLVMContext::OB_deopt);
if (ParentDeopt) {
SmallVector<OperandBundleDef, 2> OpDefs;
for (auto &VH : InlinedFunctionInfo.OperandBundleCallSites) {
- Instruction *I = dyn_cast_or_null<Instruction>(VH);
- if (!I) continue; // instruction was DCE'd or RAUW'ed to undef
+ CallBase *ICS = dyn_cast_or_null<CallBase>(VH);
+ if (!ICS)
+ continue; // instruction was DCE'd or RAUW'ed to undef
OpDefs.clear();
- CallSite ICS(I);
- OpDefs.reserve(ICS.getNumOperandBundles());
+ OpDefs.reserve(ICS->getNumOperandBundles());
- for (unsigned i = 0, e = ICS.getNumOperandBundles(); i < e; ++i) {
- auto ChildOB = ICS.getOperandBundleAt(i);
+ for (unsigned COBi = 0, COBe = ICS->getNumOperandBundles(); COBi < COBe;
+ ++COBi) {
+ auto ChildOB = ICS->getOperandBundleAt(COBi);
if (ChildOB.getTagID() != LLVMContext::OB_deopt) {
// If the inlined call has other operand bundles, let them be
OpDefs.emplace_back(ChildOB);
@@ -1791,51 +1864,48 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
OpDefs.emplace_back("deopt", std::move(MergedDeoptArgs));
}
- Instruction *NewI = nullptr;
- if (isa<CallInst>(I))
- NewI = CallInst::Create(cast<CallInst>(I), OpDefs, I);
- else if (isa<CallBrInst>(I))
- NewI = CallBrInst::Create(cast<CallBrInst>(I), OpDefs, I);
- else
- NewI = InvokeInst::Create(cast<InvokeInst>(I), OpDefs, I);
+ Instruction *NewI = CallBase::Create(ICS, OpDefs, ICS);
// Note: the RAUW does the appropriate fixup in VMap, so we need to do
// this even if the call returns void.
- I->replaceAllUsesWith(NewI);
+ ICS->replaceAllUsesWith(NewI);
VH = nullptr;
- I->eraseFromParent();
+ ICS->eraseFromParent();
}
}
// Update the callgraph if requested.
if (IFI.CG)
- UpdateCallGraphAfterInlining(CS, FirstNewBlock, VMap, IFI);
+ UpdateCallGraphAfterInlining(CB, FirstNewBlock, VMap, IFI);
// For 'nodebug' functions, the associated DISubprogram is always null.
// Conservatively avoid propagating the callsite debug location to
// instructions inlined from a function whose DISubprogram is not null.
- fixupLineNumbers(Caller, FirstNewBlock, TheCall,
+ fixupLineNumbers(Caller, FirstNewBlock, &CB,
CalledFunc->getSubprogram() != nullptr);
// Clone existing noalias metadata if necessary.
- CloneAliasScopeMetadata(CS, VMap);
+ CloneAliasScopeMetadata(CB, VMap);
// Add noalias metadata if necessary.
- AddAliasScopeMetadata(CS, VMap, DL, CalleeAAR);
+ AddAliasScopeMetadata(CB, VMap, DL, CalleeAAR);
+
+ // Clone return attributes on the callsite into the calls within the inlined
+ // function which feed into its return value.
+ AddReturnAttributes(CB, VMap);
// Propagate llvm.mem.parallel_loop_access if necessary.
- PropagateParallelLoopAccessMetadata(CS, VMap);
+ PropagateParallelLoopAccessMetadata(CB, VMap);
// Register any cloned assumptions.
if (IFI.GetAssumptionCache)
for (BasicBlock &NewBlock :
make_range(FirstNewBlock->getIterator(), Caller->end()))
- for (Instruction &I : NewBlock) {
+ for (Instruction &I : NewBlock)
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::assume)
- (*IFI.GetAssumptionCache)(*Caller).registerAssumption(II);
- }
+ IFI.GetAssumptionCache(*Caller).registerAssumption(II);
}
// If there are any alloca instructions in the block that used to be the entry
@@ -1877,24 +1947,20 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
Caller->getEntryBlock().getInstList().splice(
InsertPoint, FirstNewBlock->getInstList(), AI->getIterator(), I);
}
- // Move any dbg.declares describing the allocas into the entry basic block.
- DIBuilder DIB(*Caller->getParent());
- for (auto &AI : IFI.StaticAllocas)
- replaceDbgDeclareForAlloca(AI, AI, DIB, DIExpression::ApplyOffset, 0);
}
SmallVector<Value*,4> VarArgsToForward;
SmallVector<AttributeSet, 4> VarArgsAttrs;
for (unsigned i = CalledFunc->getFunctionType()->getNumParams();
- i < CS.getNumArgOperands(); i++) {
- VarArgsToForward.push_back(CS.getArgOperand(i));
- VarArgsAttrs.push_back(CS.getAttributes().getParamAttributes(i));
+ i < CB.getNumArgOperands(); i++) {
+ VarArgsToForward.push_back(CB.getArgOperand(i));
+ VarArgsAttrs.push_back(CB.getAttributes().getParamAttributes(i));
}
bool InlinedMustTailCalls = false, InlinedDeoptimizeCalls = false;
if (InlinedFunctionInfo.ContainsCalls) {
CallInst::TailCallKind CallSiteTailKind = CallInst::TCK_None;
- if (CallInst *CI = dyn_cast<CallInst>(TheCall))
+ if (CallInst *CI = dyn_cast<CallInst>(&CB))
CallSiteTailKind = CI->getTailCallKind();
// For inlining purposes, the "notail" marker is the same as no marker.
@@ -2056,7 +2122,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// any call instructions into invoke instructions. This is sensitive to which
// funclet pads were top-level in the inlinee, so must be done before
// rewriting the "parent pad" links.
- if (auto *II = dyn_cast<InvokeInst>(TheCall)) {
+ if (auto *II = dyn_cast<InvokeInst>(&CB)) {
BasicBlock *UnwindDest = II->getUnwindDest();
Instruction *FirstNonPHI = UnwindDest->getFirstNonPHI();
if (isa<LandingPadInst>(FirstNonPHI)) {
@@ -2077,31 +2143,24 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Add bundle operands to any top-level call sites.
SmallVector<OperandBundleDef, 1> OpBundles;
for (BasicBlock::iterator BBI = BB->begin(), E = BB->end(); BBI != E;) {
- Instruction *I = &*BBI++;
- CallSite CS(I);
- if (!CS)
+ CallBase *I = dyn_cast<CallBase>(&*BBI++);
+ if (!I)
continue;
// Skip call sites which are nounwind intrinsics.
auto *CalledFn =
- dyn_cast<Function>(CS.getCalledValue()->stripPointerCasts());
- if (CalledFn && CalledFn->isIntrinsic() && CS.doesNotThrow())
+ dyn_cast<Function>(I->getCalledOperand()->stripPointerCasts());
+ if (CalledFn && CalledFn->isIntrinsic() && I->doesNotThrow())
continue;
// Skip call sites which already have a "funclet" bundle.
- if (CS.getOperandBundle(LLVMContext::OB_funclet))
+ if (I->getOperandBundle(LLVMContext::OB_funclet))
continue;
- CS.getOperandBundlesAsDefs(OpBundles);
+ I->getOperandBundlesAsDefs(OpBundles);
OpBundles.emplace_back("funclet", CallSiteEHPad);
- Instruction *NewInst;
- if (CS.isCall())
- NewInst = CallInst::Create(cast<CallInst>(I), OpBundles, I);
- else if (CS.isCallBr())
- NewInst = CallBrInst::Create(cast<CallBrInst>(I), OpBundles, I);
- else
- NewInst = InvokeInst::Create(cast<InvokeInst>(I), OpBundles, I);
+ Instruction *NewInst = CallBase::Create(I, OpBundles, I);
NewInst->takeName(I);
I->replaceAllUsesWith(NewInst);
I->eraseFromParent();
@@ -2138,7 +2197,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// caller (but terminate it instead). If the caller's return type does not
// match the callee's return type, we also need to change the return type of
// the intrinsic.
- if (Caller->getReturnType() == TheCall->getType()) {
+ if (Caller->getReturnType() == CB.getType()) {
auto NewEnd = llvm::remove_if(Returns, [](ReturnInst *RI) {
return RI->getParent()->getTerminatingDeoptimizeCall() != nullptr;
});
@@ -2197,7 +2256,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (InlinedMustTailCalls) {
// Check if we need to bitcast the result of any musttail calls.
Type *NewRetTy = Caller->getReturnType();
- bool NeedBitCast = !TheCall->use_empty() && TheCall->getType() != NewRetTy;
+ bool NeedBitCast = !CB.use_empty() && CB.getType() != NewRetTy;
// Handle the returns preceded by musttail calls separately.
SmallVector<ReturnInst *, 8> NormalReturns;
@@ -2237,8 +2296,8 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
for (BasicBlock &NewBB :
make_range(FirstNewBlock->getIterator(), Caller->end()))
for (Instruction &I : NewBB)
- if (auto CS = CallSite(&I))
- IFI.InlinedCallSites.push_back(CS);
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ IFI.InlinedCallSites.push_back(CB);
}
// If we cloned in _exactly one_ basic block, and if that block ends in a
@@ -2246,36 +2305,35 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// the calling basic block.
if (Returns.size() == 1 && std::distance(FirstNewBlock, Caller->end()) == 1) {
// Move all of the instructions right before the call.
- OrigBB->getInstList().splice(TheCall->getIterator(),
- FirstNewBlock->getInstList(),
+ OrigBB->getInstList().splice(CB.getIterator(), FirstNewBlock->getInstList(),
FirstNewBlock->begin(), FirstNewBlock->end());
// Remove the cloned basic block.
Caller->getBasicBlockList().pop_back();
// If the call site was an invoke instruction, add a branch to the normal
// destination.
- if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
- BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), TheCall);
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
+ BranchInst *NewBr = BranchInst::Create(II->getNormalDest(), &CB);
NewBr->setDebugLoc(Returns[0]->getDebugLoc());
}
// If the return instruction returned a value, replace uses of the call with
// uses of the returned value.
- if (!TheCall->use_empty()) {
+ if (!CB.use_empty()) {
ReturnInst *R = Returns[0];
- if (TheCall == R->getReturnValue())
- TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ if (&CB == R->getReturnValue())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
else
- TheCall->replaceAllUsesWith(R->getReturnValue());
+ CB.replaceAllUsesWith(R->getReturnValue());
}
// Since we are now done with the Call/Invoke, we can delete it.
- TheCall->eraseFromParent();
+ CB.eraseFromParent();
// Since we are now done with the return instruction, delete it also.
Returns[0]->eraseFromParent();
// We are now done with the inlining.
- return true;
+ return InlineResult::success();
}
// Otherwise, we have the normal case, of more than one block to inline or
@@ -2286,10 +2344,10 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// this is an invoke instruction or a call instruction.
BasicBlock *AfterCallBB;
BranchInst *CreatedBranchToNormalDest = nullptr;
- if (InvokeInst *II = dyn_cast<InvokeInst>(TheCall)) {
+ if (InvokeInst *II = dyn_cast<InvokeInst>(&CB)) {
// Add an unconditional branch to make this look like the CallInst case...
- CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), TheCall);
+ CreatedBranchToNormalDest = BranchInst::Create(II->getNormalDest(), &CB);
// Split the basic block. This guarantees that no PHI nodes will have to be
// updated due to new incoming edges, and make the invoke case more
@@ -2298,11 +2356,11 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
OrigBB->splitBasicBlock(CreatedBranchToNormalDest->getIterator(),
CalledFunc->getName() + ".exit");
- } else { // It's a call
+ } else { // It's a call
// If this is a call instruction, we need to split the basic block that
// the call lives in.
//
- AfterCallBB = OrigBB->splitBasicBlock(TheCall->getIterator(),
+ AfterCallBB = OrigBB->splitBasicBlock(CB.getIterator(),
CalledFunc->getName() + ".exit");
}
@@ -2335,12 +2393,12 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
if (Returns.size() > 1) {
// The PHI node should go at the front of the new basic block to merge all
// possible incoming values.
- if (!TheCall->use_empty()) {
- PHI = PHINode::Create(RTy, Returns.size(), TheCall->getName(),
+ if (!CB.use_empty()) {
+ PHI = PHINode::Create(RTy, Returns.size(), CB.getName(),
&AfterCallBB->front());
// Anything that used the result of the function call should now use the
// PHI node as their operand.
- TheCall->replaceAllUsesWith(PHI);
+ CB.replaceAllUsesWith(PHI);
}
// Loop over all of the return instructions adding entries to the PHI node
@@ -2372,11 +2430,11 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
} else if (!Returns.empty()) {
// Otherwise, if there is exactly one return value, just replace anything
// using the return value of the call with the computed value.
- if (!TheCall->use_empty()) {
- if (TheCall == Returns[0]->getReturnValue())
- TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ if (!CB.use_empty()) {
+ if (&CB == Returns[0]->getReturnValue())
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
else
- TheCall->replaceAllUsesWith(Returns[0]->getReturnValue());
+ CB.replaceAllUsesWith(Returns[0]->getReturnValue());
}
// Update PHI nodes that use the ReturnBB to use the AfterCallBB.
@@ -2394,14 +2452,14 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// Delete the return instruction now and empty ReturnBB now.
Returns[0]->eraseFromParent();
ReturnBB->eraseFromParent();
- } else if (!TheCall->use_empty()) {
+ } else if (!CB.use_empty()) {
// No returns, but something is using the return value of the call. Just
// nuke the result.
- TheCall->replaceAllUsesWith(UndefValue::get(TheCall->getType()));
+ CB.replaceAllUsesWith(UndefValue::get(CB.getType()));
}
// Since we are now done with the Call/Invoke, we can delete it.
- TheCall->eraseFromParent();
+ CB.eraseFromParent();
// If we inlined any musttail calls and the original return is now
// unreachable, delete it. It can only contain a bitcast and ret.
@@ -2429,7 +2487,7 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
// block other optimizations.
if (PHI) {
AssumptionCache *AC =
- IFI.GetAssumptionCache ? &(*IFI.GetAssumptionCache)(*Caller) : nullptr;
+ IFI.GetAssumptionCache ? &IFI.GetAssumptionCache(*Caller) : nullptr;
auto &DL = Caller->getParent()->getDataLayout();
if (Value *V = SimplifyInstruction(PHI, {DL, nullptr, nullptr, AC})) {
PHI->replaceAllUsesWith(V);
@@ -2437,5 +2495,5 @@ llvm::InlineResult llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI,
}
}
- return true;
+ return InlineResult::success();
}
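
The InlineFunction.cpp changes above are part of the tree-wide removal of CallSite: call-site queries now go through CallBase, getCalledValue() becomes getCalledOperand(), and the bare bool return is replaced by InlineResult::success() / InlineResult::failure("reason"). A minimal sketch of the replacement pattern, not taken from the patch itself:

// Sketch only: the CallSite -> CallBase pattern used throughout this patch.
#include "llvm/IR/InstrTypes.h"   // CallBase
#include "llvm/IR/Instruction.h"
#include "llvm/Support/Casting.h" // dyn_cast

using namespace llvm;

// Before:  CallSite CS(I); if (!CS) return; ... CS.getCalledValue(); ...
// After:
static void visitCallSite(Instruction *I) {
  auto *CB = dyn_cast<CallBase>(I); // covers CallInst, InvokeInst, CallBrInst
  if (!CB)
    return;
  Value *Callee = CB->getCalledOperand();     // was CS.getCalledValue()
  unsigned NumArgs = CB->getNumArgOperands(); // was CS.getNumArgOperands()
  (void)Callee;
  (void)NumArgs;
}
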
diff --git a/llvm/lib/Transforms/Utils/InstructionNamer.cpp b/llvm/lib/Transforms/Utils/InstructionNamer.cpp
index aac0b55801c4..8e339fe46d45 100644
--- a/llvm/lib/Transforms/Utils/InstructionNamer.cpp
+++ b/llvm/lib/Transforms/Utils/InstructionNamer.cpp
@@ -42,7 +42,7 @@ namespace {
for (Instruction &I : BB)
if (!I.hasName() && !I.getType()->isVoidTy())
- I.setName("tmp");
+ I.setName("i");
}
return true;
}
diff --git a/llvm/lib/Transforms/Utils/LCSSA.cpp b/llvm/lib/Transforms/Utils/LCSSA.cpp
index 5746d69260d5..b1a1c564d217 100644
--- a/llvm/lib/Transforms/Utils/LCSSA.cpp
+++ b/llvm/lib/Transforms/Utils/LCSSA.cpp
@@ -76,7 +76,7 @@ static bool isExitBlock(BasicBlock *BB,
/// that are outside the current loop. If so, insert LCSSA PHI nodes and
/// rewrite the uses.
bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
- DominatorTree &DT, LoopInfo &LI,
+ const DominatorTree &DT, const LoopInfo &LI,
ScalarEvolution *SE) {
SmallVector<Use *, 16> UsesToRewrite;
SmallSetVector<PHINode *, 16> PHIsToRemove;
@@ -128,7 +128,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
if (auto *Inv = dyn_cast<InvokeInst>(I))
DomBB = Inv->getNormalDest();
- DomTreeNode *DomNode = DT.getNode(DomBB);
+ const DomTreeNode *DomNode = DT.getNode(DomBB);
SmallVector<PHINode *, 16> AddedPHIs;
SmallVector<PHINode *, 8> PostProcessPHIs;
@@ -274,7 +274,7 @@ bool llvm::formLCSSAForInstructions(SmallVectorImpl<Instruction *> &Worklist,
// Compute the set of BasicBlocks in the loop `L` dominating at least one exit.
static void computeBlocksDominatingExits(
- Loop &L, DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
+ Loop &L, const DominatorTree &DT, SmallVector<BasicBlock *, 8> &ExitBlocks,
SmallSetVector<BasicBlock *, 8> &BlocksDominatingExits) {
SmallVector<BasicBlock *, 8> BBWorklist;
@@ -318,7 +318,7 @@ static void computeBlocksDominatingExits(
}
}
-bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
+bool llvm::formLCSSA(Loop &L, const DominatorTree &DT, const LoopInfo *LI,
ScalarEvolution *SE) {
bool Changed = false;
@@ -383,8 +383,8 @@ bool llvm::formLCSSA(Loop &L, DominatorTree &DT, LoopInfo *LI,
}
/// Process a loop nest depth first.
-bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
- ScalarEvolution *SE) {
+bool llvm::formLCSSARecursively(Loop &L, const DominatorTree &DT,
+ const LoopInfo *LI, ScalarEvolution *SE) {
bool Changed = false;
// Recurse depth-first through inner loops.
@@ -396,7 +396,7 @@ bool llvm::formLCSSARecursively(Loop &L, DominatorTree &DT, LoopInfo *LI,
}
/// Process all loops in the function, inner-most out.
-static bool formLCSSAOnAllLoops(LoopInfo *LI, DominatorTree &DT,
+static bool formLCSSAOnAllLoops(const LoopInfo *LI, const DominatorTree &DT,
ScalarEvolution *SE) {
bool Changed = false;
for (auto &L : *LI)
diff --git a/llvm/lib/Transforms/Utils/Local.cpp b/llvm/lib/Transforms/Utils/Local.cpp
index b2d511c7c9a9..da40c342af3a 100644
--- a/llvm/lib/Transforms/Utils/Local.cpp
+++ b/llvm/lib/Transforms/Utils/Local.cpp
@@ -25,6 +25,7 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/ADT/TinyPtrVector.h"
+#include "llvm/Analysis/AssumeBundleQueries.h"
#include "llvm/Analysis/ConstantFolding.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/EHPersonalities.h"
@@ -40,7 +41,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -75,6 +75,7 @@
#include "llvm/Support/ErrorHandling.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/ValueMapper.h"
#include <algorithm>
#include <cassert>
@@ -402,15 +403,29 @@ bool llvm::wouldInstructionBeTriviallyDead(Instruction *I,
II->getIntrinsicID() == Intrinsic::launder_invariant_group)
return true;
- // Lifetime intrinsics are dead when their right-hand is undef.
- if (II->isLifetimeStartOrEnd())
- return isa<UndefValue>(II->getArgOperand(1));
+ if (II->isLifetimeStartOrEnd()) {
+ auto *Arg = II->getArgOperand(1);
+ // Lifetime intrinsics are dead when their right-hand is undef.
+ if (isa<UndefValue>(Arg))
+ return true;
+ // If the right-hand is an alloc, global, or argument and the only uses
+ // are lifetime intrinsics then the intrinsics are dead.
+ if (isa<AllocaInst>(Arg) || isa<GlobalValue>(Arg) || isa<Argument>(Arg))
+ return llvm::all_of(Arg->uses(), [](Use &Use) {
+ if (IntrinsicInst *IntrinsicUse =
+ dyn_cast<IntrinsicInst>(Use.getUser()))
+ return IntrinsicUse->isLifetimeStartOrEnd();
+ return false;
+ });
+ return false;
+ }
// Assumptions are dead if their condition is trivially true. Guards on
// true are operationally no-ops. In the future we can consider more
// sophisticated tradeoffs for guards considering potential for check
// widening, but for now we keep things simple.
- if (II->getIntrinsicID() == Intrinsic::assume ||
+ if ((II->getIntrinsicID() == Intrinsic::assume &&
+ isAssumeWithEmptyBundle(*II)) ||
II->getIntrinsicID() == Intrinsic::experimental_guard) {
if (ConstantInt *Cond = dyn_cast<ConstantInt>(II->getArgOperand(0)))
return !Cond->isZero();
@@ -443,29 +458,49 @@ bool llvm::RecursivelyDeleteTriviallyDeadInstructions(
if (!I || !isInstructionTriviallyDead(I, TLI))
return false;
- SmallVector<Instruction*, 16> DeadInsts;
+ SmallVector<WeakTrackingVH, 16> DeadInsts;
DeadInsts.push_back(I);
RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU);
return true;
}
+bool llvm::RecursivelyDeleteTriviallyDeadInstructionsPermissive(
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
+ MemorySSAUpdater *MSSAU) {
+ unsigned S = 0, E = DeadInsts.size(), Alive = 0;
+ for (; S != E; ++S) {
+ auto *I = cast<Instruction>(DeadInsts[S]);
+ if (!isInstructionTriviallyDead(I)) {
+ DeadInsts[S] = nullptr;
+ ++Alive;
+ }
+ }
+ if (Alive == E)
+ return false;
+ RecursivelyDeleteTriviallyDeadInstructions(DeadInsts, TLI, MSSAU);
+ return true;
+}
+
void llvm::RecursivelyDeleteTriviallyDeadInstructions(
- SmallVectorImpl<Instruction *> &DeadInsts, const TargetLibraryInfo *TLI,
+ SmallVectorImpl<WeakTrackingVH> &DeadInsts, const TargetLibraryInfo *TLI,
MemorySSAUpdater *MSSAU) {
// Process the dead instruction list until empty.
while (!DeadInsts.empty()) {
- Instruction &I = *DeadInsts.pop_back_val();
- assert(I.use_empty() && "Instructions with uses are not dead.");
- assert(isInstructionTriviallyDead(&I, TLI) &&
+ Value *V = DeadInsts.pop_back_val();
+ Instruction *I = cast_or_null<Instruction>(V);
+ if (!I)
+ continue;
+ assert(isInstructionTriviallyDead(I, TLI) &&
"Live instruction found in dead worklist!");
+ assert(I->use_empty() && "Instructions with uses are not dead.");
// Don't lose the debug info while deleting the instructions.
- salvageDebugInfo(I);
+ salvageDebugInfo(*I);
// Null out all of the instruction's operands to see if any operand becomes
// dead as we go.
- for (Use &OpU : I.operands()) {
+ for (Use &OpU : I->operands()) {
Value *OpV = OpU.get();
OpU.set(nullptr);
@@ -480,9 +515,9 @@ void llvm::RecursivelyDeleteTriviallyDeadInstructions(
DeadInsts.push_back(OpI);
}
if (MSSAU)
- MSSAU->removeMemoryAccess(&I);
+ MSSAU->removeMemoryAccess(I);
- I.eraseFromParent();
+ I->eraseFromParent();
}
}
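
The dead-instruction worklist now holds WeakTrackingVH instead of Instruction*, and the new permissive variant above tolerates entries that are no longer trivially dead by nulling them out rather than asserting. A hedged usage sketch of the API added above:

// Sketch of a caller of the permissive API introduced above; entries that have
// become live again (or were RAUW'd) are skipped instead of tripping the
// asserts in the strict RecursivelyDeleteTriviallyDeadInstructions.
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/Transforms/Utils/Local.h"

using namespace llvm;

static bool deleteCandidates(SmallVectorImpl<WeakTrackingVH> &MaybeDead) {
  // Returns true if anything was actually deleted.
  return RecursivelyDeleteTriviallyDeadInstructionsPermissive(
      MaybeDead, /*TLI=*/nullptr, /*MSSAU=*/nullptr);
}
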
@@ -521,19 +556,20 @@ static bool areAllUsesEqual(Instruction *I) {
/// delete it. If that makes any of its operands trivially dead, delete them
/// too, recursively. Return true if a change was made.
bool llvm::RecursivelyDeleteDeadPHINode(PHINode *PN,
- const TargetLibraryInfo *TLI) {
+ const TargetLibraryInfo *TLI,
+ llvm::MemorySSAUpdater *MSSAU) {
SmallPtrSet<Instruction*, 4> Visited;
for (Instruction *I = PN; areAllUsesEqual(I) && !I->mayHaveSideEffects();
I = cast<Instruction>(*I->user_begin())) {
if (I->use_empty())
- return RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+ return RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
// If we find an instruction more than once, we're on a cycle that
// won't prove fruitful.
if (!Visited.insert(I).second) {
// Break the cycle and delete the instruction and its operands.
I->replaceAllUsesWith(UndefValue::get(I->getType()));
- (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI);
+ (void)RecursivelyDeleteTriviallyDeadInstructions(I, TLI, MSSAU);
return true;
}
}
@@ -1132,9 +1168,8 @@ bool llvm::EliminateDuplicatePHINodes(BasicBlock *BB) {
/// often possible though. If alignment is important, a more reliable approach
/// is to simply align all global variables and allocation instructions to
/// their preferred alignment from the beginning.
-static unsigned enforceKnownAlignment(Value *V, unsigned Alignment,
- unsigned PrefAlign,
- const DataLayout &DL) {
+static Align enforceKnownAlignment(Value *V, Align Alignment, Align PrefAlign,
+ const DataLayout &DL) {
assert(PrefAlign > Alignment);
V = V->stripPointerCasts();
@@ -1146,21 +1181,21 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Alignment,
// stripPointerCasts recurses through infinite layers of bitcasts,
// while computeKnownBits is not allowed to traverse more than 6
// levels.
- Alignment = std::max(AI->getAlignment(), Alignment);
+ Alignment = std::max(AI->getAlign(), Alignment);
if (PrefAlign <= Alignment)
return Alignment;
// If the preferred alignment is greater than the natural stack alignment
// then don't round up. This avoids dynamic stack realignment.
- if (DL.exceedsNaturalStackAlignment(Align(PrefAlign)))
+ if (DL.exceedsNaturalStackAlignment(PrefAlign))
return Alignment;
- AI->setAlignment(MaybeAlign(PrefAlign));
+ AI->setAlignment(PrefAlign);
return PrefAlign;
}
if (auto *GO = dyn_cast<GlobalObject>(V)) {
// TODO: as above, this shouldn't be necessary.
- Alignment = std::max(GO->getAlignment(), Alignment);
+ Alignment = max(GO->getAlign(), Alignment);
if (PrefAlign <= Alignment)
return Alignment;
@@ -1171,18 +1206,18 @@ static unsigned enforceKnownAlignment(Value *V, unsigned Alignment,
if (!GO->canIncreaseAlignment())
return Alignment;
- GO->setAlignment(MaybeAlign(PrefAlign));
+ GO->setAlignment(PrefAlign);
return PrefAlign;
}
return Alignment;
}
-unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
- const DataLayout &DL,
- const Instruction *CxtI,
- AssumptionCache *AC,
- const DominatorTree *DT) {
+Align llvm::getOrEnforceKnownAlignment(Value *V, MaybeAlign PrefAlign,
+ const DataLayout &DL,
+ const Instruction *CxtI,
+ AssumptionCache *AC,
+ const DominatorTree *DT) {
assert(V->getType()->isPointerTy() &&
"getOrEnforceKnownAlignment expects a pointer!");
@@ -1191,42 +1226,22 @@ unsigned llvm::getOrEnforceKnownAlignment(Value *V, unsigned PrefAlign,
// Avoid trouble with ridiculously large TrailZ values, such as
// those computed from a null pointer.
- TrailZ = std::min(TrailZ, unsigned(sizeof(unsigned) * CHAR_BIT - 1));
-
- unsigned Align = 1u << std::min(Known.getBitWidth() - 1, TrailZ);
+ // LLVM doesn't support alignments larger than (1 << MaxAlignmentExponent).
+ TrailZ = std::min(TrailZ, +Value::MaxAlignmentExponent);
- // LLVM doesn't support alignments larger than this currently.
- Align = std::min(Align, +Value::MaximumAlignment);
+ Align Alignment = Align(1ull << std::min(Known.getBitWidth() - 1, TrailZ));
- if (PrefAlign > Align)
- Align = enforceKnownAlignment(V, Align, PrefAlign, DL);
+ if (PrefAlign && *PrefAlign > Alignment)
+ Alignment = enforceKnownAlignment(V, Alignment, *PrefAlign, DL);
// We don't need to make any adjustment.
- return Align;
+ return Alignment;
}
///===---------------------------------------------------------------------===//
/// Dbg Intrinsic utilities
///
-/// See if there is a dbg.value intrinsic for DIVar before I.
-static bool LdStHasDebugValue(DILocalVariable *DIVar, DIExpression *DIExpr,
- Instruction *I) {
- // Since we can't guarantee that the original dbg.declare instrinsic
- // is removed by LowerDbgDeclare(), we need to make sure that we are
- // not inserting the same dbg.value intrinsic over and over.
- BasicBlock::InstListType::iterator PrevI(I);
- if (PrevI != I->getParent()->getInstList().begin()) {
- --PrevI;
- if (DbgValueInst *DVI = dyn_cast<DbgValueInst>(PrevI))
- if (DVI->getValue() == I->getOperand(0) &&
- DVI->getVariable() == DIVar &&
- DVI->getExpression() == DIExpr)
- return true;
- }
- return false;
-}
-
/// See if there is a dbg.value intrinsic for DIVar for the PHI node.
static bool PhiHasDebugValue(DILocalVariable *DIVar,
DIExpression *DIExpr,
@@ -1303,13 +1318,11 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
// know which part) we insert an dbg.value instrinsic to indicate that we
// know nothing about the variable's content.
DV = UndefValue::get(DV->getType());
- if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
return;
}
- if (!LdStHasDebugValue(DIVar, DIExpr, SI))
- Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
+ Builder.insertDbgValueIntrinsic(DV, DIVar, DIExpr, NewLoc, SI);
}
/// Inserts a llvm.dbg.value intrinsic before a load of an alloca'd value
@@ -1320,9 +1333,6 @@ void llvm::ConvertDebugDeclareToDebugValue(DbgVariableIntrinsic *DII,
auto *DIExpr = DII->getExpression();
assert(DIVar && "Missing variable");
- if (LdStHasDebugValue(DIVar, DIExpr, LI))
- return;
-
if (!valueCoversEntireFragment(LI->getType(), DII)) {
// FIXME: If only referring to a part of the variable described by the
// dbg.declare, then we want to insert a dbg.value for the corresponding
@@ -1389,6 +1399,7 @@ static bool isStructure(AllocaInst *AI) {
/// LowerDbgDeclare - Lowers llvm.dbg.declare intrinsics into appropriate set
/// of llvm.dbg.value intrinsics.
bool llvm::LowerDbgDeclare(Function &F) {
+ bool Changed = false;
DIBuilder DIB(*F.getParent(), /*AllowUnresolved*/ false);
SmallVector<DbgDeclareInst *, 4> Dbgs;
for (auto &FI : F)
@@ -1397,7 +1408,7 @@ bool llvm::LowerDbgDeclare(Function &F) {
Dbgs.push_back(DDI);
if (Dbgs.empty())
- return false;
+ return Changed;
for (auto &I : Dbgs) {
DbgDeclareInst *DDI = I;
@@ -1450,8 +1461,14 @@ bool llvm::LowerDbgDeclare(Function &F) {
}
}
DDI->eraseFromParent();
+ Changed = true;
}
- return true;
+
+ if (Changed)
+ for (BasicBlock &BB : F)
+ RemoveRedundantDbgInstrs(&BB);
+
+ return Changed;
}
/// Propagate dbg.value intrinsics through the newly inserted PHIs.
@@ -1521,6 +1538,14 @@ TinyPtrVector<DbgVariableIntrinsic *> llvm::FindDbgAddrUses(Value *V) {
return Declares;
}
+TinyPtrVector<DbgDeclareInst *> llvm::FindDbgDeclareUses(Value *V) {
+ TinyPtrVector<DbgDeclareInst *> DDIs;
+ for (DbgVariableIntrinsic *DVI : FindDbgAddrUses(V))
+ if (auto *DDI = dyn_cast<DbgDeclareInst>(DVI))
+ DDIs.push_back(DDI);
+ return DDIs;
+}
+
void llvm::findDbgValues(SmallVectorImpl<DbgValueInst *> &DbgValues, Value *V) {
// This function is hot. Check whether the value has any metadata to avoid a
// DenseMap lookup.
@@ -1547,8 +1572,8 @@ void llvm::findDbgUsers(SmallVectorImpl<DbgVariableIntrinsic *> &DbgUsers,
}
bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
- Instruction *InsertBefore, DIBuilder &Builder,
- uint8_t DIExprFlags, int Offset) {
+ DIBuilder &Builder, uint8_t DIExprFlags,
+ int Offset) {
auto DbgAddrs = FindDbgAddrUses(Address);
for (DbgVariableIntrinsic *DII : DbgAddrs) {
DebugLoc Loc = DII->getDebugLoc();
@@ -1556,23 +1581,14 @@ bool llvm::replaceDbgDeclare(Value *Address, Value *NewAddress,
auto *DIExpr = DII->getExpression();
assert(DIVar && "Missing variable");
DIExpr = DIExpression::prepend(DIExpr, DIExprFlags, Offset);
- // Insert llvm.dbg.declare immediately before InsertBefore, and remove old
+ // Insert llvm.dbg.declare immediately before DII, and remove old
// llvm.dbg.declare.
- Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, InsertBefore);
- if (DII == InsertBefore)
- InsertBefore = InsertBefore->getNextNode();
+ Builder.insertDeclare(NewAddress, DIVar, DIExpr, Loc, DII);
DII->eraseFromParent();
}
return !DbgAddrs.empty();
}
-bool llvm::replaceDbgDeclareForAlloca(AllocaInst *AI, Value *NewAllocaAddress,
- DIBuilder &Builder, uint8_t DIExprFlags,
- int Offset) {
- return replaceDbgDeclare(AI, NewAllocaAddress, AI->getNextNode(), Builder,
- DIExprFlags, Offset);
-}
-
static void replaceOneDbgValueForAlloca(DbgValueInst *DVI, Value *NewAddress,
DIBuilder &Builder, int Offset) {
DebugLoc Loc = DVI->getDebugLoc();
@@ -1612,23 +1628,18 @@ static MetadataAsValue *wrapValueInMetadata(LLVMContext &C, Value *V) {
return MetadataAsValue::get(C, ValueAsMetadata::get(V));
}
-bool llvm::salvageDebugInfo(Instruction &I) {
+/// Salvage debug information for \p I where possible; if nothing can be
+/// salvaged, mark the value undef in its remaining debug users.
+void llvm::salvageDebugInfo(Instruction &I) {
SmallVector<DbgVariableIntrinsic *, 1> DbgUsers;
findDbgUsers(DbgUsers, &I);
- if (DbgUsers.empty())
- return false;
-
- return salvageDebugInfoForDbgValues(I, DbgUsers);
-}
-
-void llvm::salvageDebugInfoOrMarkUndef(Instruction &I) {
- if (!salvageDebugInfo(I))
- replaceDbgUsesWithUndef(&I);
+ salvageDebugInfoForDbgValues(I, DbgUsers);
}
-bool llvm::salvageDebugInfoForDbgValues(
+void llvm::salvageDebugInfoForDbgValues(
Instruction &I, ArrayRef<DbgVariableIntrinsic *> DbgUsers) {
auto &Ctx = I.getContext();
+ bool Salvaged = false;
auto wrapMD = [&](Value *V) { return wrapValueInMetadata(Ctx, V); };
for (auto *DII : DbgUsers) {
@@ -1643,14 +1654,22 @@ bool llvm::salvageDebugInfoForDbgValues(
// salvageDebugInfoImpl should fail on examining the first element of
// DbgUsers, or none of them.
if (!DIExpr)
- return false;
+ break;
DII->setOperand(0, wrapMD(I.getOperand(0)));
DII->setOperand(2, MetadataAsValue::get(Ctx, DIExpr));
LLVM_DEBUG(dbgs() << "SALVAGE: " << *DII << '\n');
+ Salvaged = true;
}
- return true;
+ if (Salvaged)
+ return;
+
+ for (auto *DII : DbgUsers) {
+ Value *Undef = UndefValue::get(I.getType());
+ DII->setOperand(0, MetadataAsValue::get(DII->getContext(),
+ ValueAsMetadata::get(Undef)));
+ }
}
DIExpression *llvm::salvageDebugInfoImpl(Instruction &I,
@@ -1682,13 +1701,14 @@ DIExpression *llvm::salvageDebugInfoImpl(Instruction &I,
};
if (auto *CI = dyn_cast<CastInst>(&I)) {
- // No-op casts and zexts are irrelevant for debug info.
- if (CI->isNoopCast(DL) || isa<ZExtInst>(&I))
+ // No-op casts are irrelevant for debug info.
+ if (CI->isNoopCast(DL))
return SrcDIExpr;
Type *Type = CI->getType();
- // Casts other than Trunc or SExt to scalar types cannot be salvaged.
- if (Type->isVectorTy() || (!isa<TruncInst>(&I) && !isa<SExtInst>(&I)))
+ // Casts other than Trunc, SExt, or ZExt to scalar types cannot be salvaged.
+ if (Type->isVectorTy() ||
+ !(isa<TruncInst>(&I) || isa<SExtInst>(&I) || isa<ZExtInst>(&I)))
return nullptr;
Value *FromValue = CI->getOperand(0);
@@ -1805,7 +1825,7 @@ static bool rewriteDebugUsers(
if (!UndefOrSalvage.empty()) {
// Try to salvage the remaining debug users.
- salvageDebugInfoOrMarkUndef(From);
+ salvageDebugInfo(From);
Changed = true;
}
@@ -1960,11 +1980,23 @@ CallInst *llvm::createCallMatchingInvoke(InvokeInst *II) {
SmallVector<OperandBundleDef, 1> OpBundles;
II->getOperandBundlesAsDefs(OpBundles);
CallInst *NewCall = CallInst::Create(II->getFunctionType(),
- II->getCalledValue(), Args, OpBundles);
+ II->getCalledOperand(), Args, OpBundles);
NewCall->setCallingConv(II->getCallingConv());
NewCall->setAttributes(II->getAttributes());
NewCall->setDebugLoc(II->getDebugLoc());
NewCall->copyMetadata(*II);
+
+ // If the invoke had profile metadata, try converting them for CallInst.
+ uint64_t TotalWeight;
+ if (NewCall->extractProfTotalWeight(TotalWeight)) {
+ // Set the total weight if it fits into i32, otherwise reset.
+ MDBuilder MDB(NewCall->getContext());
+ auto NewWeights = uint32_t(TotalWeight) != TotalWeight
+ ? nullptr
+ : MDB.createBranchWeights({uint32_t(TotalWeight)});
+ NewCall->setMetadata(LLVMContext::MD_prof, NewWeights);
+ }
+
return NewCall;
}
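
The new block in createCallMatchingInvoke above converts an invoke's !prof metadata for the replacement call: the normal/unwind branch weights do not carry over, so only the total weight is kept, and it is dropped when it does not fit in the 32-bit weights that createBranchWeights accepts. Restated as a standalone sketch (copyTotalWeight is a hypothetical helper, not a function from the patch):

// Illustrative restatement of the profile-weight handling above.
#include "llvm/IR/Instructions.h"
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/MDBuilder.h"

using namespace llvm;

static void copyTotalWeight(CallInst *NewCall) {
  uint64_t TotalWeight;
  if (!NewCall->extractProfTotalWeight(TotalWeight))
    return; // no !prof metadata to convert
  MDBuilder MDB(NewCall->getContext());
  // Keep the total weight only if it fits in 32 bits; otherwise clear !prof.
  MDNode *NewWeights = uint32_t(TotalWeight) != TotalWeight
                           ? nullptr
                           : MDB.createBranchWeights({uint32_t(TotalWeight)});
  NewCall->setMetadata(LLVMContext::MD_prof, NewWeights);
}
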
@@ -2011,7 +2043,7 @@ BasicBlock *llvm::changeToInvokeAndSplitBasicBlock(CallInst *CI,
// as of this time.
InvokeInst *II =
- InvokeInst::Create(CI->getFunctionType(), CI->getCalledValue(), Split,
+ InvokeInst::Create(CI->getFunctionType(), CI->getCalledOperand(), Split,
UnwindEdge, InvokeArgs, OpBundles, CI->getName(), BB);
II->setDebugLoc(CI->getDebugLoc());
II->setCallingConv(CI->getCallingConv());
@@ -2042,7 +2074,7 @@ static bool markAliveBlocks(Function &F,
// canonicalizes unreachable insts into stores to null or undef.
for (Instruction &I : *BB) {
if (auto *CI = dyn_cast<CallInst>(&I)) {
- Value *Callee = CI->getCalledValue();
+ Value *Callee = CI->getCalledOperand();
// Handle intrinsic calls.
if (Function *F = dyn_cast<Function>(Callee)) {
auto IntrinsicID = F->getIntrinsicID();
@@ -2117,7 +2149,7 @@ static bool markAliveBlocks(Function &F,
Instruction *Terminator = BB->getTerminator();
if (auto *II = dyn_cast<InvokeInst>(Terminator)) {
// Turn invokes that call 'nounwind' functions into ordinary calls.
- Value *Callee = II->getCalledValue();
+ Value *Callee = II->getCalledOperand();
if ((isa<ConstantPointerNull>(Callee) &&
!NullPointerIsDefined(BB->getParent())) ||
isa<UndefValue>(Callee)) {
@@ -2243,7 +2275,7 @@ bool llvm::removeUnreachableBlocks(Function &F, DomTreeUpdater *DTU,
SmallSetVector<BasicBlock *, 8> DeadBlockSet;
for (BasicBlock &BB : F) {
// Skip reachable basic blocks
- if (Reachable.find(&BB) != Reachable.end())
+ if (Reachable.count(&BB))
continue;
DeadBlockSet.insert(&BB);
}
@@ -2548,7 +2580,7 @@ bool llvm::callsGCLeafFunction(const CallBase *Call,
// marked as 'gc-leaf-function.' All available Libcalls are
// GC-leaf.
LibFunc LF;
- if (TLI.getLibFunc(ImmutableCallSite(Call), LF)) {
+ if (TLI.getLibFunc(*Call, LF)) {
return TLI.has(LF);
}
@@ -2928,21 +2960,40 @@ bool llvm::canReplaceOperandWithVariable(const Instruction *I, unsigned OpIdx) {
default:
return true;
case Instruction::Call:
- case Instruction::Invoke:
+ case Instruction::Invoke: {
+ const auto &CB = cast<CallBase>(*I);
+
// Can't handle inline asm. Skip it.
- if (isa<InlineAsm>(ImmutableCallSite(I).getCalledValue()))
- return false;
- // Many arithmetic intrinsics have no issue taking a
- // variable, however it's hard to distingish these from
- // specials such as @llvm.frameaddress that require a constant.
- if (isa<IntrinsicInst>(I))
+ if (CB.isInlineAsm())
return false;
// Constant bundle operands may need to retain their constant-ness for
// correctness.
- if (ImmutableCallSite(I).isBundleOperand(OpIdx))
+ if (CB.isBundleOperand(OpIdx))
return false;
- return true;
+
+ if (OpIdx < CB.getNumArgOperands()) {
+ // Some variadic intrinsics require constants in the variadic arguments,
+ // which currently aren't markable as immarg.
+ if (isa<IntrinsicInst>(CB) &&
+ OpIdx >= CB.getFunctionType()->getNumParams()) {
+ // This is known to be OK for stackmap.
+ return CB.getIntrinsicID() == Intrinsic::experimental_stackmap;
+ }
+
+ // gcroot is a special case, since it requires a constant argument which
+ // isn't also required to be a simple ConstantInt.
+ if (CB.getIntrinsicID() == Intrinsic::gcroot)
+ return false;
+
+ // Some intrinsic operands are required to be immediates.
+ return !CB.paramHasAttr(OpIdx, Attribute::ImmArg);
+ }
+
+ // It is never allowed to replace the call argument to an intrinsic, but it
+ // may be possible for a call.
+ return !isa<IntrinsicInst>(CB);
+ }
case Instruction::ShuffleVector:
// Shufflevector masks are constant.
return OpIdx != 2;
@@ -3006,3 +3057,37 @@ AllocaInst *llvm::findAllocaForValue(Value *V,
AllocaForValue[V] = Res;
return Res;
}
+
+Value *llvm::invertCondition(Value *Condition) {
+ // First: Check if it's a constant
+ if (Constant *C = dyn_cast<Constant>(Condition))
+ return ConstantExpr::getNot(C);
+
+ // Second: If the condition is already inverted, return the original value
+ Value *NotCondition;
+ if (match(Condition, m_Not(m_Value(NotCondition))))
+ return NotCondition;
+
+ BasicBlock *Parent = nullptr;
+ Instruction *Inst = dyn_cast<Instruction>(Condition);
+ if (Inst)
+ Parent = Inst->getParent();
+ else if (Argument *Arg = dyn_cast<Argument>(Condition))
+ Parent = &Arg->getParent()->getEntryBlock();
+ assert(Parent && "Unsupported condition to invert");
+
+ // Third: Check all the users for an invert
+ for (User *U : Condition->users())
+ if (Instruction *I = dyn_cast<Instruction>(U))
+ if (I->getParent() == Parent && match(I, m_Not(m_Specific(Condition))))
+ return I;
+
+ // Last option: Create a new instruction
+ auto *Inverted =
+ BinaryOperator::CreateNot(Condition, Condition->getName() + ".inv");
+ if (Inst && !isa<PHINode>(Inst))
+ Inverted->insertAfter(Inst);
+ else
+ Inverted->insertBefore(&*Parent->getFirstInsertionPt());
+ return Inverted;
+}
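
invertCondition, added above, centralizes the "give me the negation of this i1" pattern: it folds constants, reuses an existing not where one is visible, and only then materializes a new xor-with-true named "<cond>.inv". A small hedged sketch of a caller, assuming the matching declaration lives in Transforms/Utils/Local.h; flipBranch is a hypothetical helper, not part of the patch:

// Hypothetical caller of the llvm::invertCondition utility defined above.
#include "llvm/IR/Instructions.h"
#include "llvm/Transforms/Utils/Local.h"
#include <cassert>

using namespace llvm;

static void flipBranch(BranchInst *BI) {
  assert(BI->isConditional() && "expected a conditional branch");
  // Reuses an existing negation where available; otherwise inserts a single
  // not (xor with true) near the condition.
  BI->setCondition(invertCondition(BI->getCondition()));
  BI->swapSuccessors();
}
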
diff --git a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
index c065e0269c64..8804bba975b6 100644
--- a/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopRotationUtils.cpp
@@ -46,6 +46,11 @@ using namespace llvm;
STATISTIC(NumRotated, "Number of loops rotated");
+static cl::opt<bool>
+ MultiRotate("loop-rotate-multi", cl::init(false), cl::Hidden,
+ cl::desc("Allow loop rotation multiple times in order to reach "
+ "a better latch exit"));
+
namespace {
/// A simple loop rotation transformation.
class LoopRotate {
@@ -177,14 +182,16 @@ static void RewriteUsesOfClonedInstructions(BasicBlock *OrigHeader,
}
}
-// Look for a phi which is only used outside the loop (via a LCSSA phi)
-// in the exit from the header. This means that rotating the loop can
-// remove the phi.
-static bool shouldRotateLoopExitingLatch(Loop *L) {
+// Assuming both header and latch are exiting, look for a phi which is only
+// used outside the loop (via a LCSSA phi) in the exit from the header.
+// This means that rotating the loop can remove the phi.
+static bool profitableToRotateLoopExitingLatch(Loop *L) {
BasicBlock *Header = L->getHeader();
- BasicBlock *HeaderExit = Header->getTerminator()->getSuccessor(0);
+ BranchInst *BI = dyn_cast<BranchInst>(Header->getTerminator());
+ assert(BI && BI->isConditional() && "need header with conditional exit");
+ BasicBlock *HeaderExit = BI->getSuccessor(0);
if (L->contains(HeaderExit))
- HeaderExit = Header->getTerminator()->getSuccessor(1);
+ HeaderExit = BI->getSuccessor(1);
for (auto &Phi : Header->phis()) {
// Look for uses of this phi in the loop/via exits other than the header.
@@ -194,7 +201,50 @@ static bool shouldRotateLoopExitingLatch(Loop *L) {
continue;
return true;
}
+ return false;
+}
+
+// Check that latch exit is deoptimizing (which means - very unlikely to happen)
+// and there is another exit from the loop which is non-deoptimizing.
+// If we rotate latch to that exit our loop has a better chance of being fully
+// canonical.
+//
+// It can give false positives in some rare cases.
+static bool canRotateDeoptimizingLatchExit(Loop *L) {
+ BasicBlock *Latch = L->getLoopLatch();
+ assert(Latch && "need latch");
+ BranchInst *BI = dyn_cast<BranchInst>(Latch->getTerminator());
+ // Need normal exiting latch.
+ if (!BI || !BI->isConditional())
+ return false;
+
+ BasicBlock *Exit = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ Exit = BI->getSuccessor(0);
+ // Latch exit is non-deoptimizing, no need to rotate.
+ if (!Exit->getPostdominatingDeoptimizeCall())
+ return false;
+
+ SmallVector<BasicBlock *, 4> Exits;
+ L->getUniqueExitBlocks(Exits);
+ if (!Exits.empty()) {
+ // There is at least one non-deoptimizing exit.
+ //
+ // Note, that BasicBlock::getPostdominatingDeoptimizeCall is not exact,
+ // as it can conservatively return false for deoptimizing exits with
+ // complex enough control flow down to deoptimize call.
+ //
+ // That means here we can report success for a case where
+ // all exits are deoptimizing but one of them has complex enough
+ // control flow (e.g. with loops).
+ //
+ // That should be a very rare case and false positives for this function
+ // have compile-time effect only.
+ return any_of(Exits, [](const BasicBlock *BB) {
+ return !BB->getPostdominatingDeoptimizeCall();
+ });
+ }
return false;
}
@@ -208,319 +258,342 @@ static bool shouldRotateLoopExitingLatch(Loop *L) {
/// rotation. LoopRotate should be repeatable and converge to a canonical
/// form. This property is satisfied because simplifying the loop latch can only
/// happen once across multiple invocations of the LoopRotate pass.
+///
+/// If -loop-rotate-multi is enabled we can do multiple rotations in one go
+/// so to reach a suitable (non-deoptimizing) exit.
bool LoopRotate::rotateLoop(Loop *L, bool SimplifiedLatch) {
// If the loop has only one block then there is not much to rotate.
if (L->getBlocks().size() == 1)
return false;
- BasicBlock *OrigHeader = L->getHeader();
- BasicBlock *OrigLatch = L->getLoopLatch();
-
- BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
- if (!BI || BI->isUnconditional())
- return false;
-
- // If the loop header is not one of the loop exiting blocks then
- // either this loop is already rotated or it is not
- // suitable for loop rotation transformations.
- if (!L->isLoopExiting(OrigHeader))
- return false;
-
- // If the loop latch already contains a branch that leaves the loop then the
- // loop is already rotated.
- if (!OrigLatch)
- return false;
-
- // Rotate if either the loop latch does *not* exit the loop, or if the loop
- // latch was just simplified. Or if we think it will be profitable.
- if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
- !shouldRotateLoopExitingLatch(L))
- return false;
-
- // Check size of original header and reject loop if it is very big or we can't
- // duplicate blocks inside it.
- {
- SmallPtrSet<const Value *, 32> EphValues;
- CodeMetrics::collectEphemeralValues(L, AC, EphValues);
-
- CodeMetrics Metrics;
- Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
- if (Metrics.notDuplicatable) {
- LLVM_DEBUG(
- dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
- << " instructions: ";
- L->dump());
- return false;
- }
- if (Metrics.convergent) {
- LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
- "instructions: ";
- L->dump());
- return false;
+ bool Rotated = false;
+ do {
+ BasicBlock *OrigHeader = L->getHeader();
+ BasicBlock *OrigLatch = L->getLoopLatch();
+
+ BranchInst *BI = dyn_cast<BranchInst>(OrigHeader->getTerminator());
+ if (!BI || BI->isUnconditional())
+ return Rotated;
+
+ // If the loop header is not one of the loop exiting blocks then
+ // either this loop is already rotated or it is not
+ // suitable for loop rotation transformations.
+ if (!L->isLoopExiting(OrigHeader))
+ return Rotated;
+
+ // If the loop latch already contains a branch that leaves the loop then the
+ // loop is already rotated.
+ if (!OrigLatch)
+ return Rotated;
+
+ // Rotate if either the loop latch does *not* exit the loop, or if the loop
+ // latch was just simplified. Or if we think it will be profitable.
+ if (L->isLoopExiting(OrigLatch) && !SimplifiedLatch && IsUtilMode == false &&
+ !profitableToRotateLoopExitingLatch(L) &&
+ !canRotateDeoptimizingLatchExit(L))
+ return Rotated;
+
+ // Check size of original header and reject loop if it is very big or we can't
+ // duplicate blocks inside it.
+ {
+ SmallPtrSet<const Value *, 32> EphValues;
+ CodeMetrics::collectEphemeralValues(L, AC, EphValues);
+
+ CodeMetrics Metrics;
+ Metrics.analyzeBasicBlock(OrigHeader, *TTI, EphValues);
+ if (Metrics.notDuplicatable) {
+ LLVM_DEBUG(
+ dbgs() << "LoopRotation: NOT rotating - contains non-duplicatable"
+ << " instructions: ";
+ L->dump());
+ return Rotated;
+ }
+ if (Metrics.convergent) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains convergent "
+ "instructions: ";
+ L->dump());
+ return Rotated;
+ }
+ if (Metrics.NumInsts > MaxHeaderSize) {
+ LLVM_DEBUG(dbgs() << "LoopRotation: NOT rotating - contains "
+ << Metrics.NumInsts
+ << " instructions, which is more than the threshold ("
+ << MaxHeaderSize << " instructions): ";
+ L->dump());
+ return Rotated;
+ }
}
- if (Metrics.NumInsts > MaxHeaderSize)
- return false;
- }
- // Now, this loop is suitable for rotation.
- BasicBlock *OrigPreheader = L->getLoopPreheader();
+ // Now, this loop is suitable for rotation.
+ BasicBlock *OrigPreheader = L->getLoopPreheader();
+
+ // If the loop could not be converted to canonical form, it must have an
+ // indirectbr in it, just give up.
+ if (!OrigPreheader || !L->hasDedicatedExits())
+ return Rotated;
+
+ // Anything ScalarEvolution may know about this loop or the PHI nodes
+ // in its header will soon be invalidated. We should also invalidate
+ // all outer loops because insertion and deletion of blocks that happens
+ // during the rotation may violate invariants related to backedge taken
+ // infos in them.
+ if (SE)
+ SE->forgetTopmostLoop(L);
+
+ LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
+
+ // Find new Loop header. NewHeader is a Header's one and only successor
+ // that is inside loop. Header's other successor is outside the
+ // loop. Otherwise loop is not suitable for rotation.
+ BasicBlock *Exit = BI->getSuccessor(0);
+ BasicBlock *NewHeader = BI->getSuccessor(1);
+ if (L->contains(Exit))
+ std::swap(Exit, NewHeader);
+ assert(NewHeader && "Unable to determine new loop header");
+ assert(L->contains(NewHeader) && !L->contains(Exit) &&
+ "Unable to determine loop header and exit blocks");
+
+ // This code assumes that the new header has exactly one predecessor.
+ // Remove any single-entry PHI nodes in it.
+ assert(NewHeader->getSinglePredecessor() &&
+ "New header doesn't have one pred!");
+ FoldSingleEntryPHINodes(NewHeader);
+
+ // Begin by walking OrigHeader and populating ValueMap with an entry for
+ // each Instruction.
+ BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
+ ValueToValueMapTy ValueMap, ValueMapMSSA;
+
+ // For PHI nodes, the value available in OldPreHeader is just the
+ // incoming value from OldPreHeader.
+ for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
+ InsertNewValueIntoMap(ValueMap, PN,
+ PN->getIncomingValueForBlock(OrigPreheader));
+
+ // For the rest of the instructions, either hoist to the OrigPreheader if
+ // possible or create a clone in the OldPreHeader if not.
+ Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
+
+ // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
+ using DbgIntrinsicHash =
+ std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
+ auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
+ return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
+ };
+ SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
+ for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
+ I != E; ++I) {
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
+ DbgIntrinsics.insert(makeHash(DII));
+ else
+ break;
+ }
- // If the loop could not be converted to canonical form, it must have an
- // indirectbr in it, just give up.
- if (!OrigPreheader || !L->hasDedicatedExits())
- return false;
+ while (I != E) {
+ Instruction *Inst = &*I++;
+
+ // If the instruction's operands are invariant and it doesn't read or write
+ // memory, then it is safe to hoist. Doing this doesn't change the order of
+ // execution in the preheader, but does prevent the instruction from
+ // executing in each iteration of the loop. This means it is safe to hoist
+ // something that might trap, but isn't safe to hoist something that reads
+ // memory (without proving that the loop doesn't write).
+ if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
+ !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
+ !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
+ Inst->moveBefore(LoopEntryBranch);
+ continue;
+ }
- // Anything ScalarEvolution may know about this loop or the PHI nodes
- // in its header will soon be invalidated. We should also invalidate
- // all outer loops because insertion and deletion of blocks that happens
- // during the rotation may violate invariants related to backedge taken
- // infos in them.
- if (SE)
- SE->forgetTopmostLoop(L);
+ // Otherwise, create a duplicate of the instruction.
+ Instruction *C = Inst->clone();
- LLVM_DEBUG(dbgs() << "LoopRotation: rotating "; L->dump());
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ // Eagerly remap the operands of the instruction.
+ RemapInstruction(C, ValueMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
- // Find new Loop header. NewHeader is a Header's one and only successor
- // that is inside loop. Header's other successor is outside the
- // loop. Otherwise loop is not suitable for rotation.
- BasicBlock *Exit = BI->getSuccessor(0);
- BasicBlock *NewHeader = BI->getSuccessor(1);
- if (L->contains(Exit))
- std::swap(Exit, NewHeader);
- assert(NewHeader && "Unable to determine new loop header");
- assert(L->contains(NewHeader) && !L->contains(Exit) &&
- "Unable to determine loop header and exit blocks");
-
- // This code assumes that the new header has exactly one predecessor.
- // Remove any single-entry PHI nodes in it.
- assert(NewHeader->getSinglePredecessor() &&
- "New header doesn't have one pred!");
- FoldSingleEntryPHINodes(NewHeader);
-
- // Begin by walking OrigHeader and populating ValueMap with an entry for
- // each Instruction.
- BasicBlock::iterator I = OrigHeader->begin(), E = OrigHeader->end();
- ValueToValueMapTy ValueMap, ValueMapMSSA;
-
- // For PHI nodes, the value available in OldPreHeader is just the
- // incoming value from OldPreHeader.
- for (; PHINode *PN = dyn_cast<PHINode>(I); ++I)
- InsertNewValueIntoMap(ValueMap, PN,
- PN->getIncomingValueForBlock(OrigPreheader));
-
- // For the rest of the instructions, either hoist to the OrigPreheader if
- // possible or create a clone in the OldPreHeader if not.
- Instruction *LoopEntryBranch = OrigPreheader->getTerminator();
-
- // Record all debug intrinsics preceding LoopEntryBranch to avoid duplication.
- using DbgIntrinsicHash =
- std::pair<std::pair<Value *, DILocalVariable *>, DIExpression *>;
- auto makeHash = [](DbgVariableIntrinsic *D) -> DbgIntrinsicHash {
- return {{D->getVariableLocation(), D->getVariable()}, D->getExpression()};
- };
- SmallDenseSet<DbgIntrinsicHash, 8> DbgIntrinsics;
- for (auto I = std::next(OrigPreheader->rbegin()), E = OrigPreheader->rend();
- I != E; ++I) {
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(&*I))
- DbgIntrinsics.insert(makeHash(DII));
- else
- break;
- }
+ // Avoid inserting the same intrinsic twice.
+ if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
+ if (DbgIntrinsics.count(makeHash(DII))) {
+ C->deleteValue();
+ continue;
+ }
- while (I != E) {
- Instruction *Inst = &*I++;
-
- // If the instruction's operands are invariant and it doesn't read or write
- // memory, then it is safe to hoist. Doing this doesn't change the order of
- // execution in the preheader, but does prevent the instruction from
- // executing in each iteration of the loop. This means it is safe to hoist
- // something that might trap, but isn't safe to hoist something that reads
- // memory (without proving that the loop doesn't write).
- if (L->hasLoopInvariantOperands(Inst) && !Inst->mayReadFromMemory() &&
- !Inst->mayWriteToMemory() && !Inst->isTerminator() &&
- !isa<DbgInfoIntrinsic>(Inst) && !isa<AllocaInst>(Inst)) {
- Inst->moveBefore(LoopEntryBranch);
- continue;
+ // With the operands remapped, see if the instruction constant folds or is
+ // otherwise simplifyable. This commonly occurs because the entry from PHI
+ // nodes allows icmps and other instructions to fold.
+ Value *V = SimplifyInstruction(C, SQ);
+ if (V && LI->replacementPreservesLCSSAForm(C, V)) {
+ // If so, then delete the temporary instruction and stick the folded value
+ // in the map.
+ InsertNewValueIntoMap(ValueMap, Inst, V);
+ if (!C->mayHaveSideEffects()) {
+ C->deleteValue();
+ C = nullptr;
+ }
+ } else {
+ InsertNewValueIntoMap(ValueMap, Inst, C);
+ }
+ if (C) {
+ // Otherwise, stick the new instruction into the new block!
+ C->setName(Inst->getName());
+ C->insertBefore(LoopEntryBranch);
+
+ if (auto *II = dyn_cast<IntrinsicInst>(C))
+ if (II->getIntrinsicID() == Intrinsic::assume)
+ AC->registerAssumption(II);
+ // MemorySSA cares whether the cloned instruction was inserted or not, and
+ // not whether it can be remapped to a simplified value.
+ if (MSSAU)
+ InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ }
}
- // Otherwise, create a duplicate of the instruction.
- Instruction *C = Inst->clone();
-
- // Eagerly remap the operands of the instruction.
- RemapInstruction(C, ValueMap,
- RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ // Along with all the other instructions, we just cloned OrigHeader's
+ // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
+ // successors by duplicating their incoming values for OrigHeader.
+ for (BasicBlock *SuccBB : successors(OrigHeader))
+ for (BasicBlock::iterator BI = SuccBB->begin();
+ PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
+ PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
+
+ // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
+ // OrigPreHeader's old terminator (the original branch into the loop), and
+ // remove the corresponding incoming values from the PHI nodes in OrigHeader.
+ LoopEntryBranch->eraseFromParent();
+
+ // Update MemorySSA before the rewrite call below changes the 1:1
+ // instruction:cloned_instruction_or_value mapping.
+ if (MSSAU) {
+ InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
+ MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
+ ValueMapMSSA);
+ }
- // Avoid inserting the same intrinsic twice.
- if (auto *DII = dyn_cast<DbgVariableIntrinsic>(C))
- if (DbgIntrinsics.count(makeHash(DII))) {
- C->deleteValue();
- continue;
+ SmallVector<PHINode*, 2> InsertedPHIs;
+ // If there were any uses of instructions in the duplicated block outside the
+ // loop, update them, inserting PHI nodes as required
+ RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
+ &InsertedPHIs);
+
+ // Attach dbg.value intrinsics to the new phis if that phi uses a value that
+ // previously had debug metadata attached. This keeps the debug info
+ // up-to-date in the loop body.
+ if (!InsertedPHIs.empty())
+ insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
+
+ // NewHeader is now the header of the loop.
+ L->moveToHeader(NewHeader);
+ assert(L->getHeader() == NewHeader && "Latch block is our new header");
+
+ // Inform DT about changes to the CFG.
+ if (DT) {
+ // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
+ // the DT about the removed edge to the OrigHeader (that got removed).
+ SmallVector<DominatorTree::UpdateType, 3> Updates;
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
+ Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
+ Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
+ DT->applyUpdates(Updates);
+
+ if (MSSAU) {
+ MSSAU->applyUpdates(Updates, *DT);
+ if (VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
}
+ }
- // With the operands remapped, see if the instruction constant folds or is
- // otherwise simplifyable. This commonly occurs because the entry from PHI
- // nodes allows icmps and other instructions to fold.
- Value *V = SimplifyInstruction(C, SQ);
- if (V && LI->replacementPreservesLCSSAForm(C, V)) {
- // If so, then delete the temporary instruction and stick the folded value
- // in the map.
- InsertNewValueIntoMap(ValueMap, Inst, V);
- if (!C->mayHaveSideEffects()) {
- C->deleteValue();
- C = nullptr;
+ // At this point, we've finished our major CFG changes. As part of cloning
+ // the loop into the preheader we've simplified instructions and the
+ // duplicated conditional branch may now be branching on a constant. If it is
+ // branching on a constant and if that constant means that we enter the loop,
+ // then we fold away the cond branch to an uncond branch. This simplifies the
+ // loop in cases important for nested loops, and it also means we don't have
+ // to split as many edges.
+ BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
+ assert(PHBI->isConditional() && "Should be clone of BI condbr!");
+ if (!isa<ConstantInt>(PHBI->getCondition()) ||
+ PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
+ NewHeader) {
+ // The conditional branch can't be folded, handle the general case.
+ // Split edges as necessary to preserve LoopSimplify form.
+
+ // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
+ // thus is not a preheader anymore.
+ // Split the edge to form a real preheader.
+ BasicBlock *NewPH = SplitCriticalEdge(
+ OrigPreheader, NewHeader,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ NewPH->setName(NewHeader->getName() + ".lr.ph");
+
+ // Preserve canonical loop form, which means that 'Exit' should have only
+ // one predecessor. Note that Exit could be an exit block for multiple
+ // nested loops, causing both of the edges to now be critical and need to
+ // be split.
+ SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
+ bool SplitLatchEdge = false;
+ for (BasicBlock *ExitPred : ExitPreds) {
+ // We only need to split loop exit edges.
+ Loop *PredLoop = LI->getLoopFor(ExitPred);
+ if (!PredLoop || PredLoop->contains(Exit) ||
+ ExitPred->getTerminator()->isIndirectTerminator())
+ continue;
+ SplitLatchEdge |= L->getLoopLatch() == ExitPred;
+ BasicBlock *ExitSplit = SplitCriticalEdge(
+ ExitPred, Exit,
+ CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
+ ExitSplit->moveBefore(Exit);
}
+ assert(SplitLatchEdge &&
+ "Despite splitting all preds, failed to split latch exit?");
} else {
- InsertNewValueIntoMap(ValueMap, Inst, C);
- }
- if (C) {
- // Otherwise, stick the new instruction into the new block!
- C->setName(Inst->getName());
- C->insertBefore(LoopEntryBranch);
-
- if (auto *II = dyn_cast<IntrinsicInst>(C))
- if (II->getIntrinsicID() == Intrinsic::assume)
- AC->registerAssumption(II);
- // MemorySSA cares whether the cloned instruction was inserted or not, and
- // not whether it can be remapped to a simplified value.
+ // We can fold the conditional branch in the preheader, this makes things
+ // simpler. The first step is to remove the extra edge to the Exit block.
+ Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
+ BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
+ NewBI->setDebugLoc(PHBI->getDebugLoc());
+ PHBI->eraseFromParent();
+
+ // With our CFG finalized, update DomTree if it is available.
+ if (DT) DT->deleteEdge(OrigPreheader, Exit);
+
+ // Update MSSA too, if available.
if (MSSAU)
- InsertNewValueIntoMap(ValueMapMSSA, Inst, C);
+ MSSAU->removeEdge(OrigPreheader, Exit);
}
- }
- // Along with all the other instructions, we just cloned OrigHeader's
- // terminator into OrigPreHeader. Fix up the PHI nodes in each of OrigHeader's
- // successors by duplicating their incoming values for OrigHeader.
- for (BasicBlock *SuccBB : successors(OrigHeader))
- for (BasicBlock::iterator BI = SuccBB->begin();
- PHINode *PN = dyn_cast<PHINode>(BI); ++BI)
- PN->addIncoming(PN->getIncomingValueForBlock(OrigHeader), OrigPreheader);
-
- // Now that OrigPreHeader has a clone of OrigHeader's terminator, remove
- // OrigPreHeader's old terminator (the original branch into the loop), and
- // remove the corresponding incoming values from the PHI nodes in OrigHeader.
- LoopEntryBranch->eraseFromParent();
-
- // Update MemorySSA before the rewrite call below changes the 1:1
- // instruction:cloned_instruction_or_value mapping.
- if (MSSAU) {
- InsertNewValueIntoMap(ValueMapMSSA, OrigHeader, OrigPreheader);
- MSSAU->updateForClonedBlockIntoPred(OrigHeader, OrigPreheader,
- ValueMapMSSA);
- }
+ assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
+ assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
- SmallVector<PHINode*, 2> InsertedPHIs;
- // If there were any uses of instructions in the duplicated block outside the
- // loop, update them, inserting PHI nodes as required
- RewriteUsesOfClonedInstructions(OrigHeader, OrigPreheader, ValueMap,
- &InsertedPHIs);
-
- // Attach dbg.value intrinsics to the new phis if that phi uses a value that
- // previously had debug metadata attached. This keeps the debug info
- // up-to-date in the loop body.
- if (!InsertedPHIs.empty())
- insertDebugValuesForPHIs(OrigHeader, InsertedPHIs);
-
- // NewHeader is now the header of the loop.
- L->moveToHeader(NewHeader);
- assert(L->getHeader() == NewHeader && "Latch block is our new header");
-
- // Inform DT about changes to the CFG.
- if (DT) {
- // The OrigPreheader branches to the NewHeader and Exit now. Then, inform
- // the DT about the removed edge to the OrigHeader (that got removed).
- SmallVector<DominatorTree::UpdateType, 3> Updates;
- Updates.push_back({DominatorTree::Insert, OrigPreheader, Exit});
- Updates.push_back({DominatorTree::Insert, OrigPreheader, NewHeader});
- Updates.push_back({DominatorTree::Delete, OrigPreheader, OrigHeader});
- DT->applyUpdates(Updates);
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
- if (MSSAU) {
- MSSAU->applyUpdates(Updates, *DT);
- if (VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
- }
- }
+ // Now that the CFG and DomTree are in a consistent state again, try to merge
+ // the OrigHeader block into OrigLatch. This will succeed if they are
+ // connected by an unconditional branch. This is just a cleanup so the
+ // emitted code isn't too gross in this common case.
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
- // At this point, we've finished our major CFG changes. As part of cloning
- // the loop into the preheader we've simplified instructions and the
- // duplicated conditional branch may now be branching on a constant. If it is
- // branching on a constant and if that constant means that we enter the loop,
- // then we fold away the cond branch to an uncond branch. This simplifies the
- // loop in cases important for nested loops, and it also means we don't have
- // to split as many edges.
- BranchInst *PHBI = cast<BranchInst>(OrigPreheader->getTerminator());
- assert(PHBI->isConditional() && "Should be clone of BI condbr!");
- if (!isa<ConstantInt>(PHBI->getCondition()) ||
- PHBI->getSuccessor(cast<ConstantInt>(PHBI->getCondition())->isZero()) !=
- NewHeader) {
- // The conditional branch can't be folded, handle the general case.
- // Split edges as necessary to preserve LoopSimplify form.
-
- // Right now OrigPreHeader has two successors, NewHeader and ExitBlock, and
- // thus is not a preheader anymore.
- // Split the edge to form a real preheader.
- BasicBlock *NewPH = SplitCriticalEdge(
- OrigPreheader, NewHeader,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- NewPH->setName(NewHeader->getName() + ".lr.ph");
-
- // Preserve canonical loop form, which means that 'Exit' should have only
- // one predecessor. Note that Exit could be an exit block for multiple
- // nested loops, causing both of the edges to now be critical and need to
- // be split.
- SmallVector<BasicBlock *, 4> ExitPreds(pred_begin(Exit), pred_end(Exit));
- bool SplitLatchEdge = false;
- for (BasicBlock *ExitPred : ExitPreds) {
- // We only need to split loop exit edges.
- Loop *PredLoop = LI->getLoopFor(ExitPred);
- if (!PredLoop || PredLoop->contains(Exit) ||
- ExitPred->getTerminator()->isIndirectTerminator())
- continue;
- SplitLatchEdge |= L->getLoopLatch() == ExitPred;
- BasicBlock *ExitSplit = SplitCriticalEdge(
- ExitPred, Exit,
- CriticalEdgeSplittingOptions(DT, LI, MSSAU).setPreserveLCSSA());
- ExitSplit->moveBefore(Exit);
- }
- assert(SplitLatchEdge &&
- "Despite splitting all preds, failed to split latch exit?");
- } else {
- // We can fold the conditional branch in the preheader, this makes things
- // simpler. The first step is to remove the extra edge to the Exit block.
- Exit->removePredecessor(OrigPreheader, true /*preserve LCSSA*/);
- BranchInst *NewBI = BranchInst::Create(NewHeader, PHBI);
- NewBI->setDebugLoc(PHBI->getDebugLoc());
- PHBI->eraseFromParent();
-
- // With our CFG finalized, update DomTree if it is available.
- if (DT) DT->deleteEdge(OrigPreheader, Exit);
-
- // Update MSSA too, if available.
- if (MSSAU)
- MSSAU->removeEdge(OrigPreheader, Exit);
- }
+ if (MSSAU && VerifyMemorySSA)
+ MSSAU->getMemorySSA()->verifyMemorySSA();
- assert(L->getLoopPreheader() && "Invalid loop preheader after loop rotation");
- assert(L->getLoopLatch() && "Invalid loop latch after loop rotation");
+ LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+ ++NumRotated;
- // Now that the CFG and DomTree are in a consistent state again, try to merge
- // the OrigHeader block into OrigLatch. This will succeed if they are
- // connected by an unconditional branch. This is just a cleanup so the
- // emitted code isn't too gross in this common case.
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
- MergeBlockIntoPredecessor(OrigHeader, &DTU, LI, MSSAU);
+ Rotated = true;
+ SimplifiedLatch = false;
- if (MSSAU && VerifyMemorySSA)
- MSSAU->getMemorySSA()->verifyMemorySSA();
+  // If the new latch is a deoptimizing exit, repeat the rotation when possible.
+  // A deoptimizing latch exit is not a typical case, so we simply loop over it.
+  // TODO: if this becomes a performance bottleneck, extend the rotation
+  // algorithm to handle multiple rotations in one go.
+ } while (MultiRotate && canRotateDeoptimizingLatchExit(L));
- LLVM_DEBUG(dbgs() << "LoopRotation: into "; L->dump());
- ++NumRotated;
return true;
}
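The branch-folding step near the top of this hunk (replacing the preheader's cloned conditional branch with an unconditional one and deleting the dead edge from the dominator tree) is a recurring pattern. Below is a minimal sketch of that pattern in isolation, assuming the condition is already known to be a ConstantInt and the two successors are distinct; the helper name is illustrative and not part of the pass:

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"

using namespace llvm;

// Fold a conditional branch whose condition is a constant into an
// unconditional branch to the taken successor, then drop the dead edge
// from the dominator tree (if one is being maintained).
static void foldConstantCondBranch(BranchInst *BI, DominatorTree *DT) {
  auto *Cond = cast<ConstantInt>(BI->getCondition());
  // Successor 0 is taken when the condition is true.
  BasicBlock *Taken = BI->getSuccessor(Cond->isZero() ? 1 : 0);
  BasicBlock *Dead = BI->getSuccessor(Cond->isZero() ? 0 : 1);
  BasicBlock *Parent = BI->getParent();

  // PHI nodes in the dead successor must forget this predecessor first.
  Dead->removePredecessor(Parent);

  BranchInst *NewBI = BranchInst::Create(Taken, BI);
  NewBI->setDebugLoc(BI->getDebugLoc());
  BI->eraseFromParent();

  if (DT)
    DT->deleteEdge(Parent, Dead);
}

The code in the hunk above additionally keeps MemorySSA in sync via MSSAU->removeEdge(); the sketch omits that to stay focused on the CFG and DomTree updates.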
diff --git a/llvm/lib/Transforms/Utils/LoopSimplify.cpp b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
index 28f88f39a712..a8445e94e55a 100644
--- a/llvm/lib/Transforms/Utils/LoopSimplify.cpp
+++ b/llvm/lib/Transforms/Utils/LoopSimplify.cpp
@@ -230,6 +230,27 @@ static Loop *separateNestedLoop(Loop *L, BasicBlock *Preheader,
if (!Preheader)
return nullptr;
+ // Treat the presence of convergent functions conservatively. The
+ // transformation is invalid if calls to certain convergent
+ // functions (like an AMDGPU barrier) get included in the resulting
+ // inner loop. But blocks meant for the inner loop will be
+ // identified later at a point where it's too late to abort the
+ // transformation. Also, the convergent attribute is not really
+ // sufficient to express the semantics of functions that are
+ // affected by this transformation. So we choose to back off if such
+ // a function call is present until a better alternative becomes
+ // available. This is similar to the conservative treatment of
+ // convergent function calls in GVNHoist and JumpThreading.
+ for (auto BB : L->blocks()) {
+ for (auto &II : *BB) {
+ if (auto CI = dyn_cast<CallBase>(&II)) {
+ if (CI->isConvergent()) {
+ return nullptr;
+ }
+ }
+ }
+ }
+
// The header is not a landing pad; preheader insertion should ensure this.
BasicBlock *Header = L->getHeader();
assert(!Header->isEHPad() && "Can't insert backedge to EH pad");
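The bail-out added above scans every instruction of the loop for a convergent call before any blocks are committed to the new inner loop. A standalone sketch of the same scan, written as a reusable predicate (the helper name is illustrative and not part of this patch):

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/InstrTypes.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// Return true if any call in the loop carries the convergent attribute
// (for example a GPU barrier); block-rearranging transforms such as
// separateNestedLoop conservatively give up in that case.
static bool loopHasConvergentOp(const Loop &L) {
  for (const BasicBlock *BB : L.blocks())
    for (const Instruction &I : *BB)
      if (const auto *CB = dyn_cast<CallBase>(&I))
        if (CB->isConvergent())
          return true;
  return false;
}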
@@ -598,6 +619,7 @@ ReprocessLoop:
if (!PreserveLCSSA || LI->replacementPreservesLCSSAForm(PN, V)) {
PN->replaceAllUsesWith(V);
PN->eraseFromParent();
+ Changed = true;
}
}
@@ -674,10 +696,8 @@ ReprocessLoop:
LI->removeBlock(ExitingBlock);
DomTreeNode *Node = DT->getNode(ExitingBlock);
- const std::vector<DomTreeNodeBase<BasicBlock> *> &Children =
- Node->getChildren();
- while (!Children.empty()) {
- DomTreeNode *Child = Children.front();
+ while (!Node->isLeaf()) {
+ DomTreeNode *Child = Node->back();
DT->changeImmediateDominator(Child, Node->getIDom());
}
DT->eraseNode(ExitingBlock);
diff --git a/llvm/lib/Transforms/Utils/LoopUnroll.cpp b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
index 4b94b371e70a..3875c631f839 100644
--- a/llvm/lib/Transforms/Utils/LoopUnroll.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnroll.cpp
@@ -15,21 +15,46 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/ilist_iterator.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DataLayout.h"
+#include "llvm/IR/CFG.h"
+#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/GenericDomTree.h"
+#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
@@ -38,6 +63,17 @@
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <algorithm>
+#include <assert.h>
+#include <type_traits>
+#include <vector>
+
+namespace llvm {
+class DataLayout;
+class Value;
+} // namespace llvm
+
using namespace llvm;
#define DEBUG_TYPE "loop-unroll"
@@ -45,8 +81,8 @@ using namespace llvm;
// TODO: Should these be here or in LoopUnroll?
STATISTIC(NumCompletelyUnrolled, "Number of loops completely unrolled");
STATISTIC(NumUnrolled, "Number of loops unrolled (completely or otherwise)");
-STATISTIC(NumUnrolledWithHeader, "Number of loops unrolled without a "
- "conditional latch (completely or otherwise)");
+STATISTIC(NumUnrolledNotLatch, "Number of loops unrolled without a conditional "
+ "latch (completely or otherwise)");
static cl::opt<bool>
UnrollRuntimeEpilog("unroll-runtime-epilog", cl::init(false), cl::Hidden,
@@ -63,39 +99,6 @@ UnrollVerifyDomtree("unroll-verify-domtree", cl::Hidden,
#endif
);
-/// Convert the instruction operands from referencing the current values into
-/// those specified by VMap.
-void llvm::remapInstruction(Instruction *I, ValueToValueMapTy &VMap) {
- for (unsigned op = 0, E = I->getNumOperands(); op != E; ++op) {
- Value *Op = I->getOperand(op);
-
- // Unwrap arguments of dbg.value intrinsics.
- bool Wrapped = false;
- if (auto *V = dyn_cast<MetadataAsValue>(Op))
- if (auto *Unwrapped = dyn_cast<ValueAsMetadata>(V->getMetadata())) {
- Op = Unwrapped->getValue();
- Wrapped = true;
- }
-
- auto wrap = [&](Value *V) {
- auto &C = I->getContext();
- return Wrapped ? MetadataAsValue::get(C, ValueAsMetadata::get(V)) : V;
- };
-
- ValueToValueMapTy::iterator It = VMap.find(Op);
- if (It != VMap.end())
- I->setOperand(op, wrap(It->second));
- }
-
- if (PHINode *PN = dyn_cast<PHINode>(I)) {
- for (unsigned i = 0, e = PN->getNumIncomingValues(); i != e; ++i) {
- ValueToValueMapTy::iterator It = VMap.find(PN->getIncomingBlock(i));
- if (It != VMap.end())
- PN->setIncomingBlock(i, cast<BasicBlock>(It->second));
- }
- }
-}
-
/// Check if unrolling created a situation where we need to insert phi nodes to
/// preserve LCSSA form.
/// \param Blocks is a vector of basic blocks representing unrolled loop.
@@ -199,18 +202,20 @@ static bool isEpilogProfitable(Loop *L) {
/// simplify/dce pass of the instructions.
void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC) {
+ AssumptionCache *AC,
+ const TargetTransformInfo *TTI) {
// Simplify any new induction variables in the partially unrolled loop.
if (SE && SimplifyIVs) {
SmallVector<WeakTrackingVH, 16> DeadInsts;
- simplifyLoopIVs(L, SE, DT, LI, DeadInsts);
+ simplifyLoopIVs(L, SE, DT, LI, TTI, DeadInsts);
// Aggressively clean up dead instructions that simplifyLoopIVs already
// identified. Any remaining should be cleaned up below.
- while (!DeadInsts.empty())
- if (Instruction *Inst =
- dyn_cast_or_null<Instruction>(&*DeadInsts.pop_back_val()))
+ while (!DeadInsts.empty()) {
+ Value *V = DeadInsts.pop_back_val();
+ if (Instruction *Inst = dyn_cast_or_null<Instruction>(V))
RecursivelyDeleteTriviallyDeadInstructions(Inst);
+ }
}
// At this point, the code is well formed. We now do a quick sweep over the
@@ -277,6 +282,7 @@ void llvm::simplifyLoopAfterUnroll(Loop *L, bool SimplifyIVs, LoopInfo *LI,
LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
ScalarEvolution *SE, DominatorTree *DT,
AssumptionCache *AC,
+ const TargetTransformInfo *TTI,
OptimizationRemarkEmitter *ORE,
bool PreserveLCSSA, Loop **RemainderLoop) {
@@ -298,48 +304,35 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
return LoopUnrollResult::Unmodified;
}
- // The current loop unroll pass can unroll loops with a single latch or header
- // that's a conditional branch exiting the loop.
+ // The current loop unroll pass can unroll loops that have
+ // (1) single latch; and
+ // (2a) latch is unconditional; or
+ // (2b) latch is conditional and is an exiting block
// FIXME: The implementation can be extended to work with more complicated
// cases, e.g. loops with multiple latches.
BasicBlock *Header = L->getHeader();
- BranchInst *HeaderBI = dyn_cast<BranchInst>(Header->getTerminator());
- BranchInst *BI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
-
- // FIXME: Support loops without conditional latch and multiple exiting blocks.
- if (!BI ||
- (BI->isUnconditional() && (!HeaderBI || HeaderBI->isUnconditional() ||
- L->getExitingBlock() != Header))) {
- LLVM_DEBUG(dbgs() << " Can't unroll; loop not terminated by a conditional "
- "branch in the latch or header.\n");
- return LoopUnrollResult::Unmodified;
- }
-
- auto CheckLatchSuccessors = [&](unsigned S1, unsigned S2) {
- return BI->isConditional() && BI->getSuccessor(S1) == Header &&
- !L->contains(BI->getSuccessor(S2));
- };
-
- // If we have a conditional latch, it must exit the loop.
- if (BI && BI->isConditional() && !CheckLatchSuccessors(0, 1) &&
- !CheckLatchSuccessors(1, 0)) {
+ BranchInst *LatchBI = dyn_cast<BranchInst>(LatchBlock->getTerminator());
+
+  // A conditional branch that exits the loop; in some cases it can be
+  // optimized to an unconditional branch in the unrolled loop.
+ BranchInst *ExitingBI = nullptr;
+ bool LatchIsExiting = L->isLoopExiting(LatchBlock);
+ if (LatchIsExiting)
+ ExitingBI = LatchBI;
+ else if (BasicBlock *ExitingBlock = L->getExitingBlock())
+ ExitingBI = dyn_cast<BranchInst>(ExitingBlock->getTerminator());
+ if (!LatchBI || (LatchBI->isConditional() && !LatchIsExiting)) {
LLVM_DEBUG(
dbgs() << "Can't unroll; a conditional latch must exit the loop");
return LoopUnrollResult::Unmodified;
}
-
- auto CheckHeaderSuccessors = [&](unsigned S1, unsigned S2) {
- return HeaderBI && HeaderBI->isConditional() &&
- L->contains(HeaderBI->getSuccessor(S1)) &&
- !L->contains(HeaderBI->getSuccessor(S2));
- };
-
- // If we do not have a conditional latch, the header must exit the loop.
- if (BI && !BI->isConditional() && HeaderBI && HeaderBI->isConditional() &&
- !CheckHeaderSuccessors(0, 1) && !CheckHeaderSuccessors(1, 0)) {
- LLVM_DEBUG(dbgs() << "Can't unroll; conditional header must exit the loop");
- return LoopUnrollResult::Unmodified;
- }
+ LLVM_DEBUG({
+ if (ExitingBI)
+ dbgs() << " Exiting Block = " << ExitingBI->getParent()->getName()
+ << "\n";
+ else
+ dbgs() << " No single exiting block\n";
+ });
if (Header->hasAddressTaken()) {
// The loop-rotate pass can be helpful to avoid this in many cases.
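The new gating condition spelled out above, items (1), (2a) and (2b), can be read as a small predicate over the loop's latch. A sketch under the same assumptions (single latch, plain branch terminator); the helper name is illustrative:

#include "llvm/Analysis/LoopInfo.h"
#include "llvm/IR/Instructions.h"
#include "llvm/Support/Casting.h"

using namespace llvm;

// (1) the loop has a single latch, and either
// (2a) the latch branch is unconditional, or
// (2b) the latch branch is conditional and exits the loop.
static bool hasUnrollableLatch(const Loop &L) {
  BasicBlock *Latch = L.getLoopLatch();
  if (!Latch)
    return false;                                  // violates (1)
  auto *LatchBI = dyn_cast<BranchInst>(Latch->getTerminator());
  if (!LatchBI)
    return false;                                  // not a plain branch
  return LatchBI->isUnconditional() ||             // (2a)
         L.isLoopExiting(Latch);                   // (2b)
}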
@@ -421,8 +414,8 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
bool HasConvergent = false;
for (auto &BB : L->blocks())
for (auto &I : *BB)
- if (auto CS = CallSite(&I))
- HasConvergent |= CS.isConvergent();
+ if (auto *CB = dyn_cast<CallBase>(&I))
+ HasConvergent |= CB->isConvergent();
assert((!HasConvergent || ULO.TripMultiple % ULO.Count == 0) &&
"Unroll count must divide trip multiple if loop contains a "
"convergent operation.");
@@ -435,7 +428,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
if (RuntimeTripCount && ULO.TripMultiple % ULO.Count != 0 &&
!UnrollRuntimeLoopRemainder(L, ULO.Count, ULO.AllowExpensiveTripCount,
EpilogProfitability, ULO.UnrollRemainder,
- ULO.ForgetAllSCEV, LI, SE, DT, AC,
+ ULO.ForgetAllSCEV, LI, SE, DT, AC, TTI,
PreserveLCSSA, RemainderLoop)) {
if (ULO.Force)
RuntimeTripCount = false;
@@ -528,16 +521,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
SE->forgetTopmostLoop(L);
}
- bool ContinueOnTrue;
- bool LatchIsExiting = BI->isConditional();
+ if (!LatchIsExiting)
+ ++NumUnrolledNotLatch;
+ Optional<bool> ContinueOnTrue = None;
BasicBlock *LoopExit = nullptr;
- if (LatchIsExiting) {
- ContinueOnTrue = L->contains(BI->getSuccessor(0));
- LoopExit = BI->getSuccessor(ContinueOnTrue);
- } else {
- NumUnrolledWithHeader++;
- ContinueOnTrue = L->contains(HeaderBI->getSuccessor(0));
- LoopExit = HeaderBI->getSuccessor(ContinueOnTrue);
+ if (ExitingBI) {
+ ContinueOnTrue = L->contains(ExitingBI->getSuccessor(0));
+ LoopExit = ExitingBI->getSuccessor(*ContinueOnTrue);
}
// For the first iteration of the loop, we should use the precloned values for
@@ -549,20 +539,14 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
std::vector<BasicBlock *> Headers;
- std::vector<BasicBlock *> HeaderSucc;
+ std::vector<BasicBlock *> ExitingBlocks;
+ std::vector<BasicBlock *> ExitingSucc;
std::vector<BasicBlock *> Latches;
Headers.push_back(Header);
Latches.push_back(LatchBlock);
-
- if (!LatchIsExiting) {
- auto *Term = cast<BranchInst>(Header->getTerminator());
- if (Term->isUnconditional() || L->contains(Term->getSuccessor(0))) {
- assert(L->contains(Term->getSuccessor(0)));
- HeaderSucc.push_back(Term->getSuccessor(0));
- } else {
- assert(L->contains(Term->getSuccessor(1)));
- HeaderSucc.push_back(Term->getSuccessor(1));
- }
+ if (ExitingBI) {
+ ExitingBlocks.push_back(ExitingBI->getParent());
+ ExitingSucc.push_back(ExitingBI->getSuccessor(!(*ContinueOnTrue)));
}
// The current on-the-fly SSA update requires blocks to be processed in
@@ -600,7 +584,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
for (unsigned It = 1; It != ULO.Count; ++It) {
- std::vector<BasicBlock*> NewBlocks;
+ SmallVector<BasicBlock *, 8> NewBlocks;
SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
NewLoops[L] = L;
@@ -654,12 +638,14 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
if (*BB == LatchBlock)
Latches.push_back(New);
- // Keep track of the successor of the new header in the current iteration.
- for (auto *Pred : predecessors(*BB))
- if (Pred == Header) {
- HeaderSucc.push_back(New);
- break;
- }
+ // Keep track of the exiting block and its successor block contained in
+ // the loop for the current iteration.
+ if (ExitingBI) {
+ if (*BB == ExitingBlocks[0])
+ ExitingBlocks.push_back(New);
+ if (*BB == ExitingSucc[0])
+ ExitingSucc.push_back(New);
+ }
NewBlocks.push_back(New);
UnrolledLoopBlocks.push_back(New);
@@ -682,9 +668,9 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
// Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
for (BasicBlock *NewBlock : NewBlocks) {
for (Instruction &I : *NewBlock) {
- ::remapInstruction(&I, LastValueMap);
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::assume)
AC->registerAssumption(II);
@@ -710,18 +696,19 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
}
- auto setDest = [LoopExit, ContinueOnTrue](BasicBlock *Src, BasicBlock *Dest,
- ArrayRef<BasicBlock *> NextBlocks,
- BasicBlock *BlockInLoop,
- bool NeedConditional) {
+ auto setDest = [](BasicBlock *Src, BasicBlock *Dest, BasicBlock *BlockInLoop,
+ bool NeedConditional, Optional<bool> ContinueOnTrue,
+ bool IsDestLoopExit) {
auto *Term = cast<BranchInst>(Src->getTerminator());
if (NeedConditional) {
// Update the conditional branch's successor for the following
// iteration.
- Term->setSuccessor(!ContinueOnTrue, Dest);
+ assert(ContinueOnTrue.hasValue() &&
+ "Expecting valid ContinueOnTrue when NeedConditional is true");
+ Term->setSuccessor(!(*ContinueOnTrue), Dest);
} else {
// Remove phi operands at this loop exit
- if (Dest != LoopExit) {
+ if (!IsDestLoopExit) {
BasicBlock *BB = Src;
for (BasicBlock *Succ : successors(BB)) {
// Preserve the incoming value from BB if we are jumping to the block
@@ -738,29 +725,27 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
}
};
- // Now that all the basic blocks for the unrolled iterations are in place,
- // set up the branches to connect them.
- if (LatchIsExiting) {
- // Set up latches to branch to the new header in the unrolled iterations or
- // the loop exit for the last latch in a fully unrolled loop.
- for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
- // The branch destination.
- unsigned j = (i + 1) % e;
- BasicBlock *Dest = Headers[j];
- bool NeedConditional = true;
+ // Connect latches of the unrolled iterations to the headers of the next
+ // iteration. If the latch is also the exiting block, the conditional branch
+ // may have to be preserved.
+ for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
+ // The branch destination.
+ unsigned j = (i + 1) % e;
+ BasicBlock *Dest = Headers[j];
+ bool NeedConditional = LatchIsExiting;
- if (RuntimeTripCount && j != 0) {
+ if (LatchIsExiting) {
+ if (RuntimeTripCount && j != 0)
NeedConditional = false;
- }
// For a complete unroll, make the last iteration end with a branch
// to the exit block.
if (CompletelyUnroll) {
if (j == 0)
Dest = LoopExit;
- // If using trip count upper bound to completely unroll, we need to keep
- // the conditional branch except the last one because the loop may exit
- // after any iteration.
+ // If using trip count upper bound to completely unroll, we need to
+ // keep the conditional branch except the last one because the loop
+ // may exit after any iteration.
assert(NeedConditional &&
"NeedCondition cannot be modified by both complete "
"unrolling and runtime unrolling");
@@ -772,16 +757,18 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// unconditional branch for some iterations.
NeedConditional = false;
}
-
- setDest(Latches[i], Dest, Headers, Headers[i], NeedConditional);
}
- } else {
- // Setup headers to branch to their new successors in the unrolled
- // iterations.
- for (unsigned i = 0, e = Headers.size(); i != e; ++i) {
+
+ setDest(Latches[i], Dest, Headers[i], NeedConditional, ContinueOnTrue,
+ Dest == LoopExit);
+ }
+
+ if (!LatchIsExiting) {
+ // If the latch is not exiting, we may be able to simplify the conditional
+ // branches in the unrolled exiting blocks.
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
// The branch destination.
unsigned j = (i + 1) % e;
- BasicBlock *Dest = HeaderSucc[i];
bool NeedConditional = true;
if (RuntimeTripCount && j != 0)
@@ -797,27 +784,19 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// unconditional branch for some iterations.
NeedConditional = false;
- setDest(Headers[i], Dest, Headers, HeaderSucc[i], NeedConditional);
+      // Conditional branches from a non-latch exiting block have successors
+      // either in the same loop iteration or outside the loop, so these
+      // branches are already correct.
+ if (NeedConditional)
+ continue;
+ setDest(ExitingBlocks[i], ExitingSucc[i], ExitingSucc[i], NeedConditional,
+ None, false);
}
- // Set up latches to branch to the new header in the unrolled iterations or
- // the loop exit for the last latch in a fully unrolled loop.
-
- for (unsigned i = 0, e = Latches.size(); i != e; ++i) {
- // The original branch was replicated in each unrolled iteration.
- BranchInst *Term = cast<BranchInst>(Latches[i]->getTerminator());
-
- // The branch destination.
- unsigned j = (i + 1) % e;
- BasicBlock *Dest = Headers[j];
-
- // When completely unrolling, the last latch becomes unreachable.
- if (CompletelyUnroll && j == 0)
- new UnreachableInst(Term->getContext(), Term);
- else
- // Replace the conditional branch with an unconditional one.
- BranchInst::Create(Dest, Term);
-
+ // When completely unrolling, the last latch becomes unreachable.
+ if (CompletelyUnroll) {
+ BranchInst *Term = cast<BranchInst>(Latches.back()->getTerminator());
+ new UnreachableInst(Term->getContext(), Term);
Term->eraseFromParent();
}
}
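To see the effect of the NeedConditional logic above at the source level, here is a hypothetical before/after of a partial unroll by 2 when the trip count is known to be a multiple of the unroll factor; it illustrates the intent and is not output of the pass:

// Before: the latch test runs every iteration.
void before(int *A, int N /* assumed: N % 2 == 0 */) {
  for (int i = 0; i < N; ++i)
    A[i] += 1;
}

// After unrolling by 2: the branch at the end of the first copy always
// falls through to the second copy, so it is emitted unconditionally;
// only the last copy keeps the conditional latch that exits the loop.
void after(int *A, int N /* assumed: N % 2 == 0 */) {
  for (int i = 0; i < N; i += 2) {
    A[i] += 1;      // copy 1: unconditional fall-through
    A[i + 1] += 1;  // copy 2: conditional exiting latch
  }
}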
@@ -830,15 +809,13 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
for (auto *BB : OriginalLoopBlocks) {
auto *BBDomNode = DT->getNode(BB);
SmallVector<BasicBlock *, 16> ChildrenToUpdate;
- for (auto *ChildDomNode : BBDomNode->getChildren()) {
+ for (auto *ChildDomNode : BBDomNode->children()) {
auto *ChildBB = ChildDomNode->getBlock();
if (!L->contains(ChildBB))
ChildrenToUpdate.push_back(ChildBB);
}
BasicBlock *NewIDom;
- BasicBlock *&TermBlock = LatchIsExiting ? LatchBlock : Header;
- auto &TermBlocks = LatchIsExiting ? Latches : Headers;
- if (BB == TermBlock) {
+ if (ExitingBI && BB == ExitingBlocks[0]) {
// The latch is special because we emit unconditional branches in
// some cases where the original loop contained a conditional branch.
// Since the latch is always at the bottom of the loop, if the latch
@@ -846,13 +823,14 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// must also be a latch. Specifically, the dominator is the first
// latch which ends in a conditional branch, or the last latch if
// there is no such latch.
- // For loops exiting from the header, we limit the supported loops
- // to have a single exiting block.
- NewIDom = TermBlocks.back();
- for (BasicBlock *Iter : TermBlocks) {
- Instruction *Term = Iter->getTerminator();
+      // For loops exiting from a non-latch exiting block, we limit the
+      // branch simplification to loops with a single exiting block.
+ NewIDom = ExitingBlocks.back();
+ for (unsigned i = 0, e = ExitingBlocks.size(); i != e; ++i) {
+ Instruction *Term = ExitingBlocks[i]->getTerminator();
if (isa<BranchInst>(Term) && cast<BranchInst>(Term)->isConditional()) {
- NewIDom = Iter;
+ NewIDom =
+ DT->findNearestCommonDominator(ExitingBlocks[i], Latches[i]);
break;
}
}
@@ -897,7 +875,7 @@ LoopUnrollResult llvm::UnrollLoop(Loop *L, UnrollLoopOptions ULO, LoopInfo *LI,
// At this point, the code is well formed. We now simplify the unrolled loop,
// doing constant propagation and dead code elimination as we go.
simplifyLoopAfterUnroll(L, !CompletelyUnroll && (ULO.Count > 1 || Peeled), LI,
- SE, DT, AC);
+ SE, DT, AC, TTI);
NumCompletelyUnrolled += CompletelyUnroll;
++NumUnrolled;
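With this change simplifyLoopAfterUnroll, UnrollLoop, UnrollRuntimeLoopRemainder and UnrollAndJamLoop all take a TargetTransformInfo pointer, so out-of-tree callers have to thread one through. A minimal sketch of an updated call site; the wrapper and how its arguments are obtained are assumptions, only the UnrollLoop signature comes from this patch:

#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"

using namespace llvm;

// Forward an existing TargetTransformInfo analysis result into the new
// parameter; TTI now sits between AC and ORE in the argument list.
static LoopUnrollResult unrollWithTTI(Loop *L, UnrollLoopOptions ULO,
                                      LoopInfo *LI, ScalarEvolution *SE,
                                      DominatorTree *DT, AssumptionCache *AC,
                                      const TargetTransformInfo *TTI,
                                      OptimizationRemarkEmitter *ORE,
                                      bool PreserveLCSSA) {
  return UnrollLoop(L, ULO, LI, SE, DT, AC, TTI, ORE, PreserveLCSSA,
                    /*RemainderLoop=*/nullptr);
}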
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
index f1965934b2d7..dd628f3e7e0c 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollAndJam.cpp
@@ -11,31 +11,54 @@
//
//===----------------------------------------------------------------------===//
+#include "llvm/ADT/ArrayRef.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/Sequence.h"
#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
+#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/Analysis/DependenceAnalysis.h"
-#include "llvm/Analysis/InstructionSimplify.h"
-#include "llvm/Analysis/LoopAnalysisManager.h"
+#include "llvm/Analysis/DomTreeUpdater.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopIterator.h"
-#include "llvm/Analysis/LoopPass.h"
+#include "llvm/Analysis/MustExecute.h"
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/Utils/Local.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DebugInfoMetadata.h"
+#include "llvm/IR/DebugLoc.h"
+#include "llvm/IR/DiagnosticInfo.h"
#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/Instruction.h"
+#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
-#include "llvm/IR/LLVMContext.h"
+#include "llvm/IR/Use.h"
+#include "llvm/IR/User.h"
+#include "llvm/IR/Value.h"
+#include "llvm/IR/ValueHandle.h"
+#include "llvm/IR/ValueMap.h"
+#include "llvm/Support/Casting.h"
#include "llvm/Support/Debug.h"
+#include "llvm/Support/ErrorHandling.h"
+#include "llvm/Support/GenericDomTree.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
-#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
-#include "llvm/Transforms/Utils/SimplifyIndVar.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
+#include "llvm/Transforms/Utils/ValueMapper.h"
+#include <assert.h>
+#include <memory>
+#include <type_traits>
+#include <vector>
+
using namespace llvm;
#define DEBUG_TYPE "loop-unroll-and-jam"
@@ -47,17 +70,14 @@ typedef SmallPtrSet<BasicBlock *, 4> BasicBlockSet;
// Partition blocks in an outer/inner loop pair into blocks before and after
// the loop
-static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
- BasicBlockSet &ForeBlocks,
- BasicBlockSet &SubLoopBlocks,
- BasicBlockSet &AftBlocks,
- DominatorTree *DT) {
+static bool partitionLoopBlocks(Loop &L, BasicBlockSet &ForeBlocks,
+ BasicBlockSet &AftBlocks, DominatorTree &DT) {
+ Loop *SubLoop = L.getSubLoops()[0];
BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
- for (BasicBlock *BB : L->blocks()) {
+ for (BasicBlock *BB : L.blocks()) {
if (!SubLoop->contains(BB)) {
- if (DT->dominates(SubLoopLatch, BB))
+ if (DT.dominates(SubLoopLatch, BB))
AftBlocks.insert(BB);
else
ForeBlocks.insert(BB);
@@ -71,14 +91,44 @@ static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
if (BB == SubLoopPreHeader)
continue;
Instruction *TI = BB->getTerminator();
- for (unsigned i = 0, e = TI->getNumSuccessors(); i != e; ++i)
- if (!ForeBlocks.count(TI->getSuccessor(i)))
+ for (BasicBlock *Succ : successors(TI))
+ if (!ForeBlocks.count(Succ))
return false;
}
return true;
}
+/// Partition blocks in a loop nest into blocks before and after each inner
+/// loop.
+static bool partitionOuterLoopBlocks(
+ Loop &Root, Loop &JamLoop, BasicBlockSet &JamLoopBlocks,
+ DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ DenseMap<Loop *, BasicBlockSet> &AftBlocksMap, DominatorTree &DT) {
+ JamLoopBlocks.insert(JamLoop.block_begin(), JamLoop.block_end());
+
+ for (Loop *L : Root.getLoopsInPreorder()) {
+ if (L == &JamLoop)
+ break;
+
+ if (!partitionLoopBlocks(*L, ForeBlocksMap[L], AftBlocksMap[L], DT))
+ return false;
+ }
+
+ return true;
+}
+
+// TODO: Remove once UnrollAndJamLoop is changed to support unroll-and-jamming
+// loop nests of more than two levels.
+static bool partitionOuterLoopBlocks(Loop *L, Loop *SubLoop,
+ BasicBlockSet &ForeBlocks,
+ BasicBlockSet &SubLoopBlocks,
+ BasicBlockSet &AftBlocks,
+ DominatorTree *DT) {
+ SubLoopBlocks.insert(SubLoop->block_begin(), SubLoop->block_end());
+ return partitionLoopBlocks(*L, ForeBlocks, AftBlocks, *DT);
+}
+
// Looks at the phi nodes in Header for values coming from Latch. For these
// instructions and all their operands calls Visit on them, keeping going for
// all the operands in AftBlocks. Returns false if Visit returns false,
@@ -169,10 +219,12 @@ static void moveHeaderPhiOperandsToForeBlocks(BasicBlock *Header,
If EpilogueLoop is non-null, it receives the epilogue loop (if it was
necessary to create one and not fully unrolled).
*/
-LoopUnrollResult llvm::UnrollAndJamLoop(
- Loop *L, unsigned Count, unsigned TripCount, unsigned TripMultiple,
- bool UnrollRemainder, LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
- AssumptionCache *AC, OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
+LoopUnrollResult
+llvm::UnrollAndJamLoop(Loop *L, unsigned Count, unsigned TripCount,
+ unsigned TripMultiple, bool UnrollRemainder,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT,
+ AssumptionCache *AC, const TargetTransformInfo *TTI,
+ OptimizationRemarkEmitter *ORE, Loop **EpilogueLoop) {
// When we enter here we should have already checked that it is safe
BasicBlock *Header = L->getHeader();
@@ -198,7 +250,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
if (!UnrollRuntimeLoopRemainder(L, Count, /*AllowExpensiveTripCount*/ false,
/*UseEpilogRemainder*/ true,
UnrollRemainder, /*ForgetAllSCEV*/ false,
- LI, SE, DT, AC, true, EpilogueLoop)) {
+ LI, SE, DT, AC, TTI, true, EpilogueLoop)) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; remainder loop could not be "
"generated when assuming runtime trip count\n");
return LoopUnrollResult::Unmodified;
@@ -284,8 +336,7 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Move any instructions from fore phi operands from AftBlocks into Fore.
moveHeaderPhiOperandsToForeBlocks(
- Header, LatchBlock, SubLoop->getLoopPreheader()->getTerminator(),
- AftBlocks);
+ Header, LatchBlock, ForeBlocksLast[0]->getTerminator(), AftBlocks);
// The current on-the-fly SSA update requires blocks to be processed in
// reverse postorder so that LastValueMap contains the correct value at each
@@ -312,32 +363,32 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Copy all blocks
for (unsigned It = 1; It != Count; ++It) {
- std::vector<BasicBlock *> NewBlocks;
+ SmallVector<BasicBlock *, 8> NewBlocks;
// Maps Blocks[It] -> Blocks[It-1]
DenseMap<Value *, Value *> PrevItValueMap;
+ SmallDenseMap<const Loop *, Loop *, 4> NewLoops;
+ NewLoops[L] = L;
+ NewLoops[SubLoop] = SubLoop;
for (LoopBlocksDFS::RPOIterator BB = BlockBegin; BB != BlockEnd; ++BB) {
ValueToValueMapTy VMap;
BasicBlock *New = CloneBasicBlock(*BB, VMap, "." + Twine(It));
Header->getParent()->getBasicBlockList().push_back(New);
- if (ForeBlocks.count(*BB)) {
- L->addBasicBlockToLoop(New, *LI);
+ // Tell LI about New.
+ addClonedBlockToLoopInfo(*BB, New, LI, NewLoops);
+ if (ForeBlocks.count(*BB)) {
if (*BB == ForeBlocksFirst[0])
ForeBlocksFirst.push_back(New);
if (*BB == ForeBlocksLast[0])
ForeBlocksLast.push_back(New);
} else if (SubLoopBlocks.count(*BB)) {
- SubLoop->addBasicBlockToLoop(New, *LI);
-
if (*BB == SubLoopBlocksFirst[0])
SubLoopBlocksFirst.push_back(New);
if (*BB == SubLoopBlocksLast[0])
SubLoopBlocksLast.push_back(New);
} else if (AftBlocks.count(*BB)) {
- L->addBasicBlockToLoop(New, *LI);
-
if (*BB == AftBlocksFirst[0])
AftBlocksFirst.push_back(New);
if (*BB == AftBlocksLast[0])
@@ -379,9 +430,9 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
}
// Remap all instructions in the most recent iteration
+ remapInstructionsInBlocks(NewBlocks, LastValueMap);
for (BasicBlock *NewBlock : NewBlocks) {
for (Instruction &I : *NewBlock) {
- ::remapInstruction(&I, LastValueMap);
if (auto *II = dyn_cast<IntrinsicInst>(&I))
if (II->getIntrinsicID() == Intrinsic::assume)
AC->registerAssumption(II);
@@ -447,8 +498,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Update ForeBlocks successors and phi nodes
BranchInst *ForeTerm =
cast<BranchInst>(ForeBlocksLast.back()->getTerminator());
- BasicBlock *Dest = SubLoopBlocksFirst[0];
- ForeTerm->setSuccessor(0, Dest);
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, SubLoopBlocksFirst[0]);
if (CompletelyUnroll) {
while (PHINode *Phi = dyn_cast<PHINode>(ForeBlocksFirst[0]->begin())) {
@@ -465,8 +516,8 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
// Remap ForeBlock successors from previous iteration to this
BranchInst *ForeTerm =
cast<BranchInst>(ForeBlocksLast[It - 1]->getTerminator());
- BasicBlock *Dest = ForeBlocksFirst[It];
- ForeTerm->setSuccessor(0, Dest);
+ assert(ForeTerm->getNumSuccessors() == 1 && "Expecting one successor");
+ ForeTerm->setSuccessor(0, ForeBlocksFirst[It]);
}
// Subloop successors and phis
@@ -495,12 +546,14 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
}
// Aft blocks successors and phis
- BranchInst *Term = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
+ BranchInst *AftTerm = cast<BranchInst>(AftBlocksLast.back()->getTerminator());
if (CompletelyUnroll) {
- BranchInst::Create(LoopExit, Term);
- Term->eraseFromParent();
+ BranchInst::Create(LoopExit, AftTerm);
+ AftTerm->eraseFromParent();
} else {
- Term->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ AftTerm->setSuccessor(!ContinueOnTrue, ForeBlocksFirst[0]);
+ assert(AftTerm->getSuccessor(ContinueOnTrue) == LoopExit &&
+ "Expecting the ContinueOnTrue successor of AftTerm to be LoopExit");
}
updatePHIBlocks(AftBlocksFirst[0], SubLoopBlocksLast[0],
SubLoopBlocksLast.back());
@@ -540,55 +593,48 @@ LoopUnrollResult llvm::UnrollAndJamLoop(
MergeBlocks.insert(ForeBlocksLast.begin(), ForeBlocksLast.end());
MergeBlocks.insert(SubLoopBlocksLast.begin(), SubLoopBlocksLast.end());
MergeBlocks.insert(AftBlocksLast.begin(), AftBlocksLast.end());
- while (!MergeBlocks.empty()) {
- BasicBlock *BB = *MergeBlocks.begin();
- BranchInst *Term = dyn_cast<BranchInst>(BB->getTerminator());
- if (Term && Term->isUnconditional() && L->contains(Term->getSuccessor(0))) {
- BasicBlock *Dest = Term->getSuccessor(0);
- BasicBlock *Fold = Dest->getUniquePredecessor();
- if (MergeBlockIntoPredecessor(Dest, &DTU, LI)) {
- // Don't remove BB and add Fold as they are the same BB
- assert(Fold == BB);
- (void)Fold;
- MergeBlocks.erase(Dest);
- } else
- MergeBlocks.erase(BB);
- } else
- MergeBlocks.erase(BB);
- }
+
+ MergeBlockSuccessorsIntoGivenBlocks(MergeBlocks, L, &DTU, LI);
+
// Apply updates to the DomTree.
DT = &DTU.getDomTree();
// At this point, the code is well formed. We now do a quick sweep over the
// inserted code, doing constant propagation and dead code elimination as we
// go.
- simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC);
- simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC);
+ simplifyLoopAfterUnroll(SubLoop, true, LI, SE, DT, AC, TTI);
+ simplifyLoopAfterUnroll(L, !CompletelyUnroll && Count > 1, LI, SE, DT, AC,
+ TTI);
NumCompletelyUnrolledAndJammed += CompletelyUnroll;
++NumUnrolledAndJammed;
+ // Update LoopInfo if the loop is completely removed.
+ if (CompletelyUnroll)
+ LI->erase(L);
+
#ifndef NDEBUG
// We shouldn't have done anything to break loop simplify form or LCSSA.
- Loop *OuterL = L->getParentLoop();
- Loop *OutestLoop = OuterL ? OuterL : (!CompletelyUnroll ? L : SubLoop);
+ Loop *OutestLoop = SubLoop->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ ? SubLoop->getParentLoop()->getParentLoop()
+ : SubLoop->getParentLoop()
+ : SubLoop;
+ assert(DT->verify());
+ LI->verify(*DT);
assert(OutestLoop->isRecursivelyLCSSAForm(*DT, *LI));
if (!CompletelyUnroll)
assert(L->isLoopSimplifyForm());
assert(SubLoop->isLoopSimplifyForm());
- assert(DT->verify());
+ SE->verify();
#endif
- // Update LoopInfo if the loop is completely removed.
- if (CompletelyUnroll)
- LI->erase(L);
-
return CompletelyUnroll ? LoopUnrollResult::FullyUnrolled
: LoopUnrollResult::PartiallyUnrolled;
}
static bool getLoadsAndStores(BasicBlockSet &Blocks,
- SmallVector<Value *, 4> &MemInstr) {
+ SmallVector<Instruction *, 4> &MemInstr) {
// Scan the BBs and collect legal loads and stores.
// Returns false if non-simple loads/stores are found.
for (BasicBlock *BB : Blocks) {
@@ -609,97 +655,235 @@ static bool getLoadsAndStores(BasicBlockSet &Blocks,
return true;
}
-static bool checkDependencies(SmallVector<Value *, 4> &Earlier,
- SmallVector<Value *, 4> &Later,
- unsigned LoopDepth, bool InnerLoop,
- DependenceInfo &DI) {
- // Use DA to check for dependencies between loads and stores that make unroll
- // and jam invalid
- for (Value *I : Earlier) {
- for (Value *J : Later) {
- Instruction *Src = cast<Instruction>(I);
- Instruction *Dst = cast<Instruction>(J);
- if (Src == Dst)
- continue;
- // Ignore Input dependencies.
- if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
- continue;
-
- // Track dependencies, and if we find them take a conservative approach
- // by allowing only = or < (not >), altough some > would be safe
- // (depending upon unroll width).
- // For the inner loop, we need to disallow any (> <) dependencies
- // FIXME: Allow > so long as distance is less than unroll width
- if (auto D = DI.depends(Src, Dst, true)) {
- assert(D->isOrdered() && "Expected an output, flow or anti dep.");
-
- if (D->isConfused()) {
- LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
+static bool preservesForwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+ // UnrollLevel might carry the dependency Src --> Dst
+  // Does a different loop carry it after unrolling?
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::LT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::GT)
+ return false;
+ }
+
+ return true;
+}
+
+static bool preservesBackwardDependence(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, Dependence *D) {
+ // UnrollLevel might carry the dependency Dst --> Src
+ for (unsigned CurLoopDepth = UnrollLevel + 1; CurLoopDepth <= JamLevel;
+ ++CurLoopDepth) {
+ auto JammedDir = D->getDirection(CurLoopDepth);
+ if (JammedDir == Dependence::DVEntry::GT)
+ return true;
+
+ if (JammedDir & Dependence::DVEntry::LT)
+ return false;
+ }
+
+ // Backward dependencies are only preserved if not interleaved.
+ return Sequentialized;
+}
+
+// Check whether unroll-and-jam is semantically safe for Src and Dst, considering
+// any potential dependency between them.
+//
+// @param UnrollLevel The level of the loop being unrolled
+// @param JamLevel The level of the loop being jammed; if Src and Dst are on
+// different levels, the outermost common loop counts as jammed level
+//
+// @return true if it is safe and false if there is a dependency violation.
+static bool checkDependency(Instruction *Src, Instruction *Dst,
+ unsigned UnrollLevel, unsigned JamLevel,
+ bool Sequentialized, DependenceInfo &DI) {
+ assert(UnrollLevel <= JamLevel &&
+ "Expecting JamLevel to be at least UnrollLevel");
+
+ if (Src == Dst)
+ return true;
+ // Ignore Input dependencies.
+ if (isa<LoadInst>(Src) && isa<LoadInst>(Dst))
+ return true;
+
+ // Check whether unroll-and-jam may violate a dependency.
+ // By construction, every dependency will be lexicographically non-negative
+// (a negative one would violate the current execution order), such as
+ // (0,0,>,*,*)
+ // Unroll-and-jam changes the GT execution of two executions to the same
+ // iteration of the chosen unroll level. That is, a GT dependence becomes a GE
+ // dependence (or EQ, if we fully unrolled the loop) at the loop's position:
+ // (0,0,>=,*,*)
+ // Now, the dependency is not necessarily non-negative anymore, i.e.
+ // unroll-and-jam may violate correctness.
+ std::unique_ptr<Dependence> D = DI.depends(Src, Dst, true);
+ if (!D)
+ return true;
+ assert(D->isOrdered() && "Expected an output, flow or anti dep.");
+
+ if (D->isConfused()) {
+ LLVM_DEBUG(dbgs() << " Confused dependency between:\n"
+ << " " << *Src << "\n"
+ << " " << *Dst << "\n");
+ return false;
+ }
+
+ // If outer levels (levels enclosing the loop being unroll-and-jammed) have a
+ // non-equal direction, then the locations accessed in the inner levels cannot
+// overlap in memory. We assume the indexes never overlap into neighboring
+ // dimensions.
+ for (unsigned CurLoopDepth = 1; CurLoopDepth < UnrollLevel; ++CurLoopDepth)
+ if (!(D->getDirection(CurLoopDepth) & Dependence::DVEntry::EQ))
+ return true;
+
+ auto UnrollDirection = D->getDirection(UnrollLevel);
+
+ // If the distance carried by the unrolled loop is 0, then after unrolling
+ // that distance will become non-zero resulting in non-overlapping accesses in
+ // the inner loops.
+ if (UnrollDirection == Dependence::DVEntry::EQ)
+ return true;
+
+ if (UnrollDirection & Dependence::DVEntry::LT &&
+ !preservesForwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ if (UnrollDirection & Dependence::DVEntry::GT &&
+ !preservesBackwardDependence(Src, Dst, UnrollLevel, JamLevel,
+ Sequentialized, D.get()))
+ return false;
+
+ return true;
+}
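The direction-vector reasoning above is easiest to see on a concrete nest. A hypothetical two-level example, assuming the outer loop (level 1) is the one being unroll-and-jammed and the inner loop (level 2) is the jam level:

// The flow dependence from the store to the load has distance (1, -1),
// i.e. direction vector (<, >): LT at the unroll level, GT at the jam level.
void nest(float A[][128], int N, int M) {
  for (int i = 1; i < N; ++i)        // level 1: to be unrolled
    for (int j = 0; j < M - 1; ++j)  // level 2: to be jammed
      A[i][j] = A[i - 1][j + 1];
}

Unroll-and-jamming the outer loop by 2 runs iterations i and i+1 together over j. The load in iteration i+1 at inner position j needs the value stored by iteration i at inner position j+1, but after jamming that store executes one inner iteration later than the load. This is exactly the LT-at-unroll-level, GT-at-jam-level combination that preservesForwardDependence() rejects, so checkDependency() returns false for this nest.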
+
+static bool
+checkDependencies(Loop &Root, const BasicBlockSet &SubLoopBlocks,
+ const DenseMap<Loop *, BasicBlockSet> &ForeBlocksMap,
+ const DenseMap<Loop *, BasicBlockSet> &AftBlocksMap,
+ DependenceInfo &DI, LoopInfo &LI) {
+ SmallVector<BasicBlockSet, 8> AllBlocks;
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (ForeBlocksMap.find(L) != ForeBlocksMap.end())
+ AllBlocks.push_back(ForeBlocksMap.lookup(L));
+ AllBlocks.push_back(SubLoopBlocks);
+ for (Loop *L : Root.getLoopsInPreorder())
+ if (AftBlocksMap.find(L) != AftBlocksMap.end())
+ AllBlocks.push_back(AftBlocksMap.lookup(L));
+
+ unsigned LoopDepth = Root.getLoopDepth();
+ SmallVector<Instruction *, 4> EarlierLoadsAndStores;
+ SmallVector<Instruction *, 4> CurrentLoadsAndStores;
+ for (BasicBlockSet &Blocks : AllBlocks) {
+ CurrentLoadsAndStores.clear();
+ if (!getLoadsAndStores(Blocks, CurrentLoadsAndStores))
+ return false;
+
+ Loop *CurLoop = LI.getLoopFor((*Blocks.begin())->front().getParent());
+ unsigned CurLoopDepth = CurLoop->getLoopDepth();
+
+ for (auto *Earlier : EarlierLoadsAndStores) {
+ Loop *EarlierLoop = LI.getLoopFor(Earlier->getParent());
+ unsigned EarlierDepth = EarlierLoop->getLoopDepth();
+ unsigned CommonLoopDepth = std::min(EarlierDepth, CurLoopDepth);
+ for (auto *Later : CurrentLoadsAndStores) {
+ if (!checkDependency(Earlier, Later, LoopDepth, CommonLoopDepth, false,
+ DI))
return false;
- }
- if (!InnerLoop) {
- if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT) {
- LLVM_DEBUG(dbgs() << " > dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
- return false;
- }
- } else {
- assert(LoopDepth + 1 <= D->getLevels());
- if (D->getDirection(LoopDepth) & Dependence::DVEntry::GT &&
- D->getDirection(LoopDepth + 1) & Dependence::DVEntry::LT) {
- LLVM_DEBUG(dbgs() << " < > dependency between:\n"
- << " " << *Src << "\n"
- << " " << *Dst << "\n");
- return false;
- }
- }
}
}
+
+ size_t NumInsts = CurrentLoadsAndStores.size();
+ for (size_t I = 0; I < NumInsts; ++I) {
+ for (size_t J = I; J < NumInsts; ++J) {
+ if (!checkDependency(CurrentLoadsAndStores[I], CurrentLoadsAndStores[J],
+ LoopDepth, CurLoopDepth, true, DI))
+ return false;
+ }
+ }
+
+ EarlierLoadsAndStores.append(CurrentLoadsAndStores.begin(),
+ CurrentLoadsAndStores.end());
}
return true;
}
-static bool checkDependencies(Loop *L, BasicBlockSet &ForeBlocks,
- BasicBlockSet &SubLoopBlocks,
- BasicBlockSet &AftBlocks, DependenceInfo &DI) {
- // Get all loads/store pairs for each blocks
- SmallVector<Value *, 4> ForeMemInstr;
- SmallVector<Value *, 4> SubLoopMemInstr;
- SmallVector<Value *, 4> AftMemInstr;
- if (!getLoadsAndStores(ForeBlocks, ForeMemInstr) ||
- !getLoadsAndStores(SubLoopBlocks, SubLoopMemInstr) ||
- !getLoadsAndStores(AftBlocks, AftMemInstr))
+static bool isEligibleLoopForm(const Loop &Root) {
+ // Root must have a child.
+ if (Root.getSubLoops().size() != 1)
return false;
- // Check for dependencies between any blocks that may change order
- unsigned LoopDepth = L->getLoopDepth();
- return checkDependencies(ForeMemInstr, SubLoopMemInstr, LoopDepth, false,
- DI) &&
- checkDependencies(ForeMemInstr, AftMemInstr, LoopDepth, false, DI) &&
- checkDependencies(SubLoopMemInstr, AftMemInstr, LoopDepth, false,
- DI) &&
- checkDependencies(SubLoopMemInstr, SubLoopMemInstr, LoopDepth, true,
- DI);
+ const Loop *L = &Root;
+ do {
+ // All loops in Root need to be in simplify and rotated form.
+ if (!L->isLoopSimplifyForm())
+ return false;
+
+ if (!L->isRotatedForm())
+ return false;
+
+ if (L->getHeader()->hasAddressTaken()) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
+ return false;
+ }
+
+ unsigned SubLoopsSize = L->getSubLoops().size();
+ if (SubLoopsSize == 0)
+ return true;
+
+ // Only one child is allowed.
+ if (SubLoopsSize != 1)
+ return false;
+
+ L = L->getSubLoops()[0];
+ } while (L);
+
+ return true;
+}
+
+static Loop *getInnerMostLoop(Loop *L) {
+ while (!L->getSubLoops().empty())
+ L = L->getSubLoops()[0];
+ return L;
}
bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
- DependenceInfo &DI) {
+ DependenceInfo &DI, LoopInfo &LI) {
+ if (!isEligibleLoopForm(*L)) {
+ LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Ineligible loop form\n");
+ return false;
+ }
+
/* We currently handle outer loops like this:
|
- ForeFirst <----\ }
- Blocks | } ForeBlocks
- ForeLast | }
- | |
- SubLoopFirst <\ | }
- Blocks | | } SubLoopBlocks
- SubLoopLast -/ | }
- | |
- AftFirst | }
- Blocks | } AftBlocks
- AftLast ------/ }
+ ForeFirst <------\ }
+ Blocks | } ForeBlocks of L
+ ForeLast | }
+ | |
+ ... |
+ | |
+ ForeFirst <----\ | }
+ Blocks | | } ForeBlocks of a inner loop of L
+ ForeLast | | }
+ | | |
+ JamLoopFirst <\ | | }
+ Blocks | | | } JamLoopBlocks of the innermost loop
+ JamLoopLast -/ | | }
+ | | |
+ AftFirst | | }
+ Blocks | | } AftBlocks of a inner loop of L
+ AftLast ------/ | }
+ | |
+ ... |
+ | |
+ AftFirst | }
+ Blocks | } AftBlocks of L
+ AftLast --------/ }
|
There are (theoretically) any number of blocks in ForeBlocks, SubLoopBlocks
@@ -709,14 +893,16 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
      things further in the profitability checks of the unroll and jam pass.
Because of the way we rearrange basic blocks, we also require that
- the Fore blocks on all unrolled iterations are safe to move before the
- SubLoop blocks of all iterations. So we require that the phi node looping
- operands of ForeHeader can be moved to at least the end of ForeEnd, so that
- we can arrange cloned Fore Blocks before the subloop and match up Phi's
- correctly.
+ the Fore blocks of L on all unrolled iterations are safe to move before the
+ blocks of the direct child of L of all iterations. So we require that the
+ phi node looping operands of ForeHeader can be moved to at least the end of
+ ForeEnd, so that we can arrange cloned Fore Blocks before the subloop and
+ match up Phi's correctly.
- i.e. The old order of blocks used to be F1 S1_1 S1_2 A1 F2 S2_1 S2_2 A2.
- It needs to be safe to tranform this to F1 F2 S1_1 S2_1 S1_2 S2_2 A1 A2.
+ i.e. The old order of blocks used to be
+ (F1)1 (F2)1 J1_1 J1_2 (A2)1 (A1)1 (F1)2 (F2)2 J2_1 J2_2 (A2)2 (A1)2.
+ It needs to be safe to transform this to
+ (F1)1 (F1)2 (F2)1 (F2)2 J1_1 J1_2 J2_1 J2_2 (A2)1 (A2)2 (A1)1 (A1)2.
There are then a number of checks along the lines of no calls, no
exceptions, inner loop IV is consistent, etc. Note that for loops requiring
@@ -724,35 +910,13 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
UnrollAndJamLoop if the trip count cannot be easily calculated.
*/
- if (!L->isLoopSimplifyForm() || L->getSubLoops().size() != 1)
- return false;
- Loop *SubLoop = L->getSubLoops()[0];
- if (!SubLoop->isLoopSimplifyForm())
- return false;
-
- BasicBlock *Header = L->getHeader();
- BasicBlock *Latch = L->getLoopLatch();
- BasicBlock *Exit = L->getExitingBlock();
- BasicBlock *SubLoopHeader = SubLoop->getHeader();
- BasicBlock *SubLoopLatch = SubLoop->getLoopLatch();
- BasicBlock *SubLoopExit = SubLoop->getExitingBlock();
-
- if (Latch != Exit)
- return false;
- if (SubLoopLatch != SubLoopExit)
- return false;
-
- if (Header->hasAddressTaken() || SubLoopHeader->hasAddressTaken()) {
- LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Address taken\n");
- return false;
- }
-
// Split blocks into Fore/SubLoop/Aft based on dominators
+ Loop *JamLoop = getInnerMostLoop(L);
BasicBlockSet SubLoopBlocks;
- BasicBlockSet ForeBlocks;
- BasicBlockSet AftBlocks;
- if (!partitionOuterLoopBlocks(L, SubLoop, ForeBlocks, SubLoopBlocks,
- AftBlocks, &DT)) {
+ DenseMap<Loop *, BasicBlockSet> ForeBlocksMap;
+ DenseMap<Loop *, BasicBlockSet> AftBlocksMap;
+ if (!partitionOuterLoopBlocks(*L, *JamLoop, SubLoopBlocks, ForeBlocksMap,
+ AftBlocksMap, DT)) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Incompatible loop layout\n");
return false;
}
@@ -760,7 +924,7 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// Aft blocks may need to move instructions to fore blocks, which becomes more
// difficult if there are multiple (potentially conditionally executed)
// blocks. For now we just exclude loops with multiple aft blocks.
- if (AftBlocks.size() != 1) {
+ if (AftBlocksMap[L].size() != 1) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Can't currently handle "
"multiple blocks after the loop\n");
return false;
@@ -768,7 +932,9 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// Check inner loop backedge count is consistent on all iterations of the
// outer loop
- if (!hasIterationCountInvariantInParent(SubLoop, SE)) {
+ if (any_of(L->getLoopsInPreorder(), [&SE](Loop *SubLoop) {
+ return !hasIterationCountInvariantInParent(SubLoop, SE);
+ })) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; Inner loop iteration count is "
"not consistent on each iteration\n");
return false;
@@ -789,6 +955,10 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// ForeBlock phi operands before the subloop
// Make sure we can move all instructions we need to before the subloop
+ BasicBlock *Header = L->getHeader();
+ BasicBlock *Latch = L->getLoopLatch();
+ BasicBlockSet AftBlocks = AftBlocksMap[L];
+ Loop *SubLoop = L->getSubLoops()[0];
if (!processHeaderPhiOperands(
Header, Latch, AftBlocks, [&AftBlocks, &SubLoop](Instruction *I) {
if (SubLoop->contains(I->getParent()))
@@ -814,7 +984,8 @@ bool llvm::isSafeToUnrollAndJam(Loop *L, ScalarEvolution &SE, DominatorTree &DT,
// Check for memory dependencies which prohibit the unrolling we are doing.
// Because of the way we are unrolling Fore/Sub/Aft blocks, we need to check
// there are no dependencies between Fore-Sub, Fore-Aft, Sub-Aft and Sub-Sub.
- if (!checkDependencies(L, ForeBlocks, SubLoopBlocks, AftBlocks, DI)) {
+ if (!checkDependencies(*L, SubLoopBlocks, ForeBlocksMap, AftBlocksMap, DI,
+ LI)) {
LLVM_DEBUG(dbgs() << "Won't unroll-and-jam; failed dependency check\n");
return false;
}
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp b/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
index 7a168ff6f32b..c653aacbee6c 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollPeel.cpp
@@ -262,10 +262,9 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
// iteration. See if that makes !Pred become unknown again.
if (ICmpInst::isEquality(Pred) &&
!SE.isKnownPredicate(ICmpInst::getInversePredicate(Pred), NextIterVal,
- RightSCEV)) {
- assert(!SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
- SE.isKnownPredicate(Pred, NextIterVal, RightSCEV) &&
- "Expected Pred to go from known to unknown.");
+ RightSCEV) &&
+ !SE.isKnownPredicate(Pred, IterVal, RightSCEV) &&
+ SE.isKnownPredicate(Pred, NextIterVal, RightSCEV)) {
if (!CanPeelOneMoreIteration())
continue; // Need to peel one more iteration, but can't. Give up.
PeelOneMoreIteration(); // Great!
@@ -280,17 +279,20 @@ static unsigned countToEliminateCompares(Loop &L, unsigned MaxPeelCount,
// Return the number of iterations we want to peel off.
void llvm::computePeelCount(Loop *L, unsigned LoopSize,
TargetTransformInfo::UnrollingPreferences &UP,
+ TargetTransformInfo::PeelingPreferences &PP,
unsigned &TripCount, ScalarEvolution &SE) {
assert(LoopSize > 0 && "Zero loop size is not allowed!");
- // Save the UP.PeelCount value set by the target in
- // TTI.getUnrollingPreferences or by the flag -unroll-peel-count.
- unsigned TargetPeelCount = UP.PeelCount;
- UP.PeelCount = 0;
+ // Save the PP.PeelCount value set by the target in
+ // TTI.getPeelingPreferences or by the flag -unroll-peel-count.
+ unsigned TargetPeelCount = PP.PeelCount;
+ PP.PeelCount = 0;
if (!canPeel(L))
return;
- // Only try to peel innermost loops.
- if (!L->empty())
+ // Only try to peel innermost loops by default.
+ // The constraint can be relaxed by the target in TTI.getUnrollingPreferences
+ // or by the flag -unroll-allow-loop-nests-peeling.
+ if (!PP.AllowLoopNestsPeeling && !L->empty())
return;
// If the user provided a peel count, use that.
@@ -298,13 +300,13 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
if (UserPeelCount) {
LLVM_DEBUG(dbgs() << "Force-peeling first " << UnrollForcePeelCount
<< " iterations.\n");
- UP.PeelCount = UnrollForcePeelCount;
- UP.PeelProfiledIterations = true;
+ PP.PeelCount = UnrollForcePeelCount;
+ PP.PeelProfiledIterations = true;
return;
}
// Skip peeling if it's disabled.
- if (!UP.AllowPeeling)
+ if (!PP.AllowPeeling)
return;
unsigned AlreadyPeeled = 0;
@@ -353,8 +355,8 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
LLVM_DEBUG(dbgs() << "Peel " << DesiredPeelCount
<< " iteration(s) to turn"
<< " some Phis into invariants.\n");
- UP.PeelCount = DesiredPeelCount;
- UP.PeelProfiledIterations = false;
+ PP.PeelCount = DesiredPeelCount;
+ PP.PeelProfiledIterations = false;
return;
}
}
@@ -366,7 +368,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
return;
// Do not apply profile base peeling if it is disabled.
- if (!UP.PeelProfiledIterations)
+ if (!PP.PeelProfiledIterations)
return;
// If we don't know the trip count, but have reason to believe the average
// trip count is low, peeling should be beneficial, since we will usually
@@ -386,7 +388,7 @@ void llvm::computePeelCount(Loop *L, unsigned LoopSize,
(LoopSize * (*PeelCount + 1) <= UP.Threshold)) {
LLVM_DEBUG(dbgs() << "Peeling first " << *PeelCount
<< " iterations.\n");
- UP.PeelCount = *PeelCount;
+ PP.PeelCount = *PeelCount;
return;
}
LLVM_DEBUG(dbgs() << "Requested peel count: " << *PeelCount << "\n");
@@ -508,7 +510,10 @@ static void cloneLoopBlocks(
BasicBlock *NewBB = CloneBasicBlock(*BB, VMap, ".peel", F);
NewBlocks.push_back(NewBB);
- if (ParentLoop)
+ // If an original block is an immediate child of the loop L, its copy
+ // is a child of a ParentLoop after peeling. If a block is a child of
+ // a nested loop, it is handled in the cloneLoop() call below.
+ if (ParentLoop && LI->getLoopFor(*BB) == L)
ParentLoop->addBasicBlockToLoop(NewBB, *LI);
VMap[*BB] = NewBB;
@@ -525,6 +530,12 @@ static void cloneLoopBlocks(
}
}
+ // Recursively create the new Loop objects for nested loops, if any,
+ // to preserve LoopInfo.
+ for (Loop *ChildLoop : *L) {
+ cloneLoop(ChildLoop, ParentLoop, VMap, LI, nullptr);
+ }
+
// Hook-up the control flow for the newly inserted blocks.
// The new header is hooked up directly to the "top", which is either
// the original loop preheader (for the first iteration) or the previous
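
The computePeelCount() hunks above move the peeling knobs from UnrollingPreferences (UP) into the new PeelingPreferences (PP) struct while keeping the same decision order: a user-forced count wins, then the AllowPeeling gate applies, then a count that turns loop Phis into invariants, and finally a profile-driven count, each bounded by the size threshold. A minimal standalone sketch of that precedence, with hypothetical names rather than the real LLVM types:

#include <optional>

// Hypothetical stand-in for TargetTransformInfo::PeelingPreferences.
struct PeelingPrefs {
  unsigned PeelCount = 0;
  bool AllowPeeling = false;
  bool AllowLoopNestsPeeling = false;
  bool PeelProfiledIterations = true;
};

// Illustrative model of the decision order in computePeelCount(); the real
// function also consults ScalarEvolution, loop metadata and already-peeled
// counts, which are omitted here.
unsigned pickPeelCount(PeelingPrefs &PP, unsigned ForcedCount, bool IsInnermost,
                       unsigned LoopSize, unsigned SizeThreshold,
                       std::optional<unsigned> PhiInvariantCount,
                       std::optional<unsigned> ProfileTripCount) {
  PP.PeelCount = 0;
  if (!IsInnermost && !PP.AllowLoopNestsPeeling)
    return 0;                                 // only peel loop nests on request
  if (ForcedCount) {                          // -unroll-peel-count / metadata
    PP.PeelProfiledIterations = true;
    return PP.PeelCount = ForcedCount;
  }
  if (!PP.AllowPeeling)
    return 0;
  if (PhiInvariantCount &&                    // peel to make Phis invariant
      LoopSize * (*PhiInvariantCount + 1) <= SizeThreshold) {
    PP.PeelProfiledIterations = false;
    return PP.PeelCount = *PhiInvariantCount;
  }
  if (PP.PeelProfiledIterations && ProfileTripCount &&
      LoopSize * (*ProfileTripCount + 1) <= SizeThreshold)
    return PP.PeelCount = *ProfileTripCount;  // low estimated trip count
  return 0;
}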
diff --git a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
index ddb7479924bd..2515b1676cb9 100644
--- a/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUnrollRuntime.cpp
@@ -25,7 +25,6 @@
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/LoopIterator.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Metadata.h"
@@ -37,6 +36,7 @@
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/UnrollLoop.h"
#include <algorithm>
@@ -543,13 +543,11 @@ static bool canProfitablyUnrollMultiExitLoop(
/// if (extraiters != 0) jump Epil: // Omitted if unroll factor is 2.
/// EpilExit:
-bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
- bool AllowExpensiveTripCount,
- bool UseEpilogRemainder,
- bool UnrollRemainder, bool ForgetAllSCEV,
- LoopInfo *LI, ScalarEvolution *SE,
- DominatorTree *DT, AssumptionCache *AC,
- bool PreserveLCSSA, Loop **ResultLoop) {
+bool llvm::UnrollRuntimeLoopRemainder(
+ Loop *L, unsigned Count, bool AllowExpensiveTripCount,
+ bool UseEpilogRemainder, bool UnrollRemainder, bool ForgetAllSCEV,
+ LoopInfo *LI, ScalarEvolution *SE, DominatorTree *DT, AssumptionCache *AC,
+ const TargetTransformInfo *TTI, bool PreserveLCSSA, Loop **ResultLoop) {
LLVM_DEBUG(dbgs() << "Trying runtime unrolling on Loop: \n");
LLVM_DEBUG(L->dump());
LLVM_DEBUG(UseEpilogRemainder ? dbgs() << "Using epilog remainder.\n"
@@ -637,7 +635,8 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
const DataLayout &DL = Header->getModule()->getDataLayout();
SCEVExpander Expander(*SE, DL, "loop-unroll");
if (!AllowExpensiveTripCount &&
- Expander.isHighCostExpansion(TripCountSC, L, PreHeaderBR)) {
+ Expander.isHighCostExpansion(TripCountSC, L, SCEVCheapExpansionBudget,
+ TTI, PreHeaderBR)) {
LLVM_DEBUG(dbgs() << "High cost for expanding trip count scev!\n");
return false;
}
@@ -849,7 +848,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
// dominator of the exit blocks.
for (auto *BB : L->blocks()) {
auto *DomNodeBB = DT->getNode(BB);
- for (auto *DomChild : DomNodeBB->getChildren()) {
+ for (auto *DomChild : DomNodeBB->children()) {
auto *DomChildBB = DomChild->getBlock();
if (!L->contains(LI->getLoopFor(DomChildBB)))
ChildrenToUpdate.push_back(DomChildBB);
@@ -949,7 +948,7 @@ bool llvm::UnrollRuntimeLoopRemainder(Loop *L, unsigned Count,
/*AllowExpensiveTripCount*/ false, /*PreserveCondBr*/ true,
/*PreserveOnlyFirst*/ false, /*TripMultiple*/ 1,
/*PeelCount*/ 0, /*UnrollRemainder*/ false, ForgetAllSCEV},
- LI, SE, DT, AC, /*ORE*/ nullptr, PreserveLCSSA);
+ LI, SE, DT, AC, TTI, /*ORE*/ nullptr, PreserveLCSSA);
}
if (ResultLoop && UnrollResult != LoopUnrollResult::FullyUnrolled)
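
UnrollRuntimeLoopRemainder() still follows the schematic in the comment above: the main loop executes trip-count / Count iterations of the unrolled body and an epilogue (or prologue) loop picks up the trip-count % Count leftover iterations, with the extraiters guard omitted when the factor is 2. Roughly, for an unroll factor of 4 the result corresponds to this C++ sketch (an illustration of the shape, not the emitted IR):

void saxpyUnrolledBy4(float *y, const float *x, float a, unsigned n) {
  unsigned i = 0;
  unsigned mainIters = n & ~3u;     // n - (n % 4): handled by the unrolled body
  for (; i < mainIters; i += 4) {   // unrolled main loop
    y[i + 0] += a * x[i + 0];
    y[i + 1] += a * x[i + 1];
    y[i + 2] += a * x[i + 2];
    y[i + 3] += a * x[i + 3];
  }
  if (i != n)                       // "extraiters != 0": jump to the epilogue
    for (; i < n; ++i)              // epilogue remainder loop, 1..3 iterations
      y[i] += a * x[i];
}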
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index c4c40189fda4..43363736684e 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -11,12 +11,19 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/ADT/DenseSet.h"
+#include "llvm/ADT/Optional.h"
+#include "llvm/ADT/PriorityWorklist.h"
#include "llvm/ADT/ScopeExit.h"
+#include "llvm/ADT/SetVector.h"
+#include "llvm/ADT/SmallPtrSet.h"
+#include "llvm/ADT/SmallVector.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/BasicAliasAnalysis.h"
#include "llvm/Analysis/DomTreeUpdater.h"
#include "llvm/Analysis/GlobalsModRef.h"
#include "llvm/Analysis/InstructionSimplify.h"
+#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/LoopPass.h"
#include "llvm/Analysis/MemorySSA.h"
@@ -31,7 +38,9 @@
#include "llvm/IR/Dominators.h"
#include "llvm/IR/Instructions.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/MDBuilder.h"
#include "llvm/IR/Module.h"
+#include "llvm/IR/Operator.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/IR/ValueHandle.h"
#include "llvm/InitializePasses.h"
@@ -39,10 +48,17 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/KnownBits.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
using namespace llvm::PatternMatch;
+static cl::opt<bool> ForceReductionIntrinsic(
+ "force-reduction-intrinsics", cl::Hidden,
+ cl::desc("Force creating reduction intrinsics for testing."),
+ cl::init(false));
+
#define DEBUG_TYPE "loop-utils"
static const char *LLVMLoopDisableNonforced = "llvm.loop.disable_nonforced";
@@ -496,20 +512,24 @@ llvm::collectChildrenInLoop(DomTreeNode *N, const Loop *CurLoop) {
AddRegionToWorklist(N);
- for (size_t I = 0; I < Worklist.size(); I++)
- for (DomTreeNode *Child : Worklist[I]->getChildren())
+ for (size_t I = 0; I < Worklist.size(); I++) {
+ for (DomTreeNode *Child : Worklist[I]->children())
AddRegionToWorklist(Child);
+ }
return Worklist;
}
-void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
- ScalarEvolution *SE = nullptr,
- LoopInfo *LI = nullptr) {
+void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT, ScalarEvolution *SE,
+ LoopInfo *LI, MemorySSA *MSSA) {
assert((!DT || L->isLCSSAForm(*DT)) && "Expected LCSSA!");
auto *Preheader = L->getLoopPreheader();
assert(Preheader && "Preheader should exist!");
+ std::unique_ptr<MemorySSAUpdater> MSSAU;
+ if (MSSA)
+ MSSAU = std::make_unique<MemorySSAUpdater>(MSSA);
+
// Now that we know the removal is safe, remove the loop by changing the
// branch from the preheader to go to the single exit block.
//
@@ -582,18 +602,33 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
"Should have exactly one value and that's from the preheader!");
}
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ if (DT) {
+ DTU.applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}});
+ if (MSSA) {
+ MSSAU->applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock}}, *DT);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
+ }
+
// Disconnect the loop body by branching directly to its exit.
Builder.SetInsertPoint(Preheader->getTerminator());
Builder.CreateBr(ExitBlock);
// Remove the old branch.
Preheader->getTerminator()->eraseFromParent();
- DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
if (DT) {
- // Update the dominator tree by informing it about the new edge from the
- // preheader to the exit and the removed edge.
- DTU.applyUpdates({{DominatorTree::Insert, Preheader, ExitBlock},
- {DominatorTree::Delete, Preheader, L->getHeader()}});
+ DTU.applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}});
+ if (MSSA) {
+ MSSAU->applyUpdates({{DominatorTree::Delete, Preheader, L->getHeader()}},
+ *DT);
+ SmallSetVector<BasicBlock *, 8> DeadBlockSet(L->block_begin(),
+ L->block_end());
+ MSSAU->removeBlocks(DeadBlockSet);
+ if (VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+ }
}
// Use a map to unique and a vector to guarantee deterministic ordering.
@@ -654,6 +689,9 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
for (auto *Block : L->blocks())
Block->dropAllReferences();
+ if (MSSA && VerifyMemorySSA)
+ MSSA->verifyMemorySSA();
+
if (LI) {
// Erase the instructions and the blocks without having to worry
// about ordering because we already dropped the references.
@@ -676,11 +714,11 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
// its parent. While removeLoop/removeChildLoop remove the given loop but
// not relink its subloops, which is what we want.
if (Loop *ParentLoop = L->getParentLoop()) {
- Loop::iterator I = find(ParentLoop->begin(), ParentLoop->end(), L);
+ Loop::iterator I = find(*ParentLoop, L);
assert(I != ParentLoop->end() && "Couldn't find loop");
ParentLoop->removeChildLoop(I);
} else {
- Loop::iterator I = find(LI->begin(), LI->end(), L);
+ Loop::iterator I = find(*LI, L);
assert(I != LI->end() && "Couldn't find loop");
LI->removeLoop(I);
}
@@ -688,17 +726,17 @@ void llvm::deleteDeadLoop(Loop *L, DominatorTree *DT = nullptr,
}
}
-Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
- // Support loops with an exiting latch and other existing exists only
- // deoptimize.
-
- // Get the branch weights for the loop's backedge.
+/// Checks if \p L has a single exit through the latch block, except possibly
+/// for "deoptimizing" exits. Returns the branch instruction terminating the
+/// loop latch if the check succeeds, nullptr otherwise.
+static BranchInst *getExpectedExitLoopLatchBranch(Loop *L) {
BasicBlock *Latch = L->getLoopLatch();
if (!Latch)
- return None;
+ return nullptr;
+
BranchInst *LatchBR = dyn_cast<BranchInst>(Latch->getTerminator());
if (!LatchBR || LatchBR->getNumSuccessors() != 2 || !L->isLoopExiting(Latch))
- return None;
+ return nullptr;
assert((LatchBR->getSuccessor(0) == L->getHeader() ||
LatchBR->getSuccessor(1) == L->getHeader()) &&
@@ -709,24 +747,73 @@ Optional<unsigned> llvm::getLoopEstimatedTripCount(Loop *L) {
if (any_of(ExitBlocks, [](const BasicBlock *EB) {
return !EB->getTerminatingDeoptimizeCall();
}))
+ return nullptr;
+
+ return LatchBR;
+}
+
+Optional<unsigned>
+llvm::getLoopEstimatedTripCount(Loop *L,
+ unsigned *EstimatedLoopInvocationWeight) {
+  // Support loops with an exiting latch and other existing exits only
+ // deoptimize.
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
return None;
// To estimate the number of times the loop body was executed, we want to
// know the number of times the backedge was taken, vs. the number of times
// we exited the loop.
uint64_t BackedgeTakenWeight, LatchExitWeight;
- if (!LatchBR->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
+ if (!LatchBranch->extractProfMetadata(BackedgeTakenWeight, LatchExitWeight))
return None;
- if (LatchBR->getSuccessor(0) != L->getHeader())
+ if (LatchBranch->getSuccessor(0) != L->getHeader())
+ std::swap(BackedgeTakenWeight, LatchExitWeight);
+
+ if (!LatchExitWeight)
+ return None;
+
+ if (EstimatedLoopInvocationWeight)
+ *EstimatedLoopInvocationWeight = LatchExitWeight;
+
+ // Estimated backedge taken count is a ratio of the backedge taken weight by
+ // the weight of the edge exiting the loop, rounded to nearest.
+ uint64_t BackedgeTakenCount =
+ llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
+ // Estimated trip count is one plus estimated backedge taken count.
+ return BackedgeTakenCount + 1;
+}
+
+bool llvm::setLoopEstimatedTripCount(Loop *L, unsigned EstimatedTripCount,
+ unsigned EstimatedloopInvocationWeight) {
+  // Support loops with an exiting latch and other existing exits only
+ // deoptimize.
+ BranchInst *LatchBranch = getExpectedExitLoopLatchBranch(L);
+ if (!LatchBranch)
+ return false;
+
+ // Calculate taken and exit weights.
+ unsigned LatchExitWeight = 0;
+ unsigned BackedgeTakenWeight = 0;
+
+ if (EstimatedTripCount > 0) {
+ LatchExitWeight = EstimatedloopInvocationWeight;
+ BackedgeTakenWeight = (EstimatedTripCount - 1) * LatchExitWeight;
+ }
+
+  // Swap the weights if the backedge is taken on the "false" condition.
+ if (LatchBranch->getSuccessor(0) != L->getHeader())
std::swap(BackedgeTakenWeight, LatchExitWeight);
- if (!BackedgeTakenWeight || !LatchExitWeight)
- return 0;
+ MDBuilder MDB(LatchBranch->getContext());
- // Divide the count of the backedge by the count of the edge exiting the loop,
- // rounding to nearest.
- return llvm::divideNearest(BackedgeTakenWeight, LatchExitWeight);
+ // Set/Update profile metadata.
+ LatchBranch->setMetadata(
+ LLVMContext::MD_prof,
+ MDB.createBranchWeights(BackedgeTakenWeight, LatchExitWeight));
+
+ return true;
}
bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
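
The new getLoopEstimatedTripCount()/setLoopEstimatedTripCount() pair is plain arithmetic over the latch branch's !prof weights: the estimated backedge-taken count is the backedge weight divided by the exit weight (rounded to nearest), the trip count is that plus one, and the setter inverts the mapping by storing exit weight = invocation weight and backedge weight = (trip count - 1) * exit weight. A self-contained sketch of the round trip, using plain integers in place of the metadata:

#include <cstdint>
#include <optional>
#include <utility>

// Round-to-nearest integer division, as llvm::divideNearest in the patch.
static uint64_t divideNearest(uint64_t A, uint64_t B) { return (A + B / 2) / B; }

// Model of getLoopEstimatedTripCount(): the weights are the latch branch's
// branch_weights operands, ordered (backedge-taken, loop-exit).
std::optional<unsigned>
estimateTripCount(uint64_t BackedgeTakenWeight, uint64_t LatchExitWeight,
                  unsigned *EstimatedLoopInvocationWeight = nullptr) {
  if (!LatchExitWeight)
    return std::nullopt;               // cannot divide by a zero exit weight
  if (EstimatedLoopInvocationWeight)
    *EstimatedLoopInvocationWeight = unsigned(LatchExitWeight);
  return unsigned(divideNearest(BackedgeTakenWeight, LatchExitWeight) + 1);
}

// Model of setLoopEstimatedTripCount(): rebuild the weight pair from a trip
// count and the saved invocation (exit) weight.
std::pair<uint64_t, uint64_t>
tripCountToWeights(unsigned TripCount, unsigned InvocationWeight) {
  uint64_t Exit = 0, Taken = 0;
  if (TripCount > 0) {
    Exit = InvocationWeight;
    Taken = uint64_t(TripCount - 1) * Exit;  // one exit per TripCount-1 backedges
  }
  return {Taken, Exit};                      // written back as branch_weights
}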
@@ -751,7 +838,7 @@ bool llvm::hasIterationCountInvariantInParent(Loop *InnerLoop,
return true;
}
-Value *llvm::createMinMaxOp(IRBuilder<> &Builder,
+Value *llvm::createMinMaxOp(IRBuilderBase &Builder,
RecurrenceDescriptor::MinMaxRecurrenceKind RK,
Value *Left, Value *Right) {
CmpInst::Predicate P = CmpInst::ICMP_NE;
@@ -780,29 +867,22 @@ Value *llvm::createMinMaxOp(IRBuilder<> &Builder,
// We only match FP sequences that are 'fast', so we can unconditionally
// set it on any generated instructions.
- IRBuilder<>::FastMathFlagGuard FMFG(Builder);
+ IRBuilderBase::FastMathFlagGuard FMFG(Builder);
FastMathFlags FMF;
FMF.setFast();
Builder.setFastMathFlags(FMF);
-
- Value *Cmp;
- if (RK == RecurrenceDescriptor::MRK_FloatMin ||
- RK == RecurrenceDescriptor::MRK_FloatMax)
- Cmp = Builder.CreateFCmp(P, Left, Right, "rdx.minmax.cmp");
- else
- Cmp = Builder.CreateICmp(P, Left, Right, "rdx.minmax.cmp");
-
+ Value *Cmp = Builder.CreateCmp(P, Left, Right, "rdx.minmax.cmp");
Value *Select = Builder.CreateSelect(Cmp, Left, Right, "rdx.minmax.select");
return Select;
}
// Helper to generate an ordered reduction.
Value *
-llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
+llvm::getOrderedReduction(IRBuilderBase &Builder, Value *Acc, Value *Src,
unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
ArrayRef<Value *> RedOps) {
- unsigned VF = Src->getType()->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
// Extract and apply reduction ops in ascending order:
// e.g. ((((Acc + Scl[0]) + Scl[1]) + Scl[2]) + ) ... + Scl[VF-1]
@@ -829,29 +909,27 @@ llvm::getOrderedReduction(IRBuilder<> &Builder, Value *Acc, Value *Src,
// Helper to generate a log2 shuffle reduction.
Value *
-llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
+llvm::getShuffleReduction(IRBuilderBase &Builder, Value *Src, unsigned Op,
RecurrenceDescriptor::MinMaxRecurrenceKind MinMaxKind,
ArrayRef<Value *> RedOps) {
- unsigned VF = Src->getType()->getVectorNumElements();
+ unsigned VF = cast<FixedVectorType>(Src->getType())->getNumElements();
// VF is a power of 2 so we can emit the reduction using log2(VF) shuffles
// and vector ops, reducing the set of values being computed by half each
// round.
assert(isPowerOf2_32(VF) &&
"Reduction emission only supported for pow2 vectors!");
Value *TmpVec = Src;
- SmallVector<Constant *, 32> ShuffleMask(VF, nullptr);
+ SmallVector<int, 32> ShuffleMask(VF);
for (unsigned i = VF; i != 1; i >>= 1) {
// Move the upper half of the vector to the lower half.
for (unsigned j = 0; j != i / 2; ++j)
- ShuffleMask[j] = Builder.getInt32(i / 2 + j);
+ ShuffleMask[j] = i / 2 + j;
// Fill the rest of the mask with undef.
- std::fill(&ShuffleMask[i / 2], ShuffleMask.end(),
- UndefValue::get(Builder.getInt32Ty()));
+ std::fill(&ShuffleMask[i / 2], ShuffleMask.end(), -1);
Value *Shuf = Builder.CreateShuffleVector(
- TmpVec, UndefValue::get(TmpVec->getType()),
- ConstantVector::get(ShuffleMask), "rdx.shuf");
+ TmpVec, UndefValue::get(TmpVec->getType()), ShuffleMask, "rdx.shuf");
if (Op != Instruction::ICmp && Op != Instruction::FCmp) {
// The builder propagates its fast-math-flags setting.
@@ -864,6 +942,11 @@ llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
}
if (!RedOps.empty())
propagateIRFlags(TmpVec, RedOps);
+
+ // We may compute the reassociated scalar ops in a way that does not
+ // preserve nsw/nuw etc. Conservatively, drop those flags.
+ if (auto *ReductionInst = dyn_cast<Instruction>(TmpVec))
+ ReductionInst->dropPoisonGeneratingFlags();
}
// The result is in the first element of the vector.
return Builder.CreateExtractElement(TmpVec, Builder.getInt32(0));
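
The shuffle-reduction hunk above only changes how the mask is expressed (plain ints with -1 for the undef lanes instead of a ConstantVector); the scheme itself is unchanged: log2(VF) halving steps, each shuffling the upper half of the vector down and combining it with the lower half, with the scalar result landing in lane 0. A scalar model of that pairing, assuming a power-of-two VF:

#include <cassert>
#include <cstddef>
#include <vector>

// Scalar model of getShuffleReduction(): at step i the mask is
// { i/2, i/2+1, ..., i-1, -1, -1, ... } -- move the upper half down, leave the
// remaining lanes undef -- and Op combines the two halves lane by lane.
template <typename T, typename BinOp>
T shuffleReduce(std::vector<T> Vec, BinOp Op) {
  std::size_t VF = Vec.size();
  assert(VF != 0 && (VF & (VF - 1)) == 0 && "power-of-two width expected");
  for (std::size_t i = VF; i != 1; i >>= 1)
    for (std::size_t j = 0; j != i / 2; ++j)
      Vec[j] = Op(Vec[j], Vec[i / 2 + j]);   // lane j combined with shuffled lane
  return Vec[0];                             // the result lives in the first lane
}

// shuffleReduce<int>({1, 2, 3, 4, 5, 6, 7, 8}, [](int a, int b) { return a + b; })
// yields 36, matching a left-to-right sum up to reassociation.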
@@ -872,10 +955,10 @@ llvm::getShuffleReduction(IRBuilder<> &Builder, Value *Src, unsigned Op,
/// Create a simple vector reduction specified by an opcode and some
/// flags (if generating min/max reductions).
Value *llvm::createSimpleTargetReduction(
- IRBuilder<> &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
+ IRBuilderBase &Builder, const TargetTransformInfo *TTI, unsigned Opcode,
Value *Src, TargetTransformInfo::ReductionFlags Flags,
ArrayRef<Value *> RedOps) {
- assert(isa<VectorType>(Src->getType()) && "Type must be a vector");
+ auto *SrcVTy = cast<VectorType>(Src->getType());
std::function<Value *()> BuildFunc;
using RD = RecurrenceDescriptor;
@@ -900,13 +983,13 @@ Value *llvm::createSimpleTargetReduction(
case Instruction::FAdd:
BuildFunc = [&]() {
auto Rdx = Builder.CreateFAddReduce(
- Constant::getNullValue(Src->getType()->getVectorElementType()), Src);
+ Constant::getNullValue(SrcVTy->getElementType()), Src);
return Rdx;
};
break;
case Instruction::FMul:
BuildFunc = [&]() {
- Type *Ty = Src->getType()->getVectorElementType();
+ Type *Ty = SrcVTy->getElementType();
auto Rdx = Builder.CreateFMulReduce(ConstantFP::get(Ty, 1.0), Src);
return Rdx;
};
@@ -937,13 +1020,14 @@ Value *llvm::createSimpleTargetReduction(
llvm_unreachable("Unhandled opcode");
break;
}
- if (TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
+ if (ForceReductionIntrinsic ||
+ TTI->useReductionIntrinsic(Opcode, Src->getType(), Flags))
return BuildFunc();
return getShuffleReduction(Builder, Src, Opcode, MinMaxKind, RedOps);
}
/// Create a vector reduction using a given recurrence descriptor.
-Value *llvm::createTargetReduction(IRBuilder<> &B,
+Value *llvm::createTargetReduction(IRBuilderBase &B,
const TargetTransformInfo *TTI,
RecurrenceDescriptor &Desc, Value *Src,
bool NoNaN) {
@@ -955,7 +1039,7 @@ Value *llvm::createTargetReduction(IRBuilder<> &B,
// All ops in the reduction inherit fast-math-flags from the recurrence
// descriptor.
- IRBuilder<>::FastMathFlagGuard FMFGuard(B);
+ IRBuilderBase::FastMathFlagGuard FMFGuard(B);
B.setFastMathFlags(Desc.getFastMathFlags());
switch (RecKind) {
@@ -1042,3 +1126,586 @@ bool llvm::cannotBeMaxInLoop(const SCEV *S, const Loop *L, ScalarEvolution &SE,
SE.isLoopEntryGuardedByCond(L, Predicate, S,
SE.getConstant(Max));
}
+
+//===----------------------------------------------------------------------===//
+// rewriteLoopExitValues - Optimize IV users outside the loop.
+// As a side effect, reduces the amount of IV processing within the loop.
+//===----------------------------------------------------------------------===//
+
+// Return true if the SCEV expansion generated by the rewriter can replace the
+// original value. SCEV guarantees that it produces the same value, but the way
+// it is produced may be illegal IR. Ideally, this function will only be
+// called for verification.
+static bool isValidRewrite(ScalarEvolution *SE, Value *FromVal, Value *ToVal) {
+ // If an SCEV expression subsumed multiple pointers, its expansion could
+ // reassociate the GEP changing the base pointer. This is illegal because the
+ // final address produced by a GEP chain must be inbounds relative to its
+ // underlying object. Otherwise basic alias analysis, among other things,
+ // could fail in a dangerous way. Ultimately, SCEV will be improved to avoid
+ // producing an expression involving multiple pointers. Until then, we must
+ // bail out here.
+ //
+ // Retrieve the pointer operand of the GEP. Don't use GetUnderlyingObject
+ // because it understands lcssa phis while SCEV does not.
+ Value *FromPtr = FromVal;
+ Value *ToPtr = ToVal;
+ if (auto *GEP = dyn_cast<GEPOperator>(FromVal))
+ FromPtr = GEP->getPointerOperand();
+
+ if (auto *GEP = dyn_cast<GEPOperator>(ToVal))
+ ToPtr = GEP->getPointerOperand();
+
+ if (FromPtr != FromVal || ToPtr != ToVal) {
+ // Quickly check the common case
+ if (FromPtr == ToPtr)
+ return true;
+
+ // SCEV may have rewritten an expression that produces the GEP's pointer
+ // operand. That's ok as long as the pointer operand has the same base
+ // pointer. Unlike GetUnderlyingObject(), getPointerBase() will find the
+ // base of a recurrence. This handles the case in which SCEV expansion
+ // converts a pointer type recurrence into a nonrecurrent pointer base
+ // indexed by an integer recurrence.
+
+ // If the GEP base pointer is a vector of pointers, abort.
+ if (!FromPtr->getType()->isPointerTy() || !ToPtr->getType()->isPointerTy())
+ return false;
+
+ const SCEV *FromBase = SE->getPointerBase(SE->getSCEV(FromPtr));
+ const SCEV *ToBase = SE->getPointerBase(SE->getSCEV(ToPtr));
+ if (FromBase == ToBase)
+ return true;
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: GEP rewrite bail out "
+ << *FromBase << " != " << *ToBase << "\n");
+
+ return false;
+ }
+ return true;
+}
+
+static bool hasHardUserWithinLoop(const Loop *L, const Instruction *I) {
+ SmallPtrSet<const Instruction *, 8> Visited;
+ SmallVector<const Instruction *, 8> WorkList;
+ Visited.insert(I);
+ WorkList.push_back(I);
+ while (!WorkList.empty()) {
+ const Instruction *Curr = WorkList.pop_back_val();
+ // This use is outside the loop, nothing to do.
+ if (!L->contains(Curr))
+ continue;
+ // Do we assume it is a "hard" use which will not be eliminated easily?
+ if (Curr->mayHaveSideEffects())
+ return true;
+ // Otherwise, add all its users to worklist.
+ for (auto U : Curr->users()) {
+ auto *UI = cast<Instruction>(U);
+ if (Visited.insert(UI).second)
+ WorkList.push_back(UI);
+ }
+ }
+ return false;
+}
+
+// Collect information about PHI nodes which can be transformed in
+// rewriteLoopExitValues.
+struct RewritePhi {
+ PHINode *PN; // For which PHI node is this replacement?
+ unsigned Ith; // For which incoming value?
+ const SCEV *ExpansionSCEV; // The SCEV of the incoming value we are rewriting.
+ Instruction *ExpansionPoint; // Where we'd like to expand that SCEV?
+ bool HighCost; // Is this expansion a high-cost?
+
+ Value *Expansion = nullptr;
+ bool ValidRewrite = false;
+
+ RewritePhi(PHINode *P, unsigned I, const SCEV *Val, Instruction *ExpansionPt,
+ bool H)
+ : PN(P), Ith(I), ExpansionSCEV(Val), ExpansionPoint(ExpansionPt),
+ HighCost(H) {}
+};
+
+// Check whether it is possible to delete the loop after rewriting exit
+// value. If it is possible, ignore ReplaceExitValue and do rewriting
+// aggressively.
+static bool canLoopBeDeleted(Loop *L, SmallVector<RewritePhi, 8> &RewritePhiSet) {
+ BasicBlock *Preheader = L->getLoopPreheader();
+ // If there is no preheader, the loop will not be deleted.
+ if (!Preheader)
+ return false;
+
+  // In the LoopDeletion pass a Loop can be deleted even when
+  // ExitingBlocks.size() > 1. We skip the multiple-ExitingBlocks case here
+  // for simplicity.
+  // TODO: If we see a test case where a loop with multiple ExitingBlocks can
+  // be deleted after exit value rewriting, we can enhance the logic here.
+ SmallVector<BasicBlock *, 4> ExitingBlocks;
+ L->getExitingBlocks(ExitingBlocks);
+ SmallVector<BasicBlock *, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+ if (ExitBlocks.size() != 1 || ExitingBlocks.size() != 1)
+ return false;
+
+ BasicBlock *ExitBlock = ExitBlocks[0];
+ BasicBlock::iterator BI = ExitBlock->begin();
+ while (PHINode *P = dyn_cast<PHINode>(BI)) {
+ Value *Incoming = P->getIncomingValueForBlock(ExitingBlocks[0]);
+
+ // If the Incoming value of P is found in RewritePhiSet, we know it
+ // could be rewritten to use a loop invariant value in transformation
+ // phase later. Skip it in the loop invariant check below.
+ bool found = false;
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ if (!Phi.ValidRewrite)
+ continue;
+ unsigned i = Phi.Ith;
+ if (Phi.PN == P && (Phi.PN)->getIncomingValue(i) == Incoming) {
+ found = true;
+ break;
+ }
+ }
+
+ Instruction *I;
+ if (!found && (I = dyn_cast<Instruction>(Incoming)))
+ if (!L->hasLoopInvariantOperands(I))
+ return false;
+
+ ++BI;
+ }
+
+ for (auto *BB : L->blocks())
+ if (llvm::any_of(*BB, [](Instruction &I) {
+ return I.mayHaveSideEffects();
+ }))
+ return false;
+
+ return true;
+}
+
+int llvm::rewriteLoopExitValues(Loop *L, LoopInfo *LI, TargetLibraryInfo *TLI,
+ ScalarEvolution *SE,
+ const TargetTransformInfo *TTI,
+ SCEVExpander &Rewriter, DominatorTree *DT,
+ ReplaceExitVal ReplaceExitValue,
+ SmallVector<WeakTrackingVH, 16> &DeadInsts) {
+ // Check a pre-condition.
+ assert(L->isRecursivelyLCSSAForm(*DT, *LI) &&
+ "Indvars did not preserve LCSSA!");
+
+ SmallVector<BasicBlock*, 8> ExitBlocks;
+ L->getUniqueExitBlocks(ExitBlocks);
+
+ SmallVector<RewritePhi, 8> RewritePhiSet;
+ // Find all values that are computed inside the loop, but used outside of it.
+ // Because of LCSSA, these values will only occur in LCSSA PHI Nodes. Scan
+ // the exit blocks of the loop to find them.
+ for (BasicBlock *ExitBB : ExitBlocks) {
+ // If there are no PHI nodes in this exit block, then no values defined
+ // inside the loop are used on this path, skip it.
+ PHINode *PN = dyn_cast<PHINode>(ExitBB->begin());
+ if (!PN) continue;
+
+ unsigned NumPreds = PN->getNumIncomingValues();
+
+ // Iterate over all of the PHI nodes.
+ BasicBlock::iterator BBI = ExitBB->begin();
+ while ((PN = dyn_cast<PHINode>(BBI++))) {
+ if (PN->use_empty())
+ continue; // dead use, don't replace it
+
+ if (!SE->isSCEVable(PN->getType()))
+ continue;
+
+ // It's necessary to tell ScalarEvolution about this explicitly so that
+ // it can walk the def-use list and forget all SCEVs, as it may not be
+ // watching the PHI itself. Once the new exit value is in place, there
+ // may not be a def-use connection between the loop and every instruction
+ // which got a SCEVAddRecExpr for that loop.
+ SE->forgetValue(PN);
+
+ // Iterate over all of the values in all the PHI nodes.
+ for (unsigned i = 0; i != NumPreds; ++i) {
+ // If the value being merged in is not integer or is not defined
+ // in the loop, skip it.
+ Value *InVal = PN->getIncomingValue(i);
+ if (!isa<Instruction>(InVal))
+ continue;
+
+ // If this pred is for a subloop, not L itself, skip it.
+ if (LI->getLoopFor(PN->getIncomingBlock(i)) != L)
+ continue; // The Block is in a subloop, skip it.
+
+ // Check that InVal is defined in the loop.
+ Instruction *Inst = cast<Instruction>(InVal);
+ if (!L->contains(Inst))
+ continue;
+
+ // Okay, this instruction has a user outside of the current loop
+ // and varies predictably *inside* the loop. Evaluate the value it
+ // contains when the loop exits, if possible. We prefer to start with
+ // expressions which are true for all exits (so as to maximize
+ // expression reuse by the SCEVExpander), but resort to per-exit
+ // evaluation if that fails.
+ const SCEV *ExitValue = SE->getSCEVAtScope(Inst, L->getParentLoop());
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE)) {
+ // TODO: This should probably be sunk into SCEV in some way; maybe a
+ // getSCEVForExit(SCEV*, L, ExitingBB)? It can be generalized for
+ // most SCEV expressions and other recurrence types (e.g. shift
+ // recurrences). Is there existing code we can reuse?
+ const SCEV *ExitCount = SE->getExitCount(L, PN->getIncomingBlock(i));
+ if (isa<SCEVCouldNotCompute>(ExitCount))
+ continue;
+ if (auto *AddRec = dyn_cast<SCEVAddRecExpr>(SE->getSCEV(Inst)))
+ if (AddRec->getLoop() == L)
+ ExitValue = AddRec->evaluateAtIteration(ExitCount, *SE);
+ if (isa<SCEVCouldNotCompute>(ExitValue) ||
+ !SE->isLoopInvariant(ExitValue, L) ||
+ !isSafeToExpand(ExitValue, *SE))
+ continue;
+ }
+
+ // Computing the value outside of the loop brings no benefit if it is
+ // definitely used inside the loop in a way which can not be optimized
+ // away. Avoid doing so unless we know we have a value which computes
+ // the ExitValue already. TODO: This should be merged into SCEV
+ // expander to leverage its knowledge of existing expressions.
+ if (ReplaceExitValue != AlwaysRepl && !isa<SCEVConstant>(ExitValue) &&
+ !isa<SCEVUnknown>(ExitValue) && hasHardUserWithinLoop(L, Inst))
+ continue;
+
+ // Check if expansions of this SCEV would count as being high cost.
+ bool HighCost = Rewriter.isHighCostExpansion(
+ ExitValue, L, SCEVCheapExpansionBudget, TTI, Inst);
+
+ // Note that we must not perform expansions until after
+ // we query *all* the costs, because if we perform temporary expansion
+      // in between, one that we might not intend to keep, said expansion
+      // *may* affect the cost calculation of the next SCEVs we'll query,
+      // and the next SCEV may erroneously get a smaller cost.
+
+ // Collect all the candidate PHINodes to be rewritten.
+ RewritePhiSet.emplace_back(PN, i, ExitValue, Inst, HighCost);
+ }
+ }
+ }
+
+ // Now that we've done preliminary filtering and billed all the SCEV's,
+ // we can perform the last sanity check - the expansion must be valid.
+ for (RewritePhi &Phi : RewritePhiSet) {
+ Phi.Expansion = Rewriter.expandCodeFor(Phi.ExpansionSCEV, Phi.PN->getType(),
+ Phi.ExpansionPoint);
+
+ LLVM_DEBUG(dbgs() << "rewriteLoopExitValues: AfterLoopVal = "
+ << *(Phi.Expansion) << '\n'
+ << " LoopVal = " << *(Phi.ExpansionPoint) << "\n");
+
+    // FIXME: isValidRewrite() is a hack. It should be an assert, eventually.
+ Phi.ValidRewrite = isValidRewrite(SE, Phi.ExpansionPoint, Phi.Expansion);
+ if (!Phi.ValidRewrite) {
+ DeadInsts.push_back(Phi.Expansion);
+ continue;
+ }
+
+#ifndef NDEBUG
+ // If we reuse an instruction from a loop which is neither L nor one of
+ // its containing loops, we end up breaking LCSSA form for this loop by
+ // creating a new use of its instruction.
+ if (auto *ExitInsn = dyn_cast<Instruction>(Phi.Expansion))
+ if (auto *EVL = LI->getLoopFor(ExitInsn->getParent()))
+ if (EVL != L)
+ assert(EVL->contains(L) && "LCSSA breach detected!");
+#endif
+ }
+
+ // TODO: after isValidRewrite() is an assertion, evaluate whether
+ // it is beneficial to change how we calculate high-cost:
+ // if we have SCEV 'A' which we know we will expand, should we calculate
+ // the cost of other SCEV's after expanding SCEV 'A',
+ // thus potentially giving cost bonus to those other SCEV's?
+
+ bool LoopCanBeDel = canLoopBeDeleted(L, RewritePhiSet);
+ int NumReplaced = 0;
+
+ // Transformation.
+ for (const RewritePhi &Phi : RewritePhiSet) {
+ if (!Phi.ValidRewrite)
+ continue;
+
+ PHINode *PN = Phi.PN;
+ Value *ExitVal = Phi.Expansion;
+
+ // Only do the rewrite when the ExitValue can be expanded cheaply.
+ // If LoopCanBeDel is true, rewrite exit value aggressively.
+ if (ReplaceExitValue == OnlyCheapRepl && !LoopCanBeDel && Phi.HighCost) {
+ DeadInsts.push_back(ExitVal);
+ continue;
+ }
+
+ NumReplaced++;
+ Instruction *Inst = cast<Instruction>(PN->getIncomingValue(Phi.Ith));
+ PN->setIncomingValue(Phi.Ith, ExitVal);
+
+ // If this instruction is dead now, delete it. Don't do it now to avoid
+ // invalidating iterators.
+ if (isInstructionTriviallyDead(Inst, TLI))
+ DeadInsts.push_back(Inst);
+
+ // Replace PN with ExitVal if that is legal and does not break LCSSA.
+ if (PN->getNumIncomingValues() == 1 &&
+ LI->replacementPreservesLCSSAForm(PN, ExitVal)) {
+ PN->replaceAllUsesWith(ExitVal);
+ PN->eraseFromParent();
+ }
+ }
+
+ // The insertion point instruction may have been deleted; clear it out
+ // so that the rewriter doesn't trip over it later.
+ Rewriter.clearInsertPoint();
+ return NumReplaced;
+}
+
+/// Set weights for \p UnrolledLoop and \p RemainderLoop based on weights for
+/// \p OrigLoop.
+void llvm::setProfileInfoAfterUnrolling(Loop *OrigLoop, Loop *UnrolledLoop,
+ Loop *RemainderLoop, uint64_t UF) {
+ assert(UF > 0 && "Zero unrolled factor is not supported");
+  assert(UnrolledLoop != RemainderLoop &&
+         "Unrolled and Remainder loops are expected to be distinct");
+
+ // Get number of iterations in the original scalar loop.
+ unsigned OrigLoopInvocationWeight = 0;
+ Optional<unsigned> OrigAverageTripCount =
+ getLoopEstimatedTripCount(OrigLoop, &OrigLoopInvocationWeight);
+ if (!OrigAverageTripCount)
+ return;
+
+ // Calculate number of iterations in unrolled loop.
+ unsigned UnrolledAverageTripCount = *OrigAverageTripCount / UF;
+ // Calculate number of iterations for remainder loop.
+ unsigned RemainderAverageTripCount = *OrigAverageTripCount % UF;
+
+ setLoopEstimatedTripCount(UnrolledLoop, UnrolledAverageTripCount,
+ OrigLoopInvocationWeight);
+ setLoopEstimatedTripCount(RemainderLoop, RemainderAverageTripCount,
+ OrigLoopInvocationWeight);
+}
+
+/// Utility that implements appending of loops onto a worklist.
+/// Loops are added in preorder (analogous for reverse postorder for trees),
+/// and the worklist is processed LIFO.
+template <typename RangeT>
+void llvm::appendReversedLoopsToWorklist(
+ RangeT &&Loops, SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ // We use an internal worklist to build up the preorder traversal without
+ // recursion.
+ SmallVector<Loop *, 4> PreOrderLoops, PreOrderWorklist;
+
+ // We walk the initial sequence of loops in reverse because we generally want
+ // to visit defs before uses and the worklist is LIFO.
+ for (Loop *RootL : Loops) {
+ assert(PreOrderLoops.empty() && "Must start with an empty preorder walk.");
+ assert(PreOrderWorklist.empty() &&
+ "Must start with an empty preorder walk worklist.");
+ PreOrderWorklist.push_back(RootL);
+ do {
+ Loop *L = PreOrderWorklist.pop_back_val();
+ PreOrderWorklist.append(L->begin(), L->end());
+ PreOrderLoops.push_back(L);
+ } while (!PreOrderWorklist.empty());
+
+ Worklist.insert(std::move(PreOrderLoops));
+ PreOrderLoops.clear();
+ }
+}
+
+template <typename RangeT>
+void llvm::appendLoopsToWorklist(RangeT &&Loops,
+ SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ appendReversedLoopsToWorklist(reverse(Loops), Worklist);
+}
+
+template void llvm::appendLoopsToWorklist<ArrayRef<Loop *> &>(
+ ArrayRef<Loop *> &Loops, SmallPriorityWorklist<Loop *, 4> &Worklist);
+
+template void
+llvm::appendLoopsToWorklist<Loop &>(Loop &L,
+ SmallPriorityWorklist<Loop *, 4> &Worklist);
+
+void llvm::appendLoopsToWorklist(LoopInfo &LI,
+ SmallPriorityWorklist<Loop *, 4> &Worklist) {
+ appendReversedLoopsToWorklist(LI, Worklist);
+}
+
+Loop *llvm::cloneLoop(Loop *L, Loop *PL, ValueToValueMapTy &VM,
+ LoopInfo *LI, LPPassManager *LPM) {
+ Loop &New = *LI->AllocateLoop();
+ if (PL)
+ PL->addChildLoop(&New);
+ else
+ LI->addTopLevelLoop(&New);
+
+ if (LPM)
+ LPM->addLoop(New);
+
+ // Add all of the blocks in L to the new loop.
+ for (Loop::block_iterator I = L->block_begin(), E = L->block_end();
+ I != E; ++I)
+ if (LI->getLoopFor(*I) == L)
+ New.addBasicBlockToLoop(cast<BasicBlock>(VM[*I]), *LI);
+
+ // Add all of the subloops to the new loop.
+ for (Loop *I : *L)
+ cloneLoop(I, &New, VM, LI, LPM);
+
+ return &New;
+}
+
+/// IR Values for the lower and upper bounds of a pointer evolution. We
+/// need to use value-handles because SCEV expansion can invalidate previously
+/// expanded values. Thus expansion of a pointer can invalidate the bounds for
+/// a previous one.
+struct PointerBounds {
+ TrackingVH<Value> Start;
+ TrackingVH<Value> End;
+};
+
+/// Expand code for the lower and upper bound of the pointer group \p CG
+/// in \p TheLoop. \return the values for the bounds.
+static PointerBounds expandBounds(const RuntimeCheckingPtrGroup *CG,
+ Loop *TheLoop, Instruction *Loc,
+ SCEVExpander &Exp, ScalarEvolution *SE) {
+ // TODO: Add helper to retrieve pointers to CG.
+ Value *Ptr = CG->RtCheck.Pointers[CG->Members[0]].PointerValue;
+ const SCEV *Sc = SE->getSCEV(Ptr);
+
+ unsigned AS = Ptr->getType()->getPointerAddressSpace();
+ LLVMContext &Ctx = Loc->getContext();
+
+ // Use this type for pointer arithmetic.
+ Type *PtrArithTy = Type::getInt8PtrTy(Ctx, AS);
+
+ if (SE->isLoopInvariant(Sc, TheLoop)) {
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for a loop invariant ptr:"
+ << *Ptr << "\n");
+ // Ptr could be in the loop body. If so, expand a new one at the correct
+ // location.
+ Instruction *Inst = dyn_cast<Instruction>(Ptr);
+ Value *NewPtr = (Inst && TheLoop->contains(Inst))
+ ? Exp.expandCodeFor(Sc, PtrArithTy, Loc)
+ : Ptr;
+ // We must return a half-open range, which means incrementing Sc.
+ const SCEV *ScPlusOne = SE->getAddExpr(Sc, SE->getOne(PtrArithTy));
+ Value *NewPtrPlusOne = Exp.expandCodeFor(ScPlusOne, PtrArithTy, Loc);
+ return {NewPtr, NewPtrPlusOne};
+ } else {
+ Value *Start = nullptr, *End = nullptr;
+ LLVM_DEBUG(dbgs() << "LAA: Adding RT check for range:\n");
+ Start = Exp.expandCodeFor(CG->Low, PtrArithTy, Loc);
+ End = Exp.expandCodeFor(CG->High, PtrArithTy, Loc);
+ LLVM_DEBUG(dbgs() << "Start: " << *CG->Low << " End: " << *CG->High
+ << "\n");
+ return {Start, End};
+ }
+}
+
+/// Turns a collection of checks into a collection of expanded upper and
+/// lower bounds for both pointers in the check.
+static SmallVector<std::pair<PointerBounds, PointerBounds>, 4>
+expandBounds(const SmallVectorImpl<RuntimePointerCheck> &PointerChecks, Loop *L,
+ Instruction *Loc, ScalarEvolution *SE, SCEVExpander &Exp) {
+ SmallVector<std::pair<PointerBounds, PointerBounds>, 4> ChecksWithBounds;
+
+ // Here we're relying on the SCEV Expander's cache to only emit code for the
+ // same bounds once.
+ transform(PointerChecks, std::back_inserter(ChecksWithBounds),
+ [&](const RuntimePointerCheck &Check) {
+ PointerBounds First = expandBounds(Check.first, L, Loc, Exp, SE),
+ Second =
+ expandBounds(Check.second, L, Loc, Exp, SE);
+ return std::make_pair(First, Second);
+ });
+
+ return ChecksWithBounds;
+}
+
+std::pair<Instruction *, Instruction *> llvm::addRuntimeChecks(
+ Instruction *Loc, Loop *TheLoop,
+ const SmallVectorImpl<RuntimePointerCheck> &PointerChecks,
+ ScalarEvolution *SE) {
+ // TODO: Move noalias annotation code from LoopVersioning here and share with LV if possible.
+ // TODO: Pass RtPtrChecking instead of PointerChecks and SE separately, if possible
+ const DataLayout &DL = TheLoop->getHeader()->getModule()->getDataLayout();
+ SCEVExpander Exp(*SE, DL, "induction");
+ auto ExpandedChecks = expandBounds(PointerChecks, TheLoop, Loc, SE, Exp);
+
+ LLVMContext &Ctx = Loc->getContext();
+ Instruction *FirstInst = nullptr;
+ IRBuilder<> ChkBuilder(Loc);
+ // Our instructions might fold to a constant.
+ Value *MemoryRuntimeCheck = nullptr;
+
+ // FIXME: this helper is currently a duplicate of the one in
+ // LoopVectorize.cpp.
+ auto GetFirstInst = [](Instruction *FirstInst, Value *V,
+ Instruction *Loc) -> Instruction * {
+ if (FirstInst)
+ return FirstInst;
+ if (Instruction *I = dyn_cast<Instruction>(V))
+ return I->getParent() == Loc->getParent() ? I : nullptr;
+ return nullptr;
+ };
+
+ for (const auto &Check : ExpandedChecks) {
+ const PointerBounds &A = Check.first, &B = Check.second;
+ // Check if two pointers (A and B) conflict where conflict is computed as:
+ // start(A) <= end(B) && start(B) <= end(A)
+ unsigned AS0 = A.Start->getType()->getPointerAddressSpace();
+ unsigned AS1 = B.Start->getType()->getPointerAddressSpace();
+
+ assert((AS0 == B.End->getType()->getPointerAddressSpace()) &&
+ (AS1 == A.End->getType()->getPointerAddressSpace()) &&
+ "Trying to bounds check pointers with different address spaces");
+
+ Type *PtrArithTy0 = Type::getInt8PtrTy(Ctx, AS0);
+ Type *PtrArithTy1 = Type::getInt8PtrTy(Ctx, AS1);
+
+ Value *Start0 = ChkBuilder.CreateBitCast(A.Start, PtrArithTy0, "bc");
+ Value *Start1 = ChkBuilder.CreateBitCast(B.Start, PtrArithTy1, "bc");
+ Value *End0 = ChkBuilder.CreateBitCast(A.End, PtrArithTy1, "bc");
+ Value *End1 = ChkBuilder.CreateBitCast(B.End, PtrArithTy0, "bc");
+
+ // [A|B].Start points to the first accessed byte under base [A|B].
+ // [A|B].End points to the last accessed byte, plus one.
+ // There is no conflict when the intervals are disjoint:
+ // NoConflict = (B.Start >= A.End) || (A.Start >= B.End)
+ //
+ // bound0 = (B.Start < A.End)
+ // bound1 = (A.Start < B.End)
+ // IsConflict = bound0 & bound1
+ Value *Cmp0 = ChkBuilder.CreateICmpULT(Start0, End1, "bound0");
+ FirstInst = GetFirstInst(FirstInst, Cmp0, Loc);
+ Value *Cmp1 = ChkBuilder.CreateICmpULT(Start1, End0, "bound1");
+ FirstInst = GetFirstInst(FirstInst, Cmp1, Loc);
+ Value *IsConflict = ChkBuilder.CreateAnd(Cmp0, Cmp1, "found.conflict");
+ FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+ if (MemoryRuntimeCheck) {
+ IsConflict =
+ ChkBuilder.CreateOr(MemoryRuntimeCheck, IsConflict, "conflict.rdx");
+ FirstInst = GetFirstInst(FirstInst, IsConflict, Loc);
+ }
+ MemoryRuntimeCheck = IsConflict;
+ }
+
+ if (!MemoryRuntimeCheck)
+ return std::make_pair(nullptr, nullptr);
+
+ // We have to do this trickery because the IRBuilder might fold the check to a
+  // constant expression in which case there is no Instruction anchored in
+  // the block.
+ Instruction *Check =
+ BinaryOperator::CreateAnd(MemoryRuntimeCheck, ConstantInt::getTrue(Ctx));
+ ChkBuilder.Insert(Check, "memcheck.conflict");
+ FirstInst = GetFirstInst(FirstInst, Check, Loc);
+ return std::make_pair(FirstInst, Check);
+}
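
addRuntimeChecks(), now hosted in LoopUtils, emits exactly the interval test spelled out in the comment above: two pointer groups may conflict when each one starts before the other ends, and the per-pair results are OR-reduced into one guard (conflict.rdx). The same predicate over plain half-open byte ranges, as a standalone sketch:

#include <cstdint>
#include <utility>
#include <vector>

// Half-open byte range [Start, End) accessed by one pointer group in the loop.
struct ByteRange {
  uint64_t Start;
  uint64_t End;   // last accessed byte, plus one
};

// (A.Start < B.End) && (B.Start < A.End): the ranges are not disjoint.
// This is the "bound0 & bound1" value built by the emitted IR.
static bool mayConflict(const ByteRange &A, const ByteRange &B) {
  return A.Start < B.End && B.Start < A.End;
}

// OR-reduction over all pairs, mirroring the conflict.rdx chain: if any pair
// overlaps, the no-alias assumption does not hold for this execution.
bool anyMemCheckConflict(
    const std::vector<std::pair<ByteRange, ByteRange>> &Checks) {
  bool MemoryRuntimeCheck = false;
  for (const auto &C : Checks)
    MemoryRuntimeCheck |= mayConflict(C.first, C.second);
  return MemoryRuntimeCheck;
}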
diff --git a/llvm/lib/Transforms/Utils/LoopVersioning.cpp b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
index 50752bd78a65..16bd08c704ee 100644
--- a/llvm/lib/Transforms/Utils/LoopVersioning.cpp
+++ b/llvm/lib/Transforms/Utils/LoopVersioning.cpp
@@ -13,15 +13,16 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/ADT/ArrayRef.h"
#include "llvm/Analysis/LoopAccessAnalysis.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/MDBuilder.h"
#include "llvm/InitializePasses.h"
#include "llvm/Support/CommandLine.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
#include "llvm/Transforms/Utils/Cloning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
@@ -44,9 +45,8 @@ LoopVersioning::LoopVersioning(const LoopAccessInfo &LAI, Loop *L, LoopInfo *LI,
}
}
-void LoopVersioning::setAliasChecks(
- SmallVector<RuntimePointerChecking::PointerCheck, 4> Checks) {
- AliasChecks = std::move(Checks);
+void LoopVersioning::setAliasChecks(ArrayRef<RuntimePointerCheck> Checks) {
+ AliasChecks = {Checks.begin(), Checks.end()};
}
void LoopVersioning::setSCEVChecks(SCEVUnionPredicate Check) {
@@ -62,8 +62,10 @@ void LoopVersioning::versionLoop(
// Add the memcheck in the original preheader (this is empty initially).
BasicBlock *RuntimeCheckBB = VersionedLoop->getLoopPreheader();
+ const auto &RtPtrChecking = *LAI.getRuntimePointerChecking();
std::tie(FirstCheckInst, MemRuntimeCheck) =
- LAI.addRuntimeChecks(RuntimeCheckBB->getTerminator(), AliasChecks);
+ addRuntimeChecks(RuntimeCheckBB->getTerminator(), VersionedLoop,
+ AliasChecks, RtPtrChecking.getSE());
const SCEVUnionPredicate &Pred = LAI.getPSE().getUnionPredicate();
SCEVExpander Exp(*SE, RuntimeCheckBB->getModule()->getDataLayout(),
@@ -194,8 +196,7 @@ void LoopVersioning::prepareNoAliasMetadata() {
// Go through the checks and for each pointer group, collect the scopes for
// each non-aliasing pointer group.
- DenseMap<const RuntimePointerChecking::CheckingPtrGroup *,
- SmallVector<Metadata *, 4>>
+ DenseMap<const RuntimeCheckingPtrGroup *, SmallVector<Metadata *, 4>>
GroupToNonAliasingScopes;
for (const auto &Check : AliasChecks)
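
For context on how LoopVersioning consumes those checks: versionLoop() expands them in the preheader and branches on the result, falling back to the unmodified loop when a conflict (or a failed SCEV predicate) is detected and running the optimized clone otherwise. A source-level picture of the resulting shape (an illustration only, not the emitted IR):

#include <cstddef>

// The runtime memcheck guards a clone that was optimized under the no-overlap
// assumption; the original loop is kept as the conservative fallback.
void versionedCopy(int *dst, const int *src, std::size_t n) {
  // Pairwise overlap test over the accessed ranges (same predicate as in the
  // addRuntimeChecks() sketch earlier).
  bool Conflict = src < dst + n && dst < src + n;
  if (!Conflict) {
    for (std::size_t i = 0; i < n; ++i)   // versioned loop: accesses annotated
      dst[i] = src[i] + 1;                // noalias, free to vectorize/unroll
  } else {
    for (std::size_t i = 0; i < n; ++i)   // original loop: no assumptions made
      dst[i] = src[i] + 1;
  }
}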
diff --git a/llvm/lib/Transforms/Utils/LowerInvoke.cpp b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
index 1af0ce3d86cc..0b225e8abc4e 100644
--- a/llvm/lib/Transforms/Utils/LowerInvoke.cpp
+++ b/llvm/lib/Transforms/Utils/LowerInvoke.cpp
@@ -53,7 +53,7 @@ static bool runImpl(Function &F) {
II->getOperandBundlesAsDefs(OpBundles);
// Insert a normal call instruction...
CallInst *NewCall =
- CallInst::Create(II->getFunctionType(), II->getCalledValue(),
+ CallInst::Create(II->getFunctionType(), II->getCalledOperand(),
CallArgs, OpBundles, "", II);
NewCall->takeName(II);
NewCall->setCallingConv(II->getCallingConv());
diff --git a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
index 0cc085dc366c..616b4e8eb01c 100644
--- a/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
+++ b/llvm/lib/Transforms/Utils/LowerMemIntrinsics.cpp
@@ -14,17 +14,9 @@
using namespace llvm;
-static unsigned getLoopOperandSizeInBytes(Type *Type) {
- if (VectorType *VTy = dyn_cast<VectorType>(Type)) {
- return VTy->getBitWidth() / 8;
- }
-
- return Type->getPrimitiveSizeInBits() / 8;
-}
-
void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
Value *DstAddr, ConstantInt *CopyLen,
- unsigned SrcAlign, unsigned DestAlign,
+ Align SrcAlign, Align DstAlign,
bool SrcIsVolatile, bool DstIsVolatile,
const TargetTransformInfo &TTI) {
// No need to expand zero length copies.
@@ -35,17 +27,18 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
BasicBlock *PostLoopBB = nullptr;
Function *ParentFunc = PreLoopBB->getParent();
LLVMContext &Ctx = PreLoopBB->getContext();
+ const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
+
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
Type *TypeOfCopyLen = CopyLen->getType();
- Type *LoopOpType =
- TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
- unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
uint64_t LoopEndCount = CopyLen->getZExtValue() / LoopOpSize;
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
-
if (LoopEndCount != 0) {
// Split
PostLoopBB = PreLoopBB->splitBasicBlock(InsertBefore, "memcpy-split");
@@ -66,16 +59,20 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
DstAddr = PLBuilder.CreateBitCast(DstAddr, DstOpType);
}
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+
IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 2, "loop-index");
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0U), PreLoopBB);
// Loop Body
Value *SrcGEP =
LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
- Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile);
+ Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP,
+ PartSrcAlign, SrcIsVolatile);
Value *DstGEP =
LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
- LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+ LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1U));
@@ -93,17 +90,17 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
IRBuilder<> RBuilder(PostLoopBB ? PostLoopBB->getFirstNonPHI()
: InsertBefore);
- // Update the alignment based on the copy size used in the loop body.
- SrcAlign = std::min(SrcAlign, LoopOpSize);
- DestAlign = std::min(DestAlign, LoopOpSize);
-
SmallVector<Type *, 5> RemainingOps;
TTI.getMemcpyLoopResidualLoweringType(RemainingOps, Ctx, RemainingBytes,
- SrcAlign, DestAlign);
+ SrcAS, DstAS, SrcAlign.value(),
+ DstAlign.value());
for (auto OpTy : RemainingOps) {
+ Align PartSrcAlign(commonAlignment(SrcAlign, BytesCopied));
+ Align PartDstAlign(commonAlignment(DstAlign, BytesCopied));
+
     // Calculate the new index
- unsigned OperandSize = getLoopOperandSizeInBytes(OpTy);
+ unsigned OperandSize = DL.getTypeStoreSize(OpTy);
uint64_t GepIndex = BytesCopied / OperandSize;
assert(GepIndex * OperandSize == BytesCopied &&
"Division should have no Remainder!");
@@ -114,7 +111,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
: RBuilder.CreateBitCast(SrcAddr, SrcPtrType);
Value *SrcGEP = RBuilder.CreateInBoundsGEP(
OpTy, CastedSrc, ConstantInt::get(TypeOfCopyLen, GepIndex));
- Value *Load = RBuilder.CreateLoad(OpTy, SrcGEP, SrcIsVolatile);
+ Value *Load =
+ RBuilder.CreateAlignedLoad(OpTy, SrcGEP, PartSrcAlign, SrcIsVolatile);
// Cast destination to operand type and store.
PointerType *DstPtrType = PointerType::get(OpTy, DstAS);
@@ -123,7 +121,7 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
: RBuilder.CreateBitCast(DstAddr, DstPtrType);
Value *DstGEP = RBuilder.CreateInBoundsGEP(
OpTy, CastedDst, ConstantInt::get(TypeOfCopyLen, GepIndex));
- RBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+ RBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
BytesCopied += OperandSize;
}
@@ -134,8 +132,8 @@ void llvm::createMemCpyLoopKnownSize(Instruction *InsertBefore, Value *SrcAddr,
void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
Value *SrcAddr, Value *DstAddr,
- Value *CopyLen, unsigned SrcAlign,
- unsigned DestAlign, bool SrcIsVolatile,
+ Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
bool DstIsVolatile,
const TargetTransformInfo &TTI) {
BasicBlock *PreLoopBB = InsertBefore->getParent();
@@ -143,16 +141,17 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
PreLoopBB->splitBasicBlock(InsertBefore, "post-loop-memcpy-expansion");
Function *ParentFunc = PreLoopBB->getParent();
+ const DataLayout &DL = ParentFunc->getParent()->getDataLayout();
LLVMContext &Ctx = PreLoopBB->getContext();
+ unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
+ unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
- Type *LoopOpType =
- TTI.getMemcpyLoopLoweringType(Ctx, CopyLen, SrcAlign, DestAlign);
- unsigned LoopOpSize = getLoopOperandSizeInBytes(LoopOpType);
+ Type *LoopOpType = TTI.getMemcpyLoopLoweringType(
+ Ctx, CopyLen, SrcAS, DstAS, SrcAlign.value(), DstAlign.value());
+ unsigned LoopOpSize = DL.getTypeStoreSize(LoopOpType);
IRBuilder<> PLBuilder(PreLoopBB->getTerminator());
- unsigned SrcAS = cast<PointerType>(SrcAddr->getType())->getAddressSpace();
- unsigned DstAS = cast<PointerType>(DstAddr->getType())->getAddressSpace();
PointerType *SrcOpType = PointerType::get(LoopOpType, SrcAS);
PointerType *DstOpType = PointerType::get(LoopOpType, DstAS);
if (SrcAddr->getType() != SrcOpType) {
@@ -177,13 +176,17 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
BasicBlock::Create(Ctx, "loop-memcpy-expansion", ParentFunc, PostLoopBB);
IRBuilder<> LoopBuilder(LoopBB);
+ Align PartSrcAlign(commonAlignment(SrcAlign, LoopOpSize));
+ Align PartDstAlign(commonAlignment(DstAlign, LoopOpSize));
+
PHINode *LoopIndex = LoopBuilder.CreatePHI(CopyLenType, 2, "loop-index");
LoopIndex->addIncoming(ConstantInt::get(CopyLenType, 0U), PreLoopBB);
Value *SrcGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, SrcAddr, LoopIndex);
- Value *Load = LoopBuilder.CreateLoad(LoopOpType, SrcGEP, SrcIsVolatile);
+ Value *Load = LoopBuilder.CreateAlignedLoad(LoopOpType, SrcGEP, PartSrcAlign,
+ SrcIsVolatile);
Value *DstGEP = LoopBuilder.CreateInBoundsGEP(LoopOpType, DstAddr, LoopIndex);
- LoopBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+ LoopBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(CopyLenType, 1U));
@@ -234,10 +237,11 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
Value *FullOffset = ResBuilder.CreateAdd(RuntimeBytesCopied, ResidualIndex);
Value *SrcGEP =
ResBuilder.CreateInBoundsGEP(Int8Type, SrcAsInt8, FullOffset);
- Value *Load = ResBuilder.CreateLoad(Int8Type, SrcGEP, SrcIsVolatile);
+ Value *Load = ResBuilder.CreateAlignedLoad(Int8Type, SrcGEP, PartSrcAlign,
+ SrcIsVolatile);
Value *DstGEP =
ResBuilder.CreateInBoundsGEP(Int8Type, DstAsInt8, FullOffset);
- ResBuilder.CreateStore(Load, DstGEP, DstIsVolatile);
+ ResBuilder.CreateAlignedStore(Load, DstGEP, PartDstAlign, DstIsVolatile);
Value *ResNewIndex =
ResBuilder.CreateAdd(ResidualIndex, ConstantInt::get(CopyLenType, 1U));
@@ -284,13 +288,14 @@ void llvm::createMemCpyLoopUnknownSize(Instruction *InsertBefore,
// }
// return dst;
// }
-static void createMemMoveLoop(Instruction *InsertBefore,
- Value *SrcAddr, Value *DstAddr, Value *CopyLen,
- unsigned SrcAlign, unsigned DestAlign,
- bool SrcIsVolatile, bool DstIsVolatile) {
+static void createMemMoveLoop(Instruction *InsertBefore, Value *SrcAddr,
+ Value *DstAddr, Value *CopyLen, Align SrcAlign,
+ Align DstAlign, bool SrcIsVolatile,
+ bool DstIsVolatile) {
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
Type *EltTy = cast<PointerType>(SrcAddr->getType())->getElementType();
@@ -318,6 +323,10 @@ static void createMemMoveLoop(Instruction *InsertBefore,
BasicBlock *ExitBB = InsertBefore->getParent();
ExitBB->setName("memmove_done");
+ unsigned PartSize = DL.getTypeStoreSize(EltTy);
+ Align PartSrcAlign(commonAlignment(SrcAlign, PartSize));
+ Align PartDstAlign(commonAlignment(DstAlign, PartSize));
+
// Initial comparison of n == 0 that lets us skip the loops altogether. Shared
// between both backwards and forward copy clauses.
ICmpInst *CompareN =
@@ -331,11 +340,12 @@ static void createMemMoveLoop(Instruction *InsertBefore,
PHINode *LoopPhi = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
Value *IndexPtr = LoopBuilder.CreateSub(
LoopPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_ptr");
- Value *Element = LoopBuilder.CreateLoad(
+ Value *Element = LoopBuilder.CreateAlignedLoad(
EltTy, LoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, IndexPtr),
- "element");
- LoopBuilder.CreateStore(
- Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr));
+ PartSrcAlign, "element");
+ LoopBuilder.CreateAlignedStore(
+ Element, LoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, IndexPtr),
+ PartDstAlign);
LoopBuilder.CreateCondBr(
LoopBuilder.CreateICmpEQ(IndexPtr, ConstantInt::get(TypeOfCopyLen, 0)),
ExitBB, LoopBB);
@@ -349,11 +359,11 @@ static void createMemMoveLoop(Instruction *InsertBefore,
BasicBlock::Create(F->getContext(), "copy_forward_loop", F, ExitBB);
IRBuilder<> FwdLoopBuilder(FwdLoopBB);
PHINode *FwdCopyPhi = FwdLoopBuilder.CreatePHI(TypeOfCopyLen, 0, "index_ptr");
- Value *FwdElement = FwdLoopBuilder.CreateLoad(
- EltTy, FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi),
- "element");
- FwdLoopBuilder.CreateStore(
- FwdElement, FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi));
+ Value *SrcGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, SrcAddr, FwdCopyPhi);
+ Value *FwdElement =
+ FwdLoopBuilder.CreateAlignedLoad(EltTy, SrcGEP, PartSrcAlign, "element");
+ Value *DstGEP = FwdLoopBuilder.CreateInBoundsGEP(EltTy, DstAddr, FwdCopyPhi);
+ FwdLoopBuilder.CreateAlignedStore(FwdElement, DstGEP, PartDstAlign);
Value *FwdIndexPtr = FwdLoopBuilder.CreateAdd(
FwdCopyPhi, ConstantInt::get(TypeOfCopyLen, 1), "index_increment");
FwdLoopBuilder.CreateCondBr(FwdLoopBuilder.CreateICmpEQ(FwdIndexPtr, CopyLen),
@@ -365,12 +375,13 @@ static void createMemMoveLoop(Instruction *InsertBefore,
ElseTerm->eraseFromParent();
}
-static void createMemSetLoop(Instruction *InsertBefore,
- Value *DstAddr, Value *CopyLen, Value *SetValue,
- unsigned Align, bool IsVolatile) {
+static void createMemSetLoop(Instruction *InsertBefore, Value *DstAddr,
+ Value *CopyLen, Value *SetValue, Align DstAlign,
+ bool IsVolatile) {
Type *TypeOfCopyLen = CopyLen->getType();
BasicBlock *OrigBB = InsertBefore->getParent();
Function *F = OrigBB->getParent();
+ const DataLayout &DL = F->getParent()->getDataLayout();
BasicBlock *NewBB =
OrigBB->splitBasicBlock(InsertBefore, "split");
BasicBlock *LoopBB
@@ -388,14 +399,17 @@ static void createMemSetLoop(Instruction *InsertBefore,
LoopBB);
OrigBB->getTerminator()->eraseFromParent();
+ unsigned PartSize = DL.getTypeStoreSize(SetValue->getType());
+ Align PartAlign(commonAlignment(DstAlign, PartSize));
+
IRBuilder<> LoopBuilder(LoopBB);
PHINode *LoopIndex = LoopBuilder.CreatePHI(TypeOfCopyLen, 0);
LoopIndex->addIncoming(ConstantInt::get(TypeOfCopyLen, 0), OrigBB);
- LoopBuilder.CreateStore(
+ LoopBuilder.CreateAlignedStore(
SetValue,
LoopBuilder.CreateInBoundsGEP(SetValue->getType(), DstAddr, LoopIndex),
- IsVolatile);
+ PartAlign, IsVolatile);
Value *NewIndex =
LoopBuilder.CreateAdd(LoopIndex, ConstantInt::get(TypeOfCopyLen, 1));
@@ -408,25 +422,27 @@ static void createMemSetLoop(Instruction *InsertBefore,
void llvm::expandMemCpyAsLoop(MemCpyInst *Memcpy,
const TargetTransformInfo &TTI) {
if (ConstantInt *CI = dyn_cast<ConstantInt>(Memcpy->getLength())) {
- createMemCpyLoopKnownSize(/* InsertBefore */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ CI,
- /* SrcAlign */ Memcpy->getSourceAlignment(),
- /* DestAlign */ Memcpy->getDestAlignment(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
- /* TargetTransformInfo */ TTI);
+ createMemCpyLoopKnownSize(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ CI,
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
} else {
- createMemCpyLoopUnknownSize(/* InsertBefore */ Memcpy,
- /* SrcAddr */ Memcpy->getRawSource(),
- /* DstAddr */ Memcpy->getRawDest(),
- /* CopyLen */ Memcpy->getLength(),
- /* SrcAlign */ Memcpy->getSourceAlignment(),
- /* DestAlign */ Memcpy->getDestAlignment(),
- /* SrcIsVolatile */ Memcpy->isVolatile(),
- /* DstIsVolatile */ Memcpy->isVolatile(),
- /* TargetTransfomrInfo */ TTI);
+ createMemCpyLoopUnknownSize(
+ /* InsertBefore */ Memcpy,
+ /* SrcAddr */ Memcpy->getRawSource(),
+ /* DstAddr */ Memcpy->getRawDest(),
+ /* CopyLen */ Memcpy->getLength(),
+ /* SrcAlign */ Memcpy->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memcpy->getDestAlign().valueOrOne(),
+ /* SrcIsVolatile */ Memcpy->isVolatile(),
+ /* DstIsVolatile */ Memcpy->isVolatile(),
+ /* TargetTransformInfo */ TTI);
}
}
@@ -435,8 +451,8 @@ void llvm::expandMemMoveAsLoop(MemMoveInst *Memmove) {
/* SrcAddr */ Memmove->getRawSource(),
/* DstAddr */ Memmove->getRawDest(),
/* CopyLen */ Memmove->getLength(),
- /* SrcAlign */ Memmove->getSourceAlignment(),
- /* DestAlign */ Memmove->getDestAlignment(),
+ /* SrcAlign */ Memmove->getSourceAlign().valueOrOne(),
+ /* DestAlign */ Memmove->getDestAlign().valueOrOne(),
/* SrcIsVolatile */ Memmove->isVolatile(),
/* DstIsVolatile */ Memmove->isVolatile());
}
@@ -446,6 +462,6 @@ void llvm::expandMemSetAsLoop(MemSetInst *Memset) {
/* DstAddr */ Memset->getRawDest(),
/* CopyLen */ Memset->getLength(),
/* SetValue */ Memset->getValue(),
- /* Alignment */ Memset->getDestAlignment(),
+ /* Alignment */ Memset->getDestAlign().valueOrOne(),
Memset->isVolatile());
}
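
The per-iteration alignments introduced above come from commonAlignment(), which clamps the alignment of the whole copy to the store size of the element type used by the loop. A minimal standalone sketch of that computation, outside of LLVM (the helper name is illustrative, not part of the patch):

#include <algorithm>
#include <cassert>
#include <cstdint>

// Largest power of two that divides both the overall alignment and the
// per-iteration store size; this mirrors what commonAlignment(SrcAlign,
// PartSize) yields for the loads/stores emitted by the loops above.
static uint64_t partAlign(uint64_t WholeAlign, uint64_t PartSize) {
  assert(WholeAlign != 0 && (WholeAlign & (WholeAlign - 1)) == 0 &&
         "alignment must be a power of two");
  assert(PartSize != 0 && "expect a non-zero element size");
  uint64_t LowBit = PartSize & (~PartSize + 1); // lowest set bit of PartSize
  return std::min(WholeAlign, LowBit);
}

// For example, a 16-byte-aligned memmove lowered with an i32 element type
// keeps only 4-byte alignment per access: partAlign(16, 4) == 4.
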
diff --git a/llvm/lib/Transforms/Utils/LowerSwitch.cpp b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
index 4b9d0dadfc17..34e836d9660f 100644
--- a/llvm/lib/Transforms/Utils/LowerSwitch.cpp
+++ b/llvm/lib/Transforms/Utils/LowerSwitch.cpp
@@ -148,13 +148,6 @@ bool LowerSwitch::runOnFunction(Function &F) {
LazyValueInfo *LVI = &getAnalysis<LazyValueInfoWrapperPass>().getLVI();
auto *ACT = getAnalysisIfAvailable<AssumptionCacheTracker>();
AssumptionCache *AC = ACT ? &ACT->getAssumptionCache(F) : nullptr;
- // Prevent LazyValueInfo from using the DominatorTree as LowerSwitch does not
- // preserve it and it becomes stale (when available) pretty much immediately.
- // Currently the DominatorTree is only used by LowerSwitch indirectly via LVI
- // and computeKnownBits to refine isValidAssumeForContext's results. Given
- // that the latter can handle some of the simple cases w/o a DominatorTree,
- // it's easier to refrain from using the tree than to keep it up to date.
- LVI->disableDT();
bool Changed = false;
SmallPtrSet<BasicBlock*, 8> DeleteList;
diff --git a/llvm/lib/Transforms/Utils/ModuleUtils.cpp b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
index b94f57e4dc2c..ef9f18a2289e 100644
--- a/llvm/lib/Transforms/Utils/ModuleUtils.cpp
+++ b/llvm/lib/Transforms/Utils/ModuleUtils.cpp
@@ -11,15 +11,17 @@
//===----------------------------------------------------------------------===//
#include "llvm/Transforms/Utils/ModuleUtils.h"
+#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Function.h"
#include "llvm/IR/IRBuilder.h"
#include "llvm/IR/Module.h"
#include "llvm/Support/raw_ostream.h"
-
using namespace llvm;
+#define DEBUG_TYPE "moduleutils"
+
static void appendToGlobalArray(const char *Array, Module &M, Function *F,
int Priority, Constant *Data) {
IRBuilder<> IRB(M.getContext());
@@ -117,6 +119,15 @@ llvm::declareSanitizerInitFunction(Module &M, StringRef InitName,
AttributeList());
}
+Function *llvm::createSanitizerCtor(Module &M, StringRef CtorName) {
+ Function *Ctor = Function::Create(
+ FunctionType::get(Type::getVoidTy(M.getContext()), false),
+ GlobalValue::InternalLinkage, CtorName, &M);
+ BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
+ ReturnInst::Create(M.getContext(), CtorBB);
+ return Ctor;
+}
+
std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions(
Module &M, StringRef CtorName, StringRef InitName,
ArrayRef<Type *> InitArgTypes, ArrayRef<Value *> InitArgs,
@@ -126,11 +137,8 @@ std::pair<Function *, FunctionCallee> llvm::createSanitizerCtorAndInitFunctions(
"Sanitizer's init function expects different number of arguments");
FunctionCallee InitFunction =
declareSanitizerInitFunction(M, InitName, InitArgTypes);
- Function *Ctor = Function::Create(
- FunctionType::get(Type::getVoidTy(M.getContext()), false),
- GlobalValue::InternalLinkage, CtorName, &M);
- BasicBlock *CtorBB = BasicBlock::Create(M.getContext(), "", Ctor);
- IRBuilder<> IRB(ReturnInst::Create(M.getContext(), CtorBB));
+ Function *Ctor = createSanitizerCtor(M, CtorName);
+ IRBuilder<> IRB(Ctor->getEntryBlock().getTerminator());
IRB.CreateCall(InitFunction, InitArgs);
if (!VersionCheckName.empty()) {
FunctionCallee VersionCheckFunction = M.getOrInsertFunction(
@@ -298,8 +306,9 @@ void VFABI::setVectorVariantNames(
Module *M = CI->getModule();
#ifndef NDEBUG
for (const std::string &VariantMapping : VariantMappings) {
- Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping);
- assert(VI.hasValue() && "Canno add an invalid VFABI name.");
+ LLVM_DEBUG(dbgs() << "VFABI: adding mapping '" << VariantMapping << "'\n");
+ Optional<VFInfo> VI = VFABI::tryDemangleForVFABI(VariantMapping, *M);
+ assert(VI.hasValue() && "Cannot add an invalid VFABI name.");
assert(M->getNamedValue(VI.getValue().VectorName) &&
"Cannot add variant to attribute: "
"vector function declaration is missing.");
diff --git a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
index 1c5c41abc682..7083789267d9 100644
--- a/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
+++ b/llvm/lib/Transforms/Utils/NameAnonGlobals.cpp
@@ -55,7 +55,7 @@ public:
Hasher.final(Hash);
SmallString<32> Result;
MD5::stringifyResult(Hash, Result);
- TheHash = Result.str();
+ TheHash = std::string(Result.str());
return TheHash;
}
};
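
The added std::string(...) wrapper (and the matching change in SSAUpdater.cpp further down) spells out the StringRef-to-std::string conversion explicitly, presumably in preparation for making that conversion non-implicit tree-wide. A minimal sketch of the equivalent spellings:

#include "llvm/ADT/SmallString.h"
#include <string>

// SmallString::str() returns a StringRef; the conversion to std::string is
// written out rather than relied on as an implicit operator.
static std::string hashToString(const llvm::SmallString<32> &Buf) {
  return std::string(Buf.str()); // equivalently: Buf.str().str()
}
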
diff --git a/llvm/lib/Transforms/Utils/PredicateInfo.cpp b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
index dda2867f44b2..99b64a7462f6 100644
--- a/llvm/lib/Transforms/Utils/PredicateInfo.cpp
+++ b/llvm/lib/Transforms/Utils/PredicateInfo.cpp
@@ -31,6 +31,7 @@
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
#include "llvm/InitializePasses.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/DebugCounter.h"
#include "llvm/Support/FormattedStream.h"
@@ -39,7 +40,6 @@
#define DEBUG_TYPE "predicateinfo"
using namespace llvm;
using namespace PatternMatch;
-using namespace llvm::PredicateInfoClasses;
INITIALIZE_PASS_BEGIN(PredicateInfoPrinterLegacyPass, "print-predicateinfo",
"PredicateInfo Printer", false, false)
@@ -83,7 +83,6 @@ getBlockEdge(const PredicateBase *PB) {
}
namespace llvm {
-namespace PredicateInfoClasses {
enum LocalNum {
// Operations that must appear first in the block.
LN_First,
@@ -109,8 +108,7 @@ struct ValueDFS {
};
// Perform a strict weak ordering on instructions and arguments.
-static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
- const Value *B) {
+static bool valueComesBefore(const Value *A, const Value *B) {
auto *ArgA = dyn_cast_or_null<Argument>(A);
auto *ArgB = dyn_cast_or_null<Argument>(B);
if (ArgA && !ArgB)
@@ -119,17 +117,14 @@ static bool valueComesBefore(OrderedInstructions &OI, const Value *A,
return false;
if (ArgA && ArgB)
return ArgA->getArgNo() < ArgB->getArgNo();
- return OI.dfsBefore(cast<Instruction>(A), cast<Instruction>(B));
+ return cast<Instruction>(A)->comesBefore(cast<Instruction>(B));
}
-// This compares ValueDFS structures, creating OrderedBasicBlocks where
-// necessary to compare uses/defs in the same block. Doing so allows us to walk
-// the minimum number of instructions necessary to compute our def/use ordering.
+// This compares ValueDFS structures. Doing so allows us to walk the minimum
+// number of instructions necessary to compute our def/use ordering.
struct ValueDFS_Compare {
DominatorTree &DT;
- OrderedInstructions &OI;
- ValueDFS_Compare(DominatorTree &DT, OrderedInstructions &OI)
- : DT(DT), OI(OI) {}
+ ValueDFS_Compare(DominatorTree &DT) : DT(DT) {}
bool operator()(const ValueDFS &A, const ValueDFS &B) const {
if (&A == &B)
@@ -210,14 +205,14 @@ struct ValueDFS_Compare {
// numbering will say the placed predicaeinfos should go first (IE
// LN_beginning), so we won't be in this function. For assumes, we will end
// up here, beause we need to order the def we will place relative to the
- // assume. So for the purpose of ordering, we pretend the def is the assume
- // because that is where we will insert the info.
+ // assume. So for the purpose of ordering, we pretend the def is right
+ // after the assume, because that is where we will insert the info.
if (!VD.U) {
assert(VD.PInfo &&
"No def, no use, and no predicateinfo should not occur");
assert(isa<PredicateAssume>(VD.PInfo) &&
"Middle of block should only occur for assumes");
- return cast<PredicateAssume>(VD.PInfo)->AssumeInst;
+ return cast<PredicateAssume>(VD.PInfo)->AssumeInst->getNextNode();
}
return nullptr;
}
@@ -243,18 +238,71 @@ struct ValueDFS_Compare {
auto *ArgB = dyn_cast_or_null<Argument>(BDef);
if (ArgA || ArgB)
- return valueComesBefore(OI, ArgA, ArgB);
+ return valueComesBefore(ArgA, ArgB);
auto *AInst = getDefOrUser(ADef, A.U);
auto *BInst = getDefOrUser(BDef, B.U);
- return valueComesBefore(OI, AInst, BInst);
+ return valueComesBefore(AInst, BInst);
}
};
-} // namespace PredicateInfoClasses
+class PredicateInfoBuilder {
+ // Used to store information about each value we might rename.
+ struct ValueInfo {
+ SmallVector<PredicateBase *, 4> Infos;
+ };
+
+ PredicateInfo &PI;
+ Function &F;
+ DominatorTree &DT;
+ AssumptionCache &AC;
+
+ // This stores info about each operand or comparison result we make copies
+ // of. The real ValueInfos start at index 1, index 0 is unused so that we
+ // can more easily detect invalid indexing.
+ SmallVector<ValueInfo, 32> ValueInfos;
+
+ // This gives the index into the ValueInfos array for a given Value. Because
+ // 0 is not a valid Value Info index, you can use DenseMap::lookup and tell
+ // whether it returned a valid result.
+ DenseMap<Value *, unsigned int> ValueInfoNums;
+
+ // The set of edges along which we can only handle phi uses, due to critical
+ // edges.
+ DenseSet<std::pair<BasicBlock *, BasicBlock *>> EdgeUsesOnly;
+
+ ValueInfo &getOrCreateValueInfo(Value *);
+ const ValueInfo &getValueInfo(Value *) const;
+
+ void processAssume(IntrinsicInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void processBranch(BranchInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void processSwitch(SwitchInst *, BasicBlock *,
+ SmallVectorImpl<Value *> &OpsToRename);
+ void renameUses(SmallVectorImpl<Value *> &OpsToRename);
+ void addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
+ PredicateBase *PB);
+
+ typedef SmallVectorImpl<ValueDFS> ValueDFSStack;
+ void convertUsesToDFSOrdered(Value *, SmallVectorImpl<ValueDFS> &);
+ Value *materializeStack(unsigned int &, ValueDFSStack &, Value *);
+ bool stackIsInScope(const ValueDFSStack &, const ValueDFS &) const;
+ void popStackUntilDFSScope(ValueDFSStack &, const ValueDFS &);
+
+public:
+ PredicateInfoBuilder(PredicateInfo &PI, Function &F, DominatorTree &DT,
+ AssumptionCache &AC)
+ : PI(PI), F(F), DT(DT), AC(AC) {
+ // Push an empty operand info so that we can detect 0 as not finding one
+ ValueInfos.resize(1);
+ }
+
+ void buildPredicateInfo();
+};
-bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack,
- const ValueDFS &VDUse) const {
+bool PredicateInfoBuilder::stackIsInScope(const ValueDFSStack &Stack,
+ const ValueDFS &VDUse) const {
if (Stack.empty())
return false;
// If it's a phi only use, make sure it's for this phi node edge, and that the
@@ -281,15 +329,15 @@ bool PredicateInfo::stackIsInScope(const ValueDFSStack &Stack,
VDUse.DFSOut <= Stack.back().DFSOut);
}
-void PredicateInfo::popStackUntilDFSScope(ValueDFSStack &Stack,
- const ValueDFS &VD) {
+void PredicateInfoBuilder::popStackUntilDFSScope(ValueDFSStack &Stack,
+ const ValueDFS &VD) {
while (!Stack.empty() && !stackIsInScope(Stack, VD))
Stack.pop_back();
}
// Convert the uses of Op into a vector of uses, associating global and local
// DFS info with each one.
-void PredicateInfo::convertUsesToDFSOrdered(
+void PredicateInfoBuilder::convertUsesToDFSOrdered(
Value *Op, SmallVectorImpl<ValueDFS> &DFSOrderedSet) {
for (auto &U : Op->uses()) {
if (auto *I = dyn_cast<Instruction>(U.getUser())) {
@@ -338,19 +386,20 @@ void collectCmpOps(CmpInst *Comparison, SmallVectorImpl<Value *> &CmpOperands) {
}
// Add Op, PB to the list of value infos for Op, and mark Op to be renamed.
-void PredicateInfo::addInfoFor(SmallVectorImpl<Value *> &OpsToRename, Value *Op,
- PredicateBase *PB) {
+void PredicateInfoBuilder::addInfoFor(SmallVectorImpl<Value *> &OpsToRename,
+ Value *Op, PredicateBase *PB) {
auto &OperandInfo = getOrCreateValueInfo(Op);
if (OperandInfo.Infos.empty())
OpsToRename.push_back(Op);
- AllInfos.push_back(PB);
+ PI.AllInfos.push_back(PB);
OperandInfo.Infos.push_back(PB);
}
// Process an assume instruction and place relevant operations we want to rename
// into OpsToRename.
-void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB,
- SmallVectorImpl<Value *> &OpsToRename) {
+void PredicateInfoBuilder::processAssume(
+ IntrinsicInst *II, BasicBlock *AssumeBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
// See if we have a comparison we support
SmallVector<Value *, 8> CmpOperands;
SmallVector<Value *, 2> ConditionsToProcess;
@@ -389,8 +438,9 @@ void PredicateInfo::processAssume(IntrinsicInst *II, BasicBlock *AssumeBB,
// Process a block terminating branch, and place relevant operations to be
// renamed into OpsToRename.
-void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB,
- SmallVectorImpl<Value *> &OpsToRename) {
+void PredicateInfoBuilder::processBranch(
+ BranchInst *BI, BasicBlock *BranchBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
BasicBlock *FirstBB = BI->getSuccessor(0);
BasicBlock *SecondBB = BI->getSuccessor(1);
SmallVector<BasicBlock *, 2> SuccsToProcess;
@@ -459,8 +509,9 @@ void PredicateInfo::processBranch(BranchInst *BI, BasicBlock *BranchBB,
}
// Process a block terminating switch, and place relevant operations to be
// renamed into OpsToRename.
-void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB,
- SmallVectorImpl<Value *> &OpsToRename) {
+void PredicateInfoBuilder::processSwitch(
+ SwitchInst *SI, BasicBlock *BranchBB,
+ SmallVectorImpl<Value *> &OpsToRename) {
Value *Op = SI->getCondition();
if ((!isa<Instruction>(Op) && !isa<Argument>(Op)) || Op->hasOneUse())
return;
@@ -486,7 +537,7 @@ void PredicateInfo::processSwitch(SwitchInst *SI, BasicBlock *BranchBB,
}
// Build predicate info for our function
-void PredicateInfo::buildPredicateInfo() {
+void PredicateInfoBuilder::buildPredicateInfo() {
DT.updateDFSNumbers();
// Collect operands to rename from all conditional branch terminators, as well
// as assume statements.
@@ -530,9 +581,9 @@ static Function *getCopyDeclaration(Module *M, Type *Ty) {
// Given the renaming stack, make all the operands currently on the stack real
// by inserting them into the IR. Return the last operation's value.
-Value *PredicateInfo::materializeStack(unsigned int &Counter,
- ValueDFSStack &RenameStack,
- Value *OrigOp) {
+Value *PredicateInfoBuilder::materializeStack(unsigned int &Counter,
+ ValueDFSStack &RenameStack,
+ Value *OrigOp) {
// Find the first thing we have to materialize
auto RevIter = RenameStack.rbegin();
for (; RevIter != RenameStack.rend(); ++RevIter)
@@ -549,6 +600,9 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
RenameIter == RenameStack.begin() ? OrigOp : (RenameIter - 1)->Def;
ValueDFS &Result = *RenameIter;
auto *ValInfo = Result.PInfo;
+ ValInfo->RenamedOp = (RenameStack.end() - Start) == RenameStack.begin()
+ ? OrigOp
+ : (RenameStack.end() - Start - 1)->Def;
// For edge predicates, we can just place the operand in the block before
// the terminator. For assume, we have to place it right before the assume
// to ensure we dominate all of our uses. Always insert right before the
@@ -558,21 +612,23 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
IRBuilder<> B(getBranchTerminator(ValInfo));
Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
if (IF->users().empty())
- CreatedDeclarations.insert(IF);
+ PI.CreatedDeclarations.insert(IF);
CallInst *PIC =
B.CreateCall(IF, Op, Op->getName() + "." + Twine(Counter++));
- PredicateMap.insert({PIC, ValInfo});
+ PI.PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
} else {
auto *PAssume = dyn_cast<PredicateAssume>(ValInfo);
assert(PAssume &&
"Should not have gotten here without it being an assume");
- IRBuilder<> B(PAssume->AssumeInst);
+ // Insert the predicate directly after the assume. While it also holds
+ // directly before it, assume(i1 true) is not a useful fact.
+ IRBuilder<> B(PAssume->AssumeInst->getNextNode());
Function *IF = getCopyDeclaration(F.getParent(), Op->getType());
if (IF->users().empty())
- CreatedDeclarations.insert(IF);
+ PI.CreatedDeclarations.insert(IF);
CallInst *PIC = B.CreateCall(IF, Op);
- PredicateMap.insert({PIC, ValInfo});
+ PI.PredicateMap.insert({PIC, ValInfo});
Result.Def = PIC;
}
}
@@ -598,8 +654,8 @@ Value *PredicateInfo::materializeStack(unsigned int &Counter,
//
// TODO: Use this algorithm to perform fast single-variable renaming in
// promotememtoreg and memoryssa.
-void PredicateInfo::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
- ValueDFS_Compare Compare(DT, OI);
+void PredicateInfoBuilder::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
+ ValueDFS_Compare Compare(DT);
// Compute liveness, and rename in O(uses) per Op.
for (auto *Op : OpsToRename) {
LLVM_DEBUG(dbgs() << "Visiting " << *Op << "\n");
@@ -719,7 +775,8 @@ void PredicateInfo::renameUses(SmallVectorImpl<Value *> &OpsToRename) {
}
}
-PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) {
+PredicateInfoBuilder::ValueInfo &
+PredicateInfoBuilder::getOrCreateValueInfo(Value *Operand) {
auto OIN = ValueInfoNums.find(Operand);
if (OIN == ValueInfoNums.end()) {
// This will grow it
@@ -732,8 +789,8 @@ PredicateInfo::ValueInfo &PredicateInfo::getOrCreateValueInfo(Value *Operand) {
return ValueInfos[OIN->second];
}
-const PredicateInfo::ValueInfo &
-PredicateInfo::getValueInfo(Value *Operand) const {
+const PredicateInfoBuilder::ValueInfo &
+PredicateInfoBuilder::getValueInfo(Value *Operand) const {
auto OINI = ValueInfoNums.lookup(Operand);
assert(OINI != 0 && "Operand was not really in the Value Info Numbers");
assert(OINI < ValueInfos.size() &&
@@ -743,10 +800,9 @@ PredicateInfo::getValueInfo(Value *Operand) const {
PredicateInfo::PredicateInfo(Function &F, DominatorTree &DT,
AssumptionCache &AC)
- : F(F), DT(DT), AC(AC), OI(&DT) {
- // Push an empty operand info so that we can detect 0 as not finding one
- ValueInfos.resize(1);
- buildPredicateInfo();
+ : F(F) {
+ PredicateInfoBuilder Builder(*this, F, DT, AC);
+ Builder.buildPredicateInfo();
}
// Remove all declarations we created . The PredicateInfo consumers are
@@ -829,11 +885,11 @@ class PredicateInfoAnnotatedWriter : public AssemblyAnnotationWriter {
public:
PredicateInfoAnnotatedWriter(const PredicateInfo *M) : PredInfo(M) {}
- virtual void emitBasicBlockStartAnnot(const BasicBlock *BB,
- formatted_raw_ostream &OS) {}
+ void emitBasicBlockStartAnnot(const BasicBlock *BB,
+ formatted_raw_ostream &OS) override {}
- virtual void emitInstructionAnnot(const Instruction *I,
- formatted_raw_ostream &OS) {
+ void emitInstructionAnnot(const Instruction *I,
+ formatted_raw_ostream &OS) override {
if (const auto *PI = PredInfo->getPredicateInfoFor(I)) {
OS << "; Has predicate info\n";
if (const auto *PB = dyn_cast<PredicateBranch>(PI)) {
@@ -842,18 +898,21 @@ public:
PB->From->printAsOperand(OS);
OS << ",";
PB->To->printAsOperand(OS);
- OS << "] }\n";
+ OS << "]";
} else if (const auto *PS = dyn_cast<PredicateSwitch>(PI)) {
OS << "; switch predicate info { CaseValue: " << *PS->CaseValue
<< " Switch:" << *PS->Switch << " Edge: [";
PS->From->printAsOperand(OS);
OS << ",";
PS->To->printAsOperand(OS);
- OS << "] }\n";
+ OS << "]";
} else if (const auto *PA = dyn_cast<PredicateAssume>(PI)) {
OS << "; assume predicate info {"
- << " Comparison:" << *PA->Condition << " }\n";
+ << " Comparison:" << *PA->Condition;
}
+ OS << ", RenamedOp: ";
+ PI->RenamedOp->printAsOperand(OS, false);
+ OS << " }\n";
}
}
};
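
Two structural changes run through the PredicateInfo diff above: the renaming machinery moves into a transient PredicateInfoBuilder so the analysis result no longer carries DT/AC/OrderedInstructions members, and intra-block ordering queries switch from OrderedInstructions::dfsBefore to Instruction::comesBefore, which compares lazily maintained per-block instruction numbers. A minimal sketch of the latter query (the helper name is illustrative):

#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Instruction.h"
#include <cassert>

// True if A executes strictly before B. comesBefore() requires both
// instructions to be in the same basic block and reuses a cached per-block
// numbering, so repeated queries do not rescan the block.
static bool precedesInBlock(const llvm::Instruction *A,
                            const llvm::Instruction *B) {
  assert(A->getParent() == B->getParent() && "must be in the same block");
  return A->comesBefore(B);
}
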
diff --git a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
index d58e1ea574ef..c7e9c919ec47 100644
--- a/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
+++ b/llvm/lib/Transforms/Utils/PromoteMemoryToRegister.cpp
@@ -595,11 +595,6 @@ void PromoteMem2Reg::run() {
// Keep the reverse mapping of the 'Allocas' array for the rename pass.
AllocaLookup[Allocas[AllocaNum]] = AllocaNum;
- // At this point, we're committed to promoting the alloca using IDF's, and
- // the standard SSA construction algorithm. Determine which blocks need PHI
- // nodes and see if we can optimize out some work by avoiding insertion of
- // dead phi nodes.
-
// Unique the set of defining blocks for efficient lookup.
SmallPtrSet<BasicBlock *, 32> DefBlocks(Info.DefiningBlocks.begin(),
Info.DefiningBlocks.end());
diff --git a/llvm/lib/Transforms/Utils/SSAUpdater.cpp b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
index bffdd115d940..57df2334c750 100644
--- a/llvm/lib/Transforms/Utils/SSAUpdater.cpp
+++ b/llvm/lib/Transforms/Utils/SSAUpdater.cpp
@@ -56,7 +56,7 @@ void SSAUpdater::Initialize(Type *Ty, StringRef Name) {
else
getAvailableVals(AV).clear();
ProtoType = Ty;
- ProtoName = Name;
+ ProtoName = std::string(Name);
}
bool SSAUpdater::HasValueForBlock(BasicBlock *BB) const {
@@ -195,11 +195,6 @@ void SSAUpdater::RewriteUse(Use &U) {
else
V = GetValueInMiddleOfBlock(User->getParent());
- // Notify that users of the existing value that it is being replaced.
- Value *OldVal = U.get();
- if (OldVal != V && OldVal->hasValueHandle())
- ValueHandleBase::ValueIsRAUWd(OldVal, V);
-
U.set(V);
}
diff --git a/llvm/lib/Analysis/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
index dc5d02aa3a3c..71b48482f26a 100644
--- a/llvm/lib/Analysis/ScalarEvolutionExpander.cpp
+++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp
@@ -12,7 +12,7 @@
//
//===----------------------------------------------------------------------===//
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/ADT/STLExtras.h"
#include "llvm/ADT/SmallSet.h"
#include "llvm/Analysis/InstructionSimplify.h"
@@ -24,10 +24,17 @@
#include "llvm/IR/LLVMContext.h"
#include "llvm/IR/Module.h"
#include "llvm/IR/PatternMatch.h"
+#include "llvm/Support/CommandLine.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
using namespace llvm;
+
+cl::opt<unsigned> llvm::SCEVCheapExpansionBudget(
+ "scev-cheap-expansion-budget", cl::Hidden, cl::init(4),
+ cl::desc("When performing SCEV expansion only if it is cheap to do, this "
+ "controls the budget that is considered cheap (default = 4)"));
+
using namespace PatternMatch;
/// ReuseOrCreateCast - Arrange for there to be a cast of V to Ty at IP,
@@ -280,14 +287,14 @@ static bool FactorOutConstant(const SCEV *&S, const SCEV *&Remainder,
if (const SCEVMulExpr *M = dyn_cast<SCEVMulExpr>(S)) {
// Size is known, check if there is a constant operand which is a multiple
// of the given factor. If so, we can factor it.
- const SCEVConstant *FC = cast<SCEVConstant>(Factor);
- if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
- if (!C->getAPInt().srem(FC->getAPInt())) {
- SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end());
- NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt()));
- S = SE.getMulExpr(NewMulOps);
- return true;
- }
+ if (const SCEVConstant *FC = dyn_cast<SCEVConstant>(Factor))
+ if (const SCEVConstant *C = dyn_cast<SCEVConstant>(M->getOperand(0)))
+ if (!C->getAPInt().srem(FC->getAPInt())) {
+ SmallVector<const SCEV *, 4> NewMulOps(M->op_begin(), M->op_end());
+ NewMulOps[0] = SE.getConstant(C->getAPInt().sdiv(FC->getAPInt()));
+ S = SE.getMulExpr(NewMulOps);
+ return true;
+ }
}
// In an AddRec, check if both start and step are divisible.
@@ -497,6 +504,10 @@ Value *SCEVExpander::expandAddToGEP(const SCEV *const *op_begin,
if (ArrayType *ATy = dyn_cast<ArrayType>(ElTy))
ElTy = ATy->getElementType();
else
+ // FIXME: Handle VectorType.
+ // E.g., if ElTy is a scalable vector, then ElSize is not a compile-time
+ // constant and therefore cannot be factored out. The generated IR is less
+ // ideal, with the base 'V' cast to i8* and an ugly getelementptr over that.
break;
}
@@ -2129,84 +2140,190 @@ SCEVExpander::getRelatedExistingExpansion(const SCEV *S, const Instruction *At,
}
bool SCEVExpander::isHighCostExpansionHelper(
- const SCEV *S, Loop *L, const Instruction *At,
- SmallPtrSetImpl<const SCEV *> &Processed) {
+ const SCEV *S, Loop *L, const Instruction &At, int &BudgetRemaining,
+ const TargetTransformInfo &TTI, SmallPtrSetImpl<const SCEV *> &Processed,
+ SmallVectorImpl<const SCEV *> &Worklist) {
+ if (BudgetRemaining < 0)
+ return true; // Already run out of budget, give up.
+
+ // Was the cost of expansion of this expression already accounted for?
+ if (!Processed.insert(S).second)
+ return false; // We have already accounted for this expression.
// If we can find an existing value for this scev available at the point "At"
// then consider the expression cheap.
- if (At && getRelatedExistingExpansion(S, At, L))
- return false;
+ if (getRelatedExistingExpansion(S, &At, L))
+ return false; // Consider the expression to be free.
- // Zero/One operand expressions
switch (S->getSCEVType()) {
case scUnknown:
case scConstant:
- return false;
- case scTruncate:
- return isHighCostExpansionHelper(cast<SCEVTruncateExpr>(S)->getOperand(),
- L, At, Processed);
- case scZeroExtend:
- return isHighCostExpansionHelper(cast<SCEVZeroExtendExpr>(S)->getOperand(),
- L, At, Processed);
- case scSignExtend:
- return isHighCostExpansionHelper(cast<SCEVSignExtendExpr>(S)->getOperand(),
- L, At, Processed);
+ return false; // Assume to be zero-cost.
}
- if (!Processed.insert(S).second)
- return false;
+ TargetTransformInfo::TargetCostKind CostKind =
+ TargetTransformInfo::TCK_RecipThroughput;
+
+ if (auto *CastExpr = dyn_cast<SCEVCastExpr>(S)) {
+ unsigned Opcode;
+ switch (S->getSCEVType()) {
+ case scTruncate:
+ Opcode = Instruction::Trunc;
+ break;
+ case scZeroExtend:
+ Opcode = Instruction::ZExt;
+ break;
+ case scSignExtend:
+ Opcode = Instruction::SExt;
+ break;
+ default:
+ llvm_unreachable("There are no other cast types.");
+ }
+ const SCEV *Op = CastExpr->getOperand();
+ BudgetRemaining -= TTI.getCastInstrCost(Opcode, /*Dst=*/S->getType(),
+ /*Src=*/Op->getType(), CostKind);
+ Worklist.emplace_back(Op);
+ return false; // Will answer upon next entry into this function.
+ }
if (auto *UDivExpr = dyn_cast<SCEVUDivExpr>(S)) {
- // If the divisor is a power of two and the SCEV type fits in a native
- // integer (and the LHS not expensive), consider the division cheap
- // irrespective of whether it occurs in the user code since it can be
- // lowered into a right shift.
- if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS()))
+ // If the divisor is a power of two, count this as a logical right-shift.
+ if (auto *SC = dyn_cast<SCEVConstant>(UDivExpr->getRHS())) {
if (SC->getAPInt().isPowerOf2()) {
- if (isHighCostExpansionHelper(UDivExpr->getLHS(), L, At, Processed))
- return true;
- const DataLayout &DL =
- L->getHeader()->getParent()->getParent()->getDataLayout();
- unsigned Width = cast<IntegerType>(UDivExpr->getType())->getBitWidth();
- return DL.isIllegalInteger(Width);
+ BudgetRemaining -=
+ TTI.getArithmeticInstrCost(Instruction::LShr, S->getType(),
+ CostKind);
+ // Note that we don't count the cost of RHS, because it is a constant,
+ // and we consider those to be free. But if that changes, we would need
+ // to log2() it first before calling isHighCostExpansionHelper().
+ Worklist.emplace_back(UDivExpr->getLHS());
+ return false; // Will answer upon next entry into this function.
}
+ }
// UDivExpr is very likely a UDiv that ScalarEvolution's HowFarToZero or
// HowManyLessThans produced to compute a precise expression, rather than a
// UDiv from the user's code. If we can't find a UDiv in the code with some
- // simple searching, assume the former consider UDivExpr expensive to
- // compute.
- BasicBlock *ExitingBB = L->getExitingBlock();
- if (!ExitingBB)
+ // simple searching, we need to account for its cost.
+
+ // At the beginning of this function we already tried to find existing
+ // value for plain 'S'. Now try to lookup 'S + 1' since it is common
+ // pattern involving division. This is just a simple search heuristic.
+ if (getRelatedExistingExpansion(
+ SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), &At, L))
+ return false; // Consider it to be free.
+
+ // Need to count the cost of this UDiv.
+ BudgetRemaining -=
+ TTI.getArithmeticInstrCost(Instruction::UDiv, S->getType(),
+ CostKind);
+ Worklist.insert(Worklist.end(), {UDivExpr->getLHS(), UDivExpr->getRHS()});
+ return false; // Will answer upon next entry into this function.
+ }
+
+ if (const auto *NAry = dyn_cast<SCEVAddRecExpr>(S)) {
+ Type *OpType = NAry->getType();
+
+ assert(NAry->getNumOperands() >= 2 &&
+ "Polynomial should be at least linear");
+
+ int AddCost =
+ TTI.getArithmeticInstrCost(Instruction::Add, OpType, CostKind);
+ int MulCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, OpType, CostKind);
+
+ // In this polynomial, we may have some zero operands, and we shouldn't
+ // really charge for those. So how many non-zero coefficients are there?
+ int NumTerms = llvm::count_if(NAry->operands(),
+ [](const SCEV *S) { return !S->isZero(); });
+ assert(NumTerms >= 1 && "Polynominal should have at least one term.");
+ assert(!(*std::prev(NAry->operands().end()))->isZero() &&
+ "Last operand should not be zero");
+
+ // Much like with a normal add expr, the polynomial will require
+ // one less addition than the number of its terms.
+ BudgetRemaining -= AddCost * (NumTerms - 1);
+ if (BudgetRemaining < 0)
return true;
- // At the beginning of this function we already tried to find existing value
- // for plain 'S'. Now try to lookup 'S + 1' since it is common pattern
- // involving division. This is just a simple search heuristic.
- if (!At)
- At = &ExitingBB->back();
- if (!getRelatedExistingExpansion(
- SE.getAddExpr(S, SE.getConstant(S->getType(), 1)), At, L))
+ // Ignoring the constant term (operand 0), how many of the coefficients are u> 1?
+ int NumNonZeroDegreeNonOneTerms =
+ llvm::count_if(make_range(std::next(NAry->op_begin()), NAry->op_end()),
+ [](const SCEV *S) {
+ auto *SConst = dyn_cast<SCEVConstant>(S);
+ return !SConst || SConst->getAPInt().ugt(1);
+ });
+ // Here, *each* one of those will require a multiplication.
+ BudgetRemaining -= MulCost * NumNonZeroDegreeNonOneTerms;
+ if (BudgetRemaining < 0)
return true;
- }
- // HowManyLessThans uses a Max expression whenever the loop is not guarded by
- // the exit condition.
- if (isa<SCEVMinMaxExpr>(S))
- return true;
+ // What is the degree of this polynomial?
+ int PolyDegree = NAry->getNumOperands() - 1;
+ assert(PolyDegree >= 1 && "Should be at least affine.");
+
+ // The final term will be:
+ // Op_{PolyDegree} * x ^ {PolyDegree}
+ // Where x ^ {PolyDegree} will again require PolyDegree-1 mul operations.
+ // Note that x ^ {PolyDegree} = x * x ^ {PolyDegree-1} so charging for
+ // x ^ {PolyDegree} will give us x ^ {2} .. x ^ {PolyDegree-1} for free.
+ // FIXME: this is conservatively correct, but might be overly pessimistic.
+ BudgetRemaining -= MulCost * (PolyDegree - 1);
+ if (BudgetRemaining < 0)
+ return true;
+
+ // And finally, the operands themselves should fit within the budget.
+ Worklist.insert(Worklist.end(), NAry->operands().begin(),
+ NAry->operands().end());
+ return false; // So far so good, though ops may be too costly?
+ }
- // Recurse past nary expressions, which commonly occur in the
- // BackedgeTakenCount. They may already exist in program code, and if not,
- // they are not too expensive rematerialize.
if (const SCEVNAryExpr *NAry = dyn_cast<SCEVNAryExpr>(S)) {
- for (auto *Op : NAry->operands())
- if (isHighCostExpansionHelper(Op, L, At, Processed))
- return true;
+ Type *OpType = NAry->getType();
+
+ int PairCost;
+ switch (S->getSCEVType()) {
+ case scAddExpr:
+ PairCost =
+ TTI.getArithmeticInstrCost(Instruction::Add, OpType, CostKind);
+ break;
+ case scMulExpr:
+ // TODO: this is a very pessimistic cost modelling for Mul, because of
+ // the binary-powering (Bin Pow) algorithm actually used by the expander;
+ // see SCEVExpander::visitMulExpr(), ExpandOpBinPowN().
+ PairCost =
+ TTI.getArithmeticInstrCost(Instruction::Mul, OpType, CostKind);
+ break;
+ case scSMaxExpr:
+ case scUMaxExpr:
+ case scSMinExpr:
+ case scUMinExpr:
+ PairCost = TTI.getCmpSelInstrCost(Instruction::ICmp, OpType,
+ CmpInst::makeCmpResultType(OpType),
+ CostKind) +
+ TTI.getCmpSelInstrCost(Instruction::Select, OpType,
+ CmpInst::makeCmpResultType(OpType),
+ CostKind);
+ break;
+ default:
+ llvm_unreachable("There are no other variants here.");
+ }
+
+ assert(NAry->getNumOperands() > 1 &&
+ "Nary expr should have more than 1 operand.");
+ // The simple nary expr will require one less op (or pair of ops)
+ // than the number of its terms.
+ BudgetRemaining -= PairCost * (NAry->getNumOperands() - 1);
+ if (BudgetRemaining < 0)
+ return true;
+
+ // And finally, the operands themselves should fit within the budget.
+ Worklist.insert(Worklist.end(), NAry->operands().begin(),
+ NAry->operands().end());
+ return false; // So far so good, though ops may be too costly?
}
- // If we haven't recognized an expensive SCEV pattern, assume it's an
- // expression produced by program code.
- return false;
+ llvm_unreachable("No other scev expressions possible.");
}
Value *SCEVExpander::expandCodeForPredicate(const SCEVPredicate *Pred,
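
The rewritten isHighCostExpansionHelper() above replaces the old structural heuristics with TTI-based cost accounting: every visited SCEV charges its estimated instruction cost against BudgetRemaining and pushes its operands onto Worklist, and expansion is declared high-cost as soon as the budget (default 4, see -scev-cheap-expansion-budget) goes negative. As a rough worked example under an assumed unit cost per Add/Mul: an affine recurrence {A,+,B} with both operands non-zero charges one Add, plus one Mul unless B is the constant 1, which still leaves budget for expanding A and B themselves. A generic sketch of the same budget-plus-worklist shape, detached from SCEV and TTI (all names below are illustrative):

#include <unordered_set>
#include <vector>

// Generic shape of the cost check above: walk an expression DAG, charge each
// node against a budget via a caller-supplied cost callback, and bail out as
// soon as the budget goes negative. Node, costOf and operandsOf stand in for
// the SCEV node type, the TTI cost queries and SCEV::operands().
template <typename Node, typename CostFn, typename OpsFn>
bool isHighCost(const Node *Root, int Budget, CostFn costOf, OpsFn operandsOf) {
  std::vector<const Node *> Worklist{Root};
  std::unordered_set<const Node *> Processed;
  while (!Worklist.empty()) {
    const Node *N = Worklist.back();
    Worklist.pop_back();
    if (!Processed.insert(N).second)
      continue; // Already accounted for this sub-expression.
    Budget -= costOf(N);
    if (Budget < 0)
      return true; // Out of budget: treat the expansion as high-cost.
    for (const Node *Op : operandsOf(N))
      Worklist.push_back(Op);
  }
  return false; // Everything fit within the budget.
}
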
diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
index d93ca4f04cdb..b450d71c996c 100644
--- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp
@@ -33,7 +33,6 @@
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/CFG.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/ConstantRange.h"
#include "llvm/IR/Constants.h"
@@ -134,6 +133,11 @@ static cl::opt<unsigned> MaxSpeculationDepth(
cl::desc("Limit maximum recursion depth when calculating costs of "
"speculatively executed instructions"));
+static cl::opt<int>
+MaxSmallBlockSize("simplifycfg-max-small-block-size", cl::Hidden, cl::init(10),
+ cl::desc("Max size of a block which is still considered "
+ "small enough to thread through"));
+
STATISTIC(NumBitMaps, "Number of switch instructions turned into bitmaps");
STATISTIC(NumLinearMaps,
"Number of switch instructions turned into linear mapping");
@@ -192,20 +196,34 @@ class SimplifyCFGOpt {
bool FoldValueComparisonIntoPredecessors(Instruction *TI,
IRBuilder<> &Builder);
- bool SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
- bool SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
- bool SimplifySingleResume(ResumeInst *RI);
- bool SimplifyCommonResume(ResumeInst *RI);
- bool SimplifyCleanupReturn(CleanupReturnInst *RI);
- bool SimplifyUnreachable(UnreachableInst *UI);
- bool SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
- bool SimplifyIndirectBr(IndirectBrInst *IBI);
- bool SimplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
- bool SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder);
+ bool simplifyResume(ResumeInst *RI, IRBuilder<> &Builder);
+ bool simplifySingleResume(ResumeInst *RI);
+ bool simplifyCommonResume(ResumeInst *RI);
+ bool simplifyCleanupReturn(CleanupReturnInst *RI);
+ bool simplifyUnreachable(UnreachableInst *UI);
+ bool simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder);
+ bool simplifyIndirectBr(IndirectBrInst *IBI);
+ bool simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder);
+ bool simplifyUncondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder);
+ bool SimplifyCondBranchToTwoReturns(BranchInst *BI, IRBuilder<> &Builder);
bool tryToSimplifyUncondBranchWithICmpInIt(ICmpInst *ICI,
IRBuilder<> &Builder);
+ bool HoistThenElseCodeToIf(BranchInst *BI, const TargetTransformInfo &TTI);
+ bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI);
+ bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
+ BasicBlock *TrueBB, BasicBlock *FalseBB,
+ uint32_t TrueWeight, uint32_t FalseWeight);
+ bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
+ const DataLayout &DL);
+ bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select);
+ bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI);
+ bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder);
+
public:
SimplifyCFGOpt(const TargetTransformInfo &TTI, const DataLayout &DL,
SmallPtrSetImpl<BasicBlock *> *LoopHeaders,
@@ -317,7 +335,7 @@ static unsigned ComputeSpeculationCost(const User *I,
const TargetTransformInfo &TTI) {
assert(isSafeToSpeculativelyExecute(I) &&
"Instruction is not safe to speculatively execute!");
- return TTI.getUserCost(I);
+ return TTI.getUserCost(I, TargetTransformInfo::TCK_SizeAndLatency);
}
/// If we have a merge point of an "if condition" as accepted above,
@@ -1235,8 +1253,8 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I);
/// Given a conditional branch that goes to BB1 and BB2, hoist any common code
/// in the two blocks up into the branch block. The caller of this function
/// guarantees that BI's block dominates BB1 and BB2.
-static bool HoistThenElseCodeToIf(BranchInst *BI,
- const TargetTransformInfo &TTI) {
+bool SimplifyCFGOpt::HoistThenElseCodeToIf(BranchInst *BI,
+ const TargetTransformInfo &TTI) {
// This does very trivial matching, with limited scanning, to find identical
// instructions in the two blocks. In particular, we don't want to get into
// O(M*N) situations here where M and N are the sizes of BB1 and BB2. As
@@ -1287,6 +1305,14 @@ static bool HoistThenElseCodeToIf(BranchInst *BI,
if (!TTI.isProfitableToHoist(I1) || !TTI.isProfitableToHoist(I2))
return Changed;
+ // If any of the two call sites has nomerge attribute, stop hoisting.
+ if (const auto *CB1 = dyn_cast<CallBase>(I1))
+ if (CB1->cannotMerge())
+ return Changed;
+ if (const auto *CB2 = dyn_cast<CallBase>(I2))
+ if (CB2->cannotMerge())
+ return Changed;
+
if (isa<DbgInfoIntrinsic>(I1) || isa<DbgInfoIntrinsic>(I2)) {
assert (isa<DbgInfoIntrinsic>(I1) && isa<DbgInfoIntrinsic>(I2));
// The debug location is an integral part of a debug info intrinsic
@@ -1444,6 +1470,13 @@ static bool isLifeTimeMarker(const Instruction *I) {
return false;
}
+// TODO: Refine this. This should avoid cases like turning constant memcpy sizes
+// into variables.
+static bool replacingOperandWithVariableIsCheap(const Instruction *I,
+ int OpIdx) {
+ return !isa<IntrinsicInst>(I);
+}
+
// All instructions in Insts belong to different blocks that all unconditionally
// branch to a common successor. Analyze each instruction and return true if it
// would be possible to sink them into their successor, creating one common
@@ -1465,8 +1498,9 @@ static bool canSinkInstructions(
// Conservatively return false if I is an inline-asm instruction. Sinking
// and merging inline-asm instructions can potentially create arguments
// that cannot satisfy the inline-asm constraints.
+ // If the instruction has the nomerge attribute, return false.
if (const auto *C = dyn_cast<CallBase>(I))
- if (C->isInlineAsm())
+ if (C->isInlineAsm() || C->cannotMerge())
return false;
// Each instruction must have zero or one use.
@@ -1521,7 +1555,8 @@ static bool canSinkInstructions(
return false;
for (unsigned OI = 0, OE = I0->getNumOperands(); OI != OE; ++OI) {
- if (I0->getOperand(OI)->getType()->isTokenTy())
+ Value *Op = I0->getOperand(OI);
+ if (Op->getType()->isTokenTy())
// Don't touch any operand of token type.
return false;
@@ -1530,7 +1565,8 @@ static bool canSinkInstructions(
return I->getOperand(OI) == I0->getOperand(OI);
};
if (!all_of(Insts, SameAsI0)) {
- if (!canReplaceOperandWithVariable(I0, OI))
+ if ((isa<Constant>(Op) && !replacingOperandWithVariableIsCheap(I0, OI)) ||
+ !canReplaceOperandWithVariable(I0, OI))
// We can't create a PHI from this GEP.
return false;
// Don't create indirect calls! The called value is the final operand.
@@ -1960,8 +1996,8 @@ static Value *isSafeToSpeculateStore(Instruction *I, BasicBlock *BrBB,
/// \endcode
///
/// \returns true if the conditional block is removed.
-static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
- const TargetTransformInfo &TTI) {
+bool SimplifyCFGOpt::SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
+ const TargetTransformInfo &TTI) {
// Be conservative for now. FP select instruction can often be expensive.
Value *BrCond = BI->getCondition();
if (isa<FCmpInst>(BrCond))
@@ -2110,9 +2146,14 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
}
// Metadata can be dependent on the condition we are hoisting above.
- // Conservatively strip all metadata on the instruction.
- for (auto &I : *ThenBB)
+ // Conservatively strip all metadata on the instruction. Drop the debug loc
+ // to avoid making it appear as if the condition is a constant, which would
+ // be misleading while debugging.
+ for (auto &I : *ThenBB) {
+ if (!SpeculatedStoreValue || &I != SpeculatedStore)
+ I.setDebugLoc(DebugLoc());
I.dropUnknownNonDebugMetadata();
+ }
// Hoist the instructions.
BB->getInstList().splice(BI->getIterator(), ThenBB->getInstList(),
@@ -2131,13 +2172,12 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
continue;
// Create a select whose true value is the speculatively executed value and
- // false value is the preexisting value. Swap them if the branch
+ // false value is the pre-existing value. Swap them if the branch
// destinations were inverted.
Value *TrueV = ThenV, *FalseV = OrigV;
if (Invert)
std::swap(TrueV, FalseV);
- Value *V = Builder.CreateSelect(
- BrCond, TrueV, FalseV, "spec.select", BI);
+ Value *V = Builder.CreateSelect(BrCond, TrueV, FalseV, "spec.select", BI);
PN.setIncomingValue(OrigI, V);
PN.setIncomingValue(ThenI, V);
}
@@ -2154,12 +2194,15 @@ static bool SpeculativelyExecuteBB(BranchInst *BI, BasicBlock *ThenBB,
/// Return true if we can thread a branch across this block.
static bool BlockIsSimpleEnoughToThreadThrough(BasicBlock *BB) {
- unsigned Size = 0;
+ int Size = 0;
for (Instruction &I : BB->instructionsWithoutDebug()) {
- if (Size > 10)
+ if (Size > MaxSmallBlockSize)
return false; // Don't clone large BB's.
- ++Size;
+ // We will delete Phis while threading, so Phis should not be counted
+ // towards the block's size.
+ if (!isa<PHINode>(I))
+ ++Size;
// We can only support instructions that do not define values that are
// live outside of the current basic block.
@@ -2306,9 +2349,6 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
// dependence information for this check, but simplifycfg can't keep it up
// to date, and this catches most of the cases we care about anyway.
BasicBlock *BB = PN->getParent();
- const Function *Fn = BB->getParent();
- if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing))
- return false;
BasicBlock *IfTrue, *IfFalse;
Value *IfCond = GetIfCondition(BB, IfTrue, IfFalse);
@@ -2454,8 +2494,8 @@ static bool FoldTwoEntryPHINode(PHINode *PN, const TargetTransformInfo &TTI,
/// If we found a conditional branch that goes to two returning blocks,
/// try to merge them together into one return,
/// introducing a select if the return values disagree.
-static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
- IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::SimplifyCondBranchToTwoReturns(BranchInst *BI,
+ IRBuilder<> &Builder) {
assert(BI->isConditional() && "Must be a conditional branch");
BasicBlock *TrueSucc = BI->getSuccessor(0);
BasicBlock *FalseSucc = BI->getSuccessor(1);
@@ -2531,8 +2571,8 @@ static bool SimplifyCondBranchToTwoReturns(BranchInst *BI,
(void)RI;
LLVM_DEBUG(dbgs() << "\nCHANGING BRANCH TO TWO RETURNS INTO SELECT:"
- << "\n " << *BI << "NewRet = " << *RI << "TRUEBLOCK: "
- << *TrueSucc << "FALSEBLOCK: " << *FalseSucc);
+ << "\n " << *BI << "\nNewRet = " << *RI << "\nTRUEBLOCK: "
+ << *TrueSucc << "\nFALSEBLOCK: " << *FalseSucc);
EraseTerminatorAndDCECond(BI);
@@ -2588,6 +2628,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
const unsigned PredCount = pred_size(BB);
+ bool Changed = false;
+
Instruction *Cond = nullptr;
if (BI->isConditional())
Cond = dyn_cast<Instruction>(BI->getCondition());
@@ -2611,17 +2653,18 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
}
// Quit if we can't remove this instruction.
if (!tryCSEWithPredecessor(Curr, PB))
- return false;
+ return Changed;
+ Changed = true;
}
}
if (!Cond)
- return false;
+ return Changed;
}
if (!Cond || (!isa<CmpInst>(Cond) && !isa<BinaryOperator>(Cond)) ||
Cond->getParent() != BB || !Cond->hasOneUse())
- return false;
+ return Changed;
// Make sure the instruction after the condition is the cond branch.
BasicBlock::iterator CondIt = ++Cond->getIterator();
@@ -2631,7 +2674,7 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
++CondIt;
if (&*CondIt != BI)
- return false;
+ return Changed;
// Only allow this transformation if computing the condition doesn't involve
// too many instructions and these involved instructions can be executed
@@ -2645,11 +2688,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
if (isa<DbgInfoIntrinsic>(I))
continue;
if (!I->hasOneUse() || !isSafeToSpeculativelyExecute(&*I))
- return false;
+ return Changed;
// I has only one use and can be executed unconditionally.
Instruction *User = dyn_cast<Instruction>(I->user_back());
if (User == nullptr || User->getParent() != BB)
- return false;
+ return Changed;
// I is used in the same BB. Since BI uses Cond and doesn't have more slots
// to use any other instruction, User must be an instruction between next(I)
// and Cond.
@@ -2659,23 +2702,23 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
NumBonusInsts += PredCount;
// Early exits once we reach the limit.
if (NumBonusInsts > BonusInstThreshold)
- return false;
+ return Changed;
}
// Cond is known to be a compare or binary operator. Check to make sure that
// neither operand is a potentially-trapping constant expression.
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(0)))
if (CE->canTrap())
- return false;
+ return Changed;
if (ConstantExpr *CE = dyn_cast<ConstantExpr>(Cond->getOperand(1)))
if (CE->canTrap())
- return false;
+ return Changed;
// Finally, don't infinitely unroll conditional loops.
BasicBlock *TrueDest = BI->getSuccessor(0);
BasicBlock *FalseDest = (BI->isConditional()) ? BI->getSuccessor(1) : nullptr;
if (TrueDest == BB || FalseDest == BB)
- return false;
+ return Changed;
for (pred_iterator PI = pred_begin(BB), E = pred_end(BB); PI != E; ++PI) {
BasicBlock *PredBlock = *PI;
@@ -2715,6 +2758,8 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
}
LLVM_DEBUG(dbgs() << "FOLDING BRANCH TO COMMON DEST:\n" << *PBI << *BB);
+ Changed = true;
+
IRBuilder<> Builder(PBI);
// If we need to invert the condition in the pred block to match, do so now.
@@ -2744,6 +2789,12 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
if (isa<DbgInfoIntrinsic>(BonusInst))
continue;
Instruction *NewBonusInst = BonusInst->clone();
+
+ // When we fold the bonus instructions we want to make sure we
+ // reset their debug locations in order to avoid stepping on dead
+ // code caused by folding dead branches.
+ NewBonusInst->setDebugLoc(DebugLoc());
+
RemapInstruction(NewBonusInst, VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
VMap[&*BonusInst] = NewBonusInst;
@@ -2763,6 +2814,11 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
// Clone Cond into the predecessor basic block, and or/and the
// two conditions together.
Instruction *CondInPred = Cond->clone();
+
+ // Reset the condition debug location to avoid jumping on dead code
+ // as the result of folding dead branches.
+ CondInPred->setDebugLoc(DebugLoc());
+
RemapInstruction(CondInPred, VMap,
RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
PredBlock->getInstList().insert(PBI->getIterator(), CondInPred);
@@ -2877,13 +2933,18 @@ bool llvm::FoldBranchToCommonDest(BranchInst *BI, MemorySSAUpdater *MSSAU,
// could replace PBI's branch probabilities with BI's.
// Copy any debug value intrinsics into the end of PredBlock.
- for (Instruction &I : *BB)
- if (isa<DbgInfoIntrinsic>(I))
- I.clone()->insertBefore(PBI);
+ for (Instruction &I : *BB) {
+ if (isa<DbgInfoIntrinsic>(I)) {
+ Instruction *NewI = I.clone();
+ RemapInstruction(NewI, VMap,
+ RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
+ NewI->insertBefore(PBI);
+ }
+ }
- return true;
+ return Changed;
}
- return false;
+ return Changed;
}
// If there is only one store in BB1 and BB2, return it, otherwise return
@@ -3024,7 +3085,7 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
return false; // Not in white-list - not worthwhile folding.
// And finally, if this is a non-free instruction that we are okay
// speculating, ensure that we consider the speculation budget.
- BudgetRemaining -= TTI.getUserCost(&I);
+ BudgetRemaining -= TTI.getUserCost(&I, TargetTransformInfo::TCK_SizeAndLatency);
if (BudgetRemaining < 0)
return false; // Eagerly refuse to fold as soon as we're out of budget.
}
@@ -3086,29 +3147,11 @@ static bool mergeConditionalStoreToAddress(BasicBlock *PTB, BasicBlock *PFB,
PStore->getAAMetadata(AAMD, /*Merge=*/false);
PStore->getAAMetadata(AAMD, /*Merge=*/true);
SI->setAAMetadata(AAMD);
- unsigned PAlignment = PStore->getAlignment();
- unsigned QAlignment = QStore->getAlignment();
- unsigned TypeAlignment =
- DL.getABITypeAlignment(SI->getValueOperand()->getType());
- unsigned MinAlignment;
- unsigned MaxAlignment;
- std::tie(MinAlignment, MaxAlignment) = std::minmax(PAlignment, QAlignment);
// Choose the minimum alignment. If we could prove both stores execute, we
// could use biggest one. In this case, though, we only know that one of the
// stores executes. And we don't know it's safe to take the alignment from a
// store that doesn't execute.
- if (MinAlignment != 0) {
- // Choose the minimum of all non-zero alignments.
- SI->setAlignment(Align(MinAlignment));
- } else if (MaxAlignment != 0) {
- // Choose the minimal alignment between the non-zero alignment and the ABI
- // default alignment for the type of the stored value.
- SI->setAlignment(Align(std::min(MaxAlignment, TypeAlignment)));
- } else {
- // If both alignments are zero, use ABI default alignment for the type of
- // the stored value.
- SI->setAlignment(Align(TypeAlignment));
- }
+ SI->setAlignment(std::min(PStore->getAlign(), QStore->getAlign()));
QStore->eraseFromParent();
PStore->eraseFromParent();
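
The alignment merging above collapses to a single std::min because StoreInst::getAlign() now always yields a concrete Align (there is no longer a zero "unspecified" state to special-case) and Align supports ordinary comparisons. A minimal sketch of the choice being made, with illustrative values:

#include "llvm/Support/Alignment.h"
#include <algorithm>

// Only one of the two conditional stores is known to execute, so keep the
// weaker of the two alignment guarantees.
static llvm::Align mergedStoreAlign(llvm::Align P, llvm::Align Q) {
  return std::min(P, Q); // e.g. std::min(Align(8), Align(4)) == Align(4)
}
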
@@ -3514,10 +3557,11 @@ static bool SimplifyCondBranchToCondBranch(BranchInst *PBI, BranchInst *BI,
// Takes care of updating the successors and removing the old terminator.
// Also makes sure not to introduce new successors by assuming that edges to
// non-successor TrueBBs and FalseBBs aren't reachable.
-static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
- BasicBlock *TrueBB, BasicBlock *FalseBB,
- uint32_t TrueWeight,
- uint32_t FalseWeight) {
+bool SimplifyCFGOpt::SimplifyTerminatorOnSelect(Instruction *OldTerm,
+ Value *Cond, BasicBlock *TrueBB,
+ BasicBlock *FalseBB,
+ uint32_t TrueWeight,
+ uint32_t FalseWeight) {
// Remove any superfluous successor edges from the CFG.
// First, figure out which successors to preserve.
// If TrueBB and FalseBB are equal, only try to preserve one copy of that
@@ -3577,7 +3621,8 @@ static bool SimplifyTerminatorOnSelect(Instruction *OldTerm, Value *Cond,
// (switch (select cond, X, Y)) on constant X, Y
// with a branch - conditional if X and Y lead to distinct BBs,
// unconditional otherwise.
-static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
+bool SimplifyCFGOpt::SimplifySwitchOnSelect(SwitchInst *SI,
+ SelectInst *Select) {
// Check for constant integer values in the select.
ConstantInt *TrueVal = dyn_cast<ConstantInt>(Select->getTrueValue());
ConstantInt *FalseVal = dyn_cast<ConstantInt>(Select->getFalseValue());
@@ -3613,7 +3658,8 @@ static bool SimplifySwitchOnSelect(SwitchInst *SI, SelectInst *Select) {
// blockaddress(@fn, BlockB)))
// with
// (br cond, BlockA, BlockB).
-static bool SimplifyIndirectBrOnSelect(IndirectBrInst *IBI, SelectInst *SI) {
+bool SimplifyCFGOpt::SimplifyIndirectBrOnSelect(IndirectBrInst *IBI,
+ SelectInst *SI) {
// Check that both operands of the select are block addresses.
BlockAddress *TBA = dyn_cast<BlockAddress>(SI->getTrueValue());
BlockAddress *FBA = dyn_cast<BlockAddress>(SI->getFalseValue());
@@ -3748,8 +3794,9 @@ bool SimplifyCFGOpt::tryToSimplifyUncondBranchWithICmpInIt(
/// The specified branch is a conditional branch.
/// Check to see if it is branching on an or/and chain of icmp instructions, and
/// fold it into a switch instruction if so.
-static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
- const DataLayout &DL) {
+bool SimplifyCFGOpt::SimplifyBranchOnICmpChain(BranchInst *BI,
+ IRBuilder<> &Builder,
+ const DataLayout &DL) {
Instruction *Cond = dyn_cast<Instruction>(BI->getCondition());
if (!Cond)
return false;
@@ -3863,19 +3910,19 @@ static bool SimplifyBranchOnICmpChain(BranchInst *BI, IRBuilder<> &Builder,
return true;
}
-bool SimplifyCFGOpt::SimplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::simplifyResume(ResumeInst *RI, IRBuilder<> &Builder) {
if (isa<PHINode>(RI->getValue()))
- return SimplifyCommonResume(RI);
+ return simplifyCommonResume(RI);
else if (isa<LandingPadInst>(RI->getParent()->getFirstNonPHI()) &&
RI->getValue() == RI->getParent()->getFirstNonPHI())
// The resume must unwind the exception that caused control to branch here.
- return SimplifySingleResume(RI);
+ return simplifySingleResume(RI);
return false;
}
// Simplify resume that is shared by several landing pads (phi of landing pad).
-bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
+bool SimplifyCFGOpt::simplifyCommonResume(ResumeInst *RI) {
BasicBlock *BB = RI->getParent();
// Check that there are no other instructions except for debug intrinsics
@@ -3953,18 +4000,38 @@ bool SimplifyCFGOpt::SimplifyCommonResume(ResumeInst *RI) {
return !TrivialUnwindBlocks.empty();
}
+// Check if the cleanup block is empty (contains only benign intrinsics).
+static bool isCleanupBlockEmpty(Instruction *Inst, Instruction *RI) {
+ BasicBlock::iterator I = Inst->getIterator(), E = RI->getIterator();
+ while (++I != E) {
+ auto *II = dyn_cast<IntrinsicInst>(I);
+ if (!II)
+ return false;
+
+ Intrinsic::ID IntrinsicID = II->getIntrinsicID();
+ switch (IntrinsicID) {
+ case Intrinsic::dbg_declare:
+ case Intrinsic::dbg_value:
+ case Intrinsic::dbg_label:
+ case Intrinsic::lifetime_end:
+ break;
+ default:
+ return false;
+ }
+ }
+ return true;
+}
+
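A small standalone sketch of the iterator idiom used in isCleanupBlockEmpty above, with a plain vector standing in for a basic block: the pre-increment skips the pad instruction itself and the loop stops before the terminator, so only the instructions strictly between the two are inspected.

#include <cassert>
#include <vector>

int main() {
  std::vector<int> BB = {0 /*pad*/, 1, 2, 3 /*terminator*/};
  auto I = BB.begin(), E = BB.end() - 1; // E points at the terminator
  int Visited = 0;
  while (++I != E)
    ++Visited;
  assert(Visited == 2); // only the two instructions between pad and terminator
  return 0;
}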
// Simplify resume that is only used by a single (non-phi) landing pad.
-bool SimplifyCFGOpt::SimplifySingleResume(ResumeInst *RI) {
+bool SimplifyCFGOpt::simplifySingleResume(ResumeInst *RI) {
BasicBlock *BB = RI->getParent();
auto *LPInst = cast<LandingPadInst>(BB->getFirstNonPHI());
assert(RI->getValue() == LPInst &&
"Resume must unwind the exception that caused control to here");
// Check that there are no other instructions except for debug intrinsics.
- BasicBlock::iterator I = LPInst->getIterator(), E = RI->getIterator();
- while (++I != E)
- if (!isa<DbgInfoIntrinsic>(I))
- return false;
+ if (!isCleanupBlockEmpty(LPInst, RI))
+ return false;
// Turn all invokes that unwind here into calls and delete the basic block.
for (pred_iterator PI = pred_begin(BB), PE = pred_end(BB); PI != PE;) {
@@ -4000,23 +4067,8 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {
return false;
// Check that there are no other instructions except for benign intrinsics.
- BasicBlock::iterator I = CPInst->getIterator(), E = RI->getIterator();
- while (++I != E) {
- auto *II = dyn_cast<IntrinsicInst>(I);
- if (!II)
- return false;
-
- Intrinsic::ID IntrinsicID = II->getIntrinsicID();
- switch (IntrinsicID) {
- case Intrinsic::dbg_declare:
- case Intrinsic::dbg_value:
- case Intrinsic::dbg_label:
- case Intrinsic::lifetime_end:
- break;
- default:
- return false;
- }
- }
+ if (!isCleanupBlockEmpty(CPInst, RI))
+ return false;
// If the cleanup return we are simplifying unwinds to the caller, this will
// set UnwindDest to nullptr.
@@ -4083,9 +4135,10 @@ static bool removeEmptyCleanup(CleanupReturnInst *RI) {
// The iterator must be incremented here because the instructions are
// being moved to another block.
PHINode *PN = cast<PHINode>(I++);
- if (PN->use_empty())
- // If the PHI node has no uses, just leave it. It will be erased
- // when we erase BB below.
+ if (PN->use_empty() || !PN->isUsedOutsideOfBlock(BB))
+ // If the PHI node has no uses or all of its uses are in this basic
+ // block (meaning they are debug or lifetime intrinsics), just leave
+ // it. It will be erased when we erase BB below.
continue;
// Otherwise, sink this PHI node into UnwindDest.
@@ -4148,7 +4201,7 @@ static bool mergeCleanupPad(CleanupReturnInst *RI) {
return true;
}
-bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
+bool SimplifyCFGOpt::simplifyCleanupReturn(CleanupReturnInst *RI) {
// It is possible to transiently have an undef cleanuppad operand because we
// have deleted some, but not all, dead blocks.
// Eventually, this block will be deleted.
@@ -4164,7 +4217,7 @@ bool SimplifyCFGOpt::SimplifyCleanupReturn(CleanupReturnInst *RI) {
return false;
}
-bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::simplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
BasicBlock *BB = RI->getParent();
if (!BB->getFirstNonPHIOrDbg()->isTerminator())
return false;
@@ -4218,7 +4271,7 @@ bool SimplifyCFGOpt::SimplifyReturn(ReturnInst *RI, IRBuilder<> &Builder) {
return false;
}
-bool SimplifyCFGOpt::SimplifyUnreachable(UnreachableInst *UI) {
+bool SimplifyCFGOpt::simplifyUnreachable(UnreachableInst *UI) {
BasicBlock *BB = UI->getParent();
bool Changed = false;
@@ -4393,7 +4446,8 @@ static void createUnreachableSwitchDefault(SwitchInst *Switch) {
/// Turn a switch with two reachable destinations into an integer range
/// comparison and branch.
-static bool TurnSwitchRangeIntoICmp(SwitchInst *SI, IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::TurnSwitchRangeIntoICmp(SwitchInst *SI,
+ IRBuilder<> &Builder) {
assert(SI->getNumCases() > 1 && "Degenerate switch?");
bool HasDefault =
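The rest of TurnSwitchRangeIntoICmp falls outside this hunk, but the transformation it names is the familiar subtract-and-unsigned-compare range test; a minimal standalone C++ sketch with made-up case values:

#include <cassert>

static int viaSwitch(unsigned X) {
  switch (X) {
  case 3: case 4: case 5: return 1;
  default: return 0;
  }
}

static int viaRangeCmp(unsigned X) {
  return (X - 3u) < 3u ? 1 : 0; // one subtraction + one unsigned compare
}

int main() {
  for (unsigned X = 0; X < 100; ++X)
    assert(viaSwitch(X) == viaRangeCmp(X));
  return 0;
}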
@@ -5689,7 +5743,7 @@ static bool ReduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder,
return true;
}
-bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::simplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
BasicBlock *BB = SI->getParent();
if (isValueEqualityComparison(SI)) {
@@ -5740,7 +5794,7 @@ bool SimplifyCFGOpt::SimplifySwitch(SwitchInst *SI, IRBuilder<> &Builder) {
return false;
}
-bool SimplifyCFGOpt::SimplifyIndirectBr(IndirectBrInst *IBI) {
+bool SimplifyCFGOpt::simplifyIndirectBr(IndirectBrInst *IBI) {
BasicBlock *BB = IBI->getParent();
bool Changed = false;
@@ -5855,7 +5909,12 @@ static bool TryToMergeLandingPad(LandingPadInst *LPad, BranchInst *BI,
return false;
}
-bool SimplifyCFGOpt::SimplifyUncondBranch(BranchInst *BI,
+bool SimplifyCFGOpt::simplifyBranch(BranchInst *Branch, IRBuilder<> &Builder) {
+ return Branch->isUnconditional() ? simplifyUncondBranch(Branch, Builder)
+ : simplifyCondBranch(Branch, Builder);
+}
+
+bool SimplifyCFGOpt::simplifyUncondBranch(BranchInst *BI,
IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
BasicBlock *Succ = BI->getSuccessor(0);
@@ -5916,10 +5975,9 @@ static BasicBlock *allPredecessorsComeFromSameSource(BasicBlock *BB) {
return PredPred;
}
-bool SimplifyCFGOpt::SimplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
+bool SimplifyCFGOpt::simplifyCondBranch(BranchInst *BI, IRBuilder<> &Builder) {
BasicBlock *BB = BI->getParent();
- const Function *Fn = BB->getParent();
- if (Fn && Fn->hasFnAttribute(Attribute::OptForFuzzing))
+ if (!Options.SimplifyCondBranch)
return false;
// Conditional branch
@@ -6064,9 +6122,9 @@ static bool passingValueIsAlwaysUndefined(Value *V, Instruction *I) {
SI->getPointerOperand() == I;
// A call to null is undefined.
- if (auto CS = CallSite(Use))
- return !NullPointerIsDefined(CS->getFunction()) &&
- CS.getCalledValue() == I;
+ if (auto *CB = dyn_cast<CallBase>(Use))
+ return !NullPointerIsDefined(CB->getFunction()) &&
+ CB->getCalledOperand() == I;
}
return false;
}
@@ -6133,39 +6191,38 @@ bool SimplifyCFGOpt::simplifyOnce(BasicBlock *BB) {
IRBuilder<> Builder(BB);
- // If there is a trivial two-entry PHI node in this basic block, and we can
- // eliminate it, do so now.
- if (auto *PN = dyn_cast<PHINode>(BB->begin()))
- if (PN->getNumIncomingValues() == 2)
- Changed |= FoldTwoEntryPHINode(PN, TTI, DL);
-
- Builder.SetInsertPoint(BB->getTerminator());
- if (auto *BI = dyn_cast<BranchInst>(BB->getTerminator())) {
- if (BI->isUnconditional()) {
- if (SimplifyUncondBranch(BI, Builder))
- return true;
- } else {
- if (SimplifyCondBranch(BI, Builder))
- return true;
- }
- } else if (auto *RI = dyn_cast<ReturnInst>(BB->getTerminator())) {
- if (SimplifyReturn(RI, Builder))
- return true;
- } else if (auto *RI = dyn_cast<ResumeInst>(BB->getTerminator())) {
- if (SimplifyResume(RI, Builder))
- return true;
- } else if (auto *RI = dyn_cast<CleanupReturnInst>(BB->getTerminator())) {
- if (SimplifyCleanupReturn(RI))
- return true;
- } else if (auto *SI = dyn_cast<SwitchInst>(BB->getTerminator())) {
- if (SimplifySwitch(SI, Builder))
- return true;
- } else if (auto *UI = dyn_cast<UnreachableInst>(BB->getTerminator())) {
- if (SimplifyUnreachable(UI))
- return true;
- } else if (auto *IBI = dyn_cast<IndirectBrInst>(BB->getTerminator())) {
- if (SimplifyIndirectBr(IBI))
- return true;
+ if (Options.FoldTwoEntryPHINode) {
+ // If there is a trivial two-entry PHI node in this basic block, and we can
+ // eliminate it, do so now.
+ if (auto *PN = dyn_cast<PHINode>(BB->begin()))
+ if (PN->getNumIncomingValues() == 2)
+ Changed |= FoldTwoEntryPHINode(PN, TTI, DL);
+ }
+
+ Instruction *Terminator = BB->getTerminator();
+ Builder.SetInsertPoint(Terminator);
+ switch (Terminator->getOpcode()) {
+ case Instruction::Br:
+ Changed |= simplifyBranch(cast<BranchInst>(Terminator), Builder);
+ break;
+ case Instruction::Ret:
+ Changed |= simplifyReturn(cast<ReturnInst>(Terminator), Builder);
+ break;
+ case Instruction::Resume:
+ Changed |= simplifyResume(cast<ResumeInst>(Terminator), Builder);
+ break;
+ case Instruction::CleanupRet:
+ Changed |= simplifyCleanupReturn(cast<CleanupReturnInst>(Terminator));
+ break;
+ case Instruction::Switch:
+ Changed |= simplifySwitch(cast<SwitchInst>(Terminator), Builder);
+ break;
+ case Instruction::Unreachable:
+ Changed |= simplifyUnreachable(cast<UnreachableInst>(Terminator));
+ break;
+ case Instruction::IndirectBr:
+ Changed |= simplifyIndirectBr(cast<IndirectBrInst>(Terminator));
+ break;
}
return Changed;
diff --git a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
index cbb114f9a47a..d3d0c3341908 100644
--- a/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyIndVar.cpp
@@ -17,7 +17,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/Statistic.h"
#include "llvm/Analysis/LoopInfo.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/IR/DataLayout.h"
#include "llvm/IR/Dominators.h"
#include "llvm/IR/IRBuilder.h"
@@ -27,6 +26,7 @@
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
using namespace llvm;
@@ -54,6 +54,7 @@ namespace {
LoopInfo *LI;
ScalarEvolution *SE;
DominatorTree *DT;
+ const TargetTransformInfo *TTI;
SCEVExpander &Rewriter;
SmallVectorImpl<WeakTrackingVH> &DeadInsts;
@@ -61,10 +62,11 @@ namespace {
public:
SimplifyIndvar(Loop *Loop, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, SCEVExpander &Rewriter,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SCEVExpander &Rewriter,
SmallVectorImpl<WeakTrackingVH> &Dead)
- : L(Loop), LI(LI), SE(SE), DT(DT), Rewriter(Rewriter), DeadInsts(Dead),
- Changed(false) {
+ : L(Loop), LI(LI), SE(SE), DT(DT), TTI(TTI), Rewriter(Rewriter),
+ DeadInsts(Dead), Changed(false) {
assert(LI && "IV simplification requires LoopInfo");
}
@@ -655,7 +657,7 @@ static Instruction *GetLoopInvariantInsertPosition(Loop *L, Instruction *Hint) {
return Hint;
}
-/// Replace the UseInst with a constant if possible.
+/// Replace the UseInst with a loop invariant expression if it is safe.
bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
if (!SE->isSCEVable(I->getType()))
return false;
@@ -667,10 +669,17 @@ bool SimplifyIndvar::replaceIVUserWithLoopInvariant(Instruction *I) {
return false;
// Do not generate something ridiculous even if S is loop invariant.
- if (Rewriter.isHighCostExpansion(S, L, I))
+ if (Rewriter.isHighCostExpansion(S, L, SCEVCheapExpansionBudget, TTI, I))
return false;
auto *IP = GetLoopInvariantInsertPosition(L, I);
+
+ if (!isSafeToExpandAt(S, IP, *SE)) {
+ LLVM_DEBUG(dbgs() << "INDVARS: Can not replace IV user: " << *I
+ << " with non-speculable loop invariant: " << *S << '\n');
+ return false;
+ }
+
auto *Invariant = Rewriter.expandCodeFor(S, I->getType(), IP);
I->replaceAllUsesWith(Invariant);
@@ -931,10 +940,11 @@ void IVVisitor::anchor() { }
/// Simplify instructions that use this induction variable
/// by using ScalarEvolution to analyze the IV's recurrence.
bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead,
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &Dead,
SCEVExpander &Rewriter, IVVisitor *V) {
- SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, Rewriter,
- Dead);
+ SimplifyIndvar SIV(LI->getLoopFor(CurrIV->getParent()), SE, DT, LI, TTI,
+ Rewriter, Dead);
SIV.simplifyUsers(CurrIV, V);
return SIV.hasChanged();
}
@@ -942,14 +952,16 @@ bool simplifyUsersOfIV(PHINode *CurrIV, ScalarEvolution *SE, DominatorTree *DT,
/// Simplify users of induction variables within this
/// loop. This does not actually change or add IVs.
bool simplifyLoopIVs(Loop *L, ScalarEvolution *SE, DominatorTree *DT,
- LoopInfo *LI, SmallVectorImpl<WeakTrackingVH> &Dead) {
+ LoopInfo *LI, const TargetTransformInfo *TTI,
+ SmallVectorImpl<WeakTrackingVH> &Dead) {
SCEVExpander Rewriter(*SE, SE->getDataLayout(), "indvars");
#ifndef NDEBUG
Rewriter.setDebugType(DEBUG_TYPE);
#endif
bool Changed = false;
for (BasicBlock::iterator I = L->getHeader()->begin(); isa<PHINode>(I); ++I) {
- Changed |= simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, Dead, Rewriter);
+ Changed |=
+ simplifyUsersOfIV(cast<PHINode>(I), SE, DT, LI, TTI, Dead, Rewriter);
}
return Changed;
}
diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
index fa3a9d21f3df..cfcc3454a210 100644
--- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
+++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp
@@ -138,28 +138,6 @@ static Value *convertStrToNumber(CallInst *CI, StringRef &Str, int64_t Base) {
return ConstantInt::get(CI->getType(), Result);
}
-static bool isLocallyOpenedFile(Value *File, CallInst *CI, IRBuilder<> &B,
- const TargetLibraryInfo *TLI) {
- CallInst *FOpen = dyn_cast<CallInst>(File);
- if (!FOpen)
- return false;
-
- Function *InnerCallee = FOpen->getCalledFunction();
- if (!InnerCallee)
- return false;
-
- LibFunc Func;
- if (!TLI->getLibFunc(*InnerCallee, Func) || !TLI->has(Func) ||
- Func != LibFunc_fopen)
- return false;
-
- inferLibFuncAttributes(*CI->getCalledFunction(), *TLI);
- if (PointerMayBeCaptured(File, true, true))
- return false;
-
- return true;
-}
-
static bool isOnlyUsedInComparisonWithZero(Value *V) {
for (User *U : V->users()) {
if (ICmpInst *IC = dyn_cast<ICmpInst>(U))
@@ -177,8 +155,7 @@ static bool canTransformToMemCmp(CallInst *CI, Value *Str, uint64_t Len,
if (!isOnlyUsedInComparisonWithZero(CI))
return false;
- if (!isDereferenceableAndAlignedPointer(Str, Align::None(), APInt(64, Len),
- DL))
+ if (!isDereferenceableAndAlignedPointer(Str, Align(1), APInt(64, Len), DL))
return false;
if (CI->getFunction()->hasFnAttribute(Attribute::SanitizeMemory))
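Throughout this file the patch replaces the older Align::None() spelling with Align(1); both denote one-byte alignment, i.e. no extra alignment guarantee. A minimal sketch, assuming the LLVM Support headers are on the include path:

#include "llvm/Support/Alignment.h"
#include <cassert>

int main() {
  llvm::Align A(1);       // 1-byte alignment: the weakest possible guarantee
  assert(A.value() == 1); // what "align 1" means in the memcpy/memset comments
  return 0;
}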
@@ -252,7 +229,7 @@ static void annotateNonNullAndDereferenceable(CallInst *CI, ArrayRef<unsigned> A
// String and Memory Library Call Optimizations
//===----------------------------------------------------------------------===//
-Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilderBase &B) {
// Extract some information from the instruction
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
@@ -274,7 +251,7 @@ Value *LibCallSimplifier::optimizeStrCat(CallInst *CI, IRBuilder<> &B) {
}
Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
// We need to find the end of the destination string. That's where the
// memory is to be moved to. We just generate a call to strlen.
Value *DstLen = emitStrLen(Dst, B, DL, TLI);
@@ -289,12 +266,12 @@ Value *LibCallSimplifier::emitStrLenMemCpy(Value *Src, Value *Dst, uint64_t Len,
// We have enough information to now generate the memcpy call to do the
// concatenation for us. Make a memcpy to copy the nul byte with align = 1.
B.CreateMemCpy(
- CpyDst, Align::None(), Src, Align::None(),
+ CpyDst, Align(1), Src, Align(1),
ConstantInt::get(DL.getIntPtrType(Src->getContext()), Len + 1));
return Dst;
}
-Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilderBase &B) {
// Extract some information from the instruction.
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
@@ -337,7 +314,7 @@ Value *LibCallSimplifier::optimizeStrNCat(CallInst *CI, IRBuilder<> &B) {
return emitStrLenMemCpy(Src, Dst, SrcLen, B);
}
-Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
Value *SrcStr = CI->getArgOperand(0);
@@ -382,7 +359,7 @@ Value *LibCallSimplifier::optimizeStrChr(CallInst *CI, IRBuilder<> &B) {
return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strchr");
}
-Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilderBase &B) {
Value *SrcStr = CI->getArgOperand(0);
ConstantInt *CharC = dyn_cast<ConstantInt>(CI->getArgOperand(1));
annotateNonNullBasedOnAccess(CI, 0);
@@ -410,7 +387,7 @@ Value *LibCallSimplifier::optimizeStrRChr(CallInst *CI, IRBuilder<> &B) {
return B.CreateGEP(B.getInt8Ty(), SrcStr, B.getInt64(I), "strrchr");
}
-Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilderBase &B) {
Value *Str1P = CI->getArgOperand(0), *Str2P = CI->getArgOperand(1);
if (Str1P == Str2P) // strcmp(x,x) -> 0
return ConstantInt::get(CI->getType(), 0);
@@ -465,7 +442,7 @@ Value *LibCallSimplifier::optimizeStrCmp(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilderBase &B) {
Value *Str1P = CI->getArgOperand(0);
Value *Str2P = CI->getArgOperand(1);
Value *Size = CI->getArgOperand(2);
@@ -533,7 +510,7 @@ Value *LibCallSimplifier::optimizeStrNCmp(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilderBase &B) {
Value *Src = CI->getArgOperand(0);
ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
uint64_t SrcLen = GetStringLength(Src);
@@ -546,7 +523,7 @@ Value *LibCallSimplifier::optimizeStrNDup(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilderBase &B) {
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
if (Dst == Src) // strcpy(x,x) -> x
return Src;
@@ -562,13 +539,13 @@ Value *LibCallSimplifier::optimizeStrCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
CallInst *NewCI =
- B.CreateMemCpy(Dst, Align::None(), Src, Align::None(),
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1),
ConstantInt::get(DL.getIntPtrType(CI->getContext()), Len));
NewCI->setAttributes(CI->getAttributes());
return Dst;
}
-Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1);
if (Dst == Src) { // stpcpy(x,x) -> x+strlen(x)
@@ -590,13 +567,12 @@ Value *LibCallSimplifier::optimizeStpCpy(CallInst *CI, IRBuilder<> &B) {
// We have enough information to now generate the memcpy call to do the
// copy for us. Make a memcpy to copy the nul byte with align = 1.
- CallInst *NewCI =
- B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), LenV);
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1), LenV);
NewCI->setAttributes(CI->getAttributes());
return DstEnd;
}
-Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
@@ -626,7 +602,7 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
if (SrcLen == 0) {
// strncpy(x, "", y) -> memset(align 1 x, '\0', y)
- CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align::None());
+ CallInst *NewCI = B.CreateMemSet(Dst, B.getInt8('\0'), Size, Align(1));
AttrBuilder ArgAttrs(CI->getAttributes().getParamAttributes(0));
NewCI->setAttributes(NewCI->getAttributes().addParamAttributes(
CI->getContext(), 0, ArgAttrs));
@@ -639,13 +615,13 @@ Value *LibCallSimplifier::optimizeStrNCpy(CallInst *CI, IRBuilder<> &B) {
Type *PT = Callee->getFunctionType()->getParamType(0);
// strncpy(x, s, c) -> memcpy(align 1 x, align 1 s, c) [s and c are constant]
- CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), Src, Align::None(),
+ CallInst *NewCI = B.CreateMemCpy(Dst, Align(1), Src, Align(1),
ConstantInt::get(DL.getIntPtrType(PT), Len));
NewCI->setAttributes(CI->getAttributes());
return Dst;
}
-Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B,
+Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilderBase &B,
unsigned CharSize) {
Value *Src = CI->getArgOperand(0);
@@ -736,14 +712,14 @@ Value *LibCallSimplifier::optimizeStringLength(CallInst *CI, IRBuilder<> &B,
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrLen(CallInst *CI, IRBuilderBase &B) {
if (Value *V = optimizeStringLength(CI, B, 8))
return V;
annotateNonNullBasedOnAccess(CI, 0);
return nullptr;
}
-Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilderBase &B) {
Module &M = *CI->getModule();
unsigned WCharSize = TLI->getWCharSize(M) * 8;
// We cannot perform this optimization without wchar_size metadata.
@@ -753,7 +729,7 @@ Value *LibCallSimplifier::optimizeWcslen(CallInst *CI, IRBuilder<> &B) {
return optimizeStringLength(CI, B, WCharSize);
}
-Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilderBase &B) {
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
@@ -780,7 +756,7 @@ Value *LibCallSimplifier::optimizeStrPBrk(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilderBase &B) {
Value *EndPtr = CI->getArgOperand(1);
if (isa<ConstantPointerNull>(EndPtr)) {
// With a null EndPtr, this function won't capture the main argument.
@@ -791,7 +767,7 @@ Value *LibCallSimplifier::optimizeStrTo(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilderBase &B) {
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
@@ -812,7 +788,7 @@ Value *LibCallSimplifier::optimizeStrSpn(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilderBase &B) {
StringRef S1, S2;
bool HasS1 = getConstantStringInfo(CI->getArgOperand(0), S1);
bool HasS2 = getConstantStringInfo(CI->getArgOperand(1), S2);
@@ -836,7 +812,7 @@ Value *LibCallSimplifier::optimizeStrCSpn(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilderBase &B) {
// fold strstr(x, x) -> x.
if (CI->getArgOperand(0) == CI->getArgOperand(1))
return B.CreateBitCast(CI->getArgOperand(0), CI->getType());
@@ -893,13 +869,13 @@ Value *LibCallSimplifier::optimizeStrStr(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemRChr(CallInst *CI, IRBuilderBase &B) {
if (isKnownNonZero(CI->getOperand(2), DL))
annotateNonNullBasedOnAccess(CI, 0);
return nullptr;
}
-Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilderBase &B) {
Value *SrcStr = CI->getArgOperand(0);
Value *Size = CI->getArgOperand(2);
annotateNonNullAndDereferenceable(CI, 0, Size, DL);
@@ -988,7 +964,7 @@ Value *LibCallSimplifier::optimizeMemChr(CallInst *CI, IRBuilder<> &B) {
}
static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS,
- uint64_t Len, IRBuilder<> &B,
+ uint64_t Len, IRBuilderBase &B,
const DataLayout &DL) {
if (Len == 0) // memcmp(s1,s2,0) -> 0
return Constant::getNullValue(CI->getType());
@@ -1065,7 +1041,7 @@ static Value *optimizeMemCmpConstantSize(CallInst *CI, Value *LHS, Value *RHS,
// Most simplifications for memcmp also apply to bcmp.
Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
Value *LHS = CI->getArgOperand(0), *RHS = CI->getArgOperand(1);
Value *Size = CI->getArgOperand(2);
@@ -1088,7 +1064,7 @@ Value *LibCallSimplifier::optimizeMemCmpBCmpCommon(CallInst *CI,
return nullptr;
}
-Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilderBase &B) {
if (Value *V = optimizeMemCmpBCmpCommon(CI, B))
return V;
@@ -1105,24 +1081,24 @@ Value *LibCallSimplifier::optimizeMemCmp(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeBCmp(CallInst *CI, IRBuilderBase &B) {
return optimizeMemCmpBCmpCommon(CI, B);
}
-Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemCpy(CallInst *CI, IRBuilderBase &B) {
Value *Size = CI->getArgOperand(2);
annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
if (isa<IntrinsicInst>(CI))
return nullptr;
// memcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n)
- CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(),
- CI->getArgOperand(1), Align::None(), Size);
+ CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(1), Align(1), Size);
NewCI->setAttributes(CI->getAttributes());
return CI->getArgOperand(0);
}
-Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilderBase &B) {
Value *Dst = CI->getArgOperand(0);
Value *Src = CI->getArgOperand(1);
ConstantInt *StopChar = dyn_cast<ConstantInt>(CI->getArgOperand(2));
@@ -1146,8 +1122,7 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) {
size_t Pos = SrcStr.find(StopChar->getSExtValue() & 0xFF);
if (Pos == StringRef::npos) {
if (N->getZExtValue() <= SrcStr.size()) {
- B.CreateMemCpy(Dst, Align::None(), Src, Align::None(),
- CI->getArgOperand(3));
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1), CI->getArgOperand(3));
return Constant::getNullValue(CI->getType());
}
return nullptr;
@@ -1156,37 +1131,37 @@ Value *LibCallSimplifier::optimizeMemCCpy(CallInst *CI, IRBuilder<> &B) {
Value *NewN =
ConstantInt::get(N->getType(), std::min(uint64_t(Pos + 1), N->getZExtValue()));
// memccpy -> llvm.memcpy
- B.CreateMemCpy(Dst, Align::None(), Src, Align::None(), NewN);
+ B.CreateMemCpy(Dst, Align(1), Src, Align(1), NewN);
return Pos + 1 <= N->getZExtValue()
? B.CreateInBoundsGEP(B.getInt8Ty(), Dst, NewN)
: Constant::getNullValue(CI->getType());
}
-Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemPCpy(CallInst *CI, IRBuilderBase &B) {
Value *Dst = CI->getArgOperand(0);
Value *N = CI->getArgOperand(2);
// mempcpy(x, y, n) -> llvm.memcpy(align 1 x, align 1 y, n), x + n
- CallInst *NewCI = B.CreateMemCpy(Dst, Align::None(), CI->getArgOperand(1),
- Align::None(), N);
+ CallInst *NewCI =
+ B.CreateMemCpy(Dst, Align(1), CI->getArgOperand(1), Align(1), N);
NewCI->setAttributes(CI->getAttributes());
return B.CreateInBoundsGEP(B.getInt8Ty(), Dst, N);
}
-Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemMove(CallInst *CI, IRBuilderBase &B) {
Value *Size = CI->getArgOperand(2);
annotateNonNullAndDereferenceable(CI, {0, 1}, Size, DL);
if (isa<IntrinsicInst>(CI))
return nullptr;
// memmove(x, y, n) -> llvm.memmove(align 1 x, align 1 y, n)
- CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(),
- CI->getArgOperand(1), Align::None(), Size);
+ CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align(1),
+ CI->getArgOperand(1), Align(1), Size);
NewCI->setAttributes(CI->getAttributes());
return CI->getArgOperand(0);
}
/// Fold memset[_chk](malloc(n), 0, n) --> calloc(1, n).
-Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) {
+Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilderBase &B) {
// This has to be a memset of zeros (bzero).
auto *FillValue = dyn_cast<ConstantInt>(Memset->getArgOperand(1));
if (!FillValue || FillValue->getZExtValue() != 0)
@@ -1229,7 +1204,7 @@ Value *LibCallSimplifier::foldMallocMemset(CallInst *Memset, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilderBase &B) {
Value *Size = CI->getArgOperand(2);
annotateNonNullAndDereferenceable(CI, 0, Size, DL);
if (isa<IntrinsicInst>(CI))
@@ -1240,13 +1215,12 @@ Value *LibCallSimplifier::optimizeMemSet(CallInst *CI, IRBuilder<> &B) {
// memset(p, v, n) -> llvm.memset(align 1 p, v, n)
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
- CallInst *NewCI =
- B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align::None());
+ CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val, Size, Align(1));
NewCI->setAttributes(CI->getAttributes());
return CI->getArgOperand(0);
}
-Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilderBase &B) {
if (isa<ConstantPointerNull>(CI->getArgOperand(0)))
return emitMalloc(CI->getArgOperand(1), B, DL, TLI);
@@ -1258,9 +1232,10 @@ Value *LibCallSimplifier::optimizeRealloc(CallInst *CI, IRBuilder<> &B) {
//===----------------------------------------------------------------------===//
// Replace a libcall \p CI with a call to intrinsic \p IID
-static Value *replaceUnaryCall(CallInst *CI, IRBuilder<> &B, Intrinsic::ID IID) {
+static Value *replaceUnaryCall(CallInst *CI, IRBuilderBase &B,
+ Intrinsic::ID IID) {
// Propagate fast-math flags from the existing call to the new call.
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
Module *M = CI->getModule();
@@ -1294,7 +1269,7 @@ static Value *valueHasFloatPrecision(Value *Val) {
}
/// Shrink double -> float functions.
-static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B,
+static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B,
bool isBinary, bool isPrecise = false) {
Function *CalleeFn = CI->getCalledFunction();
if (!CI->getType()->isDoubleTy() || !CalleeFn)
@@ -1333,7 +1308,7 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B,
}
// Propagate the math semantics from the current function to the new function.
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
// g((double) float) -> (double) gf(float)
@@ -1352,24 +1327,24 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilder<> &B,
}
/// Shrink double -> float for unary functions.
-static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilder<> &B,
+static Value *optimizeUnaryDoubleFP(CallInst *CI, IRBuilderBase &B,
bool isPrecise = false) {
return optimizeDoubleFP(CI, B, false, isPrecise);
}
/// Shrink double -> float for binary functions.
-static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilder<> &B,
+static Value *optimizeBinaryDoubleFP(CallInst *CI, IRBuilderBase &B,
bool isPrecise = false) {
return optimizeDoubleFP(CI, B, true, isPrecise);
}
// cabs(z) -> sqrt((creal(z)*creal(z)) + (cimag(z)*cimag(z)))
-Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilderBase &B) {
if (!CI->isFast())
return nullptr;
// Propagate fast-math flags from the existing call to new instructions.
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(CI->getFastMathFlags());
Value *Real, *Imag;
@@ -1393,11 +1368,11 @@ Value *LibCallSimplifier::optimizeCAbs(CallInst *CI, IRBuilder<> &B) {
}
static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (!isa<FPMathOperator>(Call))
return nullptr;
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(Call->getFastMathFlags());
// TODO: Can this be shared to also handle LLVM intrinsics?
@@ -1427,7 +1402,7 @@ static Value *optimizeTrigReflections(CallInst *Call, LibFunc Func,
return nullptr;
}
-static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) {
+static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilderBase &B) {
// Multiplications calculated using Addition Chains.
// Refer: http://wwwhomes.uni-bielefeld.de/achim/addition_chain.html
@@ -1453,7 +1428,7 @@ static Value *getPow(Value *InnerChain[33], unsigned Exp, IRBuilder<> &B) {
}
// Return a properly extended 32-bit integer if the operation is an itofp.
-static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) {
+static Value *getIntToFPVal(Value *I2F, IRBuilderBase &B) {
if (isa<SIToFPInst>(I2F) || isa<UIToFPInst>(I2F)) {
Value *Op = cast<Instruction>(I2F)->getOperand(0);
// Make sure that the exponent fits inside an int32_t,
@@ -1471,9 +1446,9 @@ static Value *getIntToFPVal(Value *I2F, IRBuilder<> &B) {
/// Use exp{,2}(x * y) for pow(exp{,2}(x), y);
/// ldexp(1.0, x) for pow(2.0, itofp(x)); exp2(n * x) for pow(2.0 ** n, x);
/// exp10(x) for pow(10.0, x); exp2(log2(n) * x) for pow(n, x).
-Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
+Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilderBase &B) {
Value *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
- AttributeList Attrs = Pow->getCalledFunction()->getAttributes();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
Module *Mod = Pow->getModule();
Type *Ty = Pow->getType();
bool Ignored;
@@ -1588,9 +1563,14 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
return emitUnaryFloatFnCall(Expo, TLI, LibFunc_exp10, LibFunc_exp10f,
LibFunc_exp10l, B, Attrs);
- // pow(n, x) -> exp2(log2(n) * x)
- if (Pow->hasOneUse() && Pow->hasApproxFunc() && Pow->hasNoNaNs() &&
- Pow->hasNoInfs() && BaseF->isNormal() && !BaseF->isNegative()) {
+ // pow(x, y) -> exp2(log2(x) * y)
+ if (Pow->hasApproxFunc() && Pow->hasNoNaNs() && BaseF->isFiniteNonZero() &&
+ !BaseF->isNegative()) {
+ // pow(1, inf) is defined to be 1 but exp2(log2(1) * inf) evaluates to NaN.
+ // Luckily optimizePow has already handled the x == 1 case.
+ assert(!match(Base, m_FPOne()) &&
+ "pow(1.0, y) should have been simplified earlier!");
+
Value *Log = nullptr;
if (Ty->isFloatTy())
Log = ConstantFP::get(Ty, std::log2(BaseF->convertToFloat()));
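The guard above relies on the identity n**x == exp2(log2(n) * x) for a finite, non-negative base; a standalone numeric sketch with arbitrary sample values:

#include <cassert>
#include <cmath>

int main() {
  const double N = 8.0, X = 1.5;                // finite, non-negative base
  const double Direct = std::pow(N, X);         // 8^1.5 = 22.627...
  const double Rewritten = std::exp2(std::log2(N) * X);
  assert(std::fabs(Direct - Rewritten) <= 1e-9 * Direct);
  return 0;
}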
@@ -1612,7 +1592,7 @@ Value *LibCallSimplifier::replacePowWithExp(CallInst *Pow, IRBuilder<> &B) {
}
static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
- Module *M, IRBuilder<> &B,
+ Module *M, IRBuilderBase &B,
const TargetLibraryInfo *TLI) {
// If errno is never set, then use the intrinsic for sqrt().
if (NoErrno) {
@@ -1633,9 +1613,9 @@ static Value *getSqrtCall(Value *V, AttributeList Attrs, bool NoErrno,
}
/// Use square root in place of pow(x, +/-0.5).
-Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) {
+Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilderBase &B) {
Value *Sqrt, *Base = Pow->getArgOperand(0), *Expo = Pow->getArgOperand(1);
- AttributeList Attrs = Pow->getCalledFunction()->getAttributes();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
Module *Mod = Pow->getModule();
Type *Ty = Pow->getType();
@@ -1676,13 +1656,13 @@ Value *LibCallSimplifier::replacePowWithSqrt(CallInst *Pow, IRBuilder<> &B) {
}
static Value *createPowWithIntegerExponent(Value *Base, Value *Expo, Module *M,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
Value *Args[] = {Base, Expo};
Function *F = Intrinsic::getDeclaration(M, Intrinsic::powi, Base->getType());
return B.CreateCall(F, Args);
}
-Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilderBase &B) {
Value *Base = Pow->getArgOperand(0);
Value *Expo = Pow->getArgOperand(1);
Function *Callee = Pow->getCalledFunction();
@@ -1693,12 +1673,8 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) {
bool AllowApprox = Pow->hasApproxFunc();
bool Ignored;
- // Bail out if simplifying libcalls to pow() is disabled.
- if (!hasFloatFn(TLI, Ty, LibFunc_pow, LibFunc_powf, LibFunc_powl))
- return nullptr;
-
// Propagate the math semantics from the call to any created instructions.
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(Pow->getFastMathFlags());
// Shrink pow() to powf() if the arguments are single precision,
@@ -1748,7 +1724,7 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) {
// be different) and it should also consider optimizing for size.
APFloat LimF(ExpoF->getSemantics(), 33),
ExpoA(abs(*ExpoF));
- if (ExpoA.compare(LimF) == APFloat::cmpLessThan) {
+ if (ExpoA < LimF) {
// This transformation applies to integer or integer+0.5 exponents only.
// For integer+0.5, we create a sqrt(Base) call.
Value *Sqrt = nullptr;
@@ -1807,8 +1783,9 @@ Value *LibCallSimplifier::optimizePow(CallInst *Pow, IRBuilder<> &B) {
return Shrunk;
}
-Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
StringRef Name = Callee->getName();
Value *Ret = nullptr;
if (UnsafeFPShrink && Name == TLI->getName(LibFunc_exp2) &&
@@ -1825,13 +1802,13 @@ Value *LibCallSimplifier::optimizeExp2(CallInst *CI, IRBuilder<> &B) {
if (Value *Exp = getIntToFPVal(Op, B))
return emitBinaryFloatFnCall(ConstantFP::get(Ty, 1.0), Exp, TLI,
LibFunc_ldexp, LibFunc_ldexpf, LibFunc_ldexpl,
- B, CI->getCalledFunction()->getAttributes());
+ B, Attrs);
}
return Ret;
}
-Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilderBase &B) {
// If we can shrink the call to a float function rather than a double
// function, do that first.
Function *Callee = CI->getCalledFunction();
@@ -1847,7 +1824,7 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
// "Ideally, fmax would be sensitive to the sign of zero, for example
// fmax(-0.0, +0.0) would return +0; however, implementation in software
// might be impractical."
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
FastMathFlags FMF = CI->getFastMathFlags();
FMF.setNoSignedZeros();
B.setFastMathFlags(FMF);
@@ -1858,9 +1835,9 @@ Value *LibCallSimplifier::optimizeFMinFMax(CallInst *CI, IRBuilder<> &B) {
return B.CreateCall(F, { CI->getArgOperand(0), CI->getArgOperand(1) });
}
-Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilderBase &B) {
Function *LogFn = Log->getCalledFunction();
- AttributeList Attrs = LogFn->getAttributes();
+ AttributeList Attrs; // Attributes are only meaningful on the original call
StringRef LogNm = LogFn->getName();
Intrinsic::ID LogID = LogFn->getIntrinsicID();
Module *Mod = Log->getModule();
@@ -1963,12 +1940,12 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) {
} else
return Ret;
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(FastMathFlags::getFast());
Intrinsic::ID ArgID = Arg->getIntrinsicID();
LibFunc ArgLb = NotLibFunc;
- TLI->getLibFunc(Arg, ArgLb);
+ TLI->getLibFunc(*Arg, ArgLb);
// log(pow(x,y)) -> y*log(x)
if (ArgLb == PowLb || ArgID == Intrinsic::pow) {
@@ -2010,7 +1987,7 @@ Value *LibCallSimplifier::optimizeLog(CallInst *Log, IRBuilder<> &B) {
return Ret;
}
-Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
// TODO: Once we have a way (other than checking for the existence of the
@@ -2058,7 +2035,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
// Fast math flags for any created instructions should match the sqrt
// and multiply.
- IRBuilder<>::FastMathFlagGuard Guard(B);
+ IRBuilderBase::FastMathFlagGuard Guard(B);
B.setFastMathFlags(I->getFastMathFlags());
// If we found a repeated factor, hoist it out of the square root and
@@ -2079,7 +2056,7 @@ Value *LibCallSimplifier::optimizeSqrt(CallInst *CI, IRBuilder<> &B) {
}
// TODO: Generalize to handle any trig function and its inverse.
-Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeTan(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
Value *Ret = nullptr;
StringRef Name = Callee->getName();
@@ -2116,7 +2093,7 @@ static bool isTrigLibCall(CallInst *CI) {
CI->hasFnAttr(Attribute::ReadNone);
}
-static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
+static void insertSinCosCall(IRBuilderBase &B, Function *OrigCallee, Value *Arg,
bool UseFloat, Value *&Sin, Value *&Cos,
Value *&SinCos) {
Type *ArgTy = Arg->getType();
@@ -2131,7 +2108,7 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
// x86_64 can't use {float, float} since that would be returned in both
// xmm0 and xmm1, which isn't what a real struct would do.
ResTy = T.getArch() == Triple::x86_64
- ? static_cast<Type *>(VectorType::get(ArgTy, 2))
+ ? static_cast<Type *>(FixedVectorType::get(ArgTy, 2))
: static_cast<Type *>(StructType::get(ArgTy, ArgTy));
} else {
Name = "__sincospi_stret";
@@ -2166,7 +2143,7 @@ static void insertSinCosCall(IRBuilder<> &B, Function *OrigCallee, Value *Arg,
}
}
-Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeSinCosPi(CallInst *CI, IRBuilderBase &B) {
// Make sure the prototype is as expected, otherwise the rest of the
// function is probably invalid and likely to abort.
if (!isTrigLibCall(CI))
@@ -2247,7 +2224,7 @@ void LibCallSimplifier::classifyArgUse(
// Integer Library Call Optimizations
//===----------------------------------------------------------------------===//
-Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilderBase &B) {
// ffs(x) -> x != 0 ? (i32)llvm.cttz(x)+1 : 0
Value *Op = CI->getArgOperand(0);
Type *ArgType = Op->getType();
@@ -2261,7 +2238,7 @@ Value *LibCallSimplifier::optimizeFFS(CallInst *CI, IRBuilder<> &B) {
return B.CreateSelect(Cond, V, B.getInt32(0));
}
-Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilderBase &B) {
// fls(x) -> (i32)(sizeInBits(x) - llvm.ctlz(x, false))
Value *Op = CI->getArgOperand(0);
Type *ArgType = Op->getType();
@@ -2273,7 +2250,7 @@ Value *LibCallSimplifier::optimizeFls(CallInst *CI, IRBuilder<> &B) {
return B.CreateIntCast(V, CI->getType(), false);
}
-Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilderBase &B) {
// abs(x) -> x <s 0 ? -x : x
// The negation has 'nsw' because abs of INT_MIN is undefined.
Value *X = CI->getArgOperand(0);
@@ -2282,7 +2259,7 @@ Value *LibCallSimplifier::optimizeAbs(CallInst *CI, IRBuilder<> &B) {
return B.CreateSelect(IsNeg, NegX, X);
}
-Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilderBase &B) {
// isdigit(c) -> (c-'0') <u 10
Value *Op = CI->getArgOperand(0);
Op = B.CreateSub(Op, B.getInt32('0'), "isdigittmp");
@@ -2290,20 +2267,20 @@ Value *LibCallSimplifier::optimizeIsDigit(CallInst *CI, IRBuilder<> &B) {
return B.CreateZExt(Op, CI->getType());
}
-Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeIsAscii(CallInst *CI, IRBuilderBase &B) {
// isascii(c) -> c <u 128
Value *Op = CI->getArgOperand(0);
Op = B.CreateICmpULT(Op, B.getInt32(128), "isascii");
return B.CreateZExt(Op, CI->getType());
}
-Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeToAscii(CallInst *CI, IRBuilderBase &B) {
// toascii(c) -> c & 0x7f
return B.CreateAnd(CI->getArgOperand(0),
ConstantInt::get(CI->getType(), 0x7F));
}
-Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilderBase &B) {
StringRef Str;
if (!getConstantStringInfo(CI->getArgOperand(0), Str))
return nullptr;
@@ -2311,7 +2288,7 @@ Value *LibCallSimplifier::optimizeAtoi(CallInst *CI, IRBuilder<> &B) {
return convertStrToNumber(CI, Str, 10);
}
-Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilderBase &B) {
StringRef Str;
if (!getConstantStringInfo(CI->getArgOperand(0), Str))
return nullptr;
@@ -2332,7 +2309,7 @@ Value *LibCallSimplifier::optimizeStrtol(CallInst *CI, IRBuilder<> &B) {
static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg);
-Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilder<> &B,
+Value *LibCallSimplifier::optimizeErrorReporting(CallInst *CI, IRBuilderBase &B,
int StreamArg) {
Function *Callee = CI->getCalledFunction();
// Error reporting calls should be cold, mark them as such.
@@ -2372,7 +2349,7 @@ static bool isReportingError(Function *Callee, CallInst *CI, int StreamArg) {
return GV->getName() == "stderr";
}
-Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilderBase &B) {
// Check for a fixed format string.
StringRef FormatStr;
if (!getConstantStringInfo(CI->getArgOperand(0), FormatStr))
@@ -2425,7 +2402,7 @@ Value *LibCallSimplifier::optimizePrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
@@ -2462,7 +2439,8 @@ Value *LibCallSimplifier::optimizePrintF(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
// Check for a fixed format string.
StringRef FormatStr;
if (!getConstantStringInfo(CI->getArgOperand(1), FormatStr))
@@ -2477,8 +2455,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
// sprintf(str, fmt) -> llvm.memcpy(align 1 str, align 1 fmt, strlen(fmt)+1)
B.CreateMemCpy(
- CI->getArgOperand(0), Align::None(), CI->getArgOperand(1),
- Align::None(),
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(1), Align(1),
ConstantInt::get(DL.getIntPtrType(CI->getContext()),
FormatStr.size() + 1)); // Copy the null byte.
return ConstantInt::get(CI->getType(), FormatStr.size());
@@ -2515,8 +2492,8 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
Value *IncLen =
B.CreateAdd(Len, ConstantInt::get(Len->getType(), 1), "leninc");
- B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(2),
- Align::None(), IncLen);
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(2),
+ Align(1), IncLen);
// The sprintf result is the unincremented number of bytes in the string.
return B.CreateIntCast(Len, CI->getType(), false);
@@ -2524,7 +2501,7 @@ Value *LibCallSimplifier::optimizeSPrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
if (Value *V = optimizeSPrintFString(CI, B)) {
@@ -2560,7 +2537,8 @@ Value *LibCallSimplifier::optimizeSPrintF(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
// Check for size
ConstantInt *Size = dyn_cast<ConstantInt>(CI->getArgOperand(1));
if (!Size)
@@ -2587,8 +2565,7 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) {
// snprintf(dst, size, fmt) -> llvm.memcpy(align 1 dst, align 1 fmt,
// strlen(fmt)+1)
B.CreateMemCpy(
- CI->getArgOperand(0), Align::None(), CI->getArgOperand(2),
- Align::None(),
+ CI->getArgOperand(0), Align(1), CI->getArgOperand(2), Align(1),
ConstantInt::get(DL.getIntPtrType(CI->getContext()),
FormatStr.size() + 1)); // Copy the null byte.
return ConstantInt::get(CI->getType(), FormatStr.size());
@@ -2629,9 +2606,8 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) {
else if (N < Str.size() + 1)
return nullptr;
- B.CreateMemCpy(CI->getArgOperand(0), Align::None(), CI->getArgOperand(3),
- Align::None(),
- ConstantInt::get(CI->getType(), Str.size() + 1));
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(3),
+ Align(1), ConstantInt::get(CI->getType(), Str.size() + 1));
// The snprintf result is the unincremented number of bytes in the string.
return ConstantInt::get(CI->getType(), Str.size());
@@ -2640,7 +2616,7 @@ Value *LibCallSimplifier::optimizeSnPrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilderBase &B) {
if (Value *V = optimizeSnPrintFString(CI, B)) {
return V;
}
@@ -2650,7 +2626,8 @@ Value *LibCallSimplifier::optimizeSnPrintF(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI,
+ IRBuilderBase &B) {
optimizeErrorReporting(CI, B, 0);
// All the optimizations depend on the format string.
@@ -2699,7 +2676,7 @@ Value *LibCallSimplifier::optimizeFPrintFString(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilderBase &B) {
Function *Callee = CI->getCalledFunction();
FunctionType *FT = Callee->getFunctionType();
if (Value *V = optimizeFPrintFString(CI, B)) {
@@ -2734,7 +2711,7 @@ Value *LibCallSimplifier::optimizeFPrintF(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilderBase &B) {
optimizeErrorReporting(CI, B, 3);
// Get the element size and count.
@@ -2757,15 +2734,10 @@ Value *LibCallSimplifier::optimizeFWrite(CallInst *CI, IRBuilder<> &B) {
}
}
- if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI))
- return emitFWriteUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), CI->getArgOperand(3), B, DL,
- TLI);
-
return nullptr;
}
-Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilderBase &B) {
optimizeErrorReporting(CI, B, 1);
// Don't rewrite fputs to fwrite when optimising for size because fwrite
@@ -2776,15 +2748,9 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
if (OptForSize)
return nullptr;
- // Check if has any use
- if (!CI->use_empty()) {
- if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI))
- return emitFPutSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
- TLI);
- else
- // We can't optimize if return value is used.
- return nullptr;
- }
+ // We can't optimize if return value is used.
+ if (!CI->use_empty())
+ return nullptr;
// fputs(s,F) --> fwrite(s,strlen(s),1,F)
uint64_t Len = GetStringLength(CI->getArgOperand(0));
@@ -2798,41 +2764,7 @@ Value *LibCallSimplifier::optimizeFPuts(CallInst *CI, IRBuilder<> &B) {
CI->getArgOperand(1), B, DL, TLI);
}
-Value *LibCallSimplifier::optimizeFPutc(CallInst *CI, IRBuilder<> &B) {
- optimizeErrorReporting(CI, B, 1);
-
- if (isLocallyOpenedFile(CI->getArgOperand(1), CI, B, TLI))
- return emitFPutCUnlocked(CI->getArgOperand(0), CI->getArgOperand(1), B,
- TLI);
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFGetc(CallInst *CI, IRBuilder<> &B) {
- if (isLocallyOpenedFile(CI->getArgOperand(0), CI, B, TLI))
- return emitFGetCUnlocked(CI->getArgOperand(0), B, TLI);
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFGets(CallInst *CI, IRBuilder<> &B) {
- if (isLocallyOpenedFile(CI->getArgOperand(2), CI, B, TLI))
- return emitFGetSUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), B, TLI);
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizeFRead(CallInst *CI, IRBuilder<> &B) {
- if (isLocallyOpenedFile(CI->getArgOperand(3), CI, B, TLI))
- return emitFReadUnlocked(CI->getArgOperand(0), CI->getArgOperand(1),
- CI->getArgOperand(2), CI->getArgOperand(3), B, DL,
- TLI);
-
- return nullptr;
-}
-
-Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilderBase &B) {
annotateNonNullBasedOnAccess(CI, 0);
if (!CI->use_empty())
return nullptr;
@@ -2846,11 +2778,10 @@ Value *LibCallSimplifier::optimizePuts(CallInst *CI, IRBuilder<> &B) {
return nullptr;
}
-Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilder<> &B) {
+Value *LibCallSimplifier::optimizeBCopy(CallInst *CI, IRBuilderBase &B) {
// bcopy(src, dst, n) -> llvm.memmove(dst, src, n)
- return B.CreateMemMove(CI->getArgOperand(1), Align::None(),
- CI->getArgOperand(0), Align::None(),
- CI->getArgOperand(2));
+ return B.CreateMemMove(CI->getArgOperand(1), Align(1), CI->getArgOperand(0),
+ Align(1), CI->getArgOperand(2));
}
bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
@@ -2863,7 +2794,7 @@ bool LibCallSimplifier::hasFloatVersion(StringRef FuncName) {
}
Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
- IRBuilder<> &Builder) {
+ IRBuilderBase &Builder) {
LibFunc Func;
Function *Callee = CI->getCalledFunction();
// Check for string/memory library functions.
@@ -2944,7 +2875,7 @@ Value *LibCallSimplifier::optimizeStringMemoryLibCall(CallInst *CI,
Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
LibFunc Func,
- IRBuilder<> &Builder) {
+ IRBuilderBase &Builder) {
// Don't optimize calls that require strict floating point semantics.
if (CI->isStrictFP())
return nullptr;
@@ -3000,6 +2931,8 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
return replaceUnaryCall(CI, Builder, Intrinsic::floor);
case LibFunc_round:
return replaceUnaryCall(CI, Builder, Intrinsic::round);
+ case LibFunc_roundeven:
+ return replaceUnaryCall(CI, Builder, Intrinsic::roundeven);
case LibFunc_nearbyint:
return replaceUnaryCall(CI, Builder, Intrinsic::nearbyint);
case LibFunc_rint:
@@ -3044,7 +2977,7 @@ Value *LibCallSimplifier::optimizeFloatingPointLibCall(CallInst *CI,
}
}
-Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
+Value *LibCallSimplifier::optimizeCall(CallInst *CI, IRBuilderBase &Builder) {
// TODO: Split out the code below that operates on FP calls so that
// we can allow all non-FP calls with the StrictFP attribute to be
// optimized.
@@ -3053,11 +2986,13 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
LibFunc Func;
Function *Callee = CI->getCalledFunction();
+ bool isCallingConvC = isCallingConvCCompatible(CI);
SmallVector<OperandBundleDef, 2> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
- IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
- bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ IRBuilderBase::OperandBundlesGuard Guard(Builder);
+ Builder.setDefaultOperandBundles(OpBundles);
// Command-line parameter overrides instruction attribute.
// This can't be moved to optimizeFloatingPointLibCall() because it may be
@@ -3097,14 +3032,20 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
}
// Also try to simplify calls to fortified library functions.
- if (Value *SimplifiedFortifiedCI = FortifiedSimplifier.optimizeCall(CI)) {
+ if (Value *SimplifiedFortifiedCI =
+ FortifiedSimplifier.optimizeCall(CI, Builder)) {
// Try to further simplify the result.
CallInst *SimplifiedCI = dyn_cast<CallInst>(SimplifiedFortifiedCI);
if (SimplifiedCI && SimplifiedCI->getCalledFunction()) {
- // Use an IR Builder from SimplifiedCI if available instead of CI
- // to guarantee we reach all uses we might replace later on.
- IRBuilder<> TmpBuilder(SimplifiedCI);
- if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, TmpBuilder)) {
+ // Ensure that SimplifiedCI's uses are complete, since some calls have
+ // their uses analyzed.
+ replaceAllUsesWith(CI, SimplifiedCI);
+
+ // Set insertion point to SimplifiedCI to guarantee we reach all uses
+ // we might replace later on.
+ IRBuilderBase::InsertPointGuard Guard(Builder);
+ Builder.SetInsertPoint(SimplifiedCI);
+ if (Value *V = optimizeStringMemoryLibCall(SimplifiedCI, Builder)) {
// If we were able to further simplify, remove the now redundant call.
substituteInParent(SimplifiedCI, V);
return V;
@@ -3158,16 +3099,8 @@ Value *LibCallSimplifier::optimizeCall(CallInst *CI) {
return optimizeFPrintF(CI, Builder);
case LibFunc_fwrite:
return optimizeFWrite(CI, Builder);
- case LibFunc_fread:
- return optimizeFRead(CI, Builder);
case LibFunc_fputs:
return optimizeFPuts(CI, Builder);
- case LibFunc_fgets:
- return optimizeFGets(CI, Builder);
- case LibFunc_fputc:
- return optimizeFPutc(CI, Builder);
- case LibFunc_fgetc:
- return optimizeFGetc(CI, Builder);
case LibFunc_puts:
return optimizePuts(CI, Builder);
case LibFunc_perror:
@@ -3280,11 +3213,11 @@ FortifiedLibCallSimplifier::isFortifiedCallFoldable(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3, 2)) {
- CallInst *NewCI = B.CreateMemCpy(CI->getArgOperand(0), Align::None(),
- CI->getArgOperand(1), Align::None(),
- CI->getArgOperand(2));
+ CallInst *NewCI =
+ B.CreateMemCpy(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
+ Align(1), CI->getArgOperand(2));
NewCI->setAttributes(CI->getAttributes());
return CI->getArgOperand(0);
}
@@ -3292,11 +3225,11 @@ Value *FortifiedLibCallSimplifier::optimizeMemCpyChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3, 2)) {
- CallInst *NewCI = B.CreateMemMove(CI->getArgOperand(0), Align::None(),
- CI->getArgOperand(1), Align::None(),
- CI->getArgOperand(2));
+ CallInst *NewCI =
+ B.CreateMemMove(CI->getArgOperand(0), Align(1), CI->getArgOperand(1),
+ Align(1), CI->getArgOperand(2));
NewCI->setAttributes(CI->getAttributes());
return CI->getArgOperand(0);
}
@@ -3304,13 +3237,13 @@ Value *FortifiedLibCallSimplifier::optimizeMemMoveChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
// TODO: Try foldMallocMemset() here.
if (isFortifiedCallFoldable(CI, 3, 2)) {
Value *Val = B.CreateIntCast(CI->getArgOperand(1), B.getInt8Ty(), false);
CallInst *NewCI = B.CreateMemSet(CI->getArgOperand(0), Val,
- CI->getArgOperand(2), Align::None());
+ CI->getArgOperand(2), Align(1));
NewCI->setAttributes(CI->getAttributes());
return CI->getArgOperand(0);
}
@@ -3318,7 +3251,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemSetChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
- IRBuilder<> &B,
+ IRBuilderBase &B,
LibFunc Func) {
const DataLayout &DL = CI->getModule()->getDataLayout();
Value *Dst = CI->getArgOperand(0), *Src = CI->getArgOperand(1),
@@ -3362,8 +3295,16 @@ Value *FortifiedLibCallSimplifier::optimizeStrpCpyChk(CallInst *CI,
return Ret;
}
+Value *FortifiedLibCallSimplifier::optimizeStrLenChk(CallInst *CI,
+ IRBuilderBase &B) {
+ if (isFortifiedCallFoldable(CI, 1, None, 0))
+ return emitStrLen(CI->getArgOperand(0), B, CI->getModule()->getDataLayout(),
+ TLI);
+ return nullptr;
+}
+
Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
- IRBuilder<> &B,
+ IRBuilderBase &B,
LibFunc Func) {
if (isFortifiedCallFoldable(CI, 3, 2)) {
if (Func == LibFunc_strncpy_chk)
@@ -3378,7 +3319,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrpNCpyChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 4, 3))
return emitMemCCpy(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), CI->getArgOperand(3), B, TLI);
@@ -3387,7 +3328,7 @@ Value *FortifiedLibCallSimplifier::optimizeMemCCpyChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3, 1, None, 2)) {
SmallVector<Value *, 8> VariadicArgs(CI->arg_begin() + 5, CI->arg_end());
return emitSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
@@ -3398,7 +3339,7 @@ Value *FortifiedLibCallSimplifier::optimizeSNPrintfChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 2, None, None, 1)) {
SmallVector<Value *, 8> VariadicArgs(CI->arg_begin() + 4, CI->arg_end());
return emitSPrintf(CI->getArgOperand(0), CI->getArgOperand(3), VariadicArgs,
@@ -3409,7 +3350,7 @@ Value *FortifiedLibCallSimplifier::optimizeSPrintfChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 2))
return emitStrCat(CI->getArgOperand(0), CI->getArgOperand(1), B, TLI);
@@ -3417,7 +3358,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrCatChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3))
return emitStrLCat(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), B, TLI);
@@ -3426,7 +3367,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCat(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3))
return emitStrNCat(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), B, TLI);
@@ -3435,7 +3376,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrNCatChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3))
return emitStrLCpy(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(2), B, TLI);
@@ -3444,7 +3385,7 @@ Value *FortifiedLibCallSimplifier::optimizeStrLCpyChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 3, 1, None, 2))
return emitVSNPrintf(CI->getArgOperand(0), CI->getArgOperand(1),
CI->getArgOperand(4), CI->getArgOperand(5), B, TLI);
@@ -3453,7 +3394,7 @@ Value *FortifiedLibCallSimplifier::optimizeVSNPrintfChk(CallInst *CI,
}
Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
- IRBuilder<> &B) {
+ IRBuilderBase &B) {
if (isFortifiedCallFoldable(CI, 2, None, None, 1))
return emitVSPrintf(CI->getArgOperand(0), CI->getArgOperand(3),
CI->getArgOperand(4), B, TLI);
@@ -3461,7 +3402,8 @@ Value *FortifiedLibCallSimplifier::optimizeVSPrintfChk(CallInst *CI,
return nullptr;
}
-Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
+Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI,
+ IRBuilderBase &Builder) {
// FIXME: We shouldn't be changing "nobuiltin" or TLI unavailable calls here.
// Some clang users checked for _chk libcall availability using:
// __has_builtin(__builtin___memcpy_chk)
@@ -3477,11 +3419,13 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
LibFunc Func;
Function *Callee = CI->getCalledFunction();
+ bool isCallingConvC = isCallingConvCCompatible(CI);
SmallVector<OperandBundleDef, 2> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
- IRBuilder<> Builder(CI, /*FPMathTag=*/nullptr, OpBundles);
- bool isCallingConvC = isCallingConvCCompatible(CI);
+
+ IRBuilderBase::OperandBundlesGuard Guard(Builder);
+ Builder.setDefaultOperandBundles(OpBundles);
// First, check that this is a known library function and that the prototype
// is correct.
@@ -3502,6 +3446,8 @@ Value *FortifiedLibCallSimplifier::optimizeCall(CallInst *CI) {
case LibFunc_stpcpy_chk:
case LibFunc_strcpy_chk:
return optimizeStrpCpyChk(CI, Builder, Func);
+ case LibFunc_strlen_chk:
+ return optimizeStrLenChk(CI, Builder);
case LibFunc_stpncpy_chk:
case LibFunc_strncpy_chk:
return optimizeStrpNCpyChk(CI, Builder, Func);
diff --git a/llvm/lib/Transforms/Utils/SizeOpts.cpp b/llvm/lib/Transforms/Utils/SizeOpts.cpp
index d2a400027d4b..e257c5a015f5 100644
--- a/llvm/lib/Transforms/Utils/SizeOpts.cpp
+++ b/llvm/lib/Transforms/Utils/SizeOpts.cpp
@@ -24,10 +24,25 @@ cl::opt<bool> PGSOLargeWorkingSetSizeOnly(
"if the working set size is large (except for cold code.)"));
cl::opt<bool> PGSOColdCodeOnly(
- "pgso-cold-code-only", cl::Hidden, cl::init(true),
+ "pgso-cold-code-only", cl::Hidden, cl::init(false),
cl::desc("Apply the profile guided size optimizations only "
"to cold code."));
+cl::opt<bool> PGSOColdCodeOnlyForInstrPGO(
+ "pgso-cold-code-only-for-instr-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under instrumentation PGO."));
+
+cl::opt<bool> PGSOColdCodeOnlyForSamplePGO(
+ "pgso-cold-code-only-for-sample-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under sample PGO."));
+
+cl::opt<bool> PGSOColdCodeOnlyForPartialSamplePGO(
+ "pgso-cold-code-only-for-partial-sample-pgo", cl::Hidden, cl::init(false),
+ cl::desc("Apply the profile guided size optimizations only "
+ "to cold code under partial-profile sample PGO."));
+
cl::opt<bool> PGSOIRPassOrTestOnly(
"pgso-ir-pass-or-test-only", cl::Hidden, cl::init(false),
cl::desc("Apply the profile guided size optimizations only"
@@ -38,12 +53,12 @@ cl::opt<bool> ForcePGSO(
cl::desc("Force the (profiled-guided) size optimizations. "));
cl::opt<int> PgsoCutoffInstrProf(
- "pgso-cutoff-instr-prof", cl::Hidden, cl::init(250000), cl::ZeroOrMore,
+ "pgso-cutoff-instr-prof", cl::Hidden, cl::init(950000), cl::ZeroOrMore,
cl::desc("The profile guided size optimization profile summary cutoff "
"for instrumentation profile."));
cl::opt<int> PgsoCutoffSampleProf(
- "pgso-cutoff-sample-prof", cl::Hidden, cl::init(800000), cl::ZeroOrMore,
+ "pgso-cutoff-sample-prof", cl::Hidden, cl::init(990000), cl::ZeroOrMore,
cl::desc("The profile guided size optimization profile summary cutoff "
"for sample profile."));
@@ -60,6 +75,12 @@ struct BasicBlockBFIAdapter {
BlockFrequencyInfo &BFI) {
return PSI->isFunctionHotInCallGraphNthPercentile(CutOff, F, BFI);
}
+ static bool isFunctionColdInCallGraphNthPercentile(int CutOff,
+ const Function *F,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo &BFI) {
+ return PSI->isFunctionColdInCallGraphNthPercentile(CutOff, F, BFI);
+ }
static bool isColdBlock(const BasicBlock *BB,
ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI) {
@@ -71,6 +92,11 @@ struct BasicBlockBFIAdapter {
BlockFrequencyInfo *BFI) {
return PSI->isHotBlockNthPercentile(CutOff, BB, BFI);
}
+ static bool isColdBlockNthPercentile(int CutOff, const BasicBlock *BB,
+ ProfileSummaryInfo *PSI,
+ BlockFrequencyInfo *BFI) {
+ return PSI->isColdBlockNthPercentile(CutOff, BB, BFI);
+ }
};
} // end anonymous namespace
@@ -84,6 +110,7 @@ bool llvm::shouldOptimizeForSize(const Function *F, ProfileSummaryInfo *PSI,
bool llvm::shouldOptimizeForSize(const BasicBlock *BB, ProfileSummaryInfo *PSI,
BlockFrequencyInfo *BFI,
PGSOQueryType QueryType) {
+ assert(BB);
return shouldOptimizeForSizeImpl<BasicBlockBFIAdapter>(BB, PSI, BFI,
QueryType);
}
diff --git a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
index 7880ea1c6c47..b559811d120b 100644
--- a/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
+++ b/llvm/lib/Transforms/Utils/StripGCRelocates.cpp
@@ -48,7 +48,7 @@ bool StripGCRelocates::runOnFunction(Function &F) {
// i.e. not bound to a single statepoint token.
for (Instruction &I : instructions(F)) {
if (auto *GCR = dyn_cast<GCRelocateInst>(&I))
- if (isStatepoint(GCR->getOperand(0)))
+ if (isa<GCStatepointInst>(GCR->getOperand(0)))
GCRelocates.push_back(GCR);
}
// All gc.relocates are bound to a single statepoint token. The order of
diff --git a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
index aacf81d83519..ec4ea848a5d4 100644
--- a/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
+++ b/llvm/lib/Transforms/Utils/SymbolRewriter.cpp
@@ -117,8 +117,9 @@ public:
const std::string Target;
ExplicitRewriteDescriptor(StringRef S, StringRef T, const bool Naked)
- : RewriteDescriptor(DT), Source(Naked ? StringRef("\01" + S.str()) : S),
- Target(T) {}
+ : RewriteDescriptor(DT),
+ Source(std::string(Naked ? StringRef("\01" + S.str()) : S)),
+ Target(std::string(T)) {}
bool performOnModule(Module &M) override;
@@ -159,7 +160,8 @@ public:
const std::string Transform;
PatternRewriteDescriptor(StringRef P, StringRef T)
- : RewriteDescriptor(DT), Pattern(P), Transform(T) { }
+ : RewriteDescriptor(DT), Pattern(std::string(P)),
+ Transform(std::string(T)) {}
bool performOnModule(Module &M) override;
@@ -189,7 +191,7 @@ performOnModule(Module &M) {
continue;
if (GlobalObject *GO = dyn_cast<GlobalObject>(&C))
- rewriteComdat(M, GO, C.getName(), Name);
+ rewriteComdat(M, GO, std::string(C.getName()), Name);
if (Value *V = (M.*Get)(Name))
C.setValueName(V->getValueName());
@@ -352,19 +354,19 @@ parseRewriteFunctionDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
if (KeyValue.equals("source")) {
std::string Error;
- Source = Value->getValue(ValueStorage);
+ Source = std::string(Value->getValue(ValueStorage));
if (!Regex(Source).isValid(Error)) {
YS.printError(Field.getKey(), "invalid regex: " + Error);
return false;
}
} else if (KeyValue.equals("target")) {
- Target = Value->getValue(ValueStorage);
+ Target = std::string(Value->getValue(ValueStorage));
} else if (KeyValue.equals("transform")) {
- Transform = Value->getValue(ValueStorage);
+ Transform = std::string(Value->getValue(ValueStorage));
} else if (KeyValue.equals("naked")) {
std::string Undecorated;
- Undecorated = Value->getValue(ValueStorage);
+ Undecorated = std::string(Value->getValue(ValueStorage));
Naked = StringRef(Undecorated).lower() == "true" || Undecorated == "1";
} else {
YS.printError(Field.getKey(), "unknown key for function");
@@ -421,15 +423,15 @@ parseRewriteGlobalVariableDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
if (KeyValue.equals("source")) {
std::string Error;
- Source = Value->getValue(ValueStorage);
+ Source = std::string(Value->getValue(ValueStorage));
if (!Regex(Source).isValid(Error)) {
YS.printError(Field.getKey(), "invalid regex: " + Error);
return false;
}
} else if (KeyValue.equals("target")) {
- Target = Value->getValue(ValueStorage);
+ Target = std::string(Value->getValue(ValueStorage));
} else if (KeyValue.equals("transform")) {
- Transform = Value->getValue(ValueStorage);
+ Transform = std::string(Value->getValue(ValueStorage));
} else {
YS.printError(Field.getKey(), "unknown Key for Global Variable");
return false;
@@ -484,15 +486,15 @@ parseRewriteGlobalAliasDescriptor(yaml::Stream &YS, yaml::ScalarNode *K,
if (KeyValue.equals("source")) {
std::string Error;
- Source = Value->getValue(ValueStorage);
+ Source = std::string(Value->getValue(ValueStorage));
if (!Regex(Source).isValid(Error)) {
YS.printError(Field.getKey(), "invalid regex: " + Error);
return false;
}
} else if (KeyValue.equals("target")) {
- Target = Value->getValue(ValueStorage);
+ Target = std::string(Value->getValue(ValueStorage));
} else if (KeyValue.equals("transform")) {
- Transform = Value->getValue(ValueStorage);
+ Transform = std::string(Value->getValue(ValueStorage));
} else {
YS.printError(Field.getKey(), "unknown key for Global Alias");
return false;
diff --git a/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
new file mode 100644
index 000000000000..b10deee3907c
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/UnifyLoopExits.cpp
@@ -0,0 +1,220 @@
+//===- UnifyLoopExits.cpp - Redirect exiting edges to one block -*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// For each natural loop with multiple exit blocks, this pass creates a new
+// block N such that all exiting blocks now branch to N, and then control flow
+// is redistributed to all the original exit blocks.
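+//
+// Illustrative sketch (hypothetical CFG, block names invented): with two
+// exiting blocks the transformation looks like
+//
+//   Before:  exiting1 -> exitA        After:  exiting1 -> N
+//            exiting2 -> exitB                exiting2 -> N
+//                                             N -> exitA, exitB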
+//
+// Limitation: This assumes that all terminators in the CFG are direct branches
+// (the "br" instruction). The presence of any other control flow
+// such as indirectbr, switch or callbr will cause an assert.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Analysis/LoopInfo.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Transforms/Utils.h"
+#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+
+#define DEBUG_TYPE "unify-loop-exits"
+
+using namespace llvm;
+
+namespace {
+struct UnifyLoopExits : public FunctionPass {
+ static char ID;
+ UnifyLoopExits() : FunctionPass(ID) {
+ initializeUnifyLoopExitsPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequiredID(LowerSwitchID);
+ AU.addRequired<LoopInfoWrapperPass>();
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addPreservedID(LowerSwitchID);
+ AU.addPreserved<LoopInfoWrapperPass>();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ }
+
+ bool runOnFunction(Function &F) override;
+};
+} // namespace
+
+char UnifyLoopExits::ID = 0;
+
+FunctionPass *llvm::createUnifyLoopExitsPass() { return new UnifyLoopExits(); }
+
+INITIALIZE_PASS_BEGIN(UnifyLoopExits, "unify-loop-exits",
+ "Fixup each natural loop to have a single exit block",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+INITIALIZE_PASS_DEPENDENCY(LowerSwitch)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass)
+INITIALIZE_PASS_END(UnifyLoopExits, "unify-loop-exits",
+ "Fixup each natural loop to have a single exit block",
+ false /* Only looks at CFG */, false /* Analysis Pass */)
+
+// The current transform introduces new control flow paths which may break the
+// SSA requirement that every def must dominate all its uses. For example,
+// consider a value D defined inside the loop that is used by some instruction
+// U outside the loop. It follows that D dominates U, since the original
+// program has valid SSA form. After merging the exits, all paths from D to U
+// now flow through the unified exit block. In addition, there may be other
+// paths that do not pass through D, but now reach the unified exit
+// block. Thus, D no longer dominates U.
+//
+// Restore the dominance by creating a phi for each such D at the new unified
+// loop exit. But when doing this, ignore any uses U that are in the new unified
+// loop exit, since those were introduced specially when the block was created.
+//
+// The use of SSAUpdater seems like overkill for this operation. The location
+// for creating the new PHI is well-known, and also the set of incoming blocks
+// to the new PHI.
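+//
+// Illustrative sketch (hypothetical IR, names invented): for a value %d
+// defined in exiting block %e1 and used outside the loop, the unified exit
+// receives
+//
+//   loop.exit:
+//     %d.moved = phi i32 [ %d, %e1 ], [ undef, %e2 ]
+//
+// and the external users of %d are rewritten to use %d.moved.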
+static void restoreSSA(const DominatorTree &DT, const Loop *L,
+ const SetVector<BasicBlock *> &Incoming,
+ BasicBlock *LoopExitBlock) {
+ using InstVector = SmallVector<Instruction *, 8>;
+ using IIMap = DenseMap<Instruction *, InstVector>;
+ IIMap ExternalUsers;
+ for (auto BB : L->blocks()) {
+ for (auto &I : *BB) {
+ for (auto &U : I.uses()) {
+ auto UserInst = cast<Instruction>(U.getUser());
+ auto UserBlock = UserInst->getParent();
+ if (UserBlock == LoopExitBlock)
+ continue;
+ if (L->contains(UserBlock))
+ continue;
+ LLVM_DEBUG(dbgs() << "added ext use for " << I.getName() << "("
+ << BB->getName() << ")"
+ << ": " << UserInst->getName() << "("
+ << UserBlock->getName() << ")"
+ << "\n");
+ ExternalUsers[&I].push_back(UserInst);
+ }
+ }
+ }
+
+ for (auto II : ExternalUsers) {
+ // For each Def used outside the loop, create NewPhi in
+ // LoopExitBlock. NewPhi receives Def only along exiting blocks that
+ // dominate it, while the remaining values are undefined since those paths
+ // didn't exist in the original CFG.
+ auto Def = II.first;
+ LLVM_DEBUG(dbgs() << "externally used: " << Def->getName() << "\n");
+ auto NewPhi = PHINode::Create(Def->getType(), Incoming.size(),
+ Def->getName() + ".moved",
+ LoopExitBlock->getTerminator());
+ for (auto In : Incoming) {
+ LLVM_DEBUG(dbgs() << "predecessor " << In->getName() << ": ");
+ if (Def->getParent() == In || DT.dominates(Def, In)) {
+ LLVM_DEBUG(dbgs() << "dominated\n");
+ NewPhi->addIncoming(Def, In);
+ } else {
+ LLVM_DEBUG(dbgs() << "not dominated\n");
+ NewPhi->addIncoming(UndefValue::get(Def->getType()), In);
+ }
+ }
+
+ LLVM_DEBUG(dbgs() << "external users:");
+ for (auto U : II.second) {
+ LLVM_DEBUG(dbgs() << " " << U->getName());
+ U->replaceUsesOfWith(Def, NewPhi);
+ }
+ LLVM_DEBUG(dbgs() << "\n");
+ }
+}
+
+static bool unifyLoopExits(DominatorTree &DT, LoopInfo &LI, Loop *L) {
+ // To unify the loop exits, we need a list of the exiting blocks as
+ // well as exit blocks. The functions for locating these lists both
+ // traverse the entire loop body. It is more efficient to first
+ // locate the exiting blocks and then examine their successors to
+ // locate the exit blocks.
+ SetVector<BasicBlock *> ExitingBlocks;
+ SetVector<BasicBlock *> Exits;
+
+ // We need SetVectors, but the Loop API takes a vector, so we use a temporary.
+ SmallVector<BasicBlock *, 8> Temp;
+ L->getExitingBlocks(Temp);
+ for (auto BB : Temp) {
+ ExitingBlocks.insert(BB);
+ for (auto S : successors(BB)) {
+ auto SL = LI.getLoopFor(S);
+ // A successor is not an exit if it is directly or indirectly in the
+ // current loop.
+ if (SL == L || L->contains(SL))
+ continue;
+ Exits.insert(S);
+ }
+ }
+
+ LLVM_DEBUG(
+ dbgs() << "Found exit blocks:";
+ for (auto Exit : Exits) {
+ dbgs() << " " << Exit->getName();
+ }
+ dbgs() << "\n";
+
+ dbgs() << "Found exiting blocks:";
+ for (auto EB : ExitingBlocks) {
+ dbgs() << " " << EB->getName();
+ }
+ dbgs() << "\n";);
+
+ if (Exits.size() <= 1) {
+ LLVM_DEBUG(dbgs() << "loop does not have multiple exits; nothing to do\n");
+ return false;
+ }
+
+ SmallVector<BasicBlock *, 8> GuardBlocks;
+ DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Eager);
+ auto LoopExitBlock = CreateControlFlowHub(&DTU, GuardBlocks, ExitingBlocks,
+ Exits, "loop.exit");
+
+ restoreSSA(DT, L, ExitingBlocks, LoopExitBlock);
+
+#if defined(EXPENSIVE_CHECKS)
+ assert(DT.verify(DominatorTree::VerificationLevel::Full));
+#else
+ assert(DT.verify(DominatorTree::VerificationLevel::Fast));
+#endif // EXPENSIVE_CHECKS
+ L->verifyLoop();
+
+ // The guard blocks were created outside the loop, so they need to become
+ // members of the parent loop.
+ if (auto ParentLoop = L->getParentLoop()) {
+ for (auto G : GuardBlocks) {
+ ParentLoop->addBasicBlockToLoop(G, LI);
+ }
+ ParentLoop->verifyLoop();
+ }
+
+#if defined(EXPENSIVE_CHECKS)
+ LI.verify(DT);
+#endif // EXPENSIVE_CHECKS
+
+ return true;
+}
+
+bool UnifyLoopExits::runOnFunction(Function &F) {
+ LLVM_DEBUG(dbgs() << "===== Unifying loop exits in function " << F.getName()
+ << "\n");
+ auto &LI = getAnalysis<LoopInfoWrapperPass>().getLoopInfo();
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+
+ bool Changed = false;
+ auto Loops = LI.getLoopsInPreorder();
+ for (auto L : Loops) {
+ LLVM_DEBUG(dbgs() << "Loop: " << L->getHeader()->getName() << " (depth: "
+ << LI.getLoopDepth(L->getHeader()) << ")\n");
+ Changed |= unifyLoopExits(DT, LI, L);
+ }
+ return Changed;
+}
diff --git a/llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp b/llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
new file mode 100644
index 000000000000..5b58548e54dc
--- /dev/null
+++ b/llvm/lib/Transforms/Utils/UniqueInternalLinkageNames.cpp
@@ -0,0 +1,97 @@
+//===- UniqueInternalLinkageNames.cpp - Unique Internal Linkage Sym Names -===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file implements unique naming of internal linkage symbols with the
+// option -funique-internal-linkage-symbols.
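+//
+// Illustrative sketch (hypothetical names): for a module whose source file is
+// a.cpp, an internal-linkage function @foo is renamed to @foo.<hash>, where
+// <hash> is the 32-character hex MD5 of the source file name, so equally
+// named internal symbols from different files get distinct names.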
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Utils/UniqueInternalLinkageNames.h"
+#include "llvm/ADT/SmallString.h"
+#include "llvm/IR/Module.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Support/MD5.h"
+#include "llvm/Transforms/Utils/ModuleUtils.h"
+
+using namespace llvm;
+
+static bool uniqueifyInternalLinkageNames(Module &M) {
+ llvm::MD5 Md5;
+ Md5.update(M.getSourceFileName());
+ llvm::MD5::MD5Result R;
+ Md5.final(R);
+ SmallString<32> Str;
+ llvm::MD5::stringifyResult(R, Str);
+ std::string ModuleNameHash = (Twine(".") + Twine(Str)).str();
+ bool Changed = false;
+
+ // Append the module hash to all internal linkage functions.
+ for (auto &F : M) {
+ if (F.hasInternalLinkage()) {
+ F.setName(F.getName() + ModuleNameHash);
+ Changed = true;
+ }
+ }
+
+ // Append the module hash to all internal linkage globals.
+ for (auto &GV : M.globals()) {
+ if (GV.hasInternalLinkage()) {
+ GV.setName(GV.getName() + ModuleNameHash);
+ Changed = true;
+ }
+ }
+ return Changed;
+}
+
+namespace {
+
+// Legacy pass that gives every internal linkage symbol a unique name.
+class UniqueInternalLinkageNamesLegacyPass : public ModulePass {
+
+public:
+ /// Pass identification, replacement for typeid
+ static char ID;
+
+ /// Specify pass name for debug output
+ StringRef getPassName() const override {
+ return "Unique Internal Linkage Names";
+ }
+
+ explicit UniqueInternalLinkageNamesLegacyPass() : ModulePass(ID) {
+ initializeUniqueInternalLinkageNamesLegacyPassPass(
+ *PassRegistry::getPassRegistry());
+ }
+
+ bool runOnModule(Module &M) override {
+ return uniqueifyInternalLinkageNames(M);
+ }
+};
+
+char UniqueInternalLinkageNamesLegacyPass::ID = 0;
+} // anonymous namespace
+
+PreservedAnalyses
+UniqueInternalLinkageNamesPass::run(Module &M, ModuleAnalysisManager &AM) {
+ if (!uniqueifyInternalLinkageNames(M))
+ return PreservedAnalyses::all();
+
+ return PreservedAnalyses::none();
+}
+
+INITIALIZE_PASS_BEGIN(UniqueInternalLinkageNamesLegacyPass,
+ "unique-internal-linkage-names",
+ "Uniqueify internal linkage names", false, false)
+INITIALIZE_PASS_END(UniqueInternalLinkageNamesLegacyPass,
+ "unique-internal-linkage-names",
+ "Uniqueify Internal linkage names", false, false)
+
+namespace llvm {
+ModulePass *createUniqueInternalLinkageNamesPass() {
+ return new UniqueInternalLinkageNamesLegacyPass();
+}
+} // namespace llvm
diff --git a/llvm/lib/Transforms/Utils/Utils.cpp b/llvm/lib/Transforms/Utils/Utils.cpp
index 7769c7493cda..ce98a739bea8 100644
--- a/llvm/lib/Transforms/Utils/Utils.cpp
+++ b/llvm/lib/Transforms/Utils/Utils.cpp
@@ -24,8 +24,11 @@ using namespace llvm;
/// library.
void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeAddDiscriminatorsLegacyPassPass(Registry);
+ initializeAssumeSimplifyPassLegacyPassPass(Registry);
+ initializeAssumeBuilderPassLegacyPassPass(Registry);
initializeBreakCriticalEdgesPass(Registry);
initializeCanonicalizeAliasesLegacyPassPass(Registry);
+ initializeCanonicalizeFreezeInLoopsPass(Registry);
initializeInstNamerPass(Registry);
initializeLCSSAWrapperPassPass(Registry);
initializeLibCallsShrinkWrapLegacyPassPass(Registry);
@@ -40,6 +43,9 @@ void llvm::initializeTransformUtils(PassRegistry &Registry) {
initializeStripGCRelocatesPass(Registry);
initializePredicateInfoPrinterLegacyPassPass(Registry);
initializeInjectTLIMappingsLegacyPass(Registry);
+ initializeFixIrreduciblePass(Registry);
+ initializeUnifyLoopExitsPass(Registry);
+ initializeUniqueInternalLinkageNamesLegacyPassPass(Registry);
}
/// LLVMInitializeTransformUtils - C binding for initializeTransformUtilsPasses.
diff --git a/llvm/lib/Transforms/Utils/VNCoercion.cpp b/llvm/lib/Transforms/Utils/VNCoercion.cpp
index 591e1fd2dbee..6ff08cd28712 100644
--- a/llvm/lib/Transforms/Utils/VNCoercion.cpp
+++ b/llvm/lib/Transforms/Utils/VNCoercion.cpp
@@ -1,16 +1,18 @@
#include "llvm/Transforms/Utils/VNCoercion.h"
-#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/ConstantFolding.h"
-#include "llvm/Analysis/MemoryDependenceAnalysis.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/IR/IRBuilder.h"
-#include "llvm/IR/IntrinsicInst.h"
#include "llvm/Support/Debug.h"
#define DEBUG_TYPE "vncoerce"
+
namespace llvm {
namespace VNCoercion {
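+/// First-class aggregates (structs and arrays) and scalable vector types
+/// cannot be bitcast to an integer, so coercion bails out on them.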
+static bool isFirstClassAggregateOrScalableType(Type *Ty) {
+ return Ty->isStructTy() || Ty->isArrayTy() || isa<ScalableVectorType>(Ty);
+}
+
/// Return true if coerceAvailableValueToLoadType will succeed.
bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
const DataLayout &DL) {
@@ -18,20 +20,20 @@ bool canCoerceMustAliasedValueToLoad(Value *StoredVal, Type *LoadTy,
if (StoredTy == LoadTy)
return true;
- // If the loaded or stored value is an first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy() || StoredTy->isStructTy() ||
- StoredTy->isArrayTy())
+ // If the loaded/stored value is a first class array/struct, or scalable type,
+ // don't try to transform them. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy) ||
+ isFirstClassAggregateOrScalableType(StoredTy))
return false;
- uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy);
+ uint64_t StoreSize = DL.getTypeSizeInBits(StoredTy).getFixedSize();
// The store size must be byte-aligned to support future type casts.
if (llvm::alignTo(StoreSize, 8) != StoreSize)
return false;
// The store has to be at least as big as the load.
- if (StoreSize < DL.getTypeSizeInBits(LoadTy))
+ if (StoreSize < DL.getTypeSizeInBits(LoadTy).getFixedSize())
return false;
// Don't coerce non-integral pointers to integers or vice versa.
@@ -55,14 +57,13 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
assert(canCoerceMustAliasedValueToLoad(StoredVal, LoadedTy, DL) &&
"precondition violation - materialization can't fail");
if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
+ StoredVal = ConstantFoldConstant(C, DL);
// If this is already the right type, just return it.
Type *StoredValTy = StoredVal->getType();
- uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy);
- uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy);
+ uint64_t StoredValSize = DL.getTypeSizeInBits(StoredValTy).getFixedSize();
+ uint64_t LoadedValSize = DL.getTypeSizeInBits(LoadedTy).getFixedSize();
// If the store and reload are the same size, we can always reuse it.
if (StoredValSize == LoadedValSize) {
@@ -89,8 +90,7 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
}
if (auto *C = dyn_cast<ConstantExpr>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
+ StoredVal = ConstantFoldConstant(C, DL);
return StoredVal;
}
@@ -115,8 +115,8 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
// If this is a big-endian system, we need to shift the value down to the low
// bits so that a truncate will work.
if (DL.isBigEndian()) {
- uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy) -
- DL.getTypeStoreSizeInBits(LoadedTy);
+ uint64_t ShiftAmt = DL.getTypeStoreSizeInBits(StoredValTy).getFixedSize() -
+ DL.getTypeStoreSizeInBits(LoadedTy).getFixedSize();
StoredVal = Helper.CreateLShr(
StoredVal, ConstantInt::get(StoredVal->getType(), ShiftAmt));
}
@@ -135,8 +135,7 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
}
if (auto *C = dyn_cast<Constant>(StoredVal))
- if (auto *FoldedStoredVal = ConstantFoldConstant(C, DL))
- StoredVal = FoldedStoredVal;
+ StoredVal = ConstantFoldConstant(C, DL);
return StoredVal;
}
@@ -148,7 +147,8 @@ static T *coerceAvailableValueToLoadTypeHelper(T *StoredVal, Type *LoadedTy,
///
/// If we can't do it, return null.
Value *coerceAvailableValueToLoadType(Value *StoredVal, Type *LoadedTy,
- IRBuilder<> &IRB, const DataLayout &DL) {
+ IRBuilderBase &IRB,
+ const DataLayout &DL) {
return coerceAvailableValueToLoadTypeHelper(StoredVal, LoadedTy, IRB, DL);
}
@@ -164,9 +164,9 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
Value *WritePtr,
uint64_t WriteSizeInBits,
const DataLayout &DL) {
- // If the loaded or stored value is a first class array or struct, don't try
- // to transform them. We need to be able to bitcast to integer.
- if (LoadTy->isStructTy() || LoadTy->isArrayTy())
+ // If the loaded/stored value is a first class array/struct, or scalable type,
+ // don't try to transform them. We need to be able to bitcast to integer.
+ if (isFirstClassAggregateOrScalableType(LoadTy))
return -1;
int64_t StoreOffset = 0, LoadOffset = 0;
@@ -184,7 +184,7 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
// If the load and store don't overlap at all, the store doesn't provide
// anything to the load. In this case, they really don't alias at all, AA
// must have gotten confused.
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy);
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize();
if ((WriteSizeInBits & 7) | (LoadSize & 7))
return -1;
@@ -218,10 +218,9 @@ static int analyzeLoadFromClobberingWrite(Type *LoadTy, Value *LoadPtr,
int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
StoreInst *DepSI, const DataLayout &DL) {
auto *StoredVal = DepSI->getValueOperand();
-
- // Cannot handle reading from store of first-class aggregate yet.
- if (StoredVal->getType()->isStructTy() ||
- StoredVal->getType()->isArrayTy())
+
+ // Cannot handle reading from store of first-class aggregate or scalable type.
+ if (isFirstClassAggregateOrScalableType(StoredVal->getType()))
return -1;
// Don't coerce non-integral pointers to integers or vice versa.
@@ -235,11 +234,96 @@ int analyzeLoadFromClobberingStore(Type *LoadTy, Value *LoadPtr,
Value *StorePtr = DepSI->getPointerOperand();
uint64_t StoreSize =
- DL.getTypeSizeInBits(DepSI->getValueOperand()->getType());
+ DL.getTypeSizeInBits(DepSI->getValueOperand()->getType()).getFixedSize();
return analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, StorePtr, StoreSize,
DL);
}
+/// Looks at a memory location for a load (specified by MemLocBase, Offs, and
+/// Size) and compares it against a load.
+///
+/// If the specified load could be safely widened to a larger integer load
+/// that is 1) still efficient, 2) safe for the target, and 3) would provide
+/// the specified memory location value, then this function returns the size
+/// in bytes of the load width to use. If not, this returns zero.
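+///
+/// Worked example (hypothetical, assuming i32 is a legal integer and no
+/// sanitizers are enabled): for a 4-byte-aligned `load i8, i8* %p` and a
+/// queried location covering the two bytes at %p+2, the load can be widened
+/// to cover %p..%p+3, so this returns 4.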
+static unsigned getLoadLoadClobberFullWidthSize(const Value *MemLocBase,
+ int64_t MemLocOffs,
+ unsigned MemLocSize,
+ const LoadInst *LI) {
+ // We can only extend simple integer loads.
+ if (!isa<IntegerType>(LI->getType()) || !LI->isSimple())
+ return 0;
+
+ // Load widening is hostile to ThreadSanitizer: it may cause false positives
+ // or make the reports more cryptic (access sizes are wrong).
+ if (LI->getParent()->getParent()->hasFnAttribute(Attribute::SanitizeThread))
+ return 0;
+
+ const DataLayout &DL = LI->getModule()->getDataLayout();
+
+ // Get the base of this load.
+ int64_t LIOffs = 0;
+ const Value *LIBase =
+ GetPointerBaseWithConstantOffset(LI->getPointerOperand(), LIOffs, DL);
+
+ // If the two pointers are not based on the same pointer, we can't tell that
+ // they are related.
+ if (LIBase != MemLocBase)
+ return 0;
+
+ // Okay, the two values are based on the same pointer, but returned as
+ // no-alias. This happens when we have things like two byte loads at "P+1"
+ // and "P+3". Check to see if increasing the size of the "LI" load up to its
+ // alignment (or the largest native integer type) will allow us to load all
+ // the bits required by MemLoc.
+
+ // If MemLoc is before LI, then no widening of LI will help us out.
+ if (MemLocOffs < LIOffs)
+ return 0;
+
+ // Get the alignment of the load in bytes. We assume that it is safe to load
+ // any legal integer up to this size without a problem. For example, if we're
+ // looking at an i8 load on x86-32 that is known 1024 byte aligned, we can
+ // widen it up to an i32 load. If it is known 2-byte aligned, we can widen it
+ // to i16.
+ unsigned LoadAlign = LI->getAlignment();
+
+ int64_t MemLocEnd = MemLocOffs + MemLocSize;
+
+ // If no amount of rounding up will let MemLoc fit into LI, then bail out.
+ if (LIOffs + LoadAlign < MemLocEnd)
+ return 0;
+
+ // This is the size of the load to try. Start with the next larger power of
+ // two.
+ unsigned NewLoadByteSize = LI->getType()->getPrimitiveSizeInBits() / 8U;
+ NewLoadByteSize = NextPowerOf2(NewLoadByteSize);
+
+ while (true) {
+ // If this load size is bigger than our known alignment or would not fit
+ // into a native integer register, then we fail.
+ if (NewLoadByteSize > LoadAlign ||
+ !DL.fitsInLegalInteger(NewLoadByteSize * 8))
+ return 0;
+
+ if (LIOffs + NewLoadByteSize > MemLocEnd &&
+ (LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeAddress) ||
+ LI->getParent()->getParent()->hasFnAttribute(
+ Attribute::SanitizeHWAddress)))
+ // We will be reading past the location accessed by the original program.
+ // While this is safe in a regular build, Address Safety analysis tools
+ // may start reporting false warnings. So, don't do widening.
+ return 0;
+
+ // If a load of this width would include all of MemLoc, then we succeed.
+ if (LIOffs + NewLoadByteSize >= MemLocEnd)
+ return NewLoadByteSize;
+
+ NewLoadByteSize <<= 1;
+ }
+}
+
/// This function is called when we have a
/// memdep query of a load that ends up being clobbered by another load. See if
/// the other load can feed into the second load.
@@ -255,7 +339,7 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
return -1;
Value *DepPtr = DepLI->getPointerOperand();
- uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType());
+ uint64_t DepSize = DL.getTypeSizeInBits(DepLI->getType()).getFixedSize();
int R = analyzeLoadFromClobberingWrite(LoadTy, LoadPtr, DepPtr, DepSize, DL);
if (R != -1)
return R;
@@ -265,10 +349,10 @@ int analyzeLoadFromClobberingLoad(Type *LoadTy, Value *LoadPtr, LoadInst *DepLI,
int64_t LoadOffs = 0;
const Value *LoadBase =
GetPointerBaseWithConstantOffset(LoadPtr, LoadOffs, DL);
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
- unsigned Size = MemoryDependenceResults::getLoadLoadClobberFullWidthSize(
- LoadBase, LoadOffs, LoadSize, DepLI);
+ unsigned Size =
+ getLoadLoadClobberFullWidthSize(LoadBase, LoadOffs, LoadSize, DepLI);
if (Size == 0)
return -1;
@@ -319,21 +403,17 @@ int analyzeLoadFromClobberingMemInst(Type *LoadTy, Value *LoadPtr,
if (Offset == -1)
return Offset;
- // Don't coerce non-integral pointers to integers or vice versa, and the
- // memtransfer is implicitly a raw byte code
- if (DL.isNonIntegralPointerType(LoadTy->getScalarType()))
- // TODO: Can allow nullptrs from constant zeros
- return -1;
-
unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
- Src =
- ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
+ if (Offset) {
+ Src = ConstantExpr::getBitCast(Src,
+ Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
+ Src, OffsetCst);
+ }
Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
if (ConstantFoldLoadFromConstPtr(Src, LoadTy, DL))
return Offset;
@@ -355,8 +435,9 @@ static T *getStoreValueForLoadHelper(T *SrcVal, unsigned Offset, Type *LoadTy,
return SrcVal;
}
- uint64_t StoreSize = (DL.getTypeSizeInBits(SrcVal->getType()) + 7) / 8;
- uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy) + 7) / 8;
+ uint64_t StoreSize =
+ (DL.getTypeSizeInBits(SrcVal->getType()).getFixedSize() + 7) / 8;
+ uint64_t LoadSize = (DL.getTypeSizeInBits(LoadTy).getFixedSize() + 7) / 8;
// Compute which bits of the stored value are being used by the load. Convert
// to an integer type to start with.
if (SrcVal->getType()->isPtrOrPtrVectorTy())
@@ -408,8 +489,9 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
Instruction *InsertPt, const DataLayout &DL) {
// If Offset+LoadTy exceeds the size of SrcVal, then we must be wanting to
// widen SrcVal out to a larger load.
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
if (Offset + LoadSize > SrcValStoreSize) {
assert(SrcVal->isSimple() && "Cannot widen volatile/atomic load!");
assert(SrcVal->getType()->isIntegerTy() && "Can't widen non-integer load");
@@ -431,7 +513,7 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
PtrVal = Builder.CreateBitCast(PtrVal, DestPTy);
LoadInst *NewLoad = Builder.CreateLoad(DestTy, PtrVal);
NewLoad->takeName(SrcVal);
- NewLoad->setAlignment(MaybeAlign(SrcVal->getAlignment()));
+ NewLoad->setAlignment(SrcVal->getAlign());
LLVM_DEBUG(dbgs() << "GVN WIDENED LOAD: " << *SrcVal << "\n");
LLVM_DEBUG(dbgs() << "TO: " << *NewLoad << "\n");
@@ -452,8 +534,9 @@ Value *getLoadValueForLoad(LoadInst *SrcVal, unsigned Offset, Type *LoadTy,
Constant *getConstantLoadValueForLoad(Constant *SrcVal, unsigned Offset,
Type *LoadTy, const DataLayout &DL) {
- unsigned SrcValStoreSize = DL.getTypeStoreSize(SrcVal->getType());
- unsigned LoadSize = DL.getTypeStoreSize(LoadTy);
+ unsigned SrcValStoreSize =
+ DL.getTypeStoreSize(SrcVal->getType()).getFixedSize();
+ unsigned LoadSize = DL.getTypeStoreSize(LoadTy).getFixedSize();
if (Offset + LoadSize > SrcValStoreSize)
return nullptr;
return getConstantStoreValueForLoad(SrcVal, Offset, LoadTy, DL);
@@ -464,7 +547,7 @@ T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
Type *LoadTy, HelperClass &Helper,
const DataLayout &DL) {
LLVMContext &Ctx = LoadTy->getContext();
- uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy) / 8;
+ uint64_t LoadSize = DL.getTypeSizeInBits(LoadTy).getFixedSize() / 8;
// We know that this method is only called when the mem transfer fully
// provides the bits for the load.
@@ -500,16 +583,18 @@ T *getMemInstValueForLoadHelper(MemIntrinsic *SrcInst, unsigned Offset,
// Otherwise, this is a memcpy/memmove from a constant global.
MemTransferInst *MTI = cast<MemTransferInst>(SrcInst);
Constant *Src = cast<Constant>(MTI->getSource());
- unsigned AS = Src->getType()->getPointerAddressSpace();
+ unsigned AS = Src->getType()->getPointerAddressSpace();
// Otherwise, see if we can constant fold a load from the constant with the
// offset applied as appropriate.
- Src =
- ConstantExpr::getBitCast(Src, Type::getInt8PtrTy(Src->getContext(), AS));
- Constant *OffsetCst =
- ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
- Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()), Src,
- OffsetCst);
+ if (Offset) {
+ Src = ConstantExpr::getBitCast(Src,
+ Type::getInt8PtrTy(Src->getContext(), AS));
+ Constant *OffsetCst =
+ ConstantInt::get(Type::getInt64Ty(Src->getContext()), (unsigned)Offset);
+ Src = ConstantExpr::getGetElementPtr(Type::getInt8Ty(Src->getContext()),
+ Src, OffsetCst);
+ }
Src = ConstantExpr::getBitCast(Src, PointerType::get(LoadTy, AS));
return ConstantFoldLoadFromConstPtr(Src, LoadTy, DL);
}
diff --git a/llvm/lib/Transforms/Utils/ValueMapper.cpp b/llvm/lib/Transforms/Utils/ValueMapper.cpp
index da68d3713b40..f1b3fe8e2fa9 100644
--- a/llvm/lib/Transforms/Utils/ValueMapper.cpp
+++ b/llvm/lib/Transforms/Utils/ValueMapper.cpp
@@ -21,7 +21,6 @@
#include "llvm/ADT/SmallVector.h"
#include "llvm/IR/Argument.h"
#include "llvm/IR/BasicBlock.h"
-#include "llvm/IR/CallSite.h"
#include "llvm/IR/Constant.h"
#include "llvm/IR/Constants.h"
#include "llvm/IR/DebugInfoMetadata.h"
@@ -369,7 +368,8 @@ Value *Mapper::mapValue(const Value *V) {
if (NewTy != IA->getFunctionType())
V = InlineAsm::get(NewTy, IA->getAsmString(), IA->getConstraintString(),
- IA->hasSideEffects(), IA->isAlignStack());
+ IA->hasSideEffects(), IA->isAlignStack(),
+ IA->getDialect());
}
return getVM()[V] = const_cast<Value *>(V);
@@ -888,17 +888,17 @@ void Mapper::remapInstruction(Instruction *I) {
return;
// If the instruction's type is being remapped, do so now.
- if (auto CS = CallSite(I)) {
+ if (auto *CB = dyn_cast<CallBase>(I)) {
SmallVector<Type *, 3> Tys;
- FunctionType *FTy = CS.getFunctionType();
+ FunctionType *FTy = CB->getFunctionType();
Tys.reserve(FTy->getNumParams());
for (Type *Ty : FTy->params())
Tys.push_back(TypeMapper->remapType(Ty));
- CS.mutateFunctionType(FunctionType::get(
+ CB->mutateFunctionType(FunctionType::get(
TypeMapper->remapType(I->getType()), Tys, FTy->isVarArg()));
- LLVMContext &C = CS->getContext();
- AttributeList Attrs = CS.getAttributes();
+ LLVMContext &C = CB->getContext();
+ AttributeList Attrs = CB->getAttributes();
for (unsigned i = 0; i < Attrs.getNumAttrSets(); ++i) {
if (Attrs.hasAttribute(i, Attribute::ByVal)) {
Type *Ty = Attrs.getAttribute(i, Attribute::ByVal).getValueAsType();
@@ -910,7 +910,7 @@ void Mapper::remapInstruction(Instruction *I) {
C, i, Attribute::getWithByValType(C, TypeMapper->remapType(Ty)));
}
}
- CS.setAttributes(Attrs);
+ CB->setAttributes(Attrs);
return;
}
if (auto *AI = dyn_cast<AllocaInst>(I))
diff --git a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
index 7478daa2a0a5..9b81afbb4b6c 100644
--- a/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoadStoreVectorizer.cpp
@@ -50,7 +50,6 @@
#include "llvm/ADT/iterator_range.h"
#include "llvm/Analysis/AliasAnalysis.h"
#include "llvm/Analysis/MemoryLocation.h"
-#include "llvm/Analysis/OrderedBasicBlock.h"
#include "llvm/Analysis/ScalarEvolution.h"
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
@@ -129,22 +128,6 @@ public:
private:
unsigned getPointerAddressSpace(Value *I);
- unsigned getAlignment(LoadInst *LI) const {
- unsigned Align = LI->getAlignment();
- if (Align != 0)
- return Align;
-
- return DL.getABITypeAlignment(LI->getType());
- }
-
- unsigned getAlignment(StoreInst *SI) const {
- unsigned Align = SI->getAlignment();
- if (Align != 0)
- return Align;
-
- return DL.getABITypeAlignment(SI->getValueOperand()->getType());
- }
-
static const unsigned MaxDepth = 3;
bool isConsecutiveAccess(Value *A, Value *B);
@@ -447,20 +430,78 @@ bool Vectorizer::lookThroughComplexAddresses(Value *PtrA, Value *PtrB,
// Now we need to prove that adding IdxDiff to ValA won't overflow.
bool Safe = false;
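+ // Returns true when the binary operator I carries the no-wrap flag matching
+ // the requested signedness (nsw for signed, nuw for unsigned).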
+ auto CheckFlags = [](Instruction *I, bool Signed) {
+ BinaryOperator *BinOpI = cast<BinaryOperator>(I);
+ return (Signed && BinOpI->hasNoSignedWrap()) ||
+ (!Signed && BinOpI->hasNoUnsignedWrap());
+ };
+
// First attempt: if OpB is an add with NSW/NUW, and OpB is IdxDiff added to
// ValA, we're okay.
if (OpB->getOpcode() == Instruction::Add &&
isa<ConstantInt>(OpB->getOperand(1)) &&
- IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue())) {
- if (Signed)
- Safe = cast<BinaryOperator>(OpB)->hasNoSignedWrap();
- else
- Safe = cast<BinaryOperator>(OpB)->hasNoUnsignedWrap();
+ IdxDiff.sle(cast<ConstantInt>(OpB->getOperand(1))->getSExtValue()) &&
+ CheckFlags(OpB, Signed))
+ Safe = true;
+
+ // Second attempt: If both OpA and OpB are adds with NSW/NUW and share the
+ // same LHS operand, we can guarantee that the transformation is safe if we
+ // can prove that OpA won't overflow when IdxDiff is added to the RHS of
+ // OpA.
+ // For example:
+ // %tmp7 = add nsw i32 %tmp2, %v0
+ // %tmp8 = sext i32 %tmp7 to i64
+ // ...
+ // %tmp11 = add nsw i32 %v0, 1
+ // %tmp12 = add nsw i32 %tmp2, %tmp11
+ // %tmp13 = sext i32 %tmp12 to i64
+ //
+ // Both %tmp7 and %tmp12 have the nsw flag and share the first operand
+ // %tmp2. It's guaranteed that adding 1 to %tmp7 won't overflow
+ // because %tmp11 adds 1 to %v0 and both %tmp11 and %tmp12 have the
+ // nsw flag.
+ OpA = dyn_cast<Instruction>(ValA);
+ if (!Safe && OpA && OpA->getOpcode() == Instruction::Add &&
+ OpB->getOpcode() == Instruction::Add &&
+ OpA->getOperand(0) == OpB->getOperand(0) && CheckFlags(OpA, Signed) &&
+ CheckFlags(OpB, Signed)) {
+ Value *RHSA = OpA->getOperand(1);
+ Value *RHSB = OpB->getOperand(1);
+ Instruction *OpRHSA = dyn_cast<Instruction>(RHSA);
+ Instruction *OpRHSB = dyn_cast<Instruction>(RHSB);
+ // Match `x +nsw/nuw y` and `x +nsw/nuw (y +nsw/nuw IdxDiff)`.
+ if (OpRHSB && OpRHSB->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSB->getOperand(0) == RHSA && IdxDiff.getSExtValue() == CstVal)
+ Safe = true;
+ }
+ // Match `x +nsw/nuw (y +nsw/nuw -IdxDiff)` and `x +nsw/nuw y`.
+ if (OpRHSA && OpRHSA->getOpcode() == Instruction::Add &&
+ CheckFlags(OpRHSA, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1))) {
+ int64_t CstVal = cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == RHSB && IdxDiff.getSExtValue() == -CstVal)
+ Safe = true;
+ }
+ // Match `x +nsw/nuw (y +nsw/nuw c)` and
+ // `x +nsw/nuw (y +nsw/nuw (c + IdxDiff))`.
+ if (OpRHSA && OpRHSB && OpRHSA->getOpcode() == Instruction::Add &&
+ OpRHSB->getOpcode() == Instruction::Add && CheckFlags(OpRHSA, Signed) &&
+ CheckFlags(OpRHSB, Signed) && isa<ConstantInt>(OpRHSA->getOperand(1)) &&
+ isa<ConstantInt>(OpRHSB->getOperand(1))) {
+ int64_t CstValA =
+ cast<ConstantInt>(OpRHSA->getOperand(1))->getSExtValue();
+ int64_t CstValB =
+ cast<ConstantInt>(OpRHSB->getOperand(1))->getSExtValue();
+ if (OpRHSA->getOperand(0) == OpRHSB->getOperand(0) &&
+ IdxDiff.getSExtValue() == (CstValB - CstValA))
+ Safe = true;
+ }
}
unsigned BitWidth = ValA->getType()->getScalarSizeInBits();
- // Second attempt:
+ // Third attempt:
// If all set bits of IdxDiff or any higher order bit other than the sign bit
// are known to be zero in ValA, we can add Diff to it while guaranteeing no
// overflow of any sort.
@@ -503,7 +544,6 @@ bool Vectorizer::lookThroughSelects(Value *PtrA, Value *PtrB,
}
void Vectorizer::reorder(Instruction *I) {
- OrderedBasicBlock OBB(I->getParent());
SmallPtrSet<Instruction *, 16> InstructionsToMove;
SmallVector<Instruction *, 16> Worklist;
@@ -521,7 +561,7 @@ void Vectorizer::reorder(Instruction *I) {
if (IM->getParent() != I->getParent())
continue;
- if (!OBB.dominates(IM, I)) {
+ if (!IM->comesBefore(I)) {
InstructionsToMove.insert(IM);
Worklist.push_back(IM);
}
@@ -637,8 +677,6 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
}
}
- OrderedBasicBlock OBB(Chain[0]->getParent());
-
// Loop until we find an instruction in ChainInstrs that we can't vectorize.
unsigned ChainInstrIdx = 0;
Instruction *BarrierMemoryInstr = nullptr;
@@ -648,14 +686,14 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// If a barrier memory instruction was found, chain instructions that follow
// will not be added to the valid prefix.
- if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, ChainInstr))
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(ChainInstr))
break;
// Check (in BB order) if any instruction prevents ChainInstr from being
// vectorized. Find and store the first such "conflicting" instruction.
for (Instruction *MemInstr : MemoryInstrs) {
// If a barrier memory instruction was found, do not check past it.
- if (BarrierMemoryInstr && OBB.dominates(BarrierMemoryInstr, MemInstr))
+ if (BarrierMemoryInstr && BarrierMemoryInstr->comesBefore(MemInstr))
break;
auto *MemLoad = dyn_cast<LoadInst>(MemInstr);
@@ -674,12 +712,12 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// vectorize it (the vectorized load is inserted at the location of the
// first load in the chain).
if (isa<StoreInst>(MemInstr) && ChainLoad &&
- (IsInvariantLoad(ChainLoad) || OBB.dominates(ChainLoad, MemInstr)))
+ (IsInvariantLoad(ChainLoad) || ChainLoad->comesBefore(MemInstr)))
continue;
// Same case, but in reverse.
if (MemLoad && isa<StoreInst>(ChainInstr) &&
- (IsInvariantLoad(MemLoad) || OBB.dominates(MemLoad, ChainInstr)))
+ (IsInvariantLoad(MemLoad) || MemLoad->comesBefore(ChainInstr)))
continue;
if (!AA.isNoAlias(MemoryLocation::get(MemInstr),
@@ -705,7 +743,7 @@ Vectorizer::getVectorizablePrefix(ArrayRef<Instruction *> Chain) {
// the basic block.
if (IsLoadChain && BarrierMemoryInstr) {
// The BarrierMemoryInstr is a store that precedes ChainInstr.
- assert(OBB.dominates(BarrierMemoryInstr, ChainInstr));
+ assert(BarrierMemoryInstr->comesBefore(ChainInstr));
break;
}
}
@@ -961,7 +999,7 @@ bool Vectorizer::vectorizeStoreChain(
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
- unsigned Alignment = getAlignment(S0);
+ Align Alignment = S0->getAlign();
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
@@ -992,10 +1030,10 @@ bool Vectorizer::vectorizeStoreChain(
VectorType *VecTy;
VectorType *VecStoreTy = dyn_cast<VectorType>(StoreTy);
if (VecStoreTy)
- VecTy = VectorType::get(StoreTy->getScalarType(),
- Chain.size() * VecStoreTy->getNumElements());
+ VecTy = FixedVectorType::get(StoreTy->getScalarType(),
+ Chain.size() * VecStoreTy->getNumElements());
else
- VecTy = VectorType::get(StoreTy, Chain.size());
+ VecTy = FixedVectorType::get(StoreTy, Chain.size());
// If it's more than the max vector size or the target has a better
// vector factor, break it into two pieces.
@@ -1019,18 +1057,20 @@ bool Vectorizer::vectorizeStoreChain(
InstructionsProcessed->insert(Chain.begin(), Chain.end());
// If the store is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
if (S0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeStoreChain(Chains.first, InstructionsProcessed) |
vectorizeStoreChain(Chains.second, InstructionsProcessed);
}
- unsigned NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
- StackAdjustedAlignment,
- DL, S0, nullptr, &DT);
- if (NewAlign != 0)
+ Align NewAlign = getOrEnforceKnownAlignment(S0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, S0, nullptr, &DT);
+ if (NewAlign >= Alignment)
Alignment = NewAlign;
+ else
+ return false;
}
if (!TTI.isLegalToVectorizeStoreChain(SzInBytes, Alignment, AS)) {
@@ -1112,7 +1152,7 @@ bool Vectorizer::vectorizeLoadChain(
unsigned VecRegSize = TTI.getLoadStoreVecRegBitWidth(AS);
unsigned VF = VecRegSize / Sz;
unsigned ChainSize = Chain.size();
- unsigned Alignment = getAlignment(L0);
+ Align Alignment = L0->getAlign();
if (!isPowerOf2_32(Sz) || VF < 2 || ChainSize < 2) {
InstructionsProcessed->insert(Chain.begin(), Chain.end());
@@ -1142,10 +1182,10 @@ bool Vectorizer::vectorizeLoadChain(
VectorType *VecTy;
VectorType *VecLoadTy = dyn_cast<VectorType>(LoadTy);
if (VecLoadTy)
- VecTy = VectorType::get(LoadTy->getScalarType(),
- Chain.size() * VecLoadTy->getNumElements());
+ VecTy = FixedVectorType::get(LoadTy->getScalarType(),
+ Chain.size() * VecLoadTy->getNumElements());
else
- VecTy = VectorType::get(LoadTy, Chain.size());
+ VecTy = FixedVectorType::get(LoadTy, Chain.size());
// If it's more than the max vector size or the target has a better
// vector factor, break it into two pieces.
@@ -1162,15 +1202,20 @@ bool Vectorizer::vectorizeLoadChain(
InstructionsProcessed->insert(Chain.begin(), Chain.end());
// If the load is going to be misaligned, don't vectorize it.
- if (accessIsMisaligned(SzInBytes, AS, Alignment)) {
+ if (accessIsMisaligned(SzInBytes, AS, Alignment.value())) {
if (L0->getPointerAddressSpace() != DL.getAllocaAddrSpace()) {
auto Chains = splitOddVectorElts(Chain, Sz);
return vectorizeLoadChain(Chains.first, InstructionsProcessed) |
vectorizeLoadChain(Chains.second, InstructionsProcessed);
}
- Alignment = getOrEnforceKnownAlignment(
- L0->getPointerOperand(), StackAdjustedAlignment, DL, L0, nullptr, &DT);
+ Align NewAlign = getOrEnforceKnownAlignment(L0->getPointerOperand(),
+ Align(StackAdjustedAlignment),
+ DL, L0, nullptr, &DT);
+ if (NewAlign >= Alignment)
+ Alignment = NewAlign;
+ else
+ return false;
}
if (!TTI.isLegalToVectorizeLoadChain(SzInBytes, Alignment, AS)) {
@@ -1194,7 +1239,8 @@ bool Vectorizer::vectorizeLoadChain(
Value *Bitcast =
Builder.CreateBitCast(L0->getPointerOperand(), VecTy->getPointerTo(AS));
- LoadInst *LI = Builder.CreateAlignedLoad(VecTy, Bitcast, Alignment);
+ LoadInst *LI =
+ Builder.CreateAlignedLoad(VecTy, Bitcast, MaybeAlign(Alignment));
propagateMetadata(LI, Chain);
if (VecLoadTy) {
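Editorial aside on the LoadStoreVectorizer hunks above: they switch the chain alignment from a raw unsigned to the llvm::Align type and only accept the result of getOrEnforceKnownAlignment when it does not lower the alignment that is already known. A minimal standalone sketch of that pattern (not part of the patch; tryRaiseAlignment is a made-up helper name, and only llvm/Support/Alignment.h is assumed):

#include "llvm/Support/Alignment.h"

// Keep the stricter of the two alignments; refuse to "raise" to a weaker one,
// mirroring the NewAlign >= Alignment check in vectorizeStoreChain and
// vectorizeLoadChain above.
static bool tryRaiseAlignment(llvm::Align &Known, llvm::Align Enforced) {
  if (Enforced >= Known) {
    Known = Enforced;
    return true;
  }
  return false; // caller bails out, as the vectorizer now does
}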
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
index 3f943f4c0688..23613775d896 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationLegality.cpp
@@ -13,14 +13,17 @@
// pass. It should be easy to create an analysis pass around it if there
// is a need (but D45420 needs to happen first).
//
-#include "llvm/Transforms/Vectorize/LoopVectorize.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include "llvm/Analysis/Loads.h"
+#include "llvm/Analysis/LoopInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/Transforms/Vectorize/LoopVectorize.h"
using namespace llvm;
+using namespace PatternMatch;
#define LV_NAME "loop-vectorize"
#define DEBUG_TYPE LV_NAME
@@ -566,6 +569,28 @@ bool LoopVectorizationLegality::setupOuterLoopInductions() {
return false;
}
+/// Checks if a function is scalarizable according to the TLI, in
+/// the sense that it should be vectorized and then expanded in
+/// multiple scalar calls. This is represented in the
+/// TLI via mappings that do not specify a vector name, as in the
+/// following example:
+///
+/// const VecDesc VecIntrinsics[] = {
+/// {"llvm.phx.abs.i32", "", 4}
+/// };
+static bool isTLIScalarize(const TargetLibraryInfo &TLI, const CallInst &CI) {
+ const StringRef ScalarName = CI.getCalledFunction()->getName();
+ bool Scalarize = TLI.isFunctionVectorizable(ScalarName);
+ // Check that all known VFs are not associated with a vector
+ // function, i.e. the vector name is empty.
+ if (Scalarize)
+ for (unsigned VF = 2, WidestVF = TLI.getWidestVF(ScalarName);
+ VF <= WidestVF; VF *= 2) {
+ Scalarize &= !TLI.isFunctionVectorizable(ScalarName, VF);
+ }
+ return Scalarize;
+}
+
bool LoopVectorizationLegality::canVectorizeInstrs() {
BasicBlock *Header = TheLoop->getHeader();
@@ -644,6 +669,7 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (RecurrenceDescriptor::isFirstOrderRecurrence(Phi, TheLoop,
SinkAfter, DT)) {
+ AllowedExit.insert(Phi);
FirstOrderRecurrences.insert(Phi);
continue;
}
@@ -667,10 +693,12 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// * Have a mapping to an IR intrinsic.
// * Have a vector version available.
auto *CI = dyn_cast<CallInst>(&I);
+
if (CI && !getVectorIntrinsicIDForCall(CI, TLI) &&
!isa<DbgInfoIntrinsic>(CI) &&
!(CI->getCalledFunction() && TLI &&
- TLI->isFunctionVectorizable(CI->getCalledFunction()->getName()))) {
+ (!VFDatabase::getMappings(*CI).empty() ||
+ isTLIScalarize(*TLI, *CI)))) {
// If the call is a recognized math library call, it is likely that
// we can vectorize it given loosened floating-point constraints.
LibFunc Func;
@@ -685,7 +713,8 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// but it's hard to provide meaningful yet generic advice.
// Also, should this be guarded by allowExtraAnalysis() and/or be part
// of the returned info from isFunctionVectorizable()?
- reportVectorizationFailure("Found a non-intrinsic callsite",
+ reportVectorizationFailure(
+ "Found a non-intrinsic callsite",
"library call cannot be vectorized. "
"Try compiling with -fno-math-errno, -ffast-math, "
"or similar flags",
@@ -739,11 +768,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
// supported on the target.
if (ST->getMetadata(LLVMContext::MD_nontemporal)) {
// Arbitrarily try a vector of 2 elements.
- Type *VecTy = VectorType::get(T, /*NumElements=*/2);
+ auto *VecTy = FixedVectorType::get(T, /*NumElements=*/2);
assert(VecTy && "did not find vectorized version of stored type");
- const MaybeAlign Alignment = getLoadStoreAlignment(ST);
- assert(Alignment && "Alignment should be set");
- if (!TTI->isLegalNTStore(VecTy, *Alignment)) {
+ if (!TTI->isLegalNTStore(VecTy, ST->getAlign())) {
reportVectorizationFailure(
"nontemporal store instruction cannot be vectorized",
"nontemporal store instruction cannot be vectorized",
@@ -756,11 +783,9 @@ bool LoopVectorizationLegality::canVectorizeInstrs() {
if (LD->getMetadata(LLVMContext::MD_nontemporal)) {
// For nontemporal loads, check that a nontemporal vector version is
// supported on the target (arbitrarily try a vector of 2 elements).
- Type *VecTy = VectorType::get(I.getType(), /*NumElements=*/2);
+ auto *VecTy = FixedVectorType::get(I.getType(), /*NumElements=*/2);
assert(VecTy && "did not find vectorized version of load type");
- const MaybeAlign Alignment = getLoadStoreAlignment(LD);
- assert(Alignment && "Alignment should be set");
- if (!TTI->isLegalNTLoad(VecTy, *Alignment)) {
+ if (!TTI->isLegalNTLoad(VecTy, LD->getAlign())) {
reportVectorizationFailure(
"nontemporal load instruction cannot be vectorized",
"nontemporal load instruction cannot be vectorized",
@@ -897,6 +922,14 @@ bool LoopVectorizationLegality::blockCanBePredicated(
if (C->canTrap())
return false;
}
+
+ // We can predicate blocks with calls to assume, as long as we drop them in
+ // case we flatten the CFG via predication.
+ if (match(&I, m_Intrinsic<Intrinsic::assume>())) {
+ ConditionalAssumes.insert(&I);
+ continue;
+ }
+
// We might be able to hoist the load.
if (I.mayReadFromMemory()) {
auto *LI = dyn_cast<LoadInst>(&I);
@@ -947,14 +980,14 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// the memory pointed to can be dereferenced (with the access size implied by
// the value's type) unconditionally within the loop header without
// introducing a new fault.
- SmallPtrSet<Value *, 8> SafePointes;
+ SmallPtrSet<Value *, 8> SafePointers;
// Collect safe addresses.
for (BasicBlock *BB : TheLoop->blocks()) {
if (!blockNeedsPredication(BB)) {
for (Instruction &I : *BB)
if (auto *Ptr = getLoadStorePointerOperand(&I))
- SafePointes.insert(Ptr);
+ SafePointers.insert(Ptr);
continue;
}
@@ -968,7 +1001,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
LoadInst *LI = dyn_cast<LoadInst>(&I);
if (LI && !mustSuppressSpeculation(*LI) &&
isDereferenceableAndAlignedInLoop(LI, TheLoop, SE, *DT))
- SafePointes.insert(LI->getPointerOperand());
+ SafePointers.insert(LI->getPointerOperand());
}
}
@@ -986,7 +1019,7 @@ bool LoopVectorizationLegality::canVectorizeWithIfConvert() {
// We must be able to predicate all blocks that need to be predicated.
if (blockNeedsPredication(BB)) {
- if (!blockCanBePredicated(BB, SafePointes)) {
+ if (!blockCanBePredicated(BB, SafePointers)) {
reportVectorizationFailure(
"Control flow cannot be substituted for a select",
"control flow cannot be substituted for a select",
@@ -1198,18 +1231,9 @@ bool LoopVectorizationLegality::prepareToFoldTailByMasking() {
LLVM_DEBUG(dbgs() << "LV: checking if tail can be folded by masking.\n");
- if (!PrimaryInduction) {
- reportVectorizationFailure(
- "No primary induction, cannot fold tail by masking",
- "Missing a primary induction variable in the loop, which is "
- "needed in order to fold tail by masking as required.",
- "NoPrimaryInduction", ORE, TheLoop);
- return false;
- }
-
SmallPtrSet<const Value *, 8> ReductionLiveOuts;
- for (auto &Reduction : *getReductionVars())
+ for (auto &Reduction : getReductionVars())
ReductionLiveOuts.insert(Reduction.second.getLoopExitInstr());
// TODO: handle non-reduction outside users when tail is folded by masking.
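As a worked example of the isTLIScalarize helper added above, using the VecDesc mapping from its own doc comment ({"llvm.phx.abs.i32", "", 4}): TLI.isFunctionVectorizable("llvm.phx.abs.i32") is true, getWidestVF reports 4, and the loop then probes VF = 2 and VF = 4; since neither VF is mapped to a named vector function, isFunctionVectorizable(ScalarName, VF) stays false and Scalarize remains true, so the call is widened and then expanded into scalar calls rather than mapped to a vector routine. This walk-through only restates the helper's logic on the comment's example mapping.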
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
index c3ca43fcd492..8dd06983cd84 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorizationPlanner.h
@@ -31,9 +31,12 @@
namespace llvm {
+class LoopVectorizationLegality;
+class LoopVectorizationCostModel;
+class PredicatedScalarEvolution;
+
/// VPlan-based builder utility analogous to IRBuilder.
class VPBuilder {
-private:
VPBasicBlock *BB = nullptr;
VPBasicBlock::iterator InsertPt = VPBasicBlock::iterator();
@@ -204,6 +207,8 @@ class LoopVectorizationPlanner {
/// The interleaved access analysis.
InterleavedAccessInfo &IAI;
+ PredicatedScalarEvolution &PSE;
+
SmallVector<VPlanPtr, 4> VPlans;
/// This class is used to enable the VPlan to invoke a method of ILV. This is
@@ -229,13 +234,14 @@ public:
const TargetTransformInfo *TTI,
LoopVectorizationLegality *Legal,
LoopVectorizationCostModel &CM,
- InterleavedAccessInfo &IAI)
- : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM),
- IAI(IAI) {}
+ InterleavedAccessInfo &IAI,
+ PredicatedScalarEvolution &PSE)
+ : OrigLoop(L), LI(LI), TLI(TLI), TTI(TTI), Legal(Legal), CM(CM), IAI(IAI),
+ PSE(PSE) {}
/// Plan how to best vectorize, return the best VF and its cost, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<VectorizationFactor> plan(unsigned UserVF);
+ Optional<VectorizationFactor> plan(unsigned UserVF, unsigned UserIC);
/// Use the VPlan-native path to plan how to best vectorize, return the best
/// VF and its cost.
@@ -279,9 +285,10 @@ private:
/// Build a VPlan using VPRecipes according to the information gather by
/// Legal. This method is only used for the legacy inner loop vectorizer.
- VPlanPtr
- buildVPlanWithVPRecipes(VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions);
+ VPlanPtr buildVPlanWithVPRecipes(
+ VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
+ SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ const DenseMap<Instruction *, Instruction *> &SinkAfter);
/// Build VPlans for power-of-2 VF's between \p MinVF and \p MaxVF inclusive,
/// according to the information gathered by Legal when it checked if it is
diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index 684a3098e564..35af8e425778 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -91,7 +91,6 @@
#include "llvm/Analysis/OptimizationRemarkEmitter.h"
#include "llvm/Analysis/ProfileSummaryInfo.h"
#include "llvm/Analysis/ScalarEvolution.h"
-#include "llvm/Analysis/ScalarEvolutionExpander.h"
#include "llvm/Analysis/ScalarEvolutionExpressions.h"
#include "llvm/Analysis/TargetLibraryInfo.h"
#include "llvm/Analysis/TargetTransformInfo.h"
@@ -134,9 +133,11 @@
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
#include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopSimplify.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Utils/LoopVersioning.h"
+#include "llvm/Transforms/Utils/ScalarEvolutionExpander.h"
#include "llvm/Transforms/Utils/SizeOpts.h"
#include "llvm/Transforms/Vectorize/LoopVectorizationLegality.h"
#include <algorithm>
@@ -294,15 +295,6 @@ cl::opt<bool> llvm::EnableLoopVectorization(
"vectorize-loops", cl::init(true), cl::Hidden,
cl::desc("Run the Loop vectorization passes"));
-/// A helper function for converting Scalar types to vector types.
-/// If the incoming type is void, we return void. If the VF is 1, we return
-/// the scalar type.
-static Type *ToVectorTy(Type *Scalar, unsigned VF) {
- if (Scalar->isVoidTy() || VF == 1)
- return Scalar;
- return VectorType::get(Scalar, VF);
-}
-
/// A helper function that returns the type of loaded or stored value.
static Type *getMemInstValueType(Value *I) {
assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
@@ -319,7 +311,7 @@ static bool hasIrregularType(Type *Ty, const DataLayout &DL, unsigned VF) {
// Determine if an array of VF elements of type Ty is "bitcast compatible"
// with a <VF x Ty> vector.
if (VF > 1) {
- auto *VectorTy = VectorType::get(Ty, VF);
+ auto *VectorTy = FixedVectorType::get(Ty, VF);
return VF * DL.getTypeAllocSize(Ty) != DL.getTypeStoreSize(VectorTy);
}
@@ -415,7 +407,16 @@ public:
BasicBlock *createVectorizedLoopSkeleton();
/// Widen a single instruction within the innermost loop.
- void widenInstruction(Instruction &I);
+ void widenInstruction(Instruction &I, VPUser &Operands,
+ VPTransformState &State);
+
+ /// Widen a single call instruction within the innermost loop.
+ void widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+ VPTransformState &State);
+
+ /// Widen a single select instruction within the innermost loop.
+ void widenSelectInstruction(SelectInst &I, VPUser &Operands,
+ bool InvariantCond, VPTransformState &State);
/// Fix the vectorized code, taking care of header phi's, live-outs, and more.
void fixVectorizedLoop();
@@ -430,8 +431,9 @@ public:
/// Vectorize a single GetElementPtrInst based on information gathered and
/// decisions taken during planning.
- void widenGEP(GetElementPtrInst *GEP, unsigned UF, unsigned VF,
- bool IsPtrLoopInvariant, SmallBitVector &IsIndexLoopInvariant);
+ void widenGEP(GetElementPtrInst *GEP, VPUser &Indices, unsigned UF,
+ unsigned VF, bool IsPtrLoopInvariant,
+ SmallBitVector &IsIndexLoopInvariant, VPTransformState &State);
/// Vectorize a single PHINode in a block. This method handles the induction
/// variable canonicalization. It supports both VF = 1 for unrolled loops and
@@ -441,9 +443,11 @@ public:
/// A helper function to scalarize a single Instruction in the innermost loop.
/// Generates a sequence of scalar instances for each lane between \p MinLane
/// and \p MaxLane, times each part between \p MinPart and \p MaxPart,
- /// inclusive..
- void scalarizeInstruction(Instruction *Instr, const VPIteration &Instance,
- bool IfPredicateInstr);
+ /// inclusive. Uses the VPValue operands from \p Operands instead of \p
+ /// Instr's operands.
+ void scalarizeInstruction(Instruction *Instr, VPUser &Operands,
+ const VPIteration &Instance, bool IfPredicateInstr,
+ VPTransformState &State);
/// Widen an integer or floating-point induction variable \p IV. If \p Trunc
/// is provided, the integer induction variable will first be truncated to
@@ -482,20 +486,21 @@ public:
/// Construct the vector value of a scalarized value \p V one lane at a time.
void packScalarIntoVectorValue(Value *V, const VPIteration &Instance);
- /// Try to vectorize the interleaved access group that \p Instr belongs to
- /// with the base address given in \p Addr, optionally masking the vector
- /// operations if \p BlockInMask is non-null. Use \p State to translate given
- /// VPValues to IR values in the vectorized loop.
- void vectorizeInterleaveGroup(Instruction *Instr, VPTransformState &State,
- VPValue *Addr, VPValue *BlockInMask = nullptr);
+ /// Try to vectorize interleaved access group \p Group with the base address
+ /// given in \p Addr, optionally masking the vector operations if \p
+ /// BlockInMask is non-null. Use \p State to translate given VPValues to IR
+ /// values in the vectorized loop.
+ void vectorizeInterleaveGroup(const InterleaveGroup<Instruction> *Group,
+ VPTransformState &State, VPValue *Addr,
+ VPValue *BlockInMask = nullptr);
/// Vectorize Load and Store instructions with the base address given in \p
/// Addr, optionally masking the vector operations if \p BlockInMask is
/// non-null. Use \p State to translate given VPValues to IR values in the
/// vectorized loop.
void vectorizeMemoryInstruction(Instruction *Instr, VPTransformState &State,
- VPValue *Addr,
- VPValue *BlockInMask = nullptr);
+ VPValue *Addr, VPValue *StoredValue,
+ VPValue *BlockInMask);
/// Set the debug location in the builder using the debug location in
/// the instruction.
@@ -682,7 +687,7 @@ protected:
DominatorTree *DT;
/// Alias Analysis.
- AliasAnalysis *AA;
+ AAResults *AA;
/// Target Library Info.
const TargetLibraryInfo *TLI;
@@ -974,7 +979,7 @@ public:
/// \return An upper bound for the vectorization factor, or None if
/// vectorization and interleaving should be avoided up front.
- Optional<unsigned> computeMaxVF();
+ Optional<unsigned> computeMaxVF(unsigned UserVF, unsigned UserIC);
/// \return True if runtime checks are required for vectorization, and false
/// otherwise.
@@ -1066,7 +1071,7 @@ public:
auto UniformsPerVF = Uniforms.find(VF);
assert(UniformsPerVF != Uniforms.end() &&
"VF not yet analyzed for uniformity");
- return UniformsPerVF->second.find(I) != UniformsPerVF->second.end();
+ return UniformsPerVF->second.count(I);
}
/// Returns true if \p I is known to be scalar after vectorization.
@@ -1082,7 +1087,7 @@ public:
auto ScalarsPerVF = Scalars.find(VF);
assert(ScalarsPerVF != Scalars.end() &&
"Scalar values are not calculated for VF");
- return ScalarsPerVF->second.find(I) != ScalarsPerVF->second.end();
+ return ScalarsPerVF->second.count(I);
}
/// \returns True if instruction \p I can be truncated to a smaller bitwidth
@@ -1200,27 +1205,27 @@ public:
/// Returns true if the target machine supports masked store operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedStore(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ bool isLegalMaskedStore(Type *DataType, Value *Ptr, Align Alignment) {
return Legal->isConsecutivePtr(Ptr) &&
TTI.isLegalMaskedStore(DataType, Alignment);
}
/// Returns true if the target machine supports masked load operation
/// for the given \p DataType and kind of access to \p Ptr.
- bool isLegalMaskedLoad(Type *DataType, Value *Ptr, MaybeAlign Alignment) {
+ bool isLegalMaskedLoad(Type *DataType, Value *Ptr, Align Alignment) {
return Legal->isConsecutivePtr(Ptr) &&
TTI.isLegalMaskedLoad(DataType, Alignment);
}
/// Returns true if the target machine supports masked scatter operation
/// for the given \p DataType.
- bool isLegalMaskedScatter(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedScatter(Type *DataType, Align Alignment) {
return TTI.isLegalMaskedScatter(DataType, Alignment);
}
/// Returns true if the target machine supports masked gather operation
/// for the given \p DataType.
- bool isLegalMaskedGather(Type *DataType, MaybeAlign Alignment) {
+ bool isLegalMaskedGather(Type *DataType, Align Alignment) {
return TTI.isLegalMaskedGather(DataType, Alignment);
}
@@ -1232,7 +1237,7 @@ public:
if (!LI && !SI)
return false;
auto *Ty = getMemInstValueType(V);
- MaybeAlign Align = getLoadStoreAlignment(V);
+ Align Align = getLoadStoreAlignment(V);
return (LI && isLegalMaskedGather(Ty, Align)) ||
(SI && isLegalMaskedScatter(Ty, Align));
}
@@ -1309,11 +1314,19 @@ public:
/// i.e. either vector version isn't available, or is too expensive.
unsigned getVectorCallCost(CallInst *CI, unsigned VF, bool &NeedToScalarize);
+ /// Invalidates decisions already taken by the cost model.
+ void invalidateCostModelingDecisions() {
+ WideningDecisions.clear();
+ Uniforms.clear();
+ Scalars.clear();
+ }
+
private:
unsigned NumPredStores = 0;
- /// \return An upper bound for the vectorization factor, larger than zero.
- /// One is returned if vectorization should best be avoided due to cost.
+ /// \return An upper bound for the vectorization factor, a power-of-2 larger
+ /// than zero. One is returned if vectorization should best be avoided due
+ /// to cost.
unsigned computeFeasibleMaxVF(unsigned ConstTripCount);
/// The vectorization cost is a combination of the cost itself and a boolean
@@ -1598,9 +1611,8 @@ struct LoopVectorize : public FunctionPass {
explicit LoopVectorize(bool InterleaveOnlyWhenForced = false,
bool VectorizeOnlyWhenForced = false)
- : FunctionPass(ID) {
- Impl.InterleaveOnlyWhenForced = InterleaveOnlyWhenForced;
- Impl.VectorizeOnlyWhenForced = VectorizeOnlyWhenForced;
+ : FunctionPass(ID),
+ Impl({InterleaveOnlyWhenForced, VectorizeOnlyWhenForced}) {
initializeLoopVectorizePass(*PassRegistry::getPassRegistry());
}
@@ -1626,7 +1638,7 @@ struct LoopVectorize : public FunctionPass {
[&](Loop &L) -> const LoopAccessInfo & { return LAA->getInfo(&L); };
return Impl.runImpl(F, *SE, *LI, *TTI, *DT, *BFI, TLI, *DB, *AA, *AC,
- GetLAA, *ORE, PSI);
+ GetLAA, *ORE, PSI).MadeAnyChange;
}
void getAnalysisUsage(AnalysisUsage &AU) const override {
@@ -1640,6 +1652,7 @@ struct LoopVectorize : public FunctionPass {
AU.addRequired<LoopAccessLegacyAnalysis>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
// We currently do not preserve loopinfo/dominator analyses with outer loop
// vectorization. Until this is addressed, mark these analyses as preserved
@@ -1724,9 +1737,10 @@ void InnerLoopVectorizer::createVectorIntOrFpInductionPHI(
// FIXME: If the step is non-constant, we create the vector splat with
// IRBuilder. IRBuilder can constant-fold the multiply, but it doesn't
// handle a constant vector splat.
- Value *SplatVF = isa<Constant>(Mul)
- ? ConstantVector::getSplat(VF, cast<Constant>(Mul))
- : Builder.CreateVectorSplat(VF, Mul);
+ Value *SplatVF =
+ isa<Constant>(Mul)
+ ? ConstantVector::getSplat({VF, false}, cast<Constant>(Mul))
+ : Builder.CreateVectorSplat(VF, Mul);
Builder.restoreIP(CurrIP);
// We may need to add the step a number of times, depending on the unroll
@@ -1806,57 +1820,37 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
assert((IV->getType()->isIntegerTy() || IV != OldInduction) &&
"Primary induction variable must have an integer type");
- auto II = Legal->getInductionVars()->find(IV);
- assert(II != Legal->getInductionVars()->end() && "IV is not an induction");
+ auto II = Legal->getInductionVars().find(IV);
+ assert(II != Legal->getInductionVars().end() && "IV is not an induction");
auto ID = II->second;
assert(IV->getType() == ID.getStartValue()->getType() && "Types must match");
- // The scalar value to broadcast. This will be derived from the canonical
- // induction variable.
- Value *ScalarIV = nullptr;
-
// The value from the original loop to which we are mapping the new induction
// variable.
Instruction *EntryVal = Trunc ? cast<Instruction>(Trunc) : IV;
- // True if we have vectorized the induction variable.
- auto VectorizedIV = false;
-
- // Determine if we want a scalar version of the induction variable. This is
- // true if the induction variable itself is not widened, or if it has at
- // least one user in the loop that is not widened.
- auto NeedsScalarIV = VF > 1 && needsScalarInduction(EntryVal);
+ auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
// Generate code for the induction step. Note that induction steps are
// required to be loop-invariant
- assert(PSE.getSE()->isLoopInvariant(ID.getStep(), OrigLoop) &&
- "Induction step should be loop invariant");
- auto &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
- Value *Step = nullptr;
- if (PSE.getSE()->isSCEVable(IV->getType())) {
- SCEVExpander Exp(*PSE.getSE(), DL, "induction");
- Step = Exp.expandCodeFor(ID.getStep(), ID.getStep()->getType(),
- LoopVectorPreHeader->getTerminator());
- } else {
- Step = cast<SCEVUnknown>(ID.getStep())->getValue();
- }
-
- // Try to create a new independent vector induction variable. If we can't
- // create the phi node, we will splat the scalar induction variable in each
- // loop iteration.
- if (VF > 1 && !shouldScalarizeInstruction(EntryVal)) {
- createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
- VectorizedIV = true;
- }
+ auto CreateStepValue = [&](const SCEV *Step) -> Value * {
+ assert(PSE.getSE()->isLoopInvariant(Step, OrigLoop) &&
+ "Induction step should be loop invariant");
+ if (PSE.getSE()->isSCEVable(IV->getType())) {
+ SCEVExpander Exp(*PSE.getSE(), DL, "induction");
+ return Exp.expandCodeFor(Step, Step->getType(),
+ LoopVectorPreHeader->getTerminator());
+ }
+ return cast<SCEVUnknown>(Step)->getValue();
+ };
- // If we haven't yet vectorized the induction variable, or if we will create
- // a scalar one, we need to define the scalar induction variable and step
- // values. If we were given a truncation type, truncate the canonical
+ // The scalar value to broadcast. This is derived from the canonical
+ // induction variable. If a truncation type is given, truncate the canonical
// induction variable and step. Otherwise, derive these values from the
// induction descriptor.
- if (!VectorizedIV || NeedsScalarIV) {
- ScalarIV = Induction;
+ auto CreateScalarIV = [&](Value *&Step) -> Value * {
+ Value *ScalarIV = Induction;
if (IV != OldInduction) {
ScalarIV = IV->getType()->isIntegerTy()
? Builder.CreateSExtOrTrunc(Induction, IV->getType())
@@ -1872,12 +1866,12 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
ScalarIV = Builder.CreateTrunc(ScalarIV, TruncType);
Step = Builder.CreateTrunc(Step, TruncType);
}
- }
+ return ScalarIV;
+ };
- // If we haven't yet vectorized the induction variable, splat the scalar
- // induction variable, and build the necessary step vectors.
- // TODO: Don't do it unless the vectorized IV is really required.
- if (!VectorizedIV) {
+ // Create the vector values from the scalar IV, in the absence of creating a
+ // vector IV.
+ auto CreateSplatIV = [&](Value *ScalarIV, Value *Step) {
Value *Broadcasted = getBroadcastInstrs(ScalarIV);
for (unsigned Part = 0; Part < UF; ++Part) {
Value *EntryPart =
@@ -1887,23 +1881,53 @@ void InnerLoopVectorizer::widenIntOrFpInduction(PHINode *IV, TruncInst *Trunc) {
addMetadata(EntryPart, Trunc);
recordVectorLoopValueForInductionCast(ID, EntryVal, EntryPart, Part);
}
+ };
+
+ // Now do the actual transformations, and start with creating the step value.
+ Value *Step = CreateStepValue(ID.getStep());
+ if (VF <= 1) {
+ Value *ScalarIV = CreateScalarIV(Step);
+ CreateSplatIV(ScalarIV, Step);
+ return;
+ }
+
+ // Determine if we want a scalar version of the induction variable. This is
+ // true if the induction variable itself is not widened, or if it has at
+ // least one user in the loop that is not widened.
+ auto NeedsScalarIV = needsScalarInduction(EntryVal);
+ if (!NeedsScalarIV) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ return;
}
- // If an induction variable is only used for counting loop iterations or
- // calculating addresses, it doesn't need to be widened. Create scalar steps
- // that can be used by instructions we will later scalarize. Note that the
- // addition of the scalar steps will not increase the number of instructions
- // in the loop in the common case prior to InstCombine. We will be trading
- // one vector extract for each scalar step.
- if (NeedsScalarIV)
+ // Try to create a new independent vector induction variable. If we can't
+ // create the phi node, we will splat the scalar induction variable in each
+ // loop iteration.
+ if (!shouldScalarizeInstruction(EntryVal)) {
+ createVectorIntOrFpInductionPHI(ID, Step, EntryVal);
+ Value *ScalarIV = CreateScalarIV(Step);
+ // Create scalar steps that can be used by instructions we will later
+ // scalarize. Note that the addition of the scalar steps will not increase
+ // the number of instructions in the loop in the common case prior to
+ // InstCombine. We will be trading one vector extract for each scalar step.
buildScalarSteps(ScalarIV, Step, EntryVal, ID);
+ return;
+ }
+
+ // All IV users are scalar instructions, so only emit a scalar IV, not a
+ // vectorized IV, except when we tail-fold, in which case the splat IV feeds
+ // the predicate used by the masked loads/stores.
+ Value *ScalarIV = CreateScalarIV(Step);
+ if (!Cost->isScalarEpilogueAllowed())
+ CreateSplatIV(ScalarIV, Step);
+ buildScalarSteps(ScalarIV, Step, EntryVal, ID);
}
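Reading the restructured widenIntOrFpInduction as a whole (an editorial summary of the hunk above, not new behavior): for VF <= 1 only the scalar IV and its splat are created; if no in-loop user needs a scalar IV, only the vector IV phi is created; if a scalar IV is needed but the IV itself is not scalarized, both the vector IV and the scalar steps are emitted; otherwise only the scalar IV and the scalar steps are emitted, plus the splat when the tail is folded by masking, since the splat then feeds the predicate for the masked loads and stores.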
Value *InnerLoopVectorizer::getStepVector(Value *Val, int StartIdx, Value *Step,
Instruction::BinaryOps BinOp) {
// Create and check the types.
- assert(Val->getType()->isVectorTy() && "Must be a vector");
- int VLen = Val->getType()->getVectorNumElements();
+ auto *ValVTy = cast<VectorType>(Val->getType());
+ int VLen = ValVTy->getNumElements();
Type *STy = Val->getType()->getScalarType();
assert((STy->isIntegerTy() || STy->isFloatingPointTy()) &&
@@ -2052,7 +2076,7 @@ Value *InnerLoopVectorizer::getOrCreateVectorValue(Value *V, unsigned Part) {
VectorLoopValueMap.setVectorValue(V, Part, VectorValue);
} else {
// Initialize packing with insertelements to start from undef.
- Value *Undef = UndefValue::get(VectorType::get(V->getType(), VF));
+ Value *Undef = UndefValue::get(FixedVectorType::get(V->getType(), VF));
VectorLoopValueMap.setVectorValue(V, Part, Undef);
for (unsigned Lane = 0; Lane < VF; ++Lane)
packScalarIntoVectorValue(V, {Part, Lane});
@@ -2118,13 +2142,12 @@ void InnerLoopVectorizer::packScalarIntoVectorValue(
Value *InnerLoopVectorizer::reverseVector(Value *Vec) {
assert(Vec->getType()->isVectorTy() && "Invalid type");
- SmallVector<Constant *, 8> ShuffleMask;
+ SmallVector<int, 8> ShuffleMask;
for (unsigned i = 0; i < VF; ++i)
- ShuffleMask.push_back(Builder.getInt32(VF - i - 1));
+ ShuffleMask.push_back(VF - i - 1);
return Builder.CreateShuffleVector(Vec, UndefValue::get(Vec->getType()),
- ConstantVector::get(ShuffleMask),
- "reverse");
+ ShuffleMask, "reverse");
}
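The reverse mask is now a list of plain int indices instead of a ConstantVector. A tiny standalone sketch of the index arithmetic (editorial only, with std::vector standing in for SmallVector):

#include <vector>

// Lane i of the reversed vector reads lane VF-1-i of the input;
// for VF = 4 the mask is {3, 2, 1, 0}.
static std::vector<int> reverseMask(unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    Mask.push_back(static_cast<int>(VF - I - 1));
  return Mask;
}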
// Return whether we allow using masked interleave-groups (for dealing with
@@ -2166,24 +2189,16 @@ static bool useMaskedInterleavedAccesses(const TargetTransformInfo &TTI) {
// %interleaved.vec = shuffle %R_G.vec, %B_U.vec,
// <0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11> ; Interleave R,G,B elements
// store <12 x i32> %interleaved.vec ; Write 4 tuples of R,G,B
-void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
- VPTransformState &State,
- VPValue *Addr,
- VPValue *BlockInMask) {
- const InterleaveGroup<Instruction> *Group =
- Cost->getInterleavedAccessGroup(Instr);
- assert(Group && "Fail to get an interleaved access group.");
-
- // Skip if current instruction is not the insert position.
- if (Instr != Group->getInsertPos())
- return;
-
+void InnerLoopVectorizer::vectorizeInterleaveGroup(
+ const InterleaveGroup<Instruction> *Group, VPTransformState &State,
+ VPValue *Addr, VPValue *BlockInMask) {
+ Instruction *Instr = Group->getInsertPos();
const DataLayout &DL = Instr->getModule()->getDataLayout();
// Prepare for the vector type of the interleaved load/store.
Type *ScalarTy = getMemInstValueType(Instr);
unsigned InterleaveFactor = Group->getFactor();
- Type *VecTy = VectorType::get(ScalarTy, InterleaveFactor * VF);
+ auto *VecTy = FixedVectorType::get(ScalarTy, InterleaveFactor * VF);
// Prepare for the new pointers.
SmallVector<Value *, 2> AddrParts;
@@ -2252,21 +2267,21 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
- auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
+ BlockInMaskPart, Undefs,
+ createReplicatedMask(InterleaveFactor, VF), "interleaved.mask");
GroupMask = MaskForGaps
? Builder.CreateBinOp(Instruction::And, ShuffledMask,
MaskForGaps)
: ShuffledMask;
}
NewLoad =
- Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlignment(),
+ Builder.CreateMaskedLoad(AddrParts[Part], Group->getAlign(),
GroupMask, UndefVec, "wide.masked.vec");
}
else
NewLoad = Builder.CreateAlignedLoad(VecTy, AddrParts[Part],
- Group->getAlignment(), "wide.vec");
+ Group->getAlign(), "wide.vec");
Group->addMetadata(NewLoad);
NewLoads.push_back(NewLoad);
}
@@ -2280,14 +2295,14 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
if (!Member)
continue;
- Constant *StrideMask = createStrideMask(Builder, I, InterleaveFactor, VF);
+ auto StrideMask = createStrideMask(I, InterleaveFactor, VF);
for (unsigned Part = 0; Part < UF; Part++) {
Value *StridedVec = Builder.CreateShuffleVector(
NewLoads[Part], UndefVec, StrideMask, "strided.vec");
// If this member has different type, cast the result type.
if (Member->getType() != ScalarTy) {
- VectorType *OtherVTy = VectorType::get(Member->getType(), VF);
+ VectorType *OtherVTy = FixedVectorType::get(Member->getType(), VF);
StridedVec = createBitOrPointerCast(StridedVec, OtherVTy, DL);
}
@@ -2301,7 +2316,7 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
}
// The sub vector type for current instruction.
- VectorType *SubVT = VectorType::get(ScalarTy, VF);
+ auto *SubVT = FixedVectorType::get(ScalarTy, VF);
// Vectorize the interleaved store group.
for (unsigned Part = 0; Part < UF; Part++) {
@@ -2329,23 +2344,23 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
Value *WideVec = concatenateVectors(Builder, StoredVecs);
// Interleave the elements in the wide vector.
- Constant *IMask = createInterleaveMask(Builder, VF, InterleaveFactor);
- Value *IVec = Builder.CreateShuffleVector(WideVec, UndefVec, IMask,
- "interleaved.vec");
+ Value *IVec = Builder.CreateShuffleVector(
+ WideVec, UndefVec, createInterleaveMask(VF, InterleaveFactor),
+ "interleaved.vec");
Instruction *NewStoreInstr;
if (BlockInMask) {
Value *BlockInMaskPart = State.get(BlockInMask, Part);
auto *Undefs = UndefValue::get(BlockInMaskPart->getType());
- auto *RepMask = createReplicatedMask(Builder, InterleaveFactor, VF);
Value *ShuffledMask = Builder.CreateShuffleVector(
- BlockInMaskPart, Undefs, RepMask, "interleaved.mask");
+ BlockInMaskPart, Undefs, createReplicatedMask(InterleaveFactor, VF),
+ "interleaved.mask");
NewStoreInstr = Builder.CreateMaskedStore(
- IVec, AddrParts[Part], Group->getAlignment(), ShuffledMask);
+ IVec, AddrParts[Part], Group->getAlign(), ShuffledMask);
}
else
- NewStoreInstr = Builder.CreateAlignedStore(IVec, AddrParts[Part],
- Group->getAlignment());
+ NewStoreInstr =
+ Builder.CreateAlignedStore(IVec, AddrParts[Part], Group->getAlign());
Group->addMetadata(NewStoreInstr);
}
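The interleaved-access hunks above likewise switch to the integer-mask forms of createReplicatedMask and createInterleaveMask. A standalone sketch of the two mask shapes (editorial; std::vector stands in for the ADT types, and the loops mirror the semantics of the VectorUtils helpers as used here): createReplicatedMask(3, 2) gives {0, 0, 0, 1, 1, 1}, and createInterleaveMask(4, 2) gives {0, 4, 1, 5, 2, 6, 3, 7}.

#include <vector>

// Repeat each of the VF lanes ReplicationFactor times, so one per-iteration
// mask bit guards InterleaveFactor consecutive elements of the wide vector.
static std::vector<int> replicatedMask(unsigned ReplicationFactor, unsigned VF) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < ReplicationFactor; ++J)
      Mask.push_back(static_cast<int>(I));
  return Mask;
}

// Interleave NumVecs vectors of VF lanes: member 0 lane 0, member 1 lane 0,
// ..., then member 0 lane 1, and so on.
static std::vector<int> interleaveMask(unsigned VF, unsigned NumVecs) {
  std::vector<int> Mask;
  for (unsigned I = 0; I < VF; ++I)
    for (unsigned J = 0; J < NumVecs; ++J)
      Mask.push_back(static_cast<int>(J * VF + I));
  return Mask;
}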
@@ -2354,27 +2369,26 @@ void InnerLoopVectorizer::vectorizeInterleaveGroup(Instruction *Instr,
void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
VPTransformState &State,
VPValue *Addr,
+ VPValue *StoredValue,
VPValue *BlockInMask) {
// Attempt to issue a wide load.
LoadInst *LI = dyn_cast<LoadInst>(Instr);
StoreInst *SI = dyn_cast<StoreInst>(Instr);
assert((LI || SI) && "Invalid Load/Store instruction");
+ assert((!SI || StoredValue) && "No stored value provided for widened store");
+ assert((!LI || !StoredValue) && "Stored value provided for widened load");
LoopVectorizationCostModel::InstWidening Decision =
Cost->getWideningDecision(Instr, VF);
- assert(Decision != LoopVectorizationCostModel::CM_Unknown &&
- "CM decision should be taken at this point");
- if (Decision == LoopVectorizationCostModel::CM_Interleave)
- return vectorizeInterleaveGroup(Instr, State, Addr, BlockInMask);
+ assert((Decision == LoopVectorizationCostModel::CM_Widen ||
+ Decision == LoopVectorizationCostModel::CM_Widen_Reverse ||
+ Decision == LoopVectorizationCostModel::CM_GatherScatter) &&
+ "CM decision is not to widen the memory instruction");
Type *ScalarDataTy = getMemInstValueType(Instr);
- Type *DataTy = VectorType::get(ScalarDataTy, VF);
- // An alignment of 0 means target abi alignment. We need to use the scalar's
- // target abi alignment in such a case.
- const DataLayout &DL = Instr->getModule()->getDataLayout();
- const Align Alignment =
- DL.getValueOrABITypeAlignment(getLoadStoreAlignment(Instr), ScalarDataTy);
+ auto *DataTy = FixedVectorType::get(ScalarDataTy, VF);
+ const Align Alignment = getLoadStoreAlignment(Instr);
// Determine if the pointer operand of the access is either consecutive or
// reverse consecutive.
@@ -2431,12 +2445,12 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
for (unsigned Part = 0; Part < UF; ++Part) {
Instruction *NewSI = nullptr;
- Value *StoredVal = getOrCreateVectorValue(SI->getValueOperand(), Part);
+ Value *StoredVal = State.get(StoredValue, Part);
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(Addr, Part);
- NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep,
- Alignment.value(), MaskPart);
+ NewSI = Builder.CreateMaskedScatter(StoredVal, VectorGep, Alignment,
+ MaskPart);
} else {
if (Reverse) {
// If we store to reverse consecutive memory locations, then we need
@@ -2447,11 +2461,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
}
auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
- NewSI = Builder.CreateMaskedStore(
- StoredVal, VecPtr, Alignment.value(), BlockInMaskParts[Part]);
+ NewSI = Builder.CreateMaskedStore(StoredVal, VecPtr, Alignment,
+ BlockInMaskParts[Part]);
else
- NewSI =
- Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment.value());
+ NewSI = Builder.CreateAlignedStore(StoredVal, VecPtr, Alignment);
}
addMetadata(NewSI, SI);
}
@@ -2466,18 +2479,18 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
if (CreateGatherScatter) {
Value *MaskPart = isMaskRequired ? BlockInMaskParts[Part] : nullptr;
Value *VectorGep = State.get(Addr, Part);
- NewLI = Builder.CreateMaskedGather(VectorGep, Alignment.value(), MaskPart,
+ NewLI = Builder.CreateMaskedGather(VectorGep, Alignment, MaskPart,
nullptr, "wide.masked.gather");
addMetadata(NewLI, LI);
} else {
auto *VecPtr = CreateVecPtr(Part, State.get(Addr, {0, 0}));
if (isMaskRequired)
NewLI = Builder.CreateMaskedLoad(
- VecPtr, Alignment.value(), BlockInMaskParts[Part],
- UndefValue::get(DataTy), "wide.masked.load");
+ VecPtr, Alignment, BlockInMaskParts[Part], UndefValue::get(DataTy),
+ "wide.masked.load");
else
- NewLI = Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment.value(),
- "wide.load");
+ NewLI =
+ Builder.CreateAlignedLoad(DataTy, VecPtr, Alignment, "wide.load");
// Add metadata to the load, but setVectorValue to the reverse shuffle.
addMetadata(NewLI, LI);
@@ -2488,9 +2501,10 @@ void InnerLoopVectorizer::vectorizeMemoryInstruction(Instruction *Instr,
}
}
-void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
+void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr, VPUser &User,
const VPIteration &Instance,
- bool IfPredicateInstr) {
+ bool IfPredicateInstr,
+ VPTransformState &State) {
assert(!Instr->getType()->isAggregateType() && "Can't handle vectors");
setDebugLocFromInst(Builder, Instr);
@@ -2504,8 +2518,8 @@ void InnerLoopVectorizer::scalarizeInstruction(Instruction *Instr,
// Replace the operands of the cloned instructions with their scalar
// equivalents in the new loop.
- for (unsigned op = 0, e = Instr->getNumOperands(); op != e; ++op) {
- auto *NewOp = getOrCreateScalarValue(Instr->getOperand(op), Instance);
+ for (unsigned op = 0, e = User.getNumOperands(); op != e; ++op) {
+ auto *NewOp = State.get(User.getOperand(op), Instance);
Cloned->setOperand(op, NewOp);
}
addNewMetadata(Cloned, Instr);
@@ -2578,7 +2592,7 @@ Value *InnerLoopVectorizer::getOrCreateTripCount(Loop *L) {
// compare. The only way that we get a backedge taken count is that the
// induction variable was signed and as such will not overflow. In such a case
// truncation is legal.
- if (BackedgeTakenCount->getType()->getPrimitiveSizeInBits() >
+ if (SE->getTypeSizeInBits(BackedgeTakenCount->getType()) >
IdxTy->getPrimitiveSizeInBits())
BackedgeTakenCount = SE->getTruncateOrNoop(BackedgeTakenCount, IdxTy);
BackedgeTakenCount = SE->getNoopOrZeroExtend(BackedgeTakenCount, IdxTy);
@@ -2676,7 +2690,7 @@ Value *InnerLoopVectorizer::createBitOrPointerCast(Value *V, VectorType *DstVTy,
"Only one type should be a floating point type");
Type *IntTy =
IntegerType::getIntNTy(V->getContext(), DL.getTypeSizeInBits(SrcElemTy));
- VectorType *VecIntTy = VectorType::get(IntTy, VF);
+ auto *VecIntTy = FixedVectorType::get(IntTy, VF);
Value *CastVal = Builder.CreateBitOrPointerCast(V, VecIntTy);
return Builder.CreateBitOrPointerCast(CastVal, DstVTy);
}
@@ -2774,12 +2788,17 @@ void InnerLoopVectorizer::emitMemRuntimeChecks(Loop *L, BasicBlock *Bypass) {
// Generate the code that checks in runtime if arrays overlap. We put the
// checks into a separate block to make the more common case of few elements
// faster.
+ auto *LAI = Legal->getLAI();
+ const auto &RtPtrChecking = *LAI->getRuntimePointerChecking();
+ if (!RtPtrChecking.Need)
+ return;
Instruction *FirstCheckInst;
Instruction *MemRuntimeCheck;
std::tie(FirstCheckInst, MemRuntimeCheck) =
- Legal->getLAI()->addRuntimeChecks(MemCheckBlock->getTerminator());
- if (!MemRuntimeCheck)
- return;
+ addRuntimeChecks(MemCheckBlock->getTerminator(), OrigLoop,
+ RtPtrChecking.getChecks(), RtPtrChecking.getSE());
+ assert(MemRuntimeCheck && "no RT checks generated although RtPtrChecking "
+ "claimed checks are required");
if (MemCheckBlock->getParent()->hasOptSize()) {
assert(Cost->Hints->getForce() == LoopVectorizeHints::FK_Enabled &&
@@ -2858,6 +2877,18 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
return B.CreateMul(X, Y);
};
+ // Get a suitable insert point for SCEV expansion. For blocks in the vector
+ // loop, choose the end of the vector loop header (=LoopVectorBody), because
+ // the DomTree is not kept up-to-date for additional blocks generated in the
+ // vector loop. By using the header as insertion point, we guarantee that the
+ // expanded instructions dominate all their uses.
+ auto GetInsertPoint = [this, &B]() {
+ BasicBlock *InsertBB = B.GetInsertPoint()->getParent();
+ if (InsertBB != LoopVectorBody &&
+ LI->getLoopFor(LoopVectorBody) == LI->getLoopFor(InsertBB))
+ return LoopVectorBody->getTerminator();
+ return &*B.GetInsertPoint();
+ };
switch (ID.getKind()) {
case InductionDescriptor::IK_IntInduction: {
assert(Index->getType() == StartValue->getType() &&
@@ -2865,7 +2896,7 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
if (ID.getConstIntStepValue() && ID.getConstIntStepValue()->isMinusOne())
return B.CreateSub(StartValue, Index);
auto *Offset = CreateMul(
- Index, Exp.expandCodeFor(Step, Index->getType(), &*B.GetInsertPoint()));
+ Index, Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint()));
return CreateAdd(StartValue, Offset);
}
case InductionDescriptor::IK_PtrInduction: {
@@ -2873,8 +2904,8 @@ Value *InnerLoopVectorizer::emitTransformedIndex(
"Expected constant step for pointer induction");
return B.CreateGEP(
StartValue->getType()->getPointerElementType(), StartValue,
- CreateMul(Index, Exp.expandCodeFor(Step, Index->getType(),
- &*B.GetInsertPoint())));
+ CreateMul(Index,
+ Exp.expandCodeFor(Step, Index->getType(), GetInsertPoint())));
}
case InductionDescriptor::IK_FpInduction: {
assert(Step->getType()->isFloatingPointTy() && "Expected FP Step value");
@@ -3034,8 +3065,7 @@ BasicBlock *InnerLoopVectorizer::createVectorizedLoopSkeleton() {
// This variable saves the new starting index for the scalar loop. It is used
// to test if there are any tail iterations left once the vector loop has
// completed.
- LoopVectorizationLegality::InductionList *List = Legal->getInductionVars();
- for (auto &InductionEntry : *List) {
+ for (auto &InductionEntry : Legal->getInductionVars()) {
PHINode *OrigPhi = InductionEntry.first;
InductionDescriptor II = InductionEntry.second;
@@ -3258,7 +3288,6 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
unsigned VF,
bool &NeedToScalarize) {
Function *F = CI->getCalledFunction();
- StringRef FnName = CI->getCalledFunction()->getName();
Type *ScalarRetTy = CI->getType();
SmallVector<Type *, 4> Tys, ScalarTys;
for (auto &ArgOp : CI->arg_operands())
@@ -3268,7 +3297,8 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// to be vectors, so we need to extract individual elements from there,
// execute VF scalar calls, and then gather the result into the vector return
// value.
- unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys);
+ unsigned ScalarCallCost = TTI.getCallInstrCost(F, ScalarRetTy, ScalarTys,
+ TTI::TCK_RecipThroughput);
if (VF == 1)
return ScalarCallCost;
@@ -3286,11 +3316,15 @@ unsigned LoopVectorizationCostModel::getVectorCallCost(CallInst *CI,
// If we can't emit a vector call for this function, then the currently found
// cost is the cost we need to return.
NeedToScalarize = true;
- if (!TLI || !TLI->isFunctionVectorizable(FnName, VF) || CI->isNoBuiltin())
+ VFShape Shape = VFShape::get(*CI, {VF, false}, false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!TLI || CI->isNoBuiltin() || !VecFunc)
return Cost;
// If the corresponding vector cost is cheaper, return its cost.
- unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys);
+ unsigned VectorCallCost = TTI.getCallInstrCost(nullptr, RetTy, Tys,
+ TTI::TCK_RecipThroughput);
if (VectorCallCost < Cost) {
NeedToScalarize = false;
return VectorCallCost;
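The call-cost hunk above now asks the VFDatabase for a vector variant instead of querying TLI by name and VF. A minimal sketch of that lookup (assuming the VFShape/VFDatabase API from llvm/Analysis/VectorUtils.h, as used in the hunk; findVectorVariant is a made-up helper name):

#include "llvm/Analysis/VectorUtils.h"
#include "llvm/IR/Instructions.h"

// Return the vector variant registered for this call at the given VF, or
// nullptr when none exists and the caller has to scalarize.
static llvm::Function *findVectorVariant(llvm::CallInst &CI, unsigned VF) {
  llvm::VFShape Shape =
      llvm::VFShape::get(CI, {VF, /*Scalable=*/false}, /*HasGlobalPred=*/false);
  return llvm::VFDatabase(CI).getVectorizedFunction(Shape);
}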
@@ -3303,22 +3337,20 @@ unsigned LoopVectorizationCostModel::getVectorIntrinsicCost(CallInst *CI,
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
assert(ID && "Expected intrinsic call!");
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
- FMF = FPMO->getFastMathFlags();
-
- SmallVector<Value *, 4> Operands(CI->arg_operands());
- return TTI.getIntrinsicInstrCost(ID, CI->getType(), Operands, FMF, VF);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VF);
+ return TTI.getIntrinsicInstrCost(CostAttrs,
+ TargetTransformInfo::TCK_RecipThroughput);
}
static Type *smallestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(T1->getVectorElementType());
- auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
return I1->getBitWidth() < I2->getBitWidth() ? T1 : T2;
}
+
static Type *largestIntegerVectorType(Type *T1, Type *T2) {
- auto *I1 = cast<IntegerType>(T1->getVectorElementType());
- auto *I2 = cast<IntegerType>(T2->getVectorElementType());
+ auto *I1 = cast<IntegerType>(cast<VectorType>(T1)->getElementType());
+ auto *I2 = cast<IntegerType>(cast<VectorType>(T2)->getElementType());
return I1->getBitWidth() > I2->getBitWidth() ? T1 : T2;
}
@@ -3335,14 +3367,13 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
continue;
for (unsigned Part = 0; Part < UF; ++Part) {
Value *I = getOrCreateVectorValue(KV.first, Part);
- if (Erased.find(I) != Erased.end() || I->use_empty() ||
- !isa<Instruction>(I))
+ if (Erased.count(I) || I->use_empty() || !isa<Instruction>(I))
continue;
Type *OriginalTy = I->getType();
Type *ScalarTruncatedTy =
IntegerType::get(OriginalTy->getContext(), KV.second);
- Type *TruncatedTy = VectorType::get(ScalarTruncatedTy,
- OriginalTy->getVectorNumElements());
+ auto *TruncatedTy = FixedVectorType::get(
+ ScalarTruncatedTy, cast<VectorType>(OriginalTy)->getNumElements());
if (TruncatedTy == OriginalTy)
continue;
@@ -3392,27 +3423,35 @@ void InnerLoopVectorizer::truncateToMinimalBitwidths() {
break;
}
} else if (auto *SI = dyn_cast<ShuffleVectorInst>(I)) {
- auto Elements0 = SI->getOperand(0)->getType()->getVectorNumElements();
+ auto Elements0 =
+ cast<VectorType>(SI->getOperand(0)->getType())->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
- SI->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements0));
- auto Elements1 = SI->getOperand(1)->getType()->getVectorNumElements();
+ SI->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements0));
+ auto Elements1 =
+ cast<VectorType>(SI->getOperand(1)->getType())->getNumElements();
auto *O1 = B.CreateZExtOrTrunc(
- SI->getOperand(1), VectorType::get(ScalarTruncatedTy, Elements1));
+ SI->getOperand(1),
+ FixedVectorType::get(ScalarTruncatedTy, Elements1));
- NewI = B.CreateShuffleVector(O0, O1, SI->getMask());
+ NewI = B.CreateShuffleVector(O0, O1, SI->getShuffleMask());
} else if (isa<LoadInst>(I) || isa<PHINode>(I)) {
// Don't do anything with the operands, just extend the result.
continue;
} else if (auto *IE = dyn_cast<InsertElementInst>(I)) {
- auto Elements = IE->getOperand(0)->getType()->getVectorNumElements();
+ auto Elements =
+ cast<VectorType>(IE->getOperand(0)->getType())->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
- IE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ IE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
auto *O1 = B.CreateZExtOrTrunc(IE->getOperand(1), ScalarTruncatedTy);
NewI = B.CreateInsertElement(O0, O1, IE->getOperand(2));
} else if (auto *EE = dyn_cast<ExtractElementInst>(I)) {
- auto Elements = EE->getOperand(0)->getType()->getVectorNumElements();
+ auto Elements =
+ cast<VectorType>(EE->getOperand(0)->getType())->getNumElements();
auto *O0 = B.CreateZExtOrTrunc(
- EE->getOperand(0), VectorType::get(ScalarTruncatedTy, Elements));
+ EE->getOperand(0),
+ FixedVectorType::get(ScalarTruncatedTy, Elements));
NewI = B.CreateExtractElement(O0, EE->getOperand(2));
} else {
// If we don't know what to do, be conservative and don't do anything.
@@ -3471,7 +3510,7 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
PSE.getSE()->forgetLoop(OrigLoop);
// Fix-up external users of the induction variables.
- for (auto &Entry : *Legal->getInductionVars())
+ for (auto &Entry : Legal->getInductionVars())
fixupIVUsers(Entry.first, Entry.second,
getOrCreateVectorTripCount(LI->getLoopFor(LoopVectorBody)),
IVEndValues[Entry.first], LoopMiddleBlock);
@@ -3482,6 +3521,19 @@ void InnerLoopVectorizer::fixVectorizedLoop() {
// Remove redundant induction instructions.
cse(LoopVectorBody);
+
+ // Set/update profile weights for the vector and remainder loops as the
+ // original loop iterations are now distributed among them. Note that the
+ // original loop, represented by LoopScalarBody, becomes the remainder loop
+ // after vectorization.
+ //
+ // For cases like foldTailByMasking() and requiresScalarEpilogue() we may end
+ // up with a slightly roughened result, but that should be OK since the
+ // profile is not inherently precise anyway. Note also that a possible bypass
+ // of the vector code caused by legality checks is ignored, optimistically
+ // assigning all the weight to the vector loop.
+ setProfileInfoAfterUnrolling(LI->getLoopFor(LoopScalarBody),
+ LI->getLoopFor(LoopVectorBody),
+ LI->getLoopFor(LoopScalarBody), VF * UF);
}
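To make the profile redistribution above concrete with a hypothetical example (the numbers are illustrative, not from the patch): if the original loop's profile implies about 1000 iterations per entry and VF * UF = 8, the vector loop is assigned roughly 1000 / 8 = 125 iterations, while the scalar loop, now acting as the remainder, keeps only the up-to-7 leftover iterations, so together they still account for roughly the original 1000.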
void InnerLoopVectorizer::fixCrossIterationPHIs() {
@@ -3563,8 +3615,8 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
if (VF > 1) {
Builder.SetInsertPoint(LoopVectorPreHeader->getTerminator());
VectorInit = Builder.CreateInsertElement(
- UndefValue::get(VectorType::get(VectorInit->getType(), VF)), VectorInit,
- Builder.getInt32(VF - 1), "vector.recur.init");
+ UndefValue::get(FixedVectorType::get(VectorInit->getType(), VF)),
+ VectorInit, Builder.getInt32(VF - 1), "vector.recur.init");
}
// We constructed a temporary phi node in the first phase of vectorization.
@@ -3605,10 +3657,10 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
// We will construct a vector for the recurrence by combining the values for
// the current and previous iterations. This is the required shuffle mask.
- SmallVector<Constant *, 8> ShuffleMask(VF);
- ShuffleMask[0] = Builder.getInt32(VF - 1);
+ SmallVector<int, 8> ShuffleMask(VF);
+ ShuffleMask[0] = VF - 1;
for (unsigned I = 1; I < VF; ++I)
- ShuffleMask[I] = Builder.getInt32(I + VF - 1);
+ ShuffleMask[I] = I + VF - 1;
// The vector from which to take the initial value for the current iteration
// (actual or unrolled). Initially, this is the vector phi node.
@@ -3618,10 +3670,9 @@ void InnerLoopVectorizer::fixFirstOrderRecurrence(PHINode *Phi) {
for (unsigned Part = 0; Part < UF; ++Part) {
Value *PreviousPart = getOrCreateVectorValue(Previous, Part);
Value *PhiPart = VectorLoopValueMap.getVectorValue(Phi, Part);
- auto *Shuffle =
- VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
- ConstantVector::get(ShuffleMask))
- : Incoming;
+ auto *Shuffle = VF > 1 ? Builder.CreateShuffleVector(Incoming, PreviousPart,
+ ShuffleMask)
+ : Incoming;
PhiPart->replaceAllUsesWith(Shuffle);
cast<Instruction>(PhiPart)->eraseFromParent();
VectorLoopValueMap.resetVectorValue(Phi, Part, Shuffle);
@@ -3684,7 +3735,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// Get it's reduction variable descriptor.
assert(Legal->isReductionVariable(Phi) &&
"Unable to find the reduction variable");
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[Phi];
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[Phi];
RecurrenceDescriptor::RecurrenceKind RK = RdxDesc.getRecurrenceKind();
TrackingVH<Value> ReductionStartValue = RdxDesc.getRecurrenceStartValue();
@@ -3725,7 +3776,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// incoming scalar reduction.
VectorStart = ReductionStartValue;
} else {
- Identity = ConstantVector::getSplat(VF, Iden);
+ Identity = ConstantVector::getSplat({VF, false}, Iden);
// This vector is the Identity vector where the first element is the
// incoming scalar reduction.
@@ -3787,7 +3838,7 @@ void InnerLoopVectorizer::fixReduction(PHINode *Phi) {
// then extend the loop exit value to enable InstCombine to evaluate the
// entire expression in the smaller type.
if (VF > 1 && Phi->getType() != RdxDesc.getRecurrenceType()) {
- Type *RdxVecTy = VectorType::get(RdxDesc.getRecurrenceType(), VF);
+ Type *RdxVecTy = FixedVectorType::get(RdxDesc.getRecurrenceType(), VF);
Builder.SetInsertPoint(
LI->getLoopFor(LoopVectorBody)->getLoopLatch()->getTerminator());
VectorParts RdxParts(UF);
@@ -4036,9 +4087,11 @@ void InnerLoopVectorizer::fixNonInductionPHIs() {
}
}
-void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
- unsigned VF, bool IsPtrLoopInvariant,
- SmallBitVector &IsIndexLoopInvariant) {
+void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, VPUser &Operands,
+ unsigned UF, unsigned VF,
+ bool IsPtrLoopInvariant,
+ SmallBitVector &IsIndexLoopInvariant,
+ VPTransformState &State) {
// Construct a vector GEP by widening the operands of the scalar GEP as
// necessary. We mark the vector GEP 'inbounds' if appropriate. A GEP
// results in a vector of pointers when at least one operand of the GEP
@@ -4075,19 +4128,18 @@ void InnerLoopVectorizer::widenGEP(GetElementPtrInst *GEP, unsigned UF,
for (unsigned Part = 0; Part < UF; ++Part) {
// The pointer operand of the new GEP. If it's loop-invariant, we
// won't broadcast it.
- auto *Ptr = IsPtrLoopInvariant
- ? GEP->getPointerOperand()
- : getOrCreateVectorValue(GEP->getPointerOperand(), Part);
+ auto *Ptr = IsPtrLoopInvariant ? State.get(Operands.getOperand(0), {0, 0})
+ : State.get(Operands.getOperand(0), Part);
// Collect all the indices for the new GEP. If any index is
// loop-invariant, we won't broadcast it.
SmallVector<Value *, 4> Indices;
- for (auto Index : enumerate(GEP->indices())) {
- Value *User = Index.value().get();
- if (IsIndexLoopInvariant[Index.index()])
- Indices.push_back(User);
+ for (unsigned I = 1, E = Operands.getNumOperands(); I < E; I++) {
+ VPValue *Operand = Operands.getOperand(I);
+ if (IsIndexLoopInvariant[I - 1])
+ Indices.push_back(State.get(Operand, {0, 0}));
else
- Indices.push_back(getOrCreateVectorValue(User, Part));
+ Indices.push_back(State.get(Operand, Part));
}
// Create the new GEP. Note that this GEP may be a scalar if VF == 1,
@@ -4114,7 +4166,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// Create a vector phi with no operands - the vector phi operands will be
// set at the end of vector code generation.
Type *VecTy =
- (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *VecPhi = Builder.CreatePHI(VecTy, PN->getNumOperands(), "vec.phi");
VectorLoopValueMap.setVectorValue(P, 0, VecPhi);
OrigPHIsToFix.push_back(P);
@@ -4133,7 +4185,7 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
for (unsigned Part = 0; Part < UF; ++Part) {
// This is phase one of vectorizing PHIs.
Type *VecTy =
- (VF == 1) ? PN->getType() : VectorType::get(PN->getType(), VF);
+ (VF == 1) ? PN->getType() : FixedVectorType::get(PN->getType(), VF);
Value *EntryPart = PHINode::Create(
VecTy, 2, "vec.phi", &*LoopVectorBody->getFirstInsertionPt());
VectorLoopValueMap.setVectorValue(P, Part, EntryPart);
@@ -4145,9 +4197,9 @@ void InnerLoopVectorizer::widenPHIInstruction(Instruction *PN, unsigned UF,
// This PHINode must be an induction variable.
// Make sure that we know about it.
- assert(Legal->getInductionVars()->count(P) && "Not an induction variable");
+ assert(Legal->getInductionVars().count(P) && "Not an induction variable");
- InductionDescriptor II = Legal->getInductionVars()->lookup(P);
+ InductionDescriptor II = Legal->getInductionVars().lookup(P);
const DataLayout &DL = OrigLoop->getHeader()->getModule()->getDataLayout();
// FIXME: The newly created binary instructions should contain nsw/nuw flags,
@@ -4203,11 +4255,14 @@ static bool mayDivideByZero(Instruction &I) {
return !CInt || CInt->isZero();
}
-void InnerLoopVectorizer::widenInstruction(Instruction &I) {
+void InnerLoopVectorizer::widenInstruction(Instruction &I, VPUser &User,
+ VPTransformState &State) {
switch (I.getOpcode()) {
+ case Instruction::Call:
case Instruction::Br:
case Instruction::PHI:
case Instruction::GetElementPtr:
+ case Instruction::Select:
llvm_unreachable("This instruction is handled by a different recipe.");
case Instruction::UDiv:
case Instruction::SDiv:
@@ -4233,8 +4288,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
for (unsigned Part = 0; Part < UF; ++Part) {
SmallVector<Value *, 2> Ops;
- for (Value *Op : I.operands())
- Ops.push_back(getOrCreateVectorValue(Op, Part));
+ for (VPValue *VPOp : User.operands())
+ Ops.push_back(State.get(VPOp, Part));
Value *V = Builder.CreateNAryOp(I.getOpcode(), Ops);
@@ -4248,35 +4303,6 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
break;
}
- case Instruction::Select: {
- // Widen selects.
- // If the selector is loop invariant we can create a select
- // instruction with a scalar condition. Otherwise, use vector-select.
- auto *SE = PSE.getSE();
- bool InvariantCond =
- SE->isLoopInvariant(PSE.getSCEV(I.getOperand(0)), OrigLoop);
- setDebugLocFromInst(Builder, &I);
-
- // The condition can be loop invariant but still defined inside the
- // loop. This means that we can't just use the original 'cond' value.
- // We have to take the 'vectorized' value and pick the first lane.
- // Instcombine will make this a no-op.
-
- auto *ScalarCond = getOrCreateScalarValue(I.getOperand(0), {0, 0});
-
- for (unsigned Part = 0; Part < UF; ++Part) {
- Value *Cond = getOrCreateVectorValue(I.getOperand(0), Part);
- Value *Op0 = getOrCreateVectorValue(I.getOperand(1), Part);
- Value *Op1 = getOrCreateVectorValue(I.getOperand(2), Part);
- Value *Sel =
- Builder.CreateSelect(InvariantCond ? ScalarCond : Cond, Op0, Op1);
- VectorLoopValueMap.setVectorValue(&I, Part, Sel);
- addMetadata(Sel, &I);
- }
-
- break;
- }
-
case Instruction::ICmp:
case Instruction::FCmp: {
// Widen compares. Generate vector compares.
@@ -4284,8 +4310,8 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
auto *Cmp = cast<CmpInst>(&I);
setDebugLocFromInst(Builder, Cmp);
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = getOrCreateVectorValue(Cmp->getOperand(0), Part);
- Value *B = getOrCreateVectorValue(Cmp->getOperand(1), Part);
+ Value *A = State.get(User.getOperand(0), Part);
+ Value *B = State.get(User.getOperand(1), Part);
Value *C = nullptr;
if (FCmp) {
// Propagate fast math flags.
@@ -4319,78 +4345,80 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
/// Vectorize casts.
Type *DestTy =
- (VF == 1) ? CI->getType() : VectorType::get(CI->getType(), VF);
+ (VF == 1) ? CI->getType() : FixedVectorType::get(CI->getType(), VF);
for (unsigned Part = 0; Part < UF; ++Part) {
- Value *A = getOrCreateVectorValue(CI->getOperand(0), Part);
+ Value *A = State.get(User.getOperand(0), Part);
Value *Cast = Builder.CreateCast(CI->getOpcode(), A, DestTy);
VectorLoopValueMap.setVectorValue(&I, Part, Cast);
addMetadata(Cast, &I);
}
break;
}
+ default:
+ // This instruction is not vectorized by simple widening.
+ LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
+ llvm_unreachable("Unhandled instruction!");
+ } // end of switch.
+}
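
widenInstruction now fetches each operand's per-part vector value from the VPlan transform state and emits one wide operation per unroll part. A rough standalone model of that loop shape on plain arrays, with hypothetical UF/VF parameters:

    #include <array>
    #include <cstddef>

    // Model of "one wide op per unroll part": UF parts, each VF lanes wide.
    // The lane loop stands in for a single vector instruction per part.
    template <std::size_t UF, std::size_t VF>
    void widenBinaryOp(const std::array<std::array<int, VF>, UF> &A,
                       const std::array<std::array<int, VF>, UF> &B,
                       std::array<std::array<int, VF>, UF> &Out) {
      for (std::size_t Part = 0; Part < UF; ++Part)   // one wide value per part
        for (std::size_t Lane = 0; Lane < VF; ++Lane) // all lanes computed together
          Out[Part][Lane] = A[Part][Lane] + B[Part][Lane];
    }
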
- case Instruction::Call: {
- // Ignore dbg intrinsics.
- if (isa<DbgInfoIntrinsic>(I))
- break;
- setDebugLocFromInst(Builder, &I);
-
- Module *M = I.getParent()->getParent()->getParent();
- auto *CI = cast<CallInst>(&I);
+void InnerLoopVectorizer::widenCallInstruction(CallInst &I, VPUser &ArgOperands,
+ VPTransformState &State) {
+ assert(!isa<DbgInfoIntrinsic>(I) &&
+ "DbgInfoIntrinsic should have been dropped during VPlan construction");
+ setDebugLocFromInst(Builder, &I);
- StringRef FnName = CI->getCalledFunction()->getName();
- Function *F = CI->getCalledFunction();
- Type *RetTy = ToVectorTy(CI->getType(), VF);
- SmallVector<Type *, 4> Tys;
- for (Value *ArgOperand : CI->arg_operands())
- Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
+ Module *M = I.getParent()->getParent()->getParent();
+ auto *CI = cast<CallInst>(&I);
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ SmallVector<Type *, 4> Tys;
+ for (Value *ArgOperand : CI->arg_operands())
+ Tys.push_back(ToVectorTy(ArgOperand->getType(), VF));
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize;
- unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
- assert((UseVectorIntrinsic || !NeedToScalarize) &&
- "Instruction should be scalarized elsewhere.");
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- for (unsigned Part = 0; Part < UF; ++Part) {
- SmallVector<Value *, 4> Args;
- for (unsigned i = 0, ie = CI->getNumArgOperands(); i != ie; ++i) {
- Value *Arg = CI->getArgOperand(i);
- // Some intrinsics have a scalar argument - don't replace it with a
- // vector.
- if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, i))
- Arg = getOrCreateVectorValue(CI->getArgOperand(i), Part);
- Args.push_back(Arg);
- }
+ // The flag shows whether we use an intrinsic or a plain call for the
+ // vectorized version of the instruction.
+ // Is it beneficial to perform an intrinsic call compared to a library call?
+ bool NeedToScalarize = false;
+ unsigned CallCost = Cost->getVectorCallCost(CI, VF, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && Cost->getVectorIntrinsicCost(CI, VF) <= CallCost;
+ assert((UseVectorIntrinsic || !NeedToScalarize) &&
+ "Instruction should be scalarized elsewhere.");
- Function *VectorF;
- if (UseVectorIntrinsic) {
- // Use vector version of the intrinsic.
- Type *TysForDecl[] = {CI->getType()};
- if (VF > 1)
- TysForDecl[0] = VectorType::get(CI->getType()->getScalarType(), VF);
- VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
- } else {
- // Use vector version of the library call.
- StringRef VFnName = TLI->getVectorizedFunction(FnName, VF);
- assert(!VFnName.empty() && "Vector function name is empty.");
- VectorF = M->getFunction(VFnName);
- if (!VectorF) {
- // Generate a declaration
- FunctionType *FTy = FunctionType::get(RetTy, Tys, false);
- VectorF =
- Function::Create(FTy, Function::ExternalLinkage, VFnName, M);
- VectorF->copyAttributesFrom(F);
- }
- }
- assert(VectorF && "Can't create vector function.");
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ SmallVector<Value *, 4> Args;
+ for (auto &I : enumerate(ArgOperands.operands())) {
+ // Some intrinsics have a scalar argument - don't replace it with a
+ // vector.
+ Value *Arg;
+ if (!UseVectorIntrinsic || !hasVectorInstrinsicScalarOpd(ID, I.index()))
+ Arg = State.get(I.value(), Part);
+ else
+ Arg = State.get(I.value(), {0, 0});
+ Args.push_back(Arg);
+ }
+ Function *VectorF;
+ if (UseVectorIntrinsic) {
+ // Use vector version of the intrinsic.
+ Type *TysForDecl[] = {CI->getType()};
+ if (VF > 1)
+ TysForDecl[0] =
+ FixedVectorType::get(CI->getType()->getScalarType(), VF);
+ VectorF = Intrinsic::getDeclaration(M, ID, TysForDecl);
+ assert(VectorF && "Can't retrieve vector intrinsic.");
+ } else {
+ // Use vector version of the function call.
+ const VFShape Shape =
+ VFShape::get(*CI, {VF, false} /*EC*/, false /*HasGlobalPred*/);
+#ifndef NDEBUG
+ assert(VFDatabase(*CI).getVectorizedFunction(Shape) != nullptr &&
+ "Can't create vector function.");
+#endif
+ VectorF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ }
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
CallInst *V = Builder.CreateCall(VectorF, Args, OpBundles);
@@ -4400,16 +4428,31 @@ void InnerLoopVectorizer::widenInstruction(Instruction &I) {
VectorLoopValueMap.setVectorValue(&I, Part, V);
addMetadata(V, &I);
- }
-
- break;
}
+}
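
widenCallInstruction picks between a vector intrinsic and a vectorized library function purely on cost, mirroring the assert above. A tiny standalone sketch of that decision (names are illustrative, not the cost model's API):

    // Prefer the vector intrinsic when one exists and it is no more expensive
    // than the vectorized library call; scalarization is assumed to have been
    // rejected earlier, as the assert in the function states.
    bool useVectorIntrinsic(bool HasIntrinsic, unsigned IntrinsicCost,
                            unsigned LibCallCost) {
      return HasIntrinsic && IntrinsicCost <= LibCallCost;
    }
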
- default:
- // This instruction is not vectorized by simple widening.
- LLVM_DEBUG(dbgs() << "LV: Found an unhandled instruction: " << I);
- llvm_unreachable("Unhandled instruction!");
- } // end of switch.
+void InnerLoopVectorizer::widenSelectInstruction(SelectInst &I,
+ VPUser &Operands,
+ bool InvariantCond,
+ VPTransformState &State) {
+ setDebugLocFromInst(Builder, &I);
+
+ // The condition can be loop invariant but still defined inside the
+ // loop. This means that we can't just use the original 'cond' value.
+ // We have to take the 'vectorized' value and pick the first lane.
+ // Instcombine will make this a no-op.
+ auto *InvarCond =
+ InvariantCond ? State.get(Operands.getOperand(0), {0, 0}) : nullptr;
+
+ for (unsigned Part = 0; Part < UF; ++Part) {
+ Value *Cond =
+ InvarCond ? InvarCond : State.get(Operands.getOperand(0), Part);
+ Value *Op0 = State.get(Operands.getOperand(1), Part);
+ Value *Op1 = State.get(Operands.getOperand(2), Part);
+ Value *Sel = Builder.CreateSelect(Cond, Op0, Op1);
+ VectorLoopValueMap.setVectorValue(&I, Part, Sel);
+ addMetadata(Sel, &I);
+ }
}
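
widenSelectInstruction keeps a loop-invariant condition scalar, so one i1 steers every lane, while a varying condition becomes a per-lane mask. A standalone model on plain arrays, assuming a compile-time VF:

    #include <array>
    #include <cstddef>

    template <std::size_t VF>
    std::array<int, VF> widenSelectModel(bool InvariantCond, bool ScalarCond,
                                         const std::array<bool, VF> &LaneCond,
                                         const std::array<int, VF> &Op0,
                                         const std::array<int, VF> &Op1) {
      std::array<int, VF> Result{};
      for (std::size_t Lane = 0; Lane < VF; ++Lane) {
        // Invariant condition: a single scalar bool steers every lane.
        // Varying condition: each lane uses its own mask bit.
        bool C = InvariantCond ? ScalarCond : LaneCond[Lane];
        Result[Lane] = C ? Op0[Lane] : Op1[Lane];
      }
      return Result;
    }
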
void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
@@ -4502,7 +4545,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
}
}
for (auto *I : ScalarPtrs)
- if (PossibleNonScalarPtrs.find(I) == PossibleNonScalarPtrs.end()) {
+ if (!PossibleNonScalarPtrs.count(I)) {
LLVM_DEBUG(dbgs() << "LV: Found scalar instruction: " << *I << "\n");
Worklist.insert(I);
}
@@ -4513,7 +4556,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// TODO: Once we are able to vectorize pointer induction variables we should
// no longer insert them into the worklist here.
auto *Latch = TheLoop->getLoopLatch();
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (Induction.second.getKind() != InductionDescriptor::IK_PtrInduction)
@@ -4556,7 +4599,7 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
// An induction variable will remain scalar if all users of the induction
// variable and induction variable update remain scalar.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4568,6 +4611,11 @@ void LoopVectorizationCostModel::collectLoopScalars(unsigned VF) {
if (Induction.second.getKind() == InductionDescriptor::IK_PtrInduction)
continue;
+ // If tail-folding is applied, the primary induction variable will be used
+ // to feed a vector compare.
+ if (Ind == Legal->getPrimaryInduction() && foldTailByMasking())
+ continue;
+
// Determine if all users of the induction variable are scalar after
// vectorization.
auto ScalarInd = llvm::all_of(Ind->users(), [&](User *U) -> bool {
@@ -4618,7 +4666,7 @@ bool LoopVectorizationCostModel::isScalarWithPredication(Instruction *I, unsigne
"Widening decision should be ready at this moment");
return WideningDecision == CM_Scalarize;
}
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? !(isLegalMaskedLoad(Ty, Ptr, Alignment) ||
isLegalMaskedGather(Ty, Alignment))
: !(isLegalMaskedStore(Ty, Ptr, Alignment) ||
@@ -4665,7 +4713,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened(Instruction *I,
"Masked interleave-groups for predicated accesses are not enabled.");
auto *Ty = getMemInstValueType(I);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
return isa<LoadInst>(I) ? TTI.isLegalMaskedLoad(Ty, Alignment)
: TTI.isLegalMaskedStore(Ty, Alignment);
}
@@ -4803,7 +4851,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// Add to the Worklist all consecutive and consecutive-like pointers that
// aren't also identified as possibly non-uniform.
for (auto *V : ConsecutiveLikePtrs)
- if (PossibleNonUniformPtrs.find(V) == PossibleNonUniformPtrs.end())
+ if (!PossibleNonUniformPtrs.count(V))
addToWorklistIfAllowed(V);
// Expand Worklist in topological order: whenever a new instruction
@@ -4847,7 +4895,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(unsigned VF) {
// nodes separately. An induction variable will remain uniform if all users
// of the induction variable and induction variable update remain uniform.
// The code below handles both pointer and non-pointer induction variables.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
auto *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
@@ -4903,10 +4951,9 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
// FIXME: Avoid specializing for stride==1 instead of bailing out.
if (!Legal->getLAI()->getSymbolicStrides().empty()) {
- reportVectorizationFailure("Runtime stride check is required with -Os/-Oz",
+ reportVectorizationFailure("Runtime stride check for small trip count",
"runtime stride == 1 checks needed. Enable vectorization of "
- "this loop with '#pragma clang loop vectorize(enable)' when "
- "compiling with -Os/-Oz",
+ "this loop without such check by compiling with -Os/-Oz",
"CantVersionLoopWithOptForSize", ORE, TheLoop);
return true;
}
@@ -4914,7 +4961,8 @@ bool LoopVectorizationCostModel::runtimeChecksRequired() {
return false;
}
-Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
+Optional<unsigned> LoopVectorizationCostModel::computeMaxVF(unsigned UserVF,
+ unsigned UserIC) {
if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) {
// TODO: It may by useful to do since it's still likely to be dynamically
// uniform if the target can skip.
@@ -4936,7 +4984,7 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
switch (ScalarEpilogueStatus) {
case CM_ScalarEpilogueAllowed:
- return computeFeasibleMaxVF(TC);
+ return UserVF ? UserVF : computeFeasibleMaxVF(TC);
case CM_ScalarEpilogueNotNeededUsePredicate:
LLVM_DEBUG(
dbgs() << "LV: vector predicate hint/switch found.\n"
@@ -4964,11 +5012,18 @@ Optional<unsigned> LoopVectorizationCostModel::computeMaxVF() {
// Invalidate interleave groups that require an epilogue if we can't mask
// the interleave-group.
- if (!useMaskedInterleavedAccesses(TTI))
+ if (!useMaskedInterleavedAccesses(TTI)) {
+ assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() &&
+ "No decisions should have been taken at this point");
+ // Note: There is no need to invalidate any cost modeling decisions here, as
+ // none were taken so far.
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue();
+ }
- unsigned MaxVF = computeFeasibleMaxVF(TC);
- if (TC > 0 && TC % MaxVF == 0) {
+ unsigned MaxVF = UserVF ? UserVF : computeFeasibleMaxVF(TC);
+ assert((UserVF || isPowerOf2_32(MaxVF)) && "MaxVF must be a power of 2");
+ unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
+ if (TC > 0 && TC % MaxVFtimesIC == 0) {
// Accept MaxVF if we do not have a tail.
LLVM_DEBUG(dbgs() << "LV: No tail will remain for any chosen VF.\n");
return MaxVF;
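
The MaxVFtimesIC check above accepts the widest VF without any tail handling only when the known trip count is an exact multiple of VF times the requested interleave count. A minimal standalone check of that condition (illustrative names):

    // Returns true when no scalar tail remains after vectorizing with the
    // given VF and user-requested interleave count.
    bool noTailRemains(unsigned TripCount, unsigned MaxVF, unsigned UserIC) {
      unsigned MaxVFtimesIC = UserIC ? MaxVF * UserIC : MaxVF;
      return TripCount > 0 && TripCount % MaxVFtimesIC == 0;
    }
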
@@ -5015,7 +5070,9 @@ LoopVectorizationCostModel::computeFeasibleMaxVF(unsigned ConstTripCount) {
WidestRegister = std::min(WidestRegister, MaxSafeRegisterWidth);
- unsigned MaxVectorSize = WidestRegister / WidestType;
+ // Ensure MaxVF is a power of 2; the dependence distance bound may not be.
+ // Note that both WidestRegister and WidestType may not be powers of 2.
+ unsigned MaxVectorSize = PowerOf2Floor(WidestRegister / WidestType);
LLVM_DEBUG(dbgs() << "LV: The Smallest and Widest types: " << SmallestType
<< " / " << WidestType << " bits.\n");
@@ -5140,7 +5197,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
Type *T = I.getType();
// Skip ignored values.
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end())
+ if (ValuesToIgnore.count(&I))
continue;
// Only examine Loads, Stores and PHINodes.
@@ -5152,7 +5209,7 @@ LoopVectorizationCostModel::getSmallestAndWidestTypes() {
if (auto *PN = dyn_cast<PHINode>(&I)) {
if (!Legal->isReductionVariable(PN))
continue;
- RecurrenceDescriptor RdxDesc = (*Legal->getReductionVars())[PN];
+ RecurrenceDescriptor RdxDesc = Legal->getReductionVars()[PN];
T = RdxDesc.getRecurrenceType();
}
@@ -5294,7 +5351,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Interleave if we vectorized this loop and there is a reduction that could
// benefit from interleaving.
- if (VF > 1 && !Legal->getReductionVars()->empty()) {
+ if (VF > 1 && !Legal->getReductionVars().empty()) {
LLVM_DEBUG(dbgs() << "LV: Interleaving because of reductions.\n");
return IC;
}
@@ -5325,7 +5382,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// by this point), we can increase the critical path length if the loop
// we're interleaving is inside another loop. Limit, by default to 2, so the
// critical path only gets increased by one reduction operation.
- if (!Legal->getReductionVars()->empty() && TheLoop->getLoopDepth() > 1) {
+ if (!Legal->getReductionVars().empty() && TheLoop->getLoopDepth() > 1) {
unsigned F = static_cast<unsigned>(MaxNestedScalarReductionIC);
SmallIC = std::min(SmallIC, F);
StoresIC = std::min(StoresIC, F);
@@ -5345,7 +5402,7 @@ unsigned LoopVectorizationCostModel::selectInterleaveCount(unsigned VF,
// Interleave if this is a large loop (small loops are already dealt with by
// this point) that could benefit from interleaving.
- bool HasReductions = !Legal->getReductionVars()->empty();
+ bool HasReductions = !Legal->getReductionVars().empty();
if (TTI.enableAggressiveInterleaving(HasReductions)) {
LLVM_DEBUG(dbgs() << "LV: Interleaving to expose ILP.\n");
return IC;
@@ -5459,11 +5516,11 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
OpenIntervals.erase(ToRemove);
// Ignore instructions that are never used within the loop.
- if (Ends.find(I) == Ends.end())
+ if (!Ends.count(I))
continue;
// Skip ignored values.
- if (ValuesToIgnore.find(I) != ValuesToIgnore.end())
+ if (ValuesToIgnore.count(I))
continue;
// For each VF find the maximum usage of registers.
@@ -5483,7 +5540,7 @@ LoopVectorizationCostModel::calculateRegisterUsage(ArrayRef<unsigned> VFs) {
collectUniformsAndScalars(VFs[j]);
for (auto Inst : OpenIntervals) {
// Skip ignored values for VF > 1.
- if (VecValuesToIgnore.find(Inst) != VecValuesToIgnore.end())
+ if (VecValuesToIgnore.count(Inst))
continue;
if (isScalarAfterVectorization(Inst, VFs[j])) {
unsigned ClassID = TTI.getRegisterClassForType(false, Inst->getType());
@@ -5676,9 +5733,11 @@ int LoopVectorizationCostModel::computePredInstDiscount(
// Compute the scalarization overhead of needed insertelement instructions
// and phi nodes.
if (isScalarWithPredication(I) && !I->getType()->isVoidTy()) {
- ScalarCost += TTI.getScalarizationOverhead(ToVectorTy(I->getType(), VF),
- true, false);
- ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ ScalarCost += TTI.getScalarizationOverhead(
+ cast<VectorType>(ToVectorTy(I->getType(), VF)),
+ APInt::getAllOnesValue(VF), true, false);
+ ScalarCost += VF * TTI.getCFInstrCost(Instruction::PHI,
+ TTI::TCK_RecipThroughput);
}
// Compute the scalarization overhead of needed extractelement
@@ -5693,7 +5752,8 @@ int LoopVectorizationCostModel::computePredInstDiscount(
Worklist.push_back(J);
else if (needsExtract(J, VF))
ScalarCost += TTI.getScalarizationOverhead(
- ToVectorTy(J->getType(),VF), false, true);
+ cast<VectorType>(ToVectorTy(J->getType(), VF)),
+ APInt::getAllOnesValue(VF), false, true);
}
// Scale the total scalar cost by block probability.
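
The scalar-path cost accumulated here is scaled by the predicated block's execution probability before it is compared against the vector cost, since the scalarized chain only runs when its guard is taken. A minimal arithmetic sketch, assuming a reciprocal block probability of 2 (an assumption for illustration, not taken from this patch):

    // Vector cost is paid unconditionally; scalar cost only with probability
    // 1 / ReciprocalPredBlockProb. A positive discount means scalarizing the
    // predicated chain is the cheaper choice.
    int predInstDiscount(unsigned VectorCost, unsigned ScalarCost,
                         unsigned ReciprocalPredBlockProb = 2) {
      unsigned ScaledScalarCost = ScalarCost / ReciprocalPredBlockProb;
      return static_cast<int>(VectorCost) - static_cast<int>(ScaledScalarCost);
    }
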
@@ -5719,8 +5779,7 @@ LoopVectorizationCostModel::expectedCost(unsigned VF) {
// For each instruction in the old loop.
for (Instruction &I : BB->instructionsWithoutDebug()) {
// Skip ignored values.
- if (ValuesToIgnore.find(&I) != ValuesToIgnore.end() ||
- (VF > 1 && VecValuesToIgnore.find(&I) != VecValuesToIgnore.end()))
+ if (ValuesToIgnore.count(&I) || (VF > 1 && VecValuesToIgnore.count(&I)))
continue;
VectorizationCostTy C = getInstructionCost(&I, VF);
@@ -5806,9 +5865,10 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
// Don't pass *I here, since it is scalar but will actually be part of a
// vectorized loop where the user of it is a vectorized instruction.
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
Cost += VF * TTI.getMemoryOpCost(I->getOpcode(), ValTy->getScalarType(),
- Alignment, AS);
+ Alignment, AS,
+ TTI::TCK_RecipThroughput);
// Get the overhead of the extractelement and insertelement instructions
// we might create due to scalarization.
@@ -5832,20 +5892,22 @@ unsigned LoopVectorizationCostModel::getMemInstScalarizationCost(Instruction *I,
unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
Value *Ptr = getLoadStorePointerOperand(I);
unsigned AS = getLoadStoreAddressSpace(I);
int ConsecutiveStride = Legal->isConsecutivePtr(Ptr);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
assert((ConsecutiveStride == 1 || ConsecutiveStride == -1) &&
"Stride should be 1 or -1 for consecutive memory access");
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned Cost = 0;
if (Legal->isMaskRequired(I))
- Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy,
- Alignment ? Alignment->value() : 0, AS);
+ Cost += TTI.getMaskedMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind);
else
- Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS, I);
+ Cost += TTI.getMemoryOpCost(I->getOpcode(), VectorTy, Alignment, AS,
+ CostKind, I);
bool Reverse = ConsecutiveStride < 0;
if (Reverse)
@@ -5856,19 +5918,22 @@ unsigned LoopVectorizationCostModel::getConsecutiveMemOpCost(Instruction *I,
unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
+ enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
if (isa<LoadInst>(I)) {
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS) +
+ TTI.getMemoryOpCost(Instruction::Load, ValTy, Alignment, AS,
+ CostKind) +
TTI.getShuffleCost(TargetTransformInfo::SK_Broadcast, VectorTy);
}
StoreInst *SI = cast<StoreInst>(I);
bool isLoopInvariantStoreValue = Legal->isUniform(SI->getValueOperand());
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS) +
+ TTI.getMemoryOpCost(Instruction::Store, ValTy, Alignment, AS,
+ CostKind) +
(isLoopInvariantStoreValue
? 0
: TTI.getVectorInstrCost(Instruction::ExtractElement, VectorTy,
@@ -5878,27 +5943,27 @@ unsigned LoopVectorizationCostModel::getUniformMemOpCost(Instruction *I,
unsigned LoopVectorizationCostModel::getGatherScatterCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
- Value *Ptr = getLoadStorePointerOperand(I);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
+ const Align Alignment = getLoadStoreAlignment(I);
+ const Value *Ptr = getLoadStorePointerOperand(I);
return TTI.getAddressComputationCost(VectorTy) +
- TTI.getGatherScatterOpCost(I->getOpcode(), VectorTy, Ptr,
- Legal->isMaskRequired(I),
- Alignment ? Alignment->value() : 0);
+ TTI.getGatherScatterOpCost(
+ I->getOpcode(), VectorTy, Ptr, Legal->isMaskRequired(I), Alignment,
+ TargetTransformInfo::TCK_RecipThroughput, I);
}
unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
unsigned VF) {
Type *ValTy = getMemInstValueType(I);
- Type *VectorTy = ToVectorTy(ValTy, VF);
+ auto *VectorTy = cast<VectorType>(ToVectorTy(ValTy, VF));
unsigned AS = getLoadStoreAddressSpace(I);
auto Group = getInterleavedAccessGroup(I);
assert(Group && "Fail to get an interleaved access group.");
unsigned InterleaveFactor = Group->getFactor();
- Type *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor);
+ auto *WideVecTy = FixedVectorType::get(ValTy, VF * InterleaveFactor);
// Holds the indices of existing members in an interleaved load group.
// An interleaved store group doesn't need this as it doesn't allow gaps.
@@ -5913,8 +5978,8 @@ unsigned LoopVectorizationCostModel::getInterleaveGroupCost(Instruction *I,
bool UseMaskForGaps =
Group->requiresScalarEpilogue() && !isScalarEpilogueAllowed();
unsigned Cost = TTI.getInterleavedMemoryOpCost(
- I->getOpcode(), WideVecTy, Group->getFactor(), Indices,
- Group->getAlignment(), AS, Legal->isMaskRequired(I), UseMaskForGaps);
+ I->getOpcode(), WideVecTy, Group->getFactor(), Indices, Group->getAlign(),
+ AS, TTI::TCK_RecipThroughput, Legal->isMaskRequired(I), UseMaskForGaps);
if (Group->isReverse()) {
// TODO: Add support for reversed masked interleaved access.
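
An interleave group of factor F is costed as a single wide access of VF * F elements plus the shuffles that extract each member's lanes at stride F. A standalone sketch of that lane extraction (hypothetical helper, not the pass's API):

    #include <vector>

    // Extract member `MemberIdx` of an interleave group with factor `Factor`
    // from one wide vector of VF * Factor elements: lanes MemberIdx,
    // MemberIdx + Factor, MemberIdx + 2*Factor, ...
    std::vector<int> extractInterleavedMember(const std::vector<int> &WideVec,
                                              unsigned Factor,
                                              unsigned MemberIdx, unsigned VF) {
      std::vector<int> Member;
      Member.reserve(VF);
      for (unsigned Lane = 0; Lane < VF; ++Lane)
        Member.push_back(WideVec[Lane * Factor + MemberIdx]);
      return Member;
    }
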
@@ -5932,11 +5997,12 @@ unsigned LoopVectorizationCostModel::getMemoryInstructionCost(Instruction *I,
// moment.
if (VF == 1) {
Type *ValTy = getMemInstValueType(I);
- const MaybeAlign Alignment = getLoadStoreAlignment(I);
+ const Align Alignment = getLoadStoreAlignment(I);
unsigned AS = getLoadStoreAddressSpace(I);
return TTI.getAddressComputationCost(ValTy) +
- TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS, I);
+ TTI.getMemoryOpCost(I->getOpcode(), ValTy, Alignment, AS,
+ TTI::TCK_RecipThroughput, I);
}
return getWideningCost(I, VF);
}
@@ -5955,7 +6021,7 @@ LoopVectorizationCostModel::getInstructionCost(Instruction *I, unsigned VF) {
auto ForcedScalar = ForcedScalars.find(VF);
if (VF > 1 && ForcedScalar != ForcedScalars.end()) {
auto InstSet = ForcedScalar->second;
- if (InstSet.find(I) != InstSet.end())
+ if (InstSet.count(I))
return VectorizationCostTy((getInstructionCost(I, 1).first * VF), false);
}
@@ -5977,7 +6043,8 @@ unsigned LoopVectorizationCostModel::getScalarizationOverhead(Instruction *I,
Type *RetTy = ToVectorTy(I->getType(), VF);
if (!RetTy->isVoidTy() &&
(!isa<LoadInst>(I) || !TTI.supportsEfficientVectorElementLoadStore()))
- Cost += TTI.getScalarizationOverhead(RetTy, true, false);
+ Cost += TTI.getScalarizationOverhead(
+ cast<VectorType>(RetTy), APInt::getAllOnesValue(VF), true, false);
// Some targets keep addresses scalar.
if (isa<LoadInst>(I) && !TTI.prefersVectorizedAddressing())
@@ -6157,6 +6224,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
RetTy = IntegerType::get(RetTy->getContext(), MinBWs[I]);
VectorTy = isScalarAfterVectorization(I, VF) ? RetTy : ToVectorTy(RetTy, VF);
auto SE = PSE.getSE();
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// TODO: We need to estimate the cost of intrinsic calls.
switch (I->getOpcode()) {
@@ -6173,21 +6241,20 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarPredicatedBB = false;
BranchInst *BI = cast<BranchInst>(I);
if (VF > 1 && BI->isConditional() &&
- (PredicatedBBsAfterVectorization.find(BI->getSuccessor(0)) !=
- PredicatedBBsAfterVectorization.end() ||
- PredicatedBBsAfterVectorization.find(BI->getSuccessor(1)) !=
- PredicatedBBsAfterVectorization.end()))
+ (PredicatedBBsAfterVectorization.count(BI->getSuccessor(0)) ||
+ PredicatedBBsAfterVectorization.count(BI->getSuccessor(1))))
ScalarPredicatedBB = true;
if (ScalarPredicatedBB) {
// Return cost for branches around scalarized and predicated blocks.
- Type *Vec_i1Ty =
- VectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
- return (TTI.getScalarizationOverhead(Vec_i1Ty, false, true) +
- (TTI.getCFInstrCost(Instruction::Br) * VF));
+ auto *Vec_i1Ty =
+ FixedVectorType::get(IntegerType::getInt1Ty(RetTy->getContext()), VF);
+ return (TTI.getScalarizationOverhead(Vec_i1Ty, APInt::getAllOnesValue(VF),
+ false, true) +
+ (TTI.getCFInstrCost(Instruction::Br, CostKind) * VF));
} else if (I->getParent() == TheLoop->getLoopLatch() || VF == 1)
// The back-edge branch will remain, as will all scalar branches.
- return TTI.getCFInstrCost(Instruction::Br);
+ return TTI.getCFInstrCost(Instruction::Br, CostKind);
else
// This branch will be eliminated by if-conversion.
return 0;
@@ -6202,7 +6269,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// NOTE: Don't use ToVectorTy as SK_ExtractSubvector expects a vector type.
if (VF > 1 && Legal->isFirstOrderRecurrence(Phi))
return TTI.getShuffleCost(TargetTransformInfo::SK_ExtractSubvector,
- VectorTy, VF - 1, VectorType::get(RetTy, 1));
+ cast<VectorType>(VectorTy), VF - 1,
+ FixedVectorType::get(RetTy, 1));
// Phi nodes in non-header blocks (not inductions, reductions, etc.) are
// converted into select instructions. We require N - 1 selects per phi
@@ -6211,9 +6279,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
return (Phi->getNumIncomingValues() - 1) *
TTI.getCmpSelInstrCost(
Instruction::Select, ToVectorTy(Phi->getType(), VF),
- ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF));
+ ToVectorTy(Type::getInt1Ty(Phi->getContext()), VF),
+ CostKind);
- return TTI.getCFInstrCost(Instruction::PHI);
+ return TTI.getCFInstrCost(Instruction::PHI, CostKind);
}
case Instruction::UDiv:
case Instruction::SDiv:
@@ -6230,10 +6299,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
// that we will create. This cost is likely to be zero. The phi node
// cost, if any, should be scaled by the block probability because it
// models a copy at the end of each predicated block.
- Cost += VF * TTI.getCFInstrCost(Instruction::PHI);
+ Cost += VF * TTI.getCFInstrCost(Instruction::PHI, CostKind);
// The cost of the non-predicated instruction.
- Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy);
+ Cost += VF * TTI.getArithmeticInstrCost(I->getOpcode(), RetTy, CostKind);
// The cost of insertelement and extractelement instructions needed for
// scalarization.
@@ -6274,13 +6343,15 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
SmallVector<const Value *, 4> Operands(I->operand_values());
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
Op2VK, TargetTransformInfo::OP_None, Op2VP, Operands, I);
}
case Instruction::FNeg: {
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
return N * TTI.getArithmeticInstrCost(
- I->getOpcode(), VectorTy, TargetTransformInfo::OK_AnyValue,
+ I->getOpcode(), VectorTy, CostKind,
+ TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OK_AnyValue,
TargetTransformInfo::OP_None, TargetTransformInfo::OP_None,
I->getOperand(0), I);
@@ -6291,9 +6362,10 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
bool ScalarCond = (SE->isLoopInvariant(CondSCEV, TheLoop));
Type *CondTy = SI->getCondition()->getType();
if (!ScalarCond)
- CondTy = VectorType::get(CondTy, VF);
+ CondTy = FixedVectorType::get(CondTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy, I);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, CondTy,
+ CostKind, I);
}
case Instruction::ICmp:
case Instruction::FCmp: {
@@ -6302,7 +6374,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (canTruncateToMinimalBitwidth(Op0AsInstruction, VF))
ValTy = IntegerType::get(ValTy->getContext(), MinBWs[Op0AsInstruction]);
VectorTy = ToVectorTy(ValTy, VF);
- return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, I);
+ return TTI.getCmpSelInstrCost(I->getOpcode(), VectorTy, nullptr, CostKind,
+ I);
}
case Instruction::Store:
case Instruction::Load: {
@@ -6335,7 +6408,7 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
if (isOptimizableIVTruncate(I, VF)) {
auto *Trunc = cast<TruncInst>(I);
return TTI.getCastInstrCost(Instruction::Trunc, Trunc->getDestTy(),
- Trunc->getSrcTy(), Trunc);
+ Trunc->getSrcTy(), CostKind, Trunc);
}
Type *SrcScalarTy = I->getOperand(0)->getType();
@@ -6361,7 +6434,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
}
unsigned N = isScalarAfterVectorization(I, VF) ? VF : 1;
- return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy, I);
+ return N * TTI.getCastInstrCost(I->getOpcode(), VectorTy, SrcVecTy,
+ CostKind, I);
}
case Instruction::Call: {
bool NeedToScalarize;
@@ -6374,7 +6448,8 @@ unsigned LoopVectorizationCostModel::getInstructionCost(Instruction *I,
default:
// The cost of executing VF copies of the scalar instruction. This opcode
// is unknown. Assume that it is the same as 'mul'.
- return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy) +
+ return VF * TTI.getArithmeticInstrCost(Instruction::Mul, VectorTy,
+ CostKind) +
getScalarizationOverhead(I, VF);
} // end of switch.
}
@@ -6397,6 +6472,7 @@ INITIALIZE_PASS_DEPENDENCY(LoopAccessLegacyAnalysis)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
INITIALIZE_PASS_DEPENDENCY(ProfileSummaryInfoWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(LoopVectorize, LV_NAME, lv_name, false, false)
namespace llvm {
@@ -6424,14 +6500,14 @@ void LoopVectorizationCostModel::collectValuesToIgnore() {
// Ignore type-promoting instructions we identified during reduction
// detection.
- for (auto &Reduction : *Legal->getReductionVars()) {
+ for (auto &Reduction : Legal->getReductionVars()) {
RecurrenceDescriptor &RedDes = Reduction.second;
SmallPtrSetImpl<Instruction *> &Casts = RedDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
}
// Ignore type-casting instructions we identified during induction
// detection.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
InductionDescriptor &IndDes = Induction.second;
const SmallVectorImpl<Instruction *> &Casts = IndDes.getCastInsts();
VecValuesToIgnore.insert(Casts.begin(), Casts.end());
@@ -6490,9 +6566,10 @@ LoopVectorizationPlanner::planInVPlanNativePath(unsigned UserVF) {
return VectorizationFactor::Disabled();
}
-Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
+Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF,
+ unsigned UserIC) {
assert(OrigLoop->empty() && "Inner loop expected.");
- Optional<unsigned> MaybeMaxVF = CM.computeMaxVF();
+ Optional<unsigned> MaybeMaxVF = CM.computeMaxVF(UserVF, UserIC);
if (!MaybeMaxVF) // Cases that should not to be vectorized nor interleaved.
return None;
@@ -6503,7 +6580,11 @@ Optional<VectorizationFactor> LoopVectorizationPlanner::plan(unsigned UserVF) {
dbgs()
<< "LV: Invalidate all interleaved groups due to fold-tail by masking "
"which requires masked-interleaved support.\n");
- CM.InterleaveInfo.reset();
+ if (CM.InterleaveInfo.invalidateGroups())
+ // Invalidating interleave groups also requires invalidating all decisions
+ // based on them, which includes widening decisions and uniform and scalar
+ // values.
+ CM.invalidateCostModelingDecisions();
}
if (UserVF) {
@@ -6563,6 +6644,7 @@ void LoopVectorizationPlanner::executePlan(InnerLoopVectorizer &ILV,
&ILV, CallbackILV};
State.CFG.PrevBB = ILV.createVectorizedLoopSkeleton();
State.TripCount = ILV.getOrCreateTripCount(nullptr);
+ State.CanonicalIV = ILV.Induction;
//===------------------------------------------------===//
//
@@ -6595,12 +6677,11 @@ void LoopVectorizationPlanner::collectTriviallyDeadInstructions(
// We create new "steps" for induction variable updates to which the original
// induction variables map. An original update instruction will be dead if
// all its users except the induction variable are dead.
- for (auto &Induction : *Legal->getInductionVars()) {
+ for (auto &Induction : Legal->getInductionVars()) {
PHINode *Ind = Induction.first;
auto *IndUpdate = cast<Instruction>(Ind->getIncomingValueForBlock(Latch));
if (llvm::all_of(IndUpdate->users(), [&](User *U) -> bool {
- return U == Ind || DeadInstructions.find(cast<Instruction>(U)) !=
- DeadInstructions.end();
+ return U == Ind || DeadInstructions.count(cast<Instruction>(U));
}))
DeadInstructions.insert(IndUpdate);
@@ -6716,7 +6797,7 @@ VPValue *VPRecipeBuilder::createEdgeMask(BasicBlock *Src, BasicBlock *Dst,
BranchInst *BI = dyn_cast<BranchInst>(Src->getTerminator());
assert(BI && "Unexpected terminator found");
- if (!BI->isConditional())
+ if (!BI->isConditional() || BI->getSuccessor(0) == BI->getSuccessor(1))
return EdgeMaskCache[Edge] = SrcMask;
VPValue *EdgeMask = Plan->getVPValue(BI->getCondition());
@@ -6749,9 +6830,21 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
// Introduce the early-exit compare IV <= BTC to form header block mask.
// This is used instead of IV < TC because TC may wrap, unlike BTC.
- VPValue *IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ // Start by constructing the desired canonical IV.
+ VPValue *IV = nullptr;
+ if (Legal->getPrimaryInduction())
+ IV = Plan->getVPValue(Legal->getPrimaryInduction());
+ else {
+ auto IVRecipe = new VPWidenCanonicalIVRecipe();
+ Builder.getInsertBlock()->appendRecipe(IVRecipe);
+ IV = IVRecipe->getVPValue();
+ }
VPValue *BTC = Plan->getOrCreateBackedgeTakenCount();
- BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
+ bool TailFolded = !CM.isScalarEpilogueAllowed();
+ if (TailFolded && CM.TTI.emitGetActiveLaneMask())
+ BlockMask = Builder.createNaryOp(VPInstruction::ActiveLaneMask, {IV, BTC});
+ else
+ BlockMask = Builder.createNaryOp(VPInstruction::ICmpULE, {IV, BTC});
return BlockMaskCache[BB] = BlockMask;
}
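
Under tail folding, the header mask compares the (possibly widened canonical) IV against the backedge-taken count lane by lane; IV <= BTC is used rather than IV < TC because TC may wrap where BTC does not, and the new ActiveLaneMask path simply asks the target to produce the same predicate directly. A standalone model of the per-lane predicate:

    #include <array>
    #include <cstddef>
    #include <cstdint>

    // Lane i of the header mask: active iff (IV + i) <= BackedgeTakenCount.
    template <std::size_t VF>
    std::array<bool, VF> headerBlockMask(uint64_t IV,
                                         uint64_t BackedgeTakenCount) {
      std::array<bool, VF> Mask{};
      for (std::size_t Lane = 0; Lane < VF; ++Lane)
        Mask[Lane] = (IV + Lane) <= BackedgeTakenCount;
      return Mask;
    }
    // e.g. with trip count 10 (BTC 9) and VF 4, the final iteration IV = 8
    // yields mask {1, 1, 0, 0}, disabling the two out-of-range lanes.
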
@@ -6775,8 +6868,8 @@ VPValue *VPRecipeBuilder::createBlockInMask(BasicBlock *BB, VPlanPtr &Plan) {
VPWidenMemoryInstructionRecipe *
VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
VPlanPtr &Plan) {
- if (!isa<LoadInst>(I) && !isa<StoreInst>(I))
- return nullptr;
+ assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+ "Must be called with either a load or store");
auto willWiden = [&](unsigned VF) -> bool {
if (VF == 1)
@@ -6801,22 +6894,29 @@ VPRecipeBuilder::tryToWidenMemory(Instruction *I, VFRange &Range,
Mask = createBlockInMask(I->getParent(), Plan);
VPValue *Addr = Plan->getOrAddVPValue(getLoadStorePointerOperand(I));
- return new VPWidenMemoryInstructionRecipe(*I, Addr, Mask);
+ if (LoadInst *Load = dyn_cast<LoadInst>(I))
+ return new VPWidenMemoryInstructionRecipe(*Load, Addr, Mask);
+
+ StoreInst *Store = cast<StoreInst>(I);
+ VPValue *StoredValue = Plan->getOrAddVPValue(Store->getValueOperand());
+ return new VPWidenMemoryInstructionRecipe(*Store, Addr, StoredValue, Mask);
}
VPWidenIntOrFpInductionRecipe *
-VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
- if (PHINode *Phi = dyn_cast<PHINode>(I)) {
- // Check if this is an integer or fp induction. If so, build the recipe that
- // produces its scalar and vector values.
- InductionDescriptor II = Legal->getInductionVars()->lookup(Phi);
- if (II.getKind() == InductionDescriptor::IK_IntInduction ||
- II.getKind() == InductionDescriptor::IK_FpInduction)
- return new VPWidenIntOrFpInductionRecipe(Phi);
+VPRecipeBuilder::tryToOptimizeInductionPHI(PHINode *Phi) const {
+ // Check if this is an integer or fp induction. If so, build the recipe that
+ // produces its scalar and vector values.
+ InductionDescriptor II = Legal->getInductionVars().lookup(Phi);
+ if (II.getKind() == InductionDescriptor::IK_IntInduction ||
+ II.getKind() == InductionDescriptor::IK_FpInduction)
+ return new VPWidenIntOrFpInductionRecipe(Phi);
- return nullptr;
- }
+ return nullptr;
+}
+VPWidenIntOrFpInductionRecipe *
+VPRecipeBuilder::tryToOptimizeInductionTruncate(TruncInst *I,
+ VFRange &Range) const {
// Optimize the special case where the source is a constant integer
// induction variable. Notice that we can only optimize the 'trunc' case
// because (a) FP conversions lose precision, (b) sext/zext may wrap, and
@@ -6830,54 +6930,89 @@ VPRecipeBuilder::tryToOptimizeInduction(Instruction *I, VFRange &Range) {
[=](unsigned VF) -> bool { return CM.isOptimizableIVTruncate(K, VF); };
};
- if (isa<TruncInst>(I) && LoopVectorizationPlanner::getDecisionAndClampRange(
- isOptimizableIVTruncate(I), Range))
+ if (LoopVectorizationPlanner::getDecisionAndClampRange(
+ isOptimizableIVTruncate(I), Range))
return new VPWidenIntOrFpInductionRecipe(cast<PHINode>(I->getOperand(0)),
- cast<TruncInst>(I));
+ I);
return nullptr;
}
-VPBlendRecipe *VPRecipeBuilder::tryToBlend(Instruction *I, VPlanPtr &Plan) {
- PHINode *Phi = dyn_cast<PHINode>(I);
- if (!Phi || Phi->getParent() == OrigLoop->getHeader())
- return nullptr;
-
+VPBlendRecipe *VPRecipeBuilder::tryToBlend(PHINode *Phi, VPlanPtr &Plan) {
// We know that all PHIs in non-header blocks are converted into selects, so
// we don't have to worry about the insertion order and we can just use the
// builder. At this point we generate the predication tree. There may be
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- SmallVector<VPValue *, 2> Masks;
+ SmallVector<VPValue *, 2> Operands;
unsigned NumIncoming = Phi->getNumIncomingValues();
for (unsigned In = 0; In < NumIncoming; In++) {
VPValue *EdgeMask =
createEdgeMask(Phi->getIncomingBlock(In), Phi->getParent(), Plan);
assert((EdgeMask || NumIncoming == 1) &&
"Multiple predecessors with one having a full mask");
+ Operands.push_back(Plan->getOrAddVPValue(Phi->getIncomingValue(In)));
if (EdgeMask)
- Masks.push_back(EdgeMask);
+ Operands.push_back(EdgeMask);
}
- return new VPBlendRecipe(Phi, Masks);
+ return new VPBlendRecipe(Phi, Operands);
}
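
tryToBlend now records each incoming value followed by its edge mask as one operand list; at execution time the phi becomes a chain of selects in which later incomings override earlier ones under their masks, and the first incoming needs no mask. A standalone scalar model of that chain (illustrative names):

    #include <cstddef>
    #include <vector>

    // Lower a non-header phi with N incomings to selects:
    //   Result = In0; Result = Mask1 ? In1 : Result; ...
    // The first incoming is the identity of the chain, so its mask is unused.
    int blendPhiModel(const std::vector<int> &Incomings,
                      const std::vector<bool> &EdgeMasks) {
      int Result = Incomings[0];
      for (std::size_t In = 1; In < Incomings.size(); ++In)
        Result = EdgeMasks[In] ? Incomings[In] : Result;
      return Result;
    }
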
-bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
- VFRange &Range) {
+VPWidenCallRecipe *VPRecipeBuilder::tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const {
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
- [&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
+ [this, CI](unsigned VF) { return CM.isScalarWithPredication(CI, VF); },
+ Range);
if (IsPredicated)
- return false;
+ return nullptr;
+
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
+ ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
+ return nullptr;
+
+ auto willWiden = [&](unsigned VF) -> bool {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+ // The following case may be scalarized depending on the VF.
+ // The flag shows whether we use an intrinsic or a plain call for the
+ // vectorized version of the instruction.
+ // Is it beneficial to perform an intrinsic call compared to a library call?
+ bool NeedToScalarize = false;
+ unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
+ bool UseVectorIntrinsic =
+ ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
+ return UseVectorIntrinsic || !NeedToScalarize;
+ };
+
+ if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
+ return nullptr;
+
+ return new VPWidenCallRecipe(*CI, Plan.mapToVPValues(CI->arg_operands()));
+}
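
Like the other recipe builders, tryToWidenCall decides over a whole VF range: the predicate is evaluated at the range start and the range's end is clamped to the first VF where the answer flips, so a single recipe covers every VF that agrees. A standalone sketch of that clamping (hypothetical types and names):

    #include <functional>

    struct ModelVFRange {
      unsigned Start; // inclusive
      unsigned End;   // exclusive
    };

    // Evaluate Predicate at Range.Start; shrink Range.End to the first VF
    // whose answer differs, so one recipe is valid for the whole range.
    bool decisionAndClampRange(const std::function<bool(unsigned)> &Predicate,
                               ModelVFRange &Range) {
      bool DecisionAtStart = Predicate(Range.Start);
      for (unsigned VF = Range.Start * 2; VF < Range.End; VF *= 2)
        if (Predicate(VF) != DecisionAtStart) {
          Range.End = VF;
          break;
        }
      return DecisionAtStart;
    }
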
+bool VPRecipeBuilder::shouldWiden(Instruction *I, VFRange &Range) const {
+ assert(!isa<BranchInst>(I) && !isa<PHINode>(I) && !isa<LoadInst>(I) &&
+ !isa<StoreInst>(I) && "Instruction should have been handled earlier");
+ // Instruction should be widened, unless it is scalar after vectorization,
+ // scalarization is profitable or it is predicated.
+ auto WillScalarize = [this, I](unsigned VF) -> bool {
+ return CM.isScalarAfterVectorization(I, VF) ||
+ CM.isProfitableToScalarize(I, VF) ||
+ CM.isScalarWithPredication(I, VF);
+ };
+ return !LoopVectorizationPlanner::getDecisionAndClampRange(WillScalarize,
+ Range);
+}
+
+VPWidenRecipe *VPRecipeBuilder::tryToWiden(Instruction *I, VPlan &Plan) const {
auto IsVectorizableOpcode = [](unsigned Opcode) {
switch (Opcode) {
case Instruction::Add:
case Instruction::And:
case Instruction::AShr:
case Instruction::BitCast:
- case Instruction::Br:
- case Instruction::Call:
case Instruction::FAdd:
case Instruction::FCmp:
case Instruction::FDiv:
@@ -6891,11 +7026,9 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
case Instruction::FSub:
case Instruction::ICmp:
case Instruction::IntToPtr:
- case Instruction::Load:
case Instruction::LShr:
case Instruction::Mul:
case Instruction::Or:
- case Instruction::PHI:
case Instruction::PtrToInt:
case Instruction::SDiv:
case Instruction::Select:
@@ -6903,7 +7036,6 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
case Instruction::Shl:
case Instruction::SIToFP:
case Instruction::SRem:
- case Instruction::Store:
case Instruction::Sub:
case Instruction::Trunc:
case Instruction::UDiv:
@@ -6917,60 +7049,10 @@ bool VPRecipeBuilder::tryToWiden(Instruction *I, VPBasicBlock *VPBB,
};
if (!IsVectorizableOpcode(I->getOpcode()))
- return false;
-
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (ID && (ID == Intrinsic::assume || ID == Intrinsic::lifetime_end ||
- ID == Intrinsic::lifetime_start || ID == Intrinsic::sideeffect))
- return false;
- }
-
- auto willWiden = [&](unsigned VF) -> bool {
- if (!isa<PHINode>(I) && (CM.isScalarAfterVectorization(I, VF) ||
- CM.isProfitableToScalarize(I, VF)))
- return false;
- if (CallInst *CI = dyn_cast<CallInst>(I)) {
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- // The following case may be scalarized depending on the VF.
- // The flag shows whether we use Intrinsic or a usual Call for vectorized
- // version of the instruction.
- // Is it beneficial to perform intrinsic call compared to lib call?
- bool NeedToScalarize;
- unsigned CallCost = CM.getVectorCallCost(CI, VF, NeedToScalarize);
- bool UseVectorIntrinsic =
- ID && CM.getVectorIntrinsicCost(CI, VF) <= CallCost;
- return UseVectorIntrinsic || !NeedToScalarize;
- }
- if (isa<LoadInst>(I) || isa<StoreInst>(I)) {
- assert(CM.getWideningDecision(I, VF) ==
- LoopVectorizationCostModel::CM_Scalarize &&
- "Memory widening decisions should have been taken care by now");
- return false;
- }
- return true;
- };
-
- if (!LoopVectorizationPlanner::getDecisionAndClampRange(willWiden, Range))
- return false;
- // If this ingredient's recipe is to be recorded, keep its recipe a singleton
- // to avoid having to split recipes later.
- bool IsSingleton = Ingredient2Recipe.count(I);
+ return nullptr;
// Success: widen this instruction.
-
- // Use the default widening recipe. We optimize the common case where
- // consecutive instructions can be represented by a single recipe.
- if (!IsSingleton && !VPBB->empty() && LastExtensibleRecipe == &VPBB->back() &&
- LastExtensibleRecipe->appendInstruction(I))
- return true;
-
- VPWidenRecipe *WidenRecipe = new VPWidenRecipe(I);
- if (!IsSingleton)
- LastExtensibleRecipe = WidenRecipe;
- setRecipe(I, WidenRecipe);
- VPBB->appendRecipe(WidenRecipe);
- return true;
+ return new VPWidenRecipe(*I, Plan.mapToVPValues(I->operands()));
}
VPBasicBlock *VPRecipeBuilder::handleReplication(
@@ -6984,7 +7066,8 @@ VPBasicBlock *VPRecipeBuilder::handleReplication(
bool IsPredicated = LoopVectorizationPlanner::getDecisionAndClampRange(
[&](unsigned VF) { return CM.isScalarWithPredication(I, VF); }, Range);
- auto *Recipe = new VPReplicateRecipe(I, IsUniform, IsPredicated);
+ auto *Recipe = new VPReplicateRecipe(I, Plan->mapToVPValues(I->operands()),
+ IsUniform, IsPredicated);
setRecipe(I, Recipe);
// Find if I uses a predicated instruction. If so, it will use its scalar
@@ -7041,43 +7124,45 @@ VPRegionBlock *VPRecipeBuilder::createReplicateRegion(Instruction *Instr,
return Region;
}
-bool VPRecipeBuilder::tryToCreateRecipe(Instruction *Instr, VFRange &Range,
- VPlanPtr &Plan, VPBasicBlock *VPBB) {
- VPRecipeBase *Recipe = nullptr;
-
- // First, check for specific widening recipes that deal with memory
+VPRecipeBase *VPRecipeBuilder::tryToCreateWidenRecipe(Instruction *Instr,
+ VFRange &Range,
+ VPlanPtr &Plan) {
+ // First, check for specific widening recipes that deal with calls, memory
// operations, inductions and Phi nodes.
- if ((Recipe = tryToWidenMemory(Instr, Range, Plan)) ||
- (Recipe = tryToOptimizeInduction(Instr, Range)) ||
- (Recipe = tryToBlend(Instr, Plan)) ||
- (isa<PHINode>(Instr) &&
- (Recipe = new VPWidenPHIRecipe(cast<PHINode>(Instr))))) {
- setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- return true;
- }
+ if (auto *CI = dyn_cast<CallInst>(Instr))
+ return tryToWidenCall(CI, Range, *Plan);
- // Handle GEP widening.
- if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Instr)) {
- auto Scalarize = [&](unsigned VF) {
- return CM.isScalarWithPredication(Instr, VF) ||
- CM.isScalarAfterVectorization(Instr, VF) ||
- CM.isProfitableToScalarize(Instr, VF);
- };
- if (LoopVectorizationPlanner::getDecisionAndClampRange(Scalarize, Range))
- return false;
- VPWidenGEPRecipe *Recipe = new VPWidenGEPRecipe(GEP, OrigLoop);
- setRecipe(Instr, Recipe);
- VPBB->appendRecipe(Recipe);
- return true;
+ if (isa<LoadInst>(Instr) || isa<StoreInst>(Instr))
+ return tryToWidenMemory(Instr, Range, Plan);
+
+ VPRecipeBase *Recipe;
+ if (auto Phi = dyn_cast<PHINode>(Instr)) {
+ if (Phi->getParent() != OrigLoop->getHeader())
+ return tryToBlend(Phi, Plan);
+ if ((Recipe = tryToOptimizeInductionPHI(Phi)))
+ return Recipe;
+ return new VPWidenPHIRecipe(Phi);
}
- // Check if Instr is to be widened by a general VPWidenRecipe, after
- // having first checked for specific widening recipes.
- if (tryToWiden(Instr, VPBB, Range))
- return true;
+ if (isa<TruncInst>(Instr) &&
+ (Recipe = tryToOptimizeInductionTruncate(cast<TruncInst>(Instr), Range)))
+ return Recipe;
- return false;
+ if (!shouldWiden(Instr, Range))
+ return nullptr;
+
+ if (auto GEP = dyn_cast<GetElementPtrInst>(Instr))
+ return new VPWidenGEPRecipe(GEP, Plan->mapToVPValues(GEP->operands()),
+ OrigLoop);
+
+ if (auto *SI = dyn_cast<SelectInst>(Instr)) {
+ bool InvariantCond =
+ PSE.getSE()->isLoopInvariant(PSE.getSCEV(SI->getOperand(0)), OrigLoop);
+ return new VPWidenSelectRecipe(*SI, Plan->mapToVPValues(SI->operands()),
+ InvariantCond);
+ }
+
+ return tryToWiden(Instr, *Plan);
}
void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
@@ -7097,13 +7182,14 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
NeedDef.insert(Branch->getCondition());
}
- // If the tail is to be folded by masking, the primary induction variable
- // needs to be represented in VPlan for it to model early-exit masking.
+ // If the tail is to be folded by masking, the primary induction variable, if
+ // it exists, needs to be represented in VPlan to model early-exit masking.
// Also, both the Phi and the live-out instruction of each reduction are
// required in order to introduce a select between them in VPlan.
if (CM.foldTailByMasking()) {
- NeedDef.insert(Legal->getPrimaryInduction());
- for (auto &Reduction : *Legal->getReductionVars()) {
+ if (Legal->getPrimaryInduction())
+ NeedDef.insert(Legal->getPrimaryInduction());
+ for (auto &Reduction : Legal->getReductionVars()) {
NeedDef.insert(Reduction.first);
NeedDef.insert(Reduction.second.getLoopExitInstr());
}
@@ -7118,28 +7204,39 @@ void LoopVectorizationPlanner::buildVPlansWithVPRecipes(unsigned MinVF,
SmallPtrSet<Instruction *, 4> DeadInstructions;
collectTriviallyDeadInstructions(DeadInstructions);
+ // Add assume instructions we need to drop to DeadInstructions, to prevent
+ // them from being added to the VPlan.
+ // TODO: We only need to drop assumes in blocks that get flattened. If the
+ // control flow is preserved, we should keep them.
+ auto &ConditionalAssumes = Legal->getConditionalAssumes();
+ DeadInstructions.insert(ConditionalAssumes.begin(), ConditionalAssumes.end());
+
+ DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
+ // Dead instructions do not need sinking. Remove them from SinkAfter.
+ for (Instruction *I : DeadInstructions)
+ SinkAfter.erase(I);
+
for (unsigned VF = MinVF; VF < MaxVF + 1;) {
VFRange SubRange = {VF, MaxVF + 1};
- VPlans.push_back(
- buildVPlanWithVPRecipes(SubRange, NeedDef, DeadInstructions));
+ VPlans.push_back(buildVPlanWithVPRecipes(SubRange, NeedDef,
+ DeadInstructions, SinkAfter));
VF = SubRange.End;
}
}
VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
VFRange &Range, SmallPtrSetImpl<Value *> &NeedDef,
- SmallPtrSetImpl<Instruction *> &DeadInstructions) {
+ SmallPtrSetImpl<Instruction *> &DeadInstructions,
+ const DenseMap<Instruction *, Instruction *> &SinkAfter) {
// Hold a mapping from predicated instructions to their recipes, in order to
// fix their AlsoPack behavior if a user is determined to replicate and use a
// scalar instead of vector value.
DenseMap<Instruction *, VPReplicateRecipe *> PredInst2Recipe;
- DenseMap<Instruction *, Instruction *> &SinkAfter = Legal->getSinkAfter();
-
SmallPtrSet<const InterleaveGroup<Instruction> *, 1> InterleaveGroups;
- VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, Builder);
+ VPRecipeBuilder RecipeBuilder(OrigLoop, TLI, Legal, CM, PSE, Builder);
// ---------------------------------------------------------------------------
// Pre-construction: record ingredients whose recipes we'll need to further
@@ -7177,8 +7274,9 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
// ---------------------------------------------------------------------------
// Create a dummy pre-entry VPBasicBlock to start building the VPlan.
+ auto Plan = std::make_unique<VPlan>();
VPBasicBlock *VPBB = new VPBasicBlock("Pre-Entry");
- auto Plan = std::make_unique<VPlan>(VPBB);
+ Plan->setEntry(VPBB);
// Represent values that will have defs inside VPlan.
for (Value *V : NeedDef)
@@ -7199,17 +7297,21 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
Builder.setInsertPoint(VPBB);
// Introduce each ingredient into VPlan.
+ // TODO: Model and preserve debug intrinsics in VPlan.
for (Instruction &I : BB->instructionsWithoutDebug()) {
Instruction *Instr = &I;
// First filter out irrelevant instructions, to ensure no recipes are
// built for them.
- if (isa<BranchInst>(Instr) ||
- DeadInstructions.find(Instr) != DeadInstructions.end())
+ if (isa<BranchInst>(Instr) || DeadInstructions.count(Instr))
continue;
- if (RecipeBuilder.tryToCreateRecipe(Instr, Range, Plan, VPBB))
+ if (auto Recipe =
+ RecipeBuilder.tryToCreateWidenRecipe(Instr, Range, Plan)) {
+ RecipeBuilder.setRecipe(Instr, Recipe);
+ VPBB->appendRecipe(Recipe);
continue;
+ }
// Otherwise, if all widening options failed, Instruction is to be
// replicated. This may create a successor for VPBB.
@@ -7264,7 +7366,7 @@ VPlanPtr LoopVectorizationPlanner::buildVPlanWithVPRecipes(
if (CM.foldTailByMasking()) {
Builder.setInsertPoint(VPBB);
auto *Cond = RecipeBuilder.createBlockInMask(OrigLoop->getHeader(), Plan);
- for (auto &Reduction : *Legal->getReductionVars()) {
+ for (auto &Reduction : Legal->getReductionVars()) {
VPValue *Phi = Plan->getVPValue(Reduction.first);
VPValue *Red = Plan->getVPValue(Reduction.second.getLoopExitInstr());
Builder.createNaryOp(Instruction::Select, {Cond, Red, Phi});
@@ -7330,32 +7432,37 @@ Value *LoopVectorizationPlanner::VPCallbackILV::getOrCreateScalarValue(
return ILV.getOrCreateScalarValue(V, Instance);
}
-void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n"
- << Indent << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
+void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"INTERLEAVE-GROUP with factor " << IG->getFactor() << " at ";
IG->getInsertPos()->printAsOperand(O, false);
O << ", ";
- getAddr()->printAsOperand(O);
+ getAddr()->printAsOperand(O, SlotTracker);
VPValue *Mask = getMask();
if (Mask) {
O << ", ";
- Mask->printAsOperand(O);
+ Mask->printAsOperand(O, SlotTracker);
}
- O << "\\l\"";
for (unsigned i = 0; i < IG->getFactor(); ++i)
if (Instruction *I = IG->getMember(i))
- O << " +\n"
- << Indent << "\" " << VPlanIngredient(I) << " " << i << "\\l\"";
+ O << "\\l\" +\n" << Indent << "\" " << VPlanIngredient(I) << " " << i;
+}
+
+void VPWidenCallRecipe::execute(VPTransformState &State) {
+ State.ILV->widenCallInstruction(Ingredient, User, State);
+}
+
+void VPWidenSelectRecipe::execute(VPTransformState &State) {
+ State.ILV->widenSelectInstruction(Ingredient, User, InvariantCond, State);
}
void VPWidenRecipe::execute(VPTransformState &State) {
- for (auto &Instr : make_range(Begin, End))
- State.ILV->widenInstruction(Instr);
+ State.ILV->widenInstruction(Ingredient, User, State);
}
void VPWidenGEPRecipe::execute(VPTransformState &State) {
- State.ILV->widenGEP(GEP, State.UF, State.VF, IsPtrLoopInvariant,
- IsIndexLoopInvariant);
+ State.ILV->widenGEP(GEP, User, State.UF, State.VF, IsPtrLoopInvariant,
+ IsIndexLoopInvariant, State);
}
void VPWidenIntOrFpInductionRecipe::execute(VPTransformState &State) {
@@ -7376,27 +7483,27 @@ void VPBlendRecipe::execute(VPTransformState &State) {
// duplications since this is a simple recursive scan, but future
// optimizations will clean it up.
- unsigned NumIncoming = Phi->getNumIncomingValues();
+ unsigned NumIncoming = getNumIncomingValues();
- assert((User || NumIncoming == 1) &&
- "Multiple predecessors with predecessors having a full mask");
// Generate a sequence of selects of the form:
// SELECT(Mask3, In3,
- // SELECT(Mask2, In2,
- // ( ...)))
+ // SELECT(Mask2, In2,
+ // SELECT(Mask1, In1,
+ // In0)))
+  // Note that Mask0 is never used: lanes for which no path reaches this phi
+  // (and which are therefore essentially undef) are taken from In0.
InnerLoopVectorizer::VectorParts Entry(State.UF);
for (unsigned In = 0; In < NumIncoming; ++In) {
for (unsigned Part = 0; Part < State.UF; ++Part) {
// We might have single edge PHIs (blocks) - use an identity
// 'select' for the first PHI operand.
- Value *In0 =
- State.ILV->getOrCreateVectorValue(Phi->getIncomingValue(In), Part);
+ Value *In0 = State.get(getIncomingValue(In), Part);
if (In == 0)
Entry[Part] = In0; // Initialize with the first incoming value.
else {
// Select between the current value and the previous incoming edge
// based on the incoming mask.
- Value *Cond = State.get(User->getOperand(In), Part);
+ Value *Cond = State.get(getMask(In), Part);
Entry[Part] =
State.Builder.CreateSelect(Cond, In0, Entry[Part], "predphi");
}
@@ -7408,19 +7515,19 @@ void VPBlendRecipe::execute(VPTransformState &State) {
void VPInterleaveRecipe::execute(VPTransformState &State) {
assert(!State.Instance && "Interleave group being replicated.");
- State.ILV->vectorizeInterleaveGroup(IG->getInsertPos(), State, getAddr(),
- getMask());
+ State.ILV->vectorizeInterleaveGroup(IG, State, getAddr(), getMask());
}
void VPReplicateRecipe::execute(VPTransformState &State) {
if (State.Instance) { // Generate a single instance.
- State.ILV->scalarizeInstruction(Ingredient, *State.Instance, IsPredicated);
+ State.ILV->scalarizeInstruction(Ingredient, User, *State.Instance,
+ IsPredicated, State);
// Insert scalar instance packing it into a vector.
if (AlsoPack && State.VF > 1) {
// If we're constructing lane 0, initialize to start from undef.
if (State.Instance->Lane == 0) {
- Value *Undef =
- UndefValue::get(VectorType::get(Ingredient->getType(), State.VF));
+ Value *Undef = UndefValue::get(
+ FixedVectorType::get(Ingredient->getType(), State.VF));
State.ValueMap.setVectorValue(Ingredient, State.Instance->Part, Undef);
}
State.ILV->packScalarIntoVectorValue(Ingredient, *State.Instance);
@@ -7434,7 +7541,8 @@ void VPReplicateRecipe::execute(VPTransformState &State) {
unsigned EndLane = IsUniform ? 1 : State.VF;
for (unsigned Part = 0; Part < State.UF; ++Part)
for (unsigned Lane = 0; Lane < EndLane; ++Lane)
- State.ILV->scalarizeInstruction(Ingredient, {Part, Lane}, IsPredicated);
+ State.ILV->scalarizeInstruction(Ingredient, User, {Part, Lane},
+ IsPredicated, State);
}
void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
@@ -7444,15 +7552,14 @@ void VPBranchOnMaskRecipe::execute(VPTransformState &State) {
unsigned Lane = State.Instance->Lane;
Value *ConditionBit = nullptr;
- if (!User) // Block in mask is all-one.
- ConditionBit = State.Builder.getTrue();
- else {
- VPValue *BlockInMask = User->getOperand(0);
+ VPValue *BlockInMask = getMask();
+ if (BlockInMask) {
ConditionBit = State.get(BlockInMask, Part);
if (ConditionBit->getType()->isVectorTy())
ConditionBit = State.Builder.CreateExtractElement(
ConditionBit, State.Builder.getInt32(Lane));
- }
+ } else // Block in mask is all-one.
+ ConditionBit = State.Builder.getTrue();
// Replace the temporary unreachable terminator with a new conditional branch,
// whose two destinations will be set later when they are created.
@@ -7496,7 +7603,9 @@ void VPPredInstPHIRecipe::execute(VPTransformState &State) {
}
void VPWidenMemoryInstructionRecipe::execute(VPTransformState &State) {
- State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), getMask());
+ VPValue *StoredValue = isa<StoreInst>(Instr) ? getStoredValue() : nullptr;
+ State.ILV->vectorizeMemoryInstruction(&Instr, State, getAddr(), StoredValue,
+ getMask());
}
// Determine how to lower the scalar epilogue, which depends on 1) optimising
@@ -7513,16 +7622,15 @@ static ScalarEpilogueLowering getScalarEpilogueLowering(
PGSOQueryType::IRPass);
// 1) OptSize takes precedence over all other options, i.e. if this is set,
// don't look at hints or options, and don't request a scalar epilogue.
- if (OptSize && Hints.getForce() != LoopVectorizeHints::FK_Enabled)
+ if (OptSize)
return CM_ScalarEpilogueNotAllowedOptSize;
bool PredicateOptDisabled = PreferPredicateOverEpilog.getNumOccurrences() &&
!PreferPredicateOverEpilog;
// 2) Next, if disabling predication is requested on the command line, honour
- // this and request a scalar epilogue. Also do this if we don't have a
- // primary induction variable, which is required for predication.
- if (PredicateOptDisabled || !LVL.getPrimaryInduction())
+ // this and request a scalar epilogue.
+ if (PredicateOptDisabled)
return CM_ScalarEpilogueAllowed;
// 3) and 4) look if enabling predication is requested on the command line,
@@ -7549,6 +7657,10 @@ static bool processLoopInVPlanNativePath(
OptimizationRemarkEmitter *ORE, BlockFrequencyInfo *BFI,
ProfileSummaryInfo *PSI, LoopVectorizeHints &Hints) {
+ if (PSE.getBackedgeTakenCount() == PSE.getSE()->getCouldNotCompute()) {
+ LLVM_DEBUG(dbgs() << "LV: cannot compute the outer-loop trip count\n");
+ return false;
+ }
assert(EnableVPlanNativePath && "VPlan-native path is disabled.");
Function *F = L->getHeader()->getParent();
InterleavedAccessInfo IAI(PSE, L, DT, LI, LVL->getLAI());
@@ -7561,7 +7673,7 @@ static bool processLoopInVPlanNativePath(
// Use the planner for outer loop vectorization.
// TODO: CM is not used at this point inside the planner. Turn CM into an
// optional argument if we don't need it in the future.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, LVL, CM, IAI, PSE);
// Get user vectorization factor.
const unsigned UserVF = Hints.getWidth();
@@ -7587,10 +7699,16 @@ static bool processLoopInVPlanNativePath(
// Mark the loop as already vectorized to avoid vectorizing again.
Hints.setAlreadyVectorized();
- LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
+LoopVectorizePass::LoopVectorizePass(LoopVectorizeOptions Opts)
+ : InterleaveOnlyWhenForced(Opts.InterleaveOnlyWhenForced ||
+ !EnableLoopInterleaving),
+ VectorizeOnlyWhenForced(Opts.VectorizeOnlyWhenForced ||
+ !EnableLoopVectorization) {}
+
bool LoopVectorizePass::processLoop(Loop *L) {
assert((EnableVPlanNativePath || L->empty()) &&
"VPlan-native path is not enabled. Only process inner loops.");
@@ -7720,17 +7838,17 @@ bool LoopVectorizePass::processLoop(Loop *L) {
CM.collectValuesToIgnore();
// Use the planner for vectorization.
- LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI);
+ LoopVectorizationPlanner LVP(L, LI, TLI, TTI, &LVL, CM, IAI, PSE);
- // Get user vectorization factor.
+ // Get user vectorization factor and interleave count.
unsigned UserVF = Hints.getWidth();
+ unsigned UserIC = Hints.getInterleave();
// Plan how to best vectorize, return the best VF and its cost.
- Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF);
+ Optional<VectorizationFactor> MaybeVF = LVP.plan(UserVF, UserIC);
VectorizationFactor VF = VectorizationFactor::Disabled();
unsigned IC = 1;
- unsigned UserIC = Hints.getInterleave();
if (MaybeVF) {
VF = *MaybeVF;
@@ -7883,14 +8001,14 @@ bool LoopVectorizePass::processLoop(Loop *L) {
Hints.setAlreadyVectorized();
}
- LLVM_DEBUG(verifyFunction(*L->getHeader()->getParent()));
+ assert(!verifyFunction(*L->getHeader()->getParent(), &dbgs()));
return true;
}
-bool LoopVectorizePass::runImpl(
+LoopVectorizeResult LoopVectorizePass::runImpl(
Function &F, ScalarEvolution &SE_, LoopInfo &LI_, TargetTransformInfo &TTI_,
DominatorTree &DT_, BlockFrequencyInfo &BFI_, TargetLibraryInfo *TLI_,
- DemandedBits &DB_, AliasAnalysis &AA_, AssumptionCache &AC_,
+ DemandedBits &DB_, AAResults &AA_, AssumptionCache &AC_,
std::function<const LoopAccessInfo &(Loop &)> &GetLAA_,
OptimizationRemarkEmitter &ORE_, ProfileSummaryInfo *PSI_) {
SE = &SE_;
@@ -7915,9 +8033,9 @@ bool LoopVectorizePass::runImpl(
// interleaving.
if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true)) &&
TTI->getMaxInterleaveFactor(1) < 2)
- return false;
+ return LoopVectorizeResult(false, false);
- bool Changed = false;
+ bool Changed = false, CFGChanged = false;
// The vectorizer requires loops to be in simplified form.
// Since simplification may add new inner loops, it has to run before the
@@ -7925,7 +8043,7 @@ bool LoopVectorizePass::runImpl(
// will simplify all loops, regardless of whether anything end up being
// vectorized.
for (auto &L : *LI)
- Changed |=
+ Changed |= CFGChanged |=
simplifyLoop(L, DT, LI, SE, AC, nullptr, false /* PreserveLCSSA */);
// Build up a worklist of inner-loops to vectorize. This is necessary as
@@ -7946,11 +8064,11 @@ bool LoopVectorizePass::runImpl(
// transform.
Changed |= formLCSSARecursively(*L, *DT, LI, SE);
- Changed |= processLoop(L);
+ Changed |= CFGChanged |= processLoop(L);
}
// Process each loop nest in the function.
- return Changed;
+ return LoopVectorizeResult(Changed, CFGChanged);
}
PreservedAnalyses LoopVectorizePass::run(Function &F,
@@ -7975,13 +8093,12 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
LoopStandardAnalysisResults AR = {AA, AC, DT, LI, SE, TLI, TTI, MSSA};
return LAM.getResult<LoopAccessAnalysis>(L, AR);
};
- const ModuleAnalysisManager &MAM =
- AM.getResult<ModuleAnalysisManagerFunctionProxy>(F).getManager();
+ auto &MAMProxy = AM.getResult<ModuleAnalysisManagerFunctionProxy>(F);
ProfileSummaryInfo *PSI =
- MAM.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
- bool Changed =
+ MAMProxy.getCachedResult<ProfileSummaryAnalysis>(*F.getParent());
+ LoopVectorizeResult Result =
runImpl(F, SE, LI, TTI, DT, BFI, &TLI, DB, AA, AC, GetLAA, ORE, PSI);
- if (!Changed)
+ if (!Result.MadeAnyChange)
return PreservedAnalyses::all();
PreservedAnalyses PA;
@@ -7995,5 +8112,7 @@ PreservedAnalyses LoopVectorizePass::run(Function &F,
}
PA.preserve<BasicAA>();
PA.preserve<GlobalsAA>();
+ if (!Result.MadeCFGChange)
+ PA.preserveSet<CFGAnalyses>();
return PA;
}
diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index aabd974cd73e..5bc35aa4695f 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -47,6 +47,7 @@
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/Analysis/ValueTracking.h"
#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/Analysis/AssumptionCache.h"
#include "llvm/IR/Attributes.h"
#include "llvm/IR/BasicBlock.h"
#include "llvm/IR/Constant.h"
@@ -85,6 +86,7 @@
#include "llvm/Support/KnownBits.h"
#include "llvm/Support/MathExtras.h"
#include "llvm/Support/raw_ostream.h"
+#include "llvm/Transforms/Utils/InjectTLIMappings.h"
#include "llvm/Transforms/Utils/LoopUtils.h"
#include "llvm/Transforms/Vectorize.h"
#include <algorithm>
@@ -107,9 +109,8 @@ using namespace slpvectorizer;
STATISTIC(NumVectorInstructions, "Number of vector instructions generated");
-cl::opt<bool>
- llvm::RunSLPVectorization("vectorize-slp", cl::init(false), cl::Hidden,
- cl::desc("Run the SLP vectorization passes"));
+cl::opt<bool> RunSLPVectorization("vectorize-slp", cl::init(true), cl::Hidden,
+ cl::desc("Run the SLP vectorization passes"));
static cl::opt<int>
SLPCostThreshold("slp-threshold", cl::init(0), cl::Hidden,
@@ -284,7 +285,7 @@ static bool isCommutative(Instruction *I) {
static Optional<TargetTransformInfo::ShuffleKind>
isShuffle(ArrayRef<Value *> VL) {
auto *EI0 = cast<ExtractElementInst>(VL[0]);
- unsigned Size = EI0->getVectorOperandType()->getVectorNumElements();
+ unsigned Size = EI0->getVectorOperandType()->getNumElements();
Value *Vec1 = nullptr;
Value *Vec2 = nullptr;
enum ShuffleMode { Unknown, Select, Permute };
@@ -293,7 +294,7 @@ isShuffle(ArrayRef<Value *> VL) {
auto *EI = cast<ExtractElementInst>(VL[I]);
auto *Vec = EI->getVectorOperand();
// All vector operands must have the same number of vector elements.
- if (Vec->getType()->getVectorNumElements() != Size)
+ if (cast<VectorType>(Vec->getType())->getNumElements() != Size)
return None;
auto *Idx = dyn_cast<ConstantInt>(EI->getIndexOperand());
if (!Idx)
@@ -377,6 +378,18 @@ static Value *isOneOf(const InstructionsState &S, Value *Op) {
return S.OpValue;
}
+/// \returns true if \p Opcode is allowed as part of the main/alternate
+/// instruction for SLP vectorization.
+///
+/// An example of an unsupported opcode is SDIV, which can potentially cause UB
+/// if the "shuffled out" lane would result in division by zero.
+static bool isValidForAlternation(unsigned Opcode) {
+ if (Instruction::isIntDivRem(Opcode))
+ return false;
+
+ return true;
+}
+
/// \returns analysis of the Instructions in \p VL described in
/// InstructionsState, the Opcode that we suppose the whole list
/// could be vectorized even if its structure is diverse.
@@ -399,7 +412,8 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (IsBinOp && isa<BinaryOperator>(VL[Cnt])) {
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
- if (Opcode == AltOpcode) {
+ if (Opcode == AltOpcode && isValidForAlternation(InstOpcode) &&
+ isValidForAlternation(Opcode)) {
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
@@ -411,6 +425,9 @@ static InstructionsState getSameOpcode(ArrayRef<Value *> VL,
if (InstOpcode == Opcode || InstOpcode == AltOpcode)
continue;
if (Opcode == AltOpcode) {
+ assert(isValidForAlternation(Opcode) &&
+ isValidForAlternation(InstOpcode) &&
+ "Cast isn't safe for alternation, logic needs to be updated!");
AltOpcode = InstOpcode;
AltIndex = Cnt;
continue;
@@ -613,7 +630,7 @@ public:
/// the stored value. Otherwise, the size is the width of the largest loaded
/// value reaching V. This method is used by the vectorizer to calculate
/// vectorization factors.
- unsigned getVectorElementSize(Value *V) const;
+ unsigned getVectorElementSize(Value *V);
/// Compute the minimum type sizes required to represent the entries in a
/// vectorizable tree.
@@ -650,6 +667,15 @@ public:
/// may not be necessary.
bool isLoadCombineReductionCandidate(unsigned ReductionOpcode) const;
+ /// Assume that a vector of stores of bitwise-or/shifted/zexted loaded values
+ /// can be load combined in the backend. Load combining may not be allowed in
+ /// the IR optimizer, so we do not want to alter the pattern. For example,
+ /// partially transforming a scalar bswap() pattern into vector code is
+ /// effectively impossible for the backend to undo.
+ /// TODO: If load combining is allowed in the IR optimizer, this analysis
+ /// may not be necessary.
+ bool isLoadCombineCandidate() const;
+
OptimizationRemarkEmitter *getORE() { return ORE; }
/// This structure holds any data we need about the edges being traversed
@@ -816,13 +842,12 @@ public:
// Extracts from consecutive indexes of the same vector better score as
// the extracts could be optimized away.
- auto *Ex1 = dyn_cast<ExtractElementInst>(V1);
- auto *Ex2 = dyn_cast<ExtractElementInst>(V2);
- if (Ex1 && Ex2 && Ex1->getVectorOperand() == Ex2->getVectorOperand() &&
- cast<ConstantInt>(Ex1->getIndexOperand())->getZExtValue() + 1 ==
- cast<ConstantInt>(Ex2->getIndexOperand())->getZExtValue()) {
+ Value *EV;
+ ConstantInt *Ex1Idx, *Ex2Idx;
+ if (match(V1, m_ExtractElt(m_Value(EV), m_ConstantInt(Ex1Idx))) &&
+ match(V2, m_ExtractElt(m_Deferred(EV), m_ConstantInt(Ex2Idx))) &&
+ Ex1Idx->getZExtValue() + 1 == Ex2Idx->getZExtValue())
return VLOperands::ScoreConsecutiveExtracts;
- }
auto *I1 = dyn_cast<Instruction>(V1);
auto *I2 = dyn_cast<Instruction>(V2);
@@ -852,7 +877,7 @@ public:
int getExternalUsesCost(const std::pair<Value *, int> &LHS,
const std::pair<Value *, int> &RHS) {
int Cost = 0;
- SmallVector<std::pair<Value *, int>, 2> Values = {LHS, RHS};
+ std::array<std::pair<Value *, int>, 2> Values = {{LHS, RHS}};
for (int Idx = 0, IdxE = Values.size(); Idx != IdxE; ++Idx) {
Value *V = Values[Idx].first;
// Calculate the absolute lane, using the minimum relative lane of LHS
@@ -1385,7 +1410,8 @@ private:
/// \returns the scalarization cost for this type. Scalarization in this
/// context means the creation of vectors from a group of scalars.
- int getGatherCost(Type *Ty, const DenseSet<unsigned> &ShuffledIndices) const;
+ int getGatherCost(VectorType *Ty,
+ const DenseSet<unsigned> &ShuffledIndices) const;
/// \returns the scalarization cost for this list of values. Assuming that
/// this subtree gets vectorized, we may need to extract the values from the
@@ -1422,7 +1448,7 @@ private:
return VL.size() == ReuseShuffleIndices.size() &&
std::equal(
VL.begin(), VL.end(), ReuseShuffleIndices.begin(),
- [this](Value *V, unsigned Idx) { return V == Scalars[Idx]; });
+ [this](Value *V, int Idx) { return V == Scalars[Idx]; });
}
/// A vector of scalars.
@@ -1436,7 +1462,7 @@ private:
EntryState State;
/// Does this sequence require some shuffling?
- SmallVector<unsigned, 4> ReuseShuffleIndices;
+ SmallVector<int, 4> ReuseShuffleIndices;
/// Does this entry require reordering?
ArrayRef<unsigned> ReorderIndices;
@@ -1690,6 +1716,9 @@ private:
/// Maps a specific scalar to its tree entry.
SmallDenseMap<Value*, TreeEntry *> ScalarToTreeEntry;
+ /// Maps a value to the proposed vectorizable size.
+ SmallDenseMap<Value *, unsigned> InstrElementSize;
+
/// A list of scalars that we found that we need to keep as scalars.
ValueSet MustGather;
@@ -2001,6 +2030,20 @@ private:
if (TreeEntry *TE = BundleMember->TE) {
int Lane = BundleMember->Lane;
assert(Lane >= 0 && "Lane not set");
+
+        // Since the vectorization tree is built recursively, this assertion
+        // ensures that the tree entry has all operands set before reaching this
+        // code. A couple of exceptions known at the moment are extracts, where
+        // the second (immediate) operand is not added. Since immediates do not
+        // affect scheduler behavior, this is considered okay.
+ auto *In = TE->getMainOp();
+ assert(In &&
+ (isa<ExtractValueInst>(In) || isa<ExtractElementInst>(In) ||
+ In->getNumOperands() == TE->getNumOperands()) &&
+ "Missed TreeEntry operands?");
+        (void)In; // Fake use to avoid a build failure when assertions are disabled.
+
for (unsigned OpIdx = 0, NumOperands = TE->getNumOperands();
OpIdx != NumOperands; ++OpIdx)
if (auto *I = dyn_cast<Instruction>(TE->getOperand(OpIdx)[Lane]))
@@ -2323,6 +2366,7 @@ BoUpSLP::~BoUpSLP() {
"trying to erase instruction with users.");
Pair.getFirst()->eraseFromParent();
}
+ assert(!verifyFunction(*F, &dbgs()));
}
void BoUpSLP::eraseInstructions(ArrayRef<Value *> AV) {
@@ -2978,19 +3022,24 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
return;
}
case Instruction::Call: {
- // Check if the calls are all to the same vectorizable intrinsic.
+ // Check if the calls are all to the same vectorizable intrinsic or
+ // library function.
CallInst *CI = cast<CallInst>(VL0);
- // Check if this is an Intrinsic call or something that can be
- // represented by an intrinsic call
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- if (!isTriviallyVectorizable(ID)) {
+
+ VFShape Shape = VFShape::get(
+ *CI, {static_cast<unsigned int>(VL.size()), false /*Scalable*/},
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+
+ if (!VecFunc && !isTriviallyVectorizable(ID)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
ReuseShuffleIndicies);
LLVM_DEBUG(dbgs() << "SLP: Non-vectorizable call.\n");
return;
}
- Function *Int = CI->getCalledFunction();
+ Function *F = CI->getCalledFunction();
unsigned NumArgs = CI->getNumArgOperands();
SmallVector<Value*, 4> ScalarArgs(NumArgs, nullptr);
for (unsigned j = 0; j != NumArgs; ++j)
@@ -2998,8 +3047,10 @@ void BoUpSLP::buildTree_rec(ArrayRef<Value *> VL, unsigned Depth,
ScalarArgs[j] = CI->getArgOperand(j);
for (Value *V : VL) {
CallInst *CI2 = dyn_cast<CallInst>(V);
- if (!CI2 || CI2->getCalledFunction() != Int ||
+ if (!CI2 || CI2->getCalledFunction() != F ||
getVectorIntrinsicIDForCall(CI2, TLI) != ID ||
+ (VecFunc &&
+ VecFunc != VFDatabase(*CI2).getVectorizedFunction(Shape)) ||
!CI->hasIdenticalOperandBundleSchema(*CI2)) {
BS.cancelScheduling(VL, VL0);
newTreeEntry(VL, None /*not vectorized*/, S, UserTreeIdx,
@@ -3101,7 +3152,8 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
unsigned N = 1;
Type *EltTy = T;
- while (isa<CompositeType>(EltTy)) {
+ while (isa<StructType>(EltTy) || isa<ArrayType>(EltTy) ||
+ isa<VectorType>(EltTy)) {
if (auto *ST = dyn_cast<StructType>(EltTy)) {
// Check that struct is homogeneous.
for (const auto *Ty : ST->elements())
@@ -3109,16 +3161,19 @@ unsigned BoUpSLP::canMapToVector(Type *T, const DataLayout &DL) const {
return 0;
N *= ST->getNumElements();
EltTy = *ST->element_begin();
+ } else if (auto *AT = dyn_cast<ArrayType>(EltTy)) {
+ N *= AT->getNumElements();
+ EltTy = AT->getElementType();
} else {
- auto *SeqT = cast<SequentialType>(EltTy);
- N *= SeqT->getNumElements();
- EltTy = SeqT->getElementType();
+ auto *VT = cast<VectorType>(EltTy);
+ N *= VT->getNumElements();
+ EltTy = VT->getElementType();
}
}
if (!isValidElementType(EltTy))
return 0;
- uint64_t VTSize = DL.getTypeStoreSizeInBits(VectorType::get(EltTy, N));
+ uint64_t VTSize = DL.getTypeStoreSizeInBits(FixedVectorType::get(EltTy, N));
if (VTSize < MinVecRegSize || VTSize > MaxVecRegSize || VTSize != DL.getTypeStoreSizeInBits(T))
return 0;
return N;
@@ -3148,7 +3203,7 @@ bool BoUpSLP::canReuseExtract(ArrayRef<Value *> VL, Value *OpValue,
if (!LI || !LI->isSimple() || !LI->hasNUses(VL.size()))
return false;
} else {
- NElts = Vec->getType()->getVectorNumElements();
+ NElts = cast<VectorType>(Vec->getType())->getNumElements();
}
if (NElts != VL.size())
@@ -3198,6 +3253,35 @@ bool BoUpSLP::areAllUsersVectorized(Instruction *I) const {
});
}
+static std::pair<unsigned, unsigned>
+getVectorCallCosts(CallInst *CI, VectorType *VecTy, TargetTransformInfo *TTI,
+ TargetLibraryInfo *TLI) {
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ // Calculate the cost of the scalar and vector calls.
+ IntrinsicCostAttributes CostAttrs(ID, *CI, VecTy->getNumElements());
+ int IntrinsicCost =
+ TTI->getIntrinsicInstrCost(CostAttrs, TTI::TCK_RecipThroughput);
+
+ auto Shape =
+ VFShape::get(*CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+ false /*HasGlobalPred*/);
+ Function *VecFunc = VFDatabase(*CI).getVectorizedFunction(Shape);
+ int LibCost = IntrinsicCost;
+ if (!CI->isNoBuiltin() && VecFunc) {
+ // Calculate the cost of the vector library call.
+ SmallVector<Type *, 4> VecTys;
+ for (Use &Arg : CI->args())
+ VecTys.push_back(
+ FixedVectorType::get(Arg->getType(), VecTy->getNumElements()));
+
+ // If the corresponding vector call is cheaper, return its cost.
+ LibCost = TTI->getCallInstrCost(nullptr, VecTy, VecTys,
+ TTI::TCK_RecipThroughput);
+ }
+ return {IntrinsicCost, LibCost};
+}
+
int BoUpSLP::getEntryCost(TreeEntry *E) {
ArrayRef<Value*> VL = E->Scalars;
@@ -3206,12 +3290,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
ScalarTy = SI->getValueOperand()->getType();
else if (CmpInst *CI = dyn_cast<CmpInst>(VL[0]))
ScalarTy = CI->getOperand(0)->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
+ TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
// If we have computed a smaller type for the expression, update VecTy so
// that the costs will be accurate.
if (MinBWs.count(VL[0]))
- VecTy = VectorType::get(
+ VecTy = FixedVectorType::get(
IntegerType::get(F->getContext(), MinBWs[VL[0]].first), VL.size());
unsigned ReuseShuffleNumbers = E->ReuseShuffleIndices.size();
@@ -3251,6 +3336,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
}
return ReuseShuffleCost + getGatherCost(VL);
}
+ assert(E->State == TreeEntry::Vectorize && "Unhandled state");
assert(E->getOpcode() && allSameType(VL) && allSameBlock(VL) && "Invalid VL");
Instruction *VL0 = E->getMainOp();
unsigned ShuffleOrOp =
@@ -3260,7 +3346,7 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
return 0;
case Instruction::ExtractValue:
- case Instruction::ExtractElement:
+ case Instruction::ExtractElement: {
if (NeedToShuffleReuses) {
unsigned Idx = 0;
for (unsigned I : E->ReuseShuffleIndices) {
@@ -3289,43 +3375,41 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, Idx);
}
}
- if (E->State == TreeEntry::Vectorize) {
- int DeadCost = ReuseShuffleCost;
- if (!E->ReorderIndices.empty()) {
- // TODO: Merge this shuffle with the ReuseShuffleCost.
- DeadCost += TTI->getShuffleCost(
- TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
- }
- for (unsigned i = 0, e = VL.size(); i < e; ++i) {
- Instruction *E = cast<Instruction>(VL[i]);
- // If all users are going to be vectorized, instruction can be
- // considered as dead.
- // The same, if have only one user, it will be vectorized for sure.
- if (areAllUsersVectorized(E)) {
- // Take credit for instruction that will become dead.
- if (E->hasOneUse()) {
- Instruction *Ext = E->user_back();
- if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
- all_of(Ext->users(),
- [](User *U) { return isa<GetElementPtrInst>(U); })) {
- // Use getExtractWithExtendCost() to calculate the cost of
- // extractelement/ext pair.
- DeadCost -= TTI->getExtractWithExtendCost(
- Ext->getOpcode(), Ext->getType(), VecTy, i);
- // Add back the cost of s|zext which is subtracted separately.
- DeadCost += TTI->getCastInstrCost(
- Ext->getOpcode(), Ext->getType(), E->getType(), Ext);
- continue;
- }
+ int DeadCost = ReuseShuffleCost;
+ if (!E->ReorderIndices.empty()) {
+ // TODO: Merge this shuffle with the ReuseShuffleCost.
+ DeadCost += TTI->getShuffleCost(
+ TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+ for (unsigned i = 0, e = VL.size(); i < e; ++i) {
+ Instruction *E = cast<Instruction>(VL[i]);
+        // If all users are going to be vectorized, the instruction can be
+        // considered dead.
+        // Likewise, if it has only one user, it will be vectorized for sure.
+ if (areAllUsersVectorized(E)) {
+ // Take credit for instruction that will become dead.
+ if (E->hasOneUse()) {
+ Instruction *Ext = E->user_back();
+ if ((isa<SExtInst>(Ext) || isa<ZExtInst>(Ext)) &&
+ all_of(Ext->users(),
+ [](User *U) { return isa<GetElementPtrInst>(U); })) {
+ // Use getExtractWithExtendCost() to calculate the cost of
+ // extractelement/ext pair.
+ DeadCost -= TTI->getExtractWithExtendCost(
+ Ext->getOpcode(), Ext->getType(), VecTy, i);
+ // Add back the cost of s|zext which is subtracted separately.
+ DeadCost += TTI->getCastInstrCost(
+ Ext->getOpcode(), Ext->getType(), E->getType(), CostKind,
+ Ext);
+ continue;
}
- DeadCost -=
- TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
+ DeadCost -=
+ TTI->getVectorInstrCost(Instruction::ExtractElement, VecTy, i);
}
- return DeadCost;
}
- return ReuseShuffleCost + getGatherCost(VL);
-
+ return DeadCost;
+ }
case Instruction::ZExt:
case Instruction::SExt:
case Instruction::FPToUI:
@@ -3340,7 +3424,8 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::BitCast: {
Type *SrcTy = VL0->getOperand(0)->getType();
int ScalarEltCost =
- TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), ScalarTy, SrcTy, CostKind,
+ VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
@@ -3348,12 +3433,13 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
// Calculate the cost of this instruction.
int ScalarCost = VL.size() * ScalarEltCost;
- VectorType *SrcVecTy = VectorType::get(SrcTy, VL.size());
+ auto *SrcVecTy = FixedVectorType::get(SrcTy, VL.size());
int VecCost = 0;
// Check if the values are candidates to demote.
if (!MinBWs.count(VL0) || VecTy != SrcVecTy) {
VecCost = ReuseShuffleCost +
- TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy, VL0);
+ TTI->getCastInstrCost(E->getOpcode(), VecTy, SrcVecTy,
+ CostKind, VL0);
}
return VecCost - ScalarCost;
}
@@ -3362,13 +3448,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
case Instruction::Select: {
// Calculate the cost of this instruction.
int ScalarEltCost = TTI->getCmpSelInstrCost(E->getOpcode(), ScalarTy,
- Builder.getInt1Ty(), VL0);
+ Builder.getInt1Ty(),
+ CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
- VectorType *MaskTy = VectorType::get(Builder.getInt1Ty(), VL.size());
+ auto *MaskTy = FixedVectorType::get(Builder.getInt1Ty(), VL.size());
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
- int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy, VL0);
+ int VecCost = TTI->getCmpSelInstrCost(E->getOpcode(), VecTy, MaskTy,
+ CostKind, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::FNeg:
@@ -3429,13 +3517,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
SmallVector<const Value *, 4> Operands(VL0->operand_values());
int ScalarEltCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), ScalarTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
+ E->getOpcode(), ScalarTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
+ Operands, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost = TTI->getArithmeticInstrCost(
- E->getOpcode(), VecTy, Op1VK, Op2VK, Op1VP, Op2VP, Operands, VL0);
+ E->getOpcode(), VecTy, CostKind, Op1VK, Op2VK, Op1VP, Op2VP,
+ Operands, VL0);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::GetElementPtr: {
@@ -3445,26 +3535,30 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
TargetTransformInfo::OK_UniformConstantValue;
int ScalarEltCost =
- TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, Op1VK, Op2VK);
+ TTI->getArithmeticInstrCost(Instruction::Add, ScalarTy, CostKind,
+ Op1VK, Op2VK);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCost = VecTy->getNumElements() * ScalarEltCost;
int VecCost =
- TTI->getArithmeticInstrCost(Instruction::Add, VecTy, Op1VK, Op2VK);
+ TTI->getArithmeticInstrCost(Instruction::Add, VecTy, CostKind,
+ Op1VK, Op2VK);
return ReuseShuffleCost + VecCost - ScalarCost;
}
case Instruction::Load: {
// Cost of wide load - cost of scalar loads.
- MaybeAlign alignment(cast<LoadInst>(VL0)->getAlignment());
+ Align alignment = cast<LoadInst>(VL0)->getAlign();
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Load, ScalarTy, alignment, 0,
+ CostKind, VL0);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarLdCost = VecTy->getNumElements() * ScalarEltCost;
int VecLdCost =
- TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Load, VecTy, alignment, 0,
+ CostKind, VL0);
if (!E->ReorderIndices.empty()) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecLdCost += TTI->getShuffleCost(
@@ -3477,14 +3571,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI =
cast<StoreInst>(IsReorder ? VL[E->ReorderIndices.front()] : VL0);
- MaybeAlign Alignment(SI->getAlignment());
+ Align Alignment = SI->getAlign();
int ScalarEltCost =
- TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0, VL0);
+ TTI->getMemoryOpCost(Instruction::Store, ScalarTy, Alignment, 0,
+ CostKind, VL0);
if (NeedToShuffleReuses)
ReuseShuffleCost = -(ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
int ScalarStCost = VecTy->getNumElements() * ScalarEltCost;
int VecStCost = TTI->getMemoryOpCost(Instruction::Store,
- VecTy, Alignment, 0, VL0);
+ VecTy, Alignment, 0, CostKind, VL0);
if (IsReorder) {
// TODO: Merge this shuffle with the ReuseShuffleCost.
VecStCost += TTI->getShuffleCost(
@@ -3497,24 +3592,15 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
// Calculate the cost of the scalar and vector calls.
- SmallVector<Type *, 4> ScalarTys;
- for (unsigned op = 0, opc = CI->getNumArgOperands(); op != opc; ++op)
- ScalarTys.push_back(CI->getArgOperand(op)->getType());
-
- FastMathFlags FMF;
- if (auto *FPMO = dyn_cast<FPMathOperator>(CI))
- FMF = FPMO->getFastMathFlags();
-
- int ScalarEltCost =
- TTI->getIntrinsicInstrCost(ID, ScalarTy, ScalarTys, FMF);
+ IntrinsicCostAttributes CostAttrs(ID, *CI, 1, 1);
+ int ScalarEltCost = TTI->getIntrinsicInstrCost(CostAttrs, CostKind);
if (NeedToShuffleReuses) {
ReuseShuffleCost -= (ReuseShuffleNumbers - VL.size()) * ScalarEltCost;
}
int ScalarCallCost = VecTy->getNumElements() * ScalarEltCost;
- SmallVector<Value *, 4> Args(CI->arg_operands());
- int VecCallCost = TTI->getIntrinsicInstrCost(ID, CI->getType(), Args, FMF,
- VecTy->getNumElements());
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ int VecCallCost = std::min(VecCallCosts.first, VecCallCosts.second);
LLVM_DEBUG(dbgs() << "SLP: Call cost " << VecCallCost - ScalarCallCost
<< " (" << VecCallCost << "-" << ScalarCallCost << ")"
@@ -3533,34 +3619,34 @@ int BoUpSLP::getEntryCost(TreeEntry *E) {
if (NeedToShuffleReuses) {
for (unsigned Idx : E->ReuseShuffleIndices) {
Instruction *I = cast<Instruction>(VL[Idx]);
- ReuseShuffleCost -= TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ReuseShuffleCost -= TTI->getInstructionCost(I, CostKind);
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
- ReuseShuffleCost += TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ReuseShuffleCost += TTI->getInstructionCost(I, CostKind);
}
}
for (Value *V : VL) {
Instruction *I = cast<Instruction>(V);
assert(E->isOpcodeOrAlt(I) && "Unexpected main/alternate opcode");
- ScalarCost += TTI->getInstructionCost(
- I, TargetTransformInfo::TCK_RecipThroughput);
+ ScalarCost += TTI->getInstructionCost(I, CostKind);
}
// VecCost is equal to sum of the cost of creating 2 vectors
// and the cost of creating shuffle.
int VecCost = 0;
if (Instruction::isBinaryOp(E->getOpcode())) {
- VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy);
- VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy);
+ VecCost = TTI->getArithmeticInstrCost(E->getOpcode(), VecTy, CostKind);
+ VecCost += TTI->getArithmeticInstrCost(E->getAltOpcode(), VecTy,
+ CostKind);
} else {
Type *Src0SclTy = E->getMainOp()->getOperand(0)->getType();
Type *Src1SclTy = E->getAltOp()->getOperand(0)->getType();
- VectorType *Src0Ty = VectorType::get(Src0SclTy, VL.size());
- VectorType *Src1Ty = VectorType::get(Src1SclTy, VL.size());
- VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty);
- VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty);
+ auto *Src0Ty = FixedVectorType::get(Src0SclTy, VL.size());
+ auto *Src1Ty = FixedVectorType::get(Src1SclTy, VL.size());
+ VecCost = TTI->getCastInstrCost(E->getOpcode(), VecTy, Src0Ty,
+ CostKind);
+ VecCost += TTI->getCastInstrCost(E->getAltOpcode(), VecTy, Src1Ty,
+ CostKind);
}
VecCost += TTI->getShuffleCost(TargetTransformInfo::SK_Select, VecTy, 0);
return ReuseShuffleCost + VecCost - ScalarCost;
@@ -3596,24 +3682,20 @@ bool BoUpSLP::isFullyVectorizableTinyTree() const {
return true;
}
-bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
- if (RdxOpcode != Instruction::Or)
- return false;
-
- unsigned NumElts = VectorizableTree[0]->Scalars.size();
- Value *FirstReduced = VectorizableTree[0]->Scalars[0];
-
- // Look past the reduction to find a source value. Arbitrarily follow the
+static bool isLoadCombineCandidateImpl(Value *Root, unsigned NumElts,
+ TargetTransformInfo *TTI) {
+ // Look past the root to find a source value. Arbitrarily follow the
// path through operand 0 of any 'or'. Also, peek through optional
// shift-left-by-constant.
- Value *ZextLoad = FirstReduced;
- while (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
- match(ZextLoad, m_Shl(m_Value(), m_Constant())))
+ Value *ZextLoad = Root;
+ while (!isa<ConstantExpr>(ZextLoad) &&
+ (match(ZextLoad, m_Or(m_Value(), m_Value())) ||
+ match(ZextLoad, m_Shl(m_Value(), m_Constant()))))
ZextLoad = cast<BinaryOperator>(ZextLoad)->getOperand(0);
- // Check if the input to the reduction is an extended load.
+ // Check if the input is an extended load of the required or/shift expression.
Value *LoadPtr;
- if (!match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
+ if (ZextLoad == Root || !match(ZextLoad, m_ZExt(m_Load(m_Value(LoadPtr)))))
return false;
// Require that the total load bit width is a legal integer type.
@@ -3621,15 +3703,36 @@ bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
// But <16 x i8> --> i128 is not, so the backend probably can't reduce it.
Type *SrcTy = LoadPtr->getType()->getPointerElementType();
unsigned LoadBitWidth = SrcTy->getIntegerBitWidth() * NumElts;
- LLVMContext &Context = FirstReduced->getContext();
- if (!TTI->isTypeLegal(IntegerType::get(Context, LoadBitWidth)))
+ if (!TTI->isTypeLegal(IntegerType::get(Root->getContext(), LoadBitWidth)))
return false;
// Everything matched - assume that we can fold the whole sequence using
// load combining.
- LLVM_DEBUG(dbgs() << "SLP: Assume load combining for scalar reduction of "
- << *(cast<Instruction>(FirstReduced)) << "\n");
+ LLVM_DEBUG(dbgs() << "SLP: Assume load combining for tree starting at "
+ << *(cast<Instruction>(Root)) << "\n");
+
+ return true;
+}
+
+bool BoUpSLP::isLoadCombineReductionCandidate(unsigned RdxOpcode) const {
+ if (RdxOpcode != Instruction::Or)
+ return false;
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ Value *FirstReduced = VectorizableTree[0]->Scalars[0];
+ return isLoadCombineCandidateImpl(FirstReduced, NumElts, TTI);
+}
+
+bool BoUpSLP::isLoadCombineCandidate() const {
+ // Peek through a final sequence of stores and check if all operations are
+ // likely to be load-combined.
+ unsigned NumElts = VectorizableTree[0]->Scalars.size();
+ for (Value *Scalar : VectorizableTree[0]->Scalars) {
+ Value *X;
+ if (!match(Scalar, m_Store(m_Value(X), m_Value())) ||
+ !isLoadCombineCandidateImpl(X, NumElts, TTI))
+ return false;
+ }
return true;
}
@@ -3712,7 +3815,7 @@ int BoUpSLP::getSpillCost() const {
if (NumCalls) {
SmallVector<Type*, 4> V;
for (auto *II : LiveValues)
- V.push_back(VectorType::get(II->getType(), BundleWidth));
+ V.push_back(FixedVectorType::get(II->getType(), BundleWidth));
Cost += NumCalls * TTI->getCostOfKeepingLiveOverCall(V);
}
@@ -3776,13 +3879,13 @@ int BoUpSLP::getTreeCost() {
// If we plan to rewrite the tree in a smaller type, we will need to sign
// extend the extracted value back to the original type. Here, we account
// for the extract and the added cost of the sign extend if needed.
- auto *VecTy = VectorType::get(EU.Scalar->getType(), BundleWidth);
+ auto *VecTy = FixedVectorType::get(EU.Scalar->getType(), BundleWidth);
auto *ScalarRoot = VectorizableTree[0]->Scalars[0];
if (MinBWs.count(ScalarRoot)) {
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
auto Extend =
MinBWs[ScalarRoot].second ? Instruction::SExt : Instruction::ZExt;
- VecTy = VectorType::get(MinTy, BundleWidth);
+ VecTy = FixedVectorType::get(MinTy, BundleWidth);
ExtractCost += TTI->getExtractWithExtendCost(Extend, EU.Scalar->getType(),
VecTy, EU.Lane);
} else {
@@ -3809,12 +3912,15 @@ int BoUpSLP::getTreeCost() {
return Cost;
}
-int BoUpSLP::getGatherCost(Type *Ty,
+int BoUpSLP::getGatherCost(VectorType *Ty,
const DenseSet<unsigned> &ShuffledIndices) const {
- int Cost = 0;
- for (unsigned i = 0, e = cast<VectorType>(Ty)->getNumElements(); i < e; ++i)
+ unsigned NumElts = Ty->getNumElements();
+ APInt DemandedElts = APInt::getNullValue(NumElts);
+ for (unsigned i = 0; i < NumElts; ++i)
if (!ShuffledIndices.count(i))
- Cost += TTI->getVectorInstrCost(Instruction::InsertElement, Ty, i);
+ DemandedElts.setBit(i);
+ int Cost = TTI->getScalarizationOverhead(Ty, DemandedElts, /*Insert*/ true,
+ /*Extract*/ false);
if (!ShuffledIndices.empty())
Cost += TTI->getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, Ty);
return Cost;
@@ -3825,7 +3931,7 @@ int BoUpSLP::getGatherCost(ArrayRef<Value *> VL) const {
Type *ScalarTy = VL[0]->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL[0]))
ScalarTy = SI->getValueOperand()->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
// Find the cost of inserting/extracting values from the vector.
// Check if the same elements are inserted several times and count them as
// shuffle candidates.
@@ -3965,9 +4071,9 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
V = SV->getOperand(0);
} else {
// Reshuffle to get only unique values.
- SmallVector<unsigned, 4> UniqueIdxs;
- SmallSet<unsigned, 4> UsedIdxs;
- for(unsigned Idx : E->ReuseShuffleIndices)
+ SmallVector<int, 4> UniqueIdxs;
+ SmallSet<int, 4> UsedIdxs;
+ for (int Idx : E->ReuseShuffleIndices)
if (UsedIdxs.insert(Idx).second)
UniqueIdxs.emplace_back(Idx);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
@@ -3984,7 +4090,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
ScalarTy = SI->getValueOperand()->getType();
// Check that every instruction appears once in this bundle.
- SmallVector<unsigned, 4> ReuseShuffleIndicies;
+ SmallVector<int, 4> ReuseShuffleIndicies;
SmallVector<Value *, 4> UniqueValues;
if (VL.size() > 2) {
DenseMap<Value *, unsigned> UniquePositions;
@@ -4002,7 +4108,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
else
VL = UniqueValues;
}
- VectorType *VecTy = VectorType::get(ScalarTy, VL.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, VL.size());
Value *V = Gather(VL, VecTy);
if (!ReuseShuffleIndicies.empty()) {
@@ -4017,7 +4123,7 @@ Value *BoUpSLP::vectorizeTree(ArrayRef<Value *> VL) {
}
static void inversePermutation(ArrayRef<unsigned> Indices,
- SmallVectorImpl<unsigned> &Mask) {
+ SmallVectorImpl<int> &Mask) {
Mask.clear();
const unsigned E = Indices.size();
Mask.resize(E);
@@ -4037,7 +4143,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
Type *ScalarTy = VL0->getType();
if (StoreInst *SI = dyn_cast<StoreInst>(VL0))
ScalarTy = SI->getValueOperand()->getType();
- VectorType *VecTy = VectorType::get(ScalarTy, E->Scalars.size());
+ auto *VecTy = FixedVectorType::get(ScalarTy, E->Scalars.size());
bool NeedToShuffleReuses = !E->ReuseShuffleIndices.empty();
@@ -4056,6 +4162,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
return V;
}
+ assert(E->State == TreeEntry::Vectorize && "Unhandled state");
unsigned ShuffleOrOp =
E->isAltShuffle() ? (unsigned)Instruction::ShuffleVector : E->getOpcode();
switch (ShuffleOrOp) {
@@ -4096,72 +4203,45 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
case Instruction::ExtractElement: {
- if (E->State == TreeEntry::Vectorize) {
- Value *V = E->getSingleOperand(0);
- if (!E->ReorderIndices.empty()) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- if (E->ReorderIndices.empty())
- Builder.SetInsertPoint(VL0);
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = V;
- return V;
+ Value *V = E->getSingleOperand(0);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ Builder.SetInsertPoint(VL0);
+ V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
}
- setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ if (E->ReorderIndices.empty())
+ Builder.SetInsertPoint(VL0);
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
}
E->VectorizedValue = V;
return V;
}
case Instruction::ExtractValue: {
- if (E->State == TreeEntry::Vectorize) {
- LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
- Builder.SetInsertPoint(LI);
- PointerType *PtrTy = PointerType::get(VecTy, LI->getPointerAddressSpace());
- Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
- LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlignment());
- Value *NewV = propagateMetadata(V, E->Scalars);
- if (!E->ReorderIndices.empty()) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
- NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
- "reorder_shuffle");
- }
- if (NeedToShuffleReuses) {
- // TODO: Merge this shuffle with the ReorderShuffleMask.
- NewV = Builder.CreateShuffleVector(
- NewV, UndefValue::get(VecTy), E->ReuseShuffleIndices, "shuffle");
- }
- E->VectorizedValue = NewV;
- return NewV;
+ LoadInst *LI = cast<LoadInst>(E->getSingleOperand(0));
+ Builder.SetInsertPoint(LI);
+ PointerType *PtrTy =
+ PointerType::get(VecTy, LI->getPointerAddressSpace());
+ Value *Ptr = Builder.CreateBitCast(LI->getOperand(0), PtrTy);
+ LoadInst *V = Builder.CreateAlignedLoad(VecTy, Ptr, LI->getAlign());
+ Value *NewV = propagateMetadata(V, E->Scalars);
+ if (!E->ReorderIndices.empty()) {
+ SmallVector<int, 4> Mask;
+ inversePermutation(E->ReorderIndices, Mask);
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy), Mask,
+ "reorder_shuffle");
}
- setInsertPointAfterBundle(E);
- auto *V = Gather(E->Scalars, VecTy);
if (NeedToShuffleReuses) {
- V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
- E->ReuseShuffleIndices, "shuffle");
- if (auto *I = dyn_cast<Instruction>(V)) {
- GatherSeq.insert(I);
- CSEBlocks.insert(I->getParent());
- }
+ // TODO: Merge this shuffle with the ReorderShuffleMask.
+ NewV = Builder.CreateShuffleVector(NewV, UndefValue::get(VecTy),
+ E->ReuseShuffleIndices, "shuffle");
}
- E->VectorizedValue = V;
- return V;
+ E->VectorizedValue = NewV;
+ return NewV;
}
case Instruction::ZExt:
case Instruction::SExt:
@@ -4207,12 +4287,7 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
}
CmpInst::Predicate P0 = cast<CmpInst>(VL0)->getPredicate();
- Value *V;
- if (E->getOpcode() == Instruction::FCmp)
- V = Builder.CreateFCmp(P0, L, R);
- else
- V = Builder.CreateICmp(P0, L, R);
-
+ Value *V = Builder.CreateCmp(P0, L, R);
propagateIRFlags(V, E->Scalars, VL0);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -4321,7 +4396,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
setInsertPointAfterBundle(E);
LoadInst *LI = cast<LoadInst>(VL0);
- Type *ScalarLoadTy = LI->getType();
unsigned AS = LI->getPointerAddressSpace();
Value *VecPtr = Builder.CreateBitCast(LI->getPointerOperand(),
@@ -4334,14 +4408,10 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(PO))
ExternalUses.push_back(ExternalUser(PO, cast<User>(VecPtr), 0));
- MaybeAlign Alignment = MaybeAlign(LI->getAlignment());
- LI = Builder.CreateLoad(VecTy, VecPtr);
- if (!Alignment)
- Alignment = MaybeAlign(DL->getABITypeAlignment(ScalarLoadTy));
- LI->setAlignment(Alignment);
+ LI = Builder.CreateAlignedLoad(VecTy, VecPtr, LI->getAlign());
Value *V = propagateMetadata(LI, E->Scalars);
if (IsReorder) {
- OrdersType Mask;
+ SmallVector<int, 4> Mask;
inversePermutation(E->ReorderIndices, Mask);
V = Builder.CreateShuffleVector(V, UndefValue::get(V->getType()),
Mask, "reorder_shuffle");
@@ -4359,23 +4429,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
bool IsReorder = !E->ReorderIndices.empty();
auto *SI = cast<StoreInst>(
IsReorder ? E->Scalars[E->ReorderIndices.front()] : VL0);
- unsigned Alignment = SI->getAlignment();
unsigned AS = SI->getPointerAddressSpace();
setInsertPointAfterBundle(E);
Value *VecValue = vectorizeTree(E->getOperand(0));
if (IsReorder) {
- OrdersType Mask;
- inversePermutation(E->ReorderIndices, Mask);
+ SmallVector<int, 4> Mask(E->ReorderIndices.begin(),
+ E->ReorderIndices.end());
VecValue = Builder.CreateShuffleVector(
- VecValue, UndefValue::get(VecValue->getType()), E->ReorderIndices,
+ VecValue, UndefValue::get(VecValue->getType()), Mask,
"reorder_shuffle");
}
Value *ScalarPtr = SI->getPointerOperand();
Value *VecPtr = Builder.CreateBitCast(
ScalarPtr, VecValue->getType()->getPointerTo(AS));
- StoreInst *ST = Builder.CreateStore(VecValue, VecPtr);
+ StoreInst *ST = Builder.CreateAlignedStore(VecValue, VecPtr,
+ SI->getAlign());
// The pointer operand uses an in-tree scalar, so add the new BitCast to
// ExternalUses to make sure that an extract will be generated in the
@@ -4383,10 +4453,6 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (getTreeEntry(ScalarPtr))
ExternalUses.push_back(ExternalUser(ScalarPtr, cast<User>(VecPtr), 0));
- if (!Alignment)
- Alignment = DL->getABITypeAlignment(SI->getValueOperand()->getType());
-
- ST->setAlignment(Align(Alignment));
Value *V = propagateMetadata(ST, E->Scalars);
if (NeedToShuffleReuses) {
V = Builder.CreateShuffleVector(V, UndefValue::get(VecTy),
@@ -4445,13 +4511,19 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
if (Function *FI = CI->getCalledFunction())
IID = FI->getIntrinsicID();
+ Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
+
+ auto VecCallCosts = getVectorCallCosts(CI, VecTy, TTI, TLI);
+ bool UseIntrinsic = ID != Intrinsic::not_intrinsic &&
+ VecCallCosts.first <= VecCallCosts.second;
+
Value *ScalarArg = nullptr;
std::vector<Value *> OpVecs;
for (int j = 0, e = CI->getNumArgOperands(); j < e; ++j) {
ValueList OpVL;
// Some intrinsics have scalar arguments. This argument should not be
// vectorized.
- if (hasVectorInstrinsicScalarOpd(IID, j)) {
+ if (UseIntrinsic && hasVectorInstrinsicScalarOpd(IID, j)) {
CallInst *CEI = cast<CallInst>(VL0);
ScalarArg = CEI->getArgOperand(j);
OpVecs.push_back(CEI->getArgOperand(j));
@@ -4463,10 +4535,17 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
OpVecs.push_back(OpVec);
}
- Module *M = F->getParent();
- Intrinsic::ID ID = getVectorIntrinsicIDForCall(CI, TLI);
- Type *Tys[] = { VectorType::get(CI->getType(), E->Scalars.size()) };
- Function *CF = Intrinsic::getDeclaration(M, ID, Tys);
+ Function *CF;
+ if (!UseIntrinsic) {
+ VFShape Shape = VFShape::get(
+ *CI, {static_cast<unsigned>(VecTy->getNumElements()), false},
+ false /*HasGlobalPred*/);
+ CF = VFDatabase(*CI).getVectorizedFunction(Shape);
+ } else {
+ Type *Tys[] = {FixedVectorType::get(CI->getType(), E->Scalars.size())};
+ CF = Intrinsic::getDeclaration(F->getParent(), ID, Tys);
+ }
+
SmallVector<OperandBundleDef, 1> OpBundles;
CI->getOperandBundlesAsDefs(OpBundles);
Value *V = Builder.CreateCall(CF, OpVecs, OpBundles);
@@ -4527,24 +4606,23 @@ Value *BoUpSLP::vectorizeTree(TreeEntry *E) {
// each vector operation.
ValueList OpScalars, AltScalars;
unsigned e = E->Scalars.size();
- SmallVector<Constant *, 8> Mask(e);
+ SmallVector<int, 8> Mask(e);
for (unsigned i = 0; i < e; ++i) {
auto *OpInst = cast<Instruction>(E->Scalars[i]);
assert(E->isOpcodeOrAlt(OpInst) && "Unexpected main/alternate opcode");
if (OpInst->getOpcode() == E->getAltOpcode()) {
- Mask[i] = Builder.getInt32(e + i);
+ Mask[i] = e + i;
AltScalars.push_back(E->Scalars[i]);
} else {
- Mask[i] = Builder.getInt32(i);
+ Mask[i] = i;
OpScalars.push_back(E->Scalars[i]);
}
}
- Value *ShuffleMask = ConstantVector::get(Mask);
propagateIRFlags(V0, OpScalars);
propagateIRFlags(V1, AltScalars);
- Value *V = Builder.CreateShuffleVector(V0, V1, ShuffleMask);
+ Value *V = Builder.CreateShuffleVector(V0, V1, Mask);
if (Instruction *I = dyn_cast<Instruction>(V))
V = propagateMetadata(I, E->Scalars);
if (NeedToShuffleReuses) {
@@ -4586,7 +4664,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
Builder.SetInsertPoint(&*++BasicBlock::iterator(I));
auto BundleWidth = VectorizableTree[0]->Scalars.size();
auto *MinTy = IntegerType::get(F->getContext(), MinBWs[ScalarRoot].first);
- auto *VecTy = VectorType::get(MinTy, BundleWidth);
+ auto *VecTy = FixedVectorType::get(MinTy, BundleWidth);
auto *Trunc = Builder.CreateTrunc(VectorRoot, VecTy);
VectorizableTree[0]->VectorizedValue = Trunc;
}
@@ -4715,6 +4793,7 @@ BoUpSLP::vectorizeTree(ExtraValueToDebugLocsMap &ExternallyUsedValues) {
}
Builder.ClearInsertionPoint();
+ InstrElementSize.clear();
return VectorizableTree[0]->VectorizedValue;
}
@@ -5251,20 +5330,26 @@ void BoUpSLP::scheduleBlock(BlockScheduling *BS) {
BS->ScheduleStart = nullptr;
}
-unsigned BoUpSLP::getVectorElementSize(Value *V) const {
+unsigned BoUpSLP::getVectorElementSize(Value *V) {
// If V is a store, just return the width of the stored value without
// traversing the expression tree. This is the common case.
if (auto *Store = dyn_cast<StoreInst>(V))
return DL->getTypeSizeInBits(Store->getValueOperand()->getType());
+ auto E = InstrElementSize.find(V);
+ if (E != InstrElementSize.end())
+ return E->second;
+
// If V is not a store, we can traverse the expression tree to find loads
// that feed it. The type of the loaded value may indicate a more suitable
// width than V's type. We want to base the vector element size on the width
// of memory operations where possible.
SmallVector<Instruction *, 16> Worklist;
SmallPtrSet<Instruction *, 16> Visited;
- if (auto *I = dyn_cast<Instruction>(V))
+ if (auto *I = dyn_cast<Instruction>(V)) {
Worklist.push_back(I);
+ Visited.insert(I);
+ }
// Traverse the expression tree in bottom-up order looking for loads. If we
// encounter an instruction we don't yet handle, we give up.
@@ -5272,7 +5357,6 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
auto FoundUnknownInst = false;
while (!Worklist.empty() && !FoundUnknownInst) {
auto *I = Worklist.pop_back_val();
- Visited.insert(I);
// We should only be looking at scalar instructions here. If the current
// instruction has a vector type, give up.
@@ -5292,7 +5376,7 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
isa<CmpInst>(I) || isa<SelectInst>(I) || isa<BinaryOperator>(I)) {
for (Use &U : I->operands())
if (auto *J = dyn_cast<Instruction>(U.get()))
- if (!Visited.count(J))
+ if (Visited.insert(J).second)
Worklist.push_back(J);
}
@@ -5301,13 +5385,17 @@ unsigned BoUpSLP::getVectorElementSize(Value *V) const {
FoundUnknownInst = true;
}
+ int Width = MaxWidth;
// If we didn't encounter a memory access in the expression tree, or if we
- // gave up for some reason, just return the width of V.
+ // gave up for some reason, just return the width of V. Otherwise, return the
+ // maximum width we found.
if (!MaxWidth || FoundUnknownInst)
- return DL->getTypeSizeInBits(V->getType());
+ Width = DL->getTypeSizeInBits(V->getType());
- // Otherwise, return the maximum width we found.
- return MaxWidth;
+ for (Instruction *I : Visited)
+ InstrElementSize[I] = Width;
+
+ return Width;
}
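
getVectorElementSize now memoizes its result: every instruction visited during the bottom-up walk is stored in InstrElementSize with the width that was computed, so later queries for any node of the same expression tree become a single map lookup. A standalone sketch of that caching pattern, with a toy Node type and Width field standing in for the real instructions and the DataLayout query:

#include <algorithm>
#include <cstdio>
#include <unordered_map>
#include <unordered_set>
#include <vector>

struct Node {
  int Width;                     // stand-in for DL->getTypeSizeInBits(...)
  std::vector<Node *> Operands;  // stand-in for the instruction's operands
};

static std::unordered_map<const Node *, int> ElementSizeCache;

static int getElementSize(Node *Root) {
  auto It = ElementSizeCache.find(Root);
  if (It != ElementSizeCache.end())
    return It->second;                        // cache hit: O(1)

  std::unordered_set<Node *> Visited{Root};
  std::vector<Node *> Worklist{Root};
  int MaxWidth = 0;
  while (!Worklist.empty()) {
    Node *N = Worklist.back();
    Worklist.pop_back();
    MaxWidth = std::max(MaxWidth, N->Width);
    for (Node *Op : N->Operands)
      if (Visited.insert(Op).second)          // enqueue each operand only once
        Worklist.push_back(Op);
  }
  for (const Node *N : Visited)               // memoize the whole visited subtree
    ElementSizeCache[N] = MaxWidth;
  return MaxWidth;
}

int main() {
  Node Load{32, {}}, Add{64, {&Load}};
  std::printf("%d %d\n", getElementSize(&Add), getElementSize(&Load)); // 64 64
  return 0;
}
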
// Determine if a value V in a vectorizable expression Expr can be demoted to a
@@ -5560,6 +5648,7 @@ struct SLPVectorizer : public FunctionPass {
AU.addRequired<DominatorTreeWrapperPass>();
AU.addRequired<DemandedBitsWrapperPass>();
AU.addRequired<OptimizationRemarkEmitterWrapperPass>();
+ AU.addRequired<InjectTLIMappingsLegacy>();
AU.addPreserved<LoopInfoWrapperPass>();
AU.addPreserved<DominatorTreeWrapperPass>();
AU.addPreserved<AAResultsWrapperPass>();
@@ -5598,6 +5687,8 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
LoopInfo *LI_, DominatorTree *DT_,
AssumptionCache *AC_, DemandedBits *DB_,
OptimizationRemarkEmitter *ORE_) {
+ if (!RunSLPVectorization)
+ return false;
SE = SE_;
TTI = TTI_;
TLI = TLI_;
@@ -5657,7 +5748,6 @@ bool SLPVectorizerPass::runImpl(Function &F, ScalarEvolution *SE_,
if (Changed) {
R.optimizeGatherSequence();
LLVM_DEBUG(dbgs() << "SLP: vectorized \"" << F.getName() << "\"\n");
- LLVM_DEBUG(verifyFunction(F));
}
return Changed;
}
@@ -5688,6 +5778,8 @@ bool SLPVectorizerPass::vectorizeStoreChain(ArrayRef<Value *> Chain, BoUpSLP &R,
}
if (R.isTreeTinyAndNotFullyVectorizable())
return false;
+ if (R.isLoadCombineCandidate())
+ return false;
R.computeMinimumValueSizes();
@@ -5841,37 +5933,28 @@ void SLPVectorizerPass::collectSeedInstructions(BasicBlock *BB) {
bool SLPVectorizerPass::tryToVectorizePair(Value *A, Value *B, BoUpSLP &R) {
if (!A || !B)
return false;
- Value *VL[] = { A, B };
- return tryToVectorizeList(VL, R, /*UserCost=*/0, true);
+ Value *VL[] = {A, B};
+ return tryToVectorizeList(VL, R, /*AllowReorder=*/true);
}
bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
- int UserCost, bool AllowReorder) {
+ bool AllowReorder,
+ ArrayRef<Value *> InsertUses) {
if (VL.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: Trying to vectorize a list of length = "
<< VL.size() << ".\n");
- // Check that all of the parts are scalar instructions of the same type,
+ // Check that all of the parts are instructions of the same type,
// we permit an alternate opcode via InstructionsState.
InstructionsState S = getSameOpcode(VL);
if (!S.getOpcode())
return false;
Instruction *I0 = cast<Instruction>(S.OpValue);
- unsigned Sz = R.getVectorElementSize(I0);
- unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
- unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
- if (MaxVF < 2) {
- R.getORE()->emit([&]() {
- return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
- << "Cannot SLP vectorize list: vectorization factor "
- << "less than 2 is not supported";
- });
- return false;
- }
-
+ // Make sure invalid types (including vector type) are rejected before
+ // determining vectorization factor for scalar instructions.
for (Value *V : VL) {
Type *Ty = V->getType();
if (!isValidElementType(Ty)) {
@@ -5889,16 +5972,35 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
}
}
+ unsigned Sz = R.getVectorElementSize(I0);
+ unsigned MinVF = std::max(2U, R.getMinVecRegSize() / Sz);
+ unsigned MaxVF = std::max<unsigned>(PowerOf2Floor(VL.size()), MinVF);
+ if (MaxVF < 2) {
+ R.getORE()->emit([&]() {
+ return OptimizationRemarkMissed(SV_NAME, "SmallVF", I0)
+ << "Cannot SLP vectorize list: vectorization factor "
+ << "less than 2 is not supported";
+ });
+ return false;
+ }
+
bool Changed = false;
bool CandidateFound = false;
int MinCost = SLPCostThreshold;
+ bool CompensateUseCost =
+ !InsertUses.empty() && llvm::all_of(InsertUses, [](const Value *V) {
+ return V && isa<InsertElementInst>(V);
+ });
+ assert((!CompensateUseCost || InsertUses.size() == VL.size()) &&
+ "Each scalar expected to have an associated InsertElement user.");
+
unsigned NextInst = 0, MaxInst = VL.size();
for (unsigned VF = MaxVF; NextInst + 1 < MaxInst && VF >= MinVF; VF /= 2) {
// No actual vectorization should happen, if number of parts is the same as
// provided vectorization factor (i.e. the scalar type is used for vector
// code during codegen).
- auto *VecTy = VectorType::get(VL[0]->getType(), VF);
+ auto *VecTy = FixedVectorType::get(VL[0]->getType(), VF);
if (TTI->getNumberOfParts(VecTy) == VF)
continue;
for (unsigned I = NextInst; I < MaxInst; ++I) {
@@ -5940,8 +6042,48 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
continue;
R.computeMinimumValueSizes();
- int Cost = R.getTreeCost() - UserCost;
+ int Cost = R.getTreeCost();
CandidateFound = true;
+ if (CompensateUseCost) {
+ // TODO: Use TTI's getScalarizationOverhead for sequence of inserts
+ // rather than sum of single inserts as the latter may overestimate
+ // cost. This work should also imply improving the cost estimation for
+ // extracts that are added for external (for the vectorization tree) users,
+ // i.e. that part should also switch to the same interface.
+ // For example, the following case is the projected code after SLP:
+ // %4 = extractelement <4 x i64> %3, i32 0
+ // %v0 = insertelement <4 x i64> undef, i64 %4, i32 0
+ // %5 = extractelement <4 x i64> %3, i32 1
+ // %v1 = insertelement <4 x i64> %v0, i64 %5, i32 1
+ // %6 = extractelement <4 x i64> %3, i32 2
+ // %v2 = insertelement <4 x i64> %v1, i64 %6, i32 2
+ // %7 = extractelement <4 x i64> %3, i32 3
+ // %v3 = insertelement <4 x i64> %v2, i64 %7, i32 3
+ //
+ // The extracts here are added by SLP in order to feed the users (the
+ // inserts) of the original scalars and contribute to "ExtractCost" during
+ // cost evaluation. The inserts in turn form a sequence that builds an
+ // aggregate, which is detected by the findBuildAggregate routine.
+ // SLP assumes that such a sequence will be optimized away later (by
+ // instcombine), so it tries to compensate ExtractCost with the cost of the
+ // insert sequence.
+ // The current per-element cost calculation approach is not quite accurate
+ // and tends to be biased toward favoring vectorization.
+ // Switching to the TTI interface might help a bit.
+ // An alternative solution could be to pattern-match and detect a no-op or
+ // a shuffle.
+ unsigned UserCost = 0;
+ for (unsigned Lane = 0; Lane < OpsWidth; Lane++) {
+ auto *IE = cast<InsertElementInst>(InsertUses[I + Lane]);
+ if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2)))
+ UserCost += TTI->getVectorInstrCost(
+ Instruction::InsertElement, IE->getType(), CI->getZExtValue());
+ }
+ LLVM_DEBUG(dbgs() << "SLP: Compensate cost of users by: " << UserCost
+ << ".\n");
+ Cost -= UserCost;
+ }
+
MinCost = std::min(MinCost, Cost);
if (Cost < -SLPCostThreshold) {
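
When the scalars being vectorized feed a build-vector sequence, the code above now subtracts the cost of the insertelement users from the tree cost itself instead of receiving a precomputed UserCost. A minimal numeric sketch of that compensation, with made-up per-lane costs standing in for TTI::getVectorInstrCost:

#include <cstdio>
#include <vector>

int main() {
  int TreeCost = 3;                                  // assumed SLP tree cost
  std::vector<int> PerLaneInsertCost = {1, 1, 1, 1}; // assumed per-lane TTI costs
  int UserCost = 0;
  for (int C : PerLaneInsertCost)
    UserCost += C;               // sum of single inserts (may overestimate)
  int Cost = TreeCost - UserCost;
  std::printf("tree=%d user=%d compensated=%d\n", TreeCost, UserCost, Cost);
  // With SLPCostThreshold = 0, the compensated cost of -1 passes the
  // Cost < -SLPCostThreshold check and the tree is considered profitable.
  return 0;
}
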
@@ -6031,24 +6173,23 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
/// <0,2,...> or <1,3,..> while a splitting reduction will generate
/// <2,3, undef,undef> for a vector of 4 and NumElts = 2.
/// \param IsLeft True will generate a mask of even elements, odd otherwise.
-static Value *createRdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
- bool IsPairwise, bool IsLeft,
- IRBuilder<> &Builder) {
+static SmallVector<int, 32> createRdxShuffleMask(unsigned VecLen,
+ unsigned NumEltsToRdx,
+ bool IsPairwise, bool IsLeft) {
assert((IsPairwise || !IsLeft) && "Don't support a <0,1,undef,...> mask");
- SmallVector<Constant *, 32> ShuffleMask(
- VecLen, UndefValue::get(Builder.getInt32Ty()));
+ SmallVector<int, 32> ShuffleMask(VecLen, -1);
if (IsPairwise)
// Build a mask of 0, 2, ... (left) or 1, 3, ... (right).
for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = Builder.getInt32(2 * i + !IsLeft);
+ ShuffleMask[i] = 2 * i + !IsLeft;
else
// Move the upper half of the vector to the lower half.
for (unsigned i = 0; i != NumEltsToRdx; ++i)
- ShuffleMask[i] = Builder.getInt32(NumEltsToRdx + i);
+ ShuffleMask[i] = NumEltsToRdx + i;
- return ConstantVector::get(ShuffleMask);
+ return ShuffleMask;
}
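
createRdxShuffleMask now returns a SmallVector<int> directly, with -1 taking the place of the old undef constants. A standalone re-implementation over std::vector<int>, printed for VecLen = 8 and NumEltsToRdx = 4, makes the mask shapes easy to see (the driver is hypothetical, not the LLVM function itself):

#include <cstdio>
#include <vector>

static std::vector<int> rdxShuffleMask(unsigned VecLen, unsigned NumEltsToRdx,
                                       bool IsPairwise, bool IsLeft) {
  std::vector<int> Mask(VecLen, -1);               // -1 == undef lane
  for (unsigned i = 0; i != NumEltsToRdx; ++i)
    Mask[i] = IsPairwise ? int(2 * i + !IsLeft)    // 0,2,4,... or 1,3,5,...
                         : int(NumEltsToRdx + i);  // move the upper half down
  return Mask;
}

static void dump(const std::vector<int> &M) {
  for (int V : M)
    std::printf("%d ", V);
  std::printf("\n");
}

int main() {
  dump(rdxShuffleMask(8, 4, true, true));   // 0 2 4 6 -1 -1 -1 -1
  dump(rdxShuffleMask(8, 4, true, false));  // 1 3 5 7 -1 -1 -1 -1
  dump(rdxShuffleMask(8, 4, false, false)); // 4 5 6 7 -1 -1 -1 -1
  return 0;
}
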
namespace {
@@ -6840,7 +6981,7 @@ private:
int getReductionCost(TargetTransformInfo *TTI, Value *FirstReducedVal,
unsigned ReduxWidth) {
Type *ScalarTy = FirstReducedVal->getType();
- Type *VecTy = VectorType::get(ScalarTy, ReduxWidth);
+ auto *VecTy = FixedVectorType::get(ScalarTy, ReduxWidth);
int PairwiseRdxCost;
int SplittingRdxCost;
@@ -6857,7 +6998,7 @@ private:
case RK_Max:
case RK_UMin:
case RK_UMax: {
- Type *VecCondTy = CmpInst::makeCmpResultType(VecTy);
+ auto *VecCondTy = cast<VectorType>(CmpInst::makeCmpResultType(VecTy));
bool IsUnsigned = ReductionData.getKind() == RK_UMin ||
ReductionData.getKind() == RK_UMax;
PairwiseRdxCost =
@@ -6922,10 +7063,8 @@ private:
Value *TmpVec = VectorizedValue;
for (unsigned i = ReduxWidth / 2; i != 0; i >>= 1) {
- Value *LeftMask =
- createRdxShuffleMask(ReduxWidth, i, true, true, Builder);
- Value *RightMask =
- createRdxShuffleMask(ReduxWidth, i, true, false, Builder);
+ auto LeftMask = createRdxShuffleMask(ReduxWidth, i, true, true);
+ auto RightMask = createRdxShuffleMask(ReduxWidth, i, true, false);
Value *LeftShuf = Builder.CreateShuffleVector(
TmpVec, UndefValue::get(TmpVec->getType()), LeftMask, "rdx.shuf.l");
@@ -6960,20 +7099,16 @@ private:
/// \return true if it matches.
static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
SmallVectorImpl<Value *> &BuildVectorOpds,
- int &UserCost) {
+ SmallVectorImpl<Value *> &InsertElts) {
assert((isa<InsertElementInst>(LastInsertInst) ||
isa<InsertValueInst>(LastInsertInst)) &&
"Expected insertelement or insertvalue instruction!");
- UserCost = 0;
do {
Value *InsertedOperand;
- if (auto *IE = dyn_cast<InsertElementInst>(LastInsertInst)) {
+ auto *IE = dyn_cast<InsertElementInst>(LastInsertInst);
+ if (IE) {
InsertedOperand = IE->getOperand(1);
LastInsertInst = IE->getOperand(0);
- if (auto *CI = dyn_cast<ConstantInt>(IE->getOperand(2))) {
- UserCost += TTI->getVectorInstrCost(Instruction::InsertElement,
- IE->getType(), CI->getZExtValue());
- }
} else {
auto *IV = cast<InsertValueInst>(LastInsertInst);
InsertedOperand = IV->getInsertedValueOperand();
@@ -6981,16 +7116,17 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
}
if (isa<InsertElementInst>(InsertedOperand) ||
isa<InsertValueInst>(InsertedOperand)) {
- int TmpUserCost;
SmallVector<Value *, 8> TmpBuildVectorOpds;
+ SmallVector<Value *, 8> TmpInsertElts;
if (!findBuildAggregate(InsertedOperand, TTI, TmpBuildVectorOpds,
- TmpUserCost))
+ TmpInsertElts))
return false;
BuildVectorOpds.append(TmpBuildVectorOpds.rbegin(),
TmpBuildVectorOpds.rend());
- UserCost += TmpUserCost;
+ InsertElts.append(TmpInsertElts.rbegin(), TmpInsertElts.rend());
} else {
BuildVectorOpds.push_back(InsertedOperand);
+ InsertElts.push_back(IE);
}
if (isa<UndefValue>(LastInsertInst))
break;
@@ -7000,6 +7136,7 @@ static bool findBuildAggregate(Value *LastInsertInst, TargetTransformInfo *TTI,
return false;
} while (true);
std::reverse(BuildVectorOpds.begin(), BuildVectorOpds.end());
+ std::reverse(InsertElts.begin(), InsertElts.end());
return true;
}
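
findBuildAggregate now collects the matching insertelement/insertvalue instructions alongside the inserted scalars so the caller can cost the users itself. A standalone sketch of that walk over a toy Insert chain (the struct and int scalars are illustrative stand-ins for the IR types): follow the aggregate operand backwards, then reverse both lists into lane order.

#include <algorithm>
#include <cstdio>
#include <vector>

struct Insert {
  Insert *Aggregate; // operand 0: the partially built aggregate (null at the start)
  int Scalar;        // operand 1: the inserted value
};

static bool findBuildAggregate(Insert *Last, std::vector<int> &Opds,
                               std::vector<Insert *> &Inserts) {
  for (Insert *I = Last; I; I = I->Aggregate) { // walk back to the first insert
    Opds.push_back(I->Scalar);
    Inserts.push_back(I);
  }
  std::reverse(Opds.begin(), Opds.end());       // lane 0 first
  std::reverse(Inserts.begin(), Inserts.end());
  return Opds.size() >= 2;                      // need at least two scalars
}

int main() {
  Insert I0{nullptr, 7}, I1{&I0, 8}, I2{&I1, 9}; // builds <7, 8, 9> bottom-up
  std::vector<int> Opds;
  std::vector<Insert *> Inserts;
  if (findBuildAggregate(&I2, Opds, Inserts))
    for (int S : Opds)
      std::printf("%d ", S);                    // prints: 7 8 9
  std::printf("\n");
  return 0;
}
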
@@ -7164,26 +7301,29 @@ bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Value *V,
bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,
BasicBlock *BB, BoUpSLP &R) {
- int UserCost = 0;
const DataLayout &DL = BB->getModule()->getDataLayout();
if (!R.canMapToVector(IVI->getType(), DL))
return false;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, UserCost))
+ SmallVector<Value *, 16> BuildVectorInsts;
+ if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ BuildVectorOpds.size() < 2)
return false;
LLVM_DEBUG(dbgs() << "SLP: array mappable to vector: " << *IVI << "\n");
// Aggregate value is unlikely to be processed in vector register, we need to
// extract scalars into scalar registers, so NeedExtraction is set true.
- return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
BasicBlock *BB, BoUpSLP &R) {
- int UserCost;
+ SmallVector<Value *, 16> BuildVectorInsts;
SmallVector<Value *, 16> BuildVectorOpds;
- if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, UserCost) ||
+ if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+ BuildVectorOpds.size() < 2 ||
(llvm::all_of(BuildVectorOpds,
[](Value *V) { return isa<ExtractElementInst>(V); }) &&
isShuffle(BuildVectorOpds)))
@@ -7191,7 +7331,8 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
// Vectorize starting with the build vector operands ignoring the BuildVector
// instructions for the purpose of scheduling and user extraction.
- return tryToVectorizeList(BuildVectorOpds, R, UserCost);
+ return tryToVectorizeList(BuildVectorOpds, R, /*AllowReorder=*/false,
+ BuildVectorInsts);
}
bool SLPVectorizerPass::vectorizeCmpInst(CmpInst *CI, BasicBlock *BB,
@@ -7228,6 +7369,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
bool Changed = false;
SmallVector<Value *, 4> Incoming;
SmallPtrSet<Value *, 16> VisitedInstrs;
+ unsigned MaxVecRegSize = R.getMaxVecRegSize();
bool HaveVectorizedPhiNodes = true;
while (HaveVectorizedPhiNodes) {
@@ -7254,8 +7396,18 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// Look for the next elements with the same type.
SmallVector<Value *, 4>::iterator SameTypeIt = IncIt;
+ Type *EltTy = (*IncIt)->getType();
+ unsigned EltSize = EltTy->isSized() ? DL->getTypeSizeInBits(EltTy)
+ : MaxVecRegSize;
+ unsigned MaxNumElts = MaxVecRegSize / EltSize;
+ if (MaxNumElts < 2) {
+ ++IncIt;
+ continue;
+ }
+
while (SameTypeIt != E &&
- (*SameTypeIt)->getType() == (*IncIt)->getType()) {
+ (*SameTypeIt)->getType() == EltTy &&
+ (SameTypeIt - IncIt) < MaxNumElts) {
VisitedInstrs.insert(*SameTypeIt);
++SameTypeIt;
}
@@ -7269,8 +7421,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) {
// is done when there are exactly two elements since tryToVectorizeList
// asserts that there are only two values when AllowReorder is true.
bool AllowReorder = NumElts == 2;
- if (NumElts > 1 && tryToVectorizeList(makeArrayRef(IncIt, NumElts), R,
- /*UserCost=*/0, AllowReorder)) {
+ if (NumElts > 1 &&
+ tryToVectorizeList(makeArrayRef(IncIt, NumElts), R, AllowReorder)) {
// Success start over because instructions might have been changed.
HaveVectorizedPhiNodes = true;
Changed = true;
@@ -7370,9 +7522,12 @@ bool SLPVectorizerPass::vectorizeGEPIndices(BasicBlock *BB, BoUpSLP &R) {
<< Entry.second.size() << ".\n");
// Process the GEP list in chunks suitable for the target's supported
- // vector size. If a vector register can't hold 1 element, we are done.
+ // vector size. If a vector register can't hold 1 element, we are done. We
+ // are trying to vectorize the index computations, so the maximum number of
+ // elements is based on the size of the index expression, rather than the
+ // size of the GEP itself (the target's pointer size).
unsigned MaxVecRegSize = R.getMaxVecRegSize();
- unsigned EltSize = R.getVectorElementSize(Entry.second[0]);
+ unsigned EltSize = R.getVectorElementSize(*Entry.second[0]->idx_begin());
if (MaxVecRegSize < EltSize)
continue;
@@ -7475,6 +7630,7 @@ INITIALIZE_PASS_DEPENDENCY(ScalarEvolutionWrapperPass)
INITIALIZE_PASS_DEPENDENCY(LoopSimplify)
INITIALIZE_PASS_DEPENDENCY(DemandedBitsWrapperPass)
INITIALIZE_PASS_DEPENDENCY(OptimizationRemarkEmitterWrapperPass)
+INITIALIZE_PASS_DEPENDENCY(InjectTLIMappingsLegacy)
INITIALIZE_PASS_END(SLPVectorizer, SV_NAME, lv_name, false, false)
Pass *llvm::createSLPVectorizerPass() { return new SLPVectorizer(); }
diff --git a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
index 598fb00e956e..6f055ca80ff2 100644
--- a/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
+++ b/llvm/lib/Transforms/Vectorize/VPRecipeBuilder.h
@@ -18,7 +18,6 @@ namespace llvm {
class LoopVectorizationLegality;
class LoopVectorizationCostModel;
-class TargetTransformInfo;
class TargetLibraryInfo;
/// Helper class to create VPRecipes from IR instructions.
@@ -35,6 +34,8 @@ class VPRecipeBuilder {
/// The profitability analysis.
LoopVectorizationCostModel &CM;
+ PredicatedScalarEvolution &PSE;
+
VPBuilder &Builder;
/// When we if-convert we need to create edge masks. We have to cache values
@@ -49,11 +50,57 @@ class VPRecipeBuilder {
// VPlan-VPlan transformations support: Hold a mapping from ingredients to
// their recipe. To save on memory, only do so for selected ingredients,
- // marked by having a nullptr entry in this map. If those ingredients get a
- // VPWidenRecipe, also avoid compressing other ingredients into it to avoid
- // having to split such recipes later.
+ // marked by having a nullptr entry in this map.
DenseMap<Instruction *, VPRecipeBase *> Ingredient2Recipe;
- VPWidenRecipe *LastExtensibleRecipe = nullptr;
+
+ /// Check if \p I can be widened at the start of \p Range and possibly
+ /// decrease the range such that the returned value holds for the entire \p
+ /// Range. The function should not be called for memory instructions or calls.
+ bool shouldWiden(Instruction *I, VFRange &Range) const;
+
+ /// Check if the load or store instruction \p I should be widened for \p
+ /// Range.Start and potentially masked. Such instructions are handled by a
+ /// recipe that takes an additional VPInstruction for the mask.
+ VPWidenMemoryInstructionRecipe *
+ tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
+
+ /// Check if an induction recipe should be constructed for \p I. If so, build and
+ /// return it. If not, return null.
+ VPWidenIntOrFpInductionRecipe *tryToOptimizeInductionPHI(PHINode *Phi) const;
+
+ /// Optimize the special case where the operand of \p I is a constant integer
+ /// induction variable.
+ VPWidenIntOrFpInductionRecipe *
+ tryToOptimizeInductionTruncate(TruncInst *I, VFRange &Range) const;
+
+ /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
+ /// a sequence of select instructions as the vectorizer currently performs
+ /// full if-conversion.
+ VPBlendRecipe *tryToBlend(PHINode *Phi, VPlanPtr &Plan);
+
+ /// Handle call instructions. If \p CI can be widened for \p Range.Start,
+ /// return a new VPWidenCallRecipe. Range.End may be decreased to ensure the same
+ /// decision from \p Range.Start to \p Range.End.
+ VPWidenCallRecipe *tryToWidenCall(CallInst *CI, VFRange &Range,
+ VPlan &Plan) const;
+
+ /// Check if \p I has an opcode that can be widened and return a VPWidenRecipe
+ /// if it can. The function should only be called if the cost-model indicates
+ /// that widening should be performed.
+ VPWidenRecipe *tryToWiden(Instruction *I, VPlan &Plan) const;
+
+public:
+ VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
+ LoopVectorizationLegality *Legal,
+ LoopVectorizationCostModel &CM,
+ PredicatedScalarEvolution &PSE, VPBuilder &Builder)
+ : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), PSE(PSE),
+ Builder(Builder) {}
+
+ /// Check if a recipe can be created for \p I within the given VF \p Range.
+ /// If a recipe can be created, return it. Otherwise return nullptr.
+ VPRecipeBase *tryToCreateWidenRecipe(Instruction *Instr, VFRange &Range,
+ VPlanPtr &Plan);
/// Set the recipe created for given ingredient. This operation is a no-op for
/// ingredients that were not marked using a nullptr entry in the map.
@@ -65,7 +112,6 @@ class VPRecipeBuilder {
Ingredient2Recipe[I] = R;
}
-public:
/// A helper function that computes the predicate of the block BB, assuming
/// that the header block of the loop is set to True. It returns the *entry*
/// mask for the block BB.
@@ -92,48 +138,11 @@ public:
return Ingredient2Recipe[I];
}
- /// Check if \I is a memory instruction to be widened for \p Range.Start and
- /// potentially masked. Such instructions are handled by a recipe that takes
- /// an additional VPInstruction for the mask.
- VPWidenMemoryInstructionRecipe *
- tryToWidenMemory(Instruction *I, VFRange &Range, VPlanPtr &Plan);
-
- /// Check if an induction recipe should be constructed for \I within the given
- /// VF \p Range. If so build and return it. If not, return null. \p Range.End
- /// may be decreased to ensure same decision from \p Range.Start to
- /// \p Range.End.
- VPWidenIntOrFpInductionRecipe *tryToOptimizeInduction(Instruction *I,
- VFRange &Range);
-
- /// Handle non-loop phi nodes. Currently all such phi nodes are turned into
- /// a sequence of select instructions as the vectorizer currently performs
- /// full if-conversion.
- VPBlendRecipe *tryToBlend(Instruction *I, VPlanPtr &Plan);
-
- /// Check if \p I can be widened within the given VF \p Range. If \p I can be
- /// widened for \p Range.Start, check if the last recipe of \p VPBB can be
- /// extended to include \p I or else build a new VPWidenRecipe for it and
- /// append it to \p VPBB. Return true if \p I can be widened for Range.Start,
- /// false otherwise. Range.End may be decreased to ensure same decision from
- /// \p Range.Start to \p Range.End.
- bool tryToWiden(Instruction *I, VPBasicBlock *VPBB, VFRange &Range);
-
/// Create a replicating region for instruction \p I that requires
/// predication. \p PredRecipe is a VPReplicateRecipe holding \p I.
VPRegionBlock *createReplicateRegion(Instruction *I, VPRecipeBase *PredRecipe,
VPlanPtr &Plan);
-public:
- VPRecipeBuilder(Loop *OrigLoop, const TargetLibraryInfo *TLI,
- LoopVectorizationLegality *Legal,
- LoopVectorizationCostModel &CM, VPBuilder &Builder)
- : OrigLoop(OrigLoop), TLI(TLI), Legal(Legal), CM(CM), Builder(Builder) {}
-
- /// Check if a recipe can be create for \p I withing the given VF \p Range.
- /// If a recipe can be created, it adds it to \p VPBB.
- bool tryToCreateRecipe(Instruction *Instr, VFRange &Range, VPlanPtr &Plan,
- VPBasicBlock *VPBB);
-
/// Build a VPReplicateRecipe for \p I and enclose it within a Region if it
/// is predicated. \return \p VPBB augmented with this new recipe if \p I is
/// not predicated, otherwise \return a new VPBasicBlock that succeeds the new
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index f1c708720ccf..f5f28a3bffa1 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -49,13 +49,46 @@ extern cl::opt<bool> EnableVPlanNativePath;
#define DEBUG_TYPE "vplan"
raw_ostream &llvm::operator<<(raw_ostream &OS, const VPValue &V) {
- if (const VPInstruction *Instr = dyn_cast<VPInstruction>(&V))
- Instr->print(OS);
- else
- V.printAsOperand(OS);
+ const VPInstruction *Instr = dyn_cast<VPInstruction>(&V);
+ VPSlotTracker SlotTracker(
+ (Instr && Instr->getParent()) ? Instr->getParent()->getPlan() : nullptr);
+ V.print(OS, SlotTracker);
return OS;
}
+void VPValue::print(raw_ostream &OS, VPSlotTracker &SlotTracker) const {
+ if (const VPInstruction *Instr = dyn_cast<VPInstruction>(this))
+ Instr->print(OS, SlotTracker);
+ else
+ printAsOperand(OS, SlotTracker);
+}
+
+// Get the top-most entry block of \p Start. This is the entry block of the
+// containing VPlan. This function is templated to support both const and non-const blocks
+template <typename T> static T *getPlanEntry(T *Start) {
+ T *Next = Start;
+ T *Current = Start;
+ while ((Next = Next->getParent()))
+ Current = Next;
+
+ SmallSetVector<T *, 8> WorkList;
+ WorkList.insert(Current);
+
+ for (unsigned i = 0; i < WorkList.size(); i++) {
+ T *Current = WorkList[i];
+ if (Current->getNumPredecessors() == 0)
+ return Current;
+ auto &Predecessors = Current->getPredecessors();
+ WorkList.insert(Predecessors.begin(), Predecessors.end());
+ }
+
+ llvm_unreachable("VPlan without any entry node without predecessors");
+}
+
+VPlan *VPBlockBase::getPlan() { return getPlanEntry(this)->Plan; }
+
+const VPlan *VPBlockBase::getPlan() const { return getPlanEntry(this)->Plan; }
+
/// \return the VPBasicBlock that is the entry of Block, possibly indirectly.
const VPBasicBlock *VPBlockBase::getEntryBasicBlock() const {
const VPBlockBase *Block = this;
@@ -71,6 +104,12 @@ VPBasicBlock *VPBlockBase::getEntryBasicBlock() {
return cast<VPBasicBlock>(Block);
}
+void VPBlockBase::setPlan(VPlan *ParentPlan) {
+ assert(ParentPlan->getEntry() == this &&
+ "Can only set plan on its entry block.");
+ Plan = ParentPlan;
+}
+
/// \return the VPBasicBlock that is the exit of Block, possibly indirectly.
const VPBasicBlock *VPBlockBase::getExitBasicBlock() const {
const VPBlockBase *Block = this;
@@ -341,6 +380,20 @@ void VPInstruction::generateInstruction(VPTransformState &State,
State.set(this, V, Part);
break;
}
+ case VPInstruction::ActiveLaneMask: {
+ // Get first lane of vector induction variable.
+ Value *VIVElem0 = State.get(getOperand(0), {Part, 0});
+ // Get first lane of backedge-taken-count.
+ Value *ScalarBTC = State.get(getOperand(1), {Part, 0});
+
+ auto *Int1Ty = Type::getInt1Ty(Builder.getContext());
+ auto *PredTy = FixedVectorType::get(Int1Ty, State.VF);
+ Instruction *Call = Builder.CreateIntrinsic(
+ Intrinsic::get_active_lane_mask, {PredTy, ScalarBTC->getType()},
+ {VIVElem0, ScalarBTC}, nullptr, "active.lane.mask");
+ State.set(this, Call, Part);
+ break;
+ }
default:
llvm_unreachable("Unsupported opcode for instruction");
}
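
The new ActiveLaneMask opcode emits a call to llvm.get.active.lane.mask on the first lane of the widened IV and the backedge-taken count. Conceptually, lane i of the result is set while IV + i still falls inside the loop, which is what masks off the tail iterations. A standalone numeric sketch with example VF and trip-count values (the comparison against the backedge-taken count is the assumption being illustrated):

#include <cstdio>

int main() {
  const unsigned VF = 4;
  const unsigned long long BTC = 9; // backedge-taken count: 10 iterations total
  for (unsigned long long Base = 0; Base <= BTC; Base += VF) {
    std::printf("iv=%llu mask=", Base);
    for (unsigned i = 0; i < VF; ++i)
      std::printf("%d", (Base + i <= BTC) ? 1 : 0); // active while iv + i <= BTC
    std::printf("\n"); // iv=0 -> 1111, iv=4 -> 1111, iv=8 -> 1100
  }
  return 0;
}
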
@@ -352,15 +405,22 @@ void VPInstruction::execute(VPTransformState &State) {
generateInstruction(State, Part);
}
-void VPInstruction::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"EMIT ";
- print(O);
- O << "\\l\"";
+void VPInstruction::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"EMIT ";
+ print(O, SlotTracker);
}
void VPInstruction::print(raw_ostream &O) const {
- printAsOperand(O);
- O << " = ";
+ VPSlotTracker SlotTracker(getParent()->getPlan());
+ print(O, SlotTracker);
+}
+
+void VPInstruction::print(raw_ostream &O, VPSlotTracker &SlotTracker) const {
+ if (hasResult()) {
+ printAsOperand(O, SlotTracker);
+ O << " = ";
+ }
switch (getOpcode()) {
case VPInstruction::Not:
@@ -375,13 +435,17 @@ void VPInstruction::print(raw_ostream &O) const {
case VPInstruction::SLPStore:
O << "combined store";
break;
+ case VPInstruction::ActiveLaneMask:
+ O << "active lane mask";
+ break;
+
default:
O << Instruction::getOpcodeName(getOpcode());
}
for (const VPValue *Operand : operands()) {
O << " ";
- Operand->printAsOperand(O);
+ Operand->printAsOperand(O, SlotTracker);
}
}
@@ -395,7 +459,11 @@ void VPlan::execute(VPTransformState *State) {
IRBuilder<> Builder(State->CFG.PrevBB->getTerminator());
auto *TCMO = Builder.CreateSub(TC, ConstantInt::get(TC->getType(), 1),
"trip.count.minus.1");
- Value2VPValue[TCMO] = BackedgeTakenCount;
+ auto VF = State->VF;
+ Value *VTCMO =
+ VF == 1 ? TCMO : Builder.CreateVectorSplat(VF, TCMO, "broadcast");
+ for (unsigned Part = 0, UF = State->UF; Part < UF; ++Part)
+ State->set(BackedgeTakenCount, VTCMO, Part);
}
// 0. Set the reverse mapping from VPValues to Values for code generation.
@@ -533,15 +601,10 @@ void VPlanPrinter::dump() {
OS << "graph [labelloc=t, fontsize=30; label=\"Vectorization Plan";
if (!Plan.getName().empty())
OS << "\\n" << DOT::EscapeString(Plan.getName());
- if (!Plan.Value2VPValue.empty() || Plan.BackedgeTakenCount) {
- OS << ", where:";
- if (Plan.BackedgeTakenCount)
- OS << "\\n" << *Plan.BackedgeTakenCount << " := BackedgeTakenCount";
- for (auto Entry : Plan.Value2VPValue) {
- OS << "\\n" << *Entry.second;
- OS << DOT::EscapeString(" := ");
- Entry.first->printAsOperand(OS, false);
- }
+ if (Plan.BackedgeTakenCount) {
+ OS << ", where:\\n";
+ Plan.BackedgeTakenCount->print(OS, SlotTracker);
+ OS << " := BackedgeTakenCount";
}
OS << "\"]\n";
OS << "node [shape=rect, fontname=Courier, fontsize=30]\n";
@@ -605,25 +668,28 @@ void VPlanPrinter::dumpBasicBlock(const VPBasicBlock *BasicBlock) {
if (Pred) {
OS << " +\n" << Indent << " \"BlockPredicate: ";
if (const VPInstruction *PredI = dyn_cast<VPInstruction>(Pred)) {
- PredI->printAsOperand(OS);
+ PredI->printAsOperand(OS, SlotTracker);
OS << " (" << DOT::EscapeString(PredI->getParent()->getName())
<< ")\\l\"";
} else
- Pred->printAsOperand(OS);
+ Pred->printAsOperand(OS, SlotTracker);
}
- for (const VPRecipeBase &Recipe : *BasicBlock)
- Recipe.print(OS, Indent);
+ for (const VPRecipeBase &Recipe : *BasicBlock) {
+ OS << " +\n" << Indent;
+ Recipe.print(OS, Indent, SlotTracker);
+ OS << "\\l\"";
+ }
// Dump the condition bit.
const VPValue *CBV = BasicBlock->getCondBit();
if (CBV) {
OS << " +\n" << Indent << " \"CondBit: ";
if (const VPInstruction *CBI = dyn_cast<VPInstruction>(CBV)) {
- CBI->printAsOperand(OS);
+ CBI->printAsOperand(OS, SlotTracker);
OS << " (" << DOT::EscapeString(CBI->getParent()->getName()) << ")\\l\"";
} else {
- CBV->printAsOperand(OS);
+ CBV->printAsOperand(OS, SlotTracker);
OS << "\"";
}
}
@@ -670,83 +736,121 @@ void VPlanPrinter::printAsIngredient(raw_ostream &O, Value *V) {
O << DOT::EscapeString(IngredientString);
}
-void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN\\l\"";
- for (auto &Instr : make_range(Begin, End))
- O << " +\n" << Indent << "\" " << VPlanIngredient(&Instr) << "\\l\"";
+void VPWidenCallRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-CALL " << VPlanIngredient(&Ingredient);
+}
+
+void VPWidenSelectRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-SELECT" << VPlanIngredient(&Ingredient)
+ << (InvariantCond ? " (condition is loop invariant)" : "");
}
-void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O,
- const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN-INDUCTION";
+void VPWidenRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN\\l\"";
+ O << "\" " << VPlanIngredient(&Ingredient);
+}
+
+void VPWidenIntOrFpInductionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-INDUCTION";
if (Trunc) {
O << "\\l\"";
O << " +\n" << Indent << "\" " << VPlanIngredient(IV) << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(Trunc);
} else
- O << " " << VPlanIngredient(IV) << "\\l\"";
+ O << " " << VPlanIngredient(IV);
}
-void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN-GEP ";
+void VPWidenGEPRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-GEP ";
O << (IsPtrLoopInvariant ? "Inv" : "Var");
size_t IndicesNumber = IsIndexLoopInvariant.size();
for (size_t I = 0; I < IndicesNumber; ++I)
O << "[" << (IsIndexLoopInvariant[I] ? "Inv" : "Var") << "]";
O << "\\l\"";
- O << " +\n" << Indent << "\" " << VPlanIngredient(GEP) << "\\l\"";
+ O << " +\n" << Indent << "\" " << VPlanIngredient(GEP);
}
-void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN-PHI " << VPlanIngredient(Phi) << "\\l\"";
+void VPWidenPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN-PHI " << VPlanIngredient(Phi);
}
-void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n" << Indent << "\"BLEND ";
+void VPBlendRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"BLEND ";
Phi->printAsOperand(O, false);
O << " =";
- if (!User) {
+ if (getNumIncomingValues() == 1) {
// Not a User of any mask: not really blending, this is a
// single-predecessor phi.
O << " ";
- Phi->getIncomingValue(0)->printAsOperand(O, false);
+ getIncomingValue(0)->printAsOperand(O, SlotTracker);
} else {
- for (unsigned I = 0, E = User->getNumOperands(); I < E; ++I) {
+ for (unsigned I = 0, E = getNumIncomingValues(); I < E; ++I) {
O << " ";
- Phi->getIncomingValue(I)->printAsOperand(O, false);
+ getIncomingValue(I)->printAsOperand(O, SlotTracker);
O << "/";
- User->getOperand(I)->printAsOperand(O);
+ getMask(I)->printAsOperand(O, SlotTracker);
}
}
- O << "\\l\"";
}
-void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n"
- << Indent << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
+void VPReplicateRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"" << (IsUniform ? "CLONE " : "REPLICATE ")
<< VPlanIngredient(Ingredient);
if (AlsoPack)
O << " (S->V)";
- O << "\\l\"";
}
-void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent) const {
- O << " +\n"
- << Indent << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst)
- << "\\l\"";
+void VPPredInstPHIRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"PHI-PREDICATED-INSTRUCTION " << VPlanIngredient(PredInst);
}
-void VPWidenMemoryInstructionRecipe::print(raw_ostream &O,
- const Twine &Indent) const {
- O << " +\n" << Indent << "\"WIDEN " << VPlanIngredient(&Instr);
+void VPWidenMemoryInstructionRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"WIDEN " << VPlanIngredient(&Instr);
O << ", ";
- getAddr()->printAsOperand(O);
+ getAddr()->printAsOperand(O, SlotTracker);
VPValue *Mask = getMask();
if (Mask) {
O << ", ";
- Mask->printAsOperand(O);
+ Mask->printAsOperand(O, SlotTracker);
}
- O << "\\l\"";
+}
+
+void VPWidenCanonicalIVRecipe::execute(VPTransformState &State) {
+ Value *CanonicalIV = State.CanonicalIV;
+ Type *STy = CanonicalIV->getType();
+ IRBuilder<> Builder(State.CFG.PrevBB->getTerminator());
+ auto VF = State.VF;
+ Value *VStart = VF == 1
+ ? CanonicalIV
+ : Builder.CreateVectorSplat(VF, CanonicalIV, "broadcast");
+ for (unsigned Part = 0, UF = State.UF; Part < UF; ++Part) {
+ SmallVector<Constant *, 8> Indices;
+ for (unsigned Lane = 0; Lane < VF; ++Lane)
+ Indices.push_back(ConstantInt::get(STy, Part * VF + Lane));
+ // If VF == 1, there is only one iteration in the loop above, thus the
+ // element pushed back into Indices is ConstantInt::get(STy, Part)
+ Constant *VStep = VF == 1 ? Indices.back() : ConstantVector::get(Indices);
+ // Add the consecutive indices to the vector value.
+ Value *CanonicalVectorIV = Builder.CreateAdd(VStart, VStep, "vec.iv");
+ State.set(getVPValue(), CanonicalVectorIV, Part);
+ }
+}
+
+void VPWidenCanonicalIVRecipe::print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const {
+ O << "\"EMIT ";
+ getVPValue()->printAsOperand(O, SlotTracker);
+ O << " = WIDEN-CANONICAL-INDUCTION";
}
template void DomTreeBuilder::Calculate<VPDominatorTree>(VPDominatorTree &DT);
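
VPWidenCanonicalIVRecipe::execute above splats the scalar canonical IV and adds the per-part constant step <Part*VF, ..., Part*VF+VF-1>. A small standalone sketch of the lane values this produces, for example values VF = 4, UF = 2 and a scalar IV of 8:

#include <cstdio>

int main() {
  const unsigned VF = 4, UF = 2;
  const unsigned long long CanonicalIV = 8; // scalar IV at this vector iteration
  for (unsigned Part = 0; Part < UF; ++Part) {
    std::printf("vec.iv part %u:", Part);
    for (unsigned Lane = 0; Lane < VF; ++Lane)
      std::printf(" %llu", CanonicalIV + Part * VF + Lane); // splat + constant step
    std::printf("\n"); // part 0: 8 9 10 11, part 1: 12 13 14 15
  }
  return 0;
}
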
@@ -758,6 +862,21 @@ void VPValue::replaceAllUsesWith(VPValue *New) {
User->setOperand(I, New);
}
+void VPValue::printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const {
+ if (const Value *UV = getUnderlyingValue()) {
+ OS << "ir<";
+ UV->printAsOperand(OS, false);
+ OS << ">";
+ return;
+ }
+
+ unsigned Slot = Tracker.getSlot(this);
+ if (Slot == unsigned(-1))
+ OS << "<badref>";
+ else
+ OS << "vp<%" << Tracker.getSlot(this) << ">";
+}
+
void VPInterleavedAccessInfo::visitRegion(VPRegionBlock *Region,
Old2NewTy &Old2New,
InterleavedAccessInfo &IAI) {
@@ -781,7 +900,7 @@ void VPInterleavedAccessInfo::visitBlock(VPBlockBase *Block, Old2NewTy &Old2New,
auto NewIGIter = Old2New.find(IG);
if (NewIGIter == Old2New.end())
Old2New[IG] = new InterleaveGroup<VPInstruction>(
- IG->getFactor(), IG->isReverse(), Align(IG->getAlignment()));
+ IG->getFactor(), IG->isReverse(), IG->getAlign());
if (Inst == IG->getInsertPos())
Old2New[IG]->setInsertPos(VPInst);
@@ -803,3 +922,57 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
Old2NewTy Old2New;
visitRegion(cast<VPRegionBlock>(Plan.getEntry()), Old2New, IAI);
}
+
+void VPSlotTracker::assignSlot(const VPValue *V) {
+ assert(Slots.find(V) == Slots.end() && "VPValue already has a slot!");
+ const Value *UV = V->getUnderlyingValue();
+ if (UV)
+ return;
+ const auto *VPI = dyn_cast<VPInstruction>(V);
+ if (VPI && !VPI->hasResult())
+ return;
+
+ Slots[V] = NextSlot++;
+}
+
+void VPSlotTracker::assignSlots(const VPBlockBase *VPBB) {
+ if (auto *Region = dyn_cast<VPRegionBlock>(VPBB))
+ assignSlots(Region);
+ else
+ assignSlots(cast<VPBasicBlock>(VPBB));
+}
+
+void VPSlotTracker::assignSlots(const VPRegionBlock *Region) {
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Region->getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
+
+void VPSlotTracker::assignSlots(const VPBasicBlock *VPBB) {
+ for (const VPRecipeBase &Recipe : *VPBB) {
+ if (const auto *VPI = dyn_cast<VPInstruction>(&Recipe))
+ assignSlot(VPI);
+ else if (const auto *VPIV = dyn_cast<VPWidenCanonicalIVRecipe>(&Recipe))
+ assignSlot(VPIV->getVPValue());
+ }
+}
+
+void VPSlotTracker::assignSlots(const VPlan &Plan) {
+
+ for (const VPValue *V : Plan.VPExternalDefs)
+ assignSlot(V);
+
+ for (auto &E : Plan.Value2VPValue)
+ if (!isa<VPInstruction>(E.second))
+ assignSlot(E.second);
+
+ for (const VPValue *V : Plan.VPCBVs)
+ assignSlot(V);
+
+ if (Plan.BackedgeTakenCount)
+ assignSlot(Plan.BackedgeTakenCount);
+
+ ReversePostOrderTraversal<const VPBlockBase *> RPOT(Plan.getEntry());
+ for (const VPBlockBase *Block : RPOT)
+ assignSlots(Block);
+}
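
With VPSlotTracker in place, operands print either as ir<...> when a VPValue wraps an underlying IR value, as vp<%N> when they have been assigned a slot, or as <badref> otherwise. A standalone sketch of that dispatch (FakeVPValue and the Slots map are illustrative stand-ins, not LLVM types):

#include <cstdio>
#include <string>
#include <unordered_map>

struct FakeVPValue {
  std::string UnderlyingIRName; // empty means: no underlying IR value
};

static std::string printAsOperand(
    const FakeVPValue &V,
    const std::unordered_map<const FakeVPValue *, unsigned> &Slots) {
  if (!V.UnderlyingIRName.empty())
    return "ir<" + V.UnderlyingIRName + ">";   // backed by an IR value
  auto It = Slots.find(&V);
  if (It == Slots.end())
    return "<badref>";                         // never assigned a slot
  return "vp<%" + std::to_string(It->second) + ">";
}

int main() {
  FakeVPValue A{"%x"}, B{}, C{};
  std::unordered_map<const FakeVPValue *, unsigned> Slots{{&B, 0}};
  std::printf("%s %s %s\n", printAsOperand(A, Slots).c_str(),
              printAsOperand(B, Slots).c_str(),
              printAsOperand(C, Slots).c_str()); // ir<%x> vp<%0> <badref>
  return 0;
}
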
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index c65abc3639d7..f07c94e7a3c7 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -48,8 +48,6 @@
namespace llvm {
-class LoopVectorizationLegality;
-class LoopVectorizationCostModel;
class BasicBlock;
class DominatorTree;
class InnerLoopVectorizer;
@@ -59,6 +57,7 @@ class raw_ostream;
class Value;
class VPBasicBlock;
class VPRegionBlock;
+class VPSlotTracker;
class VPlan;
class VPlanSlp;
@@ -271,10 +270,20 @@ struct VPTransformState {
return Callback.getOrCreateVectorValues(VPValue2Value[Def], Part);
}
- /// Get the generated Value for a given VPValue and given Part and Lane. Note
- /// that as per-lane Defs are still created by ILV and managed in its ValueMap
- /// this method currently just delegates the call to ILV.
+ /// Get the generated Value for a given VPValue and given Part and Lane.
Value *get(VPValue *Def, const VPIteration &Instance) {
+ // If the Def is managed directly by VPTransformState, extract the lane from
+ // the relevant part. Note that currently only VPInstructions and external
+ // defs are managed by VPTransformState. Other Defs are still created by ILV
+ // and managed in its ValueMap. For those this method currently just
+ // delegates the call to ILV below.
+ if (Data.PerPartOutput.count(Def)) {
+ auto *VecPart = Data.PerPartOutput[Def][Instance.Part];
+ // TODO: Cache created scalar values.
+ return Builder.CreateExtractElement(VecPart,
+ Builder.getInt32(Instance.Lane));
+ }
+
return Callback.getOrCreateScalarValue(VPValue2Value[Def], Instance);
}
@@ -329,6 +338,9 @@ struct VPTransformState {
/// Values they correspond to.
VPValue2ValueTy VPValue2Value;
+ /// Hold the canonical scalar IV of the vector loop (start=0, step=VF*UF).
+ Value *CanonicalIV = nullptr;
+
/// Hold the trip count of the scalar loop.
Value *TripCount = nullptr;
@@ -343,7 +355,6 @@ struct VPTransformState {
class VPBlockBase {
friend class VPBlockUtils;
-private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
/// An optional name for the block.
@@ -365,6 +376,10 @@ private:
/// Current block predicate - null if the block does not need a predicate.
VPValue *Predicate = nullptr;
+ /// VPlan containing the block. Can only be set on the entry block of the
+ /// plan.
+ VPlan *Plan = nullptr;
+
/// Add \p Successor as the last successor to this block.
void appendSuccessor(VPBlockBase *Successor) {
assert(Successor && "Cannot add nullptr successor!");
@@ -418,6 +433,14 @@ public:
VPRegionBlock *getParent() { return Parent; }
const VPRegionBlock *getParent() const { return Parent; }
+ /// \return A pointer to the plan containing the current block.
+ VPlan *getPlan();
+ const VPlan *getPlan() const;
+
+ /// Sets the pointer of the plan containing the block. The block must be the
+ /// entry block into the VPlan.
+ void setPlan(VPlan *ParentPlan);
+
void setParent(VPRegionBlock *P) { Parent = P; }
/// \return the VPBasicBlock that is the entry of this VPBlockBase,
@@ -579,7 +602,6 @@ class VPRecipeBase : public ilist_node_with_parent<VPRecipeBase, VPBasicBlock> {
friend VPBasicBlock;
friend class VPBlockUtils;
-private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
/// Each VPRecipe belongs to a single VPBasicBlock.
@@ -597,11 +619,14 @@ public:
VPInterleaveSC,
VPPredInstPHISC,
VPReplicateSC,
+ VPWidenCallSC,
+ VPWidenCanonicalIVSC,
VPWidenGEPSC,
VPWidenIntOrFpInductionSC,
VPWidenMemoryInstructionSC,
VPWidenPHISC,
VPWidenSC,
+ VPWidenSelectSC
};
VPRecipeBase(const unsigned char SC) : SubclassID(SC) {}
@@ -621,7 +646,8 @@ public:
virtual void execute(struct VPTransformState &State) = 0;
/// Each recipe prints itself.
- virtual void print(raw_ostream &O, const Twine &Indent) const = 0;
+ virtual void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const = 0;
/// Insert an unlinked recipe into a basic block immediately before
/// the specified recipe.
@@ -659,6 +685,7 @@ public:
ICmpULE,
SLPLoad,
SLPStore,
+ ActiveLaneMask,
};
private:
@@ -707,10 +734,12 @@ public:
void execute(VPTransformState &State) override;
/// Print the Recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
/// Print the VPInstruction.
void print(raw_ostream &O) const;
+ void print(raw_ostream &O, VPSlotTracker &SlotTracker) const;
/// Return true if this instruction may modify memory.
bool mayWriteToMemory() const {
@@ -719,23 +748,42 @@ public:
return Opcode == Instruction::Store || Opcode == Instruction::Call ||
Opcode == Instruction::Invoke || Opcode == SLPStore;
}
+
+ bool hasResult() const {
+ // CallInst may or may not have a result, depending on the called function.
+ // Conservatively assume that calls have results for now.
+ switch (getOpcode()) {
+ case Instruction::Ret:
+ case Instruction::Br:
+ case Instruction::Store:
+ case Instruction::Switch:
+ case Instruction::IndirectBr:
+ case Instruction::Resume:
+ case Instruction::CatchRet:
+ case Instruction::Unreachable:
+ case Instruction::Fence:
+ case Instruction::AtomicRMW:
+ return false;
+ default:
+ return true;
+ }
+ }
};
-/// VPWidenRecipe is a recipe for producing a copy of vector type for each
-/// Instruction in its ingredients independently, in order. This recipe covers
-/// most of the traditional vectorization cases where each ingredient transforms
-/// into a vectorized version of itself.
+/// VPWidenRecipe is a recipe for producing a copy of vector type for its
+/// ingredient. This recipe covers most of the traditional vectorization cases
+/// where each ingredient transforms into a vectorized version of itself.
class VPWidenRecipe : public VPRecipeBase {
-private:
- /// Hold the ingredients by pointing to their original BasicBlock location.
- BasicBlock::iterator Begin;
- BasicBlock::iterator End;
+ /// Hold the instruction to be widened.
+ Instruction &Ingredient;
+
+ /// Hold VPValues for the operands of the ingredient.
+ VPUser User;
public:
- VPWidenRecipe(Instruction *I) : VPRecipeBase(VPWidenSC) {
- End = I->getIterator();
- Begin = End++;
- }
+ template <typename IterT>
+ VPWidenRecipe(Instruction &I, iterator_range<IterT> Operands)
+ : VPRecipeBase(VPWidenSC), Ingredient(I), User(Operands) {}
~VPWidenRecipe() override = default;
@@ -747,28 +795,88 @@ public:
/// Produce widened copies of all Ingredients.
void execute(VPTransformState &State) override;
- /// Augment the recipe to include Instr, if it lies at its End.
- bool appendInstruction(Instruction *Instr) {
- if (End != Instr->getIterator())
- return false;
- End++;
- return true;
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening Call instructions.
+class VPWidenCallRecipe : public VPRecipeBase {
+ /// Hold the call to be widened.
+ CallInst &Ingredient;
+
+ /// Hold VPValues for the arguments of the call.
+ VPUser User;
+
+public:
+ template <typename IterT>
+ VPWidenCallRecipe(CallInst &I, iterator_range<IterT> CallArguments)
+ : VPRecipeBase(VPWidenCallSC), Ingredient(I), User(CallArguments) {}
+
+ ~VPWidenCallRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenCallSC;
}
+ /// Produce a widened version of the call instruction.
+ void execute(VPTransformState &State) override;
+
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A recipe for widening select instructions.
+class VPWidenSelectRecipe : public VPRecipeBase {
+private:
+ /// Hold the select to be widened.
+ SelectInst &Ingredient;
+
+ /// Hold VPValues for the operands of the select.
+ VPUser User;
+
+ /// Is the condition of the select loop invariant?
+ bool InvariantCond;
+
+public:
+ template <typename IterT>
+ VPWidenSelectRecipe(SelectInst &I, iterator_range<IterT> Operands,
+ bool InvariantCond)
+ : VPRecipeBase(VPWidenSelectSC), Ingredient(I), User(Operands),
+ InvariantCond(InvariantCond) {}
+
+ ~VPWidenSelectRecipe() override = default;
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenSelectSC;
+ }
+
+ /// Produce a widened version of the select instruction.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for handling GEP instructions.
class VPWidenGEPRecipe : public VPRecipeBase {
-private:
GetElementPtrInst *GEP;
+
+ /// Hold VPValues for the base and indices of the GEP.
+ VPUser User;
+
bool IsPtrLoopInvariant;
SmallBitVector IsIndexLoopInvariant;
public:
- VPWidenGEPRecipe(GetElementPtrInst *GEP, Loop *OrigLoop)
- : VPRecipeBase(VPWidenGEPSC), GEP(GEP),
+ template <typename IterT>
+ VPWidenGEPRecipe(GetElementPtrInst *GEP, iterator_range<IterT> Operands,
+ Loop *OrigLoop)
+ : VPRecipeBase(VPWidenGEPSC), GEP(GEP), User(Operands),
IsIndexLoopInvariant(GEP->getNumIndices(), false) {
IsPtrLoopInvariant = OrigLoop->isLoopInvariant(GEP->getPointerOperand());
for (auto Index : enumerate(GEP->indices()))
@@ -786,13 +894,13 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for handling phi nodes of integer and floating-point inductions,
/// producing their vector and scalar values.
class VPWidenIntOrFpInductionRecipe : public VPRecipeBase {
-private:
PHINode *IV;
TruncInst *Trunc;
@@ -811,12 +919,12 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for handling all phi nodes except for integer and FP inductions.
class VPWidenPHIRecipe : public VPRecipeBase {
-private:
PHINode *Phi;
public:
@@ -832,26 +940,27 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for vectorizing a phi-node as a sequence of mask-based select
/// instructions.
class VPBlendRecipe : public VPRecipeBase {
-private:
PHINode *Phi;
- /// The blend operation is a User of a mask, if not null.
- std::unique_ptr<VPUser> User;
+ /// The blend operation is a User of the incoming values and of their
+ /// respective masks, ordered [I0, M0, I1, M1, ...]. Note that a single value
+ /// might be incoming with a full mask for which there is no VPValue.
+ VPUser User;
public:
- VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Masks)
- : VPRecipeBase(VPBlendSC), Phi(Phi) {
- assert((Phi->getNumIncomingValues() == 1 ||
- Phi->getNumIncomingValues() == Masks.size()) &&
- "Expected the same number of incoming values and masks");
- if (!Masks.empty())
- User.reset(new VPUser(Masks));
+ VPBlendRecipe(PHINode *Phi, ArrayRef<VPValue *> Operands)
+ : VPRecipeBase(VPBlendSC), Phi(Phi), User(Operands) {
+ assert(Operands.size() > 0 &&
+ ((Operands.size() == 1) || (Operands.size() % 2 == 0)) &&
+ "Expected either a single incoming value or a positive even number "
+ "of operands");
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -859,17 +968,31 @@ public:
return V->getVPRecipeID() == VPRecipeBase::VPBlendSC;
}
+ /// Return the number of incoming values, taking into account that a single
+ /// incoming value has no mask.
+ unsigned getNumIncomingValues() const {
+ return (User.getNumOperands() + 1) / 2;
+ }
+
+ /// Return incoming value number \p Idx.
+ VPValue *getIncomingValue(unsigned Idx) const {
+ return User.getOperand(Idx * 2);
+ }
+
+ /// Return mask number \p Idx.
+ VPValue *getMask(unsigned Idx) const { return User.getOperand(Idx * 2 + 1); }
+
/// Generate the phi/select nodes.
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// VPInterleaveRecipe is a recipe for transforming an interleave group of load
/// or stores into one wide load/store and shuffles.
class VPInterleaveRecipe : public VPRecipeBase {
-private:
const InterleaveGroup<Instruction> *IG;
VPUser User;
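
The reworked VPBlendRecipe above stores incoming values and masks interleaved as [I0, M0, I1, M1, ...], with a lone unmasked value as the single-operand special case. A standalone sketch of that indexing scheme, using plain ints in place of VPValues:

#include <cassert>
#include <vector>

struct BlendOperands {
  std::vector<int> Ops; // I0, M0, I1, M1, ... or a single unmasked I0
  unsigned numIncoming() const { return unsigned((Ops.size() + 1) / 2); }
  int incoming(unsigned Idx) const { return Ops[Idx * 2]; }     // value slot
  int mask(unsigned Idx) const { return Ops[Idx * 2 + 1]; }     // mask slot
};

int main() {
  BlendOperands B{{10, 100, 11, 101}};  // two incoming values with masks
  assert(B.numIncoming() == 2);
  assert(B.incoming(1) == 11 && B.mask(1) == 101);
  BlendOperands Single{{42}};           // single incoming value, no mask
  assert(Single.numIncoming() == 1 && Single.incoming(0) == 42);
  return 0;
}
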
@@ -903,7 +1026,8 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
const InterleaveGroup<Instruction> *getInterleaveGroup() { return IG; }
};
@@ -913,10 +1037,12 @@ public:
/// single copy of widened type for all lanes. If the instruction is known to be
/// uniform only one copy, per lane zero, will be generated.
class VPReplicateRecipe : public VPRecipeBase {
-private:
/// The instruction being replicated.
Instruction *Ingredient;
+ /// Hold VPValues for the operands of the ingredient.
+ VPUser User;
+
/// Indicator if only a single replica per lane is needed.
bool IsUniform;
@@ -927,9 +1053,11 @@ private:
bool AlsoPack;
public:
- VPReplicateRecipe(Instruction *I, bool IsUniform, bool IsPredicated = false)
- : VPRecipeBase(VPReplicateSC), Ingredient(I), IsUniform(IsUniform),
- IsPredicated(IsPredicated) {
+ template <typename IterT>
+ VPReplicateRecipe(Instruction *I, iterator_range<IterT> Operands,
+ bool IsUniform, bool IsPredicated = false)
+ : VPRecipeBase(VPReplicateSC), Ingredient(I), User(Operands),
+ IsUniform(IsUniform), IsPredicated(IsPredicated) {
// Retain the previous behavior of predicateInstructions(), where an
// insert-element of a predicated instruction got hoisted into the
// predicated basic block iff it was its only user. This is achieved by
@@ -953,18 +1081,18 @@ public:
void setAlsoPack(bool Pack) { AlsoPack = Pack; }
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A recipe for generating conditional branches on the bits of a mask.
class VPBranchOnMaskRecipe : public VPRecipeBase {
-private:
- std::unique_ptr<VPUser> User;
+ VPUser User;
public:
VPBranchOnMaskRecipe(VPValue *BlockInMask) : VPRecipeBase(VPBranchOnMaskSC) {
if (BlockInMask) // nullptr means all-one mask.
- User.reset(new VPUser({BlockInMask}));
+ User.addOperand(BlockInMask);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -977,14 +1105,23 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override {
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override {
O << " +\n" << Indent << "\"BRANCH-ON-MASK ";
- if (User)
- O << *User->getOperand(0);
+ if (VPValue *Mask = getMask())
+ Mask->print(O, SlotTracker);
else
O << " All-One";
O << "\\l\"";
}
+
+ /// Return the mask used by this recipe. Note that a full mask is represented
+ /// by a nullptr.
+ VPValue *getMask() const {
+ assert(User.getNumOperands() <= 1 && "should have either 0 or 1 operands");
+ // Mask is optional.
+ return User.getNumOperands() == 1 ? User.getOperand(0) : nullptr;
+ }
};
/// VPPredInstPHIRecipe is a recipe for generating the phi nodes needed when
@@ -993,7 +1130,6 @@ public:
/// The phi nodes can be scalar or vector depending on the users of the value.
/// This recipe works in concert with VPBranchOnMaskRecipe.
class VPPredInstPHIRecipe : public VPRecipeBase {
-private:
Instruction *PredInst;
public:
@@ -1012,23 +1148,42 @@ public:
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// A Recipe for widening load/store operations.
+/// The recipe uses the following VPValues:
+/// - For load: Address, optional mask
+/// - For store: Address, stored value, optional mask
/// TODO: We currently execute only per-part unless a specific instance is
/// provided.
class VPWidenMemoryInstructionRecipe : public VPRecipeBase {
-private:
Instruction &Instr;
VPUser User;
+ void setMask(VPValue *Mask) {
+ if (!Mask)
+ return;
+ User.addOperand(Mask);
+ }
+
+ bool isMasked() const {
+ return (isa<LoadInst>(Instr) && User.getNumOperands() == 2) ||
+ (isa<StoreInst>(Instr) && User.getNumOperands() == 3);
+ }
+
public:
- VPWidenMemoryInstructionRecipe(Instruction &Instr, VPValue *Addr,
- VPValue *Mask)
- : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Instr), User({Addr}) {
- if (Mask)
- User.addOperand(Mask);
+ VPWidenMemoryInstructionRecipe(LoadInst &Load, VPValue *Addr, VPValue *Mask)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Load), User({Addr}) {
+ setMask(Mask);
+ }
+
+ VPWidenMemoryInstructionRecipe(StoreInst &Store, VPValue *Addr,
+ VPValue *StoredValue, VPValue *Mask)
+ : VPRecipeBase(VPWidenMemoryInstructionSC), Instr(Store),
+ User({Addr, StoredValue}) {
+ setMask(Mask);
}
/// Method to support type inquiry through isa, cast, and dyn_cast.
@@ -1044,15 +1199,52 @@ public:
/// Return the mask used by this recipe. Note that a full mask is represented
/// by a nullptr.
VPValue *getMask() const {
- // Mask is optional and therefore the last, currently 2nd operand.
- return User.getNumOperands() == 2 ? User.getOperand(1) : nullptr;
+ // Mask is optional and therefore the last operand.
+ return isMasked() ? User.getOperand(User.getNumOperands() - 1) : nullptr;
+ }
+
+ /// Return the value stored by this recipe.
+ VPValue *getStoredValue() const {
+ assert(isa<StoreInst>(Instr) &&
+ "Stored value only available for store instructions");
+ return User.getOperand(1); // Stored value is the 2nd, mandatory operand.
}
/// Generate the wide load/store.
void execute(VPTransformState &State) override;
/// Print the recipe.
- void print(raw_ostream &O, const Twine &Indent) const override;
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
+};
+
+/// A Recipe for widening the canonical induction variable of the vector loop.
+class VPWidenCanonicalIVRecipe : public VPRecipeBase {
+ /// A VPValue representing the canonical vector IV.
+ VPValue Val;
+
+public:
+ VPWidenCanonicalIVRecipe() : VPRecipeBase(VPWidenCanonicalIVSC) {}
+ ~VPWidenCanonicalIVRecipe() override = default;
+
+ /// Return the VPValue representing the canonical vector induction variable of
+ /// the vector loop.
+ const VPValue *getVPValue() const { return &Val; }
+ VPValue *getVPValue() { return &Val; }
+
+ /// Method to support type inquiry through isa, cast, and dyn_cast.
+ static inline bool classof(const VPRecipeBase *V) {
+ return V->getVPRecipeID() == VPRecipeBase::VPWidenCanonicalIVSC;
+ }
+
+ /// Generate a canonical vector induction variable of the vector loop, with
+ /// start = {<Part*VF, Part*VF+1, ..., Part*VF+VF-1> for 0 <= Part < UF}, and
+ /// step = <VF*UF, VF*UF, ..., VF*UF>.
+ void execute(VPTransformState &State) override;
+
+ /// Print the recipe.
+ void print(raw_ostream &O, const Twine &Indent,
+ VPSlotTracker &SlotTracker) const override;
};
/// VPBasicBlock serves as the leaf of the Hierarchical Control-Flow Graph. It
@@ -1144,7 +1336,6 @@ private:
/// candidate VF's. The actual replication takes place only once the desired VF
/// and UF have been determined.
class VPRegionBlock : public VPBlockBase {
-private:
/// Hold the Single Entry of the SESE region modelled by the VPRegionBlock.
VPBlockBase *Entry;
@@ -1347,8 +1538,8 @@ struct GraphTraits<Inverse<VPRegionBlock *>>
/// VPBlock.
class VPlan {
friend class VPlanPrinter;
+ friend class VPSlotTracker;
-private:
/// Hold the single entry to the Hierarchical CFG of the VPlan.
VPBlockBase *Entry;
@@ -1380,16 +1571,18 @@ private:
SmallVector<VPValue *, 4> VPCBVs;
public:
- VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {}
+ VPlan(VPBlockBase *Entry = nullptr) : Entry(Entry) {
+ if (Entry)
+ Entry->setPlan(this);
+ }
~VPlan() {
if (Entry)
VPBlockBase::deleteCFG(Entry);
for (auto &MapEntry : Value2VPValue)
- if (MapEntry.second != BackedgeTakenCount)
- delete MapEntry.second;
+ delete MapEntry.second;
if (BackedgeTakenCount)
- delete BackedgeTakenCount; // Delete once, if in Value2VPValue or not.
+ delete BackedgeTakenCount;
for (VPValue *Def : VPExternalDefs)
delete Def;
for (VPValue *CBV : VPCBVs)
@@ -1402,7 +1595,11 @@ public:
VPBlockBase *getEntry() { return Entry; }
const VPBlockBase *getEntry() const { return Entry; }
- VPBlockBase *setEntry(VPBlockBase *Block) { return Entry = Block; }
+ VPBlockBase *setEntry(VPBlockBase *Block) {
+ Entry = Block;
+ Block->setPlan(this);
+ return Entry;
+ }
/// The backedge taken count of the original loop.
VPValue *getOrCreateBackedgeTakenCount() {
@@ -1433,7 +1630,7 @@ public:
void addVPValue(Value *V) {
assert(V && "Trying to add a null Value to VPlan");
assert(!Value2VPValue.count(V) && "Value already exists in VPlan");
- Value2VPValue[V] = new VPValue();
+ Value2VPValue[V] = new VPValue(V);
}
VPValue *getVPValue(Value *V) {
@@ -1456,6 +1653,16 @@ public:
/// Dump the plan to stderr (for debugging).
void dump() const;
+ /// Returns a range mapping the values in the range \p Operands to their
+ /// corresponding VPValues.
+ iterator_range<mapped_iterator<Use *, std::function<VPValue *(Value *)>>>
+ mapToVPValues(User::op_range Operands) {
+ std::function<VPValue *(Value *)> Fn = [this](Value *Op) {
+ return getOrAddVPValue(Op);
+ };
+ return map_range(Operands, Fn);
+ }
+
private:
/// Add to the given dominator tree the header block and every new basic block
/// that was created between it and the latch block, inclusive.
@@ -1480,7 +1687,10 @@ private:
unsigned BID = 0;
SmallDenseMap<const VPBlockBase *, unsigned> BlockID;
- VPlanPrinter(raw_ostream &O, const VPlan &P) : OS(O), Plan(P) {}
+ VPSlotTracker SlotTracker;
+
+ VPlanPrinter(raw_ostream &O, const VPlan &P)
+ : OS(O), Plan(P), SlotTracker(&P) {}
/// Handle indentation.
void bumpIndent(int b) { Indent = std::string((Depth += b) * TabWidth, ' '); }
@@ -1635,7 +1845,6 @@ public:
};
class VPInterleavedAccessInfo {
-private:
DenseMap<VPInstruction *, InterleaveGroup<VPInstruction> *>
InterleaveGroupMap;
@@ -1679,7 +1888,6 @@ public:
/// Class that maps (parts of) an existing VPlan to trees of combined
/// VPInstructions.
class VPlanSlp {
-private:
enum class OpMode { Failed, Load, Opcode };
/// A DenseMapInfo implementation for using SmallVector<VPValue *, 4> as
diff --git a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
index 19f5d2c00c60..a42ebc9ee955 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanDominatorTree.h
@@ -30,7 +30,8 @@ using VPDomTreeNode = DomTreeNodeBase<VPBlockBase>;
/// Template specializations of GraphTraits for VPDomTreeNode.
template <>
struct GraphTraits<VPDomTreeNode *>
- : public DomTreeGraphTraitsBase<VPDomTreeNode, VPDomTreeNode::iterator> {};
+ : public DomTreeGraphTraitsBase<VPDomTreeNode,
+ VPDomTreeNode::const_iterator> {};
template <>
struct GraphTraits<const VPDomTreeNode *>
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 3f6a2efd55cc..3a4872a72122 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -18,7 +18,7 @@ using namespace llvm;
void VPlanTransforms::VPInstructionsToVPRecipes(
Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList *Inductions,
+ LoopVectorizationLegality::InductionList &Inductions,
SmallPtrSetImpl<Instruction *> &DeadInstructions) {
auto *TopRegion = cast<VPRegionBlock>(Plan->getEntry());
@@ -41,7 +41,6 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
continue;
VPBasicBlock *VPBB = Base->getEntryBasicBlock();
- VPRecipeBase *LastRecipe = nullptr;
// Introduce each ingredient into VPlan.
for (auto I = VPBB->begin(), E = VPBB->end(); I != E;) {
VPRecipeBase *Ingredient = &*I++;
@@ -55,33 +54,29 @@ void VPlanTransforms::VPInstructionsToVPRecipes(
VPRecipeBase *NewRecipe = nullptr;
// Create VPWidenMemoryInstructionRecipe for loads and stores.
- if (isa<LoadInst>(Inst) || isa<StoreInst>(Inst))
+ if (LoadInst *Load = dyn_cast<LoadInst>(Inst))
NewRecipe = new VPWidenMemoryInstructionRecipe(
- *Inst, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ *Load, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
nullptr /*Mask*/);
+ else if (StoreInst *Store = dyn_cast<StoreInst>(Inst))
+ NewRecipe = new VPWidenMemoryInstructionRecipe(
+ *Store, Plan->getOrAddVPValue(getLoadStorePointerOperand(Inst)),
+ Plan->getOrAddVPValue(Store->getValueOperand()), nullptr /*Mask*/);
else if (PHINode *Phi = dyn_cast<PHINode>(Inst)) {
- InductionDescriptor II = Inductions->lookup(Phi);
+ InductionDescriptor II = Inductions.lookup(Phi);
if (II.getKind() == InductionDescriptor::IK_IntInduction ||
II.getKind() == InductionDescriptor::IK_FpInduction) {
NewRecipe = new VPWidenIntOrFpInductionRecipe(Phi);
} else
NewRecipe = new VPWidenPHIRecipe(Phi);
} else if (GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(Inst)) {
- NewRecipe = new VPWidenGEPRecipe(GEP, OrigLoop);
- } else {
- // If the last recipe is a VPWidenRecipe, add Inst to it instead of
- // creating a new recipe.
- if (VPWidenRecipe *WidenRecipe =
- dyn_cast_or_null<VPWidenRecipe>(LastRecipe)) {
- WidenRecipe->appendInstruction(Inst);
- Ingredient->eraseFromParent();
- continue;
- }
- NewRecipe = new VPWidenRecipe(Inst);
- }
+ NewRecipe = new VPWidenGEPRecipe(
+ GEP, Plan->mapToVPValues(GEP->operands()), OrigLoop);
+ } else
+ NewRecipe =
+ new VPWidenRecipe(*Inst, Plan->mapToVPValues(Inst->operands()));
NewRecipe->insertBefore(Ingredient);
- LastRecipe = NewRecipe;
Ingredient->eraseFromParent();
}
}
diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
index 0d3bd7da09a7..4b20e8b4e3b3 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.h
@@ -19,14 +19,12 @@
namespace llvm {
-class VPlanTransforms {
-
-public:
+struct VPlanTransforms {
/// Replaces the VPInstructions in \p Plan with corresponding
/// widen recipes.
static void VPInstructionsToVPRecipes(
Loop *OrigLoop, VPlanPtr &Plan,
- LoopVectorizationLegality::InductionList *Inductions,
+ LoopVectorizationLegality::InductionList &Inductions,
SmallPtrSetImpl<Instruction *> &DeadInstructions);
};
diff --git a/llvm/lib/Transforms/Vectorize/VPlanValue.h b/llvm/lib/Transforms/Vectorize/VPlanValue.h
index 464498c29d89..f73505d0279a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanValue.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanValue.h
@@ -22,13 +22,14 @@
#include "llvm/ADT/DenseMap.h"
#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Value.h"
-#include "llvm/Support/Debug.h"
-#include "llvm/Support/raw_ostream.h"
+#include "llvm/ADT/iterator_range.h"
namespace llvm {
// Forward declarations.
+class raw_ostream;
+class Value;
+class VPSlotTracker;
class VPUser;
// This is the base class of the VPlan Def/Use graph, used for modeling the data
@@ -37,11 +38,11 @@ class VPUser;
// and live-outs which the VPlan will need to fix accordingly.
class VPValue {
friend class VPBuilder;
- friend class VPlanTransforms;
+ friend struct VPlanTransforms;
friend class VPBasicBlock;
friend class VPInterleavedAccessInfo;
+ friend class VPSlotTracker;
-private:
const unsigned char SubclassID; ///< Subclass identifier (for isa/dyn_cast).
SmallVector<VPUser *, 1> Users;
@@ -62,6 +63,7 @@ protected:
/// Return the underlying Value attached to this VPValue.
Value *getUnderlyingValue() { return UnderlyingVal; }
+ const Value *getUnderlyingValue() const { return UnderlyingVal; }
// Set \p Val as the underlying Value of this VPValue.
void setUnderlyingValue(Value *Val) {
@@ -85,9 +87,8 @@ public:
/// for any other purpose, as the values may change as LLVM evolves.
unsigned getVPValueID() const { return SubclassID; }
- void printAsOperand(raw_ostream &OS) const {
- OS << "%vp" << (unsigned short)(unsigned long long)this;
- }
+ void printAsOperand(raw_ostream &OS, VPSlotTracker &Tracker) const;
+ void print(raw_ostream &OS, VPSlotTracker &Tracker) const;
unsigned getNumUsers() const { return Users.size(); }
void addUser(VPUser &User) { Users.push_back(&User); }
@@ -129,7 +130,6 @@ raw_ostream &operator<<(raw_ostream &OS, const VPValue &V);
/// This class augments VPValue with operands which provide the inverse def-use
/// edges from VPValue's users to their defs.
class VPUser : public VPValue {
-private:
SmallVector<VPValue *, 2> Operands;
protected:
@@ -144,6 +144,12 @@ public:
VPUser(ArrayRef<VPValue *> Operands) : VPUser(VPValue::VPUserSC, Operands) {}
VPUser(std::initializer_list<VPValue *> Operands)
: VPUser(ArrayRef<VPValue *>(Operands)) {}
+ template <typename IterT>
+ VPUser(iterator_range<IterT> Operands) : VPValue(VPValue::VPUserSC) {
+ for (VPValue *Operand : Operands)
+ addOperand(Operand);
+ }
+
VPUser(const VPUser &) = delete;
VPUser &operator=(const VPUser &) = delete;
@@ -180,6 +186,37 @@ public:
return const_operand_range(op_begin(), op_end());
}
};
+class VPlan;
+class VPBasicBlock;
+class VPRegionBlock;
+
+/// This class can be used to assign consecutive numbers to all VPValues in a
+/// VPlan and allows querying the numbering for printing, similar to the
+/// ModuleSlotTracker for IR values.
+class VPSlotTracker {
+ DenseMap<const VPValue *, unsigned> Slots;
+ unsigned NextSlot = 0;
+
+ void assignSlots(const VPBlockBase *VPBB);
+ void assignSlots(const VPRegionBlock *Region);
+ void assignSlots(const VPBasicBlock *VPBB);
+ void assignSlot(const VPValue *V);
+
+ void assignSlots(const VPlan &Plan);
+
+public:
+ VPSlotTracker(const VPlan *Plan) {
+ if (Plan)
+ assignSlots(*Plan);
+ }
+
+ unsigned getSlot(const VPValue *V) const {
+ auto I = Slots.find(V);
+ if (I == Slots.end())
+ return -1;
+ return I->second;
+ }
+};
} // namespace llvm
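The numbering scheme above can be illustrated with a rough standalone sketch that uses plain ints as stand-ins for VPValues and the old "%vp" prefix purely for illustration; it mirrors the assign-then-query flow of VPSlotTracker but is not the LLVM class itself.

// Sketch of the slot-numbering idea: assign consecutive IDs in visitation
// order, then query them when printing operands.
#include <cstdio>
#include <map>
#include <vector>

int main() {
  std::vector<int> Values = {10, 20, 30};    // stand-ins for VPValues
  std::map<const int *, unsigned> Slots;     // analogue of Slots / NextSlot
  unsigned NextSlot = 0;
  for (const int &V : Values)                // assignSlots analogue
    Slots.insert({&V, NextSlot++});
  for (const int &V : Values)                // printAsOperand analogue
    std::printf("%%vp%u\n", Slots.at(&V));   // prints %vp0, %vp1, %vp2
  return 0;
}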
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
index ab3e7e2282e7..b384c94121e9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp
@@ -13,6 +13,7 @@
//===----------------------------------------------------------------------===//
#include "VPlanVerifier.h"
+#include "VPlan.h"
#include "llvm/ADT/DepthFirstIterator.h"
#include "llvm/Support/CommandLine.h"
diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
index 7d2b26252172..8e8de441648a 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
+++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.h
@@ -24,14 +24,12 @@
#ifndef LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
#define LLVM_TRANSFORMS_VECTORIZE_VPLANVERIFIER_H
-#include "VPlan.h"
-
namespace llvm {
+class VPRegionBlock;
-/// Class with utility functions that can be used to check the consistency and
+/// Struct with utility functions that can be used to check the consistency and
/// invariants of a VPlan, including the components of its H-CFG.
-class VPlanVerifier {
-public:
+struct VPlanVerifier {
/// Verify the invariants of the H-CFG starting from \p TopRegion. The
/// verification process comprises the following steps:
/// 1. Region/Block verification: Check the Region/Block verification
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
new file mode 100644
index 000000000000..64b41bf9cefa
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -0,0 +1,699 @@
+//===------- VectorCombine.cpp - Optimize partial vector operations -------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This pass optimizes scalar/vector interactions using target cost models. The
+// transforms implemented here may not fit in traditional loop-based or SLP
+// vectorization passes.
+//
+//===----------------------------------------------------------------------===//
+
+#include "llvm/Transforms/Vectorize/VectorCombine.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/Analysis/BasicAliasAnalysis.h"
+#include "llvm/Analysis/GlobalsModRef.h"
+#include "llvm/Analysis/TargetTransformInfo.h"
+#include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
+#include "llvm/IR/Dominators.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/IRBuilder.h"
+#include "llvm/IR/PatternMatch.h"
+#include "llvm/InitializePasses.h"
+#include "llvm/Pass.h"
+#include "llvm/Support/CommandLine.h"
+#include "llvm/Transforms/Utils/Local.h"
+#include "llvm/Transforms/Vectorize.h"
+
+using namespace llvm;
+using namespace llvm::PatternMatch;
+
+#define DEBUG_TYPE "vector-combine"
+STATISTIC(NumVecCmp, "Number of vector compares formed");
+STATISTIC(NumVecBO, "Number of vector binops formed");
+STATISTIC(NumVecCmpBO, "Number of vector compare + binop formed");
+STATISTIC(NumShufOfBitcast, "Number of shuffles moved after bitcast");
+STATISTIC(NumScalarBO, "Number of scalar binops formed");
+STATISTIC(NumScalarCmp, "Number of scalar compares formed");
+
+static cl::opt<bool> DisableVectorCombine(
+ "disable-vector-combine", cl::init(false), cl::Hidden,
+ cl::desc("Disable all vector combine transforms"));
+
+static cl::opt<bool> DisableBinopExtractShuffle(
+ "disable-binop-extract-shuffle", cl::init(false), cl::Hidden,
+ cl::desc("Disable binop extract to shuffle transforms"));
+
+static const unsigned InvalidIndex = std::numeric_limits<unsigned>::max();
+
+namespace {
+class VectorCombine {
+public:
+ VectorCombine(Function &F, const TargetTransformInfo &TTI,
+ const DominatorTree &DT)
+ : F(F), Builder(F.getContext()), TTI(TTI), DT(DT) {}
+
+ bool run();
+
+private:
+ Function &F;
+ IRBuilder<> Builder;
+ const TargetTransformInfo &TTI;
+ const DominatorTree &DT;
+
+ ExtractElementInst *getShuffleExtract(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex) const;
+ bool isExtractExtractCheap(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex);
+ void foldExtExtCmp(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ void foldExtExtBinop(ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ Instruction &I);
+ bool foldExtractExtract(Instruction &I);
+ bool foldBitcastShuf(Instruction &I);
+ bool scalarizeBinopOrCmp(Instruction &I);
+ bool foldExtractedCmps(Instruction &I);
+};
+} // namespace
+
+static void replaceValue(Value &Old, Value &New) {
+ Old.replaceAllUsesWith(&New);
+ New.takeName(&Old);
+}
+
+/// Determine which, if any, of the inputs should be replaced by a shuffle
+/// followed by extract from a different index.
+ExtractElementInst *VectorCombine::getShuffleExtract(
+ ExtractElementInst *Ext0, ExtractElementInst *Ext1,
+ unsigned PreferredExtractIndex = InvalidIndex) const {
+ assert(isa<ConstantInt>(Ext0->getIndexOperand()) &&
+ isa<ConstantInt>(Ext1->getIndexOperand()) &&
+ "Expected constant extract indexes");
+
+ unsigned Index0 = cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue();
+ unsigned Index1 = cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue();
+
+ // If the extract indexes are identical, no shuffle is needed.
+ if (Index0 == Index1)
+ return nullptr;
+
+ Type *VecTy = Ext0->getVectorOperand()->getType();
+ assert(VecTy == Ext1->getVectorOperand()->getType() && "Need matching types");
+ int Cost0 = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ int Cost1 = TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+
+ // We are extracting from 2 different indexes, so one operand must be shuffled
+ // before performing a vector operation and/or extract. The more expensive
+ // extract will be replaced by a shuffle.
+ if (Cost0 > Cost1)
+ return Ext0;
+ if (Cost1 > Cost0)
+ return Ext1;
+
+ // If the costs are equal and there is a preferred extract index, shuffle the
+ // opposite operand.
+ if (PreferredExtractIndex == Index0)
+ return Ext1;
+ if (PreferredExtractIndex == Index1)
+ return Ext0;
+
+ // Otherwise, replace the extract with the higher index.
+ return Index0 > Index1 ? Ext0 : Ext1;
+}
+
+/// Compare the relative costs of 2 extracts followed by scalar operation vs.
+/// vector operation(s) followed by extract. Return true if the existing
+/// instructions are cheaper than a vector alternative. Otherwise, return false
+/// and if one of the extracts should be transformed to a shufflevector, set
+/// \p ConvertToShuffle to that extract instruction.
+bool VectorCombine::isExtractExtractCheap(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1,
+ unsigned Opcode,
+ ExtractElementInst *&ConvertToShuffle,
+ unsigned PreferredExtractIndex) {
+ assert(isa<ConstantInt>(Ext0->getOperand(1)) &&
+ isa<ConstantInt>(Ext1->getOperand(1)) &&
+ "Expected constant extract indexes");
+ Type *ScalarTy = Ext0->getType();
+ auto *VecTy = cast<VectorType>(Ext0->getOperand(0)->getType());
+ int ScalarOpCost, VectorOpCost;
+
+ // Get cost estimates for scalar and vector versions of the operation.
+ bool IsBinOp = Instruction::isBinaryOp(Opcode);
+ if (IsBinOp) {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ } else {
+ assert((Opcode == Instruction::ICmp || Opcode == Instruction::FCmp) &&
+ "Expected a compare");
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy,
+ CmpInst::makeCmpResultType(ScalarTy));
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy,
+ CmpInst::makeCmpResultType(VecTy));
+ }
+
+ // Get cost estimates for the extract elements. These costs will factor into
+ // both sequences.
+ unsigned Ext0Index = cast<ConstantInt>(Ext0->getOperand(1))->getZExtValue();
+ unsigned Ext1Index = cast<ConstantInt>(Ext1->getOperand(1))->getZExtValue();
+
+ int Extract0Cost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext0Index);
+ int Extract1Cost =
+ TTI.getVectorInstrCost(Instruction::ExtractElement, VecTy, Ext1Index);
+
+ // A more expensive extract will always be replaced by a splat shuffle.
+ // For example, if Ext0 is more expensive:
+ // opcode (extelt V0, Ext0), (ext V1, Ext1) -->
+ // extelt (opcode (splat V0, Ext0), V1), Ext1
+ // TODO: Evaluate whether that always results in lowest cost. Alternatively,
+ // check the cost of creating a broadcast shuffle and shuffling both
+ // operands to element 0.
+ int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);
+
+ // Extra uses of the extracts mean that we include those costs in the
+ // vector total because those instructions will not be eliminated.
+ int OldCost, NewCost;
+ if (Ext0->getOperand(0) == Ext1->getOperand(0) && Ext0Index == Ext1Index) {
+ // Handle a special case. If the 2 extracts are identical, adjust the
+ // formulas to account for that. The extra use charge allows for either the
+ // CSE'd pattern or an unoptimized form with identical values:
+ // opcode (extelt V, C), (extelt V, C) --> extelt (opcode V, V), C
+ bool HasUseTax = Ext0 == Ext1 ? !Ext0->hasNUses(2)
+ : !Ext0->hasOneUse() || !Ext1->hasOneUse();
+ OldCost = CheapExtractCost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost + HasUseTax * CheapExtractCost;
+ } else {
+ // Handle the general case. Each extract is actually a different value:
+ // opcode (extelt V0, C0), (extelt V1, C1) --> extelt (opcode V0, V1), C
+ OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;
+ NewCost = VectorOpCost + CheapExtractCost +
+ !Ext0->hasOneUse() * Extract0Cost +
+ !Ext1->hasOneUse() * Extract1Cost;
+ }
+
+ ConvertToShuffle = getShuffleExtract(Ext0, Ext1, PreferredExtractIndex);
+ if (ConvertToShuffle) {
+ if (IsBinOp && DisableBinopExtractShuffle)
+ return true;
+
+ // If we are extracting from 2 different indexes, then one operand must be
+ // shuffled before performing the vector operation. The shuffle mask is
+ // undefined except for 1 lane that is being translated to the remaining
+ // extraction lane. Therefore, it is a splat shuffle. Ex:
+ // ShufMask = { undef, undef, 0, undef }
+ // TODO: The cost model has an option for a "broadcast" shuffle
+ // (splat-from-element-0), but no option for a more general splat.
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, VecTy);
+ }
+
+ // Aggressively form a vector op if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
+ return OldCost < NewCost;
+}
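To see the general-case formulas above with numbers, here is a minimal standalone sketch that assumes made-up unit costs in place of the TTI queries, for two single-use extracts from different indexes (so the splat-shuffle charge applies and there is no extra-use tax). The concrete cost values are invented for illustration.

// Sketch of the general-case OldCost/NewCost comparison in
// isExtractExtractCheap, with invented costs instead of TTI queries.
#include <algorithm>
#include <cstdio>

int main() {
  // Hypothetical costs: each extract costs 1, the scalar op costs 1, the
  // vector op costs 1, and the splat shuffle for the mismatched index adds 1.
  int Extract0Cost = 1, Extract1Cost = 1;
  int ScalarOpCost = 1, VectorOpCost = 1, ShuffleCost = 1;
  int CheapExtractCost = std::min(Extract0Cost, Extract1Cost);

  // Both extracts are single-use, so no extra-use charges apply.
  int OldCost = Extract0Cost + Extract1Cost + ScalarOpCost;     // = 3
  int NewCost = VectorOpCost + CheapExtractCost + ShuffleCost;  // = 3
  std::printf("OldCost=%d NewCost=%d -> %s\n", OldCost, NewCost,
              OldCost < NewCost ? "keep scalar form" : "form vector op");
  return 0;
}

With equal costs the comparison returns false ("not cheaper"), so the transform fires, matching the comment about aggressively forming a vector op on ties.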
+
+/// Create a shuffle that translates (shifts) 1 element from the input vector
+/// to a new element location.
+static Value *createShiftShuffle(Value *Vec, unsigned OldIndex,
+ unsigned NewIndex, IRBuilder<> &Builder) {
+ // The shuffle mask is undefined except for 1 lane that is being translated
+ // to the new element index. Example for OldIndex == 2 and NewIndex == 0:
+ // ShufMask = { 2, undef, undef, undef }
+ auto *VecTy = cast<FixedVectorType>(Vec->getType());
+ SmallVector<int, 32> ShufMask(VecTy->getNumElements(), UndefMaskElem);
+ ShufMask[NewIndex] = OldIndex;
+ Value *Undef = UndefValue::get(VecTy);
+ return Builder.CreateShuffleVector(Vec, Undef, ShufMask, "shift");
+}
+
+/// Given an extract element instruction with constant index operand, shuffle
+/// the source vector (shift the scalar element) to a NewIndex for extraction.
+/// Return null if the input can be constant folded, so that we are not creating
+/// unnecessary instructions.
+static ExtractElementInst *translateExtract(ExtractElementInst *ExtElt,
+ unsigned NewIndex,
+ IRBuilder<> &Builder) {
+ // If the extract can be constant-folded, this code is unsimplified. Defer
+ // to other passes to handle that.
+ Value *X = ExtElt->getVectorOperand();
+ Value *C = ExtElt->getIndexOperand();
+ assert(isa<ConstantInt>(C) && "Expected a constant index operand");
+ if (isa<Constant>(X))
+ return nullptr;
+
+ Value *Shuf = createShiftShuffle(X, cast<ConstantInt>(C)->getZExtValue(),
+ NewIndex, Builder);
+ return cast<ExtractElementInst>(Builder.CreateExtractElement(Shuf, NewIndex));
+}
+
+/// Try to reduce extract element costs by converting scalar compares to vector
+/// compares followed by extract.
+/// cmp (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtCmp(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<CmpInst>(&I) && "Expected a compare");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // cmp Pred (extelt V0, C), (extelt V1, C) --> extelt (cmp Pred V0, V1), C
+ ++NumVecCmp;
+ CmpInst::Predicate Pred = cast<CmpInst>(&I)->getPredicate();
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecCmp = Builder.CreateCmp(Pred, V0, V1);
+ Value *NewExt = Builder.CreateExtractElement(VecCmp, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Try to reduce extract element costs by converting scalar binops to vector
+/// binops followed by extract.
+/// bo (ext0 V0, C), (ext1 V1, C)
+void VectorCombine::foldExtExtBinop(ExtractElementInst *Ext0,
+ ExtractElementInst *Ext1, Instruction &I) {
+ assert(isa<BinaryOperator>(&I) && "Expected a binary operator");
+ assert(cast<ConstantInt>(Ext0->getIndexOperand())->getZExtValue() ==
+ cast<ConstantInt>(Ext1->getIndexOperand())->getZExtValue() &&
+ "Expected matching constant extract indexes");
+
+ // bo (extelt V0, C), (extelt V1, C) --> extelt (bo V0, V1), C
+ ++NumVecBO;
+ Value *V0 = Ext0->getVectorOperand(), *V1 = Ext1->getVectorOperand();
+ Value *VecBO =
+ Builder.CreateBinOp(cast<BinaryOperator>(&I)->getOpcode(), V0, V1);
+
+ // All IR flags are safe to back-propagate because any potential poison
+ // created in unused vector elements is discarded by the extract.
+ if (auto *VecBOInst = dyn_cast<Instruction>(VecBO))
+ VecBOInst->copyIRFlags(&I);
+
+ Value *NewExt = Builder.CreateExtractElement(VecBO, Ext0->getIndexOperand());
+ replaceValue(I, *NewExt);
+}
+
+/// Match an instruction with extracted vector operands.
+bool VectorCombine::foldExtractExtract(Instruction &I) {
+ // It is not safe to transform things like div, urem, etc. because we may
+ // create undefined behavior when executing those on unknown vector elements.
+ if (!isSafeToSpeculativelyExecute(&I))
+ return false;
+
+ Instruction *I0, *I1;
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ if (!match(&I, m_Cmp(Pred, m_Instruction(I0), m_Instruction(I1))) &&
+ !match(&I, m_BinOp(m_Instruction(I0), m_Instruction(I1))))
+ return false;
+
+ Value *V0, *V1;
+ uint64_t C0, C1;
+ if (!match(I0, m_ExtractElt(m_Value(V0), m_ConstantInt(C0))) ||
+ !match(I1, m_ExtractElt(m_Value(V1), m_ConstantInt(C1))) ||
+ V0->getType() != V1->getType())
+ return false;
+
+ // If the scalar value 'I' is going to be re-inserted into a vector, then try
+ // to create an extract to that same element. The extract/insert can be
+ // reduced to a "select shuffle".
+ // TODO: If we add a larger pattern match that starts from an insert, this
+ // probably becomes unnecessary.
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ uint64_t InsertIndex = InvalidIndex;
+ if (I.hasOneUse())
+ match(I.user_back(),
+ m_InsertElt(m_Value(), m_Value(), m_ConstantInt(InsertIndex)));
+
+ ExtractElementInst *ExtractToChange;
+ if (isExtractExtractCheap(Ext0, Ext1, I.getOpcode(), ExtractToChange,
+ InsertIndex))
+ return false;
+
+ if (ExtractToChange) {
+ unsigned CheapExtractIdx = ExtractToChange == Ext0 ? C1 : C0;
+ ExtractElementInst *NewExtract =
+ translateExtract(ExtractToChange, CheapExtractIdx, Builder);
+ if (!NewExtract)
+ return false;
+ if (ExtractToChange == Ext0)
+ Ext0 = NewExtract;
+ else
+ Ext1 = NewExtract;
+ }
+
+ if (Pred != CmpInst::BAD_ICMP_PREDICATE)
+ foldExtExtCmp(Ext0, Ext1, I);
+ else
+ foldExtExtBinop(Ext0, Ext1, I);
+
+ return true;
+}
+
+/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
+/// destination type followed by shuffle. This can enable further transforms by
+/// moving bitcasts or shuffles together.
+bool VectorCombine::foldBitcastShuf(Instruction &I) {
+ Value *V;
+ ArrayRef<int> Mask;
+ if (!match(&I, m_BitCast(
+ m_OneUse(m_Shuffle(m_Value(V), m_Undef(), m_Mask(Mask))))))
+ return false;
+
+ // Disallow non-vector casts and length-changing shuffles.
+ // TODO: We could allow any shuffle.
+ auto *DestTy = dyn_cast<VectorType>(I.getType());
+ auto *SrcTy = cast<VectorType>(V->getType());
+ if (!DestTy || I.getOperand(0)->getType() != SrcTy)
+ return false;
+
+ // The new shuffle must not cost more than the old shuffle. The bitcast is
+ // moved ahead of the shuffle, so assume that it has the same cost as before.
+ if (TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, DestTy) >
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, SrcTy))
+ return false;
+
+ unsigned DestNumElts = DestTy->getNumElements();
+ unsigned SrcNumElts = SrcTy->getNumElements();
+ SmallVector<int, 16> NewMask;
+ if (SrcNumElts <= DestNumElts) {
+ // The bitcast is from wide to narrow/equal elements. The shuffle mask can
+ // always be expanded to the equivalent form choosing narrower elements.
+ assert(DestNumElts % SrcNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = DestNumElts / SrcNumElts;
+ narrowShuffleMaskElts(ScaleFactor, Mask, NewMask);
+ } else {
+ // The bitcast is from narrow elements to wide elements. The shuffle mask
+ // must choose consecutive elements to allow casting first.
+ assert(SrcNumElts % DestNumElts == 0 && "Unexpected shuffle mask");
+ unsigned ScaleFactor = SrcNumElts / DestNumElts;
+ if (!widenShuffleMaskElts(ScaleFactor, Mask, NewMask))
+ return false;
+ }
+ // bitcast (shuf V, MaskC) --> shuf (bitcast V), MaskC'
+ ++NumShufOfBitcast;
+ Value *CastV = Builder.CreateBitCast(V, DestTy);
+ Value *Shuf =
+ Builder.CreateShuffleVector(CastV, UndefValue::get(DestTy), NewMask);
+ replaceValue(I, *Shuf);
+ return true;
+}
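For the wide-to-narrow case above, the following sketch shows the index scaling that narrowShuffleMaskElts performs, assuming a hypothetical <4 x i32> source mask and a scale factor of 2 (as for a bitcast to <8 x i16>): each wide lane index M expands to the narrow lanes M*Scale .. M*Scale+Scale-1, and undef (-1) lanes stay undef. This mirrors the helper's effect; it is not the LLVM implementation.

// Sketch of mask narrowing for a wide-to-narrow element bitcast, scale 2.
#include <cstdio>
#include <vector>

int main() {
  const unsigned Scale = 2;              // DestNumElts / SrcNumElts
  std::vector<int> Mask = {3, 1, 0, 2};  // hypothetical wide-element mask
  std::vector<int> NewMask;
  for (int M : Mask)
    for (unsigned I = 0; I < Scale; ++I)
      NewMask.push_back(M < 0 ? -1 : M * (int)Scale + (int)I); // -1 == undef
  for (int M : NewMask)
    std::printf("%d ", M);               // prints: 6 7 2 3 0 1 4 5
  std::printf("\n");
  return 0;
}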
+
+/// Match a vector binop or compare instruction with at least one inserted
+/// scalar operand and convert to scalar binop/cmp followed by insertelement.
+bool VectorCombine::scalarizeBinopOrCmp(Instruction &I) {
+ CmpInst::Predicate Pred = CmpInst::BAD_ICMP_PREDICATE;
+ Value *Ins0, *Ins1;
+ if (!match(&I, m_BinOp(m_Value(Ins0), m_Value(Ins1))) &&
+ !match(&I, m_Cmp(Pred, m_Value(Ins0), m_Value(Ins1))))
+ return false;
+
+ // Do not convert the vector condition of a vector select into a scalar
+ // condition. That may cause problems for codegen because of differences in
+ // boolean formats and register-file transfers.
+ // TODO: Can we account for that in the cost model?
+ bool IsCmp = Pred != CmpInst::Predicate::BAD_ICMP_PREDICATE;
+ if (IsCmp)
+ for (User *U : I.users())
+ if (match(U, m_Select(m_Specific(&I), m_Value(), m_Value())))
+ return false;
+
+ // Match against one or both scalar values being inserted into constant
+ // vectors:
+ // vec_op VecC0, (inselt VecC1, V1, Index)
+ // vec_op (inselt VecC0, V0, Index), VecC1
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index)
+ // TODO: Deal with mismatched index constants and variable indexes?
+ Constant *VecC0 = nullptr, *VecC1 = nullptr;
+ Value *V0 = nullptr, *V1 = nullptr;
+ uint64_t Index0 = 0, Index1 = 0;
+ if (!match(Ins0, m_InsertElt(m_Constant(VecC0), m_Value(V0),
+ m_ConstantInt(Index0))) &&
+ !match(Ins0, m_Constant(VecC0)))
+ return false;
+ if (!match(Ins1, m_InsertElt(m_Constant(VecC1), m_Value(V1),
+ m_ConstantInt(Index1))) &&
+ !match(Ins1, m_Constant(VecC1)))
+ return false;
+
+ bool IsConst0 = !V0;
+ bool IsConst1 = !V1;
+ if (IsConst0 && IsConst1)
+ return false;
+ if (!IsConst0 && !IsConst1 && Index0 != Index1)
+ return false;
+
+ // Bail for single insertion if it is a load.
+ // TODO: Handle this once getVectorInstrCost can cost for load/stores.
+ auto *I0 = dyn_cast_or_null<Instruction>(V0);
+ auto *I1 = dyn_cast_or_null<Instruction>(V1);
+ if ((IsConst0 && I1 && I1->mayReadFromMemory()) ||
+ (IsConst1 && I0 && I0->mayReadFromMemory()))
+ return false;
+
+ uint64_t Index = IsConst0 ? Index1 : Index0;
+ Type *ScalarTy = IsConst0 ? V1->getType() : V0->getType();
+ Type *VecTy = I.getType();
+ assert(VecTy->isVectorTy() &&
+ (IsConst0 || IsConst1 || V0->getType() == V1->getType()) &&
+ (ScalarTy->isIntegerTy() || ScalarTy->isFloatingPointTy() ||
+ ScalarTy->isPointerTy()) &&
+ "Unexpected types for insert element into binop or cmp");
+
+ unsigned Opcode = I.getOpcode();
+ int ScalarOpCost, VectorOpCost;
+ if (IsCmp) {
+ ScalarOpCost = TTI.getCmpSelInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getCmpSelInstrCost(Opcode, VecTy);
+ } else {
+ ScalarOpCost = TTI.getArithmeticInstrCost(Opcode, ScalarTy);
+ VectorOpCost = TTI.getArithmeticInstrCost(Opcode, VecTy);
+ }
+
+ // Get cost estimate for the insert element. This cost will factor into
+ // both sequences.
+ int InsertCost =
+ TTI.getVectorInstrCost(Instruction::InsertElement, VecTy, Index);
+ int OldCost = (IsConst0 ? 0 : InsertCost) + (IsConst1 ? 0 : InsertCost) +
+ VectorOpCost;
+ int NewCost = ScalarOpCost + InsertCost +
+ (IsConst0 ? 0 : !Ins0->hasOneUse() * InsertCost) +
+ (IsConst1 ? 0 : !Ins1->hasOneUse() * InsertCost);
+
+ // We want to scalarize unless the vector variant actually has lower cost.
+ if (OldCost < NewCost)
+ return false;
+
+ // vec_op (inselt VecC0, V0, Index), (inselt VecC1, V1, Index) -->
+ // inselt NewVecC, (scalar_op V0, V1), Index
+ if (IsCmp)
+ ++NumScalarCmp;
+ else
+ ++NumScalarBO;
+
+ // For constant cases, extract the scalar element; this should constant fold.
+ if (IsConst0)
+ V0 = ConstantExpr::getExtractElement(VecC0, Builder.getInt64(Index));
+ if (IsConst1)
+ V1 = ConstantExpr::getExtractElement(VecC1, Builder.getInt64(Index));
+
+ Value *Scalar =
+ IsCmp ? Builder.CreateCmp(Pred, V0, V1)
+ : Builder.CreateBinOp((Instruction::BinaryOps)Opcode, V0, V1);
+
+ Scalar->setName(I.getName() + ".scalar");
+
+ // All IR flags are safe to back-propagate. There is no potential for extra
+ // poison to be created by the scalar instruction.
+ if (auto *ScalarInst = dyn_cast<Instruction>(Scalar))
+ ScalarInst->copyIRFlags(&I);
+
+ // Fold the vector constants in the original vectors into a new base vector.
+ Constant *NewVecC = IsCmp ? ConstantExpr::getCompare(Pred, VecC0, VecC1)
+ : ConstantExpr::get(Opcode, VecC0, VecC1);
+ Value *Insert = Builder.CreateInsertElement(NewVecC, Scalar, Index);
+ replaceValue(I, *Insert);
+ return true;
+}
+
+/// Try to combine a scalar binop + 2 scalar compares of extracted elements of
+/// a vector into vector operations followed by extract. Note: The SLP pass
+/// may miss this pattern because of implementation problems.
+bool VectorCombine::foldExtractedCmps(Instruction &I) {
+ // We are looking for a scalar binop of booleans.
+ // binop i1 (cmp Pred I0, C0), (cmp Pred I1, C1)
+ if (!I.isBinaryOp() || !I.getType()->isIntegerTy(1))
+ return false;
+
+ // The compare predicates should match, and each compare should have a
+ // constant operand.
+ // TODO: Relax the one-use constraints.
+ Value *B0 = I.getOperand(0), *B1 = I.getOperand(1);
+ Instruction *I0, *I1;
+ Constant *C0, *C1;
+ CmpInst::Predicate P0, P1;
+ if (!match(B0, m_OneUse(m_Cmp(P0, m_Instruction(I0), m_Constant(C0)))) ||
+ !match(B1, m_OneUse(m_Cmp(P1, m_Instruction(I1), m_Constant(C1)))) ||
+ P0 != P1)
+ return false;
+
+ // The compare operands must be extracts of the same vector with constant
+ // extract indexes.
+ // TODO: Relax the one-use constraints.
+ Value *X;
+ uint64_t Index0, Index1;
+ if (!match(I0, m_OneUse(m_ExtractElt(m_Value(X), m_ConstantInt(Index0)))) ||
+ !match(I1, m_OneUse(m_ExtractElt(m_Specific(X), m_ConstantInt(Index1)))))
+ return false;
+
+ auto *Ext0 = cast<ExtractElementInst>(I0);
+ auto *Ext1 = cast<ExtractElementInst>(I1);
+ ExtractElementInst *ConvertToShuf = getShuffleExtract(Ext0, Ext1);
+ if (!ConvertToShuf)
+ return false;
+
+ // The original scalar pattern is:
+ // binop i1 (cmp Pred (ext X, Index0), C0), (cmp Pred (ext X, Index1), C1)
+ CmpInst::Predicate Pred = P0;
+ unsigned CmpOpcode = CmpInst::isFPPredicate(Pred) ? Instruction::FCmp
+ : Instruction::ICmp;
+ auto *VecTy = dyn_cast<FixedVectorType>(X->getType());
+ if (!VecTy)
+ return false;
+
+ int OldCost = TTI.getVectorInstrCost(Ext0->getOpcode(), VecTy, Index0);
+ OldCost += TTI.getVectorInstrCost(Ext1->getOpcode(), VecTy, Index1);
+ OldCost += TTI.getCmpSelInstrCost(CmpOpcode, I0->getType()) * 2;
+ OldCost += TTI.getArithmeticInstrCost(I.getOpcode(), I.getType());
+
+ // The proposed vector pattern is:
+ // vcmp = cmp Pred X, VecC
+ // ext (binop vNi1 vcmp, (shuffle vcmp, Index1)), Index0
+ int CheapIndex = ConvertToShuf == Ext0 ? Index1 : Index0;
+ int ExpensiveIndex = ConvertToShuf == Ext0 ? Index0 : Index1;
+ auto *CmpTy = cast<FixedVectorType>(CmpInst::makeCmpResultType(X->getType()));
+ int NewCost = TTI.getCmpSelInstrCost(CmpOpcode, X->getType());
+ NewCost +=
+ TTI.getShuffleCost(TargetTransformInfo::SK_PermuteSingleSrc, CmpTy);
+ NewCost += TTI.getArithmeticInstrCost(I.getOpcode(), CmpTy);
+ NewCost += TTI.getVectorInstrCost(Ext0->getOpcode(), CmpTy, CheapIndex);
+
+ // Aggressively form vector ops if the cost is equal because the transform
+ // may enable further optimization.
+ // Codegen can reverse this transform (scalarize) if it was not profitable.
+ if (OldCost < NewCost)
+ return false;
+
+ // Create a vector constant from the 2 scalar constants.
+ SmallVector<Constant *, 32> CmpC(VecTy->getNumElements(),
+ UndefValue::get(VecTy->getElementType()));
+ CmpC[Index0] = C0;
+ CmpC[Index1] = C1;
+ Value *VCmp = Builder.CreateCmp(Pred, X, ConstantVector::get(CmpC));
+
+ Value *Shuf = createShiftShuffle(VCmp, ExpensiveIndex, CheapIndex, Builder);
+ Value *VecLogic = Builder.CreateBinOp(cast<BinaryOperator>(I).getOpcode(),
+ VCmp, Shuf);
+ Value *NewExt = Builder.CreateExtractElement(VecLogic, CheapIndex);
+ replaceValue(I, *NewExt);
+ ++NumVecCmpBO;
+ return true;
+}
+
+/// This is the entry point for all transforms. Pass manager differences are
+/// handled in the callers of this function.
+bool VectorCombine::run() {
+ if (DisableVectorCombine)
+ return false;
+
+ bool MadeChange = false;
+ for (BasicBlock &BB : F) {
+ // Ignore unreachable basic blocks.
+ if (!DT.isReachableFromEntry(&BB))
+ continue;
+ // Do not delete instructions under here and invalidate the iterator.
+ // Walk the block forwards to enable simple iterative chains of transforms.
+ // TODO: It could be more efficient to remove dead instructions
+ // iteratively in this loop rather than waiting until the end.
+ for (Instruction &I : BB) {
+ if (isa<DbgInfoIntrinsic>(I))
+ continue;
+ Builder.SetInsertPoint(&I);
+ MadeChange |= foldExtractExtract(I);
+ MadeChange |= foldBitcastShuf(I);
+ MadeChange |= scalarizeBinopOrCmp(I);
+ MadeChange |= foldExtractedCmps(I);
+ }
+ }
+
+ // We're done with transforms, so remove dead instructions.
+ if (MadeChange)
+ for (BasicBlock &BB : F)
+ SimplifyInstructionsInBlock(&BB);
+
+ return MadeChange;
+}
+
+// Pass manager boilerplate below here.
+
+namespace {
+class VectorCombineLegacyPass : public FunctionPass {
+public:
+ static char ID;
+ VectorCombineLegacyPass() : FunctionPass(ID) {
+ initializeVectorCombineLegacyPassPass(*PassRegistry::getPassRegistry());
+ }
+
+ void getAnalysisUsage(AnalysisUsage &AU) const override {
+ AU.addRequired<DominatorTreeWrapperPass>();
+ AU.addRequired<TargetTransformInfoWrapperPass>();
+ AU.setPreservesCFG();
+ AU.addPreserved<DominatorTreeWrapperPass>();
+ AU.addPreserved<GlobalsAAWrapperPass>();
+ AU.addPreserved<AAResultsWrapperPass>();
+ AU.addPreserved<BasicAAWrapperPass>();
+ FunctionPass::getAnalysisUsage(AU);
+ }
+
+ bool runOnFunction(Function &F) override {
+ if (skipFunction(F))
+ return false;
+ auto &TTI = getAnalysis<TargetTransformInfoWrapperPass>().getTTI(F);
+ auto &DT = getAnalysis<DominatorTreeWrapperPass>().getDomTree();
+ VectorCombine Combiner(F, TTI, DT);
+ return Combiner.run();
+ }
+};
+} // namespace
+
+char VectorCombineLegacyPass::ID = 0;
+INITIALIZE_PASS_BEGIN(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false,
+ false)
+INITIALIZE_PASS_DEPENDENCY(DominatorTreeWrapperPass)
+INITIALIZE_PASS_END(VectorCombineLegacyPass, "vector-combine",
+ "Optimize scalar/vector ops", false, false)
+Pass *llvm::createVectorCombinePass() {
+ return new VectorCombineLegacyPass();
+}
+
+PreservedAnalyses VectorCombinePass::run(Function &F,
+ FunctionAnalysisManager &FAM) {
+ TargetTransformInfo &TTI = FAM.getResult<TargetIRAnalysis>(F);
+ DominatorTree &DT = FAM.getResult<DominatorTreeAnalysis>(F);
+ VectorCombine Combiner(F, TTI, DT);
+ if (!Combiner.run())
+ return PreservedAnalyses::all();
+ PreservedAnalyses PA;
+ PA.preserveSet<CFGAnalyses>();
+ PA.preserve<GlobalsAA>();
+ PA.preserve<AAManager>();
+ PA.preserve<BasicAA>();
+ return PA;
+}
diff --git a/llvm/lib/Transforms/Vectorize/Vectorize.cpp b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
index 6a4f9169c2af..0296a995ad29 100644
--- a/llvm/lib/Transforms/Vectorize/Vectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/Vectorize.cpp
@@ -21,12 +21,12 @@
using namespace llvm;
-/// initializeVectorizationPasses - Initialize all passes linked into the
-/// Vectorization library.
+/// Initialize all passes linked into the Vectorization library.
void llvm::initializeVectorization(PassRegistry &Registry) {
initializeLoopVectorizePass(Registry);
initializeSLPVectorizerPass(Registry);
initializeLoadStoreVectorizerLegacyPassPass(Registry);
+ initializeVectorCombineLegacyPassPass(Registry);
}
void LLVMInitializeVectorization(LLVMPassRegistryRef R) {
diff --git a/llvm/lib/XRay/FDRTraceExpander.cpp b/llvm/lib/XRay/FDRTraceExpander.cpp
index cb7f66bccd7e..b68e997fe706 100644
--- a/llvm/lib/XRay/FDRTraceExpander.cpp
+++ b/llvm/lib/XRay/FDRTraceExpander.cpp
@@ -44,7 +44,7 @@ Error TraceExpander::visit(CustomEventRecord &R) {
CurrentRecord.PId = PID;
CurrentRecord.TId = TID;
CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
- CurrentRecord.Data = R.data();
+ CurrentRecord.Data = std::string(R.data());
BuildingRecord = true;
}
return Error::success();
@@ -59,7 +59,7 @@ Error TraceExpander::visit(CustomEventRecordV5 &R) {
CurrentRecord.PId = PID;
CurrentRecord.TId = TID;
CurrentRecord.Type = RecordTypes::CUSTOM_EVENT;
- CurrentRecord.Data = R.data();
+ CurrentRecord.Data = std::string(R.data());
BuildingRecord = true;
}
return Error::success();
@@ -75,7 +75,7 @@ Error TraceExpander::visit(TypedEventRecord &R) {
CurrentRecord.TId = TID;
CurrentRecord.RecordType = R.eventType();
CurrentRecord.Type = RecordTypes::TYPED_EVENT;
- CurrentRecord.Data = R.data();
+ CurrentRecord.Data = std::string(R.data());
BuildingRecord = true;
}
return Error::success();
diff --git a/llvm/lib/XRay/FDRTraceWriter.cpp b/llvm/lib/XRay/FDRTraceWriter.cpp
index f50dc19b4be8..71c09bd4fce4 100644
--- a/llvm/lib/XRay/FDRTraceWriter.cpp
+++ b/llvm/lib/XRay/FDRTraceWriter.cpp
@@ -20,10 +20,9 @@ namespace {
template <size_t Index> struct IndexedWriter {
template <
class Tuple,
- typename std::enable_if<
- (Index <
- std::tuple_size<typename std::remove_reference<Tuple>::type>::value),
- int>::type = 0>
+ std::enable_if_t<(Index <
+ std::tuple_size<std::remove_reference_t<Tuple>>::value),
+ int> = 0>
static size_t write(support::endian::Writer &OS, Tuple &&T) {
OS.write(std::get<Index>(T));
return sizeof(std::get<Index>(T)) + IndexedWriter<Index + 1>::write(OS, T);
@@ -31,10 +30,9 @@ template <size_t Index> struct IndexedWriter {
template <
class Tuple,
- typename std::enable_if<
- (Index >=
- std::tuple_size<typename std::remove_reference<Tuple>::type>::value),
- int>::type = 0>
+ std::enable_if_t<(Index >=
+ std::tuple_size<std::remove_reference_t<Tuple>>::value),
+ int> = 0>
static size_t write(support::endian::Writer &OS, Tuple &&) {
return 0;
}
diff --git a/llvm/lib/XRay/InstrumentationMap.cpp b/llvm/lib/XRay/InstrumentationMap.cpp
index 1e9b69a5f9dc..de0a9e60a511 100644
--- a/llvm/lib/XRay/InstrumentationMap.cpp
+++ b/llvm/lib/XRay/InstrumentationMap.cpp
@@ -52,26 +52,31 @@ using RelocMap = DenseMap<uint64_t, uint64_t>;
static Error
loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
- InstrumentationMap::SledContainer &Sleds,
- InstrumentationMap::FunctionAddressMap &FunctionAddresses,
- InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
+ InstrumentationMap::SledContainer &Sleds,
+ InstrumentationMap::FunctionAddressMap &FunctionAddresses,
+ InstrumentationMap::FunctionAddressReverseMap &FunctionIds) {
InstrumentationMap Map;
// Find the section named "xray_instr_map".
if ((!ObjFile.getBinary()->isELF() && !ObjFile.getBinary()->isMachO()) ||
!(ObjFile.getBinary()->getArch() == Triple::x86_64 ||
ObjFile.getBinary()->getArch() == Triple::ppc64le ||
+ ObjFile.getBinary()->getArch() == Triple::arm ||
ObjFile.getBinary()->getArch() == Triple::aarch64))
return make_error<StringError>(
- "File format not supported (only does ELF and Mach-O little endian 64-bit).",
+ "File format not supported (only does ELF and Mach-O little endian "
+ "64-bit).",
std::make_error_code(std::errc::not_supported));
StringRef Contents = "";
const auto &Sections = ObjFile.getBinary()->sections();
+ uint64_t Address = 0;
auto I = llvm::find_if(Sections, [&](object::SectionRef Section) {
Expected<StringRef> NameOrErr = Section.getName();
- if (NameOrErr)
+ if (NameOrErr) {
+ Address = Section.getAddress();
return *NameOrErr == "xray_instr_map";
+ }
consumeError(NameOrErr.takeError());
return false;
});
@@ -91,11 +96,14 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
uint32_t RelativeRelocation = [](object::ObjectFile *ObjFile) {
if (const auto *ELFObj = dyn_cast<object::ELF32LEObjectFile>(ObjFile))
return ELFObj->getELFFile()->getRelativeRelocationType();
- else if (const auto *ELFObj = dyn_cast<object::ELF32BEObjectFile>(ObjFile))
+ else if (const auto *ELFObj =
+ dyn_cast<object::ELF32BEObjectFile>(ObjFile))
return ELFObj->getELFFile()->getRelativeRelocationType();
- else if (const auto *ELFObj = dyn_cast<object::ELF64LEObjectFile>(ObjFile))
+ else if (const auto *ELFObj =
+ dyn_cast<object::ELF64LEObjectFile>(ObjFile))
return ELFObj->getELFFile()->getRelativeRelocationType();
- else if (const auto *ELFObj = dyn_cast<object::ELF64BEObjectFile>(ObjFile))
+ else if (const auto *ELFObj =
+ dyn_cast<object::ELF64BEObjectFile>(ObjFile))
return ELFObj->getELFFile()->getRelativeRelocationType();
else
return static_cast<uint32_t>(0);
@@ -108,11 +116,21 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
for (const object::SectionRef &Section : Sections) {
for (const object::RelocationRef &Reloc : Section.relocations()) {
- if (SupportsRelocation && SupportsRelocation(Reloc.getType())) {
+ if (ObjFile.getBinary()->getArch() == Triple::arm) {
+ if (SupportsRelocation && SupportsRelocation(Reloc.getType())) {
+ Expected<uint64_t> ValueOrErr = Reloc.getSymbol()->getValue();
+ if (!ValueOrErr)
+ return ValueOrErr.takeError();
+ Relocs.insert({Reloc.getOffset(), Resolver(Reloc, *ValueOrErr, 0)});
+ }
+ } else if (SupportsRelocation && SupportsRelocation(Reloc.getType())) {
auto AddendOrErr = object::ELFRelocationRef(Reloc).getAddend();
auto A = AddendOrErr ? *AddendOrErr : 0;
- uint64_t resolved = Resolver(Reloc, Reloc.getSymbol()->getValue(), A);
- Relocs.insert({Reloc.getOffset(), resolved});
+ Expected<uint64_t> ValueOrErr = Reloc.getSymbol()->getValue();
+ if (!ValueOrErr)
+ // TODO: Test this error.
+ return ValueOrErr.takeError();
+ Relocs.insert({Reloc.getOffset(), Resolver(Reloc, *ValueOrErr, A)});
} else if (Reloc.getType() == RelativeRelocation) {
if (auto AddendOrErr = object::ELFRelocationRef(Reloc).getAddend())
Relocs.insert({Reloc.getOffset(), *AddendOrErr});
@@ -123,12 +141,13 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
// Copy the instrumentation map data into the Sleds data structure.
auto C = Contents.bytes_begin();
- static constexpr size_t ELF64SledEntrySize = 32;
+ bool Is32Bit = ObjFile.getBinary()->makeTriple().isArch32Bit();
+ size_t ELFSledEntrySize = Is32Bit ? 16 : 32;
- if ((C - Contents.bytes_end()) % ELF64SledEntrySize != 0)
+ if ((C - Contents.bytes_end()) % ELFSledEntrySize != 0)
return make_error<StringError>(
Twine("Instrumentation map entries not evenly divisible by size of "
- "an XRay sled entry in ELF64."),
+ "an XRay sled entry."),
std::make_error_code(std::errc::executable_format_error));
auto RelocateOrElse = [&](uint64_t Offset, uint64_t Address) {
@@ -141,19 +160,26 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
return Address;
};
+ const int WordSize = Is32Bit ? 4 : 8;
int32_t FuncId = 1;
uint64_t CurFn = 0;
- for (; C != Contents.bytes_end(); C += ELF64SledEntrySize) {
+ for (; C != Contents.bytes_end(); C += ELFSledEntrySize) {
DataExtractor Extractor(
- StringRef(reinterpret_cast<const char *>(C), ELF64SledEntrySize), true,
+ StringRef(reinterpret_cast<const char *>(C), ELFSledEntrySize), true,
8);
Sleds.push_back({});
auto &Entry = Sleds.back();
uint64_t OffsetPtr = 0;
uint64_t AddrOff = OffsetPtr;
- Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr));
+ if (Is32Bit)
+ Entry.Address = RelocateOrElse(AddrOff, Extractor.getU32(&OffsetPtr));
+ else
+ Entry.Address = RelocateOrElse(AddrOff, Extractor.getU64(&OffsetPtr));
uint64_t FuncOff = OffsetPtr;
- Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr));
+ if (Is32Bit)
+ Entry.Function = RelocateOrElse(FuncOff, Extractor.getU32(&OffsetPtr));
+ else
+ Entry.Function = RelocateOrElse(FuncOff, Extractor.getU64(&OffsetPtr));
auto Kind = Extractor.getU8(&OffsetPtr);
static constexpr SledEntry::FunctionKinds Kinds[] = {
SledEntry::FunctionKinds::ENTRY, SledEntry::FunctionKinds::EXIT,
@@ -165,6 +191,11 @@ loadObj(StringRef Filename, object::OwningBinary<object::ObjectFile> &ObjFile,
std::make_error_code(std::errc::executable_format_error));
Entry.Kind = Kinds[Kind];
Entry.AlwaysInstrument = Extractor.getU8(&OffsetPtr) != 0;
+ Entry.Version = Extractor.getU8(&OffsetPtr);
+ if (Entry.Version >= 2) {
+ Entry.Address += C - Contents.bytes_begin() + Address;
+ Entry.Function += C - Contents.bytes_begin() + WordSize + Address;
+ }
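The version >= 2 adjustment above rebases PC-relative sled fields: the absolute value is the stored value plus the entry's offset within xray_instr_map plus the section's load address, with the Function field measured from the entry's second word (hence the extra WordSize). A standalone sketch with invented numbers (section at 0x1000, third 16-byte sled of a 32-bit object) shows the arithmetic.

// Sketch of the version >= 2 sled rebasing arithmetic, with made-up values.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t SectionAddress = 0x1000; // hypothetical xray_instr_map address
  const uint64_t SledEntrySize = 16;      // 32-bit object
  const uint64_t WordSize = 4;
  const uint64_t EntryIndex = 2;          // third sled in the section
  const uint64_t EntryOffset = EntryIndex * SledEntrySize; // C - bytes_begin()

  uint64_t StoredAddress = 0x200;         // PC-relative values read from entry
  uint64_t StoredFunction = 0x180;

  uint64_t AbsAddress = StoredAddress + EntryOffset + SectionAddress;
  uint64_t AbsFunction =
      StoredFunction + EntryOffset + WordSize + SectionAddress;
  std::printf("sled address  = 0x%llx\n", (unsigned long long)AbsAddress);
  std::printf("sled function = 0x%llx\n", (unsigned long long)AbsFunction);
  return 0;
}

This yields 0x1220 for the sled address and 0x11a4 for the function, i.e. the stored offsets rebased to the section's load address.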
// We do replicate the function id generation scheme implemented in the
// XRay runtime.
@@ -209,8 +240,8 @@ loadYAML(sys::fs::file_t Fd, size_t FileSize, StringRef Filename,
for (const auto &Y : YAMLSleds) {
FunctionAddresses[Y.FuncId] = Y.Function;
FunctionIds[Y.Function] = Y.FuncId;
- Sleds.push_back(
- SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument});
+ Sleds.push_back(SledEntry{Y.Address, Y.Function, Y.Kind, Y.AlwaysInstrument,
+ Y.Version});
}
return Error::success();
}
@@ -228,7 +259,8 @@ llvm::xray::loadInstrumentationMap(StringRef Filename) {
if (!ObjectFileOrError) {
auto E = ObjectFileOrError.takeError();
// We try to load it as YAML if the ELF load didn't work.
- Expected<sys::fs::file_t> FdOrErr = sys::fs::openNativeFileForRead(Filename);
+ Expected<sys::fs::file_t> FdOrErr =
+ sys::fs::openNativeFileForRead(Filename);
if (!FdOrErr) {
// Report the ELF load error if YAML failed.
consumeError(FdOrErr.takeError());
@@ -250,7 +282,7 @@ llvm::xray::loadInstrumentationMap(StringRef Filename) {
Map.FunctionAddresses, Map.FunctionIds))
return std::move(E);
} else if (auto E = loadObj(Filename, *ObjectFileOrError, Map.Sleds,
- Map.FunctionAddresses, Map.FunctionIds)) {
+ Map.FunctionAddresses, Map.FunctionIds)) {
return std::move(E);
}
return Map;
diff --git a/llvm/lib/XRay/Trace.cpp b/llvm/lib/XRay/Trace.cpp
index 4f107e1059cc..5ceb269b6d1d 100644
--- a/llvm/lib/XRay/Trace.cpp
+++ b/llvm/lib/XRay/Trace.cpp
@@ -410,6 +410,7 @@ Expected<Trace> llvm::xray::loadTraceFile(StringRef Filename, bool Sort) {
auto TraceOrError = loadTrace(LittleEndianDE, Sort);
if (!TraceOrError) {
DataExtractor BigEndianDE(Data, false, 8);
+ consumeError(TraceOrError.takeError());
TraceOrError = loadTrace(BigEndianDE, Sort);
}
return TraceOrError;